diff --git a/.gitignore b/.gitignore
index 9ae0d9c96f188bc6357832f22b4125694302b104..be75938ec401b1d72fa54773c85191aaac7d7f35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@ node_modules
 /bazel-*
 /bazel_pip
 /tools/python_bin_path.sh
-/tools/git/gen
+/tensorflow/tools/git/gen
 /pip_test
 /_python_build
 *.pyc
@@ -22,3 +22,15 @@ Pods
 Podfile.lock
 *.pbxproj
 *.xcworkspacedata
+/tensorflow/contrib/lite/downloads/**
+/tensorflow/contrib/lite/gen/**
+/tensorflow/contrib/lite/examples/ios/simple/data/*.txt
+/tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
+xcuserdata/**
+
+# Android
+.gradle
+.idea
+*.iml
+local.properties
+gradleBuild
diff --git a/AUTHORS b/AUTHORS
index a46ae7e616ab3a420d9fb2691ee8d8650032a39f..aa4be5169dcc68c579863e8ba6307cd00e9f9a68 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -7,4 +7,4 @@
 # The email address is not required for organizations.
 
 Google Inc.
-Yuan Tang terrytangyuan@gmail.com
+Yuan Tang <terrytangyuan@gmail.com>
diff --git a/CODEOWNERS b/CODEOWNERS
index 6e4b4f5f3f751ca9ab39a5772458349b00f06d57..57a4df40e651f45dc03493af631d73332e46c182 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -11,6 +11,7 @@
 # NEED OWNER: tensorflow/contrib/avro/*
 #tensorflow/contrib/batching/* @alextp @chrisolston
 #tensorflow/contrib/bayesflow/* @ebrevdo @rsepassi @jvdillon
+#tensorflow/contrib/boosted_trees/* @sshrdp @yk5 @nataliaponomareva
 #tensorflow/contrib/cmake/* @mrry @benoitsteiner
 #tensorflow/contrib/copy_graph/* @tucker @poxvoculi
 #tensorflow/contrib/crf/* @kentonl
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 10fd595fec7f240c3fdc871e1f32cc83f2ffd46d..ff11d131409b65880f16b80f9fe38dc39ac0e5fa 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -42,7 +42,7 @@ The Code of Conduct also applies within project spaces and in public spaces when
 
 Conflicts in an open source project can take many forms, from someone having a bad day and using harsh and hurtful language in the issue queue, to more serious instances such as sexist/racist statements or threats of violence, and everything in between.
 
-If the behaviour is threatening or harassing, or for other reasons requires immediate escalation, please see below.
+If the behavior is threatening or harassing, or for other reasons requires immediate escalation, please see below.
 
 However, for the vast majority of issues, we aim to empower individuals to first resolve conflicts themselves, asking for help when needed, and only after that fails to escalate further. This approach gives people more control over the outcome of their dispute. 
 
@@ -55,14 +55,14 @@ If you are experiencing or witnessing conflict, we ask you to use the following
 
 ## Reporting Violations
 
-Violations of the Code of Conduct can be reported to TensorFlow’s Project Steward at conduct@tensorflow.org. The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
+Violations of the Code of Conduct can be reported to TensorFlow’s Project Stewards, Edd Wilder-James (ewj@google.com) and Sarah Novotny (sarahnovotny@google.com). The Project Steward will determine whether the Code of Conduct was violated, and will issue an appropriate sanction, possibly including a written warning or expulsion from the project, project sponsored spaces, or project forums. We ask that you make a good-faith effort to resolve your conflict via the conflict resolution policy before submitting a report.
 
 Violations of the Code of Conduct can occur in any setting, even those unrelated to the project. We will only consider complaints about conduct that has occurred within one year of the report.
 
 
 ## Enforcement
 
-If the Project Steward receives a report alleging a violation of the Code of Conduct, the Project Steward will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Steward will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Steward may issue sanctions without notice.
+If the Project Stewards receive a report alleging a violation of the Code of Conduct, the Project Stewards will notify the accused of the report, and provide them an opportunity to discuss the report before a sanction is issued. The Project Stewards will do their utmost to keep the reporter anonymous. If the act is ongoing (such as someone engaging in harassment), or involves a threat to anyone's safety (e.g. threats of violence), the Project Stewards may issue sanctions without notice.
 
 
 ## Attribution
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 43abdaafbf45379430920cd027b26299cd62553b..1b537ca73cc94e992e7537fe69c8d0cc8fd13102 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -114,6 +114,7 @@ pylint --rcfile=/tmp/pylintrc myfile.py
 * [Google Java Style Guide](https://google.github.io/styleguide/javaguide.html)
 * [Google JavaScript Style Guide](https://google.github.io/styleguide/jsguide.html)
 * [Google Shell Style Guide](https://google.github.io/styleguide/shell.xml)
+* [Google Objective-C Style Guide](http://google.github.io/styleguide/objcguide.html)
 
 #### Running sanity check
 
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
index 2bf2c754cf64ec3bac22a22fbafcebbd4dc54bf4..1a401997c649518766acb2ebb0dea1c128bd0ba4 100644
--- a/ISSUE_TEMPLATE.md
+++ b/ISSUE_TEMPLATE.md
@@ -19,6 +19,7 @@ If you open a GitHub issue, here is our policy:
 - **TensorFlow version (use command below)**:
 - **Python version**: 
 - **Bazel version (if compiling from source)**:
+- **GCC/Compiler version (if compiling from source)**:
 - **CUDA/cuDNN version**:
 - **GPU model and memory**:
 - **Exact command to reproduce**:
diff --git a/README.md b/README.md
index 24bbb6cec10e16c7b6ae37b7cf8b6f90ebe5e5dd..aff3427bddb307aea6d6c2466eac14c9edffcc32 100644
--- a/README.md
+++ b/README.md
@@ -73,11 +73,11 @@ $ python
 
 ## For more information
 
-* [TensorFlow website](https://www.tensorflow.org)
+* [TensorFlow Website](https://www.tensorflow.org)
 * [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
-* [TensorFlow course at Stanford](https://web.stanford.edu/class/cs20si)
+* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
 
 Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.
 
diff --git a/RELEASE.md b/RELEASE.md
index d8db1f72004b5d944e3035a0f33dfc34a674b7ee..e04bd3fc505d51ade9e9fa12c822cb695e90b4f3 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -494,7 +494,7 @@ answered questions, and were part of inspiring discussions.
 This release contains contributions from many people at Google, as well as:
 
 A. Besir Kurtulmus, Adal Chiriliuc, @akash, Alec-Desouza, Alex Rothberg, Alex
-Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton 
+Sergeev, Alexander Heinecke, Allen Guo, Andreas Madsen, Ankesh Anand, Anton
 Loss, @Aravind, @Arie, Ashutosh Das, AuréLien Geron, Bairen Yi, @bakunyo, Ben
 Visser, Brady Zhou, Calpa Liu, Changming Sun, Chih Cheng Liang, Christopher
 Berner, Clark Zinzow, @Conchylicultor, Dan Ellis, Dan J, Dan Jarvis, Daniel
diff --git a/configure.py b/configure.py
index bc7859fee4d2aca9bd7ca24e85ad820c49e01e4a..7a9d315eb0ededf273d1cee3d06cb9b53864a834 100644
--- a/configure.py
+++ b/configure.py
@@ -25,15 +25,19 @@ import re
 import subprocess
 import sys
 
+# pylint: disable=g-import-not-at-top
 try:
   from shutil import which
 except ImportError:
   from distutils.spawn import find_executable as which
+# pylint: enable=g-import-not-at-top
 
 _TF_BAZELRC = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '.tf_configure.bazelrc')
-_DEFAULT_CUDA_VERSION = '8.0'
-_DEFAULT_CUDNN_VERSION = '6'
+_TF_WORKSPACE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                             'WORKSPACE')
+_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -41,6 +45,14 @@ _DEFAULT_CUDA_PATH_WIN = ('C:/Program Files/NVIDIA GPU Computing '
                           'Toolkit/CUDA/v%s' % _DEFAULT_CUDA_VERSION)
 _TF_OPENCL_VERSION = '1.2'
 _DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp'
+_DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include'
+_SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15]
+
+_DEFAULT_PROMPT_ASK_ATTEMPTS = 10
+
+
+class UserInputError(Exception):
+  pass
 
 
 def is_windows():
@@ -155,7 +167,7 @@ def get_python_path(environ_cp, python_bin_path):
   try:
     library_paths = run_shell(
         [python_bin_path, '-c',
-         'import site; print("\\n".join(site.getsitepackages()))']).split("\n")
+         'import site; print("\\n".join(site.getsitepackages()))']).split('\n')
   except subprocess.CalledProcessError:
     library_paths = [run_shell(
         [python_bin_path, '-c',
@@ -226,17 +238,9 @@ def setup_python(environ_cp):
   # Set-up env variables used by python_configure.bzl
   write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path)
   write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path)
-  write_to_bazelrc('build --define PYTHON_BIN_PATH="%s"' % python_bin_path)
-  write_to_bazelrc('build --define PYTHON_LIB_PATH="%s"' % python_lib_path)
   write_to_bazelrc('build --force_python=py%s' % python_major_version)
   write_to_bazelrc('build --host_force_python=py%s' % python_major_version)
   write_to_bazelrc('build --python_path=\"%s"' % python_bin_path)
-  write_to_bazelrc('test --force_python=py%s' % python_major_version)
-  write_to_bazelrc('test --host_force_python=py%s' % python_major_version)
-  write_to_bazelrc('test --define PYTHON_BIN_PATH="%s"' % python_bin_path)
-  write_to_bazelrc('test --define PYTHON_LIB_PATH="%s"' % python_lib_path)
-  write_to_bazelrc('run --define PYTHON_BIN_PATH="%s"' % python_bin_path)
-  write_to_bazelrc('run --define PYTHON_LIB_PATH="%s"' % python_lib_path)
   environ_cp['PYTHON_BIN_PATH'] = python_bin_path
 
   # Write tools/python_bin_path.sh
@@ -485,7 +489,14 @@ def set_cc_opt_flags(environ_cp):
   cc_opt_flags = get_from_env_or_user_or_default(environ_cp, 'CC_OPT_FLAGS',
                                                  question, default_cc_opt_flags)
   for opt in cc_opt_flags.split():
-    write_to_bazelrc('build:opt --cxxopt=%s --copt=%s' % (opt, opt))
+    write_to_bazelrc('build:opt --copt=%s' % opt)
+  # It should be safe on the same build host.
+  write_to_bazelrc('build:opt --host_copt=-march=native')
+  write_to_bazelrc('build:opt --define with_default_optimizations=true')
+  # TODO(mikecase): Remove these default defines once we are able to get
+  # TF Lite targets building without them.
+  write_to_bazelrc('build --copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
+  write_to_bazelrc('build --host_copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK')
 
 
 def set_tf_cuda_clang(environ_cp):
@@ -555,6 +566,218 @@ def set_clang_cuda_compiler_path(environ_cp):
                               clang_cuda_compiler_path)
 
 
+def prompt_loop_or_load_from_env(
+    environ_cp,
+    var_name,
+    var_default,
+    ask_for_var,
+    check_success,
+    error_msg,
+    suppress_default_error=False,
+    n_ask_attempts=_DEFAULT_PROMPT_ASK_ATTEMPTS
+):
+  """Loop over user prompts for an ENV param until receiving a valid response.
+
+  For the env param var_name, read from the environment or verify user input
+  until receiving valid input. When done, set var_name in the environ_cp to its
+  new value.
+
+  Args:
+    environ_cp: (Dict) copy of the os.environ.
+    var_name: (String) string for name of environment variable, e.g. "TF_MYVAR".
+    var_default: (String) default value string.
+    ask_for_var: (String) string for how to ask for user input.
+    check_success: (Function) function that takes one argument and returns a
+      boolean. Should return True if the value provided is considered valid. May
+      contain a complex error message if error_msg does not provide enough
+      information. In that case, set suppress_default_error to True.
+    error_msg: (String) String with one and only one '%s'. Formatted with each
+      invalid response upon check_success(input) failure.
+    suppress_default_error: (Bool) Suppress the above error message in favor of
+      one from the check_success function.
+    n_ask_attempts: (Integer) Number of times to query for valid input before
+      raising an error and quitting.
+
+  Returns:
+    [String] The value of var_name after querying for input.
+
+  Raises:
+    UserInputError: if a query has been attempted n_ask_attempts times without
+    success, assume that the user has made a scripting error, and will continue
+    to provide invalid input. Raise the error to avoid infinitely looping.
+  """
+  default = environ_cp.get(var_name) or var_default
+  full_query = '%s [Default is %s]: ' % (
+      ask_for_var,
+      default,
+  )
+
+  for _ in range(n_ask_attempts):
+    val = get_from_env_or_user_or_default(environ_cp,
+                                          var_name,
+                                          full_query,
+                                          default)
+    if check_success(val):
+      break
+    if not suppress_default_error:
+      print(error_msg % val)
+    environ_cp[var_name] = ''
+  else:
+    raise UserInputError('Invalid %s setting was provided %d times in a row. '
+                         'Assuming to be a scripting mistake.' %
+                         (var_name, n_ask_attempts))
+
+  environ_cp[var_name] = val
+  return val
+
+
+def create_android_ndk_rule(environ_cp):
+  """Set ANDROID_NDK_HOME and write Android NDK WORKSPACE rule."""
+  if is_windows() or is_cygwin():
+    default_ndk_path = cygpath('%s/Android/Sdk/ndk-bundle' %
+                               environ_cp['APPDATA'])
+  elif is_macos():
+    default_ndk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+  else:
+    default_ndk_path = '%s/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+
+  def valid_ndk_path(path):
+    return (os.path.exists(path) and
+            os.path.exists(os.path.join(path, 'source.properties')))
+
+  android_ndk_home_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_NDK_HOME',
+      var_default=default_ndk_path,
+      ask_for_var='Please specify the home path of the Android NDK to use.',
+      check_success=valid_ndk_path,
+      error_msg=('The path %s or its child file "source.properties" '
+                 'does not exist.')
+  )
+
+  write_android_ndk_workspace_rule(android_ndk_home_path)
+
+
+def create_android_sdk_rule(environ_cp):
+  """Set Android variables and write Android SDK WORKSPACE rule."""
+  if is_windows() or is_cygwin():
+    default_sdk_path = cygpath('%s/Android/Sdk' % environ_cp['APPDATA'])
+  elif is_macos():
+    default_sdk_path = '%s/library/Android/Sdk/ndk-bundle' % environ_cp['HOME']
+  else:
+    default_sdk_path = '%s/Android/Sdk' % environ_cp['HOME']
+
+  def valid_sdk_path(path):
+    return (os.path.exists(path) and
+            os.path.exists(os.path.join(path, 'platforms')) and
+            os.path.exists(os.path.join(path, 'build-tools')))
+
+  android_sdk_home_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_SDK_HOME',
+      var_default=default_sdk_path,
+      ask_for_var='Please specify the home path of the Android SDK to use.',
+      check_success=valid_sdk_path,
+      error_msg=('Either %s does not exist, or it does not contain the '
+                 'subdirectories "platforms" and "build-tools".'))
+
+  platforms = os.path.join(android_sdk_home_path, 'platforms')
+  api_levels = sorted(os.listdir(platforms))
+  api_levels = [x.replace('android-', '') for x in api_levels]
+
+  def valid_api_level(api_level):
+    return os.path.exists(os.path.join(android_sdk_home_path,
+                                       'platforms',
+                                       'android-' + api_level))
+
+  android_api_level = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_API_LEVEL',
+      var_default=api_levels[-1],
+      ask_for_var=('Please specify the Android SDK API level to use. '
+                   '[Available levels: %s]') % api_levels,
+      check_success=valid_api_level,
+      error_msg='Android-%s is not present in the SDK path.')
+
+  build_tools = os.path.join(android_sdk_home_path, 'build-tools')
+  versions = sorted(os.listdir(build_tools))
+
+  def valid_build_tools(version):
+    return os.path.exists(os.path.join(android_sdk_home_path,
+                                       'build-tools',
+                                       version))
+
+  android_build_tools_version = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='ANDROID_BUILD_TOOLS_VERSION',
+      var_default=versions[-1],
+      ask_for_var=('Please specify an Android build tools version to use. '
+                   '[Available versions: %s]') % versions,
+      check_success=valid_build_tools,
+      error_msg=('The selected SDK does not have build-tools version %s '
+                 'available.'))
+
+  write_android_sdk_workspace_rule(android_sdk_home_path,
+                                   android_build_tools_version,
+                                   android_api_level)
+
+
+def write_android_sdk_workspace_rule(android_sdk_home_path,
+                                     android_build_tools_version,
+                                     android_api_level):
+  print('Writing android_sdk_workspace rule.\n')
+  with open(_TF_WORKSPACE, 'a') as f:
+    f.write("""
+android_sdk_repository(
+  name="androidsdk",
+  api_level=%s,
+  path="%s",
+  build_tools_version="%s")\n
+""" % (android_api_level, android_sdk_home_path, android_build_tools_version))
+
+
+def write_android_ndk_workspace_rule(android_ndk_home_path):
+  print('Writing android_ndk_workspace rule.')
+  ndk_api_level = check_ndk_level(android_ndk_home_path)
+  if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS:
+    print('WARNING: The API level of the NDK in %s is %s, which is not '
+          'supported by Bazel (officially supported versions: %s). Please use '
+          'another version. Compiling Android targets may result in confusing '
+          'errors.\n' % (android_ndk_home_path, ndk_api_level,
+                         _SUPPORTED_ANDROID_NDK_VERSIONS))
+  with open(_TF_WORKSPACE, 'a') as f:
+    f.write("""
+android_ndk_repository(
+  name="androidndk",
+  path="%s",
+  api_level=%s)\n
+""" % (android_ndk_home_path, ndk_api_level))
+
+
+def check_ndk_level(android_ndk_home_path):
+  """Check the revision number of an Android NDK path."""
+  properties_path = '%s/source.properties' % android_ndk_home_path
+  if is_windows() or is_cygwin():
+    properties_path = cygpath(properties_path)
+  with open(properties_path, 'r') as f:
+    filedata = f.read()
+
+  revision = re.search(r'Pkg.Revision = (\d+)', filedata)
+  if revision:
+    return revision.group(1)
+  return None
+
+
+def workspace_has_any_android_rule():
+  """Check the WORKSPACE for existing android_*_repository rules."""
+  with open(_TF_WORKSPACE, 'r') as f:
+    workspace = f.read()
+  has_any_rule = re.search(r'^android_[ns]dk_repository',
+                           workspace,
+                           re.MULTILINE)
+  return has_any_rule
+
+
 def set_gcc_host_compiler_path(environ_cp):
   """Set GCC_HOST_COMPILER_PATH."""
   default_gcc_host_compiler_path = which('gcc') or ''
@@ -564,23 +787,16 @@ def set_gcc_host_compiler_path(environ_cp):
     # os.readlink is only available in linux
     default_gcc_host_compiler_path = os.path.realpath(cuda_bin_symlink)
 
-  ask_gcc_path = (
-      'Please specify which gcc should be used by nvcc as the '
-      'host compiler. [Default is %s]: ') % default_gcc_host_compiler_path
-  while True:
-    gcc_host_compiler_path = get_from_env_or_user_or_default(
-        environ_cp, 'GCC_HOST_COMPILER_PATH', ask_gcc_path,
-        default_gcc_host_compiler_path)
-
-    if os.path.exists(gcc_host_compiler_path):
-      break
-
-    # Reset and retry
-    print('Invalid gcc path. %s cannot be found' % gcc_host_compiler_path)
-    environ_cp['GCC_HOST_COMPILER_PATH'] = ''
+  gcc_host_compiler_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='GCC_HOST_COMPILER_PATH',
+      var_default=default_gcc_host_compiler_path,
+      ask_for_var=
+      'Please specify which gcc should be used by nvcc as the host compiler.',
+      check_success=os.path.exists,
+      error_msg='Invalid gcc path. %s cannot be found.',
+  )
 
-  # Set GCC_HOST_COMPILER_PATH
-  environ_cp['GCC_HOST_COMPILER_PATH'] = gcc_host_compiler_path
   write_action_env_to_bazelrc('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path)
 
 
@@ -635,7 +851,7 @@ def set_tf_cuda_version(environ_cp):
   write_action_env_to_bazelrc('TF_CUDA_VERSION', tf_cuda_version)
 
 
-def set_tf_cunn_version(environ_cp):
+def set_tf_cudnn_version(environ_cp):
   """Set CUDNN_INSTALL_PATH and TF_CUDNN_VERSION."""
   ask_cudnn_version = (
       'Please specify the cuDNN version you want to use. '
@@ -808,102 +1024,153 @@ def set_other_cuda_vars(environ_cp):
 def set_host_cxx_compiler(environ_cp):
   """Set HOST_CXX_COMPILER."""
   default_cxx_host_compiler = which('g++') or ''
-  ask_cxx_host_compiler = (
-      'Please specify which C++ compiler should be used as'
-      ' the host C++ compiler. [Default is %s]: ') % default_cxx_host_compiler
 
-  while True:
-    host_cxx_compiler = get_from_env_or_user_or_default(
-        environ_cp, 'HOST_CXX_COMPILER', ask_cxx_host_compiler,
-        default_cxx_host_compiler)
-    if os.path.exists(host_cxx_compiler):
-      break
-
-    # Reset and retry
-    print('Invalid C++ compiler path. %s cannot be found' % host_cxx_compiler)
-    environ_cp['HOST_CXX_COMPILER'] = ''
+  host_cxx_compiler = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='HOST_CXX_COMPILER',
+      var_default=default_cxx_host_compiler,
+      ask_for_var=('Please specify which C++ compiler should be used as the '
+                   'host C++ compiler.'),
+      check_success=os.path.exists,
+      error_msg='Invalid C++ compiler path. %s cannot be found.',
+  )
 
-  # Set HOST_CXX_COMPILER
-  environ_cp['HOST_CXX_COMPILER'] = host_cxx_compiler
   write_action_env_to_bazelrc('HOST_CXX_COMPILER', host_cxx_compiler)
 
 
 def set_host_c_compiler(environ_cp):
   """Set HOST_C_COMPILER."""
   default_c_host_compiler = which('gcc') or ''
-  ask_c_host_compiler = (
-      'Please specify which C compiler should be used as the'
-      ' host C compiler. [Default is %s]: ') % default_c_host_compiler
 
-  while True:
-    host_c_compiler = get_from_env_or_user_or_default(
-        environ_cp, 'HOST_C_COMPILER', ask_c_host_compiler,
-        default_c_host_compiler)
-    if os.path.exists(host_c_compiler):
-      break
-
-    # Reset and retry
-    print('Invalid C compiler path. %s cannot be found' % host_c_compiler)
-    environ_cp['HOST_C_COMPILER'] = ''
+  host_c_compiler = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='HOST_C_COMPILER',
+      var_default=default_c_host_compiler,
+      ask_for_var=('Please specify which C compiler should be used as the host'
+                   'C compiler.'),
+      check_success=os.path.exists,
+      error_msg='Invalid C compiler path. %s cannot be found.',
+  )
 
-  # Set HOST_C_COMPILER
-  environ_cp['HOST_C_COMPILER'] = host_c_compiler
   write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler)
 
 
 def set_computecpp_toolkit_path(environ_cp):
   """Set COMPUTECPP_TOOLKIT_PATH."""
-  ask_computecpp_toolkit_path = ('Please specify the location where ComputeCpp '
-                                 'for SYCL %s is installed. [Default is %s]: '
-                                ) % (_TF_OPENCL_VERSION,
-                                     _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
 
-  while True:
-    computecpp_toolkit_path = get_from_env_or_user_or_default(
-        environ_cp, 'COMPUTECPP_TOOLKIT_PATH', ask_computecpp_toolkit_path,
-        _DEFAULT_COMPUTECPP_TOOLKIT_PATH)
+  def toolkit_exists(toolkit_path):
+    """Check if a computecpp toolkit path is valid."""
     if is_linux():
       sycl_rt_lib_path = 'lib/libComputeCpp.so'
     else:
       sycl_rt_lib_path = ''
 
-    sycl_rt_lib_path_full = os.path.join(computecpp_toolkit_path,
+    sycl_rt_lib_path_full = os.path.join(toolkit_path,
                                          sycl_rt_lib_path)
-    if os.path.exists(sycl_rt_lib_path_full):
-      break
+    exists = os.path.exists(sycl_rt_lib_path_full)
+    if not exists:
+      print('Invalid SYCL %s library path. %s cannot be found' %
+            (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
+    return exists
 
-    print('Invalid SYCL %s library path. %s cannot be found' %
-          (_TF_OPENCL_VERSION, sycl_rt_lib_path_full))
-    environ_cp['COMPUTECPP_TOOLKIT_PATH'] = ''
+  computecpp_toolkit_path = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='COMPUTECPP_TOOLKIT_PATH',
+      var_default=_DEFAULT_COMPUTECPP_TOOLKIT_PATH,
+      ask_for_var=(
+          'Please specify the location where ComputeCpp for SYCL %s is '
+          'installed.' % _TF_OPENCL_VERSION),
+      check_success=toolkit_exists,
+      error_msg='Invalid SYCL compiler path. %s cannot be found.',
+      suppress_default_error=True)
 
-  # Set COMPUTECPP_TOOLKIT_PATH
-  environ_cp['COMPUTECPP_TOOLKIT_PATH'] = computecpp_toolkit_path
   write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH',
                               computecpp_toolkit_path)
 
+def set_trisycl_include_dir(environ_cp):
+  """Set TRISYCL_INCLUDE_DIR"""
+  ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
+                             'include directory. (Use --config=sycl_trisycl '
+                             'when building with Bazel) '
+                             '[Default is %s]: '
+                             ) % (_DEFAULT_TRISYCL_INCLUDE_DIR)
+  while True:
+    trisycl_include_dir = get_from_env_or_user_or_default(
+      environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
+      _DEFAULT_TRISYCL_INCLUDE_DIR)
+    if os.path.exists(trisycl_include_dir):
+      break
+
+    print('Invalid triSYCL include directory, %s cannot be found'
+          % (trisycl_include_dir))
+
+  # Set TRISYCL_INCLUDE_DIR
+  environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
+  write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR',
+                              trisycl_include_dir)
+
+def set_trisycl_include_dir(environ_cp):
+  """Set TRISYCL_INCLUDE_DIR."""
+  ask_trisycl_include_dir = ('Please specify the location of the triSYCL '
+                             'include directory. (Use --config=sycl_trisycl '
+                             'when building with Bazel) '
+                             '[Default is %s]: ') % _DEFAULT_TRISYCL_INCLUDE_DIR
+  while True:
+    trisycl_include_dir = get_from_env_or_user_or_default(
+        environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir,
+        _DEFAULT_TRISYCL_INCLUDE_DIR)
+    if os.path.exists(trisycl_include_dir):
+      break
+
+    print('Invalid triSYCL include directory, %s cannot be found'
+          % (trisycl_include_dir))
+
+  # Set TRISYCL_INCLUDE_DIR
+  environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir
+  write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR',
+                              trisycl_include_dir)
+
+
+def set_trisycl_include_dir(environ_cp):
+  """Set TRISYCL_INCLUDE_DIR."""
+
+  trisycl_include_dir = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='TRISYCL_INCLUDE_DIR',
+      var_default=_DEFAULT_TRISYCL_INCLUDE_DIR,
+      ask_for_var=('Please specify the location of the triSYCL include '
+                   'directory. (Use --config=sycl_trisycl when building with '
+                   'Bazel)'),
+      check_success=os.path.exists,
+      error_msg='Invalid trySYCL include directory. %s cannot be found.',
+      suppress_default_error=True)
+
+  write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir)
+
 
 def set_mpi_home(environ_cp):
   """Set MPI_HOME."""
+
   default_mpi_home = which('mpirun') or which('mpiexec') or ''
   default_mpi_home = os.path.dirname(os.path.dirname(default_mpi_home))
 
-  ask_mpi_home = ('Please specify the MPI toolkit folder. [Default is %s]: '
-                 ) % default_mpi_home
-  while True:
-    mpi_home = get_from_env_or_user_or_default(environ_cp, 'MPI_HOME',
-                                               ask_mpi_home, default_mpi_home)
-
-    if os.path.exists(os.path.join(mpi_home, 'include')) and os.path.exists(
-        os.path.join(mpi_home, 'lib')):
-      break
-
-    print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
-          (os.path.join(mpi_home, 'include'),
-           os.path.exists(os.path.join(mpi_home, 'lib'))))
-    environ_cp['MPI_HOME'] = ''
+  def valid_mpi_path(mpi_home):
+    exists = (os.path.exists(os.path.join(mpi_home, 'include')) and
+              os.path.exists(os.path.join(mpi_home, 'lib')))
+    if not exists:
+      print('Invalid path to the MPI Toolkit. %s or %s cannot be found' %
+            (os.path.join(mpi_home, 'include'),
+             os.path.exists(os.path.join(mpi_home, 'lib'))))
+    return exists
 
-  # Set MPI_HOME
-  environ_cp['MPI_HOME'] = str(mpi_home)
+  _ = prompt_loop_or_load_from_env(
+      environ_cp,
+      var_name='MPI_HOME',
+      var_default=default_mpi_home,
+      ask_for_var='Please specify the MPI toolkit folder.',
+      check_success=valid_mpi_path,
+      error_msg='',
+      suppress_default_error=True)
 
 
 def set_other_mpi_vars(environ_cp):
@@ -941,13 +1208,12 @@ def set_other_mpi_vars(environ_cp):
 def set_mkl():
   write_to_bazelrc('build:mkl --define using_mkl=true')
   write_to_bazelrc('build:mkl -c opt')
-  write_to_bazelrc('build:mkl --copt="-DEIGEN_USE_VML"')
   print(
       'Add "--config=mkl" to your bazel command to build with MKL '
       'support.\nPlease note that MKL on MacOS or windows is still not '
       'supported.\nIf you would like to use a local MKL instead of '
       'downloading, please set the environment variable \"TF_MKL_ROOT\" every '
-      'time before build.')
+      'time before build.\n')
 
 
 def set_monolithic():
@@ -976,6 +1242,19 @@ def create_android_bazelrc_configs():
   write_to_bazelrc('build:android_arm64 --cpu=arm64-v8a')
 
 
+def set_grpc_build_flags():
+  write_to_bazelrc('build --define grpc_no_ares=true')
+
+def set_windows_build_flags():
+  if is_windows():
+    # The non-monolithic build is not supported yet
+    write_to_bazelrc('build --config monolithic')
+    # Suppress warning messages
+    write_to_bazelrc('build --copt=-w --host_copt=-w')
+    # Output more verbose information when something goes wrong
+    write_to_bazelrc('build --verbose_failures')
+
+
 def main():
   # Make a copy of os.environ to be clear when functions and getting and setting
   # environment variables.
@@ -993,8 +1272,9 @@ def main():
     environ_cp['TF_NEED_GCP'] = '0'
     environ_cp['TF_NEED_HDFS'] = '0'
     environ_cp['TF_NEED_JEMALLOC'] = '0'
+    environ_cp['TF_NEED_OPENCL_SYCL'] = '0'
+    environ_cp['TF_NEED_COMPUTECPP'] = '0'
     environ_cp['TF_NEED_OPENCL'] = '0'
-    environ_cp['TF_NEED_S3'] = '0'
     environ_cp['TF_CUDA_CLANG'] = '0'
 
   if is_macos():
@@ -1015,17 +1295,21 @@ def main():
   set_build_var(environ_cp, 'TF_NEED_VERBS', 'VERBS', 'with_verbs_support',
                 False, 'verbs')
 
-  set_action_env_var(environ_cp, 'TF_NEED_OPENCL', 'OpenCL', False)
-  if environ_cp.get('TF_NEED_OPENCL') == '1':
+  set_action_env_var(environ_cp, 'TF_NEED_OPENCL_SYCL', 'OpenCL SYCL', False)
+  if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1':
     set_host_cxx_compiler(environ_cp)
     set_host_c_compiler(environ_cp)
-    set_computecpp_toolkit_path(environ_cp)
+    set_action_env_var(environ_cp, 'TF_NEED_COMPUTECPP', 'ComputeCPP', True)
+    if environ_cp.get('TF_NEED_COMPUTECPP') == '1':
+      set_computecpp_toolkit_path(environ_cp)
+    else:
+      set_trisycl_include_dir(environ_cp)
 
   set_action_env_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False)
   if (environ_cp.get('TF_NEED_CUDA') == '1' and
       'TF_CUDA_CONFIG_REPO' not in environ_cp):
     set_tf_cuda_version(environ_cp)
-    set_tf_cunn_version(environ_cp)
+    set_tf_cudnn_version(environ_cp)
     set_tf_cuda_compute_capabilities(environ_cp)
 
     set_tf_cuda_clang(environ_cp)
@@ -1044,10 +1328,29 @@ def main():
     set_mpi_home(environ_cp)
     set_other_mpi_vars(environ_cp)
 
+  set_grpc_build_flags()
   set_cc_opt_flags(environ_cp)
   set_mkl()
   set_monolithic()
+  set_windows_build_flags()
   create_android_bazelrc_configs()
 
+  if workspace_has_any_android_rule():
+    print('The WORKSPACE file has at least one of ["android_sdk_repository", '
+          '"android_ndk_repository"] already set. Will not ask to help '
+          'configure the WORKSPACE. Please delete the existing rules to '
+          'activate the helper.\n')
+  else:
+    if get_var(
+        environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace',
+        False,
+        ('Would you like to interactively configure ./WORKSPACE for '
+         'Android builds?'),
+        'Searching for NDK and SDK installations.',
+        'Not configuring the WORKSPACE for Android builds.'):
+      create_android_ndk_rule(environ_cp)
+      create_android_sdk_rule(environ_cp)
+
+
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 3f23203aefd3d42c12c6a40f3711bcdedd22fd23..0054ce4b39e3a054f318b9766be41ec7efe79c0f 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -54,6 +54,15 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "raspberry_pi_armeabi",
+    values = {
+        "crosstool_top": "@local_config_arm_compiler//:toolchain",
+        "cpu": "armeabi",
+    },
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "android_arm",
     values = {
@@ -110,7 +119,7 @@ config_setting(
 
 config_setting(
     name = "no_tensorflow_py_deps",
-    values = {"define": "no_tensorflow_py_deps=true"},
+    define_values = {"no_tensorflow_py_deps": "true"},
     visibility = ["//visibility:public"],
 )
 
@@ -166,55 +175,122 @@ config_setting(
 # TODO(jhseu): Enable on other platforms other than Linux.
 config_setting(
     name = "with_jemalloc_linux_x86_64",
-    values = {
-        "cpu": "k8",
-        "define": "with_jemalloc=true",
-    },
+    define_values = {"with_jemalloc": "true"},
+    values = {"cpu": "k8"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_jemalloc_linux_ppc64le",
-    values = {
-        "cpu": "ppc",
-        "define": "with_jemalloc=true",
-    },
+    define_values = {"with_jemalloc": "true"},
+    values = {"cpu": "ppc"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_default_optimizations",
+    define_values = {"with_default_optimizations": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_gcp_support",
-    values = {"define": "with_gcp_support=true"},
+    define_values = {"with_gcp_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_hdfs_support",
-    values = {"define": "with_hdfs_support=true"},
+    define_values = {"with_hdfs_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_s3_support",
-    values = {"define": "with_s3_support=true"},
+    define_values = {"with_s3_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
+# Crosses between platforms and file system libraries not supported on those
+# platforms due to limitations in nested select() statements.
+config_setting(
+    name = "with_gcp_support_windows_override",
+    define_values = {"with_gcp_support": "true"},
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_hdfs_support_windows_override",
+    define_values = {"with_hdfs_support": "true"},
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_s3_support_windows_override",
+    define_values = {"with_s3_support": "true"},
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_gcp_support_android_override",
+    define_values = {"with_gcp_support": "true"},
+    values = {"crosstool_top": "//external:android/crosstool"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_hdfs_support_android_override",
+    define_values = {"with_hdfs_support": "true"},
+    values = {"crosstool_top": "//external:android/crosstool"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_s3_support_android_override",
+    define_values = {"with_s3_support": "true"},
+    values = {"crosstool_top": "//external:android/crosstool"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_gcp_support_ios_override",
+    define_values = {"with_gcp_support": "true"},
+    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_hdfs_support_ios_override",
+    define_values = {"with_hdfs_support": "true"},
+    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "with_s3_support_ios_override",
+    define_values = {"with_s3_support": "true"},
+    values = {"crosstool_top": "//tools/osx/crosstool:crosstool"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_xla_support",
-    values = {"define": "with_xla_support=true"},
+    define_values = {"with_xla_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_gdr_support",
-    values = {"define": "with_gdr_support=true"},
+    define_values = {"with_gdr_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "with_verbs_support",
-    values = {"define": "with_verbs_support=true"},
+    define_values = {"with_verbs_support": "true"},
     visibility = ["//visibility:public"],
 )
 
@@ -291,6 +367,7 @@ config_setting(
 package_group(
     name = "internal",
     packages = [
+        "//learning/meta_rank/...",
         "//tensorflow/...",
         "//tensorflow_fold/llgtm/...",
     ],
@@ -336,6 +413,7 @@ filegroup(
         "//tensorflow/compiler/tf2xla:all_files",
         "//tensorflow/compiler/tf2xla/cc:all_files",
         "//tensorflow/compiler/tf2xla/kernels:all_files",
+        "//tensorflow/compiler/tf2xla/lib:all_files",
         "//tensorflow/compiler/tf2xla/ops:all_files",
         "//tensorflow/compiler/xla:all_files",
         "//tensorflow/compiler/xla/client:all_files",
@@ -408,11 +486,31 @@ filegroup(
         "//tensorflow/contrib/learn/python/learn/datasets:all_files",
         "//tensorflow/contrib/linalg:all_files",
         "//tensorflow/contrib/linear_optimizer:all_files",
+        "//tensorflow/contrib/lite:all_files",
+        "//tensorflow/contrib/lite/java:all_files",
+        "//tensorflow/contrib/lite/java/demo/app/src/main:all_files",
+        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:all_files",
+        "//tensorflow/contrib/lite/java/src/main/native:all_files",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:all_files",
+        "//tensorflow/contrib/lite/kernels:all_files",
+        "//tensorflow/contrib/lite/kernels/internal:all_files",
+        "//tensorflow/contrib/lite/models/smartreply:all_files",
+        "//tensorflow/contrib/lite/nnapi:all_files",
+        "//tensorflow/contrib/lite/python:all_files",
+        "//tensorflow/contrib/lite/schema:all_files",
+        "//tensorflow/contrib/lite/testing:all_files",
+        "//tensorflow/contrib/lite/toco:all_files",
+        "//tensorflow/contrib/lite/toco/graph_transformations/tests:all_files",
+        "//tensorflow/contrib/lite/toco/python:all_files",
+        "//tensorflow/contrib/lite/toco/tensorflow_graph_matching:all_files",
+        "//tensorflow/contrib/lite/toco/tflite:all_files",
+        "//tensorflow/contrib/lite/tools:all_files",
         "//tensorflow/contrib/lookup:all_files",
         "//tensorflow/contrib/losses:all_files",
         "//tensorflow/contrib/makefile:all_files",
         "//tensorflow/contrib/meta_graph_transform:all_files",
         "//tensorflow/contrib/metrics:all_files",
+        "//tensorflow/contrib/model_pruning:all_files",
         "//tensorflow/contrib/mpi_collectives:all_files",
         "//tensorflow/contrib/ndlstm:all_files",
         "//tensorflow/contrib/nearest_neighbor:all_files",
@@ -456,6 +554,7 @@ filegroup(
         "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:all_files",
         "//tensorflow/contrib/tpu:all_files",
         "//tensorflow/contrib/tpu/profiler:all_files",
+        "//tensorflow/contrib/tpu/proto:all_files",
         "//tensorflow/contrib/training:all_files",
         "//tensorflow/contrib/util:all_files",
         "//tensorflow/contrib/verbs:all_files",
@@ -503,6 +602,7 @@ filegroup(
         "//tensorflow/java/src/main/native:all_files",
         "//tensorflow/python:all_files",
         "//tensorflow/python/data:all_files",
+        "//tensorflow/python/data/kernel_tests:all_files",
         "//tensorflow/python/data/ops:all_files",
         "//tensorflow/python/data/util:all_files",
         "//tensorflow/python/debug:all_files",
@@ -539,6 +639,7 @@ filegroup(
         "//tensorflow/tools/test:all_files",
         "//tensorflow/user_ops:all_files",
         "//third_party/hadoop:all_files",
+        "//third_party/mpi:all_files",
         "//third_party/sycl:all_files",
         "//third_party/sycl/sycl:all_files",
     ],
@@ -669,3 +770,10 @@ tf_cc_shared_object(
         "//tensorflow/core:tensorflow",
     ],
 )
+
+exports_files(
+    [
+        "tf_version_script.lds",
+        "tf_exported_symbols.lds",
+    ],
+)
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 6dd1b999102d0135720b6ab3a43cbe61255acbc1..8a85eba5fc439af59144d3e8b869bf16b9462456 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -383,12 +383,11 @@ void TF_Reset_Helper(const TF_SessionOptions* opt, const char** containers,
 // be less than the total node count.
 Status ValidateNoCycles(const Graph& g) {
   // TODO(nolivia): check this on a subset of the graph instead of all of it.
-  int total_num_nodes = g.num_node_ids();
   // A node is ready when all of its inputs have been visited.
   std::vector<const Node*> ready;
-  std::vector<int> pending_count(total_num_nodes, 0);
+  std::vector<int> pending_count(g.num_node_ids(), 0);
 
-  for (int i = 0; i < total_num_nodes; ++i) {
+  for (int i = 0; i < g.num_node_ids(); ++i) {
     const Node* n = g.FindNodeId(i);
     if (n == nullptr) continue;
     pending_count[i] = n->in_edges().size();
@@ -421,7 +420,7 @@ Status ValidateNoCycles(const Graph& g) {
     }
   }
 
-  if (processed < total_num_nodes) {
+  if (processed < g.num_nodes()) {
     std::vector<string> nodes_in_cycle;
     for (int i = 0; i < pending_count.size() && nodes_in_cycle.size() < 3;
          ++i) {
@@ -430,7 +429,7 @@ Status ValidateNoCycles(const Graph& g) {
       }
     }
     return errors::InvalidArgument(
-        "Graph is invalid, contains a cycle with ", total_num_nodes - processed,
+        "Graph is invalid, contains a cycle with ", g.num_nodes() - processed,
         " nodes, including: ", str_util::Join(nodes_in_cycle, ", "));
   }
   return Status::OK();
@@ -580,6 +579,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
       status->status = InvalidArgument(
           "invalid string tensor encoding (string #", i, " of ",
           srcarray.size(), "): ", status->status.error_message());
+      delete[] base;
       return nullptr;
     }
     dst += consumed;
@@ -589,6 +589,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
     status->status = InvalidArgument(
         "invalid string tensor encoding (decoded ", (dst - base),
         " bytes, but the tensor is encoded in ", size, " bytes");
+    delete[] base;
     return nullptr;
   }
 
@@ -625,6 +626,23 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
   return Status::OK();
 }
 
+void RecordMutation(TF_Graph* graph, const TF_Operation& op,
+                    const char* mutation_type)
+    EXCLUSIVE_LOCKS_REQUIRED(graph->mu) {
+  // If any session has already run this node_id, mark this session as
+  // unrunnable.
+  for (auto it : graph->sessions) {
+    if (it.first->last_num_graph_nodes > op.node.id()) {
+      it.second = FailedPrecondition(
+          "Operation '", op.node.DebugString(), "' was changed by ",
+          mutation_type,
+          " after it was run by a session. Nodes can be mutated "
+          "only before they are executed by a session. Either don't modify "
+          "nodes after running them or create a new session.");
+    }
+  }
+}
+
 // Helpers for loading a TensorFlow plugin (a .so file).
 Status LoadLibrary(const char* library_filename, void** result,
                    const void** buf, size_t* len);
@@ -890,8 +908,8 @@ const tensorflow::AttrValue* GetAttrValue(TF_Operation* oper,
                                           TF_Status* status) {
   const tensorflow::AttrValue* attr = oper->node.attrs().Find(attr_name);
   if (attr == nullptr) {
-    status->status =
-        InvalidArgument("Operation has no attr named '", attr_name, "'.");
+    status->status = InvalidArgument("Operation '", oper->node.name(),
+                                     "' has no attr named '", attr_name, "'.");
   }
   return attr;
 }
@@ -939,13 +957,17 @@ void TF_GraphSetTensorShape(TF_Graph* graph, TF_Output output,
     return;
   }
 
-  std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
-  dim_vec.reserve(num_dims);
-  for (int i = 0; i < num_dims; ++i) {
-    dim_vec.push_back(ic->MakeDim(dims[i]));
+  tensorflow::shape_inference::ShapeHandle new_shape;
+  if (num_dims != -1) {
+    std::vector<tensorflow::shape_inference::DimensionHandle> dim_vec;
+    dim_vec.reserve(num_dims);
+    for (int i = 0; i < num_dims; ++i) {
+      dim_vec.push_back(ic->MakeDim(dims[i]));
+    }
+    new_shape = ic->MakeShape(dim_vec);
+  } else {
+    new_shape = ic->UnknownShape();
   }
-
-  tensorflow::shape_inference::ShapeHandle new_shape = ic->MakeShape(dim_vec);
   status->status = graph->refiner.SetShape(node, output.index, new_shape);
 }
 
@@ -1741,7 +1763,6 @@ void TF_OperationToNodeDef(TF_Operation* oper, TF_Buffer* output_node_def,
 TF_Graph::TF_Graph()
     : graph(tensorflow::OpRegistry::Global()),
       refiner(graph.versions().producer(), graph.op_registry()),
-      num_sessions(0),
       delete_requested(false),
       parent(nullptr),
       parent_inputs(nullptr) {}
@@ -1751,7 +1772,7 @@ TF_Graph* TF_NewGraph() { return new TF_Graph; }
 void TF_DeleteGraph(TF_Graph* g) {
   g->mu.lock();
   g->delete_requested = true;
-  const bool del = g->num_sessions == 0;
+  const bool del = g->sessions.empty();
   g->mu.unlock();
   if (del) delete g;
 }
@@ -1831,6 +1852,16 @@ void TF_ImportGraphDefOptionsSetPrefix(TF_ImportGraphDefOptions* opts,
   opts->opts.prefix = prefix;
 }
 
+void TF_ImportGraphDefOptionsSetUniquifyNames(TF_ImportGraphDefOptions* opts,
+                                              unsigned char uniquify_names) {
+  opts->opts.uniquify_names = uniquify_names;
+}
+
+void TF_ImportGraphDefOptionsSetUniquifyPrefix(TF_ImportGraphDefOptions* opts,
+                                               unsigned char uniquify_prefix) {
+  opts->opts.uniquify_prefix = uniquify_prefix;
+}
+
 void TF_ImportGraphDefOptionsAddInputMapping(TF_ImportGraphDefOptions* opts,
                                              const char* src_name,
                                              int src_index, TF_Output dst) {
@@ -2321,11 +2352,12 @@ TF_Session* TF_NewSession(TF_Graph* graph, const TF_SessionOptions* opt,
   Session* session;
   status->status = NewSession(opt->options, &session);
   if (status->status.ok()) {
+    TF_Session* new_session = new TF_Session(session, graph);
     if (graph != nullptr) {
       mutex_lock l(graph->mu);
-      graph->num_sessions += 1;
+      graph->sessions[new_session] = Status::OK();
     }
-    return new TF_Session(session, graph);
+    return new_session;
   } else {
     DCHECK_EQ(nullptr, session);
     return nullptr;
@@ -2389,7 +2421,7 @@ TF_Session* TF_LoadSessionFromSavedModel(
 
   TF_Session* session = new TF_Session(bundle.session.release(), graph);
 
-  graph->num_sessions += 1;
+  graph->sessions[session] = Status::OK();
   session->last_num_graph_nodes = graph->graph.num_node_ids();
   return session;
 #endif  // __ANDROID__
@@ -2404,8 +2436,8 @@ void TF_DeleteSession(TF_Session* s, TF_Status* status) {
   TF_Graph* const graph = s->graph;
   if (graph != nullptr) {
     graph->mu.lock();
-    graph->num_sessions -= 1;
-    const bool del = graph->delete_requested && graph->num_sessions == 0;
+    graph->sessions.erase(s);
+    const bool del = graph->delete_requested && graph->sessions.empty();
     graph->mu.unlock();
     if (del) delete graph;
   }
@@ -2421,6 +2453,13 @@ static bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status) {
     mutex_lock session_lock(session->mu);
     session->graph->mu.lock();
     const Graph& graph = session->graph->graph;
+
+    status->status = session->graph->sessions[session];
+    if (!status->status.ok()) {
+      session->graph->mu.unlock();
+      return false;
+    }
+
     const auto num_nodes = graph.num_node_ids();
     if (session->last_num_graph_nodes < num_nodes) {
       status->status = tensorflow::ValidateNoCycles(session->graph->graph);
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index bb569d67fcbcec29e9494236abd79b3e40db91cd..df7fe222b130d2fd58915be112ff08a29d27639a 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -889,6 +889,20 @@ TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
 TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetPrefix(
     TF_ImportGraphDefOptions* opts, const char* prefix);
 
+// Set whether to uniquify imported operation names. If true, imported operation
+// names will be modified if their name already exists in the graph. If false,
+// conflicting names will be treated as an error. Note that this option has no
+// effect if a prefix is set, since the prefix will guarantee all names are
+// unique. Defaults to false.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyNames(
+    TF_ImportGraphDefOptions* opts, unsigned char uniquify_names);
+
+// If true, the specified prefix will be modified if it already exists as an
+// operation name or prefix in the graph. If false, a conflicting prefix will be
+// treated as an error. This option has no effect if no prefix is specified.
+TF_CAPI_EXPORT extern void TF_ImportGraphDefOptionsSetUniquifyPrefix(
+    TF_ImportGraphDefOptions* opts, unsigned char uniquify_prefix);
+
 // Set any imported nodes with input `src_name:src_index` to have that input
 // replaced with `dst`. `src_name` refers to a node in the graph to be imported,
 // `dst` references a node already existing in the graph being imported into.
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index dcb818b88b6fca460852beb6e948d2eb6964f663..d60d1de315ed37a327bd036ddb914a3c32413f65 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -68,7 +68,7 @@ class NodeNameMapping {
   // This is a superset of values in name_mapping_.
   std::unordered_set<string> used_names_;
   // Mapping from original node name from the graph to the normalized
-  // and uniqified version of it.
+  // and uniquified version of it.
   std::unordered_map<string, string> name_mapping_;
 };
 
@@ -226,12 +226,17 @@ Status FillFunctionBody(
       }
       node_def->add_input(strings::StrCat("^", normalized));
     }
+
+    // A function is stateful if any of its nodes are stateful.
+    if (node->op_def().is_stateful()) {
+      fdef->mutable_signature()->set_is_stateful(true);
+    }
   }
   return Status::OK();
 }
 
 // Graph to FunctionDef conversion. This code is closely modeled on the Python
-// code in third_party/tensorflow/python/framework/function.py.
+// code in tensorflow/python/framework/function.py.
 Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           bool append_hash_to_fn_name,
                           const std::vector<const Node*>& body_nodes,
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index d5580b658992413ae6f9cb79ef88751ee28ce465..4ffc9d69312eae6c683b5701ceb44c13c7e61c5e 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -1482,6 +1482,51 @@ TEST_F(CApiFunctionTest, GetOpDef) {
   EXPECT_EQ(op_def.name(), func_name_);
   EXPECT_EQ(op_def.input_arg_size(), 1);
   EXPECT_EQ(op_def.output_arg_size(), 1);
+  EXPECT_FALSE(op_def.is_stateful());
+
+  TF_DeleteBuffer(buffer);
+}
+
+void DefineStatefulFunction(const char* name, TF_Function** func) {
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
+      TF_NewGraph(), TF_DeleteGraph);
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
+                                                           TF_DeleteStatus);
+
+  TF_Tensor* tensor_shape = Int32Tensor({37, 1});
+  TF_Operation* shape = Const(tensor_shape, func_graph.get(), s.get(), "shape");
+  TF_Operation* random =
+      RandomUniform(shape, TF_FLOAT, func_graph.get(), s.get());
+
+  TF_Output inputs[] = {};
+  TF_Output outputs[] = {{random, 0}};
+  *func = TF_GraphToFunction(func_graph.get(), name, /*append_hash=*/0, -1,
+                             /*opers=*/nullptr, 0, inputs, 1, outputs,
+                             /*output_names=*/nullptr,
+                             /*opts=*/nullptr, "", s.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+  ASSERT_NE(*func, nullptr);
+  TF_DeleteTensor(tensor_shape);
+}
+
+TEST_F(CApiFunctionTest, StatefulOpDef) {
+  DefineStatefulFunction(func_name_, &func_);
+  TF_GraphCopyFunction(host_graph_, func_, nullptr, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Test we can retrieve function OpDef from graph
+  TF_Buffer* buffer = TF_NewBuffer();
+  TF_GraphGetOpDef(host_graph_, func_name_, buffer, s_);
+  ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_);
+
+  // Sanity check returned OpDef
+  string data(static_cast<const char*>(buffer->data), buffer->length);
+  OpDef op_def;
+  op_def.ParseFromString(data);
+  EXPECT_EQ(op_def.name(), func_name_);
+  EXPECT_EQ(op_def.input_arg_size(), 0);
+  EXPECT_EQ(op_def.output_arg_size(), 1);
+  EXPECT_TRUE(op_def.is_stateful());
 
   TF_DeleteBuffer(buffer);
 }
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index bb04e01beec931a8ea66d0855eec9625d3a6a5ab..aac333d9e29e60148a271fba95fadde708c7c370 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -81,12 +81,20 @@ struct TF_Graph {
   std::unordered_map<tensorflow::string, tensorflow::Node*> name_map
       GUARDED_BY(mu);
 
-  // TF_Graph may only / must be deleted when
-  //   num_sessions == 0 && delete_requested == true
-
-  // num_sessions incremented by TF_NewSession, and decremented by
+  // The keys of this map are all the active sessions using this graph.
+  // Each value is the current "runnability" status of the corresponding
+  // session. Under normal conditions all statuses are Status::OK(), but
+  // if some operation is mutated after it was run by a session (this
+  // is detected in RecordMutation function), that session is no longer
+  // safe to run. Its status will contain the error that will be returned
+  // to the user, should she try running this session.
+  //
+  // Sessions are added to this map in TF_NewSession, and removed in
   // TF_DeleteSession.
-  int num_sessions GUARDED_BY(mu);
+  // TF_Graph may only / must be deleted when
+  //   sessions.size() == 0 && delete_requested == true
+  tensorflow::gtl::FlatMap<TF_Session*, tensorflow::Status> sessions
+      GUARDED_BY(mu);
   bool delete_requested GUARDED_BY(mu);  // set true by TF_DeleteGraph
 
   // Used to link graphs contained in TF_WhileParams to the parent graph that
@@ -167,6 +175,9 @@ TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
 
 Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out);
 
+void RecordMutation(TF_Graph* graph, const TF_Operation& op,
+                    const char* mutation_type);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 05881e619ba232de99e78f315cfa8ab9294e5137..6ec1db8ccfdb713f330b708e604bd4b502ff7202 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -287,6 +287,13 @@ TEST(CAPI, SetShape) {
   ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
   EXPECT_EQ(-1, num_dims);
 
+  // Set the shape to be unknown, expect no change.
+  TF_GraphSetTensorShape(graph, feed_out_0, /*dims=*/nullptr, -1, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  num_dims = TF_GraphGetTensorNumDims(graph, feed_out_0, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(-1, num_dims);
+
   // Set the shape to be 2 x Unknown
   int64_t dims[] = {2, -1};
   TF_GraphSetTensorShape(graph, feed_out_0, dims, 2, s);
@@ -315,7 +322,17 @@ TEST(CAPI, SetShape) {
   EXPECT_EQ(dims[0], returned_dims[0]);
   EXPECT_EQ(dims[1], returned_dims[1]);
 
-  // Try to set 'unknown' on the shape and see that
+  // Try to set 'unknown' with unknown rank on the shape and see that
+  // it doesn't change.
+  TF_GraphSetTensorShape(graph, feed_out_0, /*dims=*/nullptr, -1, s);
+  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  TF_GraphGetTensorShape(graph, feed_out_0, returned_dims, num_dims, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  EXPECT_EQ(2, num_dims);
+  EXPECT_EQ(2, returned_dims[0]);
+  EXPECT_EQ(3, returned_dims[1]);
+
+  // Try to set 'unknown' with same rank on the shape and see that
   // it doesn't change.
   dims[0] = -1;
   dims[1] = -1;
@@ -383,7 +400,7 @@ TEST(CAPI, Graph) {
   EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s));
 
   ASSERT_FALSE(GetAttrValue(feed, "missing", &attr_value, s));
-  EXPECT_EQ(string("Operation has no attr named 'missing'."),
+  EXPECT_EQ(string("Operation 'feed' has no attr named 'missing'."),
             string(TF_Message(s)));
 
   // Make a constant oper with the scalar "3".
@@ -1054,7 +1071,7 @@ class CApiColocationTest : public ::testing::Test {
         TF_OperationGetAttrMetadata(op, tensorflow::kColocationAttrName, s_);
     if (expected.empty()) {
       ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(s_)) << TF_Message(s_);
-      EXPECT_EQ(std::string("Operation has no attr named '_class'."),
+      EXPECT_EQ(std::string("Operation 'add' has no attr named '_class'."),
                 std::string(TF_Message(s_)));
       return;
     }
diff --git a/tensorflow/c/c_test_util.cc b/tensorflow/c/c_test_util.cc
index c291a2e440a8515e968b0ce0395b289080f04e8b..37439ff0beac5a5220460465e954b6c093ee1ba9 100644
--- a/tensorflow/c/c_test_util.cc
+++ b/tensorflow/c/c_test_util.cc
@@ -193,6 +193,15 @@ TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph,
   return TF_FinishOperation(desc, s);
 }
 
+TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
+                            TF_Graph* graph, TF_Status* s) {
+  TF_OperationDescription* desc =
+      TF_NewOperation(graph, "RandomUniform", "random_uniform");
+  TF_AddInput(desc, {shape, 0});
+  TF_SetAttrType(desc, "dtype", dtype);
+  return TF_FinishOperation(desc, s);
+}
+
 void Split3Helper(TF_Operation* input, TF_Graph* graph, TF_Status* s,
                   const char* name, TF_Operation** op) {
   TF_Operation* zero = ScalarConst(
diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h
index d54733749248fa32c39d88bb0281d329dd50c7bd..96a93afef3e22d352fdbe911c3a5b01c867c6033 100644
--- a/tensorflow/c/c_test_util.h
+++ b/tensorflow/c/c_test_util.h
@@ -74,6 +74,9 @@ TF_Operation* Neg(TF_Operation* n, TF_Graph* graph, TF_Status* s,
 
 TF_Operation* LessThan(TF_Output l, TF_Output r, TF_Graph* graph, TF_Status* s);
 
+TF_Operation* RandomUniform(TF_Operation* shape, TF_DataType dtype,
+                            TF_Graph* graph, TF_Status* s);
+
 // Split `input` along the first dimention into 3 tensors
 TF_Operation* Split3(TF_Operation* input, TF_Graph* graph, TF_Status* s,
                      const char* name = "split3");
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index c77896b80b478cd34d3502e1061a7e76204ba021..d533758e360bc44a6f52f57eaae5b222e0482860 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -39,6 +39,7 @@ tf_cuda_library(
 tf_cuda_library(
     name = "c_api_internal",
     hdrs = ["c_api_internal.h"],
+    visibility = ["//tensorflow:internal"],
     deps = [
         ":c_api",
         ":runtime",
@@ -105,7 +106,6 @@ tf_cc_test(
 
 cc_library(
     name = "tape",
-    srcs = ["tape.cc"],
     hdrs = ["tape.h"],
     visibility = ["//tensorflow:internal"],
     deps = [
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 8359de62b7ff690fec9f6a0e3280f947c62f8b6e..706c89536db019c7f7389af576815746b2425520 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -571,6 +571,12 @@ void TFE_ContextAddFunctionDef(TFE_Context* ctx,
   status->status = ctx->func_lib_def.AddFunctionDef(function_def);
 }
 
+void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
+                            TF_Status* status) {
+  tensorflow::mutex_lock l(ctx->functions_mu);
+  status->status = ctx->func_lib_def.AddFunctionDef(function->fdef);
+}
+
 }  // extern "C"
 
 TFE_TensorHandle* TFE_NewTensorHandle(const tensorflow::Tensor& t) {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 865580c5f3a823d9cf49fe460bd007e3b3b88767..ca105962df0d6655946304159937621022e7fcba 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -200,6 +200,13 @@ TF_CAPI_EXPORT extern void TFE_ContextAddFunctionDef(TFE_Context* ctx,
                                                      const char* serialized_function_def,
                                                      size_t size, TF_Status* status);
 
+// Adds a function (created from TF_GraphToFunction or
+// TF_FunctionImportFunctionDef) to the context, allowing it to be executed with
+// TFE_Execute by creating an op with the same name as the function.
+TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx,
+                                                  TF_Function* function,
+                                                  TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 4af91b8853d0e85570bad136752a9d0a04b87da5..3fe0b7efa11bc619ed98bf9a1634ade5b6ed0a7c 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -295,6 +295,67 @@ TEST(CAPI, Execute) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, Function) {
+  // First create a simple identity function.
+  TF_Graph* function_graph = TF_NewGraph();
+  TF_OperationDescription* arg_descr =
+      TF_NewOperation(function_graph, "Placeholder", "arg");
+  TF_SetAttrType(arg_descr, "dtype", TF_INT32);
+  TF_Status* status = TF_NewStatus();
+  TF_Operation* arg = TF_FinishOperation(arg_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_OperationDescription* id_descr =
+      TF_NewOperation(function_graph, "Identity", "id");
+  TF_SetAttrType(id_descr, "T", TF_INT32);
+  TF_AddInput(id_descr, {arg, 0});
+  TF_Operation* id = TF_FinishOperation(id_descr, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_Output input{arg, 0};
+  TF_Output output{id, 0};
+  TF_Function* fn =
+      TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1,
+                         &output, nullptr, nullptr, "test", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteGraph(function_graph);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+  TFE_ContextAddFunction(ctx, fn, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteFunction(fn);
+
+  TF_Tensor* t =
+      TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32));
+  *reinterpret_cast<tensorflow::int32*>(TF_TensorData(t)) = 42;
+  TFE_TensorHandle* h = TFE_NewTensorHandle(t, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteTensor(t);
+
+  TFE_Op* op = TFE_NewOp(ctx, "ident", status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TFE_OpAddInput(op, h, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+
+  std::vector<TFE_TensorHandle*> result;
+  result.push_back(nullptr);
+  int num_retvals = 1;
+  TFE_Execute(op, result.data(), &num_retvals, status);
+  TFE_DeleteOp(op);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  ASSERT_EQ(num_retvals, 1);
+
+  TF_Tensor* r = TFE_TensorHandleResolve(result[0], status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  EXPECT_EQ(*reinterpret_cast<tensorflow::int32*>(TF_TensorData(r)), 42);
+  TFE_DeleteTensorHandle(h);
+  TF_DeleteTensor(r);
+  TFE_DeleteTensorHandle(result[0]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+  TF_DeleteStatus(status);
+}
+
 string MatMulFunction() {
   tensorflow::FunctionDef def;
   CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
diff --git a/tensorflow/c/eager/tape.cc b/tensorflow/c/eager/tape.cc
deleted file mode 100644
index 464612a81ebda428f5582b6927f3a3b00a5aa6f5..0000000000000000000000000000000000000000
--- a/tensorflow/c/eager/tape.cc
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/c/eager/tape.h"
-
-namespace tensorflow {
-namespace eager {
-
-bool GradientTape::ShouldRecord(gtl::ArraySlice<int64> tensor_ids) {
-  for (int64 i : tensor_ids) {
-    if (tensor_tape_.find(i) != tensor_tape_.end()) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void GradientTape::Watch(int64 tensor_id) {
-  tensor_tape_.emplace(tensor_id, -1);
-}
-
-void GradientTape::RecordOperation(
-    const string& op_type, gtl::ArraySlice<TapeTensor> output_tensors,
-    gtl::ArraySlice<int64> input_tensor_id, void* backward_function,
-    const std::function<void()>& backward_function_deleter) {
-  if (!ShouldRecord(input_tensor_id)) {
-    backward_function_deleter();
-    return;
-  }
-  std::vector<int64> ids;
-  ids.reserve(input_tensor_id.size());
-  for (int64 i : input_tensor_id) {
-    tensor_usage_[i]++;
-    ids.push_back(i);
-  }
-  const int64 op_id = next_op_id_++;
-  std::vector<TapeTensor> tensors;
-  tensors.reserve(output_tensors.size());
-  for (const TapeTensor& o : output_tensors) {
-    // Note: the tensor can have already been watched and hence be in the tape,
-    // so we cannot check that we're inserting it here.
-    tensor_tape_[o.id] = op_id;
-    tensor_usage_[o.id] = 1;
-    tensors.push_back(o);
-  }
-  op_tape_[op_id] = OpTapeEntry{op_type, tensors, ids, backward_function,
-                                backward_function_deleter};
-}
-
-void GradientTape::DeleteTrace(int64 tensor_id) {
-  auto it = tensor_usage_.find(tensor_id);
-  if (it == tensor_usage_.end()) {
-    return;
-  }
-  it->second--;
-  if (it->second != 0) {
-    return;
-  }
-  tensor_usage_.erase(it);
-  auto tensor_op_it = tensor_tape_.find(tensor_id);
-  if (tensor_op_it == tensor_tape_.end()) {
-    return;
-  }
-  const int64 op_id = tensor_op_it->second;
-  if (op_id == -1) {
-    // Do not delete watched tensors.
-    return;
-  }
-  tensor_tape_.erase(tensor_op_it);
-  auto op_it = op_tape_.find(op_id);
-  CHECK(op_it != op_tape_.end());
-  for (const auto& output : op_it->second.output_tensor_info) {
-    if (tensor_usage_.find(output.id) != tensor_usage_.end()) {
-      // Found a usage for an output, so cannot delete the op.
-      return;
-    }
-  }
-  for (int64 id : op_it->second.input_tensor_id) {
-    DeleteTrace(id);
-  }
-  op_it->second.backward_function_deleter();
-  op_tape_.erase(op_it);
-}
-
-std::pair<TensorTape, OpTape> GradientTape::Export() {
-  return {std::move(tensor_tape_), std::move(op_tape_)};
-}
-
-}  // namespace eager
-}  // namespace tensorflow
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index df51f300eb61d54cb1e06d5a58a9b10e834f73c4..20ed037c52f34bc7a8aa39243c0b85e58fee1d46 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -19,6 +19,7 @@ limitations under the License.
 // maintains the data structures required to do so.
 
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
@@ -36,13 +37,14 @@ struct TapeTensor {
 };
 
 // Represents an entry in the tape.
+template <typename BackwardFunction>
 struct OpTapeEntry {
   string op_type;
   std::vector<TapeTensor> output_tensor_info;
   std::vector<int64> input_tensor_id;
 
   // TODO(apassos) consider narrowing down this interface.
-  void* backward_function;
+  BackwardFunction* backward_function;
 
   // Should be called before deleting the backward function. TODO(apassos) use
   // unique_ptrs to ensure this happens.
@@ -55,13 +57,78 @@ struct OpTapeEntry {
 using TensorTape = std::unordered_map<int64, int64>;
 
 // Map from operation-id to tape entry.
-using OpTape = std::unordered_map<int64, OpTapeEntry>;
+template <typename BackwardFunction>
+using OpTape = std::unordered_map<int64, OpTapeEntry<BackwardFunction>>;
+
+// Operations the tape needs to perform on tensors to do backpropagation. Named
+// "vspace" because a subset of these are related to a vector space, such as
+// adding gradients, getting zeroes, etc. Currently cannot be implemented
+// without using tensorflow python code, hence left unspecified here.
+//
+// Gradient is the type returned by gradient functions. In Python TF it's either
+// Tensor or IndexedSlices or None, which here we map to nullptr. Gradients need
+// to allow their size to be computed and they need to be passable to a backward
+// function and deleted (as the backprop code creates lots of gradients the user
+// is not interested in).
+//
+// BackwardFunction needs to be a closure which stores intermediate activations
+// from the forward computation and calls a vector-jacobian product function
+// (also known as adjoint function) to compute, given downstream gradients,
+// upstream gradients.
+//
+// TODO(apassos) provide concrete template instantiations for TFE_TensorHandle
+// specialization, which is blocked by quite a few things needing to loop back
+// into python now.
+template <typename Gradient, typename BackwardFunction>
+class VSpace {
+ public:
+  virtual ~VSpace() {}
+
+  // Returns the number of elements in the gradient tensor.
+  virtual int64 NumElements(Gradient* tensor) const = 0;
+
+  // Consumes references to the tensors in the gradient_tensors list and returns
+  // a tensor with the result.
+  virtual Gradient* AggregateGradients(
+      gtl::ArraySlice<Gradient*> gradient_tensors) const = 0;
+
+  // Returns a tensor of the right shape and dtype filled with zeros.
+  virtual Gradient* Zeros(TensorShape shape, DataType dtype) const = 0;
+
+  // Returns a Tensor which is filled with ones and like the input.
+  virtual Gradient* Ones(TensorShape shape, DataType dtype) const = 0;
+
+  // Calls the passed-in backward function.
+  virtual Status CallBackwardFunction(
+      BackwardFunction* backward_function,
+      gtl::ArraySlice<Gradient*> output_gradients,
+      std::vector<Gradient*>* result) const = 0;
+
+  // Deletes the input tensor.
+  virtual void DeleteGradient(Gradient* gradient) const = 0;
+
+  // Lets this VSpace know that it can release resources held by the
+  // `backward_function`, It will not be called again.
+  // `backward_function` must not be null.
+  virtual void ReleaseBackwardFunction(
+      BackwardFunction* backward_function) const = 0;
+};
 
 // Traces the execution of operations, doing eager garbage collection, and
 // exporting a full trace so other code can do backpropagation. Not thread-safe.
+template <typename Gradient, typename BackwardFunction>
 class GradientTape {
  public:
-  GradientTape() {}
+  // If `persistent` is true, GradientTape will not eagerly delete backward
+  // functions (and hence the tensors they keep alive). Instead, everything
+  // is deleted in ~GradientTape. Persistent GradientTapes are useful when
+  // users want to compute multiple gradients over the same tape.
+  GradientTape(bool persistent) : persistent_(persistent) {}
+  ~GradientTape() {
+    for (const auto& pair : op_tape_) {
+      pair.second.backward_function_deleter();
+    }
+  }
 
   bool ShouldRecord(gtl::ArraySlice<int64> tensor_ids);
 
@@ -70,26 +137,486 @@ class GradientTape {
   void RecordOperation(const string& op_type,
                        gtl::ArraySlice<TapeTensor> output_tensors,
                        gtl::ArraySlice<int64> input_tensor_id,
-                       void* backward_function,
+                       BackwardFunction* backward_function,
                        const std::function<void()>& backward_function_deleter);
 
   void DeleteTrace(int64 tensor_id);
 
-  // Note: it is only valid to call Export once per tape, and after calling
-  // export the tape is no longer valid (i.e. calls to ShouldRecord, Watch,
-  // Record, and Delete have undefined behavior).
-  std::pair<TensorTape, OpTape> Export();
+  // Consumes the internal state of the tape (so cannot be called more than
+  // once) and produces the gradient of the target tensors with respect to the
+  // source tensors. The output gradients are used if not empty and not
+  // null. The result is populated with one tensor per target element.
+  Status ComputeGradient(const VSpace<Gradient, BackwardFunction>& vspace,
+                         gtl::ArraySlice<int64> target_tensor_ids,
+                         gtl::ArraySlice<int64> source_tensor_id,
+                         gtl::ArraySlice<Gradient*> output_gradients,
+                         std::vector<Gradient*>* result);
 
  private:
   TensorTape tensor_tape_;
-  OpTape op_tape_;
+  OpTape<BackwardFunction> op_tape_;
   int64 next_op_id_{0};
 
   // Map from tensor id to number of remaining usages (i.e. how many entries in
   // the tape refer to it); to aid in tape garbage collection.
   std::unordered_map<int64, int64> tensor_usage_;
+
+  // If false, all activations are deleted in the first call to ComputeGradient.
+  // Else, only when this is destructed.
+  bool persistent_;
+};
+
+// Template instantiations here
+
+template <typename Gradient, typename BackwardFunction>
+bool GradientTape<Gradient, BackwardFunction>::ShouldRecord(
+    gtl::ArraySlice<int64> tensor_ids) {
+  for (int64 i : tensor_ids) {
+    if (tensor_tape_.find(i) != tensor_tape_.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename Gradient, typename BackwardFunction>
+void GradientTape<Gradient, BackwardFunction>::Watch(int64 tensor_id) {
+  tensor_tape_.emplace(tensor_id, -1);
+}
+
+template <typename Gradient, typename BackwardFunction>
+void GradientTape<Gradient, BackwardFunction>::RecordOperation(
+    const string& op_type, gtl::ArraySlice<TapeTensor> output_tensors,
+    gtl::ArraySlice<int64> input_tensor_id, BackwardFunction* backward_function,
+    const std::function<void()>& backward_function_deleter) {
+  if (!ShouldRecord(input_tensor_id)) {
+    backward_function_deleter();
+    return;
+  }
+  std::vector<int64> ids;
+  ids.reserve(input_tensor_id.size());
+  for (int64 i : input_tensor_id) {
+    tensor_usage_[i]++;
+    ids.push_back(i);
+  }
+  const int64 op_id = next_op_id_++;
+  std::vector<TapeTensor> tensors;
+  tensors.reserve(output_tensors.size());
+  for (const TapeTensor& o : output_tensors) {
+    // Note: the tensor can have already been watched and hence be in the tape,
+    // so we cannot check that we're inserting it here.
+    tensor_tape_[o.id] = op_id;
+    tensor_usage_[o.id] = 1;
+    tensors.push_back(o);
+  }
+  op_tape_[op_id] = OpTapeEntry<BackwardFunction>{
+      op_type, tensors, ids, backward_function, backward_function_deleter};
+}
+
+template <typename Gradient, typename BackwardFunction>
+void GradientTape<Gradient, BackwardFunction>::DeleteTrace(int64 tensor_id) {
+  auto it = tensor_usage_.find(tensor_id);
+  if (it == tensor_usage_.end()) {
+    return;
+  }
+  it->second--;
+  if (it->second != 0) {
+    return;
+  }
+  tensor_usage_.erase(it);
+  auto tensor_op_it = tensor_tape_.find(tensor_id);
+  if (tensor_op_it == tensor_tape_.end()) {
+    return;
+  }
+  const int64 op_id = tensor_op_it->second;
+  if (op_id == -1) {
+    // Do not delete watched tensors.
+    return;
+  }
+  tensor_tape_.erase(tensor_op_it);
+  auto op_it = op_tape_.find(op_id);
+  CHECK(op_it != op_tape_.end());
+  for (const auto& output : op_it->second.output_tensor_info) {
+    if (tensor_usage_.find(output.id) != tensor_usage_.end()) {
+      // Found a usage for an output, so cannot delete the op.
+      return;
+    }
+  }
+  for (int64 id : op_it->second.input_tensor_id) {
+    DeleteTrace(id);
+  }
+  op_it->second.backward_function_deleter();
+  op_tape_.erase(op_it);
+}
+
+// Terminology:
+//
+//  - op: a possibly composite operation, which has an entry in the tape
+//  - target: dy in dx/dy
+//  - source: dx in dx/dy
+//  - tensor: one of the many inputs or outputs of an operation
+//
+// Below here we do the gradient algorithm. It works as follows:
+//
+// First we filter the tape to just the subset of operations we want to
+// differentiate. In the process of doing so we count how many times each Tensor
+// is used as an input to an op (so we know when we're done computing gradients
+// for that Tensor). We also count, for each tape entry, how many of its output
+// Tensors need gradients to be computed (Tensors which are not used do not need
+// any gradients to be computed).
+//
+// Finally, we start a backprop stack with a set of tape entries for which we
+// have all gradients available. This set usually is a subset of the set of
+// targets (not all since targets which have outputs in the tape will not have
+// gradients available initially).
+//
+// Then we repeatedly pop an entry from the stack, run its backprop, and update
+// the gradients of its inputs. Once we have computed all gradients for a single
+// input we can mark this input as done, and this can trigger adding an entry to
+// the stack if all outputs of that entry are now done.
+//
+// When the stack is empty we have gradients for all tensors we're interested
+// in.
+
+namespace {
+
+template <typename BackwardFunction>
+struct BackpropInitialState {
+  OpTape<BackwardFunction> op_tape;
+
+  // Map from tensor ID to how many references still exist for this tensor in
+  // the tape.
+  std::unordered_map<int64, int64> tensor_usage_counts;
+
+  // Maps from op ID to how many output tensors of this op still need to have
+  // their gradients computed.
+  std::unordered_map<int64, int64> op_missing_tensor;
 };
 
+// If `persistent_tape` is true, op_tape is not changed and none of the
+// backwards functions are deleted.
+// If `persistent_tape` is false, op_tape is cleared and backwards functions
+// not needed for gradient computation are deleted. Backwards functions that
+// are needed, are copied and returned in BackpropInitialState.
+template <typename BackwardFunction>
+BackpropInitialState<BackwardFunction> PrepareBackprop(
+    gtl::ArraySlice<int64> target, const TensorTape& tensor_tape,
+    OpTape<BackwardFunction>* op_tape,
+    const std::unordered_set<int64>& sources_set, bool persistent_tape) {
+  std::vector<int64> tensor_stack;
+  tensor_stack.reserve(target.size());
+  for (auto t : target) {
+    tensor_stack.push_back(t);
+  }
+  BackpropInitialState<BackwardFunction> result;
+  while (!tensor_stack.empty()) {
+    int64 tensor_id = tensor_stack.back();
+    tensor_stack.pop_back();
+    auto op_id_it = tensor_tape.find(tensor_id);
+    if (op_id_it == tensor_tape.end()) {
+      continue;
+    }
+    int64 op_id = op_id_it->second;
+    auto op_it = op_tape->find(op_id);
+    auto result_op_it = result.op_tape.find(op_id);
+    if (op_id == -1 || op_it == op_tape->end() ||
+        result_op_it != result.op_tape.end()) {
+      continue;
+    }
+    CHECK(result.op_tape.emplace(op_id, op_it->second).second);
+    for (auto it : op_it->second.input_tensor_id) {
+      auto count_it = result.tensor_usage_counts.find(it);
+      if (count_it != result.tensor_usage_counts.end()) {
+        count_it->second++;
+      } else {
+        result.tensor_usage_counts[it] = 1;
+        if (sources_set.find(it) == sources_set.end() &&
+            tensor_tape.find(it) != tensor_tape.end()) {
+          tensor_stack.push_back(it);
+        }
+      }
+    }
+    if (!persistent_tape) {
+      op_tape->erase(op_it);
+    }
+  }
+  for (auto& pair : result.tensor_usage_counts) {
+    auto it = tensor_tape.find(pair.first);
+    if (it != tensor_tape.end() && it->second != -1) {
+      result.op_missing_tensor[it->second] += 1;
+    }
+  }
+  if (!persistent_tape) {
+    // Call destructors for all unneeded gradient functions and
+    // clear the op_tape. We can clear the tape because ownership of
+    // backward functions that will be used for gradient computation
+    // has been transfered to `result`.
+    for (const auto& op_pair : *op_tape) {
+      op_pair.second.backward_function_deleter();
+    }
+    op_tape->clear();
+  }
+  return result;
+}
+
+template <typename BackwardFunction>
+std::vector<int64> InitialStack(
+    const OpTape<BackwardFunction>& op_tape,
+    const std::unordered_map<int64, int64>& op_missing_tensor) {
+  std::vector<int64> result;
+  for (auto& op_entry : op_tape) {
+    if (op_missing_tensor.find(op_entry.first) == op_missing_tensor.end()) {
+      result.push_back(op_entry.first);
+    }
+  }
+  return result;
+}
+
+template <typename Gradient, typename BackwardFunction>
+Status InitialGradients(
+    const VSpace<Gradient, BackwardFunction>& vspace,
+    gtl::ArraySlice<int64> target_tensor_ids,
+    gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
+    const OpTape<BackwardFunction>& op_tape,
+    const std::unordered_map<int64, int64>& tensor_usage_counts,
+    std::unordered_map<int64, std::vector<Gradient*>>* result) {
+  for (int i = 0; i < target_tensor_ids.size(); ++i) {
+    const int64 id = target_tensor_ids[i];
+    if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) {
+      if (!output_gradients.empty() && output_gradients[i] != nullptr) {
+        // TODO(apassos) figure out how to print debugging information here.
+        return errors::InvalidArgument(
+            "A gradient was provided for a tensor which is used as part of the "
+            "computation.");
+      }
+    } else {
+      if (output_gradients.empty() || output_gradients[i] == nullptr) {
+        auto tensor_it = tensor_tape.find(id);
+        if (tensor_it != tensor_tape.end() && tensor_it->second != -1) {
+          auto op_it = op_tape.find(tensor_it->second);
+          if (op_it == op_tape.end()) {
+            return errors::Internal(
+                "Internal state of the gradient tape is invalid: "
+                "failed to find operation producing a tensor");
+          }
+          bool found = false;
+          for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) {
+            if (op_it->second.output_tensor_info[j].id == id) {
+              found = true;
+              (*result)[id].push_back(
+                  vspace.Ones(op_it->second.output_tensor_info[j].shape,
+                              op_it->second.output_tensor_info[j].dtype));
+              break;
+            }
+          }
+          if (!found) {
+            return errors::Internal(
+                "Internal state of the gradient tape is invalid: "
+                "none of operations outputs match expected tensor");
+          }
+        } else {
+          // No record of the target tensor found on the tape, so no gradient
+          // needs to be computed from it. Do nothing.
+        }
+      } else {
+        (*result)[id].push_back(output_gradients[i]);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+// If over kMinAggregateCount gradients are accumulated and the total
+// memory consumption is over kMinAggregateBytes, do an early aggregation
+// so as to release the gradient tensor to save memory.
+constexpr int kMinAggregateCount = 4;
+constexpr int kMinAggregateBytes = 128 * 1024 * 1024;
+
+template <typename Gradient, typename BackwardFunction>
+Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
+    const VSpace<Gradient, BackwardFunction>& vspace,
+    gtl::ArraySlice<int64> target_tensor_ids,
+    gtl::ArraySlice<int64> source_tensor_ids,
+    gtl::ArraySlice<Gradient*> output_gradients,
+    std::vector<Gradient*>* result) {
+  std::unordered_set<int64> sources_set(source_tensor_ids.begin(),
+                                        source_tensor_ids.end());
+  BackpropInitialState<BackwardFunction> state = PrepareBackprop(
+      target_tensor_ids, tensor_tape_, &op_tape_, sources_set, persistent_);
+  std::vector<int64> op_stack =
+      InitialStack(state.op_tape, state.op_missing_tensor);
+  std::unordered_map<int64, std::vector<Gradient*>> gradients;
+  Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
+                              tensor_tape_, state.op_tape,
+                              state.tensor_usage_counts, &gradients);
+  auto cleanup = [this, &state]() {
+    if (!persistent_) {
+      // Release all backprop functions
+      for (const auto& pair : state.op_tape) {
+        pair.second.backward_function_deleter();
+      }
+    }
+  };
+  if (!s.ok()) {
+    cleanup();
+    return s;
+  }
+  std::unordered_map<int64, int64> gradients_size;
+  // TODO(apassos) multiple threads could be dequeuing from op_stack at the same
+  // time, for better CPU backprop performance.
+  VLOG(1) << "Initial stack:";
+  if (VLOG_IS_ON(1)) {
+    for (auto t : op_stack) {
+      VLOG(1) << "  " << t;
+    }
+  }
+  std::unordered_map<string, std::unordered_set<int>>
+      functions_accept_none_for_indices({
+          {"SoftmaxCrossEntropyWithLogits", {1}},
+          {"FusedBatchNorm", {1, 2, 3, 4}},
+      });
+  while (!op_stack.empty()) {
+    const int64 op = op_stack.back();
+    VLOG(1) << "Popped " << op;
+    op_stack.pop_back();
+    auto op_it = state.op_tape.find(op);
+    if (op_it == state.op_tape.end()) {
+      // It is possible for ops to end up on the stack if they are unrelated to
+      // the target; we should just skip them.
+      continue;
+    }
+    auto trace = std::move(op_it->second);
+    state.op_tape.erase(op_it);
+    std::vector<Gradient*> out_gradients;
+    out_gradients.reserve(trace.output_tensor_info.size());
+    bool any_gradient_nonzero = false;
+    for (int i = 0; i < trace.output_tensor_info.size(); ++i) {
+      const int64 id = trace.output_tensor_info[i].id;
+      auto grad_it = gradients.find(id);
+      if (grad_it == gradients.end()) {
+        auto func_name_it =
+            functions_accept_none_for_indices.find(trace.op_type);
+        if (func_name_it != functions_accept_none_for_indices.end() &&
+            func_name_it->second.find(i) != func_name_it->second.end()) {
+          out_gradients.push_back(nullptr);
+        } else {
+          out_gradients.push_back(
+              vspace.Zeros(trace.output_tensor_info[i].shape,
+                           trace.output_tensor_info[i].dtype));
+        }
+      } else {
+        any_gradient_nonzero = true;
+        out_gradients.push_back(vspace.AggregateGradients(grad_it->second));
+        if (sources_set.find(grad_it->first) == sources_set.end()) {
+          gradients.erase(grad_it);
+        }
+      }
+    }
+    std::vector<Gradient*> in_gradients;
+    if (any_gradient_nonzero) {
+      Status s = vspace.CallBackwardFunction(trace.backward_function,
+                                             out_gradients, &in_gradients);
+      if (!persistent_) {
+        vspace.ReleaseBackwardFunction(trace.backward_function);
+      }
+      if (!s.ok()) {
+        cleanup();
+        return s;
+      }
+    } else {
+      in_gradients.resize(trace.input_tensor_id.size());
+      if (!persistent_) {
+        vspace.ReleaseBackwardFunction(trace.backward_function);
+      }
+    }
+    VLOG(1) << "Got " << in_gradients.size() << " in_gradients for "
+            << trace.input_tensor_id.size() << " sources";
+    for (int i = 0; i < in_gradients.size(); ++i) {
+      const int64 id = trace.input_tensor_id[i];
+      if (in_gradients[i] != nullptr) {
+        auto& unaggregated_grads = gradients[id];
+        unaggregated_grads.push_back(in_gradients[i]);
+        if (unaggregated_grads.size() > kMinAggregateCount) {
+          auto size_it = gradients_size.find(id);
+          int64 size;
+          if (size_it == gradients_size.end()) {
+            size = vspace.NumElements(unaggregated_grads[0]);
+            gradients_size.emplace(id, size);
+          } else {
+            size = size_it->second;
+          }
+          if (unaggregated_grads.size() * size * 4 > kMinAggregateBytes) {
+            Gradient* grad = vspace.AggregateGradients(unaggregated_grads);
+            unaggregated_grads.clear();
+            unaggregated_grads.push_back(grad);
+          }
+        }
+      }
+      auto usage_count_it = state.tensor_usage_counts.find(id);
+      if (usage_count_it == state.tensor_usage_counts.end()) {
+        VLOG(1) << "Tensor " << id << " not used";
+        continue;
+      }
+      usage_count_it->second--;
+      if (usage_count_it->second > 0) {
+        VLOG(1) << "Tensor " << id << " usage count " << usage_count_it->second;
+        continue;
+      }
+      auto tape_it = tensor_tape_.find(id);
+      if (tape_it == tensor_tape_.end()) {
+        VLOG(1) << "Tensor " << id
+                << " has no associated op. Deleting gradient";
+        auto grad_it = gradients.find(id);
+        if (grad_it != gradients.end()) {
+          for (auto g : grad_it->second) {
+            vspace.DeleteGradient(g);
+          }
+          gradients.erase(grad_it);
+        }
+        continue;
+      }
+      const int64 op_id = tape_it->second;
+      if (op_id == -1) {
+        VLOG(1) << "Tensor " << id << " is source";
+        continue;
+      }
+      auto missing_it = state.op_missing_tensor.find(op_id);
+      if (missing_it != state.op_missing_tensor.end()) {
+        missing_it->second--;
+        VLOG(1) << "Op " << op_id << " missing " << missing_it->second
+                << " output gradients";
+        if (missing_it->second == 0) {
+          op_stack.push_back(op_id);
+        }
+      }
+    }
+  }
+  CHECK(state.op_tape.empty());
+  result->reserve(source_tensor_ids.size());
+  for (auto is : source_tensor_ids) {
+    auto grad_it = gradients.find(is);
+    if (grad_it == gradients.end()) {
+      result->push_back(nullptr);
+    } else {
+      if (grad_it->second.size() == 1) {
+        result->push_back(grad_it->second[0]);
+      } else {
+        result->push_back(vspace.AggregateGradients(grad_it->second));
+      }
+      gradients.erase(grad_it);
+    }
+  }
+  VLOG(1) << "Final gradients size: " << gradients.size();
+  for (auto grad_pair : gradients) {
+    for (const auto& g : grad_pair.second) {
+      vspace.DeleteGradient(g);
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace eager
 }  // namespace tensorflow
 
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 0fe85d5d2c60bcd0566f010b23820ec174b7830b..6e37cdb5f4beea53d4a2ded0705ae482d0bc2d68 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -22,18 +22,81 @@ namespace tensorflow {
 void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input) {
   mutex_lock l(graph->mu);
   graph->graph.AddControlEdge(&input->node, &op->node);
+  RecordMutation(graph, *op, "adding control input");
+}
+
+void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
+             TF_Buffer* attr_value_proto, TF_Status* status) {
+  AttrValue attr_val;
+  if (!attr_val.ParseFromArray(attr_value_proto->data,
+                               attr_value_proto->length)) {
+    status->status =
+        tensorflow::errors::InvalidArgument("Invalid AttrValue proto");
+    return;
+  }
+
+  mutex_lock l(graph->mu);
+  op->node.AddAttr(attr_name, attr_val);
+  RecordMutation(graph, *op, "setting attribute");
 }
 
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device) {
   mutex_lock l(graph->mu);
   op->node.set_requested_device(device);
+  RecordMutation(graph, *op, "setting device");
 }
 
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status) {
   mutex_lock l(graph->mu);
+  tensorflow::shape_inference::InferenceContext* ic =
+      graph->refiner.GetContext(&new_src.oper->node);
+
+  if (ic->num_outputs() <= new_src.index) {
+    status->status = tensorflow::errors::OutOfRange(
+        "Cannot update edge. Output index [", new_src.index,
+        "] is greater than the number of total outputs [", ic->num_outputs(),
+        "].");
+    return;
+  }
+  tensorflow::shape_inference::ShapeHandle shape = ic->output(new_src.index);
+
+  tensorflow::shape_inference::InferenceContext* ic_dst =
+      graph->refiner.GetContext(&dst.oper->node);
+  if (ic_dst->num_inputs() <= dst.index) {
+    status->status = tensorflow::errors::OutOfRange(
+        "Cannot update edge. Input index [", dst.index,
+        "] is greater than the number of total inputs [", ic_dst->num_inputs(),
+        "].");
+    return;
+  }
+  if (!ic_dst->MergeInput(dst.index, shape)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Cannot update edge, incompatible shapes: ", ic_dst->DebugString(shape),
+        " and ", ic_dst->DebugString(ic_dst->input(dst.index)), ".");
+    return;
+  }
   status->status = graph->graph.UpdateEdge(&new_src.oper->node, new_src.index,
                                            &dst.oper->node, dst.index);
+
+  if (status->status.ok()) {
+    // This modification only updates the destination node for
+    // the purposes of running this graph in a session. Thus, we don't
+    // record the source node as being modified.
+    RecordMutation(graph, *dst.oper, "updating input tensor");
+  }
+}
+
+void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op) {
+  mutex_lock l(graph->mu);
+  std::vector<const Edge*> control_edges;
+  for (const Edge* edge : op->node.in_edges()) {
+    if (!edge->IsControlEdge()) continue;
+    control_edges.push_back(edge);
+  }
+  for (const Edge* edge : control_edges) {
+    graph->graph.RemoveControlEdge(edge);
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index ab71a4170bb58df46a3d23585cf256eb656d38d2..b51ef2b53122802fef598a26bd6f1843976f11b0 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -25,11 +25,18 @@ namespace tensorflow {
 
 void AddControlInput(TF_Graph* graph, TF_Operation* op, TF_Operation* input);
 
+// Changes an attr value in the node_def Protocol Buffer and sets a status upon
+// completion.
+void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
+             TF_Buffer* attr_value_proto, TF_Status* status);
+
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status);
 
+void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op);
+
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 80112f9b44b1d5fd65a7d47788b072dc47a2b29a..e354831d7d25af83c068a68a4f844056263a598c 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -421,6 +421,7 @@ tf_cc_test(
 
 tf_gen_op_wrappers_cc(
     name = "cc_ops",
+    api_def_srcs = ["//tensorflow/core:base_api_def"],
     op_lib_names = [
         "array_ops",
         "audio_ops",
@@ -525,6 +526,30 @@ cc_library_with_android_deps(
         "//tensorflow/core:android_tensorflow_lib",
     ],
     copts = tf_copts(),
+    data = [
+        "//tensorflow/core:base_api_def",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:op_gen_lib",
+        "//tensorflow/core:op_gen_overrides_proto_cc",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "cc_op_gen_test",
+    srcs = [
+        "framework/cc_op_gen.cc",
+        "framework/cc_op_gen.h",
+        "framework/cc_op_gen_test.cc",
+    ],
+    data = [
+        "//tensorflow/cc:ops/op_gen_overrides.pbtxt",
+    ],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -533,6 +558,8 @@ cc_library_with_android_deps(
         "//tensorflow/core:op_gen_overrides_proto_cc",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
     ],
 )
 
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 38a17598b8e4161f96ab8134823de033d3284440..d889c518f9c38a9f070970b37a2ad4b1fc26671b 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -18,10 +18,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/cc/framework/cc_op_gen.h"
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
-#include "tensorflow/core/framework/op_gen_overrides.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.pb_text.h"
@@ -35,7 +36,6 @@ limitations under the License.
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
-
 namespace {
 
 const int kRightMargin = 79;
@@ -297,7 +297,7 @@ string ToCamelCase(const string& str) {
 // argument to a function.
 std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
   static const std::unordered_map<StringPiece, std::pair<const char*, bool>,
-                                  StringPiece::Hasher>
+                                  StringPieceHasher>
       attr_type_map{
           {"string", {"StringPiece", false}},
           {"list(string)", {"gtl::ArraySlice<string>", true}},
@@ -325,29 +325,112 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
 }
 
 bool IsCPPKeyword(StringPiece name) {
-  static const std::unordered_set<StringPiece, StringPiece::Hasher>
+  static const std::unordered_set<StringPiece, StringPieceHasher>
       // Keywords obtained from http://en.cppreference.com/w/cpp/keyword
       kCPPReserved{
-          "alignas", "alignof", "and", "and_eq", "asm", "atomic_cancel",
-          "atomic_commit", "atomic_noexcept", "auto", "bitand", "bitor", "bool",
-          "break", "case", "catch", "char", "char16_t", "char32_t", "class",
-          "compl", "concept", "const", "const_cast", "constexpr", "continue",
-          "decltype", "default", "delete", "do", "double", "dynamic_cast",
-          "else", "enum", "explicit", "export", "extern", "false", "final",
-          "float", "for", "friend", "goto", "if", "import", "inline", "int",
-          "long", "module", "mutable", "namespace", "new", "noexcept", "not",
-          "not_eq", "nullptr", "operator", "or", "or_eq", "override", "private",
-          "protected", "public", "register", "reinterpret_cast", "requires",
-          "return", "short", "signed", "sizeof", "static", "static_assert",
-          "static_cast", "struct", "switch", "synchronized", "template", "this",
-          "thread_local", "throw", "true", "try", "typedef", "typeid",
-          "typename", "union", "unsigned", "using", "virtual", "void",
-          "volatile", "wchar_t", "while", "xor", "xor_eq",
+          "alignas",
+          "alignof",
+          "and",
+          "and_eq",
+          "asm",
+          "atomic_cancel",
+          "atomic_commit",
+          "atomic_noexcept",
+          "auto",
+          "bitand",
+          "bitor",
+          "bool",
+          "break",
+          "case",
+          "catch",
+          "char",
+          "char16_t",
+          "char32_t",
+          "class",
+          "compl",
+          "concept",
+          "const",
+          "const_cast",
+          "constexpr",
+          "continue",
+          "decltype",
+          "default",
+          "delete",
+          "do",
+          "double",
+          "dynamic_cast",
+          "else",
+          "enum",
+          "explicit",
+          "export",
+          "extern",
+          "false",
+          "final",
+          "float",
+          "for",
+          "friend",
+          "goto",
+          "if",
+          "import",
+          "inline",
+          "int",
+          "long",
+          "module",
+          "mutable",
+          "namespace",
+          "new",
+          "noexcept",
+          "not",
+          "not_eq",
+          "nullptr",
+          "operator",
+          "or",
+          "or_eq",
+          "override",
+          "private",
+          "protected",
+          "public",
+          "register",
+          "reinterpret_cast",
+          "requires",
+          "return",
+          "short",
+          "signed",
+          "sizeof",
+          "static",
+          "static_assert",
+          "static_cast",
+          "struct",
+          "switch",
+          "synchronized",
+          "template",
+          "this",
+          "thread_local",
+          "throw",
+          "true",
+          "try",
+          "typedef",
+          "typeid",
+          "typename",
+          "union",
+          "unsigned",
+          "using",
+          "virtual",
+          "void",
+          "volatile",
+          "wchar_t",
+          "while",
+          "xor",
+          "xor_eq",
 
           // The following are not C++ keywords, but names of local variables
           // and parameters used in the op constructor. Treating them as
           // keywords, so that other parameter names don't conflict with these.
-          "builder", "node", "ret", "scope", "unique_name",
+          "builder",
+          "node",
+          "ret",
+          "scope",
+          "unique_name",
       };
   return kCPPReserved.count(name) > 0;
 }
@@ -385,10 +468,10 @@ bool ArgIsList(const OpDef::ArgDef& arg) {
 }
 
 bool HasOptionalAttrs(
-    const OpDef& op_def,
+    const ApiDef& api_def,
     const std::unordered_map<string, string>& inferred_input_attrs) {
-  for (int i = 0; i < op_def.attr_size(); ++i) {
-    const auto& attr(op_def.attr(i));
+  for (int i = 0; i < api_def.attr_size(); ++i) {
+    const auto& attr(api_def.attr(i));
     if ((inferred_input_attrs.find(attr.name()) ==
          inferred_input_attrs.end()) &&
         attr.has_default_value()) {
@@ -398,12 +481,21 @@ bool HasOptionalAttrs(
   return false;
 }
 
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 struct OpInfo {
   // graph_op_def: The OpDef used by the runtime, has the names that
   //   must be used when calling NodeBuilder.
   // interface_op_def: The OpDef used in the interface in the generated
   //   code, with possibly overridden names and defaults.
-  explicit OpInfo(const OpDef& graph_op_def, const OpDef& inteface_op_def,
+  explicit OpInfo(const OpDef& graph_op_def, const ApiDef& api_def,
                   const std::vector<string>& aliases);
   string GetOpAttrStruct() const;
   string GetConstructorDecl(StringPiece op_name_prefix,
@@ -423,74 +515,81 @@ struct OpInfo {
   string comment;
 
   const OpDef& graph_op_def;
-  const OpDef& op_def;
+  const ApiDef& api_def;
   const std::vector<string>& aliases;
+  // Map from type attribute to corresponding original argument name.
   std::unordered_map<string, string> inferred_input_attrs;
 };
 
-OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def,
-               const std::vector<string>& a)
-    : graph_op_def(g_op_def), op_def(i_op_def), aliases(a) {
-  op_name = op_def.name();
-  InferOpAttributes(op_def, &inferred_input_attrs);
-  has_optional_attrs = HasOptionalAttrs(op_def, inferred_input_attrs);
+OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def,
+               const std::vector<string>& aliases)
+    : graph_op_def(graph_op_def), api_def(api_def), aliases(aliases) {
+  op_name = api_def.endpoint(0).name();
+  InferOpAttributes(graph_op_def, &inferred_input_attrs);
+  has_optional_attrs = HasOptionalAttrs(api_def, inferred_input_attrs);
   arg_types.push_back("const ::tensorflow::Scope&");
   arg_names.push_back("scope");
 
-  if (op_def.has_deprecation()) {
-    if (!op_def.summary().empty()) {
-      comment = strings::StrCat(op_def.summary(), "\n");
+  if (graph_op_def.has_deprecation()) {
+    if (!api_def.summary().empty()) {
+      comment = strings::StrCat(api_def.summary(), "\n");
     }
     strings::StrAppend(&comment, "DEPRECATED at GraphDef version ",
-                       op_def.deprecation().version(), ":\n",
-                       op_def.deprecation().explanation(), ".\n");
-  } else if (op_def.summary().empty()) {
+                       graph_op_def.deprecation().version(), ":\n",
+                       graph_op_def.deprecation().explanation(), ".\n");
+  } else if (api_def.summary().empty()) {
     comment = "TODO: add doc.\n";
   } else {
-    comment = strings::StrCat(op_def.summary(), "\n");
+    comment = strings::StrCat(api_def.summary(), "\n");
   }
-  if (!op_def.description().empty()) {
-    strings::StrAppend(&comment, "\n", op_def.description(), "\n");
+  if (!api_def.description().empty()) {
+    strings::StrAppend(&comment, "\n", api_def.description(), "\n");
   }
   strings::StrAppend(&comment, "\nArguments:\n* scope: A Scope object\n");
 
   // Process inputs
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const auto& arg(op_def.input_arg(i));
+  for (int i = 0; i < api_def.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def.arg_order(i), graph_op_def);
+    const auto& api_def_arg = *FindInputArg(api_def.arg_order(i), api_def);
     arg_types.push_back(strings::StrCat(
         "::tensorflow::", ArgIsList(arg) ? "InputList" : "Input"));
-    arg_names.push_back(AvoidCPPKeywords(arg.name()));
+    arg_names.push_back(AvoidCPPKeywords(api_def_arg.rename_to()));
 
     // TODO(keveman): Include input type information.
-    StringPiece description = arg.description();
+    StringPiece description = api_def_arg.description();
     if (!description.empty()) {
       ConsumeEquals(&description);
-      strings::StrAppend(&comment, "* ", AvoidCPPKeywords(arg.name()), ": ",
-                         arg.description(), "\n");
+      strings::StrAppend(&comment, "* ",
+                         AvoidCPPKeywords(api_def_arg.rename_to()), ": ",
+                         api_def_arg.description(), "\n");
     }
   }
 
   // Process attrs
   string required_attrs_comment;
   string optional_attrs_comment;
-  for (int i = 0; i < op_def.attr_size(); ++i) {
-    const auto& attr(op_def.attr(i));
+  for (int i = 0; i < graph_op_def.attr_size(); ++i) {
+    // ApiDef attributes must be in the same order as in OpDef since
+    // we initialize ApiDef based on OpDef.
+    const auto& attr(graph_op_def.attr(i));
+    const auto& api_def_attr(api_def.attr(i));
+    CHECK_EQ(attr.name(), api_def_attr.name());
     // Skip inferred arguments
     if (inferred_input_attrs.count(attr.name()) > 0) continue;
 
     const auto entry = AttrTypeName(attr.type());
     const auto attr_type_name = entry.first;
     const bool use_const = entry.second;
-    string attr_name = AvoidCPPKeywords(attr.name());
+    string attr_name = AvoidCPPKeywords(api_def_attr.rename_to());
 
     string attr_comment;
-    if (!attr.description().empty()) {
+    if (!api_def_attr.description().empty()) {
       // TODO(keveman): Word wrap and indent this, to handle multi-line
       // descriptions.
       strings::StrAppend(&attr_comment, "* ", attr_name, ": ",
-                         attr.description(), "\n");
+                         api_def_attr.description(), "\n");
     }
-    if (attr.has_default_value()) {
+    if (api_def_attr.has_default_value()) {
       strings::StrAppend(&optional_attrs_comment, attr_comment);
     } else {
       strings::StrAppend(&required_attrs_comment, attr_comment);
@@ -508,44 +607,49 @@ OpInfo::OpInfo(const OpDef& g_op_def, const OpDef& i_op_def,
   }
 
   // Process outputs
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
-    const auto& arg = op_def.output_arg(i);
+  for (int i = 0; i < graph_op_def.output_arg_size(); ++i) {
+    // ApiDef arguments must be in the same order as in OpDef since
+    // we initialize ApiDef based on OpDef.
+    const auto& arg = graph_op_def.output_arg(i);
+    const auto& api_def_arg(api_def.out_arg(i));
+    CHECK_EQ(arg.name(), api_def_arg.name());
+
     bool is_list = ArgIsList(arg);
     output_types.push_back(
         strings::StrCat("::tensorflow::", is_list ? "OutputList" : "Output"));
-    output_names.push_back(AvoidCPPKeywords(arg.name()));
+    output_names.push_back(AvoidCPPKeywords(api_def_arg.rename_to()));
     is_list_output.push_back(is_list);
   }
 
   strings::StrAppend(&comment, "\nReturns:\n");
-  if (op_def.output_arg_size() == 0) {  // No outputs.
+  if (graph_op_def.output_arg_size() == 0) {  // No outputs.
     strings::StrAppend(&comment, "* the created `Operation`\n");
-  } else if (op_def.output_arg_size() == 1) {  // One output
+  } else if (graph_op_def.output_arg_size() == 1) {  // One output
     if (is_list_output[0]) {
       strings::StrAppend(&comment, "* `OutputList`: ");
     } else {
       strings::StrAppend(&comment, "* `Output`: ");
     }
-    if (op_def.output_arg(0).description().empty()) {
-      strings::StrAppend(&comment, "The ", op_def.output_arg(0).name(),
+    if (api_def.out_arg(0).description().empty()) {
+      strings::StrAppend(&comment, "The ", api_def.out_arg(0).name(),
                          " tensor.\n");
     } else {
       // TODO(josh11b): Word wrap this.
-      strings::StrAppend(&comment, op_def.output_arg(0).description(), "\n");
+      strings::StrAppend(&comment, api_def.out_arg(0).description(), "\n");
     }
   } else {  // Multiple outputs.
-    for (int i = 0; i < op_def.output_arg_size(); ++i) {
+    for (int i = 0; i < graph_op_def.output_arg_size(); ++i) {
       if (is_list_output[i]) {
         strings::StrAppend(&comment, "* `OutputList`");
       } else {
         strings::StrAppend(&comment, "* `Output`");
       }
       strings::StrAppend(&comment, " ", output_names[i]);
-      if (op_def.output_arg(i).description().empty()) {
+      if (api_def.out_arg(i).description().empty()) {
         strings::StrAppend(&comment, "\n");
       } else {
         // TODO(josh11b): Word wrap this.
-        strings::StrAppend(&comment, ": ", op_def.output_arg(i).description(),
+        strings::StrAppend(&comment, ": ", api_def.out_arg(i).description(),
                            "\n");
       }
     }
@@ -564,19 +668,20 @@ string OpInfo::GetOpAttrStruct() const {
   string struct_fields;
   string setters;
 
-  for (int i = 0; i < op_def.attr_size(); ++i) {
-    const auto& attr(op_def.attr(i));
+  for (int i = 0; i < graph_op_def.attr_size(); ++i) {
+    const auto& attr(graph_op_def.attr(i));
+    const auto& api_def_attr(api_def.attr(i));
     // If attr will be inferred or it doesn't have a default value, don't
     // add it to the struct.
     if ((inferred_input_attrs.find(attr.name()) !=
          inferred_input_attrs.end()) ||
-        !attr.has_default_value()) {
+        !api_def_attr.has_default_value()) {
       continue;
     }
     const auto entry = AttrTypeName(attr.type());
     const auto attr_type_name = entry.first;
     const bool use_const = entry.second;
-    const string camel_case_name = ToCamelCase(attr.name());
+    const string camel_case_name = ToCamelCase(api_def_attr.rename_to());
     const string suffix =
         (camel_case_name == op_name || camel_case_name == "Attrs") ? "_" : "";
     const string attr_func_def =
@@ -584,22 +689,25 @@ string OpInfo::GetOpAttrStruct() const {
                         attr_type_name, use_const ? "&" : "");
 
     string attr_comment;
-    if (!attr.description().empty()) {
-      strings::StrAppend(&attr_comment, attr.description(), "\n\n");
+    if (!api_def_attr.description().empty()) {
+      strings::StrAppend(&attr_comment, api_def_attr.description(), "\n\n");
     }
     strings::StrAppend(&attr_comment, "Defaults to ",
-                       SummarizeAttrValue(attr.default_value()), "\n");
+                       SummarizeAttrValue(api_def_attr.default_value()), "\n");
     attr_comment = MakeComment(attr_comment, "    ");
 
     strings::StrAppend(&setters, attr_comment);
     strings::StrAppend(&setters, "    Attrs ", attr_func_def, " x) {\n");
     strings::StrAppend(&setters, "      Attrs ret = *this;\n");
-    strings::StrAppend(&setters, "      ret.", attr.name(), "_ = x;\n");
+    strings::StrAppend(&setters, "      ret.", api_def_attr.rename_to(),
+                       "_ = x;\n");
     strings::StrAppend(&setters, "      return ret;\n    }\n\n");
 
     strings::StrAppend(
-        &struct_fields, "    ", attr_type_name, " ", attr.name(), "_ = ",
-        PrintAttrValue(op_def.name(), attr.default_value()), ";\n");
+        &struct_fields, "    ", attr_type_name, " ", api_def_attr.rename_to(),
+        "_ = ",
+        PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()),
+        ";\n");
   }
 
   if (struct_fields.empty()) {
@@ -676,17 +784,18 @@ void OpInfo::WriteClassDecl(WritableFile* h) const {
   // Add the static functions to set optional attrs
   if (has_optional_attrs) {
     strings::StrAppend(&class_decl, "\n");
-    for (int i = 0; i < op_def.attr_size(); ++i) {
-      const auto& attr(op_def.attr(i));
+    for (int i = 0; i < graph_op_def.attr_size(); ++i) {
+      const auto& attr(graph_op_def.attr(i));
+      const auto& api_def_attr(api_def.attr(i));
       if ((inferred_input_attrs.find(attr.name()) !=
            inferred_input_attrs.end()) ||
-          !attr.has_default_value()) {
+          !api_def_attr.has_default_value()) {
         continue;
       }
       const auto entry = AttrTypeName(attr.type());
       const auto attr_type_name = entry.first;
       const bool use_const = entry.second;
-      const string camel_case_name = ToCamelCase(attr.name());
+      const string camel_case_name = ToCamelCase(api_def_attr.rename_to());
       const string suffix =
           (camel_case_name == op_name || camel_case_name == "Attrs") ? "_" : "";
       const string attr_func_def = strings::StrCat(
@@ -726,11 +835,11 @@ void OpInfo::GetOutput(string* out) const {
       strings::StrCat("if (!", scope_str, ".ok()) return;");
 
   // No outputs.
-  if (op_def.output_arg_size() == 0) {
+  if (graph_op_def.output_arg_size() == 0) {
     strings::StrAppend(out, "  this->operation = Operation(ret);\n  return;\n");
     return;
   }
-  if (op_def.output_arg_size() == 1) {
+  if (graph_op_def.output_arg_size() == 1) {
     // One output, no need for NameRangeMap
     if (is_list_output[0]) {
       strings::StrAppend(out,
@@ -752,7 +861,7 @@ void OpInfo::GetOutput(string* out) const {
                      ".UpdateStatus(_status_);\n", "    return;\n");
   strings::StrAppend(out, "  }\n\n");
 
-  for (int i = 0; i < op_def.output_arg_size(); ++i) {
+  for (int i = 0; i < graph_op_def.output_arg_size(); ++i) {
     const string arg_range = strings::StrCat(
         "_outputs_range[\"", graph_op_def.output_arg(i).name(), "\"]");
     if (is_list_output[i]) {
@@ -776,11 +885,13 @@ string OpInfo::GetConstructorBody() const {
 
   strings::StrAppend(&body, "  ", return_on_error, "\n");
 
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const auto& arg(op_def.input_arg(i));
-    strings::StrAppend(&body, "  auto _", arg.name(), " = ::tensorflow::ops::",
-                       ArgIsList(arg) ? "AsNodeOutList" : "AsNodeOut", "(",
-                       scope_str, ", ", AvoidCPPKeywords(arg.name()), ");\n");
+  for (int i = 0; i < graph_op_def.input_arg_size(); ++i) {
+    const auto& arg(graph_op_def.input_arg(i));
+    const auto& api_def_arg(api_def.in_arg(i));
+    strings::StrAppend(
+        &body, "  auto _", api_def_arg.rename_to(), " = ::tensorflow::ops::",
+        ArgIsList(arg) ? "AsNodeOutList" : "AsNodeOut", "(", scope_str, ", ",
+        AvoidCPPKeywords(api_def_arg.rename_to()), ");\n");
     strings::StrAppend(&body, "  ", return_on_error, "\n");
   }
 
@@ -791,19 +902,21 @@ string OpInfo::GetConstructorBody() const {
       &body, "  auto builder = ::tensorflow::NodeBuilder(unique_name, \"",
       graph_op_def.name(), "\")\n");
   const string spaces = "                     ";
-  for (int i = 0; i < op_def.input_arg_size(); ++i) {
-    const auto& arg(op_def.input_arg(i));
-    strings::StrAppend(&body, spaces, ".Input(_", arg.name(), ")\n");
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    const auto& arg(api_def.in_arg(i));
+    strings::StrAppend(&body, spaces, ".Input(_", arg.rename_to(), ")\n");
   }
-  for (int i = 0; i < op_def.attr_size(); ++i) {
+  for (int i = 0; i < api_def.attr_size(); ++i) {
     const auto& graph_attr(graph_op_def.attr(i));
-    const auto& attr(op_def.attr(i));
-    if (inferred_input_attrs.find(attr.name()) != inferred_input_attrs.end()) {
+    const auto& api_def_attr(api_def.attr(i));
+    if (inferred_input_attrs.find(api_def_attr.name()) !=
+        inferred_input_attrs.end()) {
       continue;
     }
-    const string attr_name = attr.has_default_value()
-                                 ? strings::StrCat("attrs.", attr.name(), "_")
-                                 : AvoidCPPKeywords(attr.name());
+    const string attr_name =
+        api_def_attr.has_default_value()
+            ? strings::StrCat("attrs.", api_def_attr.rename_to(), "_")
+            : AvoidCPPKeywords(api_def_attr.rename_to());
     strings::StrAppend(&body, spaces, ".Attr(\"", graph_attr.name(), "\", ",
                        attr_name, ")\n");
   }
@@ -845,10 +958,10 @@ void OpInfo::WriteClassDef(WritableFile* cc) const {
   TF_CHECK_OK(cc->Append(class_def));
 }
 
-void WriteCCOp(const OpDef& graph_op_def, const OpDef& interface_op_def,
+void WriteCCOp(const OpDef& graph_op_def, const ApiDef& api_def,
                const std::vector<string>& aliases, WritableFile* h,
                WritableFile* cc) {
-  OpInfo op_info(graph_op_def, interface_op_def, aliases);
+  OpInfo op_info(graph_op_def, api_def, aliases);
 
   op_info.WriteClassDecl(h);
   op_info.WriteClassDef(cc);
@@ -943,8 +1056,9 @@ string MakeInternal(const string& fname) {
 
 }  // namespace
 
-void WriteCCOps(const OpList& ops, const string& dot_h_fname,
-                const string& dot_cc_fname, const string& overrides_fnames) {
+void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
+                const string& dot_h_fname, const string& dot_cc_fname,
+                const string& overrides_fnames) {
   Env* env = Env::Default();
 
   // Load the override map.
@@ -984,24 +1098,23 @@ void WriteCCOps(const OpList& ops, const string& dot_h_fname,
     // code depends on it.
     if (graph_op_def.name() == "Const") continue;
 
-    // Incorporate overrides from override_map.
-    OpDef interface_op_def = graph_op_def;
-    const OpGenOverride* op_override =
-        override_map.ApplyOverride(&interface_op_def);
+    const auto* api_def = api_def_map.GetApiDef(graph_op_def.name());
+
     std::vector<string> aliases;
-    if (op_override) {
-      if (op_override->skip()) continue;
-      aliases.assign(op_override->alias().begin(), op_override->alias().end());
-      if (op_override->hide()) {
-        // Write hidden ops to _internal.h and _internal.cc.
-        WriteCCOp(graph_op_def, interface_op_def, aliases, internal_h.get(),
-                  internal_cc.get());
-        continue;
-      }
+    if (api_def->visibility() == ApiDef::SKIP) continue;
+    // First endpoint is canonical, the rest are aliases.
+    for (int endpoint_i = 1; endpoint_i < api_def->endpoint_size();
+         ++endpoint_i) {
+      aliases.push_back(api_def->endpoint(endpoint_i).name());
+    }
+    if (api_def->visibility() == ApiDef::HIDDEN) {
+      // Write hidden ops to _internal.h and _internal.cc.
+      WriteCCOp(graph_op_def, *api_def, aliases, internal_h.get(),
+                internal_cc.get());
+      continue;
     }
-
     // This isn't a hidden op, write it to the main files.
-    WriteCCOp(graph_op_def, interface_op_def, aliases, h.get(), cc.get());
+    WriteCCOp(graph_op_def, *api_def, aliases, h.get(), cc.get());
   }
 
   FinishFiles(false, h.get(), cc.get(), op_header_guard);
diff --git a/tensorflow/cc/framework/cc_op_gen.h b/tensorflow/cc/framework/cc_op_gen.h
index fa5e004f0317d046d82bee005bdf9f17773a45f3..cea28990144b9371e8009ce13f912b44044f9aac 100644
--- a/tensorflow/cc/framework/cc_op_gen.h
+++ b/tensorflow/cc/framework/cc_op_gen.h
@@ -17,13 +17,15 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CC_FRAMEWORK_CC_OP_GEN_H_
 
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
 /// Result is written to files dot_h and dot_cc.
-void WriteCCOps(const OpList& ops, const string& dot_h_fname,
-                const string& dot_cc_fname, const string& overrides_fnames);
+void WriteCCOps(const OpList& ops, const ApiDefMap& api_def_map,
+                const string& dot_h_fname, const string& dot_cc_fname,
+                const string& overrides_fnames);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/cc/framework/cc_op_gen_main.cc b/tensorflow/cc/framework/cc_op_gen_main.cc
index 3b80cf993eb9a5d5f4c41687577414e7216dd174..326d5668b8803ee39ffe24900c92e1db87b93601 100644
--- a/tensorflow/cc/framework/cc_op_gen_main.cc
+++ b/tensorflow/cc/framework/cc_op_gen_main.cc
@@ -16,7 +16,11 @@ limitations under the License.
 #include "tensorflow/cc/framework/cc_op_gen.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -24,10 +28,28 @@ namespace tensorflow {
 namespace {
 
 void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
-                   const std::string& overrides_fnames, bool include_internal) {
+                   const std::string& overrides_fnames, bool include_internal,
+                   const std::vector<string>& api_def_dirs) {
   OpList ops;
   OpRegistry::Global()->Export(include_internal, &ops);
-  WriteCCOps(ops, dot_h, dot_cc, overrides_fnames);
+  ApiDefMap api_def_map(ops);
+  if (!api_def_dirs.empty()) {
+    Env* env = Env::Default();
+    // Only load files that correspond to "ops".
+    for (const auto& op : ops.op()) {
+      for (const auto& api_def_dir : api_def_dirs) {
+        const std::string api_def_file_pattern =
+            io::JoinPath(api_def_dir, "api_def_" + op.name() + ".pbtxt");
+        if (env->FileExists(api_def_file_pattern).ok()) {
+          TF_CHECK_OK(api_def_map.LoadFile(env, api_def_file_pattern));
+        }
+      }
+    }
+  }
+
+  api_def_map.UpdateDocs();
+
+  WriteCCOps(ops, api_def_map, dot_h, dot_cc, overrides_fnames);
 }
 
 }  // namespace
@@ -35,18 +57,24 @@ void PrintAllCCOps(const std::string& dot_h, const std::string& dot_cc,
 
 int main(int argc, char* argv[]) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
-  if (argc != 5) {
+  // TODO(annarev): Update this file to no longer take op_gen_overrides.pbtxt
+  // as an argument.
+  if (argc != 6) {
     for (int i = 1; i < argc; ++i) {
       fprintf(stderr, "Arg %d = %s\n", i, argv[i]);
     }
     fprintf(stderr,
-            "Usage: %s out.h out.cc overrides1.pbtxt,2.pbtxt include_internal\n"
+            "Usage: %s out.h out.cc overrides1.pbtxt,2.pbtxt include_internal "
+            "api_def_dirs1,api_def_dir2 ...\n"
             "  include_internal: 1 means include internal ops\n",
             argv[0]);
     exit(1);
   }
 
   bool include_internal = tensorflow::StringPiece("1") == argv[4];
-  tensorflow::PrintAllCCOps(argv[1], argv[2], argv[3], include_internal);
+  std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
+      argv[5], ",", tensorflow::str_util::SkipEmpty());
+  tensorflow::PrintAllCCOps(argv[1], argv[2], argv[3], include_internal,
+                            api_def_dirs);
   return 0;
 }
diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b7e720a5c7b343415eee1aa157b8de755a1e1a5
--- /dev/null
+++ b/tensorflow/cc/framework/cc_op_gen_test.cc
@@ -0,0 +1,195 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/framework/cc_op_gen.h"
+
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+// TODO(annarev): Remove this op_gen_overrides.pbtxt reference.
+// It is needed only because WriteCCOps takes it as an argument.
+constexpr char kOverridesFnames[] =
+    "tensorflow/cc/ops/op_gen_overrides.pbtxt";
+constexpr char kBaseOpDef[] = R"(
+op {
+  name: "Foo"
+  input_arg {
+    name: "images"
+    description: "Images to process."
+  }
+  input_arg {
+    name: "dim"
+    description: "Description for dim."
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    description: "Description for output."
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    description: "Type for images"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+      }
+    }
+    default_value {
+      i: 1
+    }
+  }
+  summary: "Summary for op Foo."
+  description: "Description for op Foo."
+}
+)";
+
+void ExpectHasSubstr(StringPiece s, StringPiece expected) {
+  EXPECT_TRUE(s.contains(expected))
+      << "'" << s << "' does not contain '" << expected << "'";
+}
+
+void ExpectDoesNotHaveSubstr(StringPiece s, StringPiece expected) {
+  EXPECT_FALSE(s.contains(expected))
+      << "'" << s << "' contains '" << expected << "'";
+}
+
+void ExpectSubstrOrder(const string& s, const string& before,
+                       const string& after) {
+  int before_pos = s.find(before);
+  int after_pos = s.find(after);
+  ASSERT_NE(std::string::npos, before_pos);
+  ASSERT_NE(std::string::npos, after_pos);
+  EXPECT_LT(before_pos, after_pos)
+      << before << " is not before " << after << " in " << s;
+}
+
+// Runs WriteCCOps and stores output in (internal_)cc_file_path and
+// (internal_)h_file_path.
+void GenerateCcOpFiles(Env* env, const OpList& ops,
+                       const ApiDefMap& api_def_map, string* h_file_text,
+                       string* internal_h_file_text) {
+  const string& tmpdir = testing::TmpDir();
+
+  const auto h_file_path = io::JoinPath(tmpdir, "test.h");
+  const auto cc_file_path = io::JoinPath(tmpdir, "test.cc");
+  const auto internal_h_file_path = io::JoinPath(tmpdir, "test_internal.h");
+  const auto internal_cc_file_path = io::JoinPath(tmpdir, "test_internal.cc");
+
+  WriteCCOps(ops, api_def_map, h_file_path, cc_file_path, kOverridesFnames);
+
+  TF_ASSERT_OK(ReadFileToString(env, h_file_path, h_file_text));
+  TF_ASSERT_OK(
+      ReadFileToString(env, internal_h_file_path, internal_h_file_text));
+}
+
+TEST(CcOpGenTest, TestVisibilityChangedToHidden) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  visibility: HIDDEN
+}
+)";
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs);  // NOLINT
+  ApiDefMap api_def_map(op_defs);
+
+  string h_file_text, internal_h_file_text;
+  // Without ApiDef
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(h_file_text, "class Foo");
+  ExpectDoesNotHaveSubstr(internal_h_file_text, "class Foo");
+
+  // With ApiDef
+  TF_ASSERT_OK(api_def_map.LoadApiDef(api_def));
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(internal_h_file_text, "class Foo");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo");
+}
+
+TEST(CcOpGenTest, TestArgNameChanges) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  arg_order: "dim"
+  arg_order: "images"
+}
+)";
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs);  // NOLINT
+
+  ApiDefMap api_def_map(op_defs);
+  string cc_file_text, h_file_text;
+  string internal_cc_file_text, internal_h_file_text;
+  // Without ApiDef
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectSubstrOrder(h_file_text, "Input images", "Input dim");
+
+  // With ApiDef
+  TF_ASSERT_OK(api_def_map.LoadApiDef(api_def));
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectSubstrOrder(h_file_text, "Input dim", "Input images");
+}
+
+TEST(CcOpGenTest, TestEndpoints) {
+  const string api_def = R"(
+op {
+  graph_op_name: "Foo"
+  endpoint {
+    name: "Foo1"
+  }
+  endpoint {
+    name: "Foo2"
+  }
+}
+)";
+  Env* env = Env::Default();
+  OpList op_defs;
+  protobuf::TextFormat::ParseFromString(kBaseOpDef, &op_defs);  // NOLINT
+
+  ApiDefMap api_def_map(op_defs);
+  string cc_file_text, h_file_text;
+  string internal_cc_file_text, internal_h_file_text;
+  // Without ApiDef
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(h_file_text, "class Foo {");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo1");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo2");
+
+  // With ApiDef
+  TF_ASSERT_OK(api_def_map.LoadApiDef(api_def));
+  GenerateCcOpFiles(env, op_defs, api_def_map, &h_file_text,
+                    &internal_h_file_text);
+  ExpectHasSubstr(h_file_text, "class Foo1");
+  ExpectHasSubstr(h_file_text, "typedef Foo1 Foo2");
+  ExpectDoesNotHaveSubstr(h_file_text, "class Foo {");
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 09fadfcab51575798286876f9a4e0ee9a60940ac..13a3bba5e6d5ca19ff3f0eca76665ba7d3ab628d 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -196,6 +196,18 @@ Status MaxPoolGradV2Helper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("MaxPoolV2", MaxPoolGradV2Helper);
 
+Status LRNGradHelper(const Scope& scope, const Operation& op,
+                     const std::vector<Output>& grad_inputs,
+                     std::vector<Output>* grad_outputs){
+  internal::LRNGrad::Attrs grad_attrs;
+
+  auto dx = internal::LRNGrad(scope, grad_inputs[0], op.input(0), op.output(0),
+                              grad_attrs);
+  grad_outputs->push_back(dx);
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("LRN", LRNGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index ac66f51cf01911957722e94ca28e8e78dc6de2ed..f9063e836509669d81d03b1d2f0d32d1166b6eca 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -191,5 +191,12 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) {
   RunTest(x, x_init_value, y, y_shape);
 }
 
+TEST_F(NNGradTest, LRN){
+  TensorShape x_shape({1, 1, 2, 1});
+  auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
+  auto y = LRN(scope_, x);
+  RunTest(x, x_shape, y, x_shape);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/ops/op_gen_overrides.pbtxt b/tensorflow/cc/ops/op_gen_overrides.pbtxt
index 0184c82c5afc99990530b902efdf670a2bdbc4bc..4aac990e748b0a79cbc3b353b4121a582b0883b0 100644
--- a/tensorflow/cc/ops/op_gen_overrides.pbtxt
+++ b/tensorflow/cc/ops/op_gen_overrides.pbtxt
@@ -11,7 +11,7 @@ op { name: "Reverse" skip: true }
 op { name: "ReverseV2" rename_to: "Reverse" }
 op { name: "Split" input_rename: { from: "split_dim" to: "axis" } }
 op { name: "SplitV" input_rename: { from: "split_dim" to: "axis" } }
-op { name: "Squeeze" input_rename: { from: "squeeze_dims" to: "axis" } }
+op { name: "Squeeze" attr_rename: { from: "squeeze_dims" to: "axis" } }
 op { name: "Pack" rename_to: "Stack" }
 op { name: "Unpack" rename_to: "Unstack" }
 op { name: "Select" rename_to: "Where3" input_rename: { from: "t" to: "x" } input_rename: { from: "e" to: "y" } }
diff --git a/tensorflow/cc/saved_model/tag_constants.h b/tensorflow/cc/saved_model/tag_constants.h
index 2b0b2d5c7fb33768494c1781669c1adcb875a579..b71cb263ca42dab7e830c1880ec4b311bc272f82 100644
--- a/tensorflow/cc/saved_model/tag_constants.h
+++ b/tensorflow/cc/saved_model/tag_constants.h
@@ -21,6 +21,9 @@ namespace tensorflow {
 /// Tag for the `gpu` graph.
 constexpr char kSavedModelTagGpu[] = "gpu";
 
+/// Tag for the `tpu` graph.
+constexpr char kSavedModelTagTpu[] = "tpu";
+
 /// Tag for the `serving` graph.
 constexpr char kSavedModelTagServe[] = "serve";
 
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index a9a6ea84319a18a8fbce648391bf5918ff6d9a08..5740c040e309bad8d7e3bdc468c09a3323fb99e0 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -24,7 +24,6 @@ tf_cc_test(
     srcs = ["runtime_test.cc"],
     deps = [
         ":runtime",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -111,6 +110,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index ae22f7edc423247b34895411d19d7a3c21f86d4f..53da2881b60db9ad39565567623eb86f754559af 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -101,21 +101,8 @@ Status ComputeArgSizes(const CompileResult& compile_result,
                        std::vector<int64>* arg_sizes) {
   const xla::ProgramShape& ps = compile_result.program_shape;
   for (int i = 0; i < ps.parameters_size(); ++i) {
-    if (i == ps.parameters_size() - 1 && compile_result.has_context_arg) {
-      // If the compiled function needs a XlaLocalRuntimeContext* arg, it's
-      // always last, and must be represented as an opaque type.
-      const xla::PrimitiveType type = ps.parameters(i).element_type();
-      if (type != xla::OPAQUE) {
-        return errors::InvalidArgument(
-            "expected final context arg to be opaque, but got type: ",
-            xla::PrimitiveType_Name(type), ", from program shape: ",
-            xla::ShapeUtil::HumanString(ps));
-      }
-      arg_sizes->push_back(-1);
-    } else {
-      arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
-          ps.parameters(i), compile_result.pointer_size));
-    }
+    arg_sizes->push_back(xla::ShapeUtil::ByteSizeOf(
+        ps.parameters(i), compile_result.pointer_size));
   }
   return Status::OK();
 }
@@ -165,11 +152,6 @@ string RewriteWithName(const string& name, string code,
 Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
-  if (compile_result.has_context_arg) {
-    // If the compiled function needs a XlaLocalRuntimeContext* arg, it's
-    // always last, and is set in the class constructor.
-    num_args--;
-  }
   if (config.feed_size() != num_args) {
     return errors::InvalidArgument("mismatch between feed_size(",
                                    config.feed_size(), ") and num_args(",
@@ -418,7 +400,7 @@ namespace xla { class ExecutableRunOptions; }
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void {{ENTRY}}(
     void* result, const xla::ExecutableRunOptions* run_options,
-    const void** args, void** temps);
+    const void** args, void** temps, tensorflow::int64* profile_counters);
 
 {{NS_START}}
 // {{CLASS}} represents a computation previously specified in a
@@ -474,7 +456,6 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       data->temp_sizes = TempSizes();
       data->num_temps = kNumTemps;
       data->result_index = kResultIndex;
-      data->requires_runtime_context = {{HAS_CONTEXT_ARG}};
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
@@ -483,7 +464,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
     return *kStaticData;
   }
 
-  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS)
+  {{CLASS}}(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   {{CLASS}}(const {{CLASS}}&) = delete;
@@ -496,8 +477,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   // void set_argN_data(void* data)
   //   Sets the buffer of type T for positional argument N. May be called in
   //   any AllocMode. Must be called before Run to have an affect. Must be
-  //   called in AllocMode::RESULTS_AND_TEMPS_ONLY for each positional argument,
-  //   to set the argument buffers.
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
   //
   // T* argN_data()
   //   Returns the buffer of type T for positional argument N.
@@ -560,8 +541,6 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")},
       {"{{CLASS}}", opts.class_name},
       {"{{ENTRY}}", compile_result.entry_point},
-      {"{{HAS_CONTEXT_ARG}}",
-       compile_result.has_context_arg ? "true" : "false"},
       {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 0f6114666fcc89c631434527d2ae8c92c039ffea..75026c57c04a64186a1e5be6c41e4dd7de8520b7 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -145,11 +145,9 @@ TEST(GenerateHeader, Golden) {
       {
           xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
           xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
-          xla::ShapeUtil::MakeOpaqueShape(),
       },
       xla::ShapeUtil::MakeTupleShape(
           {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}));
-  compile_result.has_context_arg = true;
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
   string header;
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 65f342ce27ef09092f252f791973f245a8cdd6f3..35e50433d63a549bc6fb6a2be9015d7c471509d0 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -19,7 +19,7 @@ namespace xla { class ExecutableRunOptions; }
 // (Implementation detail) Entry point to the function in the object file.
 extern "C" void entry_point(
     void* result, const xla::ExecutableRunOptions* run_options,
-    const void** args, void** temps);
+    const void** args, void** temps, tensorflow::int64* profile_counters);
 
 namespace foo {
 namespace bar {
@@ -48,7 +48,7 @@ namespace bar {
 //   is guaranteed that no thread may call a non-const method.
 //
 // The logical function signature is:
-//   ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): opaque[]) -> (u32[5,6])
+//   ((unknown): f32[1,2], (unknown): s64[3,4]) -> (u32[5,6])
 //
 // Memory stats:
 //   arg bytes total:    104
@@ -58,11 +58,11 @@ namespace bar {
 class MyClass : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
-  static constexpr size_t kNumArgs = 3;
+  static constexpr size_t kNumArgs = 2;
 
   // Byte size of each argument buffer. There are kNumArgs entries.
   static const intptr_t* ArgSizes() {
-    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96, -1};
+    static constexpr intptr_t kArgSizes[kNumArgs] = {8, 96};
     return kArgSizes;
   }
 
@@ -77,7 +77,6 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
       data->temp_sizes = TempSizes();
       data->num_temps = kNumTemps;
       data->result_index = kResultIndex;
-      data->requires_runtime_context = true;
       data->arg_names = StaticArgNames();
       data->result_names = StaticResultNames();
       data->program_shape = StaticProgramShape();
@@ -86,7 +85,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
     return *kStaticData;
   }
 
-  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS)
+  MyClass(AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS)
       : XlaCompiledCpuFunction(StaticData(), alloc_mode) {}
 
   MyClass(const MyClass&) = delete;
@@ -99,8 +98,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   // void set_argN_data(void* data)
   //   Sets the buffer of type T for positional argument N. May be called in
   //   any AllocMode. Must be called before Run to have an affect. Must be
-  //   called in AllocMode::RESULTS_AND_TEMPS_ONLY for each positional argument,
-  //   to set the argument buffers.
+  //   called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional
+  //   argument, to set the argument buffers.
   //
   // T* argN_data()
   //   Returns the buffer of type T for positional argument N.
@@ -236,8 +235,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   // Shape of the args and results.
   static const xla::ProgramShape* StaticProgramShape() {
     static const xla::ProgramShape* kShape = []() {
-      static const char kProto[] = {10,12,16,11,26,2,1,2,42,4,10,2,1,0,10,12,16,5,26,2,3,4,42,4,10,2,1,0,10,2,16,14,18,16,16,13,34,12,16,8,26,2,5,6,42,4,10,2,1,0};
-      static constexpr int kProtoSize = 50;
+      static const char kProto[] = {10,12,16,11,26,2,1,2,42,4,10,2,1,0,10,12,16,5,26,2,3,4,42,4,10,2,1,0,18,16,16,13,34,12,16,8,26,2,5,6,42,4,10,2,1,0};
+      static constexpr int kProtoSize = 46;
       xla::ProgramShape* shape = new xla::ProgramShape;
       shape->ParseFromArray(kProto, kProtoSize);
       return shape;
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 2b8cc6024cb85e4f6269313927ff66d1d9a1cf79..c87f2b75dfa18ad5c3eda4bd6fcbcb3083ef73fd 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -94,9 +94,8 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
       xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform)
           .ValueOrDie();
   xla::Computation computation;
-  TF_RETURN_IF_ERROR(ConvertGraphDefToXla(graph_def, config, client,
-                                          &computation,
-                                          &compile_result->has_context_arg));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToXla(graph_def, config, client, &computation));
   if (!flags.out_session_module.empty()) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::SessionModule> module,
                         computation.Snapshot());
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index 965c2960816b3acc8d2209e6824d88647de0ce14..e03c5b1aa77c1262ed903aae3072ef65f34d80a2 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -34,7 +34,6 @@ struct CompileResult {
   // Contains object file and meta-info.
   std::unique_ptr<xla::cpu::CpuAotCompilationResult> aot;
   xla::ProgramShape program_shape;  // Static shape of args and results.
-  bool has_context_arg = false;     // Is last arg XlaLocalRuntimeContext?
   string entry_point;               // Name of generated function.
   int pointer_size = 0;             // Size of a pointer in bytes.
 };
diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc
index ac79c278c1fdf8b6aedcb52121c767b8ba0ad358..6d603a02eb4ceade6832ba67b2981814ee25327a 100644
--- a/tensorflow/compiler/aot/runtime_test.cc
+++ b/tensorflow/compiler/aot/runtime_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/aot/runtime.h"
 
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 6b037f276ad1d6771b904bb970f45f32ae9531b8..413efd9cea3b6f71574615ad9ca92471ff925781 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -70,7 +70,7 @@ TEST(TFCompileTest, Add) {
 // Run tests that use set_argN_data separately, to avoid accidentally re-using
 // non-existent buffers.
 TEST(TFCompileTest, Add_SetArg) {
-  AddComp add(AddComp::AllocMode::RESULTS_AND_TEMPS_ONLY);
+  AddComp add(AddComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
 
   int32 arg_x = 10;
   int32 arg_y = 32;
@@ -258,7 +258,7 @@ TEST(TFCompileTest, MatMul2_SetArg) {
   Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
 
   foo::bar::MatMulComp matmul(
-      foo::bar::MatMulComp::AllocMode::RESULTS_AND_TEMPS_ONLY);
+      foo::bar::MatMulComp::AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY);
   matmul.set_thread_pool(&device);
 
   // Test using the set_argN_data() methods.
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 363d6925a14dfab8b79617449a73727ab55c4527..542451ed2d14fbceca00c6ccb6e28c1c3a0d4321 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -130,6 +130,10 @@ def tf_library(name, graph, config,
   header_file = name + ".h"
   object_file = name + ".o"
   ep = ("__" + PACKAGE_NAME + "__" + name).replace("/", "_")
+  if type(tfcompile_flags) == type(""):
+    flags = tfcompile_flags
+  else:
+    flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])])
   native.genrule(
       name=("gen_" + name),
       srcs=[
@@ -148,7 +152,7 @@ def tf_library(name, graph, config,
            " --target_triple=" + target_llvm_triple() +
            " --out_header=$(@D)/" + header_file +
            " --out_object=$(@D)/" + object_file +
-           " " + (tfcompile_flags or "")),
+           " " + flags),
       tools=[tfcompile_tool],
       visibility=visibility,
       testonly=testonly,
@@ -185,7 +189,7 @@ def tf_library(name, graph, config,
            " --cpp_class=" + cpp_class +
            " --target_triple=" + target_llvm_triple() +
            " --out_session_module=$(@D)/" + session_module_pb +
-           " " + (tfcompile_flags or "")),
+           " " + flags),
       tools=[tfcompile_tool],
       visibility=visibility,
       testonly=testonly,
@@ -195,8 +199,7 @@ def tf_library(name, graph, config,
 
   # The cc_library rule packaging up the header and object file, and needed
   # kernel implementations.
-  need_xla_data_proto = (tfcompile_flags and
-                         tfcompile_flags.find("--gen_program_shape") != -1)
+  need_xla_data_proto = (flags and flags.find("--gen_program_shape") != -1)
   native.cc_library(
       name=name,
       srcs=[object_file],
@@ -264,7 +267,6 @@ def tf_library(name, graph, config,
         srcs=[test_file],
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/tf2xla:xla_local_runtime_context",
             "@org_tensorflow//tensorflow/compiler/aot:runtime",
             "@org_tensorflow//tensorflow/compiler/aot:tf_library_test_main",
             "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
@@ -310,7 +312,6 @@ def tf_library(name, graph, config,
         linkopts = if_android(["-pie", "-s"]),
         deps=[
             ":" + name,
-            "@org_tensorflow//tensorflow/compiler/tf2xla:xla_local_runtime_context",
             "@org_tensorflow//tensorflow/compiler/aot:benchmark",
             "@org_tensorflow//tensorflow/compiler/aot:runtime",
             "@org_tensorflow//tensorflow/compiler/xla:executable_run_options",
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index bf7d9cf14d10f41aa48ea594a8d63db97b9973e1..026a1bf879d373fd0f5f4444b3ce10d01702f82b 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -251,6 +251,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 22899ebeebc929055518893b358f7950d380d6f6..dc06b7a4025ddc83bf766b702036297203c16e55 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -48,6 +49,52 @@ const char* const kXlaNumResourceArgsAttr = "_XlaNumResourceArgs";
 
 namespace {
 
+bool AreAllParentsConst(const Node& n,
+                        const gtl::FlatSet<const Node*>& runtime_const_nodes) {
+  if (n.type_string() == "GuaranteeConst" || n.type_string() == "Const") {
+    // If the current node is itself a cast-to-const, no need
+    // to look at the incoming edges.
+    return true;
+  }
+
+  bool all_parents_const = true;
+  bool atleast_one_non_control_edge = false;
+  for (const Edge* in : n.in_edges()) {
+    atleast_one_non_control_edge =
+        atleast_one_non_control_edge || !in->IsControlEdge();
+    if (!in->IsControlEdge() && runtime_const_nodes.count(in->src()) == 0) {
+      all_parents_const = false;
+      break;
+    }
+  }
+  return all_parents_const && atleast_one_non_control_edge;
+}
+
+void MarkGuaranteedConstants(
+    const Graph& graph,
+    const std::vector<std::pair<Node*, Node*>>& src_arg_pairs) {
+  gtl::FlatSet<const Node*> guaranteed_const_nodes;
+  std::vector<Node*> srcs;
+  srcs.reserve(src_arg_pairs.size());
+  for (const auto& src_arg : src_arg_pairs) {
+    srcs.push_back(src_arg.first);
+  }
+  ReverseDFSFrom(graph, srcs, /*enter=*/nullptr,
+                 /*leave=*/[&guaranteed_const_nodes](Node* n) {
+                   // TODO(vinuraja): Doesn't work in the presence of loops.
+                   if (AreAllParentsConst(*n, guaranteed_const_nodes)) {
+                     guaranteed_const_nodes.insert(n);
+                   }
+                 });
+
+  for (auto& src_arg : src_arg_pairs) {
+    if (guaranteed_const_nodes.count(src_arg.first) != 0) {
+      VLOG(1) << "Guaranteed const found: " << src_arg.first->DebugString();
+      src_arg.second->AddAttr("_is_guaranteed_constant", true);
+    }
+  }
+}
+
 // A node/slot pair.
 // TODO(phawkins): is there a common definition of this?
 struct NodeSlot {
@@ -175,9 +222,11 @@ Status Encapsulator::SplitIntoSubgraphs() {
   // Map from input graph nodes to subgraph nodes.
   std::unordered_map<Node*, Node*> node_images;
 
+  std::vector<std::pair<Node*, Node*>> src_arg_pairs;
   // Copy all marked nodes to a subgraph. Do nothing for unmarked nodes.
   for (Node* node : graph_in_->op_nodes()) {
     string func_id = GetFunctionNameAttr(node);
+
     if (func_id.empty()) continue;
 
     Subgraph& subgraph = subgraphs_[func_id];
@@ -276,11 +325,13 @@ Status Encapsulator::SplitIntoSubgraphs() {
                                kArgOp);
         builder.Attr("T", dtype);
         builder.Attr("index", arg_index);
+
         s = builder.Finalize(&arg_def);
         if (!s.ok()) return s;
 
         Node* arg = dst_subgraph.graph->AddNode(arg_def, &s);
         if (!s.ok()) return s;
+        src_arg_pairs.push_back({edge->src(), arg});
 
         dst_subgraph.args.push_back(arg);
       }
@@ -292,6 +343,8 @@ Status Encapsulator::SplitIntoSubgraphs() {
     }
   }
 
+  MarkGuaranteedConstants(*graph_in_, src_arg_pairs);
+
   for (auto& entry : subgraphs_) {
     FixupSourceAndSinkEdges(entry.second.graph.get());
   }
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 4a1dbaf05dc7824835f3567c6abcf48222720230..717efb360185f1ce26ee1e9adb0ee5bf7f4799f8 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -398,5 +398,109 @@ TEST(EncapsulateSubgraphsTest, ParallelChecking) {
   EXPECT_EQ(expected_edges, GraphEdges(*graph));
 }
 
+const Node* FindNodeByName(const Graph& graph, const string& name) {
+  for (const Node* node : graph.nodes()) {
+    if (node->name() == name) return node;
+  }
+  return nullptr;
+}
+
+bool HasGuaranteeConstAttr(const Node& n) {
+  bool is_guaranteed_constant = false;
+  if (!GetNodeAttr(n.attrs(), "_is_guaranteed_constant",
+                   &is_guaranteed_constant)
+           .ok()) {
+    return false;
+  }
+  return is_guaranteed_constant;
+}
+
+TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
+  Scope root = Scope::NewRootScope().ExitOnError().WithDevice(
+      "/job:localhost/replica:0/task:0/cpu:0");
+  auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT);
+  auto const_x2 = ops::Const(root.WithOpName("const_x2"), 10.0f);
+  auto const_guarantee_x1 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1);
+  auto add1 = ops::Add(root.WithOpName("add1"), const_guarantee_x1, const_x2);
+  add1.node()->AddAttr("_encapsulate", "encapsulate1");
+
+  Graph graph_before(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph_before));
+
+  std::unique_ptr<Graph> graph_after;
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  int guaranteed_consts = 0;
+  TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
+      "_encapsulate", graph_before,
+      /*rewrite_subgraph_fn=*/
+      [&guaranteed_consts](std::unique_ptr<Graph>* graph_ptr,
+                           std::vector<int>* input_permutation,
+                           std::vector<int>* output_permutation,
+                           NodeDef* call_def) {
+        Graph* graph = graph_ptr->get();
+        for (const Node* n : graph->nodes()) {
+          if (n->type_string() == "_Arg" &&
+              StringPiece(n->name()).starts_with("const")) {
+            ++guaranteed_consts;
+            EXPECT_TRUE(HasGuaranteeConstAttr(*n));
+          } else {
+            EXPECT_FALSE(HasGuaranteeConstAttr(*n));
+          }
+        }
+        return Status::OK();
+      },
+      /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph_after, &library));
+  EXPECT_EQ(2, guaranteed_consts);
+}
+
+TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
+  Scope root = Scope::NewRootScope().ExitOnError().WithDevice(
+      "/job:localhost/replica:0/task:0/cpu:0");
+  auto x1 = ops::Placeholder(root.WithOpName("x1"), DT_FLOAT);
+  auto x2 = ops::Placeholder(root.WithOpName("x2"), DT_FLOAT);
+  auto const_guarantee_x1 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x1"), x1);
+  auto const_guarantee_x2 =
+      ops::GuaranteeConst(root.WithOpName("const_guarantee_x2"), x2);
+  auto const_guarantee_add1 = ops::Add(root.WithOpName("const_guarantee_add1"),
+                                       const_guarantee_x1, const_guarantee_x2);
+  auto add2 = ops::Add(root.WithOpName("add2"), const_guarantee_x1, x2);
+  auto mul1 = ops::Mul(root.WithOpName("mul1"), const_guarantee_add1, add2);
+  mul1.node()->AddAttr("_encapsulate", "encapsulate1");
+
+  Graph graph_before(OpRegistry::Global());
+  TF_ASSERT_OK(root.ToGraph(&graph_before));
+
+  std::unique_ptr<Graph> graph_after;
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  int guaranteed_consts = 0;
+  TF_ASSERT_OK(EncapsulateSubgraphsInFunctions(
+      "_encapsulate", graph_before,
+      /*rewrite_subgraph_fn=*/
+      [&guaranteed_consts](std::unique_ptr<Graph>* graph_ptr,
+                           std::vector<int>* input_permutation,
+                           std::vector<int>* output_permutation,
+                           NodeDef* call_def) {
+        Graph* graph = graph_ptr->get();
+        for (const Node* n : graph->nodes()) {
+          if (n->type_string() == "_Arg" &&
+              StringPiece(n->name()).starts_with("const")) {
+            ++guaranteed_consts;
+            EXPECT_TRUE(HasGuaranteeConstAttr(*n));
+          } else {
+            EXPECT_FALSE(HasGuaranteeConstAttr(*n));
+          }
+        }
+        return Status::OK();
+      },
+      /*parallel_checking=*/false,
+      /*reuse_existing_functions=*/false, &graph_after, &library));
+  // Only 1 runtime const, which is const_guarantee_add1. Add2 has one const
+  // and another non-const, so overall non-const.
+  EXPECT_EQ(1, guaranteed_consts);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 459a582e157f5ddc63997ca93e7c0294293517d3..9bea5663319c8a25249fdc265cee0191556a7c04 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -16,7 +16,6 @@ cc_library(
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 27c5da08c112664d361b5f969d100eed7b9df65c..39a770ab7b9ae56bd24865b86c69331b0a38ccec 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
@@ -103,7 +102,6 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
   }
   void* data =
       reinterpret_cast<void*>(const_cast<char*>(t.tensor_data().data()));
-  TF_RET_CHECK(data != nullptr);
   tensors_[data] = t;
   return gpu::DeviceMemoryBase(data, size);
 }
@@ -111,7 +109,6 @@ xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
 Status XlaAllocator::RegisterArgument(const Tensor* t) {
   void* data =
       reinterpret_cast<void*>(const_cast<char*>(t->tensor_data().data()));
-  TF_RET_CHECK(data != nullptr);
   tensors_[data] = *t;
   return Status::OK();
 }
@@ -257,7 +254,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
-  options.local_executable_has_hybrid_result = true;
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -268,7 +264,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   // Builds an XLA allocator for the device.
   XlaAllocator xla_allocator(client->platform(), ctx);
-  XlaLocalRuntimeContext local_runtime_context;
 
   std::unique_ptr<xla::ShapedBuffer> output;
   // Build xla::ShapedBuffers that point directly to the Tensor buffers.
@@ -301,18 +296,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
     OP_REQUIRES_OK(ctx, xla_allocator.RegisterArgument(t));
   }
 
-  // Make the final parameter point at local_runtime_context.
-  if (kernel->requires_runtime_context) {
-    gpu::DeviceMemoryBase local_runtime_context_dmem(
-        &local_runtime_context, sizeof(local_runtime_context));
-    arg_buffers.push_back(
-        xla::ShapedBuffer::MakeArrayShapedBuffer(
-            xla::ShapeUtil::MakeOpaqueShape(), client->platform(),
-            client->default_device_ordinal(), local_runtime_context_dmem)
-            .ConsumeValueOrDie());
-    arg_ptrs.push_back(arg_buffers.back().get());
-  }
-
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
@@ -324,12 +307,6 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   auto run_result = executable->Run(arg_ptrs, run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
-  if (local_runtime_context.error) {
-    ctx->CtxFailure(errors::InvalidArgument("Compiled kernel returned error: ",
-                                            local_runtime_context.error_msg));
-    return;
-  }
-
   output = run_result.ConsumeValueOrDie()->release();
   auto elapsed = env->NowMicros() - start_time;
   VLOG(2) << "Elapsed time: " << elapsed << "us";
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
index 09aee39d8cd0e910320674fcfd8a7884ce2fdd04..4bc209b7ecf499d82e7567f7eff12b17cefa9863 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
+++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
@@ -39,21 +39,23 @@ static void AllocateFlags() {
   flags->tf_xla_min_cluster_size = 2;
   flags->tf_xla_max_cluster_size = std::numeric_limits<int32>::max();
   flags->tf_xla_clustering_debug = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
-           "Control compilation of operators into XLA computations on CPU and "
-           "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
-           "things very likely to be improved; 2 = on for everything.  "
-           "Experimental."),
-      Flag("tf_xla_min_cluster_size", &flags->tf_xla_min_cluster_size,
-           "Minimum number of operators in an XLA compilation. Ignored for "
-           "operators placed on an XLA device or operators explicitly marked "
-           "for compilation."),
-      Flag("tf_xla_max_cluster_size", &flags->tf_xla_max_cluster_size,
-           "Maximum number of operators in an XLA compilation."),
-      Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug,
-           "Dump graphs during XLA compilation."),
-  });
+  flags->tf_xla_cpu_global_jit = false;
+  flag_list = new std::vector<Flag>(
+      {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
+            "Control compilation of operators into XLA computations on CPU and "
+            "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
+            "things very likely to be improved; 2 = on for everything.  "
+            "Experimental."),
+       Flag("tf_xla_min_cluster_size", &flags->tf_xla_min_cluster_size,
+            "Minimum number of operators in an XLA compilation. Ignored for "
+            "operators placed on an XLA device or operators explicitly marked "
+            "for compilation."),
+       Flag("tf_xla_max_cluster_size", &flags->tf_xla_max_cluster_size,
+            "Maximum number of operators in an XLA compilation."),
+       Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug,
+            "Dump graphs during XLA compilation."),
+       Flag("tf_xla_cpu_global_jit", &flags->tf_xla_cpu_global_jit,
+            "Enables global JIT compilation for CPU via SessionOptions.")});
   xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
 }
 
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
index 24f80507428b6742c64d3d7e96e4b1c540eda01b..e1ccd7ddb8706ca445b6811ca1fec369af7cd5d5 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
+++ b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
@@ -46,6 +46,8 @@ typedef struct {
   int32 tf_xla_max_cluster_size;  // Maximum number of operators in an XLA
                                   // compilation.
   bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
+  bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
+                                  // via SessionOptions.
 } MarkForCompilationPassFlags;
 
 // Return a pointer to the MarkForCompilationPassFlags struct;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 78d0aa86a8fae9a0c6035bdc579ef800337df917..aceedeb823ac47a36435e36e586f219d313ed121 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -210,6 +210,13 @@ Status FindCompilationCandidates(
         !IsCompilableWhile(*node, jit_device_type, 0, lib_runtime)) {
       continue;
     }
+    // _Retval nodes in a top-level function represent fetches.
+    // Do not compile them.
+    if (node->type_string() == "_Retval") {
+      VLOG(2) << "Compilation rejected node: return value " << node->name()
+              << ": " << node->type_string();
+      continue;
+    }
     candidates->insert(node);
   }
   return Status::OK();
@@ -290,9 +297,11 @@ Status MarkForCompilationPass::Run(
     global_jit_level =
         static_cast<OptimizerOptions::GlobalJitLevel>(flags->tf_xla_auto_jit);
   }
+  bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
-  auto is_compilable = [global_jit_level, fld](const Node* node,
-                                               const DeviceType& device_type) {
+
+  auto is_compilable = [global_jit_level, cpu_global_jit, fld](
+                           const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
                                              &registration)) {
@@ -315,7 +324,11 @@ Status MarkForCompilationPass::Run(
     if (status.ok()) return compile;
 
     // Otherwise use the value of global_jit_level.
-    return registration->enable_jit_by_default && global_jit_level > 0;
+    // Ignore enable_jit_by_default if global jit compilation for CPU
+    // is explicitly requested via tf_xla_cpu_global_jit flag
+    bool ignore_registration = cpu_global_jit && device_type == DEVICE_CPU;
+    return (ignore_registration || registration->enable_jit_by_default) &&
+           global_jit_level > 0;
   };
   return RunImpl(options, is_compilable);
 }
@@ -556,6 +569,7 @@ Status MarkForCompilationPass::RunImpl(
     if (cluster_sizes[cluster] >= min_cluster_size || marked_for_compilation ||
         registration->requires_compilation) {
       string& name = cluster_names[cluster];
+
       if (name.empty()) {
         name = strings::StrCat("cluster_", cluster_sequence_num++);
       }
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index b3d258aea177fbefa4bae51d8156da2ff86c9032..454f0aeae98d7afd51f12b2cfb1810de275a57f7 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -525,5 +525,32 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
                             "+-- c\n"));
 }
 
+TEST(XlaCompilationTest, Retval) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_FLOAT)
+                                         .WithAttr("value", Tensor()));
+    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
+    ops::UnaryOp("_Retval", b,
+                 builder.opts()
+                     .WithName("R")
+                     .WithAttr("T", DT_FLOAT)
+                     .WithAttr("index", 0));
+
+    TF_EXPECT_OK(builder.ToGraph(graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_EQ(2, clusters.size());
+  EXPECT_TRUE(clusters.find("R") == clusters.cend());
+  EXPECT_EQ(clusters["A"], clusters["B"]);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 23368b6c76a363882956577a20c1bd041211d234..3717c2cc24283e0b218f92ec820d16893cbe0c35 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -214,23 +214,15 @@ Status XlaCompilationCache::BuildExecutable(
     const XlaCompiler::CompilationResult& result,
     std::unique_ptr<xla::LocalExecutable>* executable) {
   VLOG(2) << "Compiling to local executable";
-  xla::Shape opaque_shape = xla::ShapeUtil::MakeOpaqueShape();
 
   std::vector<const xla::Shape*> argument_layouts(
       result.xla_input_shapes.size());
   for (int i = 0; i < result.xla_input_shapes.size(); ++i) {
     argument_layouts[i] = &result.xla_input_shapes[i];
   }
-  if (result.requires_runtime_context) {
-    // The final arg is the XlaLocalRuntimeContext*.
-    argument_layouts.push_back(&opaque_shape);
-  }
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client_->default_device_ordinal());
-  build_options.set_platform(client_->platform());
   build_options.set_result_layout(result.xla_output_shape);
-  build_options.set_has_hybrid_result(
-      options.local_executable_has_hybrid_result);
 
   auto compile_result =
       client_->Compile(*result.computation, argument_layouts, build_options);
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 0ff99c5156ded2ae05c6976e3da8f31fce32f8f2..8ace678daa1e9c69af72b65941586ef63a7757a5 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -117,6 +117,33 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "categorical_op_test",
+    size = "small",
+    srcs = ["categorical_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+    ],
+)
+
+tf_xla_py_test(
+    name = "cholesky_op_test",
+    size = "small",
+    srcs = ["cholesky_op_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
 tf_xla_py_test(
     name = "clustering_test",
     size = "small",
@@ -252,6 +279,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "image_ops_test",
+    size = "small",
+    srcs = ["image_ops_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:image_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "lrn_ops_test",
     size = "medium",
@@ -389,6 +429,20 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "scan_ops_test",
+    size = "small",
+    srcs = ["scan_ops_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "segment_reduction_ops_test",
     size = "medium",
@@ -430,6 +484,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "stateless_random_ops_test",
+    size = "small",
+    srcs = ["stateless_random_ops_test.py"],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/contrib/stateless",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "tensor_array_ops_test",
     size = "small",
@@ -645,7 +712,7 @@ tf_library(
     cpp_class = "LSTMLayerInference",
     graph = "lstm_layer_inference.pbtxt",
     tags = ["manual"],
-    tfcompile_flags = "--xla_cpu_multi_thread_eigen=false",
+    tfcompile_flags = ["--xla_cpu_multi_thread_eigen=false"],
 )
 
 # -----------------------------------------------------------------------------
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index d412c572ae16b84c2434819aa0a2d881defef5f9..654dc15e86b21c7742d49281d53c1a75e6a45d3b 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -366,16 +366,52 @@ class BinaryOpsTest(XLATestCase):
 
       self._testBinary(
           gen_math_ops._real_div,
-          np.array([3, 3j, -1.5j, -8, 2 + 3j, 2 + 4j, 44 + 3j], dtype=dtype),
-          np.array([2, -2, 7j, -4j, 4 - 6j, 1 + 2j, 0], dtype=dtype),
+          np.array([3, 3j, -1.5j, -8, 2 + 3j, 2 + 4j], dtype=dtype),
+          np.array([2, -2, 7j, -4j, 4 - 6j, 1 + 2j], dtype=dtype),
+          expected=np.array(
+              [1.5, -1.5j, -0.2142857, -2j, (2 + 3j) / (4 - 6j), 2],
+              dtype=dtype))
+
+      # Test inf/nan scenarios.
+      self._testBinary(
+          gen_math_ops._real_div,
+          np.array([4 + 3j, 4, 3j, -4, -4j, 2 - 3j], dtype=dtype),
+          np.array([0, 0, 0, 0, 0, 0], dtype=dtype),
           expected=np.array(
               [
-                  1.5, -1.5j, -0.2142857, -2j, (2 + 3j) / (4 - 6j), 2,
-                  float("inf")
+                  dtype(1 + 1j) / 0,
+                  dtype(1) / 0,
+                  dtype(1j) / 0,
+                  dtype(-1) / 0,
+                  dtype(-1j) / 0,
+                  dtype(1 - 1j) / 0
               ],
               dtype=dtype))
 
-      # TODO(b/65408531): support+test pow for cplx
+      atan2_supported = self.device == "XLA_GPU"
+      if atan2_supported:
+        self._testBinary(
+            math_ops.pow,
+            dtype(3 + 2j),
+            dtype(4 - 5j),
+            expected=np.power(dtype(3 + 2j), dtype(4 - 5j)))
+        self._testBinary(  # empty rhs
+            math_ops.pow,
+            np.array([1 + 2j, 2 - 3j], dtype=dtype),
+            np.zeros(shape=[0, 2], dtype=dtype),
+            expected=np.zeros(shape=[0, 2], dtype=dtype))
+        self._testBinary(  # to zero power
+            math_ops.pow,
+            np.array([1 + 2j, 2 - 3j], dtype=dtype),
+            np.zeros(shape=[1, 2], dtype=dtype),
+            expected=np.ones(shape=[1, 2], dtype=dtype))
+        lhs = np.array([1 - 2j, 4 + 3j, 2 - 3j, 3, 2j, 1, 4], dtype=dtype)
+        rhs = np.array([2, 3j, 3 + 4j, 2 + 3j, 3 - 2j, 2, 3 + 3j], dtype=dtype)
+        scalar = dtype(2 + 2j)
+        self._testBinary(math_ops.pow, lhs, rhs, expected=np.power(lhs, rhs))
+        self._testBinary(
+            math_ops.pow, scalar, rhs, expected=np.power(scalar, rhs))
+        self._testBinary(math_ops.pow, lhs, scalar, np.power(lhs, scalar))
 
       lhs = np.array([4 + 2j, -3 - 1j, 2j, 1], dtype=dtype)
       rhs = np.array([5, -6j, 7 - 3j, -8j], dtype=dtype)
@@ -385,7 +421,9 @@ class BinaryOpsTest(XLATestCase):
       self._testBinary(
           gen_math_ops._sigmoid_grad, lhs, rhs, expected=rhs * lhs * (1 - lhs))
 
-      # TODO(b/65408531): support+test _rsqrt_grad for cplx (needs pow)
+      if atan2_supported:
+        self._testBinary(
+            gen_math_ops._rsqrt_grad, lhs, rhs, expected=lhs**3 * rhs / -2)
 
       self._testBinary(
           gen_math_ops._sqrt_grad, lhs, rhs, expected=rhs / (2 * lhs))
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..035cdea1786d39f3d21bb63be5c8ccffe1608bdf
--- /dev/null
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -0,0 +1,143 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multinomial generation ops in the XLA JIT compiler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import googletest
+
+
+# TODO(srvasude): Merge this with
+# third_party/tensorflow/python/kernel_tests/random/multinomial_op_test.py.
+class CategoricalTest(XLATestCase):
+  """Test cases for random-number generating operators."""
+
+  def output_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _chi2(self, expected, actual):
+    """Returns Chi2 GOF statistic."""
+    actual = np.asarray(actual)
+    expected = np.asarray(expected)
+    diff = actual - expected
+    chi2 = np.sum(diff * diff / expected)
+    return chi2
+
+  def _do_sampling(self, logits, num_samples):
+    """Categorical samples from given input.
+
+    Args:
+      logits: Numpy ndarray of shape [batch_size, num_classes].
+      num_samples: Int; number of samples to draw.
+
+    Returns:
+      Frequencies from sampled classes; shape [batch_size, num_classes].
+    """
+    with self.test_session() as sess, self.test_scope():
+      random_seed.set_random_seed(1618)
+      op = random_ops.multinomial(logits, num_samples,
+                                  output_dtype=dtypes.int32)
+      d = sess.run(op)
+
+    batch_size, num_classes = logits.shape
+    freqs_mat = []
+    for i in range(batch_size):
+      cnts = dict(collections.Counter(d[i, :]))
+
+      # Requires drawn class labels be in range.
+      self.assertLess(max(cnts.keys()), num_classes)
+      self.assertGreaterEqual(min(cnts.keys()), 0)
+
+      freqs = [(cnts[k] * 1. / num_samples if k in cnts else 0)
+               for k in range(num_classes)]
+      freqs_mat.append(freqs)
+
+    return freqs_mat
+
+  def _testRngIsNotConstant(self, rng, dtype, output_dtype):
+    # Tests that 'rng' does not always return the same value.
+    with self.test_session() as sess:
+      with self.test_scope():
+        x = rng(dtype, output_dtype)
+
+      # The random-number generator, if working correctly, should produce the
+      # same output multiple times with low probability.
+      y = sess.run(x)
+      z = sess.run(x)
+      w = sess.run(x)
+
+      # We use exact equality here. If the random-number generator is producing
+      # deterministic output, all three outputs will be bitwise identical.
+      self.assertTrue((not np.array_equal(y, z)) or
+                      (not np.array_equal(z, w)) or
+                      (not np.array_equal(y, w)))
+
+  def testCategoricalIsNotConstant(self):
+    def rng(dtype, output_dtype):
+      return random_ops.multinomial(np.array([[1., 1., 1.]], dtype=dtype), 10,
+                                    output_dtype=output_dtype)
+
+    dtype = np.float32
+    for output_dtype in self.output_dtypes():
+      self._testRngIsNotConstant(rng, dtype, output_dtype)
+
+  def testCategoricalIsInRange(self):
+    for dtype in self.float_types:
+      for output_dtype in self.output_dtypes():
+        with self.test_session() as sess:
+          with self.test_scope():
+            x = random_ops.multinomial(
+                array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
+                output_dtype=output_dtype)
+          y = sess.run(x)
+          self.assertTrue((y >= 0).sum() == 1000)
+          self.assertTrue((y < 20).sum() == 1000)
+
+  def testSamplingCorrectness(self):
+    np.random.seed(1618)  # Make it reproducible.
+    num_samples = 21000
+
+    rand_probs = np.random.dirichlet([1., 1., 2., 3.])
+    rand_probs2 = np.random.dirichlet([1., 4., 5.], size=3)  # batched
+    for probs in [[.5, .5], [.85, .05, .1], rand_probs, rand_probs2]:
+      probs = np.asarray(probs)
+      if len(probs.shape) == 1:
+        probs = probs.reshape(1, probs.size)  # singleton batch
+
+      logits = np.log(probs).astype(np.float32)
+      freqs = self._do_sampling(logits, num_samples)
+
+      # the test here is similar to
+      # python/kernel_tests/random/multinomial_op_test.py
+      # Note that df >= 1 in all these cases. Choosing a cutoff of 1e-3
+      # corresponds to an alpha value of 2.5% for df = 1, and smaller for larger
+      # df.
+      chi2 = self._chi2(probs, freqs)
+      self.assertLess(chi2, 1e-3)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5010fe5e21d0782e68d4e6d5bf6b4df1b44793a3
--- /dev/null
+++ b/tensorflow/compiler/tests/cholesky_op_test.py
@@ -0,0 +1,126 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.Cholesky."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class CholeskyOpTest(XLATestCase):
+
+  def _verifyCholeskyBase(self, sess, placeholder, x, chol, verification, atol):
+    chol_np, verification_np = sess.run([chol, verification], {placeholder: x})
+    self.assertAllClose(x, verification_np, atol=atol)
+    self.assertShapeEqual(x, chol)
+    # Check that the cholesky is lower triangular, and has positive diagonal
+    # elements.
+    if chol_np.shape[-1] > 0:
+      chol_reshaped = np.reshape(chol_np, (-1, chol_np.shape[-2],
+                                           chol_np.shape[-1]))
+      for chol_matrix in chol_reshaped:
+        self.assertAllClose(chol_matrix, np.tril(chol_matrix), atol=atol)
+        self.assertTrue((np.diag(chol_matrix) > 0.0).all())
+
+  def _verifyCholesky(self, x, atol=1e-6):
+    # Verify that LL^T == x.
+    with self.test_session() as sess:
+      placeholder = array_ops.placeholder(
+          dtypes.as_dtype(x.dtype), shape=x.shape)
+      with self.test_scope():
+        chol = linalg_ops.cholesky(placeholder)
+      verification = math_ops.matmul(chol, chol, adjoint_b=True)
+      self._verifyCholeskyBase(sess, placeholder, x, chol, verification, atol)
+
+  def testBasic(self):
+    data = np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]])
+    for dtype in self.float_types:
+      self._verifyCholesky(data.astype(dtype))
+
+  def testBatch(self):
+    for dtype in self.float_types:
+      simple_array = np.array(
+          [[[1., 0.], [0., 5.]]], dtype=dtype)  # shape (1, 2, 2)
+      self._verifyCholesky(simple_array)
+      self._verifyCholesky(np.vstack((simple_array, simple_array)))
+      odd_sized_array = np.array(
+          [[[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]], dtype=dtype)
+      self._verifyCholesky(np.vstack((odd_sized_array, odd_sized_array)))
+
+      # Generate random positive-definite matrices.
+      matrices = np.random.rand(10, 5, 5).astype(dtype)
+      for i in xrange(10):
+        matrices[i] = np.dot(matrices[i].T, matrices[i])
+      self._verifyCholesky(matrices, atol=1e-4)
+
+  def testNonSquareMatrix(self):
+    for dtype in self.float_types:
+      with self.assertRaises(ValueError):
+        linalg_ops.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]], dtype=dtype))
+      with self.assertRaises(ValueError):
+        linalg_ops.cholesky(
+            np.array(
+                [[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]]],
+                dtype=dtype))
+
+  def testWrongDimensions(self):
+    for dtype in self.float_types:
+      tensor3 = constant_op.constant([1., 2.], dtype=dtype)
+      with self.assertRaises(ValueError):
+        linalg_ops.cholesky(tensor3)
+      with self.assertRaises(ValueError):
+        linalg_ops.cholesky(tensor3)
+
+  @unittest.skip("Test is slow")
+  def testLarge(self):
+    n = 200
+    shape = (n, n)
+    data = np.ones(shape).astype(np.float32) / (2.0 * n) + np.diag(
+        np.ones(n).astype(np.float32))
+    self._verifyCholesky(data, atol=1e-4)
+
+  def testMatrixConditionNumbers(self):
+    for dtype in self.float_types:
+      condition_number = 1000
+      size = 20
+
+      # Generate random positive-definite symmetric matrices, and take their
+      # Eigendecomposition.
+      matrix = np.random.rand(size, size)
+      matrix = np.dot(matrix.T, matrix)
+      _, w = np.linalg.eigh(matrix)
+
+      # Build new Eigenvalues exponentially distributed between 1 and
+      # 1/condition_number
+      v = np.exp(-np.log(condition_number) * np.linspace(0, size, size) / size)
+      matrix = np.dot(np.dot(w, np.diag(v)), w.T).astype(dtype)
+      self._verifyCholesky(matrix, atol=1e-4)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index cbe2888696c87c6c2f50c3de71e8531977ea395a..11d8a99ffe1a136a54b16e20f1792062203f7969 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -24,10 +24,12 @@ from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import googletest
 
 
+@test_util.with_c_api
 class FunctionTest(XLATestCase):
 
   def testFunction(self):
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index 936fcf8b6be0f8cd67ba07a8bef9d35a732d30ba..a80d69fa5f5099b8a8b67df0da9c92b957e9d194 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -36,7 +36,7 @@ class FusedBatchNormTest(XLATestCase):
     x_square = x * x
     x_square_sum = np.sum(x_square, (0, 1, 2))
     x_sum = np.sum(x, axis=(0, 1, 2))
-    element_count = np.size(x) / int(np.shape(x)[0])
+    element_count = np.size(x) / int(np.shape(x)[-1])
     mean = x_sum / element_count
     var = x_square_sum / element_count - mean * mean
     normalized = (x - mean) / np.sqrt(var + epsilon)
@@ -64,8 +64,9 @@ class FusedBatchNormTest(XLATestCase):
     return grad_x, grad_scale, grad_offset
 
   def testInference(self):
-    x_shape = [2, 2, 6, 2]
-    scale_shape = [2]
+    channel = 3
+    x_shape = [2, 2, 6, channel]
+    scale_shape = [channel]
     x_val = np.random.random_sample(x_shape).astype(np.float32)
     scale_val = np.random.random_sample(scale_shape).astype(np.float32)
 
@@ -74,8 +75,9 @@ class FusedBatchNormTest(XLATestCase):
     with self.test_session() as sess, self.test_scope():
       # To avoid constant folding
       t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
-      scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
-      offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
+      scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+      offset = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="offset")
       epsilon = 0.001
       y_ref, mean_ref, var_ref = self._reference_training(
           x_val, scale_val, offset_val, epsilon, data_format)
@@ -97,8 +99,9 @@ class FusedBatchNormTest(XLATestCase):
       self.assertAllClose(y_val, y_ref, atol=1e-3)
 
   def _testLearning(self, use_gradient_checker):
-    x_shape = [2, 2, 6, 2]
-    scale_shape = [2]
+    channel = 3
+    x_shape = [2, 2, 6, channel]
+    scale_shape = [channel]
     x_val = np.random.random_sample(x_shape).astype(np.float32)
     scale_val = np.random.random_sample(scale_shape).astype(np.float32)
 
@@ -109,8 +112,9 @@ class FusedBatchNormTest(XLATestCase):
     with self.test_session() as sess, self.test_scope():
       # To avoid constant folding
       t_val = array_ops.placeholder(np.float32, shape=x_shape, name="x")
-      scale = array_ops.placeholder(np.float32, shape=[2], name="scale")
-      offset = array_ops.placeholder(np.float32, shape=[2], name="offset")
+      scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+      offset = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="offset")
       epsilon = 0.001
       y, mean, var = nn.fused_batch_norm(
           t_val,
@@ -151,11 +155,12 @@ class FusedBatchNormTest(XLATestCase):
   def testLearningWithGradientChecker(self):
     self._testLearning(True)
 
-  def testGradient(self):
+  def testGradientTraining(self):
     # TODO(b/64270657): Use gradient_checker here in addition to comparing with
     # this reference implementation.
-    x_shape = [2, 2, 6, 2]
-    scale_shape = [2]
+    channel = 3
+    x_shape = [2, 2, 6, channel]
+    scale_shape = [channel]
     grad_val = np.random.random_sample(x_shape).astype(np.float32)
     x_val = np.random.random_sample(x_shape).astype(np.float32)
     scale_val = np.random.random_sample(scale_shape).astype(np.float32)
@@ -170,7 +175,7 @@ class FusedBatchNormTest(XLATestCase):
       var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
       grad_x, grad_scale, grad_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
-          grad, x, scale, mean, var, data_format="NHWC")
+          grad, x, scale, mean, var, data_format="NHWC", is_training=True)
 
       grad_x_val, grad_scale_val, grad_offset_val = sess.run(
           [grad_x, grad_scale, grad_offset], {
@@ -188,6 +193,53 @@ class FusedBatchNormTest(XLATestCase):
       self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2)
       self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3)
 
+  def testGradientInference(self):
+    # TODO(b/64270657): Use gradient_checker here in addition to comparing with
+    # this reference implementation.
+    channel = 3
+    x_shape = [2, 2, 6, channel]
+    scale_shape = [channel]
+    grad_val = np.random.random_sample(x_shape).astype(np.float32)
+    x_val = np.random.random_sample(x_shape).astype(np.float32)
+    scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+    mean_val = np.random.random_sample(scale_shape).astype(np.float32)
+    var_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+    with self.test_session() as sess, self.test_scope():
+      grad = array_ops.placeholder(np.float32, shape=x_shape, name="grad")
+      x = array_ops.placeholder(np.float32, shape=x_shape, name="x")
+      mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean")
+      var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
+      scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
+      with self.test_scope():
+        out = gen_nn_ops.fused_batch_norm_grad(
+            grad, x, scale, mean, var, data_format="NHWC", is_training=False)
+        grad_x, grad_scale, grad_offset, _, _ = out
+
+      ref_x, ref_scale, ref_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
+          grad, x, scale, mean, var, data_format="NHWC", is_training=False)
+
+      grad_x_val, grad_scale_val, grad_offset_val, = sess.run(
+          [grad_x, grad_scale, grad_offset], {
+              grad: grad_val,
+              x: x_val,
+              mean: mean_val,
+              var: var_val,
+              scale: scale_val
+          })
+      grad_x_ref, grad_scale_ref, grad_offset_ref, = sess.run(
+          [ref_x, ref_scale, ref_offset], {
+              grad: grad_val,
+              x: x_val,
+              mean: mean_val,
+              var: var_val,
+              scale: scale_val
+          })
+
+      self.assertAllClose(grad_x_val, grad_x_ref, atol=1e-2)
+      self.assertAllClose(grad_scale_val, grad_scale_ref, atol=1e-2)
+      self.assertAllClose(grad_offset_val, grad_offset_ref, atol=1e-3)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/gather_test.py b/tensorflow/compiler/tests/gather_test.py
index 664c77f2000281e3be989665664c1be58d4dd1e5..13cbe6f312f5175edaec28fa7a8f28064194b0e9 100644
--- a/tensorflow/compiler/tests/gather_test.py
+++ b/tensorflow/compiler/tests/gather_test.py
@@ -45,7 +45,7 @@ class GatherTest(xla_test.XLATestCase):
     with self.test_session() as session, self.test_scope():
       data = np.array([0, 1, 2, 3, 7, 5])
       for dtype in self.all_tf_types:
-        for indices in 4, [1, 2, 2, 4, 5]:
+        for indices in 4, [4], [1, 2, 2, 4, 5]:
           params_np = self._buildParams(data, dtype)
           params = array_ops.placeholder(dtype=dtype)
           indices_tf = constant_op.constant(indices)
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a04f376ebf6092fd9b6e879796454b1a5c648c96
--- /dev/null
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -0,0 +1,142 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for image ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_image_ops
+from tensorflow.python.platform import test
+
+
+class ResizeBilinearTest(XLATestCase):
+
+  def _assertForwardOpMatchesExpected(self,
+                                      image_np,
+                                      target_shape,
+                                      expected=None):
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.test_session() as sess, self.test_scope():
+      image = array_ops.placeholder(image_np.dtype)
+      resized = gen_image_ops.resize_bilinear(
+          image, target_shape, align_corners=True)
+      out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
+      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def _assertBackwardOpMatchesExpected(self,
+                                       grads_np,
+                                       input_shape=None,
+                                       dtype=None,
+                                       expected=None):
+    if input_shape is None:
+      self.fail("input_shape must be specified")
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.test_session() as sess, self.test_scope():
+      dtype = dtype or np.float32
+      grads = array_ops.placeholder(np.float32)
+      resized = gen_image_ops._resize_bilinear_grad(
+          grads,
+          np.zeros([1, input_shape[0], input_shape[1], 1], dtype=dtype),
+          align_corners=True)
+      out = sess.run(resized, {grads: grads_np[np.newaxis, :, :, np.newaxis]})
+      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def testAlignCorners1x2To3x2(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2]], dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]], dtype=np.float32))
+
+  def testAlignCorners1x2To3x2Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
+          input_shape=[1, 2],
+          dtype=dtype,
+          expected=np.array([[9, 12]], dtype=np.float32))
+
+  def testAlignCorners2x2To1x1(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4]], dtype=dtype), [1, 1],
+          expected=np.array([[1]], dtype=np.float32))
+
+  def testAlignCorners2x2To1x1Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[7]], dtype=np.float32),
+          input_shape=[2, 2],
+          dtype=dtype,
+          expected=np.array([[7, 0], [0, 0]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2], [3, 4]], dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 1.5, 2], [2, 2.5, 3], [3, 3.5, 4]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3Grad(self):
+    self._assertBackwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
+        input_shape=[2, 2],
+        expected=np.array([[5.25, 8.25], [14.25, 17.25]], dtype=np.float32))
+
+  def testAlignCorners3x3To2x2(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=dtype), [2, 2],
+          expected=np.array([[1, 3], [7, 9]], dtype=np.float32))
+
+  def testAlignCorners3x3To2x2Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[7, 13], [22, 4]], dtype=np.float32),
+          input_shape=[3, 3],
+          dtype=dtype,
+          expected=np.array(
+              [[7, 0, 13], [0, 0, 0], [22, 0, 4]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3(self):
+    for dtype in self.float_types:
+      self._assertForwardOpMatchesExpected(
+          np.array(
+              [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
+              dtype=dtype), [3, 3],
+          expected=np.array(
+              [[1, 2.5, 4], [7, 8.5, 10], [13, 14.5, 16]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3Grad(self):
+    for dtype in self.float_types:
+      self._assertBackwardOpMatchesExpected(
+          np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32),
+          input_shape=[4, 4],
+          dtype=dtype,
+          expected=np.array(
+              [[1, 1, 1, 3], [2, 1.25, 1.25, 3], [2, 1.25, 1.25, 3],
+               [7, 4, 4, 9]],
+              dtype=np.float32))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 6a8c3bcd55a6e454a19b6249cf4eb48739c8657f..798daaadbc5be50ef9cf7e1205f6d5a0bde59640 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -2460,6 +2460,36 @@ TEST_F(OpTest, Reshape) {
   });
 }
 
+TEST_F(OpTest, ResizeBilinear) {
+  Repeatedly([this]() {
+    std::vector<int64> in_dims = RandomDims(4, 4);
+    std::vector<int64> out_dims = RandomDims(2, 2);
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ResizeBilinear")
+            .RandomInput(DT_FLOAT, in_dims)
+            .Input(test::AsTensor<int32>(
+                std::vector<int32>(out_dims.begin(), out_dims.end())))
+            .Attr("T", DT_FLOAT)
+            .Attr("align_corners", true));
+  });
+}
+
+TEST_F(OpTest, ResizeBilinearGrad) {
+  Repeatedly([this]() {
+    std::vector<int64> in_dims = RandomDims(4, 4);
+    std::vector<int64> out_dims = RandomDims(2, 2);
+
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("ResizeBilinearGrad")
+            .RandomInput(DT_FLOAT, in_dims)
+            .RandomInput(DT_FLOAT,
+                         {in_dims[0], out_dims[0], out_dims[1], in_dims[3]})
+            .Attr("T", DT_FLOAT)
+            .Attr("align_corners", true));
+  });
+}
+
 TEST_F(OpTest, Reverse) {
   Repeatedly([this]() {
     std::vector<int64> dims = RandomDims(1);
diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py
index efda2cc207b2ab56774d193117a2237f3afbfb55..965fdf684b973498d0b3c3cde17711cca7279705 100644
--- a/tensorflow/compiler/tests/reduce_ops_test.py
+++ b/tensorflow/compiler/tests/reduce_ops_test.py
@@ -67,25 +67,37 @@ class ReduceOpsTest(XLATestCase):
       np.arange(-10, -4).reshape(2, 3),
       np.arange(-4, 2).reshape(2, 3),
   ]
-  NONEMPTY_FLOAT_DATA = [
-      np.arange(1, 7).reshape(2, 3),
-      np.arange(-10, -4).reshape(2, 3),
-      np.arange(-4, 2).reshape(2, 3),
+  COMPLEX_DATA = [
+      np.zeros(shape=(2, 0)).astype(np.complex64),
+      np.zeros(shape=(0, 30)).astype(np.complex64),
+      np.arange(1, 13, dtype=np.float32).view(np.complex64).reshape(2, 3),
+      np.arange(-14, -2, dtype=np.float32).view(np.complex64).reshape(2, 3),
+      np.arange(-4, 8, dtype=np.float32).view(np.complex64).reshape(2, 3),
   ]
+  NONEMPTY_FLOAT_DATA = [x for x in FLOAT_DATA if np.size(x) > 0]
+  NONEMPTY_COMPLEX_DATA = [x for x in COMPLEX_DATA if np.size(x) > 0]
   BOOL_DATA = [
       np.array([], dtype=np.bool).reshape(2, 0),
       np.array([], dtype=np.bool).reshape(0, 3),
       np.array([[False, True, False], [True, True, False]]),
   ]
 
-  def testReduceSum(self):
+  def testReduceSumF32(self):
     self._testReduction(math_ops.reduce_sum, np.sum, np.float32,
                         self.FLOAT_DATA)
 
-  def testReduceProd(self):
+  def testReduceSumC64(self):
+    self._testReduction(math_ops.reduce_sum, np.sum, np.complex64,
+                        self.COMPLEX_DATA)
+
+  def testReduceProdF32(self):
     self._testReduction(math_ops.reduce_prod, np.prod, np.float32,
                         self.FLOAT_DATA)
 
+  def testReduceProdC64(self):
+    self._testReduction(math_ops.reduce_prod, np.prod, np.complex64,
+                        self.COMPLEX_DATA)
+
   def testReduceMin(self):
 
     def reference_min(inp, axis):
@@ -108,12 +120,16 @@ class ReduceOpsTest(XLATestCase):
     self._testReduction(math_ops.reduce_max, reference_max, np.float32,
                         self.FLOAT_DATA)
 
-  def testReduceMean(self):
+  def testReduceMeanF32(self):
     # TODO(phawkins): mean on XLA currently returns 0 instead of NaN when
     # reducing across zero inputs.
     self._testReduction(math_ops.reduce_mean, np.mean, np.float32,
                         self.NONEMPTY_FLOAT_DATA)
 
+  def testReduceMeanC64(self):
+    self._testReduction(math_ops.reduce_mean, np.mean, np.complex64,
+                        self.NONEMPTY_COMPLEX_DATA)
+
   def testReduceAll(self):
     self._testReduction(math_ops.reduce_all, np.all, np.bool, self.BOOL_DATA)
 
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3260e63b23226d736a7ddc0f21a94a8c791e0442
--- /dev/null
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -0,0 +1,229 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for scan ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def numpy_reverse(x, axis):
+  length = len(x.shape)
+  if axis < 0:
+    axis = length + axis
+
+  ix = [
+      slice(None, None, -1) if i == axis else slice(None) for i in range(length)
+  ]
+  return x[ix]
+
+
+def handle_options(func, x, axis, exclusive, reverse):
+  """Adds tf options to numpy scan ops."""
+  length = len(x.shape)
+  if axis < 0:
+    axis = length + axis
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+
+  if exclusive:
+    ix_head = [slice(0, 1) if i == axis else slice(None) for i in range(length)]
+    ix_init = [
+        slice(0, -1) if i == axis else slice(None) for i in range(length)
+    ]
+    if func == np.cumsum:
+      init = np.zeros_like(x[ix_head])
+    elif func == np.cumprod:
+      init = np.ones_like(x[ix_head])
+    else:
+      raise ValueError("Unknown scan function.")
+    x = np.concatenate([init, func(x[ix_init], axis)], axis=axis)
+  else:
+    x = func(x, axis=axis)
+
+  if reverse:
+    x = numpy_reverse(x, axis)
+  return x
+
+
+class CumsumTest(XLATestCase):
+
+  valid_dtypes = [np.float32]
+
+  def axis_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _compare(self, x, axis, exclusive, reverse):
+    np_out = handle_options(np.cumsum, x, axis, exclusive, reverse)
+    with self.test_session(), self.test_scope():
+      p = array_ops.placeholder(x.dtype)
+      tf_out = math_ops.cumsum(p, axis, exclusive, reverse).eval(
+          feed_dict={p: x})
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        self._compare(x, axis, exclusive, reverse)
+
+  def testEmpty(self):
+    for dtype in self.valid_dtypes:
+      x = np.zeros([0]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def testAxisType(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis_dtype in self.axis_dtypes():
+        with self.test_session(), self.test_scope():
+          p = array_ops.placeholder(x.dtype)
+          axis = constant_op.constant(0, axis_dtype)
+          math_ops.cumsum(p, axis).eval(feed_dict={p: x})
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
+      for axis in (-2, -1, 0, 1):
+        self._compareAll(x, axis)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
+      for axis in (-3, -2, -1, 0, 1, 2):
+        self._compareAll(x, axis)
+
+  def test6D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
+      for axis in range(-6, 6, 3):
+        self._compareAll(x, axis)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    with self.test_session(), self.test_scope():
+      input_tensor = ops.convert_to_tensor(x)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumsum(input_tensor, -3).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumsum(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        math_ops.cumsum(input_tensor, [0]).eval()
+
+
+class CumprodTest(XLATestCase):
+
+  valid_dtypes = [np.float32]
+
+  def axis_dtypes(self):
+    return set(self.int_types).intersection([np.int32, np.int64])
+
+  def _compare(self, x, axis, exclusive, reverse):
+    np_out = handle_options(np.cumprod, x, axis, exclusive, reverse)
+    with self.test_session(), self.test_scope():
+      p = array_ops.placeholder(x.dtype)
+      prod = math_ops.cumprod(p, axis, exclusive, reverse)
+      tf_out = prod.eval(feed_dict={p: x})
+
+    self.assertAllClose(np_out, tf_out)
+
+  def _compareAll(self, x, axis):
+    for exclusive in [True, False]:
+      for reverse in [True, False]:
+        self._compare(x, axis, exclusive, reverse)
+
+  def testEmpty(self):
+    for dtype in self.valid_dtypes:
+      x = np.zeros([0]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def testAxisType(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis_dtype in self.axis_dtypes():
+        with self.test_session(), self.test_scope():
+          p = array_ops.placeholder(x.dtype)
+          axis = constant_op.constant(0, axis_dtype)
+          math_ops.cumprod(x, axis).eval(feed_dict={p: x})
+
+  def test1D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 6).reshape([5]).astype(dtype)
+      for axis in (-1, 0):
+        self._compareAll(x, axis)
+
+  def test2D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
+      for axis in (-2, -1, 0, 1):
+        self._compareAll(x, axis)
+
+  def test3D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
+      for axis in (-3, -2, -1, 0, 1, 2):
+        self._compareAll(x, axis)
+
+  def test6D(self):
+    for dtype in self.valid_dtypes:
+      x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
+      for axis in range(-6, 6, 3):
+        self._compareAll(x, axis)
+
+  def testInvalidAxis(self):
+    x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
+    with self.test_session(), self.test_scope():
+      input_tensor = ops.convert_to_tensor(x)
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumprod(input_tensor, -3).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "Expected scan axis in the range [-2, 2)" in str(e)):
+        math_ops.cumprod(input_tensor, 2).eval()
+      with self.assertRaisesWithPredicateMatch(
+          errors_impl.InvalidArgumentError,
+          lambda e: "axis must be a scalar" in str(e)):
+        math_ops.cumprod(input_tensor, [0]).eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4336ebdbd184a081619f0a6951dd4514735c6eb6
--- /dev/null
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -0,0 +1,118 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateless random-number generation ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.contrib import stateless
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class StatelessRandomOpsTest(XLATestCase):
+  """Test cases for stateless random-number generator operators."""
+
+  def _random_types(self):
+    return [dtypes.float32]
+
+  def testDeterminism(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    with self.test_session(), self.test_scope():
+      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+      for stateless_op in [
+          stateless.stateless_random_uniform, stateless.stateless_random_normal
+      ]:
+        for shape in (), (3,), (2, 5):
+          for dtype in self._random_types():
+            pure = stateless_op(shape, seed=seed_t, dtype=dtype)
+            values = [(seed, pure.eval(feed_dict={
+                seed_t: seed
+            })) for seed in seeds]
+            for s0, v0 in values:
+              for s1, v1 in values:
+                self.assertEqual(s0 == s1, np.all(v0 == v1))
+
+  def testRandomUniformIsInRange(self):
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        x = stateless.stateless_random_uniform(
+            shape=[1000], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+        self.assertTrue(np.all(y >= 0))
+        self.assertTrue(np.all(y < 1))
+
+  def _chi_squared(self, x, bins):
+    """Pearson's Chi-squared test."""
+    x = np.ravel(x)
+    n = len(x)
+    histogram, _ = np.histogram(x, bins=bins, range=(0, 1))
+    expected = n / float(bins)
+    return np.sum(np.square(histogram - expected) / expected)
+
+  def testDistributionOfStatelessRandomUniform(self):
+    """Use Pearson's Chi-squared test to test for uniformity."""
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        n = 1000
+        x = stateless.stateless_random_uniform(
+            shape=[n], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [565656, 121212]})
+        # Tests that the values are distributed amongst 10 bins with equal
+        # probability. 16.92 is the Chi^2 value for 9 degrees of freedom with
+        # p=0.05. This test is probabilistic and would be flaky if the random
+        # seed were not fixed.
+        self.assertTrue(self._chi_squared(y, 10) < 16.92)
+
+  def _normal_cdf(self, x):
+    """Cumulative distribution function for a standard normal distribution."""
+    return 0.5 + 0.5 * np.vectorize(math.erf)(x / math.sqrt(2))
+
+  def _anderson_darling(self, x):
+    """Anderson-Darling test for a standard normal distribution."""
+    x = np.sort(np.ravel(x))
+    n = len(x)
+    i = np.linspace(1, n, n)
+    z = np.sum((2 * i - 1) * np.log(self._normal_cdf(x)) +
+               (2 * (n - i) + 1) * np.log(1 - self._normal_cdf(x)))
+    return -n - z / n
+
+  def testDistributionOfStatelessRandomNormal(self):
+    """Use Anderson-Darling test to test distribution appears normal."""
+    with self.test_session() as sess, self.test_scope():
+      for dtype in self._random_types():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        n = 1000
+        x = stateless.stateless_random_normal(
+            shape=[n], seed=seed_t, dtype=dtype)
+        y = sess.run(x, {seed_t: [25252, 314159]})
+        # The constant 2.492 is the 5% critical value for the Anderson-Darling
+        # test where the mean and variance are known. This test is probabilistic
+        # so to avoid flakiness the seed is fixed.
+        self.assertTrue(self._anderson_darling(y) < 2.492)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 76644380bdf2e0c24f6d363ddfaabdff836495d7..0da7442a24201011e3126e53c9d884534a0d721e 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -33,6 +33,17 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
 
+def nhwc_to_format(x, data_format):
+  """Converts a numpy array from NHWC format to `data_format`."""
+  rank = len(x.shape)
+  if data_format == "NCHW":
+    return np.transpose(x, [0, rank - 1] + list(range(1, rank - 1)))
+  elif data_format == "NHWC":
+    return x
+  else:
+    raise ValueError("Unknown format {}".format(data_format))
+
+
 class UnaryOpsTest(XLATestCase):
   """Test cases for unary operators."""
 
@@ -76,6 +87,12 @@ class UnaryOpsTest(XLATestCase):
           array_ops.diag_part,
           np.arange(36).reshape([2, 3, 2, 3]).astype(dtype),
           np.array([[0, 7, 14], [21, 28, 35]], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.diag, np.array([[1, 2], [3, 4]], dtype=dtype),
+          np.array(
+              [[[[1, 0], [0, 0]], [[0, 2], [0, 0]]], [[[0, 0], [3, 0]],
+                                                      [[0, 0], [0, 4]]]],
+              dtype=dtype))
 
       self._assertOpOutputMatchesExpected(
           array_ops.identity,
@@ -86,6 +103,21 @@ class UnaryOpsTest(XLATestCase):
           array_ops.matrix_diag,
           np.array([[1, 2], [3, 4]], dtype=dtype),
           np.array([[[1, 0], [0, 2]], [[3, 0], [0, 4]]], dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.matrix_diag, np.array([1, 2, 3, 4], dtype=dtype),
+          np.array(
+              [[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]],
+              dtype=dtype))
+      self._assertOpOutputMatchesExpected(
+          array_ops.matrix_diag,
+          np.array(
+              [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=dtype),
+          np.array(
+              [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]],
+                [[4, 0, 0], [0, 5, 0], [0, 0, 6]]],
+               [[[7, 0, 0], [0, 8, 0], [0, 0, 9]],
+                [[10, 0, 0], [0, 11, 0], [0, 0, 12]]]],
+              dtype=dtype))
       self._assertOpOutputMatchesExpected(
           array_ops.matrix_diag_part,
           np.arange(3 * 2 * 4).reshape([3, 2, 4]).astype(dtype),
@@ -330,12 +362,22 @@ class UnaryOpsTest(XLATestCase):
 
   def testComplexOps(self):
     for dtype in self.complex_types:
-      # TODO(b/65408531): math_ops.acosh (needs pow)
-      # TODO(b/65408531): math_ops.asinh (needs pow)
 
       # TODO(b/65408531): Wider support for log (needs atan2).
       atan2_supported = self.device == "XLA_GPU"
       if atan2_supported:
+        self._assertOpOutputMatchesExpected(
+            math_ops.acosh,
+            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+            expected=np.arccosh(
+                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+
+        self._assertOpOutputMatchesExpected(
+            math_ops.asinh,
+            np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
+            expected=np.arcsinh(
+                np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype)))
+
         self._assertOpOutputMatchesExpected(
             math_ops.atanh,
             np.array([0.1, 0.2j, 0.3 - 0.1j, 0.4 + 0.5j], dtype=dtype),
@@ -392,19 +434,26 @@ class UnaryOpsTest(XLATestCase):
             expected=np.log1p(
                 np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)))
 
-      # TODO(b/34703906): math_ops.rsqrt (needs pow)
+        val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)
+        self._assertOpOutputMatchesExpected(
+            math_ops.rsqrt, val, expected=1 / np.sqrt(val))
+
+        self._assertOpOutputMatchesExpected(
+            math_ops.sigmoid, val, expected=1 / (1 + np.exp(-val)))
 
-      # TODO(b/34703906): math_ops.sigmoid (needs tanh)
+        self._assertOpOutputMatchesExpected(
+            math_ops.sqrt, val, expected=np.sqrt(val))
 
-      # TODO(b/34703906): math_ops.sqrt (needs pow)
+        self._assertOpOutputMatchesExpected(
+            math_ops.tanh,
+            np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
+            expected=np.tanh(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
 
       self._assertOpOutputMatchesExpected(
           math_ops.tan,
           np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
           expected=np.tan(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
 
-      # TODO(b/34703906): math_ops.tanh (as itself)
-
       ctypes = {np.complex64: np.float32}
       self._assertOpOutputMatchesExpected(
           math_ops.abs,
@@ -624,55 +673,88 @@ class UnaryOpsTest(XLATestCase):
         equality_test=self.ListsAreClose)
 
   def testDepthToSpace(self):
+    def make_op(data_format):
+      def op(x):
+        return array_ops.depth_to_space(x, block_size=2,
+                                        data_format=data_format)
+      return op
+
     for dtype in self.numeric_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
-          expected=np.array([[[[1], [2]],
-                              [[3], [4]]]], dtype=dtype))
+      for data_format in ["NCHW", "NHWC"]:
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(np.array([[[[1], [2]],
+                                               [[3], [4]]]], dtype=dtype),
+                                    data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3], [4, 5, 6]],
-                              [[7, 8, 9], [10, 11, 12]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(
+                np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
+                         dtype=dtype),
+                data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]],
+                         dtype=dtype),
+                data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.depth_to_space(x, block_size=2),
-          np.array([[[[1, 2, 3, 4],
-                      [5, 6, 7, 8]],
-                     [[9, 10, 11, 12],
-                      [13, 14, 15, 16]]]], dtype=dtype),
-          expected=np.array([[[[1], [2], [5], [6]],
-                              [[3], [4], [7], [8]],
-                              [[9], [10], [13], [14]],
-                              [[11], [12], [15], [16]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(
+                np.array([[[[1, 2, 3, 4],
+                            [5, 6, 7, 8]],
+                           [[9, 10, 11, 12],
+                            [13, 14, 15, 16]]]], dtype=dtype),
+                data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1], [2], [5], [6]],
+                           [[3], [4], [7], [8]],
+                           [[9], [10], [13], [14]],
+                           [[11], [12], [15], [16]]]], dtype=dtype),
+                data_format))
 
   def testSpaceToDepth(self):
+    def make_op(data_format):
+      def op(x):
+        return array_ops.space_to_depth(x, block_size=2,
+                                        data_format=data_format)
+      return op
+
     for dtype in self.numeric_types:
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1], [2]],
-                     [[3], [4]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4]]]], dtype=dtype))
+      for data_format in ["NCHW", "NHWC"]:
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1], [2]],
+                                      [[3], [4]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(np.array([[[[1, 2, 3, 4]]]], dtype=dtype),
+                                    data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1, 2, 3], [4, 5, 6]],
-                     [[7, 8, 9], [10, 11, 12]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
-                            dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1, 2, 3], [4, 5, 6]],
+                                      [[7, 8, 9], [10, 11, 12]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]],
+                         dtype=dtype),
+                data_format))
 
-      self._assertOpOutputMatchesExpected(
-          lambda x: array_ops.space_to_depth(x, block_size=2),
-          np.array([[[[1], [2], [5], [6]],
-                     [[3], [4], [7], [8]],
-                     [[9], [10], [13], [14]],
-                     [[11], [12], [15], [16]]]], dtype=dtype),
-          expected=np.array([[[[1, 2, 3, 4],
-                               [5, 6, 7, 8]],
-                              [[9, 10, 11, 12],
-                               [13, 14, 15, 16]]]], dtype=dtype))
+        self._assertOpOutputMatchesExpected(
+            make_op(data_format),
+            nhwc_to_format(np.array([[[[1], [2], [5], [6]],
+                                      [[3], [4], [7], [8]],
+                                      [[9], [10], [13], [14]],
+                                      [[11], [12], [15], [16]]]], dtype=dtype),
+                           data_format),
+            expected=nhwc_to_format(
+                np.array([[[[1, 2, 3, 4],
+                            [5, 6, 7, 8]],
+                           [[9, 10, 11, 12],
+                            [13, 14, 15, 16]]]], dtype=dtype),
+                data_format))
 
   def _assertSoftplusMatchesExpected(self, features, dtype):
     features = np.array(features, dtype=dtype)
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index c50342dee45eba6ae54f01653ecc81ef096b547b..b08d6ab21e0746558cb3d4818d4c822c45d2e9ee 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -107,11 +107,26 @@ class VariableOpsTest(XLATestCase):
                  [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]],
             ).astype(dtype), sess.run(x))
 
+  def testShape(self):
+    for dtype in self.numeric_types:
+      init = np.ones([2, 3]).astype(dtype)
+      with self.test_session() as session, self.test_scope():
+        v = resource_variable_ops.ResourceVariable(init)
+        session.run(variables.variables_initializer([v]))
+        h = v.handle
+        s32, s64 = session.run([
+            resource_variable_ops.variable_shape(h),
+            resource_variable_ops.variable_shape(h, out_type=dtypes.int64)
+        ])
+        self.assertEqual(s32.dtype, np.int32)
+        self.assertEqual(s64.dtype, np.int64)
+        self.assertAllEqual(s32, [2, 3])
+        self.assertAllEqual(s64, [2, 3])
+
   def testReadWrite(self):
     """Tests initialization, reading, and writing a resource variable."""
     for dtype in self.numeric_types:
       with self.test_session() as session:
-        print(ops.get_default_graph())
         with self.test_scope():
           with variable_scope.variable_scope("ascope", use_resource=True):
             x = variable_scope.get_variable(
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 3c94bcafc1d19b1bc54887e6f2c25b1886be646e..5d1cb6d73570a1a3efbe0d2d37d9746bc0e2528f 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
 
 package_group(
     name = "internal",
@@ -25,6 +25,30 @@ package(
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 
+cc_library(
+    name = "tf2xla_supported_ops_lib",
+    srcs = ["tf2xla_supported_ops.cc"],
+    hdrs = ["tf2xla_supported_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xla_compiler",
+        "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_binary(
+    name = "tf2xla_supported_ops",
+    srcs = ["tf2xla_supported_ops_main.cc"],
+    visibility = ["//visibility:public"],
+    deps = [":tf2xla_supported_ops_lib"],
+)
+
 xla_proto_library(
     name = "tf2xla_proto",
     srcs = ["tf2xla.proto"],
@@ -67,7 +91,6 @@ cc_library(
         # Keep dependencies to a minimum here; this library is used in every AOT
         # binary produced by tfcompile.
         "//tensorflow/compiler/aot:runtime",
-        "//tensorflow/compiler/tf2xla:xla_local_runtime_context",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
     ],
@@ -123,6 +146,9 @@ cc_library(
         ":const_analysis",
         ":dump_graph",
         ":functionalize_control_flow",
+        ":sharding_util",
+        ":tf2xla_util",
+        "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -169,6 +195,35 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "sharding_util",
+    srcs = ["sharding_util.cc"],
+    hdrs = ["sharding_util.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "sharding_util_test",
+    srcs = ["sharding_util_test.cc"],
+    deps = [
+        ":sharding_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 # Internal targets below this point.
 
 cc_library(
@@ -176,11 +231,14 @@ cc_library(
     srcs = ["tf2xla_util.cc"],
     hdrs = ["tf2xla_util.h"],
     deps = [
+        ":sharding_util",
         ":tf2xla_proto",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
@@ -190,8 +248,14 @@ tf_cc_test(
     name = "tf2xla_util_test",
     srcs = ["tf2xla_util_test.cc"],
     deps = [
+        ":sharding_util",
         ":tf2xla_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -317,13 +381,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "xla_local_runtime_context",
-    hdrs = ["xla_local_runtime_context.h"],
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/core:framework_lite"],
-)
-
 cc_library(
     name = "dump_graph",
     srcs = [
@@ -350,6 +407,7 @@ cc_library(
     srcs = ["functionalize_control_flow.cc"],
     hdrs = ["functionalize_control_flow.h"],
     deps = [
+        ":tf2xla_util",
         "//tensorflow/compiler/jit:graph_to_functiondef",
         "//tensorflow/compiler/jit:union_find",
         "//tensorflow/compiler/tf2xla:dump_graph",
@@ -359,6 +417,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc
index 102a2cf07b51486bb445b0311966717b7e82ace6..ab2f1e9a7ab577bbe704e568b21d9912439605ca 100644
--- a/tensorflow/compiler/tf2xla/const_analysis.cc
+++ b/tensorflow/compiler/tf2xla/const_analysis.cc
@@ -52,6 +52,8 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Conv2DBackpropInput", "input_sizes"},
       {"Conv3DBackpropFilterV2", "filter_sizes"},
       {"Conv3DBackpropInputV2", "input_sizes"},
+      {"Cumprod", "axis"},
+      {"Cumsum", "axis"},
       {"DepthwiseConv2dNativeBackpropFilter", "filter_sizes"},
       {"DepthwiseConv2dNativeBackpropInput", "input_sizes"},
       {"DynamicStitch", "indices"},
@@ -69,6 +71,7 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Pad", "paddings"},
       {"PadV2", "paddings"},
       {"MirrorPad", "paddings"},
+      {"Multinomial", "num_samples"},
       {"Prod", "reduction_indices"},
       {"RandomStandardNormal", "shape"},
       {"RandomUniform", "shape"},
@@ -77,6 +80,7 @@ Status BackwardsConstAnalysis(const Graph& g,
       {"Range", "limit"},
       {"Range", "delta"},
       {"Reshape", "shape"},
+      {"ResizeBilinear", "size"},
       {"ResourceStridedSliceAssign", "begin"},
       {"ResourceStridedSliceAssign", "end"},
       {"ResourceStridedSliceAssign", "strides"},
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index ddd912b87315f7943915153b5bf73531107af54d..03603ee9baefd1d20d220faf63c9c1c427ebdf31 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -63,7 +63,12 @@ string MakeUniquePath(string name) {
 
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
   string path = MakeUniquePath(name);
-  TF_CHECK_OK(WriteTextProto(Env::Default(), path, graph_def));
+  Status status = WriteTextProto(Env::Default(), path, graph_def);
+  if (!status.ok()) {
+    VLOG(1) << "Failed to dump GraphDef to file: " << path << " : " << status;
+    path.clear();
+    path = "(unavailable)";
+  }
   return path;
 }
 
@@ -79,7 +84,13 @@ string DumpGraphToFile(const string& name, Graph const& graph,
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
   string path = MakeUniquePath(name);
-  TF_CHECK_OK(WriteTextProto(Env::Default(), path, fdef));
+  Status status = WriteTextProto(Env::Default(), path, fdef);
+  if (!status.ok()) {
+    VLOG(1) << "Failed to dump FunctionDef to file: " << path << " : "
+            << status;
+    path.clear();
+    path = "(unavailable)";
+  }
   return path;
 }
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 35b6960a98cda1bf098f3e01cac3df8173bdc729..267268298c97560a3409b0bdc134526b60e39e5b 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/graph_to_functiondef.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -129,7 +130,9 @@ Status CopySubgraph(const Graph& graph, const Frame* frame,
         stack.push_back(src);
       }
       Node* src_copy = (*node_map)[e->src()->id()];
-      int src_output = squash_src_outputs[e->src()->id()] ? 0 : e->src_output();
+      int src_output = squash_src_outputs[e->src()->id()] && !e->IsControlEdge()
+                           ? 0
+                           : e->src_output();
       Node* dst_copy = (*node_map)[e->dst()->id()];
       output->AddEdge(src_copy, src_output, dst_copy, e->dst_input());
     }
@@ -405,7 +408,15 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
                                        arg.merge->name());
       }
 
-      // Find the Exit successor of the Switch.
+      // Update the device on the Identity outputs of the switch to match their
+      // target. These Identity outputs do not
+
+      // Loop over the switch node's output to:
+      // - Find the Exit successor.
+      // - Set the sharding on all Identity outputs of the switch. These
+      //   identity nodes are values used by the loop body or condition.
+      //   The Identity node may have the wrong device so copy the device from
+      //   one of its outputs instead.
       for (const Edge* edge : arg.switch_node->out_edges()) {
         if (edge->src_output() == 0 && IsExit(edge->dst())) {
           if (arg.exit != nullptr) {
@@ -413,6 +424,9 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame,
                                            arg.switch_node->name());
           }
           arg.exit = edge->dst();
+        } else if (StringPiece(edge->dst()->type_string()) == "Identity") {
+          TF_RETURN_IF_ERROR(
+              SetNodeShardingFromNeighbors(edge->dst(), /*out_edges=*/true));
         }
       }
     }
@@ -609,11 +623,12 @@ class FunctionalizeCond {
   FunctionalizeCond(Graph* graph, FunctionLibraryDefinition* library)
       : clusters_(graph->num_node_ids()), library_(library), graph_(graph) {}
 
-  // Returns a vector of Merge nodes from the clustered graph where the nodes
+  // Returns a vector of Switch nodes from the clustered graph where the nodes
   // are sorted by the number of switch nodes minus number of merge nodes
   // from a root of the clustered graph to the given Merge node, with ties
-  // broken by the representative of the Cluster.
-  std::vector<std::pair<int, Cluster*>> SortedMergeNodes();
+  // broken by the representative of the Cluster. This corresponds to sorting by
+  // nesting depth, from deepest nested to outermost.
+  std::vector<std::pair<int, Cluster*>> SortedSwitchNodes();
 
   // Returns whether the graph has no conditionals.
   bool NoConditionals() const { return merge_nodes_.empty(); }
@@ -640,15 +655,17 @@ class FunctionalizeCond {
   // extracting the bodies needed for the then and else branch, creates a XlaIf
   // node, removing the nodes of the branches from the graph and replacing the
   // merge node with a XlaIf.
-  Status ConvertMergeToXlaIf(Cluster* merge_cluster);
+  Status ConvertCorrespondingMergeToXlaIf(Cluster* switch_cluster);
 
   // Removes a Switch cluster feeding directly into a Merge cluster by removing
   // the Switch and Merge nodes and collapsing into a single cluster.
-  Status RemoveTrivialMerge(Cluster* merge_cluster);
+  Status RemoveTrivialSwitch(Cluster* switch_cluster);
 
-  // Returns the switch cluster corresponding to the merge node. This function
-  // only returns the switch cluster in the simple case where we have a switch
-  // node is the entry of a diamond corresponding to a conditional:
+  // Returns the merge cluster corresponding to the switch node. This function
+  // only returns the merge cluster in the case where we have a switch node that
+  // is the single entry point for all paths to a common merge cluster, this
+  // merge cluster may be created by combining multiple merge clusters, that
+  // share the switch cluster as common ancestor, together.
   //
   //           Switch
   //          /      \
@@ -657,8 +674,9 @@ class FunctionalizeCond {
   //        merge_cluster
   //
   // Note: either of the branches may be empty. The case where both branches are
-  // empty is handled by RemoveTrivialMerge.
-  gtl::optional<Cluster*> GetSwitchCluster(const Cluster& merge_cluster);
+  // empty is handled by RemoveTrivialSwitch.
+  gtl::optional<Cluster*> CreateCorrespondingMergeCluster(
+      Cluster* switch_cluster);
 
   // Determines the arguments needed as input to the Merge cluster originating
   // from the Switch cluster.
@@ -717,11 +735,12 @@ string DebugString(const Graph& graph,
                    FunctionalizeCond::ClusterHandle::Vector* clusters) {
   string ret = "digraph {\ncompound=true;labeljust=\"r\";ranksep=0.24\n";
   std::map<FunctionalizeCond::ClusterHandle, string> subgraphs;
+  auto name = [](const Node* n) {
+    return strings::StrCat(n->type_string(), "_", n->id());
+  };
   for (Node* n : graph.nodes()) {
-    if (n->IsOp()) {
-      strings::StrAppend(&subgraphs[clusters->at(n).Get()], n->id(),
-                         " [label=\"", n->name(), "\"];\n");
-    }
+    strings::StrAppend(&subgraphs[clusters->at(n).Get()], n->id(), " [label=\"",
+                       name(n), "\"];\n");
   }
   for (auto kv : subgraphs) {
     strings::StrAppend(&ret, "subgraph cluster_", kv.first.ToString(), " {\n",
@@ -729,16 +748,11 @@ string DebugString(const Graph& graph,
                        kv.first.ToString(), "\";\n", kv.second, "}\n");
   }
   for (Node* n : graph.nodes()) {
-    if (!n->IsOp()) {
-      continue;
-    }
     for (Node* in : n->in_nodes()) {
-      if (in->IsOp()) {
-        strings::StrAppend(&ret, in->id(), " -> ", n->id(), ";\n");
-      }
+      strings::StrAppend(&ret, in->id(), " -> ", n->id(), ";\n");
     }
   }
-  return strings::StrCat(ret, "}");
+  return strings::StrCat(ret, "} // end");
 }
 
 string DebugString(const FunctionalizeCond::ClusteredGraph& clustered_graph) {
@@ -747,16 +761,24 @@ string DebugString(const FunctionalizeCond::ClusteredGraph& clustered_graph) {
     return cluster.representative.ToString();
   };
   for (auto kv : clustered_graph) {
-    strings::StrAppend(&ret, kv.first.ToString(), " [label=\"", name(kv.second),
-                       " (", kv.second.switch_nodes.size(), ", ",
-                       kv.second.merge_nodes.size(), ")\"];\n");
+    if (!kv.second.switch_nodes.empty() || !kv.second.merge_nodes.empty()) {
+      strings::StrAppend(
+          &ret, kv.first.ToString(), " [label=\"", name(kv.second),
+          kv.second.switch_nodes.empty()
+              ? ""
+              : strings::StrCat(" switches=", kv.second.switch_nodes.size()),
+          kv.second.merge_nodes.empty()
+              ? ""
+              : strings::StrCat(" merges=", kv.second.merge_nodes.size()),
+          "\"];\n");
+    }
   }
   for (auto kv : clustered_graph) {
     for (auto in : kv.second.in_nodes) {
       strings::StrAppend(&ret, name(*in), " -> ", name(kv.second), ";\n");
     }
   }
-  return strings::StrCat(ret, "}");
+  return strings::StrCat(ret, "} // end");
 }
 
 bool IsDeadSwitch(const Node* node) {
@@ -775,10 +797,11 @@ bool IsDeadSwitch(const Node* node) {
 }
 
 void FunctionalizeCond::CreateClusters() {
+  ClusterHandle source_cluster = ClusterHandle(Graph::kSourceId);
+  auto& source = clusters_.at(source_cluster);
+  std::deque<std::pair<ClusterHandle, std::deque<Node*>>> workqueue;
+  workqueue.push_back({source_cluster, {}});
   for (Node* node : graph_->nodes()) {
-    if (!node->IsOp()) {
-      continue;
-    }
     if (IsSwitch(node)) {
       switch_nodes_.insert(node);
     } else if (IsMerge(node)) {
@@ -786,6 +809,12 @@ void FunctionalizeCond::CreateClusters() {
     }
     ClusterHandle& cluster = clusters_.at(node).Get();
     cluster = ClusterHandle(node->id());
+    // Group all source clusters together.
+    if (node->IsSource() || node->in_edges().empty()) {
+      clusters_.at(node).Merge(&source);
+      source.Merge(&clusters_.at(node));
+      workqueue.front().second.push_back(node);
+    }
   }
 
   // If there are no Merge nodes, then terminate.
@@ -800,15 +829,117 @@ void FunctionalizeCond::CreateClusters() {
   // conservatively assuming all merge nodes become XlaIf nodes.
   clusters_.resize(clusters_.size() + merge_nodes_.size());
 
-  // Merge a cluster with its input, unless the input is a Switch node or
-  // the node is a Merge node.
-  for (const Node* node : graph_->nodes()) {
-    if (IsMerge(node) || IsSwitch(node) || !node->IsOp()) {
-      continue;
+  std::unordered_set<Node*> marked;
+  while (!workqueue.empty()) {
+    auto cluster_queue = workqueue.front();
+    VLOG(4) << "Cluster: " << cluster_queue.first << " Queue: {"
+            << str_util::Join(cluster_queue.second, ",",
+                              [](string* output, const Node* node) {
+                                strings::StrAppend(output, node->id());
+                              })
+            << "}";
+
+    UnionFind<ClusterHandle>& repr = clusters_.at(cluster_queue.first);
+    workqueue.pop_front();
+    std::deque<Node*> switch_nodes;
+    std::deque<Node*> merge_nodes;
+    std::unordered_set<Node*> cluster_member;
+    while (!cluster_queue.second.empty()) {
+      // Iterate node workqueue and flow forward merging all nodes reachable
+      // that are neither a Switch or a Merge and whose inputs are all part of
+      // the same cluster.
+      Node* cur = cluster_queue.second.front();
+      cluster_queue.second.pop_front();
+      if (marked.find(cur) != marked.end()) {
+        continue;
+      }
+      if (IsMerge(cur)) {
+        merge_nodes.push_back(cur);
+        marked.insert(cur);
+        continue;
+      }
+      if (IsSwitch(cur)) {
+        switch_nodes.push_back(cur);
+        marked.insert(cur);
+        continue;
+      }
+      clusters_.at(cur).Merge(&repr);
+      cluster_member.insert(cur);
+      for (Node* out : cur->out_nodes()) {
+        bool all_ancestors_in_cluster = true;
+        for (Node* in : out->in_nodes()) {
+          if (IsMerge(out)) {
+            merge_nodes.push_back(out);
+          }
+          if (IsSwitch(out)) {
+            switch_nodes.push_back(out);
+          }
+          if (cluster_member.find(in) == cluster_member.end()) {
+            all_ancestors_in_cluster = false;
+            break;
+          }
+        }
+        if (all_ancestors_in_cluster && out->IsOp()) {
+          cluster_queue.second.push_back(out);
+          marked.insert(cur);
+        }
+      }
     }
-    for (const Node* in : node->in_nodes()) {
-      if (in->IsOp() && !IsSwitch(in) && !IsMerge(in)) {
-        clusters_.at(node).Merge(&clusters_.at(in));
+
+    VLOG(4) << "Switches: {"
+            << str_util::Join(switch_nodes, ",",
+                              [](string* output, const Node* node) {
+                                strings::StrAppend(output, node->id());
+                              })
+            << "}";
+
+    // Merge Switch nodes with common predicate.
+    std::unordered_map<Node*, std::vector<Node*>> predicate_to_switch;
+    for (Node* node : switch_nodes) {
+      Node* tmp;
+      TF_CHECK_OK(node->input_node(1, &tmp));
+      predicate_to_switch[tmp].push_back(node);
+    }
+    for (auto kv : predicate_to_switch) {
+      Node* first = kv.second.front();
+      for (Node* switch_node : kv.second) {
+        clusters_.at(first).Merge(&clusters_.at(switch_node));
+      }
+    }
+
+    // Enqueue each edge of the switch node separately. That is, group all the
+    // nodes that are due to the true/false edge of the switch together and
+    // consider all nodes that only have a control dependency on the switch node
+    // separately. We want to group together all nodes that are part of the same
+    // branch, as these will be extracted into the `then` and `else` functions
+    // of the functional if. The ops due to control edges are different as they
+    // could be involved with either branch and merging them here could result
+    // in invalid graphs.
+    for (auto kv : predicate_to_switch) {
+      ClusterHandle none = ClusterHandle(-1);
+      ClusterHandle first[2] = {none, none};
+      std::deque<Node*>* queue[2];
+      for (auto switch_node : kv.second) {
+        for (const auto e : switch_node->out_edges()) {
+          if (IsSwitch(e->dst()) || IsMerge(e->dst())) {
+            continue;
+          }
+          // Control edges are enqueued on their own.
+          if (e->IsControlEdge()) {
+            workqueue.push_back({Representative(e->dst()), {e->dst()}});
+            continue;
+          }
+          // Combine all outputs of the same output port of a switch cluster
+          // into the same workqueue entry.
+          if (first[e->src_output()] == none) {
+            ClusterHandle repr = Representative(e->dst());
+            first[e->src_output()] = repr;
+            workqueue.push_back({repr, {}});
+            queue[e->src_output()] = &workqueue.back().second;
+          }
+          clusters_.at(first[e->src_output()]).Merge(&clusters_.at(e->dst()));
+          queue[e->src_output()]->push_back(e->dst());
+        }
       }
     }
   }
@@ -862,7 +993,7 @@ void FunctionalizeCond::CreateClusteredGraph() {
     for (const Node* in : node->in_nodes()) {
       ClusterHandle other_repr = Representative(in);
       // Skip source, sink and internal edges.
-      if (!in->IsOp() || other_repr == repr) {
+      if (other_repr == repr) {
         continue;
       }
       Cluster& cluster_node_in = clustered_graph_[other_repr];
@@ -873,7 +1004,7 @@ void FunctionalizeCond::CreateClusteredGraph() {
     for (const Node* out : node->out_nodes()) {
       ClusterHandle other_repr = Representative(out);
       // Skip source, sink and internal edges.
-      if (!out->IsOp() || other_repr == repr) {
+      if (other_repr == repr) {
         continue;
       }
       Cluster& cluster_node_out = clustered_graph_[other_repr];
@@ -883,6 +1014,7 @@ void FunctionalizeCond::CreateClusteredGraph() {
     }
     return cluster_node;
   };
+  update_cluster_for_node(graph_->source_node());
   for (Node* node : switch_nodes_) {
     update_cluster_for_node(node).switch_nodes.insert(node);
   }
@@ -890,74 +1022,64 @@ void FunctionalizeCond::CreateClusteredGraph() {
     update_cluster_for_node(node).merge_nodes.insert(node);
   }
 
-  // Merge Switch nodes with common predicate.
-  std::unordered_map<Node*, std::vector<Node*>> predicate_to_switch;
-  for (Node* node : switch_nodes_) {
-    Node* tmp;
-    TF_CHECK_OK(node->input_node(1, &tmp));
-    predicate_to_switch[tmp].push_back(node);
-  }
-  for (auto kv : predicate_to_switch) {
-    Cluster& first = clustered_graph_.at(Representative(kv.second.front()));
-    for (Node* switch_node : kv.second) {
-      ClusterHandle handle = Representative(switch_node);
-      Cluster& cluster = clustered_graph_.at(handle);
-      ContractEdge(&cluster, &first, /*remove_from_graph=*/true);
-    }
-  }
-
-  // Merge Merge nodes with common input together.
-  for (Node* node : merge_nodes_) {
-    Cluster& cluster = clustered_graph_.at(Representative(node));
-    for (const Node* in : node->in_nodes()) {
-      if (!in->IsOp()) {
-        continue;
-      }
-      Cluster& cluster_node_in = clustered_graph_.at(Representative(in));
-      // ContractEdge can modify out_nodes of cluster_node_in, so traverse
-      // over out_nodes assuming it does.
-      for (auto it = cluster_node_in.out_nodes.begin();
-           it != cluster_node_in.out_nodes.end();) {
-        if (!(*it)->merge_nodes.empty()) {
-          ContractEdge(*it++, &cluster, /*remove_from_graph=*/true);
-        } else {
-          ++it;
-        }
-      }
-    }
-  }
-
   VLOG(3) << "Graph with clusters: " << DebugString(*graph_, &clusters_);
   VLOG(3) << "ClusteredGraph: " << DebugString(clustered_graph_);
 }
 
-gtl::optional<FunctionalizeCond::Cluster*> FunctionalizeCond::GetSwitchCluster(
-    const Cluster& merge_cluster) {
-  VLOG(3) << "GetSwitchCluster for " << merge_cluster.representative;
-  gtl::optional<Cluster*> switch_cluster;
-  if (merge_cluster.in_nodes.size() > 2) {
-    return gtl::nullopt;
+gtl::optional<FunctionalizeCond::Cluster*>
+FunctionalizeCond::CreateCorrespondingMergeCluster(Cluster* switch_cluster) {
+  VLOG(3) << "CreateCorrespondingMergeCluster for "
+          << switch_cluster->representative;
+  std::unordered_set<Cluster*> merges;
+  std::unordered_set<Cluster*> dominated;
+  dominated.insert(switch_cluster);
+  std::deque<Cluster*> queue;
+  auto enqueue_or_update_merge = [this, &queue, &merges](Cluster* c) {
+    if (c->merge_nodes.empty()) {
+      queue.push_back(c);
+    } else {
+      merges.insert(c);
+    }
+  };
+  // Enqueue all the outputs of the switch cluster in the workqueue.
+  for (auto* out : switch_cluster->out_nodes) {
+    enqueue_or_update_merge(out);
   }
-  for (Cluster* in : merge_cluster.in_nodes) {
-    Cluster* cluster = in;
-    if (in->switch_nodes.empty()) {
-      if (in->in_nodes.size() != 1) {
+  std::unordered_set<Cluster*> visited;
+  while (!queue.empty()) {
+    Cluster* cur = queue.front();
+    queue.pop_front();
+    if (visited.find(cur) != visited.end()) {
+      continue;
+    }
+    visited.insert(cur);
+    // Ensure all inputs to the current node are in the dominated set.
+    for (Cluster* in : cur->in_nodes) {
+      if (dominated.find(in) == dominated.end()) {
         return gtl::nullopt;
       }
-      // There is only a single `in` cluster.
-      cluster = *in->in_nodes.begin();
     }
-    if (cluster->switch_nodes.empty()) {
-      return gtl::nullopt;
-    }
-
-    if (switch_cluster.has_value() && *switch_cluster != cluster) {
-      return gtl::nullopt;
-    } else {
-      switch_cluster = cluster;
+    for (Cluster* out : cur->out_nodes) {
+      // No switch nodes beyond the entry one is expected.
+      if (!out->switch_nodes.empty()) {
+        return gtl::nullopt;
+      }
+      enqueue_or_update_merge(out);
     }
   }
-  return switch_cluster;
+  // Return if there are no merge nodes.
+  if (merges.empty()) {
+    return gtl::nullopt;
+  }
+  auto it = merges.begin();
+  Cluster* merge_cluster = *it;
+  for (++it; it != merges.end(); ++it) {
+    ContractEdge(*it, merge_cluster);
+  }
+
+  // TODO(jpienaar): Clean up graph, merging nodes.
+
+  return merge_cluster;
 }
 
 xla::StatusOr<FunctionalizeCond::CondArgs> FunctionalizeCond::DetermineCondArgs(
@@ -1201,11 +1323,11 @@ void FunctionalizeCond::RemoveMergeNodes(Cluster* merge_cluster) {
   }
 }
 
-Status FunctionalizeCond::RemoveTrivialMerge(Cluster* merge_cluster) {
-  Cluster* switch_cluster = *merge_cluster->in_nodes.begin();
-  if (switch_cluster->switch_nodes.empty()) {
+Status FunctionalizeCond::RemoveTrivialSwitch(Cluster* switch_cluster) {
+  Cluster* merge_cluster = *switch_cluster->out_nodes.begin();
+  if (merge_cluster->merge_nodes.empty()) {
     return errors::FailedPrecondition(
-        "Not a trivial merge: no Switch node feeding into Merge node");
+        "Not a trivial switch: no Merge node feeding into Switch node");
   }
 
   for (auto it = merge_cluster->merge_nodes.begin();
@@ -1232,17 +1354,25 @@ Status FunctionalizeCond::RemoveTrivialMerge(Cluster* merge_cluster) {
   return Status::OK();
 }
 
-Status FunctionalizeCond::ConvertMergeToXlaIf(Cluster* merge_cluster) {
-  VLOG(1) << "ConvertMergeToXlaIf for " << merge_cluster->representative;
-  gtl::optional<Cluster*> switch_cluster = GetSwitchCluster(*merge_cluster);
-  if (!switch_cluster.has_value()) {
+Status FunctionalizeCond::ConvertCorrespondingMergeToXlaIf(
+    Cluster* switch_cluster) {
+  VLOG(1) << "ConvertMergeToXlaIf for " << switch_cluster->representative;
+  gtl::optional<Cluster*> maybe_merge =
+      CreateCorrespondingMergeCluster(switch_cluster);
+  if (!maybe_merge.has_value()) {
     return errors::FailedPrecondition(
-        "Merge cluster was not part of a simple conditional in the clustered "
-        "graph. Graph nodes in merge cluster ",
-        NodesToString(merge_cluster->merge_nodes));
+        "Switch cluster was not part of a simple conditional in the clustered "
+        "graph. Graph nodes in switch cluster ",
+        NodesToString(switch_cluster->switch_nodes));
+  }
+  Cluster* merge_cluster = *maybe_merge;
+  if (merge_cluster->merge_nodes.empty()) {
+    return errors::Internal(
+        "Merge node in clustered graph contains no merge nodes: ",
+        merge_cluster->representative.ToString());
   }
   TF_ASSIGN_OR_RETURN(auto cond_args,
-                      DetermineCondArgs(*merge_cluster, **switch_cluster));
+                      DetermineCondArgs(*merge_cluster, *switch_cluster));
 
   // Sort the outputs by ID to produce more stable output.
   std::vector<Node*> outputs(merge_cluster->merge_nodes.begin(),
@@ -1258,7 +1388,7 @@ Status FunctionalizeCond::ConvertMergeToXlaIf(Cluster* merge_cluster) {
   // Remove the old nodes from the graph_ and contract the edges of the
   // clustered graph.
   for (auto in : merge_cluster->in_nodes) {
-    if (in != *switch_cluster) {
+    if (in != switch_cluster) {
       RemoveClusterNodes(in);
     }
   }
@@ -1266,23 +1396,20 @@ Status FunctionalizeCond::ConvertMergeToXlaIf(Cluster* merge_cluster) {
   RemoveUnusedArgs(cond_args.args);
   auto in_nodes = merge_cluster->in_nodes;
   for (auto it = in_nodes.begin(); it != in_nodes.end();) {
-    ContractEdge(*it++, merge_cluster);
+    ContractEdge(*it++, switch_cluster);
   }
-  ContractEdge(*switch_cluster, merge_cluster);
-  clusters_[if_node].Get() = ClusterHandle(merge_cluster->representative);
+  ContractEdge(merge_cluster, switch_cluster);
+  clusters_[if_node].Get() = ClusterHandle(switch_cluster->representative);
 
   return Status::OK();
 }
 
 std::vector<std::pair<int, FunctionalizeCond::Cluster*>>
-FunctionalizeCond::SortedMergeNodes() {
+FunctionalizeCond::SortedSwitchNodes() {
   VLOG(2) << "ProcessClusteredGraph";
   std::stack<std::pair<int, Cluster*>> stack;
-  for (auto& c : clustered_graph_) {
-    if (c.second.in_nodes.empty()) {
-      stack.push({0, &c.second});
-    }
-  }
+  // Initialize with the source node.
+  stack.push({0, &clustered_graph_[Representative(graph_->source_node())]});
 
   // Perform a depth-first traversal of the clustered graph computing the
   // switch-merge depth.
@@ -1300,10 +1427,10 @@ FunctionalizeCond::SortedMergeNodes() {
 
     size_t new_depth = depth;
     if (!n->merge_nodes.empty()) {
-      queue.emplace_back(depth, n);
       --new_depth;
     }
     if (!n->switch_nodes.empty()) {
+      queue.emplace_back(depth, n);
       ++new_depth;
     }
     for (Cluster* e : n->out_nodes) {
@@ -1333,25 +1460,30 @@ Status FunctionalizeCond::Functionalize(Graph* graph,
   }
   fc.CreateClusteredGraph();
 
-  auto queue = fc.SortedMergeNodes();
+  auto queue = fc.SortedSwitchNodes();
   for (auto it = queue.begin(); it != queue.end();) {
-    Cluster* merge_cluster = (*it).second;
+    Cluster* switch_cluster = (*it).second;
     ++it;
-    if (merge_cluster->in_nodes.size() == 1) {
-      TF_RETURN_IF_ERROR(fc.RemoveTrivialMerge(merge_cluster));
+    if (switch_cluster->out_nodes.size() == 1) {
+      TF_RETURN_IF_ERROR(fc.RemoveTrivialSwitch(switch_cluster));
     } else {
-      TF_RETURN_IF_ERROR(fc.ConvertMergeToXlaIf(merge_cluster));
+      TF_RETURN_IF_ERROR(fc.ConvertCorrespondingMergeToXlaIf(switch_cluster));
     }
 
-    // Contract newly Merge free merge_cluster with incoming nodes without
+    // Contract newly Switch free switch_cluster with outgoing nodes without
     // Switch or Merge nodes.
-    std::vector<Cluster*> in_nodes(merge_cluster->in_nodes.begin(),
-                                   merge_cluster->in_nodes.end());
-    for (auto in : in_nodes) {
-      if (in->merge_nodes.empty() && in->switch_nodes.empty()) {
-        fc.ContractEdge(in, merge_cluster);
+    for (auto& nodes : {switch_cluster->out_nodes, switch_cluster->in_nodes}) {
+      std::vector<Cluster*> copy_nodes(nodes.begin(), nodes.end());
+      for (auto* node : copy_nodes) {
+        if (node->merge_nodes.empty() && node->switch_nodes.empty()) {
+          fc.ContractEdge(node, switch_cluster);
+        }
       }
     }
+
+    VLOG(3) << "Graph with clusters: "
+            << DebugString(*fc.graph_, &fc.clusters_);
+    VLOG(3) << "ClusteredGraph: " << DebugString(fc.clustered_graph_);
   }
 
   if (!fc.switch_nodes_.empty()) {
diff --git a/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..82b3b46a2f1e97001d1e0c6b993ec243170bc7d8
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/g3doc/cpu_supported_ops.md
@@ -0,0 +1,242 @@
+**Supported operators for device: XLA_CPU_JIT**
+
+Operator                              | Type Constraint
+------------------------------------- | ---------------
+`Abs`                                 | `T={double,float,int32,int64}`
+`Acosh`                               | `T={complex64,double,float}`
+`Add`                                 | `T={complex64,double,float,int32,int64}`
+`AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`All`                                 | `Tidx={int32,int64}`
+`Angle`                               | `Tout={double,float}`<br>`T={complex64}`
+`Any`                                 | `Tidx={int32,int64}`
+`ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={float}`
+`ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asinh`                               | `T={complex64,double,float}`
+`AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan2`                               | `T={double,float}`
+`Atanh`                               | `T={complex64,double,float}`
+`AvgPool`                             | `T={double,float}`
+`AvgPool3D`                           | `T={double,float}`
+`AvgPool3DGrad`                       | `T={double,float}`
+`AvgPoolGrad`                         | `T={double,float}`
+`BatchMatMul`                         | `T={complex64,double,float,int32}`
+`BatchToSpace`                        | `Tidx={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BatchToSpaceND`                      | `Tcrops={int32,int64}`<br>`Tblock_shape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAdd`                             | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddGrad`                         | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddV1`                           | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BitwiseAnd`                          | `T={int32,int64,uint32,uint64}`
+`BitwiseOr`                           | `T={int32,int64,uint32,uint64}`
+`BroadcastArgs`                       | `T={int32,int64}`
+`BroadcastGradientArgs`               | `T={int32,int64}`
+`Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Ceil`                                | `T={double,float}`
+`Cholesky`                            | `T={complex64,double,float}`
+`Complex`                             | `Tout={complex64}`<br>`T={double,float}`
+`ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
+`Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ConcatOffset`                        |
+`ConcatV2`                            | `Tidx={int32}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Conj`                                | `T={complex64}`
+`Const`                               | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ControlTrigger`                      |
+`Conv2D`                              | `T={float}`
+`Conv2DBackpropFilter`                | `T={float}`
+`Conv2DBackpropInput`                 | `T={float}`
+`Conv3D`                              | `T={double,float}`
+`Conv3DBackpropFilterV2`              | `T={double,float}`
+`Conv3DBackpropInputV2`               | `T={double,float}`
+`Cos`                                 | `T={complex64,double,float}`
+`Cosh`                                | `T={complex64,double,float}`
+`Cross`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Cumprod`                             | `Tidx={int32,int64}`<br>`T={float}`
+`Cumsum`                              | `Tidx={int32,int64}`<br>`T={float}`
+`DepthToSpace`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`DepthwiseConv2dNative`               | `T={double,float}`
+`DepthwiseConv2dNativeBackpropFilter` | `T={double,float}`
+`DepthwiseConv2dNativeBackpropInput`  | `T={double,float}`
+`Diag`                                | `T={complex64,double,float,int32,int64}`
+`DiagPart`                            | `T={complex64,double,float,int32,int64}`
+`Div`                                 | `T={complex64,double,float,int32,int64}`
+`DynamicStitch`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Elu`                                 | `T={double,float}`
+`EluGrad`                             | `T={double,float}`
+`Equal`                               | `T={bool,complex64,double,float,int32,int64}`
+`Exp`                                 | `T={complex64,double,float}`
+`ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Expm1`                               | `T={complex64,double,float}`
+`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Floor`                               | `T={double,float}`
+`FloorDiv`                            | `T={complex64,double,float,int32,int64}`
+`FloorMod`                            | `T={double,float,int32,int64}`
+`FusedBatchNorm`                      | `T={float}`
+`FusedBatchNormGrad`                  | `T={float}`
+`FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
+`FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
+`Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
+`GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Imag`                                | `Tout={double,float}`<br>`T={complex64}`
+`Inv`                                 | `T={complex64,double,float,int32,int64}`
+`Invert`                              | `T={int32,int64,uint32,uint64}`
+`InvertPermutation`                   | `T={int32}`
+`IsFinite`                            | `T={double,float}`
+`IsInf`                               | `T={double,float}`
+`IsNan`                               | `T={double,float}`
+`L2Loss`                              | `T={double,float}`
+`LRN`                                 | `T={float}`
+`LRNGrad`                             | `T={float}`
+`LeftShift`                           | `T={int32,int64,uint32,uint64}`
+`Less`                                | `T={double,float,int32,int64,uint32,uint64}`
+`LessEqual`                           | `T={double,float,int32,int64,uint32,uint64}`
+`LinSpace`                            | `Tidx={int32,int64}`<br>`T={double,float}`
+`Log`                                 | `T={complex64,double,float}`
+`Log1p`                               | `T={complex64,double,float}`
+`LogSoftmax`                          | `T={double,float}`
+`LogicalAnd`                          |
+`LogicalNot`                          |
+`LogicalOr`                           |
+`MatMul`                              | `T={complex64,double,float}`
+`MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`MaxPool`                             | `T={double,float,int32,int64}`
+`MaxPool3D`                           | `T={float}`
+`MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
+`MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`Maximum`                             | `T={double,float,int32,int64}`
+`Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Minimum`                             | `T={double,float,int32,int64}`
+`MirrorPad`                           | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Mod`                                 | `T={double,float,int32,int64}`
+`Mul`                                 | `T={complex64,double,float,int32,int64}`
+`Multinomial`                         | `output_dtype={int32,int64}`<br>`T={double,float,int32,int64,uint32,uint64}`
+`Neg`                                 | `T={complex64,double,float,int32,int64}`
+`NoOp`                                |
+`NotEqual`                            | `T={bool,complex64,double,float,int32,int64}`
+`OneHot`                              | `TI={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`OnesLike`                            | `T={bool,complex64,double,float,int32,int64}`
+`Pack`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pad`                                 | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`PadV2`                               | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ParallelDynamicStitch`               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pow`                                 | `T={complex64,double,float,int32,int64}`
+`PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`QuantizeAndDequantizeV2`             | `T={double,float}`
+`RandomStandardNormal`                | `dtype={float}`
+`RandomUniform`                       | `T={int32,int64}`<br>`dtype={double,float}`
+`RandomUniformInt`                    | `T={int32,int64}`<br>`Tout={int32,int64}`
+`Range`                               | `Tidx={double,float,int32,int64}`
+`Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Real`                                | `Tout={double,float}`<br>`T={complex64}`
+`RealDiv`                             | `T={complex64,double,float,int32,int64}`
+`Reciprocal`                          | `T={complex64,double,float,int32,int64}`
+`ReciprocalGrad`                      | `T={complex64,double,float}`
+`Relu`                                | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
+`ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
+`Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceApplyAdagrad`                | `T={double,float}`
+`ResourceApplyAdam`                   | `T={double,float}`
+`ResourceApplyFtrl`                   | `T={double,float}`
+`ResourceApplyFtrlV2`                 | `T={double,float}`
+`ResourceApplyGradientDescent`        | `T={double,float}`
+`ResourceApplyMomentum`               | `T={double,float}`
+`ResourceApplyRMSProp`                | `T={double,float}`
+`ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
+`RightShift`                          | `T={int32,int64,uint32,uint64}`
+`Rint`                                | `T={double,float}`
+`Round`                               | `T={complex64,double,float,int32,int64}`
+`Rsqrt`                               | `T={complex64,double,float}`
+`RsqrtGrad`                           | `T={complex64,double,float}`
+`Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Selu`                                | `T={double,float}`
+`SeluGrad`                            | `T={double,float}`
+`Shape`                               | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ShapeN`                              | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sigmoid`                             | `T={complex64,double,float}`
+`SigmoidGrad`                         | `T={complex64,double,float}`
+`Sign`                                | `T={complex64,double,float,int32,int64}`
+`Sin`                                 | `T={complex64,double,float}`
+`Sinh`                                | `T={complex64,double,float}`
+`Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Softmax`                             | `T={double,float}`
+`SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
+`Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftplusGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Softsign`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftsignGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`SpaceToBatch`                        | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToBatchND`                      | `Tblock_shape={int32,int64}`<br>`Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToDepth`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SparseMatMul`                        | `Tb={float}`<br>`Ta={float}`
+`SparseSoftmaxCrossEntropyWithLogits` | `Tlabels={int32,int64}`<br>`T={double,float}`
+`Split`                               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SplitV`                              | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sqrt`                                | `T={complex64,double,float}`
+`SqrtGrad`                            | `T={complex64,double,float}`
+`Square`                              | `T={complex64,double,float,int32,int64}`
+`SquaredDifference`                   | `T={complex64,double,float,int32,int64}`
+`Squeeze`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackCloseV2`                        |
+`StackPopV2`                          | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackPushV2`                         | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackV2`                             | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StatelessRandomNormal`               | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StatelessRandomUniform`              | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StopGradient`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSlice`                        | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSliceGrad`                    | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sub`                                 | `T={complex64,double,float,int32,int64}`
+`Sum`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`SymbolicGradient`                    | `Tout={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tan`                                 | `T={complex64,double,float,int32,int64}`
+`Tanh`                                | `T={complex64,double,float}`
+`TanhGrad`                            | `T={complex64,double,float}`
+`TensorArrayCloseV3`                  |
+`TensorArrayConcatV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGatherV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGradV3`                   |
+`TensorArrayReadV3`                   | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayScatterV3`                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArraySizeV3`                   |
+`TensorArraySplitV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayV3`                       | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayWriteV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tile`                                | `Tmultiples={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Transpose`                           | `Tperm={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TruncateDiv`                         | `T={complex64,double,float,int32,int64}`
+`TruncateMod`                         | `T={double,float,int32,int64}`
+`TruncatedNormal`                     | `T={int32,int64}`<br>`dtype={double,float}`
+`Unpack`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`UnsortedSegmentSum`                  | `Tnumsegments={int32,int64}`<br>`Tindices={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`VarIsInitializedOp`                  |
+`VariableShape`                       | `out_type={int32,int64}`
+`XlaWhile`                            | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`ZerosLike`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Arg`                                | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`_ArrayToList`                        | `out_types={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_ListToArray`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Retval`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLARecv`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLASend`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+
+To regenerate this table, run:
+
+```shell
+bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops --device=XLA_CPU_JIT
+```
diff --git a/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..d4b7621ad2858fe17e93d292dd807e4f7c1c336b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/g3doc/gpu_supported_ops.md
@@ -0,0 +1,238 @@
+**Supported operators for device: XLA_GPU_JIT**
+
+Operator                              | Type Constraint
+------------------------------------- | ---------------
+`Abs`                                 | `T={double,float,int32,int64}`
+`Acosh`                               | `T={complex64,double,float}`
+`Add`                                 | `T={complex64,double,float,int32,int64}`
+`AddN`                                | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`All`                                 | `Tidx={int32,int64}`
+`Angle`                               | `Tout={double,float}`<br>`T={complex64}`
+`Any`                                 | `Tidx={int32,int64}`
+`ApproximateEqual`                    | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMax`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`ArgMin`                              | `Tidx={int32,int64}`<br>`output_type={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Asinh`                               | `T={complex64,double,float}`
+`AssignAddVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignSubVariableOp`                 | `dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`AssignVariableOp`                    | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Atan2`                               | `T={double,float}`
+`Atanh`                               | `T={complex64,double,float}`
+`AvgPool`                             | `T={double,float}`
+`AvgPool3D`                           | `T={double,float}`
+`AvgPool3DGrad`                       | `T={double,float}`
+`AvgPoolGrad`                         | `T={double,float}`
+`BatchMatMul`                         | `T={complex64,double,float,int32}`
+`BatchToSpace`                        | `Tidx={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BatchToSpaceND`                      | `Tcrops={int32,int64}`<br>`Tblock_shape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAdd`                             | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddGrad`                         | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BiasAddV1`                           | `T={complex64,double,float,int32,int64,uint32,uint64}`
+`BitwiseAnd`                          | `T={int32,int64,uint32,uint64}`
+`BitwiseOr`                           | `T={int32,int64,uint32,uint64}`
+`BroadcastArgs`                       | `T={int32,int64}`
+`BroadcastGradientArgs`               | `T={int32,int64}`
+`Cast`                                | `DstT={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`SrcT={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Ceil`                                | `T={double,float}`
+`Cholesky`                            | `T={complex64,double,float}`
+`Complex`                             | `Tout={complex64}`<br>`T={double,float}`
+`ComplexAbs`                          | `Tout={double,float}`<br>`T={complex64}`
+`Concat`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ConcatOffset`                        |
+`ConcatV2`                            | `Tidx={int32}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Conj`                                | `T={complex64}`
+`Const`                               | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ControlTrigger`                      |
+`Conv2D`                              | `T={float}`
+`Conv2DBackpropFilter`                | `T={float}`
+`Conv2DBackpropInput`                 | `T={float}`
+`Conv3D`                              | `T={double,float}`
+`Conv3DBackpropFilterV2`              | `T={double,float}`
+`Conv3DBackpropInputV2`               | `T={double,float}`
+`Cos`                                 | `T={complex64,double,float}`
+`Cosh`                                | `T={complex64,double,float}`
+`Cross`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Cumprod`                             | `Tidx={int32,int64}`<br>`T={float}`
+`Cumsum`                              | `Tidx={int32,int64}`<br>`T={float}`
+`DepthToSpace`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`DepthwiseConv2dNative`               | `T={double,float}`
+`DepthwiseConv2dNativeBackpropFilter` | `T={double,float}`
+`DepthwiseConv2dNativeBackpropInput`  | `T={double,float}`
+`Diag`                                | `T={complex64,double,float,int32,int64}`
+`DiagPart`                            | `T={complex64,double,float,int32,int64}`
+`Div`                                 | `T={complex64,double,float,int32,int64}`
+`DynamicStitch`                       | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Elu`                                 | `T={double,float}`
+`EluGrad`                             | `T={double,float}`
+`Equal`                               | `T={bool,complex64,double,float,int32,int64}`
+`Exp`                                 | `T={complex64,double,float}`
+`ExpandDims`                          | `Tdim={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Expm1`                               | `T={complex64,double,float}`
+`Fill`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Floor`                               | `T={double,float}`
+`FloorDiv`                            | `T={complex64,double,float,int32,int64}`
+`FloorMod`                            | `T={double,float,int32,int64}`
+`FusedBatchNorm`                      | `T={float}`
+`FusedBatchNormGrad`                  | `T={float}`
+`FusedBatchNormGradV2`                | `U={float}`<br>`T={float}`
+`FusedBatchNormV2`                    | `U={float}`<br>`T={float}`
+`Gather`                              | `Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`GatherV2`                            | `Taxis={int32,int64}`<br>`Tindices={int32,int64}`<br>`Tparams={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Greater`                             | `T={double,float,int32,int64,uint32,uint64}`
+`GreaterEqual`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Identity`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`IdentityN`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Imag`                                | `Tout={double,float}`<br>`T={complex64}`
+`Inv`                                 | `T={complex64,double,float,int32,int64}`
+`Invert`                              | `T={int32,int64,uint32,uint64}`
+`InvertPermutation`                   | `T={int32}`
+`IsFinite`                            | `T={double,float}`
+`IsInf`                               | `T={double,float}`
+`IsNan`                               | `T={double,float}`
+`L2Loss`                              | `T={double,float}`
+`LRN`                                 | `T={float}`
+`LRNGrad`                             | `T={float}`
+`LeftShift`                           | `T={int32,int64,uint32,uint64}`
+`Less`                                | `T={double,float,int32,int64,uint32,uint64}`
+`LessEqual`                           | `T={double,float,int32,int64,uint32,uint64}`
+`LinSpace`                            | `Tidx={int32,int64}`<br>`T={double,float}`
+`Log`                                 | `T={complex64,double,float}`
+`Log1p`                               | `T={complex64,double,float}`
+`LogSoftmax`                          | `T={double,float}`
+`LogicalAnd`                          |
+`LogicalNot`                          |
+`LogicalOr`                           |
+`MatMul`                              | `T={complex64,double,float}`
+`MatrixDiag`                          | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`MatrixDiagPart`                      | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Max`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`MaxPool`                             | `T={double,float,int32,int64}`
+`MaxPool3D`                           | `T={float}`
+`MaxPool3DGrad`                       | `TInput={float}`<br>`T={float}`
+`MaxPoolGrad`                         | `T={double,float,int32,int64,uint32,uint64}`
+`Maximum`                             | `T={double,float,int32,int64}`
+`Mean`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Min`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`Minimum`                             | `T={double,float,int32,int64}`
+`MirrorPad`                           | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Mod`                                 | `T={double,float,int32,int64}`
+`Mul`                                 | `T={complex64,double,float,int32,int64}`
+`Multinomial`                         | `output_dtype={int32,int64}`<br>`T={double,float,int32,int64,uint32,uint64}`
+`Neg`                                 | `T={complex64,double,float,int32,int64}`
+`NoOp`                                |
+`NotEqual`                            | `T={bool,complex64,double,float,int32,int64}`
+`OneHot`                              | `TI={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`OnesLike`                            | `T={bool,complex64,double,float,int32,int64}`
+`Pack`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pad`                                 | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`PadV2`                               | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ParallelDynamicStitch`               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Pow`                                 | `T={complex64,double,float,int32,int64}`
+`PreventGradient`                     | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Prod`                                | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`QuantizeAndDequantizeV2`             | `T={double,float}`
+`Range`                               | `Tidx={double,float,int32,int64}`
+`Rank`                                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ReadVariableOp`                      | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Real`                                | `Tout={double,float}`<br>`T={complex64}`
+`RealDiv`                             | `T={complex64,double,float,int32,int64}`
+`Reciprocal`                          | `T={complex64,double,float,int32,int64}`
+`ReciprocalGrad`                      | `T={complex64,double,float}`
+`Relu`                                | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6`                               | `T={double,float,int32,int64,uint32,uint64}`
+`Relu6Grad`                           | `T={double,float,int32,int64,uint32,uint64}`
+`ReluGrad`                            | `T={double,float,int32,int64,uint32,uint64}`
+`Reshape`                             | `Tshape={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceApplyAdagrad`                | `T={double,float}`
+`ResourceApplyAdam`                   | `T={double,float}`
+`ResourceApplyFtrl`                   | `T={double,float}`
+`ResourceApplyFtrlV2`                 | `T={double,float}`
+`ResourceApplyGradientDescent`        | `T={double,float}`
+`ResourceApplyMomentum`               | `T={double,float}`
+`ResourceApplyRMSProp`                | `T={double,float}`
+`ResourceGather`                      | `Tindices={int32,int64}`<br>`dtype={complex64,double,float,int32,int64,uint32,uint64}`
+`ResourceStridedSliceAssign`          | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Reverse`                             | `T={bool,complex64,double,float,int32,int64}`
+`ReverseV2`                           | `T={bool,complex64,double,float,int32,int64}`<br>`Tidx={int32,int64}`
+`RightShift`                          | `T={int32,int64,uint32,uint64}`
+`Rint`                                | `T={double,float}`
+`Round`                               | `T={complex64,double,float,int32,int64}`
+`Rsqrt`                               | `T={complex64,double,float}`
+`RsqrtGrad`                           | `T={complex64,double,float}`
+`Select`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Selu`                                | `T={double,float}`
+`SeluGrad`                            | `T={double,float}`
+`Shape`                               | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`ShapeN`                              | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sigmoid`                             | `T={complex64,double,float}`
+`SigmoidGrad`                         | `T={complex64,double,float}`
+`Sign`                                | `T={complex64,double,float,int32,int64}`
+`Sin`                                 | `T={complex64,double,float}`
+`Sinh`                                | `T={complex64,double,float}`
+`Size`                                | `out_type={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Slice`                               | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Softmax`                             | `T={double,float}`
+`SoftmaxCrossEntropyWithLogits`       | `T={double,float}`
+`Softplus`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftplusGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`Softsign`                            | `T={double,float,int32,int64,uint32,uint64}`
+`SoftsignGrad`                        | `T={double,float,int32,int64,uint32,uint64}`
+`SpaceToBatch`                        | `Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToBatchND`                      | `Tblock_shape={int32,int64}`<br>`Tpaddings={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SpaceToDepth`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SparseMatMul`                        | `Tb={float}`<br>`Ta={float}`
+`SparseSoftmaxCrossEntropyWithLogits` | `Tlabels={int32,int64}`<br>`T={double,float}`
+`Split`                               | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`SplitV`                              | `Tlen={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sqrt`                                | `T={complex64,double,float}`
+`SqrtGrad`                            | `T={complex64,double,float}`
+`Square`                              | `T={complex64,double,float,int32,int64}`
+`SquaredDifference`                   | `T={complex64,double,float,int32,int64}`
+`Squeeze`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackCloseV2`                        |
+`StackPopV2`                          | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackPushV2`                         | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StackV2`                             | `elem_type={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StatelessRandomNormal`               | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StatelessRandomUniform`              | `Tseed={int32}`<br>`T={int32,int64}`<br>`dtype={float}`
+`StopGradient`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSlice`                        | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`StridedSliceGrad`                    | `Index={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Sub`                                 | `T={complex64,double,float,int32,int64}`
+`Sum`                                 | `Tidx={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`SymbolicGradient`                    | `Tout={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tan`                                 | `T={complex64,double,float,int32,int64}`
+`Tanh`                                | `T={complex64,double,float}`
+`TanhGrad`                            | `T={complex64,double,float}`
+`TensorArrayCloseV3`                  |
+`TensorArrayConcatV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGatherV3`                 | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayGradV3`                   |
+`TensorArrayReadV3`                   | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayScatterV3`                | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArraySizeV3`                   |
+`TensorArraySplitV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayV3`                       | `dtype={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TensorArrayWriteV3`                  | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Tile`                                | `Tmultiples={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`Transpose`                           | `Tperm={int32,int64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`TruncateDiv`                         | `T={complex64,double,float,int32,int64}`
+`TruncateMod`                         | `T={double,float,int32,int64}`
+`Unpack`                              | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`UnsortedSegmentSum`                  | `Tnumsegments={int32,int64}`<br>`Tindices={int32,int64}`<br>`T={complex64,double,float,int32,int64,uint32,uint64}`
+`VarIsInitializedOp`                  |
+`VariableShape`                       | `out_type={int32,int64}`
+`XlaWhile`                            | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`ZerosLike`                           | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Arg`                                | `T={bool,complex64,double,float,int32,int64,resource,uint32,uint64}`
+`_ArrayToList`                        | `out_types={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_ListToArray`                        | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`<br>`Tin={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_Retval`                             | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLARecv`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+`_XLASend`                            | `T={bool,complex64,double,float,int32,int64,uint32,uint64}`
+
+To regenerate this table, run:
+
+```shell
+bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops --device=XLA_GPU_JIT
+```
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 2b43e313eb42c288b891f97c0b6cd3cacdc77711..3e24cf042e17ad4e212d82ac4f24fec06a6c780f 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -18,6 +18,8 @@ tf_kernel_library(
         "bias_ops.cc",
         "binary_ops.cc",
         "cast_op.cc",
+        "categorical_op.cc",
+        "cholesky_op.cc",
         "concat_op.cc",
         "const_op.cc",
         "conv_ops.cc",
@@ -33,6 +35,7 @@ tf_kernel_library(
         "gather_op.cc",
         "gather_op_helpers.h",
         "identity_op.cc",
+        "image_resize_ops.cc",
         "index_ops.cc",
         "l2loss_op.cc",
         "lrn_ops.cc",
@@ -52,17 +55,20 @@ tf_kernel_library(
         "reshape_op.cc",
         "retval_op.cc",
         "reverse_op.cc",
+        "scan_ops.cc",
         "segment_reduction_ops.cc",
         "select_op.cc",
         "sendrecv_ops.cc",
         "sequence_ops.cc",
         "shape_op.cc",
+        "shape_util.cc",
         "slice_op.cc",
         "softmax_op.cc",
         "spacetobatch_op.cc",
         "spacetodepth_op.cc",
         "split_op.cc",
         "stack_ops.cc",
+        "stateless_random_ops.cc",
         "strided_slice_op.cc",
         "tensor_array_ops.cc",
         "tile_ops.cc",
@@ -75,12 +81,17 @@ tf_kernel_library(
     hdrs = [
         "gather_op.h",
         "index_ops.h",
+        "shape_util.h",
     ],
     deps = [
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/tf2xla/lib:batch_dot",
+        "//tensorflow/compiler/tf2xla/lib:cholesky",
+        "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/ops:sendrecv_ops",
+        "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
@@ -89,8 +100,11 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/core:framework",
+        "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:linalg_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:stateless_random_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:concat_lib",
         "//tensorflow/core/kernels:constant_op",
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index 73ccc151c1d6bdf70105badd962903297f090abe..a015b8e0e8949f8aaa03a78b0f88b7ea8d6aaa1c 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -13,11 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// XLA-specific BatchMatMul Op.
-// The current implementation simply unrolls the computation along the batch
-// dimension.
-// TODO(dominikg,phawkins): Use a real batched matmul instead of unrolling.
-
+#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 
@@ -32,110 +28,10 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    const TensorShape x_shape = ctx->InputShape(0);
-    const TensorShape y_shape = ctx->InputShape(1);
-
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    OP_REQUIRES(ctx, x_shape.dims() == y_shape.dims(),
-                errors::InvalidArgument("In[0] and In[1] has different ndims: ",
-                                        x_shape.DebugString(), " vs. ",
-                                        y_shape.DebugString()));
-    const int ndims = x_shape.dims();
-    OP_REQUIRES(
-        ctx, ndims >= 2,
-        errors::InvalidArgument("In[0] and In[1] ndims must be >= 2: ", ndims));
-
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> dimensions;
-    int batch_count = 1;
-    for (int i = 0; i < ndims - 2; ++i) {
-      OP_REQUIRES(
-          ctx, x_shape.dim_size(i) == y_shape.dim_size(i),
-          errors::InvalidArgument("In[0].dim(", i, ") and In[1].dim(", i,
-                                  ") must be the same: ", x_shape.DebugString(),
-                                  " vs ", y_shape.DebugString()));
-      dimensions.push_back(x_shape.dim_size(i));
-      batch_count *= x_shape.dim_size(i);
-    }
-
-    int x_inner_dim = adj_x_ ? (ndims - 2) : (ndims - 1);
-    int y_inner_dim = adj_y_ ? (ndims - 1) : (ndims - 2);
-    OP_REQUIRES(
-        ctx, x_shape.dim_size(x_inner_dim) == y_shape.dim_size(y_inner_dim),
-        errors::InvalidArgument(
-            "In[0] mismatch In[1] shape: ", x_shape.dim_size(x_inner_dim),
-            " vs. ", y_shape.dim_size(y_inner_dim), ": ", x_shape.DebugString(),
-            " ", y_shape.DebugString(), " ", adj_x_, " ", adj_y_));
-
-    int x_outer_dim = adj_x_ ? (ndims - 1) : (ndims - 2);
-    int y_outer_dim = adj_y_ ? (ndims - 2) : (ndims - 1);
-    dimensions.push_back(x_shape.dim_size(x_outer_dim));
-    dimensions.push_back(y_shape.dim_size(y_outer_dim));
-
-    xla::ComputationBuilder* builder = ctx->builder();
-
-    xla::ComputationDataHandle x_handle = ctx->Input(0);
-    if (BaseType(input_type(0)) == DT_COMPLEX64 && adj_x_) {
-      x_handle = builder->Conj(x_handle);
-    }
-    xla::ComputationDataHandle y_handle = ctx->Input(1);
-    if (BaseType(input_type(1)) == DT_COMPLEX64 && adj_y_) {
-      y_handle = builder->Conj(y_handle);
-    }
-
-    // Reshape input tensors into 3D tensors by flattening the batch
-    // dimensions. This makes it easier to unroll the batch dimension.
-    auto x_flat =
-        builder->Reshape(x_handle, {batch_count, x_shape.dim_size(ndims - 2),
-                                    x_shape.dim_size(ndims - 1)});
-    auto y_flat =
-        builder->Reshape(y_handle, {batch_count, y_shape.dim_size(ndims - 2),
-                                    y_shape.dim_size(ndims - 1)});
-
-    // Slice batches into individual matrices and multiply them.
-    std::vector<xla::ComputationDataHandle> out_slices;
-    for (int i = 0; i < batch_count; ++i) {
-      // Slice off individual matrices and reshape to 2D tensors.
-      auto x_slice = builder->Slice(
-          x_flat, {i, 0, 0},
-          {i + 1, x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)},
-          {1, 1, 1});
-      x_slice = builder->Reshape(
-          x_slice, {x_shape.dim_size(ndims - 2), x_shape.dim_size(ndims - 1)});
-      auto y_slice = builder->Slice(
-          y_flat, {i, 0, 0},
-          {i + 1, y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)},
-          {1, 1, 1});
-      y_slice = builder->Reshape(
-          y_slice, {y_shape.dim_size(ndims - 2), y_shape.dim_size(ndims - 1)});
-
-      // Transpose if needed.
-      auto lhs = adj_x_ ? builder->Transpose(x_slice, {1, 0}) : x_slice;
-      auto rhs = adj_y_ ? builder->Transpose(y_slice, {1, 0}) : y_slice;
-
-      // Multiply matrices and add an outer singleton dimension to the output
-      // so we can concatenate along the flattened batch dimension later.
-      auto out = builder->Dot(lhs, rhs);
-      out = builder->Reshape(out,
-                             {1, dimensions[ndims - 2], dimensions[ndims - 1]});
-      out_slices.push_back(out);
-    }
-
-    // Concatenate output slices and reshape to original number of dimensions.
-    xla::ComputationDataHandle data;
-    if (out_slices.empty()) {
-      // It is illegal to pass an empty list to ConcatInDim.
-      // The batch count is empty, so both inputs must have zero elements.
-      // Arbitrarily use the left input as the argument to Reshape().
-      data = x_handle;
-    } else {
-      data = builder->ConcatInDim(out_slices, 0);
-    }
-    data = builder->Reshape(data, dimensions);
-
-    ctx->SetOutput(0, data);
+    auto result =
+        BatchDot(ctx->builder(), ctx->Input(0), ctx->Input(1), adj_x_, adj_y_);
+    OP_REQUIRES_OK(ctx, result.status());
+    ctx->SetOutput(0, result.ValueOrDie());
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 248e9d111e556dcdd75581aa6562a66fc8b57063..a249b1869f547f8e5aa725f9f5cf391b10429928 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 // XLA implementation of BatchNorm operations.
-#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -26,43 +26,63 @@ namespace {
 class FusedBatchNormOp : public XlaOpKernel {
  public:
   explicit FusedBatchNormOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    TensorFormat tensor_format;
-    if (ctx->GetAttr("data_format", &data_format).ok()) {
-      OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
-                  errors::InvalidArgument("Invalid data format"));
-      OP_REQUIRES(
-          ctx, (tensor_format == FORMAT_NHWC || tensor_format == FORMAT_NCHW),
-          errors::InvalidArgument("Not supported format"));
-      feature_index_ = GetTensorFeatureDimIndex(/*num_dims=*/4, tensor_format);
-    }
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(
+        ctx, FormatFromString(data_format_str, &data_format_),
+        errors::InvalidArgument("Invalid data format: ", data_format_str));
+    OP_REQUIRES(ctx,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument(
+                    "Unsupported data format ", ToString(data_format_),
+                    "; supported formats are NHWC and NCHW"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
+    xla::PrimitiveType input_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(ctx->input_type(0), &input_type));
+    xla::PrimitiveType scale_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(ctx->input_type(1), &scale_type));
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+    TensorShape input_shape = ctx->InputShape(0);
+
+    int feature_index =
+        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
+
+    // TODO(b/69928690): support mixed precision in the XLA batch normalization
+    // operators. As a workaround, cast everything to the statistics type (which
+    // may be more precise than the input type).
+    input = builder->ConvertElementType(input, scale_type);
+
     if (is_training_) {
-      xla::ComputationDataHandle output = ctx->builder()->BatchNormTraining(
-          ctx->Input(0), ctx->Input(1), ctx->Input(2), epsilon_,
-          feature_index_);
+      xla::ComputationDataHandle output = builder->BatchNormTraining(
+          input, ctx->Input(1), ctx->Input(2), epsilon_, feature_index);
 
       // In training mode, outputs the normalized value as well as the
       // calculated mean and variance.
-      for (int i = 0; i < 3; i++) {
-        ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i));
-      }
+      ctx->SetOutput(0, builder->ConvertElementType(
+                            builder->GetTupleElement(output, 0), input_type));
+      ctx->SetOutput(1, builder->GetTupleElement(output, 1));
+      ctx->SetOutput(2, builder->GetTupleElement(output, 2));
+
       // Output 3 and 4 for "FusedBatchNorm" are currently marked as "reserved
       // space 1 & 2". They are used to pass the per-batch mean and
       // variance to the gradient. Here we maintain the same behavior by setting
       // them to the mean and variance calculated by BatchNormTraining.
-      ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1));
-      ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2));
+      ctx->SetOutput(3, builder->GetTupleElement(output, 1));
+      ctx->SetOutput(4, builder->GetTupleElement(output, 2));
     } else {
-      xla::ComputationDataHandle output = ctx->builder()->BatchNormInference(
-          ctx->Input(0), ctx->Input(1), ctx->Input(2), ctx->Input(3),
-          ctx->Input(4), epsilon_, feature_index_);
-      ctx->SetOutput(0, output);
+      xla::ComputationDataHandle output = builder->BatchNormInference(
+          input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4),
+          epsilon_, feature_index);
+      ctx->SetOutput(0, builder->ConvertElementType(output, input_type));
       // Directly send input to output as mean and variance in inference mode.
       ctx->SetOutput(1, ctx->Input(3));
       ctx->SetOutput(2, ctx->Input(4));
@@ -73,55 +93,113 @@ class FusedBatchNormOp : public XlaOpKernel {
 
  private:
   float epsilon_;
-  int64 feature_index_;
+  TensorFormat data_format_;
   bool is_training_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp);
+REGISTER_XLA_OP(Name("FusedBatchNormV2"), FusedBatchNormOp);
 
 class FusedBatchNormGradOp : public XlaOpKernel {
  public:
   explicit FusedBatchNormGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    string data_format;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    bool is_training;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training));
-    CHECK(is_training) << "FusedBatchNormGradOp with is_training=False cannot "
-                          "be used with XLA for now!";
-    TensorFormat tensor_format;
-    if (ctx->GetAttr("data_format", &data_format).ok()) {
-      OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
-                  errors::InvalidArgument("Invalid data format"));
-      OP_REQUIRES(
-          ctx, (tensor_format == FORMAT_NHWC || tensor_format == FORMAT_NCHW),
-          errors::InvalidArgument("Not supported format"));
-      feature_index_ = GetTensorFeatureDimIndex(4, tensor_format);
-    }
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training_));
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(
+        ctx, FormatFromString(data_format_str, &data_format_),
+        errors::InvalidArgument("Invalid data format: ", data_format_str));
+    OP_REQUIRES(ctx,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument(
+                    "Unsupported data format ", ToString(data_format_),
+                    "; supported formats are NHWC and NCHW"));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto grad_output = ctx->Input(0);
-    auto activation = ctx->Input(1);
+    xla::ComputationBuilder* b = ctx->builder();
+
+    auto grad_backprop = ctx->Input(0);
+    auto activations = ctx->Input(1);
     auto scale = ctx->Input(2);
     auto mean = ctx->Input(3);
     auto var = ctx->Input(4);
-    xla::ComputationDataHandle output = ctx->builder()->BatchNormGrad(
-        activation, scale, mean, var, grad_output, epsilon_, feature_index_);
 
-    for (int i = 0; i < 3; i++) {
-      ctx->SetOutput(i, ctx->builder()->GetTupleElement(output, i));
+    TensorShape input_shape = ctx->InputShape(0);
+    int feature_index =
+        GetTensorFeatureDimIndex(input_shape.dims(), data_format_);
+
+    DataType input_dtype = ctx->input_type(0);
+    DataType scale_dtype = ctx->input_type(2);
+    xla::PrimitiveType input_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_dtype, &input_type));
+    xla::PrimitiveType scale_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(scale_dtype, &scale_type));
+
+    // TODO(b/69928690): support mixed precision in the XLA batch normalization
+    // operators. For now, cast everything to the statistics type (which
+    // may be more precise than the input type).
+    grad_backprop = b->ConvertElementType(grad_backprop, scale_type);
+    activations = b->ConvertElementType(activations, scale_type);
+
+    xla::ComputationDataHandle x_backprop;
+    xla::ComputationDataHandle scale_backprop;
+    xla::ComputationDataHandle offset_backprop;
+    if (is_training_) {
+      xla::ComputationDataHandle output =
+          b->BatchNormGrad(activations, scale, mean, var, grad_backprop,
+                           epsilon_, feature_index);
+
+      x_backprop = b->GetTupleElement(output, 0);
+      scale_backprop = b->GetTupleElement(output, 1);
+      offset_backprop = b->GetTupleElement(output, 2);
+    } else {
+      // Reduce over all dimensions except the feature dim.
+      std::vector<int64> reduction_dims(input_shape.dims() - 1);
+      std::iota(reduction_dims.begin(), reduction_dims.begin() + feature_index,
+                0);
+      std::iota(reduction_dims.begin() + feature_index, reduction_dims.end(),
+                feature_index + 1);
+      // offset_backprop  = sum(y_backprop)
+      // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var +
+      // epsilon))
+      // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
+      offset_backprop =
+          b->Reduce(grad_backprop, XlaHelpers::Zero(b, scale_dtype),
+                    *ctx->GetOrCreateAdd(scale_dtype), reduction_dims);
+
+      // scratch1 = rsqrt(pop_var + epsilon)
+      auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
+      auto scratch1 =
+          b->Pow(b->Add(var, b->ConstantR0<float>(epsilon_)), neg_half);
+
+      // scratch2 = sum(y_backprop * (x - mean))
+      auto scratch2 = b->Reduce(
+          b->Mul(grad_backprop, b->Sub(activations, mean, {feature_index})),
+          XlaHelpers::Zero(b, scale_dtype), *ctx->GetOrCreateAdd(scale_dtype),
+          reduction_dims);
+
+      x_backprop =
+          b->Mul(grad_backprop, b->Mul(scratch1, scale), {feature_index});
+      scale_backprop = b->Mul(scratch1, scratch2);
     }
-    ctx->SetOutput(3, ctx->builder()->GetTupleElement(output, 1));
-    ctx->SetOutput(4, ctx->builder()->GetTupleElement(output, 2));
+
+    ctx->SetOutput(0, b->ConvertElementType(x_backprop, input_type));
+    ctx->SetOutput(1, scale_backprop);
+    ctx->SetOutput(2, offset_backprop);
+    ctx->SetConstantOutput(3, Tensor(scale_dtype, {}));
+    ctx->SetConstantOutput(4, Tensor(scale_dtype, {}));
   }
 
  private:
+  TensorFormat data_format_;
   float epsilon_;
-  int64 feature_index_;
+  bool is_training_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp);
+REGISTER_XLA_OP(Name("FusedBatchNormGradV2"), FusedBatchNormGradOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 1de91924326464338352b1ac9edf77141f25ad35..2436a6074a11ad66387b232dd1c5aa135875bfc3 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
 
 namespace tensorflow {
 namespace {
@@ -75,7 +76,7 @@ static xla::ComputationDataHandle FloorDivImpl(xla::ComputationBuilder* b,
   auto abs_y = b->Abs(y);
   auto t = b->Neg(b->Sub(b->Add(abs_x, abs_y), one));
   auto result = b->Select(different_sign, b->Div(t, abs_y), b->Div(x, y));
-  if (dtype == DT_FLOAT || dtype == DT_DOUBLE) {
+  if (DataTypeIsFloating(dtype)) {
     result = b->Floor(result);
   }
   return result;
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..592f3ecc3ce2abf33ddffe8b0e59c4e12e73e956
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -0,0 +1,98 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA implementations of Categorical op.
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class CategoricalOp : public XlaOpKernel {
+ public:
+  explicit CategoricalOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // Get the logits
+    const xla::ComputationDataHandle& logits = ctx->Input(0);
+    TensorShape logits_shape = ctx->InputShape(0);
+    int64 num_samples;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_samples));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(logits_shape),
+                errors::InvalidArgument("logits should be a matrix, got shape ",
+                                        logits_shape.DebugString()));
+    OP_REQUIRES(ctx, num_samples >= 0,
+                errors::InvalidArgument(
+                    "num_samples should be nonnegative, got ", num_samples));
+
+    for (int i = 0; i < 2; i++) {
+      const int64 dim = logits_shape.dim_size(i);
+      OP_REQUIRES(
+          ctx, static_cast<int>(dim) == dim,
+          errors::InvalidArgument("logits.shape = ", logits_shape.DebugString(),
+                                  " too large for int"));
+    }
+
+    const int64 batch_size = logits_shape.dim_size(0);
+    const int64 num_classes = logits_shape.dim_size(1);
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    std::array<int64, 3> uniform_shape_array = {
+        {batch_size, num_samples, num_classes}};
+    xla::PrimitiveType uniform_xla_type;
+    OP_REQUIRES_OK(ctx,
+                   DataTypeToPrimitiveType(input_type(0), &uniform_xla_type));
+    xla::Shape uniform_shape =
+        xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array);
+    auto uniforms = builder->RngUniform(
+        XlaHelpers::Zero(builder, input_type(0)),
+        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+
+    // Use Gumbel softmax trick to generate categorical samples.
+    // See:
+    // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/
+    // TODO(b/68769470): Switch to using a cumulative sum approach.
+    auto softmax_entries =
+        builder->Sub(logits, builder->Log(builder->Neg(builder->Log(uniforms))),
+                     /*broadcast_dimensions=*/{0, 2});
+
+    TensorShape softmax_shape(uniform_shape_array);
+    xla::ComputationDataHandle argmax;
+    OP_REQUIRES_OK(
+        ctx,
+        XlaHelpers::ArgMax(builder, ctx, softmax_entries, softmax_shape,
+                           input_type(0), output_type(0), /*axis=*/2, &argmax));
+
+    ctx->SetOutput(0, argmax);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp);
+};
+
+// TODO(b/68769717): Rename this sampler to Categorical.
+REGISTER_XLA_OP(Name("Multinomial"), CategoricalOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_namespace_compat.h b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
similarity index 50%
rename from tensorflow/core/distributed_runtime/rpc/grpc_namespace_compat.h
rename to tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index c178927f5d5411e30bee2470b8b544ff76c28396..87d858f763560be454c162e0cf40307c68217663 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_namespace_compat.h
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -13,20 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_NAMESPACE_COMPAT_H_
-#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_NAMESPACE_COMPAT_H_
-
-// This file is a transitional place-holder until gRPC versions consistently
-// use namespace grpc::internal for library-internal structures
-
-namespace grpc {
-// ensure internal namespace exists
-namespace internal {
-// bring in contents of external namespace
-using namespace ::grpc;
-}  // namespace internal
-// bring in contents of internal namespace
-using namespace internal;
-}  // namespace grpc
-
-#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_NAMESPACE_COMPAT_H_
+#include "tensorflow/compiler/tf2xla/lib/cholesky.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+class CholeskyOp : public XlaOpKernel {
+ public:
+  explicit CholeskyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result = Cholesky(ctx->builder(), ctx->Input(0));
+    if (!result.ok()) {
+      ctx->SetStatus(result.status());
+      return;
+    }
+    ctx->SetOutput(0, result.ValueOrDie());
+  }
+};
+
+REGISTER_XLA_OP(Name("Cholesky"), CholeskyOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index 9833323d851e00e7ca76d0b39cd2b216748a17fa..8f78b4c8f90cf00d5fa9ba71a78bb1c0fe280dc6 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -40,6 +40,11 @@ class ConstOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     TensorShape shape(proto_.tensor_shape());
 
+    if (proto_.dtype() == DT_STRING) {
+      LOG(WARNING) << "Not computing Const of type DT_STRING";
+      ctx->SetInvalidOutput(0);
+      return;
+    }
     xla::ComputationBuilder* b = ctx->builder();
 
     // To avoid blowups for large constants filled with the same value,
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 885f716afafca7ba23770e38f6693eed1ba50982..aaddbe811c6fbf6da296640eb5a75e82b2fedcfa 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -46,72 +46,130 @@ TensorShape ExpandedFilterShapeForDepthwiseConvolution(
   return expanded_shape;
 }
 
+// Broadcast zeros to ExpandedFilterShapeForDepthwiseConvolution.
+xla::ComputationDataHandle CreateExpandedZero(
+    const TensorShape& filter_shape, DataType dtype,
+    xla::ComputationBuilder* builder) {
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  return builder->Broadcast(XlaHelpers::Zero(builder, dtype),
+                            expanded_filter_shape.dim_sizes());
+}
+
+// Create a mask for depthwise convolution that will make a normal convolution
+// produce the same results as a depthwise convolution. For a [2, 2, 3, 2]
+// depthwise filter this returns a [2, 2, 3, 6] tesnsor
+//   1 1 0 0 0 0   1 1 0 0 0 0
+//   0 0 1 1 0 0   0 0 1 1 0 0
+//   0 0 0 0 1 1   0 0 0 0 1 1
+//
+//   1 1 0 0 0 0   1 1 0 0 0 0
+//   0 0 1 1 0 0   0 0 1 1 0 0
+//   0 0 0 0 1 1   0 0 0 0 1 1
+//
+// The first step is to create a one tensor, A, that is [3]
+//   0 1 2
+//
+// and another tensor, B,  that is [3 * 2]
+//   0 1 2 3 4 5
+//
+// and divide B it by 2 to get
+//   0 0 1 1 2 2
+//
+// then we broadcast the B to [2, 2, 3, 3 * 2]
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 0 1 1 2 2   0 0 1 1 2 2
+//
+// Finally compare A and broadcasted B in dimension 2 amd return the result at
+// the beginning of the comment.
+xla::ComputationDataHandle CreateExpandedFilterMask(
+    const TensorShape& filter_shape, xla::ComputationBuilder* builder) {
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
+  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
+
+  // Create a M sized linspace and an M*N sized linspace that will be
+  // broadcasted into perpendicular dimensions and compared.
+  xla::ComputationDataHandle input_feature_iota;
+  // DT_INT32 Iota will always return status::OK().
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, input_feature,
+                               &input_feature_iota));
+  xla::ComputationDataHandle expanded_feature_iota;
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32,
+                               input_feature * depthwise_multiplier,
+                               &expanded_feature_iota));
+
+  // Divide the M*N sized linspace by the depthwise_multiplier to create
+  // [0 0 1 1 2 2] in the example in the function comment.
+  expanded_feature_iota =
+      builder->Div(expanded_feature_iota,
+                   XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32,
+                                              depthwise_multiplier));
+
+  // Broadcast the N*M linspace to [H, W, ..., M, M*N].
+  auto expanded_feature_broadcast_dims = expanded_filter_shape.dim_sizes();
+  expanded_feature_broadcast_dims.pop_back();
+  auto broadcasted_expanded_feature_iota = builder->Broadcast(
+      expanded_feature_iota, expanded_feature_broadcast_dims);
+
+  // Compare the broadcasted linspace to the input feature linspace in the
+  // input feature dimension to create a diagonal predicate.
+  return builder->Eq(broadcasted_expanded_feature_iota, input_feature_iota,
+                     {expanded_filter_shape.dims() - 2});
+}
+
 // Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding
 // zeros for the cross-depth filters. Used to build a depthwise convolution.
 xla::ComputationDataHandle ExpandFilterForDepthwiseConvolution(
     const TensorShape& filter_shape, DataType dtype,
     const xla::ComputationDataHandle& filter,
     xla::ComputationBuilder* builder) {
-  // Filter has shape [H, W, ..., M, N]
-  // Dilate to [H, W, ..., M*M, N] using M inter-element padding, and then
-  // reshape to [H, W, ..., M, M*N].
-  int num_spatial_dims = filter_shape.dims() - 2;
-  const int64 in_depth = filter_shape.dim_size(num_spatial_dims);
-  xla::PaddingConfig padding = xla::MakeNoPaddingConfig(filter_shape.dims());
-  padding.mutable_dimensions(num_spatial_dims)->set_interior_padding(in_depth);
-  auto dilated_filter =
-      builder->Pad(filter, XlaHelpers::Zero(builder, dtype), padding);
-
+  int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1);
+  int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2);
   TensorShape expanded_filter_shape =
       ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
-  return builder->Reshape(dilated_filter, expanded_filter_shape.dim_sizes());
+
+  // Create a [H, W, ..., 1, N*M] reshape of the filter.
+  TensorShape implicit_broadcast_filter_shape = expanded_filter_shape;
+  implicit_broadcast_filter_shape.set_dim(
+      implicit_broadcast_filter_shape.dims() - 2, 1);
+  implicit_broadcast_filter_shape.set_dim(
+      implicit_broadcast_filter_shape.dims() - 1,
+      depthwise_multiplier * input_feature);
+  auto implicit_broadcast_filter =
+      builder->Reshape(filter, implicit_broadcast_filter_shape.dim_sizes());
+
+  // Broadcast the filter to  [H, W, ..., M, M*N].
+  auto expanded_zero = CreateExpandedZero(filter_shape, dtype, builder);
+  auto expanded_filter = builder->Add(implicit_broadcast_filter, expanded_zero);
+
+  // If the filter mask is set, choose the broadcasted filter, othwerwise,
+  // choose zero.
+  return builder->Select(CreateExpandedFilterMask(filter_shape, builder),
+                         expanded_filter, expanded_zero);
 }
 
 // Inverse of ExpandFilterForDepthwiseConvolution.
 xla::ComputationDataHandle ContractFilterForDepthwiseBackprop(
-    const TensorShape& filter_shape, DataType dtype,
+    XlaOpKernelContext* ctx, const TensorShape& filter_shape, DataType dtype,
     const xla::ComputationDataHandle& filter_backprop,
     xla::ComputationBuilder* builder) {
-  int num_spatial_dims = filter_shape.dims() - 2;
-
-  // Reshape to [H, W, ..., M*M, N]
-  TensorShape shape = filter_shape;
-  int64 in_depth = filter_shape.dim_size(num_spatial_dims);
-  shape.set_dim(num_spatial_dims, in_depth * in_depth);
-  auto reshaped = builder->Reshape(filter_backprop, shape.dim_sizes());
-
-  std::vector<int64> zeros(filter_shape.dims());
-  std::vector<int64> strides(filter_shape.dims(), 1LL);
-  strides[num_spatial_dims] = in_depth + 1;
-  return builder->Slice(reshaped, zeros, shape.dim_sizes(), strides);
-
-  // Alternate implementation for backends without strided Slice() support.
-  // TODO(phawkins): Remove when all backends support strided slice.
-  //   // Pad [..., M * (M + 1), N]
-  //   xla::PaddingConfig config =
-  //   xla::MakeNoPaddingConfig(filter_shape.dims());
-  //   config.mutable_dimensions(num_spatial_dims)
-  //     ->set_edge_padding_high(in_depth);
-  //   auto zero = XlaHelpers::Zero(builder, dtype);
-  //   auto padded = builder->Pad(reshaped, zero, config);
-  //
-  //   // Reshape to [..., M, M + 1, N]
-  //   shape = filter_shape;
-  //   shape.set_dim(num_spatial_dims, in_depth);
-  //   shape.set_dim(num_spatial_dims + 1, in_depth + 1);
-  //   int64 out_depth = filter_shape.dim_size(num_spatial_dims + 1);
-  //   shape.AddDim(out_depth);
-  //   reshaped = builder->Reshape(padded, shape.dim_sizes());
-  //
-  //   // Slice to [..., M, 1, N]
-  //   std::vector<int64> zeros(shape.dims());
-  //   std::vector<int64> strides(shape.dims(), 1LL);
-  //   shape.set_dim(num_spatial_dims + 1, 1);
-  //   auto sliced = builder->Slice(reshaped, zeros, shape.dim_sizes(),
-  //   strides);
-  //
-  //   // Reshape to [..., M, N]
-  //   return builder->Reshape(sliced, filter_shape.dim_sizes());
+  TensorShape expanded_filter_shape =
+      ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
+  auto masked_expanded_filter = builder->Select(
+      CreateExpandedFilterMask(filter_shape, builder), filter_backprop,
+      CreateExpandedZero(filter_shape, dtype, builder));
+  return builder->Reshape(
+      builder->Reduce(masked_expanded_filter, XlaHelpers::Zero(builder, dtype),
+                      *ctx->GetOrCreateAdd(dtype),
+                      {expanded_filter_shape.dims() - 2}),
+      filter_shape.dim_sizes());
 }
 
 class ConvOp : public XlaOpKernel {
@@ -121,6 +179,7 @@ class ConvOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
 
@@ -144,6 +203,23 @@ class ConvOp : public XlaOpKernel {
         errors::Unimplemented("Current implementation does not yet support "
                               "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(
+          ctx, dilations_[input_dim] == 1,
+          errors::Unimplemented("Current implementation does not yet support "
+                                "dilations in the ",
+                                i, "th spatial dimension."));
+    }
+
     const TensorShape input_shape = ctx->InputShape(0);
     // Input filter is of the following dimensions:
     // [ filter_rows, filter_cols, ..., in_depth, out_depth]
@@ -184,10 +260,11 @@ class ConvOp : public XlaOpKernel {
     dims.set_input_feature_dimension(feature_dim);
     dims.set_output_feature_dimension(feature_dim);
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
-      dims.add_spatial_dimensions(input_dim);
+      const int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      dims.add_input_spatial_dimensions(dim);
       dims.add_kernel_spatial_dimensions(i);
-      window_strides.push_back(strides_.at(input_dim));
+      dims.add_output_spatial_dimensions(dim);
+      window_strides.push_back(strides_.at(dim));
     }
     dims.set_kernel_input_feature_dimension(num_spatial_dims_);
     dims.set_kernel_output_feature_dimension(num_spatial_dims_ + 1);
@@ -203,6 +280,7 @@ class ConvOp : public XlaOpKernel {
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -240,6 +318,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
     string data_format;
@@ -262,6 +341,23 @@ class ConvBackpropInputOp : public XlaOpKernel {
         errors::Unimplemented("Current implementation does not yet support "
                               "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[batch_dim] == 1 && dilations_[feature_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(
+          ctx, dilations_[input_dim] == 1,
+          errors::Unimplemented("Current implementation does not yet support "
+                                "dilations in the ",
+                                i, "th spatial dimension."));
+    }
+
     TensorShape input_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &input_shape));
 
@@ -302,9 +398,10 @@ class ConvBackpropInputOp : public XlaOpKernel {
     std::vector<int64> lhs_dilation(num_spatial_dims_);
     std::vector<int64> ones(num_spatial_dims_, 1);
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      dnums.add_spatial_dimensions(
-          GetTensorSpatialDimIndex(num_dims(), data_format_, i));
+      int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      dnums.add_input_spatial_dimensions(dim);
       dnums.add_kernel_spatial_dimensions(i);
+      dnums.add_output_spatial_dimensions(dim);
 
       kernel_spatial_dims[i] = i;
       padding[i] = {dims.spatial_dims[i].pad_before,
@@ -334,6 +431,7 @@ class ConvBackpropInputOp : public XlaOpKernel {
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
@@ -371,6 +469,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
       : XlaOpKernel(ctx),
         num_spatial_dims_(num_spatial_dims),
         depthwise_(depthwise) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("strides", &strides_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("padding", &padding_));
     string data_format;
@@ -390,6 +489,23 @@ class ConvBackpropFilterOp : public XlaOpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
 
+    OP_REQUIRES(ctx, dilations_.size() == num_dims(),
+                errors::InvalidArgument("Dilations field must "
+                                        "specify ",
+                                        num_dims(), " dimensions"));
+    OP_REQUIRES(
+        ctx, dilations_[n_dim] == 1 && dilations_[c_dim] == 1,
+        errors::Unimplemented("Current implementation does not yet support "
+                              "dilations in the batch and depth dimensions."));
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int input_dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      OP_REQUIRES(
+          ctx, dilations_[input_dim] == 1,
+          errors::Unimplemented("Current implementation does not yet support "
+                                "dilations in the ",
+                                i, "th spatial dimension."));
+    }
+
     const TensorShape activations_shape = ctx->InputShape(0);
     TensorShape filter_shape;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &filter_shape));
@@ -424,9 +540,7 @@ class ConvBackpropFilterOp : public XlaOpKernel {
 
     // Swap n_dim and c_dim in the activations.
     dnums.set_input_batch_dimension(c_dim);
-    dnums.set_output_batch_dimension(c_dim);
     dnums.set_input_feature_dimension(n_dim);
-    dnums.set_output_feature_dimension(n_dim);
 
     // The gradients become the RHS of the convolution.
     // The gradients have shape [batch, out_rows, out_cols, ..., out_depth]
@@ -438,9 +552,16 @@ class ConvBackpropFilterOp : public XlaOpKernel {
     std::vector<int64> rhs_dilation(num_spatial_dims_);
     std::vector<int64> ones(num_spatial_dims_, 1);
 
+    // Tensorflow filter shape is [ H, W, ..., inC, outC ].
     for (int i = 0; i < num_spatial_dims_; ++i) {
-      int dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
-      dnums.add_spatial_dimensions(dim);
+      dnums.add_output_spatial_dimensions(i);
+    }
+    dnums.set_output_batch_dimension(num_spatial_dims_);
+    dnums.set_output_feature_dimension(num_spatial_dims_ + 1);
+
+    for (int i = 0; i < num_spatial_dims_; ++i) {
+      int64 dim = GetTensorSpatialDimIndex(num_dims(), data_format_, i);
+      dnums.add_input_spatial_dimensions(dim);
       dnums.add_kernel_spatial_dimensions(dim);
 
       // We will also need to pad the input with zeros such that after the
@@ -498,31 +619,17 @@ class ConvBackpropFilterOp : public XlaOpKernel {
                               /*window_strides=*/ones, padding,
                               /*lhs_dilation=*/ones, rhs_dilation, dnums);
 
-    // The layout of filter_backprop will match the layout of
-    // padded_activations
-    // and so will have layout: [out_feature, h, w, ..., in_feature]
-    // Tensorflow filter shape is [ H, W, ..., inC, outC ], so we transpose the
-    // output.
-    std::vector<int64> transpose_dims;
-    transpose_dims.reserve(num_dims());
-    for (int i = 0; i < num_spatial_dims_; ++i) {
-      transpose_dims.push_back(dnums.spatial_dimensions(i));
-    }
-    transpose_dims.push_back(c_dim);
-    transpose_dims.push_back(n_dim);
-    xla::ComputationDataHandle filter_backprop_reshaped =
-        b->Transpose(filter_backprop, transpose_dims);
-
     if (depthwise_) {
-      filter_backprop_reshaped = ContractFilterForDepthwiseBackprop(
-          filter_shape, ctx->input_type(0), filter_backprop_reshaped, b);
+      filter_backprop = ContractFilterForDepthwiseBackprop(
+          ctx, filter_shape, ctx->input_type(0), filter_backprop, b);
     }
-    ctx->SetOutput(0, filter_backprop_reshaped);
+    ctx->SetOutput(0, filter_backprop);
   }
 
  protected:
   const int num_spatial_dims_;
   const bool depthwise_;
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_ = FORMAT_NHWC;
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index a4ea65ea89e348cb77412efb0c5c0fcb1a9f33f3..96d7809f7995634b6bc31ab801b93526d9da7e6f 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 namespace {
@@ -23,6 +24,16 @@ namespace {
 class DepthToSpaceOp : public XlaOpKernel {
  public:
   explicit DepthToSpaceOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_),
+                                        "; expected formats NHWC or NCHW"));
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -31,18 +42,79 @@ class DepthToSpaceOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_tensor_shape = ctx->InputShape(0);
-    // The input is presumed to be [batch, height, width, depth]
     int input_rank = input_tensor_shape.dims();
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
-                errors::InvalidArgument("Input rank should be: ", kRequiredDims,
-                                        " instead of: ", input_rank));
+                errors::InvalidArgument("Input rank should be ", kRequiredDims,
+                                        "; got: ", input_rank));
     const gtl::InlinedVector<int64, 4> input_shape =
         input_tensor_shape.dim_sizes();
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle input = ctx->Input(0);
 
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+
+    std::vector<int64> reshaped_shape;
+    std::vector<int64> transpose_order;
+    std::vector<int64> output_shape;
+    reshaped_shape.reserve(input_rank);
+    transpose_order.reserve(input_rank);
+    output_shape.reserve(input_rank);
+    if (data_format_ == FORMAT_NHWC) {
+      reshaped_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[1 + i]);
+      }
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(block_size_);
+        block_elems *= block_size_;
+      }
+      reshaped_shape.push_back(input_shape[feature_dim] / block_elems);
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i + 1);
+        transpose_order.push_back(i + 1 + num_spatial_dims);
+      }
+      transpose_order.push_back(feature_dim + num_spatial_dims);
+
+      output_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[1 + i] * block_size_);
+      }
+      output_shape.push_back(input_shape[feature_dim] / block_elems);
+    } else {
+      // NCHW format.
+      reshaped_shape.push_back(input_shape[0]);
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(block_size_);
+        block_elems *= block_size_;
+      }
+      reshaped_shape.push_back(input_shape[feature_dim] / block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[2 + i]);
+      }
+
+      transpose_order.push_back(0);
+      transpose_order.push_back(1 + num_spatial_dims);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(2 + num_spatial_dims + i);
+        transpose_order.push_back(1 + i);
+      }
+
+      output_shape.push_back(input_shape[0]);
+      output_shape.push_back(input_shape[feature_dim] / block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[2 + i] * block_size_);
+      }
+    }
+
+    // Note: comments are given in NHWC format; NCHW is similar with a different
+    // dimension order.
     // 1. Reshape `input` to `reshaped` of shape:
     //
     //      [batch,
@@ -51,14 +123,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       block_size_,
     //       block_size_,
     //       depth / (block_size_ * block_size_)]
-    OP_REQUIRES(ctx, input_shape[3] % (block_size_ * block_size_) == 0,
+    OP_REQUIRES(ctx,
+                input_shape[feature_dim] % (block_size_ * block_size_) == 0,
                 errors::InvalidArgument(
                     "Input depth dimension (", input_shape[3],
                     ") is not divisible by square of the block size (",
                     block_size_, ")"));
-    xla::ComputationDataHandle reshaped = b->Reshape(
-        input, {input_shape[0], input_shape[1], input_shape[2], block_size_,
-                block_size_, input_shape[3] / (block_size_ * block_size_)});
+
+    xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
 
     // 2. Permute dimensions of `reshaped` to produce
     //    `permuted_reshaped` of shape:
@@ -70,7 +142,7 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       block_size_,
     //       depth / (block_size_ * block_size_)]
     xla::ComputationDataHandle permuted_reshaped =
-        b->Transpose(reshaped, {0, 1, 3, 2, 4, 5});
+        b->Transpose(reshaped, transpose_order);
 
     // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the
     //    batch dimension, producing an output tensor of shape:
@@ -80,15 +152,14 @@ class DepthToSpaceOp : public XlaOpKernel {
     //       input_shape[2] * block_size_,
     //       depth / (block_size_ * block_size_)]
     //
-    xla::ComputationDataHandle output = b->Reshape(
-        permuted_reshaped, {input_shape[0], input_shape[1] * block_size_,
-                            input_shape[2] * block_size_,
-                            input_shape[3] / (block_size_ * block_size_)});
+    xla::ComputationDataHandle output =
+        b->Reshape(permuted_reshaped, output_shape);
 
     ctx->SetOutput(0, output);
   }
 
  private:
+  TensorFormat data_format_;
   int block_size_;
 };
 REGISTER_XLA_OP(Name("DepthToSpace"), DepthToSpaceOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index ec5017f6ab96bd3fc273a746b77fbb7e74fd9f35..765ea922a532a085a552192348ab360c4c30ff0a 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -22,6 +24,62 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Create a diagonal / batch diagonal matrix with 'input' on the diagonal.
+xla::StatusOr<xla::ComputationDataHandle> CreateDiagonal(
+    const xla::ComputationDataHandle& input, int64 last_dim_size,
+    tensorflow::gtl::ArraySlice<int64> other_dims, XlaOpKernelContext* ctx,
+    xla::ComputationBuilder* builder) {
+  // Create two matrices that have the following forms, and compare them:
+  //
+  // [[0, 0, 0, 0]            [[0, 1, 2, 3]
+  //  [1, 1, 1, 1]             [0, 1, 2, 3]
+  //  [2, 2, 2, 2]             [0, 1, 2, 3]
+  //  [3, 3, 3, 3]]            [0, 1, 2, 3]]
+  //
+  // This produces a predicate matrix of the right size, with "true" on the
+  // diagonal.
+  xla::ComputationDataHandle iota;
+  TF_RETURN_IF_ERROR(
+      XlaHelpers::Iota(builder, DataType::DT_INT32, last_dim_size, &iota));
+  xla::ComputationDataHandle iota_broadcast =
+      builder->Broadcast(iota, {last_dim_size});
+  xla::ComputationDataHandle mask = builder->Eq(iota_broadcast, iota, {0});
+
+  // If this is a batched diagonal, broadcast the mask across the other
+  // dimensions.
+  if (!other_dims.empty()) {
+    mask = builder->Broadcast(mask, other_dims);
+  }
+
+  // Broadcast the input, and then use the mask computed above to select the
+  // diagonal:
+  // e.g, in 2D:
+  //         [[t, f, f]    [[1, 1, 1]    [[0, 0, 0]      [[1, 0, 0]
+  // select(  [f, t, f]  ,  [4, 4, 4]  ,  [0, 0, 0]  ) =  [0, 4, 0]
+  //          [f, f, t]]    [9, 9, 9]]    [0, 0, 0]]      [0, 0, 9]]
+  //
+  // Broadcasting the input is less-than-trivial, since we need to broadcast
+  // into a "middle" dimension. We can do this with a reshape + implicit
+  // broadcast.
+  // TODO(b/30112114): Replace with in-dim broadcast when those are supported.
+  std::vector<int64> broadcast_dims(other_dims.begin(), other_dims.end());
+  broadcast_dims.push_back(1LL);
+  broadcast_dims.push_back(last_dim_size);
+  xla::ComputationDataHandle input_broadcast =
+      builder->Reshape(input, broadcast_dims);
+
+  broadcast_dims[broadcast_dims.size() - 2] = last_dim_size;
+  xla::PrimitiveType element_type;
+  TF_RETURN_IF_ERROR(
+      DataTypeToPrimitiveType(ctx->input_type(0), &element_type));
+  auto broadcast_shape =
+      xla::ShapeUtil::MakeShape(element_type, broadcast_dims);
+  xla::ComputationDataHandle zeros = Zeros(builder, broadcast_shape);
+
+  input_broadcast = builder->Add(input_broadcast, zeros);
+  return builder->Select(mask, input_broadcast, zeros);
+}
+
 class DiagOp : public XlaOpKernel {
  public:
   explicit DiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
@@ -29,6 +87,8 @@ class DiagOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
+                errors::InvalidArgument("Diag op must have at an input"));
     const TensorShape input_shape = ctx->InputShape(0);
 
     auto dims = input_shape.dim_sizes();
@@ -36,7 +96,7 @@ class DiagOp : public XlaOpKernel {
                 errors::InvalidArgument("Expected 1 <= dims, got shape ",
                                         input_shape.DebugString()));
 
-    xla::ComputationDataHandle diag = ctx->Input(0);
+    xla::ComputationDataHandle input = ctx->Input(0);
 
     // Picture:
     // tf.diag([1, 2, 3, 4]) ==> [[1, 0, 0, 0]
@@ -46,13 +106,13 @@ class DiagOp : public XlaOpKernel {
 
     // Flattens the input to 1D.
     int64 size = input_shape.num_elements();
-    diag = builder->Reshape(diag, {size});
+    input = builder->Reshape(input, {size});
 
-    // Adds inter-element padding of 'size'.
-    xla::PaddingConfig config;
-    auto* dim = config.add_dimensions();
-    dim->set_interior_padding(size);
-    diag = builder->Pad(diag, XlaHelpers::Zero(builder, input_type(0)), config);
+    // Create an R2 with the R1 diagonal.
+    auto diag_or_status =
+        CreateDiagonal(input, size, /*other_dims=*/{}, ctx, builder);
+    OP_REQUIRES_OK(ctx, diag_or_status.status());
+    xla::ComputationDataHandle diag = diag_or_status.ValueOrDie();
 
     // Reshapes to the final shape.
     std::vector<int64> new_dims(dims.size() * 2);
@@ -141,6 +201,8 @@ class MatrixDiagOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::ComputationBuilder* builder = ctx->builder();
 
+    OP_REQUIRES(ctx, ctx->num_inputs() >= 1,
+                errors::InvalidArgument("MatrixDiag op must have at an input"));
     const TensorShape input_shape = ctx->InputShape(0);
 
     auto dims = input_shape.dim_sizes();
@@ -152,17 +214,13 @@ class MatrixDiagOp : public XlaOpKernel {
 
     int last_dim = dims.size() - 1;
     int64 last_dim_size = input_shape.dim_size(last_dim);
+    tensorflow::gtl::ArraySlice<int64> other_dims(dims);
+    other_dims.pop_back();
 
-    // Adds inter-element padding of 'last_dim_size' to the last dimension.
-    xla::PaddingConfig config = xla::MakeNoPaddingConfig(dims.size());
-    auto* dim = config.mutable_dimensions(last_dim);
-    dim->set_interior_padding(last_dim_size);
-    diag = builder->Pad(diag, XlaHelpers::Zero(builder, input_type(0)), config);
-
-    // Reshapes to the final shape.
-    dims.push_back(last_dim_size);
-    diag = builder->Reshape(diag, dims);
-
+    auto diag_or_status =
+        CreateDiagonal(diag, last_dim_size, other_dims, ctx, builder);
+    OP_REQUIRES_OK(ctx, diag_or_status.status());
+    diag = diag_or_status.ValueOrDie();
     ctx->SetOutput(0, diag);
   }
 };
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d91ebb500b4479dbb3c8e2ea7719bc79dc24ba4f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -0,0 +1,367 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace tensorflow {
+namespace {
+
+// We implement bilinear interpolation by upsampling followed by convolution.
+// The basic idea is as follows. To scale from NxN to RxR:
+//
+//    1. S := (N - 1) /  gcd(N-1, R-1)
+//    2. k := (R - 1) /  gcd(N-1, R-1)
+//    3. Convolution(kxk, stride=S, lhs_dilation=k, padding=k-1)
+//
+// For example, to Scale from 7x7 -> 15x15:
+//
+//    1. S := (7-1) / gcd(7-1, 15-1) = 6 / gcd(6, 14) = 6 / 2 = 3
+//    2. k := (15 - 1) / gcd(7-1, 15-1) = 14 / gcd(6, 14) = 14 / 2 = 7
+//    3. Convolution(7x7, stride=3, lhs_dilation=3, padding=2)
+//
+//
+// The 7x7 -> 15x15 case is much too large to write out in full as an
+// example. The smallest interesting example is 3x3 -> 4x4.
+//
+// S := 2
+// k := 3
+//
+// 00 03 06    00 00 00 00 00 00 00 00 00 00 00      00 02 04 06
+// 09 12 15 -> 00 00 00 00 00 00 00 00 00 00 00   -> 06 08 10 12
+// 18 21 24    00 00 00 00 00 03 00 00 06 00 00      12 14 16 18
+//             00 00 00 00 00 00 00 00 00 00 00      18 20 22 24
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 09 00 00 12 00 00 15 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 18 00 00 21 00 00 24 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//             00 00 00 00 00 00 00 00 00 00 00
+//
+// with the following convolutional kernel, with stride [2, 2]:
+//       1 2 3 2 1
+//       2 4 6 4 2
+// 1/9 * 3 6 9 6 3
+//       2 4 6 4 2
+//       1 2 3 2 1
+
+// Computes the size of the convolutional kernel and stride to use when resizing
+// from in_size to out_size.
+struct ResizeConvolutionDims {
+  // Size of the kernel to use.
+  std::vector<int64> kernel_size;
+
+  // Stride of the convolution to use.
+  std::vector<int64> stride;
+};
+ResizeConvolutionDims ComputeResizeConvolutionParameters(
+    gtl::ArraySlice<int64> in_size, gtl::ArraySlice<int64> out_size) {
+  CHECK_EQ(in_size.size(), out_size.size());
+  int num_spatial_dims = in_size.size();
+  ResizeConvolutionDims dims;
+  dims.kernel_size.resize(num_spatial_dims);
+  dims.stride.resize(num_spatial_dims);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] == 1) {
+      // We must handle input size 1 specially because XLA convolution does
+      // not allow stride 0.
+      dims.stride[i] = dims.kernel_size[i] = 1;
+    } else if (out_size[i] == 1) {
+      // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+      // entry before resizing.
+      dims.stride[i] = dims.kernel_size[i] = 1;
+    } else {
+      int64 gcd = MathUtil::GCD(static_cast<uint64>(in_size[i] - 1),
+                                static_cast<uint64>(out_size[i] - 1));
+      dims.stride[i] = (in_size[i] - 1) / gcd;
+      dims.kernel_size[i] = (out_size[i] - 1) / gcd;
+    }
+  }
+  return dims;
+}
+
+xla::ComputationDataHandle MakeBilinearResizeKernel(
+    xla::ComputationBuilder* builder, gtl::ArraySlice<int64> kernel_size,
+    int64 channels) {
+  // Form a 2D convolution kernel like:
+  //       1 2 3 2 1
+  //       2 4 6 4 2
+  // 1/9 * 3 6 9 6 3
+  //       2 4 6 4 2
+  //       1 2 3 2 1
+  // by multiplying two 1D kernels of the form:
+  // 1/3 * [1 2 3 2 1]
+  auto make_1d_kernel = [](int64 n) {
+    std::vector<float> kernel(n * 2 - 1);
+    for (int64 i = 0; i < n; ++i) {
+      float v = i + 1;
+      kernel[i] = v;
+      kernel[n * 2 - 2 - i] = v;
+    }
+    return kernel;
+  };
+
+  // Form a block diagonal kernel where each channel interacts only with itself.
+  xla::Array4D<float> diag(1, 1, channels, channels, 0.0f);
+  for (int i = 0; i < channels; ++i) {
+    diag(0, 0, i, i) = 1.0f / (kernel_size[0] * kernel_size[1]);
+  }
+  return builder->Mul(
+      builder->ConstantR1<float>(make_1d_kernel(kernel_size[0])),
+      builder->Mul(builder->ConstantR1<float>(make_1d_kernel(kernel_size[1])),
+                   builder->ConstantR4FromArray4D(diag),
+                   /*broadcast_dimensions=*/{1}),
+      /*broadcast_dimensions=*/{0});
+}
+
+class ResizeBilinearOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented(
+            "ResizeBilinear with align_corners=False is not yet implemented"));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    TensorShape input_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, input_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input_shape.DebugString()));
+    const int64 batch = input_shape.dim_size(0);
+    const std::vector<int64> in_size = {input_shape.dim_size(1),
+                                        input_shape.dim_size(2)};
+    const int64 channels = input_shape.dim_size(3);
+    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+                errors::InvalidArgument("input size must be positive, got [",
+                                        in_size[0], ",", in_size[1], "]"));
+
+    std::vector<int64> out_size;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
+    OP_REQUIRES(ctx, out_size.size() == 2,
+                errors::InvalidArgument("output size must be length 2, got ",
+                                        out_size.size()));
+    OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
+                errors::InvalidArgument("output size must be positive, got [",
+                                        out_size[0], ",", out_size[1], "]"));
+
+    const int num_spatial_dims = 2;
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+
+    // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
+    // dimension i.
+    std::vector<int64> slice_size = in_size;
+    bool slice_input = false;
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] > 1 && out_size[i] == 1) {
+        // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+        // entry before resizing.
+        slice_input = true;
+        slice_size[i] = 1;
+      }
+    }
+    if (slice_input) {
+      input = b->Slice(input, {0, 0, 0, 0},
+                       {batch, slice_size[0], slice_size[1], channels},
+                       {1, 1, 1, 1});
+    }
+
+    // Output is always type float.
+    input = b->ConvertElementType(input, xla::F32);
+
+    // Picture for a 1x3 to 1x4 resize:
+    // stride = 2, kernel size = 3
+    // Input:
+    // 3 6 9
+    // Input with dilation and padding:
+    // 0 0 3 0 0 6 0 0 9 0 0
+    // Convolution kernel:
+    // 1/3 * [1 2 3 2 1]
+    // Output:
+    // 3 5 7 9
+    xla::ConvolutionDimensionNumbers dnums;
+    dnums.set_input_batch_dimension(0);
+    dnums.set_output_batch_dimension(0);
+    dnums.set_input_feature_dimension(3);
+    dnums.set_output_feature_dimension(3);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      dnums.add_input_spatial_dimensions(1 + i);
+      dnums.add_output_spatial_dimensions(1 + i);
+      dnums.add_kernel_spatial_dimensions(i);
+    }
+    dnums.set_kernel_input_feature_dimension(num_spatial_dims);
+    dnums.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+
+    ResizeConvolutionDims dims =
+        ComputeResizeConvolutionParameters(in_size, out_size);
+    xla::ComputationDataHandle kernel =
+        MakeBilinearResizeKernel(b, dims.kernel_size, channels);
+    xla::ComputationDataHandle output = b->ConvGeneralDilated(
+        input, kernel, dims.stride,
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/dims.kernel_size,
+        /*rhs_dilation=*/{1, 1}, dnums);
+
+    // Add broadcasts to handle expanding from a size == 1 dimension to a
+    // size > 1 dimension.
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] == 1 && out_size[i] > 1) {
+        output = b->Add(output, b->ConstantR1<float>(out_size[i], 0),
+                        /*broadcast_dimensions=*/{1 + i});
+      }
+    }
+
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool align_corners_;
+};
+
+REGISTER_XLA_OP(Name("ResizeBilinear"), ResizeBilinearOp);
+
+class ResizeBilinearGradOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearGradOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented("ResizeBilinearGrad with align_corners=False is "
+                              "not yet implemented"));
+
+    DataType output_dtype;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(output_dtype, &output_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* b = ctx->builder();
+
+    TensorShape input_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, input_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input_shape.DebugString()));
+    const int64 batch = input_shape.dim_size(0);
+    const std::vector<int64> in_size = {input_shape.dim_size(1),
+                                        input_shape.dim_size(2)};
+    const int64 channels = input_shape.dim_size(3);
+    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+                errors::InvalidArgument("input size must be positive, got [",
+                                        in_size[0], ",", in_size[1], "]"));
+
+    TensorShape grad_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, grad_shape.dims() == 4,
+                errors::InvalidArgument("gradient must be 4-dimensional",
+                                        grad_shape.DebugString()));
+    const int64 grad_batch = grad_shape.dim_size(0);
+    const std::vector<int64> grad_size = {grad_shape.dim_size(1),
+                                          grad_shape.dim_size(2)};
+    const int64 grad_channels = grad_shape.dim_size(3);
+    OP_REQUIRES(ctx, batch == grad_batch,
+                errors::InvalidArgument(
+                    "activations and gradients must have the same batch size (",
+                    batch, " vs. ", grad_batch, ")"));
+    OP_REQUIRES(ctx, grad_size[0] > 0 && grad_size[1] > 0,
+                errors::InvalidArgument("gradient size must be positive, got [",
+                                        grad_size[0], ",", grad_size[1], "]"));
+    OP_REQUIRES(
+        ctx, channels == grad_channels,
+        errors::InvalidArgument(
+            "activations and gradients must have the same number of channels (",
+            channels, " vs. ", grad_channels, ")"));
+
+    const int num_spatial_dims = 2;
+
+    xla::ComputationDataHandle grad = ctx->Input(0);
+
+    ResizeConvolutionDims dims =
+        ComputeResizeConvolutionParameters(in_size, grad_size);
+
+    // To form the backward convolution, we keep the kernel unchanged (it is
+    // already symmetric) and swap the roles of strides and LHS dilation.
+    xla::ConvolutionDimensionNumbers dnums;
+    dnums.set_input_batch_dimension(0);
+    dnums.set_output_batch_dimension(0);
+    dnums.set_input_feature_dimension(3);
+    dnums.set_output_feature_dimension(3);
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      dnums.add_input_spatial_dimensions(1 + i);
+      dnums.add_output_spatial_dimensions(1 + i);
+      dnums.add_kernel_spatial_dimensions(i);
+    }
+    dnums.set_kernel_input_feature_dimension(num_spatial_dims);
+    dnums.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+    xla::ComputationDataHandle kernel =
+        MakeBilinearResizeKernel(b, dims.kernel_size, channels);
+
+    // Broadcast the input kernel where the forward op expanded from a size == 1
+    // dimension to a size > 1 dimension. This has the effect of summing the
+    // gradient contributions in that dimension.
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] == 1 && grad_size[i] > 1) {
+        kernel = b->Add(kernel, b->ConstantR1<float>(grad_size[i], 0),
+                        /*broadcast_dimensions=*/{i});
+      }
+    }
+
+    xla::ComputationDataHandle output = b->ConvGeneralDilated(
+        grad, kernel, /*window_strides=*/dims.kernel_size,
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/dims.stride,
+        /*rhs_dilation=*/{1, 1}, dnums);
+
+    // If in_size[i] > 1 and grad_size[i] == 1, pad the output in dimension i.
+    // Opposite of the slice performed by the forward op.
+    xla::PaddingConfig padding = xla::MakeNoPaddingConfig(4);
+    bool pad_output = false;
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] > 1 && grad_size[i] == 1) {
+        pad_output = true;
+        padding.mutable_dimensions(1 + i)->set_edge_padding_high(in_size[i] -
+                                                                 1);
+      }
+    }
+    if (pad_output) {
+      output = b->Pad(output, b->ConstantR0<float>(0.0f), padding);
+    }
+
+    output = b->ConvertElementType(output, output_type_);
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  bool align_corners_;
+  xla::PrimitiveType output_type_;
+};
+
+REGISTER_XLA_OP(Name("ResizeBilinearGrad"), ResizeBilinearGradOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index b8769b3ea2be0a791d9c3e5e7acd8b6184442af2..e0dc1870f2a4934c35163f0cc10196e8fcbed9be 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -60,54 +60,20 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) {
                               input_shape.DebugString()));
 
   DataType index_type = output_type(0);
-  xla::PrimitiveType xla_input_type;
-  OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(0), &xla_input_type));
-  xla::PrimitiveType xla_index_type;
-  OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(index_type, &xla_index_type));
 
   xla::ComputationBuilder* b = ctx->builder();
   xla::ComputationDataHandle input = ctx->Input(0);
 
-  xla::ComputationDataHandle init_value;
-  const xla::Computation* reducer;
+  xla::ComputationDataHandle output;
   if (is_min_) {
-    init_value = XlaHelpers::MaxValue(b, input_type(0));
-    reducer = ctx->GetOrCreateMin(input_type(0));
+    OP_REQUIRES_OK(ctx,
+                   XlaHelpers::ArgMin(b, ctx, input, input_shape, input_type(0),
+                                      index_type, axis, &output));
   } else {
-    init_value = XlaHelpers::MinValue(b, input_type(0));
-    reducer = ctx->GetOrCreateMax(input_type(0));
+    OP_REQUIRES_OK(ctx,
+                   XlaHelpers::ArgMax(b, ctx, input, input_shape, input_type(0),
+                                      index_type, axis, &output));
   }
-  xla::ComputationDataHandle input_max =
-      b->Reduce(input, init_value, *reducer, /*dimensions_to_reduce=*/{axis});
-  std::vector<int64> broadcast_dims(input_dims - 1);
-  std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
-  std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-  // Compute a mask that has 1s for elements equal to the maximum.
-  xla::ComputationDataHandle partial_mask = b->ConvertElementType(
-      b->Eq(input, input_max, broadcast_dims), xla_index_type);
-
-  // In order to make identity elements for a bitwise And, we:
-  //   Left shift the 1 to the leftmost bit, yielding 0x10...0
-  //   Arithmetic right shift the 1 back to the rightmost bit, yielding 0xFF...F
-  int32 bits_in_type =
-      xla::ShapeUtil::ByteSizeOfPrimitiveType(xla_index_type) * 8 - 1;
-  xla::ComputationDataHandle shift_amount =
-      XlaHelpers::IntegerLiteral(b, index_type, bits_in_type);
-  xla::ComputationDataHandle full_mask = b->ShiftRightArithmetic(
-      b->ShiftLeft(partial_mask, shift_amount), shift_amount);
-
-  // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its index.
-  xla::ComputationDataHandle iota;
-  OP_REQUIRES_OK(ctx, XlaHelpers::Iota(b, index_type, axis_size, &iota));
-  xla::ComputationDataHandle product =
-      b->And(full_mask, iota, /*broadcast_dimensions=*/{axis});
-
-  // If there are multiple maximum elements, choose the one with the highest
-  // index.
-  xla::ComputationDataHandle output =
-      b->Reduce(product, XlaHelpers::MinValue(b, index_type),
-                *ctx->GetOrCreateMax(index_type),
-                /*dimensions_to_reduce=*/{axis});
 
   ctx->SetOutput(0, output);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index fcef497e5845d9080bc83b54e92dcf2fdecf5f12..644abd5905c6ce5a8f61792a1986560bab891040 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -23,8 +23,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr std::array<DataType, 4> kMatmulTypes = {
-    {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
+constexpr std::array<DataType, 5> kMatmulTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
 
 class MatMulOp : public XlaOpKernel {
  public:
@@ -85,10 +85,7 @@ class SparseMatMulOp : public MatMulOp {
   ~SparseMatMulOp() override = default;
 };
 
-REGISTER_XLA_OP(Name("SparseMatMul")
-                    .TypeConstraint("Ta", kFloatTypes)
-                    .TypeConstraint("Tb", kFloatTypes),
-                SparseMatMulOp);
+REGISTER_XLA_OP(Name("SparseMatMul"), SparseMatMulOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index 462267d1504f16a5fc1f34f5804649416699005a..c283e3b02c2676785952e3e17bffa671b0dabc1e 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -60,7 +60,13 @@ class RetvalOp : public XlaOpKernel {
         OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal));
         OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal));
       } else {
-        tc.AddRetval(index_, dtype_, input);
+        // The core from which a return value is returned depends on the core
+        // assignment of the input to the retval .Since we can't change the core
+        // assignment of <input> as this point, create a tuple/get-tuple-element
+        // combination so that the core will be set on them.
+        auto tuple_elem =
+            ctx->builder()->GetTupleElement(ctx->builder()->Tuple({input}), 0);
+        tc.AddRetval(index_, dtype_, tuple_elem);
       }
     }
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..650f8c7dc8be0cb08997ec641ca3f82352166fdd
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+// TODO(phawkins): implement double-sized windowed reductions in XLA and remove
+// the type constraint.
+constexpr std::array<DataType, 3> kScanOpTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT}};
+
+class ScanOp : public XlaOpKernel {
+ public:
+  ScanOp(OpKernelConstruction* ctx, bool sum) : XlaOpKernel(ctx), sum_(sum) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("reverse", &reverse_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("exclusive", &exclusive_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape(0);
+    const TensorShape tensor_axis_shape = ctx->InputShape(1);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_axis_shape),
+                errors::InvalidArgument("ScanOp: axis must be a scalar, not ",
+                                        tensor_axis_shape.DebugString()));
+
+    int64 axis;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &axis));
+    if (axis < 0) {
+      axis += input_shape.dims();
+    }
+    OP_REQUIRES(
+        ctx, FastBoundsCheck(axis, input_shape.dims()),
+        errors::InvalidArgument("ScanOp: Expected scan axis in the range [",
+                                -input_shape.dims(), ", ", input_shape.dims(),
+                                "), but got ", axis));
+
+    DataType dtype = ctx->input_type(0);
+
+    if (input_shape.num_elements() == 0) {
+      // Exit early if there is nothing to compute.
+      ctx->SetOutput(0, ctx->Input(0));
+      return;
+    }
+
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    std::vector<int64> window_strides(input_shape.dims(), 1);
+    std::vector<int64> window_dims(input_shape.dims(), 1);
+    window_dims[axis] = input_shape.dim_size(axis);
+
+    std::vector<std::pair<int64, int64>> padding(input_shape.dims(), {0, 0});
+    padding[axis].first = input_shape.dim_size(axis) - 1;
+    // In exclusive mode, add an extra padding element so there is a complete
+    // window of padding before the data starts.
+    if (exclusive_) {
+      ++padding[axis].first;
+    }
+    if (reverse_) {
+      std::swap(padding[axis].first, padding[axis].second);
+    }
+
+    xla::ComputationDataHandle input = ctx->Input(0);
+    xla::ComputationDataHandle init;
+    const xla::Computation* reducer;
+    if (sum_) {
+      init = XlaHelpers::Zero(builder, dtype);
+      reducer = ctx->GetOrCreateAdd(dtype);
+    } else {
+      init = XlaHelpers::One(builder, dtype);
+      reducer = ctx->GetOrCreateMul(dtype);
+    }
+    auto output = builder->ReduceWindowWithGeneralPadding(
+        ctx->Input(0), init, *reducer, window_dims, window_strides, padding);
+
+    // In exclusive mode, we have computed an extra element containing the sum
+    // of all the input elements. Slice off this extra "last" element.
+    if (exclusive_) {
+      if (reverse_) {
+        output = builder->SliceInDim(output, 1, input_shape.dim_size(axis) + 1,
+                                     1, axis);
+
+      } else {
+        output =
+            builder->SliceInDim(output, 0, input_shape.dim_size(axis), 1, axis);
+      }
+    }
+    ctx->SetOutput(0, output);
+  }
+
+ private:
+  const bool sum_;  // True=cumulative sum. False=cumulative product.
+  bool reverse_;
+  bool exclusive_;
+};
+
+class CumsumOp : public ScanOp {
+ public:
+  explicit CumsumOp(OpKernelConstruction* ctx) : ScanOp(ctx, /*sum=*/true) {}
+};
+REGISTER_XLA_OP(Name("Cumsum").TypeConstraint("T", kScanOpTypes), CumsumOp);
+
+class CumprodOp : public ScanOp {
+ public:
+  explicit CumprodOp(OpKernelConstruction* ctx) : ScanOp(ctx, /*sum=*/false) {}
+};
+REGISTER_XLA_OP(Name("Cumprod").TypeConstraint("T", kScanOpTypes), CumprodOp);
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 24a99f253d6dc8bb699fff587c363b12c227e821..e205fadd2b1bcae96a7bfa1bc83096d405ce22c4 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // XLA-specific Shape Ops.
 
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -27,56 +28,42 @@ namespace {
 
 class ShapeOp : public XlaOpKernel {
  public:
-  explicit ShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_shape = ctx->InputShape(0);
-    const int rank = input_shape.dims();
-    Tensor shape_constant(DT_INT32, TensorShape({rank}));
-    auto vec = shape_constant.vec<int32>();
-    // TODO(dga): support int64.  b/28119922.
-    for (int i = 0; i < rank; ++i) {
-      int64 dim_size = input_shape.dim_size(i);
-      OP_REQUIRES(
-          ctx, FastBoundsCheck(dim_size, std::numeric_limits<int32>::max()),
-          errors::InvalidArgument("Shape does not support tensors > int32max",
-                                  " but dim ", i, " is ", dim_size));
-      vec(i) = static_cast<int32>(dim_size);
-    }
-
+    Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()}));
+    OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant));
     ctx->SetConstantOutput(0, shape_constant);
   }
+
+ private:
+  DataType out_dtype_;
 };
 
 REGISTER_XLA_OP(Name("Shape"), ShapeOp);
 
 class ShapeNOp : public XlaOpKernel {
  public:
-  explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
-      const TensorShape shape = ctx->InputShape(i);
-      const int dims = shape.dims();
-      Tensor shape_constant(DT_INT32, TensorShape({dims}));
-      auto vec = shape_constant.vec<int32>();
-
-      // TODO(dga): support int64.  b/28119922.
-      for (int j = 0; j < dims; ++j) {
-        int64 dim_size = shape.dim_size(j);
-        OP_REQUIRES(
-            ctx, FastBoundsCheck(dim_size, std::numeric_limits<int32>::max()),
-            errors::InvalidArgument("Shape does not support tensors > int32max",
-                                    " but shape ", i, " dim ", j, " is ",
-                                    dim_size));
-        vec(j) = static_cast<int32>(dim_size);
-      }
-
+      const TensorShape input_shape = ctx->InputShape(i);
+      Tensor shape_constant(out_dtype_, TensorShape({input_shape.dims()}));
+      OP_REQUIRES_OK(ctx, TensorShapeToConstant(input_shape, &shape_constant));
       ctx->SetConstantOutput(i, shape_constant);
     }
   }
 
   bool IsExpensive() override { return false; }
+
+ private:
+  DataType out_dtype_;
 };
 REGISTER_XLA_OP(Name("ShapeN"), ShapeNOp);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76ea5f525598f511f295eb5a30f3cf603fbf57aa
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
+
+#include <limits>
+
+#include "tensorflow/core/kernels/bounds_check.h"
+
+namespace tensorflow {
+
+Status TensorShapeToConstant(const TensorShape& input_shape,
+                             Tensor* shape_constant) {
+  const int dims = input_shape.dims();
+  if (shape_constant->dtype() == DT_INT32) {
+    auto vec = shape_constant->vec<int32>();
+    for (int i = 0; i < dims; ++i) {
+      int64 dim_size = input_shape.dim_size(i);
+      if (!FastBoundsCheck(dim_size, std::numeric_limits<int32>::max())) {
+        return errors::InvalidArgument(
+            "Shape with out_type=int32 does not support tensors > int32max",
+            " but dim ", i, " is ", dim_size);
+      }
+      vec(i) = static_cast<int32>(dim_size);
+    }
+  } else {
+    auto vec = shape_constant->vec<int64>();
+    for (int i = 0; i < dims; ++i) {
+      int64 dim_size = input_shape.dim_size(i);
+      vec(i) = dim_size;
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.h b/tensorflow/compiler/tf2xla/kernels/shape_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..575086e118080f6799a54d3ae6409b2b641c4341
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
+
+#include <limits>
+
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+// Converts a TensorShape to a constant Tensor.
+//
+// The input TensorShape input_shape is used to populate the elements of
+// shape_constant, which is modified in place.
+Status TensorShapeToConstant(const TensorShape& input_shape,
+                             Tensor* shape_constant);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_TF2XLA_KERNELS_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
index 89befda346ec06fec23ab1d1c9d910ded8cd806d..806fda632cde64c1b37ae3b9199028d6b6b0a215 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 namespace {
@@ -23,6 +24,16 @@ namespace {
 class SpaceToDepthOp : public XlaOpKernel {
  public:
   explicit SpaceToDepthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    OP_REQUIRES(ctx, data_format_ == FORMAT_NCHW || data_format_ == FORMAT_NHWC,
+                errors::InvalidArgument("Unsupported data format ",
+                                        ToString(data_format_),
+                                        "; expected formats NHWC or NCHW"));
+
     OP_REQUIRES_OK(ctx, ctx->GetAttr("block_size", &block_size_));
     OP_REQUIRES(
         ctx, block_size_ > 1,
@@ -31,34 +42,100 @@ class SpaceToDepthOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_tensor_shape = ctx->InputShape(0);
-    // The input is presumed to be [batch, height, width, depth]
     int input_rank = input_tensor_shape.dims();
     static const int kRequiredDims = 4;
     OP_REQUIRES(ctx, kRequiredDims == input_rank,
-                errors::InvalidArgument("Input rank should be: ", kRequiredDims,
-                                        " instead of: ", input_rank));
+                errors::InvalidArgument("Input rank should be ", kRequiredDims,
+                                        "; got ", input_rank));
     const gtl::InlinedVector<int64, 4> input_shape =
         input_tensor_shape.dim_sizes();
 
     xla::ComputationBuilder* b = ctx->builder();
     xla::ComputationDataHandle input = ctx->Input(0);
 
+    int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_);
+    int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_);
+
+    std::vector<int64> reshaped_shape;
+    std::vector<int64> transpose_order;
+    std::vector<int64> output_shape;
+    reshaped_shape.reserve(input_rank);
+    transpose_order.reserve(input_rank);
+    output_shape.reserve(input_rank);
+    if (data_format_ == FORMAT_NHWC) {
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
+                    errors::InvalidArgument(
+                        "input shape[", 1 + i, "]=", input_shape[1 + i],
+                        " is not divisible by block_size=", block_size_));
+        block_elems *= block_size_;
+      }
+
+      reshaped_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[1 + i] / block_size_);
+        reshaped_shape.push_back(block_size_);
+      }
+      reshaped_shape.push_back(input_shape[feature_dim]);
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 1);
+      }
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 2);
+      }
+      transpose_order.push_back(feature_dim + num_spatial_dims);
+
+      output_shape.push_back(input_shape[0]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[1 + i] / block_size_);
+      }
+      output_shape.push_back(input_shape[feature_dim] * block_elems);
+    } else {
+      // FORMAT_NCHW
+      int64 block_elems = 1;
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        OP_REQUIRES(ctx, input_shape[2 + i] % block_size_ == 0,
+                    errors::InvalidArgument(
+                        "input shape[", 2 + i, "]=", input_shape[2 + i],
+                        " is not divisible by block_size=", block_size_));
+        block_elems *= block_size_;
+      }
+
+      reshaped_shape.push_back(input_shape[0]);
+      reshaped_shape.push_back(input_shape[feature_dim]);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        reshaped_shape.push_back(input_shape[2 + i] / block_size_);
+        reshaped_shape.push_back(block_size_);
+      }
+
+      transpose_order.push_back(0);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 3);
+      }
+      transpose_order.push_back(feature_dim);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        transpose_order.push_back(i * 2 + 2);
+      }
+
+      output_shape.push_back(input_shape[0]);
+      output_shape.push_back(input_shape[feature_dim] * block_elems);
+      for (int i = 0; i < num_spatial_dims; ++i) {
+        output_shape.push_back(input_shape[2 + i] / block_size_);
+      }
+    }
+
+    // Note: comments are given in NHWC format; NCHW is similar with a different
+    // dimension order.
     // 1. Reshape `input` to `reshaped` of shape:
     //
     //      [batch,
     //       input_shape[1] / block_size_, block_size_,
     //       input_shape[2] / block_size_, block_size_,
     //       depth]
-    const int block_rank = 2;
-    for (int i = 0; i < block_rank; ++i) {
-      OP_REQUIRES(ctx, input_shape[1 + i] % block_size_ == 0,
-                  errors::InvalidArgument(
-                      "input shape[", 1 + i, "]=", input_shape[1 + i],
-                      " is not divisible by block_size=", block_size_));
-    }
-    xla::ComputationDataHandle reshaped = b->Reshape(
-        input, {input_shape[0], input_shape[1] / block_size_, block_size_,
-                input_shape[2] / block_size_, block_size_, input_shape[3]});
+    xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape);
 
     // 2. Permute dimensions of `reshaped` to produce
     //    `permuted_reshaped` of shape:
@@ -69,7 +146,7 @@ class SpaceToDepthOp : public XlaOpKernel {
     //       block_size_, block_size_,
     //       depth]
     xla::ComputationDataHandle permuted_reshaped =
-        b->Transpose(reshaped, {0, 1, 3, 2, 4, 5});
+        b->Transpose(reshaped, transpose_order);
 
     // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the
     //    batch dimension, producing an output tensor of shape:
@@ -79,15 +156,14 @@ class SpaceToDepthOp : public XlaOpKernel {
     //       input_shape[2] / block_size_,
     //       block_size_ * block_size_ * depth]
     //
-    xla::ComputationDataHandle output = b->Reshape(
-        permuted_reshaped, {input_shape[0], input_shape[1] / block_size_,
-                            input_shape[2] / block_size_,
-                            block_size_ * block_size_ * input_shape[3]});
+    xla::ComputationDataHandle output =
+        b->Reshape(permuted_reshaped, output_shape);
 
     ctx->SetOutput(0, output);
   }
 
  private:
+  TensorFormat data_format_;
   int block_size_;
 };
 REGISTER_XLA_OP(Name("SpaceToDepth"), SpaceToDepthOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b10880de77e6b9811008076cd4a959c284e558d1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -0,0 +1,279 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/casts.h"
+#include "tensorflow/core/lib/math/math_util.h"
+
+namespace tensorflow {
+namespace {
+
+// Rotates a 32-bit integer 'v' left by 'distance' bits.
+xla::ComputationDataHandle RotateLeftS32(xla::ComputationBuilder* builder,
+                                         const xla::ComputationDataHandle& v,
+                                         int distance) {
+  return builder->Or(
+      builder->ShiftLeft(v, builder->ConstantR0<int>(distance)),
+      builder->ShiftRightLogical(v, builder->ConstantR0<int>(32 - distance)));
+}
+
+// TODO(b/65209188): add a primitive XOR to XLA and call it here, rather than
+// building XOR out of other bitwise operators.
+xla::ComputationDataHandle BitwiseXor(xla::ComputationBuilder* builder,
+                                      const xla::ComputationDataHandle& x,
+                                      const xla::ComputationDataHandle& y) {
+  return builder->Or(builder->And(x, builder->Not(y)),
+                     builder->And(builder->Not(x), y));
+}
+
+using ThreeFry2x32State = std::array<xla::ComputationDataHandle, 2>;
+
+// Implements the ThreeFry counter-based PRNG algorithm.
+// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
+// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+ThreeFry2x32State ThreeFry2x32(xla::ComputationBuilder* builder,
+                               ThreeFry2x32State input, ThreeFry2x32State key) {
+  // Rotation distances specified by the Threefry2x32 algorithm.
+  constexpr std::array<int, 8> rotations = {13, 15, 26, 6, 17, 29, 16, 24};
+  ThreeFry2x32State x;
+
+  std::array<xla::ComputationDataHandle, 3> ks;
+  // 0x1BD11BDA is a parity constant specified by the ThreeFry2x32 algorithm.
+  ks[2] = builder->ConstantR0<int32>(0x1BD11BDA);
+  for (int i = 0; i < 2; ++i) {
+    ks[i] = key[i];
+    x[i] = input[i];
+    ks[2] = BitwiseXor(builder, ks[2], key[i]);
+  }
+
+  x[0] = builder->Add(x[0], ks[0]);
+  x[1] = builder->Add(x[1], ks[1]);
+
+  // Performs a single round of the Threefry2x32 algorithm, with a rotation
+  // amount 'rotation'.
+  auto round = [builder](ThreeFry2x32State v, int rotation) {
+    v[0] = builder->Add(v[0], v[1]);
+    v[1] = RotateLeftS32(builder, v[1], rotation);
+    v[1] = BitwiseXor(builder, v[0], v[1]);
+    return v;
+  };
+
+  // There are no known statistical flaws with 13 rounds of Threefry2x32.
+  // We are conservative and use 20 rounds.
+  x = round(x, rotations[0]);
+  x = round(x, rotations[1]);
+  x = round(x, rotations[2]);
+  x = round(x, rotations[3]);
+  x[0] = builder->Add(x[0], ks[1]);
+  x[1] = builder->Add(builder->Add(x[1], ks[2]), builder->ConstantR0<int32>(1));
+
+  x = round(x, rotations[4]);
+  x = round(x, rotations[5]);
+  x = round(x, rotations[6]);
+  x = round(x, rotations[7]);
+  x[0] = builder->Add(x[0], ks[2]);
+  x[1] = builder->Add(builder->Add(x[1], ks[0]), builder->ConstantR0<int32>(2));
+
+  x = round(x, rotations[0]);
+  x = round(x, rotations[1]);
+  x = round(x, rotations[2]);
+  x = round(x, rotations[3]);
+  x[0] = builder->Add(x[0], ks[0]);
+  x[1] = builder->Add(builder->Add(x[1], ks[1]), builder->ConstantR0<int32>(3));
+
+  x = round(x, rotations[4]);
+  x = round(x, rotations[5]);
+  x = round(x, rotations[6]);
+  x = round(x, rotations[7]);
+  x[0] = builder->Add(x[0], ks[1]);
+  x[1] = builder->Add(builder->Add(x[1], ks[2]), builder->ConstantR0<int32>(4));
+
+  x = round(x, rotations[0]);
+  x = round(x, rotations[1]);
+  x = round(x, rotations[2]);
+  x = round(x, rotations[3]);
+  x[0] = builder->Add(x[0], ks[2]);
+  x[1] = builder->Add(builder->Add(x[1], ks[0]), builder->ConstantR0<int32>(5));
+
+  return x;
+}
+
+// Returns a tensor of 'shape' random values uniformly distributed in the range
+// [minval, maxval)
+xla::ComputationDataHandle RandomUniform(xla::ComputationBuilder* builder,
+                                         const xla::ComputationDataHandle& seed,
+                                         const TensorShape& shape,
+                                         double minval, double maxval) {
+  // Split the seed into two 32-bit scalars to form a key.
+  auto seed0 = builder->Reshape(builder->Slice(seed, {0}, {1}, {1}), {});
+  auto seed1 = builder->Reshape(builder->Slice(seed, {1}, {2}, {1}), {});
+  ThreeFry2x32State key = {seed0, seed1};
+  const int64 size = shape.num_elements();
+
+  const int64 half_size = MathUtil::CeilOfRatio<int64>(size, 2);
+  const bool size_is_odd = (half_size * 2 != size);
+
+  // Fill the generator inputs with unique counter values.
+  ThreeFry2x32State inputs;
+  TF_CHECK_OK(XlaHelpers::Iota(builder, DT_INT32, half_size, &inputs[0]));
+  inputs[1] = builder->Add(inputs[0], builder->ConstantR0<int32>(half_size));
+  ThreeFry2x32State outputs = ThreeFry2x32(builder, inputs, key);
+
+  if (size_is_odd) {
+    outputs[1] = builder->Slice(outputs[1], {0}, {half_size - 1}, {1});
+  }
+
+  auto bits =
+      builder->Reshape(builder->ConcatInDim(outputs, 0), shape.dim_sizes());
+
+  // Form 22 random mantissa bits, with a leading 1 bit. The leading 1 bit
+  // forces the random bits into the mantissa.
+  constexpr int kFloatBits = 32;
+  constexpr int kMantissaBits = 23;
+  bits = builder->Or(
+      builder->ShiftRightLogical(
+          bits, builder->ConstantR0<int32>(kFloatBits - kMantissaBits)),
+      builder->ConstantR0<int32>(bit_cast<int32>(1.0f)));
+  auto floats = builder->BitcastConvertType(bits, xla::F32);
+
+  // We have a floating point number in the range [1.0, 2.0).
+  // Subtract 1.0f to shift to the range [0.0, 1.0)
+  floats = builder->Sub(floats, builder->ConstantR0<float>(1.0f));
+  // Multiply and add to shift to the range [minval, maxval).
+  floats = builder->Mul(floats, builder->ConstantR0<float>(maxval - minval));
+  floats = builder->Add(floats, builder->ConstantR0<float>(minval));
+  return floats;
+}
+
+// Approximation for the inverse error function from
+//   Giles, M., "Approximating the erfinv function".
+// The approximation has the form:
+//   w = -log((1 - x) * (1 + x))
+//   if ( w < 5 ) {
+//     w = w - 2.5
+//     p = sum_{i=1}^n lq[i]*w^i
+//   } else {
+//     w = sqrt(w) - 3
+//     p = sum_{i=1}^n gq[i]*w^i
+//   }
+//   return p*x
+xla::ComputationDataHandle ErfInvF32(xla::ComputationBuilder* b,
+                                     const xla::ComputationDataHandle& x,
+                                     const TensorShape& shape) {
+  constexpr int kDegree = 9;
+  constexpr std::array<float, 9> w_less_than_5_constants = {
+      2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
+      -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
+      -0.00417768164f,  0.246640727f,    1.50140941f};
+  constexpr std::array<float, 9> w_greater_than_5_constants = {
+      -0.000200214257f, 0.000100950558f, 0.00134934322f,
+      -0.00367342844f,  0.00573950773f,  -0.0076224613f,
+      0.00943887047f,   1.00167406f,     2.83297682f};
+
+  auto one = b->ConstantR0<float>(1.0);
+  auto w = b->Neg(b->Log(b->Mul(b->Sub(one, x), b->Add(one, x))));
+
+  auto lt = b->Lt(w, b->ConstantR0<float>(5.0));
+  auto coefficient = [&](int i) {
+    return b->Select(
+        lt,
+        b->Broadcast(b->ConstantR0<float>(w_less_than_5_constants[i]),
+                     shape.dim_sizes()),
+        b->Broadcast(b->ConstantR0<float>(w_greater_than_5_constants[i]),
+                     shape.dim_sizes()));
+  };
+  w = b->Select(lt, b->Sub(w, b->ConstantR0<float>(2.5f)),
+                b->Sub(b->SqrtF32(w), b->ConstantR0<float>(3.0f)));
+  auto p = coefficient(0);
+  for (int i = 1; i < kDegree; ++i) {
+    p = b->Add(coefficient(i), b->Mul(p, w));
+  }
+  return b->Mul(p, x);
+}
+
+}  // namespace
+
+class StatelessRandomUniformOp : public XlaOpKernel {
+ public:
+  explicit StatelessRandomUniformOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* builder = ctx->builder();
+
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape));
+
+    TensorShape seed_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    xla::ComputationDataHandle seed = ctx->Input(1);
+    ctx->SetOutput(0, RandomUniform(builder, seed, shape, 0.0, 1.0));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp);
+};
+
+// TODO(phawkins): generalize to non-float, non-int32 seed types.
+REGISTER_XLA_OP(Name("StatelessRandomUniform")
+                    .TypeConstraint("dtype", DT_FLOAT)
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessRandomUniformOp);
+
+class StatelessRandomNormalOp : public XlaOpKernel {
+ public:
+  explicit StatelessRandomNormalOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape));
+
+    TensorShape seed_shape = ctx->InputShape(1);
+    OP_REQUIRES(ctx, seed_shape == TensorShape({2}),
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    xla::ComputationDataHandle seed = ctx->Input(1);
+    xla::ComputationBuilder* builder = ctx->builder();
+    auto uniform = RandomUniform(builder, seed, shape, -1.0, 1.0);
+    // Convert uniform distribution to normal distribution by computing
+    // sqrt(2) * erfinv(x)
+    auto normal = builder->Mul(builder->ConstantR0<float>(std::sqrt(2.0)),
+                               ErfInvF32(builder, uniform, shape));
+    ctx->SetOutput(0, normal);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp);
+};
+
+// TODO(phawkins): generalize to non-float, non-int32 seed types.
+REGISTER_XLA_OP(Name("StatelessRandomNormal")
+                    .TypeConstraint("dtype", DT_FLOAT)
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessRandomNormalOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 351fda251798e43b607fb445f2c98abd57b3d86b..03c22354a9425189e6cf7ee5a7201c90ecb1908d 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -311,6 +311,32 @@ class TensorArrayGatherOp : public XlaOpKernel {
 
     xla::ComputationDataHandle ta = resource->value;
 
+    // Look for the case where the gather takes a simple slice from the
+    // tensor array (0, 1, 2, 3, 4, ..., N)
+    std::vector<int64> const_indices;
+    Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+    if (status.ok()) {
+      bool gather_is_dense_slice = true;
+      for (auto i = 0; i < const_indices.size(); i++) {
+        if (const_indices[i] != i) {
+          gather_is_dense_slice = false;
+          break;
+        }
+      }
+
+      if (gather_is_dense_slice) {
+        std::vector<int64> begin(ta_shape.dims(), 0);
+        std::vector<int64> strides(ta_shape.dims(), 1);
+        std::vector<int64> end(ta_shape.dims(), 1);
+        end[0] = const_indices.size();
+        for (auto i = 1; i < ta_shape.dims(); i++) {
+          end[i] = ta_shape.dim_size(i);
+        }
+        ctx->SetOutput(0, b->Slice(ta, begin, end, strides));
+        return;
+      }
+    }
+
     xla::ComputationDataHandle gather = XlaComputeGatherDynamicSlice(
         ctx, ta, ta_shape, indices, indices_shape, 0, dtype_, index_type, b);
     ctx->SetOutput(0, gather);
@@ -352,28 +378,47 @@ class TensorArrayScatterOp : public XlaOpKernel {
     const xla::ComputationDataHandle value = ctx->Input(2);
     const xla::ComputationDataHandle flow = ctx->Input(3);
 
-    auto slice_dims = value_shape.dim_sizes();
-    slice_dims[0] = 1LL;
-
-    std::vector<int64> value_starts(value_shape.dims(), 0);
-    auto value_ends = value_shape.dim_sizes();
-
-    std::vector<int64> value_strides(value_shape.dims(), 1);
-
-    // For every (index, value) pair, update the corresponding TensorArray
-    // storage.
-    for (int i = 0; i < num_indices; ++i) {
-      // Slice out part of the value.
-      value_starts[0] = i;
-      value_ends[0] = i + 1;
-      auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+    // Look for the case where the scatter is for each sub-tensor in order. The
+    // tensor array implementation allows for this to be a straight addition.
+    bool scatter_all_elements_in_order = false;
+    std::vector<int64> const_indices;
+    Status status = ctx->ConstantInputAsIntVector(1, &const_indices);
+    if (status.ok() && num_indices == value_shape.dim_size(0)) {
+      scatter_all_elements_in_order = true;
+      for (auto i = 0; i < num_indices; i++) {
+        if (const_indices[i] != i) {
+          scatter_all_elements_in_order = false;
+          break;
+        }
+      }
+    }
 
-      // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-      auto index = b->Slice(indices, {i}, {i + 1}, {1});
-      auto start_indices =
-          b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
-      ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+    if (scatter_all_elements_in_order) {
+      ta = b->Add(ta, value);
+    } else {
+      auto slice_dims = value_shape.dim_sizes();
+      slice_dims[0] = 1LL;
+
+      std::vector<int64> value_starts(value_shape.dims(), 0);
+      auto value_ends = value_shape.dim_sizes();
+
+      std::vector<int64> value_strides(value_shape.dims(), 1);
+
+      // For every (index, value) pair, update the corresponding TensorArray
+      // storage.
+      for (int i = 0; i < num_indices; ++i) {
+        // Slice out part of the value.
+        value_starts[0] = i;
+        value_ends[0] = i + 1;
+        auto slice = b->Slice(value, value_starts, value_ends, value_strides);
+
+        // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+        auto index = b->Slice(indices, {i}, {i + 1}, {1});
+        auto start_indices =
+                b->Pad(b->Reshape(index, {1}), b->ConstantR0<int32>(0),
+                       xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+        ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices);
+      }
     }
 
     resource->value = ta;
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index b19ea22f50d2dd44e8d1d81f5930263f364030e1..68847ae7a2cb926edd9d29007e24b0db7fb5a75f 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
+#include "tensorflow/compiler/tf2xla/kernels/shape_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
@@ -22,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/no_op.h"
 
 namespace tensorflow {
@@ -121,5 +123,26 @@ class ResourceGatherOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("ResourceGather").TypeConstraint("dtype", kNumericTypes),
                 ResourceGatherOp);
 
+class VariableShapeOp : public XlaOpKernel {
+ public:
+  explicit VariableShapeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType variable_dtype;
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx,
+                   ctx->GetVariableTypeAndShape(0, &variable_dtype, &shape));
+    Tensor shape_constant(out_dtype_, TensorShape({shape.dims()}));
+    OP_REQUIRES_OK(ctx, TensorShapeToConstant(shape, &shape_constant));
+    ctx->SetConstantOutput(0, shape_constant);
+  }
+
+ private:
+  DataType out_dtype_;
+};
+
+REGISTER_XLA_OP(Name("VariableShape"), VariableShapeOp);
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..21ad21f73737a289390ed1ea767db1078d05b466
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -0,0 +1,120 @@
+# Utilities for building XLA computations.
+
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = ["//tensorflow/compiler/tf2xla:friends"],
+)
+
+# Filegroup used to collect source files for dependency checking.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
+
+cc_library(
+    name = "batch_dot",
+    srcs = ["batch_dot.cc"],
+    hdrs = ["batch_dot.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "cholesky",
+    srcs = ["cholesky.cc"],
+    hdrs = ["cholesky.h"],
+    deps = [
+        ":batch_dot",
+        ":triangular_solve",
+        ":util",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "triangular_solve",
+    srcs = ["triangular_solve.cc"],
+    hdrs = ["triangular_solve.h"],
+    deps = [
+        ":batch_dot",
+        ":util",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    deps = [
+        ":triangular_solve",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "util",
+    srcs = ["util.cc"],
+    hdrs = ["util.h"],
+    deps = [
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+# -----------------------------------------------------------------------------
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b0e6174475c22e325c090bec5f1d56822e106bc
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+// The current implementation simply unrolls the computation along the batch
+// dimension.
+xla::StatusOr<xla::ComputationDataHandle> BatchDot(
+    xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> x_shape,
+                      builder->GetShape(x));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> y_shape,
+                      builder->GetShape(y));
+
+  // Check that both tensors have the same number of dimensions. There must be
+  // at least two (the batch dimensions can be empty).
+  if (xla::ShapeUtil::Rank(*x_shape) != xla::ShapeUtil::Rank(*y_shape)) {
+    return errors::InvalidArgument(
+        "Arguments to BatchedDot have different ranks: ",
+        xla::ShapeUtil::HumanString(*x_shape), " vs. ",
+        xla::ShapeUtil::HumanString(*y_shape));
+  }
+  const int ndims = xla::ShapeUtil::Rank(*x_shape);
+  if (ndims < 2) {
+    return errors::InvalidArgument(
+        "Arguments to BatchedDot must have rank >= 2: ", ndims);
+  }
+
+  // The batch dimensions must be equal and the matrix dimensions must be
+  // valid.
+  std::vector<int64> batch_dimension_numbers;
+  for (int i = 0; i < ndims - 2; ++i) {
+    if (x_shape->dimensions(i) != y_shape->dimensions(i)) {
+      return errors::InvalidArgument(
+          "Dimension ", i, " of inputs to BatchedDot must be equal: ",
+          xla::ShapeUtil::HumanString(*x_shape), " vs ",
+          xla::ShapeUtil::HumanString(*y_shape));
+    }
+    batch_dimension_numbers.push_back(i);
+  }
+
+  int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
+  int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
+  if (x_shape->dimensions(x_inner_dim) != y_shape->dimensions(y_inner_dim)) {
+    return errors::InvalidArgument(
+        "Dimensions ", x_inner_dim, " and ", y_inner_dim,
+        " of arguments to BatchedDot must be equal: ",
+        xla::ShapeUtil::HumanString(*x_shape), " transpose: ", transpose_x,
+        " vs. ", xla::ShapeUtil::HumanString(*y_shape),
+        " transpose: ", transpose_y);
+  }
+
+  // Check for zero lhs/rhs dim size.
+  if (xla::ShapeUtil::HasZeroElements(*x_shape) ||
+      xla::ShapeUtil::HasZeroElements(*y_shape)) {
+    std::vector<int64> dimensions(batch_dimension_numbers.size());
+    for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
+      dimensions[i] = x_shape->dimensions(batch_dimension_numbers[i]);
+    }
+    int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
+    int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
+    dimensions.push_back(x_shape->dimensions(x_outer_dim));
+    dimensions.push_back(y_shape->dimensions(y_outer_dim));
+    return builder->Broadcast(
+        builder->ConstantLiteral(xla::Literal::Zero(x_shape->element_type())),
+        dimensions);
+  }
+
+  if (x_shape->element_type() == xla::C64 && transpose_x) {
+    x = builder->Conj(x);
+  }
+  if (y_shape->element_type() == xla::C64 && transpose_y) {
+    y = builder->Conj(y);
+  }
+
+  // If there are no batch dimensions, use a regular Dot.
+  // TODO(b/69062148) Remove this code when Dot emitters can be passed
+  // dimensions to transpose directly (i.e. without requiring a Transpose HLO).
+  if (batch_dimension_numbers.empty()) {
+    auto lhs = transpose_x ? builder->Transpose(x, {1, 0}) : x;
+    auto rhs = transpose_y ? builder->Transpose(y, {1, 0}) : y;
+    return builder->Dot(lhs, rhs);
+  }
+
+  xla::DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
+  dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
+  for (auto batch_dimension_number : batch_dimension_numbers) {
+    dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
+    dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+  }
+  return builder->DotGeneral(x, y, dot_dnums);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
new file mode 100644
index 0000000000000000000000000000000000000000..b46bc7417d29dc5b7e9649ac28cc78b57d4b619c
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+
+namespace tensorflow {
+
+// Multiplies slices of two tensors in batches.
+
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be transposed before multiplication by
+// setting the `transpose_x` or `transpose_y` flag to `true`.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if transpose_x else r_x
+//     c_o = r_y if transpose_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+// TODO(phawkins): add an option to take the complex conjugate of the LHS or
+// RHS.
+xla::StatusOr<xla::ComputationDataHandle> BatchDot(
+    xla::ComputationBuilder* builder, xla::ComputationDataHandle x,
+    xla::ComputationDataHandle y, bool transpose_x, bool transpose_y);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cc489adf6042acb3f56b3a0a6c8fbe43bde629
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -0,0 +1,166 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/cholesky.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace {
+
+// def cholesky_unblocked(a):
+//   assert len(a.shape) == 2 and a.shape[-2] == a.shape[-1]
+//   n = a.shape[-2]
+//   l = np.zeros_like(a)
+//   for j in xrange(n):
+//     r = l[..., j, :j]
+//     l[..., j, j] = np.sqrt(a[..., j, j] - np.dot(r, r))
+//     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j],
+//         np.transpose(r))) / l[..., j, j]
+//   return l
+xla::StatusOr<xla::ComputationDataHandle> CholeskyUnblocked(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(a));
+  xla::ComputationDataHandle l = Zeros(builder, *shape);
+  const int64 n = xla::ShapeUtil::GetDimension(*shape, -2);
+  for (int j = 0; j < n; ++j) {
+    // Picture of block structure:
+    // ...   \
+    //        \
+    // -- r -- d
+    //         |\
+    //    B    c \
+    //         |  \
+    //         |  ...
+    //
+    //         ^
+    //      column j
+    TF_ASSIGN_OR_RETURN(auto d,
+                        SliceInMinorDims(builder, a, {j, j}, {j + 1, j + 1}));
+    TF_ASSIGN_OR_RETURN(auto c,
+                        SliceInMinorDims(builder, a, {j + 1, j}, {n, j + 1}));
+    xla::ComputationDataHandle new_d_squared = d;
+    xla::ComputationDataHandle br;
+    if (j > 0) {
+      TF_ASSIGN_OR_RETURN(auto r,
+                          SliceInMinorDims(builder, l, {j, 0}, {j + 1, j}));
+      TF_ASSIGN_OR_RETURN(auto b,
+                          SliceInMinorDims(builder, l, {j + 1, 0}, {n, j}));
+      TF_ASSIGN_OR_RETURN(auto r_squared,
+                          BatchDot(builder, r, r, /*transpose_x=*/false,
+                                   /*transpose_y=*/true));
+      new_d_squared = builder->Sub(new_d_squared, r_squared);
+
+      TF_ASSIGN_OR_RETURN(br, BatchDot(builder, b, r, /*transpose_x=*/false,
+                                       /*transpose_y=*/true));
+    }
+    auto new_d_inv = builder->Pow(
+        new_d_squared, FloatLiteral(builder, shape->element_type(), -0.5));
+    auto new_d = builder->Mul(new_d_inv, new_d_squared);
+    TF_ASSIGN_OR_RETURN(l, UpdateSliceInMinorDims(builder, l, new_d, {j, j}));
+
+    if (j > 0) {
+      c = builder->Sub(c, br);
+    }
+    auto new_c = builder->Mul(c, new_d_inv);
+    TF_ASSIGN_OR_RETURN(l,
+                        UpdateSliceInMinorDims(builder, l, new_c, {j + 1, j}));
+  }
+  return l;
+}
+
+}  // namespace
+
+xla::StatusOr<xla::ComputationDataHandle> Cholesky(
+    xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
+    int64 block_size) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  const int ndims = xla::ShapeUtil::Rank(*a_shape);
+  if (ndims < 2) {
+    return errors::InvalidArgument(
+        "Arguments to Cholesky must have rank >= 2: ", ndims);
+  }
+
+  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
+  if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) {
+    return errors::InvalidArgument(
+        "Arguments to Cholesky must be square matrices: ",
+        xla::ShapeUtil::HumanString(*a_shape));
+  }
+
+  if (block_size < 1) {
+    return errors::InvalidArgument(
+        "block_size argument to Cholesky must be >= 1; got ", block_size);
+  }
+
+  // Blocked left-looking Cholesky factorization.
+  // Algorithm 1 from
+  // Haidar, Azzam, et al. "High-performance Cholesky factorization for GPU-only
+  // execution." Proceedings of General Purpose GPUs. ACM, 2017.
+  xla::ComputationDataHandle l = Zeros(builder, *a_shape);
+  for (int64 i = 0; i < n; i += block_size) {
+    int64 k = std::min(block_size, n - i);
+    if (i > 0) {
+      // TODO(phawkins): consider implementing SYRK for the diagonal part of
+      // the panel.
+      // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i]))
+      TF_ASSIGN_OR_RETURN(auto lhs,
+                          SliceInMinorDims(builder, l, {i, 0}, {n, i}));
+      TF_ASSIGN_OR_RETURN(auto rhs,
+                          SliceInMinorDims(builder, l, {i, 0}, {i + k, i}));
+      TF_ASSIGN_OR_RETURN(auto delta,
+                          BatchDot(builder, lhs, rhs, /*transpose_x=*/false,
+                                   /*transpose_y=*/true));
+      TF_ASSIGN_OR_RETURN(auto before,
+                          SliceInMinorDims(builder, a, {i, i}, {n, i + k}));
+      TF_ASSIGN_OR_RETURN(
+          a, UpdateSliceInMinorDims(builder, a, builder->Sub(before, delta),
+                                    {i, i}));
+    }
+
+    // l[i:i+k, i:i+k] = cholesky_unblocked(a[i:i+k, i:i+k])
+    TF_ASSIGN_OR_RETURN(auto x,
+                        SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+    TF_ASSIGN_OR_RETURN(auto factorized, CholeskyUnblocked(builder, x));
+    TF_ASSIGN_OR_RETURN(l,
+                        UpdateSliceInMinorDims(builder, l, factorized, {i, i}));
+
+    if (i + k < n) {
+      // l[i+k:, i:i+k] = trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k])
+      TF_ASSIGN_OR_RETURN(auto panel,
+                          SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
+      TF_ASSIGN_OR_RETURN(auto update,
+                          TriangularSolve(builder, factorized, panel,
+                                          /*block_size=*/8));
+      TF_ASSIGN_OR_RETURN(
+          l, UpdateSliceInMinorDims(builder, l, update, {i + k, i}));
+    }
+  }
+  return l;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bead7359baaf3582c1230adf0cd4a90046859d2
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+
+namespace tensorflow {
+
+// Computes the Cholesky decompositions of a batch of symmetric positive
+// definite matrices.
+// `a` must be a (batched) square matrix; i.e., it must have rank >= 2 with the
+// two minor dimensions equal.
+// The algorithm implements a blocked Cholesky decomposition; `block_size` is
+// the block size to use.
+// TODO(phawkins): check for negative values on the diagonal and return an
+// error, instead of silently yielding NaNs.
+xla::StatusOr<xla::ComputationDataHandle> Cholesky(
+    xla::ComputationBuilder* builder, xla::ComputationDataHandle a,
+    int64 block_size = 256);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
new file mode 100644
index 0000000000000000000000000000000000000000..579944c3a381e7018b7fee5013d0509158ce21cc
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -0,0 +1,175 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    xla::ComputationDataHandle b, int64 block_size) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> a_shape,
+                      builder->GetShape(a));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> b_shape,
+                      builder->GetShape(b));
+  if (xla::ShapeUtil::Rank(*a_shape) != xla::ShapeUtil::Rank(*b_shape)) {
+    return errors::InvalidArgument(
+        "Arguments to TriangularSolve have different ranks: ",
+        xla::ShapeUtil::HumanString(*a_shape), " vs. ",
+        xla::ShapeUtil::HumanString(*b_shape));
+  }
+  const int ndims = xla::ShapeUtil::Rank(*a_shape);
+  if (ndims < 2) {
+    return errors::InvalidArgument(
+        "Arguments to TriangularSolve must have rank >= 2: ", ndims);
+  }
+  // The batch dimensions must be equal.
+  std::vector<int64> batch_dimensions;
+  for (int i = 0; i < ndims - 2; ++i) {
+    int64 a_size = a_shape->dimensions(i);
+    int64 b_size = b_shape->dimensions(i);
+    if (a_size != b_size) {
+      return errors::InvalidArgument(
+          "Batch dimensions of arguments to TriangularSolve must be equal: ",
+          xla::ShapeUtil::HumanString(*a_shape), " vs ",
+          xla::ShapeUtil::HumanString(*b_shape));
+    }
+    batch_dimensions.push_back(a_size);
+  }
+
+  const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1);
+  const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2);
+  if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) {
+    return errors::InvalidArgument(
+        "The 'a' arguments to TriangularSolve must be square matrices: ",
+        xla::ShapeUtil::HumanString(*a_shape));
+  }
+  if (n != xla::ShapeUtil::GetDimension(*b_shape, -1)) {
+    return errors::InvalidArgument(
+        "Arguments to TriangularSolve have incompatible matrix shapes: ",
+        xla::ShapeUtil::HumanString(*a_shape), " vs ",
+        xla::ShapeUtil::HumanString(*b_shape));
+  }
+
+  if (block_size < 1) {
+    return errors::InvalidArgument(
+        "block_size argument to TriangularSolve must be >= 1; got ",
+        block_size);
+  }
+
+  // Returns [b1, b2, ... , bn, indices[0], indices[1]].
+  auto prepend_batch_dims = [&](std::array<int64, 2> indices) {
+    std::vector<int64> output(ndims);
+    std::copy(batch_dimensions.begin(), batch_dimensions.end(), output.begin());
+    std::copy(indices.begin(), indices.end(),
+              output.begin() + batch_dimensions.size());
+    return output;
+  };
+
+  std::map<int, xla::Computation> base_computations;
+  auto get_base_triangular_solve =
+      [&](int k) -> xla::StatusOr<xla::Computation*> {
+    xla::Computation& computation = base_computations[k];
+    if (computation.IsNull()) {
+      std::unique_ptr<xla::ComputationBuilder> sub = builder->CreateSubBuilder(
+          tensorflow::strings::StrCat("trsm_base_", k));
+
+      auto a_param =
+          sub->Parameter(0,
+                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
+                                                   prepend_batch_dims({k, k})),
+                         "a");
+
+      auto b_param =
+          sub->Parameter(1,
+                         xla::ShapeUtil::MakeShape(b_shape->element_type(),
+                                                   prepend_batch_dims({m, k})),
+                         "b");
+
+      // TODO(phawkins): it might make sense to use a while loop here, rather
+      // than unrolling.
+      // TODO(phawkins): the left-looking variant of the algorithm might be more
+      // efficient at block size 1.
+      TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param,
+                                         /*block_size=*/1)
+                             .status());
+
+      TF_ASSIGN_OR_RETURN(computation, sub->Build());
+    }
+    return &computation;
+  };
+
+  xla::ComputationDataHandle output = Zeros(builder, *b_shape);
+
+  // Right-looking blocked triangular solve.
+  // For an explanation of the algorithm, see the TRSM discussion in:
+  // Goto, Kazushige, and Robert Van De Geijn. "High-performance implementation
+  // of the level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1
+  // (2008): 4.
+  for (int64 i = 0; i < n; i += block_size) {
+    int64 k = std::min(block_size, n - i);
+
+    // if k > 1:
+    //   output[..., :, i:i+k] = triangular_solve(
+    //       a[..., i:i+k, ..., i:i+k], b[..., :, i:i+k], side='Right',
+    //       kind='Lower', transpose=True, block_size=1)
+    // else:
+    //   output[..., :, i] = b[..., :, i] / a[..., i, i]
+    TF_ASSIGN_OR_RETURN(auto a_slice,
+                        SliceInMinorDims(builder, a, {i, i}, {i + k, i + k}));
+    TF_ASSIGN_OR_RETURN(auto b_slice,
+                        SliceInMinorDims(builder, b, {0, i}, {m, i + k}));
+    xla::ComputationDataHandle update;
+    if (k > 1) {
+      TF_ASSIGN_OR_RETURN(xla::Computation * solve,
+                          get_base_triangular_solve(k));
+      update = builder->Call(*solve, {a_slice, b_slice});
+    } else {
+      update = builder->Div(b_slice, a_slice);
+    }
+
+    TF_ASSIGN_OR_RETURN(
+        output, UpdateSliceInMinorDims(builder, output, update, {0, i}));
+    // b[..., :, i+k:] -= np.dot(output[..., :, i:i+k],
+    //                           np.transpose(..., a[i+k:, i:i+k]))
+    if (i + k < n) {
+      TF_ASSIGN_OR_RETURN(auto a_slice_2,
+                          SliceInMinorDims(builder, a, {i + k, i}, {n, i + k}));
+      TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(builder, update, a_slice_2,
+                                                  /*transpose_x=*/false,
+                                                  /*transpose_y=*/true));
+
+      TF_ASSIGN_OR_RETURN(auto b_slice_2,
+                          SliceInMinorDims(builder, b, {0, i + k}, {m, n}));
+      b_update = builder->Sub(b_slice_2, b_update);
+      TF_ASSIGN_OR_RETURN(
+          b, UpdateSliceInMinorDims(builder, b, b_update, {0, i + k}));
+    }
+  }
+  return output;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
new file mode 100644
index 0000000000000000000000000000000000000000..501d026411c80359c7efa406ece5929a2e46ac1f
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+
+namespace tensorflow {
+
+// Solves systems of linear equations with upper or lower triangular matrices by
+// backsubstitution.
+//
+// `a` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. The strictly upper triangular part of each inner-most matrix
+// is assumed to be zero and not accessed.
+// `b` is a tensor of shape `[..., M, K]`.
+//
+// The innermost matrices in the output satisfy matrix equations
+// `output[..., i, j] * adjoint(a[..., k, j]) = b[..., i, k]`.
+//
+// Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
+// blocking is used.
+// TODO(phawkins): equivalent to the BLAS TRSM routine with side=right,
+// kind=lower, and transposed_a=true. Implement the other possible combinations
+// of side, kind and transposed_a.
+xla::StatusOr<xla::ComputationDataHandle> TriangularSolve(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a,
+    xla::ComputationDataHandle b, int64 block_size = 256);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..671d9aa4fe0c042a3cc44468074653d51c2be75d
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
@@ -0,0 +1,69 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+using TriangularSolveTest = xla::ClientLibraryTestBase;
+
+XLA_TEST_F(TriangularSolveTest, Simple) {
+  xla::ComputationBuilder builder(client_, TestName());
+
+  xla::Array2D<float> a_vals({
+      {2, 0, 0, 0},
+      {3, 6, 0, 0},
+      {4, 7, 9, 0},
+      {5, 8, 10, 11},
+  });
+  xla::Array2D<float> b_vals({
+      {1, 2, 3, 4},
+      {5, 6, 7, 8},
+      {9, 10, 11, 12},
+  });
+
+  xla::ComputationDataHandle a, b;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(b_vals, 1, "b", &builder, &b);
+  auto result = TriangularSolve(&builder, a, b, /*block_size=*/2);
+  TF_ASSERT_OK(result.status());
+
+  xla::Array2D<float> expected({
+      {0.5, 0.08333334, 0.04629629, 0.03367003},
+      {2.5, -0.25, -0.1388889, -0.1010101},
+      {4.5, -0.58333331, -0.32407406, -0.23569024},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(2e-3, 2e-3));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..943248aedbdce5e81baa341fdab82fea9a48302d
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -0,0 +1,110 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
+                                 xla::Shape& shape) {
+  return builder->Broadcast(
+      builder->ConstantLiteral(xla::Literal::Zero(shape.element_type())),
+      xla::AsInt64Slice(shape.dimensions()));
+}
+
+xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
+                                        xla::PrimitiveType type, double value) {
+  switch (type) {
+    case xla::F16:
+      return builder->ConstantR0<xla::half>(static_cast<xla::half>(value));
+      break;
+    case xla::BF16:
+      return builder->ConstantR0<bfloat16>(static_cast<bfloat16>(value));
+      break;
+    case xla::F32:
+      return builder->ConstantR0<float>(static_cast<float>(value));
+      break;
+    case xla::F64:
+      return builder->ConstantR0<double>(value);
+      break;
+    case xla::C64:
+      return builder->ConstantR0<xla::complex64>(value);
+      break;
+    default:
+      LOG(FATAL) << "unhandled element type " << type;
+  }
+}
+
+xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    gtl::ArraySlice<int64> start, gtl::ArraySlice<int64> end) {
+  TF_RET_CHECK(start.size() == end.size());
+  int64 n_minor_dims = start.size();
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  TF_RET_CHECK(n_minor_dims <= n_dims);
+  gtl::ArraySlice<int64> major_dims(xla::AsInt64Slice(shape->dimensions()),
+                                    /*pos=*/0,
+                                    /*len=*/n_dims - n_minor_dims);
+
+  // Prepends 0s in the major dim
+  std::vector<int64> padded_start(n_dims, 0);
+  std::copy(start.begin(), start.end(),
+            padded_start.begin() + major_dims.size());
+
+  // Prepends the shape of the major dims.
+  std::vector<int64> padded_end(n_dims);
+  std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
+  std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
+
+  std::vector<int64> strides(n_dims, 1);
+  return builder->Slice(x, padded_start, padded_end, strides);
+}
+
+xla::StatusOr<xla::ComputationDataHandle> UpdateSlice(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start) {
+  // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
+  std::vector<int32> start_as_int32(start.begin(), start.end());
+  return builder->DynamicUpdateSlice(
+      x, update, builder->ConstantR1<int32>(start_as_int32));
+}
+
+xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Shape> shape, builder->GetShape(x));
+  const int64 n_dims = xla::ShapeUtil::Rank(*shape);
+  const int64 n_minor_dims = start.size();
+  TF_RET_CHECK(n_minor_dims <= n_dims);
+  std::vector<int64> padded_start(n_dims, 0);
+  std::copy(start.begin(), start.end(),
+            padded_start.begin() + (n_dims - n_minor_dims));
+  return UpdateSlice(builder, x, update, padded_start);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fba6b5cf247e9b2c26533c53ece8b0d7d4f4c36
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
+
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tensorflow {
+
+// Returns a zero-filled tensor with shape `shape`.
+xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder,
+                                 xla::Shape& shape);
+
+// Returns a floating point scalar constant of 'type' with 'value'.
+// If 'type' is complex, returns a real value with zero imaginary component.
+xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder,
+                                        xla::PrimitiveType type, double value);
+
+// Performs a slice in the minor dimensions of a Tensor.
+xla::StatusOr<xla::ComputationDataHandle> SliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    gtl::ArraySlice<int64> start, gtl::ArraySlice<int64> end);
+
+// Updates a slice of 'x', i.e.,
+// x[start[0], ..., start[n]] = update
+xla::StatusOr<xla::ComputationDataHandle> UpdateSlice(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
+
+// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
+// x[..., start[0], ..., start[n]] = update
+xla::StatusOr<xla::ComputationDataHandle> UpdateSliceInMinorDims(
+    xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x,
+    const xla::ComputationDataHandle& update, gtl::ArraySlice<int64> start);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b08a7583cb5ab7efa30a1fa27b973d04992584a7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -0,0 +1,111 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace {
+const char kDeviceSuffixReplicatedCore[] = "REPLICATED_CORE";
+const char kShardingAttribute[] = "_XlaSharding";
+}  // namespace
+
+namespace {
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+GetShardingFromNodeDef(const NodeDef& node_def) {
+  if (!HasNodeAttr(node_def, kShardingAttribute)) {
+    return tensorflow::gtl::optional<xla::OpSharding>();
+  }
+  string value;
+  xla::OpSharding sharding;
+  TF_RETURN_IF_ERROR(GetNodeAttr(node_def, kShardingAttribute, &value));
+  if (!sharding.ParseFromString(value)) {
+    return xla::InvalidArgument(
+        "Experimental _XlaSharding attribute was not a valid encoded "
+        "xla::OpSharding proto.");
+  }
+  return tensorflow::gtl::optional<xla::OpSharding>(sharding);
+}
+
+Status CoreOutOfRangeError(int core, int num_cores_per_replica) {
+  return errors::InvalidArgument(
+      "Invalid replicated core id: ", core,
+      "; num_cores_per_replica=", num_cores_per_replica);
+}
+}  // namespace
+
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(
+    const string& device_name, int num_cores_per_replica,
+    tensorflow::gtl::optional<xla::OpSharding> explicit_sharding) {
+  if (device_name.empty()) {
+    return tensorflow::gtl::optional<xla::OpSharding>();
+  }
+  DeviceNameUtils::ParsedName parsed_device;
+  if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) {
+    return errors::InvalidArgument("Malformed assigned device '", device_name,
+                                   "'");
+  }
+
+  if (explicit_sharding.has_value()) {
+    return explicit_sharding;
+  } else if (!parsed_device.has_type || !parsed_device.has_id ||
+             !StringPiece(parsed_device.type)
+                  .contains(kDeviceSuffixReplicatedCore)) {
+    return tensorflow::gtl::optional<xla::OpSharding>();
+  } else {
+    const int core = parsed_device.id;
+    if (core < 0 || core >= num_cores_per_replica) {
+      return CoreOutOfRangeError(core, num_cores_per_replica);
+    }
+    return tensorflow::gtl::optional<xla::OpSharding>(
+        xla::ShardingBuilder::AssignDevice(core));
+  }
+}
+
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica) {
+  const string& device_name = node_def.device();
+  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+                      GetShardingFromNodeDef(node_def));
+  return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
+}
+
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const Node& node, int num_cores_per_replica) {
+  string device_name = node.assigned_device_name();
+  if (device_name.empty()) {
+    device_name = node.requested_device();
+  }
+  TF_ASSIGN_OR_RETURN(tensorflow::gtl::optional<xla::OpSharding> sharding,
+                      GetShardingFromNodeDef(node.def()));
+  return ParseShardingFromDevice(device_name, num_cores_per_replica, sharding);
+}
+
+void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst) {
+  string device_name = src.assigned_device_name();
+  if (device_name.empty()) {
+    device_name = src.requested_device();
+  }
+  dst->set_assigned_device_name(device_name);
+  if (const AttrValue* attr = src.attrs().Find(kShardingAttribute)) {
+    dst->AddAttr(kShardingAttribute, *attr);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/sharding_util.h b/tensorflow/compiler/tf2xla/sharding_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e430e30a1247c7d01910b6d57f7c577964e1dd1
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/sharding_util.h
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_
+
+#include <string>
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Parses the op sharding from the 'replicated core' device_name <device_name>.
+// Returns an error:
+// - if the device name is invalid.
+// - the core is parsed and is out of the range [0, num_cores_per_replica).
+//
+// Otherwise, returns either:
+// - explicit_sharding if explicit_sharding.has_value()
+// - a non-value if there is no assigned core or
+// - a sharding set as per xla::ShardingBuilder::AssignDevice.
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const string& device_name, int num_cores_per_replica,
+                        tensorflow::gtl::optional<xla::OpSharding>
+                            explicit_sharding = tensorflow::gtl::nullopt);
+
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const Node& node, int num_cores_per_replica);
+
+xla::StatusOr<tensorflow::gtl::optional<xla::OpSharding>>
+ParseShardingFromDevice(const NodeDef& node_def, int num_cores_per_replica);
+
+void SetShardingDeviceAssignmentFromNode(const Node& src, Node* dst);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TPU_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/sharding_util_test.cc b/tensorflow/compiler/tf2xla/sharding_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bff5978237a827cb9650541f2cf6984d9e846796
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/sharding_util_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(CoreUtilTest, ParseShardingFromDevice) {
+  Graph graph(OpRegistry::Global());
+
+  auto core_from_sharding =
+      [](tensorflow::gtl::optional<xla::OpSharding> sharding) -> int64 {
+    if (sharding.has_value() &&
+        sharding.value().type() ==
+            xla::OpSharding::Type::OpSharding_Type_MAXIMAL) {
+      return sharding.value().tile_assignment_devices(0);
+    } else {
+      return -1;
+    }
+  };
+
+  auto parse_status = ParseShardingFromDevice("", 1);
+  TF_EXPECT_OK(parse_status.status());
+  EXPECT_EQ(-1, core_from_sharding(parse_status.ValueOrDie()));
+  parse_status = ParseShardingFromDevice("", 100);
+  TF_EXPECT_OK(parse_status.status());
+  EXPECT_EQ(-1, core_from_sharding(parse_status.ValueOrDie()));
+
+  parse_status = ParseShardingFromDevice("/device:A_REPLICATED_CORE:-1", 100);
+  EXPECT_FALSE(parse_status.ok());
+
+  parse_status = ParseShardingFromDevice("/device:A_REPLICATED_CORE:55", 100);
+  TF_EXPECT_OK(parse_status.status());
+  EXPECT_EQ(55, core_from_sharding(parse_status.ValueOrDie()));
+
+  parse_status = ParseShardingFromDevice("/device:A_REPLICATED_CORE:100", 100);
+  EXPECT_FALSE(parse_status.ok());
+
+  parse_status = ParseShardingFromDevice("/cpu:0", 100);
+  TF_EXPECT_OK(parse_status.status());
+  EXPECT_EQ(-1, core_from_sharding(parse_status.ValueOrDie()));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index a14c93a2b9494b89f579bc20ee0510c136f8f01b..906f2290433face4cce3296b2f815d50d8c496ce 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -253,8 +253,7 @@ Status CreateXlaArgs(const Graph& graph,
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
 Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
-                         xla::Computation* computation,
-                         bool* requires_runtime_context) {
+                         xla::Computation* computation) {
   XlaOpRegistry::RegisterCompilationKernels();
   for (Node* node : graph->nodes()) {
     node->set_assigned_device_name(
@@ -277,7 +276,6 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   TF_RETURN_IF_ERROR(compiler.CompileGraph(XlaCompiler::CompileOptions(),
                                            "tfcompile", std::move(graph),
                                            xla_args, &result));
-  *requires_runtime_context = result.requires_runtime_context;
   *computation = std::move(*result.computation);
 
   int num_const_results = 0;
@@ -352,12 +350,10 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config,
 
 Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             const tf2xla::Config& config, xla::Client* client,
-                            xla::Computation* computation,
-                            bool* requires_runtime_context) {
+                            xla::Computation* computation) {
   std::unique_ptr<Graph> graph;
   TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph));
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation,
-                                       requires_runtime_context));
+  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h
index ab99beebf7946237425d4d304a858ac6817177b8..473c431b12d441c652f1d0d6c11c5e87836ab36d 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.h
+++ b/tensorflow/compiler/tf2xla/tf2xla.h
@@ -30,13 +30,9 @@ namespace tensorflow {
 //
 // The computation is built in the context of the given `client`, which may
 // subsequently be used to compile or execute the computation.
-//
-// If `requires_runtime_context` is filled with true, this indicates the last
-// argument of the computation is XlaLocalRuntimeContext*.
 Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             const tf2xla::Config& config, xla::Client* client,
-                            xla::Computation* computation,
-                            bool* requires_runtime_context);
+                            xla::Computation* computation);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7aca889a266439538c4cd1c153460e6cc871b246
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/tf2xla_supported_ops.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace tf2xla {
+namespace {
+
+void PrintSupportedOps(const string& device, const string& regen_run) {
+  XlaOpRegistry::RegisterCompilationKernels();
+
+  std::vector<const KernelDef*> kdefs =
+      XlaOpRegistry::DeviceKernels(device,
+                                   /*include_compilation_only_kernels=*/true);
+  std::sort(
+      kdefs.begin(), kdefs.end(),
+      [](const KernelDef* a, const KernelDef* b) { return a->op() < b->op(); });
+
+  std::cout << "**Supported operators for device: " << device << "**\n\n"
+            << "Operator | Type Constraint\n"
+            << "-------- | ---------------" << std::endl;
+  for (const KernelDef* kdef : kdefs) {
+    std::vector<string> constraints;
+    for (const KernelDef::AttrConstraint& constraint : kdef->constraint()) {
+      std::vector<string> types;
+      for (int type : constraint.allowed_values().list().type()) {
+        types.push_back(DataTypeString(static_cast<DataType>(type)));
+      }
+      std::sort(types.begin(), types.end());
+      constraints.push_back("`" + constraint.name() + "={" +
+                            str_util::Join(types, ",") + "}`");
+    }
+    std::cout << "`" << kdef->op() << "` | "
+              << str_util::Join(constraints, "<br>") << std::endl;
+  }
+
+  std::cout << "\nTo regenerate this table, run:\n\n```shell\n"
+            << regen_run << " --device=" << device << "\n```" << std::endl;
+}
+
+}  // namespace
+
+void SupportedOpsMain(int argc, char** argv, const char* regen_run) {
+  std::vector<string> device_names = XlaOpRegistry::BackendNames();
+  std::sort(device_names.begin(), device_names.end());
+
+  // Set up and parse flags.
+  string device;
+  std::vector<Flag> flag_list = {
+      {"device", &device,
+       "Name of the compilation device for which to print supported ops, "
+       "one of: " +
+           str_util::Join(device_names, ",")},
+  };
+  string usage = Flags::Usage(argv[0], flag_list);
+  bool parsed_flags_ok = Flags::Parse(&argc, argv, flag_list);
+  QCHECK(parsed_flags_ok) << "\n" << usage;
+  QCHECK(XlaOpRegistry::IsBackendRegistered(device))
+      << "\nUnknown device: " << device << "\n"
+      << usage;
+
+  // Run the program.
+  port::InitMain(usage.c_str(), &argc, &argv);
+  QCHECK(argc == 1) << "\nERROR: This command does not take any arguments "
+                       "other than flags\n\n"
+                    << usage;
+  PrintSupportedOps(device, regen_run);
+}
+
+}  // namespace tf2xla
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b45fb4cdd3b0173b04e130b7416874a9a406dc5
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+#define TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
+
+namespace tensorflow {
+namespace tf2xla {
+
+// The implementation of a main function for a binary that prints a table of
+// supported tf2xla operators for a given device, along with their type
+// constraints, to stdout.
+//
+// Pass the argc and argv from main, unmodified.  Use regen_run to specify the
+// command used to regenerate the table.
+void SupportedOpsMain(int argc, char** argv, const char* regen_run);
+
+}  // namespace tf2xla
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_SUPPORTED_OPS_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc b/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..690666c2400d45e33c1a5d1818b68a86a70a5be3
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/tf2xla_supported_ops_main.cc
@@ -0,0 +1,22 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/tf2xla_supported_ops.h"
+
+int main(int argc, char** argv) {
+  const char* regen_run =
+      "bazel run -c opt -- tensorflow/compiler/tf2xla:tf2xla_supported_ops";
+  tensorflow::tf2xla::SupportedOpsMain(argc, argv, regen_run);
+}
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index 51ce17deb62117ff8c1075160d0bebe6cf1438f1..a9978e697b091715ce120f0d18fdddd259e08b32 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -70,10 +70,7 @@ TEST(ConvertGraphDefToXla, Sum) {
 
   xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie();
   xla::Computation computation;
-  bool requires_runtime_context;
-  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation,
-                                    &requires_runtime_context));
-  ASSERT_FALSE(requires_runtime_context);
+  TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation));
 
   // Set up arguments.
   auto x_literal = xla::Literal::CreateR0<int32>(10);
@@ -92,7 +89,7 @@ TEST(ConvertGraphDefToXla, Sum) {
       client->ExecuteAndTransfer(computation, {x_global.get(), y_global.get()});
   TF_EXPECT_OK(result_or.status());
   std::unique_ptr<xla::Literal> result = std::move(result_or.ValueOrDie());
-  EXPECT_EQ("(s32[]) (\n42,\n)", result->ToString());
+  EXPECT_EQ("(s32[]) (\n42\n)", result->ToString());
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 14e0910cab2c3aa329fe798d199454fd6c5ee6a5..55f2f3149c6ba7bfa18608f961c8a76103a50756 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -19,7 +19,9 @@ limitations under the License.
 #include <set>
 #include <unordered_map>
 
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -29,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -250,4 +253,32 @@ string TensorIdToString(const tf2xla::TensorId& id) {
   return strings::StrCat(id.node_name(), ":", id.output_index());
 }
 
+Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) {
+  int core = -1;
+  const Node* matching_node = nullptr;
+  for (const Edge* edge : (out_edges ? n->out_edges() : n->in_edges())) {
+    if (edge->IsControlEdge()) continue;
+    const Node* possible_match = out_edges ? edge->dst() : edge->src();
+    TF_ASSIGN_OR_RETURN(
+        tensorflow::gtl::optional<xla::OpSharding> sharding,
+        ParseShardingFromDevice(
+            *possible_match,
+            /*num_cores_per_replica=*/std::numeric_limits<int32>::max()));
+    if (sharding.has_value()) {
+      TF_RET_CHECK(sharding.value().type() ==
+                   xla::OpSharding::Type::OpSharding_Type_MAXIMAL);
+      const int core_annotation = sharding.value().tile_assignment_devices(0);
+      if (core == -1 || core > core_annotation) {
+        core = core_annotation;
+        matching_node = possible_match;
+      }
+    }
+  }
+  if (matching_node != nullptr) {
+    n->set_assigned_device_name(matching_node->assigned_device_name());
+    n->set_requested_device(matching_node->requested_device());
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.h b/tensorflow/compiler/tf2xla/tf2xla_util.h
index a29d0c16f9cfde3c97bfa9cf3165890f83939a43..e5fba8ede7745febbb42c572a7b52247213afc95 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.h
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -45,6 +46,11 @@ Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in,
 // Returns node:port for the given <id>.
 string TensorIdToString(const tf2xla::TensorId& id);
 
+// Updates the sharding of <n> based on the sharding of its neighbors.
+// If <out_edges> is true, outgoing edges from <n> are considered; else incoming
+// edges are considered.
+Status SetNodeShardingFromNeighbors(Node* n, bool out_edges);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_TF2XLA_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index b98c89f284d6a2bfc6d043794a580e60da93617f..436039e154842443f779aba276bc571fc2ab7537 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -15,7 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/data_flow_ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -211,5 +217,52 @@ TEST(PruneGraphDefInto, Basic) {
   EXPECT_EQ(def.DebugString(), copy.DebugString());
 }
 
+TEST(SetNodeShardingFromNeighbors, Basic) {
+  // Builds a graph that adds two Tensors.
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Arg(scope.WithOpName("B"), DT_INT32, 1);
+  auto c = ops::Add(scope.WithOpName("C"), a, b);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  Node* a_node = nullptr;
+  Node* b_node = nullptr;
+  Node* c_node = nullptr;
+  for (Node* n : graph->nodes()) {
+    if (n->name() == "A") a_node = n;
+    if (n->name() == "B") b_node = n;
+    if (n->name() == "C") c_node = n;
+  }
+
+  const int num_cores_per_replica = 4;
+
+  a_node->set_assigned_device_name("foo");
+  EXPECT_FALSE(SetNodeShardingFromNeighbors(c_node, /*out_edges=*/false).ok());
+
+  // Test where one input to c_node has a device.
+  a_node->set_assigned_device_name("/device:TPU_REPLICATED_CORE:2");
+  TF_ASSERT_OK(SetNodeShardingFromNeighbors(c_node, /*out_edges=*/false));
+  auto parse_status = ParseShardingFromDevice(*c_node, num_cores_per_replica);
+  TF_ASSERT_OK(parse_status.status());
+  ASSERT_TRUE(parse_status.ValueOrDie().has_value());
+  EXPECT_EQ(2, parse_status.ValueOrDie().value().tile_assignment_devices(0));
+
+  // Test where two inputs to c_node have a device.
+  b_node->set_assigned_device_name("/device:TPU_REPLICATED_CORE:1");
+  TF_ASSERT_OK(SetNodeShardingFromNeighbors(c_node, /*out_edges=*/false));
+  parse_status = ParseShardingFromDevice(*c_node, num_cores_per_replica);
+  TF_ASSERT_OK(parse_status.status());
+  ASSERT_TRUE(parse_status.ValueOrDie().has_value());
+  EXPECT_EQ(1, parse_status.ValueOrDie().value().tile_assignment_devices(0));
+
+  // Test setting based on out edges.
+  TF_ASSERT_OK(SetNodeShardingFromNeighbors(a_node, /*out_edges=*/true));
+  parse_status = ParseShardingFromDevice(*a_node, num_cores_per_replica);
+  TF_ASSERT_OK(parse_status.status());
+  ASSERT_TRUE(parse_status.ValueOrDie().has_value());
+  EXPECT_EQ(1, parse_status.ValueOrDie().value().tile_assignment_devices(0));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc
index 1efbe0ffb17dad5332aa700b2e255d4a99fbef72..c969212a1bfaa6cab0d896ee074cfd4e2b283ae4 100644
--- a/tensorflow/compiler/tf2xla/type_util.cc
+++ b/tensorflow/compiler/tf2xla/type_util.cc
@@ -49,6 +49,9 @@ Status DataTypeToPrimitiveType(DataType data_type, xla::PrimitiveType* type) {
     case tensorflow::DT_UINT64:
       *type = xla::U64;
       return Status::OK();
+    case tensorflow::DT_BFLOAT16:
+      *type = xla::BF16;
+      return Status::OK();
     case tensorflow::DT_HALF:
       *type = xla::F16;
       return Status::OK();
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index fc866a4c0a34712dc3906fb60c13a30909ecffd2..cc459dc87c00f19230c65341d53da213e07fe364 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/core/common_runtime/local_device.h"
@@ -77,7 +78,8 @@ XlaCompilationDevice::XlaCompilationDevice(const SessionOptions& options,
     : LocalDevice(
           options,
           Device::BuildDeviceAttributes(
-              "", type, Bytes(256 << 20), DeviceLocality(),
+              strings::StrCat("/device:", type.type(), ":0"), type,
+              Bytes(256 << 20), DeviceLocality(),
               strings::StrCat("device: XLA compilation device ", type.type()))),
       allocator_(new XlaCompilationAllocator()) {}
 
@@ -97,23 +99,19 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel,
   metadata.set_op_name(op_kernel->name());
   b->SetOpMetadata(metadata);
 
-  DeviceNameUtils::ParsedName parsed;
-  OP_REQUIRES(
-      context,
-      DeviceNameUtils::ParseFullName(op_kernel->requested_device(), &parsed),
-      errors::Internal("Unable to parse device name: ",
-                       op_kernel->requested_device()));
-  // If no device ID assignment is found, XLA is free to use whatever device it
-  // wants. In practice this usually has the effect of placing things on
-  // device 0.
-  if (parsed.has_id) {
-    b->SetSharding(xla::ShardingBuilder::AssignDevice(parsed.id));
-  }
+  auto sharding_parse_result = ParseShardingFromDevice(
+      op_kernel->def(), std::numeric_limits<int>::max());
+  OP_REQUIRES_OK(context, sharding_parse_result.status());
+  tensorflow::gtl::optional<xla::OpSharding> op_sharding =
+      sharding_parse_result.ValueOrDie();
 
+  // If no sharding metadata is found, XLA is free to use whatever device it
+  // wants. In practice this usually has the effect of placing things on device
+  // 0.
+  xla::ScopedShardingAssignment assign_sharding(b, op_sharding);
   op_kernel->Compute(context);
 
   b->ClearOpMetadata();
-  b->ClearSharding();
   VLOG(4) << "Done";
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
index b5c17c5273bb15e20184b2fefd93880d4828105e..79da701fd244a461a60588153b601d5c1870fa89 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc
@@ -28,9 +28,10 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       temps_(new void*[static_data.num_temps]),
       arg_names_(static_data.arg_names),
       result_names_(static_data.result_names),
-      program_shape_(static_data.program_shape) {
+      program_shape_(static_data.program_shape),
+      hlo_profile_printer_(static_data.hlo_profile_printer) {
   // Allocate arg and temp buffers.
-  if (alloc_mode == AllocMode::ARGS_RESULTS_AND_TEMPS) {
+  if (alloc_mode == AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS) {
     alloc_args_ = tensorflow::tfcompile::runtime::MallocContiguousBuffers(
         static_data.arg_sizes, static_data.num_args, args_,
         /*annotate_initialized=*/false);
@@ -39,9 +40,13 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data,
       static_data.temp_sizes, static_data.num_temps, temps_,
       /*annotate_initialized=*/true);
 
-  // The runtime context is always the last arg, if it is required.
-  if (static_data.requires_runtime_context) {
-    args_[static_data.num_args - 1] = &context_;
+  // If Hlo profiling is enabled the generated code expects an appropriately
+  // sized buffer to be passed in as the last argument.  If Hlo profiling is
+  // disabled the last function argument is still present in the function
+  // signature, but it is ignored by the generated code and we pass in null for
+  // it.
+  if (hlo_profiling_enabled()) {
+    profile_counters_ = new int64[static_data.profile_counters_size]();
   }
 }
 
@@ -50,6 +55,7 @@ XlaCompiledCpuFunction::~XlaCompiledCpuFunction() {
   tensorflow::tfcompile::runtime::FreeContiguous(alloc_temps_);
   delete[] args_;
   delete[] temps_;
+  delete[] profile_counters_;
 }
 
 namespace {
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index f49a7889222ff989144217ab10b27595f89e4311..e0ae3ed9a811bcc49ce8862037a67d293e879e57 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -16,10 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_COMPILED_CPU_FUNCTION_H_
 
-#include <functional>
+#include <cassert>
 #include <string>
 
-#include "tensorflow/compiler/tf2xla/xla_local_runtime_context.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -27,6 +26,7 @@ limitations under the License.
 // never use this functionality.
 namespace xla {
 class ProgramShape;
+class HloProfilePrinter;
 }
 
 namespace tensorflow {
@@ -48,12 +48,10 @@ namespace tensorflow {
 class XlaCompiledCpuFunction {
  public:
   // Type of the raw function, produced by either JIT or AOT.
-  //
-  // TODO(toddw): Add support for hlo profiling, and replace std::function with
-  // a raw function pointer, for some codesize savings.
-  using RawFunction = std::function<void(
-      void* result, const xla::ExecutableRunOptions* run_options,
-      const void** args, void** temps)>;
+  using RawFunction = void (*)(void* result,
+                               const xla::ExecutableRunOptions* run_options,
+                               const void** args, void** temps,
+                               int64* profile_counters);
 
   // StaticData represents the state necessary to run an XLA-compiled
   // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for
@@ -71,9 +69,6 @@ class XlaCompiledCpuFunction {
     // The 0-based index of the result tuple, in the temp buffers.
     size_t result_index = 0;
 
-    // Is the final arg XlaLocalRuntimeContext?
-    bool requires_runtime_context = false;
-
     // [Optional] Arrays of arg and result names. These are arrays of C-style
     // strings, where the array is terminated by nullptr.
     const char** arg_names = nullptr;
@@ -81,21 +76,29 @@ class XlaCompiledCpuFunction {
 
     // [Optional] Arg and result shapes.
     const xla::ProgramShape* program_shape = nullptr;
+
+    // [Optional] Profile printer.  Null if profiling is disabled.
+    const xla::HloProfilePrinter* hlo_profile_printer = nullptr;
+
+    // [Optional] The number of profile counters expected in the profile counter
+    // buffer by the generated code and hlo_profile_printer.  0 if profiling is
+    // disabled.
+    int64 profile_counters_size = 0;
   };
 
   // AllocMode controls the buffer allocation mode.
   enum class AllocMode {
-    // Allocate all buffers - args, results and temps.
-    ARGS_RESULTS_AND_TEMPS,
+    // Allocate all buffers - args, results, profile and temps.
+    ARGS_RESULTS_PROFILES_AND_TEMPS,
 
-    // Only allocate result and temp buffers.
+    // Only allocate result, profile and temp buffers.
     // Use set_arg_data to set argument buffers before Run is called.
-    RESULTS_AND_TEMPS_ONLY,
+    RESULTS_PROFILES_AND_TEMPS_ONLY,
   };
 
   XlaCompiledCpuFunction(
       const StaticData& static_data,
-      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_AND_TEMPS);
+      AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS);
   virtual ~XlaCompiledCpuFunction();
 
   XlaCompiledCpuFunction(const XlaCompiledCpuFunction&) = delete;
@@ -104,21 +107,22 @@ class XlaCompiledCpuFunction {
   // Sets the intra-op thread pool used to run individual ops concurrently.
   void set_thread_pool(const Eigen::ThreadPoolDevice* pool) {
     run_options_.set_intra_op_thread_pool(pool);
-    context_.thread_pool = pool;
   }
 
   // Runs the computation, with inputs read from arg buffers, and outputs
   // written to result buffers. Returns true on success and false on failure.
   bool Run() {
-    context_.error = false;
-    context_.error_msg.clear();
     raw_function_(temps_[result_index_], &run_options_,
-                  const_cast<const void**>(args_), temps_);
-    return !context_.error;
+                  const_cast<const void**>(args_), temps_, profile_counters_);
+    return true;
   }
 
   // Returns the error message from the previous failed Run call.
-  const string& error_msg() const { return context_.error_msg; }
+  //
+  // TODO(fschneider): For now this always returns an empty string because there
+  // is no support for error reporting in XLA. Remove this once all callers are
+  // updated.
+  string error_msg() const { return {}; }
 
   // ------------------------------
   // Arg methods for managing input buffers. Buffers are in row-major order.
@@ -141,10 +145,6 @@ class XlaCompiledCpuFunction {
   // tensorflow::tfcompile::runtime::kAlign. If possible, use the functions in
   // tensorflow/compiler/aot/runtime.h to ensure correct alignment.
   //
-  // If StaticData.requires_runtime_context==true, the final argument is an
-  // XlaLocalRuntimeContext, which is managed internally by this class, and
-  // should not be changed.
-  //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
   void set_arg_data(size_t index, void* data) { args_[index] = data; }
@@ -162,6 +162,16 @@ class XlaCompiledCpuFunction {
     return static_cast<const void* const*>(temps_[result_index_]);
   }
 
+  // Profile counters for this XLA computation.
+  //
+  // When Hlo profiling is enabled (`hlo_profiling_enabled()` return true in
+  // this case) these counters are non-null and are automatically populated by
+  // `Run`.  The counters can then be pretty-printed using
+  // `hlo_profile_printer()`.
+  //
+  // When Hlo profiling is disabled, this accessor returns null.
+  const int64* profile_counters() const { return profile_counters_; }
+
   // Returns the buffer for the positional result at the given `index`.
   void* result_data(size_t index) { return results()[index]; }
   const void* result_data(size_t index) const { return results()[index]; }
@@ -195,6 +205,12 @@ class XlaCompiledCpuFunction {
   // program shape isn't available.
   const xla::ProgramShape* ProgramShape() const { return program_shape_; }
 
+  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
+  const xla::HloProfilePrinter& hlo_profile_printer() const {
+    assert(hlo_profiling_enabled());
+    return *hlo_profile_printer_;
+  }
+
  private:
   const RawFunction raw_function_;
   const size_t result_index_;
@@ -208,14 +224,17 @@ class XlaCompiledCpuFunction {
   void* alloc_args_ = nullptr;
   void* alloc_temps_ = nullptr;
 
+  // Backing memory for profiling counters.
+  int64* profile_counters_ = nullptr;
+
   // Options and context passed to the compiled function.
   xla::ExecutableRunOptions run_options_;
-  tensorflow::XlaLocalRuntimeContext context_;
 
   // Optional metadata.
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
   const xla::ProgramShape* program_shape_ = nullptr;
+  const xla::HloProfilePrinter* hlo_profile_printer_ = nullptr;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index e49663b8b047fb5f2c9ba17fa0aa032a673e7ed7..4c01e6732128fbb62fb134ad7fa3233725f53ebb 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
 #include "tensorflow/compiler/tf2xla/graph_compiler.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/sharding_util.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -160,10 +162,10 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
   return graph;
 }
 
-Status XlaCompiler::CompileFunction(
-    const XlaCompiler::CompileOptions& options, const NameAttrList& function,
-    const std::vector<XlaCompiler::Argument>& args,
-    XlaCompiler::CompilationResult* result) {
+Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
+                                    const NameAttrList& function,
+                                    std::vector<XlaCompiler::Argument> args,
+                                    XlaCompiler::CompilationResult* result) {
   const string function_id =
       Canonicalize(function.name(), AttrSlice(&function.attr()));
   VLOG(1) << "XlaCompiler::CompileFunction " << function_id;
@@ -184,6 +186,25 @@ Status XlaCompiler::CompileFunction(
   std::unique_ptr<Graph> graph(new Graph(options_.flib_def));
   CopyGraph(*fbody->graph, graph.get());
 
+  // _Arg and _Retval nodes don't exist in the stored subgraph for the function;
+  // they are added by the function body looked up.  Therefore, they don't have
+  // core assignments here.
+  // Attempt to assign a core to each _Retval and _Arg. Chooses the
+  // lowest-numbered core that consumes the argument. We choose the
+  // lowest-numbered core so the assignment is deterministic.
+  for (Node* n : graph->nodes()) {
+    if (StringPiece(n->type_string()) == "_Arg") {
+      TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/true));
+    }
+  }
+  // Do _Retval as a second loop, in case the retval's input is an _Arg (which
+  // may have gotten a device assignment from the first loop).
+  for (Node* n : graph->nodes()) {
+    if (StringPiece(n->type_string()) == "_Retval") {
+      TF_RETURN_IF_ERROR(SetNodeShardingFromNeighbors(n, /*out_edges=*/false));
+    }
+  }
+
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "XlaCompiler::CompileFunction: "
             << dump_graph::DumpGraphToFile(
@@ -241,13 +262,15 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr<Graph> graph,
 
 // Builds XLA computations for each of the arguments to the computation.
 // `args` are the arguments to the computation.
-Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
+Status BuildArguments(const Graph& graph,
+                      const std::vector<XlaCompiler::Argument>& args,
                       bool use_tuple_arg, xla::ComputationBuilder* builder,
-                      XlaContext* context,
+                      XlaContext* context, std::vector<int>* arg_cores,
                       std::vector<XlaExpression>* arg_expressions,
                       std::vector<int>* input_mapping,
                       std::vector<xla::Shape>* input_shapes) {
   arg_expressions->resize(args.size());
+  *arg_cores = std::vector<int>(args.size(), -1);
 
   // Argument numbers of arguments and resources that are to be passed to the
   // XLA computation as runtime parameters.
@@ -302,6 +325,26 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
     (*input_mapping)[i] = parameters[i];
   }
 
+  // Use the _Arg nodes in the graph to resolve core assignments.
+  for (const Node* n : graph.nodes()) {
+    if (StringPiece(n->type_string()) != "_Arg") continue;
+    int index;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index));
+    TF_RET_CHECK(index >= 0 && index < args.size())
+        << "_Arg out of bounds: " << index << " vs " << args.size();
+    TF_ASSIGN_OR_RETURN(
+        auto sharding,
+        ParseShardingFromDevice(*n, std::numeric_limits<int32>::max()));
+    if (sharding.has_value()) {
+      TF_RET_CHECK(sharding.value().type() ==
+                   xla::OpSharding::Type::OpSharding_Type_MAXIMAL);
+      const int core = sharding.value().tile_assignment_devices(0);
+      if ((*arg_cores)[index] == -1 || core < (*arg_cores)[index]) {
+        (*arg_cores)[index] = core;
+      }
+    }
+  }
+
   // Build parameter handles for non-constant arguments.
   std::vector<xla::ComputationDataHandle> arg_handles(parameters.size());
   if (use_tuple_arg) {
@@ -309,10 +352,18 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
     xla::ComputationDataHandle tuple =
         builder->Parameter(0, tuple_shape, "arg_tuple");
     for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
+      const int core = (*arg_cores)[parameters[i]];
+      xla::ScopedShardingAssignment assign_sharding(
+          builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
+                              : xla::ShardingBuilder::AssignDevice(core));
       arg_handles[i] = builder->GetTupleElement(tuple, i);
     }
   } else {
     for (std::vector<int>::size_type i = 0; i < parameters.size(); ++i) {
+      const int core = (*arg_cores)[parameters[i]];
+      xla::ScopedShardingAssignment assign_sharding(
+          builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
+                              : xla::ShardingBuilder::AssignDevice(core));
       arg_handles[i] =
           builder->Parameter(i, (*input_shapes)[i], strings::StrCat("arg", i));
     }
@@ -368,6 +419,7 @@ Status BuildArguments(const std::vector<XlaCompiler::Argument>& args,
 // type of the final output.
 Status BuildComputation(
     const std::vector<XlaCompiler::Argument>& args,
+    const std::vector<int>& arg_cores,
     const std::vector<XlaExpression>& retvals,
     const std::vector<std::unique_ptr<XlaResource>>& resources,
     bool return_updated_values_for_all_resources,
@@ -398,6 +450,8 @@ Status BuildComputation(
 
   for (const XlaResource* resource : arg_resources) {
     const XlaCompiler::Argument& arg = args[resource->arg_num];
+    const int core = arg_cores[resource->arg_num];
+    DCHECK_LT(resource->arg_num, arg_cores.size());
     bool modified =
         resource->value.handle() != resource->initial_value.handle();
     // TensorArray gradients were modified if their values changed or there are
@@ -417,8 +471,21 @@ Status BuildComputation(
       for (const auto& grad : resource->tensor_array_gradients) {
         update.tensor_array_gradients_accessed.insert(grad.first);
       }
+
+      // Request that the value be returned on a specific core.
+      xla::ScopedShardingAssignment assign_sharding(
+          builder, core == -1 ? tensorflow::gtl::optional<xla::OpSharding>()
+                              : xla::ShardingBuilder::AssignDevice(core));
+
       xla::ComputationDataHandle handle;
       TF_RETURN_IF_ERROR(resource->Pack(&handle, builder));
+
+      // Since we can't change the sharding metadata of <value> as this point,
+      // create a tuple/get-tuple-element combination so that sharding
+      // assignment will be placed on this value, which will cause the resource
+      // update to be returned from the same device that provided the resource.
+      handle = builder->GetTupleElement(builder->Tuple({handle}), 0);
+
       elems.push_back(handle);
     }
   }
@@ -476,12 +543,11 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                      options.resolve_compile_time_constants);
   core::ScopedUnref context_unref(context);
 
-  result->tuple_arg = options.use_tuple_arg;
-
   std::vector<XlaExpression> arg_expressions;
+  std::vector<int> arg_cores;
   TF_RETURN_IF_ERROR(BuildArguments(
-      args, options.use_tuple_arg, &builder, context, &arg_expressions,
-      &result->input_mapping, &result->xla_input_shapes));
+      *graph, args, options.use_tuple_arg, &builder, context, &arg_cores,
+      &arg_expressions, &result->input_mapping, &result->xla_input_shapes));
   context->set_args(std::move(arg_expressions));
 
   TF_RETURN_IF_ERROR(ExecuteGraph(context, std::move(graph), device_,
@@ -491,16 +557,11 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   int num_computation_outputs;
   result->computation = std::make_shared<xla::Computation>();
   TF_RETURN_IF_ERROR(BuildComputation(
-      args, context->retvals(), context->resources(),
+      args, arg_cores, context->retvals(), context->resources(),
       options.return_updated_values_for_all_resources, &builder,
       result->computation.get(), &num_computation_outputs,
       &num_nonconst_outputs, &result->resource_updates));
 
-  result->requires_runtime_context = context->has_context_parameter();
-
-  // Tuple arguments and runtime context parameters are incompatible.
-  TF_RET_CHECK(!(options.use_tuple_arg && result->requires_runtime_context));
-
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
   result->outputs.resize(context->retvals().size());
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index a8882a638caf2d742bfa2b4f68140e1dc4520db1..380e24e96bc713af4453f92a5359995e9ab4734a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -54,8 +54,6 @@ namespace tensorflow {
 //   +---------------------+-----------------------------------------+
 // Within each block, the arguments are arranged by the _Arg index from which
 // they were derived.
-// If `Options::requires_runtime_context` is true, then an additional runtime
-// context argument is passed as a final argument.
 //
 // The run-time outputs of the XLA computation are arranged in the following
 // order:
@@ -191,16 +189,9 @@ class XlaCompiler {
     // original arguments, and are not necessarily in the same order.)
     std::vector<int> input_mapping;
 
-    // Does the computation require the local runtime context to be passed as
-    // the last argument?
-    bool requires_runtime_context = false;
-
     // Input shapes of the computation.
     std::vector<xla::Shape> xla_input_shapes;
 
-    // Should the arguments be packed into a single tuple?
-    bool tuple_arg;
-
     // Output shape in XLA format. The output shape is always a tuple.
     xla::Shape xla_output_shape;
 
@@ -232,16 +223,9 @@ class XlaCompiler {
     int graph_def_version = TF_GRAPH_DEF_VERSION;
 
     // If 'allow_cpu_custom_calls' is true, kernels may make use of CustomCall()
-    // for CPU; additionally, an optional XlaLocalRuntimeContext* may be passed
-    // to the computation.
+    // for CPU.
     bool allow_cpu_custom_calls = false;
 
-    // If 'local_executable_has_hybrid_result', the top-level pointers of the
-    // result tuple of compiled programs are stored in host memory and the
-    // nested buffers in device memory, otherwise the whole result tuple is
-    // stored in device memory.
-    bool local_executable_has_hybrid_result = false;
-
     // If not nullptr, populate_resource_manager is called with the
     // compilation device's resource manager when the compilation
     // device is created, and can be used to create metadata objects
@@ -255,8 +239,7 @@ class XlaCompiler {
 
   Status CompileFunction(const CompileOptions& options,
                          const NameAttrList& fn_name_attrs,
-                         const std::vector<Argument>& args,
-                         CompilationResult* result);
+                         std::vector<Argument> args, CompilationResult* result);
 
   // Compiles a tensorflow::Graph into an xla::Computation.
   // Similar to CompileFunction, but takes a Graph as input rather than a
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 651bafd6c5d946adfedd63ebbe93e4ea016f0b37..5d19dd353fc04744e196bb50c35cb60b35d8b258 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -70,24 +70,6 @@ XlaContext::XlaContext(XlaCompiler* compiler, xla::ComputationBuilder* builder,
       allow_cpu_custom_calls_(allow_cpu_custom_calls),
       resolve_compile_time_constants_(resolve_compile_time_constants) {}
 
-const xla::ComputationDataHandle&
-XlaContext::GetOrCreateRuntimeContextParameter() {
-  CHECK(allow_cpu_custom_calls_);
-  if (has_context_parameter_) return context_parameter_;
-  has_context_parameter_ = true;
-
-  // Allocate the next available parameter for the context parameter.
-  int num_parameters = 0;
-  for (const XlaExpression& arg : args_) {
-    if (!arg.has_constant_value()) {
-      ++num_parameters;
-    }
-  }
-  context_parameter_ = builder_->Parameter(
-      num_parameters, xla::ShapeUtil::MakeOpaqueShape(), "tf_context");
-  return context_parameter_;
-}
-
 string XlaContext::DebugString() { return "TLA JIT context"; }
 
 // This is called by the Retval Op to associate a computed value
@@ -178,6 +160,20 @@ const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) {
   });
 }
 
+const xla::Computation* XlaContext::GetOrCreateMul(const DataType type) {
+  return LookupOrCreate(type, &mul_func_, [this, type] {
+    const string type_string = DataTypeString(type);
+    VLOG(1) << "Building Mul() for " << type_string;
+    xla::ComputationBuilder b(builder()->client(), "mul<" + type_string + ">");
+    xla::PrimitiveType xla_type;
+    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
+    auto y = b.Parameter(1, xla::ShapeUtil::MakeShape(xla_type, {}), "y");
+    b.Mul(x, y);
+    return b.Build().ConsumeValueOrDie();
+  });
+}
+
 const xla::Computation* XlaContext::LookupOrCreate(
     DataType type, ComputationMap* out,
     const std::function<xla::Computation()>& create) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index de8aafa3628e6eebdabbc508cd95a2ac86e3472f..ebd758d1540eba5483714265565ad22c244ca4a3 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -56,15 +56,10 @@ class XlaContext : public ResourceBase {
   xla::ComputationBuilder* builder();
 
   bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
-  bool has_context_parameter() const { return has_context_parameter_; }
 
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
-  // Get the runtime context parameter, adding one if it does not already exist.
-  // Dies if not compiling a local executable.
-  const xla::ComputationDataHandle& GetOrCreateRuntimeContextParameter();
-
   const std::vector<XlaExpression>& retvals() { return retvals_; }
 
   // This is called by the Retval Op to associate a computed value
@@ -102,6 +97,11 @@ class XlaContext : public ResourceBase {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
+  // Get an XLA lambda to compute Mul. This is cached in the
+  // XlaContext since it may be used by multiple Ops. There is a
+  // separate specialization of the computation for each DataType.
+  const xla::Computation* GetOrCreateMul(const DataType type);
+
   // The name of the XlaContext resource during symbolic graph execution.
   static const char kXlaContextResourceName[];
 
@@ -119,13 +119,6 @@ class XlaContext : public ResourceBase {
   // run-time computation outptus.
   const bool resolve_compile_time_constants_;
 
-  // When 'has_context_parameter_' is true, this is the computation handle
-  // for an additional final parameter to the computation, through which will be
-  // passed a XlaLocalRuntimeContext* at runtime. Created on demand by
-  // GetOrCreateRuntimeContextParameter().
-  bool has_context_parameter_ = false;
-  xla::ComputationDataHandle context_parameter_;
-
   // Arguments to the Tensorflow graph, indexed by _Arg index.
   // Includes both compile-time constant arguments and runtime parameters.
   std::vector<XlaExpression> args_;
@@ -155,6 +148,9 @@ class XlaContext : public ResourceBase {
   // Cached computation to compute Sum of two elements, specialized by type.
   ComputationMap add_func_;
 
+  // Cached computation to compute Mul of two elements, specialized by type.
+  ComputationMap mul_func_;
+
   // Cached computation to compute Sigmoid of an element, specialized by type.
   ComputationMap sigmoid_func_;
 
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index d504613d232c779e47a506657d2825d052e726dc..8ca757e72355d890c13b8b448d35c327d3986696 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -21,8 +21,6 @@ namespace tensorflow {
 bool GpuOpFilter(KernelDef* kdef) {
   // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to
   // slow code.
-  // TODO(b/34969189) The implementation of TruncatedNormal generates illegal
-  // code on GPU.
   if (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" ||
       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal") {
     return false;
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index de5ad5f176536e1453da518b96ee755c7f1e8fdc..ec9e535b707beec6ea26dc81c7ee76b1d4da9225 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This file defines helper routines for Tla JIT compilation.
+// This file defines helper routines for XLA compilation.
 
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
+
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -26,6 +29,67 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+Status ArgMinMax(xla::ComputationBuilder* builder, XlaOpKernelContext* ctx,
+                 const xla::ComputationDataHandle& input,
+                 const TensorShape& input_shape, DataType input_type,
+                 DataType output_type, int axis, bool is_min,
+                 xla::ComputationDataHandle* argminmax) {
+  xla::ComputationDataHandle init_value;
+  const xla::Computation* reducer;
+  if (is_min) {
+    init_value = XlaHelpers::MaxValue(builder, input_type);
+    reducer = ctx->GetOrCreateMin(input_type);
+  } else {
+    init_value = XlaHelpers::MinValue(builder, input_type);
+    reducer = ctx->GetOrCreateMax(input_type);
+  }
+
+  xla::PrimitiveType xla_output_type;
+  TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(output_type, &xla_output_type));
+
+  xla::ComputationDataHandle input_max = builder->Reduce(
+      input, init_value, *reducer, /*dimensions_to_reduce=*/{axis});
+  std::vector<int64> broadcast_dims(input_shape.dims() - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
+  std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
+  // Compute a mask that has 1s for elements equal to the maximum.
+  xla::ComputationDataHandle partial_mask = builder->ConvertElementType(
+      builder->Eq(input, input_max, broadcast_dims), xla_output_type);
+
+  // In order to make identity elements for a bitwise And, we:
+  //   Left shift the 1 to the leftmost bit, yielding 0x10...0
+  //   Arithmetic right shift the 1 back to the rightmost bit, yielding
+  //   0xFF...F
+  int32 bits_in_type =
+      xla::ShapeUtil::ByteSizeOfPrimitiveType(xla_output_type) * 8 - 1;
+  xla::ComputationDataHandle shift_amount =
+      XlaHelpers::IntegerLiteral(builder, output_type, bits_in_type);
+  xla::ComputationDataHandle full_mask = builder->ShiftRightArithmetic(
+      builder->ShiftLeft(partial_mask, shift_amount), shift_amount);
+
+  // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
+  // index.
+  xla::ComputationDataHandle iota;
+
+  const int64 axis_size = input_shape.dim_size(axis);
+  TF_RETURN_IF_ERROR(XlaHelpers::Iota(builder, output_type, axis_size, &iota));
+  xla::ComputationDataHandle product =
+      builder->And(full_mask, iota, /*broadcast_dimensions=*/{axis});
+
+  // If there are multiple maximum elements, choose the one with the highest
+  // index.
+  xla::ComputationDataHandle output =
+      builder->Reduce(product, XlaHelpers::MinValue(builder, output_type),
+                      *ctx->GetOrCreateMax(output_type),
+                      /*dimensions_to_reduce=*/{axis});
+  *argminmax = output;
+  return Status::OK();
+}
+
+}  // namespace
+
 xla::ComputationDataHandle XlaHelpers::MinValue(xla::ComputationBuilder* b,
                                                 DataType data_type) {
   xla::PrimitiveType type;
@@ -57,6 +121,8 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_BFLOAT16:
+      return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
       return b->ConstantR0<float>(std::numeric_limits<float>::epsilon());
     case DT_DOUBLE:
@@ -105,6 +171,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
     case xla::S16:
     case xla::U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
+    case xla::BF16:
+      literal = *xla::Literal::CreateR0<bfloat16>(static_cast<bfloat16>(value));
+      break;
     case xla::F16:
       literal =
           *xla::Literal::CreateR0<xla::half>(static_cast<xla::half>(value));
@@ -122,25 +191,9 @@ xla::ComputationDataHandle XlaHelpers::IntegerLiteral(
 xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b,
                                                     DataType data_type,
                                                     double value) {
-  xla::Literal literal;
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
-  switch (type) {
-    case xla::F16:
-      return b->ConstantR0<xla::half>(static_cast<xla::half>(value));
-      break;
-    case xla::F32:
-      return b->ConstantR0<float>(static_cast<float>(value));
-      break;
-    case xla::F64:
-      return b->ConstantR0<double>(value);
-      break;
-    case xla::C64:
-      return b->ConstantR0<complex64>(value);
-      break;
-    default:
-      LOG(FATAL) << "unhandled element type " << type;
-  }
+  return ::tensorflow::FloatLiteral(b, type, value);
 }
 
 /* static */ Status XlaHelpers::ReshapeLiteral(
@@ -174,6 +227,26 @@ static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
   return linspace;
 }
 
+Status XlaHelpers::ArgMax(xla::ComputationBuilder* builder,
+                          XlaOpKernelContext* ctx,
+                          const xla::ComputationDataHandle& input,
+                          const TensorShape& input_shape, DataType input_type,
+                          DataType output_type, int axis,
+                          xla::ComputationDataHandle* argmax) {
+  return ArgMinMax(builder, ctx, input, input_shape, input_type, output_type,
+                   axis, /*is_min=*/false, argmax);
+}
+
+Status XlaHelpers::ArgMin(xla::ComputationBuilder* builder,
+                          XlaOpKernelContext* ctx,
+                          const xla::ComputationDataHandle& input,
+                          const TensorShape& input_shape, DataType input_type,
+                          DataType output_type, int axis,
+                          xla::ComputationDataHandle* argmin) {
+  return ArgMinMax(builder, ctx, input, input_shape, input_type, output_type,
+                   axis, /*is_min=*/true, argmin);
+}
+
 Status XlaHelpers::Iota(xla::ComputationBuilder* builder, DataType dtype,
                         int64 size, xla::ComputationDataHandle* iota) {
   TensorShape linspace_shape({size});
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index af23d20fd306c03b5e47c5ca9dd042187a2d51ed..2a027db4c839c917f3a7acd27184792d157356bf 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -72,14 +72,35 @@ class XlaHelpers {
                                gtl::ArraySlice<int64> shape,
                                xla::Literal* output);
 
+  // Sets `argmax` to the argmax of `input` along `axis`. `input_shape` and
+  // `input_dtype` are the shape and dtype of `input` respectively, and
+  // `output_type` is the dtype to use for `argmax`.
+  static Status ArgMax(xla::ComputationBuilder* builder,
+                       XlaOpKernelContext* ctx,
+                       const xla::ComputationDataHandle& input,
+                       const TensorShape& input_shape, DataType input_type,
+                       DataType output_type, int axis,
+                       xla::ComputationDataHandle* argmax);
+
+  // Sets `argmin` to the argmin of `input` along `axis`. `input_shape` and
+  // `input_dtype` are the shape and dtype of `input` respectively, and
+  // `output_type` is the dtype to use for `argmin`.
+  static Status ArgMin(xla::ComputationBuilder* builder,
+                       XlaOpKernelContext* ctx,
+                       const xla::ComputationDataHandle& input,
+                       const TensorShape& input_shape, DataType input_type,
+                       DataType output_type, int axis,
+                       xla::ComputationDataHandle* argmin);
+
   // Sets *iota to a rank 1 tensor with values [0, 1, 2, ...] of `dtype`.
   static Status Iota(xla::ComputationBuilder* builder, DataType dtype,
                      int64 size, xla::ComputationDataHandle* iota);
 
   // Converts `indices` into a one-hot representation. `depth` is the size
   // of the new axis to add. `axis` is the position at which to add the new
-  // axis. `indices_shape` is the shape of `indices`. `on_value` and `off_value`
-  // represent the values to use for the on and off positions, respectively.
+  // axis. `indices_shape` is the shape of `indices`. `on_value` and
+  // `off_value` represent the values to use for the on and off positions,
+  // respectively.
   static Status OneHot(xla::ComputationBuilder* builder, int64 depth, int axis,
                        DataType index_type, const TensorShape& indices_shape,
                        const xla::ComputationDataHandle& indices,
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 1dd454ea8d57e21526e5bcde0c8efc5514983b93..584417bc72c8f6645c05912e857b031cfb394e54 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -37,27 +37,14 @@ namespace {
 
 // Returns a vector of positional argument buffer sizes.
 xla::StatusOr<std::vector<intptr_t>> ComputeArgSizes(
-    const xla::ProgramShape& program_shape, bool requires_runtime_context) {
+    const xla::ProgramShape& program_shape) {
   std::vector<intptr_t> arg_sizes;
   const size_t num_args = program_shape.parameters_size();
   arg_sizes.reserve(num_args);
   for (int i = 0; i < num_args; ++i) {
     const xla::Shape& arg_shape = program_shape.parameters(i);
-    if (i == num_args - 1 && requires_runtime_context) {
-      // If the compiled function needs an XlaLocalRuntimeContext* arg, it's
-      // always last, and must be represented as an opaque type.
-      const xla::PrimitiveType type = arg_shape.element_type();
-      if (type != xla::OPAQUE) {
-        return errors::InvalidArgument(
-            "expected final context arg to be opaque, but got type: ",
-            xla::PrimitiveType_Name(type), ", from program shape: ",
-            xla::ShapeUtil::HumanString(program_shape));
-      }
-      arg_sizes.push_back(-1);
-    } else {
-      constexpr size_t kPointerSize = sizeof(void*);
-      arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
-    }
+    constexpr size_t kPointerSize = sizeof(void*);
+    arg_sizes.push_back(xla::ShapeUtil::ByteSizeOf(arg_shape, kPointerSize));
   }
   return std::move(arg_sizes);
 }
@@ -90,21 +77,6 @@ xla::StatusOr<size_t> ComputeResultIndex(
   return result_slice.index();
 }
 
-// Adapt ComputeFunctionType, which includes a final profile_counters arg, to
-// RawFunction, which doesn't include that final arg.
-//
-// TODO(toddw): Change RawFunction and AOT to also pass the final
-// profile_counters arg, and remove this adapter.
-XlaCompiledCpuFunction::RawFunction RawFunctionAdapter(
-    xla::cpu::CpuExecutable::ComputeFunctionType compute_function) {
-  return [compute_function](void* result,
-                            const xla::ExecutableRunOptions* run_options,
-                            const void** args, void** temps) {
-    return compute_function(result, run_options, args, temps,
-                            /*profile_counters=*/nullptr);
-  };
-}
-
 // Collect names from `entries`, where T is one of tf2xla::{Feed,Fetch}. We hold
 // the actual strings in nonempty_names, and hold arrays of pointers in
 // name_ptrs, terminated by a nullptr entry.
@@ -144,9 +116,8 @@ XlaJitCompiledCpuFunction::Compile(
   TF_ASSIGN_OR_RETURN(xla::LocalClient * client,
                       xla::ClientLibrary::GetOrCreateLocalClient());
   xla::Computation computation;
-  bool requires_runtime_context;
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(
-      graph_def, config, client, &computation, &requires_runtime_context));
+  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(graph_def, config, client,
+                                                      &computation));
 
   // Get and verify the program shape.
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> program_shape,
@@ -177,14 +148,13 @@ XlaJitCompiledCpuFunction::Compile(
   const xla::cpu::CpuExecutable* cpu_executable =
       static_cast<xla::cpu::CpuExecutable*>(executable->executable());
   XlaCompiledCpuFunction::RawFunction raw_function =
-      RawFunctionAdapter(cpu_executable->compute_function());
+      cpu_executable->compute_function();
   const xla::BufferAssignment& buffer_assignment =
       cpu_executable->buffer_assignment();
 
   // Compute buffer sizes and the result index, needed to run the raw function.
-  TF_ASSIGN_OR_RETURN(
-      std::vector<intptr_t> arg_sizes,
-      ComputeArgSizes(*program_shape, requires_runtime_context));
+  TF_ASSIGN_OR_RETURN(std::vector<intptr_t> arg_sizes,
+                      ComputeArgSizes(*program_shape));
   TF_ASSIGN_OR_RETURN(std::vector<intptr_t> temp_sizes,
                       ComputeTempSizes(buffer_assignment));
   TF_ASSIGN_OR_RETURN(size_t result_index,
@@ -203,7 +173,6 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.temp_sizes = jit->temp_sizes_.data();
   jit->static_data_.num_temps = jit->temp_sizes_.size();
   jit->static_data_.result_index = result_index;
-  jit->static_data_.requires_runtime_context = requires_runtime_context;
   // Optional metadata is collected and set below.
   CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_);
   CollectNames(config.fetch(), &jit->nonempty_result_names_,
@@ -211,6 +180,14 @@ XlaJitCompiledCpuFunction::Compile(
   jit->static_data_.arg_names = jit->arg_names_.data();
   jit->static_data_.result_names = jit->result_names_.data();
   jit->static_data_.program_shape = jit->program_shape_.get();
+
+  if (cpu_executable->hlo_profiling_enabled()) {
+    jit->static_data_.hlo_profile_printer =
+        &cpu_executable->hlo_profile_printer();
+    jit->static_data_.profile_counters_size =
+        cpu_executable->hlo_profile_printer().profile_counters_size();
+  }
+
   return std::move(jit_unique_ptr);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h b/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
deleted file mode 100644
index dca420d6ee3fec45f88ac3b450ab0cb4fb83d38a..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/xla_local_runtime_context.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
-#define TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
-
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/types.h"
-
-// Forward-declare the ThreadPoolDevice so that it can be ignored unless it's
-// actually used.  E.g. some ahead-of-time compiled computations don't need a
-// thread pool.
-namespace Eigen {
-struct ThreadPoolDevice;
-}
-
-namespace tensorflow {
-
-// An instance of this class is passed to each call from tensorflow into a
-// compiled XLA computation. See xla_launch_ops.cc.
-struct XlaLocalRuntimeContext {
- public:
-  XlaLocalRuntimeContext() {}
-
-  // Kernels implemented using custom call ops set this if they encounter an
-  // error. The error is checked after the entire XLA computation is
-  // complete.
-  //
-  // error+error_msg are used instead of Status to reduce the binary size
-  // overhead for ahead-of-time compiled binaries.
-  bool error = false;
-  string error_msg;
-
-  // Kernels that need a thread pool can get it from here.
-  const Eigen::ThreadPoolDevice* thread_pool = nullptr;
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalRuntimeContext);
-};
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_XLA_LOCAL_RUNTIME_CONTEXT_H_
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index b948dfee6ab33651e52ca5045cfce600c788bc3b..79d501b511bf37ba4a79ab9d375d6f789a36889b 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -345,6 +345,16 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   expression->set_constant_value(constant);
 }
 
+void XlaOpKernelContext::SetInvalidOutput(int index) {
+  Tensor* output = nullptr;
+  OP_REQUIRES_OK(context_,
+                 context_->allocate_output(index, TensorShape({}), &output));
+  XlaExpression* expression = CastExpressionFromUninitializedTensor(output);
+  xla::ComputationDataHandle handle;
+  handle.set_handle(0);
+  expression->set_handle(handle);
+}
+
 void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
   Tensor* output = nullptr;
   // The shape of the output tensor is the shape of the resource itself
@@ -407,6 +417,11 @@ const xla::Computation* XlaOpKernelContext::GetOrCreateAdd(
   return XlaContext::Get(context_).GetOrCreateAdd(type);
 }
 
+const xla::Computation* XlaOpKernelContext::GetOrCreateMul(
+    const DataType type) {
+  return XlaContext::Get(context_).GetOrCreateMul(type);
+}
+
 XlaOpKernel::XlaOpKernel(OpKernelConstruction* context) : OpKernel(context) {}
 
 void XlaOpKernel::Compute(OpKernelContext* context) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 5519e89252ca5a3964dcdaaeb3d08ce6c9da6bd4..f1ae81a5aa9d507a3e0dd577568377385b1844e6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -142,6 +142,10 @@ class XlaOpKernelContext {
   // SetConstantOutput where possible.
   void SetConstantOutput(int index, const Tensor& host_tensor);
 
+  // Sets output 'index' to an invalid value.
+  // Any subsequent attempt to consume this output will cause an error.
+  void SetInvalidOutput(int index);
+
   // Status handling.
   void SetStatus(const Status& status) { context_->SetStatus(status); }
   Status status() { return context_->status(); }
@@ -174,7 +178,7 @@ class XlaOpKernelContext {
 
   // If this kernel invocation is within a function execution,
   // call_frame() returns the call frame for the function call.
-  FunctionCallFrame* call_frame() const { return context_->call_frame(); }
+  CallFrameInterface* call_frame() const { return context_->call_frame(); }
 
   FunctionLibraryRuntime* function_library() const {
     return context_->function_library();
@@ -206,6 +210,11 @@ class XlaOpKernelContext {
   // separate specialization of the computation for each DataType.
   const xla::Computation* GetOrCreateAdd(const DataType type);
 
+  // Gets an XLA lambda to compute Mul. This is cached in the
+  // XlaContext since it may be used by multiple Ops. There is a
+  // separate specialization of the computation for each DataType.
+  const xla::Computation* GetOrCreateMul(const DataType type);
+
  private:
   OpKernelContext* const context_;
 };
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 02318cf7fa1d4edc12507f6b4d66a8e897cbe100..faf47434b5dc6b569ec4f9c91a8667de275a6315 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -187,22 +188,39 @@ void XlaOpRegistry::RegisterCompilationKernels() {
 
       // Constrain each type attribute to the intersection of:
       // a) the types supported by the backend, and
-      // b) the attribute's type constraints.
-      // TODO(phawkins): it may be necessary to also take the intersection with
-      // the set of types supported by the OpDef.
+      // b) the types allowed by the OpDef, and
+      // c) the type constraints.
       for (const string& type_attr : type_attrs) {
         KernelDef::AttrConstraint* attr_constraint = kdef->add_constraint();
         attr_constraint->set_name(type_attr);
         auto* allowed_values =
             attr_constraint->mutable_allowed_values()->mutable_list();
 
-        auto it = op_registration->type_constraints.find(type_attr);
+        const OpDef::AttrDef& op_def_attr = *FindAttr(type_attr, *op_def);
+        const auto* op_def_allowed_types =
+            op_def_attr.has_allowed_values()
+                ? &op_def_attr.allowed_values().list().type()
+                : nullptr;
+        auto constraint_it = op_registration->type_constraints.find(type_attr);
+        const std::set<DataType>* type_constraints =
+            constraint_it != op_registration->type_constraints.end()
+                ? &constraint_it->second
+                : nullptr;
         for (DataType dtype : backend.second.supported_types) {
-          if (it == op_registration->type_constraints.end() ||
-              (it != op_registration->type_constraints.end() &&
-               it->second.find(dtype) != it->second.end())) {
-            allowed_values->add_type(dtype);
+          // Filter out types that aren't allowed by the OpDef.
+          if (op_def_allowed_types != nullptr &&
+              std::find(op_def_allowed_types->begin(),
+                        op_def_allowed_types->end(),
+                        dtype) == op_def_allowed_types->end()) {
+            continue;
           }
+          // Filter out types based on the type constraints.
+          if (type_constraints != nullptr &&
+              type_constraints->find(dtype) == type_constraints->end()) {
+            continue;
+          }
+          // Passed all the filters, this type is allowed.
+          allowed_values->add_type(dtype);
         }
         if (op_registration->allow_resource_types) {
           allowed_values->add_type(DT_RESOURCE);
@@ -245,6 +263,22 @@ std::vector<const KernelDef*> XlaOpRegistry::DeviceKernels(
   return kernels;
 }
 
+std::vector<string> XlaOpRegistry::BackendNames() {
+  std::vector<string> names;
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  for (const auto& backend_pair : registry.backends_) {
+    names.push_back(backend_pair.first);
+  }
+  return names;
+}
+
+bool XlaOpRegistry::IsBackendRegistered(const string& name) {
+  XlaOpRegistry& registry = Instance();
+  mutex_lock lock(registry.mutex_);
+  return registry.backends_.find(name) != registry.backends_.end();
+}
+
 XlaOpRegistry& XlaOpRegistry::Instance() {
   static XlaOpRegistry* r = new XlaOpRegistry;
   return *r;
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 6aee8c91cc01b4382ef867fa8e438eede008ac73..2959d2ab690dfb91f8f46f5cf5718a405d9e0c7f 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -97,6 +97,12 @@ class XlaOpRegistry {
                               gtl::ArraySlice<DataType> supported_types,
                               BackendOpFilter op_filter);
 
+  // Returns the names of the registered backends.
+  static std::vector<string> BackendNames();
+
+  // Returns true iff a backend with the given name is registered.
+  static bool IsBackendRegistered(const string& name);
+
   // Registers `device_name` for XLA compilation, using information from
   // `registration`.
   static void RegisterCompilationDevice(const string& device_name,
@@ -116,8 +122,8 @@ class XlaOpRegistry {
   static void RegisterCompilationKernels();
 
   // Returns KernelDefs for compilation ops registered on
-  // 'compilation_device_name'.
-  // Does not include kernels registered as CompilationOnly.
+  // 'compilation_device_name'.  Does not include kernels registered as
+  // CompilationOnly, iff include_compilation_only_kernels=false.
   static std::vector<const KernelDef*> DeviceKernels(
       const string& compilation_device_name,
       bool include_compilation_only_kernels);
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 660f419e464936b01a3644e69c2f056f998140f5..d3f292207fee396fb4248dede5c0eeb5cd2b87c9 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -77,6 +77,7 @@ cc_library(
     hdrs = ["types.h"],
     visibility = [":friends"],
     deps = [
+        "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
@@ -174,6 +175,7 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ptr_util",
     ],
 )
 
@@ -339,6 +341,7 @@ cc_library(
     name = "array",
     hdrs = ["array.h"],
     deps = [
+        ":status",
         ":types",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index ba898d1f4e9100df59c6e4b28824895c5ae6c08a..213e0bac6c77e9972de8d4dd7dfc8c7cf3a1b865 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -23,8 +23,10 @@ limitations under the License.
 #include <iterator>
 #include <memory>
 #include <random>
+#include <type_traits>
 #include <vector>
 
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -35,10 +37,63 @@ limitations under the License.
 
 namespace xla {
 
+namespace array_impl {
+
+// conjunction
+//
+// Performs a compile-time logical AND operation on the passed types (which
+// must have  `::value` members convertible to `bool`. Short-circuits if it
+// encounters any `false` members (and does not compare the `::value` members
+// of any remaining arguments).
+//
+// This metafunction is designed to be a drop-in replacement for the C++17
+// `std::conjunction` metafunction.
+template <typename... Ts>
+struct conjunction;
+
+template <typename T, typename... Ts>
+struct conjunction<T, Ts...>
+    : std::conditional<T::value, conjunction<Ts...>, T>::type {};
+
+template <>
+struct conjunction<> : std::true_type {};
+
+// A type trait that is valid when all elements in a parameter pack are of
+// integral type.
+template <typename... T>
+using pack_is_integral = conjunction<std::is_integral<T>...>;
+
+// Compares three same-sized vectors elementwise. For each item in `values`,
+// returns false if any of values[i] is outside the half-open range [starts[i],
+// ends[i]).
+template <typename C1, typename C2, typename C3>
+bool all_inside_range(const C1& values, const C2& range_starts,
+                      const C3& range_ends) {
+  for (size_t i = 0, e = values.size(); i < e; ++i) {
+    if (values[i] < range_starts[i] || values[i] >= range_ends[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace array_impl
+
 // General N dimensional array class with arbitrary value type.
 template <typename T>
 class Array {
  public:
+  // Type inference can have a hard time parsing very deep initializer list
+  // nests, especially if one or more dimensions is one as the compiler just
+  // sees a single-element integer initializer. These typedefs allow casting
+  // explicitly with less typing.
+  using InitializerList1D = std::initializer_list<T>;
+  using InitializerList2D = std::initializer_list<InitializerList1D>;
+  using InitializerList3D = std::initializer_list<InitializerList2D>;
+  using InitializerList4D = std::initializer_list<InitializerList3D>;
+
+  using value_type = T;
+
   // Creates a new array with the specified dimensions.
   explicit Array(tensorflow::gtl::ArraySlice<int64> sizes)
       : Array(sizes, T()) {}
@@ -53,7 +108,7 @@ class Array {
   // Creates a 2D array from the given nested initializer list. The outer
   // initializer list is the first dimension, the inner is the second dimension.
   // For example, {{1, 2, 3}, {4, 5, 6}} results in an array with n1=2 and n2=3.
-  Array(std::initializer_list<std::initializer_list<T>> values)
+  Array(InitializerList2D values)
       : Array(ToInt64Vector({values.size(), values.begin()->size()})) {
     int64 idx = 0;
     for (const auto& it1 : values) {
@@ -67,8 +122,7 @@ class Array {
 
   // Creates a 3D array from the given nested initializer list. The outer
   // initializer list is the first dimension, and so on.
-  Array(std::initializer_list<std::initializer_list<std::initializer_list<T>>>
-            values)
+  Array(InitializerList3D values)
       : Array(ToInt64Vector({values.size(), values.begin()->size(),
                              values.begin()->begin()->size()})) {
     int64 idx = 0;
@@ -85,9 +139,7 @@ class Array {
 
   // Creates a 4D array from the given nested initializer list. The outer
   // initializer list is the first dimension, and so on.
-  Array(std::initializer_list<
-        std::initializer_list<std::initializer_list<std::initializer_list<T>>>>
-            values)
+  Array(InitializerList4D values)
       : Array(ToInt64Vector({values.size(), values.begin()->size(),
                              values.begin()->begin()->size(),
                              values.begin()->begin()->begin()->size()})) {
@@ -173,10 +225,46 @@ class Array {
     }
   }
 
+  // Invokes a callback with the (indices, value_ptr) for each cell in the
+  // array. If a callback returns a non-OK status, returns that else returns
+  // Status::OK().
+  Status EachStatus(
+      std::function<Status(tensorflow::gtl::ArraySlice<int64>, T*)> f) {
+    std::vector<int64> index(sizes_.size());
+    for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
+      Status s = f(index, &values_[i]);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Invokes a callback with the (indices, value) for each cell in the array.
+  // If a callback returns a non-OK status, returns that else returns
+  // Status::OK().
+  Status EachStatus(
+      std::function<Status(tensorflow::gtl::ArraySlice<int64>, T)> f) const {
+    std::vector<int64> index(sizes_.size());
+    for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
+      Status s = f(index, values_[i]);
+      if (!s.ok()) {
+        return s;
+      }
+    }
+    return Status::OK();
+  }
+
   // Returns the value at the cell specified by the indexes. The number of
   // arguments have to match with the number of dimensions for the array.
+  //
+  // The type trait is required to avoid this overload participating too
+  // eagerly; a parameter pack can take zero or more elements, so we must
+  // restrict this to only parameter packs that are all of integral type.
   template <typename... Dims>
-  const T& operator()(Dims... dims) const {
+  typename std::enable_if<array_impl::pack_is_integral<Dims...>::value,
+                          const T&>::type
+  operator()(Dims... dims) const {
     // We are using a std::array to avoid having to allocate memory in this
     // function for performance reasons.
     std::array<int64, sizeof...(dims)> indexes{{static_cast<int64>(dims)...}};
@@ -186,7 +274,9 @@ class Array {
   // Returns the value at the cell specified by the indexes. The number of
   // arguments have to match with the number of dimensions for the array.
   template <typename... Dims>
-  T& operator()(Dims... dims) {
+  typename std::enable_if<array_impl::pack_is_integral<Dims...>::value,
+                          T&>::type
+  operator()(Dims... dims) {
     // We are using a std::array to avoid having to allocate memory in this
     // function for performance reasons.
     std::array<int64, sizeof...(dims)> indexes{{static_cast<int64>(dims)...}};
@@ -255,6 +345,59 @@ class Array {
 
   bool operator!=(const Array<T>& other) const { return !(*this == other); }
 
+  // Performs the equivalent of a slice operation on this array.
+  Array<T> Slice(tensorflow::gtl::ArraySlice<int64> starts,
+                 tensorflow::gtl::ArraySlice<int64> limits) const {
+    CHECK_EQ(starts.size(), num_dimensions());
+    CHECK_EQ(limits.size(), num_dimensions());
+
+    std::vector<int64> sizes;
+    std::transform(starts.begin(), starts.end(), limits.begin(),
+                   std::back_inserter(sizes),
+                   [](int64 start, int64 limit) { return limit - start; });
+    Array<T> result(sizes);
+
+    std::vector<int64> index(sizes_.size());
+    int64 slice_i = 0;
+    for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
+      if (array_impl::all_inside_range(index, starts, limits)) {
+        // Even though the bounds of result are different to our bounds, we're
+        // iterating in the same order. So we can simply write successive linear
+        // indices instead of recalculating a multi-dimensional index.
+        result.values_[slice_i++] = values_[i];
+      }
+    }
+    return result;
+  }
+
+  // Performs the equivalent of a DynamicUpdateSlice in-place on this array.
+  void UpdateSlice(const Array<T>& from,
+                   tensorflow::gtl::ArraySlice<int64> start_indices) {
+    CHECK_EQ(from.num_dimensions(), num_dimensions());
+    std::vector<int64> limit_indices;
+    std::transform(start_indices.begin(), start_indices.end(),
+                   from.dimensions().begin(), std::back_inserter(limit_indices),
+                   std::plus<int64>{});
+    std::vector<int64> index(sizes_.size());
+    int64 from_i = 0;
+    for (int64 i = 0; i < num_elements(); ++i, next_index(&index)) {
+      if (array_impl::all_inside_range(index, start_indices, limit_indices)) {
+        // Even though the bounds of from are different to our bounds, we're
+        // iterating in the same order. So we can simply write successive linear
+        // indices instead of recalculating a multi-dimensional index.
+        values_[i] = from.values_[from_i++];
+      }
+    }
+  }
+
+  // Performs an in-place reshape, modifying the dimensions but not the
+  // underlying data.
+  void Reshape(tensorflow::gtl::ArraySlice<int64> new_dimensions) {
+    int64 old_num_elements = num_elements();
+    sizes_ = std::vector<int64>(new_dimensions.begin(), new_dimensions.end());
+    CHECK_EQ(num_elements(), old_num_elements);
+  }
+
   // Returns a string representation of the array suitable for debugging.
   string ToString() const {
     std::vector<string> pieces;
diff --git a/tensorflow/compiler/xla/array3d.h b/tensorflow/compiler/xla/array3d.h
index e9449f01ad69a5722f53cce09e2884e20a0def5a..a1c5840a5f3874e27043c821ed4684da2fa6c542 100644
--- a/tensorflow/compiler/xla/array3d.h
+++ b/tensorflow/compiler/xla/array3d.h
@@ -36,6 +36,8 @@ namespace xla {
 template <typename T>
 class Array3D : public Array<T> {
  public:
+  Array3D() : Array<T>(std::vector<int64>{0, 0, 0}) {}
+
   // Creates an array of dimensions n1 x n2 x n3, uninitialized values.
   Array3D(const int64 n1, const int64 n2, const int64 n3)
       : Array<T>(std::vector<int64>{n1, n2, n3}) {}
diff --git a/tensorflow/compiler/xla/array_test.cc b/tensorflow/compiler/xla/array_test.cc
index 093784f541b3bd18f4a1fc1b665cd0d17a892f28..8b9419477479d952126fd831eb44899e7649ca71 100644
--- a/tensorflow/compiler/xla/array_test.cc
+++ b/tensorflow/compiler/xla/array_test.cc
@@ -71,6 +71,19 @@ TEST(ArrayTest, IndexingReadWrite) {
   EXPECT_EQ(arr(1, 2), 61);
 }
 
+TEST(ArrayTest, DynamicIndexingReadWrite) {
+  Array<int> arr({2, 3});
+
+  std::vector<int64> index1 = {1, 1};
+  std::vector<int64> index2 = {1, 2};
+  EXPECT_EQ(arr(index1), 0);
+  EXPECT_EQ(arr(index2), 0);
+  arr(index1) = 51;
+  arr(index2) = 61;
+  EXPECT_EQ(arr(1, 1), 51);
+  EXPECT_EQ(arr(1, 2), 61);
+}
+
 TEST(ArrayTest, IndexingReadWriteBool) {
   Array<bool> arr{{false, true, false}, {false, true, false}};
 
@@ -141,5 +154,37 @@ TEST(ArrayTest, Each) {
   EXPECT_EQ(arr.num_elements() * (arr.num_elements() - 1) / 2, each_sum);
 }
 
+TEST(ArrayTest, Slice) {
+  Array<int64> arr({2, 4});
+  arr.FillWithMultiples(1);
+
+  Array<int64> identity_slice = arr.Slice({0, 0}, {2, 4});
+  EXPECT_EQ(identity_slice.dimensions(), arr.dimensions());
+  for (auto it1 = arr.begin(), it2 = identity_slice.begin(), e = arr.end();
+       it1 != e; ++it1, ++it2) {
+    EXPECT_EQ(*it1, *it2);
+  }
+
+  Array<int64> sub_slice = arr.Slice({1, 0}, {2, 2});
+  EXPECT_EQ(sub_slice.dimensions(), (std::vector<int64>{1, 2}));
+  const string expected = R"([[4, 5]])";
+  EXPECT_EQ(expected, sub_slice.ToString());
+}
+
+TEST(ArrayTest, UpdateSlice) {
+  Array<int64> arr({3, 4});
+  arr.FillWithMultiples(1);
+
+  Array<int64> sub_arr({2, 2});
+  sub_arr.FillWithMultiples(3);
+
+  arr.UpdateSlice(sub_arr, {1, 1});
+
+  const string expected = R"([[0, 1, 2, 3],
+ [4, 0, 3, 7],
+ [8, 6, 9, 11]])";
+  EXPECT_EQ(expected, arr.ToString());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 92cd8e729d659c4ff24c156d89f29275848c3cee..66937d64aff18817bbd5310e0c24e19556e9d727 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -142,8 +142,7 @@ StatusOr<std::unique_ptr<Literal>> Client::TransferFromOutfeed(
         "TransferToClient request");
   }
 
-  Literal literal(response.literal());
-  return MakeUnique<Literal>(literal);
+  return MakeUnique<Literal>(response.literal());
 }
 
 Status Client::ResetDevice() {
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index a716159f9e74041c4823ad20b46fa94c2d7b9d8c..c28380b689c7a0e16bf0bcbf15003f4aa15e42a7 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -67,6 +67,15 @@ class Client {
     std::vector<GlobalData*> arguments;
     ExecutionOptions execution_options;
     ExecutionProfile* execution_profile;
+
+    ComputationInstance(const Computation& computation,
+                        std::vector<GlobalData*> arguments,
+                        ExecutionOptions execution_options,
+                        ExecutionProfile* execution_profile)
+        : computation(computation),
+          arguments(std::move(arguments)),
+          execution_options(execution_options),
+          execution_profile(execution_profile) {}
   };
 
   // Executes a list ComputationInstances and returns global data produced from
@@ -133,7 +142,7 @@ class Client {
 
   // Returns a vector of global data handles that point to the tuple elements.
   StatusOr<std::vector<std::unique_ptr<GlobalData>>> DeconstructTuple(
-      const GlobalData& computation);
+      const GlobalData& data);
 
   // Retrieves the statistics of the given computation.
   StatusOr<ComputationStats> GetComputationStats(
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 24774c4c2a385d9aabd22a550bd8be3acf409d85..317dcb4e41723b93e7e50d911f16e48bc3505a09 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -153,6 +153,7 @@ bool ComputationBuilder::MakeWindow(
     } else {
       dim->set_window_dilation(1);
     }
+    dim->set_window_reversal(false);
   }
   return true;
 }
@@ -624,7 +625,41 @@ ComputationDataHandle ComputationBuilder::Lt(
 
 ComputationDataHandle ComputationBuilder::Dot(
     const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) {
-  return BinaryOp(BINOP_DOT, lhs, rhs, /*broadcast_dimensions=*/{});
+  StatusOr<std::unique_ptr<Shape>> lhs_shape_or_status = GetShape(lhs);
+  if (!lhs_shape_or_status.ok()) {
+    NoteError(lhs_shape_or_status.status());
+    return ComputationDataHandle();
+  }
+  std::unique_ptr<Shape> lhs_shape = lhs_shape_or_status.ConsumeValueOrDie();
+
+  DotDimensionNumbers dimension_numbers;
+  dimension_numbers.add_lhs_contracting_dimensions(
+      lhs_shape->dimensions_size() == 1 ? 0 : 1);
+  dimension_numbers.add_rhs_contracting_dimensions(0);
+  return DotGeneral(lhs, rhs, dimension_numbers);
+}
+
+ComputationDataHandle ComputationBuilder::DotGeneral(
+    const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  DotRequest request;
+  *request.mutable_lhs() = lhs;
+  *request.mutable_rhs() = rhs;
+  *request.mutable_dimension_numbers() = dimension_numbers;
+
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_dot_request() = request;
+  AddCommonFieldsToOpRequest(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making Dot request";
+  Status s = client_->stub()->Op(&op_request, &response);
+  return ParseOpResponse(s, &response);
 }
 
 ComputationDataHandle ComputationBuilder::Conv(
@@ -693,11 +728,15 @@ bool ComputationBuilder::VerifyConvolution(
         }
         return true;
       };
-  return check_spatial_dimensions("spatial_dimensions",
-                                  dimension_numbers.spatial_dimensions()) &&
+  return check_spatial_dimensions(
+             "input_spatial_dimensions",
+             dimension_numbers.input_spatial_dimensions()) &&
          check_spatial_dimensions(
              "kernel_spatial_dimensions",
-             dimension_numbers.kernel_spatial_dimensions());
+             dimension_numbers.kernel_spatial_dimensions()) &&
+         check_spatial_dimensions(
+             "output_spatial_dimensions",
+             dimension_numbers.output_spatial_dimensions());
 }
 
 ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions(
@@ -729,11 +768,11 @@ ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions(
   }
 
   std::vector<int64> base_area_dimensions(
-      dimension_numbers.spatial_dimensions_size());
+      dimension_numbers.input_spatial_dimensions_size());
   for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
        ++i) {
     base_area_dimensions[i] =
-        lhs_shape->dimensions(dimension_numbers.spatial_dimensions(i));
+        lhs_shape->dimensions(dimension_numbers.input_spatial_dimensions(i));
   }
 
   std::vector<int64> window_dimensions(
@@ -1163,6 +1202,34 @@ ComputationDataHandle ComputationBuilder::ConvertElementType(
   return ParseOpResponse(s, &response);
 }
 
+ComputationDataHandle ComputationBuilder::BitcastConvertType(
+    const ComputationDataHandle& operand, PrimitiveType new_element_type) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  StatusOr<std::unique_ptr<Shape>> shape_status = GetShape(operand);
+  if (!shape_status.ok()) {
+    first_error_ = shape_status.status();
+    return ComputationDataHandle();
+  }
+  std::unique_ptr<Shape> original = shape_status.ConsumeValueOrDie();
+
+  ConvertRequest request;
+  *request.mutable_operand() = operand;
+  request.set_new_element_type(new_element_type);
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_bitcast_convert_request() = request;
+  AddCommonFieldsToOpRequest(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making bitcast convert request";
+  Status s = client_->stub()->Op(&op_request, &response);
+
+  return ParseOpResponse(s, &response);
+}
+
 ComputationDataHandle ComputationBuilder::SquareF32(
     const ComputationDataHandle& operand) {
   return BinaryOp(BINOP_POW, operand, ConstantR0<float>(2.0),
@@ -1309,7 +1376,7 @@ Status ComputationBuilder::SetReturnValue(
 }
 
 StatusOr<bool> ComputationBuilder::IsConstant(
-    const ComputationDataHandle& operand) {
+    const ComputationDataHandle& operand, int64 num_parameters) {
   if (!first_error_.ok()) {
     return first_error_;
   }
@@ -1317,6 +1384,7 @@ StatusOr<bool> ComputationBuilder::IsConstant(
   IsConstantRequest request;
   *request.mutable_computation() = computation_.handle();
   *request.mutable_operand() = operand;
+  request.set_num_parameters(num_parameters);
   IsConstantResponse response;
 
   VLOG(2) << "making IsConstant request";
@@ -1330,7 +1398,8 @@ StatusOr<bool> ComputationBuilder::IsConstant(
 }
 
 StatusOr<std::unique_ptr<Literal>> ComputationBuilder::ComputeConstant(
-    const ComputationDataHandle& operand, const Layout* output_layout) {
+    const ComputationDataHandle& operand, const Layout* output_layout,
+    tensorflow::gtl::ArraySlice<Literal> parameters) {
   if (!first_error_.ok()) {
     return first_error_;
   }
@@ -1341,6 +1410,9 @@ StatusOr<std::unique_ptr<Literal>> ComputationBuilder::ComputeConstant(
   if (output_layout != nullptr) {
     *request.mutable_output_layout() = *output_layout;
   }
+  for (const auto& param : parameters) {
+    *request.add_parameters() = param.ToProto();
+  }
 
   ComputeConstantResponse response;
 
@@ -1432,6 +1504,34 @@ ComputationDataHandle ComputationBuilder::While(
   return ParseOpResponse(s, &response);
 }
 
+ComputationDataHandle ComputationBuilder::Conditional(
+    const ComputationDataHandle& predicate,
+    const ComputationDataHandle& true_operand,
+    const Computation& true_computation,
+    const ComputationDataHandle& false_operand,
+    const Computation& false_computation) {
+  if (!first_error_.ok() || !PrepareComputation().ok()) {
+    return ComputationDataHandle();
+  }
+
+  ConditionalRequest request;
+  *request.mutable_predicate() = predicate;
+  *request.mutable_true_operand() = true_operand;
+  *request.mutable_true_computation() = true_computation.handle();
+  *request.mutable_false_operand() = false_operand;
+  *request.mutable_false_computation() = false_computation.handle();
+  OpRequest op_request;
+  *op_request.mutable_computation() = computation_.handle();
+  *op_request.mutable_conditional_request() = request;
+  AddCommonFieldsToOpRequest(&op_request);
+  OpResponse response;
+
+  VLOG(2) << "making conditional op request";
+  Status s = client_->stub()->Op(&op_request, &response);
+
+  return ParseOpResponse(s, &response);
+}
+
 ComputationDataHandle ComputationBuilder::Reduce(
     const ComputationDataHandle& operand,
     const ComputationDataHandle& init_value, const Computation& computation,
@@ -1811,25 +1911,27 @@ ComputationBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
   dimension_numbers.set_kernel_input_feature_dimension(
       kConvKernelInputDimension);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    dimension_numbers.add_spatial_dimensions(i + 2);
+    dimension_numbers.add_input_spatial_dimensions(i + 2);
     dimension_numbers.add_kernel_spatial_dimensions(i + 2);
+    dimension_numbers.add_output_spatial_dimensions(i + 2);
   }
   return dimension_numbers;
 }
 
 /* static */ StatusOr<ConvolutionDimensionNumbers>
 ComputationBuilder::CreateConvDimensionNumbers(
-    int64 input_batch, int64 input_feature, int64 output_batch,
-    int64 output_feature, int64 first_spatial, int64 second_spatial,
+    int64 input_batch, int64 input_feature, int64 input_first_spatial,
+    int64 input_second_spatial, int64 output_batch, int64 output_feature,
+    int64 output_first_spatial, int64 output_second_spatial,
     int64 kernel_output_feature, int64 kernel_input_feature,
     int64 kernel_first_spatial, int64 kernel_second_spatial) {
-  if (std::set<int64>(
-          {input_batch, input_feature, first_spatial, second_spatial})
+  if (std::set<int64>({input_batch, input_feature, input_first_spatial,
+                       input_second_spatial})
           .size() != 4) {
     return FailedPrecondition(
         "dimension numbers for the input are not unique: (%lld, %lld, %lld, "
         "%lld)",
-        input_batch, input_feature, first_spatial, second_spatial);
+        input_batch, input_feature, input_first_spatial, input_second_spatial);
   }
   if (std::set<int64>({kernel_output_feature, kernel_input_feature,
                        kernel_first_spatial, kernel_second_spatial})
@@ -1840,25 +1942,28 @@ ComputationBuilder::CreateConvDimensionNumbers(
         kernel_output_feature, kernel_input_feature, kernel_first_spatial,
         kernel_second_spatial);
   }
-  if (std::set<int64>(
-          {output_batch, output_feature, first_spatial, second_spatial})
+  if (std::set<int64>({output_batch, output_feature, output_first_spatial,
+                       output_second_spatial})
           .size() != 4) {
     return FailedPrecondition(
         "dimension numbers for the output are not unique: (%lld, %lld, %lld, "
         "%lld)",
-        output_batch, output_feature, first_spatial, second_spatial);
+        output_batch, output_feature, output_first_spatial,
+        output_second_spatial);
   }
   ConvolutionDimensionNumbers dimension_numbers;
   dimension_numbers.set_input_batch_dimension(input_batch);
   dimension_numbers.set_input_feature_dimension(input_feature);
-  dimension_numbers.set_output_batch_dimension(output_batch);
-  dimension_numbers.set_output_feature_dimension(output_feature);
-  dimension_numbers.add_spatial_dimensions(first_spatial);
-  dimension_numbers.add_spatial_dimensions(second_spatial);
+  dimension_numbers.add_input_spatial_dimensions(input_first_spatial);
+  dimension_numbers.add_input_spatial_dimensions(input_second_spatial);
   dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature);
   dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature);
   dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial);
   dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial);
+  dimension_numbers.set_output_batch_dimension(output_batch);
+  dimension_numbers.set_output_feature_dimension(output_feature);
+  dimension_numbers.add_output_spatial_dimensions(output_first_spatial);
+  dimension_numbers.add_output_spatial_dimensions(output_second_spatial);
   return dimension_numbers;
 }
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index d282174947970ab13a8b29ba4212d56ceb0c572a..97531cdc750094adeeb2378d53ebc82cced1cbd8 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -68,6 +68,7 @@ class ShardingBuilder {
                          const TileAssignment& tile_assignment) {
     OpSharding result;
     result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
+    *result.mutable_tile_shape() = tile_shape;
     for (int64 dim : tile_assignment.dimensions()) {
       result.add_tile_assignment_dimensions(dim);
     }
@@ -120,23 +121,23 @@ class ComputationBuilder {
   // result, OpMetadata is set on the Computation Builder. All subsequent
   // instructions generated via this Computation Builder will have the same
   // OpMetadata attached until a call to ClearOpMetdata.
-  void SetOpMetadata(const OpMetadata& metadata) {
-    metadata_ = metadata;
-  }
+  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
 
   // Clears the HloMetadata state.
-  void ClearOpMetadata() {
-    metadata_.Clear();
-  }
+  void ClearOpMetadata() { metadata_.Clear(); }
 
-  // Sets an OpDeviceAssignment that will be attached to all instructions
-  // until cleared.
+  // Sets an OpSharding that will be attached to all instructions until cleared.
   void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
 
-  // Clears the device assignment. Ops will be placed according to the default
-  // placement policy.
+  // Clears the sharding. Ops will be sharded according to the default placement
+  // policy.
   void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
 
+  // Returns the OpSharding that will be attached to all instructions.
+  const tensorflow::gtl::optional<OpSharding>& sharding() const {
+    return sharding_;
+  }
+
   // Sets the builder to a mode where it will die immediately when an error is
   // encountered, rather than producing it in a deferred fashion when Build() is
   // called (which is the default).
@@ -392,6 +393,11 @@ class ComputationBuilder {
   ComputationDataHandle Dot(const ComputationDataHandle& lhs,
                             const ComputationDataHandle& rhs);
 
+  // Enqueues a general dot instruction onto the computation.
+  ComputationDataHandle DotGeneral(
+      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
   // Default dimension numbers used for a 2D convolution.
   static constexpr int64 kConvBatchDimension = 0;
   static constexpr int64 kConvFeatureDimension = 1;
@@ -412,8 +418,9 @@ class ComputationBuilder {
   // Creates a ConvolutionDimensionNumbers with the given arguments. Returns an
   // error if either the input or the weight dimension numbers have conflicts.
   static StatusOr<ConvolutionDimensionNumbers> CreateConvDimensionNumbers(
-      int64 input_batch, int64 input_feature, int64 output_batch,
-      int64 output_feature, int64 first_spatial, int64 second_spatial,
+      int64 input_batch, int64 input_feature, int64 input_first_spatial,
+      int64 input_second_spatial, int64 output_batch, int64 output_feature,
+      int64 output_first_spatial, int64 output_second_spatial,
       int64 kernel_output_feature, int64 kernel_input_feature,
       int64 kernel_first_spatial, int64 kernel_second_spatial);
 
@@ -668,6 +675,13 @@ class ComputationBuilder {
   ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
                                            PrimitiveType new_element_type);
 
+  // Enqueues a no-op instruction onto the computation that changes
+  // the element type of the operand array to primitive_type. The
+  // bit-widths of the source and destination element types must be
+  // identical.
+  ComputationDataHandle BitcastConvertType(const ComputationDataHandle& operand,
+                                           PrimitiveType new_element_type);
+
   // Enqueues a float32 reciprocal instruction onto the computation.
   // (float32 is specified as there is an implicit float32 -1.0f constant
   // exponent).
@@ -727,6 +741,13 @@ class ComputationBuilder {
                               const Computation& body,
                               const ComputationDataHandle& init);
 
+  // Enqueues a conditional node onto the computation.
+  ComputationDataHandle Conditional(const ComputationDataHandle& predicate,
+                                    const ComputationDataHandle& true_operand,
+                                    const Computation& true_computation,
+                                    const ComputationDataHandle& false_operand,
+                                    const Computation& false_computation);
+
   // Enqueues a ReducePrecision node onto the computation.
   ComputationDataHandle ReducePrecision(const ComputationDataHandle& operand,
                                         const int exponent_bits,
@@ -742,11 +763,12 @@ class ComputationBuilder {
   ComputationDataHandle Recv(const Shape& shape, const ChannelHandle& handle);
 
   // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on parameters, or on stateful operators such
-  // as `RngNormal` or `Infeed`. Unlike `ComputeConstant`, `IsConstant` tests
-  // whether a computation is a compile-time constant without evaluating the
-  // computation.
-  StatusOr<bool> IsConstant(const ComputationDataHandle& operand);
+  // constant does not depend on parameters with higher index then
+  // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
+  // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
+  // compile-time constant without evaluating the computation.
+  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
+                            int64 num_parameters = 0);
 
   // Normalizes operand across spatial and batch dimensions for each feature.
   //
@@ -791,7 +813,7 @@ class ComputationBuilder {
                                       float epsilon, int64 feature_index);
 
   // Computes the value of a constant indicated by a
-  // ComputationDataHandle.
+  // ComputationDataHandle using a non-optimized interpreter on the host.
   //
   // The operand must be from the computation currently being built -
   // i.e., returned from this builder with no intervening call to
@@ -799,8 +821,11 @@ class ComputationBuilder {
   // that may stop working at any time.
   //
   // The operand must represent a constant value, which in this case
-  // means that it must not statically depend on a parameter to the
-  // computation that is being built.
+  // means that it must not statically depend on any parameter of the
+  // computation that is being built other then the ones specified on the
+  // paramtere list. The parameters in the list will be indexed by their
+  // parameter id property so the number of parameters specified should be at
+  // least as many as the largest used parameter index.
   //
   // `IsConstant` can be used to test whether a computation is a compile-time
   // constant without evaluation it. `ComputeConstant` only succeeds for
@@ -818,7 +843,8 @@ class ComputationBuilder {
   // will be stored using that layout.
   StatusOr<std::unique_ptr<Literal>> ComputeConstant(
       const ComputationDataHandle& operand,
-      const Layout* output_layout = nullptr);
+      const Layout* output_layout = nullptr,
+      tensorflow::gtl::ArraySlice<Literal> parameters = {});
 
   // Returns a new ComputationBuilder whose resultant Computation is used only
   // by this ComputationBuilder. The sub-ComputationBuilder has the same
@@ -1038,6 +1064,33 @@ ComputationDataHandle ComputationBuilder::ConstantR4FromArray4D(
   return ConstantFromArray(values);
 }
 
+// RAII-style object: sets the current sharding assignment in builder on
+// construction, and sets back to the previous assignment on destruction.
+class ScopedShardingAssignment {
+ public:
+  ScopedShardingAssignment(xla::ComputationBuilder* builder,
+                           tensorflow::gtl::optional<OpSharding> sharding)
+      : builder_(builder), prev_sharding_(builder->sharding()) {
+    SetSharding(sharding);
+  }
+
+  ~ScopedShardingAssignment() { SetSharding(prev_sharding_); }
+
+ private:
+  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+    if (sharding.has_value()) {
+      builder_->SetSharding(sharding.value());
+    } else {
+      builder_->ClearSharding();
+    }
+  }
+
+  xla::ComputationBuilder* const builder_;
+  tensorflow::gtl::optional<OpSharding> prev_sharding_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ScopedShardingAssignment);
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index ee3468208792879c3fe4ff5860e434ef5a0c0155..fca2bf2688cd21b44f099da3bae3b890cbb069ab 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -44,6 +44,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index e6645e4941bd04c658b67117bb689f6fdef7dfc1..5f2b55713e342aa3d0251386d57cb52481fe748d 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -48,65 +49,9 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
-    std::vector<std::unique_ptr<Literal>> elements;
-    for (const Shape& element_shape : shape.tuple_shapes()) {
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
-                          MakeFakeLiteral(element_shape));
-      elements.push_back(std::move(element));
-    }
-    return Literal::MakeTupleOwned(std::move(elements));
-  }
-  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
-  std::minstd_rand0 engine;
-  switch (shape.element_type()) {
-    case F32: {
-      std::uniform_real_distribution<float> generator(0.0f, 1.0f);
-      TF_CHECK_OK(literal->Populate<float>(
-          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
-          }));
-      break;
-    }
-    case S32: {
-      std::uniform_int_distribution<int32> generator(
-          std::numeric_limits<int32>::lowest(),
-          std::numeric_limits<int32>::max());
-      TF_CHECK_OK(literal->Populate<int32>(
-          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
-          }));
-      break;
-    }
-    case S64: {
-      std::uniform_int_distribution<int64> generator(
-          std::numeric_limits<int64>::lowest(),
-          std::numeric_limits<int64>::max());
-      TF_CHECK_OK(literal->Populate<int64>(
-          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
-          }));
-      break;
-    }
-    case PRED: {
-      std::uniform_int_distribution<int> generator(0, 1);
-      TF_CHECK_OK(literal->Populate<bool>(
-          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
-            return generator(engine);
-          }));
-      break;
-    }
-    default:
-      return Unimplemented("Unsupported type for fake literal generation: %s",
-                           ShapeUtil::HumanString(shape).c_str());
-  }
-  return std::move(literal);
-}
-
 std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
                                               Client* client) {
-  if (ShapeUtil::ByteSizeOf(shape) < (1LL << 30)) {
+  if (ShapeUtil::ByteSizeOf(shape) < (1LL << 20)) {
     StatusOr<std::unique_ptr<Literal>> literal_status = MakeFakeLiteral(shape);
     if (!literal_status.ok()) {
       // If we got an Unimplemented error, fall back to making the fake data via
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index b5c4393dcc3e37c03a5b0e1a806b0f8b07a132ed..7e640d1307edcc3e2c021f4391c456f578a015ee 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -26,10 +26,6 @@ limitations under the License.
 
 namespace xla {
 
-// Generates fake data in a literal of the given shape, or returns an error
-// status if the element type is currently unhandled for fake data generation.
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
-
 // Generates fake data of the given shape on the device or dies. The fake data
 // is created by performing a computation on the device rather than transferring
 // data from the host to the device.
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 15c744ecd349e91dc703bec5708d78a896f132c3..b051955f0fd85b7ca886bc0238068aeb94427209 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -27,16 +27,6 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
-ExecutableBuildOptions& ExecutableBuildOptions::set_platform(
-    perftools::gputools::Platform* platform) {
-  platform_ = platform;
-  return *this;
-}
-
-perftools::gputools::Platform* ExecutableBuildOptions::platform() const {
-  return platform_;
-}
-
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
     int device_ordinal) {
   device_ordinal_ = device_ordinal;
@@ -56,16 +46,6 @@ const Shape* ExecutableBuildOptions::result_layout() const {
   return result_layout_set_ ? &result_layout_ : nullptr;
 }
 
-ExecutableBuildOptions& ExecutableBuildOptions::set_has_hybrid_result(
-    bool has_hybrid_result) {
-  has_hybrid_result_ = has_hybrid_result;
-  return *this;
-}
-
-bool ExecutableBuildOptions::has_hybrid_result() const {
-  return has_hybrid_result_;
-}
-
 namespace {
 StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
                                                    Backend* backend) {
@@ -230,9 +210,9 @@ tensorflow::Status LocalExecutable::RecordArguments(
     SessionModule* session_module) {
   session_module->clear_arguments();
   for (const ShapedBuffer* argument : arguments) {
-    Literal literal;
-    TF_RETURN_IF_ERROR(LiteralFromShapedBuffer(*argument, &literal));
-    *session_module->add_arguments() = literal.ToProto();
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                        LiteralFromShapedBuffer(*argument));
+    *session_module->add_arguments() = literal->ToProto();
   }
   return Status::OK();
 }
@@ -240,21 +220,19 @@ tensorflow::Status LocalExecutable::RecordArguments(
 tensorflow::Status LocalExecutable::RecordResult(
     const ShapedBuffer* result, SessionModule* session_module) {
   session_module->clear_result();
-  Literal literal(session_module->result());
-  TF_RETURN_IF_ERROR(LiteralFromShapedBuffer(*result, &literal));
-  *session_module->mutable_result() = literal.ToProto();
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
+                      LiteralFromShapedBuffer(*result));
+  *session_module->mutable_result() = literal->ToProto();
   return Status::OK();
 }
 
-// TODO(dnovillo) Change signature to return StatusOr<Literal>.
-tensorflow::Status LocalExecutable::LiteralFromShapedBuffer(
-    const ShapedBuffer& shaped_buffer, Literal* literal) {
+StatusOr<std::unique_ptr<Literal>> LocalExecutable::LiteralFromShapedBuffer(
+    const ShapedBuffer& shaped_buffer) {
   TF_ASSIGN_OR_RETURN(
       se::StreamExecutor * executor,
       backend_->stream_executor(shaped_buffer.device_ordinal()));
-  return backend_->transfer_manager()->TransferLiteralFromDevice(
-      executor, shaped_buffer.buffer({}), shaped_buffer.shape(),
-      shaped_buffer.shape(), literal);
+  return backend_->transfer_manager()->TransferLiteralFromDevice(executor,
+                                                                 shaped_buffer);
 }
 
 se::Platform* LocalClient::platform() const {
@@ -297,9 +275,6 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
                                         device_ordinal, options));
 }
 
-// Copy the literal data to the device with the given ordinal and return as a
-// ScopedShapedBuffer. The given memory allocator is used for device memory
-// allocation.
 StatusOr<std::unique_ptr<ScopedShapedBuffer>>
 LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
                                    DeviceMemoryAllocator* allocator) {
@@ -308,46 +283,42 @@ LocalClient::LiteralToShapedBuffer(const Literal& literal, int device_ordinal,
   }
   TF_ASSIGN_OR_RETURN(
       auto scoped_buffer,
-      ScopedShapedBuffer::Allocate(literal.shape(), allocator, device_ordinal));
+      ScopedShapedBuffer::Allocate(
+          literal.shape(), allocator, device_ordinal,
+          [this](const Shape& shape) {
+            return backend().transfer_manager()->GetByteSizeRequirement(shape);
+          }));
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
                       backend().stream_executor(device_ordinal));
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      literal.shape(), [&](const Shape& subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsArray(subshape)) {
-          // This is a leaf of the shape. Transfer the literal array data to the
-          // device buffer.
-          return backend().transfer_manager()->TransferLiteralToDevice(
-              executor, literal.GetSubliteral(index),
-              scoped_buffer->mutable_buffer(index));
-        }
-        return Status::OK();
-      }));
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+      executor, literal, *scoped_buffer));
   return std::move(scoped_buffer);
 }
 
-// Copy the data from the device contained in the given ShapedBuffer and
-// return as a Literal.
 StatusOr<std::unique_ptr<Literal>> LocalClient::ShapedBufferToLiteral(
     const ShapedBuffer& shaped_buffer) {
-  std::unique_ptr<Literal> literal =
-      Literal::CreateFromShape(shaped_buffer.shape());
   TF_ASSIGN_OR_RETURN(
       se::StreamExecutor * executor,
       backend().stream_executor(shaped_buffer.device_ordinal()));
-  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
-      literal->shape(), [&](const Shape& subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsArray(subshape)) {
-          // This is a leaf of the shape. Transfer the device buffer into the
-          // literal. The layout of the literal and the device buffer are
-          // necessarily the same so we pass 'subshape' for both device and
-          // literal shapes.
-          return backend().transfer_manager()->TransferLiteralFromDevice(
-              executor, shaped_buffer.buffer(index),
-              /*device_shape=*/subshape,
-              /*literal_shape*/ subshape, &literal->GetSubliteral(index));
-        }
-        return Status::OK();
-      }));
+  return backend().transfer_manager()->TransferLiteralFromDevice(executor,
+                                                                 shaped_buffer);
+}
+
+Status LocalClient::TransferToInfeedLocal(const Literal& literal,
+                                          int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                      backend().stream_executor(device_ordinal));
+  return backend().transfer_manager()->TransferLiteralToInfeed(executor,
+                                                               literal);
+}
+
+StatusOr<std::unique_ptr<Literal>> LocalClient::TransferFromOutfeedLocal(
+    const Shape& shape, int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
+                      backend().stream_executor(device_ordinal));
+  auto literal = MakeUnique<Literal>();
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralFromOutfeed(
+      executor, shape, literal.get()));
   return std::move(literal);
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 9f985ed5275815de2d59f6caedbbcc8060420a13..3ca0d2ef5513cfb6b0dbfbc63b311f81a318356e 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -37,14 +37,6 @@ namespace xla {
 // LocalClient::Compile.
 class ExecutableBuildOptions {
  public:
-  // If set, this is the platform to build the computation for. This must match
-  // the underlying platform of the service. A value of nullptr indicates the
-  // option has not been set.
-  //
-  // TODO(b/28616830): Support multiple platforms.
-  ExecutableBuildOptions& set_platform(perftools::gputools::Platform* platform);
-  perftools::gputools::Platform* platform() const;
-
   // If set, this is the device to build the computation for. Valid
   // device_ordinal values are: 0 to # of devices - 1. These values are
   // identical to the device ordinal values used by StreamExecutor. The built
@@ -61,18 +53,10 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
   const Shape* result_layout() const;
 
-  // If set, the executable will be built to output a hybrid
-  // ShapedBuffer with top-level tuple pointers in host memory and
-  // result buffers in device memory.
-  ExecutableBuildOptions& set_has_hybrid_result(bool has_hybrid_result);
-  bool has_hybrid_result() const;
-
  private:
-  perftools::gputools::Platform* platform_ = nullptr;
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
-  bool has_hybrid_result_ = true;
 };
 
 class LocalExecutable {
@@ -129,9 +113,9 @@ class LocalExecutable {
   tensorflow::Status RecordResult(const ShapedBuffer* result,
                                   SessionModule* session_module);
 
-  // Copies the contents of a ShapedBuffer into a Literal proto.
-  tensorflow::Status LiteralFromShapedBuffer(const ShapedBuffer& shaped_buffer,
-                                             Literal* literal);
+  // Returns a literal containing the contents of the given ShapedBuffer.
+  StatusOr<std::unique_ptr<Literal>> LiteralFromShapedBuffer(
+      const ShapedBuffer& shaped_buffer);
 
   // Compiled computation.
   std::unique_ptr<Executable> executable_;
@@ -178,6 +162,20 @@ class LocalClient : public Client {
   StatusOr<std::unique_ptr<Literal>> ShapedBufferToLiteral(
       const ShapedBuffer& shaped_buffer);
 
+  // Transfer the given literal to the infeed queue of the given device.
+  // TODO(b/69670845): Remove the 'Local' from the name when LocalClient does
+  // not inherit from Client and there is no possibility of confusion with
+  // Client::TransferToInfeed.
+  Status TransferToInfeedLocal(const Literal& literal, int device_ordinal);
+
+  // Transfer and return a value of the given shape from the outfeed of the
+  // given device.
+  // TODO(b/69670845): Remove the 'Local' from the name when LocalClient does
+  // not inherit from Client and there is no possibility of confusion with
+  // Client::TransferFromOutfeed.
+  StatusOr<std::unique_ptr<Literal>> TransferFromOutfeedLocal(
+      const Shape& shape, int device_ordinal);
+
   // Returns the platform that the underlying service targets.
   perftools::gputools::Platform* platform() const;
 
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index f2cdd9669c727bb778fce495ede0faaf2d9a923d..bfafef0a40f55e13ac94b2d1750df25146081784 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -31,7 +31,6 @@ std::vector<tensorflow::Flag>* flag_objects;
 std::once_flag flags_init;
 
 void SetDebugOptionsDefaults(DebugOptions* flags) {
-  flags->set_xla_hlo_graph_path("/tmp/");
   flags->set_xla_enable_fast_math(true);
   flags->set_xla_llvm_enable_alias_scope_metadata(true);
   flags->set_xla_llvm_enable_noalias_metadata(true);
@@ -117,9 +116,22 @@ void AllocateFlags() {
            bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
            flag_values->xla_hlo_dump_as_graphdef(),
            "Dump HLO graphs as TensorFlow GraphDefs."),
+       tensorflow::Flag(
+           "xla_hlo_graph_sharding_color",
+           bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
+           flag_values->xla_hlo_graph_sharding_color(),
+           "Assign colors based on sharding assignments when generating the "
+           "HLO graphs."),
+       tensorflow::Flag(
+           "xla_hlo_tfgraph_device_scopes",
+           bool_setter_for(&DebugOptions::set_xla_hlo_tfgraph_device_scopes),
+           flag_values->xla_hlo_tfgraph_device_scopes(),
+           "When generating TensorFlow HLO graphs, if the HLO instructions "
+           "are assigned to a specific device, prefix the name scope with "
+           "\"devX\" with X being the device ordinal."),
        tensorflow::Flag(
            "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
-           "HLO modules matching this regex will be dumped to LOG(INFO). "),
+           "HLO modules matching this regex will be dumped to LOG(INFO)."),
        tensorflow::Flag(
            "xla_generate_hlo_text_to",
            flag_values->mutable_xla_generate_hlo_text_to(),
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 8fc8644a60ef62d7ba5e7f0cc11253742395f09b..42c9d21149a41a3d60f2cfff65d3af08d7c8b9d7 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -33,6 +33,20 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+namespace {
+using tensorflow::int64;
+
+constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
+
+// Converts between little and big endian, assuming elements in the array are 16
+// bits long.
+void ConvertEndianShort(char* bytes, int64 size) {
+  CHECK_EQ(size / 2, 0);
+  for (int64 i = 0; i < size; i += 2) {
+    std::swap(bytes[i], bytes[i + 1]);
+  }
+}
+}  // namespace
 
 namespace xla {
 
@@ -169,6 +183,8 @@ Status Literal::Copy(const Literal& src_literal,
       return CopyRange<int64>(src_literal, src_base, dest_base, copy_size);
     case F16:
       return CopyRange<half>(src_literal, src_base, dest_base, copy_size);
+    case BF16:
+      return CopyRange<bfloat16>(src_literal, src_base, dest_base, copy_size);
     case F32:
       return CopyRange<float>(src_literal, src_base, dest_base, copy_size);
     case F64:
@@ -200,6 +216,8 @@ Status Literal::Copy(const Literal& src_literal,
       return *Literal::CreateR0<int64>(0);
     case F16:
       return *Literal::CreateR0<half>(static_cast<half>(0.0f));
+    case BF16:
+      return *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(0.0f));
     case F32:
       return *Literal::CreateR0<float>(0);
     case F64:
@@ -234,6 +252,10 @@ Status Literal::Copy(const Literal& src_literal,
       return *Literal::CreateR0<int32>(1);
     case S64:
       return *Literal::CreateR0<int64>(1);
+    case F16:
+      return *Literal::CreateR0<half>(static_cast<half>(1.0f));
+    case BF16:
+      return *Literal::CreateR0<bfloat16>(static_cast<bfloat16>(1.0f));
     case F32:
       return *Literal::CreateR0<float>(1);
     case F64:
@@ -245,8 +267,6 @@ Status Literal::Copy(const Literal& src_literal,
     case S16:
     case U16:
       LOG(FATAL) << "u16/s16 literals not yet implemented";
-    case F16:
-      return *Literal::CreateR0<half>(static_cast<half>(1.0f));
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 1";
     case OPAQUE:
@@ -285,6 +305,9 @@ Status Literal::Copy(const Literal& src_literal,
     case F16:
       return *Literal::CreateR0<half>(
           static_cast<half>(-std::numeric_limits<float>::infinity()));
+    case BF16:
+      return *Literal::CreateR0<bfloat16>(
+          static_cast<bfloat16>(-std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no minimum value";
     case OPAQUE:
@@ -321,6 +344,9 @@ Status Literal::Copy(const Literal& src_literal,
     case F16:
       return *Literal::CreateR0<half>(
           static_cast<half>(std::numeric_limits<float>::infinity()));
+    case BF16:
+      return *Literal::CreateR0<bfloat16>(
+          static_cast<bfloat16>(std::numeric_limits<float>::infinity()));
     case TUPLE:
       LOG(FATAL) << "tuple element type has no maximum value";
     case OPAQUE:
@@ -428,6 +454,7 @@ std::unique_ptr<Literal> Literal::Transpose(
   // The shape with affine layout resulting from that operation will be
   // F32[8,11]{0,1}, since it leaves the original most minor (the 8 sized), the
   // most minor.
+  //
   // Essentially, given MinMaj(Di) the position of the Di dimension within the
   // minor to major vector, and given T(Di) the index that the original Di
   // dimension has within the transposed array, a layout is affine if
@@ -536,6 +563,9 @@ string Literal::GetAsString(
     }
     case F16:
       return tensorflow::strings::StrCat(Get<half>(multi_index));
+    case BF16:
+      return tensorflow::strings::StrCat(
+          static_cast<float>(Get<bfloat16>(multi_index)));
     default:
       return tensorflow::strings::StrCat(
           "[", PrimitiveType_Name(shape().element_type()), "]");
@@ -569,9 +599,17 @@ int64 Literal::LinearIndex(
   return IndexUtil::MultidimensionalIndexToLinearIndex(shape(), multi_index);
 }
 
-string Literal::ToString() const {
+string Literal::ToString(bool print_layout) const {
   std::vector<string> pieces;
 
+  auto shape_to_string = [print_layout](const Shape& shape) {
+    if (print_layout) {
+      return ShapeUtil::HumanStringWithLayout(shape);
+    } else {
+      return ShapeUtil::HumanString(shape);
+    }
+  };
+
   auto element_to_string =
       [this](tensorflow::gtl::ArraySlice<int64> indices) -> string {
     PrimitiveType element_type = shape().element_type();
@@ -585,13 +623,13 @@ string Literal::ToString() const {
 
   // TODO(b/32894291): refactor this code to reduce code duplication.
   if (ShapeUtil::IsTuple(shape())) {
-    pieces.push_back(ShapeUtil::HumanString(shape()));
+    pieces.push_back(shape_to_string(shape()));
     pieces.push_back(" (\n");
-    for (const auto& element_literal : tuple_literals()) {
-      pieces.push_back(element_literal.ToString());
-      pieces.push_back(",\n");
-    }
-    pieces.push_back(")");
+    pieces.push_back(tensorflow::str_util::Join(
+        tuple_literals(), ",\n", [](string* out, const Literal& element) {
+          tensorflow::strings::StrAppend(out, element.ToString());
+        }));
+    pieces.push_back("\n)");
   } else if (ShapeUtil::Rank(shape()) == 0) {
     pieces.push_back(GetAsString({}));
   } else if (ShapeUtil::Rank(shape()) == 1) {
@@ -601,7 +639,7 @@ string Literal::ToString() const {
     }
     pieces.push_back("}");
   } else if (ShapeUtil::Rank(shape()) == 2) {
-    pieces.push_back(ShapeUtil::HumanString(shape()));
+    pieces.push_back(shape_to_string(shape()));
     pieces.push_back(" {\n");
     for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
       pieces.push_back("  { ");
@@ -609,11 +647,11 @@ string Literal::ToString() const {
         pieces.push_back(element_to_string({i0, i1}));
       }
       pieces.push_back(" ");
-      pieces.push_back("},\n");
+      pieces.push_back(i0 == shape().dimensions(0) - 1 ? "}\n" : "},\n");
     }
     pieces.push_back("}");
   } else if (ShapeUtil::Rank(shape()) == 3) {
-    pieces.push_back(ShapeUtil::HumanString(shape()));
+    pieces.push_back(shape_to_string(shape()));
     pieces.push_back(" {\n");
     for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
       pieces.push_back(i0 > 0 ? ",\n{" : "{");
@@ -628,53 +666,62 @@ string Literal::ToString() const {
     }
     pieces.push_back("\n}");
   } else if (ShapeUtil::Rank(shape()) == 4) {
-    pieces.push_back(ShapeUtil::HumanString(shape()));
+    pieces.push_back(shape_to_string(shape()));
     pieces.push_back(" {\n");
     for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back(tensorflow::strings::Printf("  {  // i0=%lld\n", i0));
+      pieces.push_back(tensorflow::strings::Printf("  {  /*i0=%lld*/\n", i0));
       for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) {
         pieces.push_back(
-            tensorflow::strings::Printf("    {  // i1=%lld\n", i1));
+            tensorflow::strings::Printf("    {  /*i1=%lld*/\n", i1));
         for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) {
           pieces.push_back("      {");
           for (int64 i3 = 0; i3 < shape().dimensions(3); ++i3) {
             pieces.push_back(element_to_string({i0, i1, i2, i3}));
           }
-          pieces.push_back("},\n");
+          pieces.push_back(i2 == shape().dimensions(2) - 1 ? "}\n" : "},\n");
         }
-        pieces.push_back("    },\n");
+        pieces.push_back(i1 == shape().dimensions(1) - 1 ? "    }\n"
+                                                         : "    },\n");
       }
-      pieces.push_back("  },\n");
+      pieces.push_back(i0 == shape().dimensions(0) - 1 ? "  }\n" : "  },\n");
     }
     pieces.push_back("}");
   } else if (ShapeUtil::Rank(shape()) == 5) {
-    pieces.push_back(ShapeUtil::HumanString(shape()));
+    pieces.push_back(shape_to_string(shape()));
     pieces.push_back(" {\n");
     for (int64 i0 = 0; i0 < shape().dimensions(0); ++i0) {
-      pieces.push_back(tensorflow::strings::Printf("  {  // i0=%lld\n", i0));
+      pieces.push_back(tensorflow::strings::Printf("  {  /*i0=%lld*/\n", i0));
       for (int64 i1 = 0; i1 < shape().dimensions(1); ++i1) {
         pieces.push_back(
-            tensorflow::strings::Printf("    {  // i1=%lld\n", i1));
+            tensorflow::strings::Printf("    {  /*i1=%lld*/\n", i1));
         for (int64 i2 = 0; i2 < shape().dimensions(2); ++i2) {
           pieces.push_back(
-              tensorflow::strings::Printf("      {  // i2=%lld\n", i2));
+              tensorflow::strings::Printf("      {  /*i2=%lld*/\n", i2));
           for (int64 i3 = 0; i3 < shape().dimensions(3); ++i3) {
             pieces.push_back("        {");
             for (int64 i4 = 0; i4 < shape().dimensions(4); ++i4) {
               pieces.push_back(element_to_string({i0, i1, i2, i3, i4}));
             }
-            pieces.push_back("},\n");
+            pieces.push_back(i3 == shape().dimensions(3) - 1 ? "}\n" : "},\n");
           }
-          pieces.push_back("      },\n");
+          pieces.push_back(i2 == shape().dimensions(2) - 1 ? "      }\n"
+                                                           : "      },\n");
         }
-        pieces.push_back("    },\n");
+        pieces.push_back(i1 == shape().dimensions(1) - 1 ? "    }\n"
+                                                         : "    },\n");
       }
-      pieces.push_back("  },\n");
+      pieces.push_back(i0 == shape().dimensions(0) - 1 ? "  }\n" : "  },\n");
     }
     pieces.push_back("}");
   } else {
-    pieces.push_back(ShapeUtil::HumanString(shape()));
-    pieces.push_back(" {...}");
+    pieces.push_back(shape_to_string(shape()));
+    pieces.push_back(" {");
+    EachCellAsString(
+        [&](tensorflow::gtl::ArraySlice<int64> indices, const string& value) {
+          pieces.push_back(" ");
+          pieces.push_back(value);
+        });
+    pieces.push_back("}");
   }
 
   return tensorflow::str_util::Join(pieces, "");
@@ -732,6 +779,8 @@ void* Literal::MutableInternalData() {
       return reinterpret_cast<void*>(c64s_.data());
     case F16:
       return reinterpret_cast<void*>(f16s_.data());
+    case BF16:
+      return reinterpret_cast<void*>(bf16s_.data());
     default:
       LOG(FATAL) << "primitive type not supported in literals: "
                  << PrimitiveType_Name(shape().element_type());
@@ -774,6 +823,9 @@ void Literal::Reserve(int64 num_elements) {
     case F16:
       Resize<half>(num_elements, static_cast<half>(0.0f));
       break;
+    case BF16:
+      Resize<bfloat16>(num_elements, static_cast<bfloat16>(0.0f));
+      break;
     default:
       LOG(FATAL) << "primitive type not supported in literals: "
                  << PrimitiveType_Name(shape().element_type());
@@ -813,6 +865,9 @@ tensorflow::Status Literal::ValidateLiteral() const {
     case F16:
       actual = f16s().size() / sizeof(half);
       break;
+    case BF16:
+      actual = bf16s().size();
+      break;
     default:
       return tensorflow::errors::Unimplemented(
           "unhandled element type for literal validation: " +
@@ -909,6 +964,7 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
     CONVERT_IF_TYPES_MATCH(F16)
     CONVERT_IF_TYPES_MATCH(F32)
     CONVERT_IF_TYPES_MATCH(F64)
+    CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
       return ConvertToC64<primitive_src_type>(src_literal);
@@ -938,8 +994,9 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
     CONVERT_IF_DEST_TYPE_MATCHES(F16)
     CONVERT_IF_DEST_TYPE_MATCHES(F32)
     CONVERT_IF_DEST_TYPE_MATCHES(F64)
+    CONVERT_IF_DEST_TYPE_MATCHES(BF16)
 #undef CONVERT_IF_DEST_TYPE_MATCHES
-    // Other types are not yet supported.
+      // Other types are not yet supported.
     default:
       return InvalidArgument("Unimplemented: Convert from type %s to type %s",
                              PrimitiveType_Name(shape().element_type()).c_str(),
@@ -1008,6 +1065,8 @@ bool Literal::operator==(const Literal& other) const {
         return EqualElements<double>(*this, other, 0, &multi_index);
       case F16:
         return EqualElements<half>(*this, other, 0, &multi_index);
+      case BF16:
+        return EqualElements<bfloat16>(*this, other, 0, &multi_index);
       case C64:
         return EqualElements<complex64>(*this, other, 0, &multi_index);
       default:
@@ -1117,13 +1176,18 @@ tensorflow::gtl::MutableArraySlice<complex64> Literal::GetMutableArraySlice() {
 
 template <>
 tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice<half>() {
-  // TODO - there is an endianess problem here. fix it, or wait for uint16
-  //        support in protobuf
   auto values = mutable_f16s();
   return tensorflow::gtl::MutableArraySlice<half>(values->data(),
                                                   values->size());
 }
 
+template <>
+tensorflow::gtl::MutableArraySlice<bfloat16>
+Literal::GetMutableArraySlice<bfloat16>() {
+  auto values = mutable_bf16s();
+  return {values->data(), values->size()};
+}
+
 template <>
 tensorflow::gtl::ArraySlice<bool> Literal::GetArraySlice<bool>() const {
   CHECK_EQ(shape().element_type(), PRED);
@@ -1194,6 +1258,12 @@ tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const {
                                            f16s().size() / sizeof(half));
 }
 
+template <>
+tensorflow::gtl::ArraySlice<bfloat16> Literal::GetArraySlice<bfloat16>() const {
+  CHECK_EQ(shape().element_type(), BF16);
+  return {bf16s().data(), bf16s().size()};
+}
+
 template <>
 tensorflow::gtl::ArraySlice<complex64> Literal::GetArraySlice<complex64>()
     const {
@@ -1242,6 +1312,9 @@ bool Literal::IsAll(int8 value) const {
       return AllElementsEqualValue<double>(*this, value);
     case F16:
       return AllElementsEqualValue<half>(*this, static_cast<half>(value));
+    case BF16:
+      return AllElementsEqualValue<bfloat16>(*this,
+                                             static_cast<bfloat16>(value));
     case PRED:
       if (value == 0) {
         return AllElementsEqualValue<bool>(*this, false);
@@ -1263,6 +1336,9 @@ bool Literal::IsAllFloat(float value) const {
       return AllElementsEqualValue<double>(*this, value);
     case F16:
       return AllElementsEqualValue<half>(*this, static_cast<half>(value));
+    case BF16:
+      return AllElementsEqualValue<bfloat16>(*this,
+                                             static_cast<bfloat16>(value));
     default:
       return false;
   }
@@ -1299,6 +1375,8 @@ bool Literal::IsZero(tensorflow::gtl::ArraySlice<int64> indices) const {
       return Get<complex64>(indices) == complex64(0.0f, 0.0f);
     case F16:
       return Get<half>(indices) == static_cast<half>(0.0f);
+    case BF16:
+      return Get<bfloat16>(indices) == static_cast<bfloat16>(0.0f);
     case PRED:
       return Get<bool>(indices) == false;
     default:
@@ -1366,6 +1444,12 @@ void Literal::Resize<half>(int64 num_elements, half value) {
   mutable_f16s()->resize(num_elements, value);
 }
 
+template <>
+void Literal::Resize<bfloat16>(int64 num_elements, bfloat16 value) {
+  CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
+  mutable_bf16s()->resize(num_elements, value);
+}
+
 template <>
 void Literal::Resize<complex64>(int64 num_elements, complex64 value) {
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), num_elements);
@@ -1414,6 +1498,19 @@ LiteralProto Literal::ToProto() const {
       *proto.mutable_f16s() =
           string(reinterpret_cast<const char*>(f16s_.data()),
                  f16s_.size() * sizeof(half));
+      if (!kLittleEndian) {
+        ConvertEndianShort(const_cast<char*>(proto.mutable_f16s()->data()),
+                           proto.f16s().size());
+      }
+      break;
+    case BF16:
+      *proto.mutable_bf16s() =
+          string(reinterpret_cast<const char*>(bf16s_.data()),
+                 bf16s_.size() * sizeof(bfloat16));
+      if (!kLittleEndian) {
+        ConvertEndianShort(const_cast<char*>(proto.mutable_bf16s()->data()),
+                           proto.bf16s().size());
+      }
       break;
     case F32:
       CopyToRepeatedField(proto.mutable_f32s(), f32s());
@@ -1482,6 +1579,21 @@ void Literal::CopyFromProto(const LiteralProto& literal_proto) {
       CHECK_EQ(0, s.size() % sizeof(half));
       f16s_ = std::vector<half>(s.size() / sizeof(half));
       memcpy(f16s_.data(), s.data(), s.size());
+
+      if (!kLittleEndian) {
+        ConvertEndianShort(reinterpret_cast<char*>(f16s_.data()), s.size());
+      }
+      break;
+    }
+    case BF16: {
+      const string& s(literal_proto.bf16s());
+      CHECK_EQ(0, s.size() % sizeof(bfloat16));
+      bf16s_ = std::vector<bfloat16>(s.size() / sizeof(bfloat16));
+      memcpy(bf16s_.data(), s.data(), s.size());
+
+      if (!kLittleEndian) {
+        ConvertEndianShort(reinterpret_cast<char*>(bf16s_.data()), s.size());
+      }
       break;
     }
     case F32:
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index a1e288829f22835f94c6e3c041796f84d995211c..2981f9f8753a60f7acb7e3c6bf86f2b9da4c96d8 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -99,6 +99,7 @@ class Literal {
     f16s_.clear();
     f32s_.clear();
     f64s_.clear();
+    c64s_.clear();
     tuple_literals_.clear();
   }
 
@@ -163,6 +164,11 @@ class Literal {
   const std::vector<complex64>& c64s() const { return c64s_; }
   std::vector<complex64>* mutable_c64s() { return &c64s_; }
 
+  int bf16s_size() const { return bf16s().size(); }
+  bfloat16 bf16s(int i) const { return bf16s_[i]; }
+  const std::vector<bfloat16>& bf16s() const { return bf16s_; }
+  std::vector<bfloat16>* mutable_bf16s() { return &bf16s_; }
+
   int tuple_literals_size() const { return tuple_literals().size(); }
   const Literal& tuple_literals(int i) const { return tuple_literals_[i]; }
   Literal* add_tuple_literals() {
@@ -280,11 +286,11 @@ class Literal {
   std::unique_ptr<Literal> Relayout(const Layout& new_layout,
                                     const ShapeIndex& shape_index = {}) const;
 
-  // Creates a new literal by reshaping this literal to have 'shape'. Both the
-  // original shape and 'shape' must contain the same number of elements. The
+  // Creates a new literal by reshaping this literal to have the given
+  // dimensions. The total number of elements must not change; The
   // implementation currently only supports monotonic dim0-major layouts.
   StatusOr<std::unique_ptr<Literal>> Reshape(
-      tensorflow::gtl::ArraySlice<int64> shape) const;
+      tensorflow::gtl::ArraySlice<int64> dimensions) const;
 
   // Creates a new literal by reordering the dimensions of this literal.
   // The given `permutation` must be a permutation of the dimension numbers
@@ -450,7 +456,7 @@ class Literal {
   tensorflow::Status ValidateLiteral() const;
 
   // Returns a string representation of the literal value.
-  string ToString() const;
+  string ToString(bool print_layout = false) const;
 
   // Invokes the "per cell" callback for each element in the provided
   // literal with the element's indices and a string representation of
@@ -622,6 +628,7 @@ class Literal {
   std::vector<uint16> u16s_;
   std::vector<uint32> u32s_;
   std::vector<uint64> u64s_;
+  std::vector<bfloat16> bf16s_;
   std::vector<half> f16s_;
   std::vector<float> f32s_;
   std::vector<double> f64s_;
@@ -674,6 +681,9 @@ tensorflow::gtl::ArraySlice<double> Literal::GetArraySlice<double>() const;
 template <>
 tensorflow::gtl::ArraySlice<half> Literal::GetArraySlice<half>() const;
 
+template <>
+tensorflow::gtl::ArraySlice<bfloat16> Literal::GetArraySlice<bfloat16>() const;
+
 template <>
 tensorflow::gtl::ArraySlice<complex64> Literal::GetArraySlice<complex64>()
     const;
@@ -714,6 +724,9 @@ tensorflow::gtl::MutableArraySlice<double> Literal::GetMutableArraySlice();
 template <>
 tensorflow::gtl::MutableArraySlice<half> Literal::GetMutableArraySlice();
 
+template <>
+tensorflow::gtl::MutableArraySlice<bfloat16> Literal::GetMutableArraySlice();
+
 template <>
 tensorflow::gtl::MutableArraySlice<complex64> Literal::GetMutableArraySlice();
 
@@ -747,6 +760,9 @@ void Literal::Resize<double>(int64 num_elements, double value);
 template <>
 void Literal::Resize<half>(int64 num_elements, half value);
 
+template <>
+void Literal::Resize<bfloat16>(int64 num_elements, bfloat16 value);
+
 template <>
 void Literal::Resize<complex64>(int64 num_elements, complex64 value);
 
@@ -990,6 +1006,14 @@ inline half Literal::Get<half>(
   return GetArraySlice<half>()[linear_index];
 }
 
+template <>
+inline bfloat16 Literal::Get<bfloat16>(
+    tensorflow::gtl::ArraySlice<int64> multi_index) const {
+  CHECK(shape().element_type() == BF16);
+  int64 linear_index = LinearIndex(multi_index);
+  return GetArraySlice<bfloat16>()[linear_index];
+}
+
 template <typename NativeT>
 void Literal::Set(tensorflow::gtl::ArraySlice<int64> multi_index,
                   NativeT value) {
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index a9af4849e2124fd47ae42cc06ac8cc5ca5a22cb7..7ff64c4134155e7fe22ab99584970a7d6d6e8803 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -110,6 +110,18 @@ TEST_F(LiteralUtilTest, LiteralScalarToString) {
 
   auto c64_lit = Literal::CreateR0<complex64>({3.14f, 2.78f});
   ASSERT_EQ("(3.14, 2.78)", c64_lit->ToString());
+
+  auto bf16_lit = Literal::CreateR0<bfloat16>(static_cast<bfloat16>(0.5f));
+  ASSERT_EQ("0.5", bf16_lit->ToString());
+
+  // 3.14 will be truncated to 3.125 in bfloat16 format.
+  auto bf16_lit_truncated =
+      Literal::CreateR0<bfloat16>(static_cast<bfloat16>(3.14f));
+  ASSERT_EQ("3.125", bf16_lit_truncated->ToString());
+
+  auto bf16_lit_truncated2 =
+      Literal::CreateR0<bfloat16>(static_cast<bfloat16>(9.001f));
+  ASSERT_EQ("9", bf16_lit_truncated2->ToString());
 }
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
@@ -122,7 +134,7 @@ TEST_F(LiteralUtilTest, R2ToString) {
   const string expected = R"(s32[3,2] {
   { 1, 2 },
   { 3, 4 },
-  { 5, 6 },
+  { 5, 6 }
 })";
   ASSERT_EQ(expected, literal->ToString());
 }
@@ -148,8 +160,8 @@ TEST_F(LiteralUtilTest, TupleToString) {
 1,
 f32[2,2] {
   { 1, 2 },
-  { 3, 4 },
-},
+  { 3, 4 }
+}
 ))";
   ASSERT_EQ(expected, tuple->ToString());
 }
@@ -191,18 +203,18 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
   EXPECT_THAT(literal->shape().dimensions(), ElementsAre(1, 2, 3, 2));
   string result = literal->ToString();
   const string expected = R"(f32[1,2,3,2] {
-  {  // i0=0
-    {  // i1=0
+  {  /*i0=0*/
+    {  /*i1=0*/
       {1, 2},
       {1001, 1002},
-      {2001, 2002},
+      {2001, 2002}
     },
-    {  // i1=1
+    {  /*i1=1*/
       {1, 2},
       {1001, 1002},
-      {2001, 2002},
-    },
-  },
+      {2001, 2002}
+    }
+  }
 })";
   ASSERT_EQ(expected, result);
 }
@@ -212,30 +224,30 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
               ElementsAre(2, 2, 3, 3));
   string result = literal_r4_2x2x3x3_dim0major_->ToString();
   const string expected = R"(f32[2,2,3,3] {
-  {  // i0=0
-    {  // i1=0
+  {  /*i0=0*/
+    {  /*i1=0*/
       {1, 2, 3},
       {4, 5, 6},
-      {7, 8, 9},
+      {7, 8, 9}
     },
-    {  // i1=1
+    {  /*i1=1*/
       {11, 12, 13},
       {14, 15, 16},
-      {17, 18, 19},
-    },
+      {17, 18, 19}
+    }
   },
-  {  // i0=1
-    {  // i1=0
+  {  /*i0=1*/
+    {  /*i1=0*/
       {101, 102, 103},
       {104, 105, 106},
-      {107, 108, 109},
+      {107, 108, 109}
     },
-    {  // i1=1
+    {  /*i1=1*/
       {201, 202, 203},
       {204, 205, 206},
-      {207, 208, 209},
-    },
-  },
+      {207, 208, 209}
+    }
+  }
 })";
   ASSERT_EQ(expected, result);
 }
@@ -397,6 +409,18 @@ TEST_F(LiteralUtilTest, IsAll) {
   EXPECT_FALSE(Literal::CreateR2<half>({{h8}, {h9}})->IsAll(8));
   EXPECT_FALSE(Literal::CreateR2<half>({{h9}, {h8}})->IsAll(8));
 
+  bfloat16 b8(8.0f);
+  bfloat16 b9(9.0f);
+
+  EXPECT_TRUE(Literal::CreateR2<bfloat16>({{b8}, {b8}})->IsAll(8));
+  EXPECT_FALSE(Literal::CreateR2<bfloat16>({{b8}, {b9}})->IsAll(8));
+  EXPECT_FALSE(Literal::CreateR2<bfloat16>({{b9}, {b8}})->IsAll(8));
+
+  // 9.001 will be truncated to 9.0
+  bfloat16 b91(9.001f);
+  bfloat16 b90(9.00f);
+  EXPECT_TRUE(Literal::CreateR2<bfloat16>({{b91}, {b90}})->IsAll(9.0));
+
   complex64 c8_9 = {8, 9};
   EXPECT_FALSE(Literal::CreateR2<complex64>({{c8_9}, {c8_9}})->IsAll(8));
 
@@ -491,7 +515,7 @@ TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) {
 
 TEST_F(LiteralUtilTest, ReshapeR0) {
   auto original = Literal::CreateR0<float>(1.7f);
-  auto reshape = original->Reshape(/*shape=*/{}).ConsumeValueOrDie();
+  auto reshape = original->Reshape(/*dimensions=*/{}).ConsumeValueOrDie();
   EXPECT_EQ(*original, *reshape);
 }
 
@@ -691,6 +715,30 @@ TEST_F(LiteralUtilTest, PopulateR2C64) {
   EXPECT_EQ(output, *expected);
 }
 
+TEST_F(LiteralUtilTest, PopulateWithValueR0BF16) {
+  Literal output;
+  bfloat16 h(0.25f);
+  output.PopulateWithValue<bfloat16>(h, {});
+  auto expected = Literal::CreateR0<bfloat16>(h);
+  EXPECT_EQ(output, *expected);
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR1BF16) {
+  Literal output;
+  bfloat16 h(0.5f);
+  output.PopulateWithValue<bfloat16>(h, {3});
+  auto expected = Literal::CreateR1<bfloat16>({h, h, h});
+  EXPECT_EQ(output, *expected);
+}
+
+TEST_F(LiteralUtilTest, PopulateWithValueR2BF16) {
+  Literal output;
+  bfloat16 h(2.0f);
+  output.PopulateWithValue<bfloat16>(h, {2, 2});
+  auto expected = Literal::CreateR2<bfloat16>({{h, h}, {h, h}});
+  EXPECT_EQ(output, *expected);
+}
+
 TEST_F(LiteralUtilTest, PopulateWithValueR0F32) {
   Literal output;
   output.PopulateWithValue<float>(2.5f, {});
@@ -975,6 +1023,14 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
     {{half(26.0), half(0.0), half(28.0), half(0.0)},
      {half(0.0), half(31.0), half(0.0), half(33.0)}},
   }}, layout_r4_dim0major_);
+  auto bf16 = Literal::CreateR4WithLayout<bfloat16>({{
+    {{bfloat16(10.0), bfloat16(0.0), bfloat16(12.0), bfloat16(0.0)},
+     {bfloat16(0.0), bfloat16(15.0), bfloat16(0.0), bfloat16(17.0)}},
+    {{bfloat16(0.0), bfloat16(19.0), bfloat16(0.0), bfloat16(21.0)},
+     {bfloat16(22.0), bfloat16(0.0), bfloat16(24.0), bfloat16(0.0)}},
+    {{bfloat16(26.0), bfloat16(0.0), bfloat16(28.0), bfloat16(0.0)},
+     {bfloat16(0.0), bfloat16(31.0), bfloat16(0.0), bfloat16(33.0)}},
+  }}, layout_r4_dim0major_);
   auto f32 = Literal::CreateR4WithLayout<float>({{
     {{10.0f, 0.0f, 12.0f, 0.0f}, {0.0f, 15.0f, 0.0f, 17.0f}},
     {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}},
@@ -1008,6 +1064,12 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   conv = s8->Convert(PRED).ConsumeValueOrDie();
   EXPECT_EQ(*conv, *pred);
 
+  conv = bf16->Convert(S32).ConsumeValueOrDie();
+  EXPECT_EQ(*conv, *s32);
+
+  conv = bf16->Convert(F32).ConsumeValueOrDie();
+  EXPECT_EQ(*conv, *f32);
+
   conv = pred->Convert(S32).ConsumeValueOrDie();
   EXPECT_EQ(*conv, *int32_pred);
 
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index 2113b5e06f3eb0169be50c0ee731a903c0eece9d..2bce56b7bd2f91f20ea670d0e7ccaa432c2b5f9f 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -78,6 +78,11 @@ PrimitiveType NativeToPrimitiveType<double>() {
   return F64;
 }
 
+template <>
+PrimitiveType NativeToPrimitiveType<bfloat16>() {
+  return BF16;
+}
+
 template <>
 PrimitiveType NativeToPrimitiveType<half>() {
   return F16;
@@ -89,7 +94,7 @@ PrimitiveType NativeToPrimitiveType<complex64>() {
 }
 
 bool IsFloatingPointType(PrimitiveType type) {
-  return type == F16 || type == F32 || type == F64;
+  return type == F16 || type == F32 || type == F64 || type == BF16;
 }
 
 bool IsComplexType(PrimitiveType type) { return type == C64; }
@@ -118,6 +123,7 @@ int BitWidth(PrimitiveType type) {
     case S16:
     case U16:
     case F16:
+    case BF16:
       return 16;
 
     case U32:
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index a49c8b86fcfe156ea3733ce05c0fb7337cf60dce..cb4583d198b454be1432134a9f6a77dbbbe5bdd8 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -26,6 +26,13 @@ limitations under the License.
 namespace xla {
 namespace primitive_util {
 
+// The number of exponent bits in a BF16 value.
+const int kBFloat16ExponentBits = 8;
+
+// The number of mantissa bits in a BF16 value. There is an implicit leading
+// 1, so there is an implicit additional bit of precision.
+const int kBFloat16MantissaBits = 7;
+
 // Returns the XLA primitive type (eg, F32) corresponding to the given
 // template parameter native type (eg, float).
 template <typename NativeT>
@@ -77,6 +84,8 @@ template <>
 PrimitiveType NativeToPrimitiveType<double>();
 template <>
 PrimitiveType NativeToPrimitiveType<half>();
+template <>
+PrimitiveType NativeToPrimitiveType<bfloat16>();
 
 // Complex
 template <>
@@ -167,6 +176,11 @@ struct PrimitiveTypeToNative<F16> {
   using type = half;
 };
 
+template <>
+struct PrimitiveTypeToNative<BF16> {
+  using type = bfloat16;
+};
+
 // Complex
 template <>
 struct PrimitiveTypeToNative<C64> {
diff --git a/tensorflow/compiler/xla/ptr_util.h b/tensorflow/compiler/xla/ptr_util.h
index fa670303136ebff0c3e0e32f5c64e879c46fe964..c58c19db2cacbe9b038160f27b9bd76aa58146eb 100644
--- a/tensorflow/compiler/xla/ptr_util.h
+++ b/tensorflow/compiler/xla/ptr_util.h
@@ -16,7 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_PTR_UTIL_H_
 
-// Utility functions for pointers.
+// As this was moved to tensorflow/core/util, provide indirections here to
+// maintain current functionality of the library.
 
 #include <stddef.h>
 
@@ -24,55 +25,27 @@ limitations under the License.
 #include <type_traits>
 #include <utility>
 
-namespace xla {
-
-namespace internal {
-
-// Trait to select overloads and return types for MakeUnique.
-template <typename T>
-struct MakeUniqueResult {
-  using scalar = std::unique_ptr<T>;
-};
-template <typename T>
-struct MakeUniqueResult<T[]> {
-  using array = std::unique_ptr<T[]>;
-};
-template <typename T, size_t N>
-struct MakeUniqueResult<T[N]> {
-  using invalid = void;
-};
+#include "tensorflow/core/util/ptr_util.h"
 
-}  // namespace internal
+namespace xla {
 
-// Transfers ownership of a raw pointer to a std::unique_ptr of deduced type.
-// Example:
-//   X* NewX(int, int);
-//   auto x = WrapUnique(NewX(1, 2));  // 'x' is std::unique_ptr<X>.
-//
-// WrapUnique is useful for capturing the output of a raw pointer factory.
-// However, prefer 'MakeUnique<T>(args...) over 'WrapUnique(new T(args...))'.
-//   auto x = WrapUnique(new X(1, 2));  // works, but nonideal.
-//   auto x = MakeUnique<X>(1, 2);  // safer, standard, avoids raw 'new'.
-//
-// Note: Cannot wrap pointers to array of unknown bound (i.e. U(*)[]).
 template <typename T>
 std::unique_ptr<T> WrapUnique(T* ptr) {
-  static_assert(!std::is_array<T>::value || std::extent<T>::value != 0,
-                "types T[0] or T[] are unsupported");
-  return std::unique_ptr<T>(ptr);
+  return tensorflow::WrapUnique<T>(ptr);
 }
 
 template <typename T, typename... Args>
-typename internal::MakeUniqueResult<T>::scalar MakeUnique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+typename tensorflow::helper::MakeUniqueResult<T>::scalar MakeUnique(
+    Args&&... args) {
+  return tensorflow::MakeUnique<T, Args...>(std::forward<Args>(args)...);
 }
 
 // Overload for array of unknown bound.
 // The allocation of arrays needs to use the array form of new,
 // and cannot take element constructor arguments.
 template <typename T>
-typename internal::MakeUniqueResult<T>::array MakeUnique(size_t n) {
-  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
+typename tensorflow::helper::MakeUniqueResult<T>::array MakeUnique(size_t n) {
+  return tensorflow::MakeUnique<T>(n);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 90aa9720a1e18bad06842adeead46fc3120d01dd..bdf92eaed1ff1d83cf03eec4d126677ea42c577f 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -102,7 +102,9 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
     const Array3D<float>& lhs, const Array3D<float>& rhs, int64 kernel_stride,
     Padding padding, int64 lhs_dilation, int64 rhs_dilation,
     const ConvolutionDimensionNumbers& dnums) {
-  CHECK_EQ(dnums.spatial_dimensions_size(), 1);
+  CHECK_EQ(dnums.input_spatial_dimensions_size(), 1);
+  CHECK_EQ(dnums.kernel_spatial_dimensions_size(), 1);
+  CHECK_EQ(dnums.output_spatial_dimensions_size(), 1);
   // Reuse the code for Array4D-convolution by extending the 3D input into a 4D
   // array by adding a fourth dummy dimension of size 1 without stride, padding
   // and dilation.
@@ -120,8 +122,9 @@ ReferenceUtil::ConvArray3DGeneralDimensionsDilated(
       });
   // Add a second dummy spatial dimensions.
   ConvolutionDimensionNumbers dnums2d = dnums;
-  dnums2d.add_spatial_dimensions(3);
+  dnums2d.add_input_spatial_dimensions(3);
   dnums2d.add_kernel_spatial_dimensions(3);
+  dnums2d.add_output_spatial_dimensions(3);
   std::unique_ptr<Array4D<float>> convr4 = ConvArray4DGeneralDimensionsDilated(
       a4dlhs, a4drhs, {kernel_stride, 1}, padding, {lhs_dilation, 1},
       {rhs_dilation, 1}, dnums2d);
@@ -192,14 +195,26 @@ ReferenceUtil::ReduceWindow1DGeneric(
     const tensorflow::gtl::ArraySlice<int64>& window,
     const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding) {
   std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
-  auto padding_both = xla::MakePadding(dim_lengths, window, stride, padding);
+  return ReduceWindow1DGeneric(
+      operand, init, reduce_func, window, stride,
+      xla::MakePadding(dim_lengths, window, stride, padding));
+}
 
+/* static  */ std::unique_ptr<std::vector<float>>
+ReferenceUtil::ReduceWindow1DGeneric(
+    const tensorflow::gtl::ArraySlice<float>& operand, float init,
+    const std::function<float(float, float)>& reduce_func,
+    const tensorflow::gtl::ArraySlice<int64>& window,
+    const tensorflow::gtl::ArraySlice<int64>& stride,
+    const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding) {
+  std::vector<int64> dim_lengths{static_cast<int64>(operand.size())};
   std::vector<int64> window_counts(window.size(), 0);
   std::vector<int64> pad_low(window.size(), 0);
   for (int64 i = 0; i < window.size(); ++i) {
+    int64 padded_width = padding[i].first + dim_lengths[i] + padding[i].second;
     window_counts[i] =
-        WindowCount(dim_lengths[i], window[i], stride[i], padding);
-    pad_low[i] = padding_both[i].first;
+        window_util::StridedBound(padded_width, window[i], stride[i]);
+    pad_low[i] = padding[i].first;
   }
   auto result = MakeUnique<std::vector<float>>(window_counts[0]);
 
@@ -465,9 +480,9 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
   }
 
   ordered_input_dimensions[0] =
-      lhs_literal->shape().dimensions(dnums.spatial_dimensions(0));
+      lhs_literal->shape().dimensions(dnums.input_spatial_dimensions(0));
   ordered_input_dimensions[1] =
-      lhs_literal->shape().dimensions(dnums.spatial_dimensions(1));
+      lhs_literal->shape().dimensions(dnums.input_spatial_dimensions(1));
   ordered_kernel_dimensions[0] =
       rhs_literal->shape().dimensions(dnums.kernel_spatial_dimensions(0));
   ordered_kernel_dimensions[1] =
@@ -703,137 +718,4 @@ ReferenceUtil::ReduceToRowArray2D(
   return result;
 }
 
-/* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::PadArray2D(
-    const Array2D<float>& operand, const PaddingConfig& padding,
-    const float pad) {
-  int64 in0 = operand.n1();
-  int64 high_padding0 = padding.dimensions(0).edge_padding_high();
-  int64 low_padding0 = padding.dimensions(0).edge_padding_low();
-  int64 interior_padding0 = padding.dimensions(0).interior_padding();
-  int64 out0 =
-      in0 + low_padding0 + high_padding0 + (in0 - 1) * interior_padding0;
-
-  int64 in1 = operand.n2();
-  int64 high_padding1 = padding.dimensions(1).edge_padding_high();
-  int64 low_padding1 = padding.dimensions(1).edge_padding_low();
-  int64 interior_padding1 = padding.dimensions(1).interior_padding();
-  int64 out1 =
-      in1 + low_padding1 + high_padding1 + (in1 - 1) * interior_padding1;
-
-  auto result = MakeUnique<Array2D<float>>(out0, out1);
-  result->Fill(pad);
-  int64 o0 = low_padding0;
-  for (int64 i0 = 0; i0 < in0; ++i0) {
-    int64 o1 = low_padding1;
-    for (int64 i1 = 0; i1 < in1; ++i1) {
-      if (o0 >= 0 && o1 >= 0 && o0 < out0 && o1 < out1) {
-        (*result)(o0, o1) = operand(i0, i1);
-      }
-      o1 += interior_padding1 + 1;
-    }
-    o0 += interior_padding0 + 1;
-  }
-  return result;
-}
-
-/* static */ Array3D<float> ReferenceUtil::PadArray3D(
-    const Array3D<float>& operand, const PaddingConfig& padding,
-    const float pad) {
-  CHECK_EQ(padding.dimensions_size(), 3);
-
-  const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
-                                           operand.n3()};
-  std::vector<int64> pad_low(3);
-  std::vector<int64> pad_high(3);
-  std::vector<int64> pad_interior(3);
-  std::vector<int64> output_bounds(3);
-  for (int64 i = 0; i < 3; ++i) {
-    pad_low[i] = padding.dimensions(i).edge_padding_low();
-    pad_high[i] = padding.dimensions(i).edge_padding_high();
-    CHECK_LE(0, pad_low[i]);
-    CHECK_LE(0, pad_high[i]);
-    CHECK_LE(0, padding.dimensions(i).interior_padding()) << "not implemented";
-    pad_interior[i] = padding.dimensions(i).interior_padding();
-
-    output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
-                       (input_bounds[i] - 1) * pad_interior[i];
-  }
-
-  Array3D<float> result(output_bounds[0], output_bounds[1], output_bounds[2]);
-  std::vector<int> indices = {0, 0, 0};
-  for (indices[0] = 0; indices[0] < output_bounds[0]; ++indices[0]) {
-    for (indices[1] = 0; indices[1] < output_bounds[1]; ++indices[1]) {
-      for (indices[2] = 0; indices[2] < output_bounds[2]; ++indices[2]) {
-        float* value = &result(indices[0], indices[1], indices[2]);
-        bool value_padded = false;
-        for (int i = 0; i < 3; ++i) {
-          bool in_low_padding = indices[i] < pad_low[i];
-          bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
-          if (in_low_padding || in_high_padding) {
-            *value = pad;
-            value_padded = true;
-          }
-          if (pad_interior[i] &&
-              (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
-            *value = pad;
-            value_padded = true;
-          }
-        }
-        if (value_padded) {
-          continue;
-        }
-        *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
-                         (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
-                         (indices[2] - pad_low[2]) / (pad_interior[2] + 1));
-      }
-    }
-  }
-  return result;
-}
-
-/* static */ Array4D<float> ReferenceUtil::PadArray4D(
-    const Array4D<float>& operand, const PaddingConfig& padding,
-    const float pad) {
-  CHECK_EQ(padding.dimensions_size(), 4);
-
-  const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
-                                           operand.n3(), operand.n4()};
-  std::vector<int64> pad_low(4);
-  std::vector<int64> pad_high(4);
-  std::vector<int64> pad_interior(4);
-  std::vector<int64> output_bounds(4);
-  for (int64 i = 0; i < 4; ++i) {
-    pad_low[i] = padding.dimensions(i).edge_padding_low();
-    pad_high[i] = padding.dimensions(i).edge_padding_high();
-    CHECK_LE(0, padding.dimensions(i).interior_padding()) << "not implemented";
-    pad_interior[i] = padding.dimensions(i).interior_padding();
-
-    output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
-                       (input_bounds[i] - 1) * pad_interior[i];
-  }
-
-  Array4D<float> result(output_bounds[0], output_bounds[1], output_bounds[2],
-                        output_bounds[3]);
-  result.Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* value) {
-    for (int i = 0; i < 4; ++i) {
-      bool in_low_padding = indices[i] < pad_low[i];
-      bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
-      if (in_low_padding || in_high_padding) {
-        *value = pad;
-        return;
-      }
-      if (pad_interior[i] &&
-          (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
-        *value = pad;
-        return;
-      }
-    }
-    *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
-                     (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
-                     (indices[2] - pad_low[2]) / (pad_interior[2] + 1),
-                     (indices[3] - pad_low[3]) / (pad_interior[3] + 1));
-  });
-  return result;
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h
index 2da17307817858eea60e868f4be1ab8138784385..58e1a844610678f64677838e93f0379b63f65d39 100644
--- a/tensorflow/compiler/xla/reference_util.h
+++ b/tensorflow/compiler/xla/reference_util.h
@@ -70,7 +70,7 @@ class ReferenceUtil {
   // dilation factors.
   static std::unique_ptr<Array4D<float>> ConvArray4DGeneralDimensionsDilated(
       const Array4D<float>& lhs, const Array4D<float>& rhs,
-      std::pair<int64, int64> stride, Padding padding,
+      std::pair<int64, int64> kernel_stride, Padding padding,
       std::pair<int64, int64> lhs_dilation,
       std::pair<int64, int64> rhs_dilation, ConvolutionDimensionNumbers dnums);
 
@@ -184,6 +184,12 @@ class ReferenceUtil {
       const std::function<float(float, float)>& reduce_func,
       const tensorflow::gtl::ArraySlice<int64>& window,
       const tensorflow::gtl::ArraySlice<int64>& stride, Padding padding);
+  static std::unique_ptr<std::vector<float>> ReduceWindow1DGeneric(
+      const tensorflow::gtl::ArraySlice<float>& operand, float init,
+      const std::function<float(float, float)>& reduce_func,
+      const tensorflow::gtl::ArraySlice<int64>& window,
+      const tensorflow::gtl::ArraySlice<int64>& stride,
+      const tensorflow::gtl::ArraySlice<std::pair<int64, int64>>& padding);
   static std::unique_ptr<Array4D<float>> ReduceWindow4DGeneric(
       const Array4D<float>& operand, float init,
       const std::function<float(float, float)>& reduce_func,
@@ -486,19 +492,147 @@ class ReferenceUtil {
   }
 
   // Returns the result of a 2D pad on an input matrix.
-  static std::unique_ptr<Array2D<float>> PadArray2D(
-      const Array2D<float>& operand, const PaddingConfig& padding,
-      const float pad);
+  template <typename NativeT>
+  static std::unique_ptr<Array2D<NativeT>> PadArray2D(
+      const Array2D<NativeT>& operand, const PaddingConfig& padding,
+      const NativeT pad) {
+    int64 in0 = operand.n1();
+    int64 high_padding0 = padding.dimensions(0).edge_padding_high();
+    int64 low_padding0 = padding.dimensions(0).edge_padding_low();
+    int64 interior_padding0 = padding.dimensions(0).interior_padding();
+    int64 out0 =
+        in0 + low_padding0 + high_padding0 + (in0 - 1) * interior_padding0;
+
+    int64 in1 = operand.n2();
+    int64 high_padding1 = padding.dimensions(1).edge_padding_high();
+    int64 low_padding1 = padding.dimensions(1).edge_padding_low();
+    int64 interior_padding1 = padding.dimensions(1).interior_padding();
+    int64 out1 =
+        in1 + low_padding1 + high_padding1 + (in1 - 1) * interior_padding1;
+
+    auto result = MakeUnique<Array2D<NativeT>>(out0, out1);
+    result->Fill(pad);
+    int64 o0 = low_padding0;
+    for (int64 i0 = 0; i0 < in0; ++i0) {
+      int64 o1 = low_padding1;
+      for (int64 i1 = 0; i1 < in1; ++i1) {
+        if (o0 >= 0 && o1 >= 0 && o0 < out0 && o1 < out1) {
+          (*result)(o0, o1) = operand(i0, i1);
+        }
+        o1 += interior_padding1 + 1;
+      }
+      o0 += interior_padding0 + 1;
+    }
+    return result;
+  }
 
   // Returns the result of a 3D pad on an input matrix.
-  static Array3D<float> PadArray3D(const Array3D<float>& operand,
-                                   const PaddingConfig& padding,
-                                   const float pad);
+  template <typename NativeT>
+  static Array3D<NativeT> PadArray3D(const Array3D<NativeT>& operand,
+                                     const PaddingConfig& padding,
+                                     const NativeT pad) {
+    CHECK_EQ(padding.dimensions_size(), 3);
+
+    const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
+                                             operand.n3()};
+    std::vector<int64> pad_low(3);
+    std::vector<int64> pad_high(3);
+    std::vector<int64> pad_interior(3);
+    std::vector<int64> output_bounds(3);
+    for (int64 i = 0; i < 3; ++i) {
+      pad_low[i] = padding.dimensions(i).edge_padding_low();
+      pad_high[i] = padding.dimensions(i).edge_padding_high();
+      CHECK_LE(0, pad_low[i]);
+      CHECK_LE(0, pad_high[i]);
+      CHECK_LE(0, padding.dimensions(i).interior_padding())
+          << "not implemented";
+      pad_interior[i] = padding.dimensions(i).interior_padding();
+
+      output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
+                         (input_bounds[i] - 1) * pad_interior[i];
+    }
+
+    Array3D<NativeT> result(output_bounds[0], output_bounds[1],
+                            output_bounds[2]);
+    std::vector<int> indices = {0, 0, 0};
+    for (indices[0] = 0; indices[0] < output_bounds[0]; ++indices[0]) {
+      for (indices[1] = 0; indices[1] < output_bounds[1]; ++indices[1]) {
+        for (indices[2] = 0; indices[2] < output_bounds[2]; ++indices[2]) {
+          NativeT* value = &result(indices[0], indices[1], indices[2]);
+          bool value_padded = false;
+          for (int i = 0; i < 3; ++i) {
+            bool in_low_padding = indices[i] < pad_low[i];
+            bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
+            if (in_low_padding || in_high_padding) {
+              *value = pad;
+              value_padded = true;
+            }
+            if (pad_interior[i] &&
+                (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
+              *value = pad;
+              value_padded = true;
+            }
+          }
+          if (value_padded) {
+            continue;
+          }
+          *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
+                           (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
+                           (indices[2] - pad_low[2]) / (pad_interior[2] + 1));
+        }
+      }
+    }
+    return result;
+  }
 
   // Returns the result of a 4D pad on an input array.
-  static Array4D<float> PadArray4D(const Array4D<float>& operand,
-                                   const PaddingConfig& padding,
-                                   const float pad);
+  template <typename NativeT>
+  static Array4D<NativeT> PadArray4D(const Array4D<NativeT>& operand,
+                                     const PaddingConfig& padding,
+                                     const NativeT pad) {
+    CHECK_EQ(padding.dimensions_size(), 4);
+
+    const std::vector<int64> input_bounds = {operand.n1(), operand.n2(),
+                                             operand.n3(), operand.n4()};
+    std::vector<int64> pad_low(4);
+    std::vector<int64> pad_high(4);
+    std::vector<int64> pad_interior(4);
+    std::vector<int64> output_bounds(4);
+    for (int64 i = 0; i < 4; ++i) {
+      pad_low[i] = padding.dimensions(i).edge_padding_low();
+      pad_high[i] = padding.dimensions(i).edge_padding_high();
+      CHECK_LE(0, padding.dimensions(i).interior_padding())
+          << "not implemented";
+      pad_interior[i] = padding.dimensions(i).interior_padding();
+
+      output_bounds[i] = pad_low[i] + input_bounds[i] + pad_high[i] +
+                         (input_bounds[i] - 1) * pad_interior[i];
+    }
+
+    Array4D<NativeT> result(output_bounds[0], output_bounds[1],
+                            output_bounds[2], output_bounds[3]);
+    result.Each(
+        [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT* value) {
+          for (int i = 0; i < 4; ++i) {
+            bool in_low_padding = indices[i] < pad_low[i];
+            bool in_high_padding = indices[i] >= output_bounds[i] - pad_high[i];
+            if (in_low_padding || in_high_padding) {
+              *value = pad;
+              return;
+            }
+            if (pad_interior[i] &&
+                (indices[i] - pad_low[i]) % (pad_interior[i] + 1)) {
+              *value = pad;
+              return;
+            }
+          }
+          *value = operand((indices[0] - pad_low[0]) / (pad_interior[0] + 1),
+                           (indices[1] - pad_low[1]) / (pad_interior[1] + 1),
+                           (indices[2] - pad_low[2]) / (pad_interior[2] + 1),
+                           (indices[3] - pad_low[3]) / (pad_interior[3] + 1));
+        });
+    return result;
+  }
 
   // ApplyElementwise2D(f, x, y, ...) returns the Array2D formed by running
   // f(x[i], y[i], ...) for each array element in the Array2Ds x, y, ....
diff --git a/tensorflow/compiler/xla/reference_util_test.cc b/tensorflow/compiler/xla/reference_util_test.cc
index eb6a71242ffa1499876b90f14f8a60ffdbdd069c..846ccdc83df900e3afedb6ababe07ebb1bd68f41 100644
--- a/tensorflow/compiler/xla/reference_util_test.cc
+++ b/tensorflow/compiler/xla/reference_util_test.cc
@@ -60,7 +60,9 @@ TEST_F(ReferenceUtilTest, TransposeArray2D) {
 
 TEST_F(ReferenceUtilTest, MatmulArray2D) {
   Array2D<float> rhs({
-      {7.f, 8.f}, {9.f, 10.f}, {11.f, 12.f},
+      {7.f, 8.f},
+      {9.f, 10.f},
+      {11.f, 12.f},
   });
   auto result = ReferenceUtil::MatmulArray2D(*matrix_, rhs);
   auto actual_literal = Literal::CreateR2FromArray2D(*result);
@@ -326,8 +328,10 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithSamePadding) {
   dimension_numbers.set_input_feature_dimension(0);
   dimension_numbers.set_output_batch_dimension(2);
   dimension_numbers.set_output_feature_dimension(0);
-  dimension_numbers.add_spatial_dimensions(1);
-  dimension_numbers.add_spatial_dimensions(3);
+  dimension_numbers.add_input_spatial_dimensions(1);
+  dimension_numbers.add_output_spatial_dimensions(1);
+  dimension_numbers.add_input_spatial_dimensions(3);
+  dimension_numbers.add_output_spatial_dimensions(3);
   dimension_numbers.set_kernel_output_feature_dimension(0);
   dimension_numbers.set_kernel_input_feature_dimension(2);
   dimension_numbers.add_kernel_spatial_dimensions(1);
@@ -380,8 +384,10 @@ TEST_F(ReferenceUtilTest, ConvGeneralDimensionsWithValidPadding) {
   dimension_numbers.set_input_feature_dimension(0);
   dimension_numbers.set_output_batch_dimension(2);
   dimension_numbers.set_output_feature_dimension(0);
-  dimension_numbers.add_spatial_dimensions(1);
-  dimension_numbers.add_spatial_dimensions(3);
+  dimension_numbers.add_input_spatial_dimensions(1);
+  dimension_numbers.add_output_spatial_dimensions(1);
+  dimension_numbers.add_input_spatial_dimensions(3);
+  dimension_numbers.add_output_spatial_dimensions(3);
 
   dimension_numbers.set_kernel_output_feature_dimension(0);
   dimension_numbers.set_kernel_input_feature_dimension(2);
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index a15f3f654b14a715a2fbc71cdd38d46ac0268c02..c7432aacd18215d8c561b636a8ccc0da8118398c 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -566,7 +566,6 @@ cc_library(
     hdrs = ["shaped_buffer.h"],
     deps = [
         ":device_memory_allocator",
-        ":transfer_manager",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -630,6 +629,7 @@ cc_library(
 
 cc_library(
     name = "llvm_compiler",
+    srcs = ["llvm_compiler.cc"],
     hdrs = ["llvm_compiler.h"],
     deps = [
         ":compiler",
@@ -642,6 +642,7 @@ cc_library(
     srcs = ["transfer_manager.cc"],
     hdrs = ["transfer_manager.h"],
     deps = [
+        ":shaped_buffer",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1053,9 +1054,7 @@ cc_library(
     srcs = ["algebraic_simplifier.cc"],
     hdrs = ["algebraic_simplifier.h"],
     deps = [
-        ":call_inliner",
         ":hlo",
-        ":hlo_evaluator",
         ":hlo_pass",
         ":hlo_query",
         ":shape_inference",
@@ -1091,6 +1090,32 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "while_loop_simplifier",
+    srcs = ["while_loop_simplifier.cc"],
+    hdrs = ["while_loop_simplifier.h"],
+    deps = [
+        ":call_inliner",
+        ":hlo",
+        ":hlo_evaluator",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "while_loop_simplifier_test",
+    srcs = ["while_loop_simplifier_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":while_loop_simplifier",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "defuser",
     srcs = ["defuser.cc"],
@@ -1118,6 +1143,22 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "dot_decomposer",
+    srcs = ["dot_decomposer.cc"],
+    hdrs = ["dot_decomposer.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "tuple_simplifier",
     srcs = ["tuple_simplifier.cc"],
@@ -1267,24 +1308,6 @@ cc_library(
     alwayslink = True,  # Contains per-platform transfer manager registration
 )
 
-tf_cc_test(
-    name = "transfer_manager_test",
-    srcs = ["transfer_manager_test.cc"],
-    deps = [
-        ":generic_transfer_manager",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
-    ],
-)
-
 cc_library(
     name = "hlo_cost_analysis",
     srcs = ["hlo_cost_analysis.cc"],
@@ -1297,6 +1320,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -1334,6 +1358,7 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_cost_analysis",
+        ":hlo_profile_printer",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -1342,6 +1367,18 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "hlo_execution_profile_test",
+    srcs = ["hlo_execution_profile_test.cc"],
+    deps = [
+        ":cpu_plugin",
+        ":hlo_cost_analysis",
+        ":hlo_execution_profile",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 tf_cc_test(
     name = "hlo_computation_test",
     srcs = ["hlo_computation_test.cc"],
@@ -1618,10 +1655,14 @@ cc_library(
     deps = [
         ":buffer_liveness",
         ":hlo",
+        ":hlo_alias_analysis",
+        ":hlo_dce",
+        ":hlo_graph_dumper",
+        ":hlo_ordering",
         ":hlo_pass",
         ":liveness_util",
         ":logical_buffer",
-        ":tuple_points_to_analysis",
+        ":tuple_simplifier",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -1636,15 +1677,17 @@ tf_cc_test(
     deps = [
         ":copy_insertion",
         ":hlo",
+        ":hlo_graph_dumper",
         ":hlo_matchers",
-        ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -1754,7 +1797,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
 )
@@ -1825,7 +1867,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
 )
@@ -1864,6 +1905,22 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_element_type_converter",
+    srcs = ["hlo_element_type_converter.cc"],
+    hdrs = ["hlo_element_type_converter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_evaluator",
+        ":hlo_pass",
+        ":hlo_query",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = ["device_memory_allocator.cc"],
@@ -1961,6 +2018,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -2126,6 +2184,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -2133,6 +2192,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "hlo_profile_printer",
+    srcs = ["hlo_profile_printer.cc"],
+    hdrs = ["hlo_profile_printer.h"],
+    deps = [
+        ":human_readable_profile_builder",
+        "//tensorflow/compiler/xla:types",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index ee5cf8a10074d72d81374cf9dcb2cb2164f0d9db..2c0d1900eb6108eb8028fd89220758df03746647 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -24,10 +24,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
@@ -48,9 +46,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using tensorflow::gtl::nullopt;
-using tensorflow::gtl::optional;
-
 // Returns whether operand is a literal with the given value.
 bool IsLiteralWithValue(const HloInstruction* operand, int8 value) {
   return operand->opcode() == HloOpcode::kConstant &&
@@ -137,7 +132,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleConvert(HloInstruction* convert) override;
 
+  Status HandleComplex(HloInstruction* complex) override;
+
   Status HandleReal(HloInstruction* real) override;
+
   Status HandleImag(HloInstruction* imag) override;
 
   Status HandleConvolution(HloInstruction* convolution) override;
@@ -175,8 +173,6 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   Status HandleMaximum(HloInstruction* maximum) override;
   Status HandleMinimum(HloInstruction* minimum) override;
 
-  Status HandleWhile(HloInstruction* while_op) override;
-
   // Returns whether algebraic simplification has occurred.
   const bool changed() const { return changed_; }
 
@@ -184,19 +180,46 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   static bool Run(
       HloComputation* computation, bool is_layout_sensitive,
       AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_simplification, bool enable_conv_simplification);
+      bool enable_dot_strength_reduction, bool enable_conv_simplification);
 
  private:
   explicit AlgebraicSimplifierVisitor(
       HloComputation* computation, bool is_layout_sensitive,
       AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_simplification, bool enable_conv_simplification)
+      bool enable_dot_strength_reduction, bool enable_conv_simplification)
       : computation_(computation),
         is_layout_sensitive_(is_layout_sensitive),
         valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_simplification_(enable_dot_simplification),
+        enable_dot_strength_reduction_(enable_dot_strength_reduction),
         enable_conv_simplification_(enable_conv_simplification) {}
 
+  // Transforms Dots where at least one input is a vector or has a degenerate
+  // dimension and converts it into a multiply and reduce. This should enable
+  // more fusion than leaving the nodes as Dot operations.
+  StatusOr<bool> HandleDotStrengthReduction(HloInstruction* dot);
+
+  // Reshapes an instruction to rank 1 if it is not already rank 1.
+  HloInstruction* Flatten(HloInstruction* hlo) {
+    if (ShapeUtil::Rank(hlo->shape()) == 1) {
+      return hlo;
+    }
+    return computation_->AddInstruction(HloInstruction::CreateReshape(
+        ShapeUtil::MakeShape(hlo->shape().element_type(),
+                             {ShapeUtil::ElementsIn(hlo->shape())}),
+        hlo));
+  }
+
+  // Helper method to perform and add reduction in a single dimension.
+  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+    HloInstruction* zero = computation_->AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
+    HloComputation* AddReduce_computation = CreateScalarBinaryComputation(
+        computation_->parent(), F32, HloOpcode::kAdd);
+    Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
+    return computation_->AddInstruction(HloInstruction::CreateReduce(
+        shape, hlo, zero, {dim}, AddReduce_computation));
+  }
+
   // Convenience method for replacing an instruction with a bitcast.
   void ReplaceWithBitcast(HloInstruction* instruction);
 
@@ -269,8 +292,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // Callback used to determine if a bitcast is possible.
   AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
 
-  // Disable dot simplication on platforms where it causes a slowdown.
-  bool enable_dot_simplification_;
+  // Disable dot strength reduction on platforms where it causes a slowdown.
+  bool enable_dot_strength_reduction_;
 
   // Disable convolution simplication on platforms where it causes a slowdown.
   bool enable_conv_simplification_;
@@ -279,10 +302,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 bool AlgebraicSimplifierVisitor::Run(
     HloComputation* computation, bool is_layout_sensitive,
     AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-    bool enable_dot_simplification, bool enable_conv_simplification) {
+    bool enable_dot_strength_reduction, bool enable_conv_simplification) {
   AlgebraicSimplifierVisitor visitor(
       computation, is_layout_sensitive, std::move(valid_bitcast_callback),
-      enable_dot_simplification, enable_conv_simplification);
+      enable_dot_strength_reduction, enable_conv_simplification);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -578,68 +601,72 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
-  auto lhs = dot->mutable_operand(0);
-  auto rhs = dot->mutable_operand(1);
-  if (!enable_dot_simplification_) {
-    return Status::OK();
-  }
-  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
-  // below.
-  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
-      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
-    return Status::OK();
-  }
-
-  // Replace a zero element dot with a broadcast of the constant 0.
-  if (ShapeUtil::HasZeroElements(dot->shape()) ||
-      ShapeUtil::HasZeroElements(lhs->shape()) ||
-      ShapeUtil::HasZeroElements(rhs->shape())) {
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
-  }
-
-  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
-  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
-    auto new_dot = computation_->AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()), HloOpcode::kDot,
-        rhs->mutable_operand(0), lhs->mutable_operand(0)));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
-  }
+StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
+    HloInstruction* dot) {
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  int64 lhs_collapsing_dim =
+      dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
+  if (lhs->IsRank2Transpose()) {
+    lhs = lhs->mutable_operand(0);
+    lhs_collapsing_dim = 1 - lhs_collapsing_dim;
+  }
+  const int64 lhs_kept_dim = 1 - lhs_collapsing_dim;
+
+  int64 rhs_collapsing_dim =
+      dot->dot_dimension_numbers().rhs_contracting_dimensions(0);
+  if (rhs->IsRank2Transpose()) {
+    rhs = rhs->mutable_operand(0);
+    rhs_collapsing_dim = 1 - rhs_collapsing_dim;
+  }
+  const int64 rhs_kept_dim = 1 - rhs_collapsing_dim;
+
+  auto reshape_if_necessary = [&](HloInstruction* hlo) {
+    if (ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
+      return hlo;
+    }
+    return computation_->AddInstruction(
+        HloInstruction::CreateReshape(dot->shape(), hlo));
+  };
 
-  // Simplify outer product into multiply with implicit broadcasting.
-  //
-  // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
-  if (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(0) == 1) {
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
-                                          lhs, rhs));
-  }
+  auto broadcast_to_dim = [&](HloInstruction* hlo, const Shape& shape,
+                              int64 dim) {
+    return computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(shape, hlo, {dim}));
+  };
 
-  // The following graph transformations take Dots where at least one input is a
-  // vector or has a degenerate dimension and converts it into a multiply and
-  // reduce. This should enable more fusion than leaving the nodes as Dot
-  // operations.
+  auto multiply = [&](HloInstruction* local_lhs, HloInstruction* local_rhs) {
+    return computation_->AddInstruction(HloInstruction::CreateBinary(
+        local_lhs->shape(), HloOpcode::kMultiply, local_lhs, local_rhs));
+  };
 
   // Strength reduce dot(a[K] , b[K]) =
   //  reshape(result.shape,
   //          reduce_sum(multiply(a, b), {0}))
   if (ShapeUtil::Rank(rhs->shape()) == 1 &&
       ShapeUtil::Rank(lhs->shape()) == 1) {
-    auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-        rhs->shape(), HloOpcode::kMultiply, lhs, rhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-        ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
-        {0}, add_reduce_computation));
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+    TF_RETURN_IF_ERROR(
+        ReplaceInstruction(dot, reshape_if_necessary(AddReduce(
+                                    multiply(Flatten(lhs), Flatten(rhs)), 0))));
+    return true;
+  }
+
+  if (ShapeUtil::IsEffectiveScalar(rhs->shape()) &&
+      ShapeUtil::IsEffectiveScalar(lhs->shape())) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(multiply(Flatten(lhs), Flatten(rhs)))));
+    return true;
+  }
+
+  // Simplify outer product into multiply with implicit broadcasting.
+  //
+  // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
+  if (ShapeUtil::Rank(rhs->shape()) == 2 &&
+      rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, multiply(broadcast_to_dim(Flatten(lhs), dot->shape(), 0),
+                      broadcast_to_dim(Flatten(rhs), dot->shape(), 1))));
+    return true;
   }
 
   // Strength reduce dot(a[1, K], b) =
@@ -650,35 +677,21 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   //      )
   //    )
   if (ShapeUtil::Rank(lhs->shape()) == 1 ||
-      (ShapeUtil::Rank(lhs->shape()) == 2 && lhs->shape().dimensions(0) == 1)) {
-    auto new_lhs = computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::MakeShape(lhs->shape().element_type(),
-                             {ShapeUtil::ElementsIn(lhs->shape())}),
-        lhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    HloInstruction* reduce;
+      (ShapeUtil::Rank(lhs->shape()) == 2 &&
+       lhs->shape().dimensions(lhs_kept_dim) == 1)) {
     if (ShapeUtil::Rank(rhs->shape()) == 1) {
-      auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-          rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs));
-      reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-          ShapeUtil::MakeShape(dot->shape().element_type(), {}), multiply, zero,
-          {0}, add_reduce_computation));
-    } else {
-      new_lhs = computation_->AddInstruction(
-          HloInstruction::CreateBroadcast(rhs->shape(), new_lhs, {0}));
-      auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-          rhs->shape(), HloOpcode::kMultiply, new_lhs, rhs));
-
-      reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-          ShapeUtil::MakeShape(dot->shape().element_type(),
-                               {rhs->shape().dimensions(1)}),
-          multiply, zero, {0}, add_reduce_computation));
+      TF_RETURN_IF_ERROR(ReplaceInstruction(
+          dot,
+          reshape_if_necessary(AddReduce(multiply(Flatten(lhs), rhs), 0))));
+      return true;
     }
-    return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(
+                 AddReduce(multiply(broadcast_to_dim(Flatten(lhs), rhs->shape(),
+                                                     rhs_collapsing_dim),
+                                    rhs),
+                           rhs_collapsing_dim))));
+    return true;
   }
 
   // Strength reduce dot(a, b[K, 1]) =
@@ -686,26 +699,60 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   //    reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0})
   //  )
   if (ShapeUtil::Rank(rhs->shape()) == 1 ||
-      (ShapeUtil::Rank(rhs->shape()) == 2 && rhs->shape().dimensions(1) == 1)) {
-    auto new_rhs = computation_->AddInstruction(HloInstruction::CreateReshape(
-        ShapeUtil::MakeShape(rhs->shape().element_type(),
-                             {ShapeUtil::ElementsIn(rhs->shape())}),
-        rhs));
-    new_rhs = computation_->AddInstruction(
-        HloInstruction::CreateBroadcast(lhs->shape(), new_rhs, {1}));
-    auto multiply = computation_->AddInstruction(HloInstruction::CreateBinary(
-        lhs->shape(), HloOpcode::kMultiply, lhs, new_rhs));
-    HloComputation* add_reduce_computation = CreateScalarBinaryComputation(
-        computation_->parent(), F32, HloOpcode::kAdd);
+      (ShapeUtil::Rank(rhs->shape()) == 2 &&
+       rhs->shape().dimensions(rhs_kept_dim) == 1)) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(AddReduce(
+                 multiply(lhs, broadcast_to_dim(Flatten(rhs), lhs->shape(),
+                                                lhs_collapsing_dim)),
+                 lhs_collapsing_dim))));
+    return true;
+  }
+  return false;
+}
+
+Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
+  auto lhs = dot->mutable_operand(0);
+  auto rhs = dot->mutable_operand(1);
+
+  // Only optimize F32 dot operations where the dot, rhs and lhs are rank 2 or
+  // below.
+  if (dot->shape().element_type() != F32 || ShapeUtil::Rank(lhs->shape()) > 2 ||
+      ShapeUtil::Rank(rhs->shape()) > 2 || ShapeUtil::Rank(dot->shape()) > 2) {
+    return Status::OK();
+  }
+
+  // Replace a zero element dot with a broadcast of the constant 0.
+  if (ShapeUtil::HasZeroElements(dot->shape()) ||
+      ShapeUtil::HasZeroElements(lhs->shape()) ||
+      ShapeUtil::HasZeroElements(rhs->shape())) {
     auto zero = computation_->AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-    auto reduce = computation_->AddInstruction(HloInstruction::CreateReduce(
-        ShapeUtil::MakeShape(dot->shape().element_type(),
-                             {lhs->shape().dimensions(0)}),
-        multiply, zero, {1}, add_reduce_computation));
     return ReplaceWithNewInstruction(
-        dot, HloInstruction::CreateReshape(dot->shape(), reduce));
+        dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
+  }
+
+  if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
+    TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
+                        HandleDotStrengthReduction(dot));
+    if (did_strength_reduction) {
+      return Status::OK();
+    }
   }
+
+  // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
+  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+    DotDimensionNumbers dot_dimension_numbers;
+    dot_dimension_numbers.add_lhs_contracting_dimensions(1);
+    dot_dimension_numbers.add_rhs_contracting_dimensions(0);
+    auto new_dot = computation_->AddInstruction(HloInstruction::CreateDot(
+        ShapeUtil::PermuteDimensions({1, 0}, dot->shape()),
+        rhs->mutable_operand(0), lhs->mutable_operand(0),
+        dot_dimension_numbers));
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateTranspose(dot->shape(), new_dot, {1, 0}));
+  }
+
   return Status::OK();
 }
 
@@ -951,6 +998,18 @@ Status AlgebraicSimplifierVisitor::HandleConvert(HloInstruction* convert) {
   return Status::OK();
 }
 
+// Complex(Real(c), Imag(c)) -> c
+Status AlgebraicSimplifierVisitor::HandleComplex(HloInstruction* complex) {
+  auto real = complex->mutable_operand(0);
+  auto imag = complex->mutable_operand(1);
+  if (real->opcode() == HloOpcode::kReal &&
+      imag->opcode() == HloOpcode::kImag &&
+      real->operand(0) == imag->operand(0)) {
+    return ReplaceInstruction(complex, real->mutable_operand(0));
+  }
+  return Status::OK();
+}
+
 // Real(Complex(r, i)) -> r
 Status AlgebraicSimplifierVisitor::HandleReal(HloInstruction* real) {
   auto operand = real->mutable_operand(0);
@@ -1100,9 +1159,15 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   if (IsAll(rhs, -1)) {
     auto* one = computation_->AddInstruction(HloInstruction::CreateConstant(
         Literal::One(rhs->shape().element_type()).CloneToUnique()));
+
+    // Explicitly broadcast scalar 1 to the output shape, to avoid implicit
+    // broadcast in divide HLO as we are trying to eliminate implicit
+    // broadcasting at HLO level.
+    auto* broadcast_one = computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(power->shape(), one, {}));
     return ReplaceWithNewInstruction(
         power, HloInstruction::CreateBinary(power->shape(), HloOpcode::kDivide,
-                                            one, lhs));
+                                            broadcast_one, lhs));
   }
   return Status::OK();
 }
@@ -1390,6 +1455,15 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
   auto operand = reduce_window->mutable_operand(0);
   const Window& window = reduce_window->window();
   auto function = reduce_window->to_apply();
+  if (ShapeUtil::IsScalar(operand->shape())) {
+    TF_RET_CHECK(ShapeUtil::IsScalar(reduce_window->shape()));
+    return ReplaceWithNewInstruction(
+        reduce_window,
+        HloInstruction::CreateMap(reduce_window->shape(),
+                                  {operand, reduce_window->mutable_operand(1)},
+                                  function));
+  }
+
   VLOG(10) << "Considering folding Pad: " << operand->ToString()
            << "\ninto reduce-window: " << reduce_window->ToString();
 
@@ -1591,8 +1665,11 @@ Status AlgebraicSimplifierVisitor::HandleConvolution(
 
   auto new_lhs = add_bitcast(new_input_shape, lhs);
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
-  auto dot = computation_->AddInstruction(HloInstruction::CreateBinary(
-      dot_output_shape, HloOpcode::kDot, new_lhs, new_rhs));
+  DotDimensionNumbers dot_dimension_numbers;
+  dot_dimension_numbers.add_lhs_contracting_dimensions(1);
+  dot_dimension_numbers.add_rhs_contracting_dimensions(0);
+  auto dot = computation_->AddInstruction(HloInstruction::CreateDot(
+      dot_output_shape, new_lhs, new_rhs, dot_dimension_numbers));
   return ReplaceInstruction(convolution, add_bitcast(convolution_shape, dot));
 }
 
@@ -1673,312 +1750,6 @@ Status AlgebraicSimplifierVisitor::HandleMinimum(HloInstruction* minimum) {
   return Status::OK();
 }
 
-// If all of instr's operands are either constants or have the form
-//   get-tuple-element(gte_operand, N)
-// for the same value N, returns N.  Otherwise, returns nullopt.
-static optional<int64> GetGTEOperandIndex(const HloInstruction* instr,
-                                          const HloInstruction* gte_operand) {
-  VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", "
-          << gte_operand->ToString() << ")";
-  optional<int64> tuple_idx;
-  for (const HloInstruction* operand : instr->operands()) {
-    if (operand->IsConstant()) {
-      continue;
-    }
-    if (operand->opcode() != HloOpcode::kGetTupleElement) {
-      VLOG(2) << "instr uses something other than gte(gte_operand): "
-              << operand->ToString();
-      return nullopt;
-    }
-    if (operand->operand(0) != gte_operand) {
-      VLOG(2) << "instr has gte whose operand is not gte_operand: "
-              << operand->ToString();
-      return nullopt;
-    }
-    if (tuple_idx && tuple_idx != operand->tuple_index()) {
-      VLOG(2) << "instr has operands with conflicting gte indices, "
-              << *tuple_idx << " vs " << operand->tuple_index();
-      return nullopt;
-    }
-
-    tuple_idx = operand->tuple_index();
-  }
-  return tuple_idx;
-}
-
-// Tries to get the tuple index of the induction variable of a while loop.
-//
-// Checks that the loop condition and root both plumb the induction variable
-// through the same tuple index, and that they both apply exactly one op to the
-// induction variable before  deciding whether to do another loop iteration (in
-// the loop condition's case) or packing the induction variable into the result
-// tuple (in the loop body's case).
-//
-// Specifically, checks that the loop condition has structure
-//
-//   root = op(constants, get-tuple-elem(param0, N), constants)
-//
-// and the loop body has the structure
-//
-//   inc = op(constants, get-tuple-elem(param0, N), constants)
-//   root = tuple(..., inc, ...)  // inc is N'th operand of tuple().
-//
-// If so, returns N.  Otherwise, returns nullopt.
-static optional<int64> GetLoopInductionVarTupleIdx(
-    const HloInstruction* while_op) {
-  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
-  VLOG(2) << "Finding induction variable for loop "
-          << while_op->ToShortString();
-
-  // The while_cond computation should have the form
-  //
-  //   while_cond_root =
-  //       op(constants, get-tuple-elem(while_cond_param, N), constants).
-  //
-  // If it does, set indvar_tuple_idx to N.
-  auto* while_cond = while_op->while_condition();
-  auto* while_cond_root = while_cond->root_instruction();
-  auto* while_cond_param = while_cond->parameter_instruction(0);
-  optional<int64> indvar_tuple_idx =
-      GetGTEOperandIndex(while_cond_root, while_cond_param);
-  if (!indvar_tuple_idx) {
-    VLOG(2) << "Induction variable not found in loop condition: "
-            << while_cond->root_instruction()->ToString();
-    return nullopt;
-  }
-
-  // The while_body computation should have the form
-  //
-  //   while_body_inc =
-  //       op(constants, get-tuple-elem(while_body_param, N), constants)
-  //   while_body_root = tuple(..., while_body_inc, ...)
-  //
-  // where while_body_inc is operand N of while_body_root.
-  auto* while_body = while_op->while_body();
-  auto* while_body_root = while_body->root_instruction();
-  if (while_body_root->opcode() != HloOpcode::kTuple) {
-    VLOG(2) << "While body's root is not a tuple instruction: "
-            << while_body_root->ToString();
-    return nullopt;
-  }
-
-  auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx);
-  auto* while_body_param = while_body->parameter_instruction(0);
-  optional<int64> while_body_indvar_tuple_idx =
-      GetGTEOperandIndex(while_body_inc, while_body_param);
-  if (!while_body_indvar_tuple_idx) {
-    VLOG(2)
-        << "Induction variable not found in while body increment instruction: "
-        << while_body_inc->ToString();
-    return nullopt;
-  }
-  if (while_body_indvar_tuple_idx != indvar_tuple_idx) {
-    VLOG(2) << "Tuple index of induction variable does not match between loop "
-               "condition ("
-            << *indvar_tuple_idx << ") and while body ("
-            << *while_body_indvar_tuple_idx << ")";
-    return nullopt;
-  }
-
-  // Finally, check that the while loop's initial value is a tuple with enough
-  // elements.
-  auto* while_init = while_op->operand(0);
-  if (while_init->opcode() != HloOpcode::kTuple) {
-    VLOG(2) << "While init expected to be a tuple: " << while_init->ToString();
-    return nullopt;
-  }
-
-  VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx;
-  return indvar_tuple_idx;
-}
-
-// Finds and returns the non-constant operand in instr.
-//
-// CHECK-fails if instr doesn't have exactly one unique non-constant operand.
-static const HloInstruction* NonConstantOperand(const HloInstruction* instr) {
-  const HloInstruction* result = nullptr;
-  for (const HloInstruction* operand : instr->operands()) {
-    if (!operand->IsConstant()) {
-      if (result != nullptr) {
-        CHECK_EQ(result, operand);
-      }
-      result = operand;
-    }
-  }
-  CHECK_NE(result, nullptr);
-  return result;
-}
-
-// Tries to determine the number of times the given loop executes.  Currently
-// simply returns 0, 1, or "can't tell" (nullopt).
-static optional<int64> GetLoopTripCount(HloInstruction* while_op) {
-  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
-  VLOG(2) << "Getting trip count for loop " << while_op->ToString();
-
-  // The loop's induction variable is found at
-  //
-  //   get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx),
-  //
-  // where comp is while_op->while_body() or while_op->while_condition().
-  optional<int64> indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op);
-  if (!indvar_tuple_idx) {
-    return nullopt;
-  }
-
-  VLOG(2) << "Induction variable is at index " << *indvar_tuple_idx
-          << " in input tuple.";
-
-  // Now that we know the index of the induction variable, we can we can try to
-  // compute how many times the loop executes.  Start by computing the induction
-  // variable's initial value.
-  HloEvaluator evaluator;
-  auto* while_init = while_op->mutable_operand(0);
-  auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx);
-  StatusOr<std::unique_ptr<Literal>> indvar_init_result =
-      evaluator.Evaluate(indvar_init);
-  if (!indvar_init_result.ok()) {
-    VLOG(2) << "Couldn't evaluate induction variable init: "
-            << indvar_init_result.status();
-    return nullopt;
-  }
-
-  // Evaluates the while loop's condition, returning either "true" (continue
-  // looping), "false" (stop looping), or nullopt (can't evaluate).
-  auto evaluate_while_cond = [&](const Literal& indvar) -> optional<bool> {
-    auto* while_cond = while_op->while_condition();
-    auto* while_cond_root = while_cond->root_instruction();
-    auto* while_cond_indvar = NonConstantOperand(while_cond_root);
-    StatusOr<std::unique_ptr<Literal>> result =
-        evaluator.EvaluateWithSubstitutions(while_cond_root,
-                                            {{while_cond_indvar, &indvar}});
-    if (!result.ok()) {
-      VLOG(2) << "Couldn't evaluate while cond: " << result.status();
-      return nullopt;
-    }
-    return result.ValueOrDie()->GetArraySlice<bool>() ==
-           tensorflow::gtl::ArraySlice<bool>{true};
-  };
-
-  // The initial value of the induction variable.
-  const Literal& indvar_iter0_val = *indvar_init_result.ValueOrDie();
-
-  // Evaluate whether the while condition is true when seeded with
-  // indvar_iter0_val.
-  optional<bool> while_cond_iter0_val = evaluate_while_cond(indvar_iter0_val);
-  if (while_cond_iter0_val == false) {
-    VLOG(2) << "Loop has static trip count of 0.";
-    return 0;
-  }
-
-  // Calculate the value of the induction variable after one iteration of the
-  // loop, and check whether the while condition is true with this new value.
-  auto* while_body = while_op->while_body();
-  auto* while_body_indvar_update =
-      while_body->root_instruction()->operand(*indvar_tuple_idx);
-  auto* while_body_indvar = NonConstantOperand(while_body_indvar_update);
-  StatusOr<std::unique_ptr<Literal>> indvar_iter1_result =
-      evaluator.EvaluateWithSubstitutions(
-          while_body_indvar_update, {{while_body_indvar, &indvar_iter0_val}});
-  if (!indvar_iter1_result.ok()) {
-    VLOG(2) << "Couldn't evaluate induction variable update: "
-            << indvar_iter1_result.status();
-    return nullopt;
-  }
-  const Literal& indvar_iter1_val = *indvar_iter1_result.ValueOrDie();
-  optional<bool> while_cond_iter1_val = evaluate_while_cond(indvar_iter1_val);
-  if (while_cond_iter1_val == false) {
-    VLOG(2) << "Determined that loop has static trip count of 1.";
-    return 1;
-  }
-
-  VLOG(2) << "Loop has unknown trip count >= 1.";
-  return nullopt;
-}
-
-// Determines whether the given instruction is a send/recv node, or has a
-// subcomputation which contains a send/recv node.
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr);
-
-// Determines whether the given computation contains a send or recv node.
-static bool ContainsSendOrRecv(const HloComputation* comp) {
-  for (const auto* instr : comp->instructions()) {
-    if (IsOrContainsSendOrRecv(instr)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-static bool IsOrContainsSendOrRecv(const HloInstruction* instr) {
-  if (instr->opcode() == HloOpcode::kSend ||
-      instr->opcode() == HloOpcode::kRecv) {
-    return true;
-  }
-  for (const auto& subcomp : instr->called_computations()) {
-    if (ContainsSendOrRecv(subcomp)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-Status AlgebraicSimplifierVisitor::HandleWhile(HloInstruction* while_op) {
-  // We can't simplify while loops that contain send/recv nodes, because we rely
-  // on the particular loop structure around the node matching on the send and
-  // recv sides.
-  if (ContainsSendOrRecv(while_op->while_body()) ||
-      ContainsSendOrRecv(while_op->while_condition())) {
-    VLOG(2) << "Not attempting to simplify while loop because it contains a "
-               "send/recv node: "
-            << while_op->ToShortString();
-    return Status::OK();
-  }
-
-  // Cowardly refuse to simplify loops that are not removable.  In practice,
-  // this means that we can't simplify loops that contain side-effecting
-  // instructions or have control predecessors/successors.
-  //
-  // This is not a fundamental limitation.  The control operands can be moved
-  // onto the new HLOs after simplification, and any side-effecting ops inside
-  // the loop aren't removed, just cloned and added back to the loop.
-  // Nevertheless our infrastructure sees loop simplification as removal of
-  // these nodes and currently doesn't allow it.
-  if (!while_op->parent()->IsRemovable(while_op)) {
-    VLOG(2) << "Not attempting to simplify while loop it is not removable: "
-            << while_op->ToShortString();
-    return Status::OK();
-  }
-
-  // Remove while loops with static trip count of 0.
-  optional<int64> trip_count = GetLoopTripCount(while_op);
-  if (trip_count && *trip_count == 0) {
-    // The loop never executes, so the value of the loop is the value of its
-    // "init" operand.
-    auto computation = while_op->parent();
-
-    // Remove while_op (i.e., call ReplaceInstruction rather than
-    // ReplaceUsesWithInstruction) so that if the algebraic simplifier is run in
-    // a loop without an intervening DCE, we don't try to re-simplify the loop.
-    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(
-        while_op, while_op->mutable_operand(0)));
-    changed_ = true;
-    return Status::OK();
-  }
-
-  // Transform while loops with static trip count of 1 into a call op, then
-  // inline the call.
-  if (trip_count && *trip_count == 1) {
-    auto computation = while_op->parent();
-    auto call_op = computation->AddInstruction(HloInstruction::CreateCall(
-        while_op->shape(), while_op->operands(), while_op->while_body()));
-    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, call_op));
-    TF_RETURN_IF_ERROR(CallInliner::Inline(call_op));
-    changed_ = true;
-    return Status::OK();
-  }
-  return Status::OK();
-}
-
 StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
   XLA_VLOG_LINES(2,
                  "AlgebraicSimplifier::Run(), before:\n" + module->ToString());
@@ -1986,7 +1757,7 @@ StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
   for (auto* comp : module->MakeNonfusionComputations()) {
     if (AlgebraicSimplifierVisitor::Run(
             comp, is_layout_sensitive_, valid_bitcast_callback_,
-            enable_dot_simplification_, enable_conv_simplification_)) {
+            enable_dot_strength_reduction_, enable_conv_simplification_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index a9f476178c7af74c275a10de7727ea64e17d590f..43315f5cdc7afbe79039420320f4a0d0535e11f1 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -40,11 +40,11 @@ class AlgebraicSimplifier : public HloPassInterface {
   // bitcasts.
   AlgebraicSimplifier(bool is_layout_sensitive,
                       ValidBitcastCallback valid_bitcast_callback,
-                      bool enable_dot_simplification = true,
+                      bool enable_dot_strength_reduction = true,
                       bool enable_conv_simplification = true)
       : is_layout_sensitive_(is_layout_sensitive),
         valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_simplification_(enable_dot_simplification),
+        enable_dot_strength_reduction_(enable_dot_strength_reduction),
         enable_conv_simplification_(enable_conv_simplification) {}
   ~AlgebraicSimplifier() override = default;
   tensorflow::StringPiece name() const override { return "algsimp"; }
@@ -58,7 +58,7 @@ class AlgebraicSimplifier : public HloPassInterface {
   ValidBitcastCallback valid_bitcast_callback_;
 
   // Enable dot simplication on platforms where it is profitable.
-  bool enable_dot_simplification_;
+  bool enable_dot_strength_reduction_;
 
   // Enable convolution simplication on platforms where it is profitable.
   bool enable_conv_simplification_;
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 87d4fc9663daf3cc2806dfa6550812dd9b08b36c..7462e397ff07779c04bce18b68419bff9686dbd5 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -47,69 +47,7 @@ AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
 
-class AlgebraicSimplifierTest : public HloVerifiedTestBase {
- public:
-  // Makes a computation that contains a loop that runs num_iters times.
-  HloComputation* MakeSimpleLoop(HloModule* module, int num_iters);
-};
-
-HloComputation* AlgebraicSimplifierTest::MakeSimpleLoop(HloModule* module,
-                                                        int num_iters) {
-  HloComputation::Builder builder(TestName());
-
-  auto loop_iter_init = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
-  auto loop_data_init = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR1<int32>({0, 1, 2})));
-  auto loop_init = builder.AddInstruction(
-      HloInstruction::CreateTuple({loop_iter_init, loop_data_init}));
-
-  HloComputation* condition;
-  {
-    HloComputation::Builder cond_builder(TestName() + ".condition");
-    auto loop_var = cond_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    auto loop_induction_var =
-        cond_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            ShapeUtil::MakeShape(S32, {}), loop_var, 0));
-    auto limit = cond_builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<int32>(42 + num_iters)));
-    cond_builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, loop_induction_var,
-        limit));
-    condition = module->AddEmbeddedComputation(cond_builder.Build());
-  }
-
-  HloComputation* body;
-  {
-    HloComputation::Builder body_builder(TestName() + ".body");
-    auto loop_var = body_builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
-    auto loop_induction_var =
-        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            ShapeUtil::MakeShape(S32, {}), loop_var, 0));
-    auto new_loop_induction_var =
-        body_builder.AddInstruction(HloInstruction::CreateBinary(
-            loop_induction_var->shape(), HloOpcode::kAdd, loop_induction_var,
-            body_builder.AddInstruction(
-                HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)))));
-    auto loop_data =
-        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
-            loop_data_init->shape(), loop_var, 1));
-    auto new_loop_data =
-        body_builder.AddInstruction(HloInstruction::CreateBinary(
-            loop_data_init->shape(), HloOpcode::kMultiply, loop_data,
-            loop_data));
-    body_builder.AddInstruction(
-        HloInstruction::CreateTuple({new_loop_induction_var, new_loop_data}));
-    body = module->AddEmbeddedComputation(body_builder.Build());
-  }
-
-  builder.AddInstruction(HloInstruction::CreateWhile(
-      loop_init->shape(), condition, body, loop_init));
-
-  return module->AddEntryComputation(builder.Build());
-}
+class AlgebraicSimplifierTest : public HloVerifiedTestBase {};
 
 // Test that A + 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, AddZero) {
@@ -433,6 +371,31 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   EXPECT_EQ(root, param0);
 }
 
+// Test that complex(real(c), imag(c)) is simplified to c.
+TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
+  Shape r2c64 = ShapeUtil::MakeShape(C64, {2, 2});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2c64, "param0"));
+  HloInstruction* real = builder.AddInstruction(
+      HloInstruction::CreateUnary(r2f32, HloOpcode::kReal, param0));
+  HloInstruction* imag = builder.AddInstruction(
+      HloInstruction::CreateUnary(r2f32, HloOpcode::kImag, param0));
+  HloInstruction* cplx = builder.AddInstruction(
+      HloInstruction::CreateBinary(r2c64, HloOpcode::kComplex, real, imag));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, cplx);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+  EXPECT_EQ(root, param0);
+}
+
 // Test that real(complex(r,i)) is simplified to r.
 TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
   Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 2});
@@ -798,8 +761,10 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Divide(op::Constant(), param0));
-  EXPECT_EQ(root->operand(0)->literal().GetFirstElement<float>(), 1);
+  EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kBroadcast);
+  EXPECT_EQ(root->operand(0)->operand(0)->literal().GetFirstElement<float>(),
+            1);
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
@@ -1659,8 +1624,11 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     ConvolutionDimensionNumbers dnums;
     std::vector<int64> in_dims;
     int in_channel_idx = -1;
-    dnums.add_spatial_dimensions(-1);  // filled in later
-    dnums.add_spatial_dimensions(-1);  // filled in later
+    // filled in later
+    dnums.add_input_spatial_dimensions(-1);
+    dnums.add_output_spatial_dimensions(-1);
+    dnums.add_input_spatial_dimensions(-1);
+    dnums.add_output_spatial_dimensions(-1);
     for (int i = 0; i < strlen(options.dim_order); ++i) {
       char ch = options.dim_order[i];
       if (ch == 'N') {
@@ -1668,10 +1636,12 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
         dnums.set_output_batch_dimension(i);
         in_dims.push_back(options.in_batch);
       } else if (ch == 'H') {
-        dnums.set_spatial_dimensions(0, i);
+        dnums.set_input_spatial_dimensions(0, i);
+        dnums.set_output_spatial_dimensions(0, i);
         in_dims.push_back(options.in_height);
       } else if (ch == 'W') {
-        dnums.set_spatial_dimensions(1, i);
+        dnums.set_input_spatial_dimensions(1, i);
+        dnums.set_output_spatial_dimensions(1, i);
         in_dims.push_back(options.in_width);
       } else if (ch == 'C') {
         dnums.set_input_feature_dimension(i);
@@ -2168,8 +2138,10 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, r1f32, "x"));
   HloInstruction* y =
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r1f32, HloOpcode::kDot, x, y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(HloInstruction::CreateDot(r1f32, x, y, dot_dnums));
   std::unique_ptr<HloComputation> dot_computation(builder.Build());
 
   HloComputation::Builder call_builder(TestName() + ".Call");
@@ -2208,99 +2180,6 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
               op::Tuple(op::Constant(), op::Constant()));
 }
 
-TEST_F(AlgebraicSimplifierTest, WhileLoopWithZeroIterations) {
-  HloModule module(TestName());
-  HloComputation* computation = MakeSimpleLoop(&module, /*num_iters=*/0);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(op::Constant(), op::Constant()));
-}
-
-TEST_F(AlgebraicSimplifierTest, WhileLoopWithOneIteration) {
-  HloModule module(TestName());
-  HloComputation* computation = MakeSimpleLoop(&module, /*num_iters=*/1);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(op::Add(), op::Multiply()));
-}
-
-TEST_F(AlgebraicSimplifierTest, WhileLoopWithTwoIterations) {
-  HloModule module(TestName());
-  MakeSimpleLoop(&module, /*num_iters=*/2);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-}
-
-TEST_F(AlgebraicSimplifierTest, WhileLoopWithControlDependency) {
-  HloModule module(TestName());
-  HloComputation* computation = MakeSimpleLoop(&module, /*num_iters=*/1);
-  auto* while_op = computation->root_instruction();
-  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
-  auto* true_op = while_op->while_body()->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
-  TF_ASSERT_OK(true_op->AddControlDependencyTo(
-      while_op->while_body()->root_instruction()));
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction()->control_predecessors(),
-              ElementsAre(op::Constant()))
-      << computation->ToString();
-}
-
-// Loops that contain send/recv nodes can't be simplified; the loop structure
-// around send/recv nodes must be preserved.
-TEST_F(AlgebraicSimplifierTest, NotRemovedIfContainsSend) {
-  HloModule module(TestName());
-  HloComputation* computation = MakeSimpleLoop(&module, /*num_iters=*/1);
-  auto* while_op = computation->root_instruction();
-  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
-  auto* while_body = while_op->while_body();
-  while_body->AddInstruction(HloInstruction::CreateSend(
-      while_body->AddInstruction(
-          HloInstruction::CreateConstant(Literal::CreateR0<bool>(true))),
-      /*channel_id=*/0));
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-}
-
-TEST_F(AlgebraicSimplifierTest, NotRemovedIfContainsRecv) {
-  HloModule module(TestName());
-  HloComputation* computation = MakeSimpleLoop(&module, /*num_iters=*/1);
-  auto* while_op = computation->root_instruction();
-  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
-  auto* while_body = while_op->while_body();
-  while_body->AddInstruction(
-      HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}),
-                                 /*channel_id=*/0));
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-}
-
-// The limitation on not being able to simplify loops that contain infeeds (and
-// other non-removable instructions) isn't fundamental -- it just stems from the
-// fact that our infrastructure sees simplifying such a loop as tantamount to
-// removing the non-removable instruction.
-TEST_F(AlgebraicSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) {
-  HloModule module(TestName());
-  HloComputation* computation = MakeSimpleLoop(&module, /*num_iters=*/1);
-  auto* while_op = computation->root_instruction();
-  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
-  auto* while_body = while_op->while_body();
-  while_body->AddInstruction(
-      HloInstruction::CreateInfeed(ShapeUtil::MakeShape(F32, {1}), "config"));
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(&module).ValueOrDie());
-}
-
 // A dynamic-slice is trivial if its start indices are all zeroes and the size
 // of its input equals the size of its output.  In this case, the dynamic slice
 // is equal to its input.
@@ -2359,5 +2238,63 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
               op::DynamicSlice(op::Parameter(), op::Parameter()));
 }
 
+class DotStrengthReductionTest
+    : public AlgebraicSimplifierTest,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<int, int, int, bool, bool>> {};
+TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
+  int m, k, n;
+  bool transpose_lhs, transpose_rhs;
+  std::tie(m, k, n, transpose_lhs, transpose_rhs) = GetParam();
+
+  Shape dot_shape = ShapeUtil::MakeShape(F32, {m, n});
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
+  Shape transposed_lhs_shape = ShapeUtil::MakeShape(F32, {k, m});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {k, n});
+  Shape transposed_rhs_shape = ShapeUtil::MakeShape(F32, {n, k});
+  HloComputation::Builder builder(TestName());
+
+  auto lhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, transpose_lhs ? transposed_lhs_shape : lhs_shape, "lhs"));
+  if (transpose_lhs) {
+    lhs = builder.AddInstruction(
+        HloInstruction::CreateTranspose(lhs_shape, lhs, {1, 0}));
+  }
+  auto rhs = builder.AddInstruction(HloInstruction::CreateParameter(
+      1, transpose_rhs ? transposed_rhs_shape : rhs_shape, "rhs"));
+  if (transpose_rhs) {
+    rhs = builder.AddInstruction(
+        HloInstruction::CreateTranspose(rhs_shape, rhs, {1, 0}));
+  }
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  builder.AddInstruction(
+      HloInstruction::CreateDot(dot_shape, lhs, rhs, dot_dnums));
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
+  const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
+  const bool computation_should_be_modified =
+      dot_should_be_transformed || (transpose_lhs && transpose_rhs);
+  EXPECT_EQ(changed, computation_should_be_modified);
+  bool has_no_dot = true;
+  for (const auto& hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kDot) {
+      has_no_dot = false;
+      break;
+    }
+  }
+  EXPECT_EQ(has_no_dot, dot_should_be_transformed);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DotStrengthReductionTestInstantiation, DotStrengthReductionTest,
+    ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
+                       ::testing::Values(1, 2), ::testing::Bool(),
+                       ::testing::Bool()));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 9abe30e3f371cc294c36c1dcd743224b11b0c4f5..05f2d062784147108a94ffb7bb0ca42ddfe4f010 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#define EIGEN_USE_THREADS
+
 #include "tensorflow/compiler/xla/service/backend.h"
 
 #include <algorithm>
 #include <string>
 #include <utility>
 
-#define EIGEN_USE_THREADS
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
diff --git a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
index abe881cd1a58a6173b9b93f10a7308d70106c889..2bbae25aee3db95406fd247deb788d2976207ba3 100644
--- a/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_rewriter.cc
@@ -85,9 +85,9 @@ class BatchNormRewriterVisitor : public DfsHloVisitorWithDefault {
                                              HloOpcode opcode) {
     HloComputation::Builder b("scalar_computation");
     auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter(
-        0, ShapeUtil::MakeShape(F32, {}), "scalar_lhs"));
+        0, ShapeUtil::MakeShape(primitive_type, {}), "scalar_lhs"));
     auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter(
-        1, ShapeUtil::MakeShape(F32, {}), "scalar_rhs"));
+        1, ShapeUtil::MakeShape(primitive_type, {}), "scalar_rhs"));
     auto scalar_op = b.AddInstruction(
         HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}),
                                      opcode, scalar_lhs, scalar_rhs));
@@ -149,26 +149,41 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining(
   if (!rewrite_training_op_) {
     return Status::OK();
   }
+
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
+
   // Expand batch norm training into smaller HLO ops.
   HloInstruction* operand = batch_norm->mutable_operand(0);
   const Shape operand_shape = operand->shape();
+  PrimitiveType ptype = operand_shape.element_type();
   int64 feature_index = batch_norm->feature_index();
   const int64 feature_count = operand_shape.dimensions(feature_index);
   const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
-  auto elements_per_feature =
-      computation_->AddInstruction(HloInstruction::CreateConstant(
-          Literal::CreateR0<float>(size_in_elements / feature_count)));
+  auto elements_per_feature_literal =
+      Literal::CreateR0<float>(size_in_elements / feature_count);
+  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
+                      elements_per_feature_literal->Convert(ptype));
+  auto elements_per_feature = add(
+      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
   const Shape feature_shape = scale->shape();
 
-  auto zero = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  auto zero_literal = Literal::CreateR0(0.0f);
+  TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
+  auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
+  auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
+  TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
+  auto epsilon =
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
   std::vector<int64> dimensions_without_feature;
 
   for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
@@ -177,103 +192,110 @@ Status BatchNormRewriterVisitor::HandleBatchNormTraining(
     }
   }
 
-  auto scale_broadcasted = computation_->AddInstruction(
+  auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
-  auto offset_broadcasted = computation_->AddInstruction(
+  auto offset_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
   HloComputation* add_reduce_computation =
-      GetScalarBinaryComputation(F32, HloOpcode::kAdd);
+      GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
 
   // X^2.
-  auto operand_squared =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, operand, operand));
+  auto operand_squared = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, operand, operand));
   // Sum[X].
-  auto sum = computation_->AddInstruction(HloInstruction::CreateReduce(
-      feature_shape, operand, zero, dimensions_without_feature,
-      add_reduce_computation));
+  auto sum = add(HloInstruction::CreateReduce(feature_shape, operand, zero,
+                                              dimensions_without_feature,
+                                              add_reduce_computation));
 
   // Sum[X^2].
-  auto squared_sum = computation_->AddInstruction(HloInstruction::CreateReduce(
+  auto squared_sum = add(HloInstruction::CreateReduce(
       feature_shape, operand_squared, zero, dimensions_without_feature,
       add_reduce_computation));
 
   // Fuse two parallel reduces together to improve performance.
-  if (use_fusion_) {
-    auto tuple = computation_->AddInstruction(
-        HloInstruction::CreateTuple({sum, squared_sum}));
+  if (use_fusion_ && !batch_norm->has_sharding()) {
+    auto tuple = add(HloInstruction::CreateTuple({sum, squared_sum}));
 
     auto fused = computation_->CreateFusionInstruction(
         {tuple, sum, squared_sum, operand_squared},
         HloInstruction::FusionKind::kInput);
 
-    sum = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+    sum = add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
 
-    squared_sum = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+    squared_sum =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
   }
 
   // E[X].
-  auto mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto mean = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kDivide, sum, elements_per_feature));
 
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto square_mean = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature));
 
   // E^2[X].
-  auto mean_square = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto mean_square = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kMultiply, mean, mean));
 
   // Var[X].
-  auto var = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto var = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kSubtract, square_mean, mean_square));
 
-  auto var_broadcasted = computation_->AddInstruction(
-      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+  auto var_broadcasted =
+      add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+  auto var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
 
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
+  auto neg_half_literal = Literal::CreateR0(-0.5f);
+  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
 
   // X - E[X].
-  auto operand_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = computation_->AddInstruction(
+  auto normalized = add(
       HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
                                    operand_minus_mean, rsqrt_var_add_epsilon));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
-  auto shifted_normalized = computation_->AddInstruction(
-      HloInstruction::CreateBinary(operand_shape, HloOpcode::kAdd,
-                                   scaled_normalized, offset_broadcasted));
-
-  TF_CHECK_OK(ReplaceWithNewInstruction(
-      batch_norm,
-      HloInstruction::CreateTuple({shifted_normalized, mean, var})));
+  auto shifted_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted));
+
+  auto tuple = HloInstruction::CreateTuple({shifted_normalized, mean, var});
+
+  if (batch_norm->has_sharding()) {
+    int64 instruction_count_after = computation_->instruction_count();
+    CHECK_EQ(instruction_count_after,
+             instruction_count_before + added_instructions.size());
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), operand_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    tuple->set_sharding(batch_norm->sharding());
+  }
+  TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple)));
   return Status::OK();
 }
 
@@ -286,6 +308,7 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
   HloInstruction* operand = batch_norm->mutable_operand(0);
   const Shape operand_shape = operand->shape();
   int64 feature_index = batch_norm->feature_index();
+  PrimitiveType ptype = operand_shape.element_type();
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
@@ -293,8 +316,10 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
   HloInstruction* var = batch_norm->mutable_operand(4);
   const Shape feature_shape = scale->shape();
 
+  auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
+  TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
   auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+      HloInstruction::CreateConstant(std::move(epsilon_literal)));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -304,50 +329,69 @@ Status BatchNormRewriterVisitor::HandleBatchNormInference(
     }
   }
 
-  auto scale_broadcasted = computation_->AddInstruction(
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
+
+  auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
-  auto offset_broadcasted = computation_->AddInstruction(
+  auto offset_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index}));
 
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
-  auto var_broadcasted = computation_->AddInstruction(
-      HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
+  auto var_broadcasted =
+      add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
+  auto var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
 
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
+  auto neg_half_literal = Literal::CreateR0(-0.5f);
+  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
 
   // X - E[X].
-  auto operand_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = computation_->AddInstruction(
+  auto normalized = add(
       HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
                                    operand_minus_mean, rsqrt_var_add_epsilon));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add(HloInstruction::CreateBinary(
+      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
   auto shifted_normalized = HloInstruction::CreateBinary(
       operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted);
 
+  int64 instruction_count_after = computation_->instruction_count();
+  CHECK_EQ(instruction_count_after,
+           instruction_count_before + added_instructions.size());
+  if (batch_norm->has_sharding()) {
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), operand_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    shifted_normalized->set_sharding(batch_norm->sharding());
+  }
   TF_CHECK_OK(
       ReplaceWithNewInstruction(batch_norm, std::move(shifted_normalized)));
   return Status::OK();
@@ -370,9 +414,17 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
   if (!rewrite_grad_op_) {
     return Status::OK();
   }
+  std::vector<HloInstruction*> added_instructions;
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    HloInstruction* added_inst = computation_->AddInstruction(std::move(inst));
+    added_instructions.push_back(added_inst);
+    return added_inst;
+  };
+  int64 instruction_count_before = computation_->instruction_count();
 
   HloInstruction* activation = batch_norm->mutable_operand(0);
   const Shape activation_shape = activation->shape();
+  PrimitiveType ptype = activation_shape.element_type();
   HloInstruction* scale = batch_norm->mutable_operand(1);
   const Shape feature_shape = scale->shape();
   HloInstruction* mean = batch_norm->mutable_operand(2);
@@ -383,18 +435,26 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
 
   const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape);
   const int64 feature_count = activation_shape.dimensions(feature_index);
-  auto elements_per_feature =
-      computation_->AddInstruction(HloInstruction::CreateConstant(
-          Literal::CreateR0<float>(size_in_elements / feature_count)));
-
-  auto zero = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(0.0f)));
-
-  auto neg_half = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(-0.5f)));
-
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0(batch_norm->epsilon())));
+  auto elements_per_feature_literal =
+      Literal::CreateR0<float>(size_in_elements / feature_count);
+  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
+                      elements_per_feature_literal->Convert(ptype));
+  auto elements_per_feature = add(
+      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
+
+  auto zero_literal = Literal::CreateR0(0.0f);
+  TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
+  auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
+
+  auto neg_half_literal = Literal::CreateR0(-0.5f);
+  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
+  auto neg_half =
+      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
+
+  auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
+  TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
+  auto epsilon =
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -404,126 +464,131 @@ Status BatchNormRewriterVisitor::HandleBatchNormGrad(
     }
   }
 
-  auto scale_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBroadcast(
-          activation_shape, scale, {feature_index}));
-  auto variance_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBroadcast(
-          activation_shape, variance, {feature_index}));
+  auto scale_broadcasted = add(HloInstruction::CreateBroadcast(
+      activation_shape, scale, {feature_index}));
+  auto variance_broadcasted = add(HloInstruction::CreateBroadcast(
+      activation_shape, variance, {feature_index}));
 
   // E[X].
-  auto mean_broadcasted = computation_->AddInstruction(
+  auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(activation_shape, mean, {feature_index}));
 
   // rsqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon_broadcasted =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kPower,
-          computation_->AddInstruction(
-              HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
-                                           variance_broadcasted, epsilon)),
-          neg_half));
-
-  auto rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          feature_shape, HloOpcode::kPower,
-          computation_->AddInstruction(HloInstruction::CreateBinary(
-              feature_shape, HloOpcode::kAdd, variance, epsilon)),
-          neg_half));
+  auto rsqrt_var_add_epsilon_broadcasted = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kPower,
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
+                                       variance_broadcasted, epsilon)),
+      neg_half));
+
+  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      feature_shape, HloOpcode::kPower,
+      add(HloInstruction::CreateBinary(feature_shape, HloOpcode::kAdd, variance,
+                                       epsilon)),
+      neg_half));
 
   // X - E[X].
-  auto activation_minus_mean = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
-                                   activation, mean_broadcasted));
+  auto activation_minus_mean = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted));
 
   // Grad[Y] * (X - E[X]).
-  auto grad_output_times_activiation_minus_mean = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   grad_output, activation_minus_mean));
+  auto grad_output_times_activiation_minus_mean =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       grad_output, activation_minus_mean));
 
   HloComputation* add_reduce_computation =
-      GetScalarBinaryComputation(F32, HloOpcode::kAdd);
+      GetScalarBinaryComputation(ptype, HloOpcode::kAdd);
 
   // sum(Grad[Y] * (X - E[X])).
   auto sum_grad_output_times_activiation_minus_mean =
-      computation_->AddInstruction(HloInstruction::CreateReduce(
+      add(HloInstruction::CreateReduce(
           feature_shape, grad_output_times_activiation_minus_mean, zero,
           dimensions_without_feature, add_reduce_computation));
 
   // Grad[beta] = Sum(Grad[Y]).
-  auto grad_beta = computation_->AddInstruction(HloInstruction::CreateReduce(
+  auto grad_beta = add(HloInstruction::CreateReduce(
       feature_shape, grad_output, zero, dimensions_without_feature,
       add_reduce_computation));
 
-  if (use_fusion_) {
-    auto tuple = computation_->AddInstruction(HloInstruction::CreateTuple(
+  if (use_fusion_ && !batch_norm->has_sharding()) {
+    auto tuple = add(HloInstruction::CreateTuple(
         {sum_grad_output_times_activiation_minus_mean, grad_beta}));
 
     auto fused = computation_->CreateFusionInstruction(
         {tuple, sum_grad_output_times_activiation_minus_mean, grad_beta},
         HloInstruction::FusionKind::kInput);
 
-    sum_grad_output_times_activiation_minus_mean = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
+    sum_grad_output_times_activiation_minus_mean =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
 
-    grad_beta = computation_->AddInstruction(
-        HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
+    grad_beta =
+        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
   }
 
   // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]).
-  auto grad_scale = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto grad_scale = add(HloInstruction::CreateBinary(
       feature_shape, HloOpcode::kMultiply,
       sum_grad_output_times_activiation_minus_mean, rsqrt_var_add_epsilon));
 
   // I2 = Sum(Grad[Y])
-  auto I2 = computation_->AddInstruction(HloInstruction::CreateBroadcast(
-      activation_shape, grad_beta, {feature_index}));
+  auto i2 = add(HloInstruction::CreateBroadcast(activation_shape, grad_beta,
+                                                {feature_index}));
 
   // I3 = Sum(Grad[Y] * (X - E[X]))
-  auto I3 = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+  auto i3 = add(HloInstruction::CreateBroadcast(
       activation_shape, sum_grad_output_times_activiation_minus_mean,
       {feature_index}));
 
   // I4 = (X - E[X]) * I3
-  auto I4 = computation_->AddInstruction(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kMultiply, I3, activation_minus_mean));
+  auto i4 = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kMultiply, i3, activation_minus_mean));
 
   // I5 = I4 / (Var[X] + epsilon)
-  auto I5 = computation_->AddInstruction(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kDivide, I4,
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kAdd, variance_broadcasted, epsilon))));
+  auto i5 = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kDivide, i4,
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
+                                       variance_broadcasted, epsilon))));
 
   // scale * rsqrt[Var[X] + epsilon] * 1/N
-  auto scale_times_rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kMultiply, scale_broadcasted,
-          rsqrt_var_add_epsilon_broadcasted));
+  auto scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kMultiply, scale_broadcasted,
+      rsqrt_var_add_epsilon_broadcasted));
 
-  scale_times_rsqrt_var_add_epsilon =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kDivide,
-          scale_times_rsqrt_var_add_epsilon, elements_per_feature));
+  scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
+      activation_shape, HloOpcode::kDivide, scale_times_rsqrt_var_add_epsilon,
+      elements_per_feature));
 
-  auto I1 = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   grad_output, elements_per_feature));
+  auto i1 =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       grad_output, elements_per_feature));
 
   // I6 = I1 - I2 - I5
-  auto I6 = computation_->AddInstruction(HloInstruction::CreateBinary(
+  auto i6 = add(HloInstruction::CreateBinary(
       activation_shape, HloOpcode::kSubtract,
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          activation_shape, HloOpcode::kSubtract, I1, I2)),
-      I5));
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
+                                       i1, i2)),
+      i5));
 
   // Grad[X] = scale * rsqrt[Var[X] + epsilon] * 1/N * I6.
-  auto grad_activation = computation_->AddInstruction(
-      HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                   scale_times_rsqrt_var_add_epsilon, I6));
+  auto grad_activation =
+      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
+                                       scale_times_rsqrt_var_add_epsilon, i6));
+  auto tuple =
+      HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta});
+  if (batch_norm->has_sharding()) {
+    int64 instruction_count_after = computation_->instruction_count();
+    CHECK_EQ(instruction_count_after,
+             instruction_count_before + added_instructions.size());
+    for (HloInstruction* inst : added_instructions) {
+      if (ShapeUtil::Equal(inst->shape(), activation_shape)) {
+        inst->set_sharding(batch_norm->sharding());
+      } else {
+        inst->set_sharding(HloSharding::Replicate());
+      }
+    }
+    tuple->set_sharding(batch_norm->sharding());
+  }
 
-  TF_CHECK_OK(ReplaceWithNewInstruction(
-      batch_norm,
-      HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta})));
+  TF_CHECK_OK(ReplaceWithNewInstruction(batch_norm, std::move(tuple)));
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 8536429846f87fd5c4b073cc4b13b3f1c5eb2e5c..7ece79d781acfaffc21d6a29e8a12e68622a1617 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -101,6 +101,11 @@ BufferAllocationProto BufferAllocation::ToProto() const {
     proto_assigned->set_offset(buffer_offset_size.second.offset);
     proto_assigned->set_size(buffer_offset_size.second.size);
   }
+  std::sort(proto.mutable_assigned()->begin(), proto.mutable_assigned()->end(),
+            [](const BufferAllocationProto::Assigned& assign1,
+               const BufferAllocationProto::Assigned& assign2) {
+              return assign1.logical_buffer_id() < assign2.logical_buffer_id();
+            });
   return proto;
 }
 
@@ -260,6 +265,42 @@ bool BufferAssignment::SharesSliceAtIndex(
          GetUniqueSlice(hlo_b, shape_index_b).ConsumeValueOrDie();
 }
 
+bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
+                                          const HloInstruction* hlo_b) const {
+  using SliceSet =
+      FlatSet<BufferAllocation::Slice, BufferAllocation::Slice::Hasher>;
+  // Gets the slices all of instr's subshapes.  If any subshape doesn't have an
+  // assigned slice, returns the empty set.
+  auto collect_slices = [&](const HloInstruction* instr) -> SliceSet {
+    SliceSet slices;
+    Status status = ShapeUtil::ForEachSubshapeWithStatus(
+        instr->shape(),
+        [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          auto shape_slices = GetAllSlices(instr, index);
+          if (shape_slices.empty()) {
+            return InvalidArgument("No slices assigned to part of instr.");
+          }
+          slices.insert(shape_slices.begin(), shape_slices.end());
+          return Status::OK();
+        });
+    if (!status.ok()) {
+      return {};
+    }
+    return slices;
+  };
+
+  SliceSet slices_a = collect_slices(hlo_a);
+  SliceSet slices_b = collect_slices(hlo_b);
+  // hlo_a and hlo_b have disjoint slices if collect_slices succeeded (i.e.
+  // didn't return the empty set) for both HLOs, and the two resulting sets of
+  // slices are disjoint.
+  return !slices_a.empty() && !slices_b.empty() &&
+         std::none_of(slices_a.begin(), slices_a.end(),
+                      [&](const BufferAllocation::Slice& slice) {
+                        return slices_b.count(slice) > 0;
+                      });
+}
+
 StatusOr<BufferAllocation::Slice>
 BufferAssignment::GetUniqueTopLevelOutputSlice() const {
   return GetUniqueTopLevelSlice(
@@ -492,19 +533,19 @@ Status GatherComputationsByAllocationType(
     std::vector<const HloComputation*>* global_computations) {
   // Create a worklist of computations paired with whether the allocation must
   // be thread-local.
-  std::deque<std::pair<HloComputation*, bool>> worklist;
+  std::deque<std::pair<const HloComputation*, bool>> worklist;
   worklist.push_back(std::make_pair(module->entry_computation(),
                                     /*is_thread_local*/ false));
 
   // Sets for quickly checking membership. Computations are returned in vectors
   // for stable iteration.
-  FlatSet<HloComputation*> thread_local_set;
-  FlatSet<HloComputation*> global_set;
+  FlatSet<const HloComputation*> thread_local_set;
+  FlatSet<const HloComputation*> global_set;
 
   while (!worklist.empty()) {
     auto worklist_front = worklist.front();
     worklist.pop_front();
-    HloComputation* computation = worklist_front.first;
+    const HloComputation* computation = worklist_front.first;
     bool is_thread_local = worklist_front.second;
     bool in_thread_local_set = thread_local_set.count(computation) > 0;
     bool in_global_set = global_set.count(computation) > 0;
@@ -540,6 +581,7 @@ Status GatherComputationsByAllocationType(
            instruction->called_computations()) {
         switch (instruction->opcode()) {
           case HloOpcode::kCall:
+          case HloOpcode::kConditional:
           case HloOpcode::kWhile:
             // Call and while must be called from a computation with global
             // allocations as they may return references to buffers inside the
@@ -648,7 +690,7 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
   }
 
   if (allow_input_output_aliasing_ && allocation->maybe_live_out()) {
-    HloComputation* entry_computation =
+    const HloComputation* entry_computation =
         assignment->module_->entry_computation();
     for (auto param : entry_computation->parameter_instructions()) {
       for (auto& param_buffer :
@@ -814,17 +856,6 @@ Status BufferAssigner::AssignBuffersForComputation(
       continue;
     }
 
-    if (instruction->opcode() == HloOpcode::kRecv) {
-      // Make sure that recv operations get a new unique allocation so that
-      // don't share their buffer with any other operations.
-      BufferAllocation* allocation = assignment->NewAllocation(
-          *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
-      allocation_indices.push_back(allocation->index());
-      VLOG(3) << "New allocation #" << allocation->index()
-              << " for recv: " << *buffer;
-      continue;
-    }
-
     if (ShapeUtil::IsTuple(buffer->shape())) {
       // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
       // assumes longer buffer liveness than indicated by the analysis.
@@ -946,8 +977,8 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
   const HloOrdering& hlo_ordering = assignment->liveness().hlo_ordering();
   if (run_whole_module_heap_simulation) {
     // Run the heap simulation over the whole module. This reduces memory usage,
-    // since buffers for kCall and kWhile sub-computations are only live for the
-    // duration of their calling instructions.
+    // since buffers for kCall, kWhile, and kConditional sub-computations are
+    // only live for the duration of their calling instructions.
     VLOG(1) << "Running whole-module heap simulation";
     SequentialHloOrdering::HloModuleSequence module_sequence;
     FlatSet<const LogicalBuffer*> all_buffers_to_assign;
@@ -1235,7 +1266,6 @@ const LogicalBuffer* AddBufferToColocatedSet(
   // CopyInsertion ensures root points-to set is unambiguous and distinct.
   const auto& points_to = points_to_analysis.GetPointsToSet(instruction);
   DCHECK(!points_to.IsAmbiguous());
-  DCHECK(points_to.IsDistinct());
   colocated_set->push_back(points_to.element(index)[0]);
   return colocated_set->back();
 }
@@ -1243,7 +1273,8 @@ const LogicalBuffer* AddBufferToColocatedSet(
 }  // namespace
 
 // Builds sets of buffers in 'colocated_buffer_sets' which should be colocated
-// in the same allocation (currently just supports kWhile and kCall).
+// in the same allocation (currently just supports kWhile, kCall, and
+// kConditional).
 void BufferAssigner::BuildColocatedBufferSets(
     const HloModule* module, const BufferLiveness& buffer_liveness,
     const LogicalBuffer::SizeFunction& buffer_size,
@@ -1307,6 +1338,26 @@ void BufferAssigner::BuildColocatedBufferSets(
                                       &colocated_set);
               AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
             });
+      } else if (opcode == HloOpcode::kConditional) {
+        const HloInstruction* conditional_hlo = instruction;
+        ShapeUtil::ForEachSubshape(
+            conditional_hlo->shape(),
+            [this, conditional_hlo, &points_to_analysis, colocated_buffer_sets](
+                const Shape& /*subshape*/, const ShapeIndex& index) {
+              std::vector<const LogicalBuffer*> colocated_set;
+              // Add conditional.result.
+              AddBufferToColocatedSet(conditional_hlo, index,
+                                      points_to_analysis, &colocated_set);
+              // Add conditional.true_computation.root.
+              AddBufferToColocatedSet(
+                  conditional_hlo->true_computation()->root_instruction(),
+                  index, points_to_analysis, &colocated_set);
+              // Add conditional.false_computation.root.
+              AddBufferToColocatedSet(
+                  conditional_hlo->false_computation()->root_instruction(),
+                  index, points_to_analysis, &colocated_set);
+              AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
+            });
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 08a53af8baa3f250919517c87c023c329b129024..08a40bfeb2a2a78c25805308e73154c6cc667f21 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -327,6 +327,12 @@ class BufferAssignment {
     return SharesSliceAtIndex(hlo_a, {}, hlo_b, {});
   }
 
+  // Returns true if hlo_a and hlo_b both have at least one buffer assigned for
+  // their top-level and each of their nested shape indices, and if hlo_a's
+  // buffers are all different from hlo_b's buffers.
+  bool HaveDisjointSlices(const HloInstruction* hlo_a,
+                          const HloInstruction* hlo_b) const;
+
   // Returns the underlying points-to analysis used for this assignment.
   const TuplePointsToAnalysis& points_to_analysis() const {
     return liveness_->points_to_analysis();
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 89410f42bd7b5fa8f9b380c868fcd4fedb54576c..6fc9d783f1b34de8c0f93c6aa342591891d08eaf 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -85,7 +85,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
                                                         int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, MakeUnique<DependencyHloOrdering>(module),
+               module, xla::MakeUnique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; })
         .ConsumeValueOrDie();
@@ -94,7 +94,7 @@ class BufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunColoredBufferAssignment(
       HloModule* module, BufferLiveness::Colorer colorer, int64 alignment = 1) {
     return BufferAssigner::Run(
-               module, MakeUnique<DependencyHloOrdering>(module),
+               module, xla::MakeUnique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
                [alignment](LogicalBuffer::Color) { return alignment; }, false,
                std::move(colorer))
@@ -166,6 +166,15 @@ class BufferAssignmentTest : public HloTestBase {
     return builder.Build();
   }
 
+  std::unique_ptr<HloComputation> BuildR0F32UnaryOpComputation(
+      HloOpcode opcode, const string& name) {
+    auto builder = HloComputation::Builder(name);
+    auto param =
+        builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "x"));
+    builder.AddInstruction(HloInstruction::CreateUnary(r0f32_, opcode, param));
+    return builder.Build();
+  }
+
   // Verifies that the given instruction hlo has a valid input buffer assigned,
   // i.e., the parameter number matches the op's.
   const BufferAllocation& GetAssignedInputAllocation(
@@ -740,6 +749,56 @@ TEST_F(BufferAssignmentTest, ExampleWhile) {
             << " instructions; total buffer size " << size0 + sizec + sizeb;
 }
 
+TEST_F(BufferAssignmentTest, ExampleConditional) {
+  auto module = CreateNewModule();
+  auto true_computation = module->AddEmbeddedComputation(
+      BuildR0F32UnaryOpComputation(HloOpcode::kCeil, "Ceil"));
+  auto false_computation = module->AddEmbeddedComputation(
+      BuildR0F32UnaryOpComputation(HloOpcode::kFloor, "Floor"));
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto const1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.4f)));
+  auto const2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.4f)));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      r0f32_, pred, const1, true_computation, const2, false_computation));
+  module->AddEntryComputation(builder.Build());
+
+  const std::vector<const HloInstruction*> conditional_instrs =
+      GetInstructions(conditional);
+  const std::vector<const HloInstruction*> true_instrs =
+      GetInstructions(true_computation->root_instruction());
+  const std::vector<const HloInstruction*> false_instrs =
+      GetInstructions(false_computation->root_instruction());
+  EXPECT_EQ(4, conditional_instrs.size());
+  EXPECT_EQ(2, true_instrs.size());
+  EXPECT_EQ(2, false_instrs.size());
+
+  auto buffers = RunBufferAssignment(module.get());
+  ValidateBuffers(conditional_instrs, *buffers);
+  ValidateBuffers(true_instrs, *buffers);
+  ValidateBuffers(false_instrs, *buffers);
+
+  EXPECT_FALSE(BuffersDistinct(conditional_instrs, true_instrs, *buffers))
+      << "Should be reuse between conditional and true computation.";
+  EXPECT_FALSE(BuffersDistinct(conditional_instrs, false_instrs, *buffers))
+      << "Should be reuse between conditional and false computation.";
+  EXPECT_FALSE(BuffersDistinct(true_instrs, false_instrs, *buffers))
+      << "Should be reuse between true and false computations.";
+
+  const BufferAllocation& conditional_buffer =
+      GetTopLevelAllocation(*buffers, conditional);
+  const BufferAllocation& true_buffer =
+      GetTopLevelAllocation(*buffers, true_computation->root_instruction());
+  const BufferAllocation& false_buffer =
+      GetTopLevelAllocation(*buffers, false_computation->root_instruction());
+  EXPECT_EQ(conditional_buffer.size(), true_buffer.size());
+  EXPECT_EQ(conditional_buffer.size(), false_buffer.size());
+}
+
 TEST_F(BufferAssignmentTest, UnaryOpReuseChain) {
   // param0[100] ---> (exp) ---> (tanh) ---> (exp) ---> (neg)
   auto builder = HloComputation::Builder(TestName());
@@ -1360,10 +1419,13 @@ TEST_F(BufferAssignmentTest, OneTempAllocation) {
       HloInstruction::CreateParameter(1, shape_3x4, "param_b"));
   auto param_c = builder.AddInstruction(
       HloInstruction::CreateParameter(2, shape_4x4, "param_c"));
-  auto dot_ab = builder.AddInstruction(HloInstruction::CreateBinary(
-      shape_2x4, HloOpcode::kDot, param_a, param_b));
-  auto dot_bc = builder.AddInstruction(HloInstruction::CreateBinary(
-      shape_3x4, HloOpcode::kDot, param_b, param_c));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot_ab = builder.AddInstruction(
+      HloInstruction::CreateDot(shape_2x4, param_a, param_b, dot_dnums));
+  auto dot_bc = builder.AddInstruction(
+      HloInstruction::CreateDot(shape_3x4, param_b, param_c, dot_dnums));
   builder.AddInstruction(
       HloInstruction::CreateConcatenate(shape_5x4, {dot_ab, dot_bc}, 1));
 
@@ -1448,7 +1510,7 @@ class WhileBufferAssignmentTest : public HloTestBase {
     auto sequence =
         CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
-               module, MakeUnique<SequentialHloOrdering>(module, sequence),
+               module, xla::MakeUnique<SequentialHloOrdering>(module, sequence),
                ByteSizeOf,
                [alignment](LogicalBuffer::Color) { return alignment; })
         .ConsumeValueOrDie();
@@ -1469,7 +1531,7 @@ static void RunCopyInsertion(HloModule* module) {
 }
 
 TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1526,7 +1588,7 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
 }
 
 TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
@@ -1538,8 +1600,6 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
   auto output0 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
-  auto output1 = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
 
   auto cond0 =
       module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
@@ -1556,10 +1616,8 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
   auto body1 =
       module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
 
-  auto tuple1 = builder.AddInstruction(
-      HloInstruction::CreateTuple({input0, weights0, output1}));
   auto while1 = builder.AddInstruction(
-      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, tuple1));
+      HloInstruction::CreateWhile(loop_state_shape_, cond1, body1, while0));
 
   module->AddEntryComputation(builder.Build());
   RunCopyInsertion(module.get());
@@ -1575,7 +1633,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) {
 }
 
 TEST_F(BufferAssignmentTest, TwoCalls) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {});
   HloComputation* sub_computation;
   {
@@ -1640,7 +1698,7 @@ static bool IsPostOrderTraversal(
 }
 
 TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder(TestName());
 
   auto zero = builder.AddInstruction(
@@ -1676,11 +1734,14 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
   auto while1 = builder.AddInstruction(
       HloInstruction::CreateWhile(loop_state_shape_, cond, body, tuple1));
 
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while0, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape_, while1, 1));
   auto root_add = builder.AddInstruction(HloInstruction::CreateBinary(
-      while0->shape(), HloOpcode::kAdd, while0, while1));
-  module->AddEntryComputation(builder.Build());
+      while0->shape(), HloOpcode::kAdd, gte0, gte1));
 
-  RunCopyInsertion(module.get());
+  module->AddEntryComputation(builder.Build());
 
   {
     FlattenCallGraph flatten;
@@ -1688,84 +1749,35 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
     EXPECT_TRUE(result);
   }
 
+  RunCopyInsertion(module.get());
+
   auto sequence =
       CreateMemoryMinimizingSequence(*module, ByteSizeOf).ConsumeValueOrDie();
 
   // To trigger b/38494731, we want a specific Hlo sequence for the
   // root computation, so we overwrite that entry with a manually
   // crafted sequence.
-  std::vector<const HloInstruction*> sequence_for_buffer_assigment = {
-      input1,   weights1, one,     output1, tuple1, while1,  input0,
-      weights0, zero,     output0, tuple0,  while0, root_add};
+  sequence[module->entry_computation()] = {
+      input1, weights1, one,     output1, while1->operand(0), while1,
+      input0, weights0, zero,    output0, while0->operand(0), while0,
+      gte0,   gte1,     root_add};
 
   // If this ASSERT_TRUE fails, we constructed a bogus sequence above
   // and this test itself is buggy.
-  ASSERT_TRUE(IsPostOrderTraversal(sequence_for_buffer_assigment));
-
-  sequence[module->entry_computation()] =
-      std::move(sequence_for_buffer_assigment);
+  ASSERT_TRUE(IsPostOrderTraversal(sequence[module->entry_computation()]));
 
   auto assignment =
       BufferAssigner::Run(
           module.get(),
-          MakeUnique<SequentialHloOrdering>(module.get(), sequence), ByteSizeOf,
-          [](LogicalBuffer::Color) { return 1; })
+          xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
+          ByteSizeOf, [](LogicalBuffer::Color) { return 1; })
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
 }
 
-// Test buffer assignment for while nodes with multiple uses.
-// TODO(b/37245345): Fix buffer assignment for this case.
-TEST_F(WhileBufferAssignmentTest, DISABLED_TwoWhiles) {
-  auto module = MakeUnique<HloModule>(TestName());
-  auto builder = HloComputation::Builder(TestName());
-
-  auto input0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, data_shape_, "input0"));
-  auto weights0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, data_shape_, "weights0"));
-
-  auto zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0)));
-  auto output0 = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(data_shape_, zero, {1}));
-
-  auto cond0 =
-      module->AddEmbeddedComputation(BuildWhileConditionComputation("cond"));
-  auto body0 =
-      module->AddEmbeddedComputation(BuildWhileBodyComputation("body"));
-
-  auto tuple0 = builder.AddInstruction(
-      HloInstruction::CreateTuple({input0, weights0, output0}));
-  auto while0 = builder.AddInstruction(
-      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, tuple0));
-  auto while1 = builder.AddInstruction(
-      HloInstruction::CreateWhile(loop_state_shape_, cond0, body0, while0));
-
-  auto get0 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape_, while0, 2));
-  auto get1 = builder.AddInstruction(
-      HloInstruction::CreateGetTupleElement(data_shape_, while1, 2));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape_, HloOpcode::kAdd, get0, get1));
-  module->AddEntryComputation(builder.Build());
-
-  RunCopyInsertion(module.get());
-
-  {
-    FlattenCallGraph flatten;
-    TF_ASSERT_OK_AND_ASSIGN(bool result, flatten.Run(module.get()));
-    EXPECT_TRUE(result);
-  }
-
-  auto assignment = RunBufferAssignment(module.get());
-
-  EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
-}
-
 TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) {
-  auto module = MakeUnique<HloModule>(TestName());
+  auto module = xla::MakeUnique<HloModule>(TestName());
   auto builder = HloComputation::Builder("entry");
 
   auto input0 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 56600b583803e23324db778959de620440fce5cf..13825fe05bb1b98045f1a3dac3d7272a2d1151fb 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -120,7 +120,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, negate));
@@ -167,10 +167,10 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
 
   SequentialHloOrdering::HloModuleSequence sequence;
   sequence.insert({entry, {param0, negate, param1, exp, add}});
-  auto liveness = BufferLiveness::Run(
-                      module.get(),
-                      MakeUnique<SequentialHloOrdering>(module.get(), sequence))
-                      .ConsumeValueOrDie();
+  auto liveness =
+      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
+                                            module.get(), sequence))
+          .ConsumeValueOrDie();
 
   // Entry parameters interfere as if they are defined simultaneously at
   // the very beginning.
@@ -216,7 +216,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_FALSE(InstructionsMayInterfere(*liveness, param, exp));
@@ -250,7 +250,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(InstructionsMayInterfere(*liveness, param, negate));
@@ -294,7 +294,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   std::vector<const HloInstruction*> order = {param, negate, exp, add};
   module_sequence.emplace(computation, order);
   auto liveness =
-      BufferLiveness::Run(module.get(), MakeUnique<SequentialHloOrdering>(
+      BufferLiveness::Run(module.get(), xla::MakeUnique<SequentialHloOrdering>(
                                             module.get(), module_sequence))
           .ConsumeValueOrDie();
 
@@ -334,7 +334,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // All buffers should be live out except the param
@@ -370,7 +370,7 @@ TEST_F(BufferLivenessTest, EmbeddedComputation) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // Buffers in different computations should always interfere.
@@ -409,7 +409,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // Only the element buffers of the tuple constant which are pointed to by
@@ -474,7 +474,7 @@ TEST_F(BufferLivenessTest, IndependentTupleElements) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // We compare tuple element pairs that are input/output to the computation:
@@ -536,7 +536,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
 
   auto liveness =
       BufferLiveness::Run(module.get(),
-                          MakeUnique<DependencyHloOrdering>(module.get()))
+                          xla::MakeUnique<DependencyHloOrdering>(module.get()))
           .ConsumeValueOrDie();
 
   // We compare tuple element pairs that are input/output to the computation:
@@ -624,8 +624,8 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
 
     // Run BufferLiveness on 'module'.
     auto liveness =
-        BufferLiveness::Run(module.get(),
-                            MakeUnique<DependencyHloOrdering>(module.get()))
+        BufferLiveness::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
@@ -736,8 +736,8 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
     auto liveness =
-        BufferLiveness::Run(module.get(),
-                            MakeUnique<DependencyHloOrdering>(module.get()))
+        BufferLiveness::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()))
             .ConsumeValueOrDie();
     // Return whether or not buffers interference is detected between
     // 'tuple_param0' and 'tuple_root' at shape index '{1}'.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 1adecdb939cb2c1259003d3be2c90b5a299b0f30..13eb02ca012f44b2b5ed7c6f5becb7d54b07c33c 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -54,6 +54,7 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
 CallContext GetInstructionCallContext(const HloInstruction* instruction) {
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
+    case HloOpcode::kConditional:
     case HloOpcode::kWhile:
       return CallContext::kSequential;
     case HloOpcode::kMap:
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index 0395ea8c8b52315f7ca2221f412750ebadda2dd8..1ea7d538cd515c3098b6a1f03c6146d288330406 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -34,12 +34,13 @@ using ::testing::UnorderedElementsAre;
 class CallGraphTest : public HloTestBase {
  protected:
   // Build and return a trivial computation taking and returning a scalar.
-  std::unique_ptr<HloComputation> MakeScalarComputation() {
+  std::unique_ptr<HloComputation> MakeScalarComputation(
+      HloOpcode opcode = HloOpcode::kNegate) {
     HloComputation::Builder builder(TestName() + ".ScalarComputation");
     HloInstruction* param0 = builder.AddInstruction(
         HloInstruction::CreateParameter(0, kScalarShape, "param0"));
     builder.AddInstruction(
-        HloInstruction::CreateUnary(kScalarShape, HloOpcode::kNegate, param0));
+        HloInstruction::CreateUnary(kScalarShape, opcode, param0));
     return builder.Build();
   }
 
@@ -236,6 +237,54 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
+TEST_F(CallGraphTest, ComputationWithConditional) {
+  // Test a call graph of a module with a conditional.
+  auto module = CreateNewModule();
+  HloComputation* true_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kCeil));
+  HloComputation* false_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation(HloOpcode::kFloor));
+
+  HloComputation::Builder builder(TestName());
+  HloInstruction* pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloInstruction* const1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.4f)));
+  HloInstruction* const2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.6f)));
+  HloInstruction* conditional =
+      builder.AddInstruction(HloInstruction::CreateConditional(
+          kScalarShape, pred, const1, true_computation, const2,
+          false_computation));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+
+  EXPECT_EQ(3, call_graph->nodes().size());
+
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(1, entry_node.callsites().size());
+
+  const CallSite& conditional_callsite = entry_node.callsites()[0];
+  EXPECT_EQ(conditional, conditional_callsite.instruction());
+  EXPECT_THAT(conditional_callsite.called_computations(),
+              UnorderedElementsAre(true_computation, false_computation));
+  EXPECT_EQ(CallContext::kSequential, conditional_callsite.context());
+  EXPECT_EQ(entry_node.GetCallSite(conditional), &conditional_callsite);
+
+  const CallGraphNode& true_node = call_graph->GetNode(true_computation);
+  EXPECT_TRUE(true_node.callees().empty());
+  EXPECT_EQ(1, true_node.callers().size());
+  EXPECT_EQ(entry_computation, true_node.callers()[0]);
+
+  const CallGraphNode& false_node = call_graph->GetNode(false_computation);
+  EXPECT_TRUE(false_node.callees().empty());
+  EXPECT_EQ(1, false_node.callers().size());
+  EXPECT_EQ(entry_computation, false_node.callers()[0]);
+}
+
 TEST_F(CallGraphTest, ComplexGraph) {
   // Test a call graph of a module with several computation called in various
   // contexts. The call graph looks like:
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 3b1900428af1863c73efe67c27061d979557b3a4..e2e9d2a0c048fec6c6ffbeef1223ae0e6aef50d1 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -27,14 +27,8 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
-/* static */ tensorflow::mutex* Compiler::platform_compiler_mutex_;
-
-/* static */ void Compiler::LazyInitMutex() {
-  static std::once_flag mutex_init_flag;
-  std::call_once(mutex_init_flag, []() {
-    Compiler::platform_compiler_mutex_ = new tensorflow::mutex;
-  });
-}
+/* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
+    tensorflow::LINKER_INITIALIZED);
 
 /* static */ std::map<perftools::gputools::Platform::Id,
                       Compiler::CompilerFactory>*
@@ -55,8 +49,7 @@ Compiler::GetPlatformCompilers() {
 /* static */ void Compiler::RegisterCompilerFactory(
     se::Platform::Id platform_id,
     std::function<std::unique_ptr<Compiler>()> compiler_factory) {
-  LazyInitMutex();
-  tensorflow::mutex_lock lock(*platform_compiler_mutex_);
+  tensorflow::mutex_lock lock(platform_compiler_mutex_);
   auto* factories = GetPlatformCompilerFactories();
   CHECK(factories->find(platform_id) == factories->end())
       << "Compiler factory already registered for platform";
@@ -65,8 +58,7 @@ Compiler::GetPlatformCompilers() {
 
 /* static */ StatusOr<Compiler*> Compiler::GetForPlatform(
     const se::Platform* platform) {
-  LazyInitMutex();
-  tensorflow::mutex_lock lock(*platform_compiler_mutex_);
+  tensorflow::mutex_lock lock(platform_compiler_mutex_);
 
   auto* compilers = GetPlatformCompilers();
   // See if we already instantiated a compiler for this platform.
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 4c2d9600d909e82dcb62f508a10445c08c1cdee6..fc67330f5cbdbcb0d1a259d284599916a908d1fe 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -97,21 +97,32 @@ class Compiler {
   // Returns the ID of the platform that this compiler targets.
   virtual perftools::gputools::Platform::Id PlatformId() const = 0;
 
+  // Runs Hlo passes to optimize the given Hlo module, returns the optimized
+  // module.
+  virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+      std::unique_ptr<HloModule> module,
+      perftools::gputools::StreamExecutor* executor) = 0;
+
   // Compiles the HLO module for execution on a device given by the executor,
-  // and returns an executable object or an error status. Takes ownership of the
-  // HLO module and is free to transform it.
+  // and returns an executable object or an error status. No HLO passes are
+  // applied to module. Generally a module should be passed through RunHloPasses
+  // prior to calling this method because the some HLO passes are required for
+  // correctness. Takes ownership of the HLO module and is free to transform it.
   //
   // The compiler may optionally specialize to the individual device
   // (not just type of device) indicated by the executor.
   //
   // Use the overload below to compile computations that run in parallel.
-  virtual StatusOr<std::unique_ptr<Executable>> Compile(
+  virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* executor) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
   // sequence of executable objects.
+  //
+  // TODO(b/68666782): Remove this method after adding support for multiple
+  // modules to RunHloPasses and RunBackends.
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::vector<std::unique_ptr<HloModule>> modules,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>>
@@ -157,8 +168,7 @@ class Compiler {
 
  private:
   // Mutex that guards the platform-compiler map.
-  static tensorflow::mutex* platform_compiler_mutex_;
-  static void LazyInitMutex();
+  static tensorflow::mutex platform_compiler_mutex_;
 
   // Map from platform kind to compiler factory.
   static std::map<perftools::gputools::Platform::Id, CompilerFactory>*
diff --git a/tensorflow/compiler/xla/service/computation_placer.cc b/tensorflow/compiler/xla/service/computation_placer.cc
index cdfa30dd9a7b6a5b9e58087491a9d99caaa1b998..657fba6b6231104bf47f9dec80f7cd36a0ba3efd 100644
--- a/tensorflow/compiler/xla/service/computation_placer.cc
+++ b/tensorflow/compiler/xla/service/computation_placer.cc
@@ -52,6 +52,12 @@ Status DeviceAssignment::Serialize(DeviceAssignmentProto* proto) const {
 /* static */ StatusOr<std::unique_ptr<DeviceAssignment>>
 DeviceAssignment::Deserialize(const DeviceAssignmentProto& proto) {
   TF_RET_CHECK(proto.computation_devices_size() == proto.computation_count());
+  if (proto.replica_count() <= 0 || proto.computation_count() <= 0) {
+    return InvalidArgument(
+        "Invalid device assignment topology: replica_count=%d, "
+        "computation_count=%d",
+        proto.replica_count(), proto.computation_count());
+  }
   auto assignment = MakeUnique<DeviceAssignment>(proto.replica_count(),
                                                  proto.computation_count());
   for (int computation = 0; computation < proto.computation_count();
@@ -94,7 +100,7 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
     se::Platform::Id platform_id,
     ComputationPlacerCreationFunction creation_function) {
   tensorflow::mutex_lock lock(
-      *ComputationPlacer::platform_computation_placer_mutex());
+      ComputationPlacer::platform_computation_placer_mutex_);
   auto* computation_placers = GetPlatformComputationPlacers();
   CHECK(computation_placers->find(platform_id) == computation_placers->end());
   (*computation_placers)[platform_id].creation_function = creation_function;
@@ -103,7 +109,7 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
 /* static */ StatusOr<ComputationPlacer*> ComputationPlacer::GetForPlatform(
     const se::Platform* platform) {
   tensorflow::mutex_lock lock(
-      *ComputationPlacer::platform_computation_placer_mutex());
+      ComputationPlacer::platform_computation_placer_mutex_);
   auto* computation_placers = GetPlatformComputationPlacers();
 
   auto it = computation_placers->find(platform->id());
@@ -122,11 +128,9 @@ StatusOr<DeviceAssignment> ComputationPlacer::AssignDevices(
   return it->second.placer.get();
 }
 
-/* static */ tensorflow::mutex*
-ComputationPlacer::platform_computation_placer_mutex() {
-  static tensorflow::mutex* m = new tensorflow::mutex;
-  return m;
-}
+/* static */ tensorflow::mutex
+    ComputationPlacer::platform_computation_placer_mutex_(
+        tensorflow::LINKER_INITIALIZED);
 
 /* static */ std::map<perftools::gputools::Platform::Id,
                       ComputationPlacer::State>*
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index 7d9abcd100dd9e878da885110bc1bd1ac65e3f84..737ccabaa7a61931b6e2787f75b02857562d4820 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -89,11 +89,8 @@ class ComputationPlacer {
       const perftools::gputools::Platform* platform);
 
  private:
-  // Routine that returns the mutex that guards the platform-to-computation
-  // placer map. Done as a routine to ensure correct initialization ordering,
-  // since RegisterComputationPlacer can be called during program initialization
-  // time.
-  static tensorflow::mutex* platform_computation_placer_mutex();
+  // The mutex that guards the platform-to-computation placer map.
+  static tensorflow::mutex platform_computation_placer_mutex_;
 
   // State kept for each kind of ComputationPlacer. Registration functions set
   // up creation_function, and then we use that to lazily create "placer" the
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 0453a698a09b740d68b35258ede7c537fcf290d4..cd983bc03e993caed883916de01d75dffdbc4bab 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -15,15 +15,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
-#include <memory>
-
+#include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
 #include "tensorflow/compiler/xla/service/liveness_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -31,597 +33,1174 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
+using ::tensorflow::str_util::Join;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
 namespace {
 
-using tensorflow::gtl::FlatMap;
-using tensorflow::gtl::FlatSet;
+bool IsEntryParameterValue(const HloValue& value) {
+  const HloComputation* computation = value.defining_instruction()->parent();
+  return value.defining_instruction()->opcode() == HloOpcode::kParameter &&
+         computation == computation->parent()->entry_computation();
+}
+
+bool IsConstantValue(const HloValue& value) {
+  return value.defining_instruction()->opcode() == HloOpcode::kConstant;
+}
+
+bool ValueIsReadOnly(const HloValue& value) {
+  return IsConstantValue(value) || IsEntryParameterValue(value);
+}
 
-// InstructionCopier encapsulates indices at which to copy 'instruction'.
-// All 'instruction' users in 'copy_users' are updated to use the copy.
+// Deep copy the given instructions 'from' and 'to' at the ShapeIndexes given in
+// 'indices_to_copy'. Add control edges from the respective kCopy instructions
+// in deep copy of 'from' to the respective kCopy instruction in the deep copy
+// of 'to'.
 //
-// Instruction copies are generated in two phases:
-// 1) Recording buffer indices at which 'instruction' requires copies (i.e.
-//    setting 'indices_to_copy_[index]'=true).
-// 2) Inserting kCopy instructions based on indices recorded in phase 1).
-//   *) Array instructions are copied by inserting a single kCopy instruction.
-//   *) Tuple-shaped instructions are copied by recursively expanding tuples
-//      (and tuple-shaped elements), and inserting kCopy instructions for any
-//      tuple elements which require a copy. As the recursion unwinds, new tuple
-//      instructions are added to gather the copied (and uncopied) references
-//      into the output tuple (i.e. the copy of the tuple-shaped instruction).
+// Requirements: 'from' and 'to' must have compatible shapes.
 //
-//      Example two-element tuple with one element that needs a copy:
+// For example, suppose 'from' and 'to' are two-element tuples where index 0 is
+// the only index to copy. Prior to deep-copying we have:
 //
-//             original-instruction
-//                   /    \
-//                GTE(0)  GTE(1)
-//                  |       |
-//                 Copy     |
-//                   \     /
-//                    Tuple  // copied-instruction
 //
-//      As an optimization, if the original instruction is itself a Tuple
-//      instruction, we elide the unnecessary extra GTE and Tuple instructions,
-//      and just insert the copy into a new Tuple instruction, with control
-//      dependencies to ensure the copy occurs after any possible interference.
-class InstructionCopier {
- public:
-  InstructionCopier(HloInstruction* instruction,
-                    const std::vector<HloInstruction*>& copy_users)
-      : instruction_(instruction),
-        copy_users_(copy_users),
-        indices_to_copy_(instruction->shape()),
-        control_predecessors_(instruction->shape()) {}
-
-  // Sets indices that are read-only, and thus do not need to be copied.
-  void SetReadOnlyIndices(const ShapeTree<bool>& read_only_indices) {
-    read_only_indices_ = read_only_indices;
-  }
+//      'from'
+//         |
+//        ...
+//         |
+//       'to'
+//
+// DeepCopyAndAddControlEdges produces:
+//
+//       'from'
+//        /   \
+//      GTE   GTE
+//       |     |
+//     Copy    |
+//    /   \   /
+//   |    Tuple
+//   |      |
+//  ctrl   ...
+//  edge    |
+//   |      |
+//   |    'to'
+//   |    /   \
+//   |  GTE   GTE
+//    \  |     |
+//     Copy    |
+//        \   /
+//        Tuple
+//
+StatusOr<std::pair<HloInstruction*, HloInstruction*>>
+DeepCopyAndAddControlEdges(HloInstruction* from, HloInstruction* to,
+                           const ShapeTree<bool>& indices_to_copy) {
+  DCHECK(ShapeUtil::Compatible(from->shape(), to->shape()));
+  // to/from_copy_tree hold the kCopy instruction produces by the deep
+  // copies. Elements which are not copied (indices_to_copy.element(index) ==
+  // false) have nullptr at that index.
+  ShapeTree<HloInstruction*> from_copy_tree(from->shape(),
+                                            /*init_value=*/nullptr);
+  TF_ASSIGN_OR_RETURN(HloInstruction * from_deep_copy,
+                      from->parent()->DeepCopyInstruction(
+                          from, &indices_to_copy, &from_copy_tree));
 
-  // Sets copy overrides, which are copy instructions to use at each index. This
-  // is used to share a single copy of read-only entry parameters and constants
-  // between multiple While loops.
-  void SetCopyOverrides(const ShapeTree<HloInstruction*>& copy_overrides) {
-    copy_overrides_ = copy_overrides;
+  ShapeTree<HloInstruction*> to_copy_tree(to->shape(), /*init_value=*/nullptr);
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * to_deep_copy,
+      to->parent()->DeepCopyInstruction(to, &indices_to_copy, &to_copy_tree));
+
+  // Add control edges between the respective kCopy instructions.
+  for (const auto& pair : from_copy_tree) {
+    const ShapeIndex& index = pair.first;
+    HloInstruction* from_copy = pair.second;
+    HloInstruction* to_copy = to_copy_tree.element(index);
+    if (from_copy == nullptr) {
+      TF_RET_CHECK(to_copy == nullptr);
+      continue;
+    }
+    TF_RET_CHECK(to_copy != nullptr);
+    TF_RETURN_IF_ERROR(from_copy->AddControlDependencyTo(to_copy));
   }
 
-  // Returns true if all recorded indices are false (returns true otherwise).
-  bool HasAllIndicesFalse() const;
+  return std::make_pair(from_deep_copy, to_deep_copy);
+}
 
-  // Records instruction buffer indices which point-to a Parameter or Constant.
-  Status RecordIndicesWhichPointToParamOrConstant(
-      const TuplePointsToAnalysis& points_to_analysis);
+// Compute the indices of the loop state which need copies in order to avoid
+// live range interference. Generally, an element in the loop state does not
+// need to be copied if the element is passed through transparently through the
+// body.
+//
+// Returns whether any indices need to be copied.
+bool IndicesToCopyForWhile(const HloDataflowAnalysis& dataflow,
+                           const HloInstruction* xla_while,
+                           ShapeTree<bool>* indices_to_copy) {
+  DCHECK(ShapeUtil::Compatible(indices_to_copy->shape(), xla_while->shape()));
 
-  // Records instruction buffer indices to copy which are necessary to ensure:
-  // *) PointsToSet of 'instruction_' is unambiguous and distinct.
-  // *) No liveness interference between 'instruction_' and 'other_instruction'.
-  //
-  // If 'read_only_indices_out' is non-null, read-only indices are set to true.
-  Status RecordIndicesToCopyForColocatingBuffers(
-      const BufferLiveness& liveness, const HloInstruction* other_instruction,
-      ShapeTree<bool>* read_only_indices_out);
+  bool any_copies = false;
+  const HloInstruction* init = xla_while->operand(0);
+  for (auto& pair : *indices_to_copy) {
+    const ShapeIndex& index = pair.first;
+    bool& should_copy = pair.second;
+    // If there is any ambiguity, then loop state must be copied.
+    if (dataflow.GetValueSet(init, index).values().size() > 1 ||
+        dataflow.GetValueSet(xla_while, index).values().size() > 1) {
+      should_copy = true;
+    } else {
+      // If the output of the while instruction is not the same as the init
+      // value of the while, then this element is not passed through the body
+      // transparently and must be copied.
+      should_copy = dataflow.GetUniqueValueAt(xla_while, index) !=
+                    dataflow.GetUniqueValueAt(init, index);
+    }
+    any_copies |= should_copy;
+  }
+  return any_copies;
+}
 
-  // Records control predecessors to add for inserted copy instructions.
-  // 'parameter' must have the same shape as the instruction that will be
-  // copied, and must define all buffers in the shape. Control predecessors are
-  // only recorded for indices that have already been marked for copying.
-  Status RecordControlPredecessors(
-      const TuplePointsToAnalysis& points_to_analysis,
-      HloInstruction* parameter);
+// Add kCopy instructions around the given kWhile instruction to eliminate any
+// possible live range interference of HLO values assuming a dependency-based
+// ordering (HloDependencyOrdering). Copies are added conservatively. There
+// likely are copies which are not strictly necessary, but there are removed
+// later in the pass via CopyRemover.
+//
+//
+// Elements (each ShapeIndex) in the loop state are considered independently.  A
+// copy is added to each element of the loop state which is modified in the
+// while body. For each such element, a total of three kCopy instructions are
+// added at following locations:
+//
+//   (1) The init value is copied before the kWhile instruction. Before:
+//
+//           (Init)
+//             |
+//           kWhile
+//             |
+//            ...
+//
+//       After:
+//
+//           (Init)
+//             |
+//           kCopy
+//             |
+//           kWhile
+//             |
+//            ...
+//
+//       This copy is necessary in case the init value is simultaneously live
+//       with the kWhile.
+//
+//   (2) Copies are added to the parameter and root of the while body
+//       computation. Before:
+//
+//           kParameter
+//               |
+//              ...
+//               |
+//           (body root)
+//
+//       After:
+//
+//           kParameter
+//               |
+//             kCopy ----------+
+//               |             |
+//              ...           ctrl
+//               |            edge
+//           (body root)       |
+//               |             |
+//             kCopy <---------+
+//
+//       The root kCopy becomes the new root of the computation. Both copies are
+//       necessary to any potential interference between the parameter value and
+//       the root value. The control edge prevents potential interference
+//       between the copies themselves.
+//
+// If the loop state is a tuple then the above kCopy instructions are a deep
+// copy constructed of kCopy, KGetTupleElement, and kTuple instruction as
+// constructed by HloInstruction::DeepCopyInstruction.
+Status AddCopiesForWhile(const HloAliasAnalysis& alias_analysis,
+                         HloInstruction* xla_while) {
+  VLOG(2) << "Adding copies for kWhile instruction " << xla_while->name();
+  TF_RET_CHECK(xla_while->opcode() == HloOpcode::kWhile);
 
-  // Inserts copies of 'instruction' buffers at indices in 'indices_to_copy',
-  // and replaces all uses for instructions in 'copy_users_' with copy.
-  // Returns the instruction which is a copy 'instruction'.
-  HloInstruction* Copy();
+  ShapeTree<bool> indices_to_copy(xla_while->shape());
+  if (!IndicesToCopyForWhile(alias_analysis.dataflow_analysis(), xla_while,
+                             &indices_to_copy)) {
+    VLOG(2) << "No copies necessary for kWhile instruction "
+            << xla_while->name();
+    return Status::OK();
+  }
 
-  HloInstruction* instruction() { return instruction_; }
+  VLOG(2) << "Adding copies for " << xla_while->name() << " at indices:";
+  for (auto& pair : indices_to_copy) {
+    if (pair.second) {
+      VLOG(2) << "  " << pair.first;
+    }
+  }
 
-  const std::vector<HloInstruction*>& copy_users() const { return copy_users_; }
+  // Deep copy init.
+  HloInstruction* while_init = xla_while->mutable_operand(0);
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * while_init_copy,
+      xla_while->parent()->DeepCopyInstruction(while_init, &indices_to_copy));
+  TF_RETURN_IF_ERROR(while_init->ReplaceUseWith(xla_while, while_init_copy));
 
- private:
-  // Does the given index represent a read-only buffer?
-  bool IsReadOnlyIndex(const ShapeIndex& index) const {
-    return !ShapeUtil::IsNil(read_only_indices_.shape()) &&
-           read_only_indices_.element(index);
-  }
+  // Deep copy the parameter and the root. Extend a control edge from the copy
+  // of the parameter value to the corresponding copy value of the root.
+  HloComputation* body = xla_while->while_body();
+  HloInstruction* param = body->parameter_instruction(0);
+  HloInstruction* root = body->root_instruction();
 
-  // Returns the copy override at the given index, or nullptr.
-  HloInstruction* GetCopyOverride(const ShapeIndex& index) const {
-    return ShapeUtil::IsNil(copy_overrides_.shape())
-               ? nullptr
-               : copy_overrides_.element(index);
-  }
+  // If param is the root then all indices should have been passed through the
+  // while body and we should have returned early above.
+  TF_RET_CHECK(param != root);
 
-  // Records instruction buffer indices which have ambiguous or non-distinct
-  // points-to sets.
-  Status RecordAmbiguousOrNonDistinctIndices(
-      const TuplePointsToAnalysis& points_to_analysis);
+  // Copy users before making a deep copy of the parameter as the deep copy
+  // will create new users of the parameter (eg, the GTE instructions of the
+  // deep copy).
+  std::vector<HloInstruction*> param_users = param->users();
 
-  // Records instruction buffer indices which have interfering live ranges
-  // with 'other_instruction' buffers at same index.
-  Status RecordIndicesWhichInterfereWithOtherInstruction(
-      const BufferLiveness& liveness, const HloInstruction* other_instruction,
-      ShapeTree<bool>* read_only_indices_out);
+  ShapeIndex current_index;
+  TF_ASSIGN_OR_RETURN(auto pair,
+                      DeepCopyAndAddControlEdges(param, root, indices_to_copy));
 
-  // Recursively inserts copies of 'instruction' tuple elements at indices
-  // specified in 'indices_to_copy', and returns the copy of 'instruction'.
-  HloInstruction* CopyTuple(HloInstruction* instruction, ShapeIndex* index);
+  HloInstruction* param_copy = pair.first;
+  HloInstruction* root_copy = pair.second;
 
-  void RecordIndex(const ShapeIndex& index) {
-    *indices_to_copy_.mutable_element(index) = true;
+  for (HloInstruction* user : param_users) {
+    TF_RETURN_IF_ERROR(param->ReplaceUseWith(user, param_copy));
   }
 
-  HloInstruction* instruction_;
-  const std::vector<HloInstruction*> copy_users_;
-  ShapeTree<bool> indices_to_copy_;
-  ShapeTree<std::vector<HloInstruction*>> control_predecessors_;
-  ShapeTree<bool> read_only_indices_;
-  ShapeTree<HloInstruction*> copy_overrides_;
-};
+  body->set_root_instruction(root_copy);
 
-bool InstructionCopier::HasAllIndicesFalse() const {
-  bool all_indices_false = true;
-  indices_to_copy_.ForEachElement(
-      [&all_indices_false](const ShapeIndex& /*index*/, bool data) {
-        if (data) {
-          all_indices_false = false;
-        }
-      });
-  return all_indices_false;
+  return Status::OK();
 }
 
-Status InstructionCopier::RecordIndicesWhichPointToParamOrConstant(
-    const TuplePointsToAnalysis& points_to_analysis) {
-  const PointsToSet& points_to =
-      points_to_analysis.GetPointsToSet(instruction_);
-  // Shallow copy the instruction if the points-to set of the top-level
-  // buffer is ambiguous. This is necessary because the backends must know
-  // statically what the top-level buffer of the result is.
-  if (points_to.element(/*index=*/{}).size() > 1) {
-    RecordIndex({});
+// Removes any control dependencies to or from the given instruction.
+Status StripControlDependenciesFrom(HloInstruction* instruction) {
+  while (!instruction->control_successors().empty()) {
+    TF_RETURN_IF_ERROR(instruction->RemoveControlDependencyTo(
+        instruction->control_successors().front()));
+  }
+
+  while (!instruction->control_predecessors().empty()) {
+    TF_RETURN_IF_ERROR(
+        instruction->control_predecessors().front()->RemoveControlDependencyTo(
+            instruction));
   }
 
-  // Multiple buffers within a parameter/constant may be live out, so collect
-  // a set of indices at which to copy first.
-  points_to.ForEachElement([this](const ShapeIndex& index,
-                                  const PointsToSet::BufferList& buffers) {
-    if (IsReadOnlyIndex(index)) {
-      return;
-    }
-    for (const LogicalBuffer* buffer : buffers) {
-      // pointee is the HloInstruction producing the buffer which may be
-      // liveout.
-      HloInstruction* pointee = buffer->instruction();
-      if (pointee->opcode() == HloOpcode::kParameter ||
-          pointee->opcode() == HloOpcode::kConstant) {
-        VLOG(2) << "Parameter or constant buffer " << buffer->ToString()
-                << " index: " << tensorflow::str_util::Join(index, ",")
-                << " may be live out of computation: " << pointee->ToString();
-        RecordIndex(index);
-        break;
-      }
-    }
-  });
   return Status::OK();
 }
 
-Status InstructionCopier::RecordIndicesToCopyForColocatingBuffers(
-    const BufferLiveness& liveness, const HloInstruction* other_instruction,
-    ShapeTree<bool>* read_only_indices_out) {
-  TF_RETURN_IF_ERROR(
-      RecordAmbiguousOrNonDistinctIndices(liveness.points_to_analysis()));
-  TF_RETURN_IF_ERROR(RecordIndicesWhichInterfereWithOtherInstruction(
-      liveness, other_instruction, read_only_indices_out));
+// Add kCopy instructions to the given module to guarantee there is no
+// live-range interference. Generally interference can only occur around kWhile
+// instructions which have update-in-place semantics.
+Status AddCopiesToResolveInterference(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
+      }
+    }
+  }
   return Status::OK();
 }
 
-Status InstructionCopier::RecordAmbiguousOrNonDistinctIndices(
-    const TuplePointsToAnalysis& points_to_analysis) {
-  const PointsToSet& points_to =
-      points_to_analysis.GetPointsToSet(instruction_);
-  // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
-  FlatMap<const LogicalBuffer*, std::vector<ShapeIndex>>
-      buffer_to_source_indices;
-  points_to.ForEachElement(
-      [this, &buffer_to_source_indices](
-          const ShapeIndex& index, const PointsToSet::BufferList& buffers) {
-        if (buffers.size() > 1) {
-          // Record ambiguous points-to set at 'index'.
-          if (!indices_to_copy_.element(index)) {
-            VLOG(2) << "Adding copy of buffer for instruction: "
-                    << instruction_->name()
-                    << " at index: " << tensorflow::str_util::Join(index, ",")
-                    << " with ambiguous points-to set.";
-            RecordIndex(index);
+// Class for removing unnecessary copies from the module.
+//
+// kCopy instructions are added conservatively to guarantee no live range
+// interference between HLO values. This class uses a more fine-grained analysis
+// to remove some of these added copies which are not strictly necessary.
+class CopyRemover {
+ public:
+  CopyRemover(const HloAliasAnalysis& alias_analysis,
+              const HloOrdering& ordering, HloModule* module)
+      : module_(module),
+        alias_analysis_(alias_analysis),
+        ordering_(ordering),
+        buffer_value_tracker_(*module, alias_analysis, ordering) {}
+
+  // Try to elide the given copy. The copy is elided if the instruction is not
+  // necessary to prevent live-range interference of HLO values. Returns true if
+  // copy was elided.
+  //
+  // The copy instruction is not actually removed here. Instead it is left for
+  // dead in the graph. Later calls to DCE will remove the instruction.
+  StatusOr<bool> TryElideCopy(HloInstruction* copy) {
+    if (buffer_value_tracker_.TryElideCopy(copy)) {
+      TF_RETURN_IF_ERROR(StripControlDependenciesFrom(copy));
+      TF_RETURN_IF_ERROR(copy->ReplaceAllUsesWith(copy->mutable_operand(0)));
+      return true;
+    }
+    return false;
+  }
+
+  string ToString() const {
+    string out = StrCat("CopyRemover, module ", module_->name(), "\n");
+    StrAppend(&out, "  Buffer values, in dependency order:\n");
+    for (const HloBuffer& buffer : alias_analysis_.buffers()) {
+      StrAppend(&out, "    HloBuffer ", buffer.id(), ":\n");
+    }
+    return out;
+  }
+
+ private:
+  // Class which tracks the HLO values within each HLO buffer in the module
+  // during copy removal.
+  //
+  // The values are held in a linked list where there is one list for each
+  // buffer. Removing a copy instruction merges together the values in the
+  // source buffer of the copy to the destination buffer of the copy. This class
+  // tracks these value lists as copies are removed from the graph (and value
+  // lists are merged).
+  //
+  // The BufferValueTracker object is initialized to match the state of
+  // HloAliasAnalysis. However, as copies are removed this state diverges. The
+  // values-to-buffer mapping is maintained outside of HloAliasAnalysis because
+  // a fully updatable alias analysis is very slow.
+  class BufferValueTracker {
+   public:
+    // The values held in a single HLO buffer are represented using a linked
+    // list. An element type in this list is ValueNode.
+    //
+    // This linked list is hand-rolled to enable efficient splicing of lists
+    // using only references to list elements without knowing which lists are
+    // being spliced. std::list requires a reference to the list object to
+    // splice.
+    struct ValueNode {
+      explicit ValueNode(const HloValue* v) : value(v) {}
+
+      const HloValue* value;
+
+      // The uses are maintained outside of HloValue::uses() because
+      // HloValue::uses() is not updatable (a fully updatable dataflow analysis
+      // is slow).
+      std::vector<const HloUse*> uses;
+
+      // next/prev elements in the linked list. The list is circularly linked so
+      // these values are never null for elements in the list.
+      ValueNode* prev = nullptr;
+      ValueNode* next = nullptr;
+    };
+
+    BufferValueTracker(const HloModule& module,
+                       const HloAliasAnalysis& alias_analysis,
+                       const HloOrdering& ordering)
+        : dataflow_(alias_analysis.dataflow_analysis()), ordering_(ordering) {
+      // Construct a list for each HLO buffer in the alias analysis. Maintain a
+      // map from HloValue to the respective list element representing that
+      // value. The map is used to construct the copy info map below.
+      tensorflow::gtl::FlatMap<const HloValue*, ValueNode*> value_to_node;
+      for (const HloBuffer& buffer : alias_analysis.buffers()) {
+        // Verify values contained in the buffer are strictly ordered. This
+        // should always be the case after adding copies to eliminate
+        // interference. Specifically, the addition of the control flow edges
+        // between copies added around aliased operations (kWhile) guarantees
+        // this strict order.
+        for (const HloValue* value_a : buffer.values()) {
+          for (const HloValue* value_b : buffer.values()) {
+            if (value_a != value_b) {
+              DCHECK(ordering_.LiveRangeStrictlyBefore(*value_a, *value_b,
+                                                       dataflow_) ||
+                     ordering_.LiveRangeStrictlyBefore(*value_b, *value_a,
+                                                       dataflow_))
+                  << value_a->ToShortString() << " and "
+                  << value_b->ToShortString() << " are not ordered";
+            }
           }
         }
-        // For each 'buffer': record a mapping from 'buffer' to 'index'.
-        for (const LogicalBuffer* buffer : buffers) {
-          buffer_to_source_indices[buffer].push_back(index);
-        }
-      });
 
-  // Record all non-distinct indices detected in 'buffer_to_source_indices'.
-  for (const auto& buff_to_src : buffer_to_source_indices) {
-    if (buff_to_src.second.size() == 1) {
-      continue;
+        std::vector<const HloValue*> values = buffer.values();
+        std::sort(values.begin(), values.end(),
+                  [this](const HloValue* a, const HloValue* b) {
+                    return ordering_.IsDefinedBefore(*a, *b);
+                  });
+
+        // Create a list containing all of the values in the buffer.
+        AddValueList(values, &value_to_node);
+      }
+
+      // Create copy_map_ which contains the source and destination values
+      // of all copies.
+      CreateCopyMap(module, value_to_node);
+
+      XLA_VLOG_LINES(3, ToString());
+      TF_DCHECK_OK(Verify());
     }
-    for (const ShapeIndex& src_index : buff_to_src.second) {
-      // Record non-distinct points-to set at 'src_index'.
-      if (!indices_to_copy_.element(src_index)) {
-        VLOG(2) << "Adding copy of buffer for instruction: "
-                << instruction_->name()
-                << " at index: " << tensorflow::str_util::Join(src_index, ",")
-                << " because of non-distinct points-to set.";
-        RecordIndex(src_index);
+
+    // Add a list containing the given values to BufferValueTracker. This
+    // represents the values contained in a single buffer. For each value in
+    // 'values' an entry is created in value_to_node which indicates the
+    // respective ValueNode representing that value.
+    void AddValueList(
+        tensorflow::gtl::ArraySlice<const HloValue*> values,
+        tensorflow::gtl::FlatMap<const HloValue*, ValueNode*>* value_to_node) {
+      ValueNode* tail = nullptr;
+      ValueNode* head = nullptr;
+      for (const HloValue* value : values) {
+        auto new_node = new ValueNode(value);
+        (*value_to_node)[value] = new_node;
+
+        // Copy the HLO values's uses into the ValueNode for the value. These
+        // uses in ValueNode are updated as copies are removed.
+        new_node->uses.reserve(value->uses().size());
+        for (const HloUse& use : value->uses()) {
+          new_node->uses.push_back(&use);
+        }
+
+        // Connect the new node into the linked list.
+        if (tail == nullptr) {
+          head = new_node;
+        } else {
+          tail->next = new_node;
+          new_node->prev = tail;
+        }
+        tail = new_node;
       }
+
+      // The linked list is circular so connect the head and tail.
+      tail->next = head;
+      head->prev = tail;
+      value_lists_.insert(head);
     }
-  }
-  return Status::OK();
-}
 
-Status InstructionCopier::RecordIndicesWhichInterfereWithOtherInstruction(
-    const BufferLiveness& liveness, const HloInstruction* other_instruction,
-    ShapeTree<bool>* read_only_indices_out) {
-  // Record all buffer indices for 'instruction_', which interfere with
-  // 'other_instruction' at the same index.
-  ShapeUtil::ForEachSubshape(
-      instruction_->shape(),
-      [this, &liveness, other_instruction, read_only_indices_out](
-          const Shape& /*subshape*/, const ShapeIndex& index) {
-        if (IsReadOnlyIndex(index)) {
-          return;
+    // This method also fills in copy_map_ which indicates which nodes
+    // in the value lists corresponding to the source and destination values of
+    // kCopy instructions. value_to_node should map each HloValue to its
+    // respective ValueNode.
+    void CreateCopyMap(
+        const HloModule& module,
+        const tensorflow::gtl::FlatMap<const HloValue*, ValueNode*>&
+            value_to_node) {
+      for (HloComputation* computation : module.computations()) {
+        for (HloInstruction* instruction : computation->instructions()) {
+          // Add copies with unambiguous source values to the map. Copies with
+          // ambiguous sources are not removable.
+          if (instruction->opcode() == HloOpcode::kCopy) {
+            const HloValueSet& src_value_set =
+                dataflow_.GetValueSet(instruction->operand(0));
+            if (src_value_set.values().size() == 1) {
+              CopyNodes& copy_node = copy_map_[instruction];
+              copy_node.dest =
+                  value_to_node.at(&dataflow_.GetUniqueValueAt(instruction));
+              copy_node.src = value_to_node.at(&src_value_set.GetUniqueValue());
+            }
+          }
         }
-        if (indices_to_copy_.element(index)) {
-          // Return if previous pass already set index.
-          return;
+      }
+    }
+
+    ~BufferValueTracker() {
+      for (const ValueNode* head : value_lists_) {
+        const ValueNode* p = head;
+        do {
+          const ValueNode* tmp = p->next;
+          delete p;
+          p = tmp;
+        } while (p != head);
+      }
+    }
+
+    // Verify invariants within the linked lists.
+    Status Verify() const {
+      for (const ValueNode* head : value_lists_) {
+        const ValueNode* p = head;
+        do {
+          // Verify links between elements are consistent.
+          TF_RET_CHECK(p->prev->next == p);
+          TF_RET_CHECK(p->next->prev == p);
+
+          const HloInstruction* def = p->value->defining_instruction();
+          if (def->opcode() == HloOpcode::kCopy &&
+              ContainsKey(copy_map_, def)) {
+            TF_RET_CHECK(copy_map_.at(def).dest == p);
+          }
+          for (const HloUse* use : p->uses) {
+            if (use->instruction->opcode() == HloOpcode::kCopy &&
+                ContainsKey(copy_map_, use->instruction)) {
+              TF_RET_CHECK(copy_map_.at(use->instruction).src == p);
+            }
+          }
+
+          p = p->next;
+        } while (p != head);
+      }
+      return Status::OK();
+    }
+
+    // Try to elide the given copy. Elision of a copy is possible only if no
+    // live range interference is introduced by the copy's elimination. If
+    // elision is possible, then the internal state (value lists) are updated,
+    // and true is returned. Returns false otherwise.
+    bool TryElideCopy(const HloInstruction* copy) {
+      VLOG(2) << "Trying to remove " << copy->name();
+
+      if (!ContainsKey(copy_map_, copy)) {
+        VLOG(2) << copy->name() << " is not removable";
+        return false;
+      }
+
+      const CopyNodes& copy_node = copy_map_.at(copy);
+      ValueNode* src = copy_node.src;
+      ValueNode* dest = copy_node.dest;
+      DCHECK(src != nullptr);
+      DCHECK(dest != nullptr);
+
+      auto is_live_range_before = [this](const ValueNode& a,
+                                         const ValueNode& b) {
+        if (LiveRangeBefore(a, b)) {
+          VLOG(2) << "  Live range of " << a.value->ToShortString()
+                  << " is before " << b.value->ToShortString();
+          return true;
+        } else {
+          VLOG(2) << "  Live range of " << a.value->ToShortString()
+                  << " is not before " << b.value->ToShortString();
+          return false;
         }
-        const auto& points_to_analysis = liveness.points_to_analysis();
-        // Lookup buffers for 'instruction_' and 'other_instruction'.
-        const auto instruction_buffers =
-            points_to_analysis.GetPointsToSet(instruction_).element(index);
-        // If 'instruction_' has ambiguous points-to-set  at 'index', it would
-        // have been recorded in a previous pass (and we would have returned
-        // early at the entry to this function). As a result, here we know that
-        // 'instruction_' has just one buffer in its points-to-set.
-        CHECK_EQ(1, instruction_buffers.size());
-        const LogicalBuffer* instruction_buffer = instruction_buffers[0];
-
-        const auto other_instruction_buffers =
-            points_to_analysis.GetPointsToSet(other_instruction).element(index);
-        // Do not insert a copy if both instructions point at the same buffer.
-        // This eliminates unnecessary copies of read-only tuple elements.
-        // If 'instruction_' and 'other_instruction' point to the same buffer,
-        // then that buffer is not updated on the path between the two
-        // instructions. Therefore, any other (possibly interference-causing)
-        // users of that buffer from 'other_instruction' will see the same data,
-        // irrespective of whether we insert a copy of this buffer at
-        // 'instruction_' or not.
-        if (other_instruction_buffers.size() == 1 &&
-            other_instruction_buffers[0]->id() == instruction_buffer->id()) {
-          if (read_only_indices_out != nullptr) {
-            *read_only_indices_out->mutable_element(index) = true;
+      };
+
+      VLOG(3) << copy->name() << " copies value "
+              << src->value->ToShortString();
+      VLOG(3) << "Source buffer values: " << ValueListToString(src);
+      VLOG(3) << "Dest buffer values: " << ValueListToString(src);
+
+      // A kCopy instruction copies an HLO value from a source buffer and
+      // defines an HLO value in a destination buffer. Most generally, the
+      // source and destination buffers may each hold more than one value at
+      // different points in the computation so we define the following:
+      //
+      //   Values in source buffer:      {s_0, ..., s_n}
+      //   Values in destination buffer: {d_0, ..., d_m}
+      //
+      // A kCopy instruction between these buffers copies a value s_x in the
+      // source buffer and defines a value d_y in the destination buffer. The
+      // elision of a copy merges the source and destination buffers together,
+      // so the list of values for the source and destination buffers are
+      // merged.
+      //
+      // We handle two different cases for copy elision:
+      //
+      //  (1) the kCopy defines the first value in the destination buffer (d_0).
+      //
+      //  (2) the kCopy copies the last value in the source buffer (s_n).
+      //
+      // For the remaining case where the kCopy copies a not-last value from the
+      // source buffer to a not-first value of the destination buffer, the kCopy
+      // instruction cannot be removed. This case is generated, for example, if
+      // the kCopy copies a while body parameter of the loop state at one tuple
+      // index to a different tuple index in the while body root. Removal of the
+      // copy necessarily results in live range interference of values in the
+      // loop state at the two different tuple indices.
+      //
+      //  We can only perform copy elision if the resulting merged values have
+      //  totally ordered live ranges; otherwise the merged buffer would have
+      //  live range interference.
+      if (IsHead(*dest)) {
+        // The copy copies an arbitrary value in the source buffer (call it s_x)
+        // and defines d_0, the first value in the destination buffer. After
+        // merging, the values in the combined buffer must be strictly ordered
+        // as follows** to elide the copy:
+        //
+        // {s_0, ..., s_x, d_1, ..., d_m, s_{x+1}, ..., s_n}
+        //
+        // Removing the copy eliminates d_0, and uses of d_0 become uses of
+        // s_x. In the above ordering, the live range of d_m must be ordered
+        // before the live range of s_{x+1} and the definition and all uses of
+        // s_x must be ordered before the definition of d_1. These conditions
+        // are checked below prior to elision.
+        //
+        // ** Technically it might be possible to have a non-interfering
+        //    non-trivial interleaving of the values of the source and
+        //    destination buffers in the resulting order. However, this case is
+        //    slow and complicated to check and likely not worth it. So instead
+        //    we simply check for the case where *all* values of the destination
+        //    buffer (d_1 through d_m) are spliced into the point where the copy
+        //    used to be.
+        VLOG(2) << copy->name() << " defines the first value in its buffer";
+        ValueNode* next_dest = Next(*dest);
+        if (next_dest != nullptr) {
+          // Live range of 'from' value (s_x) must be before 'next_dest' (d_1);
+          if (!is_live_range_before(*src, *next_dest)) {
+            return false;
           }
-          return;
         }
-        // We can't say anything about the ambiguity of 'other_instruction' at
-        // this point, so we need to check interference between the single
-        // buffer in the points-to set of 'instruction_' and all buffers in
-        // 'other_instruction_buffers'.
-        for (const LogicalBuffer* other_buffer : other_instruction_buffers) {
-          if (liveness.MayInterfere(*instruction_buffer, *other_buffer)) {
-            VLOG(2) << "Adding copy of buffer for instruction: "
-                    << instruction_->name()
-                    << " instruction_buffer: " << instruction_buffer->ToString()
-                    << " at index: " << tensorflow::str_util::Join(index, ",")
-                    << " because of interference with buffer: "
-                    << other_buffer->ToString();
-            RecordIndex(index);
-            break;
+        ValueNode* next_src = Next(*src);
+
+        if (next_src != nullptr) {
+          // Live range of 'last_dest' (d_m) must be before 'next_src' s_{x+1}.
+          ValueNode* last_dest = dest->prev;
+          DCHECK(IsTail(*last_dest));
+          if (!is_live_range_before(*last_dest, *next_src)) {
+            return false;
           }
         }
-      });
-  return Status::OK();
-}
 
-// This is called when 'instruction_' is a while body root, and 'parameter' is
-// the while body parameter. We record all users of all aliases of 'parameter'
-// as control predecessors, so that when we add a copy of 'instruction_', we can
-// mark the control dependencies. This is necessary because points-to and
-// liveness analysis doesn't know about the aliasing between the while body root
-// and param. Without these control dependencies, the copy might get scheduled
-// to run at a point that interferes with users of the buffer.
-Status InstructionCopier::RecordControlPredecessors(
-    const TuplePointsToAnalysis& points_to_analysis,
-    HloInstruction* parameter) {
-  return indices_to_copy_.ForEachElementWithStatus(
-      [this, &points_to_analysis, parameter](const ShapeIndex& index,
-                                             bool will_copy) {
-        if (will_copy) {
-          TF_ASSIGN_OR_RETURN(
-              const LogicalBuffer* buffer,
-              points_to_analysis.GetBufferDefinedAt(parameter, index));
-          for (const BufferAlias& alias :
-               points_to_analysis.GetBufferAliases(*buffer)) {
-            for (HloInstruction* user : alias.instruction()->users()) {
-              if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
-                                          user, points_to_analysis)) {
-                continue;
-              }
-
-              if (user != instruction_) {
-                control_predecessors_.mutable_element(index)->push_back(user);
-              }
-            }
+        // Splice in destination buffer values list right after 'src'.
+        SpliceAfter(dest, src);
+      } else if (IsTail(*src)) {
+        // The copy copies the last value in the source buffer, s_n, and defines
+        // an arbitrary value in the destination buffer, d_y.  After
+        // merging, the values in the combined buffer must be strictly ordered
+        // as follows** to elide the copy:
+        //
+        // {d_0, ..., d_{y-1}, s_0, ..., s_n, d_{y+1}, ..., d_m}
+        //
+        // Removing the copy eliminates d_y, and uses of d_y become uses of
+        // s_n. To enforce the above order, the live range of d_{y-1} must be
+        // before the live range of s_0, and the live range of s_n must be
+        // before the live range of d_{y+1}.
+        //
+        // ** See comment above in the code handling Case (1).
+        VLOG(2) << copy->name() << " copies the last value ("
+                << src->value->ToShortString() << ") in its buffer";
+
+        ValueNode* prev_dest = Prev(*dest);
+        // nullptr condition handled above in the first 'if' case.
+        DCHECK(prev_dest != nullptr);
+        ValueNode* first_src = src->next;
+        DCHECK(IsHead(*first_src));
+        if (!is_live_range_before(*prev_dest, *first_src)) {
+          // Live range of value d_{y-1} is not before s_0.
+          return false;
+        }
+        ValueNode* next_dest = Next(*dest);
+        if (next_dest != nullptr) {
+          if (!is_live_range_before(*src, *next_dest)) {
+            // Live range of value s_n is not before d_{y+1}.
+            return false;
           }
         }
-        return Status::OK();
-      });
-}
 
-// Recursively inserts copies of 'instruction' tuple element buffers at
-// indices in 'indices_to_copy_', expanding tuples as needed.
-HloInstruction* InstructionCopier::CopyTuple(HloInstruction* instruction,
-                                             ShapeIndex* index) {
-  const int64 num_tuple_elements =
-      ShapeUtil::TupleElementCount(instruction->shape());
-  std::vector<HloInstruction*> elem_copies(num_tuple_elements);
-  for (int64 i = 0; i < num_tuple_elements; ++i) {
-    HloInstruction* elem;
-    if (instruction->opcode() == HloOpcode::kTuple) {
-      // If the instruction is already a Tuple instruction, we know that the
-      // element buffers are aliased, so we can just grab the operand directly.
-      elem = instruction->mutable_operand(i);
-    } else {
-      // Otherwise we need to add a GTE to unpack the element out of the tuple.
-      elem = instruction->parent()->AddInstruction(
-          HloInstruction::CreateGetTupleElement(
-              ShapeUtil::GetSubshape(instruction->shape(), {i}), instruction,
-              i));
-    }
-    index->push_back(i);
-    if (ShapeUtil::IsTuple(elem->shape())) {
-      elem_copies[i] = CopyTuple(elem, index);
-    } else if (!indices_to_copy_.element(*index)) {
-      elem_copies[i] = elem;
-    } else if (HloInstruction* copy_override = GetCopyOverride(*index)) {
-      elem_copies[i] = copy_override;
-    } else {
-      HloInstruction* elem_copy = elem->parent()->AddInstruction(
-          HloInstruction::CreateUnary(elem->shape(), HloOpcode::kCopy, elem));
-      for (HloInstruction* control_predecessor :
-           control_predecessors_.element(*index)) {
-        VLOG(2) << "Adding control dependency from "
-                << control_predecessor->ToString() << " to "
-                << elem_copy->ToString();
-        TF_CHECK_OK(control_predecessor->AddControlDependencyTo(elem_copy));
+        // Splice source buffer values list right after 'prev_dest'.
+        SpliceAfter(first_src, prev_dest);
+      } else {
+        VLOG(2)
+            << copy->name()
+            << " copies value in middle of source buffer to value in middle "
+               "of destination buffer";
+        return false;
       }
-      elem_copies[i] = elem_copy;
+
+      RemoveCopyValue(dest);
+
+      XLA_VLOG_LINES(4, ToString());
+      TF_DCHECK_OK(Verify());
+
+      return true;
     }
-    index->pop_back();
-  }
-  return instruction->parent()->AddInstruction(
-      HloInstruction::CreateTuple(elem_copies));
-}
 
-// Inserts copies of 'instruction_' buffers at indices in 'indices_to_copy_'.
-HloInstruction* InstructionCopier::Copy() {
-  ShapeIndex index;
-  HloInstruction* copy;
-  if (ShapeUtil::IsTuple(instruction_->shape())) {
-    copy = CopyTuple(instruction_, &index);
-  } else {
-    copy = instruction_->parent()->AddInstruction(HloInstruction::CreateUnary(
-        instruction_->shape(), HloOpcode::kCopy, instruction_));
-  }
-  for (HloInstruction* user : copy_users_) {
-    VLOG(2) << "Adding copy between instruction: " << instruction_->name()
-            << " and user: " << user->name();
-    TF_CHECK_OK(instruction_->ReplaceUseWith(user, copy));
+    // Delete the given ValueNode associated with a elided kCopy
+    // instruction. This should be called after splicing the value lists of the
+    // source and destination buffers together.
+    void RemoveCopyValue(ValueNode* copy_value_node) {
+      CHECK_EQ(copy_value_node->value->defining_instruction()->opcode(),
+               HloOpcode::kCopy);
+      ValueNode* operand_node = copy_value_node->prev;
+      CHECK(operand_node != copy_value_node);
+
+      VLOG(2) << "Removing copy " << operand_node->value->ToShortString()
+              << " => " << copy_value_node->value->ToShortString();
+
+      // Splice out the copy value node.
+      operand_node->next = copy_value_node->next;
+      copy_value_node->next->prev = operand_node;
+
+      // Patch up uses. Remove use of copy from operand_node uses.
+      auto it =
+          std::find_if(operand_node->uses.begin(), operand_node->uses.end(),
+                       [copy_value_node](const HloUse* use) {
+                         return use->instruction ==
+                                copy_value_node->value->defining_instruction();
+                       });
+      CHECK(it != operand_node->uses.end());
+      operand_node->uses.erase(it);
+
+      // If the elided copy has any uses which are themselves kCopy instructions
+      // then patch up the copy info to reflect the that this kCopy instruction
+      // has a different operand (the operand of the elided copy).
+      for (const HloUse* copy_use : copy_value_node->uses) {
+        operand_node->uses.push_back(copy_use);
+        if (copy_use->instruction->opcode() == HloOpcode::kCopy) {
+          copy_map_.at(copy_use->instruction).src = operand_node;
+        }
+      }
+
+      // Delete the copy info and the value node.
+      copy_map_.erase(copy_value_node->value->defining_instruction());
+      delete copy_value_node;
+    }
+
+    // Returns true if the live range of given value 'a' is before the live
+    // range of 'b'.
+    //
+    // We cannot use LiveRangeStrictlyBefore because HloValue::uses() is not
+    // updated as copies are removed.
+    bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) {
+      if (a.uses.empty()) {
+        VLOG(2) << "Empty uses";
+        return ordering_.IsDefinedBefore(*a.value, *b.value);
+      }
+      for (const HloUse* use : a.uses) {
+        VLOG(2) << "use: " << *use;
+        VLOG(2) << "is before:" << *b.value;
+        if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) {
+          VLOG(2) << "Not before";
+          return false;
+        }
+      }
+      return true;
+    }
+
+    // Returns whether 'node' is the last node in its list.
+    bool IsTail(const ValueNode& node) const {
+      return ContainsKey(value_lists_, node.next);
+    }
+
+    // Returns whether 'node' is the first node in its list.
+    bool IsHead(const ValueNode& node) const {
+      return ContainsKey(value_lists_, &node);
+    }
+
+    // Returns the next node in the list after 'node'. If 'node' is the
+    // tail, then nullptr is returned.
+    ValueNode* Next(const ValueNode& node) const {
+      if (IsTail(node)) {
+        return nullptr;
+      } else {
+        return node.next;
+      }
+    }
+
+    // Returns the previous node in the list before 'node'. If 'node'
+    // is the head, then nullptr is returned.
+    ValueNode* Prev(const ValueNode& node) const {
+      if (IsHead(node)) {
+        return nullptr;
+      } else {
+        return node.prev;
+      }
+    }
+
+    // Splices the entire linked list with 'head' as its head right after the
+    // node 'insert_after' in another linked list.
+    void SpliceAfter(ValueNode* head, ValueNode* insert_after) {
+      DCHECK(IsHead(*head));
+      value_lists_.erase(head);
+
+      ValueNode* tail = head->prev;
+      tail->next = insert_after->next;
+      insert_after->next->prev = tail;
+
+      insert_after->next = head;
+      head->prev = insert_after;
+    }
+
+    string ValueListToString(const ValueNode* element) {
+      const ValueNode* head = element;
+      while (!IsHead(*head)) {
+        head = Prev(*head);
+      }
+      std::vector<const HloValue*> values;
+      for (const ValueNode* p = head; p != nullptr; p = Next(*p)) {
+        values.push_back(p->value);
+      }
+      return StrCat("{",
+                    Join(values, ", ",
+                         [](string* s, const HloValue* value) {
+                           StrAppend(s, value->ToShortString());
+                         }),
+                    "}");
+    }
+
+    string ToString() const {
+      string out = StrCat("BufferValueTracker:\n");
+      StrAppend(&out, "  Def-use chains in each buffer:\n");
+      for (const ValueNode* head : value_lists_) {
+        StrAppend(&out, "    Buffer defined by ", head->value->ToShortString(),
+                  ":\n");
+        const ValueNode* p = head;
+        do {
+          StrAppend(&out, "      ", p->value->ToShortString(), ", uses: ",
+                    Join(p->uses, "; ",
+                         [](string* s, const HloUse* use) {
+                           StrAppend(s, use->ToString());
+                         }),
+                    "\n");
+
+          p = p->next;
+        } while (p != head);
+      }
+      StrAppend(&out, "  Potentially removable copies:\n");
+      for (const auto& pair : copy_map_) {
+        const HloInstruction* copy = pair.first;
+        const CopyNodes& copy_info = pair.second;
+
+        StrAppend(&out, "    ", copy->name(), " : ",
+                  copy_info.src->value->ToShortString(), " => ",
+                  copy_info.dest->value->ToShortString(), "\n");
+      }
+      return out;
+    }
+
+   private:
+    const HloDataflowAnalysis& dataflow_;
+    const HloOrdering& ordering_;
+
+    // The heads of all the value lists. Each value list represents the HLO
+    // values contained in a particular HLO buffer. The values in the list are
+    // in dependency order.
+    tensorflow::gtl::FlatSet<const ValueNode*> value_lists_;
+
+    // Copy removal requires fast access to the value list elements
+    // corresponding to the source and destination values of the kCopy
+    // instruction. This data structure holds pointers to these elements for
+    // each kCopy instruction in the graph.
+    struct CopyNodes {
+      // The source and destinations values of the kCopy instruction.
+      ValueNode* src = nullptr;
+      ValueNode* dest = nullptr;
+    };
+    tensorflow::gtl::FlatMap<const HloInstruction*, CopyNodes> copy_map_;
+  };
+
+  HloModule* module_;
+  const HloAliasAnalysis& alias_analysis_;
+  const HloOrdering& ordering_;
+
+  // Object tracking the HLO values contained in each HLO buffer.
+  BufferValueTracker buffer_value_tracker_;
+};
+
+// Try to remove as many copies from the module as possible without introducing
+// live range interference. Copy instructions (identified by their unique id) in
+// the set copies_to_exclude are not considered for removal.
+Status RemoveUnnecessaryCopies(
+    const HloOrdering& ordering,
+    const tensorflow::gtl::FlatSet<int>& copies_to_exclude, HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+  CopyRemover copy_remover(*alias_analysis, ordering, module);
+  XLA_VLOG_LINES(3, copy_remover.ToString());
+
+  tensorflow::gtl::FlatSet<int> existing_copies;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kCopy &&
+          !ContainsKey(copies_to_exclude, instruction->unique_id())) {
+        TF_RETURN_IF_ERROR(copy_remover.TryElideCopy(instruction).status());
+      }
+    }
   }
-  return copy;
+
+  return Status::OK();
 }
 
-// The 'read_only_indices' are initialized based on points-to analysis on the
-// while body corresponding to 'while_hlo'. If the init buffer corresponding to
-// a read-only index aliases with a constant, it cannot be considered read-only,
-// and must be copied. This is necessary because BufferAssignment does not
-// currently assign an allocation for constants (b/32248867).
-// This function performs this fix-up of 'read_only_indices'.
+// Add copies to address special constraints on the roots of computations not
+// related to live range interference:
 //
-// Returns a ShapeTree of copy_overrides, which implements an optimization to
-// allow multiple while loops that share the same read-only constants to
-// share a single copy.
-StatusOr<ShapeTree<HloInstruction*>> RevertReadOnlyIndicesForConstants(
-    const HloInstruction* while_hlo,
-    const TuplePointsToAnalysis& points_to_analysis,
-    ShapeTree<bool>* read_only_indices,
-    FlatMap<const HloInstruction*, HloInstruction*>* shared_copies) {
-  const HloInstruction* init_hlo = while_hlo->operand(0);
-  const PointsToSet& points_to = points_to_analysis.GetPointsToSet(init_hlo);
-
-  // Mapping from LogicalBuffer to index (used to detect non-distinct indices).
-  FlatSet<const LogicalBuffer*> buffer_set;
-
-  ShapeTree<HloInstruction*> copy_overrides(init_hlo->shape());
-  points_to.ForEachElement([init_hlo, read_only_indices, shared_copies,
-                            &buffer_set, &copy_overrides](
-                               const ShapeIndex& index,
-                               const PointsToSet::BufferList& buffers) {
-    // Look for read-only entry parameters.
-    if (!read_only_indices->element(index)) {
-      return;
-    }
-    for (const LogicalBuffer* buffer : buffers) {
-      HloInstruction* pointee = buffer->instruction();
-      const bool is_constant = pointee->opcode() == HloOpcode::kConstant;
-      if (!is_constant) {
-        continue;
-      }
+//    (1) Entry computation root must be unambiguous and distinct.
+//
+//    (2) Any computation called by a kCall instruction must have an
+//        unambiguous root.
+//
+//    (3) Constants and parameters cannot be live out of the entry computation
+//
+Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+
+  // Identify which shape indices of which instructions need to be copied. Store
+  // these results in 'instructions_to_copy'.
+  std::unordered_map<HloInstruction*, ShapeTree<bool>> instructions_to_copy;
+  auto add_index_to_copy = [&instructions_to_copy](HloInstruction* instruction,
+                                                   const ShapeIndex& index) {
+    auto it = instructions_to_copy.find(instruction);
+    if (it == instructions_to_copy.end()) {
+      auto it_added = instructions_to_copy.emplace(
+          std::piecewise_construct, std::forward_as_tuple(instruction),
+          std::forward_as_tuple(instruction->shape(), /*init_value=*/false));
+      it = it_added.first;
+    }
+    *it->second.mutable_element(index) = true;
+  };
 
-      // We have found an constant that is read-only in
-      // the while body. These buffers are managed by the caller, and cannot
-      // be aliased with HLO buffers. Revert this read-only index,
-      // to allow it to be copied.
-      *read_only_indices->mutable_element(index) = false;
-
-      // Optimization to allow multiple while loops that share the same
-      // read-only entry constants to share a single copy.
-      // Only unambiguous and distinct array-shaped buffers are allowed, to
-      // reduce code complexity. The shape of the entry parameter must be
-      // identical to the shape of the init_hlo at this index, to ensure
-      // there were no intervening bitcast or GTE instructions, which are
-      // also hard to handle.
-      const Shape& pointee_shape = pointee->shape();
-      const Shape& init_shape =
-          ShapeUtil::GetSubshape(init_hlo->shape(), index);
-      if (buffers.size() == 1 && ShapeUtil::IsArray(pointee_shape) &&
-          ShapeUtil::Equal(pointee_shape, init_shape) &&
-          buffer_set.count(buffer) < 1) {
-        HloInstruction** copy = &(*shared_copies)[pointee];
-        if (*copy == nullptr) {
-          *copy = pointee->parent()->AddInstruction(HloInstruction::CreateUnary(
-              pointee_shape, HloOpcode::kCopy, pointee));
+  // Iterate through values of all constants and entry parameters. These values
+  // are special because they are held in read-only buffers. If any of these
+  // values share a buffer with other values (for example, the init value of a
+  // while is a constant) then copy the value at its definition and replace all
+  // its uses with the copy.
+  for (const HloValue* value : alias_analysis->dataflow_analysis().values()) {
+    if (ValueIsReadOnly(*value) &&
+        alias_analysis->GetBufferContainingValue(*value).values().size() > 1) {
+      VLOG(2) << "Value " << value->ToShortString()
+              << " is read only, but its buffer contains more than one value. "
+                 "Copying.";
+      add_index_to_copy(value->defining_instruction(), value->defining_index());
+    }
+  }
+
+  // Identify copies which must be added at root instructions
+  for (HloComputation* computation : module->computations()) {
+    const CallGraphNode& node = call_graph.GetNode(computation);
+    if (node.context() == CallContext::kParallel) {
+      continue;
+    }
+    TF_RET_CHECK(node.context() == CallContext::kSequential);
+
+    const bool is_entry = computation == module->entry_computation();
+    HloInstruction* root = computation->root_instruction();
+
+    // Mark nondistinct/ambiguous indices.
+    tensorflow::gtl::FlatSet<const HloBuffer*> seen;
+    ShapeUtil::ForEachSubshape(
+        root->shape(), [&](const Shape& /*subshape*/, const ShapeIndex& index) {
+          std::vector<const HloBuffer*> buffers_at_index =
+              alias_analysis->ComputeBuffersAt(root, index);
+          bool buffer_seen_before = false;
+          for (const HloBuffer* buffer : buffers_at_index) {
+            buffer_seen_before |= !seen.insert(buffer).second;
+          }
+          if (buffers_at_index.size() > 1 || (buffer_seen_before && is_entry)) {
+            VLOG(2) << "Index " << index << " of root of computation "
+                    << computation->name() << " (" << root->name()
+                    << ") has ambiguous or non-distinct buffer. Copying.";
+            add_index_to_copy(root, index);
+          }
+        });
+
+    // For entry instructions, mark any parameter or constant values.
+    if (is_entry) {
+      for (const auto& pair :
+           alias_analysis->dataflow_analysis().GetInstructionValueSet(root)) {
+        const ShapeIndex& index = pair.first;
+        const HloValueSet& value_set = pair.second;
+        for (const HloValue* value : value_set.values()) {
+          if (ValueIsReadOnly(*value)) {
+            VLOG(2) << "Root of entry computation (" << root->name()
+                    << ") has constant or entry parameter value at index "
+                    << index << ". Copying.";
+            add_index_to_copy(root, index);
+          }
         }
-        // Add the copy as an override.
-        *copy_overrides.mutable_element(index) = *copy;
       }
+    }
+  }
 
-      // Tracks whether this current buffer is distinct.
-      buffer_set.insert(buffer);
+  // Add copy instructions indicated in 'instructions_to_copy' to the module.
+  for (const auto& pair : instructions_to_copy) {
+    HloInstruction* instruction = pair.first;
+    const ShapeTree<bool>& indices_to_copy = pair.second;
 
-      // We've already reverted the read-only index and handled the
-      // single-copy optimization above, so there's nothing more to do.
-      break;
+    std::vector<HloInstruction*> users = instruction->users();
+    TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy,
+                        instruction->parent()->DeepCopyInstruction(
+                            instruction, &indices_to_copy));
+    for (HloInstruction* user : users) {
+      TF_RETURN_IF_ERROR(instruction->ReplaceUseWith(user, deep_copy));
+    }
+    if (instruction == instruction->parent()->root_instruction()) {
+      instruction->parent()->set_root_instruction(deep_copy);
     }
-  });
-  return copy_overrides;
+  }
+
+  return Status::OK();
+}
+
+Status VerifyNoLiveRangeInterference(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module));
+  DependencyHloOrdering ordering(module);
+  TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering));
+  return Status::OK();
 }
 
-}  // anonymous namespace
-
-// NOTE: This is only called by gpu::CopyInsertion. It's not called here in the
-// base class, since the regular CopyInsertion logic above selectively copies
-// tuple elements, while this method assumes all buffers need to be deep copied.
-StatusOr<HloInstruction*> CopyInsertion::FindOrInsertCopy(HloInstruction* hlo) {
-  auto copy_it = inserted_copies_.find(hlo);
-  if (copy_it == inserted_copies_.end()) {
-    HloInstruction* copy = hlo->parent()->DeepCopyInstruction(hlo).ValueOrDie();
-    inserted_copies_.insert({hlo, copy});
-    return copy;
-  } else {
-    return copy_it->second;
+void MaybeDumpModule(const string& message, const HloModule& module) {
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << message;
+    XLA_VLOG_LINES(3, module.ToString());
+    hlo_graph_dumper::MaybeDumpHloModule(module, message);
   }
 }
 
+}  // namespace
+
 StatusOr<bool> CopyInsertion::Run(HloModule* module) {
-  bool changed = false;
-  VLOG(2) << "CopyInsertion for module " << module->name();
+  // Copy insertion is performed in three steps:
+  //
+  // (1) Add copies conservatively to guarantee that there is no live-range
+  //     interference. This is done simplistically and usually results in more
+  //     copies than is strictly necessary.
+  //
+  // (2) Using a more fine-grained analysis, remove as many copies that were
+  //     added in (1) as possible while ensuring no live-range interference.
+  //
+  // (3) Add copies to resolve issues not related to live range interference
+  //     such as parameters and constants live out of the entry computation.
+  //
+  // We add copies then remove them (step (1) then (2)) rather than simply
+  // adding only the copies that are necessary because, in general, it is
+  // difficult to figure out the minimal set of copies to add once there is
+  // interference. On the other hand, it is easy to determine if removing a copy
+  // will introduce interference.
+  //
+  // The final copy insertion in (3) is done separately to simplify the
+  // implementation of copy removal in (2) which is the most complicated part of
+  // the pass. As is, copy removal only has to reason about live range
+  // interference. If all copies were added in step (1) then copy removal would
+  // also have to reason about things like constants and parameters live out of
+  // the computation.
+  MaybeDumpModule("before copy insertion", *module);
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<BufferLiveness> liveness,
-      BufferLiveness::Run(module, MakeUnique<DependencyHloOrdering>(module)));
-  const auto& points_to_analysis = liveness->points_to_analysis();
-  XLA_VLOG_LINES(2, points_to_analysis.ToString());
-  XLA_VLOG_LINES(2, module->ToString());
-
-  // Gather all while body computations and while instructions.
-  FlatSet<const HloComputation*> while_body_computations;
-  std::vector<HloInstruction*> while_instructions;
-  for (auto* computation : module->computations()) {
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  if (!call_graph->IsFlattened()) {
+    return FailedPrecondition(
+        "Call graph must be flattened before copy insertion.");
+  }
+
+  // Gather Ids of existing kCopy instructions in the module. We avoid removing
+  // these copies (except via DCE in TupleSimplifier) because they may have been
+  // added for reasons not considered by copy insertion (eg, layout assignment).
+  // Instruction id is used instead of HloInstruction* because the pointer
+  // values may be recycled.
+  tensorflow::gtl::FlatSet<int> existing_copies;
+  for (HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kWhile) {
-        while_body_computations.insert(instruction->while_body());
-        while_instructions.push_back(instruction);
+      if (instruction->opcode() == HloOpcode::kCopy) {
+        existing_copies.insert(instruction->unique_id());
       }
     }
   }
 
-  // Collect instruction buffer indices to copy in 'instructions_to_copy'.
-  std::vector<InstructionCopier> instructions_to_copy;
-
-  // Add copies of computation root instructions, if needed.
-  FlatMap<const HloComputation*, ShapeTree<bool>> while_body_read_only_indices;
-  for (auto* computation : module->MakeNonfusionComputations()) {
-    VLOG(2) << "computation " << computation->name();
-    InstructionCopier root_copier(computation->root_instruction(),
-                                  /*copy_users=*/{});
-    if (while_body_computations.count(computation) > 0) {
-      // Record root indices to copy for while body sub-computations. We do not
-      // need to call RecordIndicesWhichPointToParamOrConstant for the while
-      // body root instruction here, because any necessary copies needed to
-      // avoid constants or parameters in the output are handled by while.init
-      // operand copy insertion below (which will share an allocation).
-      HloInstruction* while_body_param = computation->parameter_instruction(0);
-      ShapeTree<bool> read_only_indices(while_body_param->shape());
-      TF_RETURN_IF_ERROR(root_copier.RecordIndicesToCopyForColocatingBuffers(
-          *liveness, while_body_param, &read_only_indices));
-      while_body_read_only_indices[computation] = read_only_indices;
-
-      // Mark control predecessors, based on the body param, for any copies
-      // we'll be inserting. This ensures the copy doesn't run too early.
-      TF_RETURN_IF_ERROR(root_copier.RecordControlPredecessors(
-          points_to_analysis, while_body_param));
-    } else {
-      // Record root indices to copy for general computations.
-      TF_RETURN_IF_ERROR(root_copier.RecordIndicesWhichPointToParamOrConstant(
-          points_to_analysis));
+  TF_RETURN_IF_ERROR(AddCopiesToResolveInterference(module));
+
+  // Simplify the tuple structures introduced by the deep copies. This should be
+  // done before removing copies (RemoveUnnecessaryCopies) because tuple
+  // simplification changes dependencies in the graph which changes live range
+  // interference in the graph. Also run DCE to remove the dead Tuple/GTE
+  // instructions introduced by tuple simplification.
+  TupleSimplifier tuple_simplifier;
+  HloDCE dce;
+  TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+  TF_RETURN_IF_ERROR(dce.Run(module).status());
+
+  TF_DCHECK_OK(VerifyNoLiveRangeInterference(module));
+
+  MaybeDumpModule("after adding copies to resolve interference", *module);
+
+  DependencyHloOrdering ordering(module);
+  TF_RETURN_IF_ERROR(
+      RemoveUnnecessaryCopies(ordering, existing_copies, module));
+
+  MaybeDumpModule("after removing unnecessary copies", *module);
+
+  TF_RETURN_IF_ERROR(AddSpecialCaseCopies(*call_graph, module));
+
+  MaybeDumpModule("after adding special-case copies", *module);
+
+  TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status());
+  TF_RETURN_IF_ERROR(dce.Run(module).status());
+  TF_DCHECK_OK(VerifyNoLiveRangeInterference(module));
+
+  MaybeDumpModule("after copy insertion", *module);
+
+  if (VLOG_IS_ON(1)) {
+    int64 num_total_copies = 0;
+    for (HloComputation* computation : module->computations()) {
+      for (HloInstruction* instruction : computation->instructions()) {
+        if (instruction->opcode() == HloOpcode::kCopy) {
+          num_total_copies++;
+        }
+      }
     }
-    instructions_to_copy.push_back(root_copier);
+    VLOG(1) << "Num copies before copy-insertion: " << existing_copies.size();
+    VLOG(1) << "Num copies after copy-insertion: " << num_total_copies;
   }
 
-  // Add copies of while 'init' operand instructions, if needed. 'shared_copies'
-  // is used to ensure that multiple while loops can share a single copy of the
-  // same entry parameter or constant, if all loops use it read-only.
-  //
-  // TODO(b/33301720) Remove redundant while instruction copies.
-  FlatMap<const HloInstruction*, HloInstruction*> shared_copies;
-  for (HloInstruction* while_hlo : while_instructions) {
-    // Fix read_only_indices to account for entry constants. Also
-    // initialize copy_overrides, which ensures a single copy for each read-only
-    // constant that is used in multiple while loops.
-    ShapeTree<bool>* read_only_indices =
-        &while_body_read_only_indices[while_hlo->while_body()];
-    TF_ASSIGN_OR_RETURN(
-        const ShapeTree<HloInstruction*> copy_overrides,
-        RevertReadOnlyIndicesForConstants(while_hlo, points_to_analysis,
-                                          read_only_indices, &shared_copies));
-    // Create InstructionCopier for init operand of while instruction.
-    HloInstruction* init_hlo = while_hlo->mutable_operand(0);
-    InstructionCopier init_copier(init_hlo, {while_hlo});
-    init_copier.SetReadOnlyIndices(*read_only_indices);
-    init_copier.SetCopyOverrides(copy_overrides);
-    // Record 'init' buffer indices which point-to a Constant or Parameter.
-    TF_RETURN_IF_ERROR(init_copier.RecordIndicesWhichPointToParamOrConstant(
-        points_to_analysis));
-    // Record indices necessary to colocate while and init operand buffers.
-    TF_RETURN_IF_ERROR(init_copier.RecordIndicesToCopyForColocatingBuffers(
-        *liveness, while_hlo, /*read_only_indices_out=*/nullptr));
-    instructions_to_copy.push_back(init_copier);
+  return true;
+}
+
+namespace {
+
+bool IsWhileBody(const HloComputation* computation,
+                 const CallGraph& call_graph) {
+  const CallGraphNode& node = call_graph.GetNode(computation);
+
+  if (node.context() == CallContext::kSequential &&
+      !node.caller_callsites().empty()) {
+    // Callgraph should be flattened so sequential context computations can
+    // have at most one caller.
+    CHECK_EQ(node.caller_callsites().size(), 1);
+    const HloInstruction* calling_instruction =
+        node.caller_callsites()[0].instruction();
+    if (calling_instruction->opcode() == HloOpcode::kWhile &&
+        calling_instruction->while_body() == node.computation()) {
+      return true;
+    }
   }
+  return false;
+}
 
-  for (InstructionCopier& to_copy : instructions_to_copy) {
-    if (to_copy.HasAllIndicesFalse()) {
+}  // namespace
+
+/* static */ StatusOr<bool> CopyInsertion::AddCopiesForBufferAssignment(
+    HloModule* module) {
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
+                      HloDataflowAnalysis::Run(module));
+
+  bool changed = false;
+
+  // If a buffer live out of a computation is a constant, a parameter, or not
+  // defined in the computation, then copy it to account for the limited
+  // computation-scoped analysis in buffer assignment. An exception to this rule
+  // is the while body which is handled properly without copies.
+  for (HloComputation* computation : module->computations()) {
+    if (computation == module->entry_computation() ||
+        IsWhileBody(computation, *call_graph)) {
       continue;
     }
-    changed = true;
 
-    // Copy instruction at recorded buffer indices.
-    HloComputation* computation = to_copy.instruction()->parent();
-    HloInstruction* copy = to_copy.Copy();
-    if (to_copy.instruction() == computation->root_instruction()) {
-      computation->set_root_instruction(copy);
+    HloInstruction* root = computation->root_instruction();
+    ShapeTree<bool> indices_to_copy(root->shape(), /*init_value=*/false);
+    bool copy_root = false;
+    for (const auto& pair : dataflow->GetInstructionValueSet(root)) {
+      const ShapeIndex& index = pair.first;
+      const HloValueSet& value_set = pair.second;
+      for (const HloValue* value : value_set.values()) {
+        HloInstruction* def = value->defining_instruction();
+        if (def->parent() != computation ||
+            def->opcode() == HloOpcode::kConstant ||
+            def->opcode() == HloOpcode::kParameter) {
+          *indices_to_copy.mutable_element(index) = true;
+          copy_root = true;
+        }
+      }
+    }
+    if (copy_root) {
+      TF_ASSIGN_OR_RETURN(
+          HloInstruction * root_copy,
+          computation->DeepCopyInstruction(root, &indices_to_copy));
+      computation->set_root_instruction(root_copy);
+      changed = true;
     }
   }
 
-  VLOG(3) << "After copy insertion for module " << module->name();
-  XLA_VLOG_LINES(3, module->ToString());
+  TupleSimplifier tuple_simplifier;
+  HloDCE dce;
+  TF_ASSIGN_OR_RETURN(bool tuple_simplifier_changed,
+                      tuple_simplifier.Run(module));
+  TF_ASSIGN_OR_RETURN(bool dce_changed, dce.Run(module));
 
-  return changed;
+  return changed || tuple_simplifier_changed || dce_changed;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index 28bb62e40c7674960dbb1bb63dc8967b06956028..65e3d31e347e2cb249a072e7d06ca10c55401748 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -25,12 +25,25 @@ limitations under the License.
 
 namespace xla {
 
-// HLO pass which inserts a copy of the root instruction (creating a new root)
-// if the root is or points-to any constant or parameter instruction.
-// If the root instruction is a Tuple, only tuple elements which point to
-// constant or parameter instructions will be copied.
-// Copy insertion is necessary because constant and parameter arrays have
-// different lifetimes than computation results.
+// Copy insertion is a legalization HLO pass which inserts copies (kCopy
+// instructions) to eliminate several kinds of problems in the HLO module.
+//
+//   (1) Entry parameter or a constant live out of the entry computation.  Entry
+//       computation arguments and constants have different lifetimes than the
+//       computation result and cannot share the same allocation. Parameters and
+//       constants live out of non-entry computations do not need copies.
+//
+//   (2) Different values which are simultaneously live and which must be held
+//       in the same buffer. This can occur in while bodies. Specifically, the
+//       while loop state (the arguments to the while instruction) is updated
+//       in-place and the update may clobber the value from the previous
+//       iteration before the previous value is dead. Computations called from
+//       kCall instructions do not need such copies because kCall has no update
+//       in-place semantics.
+//
+//   (3) The buffer set of the root instruction of the entry computation must be
+//       unambiguous and distinct. That is, InstructionAliasSet::IsAmbiguous and
+//       InstructionAliasSet::IsDistinct return true.
 class CopyInsertion : public HloPassInterface {
  public:
   tensorflow::StringPiece name() const override { return "copy-insertion"; }
@@ -39,14 +52,16 @@ class CopyInsertion : public HloPassInterface {
   // (copies were inserted).
   StatusOr<bool> Run(HloModule* module) override;
 
- protected:
-  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
-  // duplicate copies.
-  StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
-
-  // A map containing all copies inserted during the copy insertion pass. The
-  // key is the copied instruction and the value is the copy.
-  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
+  // The CPU and GPU backend need additional copies added due to deficiencies in
+  // buffer assignment. Specifically, copies are needed for constants live-out
+  // of computations, and for values which are live-in and live-out of the same
+  // computation. These copies are needed because buffer-assignment uses a
+  // computation-scoped analyis (TuplePointsToAnalysis) and has limited
+  // visibility across computation boundaries. This method adds these necessary
+  // copies. Returns whether the module was modified.
+  //
+  // TODO(b/62548313): Remove this when buffer assignment is module-scoped.
+  static StatusOr<bool> AddCopiesForBufferAssignment(HloModule* module);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index a2eacc5c7dae2424e01fdd49d82546b5488d4312..8388574716ad1b78eb8868a8cd732005050b3310 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -17,18 +17,19 @@ limitations under the License.
 
 #include <set>
 
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace op = xla::testing::opcode_matchers;
 
@@ -37,35 +38,53 @@ namespace {
 
 using ::testing::UnorderedElementsAre;
 
+int64 CountCopies(const HloComputation& computation) {
+  int64 count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    if (instruction->opcode() == HloOpcode::kCopy) {
+      count++;
+    }
+  }
+  return count;
+}
+
+int64 CountCopies(const HloModule& module) {
+  int64 count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountCopies(*computation);
+  }
+  return count;
+}
+
+int64 CountControlEdges(const HloComputation& computation) {
+  int64 count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    count += instruction->control_successors().size();
+  }
+  return count;
+}
+
+int64 CountControlEdges(const HloModule& module) {
+  int64 count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountControlEdges(*computation);
+  }
+  return count;
+}
+
 class CopyInsertionTest : public HloTestBase {
  protected:
   void InsertCopies(HloModule* module) {
     CopyInsertion copy_insertion;
-    EXPECT_IS_OK(copy_insertion.Run(module).status());
-
-    // Verify the points to set of the root of the computation after copy
-    // insertion contains no constants or parameters, and is distinct and
-    // non-ambiguous.
-    auto points_to_analysis =
-        TuplePointsToAnalysis::Run(module).ConsumeValueOrDie();
-    const auto& points_to = points_to_analysis->GetPointsToSet(
-        module->entry_computation()->root_instruction());
-    EXPECT_TRUE(points_to.IsDistinct());
-    EXPECT_TRUE(!points_to.IsAmbiguous());
-
-    auto maybe_live_out_buffers =
-        points_to_analysis
-            ->GetPointsToSet(module->entry_computation()->root_instruction())
-            .CreateFlattenedSet();
-
-    for (const LogicalBuffer* buffer : maybe_live_out_buffers) {
-      EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kConstant);
-      EXPECT_NE(buffer->instruction()->opcode(), HloOpcode::kParameter);
-    }
+    ASSERT_IS_OK(copy_insertion.Run(module).status());
   }
+
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
 };
 
 TEST_F(CopyInsertionTest, SingleParameter) {
+  // Computation is a single parameter passed into a tuple. The parameter should
+  // be copied before entering the tuple.
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* x = builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "x"));
@@ -77,14 +96,15 @@ TEST_F(CopyInsertionTest, SingleParameter) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0))));
+              op::Tuple(op::Copy(x)));
 }
 
 TEST_F(CopyInsertionTest, SingleConstant) {
+  // Computation is a single constant passed into a tuple. The parameter should
+  // be copied before entering the tuple.
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -96,11 +116,42 @@ TEST_F(CopyInsertionTest, SingleConstant) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0))));
+              op::Tuple(op::Copy(constant)));
+}
+
+TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) {
+  // Verify that an kCopy instructions which exist in the pass before
+  // copy-insertion remain in the graph after copy-insertion.
+  auto module = CreateNewModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  HloInstruction* copy_1 = builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kCopy, constant));
+  HloInstruction* copy_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kCopy, constant));
+  HloInstruction* add = builder.AddInstruction(HloInstruction::CreateBinary(
+      constant->shape(), HloOpcode::kAdd, copy_1, copy_2));
+  HloInstruction* add_copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(constant->shape(), HloOpcode::kCopy, add));
+
+  module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(CountCopies(*module), 3);
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 3);
+
+  EXPECT_EQ(module->entry_computation()->root_instruction(), add_copy);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Copy(op::Add(op::Copy(op::Constant()), op::Copy(op::Constant()))));
 }
 
 TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
@@ -127,12 +178,12 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 2);
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0)),
-                        op::Copy(old_root->operand(1)), old_root->operand(2)));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::Copy(constant2), op::Copy(x), op::Add(constant1, y)));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
@@ -165,6 +216,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 2);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Tuple(op::Copy(op::GetTupleElement(old_root)),
@@ -187,6 +239,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Copy(old_root));
@@ -208,6 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Copy(old_root));
@@ -227,11 +281,11 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(old_root->operand(0))));
+              op::Tuple(op::Copy(bitcast)));
 }
 
 TEST_F(CopyInsertionTest, NestedTupleParameter) {
@@ -257,6 +311,8 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 3);
+
   HloInstruction* new_root = module->entry_computation()->root_instruction();
   EXPECT_NE(old_root, new_root);
 
@@ -283,7 +339,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  // The return value of the computation is the zero-th elemnt of the nested
+  // The return value of the computation is the zero-th element of the nested
   // tuple. This element is itself a tuple.
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
@@ -293,12 +349,13 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
 
-  HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 2);
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Tuple(op::Copy(op::GetTupleElement(old_root)),
-                        op::Copy(op::GetTupleElement(old_root))));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::Copy(op::GetTupleElement(op::GetTupleElement(param))),
+                op::Copy(op::GetTupleElement(op::GetTupleElement(param)))));
 }
 
 TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
@@ -331,6 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
   InsertCopies(module.get());
+  EXPECT_EQ(CountCopies(*module), 1);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               op::Copy(old_root));
@@ -346,12 +404,10 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
   // The parameter 'nested' specifies the loop state shape from which to
   // read the induction variable.
   std::unique_ptr<HloComputation> BuildConditionComputation(
-      bool nested = false) {
+      const Shape& loop_state_shape) {
     auto builder = HloComputation::Builder(TestName() + ".Condition");
     auto limit_const = builder.AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<int32>(10)));
-    const Shape& loop_state_shape =
-        nested ? nested_loop_state_shape_ : loop_state_shape_;
     auto loop_state = builder.AddInstruction(
         HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
     auto induction_variable =
@@ -582,7 +638,7 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
       auto loop_state_init = builder.AddInstruction(
           HloInstruction::CreateTuple({induction_var_init, inner_init}));
       auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile(
-          loop_state_shape_, condition, body, loop_state_init));
+          loop_state_init->shape(), condition, body, loop_state_init));
       module_->AddEntryComputation(builder.Build());
       return while_hlo;
     }
@@ -658,11 +714,28 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
     auto one_vec = builder.AddInstruction(HloInstruction::CreateConstant(
         Literal::CreateR1<float>({1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f})));
     // Take a reference to 'data_init' to make it interfere with while result.
-    builder.AddInstruction(HloInstruction::CreateBinary(
+    auto add = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data_init, one_vec));
 
-    return BuildWhileInstructionWithCustomInit(loop_state_shape_, data_init,
-                                               &builder);
+    auto xla_while = BuildWhileInstructionWithCustomInit(loop_state_shape_,
+                                                         data_init, &builder);
+
+    // Add an additional binary operation operating on the while and the
+    // interfering add so that neither operation is dead.
+    auto gte = xla_while->parent()->AddInstruction(
+        HloInstruction::CreateGetTupleElement(
+            ShapeUtil::GetSubshape(xla_while->shape(), {1}), xla_while, 1));
+    auto sub = xla_while->parent()->AddInstruction(HloInstruction::CreateBinary(
+        data_shape_, HloOpcode::kSubtract, add, gte));
+    auto gte0 = xla_while->parent()->AddInstruction(
+        HloInstruction::CreateGetTupleElement(
+            ShapeUtil::GetSubshape(xla_while->shape(), {0}), xla_while, 0));
+    auto tuple = xla_while->parent()->AddInstruction(
+        HloInstruction::CreateTuple({gte0, sub}));
+
+    xla_while->parent()->set_root_instruction(tuple);
+
+    return xla_while;
   }
 
   HloInstruction* BuildWhileInstructionWithCustomInit(
@@ -672,8 +745,8 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
         ShapeUtil::Equal(loop_state_shape, nested_loop_state_shape_);
     auto induction_var_init = builder->AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)));
-    auto condition =
-        module_->AddEmbeddedComputation(BuildConditionComputation(nested));
+    auto condition = module_->AddEmbeddedComputation(
+        BuildConditionComputation(loop_state_shape));
     auto body = module_->AddEmbeddedComputation(
         BuildIndependentBodyComputation(nested));
     auto loop_state_init = builder->AddInstruction(
@@ -706,23 +779,21 @@ class WhileCopyInsertionTest : public CopyInsertionTest {
 // CopyInsertion pass should not generate any copies.
 //
 TEST_F(WhileCopyInsertionTest, IndependentTupleElements) {
-  auto condition = module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body =
       module_->AddEmbeddedComputation(BuildIndependentBodyComputation());
   auto while_hlo = BuildWhileInstruction(condition, body);
 
-  const HloInstruction* old_init = while_hlo->operand(0);
-  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
-  HloInstruction* new_root = body->root_instruction();
-  const HloInstruction* new_init = while_hlo->operand(0);
 
-  // No copies should be inserted so root should not be updated.
-  EXPECT_EQ(old_root, new_root);
+  // Body should have no copies as the adds can be done inplace.
+  EXPECT_EQ(CountCopies(*body), 0);
+  EXPECT_EQ(CountControlEdges(*module_), 0);
 
-  // Both init indices need copies.
-  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
-                                  op::Copy(old_init->operand(1))));
+  // Both init indices need copies as they are constants.
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Constant())));
 }
 
 // Tests while body computation with dependent tuple elements:
@@ -737,20 +808,33 @@ TEST_F(WhileCopyInsertionTest, IndependentTupleElements) {
 //     Tuple(Copy(out0), out1)
 //
 TEST_F(WhileCopyInsertionTest, DependentTupleElements) {
-  auto condition = module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body = module_->AddEmbeddedComputation(BuildDependentBodyComputation());
   auto while_hlo = BuildWhileInstruction(condition, body);
 
-  const HloInstruction* old_init = while_hlo->operand(0);
-  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
-  HloInstruction* new_root = body->root_instruction();
-  const HloInstruction* new_init = while_hlo->operand(0);
 
-  EXPECT_THAT(new_root,
-              op::Tuple(op::Copy(old_root->operand(0)), old_root->operand(1)));
-  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
-                                  op::Copy(old_init->operand(1))));
+  EXPECT_EQ(CountCopies(*body), 1);
+  EXPECT_EQ(CountControlEdges(*body), 0);
+
+  EXPECT_THAT(
+      body->root_instruction(),
+      op::Tuple(op::Add(), op::Add(op::GetTupleElement(), op::Broadcast())));
+
+  auto add = body->root_instruction()->operand(0);
+  auto bcast = body->root_instruction()->operand(1)->operand(1);
+  ASSERT_EQ(add->opcode(), HloOpcode::kAdd);
+  ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
+
+  EXPECT_THAT(
+      while_hlo->while_body()->root_instruction(),
+      op::Tuple(op::Add(op::Copy(), op::Constant()),
+                op::Add(op::GetTupleElement(), op::Broadcast(op::Copy()))));
+
+  // Both init indices need copies as they are constants.
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Constant())));
 }
 
 // Tests while body computation with read-only tuple element 0:
@@ -768,33 +852,26 @@ TEST_F(WhileCopyInsertionTest, DependentTupleElements) {
 //
 // CopyInsertion pass should not generate any copies for the while body.
 TEST_F(WhileCopyInsertionTest, DependentTupleElements_OneReadOnly) {
-  auto condition = module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body = module_->AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
-  auto while_hlo = BuildWhileInstruction(condition, body);
+  BuildWhileInstruction(condition, body);
 
-  const HloInstruction* old_init = while_hlo->operand(0);
-  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
-  HloInstruction* new_root = body->root_instruction();
-  const HloInstruction* new_init = while_hlo->operand(0);
-
-  // No copies should be inserted in the body, so root should not be updated.
-  EXPECT_EQ(old_root, new_root);
 
-  // Both indices need copies, even though Index 0 is read-only, since both are
-  // constants, which must be copied.
-  EXPECT_THAT(new_init, op::Tuple(op::Copy(old_init->operand(0)),
-                                  op::Copy(old_init->operand(1))));
+  // No copies or control edges should be inserted. The body is legal as is.
+  EXPECT_EQ(CountCopies(*body), 0);
+  EXPECT_EQ(CountControlEdges(*body), 0);
 }
 
 // Same as above, but with two while loops, sharing entry parameters.
 TEST_F(WhileCopyInsertionTest,
        DependentTupleElements_OneReadOnly_TwoLoops_EntryParams) {
-  auto condition1 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
-  auto condition2 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition1 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
+  auto condition2 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body1 = module_->AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
   auto body2 = module_->AddEmbeddedComputation(
@@ -812,30 +889,46 @@ TEST_F(WhileCopyInsertionTest,
       loop_state_shape_, condition1, body1, loop_init));
   auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
       loop_state_shape_, condition2, body2, loop_init));
-  module_->AddEntryComputation(builder.Build());
+
+  // Add a couple elements from each of the while so both whiles are live.
+  auto gte1 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo1, 0));
+  auto gte2 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo2->shape(), {0}), while_hlo2, 0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(gte1->shape(), HloOpcode::kAdd, gte1, gte2));
+
+  auto entry = module_->AddEntryComputation(builder.Build());
 
   InsertCopies(module_.get());
 
-  // Both while loops alias iter_param, since index 0 is read-only in the body.
-  EXPECT_EQ(while_hlo1->operand(0)->operand(0),
-            while_hlo2->operand(0)->operand(0));
-  EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_param);
+  // Neither body should have any copies or control edges in them.
+  EXPECT_EQ(CountCopies(*body1), 0);
+  EXPECT_EQ(CountCopies(*body2), 0);
+  EXPECT_EQ(CountControlEdges(*body1), 0);
+  EXPECT_EQ(CountControlEdges(*body2), 0);
 
-  // Each while loop gets its own copy of data_param, since index 1 is not
-  // read-only in the body.
+  // Only two copies should be necessary. Each of the whiles should have
+  // a copy of tuple element 1 (init value is a parameter, and the element is
+  // not non-read-only) so each of the while bodies gets its own buffer to write
+  // element 1 into.
+  EXPECT_EQ(CountCopies(*entry), 2);
+
+  EXPECT_EQ(while_hlo1->operand(0)->operand(1)->opcode(), HloOpcode::kCopy);
+  EXPECT_EQ(while_hlo2->operand(0)->operand(1)->opcode(), HloOpcode::kCopy);
+
+  // The two copies of element 1 should be different.
   EXPECT_NE(while_hlo1->operand(0)->operand(1),
             while_hlo2->operand(0)->operand(1));
-  EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_param));
-  EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_param));
 }
 
 // Same as above, but with two while loops, sharing non-parameters.
 TEST_F(WhileCopyInsertionTest,
        DependentTupleElements_OneReadOnly_TwoLoops_NonParams) {
-  auto condition1 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
-  auto condition2 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
+  auto condition1 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
+  auto condition2 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape_));
   auto body1 = module_->AddEmbeddedComputation(
       BuildDependentBodyOneReadOnlyComputation());
   auto body2 = module_->AddEmbeddedComputation(
@@ -858,21 +951,28 @@ TEST_F(WhileCopyInsertionTest,
       loop_state_shape_, condition1, body1, loop_init));
   auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
       loop_state_shape_, condition2, body2, loop_init));
-  module_->AddEntryComputation(builder.Build());
+
+  // Add a couple elements from each of the while so both whiles are not dead.
+  auto gte1 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo1, 0));
+  auto gte2 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo2->shape(), {0}), while_hlo2, 0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(gte1->shape(), HloOpcode::kAdd, gte1, gte2));
+  auto entry = module_->AddEntryComputation(builder.Build());
 
   InsertCopies(module_.get());
 
-  // No copies of iter_value are necessary, since index 0 is read-only in both
-  // while bodies.
-  EXPECT_EQ(while_hlo1->operand(0)->operand(0), iter_value);
-  EXPECT_EQ(while_hlo2->operand(0)->operand(0), iter_value);
+  // Ideally only one copy should be necessary. One of the whiles should
+  // have a copy of tuple element 1 (the non-read-only element) so each of the
+  // while bodies gets its own buffer to write element 1 into. However, the
+  // analysis isn't perfect and adds an additional copy of element 0.
+  EXPECT_EQ(CountCopies(*entry), 2);
 
-  // Each while loop gets its own copy of data_value, since index 1 is not
-  // read-only in the body.
-  EXPECT_NE(while_hlo1->operand(0)->operand(1),
-            while_hlo2->operand(0)->operand(1));
-  EXPECT_THAT(while_hlo1->operand(0)->operand(1), op::Copy(data_value));
-  EXPECT_THAT(while_hlo2->operand(0)->operand(1), op::Copy(data_value));
+  EXPECT_THAT(while_hlo1->operand(0),
+              op::Tuple(op::Exp(), op::Copy(op::Exp())));
+  EXPECT_THAT(while_hlo2->operand(0),
+              op::Tuple(op::Exp(), op::Copy(op::Exp())));
 }
 
 // Tests while body computation with nested tuple elements:
@@ -905,18 +1005,34 @@ TEST_F(WhileCopyInsertionTest,
 //                     Tuple  // new root
 //
 TEST_F(WhileCopyInsertionTest, NestedTupleElements) {
-  auto condition =
-      module_->AddEmbeddedComputation(BuildConditionComputation(true));
+  auto condition = module_->AddEmbeddedComputation(
+      BuildConditionComputation(nested_loop_state_shape_));
   auto body = module_->AddEmbeddedComputation(BuildNestedBodyComputation());
   BuildWhileInstruction(condition, body, true);
 
-  HloInstruction* old_root = body->root_instruction();
+  //  HloInstruction* old_root = body->root_instruction();
   InsertCopies(module_.get());
 
-  EXPECT_THAT(body->root_instruction(),
-              op::Tuple(old_root->operand(0),
-                        op::Tuple(old_root->operand(1)->operand(0),
-                                  op::Copy(old_root->operand(1)->operand(1)))));
+  // The only copy necessary is for the kReverse as it cannot be done
+  // in-place (instruction can share buffer with operand). The other elements of
+  // the loop state are kAdd instructions which can be done in-place.
+  EXPECT_EQ(CountCopies(*body), 1);
+
+  // Each element of the init needs a copy as all are constants.
+  EXPECT_EQ(CountCopies(*module_), 4);
+
+  // Either the kReverse itself must be copied or the operand of the kReverse
+  // must be copied.
+  if (body->root_instruction()->operand(1)->operand(1)->opcode() ==
+      HloOpcode::kCopy) {
+    EXPECT_THAT(
+        body->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Copy(op::Reverse()))));
+  } else {
+    EXPECT_THAT(
+        body->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Reverse(op::Copy()))));
+  }
 }
 
 // Tests while init instruction which points-to a constant.
@@ -927,11 +1043,13 @@ TEST_F(WhileCopyInsertionTest, NestedTupleElements) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToConstant) {
   auto while_hlo = BuildWhileInstruction_InitPointsToConstant();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 0);
+  EXPECT_EQ(CountCopies(*module_), 2);
 
-  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
-                                               op::Copy(old_init->operand(1))));
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Constant())));
 }
 
 // Tests while init instruction which points-to a parameter.
@@ -942,11 +1060,13 @@ TEST_F(WhileCopyInsertionTest, InitPointsToConstant) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToParameter) {
   auto while_hlo = BuildWhileInstruction_InitPointsToParameter();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 0);
+  EXPECT_EQ(CountCopies(*module_), 2);
 
-  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
-                                               op::Copy(old_init->operand(1))));
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Parameter())));
 }
 
 // Tests while init instruction which has an ambiguous points-to set.
@@ -975,15 +1095,34 @@ TEST_F(WhileCopyInsertionTest, InitPointsToParameter) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) {
   auto while_hlo = BuildWhileInstruction_InitPointsToAmbiguous();
-  auto old_init = while_hlo->operand(0);
-  InsertCopies(module_.get());
 
-  EXPECT_THAT(
-      while_hlo->operand(0),
-      op::Tuple(
-          op::Copy(old_init->operand(0)),
-          op::Tuple(op::Copy(op::GetTupleElement(old_init->operand(1))),
-                    op::Copy(op::GetTupleElement(old_init->operand(1))))));
+  InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*module_), 4);
+  // The entry computation requires three copies to resolve the ambiguity of two
+  // init elements and the constant passed in as one of the init elements.
+  EXPECT_EQ(CountCopies(*module_->entry_computation()), 3);
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()),
+                        op::Tuple(op::Copy(op::GetTupleElement()),
+                                  op::Copy(op::GetTupleElement()))));
+
+  // The body requires one copy because the buffer set is not distinct: the
+  // result of one of the adds is written into two elements of the output of the
+  // loop body. Either element might be copied.
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 1);
+  if (while_hlo->while_body()
+          ->root_instruction()
+          ->operand(1)
+          ->operand(0)
+          ->opcode() == HloOpcode::kCopy) {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Copy(op::Add()), op::Add())));
+  } else {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Copy(op::Add()))));
+  }
 }
 
 // Tests while init instruction which has a non-distinct points-to set.
@@ -1011,13 +1150,43 @@ TEST_F(WhileCopyInsertionTest, InitPointsToAmbiguous) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) {
   auto while_hlo = BuildWhileInstruction_InitPointsToNonDistinct();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
 
-  EXPECT_THAT(while_hlo->operand(0),
-              op::Tuple(op::Copy(old_init->operand(0)),
-                        op::Tuple(op::Copy(old_init->operand(1)->operand(0)),
-                                  op::Copy(old_init->operand(1)->operand(0)))));
+  // The entry computation requires two copies to resolve the non-disinctness of
+  // two init elements and the constant passed in as one of the init
+  // elements. Either element can be copied for the distinctness issue.
+  EXPECT_EQ(CountCopies(*module_->entry_computation()), 2);
+  if (while_hlo->operand(0)->operand(1)->operand(0)->opcode() ==
+      HloOpcode::kCopy) {
+    EXPECT_THAT(
+        while_hlo->operand(0),
+        op::Tuple(op::Copy(op::Constant()),
+                  op::Tuple(op::Copy(op::Broadcast()), op::Broadcast())));
+  } else {
+    EXPECT_THAT(
+        while_hlo->operand(0),
+        op::Tuple(op::Copy(op::Constant()),
+                  op::Tuple(op::Broadcast(), op::Copy(op::Broadcast()))));
+  }
+
+  // The body requires one copy because the buffer set is not distinct: the
+  // result of one of the adds is written into two elements of the output of the
+  // loop body. Either element might be copied.
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 1);
+  if (while_hlo->while_body()
+          ->root_instruction()
+          ->operand(1)
+          ->operand(0)
+          ->opcode() == HloOpcode::kCopy) {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Copy(op::Add()), op::Add())));
+  } else {
+    EXPECT_THAT(
+        while_hlo->while_body()->root_instruction(),
+        op::Tuple(op::Add(), op::Tuple(op::Add(), op::Copy(op::Add()))));
+  }
 }
 
 // Tests while init instruction buffer which interferes with while result
@@ -1031,11 +1200,13 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinct) {
 //
 TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
   auto while_hlo = BuildWhileInstruction_InitPointsToInterfering();
-  auto old_init = while_hlo->operand(0);
+
   InsertCopies(module_.get());
+  EXPECT_EQ(CountCopies(*module_), 2);
+  EXPECT_EQ(CountCopies(*while_hlo->while_body()), 0);
 
-  EXPECT_THAT(while_hlo->operand(0), op::Tuple(op::Copy(old_init->operand(0)),
-                                               op::Copy(old_init->operand(1))));
+  EXPECT_THAT(while_hlo->operand(0),
+              op::Tuple(op::Copy(op::Constant()), op::Copy(op::Broadcast())));
 }
 
 // Tests while init instruction buffer which has a non-distinct points-to set:
@@ -1044,18 +1215,21 @@ TEST_F(WhileCopyInsertionTest, InitPointsToInterfering) {
 //                  Parameter(F32, {8})))
 //
 // where the second and third parameters are identical *and* the tuple shared
-// by another while instruction..
+// by another while instruction.
 //
 // Verifies that the resulting point-to set is distinct in the resulting Tuple
 // (non-identical Copys). In other words, verifies that copy sharing does not
 // insert identical copies to the resulting tuple.
 TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
-  auto condition1 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
-  auto condition2 =
-      module_->AddEmbeddedComputation(BuildConditionComputation());
   // Loop body that outputs tuple comprises two elements dependent on the init
   // tuple.
+  const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
+      {induction_variable_shape_, data_shape_, data_shape_});
+
+  auto condition1 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape));
+  auto condition2 = module_->AddEmbeddedComputation(
+      BuildConditionComputation(loop_state_shape));
   auto body1 =
       module_->AddEmbeddedComputation(BuildDependentBodyComputation2());
   auto body2 =
@@ -1072,8 +1246,6 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
   auto loop_init = builder.AddInstruction(
       HloInstruction::CreateTuple({iter_param, data_param, data_param}));
 
-  const Shape& loop_state_shape = ShapeUtil::MakeTupleShape(
-      {induction_variable_shape_, data_shape_, data_shape_});
 
   // Two while loops shares the same loop init tuple.
   auto while_hlo1 = builder.AddInstruction(HloInstruction::CreateWhile(
@@ -1081,43 +1253,478 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
   auto while_hlo2 = builder.AddInstruction(HloInstruction::CreateWhile(
       loop_state_shape, condition2, body2, loop_init));
 
-  module_->AddEntryComputation(builder.Build());
+  // Add add instruction so neither while is dead.
+  auto gte1 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo1, 0));
+  auto gte2 = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+      ShapeUtil::GetSubshape(while_hlo1->shape(), {0}), while_hlo2, 0));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(gte1->shape(), HloOpcode::kAdd, gte1, gte2));
 
-  auto points_to_analysis =
-      TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
+  module_->AddEntryComputation(builder.Build());
 
-  // Asserts that the init tuples before copy insertion is non-distinct.
-  ASSERT_FALSE(
-      points_to_analysis->GetPointsToSet(while_hlo1->operand(0)).IsDistinct());
-  ASSERT_FALSE(
-      points_to_analysis->GetPointsToSet(while_hlo2->operand(0)).IsDistinct());
+  InsertCopies(module_.get());
 
-  auto old_init1 = while_hlo1->operand(0);
-  auto old_init2 = while_hlo2->operand(0);
+  // None of the bodies should have copies or control flow edges.
+  EXPECT_EQ(CountCopies(*body1), 0);
+  EXPECT_EQ(CountCopies(*body2), 0);
 
-  InsertCopies(module_.get());
+  // The loop bodies pass through elements 1 and 2 in the init tuple, so ideally
+  // these should not need to be copied before either while. However, copy
+  // insertion is not able to reason about the transparency of elements through
+  // while bodies in all circumstances so extra copies are added (b/xxx).
+  EXPECT_EQ(CountCopies(*module_->entry_computation()), 2);
 
   EXPECT_THAT(while_hlo1->operand(0),
-              op::Tuple(op::Copy(old_init1->operand(0)),
-                        op::Copy(old_init1->operand(1)),
-                        op::Copy(old_init1->operand(2))));
-
+              op::Tuple(op::Copy(), op::Parameter(), op::Parameter()));
   EXPECT_THAT(while_hlo2->operand(0),
-              op::Tuple(op::Copy(old_init2->operand(0)),
-                        op::Copy(old_init2->operand(1)),
-                        op::Copy(old_init2->operand(2))));
-
-  // Verifies the init tuples after copy insertion is distinct.
-  points_to_analysis =
-      TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
-  const auto& points_to1 =
-      points_to_analysis->GetPointsToSet(while_hlo1->operand(0));
-  EXPECT_TRUE(points_to1.IsDistinct());
-
-  const auto& points_to2 =
-      points_to_analysis->GetPointsToSet(while_hlo2->operand(0));
-  EXPECT_TRUE(points_to2.IsDistinct());
+              op::Tuple(op::Copy(), op::Parameter(), op::Parameter()));
 }
 
+TEST_F(CopyInsertionTest, SwizzlingWhile) {
+  // Test a while instruction with a body which permutes its tuple parameter
+  // elements.
+  auto module = CreateNewModule();
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Body simply interchanges the two tuple elements in the loop state.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_1, body_element_0}));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  cond_builder.AddInstruction(HloInstruction::CreateUnary(
+      cond_constant->shape(), HloOpcode::kNot, cond_constant));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape, condition, body, tuple));
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 6);
+
+  // The loop state elements should be copied at the parameter and at the root
+  // with a control edge in between (see DeepCopyAndAddControlEdges). This is
+  // technically one more copy than is strictly necessary, but in order to have
+  // only three copies the copies of different loop state elements must be
+  // ordered with a control edge.
+  EXPECT_EQ(CountCopies(*body), 4);
+  EXPECT_EQ(CountControlEdges(*body), 2);
+
+  EXPECT_THAT(body->root_instruction(),
+              op::Tuple(op::Copy(op::Copy()), op::Copy(op::Copy())));
+
+  EXPECT_EQ(CountCopies(*module->entry_computation()), 2);
+  EXPECT_THAT(xla_while->operand(0), op::Tuple(op::Copy(), op::Copy()));
+}
+
+TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) {
+  // Test a while instruction with a body which permutes its tuple parameter
+  // elements and applies one operation to one of the elements. The addition of
+  // the operation (instruction) on the element makes the live range of the
+  // respective input and output elements different than if the instruction were
+  // not there (as in the SwizzlingWhile test above).
+  auto module = CreateNewModule();
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Body interchanges the two tuple elements in the loop state and negates one
+  // of them.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
+      scalar_shape_, HloOpcode::kNegate, body_element_1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({negate, body_element_0}));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  cond_builder.AddInstruction(HloInstruction::CreateUnary(
+      cond_constant->shape(), HloOpcode::kNot, cond_constant));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
+  auto tuple = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape, condition, body, tuple));
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 6);
+
+  // The loop state elements should be copied at the parameter and at the root
+  // with a control edge in between (see DeepCopyAndAddControlEdges).
+  EXPECT_EQ(CountCopies(*body), 4);
+  EXPECT_EQ(CountControlEdges(*body), 2);
+
+  EXPECT_THAT(
+      body->root_instruction(),
+      op::Tuple(op::Copy(op::Negate(op::Copy())), op::Copy(op::Copy())));
+
+  EXPECT_EQ(CountCopies(*module->entry_computation()), 2);
+  EXPECT_THAT(xla_while->operand(0), op::Tuple(op::Copy(), op::Copy()));
+}
+
+TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) {
+  // Test a while instruction with a body which permutes it's tuple parameter
+  // elements similar to SwizzlinWhile above. However, in this test the input to
+  // the while body is a single constant (both loop state elements are the same
+  // constant). This means no copies are necessary because both loop state
+  // elements are the same so interchanging them is a no-op.
+  auto module = CreateNewModule();
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
+
+  // Body simply interchanges the two tuple elements in the loop state.
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto body_element_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 0));
+  auto body_element_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
+  body_builder.AddInstruction(
+      HloInstruction::CreateTuple({body_element_1, body_element_0}));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+  auto cond_constant = cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  cond_builder.AddInstruction(HloInstruction::CreateUnary(
+      cond_constant->shape(), HloOpcode::kNot, cond_constant));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({constant, constant}));
+  builder.AddInstruction(
+      HloInstruction::CreateWhile(loop_state_shape, condition, body, tuple));
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 2);
+  EXPECT_EQ(CountCopies(*body), 0);
+
+  EXPECT_EQ(CountCopies(*module->entry_computation()), 2);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(), op::Copy()));
+}
+
+TEST_F(CopyInsertionTest, SequentialWhiles) {
+  // Construct a computation with a series of sequential while instructions
+  // containing four loop state elements:
+  //
+  //   element 0 is passed to each while directly from an entry parameter.
+  //
+  //   element 1 is passed transparently in series through all the while bodies.
+  //
+  //   element 2 is negated in each while body. (in-place possible)
+  //
+  //   element 3 is reversed in each while body. (in-place not possible)
+  //
+  const Shape element_shape = ShapeUtil::MakeShape(F32, {42});
+  const Shape loop_state_shape = ShapeUtil::MakeTupleShape(
+      {element_shape, element_shape, element_shape, element_shape});
+
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param_0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, element_shape, "param_0"));
+  auto param_1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, element_shape, "param_1"));
+  auto param_2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, element_shape, "param_2"));
+  auto param_3 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, element_shape, "param_3"));
+
+  // The number of sequential kWhile instructions.
+  const int kNumWhiles = 3;
+
+  HloInstruction* prev_element_1 = param_1;
+  HloInstruction* prev_element_2 = param_2;
+  HloInstruction* prev_element_3 = param_3;
+
+  // Vector containing all of the while instructions.
+  std::vector<const HloInstruction*> whiles;
+  for (int i = 0; i < kNumWhiles; ++i) {
+    auto body_builder = HloComputation::Builder("body");
+    auto body_param = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+    auto body_element_0 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 0));
+    auto body_element_1 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 1));
+    auto body_element_2 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 2));
+    auto body_element_3 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(element_shape, body_param, 3));
+    auto negate = body_builder.AddInstruction(HloInstruction::CreateUnary(
+        element_shape, HloOpcode::kNegate, body_element_2));
+    auto reverse = body_builder.AddInstruction(
+        HloInstruction::CreateReverse(element_shape, body_element_3, {0}));
+    body_builder.AddInstruction(HloInstruction::CreateTuple(
+        {body_element_0, body_element_1, negate, reverse}));
+    HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+    auto cond_builder = HloComputation::Builder("condition");
+    cond_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_state_shape, "param"));
+    auto cond_constant = cond_builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+    cond_builder.AddInstruction(HloInstruction::CreateUnary(
+        cond_constant->shape(), HloOpcode::kNot, cond_constant));
+    HloComputation* condition =
+        module->AddEmbeddedComputation(cond_builder.Build());
+
+    auto while_init = builder.AddInstruction(HloInstruction::CreateTuple(
+        {param_0, prev_element_1, prev_element_2, prev_element_3}));
+
+    auto xla_while = builder.AddInstruction(HloInstruction::CreateWhile(
+        loop_state_shape, condition, body, while_init));
+    whiles.push_back(xla_while);
+    if (i != kNumWhiles - 1) {
+      prev_element_1 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(element_shape, xla_while, 1));
+      prev_element_2 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(element_shape, xla_while, 2));
+      prev_element_3 = builder.AddInstruction(
+          HloInstruction::CreateGetTupleElement(element_shape, xla_while, 3));
+    }
+  }
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  // Each while body has one copy. And each loop state element is copied once in
+  // the entry computation.
+  EXPECT_EQ(CountCopies(*module), 4 + kNumWhiles);
+
+  // Each while body should have exactly one copy for element three which is an
+  // op (kReverse) which cannot be done in place.
+  for (const HloInstruction* xla_while : whiles) {
+    EXPECT_EQ(CountCopies(*xla_while->while_body()), 1);
+  }
+
+  EXPECT_THAT(whiles[0]->operand(0), op::Tuple(op::Parameter(), op::Parameter(),
+                                               op::Copy(), op::Copy()));
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::Copy(), op::Copy(), op::GetTupleElement(),
+                        op::GetTupleElement()));
+}
+
+TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) {
+  // Test a while body and condition which are each simply a constant (root of
+  // computation is a constant). The body constant should be copied.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param_0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
+
+  auto body_builder = HloComputation::Builder("body");
+  body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0)));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(scalar_shape_, condition, body, param_0));
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 2);
+
+  EXPECT_THAT(xla_while->operand(0), op::Copy(op::Parameter()));
+  EXPECT_THAT(body->root_instruction(), op::Copy(op::Constant()));
+  EXPECT_THAT(condition->root_instruction(), op::Constant());
+}
+
+std::unique_ptr<HloComputation> MakeTrivialCondition(const Shape& shape) {
+  auto builder = HloComputation::Builder("trivial_condition");
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "loop_state"));
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kNot, constant));
+  return builder.Build();
+}
+
+std::unique_ptr<HloComputation> MakeBenchmarkWhileBody() {
+  auto builder = HloComputation::Builder("benchmark_loop_body");
+  const Shape element_shape = ShapeUtil::MakeShape(F32, {42});
+  const Shape loop_state_shape =
+      ShapeUtil::MakeTupleShape({element_shape, element_shape, element_shape});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, loop_state_shape, "loop_state"));
+  HloInstruction* element_0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(element_shape, param, 0));
+  HloInstruction* element_1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(element_shape, param, 1));
+  HloInstruction* element_2 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(element_shape, param, 2));
+
+  HloInstruction* rev_1 = builder.AddInstruction(
+      HloInstruction::CreateReverse(element_shape, element_1, {0}));
+  HloInstruction* add_1_2 = builder.AddInstruction(HloInstruction::CreateBinary(
+      element_shape, HloOpcode::kAdd, element_1, element_2));
+
+  builder.AddInstruction(
+      HloInstruction::CreateTuple({element_0, rev_1, add_1_2}));
+  return builder.Build();
+}
+
+void BM_SequentialWhiles(int num_iters, int num_whiles) {
+  // This benchmark constructs a chain of sequential while instructions.
+  tensorflow::testing::StopTiming();
+  for (int i = 0; i < num_iters; ++i) {
+    HloModuleConfig config;
+    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    HloModule module("BM_SequentialWhiles", VersionedComputationHandle(),
+                     config);
+
+    auto builder = HloComputation::Builder("BM_SequentialWhiles");
+    HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {42}), "x"));
+    HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {42}), "y"));
+    HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
+        2, ShapeUtil::MakeShape(F32, {42}), "z"));
+    HloInstruction* init =
+        builder.AddInstruction(HloInstruction::CreateTuple({x, y, z}));
+
+    HloInstruction* prev_loop_state = init;
+    for (int w = 0; w < num_whiles; ++w) {
+      HloComputation* condition =
+          module.AddEmbeddedComputation(MakeTrivialCondition(init->shape()));
+      HloComputation* body =
+          module.AddEmbeddedComputation(MakeBenchmarkWhileBody());
+      prev_loop_state = builder.AddInstruction(HloInstruction::CreateWhile(
+          init->shape(), condition, body, prev_loop_state));
+    }
+    module.AddEntryComputation(builder.Build());
+
+    CopyInsertion copy_insertion;
+
+    tensorflow::testing::StartTiming();
+    ASSERT_IS_OK(copy_insertion.Run(&module).status());
+    tensorflow::testing::StopTiming();
+
+    // The entry computation should have three copies, and each body has one.
+    ASSERT_EQ(CountCopies(module), 3 + num_whiles);
+  }
+}
+
+void BM_ParallelWhiles(int num_iters, int num_whiles) {
+  // This benchmark constructs a fan-out of parallel while instructions.
+  tensorflow::testing::StopTiming();
+  for (int i = 0; i < num_iters; ++i) {
+    HloModuleConfig config;
+    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    HloModule module("BM_SequentialWhiles", VersionedComputationHandle(),
+                     config);
+
+    auto builder = HloComputation::Builder("BM_ParallelWhiles");
+    HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {42}), "x"));
+    HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {42}), "y"));
+    HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter(
+        2, ShapeUtil::MakeShape(F32, {42}), "z"));
+    HloInstruction* init =
+        builder.AddInstruction(HloInstruction::CreateTuple({x, y, z}));
+
+    HloInstruction* sum = nullptr;
+    for (int w = 0; w < num_whiles; ++w) {
+      HloComputation* condition =
+          module.AddEmbeddedComputation(MakeTrivialCondition(init->shape()));
+      HloComputation* body =
+          module.AddEmbeddedComputation(MakeBenchmarkWhileBody());
+
+      HloInstruction* xla_while = builder.AddInstruction(
+          HloInstruction::CreateWhile(init->shape(), condition, body, init));
+
+      if (sum == nullptr) {
+        sum = builder.AddInstruction(
+            HloInstruction::CreateGetTupleElement(x->shape(), xla_while, 0));
+      } else {
+        HloInstruction* element_0 = builder.AddInstruction(
+            HloInstruction::CreateGetTupleElement(x->shape(), xla_while, 0));
+        sum = builder.AddInstruction(HloInstruction::CreateBinary(
+            x->shape(), HloOpcode::kAdd, sum, element_0));
+      }
+    }
+    module.AddEntryComputation(builder.Build());
+
+    CopyInsertion copy_insertion;
+
+    tensorflow::testing::StartTiming();
+    ASSERT_IS_OK(copy_insertion.Run(&module).status());
+    tensorflow::testing::StopTiming();
+
+    // Each body receives of copy of two of the parameters (the corresponding
+    // elements in the body are modifed), and there is one copy in each body.
+    ASSERT_EQ(CountCopies(module), 3 * num_whiles);
+  }
+}
+
+BENCHMARK(BM_SequentialWhiles)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096);
+BENCHMARK(BM_ParallelWhiles)->Arg(512)->Arg(1024)->Arg(2048)->Arg(4096);
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index ef8eed3f88c3d557fcb4ec5b9e1988ce82b777e8..b43597dca983151d59ec7aaba9887313191fc9bd 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -17,6 +17,7 @@ package_group(
 load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -78,14 +79,16 @@ cc_library(
     deps = [
         ":compiler_functor",
         ":conv_canonicalization",
+        ":cpu_copy_insertion",
         ":cpu_executable",
         ":cpu_instruction_fusion",
+        ":cpu_layout_assignment",
         ":cpu_options",
         ":cpu_parallelization_preparation",
         ":disassembler",
+        ":dot_op_emitter",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":layout_assignment",
         ":parallel_cpu_executable",
         ":parallel_task_assignment",
         ":simple_orc_jit",
@@ -101,13 +104,14 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
-        "//tensorflow/compiler/xla/service:copy_insertion",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_constant_folding",
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
+        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/service:hlo_ordering",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
@@ -122,6 +126,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
+        "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",  # fixdeps: keep
         "//tensorflow/core:lib",  # fixdeps: keep
         "//tensorflow/core:stream_executor_no_cuda",
@@ -155,21 +160,23 @@ cc_library(
         ":custom_call_target_registry",
         ":disassembler",
         ":external_constant_pool",
+        ":orc_jit_memory_mapper",
         ":runtime_conv2d",
         ":runtime_fork_join",
         ":runtime_matmul",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-        "@llvm//:core",
         "@llvm//:execution_engine",
+        "@llvm//:core",
         "@llvm//:mc",  # fixdeps: keep
         "@llvm//:orc_jit",
         "@llvm//:support",
         "@llvm//:target",  # fixdeps: keep
-    ],
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ] + ORC_JIT_MEMORY_MAPPER_TARGETS,
 )
 
 cc_library(
@@ -245,6 +252,8 @@ cc_library(
         ":dot_op_emitter",
         ":external_constant_pool",
         ":ir_emission_utils",
+        ":ir_function",
+        ":parallel_loop_emitter",
         ":shape_partition",
         ":simple_orc_jit",
         "//tensorflow/compiler/xla:shape_util",
@@ -268,19 +277,54 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:ops",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@llvm//:code_gen",
         "@llvm//:core",
         "@llvm//:support",
         "@llvm//:target",
     ],
 )
 
+cc_library(
+    name = "ir_function",
+    srcs = ["ir_function.cc"],
+    hdrs = ["ir_function.h"],
+    deps = [
+        ":ir_emission_utils",
+        ":shape_partition",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service/cpu:cpu_runtime",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
+cc_library(
+    name = "parallel_loop_emitter",
+    srcs = ["parallel_loop_emitter.cc"],
+    hdrs = ["parallel_loop_emitter.h"],
+    deps = [
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "dot_op_emitter",
     srcs = ["dot_op_emitter.cc"],
     hdrs = ["dot_op_emitter.h"],
     deps = [
+        ":cpu_options",
         ":cpu_runtime",
-        ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
@@ -289,8 +333,10 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
+        "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/compiler/xla/service/llvm_ir:vector_support_library",
         "//tensorflow/core:lib",
         "@llvm//:core",
     ],
@@ -607,14 +653,16 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/service:hlo",
+        "@llvm//:core",
     ],
 )
 
 cc_library(
-    name = "layout_assignment",
-    srcs = ["layout_assignment.cc"],
-    hdrs = ["layout_assignment.h"],
+    name = "cpu_layout_assignment",
+    srcs = ["cpu_layout_assignment.cc"],
+    hdrs = ["cpu_layout_assignment.h"],
     deps = [
+        ":dot_op_emitter",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:computation_layout",
@@ -624,11 +672,11 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "layout_assignment_test",
+    name = "cpu_layout_assignment_test",
     size = "small",
-    srcs = ["layout_assignment_test.cc"],
+    srcs = ["cpu_layout_assignment_test.cc"],
     deps = [
-        ":layout_assignment",
+        ":cpu_layout_assignment",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -702,6 +750,7 @@ cc_library(
     srcs = ["parallel_task_assignment.cc"],
     hdrs = ["parallel_task_assignment.h"],
     deps = [
+        ":dot_op_emitter",
         ":ir_emission_utils",
         ":shape_partition",
         "//tensorflow/compiler/xla/service:hlo",
@@ -716,6 +765,7 @@ cc_library(
     hdrs = ["cpu_options.h"],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/core:lib",
     ],
 )
 
@@ -730,6 +780,48 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "orc_jit_memory_mapper",
+    srcs = ["orc_jit_memory_mapper.cc"],
+    hdrs = ["orc_jit_memory_mapper.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "@llvm//:execution_engine",
+    ],
+)
+
+cc_library(
+    name = "cpu_copy_insertion",
+    srcs = ["cpu_copy_insertion.cc"],
+    hdrs = ["cpu_copy_insertion.h"],
+    deps = [
+        "//tensorflow/compiler/xla/service:copy_insertion",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "cpu_copy_insertion_test",
+    srcs = ["cpu_copy_insertion_test.cc"],
+    deps = [
+        ":cpu_copy_insertion",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index 44cd2171afdc6eecc22f3f920276a4d95f930573..2136aeb3877685373efaf5bf702a42b39a63f082 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
@@ -41,19 +41,17 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       auto kernel_input_feature_dim = dnums.kernel_input_feature_dimension();
       auto kernel_output_feature_dim = dnums.kernel_output_feature_dimension();
 
-      int num_spatial_dims = dnums.spatial_dimensions_size();
-      int num_dims = num_spatial_dims + 2;
+      const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+      const int64 num_dims = num_spatial_dims + 2;
 
       // A canonical convolution's dimension numbers need to satisfy the
       // following conditions (see cs/PotentiallyImplementedAsEigenConvolution).
       //
-      // - the input is in NHWC or NWHC order.
-      // - the kernel is in HWIO or WHIO order.
-      // - the spatial dimensions are in the same relative order in the input,
-      //   kernel and output.
+      // - the input is in NHWC order.
+      // - the kernel is in HWIO order.
       //
       // For simplicity, as a first step, we reshape the input and filter to
-      // NHWC and HWIO order, respectively. This may lose precision but not
+      // NHWC and HWIO order, respectively. This may lose precision but won't
       // break the soundness.
       HloInstruction* input = hlo->mutable_operand(0);
 
@@ -61,10 +59,10 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       std::vector<int64> new_input_dims(num_dims);
       new_input_dim_order[0] = input_batch_dim;
       new_input_dims[0] = input->shape().dimensions(input_batch_dim);
-      for (int i = 0; i < num_spatial_dims; ++i) {
-        new_input_dim_order[i + 1] = dnums.spatial_dimensions(i);
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
+        new_input_dim_order[i + 1] = dnums.input_spatial_dimensions(i);
         new_input_dims[i + 1] =
-            input->shape().dimensions(dnums.spatial_dimensions(i));
+            input->shape().dimensions(dnums.input_spatial_dimensions(i));
       }
       new_input_dim_order[num_dims - 1] = input_feature_dim;
       new_input_dims[num_dims - 1] =
@@ -80,7 +78,7 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
 
       std::vector<int64> new_kernel_dim_order(num_dims);
       std::vector<int64> new_kernel_dims(num_dims);
-      for (int i = 0; i < num_spatial_dims; ++i) {
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
         new_kernel_dim_order[i] = dnums.kernel_spatial_dimensions(i);
         new_kernel_dims[i] =
             kernel->shape().dimensions(dnums.kernel_spatial_dimensions(i));
@@ -98,14 +96,18 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
           HloInstruction::CreateTranspose(new_kernel_shape, kernel,
                                           new_kernel_dim_order));
 
+      std::vector<int64> new_output_dim_order(num_dims);
       std::vector<int64> new_conv_dims(num_dims);
       auto output_batch_dim = dnums.output_batch_dimension();
       auto output_feature_dim = dnums.output_feature_dimension();
+      new_output_dim_order[0] = output_batch_dim;
       new_conv_dims[0] = hlo->shape().dimensions(output_batch_dim);
-      for (int i = 0; i < num_spatial_dims; ++i) {
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
+        new_output_dim_order[i + 1] = dnums.output_spatial_dimensions(i);
         new_conv_dims[i + 1] =
-            hlo->shape().dimensions(dnums.spatial_dimensions(i));
+            hlo->shape().dimensions(dnums.output_spatial_dimensions(i));
       }
+      new_output_dim_order[num_dims - 1] = output_feature_dim;
       new_conv_dims[num_dims - 1] = hlo->shape().dimensions(output_feature_dim);
       Shape new_conv_shape =
           ShapeUtil::MakeShape(hlo->shape().element_type(), new_conv_dims);
@@ -113,9 +115,10 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       ConvolutionDimensionNumbers new_dnums;
       new_dnums.set_input_batch_dimension(0);
       new_dnums.set_output_batch_dimension(0);
-      for (int i = 0; i < num_spatial_dims; ++i) {
-        new_dnums.add_spatial_dimensions(i + 1);
+      for (int64 i = 0; i < num_spatial_dims; ++i) {
+        new_dnums.add_input_spatial_dimensions(i + 1);
         new_dnums.add_kernel_spatial_dimensions(i);
+        new_dnums.add_output_spatial_dimensions(i + 1);
       }
       new_dnums.set_input_feature_dimension(num_dims - 1);
       new_dnums.set_output_feature_dimension(num_dims - 1);
@@ -129,14 +132,11 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
           HloInstruction::CreateConvolve(new_conv_shape, new_input, new_kernel,
                                          hlo->window(), new_dnums));
 
-      // kConvolution inherits the dimension mapping of its input, so we need to
-      // reshape the output back to the shape of the original convolution. This
-      // is done by apply the inverse permutation of the collapsing order of the
-      // input reshape.
+      // Reshape the output back to the shape of the original convolution.
       TF_RETURN_IF_ERROR(module->entry_computation()->ReplaceWithNewInstruction(
           hlo, HloInstruction::CreateTranspose(
                    hlo->shape(), new_conv,
-                   InversePermutation(new_input_dim_order))));
+                   InversePermutation(new_output_dim_order))));
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index d593ba26b655d00a0f0f0b9a94c9e62fa1835080..968f53d5c706651d2a470a853e0e9b601c0ed2df 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -69,8 +69,10 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(1);
   dnums.set_output_batch_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
   dnums.set_input_feature_dimension(0);
   dnums.set_output_feature_dimension(0);
   dnums.add_kernel_spatial_dimensions(2);
@@ -125,8 +127,10 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
   ConvolutionDimensionNumbers dnums;
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
   dnums.add_kernel_spatial_dimensions(0);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index e141066b8fb48896e9f88e0a98f74aad08b63799..55e7c7bc2ca05991ac6dd53bf48bc9fd30f52601 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -46,27 +46,30 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
-#include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_parallelization_preparation.h"
 #include "tensorflow/compiler/xla/service/cpu/disassembler.h"
+#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emitter.h"
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h"
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
@@ -82,6 +85,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
+#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -195,28 +199,35 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
 class CollectProfileCandidates : public DfsHloVisitorWithDefault {
  public:
   static StatusOr<std::unordered_map<const HloInstruction*, size_t>>
-  GetCandidatesForComputation(HloComputation* computation) {
+  GetCandidatesForComputation(
+      HloComputation* computation,
+      const std::unordered_map<const HloInstruction*, int64>&
+          assigned_indices) {
     std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
     CollectProfileCandidates profile_candidates_for_computation(
-        &hlo_to_profile_idx);
+        &hlo_to_profile_idx, assigned_indices);
     TF_RETURN_IF_ERROR(
         computation->Accept(&profile_candidates_for_computation));
     return hlo_to_profile_idx;
   }
 
  private:
-  explicit CollectProfileCandidates(
-      std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx)
-      : hlo_to_profile_idx_(hlo_to_profile_idx) {}
+  CollectProfileCandidates(
+      std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+      const std::unordered_map<const HloInstruction*, int64>& assigned_indices)
+      : hlo_to_profile_idx_(hlo_to_profile_idx),
+        assigned_indices_(assigned_indices) {}
 
   Status DefaultAction(HloInstruction* hlo_instruction) override {
-    hlo_to_profile_idx_->insert({hlo_instruction, hlo_to_profile_idx_->size()});
+    hlo_to_profile_idx_->insert(
+        {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)});
     return Status::OK();
   }
 
   Status HandleCall(HloInstruction* call) override {
     TF_RETURN_IF_ERROR(DefaultAction(call));
-    CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_);
+    CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_,
+                                                 assigned_indices_);
     TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call));
     return Status::OK();
   }
@@ -230,17 +241,20 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault {
   Status HandleWhile(HloInstruction* xla_while) override {
     TF_RETURN_IF_ERROR(DefaultAction(xla_while));
 
-    CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_);
+    CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_,
+                                                      assigned_indices_);
     TF_RETURN_IF_ERROR(
         xla_while->while_condition()->Accept(&candidates_for_condition));
 
-    CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_);
+    CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_,
+                                                 assigned_indices_);
     TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body));
 
     return Status::OK();
   }
 
   std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+  const std::unordered_map<const HloInstruction*, int64>& assigned_indices_;
 };
 }  // namespace
 
@@ -260,7 +274,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
-
+  pipeline.AddPass<DotDecomposer>();
   pipeline.AddPass<ConvCanonicalization>();
   {
     auto& pass =
@@ -275,8 +289,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     pass.AddPass<AlgebraicSimplifier>(
         /*is_layout_sensitive=*/false,
         [](const Shape&, const Shape&) { return false; },
-        /*enable_dot_simplification=*/false);
+        /*enable_dot_strength_reduction=*/false);
     pass.AddPass<TupleSimplifier>();
+    pass.AddPass<WhileLoopSimplifier>();
     pass.AddPass<HloDCE>();
     pass.AddPass<ReshapeMover>();
     pass.AddPass<HloConstantFolding>();
@@ -303,8 +318,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
       /*is_layout_sensitive=*/true,
       [](const Shape&, const Shape&) { return true; },
-      /*enable_dot_simplification=*/false);
+      /*enable_dot_strength_reduction=*/false);
   pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+  pipeline.AddPass<HloElementTypeConverter>(BF16, F32);
   // Outline ops in the entry computation into calls to subcomputations.
   const int max_parallelism =
       module->config().intra_op_parallelism_threads() > 0
@@ -320,7 +336,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
     // binary size (and most AOT applications are single-threaded).
     // TODO(29630486) Support multi-threaded AOT.
     pipeline.AddPass<ParallelTaskAssigner>(max_parallelism,
-                                           ShapeSizeBytesFunction(), module);
+                                           ShapeSizeBytesFunction());
   }
   // Copy insertion should be performed immediately before IR emission to avoid
   // inserting unnecessary copies (later pass adds an instruction which
@@ -329,15 +345,16 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) {
   // (and sometime after) copy insertion, to avoid dead code from interfering
   // with the rewrites.
   pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<CopyInsertion>();
+  pipeline.AddPass<FlattenCallGraph>();
+  pipeline.AddPass<CpuCopyInsertion>();
   if (options::CpuParallelBackendRequested(module->config())) {
     // Re-run the outlining, in case any copies were inserted into the entry
     // computation.
     pipeline.AddPass<ParallelizationPreparation>(max_parallelism,
                                                  ShapeSizeBytesFunction());
+    pipeline.AddPass<CpuCopyInsertion>();
   }
   pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<FlattenCallGraph>();
   return pipeline.Run(module).status();
 }
 
@@ -423,11 +440,25 @@ Status InitializeModuleHooks(
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
-    std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
+    std::unique_ptr<HloModule> module,
+    perftools::gputools::StreamExecutor* /*stream_exec*/) {
+  VLOG(2) << "Before optimization:";
+  XLA_VLOG_LINES(2, module->ToString());
+
+  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false));
+
+  VLOG(2) << "After optimization:";
+  XLA_VLOG_LINES(2, module->ToString());
+  return std::move(module);
+}
+
+StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
+    std::unique_ptr<HloModule> module,
+    perftools::gputools::StreamExecutor* stream_exec) {
   const string timer_message =
       "Compiling [" + module->name() + "] for CPU using JIT";
-  ScopedLoggingTimer compiling_timer(timer_message, 1);
+  XLA_SCOPED_LOGGING_TIMER(timer_message);
 
   VLOG(1) << "Compiling: " << module->name();
   TF_RET_CHECK(stream_exec != nullptr);
@@ -441,11 +472,11 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       &pre_optimization_ir_hook, &post_optimization_ir_hook));
 
   // Compile must be thread-safe so create a new LLVM context for the module.
-  auto llvm_context = MakeUnique<llvm::LLVMContext>();
+  auto llvm_context = xla::MakeUnique<llvm::LLVMContext>();
   auto llvm_module =
-      MakeUnique<llvm::Module>("__compute_module", *llvm_context);
+      xla::MakeUnique<llvm::Module>("__compute_module", *llvm_context);
 
-  auto jit = MakeUnique<SimpleOrcJIT>(
+  auto jit = xla::MakeUnique<SimpleOrcJIT>(
       CompilerTargetOptions(module->config()),
       CodeGenOptLevel(module->config()),
       options::OptimizeForSizeRequested(module->config()),
@@ -455,14 +486,29 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
   llvm_module->setDataLayout(jit->data_layout());
   llvm_module->setTargetTriple(jit->target_triple().getTriple());
 
-  TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false));
-
   HloComputation* computation = module->entry_computation();
   std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx;
+  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
+  std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
   if (module->config().hlo_profiling_enabled()) {
+    hlo_profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+
     TF_ASSIGN_OR_RETURN(
         hlo_to_profile_idx,
-        CollectProfileCandidates::GetCandidatesForComputation(computation));
+        CollectProfileCandidates::GetCandidatesForComputation(
+            computation, hlo_profile_index_map->instruction_to_profile_idx()));
+
+    auto shape_size_bytes = [](const Shape& shape) {
+      // On the cpu, opaques are pointers.
+      if (ShapeUtil::IsOpaque(shape)) {
+        return static_cast<int64>(sizeof(void*));
+      }
+      return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
+    };
+
+    HloCostAnalysis cost_analysis(shape_size_bytes);
+    hlo_profile_printer =
+        CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
   }
 
   std::unique_ptr<Executable> cpu_executable;
@@ -485,9 +531,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // uses data dependencies for determining order.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(module.get(),
-                            MakeUnique<DependencyHloOrdering>(module.get()),
-                            BufferSizeBytesFunction(), memory_alignment));
+        BufferAssigner::Run(
+            module.get(), xla::MakeUnique<DependencyHloOrdering>(module.get()),
+            BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
@@ -514,7 +560,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
         const void* data = instruction->literal().InternalData();
         int64 size = CpuExecutable::ShapeSizeBytes(instruction->shape());
         auto iter = aligned_constants.emplace(
-            instruction, MakeUnique<unsigned char[]>(size));
+            instruction, xla::MakeUnique<unsigned char[]>(size));
         CHECK_EQ(iter.second, true);
         unsigned char* aligned_data = iter.first->second.get();
         memcpy(aligned_data, data, size);
@@ -528,12 +574,20 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       parallel_computations.emplace(to_apply, instruction);
     }
 
+    // We always profile the entire computation as a whole, even if hlo
+    // profiling is disabled.  When hlo profiling is diabled, we pass in a
+    // profile counter array of just one element, which corresponds to the whole
+    // computation.
+    size_t entry_computation_profile_idx =
+        hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+                                    *module->entry_computation())
+                              : 0;
     IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         &hlo_to_profile_idx, jit->target_machine(),
-                         jit->external_constant_pool());
+                         hlo_to_profile_idx, entry_computation_profile_idx,
+                         jit->target_machine(), jit->external_constant_pool());
 
-    std::unique_ptr<std::map<HloInstruction*, string>> function_names(
-        new std::map<HloInstruction*, string>());
+    std::unique_ptr<HloInstructionMap<string>> function_names(
+        new HloInstructionMap<string>());
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
       if (embedded_computation->IsFusionComputation()) {
@@ -549,7 +603,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
           llvm::Function * ir_function,
           ir_emitter.EmitComputation(
               embedded_computation, embedded_computation->name(),
-              /*is_entry_computation=*/computation_is_parallel,
+              /*is_top_level_computation=*/computation_is_parallel,
               /*instruction_order=*/nullptr));
       // If this computation is parallel, remember it in the function name map.
       // This way we know what function to execute when we try to run code for
@@ -570,8 +624,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new ParallelCpuExecutable(
         std::move(jit), std::move(assignment), std::move(module),
-        std::move(function_names), std::move(hlo_to_profile_idx),
-        std::move(aligned_constants)));
+        std::move(function_names), std::move(aligned_constants),
+        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -591,10 +645,10 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     // temporary buffers are required to run the computation.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
-        BufferAssigner::Run(
-            module.get(),
-            MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
-            BufferSizeBytesFunction(), memory_alignment));
+        BufferAssigner::Run(module.get(),
+                            xla::MakeUnique<SequentialHloOrdering>(
+                                module.get(), module_sequence),
+                            BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
@@ -604,13 +658,23 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
       TF_RETURN_IF_ERROR(protobuf_util::DumpProtoToDirectory(
           proto, xla_dump_hlo_proto_to, module->name()));
     }
+    // We always profile the entire computation as a whole, even if hlo
+    // profiling is disabled.  When hlo profiling is diabled, we pass in a
+    // profile counter array of just one element, which corresponds to the whole
+    // computation.
+    size_t entry_computation_profile_idx =
+        hlo_profile_index_map ? hlo_profile_index_map->GetProfileIndexFor(
+                                    *module->entry_computation())
+                              : 0;
+
     // Each computation is a single function.  Emit all embedded computations
     // before the entry computation. The order of computations returned from
     // GetEmbeddedComputations guarantees that a called computation occurs
     // before a caller computation.
+
     IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
-                         &hlo_to_profile_idx, jit->target_machine(),
-                         jit->external_constant_pool());
+                         hlo_to_profile_idx, entry_computation_profile_idx,
+                         jit->target_machine(), jit->external_constant_pool());
 
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
@@ -621,7 +685,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
           ir_emitter
               .EmitComputation(embedded_computation,
                                embedded_computation->name(),
-                               /*is_entry_computation=*/false,
+                               /*is_top_level_computation=*/false,
                                &module_sequence.at(embedded_computation))
               .status());
     }
@@ -630,7 +694,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     TF_ASSIGN_OR_RETURN(
         llvm::Function * entry_function,
         ir_emitter.EmitComputation(computation, function_name_prefix,
-                                   /*is_entry_computation=*/true,
+                                   /*is_top_level_computation=*/true,
                                    &module_sequence.at(computation)));
 
     string function_name = llvm_ir::AsString(entry_function->getName());
@@ -643,7 +707,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
     jit->AddModule(std::move(llvm_module));
     cpu_executable.reset(new CpuExecutable(
         std::move(jit), std::move(assignment), std::move(module), function_name,
-        std::move(hlo_to_profile_idx)));
+        std::move(hlo_profile_printer), std::move(hlo_profile_index_map)));
 
     if (embed_ir_in_executable) {
       static_cast<CpuExecutable&>(*cpu_executable)
@@ -655,13 +719,6 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::Compile(
   return std::move(cpu_executable);
 }
 
-StatusOr<std::vector<std::unique_ptr<Executable>>> CpuCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<se::StreamExecutor*>> stream_execs) {
-  return Unimplemented(
-      "Compilation of multiple HLO modules is not yet supported on CPU.");
-}
-
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
                                 const AotCompilationOptions& aot_options) {
@@ -770,7 +827,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<BufferAssignment> assignment,
         BufferAssigner::Run(
-            module, MakeUnique<SequentialHloOrdering>(module, module_sequence),
+            module,
+            xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
             BufferSizeBytesFunction(), memory_alignment));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
@@ -784,9 +842,13 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
           proto, xla_dump_hlo_proto_to, module->name()));
     }
 
-    IrEmitter ir_emitter(*module, *assignment, &llvm_module,
-                         /*hlo_to_profile_idx=*/nullptr, target_machine.get(),
-                         /*external_constant_pool=*/nullptr);
+    IrEmitter ir_emitter(
+        *module, *assignment, &llvm_module,
+        /*hlo_to_profile_idx=*/
+        std::unordered_map<const HloInstruction*, size_t>{},
+        /*entry_computation_profile_idx=*/tensorflow::gtl::nullopt,
+        target_machine.get(),
+        /*external_constant_pool=*/nullptr);
     HloComputation* computation = module->entry_computation();
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
@@ -797,7 +859,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
           ir_emitter
               .EmitComputation(embedded_computation,
                                embedded_computation->name(),
-                               /*is_entry_computation=*/false,
+                               /*is_top_level_computation=*/false,
                                &module_sequence.at(embedded_computation))
               .status());
     }
@@ -805,7 +867,7 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     TF_ASSIGN_OR_RETURN(
         llvm::Function * entry_function,
         ir_emitter.EmitComputation(computation, entry_point_name,
-                                   /*is_entry_computation=*/true,
+                                   /*is_top_level_computation=*/true,
                                    &module_sequence.at(computation)));
 
     CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index d09130247421b11d6d4879466f39b89167eb9564..ebed7058d8f7968c6e03ef90d0da6b2325037eb0 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -109,14 +109,20 @@ class CpuCompiler : public LLVMCompiler {
   CpuCompiler();
   ~CpuCompiler() override {}
 
-  StatusOr<std::unique_ptr<Executable>> Compile(
+  // Bring in
+  // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
+  //     std::vector<std::unique_ptr<HloModule>> modules,
+  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //        stream_execs)
+  using LLVMCompiler::Compile;
+
+  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
-  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs) override;
+  StatusOr<std::unique_ptr<Executable>> RunBackend(
+      std::unique_ptr<HloModule> module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..baaacd2ecc9611946678f71ac36ef787ecb57b4e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.cc
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+StatusOr<bool> CpuCopyInsertion::Run(HloModule* module) {
+  CopyInsertion generic_copy_insertion;
+
+  TF_ASSIGN_OR_RETURN(bool generic_changed, generic_copy_insertion.Run(module));
+
+  // The CPU backend needs additional copies added due to deficiencies in
+  // buffer assignment.
+  TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed,
+                      CopyInsertion::AddCopiesForBufferAssignment(module));
+
+  return generic_changed || buffer_assignment_changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
new file mode 100644
index 0000000000000000000000000000000000000000..3313d1e6eb71bff39f509c3d24858568df786422
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Besides the modifications made by the generic xla::CopyInsertion, this
+// CPU-specific copy insertion pass also adds copies to values live out of
+// computations satisfying certain conditions (defined by constant or parameter,
+// etc). This is necessary because of deficiencies of buffer
+// assignment. Specifically, buffer assignment is computation-scoped and does
+// not recognized aliasing between arguments and outputs of computations.
+//
+// TODO(b/62548313): Remove this when buffer assignment is smarter
+// (module-scoped).
+class CpuCopyInsertion : public HloPassInterface {
+ public:
+  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_COPY_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a05a26941786cbf404c4685abb098c9ac8caaa09
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/cpu_copy_insertion_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_copy_insertion.h"
+
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+int64 CountCopies(const HloComputation& computation) {
+  int64 count = 0;
+  for (const auto& instruction : computation.instructions()) {
+    if (instruction->opcode() == HloOpcode::kCopy) {
+      count++;
+    }
+  }
+  return count;
+}
+
+int64 CountCopies(const HloModule& module) {
+  int64 count = 0;
+  for (const auto& computation : module.computations()) {
+    count += CountCopies(*computation);
+  }
+  return count;
+}
+
+class CpuCopyInsertionTest : public HloTestBase {
+ protected:
+  void InsertCopies(HloModule* module) {
+    CpuCopyInsertion copy_insertion;
+    ASSERT_IS_OK(copy_insertion.Run(module).status());
+  }
+
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
+};
+
+TEST_F(CpuCopyInsertionTest, WhileBodyWithConstantRoot) {
+  // Test a while body and condition which are each simply a constant (root of
+  // computation is a constant). Each constant should be copied.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param_0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
+
+  auto body_builder = HloComputation::Builder("body");
+  body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  body_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0)));
+  HloComputation* body = module->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module->AddEmbeddedComputation(cond_builder.Build());
+
+  auto xla_while = builder.AddInstruction(
+      HloInstruction::CreateWhile(scalar_shape_, condition, body, param_0));
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*module), 3);
+
+  EXPECT_THAT(xla_while->operand(0), op::Copy(op::Parameter()));
+  EXPECT_THAT(body->root_instruction(), op::Copy(op::Constant()));
+  EXPECT_THAT(condition->root_instruction(), op::Copy(op::Constant()));
+}
+
+TEST_F(CpuCopyInsertionTest, TupleCall) {
+  // Test a kCall instruction which calls a computation which produces a three
+  // element tuple: one is a constant, one is a parameter, and one is produced
+  // in the computation. The constant and parameter should be copied.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
+  const Shape tuple_shape =
+      ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_, scalar_shape_});
+
+  auto sub_builder = HloComputation::Builder("subcomputation");
+  auto sub_param = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto constant = sub_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(123.0)));
+  auto add = sub_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, sub_param, constant));
+  sub_builder.AddInstruction(
+      HloInstruction::CreateTuple({sub_param, constant, add}));
+  HloComputation* subcomputation =
+      module->AddEmbeddedComputation(sub_builder.Build());
+
+  builder.AddInstruction(
+      HloInstruction::CreateCall(tuple_shape, {param}, subcomputation));
+
+  module->AddEntryComputation(builder.Build());
+
+  InsertCopies(module.get());
+
+  EXPECT_EQ(CountCopies(*subcomputation), 2);
+  EXPECT_THAT(subcomputation->root_instruction(),
+              op::Tuple(op::Copy(op::Parameter()), op::Copy(op::Constant()),
+                        op::Add()));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 4dba87f49906739284daea68c70ef1860127f8d0..e956f478b86d9816615e2902f5bbeae6d6384162 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/host/host_stream.h"
 
 namespace se = ::perftools::gputools;
 
@@ -54,11 +55,12 @@ CpuExecutable::CpuExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
     const string& entry_function_name,
-    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx)
-    : Executable(std::move(hlo_module)),
+    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+                 std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
-      assignment_(std::move(assignment)),
-      hlo_to_profile_idx_(std::move(hlo_to_profile_idx)) {
+      assignment_(std::move(assignment)) {
   // Resolve symbols in the constructor rather than at execution time to avoid
   // races because FindSymbol is not thread safe.
   llvm::JITSymbol sym = jit_->FindSymbol(entry_function_name);
@@ -147,8 +149,9 @@ Status CpuExecutable::ExecuteComputeFunction(
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
   std::vector<se::DeviceMemoryBase> argument_buffers;
-  for (int i = 0; i < arguments.size(); ++i) {
-    argument_buffers.push_back(arguments[i]->buffer(/*index=*/{}));
+  argument_buffers.reserve(arguments.size());
+  for (const auto* argument : arguments) {
+    argument_buffers.push_back(argument->buffer(/*index=*/{}));
   }
   return ExecuteComputeFunction(run_options, argument_buffers, buffers,
                                 hlo_execution_profile);
@@ -181,9 +184,16 @@ Status CpuExecutable::ExecuteComputeFunction(
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
   // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Allocate an additional profile counter for the entire
-  // computation.
-  std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+  // profile.  Even when not Hlo profiling, we allocate a counter for the entire
+  // computation, which we use to update ExecutionProfile below.
+  std::vector<int64>* profile_counters = nullptr;
+  std::vector<int64> profile_counter_for_entry_computation;
+  if (hlo_execution_profile) {
+    profile_counters = hlo_execution_profile->mutable_profile_counters();
+  } else {
+    profile_counters = &profile_counter_for_entry_computation;
+    profile_counter_for_entry_computation.push_back(0);
+  }
 
   // Call the computation function following the calling convention.
   std::vector<void*> buffer_pointers;
@@ -198,7 +208,7 @@ Status CpuExecutable::ExecuteComputeFunction(
     VLOG(3) << tensorflow::strings::Printf(
         "  func(void* result, void* params[%zu], void* temps[%zu], "
         "uint64 profile_counters[%zu])",
-        args_array.size(), buffer_pointers.size(), profile_counters.size());
+        args_array.size(), buffer_pointers.size(), profile_counters->size());
     VLOG(3) << tensorflow::strings::Printf("    result = %p", result_buffer);
     auto ptr_printer = [](string* out, const void* p) {
       tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -210,11 +220,11 @@ Status CpuExecutable::ExecuteComputeFunction(
         "    temps = [%s]",
         tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
     VLOG(3) << tensorflow::strings::Printf("    profile_counters = %p",
-                                           profile_counters.data());
+                                           profile_counters->data());
   }
 
   compute_function_(result_buffer, run_options, args_array.data(),
-                    buffer_pointers.data(), profile_counters.data());
+                    buffer_pointers.data(), profile_counters->data());
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -223,20 +233,46 @@ Status CpuExecutable::ExecuteComputeFunction(
     const double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
 
-    // The last profile counter is used for the computation as a whole.
-    execution_profile_.set_compute_cycle_count(profile_counters.back());
+    if (hlo_execution_profile) {
+      execution_profile_.set_compute_cycle_count(
+          hlo_execution_profile->total_cycles_executed(
+              *module().entry_computation()));
+    } else {
+      execution_profile_.set_compute_cycle_count(profile_counters->back());
+    }
   }
 
-  if (hlo_execution_profile != nullptr) {
-    hlo_execution_profile->set_total_cycles_executed(
-        *module().entry_computation(), profile_counters.back());
+  return Status::OK();
+}
+
+static void LogLiveAddresses(
+    const std::unordered_set<const void*>& marked_addresses) {
+  VLOG(3) << "Live addresses in output marking found "
+          << marked_addresses.size() << " addresses:\n"
+          << tensorflow::str_util::Join(
+                 marked_addresses, ", ", [](string* out, const void* address) {
+                   tensorflow::strings::StrAppend(
+                       out, tensorflow::strings::Printf("%p", address));
+                 });
+}
 
-    for (auto hlo_prof_idx : hlo_to_profile_idx_) {
-      const HloInstruction* hlo = hlo_prof_idx.first;
-      uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
-      hlo_execution_profile->AddProfileResult(hlo, cycles_taken);
+static Status DeallocateTempBuffers(
+    DeviceMemoryAllocator* allocator, se::Stream* stream,
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
+    const std::unordered_set<const void*>& marked_addresses) {
+  // Keep those marked live because they are referenced by the output of the
+  // computation and are needed by the service. They will be deallocated by the
+  // service.
+  for (size_t i = 0; i < buffers.size(); ++i) {
+    se::DeviceMemoryBase alloc = buffers[i];
+    if (marked_addresses.count(alloc.opaque()) == 0 && !alloc.is_null()) {
+      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
+              << alloc.opaque() << "]";
+      TF_RETURN_IF_ERROR(
+          allocator->Deallocate(stream->parent()->device_ordinal(), &alloc));
     }
   }
+
   return Status::OK();
 }
 
@@ -262,26 +298,9 @@ StatusOr<perftools::gputools::DeviceMemoryBase> CpuExecutable::ExecuteOnStream(
   MarkLiveAddressesInOutput(top_level_output.opaque(), result_shape(),
                             &marked_addresses);
 
-  VLOG(3) << "Live addresses in output marking found "
-          << marked_addresses.size() << " addresses:\n"
-          << tensorflow::str_util::Join(
-                 marked_addresses, ", ", [](string* out, const void* address) {
-                   tensorflow::strings::StrAppend(
-                       out, tensorflow::strings::Printf("%p", address));
-                 });
-
-  // Computation is done - deallocate temp buffers. Keep those marked live
-  // because they are referenced by the output of the computation and are needed
-  // by the service. They will be deallocated by the service.
-  for (size_t i = 0; i < buffers.size(); ++i) {
-    se::DeviceMemoryBase alloc = buffers[i];
-    if (marked_addresses.count(alloc.opaque()) == 0 && !alloc.is_null()) {
-      VLOG(3) << "CpuExecutable deallocating buffer #" << i << " ["
-              << alloc.opaque() << "]";
-      TF_RETURN_IF_ERROR(memory_allocator->Deallocate(
-          stream->parent()->device_ordinal(), &alloc));
-    }
-  }
+  LogLiveAddresses(marked_addresses);
+  TF_RETURN_IF_ERROR(DeallocateTempBuffers(memory_allocator, stream, buffers,
+                                           marked_addresses));
 
   return top_level_output;
 }
@@ -359,9 +378,44 @@ StatusOr<perftools::gputools::DeviceMemoryBase>
 CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  // TODO(b/30671675): Implement asynchronous execution mode.
-  return Unimplemented(
-      "Asynchronous execution on stream is not yet supported on CPU.");
+  if (hlo_profiling_enabled()) {
+    return Unimplemented(
+        "Asynchronous execution on stream with hlo profiling is not yet "
+        "supported on CPU.");
+  }
+
+  auto* host_stream = dynamic_cast<perftools::gputools::host::HostStream*>(
+      run_options->stream()->implementation());
+  se::Stream* stream = run_options->stream();
+  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  std::vector<se::DeviceMemoryBase> buffers(assignment_->Allocations().size());
+
+  TF_RETURN_IF_ERROR(AllocateBuffers(
+      memory_allocator, stream->parent()->device_ordinal(), &buffers));
+
+  // Mark the buffers that are actually live (used in the output) when the
+  // computation finishes executing.
+  std::unordered_set<const void*> marked_addresses;
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
+                      assignment_->GetUniqueTopLevelOutputSlice());
+  se::DeviceMemoryBase top_level_output = buffers[result_slice.index()];
+  MarkLiveAddressesInOutput(top_level_output.opaque(), result_shape(),
+                            &marked_addresses);
+
+  LogLiveAddresses(marked_addresses);
+
+  host_stream->EnqueueTask([this, run_options, arguments, buffers,
+                            marked_addresses, memory_allocator, stream]() {
+    // Failing a CHECK here is not great, but I don't see an obvious way to
+    // return a failed Status asynchronously.
+    TF_CHECK_OK(ExecuteComputeFunction(&run_options->run_options(), arguments,
+                                       buffers,
+                                       /*hlo_execution_profile=*/nullptr));
+    TF_CHECK_OK(DeallocateTempBuffers(memory_allocator, stream, buffers,
+                                      marked_addresses));
+  });
+
+  return top_level_output;
 }
 
 /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
@@ -377,9 +431,5 @@ const PointsToSet& CpuExecutable::GetRootPointsToSet() const {
       module().entry_computation()->root_instruction());
 }
 
-std::unique_ptr<HloCostAnalysis> CpuExecutable::CreateCostAnalysis() const {
-  return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 238bc9b46ae2bf1b519eaf137d9ae063e769bd2e..17ee2d673ee7cde1847bf29e2399e6033cb7e30e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -47,12 +47,12 @@ namespace cpu {
 // architecture, so JIT-ed code and host code share the same ABI.
 class CpuExecutable : public Executable {
  public:
-  CpuExecutable(
-      std::unique_ptr<SimpleOrcJIT> jit,
-      std::unique_ptr<const BufferAssignment> assignment,
-      std::unique_ptr<const HloModule> hlo_module,
-      const string& entry_function_name,
-      std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx);
+  CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
+                std::unique_ptr<const BufferAssignment> assignment,
+                std::unique_ptr<const HloModule> hlo_module,
+                const string& entry_function_name,
+                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~CpuExecutable() override {}
 
   StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -85,12 +85,10 @@ class CpuExecutable : public Executable {
 
   static int64 ShapeSizeBytes(const Shape& shape);
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
   // Type of the computation function we expect in the JIT.
   using ComputeFunctionType = void (*)(
       void* /*result*/, const ExecutableRunOptions* /*run_options*/,
-      const void** /*args*/, void** /*temps*/, uint64* /*profile_counters*/);
+      const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/);
 
   const ComputeFunctionType& compute_function() const {
     return compute_function_;
@@ -145,9 +143,6 @@ class CpuExecutable : public Executable {
   // Entry function name for the computation.
   const string entry_function_name_;
 
-  // Maps HLOs to their index into the profile counter array.
-  const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(CpuExecutable);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index b9e4d006d77ae76e33ac51440349400ea4eff118..1c04c9835e3e1ecf0f78a74aa74b0b052054004a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -31,6 +31,14 @@ namespace {
 
 using InstructionFusionTest = HloTestBase;
 
+std::unique_ptr<HloInstruction> MakeDot(const Shape& shape, HloInstruction* lhs,
+                                        HloInstruction* rhs) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  return HloInstruction::CreateDot(shape, lhs, rhs, dot_dnums);
+}
+
 TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
   HloComputation::Builder builder(TestName());
   HloInstruction* arg0 = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -40,8 +48,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_0) {
 
   HloInstruction* exp0 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kExp, arg0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, exp0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), exp0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -59,8 +67,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Basic_1) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -80,8 +88,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Bitcast) {
       ShapeUtil::MakeShape(S32, {2, 512, 2, 128}), HloOpcode::kExp, arg0));
   HloInstruction* bitcast0 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kBitcast, exp0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, bitcast0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), bitcast0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -102,8 +110,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_Reshape) {
   HloInstruction* reshape0 =
       builder.AddInstruction(HloInstruction::CreateReshape(
           ShapeUtil::MakeShape(S32, {1024, 256}), exp0));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1024, 1}), HloOpcode::kDot, reshape0, arg1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1024, 1}), reshape0, arg1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -121,8 +129,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TooLarge) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 32 * 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 32 * 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 32 * 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -140,8 +148,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) {
 
   HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {256, 1024}), HloOpcode::kExp, arg1));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 1024}), HloOpcode::kDot, arg0, exp1));
+  HloInstruction* dot = builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {2, 1024}), arg0, exp1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
@@ -162,8 +170,8 @@ TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion) {
   HloInstruction* transpose1 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(S32, {256, 1024}), exp1, {1, 0}));
-  builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 1024}), HloOpcode::kDot, arg0, transpose1));
+  builder.AddInstruction(
+      MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, transpose1));
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
similarity index 59%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment.cc
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index c446b6b792a042da2500ea6a175fdca4c70bcab6..0df10f4af318de3f80e4df599797709c5c43b5cd 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -13,69 +13,76 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 
 #include <numeric>
 
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
 namespace cpu {
 
-Status CpuLayoutAssignment::AddBackendConstraints(
-    LayoutConstraints* constraints) {
-  auto row_major_shape = [](const Shape& old_shape) {
-    Shape new_shape(old_shape);
-    std::vector<int64> dimension_order(new_shape.dimensions_size());
-    std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
-    *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-    return new_shape;
-  };
-  auto col_major_shape = [](const Shape& old_shape) {
-    Shape new_shape(old_shape);
-    std::vector<int64> dimension_order(new_shape.dimensions_size());
-    std::iota(dimension_order.begin(), dimension_order.end(), 0);
-    *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
-    return new_shape;
-  };
-
-  // We want to change the layout of constant arrays to be column major when all
-  // of their users are dot operations that can be made faster with the flipped
-  // layout.  To avoid going quadriatic over the # of instructions, we cache
-  // this property in should_make_rhs_col_major -- it maps a constant to true if
-  // all of the users of said constant are dot operations that can be sped up.
-  // This cache is populated lazily as we encounter dot operations traversing
-  // the instruction stream.
-  tensorflow::gtl::FlatMap<const HloInstruction*, bool>
-      should_make_rhs_col_major_cache;
-  auto should_make_rhs_col_major = [&](const HloInstruction& instruction) {
-    if (ProfitableToImplementDotInLlvmIr(instruction) !=
-        DotInLlvmIrProfitable::kWithColumnMajorRhs) {
-      return false;
-    }
+// We want to change the layout of constant arrays to be column major when all
+// of their users are dot operations that can be made faster with the flipped
+// layout.  To avoid going quadriatic over the # of instructions, we cache this
+// property in should_make_rhs_col_major -- it maps a constant to true if all of
+// the users of said constant are dot operations that can be sped up.  This
+// cache is populated lazily as we encounter dot operations traversing the
+// instruction stream.
+
+namespace {
+using ShouldMakeRhsColMajorCache =
+    tensorflow::gtl::FlatMap<const HloInstruction*, bool>;
+}
 
-    const auto* rhs = instruction.operand(1);
-    if (rhs->opcode() != HloOpcode::kConstant) {
-      return false;
-    }
+static bool ShouldMakeRhsColMajor(ShouldMakeRhsColMajorCache* cache,
+                                  const HloInstruction& instruction) {
+  if (!ProfitableToMakeDotRhsColumnMajor(instruction)) {
+    return false;
+  }
 
-    auto it = should_make_rhs_col_major_cache.find(rhs);
-    if (it != should_make_rhs_col_major_cache.end()) {
-      return it->second;
-    }
+  const auto* rhs = instruction.operand(1);
+  if (rhs->opcode() != HloOpcode::kConstant) {
+    return false;
+  }
+
+  auto it = cache->find(rhs);
+  if (it != cache->end()) {
+    return it->second;
+  }
 
-    bool result = std::all_of(
-        rhs->users().begin(), rhs->users().end(), [&](HloInstruction* user) {
-          return ProfitableToImplementDotInLlvmIr(*user) ==
-                     DotInLlvmIrProfitable::kWithColumnMajorRhs &&
-                 user->operand(0) != rhs;
-        });
+  bool result = std::all_of(rhs->users().begin(), rhs->users().end(),
+                            [&](HloInstruction* user) {
+                              return ProfitableToMakeDotRhsColumnMajor(*user) &&
+                                     user->operand(0) != rhs;
+                            });
 
-    InsertOrDie(&should_make_rhs_col_major_cache, rhs, result);
-    return result;
-  };
+  InsertOrDie(cache, rhs, result);
+  return result;
+}
+
+static Shape RowMajorShape(const Shape& old_shape) {
+  Shape new_shape(old_shape);
+  std::vector<int64> dimension_order(new_shape.dimensions_size());
+  std::iota(dimension_order.rbegin(), dimension_order.rend(), 0);
+  *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
+  return new_shape;
+}
+
+static Shape ColMajorShape(const Shape& old_shape) {
+  Shape new_shape(old_shape);
+  std::vector<int64> dimension_order(new_shape.dimensions_size());
+  std::iota(dimension_order.begin(), dimension_order.end(), 0);
+  *new_shape.mutable_layout() = LayoutUtil::MakeLayout(dimension_order);
+  return new_shape;
+}
+
+Status CpuLayoutAssignment::AddBackendConstraints(
+    LayoutConstraints* constraints) {
+  ShouldMakeRhsColMajorCache cache;
 
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
@@ -90,9 +97,9 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       //
       // These constraints are not hard constraints. Ideally, we should decide
       // which layouts to choose according to some cost model.
-      Shape output_shape(row_major_shape(convolution->shape()));
-      Shape input_shape(row_major_shape(lhs_instruction->shape()));
-      Shape filter_shape(row_major_shape(rhs_instruction->shape()));
+      Shape output_shape(RowMajorShape(convolution->shape()));
+      Shape input_shape(RowMajorShape(lhs_instruction->shape()));
+      Shape filter_shape(RowMajorShape(rhs_instruction->shape()));
 
       // Set layouts of the instructions' shapes.
       TF_RETURN_IF_ERROR(
@@ -101,11 +108,11 @@ Status CpuLayoutAssignment::AddBackendConstraints(
           constraints->SetOperandLayout(filter_shape, convolution, 1));
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(output_shape, convolution));
-    } else if (should_make_rhs_col_major(*instruction)) {
+    } else if (ShouldMakeRhsColMajor(&cache, *instruction)) {
       auto* dot = instruction;
       const auto& rhs_shape = dot->operand(1)->shape();
       TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(col_major_shape(rhs_shape), dot, 1));
+          constraints->SetOperandLayout(ColMajorShape(rhs_shape), dot, 1));
     } else if (PotentiallyImplementedAsEigenDot(*instruction)) {
       const HloInstruction* dot = instruction;
       // In order to implement `dot` with Eigen dot, the layouts of the lhs,
@@ -113,17 +120,17 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       //
       // These constraints are not hard constraints. Ideally, we should decide
       // which layouts to choose according to some cost model.
-      Shape output_shape(row_major_shape(dot->shape()));
+      Shape output_shape(RowMajorShape(dot->shape()));
 
       const HloInstruction* lhs_instruction = dot->operand(0);
-      Shape lhs_shape(row_major_shape(lhs_instruction->shape()));
+      Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
 
       // dot is a kDot or a kTransposeDot fusion node.  In the latter case, if
       // it represents X @ X, it may have just one operand.
       if (dot->operand_count() > 1) {
         const HloInstruction* rhs_instruction = dot->operand(1);
-        Shape rhs_shape(row_major_shape(rhs_instruction->shape()));
+        Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
       }
 
@@ -140,8 +147,12 @@ Status CpuLayoutAssignment::AddBackendConstraints(
         if (constraints->OperandBufferForwarded(instruction, operand_no)) {
           continue;
         }
+        // Skip operands with non-array shapes.
+        if (!ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+          continue;
+        }
         Shape operand_shape(
-            row_major_shape(instruction->operand(operand_no)->shape()));
+            RowMajorShape(instruction->operand(operand_no)->shape()));
         TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
             operand_shape, instruction, operand_no));
       }
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment.h b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
similarity index 86%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment.h
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
index 4fd8d68dd6b4f2a8b16f6c048743a996ea76a560..c8edbb9e15a5b6f9c574f5fe9d130d149499ebd2 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -38,4 +38,4 @@ class CpuLayoutAssignment : public LayoutAssignment {
 }  // namespace cpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_LAYOUT_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
similarity index 99%
rename from tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc
rename to tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 1ea5e8c7fc4896512e62396d0a756cda44785f11..401cf50717959da95f48963c3c83b3036a80eb1b 100644
--- a/tensorflow/compiler/xla/service/cpu/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h"
 
 #include <initializer_list>
 #include <memory>
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index dba140d1120bc5502d2039e1663b9bf035d8d66a..09f028463af68bbc2841fecdb2ca6c6a42498798 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -15,11 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 
+#include "tensorflow/core/lib/strings/numbers.h"
+
 namespace {
 
 const char* const kXlaParallelCpuOption = "xla_cpu_parallel";
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
+const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
 
 }  // namespace
 
@@ -45,6 +48,19 @@ bool VectorizedReduceDisabled(const HloModuleConfig& config) {
   return extra_options_map.count(kXlaOptimizeForSizeCpuOption) > 0;
 }
 
+tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
+    const HloModuleConfig& config) {
+  const auto& extra_options_map =
+      config.debug_options().xla_backend_extra_options();
+  auto it = extra_options_map.find(kLlvmIrDotTilingFactor);
+  int64 tiling_factor;
+  if (it != extra_options_map.end() &&
+      tensorflow::strings::safe_strto64(it->second, &tiling_factor)) {
+    return tiling_factor;
+  }
+  return tensorflow::gtl::nullopt;
+}
+
 }  // namespace options
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 5dc24ebc7b8661092e3bc27c4f30fda1e497e41b..6ba0fd24538b63a3da81083482e6bee3b552dfea 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -27,6 +27,8 @@ namespace options {
 bool CpuParallelBackendRequested(const HloModuleConfig& config);
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
+tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
+    const HloModuleConfig& config);
 
 }  // namespace options
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f8e260dd90149405fff7beefba3f7fe83b75d4b6..f385829cdf5cafbd35e083f47106734cdd5dde88 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -12,15 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
+#define EIGEN_USE_THREADS
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 
 #include <memory>
 #include <string>
 #include <tuple>
 
-#define EIGEN_USE_THREADS
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.h b/tensorflow/compiler/xla/service/cpu/disassembler.h
index b6feaa7e45cee26eb7f850081bd1fad2cb63b15c..5e302f88990ee4a3c37758881ecec4d6f71dd8e6 100644
--- a/tensorflow/compiler/xla/service/cpu/disassembler.h
+++ b/tensorflow/compiler/xla/service/cpu/disassembler.h
@@ -37,7 +37,7 @@ struct DisassemblerResult {
   DisassemblerResult(const string& text, size_t code_size_bytes)
       : text(text), code_size_bytes(code_size_bytes) {}
 
-  // The dissassembled text sections of the object file.
+  // The disassembled text sections of the object file.
   string text;
   // The total number of bytes of executable code in the object file.
   uint64_t code_size_bytes;
@@ -53,7 +53,7 @@ class Disassembler {
   // Returns a DisassemblerResult for the given object file, containing the
   // disassembled code.
   //
-  // If we couldnt' retrieve a disassembler for this platform, an error status
+  // If we couldn't retrieve a disassembler for this platform, an error status
   // is returned.
   StatusOr<DisassemblerResult> DisassembleObjectFile(
       const llvm::object::ObjectFile& object_file) const;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index e57d49172b18beb75cfbb482c5d732ef679ebe41..7f0bf2c8e4e26511e2e69121042540120c281c62 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -23,9 +23,10 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
-#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -38,6 +39,457 @@ using llvm_ir::SetToFirstInsertPoint;
 
 namespace cpu {
 
+namespace {
+// Loads a tile of values from a 2D tensor.
+class TileLoader {
+ public:
+  // Constructs a TileLoader that will load a tile consisting of
+  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
+  // `major_dim_offset` in the major dimension.  The tile size along the minor
+  // dimension is the vector size, and that is implicitly determined by `vsl`.
+  TileLoader(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder,
+             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
+             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
+      : vsl_(vsl) {
+    pointers_.reserve(tile_size_along_major_dim);
+    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
+      llvm::Value* total_offset = ir_builder->CreateMul(
+          ir_builder->getInt64(matrix_size_along_minor_dim),
+          ir_builder->CreateAdd(ir_builder->getInt64(i), major_dim_offset));
+      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
+    }
+  }
+
+  // Load a tile consisting of `tile_size_along_major_dim_` vectors starting at
+  // `major_dim_offset_` in the major dimension and `minor_dim_offset` in the
+  // minor dimension.
+  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
+    std::vector<llvm::Value*> result;
+    result.reserve(pointers_.size());
+    for (const auto& pointer : pointers_) {
+      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
+    }
+    return result;
+  }
+
+ private:
+  VectorSupportLibrary* vsl_;
+  std::vector<llvm::Value*> pointers_;
+};
+
+// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+       +--+--+--+--+
+//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//
+// (Legend: rows are horizontal and columns are vertical; and each column is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is from the column major left matrix.
+//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
+//      vector loaded from the RHS vector.
+//
+// As we iterate through the column dimension, we compute the change to the
+// result vector by an elementwise multiplication between the two tiles above
+// followed by a reduction along the major dimension:
+//
+//                     +-----------------------------------+
+//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
+//                     +-----------------------------------+
+//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
+// Result[R:R+4] +=    +-----------------------------------+
+//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
+//                     +-----------------------------------+
+//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
+//                     +-----------------------------------+
+//
+// Where R is the starting row for the tile.
+//
+// We have an inner epilogue loop to deal with the "C" submatrix and an outer
+// epilogue loop to deal with the B,D submarix.
+//
+// TODO(sanjoy): We should investigate if using gather loads and scatter stores
+// can be used here have the same inner loop for both column-major and row-major
+// matrix-vector products.
+class ColumnMajorMatrixVectorProductEmitter {
+ public:
+  ColumnMajorMatrixVectorProductEmitter(PrimitiveType scalar_type,
+                                        int64 tile_rows, int64 tile_cols,
+                                        int64 m, int64 k, llvm::Value* lhs,
+                                        llvm::Value* rhs, llvm::Value* result,
+                                        llvm::IRBuilder<>* ir_builder)
+      : scalar_type_(scalar_type),
+        tile_rows_(tile_rows),
+        tile_cols_(tile_cols),
+        m_(m),
+        k_(k),
+        lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        ir_builder_(ir_builder),
+        ksl_(ir_builder_),
+        vsl_(scalar_type_, /*vector_size=*/tile_rows_, ir_builder_, "") {
+    CHECK(tile_rows_ > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows_)));
+  }
+
+  void Emit();
+
+ private:
+  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
+                         bool is_first_column);
+
+  TileLoader GetLhsTileLoader(llvm::Value* column_start, int64 column_count) {
+    return TileLoader(&vsl_, ir_builder_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/m_,
+                      /*major_dim_offset=*/column_start,
+                      /*tile_size_along_major_dim=*/column_count);
+  }
+
+  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
+  // sequnce of `count` values, each one broadcasted to the vector width.
+  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
+    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
+    std::vector<llvm::Value*> result;
+    result.reserve(count);
+    for (int64 i = 0; i < count; i++) {
+      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
+    }
+    return result;
+  }
+
+  void EmitInnerLoopTiled(TileLoader* lhs_tile_loader,
+                          const std::vector<llvm::Value*>& rhs_tile,
+                          int64 columns, bool is_first_column);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
+                             bool is_first_tiled_column);
+
+  PrimitiveType scalar_type_;
+  int64 tile_rows_;
+  int64 tile_cols_;
+  int64 m_;
+  int64 k_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* ir_builder_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
+    llvm::Value* column, int64 column_count, bool is_first_column) {
+  TileLoader lhs_tile_loader = GetLhsTileLoader(/*column_start=*/column,
+                                                /*column_count=*/column_count);
+
+  std::vector<llvm::Value*> rhs_tile =
+      LoadRhsTile(column, /*count=*/column_count);
+  EmitInnerLoopTiled(&lhs_tile_loader, rhs_tile,
+                     /*columns=*/column_count, is_first_column);
+  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
+}
+
+void ColumnMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 column_remainder = k_ % tile_cols_;
+  int64 column_limit = k_ - column_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols_,
+           [&](llvm::Value* column, bool is_first_column) {
+             EmitOuterLoopBody(column, tile_cols_, is_first_column);
+           });
+
+  if (column_remainder != 0) {
+    EmitOuterLoopBody(ir_builder_->getInt64(column_limit), column_remainder,
+                      column_limit == 0);
+  }
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    TileLoader* lhs_tile_loader, const std::vector<llvm::Value*>& rhs_tile,
+    int64 columns, bool is_first_column) {
+  int64 row_limit = m_ - (m_ % tile_rows_);
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
+           /*step=*/tile_rows_, [&](llvm::Value* row) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_tile_loader->LoadTile(/*minor_dim_offset=*/row);
+             llvm::Value* accumulator = is_first_column
+                                            ? vsl_.GetZeroVector()
+                                            : vsl_.LoadVector(result_, row);
+             for (int i = 0; i < columns; i++) {
+               accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
+             }
+             vsl_.StoreVector(accumulator, result_, row);
+           });
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
+  int64 row_start = m_ - (m_ % tile_rows_);
+  if (row_start == m_) {
+    return;
+  }
+
+  llvm::Value* columns_llvm = ir_builder_->getInt64(columns);
+
+  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
+  //   for (row = row_start, row < m_; row++) {
+  //     result[row] += lhs[row, col] * rhs[col]
+  //     // Also take into account that if col is 0 then result[row] is not
+  //     // initialized.
+  //   }
+
+  ksl_.For(
+      "dot.inner.epilg.outer", /*start=*/current_tile_col,
+      /*end=*/ir_builder_->CreateAdd(columns_llvm, current_tile_col),
+      /*step=*/1, /*peel_first_iteration=*/false,
+      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
+        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
+        llvm::Value* total_offset =
+            ir_builder_->CreateMul(col, ir_builder_->getInt64(m_));
+        llvm::Value* lhs_base_pointer =
+            vsl_.ComputeOffsetPointer(lhs_, total_offset);
+        ksl_.For(
+            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m_,
+            /*step=*/1, [&](llvm::Value* scalar_row) {
+              llvm::Value* product = vsl_.Mul(
+                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
+              llvm::Value* setting_result_first_time = ir_builder_->CreateAnd(
+                  is_first_scalar_col,
+                  ir_builder_->getInt1(is_first_tiled_column));
+              ksl_.If(
+                  setting_result_first_time,
+                  [&]() { vsl_.StoreScalar(product, result_, scalar_row); },
+                  [&]() {
+                    vsl_.StoreScalar(
+                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
+                        result_, scalar_row);
+                  });
+            });
+      });
+}
+
+// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+
+//   |M00|M10|M20|M30|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|
+//   +---+---+---+---+
+//   |M03|M13|M23|M33|
+//   +---+---+---+---+
+//
+// (Legend: rows are horizontal and columns are vertical; and each row is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is loaded from the row major left matrix.
+//   b. The right vector is loaded from the RHS vector.
+//
+// We keep 4 vector accumulators accumulating the following four vector
+// expressions as we iterate over the row dimension:
+//
+//   +------+------+------+------+
+//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
+//   +------+------+------+------+
+//
+// In the end we do a horizontal reduction over these 4 vector accumulators to
+// get 4 values in the result vector.
+//
+// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
+// epilogue loop to deal with the C,D submatrix.
+class RowMajorMatrixVectorProductEmitter {
+ public:
+  RowMajorMatrixVectorProductEmitter(PrimitiveType scalar_type, int64 tile_rows,
+                                     int64 tile_cols, int64 m, int64 k,
+                                     llvm::Value* lhs, llvm::Value* rhs,
+                                     llvm::Value* result,
+                                     llvm::IRBuilder<>* ir_builder)
+      : scalar_type_(scalar_type),
+        tile_rows_(tile_rows),
+        tile_cols_(tile_cols),
+        m_(m),
+        k_(k),
+        lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        ir_builder_(ir_builder),
+        ksl_(ir_builder_),
+        vsl_(scalar_type_, /*vector_size=*/tile_cols_, ir_builder_, "") {
+    CHECK(tile_cols_ > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols_)));
+  }
+
+  void Emit();
+
+ private:
+  TileLoader GetLhsTileLoader(llvm::Value* row_start, int64 row_count) {
+    return TileLoader(&vsl_, ir_builder_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/k_,
+                      /*major_dim_offset=*/row_start,
+                      /*tile_size_along_major_dim=*/row_count);
+  }
+
+  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
+
+  void EmitInnerLoopTiled(TileLoader* lhs_tile_loader, int64 rows,
+                          std::vector<VectorVariable>* vector_accumulators);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
+                             std::vector<ScalarVariable>* scalar_accumulators);
+
+  PrimitiveType scalar_type_;
+  int64 tile_rows_;
+  int64 tile_cols_;
+  int64 m_;
+  int64 k_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* ir_builder_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
+                                                           int64 row_count) {
+  TileLoader lhs_tile_loader = GetLhsTileLoader(/*row_start=*/row,
+                                                /*row_count=*/row_count);
+  std::vector<VectorVariable> vector_accumulators;
+  std::vector<ScalarVariable> scalar_accumulators;
+  for (int i = 0; i < row_count; i++) {
+    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
+    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
+  }
+  EmitInnerLoopTiled(&lhs_tile_loader, /*rows=*/row_count,
+                     &vector_accumulators);
+  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
+                        &scalar_accumulators);
+
+  std::vector<llvm::Value*> accumulator_values;
+  std::transform(
+      vector_accumulators.begin(), vector_accumulators.end(),
+      std::back_inserter(accumulator_values),
+      [](const VectorVariable& vector_var) { return vector_var.Get(); });
+  std::vector<llvm::Value*> horizontal_sums =
+      vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+
+  for (int i = 0; i < row_count; i++) {
+    llvm::Value* result_value =
+        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
+    llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row);
+    vsl_.StoreScalar(result_value, result_, offset);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 row_remainder = m_ % tile_rows_;
+  int64 row_limit = m_ - row_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows_,
+           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows_); });
+
+  if (row_remainder != 0) {
+    EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    TileLoader* lhs_tile_loader, int64 rows,
+    std::vector<VectorVariable>* vector_accumulators) {
+  int64 column_limit = k_ - (k_ % tile_cols_);
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
+           /*step=*/tile_cols_, [&](llvm::Value* col) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_tile_loader->LoadTile(/*minor_dim_offset=*/col);
+             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
+             for (int i = 0; i < rows; i++) {
+               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
+               (*vector_accumulators)[i].Set(
+                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
+             }
+           });
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_row, int64 rows,
+    std::vector<ScalarVariable>* scalar_accumulators) {
+  int64 column_start = k_ - (k_ % tile_cols_);
+  if (column_start == k_) {
+    return;
+  }
+
+  for (int r = 0; r < rows; r++) {
+    llvm::Value* total_offset = ir_builder_->CreateMul(
+        ir_builder_->CreateAdd(ir_builder_->getInt64(r), current_tile_row),
+        ir_builder_->getInt64(k_));
+    llvm::Value* lhs_base_pointer =
+        vsl_.ComputeOffsetPointer(lhs_, total_offset);
+    ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k_,
+             /*step=*/1, [&](llvm::Value* scalar_col) {
+               llvm::Value* product =
+                   vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
+                            vsl_.LoadScalar(rhs_, scalar_col));
+               llvm::Value* old_value = (*scalar_accumulators)[r].Get();
+               (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
+             });
+  }
+}
+
+}  // namespace
+
 DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
                            bool transpose_rhs,
                            const llvm_ir::IrArray& target_array,
@@ -72,6 +524,122 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot, bool transpose_lhs,
 
 bool DotOpEmitter::ShapesAreLegalForRuntimeDot() const { return true; }
 
+bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
+  if (dot_.shape().dimensions_size() != 2) {
+    return false;
+  }
+
+  PrimitiveType primitive_type = dot_.shape().element_type();
+
+  if (!primitive_util::IsFloatingPointType(primitive_type) &&
+      !primitive_util::IsIntegralType(primitive_type)) {
+    return false;
+  }
+
+  MatMultDims mat_mult_dims = GetMatMultDims();
+  bool is_column_major_matrix_vector = false;
+  bool is_row_major_matrix_vector = false;
+
+  int64 m, k;
+  bool swap_operands;
+
+  if (mat_mult_dims.m == 1) {
+    bool rhs_effectively_row_major =
+        transpose_rhs_ ^ !mat_mult_dims.rhs_column_major;
+    if (rhs_effectively_row_major) {
+      k = mat_mult_dims.k;
+      m = mat_mult_dims.n;
+      is_column_major_matrix_vector = true;
+      swap_operands = true;
+    } else {
+      k = mat_mult_dims.k;
+      m = mat_mult_dims.n;
+      is_row_major_matrix_vector = true;
+      swap_operands = true;
+    }
+  }
+
+  if (mat_mult_dims.n == 1) {
+    bool lhs_effectively_column_major =
+        transpose_lhs_ ^ mat_mult_dims.lhs_column_major;
+    if (lhs_effectively_column_major) {
+      m = mat_mult_dims.m;
+      k = mat_mult_dims.k;
+      is_column_major_matrix_vector = true;
+      swap_operands = false;
+    } else {
+      m = mat_mult_dims.m;
+      k = mat_mult_dims.k;
+      is_row_major_matrix_vector = true;
+      swap_operands = false;
+    }
+  }
+
+  if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
+    return false;
+  }
+
+  int64 tiling_factor = GetGemvTilingFactor();
+  CHECK_GT(tiling_factor, 0);
+
+  llvm::Value* result_op = target_array_.GetBasePointer();
+  llvm::Value* lhs_op =
+      swap_operands ? rhs_array_.GetBasePointer() : lhs_array_.GetBasePointer();
+  llvm::Value* rhs_op =
+      swap_operands ? lhs_array_.GetBasePointer() : rhs_array_.GetBasePointer();
+
+  const bool enable_fast_math =
+      hlo_module_config_.debug_options().xla_enable_fast_math();
+  const bool optimize_for_size =
+      options::OptimizeForSizeRequested(hlo_module_config_);
+
+  if (is_column_major_matrix_vector) {
+    VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
+            << " and k = " << k;
+    int64 tile_rows = 8;
+    int64 tile_cols = tiling_factor;
+
+    string kernel_name = tensorflow::strings::StrCat(
+        "col_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows,
+        "_", tile_cols, "_", m, "_", k);
+
+    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name,
+        lhs_op, rhs_op, result_op,
+        [this, tile_rows, tile_cols, m, k, primitive_type](
+            llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* result_op) {
+          ColumnMajorMatrixVectorProductEmitter emitter(
+              primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op,
+              result_op, ir_builder_);
+          emitter.Emit();
+        });
+  } else {
+    VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
+            << " and k = " << k;
+    int64 tile_rows = tiling_factor;
+    int64 tile_cols = 8;
+
+    string kernel_name = tensorflow::strings::StrCat(
+        "row_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows,
+        "_", tile_cols, "_", m, "_", k);
+
+    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name,
+        lhs_op, rhs_op, result_op,
+        [this, tile_rows, tile_cols, m, k, primitive_type](
+            llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* result_op) {
+          RowMajorMatrixVectorProductEmitter emitter(
+              primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op,
+              result_op, ir_builder_);
+          emitter.Emit();
+        });
+  }
+
+  return true;
+}
+
 tensorflow::Status DotOpEmitter::Emit() {
   // The dot operation performs a sum of products over dimension 0 of the left
   // hand side operand and dimension 1 of the right hand side operand.
@@ -105,6 +673,10 @@ tensorflow::Status DotOpEmitter::Emit() {
     return EmitScalarDot();
   }
 
+  if (EmitLlvmIrDotIfProfitable()) {
+    return Status::OK();
+  }
+
   if (PotentiallyImplementedAsEigenDot(dot_)) {
     return EmitCallToRuntime();
   }
@@ -340,22 +912,17 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   //
   // Effectively this involves swapping the 'lhs' with 'rhs' and 'm' with 'n'.
 
-  const Shape& lhs_shape = lhs_array_.GetShape();
-  const Shape& rhs_shape = rhs_array_.GetShape();
+  MatMultDims mat_mult_dims = GetMatMultDims();
 
-  CHECK(LayoutUtil::Equal(lhs_shape.layout(), rhs_shape.layout()));
+  CHECK_EQ(mat_mult_dims.lhs_column_major, mat_mult_dims.rhs_column_major);
 
-  int64 m = lhs_shape.dimensions(transpose_lhs_ ? 1 : 0);
-  int64 k = lhs_shape.dimensions(transpose_lhs_ ? 0 : 1);
-  int64 n = rhs_shape.dimensions(transpose_rhs_ ? 0 : 1);
   const llvm_ir::IrArray* lhs = &lhs_array_;
   const llvm_ir::IrArray* rhs = &rhs_array_;
   bool transpose_lhs = transpose_lhs_;
   bool transpose_rhs = transpose_rhs_;
 
-  bool is_column_major = lhs_shape.layout().minor_to_major(0) == 0;
-  if (!is_column_major) {
-    std::swap(m, n);
+  if (!mat_mult_dims.lhs_column_major) {
+    std::swap(mat_mult_dims.m, mat_mult_dims.n);
     std::swap(lhs, rhs);
     std::swap(transpose_lhs, transpose_rhs);
   }
@@ -367,12 +934,27 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
                                   float_ptr_type),
        ir_builder_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type),
        ir_builder_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type),
-       ir_builder_->getInt64(m), ir_builder_->getInt64(n),
-       ir_builder_->getInt64(k), ir_builder_->getInt32(transpose_lhs),
+       ir_builder_->getInt64(mat_mult_dims.m),
+       ir_builder_->getInt64(mat_mult_dims.n),
+       ir_builder_->getInt64(mat_mult_dims.k),
+       ir_builder_->getInt32(transpose_lhs),
        ir_builder_->getInt32(transpose_rhs)});
   return tensorflow::Status::OK();
 }
 
+DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
+  CHECK_EQ(dot_.shape().dimensions_size(), 2);
+
+  const Shape& lhs_shape = lhs_array_.GetShape();
+  const Shape& rhs_shape = rhs_array_.GetShape();
+
+  return {lhs_shape.dimensions(transpose_lhs_ ? 1 : 0),
+          lhs_shape.dimensions(transpose_lhs_ ? 0 : 1),
+          rhs_shape.dimensions(transpose_rhs_ ? 0 : 1),
+          lhs_shape.layout().minor_to_major(0) == 0,
+          rhs_shape.layout().minor_to_major(0) == 0};
+}
+
 llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
     llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array,
     int64 reduction_dimension, tensorflow::StringPiece name_suffix) {
@@ -403,5 +985,82 @@ llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
   return index;
 }
 
+// Return whether the given shape is a matrix with no padding.
+static bool IsRank2WithNoPadding(const Shape& shape) {
+  return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape);
+}
+
+// In a gemm operation where output = lhs * rhs, check whether the given shapes
+// are valid for the operation.
+static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
+                               const Shape& output_shape) {
+  // The inputs and the output must
+  // 1) be matrices with no padding, and
+  // 2) have an allowed element type.
+  return output_shape.element_type() == F32 &&
+         IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
+         IsRank2WithNoPadding(output_shape);
+}
+
+bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
+  // For certain types of Dot, we can call Eigen
+  if (hlo.opcode() == HloOpcode::kDot) {
+    const Shape& lhs_shape = hlo.operand(0)->shape();
+    const Shape& rhs_shape = hlo.operand(1)->shape();
+
+    if (ShapeUtil::HasZeroElements(lhs_shape) ||
+        ShapeUtil::HasZeroElements(rhs_shape)) {
+      return false;
+    }
+
+    if (ProfitableToImplementDotInTiledLlvmIr(hlo)) {
+      return false;
+    }
+
+    // If gemm can accept the operand shapes, use it rather than a custom
+    // kernel.
+    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape())) {
+      // The size of the reduction dimension should match. The shape inference
+      // guarantees this invariant, so the check here is for programming
+      // errors.
+      CHECK_EQ(lhs_shape.dimensions(1), rhs_shape.dimensions(0));
+      return true;
+    }
+  }
+
+  if (hlo.opcode() == HloOpcode::kFusion &&
+      hlo.fusion_kind() == HloInstruction::FusionKind::kTransposeDot &&
+      hlo.fused_expression_root()->opcode() == HloOpcode::kDot) {
+    auto* dot = hlo.fused_expression_root();
+    const Shape& lhs_shape = dot->operand(0)->shape();
+    const Shape& rhs_shape = dot->operand(1)->shape();
+    if (ShapeUtil::HasZeroElements(lhs_shape) ||
+        ShapeUtil::HasZeroElements(rhs_shape)) {
+      return false;
+    }
+    return true;
+  }
+
+  return false;
+}
+
+// For vector-matrix dot products, it is always profitable to make the Rhs
+// column major.
+bool ProfitableToMakeDotRhsColumnMajor(const HloInstruction& hlo) {
+  return hlo.opcode() == HloOpcode::kDot &&
+         hlo.shape().dimensions_size() == 2 && hlo.shape().dimensions(0) == 1;
+}
+
+bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
+  // Any Matrix-Vector product of floating point or integral type, or
+  // a transpose-dot fusion of the same can be lowered to a tiled LLVM
+  // IR implementation.
+  const Shape& shape = dot.shape();
+  return shape.dimensions_size() == 2 &&
+         (shape.dimensions(0) == 1 || shape.dimensions(1) == 1) &&
+         (primitive_util::IsFloatingPointType(shape.element_type()) ||
+          primitive_util::IsIntegralType(shape.element_type()));
+}
+
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index cfc10660453c822635d68270c053977fca779ee1..2badb26f905d6f1fe6de00401f7800b774f44c07 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_H_
 
 #include "llvm/IR/IRBuilder.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
@@ -29,6 +30,16 @@ limitations under the License.
 namespace xla {
 namespace cpu {
 
+bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo);
+
+// Returns true to indicate that |hlo| is a dot, and that it is profitable to
+// switch the layout of the |hlo|'s RHS operand to column major.
+bool ProfitableToMakeDotRhsColumnMajor(const HloInstruction& hlo);
+
+// Returns true to indicate that we can generate a tiled LLVM IR implementation
+// for |dot|.
+bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot);
+
 // Helper class for emitting LLVM IR to perform the dot operation.
 class DotOpEmitter {
  public:
@@ -59,6 +70,10 @@ class DotOpEmitter {
   // LHS and RHS) and store the results in the target.
   tensorflow::Status EmitScalarDot();
 
+  // Emit an LLVM IR implementation of the dot operation if we can.  Returns
+  // true if an LLVM IR implementation was emitted.
+  bool EmitLlvmIrDotIfProfitable();
+
   // Emits a call to the CPU runtime to perform the matrix multiply.
   tensorflow::Status EmitCallToRuntime();
 
@@ -77,6 +92,38 @@ class DotOpEmitter {
   // no padding, and a rank of two.
   bool ShapesAreLegalForRuntimeDot() const;
 
+  // Represents the dimensions of a matrix-matrix multiply operation.
+  struct MatMultDims {
+    // The number of rows in the LHS.
+    int64 m;
+
+    // The number of columns in the LHS, which is also must be equal to the
+    // number of rows in the RHS.
+    int64 k;
+
+    // The number of columns on the RHS.
+    int64 n;
+
+    // True if the LHS matrix column major.
+    bool lhs_column_major;
+
+    // True if the RHS matrix column major.
+    bool rhs_column_major;
+  };
+
+  // Get the MatMultDims instance for the dot product this DotOpEmitter
+  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
+  // of rank 2 as well).
+  MatMultDims GetMatMultDims() const;
+
+  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
+  // registers.
+  int64 GetGemvTilingFactor() const {
+    const int64 kDefaultTilingFactor = 8;
+    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
+        .value_or(kDefaultTilingFactor);
+  }
+
   const HloInstruction& dot_;
   const bool transpose_lhs_;
   const bool transpose_rhs_;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index b99b36a55eee40bc66dcb1b7b1a464bf764ef0ea..3993779da636e519f8d8fded468c3271d27ee093 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -29,10 +29,8 @@ bool PotentiallyImplementedAsEigenConvolution(
   // The following conditions are necessary (but not sufficient) for
   // implementing `convolution` with Eigen convolution:
   // - the input and kernel have a non-zero number of elements.
-  // - the input is in NHWC or NWHC order.
-  // - the kernel is in HWIO or WHIO order.
-  // - the spatial dimensions are in the same relative order in the input,
-  //   kernel and output.
+  // - the input is in NHWC order.
+  // - the kernel is in HWIO order.
   //
   // To be sufficient, certain layout constraints need to be satisfied as well.
   const Shape& input_shape = convolution.operand(0)->shape();
@@ -51,15 +49,22 @@ bool PotentiallyImplementedAsEigenConvolution(
       convolution.convolution_dimension_numbers();
   // Only 1D and 2D convolutions are supported at the moment.
   // TODO(b/32897908): add an optimized implementation for 3D convolution.
-  if (dnums.spatial_dimensions_size() > 2) {
+  const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+  if (num_spatial_dims > 2) {
     return false;
   }
 
-  bool input_spatial_dims_ascending = std::is_sorted(
-      dnums.spatial_dimensions().begin(), dnums.spatial_dimensions().end());
-  bool kernel_spatial_dims_ascending =
-      std::is_sorted(dnums.kernel_spatial_dimensions().begin(),
-                     dnums.kernel_spatial_dimensions().end());
+  for (int64 i = 0; i < num_spatial_dims; ++i) {
+    if (dnums.input_spatial_dimensions(i) != i + 1) {
+      return false;
+    }
+    if (dnums.kernel_spatial_dimensions(i) != i) {
+      return false;
+    }
+    if (dnums.output_spatial_dimensions(i) != i + 1) {
+      return false;
+    }
+  }
 
   const Shape& output_shape = convolution.shape();
   return dnums.input_batch_dimension() == 0 &&
@@ -67,116 +72,11 @@ bool PotentiallyImplementedAsEigenConvolution(
          dnums.output_batch_dimension() == 0 &&
          dnums.output_feature_dimension() ==
              output_shape.dimensions_size() - 1 &&
-         input_spatial_dims_ascending == kernel_spatial_dims_ascending &&
          dnums.kernel_input_feature_dimension() ==
              kernel_shape.dimensions_size() - 2 &&
          dnums.kernel_output_feature_dimension() ==
              kernel_shape.dimensions_size() - 1;
 }
 
-namespace {
-
-// Return whether the given shape is a matrix with no padding.
-bool IsRank2WithNoPadding(const Shape& shape) {
-  return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape);
-}
-
-// In a gemm operation where output = lhs * rhs, check whether the given shapes
-// are valid for the operation.
-bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
-                        const Shape& output_shape) {
-  // The inputs and the output must
-  // 1) be matrices with no padding, and
-  // 2) have an allowed element type.
-  return output_shape.element_type() == F32 &&
-         IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) &&
-         IsRank2WithNoPadding(output_shape);
-}
-}  // namespace
-
-bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) {
-  // For certain types of Dot, we can call Eigen
-  if (hlo.opcode() == HloOpcode::kDot) {
-    const Shape& lhs_shape = hlo.operand(0)->shape();
-    const Shape& rhs_shape = hlo.operand(1)->shape();
-
-    if (ShapeUtil::HasZeroElements(lhs_shape) ||
-        ShapeUtil::HasZeroElements(rhs_shape)) {
-      return false;
-    }
-
-    if (ProfitableToImplementDotInLlvmIr(hlo) == DotInLlvmIrProfitable::kYes) {
-      return false;
-    }
-
-    // If gemm can accept the operand shapes, use it rather than a custom
-    // kernel.
-    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape())) {
-      // The size of the reduction dimension should match. The shape inference
-      // guarantees this invariant, so the check here is for programming
-      // errors.
-      CHECK_EQ(lhs_shape.dimensions(1), rhs_shape.dimensions(0));
-      return true;
-    }
-  }
-
-  if (hlo.opcode() == HloOpcode::kFusion &&
-      hlo.fusion_kind() == HloInstruction::FusionKind::kTransposeDot &&
-      hlo.fused_expression_root()->opcode() == HloOpcode::kDot) {
-    auto* dot = hlo.fused_expression_root();
-    const Shape& lhs_shape = dot->operand(0)->shape();
-    const Shape& rhs_shape = dot->operand(1)->shape();
-    if (ShapeUtil::HasZeroElements(lhs_shape) ||
-        ShapeUtil::HasZeroElements(rhs_shape)) {
-      return false;
-    }
-    return true;
-  }
-
-  return false;
-}
-
-DotInLlvmIrProfitable ProfitableToImplementDotInLlvmIr(
-    const HloInstruction& dot) {
-  if (dot.opcode() == HloOpcode::kDot && dot.shape().dimensions_size() == 2) {
-    const Shape& result_shape = dot.shape();
-    // kReductionDimensionThresholdBytes was chosen to be 1/4 of a typical L1
-    // cache line size, so that we can have the reduction dimension of both the
-    // LHS and RHS matrices and still have some space "left over".  This needs
-    // to be tuned further.
-    const int64 kReductionDimensionThresholdBytes = 8 * 1024;
-    const bool single_threaded_eigen =
-        !dot.GetModule()->config().debug_options().xla_cpu_multi_thread_eigen();
-
-    // This is the point at which it is better to call into Eigen and shard the
-    // dot across multiple worker threads.  This is a rough estimate by running
-    // a matmult benchmark on my local machine, and it can be tuned further.
-    const int64 kMaxSingleThreadedFlops = 16 * 1024;
-
-    const int64 M = result_shape.dimensions(0);
-    const int64 N = result_shape.dimensions(1);
-    const int64 K = dot.operand(1)->shape().dimensions(0);
-    const int64 primitive_type_size =
-        ShapeUtil::ByteSizeOfPrimitiveType(result_shape.element_type());
-    if (M == 1 &&
-        K * primitive_type_size <= kReductionDimensionThresholdBytes &&
-        (single_threaded_eigen || M * K * N <= kMaxSingleThreadedFlops)) {
-      // Heuristics:
-      //
-      //  - Look for a configuration where we will likely be able to keep LHS in
-      //    L1 and do a cache-optimal traversal of RHS.
-      //
-      //  - Bail out on matrices that are large enough that Eigen can profitably
-      //    shard the computation across multiple cores.  This only applies when
-      //    multi-threading is enabled.
-      return LayoutUtil::IsMonotonicWithDim0Major(
-                 dot.operand(1)->shape().layout())
-                 ? DotInLlvmIrProfitable::kWithColumnMajorRhs
-                 : DotInLlvmIrProfitable::kYes;
-    }
-  }
-  return DotInLlvmIrProfitable::kNo;
-}
-
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
index 66656ed99765806ec4463f3781644853886cf303..34b2003916933f5ec0a15d9e219063c0a912fa40 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_
 
+#include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 
 namespace xla {
@@ -24,20 +25,17 @@ namespace cpu {
 bool PotentiallyImplementedAsEigenConvolution(
     const HloInstruction& convolution);
 
-bool PotentiallyImplementedAsEigenDot(const HloInstruction& dot);
-
-enum class DotInLlvmIrProfitable { kYes, kNo, kWithColumnMajorRhs };
-
-// Returns a value to indicate if (and under what conditions) will lowering
-// |dot| as a pure LLVM IR dot operation be profitable over calling into Eigen.
-// Possible return values are:
+// Dynamic loop bounds are specified as an array of dimension index
+// [start, limit) pairs of ir values (one for each partitioned outer dimension).
+//
+// EX: Let 'shape' = [8, 16, 32], with the loop bounds of the two-most major
+//     dimensions dynamic. Then 'dynamic_loop_bounds' will contain the
+//     following ir values for the two most-major dimensions:
+//       [dim0_index_start_ir_value, dim0_index_limit_ir_value]
+//       [dim1_index_start_ir_value, dim1_index_limit_ir_value]
 //
-//  * DotInLlvmIrProfitable::kYes - always profitable.
-//  * DotInLlvmIrProfitable::kNo - never profitable.
-//  * DotInLlvmIrProfitable::kWithColumnMajorRhs - only if we can manage to make
-//    the Rhs layout column major.
-DotInLlvmIrProfitable ProfitableToImplementDotInLlvmIr(
-    const HloInstruction& dot);
+// See IrFunction and ParallelLoopEmitter for details.
+using DynamicLoopBounds = std::vector<std::pair<llvm::Value*, llvm::Value*>>;
 
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index a20ce6826ca0a86f8c0d441c1e89f091cfb434f1..c82a0c7ef4a797d9e1cf853badc84130a3e062b1 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,16 +24,17 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
@@ -42,6 +43,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h"
@@ -76,14 +79,16 @@ namespace cpu {
 IrEmitter::IrEmitter(
     const HloModule& hlo_module, const BufferAssignment& assignment,
     llvm::Module* llvm_module,
-    const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx,
+    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
+    tensorflow::gtl::optional<size_t> entry_computation_profile_idx,
     llvm::TargetMachine* target_machine,
     ExternalConstantPool* external_constant_pool)
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
       ir_builder_(llvm_module->getContext()),
-      hlo_to_profile_idx_(hlo_to_profile_idx),
+      hlo_to_profile_idx_(std::move(hlo_to_profile_idx)),
+      entry_computation_profile_idx_(std::move(entry_computation_profile_idx)),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       parallel_cpu_backend_(
@@ -122,133 +127,27 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   } else {
     TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, *instruction_order));
   }
-  InsertOrDie(&emitted_functions_, computation, compute_function_);
-
-  return compute_function_;
-}
-
-static llvm::Argument* GetArg(llvm::Function* f, int idx) {
-  llvm::Function::arg_iterator arg_iter = f->arg_begin();
-  std::advance(arg_iter, idx);
-  return &*arg_iter;
+  llvm::Function* ir_function = compute_function_->function();
+  InsertOrDie(&emitted_functions_, computation, ir_function);
+  // Delete 'compute_function', finalizing 'ir_function' and restoring caller
+  // IR insert point.
+  compute_function_.reset();
+  return ir_function;
 }
 
 void IrEmitter::InitializeIrFunction(const string& function_name) {
-  // The function signature is:
-  //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
-  //                 i64* dynamic_loop_bounds, i64* prof_counters)
-  //
-  // retval: points to the returned value.
-  // params: address of an array with pointers to parameters.
-  // temps: address of an array with pointers to temporary buffers.
-  //
-  // Therefore, the generated function's signature (FunctionType) is statically
-  // determined - parameter unpacking is done in code generated into the
-  // function, rather than by a prologue dictated by the platform ABI.
-  //
-  //                      /--------------\
-  //   retval ----------> | return value |
-  //                      \--------------/
-  //
-  //                      /-------------------------------\
-  //   run_options -----> | xla::ExecutableRunOptions |
-  //                      \-------------------------------/
-  //
-  //                     /---------------------------------------------\
-  //   params -------->  |  param 0  |  param 1  | ..... |  param N-1  |
-  //                     |   addr    |   addr    |       |   addr      |
-  //                     \---------------------------------------------/
-  //                          |           |                   |
-  //                          |           |                   |
-  //                          V           V                   V
-  //                     /---------\  /---------\         /-----------\
-  //                     | param 0 |  | param 1 |         | param N-1 |
-  //                     \---------/  \---------/         \-----------/
-  //
-  //                     /---------------------------------------------\
-  //   temps --------->  |  temp  0  |  temp  1  | ..... |  temp  N-1  |
-  //                     |   addr    |   addr    |       |   addr      |
-  //                     \---------------------------------------------/
-  //                          |           |                   |
-  //                          |           |                   |
-  //                          V           V                   V
-  //                     /---------\  /---------\         /-----------\
-  //                     | temp  0 |  | temp  1 |         | temp  N-1 |
-  //                     \---------/  \---------/         \-----------/
-  //
-  //                        /--------------------------------------------\
-  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
-  //  (elided for aot)      \--------------------------------------------/
-  //
-  //                     /---------------------------------------------\
-  //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
-  //  (elided for aot)   \---------------------------------------------/
-
-  // Even though the type of params and temps is void** in the host's view, in
-  // LLVM IR this is represented by i8*, similarly to void*. It's up to the code
-  // to use GEPs to unravel the indirection layers.
-  llvm::FunctionType* compute_function_type = llvm::FunctionType::get(
-      /*Result=*/llvm::Type::getVoidTy(module_->getContext()),
-      /*Params=*/GetComputeFunctionParams(),
-      /*isVarArg=*/false);
-
   // Functions with local linkage get an inlining bonus.  Because we know
   // a-priori that embedded functions (non-entry functions) will not have its
   // name resolved, give it local linkage.
   llvm::Function::LinkageTypes linkage =
       is_top_level_computation_ ? llvm::GlobalValue::ExternalLinkage
                                 : llvm::GlobalValue::InternalLinkage;
-  compute_function_ =
-      llvm::Function::Create(/*Ty=*/compute_function_type,
-                             /*Linkage=*/linkage,
-                             /*Name=*/AsStringRef(function_name),
-                             /*Module=*/module_);
-  compute_function_->setCallingConv(llvm::CallingConv::C);
-
-  // Set meaningful names for the function's arguments: useful for debugging.
-  llvm::Function::arg_iterator arg_iter = compute_function_->arg_begin();
-  arg_iter->setName("retval");
-  (++arg_iter)->setName("run_options");
-  (++arg_iter)->setName("params");
-  (++arg_iter)->setName("temps");
-  if (num_dynamic_loop_bounds_ > 0) {
-    (++arg_iter)->setName("dynamic_loop_bounds");
-  }
-  if (hlo_to_profile_idx_) {
-    (++arg_iter)->setName("prof_counters");
-  }
-
-  // We know a-priori that the function arguments are guaranteed to point to
-  // disjoint objects.
-  llvm::Argument* retval = GetResultArgument();
-  for (llvm::Argument& argument : compute_function_->args()) {
-    // However, the return buffer aliases the temporaries and thus cannot be
-    // marked noalias.
-    if (&argument == retval) {
-      continue;
-    }
-    compute_function_->addAttribute(argument.getArgNo() + 1,
-                                    llvm::Attribute::NoAlias);
-  }
-
-  // Add the optize attribute to the function if optimizing for size. This
-  // controls internal behavior of some optimization passes (e.g. loop
-  // unrolling).
-  if (options::OptimizeForSizeRequested(hlo_module_config_)) {
-    compute_function_->addFnAttr(llvm::Attribute::OptimizeForSize);
-  }
-
-  if (hlo_module_config_.debug_options().xla_enable_fast_math()) {
-    compute_function_->addFnAttr("unsafe-fp-math", "true");
-    compute_function_->addFnAttr("no-infs-fp-math", "true");
-    compute_function_->addFnAttr("no-nans-fp-math", "true");
-    compute_function_->addFnAttr("no-signed-zeros-fp-math", "true");
-  }
-
-  ir_builder_.SetInsertPoint(llvm::BasicBlock::Create(
-      /*Context=*/module_->getContext(),
-      /*Name=*/"entry",
-      /*Parent=*/compute_function_));
+  // Create and initialize new IrFunction.
+  compute_function_.reset(
+      new IrFunction(function_name, linkage,
+                     options::OptimizeForSizeRequested(hlo_module_config_),
+                     hlo_module_config_.debug_options().xla_enable_fast_math(),
+                     module_, &ir_builder_, num_dynamic_loop_bounds_));
 }
 
 IrEmitter::~IrEmitter() {}
@@ -344,11 +243,12 @@ int IrEmitter::MinimumAlignmentForBufferSize(int64 buffer_size) {
 
 // Calculate the alignment of a buffer allocated for a given primitive type.
 int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) {
-  int64 buffer_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  DCHECK_GE(buffer_size, 0);
-  DCHECK_LE(buffer_size, SIZE_MAX);
-
-  return MinimumAlignmentForBufferSize(buffer_size);
+  int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
+  DCHECK_GE(byte_size, 0);
+  // Largest scalar is a complex64 so we don't need to worry about the
+  // int64->int truncation here.
+  DCHECK_LE(byte_size, 8);
+  return byte_size;
 }
 
 int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
@@ -357,6 +257,10 @@ int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
 
 // Calculate the alignment of a buffer allocated for a given shape.
 int IrEmitter::MinimumAlignmentForShape(const Shape& shape) {
+  if (ShapeUtil::IsScalar(shape)) {
+    return MinimumAlignmentForPrimitiveType(shape.element_type());
+  }
+
   int64 buffer_size = ByteSizeOf(shape);
   DCHECK_GE(buffer_size, 0);
   DCHECK_LE(buffer_size, SIZE_MAX);
@@ -612,7 +516,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
   HloComputation* function = reduce_window->to_apply();
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*reduce_window, /*operands=*/{operand},
-      /*supported_types=*/{F32}));
+      /*supported_types=*/{F32, BF16}));
 
   // TODO(b/31410564): Implement dilation for reduce-window.
   if (window_util::HasDilation(window)) {
@@ -795,7 +699,7 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // operand index is within the bounds. The unsigned comparison includes
   // checking whether the operand index >= 0.
   llvm_ir::IrArray::Index operand_index(source_index.size());
-  llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
+  llvm::Value* in_bounds_condition = ir_builder_.getTrue();
   for (int64 i = 0; i < rank; ++i) {
     llvm::Value* strided_index = ir_builder_.CreateNSWMul(
         source_index[i], ir_builder_.getInt64(window.dimensions(i).stride()));
@@ -822,14 +726,16 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   // If the initialized_flag is false, initialize the selected value and index
   // with the currently visiting operand.
   SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_);
-  const auto save_operand_index = [&](
-      const llvm_ir::IrArray::Index& operand_index) {
-    for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP(
-          selected_index_address, {ir_builder_.getInt32(i)});
-      ir_builder_.CreateStore(operand_index[i], selected_index_address_slot);
-    }
-  };
+  const auto save_operand_index =
+      [&](const llvm_ir::IrArray::Index& operand_index) {
+        for (int64 i = 0; i < rank; ++i) {
+          llvm::Value* selected_index_address_slot =
+              ir_builder_.CreateInBoundsGEP(selected_index_address,
+                                            {ir_builder_.getInt32(i)});
+          ir_builder_.CreateStore(operand_index[i],
+                                  selected_index_address_slot);
+        }
+      };
   llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
   llvm::Value* operand_data =
       operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
@@ -896,6 +802,24 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*dot, /*operands=*/{lhs, rhs},
       /*supported_types=*/{F32, F64, C64}));
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_batch_dimensions_size() > 0 ||
+      dnums.rhs_batch_dimensions_size() > 0) {
+    return Unimplemented("Dot with batch dimensions not implemented.");
+  }
+
+  if (dnums.lhs_contracting_dimensions_size() != 1) {
+    // This is disallowed by ShapeInference today.
+    return Unimplemented(
+        "Dot with multiple contracting dimensions not implemented.");
+  }
+
+  if (dnums.lhs_contracting_dimensions(0) !=
+          std::min(lhs->shape().dimensions_size() - 1, 1) ||
+      dnums.rhs_contracting_dimensions(0) != 0) {
+    return Unimplemented(
+        "Dot with non-standard contracting dimensions not implemented.");
+  }
 
   llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs));
   llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs));
@@ -952,11 +876,12 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       // Input tensor.
       const Shape& input_shape = convolution->operand(0)->shape();
       int64 input_batch = input_shape.dimensions(dnums.input_batch_dimension());
-      int64 input_rows = input_shape.dimensions(dnums.spatial_dimensions(0));
+      int64 input_rows =
+          input_shape.dimensions(dnums.input_spatial_dimensions(0));
       int64 input_cols =
           one_dim_convolution
               ? 1
-              : input_shape.dimensions(dnums.spatial_dimensions(1));
+              : input_shape.dimensions(dnums.input_spatial_dimensions(1));
       int64 input_channels =
           input_shape.dimensions(dnums.input_feature_dimension());
 
@@ -976,11 +901,11 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       // Output tensor.
       const Shape& convolution_shape = convolution->shape();
       int64 output_rows =
-          convolution_shape.dimensions(dnums.spatial_dimensions(0));
-      int64 output_cols =
-          one_dim_convolution
-              ? 1
-              : convolution_shape.dimensions(dnums.spatial_dimensions(1));
+          convolution_shape.dimensions(dnums.output_spatial_dimensions(0));
+      int64 output_cols = one_dim_convolution
+                              ? 1
+                              : convolution_shape.dimensions(
+                                    dnums.output_spatial_dimensions(1));
 
       // Extract the window stride for the convolution.
       const Window& window = convolution->window();
@@ -1068,10 +993,10 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   return EmitTargetElementLoop(
       convolution, [this, convolution, lhs, rhs, window,
                     dnums](const llvm_ir::IrArray::Index& index) {
-        int num_spatial_dims = dnums.spatial_dimensions_size();
+        int num_spatial_dims = dnums.output_spatial_dimensions_size();
         std::vector<llvm::Value*> output_spatial(num_spatial_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
-          output_spatial[i] = index[dnums.spatial_dimensions(i)];
+          output_spatial[i] = index[dnums.output_spatial_dimensions(i)];
         }
         llvm::Value* output_feature = index[dnums.output_feature_dimension()];
         llvm::Value* batch = index[dnums.output_batch_dimension()];
@@ -1091,8 +1016,9 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         for (int i = 0; i < num_spatial_dims; ++i) {
           kernel_spatial[i] =
               loops
-                  .AddLoop(0, rhs->shape().dimensions(
-                                  dnums.kernel_spatial_dimensions(i)),
+                  .AddLoop(0,
+                           rhs->shape().dimensions(
+                               dnums.kernel_spatial_dimensions(i)),
                            tensorflow::strings::StrCat("k", i))
                   ->GetIndVarValue();
         }
@@ -1108,17 +1034,18 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         // Calculate the spatial index in the input array, taking striding,
         // dilation and padding into account. An index in the padding will be
         // out of the bounds of the array.
-        const auto calculate_input_index = [this](
-            llvm::Value* output_index, llvm::Value* kernel_index,
-            const WindowDimension& window_dim) {
-          llvm::Value* strided_index = ir_builder_.CreateNSWMul(
-              output_index, ir_builder_.getInt64(window_dim.stride()));
-          llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul(
-              kernel_index, ir_builder_.getInt64(window_dim.window_dilation()));
-          return ir_builder_.CreateNSWSub(
-              ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index),
-              ir_builder_.getInt64(window_dim.padding_low()));
-        };
+        const auto calculate_input_index =
+            [this](llvm::Value* output_index, llvm::Value* kernel_index,
+                   const WindowDimension& window_dim) {
+              llvm::Value* strided_index = ir_builder_.CreateNSWMul(
+                  output_index, ir_builder_.getInt64(window_dim.stride()));
+              llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul(
+                  kernel_index,
+                  ir_builder_.getInt64(window_dim.window_dilation()));
+              return ir_builder_.CreateNSWSub(
+                  ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index),
+                  ir_builder_.getInt64(window_dim.padding_low()));
+            };
         std::vector<llvm::Value*> input_spatial(num_spatial_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
           input_spatial[i] = calculate_input_index(
@@ -1140,11 +1067,11 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
           return ir_builder_.CreateICmpEQ(remainder, ir_builder_.getInt64(0));
         };
 
-        llvm::Value* in_bounds_condition = nullptr;
+        llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
         for (int i = 0; i < num_spatial_dims; ++i) {
           llvm::ConstantInt* input_bound =
               ir_builder_.getInt64(window_util::DilatedBound(
-                  lhs->shape().dimensions(dnums.spatial_dimensions(i)),
+                  lhs->shape().dimensions(dnums.input_spatial_dimensions(i)),
                   window.dimensions(i).base_dilation()));
           llvm::Value* dim_in_bound =
               ir_builder_.CreateICmpULT(input_spatial[i], input_bound);
@@ -1153,9 +1080,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
           llvm::Value* dim_ok =
               ir_builder_.CreateAnd(dim_in_bound, dim_not_in_hole);
           in_bounds_condition =
-              in_bounds_condition
-                  ? ir_builder_.CreateAnd(in_bounds_condition, dim_ok)
-                  : dim_ok;
+              ir_builder_.CreateAnd(in_bounds_condition, dim_ok);
         }
 
         // Now we need to map the dilated base coordinates back to the actual
@@ -1178,7 +1103,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         int num_dims = num_spatial_dims + 2;
         llvm_ir::IrArray::Index input_index(num_dims);
         for (int i = 0; i < num_spatial_dims; ++i) {
-          input_index[dnums.spatial_dimensions(i)] = input_spatial[i];
+          input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
         }
         input_index[dnums.input_feature_dimension()] = input_feature;
         input_index[dnums.input_batch_dimension()] = batch;
@@ -1449,7 +1374,7 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   //
   // Where Param is the actual element type of the underlying buffer (for
   // example, float for an XLA F32 element type).
-  llvm::Argument* params = GetArg(compute_function_, 2);
+  llvm::Value* params = compute_function_->parameters_arg();
   llvm::Value* param_address_offset =
       llvm_ir::EmitBufferIndexingGEP(params, param_number, &ir_builder_);
   llvm::LoadInst* param_address_untyped =
@@ -1587,7 +1512,7 @@ IrEmitter::ShardedVectorType IrEmitter::CreateShardedVectorType(
   // Here we assume that the largest register is a vector register.
   int max_vector_register_size_in_bytes =
       target_machine_features_.largest_register_size_in_bytes(
-          compute_function_);
+          compute_function_->function());
 
   int vector_register_size_in_elements =
       max_vector_register_size_in_bytes /
@@ -1745,19 +1670,6 @@ void IrEmitter::EmitShardedVectorStore(
   }
 }
 
-namespace {
-// TODO(sanjoy): This is duplicated in tensorflow/core/lib/core/arena.cc.
-// Extract out a common implementation to tensorflow/core/lib/math/math_util.h
-uint32 GCD(uint32 x, uint32 y) {
-  while (y != 0) {
-    uint32 r = x % y;
-    x = y;
-    y = r;
-  }
-  return x;
-}
-}  // namespace
-
 StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     HloInstruction* reduce, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions, HloComputation* function,
@@ -1780,9 +1692,9 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
       std::find(dimensions.begin(), dimensions.end(),
                 arg->shape().layout().minor_to_major(0)) != dimensions.end();
 
-  unsigned element_alignment =
-      GCD(ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
-          MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
+  unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
+      ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
+      MinimumAlignmentForPrimitiveType(reduce->shape().element_type()));
 
   if (is_reduction_over_minor_dimension) {
     // TODO(sanjoy): Implement vectorized reduction over the minor dimension.
@@ -1983,11 +1895,16 @@ Status IrEmitter::HandleSend(HloInstruction* send) {
   return Unimplemented("Send is not implemented on CPU. See b/33942983.");
 }
 
+Status IrEmitter::HandleSendDone(HloInstruction* send_done) {
+  // TODO(b/33942983): Support Send/Recv on CPU.
+  return Unimplemented("Send-done is not implemented on CPU. See b/33942983.");
+}
+
 Status IrEmitter::HandleSlice(HloInstruction* slice) {
   VLOG(2) << "HandleSlice: " << slice->ToString();
   auto operand = slice->operand(0);
   // The code below emits a sequential loop nest. For the parallel backend, use
-  // EmitParallelTargetElementLoop() which respects dynamic loop bounds.
+  // ParallelLoopEmitter which respects dynamic loop bounds.
   if (ShouldEmitParallelLoopFor(*slice)) {
     return DefaultAction(slice);
   }
@@ -2148,6 +2065,11 @@ Status IrEmitter::HandleRecv(HloInstruction* recv) {
   return Unimplemented("Recv is not implemented on CPU. See b/33942983.");
 }
 
+Status IrEmitter::HandleRecvDone(HloInstruction* recv_done) {
+  // TODO(b/33942983): Support Send/Recv on CPU.
+  return Unimplemented("Recv-done is not implemented on CPU. See b/33942983.");
+}
+
 Status IrEmitter::HandlePad(HloInstruction* pad) {
   // CPU backend does not properly handle negative padding but this is ok
   // because negative padding should be removed by the algebraic simplifier.
@@ -2292,9 +2214,17 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
       !parallel_cpu_backend_) {
     // ParallelTaskAssignment assigned partitions, emit call to
     // ParallelForkJoin.
-    TF_RETURN_IF_ERROR(EmitParallelForkJoin(parameter_addresses,
-                                            emitted_value_[call], computation,
-                                            call_ir_function));
+    std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
+        parameter_addresses, &ir_builder_, computation->name(),
+        /*return_value_buffer=*/emitted_value_[call],
+        /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
+        /*temp_buffers_arg=*/GetTempBuffersArgument(),
+        /*profile_counters_arg=*/GetProfileCountersArgument());
+
+    HloInstruction* root = computation->root_instruction();
+    TF_RETURN_IF_ERROR(EmitCallToParallelForkJoin(
+        call_args, root->shape(), root->outer_dimension_partitions(),
+        &ir_builder_, call_ir_function, computation->name()));
   } else {
     EmitArrayFunctionCallInto(call_ir_function, parameter_addresses,
                               emitted_value_[call], computation->name());
@@ -2397,7 +2327,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Terminates the current block with a branch to a while header.
   llvm::BasicBlock* header_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "header")),
-      compute_function_);
+      compute_function_->function());
   ir_builder_.CreateBr(header_bb);
   ir_builder_.SetInsertPoint(header_bb);
 
@@ -2414,7 +2344,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   // Branches to the body or to the while exit depending on the condition.
   llvm::BasicBlock* body_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "body")),
-      compute_function_);
+      compute_function_->function());
   llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "exit")));
   ir_builder_.CreateCondBr(while_predicate, body_bb, exit_bb);
@@ -2429,7 +2359,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   ir_builder_.CreateBr(header_bb);
 
   // Adds the exit block to the function and sets the insert point there.
-  compute_function_->getBasicBlockList().push_back(exit_bb);
+  compute_function_->function()->getBasicBlockList().push_back(exit_bb);
   ir_builder_.SetInsertPoint(exit_bb);
 
   return Status::OK();
@@ -2547,7 +2477,7 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
                                      const llvm_ir::IrArray& source_array) {
   unsigned primitive_type_size =
       ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  unsigned element_alignment = GCD(
+  unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
       primitive_type_size, MinimumAlignmentForPrimitiveType(primitive_type));
   llvm::Type* primitive_ptr_type = llvm::PointerType::getUnqual(
       llvm_ir::PrimitiveTypeToIrType(primitive_type, module_));
@@ -2594,6 +2524,65 @@ Status IrEmitter::HandleConcatenate(HloInstruction* concatenate) {
   return DefaultAction(concatenate);
 }
 
+Status IrEmitter::HandleConditional(HloInstruction* conditional) {
+  auto pred = conditional->operand(0);
+  auto true_arg = conditional->operand(1);
+  auto false_arg = conditional->operand(2);
+  TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()) &&
+               pred->shape().element_type() == PRED)
+      << "Predicate on a Conditional must be bool; got: "
+      << ShapeUtil::HumanString(pred->shape());
+
+  HloComputation* true_computation = conditional->true_computation();
+  HloComputation* false_computation = conditional->false_computation();
+  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                true_computation->root_instruction()->shape()))
+      << "Shape of conditional should be same as the shape of the true "
+      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
+      << " and "
+      << ShapeUtil::HumanString(true_computation->root_instruction()->shape());
+
+  TF_RET_CHECK(ShapeUtil::Equal(conditional->shape(),
+                                false_computation->root_instruction()->shape()))
+      << "Shape of conditional should be same as the shape of the false "
+      << "computation; got: " << ShapeUtil::HumanString(conditional->shape())
+      << " and "
+      << ShapeUtil::HumanString(false_computation->root_instruction()->shape());
+
+  llvm::Function* true_function =
+      FindOrDie(emitted_functions_, true_computation);
+  llvm::Function* false_function =
+      FindOrDie(emitted_functions_, false_computation);
+
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(conditional));
+  llvm::Value* conditional_result = GetEmittedValueFor(conditional);
+
+  // Generating:
+  //   if (pred)
+  //     cond_result = true_computation(true_operand)
+  //   else
+  //     cond_result = false_computation(false_operand)
+  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+      GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value");
+  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+      pred_value,
+      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
+      "boolean_predicate");
+  llvm_ir::LlvmIfData if_data =
+      llvm_ir::EmitIfThenElse(pred_cond, "conditional", &ir_builder_);
+
+  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+  EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)},
+                            conditional_result, IrName(conditional, "_true"));
+
+  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+  EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)},
+                            conditional_result, IrName(conditional, "_false"));
+
+  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+  return Status::OK();
+}
+
 Status IrEmitter::FinishVisit(HloInstruction* root) {
   // When this method is called, we should have already emitted an IR value for
   // the root (return) op. The IR value holds the address of the buffer holding
@@ -2605,53 +2594,56 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
   llvm::Value* root_value = GetEmittedValueFor(root);
   VLOG(2) << "  value: " << llvm_ir::DumpToString(*root_value);
 
-  // For the parallel cpu backend, we record the total for each embedded
-  // computation callee with its caller kCall HLO.
-  HloInstruction* hlo_to_lookup = nullptr;
-  if (parallel_cpu_backend_ && is_top_level_computation_) {
-    auto* computation = root->parent();
-    auto* entry_computation = computation->parent()->entry_computation();
-    if (computation != entry_computation) {
-      for (HloInstruction* instruction : entry_computation->instructions()) {
-        if (instruction->opcode() == HloOpcode::kCall &&
-            instruction->to_apply()->root_instruction() == root) {
-          hlo_to_lookup = instruction;
-          break;
+  llvm::Value* prof_counter = [&]() {
+    // For the parallel cpu backend, we record the total for each embedded
+    // computation callee with its caller kCall HLO.
+    if (parallel_cpu_backend_ && is_top_level_computation_) {
+      auto* computation = root->parent();
+      auto* entry_computation = computation->parent()->entry_computation();
+      if (computation != entry_computation) {
+        for (HloInstruction* instruction : entry_computation->instructions()) {
+          if (instruction->opcode() == HloOpcode::kCall &&
+              instruction->to_apply()->root_instruction() == root) {
+            return GetProfileCounterFor(*instruction);
+          }
         }
       }
     }
-  }
-  if (auto* prof_counter = GetProfileCounterFor(hlo_to_lookup)) {
+
+    // Otherwise we record the total computation cycles in a dedicated slot for
+    // the entry computation.
+    return GetProfileCounterForEntryComputation();
+  }();
+
+  if (prof_counter) {
     profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter);
   }
-
-  ir_builder_.CreateRetVoid();
   return Status::OK();
 }
 
-llvm::Value* IrEmitter::GetProfileCounterFor(const HloInstruction* hlo) {
-  string counter_name;
-  size_t prof_counter_idx;
-  if (!hlo_to_profile_idx_) {
+llvm::Value* IrEmitter::GetProfileCounterFor(const HloInstruction& hlo) {
+  auto it = hlo_to_profile_idx_.find(&hlo);
+  if (it == hlo_to_profile_idx_.end()) {
     return nullptr;
   }
-  if (hlo) {
-    auto it = hlo_to_profile_idx_->find(hlo);
-    if (it == hlo_to_profile_idx_->end()) {
-      return nullptr;
-    }
 
-    prof_counter_idx = it->second;
-    counter_name = IrName("prof_counter", hlo->name());
-  } else {
-    prof_counter_idx = hlo_to_profile_idx_->size();
-    counter_name = "prof_counter.computation";
-  }
+  size_t prof_counter_idx = it->second;
+  string counter_name = IrName("prof_counter", hlo.name());
   return ir_builder_.CreateGEP(GetProfileCountersArgument(),
                                ir_builder_.getInt64(prof_counter_idx),
                                AsStringRef(counter_name));
 }
 
+llvm::Value* IrEmitter::GetProfileCounterForEntryComputation() {
+  if (entry_computation_profile_idx_) {
+    return ir_builder_.CreateGEP(
+        GetProfileCountersArgument(),
+        ir_builder_.getInt64(*entry_computation_profile_idx_),
+        "prof_counter.computation");
+  }
+  return nullptr;
+}
+
 void IrEmitter::ProfilingState::UpdateProfileCounter(
     llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter,
     llvm::Value* cycle_end, llvm::Value* cycle_start) {
@@ -2723,14 +2715,14 @@ void IrEmitter::ProfilingState::RecordCompleteComputation(
 
 Status IrEmitter::Preprocess(HloInstruction* hlo) {
   VLOG(3) << "Visiting: " << hlo->ToString();
-  if (hlo_to_profile_idx_ && hlo_to_profile_idx_->count(hlo)) {
+  if (hlo_to_profile_idx_.count(hlo)) {
     profiling_state_.RecordCycleStart(&ir_builder_, hlo);
   }
   return Status::OK();
 }
 
 Status IrEmitter::Postprocess(HloInstruction* hlo) {
-  if (auto* prof_counter = GetProfileCounterFor(hlo)) {
+  if (auto* prof_counter = GetProfileCounterFor(*hlo)) {
     profiling_state_.RecordCycleDelta(&ir_builder_, hlo, prof_counter);
   }
   return Status::OK();
@@ -2766,45 +2758,16 @@ llvm::Type* IrEmitter::IrShapeType(const Shape& shape) {
   return llvm_ir::ShapeToIrType(shape, module_);
 }
 
-std::vector<llvm::Type*> IrEmitter::GetComputeFunctionParams() {
-  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
-  llvm::Type* i8_ptr_ptr_type = i8_ptr_type->getPointerTo();
-  llvm::Type* i64_ptr_type = llvm::Type::getInt64PtrTy(module_->getContext());
-  std::vector<llvm::Type*> compute_function_params(
-      {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
-  if (num_dynamic_loop_bounds_ > 0) {
-    compute_function_params.push_back(i64_ptr_type);
-  }
-  if (hlo_to_profile_idx_) {
-    compute_function_params.push_back(i64_ptr_type);
-  }
-  return compute_function_params;
-}
-
-llvm::Argument* IrEmitter::GetResultArgument() {
-  return GetArg(compute_function_, 0);
-}
-
-llvm::Argument* IrEmitter::GetProfileCountersArgument() {
-  const int64 arg_index = num_dynamic_loop_bounds_ > 0 ? 5 : 4;
-  return hlo_to_profile_idx_ ? GetArg(compute_function_, arg_index) : nullptr;
+llvm::Value* IrEmitter::GetProfileCountersArgument() {
+  return compute_function_->profile_counters_arg();
 }
 
 llvm::Value* IrEmitter::GetTempBuffersArgument() {
-  return GetArg(compute_function_, 3);
-}
-
-llvm::Value* IrEmitter::GetDynamicLoopBound(const int64 offset) {
-  CHECK_GT(num_dynamic_loop_bounds_, 0);
-  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
-  llvm::Argument* loop_bounds_arg = GetArg(compute_function_, 4);
-  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
-  return ir_builder_.CreateLoad(ir_builder_.CreateGEP(
-      loop_bounds_arg, ir_builder_.getInt64(offset), AsStringRef(name)));
+  return compute_function_->temp_buffers_arg();
 }
 
 llvm::Value* IrEmitter::GetExecutableRunOptionsArgument() {
-  return GetArg(compute_function_, 1);
+  return compute_function_->exec_run_options_arg();
 }
 
 llvm::Value* IrEmitter::EmitTempBufferPointer(
@@ -2869,42 +2832,6 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
       AsStringRef(tensorflow::strings::StrCat(name, "_return_value")));
 }
 
-// Emits code to allocate an array of parameter address pointers, and store
-// each address from 'parameter_addresses'.
-// Returns an array of compute function call arguments (including parameter
-// address buffer).
-std::vector<llvm::Value*> IrEmitter::GetArrayFunctionCallArguments(
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
-  llvm::Value* parameter_addresses_buffer =
-      llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-          ir_builder_.getInt8PtrTy(),
-          ir_builder_.getInt32(parameter_addresses.size()),
-          tensorflow::strings::StrCat(name, "_parameter_addresses"),
-          &ir_builder_);
-  for (size_t i = 0; i < parameter_addresses.size(); ++i) {
-    llvm::Value* parameter_as_i8ptr = ir_builder_.CreateBitCast(
-        parameter_addresses[i], ir_builder_.getInt8PtrTy(),
-        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
-                                                "_address_as_i8ptr")));
-    llvm::Value* slot_in_param_adresses = ir_builder_.CreateInBoundsGEP(
-        parameter_addresses_buffer, {ir_builder_.getInt64(i)});
-    ir_builder_.CreateStore(parameter_as_i8ptr, slot_in_param_adresses);
-  }
-
-  const auto to_int8_ptr = [this](llvm::Value* ptr) {
-    return ir_builder_.CreatePointerCast(ptr, ir_builder_.getInt8PtrTy());
-  };
-  std::vector<llvm::Value*> arguments{
-      to_int8_ptr(return_value_buffer),
-      to_int8_ptr(GetExecutableRunOptionsArgument()),
-      parameter_addresses_buffer, GetTempBuffersArgument()};
-  if (auto* profile_counters = GetProfileCountersArgument()) {
-    arguments.push_back(profile_counters);
-  }
-  return arguments;
-}
-
 // Emits a core function call based on the following pseudo-code.
 //
 //   char** parameter_addresses_buffer =
@@ -2920,8 +2847,12 @@ void IrEmitter::EmitArrayFunctionCallInto(
     tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
     llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
   ir_builder_.CreateCall(
-      function, GetArrayFunctionCallArguments(parameter_addresses,
-                                              return_value_buffer, name));
+      function, GetArrayFunctionCallArguments(
+                    parameter_addresses, &ir_builder_, name,
+                    /*return_value_buffer=*/return_value_buffer,
+                    /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
+                    /*temp_buffers_arg=*/GetTempBuffersArgument(),
+                    /*profile_counters_arg=*/GetProfileCountersArgument()));
 }
 
 llvm::Value* IrEmitter::EmitArrayFunctionCall(
@@ -2941,117 +2872,13 @@ llvm::Value* IrEmitter::EmitArrayFunctionCall(
   return return_value_buffer;
 }
 
-// Emits a call to a runtime fork/join function which dispatches parallel
-// calls to 'parallel_function' (and joins threads before returning).
-Status IrEmitter::EmitParallelForkJoin(
-    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::Value* output_address, HloComputation* computation,
-    llvm::Function* parallel_function) {
-  HloInstruction* root = computation->root_instruction();
-
-  // Build ParallelForkJoin function type.
-  std::vector<llvm::Type*> compute_function_params = GetComputeFunctionParams();
-  // Number of parallel compute functions.
-  compute_function_params.push_back(ir_builder_.getInt32Ty());
-  // Array of partitions. There is an array element for each
-  // partition x partition_dim x 2 (for dimension start and limit).
-  compute_function_params.push_back(
-      llvm::Type::getInt64PtrTy(module_->getContext()));
-  // Number of partitioned most-major dimensions in 'root.shape'.
-  compute_function_params.push_back(ir_builder_.getInt32Ty());
-  // Function pointer for compute function to be dispatched in parallel.
-  compute_function_params.push_back(
-      llvm::Type::getInt8PtrTy(module_->getContext()));
-
-  llvm::FunctionType* fork_join_type = llvm::FunctionType::get(
-      /*Result=*/llvm::Type::getVoidTy(module_->getContext()),
-      /*Params=*/compute_function_params,
-      /*isVarArg=*/false);
-
-  llvm::Function* fork_join_func =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          runtime::kParallelForkJoinSymbolName, fork_join_type));
-  fork_join_func->setCallingConv(llvm::CallingConv::C);
-  fork_join_func->setDoesNotThrow();
-
-  // Add common compute function arguments.
-  const string name = computation->name();
-  std::vector<llvm::Value*> arguments =
-      GetArrayFunctionCallArguments(parameter_addresses, output_address, name);
-
-  // Create ShapePartitionIterator to generate all partitions of 'root.shape'.
-  ShapePartitionIterator partition_iterator(root->shape(),
-                                            root->outer_dimension_partitions());
-  const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
-  // Add argument specifying the number of parallel partitions.
-  arguments.push_back(ir_builder_.getInt32(num_partitions));
-
-  // The number of partitioned most-major dimensions in 'root.shape'.
-  const int32 num_partitioned_dims = root->outer_dimension_partitions().size();
-  // A dimension partition consists of two elements: [start_index, limit_index).
-  const int32 dim_partition_size = 2;
-  // Calculate array partition stride.
-  const int32 array_partition_stride =
-      num_partitioned_dims * dim_partition_size;
-  // Calculate the total number of elements in the partition array.
-  const int32 partition_array_size =
-      dim_partition_size * num_partitioned_dims * num_partitions;
-
-  // Store dimension partition values as llvm constants in 'partitions'.
-  // See comments in runtime_fork_join.cc for array layout description.
-  std::vector<llvm::Constant*> partitions(partition_array_size);
-  for (int32 i = 0; i < num_partitions; ++i) {
-    std::vector<std::pair<int64, int64>> dim_partitions =
-        partition_iterator.GetPartition(i);
-    CHECK_EQ(num_partitioned_dims, dim_partitions.size());
-    const int32 partition_index = i * array_partition_stride;
-    for (int32 j = 0; j < num_partitioned_dims; ++j) {
-      const std::pair<int64, int64>& dim_partition = dim_partitions[j];
-      const int32 index = partition_index + j * dim_partition_size;
-      // Store partition [dim_start, dim_limit) intervals for each dimension.
-      partitions[index] = ir_builder_.getInt64(dim_partition.first);
-      partitions[index + 1] =
-          ir_builder_.getInt64(dim_partition.first + dim_partition.second);
-    }
-  }
-
-  // Create global variable out of dimension partitions in 'partitions'.
-  llvm::ArrayType* partitions_array_type =
-      llvm::ArrayType::get(ir_builder_.getInt64Ty(), partition_array_size);
-  llvm::Constant* partitions_array =
-      llvm::ConstantArray::get(partitions_array_type, partitions);
-  llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
-      /*Module=*/*module_,
-      /*Type=*/partitions_array_type,
-      /*isConstant=*/true,
-      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/partitions_array,
-      /*Name=*/
-      AsStringRef(
-          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
-
-  // Add argument specifying parallel dimension partitions.
-  arguments.push_back(ir_builder_.CreateBitCast(
-      global_partitions_array,
-      llvm::Type::getInt64PtrTy(module_->getContext())));
-  // Add argument specifying the number of partitioned most-major dimensions.
-  arguments.push_back(ir_builder_.getInt32(num_partitioned_dims));
-  // Add argument for parallel compute function pointer.
-  arguments.push_back(
-      ir_builder_.CreateBitCast(parallel_function, ir_builder_.getInt8PtrTy()));
-  // Emit call to parallel fork/join.
-  ir_builder_.CreateCall(fork_join_func, arguments);
-
-  return Status::OK();
-}
-
 Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) {
   llvm::Value* addr;
   const Shape& target_shape = op->shape();
   if (op == op->parent()->root_instruction()) {
     // For the root node, we write directly to the output buffer of the
     // function.
-    llvm::Argument* retval = GetResultArgument();
+    llvm::Argument* retval = compute_function_->result_arg();
     if (!ShapeUtil::IsNil(target_shape)) {
       llvm::AttrBuilder attr_builder;
       attr_builder.addAlignmentAttr(MinimumAlignmentForShape(target_shape));
@@ -3112,8 +2939,13 @@ Status IrEmitter::EmitTargetElementLoop(
 
   } else {
     if (ShouldEmitParallelLoopFor(*target_op)) {
-      TF_RETURN_IF_ERROR(EmitParallelTargetElementLoop(
-          target_shape, element_generator, IrName(target_op), &target_array));
+      // Emit code to read dynamic loop bounds from compute function argument.
+      std::vector<std::pair<llvm::Value*, llvm::Value*>> dynamic_loop_bounds =
+          compute_function_->GetDynamicLoopBounds();
+      // Emit parallel loop with dynamic loop bounds for most-major dimensions.
+      TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, target_array,
+                                             &dynamic_loop_bounds, &ir_builder_)
+                             .EmitLoop(IrName(target_op)));
     } else {
       TF_RETURN_IF_ERROR(
           llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
@@ -3123,60 +2955,6 @@ Status IrEmitter::EmitTargetElementLoop(
   return Status::OK();
 }
 
-Status IrEmitter::EmitParallelTargetElementLoop(
-    const Shape& target_shape,
-    const llvm_ir::ElementGenerator& element_generator,
-    tensorflow::StringPiece loop_name, llvm_ir::IrArray* target_array) {
-  CHECK(!ShapeUtil::IsTuple(target_shape));
-  CHECK(!ShapeUtil::IsScalar(target_shape));
-
-  // Emit code to read dynamic loop bounds from function argument 4.
-  std::vector<llvm::Value*> dynamic_loop_bounds(2 * num_dynamic_loop_bounds_);
-  for (int i = 0; i < 2 * num_dynamic_loop_bounds_; ++i) {
-    dynamic_loop_bounds[i] = GetDynamicLoopBound(i);
-  }
-
-  llvm_ir::ForLoopNest loop_nest(loop_name, &ir_builder_);
-  const int64 num_dims = target_shape.dimensions_size();
-  llvm_ir::IrArray::Index array_index(num_dims);
-
-  // Add loops from outer-most to inner-most dimensions.
-  for (int i = target_shape.layout().minor_to_major_size() - 1; i >= 0; --i) {
-    const int64 dimension = target_shape.layout().minor_to_major(i);
-    const int bounds_index = num_dims - 1 - i;
-    if (bounds_index < num_dynamic_loop_bounds_) {
-      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
-      // are read from ir function dynamic loop bounds argument.
-      llvm::Value* start_index = dynamic_loop_bounds[bounds_index * 2 + 0];
-      llvm::Value* end_index = dynamic_loop_bounds[bounds_index * 2 + 1];
-
-      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
-          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
-          start_index, end_index);
-      array_index[dimension] = loop->GetIndVarValue();
-    } else {
-      // Emit static loop bounds for this dimension.
-      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
-          /*start_index=*/0,
-          /*end_index=*/target_shape.dimensions(dimension),
-          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
-      array_index[dimension] = loop->GetIndVarValue();
-    }
-  }
-  // Point IR builder at inner loop BB.
-  SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), &ir_builder_);
-
-  // Emit loop body.
-  TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
-                      element_generator(array_index));
-  target_array->EmitWriteArrayElement(array_index, target_element,
-                                      &ir_builder_);
-  // Point IR builder at outer loop exit BB.
-  SetToFirstInsertPoint(loop_nest.GetOuterLoopExitBasicBlock(), &ir_builder_);
-
-  return Status::OK();
-}
-
 Status IrEmitter::EmitMemcpy(const HloInstruction& source,
                              const HloInstruction& destination) {
   llvm::Value* source_value = GetEmittedValueFor(&source);
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 5d061e11e3c9e07bdcfdc749711e4369ec2bea2a..9bc2d9739757168562b8dc7b482eff203f303766 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <stddef.h>
 #include <map>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -30,6 +31,7 @@ limitations under the License.
 #include "llvm/Target/TargetMachine.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/external_constant_pool.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -105,15 +107,18 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // llvm_module: the LLVM module to emit IR into.
   // hlo_to_profile_idx: the mapping from HLO to its index in the profiling
   //                     array.
+  // entry_computation_profile_idx: the index in the profiling array
+  //                                for the entry computation.
   // external_constant_pool: if non-null, points to an ExternalConstantPool
   //                         instance into which the Ir emitter can spill
   //                         constants.
-  IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
-            llvm::Module* llvm_module,
-            const std::unordered_map<const HloInstruction*, size_t>*
-                hlo_to_profile_idx,
-            llvm::TargetMachine* target_machine,
-            ExternalConstantPool* external_constant_pool);
+  IrEmitter(
+      const HloModule& hlo_module, const BufferAssignment& assignment,
+      llvm::Module* llvm_module,
+      std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
+      tensorflow::gtl::optional<size_t> entry_computation_profile_idx,
+      llvm::TargetMachine* target_machine,
+      ExternalConstantPool* external_constant_pool);
   ~IrEmitter() override;
 
   // Emit and return the given HLO computation as an LLVM IR
@@ -171,11 +176,13 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleReduceWindow(HloInstruction* reduce_window) override;
   Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override;
   Status HandleSend(HloInstruction* send) override;
+  Status HandleSendDone(HloInstruction* send_done) override;
   Status HandleSlice(HloInstruction* slice) override;
   Status HandleDynamicSlice(HloInstruction* dynamic_slice) override;
   Status HandleDynamicUpdateSlice(
       HloInstruction* dynamic_update_slice) override;
   Status HandleRecv(HloInstruction* recv) override;
+  Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandlePad(HloInstruction* pad) override;
   Status HandleTuple(HloInstruction* tuple) override;
   Status HandleMap(HloInstruction* map) override;
@@ -184,6 +191,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleWhile(HloInstruction* xla_while) override;
   Status HandleConcatenate(HloInstruction* concatenate) override;
+  Status HandleConditional(HloInstruction* conditional) override;
   Status FinishVisit(HloInstruction* root) override;
 
   Status Preprocess(HloInstruction* hlo) override;
@@ -195,7 +203,12 @@ class IrEmitter : public DfsHloVisitorWithDefault {
 
   // Convenience function to generate a GEP into the profile counter parameter
   // which would correspond to the index for a given HLO.
-  llvm::Value* GetProfileCounterFor(const HloInstruction* hlo);
+  llvm::Value* GetProfileCounterFor(const HloInstruction& hlo);
+
+  // Convenience function to generate a GEP into the profile counter parameter
+  // corresponding to the index for the entry computation.  Returns nullptr if
+  // profiling the entry computation is disabled.
+  llvm::Value* GetProfileCounterForEntryComputation();
 
   // Gets the IR Value emitted previously for the given hlo.
   //
@@ -223,16 +236,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // Convenience function to get the IR type matching the given shape.
   llvm::Type* IrShapeType(const Shape& shape);
 
-  // Returns an array of compute function parameter types.
-  std::vector<llvm::Type*> GetComputeFunctionParams();
-
-  // Get the llvm::Value* that represents the "retval" argument of the
-  // computation function being emitted by this emitter.
-  llvm::Argument* GetResultArgument();
-
   // Get the llvm::Value* that represents the "prof_counters" argument of the
   // computation function being emitted by this emitter.
-  llvm::Argument* GetProfileCountersArgument();
+  llvm::Value* GetProfileCountersArgument();
 
   // Get the xla::ExecutableRunOptions that represents the "run_options"
   // argument of the computation function being emitted by this emitter.
@@ -242,11 +248,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // computation function being emitted by this emitter.
   llvm::Value* GetTempBuffersArgument();
 
-  // Emit ir to read and return the ir value for the dynamic loop bound at
-  // 'offset' from the "dynamic_loop_bounds" argument of the computation
-  // function being emitted by this emitter.
-  llvm::Value* GetDynamicLoopBound(const int64 offset);
-
   // Emits code that computes the address of the given temporary buffer to the
   // function. target_shape is the shape of this temporary buffer.
   // The returned Value's type is a pointer to element_type.
@@ -300,18 +301,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
       tensorflow::StringPiece name);
 
-  // Returns an array of compute function call arguments.
-  std::vector<llvm::Value*> GetArrayFunctionCallArguments(
-      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-      llvm::Value* return_value_buffer, tensorflow::StringPiece name);
-
-  // Emits a call to a runtime fork/join function which dispatches parallel
-  // calls to 'parallel_function' (and joins threads before returning).
-  Status EmitParallelForkJoin(
-      tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-      llvm::Value* output_address, HloComputation* computation,
-      llvm::Function* parallel_function);
-
   // Verifies that the element types of all of the given operand instructions
   // match and are of one of the given supported types.
   Status ElementTypesSameAndSupported(
@@ -336,15 +325,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       HloInstruction* target_op, tensorflow::StringPiece desc,
       const llvm_ir::ElementGenerator& element_generator);
 
-  // Emit IR to perform a computation for every element in a partition/slice of
-  // 'target_shape'. The loop bounds for the outer-dimension partitions are
-  // passed into the compute function as a runtime argument (accessible from
-  // GetDynamicLoopBound).
-  Status EmitParallelTargetElementLoop(
-      const Shape& target_shape,
-      const llvm_ir::ElementGenerator& element_generator,
-      tensorflow::StringPiece loop_name, llvm_ir::IrArray* target_array);
-
   // Emits a memcpy from the source instruction's result value to the
   // destination's.  Both source and destination must have an entry in the
   // emitted_value_ table.
@@ -466,12 +446,15 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       thread_local_buffers_;
 
   // The following fields track the IR emission state. According to LLVM memory
-  // management rules, their memory is owned by the module.
-  llvm::Function* compute_function_;
+  // management rules, their memory is owned by the module (Note that IrFunction
+  // creates the encapsulated llvm::Function s.t. it is added to the llvm
+  // module's function list).
+  std::unique_ptr<IrFunction> compute_function_;
   llvm::IRBuilder<> ir_builder_;
 
   // Maps HLOs to their index into the profile counter array.
-  const std::unordered_map<const HloInstruction*, size_t>* hlo_to_profile_idx_;
+  std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
+  const tensorflow::gtl::optional<size_t> entry_computation_profile_idx_;
 
   // Maps HLOs to Values emitted for them.
   std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;
@@ -479,7 +462,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   llvm_ir::AliasAnalysis alias_analysis_;
 
   // The number of root instruction outer dimensions used in parallel loop
-  // emission (EmitParallelTargetElementLoop).
+  // emission (ParallelLoopEmitter).
   int64 num_dynamic_loop_bounds_ = 0;
 
   // Returns whether the given instruction should be emitted as a parallel loop.
@@ -499,7 +482,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
           use_rdtscp_(false),
           prof_counters_(nullptr) {}
     ProfilingState(bool is_top_level_computation, bool use_rdtscp,
-                   llvm::Argument* prof_counters)
+                   llvm::Value* prof_counters)
         : is_top_level_computation_(is_top_level_computation),
           use_rdtscp_(use_rdtscp),
           prof_counters_(prof_counters) {}
@@ -532,7 +515,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
     bool use_rdtscp_;
 
     // The argument which corresponds to the profile counter buffer.
-    llvm::Argument* prof_counters_;
+    llvm::Value* prof_counters_;
 
     // The first read cycle counter in the program.
     llvm::Value* first_read_cycle_start_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca8c290dd1c4959e42026c3917d37f8fc95a1011
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -0,0 +1,333 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <iterator>
+
+#include "tensorflow/compiler/xla/service/cpu/ir_function.h"
+
+#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
+#include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+
+namespace xla {
+
+namespace {
+using llvm_ir::AsStringRef;
+}  // namespace
+
+namespace cpu {
+
+static std::vector<llvm::Type*> GetComputeFunctionParams(
+    llvm::Module* llvm_module, const int64 num_dynamic_loop_bounds) {
+  llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(llvm_module->getContext());
+  llvm::Type* i8_ptr_ptr_type = i8_ptr_type->getPointerTo();
+  llvm::Type* i64_ptr_type =
+      llvm::Type::getInt64PtrTy(llvm_module->getContext());
+  std::vector<llvm::Type*> compute_function_params(
+      {i8_ptr_type, i8_ptr_type, i8_ptr_ptr_type, i8_ptr_ptr_type});
+  if (num_dynamic_loop_bounds > 0) {
+    compute_function_params.push_back(i64_ptr_type);
+  }
+  compute_function_params.push_back(i64_ptr_type);
+  return compute_function_params;
+}
+
+IrFunction::IrFunction(const string& function_name,
+                       llvm::Function::LinkageTypes linkage,
+                       const bool optimize_for_size_requested,
+                       const bool enable_fast_math, llvm::Module* llvm_module,
+                       llvm::IRBuilder<>* ir_builder,
+                       int64 num_dynamic_loop_bounds)
+    : ir_builder_(ir_builder),
+      llvm_module_(llvm_module),
+      caller_insert_point_guard_(*ir_builder),
+      num_dynamic_loop_bounds_(num_dynamic_loop_bounds) {
+  Initialize(function_name, linkage, optimize_for_size_requested,
+             enable_fast_math);
+}
+
+IrFunction::~IrFunction() {
+  // Emit function return value.
+  ir_builder_->CreateRetVoid();
+}
+
+DynamicLoopBounds IrFunction::GetDynamicLoopBounds() {
+  DynamicLoopBounds dynamic_loop_bounds(num_dynamic_loop_bounds_);
+  for (int i = 0; i < num_dynamic_loop_bounds_; ++i) {
+    dynamic_loop_bounds[i].first = GetDynamicLoopBound(i * 2 + 0);
+    dynamic_loop_bounds[i].second = GetDynamicLoopBound(i * 2 + 1);
+  }
+  return dynamic_loop_bounds;
+}
+
+void IrFunction::Initialize(const string& function_name,
+                            llvm::Function::LinkageTypes linkage,
+                            const bool optimize_for_size_requested,
+                            const bool enable_fast_math) {
+  // The function signature is:
+  //   void function(i8* retval, i8* run_options, i8** params, i8** temps,
+  //                 i64* dynamic_loop_bounds, i64* prof_counters)
+  //
+  // retval: points to the returned value.
+  // params: address of an array with pointers to parameters.
+  // temps: address of an array with pointers to temporary buffers.
+  //
+  // Therefore, the generated function's signature (FunctionType) is statically
+  // determined - parameter unpacking is done in code generated into the
+  // function, rather than by a prologue dictated by the platform ABI.
+  //
+  //                      /--------------\
+  //   retval ----------> | return value |
+  //                      \--------------/
+  //
+  //                      /-------------------------------\
+  //   run_options -----> | xla::ExecutableRunOptions |
+  //                      \-------------------------------/
+  //
+  //                     /---------------------------------------------\
+  //   params -------->  |  param 0  |  param 1  | ..... |  param N-1  |
+  //                     |   addr    |   addr    |       |   addr      |
+  //                     \---------------------------------------------/
+  //                          |           |                   |
+  //                          |           |                   |
+  //                          V           V                   V
+  //                     /---------\  /---------\         /-----------\
+  //                     | param 0 |  | param 1 |         | param N-1 |
+  //                     \---------/  \---------/         \-----------/
+  //
+  //                     /---------------------------------------------\
+  //   temps --------->  |  temp  0  |  temp  1  | ..... |  temp  N-1  |
+  //                     |   addr    |   addr    |       |   addr      |
+  //                     \---------------------------------------------/
+  //                          |           |                   |
+  //                          |           |                   |
+  //                          V           V                   V
+  //                     /---------\  /---------\         /-----------\
+  //                     | temp  0 |  | temp  1 |         | temp  N-1 |
+  //                     \---------/  \---------/         \-----------/
+  //
+  //                        /--------------------------------------------\
+  // dynamic loop bounds -> | outer_dim0_start | outer_dim0_limit | .....|
+  //  (elided for aot)      \--------------------------------------------/
+  //
+  //                     /---------------------------------------------\
+  //   prof counters ->  | counter 0 | counter 1 | ..... | counter N-1 |
+  //                     \---------------------------------------------/
+
+  // Even though the type of params and temps is void** in the host's view, in
+  // LLVM IR this is represented by i8*, similarly to void*. It's up to the code
+  // to use GEPs to unravel the indirection layers.
+  llvm::FunctionType* function_type = llvm::FunctionType::get(
+      /*Result=*/llvm::Type::getVoidTy(llvm_module_->getContext()),
+      /*Params=*/
+      GetComputeFunctionParams(llvm_module_, num_dynamic_loop_bounds_),
+      /*isVarArg=*/false);
+
+  // Functions with local linkage get an inlining bonus.  Because we know
+  // a-priori that embedded functions (non-entry functions) will not have its
+  // name resolved, give it local linkage.
+  function_ =
+      llvm_ir::CreateFunction(function_type, linkage,
+                              /*enable_fast_math=*/enable_fast_math,
+                              /*optimize_for_size=*/optimize_for_size_requested,
+                              function_name, llvm_module_);
+
+  // Set meaningful names for the function's arguments: useful for debugging.
+  llvm::Function::arg_iterator arg_iter = function_->arg_begin();
+  arg_iter->setName("retval");
+  result_arg_ = &*arg_iter;
+  (++arg_iter)->setName("run_options");
+  exec_run_options_arg_ = &*arg_iter;
+  (++arg_iter)->setName("params");
+  parameters_arg_ = &*arg_iter;
+  (++arg_iter)->setName("temps");
+  temp_buffers_arg_ = &*arg_iter;
+  if (num_dynamic_loop_bounds_ > 0) {
+    (++arg_iter)->setName("dynamic_loop_bounds");
+    dynamic_loop_bounds_arg_ = &*arg_iter;
+  }
+  (++arg_iter)->setName("prof_counters");
+  profile_counters_arg_ = &*arg_iter;
+
+  // We know a-priori that the function arguments are guaranteed to point to
+  // disjoint objects.
+  llvm::Argument* retval = result_arg();
+  for (llvm::Argument& argument : function_->args()) {
+    // However, the return buffer aliases the temporaries and thus cannot be
+    // marked noalias.
+    if (&argument == retval) {
+      continue;
+    }
+    function_->addAttribute(argument.getArgNo() + 1, llvm::Attribute::NoAlias);
+  }
+
+  ir_builder_->SetInsertPoint(llvm::BasicBlock::Create(
+      /*Context=*/llvm_module_->getContext(),
+      /*Name=*/"entry",
+      /*Parent=*/function_));
+}
+
+llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
+  CHECK_GT(num_dynamic_loop_bounds_, 0);
+  CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
+  string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
+  return ir_builder_->CreateLoad(
+      ir_builder_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
+                             ir_builder_->getInt64(offset), AsStringRef(name)));
+}
+
+// Emits code to allocate an array of parameter address pointers, and store
+// each address from 'parameter_addresses'.
+// Returns an array of compute function call arguments (including parameter
+// address buffer).
+std::vector<llvm::Value*> GetArrayFunctionCallArguments(
+    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
+    llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) {
+  llvm::Value* parameter_addresses_buffer =
+      llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+          ir_builder->getInt8PtrTy(),
+          ir_builder->getInt32(parameter_addresses.size()),
+          tensorflow::strings::StrCat(name, "_parameter_addresses"),
+          ir_builder);
+  for (size_t i = 0; i < parameter_addresses.size(); ++i) {
+    llvm::Value* parameter_as_i8ptr = ir_builder->CreateBitCast(
+        parameter_addresses[i], ir_builder->getInt8PtrTy(),
+        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
+                                                "_address_as_i8ptr")));
+    llvm::Value* slot_in_param_adresses = ir_builder->CreateInBoundsGEP(
+        parameter_addresses_buffer, {ir_builder->getInt64(i)});
+    ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_adresses);
+  }
+
+  const auto to_int8_ptr = [=](llvm::Value* ptr) {
+    return ir_builder->CreatePointerCast(ptr, ir_builder->getInt8PtrTy());
+  };
+  std::vector<llvm::Value*> arguments{
+      to_int8_ptr(return_value_buffer), to_int8_ptr(exec_run_options_arg),
+      parameter_addresses_buffer, temp_buffers_arg};
+  if (profile_counters_arg != nullptr) {
+    arguments.push_back(profile_counters_arg);
+  }
+  return arguments;
+}
+
+// Emits a call to a runtime fork/join function which dispatches parallel
+// calls to 'parallel_function' (and joins threads before returning).
+Status EmitCallToParallelForkJoin(
+    const std::vector<llvm::Value*>& arguments, const Shape& shape,
+    const std::vector<int64>& dimension_partition_counts,
+    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
+    const string& name) {
+  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+
+  // Build ParallelForkJoin function type.
+  std::vector<llvm::Type*> compute_function_params =
+      GetComputeFunctionParams(module, /*num_dynamic_loop_bounds=*/0);
+  // Number of parallel compute functions.
+  compute_function_params.push_back(ir_builder->getInt32Ty());
+  // Array of partitions. There is an array element for each
+  // partition x partition_dim x 2 (for dimension start and limit).
+  compute_function_params.push_back(
+      llvm::Type::getInt64PtrTy(module->getContext()));
+  // Number of partitioned most-major dimensions in 'shape'.
+  compute_function_params.push_back(ir_builder->getInt32Ty());
+  // Function pointer for compute function to be dispatched in parallel.
+  compute_function_params.push_back(
+      llvm::Type::getInt8PtrTy(module->getContext()));
+
+  llvm::FunctionType* fork_join_type = llvm::FunctionType::get(
+      /*Result=*/llvm::Type::getVoidTy(module->getContext()),
+      /*Params=*/compute_function_params,
+      /*isVarArg=*/false);
+
+  llvm::Function* fork_join_func =
+      llvm::cast<llvm::Function>(module->getOrInsertFunction(
+          runtime::kParallelForkJoinSymbolName, fork_join_type));
+  fork_join_func->setCallingConv(llvm::CallingConv::C);
+  fork_join_func->setDoesNotThrow();
+
+  // Add common compute function arguments.
+  std::vector<llvm::Value*> fork_join_arguments(arguments);
+
+  // Create ShapePartitionIterator to generate all partitions of 'shape'.
+  ShapePartitionIterator partition_iterator(shape, dimension_partition_counts);
+  const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
+  // Add argument specifying the number of parallel partitions.
+  fork_join_arguments.push_back(ir_builder->getInt32(num_partitions));
+
+  // The number of partitioned most-major dimensions in 'shape'.
+  const int32 num_partitioned_dims = dimension_partition_counts.size();
+  // A dimension partition consists of two elements: [start_index, limit_index).
+  const int32 dim_partition_size = 2;
+  // Calculate array partition stride.
+  const int32 array_partition_stride =
+      num_partitioned_dims * dim_partition_size;
+  // Calculate the total number of elements in the partition array.
+  const int32 partition_array_size =
+      dim_partition_size * num_partitioned_dims * num_partitions;
+
+  // Store dimension partition values as llvm constants in 'partitions'.
+  // See comments in runtime_fork_join.cc for array layout description.
+  std::vector<llvm::Constant*> partitions(partition_array_size);
+  for (int32 i = 0; i < num_partitions; ++i) {
+    std::vector<std::pair<int64, int64>> dim_partitions =
+        partition_iterator.GetPartition(i);
+    CHECK_EQ(num_partitioned_dims, dim_partitions.size());
+    const int32 partition_index = i * array_partition_stride;
+    for (int32 j = 0; j < num_partitioned_dims; ++j) {
+      const std::pair<int64, int64>& dim_partition = dim_partitions[j];
+      const int32 index = partition_index + j * dim_partition_size;
+      // Store partition [dim_start, dim_limit) intervals for each dimension.
+      partitions[index] = ir_builder->getInt64(dim_partition.first);
+      partitions[index + 1] =
+          ir_builder->getInt64(dim_partition.first + dim_partition.second);
+    }
+  }
+
+  // Create global variable out of dimension partitions in 'partitions'.
+  llvm::ArrayType* partitions_array_type =
+      llvm::ArrayType::get(ir_builder->getInt64Ty(), partition_array_size);
+  llvm::Constant* partitions_array =
+      llvm::ConstantArray::get(partitions_array_type, partitions);
+  llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
+      /*M=*/*module,
+      /*Ty=*/partitions_array_type,
+      /*isConstant=*/true,
+      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+      /*Initializer=*/partitions_array,
+      /*Name=*/
+      AsStringRef(
+          tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
+
+  // Add argument specifying parallel dimension partitions.
+  fork_join_arguments.push_back(ir_builder->CreateBitCast(
+      global_partitions_array,
+      llvm::Type::getInt64PtrTy(module->getContext())));
+  // Add argument specifying the number of partitioned most-major dimensions.
+  fork_join_arguments.push_back(ir_builder->getInt32(num_partitioned_dims));
+  // Add argument for parallel compute function pointer.
+  fork_join_arguments.push_back(
+      ir_builder->CreateBitCast(parallel_function, ir_builder->getInt8PtrTy()));
+  // Emit call to parallel fork/join.
+  ir_builder->CreateCall(fork_join_func, fork_join_arguments);
+
+  return Status::OK();
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fd2da4dce23982ed030f3aa8ec604182d0ebab8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -0,0 +1,134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+namespace cpu {
+
+// IrFunction creates and encapsulates an llvm::Function, exposing methods to
+// emitters for function and function argument access.
+// The llvm::Function is created with the standard function signature
+// used in the XLA CPU backend (see ir_function.cc for argument details).
+// In addtion IrFunction saves the callers IR insert point during contruction,
+// and restores it after desctruction.
+//
+// Example usage:
+//
+//    // Create and initialize new IrFunction.
+//    std::unique_ptr<IrFunction> compute_function(new IrFunction(...));
+//    // Emit IR for function body using IrFunction helper methods.
+//    ...
+//    // Store reference to llvm::Function for future invocation.
+//    ir_functions.push_back(compute_function.function());
+//    // Delete IrFunction (finalizes IR function and restores caller insertion
+//    // point).
+//    compute_function.reset();
+//
+
+class IrFunction {
+ public:
+  IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage,
+             const bool optimize_for_size_requested,
+             const bool enable_fast_math, llvm::Module* llvm_module,
+             llvm::IRBuilder<>* ir_builder, int64 num_dynamic_loop_bounds);
+  ~IrFunction();
+
+  // Emit ir to read and return the set of ir values representing the dynamic
+  // loop bounds argument of this function.
+  // Each element in returned vector is a pair of ir values representing
+  // the loop bounds for a specific dimension, where the first element of the
+  // pair is the dimension start index, and the second element of the pair
+  // is the dimension limit.
+  // EX: [dimension_i_index_start_ir_value, dimension_i_index_limit_ir_value]
+  //
+  DynamicLoopBounds GetDynamicLoopBounds();
+
+  // Returns the encapculated llvm::Function.
+  llvm::Function* function() { return function_; }
+
+  // Get the llvm::Value* that represents this functions "retval" argument.
+  llvm::Argument* result_arg() { return result_arg_; }
+
+  // Get the xla::ExecutableRunOptions that represents this functions
+  // "run_options" argument.
+  llvm::Value* exec_run_options_arg() { return exec_run_options_arg_; }
+
+  // Get the llvm::Value* that represents this functions parameters argument.
+  llvm::Value* parameters_arg() { return parameters_arg_; }
+
+  // Get the llvm::Value* that represents this functions "temps" argument.
+  llvm::Value* temp_buffers_arg() { return temp_buffers_arg_; }
+
+  // Get the llvm::Value* that represents this functions "prof_counters"
+  // argument.
+  llvm::Value* profile_counters_arg() { return profile_counters_arg_; }
+
+ private:
+  // Initialize an llvm::Function with standard signature based on arguments.
+  void Initialize(const string& function_name,
+                  llvm::Function::LinkageTypes linkage,
+                  bool optimize_for_size_requested, bool enable_fast_math);
+
+  // Emit ir to read and return the ir value for the dynamic loop bound at
+  // 'offset' from the "dynamic_loop_bounds" argument of this function.
+  llvm::Value* GetDynamicLoopBound(int64 offset);
+
+  llvm::IRBuilder<>* ir_builder_;
+  llvm::Module* llvm_module_;
+  llvm::IRBuilder<>::InsertPointGuard caller_insert_point_guard_;
+
+  int64 num_dynamic_loop_bounds_ = 0;
+  // Encapsulated llvm::Function.
+  llvm::Function* function_;
+  // Function argument IR values.
+  llvm::Argument* result_arg_;
+  llvm::Value* exec_run_options_arg_;
+  llvm::Value* parameters_arg_;
+  llvm::Value* temp_buffers_arg_;
+  llvm::Value* dynamic_loop_bounds_arg_ = nullptr;
+  llvm::Value* profile_counters_arg_;
+};
+
+// Returns an array of compute function call argument ir values.
+std::vector<llvm::Value*> GetArrayFunctionCallArguments(
+    tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
+    llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg);
+
+// Emits a call to a runtime fork/join function which dispatches parallel
+// calls to 'parallel_function' (and joins threads before returning).
+Status EmitCallToParallelForkJoin(
+    const std::vector<llvm::Value*>& arguments, const Shape& shape,
+    const std::vector<int64>& dimension_partition_counts,
+    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
+    const string& name);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_FUNCTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index b49047283119fb2f10b9f68eaa37a7bdc27f63a6..81c29e4726c7be53b433be896f558f502e43c885 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -52,7 +52,7 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
   llvm::IRBuilder<> ir_builder(vector_tanh_body);
 
   llvm::FastMathFlags fast_math_flags;
-  fast_math_flags.setUnsafeAlgebra();
+  fast_math_flags.setFast();
   ir_builder.setFastMathFlags(fast_math_flags);
 
   llvm::Value* input = &*vector_tanh_function->arg_begin();
diff --git a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e624e5cc7ebdbb79a8a3b3c73633ec697a71d172
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.cc
@@ -0,0 +1,40 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace xla {
+namespace cpu {
+namespace orc_jit_memory_mapper {
+
+static tensorflow::mutex mapper_instance_mutex(tensorflow::LINKER_INITIALIZED);
+static llvm::SectionMemoryManager::MemoryMapper* mapper_instance
+    GUARDED_BY(mapper_instance_mutex) = nullptr;
+
+llvm::SectionMemoryManager::MemoryMapper* GetInstance() {
+  tensorflow::mutex_lock lock(mapper_instance_mutex);
+  return mapper_instance;
+}
+
+Registrar::Registrar(
+    std::unique_ptr<llvm::SectionMemoryManager::MemoryMapper> mapper) {
+  tensorflow::mutex_lock lock(mapper_instance_mutex);
+  mapper_instance = mapper.release();
+}
+}  // namespace orc_jit_memory_mapper
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d29550fd5bd659770cc6300e56b57bf1763e671
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h
@@ -0,0 +1,56 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
+
+#include <memory>
+
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+
+namespace xla {
+namespace cpu {
+
+namespace orc_jit_memory_mapper {
+// Returns the registered memory mapper if there is one.  Returns nullptr if no
+// memory mapper is registered.
+llvm::SectionMemoryManager::MemoryMapper* GetInstance();
+
+class Registrar {
+ public:
+  // Registers the `mapper` as a memory mapper.  This is a no-op if `mapper` is
+  // null.  Precondition:  no other memory mapper has been registered yet.
+  explicit Registrar(
+      std::unique_ptr<llvm::SectionMemoryManager::MemoryMapper> mapper);
+};
+}  // namespace orc_jit_memory_mapper
+
+#define XLA_INTERNAL_REGISTER_ORC_JIT_MEMORY_MAPPER(mapper_instance, ctr) \
+  static ::xla::cpu::orc_jit_memory_mapper::Registrar                     \
+      XLA_INTERNAL_REGISTER_ORC_JIT_MEMORY_MAPPER_NAME(ctr)(mapper_instance)
+
+// __COUNTER__ must go through another macro to be properly expanded
+#define XLA_INTERNAL_REGISTER_ORC_JIT_MEMORY_MAPPER_NAME(ctr) \
+  __orc_jit_memory_mapper_registrar_##ctr
+
+// Registers the std::unique_ptr<llvm::SectionMemoryManager::MemoryMapper>
+// returned by the `factory` expression.  `factory` is allowed to evaluate to
+// a null unique_ptr in which case this macro does nothing.
+#define XLA_REGISTER_ORC_JIT_MEMORY_MAPPER(factory) \
+  XLA_INTERNAL_REGISTER_ORC_JIT_MEMORY_MAPPER(factory, __COUNTER__)
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_ORC_JIT_MEMORY_MAPPER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
index adedc1c37fdc8fb3c3e017f0773ef3fc52ebdec6..0077e344e2bd34aa598ee076220fee678f31b4ad 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.cc
@@ -58,20 +58,21 @@ ParallelCpuExecutable::ParallelCpuExecutable(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<const HloModule> hlo_module,
-    std::unique_ptr<const std::map<HloInstruction*, string>> function_names,
-    std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
+    std::unique_ptr<const HloInstructionMap<string>> function_names,
     std::unordered_map<const HloInstruction*, std::unique_ptr<unsigned char[]>>
-        aligned_constants)
-    : Executable(std::move(hlo_module)),
+        aligned_constants,
+    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+                 std::move(hlo_profile_index_map)),
       jit_(std::move(jit)),
       assignment_(std::move(assignment)),
       function_names_(std::move(function_names)),
-      hlo_to_profile_idx_(std::move(hlo_to_profile_idx)),
       aligned_constants_(std::move(aligned_constants)) {}
 
 // Type of the computation function we expect in the JIT.
 using ComputeFunctionType = void (*)(void*, const void*, const void**, void**,
-                                     int64*, uint64*);
+                                     int64*, int64*);
 
 // Given a pointer to an output buffer (following the CPU JIT calling
 // conventions), mark addresses that are "live". The initial pointer itself is
@@ -102,11 +103,11 @@ namespace {
 // in 'pending' on 'thread_pool' (storing resulting data in 'results').
 class Executor {
  public:
-  Executor(const std::map<HloInstruction*, ComputeFunctionType>& functions,
+  Executor(const HloInstructionMap<ComputeFunctionType>& functions,
            const ServiceExecutableRunOptions* run_options,
            std::list<HloInstruction*>* pending,
-           std::map<HloInstruction*, const void*>* results, void** temps_array,
-           uint64* profile_counters_array, const BufferAssignment* assignment)
+           HloInstructionMap<const void*>* results, void** temps_array,
+           int64* profile_counters_array, const BufferAssignment* assignment)
       : functions_(functions),
         run_options_(run_options),
         pending_(pending),
@@ -142,12 +143,12 @@ class Executor {
   const void** GetOperandBuffers(HloInstruction* instruction);
 
   // Arguments passed into Executor.
-  const std::map<HloInstruction*, ComputeFunctionType>& functions_;
+  const HloInstructionMap<ComputeFunctionType>& functions_;
   const ServiceExecutableRunOptions* run_options_;
   std::list<HloInstruction*>* pending_;
-  std::map<HloInstruction*, const void*>* results_;
+  HloInstructionMap<const void*>* results_;
   void** temps_array_;
-  uint64* profile_counters_array_;
+  int64* profile_counters_array_;
   tensorflow::thread::ThreadPool* thread_pool_;
   const BufferAssignment* assignment_;
 
@@ -389,9 +390,11 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> buffers,
     HloExecutionProfile* hlo_execution_profile) {
   // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Allocate an additional profile counter for the entire
-  // computation.
-  std::vector<uint64> profile_counters(hlo_to_profile_idx_.size() + 1);
+  // profile.
+  std::vector<int64>* profile_counters = nullptr;
+  if (hlo_execution_profile) {
+    profile_counters = hlo_execution_profile->mutable_profile_counters();
+  }
 
   std::vector<void*> buffer_pointers;
   buffer_pointers.reserve(buffers.size());
@@ -400,7 +403,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   }
 
   // Resolve functions for all the HLO instructions ahead of time.
-  std::map<HloInstruction*, ComputeFunctionType> functions;
+  HloInstructionMap<ComputeFunctionType> functions;
   for (auto& entry : *function_names_) {
     tensorflow::mutex_lock lock(jit_mutex_);
     HloInstruction* instruction = entry.first;
@@ -412,7 +415,7 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   }
 
   // Map containing pointers to result buffers for each instruction.
-  std::map<HloInstruction*, const void*> results;
+  HloInstructionMap<const void*> results;
 
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -441,9 +444,9 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
   // For example, if we expect a library conv/matmul call to run at max
   // concurrency, we should not dispatch runnable instructions until the
   // library call is finished (to avoid expensive cache invalidation).
-  Executor executor(functions, run_options, &pending, &results,
-                    buffer_pointers.data(), profile_counters.data(),
-                    assignment_.get());
+  Executor executor(
+      functions, run_options, &pending, &results, buffer_pointers.data(),
+      profile_counters ? profile_counters->data() : nullptr, assignment_.get());
 
   TF_RETURN_IF_ERROR(executor.Run());
 
@@ -453,18 +456,6 @@ Status ParallelCpuExecutable::ExecuteComputeFunctions(
     tensorflow::mutex_lock lock(mutex_);
     double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-    // The last profile counter is used for the computation as a whole.
-    execution_profile_.set_compute_cycle_count(profile_counters.back());
-  }
-  if (hlo_execution_profile != nullptr) {
-    hlo_execution_profile->set_total_cycles_executed(entry_computation,
-                                                     profile_counters.back());
-
-    for (auto hlo_prof_idx : hlo_to_profile_idx_) {
-      const HloInstruction* hlo = hlo_prof_idx.first;
-      uint64 cycles_taken = profile_counters[hlo_prof_idx.second];
-      hlo_execution_profile->AddProfileResult(hlo, cycles_taken);
-    }
   }
 
   return Status::OK();
@@ -618,10 +609,5 @@ const PointsToSet& ParallelCpuExecutable::GetRootPointsToSet() const {
       module().entry_computation()->root_instruction());
 }
 
-std::unique_ptr<HloCostAnalysis> ParallelCpuExecutable::CreateCostAnalysis()
-    const {
-  return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index a75552b7d1eeda2f04e95fb8abc3a597f423024a..d65e3f42f3cb34eff005f34b51b81fd5c42974a3 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -51,11 +51,12 @@ class ParallelCpuExecutable : public Executable {
       std::unique_ptr<SimpleOrcJIT> jit,
       std::unique_ptr<const BufferAssignment> assignment,
       std::unique_ptr<const HloModule> hlo_module,
-      std::unique_ptr<const std::map<HloInstruction*, string>> function_names,
-      std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx,
+      std::unique_ptr<const HloInstructionMap<string>> function_names,
       std::unordered_map<const HloInstruction*,
                          std::unique_ptr<unsigned char[]>>
-          aligned_constants);
+          aligned_constants,
+      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
   ~ParallelCpuExecutable() override {}
 
   StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
@@ -95,8 +96,6 @@ class ParallelCpuExecutable : public Executable {
         "Equality test on CPU parallel executable is not implemented.");
   }
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
  private:
   // Allocate buffers required for execution and assign them to the elements of
   // "buffers". "buffers" should be sized to the number of buffers in buffer
@@ -141,11 +140,7 @@ class ParallelCpuExecutable : public Executable {
   string ir_module_string_;
 
   // Map containing the JITted function names for each HLO instruction.
-  const std::unique_ptr<const std::map<HloInstruction*, string>>
-      function_names_;
-
-  // Maps HLOs to their index into the profile counter array.
-  const std::unordered_map<const HloInstruction*, size_t> hlo_to_profile_idx_;
+  const std::unique_ptr<const HloInstructionMap<string>> function_names_;
 
   // Map from HLO Constant instructions to a pointer to their literal data.
   // The data stored in the protocol buffer might be insufficiently aligned,
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3c3c1e5efc91af6b924a3712689f3d7ccf5d6f6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h"
+
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace xla {
+namespace cpu {
+
+ParallelLoopEmitter::ParallelLoopEmitter(
+    const llvm_ir::ElementGenerator& target_element_generator,
+    const llvm_ir::IrArray& target_array,
+    const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* ir_builder)
+    : LoopEmitter(target_element_generator, target_array, ir_builder),
+      dynamic_loop_bounds_(dynamic_loop_bounds) {}
+
+llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+    tensorflow::StringPiece loop_name) {
+  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!ShapeUtil::IsScalar(shape_));
+
+  llvm_ir::ForLoopNest loop_nest(loop_name, ir_builder_);
+  const int64 num_dims = shape_.dimensions_size();
+  llvm_ir::IrArray::Index array_index(num_dims);
+
+  // Add loops from outer-most to inner-most dimensions.
+  for (int i = shape_.layout().minor_to_major_size() - 1; i >= 0; --i) {
+    const int64 dimension = shape_.layout().minor_to_major(i);
+    const int bounds_index = num_dims - 1 - i;
+    if (bounds_index < dynamic_loop_bounds_->size()) {
+      // Emit dynamic loop bounds for this dimension. Dynamic loop bounds
+      // are read from ir function dynamic loop bounds argument.
+      llvm::Value* start_index = (*dynamic_loop_bounds_)[bounds_index].first;
+      llvm::Value* end_index = (*dynamic_loop_bounds_)[bounds_index].second;
+
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension),
+          start_index, end_index);
+      array_index[dimension] = loop->GetIndVarValue();
+    } else {
+      // Emit static loop bounds for this dimension.
+      std::unique_ptr<llvm_ir::ForLoop> loop = loop_nest.AddLoop(
+          /*start_index=*/0,
+          /*end_index=*/shape_.dimensions(dimension),
+          /*suffix=*/tensorflow::strings::Printf("dim.%lld", dimension));
+      array_index[dimension] = loop->GetIndVarValue();
+    }
+  }
+  // Point IR builder at inner loop BB.
+  llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(),
+                                 ir_builder_);
+
+  // Set exit_bb_ to the exit block of the loop nest.
+  exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
+  CHECK(exit_bb_ != nullptr);
+
+  return array_index;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..9335d2818e99eb3588537d80dabddda08c1c020e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -0,0 +1,73 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
+
+namespace xla {
+namespace cpu {
+
+// ParallelLoopEmitter emits a loop nest for the target array shape.
+// The outer loop bounds of the loop nest are passed as ir values at runtime
+// (specified in 'dynamic_loop_bounds'), and the inner loop bounds are static.
+// Dynamic loop bounds are specified as an array of dimension index
+// [start, limit) pairs of ir values (one for each partitioned outer dimension).
+//
+// EX: Let 'shape' = [8, 16, 32], with the loop bounds of the two-most major
+//     dimensions dynamic. Then 'dynamic_loop_bounds' will contain the
+//     following ir values for the two most-major dimensions:
+//       [dim0_index_start_ir_value, dim0_index_limit_ir_value]
+//       [dim1_index_start_ir_value, dim1_index_limit_ir_value]
+//
+// Code emitted by ParallelLoopEmitter will be called in a multi-threaded
+// context where each thread will be assigned a different set of outer dimension
+// partitions, and where all threads will collectively iterate over the
+// entire target array shape.
+//
+// Outer dimension partitions can be generated using the ShapePartitionAssigner
+// and ShapePartitionIterator utility classes from shape_partition.cc.
+//
+class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
+ public:
+  // Constructs a ParallelLoopEmitter which uses 'target_element_generator' to
+  // generate elements, 'dynamic_loop_bounds' to set the loop bounds of the
+  // most-major dimensions, and 'target_array.' shape to set the static loop
+  // bounds for the most-minor dimensions.
+  ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
+                      const llvm_ir::IrArray& target_array,
+                      const DynamicLoopBounds* dynamic_loop_bounds,
+                      llvm::IRBuilder<>* ir_builder);
+
+  ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
+  ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
+  ~ParallelLoopEmitter() override = default;
+
+  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+      tensorflow::StringPiece loop_name) override;
+
+ private:
+  const DynamicLoopBounds* dynamic_loop_bounds_;
+};
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_LOOP_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index c2213c8f2ef592c537daf9abe2ffa10b83a8fa4c..4b44ac8941e222d5954121bbb9654062e41f55d6 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h"
 
+#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/shape_partition.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -101,11 +102,9 @@ class DefaultCostModel : public ParallelCostModel {
   const std::unique_ptr<HloCostAnalysis> cost_analysis_;
 };
 
-
 ParallelTaskAssignment::ParallelTaskAssignment(
     const int64 max_parallelism,
-    const HloCostAnalysis::ShapeSizeFunction& shape_size,
-    HloModule* module) {
+    const HloCostAnalysis::ShapeSizeFunction& shape_size, HloModule* module) {
   VLOG(1) << "ParallelTaskAssignment max_parallelism: " << max_parallelism;
   // Run cost analysis on 'module'.
   auto cost_analysis = MakeUnique<HloCostAnalysis>(shape_size);
@@ -153,7 +152,6 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
 StatusOr<bool> ParallelTaskAssigner::Run(HloModule* module) {
   XLA_VLOG_LINES(2, "ParallelTaskAssigner ENTRY");
   XLA_VLOG_LINES(3, module->ToString());
-
   // Compute target parallel task counts for all instructions in 'module'.
   HloToParallelTasks hlo_to_parallel_tasks;
   ComputeTargetParallelTasks(module, &hlo_to_parallel_tasks);
@@ -230,6 +228,9 @@ bool ParallelTaskAssigner::AssignParallelTasksHelper(
 
 void ParallelTaskAssigner::ComputeTargetParallelTasks(
     HloModule* module, HloToParallelTasks* hlo_to_parallel_tasks) {
+  ParallelTaskAssignment parallel_task_assignment(max_parallelism_,
+                                                  shape_size_function_, module);
+
   // Compute parallel task counts for all instructions in 'module'.
   for (auto* computation : module->computations()) {
     if (computation->IsFusionComputation()) {
@@ -238,7 +239,7 @@ void ParallelTaskAssigner::ComputeTargetParallelTasks(
     for (auto* instruction : computation->instructions()) {
       // Query ParallelTaskAssignment for target parallel task count.
       const int64 target_parallel_task_count =
-          parallel_task_assignment_.GetTargetParallelTaskCount(instruction);
+          parallel_task_assignment.GetTargetParallelTaskCount(instruction);
       if (target_parallel_task_count > 1) {
         hlo_to_parallel_tasks->insert(
             {instruction, target_parallel_task_count});
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
index e036da5784f6151eb3b01107ec7f3ab820071a60..5801ec8d270cdaed7f2f65c24987a9ea643edb02 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h
@@ -37,10 +37,9 @@ class ParallelTaskAssignment {
   // 'shape_size': shape size function used by HloCostAnalysis during parallel
   //               task assignment.
   // 'module': the containing HloModule.
-  ParallelTaskAssignment(
-      const int64 max_parallelism,
-      const HloCostAnalysis::ShapeSizeFunction& shape_size,
-      HloModule* module);
+  ParallelTaskAssignment(const int64 max_parallelism,
+                         const HloCostAnalysis::ShapeSizeFunction& shape_size,
+                         HloModule* module);
   ~ParallelTaskAssignment() {}
 
   // Computes and returns the target parallel task count for 'instruction'.
@@ -63,11 +62,9 @@ class ParallelTaskAssigner : public HloPassInterface {
   // 'max_parallelism': the maximum parallel task count per instruction.
   // 'shape_size': shape size function used by HloCostAnalysis during parallel
   //               task assignment.
-  // 'module': the containing HloModule.
   ParallelTaskAssigner(const int64 max_parallelism,
-                       const HloCostAnalysis::ShapeSizeFunction& shape_size,
-                       HloModule* module)
-      : parallel_task_assignment_(max_parallelism, shape_size, module) {}
+                       const HloCostAnalysis::ShapeSizeFunction& shape_size)
+      : max_parallelism_(max_parallelism), shape_size_function_(shape_size) {}
   ~ParallelTaskAssigner() override {}
 
   tensorflow::StringPiece name() const override {
@@ -95,7 +92,8 @@ class ParallelTaskAssigner : public HloPassInterface {
   void ComputeTargetParallelTasks(HloModule* module,
                                   HloToParallelTasks* hlo_to_parallel_tasks);
 
-  ParallelTaskAssignment parallel_task_assignment_;
+  int64 max_parallelism_;
+  HloCostAnalysis::ShapeSizeFunction shape_size_function_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index fdf02e5b422f75e256feec77470bb0d079e8ef1f..cda2783307925b77ac6d8cfe679c5b325db2befc 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
@@ -125,8 +126,10 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      object_layer_(
-          [] { return std::make_shared<llvm::SectionMemoryManager>(); }),
+      object_layer_([] {
+        return std::make_shared<llvm::SectionMemoryManager>(
+            orc_jit_memory_mapper::GetInstance());
+      }),
       compile_layer_(
           object_layer_,
           CompilerFunctor(target_machine_.get(), &disassembler_, opt_level,
@@ -210,71 +213,75 @@ bool RegisterKnownJITSymbols() {
 
 #undef REGISTER_CPU_RUNTIME_SYMBOL
 
-#define REGISTER_LIBM_SYMBOL(name)                                    \
-  do {                                                                \
-    /* Register both the F32 and F64 variants of the libm symbol.  */ \
-    registry->Register(#name "f", reinterpret_cast<void*>(name##f));  \
-    registry->Register(#name, reinterpret_cast<void*>(name));         \
+// Register both the f32 (float) and f64 (double) versions of a libm symbol.
+// Unfortunately the double versions are overloaded on some systems, e.g.
+// Mac so we need an explicit cast. This requires passing the function signature
+// for that case.
+#define REGISTER_LIBM_SYMBOL(name, double_sig)                          \
+  do {                                                                  \
+    registry->Register(#name "f", reinterpret_cast<void*>(name##f));    \
+    registry->Register(                                                 \
+        #name, reinterpret_cast<void*>(static_cast<double_sig>(name))); \
   } while (false)
 
-  REGISTER_LIBM_SYMBOL(acos);
-  REGISTER_LIBM_SYMBOL(acosh);
-  REGISTER_LIBM_SYMBOL(asin);
-  REGISTER_LIBM_SYMBOL(asinh);
-  REGISTER_LIBM_SYMBOL(atan);
-  REGISTER_LIBM_SYMBOL(atan2);
-  REGISTER_LIBM_SYMBOL(atanh);
-  REGISTER_LIBM_SYMBOL(cbrt);
-  REGISTER_LIBM_SYMBOL(ceil);
-  REGISTER_LIBM_SYMBOL(copysign);
-  REGISTER_LIBM_SYMBOL(cos);
-  REGISTER_LIBM_SYMBOL(cosh);
-  REGISTER_LIBM_SYMBOL(erf);
-  REGISTER_LIBM_SYMBOL(erfc);
-  REGISTER_LIBM_SYMBOL(exp);
-  REGISTER_LIBM_SYMBOL(exp2);
-  REGISTER_LIBM_SYMBOL(expm1);
-  REGISTER_LIBM_SYMBOL(fabs);
-  REGISTER_LIBM_SYMBOL(fdim);
-  REGISTER_LIBM_SYMBOL(floor);
-  REGISTER_LIBM_SYMBOL(fma);
-  REGISTER_LIBM_SYMBOL(fmax);
-  REGISTER_LIBM_SYMBOL(fmin);
-  REGISTER_LIBM_SYMBOL(fmod);
-  REGISTER_LIBM_SYMBOL(frexp);
-  REGISTER_LIBM_SYMBOL(hypot);
-  REGISTER_LIBM_SYMBOL(ilogb);
-  REGISTER_LIBM_SYMBOL(ldexp);
-  REGISTER_LIBM_SYMBOL(lgamma);
-  REGISTER_LIBM_SYMBOL(llrint);
-  REGISTER_LIBM_SYMBOL(llround);
-  REGISTER_LIBM_SYMBOL(log);
-  REGISTER_LIBM_SYMBOL(log10);
-  REGISTER_LIBM_SYMBOL(log1p);
-  REGISTER_LIBM_SYMBOL(log2);
-  REGISTER_LIBM_SYMBOL(logb);
-  REGISTER_LIBM_SYMBOL(lrint);
-  REGISTER_LIBM_SYMBOL(lround);
-  REGISTER_LIBM_SYMBOL(modf);
-  REGISTER_LIBM_SYMBOL(nan);
-  REGISTER_LIBM_SYMBOL(nearbyint);
-  REGISTER_LIBM_SYMBOL(nextafter);
-  REGISTER_LIBM_SYMBOL(nexttoward);
-  REGISTER_LIBM_SYMBOL(pow);
-  REGISTER_LIBM_SYMBOL(remainder);
-  REGISTER_LIBM_SYMBOL(remquo);
-  REGISTER_LIBM_SYMBOL(rint);
-  REGISTER_LIBM_SYMBOL(round);
-  REGISTER_LIBM_SYMBOL(scalbln);
-  REGISTER_LIBM_SYMBOL(scalbn);
-  REGISTER_LIBM_SYMBOL(sin);
-  REGISTER_LIBM_SYMBOL(sincos);
-  REGISTER_LIBM_SYMBOL(sinh);
-  REGISTER_LIBM_SYMBOL(sqrt);
-  REGISTER_LIBM_SYMBOL(tan);
-  REGISTER_LIBM_SYMBOL(tanh);
-  REGISTER_LIBM_SYMBOL(tgamma);
-  REGISTER_LIBM_SYMBOL(trunc);
+  REGISTER_LIBM_SYMBOL(acos, double (*)(double));
+  REGISTER_LIBM_SYMBOL(acosh, double (*)(double));
+  REGISTER_LIBM_SYMBOL(asin, double (*)(double));
+  REGISTER_LIBM_SYMBOL(asinh, double (*)(double));
+  REGISTER_LIBM_SYMBOL(atan, double (*)(double));
+  REGISTER_LIBM_SYMBOL(atan2, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(atanh, double (*)(double));
+  REGISTER_LIBM_SYMBOL(cbrt, double (*)(double));
+  REGISTER_LIBM_SYMBOL(ceil, double (*)(double));
+  REGISTER_LIBM_SYMBOL(copysign, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(cos, double (*)(double));
+  REGISTER_LIBM_SYMBOL(cosh, double (*)(double));
+  REGISTER_LIBM_SYMBOL(erf, double (*)(double));
+  REGISTER_LIBM_SYMBOL(erfc, double (*)(double));
+  REGISTER_LIBM_SYMBOL(exp, double (*)(double));
+  REGISTER_LIBM_SYMBOL(exp2, double (*)(double));
+  REGISTER_LIBM_SYMBOL(expm1, double (*)(double));
+  REGISTER_LIBM_SYMBOL(fabs, double (*)(double));
+  REGISTER_LIBM_SYMBOL(fdim, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(floor, double (*)(double));
+  REGISTER_LIBM_SYMBOL(fma, double (*)(double, double, double));
+  REGISTER_LIBM_SYMBOL(fmax, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(fmin, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(fmod, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(frexp, double (*)(double, int*));
+  REGISTER_LIBM_SYMBOL(hypot, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(ilogb, int (*)(double));
+  REGISTER_LIBM_SYMBOL(ldexp, double (*)(double, int));
+  REGISTER_LIBM_SYMBOL(lgamma, double (*)(double));
+  REGISTER_LIBM_SYMBOL(llrint, long long (*)(double));
+  REGISTER_LIBM_SYMBOL(llround, long long (*)(double));
+  REGISTER_LIBM_SYMBOL(log, double (*)(double));
+  REGISTER_LIBM_SYMBOL(log10, double (*)(double));
+  REGISTER_LIBM_SYMBOL(log1p, double (*)(double));
+  REGISTER_LIBM_SYMBOL(log2, double (*)(double));
+  REGISTER_LIBM_SYMBOL(logb, double (*)(double));
+  REGISTER_LIBM_SYMBOL(lrint, long (*)(double));
+  REGISTER_LIBM_SYMBOL(lround, long (*)(double));
+  REGISTER_LIBM_SYMBOL(modf, double (*)(double, double*));
+  REGISTER_LIBM_SYMBOL(nan, double (*)(const char*));
+  REGISTER_LIBM_SYMBOL(nearbyint, double (*)(double));
+  REGISTER_LIBM_SYMBOL(nextafter, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(nexttoward, double (*)(double, long double));
+  REGISTER_LIBM_SYMBOL(pow, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(remainder, double (*)(double, double));
+  REGISTER_LIBM_SYMBOL(remquo, double (*)(double, double, int*));
+  REGISTER_LIBM_SYMBOL(rint, double (*)(double));
+  REGISTER_LIBM_SYMBOL(round, double (*)(double));
+  REGISTER_LIBM_SYMBOL(scalbln, double (*)(double, long));
+  REGISTER_LIBM_SYMBOL(scalbn, double (*)(double, int));
+  REGISTER_LIBM_SYMBOL(sin, double (*)(double));
+  REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
+  REGISTER_LIBM_SYMBOL(sinh, double (*)(double));
+  REGISTER_LIBM_SYMBOL(sqrt, double (*)(double));
+  REGISTER_LIBM_SYMBOL(tan, double (*)(double));
+  REGISTER_LIBM_SYMBOL(tanh, double (*)(double));
+  REGISTER_LIBM_SYMBOL(tgamma, double (*)(double));
+  REGISTER_LIBM_SYMBOL(trunc, double (*)(double));
 
 #undef REGISTER_LIBM_SYMBOL
 
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
index 6efd0bcee58d19b355b6c2afa6d9497f75ef4b3c..2172ae0a29626660e8abd29a789e0baa3831519d 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.cc
@@ -24,37 +24,55 @@ limitations under the License.
 
 namespace xla {
 
-Status DfsHloVisitor::HandleElementwiseUnary(HloInstruction* hlo) {
+template <typename HloInstructionPtr>
+Status DfsHloVisitorBase<HloInstructionPtr>::HandleElementwiseUnary(
+    HloInstructionPtr hlo) {
   return Unimplemented("DfsHloVisitor::HandleElementwiseUnary: %s",
                        HloOpcodeString(hlo->opcode()).c_str());
 }
 
-Status DfsHloVisitor::HandleElementwiseBinary(HloInstruction* hlo) {
+template <typename HloInstructionPtr>
+Status DfsHloVisitorBase<HloInstructionPtr>::HandleElementwiseBinary(
+    HloInstructionPtr hlo) {
   return Unimplemented("DfsHloVisitor::HandleElementwiseBinary: %s",
                        HloOpcodeString(hlo->opcode()).c_str());
 }
 
-DfsHloVisitor::VisitState DfsHloVisitor::GetVisitState(
+template <typename HloInstructionPtr>
+typename DfsHloVisitorBase<HloInstructionPtr>::VisitState
+DfsHloVisitorBase<HloInstructionPtr>::GetVisitState(
     const HloInstruction& instruction) {
   return GetVisitState(instruction.unique_id());
 }
 
-void DfsHloVisitor::SetVisiting(const HloInstruction& instruction) {
+template <typename HloInstructionPtr>
+void DfsHloVisitorBase<HloInstructionPtr>::SetVisiting(
+    const HloInstruction& instruction) {
   VLOG(3) << "marking HLO " << &instruction << " as visiting: ";
   DCHECK(NotVisited(instruction));
   visit_state_.SetState(instruction.unique_id(), VisitState::kVisiting);
 }
 
-void DfsHloVisitor::SetVisited(const HloInstruction& instruction) {
+template <typename HloInstructionPtr>
+void DfsHloVisitorBase<HloInstructionPtr>::SetVisited(
+    const HloInstruction& instruction) {
   VLOG(3) << "marking HLO " << &instruction << " as visited: ";
   DCHECK(NotVisited(instruction) || IsVisiting(instruction));
   visit_state_.SetState(instruction.unique_id(), VisitState::kVisited);
 }
 
-Status DfsHloVisitor::Preprocess(HloInstruction* hlo) { return Status::OK(); }
+template <typename HloInstructionPtr>
+Status DfsHloVisitorBase<HloInstructionPtr>::Preprocess(HloInstructionPtr) {
+  return Status::OK();
+}
 
-Status DfsHloVisitor::Postprocess(HloInstruction* visited) {
+template <typename HloInstructionPtr>
+Status DfsHloVisitorBase<HloInstructionPtr>::Postprocess(HloInstructionPtr) {
   return Status::OK();
 }
 
+// Explicit instantiations.
+template class DfsHloVisitorBase<HloInstruction*>;
+template class DfsHloVisitorBase<const HloInstruction*>;
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 237cd8c31de1ba1aa97739c579d6d92264ddc61b..91086fd4a5f68211ef56c2417bb0ef4a38de2cff 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_H_
 
+#include <type_traits>
 #include <vector>
 
 #include "tensorflow/compiler/xla/literal_util.h"
@@ -52,170 +53,183 @@ class HloInstruction;
 // "unimplemented" error status.
 //
 // Note: this may change to an iterator in the future for flexibility purposes.
-class DfsHloVisitor {
+//
+// Users should not use this class directly, but use the type-aliases
+// DfsHloVisitor/ConstDfsHloVisitor instead.
+template <typename HloInstructionPtr>
+class DfsHloVisitorBase {
+  static_assert(
+      std::is_same<HloInstruction*, HloInstructionPtr>::value ||
+          std::is_same<const HloInstruction*, HloInstructionPtr>::value,
+      "Template argument expected to be HloInstruction* or const "
+      "HloInstruction*");
+
  public:
-  DfsHloVisitor() {}
-  virtual ~DfsHloVisitor() {}
+  DfsHloVisitorBase() {}
+  virtual ~DfsHloVisitorBase() {}
 
   // These routines are self-descriptive, see class comment for usage
   // information.
 
-  virtual Status HandleElementwiseUnary(HloInstruction* hlo);
-  virtual Status HandleElementwiseBinary(HloInstruction* hlo);
-  virtual Status HandleClamp(HloInstruction* clamp) = 0;
-  virtual Status HandleSelect(HloInstruction* select) = 0;
-  virtual Status HandleMaximum(HloInstruction* maximum) {
-    return HandleElementwiseBinary(maximum);
+  virtual Status HandleElementwiseUnary(HloInstructionPtr hlo);
+  virtual Status HandleElementwiseBinary(HloInstructionPtr hlo);
+
+  virtual Status HandleClamp(HloInstructionPtr hlo) = 0;
+  virtual Status HandleSelect(HloInstructionPtr hlo) = 0;
+  virtual Status HandleMaximum(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleMinimum(HloInstruction* minimum) {
-    return HandleElementwiseBinary(minimum);
+  virtual Status HandleMinimum(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleConcatenate(HloInstruction* concatenate) = 0;
-  virtual Status HandleConvert(HloInstruction* convert) {
-    return HandleElementwiseUnary(convert);
+  virtual Status HandleConcatenate(HloInstructionPtr hlo) = 0;
+  virtual Status HandleConvert(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleCopy(HloInstruction* copy) {
-    return HandleElementwiseUnary(copy);
+  virtual Status HandleBitcastConvert(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleComplex(HloInstruction* complex) {
-    return HandleElementwiseBinary(complex);
+  virtual Status HandleCopy(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleMultiply(HloInstruction* multiply) {
-    return HandleElementwiseBinary(multiply);
+  virtual Status HandleComplex(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleDot(HloInstruction* dot) = 0;
-  virtual Status HandlePower(HloInstruction* power) {
-    return HandleElementwiseBinary(power);
+  virtual Status HandleMultiply(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleConvolution(HloInstruction* convolution) = 0;
-  virtual Status HandleCrossReplicaSum(HloInstruction* crs) = 0;
-  virtual Status HandleCompare(HloInstruction* compare) {
-    return HandleElementwiseBinary(compare);
+  virtual Status HandleDot(HloInstructionPtr hlo) = 0;
+  virtual Status HandlePower(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleAdd(HloInstruction* add) {
-    return HandleElementwiseBinary(add);
+  virtual Status HandleConvolution(HloInstructionPtr hlo) = 0;
+  virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0;
+  virtual Status HandleCompare(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleDivide(HloInstruction* divide) {
-    return HandleElementwiseBinary(divide);
+  virtual Status HandleAdd(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleRemainder(HloInstruction* remainder) {
-    return HandleElementwiseBinary(remainder);
+  virtual Status HandleDivide(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleSubtract(HloInstruction* subtract) {
-    return HandleElementwiseBinary(subtract);
+  virtual Status HandleRemainder(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleAbs(HloInstruction* abs) {
-    return HandleElementwiseUnary(abs);
+  virtual Status HandleSubtract(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleAtan2(HloInstruction* atan2) {
-    return HandleElementwiseBinary(atan2);
+  virtual Status HandleAbs(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleRound(HloInstruction* round) {
-    return HandleElementwiseUnary(round);
+  virtual Status HandleAtan2(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleSign(HloInstruction* sign) {
-    return HandleElementwiseUnary(sign);
+  virtual Status HandleRound(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleNegate(HloInstruction* negate) {
-    return HandleElementwiseUnary(negate);
+  virtual Status HandleSign(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleExp(HloInstruction* exp) {
-    return HandleElementwiseUnary(exp);
+  virtual Status HandleNegate(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleFloor(HloInstruction* floor) {
-    return HandleElementwiseUnary(floor);
+  virtual Status HandleExp(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleCeil(HloInstruction* ceil) {
-    return HandleElementwiseUnary(ceil);
+  virtual Status HandleFloor(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleLog(HloInstruction* log) {
-    return HandleElementwiseUnary(log);
+  virtual Status HandleCeil(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleCos(HloInstruction* cos) {
-    return HandleElementwiseUnary(cos);
+  virtual Status HandleLog(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleSin(HloInstruction* sin) {
-    return HandleElementwiseUnary(sin);
+  virtual Status HandleCos(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleTanh(HloInstruction* tanh) {
-    return HandleElementwiseUnary(tanh);
+  virtual Status HandleSin(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleReal(HloInstruction* real) {
-    return HandleElementwiseUnary(real);
+  virtual Status HandleTanh(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleImag(HloInstruction* imag) {
-    return HandleElementwiseUnary(imag);
+  virtual Status HandleReal(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleIsFinite(HloInstruction* is_finite) {
-    return HandleElementwiseUnary(is_finite);
+  virtual Status HandleImag(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleAnd(HloInstruction* and_) {
-    return HandleElementwiseBinary(and_);
+  virtual Status HandleIsFinite(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleNot(HloInstruction* not_) {
-    return HandleElementwiseUnary(not_);
+  virtual Status HandleAnd(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleOr(HloInstruction* or_) {
-    return HandleElementwiseBinary(or_);
+  virtual Status HandleNot(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
-  virtual Status HandleShiftLeft(HloInstruction* shift_left) {
-    return HandleElementwiseBinary(shift_left);
+  virtual Status HandleOr(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleShiftRightArithmetic(
-      HloInstruction* shift_right_arithmetic) {
-    return HandleElementwiseBinary(shift_right_arithmetic);
+  virtual Status HandleShiftLeft(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
-  virtual Status HandleShiftRightLogical(HloInstruction* shift_right_logical) {
-    return HandleElementwiseBinary(shift_right_logical);
+  virtual Status HandleShiftRightArithmetic(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
+  }
+  virtual Status HandleShiftRightLogical(HloInstructionPtr hlo) {
+    return HandleElementwiseBinary(hlo);
   }
 
-  virtual Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return HandleElementwiseUnary(reduce_precision);
+  virtual Status HandleReducePrecision(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
   }
 
-  virtual Status HandleInfeed(HloInstruction* infeed) = 0;
-  virtual Status HandleOutfeed(HloInstruction* outfeed) = 0;
-  virtual Status HandleRng(HloInstruction* random) = 0;
-  virtual Status HandleReverse(HloInstruction* reverse) = 0;
-  virtual Status HandleSort(HloInstruction* sort) = 0;
-  virtual Status HandleConstant(HloInstruction* constant) = 0;
-  virtual Status HandleGetTupleElement(HloInstruction* get_tuple_element) = 0;
-  virtual Status HandleReduce(HloInstruction* reduce) = 0;
-  virtual Status HandleBitcast(HloInstruction* bitcast) = 0;
-  virtual Status HandleBroadcast(HloInstruction* broadcast) = 0;
-  virtual Status HandleReshape(HloInstruction* reshape) = 0;
-  virtual Status HandleTranspose(HloInstruction* transpose) = 0;
-  virtual Status HandleParameter(HloInstruction* parameter) = 0;
-  virtual Status HandleFusion(HloInstruction* fusion) = 0;
-  virtual Status HandleCall(HloInstruction* call) = 0;
-  virtual Status HandleCustomCall(HloInstruction* custom_call) = 0;
-  virtual Status HandleSlice(HloInstruction* slice) = 0;
-  virtual Status HandleDynamicSlice(HloInstruction* dynamic_slice) = 0;
-  virtual Status HandleDynamicUpdateSlice(
-      HloInstruction* dynamic_update_slice) = 0;
-  virtual Status HandleTuple(HloInstruction* tuple) = 0;
-  virtual Status HandleMap(HloInstruction* map) = 0;
-  virtual Status HandleReduceWindow(HloInstruction* reduce_window) = 0;
-  virtual Status HandleSelectAndScatter(HloInstruction* instruction) = 0;
-  virtual Status HandleWhile(HloInstruction* xla_while) = 0;
+  virtual Status HandleInfeed(HloInstructionPtr hlo) = 0;
+  virtual Status HandleOutfeed(HloInstructionPtr hlo) = 0;
+  virtual Status HandleRng(HloInstructionPtr hlo) = 0;
+  virtual Status HandleReverse(HloInstructionPtr hlo) = 0;
+  virtual Status HandleSort(HloInstructionPtr hlo) = 0;
+  virtual Status HandleConstant(HloInstructionPtr hlo) = 0;
+  virtual Status HandleGetTupleElement(HloInstructionPtr hlo) = 0;
+  virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
+  virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
+  virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0;
+  virtual Status HandleReshape(HloInstructionPtr hlo) = 0;
+  virtual Status HandleTranspose(HloInstructionPtr hlo) = 0;
+  virtual Status HandleParameter(HloInstructionPtr hlo) = 0;
+  virtual Status HandleFusion(HloInstructionPtr hlo) = 0;
+  virtual Status HandleCall(HloInstructionPtr hlo) = 0;
+  virtual Status HandleCustomCall(HloInstructionPtr hlo) = 0;
+  virtual Status HandleSlice(HloInstructionPtr hlo) = 0;
+  virtual Status HandleDynamicSlice(HloInstructionPtr hlo) = 0;
+  virtual Status HandleDynamicUpdateSlice(HloInstructionPtr hlo) = 0;
+  virtual Status HandleTuple(HloInstructionPtr hlo) = 0;
+  virtual Status HandleMap(HloInstructionPtr hlo) = 0;
+  virtual Status HandleReduceWindow(HloInstructionPtr hlo) = 0;
+  virtual Status HandleSelectAndScatter(HloInstructionPtr hlo) = 0;
+  virtual Status HandleWhile(HloInstructionPtr hlo) = 0;
+  virtual Status HandleConditional(HloInstructionPtr hlo) = 0;
 
-  virtual Status HandlePad(HloInstruction* pad) = 0;
+  virtual Status HandlePad(HloInstructionPtr hlo) = 0;
 
-  virtual Status HandleSend(HloInstruction* send) = 0;
+  virtual Status HandleSend(HloInstructionPtr send) = 0;
+  virtual Status HandleSendDone(HloInstructionPtr send_done) = 0;
 
-  virtual Status HandleRecv(HloInstruction* recv) = 0;
+  virtual Status HandleRecv(HloInstructionPtr recv) = 0;
+  virtual Status HandleRecvDone(HloInstructionPtr recv_done) = 0;
 
-  virtual Status HandleBatchNormTraining(
-      HloInstruction* batch_norm_training) = 0;
+  virtual Status HandleBatchNormTraining(HloInstructionPtr hlo) = 0;
 
-  virtual Status HandleBatchNormInference(
-      HloInstruction* batch_norm_inference) = 0;
+  virtual Status HandleBatchNormInference(HloInstructionPtr hlo) = 0;
 
-  virtual Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) = 0;
+  virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0;
 
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
-  virtual Status FinishVisit(HloInstruction* root) = 0;
+  virtual Status FinishVisit(HloInstructionPtr root) = 0;
 
   // 3 possible visitation states of HLO instructions. Each instruction's
   // state only flows one way: kNotVisited -> kVisiting -> kVisited.
@@ -273,7 +287,7 @@ class DfsHloVisitor {
   //
   // Overriding methods should call DfsHloVisitor::Preprocess before doing their
   // own preprocessing.
-  virtual Status Preprocess(HloInstruction* hlo);
+  virtual Status Preprocess(HloInstructionPtr hlo);
 
   // This method should be overridden by subclasses that wish to run some
   // operation on an op after its Handle* visitor method is called. See
@@ -281,7 +295,7 @@ class DfsHloVisitor {
   //
   // Overriding methods should call DfsHloVisitor::Postprocess after doing their
   // own postprocessing.
-  virtual Status Postprocess(HloInstruction* visited);
+  virtual Status Postprocess(HloInstructionPtr hlo);
 
  private:
   class DFSVisitStates {
@@ -322,9 +336,14 @@ class DfsHloVisitor {
 
   DFSVisitStates visit_state_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(DfsHloVisitor);
+  TF_DISALLOW_COPY_AND_ASSIGN(DfsHloVisitorBase);
 };
 
+// Users should use one of these two type aliases, which are the only two valid
+// instantiations of DfsHloVisitorBase.
+using DfsHloVisitor = DfsHloVisitorBase<HloInstruction*>;
+using ConstDfsHloVisitor = DfsHloVisitorBase<const HloInstruction*>;
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_H_
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index a1d7acf90429e3611bb6dea56d98bbd6ffb8f580..133aa2509405738de8388708b0c61a82023e2738 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -33,161 +33,198 @@ class HloComputation;
 class HloInstruction;
 
 // DfsHloVisitor with default action based on the HloInstruction being visited.
-class DfsHloVisitorWithDefault : public DfsHloVisitor {
+// Users should not use this class directly, but use the type aliases
+// DfsHloVisitorWithDefault/ConstDfsHloVisitorWithDefault instead.
+template <typename HloInstructionPtr>
+class DfsHloVisitorWithDefaultBase
+    : public DfsHloVisitorBase<HloInstructionPtr> {
  public:
-  DfsHloVisitorWithDefault() {}
-  ~DfsHloVisitorWithDefault() override {}
+  DfsHloVisitorWithDefaultBase() {}
+  ~DfsHloVisitorWithDefaultBase() override {}
 
   // Default action performed on HloInstruction.
-  virtual Status DefaultAction(HloInstruction* hlo_instruction) = 0;
+  virtual Status DefaultAction(HloInstructionPtr hlo_instruction) = 0;
 
-  Status HandleElementwiseUnary(HloInstruction* hlo) override {
+  Status HandleElementwiseUnary(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
-  Status HandleElementwiseBinary(HloInstruction* hlo) override {
+  Status HandleElementwiseBinary(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
 
-  Status HandleBatchNormTraining(HloInstruction* hlo) override {
+  Status HandleBatchNormTraining(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
 
-  Status HandleBatchNormInference(HloInstruction* hlo) override {
+  Status HandleBatchNormInference(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
 
-  Status HandleBatchNormGrad(HloInstruction* hlo) override {
+  Status HandleBatchNormGrad(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
 
-  Status HandleClamp(HloInstruction* clamp) override {
+  Status HandleClamp(HloInstructionPtr clamp) override {
     return DefaultAction(clamp);
   }
-  Status HandleConcatenate(HloInstruction* concatenate) override {
+  Status HandleConcatenate(HloInstructionPtr concatenate) override {
     return DefaultAction(concatenate);
   }
-  Status HandleConvert(HloInstruction* convert) override {
+  Status HandleConvert(HloInstructionPtr convert) override {
     return DefaultAction(convert);
   }
-  Status HandleCopy(HloInstruction* copy) override {
+  Status HandleCopy(HloInstructionPtr copy) override {
     return DefaultAction(copy);
   }
-  Status HandleSelect(HloInstruction* select) override {
+  Status HandleSelect(HloInstructionPtr select) override {
     return DefaultAction(select);
   }
-  Status HandleDot(HloInstruction* dot) override { return DefaultAction(dot); }
-  Status HandleConvolution(HloInstruction* convolution) override {
+  Status HandleDot(HloInstructionPtr dot) override {
+    return DefaultAction(dot);
+  }
+  Status HandleConvolution(HloInstructionPtr convolution) override {
     return DefaultAction(convolution);
   }
-  Status HandleCrossReplicaSum(HloInstruction* crs) override {
+  Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
-  Status HandleCompare(HloInstruction* compare) override {
+  Status HandleCompare(HloInstructionPtr compare) override {
     return DefaultAction(compare);
   }
-  Status HandleRng(HloInstruction* random) override {
+  Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
-  Status HandleInfeed(HloInstruction* infeed) override {
+  Status HandleInfeed(HloInstructionPtr infeed) override {
     return DefaultAction(infeed);
   }
-  Status HandleOutfeed(HloInstruction* outfeed) override {
+  Status HandleOutfeed(HloInstructionPtr outfeed) override {
     return DefaultAction(outfeed);
   }
-  Status HandleReverse(HloInstruction* reverse) override {
+  Status HandleReverse(HloInstructionPtr reverse) override {
     return DefaultAction(reverse);
   }
-  Status HandleSort(HloInstruction* sort) override {
+  Status HandleSort(HloInstructionPtr sort) override {
     return DefaultAction(sort);
   }
-  Status HandleConstant(HloInstruction* constant) override {
+  Status HandleConstant(HloInstructionPtr constant) override {
     return DefaultAction(constant);
   }
-  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override {
+  Status HandleGetTupleElement(HloInstructionPtr get_tuple_element) override {
     return DefaultAction(get_tuple_element);
   }
-  Status HandleParameter(HloInstruction* parameter) override {
+  Status HandleParameter(HloInstructionPtr parameter) override {
     return DefaultAction(parameter);
   }
-  Status HandleFusion(HloInstruction* fusion) override {
+  Status HandleFusion(HloInstructionPtr fusion) override {
     return DefaultAction(fusion);
   }
-  Status HandleCall(HloInstruction* call) override {
+  Status HandleCall(HloInstructionPtr call) override {
     return DefaultAction(call);
   }
-  Status HandleCustomCall(HloInstruction* custom_call) override {
+  Status HandleCustomCall(HloInstructionPtr custom_call) override {
     return DefaultAction(custom_call);
   }
-  Status HandleSlice(HloInstruction* slice) override {
+  Status HandleSlice(HloInstructionPtr slice) override {
     return DefaultAction(slice);
   }
-  Status HandleDynamicSlice(HloInstruction* dynamic_slice) override {
+  Status HandleDynamicSlice(HloInstructionPtr dynamic_slice) override {
     return DefaultAction(dynamic_slice);
   }
   Status HandleDynamicUpdateSlice(
-      HloInstruction* dynamic_update_slice) override {
+      HloInstructionPtr dynamic_update_slice) override {
     return DefaultAction(dynamic_update_slice);
   }
-  Status HandleTuple(HloInstruction* tuple) override {
+  Status HandleTuple(HloInstructionPtr tuple) override {
     return DefaultAction(tuple);
   }
-  Status HandleMap(HloInstruction* map) override { return DefaultAction(map); }
-  Status HandleReduce(HloInstruction* reduce) override {
+  Status HandleMap(HloInstructionPtr map) override {
+    return DefaultAction(map);
+  }
+  Status HandleReduce(HloInstructionPtr reduce) override {
     return DefaultAction(reduce);
   }
-  Status HandleReduceWindow(HloInstruction* reduce_window) override {
+  Status HandleReduceWindow(HloInstructionPtr reduce_window) override {
     return DefaultAction(reduce_window);
   }
-  Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
+  Status HandleSelectAndScatter(HloInstructionPtr select_and_scatter) override {
     return DefaultAction(select_and_scatter);
   }
-  Status HandleBitcast(HloInstruction* bitcast) override {
+  Status HandleBitcast(HloInstructionPtr bitcast) override {
     return DefaultAction(bitcast);
   }
-  Status HandleBroadcast(HloInstruction* broadcast) override {
+  Status HandleBroadcast(HloInstructionPtr broadcast) override {
     return DefaultAction(broadcast);
   }
-  Status HandlePad(HloInstruction* pad) override { return DefaultAction(pad); }
-  Status HandleReshape(HloInstruction* reshape) override {
+  Status HandlePad(HloInstructionPtr pad) override {
+    return DefaultAction(pad);
+  }
+  Status HandleReshape(HloInstructionPtr reshape) override {
     return DefaultAction(reshape);
   }
-  Status HandleTranspose(HloInstruction* transpose) override {
+  Status HandleTranspose(HloInstructionPtr transpose) override {
     return DefaultAction(transpose);
   }
-  Status HandleWhile(HloInstruction* xla_while) override {
+  Status HandleWhile(HloInstructionPtr xla_while) override {
     return DefaultAction(xla_while);
   }
-  Status HandleSend(HloInstruction* send) override {
-    return DefaultAction(send);
+  Status HandleConditional(HloInstructionPtr conditional) override {
+    return DefaultAction(conditional);
   }
-  Status HandleRecv(HloInstruction* recv) override {
+  Status HandleRecv(HloInstructionPtr recv) override {
     return DefaultAction(recv);
   }
+  Status HandleRecvDone(HloInstructionPtr recv_done) override {
+    return DefaultAction(recv_done);
+  }
+  Status HandleSend(HloInstructionPtr send) override {
+    return DefaultAction(send);
+  }
+  Status HandleSendDone(HloInstructionPtr send_done) override {
+    return DefaultAction(send_done);
+  }
 
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
-  Status FinishVisit(HloInstruction* /*root*/) override { return Status::OK(); }
+  Status FinishVisit(HloInstructionPtr /*root*/) override {
+    return Status::OK();
+  }
 
  private:
-  TF_DISALLOW_COPY_AND_ASSIGN(DfsHloVisitorWithDefault);
+  TF_DISALLOW_COPY_AND_ASSIGN(DfsHloVisitorWithDefaultBase);
 };
 
-// Helper class for Accept(VisitorFunction) which visits instructions in DFS
-// order calling the given function at each instruction.
-class FunctionVisitor : public DfsHloVisitorWithDefault {
+// Users should use these type aliases which are only two valid instantiations.
+using DfsHloVisitorWithDefault = DfsHloVisitorWithDefaultBase<HloInstruction*>;
+using ConstDfsHloVisitorWithDefault =
+    DfsHloVisitorWithDefaultBase<const HloInstruction*>;
+
+// (Const)FunctionVisitor lets you transform an
+// std::function<Status((const) HloInstruction*)> into a (Const)DfsHloVisitor.
+//
+// This is useful if you have code that needs to handle visitors in the form of
+// both std::function and DfsHloVisitor.  You can wrap the function in a
+// FunctionVisitor and then treat it like any other DfsHloVisitor.
+template <typename HloInstructionPtr>
+class FunctionVisitorBase
+    : public DfsHloVisitorWithDefaultBase<HloInstructionPtr> {
  public:
-  using VisitorFunction = std::function<Status(HloInstruction*)>;
-  explicit FunctionVisitor(VisitorFunction visitor_func)
+  explicit FunctionVisitorBase(
+      std::function<Status(HloInstructionPtr)> visitor_func)
       : visitor_func_(std::move(visitor_func)) {}
 
-  Status DefaultAction(HloInstruction* hlo_instruction) override {
+  Status DefaultAction(HloInstructionPtr hlo_instruction) override {
     return visitor_func_(hlo_instruction);
   }
 
  private:
-  VisitorFunction visitor_func_;
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionVisitorBase);
+
+  std::function<Status(HloInstructionPtr)> visitor_func_;
 };
 
+using FunctionVisitor = FunctionVisitorBase<HloInstruction*>;
+using ConstFunctionVisitor = FunctionVisitorBase<const HloInstruction*>;
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DFS_HLO_VISITOR_WITH_DEFAULT_H_
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12faed69677cd99c6ed82c8d13dad3138d9461b7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -0,0 +1,185 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace {
+
+// TODO(b/69062148) Remove this code when all backends support BatchDot
+// natively.
+Status DecomposeBatchDot(HloInstruction* dot) {
+  auto computation = dot->parent();
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  HloInstruction* lhs = dot->mutable_operand(0);
+  HloInstruction* rhs = dot->mutable_operand(1);
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+  const Shape& dot_shape = dot->shape();
+
+  // ShapeInference should guarantee that lhs/rhs batch dimensions match.
+  CHECK_EQ(dnums.lhs_batch_dimensions_size(),
+           dnums.rhs_batch_dimensions_size());
+  const int64 num_batch_dims = dnums.lhs_batch_dimensions_size();
+  // Calculate total batch size (note that ShapeInference requires that
+  // the batch dimensions are most-major).
+  int64 batch_size = 1;
+  for (int i = 0; i < num_batch_dims; ++i) {
+    CHECK_EQ(lhs_shape.dimensions(dnums.lhs_batch_dimensions(i)),
+             rhs_shape.dimensions(dnums.rhs_batch_dimensions(i)));
+    batch_size *= lhs_shape.dimensions(dnums.lhs_batch_dimensions(i));
+  }
+
+  // Set lhs/rhs_transpose.
+  CHECK_EQ(1, dnums.lhs_contracting_dimensions_size());
+  const int64 lhs_contracting_dim_number = dnums.lhs_contracting_dimensions(0);
+  const bool lhs_transpose = (lhs_contracting_dim_number - num_batch_dims) == 0;
+
+  CHECK_EQ(1, dnums.rhs_contracting_dimensions_size());
+  const int64 rhs_contracting_dim_number = dnums.rhs_contracting_dimensions(0);
+  const bool rhs_transpose = (rhs_contracting_dim_number - num_batch_dims) == 1;
+
+  // Compute R3 and R3 shapes for lhs.
+  PrimitiveType lhs_type = lhs_shape.element_type();
+  const int64 lhs_rows = lhs_shape.dimensions(num_batch_dims + 0);
+  const int64 lhs_cols = lhs_shape.dimensions(num_batch_dims + 1);
+  Shape lhs_shape_r3 =
+      ShapeUtil::MakeShape(lhs_type, {batch_size, lhs_rows, lhs_cols});
+  Shape lhs_slice_shape_r3 =
+      ShapeUtil::MakeShape(lhs_type, {1, lhs_rows, lhs_cols});
+  Shape lhs_slice_shape_r2 =
+      ShapeUtil::MakeShape(lhs_type, {lhs_rows, lhs_cols});
+
+  // Compute R3 and R3 shapes for rhs.
+  PrimitiveType rhs_type = rhs_shape.element_type();
+  const int64 rhs_rows = rhs_shape.dimensions(num_batch_dims + 0);
+  const int64 rhs_cols = rhs_shape.dimensions(num_batch_dims + 1);
+  Shape rhs_shape_r3 =
+      ShapeUtil::MakeShape(rhs_type, {batch_size, rhs_rows, rhs_cols});
+  Shape rhs_slice_shape_r3 =
+      ShapeUtil::MakeShape(rhs_type, {1, rhs_rows, rhs_cols});
+  Shape rhs_slice_shape_r2 =
+      ShapeUtil::MakeShape(rhs_type, {rhs_rows, rhs_cols});
+
+  // Compute R3 and R3 shapes for dot output.
+  PrimitiveType dot_type = dot_shape.element_type();
+  const int64 dot_rows = dot_shape.dimensions(num_batch_dims + 0);
+  const int64 dot_cols = dot_shape.dimensions(num_batch_dims + 1);
+  Shape dot_shape_r2 = ShapeUtil::MakeShape(dot_type, {dot_rows, dot_cols});
+  Shape dot_shape_r3 = ShapeUtil::MakeShape(dot_type, {1, dot_rows, dot_cols});
+  Shape concat_shape_r3 =
+      ShapeUtil::MakeShape(dot_type, {batch_size, dot_rows, dot_cols});
+
+  // Reshape lhs/rhs into R3.
+  auto lhs_r3 = computation->AddInstruction(
+      HloInstruction::CreateReshape(lhs_shape_r3, lhs));
+  auto rhs_r3 = computation->AddInstruction(
+      HloInstruction::CreateReshape(rhs_shape_r3, rhs));
+
+  // Loop through batch size, slicing out required lhs/rhs to compute each Dot.
+  std::vector<HloInstruction*> output_slices(batch_size);
+  for (int64 i = 0; i < batch_size; ++i) {
+    // Slice R3 shape from 'lhs' and reshape to R2.
+    auto lhs_slice_r3 = computation->AddInstruction(
+        HloInstruction::CreateSlice(lhs_slice_shape_r3, lhs_r3, {i, 0, 0},
+                                    {i + 1, lhs_rows, lhs_cols}, {1, 1, 1}));
+    auto lhs_slice_r2 = computation->AddInstruction(
+        HloInstruction::CreateReshape(lhs_slice_shape_r2, lhs_slice_r3));
+
+    // Slice R3 shape from 'rhs' and reshape to R2.
+    auto rhs_slice_r3 = computation->AddInstruction(
+        HloInstruction::CreateSlice(rhs_slice_shape_r3, rhs_r3, {i, 0, 0},
+                                    {i + 1, rhs_rows, rhs_cols}, {1, 1, 1}));
+    auto rhs_slice_r2 = computation->AddInstruction(
+        HloInstruction::CreateReshape(rhs_slice_shape_r2, rhs_slice_r3));
+
+    // Transpose lhs/rhs (if needed).
+    if (lhs_transpose) {
+      Shape lhs_slice_shape_r2_transpose =
+          ShapeUtil::MakeShape(lhs_type, {lhs_cols, lhs_rows});
+      lhs_slice_r2 =
+          computation->AddInstruction(HloInstruction::CreateTranspose(
+              lhs_slice_shape_r2_transpose, lhs_slice_r2, {1, 0}));
+    }
+    if (rhs_transpose) {
+      Shape rhs_slice_shape_r2_transpose =
+          ShapeUtil::MakeShape(rhs_type, {rhs_cols, rhs_rows});
+      rhs_slice_r2 =
+          computation->AddInstruction(HloInstruction::CreateTranspose(
+              rhs_slice_shape_r2_transpose, rhs_slice_r2, {1, 0}));
+    }
+
+    // Compute Dot of lhs/rhs R2 slices.
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(1);
+    dot_dnums.add_rhs_contracting_dimensions(0);
+    auto dot_r2 = computation->AddInstruction(HloInstruction::CreateDot(
+        dot_shape_r2, lhs_slice_r2, rhs_slice_r2, dot_dnums));
+
+    // Reshape Dot to R3 so we can concat along batch dimension.
+    auto dot_r3 = computation->AddInstruction(
+        HloInstruction::CreateReshape(dot_shape_r3, dot_r2));
+
+    output_slices[i] = dot_r3;
+  }
+
+  // Concatenate slices from 'output_slices' along batch dimension.
+  auto concat = computation->AddInstruction(
+      HloInstruction::CreateConcatenate(concat_shape_r3, output_slices, 0));
+  // Reshape output 'new_dot' to original dimensions.
+  auto new_dot = computation->AddInstruction(
+      HloInstruction::CreateReshape(dot_shape, concat));
+
+  // Replace all uses of 'dot' in 'computation' with 'new_dot'.
+  return computation->ReplaceInstruction(dot, new_dot);
+}
+
+}  // namespace
+
+StatusOr<bool> DotDecomposer::Run(HloModule* module) {
+  XLA_VLOG_LINES(2, "DotDecomposer ENTRY\n" + module->ToString());
+  // Gather all batch Dot operations.
+  std::vector<HloInstruction*> batch_dots;
+  for (auto* computation : module->MakeNonfusionComputations()) {
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kDot) {
+        continue;
+      }
+      const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
+      if (dnums.lhs_batch_dimensions_size() > 0 && decompose_batch_dot_) {
+        batch_dots.push_back(instruction);
+      }
+    }
+  }
+  // Decompose each batch Dot in 'batch_dots'.
+  bool changed = false;
+  for (auto* dot : batch_dots) {
+    TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+    changed = true;
+  }
+  XLA_VLOG_LINES(2, "DotDecompose EXIT\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.h b/tensorflow/compiler/xla/service/dot_decomposer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ff0ab34eac0cd0fbc264b408c57653c944402a6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dot_decomposer.h
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// DotDecomposer is a pass which decomposes batch Dot operations into a
+// sequence of smaller (R2) Dot operations.
+class DotDecomposer : public HloPassInterface {
+ public:
+  // Decomposes batch Dot operations when 'decompose_batch_dot' is true.
+  DotDecomposer(bool decompose_batch_dot = true)
+      : decompose_batch_dot_(decompose_batch_dot) {}
+  ~DotDecomposer() = default;
+  tensorflow::StringPiece name() const override { return "dot_decomposer"; }
+
+  // Run DotDecomposer pass on computations in 'module'.
+  // Returns whether the 'module' was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  bool decompose_batch_dot_;
+};
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_DOT_DECOMPOSER_H_
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index fd4c332cba94513ec5b4cd88a842189e716f35d5..7e88bbd63123cd33682bb5ff67761ae5c5bdc98c 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -50,11 +50,161 @@ using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
 using tensorflow::strings::StrCat;
 
+namespace {
+
+llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
+                                      int64 mantissa_bits,
+                                      llvm::IRBuilder<>* ir_builder) {
+  // Integer and float types for casting and constant generation.
+  llvm::Type* float_type = x->getType();
+  llvm::IntegerType* int_type = ir_builder->getInt32Ty();
+
+  // Cast the input value to an integer for bitwise manipulation.
+  llvm::Value* x_as_int = ir_builder->CreateBitCast(x, int_type);
+
+  if (mantissa_bits < 23) {
+    // Last remaining mantissa bit.
+    const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits);
+
+    // Compute rounding bias for round-to-nearest with ties to even.  This is
+    // equal to a base value of 0111... plus one bit if the last remaining
+    // mantissa bit is 1.
+    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
+    llvm::Value* x_last_mantissa_bit = ir_builder->CreateLShr(
+        ir_builder->CreateAnd(
+            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
+        (23 - mantissa_bits));
+    llvm::Value* x_rounding_bias = ir_builder->CreateAdd(
+        x_last_mantissa_bit,
+        llvm::ConstantInt::get(int_type, base_rounding_bias));
+
+    // Add rounding bias, and mask out truncated bits.  Note that the case
+    // where adding the rounding bias overflows into the exponent bits is
+    // correct; the non-masked mantissa bits will all be zero, and the
+    // exponent will be incremented by one.
+    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
+    x_as_int = ir_builder->CreateAdd(x_as_int, x_rounding_bias);
+    x_as_int = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
+  }
+
+  if (exponent_bits < 8) {
+    // Masks for f32 values.
+    const uint32_t f32_sign_bit_mask = 1u << 31;
+    const uint32_t f32_exp_bits_mask = 0xffu << 23;
+
+    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
+    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
+    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
+    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
+    // exponent (corresponding to 0.0f).
+    //
+    // Thus, the f32 exponent corresponding to the highest non-infinite
+    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
+    // exponent corresponding to the lowest exponent for a bit size of n is
+    // (2^7-1) - 2^(n-1)-1.
+    //
+    // Note that we have already checked that exponents_bits >= 1.
+    const uint32_t f32_exponent_bias = (1 << 7) - 1;
+    const uint32_t reduced_exponent_bias = (1 << (exponent_bits - 1)) - 1;
+    const uint32_t reduced_max_exponent =
+        f32_exponent_bias + reduced_exponent_bias;
+    const uint32_t reduced_min_exponent =
+        f32_exponent_bias - reduced_exponent_bias;
+
+    // Do we overflow or underflow?
+    llvm::Value* x_exponent = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+    llvm::Value* x_overflows = ir_builder->CreateICmpUGT(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
+    llvm::Value* x_underflows = ir_builder->CreateICmpULE(
+        x_exponent,
+        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
+
+    // Compute appropriately-signed values of zero and infinity.
+    llvm::Value* x_signed_zero = ir_builder->CreateAnd(
+        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
+    llvm::Value* x_signed_inf = ir_builder->CreateOr(
+        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
+
+    // Force to zero or infinity if overflow or underflow.  (Note that this
+    // truncates all denormal values to zero, rather than rounding them.)
+    x_as_int = ir_builder->CreateSelect(x_overflows, x_signed_inf, x_as_int);
+    x_as_int = ir_builder->CreateSelect(x_underflows, x_signed_zero, x_as_int);
+  }
+
+  // Cast the result back to a floating-point type.
+  llvm::Value* result = ir_builder->CreateBitCast(x_as_int, float_type);
+
+  // Correct result for NaN inputs.
+  //
+  // The exponent handling will "normalize" NaN values to infinities, which is
+  // undesirable (except in the case with no mantissa bits, in which case it
+  // is mandatory).  This logic also handles cases where mantissa-rounding
+  // causes a NaN's mantissa to overflow into the exponent bits, which would
+  // otherwise create an erroneous zero value.
+  //
+  // If the fast-math flags are set to assume no NaNs, the comparison is likely
+  // to be optimized away, so there's no point in even emitting it.
+  if (!ir_builder->getFastMathFlags().noNaNs()) {
+    llvm::Value* x_is_nan = ir_builder->CreateFCmpUNO(x, x);
+
+    if (mantissa_bits > 0) {
+      result = ir_builder->CreateSelect(x_is_nan, x, result);
+    } else {
+      result = ir_builder->CreateSelect(
+          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
+    }
+  }
+  return result;
+}
+
+llvm::Value* EmitF32ToBF16(llvm::Value* f32_value,
+                           llvm::IRBuilder<>* ir_builder) {
+  auto reduced_precision = EmitReducePrecisionFloat(
+      f32_value,
+      /*exponent_bits=*/primitive_util::kBFloat16ExponentBits,
+      /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, ir_builder);
+  auto as_int32 =
+      ir_builder->CreateBitCast(reduced_precision, ir_builder->getInt32Ty());
+  auto shifted = ir_builder->CreateLShr(as_int32, 16);
+  auto truncated = ir_builder->CreateTrunc(shifted, ir_builder->getInt16Ty());
+  return ir_builder->CreateBitCast(truncated, ir_builder->getInt16Ty());
+}
+
+llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value,
+                           llvm::IRBuilder<>* ir_builder) {
+  auto as_int16 =
+      ir_builder->CreateBitCast(bf16_value, ir_builder->getInt16Ty());
+  auto as_int32 = ir_builder->CreateZExt(as_int16, ir_builder->getInt32Ty());
+  auto shifted = ir_builder->CreateShl(as_int32, 16);
+  return ir_builder->CreateBitCast(shifted, ir_builder->getFloatTy());
+}
+
+llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
+                                    PrimitiveType from_type,
+                                    PrimitiveType to_type, llvm::Module* module,
+                                    llvm::IRBuilder<>* ir_builder) {
+  if (primitive_util::IsSignedIntegralType(from_type)) {
+    return ir_builder->CreateSIToFP(
+        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+  } else {
+    CHECK(primitive_util::IsUnsignedIntegralType(from_type) ||
+          from_type == PRED);
+    return ir_builder->CreateUIToFP(
+        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+  }
+}
+
+}  // namespace
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
   if (op->opcode() == HloOpcode::kCopy) {
     return operand_value;
-  } else if (operand_value->getType()->isIntegerTy()) {
+  } else if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+             op->operand(0)->shape().element_type() == PRED) {
     return EmitIntegerUnaryOp(op, operand_value);
   } else if (ShapeUtil::ElementIsComplex(op->operand(0)->shape())) {
     return EmitComplexUnaryOp(op, operand_value);
@@ -79,28 +229,27 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
             primitive_util::IsSignedIntegralType(to_type));
       }
       if (primitive_util::IsFloatingPointType(to_type)) {
-        if (primitive_util::IsSignedIntegralType(from_type)) {
-          return ir_builder_->CreateSIToFP(
-              operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
-        }
-        if (primitive_util::IsUnsignedIntegralType(from_type) ||
-            from_type == PRED) {
-          return ir_builder_->CreateUIToFP(
-              operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+        if (to_type == BF16) {
+          return EmitF32ToBF16(
+              EmitIntegralToFloating(operand_value, from_type, F32, module_,
+                                     ir_builder_),
+              ir_builder_);
         }
+        return EmitIntegralToFloating(operand_value, from_type, to_type,
+                                      module_, ir_builder_);
       }
       if (primitive_util::IsComplexType(to_type)) {
         auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(
             primitive_util::ComplexComponentType(to_type), module_);
         if (primitive_util::IsSignedIntegralType(from_type)) {
-          return ComposeComplex(
+          return EmitComposeComplex(
               op,
               ir_builder_->CreateSIToFP(operand_value, to_ir_component_type),
               nullptr);
         }
         if (primitive_util::IsUnsignedIntegralType(from_type) ||
             from_type == PRED) {
-          return ComposeComplex(
+          return EmitComposeComplex(
               op,
               ir_builder_->CreateUIToFP(operand_value, to_ir_component_type),
               nullptr);
@@ -110,6 +259,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
                            PrimitiveType_Name(from_type).c_str(),
                            PrimitiveType_Name(to_type).c_str());
     }
+    case HloOpcode::kBitcastConvert: {
+      PrimitiveType from_type = op->operand(0)->shape().element_type();
+      PrimitiveType to_type = op->shape().element_type();
+      CHECK(primitive_util::IsIntegralType(from_type));
+      if (from_type == to_type) {
+        return operand_value;
+      }
+      if (primitive_util::BitWidth(from_type) ==
+          primitive_util::BitWidth(to_type)) {
+        return ir_builder_->CreateBitCast(
+            operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+      }
+      return InvalidArgument(
+          "bitcast conversion from primitive type %s to %s with unequal "
+          "bit-widths (%u versus %u) ",
+          PrimitiveType_Name(from_type).c_str(),
+          PrimitiveType_Name(to_type).c_str(),
+          primitive_util::BitWidth(from_type),
+          primitive_util::BitWidth(to_type));
+    }
     case HloOpcode::kAbs: {
       bool is_signed =
           primitive_util::IsSignedIntegralType(op->shape().element_type());
@@ -178,15 +347,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
         PrimitiveType to_component_type =
             primitive_util::ComplexComponentType(to_type);
         if (from_type == to_component_type) {
-          return ComposeComplex(op, operand_value, nullptr);
+          return EmitComposeComplex(op, operand_value, nullptr);
         }
-        return ComposeComplex(
+        return EmitComposeComplex(
             op,
             ir_builder_->CreateFPCast(
                 operand_value,
                 llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)),
             nullptr);
       }
+      if (from_type == BF16) {
+        TF_RET_CHECK(to_type != BF16);
+        operand_value = EmitBF16ToF32(operand_value, ir_builder_);
+        from_type = F32;
+        if (from_type == to_type) {
+          return operand_value;
+        }
+      }
+      if (from_type == F32 && to_type == BF16) {
+        return EmitF32ToBF16(operand_value, ir_builder_);
+      }
       if (primitive_util::IsFloatingPointType(to_type)) {
         return ir_builder_->CreateFPCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
@@ -203,6 +383,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                            PrimitiveType_Name(from_type).c_str(),
                            PrimitiveType_Name(to_type).c_str());
     }
+    case HloOpcode::kBitcastConvert: {
+      PrimitiveType from_type = op->operand(0)->shape().element_type();
+      PrimitiveType to_type = op->shape().element_type();
+      CHECK(primitive_util::IsFloatingPointType(from_type));
+      if (from_type == to_type) {
+        return operand_value;
+      }
+      if (primitive_util::BitWidth(from_type) ==
+          primitive_util::BitWidth(to_type)) {
+        return ir_builder_->CreateBitCast(
+            operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
+      }
+      return InvalidArgument(
+          "bitcast conversion from primitive type %s to %s with unequal "
+          "bit-widths (%u versus %u) ",
+          PrimitiveType_Name(from_type).c_str(),
+          PrimitiveType_Name(to_type).c_str(),
+          primitive_util::BitWidth(from_type),
+          primitive_util::BitWidth(to_type));
+    }
     case HloOpcode::kExp:
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {operand_value},
                                           {operand_value->getType()},
@@ -269,15 +469,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
-  auto real = [&](llvm::Value* x) {
-    return ir_builder_->CreateExtractValue(x, {0});
-  };
-  auto imag = [&](llvm::Value* x) {
-    return ir_builder_->CreateExtractValue(x, {1});
-  };
   switch (op->opcode()) {
     // TODO(b/65209142): Angle/Log require atan2.
-    // case HloOpcode::kAngle:
     // case HloOpcode::kLog:  // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
@@ -291,24 +484,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           primitive_util::ComplexComponentType(to_type);
       auto to_ir_component_type =
           llvm_ir::PrimitiveTypeToIrType(to_component_type, module_);
-      return ComposeComplex(
+      return EmitComposeComplex(
           op,
-          ir_builder_->CreateFPCast(real(operand_value), to_ir_component_type),
-          ir_builder_->CreateFPCast(imag(operand_value), to_ir_component_type));
+          ir_builder_->CreateFPCast(EmitExtractReal(operand_value),
+                                    to_ir_component_type),
+          ir_builder_->CreateFPCast(EmitExtractImag(operand_value),
+                                    to_ir_component_type));
     }
     case HloOpcode::kExp: {
       // e^(a+bi) = e^a*(cos(b)+sin(b)i)
       auto exp_a = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::exp, {real(operand_value)},
-          {real(operand_value)->getType()}, ir_builder_);
+          llvm::Intrinsic::exp, {EmitExtractReal(operand_value)},
+          {EmitExtractReal(operand_value)->getType()}, ir_builder_);
       auto cos_b = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::cos, {imag(operand_value)},
-          {imag(operand_value)->getType()}, ir_builder_);
+          llvm::Intrinsic::cos, {EmitExtractImag(operand_value)},
+          {EmitExtractImag(operand_value)->getType()}, ir_builder_);
       auto sin_b = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::sin, {imag(operand_value)},
-          {imag(operand_value)->getType()}, ir_builder_);
-      return ComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
-                            ir_builder_->CreateFMul(exp_a, sin_b));
+          llvm::Intrinsic::sin, {EmitExtractImag(operand_value)},
+          {EmitExtractImag(operand_value)->getType()}, ir_builder_);
+      return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
+                                ir_builder_->CreateFMul(exp_a, sin_b));
     }
     case HloOpcode::kCos: {
       // cos(z) = .5(e^(iz) + e^(-iz))
@@ -318,8 +513,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       // cos(-x) = cos(x) and sin(-x) = -sin(x), so
       // cos(a+bi) = .5(e^-b*(cos(a)+sin(a)i) + e^b*(cos(a)-sin(a)i))
       //           = .5(cos(a)*(e^-b+e^b) + i*sin(a)*(e^-b-e^b))
-      auto a = real(operand_value);
-      auto b = imag(operand_value);
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
       auto exp_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {b},
                                                 {type}, ir_builder_);
@@ -331,7 +526,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
                                                 {type}, ir_builder_);
       auto sin_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {a},
                                                 {type}, ir_builder_);
-      return ComposeComplex(
+      return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
               cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)),
@@ -348,8 +543,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       // cos(-x) = cos(x) and sin(-x) = -sin(x), so
       //           = 0.5(e^b*(cos(a)i+sin(a)) - e^-b*(cos(a)i-sin(a)))
       //           = 0.5(sin(a)*(e^b+e^-b) + i*cos(a)*(e^b-e^-b)
-      auto a = real(operand_value);
-      auto b = imag(operand_value);
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
       auto exp_b = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {b},
                                                 {type}, ir_builder_);
@@ -361,7 +556,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
                                                 {type}, ir_builder_);
       auto sin_a = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {a},
                                                 {type}, ir_builder_);
-      return ComposeComplex(
+      return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
               sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)),
@@ -370,33 +565,40 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     }
     case HloOpcode::kAbs: {
       auto sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(real(operand_value), real(operand_value)),
-          ir_builder_->CreateFMul(imag(operand_value), imag(operand_value)));
+          ir_builder_->CreateFMul(EmitExtractReal(operand_value),
+                                  EmitExtractReal(operand_value)),
+          ir_builder_->CreateFMul(EmitExtractImag(operand_value),
+                                  EmitExtractImag(operand_value)));
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {sum_sq},
                                           {sum_sq->getType()}, ir_builder_);
     }
     case HloOpcode::kSign: {  // Sign(c) = c / |c|
       auto sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(real(operand_value), real(operand_value)),
-          ir_builder_->CreateFMul(imag(operand_value), imag(operand_value)));
+          ir_builder_->CreateFMul(EmitExtractReal(operand_value),
+                                  EmitExtractReal(operand_value)),
+          ir_builder_->CreateFMul(EmitExtractImag(operand_value),
+                                  EmitExtractImag(operand_value)));
       auto cplx_abs = llvm_ir::EmitCallToIntrinsic(
           llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, ir_builder_);
       auto type = cplx_abs->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = ir_builder_->CreateFCmpOEQ(cplx_abs, zero);
       return ir_builder_->CreateSelect(
-          oeq, ComposeComplex(op, zero, zero),
-          ComposeComplex(
-              op, ir_builder_->CreateFDiv(real(operand_value), cplx_abs),
-              ir_builder_->CreateFDiv(imag(operand_value), cplx_abs)));
+          oeq, EmitComposeComplex(op, zero, zero),
+          EmitComposeComplex(
+              op,
+              ir_builder_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs),
+              ir_builder_->CreateFDiv(EmitExtractImag(operand_value),
+                                      cplx_abs)));
     }
     case HloOpcode::kNegate:
-      return ComposeComplex(op, ir_builder_->CreateFNeg(real(operand_value)),
-                            ir_builder_->CreateFNeg(imag(operand_value)));
+      return EmitComposeComplex(
+          op, ir_builder_->CreateFNeg(EmitExtractReal(operand_value)),
+          ir_builder_->CreateFNeg(EmitExtractImag(operand_value)));
     case HloOpcode::kReal:
-      return real(operand_value);
+      return EmitExtractReal(operand_value);
     case HloOpcode::kImag:
-      return imag(operand_value);
+      return EmitExtractImag(operand_value);
     default:
       return Unimplemented("unary complex op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -407,7 +609,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value,
     llvm::Value* rhs_value) const {
   PrimitiveType operand_type = op->operand(0)->shape().element_type();
-  if (lhs_value->getType()->isIntegerTy()) {
+  if (ShapeUtil::ElementIsIntegral(op->operand(0)->shape()) ||
+      operand_type == PRED) {
     return EmitIntegerBinaryOp(
         op, lhs_value, rhs_value,
         primitive_util::IsSignedIntegralType(operand_type));
@@ -424,7 +627,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
   switch (op->opcode()) {
     // case HloOpcode::kAtan2:  // TODO(b/65209142): CPU atan2 support
     case HloOpcode::kComplex:
-      return ComposeComplex(op, lhs_value, rhs_value);
+      return EmitComposeComplex(op, lhs_value, rhs_value);
     case HloOpcode::kAdd:
       return ir_builder_->CreateFAdd(lhs_value, rhs_value);
     case HloOpcode::kSubtract:
@@ -479,54 +682,66 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value,
     llvm::Value* rhs_value) const {
-  auto real = [&](llvm::Value* x) {
-    return ir_builder_->CreateExtractValue(x, {0});
-  };
-  auto imag = [&](llvm::Value* x) {
-    return ir_builder_->CreateExtractValue(x, {1});
-  };
   switch (op->opcode()) {
     case HloOpcode::kAdd:
-      return ComposeComplex(
-          op, ir_builder_->CreateFAdd(real(lhs_value), real(rhs_value)),
-          ir_builder_->CreateFAdd(imag(lhs_value), imag(rhs_value)));
+      return EmitComposeComplex(
+          op,
+          ir_builder_->CreateFAdd(EmitExtractReal(lhs_value),
+                                  EmitExtractReal(rhs_value)),
+          ir_builder_->CreateFAdd(EmitExtractImag(lhs_value),
+                                  EmitExtractImag(rhs_value)));
     case HloOpcode::kSubtract:
-      return ComposeComplex(
-          op, ir_builder_->CreateFSub(real(lhs_value), real(rhs_value)),
-          ir_builder_->CreateFSub(imag(lhs_value), imag(rhs_value)));
+      return EmitComposeComplex(
+          op,
+          ir_builder_->CreateFSub(EmitExtractReal(lhs_value),
+                                  EmitExtractReal(rhs_value)),
+          ir_builder_->CreateFSub(EmitExtractImag(lhs_value),
+                                  EmitExtractImag(rhs_value)));
     case HloOpcode::kMultiply:
-      return ComposeComplex(
+      return EmitComposeComplex(
           op,
           ir_builder_->CreateFSub(
-              ir_builder_->CreateFMul(real(lhs_value), real(rhs_value)),
-              ir_builder_->CreateFMul(imag(lhs_value), imag(rhs_value))),
+              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                      EmitExtractReal(rhs_value)),
+              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                      EmitExtractImag(rhs_value))),
           ir_builder_->CreateFAdd(
-              ir_builder_->CreateFMul(real(lhs_value), imag(rhs_value)),
-              ir_builder_->CreateFMul(imag(lhs_value), real(rhs_value))));
+              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                      EmitExtractImag(rhs_value)),
+              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                      EmitExtractReal(rhs_value))));
     case HloOpcode::kDivide: {
       // (a+bi) / (c+di) = ((a+bi)(c-di)) / ((c+di)(c-di))
       // = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)
       auto rhs_sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(real(rhs_value), real(rhs_value)),
-          ir_builder_->CreateFMul(imag(rhs_value), imag(rhs_value)));
+          ir_builder_->CreateFMul(EmitExtractReal(rhs_value),
+                                  EmitExtractReal(rhs_value)),
+          ir_builder_->CreateFMul(EmitExtractImag(rhs_value),
+                                  EmitExtractImag(rhs_value)));
       auto type = rhs_sum_sq->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = ir_builder_->CreateFCmpOEQ(rhs_sum_sq, zero);
+      auto real_inf_or_nan =
+          ir_builder_->CreateFDiv(EmitExtractReal(lhs_value), zero);
+      auto imag_inf_or_nan =
+          ir_builder_->CreateFDiv(EmitExtractImag(lhs_value), zero);
       return ir_builder_->CreateSelect(
-          oeq, ComposeComplex(op, llvm::ConstantFP::getInfinity(type), zero),
-          ComposeComplex(
+          oeq, EmitComposeComplex(op, real_inf_or_nan, imag_inf_or_nan),
+          EmitComposeComplex(
               op,
               ir_builder_->CreateFDiv(
                   ir_builder_->CreateFAdd(
-                      ir_builder_->CreateFMul(real(lhs_value), real(rhs_value)),
-                      ir_builder_->CreateFMul(imag(lhs_value),
-                                              imag(rhs_value))),
+                      ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                              EmitExtractReal(rhs_value)),
+                      ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                              EmitExtractImag(rhs_value))),
                   rhs_sum_sq),
               ir_builder_->CreateFDiv(
                   ir_builder_->CreateFSub(
-                      ir_builder_->CreateFMul(imag(lhs_value), real(rhs_value)),
-                      ir_builder_->CreateFMul(real(lhs_value),
-                                              imag(rhs_value))),
+                      ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                              EmitExtractReal(rhs_value)),
+                      ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                              EmitExtractImag(rhs_value))),
                   rhs_sum_sq)));
     }
     // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
@@ -538,16 +753,20 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     // matches C++'s semantics.
     case HloOpcode::kEq:
       return ir_builder_->CreateAnd(
-          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, real(lhs_value),
-                                  real(rhs_value), ir_builder_),
-          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, imag(lhs_value),
-                                  imag(rhs_value), ir_builder_));
+          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
+                                  EmitExtractReal(lhs_value),
+                                  EmitExtractReal(rhs_value), ir_builder_),
+          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
+                                  EmitExtractImag(lhs_value),
+                                  EmitExtractImag(rhs_value), ir_builder_));
     case HloOpcode::kNe:
       return ir_builder_->CreateOr(
-          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, real(lhs_value),
-                                  real(rhs_value), ir_builder_),
-          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, imag(lhs_value),
-                                  imag(rhs_value), ir_builder_));
+          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
+                                  EmitExtractReal(lhs_value),
+                                  EmitExtractReal(rhs_value), ir_builder_),
+          llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
+                                  EmitExtractImag(lhs_value),
+                                  EmitExtractImag(rhs_value), ir_builder_));
 
     // TODO(b/65209142): requires arg(z) -> requires atan|atan2 intrinsic
     // case HloOpcode::kPower:
@@ -659,111 +878,9 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
   if (hlo->operand(0)->shape().element_type() != F32) {
     return Unimplemented("reduce-precision only implemented for F32");
   }
-
-  // Integer and float types for casting and constant generation.
-  llvm::Type* float_type = x->getType();
-  llvm::IntegerType* int_type = ir_builder_->getInt32Ty();
-
-  // Cast the input value to an integer for bitwise manipulation.
-  llvm::Value* x_as_int = ir_builder_->CreateBitCast(x, int_type);
-
-  if (hlo->mantissa_bits() < 23) {
-    // Last remaining mantissa bit.
-    const uint32_t last_mantissa_bit_mask = 1u << (23 - hlo->mantissa_bits());
-
-    // Compute rounding bias for round-to-nearest with ties to even.  This is
-    // equal to a base value of 0111... plus one bit if the last remaining
-    // mantissa bit is 1.
-    const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
-    llvm::Value* x_last_mantissa_bit = ir_builder_->CreateLShr(
-        ir_builder_->CreateAnd(
-            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
-        (23 - hlo->mantissa_bits()));
-    llvm::Value* x_rounding_bias = ir_builder_->CreateAdd(
-        x_last_mantissa_bit,
-        llvm::ConstantInt::get(int_type, base_rounding_bias));
-
-    // Add rounding bias, and mask out truncated bits.  Note that the case
-    // where adding the rounding bias overflows into the exponent bits is
-    // correct; the non-masked mantissa bits will all be zero, and the
-    // exponent will be incremented by one.
-    const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
-    x_as_int = ir_builder_->CreateAdd(x_as_int, x_rounding_bias);
-    x_as_int = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
-  }
-
-  if (hlo->exponent_bits() < 8) {
-    // Masks for f32 values.
-    const uint32_t f32_sign_bit_mask = 1u << 31;
-    const uint32_t f32_exp_bits_mask = 0xffu << 23;
-
-    // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the most-
-    // significant bit -- is equal to 1.0f for all exponent sizes.  Adding
-    // 2^(n-1)-1 to this gives us the highest non-infinite exponent for a bit-
-    // size of n, and subtracting 2^(n-1)-1 from this gives us the lowest'
-    // exponent (corresponding to 0.0f).
-    //
-    // Thus, the f32 exponent corresponding to the highest non-infinite
-    // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32
-    // exponent corresponding to the lowest exponent for a bit size of n is
-    // (2^7-1) - 2^(n-1)-1.
-    //
-    // Note that we have already checked that exponents_bits >= 1.
-    const uint32_t f32_exponent_bias = (1 << 7) - 1;
-    const uint32_t reduced_exponent_bias =
-        (1 << (hlo->exponent_bits() - 1)) - 1;
-    const uint32_t reduced_max_exponent =
-        f32_exponent_bias + reduced_exponent_bias;
-    const uint32_t reduced_min_exponent =
-        f32_exponent_bias - reduced_exponent_bias;
-
-    // Do we overflow or underflow?
-    llvm::Value* x_exponent = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
-    llvm::Value* x_overflows = ir_builder_->CreateICmpUGT(
-        x_exponent,
-        llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
-    llvm::Value* x_underflows = ir_builder_->CreateICmpULE(
-        x_exponent,
-        llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
-
-    // Compute appropriately-signed values of zero and infinity.
-    llvm::Value* x_signed_zero = ir_builder_->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
-    llvm::Value* x_signed_inf = ir_builder_->CreateOr(
-        x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
-
-    // Force to zero or infinity if overflow or underflow.  (Note that this
-    // truncates all denormal values to zero, rather than rounding them.)
-    x_as_int = ir_builder_->CreateSelect(x_overflows, x_signed_inf, x_as_int);
-    x_as_int = ir_builder_->CreateSelect(x_underflows, x_signed_zero, x_as_int);
-  }
-
-  // Cast the result back to a floating-point type.
-  llvm::Value* result = ir_builder_->CreateBitCast(x_as_int, float_type);
-
-  // Correct result for NaN inputs.
-  //
-  // The exponent handling will "normalize" NaN values to infinities, which is
-  // undesirable (except in the case with no mantissa bits, in which case it
-  // is mandatory).  This logic also handles cases where mantissa-rounding
-  // causes a NaN's mantissa to overflow into the exponent bits, which would
-  // otherwise create an erroneous zero value.
-  //
-  // If the fast-math flags are set to assume no NaNs, the comparison is likely
-  // to be optimized away, so there's no point in even emitting it.
-  if (!ir_builder_->getFastMathFlags().noNaNs()) {
-    llvm::Value* x_is_nan = ir_builder_->CreateFCmpUNO(x, x);
-
-    if (hlo->mantissa_bits() > 0) {
-      result = ir_builder_->CreateSelect(x_is_nan, x, result);
-    } else {
-      result = ir_builder_->CreateSelect(
-          x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
-    }
-  }
-  return result;
+  return EmitReducePrecisionFloat(x, /*exponent_bits=*/hlo->exponent_bits(),
+                                  /*mantissa_bits=*/hlo->mantissa_bits(),
+                                  ir_builder_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
@@ -847,7 +964,7 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
 
   // If no implicit broadcast is needed for this operand, returns the target
   // index as the source index.
-  if (ShapeUtil::Compatible(operand_shape, hlo.shape())) {
+  if (ShapeUtil::CompatibleIgnoringElementType(operand_shape, hlo.shape())) {
     return target_index;
   }
 
@@ -1055,6 +1172,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
     case HloOpcode::kConvert:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -1063,11 +1181,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kIsFinite:
     case HloOpcode::kLog:
     case HloOpcode::kNegate:
+    case HloOpcode::kNot:
     case HloOpcode::kReal:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
     case HloOpcode::kTanh:
-    case HloOpcode::kNot:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
@@ -1076,6 +1194,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         return EmitUnaryOp(hlo, operand_value);
       };
     case HloOpcode::kAdd:
+    case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
     case HloOpcode::kComplex:
     case HloOpcode::kDivide:
@@ -1088,14 +1207,13 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
+    case HloOpcode::kOr:
     case HloOpcode::kPower:
     case HloOpcode::kRemainder:
-    case HloOpcode::kSubtract:
-    case HloOpcode::kAnd:
-    case HloOpcode::kOr:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
+    case HloOpcode::kSubtract:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         const HloInstruction* lhs = hlo->operand(0);
@@ -1289,6 +1407,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         const int64 rank = ShapeUtil::Rank(input_hlo->shape());
         llvm_ir::IrArray::Index slice_start_index(rank);
         llvm_ir::IrArray::Index slice_limit_index(rank);
+        // Slice starts at update[index - slice_start_index_adjusted],
+        // where adjusted value = slice_start_index when in bounds, and
+        // adjusted value = slice_start_index - input_dim, when wrapping.
+        llvm_ir::IrArray::Index slice_start_index_adjusted(rank);
+
+        // Slice intersection gathers (ANDs) conditions on all ranks for which
+        // 'input' is set to 'update'
+        llvm::Value* slice_intersection = ir_builder_->getTrue();
+
         for (int64 i = 0; i < rank; ++i) {
           // Emit IR to read dynamic start indices from 'start_hlo'.
           llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i));
@@ -1298,38 +1425,97 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
               AsStringRef(IrName(hlo, StrCat("start_idx", i))));
           slice_start_index[i] = ir_builder_->CreateZExtOrBitCast(
               start_index_value, index[i]->getType());
-          // Emit IR to compute: slice_limit_index = start_index + update_dim
-          // NOTE: Although 'start_indices' is dynamic and could be
-          // out-of-range, we do not compute 'slice_limit_index' mod input dim
-          // size here, because subsequent array index calculations will be
-          // computed mod input dim size for safety.
+
+          llvm::Value* input_dim_size = llvm::ConstantInt::get(
+              index[i]->getType(), input_hlo->shape().dimensions(i));
           llvm::Value* update_dim_size = llvm::ConstantInt::get(
               index[i]->getType(), update_hlo->shape().dimensions(i));
+
+          // Generate code to handle wrapping semantics:
+          // slice_start_index[i] = slice_start_index[i] % input_dim_size;
+          // slice_limit_index[i] = slice_start_index[i] + update_dim_size.
+          // slice_start_index[i] is updated in place and it will now be in
+          // range. slice_limit_index[i] may be out of range, and it's being
+          // URem-ed below if so.
+          slice_start_index[i] =
+              ir_builder_->CreateURem(slice_start_index[i], input_dim_size);
           slice_limit_index[i] =
               ir_builder_->CreateAdd(slice_start_index[i], update_dim_size);
-        }
-
-        // Check if 'index' intersects start/end indices.
-        llvm::Value* slice_intersection =
-            llvm::ConstantInt::get(ir_builder_->getInt1Ty(), 1);
 
-        for (int64 i = 0; i < rank; ++i) {
-          // Check that index[i] >= slice_start_index[i].
-          slice_intersection = ir_builder_->CreateAnd(
+          // Test if slice_limit_index[i] is in bounds
+          llvm::Value* in_bounds =
+              ir_builder_->CreateICmpULE(slice_limit_index[i], input_dim_size);
+          llvm_ir::LlvmIfData if_in_bounds =
+              llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
+
+          // Handle true BB (slice_limit_index[i] <= input_dim_size).
+          SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
+          // Check that index[i] >= slice_start_index[i] &&
+          //            index[i] < slice_limit_index[i]
+          llvm::Value* slice_intersection_in_bounds = ir_builder_->CreateAnd(
               slice_intersection,
               ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
-              "slice_intersection");
-
-          // Check that index[i] < slice_limit_index[i].
-          slice_intersection = ir_builder_->CreateAnd(
-              slice_intersection,
+              "slice_intersection_in");
+          slice_intersection_in_bounds = ir_builder_->CreateAnd(
+              slice_intersection_in_bounds,
               ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]),
-              "slice_intersection");
+              "slice_intersection_in");
+
+          // Handle false BB (slice_limit_index[i] > input_dim_size).
+          SetToFirstInsertPoint(if_in_bounds.false_block, ir_builder_);
+          // Check that index[i] >= slice_start_index[i] ||
+          //            index[i] < slice_limit_index[i]%input_dim_size.
+          llvm::Value* index_wraps = ir_builder_->CreateICmpSLT(
+              index[i],
+              ir_builder_->CreateURem(slice_limit_index[i], input_dim_size));
+          llvm::Value* slice_intersection_or = ir_builder_->CreateOr(
+              ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
+              index_wraps, "slice_intersection_out");
+          llvm::Value* slice_intersection_out_of_bounds =
+              ir_builder_->CreateAnd(slice_intersection, slice_intersection_or,
+                                     "slice_intersection_out");
+          // Create value for slice_start_index_adjusted[i] when out of bounds.
+          // If within out-of-bounds if.
+          llvm_ir::LlvmIfData if_start_needs_adjustment =
+              llvm_ir::EmitIfThenElse(index_wraps, "adjust_start", ir_builder_);
+          SetToFirstInsertPoint(if_start_needs_adjustment.true_block,
+                                ir_builder_);
+          llvm::Value* slice_start_index_adjusted_oob =
+              ir_builder_->CreateSub(slice_start_index[i], input_dim_size);
+          SetToFirstInsertPoint(if_start_needs_adjustment.after_block,
+                                ir_builder_);
+          llvm::PHINode* slice_start_index_adjusted_phi =
+              ir_builder_->CreatePHI(slice_start_index_adjusted_oob->getType(),
+                                     2);
+          slice_start_index_adjusted_phi->addIncoming(
+              slice_start_index_adjusted_oob,
+              if_start_needs_adjustment.true_block);
+          slice_start_index_adjusted_phi->addIncoming(
+              slice_start_index[i], if_start_needs_adjustment.false_block);
+          // End of if within if.
+
+          // After checking in/out of bounds.
+          SetToFirstInsertPoint(if_in_bounds.after_block, ir_builder_);
+          llvm::PHINode* phi_slice_intersection =
+              ir_builder_->CreatePHI(slice_intersection->getType(), 2);
+          phi_slice_intersection->addIncoming(slice_intersection_in_bounds,
+                                              if_in_bounds.true_block);
+          phi_slice_intersection->addIncoming(
+              slice_intersection_out_of_bounds,
+              if_start_needs_adjustment.after_block);
+          slice_intersection = phi_slice_intersection;
+
+          llvm::PHINode* phi_index =
+              ir_builder_->CreatePHI(slice_start_index[i]->getType(), 2);
+          phi_index->addIncoming(slice_start_index[i], if_in_bounds.true_block);
+          phi_index->addIncoming(slice_start_index_adjusted_phi,
+                                 if_start_needs_adjustment.after_block);
+          slice_start_index_adjusted[i] = phi_index;
         }
 
         // Emit:
         // if (slice_intersection) -> return data from 'update'.
-        // else                    -> return data from 'index'.
+        // else                    -> return data from 'input'.
         llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
             llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(),
                                            module_),
@@ -1337,7 +1523,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
             slice_intersection, "slice_intersection", ir_builder_);
 
-        // Handle true BB.
+        // Handle true BB (return data from 'update')
         SetToFirstInsertPoint(if_data.true_block, ir_builder_);
         // Compute update index for intersection case.
         llvm_ir::IrArray::Index update_index(rank);
@@ -1346,14 +1532,14 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
               index[i]->getType(), update_hlo->shape().dimensions(i));
           // NOTE: Subtraction will be positive due to bounds checking above.
           update_index[i] = ir_builder_->CreateURem(
-              ir_builder_->CreateSub(index[i], slice_start_index[i]),
+              ir_builder_->CreateSub(index[i], slice_start_index_adjusted[i]),
               update_dim_size);
         }
         TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
                             operand_to_generator.at(update_hlo)(update_index));
         ir_builder_->CreateStore(true_value, ret_value_addr);
 
-        // Handle false BB.
+        // Handle false BB (return data from 'input')
         SetToFirstInsertPoint(if_data.false_block, ir_builder_);
         TF_ASSIGN_OR_RETURN(llvm::Value * false_value,
                             operand_to_generator.at(input_hlo)(index));
@@ -1497,25 +1683,25 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index));
         llvm::Value* next_accumulator;
         if (primitive_util::IsComplexType(primitive_type)) {
-          auto real = [&](llvm::Value* x) {
-            return ir_builder_->CreateExtractValue(x, {0});
-          };
-          auto imag = [&](llvm::Value* x) {
-            return ir_builder_->CreateExtractValue(x, {1});
-          };
           llvm::Value* product_real = ir_builder_->CreateFSub(
-              ir_builder_->CreateFMul(real(lhs_value), real(rhs_value)),
-              ir_builder_->CreateFMul(imag(lhs_value), imag(rhs_value)));
+              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                      EmitExtractReal(rhs_value)),
+              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                      EmitExtractImag(rhs_value)));
           llvm::Value* product_imag = ir_builder_->CreateFAdd(
-              ir_builder_->CreateFMul(real(lhs_value), imag(rhs_value)),
-              ir_builder_->CreateFMul(imag(lhs_value), real(rhs_value)));
+              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
+                                      EmitExtractImag(rhs_value)),
+              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
+                                      EmitExtractReal(rhs_value)));
           next_accumulator = ir_builder_->CreateInsertValue(
               current_accumulator,
-              ir_builder_->CreateFAdd(real(current_accumulator), product_real),
+              ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator),
+                                      product_real),
               {0});
           next_accumulator = ir_builder_->CreateInsertValue(
               next_accumulator,
-              ir_builder_->CreateFAdd(imag(current_accumulator), product_imag),
+              ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator),
+                                      product_imag),
               {1});
         } else if (primitive_util::IsFloatingPointType(primitive_type)) {
           next_accumulator = ir_builder_->CreateFAdd(
@@ -1539,9 +1725,17 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
   }
 }
 
-llvm::Value* ElementalIrEmitter::ComposeComplex(const HloInstruction* op,
-                                                llvm::Value* real,
-                                                llvm::Value* imag) const {
+llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) const {
+  return ir_builder_->CreateExtractValue(value, {0});
+}
+
+llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) const {
+  return ir_builder_->CreateExtractValue(value, {1});
+}
+
+llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op,
+                                                    llvm::Value* real,
+                                                    llvm::Value* imag) const {
   auto cplx_type =
       llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
   auto complex = ir_builder_->CreateInsertValue(
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index 9d32436e38fa2fb3e27d09f01b860cd2edf2c8ac..cccb498f82936283a215370787907b293827ff2d 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -95,6 +95,13 @@ class ElementalIrEmitter {
   virtual StatusOr<llvm::Value*> EmitReducePrecision(const HloInstruction* hlo,
                                                      llvm::Value* x) const;
 
+  virtual llvm::Value* EmitExtractReal(llvm::Value* value) const;
+  virtual llvm::Value* EmitExtractImag(llvm::Value* value) const;
+
+  // Composes a complex struct. imag may be nullptr for simple cast operations.
+  llvm::Value* EmitComposeComplex(const HloInstruction* op, llvm::Value* real,
+                                  llvm::Value* imag) const;
+
   // A helper method for MakeElementGenerator. Given an elementwise op `hlo` and
   // the target array index, computes the source array index of its
   // `operand_no`-th operand.
@@ -117,11 +124,6 @@ class ElementalIrEmitter {
   // compiled executable outside of the HLO code itself.
   const HloModuleConfig& hlo_module_config_;
 
- protected:
-  // Composes a complex struct. imag may be nullptr for simple cast operations.
-  llvm::Value* ComposeComplex(const HloInstruction* op, llvm::Value* real,
-                              llvm::Value* imag) const;
-
  private:
   // Returns a ElementGenerator for a RNG HloInstruction.
   llvm_ir::ElementGenerator MakeRngElementGenerator(
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 2d32e59d36c4e3026e0e151561db3076146fabe4..08862308c90af736c1adcaa9438973f858852506 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -44,8 +44,15 @@ namespace xla {
 // interface that is used for launching compiled programs across platforms.
 class Executable {
  public:
-  explicit Executable(std::unique_ptr<const HloModule> hlo_module)
-      : hlo_module_(std::move(hlo_module)) {}
+  explicit Executable(std::unique_ptr<const HloModule> hlo_module,
+                      std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+      : hlo_module_(std::move(hlo_module)),
+        hlo_profile_printer_(std::move(hlo_profile_printer)),
+        hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
+    CHECK_EQ(hlo_profile_printer_.get() == nullptr,
+             hlo_profile_index_map_.get() == nullptr);
+  }
   virtual ~Executable() {}
 
   // Enqueues the compilation result on the provided stream, passing the given
@@ -88,6 +95,16 @@ class Executable {
           tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
           arguments);
 
+  // Populates `hlo_execution_profile` from `executor`. This is implicit in any
+  // Execute* API call that takes a hlo_execution_profile argument, but must be
+  // called explicitly for other (async, for example) variants after the stream
+  // has completed.
+  virtual Status PopulateExecutionProfile(
+      HloExecutionProfile* hlo_execution_profile,
+      perftools::gputools::StreamExecutor* executor) {
+    return Status::OK();
+  }
+
   // Convenience wrapper for calling Executable::ExecuteOnStream. Sets up a
   // timer for the execution, sets up HLO profiling if enabled, and fills in the
   // given ExecutionProfile if non-null.  The ExecuteOnStream overloads have
@@ -113,12 +130,20 @@ class Executable {
         "Equality test on this executable is not implemented.");
   }
 
+  const HloProfilePrinter& hlo_profile_printer() const {
+    CHECK(hlo_profiling_enabled());
+    return *hlo_profile_printer_;
+  }
+
+  const HloProfileIndexMap& hlo_profile_index_map() const {
+    CHECK(hlo_profiling_enabled());
+    return *hlo_profile_index_map_;
+  }
+
   // Returns whether this executable was compiled with HLO profilings support
   // enabled. If not, the caller should not expect an hlo_execution_profile
   // passed to ExecuteOnStream above to be populated during execution.
-  bool hlo_profiling_enabled() const {
-    return hlo_module_->config().hlo_profiling_enabled();
-  }
+  bool hlo_profiling_enabled() const { return hlo_profile_printer_ != nullptr; }
 
   const HloModule& module() const { return *hlo_module_; }
 
@@ -150,10 +175,6 @@ class Executable {
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const SessionModule& session_module);
 
-  // Returns a cost analysis object appropriate for the platform on which this
-  // executable can run.
-  virtual std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const = 0;
-
  protected:
   mutable tensorflow::mutex mutex_;
 
@@ -171,6 +192,9 @@ class Executable {
   // Execution count, used to generate a unique filename for each dumped
   // execution.
   int64 execution_count_ = 0;
+
+  std::unique_ptr<HloProfilePrinter> hlo_profile_printer_;
+  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
 };
 
 template <typename ReturnT, typename ArgT>
@@ -187,14 +211,15 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
   VLOG(1) << "enqueueing executable on stream...";
   // If the profiling flag isn't enabled, we pass nullptr as the profile to
   // indicate profiling is not requested.
-  HloExecutionProfile hlo_execution_profile;
-  HloExecutionProfile* profile_ptr =
+  std::unique_ptr<HloExecutionProfile> profile_ptr =
       module_config().debug_options().xla_hlo_profile() &&
               hlo_profiling_enabled()
-          ? &hlo_execution_profile
+          ? MakeUnique<HloExecutionProfile>(&hlo_profile_printer(),
+                                            &hlo_profile_index_map())
           : nullptr;
 
-  auto return_value = ExecuteOnStream(run_options, arguments, profile_ptr);
+  auto return_value =
+      ExecuteOnStream(run_options, arguments, profile_ptr.get());
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
@@ -222,24 +247,11 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
   }
 
   if (profile_ptr != nullptr) {
-    std::unordered_set<const xla::HloComputation*> profiled_computations =
-        profile_ptr->profiled_computations();
-    // To ensure we have print the profiles in a stable order, iterate over the
-    // computations in post order.
-    std::list<xla::HloComputation*> all_computations =
-        module().MakeComputationPostOrder();
-    for (xla::HloComputation* computation : all_computations) {
-      if (profiled_computations.count(computation) > 0) {
-        string profile_string = profile_ptr->ToString(
-            *computation, stream->parent()->GetDeviceDescription(),
-            CreateCostAnalysis().get());
-        if (!profile_string.empty()) {
-          XLA_LOG_LINES(tensorflow::INFO, profile_string);
-        }
-      }
-    }
+    XLA_LOG_LINES(
+        tensorflow::INFO,
+        profile_ptr->ToString(stream->parent()->GetDeviceDescription()));
     hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
-                                         profile_ptr);
+                                         profile_ptr.get());
   }
 
   return return_value;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index dfba22a6c4c5cf071c2cd8621643b8da6587ee3b..2b6caa149439a86d6d047605099bc3ff7b295a8e 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -26,7 +26,10 @@ namespace xla {
 
 namespace {
 
-// Helper to replace the called computation at a while- or call-instruction.
+// Helper to replace the called computation at a while-, call-, or
+// conditional-instruction. This function replaces exactly one instance of
+// 'computation' with 'new_computation' even if 'instruction' calls
+// 'computation' more than once.
 void ReplaceCalledComputation(HloInstruction* instruction,
                               HloComputation* computation,
                               HloComputation* new_computation) {
@@ -45,6 +48,15 @@ void ReplaceCalledComputation(HloInstruction* instruction,
       instruction->set_to_apply(new_computation);
       break;
     }
+    case HloOpcode::kConditional: {
+      if (computation == instruction->true_computation()) {
+        instruction->set_true_computation(new_computation);
+      } else {
+        CHECK_EQ(computation, instruction->false_computation());
+        instruction->set_false_computation(new_computation);
+      }
+      break;
+    }
     default:
       LOG(FATAL) << "unexpected opcode: "
                  << HloOpcodeString(instruction->opcode());
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
index a68e90b7d009890012f94baa790d911871c9c960..d3854b40de3572a60df1ad99d8a4589f59ad7194 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph_test.cc
@@ -223,5 +223,35 @@ TEST_F(FlattenCallGraphTest, FlattenCalls) {
   EXPECT_EQ(1, b_node.caller_callsites().size());
 }
 
+TEST_F(FlattenCallGraphTest, FlattenCallsInConditional) {
+  auto module = CreateNewModule();
+  HloComputation* sub_computation =
+      module->AddEmbeddedComputation(MakeScalarComputation());
+
+  // Create entry computation, which is a conditional that has the same
+  // computation in the true and false branch.
+  HloComputation::Builder builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  builder.AddInstruction(HloInstruction::CreateConditional(
+      kScalarShape, pred, constant1, sub_computation, constant2,
+      sub_computation));
+  module->AddEntryComputation(builder.Build());
+  EXPECT_EQ(2, module->computation_count());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool result, RunFlattenCallGraph(module.get()));
+  EXPECT_TRUE(result);
+  std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module.get());
+  // The true and false computations must now be different.
+  EXPECT_EQ(3, module->computation_count());
+
+  const CallGraphNode& sub_node = call_graph->GetNode(sub_computation);
+  EXPECT_EQ(1, sub_node.caller_callsites().size());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index d3c83ea72e33b959e21d0cc9c1706d92bd659a5c..74aa77b4f165be76fbc0a8aa1a4a7e90a8e9acec 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -103,8 +104,7 @@ GenericTransferManager::ShallowCopyTupleFromDevice(
   // a vector of void* pointers.
   std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
                                       nullptr);
-  int64 tuple_size =
-      ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/sizeof(void*));
+  int64 tuple_size = ShapeUtil::ByteSizeOf(shape, pointer_size_);
   auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
                                                     element_pointers.data());
   if (!copy_status.ok()) {
@@ -121,9 +121,8 @@ GenericTransferManager::ShallowCopyTupleFromDevice(
         !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
       return FailedPrecondition("tuple contains nullptr at element %lu", i);
     }
-    int64 buffer_size = ShapeUtil::ByteSizeOf(shape.tuple_shapes(i),
-                                              /*pointer_size=*/sizeof(void*));
-    destination.emplace_back(element_pointers[i], buffer_size);
+    destination.emplace_back(element_pointers[i],
+                             GetByteSizeRequirement(shape.tuple_shapes(i)));
   }
   return std::move(destination);
 }
@@ -138,11 +137,79 @@ Status GenericTransferManager::WriteTuplePointersToDevice(
   for (const se::DeviceMemoryBase& element : elements) {
     element_pointers.push_back(element.opaque());
   }
-  int64 tuple_size =
-      ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/sizeof(void*));
+  return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
+                                element_pointers.data(), region);
+}
+
+StatusOr<std::unique_ptr<Literal>>
+GenericTransferManager::TransferLiteralFromDevice(
+    se::StreamExecutor* executor, const ShapedBuffer& device_buffer) {
+  VLOG(2) << "transferring literal from device ordinal "
+          << executor->device_ordinal() << "; device shape: "
+          << ShapeUtil::HumanStringWithLayout(device_buffer.shape())
+          << "; opaque: " << device_buffer.buffer(/*index=*/{}).opaque();
+  TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
+
+  std::unique_ptr<Literal> literal =
+      Literal::CreateFromShape(device_buffer.shape());
+
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      device_buffer.shape(),
+      [&](const Shape& subshape, const ShapeIndex& index) -> Status {
+        if (!ShapeUtil::IsTuple(subshape)) {
+          TF_RETURN_IF_ERROR(TransferBufferFromDevice(
+              executor,
+              /*source=*/device_buffer.buffer(index),
+              /*size=*/GetByteSizeRequirement(subshape),
+              /*destination=*/
+              literal->GetSubliteral(index).MutableInternalData()));
+        }
+
+        return Status::OK();
+      }));
+  return std::move(literal);
+}
+
+Status GenericTransferManager::TransferLiteralToDevice(
+    se::StreamExecutor* executor, const Literal& literal,
+    const ShapedBuffer& device_buffer) {
+  const Shape& shape = literal.shape();
+  VLOG(2) << "transferring literal shape to device: "
+          << ShapeUtil::HumanString(shape) << "; device location: "
+          << device_buffer.buffer(/*index=*/{}).opaque();
+
+  TF_RET_CHECK(ShapeUtil::Compatible(literal.shape(), device_buffer.shape()));
+  TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
+
+  TF_RETURN_IF_ERROR(WriteTupleIndexTables(executor, device_buffer));
 
-  return TransferBufferToDevice(executor, tuple_size, element_pointers.data(),
-                                region);
+  return ShapeUtil::ForEachSubshapeWithStatus(
+      device_buffer.shape(),
+      [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
+        se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
+        if (ShapeUtil::IsArray(device_subshape)) {
+          TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
+                       device_memory.size());
+          // Element is array-shaped: transfer array data to device buffer.
+          const Literal& subliteral = literal.GetSubliteral(index);
+          std::unique_ptr<Literal> relayed_out_literal;
+          const void* source;
+          if (LayoutUtil::Equal(device_subshape.layout(),
+                                subliteral.shape().layout())) {
+            source = subliteral.InternalData();
+          } else {
+            // Relayout data before transferring.
+            relayed_out_literal = subliteral.Relayout(device_subshape.layout(),
+                                                      /*shape_index=*/{});
+            source = relayed_out_literal->InternalData();
+          }
+          return TransferBufferToDevice(
+              executor,
+              /*size=*/GetByteSizeRequirement(device_subshape), source,
+              &device_memory);
+        }
+        return Status::OK();
+      });
 }
 
 Status GenericTransferManager::TransferLiteralToDevice(
@@ -197,8 +264,8 @@ Status GenericTransferManager::ResetDevices(
       "Device reset is not yet supported on this platform (b/30481585)");
 }
 
-int64 GenericTransferManager::GetByteSizeRequirement(const Shape& shape) {
-  return ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/sizeof(void*));
+int64 GenericTransferManager::GetByteSizeRequirement(const Shape& shape) const {
+  return ShapeUtil::ByteSizeOf(shape, pointer_size_);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h
index 26488d6ec651b75c753119a7ce818c692c6c03dd..50dca6aec5012f0b02cb54846b622f008600e48e 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.h
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h
@@ -52,6 +52,14 @@ class GenericTransferManager : public TransferManager {
       perftools::gputools::StreamExecutor* executor, const Literal& literal,
       perftools::gputools::DeviceMemoryBase* destination) override;
 
+  StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
+      perftools::gputools::StreamExecutor* executor,
+      const ShapedBuffer& device_buffer) override;
+
+  Status TransferLiteralToDevice(perftools::gputools::StreamExecutor* executor,
+                                 const Literal& literal,
+                                 const ShapedBuffer& device_buffer) override;
+
   Status TransferLiteralToInfeed(perftools::gputools::StreamExecutor* executor,
                                  const Literal& literal) override;
   Status TransferBufferToInfeed(perftools::gputools::StreamExecutor* executor,
@@ -71,6 +79,9 @@ class GenericTransferManager : public TransferManager {
       const perftools::gputools::DeviceMemoryBase& source,
       const Shape& shape) override;
 
+  int64 GetByteSizeRequirement(const Shape& shape) const override;
+
+ protected:
   Status WriteTuplePointersToDevice(
       perftools::gputools::StreamExecutor* executor,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
@@ -78,8 +89,6 @@ class GenericTransferManager : public TransferManager {
       const Shape& shape,
       perftools::gputools::DeviceMemoryBase* region) override;
 
-  int64 GetByteSizeRequirement(const Shape& shape) override;
-
  private:
   // The platform this transfer manager targets.
   const perftools::gputools::Platform::Id platform_id_;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index de84e06cebab72d272bd888f280f5e5b221b97d1..4a72f87efdd92497ac4c2cd73b56c4990ed5b04c 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -343,15 +343,16 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "copy_insertion",
-    srcs = ["copy_insertion.cc"],
-    hdrs = ["copy_insertion.h"],
+    name = "gpu_copy_insertion",
+    srcs = ["gpu_copy_insertion.cc"],
+    hdrs = ["gpu_copy_insertion.h"],
     deps = [
         ":ir_emission_utils",
+        "//tensorflow/compiler/xla/service:call_graph",
         "//tensorflow/compiler/xla/service:copy_insertion",
         "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:logical_buffer",
-        "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
+        "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
+        "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
     ],
 )
@@ -427,14 +428,14 @@ cc_library(
     hdrs = ["gpu_compiler.h"],
     deps = [
         ":convolution_folding",
-        ":copy_insertion",
         ":fusion_merger",
+        ":gpu_copy_insertion",
         ":gpu_executable",
+        ":gpu_layout_assignment",
         ":hlo_schedule",
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
-        ":layout_assignment",
         ":pad_insertion",
         ":partition_assignment",
         ":stream_assignment",
@@ -448,6 +449,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -465,10 +467,12 @@ cc_library(
         "//tensorflow/compiler/xla/service:reshape_mover",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
+        "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:cuda_libdevice_path",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:core",
         "@llvm//:support",
@@ -489,9 +493,9 @@ cc_library(
 )
 
 cc_library(
-    name = "layout_assignment",
-    srcs = ["layout_assignment.cc"],
-    hdrs = ["layout_assignment.h"],
+    name = "gpu_layout_assignment",
+    srcs = ["gpu_layout_assignment.cc"],
+    hdrs = ["gpu_layout_assignment.h"],
     deps = [
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_util",
@@ -505,10 +509,10 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "layout_assignment_test",
-    srcs = ["layout_assignment_test.cc"],
+    name = "gpu_layout_assignment_test",
+    srcs = ["gpu_layout_assignment_test.cc"],
     deps = [
-        ":layout_assignment",
+        ":gpu_layout_assignment",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -572,11 +576,14 @@ tf_cc_test(
     deps = [
         ":instruction_fusion",
         ":while_transformer",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:copy_insertion",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc b/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
index 5aaf072f9d2c95e2fff70a1c5337432a12a1aa48..f198c4c08e93277b3a14a32d906b8083a94a8a2c 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_folding.cc
@@ -55,28 +55,20 @@ MatchBackwardFilter(HloInstruction* conv) {
   //               v       v
   //              Convolution
   //                 conv
-  //                   |
-  //                   v
-  //               Transpose (optional if identity transposition)
   CHECK_EQ(HloOpcode::kConvolution, conv->opcode());
-  // If the forward convolution is followed by a transpose, we can fuse the
-  // transpose into the backward convolution as well.
-  HloInstruction* transpose = nullptr;
-  if (conv->user_count() == 1) {
-    HloInstruction* single_user = *conv->users().begin();
-    if (single_user->opcode() == HloOpcode::kTranspose) {
-      transpose = single_user;
-    }
-  }
 
   // Step 2: match paddings and dimension numbers of the forward convolution.
   const ConvolutionDimensionNumbers& conv_dnums =
       conv->convolution_dimension_numbers();
   auto input_batch_dim = conv_dnums.input_batch_dimension();
   auto input_feature_dim = conv_dnums.input_feature_dimension();
+  auto input_spatial_dims = conv_dnums.input_spatial_dimensions();
+  auto kernel_input_feature_dim = conv_dnums.kernel_input_feature_dimension();
+  auto kernel_output_feature_dim = conv_dnums.kernel_output_feature_dimension();
+  auto kernel_spatial_dims = conv_dnums.kernel_spatial_dimensions();
   auto output_batch_dim = conv_dnums.output_batch_dimension();
   auto output_feature_dim = conv_dnums.output_feature_dimension();
-  auto spatial_dims = conv_dnums.spatial_dimensions();
+  auto output_spatial_dims = conv_dnums.output_spatial_dimensions();
 
   for (const WindowDimension& window_dim : conv->window().dimensions()) {
     if (window_dim.stride() != 1) {
@@ -97,7 +89,8 @@ MatchBackwardFilter(HloInstruction* conv) {
     }
     // Padding high will be checked in Step 3.
   }
-  if (transpose == nullptr && !window_util::HasWindowDilation(conv->window())) {
+  if (input_batch_dim == output_batch_dim &&
+      !window_util::HasWindowDilation(conv->window())) {
     VLOG(1) << conv->ToString()
             << " is a regular forward convolution. No need "
                "to fold it to a backward filter convolution.";
@@ -108,11 +101,11 @@ MatchBackwardFilter(HloInstruction* conv) {
   //
   // Compute the window of the backward convolution.
   Window backward_conv_window;
-  for (int i = 0; i < spatial_dims.size(); ++i) {
+  for (int i = 0; i < input_spatial_dims.size(); ++i) {
     WindowDimension* dim = backward_conv_window.add_dimensions();
     // The window size of the backward convolution equals the output size of the
     // forward convolution.
-    int64 filter_size = conv->shape().dimensions(spatial_dims[i]);
+    int64 filter_size = conv->shape().dimensions(output_spatial_dims[i]);
     dim->set_size(filter_size);
     // The window stride equals the window dilation of the forward convolution.
     dim->set_stride(conv->window().dimensions(i).window_dilation());
@@ -120,7 +113,8 @@ MatchBackwardFilter(HloInstruction* conv) {
     // activations.
     dim->set_padding_low(conv->window().dimensions(i).padding_low());
 
-    int64 input_size = conv->operand(0)->shape().dimensions(spatial_dims[i]);
+    int64 input_size =
+        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
     int64 output_size = conv->window().dimensions(i).size();
     // Compute the range of the amount of valid high padding. We first compute
     // min_padding_high, the amount of padding on the right/bottom to ensure the
@@ -167,50 +161,32 @@ MatchBackwardFilter(HloInstruction* conv) {
     }
   }
 
-  // To make future HLO passes easier, we canonicalize the fused expression by
-  // adding an identity transposition if it's omitted in the pattern.
-  if (transpose == nullptr) {
-    // Create an identity transposition with the same rank as the forward
-    // convolution.
-    HloComputation* parent_computation = conv->parent();
-    std::vector<int64> transpose_dimensions(ShapeUtil::Rank(conv->shape()));
-    std::iota(transpose_dimensions.begin(), transpose_dimensions.end(), 0);
-    transpose =
-        parent_computation->AddInstruction(HloInstruction::CreateTranspose(
-            conv->shape(), conv, transpose_dimensions));
-    TF_CHECK_OK(conv->ReplaceAllUsesWith(transpose));
-  }
-
   // Restore the dimension numbers of the backward convolution from the forward
   // convolution. The two activation dimensions are reversed (batch and
   // feature).
   ConvolutionDimensionNumbers backward_conv_dnums;
   backward_conv_dnums.set_input_batch_dimension(input_feature_dim);
   backward_conv_dnums.set_input_feature_dimension(input_batch_dim);
-  backward_conv_dnums.set_output_batch_dimension(output_feature_dim);
-  backward_conv_dnums.set_output_feature_dimension(output_batch_dim);
-  for (int i = 0; i < spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_spatial_dimensions(spatial_dims[i]);
+  for (int i = 0; i < input_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_input_spatial_dimensions(input_spatial_dims[i]);
+  }
+  backward_conv_dnums.set_output_batch_dimension(kernel_input_feature_dim);
+  backward_conv_dnums.set_output_feature_dimension(kernel_output_feature_dim);
+  for (int i = 0; i < kernel_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_output_spatial_dimensions(kernel_spatial_dims[i]);
   }
   // The dimension numbering of the output of the forward convolution (before
   // transposition) is the same as that of the activations (according to the
   // semantics of kConvolution). The batch dimension of the activations should
   // be treated as the input feature dimension, and the feature dimension should
   // be treated as the output feature.
-  //
-  // The output of the forward convolution needs to be transposed to fit into
-  // the dimension numbering of the weight gradients. This transposition maps
-  // dimension i to PositionInContainer(transpose->dimensions(), i).
-  backward_conv_dnums.set_kernel_input_feature_dimension(
-      PositionInContainer(transpose->dimensions(), output_batch_dim));
-  backward_conv_dnums.set_kernel_output_feature_dimension(
-      PositionInContainer(transpose->dimensions(), output_feature_dim));
-  for (int i = 0; i < spatial_dims.size(); ++i) {
-    backward_conv_dnums.add_kernel_spatial_dimensions(
-        PositionInContainer(transpose->dimensions(), spatial_dims[i]));
+  backward_conv_dnums.set_kernel_input_feature_dimension(output_batch_dim);
+  backward_conv_dnums.set_kernel_output_feature_dimension(output_feature_dim);
+  for (int i = 0; i < output_spatial_dims.size(); ++i) {
+    backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]);
   }
 
-  return std::make_tuple(true, std::vector<HloInstruction*>({transpose, conv}),
+  return std::make_tuple(true, std::vector<HloInstruction*>({conv}),
                          backward_conv_window, backward_conv_dnums);
 }
 
@@ -272,12 +248,14 @@ MatchBackwardInput(HloInstruction* conv) {
     }
   }
 
-  const auto& spatial_dims = dnums.spatial_dimensions();
-  CHECK_EQ(conv->window().dimensions().size(), spatial_dims.size());
+  const auto& input_spatial_dims = dnums.input_spatial_dimensions();
+  const auto& output_spatial_dims = dnums.output_spatial_dimensions();
+  CHECK_EQ(conv->window().dimensions().size(), input_spatial_dims.size());
+  CHECK_EQ(output_spatial_dims.size(), input_spatial_dims.size());
 
   const Window& old_window = conv->window();
   Window new_window = old_window;
-  for (size_t i = 0; i < spatial_dims.size(); ++i) {
+  for (size_t i = 0; i < input_spatial_dims.size(); ++i) {
     // Restore backward convolution's padding config from the matched pattern.
     // See the comment in tensorflow/core/kernels/conv_grad_tuple_ops.cc
     // for how we convert backward input convolution to a variant of forward
@@ -310,8 +288,9 @@ MatchBackwardInput(HloInstruction* conv) {
     // end at the border. The maximum amount (max_padding_high) equals
     // min_padding_high+stride-1 -- max_padding_high+1 would cause the output
     // size to change.
-    auto unpadded_input_size = conv->shape().dimensions(spatial_dims[i]);
-    auto output_size = conv->operand(0)->shape().dimensions(spatial_dims[i]);
+    auto unpadded_input_size = conv->shape().dimensions(output_spatial_dims[i]);
+    auto output_size =
+        conv->operand(0)->shape().dimensions(input_spatial_dims[i]);
     auto padded_input_size = kernel_size + dim->stride() * (output_size - 1);
     auto total_pad_size = padded_input_size - unpadded_input_size;
     auto min_padding_high = total_pad_size - backward_padding_low;
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc b/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
index 19b122ba0603b4ec08d73e05da4c2ae11a760553..34e6bdb117d47a3d7e1eb3bae5806e130e94ea79 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_folding_test.cc
@@ -46,23 +46,27 @@ class ConvolutionFoldingTest : public HloTestBase {
     //
     // TODO(jingyue): Add more tests on NCHW input order which TF also supports.
     tf_default_dnums_for_backward_filter_.set_input_batch_dimension(3);
-    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(3);
     tf_default_dnums_for_backward_filter_.set_input_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(0);
-    tf_default_dnums_for_backward_filter_.add_spatial_dimensions(1);
-    tf_default_dnums_for_backward_filter_.add_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.add_input_spatial_dimensions(2);
     tf_default_dnums_for_backward_filter_.set_kernel_input_feature_dimension(0);
     tf_default_dnums_for_backward_filter_.set_kernel_output_feature_dimension(
         3);
     tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(1);
     tf_default_dnums_for_backward_filter_.add_kernel_spatial_dimensions(2);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(0);
+    tf_default_dnums_for_backward_filter_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_filter_.set_output_batch_dimension(2);
+    tf_default_dnums_for_backward_filter_.set_output_feature_dimension(3);
 
     tf_default_dnums_for_backward_input_.set_input_batch_dimension(0);
     tf_default_dnums_for_backward_input_.set_output_batch_dimension(0);
     tf_default_dnums_for_backward_input_.set_input_feature_dimension(3);
     tf_default_dnums_for_backward_input_.set_output_feature_dimension(3);
-    tf_default_dnums_for_backward_input_.add_spatial_dimensions(1);
-    tf_default_dnums_for_backward_input_.add_spatial_dimensions(2);
+    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(1);
+    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(1);
+    tf_default_dnums_for_backward_input_.add_input_spatial_dimensions(2);
+    tf_default_dnums_for_backward_input_.add_output_spatial_dimensions(2);
     tf_default_dnums_for_backward_input_.set_kernel_input_feature_dimension(3);
     tf_default_dnums_for_backward_input_.set_kernel_output_feature_dimension(2);
     tf_default_dnums_for_backward_input_.add_kernel_spatial_dimensions(0);
@@ -82,7 +86,7 @@ class ConvolutionFoldingTest : public HloTestBase {
   ConvolutionDimensionNumbers tf_default_dnums_for_backward_input_;
 };
 
-TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithoutTranspose) {
+TEST_F(ConvolutionFoldingTest, BackwardFilterConvolve) {
   HloComputation::Builder builder(TestName());
   HloInstruction* activations =
       builder.AddInstruction(HloInstruction::CreateParameter(
@@ -132,7 +136,7 @@ TEST_F(ConvolutionFoldingTest,
 
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
-  EXPECT_FALSE(FoldConvolution(module.get()));
+  EXPECT_TRUE(FoldConvolution(module.get()));
 }
 
 // Extracted from block35 training.
@@ -151,13 +155,9 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedActivations) {
     conv_window.mutable_dimensions(i)->set_padding_low(1);
     conv_window.mutable_dimensions(i)->set_padding_high(1);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 3, 32, 32}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
@@ -185,13 +185,9 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithPaddedGradients) {
     conv_window.mutable_dimensions(i)->set_padding_high(-1);
     conv_window.mutable_dimensions(i)->set_window_dilation(2);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {3, 3, 192, 320}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
@@ -218,13 +214,9 @@ TEST_F(ConvolutionFoldingTest, BackwardFilterConvolveWithUnevenPadding) {
     // Uneven padding: padding_low=0, padding_high=1
     conv_window.mutable_dimensions(i)->set_padding_high(1);
   }
-  HloInstruction* convolution =
-      builder.AddInstruction(HloInstruction::CreateConvolve(
-          ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
-          conv_window, tf_default_dnums_for_backward_filter_));
-
-  builder.AddInstruction(HloInstruction::CreateTranspose(
-      ShapeUtil::MakeShape(F32, {2, 2, 32, 32}), convolution, {1, 2, 3, 0}));
+  builder.AddInstruction(HloInstruction::CreateConvolve(
+      ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
+      conv_window, tf_default_dnums_for_backward_filter_));
 
   auto module = CreateNewModule();
   HloComputation* entry_computation =
@@ -258,8 +250,10 @@ TEST_F(ConvolutionFoldingTest, BackwardInputConvolveEvenPadding) {
   conv_dnums.set_output_batch_dimension(0);
   conv_dnums.set_input_feature_dimension(1);
   conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_spatial_dimensions(2);
-  conv_dnums.add_spatial_dimensions(3);
+  conv_dnums.add_input_spatial_dimensions(2);
+  conv_dnums.add_output_spatial_dimensions(2);
+  conv_dnums.add_input_spatial_dimensions(3);
+  conv_dnums.add_output_spatial_dimensions(3);
   conv_dnums.set_kernel_input_feature_dimension(0);
   conv_dnums.set_kernel_output_feature_dimension(1);
   conv_dnums.add_kernel_spatial_dimensions(2);
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
index 536b96dcf620e908e25a775bc2efb57ba5f5edd6..899cc5c83b99f1bb6154f883ca17871863e1f457 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -28,12 +29,12 @@ namespace se = ::perftools::gputools;
 namespace xla {
 namespace gpu {
 
+using se::dnn::AlgorithmDesc;
 using se::dnn::BatchDescriptor;
 using se::dnn::ConvolutionDescriptor;
 using se::dnn::DataLayout;
 using se::dnn::FilterDescriptor;
 using se::dnn::FilterLayout;
-using se::dnn::AlgorithmDesc;
 
 ConvolveScratchAllocator::ConvolveScratchAllocator(
     int device_ordinal, DeviceMemoryAllocator* memory_allocator)
@@ -130,8 +131,9 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
   const int effective_num_dimensions = std::max(2, num_dimensions);
 
   CHECK_EQ(F32, output_shape_.element_type());
-  CHECK_EQ(num_dimensions, dim_nums_.spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dim_nums_.input_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dim_nums_.kernel_spatial_dimensions_size());
+  CHECK_EQ(num_dimensions, dim_nums_.output_spatial_dimensions_size());
   for (const WindowDimension& dim : window_.dimensions()) {
     CHECK_EQ(dim.padding_low(), dim.padding_high());
   }
@@ -147,7 +149,7 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
     // Note that the dimensions are reversed. The same holds below.
     input_descriptor.set_spatial_dim(
         static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        input_shape_.dimensions(dim_nums_.spatial_dimensions(dim)));
+        input_shape_.dimensions(dim_nums_.input_spatial_dimensions(dim)));
   }
 
   FilterDescriptor filter_descriptor(effective_num_dimensions);
@@ -181,7 +183,7 @@ tensorflow::Status ConvolutionThunk::ExecuteOnStream(
   for (int dim = 0; dim < num_dimensions; ++dim) {
     output_descriptor.set_spatial_dim(
         static_cast<se::dnn::DimIndex>(effective_num_dimensions - dim - 1),
-        output_shape_.dimensions(dim_nums_.spatial_dimensions(dim)));
+        output_shape_.dimensions(dim_nums_.output_spatial_dimensions(dim)));
   }
 
   // Add a singleton dimension in the 1D convolution case.
@@ -257,28 +259,52 @@ tensorflow::Status ConvolutionThunk::Convolve(
 }
 
 std::vector<AlgorithmDesc> ConvolutionThunk::GetAlgorithms(
-    se::StreamExecutor* stream_exec) const {
+    bool with_winograd_nonfused, se::StreamExecutor* stream_exec) const {
   std::vector<AlgorithmDesc> algorithms;
-  // TODO(yangzihao): Currently disable the use of winograd nonfused in XLA
-  // by default. Should send in conv parameters and enable it when
-  // ShouldIncludeWinogradNonfusedAlgo() returns true.
   switch (convolution_kind_) {
     case ConvolutionKind::kBackwardFilter:
       CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
-          /*with_winograd_nonfused=*/false, &algorithms));
+          with_winograd_nonfused, &algorithms));
       break;
     case ConvolutionKind::kBackwardInput:
       CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
-          /*with_winograd_nonfused=*/false, &algorithms));
+          with_winograd_nonfused, &algorithms));
       break;
     case ConvolutionKind::kForward:
-      CHECK(stream_exec->GetConvolveAlgorithms(/*with_winograd_nonfused=*/false,
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
                                                &algorithms));
       break;
   }
   return algorithms;
 }
 
+static string AlgorithmToString(const se::dnn::AlgorithmDesc& algo) {
+  if (algo.tensor_ops_enabled()) {
+    return tensorflow::strings::StrCat(algo.algo_id(), "+TC");
+  }
+  return tensorflow::strings::StrCat(algo.algo_id());
+}
+
+// Determines whether we can safely perform a winograd non-fused convolution for
+// the given input and output descriptors.  This works around b/68264959, an
+// integer overflow in cuDNNv5 and cuDNNv6.
+static bool ShouldIncludeWinogradNonfusedAlgo(
+    const BatchDescriptor& input_descriptor,
+    const BatchDescriptor& output_descriptor) {
+  int64 batch = input_descriptor.count();
+  int64 in_depths = input_descriptor.feature_map_count();
+  int64 in_rows = input_descriptor.height();
+  int64 in_cols = input_descriptor.width();
+  int64 out_depths = output_descriptor.feature_map_count();
+
+  int64 total_size = 16 * std::ceil(batch / 16.0) *
+                     std::max(in_depths, out_depths) * in_cols * in_rows *
+                     sizeof(float);
+  int64 threshold = 1L << 31;
+
+  return total_size < threshold;
+}
+
 tensorflow::Status ConvolutionThunk::ConvolveWithTune(
     const BatchDescriptor& input_descriptor, se::DeviceMemory<float> input_data,
     const FilterDescriptor& filter_descriptor,
@@ -288,21 +314,29 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
     const ConvolutionDescriptor& convolution_descriptor,
     const BufferAllocations& buffer_allocations, se::Stream* stream) {
   // TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
-  if (best_algorithm_.algorithm().is_default()) {
+  if (!best_algorithm_.has_value()) {
+    best_algorithm_.emplace();
+
     // Auto-tuning either is disabled or only happens in the first run of this
     // function.
     VLOG(2) << "Profiling for best convolution algorithm used for "
                "ConvolutionThunk: "
             << this;
 
+    bool with_winograd_nonfused =
+        ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor);
+
     se::dnn::ProfileResult best_result;
     se::dnn::ProfileResult best_result_without_scratch;
-    std::vector<AlgorithmDesc> algorithms = GetAlgorithms(stream->parent());
+    std::vector<AlgorithmDesc> algorithms =
+        GetAlgorithms(with_winograd_nonfused, stream->parent());
     for (auto algorithm : algorithms) {
       ConvolveScratchAllocator scratch_allocator(
           buffer_allocations.device_ordinal(),
           buffer_allocations.memory_allocator());
       se::dnn::ProfileResult profile_result;
+      VLOG(3) << "Trying algorithm " << AlgorithmToString(algorithm)
+              << " for ConvolutionThunk: " << this;
       bool launch_ok =
           Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
                    output_descriptor, output_data, convolution_descriptor,
@@ -310,6 +344,11 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
                    &scratch_allocator, &profile_result)
               .ok();
       if (launch_ok && profile_result.is_valid()) {
+        VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
+                << " for ConvolutionThunk " << this << " succeeded, taking "
+                << profile_result.elapsed_time_in_ms()
+                << "ms. (Best result: " << best_result.elapsed_time_in_ms()
+                << "ms)";
         if (profile_result.elapsed_time_in_ms() <
             best_result.elapsed_time_in_ms()) {
           best_result = profile_result;
@@ -319,39 +358,42 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
                 best_result_without_scratch.elapsed_time_in_ms()) {
           best_result_without_scratch = profile_result;
         }
+      } else {
+        VLOG(3) << "Run of algorithm " << AlgorithmToString(algorithm)
+                << " for ConvolutionThunk " << this << " failed.";
       }
     }
 
     if (best_result.is_valid()) {
-      best_algorithm_.set_algorithm(best_result.algorithm());
+      best_algorithm_->set_algorithm(best_result.algorithm());
     } else {
       LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
                     "to the default algorithm.";
-      best_algorithm_.set_algorithm(AlgorithmDesc());
+      best_algorithm_->set_algorithm(AlgorithmDesc());
     }
 
     if (best_result_without_scratch.is_valid()) {
-      best_algorithm_.set_algorithm_no_scratch(
+      best_algorithm_->set_algorithm_no_scratch(
           best_result_without_scratch.algorithm());
     } else {
       LOG(ERROR) << "No convolution algorithm without scratch works with "
                     "profiling. Fall back "
                     "to the default algorithm.";
-      best_algorithm_.set_algorithm_no_scratch(AlgorithmDesc());
+      best_algorithm_->set_algorithm_no_scratch(AlgorithmDesc());
     }
   }
 
   {
     VLOG(2) << "Using convolution algorithm ("
-            << best_algorithm_.algorithm().algo_id() << ", "
-            << best_algorithm_.algorithm_no_scratch().algo_id()
+            << AlgorithmToString(best_algorithm_->algorithm()) << ", "
+            << AlgorithmToString(best_algorithm_->algorithm_no_scratch())
             << ") for ConvolutionThunk: " << this;
     ConvolveScratchAllocator scratch_allocator(
         buffer_allocations.device_ordinal(),
         buffer_allocations.memory_allocator());
     return Convolve(input_descriptor, input_data, filter_descriptor,
                     filter_data, output_descriptor, output_data,
-                    convolution_descriptor, best_algorithm_, stream,
+                    convolution_descriptor, *best_algorithm_, stream,
                     &scratch_allocator, nullptr);
   }
 }
diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
index 13432301b2af34ab4bd0864e39ce22366cc1d11d..7c25a2e6450e30292667ecd7de54b50ac2450767 100644
--- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
@@ -87,6 +88,14 @@ class ConvolutionThunk : public Thunk {
       const BufferAllocations& buffer_allocations,
       perftools::gputools::Stream* stream) override;
 
+  // Returns true if the next run of ExecuteOnStream will do autotuning.  If so,
+  // we want the GPU to be quiescent during autotuning, so as not to introduce
+  // noise in our results.
+  bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream*) override {
+    return !best_algorithm_.has_value();
+  }
+
  private:
   tensorflow::Status ConvolveWithTune(
       const perftools::gputools::dnn::BatchDescriptor& input_descriptor,
@@ -116,13 +125,15 @@ class ConvolutionThunk : public Thunk {
 
   // Returns the convolve algorithms that can be used for this ConvolutionThunk.
   std::vector<perftools::gputools::dnn::AlgorithmDesc> GetAlgorithms(
+      bool with_winograd_nonfused,
       perftools::gputools::StreamExecutor* stream_exec) const;
 
   // Fastest cuDNN convolution algorithm for this thunk learned from
   // auto-tuning. If auto-tuning is disabled or failed, best_algorithm_ is set
-  // to the default value indicating cuDNN's convolution will choose
-  // the best algorithm from some heuristics based on its parameters.
-  perftools::gputools::dnn::AlgorithmConfig best_algorithm_;
+  // to the default value, indicating cuDNN's convolution will choose the best
+  // algorithm from some heuristics based on its parameters.
+  tensorflow::gtl::optional<perftools::gputools::dnn::AlgorithmConfig>
+      best_algorithm_;
 
   const ConvolutionKind convolution_kind_;
 
diff --git a/tensorflow/compiler/xla/service/gpu/copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/copy_insertion.cc
deleted file mode 100644
index 3dc85552015be67c20db9099704334c864b44b51..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/gpu/copy_insertion.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/gpu/copy_insertion.h"
-
-#include <memory>
-#include <set>
-#include <vector>
-
-#include "tensorflow/compiler/xla/service/copy_insertion.h"
-#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/logical_buffer.h"
-#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-namespace gpu {
-
-StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
-  TF_ASSIGN_OR_RETURN(bool changed, CopyInsertion::Run(module));
-
-  TF_ASSIGN_OR_RETURN(auto points_to_analysis,
-                      TuplePointsToAnalysis::Run(module));
-
-  // Make sure all operands of a library call are in memory instead of constants
-  // in IR. The top-level (index {}) of the points-to set of each operand
-  // indicates the source(s) of the array buffer. If any of these are constant,
-  // then add a copy to materialize the array.
-  HloComputation* computation = module->entry_computation();
-  for (HloInstruction* hlo : computation->MakeInstructionPostOrder()) {
-    if (ImplementedAsLibraryCall(*hlo)) {
-      for (int64 i = 0; i < hlo->operand_count(); ++i) {
-        HloInstruction* operand = hlo->mutable_operand(i);
-        const PointsToSet& points_to =
-            points_to_analysis->GetPointsToSet(operand);
-        const auto& element = points_to.element(/*index=*/{});
-        if (std::any_of(element.begin(), element.end(),
-                        [](const LogicalBuffer* buffer_source) {
-                          return buffer_source->instruction()->opcode() ==
-                                 HloOpcode::kConstant;
-                        })) {
-          TF_ASSIGN_OR_RETURN(HloInstruction * copy,
-                              CopyInsertion::FindOrInsertCopy(operand));
-          TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, copy));
-          changed = true;
-        }
-      }
-    }
-  }
-
-  return changed;
-}
-
-}  // namespace gpu
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 1b94499bc6ef6d587cdb1fafec48bc4e5b917c51..6bf00cfb8a53723ae9608093480bf2eed10144dd 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -230,6 +230,66 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
   }
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexBinaryOp(
+    const HloInstruction* op, llvm::Value* lhs_value,
+    llvm::Value* rhs_value) const {
+  PrimitiveType input_type = op->operand(0)->shape().element_type();
+  TF_RET_CHECK(primitive_util::IsComplexType(input_type));
+  PrimitiveType component_type =
+      primitive_util::ComplexComponentType(input_type);
+  switch (op->opcode()) {
+    case HloOpcode::kPower: {
+      // (a+bi)^(c+di) =
+      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
+      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
+      auto a = EmitExtractReal(lhs_value);
+      auto b = EmitExtractImag(lhs_value);
+      auto c = EmitExtractReal(rhs_value);
+      auto d = EmitExtractImag(rhs_value);
+      auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
+                                             ir_builder_->CreateFMul(b, b));
+      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
+      auto half_c = ir_builder_->CreateFMul(one_half, c);
+
+      TF_ASSIGN_OR_RETURN(
+          auto aa_p_bb_to_half_c,
+          EmitLibdeviceMathCall("__nv_pow", {aa_p_bb, half_c},
+                                {component_type, component_type},
+                                component_type));
+      auto neg_d = ir_builder_->CreateFNeg(d);
+      TF_ASSIGN_OR_RETURN(
+          auto arg_lhs, EmitLibdeviceMathCall("__nv_atan2", {b, a},
+                                              {component_type, component_type},
+                                              component_type));
+      auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
+      TF_ASSIGN_OR_RETURN(
+          auto e_to_neg_d_arg_lhs,
+          EmitLibdeviceMathCall("__nv_exp", {neg_d_arg_lhs}, {component_type},
+                                component_type));
+      auto coeff =
+          ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+      TF_ASSIGN_OR_RETURN(
+          auto ln_aa_p_bb,
+          EmitLibdeviceMathCall("__nv_log", {aa_p_bb}, {component_type},
+                                component_type));
+      auto half_d = ir_builder_->CreateFMul(one_half, d);
+      auto q =
+          ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
+                                  ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
+      TF_ASSIGN_OR_RETURN(
+          auto cos_q, EmitLibdeviceMathCall("__nv_cos", {q}, {component_type},
+                                            component_type));
+      TF_ASSIGN_OR_RETURN(
+          auto sin_q, EmitLibdeviceMathCall("__nv_sin", {q}, {component_type},
+                                            component_type));
+      return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
+                                ir_builder_->CreateFMul(coeff, sin_q));
+    }
+    default:
+      return ElementalIrEmitter::EmitComplexBinaryOp(op, lhs_value, rhs_value);
+  }
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
     const HloInstruction* op, llvm::Value* operand_value) const {
   PrimitiveType input_type = op->operand(0)->shape().element_type();
@@ -237,18 +297,12 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
       primitive_util::IsComplexType(input_type)
           ? primitive_util::ComplexComponentType(input_type)
           : input_type;
-  auto real = [&](llvm::Value* x) {
-    return ir_builder_->CreateExtractValue(x, {0});
-  };
-  auto imag = [&](llvm::Value* x) {
-    return ir_builder_->CreateExtractValue(x, {1});
-  };
 
   switch (op->opcode()) {
     case HloOpcode::kLog: {
       // log(a+bi) = .5*log(a^2+b^2) + i*atan2(b, a)
-      auto a = real(operand_value);
-      auto b = imag(operand_value);
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
       llvm::Type* llvm_ty = a->getType();
       auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
                                             ir_builder_->CreateFMul(b, b));
@@ -261,34 +315,33 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
                                             {component_type, component_type},
                                             component_type));
       auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
-      return ComposeComplex(op, ir_builder_->CreateFMul(one_half, log_sum_sq),
-                            angle);
+      return EmitComposeComplex(
+          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
     }
-    // TODO(b/65408531): Implement kPower on GPU, where atan2 is available.
-    // case HloOpcode::kPower:
-    // // (a+bi)^(c+di) = exp(i(c+di)*arg(a+bi)) * (a*a+b*b)^(0.5(c+di))
     case HloOpcode::kExp: {
       // e^(a+bi) = e^a*(cos(b)+sin(b)i)
-      auto b = imag(operand_value);
+      auto b = EmitExtractImag(operand_value);
       TF_ASSIGN_OR_RETURN(
-          auto exp_a, EmitLibdeviceMathCall("__nv_exp", {real(operand_value)},
-                                            {component_type}, component_type));
+          auto exp_a,
+          EmitLibdeviceMathCall("__nv_exp", {EmitExtractReal(operand_value)},
+                                {component_type}, component_type));
       TF_ASSIGN_OR_RETURN(
           auto cos_b, EmitLibdeviceMathCall("__nv_cos", {b}, {component_type},
                                             component_type));
       TF_ASSIGN_OR_RETURN(
           auto sin_b, EmitLibdeviceMathCall("__nv_sin", {b}, {component_type},
                                             component_type));
-      return ComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
-                            ir_builder_->CreateFMul(exp_a, sin_b));
+      return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
+                                ir_builder_->CreateFMul(exp_a, sin_b));
     }
     case HloOpcode::kCos: {
       // cos(a+bi) = .5(cos(a)*(e^-b+e^b) + i*sin(a)*(e^-b-e^b))
-      auto a = real(operand_value);
+      auto a = EmitExtractReal(operand_value);
       auto llvm_ty = a->getType();
       TF_ASSIGN_OR_RETURN(
-          auto exp_b, EmitLibdeviceMathCall("__nv_exp", {imag(operand_value)},
-                                            {component_type}, component_type));
+          auto exp_b,
+          EmitLibdeviceMathCall("__nv_exp", {EmitExtractImag(operand_value)},
+                                {component_type}, component_type));
       TF_ASSIGN_OR_RETURN(
           auto cos_a, EmitLibdeviceMathCall("__nv_cos", {a}, {component_type},
                                             component_type));
@@ -299,7 +352,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
           ir_builder_->CreateFMul(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
       auto half_exp_neg_b =
           ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      return ComposeComplex(
+      return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
               cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)),
@@ -309,11 +362,12 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
 
     case HloOpcode::kSin: {
       // sin(a+bi) = 0.5(sin(a)*(e^b+e^-b) + i*cos(a)*(e^b-e^-b)
-      auto a = real(operand_value);
+      auto a = EmitExtractReal(operand_value);
       auto llvm_ty = a->getType();
       TF_ASSIGN_OR_RETURN(
-          auto exp_b, EmitLibdeviceMathCall("__nv_exp", {imag(operand_value)},
-                                            {component_type}, component_type));
+          auto exp_b,
+          EmitLibdeviceMathCall("__nv_exp", {EmitExtractImag(operand_value)},
+                                {component_type}, component_type));
       TF_ASSIGN_OR_RETURN(
           auto cos_a, EmitLibdeviceMathCall("__nv_cos", {a}, {component_type},
                                             component_type));
@@ -324,13 +378,71 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitComplexUnaryOp(
           ir_builder_->CreateFMul(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
       auto half_exp_neg_b =
           ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 0.5), exp_b);
-      return ComposeComplex(
+      return EmitComposeComplex(
           op,
           ir_builder_->CreateFMul(
               sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)),
           ir_builder_->CreateFMul(
               cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
     }
+    case HloOpcode::kTanh: {
+      /*
+      tanh=(exp(x)-exp(-x)) / (exp(x)+exp(-x))
+      e^(a+bi) = e^a*(cos(b)+sin(b)i)
+      so tanh=(((cos(b)+sin(b)i)e^a - (cos(-b)+sin(-b)i)e^-a)) /
+              (((cos(b)+sin(b)i)e^a + (cos(-b)+sin(-b)i)e^-a))
+      cos(b)=cos(-b), sin(-b)=-sin(b)
+      so tanh=(((cos(b)+sin(b)i)e^a - (cos(b)-sin(b)i)e^-a)) /
+              (((cos(b)+sin(b)i)e^a + (cos(b)-sin(b)i)e^-a))
+             =(cos(b)e^a+i*sin(b)e^a + cos(b)(-e^-a)+i*sin(b)e^-a) /
+              (cos(b)e^a+i*sin(b)e^a + cos(b)e^-a+i*sin(b)(-e^-a))
+             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) /
+              (cos(b)(e^a+e^-a) + i*sin(b)(e^a-e^-a))
+      This is a complex division, so we can multiply by denom_conj/denom_conj
+             =(cos(b)(e^a-e^-a) + i*sin(b)(e^a+e^-a)) *
+              (cos(b)(e^a+e^-a) - i*sin(b)(e^a-e^-a)) /
+              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
+             =(cos(b)^2(e^(2a)-e^(-2a)) + sin(b)^2(e^(2a)-e^(-2a)) +
+               i*(cos(b)sin(b)(e^a+e^-a)^2 - cos(b)sin(b)(e^a-e^-a)^2)) /
+              ((cos(b)(e^a+e^-a))^2 + (sin(b)(e^a-e^-a))^2)
+      */
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      TF_ASSIGN_OR_RETURN(
+          auto exp_a, EmitLibdeviceMathCall("__nv_exp", {a}, {component_type},
+                                            component_type));
+      TF_ASSIGN_OR_RETURN(
+          auto cos_b, EmitLibdeviceMathCall("__nv_cos", {b}, {component_type},
+                                            component_type));
+      TF_ASSIGN_OR_RETURN(
+          auto sin_b, EmitLibdeviceMathCall("__nv_sin", {b}, {component_type},
+                                            component_type));
+      auto exp_neg_a = ir_builder_->CreateFDiv(
+          llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
+      auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
+          ir_builder_->CreateFMul(exp_a, exp_a),
+          ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
+      auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
+      auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
+      auto real_num = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
+          ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
+      auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
+      auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
+      auto exp_a_plus_exp_neg_a_sq =
+          ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
+      auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
+      auto exp_a_minus_exp_neg_a_sq =
+          ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
+      auto imag_num = ir_builder_->CreateFMul(
+          cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
+                                               exp_a_minus_exp_neg_a_sq));
+      auto denom = ir_builder_->CreateFAdd(
+          ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
+          ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
+      return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
+                                ir_builder_->CreateFDiv(imag_num, denom));
+    }
     default:
       return ElementalIrEmitter::EmitComplexUnaryOp(op, operand_value);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index 3defa1b696d3addc012702e23102bb1fa140170d..6a537d015209bc507af36b13eeb5d69ce58d8fea 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -61,6 +61,10 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
       const HloInstruction* op, llvm::Value* lhs_value,
       llvm::Value* rhs_value) const override;
 
+  StatusOr<llvm::Value*> EmitComplexBinaryOp(
+      const HloInstruction* op, llvm::Value* lhs_value,
+      llvm::Value* rhs_value) const override;
+
   StatusOr<llvm::Value*> EmitErfcInv(PrimitiveType prim_type,
                                      llvm::Value* value) const override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
index 983cb872924f22be0dfad8aa9ad86f233b909c46..8c6a1f51a8a09ef78950dfe7e89994a3fe247f49 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h
@@ -52,6 +52,15 @@ class GemmThunk : public Thunk {
       const BufferAllocations& buffer_allocations,
       perftools::gputools::Stream* stream) override;
 
+  // Returns true if we'll perform autotuning if run on the given stream.  If
+  // so, we want the GPU to be quiescent during autotuning, so as not to
+  // introduce noise in our results.
+  bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream* stream) override {
+    return autotune_results_.count(
+               stream->parent()->GetDeviceDescription().name()) != 0;
+  }
+
  private:
   const BufferAllocation::Slice lhs_buffer_;
   const BufferAllocation::Slice rhs_buffer_;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index b5331fe4e2ba34443555e9bf46dfc188cbd6548a..1ccfe323c58422c99fab5efa578be2a1e23e3d1b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
 
 #include <stdlib.h>
+#include <atomic>
 #include <functional>
 #include <utility>
 
@@ -30,17 +31,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_folding.h"
-#include "tensorflow/compiler/xla/service/gpu/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
 #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
@@ -62,10 +64,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
+#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cuda_libdevice_path.h"
@@ -73,6 +77,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/tracing.h"
 
 namespace se = ::perftools::gputools;
 
@@ -85,6 +90,7 @@ namespace gpu {
 
 namespace {
 
+using tensorflow::port::Tracing;
 using tensorflow::strings::StrCat;
 
 // Any address of a variable residing in global memory or returned by one of the
@@ -94,15 +100,13 @@ using tensorflow::strings::StrCat;
 // http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses
 constexpr int64 kMemoryAlignment = 256;
 
-// Returns the directory containing nvvm libdevice files. This function is
-// called in GpuCompiler's constructor, so can't return an error. But
-// GpuCompiler::Compile will return an error when the wanted libdevice file
-// doesn't exist in the folder this function returns.
-string GetLibdeviceDir(const HloModuleConfig& config) {
+// Returns the directory containing nvvm libdevice files.  config_cuda_data_dir
+// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
+// HloModule being compiled.
+string GetLibdeviceDir(const string& config_cuda_data_dir) {
   std::vector<string> potential_libdevice_dirs;
-  const string datadir = config.debug_options().xla_gpu_cuda_data_dir();
-  if (!datadir.empty()) {
-    potential_libdevice_dirs.push_back(datadir);
+  if (!config_cuda_data_dir.empty()) {
+    potential_libdevice_dirs.push_back(config_cuda_data_dir);
   }
   potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot());
 
@@ -123,7 +127,7 @@ string GetLibdeviceDir(const HloModuleConfig& config) {
 
 // Runs optimization passes on the given HLO module.
 tensorflow::Status OptimizeHloModule(
-    HloModule* hlo_module, const se::DeviceDescription& device_desc,
+    HloModule* hlo_module,
     const HloCostAnalysis::ShapeSizeFunction& shape_size_function) {
   {
     HloPassPipeline pipeline("optimization");
@@ -134,7 +138,7 @@ tensorflow::Status OptimizeHloModule(
 
     // TODO(b/64094172): make Call work on GPU instead of inlining.
     pipeline.AddPass<CallInliner>();
-
+    pipeline.AddPass<DotDecomposer>();
     {
       auto& pass =
           pipeline.AddPass<HloPassFix<HloPassPipeline>>("simplification");
@@ -151,6 +155,7 @@ tensorflow::Status OptimizeHloModule(
           /*is_layout_sensitive=*/false,
           [](const Shape&, const Shape&) { return false; });
       pass.AddPass<TupleSimplifier>();
+      pass.AddPass<WhileLoopSimplifier>();
       pass.AddPass<HloDCE>();
       pass.AddPass<ReshapeMover>();
       pass.AddPass<HloConstantFolding>();
@@ -220,66 +225,94 @@ tensorflow::Status PrepareHloModuleForIrEmitting(
   // (and sometime after) copy insertion, to avoid dead code from interfering
   // with the rewrites.
   pipeline.AddPass<HloDCE>();
-  pipeline.AddPass<GpuCopyInsertion>();
-  pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
+  pipeline.AddPass<GpuCopyInsertion>();
   return pipeline.Run(hlo_module).status();
 }
 
-// Invokes the ptxas tool on the given PTX string, and dumps its output.
-void DumpPtxasInfo(const string& ptx, int cc_major, int cc_minor) {
+// Compiles the given PTX string using ptxas and returns the resulting machine
+// code (i.e. a cubin) as a byte array.
+StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
+                                        int cc_minor) {
+  Tracing::TraceMe annotation("Compile PTX", /*is_expensive=*/true);
   const string ptxas_path =
-      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin/ptxas");
-  // Do not log PTX stats if ptxas is not found at the given path.
-  if (!tensorflow::Env::Default()->FileExists(ptxas_path).ok()) {
-    LOG(WARNING)
-        << "Failed to dump PTX stats because ptxas is not found at path \""
-        << ptxas_path << "\".";
-    return;
+      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
+  VLOG(2) << "Using ptxas at " << ptxas_path;
+  auto env = tensorflow::Env::Default();
+  TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
+
+  // Write ptx into a temporary file.
+  string ptx_path;
+  if (!env->LocalTempFilename(&ptx_path)) {
+    return InternalError("couldn't get temp PTX file name");
   }
+  auto ptx_cleaner = tensorflow::gtl::MakeCleanup([&ptx_path] {
+    TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(ptx_path));
+  });
 
-  // Write `ptx` into a temporary file.
-  char tempdir_template[] = "/tmp/ptxXXXXXX";
-  char* tempdir_name = mkdtemp(tempdir_template);
-  CHECK_NOTNULL(tempdir_name);
-  string ptx_path = tensorflow::io::JoinPath(tempdir_name, "ptx");
-  TF_CHECK_OK(
-      tensorflow::WriteStringToFile(tensorflow::Env::Default(), ptx_path, ptx));
-  LOG(INFO) << "ptx file written to: " << ptx_path;
+  TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_path, ptx));
+  VLOG(2) << "ptx written to: " << ptx_path;
 
   // Invoke ptxas and collect its output.
+  string cubin_path;
+  if (!env->LocalTempFilename(&cubin_path)) {
+    return InternalError("couldn't get temp CUBIN file name");
+  }
+  auto cubin_cleaner = tensorflow::gtl::MakeCleanup([&cubin_path] {
+    // CUBIN file may never be created, so the failure to delete it should not
+    // produce TF error.
+    tensorflow::Env::Default()->DeleteFile(cubin_path).IgnoreError();
+  });
   tensorflow::SubProcess ptxas_info_dumper;
-  ptxas_info_dumper.SetProgram(ptxas_path,
-                               {ptxas_path, ptx_path, "-o", "/dev/null", "-v",
-                                StrCat("-arch=sm_", cc_major, cc_minor)});
+  std::vector<string> ptxas_args = {ptxas_path, ptx_path, "-o", cubin_path,
+                                    StrCat("-arch=sm_", cc_major, cc_minor)};
+  if (VLOG_IS_ON(2)) {
+    ptxas_args.push_back("-v");
+  }
+  ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
   ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                      tensorflow::ACTION_PIPE);
   if (!ptxas_info_dumper.Start()) {
-    LOG(ERROR) << "Failed to launch ptxas.";
-    return;
+    return InternalError("Failed to launch ptxas");
   }
   string stderr_output;
   int exit_status = ptxas_info_dumper.Communicate(
       /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
   XLA_LOG_LINES(tensorflow::INFO, stderr_output);
   if (exit_status != 0) {
-    LOG(ERROR) << "ptxas exited with non-zero error code " << exit_status
-               << ".";
+    return InternalError("ptxas exited with non-zero error code %d",
+                         exit_status);
   }
+
+  // Read in the result of compilation and return it as a byte vector.
+  string cubin;
+  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                                  cubin_path, &cubin));
+  std::vector<uint8> cubin_vector(cubin.begin(), cubin.end());
+  return cubin_vector;
 }
 
 }  // namespace
 
 GpuCompiler::GpuCompiler()
-    : pointer_size_(llvm::DataLayout(kDataLayout).getPointerSize()) {}
+    : pointer_size_(llvm::DataLayout(kDataLayout)
+                        .getPointerSize(0 /* default address space */)) {}
+
+StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
+    std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/) {
+  XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunHloPasses");
+  Tracing::TraceMe annotation("HLO Transforms", module->name(),
+                              /*is_expensive=*/true);
+  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(), ShapeSizeBytesFunction()));
+  return std::move(module);
+}
 
-StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
+StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec) {
+  XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend");
+
   TF_RET_CHECK(stream_exec != nullptr);
 
-  TF_RETURN_IF_ERROR(OptimizeHloModule(module.get(),
-                                       stream_exec->GetDeviceDescription(),
-                                       ShapeSizeBytesFunction()));
   TF_RETURN_IF_ERROR(
       PrepareHloModuleForIrEmitting(module.get(), ShapeSizeBytesFunction()));
 
@@ -318,7 +351,7 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   // BufferAssignment::ToString() includes a header, so no need for us to
   // print one ourselves.
   XLA_VLOG_LINES(2, buffer_assignment->ToString());
-
+  XLA_VLOG_LINES(2, module->ToString());
   const string xla_dump_hlo_proto_to =
       module->config().debug_options().xla_dump_hlo_proto_to();
   if (!xla_dump_hlo_proto_to.empty()) {
@@ -334,8 +367,11 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   HloComputation* entry_computation = module->entry_computation();
   IrEmitterUnnested ir_emitter(module->config(), entry_computation,
                                &ir_emitter_context);
-  TF_RETURN_IF_ERROR(
-      entry_computation->root_instruction()->Accept(&ir_emitter));
+  {
+    XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission");
+    TF_RETURN_IF_ERROR(
+        entry_computation->root_instruction()->Accept(&ir_emitter));
+  }
 
   if (user_pre_optimization_hook_) {
     TF_CHECK_OK(user_pre_optimization_hook_(llvm_module));
@@ -359,12 +395,21 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
         /*optimized=*/false));
   }
 
-  // Reserve space for the PTX to be generated for this module.
-  string* ptx;
+  string libdevice_dir;
   {
     tensorflow::mutex_lock lock(mutex_);
-    generated_ptxes_.emplace_back(MakeUnique<string>());
-    ptx = generated_ptxes_.back().get();
+
+    // Find the directory containing libdevice.  To avoid searching for it every
+    // time, we have a one-element cache, keyed on the module's config's
+    // cuda_data_dir.
+    const auto& config_cuda_data_dir =
+        module->config().debug_options().xla_gpu_cuda_data_dir();
+    if (cached_libdevice_dir_.empty() ||
+        cached_cuda_data_dir_ != config_cuda_data_dir) {
+      cached_cuda_data_dir_ = config_cuda_data_dir;
+      cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir);
+    }
+    libdevice_dir = cached_libdevice_dir_;
   }
   int cc_major, cc_minor;
   if (!stream_exec->GetDeviceDescription().cuda_compute_capability(&cc_major,
@@ -374,12 +419,13 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
     cc_major = 2;
     cc_minor = 0;
   }
-  if (libdevice_dir_.empty()) {
-    // Compute libdevice_dir_ just once and cache it in this member.
-    libdevice_dir_ = GetLibdeviceDir(module->config());
+
+  string ptx;
+  {
+    XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - CompileToPtx");
+    TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
+                                          module->config(), libdevice_dir));
   }
-  TF_ASSIGN_OR_RETURN(*ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor},
-                                         module->config(), libdevice_dir_));
 
   if (!ir_dump_directory.empty()) {
     TF_RETURN_IF_ERROR(llvm_ir::DumpIRToDirectory(
@@ -394,20 +440,47 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   VLOG(2) << "LLVM module after optimizations:";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(llvm_module));
   VLOG(2) << "PTX:";
-  XLA_VLOG_LINES(2, *ptx);
-  if (VLOG_IS_ON(2)) {
-    DumpPtxasInfo(*ptx, cc_major, cc_minor);
+  XLA_VLOG_LINES(2, ptx);
+
+  // Write PTX to IR dump directory, if IR dumping was requested.
+  if (!ir_dump_directory.empty()) {
+    const string ptx_outfile = tensorflow::io::JoinPath(
+        ir_dump_directory, StrCat(module->name(), ".ptx"));
+    auto status = [&] {
+      auto* env = tensorflow::Env::Default();
+      TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(ir_dump_directory));
+      TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, ptx_outfile, ptx));
+      return Status::OK();
+    }();
+    if (!status.ok()) {
+      LOG(WARNING) << "Couldn't dump PTX for module " << module->name()
+                   << " to " << ptx_outfile << ": " << status;
+    }
   }
 
+  const std::vector<uint8> cubin =
+      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor);
+
   auto thunk_schedule = MakeUnique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
       hlo_schedule->ThunkLaunchOrder());
   VLOG(2) << "Printing the thunk schedule...";
   XLA_VLOG_LINES(2, thunk_schedule->ToString());
 
-  auto* gpu_executable =
-      new GpuExecutable(*ptx, std::move(thunk_schedule), std::move(module),
-                        std::move(buffer_assignment), ShapeSizeBytesFunction());
+  std::unique_ptr<HloProfileIndexMap> profile_index_map;
+  std::unique_ptr<HloProfilePrinter> profile_printer;
+
+  if (module->config().hlo_profiling_enabled()) {
+    HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
+    profile_index_map = MakeUnique<HloProfileIndexMap>(*module);
+    profile_printer =
+        CreateHloProfilePrinter(*profile_index_map, cost_analysis);
+  }
+
+  auto* gpu_executable = new GpuExecutable(
+      ptx, cubin, {cc_major, cc_minor}, std::move(thunk_schedule),
+      std::move(module), std::move(buffer_assignment),
+      std::move(profile_printer), std::move(profile_index_map));
   if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
@@ -415,11 +488,75 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::Compile(
   return std::unique_ptr<Executable>(gpu_executable);
 }
 
-StatusOr<std::vector<std::unique_ptr<Executable>>> GpuCompiler::Compile(
-    std::vector<std::unique_ptr<HloModule>> modules,
-    std::vector<std::vector<se::StreamExecutor*>> stream_execs) {
-  return Unimplemented(
-      "Compilation of multiple HLO modules is not yet supported on GPU.");
+std::vector<uint8> GpuCompiler::CompilePtxOrGetCachedResult(const string& ptx,
+                                                            int cc_major,
+                                                            int cc_minor) {
+  XLA_SCOPED_LOGGING_TIMER("GpuCompiler::CompilePtxOrGetCachedResult");
+  Tracing::TraceMe annotation("PTX->CUBIN", /*is_expensive=*/true);
+  bool inserted;
+  decltype(compilation_cache_.begin()) iter;
+  // Pointers into compilation_cache_ where the ptx and (optional) cubin are
+  // stored.
+  const string* cache_ptx = nullptr;
+  CompilationCacheValue* cache_value = nullptr;
+
+  {
+    tensorflow::mutex_lock lock(mutex_);
+    std::tie(iter, inserted) = compilation_cache_.emplace(
+        std::piecewise_construct,
+        std::forward_as_tuple(ptx, cc_major, cc_minor),
+        std::forward_as_tuple());
+    cache_ptx = &iter->first.ptx;
+    cache_value = &iter->second;
+  }
+
+  // Compile the ptx if it wasn't in the cache before we called this function.
+  // Other threads asking for the same compilation key will block on
+  // cache_value->mutex_ until compilation is done.
+  {
+    tensorflow::mutex_lock lock(cache_value->mutex_);
+    if (inserted) {
+      CHECK(!cache_value->compilation_done);
+      if (!ptx.empty()) {
+        StatusOr<std::vector<uint8>> maybe_cubin =
+            CompilePtx(*cache_ptx, cc_major, cc_minor);
+        if (maybe_cubin.ok()) {
+          cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
+          VLOG(2) << "Compiled PTX size:" << ptx.size()
+                  << " CUBIN size: " << cache_value->cubin_data.size();
+        } else {
+          bool log_warning = true;
+          if (maybe_cubin.status().code() ==
+              tensorflow::error::Code::NOT_FOUND) {
+            // Missing ptxas is expected in some environments where CUDA SDK
+            // binaries are not available. We don't want to spam logs with
+            // identical warnings in this case.
+
+            // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N
+            // for more general usage.
+            static std::atomic<bool> warning_done(false);
+            log_warning = !warning_done.exchange(true);
+          }
+          if (log_warning) {
+            LOG(WARNING)
+                << "Failed to compile ptx to cubin.  Will attempt to let "
+                   "GPU driver compile the ptx. "
+                << maybe_cubin.status();
+          }
+        }
+      }
+      cache_value->compilation_done = true;
+      cache_value->compilation_done_cv_.notify_all();
+    } else {
+      while (!cache_value->compilation_done) {
+        cache_value->compilation_done_cv_.wait(lock);
+      }
+    }
+  }
+
+  CHECK(cache_value != nullptr);
+  CHECK(cache_value->compilation_done);
+  return cache_value->cubin_data;
 }
 
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
index 58e835e5ee3f77b7b5cb3579514b7501bed2a2a1..18e34340205b6f51497e26c45520799d21c55a46 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h
@@ -26,6 +26,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -40,14 +42,20 @@ class GpuCompiler : public LLVMCompiler {
   GpuCompiler();
   ~GpuCompiler() override {}
 
-  StatusOr<std::unique_ptr<Executable>> Compile(
+  // Bring in
+  // StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
+  //     std::vector<std::unique_ptr<HloModule>> modules,
+  //     std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+  //        stream_execs)
+  using LLVMCompiler::Compile;
+
+  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
-  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
-      std::vector<std::unique_ptr<HloModule>> modules,
-      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
-          stream_execs) override;
+  StatusOr<std::unique_ptr<Executable>> RunBackend(
+      std::unique_ptr<HloModule> module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> module,
@@ -71,17 +79,72 @@ class GpuCompiler : public LLVMCompiler {
   static const char* kDataLayout;
 
  private:
-  // The parent directory of libdevice IR libraries.
-  string libdevice_dir_;
+  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
+  const int64 pointer_size_;
 
-  // The list of PTX strings generated by this GpuCompiler. We let GpuCompiler
-  // to own them because they need to be alive across the life span of the
-  // StreamExecutor (b/24776264).
   tensorflow::mutex mutex_;
-  std::vector<std::unique_ptr<string>> generated_ptxes_ GUARDED_BY(mutex_);
 
-  // The size in bytes of a pointer. Used by ShapeSizeBytesFunction.
-  int64 pointer_size_;
+  // When compiling an HLO module, we need to find a path to the nvvm libdevice
+  // files.  We search in the module's config.debug_options().cuda_data_dir()
+  // and in tensorflow::LibdeviceRoot(), the latter of which is a constant.
+  //
+  // We cache the cuda_data_dir() and the result of our search, so that if the
+  // next module we have to compile has the same cuda_data_dir(), we can skip
+  // the search.
+  string cached_cuda_data_dir_ GUARDED_BY(mutex_);
+  string cached_libdevice_dir_ GUARDED_BY(mutex_);
+
+  // Tries to compile the given ptx string to cubin.  Returns a vector with the
+  // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
+  std::vector<uint8> CompilePtxOrGetCachedResult(const string& ptx,
+                                                 int cc_major, int cc_minor);
+
+  // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
+  // -> cubin so we don't recompile the same ptx twice.  This is important for
+  // some interactive workflows.  (We also cache at the HLO level, but sometimes
+  // we can't realize that two modules are the same until we lower to ptx.)
+  //
+  // Compilation of distinct PTX happens in parallel. If more than one thread
+  // attempts to compile the same PTX, the fist thread to obtain
+  // cache_value_->mutex_ performs the compilation. The rest wait() on
+  // cache_value_->compilation_done_cv_ until the compilation is done.
+  //
+  // If compiling the ptx fails, we return an empty cubin, cross our fingers,
+  // and leave compilation up to the driver.
+  struct CompilationCacheKey {
+    CompilationCacheKey(std::string ptx, int cc_major, int cc_minor)
+        : ptx(std::move(ptx)), cc_major(cc_major), cc_minor(cc_minor) {}
+    string ptx;
+    int cc_major;
+    int cc_minor;
+  };
+  struct CompilationCacheHash {
+    size_t operator()(const CompilationCacheKey& key) const {
+      return tensorflow::Hash64Combine(
+          tensorflow::Hash64Combine(tensorflow::Hash64(key.ptx), key.cc_major),
+          key.cc_minor);
+    }
+  };
+  struct CompilationCacheEq {
+    size_t operator()(const CompilationCacheKey& a,
+                      const CompilationCacheKey& b) const {
+      return a.cc_major == b.cc_major && a.cc_minor == b.cc_minor &&
+             a.ptx == b.ptx;
+    }
+  };
+  struct CompilationCacheValue {
+    bool compilation_done = false;
+    std::vector<uint8> cubin_data;
+    // mutex and condition variable to serialize compilation completing.
+    tensorflow::mutex mutex_;
+    tensorflow::condition_variable compilation_done_cv_;
+  };
+
+  // Don't even think about switching this to FlatMap; iterator stability is
+  // critical here.
+  std::unordered_map<CompilationCacheKey, CompilationCacheValue,
+                     CompilationCacheHash, CompilationCacheEq>
+      compilation_cache_ GUARDED_BY(mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(GpuCompiler);
 };
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33d739b79d3664fec3586bbc924b7fa2e10d3256
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h"
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
+#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace gpu {
+
+StatusOr<HloInstruction*> GpuCopyInsertion::FindOrInsertCopy(
+    HloInstruction* hlo) {
+  HloInstruction*& copy = inserted_copies_[hlo];
+  if (copy == nullptr) {
+    TF_ASSIGN_OR_RETURN(copy, hlo->parent()->DeepCopyInstruction(hlo));
+  }
+  return copy;
+}
+
+StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
+  CopyInsertion generic_copy_insertion;
+
+  TF_ASSIGN_OR_RETURN(bool changed, generic_copy_insertion.Run(module));
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
+                      HloDataflowAnalysis::Run(module));
+
+  // Make sure all operands of a library call are in memory instead of constants
+  // in IR.
+  for (HloInstruction* hlo :
+       module->entry_computation()->MakeInstructionPostOrder()) {
+    if (ImplementedAsLibraryCall(*hlo)) {
+      for (int64 i = 0; i < hlo->operand_count(); ++i) {
+        HloInstruction* operand = hlo->mutable_operand(i);
+        TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
+        const auto& values = dataflow->GetValueSet(operand).values();
+        if (std::any_of(values.begin(), values.end(),
+                        [](const HloValue* value) {
+                          return value->defining_instruction()->opcode() ==
+                                 HloOpcode::kConstant;
+                        })) {
+          TF_ASSIGN_OR_RETURN(HloInstruction * copy, FindOrInsertCopy(operand));
+          TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(i, copy));
+          changed = true;
+        }
+      }
+    }
+  }
+
+  // Init values of a while node cannot be constants. Insert copies for any
+  // constants found at the operand of a while.
+  tensorflow::gtl::FlatSet<HloInstruction*> copied_constants;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() != HloOpcode::kWhile) {
+        continue;
+      }
+      for (auto& pair :
+               dataflow->GetInstructionValueSet(instruction->operand(0))) {
+        const HloValueSet& value_set = pair.second;
+        for (const HloValue* value : value_set.values()) {
+          if (value->defining_instruction()->opcode() ==
+              HloOpcode::kConstant &&
+              !ContainsKey(copied_constants, value->defining_instruction())) {
+            HloInstruction* constant = value->defining_instruction();
+            TF_ASSIGN_OR_RETURN(HloInstruction * copy,
+                                FindOrInsertCopy(constant));
+            TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
+            copied_constants.insert(constant);
+            changed = true;
+          }
+        }
+      }
+    }
+  }
+
+  // The GPU backend needs additional copies added due to deficiencies in
+  // buffer assignment.
+  TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed,
+                      CopyInsertion::AddCopiesForBufferAssignment(module));
+
+  return changed || buffer_assignment_changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/copy_insertion.h b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
similarity index 56%
rename from tensorflow/compiler/xla/service/gpu/copy_insertion.h
rename to tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
index 11077dad2e5506eab4fa84d47ad13a26ed1c035a..4d77f337e6eb20f7d79acc0829fde26bbe443f25 100644
--- a/tensorflow/compiler/xla/service/gpu/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_COPY_INSERTION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_COPY_INSERTION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COPY_INSERTION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COPY_INSERTION_H_
 
-#include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 
 namespace xla {
 namespace gpu {
@@ -25,12 +25,23 @@ namespace gpu {
 // Besides the modifications made by the generic xla::CopyInsertion, this
 // GPU-specific copy insertion also materializes operands of library calls by
 // inserting kCopy instructions.
-class GpuCopyInsertion : public CopyInsertion {
+class GpuCopyInsertion : public HloPassInterface {
  public:
+  tensorflow::StringPiece name() const override { return "copy-insertion"; }
+
   StatusOr<bool> Run(HloModule* module) override;
+
+ protected:
+  // Returns a copy of `hlo`. Looks in inserted_copies_ first to avoid making
+  // duplicate copies.
+  StatusOr<HloInstruction*> FindOrInsertCopy(HloInstruction* hlo);
+
+  // A map containing all copies inserted to materialize operands of library
+  // calls. The key is the copied instruction and the value is the copy.
+  tensorflow::gtl::FlatMap<HloInstruction*, HloInstruction*> inserted_copies_;
 };
 
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_COPY_INSERTION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_COPY_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 2c4d5150741d75ec2d1cb7e3d41c07ad24f800b0..21e9fc96f61c4f84490fb4d21748e58272564048 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -88,7 +88,7 @@ class HloExecutionProfiler {
     if (do_profile_) {
       stream_->ThenStopTimer(per_op_timer_.get());
       stream_->BlockHostUntilDone();
-      profile_->AddProfileResult(
+      profile_->SetCyclesTakenBy(
           hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
     }
   }
@@ -108,16 +108,20 @@ class HloExecutionProfiler {
 // Implementation note: HLO profiling is always enabled for GPU executables,
 // since we can use timers around thunks.
 GpuExecutable::GpuExecutable(
-    tensorflow::StringPiece ptx,
+    const string& ptx, const std::vector<uint8>& cubin,
+    std::pair<int, int> compute_capability,
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
     std::unique_ptr<const HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
-    HloCostAnalysis::ShapeSizeFunction shape_size_function)
-    : Executable(std::move(hlo_module)),
+    std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
+    : Executable(std::move(hlo_module), std::move(hlo_profile_printer),
+                 std::move(hlo_profile_index_map)),
       ptx_(ptx),
+      cubin_(cubin),
+      compute_capability_(compute_capability),
       thunk_schedule_(std::move(thunk_schedule)),
-      assignment_(std::move(assignment)),
-      shape_size_function_(std::move(shape_size_function)) {}
+      assignment_(std::move(assignment)) {}
 
 Status GpuExecutable::ExecuteThunks(
     const ServiceExecutableRunOptions* run_options,
@@ -125,6 +129,16 @@ Status GpuExecutable::ExecuteThunks(
     HloExecutionProfile* hlo_execution_profile) {
   se::Stream* main_stream = run_options->stream();
 
+  std::pair<int, int> stream_compute_compatibility;
+  main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
+      &stream_compute_compatibility.first,
+      &stream_compute_compatibility.second);
+  TF_RET_CHECK(stream_compute_compatibility == compute_capability_)
+      << "Compute capability mismatch; expected {" << compute_capability_.first
+      << ", " << compute_capability_.second << "}, but was {"
+      << stream_compute_compatibility.first << ", "
+      << stream_compute_compatibility.second << "}";
+
   bool do_profile = hlo_execution_profile != nullptr;
   if (do_profile) {
     LOG(WARNING) << "PROFILING: profiling is enabled";
@@ -153,9 +167,16 @@ Status GpuExecutable::ExecuteThunks(
       stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
     }
 
+    // If this thunk requests it, wait for all currently-executing thunks to
+    // finish.  This is useful e.g. if the thunk is about to perform autotuning.
+    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
+      main_stream->BlockHostUntilDone();
+    }
+
     profiler.StartOperation();
     VLOG(2) << "Executing the thunk for "
-            << thunk->hlo_instruction()->ToString();
+            << thunk->hlo_instruction()->ToString() << " on stream "
+            << stream_no;
     TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
     if (thunk_schedule_->Depended(thunk)) {
       auto finish_event = MakeUnique<se::Event>(main_stream->parent());
@@ -345,9 +366,5 @@ const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
       module().entry_computation()->root_instruction());
 }
 
-std::unique_ptr<HloCostAnalysis> GpuExecutable::CreateCostAnalysis() const {
-  return MakeUnique<HloCostAnalysis>(shape_size_function_);
-}
-
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 748a8f521bc5293d58de19ab52f4bdecec6cb1e5..e7307e07c0b5608e31f15597d31d11c50f81c6d5 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -47,11 +47,15 @@ namespace gpu {
 // This is an immutable data type after initialization, and thus thread safe.
 class GpuExecutable : public Executable {
  public:
-  GpuExecutable(tensorflow::StringPiece ptx,
+  // cubin (i.e. the compiled ptx) may be empty, in which case we leave
+  // compilation up to the GPU driver.
+  GpuExecutable(const string& ptx, const std::vector<uint8>& cubin,
+                std::pair<int, int> compute_capability,
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
                 std::unique_ptr<const HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
-                HloCostAnalysis::ShapeSizeFunction shape_size_function);
+                std::unique_ptr<HloProfilePrinter> hlo_profile_printer,
+                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
 
   // This should be called after set_ir_module_string.
   const string& ir_module_string() const { return ir_module_string_; }
@@ -64,6 +68,13 @@ class GpuExecutable : public Executable {
   // Returns the compiled PTX for the computation.
   tensorflow::StringPiece ptx() const { return ptx_; }
 
+  // Returns the cubin (compiled PTX) stored in this GpuExecutable.  May be
+  // empty, in which case compilation is left up to the GPU driver.
+  const std::vector<uint8>& cubin() const { return cubin_; }
+
+  // Both overloads of ExecuteOnStream will fail if the compute capability of
+  // the stream doesn't match the compute capability passed to this object's
+  // constructor.
   StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
@@ -85,8 +96,6 @@ class GpuExecutable : public Executable {
     return Unimplemented("Equality test on GPU executable is not implemented.");
   }
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
  private:
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
@@ -110,8 +119,17 @@ class GpuExecutable : public Executable {
   // This string should be modified only before ExecuteOnStream.
   string ir_module_string_;
 
-  // The reference to the compiled PTX for the computation.
-  const tensorflow::StringPiece ptx_;
+  // The PTX for the computation.
+  const string ptx_;
+
+  // The GPU machine code for the computation, targeting GPUs at
+  // compute_capability_.
+  //
+  // May be empty, in which case we leave compilation up to the GPU driver.
+  const std::vector<uint8> cubin_;
+
+  // The compute capability of the GPU we're targeting with this GpuExecutable.
+  std::pair<int, int> compute_capability_;
 
   // The thunks to be invoked by this GpuExecutable. They are generated by the
   // IrEmitter.
@@ -121,9 +139,6 @@ class GpuExecutable : public Executable {
   // memory for every output/temp buffers.
   const std::unique_ptr<const BufferAssignment> assignment_;
 
-  // Function to compute the size of a given Shape, in bytes.
-  const HloCostAnalysis::ShapeSizeFunction shape_size_function_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
similarity index 93%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment.cc
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 0bbd63fb7bfc657cb7bb1de673253c198f5bd25f..50a249f448e7b4956e7bf6bd603d256eca88f71d 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 
 #include <memory>
 
@@ -80,9 +80,9 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       const ConvolutionDimensionNumbers& dimension_numbers =
           instruction->convolution_dimension_numbers();
       std::vector<int64> input_layout;
-      for (int i = dimension_numbers.spatial_dimensions_size() - 1; i >= 0;
-           --i) {
-        input_layout.push_back(dimension_numbers.spatial_dimensions(i));
+      for (int i = dimension_numbers.input_spatial_dimensions_size() - 1;
+           i >= 0; --i) {
+        input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
       }
       input_layout.push_back(dimension_numbers.input_feature_dimension());
       input_layout.push_back(dimension_numbers.input_batch_dimension());
@@ -102,9 +102,9 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);
 
       std::vector<int64> output_layout;
-      for (int i = dimension_numbers.spatial_dimensions_size() - 1; i >= 0;
-           --i) {
-        output_layout.push_back(dimension_numbers.spatial_dimensions(i));
+      for (int i = dimension_numbers.output_spatial_dimensions_size() - 1;
+           i >= 0; --i) {
+        output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
       }
       output_layout.push_back(dimension_numbers.output_feature_dimension());
       output_layout.push_back(dimension_numbers.output_batch_dimension());
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
similarity index 86%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment.h
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
index 169041eb85c633cb4f1f679bcea127714828308f..7655a3ebf45f83c0125a4257baae7a7229ebdc6d 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
 
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -38,4 +38,4 @@ class GpuLayoutAssignment : public LayoutAssignment {
 }  // namespace gpu
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_LAYOUT_ASSIGNMENT_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_LAYOUT_ASSIGNMENT_H_
diff --git a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
similarity index 97%
rename from tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc
rename to tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index ac206b89d329d7e4ac91ee51162c9694f6899d78..f68b23c8ce969372a01ce77840e016d82ca5d2ed 100644
--- a/tensorflow/compiler/xla/service/gpu/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index f0f036f7f381db15b84db85d3efeec5d8141884e..4cf49d4a723fd2223564afb86f003901f9712b39 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -44,7 +44,7 @@ GpuTransferManager::GpuTransferManager()
     : GenericTransferManager(
           se::cuda::kCudaPlatformId,
           /*pointer_size=*/llvm::DataLayout(gpu::GpuCompiler::kDataLayout)
-              .getPointerSize()) {}
+              .getPointerSize(0 /* default address space */)) {}
 
 Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor,
                                                    const Literal& literal) {
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 163a161353fdb90cee2968269d572b8414855551..c2115c49993ef71c4b6dd584e7e0498807666613 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -166,11 +166,46 @@ void HloToIrBindings::BindHloToIrValue(const HloInstruction& hlo,
   *(base_ptrs_[&hlo].mutable_element(shape_index)) = typed_ir_value;
 }
 
+// Determines whether hlo's buffers are never modified within the execution of
+// consumer.
+static bool BuffersInvariantWithinConsumer(
+    const HloInstruction& hlo, const HloInstruction& consumer,
+    const BufferAssignment* buffer_assignment) {
+  // Check if consumer is inside a fusion node -- if so, "dereference" it until
+  // we get to a non-fusion node.
+  const HloInstruction* c = &consumer;
+  while (c->IsFused()) {
+    c = c->parent()->FusionInstruction();
+  }
+
+  // If, after dereferencing c, we end up with a node that's not inside our
+  // module's top-level computation (say our node is inside a while loop), we
+  // give up on marking array as invariant, because this HLO may be run multiple
+  // times (e.g. multiple while loop iterations, or multiple invocations of a
+  // reducer's computation).  TODO(jlebar): We could relax this constraint if we
+  // emitted an llvm.invariant.group.barrier at the end of the computation.
+  return c->parent() == c->GetModule()->entry_computation() &&
+         buffer_assignment->HaveDisjointSlices(&hlo, &consumer);
+}
+
 llvm_ir::IrArray HloToIrBindings::GetIrArray(const HloInstruction& hlo,
+                                             const HloInstruction& consumer,
                                              const ShapeIndex& shape_index) {
   llvm_ir::IrArray ir_array(GetBasePointer(hlo, shape_index),
                             ShapeUtil::GetSubshape(hlo.shape(), shape_index));
   alias_analysis_.AddAliasingInformationToIrArray(hlo, &ir_array);
+
+  // The GPU backend emits one kernel per top-level HLO, and LLVM views
+  // execution of one kernel as the "whole program" executed on the GPU.
+  // Therefore if hlo's output buffer is not modified within consumer, and if
+  // consumer runs hlo only once (so that it doesn't create two different
+  // outputs), then we can mark ir_array as invariant over the whole program.
+  if (BuffersInvariantWithinConsumer(hlo, consumer, buffer_assignment_)) {
+    VLOG(2) << "Marking " << hlo.name() << " as invariant within "
+            << consumer.name();
+    ir_array.MarkInvariantOverWholeProgram(&module_->getContext());
+  }
+
   return ir_array;
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index a3120f15bcbfb0f2f0bfbd806e7a4ff05316d5dd..62ae1769a1f2fb3b9acaf35bdf18a793232500b0 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -76,8 +76,15 @@ class HloToIrBindings {
     return it->second.element(shape_index);
   }
 
-  // Return the underlying IrArray of the output of the given instruction.
+  // Returns the IrArray which contains the output of hlo.
+  //
+  // consumer is the HLO in which this IrArray is used -- we use this to (try
+  // to) add metadata indicating that the array is invariant within consumer.
+  //
+  // To get the buffer into which hlo should write its own output, call
+  // GetIrArray(hlo, hlo).
   llvm_ir::IrArray GetIrArray(const HloInstruction& hlo,
+                              const HloInstruction& consumer,
                               const ShapeIndex& shape_index = {});
 
  private:
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 9a4bfd0905bb62c02c70e7f2eea46872c07bca89..1d47ffde4331868cbc8a8afb2d01b11e77a7fab0 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -156,8 +156,10 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfConvolutionUnfused) {
   conv_dnums.set_output_batch_dimension(0);
   conv_dnums.set_input_feature_dimension(1);
   conv_dnums.set_output_feature_dimension(1);
-  conv_dnums.add_spatial_dimensions(2);
-  conv_dnums.add_spatial_dimensions(3);
+  conv_dnums.add_input_spatial_dimensions(2);
+  conv_dnums.add_output_spatial_dimensions(2);
+  conv_dnums.add_input_spatial_dimensions(3);
+  conv_dnums.add_output_spatial_dimensions(3);
   conv_dnums.set_kernel_output_feature_dimension(0);
   conv_dnums.set_kernel_input_feature_dimension(1);
   conv_dnums.add_kernel_spatial_dimensions(2);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 8fb7a6adda9dc7c36eb9aabcbcdc9d77e6c22c4a..658fd05cd4b63c923d21b4a1de16468c0aeec65d 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -100,7 +100,7 @@ bool ImplementedAsDnnConvolution(const HloInstruction& hlo) {
   if (hlo.opcode() == HloOpcode::kConvolution) {
     const ConvolutionDimensionNumbers& dnums =
         hlo.convolution_dimension_numbers();
-    if (dnums.spatial_dimensions_size() > 3) {
+    if (dnums.input_spatial_dimensions_size() > 3) {
       return false;
     }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 57a3f713e35b506ad9d5caab1ced2c7b74f8efcf..f64e93024fe134e585411f555810711763f6fcb5 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -68,7 +68,8 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
   ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
   for (const HloInstruction* operand : hlo->operands()) {
     operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
-      return GetIrArray(*operand).EmitReadArrayElement(index, &ir_builder_);
+      return GetIrArray(*operand, *hlo)
+          .EmitReadArrayElement(index, &ir_builder_);
     };
   }
   return EmitTargetElementLoop(
@@ -128,16 +129,25 @@ Status IrEmitter::HandleSend(HloInstruction*) {
   return Unimplemented("Send is not implemented on GPU");
 }
 
+Status IrEmitter::HandleSendDone(HloInstruction*) {
+  return Unimplemented("Send-Done is not implemented on GPU");
+}
+
 Status IrEmitter::HandleRecv(HloInstruction*) {
   return Unimplemented("Recv is not implemented on GPU");
 }
 
+Status IrEmitter::HandleRecvDone(HloInstruction*) {
+  return Unimplemented("Recv-done is not implemented on GPU");
+}
+
 Status IrEmitter::HandleTuple(HloInstruction* tuple) {
   std::vector<llvm::Value*> base_ptrs;
   for (const HloInstruction* operand : tuple->operands()) {
     base_ptrs.push_back(GetBasePointer(*operand));
   }
-  llvm_ir::EmitTuple(GetIrArray(*tuple), base_ptrs, &ir_builder_, module_);
+  llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &ir_builder_,
+                     module_);
   return Status::OK();
 }
 
@@ -163,7 +173,7 @@ Status IrEmitter::EmitCallToNestedComputation(
   return Status::OK();
 }
 
-bool IrEmitter::MaybeEmitSpecialAtomicOperation(
+bool IrEmitter::MaybeEmitDirectAtomicOperation(
     const HloComputation& computation, llvm::Value* output_address,
     llvm::Value* source_address) {
   CHECK_EQ(2, computation.num_parameters());
@@ -223,101 +233,189 @@ bool IrEmitter::MaybeEmitSpecialAtomicOperation(
   return false;
 }
 
-Status IrEmitter::EmitAtomicOperationForNestedComputation(
-    const HloComputation& computation, llvm::Value* output_address,
-    llvm::Value* source_address) {
-  if (computation.num_parameters() != 2) {
-    // TODO(b/30258929): We only accept binary computations so far.
-    return Unimplemented(
-        "We only support atomic functions with exactly two parameters, but "
-        "computation %s has %lld.",
-        computation.name().c_str(), computation.num_parameters());
-  }
-
-  if (MaybeEmitSpecialAtomicOperation(computation, output_address,
-                                      source_address)) {
-    return Status::OK();
-  }
+// Implements atomic binary operations using atomic compare-and-swap
+// (atomicCAS) as follows:
+//   1. Reads the value from the memory pointed to by output_address and
+//     records it as old_output.
+//   2. Uses old_output as one of the source operand to perform the binary
+//     operation and stores the result in new_output.
+//   3. Calls atomicCAS which implements compare-and-swap as an atomic
+//     operation. In particular, atomicCAS reads the value from the memory
+//     pointed to by output_address, and compares the value with old_output. If
+//     the two values equal, new_output is written to the same memory location
+//     and true is returned to indicate that the atomic operation succeeds.
+//     Otherwise, the new value read from the memory is returned. In this case,
+//     the new value is copied to old_output, and steps 2. and 3. are repeated
+//     until atomicCAS succeeds.
+//
+// On Nvidia GPUs, atomicCAS can only operate on 32 bit and 64 bit integers. If
+// the element type of the binary operation is 32 bits or 64 bits, the integer
+// type of the same size is used for the atomicCAS operation. On the other hand,
+// if the element type is smaller than 32 bits, int32 is used for the atomicCAS
+// operation. In this case, atomicCAS reads and writes 32 bit values from
+// the memory, which is larger than the memory size required by the original
+// atomic binary operation. We mask off the last two bits of the output_address
+// and use the result as an address to read the 32 bit values from the memory.
+// This can avoid out of bound memory accesses if tensor buffers are 4 byte
+// aligned and have a size of 4N, an assumption that the runtime can guarantee.
+//
+// The pseudo code is shown below. Variables *_address are pointers to a memory
+// region with a size equal to the size of the atomicCAS operation, with the
+// exception that new_output_address is a pointer to a memory region with a size
+// equal to the element size of the binary operation.
+//
+//   element_size = sizeof(element_type);
+//   atomic_size = max(32, element_size);
+//   cas_new_output_address = alloca(atomic_size);
+//   cas_old_output_address = alloca(atomic_size);
+//   if (atomic_size != element_size) {
+//     atomic_address = output_address & ((int64)(-2));
+//     new_output_address = cas_new_output_address + (output_address & 3);
+//   } else {
+//     atomic_address = output_address;
+//     new_output_address = cas_new_output_address;
+//   }
+//
+//   *cas_old_output_address = *atomic_address;
+//   do {
+//     *cas_new_output_address = *cas_old_output_address;
+//     *new_output_address = operation(*new_output_address, *source_address);
+//     (*cas_old_output_address, success) =
+//       atomicCAS(atomic_address, *cas_old_output_address,
+//       *cas_new_output_address);
+//   } while (!success);
+//
+Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
+                                              llvm::Value* output_address,
+                                              llvm::Value* source_address) {
+  llvm::PointerType* output_address_type =
+      llvm::dyn_cast<llvm::PointerType>(output_address->getType());
+  CHECK_NE(output_address_type, nullptr);
+
+  // element_type is the data type for the binary operation.
+  llvm::Type* element_type = output_address_type->getPointerElementType();
+  int element_size = llvm_ir::GetSizeInBits(element_type);
+  llvm::Type* element_address_type = element_type->getPointerTo();
+
+  int atomic_size = (element_size < 32) ? 32 : element_size;
+  llvm::Type* atomic_type = ir_builder_.getIntNTy(atomic_size);
+  llvm::Type* atomic_address_type =
+      atomic_type->getPointerTo(output_address_type->getPointerAddressSpace());
+
+  // cas_old_output_address and cas_new_output_address point to the scratch
+  // memory where we store the old and new values for the repeated atomicCAS
+  // operations.
+  llvm::Value* cas_old_output_address = ir_builder_.CreateAlloca(
+      atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address");
+  llvm::Value* cas_new_output_address = ir_builder_.CreateAlloca(
+      atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address");
 
-  // Other binary computations can be made atomic as following (labels are basic
-  // block names used in the IR emitting code later).
-  //
-  // atomic_op_loop_preheader:
-  //   ...
-  //   source = *source_address;
-  //   old_output = *output_address;
-  //   do {
-  // atomic_op_loop_body_entry:
-  //     new_output = computation(old_output, source);
-  //     (old_output, success) =
-  //         atomicCAS(output_address, old_output, new_output);
-  //   } while (!success);
-  //
-  // atomic_op_loop_exit:
-  //   ...
-  //
-  // TODO(jingyue): Consider encapsulate the logic of emitting control flow to
-  // something similar to llvm_ir::ForLoop.
-  //
   // Emit preparation code to the preheader.
   llvm::BasicBlock* loop_preheader_bb = ir_builder_.GetInsertBlock();
-  llvm::Type* element_ir_type =
-      output_address->getType()->getPointerElementType();
-  // old_output = *output_address;
-  llvm::Value* old_output_location = ir_builder_.CreateAlloca(
-      element_ir_type, /*ArraySize=*/nullptr, "old_output_location");
-  ir_builder_.CreateStore(ir_builder_.CreateLoad(output_address, "old_output"),
-                          old_output_location);
+
+  llvm::Value* atomic_memory_address;
+  // binop_output_address points to the scratch memory that stores the
+  // result of the binary operation.
+  llvm::Value* binop_output_address;
+  if (element_size < 32) {
+    // Assume the element size is an integer number of bytes.
+    CHECK_EQ((element_size % sizeof(char)), 0);
+    llvm::Type* address_int_type =
+        module_->getDataLayout().getIntPtrType(output_address_type);
+    atomic_memory_address =
+        ir_builder_.CreatePtrToInt(output_address, address_int_type);
+    llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3);
+    llvm::Value* offset = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    mask = llvm::ConstantInt::get(address_int_type, -2);
+    atomic_memory_address = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    atomic_memory_address =
+        ir_builder_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
+    binop_output_address = ir_builder_.CreateAdd(
+        ir_builder_.CreatePtrToInt(cas_new_output_address, address_int_type),
+        offset);
+    binop_output_address =
+        ir_builder_.CreateIntToPtr(binop_output_address, element_address_type);
+  } else {
+    atomic_memory_address =
+        ir_builder_.CreateBitCast(output_address, atomic_address_type);
+    binop_output_address =
+        ir_builder_.CreateBitCast(cas_new_output_address, element_address_type);
+  }
+
+  // Use the value from the memory that atomicCAS operates on to initialize
+  // cas_old_output.
+  llvm::Value* cas_old_output =
+      ir_builder_.CreateLoad(atomic_memory_address, "cas_old_output");
+  ir_builder_.CreateStore(cas_old_output, cas_old_output_address);
+
   llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock(
       ir_builder_.GetInsertPoint(), "atomic_op_loop_exit");
-
-  // Emit the body of the loop that repeatedly invokes atomicCAS.
   llvm::BasicBlock* loop_body_bb =
       llvm::BasicBlock::Create(ir_builder_.getContext(), "atomic_op_loop_body",
                                ir_builder_.GetInsertBlock()->getParent());
   ir_builder_.SetInsertPoint(loop_body_bb);
   // Change preheader's successor from loop_exit_bb to loop_body_bb.
   loop_preheader_bb->getTerminator()->setSuccessor(0, loop_body_bb);
-  // new_output = computation(old_output, source);
-  llvm::Value* new_output_location = ir_builder_.CreateAlloca(
-      element_ir_type, /*ArraySize=*/nullptr, "new_output_location");
+
+  // Emit the body of the loop that repeatedly invokes atomicCAS.
+  //
+  // Use cas_old_output to initialize cas_new_output.
+  cas_old_output =
+      ir_builder_.CreateLoad(cas_old_output_address, "cas_old_output");
+  ir_builder_.CreateStore(cas_old_output, cas_new_output_address);
+  // Emits code to calculate new_output = operation(old_output, source);
   TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-      computation, {old_output_location, source_address}, new_output_location));
-
-  // (old_output, success) = atomicCAS(output_address, old_output, new_output);
-  llvm::Type* element_int_ir_type =
-      ir_builder_.getIntNTy(element_ir_type->getScalarSizeInBits());
-  // cmpxchg accetps integer only, so we bitcast the operands (old_output and
-  // new_output) to integers of the same bit width, and bitcast the result
-  // back to the original element type.
-  llvm::Value* old_output =
-      ir_builder_.CreateLoad(old_output_location, "old_output");
-  llvm::Value* new_output =
-      ir_builder_.CreateLoad(new_output_location, "new_output");
+      computation, {binop_output_address, source_address},
+      binop_output_address));
+
+  llvm::Value* cas_new_output =
+      ir_builder_.CreateLoad(cas_new_output_address, "cas_new_output");
+
+  // Emit code to perform the atomicCAS operation
+  // (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
+  //                                       cas_new_output);
   llvm::Value* ret_value = ir_builder_.CreateAtomicCmpXchg(
-      ir_builder_.CreateBitCast(output_address,
-                                element_int_ir_type->getPointerTo()),
-      ir_builder_.CreateBitCast(old_output, element_int_ir_type),
-      ir_builder_.CreateBitCast(new_output, element_int_ir_type),
+      atomic_memory_address, cas_old_output, cas_new_output,
       llvm::AtomicOrdering::SequentiallyConsistent,
       llvm::AtomicOrdering::SequentiallyConsistent);
-  // cmpxchg returns a pair. The first element is the original value at
-  // output_address and the second element is whether the swap is successful.
+
+  // Extract the memory value returned from atomicCAS and store it as
+  // cas_old_output.
   ir_builder_.CreateStore(
-      ir_builder_.CreateBitCast(
-          ir_builder_.CreateExtractValue(ret_value, 0, "old_output"),
-          element_ir_type),
-      old_output_location);
+      ir_builder_.CreateExtractValue(ret_value, 0, "cas_old_output"),
+      cas_old_output_address);
+  // Extract the success bit returned from atomicCAS and generate a
+  // conditional branch on the success bit.
   ir_builder_.CreateCondBr(
       ir_builder_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
       loop_body_bb);
 
-  // Restore the insertion point to the exit basic block so that the caller of
+  // Set the insertion point to the exit basic block so that the caller of
   // this method can continue emitting code to the right place.
   SetToFirstInsertPoint(loop_exit_bb, &ir_builder_);
   return Status::OK();
 }
 
+Status IrEmitter::EmitAtomicOperationForNestedComputation(
+    const HloComputation& computation, llvm::Value* output_address,
+    llvm::Value* source_address) {
+  if (computation.num_parameters() != 2) {
+    // TODO(b/30258929): We only accept binary computations so far.
+    return Unimplemented(
+        "We only support atomic functions with exactly two parameters, but "
+        "computation %s has %lld.",
+        computation.name().c_str(), computation.num_parameters());
+  }
+
+  if (MaybeEmitDirectAtomicOperation(computation, output_address,
+                                     source_address)) {
+    return Status::OK();
+  }
+
+  return EmitAtomicOperationUsingCAS(computation, output_address,
+                                     source_address);
+}
+
 Status IrEmitter::HandleSelect(HloInstruction* select) {
   auto pred = select->operand(0);
   auto on_true = select->operand(1);
@@ -325,7 +423,8 @@ Status IrEmitter::HandleSelect(HloInstruction* select) {
   TF_RET_CHECK(pred->shape().element_type() == PRED);
 
   if (ShapeUtil::IsTuple(select->shape())) {
-    llvm_ir::EmitTupleSelect(GetIrArray(*select), GetIrArray(*pred),
+    llvm_ir::EmitTupleSelect(GetIrArray(*select, *select),
+                             GetIrArray(*pred, *select),
                              GetBasePointer(*on_true),
                              GetBasePointer(*on_false), &ir_builder_, module_);
     return Status::OK();
@@ -340,9 +439,9 @@ Status IrEmitter::HandleSelect(HloInstruction* select) {
 Status IrEmitter::HandleDot(HloInstruction* dot) {
   auto lhs_instruction = dot->operand(0);
   auto rhs_instruction = dot->operand(1);
-  const llvm_ir::IrArray& target_array = GetIrArray(*dot);
-  const llvm_ir::IrArray& lhs_array = GetIrArray(*lhs_instruction);
-  const llvm_ir::IrArray& rhs_array = GetIrArray(*rhs_instruction);
+  const llvm_ir::IrArray& target_array = GetIrArray(*dot, *dot);
+  const llvm_ir::IrArray& lhs_array = GetIrArray(*lhs_instruction, *dot);
+  const llvm_ir::IrArray& rhs_array = GetIrArray(*rhs_instruction, *dot);
 
   const Shape& lhs_shape = lhs_instruction->shape();
   const Shape& rhs_shape = rhs_instruction->shape();
@@ -562,7 +661,8 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
 
         // Apply the reduction function to the loaded value.
         llvm::Value* input_address =
-            GetIrArray(*arg).EmitArrayElementAddress(input_index, &ir_builder_);
+            GetIrArray(*arg, *reduce)
+                .EmitArrayElementAddress(input_index, &ir_builder_);
         TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
             *function, {accumulator_addr, input_address}, accumulator_addr));
 
@@ -578,7 +678,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
 
   std::vector<llvm_ir::IrArray> parameter_arrays;
   for (HloInstruction* operand : fusion->operands()) {
-    parameter_arrays.push_back(GetIrArray(*operand));
+    parameter_arrays.push_back(GetIrArray(*operand, *fusion));
   }
   GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_,
                                           &ir_builder_, GetNestedComputer());
@@ -613,7 +713,8 @@ Status IrEmitter::HandleRng(HloInstruction* random) {
   ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
   for (const HloInstruction* operand : random->operands()) {
     operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
-      return GetIrArray(*operand).EmitReadArrayElement(index, &ir_builder_);
+      return GetIrArray(*operand, *random)
+          .EmitReadArrayElement(index, &ir_builder_);
     };
   }
   // Emits a single-threaded loop because the loop body generated by the element
@@ -622,10 +723,41 @@ Status IrEmitter::HandleRng(HloInstruction* random) {
              GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_,
                                    GetNestedComputer())
                  .MakeElementGenerator(random, operand_to_generator),
-             GetIrArray(*random), &ir_builder_)
+             GetIrArray(*random, *random), &ir_builder_)
       .EmitLoop(IrName(random));
 }
 
+Status IrEmitter::HandleConditional(HloInstruction* conditional) {
+  auto pred = conditional->operand(0);
+  auto true_arg = conditional->operand(1);
+  auto false_arg = conditional->operand(2);
+
+  llvm::Value* conditional_result = GetBasePointer(*conditional);
+
+  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+      GetBasePointer(*pred),
+      llvm_ir::AsStringRef(IrName(conditional, "load_predicate_value")));
+  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+      pred_value,
+      llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
+      llvm_ir::AsStringRef(IrName(conditional, "boolean_predicate")));
+  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
+      pred_cond, IrName(conditional, "if_then_else"), &ir_builder_);
+
+  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+      *conditional->true_computation(), {GetBasePointer(*true_arg)},
+      conditional_result));
+
+  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+      *conditional->false_computation(), {GetBasePointer(*false_arg)},
+      conditional_result));
+
+  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+  return Status::OK();
+}
+
 llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
     const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
     tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 263992d92544166c0d08a6c60b43e78f10f06aed..08bbbe36c72872ba68104c8f328c2f602eb30fa8 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -84,7 +84,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleOutfeed(HloInstruction* outfeed) override;
   Status HandleSort(HloInstruction* sort) override;
   Status HandleSend(HloInstruction* send) override;
+  Status HandleSendDone(HloInstruction* send_done) override;
   Status HandleRecv(HloInstruction* recv) override;
+  Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleParameter(HloInstruction* parameter) override;
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleTuple(HloInstruction* tuple) override;
@@ -93,6 +95,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call) override;
   Status HandleRng(HloInstruction* random) override;
+  Status HandleConditional(HloInstruction* conditional) override;
 
   Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
 
@@ -103,10 +106,16 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   explicit IrEmitter(const HloModuleConfig& hlo_module_config,
                      IrEmitterContext* ir_emitter_context, bool is_nested);
 
-  // A convenient helper for calling HloToIrBindings::GetIrArray.
+  // Helper for calling HloToIrBindings::GetIrArray.
+  //
+  // Gets the IrArray which contains inst.  This array has metadata that makes
+  // it valid only within the IR that implements consumer.  If you are
+  // implementing an HLO and want to get its own output buffer, call
+  // GetIrArray(hlo, hlo).
   llvm_ir::IrArray GetIrArray(const HloInstruction& inst,
+                              const HloInstruction& consumer,
                               const ShapeIndex& shape_index = {}) {
-    return bindings_.GetIrArray(inst, shape_index);
+    return bindings_.GetIrArray(inst, consumer, shape_index);
   }
   // A convenient helper for calling HloToIrBindings::GetBasePointer.
   llvm::Value* GetBasePointer(const HloInstruction& inst) const {
@@ -177,9 +186,16 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // be simply implemented using an LLVM atomic instruction. If "computation" is
   // one of this kind, emits code to do that and returns true; otherwise,
   // returns false.
-  bool MaybeEmitSpecialAtomicOperation(const HloComputation& computation,
-                                       llvm::Value* output_address,
-                                       llvm::Value* source_address);
+  bool MaybeEmitDirectAtomicOperation(const HloComputation& computation,
+                                      llvm::Value* output_address,
+                                      llvm::Value* source_address);
+
+  // A helper method for EmitAtomicOperationForNestedComputation. It implements
+  // binary atomic operations using atomicCAS with special handling to support
+  // small data types.
+  Status EmitAtomicOperationUsingCAS(const HloComputation& computation,
+                                     llvm::Value* output_address,
+                                     llvm::Value* source_address);
 
   StatusOr<llvm::Value*> ComputeNestedElement(
       const HloComputation& computation,
@@ -219,6 +235,7 @@ class IrEmitterUnnested : public IrEmitter {
   // IrEmitterUnnested handles the following instructions differently from
   // IrEmitter.
   Status HandleCopy(HloInstruction* copy) override;
+  Status HandleConditional(HloInstruction* conditional) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleDot(HloInstruction* dot) override;
   Status HandleFusion(HloInstruction* fusion) override;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index 5da1a130d5654b86803396b07a6501c59a182c67..5225ff36ff3a8a1b049479c34aa301de8724f73e 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -115,7 +115,8 @@ Status IrEmitterNested::HandleParameter(HloInstruction* parameter) {
 Status IrEmitterNested::EmitTargetElementLoop(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator) {
-  return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo), &ir_builder_)
+  return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo),
+                              &ir_builder_)
       .EmitLoop();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 7b4662fc80c5518135c827489a3724e477b2bad1..8dbc90ee1fb5678f070bdc8999ffa8980197188f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -123,10 +123,12 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk,
   llvm::ConstantInt* threads_per_block_ir_value = llvm::ConstantInt::get(
       llvm::IntegerType::get(llvm_context, /*NumBits=*/32),
       launch_dims.threads_per_block());
+  // Our launch bounds are exact, so we can specify them as reqntidx rather than
+  // maxntidx.
   nvvm_annotations_node->addOperand(llvm::MDNode::get(
       llvm_context,
       {llvm::ConstantAsMetadata::get(ir_kernel),
-       llvm::MDString::get(llvm_context, "maxntidx"),
+       llvm::MDString::get(llvm_context, "reqntidx"),
        llvm::ConstantAsMetadata::get(threads_per_block_ir_value)}));
 }
 }  // namespace
@@ -246,6 +248,11 @@ Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) {
 }
 
 Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
+  if (dnums.lhs_batch_dimensions_size() > 0 ||
+      dnums.rhs_batch_dimensions_size() > 0) {
+    return Unimplemented("Dot with batch dimensions not implemented.");
+  }
   if (ImplementedAsGemm(*dot)) {
     thunk_sequence_->emplace_back(BuildGemmThunk(dot));
     return Status::OK();
@@ -254,6 +261,11 @@ Status IrEmitterUnnested::HandleDot(HloInstruction* dot) {
   return IrEmitter::HandleDot(dot);
 }
 
+Status IrEmitterUnnested::HandleConditional(HloInstruction* conditional) {
+  thunk_sequence_->push_back(BuildKernelThunk(conditional));
+  return IrEmitter::HandleConditional(conditional);
+}
+
 Status IrEmitterUnnested::HandleConvolution(HloInstruction* convolution) {
   if (ImplementedAsDnnConvolution(*convolution)) {
     thunk_sequence_->emplace_back(BuildConvolutionThunk(convolution));
@@ -282,7 +294,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
             MakeUnique<SequentialThunk>(std::move(thunks), fusion));
         std::vector<llvm_ir::IrArray> parameter_arrays;
         for (HloInstruction* operand : fusion->operands()) {
-          parameter_arrays.push_back(GetIrArray(*operand));
+          parameter_arrays.push_back(GetIrArray(*operand, *fusion));
         }
         GpuElementalIrEmitter elemental_emitter(
             hlo_module_config_, ir_emitter_context_->llvm_module(),
@@ -344,7 +356,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
     std::vector<llvm_ir::IrArray> operand_arrays;
     for (HloInstruction* operand : fusion->operands()) {
-      operand_arrays.push_back(GetIrArray(*operand));
+      operand_arrays.push_back(GetIrArray(*operand, *fusion));
     }
     GpuElementalIrEmitter elemental_emitter(hlo_module_config_,
                                             ir_emitter_context_->llvm_module(),
@@ -355,7 +367,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
 
     // Array to write into.  Because this is an in-place operation, this is the
     // same as operand 0's array.
-    llvm_ir::IrArray output_array = GetIrArray(*fusion);
+    llvm_ir::IrArray output_array = GetIrArray(*fusion, *fusion);
 
     LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
         update_shape, ir_emitter_context_->device_description());
@@ -693,9 +705,10 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
     constexpr int64 tile_size = 32;
     constexpr int64 num_rows = 8;
     int64 num_tiles = EmitTranspose021Tiled(
-        GetIrArray(*(copy->operand(0)))
+        GetIrArray(*copy->operand(0), *copy)
             .CastToShape(reduced_input_shape, &ir_builder_),
-        GetIrArray(*copy).CastToShape(reduced_output_shape, &ir_builder_),
+        GetIrArray(*copy, *copy)
+            .CastToShape(reduced_output_shape, &ir_builder_),
         tile_size, num_rows, &ir_builder_);
     UpdateLaunchDimensions(LaunchDimensions(num_tiles, num_rows * tile_size),
                            LastThunk(), ir_emitter_context_->llvm_module());
@@ -850,9 +863,11 @@ Status IrEmitterUnnested::EmitColumnReduction(
                                    &ir_builder_);
     const HloInstruction* output =
         reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
-    llvm::Value* output_address = GetIrArray(*output).EmitArrayElementAddress(
-        llvm_ir::IrArray::Index(x, output->shape(), &ir_builder_), &ir_builder_,
-        "output_element_address");
+    llvm::Value* output_address =
+        GetIrArray(*output, *output)
+            .EmitArrayElementAddress(
+                llvm_ir::IrArray::Index(x, output->shape(), &ir_builder_),
+                &ir_builder_, "output_element_address");
     return EmitAtomicOperationForNestedComputation(
         *reducer, output_address, partial_reduction_result_address);
   };
@@ -1081,16 +1096,25 @@ Status IrEmitterUnnested::EmitRowReduction(
     // from the warp.
     llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block,
                                    &ir_builder_);
+    int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
+    // bitcast cannot be applied to aggregate types (even packed ones), so we
+    // instead bitcast addresses of load/store to intN* of the same bit-width.
+    llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
+                                      ? ir_builder_.getIntNTy(bit_width)
+                                      : element_ir_type;
     for (int shuffle_distance = 16; shuffle_distance >= 1;
          shuffle_distance /= 2) {
       llvm::Value* partial_reduction_result = ir_builder_.CreateLoad(
-          partial_reduction_result_address, "partial_reduction_result");
+          ir_builder_.CreateBitCast(partial_reduction_result_address,
+                                    shuffle_ir_type->getPointerTo()),
+          "partial_reduction_result");
       llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca(
           element_ir_type, nullptr, "result_from_other_lane");
       ir_builder_.CreateStore(
           EmitShuffleDown(partial_reduction_result,
                           ir_builder_.getInt32(shuffle_distance), &ir_builder_),
-          result_from_other_lane);
+          ir_builder_.CreateBitCast(result_from_other_lane,
+                                    shuffle_ir_type->getPointerTo()));
       TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
           *reducer, {partial_reduction_result_address, result_from_other_lane},
           partial_reduction_result_address));
@@ -1107,9 +1131,11 @@ Status IrEmitterUnnested::EmitRowReduction(
         "lane_id_is_zero", &ir_builder_);
     llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
                                    &ir_builder_);
-    llvm::Value* output_address = GetIrArray(*output).EmitArrayElementAddress(
-        llvm_ir::IrArray::Index(y, output->shape(), &ir_builder_), &ir_builder_,
-        "output_element_address");
+    llvm::Value* output_address =
+        GetIrArray(*output, *output)
+            .EmitArrayElementAddress(
+                llvm_ir::IrArray::Index(y, output->shape(), &ir_builder_),
+                &ir_builder_, "output_element_address");
     return EmitAtomicOperationForNestedComputation(
         *reducer, output_address, partial_reduction_result_address);
   };
@@ -1249,11 +1275,12 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
         MakeUnique<SequentialThunk>(std::move(thunks), reduce));
     return EmitReductionToVector(
         reduce, input->shape(),
-        [this, input](const llvm_ir::IrArray::Index& index) {
-          return GetIrArray(*input).EmitReadArrayElement(index, &ir_builder_);
+        [&](const llvm_ir::IrArray::Index& index) {
+          return GetIrArray(*input, *reduce)
+              .EmitReadArrayElement(index, &ir_builder_);
         },
-        [this, init_value](const llvm_ir::IrArray::Index& index) {
-          return GetIrArray(*init_value)
+        [&](const llvm_ir::IrArray::Index& index) {
+          return GetIrArray(*init_value, *reduce)
               .EmitReadArrayElement(index, &ir_builder_);
         },
         dimensions_to_reduce, reducer);
@@ -1417,7 +1444,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
         ir_builder_.CreateStore(operand_index[i], selected_index_address_slot);
       }
     };
-    llvm_ir::IrArray operand_array(GetIrArray(*operand));
+    llvm_ir::IrArray operand_array = GetIrArray(*operand, *select_and_scatter);
     llvm::Value* operand_data =
         operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
     ir_builder_.CreateStore(operand_data, selected_value_address);
@@ -1470,9 +1497,10 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
           ir_builder_.CreateLoad(selected_index_address_slot));
     }
     llvm::Value* source_value_address =
-        GetIrArray(*source).EmitArrayElementAddress(source_index, &ir_builder_);
+        GetIrArray(*source, *select_and_scatter)
+            .EmitArrayElementAddress(source_index, &ir_builder_);
     llvm::Value* output_value_address =
-        GetIrArray(*select_and_scatter)
+        GetIrArray(*select_and_scatter, *select_and_scatter)
             .EmitArrayElementAddress(selected_index, &ir_builder_);
     return EmitAtomicOperationForNestedComputation(
         *select_and_scatter->scatter(), output_value_address,
@@ -1749,7 +1777,7 @@ Status IrEmitterUnnested::EmitInitializer(const HloInstruction* hlo,
   return EmitTargetElementLoopInThunk(
       *hlo,
       [=](const llvm_ir::IrArray::Index& index) {
-        return GetIrArray(*init_value)
+        return GetIrArray(*init_value, *hlo)
             .EmitReadArrayElement(index, &ir_builder_);
       },
       thunk);
@@ -1850,7 +1878,7 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
   if (!hlo.IsMultiOutputFusion()) {
-    return ParallelLoopEmitter(element_generator, GetIrArray(hlo),
+    return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
                                launch_dimensions, &ir_builder_)
         .EmitLoop(IrName(&hlo));
   }
@@ -1858,7 +1886,7 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   // For multiple outputs fusion, we need to emit each operand and the root.
   std::vector<llvm_ir::IrArray> output_arrays;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
-    output_arrays.push_back(GetIrArray(hlo, {i}));
+    output_arrays.push_back(GetIrArray(hlo, hlo, {i}));
   }
   TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, output_arrays,
                                          launch_dimensions, &ir_builder_)
@@ -1869,7 +1897,7 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
   }
   ir_builder_.SetInsertPoint(ir_builder_.GetInsertBlock()->getTerminator());
-  llvm_ir::EmitTuple(GetIrArray(hlo), tuple_operand_ptrs, &ir_builder_,
+  llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_,
                      module_);
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index 69399e36c4c4faa7c6ed5c79a3f094490f022001..96606993696354f36e143b3b994bbe6afb902df3 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -48,6 +48,12 @@ tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
   // StreamExecutor uses the latter.
   loader_spec_->AddCudaPtxInMemory(
       se::port::StringPiece(ptx.data(), ptx.size()), kernel_name_);
+
+  if (!executable.cubin().empty()) {
+    loader_spec_->AddCudaCubinInMemory(
+        reinterpret_cast<const char*>(executable.cubin().data()), kernel_name_);
+  }
+
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
index 81cca312982a3a5ee98b3914447f2d878354c3a5..059943d48cd34b0ac487b91c3f3079ee3f761229 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/CodeGen/CommandFlags.def"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
@@ -60,6 +60,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/tracing.h"
 
 namespace xla {
 namespace gpu {
@@ -76,7 +77,7 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path,
   // Since CUDA 9.0, all GPU versions are included in a single file
   const char* unified_libdevice_filename = "libdevice.10.bc";
   std::vector<string> unified_libdevice_files;
-  const tensorflow::Status status = 
+  const tensorflow::Status status =
     tensorflow::Env::Default()->GetMatchingPaths(
       tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename),
       &unified_libdevice_files);
@@ -342,6 +343,13 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
                                     std::pair<int, int> compute_capability,
                                     const HloModuleConfig& hlo_module_config,
                                     const string& libdevice_dir_path) {
+  // If the module has no functions or globals, there's nothing to compile. Just
+  // return an empty string.
+  if (module->empty() && module->global_empty()) {
+    VLOG(2) << "Module '" << llvm_ir::AsString(module->getName())
+            << "' is empty. Skipping compilation.";
+    return string();
+  }
   // Link the input module with libdevice, to pull in implementations of some
   // builtins.
   TF_RETURN_IF_ERROR(
@@ -481,9 +489,11 @@ StatusOr<string> CompileToPtx(llvm::Module* module,
 
   string ptx;
   {
-    ScopedLoggingTimer compilation_timer(
-        "Compile module " + llvm_ir::AsString(module->getName()),
-        /*vlog_level=*/2);
+    tensorflow::port::Tracing::TraceMe annotation(
+        "Compiling IR", llvm_ir::AsString(module->getName()),
+        /*is_expensive=*/true);
+    XLA_SCOPED_LOGGING_TIMER("Compile module " +
+                             llvm_ir::AsString(module->getName()));
     TF_ASSIGN_OR_RETURN(
         ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config,
                                 libdevice_dir_path));
diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
index 9274e16a455fc1a958cee5101b6a9ef7ce619347..c29fee0879c02021fdc23ac0e02ab398cf40f99e 100644
--- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc
@@ -49,8 +49,8 @@ HloInstruction* MaybePaddedAndSlicedInput(
     // applies positive padding and dilation.
     PaddingConfig padding_config =
         MakeNoPaddingConfig(input->shape().dimensions_size());
-    for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
-      int64 dim = conv_dnums.spatial_dimensions(i);
+    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
+      int64 dim = conv_dnums.input_spatial_dimensions(i);
       padding_config.mutable_dimensions(dim)->set_edge_padding_low(
           std::max<int64>(0LL, conv_window.dimensions(i).padding_low()));
       padding_config.mutable_dimensions(dim)->set_edge_padding_high(
@@ -81,8 +81,8 @@ HloInstruction* MaybePaddedAndSlicedInput(
     std::vector<int64> limit_indices(input->shape().dimensions().begin(),
                                      input->shape().dimensions().end());
     std::vector<int64> strides(input->shape().dimensions_size(), 1);
-    for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
-      int64 dim = conv_dnums.spatial_dimensions(i);
+    for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
+      int64 dim = conv_dnums.input_spatial_dimensions(i);
       // If dimension "dim" has negative padding, increase the start index or
       // decrement the limit index by the amount of negative padding.
       start_indices[dim] +=
@@ -117,8 +117,8 @@ HloInstruction* MaybePaddedKernel(const Window& conv_window,
   for (size_t i = 0; i < kernel->shape().dimensions_size(); ++i) {
     padding_config.add_dimensions();
   }
-  for (size_t i = 0; i < conv_dnums.spatial_dimensions().size(); ++i) {
-    int64 dim = conv_dnums.spatial_dimensions(i);
+  for (size_t i = 0; i < conv_dnums.kernel_spatial_dimensions().size(); ++i) {
+    int64 dim = conv_dnums.kernel_spatial_dimensions(i);
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         conv_window.dimensions(i).window_dilation() - 1);
   }
@@ -202,8 +202,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
   //   ABCD0 = Pad(ABCD, padding_high=1)
   //   BackwardFilterConv(ABCD0, xyz, padding_low=pading_high=1)
   // We choose the lesser of padding_low and padding_high as the new padding.
-  HloInstruction* transpose = backward_conv->fused_expression_root();
-  HloInstruction* forward_conv = transpose->mutable_operand(0);
+  HloInstruction* forward_conv = backward_conv->fused_expression_root();
   HloInstruction* input = backward_conv->mutable_operand(0);
   Window new_forward_conv_window = forward_conv->window();
   Window new_backward_conv_window = backward_conv->window();
@@ -229,7 +228,7 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
     // later. Therefore, the amount of new padding (low or high) is the minimum
     // of the amount of old padding low and old padding high.
     int64 new_conv_padding = std::min(padding_low, padding_high);
-    int64 dim = backward_conv_dnums.spatial_dimensions(i);
+    int64 dim = backward_conv_dnums.input_spatial_dimensions(i);
     input_padding_config.mutable_dimensions(dim)->set_edge_padding_low(
         padding_low - new_conv_padding);
     input_padding_config.mutable_dimensions(dim)->set_edge_padding_high(
@@ -269,19 +268,10 @@ bool PadInsertion::CanonicalizeBackwardFilterConvolution(
               .ConsumeValueOrDie(),
           padded_input, output, new_forward_conv_window, forward_conv_dnums));
 
-  HloInstruction* new_transpose =
-      computation->AddInstruction(HloInstruction::CreateTranspose(
-          ShapeInference::InferTransposeShape(new_forward_conv->shape(),
-                                              transpose->dimensions())
-              .ConsumeValueOrDie(),
-          new_forward_conv, transpose->dimensions()));
-
-  // Fuse the new forward convolution and the new transpose to the new backward
-  // convolution.
+  // Fuse the new forward convolution to the new backward convolution.
   HloInstruction* new_backward_conv =
       computation->CreateFusionInstructionForBackwardConvolution(
-          {new_transpose, new_forward_conv},
-          HloInstruction::FusionKind::kConvBackwardFilter,
+          {new_forward_conv}, HloInstruction::FusionKind::kConvBackwardFilter,
           new_backward_conv_window, backward_conv_dnums);
 
   VLOG(1) << "Canonicalizing backward filter conv";
@@ -369,12 +359,11 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution(
   std::vector<int64> limit_indices(
       new_backward_conv->shape().dimensions().begin(),
       new_backward_conv->shape().dimensions().end());
-  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(),
-                             1LL);
+  std::vector<int64> strides(new_backward_conv->shape().dimensions_size(), 1LL);
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
     int64 padding_low = backward_conv->window().dimensions(i).padding_low();
     int64 padding_high = backward_conv->window().dimensions(i).padding_high();
-    int64 dim = backward_conv_dnums.spatial_dimensions(i);
+    int64 dim = backward_conv_dnums.output_spatial_dimensions(i);
     if (padding_low > padding_high) {
       // If the amount of low padding (of the old backward convolution) is
       // larger, we internally pad the low end of the activations and slice
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index d0d2deee24848184278e3e51dcaa3bb673b5fadc..6cf280df05496716a0780d61ded92efd9982734c 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -44,37 +44,41 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc,
-    PartitionStrategy partition_strategy) {
-  int64 warp_size = device_desc.threads_per_warp();
-
+    const Shape& shape, const se::DeviceDescription& device_desc) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
-  // Calculate the number of threads per block.
-  // Initialize threads_per_block as the threads-per-block limit.
-  int64 threads_per_block = device_desc.threads_per_block_limit();
-  VLOG(2) << "Initial # of threads per block = " << threads_per_block;
-
-  if (partition_strategy == PartitionStrategy::kLatency) {
-    // Limit the thread count to allow maximum number of registers per thread.
-    // TODO(b/28560520): We don't have to assume the emitted kernel will use up
-    // all the registers. We could use ptxas to examine the actual number of
-    // register used, and set the thread count accordingly.
-    int64 threads_per_block_limit_due_to_registers =
-        device_desc.registers_per_core_limit() /
-        device_desc.registers_per_thread_limit();
-    CHECK_NE(0, threads_per_block_limit_due_to_registers);
-    if (threads_per_block_limit_due_to_registers < threads_per_block) {
-      threads_per_block =
-          // Make `threads_per_block` a multiple of warp size to use GPU
-          // efficiently.
-          warp_size *
-          std::max(1LL, threads_per_block_limit_due_to_registers / warp_size);
-      VLOG(2) << "Update # of threads per block due to register pressure = "
-              << threads_per_block;
+  // Since we don't do any inter-warp communication, we're free to choose any
+  // block size we want, subject to hardware constraints.  We choose the
+  // smallest block size that allows the GPU to reach full occupancy (assuming
+  // the kernel uses sufficiently few registers).  This gives us max performance
+  // when the kernel uses few registers, and lets us scale down gracefully as
+  // the kernel uses more registers.
+  //
+  // Specifically, we choose the number of threads per block such that
+  //
+  //   <num threads per block> * <max blocks per core> = <max threads per core>
+
+  auto threads_per_core = device_desc.threads_per_core_limit();
+  auto blocks_per_core = device_desc.blocks_per_core_limit();
+  int64 threads_per_block;
+  if (threads_per_core != 0 && blocks_per_core != 0) {
+    threads_per_block = device_desc.threads_per_core_limit() /
+                        device_desc.blocks_per_core_limit();
+  } else {
+    static std::atomic<int64> log_count{0};
+    if (log_count.fetch_add(1) < 8) {
+      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
+                      "without full information about its capabilities.  "
+                      "StreamExecutor's PopulateDeviceDescription should be "
+                      "updated for this device.";
+    }
+    threads_per_block = device_desc.threads_per_warp();
+    if (threads_per_block == 0) {
+      // Fall back to *something* if we can't even get num threads per warp.
+      threads_per_block = 32;
     }
   }
 
@@ -84,8 +88,6 @@ LaunchDimensions CalculateLaunchDimensions(
             << threads_per_block << ") because the latter is smaller.";
   }
 
-  // Calculate the block count. We copy the strategy used by Eigen:
-  // eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
   int64 block_count = CeilOfRatio(num_elements, threads_per_block);
   VLOG(2) << tensorflow::strings::Printf(
       "Initialized the block count to ceil(# of elements / threads per "
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 8f7fce884acc93fd39510ad0826b819a6d9731a7..0bf463a6ef95d5a32784838c08ad239752fd1acf 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -30,14 +30,6 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-enum class PartitionStrategy {
-  // Optimized for latency by allowing maximum number of registers per thread.
-  kLatency,
-  // Optimized for throughput. This may limit registers per thread and cause
-  // longer latency.
-  kThroughput
-};
-
 // Encapsulates the launch dimensions of a kernel, e.g., the block count and the
 // number of threads per block.
 class LaunchDimensions {
@@ -66,8 +58,7 @@ std::ostream& operator<<(std::ostream& out,
 
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc,
-    PartitionStrategy partition_strategy = PartitionStrategy::kLatency);
+    const perftools::gputools::DeviceDescription& device_desc);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 0ff27888ad72f8190400c22a9086d1965448662c..486ea7d7e1dad3f7f37d50565e176fbf567f5cc4 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -70,6 +70,19 @@ class Thunk {
     return tensorflow::Status::OK();
   }
 
+  // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream)
+  // before calling ExecuteOnStream(stream).  If it returns true, it's the
+  // user's responsibility to wait for all activity on the GPU to finish before
+  // calling ExecuteOnStream.
+  //
+  // This value is not required to be constant for a given Thunk.  For example,
+  // a Thunk that performs autotuning may return true for its first run and
+  // false thereafter.
+  virtual bool ShouldHaltAllActivityBeforeRunning(
+      perftools::gputools::Stream* /*stream*/) {
+    return false;
+  }
+
   // Execute the kernel for the thunk on the given stream. This method must be
   // called after Initialize and can be called multiple times over Thunk's
   // lifetime. Stream argument must be non-null.
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index 44188473d39088923c67216facab472a4e4ee09f..f16daa0b5481474e754c880ead1945297ca50168 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -17,9 +17,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
@@ -33,8 +36,6 @@ class WhileTransformerTest : public HloTestBase {
       : module_(CreateNewModule()),
         induction_variable_shape_(ShapeUtil::MakeShape(S32, {})),
         data_shape_(ShapeUtil::MakeShape(F32, {8})),
-        loop_state_shape_(ShapeUtil::MakeTupleShape(
-            {induction_variable_shape_, data_shape_})),
         condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {}
 
   std::unique_ptr<HloComputation> BuildConditionComputation(
@@ -42,8 +43,8 @@ class WhileTransformerTest : public HloTestBase {
     auto builder = HloComputation::Builder(TestName() + ".Condition");
     auto limit_const = builder.AddInstruction(
         HloInstruction::CreateConstant(Literal::CreateR0<int32>(limit)));
-    auto loop_state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto loop_state = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, GetLoopStateShape(tuple_index), "loop_state"));
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
             limit_const->shape(), loop_state, tuple_index));
@@ -58,8 +59,8 @@ class WhileTransformerTest : public HloTestBase {
       const int64 increment) {
     auto builder = HloComputation::Builder(TestName() + ".Body");
     // Create param instruction to access loop state.
-    auto loop_state = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, loop_state_shape_, "loop_state"));
+    auto loop_state = builder.AddInstruction(HloInstruction::CreateParameter(
+        0, GetLoopStateShape(ind_var_tuple_index), "loop_state"));
     // Update the induction variable GTE(ind_var_tuple_index).
     auto induction_variable =
         builder.AddInstruction(HloInstruction::CreateGetTupleElement(
@@ -73,7 +74,7 @@ class WhileTransformerTest : public HloTestBase {
         data_shape_, loop_state, data_tuple_index));
     // Use 'induction_variable' in computation with no path to output tuple.
     auto update = builder.AddInstruction(
-        HloInstruction::CreateBroadcast(data_shape_, induction_variable, {8}));
+        HloInstruction::CreateBroadcast(data_shape_, induction_variable, {}));
     auto add1 = builder.AddInstruction(HloInstruction::CreateBinary(
         data_shape_, HloOpcode::kAdd, data, update));
     // Create output Tuple.
@@ -98,8 +99,9 @@ class WhileTransformerTest : public HloTestBase {
                   HloInstruction::CreateTuple({induction_var_init, data_init}))
             : builder.AddInstruction(
                   HloInstruction::CreateTuple({data_init, induction_var_init}));
-    auto while_hlo = builder.AddInstruction(HloInstruction::CreateWhile(
-        loop_state_shape_, condition, body, loop_state_init));
+    auto while_hlo = builder.AddInstruction(
+        HloInstruction::CreateWhile(GetLoopStateShape(ind_var_tuple_index),
+                                    condition, body, loop_state_init));
     module_->AddEntryComputation(builder.Build());
     return while_hlo;
   }
@@ -115,18 +117,34 @@ class WhileTransformerTest : public HloTestBase {
   }
 
   void RunCopyInsertionPass() {
+    HloVerifier verifier([](const Shape& shape) {
+      return ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/sizeof(void*));
+    });
+    TF_ASSERT_OK(verifier.Run(module_.get()).status());
     CopyInsertion copy_insertion;
-    EXPECT_IS_OK(copy_insertion.Run(module_.get()).status());
+    TF_ASSERT_OK(copy_insertion.Run(module_.get()).status());
+  }
+
+  Shape GetLoopStateShape(const int64 ind_var_tuple_index) {
+    if (ind_var_tuple_index == 0) {
+      return ShapeUtil::MakeTupleShape(
+          {induction_variable_shape_, data_shape_});
+    } else {
+      return ShapeUtil::MakeTupleShape(
+          {data_shape_, induction_variable_shape_});
+    }
   }
 
   std::unique_ptr<HloModule> module_;
   Shape induction_variable_shape_;
   Shape data_shape_;
-  Shape loop_state_shape_;
   Shape condition_result_shape_;
 };
 
-TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement0) {
   // Build computation with induction variable at tuple element 0.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
@@ -137,13 +155,16 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement0) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  ASSERT_TRUE(result.ok());
+  TF_ASSERT_OK(result.status());
   // Check results.
   EXPECT_THAT(result.ConsumeValueOrDie(),
               Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
 }
 
-TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InductionVariableAtTupleElement1) {
   // Build computation with induction variable at tuple element 1.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(1, 10));
@@ -154,13 +175,16 @@ TEST_F(WhileTransformerTest, InductionVariableAtTupleElement1) {
   RunCopyInsertionPass();
   // Run WhileTransformer.
   auto result = gpu::CanTransformWhileToFor(while_hlo);
-  ASSERT_TRUE(result.ok());
+  TF_ASSERT_OK(result.status());
   // Check results.
   EXPECT_THAT(result.ConsumeValueOrDie(),
               Eq(std::tuple<int64, int64, int64>(0, 10, 1)));
 }
 
-TEST_F(WhileTransformerTest, InvalidLoopLimit) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InvalidLoopLimit) {
   // Build computation with invalid loop limit.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 5));
@@ -176,7 +200,10 @@ TEST_F(WhileTransformerTest, InvalidLoopLimit) {
               HasSubstr("Loop start must be less than loop limit."));
 }
 
-TEST_F(WhileTransformerTest, InvalidLoopIncrement) {
+// TODO(b/68830972): The while transformer is far too fragile. It patterns
+// matches the exact expressions of opcodes. Re-enable when transformation is
+// more general
+TEST_F(WhileTransformerTest, DISABLED_InvalidLoopIncrement) {
   // Build computation with invalid loop increment.
   auto condition =
       module_->AddEmbeddedComputation(BuildConditionComputation(0, 10));
diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc
index 049e8d80d80c835bca4a4d38592564ba82a3ecf9..05017008e2ddbe0b9e78d06275fdec5d08d94bfa 100644
--- a/tensorflow/compiler/xla/service/graphviz_example.cc
+++ b/tensorflow/compiler/xla/service/graphviz_example.cc
@@ -108,8 +108,11 @@ std::unique_ptr<HloModule> MakeBigGraph() {
       HloInstruction::CreateUnary(vshape, HloOpcode::kCopy, param_v0));
   auto clamp = builder.AddInstruction(HloInstruction::CreateTernary(
       vshape, HloOpcode::kClamp, copy, param_v1, param_v2));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(vshape, HloOpcode::kDot, clamp, param_v0));
+      HloInstruction::CreateDot(vshape, clamp, param_v0, dot_dnums));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({dot, param_s, clamp}));
   auto scalar = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index 17b926c8748e45b55f380e7595711b9e7a748f64..387b649a731ebcbfd8307807469f39f22d192b06 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -259,8 +259,11 @@ TEST_F(HeapSimulatorTest, MultiplyDot) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
 
   // The buffer for dot is the output, and it cannot be shared with the buffer
   // for mul, since dot isn't elementwise.
@@ -292,8 +295,11 @@ TEST_F(HeapSimulatorTest, MultiplyDotAdd) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32vec4_, HloOpcode::kAdd, dot, paramA));
 
@@ -327,10 +333,13 @@ TEST_F(HeapSimulatorTest, MultiplyDotDot) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot0 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+      HloInstruction::CreateDot(f32vec4_, dot0, paramY, dot_dnums));
 
   // The buffer for dot1 is the output.  No buffers can be shared.  The buffer
   // for mul is freed before the end, since it's no longer used after dot0
@@ -365,10 +374,13 @@ TEST_F(HeapSimulatorTest, MultiplyDotDotTuple) {
       HloInstruction::CreateParameter(2, f32scalar_, "paramY"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec4_, HloOpcode::kMultiply, paramA, paramX));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot0 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, mul, paramY));
+      HloInstruction::CreateDot(f32vec4_, mul, paramY, dot_dnums));
   auto dot1 = builder.AddInstruction(
-      HloInstruction::CreateBinary(f32vec4_, HloOpcode::kDot, dot0, paramY));
+      HloInstruction::CreateDot(f32vec4_, dot0, paramY, dot_dnums));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({dot0, dot1}));
 
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 79493c4112804f8454d200f3f83aa85d718f0d0a..5d0cfba1fc8ab255c228c671fee641e9302f5ec6 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -118,6 +118,9 @@ message HloInstructionProto {
 
   // Shape of outfeed request.
   xla.Shape outfeed_shape = 29;
+
+  // Describes the dimension numbers used for a dot operation
+  xla.DotDimensionNumbers dot_dimension_numbers = 30;
 }
 
 // Serialization of HloComputation.
@@ -250,7 +253,3 @@ message HloProto {
   HloOrderingProto hlo_ordering = 2;
   BufferAssignmentProto buffer_assignment = 3;
 }
-
-message HloProtos {
-  repeated HloProto hlo_protos = 1;
-}
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index 6f8099475146e6bbcfb61d2e5a91a7a6f9e63e58..6d2a3aa5b531650a658502531e050702ffbd3760 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -144,8 +144,10 @@ class BufferValueMap {
   // Move the given value into the given buffer.
   void MoveValueToBuffer(const HloValue& value, BufferNumber buffer_number) {
     BufferNumber old_buffer_number = value_to_buffer_number_.at(&value);
-    buffers_.at(old_buffer_number).erase(&value);
-    if (buffers_.at(old_buffer_number).empty()) {
+    tensorflow::gtl::FlatSet<const HloValue*>& old_value_set =
+        buffers_.at(old_buffer_number);
+    old_value_set.erase(&value);
+    if (old_value_set.empty()) {
       buffers_.erase(old_buffer_number);
     }
 
@@ -175,7 +177,7 @@ class BufferValueMap {
     // Value is init of a while (use is while).
     std::vector<BufferNumber> aliased_buffers;
     for (const HloUse& use : value.uses()) {
-      VLOG(1) << "use of value " << value.ToShortString() << ": " << use;
+      VLOG(2) << "use of value " << value.ToShortString() << ": " << use;
       if (use.instruction->opcode() == HloOpcode::kWhile) {
         // Determine the while value that this shares a buffer with.
         const HloValue& while_value =
@@ -411,7 +413,7 @@ string HloAliasAnalysis::ToString() const {
 /* static */
 StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
     HloModule* module) {
-  VLOG(1) << "HloAliasAnalysis::Run on module " << module->name();
+  VLOG(2) << "HloAliasAnalysis::Run on module " << module->name();
   XLA_VLOG_LINES(2, module->ToString());
 
   auto alias_analysis = WrapUnique(new HloAliasAnalysis(module));
@@ -444,7 +446,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
 
   TF_DCHECK_OK(alias_analysis->Verify());
 
-  XLA_VLOG_LINES(1, alias_analysis->ToString());
+  XLA_VLOG_LINES(2, alias_analysis->ToString());
   return std::move(alias_analysis);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 72c70b38238eedb67622f4816e1de264f3c9ed4b..014a851c96ed1d530cfd5fa4e854cf1df45fc4d0 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -176,10 +176,6 @@ bool HloComputation::IsRemovable(const HloInstruction* instruction) {
     return false;
   }
 
-  if (instruction->HasSideEffect()) {
-    return false;
-  }
-
   return true;
 }
 
@@ -207,7 +203,8 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
     worklist.pop();
 
     if (removed.count(item) != 0 || item->user_count() != 0 ||
-        item == root_instruction() || !IsRemovable(item)) {
+        item == root_instruction() || !IsRemovable(item) ||
+        item->HasSideEffect()) {
       continue;
     }
     for (int i = 0; i < item->operand_count(); ++i) {
@@ -367,7 +364,8 @@ std::list<HloComputation*> HloComputation::MakeEmbeddedComputationsList()
   return post_order;
 }
 
-string HloComputation::ToString(int nested_level) const {
+string HloComputation::ToString(int nested_level,
+                                bool include_large_constants) const {
   std::ostringstream s;
   for (int i = 0; i < nested_level; i++) {
     s << "    ";
@@ -379,12 +377,11 @@ string HloComputation::ToString(int nested_level) const {
       s << "    ";
     }
     s << "  " << (instruction == root_instruction_ ? "ROOT " : "")
-      << instruction->ToString() << "\n";
-    if (instruction->opcode() == HloOpcode::kFusion) {
-      s << instruction->fused_instructions_computation()->ToString(
-               nested_level + 1)
-        << "\n";
-    }
+      << instruction->ToString(
+             /*compact_operands=*/false,
+             /*include_metadata=*/true,
+             /*include_large_constants=*/include_large_constants)
+      << "\n";
   }
   for (int i = 0; i < nested_level; i++) {
     s << "    ";
@@ -407,16 +404,18 @@ HloComputationProto HloComputation::ToProto() const {
 /* static */ StatusOr<std::unique_ptr<HloComputation>>
 HloComputation::CreateFromProto(
     HloModule* module, const HloComputationProto& proto,
-    tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map,
+    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+    const std::function<void(std::unique_ptr<HloComputation>)>&
+        add_fused_computation,
     HloInstruction* fusion_instruction) {
   std::vector<std::unique_ptr<HloInstruction>> instructions;
   tensorflow::gtl::FlatMap<string, HloInstruction*> instruction_map;
   int64 parameter_count = 0;
   for (const HloInstructionProto& instruction_proto : proto.instructions()) {
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloInstruction> instruction,
-        HloInstruction::CreateFromProto(module, instruction_proto,
-                                        instruction_map, computation_map));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloInstruction> instruction,
+                        HloInstruction::CreateFromProto(
+                            module, instruction_proto, instruction_map,
+                            computation_map, add_fused_computation));
     if (instruction->opcode() == HloOpcode::kParameter) {
       parameter_count++;
     }
@@ -654,7 +653,9 @@ std::vector<HloInstruction*> HloComputation::CollectUnreachableRoots() const {
   return unreachable_roots;
 }
 
-Status HloComputation::Accept(DfsHloVisitor* visitor) const {
+template <typename HloInstructionPtr>
+Status HloComputation::Accept(
+    DfsHloVisitorBase<HloInstructionPtr>* visitor) const {
   // Visit unreachable roots. Beware that the visitor might delete the currently
   // visited root, which would invalidate iterators if the unreachable roots
   // weren't computed ahead of time.
@@ -667,6 +668,10 @@ Status HloComputation::Accept(DfsHloVisitor* visitor) const {
   return root_instruction()->Accept(visitor, /*call_finish_visit=*/true);
 }
 
+// Explicit instantiations.
+template Status HloComputation::Accept(DfsHloVisitor* visitor) const;
+template Status HloComputation::Accept(ConstDfsHloVisitor* visitor) const;
+
 Status HloComputation::AcceptWithOperandOrder(
     DfsHloVisitor* visitor,
     const HloInstruction::CompareFunction& operand_order) const {
@@ -683,8 +688,9 @@ Status HloComputation::AcceptWithOperandOrder(
                                                     /*call_finish_visit=*/true);
 }
 
+template <typename HloInstructionPtr>
 Status HloComputation::AcceptOrdered(
-    DfsHloVisitor* visitor,
+    DfsHloVisitorBase<HloInstructionPtr>* visitor,
     const std::vector<const HloInstruction*>& order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
@@ -713,49 +719,111 @@ Status HloComputation::AcceptOrdered(
   return Status::OK();
 }
 
+// Explicit instantiations.
+template Status HloComputation::AcceptOrdered(
+    DfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+template Status HloComputation::AcceptOrdered(
+    ConstDfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+
 Status HloComputation::Accept(
-    const FunctionVisitor::VisitorFunction& visitor_func) const {
+    const std::function<Status(HloInstruction*)>& visitor_func) {
   FunctionVisitor visitor(visitor_func);
   return this->Accept(&visitor);
 }
 
-std::unique_ptr<HloComputation> HloComputation::Clone(const string& suffix) {
+Status HloComputation::Accept(
+    const std::function<Status(const HloInstruction*)>& visitor_func) const {
+  ConstFunctionVisitor visitor(visitor_func);
+  return this->Accept(&visitor);
+}
+
+std::unique_ptr<HloComputation> HloComputation::Clone(const string& suffix,
+                                                      HloModule* module) {
+  return CloneWithReplacements(
+      /*replacements=*/std::unordered_map<const HloInstruction*,
+                                          std::unique_ptr<HloInstruction>>(),
+      module, suffix);
+}
+
+std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
+    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+        replacements,
+    HloModule* module, const string& suffix) {
+  // Look up instr in the replacements map, and return either the replacement,
+  // or instr, if the replacement isn't present.
+  //
+  // Note: This can return null, indicating that instr should not be present in
+  // the new computation.
+  auto replace = [&](HloInstruction* instr) {
+    auto it = replacements.find(instr);
+    if (it == replacements.end()) {
+      return instr;
+    }
+    return it->second.get();
+  };
+
   VLOG(1) << "Cloning " << name() << " --> " << suffix << "\n";
-  auto postorder = MakeInstructionPostOrder();
+  std::vector<HloInstruction*> postorder;
+  for (HloInstruction* instr : MakeInstructionPostOrder()) {
+    if (HloInstruction* replacement = replace(instr)) {
+      postorder.push_back(replacement);
+    }
+  }
+
   std::unordered_map<HloInstruction*, HloInstruction*> clone_map;
   std::vector<std::unique_ptr<HloInstruction>> instructions;
   std::unique_ptr<HloInstruction> new_instr = nullptr;
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
-      HloInstruction* new_operand = FindOrDie(clone_map, operand);
-      CHECK(new_operand != nullptr);
-      new_operands.push_back(new_operand);
-    }
-
-    new_instr = instr->CloneWithNewOperands(instr->shape(), new_operands);
-    new_instr->set_metadata(instr->metadata());
-    if (instr->has_sharding()) {
-      new_instr->set_sharding(instr->sharding());
+      auto replaced_operand = replace(operand);
+      // If replaced_operand is null, that means 'replacements' asked us not to
+      // include operand in the new computation.  But we can't do that, because
+      // operand is used by instr.
+      CHECK_NE(replaced_operand, nullptr)
+          << "replacements map tried to eliminate a used instruction "
+          << operand->ToString() << ", used by " << instr->ToString();
+      new_operands.push_back(FindOrDie(clone_map, replaced_operand));
     }
+    new_instr =
+        instr->CloneWithNewOperands(instr->shape(), new_operands, module);
     InsertOrDie(&clone_map, instr, new_instr.get());
     instructions.push_back(std::move(new_instr));
   }
-  Builder builder(name() + suffix);
+  Builder builder(name() + "." + suffix);
   for (auto& instr : instructions) {
     builder.AddInstruction(std::move(instr));
   }
   auto result = builder.Build(
-      /*root_instruction=*/FindOrDie(clone_map, root_instruction()));
+      /*root_instruction=*/FindOrDie(clone_map, replace(root_instruction())));
 
   // Clone control dependencies.
   for (auto instr : postorder) {
     HloInstruction* new_instr = FindOrDie(clone_map, instr);
     for (auto successor : instr->control_successors()) {
-      TF_CHECK_OK(
-          new_instr->AddControlDependencyTo(FindOrDie(clone_map, successor)));
+      auto replaced_successor = replace(successor);
+
+      // successor may not be in clone_map, because it might have been
+      // removed by the replacements map.
+      if (replaced_successor == nullptr) {
+        continue;
+      }
+
+      TF_CHECK_OK(new_instr->AddControlDependencyTo(
+          FindOrDie(clone_map, replaced_successor)));
+    }
+  }
+
+  // We cloned the elements of 'replacements', so they're all going to be
+  // destroyed.  HloInstructions need to be detached from their operands before
+  // they're destroyed, otherwise they stick around in the operands' users lists
+  // and cause use-after-frees.
+  for (auto& kv : replacements) {
+    if (std::unique_ptr<HloInstruction>& new_instr = kv.second) {
+      new_instr->DetachFromOperands();
     }
   }
+
   return result;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index f4edd175016ee30d31cc0cad6bdbd3eaa014c704..ccedda2a03c088b93883dd79a101c832497a937a 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -138,7 +138,8 @@ class HloComputation {
   void UniquifyName(NameUniquer* name_uniquer);
 
   // Return a string representation of the computation.
-  string ToString(int nested_level = 0) const;
+  string ToString(int nested_level = 0,
+                  bool include_large_constants = false) const;
 
   // Returns a serialized representation of this computation.
   HloComputationProto ToProto() const;
@@ -151,12 +152,16 @@ class HloComputation {
   //   computation_map: a map from computation name to HloComputation*. This map
   //     must contain all computations which the newly constructed computation
   //     calls.
-  //  fusion_instruction: if non-null then the newly created computation will be
-  //     constructed as a fused computation with this instruction as its fusion
-  //     parent.
+  //   add_fused_computation: A function to call to add a fused
+  //     computation. Used only when the instruction is a fusion instruction.
+  //   fusion_instruction: if non-null then the newly created computation will
+  //     be constructed as a fused computation with this instruction as its
+  //     fusion parent.
   static StatusOr<std::unique_ptr<HloComputation>> CreateFromProto(
       HloModule* module, const HloComputationProto& proto,
-      tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map,
+      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+      const std::function<void(std::unique_ptr<HloComputation>)>&
+          add_fused_computation,
       HloInstruction* fusion_instruction = nullptr);
 
   // Gets the instructions in this computation.
@@ -270,7 +275,8 @@ class HloComputation {
   // via the root. The root instruction of the computation is visited last, and
   // the visitor's FinishVisit method is called once upon completion (with the
   // root instruction as the argument).
-  Status Accept(DfsHloVisitor* visitor) const;
+  template <typename HloInstructionPtr>
+  Status Accept(DfsHloVisitorBase<HloInstructionPtr>* visitor) const;
 
   // Same as Accept() above, but the order of operand and control predecessor
   // visitation is determined by the given operand order; if compare(A, B) ==
@@ -281,20 +287,43 @@ class HloComputation {
 
   // Visit every node in the computation in the given order. 'order' must
   // be a topological sort of all instructions in the computation.
-  Status AcceptOrdered(DfsHloVisitor* visitor,
+  template <typename HloInstructionPtr>
+  Status AcceptOrdered(DfsHloVisitorBase<HloInstructionPtr>* visitor,
                        const std::vector<const HloInstruction*>& order) const;
 
   // Same as Accept() above, but the visitor is given as a function.
-  Status Accept(const FunctionVisitor::VisitorFunction& visitor_func) const;
+  Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
+  Status Accept(
+      const std::function<Status(const HloInstruction*)>& visitor_func) const;
 
   // Returns a deep copy of this computation including all instructions.
-  std::unique_ptr<HloComputation> Clone(const string& suffix = "clone");
-
-  // Returns true if the given instruction can be removed from the
-  // computation. Instructions such as parameters and send/receive instructions
-  // cannot be removed without violating invariants of the HLO computation or
-  // module with the exception of fusion computation.  A parameter instruction
-  // is removable for a fusion computation.
+  // If the module pointer is not nullptr, it will be the module where
+  // the cloned computations will be added to (in order to support deep
+  // cloning).
+  std::unique_ptr<HloComputation> Clone(const string& suffix = "clone",
+                                        HloModule* module = nullptr);
+
+  // Like Clone(), but if an instruction is present in replacement_map, we use
+  // the map's value to replace that instruction in the cloned computation.
+  //
+  // If replacements maps a key to nullptr, we remove that instruction from the
+  // new computation.
+  std::unique_ptr<HloComputation> CloneWithReplacements(
+      std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+          replacements,
+      HloModule* module = nullptr, const string& suffix = "clone");
+
+  // Returns true if the given instruction can be removed from the computation.
+  // Parameter instructions cannot be removed without violating invariants of
+  // the HLO computation with the exception of fusion computation. A parameter
+  // instruction is removable for a fusion computation.
+  //
+  // Note that IsRemovable() is a necessariy condition to remove an instruction
+  // rather than a sufficient condition. For example, instructions with
+  // side-effect (e.g., Send, Infeed) may be removed from a computation, but the
+  // transformation must guarantee the invariants relevant to the instructions
+  // still hold (e.g., Send and Recv must be removed together to make each
+  // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
   // Returns true if this computation has a side effect. A computation has a
@@ -307,6 +336,9 @@ class HloComputation {
   // Returns the owning fusion instruction, or nullptr if this is not a fusion
   // computation.
   HloInstruction* FusionInstruction() const { return fusion_instruction_; }
+  void SetFusionInstruction(HloInstruction* fusion_instruction) {
+    fusion_instruction_ = fusion_instruction;
+  }
 
  private:
   explicit HloComputation(
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index ab018c4cf2da770eabe74d7b5a670a19937b1b9a..b933695b823871c6c0174da6d6f99e618219442a 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -22,13 +22,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace xla {
 
 constexpr char HloCostAnalysis::kFlopsKey[];
 constexpr char HloCostAnalysis::kTranscendentalsKey[];
 constexpr char HloCostAnalysis::kBytesAccessedKey[];
-constexpr char HloCostAnalysis::kSecondsKey[];
+constexpr char HloCostAnalysis::kOptimalSecondsKey[];
 
 HloCostAnalysis::HloCostAnalysis(const ShapeSizeFunction& shape_size)
     : HloCostAnalysis(shape_size, {}) {}
@@ -37,7 +38,7 @@ HloCostAnalysis::HloCostAnalysis(const ShapeSizeFunction& shape_size,
                                  const Properties& per_second_rates)
     : shape_size_(shape_size), per_second_rates_(per_second_rates) {}
 
-Status HloCostAnalysis::Preprocess(HloInstruction* hlo) {
+Status HloCostAnalysis::Preprocess(const HloInstruction* hlo) {
   // Set current instruction cost values to reasonable default values. Each
   // handler can overwrite these values. In Postprocess, these values are
   // accumulated and written to the per-instruction maps.
@@ -56,20 +57,20 @@ Status HloCostAnalysis::Preprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::Postprocess(HloInstruction* hlo) {
+Status HloCostAnalysis::Postprocess(const HloInstruction* hlo) {
   if (current_should_compute_bottleneck_time_) {
     // Compute the time as the time of the bottleneck, i.e. the slowest property
     // given the per-second rate of each property.
-    float max_seconds = 0.0f;
+    float optimal_seconds = 0.0f;
     for (const auto& property : current_properties_) {
-      if (property.first != kSecondsKey) {
-        max_seconds = std::max(
-            max_seconds,
+      if (property.first != kOptimalSecondsKey) {
+        optimal_seconds = std::max(
+            optimal_seconds,
             property.second /
                 GetProperty(property.first, per_second_rates_, INFINITY));
       }
     }
-    current_properties_[kSecondsKey] = max_seconds;
+    current_properties_[kOptimalSecondsKey] = optimal_seconds;
   }
 
   TF_RET_CHECK(hlo_properties_.emplace(hlo, current_properties_).second);
@@ -80,7 +81,8 @@ Status HloCostAnalysis::Postprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) {
+Status HloCostAnalysis::HandleElementwiseOp(
+    const HloInstruction* hlo_instruction) {
   const auto& shape = hlo_instruction->shape();
   // For element-wise operations, the number of computations is the same as the
   // number of elements in the output shape.
@@ -118,58 +120,64 @@ Status HloCostAnalysis::HandleElementwiseOp(HloInstruction* hlo_instruction) {
   }
 }
 
-Status HloCostAnalysis::HandleElementwiseUnary(HloInstruction* hlo) {
+Status HloCostAnalysis::HandleElementwiseUnary(const HloInstruction* hlo) {
   return HandleElementwiseOp(hlo);
 }
 
-Status HloCostAnalysis::HandleElementwiseBinary(HloInstruction* hlo) {
+Status HloCostAnalysis::HandleElementwiseBinary(const HloInstruction* hlo) {
   return HandleElementwiseOp(hlo);
 }
 
-Status HloCostAnalysis::HandleCompare(HloInstruction* compare) {
+Status HloCostAnalysis::HandleCompare(const HloInstruction* compare) {
   return HandleElementwiseOp(compare);
 }
 
-Status HloCostAnalysis::HandleClamp(HloInstruction* clamp) {
+Status HloCostAnalysis::HandleClamp(const HloInstruction* clamp) {
   return HandleElementwiseOp(clamp);
 }
 
-Status HloCostAnalysis::HandleReducePrecision(HloInstruction* hlo) {
+Status HloCostAnalysis::HandleReducePrecision(const HloInstruction* hlo) {
   return HandleElementwiseOp(hlo);
 }
 
-Status HloCostAnalysis::HandleParameter(HloInstruction*) {
+Status HloCostAnalysis::HandleParameter(const HloInstruction*) {
   current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleConstant(HloInstruction*) {
+Status HloCostAnalysis::HandleConstant(const HloInstruction*) {
   current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleGetTupleElement(HloInstruction*) {
+Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) {
   // GetTupleElement forwards a pointer and does not touch each element in the
   // output.
   current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleSelect(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleSelect(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleReverse(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleReverse(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleSlice(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleSlice(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleDynamicSlice(HloInstruction*) {
+Status HloCostAnalysis::HandleDynamicSlice(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleDynamicUpdateSlice(HloInstruction*) {
+Status HloCostAnalysis::HandleDynamicUpdateSlice(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleTuple(HloInstruction* tuple) {
+Status HloCostAnalysis::HandleTuple(const HloInstruction* tuple) {
   // The tuple instruction only gathers pointers from inputs (it doesn't iterate
   // through them). The memory touched is then only the size of the output
   // index table of the tuple.
@@ -178,23 +186,26 @@ Status HloCostAnalysis::HandleTuple(HloInstruction* tuple) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleConcatenate(HloInstruction*) {
+Status HloCostAnalysis::HandleConcatenate(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleConvert(HloInstruction* convert) {
+Status HloCostAnalysis::HandleConvert(const HloInstruction* convert) {
   return HandleElementwiseOp(convert);
 }
 
-Status HloCostAnalysis::HandleCopy(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleCopy(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleDot(HloInstruction* dot) {
+Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   const Shape& lhs_shape = dot->operand(0)->shape();
   const Shape& rhs_shape = dot->operand(1)->shape();
+  const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
   // Count of elements along the reduction dimension (last dimension for the
   // rhs).
-  int64 reduction_width = lhs_shape.dimensions(ShapeUtil::Rank(lhs_shape) - 1);
-
+  int64 reduction_width =
+      lhs_shape.dimensions(dnums.lhs_contracting_dimensions(0));
   // First divide by reduction width before multiplying by rhs elements to avoid
   // overflow.
   int64 fma_count;
@@ -210,11 +221,15 @@ Status HloCostAnalysis::HandleDot(HloInstruction* dot) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleInfeed(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleInfeed(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleOutfeed(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleOutfeed(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleMap(HloInstruction* map) {
+Status HloCostAnalysis::HandleMap(const HloInstruction* map) {
   // Compute properties of the mapped function.
   TF_ASSIGN_OR_RETURN(const Properties sub_properties,
                       ProcessSubcomputation(map->to_apply()));
@@ -229,7 +244,7 @@ Status HloCostAnalysis::HandleMap(HloInstruction* map) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleReduce(HloInstruction* reduce) {
+Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) {
   auto arg = reduce->operand(0);
   HloComputation* function = reduce->to_apply();
   // Compute the cost of the user function.
@@ -247,7 +262,8 @@ Status HloCostAnalysis::HandleReduce(HloInstruction* reduce) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleReduceWindow(HloInstruction* reduce_window) {
+Status HloCostAnalysis::HandleReduceWindow(
+    const HloInstruction* reduce_window) {
   const Window& window = reduce_window->window();
   auto function = reduce_window->to_apply();
   // Compute the properties of the reduction function.
@@ -272,7 +288,8 @@ Status HloCostAnalysis::HandleReduceWindow(HloInstruction* reduce_window) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleSelectAndScatter(HloInstruction* instruction) {
+Status HloCostAnalysis::HandleSelectAndScatter(
+    const HloInstruction* instruction) {
   // Compute the properties of the select and scatter function.
   // Compute the properties of the reduction function.
   TF_ASSIGN_OR_RETURN(const Properties select_properties,
@@ -304,44 +321,60 @@ Status HloCostAnalysis::HandleSelectAndScatter(HloInstruction* instruction) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleBitcast(HloInstruction*) {
+Status HloCostAnalysis::HandleBitcast(const HloInstruction*) {
   // A bitcast does no computation and touches no memory.
   current_properties_[kBytesAccessedKey] = 0;
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleBroadcast(HloInstruction*) {
+Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) {
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandlePad(const HloInstruction*) {
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleSend(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandlePad(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleSendDone(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleSend(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleRecv(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleRecv(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleRecvDone(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleReshape(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleReshape(const HloInstruction*) {
+  return Status::OK();
+}
 
-Status HloCostAnalysis::HandleBatchNormTraining(HloInstruction*) {
+Status HloCostAnalysis::HandleBatchNormTraining(const HloInstruction*) {
   // TODO(b/62294698): Implement cost analysis for batch-norm-training.
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleBatchNormInference(HloInstruction*) {
+Status HloCostAnalysis::HandleBatchNormInference(const HloInstruction*) {
   // TODO(b/62294698): Implement cost analysis for batch-norm-inference.
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleBatchNormGrad(HloInstruction*) {
+Status HloCostAnalysis::HandleBatchNormGrad(const HloInstruction*) {
   // TODO(b/62294698): Implement cost analysis for batch-norm-grad.
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleTranspose(HloInstruction*) {
+Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleConvolution(HloInstruction* convolution) {
+Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
   auto rhs_instruction = convolution->operand(1);
   const auto& dnums = convolution->convolution_dimension_numbers();
   const int64 output_features =
@@ -359,17 +392,24 @@ Status HloCostAnalysis::HandleConvolution(HloInstruction* convolution) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleCrossReplicaSum(HloInstruction* crs) {
+Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) {
   // We assume 2 replicas, so that each output element is the sum of two input
   // elements.
   //
   // TODO(b/33004697): Compute correct cost here, taking the actual number of
   // replicas into account.
-  current_properties_[kFlopsKey] = ShapeUtil::ElementsIn(crs->shape());
+  double flops = 0.0;
+  ShapeUtil::ForEachSubshape(
+      crs->shape(), [&, this](const Shape& subshape, const ShapeIndex&) {
+        if (ShapeUtil::IsArray(subshape)) {
+          flops += ShapeUtil::ElementsIn(subshape);
+        }
+      });
+  current_properties_[kFlopsKey] = flops;
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleRng(HloInstruction* random) {
+Status HloCostAnalysis::HandleRng(const HloInstruction* random) {
   // TODO(b/26346211): Implement better estimates for the RNG cost, since the
   // cost changes with the implementation and the distribution. For now, assume
   // the cost of each RNG is same as a transcendental operation.
@@ -378,7 +418,7 @@ Status HloCostAnalysis::HandleRng(HloInstruction* random) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) {
+Status HloCostAnalysis::HandleFusion(const HloInstruction* fusion) {
   // Compute the properties of the fused expression and attribute them to the
   // fusion node. Use a dummy shape_size to avoid any errors from trying to
   // calculate the size of a shape that does not have a layout, since nodes
@@ -406,18 +446,18 @@ Status HloCostAnalysis::HandleFusion(HloInstruction* fusion) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleCall(HloInstruction* call) {
+Status HloCostAnalysis::HandleCall(const HloInstruction* call) {
   TF_ASSIGN_OR_RETURN(current_properties_,
                       ProcessSubcomputation(call->to_apply()));
   current_should_compute_bottleneck_time_ = false;
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleCustomCall(HloInstruction*) {
+Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) {
   return Unimplemented("Custom-call is not implemented for HLO cost analysis.");
 }
 
-Status HloCostAnalysis::HandleSort(HloInstruction* sort) {
+Status HloCostAnalysis::HandleSort(const HloInstruction* sort) {
   // This assumes a comparison based N*log(N) algorithm. As for all ops, the
   // actual properties of the op depend on the backend implementation.
   int64 elements = ShapeUtil::ElementsIn(sort->operand(0)->shape());
@@ -425,7 +465,7 @@ Status HloCostAnalysis::HandleSort(HloInstruction* sort) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while) {
+Status HloCostAnalysis::HandleWhile(const HloInstruction* xla_while) {
   // Since the number of iterations of the while node will not always be
   // something that we can statically analyze, we cannot precisely compute the
   // cost of a while node. For now compute the cost of a single iteration.
@@ -449,7 +489,28 @@ Status HloCostAnalysis::HandleWhile(HloInstruction* xla_while) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::FinishVisit(HloInstruction*) { return Status::OK(); }
+Status HloCostAnalysis::HandleConditional(const HloInstruction* conditional) {
+  // Compute the cost of the true and false computations and take the maximum
+  // from those for each property.
+  TF_ASSIGN_OR_RETURN(const Properties true_computation_properties,
+                      ProcessSubcomputation(conditional->true_computation()));
+  TF_ASSIGN_OR_RETURN(const Properties false_computation_properties,
+                      ProcessSubcomputation(conditional->false_computation()));
+  current_properties_ = true_computation_properties;
+  for (const auto& property : false_computation_properties) {
+    if (!tensorflow::gtl::InsertIfNotPresent(&current_properties_, property)) {
+      current_properties_[property.first] =
+          std::max(current_properties_[property.first], property.second);
+    }
+  }
+  current_should_compute_bottleneck_time_ = false;
+
+  return Status::OK();
+}
+
+Status HloCostAnalysis::FinishVisit(const HloInstruction*) {
+  return Status::OK();
+}
 
 float HloCostAnalysis::flop_count() const {
   return GetProperty(kFlopsKey, properties_sum_);
@@ -463,8 +524,8 @@ float HloCostAnalysis::bytes_accessed() const {
   return GetProperty(kBytesAccessedKey, properties_sum_);
 }
 
-float HloCostAnalysis::seconds() const {
-  return GetProperty(kSecondsKey, properties_sum_);
+float HloCostAnalysis::optimal_seconds() const {
+  return GetProperty(kOptimalSecondsKey, properties_sum_);
 }
 
 int64 HloCostAnalysis::flop_count(const HloInstruction& hlo) const {
@@ -479,8 +540,8 @@ int64 HloCostAnalysis::bytes_accessed(const HloInstruction& hlo) const {
   return GetPropertyForHlo(hlo, kBytesAccessedKey, hlo_properties_);
 }
 
-float HloCostAnalysis::seconds(const HloInstruction& hlo) const {
-  return GetPropertyForHlo(hlo, kSecondsKey, hlo_properties_);
+float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const {
+  return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_);
 }
 
 StatusOr<HloCostAnalysis::Properties> HloCostAnalysis::ProcessSubcomputation(
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 93b1b3eb20cf88292d38549016c9a0b662e155ee..fade19522cf0c30eab037aa355de1f9203f80014 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -34,7 +34,7 @@ namespace xla {
 // the computation cost of the instruction, and the values are accumulated
 // during the traversal for the entire graph. We treat normal floating point
 // operations separately from transcendental operations.
-class HloCostAnalysis : public DfsHloVisitor {
+class HloCostAnalysis : public ConstDfsHloVisitor {
  public:
   // Each HLO is associated to a vector of properties with the indices given
   // below. Sub-classes can add further properties.
@@ -42,61 +42,66 @@ class HloCostAnalysis : public DfsHloVisitor {
   static constexpr char kFlopsKey[] = "flops";
   static constexpr char kTranscendentalsKey[] = "transcendentals";
   static constexpr char kBytesAccessedKey[] = "bytes accessed";
-  static constexpr char kSecondsKey[] = "seconds";
+  static constexpr char kOptimalSecondsKey[] = "optimal_seconds";
 
   // shape_size is a function which returns the size in bytes of the top-level
   // buffer of a shape.
   using ShapeSizeFunction = std::function<int64(const Shape&)>;
   explicit HloCostAnalysis(const ShapeSizeFunction& shape_size);
 
-  Status HandleElementwiseUnary(HloInstruction* hlo) override;
-  Status HandleElementwiseBinary(HloInstruction* hlo) override;
-  Status HandleConstant(HloInstruction* constant) override;
-  Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
-  Status HandleSelect(HloInstruction* select) override;
-  Status HandleCompare(HloInstruction* compare) override;
-  Status HandleClamp(HloInstruction* clamp) override;
-  Status HandleReducePrecision(HloInstruction* hlo) override;
-  Status HandleConcatenate(HloInstruction* concatenate) override;
-  Status HandleSend(HloInstruction* send) override;
-  Status HandleRecv(HloInstruction* recv) override;
-  Status HandleConvert(HloInstruction* convert) override;
-  Status HandleCopy(HloInstruction* copy) override;
-  Status HandleDot(HloInstruction* dot) override;
-  Status HandleConvolution(HloInstruction* convolution) override;
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
-  Status HandleInfeed(HloInstruction* infeed) override;
-  Status HandleOutfeed(HloInstruction* outfeed) override;
-  Status HandleRng(HloInstruction* random) override;
-  Status HandleReverse(HloInstruction* reverse) override;
-  Status HandleSort(HloInstruction* sort) override;
-  Status HandleParameter(HloInstruction* parameter) override;
-  Status HandleReduce(HloInstruction* reduce) override;
-  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override;
+  Status HandleElementwiseUnary(const HloInstruction* hlo) override;
+  Status HandleElementwiseBinary(const HloInstruction* hlo) override;
+  Status HandleConstant(const HloInstruction* constant) override;
+  Status HandleGetTupleElement(
+      const HloInstruction* get_tuple_element) override;
+  Status HandleSelect(const HloInstruction* select) override;
+  Status HandleCompare(const HloInstruction* compare) override;
+  Status HandleClamp(const HloInstruction* clamp) override;
+  Status HandleReducePrecision(const HloInstruction* hlo) override;
+  Status HandleConcatenate(const HloInstruction* concatenate) override;
+  Status HandleSend(const HloInstruction* send) override;
+  Status HandleSendDone(const HloInstruction* send_done) override;
+  Status HandleRecv(const HloInstruction* recv) override;
+  Status HandleRecvDone(const HloInstruction* recv_done) override;
+  Status HandleConvert(const HloInstruction* convert) override;
+  Status HandleCopy(const HloInstruction* copy) override;
+  Status HandleDot(const HloInstruction* dot) override;
+  Status HandleConvolution(const HloInstruction* convolution) override;
+  Status HandleCrossReplicaSum(const HloInstruction* crs) override;
+  Status HandleInfeed(const HloInstruction* infeed) override;
+  Status HandleOutfeed(const HloInstruction* outfeed) override;
+  Status HandleRng(const HloInstruction* random) override;
+  Status HandleReverse(const HloInstruction* reverse) override;
+  Status HandleSort(const HloInstruction* sort) override;
+  Status HandleParameter(const HloInstruction* parameter) override;
+  Status HandleReduce(const HloInstruction* reduce) override;
+  Status HandleBatchNormTraining(
+      const HloInstruction* batch_norm_training) override;
   Status HandleBatchNormInference(
-      HloInstruction* batch_norm_inference) override;
-  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
-  Status HandleFusion(HloInstruction* fusion) override;
-  Status HandleCall(HloInstruction* call) override;
-  Status HandleCustomCall(HloInstruction* custom_call) override;
-  Status HandleSlice(HloInstruction* slice) override;
-  Status HandleDynamicSlice(HloInstruction* dynamic_slice) override;
+      const HloInstruction* batch_norm_inference) override;
+  Status HandleBatchNormGrad(const HloInstruction* batch_norm_grad) override;
+  Status HandleFusion(const HloInstruction* fusion) override;
+  Status HandleCall(const HloInstruction* call) override;
+  Status HandleCustomCall(const HloInstruction* custom_call) override;
+  Status HandleSlice(const HloInstruction* slice) override;
+  Status HandleDynamicSlice(const HloInstruction* dynamic_slice) override;
   Status HandleDynamicUpdateSlice(
-      HloInstruction* dynamic_update_slice) override;
-  Status HandleTuple(HloInstruction* tuple) override;
-  Status HandleMap(HloInstruction* map) override;
-  Status HandleReduceWindow(HloInstruction* reduce_window) override;
-  Status HandleSelectAndScatter(HloInstruction* instruction) override;
-  Status HandleBitcast(HloInstruction* bitcast) override;
-  Status HandleBroadcast(HloInstruction* broadcast) override;
-  Status HandlePad(HloInstruction* pad) override;
-  Status HandleReshape(HloInstruction* reshape) override;
-  Status HandleTranspose(HloInstruction* transpose) override;
-  Status HandleWhile(HloInstruction* xla_while) override;
-  Status FinishVisit(HloInstruction* root) override;
-
-  Status Preprocess(HloInstruction* hlo) override;
-  Status Postprocess(HloInstruction* hlo) override;
+      const HloInstruction* dynamic_update_slice) override;
+  Status HandleTuple(const HloInstruction* tuple) override;
+  Status HandleMap(const HloInstruction* map) override;
+  Status HandleReduceWindow(const HloInstruction* reduce_window) override;
+  Status HandleSelectAndScatter(const HloInstruction* instruction) override;
+  Status HandleBitcast(const HloInstruction* bitcast) override;
+  Status HandleBroadcast(const HloInstruction* broadcast) override;
+  Status HandlePad(const HloInstruction* pad) override;
+  Status HandleReshape(const HloInstruction* reshape) override;
+  Status HandleTranspose(const HloInstruction* transpose) override;
+  Status HandleWhile(const HloInstruction* xla_while) override;
+  Status HandleConditional(const HloInstruction* conditional) override;
+  Status FinishVisit(const HloInstruction* root) override;
+
+  Status Preprocess(const HloInstruction* hlo) override;
+  Status Postprocess(const HloInstruction* hlo) override;
 
   // Set the rates used to calculate the time taken by the computation. These
   // need to be set before visiting starts.
@@ -114,14 +119,14 @@ class HloCostAnalysis : public DfsHloVisitor {
   float flop_count() const;
   float transcendental_count() const;
   float bytes_accessed() const;
-  float seconds() const;
+  float optimal_seconds() const;
 
   // Returns the respective cost computed for a particular HLO instruction, or 0
   // if the HLO was not found to have a cost in the analysis.
   int64 flop_count(const HloInstruction& hlo) const;
   int64 transcendental_count(const HloInstruction& hlo) const;
   int64 bytes_accessed(const HloInstruction& hlo) const;
-  float seconds(const HloInstruction& hlo) const;
+  float optimal_seconds(const HloInstruction& hlo) const;
 
   const Properties& properties() const { return properties_sum_; }
   const float property(const string& key) const {
@@ -145,7 +150,7 @@ class HloCostAnalysis : public DfsHloVisitor {
       const ShapeSizeFunction* shape_size = nullptr);
 
   // Utility function to handle all element-wise operations.
-  Status HandleElementwiseOp(HloInstruction* hlo_instruction);
+  Status HandleElementwiseOp(const HloInstruction* hlo_instruction);
 
   // Returns the default value if the key is not present in the
   // properties. Otherwise, returns the value that the key maps to from the
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 0eaa21ef254e3461baaaca57503ab24ce35ac929..3b289c240a45e8f3df8156ed89e879da2132d01a 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -389,7 +389,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
     static_assert(bytes_accessed == 64, "");
     EXPECT_EQ(fusion_analysis.bytes_accessed(), bytes_accessed);
 
-    EXPECT_EQ(fusion_analysis.seconds(), 1 << i);
+    EXPECT_EQ(fusion_analysis.optimal_seconds(), 1 << i);
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 7c4626e78a3e84c9723a9f8e39d56614c4fa25ce..3601a790c4428ee39c264b217a4b9a991ad8456c 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -79,12 +79,12 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) {
   // Test that two identical constants with different layouts are commoned if
   // the pass is not layout sensitive.
   auto builder = HloComputation::Builder(TestName());
-  auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   /*minor_to_major=*/{0, 1})));
-  auto constant2 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   /*minor_to_major=*/{1, 0})));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
@@ -111,12 +111,12 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) {
   // Test that two identical constants with different layouts are *not* commoned
   // if the pass is layout sensitive.
   auto builder = HloComputation::Builder(TestName());
-  auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   /*minor_to_major=*/{0, 1})));
-  auto constant2 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   /*minor_to_major=*/{1, 0})));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 92261bce6270e3c37165c10ed804d036d2abb984..2a335843f507e2071807245d4dd256e1ec6f08c8 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -75,11 +75,43 @@ HloValue* HloDataflowAnalysis::NewHloValue(HloInstruction* instruction,
       std::forward_as_tuple(value_id, instruction, index, is_phi));
   CHECK(emplaced.second);
 
+  VLOG(4) << "NewHloValue = " << emplaced.first->second.ToShortString();
+
   return &emplaced.first->second;
 }
 
-void HloDataflowAnalysis::DeleteHloValue(HloValue::Id value_id) {
-  values_.erase(value_id);
+void HloDataflowAnalysis::MarkValueForDeletion(HloValue::Id value_id) {
+  HloValue& value = values_.at(value_id);
+  VLOG(4) << "MarkValueForDeletion(" << value.ToShortString() << ")";
+
+  value_ids_to_delete_.push_back(value_id);
+}
+
+void HloDataflowAnalysis::DeleteMarkedValues() {
+#ifndef NDEBUG
+  // Verify that no marked-for-deletion values are in any of the value sets.
+  tensorflow::gtl::FlatSet<HloValue::Id> id_set(value_ids_to_delete_.begin(),
+                                                value_ids_to_delete_.end());
+  for (const auto& pair : value_sets_) {
+    const HloInstruction* instruction = pair.first;
+    const InstructionValueSet& instruction_value_set = pair.second;
+    for (const auto& index_value_set : instruction_value_set) {
+      const HloValueSet& value_set = index_value_set.second;
+      for (const HloValue* value : value_set.values()) {
+        DCHECK(!ContainsKey(id_set, value->id()))
+            << "Value " << value->ToShortString()
+            << " marked for deletion, but still exists in value set for "
+               "instruction "
+            << instruction->name();
+      }
+    }
+  }
+#endif
+
+  for (HloValue::Id value_id : value_ids_to_delete_) {
+    values_.erase(value_id);
+  }
+  value_ids_to_delete_.clear();
 }
 
 string HloDataflowAnalysis::ToString() const {
@@ -121,6 +153,7 @@ bool HloDataflowAnalysis::Phi(
     HloInstruction* instruction,
     tensorflow::gtl::ArraySlice<const InstructionValueSet*> inputs) {
   CHECK(ssa_form_);
+  VLOG(4) << "Phi(" << instruction->name() << ")";
 
   for (const InstructionValueSet* input : inputs) {
     DCHECK(ShapeUtil::Compatible(instruction->shape(), input->shape()));
@@ -183,7 +216,7 @@ bool HloDataflowAnalysis::Phi(
       } else if (current_value != &new_value) {
         if (current_value_defined_here) {
           // Remove the existing phi.
-          DeleteHloValue(current_value->id());
+          MarkValueForDeletion(current_value->id());
         }
         value_set.Clear();
         value_set.AddValue(&new_value);
@@ -193,7 +226,8 @@ bool HloDataflowAnalysis::Phi(
       // Multiple distinct values reach this point. A phi value is
       // necessary.
       CHECK_GT(input_value_ids.size(), 1);
-      if (current_value == nullptr || !current_value->is_phi()) {
+      if (current_value == nullptr ||
+          !(current_value->is_phi() && current_value_defined_here)) {
         value_set.Clear();
         value_set.AddValue(NewHloValue(instruction, index, /*is_phi=*/true));
         changed = true;
@@ -242,6 +276,51 @@ bool HloDataflowAnalysis::UpdateBitcastValueSet(HloInstruction* bitcast) {
   return false;
 }
 
+bool HloDataflowAnalysis::UpdateSendValueSet(HloInstruction* send) {
+  CHECK_EQ(send->opcode(), HloOpcode::kSend);
+  bool changed = false;
+  // Send forwards the operand value to the output tuple at {0}.
+  for (auto& pair : GetInstructionValueSet(send->operand(0))) {
+    const ShapeIndex& operand_index = pair.first;
+    const HloValueSet& operand_value_set = pair.second;
+
+    ShapeIndex index = {0};
+    for (int64 i : operand_index) {
+      index.push_back(i);
+    }
+
+    HloValueSet& value_set = GetValueSet(send, index);
+    if (value_set != operand_value_set) {
+      value_set = operand_value_set;
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+bool HloDataflowAnalysis::UpdateRecvDoneValueSet(HloInstruction* recv_done) {
+  CHECK_EQ(recv_done->opcode(), HloOpcode::kRecvDone);
+  bool changed = false;
+  // RecvDone forwards the operand value at {0} to the output.
+  for (auto& pair : GetInstructionValueSet(recv_done)) {
+    ShapeIndex& index = pair.first;
+    HloValueSet& value_set = pair.second;
+
+    ShapeIndex operand_index = {0};
+    for (int64 i : index) {
+      operand_index.push_back(i);
+    }
+
+    const HloValueSet& operand_value_set =
+        GetValueSet(recv_done->operand(0), operand_index);
+    if (value_set != operand_value_set) {
+      value_set = operand_value_set;
+      changed = true;
+    }
+  }
+  return changed;
+}
+
 bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) {
   CHECK_EQ(call->opcode(), HloOpcode::kCall);
   InstructionValueSet& value_set = GetInstructionValueSet(call);
@@ -254,6 +333,21 @@ bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) {
   return false;
 }
 
+bool HloDataflowAnalysis::UpdateConditionalValueSet(
+    HloInstruction* conditional) {
+  CHECK_EQ(conditional->opcode(), HloOpcode::kConditional);
+  std::vector<const InstructionValueSet*> inputs = {
+      &GetInstructionValueSet(
+          conditional->true_computation()->root_instruction()),
+      &GetInstructionValueSet(
+          conditional->false_computation()->root_instruction())};
+  // A phi-node is not defined for a kConditional instruction even though it
+  // represents a join point. This is because the current approach is to define
+  // a phi-node only for kWhile to account for the dataflow through back-edges
+  // and deal with the ambiguity in other cases.
+  return GetInstructionValueSet(conditional).AssignUnionOf(inputs);
+}
+
 bool HloDataflowAnalysis::UpdateCopyValueSet(HloInstruction* copy) {
   CHECK_EQ(copy->opcode(), HloOpcode::kCopy);
   bool changed = false;
@@ -315,7 +409,7 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
   CHECK_EQ(call_graph_node.context(), CallContext::kSequential);
 
   std::vector<const InstructionValueSet*> inputs;
-  bool called_from_while = false;
+  bool need_phi = false;
   for (const CallSite& callsite : call_graph_node.caller_callsites()) {
     if (callsite.instruction()->opcode() == HloOpcode::kCall) {
       // The operand values of a call instruction are forwarded to the
@@ -337,14 +431,32 @@ bool HloDataflowAnalysis::UpdateParameterValueSet(HloInstruction* parameter) {
         inputs.push_back(&GetInstructionValueSet(
             callsite.instruction()->while_body()->root_instruction()));
       }
-      called_from_while = true;
+      need_phi = true;
+    } else if (callsite.instruction()->opcode() == HloOpcode::kConditional) {
+      CHECK_EQ(parameter->parameter_number(), 0);
+      auto conditional = callsite.instruction();
+      // Conditional has 3 operands. Operand 0 is the predicate, operand 1 is
+      // the argument to the true computation and operand 2 is the argument to
+      // the false computation.
+      //
+      // If the parameter belongs to conditional's true computation, then
+      // operand 1 is forwarded to this parameter instruction. If the parameter
+      // belongs to conditional's false computation, then operand 2 is forwarded
+      // to this parameter instruction.
+      if (parameter->parent() == conditional->true_computation()) {
+        inputs.push_back(&GetInstructionValueSet(conditional->operand(1)));
+      } else {
+        CHECK_EQ(parameter->parent(), conditional->false_computation());
+        inputs.push_back(&GetInstructionValueSet(conditional->operand(2)));
+      }
+      need_phi = true;
     } else {
       LOG(FATAL) << "CallContext::kSequential computations should only be "
-                    "called from call or while instructions";
+                    "called from call, while, or conditional instructions";
     }
   }
 
-  if (ssa_form_ && called_from_while) {
+  if (ssa_form_ && need_phi) {
     return Phi(parameter, inputs);
   } else {
     return GetInstructionValueSet(parameter).AssignUnionOf(inputs);
@@ -429,6 +541,12 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
       return UpdateCallValueSet(instruction);
     case HloOpcode::kWhile:
       return UpdateWhileValueSet(instruction);
+    case HloOpcode::kSend:
+      return UpdateSendValueSet(instruction);
+    case HloOpcode::kRecvDone:
+      return UpdateRecvDoneValueSet(instruction);
+    case HloOpcode::kConditional:
+      return UpdateConditionalValueSet(instruction);
     default:
       // Instruction does not forward HloValues (it defines all values in its
       // output). No update is necessary.
@@ -436,11 +554,13 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
   }
 }
 
-void HloDataflowAnalysis::UpdateInstructionsAndPropagate(
-    tensorflow::gtl::ArraySlice<HloInstruction*> instructions) {
+void HloDataflowAnalysis::Propagate() {
   std::queue<HloInstruction*> worklist;
-  for (HloInstruction* instruction : instructions) {
-    worklist.push(instruction);
+
+  for (HloComputation* computation : module_->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      worklist.push(instruction);
+    }
   }
 
   while (!worklist.empty()) {
@@ -465,13 +585,31 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate(
 
       // If user sequentially calls a computation, then the respective
       // parameter(s) of the computation need to be updated.
-      for (HloComputation* called_computation : user->called_computations()) {
-        const CallGraphNode& call_graph_node =
-            call_graph_->GetNode(called_computation);
-        if (call_graph_node.context() == CallContext::kSequential) {
-          for (int64 operand_number : user->OperandIndices(instruction)) {
-            worklist.push(
-                called_computation->parameter_instruction(operand_number));
+      if (user->opcode() == HloOpcode::kConditional) {
+        // If operand 0 is the use of instruction, then no parameters need to be
+        // updated, since that is the predicate of the conditional.
+        // If operand 1 is the use of instruction, then the true_computation's
+        // parameter need to be updated.
+        // If operand 2 is the use of instruction, then the false_computation's
+        // parameter need to be updated.
+        //
+        // Note that the same instruction can be used in both operand 1 and
+        // operand 2.
+        if (user->operand(1) == instruction) {
+          worklist.push(user->true_computation()->parameter_instruction(0));
+        }
+        if (user->operand(2) == instruction) {
+          worklist.push(user->false_computation()->parameter_instruction(0));
+        }
+      } else {
+        for (HloComputation* called_computation : user->called_computations()) {
+          const CallGraphNode& call_graph_node =
+              call_graph_->GetNode(called_computation);
+          if (call_graph_node.context() == CallContext::kSequential) {
+            for (int64 operand_number : user->OperandIndices(instruction)) {
+              worklist.push(
+                  called_computation->parameter_instruction(operand_number));
+            }
           }
         }
       }
@@ -483,7 +621,8 @@ void HloDataflowAnalysis::UpdateInstructionsAndPropagate(
       const CallGraphNode& call_graph_node =
           call_graph_->GetNode(instruction->parent());
       for (const CallSite& callsite : call_graph_node.caller_callsites()) {
-        if (callsite.instruction()->opcode() == HloOpcode::kCall) {
+        if ((callsite.instruction()->opcode() == HloOpcode::kCall) ||
+            (callsite.instruction()->opcode() == HloOpcode::kConditional)) {
           worklist.push(callsite.instruction());
         } else if (callsite.instruction()->opcode() == HloOpcode::kWhile) {
           // Add the while itself, and the body and condition parameters.
@@ -537,6 +676,12 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
         GetValueSet(instruction, /*index=*/{}).AddValue(value);
       };
 
+      // Lambda to set the value set at the given index of the output.
+      auto define_value_at = [this, &instruction](const ShapeIndex& index) {
+        HloValue* value = NewHloValue(instruction, index, /*is_phi=*/false);
+        GetValueSet(instruction, index).AddValue(value);
+      };
+
       switch (instruction->opcode()) {
         case HloOpcode::kBitcast:
           if (bitcast_defines_value_) {
@@ -545,6 +690,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
           break;
         case HloOpcode::kWhile:
         case HloOpcode::kCall:
+        case HloOpcode::kConditional:
         case HloOpcode::kGetTupleElement:
           // These instructions define no values. The values in their output
           // flow from their operands or from cross computation dataflow.
@@ -577,6 +723,16 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
           // values flow from their operands.
           define_top_level_only();
           break;
+        case HloOpcode::kRecvDone:
+          // RecvDone aliases its input tuple element {0}, therefore does not
+          // define any values.
+          break;
+        case HloOpcode::kSend:
+          // Send produces a tuple of {aliased operand, U32 context}, therefore
+          // only defines the top-level tuple and the tuple element at {1}.
+          define_value_at(/*index=*/{});
+          define_value_at(/*index=*/{1});
+          break;
         default:
           define_all_values();
           break;
@@ -597,20 +753,17 @@ StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
       new HloDataflowAnalysis(module, ssa_form, bitcast_defines_value));
 
   TF_RETURN_IF_ERROR(dataflow_analysis->InitializeInstructionValueSets());
+  dataflow_analysis->Propagate();
 
-  // Construct list of all instructions to initialize the worklist to propagate
-  // the data flow. For efficiency sort the instruction in post order so
-  // producers appear before consumers.
-  std::vector<HloInstruction*> all_instructions;
-  for (const HloComputation* computation : module->MakeComputationPostOrder()) {
-    for (HloInstruction* instruction :
-         computation->MakeInstructionPostOrder()) {
-      all_instructions.push_back(instruction);
-    }
-  }
-  dataflow_analysis->UpdateInstructionsAndPropagate(all_instructions);
+  // Delete all values marked for deletion.
+  dataflow_analysis->DeleteMarkedValues();
 
-  // Add in positions to all values.
+  // Gather and set all non-definition positions of all values. Value deletion
+  // is rare, so just use a vector indexed by Value::Id rather than a map from
+  // Value::Id to positions. There should be very few holes in the vector, and
+  // lookup is faster.
+  std::vector<std::vector<HloPosition>> value_positions(
+      dataflow_analysis->next_value_id_);
   for (const HloComputation* computation : module->computations()) {
     for (HloInstruction* instruction : computation->instructions()) {
       for (const auto& pair :
@@ -619,13 +772,18 @@ StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
         const HloValueSet& value_set = pair.second;
         for (const HloValue* value : value_set.values()) {
           if (value->defining_instruction() != instruction) {
-            dataflow_analysis->GetValue(value->id())
-                .AddPosition(instruction, index);
+            value_positions[value->id()].push_back(
+                HloPosition{instruction, index});
           }
         }
       }
     }
   }
+  for (auto& pair : dataflow_analysis->values_) {
+    HloValue::Id value_id = pair.first;
+    HloValue& value = pair.second;
+    value.SetPositionsAndComputeUses(value_positions[value_id]);
+  }
 
   // Construct vector of values.
   dataflow_analysis->values_vector_.reserve(dataflow_analysis->values_.size());
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index 207e553bf7fb62e19b9fa89eaf6bfb3234592c11..469620d01295f90e0c36a48cac9be47c12473a68 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -126,13 +126,16 @@ class HloDataflowAnalysis {
   HloValue* NewHloValue(HloInstruction* instruction, const ShapeIndex& index,
                         bool is_phi = false);
 
-  // Delete the HloValue with the given ID.
-  void DeleteHloValue(HloValue::Id value_id);
+  // Mark the HloValue with the given ID for deletion.
+  void MarkValueForDeletion(HloValue::Id value_id);
+
+  // Delete all HloValues marked for deletion. Should be called after
+  // propagation is complete.
+  void DeleteMarkedValues();
 
   // Constructs and initializes the InstructionValueSets of all instructions to
   // contain exactly the HloValues defined by each instruction. These values can
-  // then propagated throughout the HLO graph by calling
-  // UpdateInstructionsAndPropagate.
+  // then propagated throughout the HLO graph by calling Propagate.
   Status InitializeInstructionValueSets();
 
   // Updates the value set of the given instruction based on the values flowing
@@ -143,17 +146,18 @@ class HloDataflowAnalysis {
   // the instruction value set changed.
   bool UpdateBitcastValueSet(HloInstruction* bitcast);
   bool UpdateCallValueSet(HloInstruction* call);
+  bool UpdateConditionalValueSet(HloInstruction* conditional);
   bool UpdateCopyValueSet(HloInstruction* copy);
   bool UpdateGetTupleElementValueSet(HloInstruction* gte);
   bool UpdateParameterValueSet(HloInstruction* parameter);
+  bool UpdateRecvDoneValueSet(HloInstruction* recv_done);
   bool UpdateSelectValueSet(HloInstruction* select);
+  bool UpdateSendValueSet(HloInstruction* send);
   bool UpdateTupleValueSet(HloInstruction* tuple);
   bool UpdateWhileValueSet(HloInstruction* xla_while);
 
-  // Update the value sets of the given instructions and propagate the
-  // changes to fixed point.
-  void UpdateInstructionsAndPropagate(
-      tensorflow::gtl::ArraySlice<HloInstruction*> instructions);
+  // Propagate the dataflow through the module.
+  void Propagate();
 
   // Return the result of the SSA Phi function applied to the given inputs at
   // the given instruction. If skip_top_level is true, then the top level of the
@@ -189,6 +193,11 @@ class HloDataflowAnalysis {
   // A map from instruction to InstructionValueSet.
   std::unordered_map<const HloInstruction*, InstructionValueSet> value_sets_;
 
+  // Values marked for deletion during construction. We don't delete them
+  // immediately because references to them may remain in ValueSets temporarily
+  // during propagation. After construction, these values are deleted.
+  std::vector<HloValue::Id> value_ids_to_delete_;
+
   // A vector containing all HloValues sorted by HloValue::Id.
   std::vector<const HloValue*> values_vector_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 4b8eb237a6712804657bb7b67cdde9a2d331bd11..e714b2567fd1b3eab607a19f0bb7e3288150dc64 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
 // Test is parameterized on a bool which is whether the dataflow analysis is
@@ -77,11 +78,23 @@ class HloDataflowAnalysisTest : public HloTestBase,
                                  analysis_->GetValueDefinedAt(b), *analysis_);
   }
 
+  std::unique_ptr<HloComputation> CreateR0F32UnaryOpComputation(
+      HloOpcode opcode) {
+    HloComputation::Builder builder(TestName() + "." + HloOpcodeString(opcode));
+    HloInstruction* param0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+    builder.AddInstruction(
+        HloInstruction::CreateUnary(scalar_shape_, opcode, param0));
+    return builder.Build();
+  }
+
   std::unique_ptr<HloModule> module_;
   std::unique_ptr<HloDataflowAnalysis> analysis_;
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
   const Shape vector_shape_ = ShapeUtil::MakeShape(F32, {42});
+  const Shape tuple_shape_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})});
 };
 
 TEST_P(HloDataflowAnalysisTest, BinaryOperation) {
@@ -211,10 +224,10 @@ TEST_P(HloDataflowAnalysisTest, NestedTuple) {
           HloPosition{nested_tuple, {0, 0}}, HloPosition{nested_tuple, {1, 0}},
           HloPosition{nested_tuple, {2}}, HloPosition{gte_tuple, {0}},
           HloPosition{gte_out, {}}));
-  // Constant values should have no uses though one is live out. The positions
-  // where they appear as operands are on instructions which do not use the
-  // values (eg, Tuple).
-  EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).uses().empty());
+  // Constant values should have only a single use, which is the root of the
+  // computation.
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1, /*index=*/{}).uses(),
+              UnorderedElementsAre(HloUse{gte_out, 0, {0}}));
   EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).uses().empty());
 
   // The top-level tuple values are used in GTE instructions.
@@ -274,12 +287,11 @@ TEST_P(HloDataflowAnalysisTest, SingleCall) {
   EXPECT_EQ(analysis.GetUniqueValueAt(call), analysis.GetValueDefinedAt(add));
 
   EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-              UnorderedElementsAre(HloUse{add, 0, {}}));
+              UnorderedElementsAre(HloUse{call, 0, {}}, HloUse{add, 0, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
-              UnorderedElementsAre(HloUse{add, 1, {}}));
+              UnorderedElementsAre(HloUse{call, 1, {}}, HloUse{add, 1, {}}));
 
   EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
-  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
 }
 
 TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) {
@@ -323,18 +335,17 @@ TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithSameArguments) {
   EXPECT_TRUE(analysis.ValueIsDefinedAt(sub));
 
   EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-              UnorderedElementsAre(HloUse{add, 0, {}}));
+              UnorderedElementsAre(HloUse{call1, 0, {}}, HloUse{call2, 0, {}},
+                                   HloUse{add, 0, {}}));
   EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
-              UnorderedElementsAre(HloUse{add, 1, {}}));
+              UnorderedElementsAre(HloUse{call1, 1, {}}, HloUse{call2, 1, {}},
+                                   HloUse{add, 1, {}}));
   // The Add from the subcomputation is used as both operands of the Subtract.
   EXPECT_THAT(analysis.GetValueDefinedAt(add).uses(),
               UnorderedElementsAre(HloUse{sub, 0, {}}, HloUse{sub, 1, {}}));
 
   EXPECT_FALSE(analysis.GetValueDefinedAt(add).live_out_of_module());
-  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
-
   EXPECT_TRUE(analysis.GetValueDefinedAt(sub).live_out_of_module());
-  EXPECT_TRUE(analysis.GetValueDefinedAt(sub).live_out_of_computation());
 }
 
 TEST_P(HloDataflowAnalysisTest, ComputationCalledTwiceWithDifferentArguments) {
@@ -408,7 +419,7 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
   auto outer_param1 = outer_builder.AddInstruction(
       HloInstruction::CreateParameter(1, scalar_shape_, "param1"));
   // Swizzle parameters.
-  outer_builder.AddInstruction(HloInstruction::CreateCall(
+  auto nested_call = outer_builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {outer_param1, outer_param0}, inner_computation));
   HloComputation* outer_computation =
       module_->AddEmbeddedComputation(outer_builder.Build());
@@ -418,7 +429,7 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
   auto constant2 = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(2.0)));
-  builder.AddInstruction(HloInstruction::CreateCall(
+  auto call = builder.AddInstruction(HloInstruction::CreateCall(
       scalar_shape_, {constant1, constant2}, outer_computation));
   module_->AddEntryComputation(builder.Build());
 
@@ -431,10 +442,14 @@ TEST_P(HloDataflowAnalysisTest, NestedCalls) {
 
   // Verify that the uses of the constants are properly swizzled by parameter
   // permutation in nested_call.
-  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
-              UnorderedElementsAre(HloUse{add, 1, {}}));
-  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
-              UnorderedElementsAre(HloUse{add, 0, {}}));
+  EXPECT_THAT(
+      analysis.GetValueDefinedAt(constant1).uses(),
+      UnorderedElementsAre(HloUse{call, 0, {}}, HloUse{nested_call, 1, {}},
+                           HloUse{add, 1, {}}));
+  EXPECT_THAT(
+      analysis.GetValueDefinedAt(constant2).uses(),
+      UnorderedElementsAre(HloUse{call, 1, {}}, HloUse{nested_call, 0, {}},
+                           HloUse{add, 0, {}}));
 
   EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
 }
@@ -469,7 +484,7 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
       HloInstruction::CreateGetTupleElement(scalar_shape_, body_param, 1));
   auto add = body_builder.AddInstruction(HloInstruction::CreateBinary(
       scalar_shape_, HloOpcode::kAdd, body_element_0, body_element_1));
-  body_builder.AddInstruction(
+  auto body_root = body_builder.AddInstruction(
       HloInstruction::CreateTuple({body_element_0, add}));
   HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
 
@@ -496,8 +511,6 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
   bool ssa_form = GetParam();
   const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
 
-  EXPECT_TRUE(
-      analysis.GetValueDefinedAt(cond_constant).live_out_of_computation());
   EXPECT_FALSE(analysis.GetValueDefinedAt(cond_constant).live_out_of_module());
 
   if (ssa_form) {
@@ -517,14 +530,14 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
 
     EXPECT_THAT(
         analysis.GetValueDefinedAt(constant1).uses(),
-        UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{xla_while, 0, {0}}));
+        UnorderedElementsAre(HloUse{add, 0, {}}, HloUse{body_root, 0, {}},
+                             HloUse{xla_while, 0, {0}}));
 
     // Constant1 passes through the body and out of the module.
     EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
     EXPECT_TRUE(analysis.GetValueDefinedAt(xla_while, /*index=*/{1})
                     .live_out_of_module());
 
-    EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
     EXPECT_FALSE(analysis.GetValueDefinedAt(add).live_out_of_module());
   } else {
     // While instruction and subcomputation parameters should not define values
@@ -538,7 +551,6 @@ TEST_P(HloDataflowAnalysisTest, SingleWhile) {
 
     EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
     EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_module());
-    EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
   }
 }
 
@@ -915,9 +927,11 @@ TEST_P(HloDataflowAnalysisTest, TupleSelect) {
                            HloUse{select12, 1, {}}));
 
   // The two constant values just pass through the Selects and are not
-  // used. They are live out however.
-  EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).uses().empty());
-  EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).uses().empty());
+  // used except at the root. They are live out however.
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              UnorderedElementsAre(HloUse{select1234, 1, {0}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              UnorderedElementsAre(HloUse{select1234, 1, {0}}));
   EXPECT_TRUE(analysis.GetValueDefinedAt(constant1).live_out_of_module());
   EXPECT_TRUE(analysis.GetValueDefinedAt(constant2).live_out_of_module());
 }
@@ -1139,6 +1153,54 @@ TEST_P(HloDataflowAnalysisTest, TupleCopy) {
       analysis.GetValueDefinedAt(copy, /*index=*/{}).live_out_of_module());
 }
 
+TEST_P(HloDataflowAnalysisTest, SendAndSendDone) {
+  // Test that a Send forwards its operand to the output tuple at {0}.
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param0"));
+  auto send = builder.AddInstruction(
+      HloInstruction::CreateSend(param, /*channel_id=*/0));
+  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
+  module_->AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  EXPECT_EQ(analysis.values().size(), 4);
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(param));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{}));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(send, /*index=*/{0}));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(send, /*index=*/{1}));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(send_done));
+  EXPECT_THAT(HloValuesAt(send, /*index=*/{0}),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(param)));
+}
+
+TEST_P(HloDataflowAnalysisTest, RecvAndRecvDone) {
+  // Test that a RecvDone forwards its operand tuple element at {0} to the
+  // output.
+  auto builder = HloComputation::Builder(TestName());
+  auto recv = builder.AddInstruction(
+      HloInstruction::CreateRecv(scalar_shape_, /*channel_id=*/0));
+  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
+  module_->AddEntryComputation(builder.Build());
+
+  bool ssa_form = GetParam();
+  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+
+  EXPECT_EQ(analysis.values().size(), 3);
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{}));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{0}));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(recv, /*index=*/{1}));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(recv_done));
+  EXPECT_THAT(HloValuesAt(recv_done),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(recv, {0})));
+  EXPECT_TRUE(
+      analysis.GetValueDefinedAt(recv, /*index=*/{0}).live_out_of_module());
+}
+
 TEST_P(HloDataflowAnalysisTest, ElementwiseChainInterference) {
   // A simple chain of elementwise operations. No values should interfere.
   //
@@ -1270,7 +1332,7 @@ TEST_P(HloDataflowAnalysisTest, WhileParameters_Sequential) {
 
   auto entry = module_->AddEntryComputation(builder.Build());
   bool ssa_form = GetParam();
-  const HloDataflowAnalysis& analysis = RunAnalysis(ssa_form);
+  RunAnalysis(ssa_form);
 
   SequentialHloOrdering::HloModuleSequence sequence;
   sequence.insert({entry, {param, xla_while}});
@@ -1281,12 +1343,6 @@ TEST_P(HloDataflowAnalysisTest, WhileParameters_Sequential) {
 
   SequentialHloOrdering ordering(module_.get(), sequence);
 
-  // 'add' is the body root even though later instructions follow in the order
-  // like 'dead_negate'. Only 'add' should be live out of the computation.
-  EXPECT_TRUE(analysis.GetValueDefinedAt(add).live_out_of_computation());
-  EXPECT_FALSE(
-      analysis.GetValueDefinedAt(dead_negate).live_out_of_computation());
-
   // 'add' is live out of the body and will interfere with an later instructions
   // such as 'dead_constant' and 'dead_negate'.
   EXPECT_TRUE(InstructionsMayInterfere(ordering, add, dead_constant));
@@ -1485,6 +1541,315 @@ TEST_P(HloDataflowAnalysisTest, EmbeddedComputationInterference) {
   EXPECT_TRUE(InstructionsMayInterfere(ordering, negate, embedded_log));
 }
 
+TEST_P(HloDataflowAnalysisTest, ConditionalWithIdentity) {
+  // Test conditional with identity computations in both true and false cases.
+  //
+  // true_computation(F32[] %true_param):
+  //   return %true_param
+  //
+  // false_computation(F32[] %false_param):
+  //   return %false_param
+  //
+  // entry:
+  //   %pred = Constant(true)
+  //   %constant1 = Constant(56.0)
+  //   %constant2 = Constant(12.0)
+  //   return Conditional(%pred, %constant1, true_computation,
+  //                      %constant2, false_computation)
+
+  auto true_builder = HloComputation::Builder(TestName() + "_true");
+  auto true_param = true_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "true_param"));
+  HloComputation* true_computation =
+      module_->AddEmbeddedComputation(true_builder.Build());
+
+  auto false_builder = HloComputation::Builder(TestName() + "_false");
+  auto false_param = false_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "false_param"));
+  HloComputation* false_computation =
+      module_->AddEmbeddedComputation(false_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred, constant1, true_computation, constant2,
+      false_computation));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_param));
+
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_param),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_param),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_THAT(analysis.GetValueDefinedAt(pred).uses(),
+              ElementsAre(HloUse{conditional, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              ElementsAre(HloUse{conditional, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              ElementsAre(HloUse{conditional, 2, {}}));
+
+  EXPECT_EQ(analysis.values().size(), 3);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(HloValuesAt(conditional),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(constant1),
+                                   analysis.GetValueDefinedAt(constant2)));
+}
+
+TEST_P(HloDataflowAnalysisTest, ConditionalTakingTupleOperand) {
+  // Test conditional with true and false computations taking a tuple operand.
+  //
+  // true_computation((F32[], F32[]) %true_param):
+  //   %true_x = GetTupleElement(%true_param, 0)
+  //   %true_y = GetTupleElement(%true_param, 1)
+  //   return Add(%true_x, %true_y)
+  //
+  // false_computation((F32[], F32[]) %false_param):
+  //   %false_x = GetTupleElement(%false_param, 0)
+  //   %false_y = GetTupleElement(%false_param, 1)
+  //   return Subtract(%false_x, %false_y)
+  //
+  // entry:
+  //   %pred = Constant(true)
+  //   %constant1 = Constant(56.0)
+  //   %constant2 = Constant(12.0)
+  //   %tuple_operand = Tuple(%constant1, %constant2)
+  //   return Conditional(%pred, %tuple_operand, true_computation,
+  //                      %tuple_operand, false_computation)
+
+  auto true_builder = HloComputation::Builder(TestName() + "_true");
+  auto true_param = true_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape_, "true_param"));
+  auto true_x = true_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, true_param, 0));
+  auto true_y = true_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, true_param, 1));
+  auto add = true_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kAdd, true_x, true_y));
+  HloComputation* true_computation =
+      module_->AddEmbeddedComputation(true_builder.Build());
+
+  auto false_builder = HloComputation::Builder(TestName() + "_false");
+  auto false_param = false_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape_, "false_param"));
+  auto false_x = false_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, false_param, 0));
+  auto false_y = false_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, false_param, 1));
+  auto sub = false_builder.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape_, HloOpcode::kSubtract, false_x, false_y));
+  HloComputation* false_computation =
+      module_->AddEmbeddedComputation(false_builder.Build());
+
+  auto builder = HloComputation::Builder(TestName());
+  auto pred = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(56.0f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(12.0f)));
+  auto tuple_operand = builder.AddInstruction(
+      HloInstruction::CreateTuple({constant1, constant2}));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred, tuple_operand, true_computation, tuple_operand,
+      false_computation));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple_operand));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(add));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(sub));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_x));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_y));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_x));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_y));
+
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_param),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_param),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_x),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_y),
+            analysis.GetValueDefinedAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_x),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_y),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_THAT(analysis.GetValueDefinedAt(pred).uses(),
+              ElementsAre(HloUse{conditional, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant1).uses(),
+              UnorderedElementsAre(HloUse{conditional, 1, {0}},
+                                   HloUse{conditional, 2, {0}},
+                                   HloUse{add, 0, {}}, HloUse{sub, 0, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(constant2).uses(),
+              UnorderedElementsAre(HloUse{conditional, 1, {1}},
+                                   HloUse{conditional, 2, {1}},
+                                   HloUse{add, 1, {}}, HloUse{sub, 1, {}}));
+  EXPECT_THAT(analysis.GetValueDefinedAt(tuple_operand).uses(),
+              UnorderedElementsAre(
+                  HloUse{conditional, 1, {}}, HloUse{conditional, 2, {}},
+                  HloUse{true_x, 0, {}}, HloUse{true_y, 0, {}},
+                  HloUse{false_x, 0, {}}, HloUse{false_y, 0, {}}));
+
+  EXPECT_EQ(analysis.values().size(), 6);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(HloValuesAt(conditional),
+              UnorderedElementsAre(analysis.GetValueDefinedAt(add),
+                                   analysis.GetValueDefinedAt(sub)));
+}
+
+TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
+  // computation1(F32[] %param1):
+  //   %ceil = Ceil(%param1)
+  //   return %ceil
+  //
+  // computation2(F32[] %param2):
+  //   %floor = Floor(%param2)
+  //   return %floor
+  //
+  // computation3(F32[] %param3):
+  //   %negate = Negate(%param3)
+  //   return %negate
+  //
+  // inner_conditional((PRED, F32[], F32[]) %param_cond):
+  //   %pred_cond = GetTupleElement(%param_cond, 0)
+  //   %true_operand_cond = GetTupleElement(%param_cond, 1)
+  //   %false_opearnd_cond = GetTupleElement(%param_cond, 2)
+  //   return Conditional(%pred_cond, %true_operand_cond, computation1,
+  //                      %false_operand_cond, computation2)
+  //
+  // entry:
+  //   %pred1 = Constant(true)
+  //   %pred2 = Constant(false)
+  //   %constant1 = Constant(1.1);
+  //   %constant2 = Constant(2.2);
+  //   %constant3 = Constant(3.3);
+  //   return Conditional(%pred1, (%pred2, %constant1, %constant2),
+  //                      inner_conditional, %constant3, computation3)
+
+  auto computation1 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kCeil));
+  auto computation2 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kFloor));
+  auto computation3 = module_->AddEmbeddedComputation(
+      CreateR0F32UnaryOpComputation(HloOpcode::kNegate));
+
+  // Build inner_conditional computation.
+  const Shape scalar_bool_shape = ShapeUtil::MakeShape(PRED, {});
+  const Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {scalar_bool_shape, scalar_shape_, scalar_shape_});
+  auto inner_builder =
+      HloComputation::Builder(TestName() + "_inner_conditional");
+  auto param_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_param_shape, "param_cond"));
+  auto pred_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_bool_shape, param_cond, 0));
+  auto true_operand_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param_cond, 1));
+  auto false_operand_cond = inner_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(scalar_shape_, param_cond, 2));
+  auto inner_conditional =
+      inner_builder.AddInstruction(HloInstruction::CreateConditional(
+          scalar_shape_, pred_cond, true_operand_cond, computation1,
+          false_operand_cond, computation2));
+  auto inner_conditional_computation =
+      module_->AddEmbeddedComputation(inner_builder.Build());
+
+  // Build entry computation.
+  auto builder = HloComputation::Builder(TestName());
+  auto pred1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  auto pred2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
+  auto constant2 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(2.2f)));
+  auto constant3 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(3.3f)));
+  auto tuple_operand = builder.AddInstruction(
+      HloInstruction::CreateTuple({pred2, constant1, constant2}));
+  auto conditional = builder.AddInstruction(HloInstruction::CreateConditional(
+      scalar_shape_, pred1, tuple_operand, inner_conditional_computation,
+      constant3, computation3));
+  module_->AddEntryComputation(builder.Build());
+
+  const HloDataflowAnalysis& analysis = RunAnalysis(GetParam());
+
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(pred2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant1));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant2));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(constant3));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(tuple_operand));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation1->root_instruction()));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation2->root_instruction()));
+  EXPECT_TRUE(analysis.ValueIsDefinedAt(computation3->root_instruction()));
+
+  auto computation1_param = computation1->parameter_instruction(0);
+  auto computation2_param = computation2->parameter_instruction(0);
+  auto computation3_param = computation3->parameter_instruction(0);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation1_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation2_param));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(computation3_param));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation1_param),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation2_param),
+            analysis.GetValueDefinedAt(constant2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(computation3_param),
+            analysis.GetValueDefinedAt(constant3));
+
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(param_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(pred_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(true_operand_cond));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(false_operand_cond));
+  EXPECT_EQ(analysis.GetUniqueValueAt(param_cond),
+            analysis.GetValueDefinedAt(tuple_operand));
+  EXPECT_EQ(analysis.GetUniqueValueAt(pred_cond),
+            analysis.GetValueDefinedAt(pred2));
+  EXPECT_EQ(analysis.GetUniqueValueAt(true_operand_cond),
+            analysis.GetValueDefinedAt(constant1));
+  EXPECT_EQ(analysis.GetUniqueValueAt(false_operand_cond),
+            analysis.GetValueDefinedAt(constant2));
+
+  EXPECT_EQ(analysis.values().size(), 9);
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(inner_conditional));
+  EXPECT_FALSE(analysis.ValueIsDefinedAt(conditional));
+  EXPECT_THAT(
+      HloValuesAt(inner_conditional),
+      UnorderedElementsAre(
+          analysis.GetValueDefinedAt(computation1->root_instruction()),
+          analysis.GetValueDefinedAt(computation2->root_instruction())));
+  EXPECT_THAT(
+      HloValuesAt(conditional),
+      UnorderedElementsAre(
+          analysis.GetValueDefinedAt(computation1->root_instruction()),
+          analysis.GetValueDefinedAt(computation2->root_instruction()),
+          analysis.GetValueDefinedAt(computation3->root_instruction())));
+}
+
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index a4921232f5848dbe1789c4c641e2b0ba3c1848bb..1e5f0f797a13fd7e7ce1cc934387a274a74153bc 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -37,6 +37,9 @@ namespace xla {
 StatusOr<bool> HloDCE::Run(HloModule* module) {
   bool changed = false;
 
+  VLOG(2) << "Before dce:";
+  XLA_VLOG_LINES(2, module->ToString());
+
   for (auto* computation : module->MakeNonfusionComputations()) {
     std::unordered_set<HloInstruction*> live_instructions;
     TF_RETURN_IF_ERROR(computation->root_instruction()->Accept(
@@ -52,12 +55,15 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
     for (auto* instruction : computation->instructions()) {
       if (instruction->user_count() == 0 &&
           live_instructions.count(instruction) == 0 &&
-          computation->IsRemovable(instruction)) {
+          computation->IsRemovable(instruction) &&
+          !instruction->HasSideEffect()) {
         dead_roots.push_back(instruction);
       }
     }
 
     for (HloInstruction* dead_root : dead_roots) {
+      VLOG(1) << "Removing dead root " << dead_root->ToString()
+              << " and it's unused operands";
       TF_RETURN_IF_ERROR(
           computation->RemoveInstructionAndUnusedOperands(dead_root));
       changed = true;
@@ -87,6 +93,9 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
     }
   }
 
+  VLOG(2) << "After dce:";
+  XLA_VLOG_LINES(2, module->ToString());
+
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index d54b9a27087a42fd23eab0bd06e8deaca567312b..5a56607a665c4cbeb7b2572f182b88e890602968 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -70,6 +70,26 @@ TEST_F(HloDceTest, NoDeadCode) {
   EXPECT_EQ(3, computation->instruction_count());
 }
 
+TEST_F(HloDceTest, InstructionsWithSideEffect) {
+  // Verify that side-effect instructions (Send in this test) are not removed.
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+  builder.AddInstruction(
+      HloInstruction::CreateSend(constant, /*channel_id=*/0));
+  builder.AddInstruction(HloInstruction::CreateTuple({}));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(3, computation->instruction_count());
+
+  HloDCE dce;
+  EXPECT_FALSE(dce.Run(module.get()).ValueOrDie());
+
+  EXPECT_EQ(3, computation->instruction_count());
+}
+
 TEST_F(HloDceTest, DeadParameters) {
   // Verify that dead parameters are not removed, but use of the dead parameters
   // are.
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1773bb401d380031f6c860d295e76d2f62c9e5ff
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -0,0 +1,137 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace {
+
+HloInstruction* ToElementType(HloInstruction* hlo, PrimitiveType type) {
+  if (hlo->shape().element_type() != type) {
+    Shape shape = ShapeUtil::ChangeElementType(hlo->shape(), type);
+    hlo = hlo->parent()->AddInstruction(
+        HloInstruction::CreateConvert(shape, hlo));
+  }
+  CHECK_EQ(hlo->shape().element_type(), type);
+  return hlo;
+}
+
+bool HasOperandType(HloInstruction* hlo, PrimitiveType type) {
+  for (HloInstruction* operand : hlo->operands()) {
+    if (operand->shape().element_type() == type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace
+
+HloElementTypeConverter::HloElementTypeConverter(
+    PrimitiveType eliminate_type, PrimitiveType replace_with_type)
+    : eliminate_type_(eliminate_type), replace_with_type_(replace_with_type) {}
+
+StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      3, "HloElementTypeConverter::Run(), before:\n" + module->ToString());
+  bool changed = false;
+  for (auto* computation : module->computations()) {
+    for (auto* hlo : computation->MakeInstructionPostOrder()) {
+      // These are ops where it does not make sense to convert them.
+      if (hlo->opcode() == HloOpcode::kParameter ||
+          hlo->opcode() == HloOpcode::kConstant ||
+          hlo->opcode() == HloOpcode::kTuple ||
+          hlo->opcode() == HloOpcode::kConvert ||
+          hlo->opcode() == HloOpcode::kGetTupleElement ||
+          hlo->opcode() == HloOpcode::kInfeed ||
+          hlo->opcode() == HloOpcode::kOutfeed) {
+        continue;
+      }
+
+      // We cannot change a CustomCall since we have no way of adjusting the
+      // called binary to expect the updated type.
+      if (hlo->opcode() == HloOpcode::kCustomCall) {
+        continue;
+      }
+
+      // These are ops with embedded computations where it suffices to convert
+      // the embedded computations instead of converting the ops themselves.
+      if (hlo->opcode() == HloOpcode::kWhile ||
+          hlo->opcode() == HloOpcode::kCall ||
+          hlo->opcode() == HloOpcode::kFusion ||
+          hlo->opcode() == HloOpcode::kMap ||
+          hlo->opcode() == HloOpcode::kReduce ||
+          hlo->opcode() == HloOpcode::kReduceWindow ||
+          hlo->opcode() == HloOpcode::kSelectAndScatter ||
+          hlo->opcode() == HloOpcode::kConditional) {
+        continue;
+      }
+      TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
+
+      if (!HasOperandType(hlo, eliminate_type_)) {
+        // If this CHECK fires, then this was an instruction that does not take
+        // the elimination type as an operand but it does return it. This pass
+        // does not have a feature to change the output type in that case, so
+        // instead of silently failing to eliminate the type, it fails loudly.
+        TF_RET_CHECK(hlo->shape().element_type() != eliminate_type_);
+        continue;
+      }
+
+      std::vector<HloInstruction*> new_operands;
+      for (HloInstruction* operand : hlo->operands()) {
+        if (operand->shape().element_type() == eliminate_type_) {
+          operand = ToElementType(operand, replace_with_type_);
+        }
+        new_operands.push_back(operand);
+      }
+
+      HloInstruction* new_hlo;
+      if (hlo->shape().element_type() == eliminate_type_) {
+        Shape shape =
+            ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
+        new_hlo = computation->AddInstruction(
+            hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+        new_hlo = ToElementType(new_hlo, eliminate_type_);
+      } else {
+        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
+            hlo->shape(), new_operands, hlo->GetModule()));
+      }
+      TF_RETURN_IF_ERROR(computation->ReplaceInstruction(hlo, new_hlo));
+      changed = true;
+    }
+  }
+  XLA_VLOG_LINES(
+      2, "HloElementTypeConverter::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.h b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b109225d0b192e5c9e4f6d841377ffad8078dc2
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// A pass that eliminates certain element types as the input or output of ops by
+// inserting Convert ops. This allows a backend to support an element type while
+// only actually implementing the Convert op for that element type. This is
+// generally not the fastest approach, but it works.
+class HloElementTypeConverter : public HloPassInterface {
+ public:
+  // eliminate_type is the type to eliminate as the input or output of ops,
+  // using Convert ops to replace it with replace_with_type.
+  HloElementTypeConverter(PrimitiveType eliminate_type,
+                          PrimitiveType replace_with_type);
+
+  tensorflow::StringPiece name() const override {
+    return "element_type_converter";
+  }
+
+  // Returns the pass on the module and returns whether the module was modified.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  PrimitiveType eliminate_type_;
+  PrimitiveType replace_with_type_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_ELEMENT_TYPE_CONVERTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 88b77ccdd03eb129f81cfa1da430e882ea569df4..e693d167a1f96f65b894d07fb2c8f33e61ff8c49 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -335,9 +335,31 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_integral<NativeT>::value &&
+                !std::is_same<NativeT, bool>::value>::type* = nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
+                          return ~elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleNot(HloInstruction* not_) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
+                        ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
+                          return !elem_operand;
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<std::is_same<NativeT, bool>::value>::type* =
+                nullptr>
   Status HandleNot(HloInstruction* not_) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_],
                         ElementWiseUnaryOp(not_, [](ReturnT elem_operand) {
@@ -357,7 +379,24 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleNot<ReturnT>(not_);
   }
 
-  Status HandleNegate(HloInstruction* negate) override {
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_signed<NativeT>::value &&
+                !std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleNegate(HloInstruction* negate) {
+    using type = typename std::make_unsigned<NativeT>::type;
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
+                        ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
+                          return NativeT(-type(elem_operand));
+                        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<
+                !std::is_signed<NativeT>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleNegate(HloInstruction* negate) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[negate],
                         ElementWiseUnaryOp(negate, [](ReturnT elem_operand) {
                           return -elem_operand;
@@ -365,6 +404,10 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleNegate(HloInstruction* negate) override {
+    return HandleNegate<ReturnT>(negate);
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
@@ -402,7 +445,26 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  Status HandleMultiply(HloInstruction* multiply) override {
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_signed<NativeT>::value &&
+                !std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleMultiply(HloInstruction* multiply) {
+    using type = typename std::make_unsigned<NativeT>::type;
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[multiply],
+        ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
+          return NativeT(type(lhs_elem) * type(rhs_elem));
+        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_unsigned<NativeT>::value ||
+                              std::is_floating_point<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleMultiply(HloInstruction* multiply) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[multiply],
         ElementWiseBinaryOp(multiply, [](ReturnT lhs_elem, ReturnT rhs_elem) {
@@ -411,6 +473,10 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleMultiply(HloInstruction* multiply) override {
+    return HandleMultiply<ReturnT>(multiply);
+  }
+
   Status HandleSubtract(HloInstruction* subtract) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[subtract],
@@ -516,9 +582,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleRemainder<ReturnT>(remainder);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleAnd(HloInstruction* and_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[and_],
+        ElementWiseBinaryOp(and_, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el & rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
   Status HandleAnd(HloInstruction* and_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[and_],
@@ -539,9 +616,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return HandleAnd<ReturnT>(and_);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleOr(HloInstruction* or_) {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[or_],
+        ElementWiseBinaryOp(or_, [](ReturnT lhs_el, ReturnT rhs_el) {
+          return lhs_el | rhs_el;
+        }));
+    return Status::OK();
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
   Status HandleOr(HloInstruction* or_) {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[or_],
@@ -645,7 +733,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ReturnT(ReturnT, ReturnT, ReturnT)> clamp_op =
-        [](ReturnT low, ReturnT high, ReturnT value) {
+        [](ReturnT low, ReturnT value, ReturnT high) {
           return std::fmax(low, std::fmin(value, high));
         };
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[clamp],
@@ -724,7 +812,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape));
 
     const auto& dnums = conv->convolution_dimension_numbers();
-    const int64 num_spatial_dims = dnums.spatial_dimensions_size();
+    const int64 num_spatial_dims = dnums.output_spatial_dimensions_size();
+    CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size());
     CHECK_EQ(num_spatial_dims, dnums.kernel_spatial_dimensions_size());
     CHECK_GE(num_spatial_dims, 0);
     CHECK_EQ(window.dimensions_size(), num_spatial_dims);
@@ -789,13 +878,15 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
           // Find corresponding spatial dimension index for input (lhs).
           for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
             // Spatial dimension number for input (lhs) and output.
-            const int64 spatial_dim = dnums.spatial_dimensions(ki);
+            const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki);
+            const int64 output_spatial_dim =
+                dnums.output_spatial_dimensions(ki);
 
             // Calculate lhs (input) index without taking base dilation into
             // account.
             const auto& window_dim = window.dimensions(ki);
             const int64 undilated_index =
-                out_index[spatial_dim] * window_dim.stride() -
+                out_index[output_spatial_dim] * window_dim.stride() -
                 window_dim.padding_low() +
                 rhs_spatial_index[ki] * window_dim.window_dilation();
             // Skip if the lhs (input) index is to be dilated.
@@ -804,23 +895,26 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             }
 
             // Calculate the actual lhs (input) index after dilation.
-            lhs_index[spatial_dim] =
+            lhs_index[input_spatial_dim] =
                 undilated_index / window_dim.base_dilation();
 
             // Skip if input index is not in bound.
-            if (!(lhs_index[spatial_dim] >= 0 &&
-                  lhs_index[spatial_dim] < lhs_shape.dimensions(spatial_dim))) {
+            if (!(lhs_index[input_spatial_dim] >= 0 &&
+                  lhs_index[input_spatial_dim] <
+                      lhs_shape.dimensions(input_spatial_dim))) {
               goto cnt;
             }
 
             rhs_index[dnums.kernel_spatial_dimensions(ki)] =
-                rhs_spatial_index[ki];
+                window_dim.window_reversal()
+                    ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                    : rhs_spatial_index[ki];
           }
 
           result_val += lhs_literal.Get<ReturnT>(lhs_index) *
                         rhs_literal.Get<ReturnT>(rhs_index);
         }
-      cnt:;
+      cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
 
       return result_val;
@@ -1287,6 +1381,50 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleSin(HloInstruction* sin) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin],
+                        ElementWiseUnaryOp(sin, [](ReturnT elem_operand) {
+                          return std::sin(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleSin(HloInstruction* sin) {
+    return InvalidArgument("Unsupported type for Sin");
+  }
+
+  Status HandleSin(HloInstruction* sin) override {
+    return HandleSin<ReturnT>(sin);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_floating_point<
+                                  NativeT>::value>::type* = nullptr>
+  Status HandleCos(HloInstruction* cos) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos],
+                        ElementWiseUnaryOp(cos, [](ReturnT elem_operand) {
+                          return std::cos(elem_operand);
+                        }));
+    return Status::OK();
+  }
+
+  template <
+      typename NativeT,
+      typename std::enable_if<std::is_integral<NativeT>::value ||
+                              is_complex_t<NativeT>::value>::type* = nullptr>
+  Status HandleCos(HloInstruction* cos) {
+    return InvalidArgument("Unsupported type for Cos");
+  }
+
+  Status HandleCos(HloInstruction* cos) override {
+    return HandleCos<ReturnT>(cos);
+  }
+
  private:
   template <typename IndexT>
   StatusOr<std::unique_ptr<Literal>> DynamicSlice(
@@ -1397,8 +1535,8 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const auto* rhs = instruction->operand(1);
     const auto* ehs = instruction->operand(2);
 
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
+    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
+    // broadcast is removed.
     if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
           ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
           ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
@@ -1450,6 +1588,10 @@ HloEvaluator::HloEvaluator() {
   typed_visitors_[F32] = MakeUnique<TypedVisitor<float>>(this);
   typed_visitors_[F64] = MakeUnique<TypedVisitor<double>>(this);
   typed_visitors_[C64] = MakeUnique<TypedVisitor<complex64>>(this);
+
+  typed_visitors_[BF16] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
+    return Unimplemented("HloEvaluator: unhandled primitive type: BF16.");
+  });
   typed_visitors_[TUPLE] = MakeUnique<FunctionVisitor>([](HloInstruction*) {
     return Unimplemented("HloEvaluator: unhandled primitive type: TUPLE.");
   });
@@ -1561,6 +1703,7 @@ StatusOr<std::unique_ptr<Literal>> HloEvaluator::EvaluateWithSubstitutions(
   }
 
   std::vector<HloInstruction*> operands;
+  operands.reserve(owned_operands.size());
   for (auto& operand : owned_operands) {
     operands.push_back(operand.get());
   }
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 67b6e215fcb23598f1a8ab6212d6e7e58a64e976..7557aaa2484d184555411a79d8dce2c9241427b0 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -39,16 +39,18 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   HloEvaluator();
   // Evaluates an HLO module and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
-  // Precondition: argument literals correspond to each input computation's
-  // parameters in their post-ordering. See comment below for example.
+  // Precondition: The indices of arg_literals correspond to the parameter
+  // numbers of the HLO parameters in the computation. See comment below for an
+  // example.
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       const HloModule& module,
       tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
 
   // Evaluates an HLO computation and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
-  // Precondition: argument literals correspond to the input computation's
-  // parameters in their post-ordering. For e.g., consider the following graph:
+  // Precondition: The indices of arg_literals correspond to the parameter
+  // numbers of the HLO parameters in the computation. For e.g., consider the
+  // following graph:
   //
   //                *
   //            /       \
@@ -57,8 +59,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   //       /        \
   //    Parameter0  Constant
   //
-  // The input literals array will have its first literal map to Parameter0 and
-  // the second map to Parameter1.
+  // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
+  // 1 in this computation. The input literals array will then have its first
+  // literal map to Parameter0 and the second map to Parameter1.
   StatusOr<std::unique_ptr<Literal>> Evaluate(
       const HloComputation& computation,
       tensorflow::gtl::ArraySlice<const Literal*> arg_literals);
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 85477af6fe26f53504c07204348566c16a24392c..a5d39fe08699f1ec17462f3ac5600fbe2191f307 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -46,20 +46,57 @@ class HloEvaluatorTest : public HloVerifiedTestBase {
   HloEvaluatorTest() { evaluator_ = MakeUnique<HloEvaluator>(); }
 
   std::unique_ptr<HloEvaluator> evaluator_;
+
+  void TestUnaryOp(HloOpcode opcode, std::unique_ptr<Literal> expected,
+                   std::unique_ptr<Literal> input, float aabs = 0) {
+    HloComputation::Builder b(TestName());
+    auto c1 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(input)));
+    auto instruction = b.AddInstruction(
+        HloInstruction::CreateUnary(expected->shape(), opcode, c1));
+    module().AddEntryComputation(b.Build());
+
+    std::unique_ptr<Literal> result =
+        evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+
+    auto element_type = expected->shape().element_type();
+    if (element_type == F32 || element_type == F64) {
+      ErrorSpec error(aabs);
+      LiteralTestUtil::ExpectNear(*expected, *result, error);
+    } else {
+      LiteralTestUtil::ExpectEqual(*expected, *result);
+    }
+  }
+
+  void TestBinaryOp(HloOpcode opcode, std::unique_ptr<Literal> expected,
+                    std::unique_ptr<Literal> lhs,
+                    std::unique_ptr<Literal> rhs) {
+    HloComputation::Builder b(TestName());
+    auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs)));
+    auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
+    auto instruction = b.AddInstruction(
+        HloInstruction::CreateBinary(expected->shape(), opcode, c1, c2));
+    module().AddEntryComputation(b.Build());
+
+    std::unique_ptr<Literal> result =
+        evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+
+    LiteralTestUtil::ExpectEqual(*expected, *result);
+  }
 };
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
 TEST_F(HloEvaluatorTest, DoesClamp) {
   auto low = Literal::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
-  auto high = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto value = Literal::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
+  auto high = Literal::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
 
   Shape shape = low->shape();
   HloComputation::Builder b(TestName());
   auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
-  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
-  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
   auto instruction = b.AddInstruction(
       HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
   module().AddEntryComputation(b.Build());
@@ -72,6 +109,28 @@ TEST_F(HloEvaluatorTest, DoesClamp) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+TEST_F(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
+  auto low = Literal::CreateR0<float>(0.f);
+  auto value = Literal::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
+  auto high = Literal::CreateR0<float>(1.f);
+
+  Shape shape = value->shape();
+  HloComputation::Builder b(TestName());
+  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
+  auto instruction = b.AddInstruction(
+      HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
+  module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
+
+  auto expected = Literal::CreateR2<float>({{0, 0}, {1, 1}});
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
 TEST_F(HloEvaluatorTest, DoesSelect) {
@@ -103,120 +162,101 @@ TEST_F(HloEvaluatorTest, DoesSelect) {
 TEST_F(HloEvaluatorTest, DoesAdd) {
   auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
-
-  Shape shape = ShapeUtil::MakeShape(S64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs)));
-  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs)));
-  auto instruction = b.AddInstruction(
-      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, c1, c2));
-  module().AddEntryComputation(b.Build());
-
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
   auto expected = Literal::CreateR2<int64>({{3, 4}, {-96, 8}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestBinaryOp(HloOpcode::kAdd, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise and with 2 operands.
+TEST_F(HloEvaluatorTest, DoesAnd) {
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto expected = Literal::CreateR2<int64>({{0, 0}, {4, 4}});
+  TestBinaryOp(HloOpcode::kAnd, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise or with 2 operands.
+TEST_F(HloEvaluatorTest, DoesOr) {
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
+  auto expected = Literal::CreateR2<int64>({{3, 4}, {-100, 4}});
+  TestBinaryOp(HloOpcode::kOr, std::move(expected), std::move(lhs),
+               std::move(rhs));
+}
+// Verifies that HloEvaluator evaluates a HLO instruction that performs
+// element-wise multiply with 2 operands.
+TEST_F(HloEvaluatorTest, DoesMultiply) {
+  auto lhs = Literal::CreateR2<int32>({{-1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int32>(
+      {{std::numeric_limits<int32>::min(), 4}, {4, 4}});
+  auto expected = Literal::CreateR2<int32>(
+      {{std::numeric_limits<int32>::min(), 0}, {-400, 16}});
+  TestBinaryOp(HloOpcode::kMultiply, std::move(expected), std::move(lhs),
+               std::move(rhs));
 }
-
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
 TEST_F(HloEvaluatorTest, DoesDivideInt64) {
-  auto lhs_s64 = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
-  auto rhs_s64 = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
-
-  Shape shape_s64 = ShapeUtil::MakeShape(S64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1_s64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_s64)));
-  auto c2_s64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_s64)));
-  auto instruction = b.AddInstruction(HloInstruction::CreateBinary(
-      shape_s64, HloOpcode::kDivide, c1_s64, c2_s64));
-  module().AddEntryComputation(b.Build());
-
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
+  auto lhs = Literal::CreateR2<int64>({{1, 0}, {-100, 4}});
+  auto rhs = Literal::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = Literal::CreateR2<int64>({{0, 0}, {-25, 1}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
+               std::move(rhs));
 }
 TEST_F(HloEvaluatorTest, DoesDivideDouble) {
-  auto lhs_f64 = Literal::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
-  auto rhs_f64 = Literal::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
-
-  Shape shape_f64 = ShapeUtil::MakeShape(F64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1_f64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_f64)));
-  auto c2_f64 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_f64)));
-  auto instruction = b.AddInstruction(HloInstruction::CreateBinary(
-      shape_f64, HloOpcode::kDivide, c1_f64, c2_f64));
-  module().AddEntryComputation(b.Build());
-
-  auto result = evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
+  auto lhs = Literal::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
+  auto rhs = Literal::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
   auto expected =
       Literal::CreateR2<double>({{0.45454545454545453, 0}, {-25, 1}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
+               std::move(rhs));
 }
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
 TEST_F(HloEvaluatorTest, DoesAbsR2) {
   auto operand = Literal::CreateR2<int64>({{1, -20}, {-100, 4}});
-  const Shape& shape = ShapeUtil::MakeShape(S64, {2, 2});
-  HloComputation::Builder b(TestName());
-  auto c1 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(operand)));
-  auto instruction =
-      b.AddInstruction(HloInstruction::CreateUnary(shape, HloOpcode::kAbs, c1));
-  module().AddEntryComputation(b.Build());
-
-  std::unique_ptr<Literal> result =
-      evaluator_->Evaluate(instruction, {}).ConsumeValueOrDie();
-
   auto expected = Literal::CreateR2<int64>({{1, 20}, {100, 4}});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
 TEST_F(HloEvaluatorTest, DoesAbsR0) {
-  // For R0 literal.
-  const Shape& r0 = ShapeUtil::MakeShape(F32, {});
   auto operand = Literal::CreateR0<float>(-1.0f);
-  HloComputation::Builder b(TestName());
-  auto c1 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(operand)));
-  auto instruction =
-      b.AddInstruction(HloInstruction::CreateUnary(r0, HloOpcode::kAbs, c1));
-  module().AddEntryComputation(b.Build());
-
-  auto result = evaluator_->Evaluate(instruction).ConsumeValueOrDie();
   auto expected = Literal::CreateR0<float>(1.0f);
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
 TEST_F(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
-  // For R1 literal with dimension of size 0.
-  Shape empty_r1 = ShapeUtil::MakeShape(F32, {0});
   auto operand = Literal::CreateR1<float>({});
-  HloComputation::Builder b(TestName());
-  auto c1 =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(operand)));
-  auto instruction = b.AddInstruction(
-      HloInstruction::CreateUnary(empty_r1, HloOpcode::kAbs, c1));
-  module().AddEntryComputation(b.Build());
-
-  auto result = evaluator_->Evaluate(instruction).ConsumeValueOrDie();
   auto expected = Literal::CreateR1<float>({});
-
-  LiteralTestUtil::ExpectEqual(*expected, *result);
+  TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
+}
+TEST_F(HloEvaluatorTest, DoesNegateR2) {
+  auto operand = Literal::CreateR2<int32>(
+      {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
+  auto expected =
+      Literal::CreateR2<int32>({{0, std::numeric_limits<int>::min()}, {1, -4}});
+  TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand));
+}
+TEST_F(HloEvaluatorTest, DoesCosR2) {
+  auto operand = Literal::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
+  auto expected = Literal::CreateR2<float>({{1, -1}, {-1, 1}});
+  TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand));
+}
+TEST_F(HloEvaluatorTest, DoesSinR2) {
+  auto operand = Literal::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
+  auto expected = Literal::CreateR2<float>({{0, 0}, {0, 0}});
+  TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand),
+              0x1.0P-20);
+}
+TEST_F(HloEvaluatorTest, DoesNotR2) {
+  auto operand =
+      Literal::CreateR2<int32>({{0, std::numeric_limits<int>::min()},
+                                {-1, std::numeric_limits<int>::max()}});
+  auto expected =
+      Literal::CreateR2<int32>({{-1, std::numeric_limits<int>::max()},
+                                {0, std::numeric_limits<int>::min()}});
+  TestUnaryOp(HloOpcode::kNot, std::move(expected), std::move(operand));
 }
-
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
 TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
@@ -581,8 +621,11 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank1) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
   auto computation = module().AddEntryComputation(b.Build());
 
   std::unique_ptr<Literal> result =
@@ -624,8 +667,11 @@ TEST_F(HloEvaluatorTest, DotRank1AndRank2) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
   auto computation = module().AddEntryComputation(b.Build());
 
   std::unique_ptr<Literal> result =
@@ -665,8 +711,11 @@ TEST_F(HloEvaluatorTest, DotRank2AndRank2) {
       b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {4, 2});
-  b.AddInstruction(HloInstruction::CreateBinary(
-      shape, HloOpcode::kDot, lhs_instruction, rhs_instruction));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums));
   auto computation = module().AddEntryComputation(b.Build());
 
   std::unique_ptr<Literal> result =
@@ -711,7 +760,8 @@ TEST_F(HloEvaluatorTest, SimpleConv1D) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
 
   dnums.set_kernel_output_feature_dimension(0);
   dnums.set_kernel_input_feature_dimension(1);
@@ -794,6 +844,85 @@ TEST_F(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+TEST_F(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
+  HloComputation::Builder b(TestName());
+
+  // clang-format off
+  // Input dimensions: [feature=2, height=3, batch=1, width=4]
+  Array4D<float> input({
+    {{{1, 2, 3, 4}},
+     {{5, 6, 7, 8}},
+     {{9, 10, 11, 12}}},
+    {{{13, 14, 15, 16}},
+     {{17, 18, 19, 20}},
+     {{21, 22, 23, 24}}}
+  });
+  // Weight dimensions:
+  // [kernel_output_feature=1, width=3, kernel_input_feature=2, height=3]
+  Array4D<float> weight({{
+    {{1, 7, 13},
+     {4, 10, 16}},
+    {{2, 8, 14},
+     {5, 11, 17}},
+    {{3, 9, 15},
+     {6, 12, 18}}
+  }});
+  // clang-format on
+
+  auto lhs_literal = Literal::CreateR4FromArray4D<float>(input);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  auto rhs_literal = Literal::CreateR4FromArray4D<float>(weight);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+  rhs_instruction = b.AddInstruction(HloInstruction::CreateReverse(
+      rhs_instruction->shape(), rhs_instruction, {3, 1}));
+
+  Window window;
+  WindowDimension dim;
+  dim.set_size(3);
+  dim.set_stride(1);
+  dim.set_padding_low(0);
+  dim.set_padding_high(0);
+  dim.set_window_dilation(1);
+  dim.set_base_dilation(1);
+  dim.set_window_reversal(true);
+  *window.add_dimensions() = dim;
+  *window.add_dimensions() = dim;
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_batch_dimension(2);
+  dnums.set_output_batch_dimension(2);
+  dnums.set_input_feature_dimension(0);
+  dnums.set_output_feature_dimension(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
+
+  dnums.set_kernel_output_feature_dimension(0);
+  dnums.set_kernel_input_feature_dimension(2);
+  dnums.add_kernel_spatial_dimensions(3);
+  dnums.add_kernel_spatial_dimensions(1);
+
+  const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  b.AddInstruction(HloInstruction::CreateConvolve(
+      shape, lhs_instruction, rhs_instruction, window, dnums));
+  auto computation = module().AddEntryComputation(b.Build());
+
+  std::unique_ptr<Literal> result =
+      evaluator_->Evaluate(*computation, {}).ConsumeValueOrDie();
+
+  // clang-format off
+  // Result dimensions: [feature=1, height=1, batch=1, width=2]
+  Array4D<float> expected_array({{{{2514, 2685}}}});
+  // clang-format on
+  auto expected = Literal::CreateR4FromArray4D<float>(expected_array);
+
+  LiteralTestUtil::ExpectEqual(*expected, *result);
+}
+
 TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   HloComputation::Builder b(TestName());
 
@@ -843,8 +972,10 @@ TEST_F(HloEvaluatorTest, Conv2DGeneralDimensions) {
   dnums.set_output_batch_dimension(2);
   dnums.set_input_feature_dimension(0);
   dnums.set_output_feature_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   dnums.set_kernel_output_feature_dimension(0);
   dnums.set_kernel_input_feature_dimension(2);
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index eaeb352183bdf6cc7f4a164c31af4f641e37440e..0809fe780d21baf366b63bdab118653630c33872 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -26,45 +26,110 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
-
-void HloExecutionProfile::AddProfileResult(const HloInstruction* hlo,
-                                           uint64 cycles_taken) {
-  hlo_to_cycles_taken_[hlo] = cycles_taken;
-  profiled_computations_.insert(hlo->parent());
+HloProfileIndexMap::HloProfileIndexMap(const HloModule& module) {
+  size_t current_profile_index = 0;
+  for (xla::HloComputation* computation : module.MakeComputationPostOrder()) {
+    InsertOrDie(&computation_to_profile_idx_, computation,
+                current_profile_index++);
+    for (const HloInstruction* instruction : computation->instructions()) {
+      // For simplicity we track all instrutions here, but we could skip
+      // non-executing instructions like constants and parameters.
+      InsertOrDie(&instruction_to_profile_idx_, instruction,
+                  current_profile_index++);
+    }
+  }
 }
 
-uint64 HloExecutionProfile::GetProfileResult(const HloInstruction& hlo) const {
-  auto iter = hlo_to_cycles_taken_.find(&hlo);
-  if (iter == hlo_to_cycles_taken_.end()) {
-    return 0;
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+    const HloProfileIndexMap& hlo_profile_index_map,
+    const HloCostAnalysis& cost_analysis) {
+  using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
+  using HloInstructionInfo = HloProfilePrinter::HloInstructionInfo;
+
+  HloComputationInfo* computation_infos =
+      new HloComputationInfo[hlo_profile_index_map.computation_count()];
+
+  // There are two "indices" in play here.  The first one is the index of the
+  // HloComputationInfo or HloInstructionInfo in the array that contains said
+  // HloComputationInfo or HloInstructionInfo.  The second index is the index of
+  // the HloComputationInfo or HloInstructionInfo in the profile counters array,
+  // as decided by hlo_profile_index_map.  The latter index is always referred
+  // to as "profile_index".
+
+  size_t computation_index_in_static_data = 0;
+  size_t max_profile_index = hlo_profile_index_map.total_count();
+  for (const auto& pair : hlo_profile_index_map.computation_to_profile_idx()) {
+    CHECK_LT(pair.second, max_profile_index);
+    const HloComputation* computation = pair.first;
+    size_t current_computation_index = computation_index_in_static_data++;
+    HloComputationInfo* computation_info =
+        &computation_infos[current_computation_index];
+
+    computation_info->name = strdup(computation->name().c_str());
+    computation_info->profile_index = pair.second;
+    computation_info->instructions =
+        new HloInstructionInfo[computation->instruction_count()];
+    computation_info->instructions_size = computation->instruction_count();
+
+    size_t instruction_index_in_static_data = 0;
+    for (const HloInstruction* hlo : computation->instructions()) {
+      HloProfilePrinter::HloInstructionInfo* instruction_info =
+          &computation_info->instructions[instruction_index_in_static_data++];
+      instruction_info->long_name = strdup(hlo->ToString().c_str());
+      instruction_info->short_name =
+          strdup(hlo->ToString(/*compact_operands=*/true).c_str());
+      instruction_info->category = strdup(hlo->ToCategory().c_str());
+      instruction_info->flop_count = cost_analysis.flop_count(*hlo);
+      instruction_info->transcendental_count =
+          cost_analysis.transcendental_count(*hlo);
+      instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo);
+      instruction_info->optimal_seconds = cost_analysis.optimal_seconds(*hlo);
+      instruction_info->profile_index =
+          hlo_profile_index_map.GetProfileIndexFor(*hlo);
+      CHECK_LT(instruction_info->profile_index, max_profile_index);
+    }
   }
-  return iter->second;
+
+  auto deleter = [](HloProfilePrinter::HloComputationInfo* computation_infos,
+                    int64 computation_infos_size) {
+    for (int64 i = 0; i < computation_infos_size; i++) {
+      HloInstructionInfo* instruction_infos = computation_infos[i].instructions;
+      for (int64 j = 0; j < computation_infos[i].instructions_size; j++) {
+        // We can't make instruction_infos[j].long_name etc. non-const pointers
+        // since they may point into static storage, so we have a const_cast
+        // here.
+        free(const_cast<char*>(instruction_infos[j].long_name));
+        free(const_cast<char*>(instruction_infos[j].short_name));
+        free(const_cast<char*>(instruction_infos[j].category));
+      }
+      delete[] instruction_infos;
+      free(const_cast<char*>(computation_infos[i].name));
+    }
+    delete[] computation_infos;
+  };
+
+  return MakeUnique<HloProfilePrinter>(
+      computation_infos, hlo_profile_index_map.computation_count(),
+      /*profile_counters_size=*/max_profile_index, deleter);
 }
 
-string HloExecutionProfile::ToString(
-    const HloComputation& computation,
-    const DeviceDescription& device_description,
-    HloCostAnalysis* cost_analysis) const {
-  tensorflow::Status analysis_status = computation.Accept(cost_analysis);
-  if (!analysis_status.ok()) {
-    return "";
-  }
+HloExecutionProfile::HloExecutionProfile(
+    const HloProfilePrinter* hlo_profile_printer,
+    const HloProfileIndexMap* hlo_profile_index_map)
+    : hlo_profile_printer_(*hlo_profile_printer),
+      hlo_profile_index_map_(*hlo_profile_index_map),
+      profile_counters_(
+          /*count*/ hlo_profile_index_map_.total_count(),
+          /*value*/ 0) {}
 
-  HumanReadableProfileBuilder builder(computation.name(),
-                                      total_cycles_executed(computation),
-                                      device_description.clock_rate_ghz());
-  for (const auto& item : hlo_to_cycles_taken_) {
-    const HloInstruction* hlo = item.first;
-    int64 cycles = item.second;
-
-    builder.AddOp(/*op_name=*/hlo->ToString(),
-                  /*short_name=*/hlo->ToString(/*compact_operands=*/true),
-                  hlo->ToCategory(), cycles, cost_analysis->flop_count(*hlo),
-                  cost_analysis->transcendental_count(*hlo),
-                  cost_analysis->bytes_accessed(*hlo),
-                  cost_analysis->seconds(*hlo));
-  }
-  return builder.ToString();
+void HloExecutionProfile::SetCyclesTakenBy(const HloInstruction* hlo,
+                                           uint64 cycles_taken) {
+  profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(*hlo)] =
+      cycles_taken;
+}
+
+uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const {
+  return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(hlo)];
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index a980c1617f395fc6668b8f8739e04d18fd1b689e..470fd4ce3c205d84152238f4b18daad77e403f68 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -18,7 +18,9 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
@@ -27,6 +29,59 @@ namespace xla {
 
 class HloInstruction;
 
+// Maps all HloInstructions and HloComputations in an HloModule to integers.
+// These integers form the contiguous range [0, total_count()).
+class HloProfileIndexMap {
+ public:
+  // Scans `module` to populate this instance of HloProfileIndexMap.
+  explicit HloProfileIndexMap(const HloModule& module);
+
+  HloProfileIndexMap(const HloProfileIndexMap&) = default;
+  HloProfileIndexMap(HloProfileIndexMap&&) = default;
+
+  HloProfileIndexMap& operator=(const HloProfileIndexMap&) = default;
+  HloProfileIndexMap& operator=(HloProfileIndexMap&&) = default;
+
+  size_t GetProfileIndexFor(const HloInstruction& instruction) const {
+    return FindOrDie(instruction_to_profile_idx(), &instruction);
+  }
+
+  size_t GetProfileIndexFor(const HloComputation& computation) const {
+    return FindOrDie(computation_to_profile_idx(), &computation);
+  }
+
+  size_t instruction_count() const {
+    return instruction_to_profile_idx().size();
+  }
+
+  size_t computation_count() const {
+    return computation_to_profile_idx().size();
+  }
+
+  size_t total_count() const {
+    return instruction_count() + computation_count();
+  }
+
+  const std::unordered_map<const HloInstruction*, int64>&
+  instruction_to_profile_idx() const {
+    return instruction_to_profile_idx_;
+  }
+
+  const std::unordered_map<const HloComputation*, int64>&
+  computation_to_profile_idx() const {
+    return computation_to_profile_idx_;
+  }
+
+ private:
+  std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_;
+  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
+};
+
+// Create an instance of `HloProfilePrinter` that owns its memory.
+std::unique_ptr<HloProfilePrinter> CreateHloProfilePrinter(
+    const HloProfileIndexMap& hlo_profile_index_map,
+    const HloCostAnalysis& cost_analysis);
+
 // Describes how much time each HLO operation took.
 //
 // Each HloComputation takes a certain number of cycles.  This class helps break
@@ -35,26 +90,27 @@ class HloExecutionProfile {
  public:
   using DeviceDescription = perftools::gputools::DeviceDescription;
 
+  HloExecutionProfile(const HloProfilePrinter* hlo_profile_printer,
+                      const HloProfileIndexMap* hlo_profile_index_map);
+
   // Record how many cycles this HLO took to execute.
-  void AddProfileResult(const HloInstruction* hlo, uint64 cycles_taken);
+  void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
 
   // Returns how many cycles this HLO took to execute.  Profiling information
   // may not be available for some instructions in which case zero is returned.
-  uint64 GetProfileResult(const HloInstruction& hlo) const;
+  uint64 GetCyclesTakenBy(const HloInstruction& hlo) const;
 
   // Return the number of cycles this computation took to execute.
   uint64 total_cycles_executed(const HloComputation& computation) const {
-    auto it = total_cycles_executed_.find(&computation);
-    if (it != total_cycles_executed_.end()) {
-      return it->second;
-    }
-    return 0;
+    return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(
+        computation)];
   }
 
   // Record how many cycles a computation took to execute.
   void set_total_cycles_executed(const HloComputation& computation,
                                  uint64 total_cycles_executed) {
-    total_cycles_executed_[&computation] = total_cycles_executed;
+    profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(computation)] =
+        total_cycles_executed;
   }
 
   // Returns a version of the execution profile suitable for performance
@@ -63,25 +119,20 @@ class HloExecutionProfile {
   // for the operations in a given computation. Returns an empty string if it
   // wasn't possible to generate a printable version. cost_analysis should be a
   // clean analysis that can be used to visit the computation.
-  string ToString(const HloComputation& computation,
-                  const DeviceDescription& device_description,
-                  HloCostAnalysis* cost_analysis) const;
-
-  // Returns the computations we have profiled.
-  std::unordered_set<const HloComputation*> profiled_computations() const {
-    return profiled_computations_;
+  string ToString(const DeviceDescription& device_description) const {
+    return hlo_profile_printer_.ToString(profile_counters_.data(),
+                                         device_description.clock_rate_ghz());
   }
 
- private:
-  // Contains a mapping from HLO to the number of cycles it took to execute it.
-  std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_;
+  std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
 
-  // If non-empty, contains the total number of cycles a computation took to
-  // execute.
-  std::unordered_map<const HloComputation*, uint64> total_cycles_executed_;
+ private:
+  const HloProfilePrinter& hlo_profile_printer_;
+  const HloProfileIndexMap& hlo_profile_index_map_;
 
-  // The computations we have profiled.
-  std::unordered_set<const HloComputation*> profiled_computations_;
+  // Stores per-Hlo profile counters.  This is the only thing that changes when
+  // we execute an XLA computation.
+  std::vector<int64> profile_counters_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1e6729e2bccad4bdbe075a635d8a9b1ede6fecb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
+#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+class HloExecutionProfileTest : public HloTestBase {
+ protected:
+  static constexpr int64 kInstructionCyclesIndex = 0;
+  static constexpr int64 kInstructionNameIndex = 19;
+};
+
+// Splits `lines` into a sequence of lines delimited by newlines and then split
+// each of those lines into a sequence of words delimited by spaces.  Filter out
+// empty words.
+std::vector<std::vector<string>> SplitIntoLinesAndWords(
+    tensorflow::StringPiece lines) {
+  std::vector<std::vector<string>> result;
+  for (const string& line : tensorflow::str_util::Split(lines, '\n')) {
+    std::vector<string> words;
+    for (const string& word : tensorflow::str_util::Split(line, ' ')) {
+      if (!word.empty()) {
+        words.push_back(word);
+      }
+    }
+    result.push_back(std::move(words));
+  }
+
+  return result;
+}
+
+TEST_F(HloExecutionProfileTest, Basic) {
+  std::unique_ptr<HloModule> hlo_module = CreateNewModule();
+
+  HloComputation::Builder builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {30, 30});
+  HloInstruction* param_lhs =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
+  HloInstruction* param_rhs =
+      builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs"));
+  HloInstruction* add_instruction =
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  HloInstruction* dot_instruction =
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          shape, HloOpcode::kDot, param_lhs, add_instruction));
+
+  hlo_module->AddEntryComputation(builder.Build());
+
+  auto shape_size_function = [&](const Shape& shape) {
+    const int64 pointer_size = 8;
+    if (ShapeUtil::IsOpaque(shape)) {
+      return pointer_size;
+    }
+    return ShapeUtil::ByteSizeOf(shape, pointer_size);
+  };
+
+  HloCostAnalysis cost_analysis(shape_size_function);
+  HloProfileIndexMap profile_index_map(*hlo_module);
+  std::unique_ptr<HloProfilePrinter> profile_printer =
+      CreateHloProfilePrinter(profile_index_map, cost_analysis);
+  HloExecutionProfile execution_profile(profile_printer.get(),
+                                        &profile_index_map);
+
+  const int64 add_cycles = 1000;
+  const int64 dot_cycles = 4000;
+
+  execution_profile.SetCyclesTakenBy(add_instruction, add_cycles);
+  execution_profile.SetCyclesTakenBy(dot_instruction, dot_cycles);
+
+  string rendered_profile = execution_profile.ToString(
+      backend().default_stream_executor()->GetDeviceDescription());
+  std::vector<std::vector<string>> lines_and_words =
+      SplitIntoLinesAndWords(rendered_profile);
+  ASSERT_EQ(lines_and_words.size(), 8);
+
+  const std::vector<string>& line_2 = lines_and_words[2];
+  const std::vector<string>& line_3 = lines_and_words[3];
+
+  EXPECT_EQ(line_2[kInstructionCyclesIndex], std::to_string(dot_cycles));
+  EXPECT_EQ(line_2[kInstructionNameIndex], '%' + dot_instruction->name());
+
+  EXPECT_EQ(line_3[kInstructionCyclesIndex], std::to_string(add_cycles));
+  EXPECT_EQ(line_3[kInstructionNameIndex], '%' + add_instruction->name());
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index d7bdd4117d947add448ff660abc621d9ae3118b6..84187d578346eafd5e32727a15f5eab9cc79feef 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -312,11 +312,11 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
 class HloDotDumper {
  public:
   HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label,
-               bool show_addresses, bool show_metadata,
+               const DebugOptions& debug_options, bool show_metadata,
                const HloExecutionProfile* profile, NodeFilter filter)
       : computation_(computation),
         label_(label.ToString()),
-        show_addresses_(show_addresses),
+        debug_options_(debug_options),
         show_metadata_(show_metadata),
         profile_(profile),
         filter_(std::move(filter)) {}
@@ -382,7 +382,7 @@ class HloDotDumper {
 
   const HloComputation* computation_;  // never null
   const string label_;                 // overall name for the graph
-  const bool show_addresses_;
+  const DebugOptions& debug_options_;
   const bool show_metadata_;
   const HloExecutionProfile* profile_;  // may be null
   const NodeFilter filter_;
@@ -414,6 +414,11 @@ class HloDotDumper {
   // appears before both the inner computation and the destination node are
   // defined.
   std::vector<string> edges_;
+
+  // When coloring by sharding information, we track the sharding string
+  // representation to color association, by round-robin the color schemes.
+  std::unordered_map<string, ColorScheme> sharding_colors_;
+  int64 next_shard_color_ = 0;
 };
 
 string HloDotDumper::Dump() {
@@ -734,15 +739,16 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
   string trivial_subcomputation = GetInstructionTrivialComputationStr(instr);
   AddInstructionIncomingEdges(instr);
 
-  // Override the node's styling if it should be (de-)emphasized.
-  if (filter_.Deemphasized(instr)) {
-    color = kDashedBorder;
-  }
-  if (filter_.Highlight(instr)) {
-    node_shape = "diamond";
-    color = kDarkRed;
+  if (!debug_options_.xla_hlo_graph_sharding_color()) {
+    // Override the node's styling if it should be (de-)emphasized.
+    if (filter_.Deemphasized(instr)) {
+      color = kDashedBorder;
+    }
+    if (filter_.Highlight(instr)) {
+      node_shape = "diamond";
+      color = kDarkRed;
+    }
   }
-
   // Build the text that will be displayed inside the node.
   string node_body = node_label;
   for (const string& s :
@@ -761,12 +767,22 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
 string HloDotDumper::GetInstructionNodeInlinedOperands(
     const HloInstruction* instr) {
   auto stringify_constant = [](const HloInstruction* constant) {
-    if (ShapeUtil::IsEffectiveScalar(constant->shape())) {
-      auto elem_idx = IndexUtil::LinearIndexToMultidimensionalIndex(
-          constant->shape(), /*linear_index=*/0);
-      return Printf("%s (%s)", constant->literal().GetAsString(elem_idx),
+    const auto& shape = constant->shape();
+
+    // Print the literal value of constants with <= K elements.
+    optional<int64> elem_count;
+    if (!ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape)) {
+      elem_count = 1;
+      for (int64 dim : shape.dimensions()) {
+        *elem_count *= dim;
+      }
+    }
+    if (elem_count.has_value() && *elem_count <= 8) {
+      return Printf("%s (%s)", constant->literal().ToString(),
                     ShapeUtil::HumanString(constant->shape()));
     }
+
+    // Otherwise, print e.g. "%constant.42 (s32[100])".
     string constant_name;
     if (tensorflow::StringPiece(constant->name()).starts_with("%constant")) {
       constant_name = constant->name();
@@ -817,6 +833,20 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 }
 
 ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
+  if (debug_options_.xla_hlo_graph_sharding_color()) {
+    if (!instr->has_sharding()) {
+      return kDashedBorder;
+    }
+    string shard_str = instr->sharding().ToString();
+    auto it = sharding_colors_.find(shard_str);
+    if (it != sharding_colors_.end()) {
+      return it->second;
+    }
+    ColorScheme color = static_cast<ColorScheme>(
+        kBlue + (next_shard_color_++ % (kDashedBorder - kBlue)));
+    sharding_colors_.emplace(shard_str, color);
+    return color;
+  }
   const auto kParameterColor = kOrange;
 
   // Special case: If this instruction has a parameter merged into it, paint it
@@ -834,9 +864,10 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
   // (eg, parameter).
   switch (instr->opcode()) {
     case HloOpcode::kAbs:
-    case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kAdd:
+    case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -852,18 +883,19 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLog:
-    case HloOpcode::kAnd:
-    case HloOpcode::kNot:
-    case HloOpcode::kOr:
     case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
     case HloOpcode::kNegate:
+    case HloOpcode::kNot:
+    case HloOpcode::kOr:
     case HloOpcode::kPower:
     case HloOpcode::kReal:
     case HloOpcode::kRemainder:
+    case HloOpcode::kRng:
+    case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
@@ -873,7 +905,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kSort:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
-    case HloOpcode::kRng:
       // De-emphasize scalar-shaped elementwise ops -- they're generally
       // uninteresting.
       if (ShapeUtil::IsEffectiveScalar(instr->shape())) {
@@ -881,9 +912,9 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
       }
       return kYellow;
     case HloOpcode::kBitcast:
-    case HloOpcode::kTuple:
-    case HloOpcode::kTrace:
     case HloOpcode::kGetTupleElement:
+    case HloOpcode::kTrace:
+    case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
@@ -922,25 +953,28 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
       return kRed;
     case HloOpcode::kParameter:
       return kParameterColor;
-    case HloOpcode::kBatchNormTraining:
-    case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kBatchNormInference:
+    case HloOpcode::kBatchNormTraining:
     case HloOpcode::kReduce:
-    case HloOpcode::kSelectAndScatter:
     case HloOpcode::kReduceWindow:
+    case HloOpcode::kSelectAndScatter:
       return kPurple;
-    case HloOpcode::kMap:
     case HloOpcode::kFusion:
+    case HloOpcode::kMap:
       return kGray;
-    case HloOpcode::kSend:
-    case HloOpcode::kRecv:
+    case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
       return kBrown;
+    case HloOpcode::kCall:
+    case HloOpcode::kConditional:
     case HloOpcode::kCustomCall:
     case HloOpcode::kWhile:
-    case HloOpcode::kCall:
       return kDarkGreen;
     case HloOpcode::kConstant:
       LOG(FATAL) << "Constants don't get their own nodes in the graph.";
@@ -969,10 +1003,13 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
           .starts_with(StrCat("%", HloOpcodeString(instr->opcode())))) {
     return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
   }
-
+  string extended_opcode =
+      StrCat(HloOpcodeString(instr->opcode()),
+             instr->opcode() != HloOpcode::kFusion
+                 ? ""
+                 : StrCat(":", xla::ToString(instr->fusion_kind())));
   // If the name does not contain the opcode, render both.
-  return Printf("<b>%s</b><br/>%s",
-                HtmlLikeStringSanitize(instr->ExtendedOpcodeStr()),
+  return Printf("<b>%s</b><br/>%s", HtmlLikeStringSanitize(extended_opcode),
                 HtmlLikeStringSanitize(instr->name()));
 }
 
@@ -1027,7 +1064,9 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
                    ? ""
                    : StrCat("stride=", VectorString(instr->slice_strides()));
       case HloOpcode::kSend:
+      case HloOpcode::kSendDone:
       case HloOpcode::kRecv:
+      case HloOpcode::kRecvDone:
         return StrCat("channel_id=", instr->channel_id());
       default:
         return "";
@@ -1065,12 +1104,11 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
     }
     lines.push_back(instr_shape);
   }
-
-  if (show_addresses_) {
+  if (debug_options_.xla_hlo_graph_addresses()) {
     lines.push_back(Printf("[%p]", instr));
   }
   if (profile_ != nullptr) {
-    double hlo_cycles_executed = profile_->GetProfileResult(*instr);
+    double hlo_cycles_executed = profile_->GetCyclesTakenBy(*instr);
     double total_cycles_executed =
         profile_->total_cycles_executed(*instr->parent());
     if (hlo_cycles_executed > 0 && total_cycles_executed > 0) {
@@ -1163,70 +1201,36 @@ const HloInstruction* HloDotDumper::GetNodeForEdge(
   return instr;
 }
 
-tensorflow::mutex& RendererMutex() {
-  static tensorflow::mutex* mu = new tensorflow::mutex;
-  return *mu;
-}
+class GraphRendererRegistry {
+ public:
+  void AddRenderer(GraphRendererInterface* graph_renderer) {
+    tensorflow::mutex_lock lock(mu_);
+    graph_renderer_ = graph_renderer;
+  }
 
-std::map<int, GraphRendererInterface*>* GraphRenderers() {
-  static auto* graph_renderers = new std::map<int, GraphRendererInterface*>();
-  return graph_renderers;
-}
+  GraphRendererInterface* GetDefaultRenderer() {
+    tensorflow::mutex_lock lock(mu_);
+    return graph_renderer_;
+  }
 
-GraphRendererInterface* GetGraphRenderer() {
-  tensorflow::mutex_lock lock(RendererMutex());
-  auto* graph_renderers = GraphRenderers();
-  auto it = graph_renderers->rbegin();
-  CHECK(it != graph_renderers->rend()) << "No registered graph dumpers";
-  return it->second;
-}
+  static GraphRendererRegistry* Default() {
+    static GraphRendererRegistry* registry = new GraphRendererRegistry();
+    return registry;
+  }
+
+ private:
+  tensorflow::mutex mu_;
+  GraphRendererInterface* graph_renderer_ = nullptr;
+};
 
 }  // namespace
 
-Registrar::Registrar(GraphRendererInterface* dumper, int priority) {
-  tensorflow::mutex_lock lock(RendererMutex());
-  auto* graph_renderers = GraphRenderers();
-  graph_renderers->emplace(priority, dumper);
+Registrar::Registrar(GraphRendererInterface* dumper) {
+  GraphRendererRegistry::Default()->AddRenderer(dumper);
 }
 
 namespace {
 
-class FileGraphRenderer : public GraphRendererInterface {
- public:
-  string RenderGraph(const string& graph, GraphKind graph_kind,
-                     const DebugOptions& debug_options) override {
-    static std::atomic<int> output_num(0);
-    string file_extension;
-    switch (graph_kind) {
-      case DOT_GRAPH:
-        file_extension = ".dot";
-        break;
-      case TF_GRAPHDEF:
-        file_extension = ".pbtxt";
-        break;
-    }
-    string path =
-        JoinPath(debug_options.xla_hlo_graph_path(),
-                 StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
-    auto status = Status::OK();
-    int fd = mkstemps(&path[0], file_extension.length());
-    if (fd < 0) {
-      status =
-          Status(tensorflow::error::Code::UNKNOWN,
-                 StrCat("Failed to create temporary file to dump HLO graph: ",
-                        strerror(errno)));
-    } else {
-      status = tensorflow::WriteStringToFile(tensorflow::Env::Default(), path,
-                                             graph);
-      close(fd);
-    }
-    if (!status.ok()) {
-      LOG(WARNING) << "Saving HLO graph failed: " << status;
-    }
-    return path;
-  }
-};
-
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
 NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
@@ -1289,7 +1293,9 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
 
   auto is_displayed = [&](const HloInstruction* instr) {
     // Constants are displayed inline with their users; they're never omitted.
-    return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant;
+    // Nodes in subcomputations are always shown.
+    return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant ||
+           instr->parent() != root->parent();
   };
 
   // Make a second pass over 'nodes' to fix up the NodeFilterResults now that we
@@ -1334,7 +1340,54 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
   });
 }
 
-XLA_REGISTER_GRAPH_RENDERER(FileGraphRenderer, 0);
+string SaveGraph(const string& graph,
+                 GraphRendererInterface::GraphKind graph_kind,
+                 const string& dest_path) {
+  static std::atomic<int> output_num(0);
+  string file_extension;
+  switch (graph_kind) {
+    case GraphRendererInterface::DOT_GRAPH:
+      file_extension = ".dot";
+      break;
+    case GraphRendererInterface::TF_GRAPHDEF:
+      file_extension = ".pbtxt";
+      break;
+  }
+  string path = JoinPath(
+      dest_path, StrCat("hlo_graph_", output_num++, ".XXXXXX", file_extension));
+  auto status = Status::OK();
+  int fd = mkstemps(&path[0], file_extension.length());
+  if (fd < 0) {
+    status =
+        Status(tensorflow::error::Code::UNKNOWN,
+               StrCat("Failed to create temporary file to dump HLO graph: ",
+                      strerror(errno)));
+  } else {
+    status =
+        tensorflow::WriteStringToFile(tensorflow::Env::Default(), path, graph);
+    close(fd);
+  }
+  if (!status.ok()) {
+    LOG(WARNING) << "Saving HLO graph failed: " << status;
+  }
+  return path;
+}
+
+string ExportGraph(const string& graph,
+                   GraphRendererInterface::GraphKind graph_kind,
+                   const DebugOptions& debug_options) {
+  string path = debug_options.xla_hlo_graph_path();
+  if (!path.empty()) {
+    return SaveGraph(graph, graph_kind, path);
+  } else {
+    auto graph_renderer =
+        GraphRendererRegistry::Default()->GetDefaultRenderer();
+    CHECK(graph_renderer != nullptr)
+        << "No registered renderer for the HLO graph. "
+           "Use --xla_hlo_graph_path=PATH to export to local file system";
+    return graph_renderer->RenderGraph(graph, graph_kind, debug_options);
+  }
+}
 
 }  // namespace
 
@@ -1342,27 +1395,22 @@ string DumpGraph(const HloComputation& computation, const string& label,
                  const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile,
                  bool show_metadata) {
+  GraphRendererInterface::GraphKind graph_kind;
   string graph;
-  string graph_url;
   if (debug_options.xla_hlo_dump_as_graphdef()) {
-    HloTfGraphBuilder builder;
+    HloTfGraphBuilder builder(debug_options);
     TF_CHECK_OK(builder.AddComputation(computation));
     CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(),
                                                           &graph));
-    // TODO(b/37198616): Use the default registered renderers when all
-    // renderers support rendering GraphDefs. Always dump GraphDefs to files
-    // for now.
-    graph_url = FileGraphRenderer().RenderGraph(
-        graph, GraphRendererInterface::TF_GRAPHDEF, debug_options);
+    graph_kind = GraphRendererInterface::TF_GRAPHDEF;
   } else {
-    graph =
-        HloDotDumper(&computation, label,
-                     /*show_addresses=*/debug_options.xla_hlo_graph_addresses(),
-                     show_metadata, hlo_execution_profile, NodeFilter())
-            .Dump();
-    graph_url = GetGraphRenderer()->RenderGraph(
-        graph, GraphRendererInterface::DOT_GRAPH, debug_options);
+    graph = HloDotDumper(&computation, label, debug_options, show_metadata,
+                         hlo_execution_profile, NodeFilter())
+                .Dump();
+    graph_kind = GraphRendererInterface::DOT_GRAPH;
   }
+
+  string graph_url = ExportGraph(graph, graph_kind, debug_options);
   LOG(INFO) << "computation " << computation.name() << " [" << label
             << "]: " << graph_url;
   return graph_url;
@@ -1375,12 +1423,10 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
   NodeFilter filter = MakeNodeFilter(&node, radius);
   string graph =
-      HloDotDumper(node.parent(), label,
-                   /*show_addresses=*/debug_options.xla_hlo_graph_addresses(),
-                   show_metadata, /*profile=*/nullptr, filter)
+      HloDotDumper(node.parent(), label, debug_options, show_metadata,
+                   /*profile=*/nullptr, filter)
           .Dump();
-  return GetGraphRenderer()->RenderGraph(
-      graph, GraphRendererInterface::DOT_GRAPH, debug_options);
+  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
 }
 
 void DumpText(const HloModule& module, const string& label,
@@ -1391,7 +1437,8 @@ void DumpText(const HloModule& module, const string& label,
   string filename =
       do_prefix ? StrCat(prefix, "-", label, ".txt") : StrCat(label, ".txt");
   string path = JoinPath(directory_path, filename);
-  TF_CHECK_OK(WriteStringToFile(env, path, module.ToString()));
+  TF_CHECK_OK(WriteStringToFile(
+      env, path, module.ToString(/*include_large_constants=*/true)));
   LOG(INFO) << "dumping module '" << module.name() << "' to " << path;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index dd304ec76cd903a6175337551fc50808b1797104..2704aae1e3ba7fb131bfcb1287d807d785fd9774 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -84,11 +84,10 @@ void DumpText(const HloModule& module, const string& label,
 
 // Internal implementation details below this point.
 
-// Class that registers a graph renderer. Higher-priority renders are chosen
-// first.
+// Class that registers a graph renderer.
 class Registrar {
  public:
-  Registrar(GraphRendererInterface* dumper, int priority);
+  Registrar(GraphRendererInterface* dumper);
 };
 
 #define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...)   \
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index 7b0f937f383a416f805a799bd6787afe15b324b0..8e1531c87f9c6e133e2d6763b046b1d5dcbcd09f 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -45,7 +45,7 @@ class DotRenderer : public hlo_graph_dumper::GraphRendererInterface {
   string last_graph_;
 };
 
-XLA_REGISTER_GRAPH_RENDERER(DotRenderer, std::numeric_limits<int>::max());
+XLA_REGISTER_GRAPH_RENDERER(DotRenderer);
 
 TEST(HloGraphDumperTest, NestedFusion) {
   HloComputation::Builder b("b");
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index e6a4f68fb38001a65ea4d9d0b2b1ddaca4d85106..784930195796220646e80cc1cd7a1b342083acfc 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -26,7 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
@@ -43,6 +43,7 @@ limitations under the License.
 
 namespace xla {
 
+using tensorflow::str_util::CEscape;
 using ::tensorflow::str_util::Join;
 using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
@@ -51,7 +52,9 @@ using ::tensorflow::strings::StrCat;
 StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     HloModule* module, const HloInstructionProto& proto,
     const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-    tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map) {
+    const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+    const std::function<void(std::unique_ptr<HloComputation>)>&
+        add_fused_computation) {
   TF_RET_CHECK(!proto.opcode().empty());
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
   TF_RET_CHECK(proto.has_shape());
@@ -77,19 +80,19 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     TF_RET_CHECK(!proto.fusion_kind().empty());
     TF_ASSIGN_OR_RETURN(instruction->fusion_kind_,
                         StringToFusionKind(proto.fusion_kind()));
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloComputation> fused_computation,
-        HloComputation::CreateFromProto(
-            module, proto.fused_instructions_computation(), computation_map,
-            /*fusion_instruction=*/instruction.get()));
-    instruction->called_computations_.push_back(
-        module->AddEmbeddedComputation(std::move(fused_computation)));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> fused_computation,
+                        HloComputation::CreateFromProto(
+                            module, proto.fused_instructions_computation(),
+                            computation_map, add_fused_computation,
+                            /*fusion_instruction=*/instruction.get()));
+    instruction->called_computations_.push_back(fused_computation.get());
+    add_fused_computation(std::move(fused_computation));
   } else {
     for (const string& computation_name : proto.called_computation_names()) {
-      TF_RET_CHECK(ContainsKey(*computation_map, computation_name))
+      TF_RET_CHECK(ContainsKey(computation_map, computation_name))
           << "No computation named " << computation_name;
       instruction->called_computations_.push_back(
-          computation_map->at(computation_name));
+          computation_map.at(computation_name));
     }
   }
 
@@ -115,6 +118,10 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         MakeUnique<ConvolutionDimensionNumbers>(
             proto.convolution_dimension_numbers());
   }
+  if (proto.has_dot_dimension_numbers()) {
+    instruction->dot_dimension_numbers_ =
+        MakeUnique<DotDimensionNumbers>(proto.dot_dimension_numbers());
+  }
   for (const HloInstructionProto::SliceDimensions& slice_dimensions :
        proto.slice_dimensions()) {
     instruction->slice_starts_.push_back(slice_dimensions.start());
@@ -148,7 +155,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       WrapUnique(new HloInstruction(HloOpcode::kParameter, shape));
   instruction->parameter_number_ = parameter_number;
   instruction->parameter_name_ = name;
-  instruction->name_ = "%" + name;
+  instruction->name_ = name;
   return instruction;
 }
 
@@ -329,6 +336,17 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
+    const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDot, shape));
+  instruction->AppendOperand(lhs);
+  instruction->AppendOperand(rhs);
+  instruction->dot_dimension_numbers_ =
+      MakeUnique<DotDimensionNumbers>(dimension_numbers);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateReducePrecision(const Shape& shape,
                                       HloInstruction* operand,
@@ -343,12 +361,9 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateCrossReplicaSum(const Shape& shape,
-                                      HloInstruction* operand) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kCrossReplicaSum, shape));
-  instruction->AppendOperand(operand);
-  return instruction;
+HloInstruction::CreateCrossReplicaSum(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  return CreateNary(shape, HloOpcode::kCrossReplicaSum, operands);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
@@ -371,20 +386,50 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSend(
     HloInstruction* operand, int64 channel_id) {
+  // Send instruction produces a tuple of {aliased operand, U32 context}.
+  Shape output_shape = ShapeUtil::MakeTupleShape(
+      {operand->shape(), ShapeUtil::MakeShape(U32, {})});
   auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kSend, ShapeUtil::MakeNil()));
+      WrapUnique(new HloInstruction(HloOpcode::kSend, output_shape));
   instruction->AppendOperand(operand);
   instruction->channel_id_ = channel_id;
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSendDone(
+    HloInstruction* operand) {
+  CHECK(operand->opcode() == HloOpcode::kSend)
+      << "SendDone must take the context operand from Send";
+  auto instruction = WrapUnique(
+      new HloInstruction(HloOpcode::kSendDone, ShapeUtil::MakeNil()));
+  instruction->AppendOperand(operand);
+  instruction->channel_id_ = operand->channel_id();
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecv(
     const Shape& shape, int64 channel_id) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kRecv, shape));
+  // Recv instruction produces a tuple of {receive buffer, U32 context}.
+  Shape output_shape =
+      ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kRecv, output_shape));
   instruction->channel_id_ = channel_id;
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecvDone(
+    HloInstruction* operand) {
+  CHECK(operand->opcode() == HloOpcode::kRecv)
+      << "RecvDone must take the context operand from Recv";
+  Shape output_shape = ShapeUtil::GetTupleElementShape(operand->shape(), 0);
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kRecvDone, output_shape));
+  instruction->AppendOperand(operand);
+  instruction->channel_id_ = operand->channel_id();
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReverse(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
@@ -405,6 +450,23 @@ HloInstruction::CreateCrossReplicaSum(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConditional(
+    const Shape& shape, HloInstruction* pred,
+    HloInstruction* true_computation_arg, HloComputation* true_computation,
+    HloInstruction* false_computation_arg, HloComputation* false_computation) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kConditional, shape));
+  instruction->AppendOperand(pred);
+  instruction->AppendOperand(true_computation_arg);
+  instruction->AppendOperand(false_computation_arg);
+  // In called_computations_, the index of true_computation must be 0 and that
+  // of false computation must be 1, as defined by kTrueComputationIndex and
+  // kFalseComputationIndex.
+  instruction->called_computations_.push_back(true_computation);
+  instruction->called_computations_.push_back(false_computation);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSlice(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
@@ -468,6 +530,15 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBitcastConvert(const Shape& shape,
+                                     HloInstruction* operand) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBitcastConvert, shape));
+  instruction->AppendOperand(operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReduce(
     const Shape& shape, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
@@ -600,7 +671,10 @@ HloInstruction::CreateSelectAndScatter(
   CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size());
   CHECK(std::equal(operand->shape().dimensions().begin(),
                    operand->shape().dimensions().end(),
-                   Permute(dimensions, shape.dimensions()).begin()));
+                   Permute(dimensions, shape.dimensions()).begin()))
+      << "shape: " << ShapeUtil::HumanString(shape)
+      << ", operand->shape(): " << ShapeUtil::HumanString(shape)
+      << ", dimensions: {" << Join(dimensions, ", ") << "}";
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kTranspose, shape));
   instruction->AppendOperand(operand);
@@ -618,6 +692,20 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
+    const Shape& shape, FusionKind fusion_kind,
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* fusion_computation) {
+  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape));
+  for (auto operand : operands) {
+    instruction->AppendOperand(operand);
+  }
+  instruction->fusion_kind_ = fusion_kind;
+  instruction->called_computations_.push_back(fusion_computation);
+  fusion_computation->SetFusionInstruction(instruction.get());
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateFusionForBackwardConvolution(
     const Shape& shape, FusionKind fusion_kind, const Window& window,
@@ -746,7 +834,7 @@ HloInstruction* HloInstruction::FuseInstructionInternal(
 HloInstruction* HloInstruction::CloneAndFuseInternal(
     HloInstruction* instruction_to_fuse, bool add_output) {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
-  CHECK(instruction_to_fuse->IsFusable());
+  CHECK(instruction_to_fuse->IsFusable()) << instruction_to_fuse->ToString();
   VLOG(3) << "CloneAndFuseInternal:\n" << instruction_to_fuse->ToString();
   HloInstruction* clone = nullptr;
   if (called_computations_.empty()) {
@@ -824,10 +912,8 @@ HloInstruction* HloInstruction::CloneAndFuseInternal(
       // parameter instruction.
       int64 param_no = fused_parameters.size();
       // Name the parameter after the instruction it represents in the outer
-      // (non-fusion) computation. Strip the leading "%" from the operand name
-      // to avoid a double %%.
-      string param_name =
-          StrCat(operand->name().substr(1), ".param_", param_no);
+      // (non-fusion) computation.
+      string param_name = StrCat(operand->name(), ".param_", param_no);
       fused_param = fused_instructions_computation()->AddParameter(
           CreateParameter(param_no, operand->shape(), param_name));
       AppendOperand(operand);
@@ -908,7 +994,10 @@ RandomDistribution HloInstruction::random_distribution() const {
 bool HloInstruction::HasSideEffect() const {
   switch (opcode_) {
     case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+    case HloOpcode::kRng:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
@@ -961,11 +1050,12 @@ bool HloInstruction::HasSideEffect() const {
 
 std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     const Shape& shape,
-    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands) const {
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloModule* module) const {
   VLOG(3) << "CloneWithNewOperands:\n  " << ToString();
   VLOG(3) << "  new operands:";
   for (const HloInstruction* new_operand : new_operands) {
-    VLOG(3) << "    " << new_operand->name();
+    VLOG(3) << "    %" << new_operand->name();
   }
 
   std::unique_ptr<HloInstruction> clone;
@@ -1009,7 +1099,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kLe:
     case HloOpcode::kLt:
     case HloOpcode::kNe:
-    case HloOpcode::kDot:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kPower:
@@ -1047,6 +1136,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateConvert(shape, new_operands[0]);
       break;
+    case HloOpcode::kBitcastConvert:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateBitcastConvert(shape, new_operands[0]);
+      break;
     case HloOpcode::kReducePrecision:
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateReducePrecision(shape, new_operands[0], exponent_bits_,
@@ -1057,9 +1150,13 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateConvolve(shape, new_operands[0], new_operands[1], *window_,
                              *convolution_dimension_numbers_);
       break;
+    case HloOpcode::kDot:
+      CHECK_EQ(new_operands.size(), 2);
+      clone = CreateDot(shape, new_operands[0], new_operands[1],
+                        *dot_dimension_numbers_);
+      break;
     case HloOpcode::kCrossReplicaSum:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateCrossReplicaSum(shape, new_operands[0]);
+      clone = CreateCrossReplicaSum(shape, new_operands);
       break;
     case HloOpcode::kGetTupleElement:
       CHECK_EQ(new_operands.size(), 1);
@@ -1131,7 +1228,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateConstant(literal_->CloneToUnique());
       break;
     case HloOpcode::kFusion:
-      clone = CloneFusionWithNewOperands(shape, new_operands);
+      clone = CloneFusionWithNewOperands(shape, new_operands, module);
       break;
     case HloOpcode::kParameter:
       clone = CreateParameter(parameter_number_, shape, parameter_name_);
@@ -1162,21 +1259,28 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
                                   new_operands[2], new_operands[3],
                                   new_operands[4], epsilon(), feature_index());
       break;
+    case HloOpcode::kConditional:
     case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
     case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
   clone->set_metadata(metadata_);
+  if (has_sharding()) {
+    clone->set_sharding(sharding());
+  }
+  clone->set_parent(parent_);
   return clone;
 }
 
 HloInstruction::~HloInstruction() {}
 
-std::unique_ptr<HloInstruction> HloInstruction::Clone(
-    const string& suffix) const {
+std::unique_ptr<HloInstruction> HloInstruction::Clone(const string& suffix,
+                                                      HloModule* module) const {
   std::unique_ptr<HloInstruction> clone =
-      CloneWithNewOperands(shape_, operands_);
+      CloneWithNewOperands(shape_, operands_, module);
   if (suffix.empty()) {
     clone->name_ = name();
   } else {
@@ -1210,16 +1314,12 @@ std::unique_ptr<HloInstruction> HloInstruction::Clone(
       }
     }
   }
-  clone->set_parent(parent_);
-  if (has_sharding()) {
-    clone->set_sharding(sharding());
-  }
   return clone;
 }
 
 std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
-    const Shape& shape,
-    tensorflow::gtl::ArraySlice<HloInstruction*> operands) const {
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloModule* module) const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(parent() != nullptr);
 
@@ -1230,13 +1330,14 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
     new_instruction->AppendOperand(new_operand);
   }
   // Clone all the fused instructions for the new fusion instruction.
-  std::map<HloInstruction*, HloInstruction*> old_to_new;
+  HloInstructionMap<HloInstruction*> old_to_new;
   std::list<std::unique_ptr<HloInstruction>> new_fused_instructions;
   // Create the list of fused parameters by mapping through the cloned,
   // fused instructions.
   for (HloInstruction* old_fused_parameter :
        fused_instructions_computation()->parameter_instructions()) {
-    new_fused_instructions.push_back(old_fused_parameter->Clone());
+    new_fused_instructions.push_back(
+        old_fused_parameter->Clone("clone", module));
     HloInstruction* new_fusion_parameter = new_fused_instructions.back().get();
     InsertOrDie(&old_to_new, old_fused_parameter, new_fusion_parameter);
   }
@@ -1255,7 +1356,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
     }
     new_fused_instructions.push_back(
         old_fused_instruction->CloneWithNewOperands(
-            old_fused_instruction->shape(), new_operands));
+            old_fused_instruction->shape(), new_operands, module));
     HloInstruction* new_fused_instruction = new_fused_instructions.back().get();
     new_fused_instruction->set_parent(parent_);
     InsertOrDie(&old_to_new, old_fused_instruction, new_fused_instruction);
@@ -1271,12 +1372,13 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneFusionWithNewOperands(
        ++new_fused_instruction_iter) {
     computation_builder.AddInstruction(std::move(*new_fused_instruction_iter));
   }
+  if (module == nullptr) {
+    module = GetModule();
+  }
   auto fused_root_ = fused_expression_root();
   new_instruction->called_computations_.push_back(
-      CHECK_NOTNULL(GetModule())
-          ->AddEmbeddedComputation(
-              computation_builder.Build(FindOrDie(old_to_new, fused_root_))));
-  new_instruction->set_parent(parent_);
+      CHECK_NOTNULL(module)->AddEmbeddedComputation(
+          computation_builder.Build(FindOrDie(old_to_new, fused_root_))));
   return new_instruction;
 }
 
@@ -1350,7 +1452,7 @@ int64 HloInstruction::operand_index(const HloInstruction* target) const {
       return i;
     }
   }
-  LOG(FATAL) << "target was not an operand";
+  LOG(FATAL) << "target was not an operand: " << target->ToString();
 }
 
 Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
@@ -1423,7 +1525,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kCos:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kDivide:
-    case HloOpcode::kDot:
     case HloOpcode::kEq:
     case HloOpcode::kExp:
     case HloOpcode::kFloor:
@@ -1482,6 +1583,7 @@ bool HloInstruction::IdenticalSlowPath(
     // A convert result is determined by the primitive type that the operand is
     // converted into.
     case HloOpcode::kConvert:
+    case HloOpcode::kBitcastConvert:
       return shape().element_type() == other.shape().element_type();
 
     // A reduce-precision operation is determined by the bit sizes.
@@ -1495,6 +1597,10 @@ bool HloInstruction::IdenticalSlowPath(
              protobuf_util::ProtobufEquals(
                  convolution_dimension_numbers(),
                  other.convolution_dimension_numbers());
+    // Check dot dimension numbers.
+    case HloOpcode::kDot:
+      return protobuf_util::ProtobufEquals(dot_dimension_numbers(),
+                                           other.dot_dimension_numbers());
 
     // Reduction results are determined by the reduction dimension and the
     // reduction computation.
@@ -1535,7 +1641,8 @@ bool HloInstruction::IdenticalSlowPath(
                                            other.padding_config());
     case HloOpcode::kSlice:
       return slice_starts_ == other.slice_starts_ &&
-             slice_limits_ == other.slice_limits_;
+             slice_limits_ == other.slice_limits_ &&
+             slice_strides_ == other.slice_strides_;
     case HloOpcode::kDynamicSlice:
       return ShapeUtil::Compatible(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
@@ -1550,11 +1657,14 @@ bool HloInstruction::IdenticalSlowPath(
       return dimensions() == other.dimensions();
 
     // These opcodes are not yet supported.
+    case HloOpcode::kConditional:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kSort:
-    case HloOpcode::kSend:
     case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
       return false;
   }
 }
@@ -1757,6 +1867,32 @@ void HloInstruction::set_scatter(HloComputation* computation) {
   called_computations_[kScatterComputationIndex] = computation;
 }
 
+HloComputation* HloInstruction::true_computation() const {
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  return called_computations_[kTrueComputationIndex];
+}
+
+HloComputation* HloInstruction::false_computation() const {
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  return called_computations_[kFalseComputationIndex];
+}
+
+void HloInstruction::set_true_computation(HloComputation* true_computation) {
+  // Don't allow changing the computation for fused instructions so we don't
+  // have to recompute called_instructions for the entire fusion instruction.
+  CHECK(!IsFused());
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  called_computations_[kTrueComputationIndex] = true_computation;
+}
+
+void HloInstruction::set_false_computation(HloComputation* false_computation) {
+  // Don't allow changing the computation for fused instructions so we don't
+  // have to recompute called_instructions for the entire fusion instruction.
+  CHECK(!IsFused());
+  CHECK_EQ(HloOpcode::kConditional, opcode_);
+  called_computations_[kFalseComputationIndex] = false_computation;
+}
+
 string HloInstruction::SignatureString() const {
   string operands =
       Join(operands_, ", ", [](string* out, HloInstruction* operand) {
@@ -1765,36 +1901,31 @@ string HloInstruction::SignatureString() const {
   return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape()));
 }
 
-string HloInstruction::ExtendedOpcodeStr() const {
-  string opc_name = HloOpcodeString(opcode());
-  HloOpcode opc = opcode();
-  if (HloOpcode::kFusion == opc) {
-    opc_name += ":" + xla::ToString(fusion_kind());
-  }
-  return opc_name;
-}
-
-string HloInstruction::ToString(bool compact_operands,
-                                bool include_metadata) const {
+string HloInstruction::ToString(bool compact_operands, bool include_metadata,
+                                bool include_large_constants) const {
   string result =
-      StrCat(name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ",
-             ExtendedOpcodeStr(), "(", OperandsToString(compact_operands), ")");
+      StrCat("%", name(), " = ", ShapeUtil::HumanStringWithLayout(shape()), " ",
+             HloOpcodeString(opcode()), "(",
+             OperandsToString(compact_operands, include_large_constants), ")");
   for (const string& extra : ExtraAttributesToString()) {
     StrAppend(&result, ", ", extra);
   }
   if (include_metadata &&
       (!metadata_.op_type().empty() || !metadata_.op_name().empty() ||
        !metadata_.source_file().empty())) {
-    StrAppend(&result, " # metadata=", metadata_.ShortDebugString());
+    StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}");
   }
   return result;
 }
 
-string HloInstruction::OperandsToString(bool compact) const {
+string HloInstruction::OperandsToString(bool compact,
+                                        bool include_large_constants) const {
   string operands;
   if (opcode() == HloOpcode::kConstant) {
     // For constants, show the actual value in place of an empty operand list.
-    if (!ShapeUtil::IsTuple(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) {
+    if ((!ShapeUtil::IsTuple(shape()) &&
+         ShapeUtil::ElementsIn(shape()) <= 10) ||
+        include_large_constants) {
       // Literal::ToString emits multidimensional arrays over multiple
       // lines. Compact this into one line by stripping out white space.
       string tmp = literal().ToString();
@@ -1825,7 +1956,7 @@ string HloInstruction::OperandsToString(bool compact) const {
     operands = Join(slice, ", ", [&](string* out, HloInstruction* operand) {
       *out += ShapeUtil::HumanStringWithLayout(operand->shape());
       if (!compact) {
-        StrAppend(out, " ", operand->name());
+        StrAppend(out, " %", operand->name());
       }
     });
     const int64 remaining = operands_.size() - slice.size();
@@ -1838,16 +1969,20 @@ string HloInstruction::OperandsToString(bool compact) const {
 
 std::vector<string> HloInstruction::ExtraAttributesToString() const {
   std::vector<string> extra;
+  if (opcode() == HloOpcode::kFusion) {
+    extra.push_back(StrCat("kind=", xla::ToString(fusion_kind())));
+  }
   if (CanHaveDimensionsField()) {
     extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}"));
   }
-  if (window_ != nullptr) {
-    extra.push_back(window_util::ToString(*window_));
+  if (window_ != nullptr && window_->dimensions_size() != 0) {
+    extra.push_back(StrCat("window={", window_util::ToString(*window_), "}"));
   }
   if (padding_config_ != nullptr) {
-    extra.push_back(StrCat("padding=", padding_config_->ShortDebugString()));
+    extra.push_back(
+        StrCat("padding=", xla::PaddingConfigToString(*padding_config_)));
   }
-  if (!slice_starts_.empty() && !slice_limits_.empty()) {
+  if (opcode() == HloOpcode::kSlice) {
     std::vector<string> bounds;
     bounds.reserve(slice_starts_.size());
     const bool omit_stride =
@@ -1860,10 +1995,23 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
     }
     extra.push_back(StrCat("slice={", Join(bounds, ", "), "}"));
   }
+  if (opcode() == HloOpcode::kDynamicSlice) {
+    extra.push_back(
+        StrCat("dynamic_slice_sizes={", Join(dynamic_slice_sizes(), ","), "}"));
+  }
+  if (opcode() == HloOpcode::kBatchNormTraining ||
+      opcode() == HloOpcode::kBatchNormInference ||
+      opcode() == HloOpcode::kBatchNormGrad) {
+    extra.push_back(StrCat("epsilon=", epsilon()));
+    extra.push_back(StrCat("feature_index=", feature_index()));
+  }
 
   if (convolution_dimension_numbers_ != nullptr) {
     extra.push_back(ConvolutionDimensionNumbersToString());
   }
+  if (dot_dimension_numbers_ != nullptr) {
+    extra.push_back(DotDimensionNumbersToString());
+  }
 
   if (opcode() == HloOpcode::kWhile) {
     extra.push_back(StrCat("condition=%", while_condition()->name()));
@@ -1883,7 +2031,8 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
                        })));
   }
 
-  if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv) {
+  if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv ||
+      opcode() == HloOpcode::kSendDone || opcode() == HloOpcode::kRecvDone) {
     extra.push_back(StrCat("channel_id=", channel_id_));
   }
 
@@ -1893,21 +2042,37 @@ std::vector<string> HloInstruction::ExtraAttributesToString() const {
   if (has_sharding()) {
     extra.push_back(StrCat("sharding=", sharding().ToString()));
   }
-  if (!control_successors_.empty()) {
-    extra.push_back(StrCat(
-        "control-successors=",
-        Join(control_successors_, ", ", [](string* out, HloInstruction* succ) {
-          StrAppend(out, succ->name());
-        })));
+  if (!control_predecessors_.empty()) {
+    extra.push_back(StrCat("control-predecessors={",
+                           Join(control_predecessors_, ", ",
+                                [](string* out, HloInstruction* pre) {
+                                  StrAppend(out, "%", pre->name());
+                                }),
+                           "}"));
+  }
+  if (opcode() == HloOpcode::kInfeed && !infeed_config_.empty()) {
+    extra.push_back(StrCat("infeed_config=\"", CEscape(infeed_config_), "\""));
+  }
+  if (opcode() == HloOpcode::kOutfeed && !outfeed_config_.empty()) {
+    extra.push_back(
+        StrCat("outfeed_config=\"", CEscape(outfeed_config_), "\""));
+  }
+  if (opcode() == HloOpcode::kRng) {
+    extra.push_back(
+        StrCat("distribution=", RandomDistributionToString(distribution_)));
+  }
+  if (opcode() == HloOpcode::kReducePrecision) {
+    extra.push_back(StrCat("exponent_bits=", exponent_bits_));
+    extra.push_back(StrCat("mantissa_bits=", mantissa_bits_));
   }
   return extra;
 }
 
 string HloInstruction::ToShortString() const {
-  return StrCat(name(), " = ", HloOpcodeString(opcode()), "(",
+  return StrCat("%", name(), " = ", HloOpcodeString(opcode()), "(",
                 Join(operands_, ", ",
                      [](string* out, HloInstruction* operand) {
-                       StrAppend(out, operand->name());
+                       StrAppend(out, "%", operand->name());
                      }),
                 ")");
 }
@@ -1951,6 +2116,9 @@ HloInstructionProto HloInstruction::ToProto() const {
     *proto.mutable_convolution_dimension_numbers() =
         *convolution_dimension_numbers_;
   }
+  if (dot_dimension_numbers_ != nullptr) {
+    *proto.mutable_dot_dimension_numbers() = *dot_dimension_numbers_;
+  }
   for (int i = 0; i < slice_starts_.size(); ++i) {
     auto* slice_dimension = proto.add_slice_dimensions();
     slice_dimension->set_start(slice_starts_[i]);
@@ -2001,8 +2169,10 @@ string HloInstruction::ToCategory() const {
       bool saw_rank_1 = false;
       bool saw_higher_rank = false;
       for (const auto* operand : operands()) {
-        saw_rank_1 |= ShapeUtil::Rank(operand->shape()) == 1;
-        saw_higher_rank |= ShapeUtil::Rank(operand->shape()) > 1;
+        if (!ShapeUtil::IsTuple(operand->shape())) {
+          saw_rank_1 |= ShapeUtil::Rank(operand->shape()) == 1;
+          saw_higher_rank |= ShapeUtil::Rank(operand->shape()) > 1;
+        }
       }
       if (saw_rank_1 && saw_higher_rank) {
         return "rank-1-broadcast binary fusion";
@@ -2055,23 +2225,13 @@ bool HloInstruction::IsFusable() const {
   if (tracing()) {
     return false;
   }
-
   // Some kinds of instructions don't make sense to fuse.
   switch (opcode_) {
-    case HloOpcode::kInfeed:
-    case HloOpcode::kOutfeed:
     case HloOpcode::kParameter:
-    case HloOpcode::kTrace:
-    case HloOpcode::kSend:
-    case HloOpcode::kRecv:
       return false;
-    // Only fuse Rng if it is used once, otherwise the random numbers generated
-    // will be different in each fusion. If it is the root (user count = 0)
-    // then it is the equivalent of having one user.
-    case HloOpcode::kRng:
-      return users_.size() <= 1;
+    // Side effecting instrutions cannot be fused.
     default:
-      return true;
+      return !HasSideEffect();
   }
 }
 
@@ -2122,11 +2282,12 @@ HloInstruction::HloInstruction(HloOpcode opcode, const Shape& shape)
     : unique_id_(-1),
       opcode_(opcode),
       shape_(shape),
-      name_("%" + HloOpcodeString(opcode)) {
+      name_(HloOpcodeString(opcode)) {
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape_));
 }
 
-Status HloInstruction::Visit(DfsHloVisitor* visitor) {
+template <typename HloInstructionPtr>
+Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
   switch (opcode_) {
     case HloOpcode::kAbs:
       return visitor->HandleAbs(this);
@@ -2181,6 +2342,8 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
       return visitor->HandleConcatenate(this);
     case HloOpcode::kConvert:
       return visitor->HandleConvert(this);
+    case HloOpcode::kBitcastConvert:
+      return visitor->HandleBitcastConvert(this);
     case HloOpcode::kCopy:
       return visitor->HandleCopy(this);
     case HloOpcode::kMultiply:
@@ -2267,12 +2430,18 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
       return visitor->HandleFusion(this);
     case HloOpcode::kCall:
       return visitor->HandleCall(this);
+    case HloOpcode::kConditional:
+      return visitor->HandleConditional(this);
     case HloOpcode::kCustomCall:
       return visitor->HandleCustomCall(this);
-    case HloOpcode::kSend:
-      return visitor->HandleSend(this);
     case HloOpcode::kRecv:
       return visitor->HandleRecv(this);
+    case HloOpcode::kRecvDone:
+      return visitor->HandleRecvDone(this);
+    case HloOpcode::kSend:
+      return visitor->HandleSend(this);
+    case HloOpcode::kSendDone:
+      return visitor->HandleSendDone(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
@@ -2282,25 +2451,30 @@ Status HloInstruction::Visit(DfsHloVisitor* visitor) {
                        HloOpcodeString(opcode_).c_str());
 }
 
+// Explicit instantiations.
+template Status HloInstruction::Visit(DfsHloVisitor* visitor);
+template Status HloInstruction::Visit(ConstDfsHloVisitor* visitor);
+
 using DFSStack =
     tensorflow::gtl::InlinedVector<std::pair<int, HloInstruction*>, 16>;
 
 // Push "child" onto the dfs_stack if not already visited.  Returns false if a
 // cycle was detected, and true otherwise.
-inline bool PushDFSChild(DfsHloVisitor* visitor, DFSStack* dfs_stack,
+template <typename Visitor>
+inline bool PushDFSChild(Visitor* visitor, DFSStack* dfs_stack,
                          HloInstruction* child) {
   CHECK(child != nullptr);
   const int id = child->unique_id();
   CHECK_GE(id, 0) << "instruction may not have a parent computation";
   switch (visitor->GetVisitState(id)) {
-    case DfsHloVisitor::kVisiting:
+    case Visitor::kVisiting:
       return false;
 
-    case DfsHloVisitor::kVisited:
+    case Visitor::kVisited:
       // Nothing to do
       return true;
 
-    case DfsHloVisitor::kNotVisited:
+    case Visitor::kNotVisited:
       dfs_stack->push_back(std::make_pair(id, child));
       return true;
   }
@@ -2309,7 +2483,8 @@ inline bool PushDFSChild(DfsHloVisitor* visitor, DFSStack* dfs_stack,
 using InternalCompareFunction =
     std::function<bool(std::pair<int, const HloInstruction*>,
                        std::pair<int, const HloInstruction*>)>;
-static Status PostOrderDFS(HloInstruction* root, DfsHloVisitor* visitor,
+template <typename Visitor>
+static Status PostOrderDFS(HloInstruction* root, Visitor* visitor,
                            const InternalCompareFunction* operand_order,
                            bool ignore_control_predecessors) {
   visitor->ReserveVisitStates(root->GetModule()->NumUniqueInstructionIds());
@@ -2330,26 +2505,27 @@ static Status PostOrderDFS(HloInstruction* root, DfsHloVisitor* visitor,
     HloInstruction* current_node = dfs_stack.back().second;
     CHECK_GE(current_id, 0) << current_id << ": " << current_node
                             << ": instruction may not have parent computation";
-    DfsHloVisitor::VisitState visit_state = visitor->GetVisitState(current_id);
-    if (visit_state == DfsHloVisitor::kVisited) {
+    typename Visitor::VisitState visit_state =
+        visitor->GetVisitState(current_id);
+    if (visit_state == Visitor::kVisited) {
       dfs_stack.pop_back();
-      VLOG(3) << "Not visiting HLO " << current_node->name()
+      VLOG(3) << "Not visiting HLO %" << current_node->name()
               << " as it was already visited.";
       continue;
     }
 
-    if (visit_state == DfsHloVisitor::kVisiting) {
+    if (visit_state == Visitor::kVisiting) {
       dfs_stack.pop_back();
 
       TF_RETURN_IF_ERROR(visitor->Preprocess(current_node));
-      VLOG(2) << "Visiting HLO " << current_node->name();
+      VLOG(2) << "Visiting HLO %" << current_node->name();
       TF_RETURN_IF_ERROR(current_node->Visit(visitor));
-      visitor->SetVisitState(current_id, DfsHloVisitor::kVisited);
+      visitor->SetVisitState(current_id, Visitor::kVisited);
       TF_RETURN_IF_ERROR(visitor->Postprocess(current_node));
       continue;
     }
 
-    visitor->SetVisitState(current_id, DfsHloVisitor::kVisiting);
+    visitor->SetVisitState(current_id, Visitor::kVisiting);
 
     const size_t old_dfs_stack_size = dfs_stack.size();
     for (HloInstruction* child : current_node->operands()) {
@@ -2383,9 +2559,11 @@ static Status PostOrderDFS(HloInstruction* root, DfsHloVisitor* visitor,
   return Status::OK();
 }
 
-Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit,
+template <typename HloInstructionPtr>
+Status HloInstruction::Accept(DfsHloVisitorBase<HloInstructionPtr>* visitor,
+                              bool call_finish_visit,
                               bool ignore_control_predecessors) {
-  VLOG(3) << "HloInstruction::Accept(" << name() << ")";
+  VLOG(3) << "HloInstruction::Accept(%" << name() << ")";
   TF_RETURN_IF_ERROR(
       PostOrderDFS(this, visitor, nullptr, ignore_control_predecessors));
   if (call_finish_visit) {
@@ -2394,10 +2572,14 @@ Status HloInstruction::Accept(DfsHloVisitor* visitor, bool call_finish_visit,
   return Status::OK();
 }
 
+// Explicit instantiations.
+template Status HloInstruction::Accept(DfsHloVisitor*, bool, bool);
+template Status HloInstruction::Accept(ConstDfsHloVisitor*, bool, bool);
+
 Status HloInstruction::AcceptWithOperandOrder(
     DfsHloVisitor* visitor, const CompareFunction& operand_order,
     bool call_finish_visit) {
-  VLOG(2) << "HloInstruction::AcceptWithOperandOrder(" << name() << ")";
+  VLOG(2) << "HloInstruction::AcceptWithOperandOrder(%" << name() << ")";
   InternalCompareFunction func = [&operand_order](
                                      std::pair<int, const HloInstruction*> a,
                                      std::pair<int, const HloInstruction*> b) {
@@ -2447,14 +2629,20 @@ bool OrderIsTopologicalSort(const std::vector<const HloInstruction*>& order) {
 }  // namespace
 
 Status HloInstruction::Accept(
-    const FunctionVisitor::VisitorFunction& visitor_func) {
+    const std::function<Status(HloInstruction*)>& visitor_func) {
   FunctionVisitor visitor(visitor_func);
   return this->Accept(&visitor);
 }
 
+Status HloInstruction::Accept(
+    const std::function<Status(const HloInstruction*)>& visitor_func) const {
+  ConstFunctionVisitor visitor(visitor_func);
+  return this->Accept(&visitor);
+}
+
 Status HloInstruction::AcceptOrdered(
     DfsHloVisitor* visitor, const std::vector<const HloInstruction*>& order) {
-  VLOG(2) << "HloInstruction::AcceptOrdered(" << name() << ")";
+  VLOG(2) << "HloInstruction::AcceptOrdered(%" << name() << ")";
   TF_RET_CHECK(OrderIsTopologicalSort(order));
 
   // Compute the predecessors of this instruction.
@@ -2473,7 +2661,7 @@ Status HloInstruction::AcceptOrdered(
     // The visitor can mark instructions as visited to skip particular
     // instructions.
     if (visitor->DidVisit(*const_instruction)) {
-      VLOG(3) << "Not visiting HLO " << const_instruction->name()
+      VLOG(3) << "Not visiting HLO %" << const_instruction->name()
               << " as it was already visited.";
       continue;
     }
@@ -2482,7 +2670,7 @@ Status HloInstruction::AcceptOrdered(
         const_cast<HloInstruction*>(const_instruction);
 
     TF_RETURN_IF_ERROR(visitor->Preprocess(instruction));
-    VLOG(2) << "Visiting HLO " << instruction->name();
+    VLOG(2) << "Visiting HLO %" << instruction->name();
     TF_RETURN_IF_ERROR(instruction->Visit(visitor));
     visitor->SetVisited(*instruction);
     TF_RETURN_IF_ERROR(visitor->Postprocess(instruction));
@@ -2514,33 +2702,7 @@ std::vector<int64> HloInstruction::OperandIndices(
 }
 
 bool HloInstruction::IsElementwiseBinary() const {
-  switch (opcode_) {
-    // Binary elementwise operations. If you update this, please update
-    // IsElementwise() accordingly.
-    case HloOpcode::kAdd:
-    case HloOpcode::kComplex:
-    case HloOpcode::kDivide:
-    case HloOpcode::kEq:
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
-    case HloOpcode::kMaximum:
-    case HloOpcode::kMinimum:
-    case HloOpcode::kMultiply:
-    case HloOpcode::kNe:
-    case HloOpcode::kPower:
-    case HloOpcode::kRemainder:
-    case HloOpcode::kSubtract:
-    case HloOpcode::kAnd:
-    case HloOpcode::kOr:
-    case HloOpcode::kShiftLeft:
-    case HloOpcode::kShiftRightArithmetic:
-    case HloOpcode::kShiftRightLogical:
-      return true;
-    default:
-      return false;
-  }
+  return IsElementwise() && operand_count() == 2;
 }
 
 bool HloInstruction::IsElementwise() const {
@@ -2551,10 +2713,10 @@ bool HloInstruction::IsElementwise() const {
 
     // Unary elementwise operations.
     case HloOpcode::kAbs:
-    case HloOpcode::kAtan2:
     case HloOpcode::kRoundNearestAfz:
     case HloOpcode::kCeil:
     case HloOpcode::kConvert:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kExp:
@@ -2569,11 +2731,12 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kSign:
     case HloOpcode::kSin:
     case HloOpcode::kTanh:
+      CHECK_EQ(1, operand_count());
       return true;
 
     // Binary elementwise operations, the same as in IsElementwiseBinary().
-    // If you update this, please update IsElementwiseBinary() accordingly.
     case HloOpcode::kAdd:
+    case HloOpcode::kAtan2:
     case HloOpcode::kComplex:
     case HloOpcode::kDivide:
     case HloOpcode::kEq:
@@ -2593,6 +2756,7 @@ bool HloInstruction::IsElementwise() const {
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
+      CHECK_EQ(2, operand_count());
       return true;
 
     // Ternary elementwise operations.
@@ -2837,6 +3001,61 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
   return InvalidArgument("Unknown fusion kind: %s", kind_name.c_str());
 }
 
+string PaddingConfigToString(const PaddingConfig& padding) {
+  bool has_interior_padding =
+      std::any_of(padding.dimensions().begin(), padding.dimensions().end(),
+                  [](const PaddingConfig::PaddingConfigDimension& dim) {
+                    return dim.interior_padding() != 0;
+                  });
+  return Join(
+      padding.dimensions(), "x",
+      [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) {
+        StrAppend(
+            out, dim.edge_padding_low(), "_", dim.edge_padding_high(),
+            has_interior_padding ? StrCat("_", dim.interior_padding()) : "");
+      });
+}
+
+string OpMetadataToString(const OpMetadata& metadata) {
+  std::vector<string> result;
+  if (!metadata.op_type().empty()) {
+    result.push_back(StrCat("op_type=\"", CEscape(metadata.op_type()), "\""));
+  }
+  if (!metadata.op_name().empty()) {
+    result.push_back(StrCat("op_name=\"", CEscape(metadata.op_name()), "\""));
+  }
+  if (!metadata.source_file().empty()) {
+    result.push_back(
+        StrCat("source_file=\"", CEscape(metadata.source_file()), "\""));
+  }
+  if (metadata.source_line() != 0) {
+    result.push_back(StrCat("source_line=", metadata.source_line()));
+  }
+  return Join(result, " ");
+}
+
+string RandomDistributionToString(const RandomDistribution& distribution) {
+  return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution));
+}
+
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
+  static std::unordered_map<string, RandomDistribution>* map = [] {
+    static auto* map = new std::unordered_map<string, RandomDistribution>;
+    for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) {
+      if (RandomDistribution_IsValid(i)) {
+        auto value = static_cast<RandomDistribution>(i);
+        (*map)[RandomDistributionToString(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = map->find(tensorflow::str_util::Lowercase(name));
+  if (found == map->end()) {
+    return InvalidArgument("Unknown distribution");
+  }
+  return found->second;
+}
+
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
   return os << ToString(kind);
 }
@@ -2852,36 +3071,30 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
   const auto append_dims = [&](const std::vector<string>& dims,
                                const Shape& shape) {
     CHECK_EQ(dims.size(), ShapeUtil::Rank(shape));
-    for (int64 logical = 0; logical < dims.size(); ++logical) {
-      int64 physical = logical;
-      if (!shape.layout().minor_to_major().empty()) {
-        physical = LayoutUtil::Major(shape.layout(), logical);
-      }
-      result += dims[physical];
-    }
+    StrAppend(&result, Join(dims, ""));
   };
 
   // lhs_dims[i] is the symbol of the logical dimension i for the lhs
   // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b".
-  std::vector<string> lhs_dims(2 + dnums.spatial_dimensions().size());
+  std::vector<string> lhs_dims(2 + dnums.input_spatial_dimensions().size());
   lhs_dims[dnums.input_batch_dimension()] = 'b';
   lhs_dims[dnums.input_feature_dimension()] = 'f';
-  for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
-    lhs_dims[dnums.spatial_dimensions(i)] = StrCat(i);
+  for (int64 i = 0; i < dnums.input_spatial_dimensions().size(); ++i) {
+    lhs_dims[dnums.input_spatial_dimensions(i)] = StrCat(i);
   }
 
   std::vector<string> rhs_dims(2 + dnums.kernel_spatial_dimensions().size());
   rhs_dims[dnums.kernel_input_feature_dimension()] = "i";
   rhs_dims[dnums.kernel_output_feature_dimension()] = "o";
-  for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
+  for (int64 i = 0; i < dnums.kernel_spatial_dimensions().size(); ++i) {
     rhs_dims[dnums.kernel_spatial_dimensions(i)] = StrCat(i);
   }
 
-  std::vector<string> output_dims(2 + dnums.spatial_dimensions().size());
+  std::vector<string> output_dims(2 + dnums.output_spatial_dimensions().size());
   output_dims[dnums.output_batch_dimension()] = 'b';
   output_dims[dnums.output_feature_dimension()] = 'f';
-  for (int64 i = 0; i < dnums.spatial_dimensions().size(); ++i) {
-    output_dims[dnums.spatial_dimensions(i)] = StrCat(i);
+  for (int64 i = 0; i < dnums.output_spatial_dimensions().size(); ++i) {
+    output_dims[dnums.output_spatial_dimensions(i)] = StrCat(i);
   }
 
   result += "dim_labels=";
@@ -2893,6 +3106,30 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
   return result;
 }
 
+string HloInstruction::DotDimensionNumbersToString() const {
+  string result;
+  if (dot_dimension_numbers_ == nullptr) {
+    return result;
+  }
+  const DotDimensionNumbers& dnums = *dot_dimension_numbers_;
+  if (!dnums.lhs_batch_dimensions().empty()) {
+    result += "lhs_batch_dims=";
+    StrAppend(&result, Join(dnums.lhs_batch_dimensions(), ","));
+  }
+  result += "lhs_contracting_dims=";
+  StrAppend(&result, Join(dnums.lhs_contracting_dimensions(), ","));
+
+  result += ",";
+  if (!dnums.rhs_batch_dimensions().empty()) {
+    result += "rhs_batch_dims=";
+    StrAppend(&result, Join(dnums.rhs_batch_dimensions(), ","));
+  }
+  result += "rhs_contracting_dims=";
+  StrAppend(&result, Join(dnums.rhs_contracting_dimensions(), ","));
+
+  return result;
+}
+
 bool HloInstruction::CouldBeBitcast() const {
   switch (opcode_) {
     case HloOpcode::kTranspose:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index e714d7bc71d86815b1b2df44cdd5c67281cdeb62..03cf9aaf907e7437596b9cc1f093fd79d22963b9 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <tuple>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 
@@ -34,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
-#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
@@ -44,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
 #include "tensorflow/core/platform/logging.h"
@@ -83,12 +84,16 @@ class HloInstruction {
   //     must contain all operands of the newly constructed instruction.
   //   computation_map: a map from computation name to HloComputation*. This map
   //     must contain all computations which the newly constructed instruction
-  //     calls. If the instruction is a fusion instruction, then the fusion
-  //     computation is added to this map and the module.
+  //     calls.
+  //   add_fused_computation: A function to call to add a fused
+  //     computation. Used (clearly) when the instruction is a fusion
+  //     instruction.
   static StatusOr<std::unique_ptr<HloInstruction>> CreateFromProto(
       HloModule* module, const HloInstructionProto& proto,
       const tensorflow::gtl::FlatMap<string, HloInstruction*>& instruction_map,
-      tensorflow::gtl::FlatMap<string, HloComputation*>* computation_map);
+      const tensorflow::gtl::FlatMap<string, HloComputation*>& computation_map,
+      const std::function<void(std::unique_ptr<HloComputation>)>&
+          add_fused_computation);
 
   // Creates a parameter-retrieving instruction.
   static std::unique_ptr<HloInstruction> CreateParameter(int64 parameter_number,
@@ -155,6 +160,12 @@ class HloInstruction {
       const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
+  // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch
+  // dimensions specified in 'dimension_numbers'.
+  static std::unique_ptr<HloInstruction> CreateDot(
+      const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
+      const DotDimensionNumbers& dimension_numbers);
+
   // Creates a reduce-precision op, where operand is the data to reduce in
   // precision, and exponent_bits and mantissa_bits describe the precision to
   // reduce it to.
@@ -164,13 +175,19 @@ class HloInstruction {
 
   // Creates a cross replica sum op.
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
-      const Shape& shape, HloInstruction* operand);
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
 
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
   static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape,
                                                        HloInstruction* operand);
 
+  // Creates a bitcast conversion instruction, where operand is the data to
+  // convert and shape is the target shape for the conversion.
+  static std::unique_ptr<HloInstruction> CreateBitcastConvert(
+      const Shape& shape, HloInstruction* operand);
+
   // Creates an infeed instruction, which reads data of the given shape from the
   // Infeed interface of the device.
   static std::unique_ptr<HloInstruction> CreateInfeed(const Shape& shape,
@@ -181,18 +198,28 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::StringPiece outfeed_config);
 
-  // Creates a send instruction with the given channel id, which sends the
-  // operand data to a unique receive instruction in another computation that
-  // has the same channel id.
+  // Creates an asynchronous send instruction with the given channel id, which
+  // initiates sending the operand data to a unique receive instruction in
+  // another computation that has the same channel id.
   static std::unique_ptr<HloInstruction> CreateSend(HloInstruction* operand,
                                                     int64 channel_id);
 
-  // Creates a receive instruction with the given channel id, which receives
-  // data of the given shape from a unique send instruction in another
-  // computation that has the same channel id.
+  // Blocks until data transfer for the Send instruction (operand) is complete.
+  // The operand must be kSend.
+  static std::unique_ptr<HloInstruction> CreateSendDone(
+      HloInstruction* operand);
+
+  // Creates an asynchronous receive instruction with the given channel id,
+  // which allocates resources to receive data of the given shape from a unique
+  // send instruction in another computation that has the same channel id.
   static std::unique_ptr<HloInstruction> CreateRecv(const Shape& shape,
                                                     int64 channel_id);
 
+  // Blocks until data transfer for the Recv instruction (operand) is complete
+  // and returns the receive buffer. The operand must be kRecv.
+  static std::unique_ptr<HloInstruction> CreateRecvDone(
+      HloInstruction* operand);
+
   // Creates a slice instruction, where the operand is sliced by the given
   // start/limit indices.
   static std::unique_ptr<HloInstruction> CreateSlice(
@@ -202,7 +229,7 @@ class HloInstruction {
       tensorflow::gtl::ArraySlice<int64> strides);
 
   // Creates a slice instruction, where the first operand is sliced by
-  // start indices specified in the second operand, and by size specfied in
+  // start indices specified in the second operand, and by size specified in
   // 'slice_sizes'.
   static std::unique_ptr<HloInstruction> CreateDynamicSlice(
       const Shape& shape, HloInstruction* operand,
@@ -295,6 +322,11 @@ class HloInstruction {
                                                      HloComputation* body,
                                                      HloInstruction* init);
 
+  static std::unique_ptr<HloInstruction> CreateConditional(
+      const Shape& shape, HloInstruction* pred,
+      HloInstruction* true_computation_arg, HloComputation* true_computation,
+      HloInstruction* false_computation_arg, HloComputation* false_computation);
+
   // Creates a fusion instruction. A fusion instruction contains one or more
   // fused instructions forming an expression with a single root
   // "fused_root". Additional instructions can be added to the fusion
@@ -302,6 +334,11 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateFusion(
       const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root);
 
+  static std::unique_ptr<HloInstruction> CreateFusion(
+      const Shape& shape, FusionKind fusion_kind,
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      HloComputation* fusion_computation);
+
   // Creates a fusion instruction that represents backward convolution. This is
   // similar to CreateFusion, but with extra arguments indicating the window and
   // dimemsion mapping of the backward convolution.
@@ -391,7 +428,7 @@ class HloInstruction {
   Status RemoveControlDependencyTo(HloInstruction* instruction);
 
   // Returns the set of control predecessors (successors) of this
-  // instruction. Control predecessors (sucessors) must execute before (after)
+  // instruction. Control predecessors (successors) must execute before (after)
   // the current instruction.
   const std::vector<HloInstruction*>& control_predecessors() const {
     return control_predecessors_;
@@ -458,8 +495,15 @@ class HloInstruction {
   // reachable via control dependencies will not be visited, and the postorder
   // will not take control dependencies into account. It is as if the control
   // dependencies didn't exist in the graph at all.
-  Status Accept(DfsHloVisitor* visitor, bool call_finish_visit = true,
+  template <typename HloInstructionPtr>
+  Status Accept(DfsHloVisitorBase<HloInstructionPtr>* visitor,
+                bool call_finish_visit = true,
                 bool ignore_control_predecessors = false);
+  Status Accept(ConstDfsHloVisitor* visitor, bool call_finish_visit = true,
+                bool ignore_control_predecessors = false) const {
+    return const_cast<HloInstruction*>(this)->Accept(
+        visitor, call_finish_visit, ignore_control_predecessors);
+  }
 
   // Same as Accept() above, but the order of operand and control predecessor
   // visitation is determined by the given operand order; if compare(A, B) ==
@@ -472,7 +516,9 @@ class HloInstruction {
 
   // Performs a postorder DFS visit using this node as the root. Calls the given
   // visitor function at each instruction.
-  Status Accept(const FunctionVisitor::VisitorFunction& visitor_func);
+  Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
+  Status Accept(
+      const std::function<Status(const HloInstruction*)>& visitor_func) const;
 
   // Visits all instructions rooted at this instruction using the given visitor
   // in the given order. 'order' must contain at least the set of instructions
@@ -485,7 +531,8 @@ class HloInstruction {
                        const std::vector<const HloInstruction*>& order);
 
   // Visit this instruction and only this instruction with the given visitor.
-  Status Visit(DfsHloVisitor* visitor);
+  template <typename HloInstructionPtr>
+  Status Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor);
 
   // Returns the literal associated with this instruction.
   //
@@ -583,18 +630,27 @@ class HloInstruction {
   void set_select(HloComputation* select);
   void set_scatter(HloComputation* scatter);
 
+  // Gets/sets the true and false HloComputation for Conditional. The setters
+  // should only be called by HloModule or HloComputation methods.
+  //
+  // Precondition: The instruction is a Conditional instruction.
+  HloComputation* true_computation() const;
+  HloComputation* false_computation() const;
+  void set_true_computation(HloComputation* true_computation);
+  void set_false_computation(HloComputation* false_computation);
+
   // Returns a string for the signature of this instruction if considered as a
   // function, e.g. the signature of an F32 add is (F32, F32) -> F32.
   string SignatureString() const;
 
   // Returns a debugging string that represents this instruction.
-  string ToString(bool compact_operands = false,
-                  bool include_metadata = true) const;
+  string ToString(bool compact_operands = false, bool include_metadata = true,
+                  bool include_large_constants = false) const;
 
   // Components of the ToString() representation:
 
   // Returns a string representation of the operand list.
-  string OperandsToString(bool compact) const;
+  string OperandsToString(bool compact, bool include_large_constants) const;
 
   // Returns string representation of op-specific attributes.
   std::vector<string> ExtraAttributesToString() const;
@@ -843,6 +899,11 @@ class HloInstruction {
     return *window_;
   }
 
+  // Sets the window data in a windowed operation such as convolution.
+  void set_window(const Window& window) {
+    window_ = MakeUnique<Window>(window);
+  }
+
   // Returns the padding configuration for a pad node.
   //
   // Precondition: opcode() == HloOpcode::kPad
@@ -861,6 +922,15 @@ class HloInstruction {
   // Returns the dump string of the convolution dimension numbers.
   string ConvolutionDimensionNumbersToString() const;
 
+  // Returns data on the dimension numbers used for a dot operation.
+  const DotDimensionNumbers& dot_dimension_numbers() const {
+    CHECK(dot_dimension_numbers_ != nullptr);
+    return *dot_dimension_numbers_;
+  }
+
+  // Returns the dump string of the dot dimension numbers.
+  string DotDimensionNumbersToString() const;
+
   // Returns the random distribution for this rng node.
   //
   // Precondition: opcode() == HloOpcode::kRng
@@ -870,12 +940,19 @@ class HloInstruction {
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
   // the instruction to form the name of the cloned instruction.
-  std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone") const;
+  // If the module pointer is not nullptr, it will be the module where
+  // the cloned computations will be added to (in order to support deep
+  // cloning).
+  std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone",
+                                        HloModule* module = nullptr) const;
 
   // Clones the HLO instruction as above but with new shape and operands.
+  // If the module pointer is not nullptr, it will be the module where
+  // the cloned computations will be added to (in order to support deep
+  // cloning).
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
-      const Shape& shape,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands) const;
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      HloModule* module = nullptr) const;
 
   // Returns the computations this instruction directly calls (if any).
   const std::vector<HloComputation*>& called_computations() const {
@@ -945,11 +1022,6 @@ class HloInstruction {
   std::tuple<bool, std::vector<int64>, std::vector<int64>>
   ReshapeMerelyInsertsOrDeletes1SizedDimensions() const;
 
-  // Returns the opcode string for this instruction. This is the result from
-  // HloOpcodeString plus, for fusion nodes, the fusion kind, separated by a
-  // ':'.
-  string ExtendedOpcodeStr() const;
-
   // Returns a string identifier for this instruction. If no string identifier
   // has been explicitly set, then the identifier is the serialized pointer to
   // this instruction.
@@ -1061,8 +1133,8 @@ class HloInstruction {
 
   // Clones a fusion instruction with a new shape and operands.
   std::unique_ptr<HloInstruction> CloneFusionWithNewOperands(
-      const Shape& shape,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands) const;
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      HloModule* module = nullptr) const;
 
   // Returns true if this instruction can legally have the dimensions field
   // set. Used for checking precondition of dimensions field accessors.
@@ -1117,6 +1189,9 @@ class HloInstruction {
   // Describes the dimension numbers used for a convolution.
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
 
+  // Describes the dimension numbers used for a dot.
+  std::unique_ptr<DotDimensionNumbers> dot_dimension_numbers_;
+
   // Describes the [begin, end) index range for a slice.
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
@@ -1160,6 +1235,10 @@ class HloInstruction {
     // kSelectAndScatter computations.
     kSelectComputationIndex = 0,
     kScatterComputationIndex = 1,
+
+    // kConditional computations.
+    kTrueComputationIndex = 0,
+    kFalseComputationIndex = 1,
   };
 
   // Outfeed configuration information, only present for kOutfeed.
@@ -1207,8 +1286,37 @@ string ToString(HloInstruction::FusionKind kind);
 StatusOr<HloInstruction::FusionKind> StringToFusionKind(
     const string& kind_name);
 
+// Custom (de)stringification functions for protos that live inside
+// HloInstruction.
+string PaddingConfigToString(const PaddingConfig& padding);
+string OpMetadataToString(const OpMetadata& metadata);
+string RandomDistributionToString(const RandomDistribution& distribution);
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name);
+
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
 
+// Map classes that guarantee a deterministic iteration order when the key is
+// an HloInstruction* or a const HloInstruction*.
+// To make the iteration order over the map deterministic, the comparator
+// should not be using the pointer values, but rather an intrinsic property of
+// the hlo.
+//
+// Note that this cannot be used for HLO instructions across multiple modules
+// since the id of HLO instructions are only unique within each HLO module.
+struct HloPtrComparator {
+  bool operator()(const HloInstruction* const& lhs,
+                  const HloInstruction* const& rhs) const {
+    return lhs->unique_id() < rhs->unique_id();
+  }
+};
+
+template <typename ValueT>
+using HloInstructionMap = std::map<HloInstruction*, ValueT, HloPtrComparator>;
+
+template <typename ValueT>
+using ConstHloInstructionMap =
+    std::map<const HloInstruction*, ValueT, HloPtrComparator>;
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTION_H_
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 4ead64d997df1a6a85b028374949a4e5c9eab549..aa3fd0cf4f7410ed7034c65d72e16489d4f0ba71 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -792,8 +792,8 @@ TEST_F(HloInstructionTest, ComplexFusionOp) {
   //   sub = Sub(mul, clamp)
   //   tuple = Tuple({sub, sub, mul, C1})
   //
-  // Notable complexities are repeated operands in a same instruction, different
-  // shapes, use of value in different expressions.
+  // Notable complexities are repeated operands in the same instruction,
+  // different shapes, use of value in different expressions.
   auto c1 = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.1f)));
   auto c2 = builder.AddInstruction(
@@ -1068,8 +1068,11 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
   HloInstruction* reshape =
       builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
 
   HloModule module(TestName());
   auto* computation = module.AddEntryComputation(builder.Build());
@@ -1088,48 +1091,6 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) {
                                root2->operand(1)->operand(0)->shape()));
 }
 
-TEST_F(HloInstructionTest, IsRandomFusable) {
-  auto shape = ShapeUtil::MakeShape(F32, {2, 2});
-  {
-    auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
-    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(0.0)));
-    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(1.0)));
-    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
-        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
-
-    auto* computation = hlo_module->AddEntryComputation(builder.Build());
-    computation->CreateFusionInstruction({rng, const0, const1},
-      HloInstruction::FusionKind::kLoop);
-
-    auto* root = computation->root_instruction();
-
-    EXPECT_EQ(HloOpcode::kFusion, root->opcode());
-  }
-  {
-    auto builder = HloComputation::Builder(TestName());
-    auto hlo_module = CreateNewModule();
-    auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(0.0)));
-    auto const1 = builder.AddInstruction(HloInstruction::CreateConstant(
-        Literal::CreateR0<float>(1.0)));
-    auto rng = builder.AddInstruction(HloInstruction::CreateRng(
-        shape, RandomDistribution::RNG_NORMAL, {const0, const1}));
-    builder.AddInstruction(HloInstruction::CreateUnary(
-        shape, HloOpcode::kNegate, rng));
-    auto* computation = hlo_module->AddEntryComputation(builder.Build());
-    computation->CreateFusionInstruction({rng, const0, const1},
-      HloInstruction::FusionKind::kLoop);
-
-    auto* root = computation->root_instruction();
-
-    EXPECT_EQ(HloOpcode::kFusion, root->operand(0)->opcode());
-  }
-}
-
-
 TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test that the suffix string added to cloned instructions is not
   // duplicated. Rather a numeric incrementing value should be appended. That
@@ -1138,35 +1099,34 @@ TEST_F(HloInstructionTest, CloneSuffixNames) {
   // Test cloning the same instruction multiple times.
   auto foo =
       HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "foo");
-  EXPECT_EQ(foo->Clone()->name(), "%foo.clone");
-  EXPECT_EQ(foo->Clone()->Clone()->name(), "%foo.clone2");
-  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "%foo.clone3");
+  EXPECT_EQ(foo->Clone()->name(), "foo.clone");
+  EXPECT_EQ(foo->Clone()->Clone()->name(), "foo.clone2");
+  EXPECT_EQ(foo->Clone()->Clone()->Clone()->name(), "foo.clone3");
 
   // Test custom suffixes.
-  EXPECT_EQ(foo->Clone("bar")->name(), "%foo.bar");
-  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "%foo.bar2");
-  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(),
-            "%foo.bar2.clone");
+  EXPECT_EQ(foo->Clone("bar")->name(), "foo.bar");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->name(), "foo.bar2");
+  EXPECT_EQ(foo->Clone("bar")->Clone("bar")->Clone()->name(), "foo.bar2.clone");
 
   // Test instruction name with a dot.
   auto foo_baz = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.baz");
-  EXPECT_EQ(foo_baz->Clone()->name(), "%foo.baz.clone");
+  EXPECT_EQ(foo_baz->Clone()->name(), "foo.baz.clone");
 
   // Test incrementing a large number after the suffix.
   auto foo_clone234 = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.clone234");
-  EXPECT_EQ(foo_clone234->Clone()->name(), "%foo.clone235");
+  EXPECT_EQ(foo_clone234->Clone()->name(), "foo.clone235");
 
   // Test a non-numeric string after the cloning suffix.
   auto foo_clonexyz = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.clonexyz");
-  EXPECT_EQ(foo_clonexyz->Clone()->name(), "%foo.clonexyz.clone");
+  EXPECT_EQ(foo_clonexyz->Clone()->name(), "foo.clonexyz.clone");
 
   // Test a name with multiple appearances of the suffix.
   auto foo_clone_clone3 = HloInstruction::CreateParameter(
       0, ShapeUtil::MakeShape(F32, {}), "foo.clone.clone3");
-  EXPECT_EQ(foo_clone_clone3->Clone()->name(), "%foo.clone.clone4");
+  EXPECT_EQ(foo_clone_clone3->Clone()->name(), "foo.clone.clone4");
 }
 
 TEST_F(HloInstructionTest, Stringification) {
@@ -1183,21 +1143,25 @@ TEST_F(HloInstructionTest, Stringification) {
       builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y"));
   HloInstruction* reshape =
       builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0}));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   HloInstruction* dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(sout, HloOpcode::kDot, x, reshape));
+      HloInstruction::CreateDot(sout, x, reshape, dot_dnums));
 
   EXPECT_EQ(dot->ToString(false, false),
             "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} "
-            "%transpose)");
+            "%transpose), lhs_contracting_dims=1,rhs_contracting_dims=0");
 
   HloModule module(TestName());
   auto* computation = module.AddEntryComputation(builder.Build());
   HloInstruction* fusion = computation->CreateFusionInstruction(
       {dot, reshape}, HloInstruction::FusionKind::kTransposeDot);
 
-  EXPECT_EQ(fusion->ToString(false, false),
-            "%fusion = f32[5,20]{1,0} fusion:kTransposeDot(f32[5,10]{1,0} %x, "
-            "f32[20,10]{1,0} %y), calls=%fused_computation");
+  EXPECT_EQ(
+      fusion->ToString(false, false),
+      "%fusion = f32[5,20]{1,0} fusion(f32[5,10]{1,0} %x, "
+      "f32[20,10]{1,0} %y), kind=kTransposeDot, calls=%fused_computation");
 
   HloInstruction* loop = builder.AddInstruction(
       HloInstruction::CreateWhile(sout, computation, computation, x));
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 0660d5a1820f068a1e6a765c133f3b9654339c57..4255d6086625dfb9a045e4431e968a5ee0106ac7 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -73,6 +73,35 @@ void HloMatcher::DescribeTo(::std::ostream* os) const {
   }
 }
 
+bool HloParameterMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  if (instruction->parameter_number() != parameter_number_) {
+    *listener << "has wrong parameter number (got "
+              << instruction->parameter_number() << ", want "
+              << parameter_number_ << ")";
+    return false;
+  }
+  return true;
+}
+
+bool HloGetTupleElementMatcher::MatchAndExplain(
+    const HloInstruction* instruction,
+    ::testing::MatchResultListener* listener) const {
+  if (!HloMatcher::MatchAndExplain(instruction, listener)) {
+    return false;
+  }
+  if (instruction->tuple_index() != tuple_index_) {
+    *listener << "has wrong tuple index (got " << instruction->tuple_index()
+              << ", want " << tuple_index_ << ")";
+    return false;
+  }
+  return true;
+}
+
 }  // namespace testing
 
 void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index bc5ed029a45b4f92a240138dc1e933610efe1789..992f55788b4900949f4994ba5b7be015bcd0d3de 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -38,6 +38,36 @@ class HloMatcher : public ::testing::MatcherInterface<const HloInstruction*> {
   std::vector<::testing::Matcher<const HloInstruction*>> operands_;
 };
 
+// Custom matcher for parameters, which accepts a parameter number.
+class HloParameterMatcher : public HloMatcher {
+ public:
+  explicit HloParameterMatcher(int64 parameter_number)
+      : HloMatcher(HloOpcode::kParameter, /*operands=*/{}),
+        parameter_number_(parameter_number) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+
+ private:
+  int64 parameter_number_;
+};
+
+// Custom matcher for get-tuple-element instructions, which accepts a tuple
+// index to match.
+class HloGetTupleElementMatcher : public HloMatcher {
+ public:
+  explicit HloGetTupleElementMatcher(
+      ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index)
+      : HloMatcher(HloOpcode::kGetTupleElement, /*operands=*/{operand}),
+        tuple_index_(tuple_index) {}
+
+  bool MatchAndExplain(const HloInstruction* instruction,
+                       ::testing::MatchResultListener* listener) const override;
+
+ private:
+  int64 tuple_index_;
+};
+
 // HloInstruction* matchers for opcode and operands. Example:
 //   namespace op = xla::opcode_matchers;
 //   EXPECT_THAT(instruction,
@@ -57,6 +87,7 @@ HLO_MATCHER(Call);
 HLO_MATCHER(Ceil);
 HLO_MATCHER(Clamp);
 HLO_MATCHER(Concatenate);
+HLO_MATCHER(Conditional);
 HLO_MATCHER(Constant);
 HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
@@ -72,7 +103,6 @@ HLO_MATCHER(Exp);
 HLO_MATCHER(Floor);
 HLO_MATCHER(Fusion);
 HLO_MATCHER(Ge);
-HLO_MATCHER(GetTupleElement);
 HLO_MATCHER(Gt);
 HLO_MATCHER(Infeed);
 HLO_MATCHER(IsFinite);
@@ -90,9 +120,9 @@ HLO_MATCHER(Ne);
 HLO_MATCHER(Negate);
 HLO_MATCHER(Outfeed);
 HLO_MATCHER(Pad);
-HLO_MATCHER(Parameter);
 HLO_MATCHER(Power);
 HLO_MATCHER(Recv);
+HLO_MATCHER(RecvDone);
 HLO_MATCHER(Reduce);
 HLO_MATCHER(ReducePrecision);
 HLO_MATCHER(ReduceWindow);
@@ -103,6 +133,7 @@ HLO_MATCHER(Rng);
 HLO_MATCHER(Select);
 HLO_MATCHER(SelectAndScatter);
 HLO_MATCHER(Send);
+HLO_MATCHER(SendDone);
 HLO_MATCHER(ShiftLeft);
 HLO_MATCHER(ShiftRightLogical);
 HLO_MATCHER(ShiftRightArithmetic);
@@ -115,6 +146,43 @@ HLO_MATCHER(Trace);
 HLO_MATCHER(Transpose);
 HLO_MATCHER(Tuple);
 HLO_MATCHER(While);
+
+// The special cases below let you check additional information about the
+// HloInstruction, beyond just its opcode and operands.  In all cases you can
+// still use the generic matcher which doesn't check this info.
+//
+// Feel free to add additional custom matchers below.
+
+//  - Parameter(N) matches parameter number N.
+//  - Parameter() matches any parameter.
+inline ::testing::Matcher<const ::xla::HloInstruction*> Parameter(
+    int64 parameter_number) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloParameterMatcher(parameter_number));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> Parameter() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kParameter, {}));
+}
+
+// GetTupleElement(operand, N) matches a GTE instruction which gets the N'th
+// tuple element of operand, while GetTupleElement(operand) matches any GTE
+// operation on operand, and GetTupleElement() matches any GTE operation at all.
+inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement(
+    ::testing::Matcher<const HloInstruction*> operand, int64 tuple_index) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloGetTupleElementMatcher(operand, tuple_index));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement(
+    ::testing::Matcher<const HloInstruction*> operand) {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {operand}));
+}
+inline ::testing::Matcher<const ::xla::HloInstruction*> GetTupleElement() {
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloMatcher(HloOpcode::kGetTupleElement, {}));
+}
+
 #undef HLO_MATCHER
 }  // namespace opcode_matchers
 
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 1758f2760c46a5f0f5876ac6ba8dd013e71455b6..6fe2134466ffaf1402e5ecbc81aea9aafe2a468b 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -35,14 +35,15 @@ namespace xla {
 HloModule::HloModule(const string& name,
                      const VersionedComputationHandle& entry_computation_handle,
                      const HloModuleConfig& config)
-    : name_(name),
+    : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
       has_entry_computation_handle_(true),
       entry_computation_handle_(entry_computation_handle) {}
 
-HloModule::HloModule(const string& name) : name_(name) {}
+HloModule::HloModule(const string& name)
+    : name_(NameUniquer::GetSanitizedName(name)) {}
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
-    : name_(name), config_(config) {}
+    : name_(NameUniquer::GetSanitizedName(name)), config_(config) {}
 
 HloComputation* HloModule::AddComputationInternal(
     std::unique_ptr<HloComputation> computation, bool is_entry,
@@ -170,20 +171,17 @@ void HloModule::ReplaceComputations(
   computations_ = std::move(new_computations);
 }
 
-string HloModule::ToString() const {
+string HloModule::ToString(bool include_large_constants) const {
   std::ostringstream s;
   s << "HloModule " << name() << ":\n\n";
   for (const HloComputation* computation : MakeComputationPostOrder()) {
-    // Fusion computations are emitted with their fusion instruction and
-    // therefore don't need to be emitted as a separate comptutation in the
-    // module.
-    if (computation->IsFusionComputation()) {
-      continue;
-    }
     if (computation == entry_computation()) {
       s << "ENTRY ";
     }
-    s << computation->ToString() << "\n\n";
+    s << computation->ToString(
+             /*nested_level=*/0,
+             /*include_large_constants=*/include_large_constants)
+      << "\n\n";
   }
   return s.str();
 }
@@ -293,9 +291,16 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   tensorflow::gtl::FlatMap<string, HloComputation*> computation_map;
   for (const HloComputationProto& computation_proto : proto.computations()) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloComputation> computation,
-                        HloComputation::CreateFromProto(
-                            module.get(), computation_proto, &computation_map));
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloComputation> computation,
+        HloComputation::CreateFromProto(
+            module.get(), computation_proto, computation_map,
+            /*add_fused_computation=*/
+            [&module](std::unique_ptr<HloComputation> fused_computation) {
+              module->AddComputationInternal(std::move(fused_computation),
+                                             /*is_entry=*/false,
+                                             /*uniquify_names=*/false);
+            }));
     CHECK_NE(computation.get(), nullptr);
     TF_RET_CHECK(!ContainsKey(computation_map, computation->name()));
     string computation_name = computation->name();
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index ad11d56006a79b509309daba55e94342911f76a1..5141e7bc8d4cf0ef4cd83310772e0c5d66b5da12 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -85,7 +85,11 @@ class HloModule {
   std::unique_ptr<HloModule> Clone(const string& suffix = "clone") const;
 
   // Return a pointer to the entry computation of the module..
-  HloComputation* entry_computation() const {
+  const HloComputation* entry_computation() const {
+    CHECK_NE(nullptr, entry_computation_);
+    return entry_computation_;
+  }
+  HloComputation* entry_computation() {
     CHECK_NE(nullptr, entry_computation_);
     return entry_computation_;
   }
@@ -139,7 +143,7 @@ class HloModule {
 
   const HloModuleConfig& config() const { return config_; }
 
-  string ToString() const;
+  string ToString(bool include_large_constants = false) const;
 
   // Convert an HloModule to or from a proto.
   HloModuleProto ToProto() const;
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc
index 8974deb530c2e4561b5ab57f43c65fd525db3617..822e2f1f53e5ee460b88c2241ecf7f6b91ef608b 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_config.cc
@@ -39,8 +39,8 @@ void HloModuleConfig::SetDefaultComputationLayout(
 }
 
 string HloModuleConfig::compilation_cache_key() const {
-  string key = tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled_,
-                                           "::hybrid=", has_hybrid_result_);
+  string key =
+      tensorflow::strings::StrCat("profiling=", hlo_profiling_enabled_);
   StrAppend(&key, "::(");
   std::vector<string> params;
   for (const ShapeLayout& param_layout :
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index 4a7ead9c104d2ed50d5c895b3cdf2d3767ae16e8..a5ee895e48448fbb8fa3879dc1b6764c1f9f6966 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -104,16 +104,6 @@ class HloModuleConfig {
   // Whether to enable HLO-level profiling.
   bool hlo_profiling_enabled_ = false;
 
-  // If this flag is true, the generated executable will return a ShapedBuffer
-  // holding the result of the computation. In a ShapedBuffer, tuples have their
-  // structure held in host memory and the element arrays (leaves of the tuple
-  // structure) stored in device memory. The ShapedBuffer is considered "hybrid"
-  // because its leaves are on device but its structure is stored on
-  // host. Otherwise, if this flag is false, the generated executable will
-  // return a DeviceMemoryBase where the result is held entirely in device
-  // memory.
-  bool has_hybrid_result_ = false;
-
   // Module/graph-level seed handle.
   uint64 seed_ = 0;
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 20eef2f7d53251a374971e55441f6a4585e9b35c..bf6440d66cac0d3a929c377202b212aba262f887 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -101,7 +101,7 @@ TEST_F(HloModuleTest, CloneTest) {
   for (auto origin = post_order.begin(), copied = post_order_copied.begin();
        origin != post_order.end() && copied != post_order_copied.end();
        ++origin, ++copied) {
-    EXPECT_EQ((*origin)->name() + "copy", (*copied)->name());
+    EXPECT_EQ((*origin)->name() + ".copy", (*copied)->name());
   }
 }
 
@@ -125,6 +125,26 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   EXPECT_EQ(post_order.front(), computation1);
 }
 
+TEST_F(HloModuleTest, LargeConstantToString) {
+  // Create a module with a single computation.
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder("Constant");
+  std::vector<float> values(16, 42.0);
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(values)));
+  module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(
+      "HloModule LargeConstantToString:\n\nENTRY %Constant () -> f32[16] {\n  "
+      "ROOT %constant = f32[16]{0} constant({...})\n}\n\n",
+      module->ToString(/*include_large_constants=*/false));
+  EXPECT_EQ(
+      "HloModule LargeConstantToString:\n\nENTRY %Constant () -> f32[16] {\n  "
+      "ROOT %constant = f32[16]{0} constant({42, 42, 42, 42, 42, 42, 42, 42, "
+      "42, 42, 42, 42, 42, 42, 42, 42})\n}\n\n",
+      module->ToString(/*include_large_constants=*/true));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc
index 157d19f5a9996ff90c4a5c3655f82ff5b8e62cfc..d1eaf357855205f1e9867e86f3042b96b6beff97 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode.cc
@@ -21,243 +21,22 @@ limitations under the License.
 namespace xla {
 
 string HloOpcodeString(HloOpcode opcode) {
-  // Note: Do not use ':' in opcode strings. It is used as a special character
-  // in these places:
-  // - In extended opcode strings (HloInstruction::ExtendedOpcodeString()), to
-  //   separate the opcode from the fusion kind
-  // - In fully qualified names (HloInstruction::FullyQualifiedName()), to
-  //   separate the qualifiers (name of the computation and potentially the
-  //   fusion instruction) from the name
   switch (opcode) {
-    case HloOpcode::kAbs:
-      return "abs";
-    case HloOpcode::kAdd:
-      return "add";
-    case HloOpcode::kAnd:
-      return "and";
-    case HloOpcode::kAtan2:
-      return "atan2";
-    case HloOpcode::kBatchNormTraining:
-      return "batch-norm-training";
-    case HloOpcode::kBatchNormInference:
-      return "batch-norm-inference";
-    case HloOpcode::kBatchNormGrad:
-      return "batch-norm-grad";
-    case HloOpcode::kBitcast:
-      return "bitcast";
-    case HloOpcode::kBroadcast:
-      return "broadcast";
-    case HloOpcode::kCall:
-      return "call";
-    case HloOpcode::kClamp:
-      return "clamp";
-    case HloOpcode::kComplex:
-      return "complex";
-    case HloOpcode::kConcatenate:
-      return "concatenate";
-    case HloOpcode::kConstant:
-      return "constant";
-    case HloOpcode::kConvert:
-      return "convert";
-    case HloOpcode::kConvolution:
-      return "convolution";
-    case HloOpcode::kCos:
-      return "cosine";
-    case HloOpcode::kCrossReplicaSum:
-      return "cross-replica-sum";
-    case HloOpcode::kCustomCall:
-      return "custom-call";
-    case HloOpcode::kCopy:
-      return "copy";
-    case HloOpcode::kDivide:
-      return "divide";
-    case HloOpcode::kDot:
-      return "dot";
-    case HloOpcode::kDynamicSlice:
-      return "dynamic-slice";
-    case HloOpcode::kDynamicUpdateSlice:
-      return "dynamic-update-slice";
-    case HloOpcode::kEq:
-      return "equal-to";
-    case HloOpcode::kExp:
-      return "exponential";
-    case HloOpcode::kFloor:
-      return "floor";
-    case HloOpcode::kCeil:
-      return "ceil";
-    case HloOpcode::kFusion:
-      return "fusion";
-    case HloOpcode::kGe:
-      return "greater-than-or-equal-to";
-    case HloOpcode::kGetTupleElement:
-      return "get-tuple-element";
-    case HloOpcode::kGt:
-      return "greater-than";
-    case HloOpcode::kImag:
-      return "imag";
-    case HloOpcode::kInfeed:
-      return "infeed";
-    case HloOpcode::kIsFinite:
-      return "is-finite";
-    case HloOpcode::kLe:
-      return "less-than-or-equal-to";
-    case HloOpcode::kLog:
-      return "log";
-    case HloOpcode::kLt:
-      return "less-than";
-    case HloOpcode::kMap:
-      return "map";
-    case HloOpcode::kMaximum:
-      return "maximum";
-    case HloOpcode::kMinimum:
-      return "minimum";
-    case HloOpcode::kMultiply:
-      return "multiply";
-    case HloOpcode::kNe:
-      return "not-equal-to";
-    case HloOpcode::kNegate:
-      return "negate";
-    case HloOpcode::kNot:
-      return "not";
-    case HloOpcode::kOr:
-      return "or";
-    case HloOpcode::kOutfeed:
-      return "outfeed";
-    case HloOpcode::kPad:
-      return "pad";
-    case HloOpcode::kParameter:
-      return "parameter";
-    case HloOpcode::kPower:
-      return "power";
-    case HloOpcode::kReal:
-      return "real";
-    case HloOpcode::kRecv:
-      return "recv";
-    case HloOpcode::kReduce:
-      return "reduce";
-    case HloOpcode::kReducePrecision:
-      return "reduce-precision";
-    case HloOpcode::kReduceWindow:
-      return "reduce-window";
-    case HloOpcode::kRemainder:
-      return "remainder";
-    case HloOpcode::kReshape:
-      return "reshape";
-    case HloOpcode::kReverse:
-      return "reverse";
-    case HloOpcode::kRng:
-      return "rng";
-    case HloOpcode::kRoundNearestAfz:
-      return "round-nearest-afz";
-    case HloOpcode::kSelectAndScatter:
-      return "select-and-scatter";
-    case HloOpcode::kSelect:
-      return "select";
-    case HloOpcode::kSend:
-      return "send";
-    case HloOpcode::kShiftLeft:
-      return "shift-left";
-    case HloOpcode::kShiftRightArithmetic:
-      return "shift-right-arithmetic";
-    case HloOpcode::kShiftRightLogical:
-      return "shift-right-logical";
-    case HloOpcode::kSign:
-      return "sign";
-    case HloOpcode::kSin:
-      return "sine";
-    case HloOpcode::kSlice:
-      return "slice";
-    case HloOpcode::kSort:
-      return "sort";
-    case HloOpcode::kSubtract:
-      return "subtract";
-    case HloOpcode::kTanh:
-      return "tanh";
-    case HloOpcode::kTrace:
-      return "trace";
-    case HloOpcode::kTranspose:
-      return "transpose";
-    case HloOpcode::kTuple:
-      return "tuple";
-    case HloOpcode::kWhile:
-      return "while";
+#define CASE_OPCODE_STRING(enum_name, opcode_name, ...) \
+  case HloOpcode::enum_name:                            \
+    return opcode_name;
+    HLO_OPCODE_LIST(CASE_OPCODE_STRING)
+#undef CASE_OPCODE_STRING
   }
 }
 
 StatusOr<HloOpcode> StringToHloOpcode(const string& opcode_name) {
-  static auto* opcode_map = new tensorflow::gtl::FlatMap<string, HloOpcode>(
-      {{"abs", HloOpcode::kAbs},
-       {"add", HloOpcode::kAdd},
-       {"and", HloOpcode::kAnd},
-       {"batch-norm-training", HloOpcode::kBatchNormTraining},
-       {"batch-norm-inference", HloOpcode::kBatchNormInference},
-       {"batch-norm-grad", HloOpcode::kBatchNormGrad},
-       {"bitcast", HloOpcode::kBitcast},
-       {"broadcast", HloOpcode::kBroadcast},
-       {"call", HloOpcode::kCall},
-       {"clamp", HloOpcode::kClamp},
-       {"concatenate", HloOpcode::kConcatenate},
-       {"constant", HloOpcode::kConstant},
-       {"convert", HloOpcode::kConvert},
-       {"convolution", HloOpcode::kConvolution},
-       {"cosine", HloOpcode::kCos},
-       {"cross-replica-sum", HloOpcode::kCrossReplicaSum},
-       {"custom-call", HloOpcode::kCustomCall},
-       {"copy", HloOpcode::kCopy},
-       {"divide", HloOpcode::kDivide},
-       {"dot", HloOpcode::kDot},
-       {"dynamic-slice", HloOpcode::kDynamicSlice},
-       {"dynamic-update-slice", HloOpcode::kDynamicUpdateSlice},
-       {"equal-to", HloOpcode::kEq},
-       {"exponential", HloOpcode::kExp},
-       {"floor", HloOpcode::kFloor},
-       {"ceil", HloOpcode::kCeil},
-       {"fusion", HloOpcode::kFusion},
-       {"greater-than-or-equal-to", HloOpcode::kGe},
-       {"get-tuple-element", HloOpcode::kGetTupleElement},
-       {"greater-than", HloOpcode::kGt},
-       {"infeed", HloOpcode::kInfeed},
-       {"is-finite", HloOpcode::kIsFinite},
-       {"less-than-or-equal-to", HloOpcode::kLe},
-       {"log", HloOpcode::kLog},
-       {"less-than", HloOpcode::kLt},
-       {"map", HloOpcode::kMap},
-       {"maximum", HloOpcode::kMaximum},
-       {"minimum", HloOpcode::kMinimum},
-       {"multiply", HloOpcode::kMultiply},
-       {"not", HloOpcode::kNot},
-       {"not-equal-to", HloOpcode::kNe},
-       {"negate", HloOpcode::kNegate},
-       {"or", HloOpcode::kOr},
-       {"outfeed", HloOpcode::kOutfeed},
-       {"pad", HloOpcode::kPad},
-       {"parameter", HloOpcode::kParameter},
-       {"power", HloOpcode::kPower},
-       {"recv", HloOpcode::kRecv},
-       {"reduce", HloOpcode::kReduce},
-       {"reduce-precision", HloOpcode::kReducePrecision},
-       {"reduce-window", HloOpcode::kReduceWindow},
-       {"remainder", HloOpcode::kRemainder},
-       {"reshape", HloOpcode::kReshape},
-       {"reverse", HloOpcode::kReverse},
-       {"rng", HloOpcode::kRng},
-       {"round-nearest-afz", HloOpcode::kRoundNearestAfz},
-       {"select-and-scatter", HloOpcode::kSelectAndScatter},
-       {"select", HloOpcode::kSelect},
-       {"send", HloOpcode::kSend},
-       {"shift-left", HloOpcode::kShiftLeft},
-       {"shift-right-arithmetic", HloOpcode::kShiftRightArithmetic},
-       {"shift-right-logical", HloOpcode::kShiftRightLogical},
-       {"sign", HloOpcode::kSign},
-       {"sine", HloOpcode::kSin},
-       {"slice", HloOpcode::kSlice},
-       {"sort", HloOpcode::kSort},
-       {"subtract", HloOpcode::kSubtract},
-       {"tanh", HloOpcode::kTanh},
-       {"trace", HloOpcode::kTrace},
-       {"transpose", HloOpcode::kTranspose},
-       {"tuple", HloOpcode::kTuple},
-       {"while", HloOpcode::kWhile}});
+  static auto* opcode_map = new tensorflow::gtl::FlatMap<string, HloOpcode>({
+#define STRING_TO_OPCODE_ENTRY(enum_name, opcode_name, ...) \
+  {opcode_name, HloOpcode::enum_name},
+      HLO_OPCODE_LIST(STRING_TO_OPCODE_ENTRY)
+#undef STRING_TO_OPCODE_ENTRY
+  });
   auto it = opcode_map->find(opcode_name);
   if (it == opcode_map->end()) {
     return InvalidArgument("Unknown opcode: %s", opcode_name.c_str());
@@ -265,31 +44,36 @@ StatusOr<HloOpcode> StringToHloOpcode(const string& opcode_name) {
   return it->second;
 }
 
+#define CHECK_DEFAULT(property_name, opcode_name) false
+#define CHECK_PROPERTY(property_name, opcode_name, value) \
+  (value & property_name)
+#define RESOLVE(_1, _2, target, ...) target
+#define HAS_PROPERTY(property, ...) \
+  RESOLVE(__VA_ARGS__, CHECK_PROPERTY, CHECK_DEFAULT)(property, __VA_ARGS__)
+
 bool HloOpcodeIsComparison(HloOpcode opcode) {
   switch (opcode) {
-    case HloOpcode::kGe:
-    case HloOpcode::kGt:
-    case HloOpcode::kLe:
-    case HloOpcode::kLt:
-    case HloOpcode::kEq:
-    case HloOpcode::kNe:
-      return true;
-    default:
-      return false;
+#define CASE_IS_COMPARISON(enum_name, ...) \
+  case HloOpcode::enum_name:               \
+    return HAS_PROPERTY(kHloOpcodeIsComparison, __VA_ARGS__);
+    HLO_OPCODE_LIST(CASE_IS_COMPARISON)
+#undef CASE_IS_COMPARISON
   }
 }
 
 bool HloOpcodeIsVariadic(HloOpcode opcode) {
   switch (opcode) {
-    case HloOpcode::kCall:
-    case HloOpcode::kConcatenate:
-    case HloOpcode::kFusion:
-    case HloOpcode::kMap:
-    case HloOpcode::kTuple:
-      return true;
-    default:
-      return false;
+#define CASE_IS_VARIADIC(enum_name, ...) \
+  case HloOpcode::enum_name:             \
+    return HAS_PROPERTY(kHloOpcodeIsVariadic, __VA_ARGS__);
+    HLO_OPCODE_LIST(CASE_IS_VARIADIC)
+#undef CASE_IS_VARIADIC
   }
 }
 
+#undef HAS_PROPERTY
+#undef RESOLVE
+#undef CHECK_DEFAULT
+#undef CHECK_PROPERTY
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 07c2d26f00f2338d306b57933e5f0fb77b38b892..f3f79357582ac7661a532e94031acdbca0b86784 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -28,83 +28,116 @@ namespace xla {
 // present in the XLA service protobuf.
 //
 // See the XLA documentation for the semantics of each opcode.
+//
+// Each entry has the format:
+// (enum_name, opcode_name)
+// or
+// (enum_name, opcode_name, p1 | p2 | ...)
+//
+// with p1, p2, ... are members of HloOpcodeProperty. They are combined
+// using bitwise-or.
+//
+// Note: Do not use ':' in opcode names. It is used as a special character
+// in these places:
+// - In extended opcode strings (HloInstruction::ExtendedOpcodeString()), to
+//   separate the opcode from the fusion kind
+// - In fully qualified names (HloInstruction::FullyQualifiedName()), to
+//   separate the qualifiers (name of the computation and potentially the
+//   fusion instruction) from the name
+#define HLO_OPCODE_LIST(V)                                   \
+  V(kAbs, "abs")                                             \
+  V(kAdd, "add")                                             \
+  V(kAtan2, "atan2")                                         \
+  V(kBatchNormGrad, "batch-norm-grad")                       \
+  V(kBatchNormInference, "batch-norm-inference")             \
+  V(kBatchNormTraining, "batch-norm-training")               \
+  V(kBitcast, "bitcast")                                     \
+  V(kBitcastConvert, "bitcast-convert")                      \
+  V(kBroadcast, "broadcast")                                 \
+  V(kCall, "call", kHloOpcodeIsVariadic)                     \
+  V(kCeil, "ceil")                                           \
+  V(kClamp, "clamp")                                         \
+  V(kComplex, "complex")                                     \
+  V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)       \
+  V(kConditional, "conditional")                             \
+  V(kConstant, "constant")                                   \
+  V(kConvert, "convert")                                     \
+  V(kConvolution, "convolution")                             \
+  V(kCopy, "copy")                                           \
+  V(kCos, "cosine")                                          \
+  V(kCrossReplicaSum, "cross-replica-sum")                   \
+  V(kCustomCall, "custom-call")                              \
+  V(kDivide, "divide")                                       \
+  V(kDot, "dot")                                             \
+  V(kDynamicSlice, "dynamic-slice")                          \
+  V(kDynamicUpdateSlice, "dynamic-update-slice")             \
+  V(kEq, "equal-to", kHloOpcodeIsComparison)                 \
+  V(kExp, "exponential")                                     \
+  V(kFloor, "floor")                                         \
+  V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
+  V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
+  V(kGetTupleElement, "get-tuple-element")                   \
+  V(kGt, "greater-than", kHloOpcodeIsComparison)             \
+  V(kImag, "imag")                                           \
+  V(kInfeed, "infeed")                                       \
+  V(kIsFinite, "is-finite")                                  \
+  V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison)    \
+  V(kLog, "log")                                             \
+  V(kAnd, "and")                                             \
+  V(kNot, "not")                                             \
+  V(kOr, "or")                                               \
+  V(kLt, "less-than", kHloOpcodeIsComparison)                \
+  V(kMap, "map", kHloOpcodeIsVariadic)                       \
+  V(kMaximum, "maximum")                                     \
+  V(kMinimum, "minimum")                                     \
+  V(kMultiply, "multiply")                                   \
+  V(kNe, "not-equal-to", kHloOpcodeIsComparison)             \
+  V(kNegate, "negate")                                       \
+  V(kOutfeed, "outfeed")                                     \
+  V(kPad, "pad")                                             \
+  V(kParameter, "parameter")                                 \
+  V(kPower, "power")                                         \
+  V(kReal, "real")                                           \
+  V(kRecv, "recv")                                           \
+  V(kRecvDone, "recv-done")                                  \
+  V(kReduce, "reduce")                                       \
+  V(kReducePrecision, "reduce-precision")                    \
+  V(kReduceWindow, "reduce-window")                          \
+  V(kRemainder, "remainder")                                 \
+  V(kReshape, "reshape")                                     \
+  V(kReverse, "reverse")                                     \
+  V(kRng, "rng")                                             \
+  V(kRoundNearestAfz, "round-nearest-afz")                   \
+  V(kSelect, "select")                                       \
+  V(kSelectAndScatter, "select-and-scatter")                 \
+  V(kSend, "send")                                           \
+  V(kSendDone, "send-done")                                  \
+  V(kShiftLeft, "shift-left")                                \
+  V(kShiftRightArithmetic, "shift-right-arithmetic")         \
+  V(kShiftRightLogical, "shift-right-logical")               \
+  V(kSign, "sign")                                           \
+  V(kSin, "sine")                                            \
+  V(kSlice, "slice")                                         \
+  V(kSort, "sort")                                           \
+  V(kSubtract, "subtract")                                   \
+  V(kTanh, "tanh")                                           \
+  V(kTrace, "trace")                                         \
+  V(kTranspose, "transpose")                                 \
+  V(kTuple, "tuple", kHloOpcodeIsVariadic)                   \
+  V(kWhile, "while")
+
 enum class HloOpcode {
-  kAbs,
-  kAdd,
-  kAtan2,
-  kBatchNormGrad,
-  kBatchNormInference,
-  kBatchNormTraining,
-  kBitcast,
-  kBroadcast,
-  kCall,
-  kCeil,
-  kClamp,
-  kComplex,
-  kConcatenate,
-  kConstant,
-  kConvert,
-  kConvolution,
-  kCopy,
-  kCos,
-  kCrossReplicaSum,
-  kCustomCall,
-  kDivide,
-  kDot,
-  kDynamicSlice,
-  kDynamicUpdateSlice,
-  kEq,
-  kExp,
-  kFloor,
-  kFusion,
-  kGe,
-  kGetTupleElement,
-  kGt,
-  kImag,
-  kInfeed,
-  kIsFinite,
-  kLe,
-  kLog,
-  kAnd,
-  kNot,
-  kOr,
-  kLt,
-  kMap,
-  kMaximum,
-  kMinimum,
-  kMultiply,
-  kNe,
-  kNegate,
-  kOutfeed,
-  kPad,
-  kParameter,
-  kPower,
-  kReal,
-  kRecv,
-  kReduce,
-  kReducePrecision,
-  kReduceWindow,
-  kRemainder,
-  kReshape,
-  kReverse,
-  kRng,
-  kRoundNearestAfz,
-  kSelect,
-  kSelectAndScatter,
-  kSend,
-  kShiftLeft,
-  kShiftRightArithmetic,
-  kShiftRightLogical,
-  kSign,
-  kSin,
-  kSlice,
-  kSort,
-  kSubtract,
-  kTanh,
-  kTrace,
-  kTranspose,
-  kTuple,
-  kWhile,
+#define DECLARE_ENUM(enum_name, opcode_name, ...) enum_name,
+  HLO_OPCODE_LIST(DECLARE_ENUM)
+#undef DECLARE_ENUM
+};
+
+// List of properties associated with opcodes.
+// Properties are defined as increasing powers of two, so that we can use
+// bitwise-or to combine properties, and bitwise-and to test for them.
+enum HloOpcodeProperty {
+  kHloOpcodeIsComparison = 1 << 0,
+  kHloOpcodeIsVariadic = 1 << 1,
 };
 
 // Returns a string representation of the opcode.
@@ -125,7 +158,9 @@ bool HloOpcodeIsVariadic(HloOpcode opcode);
 
 // Returns the number of HloOpcode values.
 inline const uint32_t HloOpcodeCount() {
-  return static_cast<uint32_t>(HloOpcode::kWhile) + 1;
+#define HLO_COUNT_ONE(...) +1
+#define HLO_XLIST_LENGTH(list) list(HLO_COUNT_ONE)
+  return HLO_XLIST_LENGTH(HLO_OPCODE_LIST);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
index 892c89f9df209f2e39005a4901feae6699ce4d0b..cd2ce5c69f030c65b889d67e082a3677b8739ddb 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
@@ -26,5 +26,46 @@ TEST(HloOpcodeTest, StringifyMultiply) {
   ASSERT_EQ("multiply", HloOpcodeString(HloOpcode::kMultiply));
 }
 
+TEST(HloOpcodeTest, OpcodeProperties) {
+  // Test counting macro.
+#define SOME_LIST(X) \
+  X(One)             \
+  X(Two)             \
+  X(Three)
+  EXPECT_EQ(3, HLO_XLIST_LENGTH(SOME_LIST));
+#undef SOME_LIST
+
+  for (int i = 0; i < HloOpcodeCount(); ++i) {
+    auto opcode = static_cast<HloOpcode>(i);
+    // Test round-trip conversion to and from string.
+    EXPECT_EQ(opcode, StringToHloOpcode(HloOpcodeString(opcode)).ValueOrDie());
+
+    // Test some properties.
+    switch (opcode) {
+      case HloOpcode::kEq:
+      case HloOpcode::kNe:
+      case HloOpcode::kGt:
+      case HloOpcode::kLt:
+      case HloOpcode::kGe:
+      case HloOpcode::kLe:
+        EXPECT_TRUE(HloOpcodeIsComparison(opcode));
+        break;
+      default:
+        EXPECT_FALSE(HloOpcodeIsComparison(opcode));
+    }
+    switch (opcode) {
+      case HloOpcode::kCall:
+      case HloOpcode::kConcatenate:
+      case HloOpcode::kFusion:
+      case HloOpcode::kMap:
+      case HloOpcode::kTuple:
+        EXPECT_TRUE(HloOpcodeIsVariadic(opcode));
+        break;
+      default:
+        EXPECT_FALSE(HloOpcodeIsVariadic(opcode));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index 37009369797693dcd06647fad845bb0c004cec67..6f6e679a21870e46da85963c3b2998465ac43420 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -173,6 +173,19 @@ bool HloOrdering::UseIsBeforeValueDefinition(
       return true;
     }
   }
+
+  // The use at a call occurs before values that are defined in the called
+  // computation.
+  if (use.instruction->opcode() == HloOpcode::kCall) {
+    const HloInstruction* call = use.instruction;
+    if (call_graph_->InstructionIsNestedIn(value.defining_instruction(),
+                                           call->to_apply())) {
+      VLOG(4) << "  use is call " << use.instruction->name()
+              << " and def is in called computation";
+      return true;
+    }
+  }
+
   VLOG(4) << "  use is not before value";
   return false;
 }
@@ -187,23 +200,6 @@ bool HloOrdering::LiveRangeStrictlyBefore(
     return false;
   }
 
-  // Live-out values from the module can never have ranges strictly before any
-  // other value.
-  if (a.live_out_of_module()) {
-    VLOG(4) << "a is live out of module";
-    return false;
-  }
-
-  // Live-out values of computations can never have ranges strictly before any
-  // other value in the computation (including values nested in
-  // subcomputations).
-  if (a.live_out_of_computation() &&
-      call_graph_->InstructionIsNestedIn(b.defining_instruction(),
-                                         a.defining_instruction()->parent())) {
-    VLOG(4) << "a is live out of computation containing b";
-    return false;
-  }
-
   // All uses of 'a' must be before 'b' is defined.
   for (const HloUse& use : a.uses()) {
     if (!UseIsBeforeValueDefinition(use, b, dataflow)) {
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e944ad15139af0d2f98e8e68d3d48303f47ecf1c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
+
+#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
+
+namespace xla {
+string HloProfilePrinter::ToString(const int64* counters,
+                                   double clock_rate_ghz) const {
+  string result;
+
+  for (int computation_idx = 0; computation_idx < computation_infos_size_;
+       computation_idx++) {
+    const HloComputationInfo& computation = computation_infos_[computation_idx];
+    const HloInstructionInfo* instructions_begin = computation.instructions;
+    const HloInstructionInfo* instructions_end =
+        computation.instructions + computation.instructions_size;
+    bool any_instruction_profiled =
+        std::any_of(instructions_begin, instructions_end,
+                    [&](const HloInstructionInfo& instruction_info) {
+                      return counters[instruction_info.profile_index] != 0;
+                    });
+
+    if (!any_instruction_profiled) {
+      continue;
+    }
+
+    // Once we start using this in AOT for real, we will probably need a more
+    // minimal version of HumanReadableProfileBuilder.
+    HumanReadableProfileBuilder builder(
+        computation.name, counters[computation.profile_index], clock_rate_ghz);
+
+    for (const auto* instruction = instructions_begin;
+         instruction != instructions_end; instruction++) {
+      builder.AddOp(
+          /*op_name=*/instruction->long_name,
+          /*short_name=*/instruction->short_name, instruction->category,
+          counters[instruction->profile_index], instruction->flop_count,
+          instruction->transcendental_count, instruction->bytes_accessed,
+          instruction->optimal_seconds);
+    }
+
+    result += builder.ToString();
+  }
+
+  return result;
+}
+
+HloProfilePrinter::~HloProfilePrinter() {
+  if (deleter_) {
+    deleter_(computation_infos_, computation_infos_size_);
+  }
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f056490ae027872570f7a0821ee63114f49fab8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+// Instances of this class can pretty-print profile counters gathered from
+// running an XLA computation without having access to the backing module.
+class HloProfilePrinter {
+ public:
+  // Holds meta information about an HloInstruction.
+  //
+  // The pointer-typed fields can be owning or non-owning -- this decision is
+  // manifested as the deleter_ function in the containing HloProfilePrinter.
+  struct HloInstructionInfo {
+    // Textual information for pretty printing.
+    const char* long_name;
+    const char* short_name;
+    const char* category;
+
+    // Metrics computed by HloCostAnalysis.
+    float flop_count;
+    float transcendental_count;
+    float bytes_accessed;
+    float optimal_seconds;
+
+    // The index into the profile counters array for the HloInstruction
+    // corresponding to this HloInstructionInfo.
+    int64 profile_index;
+  };
+
+  // Holds meta information about an HloComputation.
+  //
+  // The pointer-typed fields can be owning or non-owning -- this decision is
+  // manifested as the deleter_ function in the containing HloProfilePrinter.
+  struct HloComputationInfo {
+    const char* name;
+
+    // The index into the profile counters array for the HloInstruction
+    // corresponding to this HloComputationInfo.
+    int64 profile_index;
+
+    HloInstructionInfo* instructions;
+    int64 instructions_size;
+  };
+
+  HloProfilePrinter(
+      HloComputationInfo* computation_infos, int64 computation_infos_size,
+      int64 profile_counters_size,
+      std::function<void(HloComputationInfo*, int64)> deleter = nullptr)
+      : computation_infos_(computation_infos),
+        computation_infos_size_(computation_infos_size),
+        profile_counters_size_(profile_counters_size),
+        deleter_(std::move(deleter)) {}
+
+  HloProfilePrinter(HloProfilePrinter&& other) {
+    std::swap(other.computation_infos_, computation_infos_);
+    std::swap(other.computation_infos_size_, computation_infos_size_);
+    std::swap(other.deleter_, deleter_);
+  }
+
+  HloProfilePrinter(const HloProfilePrinter&) = delete;
+  HloProfilePrinter& operator=(const HloProfilePrinter&) = delete;
+
+  // Converts the profile counter sequence `counters` to a human readable string
+  // representation.
+  string ToString(const int64* counters, double clock_rate_ghz) const;
+
+  // Returns the size of the profile buffer expected by this printer.
+  int64 profile_counters_size() const { return profile_counters_size_; }
+
+  ~HloProfilePrinter();
+
+ private:
+  // The `computation_infos_` field can be owning or non-owning -- this decision
+  // is manifested as the deleter_ function.
+  HloComputationInfo* computation_infos_ = nullptr;
+  int64 computation_infos_size_ = 0;
+  int64 profile_counters_size_ = 0;
+  std::function<void(HloComputationInfo*, int64)> deleter_;
+};
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h
index d7bdac9c86579f19afbba133772c2c50894853d1..553ec11f6f9a2997ab7113f9b8241e04c7fe20d5 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.h
+++ b/tensorflow/compiler/xla/service/hlo_reachability.h
@@ -30,11 +30,17 @@ namespace xla {
 
 class HloInstruction;
 
-// A class for computing and representing reachability between HloInstructions.
+// A class for representing reachability between HloInstructions.
+//
+// !!! THIS CLASS DOES NOT COMPUTE REACHABILITY !!! It has an adjacency matrix
+// and it is up to the user of the class to set the adjacency matrix such that
+// it represents reachability, i.e. such that it is transitive. That the graph
+// be transitive is thus not an invariant of this class, but it is required for
+// the name of the class and its methods to make sense.
 class HloReachabilityMap {
  public:
-  // Sets up an empty reachable matrix for the full set of instructions
-  // specified in 'instructions'.
+  // Sets up a graph with no edges and where the nodes correspond to the given
+  // instructions.
   explicit HloReachabilityMap(const std::list<HloInstruction*>& instructions);
 
   // Set the reachability set of 'instruction' to the union of the reachability
@@ -42,17 +48,33 @@ class HloReachabilityMap {
   // 'x' is not 'instruction' will return true iff IsReachable(x, input) is true
   // for some 'input' in 'inputs'. Also sets 'instruction' to be reachable from
   // itself. Returns whether the reachability set of 'instruction' changed.
+  //
+  // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency
+  // vector in the internal graph of this HloReachabilityMap for the given
+  // instruction and does not transitively update any other part of the
+  // adjacency matrix.
   bool SetReachabilityToUnion(
       tensorflow::gtl::ArraySlice<const HloInstruction*> inputs,
       const HloInstruction* instruction);
 
   // Sets entry so that IsReachable(a, b) will return true
+  //
+  // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency
+  // matrix in the internal graph of this HloReachabilityMap to have an edge
+  // from a to b and does not transitively update any other part of the
+  // adjacency matrix.
   void SetReachable(const HloInstruction* a, const HloInstruction* b);
 
   // Returns true if "b" is reachable from "a"
+  //
+  // Note that this function only correctly answers queries about reachability
+  // if the set of edges that have been provided to this class are transitive.
   bool IsReachable(const HloInstruction* a, const HloInstruction* b) const;
 
   // Returns true if "b" is reachable from "a" or "a" is reachable from "b"
+  //
+  // Note that this function only correctly answers queries about reachability
+  // if the set of edges that have been provided to this class are transitive.
   bool IsConnected(const HloInstruction* a, const HloInstruction* b) const;
 
  private:
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index c96df50e79a3c6d4ca5f8e7e0abec33cdfca1c70..1747790e63c6af997eea096b68e5525fdd9d131a 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -62,16 +62,11 @@ bool IsRematerializable(const HloInstruction* instruction) {
     case HloOpcode::kConstant:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
-    case HloOpcode::kOutfeed:
-    case HloOpcode::kInfeed:
     case HloOpcode::kParameter:
-    case HloOpcode::kRecv:
-    case HloOpcode::kSend:
-    case HloOpcode::kTrace:
     case HloOpcode::kWhile:
       return false;
     default:
-      return true;
+      return !instruction->HasSideEffect();
   }
 }
 
@@ -571,7 +566,9 @@ Status MemoryUsageTracker::BeginInstruction(Item* item) {
   VLOG(3) << "  memory usage = " << memory_usage_;
   VLOG(10) << ToString();
 
-  DCHECK(Check());
+  if (VLOG_IS_ON(1)) {
+    DCHECK(Check());
+  }
   return Status::OK();
 }
 
@@ -608,8 +605,9 @@ Status MemoryUsageTracker::EndInstruction() {
   VLOG(3) << "  memory usage = " << memory_usage_;
   VLOG(10) << ToString();
 
-  DCHECK(Check());
-
+  if (VLOG_IS_ON(1)) {
+    DCHECK(Check());
+  }
   return Status::OK();
 }
 
@@ -1026,7 +1024,9 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 
       HloInstruction* best = best_item->instruction;
       VLOG(1) << "Rematerializing instruction " << best->name() << " (saving "
-              << memory_tracker.MemoryReducedIfRematerialized(best_item) << ")";
+              << HumanReadableNumBytes(
+                     memory_tracker.MemoryReducedIfRematerialized(best_item))
+              << ")";
       changed = true;
       remat_count++;
 
@@ -1106,8 +1106,8 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         net_instructions_added++;
       }
 
-      VLOG(3) << "memory_usage after rematerialization = "
-              << memory_tracker.memory_usage();
+      VLOG(1) << "memory_usage after rematerialization = "
+              << HumanReadableNumBytes(memory_tracker.memory_usage());
     }
 
     const CallSite* callsite = call_graph_node.GetCallSite(instruction);
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index d88aa4bb567c6c5f6eab54f12239bf7040339c39..c9b57166af438ef19ae4f079b8ecc8ddd5aede00 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -323,6 +323,76 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
   EXPECT_EQ(inner_computation->instruction_count(), 8);
 }
 
+TEST_F(HloRematerializationTest, RngNotRematerialized) {
+  // Test that a single rng is not rematerialized:
+  //
+  // Entry computation:
+  //   F32[] %param = {...}
+  //   F32[1024] rng = rng(param)
+  //   F32[1024] tanh = tanh(rng)
+  //   F32[1024] exp = exp(rng)
+  //   F32[1024] add_0 = add(rng, tanh)              // LIVE: add_0 + rng +
+  //                                                 //       tanh + exp
+  //
+  //   F32[1024] add_1 = add(rng, add(exp, add_0))   // LIVE: add_1 + add_0 +
+  //                                                 //       rng + tanh + exp
+  //
+  //   F32[1024] add_2 = add(rng, add(tanh, add_1))  // LIVE: add_2 + add_1 +
+  //                                                 //       rng + tanh + exp
+  auto module = CreateNewModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+  auto rng = builder.AddInstruction(HloInstruction::CreateRng(
+      vec1024_shape_, RandomDistribution::RNG_BERNOULLI, {param}));
+  auto tanh = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kTanh, rng));
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kExp, rng));
+  auto add_0 = builder.AddInstruction(
+      HloInstruction::CreateBinary(vec1024_shape_, HloOpcode::kAdd, rng, tanh));
+  auto add_1 = builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, rng,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          vec1024_shape_, HloOpcode::kAdd, exp, add_0))));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      vec1024_shape_, HloOpcode::kAdd, rng,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          vec1024_shape_, HloOpcode::kAdd, tanh, add_1))));
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  auto count_rngs = [](const HloComputation* computation) {
+    int64 rng_count = 0;
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kRng) {
+        ++rng_count;
+      }
+    }
+    return rng_count;
+  };
+  // Before rematerialization there should be a single broadcast rng in
+  // the graph.
+  ASSERT_EQ(count_rngs(entry_computation), 1);
+  const int64 original_instruction_count =
+      entry_computation->instruction_count();
+  SequentialHloOrdering::HloModuleSequence sequence;
+  // Pick a memory limit some where between 24KB (initial peak memory including
+  // parameter and output) and 20KB (peak memory possible with
+  // rematerialization).
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool changed, HloRematerialization::RematerializeAndSchedule(
+                        ByteSizeOf,
+                        /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_),
+                        module.get(), &sequence));
+  EXPECT_TRUE(changed);
+  // The rng should not have been rematerialized.
+  EXPECT_EQ(count_rngs(entry_computation), 1);
+  // There should have been rematerialization.
+  EXPECT_GT(entry_computation->instruction_count(), original_instruction_count);
+}
+
 TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Test that a single instruction is rematerialized several times. Module:
   //
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index c3f74e253f7a7882ec1c72e0ce634017dd2f0957..4a7caf3ebd81e4ca81400c67aa29a6a10bfe59d8 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#define EIGEN_USE_THREADS
 
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
 
@@ -19,8 +20,6 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#define EIGEN_USE_THREADS
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -30,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/platform/logging.h"
@@ -40,11 +40,29 @@ namespace se = ::perftools::gputools;
 namespace xla {
 
 /*static*/ StatusOr<std::unique_ptr<HloModule>>
-HloRunner::ReadModuleFromHloProtoFile(const char* filename,
+HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
+                                  const DebugOptions& debug_options) {
+  HloModuleConfig config;
+  config.set_debug_options(debug_options);
+  return tools::Parse(hlo_string, config);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromHloProtoFile(const std::string& filename,
                                       const DebugOptions& debug_options) {
   HloProto proto;
-  TF_RETURN_IF_ERROR(tensorflow::ReadBinaryProto(tensorflow::Env::Default(),
-                                                 filename, &proto));
+
+  const Status s =
+      tensorflow::ReadBinaryProto(tensorflow::Env::Default(), filename, &proto);
+
+  if (!s.ok()) {
+    const Status s2 =
+        tensorflow::ReadTextProto(tensorflow::Env::Default(), filename, &proto);
+    if (!s2.ok()) {
+      return Status(s2.code(), s.error_message() + "\n" + s2.error_message());
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(
       HloModuleConfig config,
       HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
@@ -54,6 +72,30 @@ HloRunner::ReadModuleFromHloProtoFile(const char* filename,
   return std::move(module);
 }
 
+/*static*/ StatusOr<std::unique_ptr<HloModule>>
+HloRunner::ReadModuleFromHloTextDumpFile(const std::string& filename,
+                                         const DebugOptions& debug_options) {
+  string hlo_string;
+  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
+                                                  filename, &hlo_string));
+  HloModuleConfig config;
+  config.set_debug_options(debug_options);
+  return tools::Parse(hlo_string, config);
+}
+
+/*static*/ StatusOr<std::unique_ptr<HloModule>> HloRunner::ReadModule(
+    const std::string& filename, const DebugOptions& debug_options) {
+  auto module = HloRunner::ReadModuleFromHloProtoFile(filename, debug_options);
+  if (module.ok()) {
+    return module;
+  }
+  const std::string e = module.status().error_message();
+  module = HloRunner::ReadModuleFromHloTextDumpFile(filename, debug_options);
+  return module.ok() ? std::move(module)
+                     : Status(module.status().code(),
+                              e + "\n" + module.status().error_message());
+}
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
 struct HloRunner::EigenThreadPoolWrapper {
@@ -80,11 +122,16 @@ HloRunner::~HloRunner() {
 StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
-    Shape* result_shape) {
+    Shape* result_shape, bool run_hlo_passes) {
+  if (run_hlo_passes) {
+    TF_ASSIGN_OR_RETURN(
+        module, backend().compiler()->RunHloPasses(
+                    std::move(module), backend().default_stream_executor()));
+  }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend().compiler()->Compile(std::move(module),
-                                    backend().default_stream_executor()));
+      backend().compiler()->RunBackend(std::move(module),
+                                       backend().default_stream_executor()));
 
   se::Stream stream(backend().default_stream_executor());
   stream.Init();
@@ -96,14 +143,13 @@ StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
   run_options.set_intra_op_thread_pool(
       backend().eigen_intra_op_thread_pool_device());
 
-  HloExecutionProfile hlo_execution_profile;
   ServiceExecutableRunOptions service_run_options(
       run_options, backend().StreamBorrower(),
       backend().inter_op_thread_pool());
   TF_ASSIGN_OR_RETURN(
       se::DeviceMemoryBase result,
       executable->ExecuteOnStream(&service_run_options, arguments,
-                                  &hlo_execution_profile));
+                                  /*hlo_execution_profile=*/nullptr));
   TF_RET_CHECK(stream.BlockHostUntilDone());
 
   allocations_.push_back(result);
@@ -160,10 +206,12 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::TransferFromDevice(
 
 StatusOr<std::unique_ptr<Literal>> HloRunner::ExecuteAndTransfer(
     std::unique_ptr<HloModule> module,
-    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
+    tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
+    bool run_hlo_passes) {
   Shape result_shape;
-  TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase device_base,
-                      Execute(std::move(module), arguments, &result_shape));
+  TF_ASSIGN_OR_RETURN(
+      se::DeviceMemoryBase device_base,
+      Execute(std::move(module), arguments, &result_shape, run_hlo_passes));
   return TransferFromDevice(result_shape, device_base);
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index a4d7b653dbfbfdb169c07bca3e461147fd9d077a..a65c66fd4b6db858a532096a5ee466aa9bf0d844 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -35,7 +35,8 @@ namespace xla {
 
 // A base class for running an HloModule. This executes the given HloModule on a
 // certain backend directly without using the client interface. HloModule can be
-// explicitly built, or loaded from a serialization file (e.g., hlo proto file).
+// explicitly built, or loaded from a serialization file (e.g., hlo proto
+// file), or parsed from a hlo textual IR string.
 class HloRunner {
  public:
   HloRunner();
@@ -44,25 +45,48 @@ class HloRunner {
 
   ~HloRunner();
 
-  // Reads the binary proto file in xla.HloProto format, creates and returns the
-  // HloModule.
+  // Converts an HloModule from the given hlo textual IR string (in
+  // HloModule::ToString format).
+  static StatusOr<std::unique_ptr<HloModule>> CreateModuleFromString(
+      const tensorflow::StringPiece hlo_string,
+      const DebugOptions& debug_options);
+
+  // Reads the proto file in xla.HloProto format, creates and returns the
+  // HloModule. Will try to parse the filename as binary proto, then try as
+  // text proto if that fails.
   static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloProtoFile(
-      const char* filename, const DebugOptions& debug_options);
+      const std::string& filename, const DebugOptions& debug_options);
+
+  // Reads the hlo text dump file in HloModule::ToString format, creates and
+  // returns the HloModule.
+  static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextDumpFile(
+      const std::string& filename, const DebugOptions& debug_options);
+
+  // Tries to parse the filename specified first as binary proto format, then
+  // as a textual proto format, then textual IR, then gives up if both fail.
+  // ReadModuleFromHloProtoFile or ReadModuleFromHloTextDumpFile should be used
+  // explicitly when you know the format, this if you don't.
+  static StatusOr<std::unique_ptr<HloModule>> ReadModule(
+      const std::string& filename, const DebugOptions& debug_options);
 
   // Executes the given module with given literals as input and returns the
   // result as a Literal. The LiteralPtr type accepts Literal* or
   // std::unique_ptr<Literal>.
+  //
+  // If run_hlo_passes is false, the module will be executed without Hlo
+  // optimization.
   template <typename LiteralPtr>
   StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<LiteralPtr> literals);
+      const tensorflow::gtl::ArraySlice<LiteralPtr> literals,
+      bool run_hlo_passes = true);
 
   // Executes the given module and returns a global data handle.
   StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           arguments,
-      Shape* result_shape);
+      Shape* result_shape, bool run_hlo_passes = true);
 
   // Transfers the given literal to the device and returns the data handle.
   StatusOr<perftools::gputools::DeviceMemoryBase> TransferToDevice(
@@ -77,7 +101,8 @@ class HloRunner {
   StatusOr<std::unique_ptr<Literal>> ExecuteAndTransfer(
       std::unique_ptr<HloModule> module,
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          arguments);
+          arguments,
+      bool run_hlo_passes = true);
 
   // If backend is not created in the constructor, creates and returns the
   // default backend. If creation fails, crashes the program.
@@ -99,14 +124,15 @@ class HloRunner {
 template <typename LiteralPtr>
 StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     std::unique_ptr<HloModule> module,
-    const tensorflow::gtl::ArraySlice<LiteralPtr> literals) {
+    const tensorflow::gtl::ArraySlice<LiteralPtr> literals,
+    bool run_hlo_passes) {
   std::vector<perftools::gputools::DeviceMemoryBase> arguments;
   for (const auto& literal : literals) {
     TF_ASSIGN_OR_RETURN(perftools::gputools::DeviceMemoryBase argument,
                         TransferToDevice(*literal));
     arguments.push_back(argument);
   }
-  return ExecuteAndTransfer(std::move(module), arguments);
+  return ExecuteAndTransfer(std::move(module), arguments, run_hlo_passes);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc
index 8ccbcaeee4a9c9e94b344231953e20ac8f4b2053..0dc17392f1f520a415083c92b51db9d9abb321c0 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc
@@ -31,6 +31,8 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
+using ::tensorflow::strings::HumanReadableNumBytes;
+
 namespace xla {
 
 StatusOr<int64> MinimumMemoryForSequence(
@@ -375,6 +377,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
   // Note that this is just a heuristic. One obvious inaccuracy is that the
   // memory required for sub-computations might be different when considered
   // within the caller's context. But it's good enough for now.
+  VLOG(2) << "Computation: " << computation.name();
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> list_sequence,
       ListScheduler::Run(computation, points_to_analysis, size_function));
@@ -382,7 +385,7 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       const int64 list_memory,
       MinimumMemoryForComputation(computation, list_sequence,
                                   points_to_analysis, size_function));
-  VLOG(2) << "Min-memory list sequence: " << list_memory << " bytes";
+  VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
   TF_ASSIGN_OR_RETURN(
       std::vector<const HloInstruction*> dfs_sequence,
@@ -391,13 +394,15 @@ StatusOr<std::vector<const HloInstruction*>> CreateMemoryMinimizingSequence(
       const int64 dfs_memory,
       MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis,
                                   size_function));
-  VLOG(2) << "Min-memory dfs sequence: " << dfs_memory << " bytes";
+  VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
   if (list_memory <= dfs_memory) {
-    VLOG(2) << "Chose min-memory list sequence: " << list_memory << " bytes";
+    VLOG(2) << "Chose min-memory list sequence: "
+            << HumanReadableNumBytes(list_memory);
     return list_sequence;
   } else {
-    VLOG(2) << "Chose min-memory dfs sequence: " << dfs_memory << " bytes";
+    VLOG(2) << "Chose min-memory dfs sequence: "
+            << HumanReadableNumBytes(dfs_memory);
     return dfs_sequence;
   }
 }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 0d019d22f5d4cd401c0fc5572f99636dec4f7383..447c2446668253c932b44b51b2db22bfd47f9957 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace xla {
 
@@ -38,6 +39,15 @@ HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) {
 }
 
 string HloSharding::ToString() const {
+  if (IsTuple()) {
+    std::vector<string> parts;
+    parts.reserve(tuple_elements_.size());
+    for (const HloSharding& element : tuple_elements_) {
+      parts.push_back(element.ToString());
+    }
+    return StrCat("{", tensorflow::str_util::Join(parts, ", "), "}");
+  }
+
   string result = StrCat("{", (replicated_ ? " replicated" : ""),
                          (maximal_ ? " maximal" : ""));
 
@@ -53,6 +63,11 @@ string HloSharding::ToString() const {
 }
 
 bool HloSharding::UsesDevice(int64 device) const {
+  if (IsTuple()) {
+    return std::any_of(
+        tuple_elements_.begin(), tuple_elements_.end(),
+        [&](const HloSharding& s) { return s.UsesDevice(device); });
+  }
   const auto& devices = tile_assignment_;
   return replicated_ ||
          std::find(devices.begin(), devices.end(), device) != devices.end();
@@ -61,6 +76,7 @@ bool HloSharding::UsesDevice(int64 device) const {
 std::vector<int64> HloSharding::TileIndexForDevice(int64 device) const {
   CHECK(!ShapeUtil::IsTuple(tile_shape_));
   CHECK(!maximal_);
+  CHECK(!IsTuple());
   std::vector<int64> ret_index;
   tile_assignment_.Each([&](tensorflow::gtl::ArraySlice<int64> index, int64 d) {
     if (d == device) {
@@ -74,6 +90,7 @@ std::vector<int64> HloSharding::TileIndexForDevice(int64 device) const {
 int64 HloSharding::DeviceForTileIndex(
     tensorflow::gtl::ArraySlice<int64> index) const {
   CHECK(!replicated_);
+  CHECK(!IsTuple());
   if (maximal_) {
     return *tile_assignment_.begin();
   }
@@ -82,7 +99,7 @@ int64 HloSharding::DeviceForTileIndex(
 }
 
 std::vector<int64> HloSharding::TileOffsetForDevice(int64 device) const {
-  CHECK(!ShapeUtil::IsTuple(tile_shape_));
+  CHECK(!IsTuple());
 
   std::vector<int64> index = TileIndexForDevice(device);
   if (maximal_) {
@@ -97,7 +114,7 @@ std::vector<int64> HloSharding::TileOffsetForDevice(int64 device) const {
 }
 
 std::vector<int64> HloSharding::TileLimitForDevice(int64 device) const {
-  CHECK(!ShapeUtil::IsTuple(tile_shape_));
+  CHECK(!IsTuple());
   CHECK(!maximal_);  // Maximal shardings do not have a valid tile shape.
 
   std::vector<int64> index = TileIndexForDevice(device);
@@ -108,14 +125,94 @@ std::vector<int64> HloSharding::TileLimitForDevice(int64 device) const {
 }
 
 StatusOr<int64> HloSharding::UniqueDevice() const {
-  if (!replicated_ && maximal_) {
+  if (IsTuple()) {
+    if (tuple_elements_.empty()) {
+      return tensorflow::errors::InvalidArgument(
+          "UniqueDevice() called on empty tuple");
+    }
+    std::vector<StatusOr<int64>> results;
+    std::transform(tuple_elements_.begin(), tuple_elements_.end(),
+                   std::back_inserter(results),
+                   [](const HloSharding& s) { return s.UniqueDevice(); });
+    if (std::all_of(results.begin(), results.end(),
+                    [&](const StatusOr<int64>& s) {
+                      return s.ok() && results[0].ok() &&
+                             s.ValueOrDie() == results[0].ValueOrDie();
+                    })) {
+      return results[0];
+    } else {
+      return tensorflow::errors::InvalidArgument(
+          "Tuple did not contain a unique device");
+    }
+  }
+  if (!replicated_ && maximal_ && !IsTuple()) {
     return static_cast<int64>(*tile_assignment_.begin());
   }
   return tensorflow::errors::InvalidArgument(
       "UniqueDevice() called on sharding that executes on multiple devices");
 }
 
+bool HloSharding::HasUniqueDevice() const {
+  if (IsTuple()) {
+    return UniqueDevice().status().ok();
+  } else {
+    return !IsReplicated() && IsTileMaximal();
+  }
+}
+
+Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const {
+  if (!ShapeUtil::IsTuple(shape)) {
+    return tensorflow::errors::InvalidArgument(
+        StrCat("Sharding is tuple-shaped but validation shape is not."));
+  }
+  // The easiest way to get the number of elements in a nested tuple is just to
+  // create a shape tree. We could call GetAsShapeTree, but that will try and
+  // apply our tuple_shardings_ to the shape tree, and that might cause a crash
+  // at this point as we haven't validated them.
+  ShapeTree<bool> bool_shape_tree(shape, false);
+  int64 num_leaves =
+      std::distance(bool_shape_tree.leaf_begin(), bool_shape_tree.leaf_end());
+  if (num_leaves != tuple_elements_.size()) {
+    return tensorflow::errors::InvalidArgument(
+        StrCat("Validation tuple shape has ", num_leaves,
+               " leaf elements, but this sharding contains ",
+               tuple_elements_.size(), " elements."));
+  }
+
+  // Now we've validated the number of tuple elements, it's safe to request a
+  // shape tree.
+  ShapeTree<HloSharding> shape_tree = GetAsShapeTree(shape);
+  for (const auto& index_to_sharding : shape_tree.leaves()) {
+    Status status = index_to_sharding.second.ValidateNonTuple(
+        ShapeUtil::GetSubshape(shape, index_to_sharding.first), num_devices);
+    if (!status.ok()) {
+      tensorflow::errors::AppendToMessage(
+          &status, StrCat("Note: While validating sharding tuple element ",
+                          index_to_sharding.first.ToString(), " which is ",
+                          index_to_sharding.second.ToString()));
+      return status;
+    }
+  }
+  return Status::OK();
+}
+
 Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
+  Status status = IsTuple() ? ValidateTuple(shape, num_devices)
+                            : ValidateNonTuple(shape, num_devices);
+  if (!status.ok()) {
+    tensorflow::errors::AppendToMessage(
+        &status, StrCat("Note: While validating sharding ", ToString(),
+                        " against shape ", ShapeUtil::HumanString(shape)));
+  }
+  return status;
+}
+
+Status HloSharding::ValidateNonTuple(const Shape& shape,
+                                     int64 num_devices) const {
+  if (ShapeUtil::IsTuple(shape)) {
+    return tensorflow::errors::InvalidArgument(
+        StrCat("Validation shape is a tuple but sharding is not."));
+  }
   if (replicated_) {
     return Status::OK();
   }
@@ -129,13 +226,11 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
         // Don't overwrite a bad status, so we report the first error.
         if (status.ok()) {
           if (core >= num_devices) {
-            status =
-                tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-                    "core ", core, " > ", num_devices, " in tile assignment"));
+            status = tensorflow::errors::InvalidArgument(StrCat(
+                "core ", core, " > ", num_devices, " in tile assignment"));
           } else if (seen_cores.count(core) != 0) {
-            status =
-                tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-                    "core ", core, " is not unique in tile assignment"));
+            status = tensorflow::errors::InvalidArgument(
+                StrCat("core ", core, " is not unique in tile assignment"));
           }
         }
         seen_cores.insert(core);
@@ -151,7 +246,8 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
   // The tile rank must be the same as the input rank.
   if (ShapeUtil::Rank(shape) != ShapeUtil::Rank(tile_shape_)) {
     return tensorflow::errors::InvalidArgument(
-        "Tile rank is different to the input rank");
+        "Tile rank is different to the input rank. sharding=", ToString(),
+        ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
   // The tile shape must not be the same as the input shape without maximal_
@@ -169,9 +265,9 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
     auto tile_dim = tile_shape_.dimensions(i);
     auto shape_dim = shape.dimensions(i);
     if (tile_dim > shape_dim) {
-      return tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-          "Tile is larger than input shape (dimension ", i, ", ", tile_dim,
-          " > ", shape_dim));
+      return tensorflow::errors::InvalidArgument(
+          StrCat("Tile is larger than input shape (dimension ", i, ", ",
+                 tile_dim, " > ", shape_dim));
     }
   }
 
@@ -181,10 +277,10 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
     int64 expected_dim =
         CeilOfRatio(shape.dimensions(i), tile_shape_.dimensions(i));
     if (tile_assignment_.dimensions()[i] != expected_dim) {
-      return tensorflow::errors::InvalidArgument(tensorflow::strings::StrCat(
-          "Tile assignment tensor has incorrect shape. Dimension ", i,
-          " expected ", expected_dim, " but got ",
-          tile_assignment_.dimensions()[i]));
+      return tensorflow::errors::InvalidArgument(
+          StrCat("Tile assignment tensor has incorrect shape. Dimension ", i,
+                 " expected ", expected_dim, " but got ",
+                 tile_assignment_.dimensions()[i]));
     }
   }
 
@@ -193,9 +289,19 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
 
 /*static*/ StatusOr<HloSharding> HloSharding::FromProto(
     const OpSharding& proto) {
-  if (proto.type() == OpSharding::Type::OpSharding_Type_REPLICATED) {
+  if (proto.type() == OpSharding::Type::OpSharding_Type_TUPLE) {
+    std::vector<HloSharding> tuple_shardings;
+    tuple_shardings.reserve(proto.tuple_shardings().size());
+    for (const OpSharding& tuple_sharding_proto : proto.tuple_shardings()) {
+      TF_ASSIGN_OR_RETURN(HloSharding sharding,
+                          HloSharding::FromProto(tuple_sharding_proto));
+      tuple_shardings.push_back(sharding);
+    }
+    return HloSharding(tuple_shardings);
+  } else if (proto.type() == OpSharding::Type::OpSharding_Type_REPLICATED) {
     return Replicate();
-  } else if (proto.type() == OpSharding::Type::OpSharding_Type_MAXIMAL) {
+  } else if (proto.type() == OpSharding::Type::OpSharding_Type_MAXIMAL ||
+             proto.tile_assignment_devices().size() == 1) {
     return HloSharding(proto.tile_assignment_devices(0));
   }
   // Some versions of gcc cannot infer the TileAssignment constructor from a
@@ -212,6 +318,15 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
 
 OpSharding HloSharding::ToProto() const {
   OpSharding result;
+
+  if (IsTuple()) {
+    for (const HloSharding& element : tuple_elements_) {
+      *result.add_tuple_shardings() = element.ToProto();
+    }
+    result.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
+    return result;
+  }
+
   *result.mutable_tile_shape() = tile_shape_;
   for (int64 dim : tile_assignment_.dimensions()) {
     result.add_tile_assignment_dimensions(dim);
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index d7ada30c70bc3b41b3117375380eac2e883d9a9d..7263198385cf0c84b1dac1e15177dcac99adaafb 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -67,6 +68,29 @@ class HloSharding {
   // `num_tiles` tiles.
   static HloSharding Tile1D(const Shape& input_shape, int64 num_tiles);
 
+  // Creates a new sharding for a tuple type. The given ShapeTree must have
+  // elements for every leaf shape contained in the tuple.
+  static HloSharding Tuple(const ShapeTree<HloSharding>& sub_shardings) {
+    std::vector<HloSharding> flattened_list;
+    flattened_list.reserve(
+        std::distance(sub_shardings.leaf_begin(), sub_shardings.leaf_end()));
+    for (const auto& index_to_sharding : sub_shardings.leaves()) {
+      flattened_list.push_back(index_to_sharding.second);
+    }
+    return HloSharding(flattened_list);
+  }
+
+  // Creates a new sharding for a tuple type. The requested tuple shape must not
+  // be nested. For nested tuples, use the ShapeTree overload.
+  static HloSharding Tuple(const Shape& tuple_shape,
+                           tensorflow::gtl::ArraySlice<HloSharding> shardings) {
+    CHECK(ShapeUtil::IsTuple(tuple_shape));
+    CHECK(!ShapeUtil::IsNestedTuple(tuple_shape));
+    std::vector<HloSharding> flattened_list(shardings.begin(), shardings.end());
+    CHECK_EQ(flattened_list.size(), ShapeUtil::TupleElementCount(tuple_shape));
+    return HloSharding(flattened_list);
+  }
+
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
@@ -76,47 +100,93 @@ class HloSharding {
   // Validate that this sharding can be applied to a tensor with shape `shape`.
   Status Validate(const Shape& shape, int64 num_devices) const;
 
+  // Returns true if the sharding has tuple type.
+  bool IsTuple() const { return tuple_; }
+
   // Returns true if the sharding is trivial: replicate on all devices.
-  bool IsReplicated() const { return replicated_; }
+  bool IsReplicated() const {
+    if (!IsTuple()) {
+      return replicated_;
+    }
+    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
+                       [](const HloSharding& s) { return s.IsReplicated(); });
+  }
 
   // Returns true if the tile size is the same as the input size.
-  bool IsTileMaximal() const { return maximal_; }
+  bool IsTileMaximal() const {
+    if (!IsTuple()) {
+      return maximal_;
+    }
+    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
+                       [](const HloSharding& s) { return s.IsTileMaximal(); });
+  }
 
   // Returns true if the sharding defines an operation on the given device.
   bool UsesDevice(int64 device) const;
 
   // Returns the tile that should be executed on the given device.
+  // REQUIRES: !IsTuple()
   std::vector<int64> TileIndexForDevice(int64 device) const;
 
   // Returns the device that should execute the given tile.
   // It is an error to call this if is_replicated() is true.
+  // REQUIRES: !IsTuple()
   int64 DeviceForTileIndex(tensorflow::gtl::ArraySlice<int64> index) const;
 
   // Given a device ID, returns the offset within the input space of the
   // tile that should be executed on the given core. This returns the lower
   // extent of the tile in the input space.
+  // REQUIRES: !IsTuple()
   std::vector<int64> TileOffsetForDevice(int64 device) const;
 
   // Given a device ID, returns the limit within the input space of the
   // tile that should be executed on the given core. This returns the upper
   // extent of the tile in the input space.
+  // REQUIRES: !IsTuple()
   std::vector<int64> TileLimitForDevice(int64 device) const;
 
   // Returns the single device this op operates on.
-  // Requires !Replicated() && IsTileMaximal().
+  // REQUIRES: !IsTuple&& !Replicated() && IsTileMaximal()
   StatusOr<int64> UniqueDevice() const;
 
   // Returns true if this op only uses a single device.
-  bool HasUniqueDevice() const { return !IsReplicated() && IsTileMaximal(); }
+  bool HasUniqueDevice() const;
+
+  // Returns the ShapeTree containing the shardings for each element of this
+  // tuple, if IsTuple, or a ShapeTree with a single element containing this
+  // sharding. Only the leaf elements are populated. This creates a new
+  // ShapeTree object so is not cheap.
+  ShapeTree<HloSharding> GetAsShapeTree(const Shape& shape) const {
+    if (IsTuple()) {
+      ShapeTree<HloSharding> result(shape, HloSharding::Replicate());
+      CHECK_EQ(std::distance(result.leaf_begin(), result.leaf_end()),
+               tuple_elements_.size());
+      auto it = tuple_elements_.begin();
+      for (auto& index_to_sharding : result.leaves()) {
+        index_to_sharding.second = *it++;
+      }
+      return result;
+    } else {
+      return ShapeTree<HloSharding>(shape, *this);
+    }
+  }
 
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
            protobuf_util::ProtobufEquals(tile_shape_, other.tile_shape_) &&
-           tile_assignment_ == other.tile_assignment_;
+           tile_assignment_ == other.tile_assignment_ &&
+           tuple_elements_ == other.tuple_elements_;
   }
   bool operator!=(const HloSharding& other) const { return !(*this == other); }
 
   size_t Hash() const {
+    if (!tuple_) {
+      size_t h = 0;
+      for (const auto& element : tuple_elements_) {
+        h = tensorflow::Hash64Combine(h, element.Hash());
+      }
+      return h;
+    }
     if (replicated_) {
       return 0;
     }
@@ -131,33 +201,52 @@ class HloSharding {
   }
 
   // Gets the tile shape.
-  // It is an error to call this if IsTileMaximal() is true.
+  // REQUIRES: !IsTileMaximal() && !IsTuple()
   const Shape& tile_shape() const { return tile_shape_; }
   // Gets the tile assignment tensor.
-  // It is an error to call this if IsReplicated() is true.
+  // REQUIRES: !IsReplicated() && !IsTuple()
   const Array<int64>& tile_assignment() const { return tile_assignment_; }
 
  private:
   HloSharding()
       : replicated_(true),
         maximal_(true),
+        tuple_(false),
         tile_shape_(),
         tile_assignment_({0}) {}
   explicit HloSharding(int64 device_id)
       : replicated_(false),
         maximal_(true),
+        tuple_(false),
         tile_shape_(),
         tile_assignment_({1}, device_id) {}
   HloSharding(const Shape& tile_shape, const Array<int64>& tile_assignment)
       : replicated_(false),
         maximal_(false),
+        tuple_(false),
         tile_shape_(tile_shape),
         tile_assignment_(tile_assignment) {}
+  HloSharding(const std::vector<HloSharding>& tuple_shardings)
+      : replicated_(false),
+        maximal_(false),
+        tuple_(true),
+        tile_assignment_({0}),
+        tuple_elements_(tuple_shardings) {}
+
+  // Internal helper to validate a tuple sharding.
+  Status ValidateTuple(const Shape& shape, int64 num_devices) const;
+  // Internal helper to validate a non-tuple (leaf) sharding.
+  Status ValidateNonTuple(const Shape& shape, int64 num_devices) const;
 
   bool replicated_;
   bool maximal_;
+  bool tuple_;
   Shape tile_shape_;
   Array<int64> tile_assignment_;
+  // Only non-empty when tuple_ is true, but because empty tuples are allowed
+  // may also be empty even then. This is a flattened list of all the leaf
+  // shardings in a tuple shape, by pre-order walk (ShapeTree iterator order).
+  std::vector<HloSharding> tuple_elements_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index d0a20471a0f22a5fa414b71bb5160eed7cdc431b..0c7487b3ac77ff181d44dd55ebcf2608feaf02ea 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -70,6 +70,11 @@ TEST_F(HloShardingTest, DevicePlacement) {
                                  /*num_devices=*/6));
   EXPECT_IS_NOT_OK(
       sharding.Validate(ShapeUtil::MakeShape(U32, {4}), /*num_devices=*/5));
+
+  ShapeTree<HloSharding> shape_tree =
+      sharding.GetAsShapeTree(ShapeUtil::MakeShape(U32, {4}));
+  EXPECT_EQ(shape_tree.element({}), sharding);
+  EXPECT_TRUE(shape_tree.IsLeaf({}));
 }
 
 TEST_F(HloShardingTest, Tile) {
@@ -132,6 +137,39 @@ TEST_F(HloShardingTest, Tile) {
   }
 }
 
+TEST_F(HloShardingTest, NestedTuple) {
+  // nested_tuple_shape = (f32[], (f32[3]), f32[4, 6])
+  Shape nested_tuple_shape = ShapeUtil::MakeTupleShape({
+      ShapeUtil::MakeShape(F32, {}),
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3})}),
+      ShapeUtil::MakeShape(F32, {4, 6}),
+  });
+
+  HloSharding tiled_sharding = HloSharding::Tile(
+      ShapeUtil::MakeShape(F32, {4, 3}), Array<int64>({{0, 1}}));
+  OpSharding proto;
+  proto.set_type(OpSharding::Type::OpSharding_Type_TUPLE);
+  *proto.add_tuple_shardings() = HloSharding::Replicate().ToProto();
+  *proto.add_tuple_shardings() = HloSharding::AssignDevice(0).ToProto();
+  *proto.add_tuple_shardings() = tiled_sharding.ToProto();
+  HloSharding tuple_sharding =
+      HloSharding::FromProto(proto).ConsumeValueOrDie();
+
+  ShapeTree<HloSharding> shape_tree =
+      tuple_sharding.GetAsShapeTree(nested_tuple_shape);
+  EXPECT_EQ(shape_tree.element({0}), HloSharding::Replicate());
+  EXPECT_EQ(shape_tree.element({1, 0}), HloSharding::AssignDevice(0));
+  EXPECT_EQ(shape_tree.element({2}), tiled_sharding);
+
+  EXPECT_IS_OK(tuple_sharding.Validate(nested_tuple_shape, /*num_devices=*/5));
+  // Test should fail because tuple element count does not match.
+  EXPECT_IS_NOT_OK(tuple_sharding.Validate(ShapeUtil::MakeTupleShape({}),
+                                           /*num_devices=*/5));
+  // Test should fail because the input type is not a tuple.
+  EXPECT_IS_NOT_OK(tuple_sharding.Validate(ShapeUtil::MakeShape(F32, {}),
+                                           /*num_devices=*/5));
+}
+
 TEST_F(HloShardingTest, Hash) {
   auto hash_compare_equal = [](const HloSharding& a, const HloSharding& b) {
     if (a.Hash() != b.Hash()) {
@@ -184,6 +222,51 @@ TEST_F(HloShardingTest, Hash) {
                                               MakeArray({2, 2}, {0, 3, 1, 2}));
     EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
   }
+
+  HloSharding default_sharding = HloSharding::Replicate();
+  {
+    ShapeTree<HloSharding> shape_tree(ShapeUtil::MakeTupleShape({}),
+                                      default_sharding);
+    HloSharding sharding1 = HloSharding::Replicate();
+    HloSharding sharding2 = HloSharding::Tuple(shape_tree);
+    EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
+  }
+
+  {
+    ShapeTree<HloSharding> shape_tree(ShapeUtil::MakeTupleShape({}),
+                                      default_sharding);
+    HloSharding sharding1 = HloSharding::Tuple(shape_tree);
+    HloSharding sharding2 = HloSharding::Tuple(shape_tree);
+    EXPECT_TRUE(hash_compare_equal(sharding1, sharding2));
+  }
+
+  {
+    ShapeTree<HloSharding> shape_tree1(
+        ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
+        default_sharding);
+    *shape_tree1.mutable_element({0}) = HloSharding::Replicate();
+    ShapeTree<HloSharding> shape_tree2(
+        ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
+        default_sharding);
+    *shape_tree2.mutable_element({0}) = HloSharding::AssignDevice(0);
+    HloSharding sharding1 = HloSharding::Tuple(shape_tree1);
+    HloSharding sharding2 = HloSharding::Tuple(shape_tree2);
+    EXPECT_FALSE(hash_compare_equal(sharding1, sharding2));
+  }
+
+  {
+    ShapeTree<HloSharding> shape_tree1(
+        ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
+        default_sharding);
+    *shape_tree1.mutable_element({0}) = HloSharding::AssignDevice(0);
+    ShapeTree<HloSharding> shape_tree2(
+        ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {4})}),
+        default_sharding);
+    *shape_tree2.mutable_element({0}) = HloSharding::AssignDevice(0);
+    HloSharding sharding1 = HloSharding::Tuple(shape_tree1);
+    HloSharding sharding2 = HloSharding::Tuple(shape_tree2);
+    EXPECT_TRUE(hash_compare_equal(sharding1, sharding2));
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
index 06abe007477dbcd00bcdc7f2656c4dece6d1cf74..101a710d1cad9401134fdfe1d0ec9df241bc01e1 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
@@ -58,8 +58,6 @@ TensorShapeProto GetTensorShape(const HloInstruction* instruction) {
 
 string GetDeviceName(int device) { return StrCat("/device/XLA:", device); }
 
-}  // namespace
-
 void CleanNodeName(string* name) {
   name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
   const string chars_to_replace = "<>[]";
@@ -70,6 +68,11 @@ void CleanNodeName(string* name) {
   std::replace_if(name->begin(), name->end(), pred, '_');
 }
 
+}  // namespace
+
+HloTfGraphBuilder::HloTfGraphBuilder(const DebugOptions& debug_options)
+    : debug_options_(debug_options) {}
+
 Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
   VLOG(2) << "Adding computation " << computation.name();
   for (auto embedded : computation.MakeEmbeddedComputationsList()) {
@@ -90,24 +93,38 @@ const string& HloTfGraphBuilder::GetNodeNameForInstruction(
   if (ContainsKey(instruction_to_node_name_, instruction)) {
     return instruction_to_node_name_[instruction];
   }
+  auto append = [](string* str, const string& other) {
+    if (str->empty()) {
+      *str = other;
+    } else if (!other.empty()) {
+      StrAppend(str, "/", other);
+    }
+  };
   string node_name;
+  if (debug_options_.xla_hlo_tfgraph_device_scopes() &&
+      instruction->has_sharding() &&
+      instruction->sharding().HasUniqueDevice()) {
+    node_name = StrCat(
+        "dev", instruction->sharding().UniqueDevice().ConsumeValueOrDie());
+  }
   // If an instruction is fused, put it in the subgraph of the fusion;
   // otherwise, put it in the computation subgraph.
   const HloComputation* computation = instruction->parent();
   if (computation->IsFusionComputation()) {
-    node_name = GetNodeNameForInstruction(computation->FusionInstruction());
+    append(&node_name,
+           GetNodeNameForInstruction(computation->FusionInstruction()));
   } else {
-    node_name = computation->name();
+    append(&node_name, computation->name());
     if (!instruction->metadata().op_name().empty()) {
       // Always make computations contain TF ops but not the other way around.
-      StrAppend(&node_name, "/", instruction->metadata().op_name());
+      append(&node_name, instruction->metadata().op_name());
     }
   }
   string instruction_name = instruction->name();
   if (instruction->opcode() == HloOpcode::kParameter) {
     StrAppend(&instruction_name, ".", instruction->parameter_number());
   }
-  StrAppend(&node_name, "/", instruction_name);
+  append(&node_name, instruction_name);
   CleanNodeName(&node_name);
   auto ret =
       instruction_to_node_name_.insert(std::make_pair(instruction, node_name));
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
index b2c578af912ac0b777d1bc72a198504735a6b845..9aa3e501d5f85e3b61b20555e3d13c5687f33f2f 100644
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
+++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 
@@ -26,6 +27,8 @@ namespace hlo_graph_dumper {
 // This constructs a tensorflow graph for HLO computations.
 class HloTfGraphBuilder {
  public:
+  HloTfGraphBuilder(const DebugOptions& debug_options = DebugOptions());
+
   // Adds a computation to the graph.
   Status AddComputation(const HloComputation& computation);
 
@@ -42,6 +45,7 @@ class HloTfGraphBuilder {
 
   Status AddInstruction(const HloInstruction* instruction);
 
+  DebugOptions debug_options_;
   tensorflow::GraphDef graph_def_;
   // This records instructions that have been visited.
   std::unordered_set<const HloInstruction*> visited_instructions_;
@@ -49,9 +53,6 @@ class HloTfGraphBuilder {
   std::unordered_map<const HloInstruction*, string> instruction_to_node_name_;
 };
 
-// Cleans the node name to make it a valid name in a tensorflow graph.
-void CleanNodeName(string* name);
-
 }  // namespace hlo_graph_dumper
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index e6cf0d37b8a0f42dc04cfaad067a4741bc803705..05b7dce3d1ecf935b80ba1cb46ef089b7b3b6f33 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -71,7 +71,7 @@ HloValue::HloValue(HloValue::Id id, HloInstruction* instruction,
                    const ShapeIndex& index, bool is_phi)
     : id_(id), is_phi_(is_phi) {
   // The defining position is always the first element in the positions_ vector.
-  AddPosition(instruction, index);
+  positions_.push_back(HloPosition{instruction, index});
 }
 
 bool HloValue::operator==(const HloValue& other) const {
@@ -130,18 +130,14 @@ bool MayUseOperandValue(int64 operand_number, const ShapeIndex& index,
       CHECK_LE(operand_number, 2);
       return operand_number == 0 || index.empty();
 
-    case HloOpcode::kCall:
     case HloOpcode::kTuple:
       // These instructions always pass through their operands transparently.
       return false;
 
+    case HloOpcode::kCall:
     case HloOpcode::kWhile:
-      // Though the while instructions passes through its operands, we return
-      // true because in SSA form there may be a Phi at the parameter of the
-      // while which is considered a use of its incoming value because the Phi
-      // input values are not passed through into the body computation. Because
-      // this function is used in both SSA and non-SSA forms of the analysis
-      // conservatively return true.
+      // Although call and while instructions pass through their operands, they
+      // are considered uses.
       return true;
 
     default:
@@ -151,103 +147,58 @@ bool MayUseOperandValue(int64 operand_number, const ShapeIndex& index,
 
 }  // namespace
 
-void HloValue::AddPosition(HloInstruction* instruction,
-                           const ShapeIndex& index) {
-  HloPosition new_position{instruction, index};
-
-  // The new position must not already exist in positions_.
-  for (const HloPosition& position : positions_) {
-    DCHECK_NE(position, new_position);
-  }
-
-  positions_.push_back(std::move(new_position));
-
-  // Update uses.
-  for (HloInstruction* user : instruction->users()) {
-    for (int64 operand_number : user->OperandIndices(instruction)) {
-      if (MayUseOperandValue(operand_number, index, user)) {
-        HloUse new_use{user, operand_number, index};
-
-        // The new use must not already exist in uses_.
-        for (const HloUse& use : uses_) {
-          DCHECK_NE(use, new_use);
-        }
-
-        uses_.push_back(std::move(new_use));
+void HloValue::SetPositionsAndComputeUses(
+    tensorflow::gtl::ArraySlice<HloPosition> positions) {
+  CHECK_EQ(positions_.size(), 1) << "SetPositions should only be called once.";
+
+  // The positions must be unique and should not contain the defining position
+  // as this is added at construction time.
+  for (const HloPosition& position_a : positions) {
+    DCHECK_NE(position_a, defining_position());
+    for (const HloPosition& position_b : positions) {
+      if (&position_a != &position_b) {
+        DCHECK_NE(position_a, position_b);
       }
     }
   }
 
-  // Update liveout status of this HloValue.
-  const HloModule& module = *instruction->parent()->parent();
-  if (instruction == module.entry_computation()->root_instruction()) {
-    live_out_of_module_ = true;
-  }
-
-  if (instruction == instruction->parent()->root_instruction()) {
-    live_out_of_computation_ = true;
-  }
-}
+  positions_.insert(positions_.end(), positions.begin(), positions.end());
 
-void HloValue::RemovePosition(HloInstruction* instruction,
-                              const ShapeIndex& index) {
-  // The defining position cannot be removed.
-  CHECK(!(instruction == defining_instruction() && index == defining_index()));
-
-  int64 size_before = positions_.size();
-  positions_.erase(
-      std::remove_if(positions_.begin(), positions_.end(),
-                     [instruction, &index](const HloPosition& position) {
-                       return position.instruction == instruction &&
-                              position.index == index;
-                     }),
-      positions_.end());
-  // Only a single position should have been removed.
-  CHECK_EQ(positions_.size(), size_before - 1);
-
-  //  Update uses which referred to this position.
-  uses_.erase(std::remove_if(uses_.begin(), uses_.end(),
-                             [instruction, &index](const HloUse& use) {
-                               return use.instruction->operand(
-                                          use.operand_number) == instruction &&
-                                      use.operand_index == index;
-                             }),
-              uses_.end());
-
-  // Returns whether this value is contained in the given instruction's output.
-  auto is_contained_in = [this](const HloInstruction* instruction) {
-    for (const HloPosition& position : positions()) {
-      if (position.instruction == instruction) {
-        return true;
-      }
+  // Gather the computation roots at which this value appears.
+  tensorflow::gtl::FlatSet<HloInstruction*> root_positions;
+  for (const HloPosition& position : positions_) {
+    if (position.instruction ==
+        position.instruction->parent()->root_instruction()) {
+      root_positions.insert(position.instruction);
     }
-    return false;
-  };
-
-  const HloModule& module = *instruction->parent()->parent();
-  if (instruction == module.entry_computation()->root_instruction()) {
-    // Value has been removed from a position in the entry root instruction.
-    live_out_of_module_ =
-        is_contained_in(module.entry_computation()->root_instruction());
-  }
-  if (instruction == defining_instruction()->parent()->root_instruction()) {
-    // Value has been removed from the root of the computation the value has
-    // been defined in.
-    live_out_of_computation_ =
-        is_contained_in(defining_instruction()->parent()->root_instruction());
   }
-}
 
-void HloValue::RecomputeUses() {
-  uses_.clear();
-  for (const HloPosition& position : positions()) {
+  // Build vector of HloUses for the value.
+  for (const HloPosition& position : positions_) {
     for (HloInstruction* user : position.instruction->users()) {
       for (int64 operand_number : user->OperandIndices(position.instruction)) {
-        if (MayUseOperandValue(operand_number, position.index, user)) {
-          uses_.push_back(HloUse{user, operand_number, position.index});
+        // Root instructions of computations are considered to be uses whether
+        // or not the root instruction itself actually uses the value.
+        if (MayUseOperandValue(operand_number, position.index, user) ||
+            ContainsKey(root_positions, user)) {
+          HloUse new_use{user, operand_number, position.index};
+
+          // The new use must not already exist in uses_.
+          for (const HloUse& use : uses_) {
+            DCHECK_NE(use, new_use);
+          }
+
+          uses_.push_back(std::move(new_use));
         }
       }
     }
+
+    // Update liveout status of this HloValue.
+    const HloModule& module = *position.instruction->parent()->parent();
+    if (position.instruction ==
+        module.entry_computation()->root_instruction()) {
+      live_out_of_module_ = true;
+    }
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h
index 6872bc76a82253b916e826aa1afabc3d309c1d12..2a711e8b42590c29d0aaab95dcf110063ada3182 100644
--- a/tensorflow/compiler/xla/service/hlo_value.h
+++ b/tensorflow/compiler/xla/service/hlo_value.h
@@ -121,6 +121,12 @@ class HloValue {
   HloValue(Id id, HloInstruction* instruction, const ShapeIndex& index,
            bool is_phi = false);
 
+  // Sets the positions in the module at which the HloValue appears. Updates
+  // uses. Should be called once and only once. The defining position should not
+  // be included in 'positions' as this is set at construction time.
+  void SetPositionsAndComputeUses(
+      tensorflow::gtl::ArraySlice<HloPosition> positions);
+
   // Return a unique identifier for this HloValue. This value is used for stable
   // sorting and iteration
   Id id() const { return id_; }
@@ -143,28 +149,15 @@ class HloValue {
   // Return the shape of this HloValue.
   const Shape& shape() const { return defining_position().shape(); }
 
-  // Add or remove a position at which the HloValue appears. The definition
-  // position can not be removed. The uses of the HloValue are updated.
-  void AddPosition(HloInstruction* instruction, const ShapeIndex& index);
-  void RemovePosition(HloInstruction* instruction, const ShapeIndex& index);
-
-  // Remove all positions except the defining position. Updates uses.
-  void ClearPositions();
-
   // Return all positions of the HloValue in the module.
   const std::vector<HloPosition>& positions() const { return positions_; }
 
   // Return all uses of the HloValue.
   const std::vector<HloUse>& uses() const { return uses_; }
 
-  void RecomputeUses();
-
   // Get whether this HloValue is live out of the module.
   bool live_out_of_module() const { return live_out_of_module_; }
 
-  // Get whether this HloValue is live out of the computation it is defined in.
-  bool live_out_of_computation() const { return live_out_of_computation_; }
-
   bool operator==(const HloValue& other) const;
   bool operator!=(const HloValue& other) const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index c1aa655401a2be68af943e2ed29c4ab99d341383..b8fd7a89efd4d86630eed1f29db5b7b1b7876d23 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -59,21 +59,27 @@ class ShapeVerifier : public DfsHloVisitor {
   }
 
   Status HandleConvert(HloInstruction* convert) override {
-    if (ShapeUtil::ElementIsComplex(convert->operand(0)->shape())) {
-      TF_RET_CHECK(ShapeUtil::ElementIsComplex(convert->shape()))
-          << "Unsupported complex->real kConvert";
-    }
     return CheckShape(convert, ShapeInference::InferConvertShape(
                                    convert->operand(0)->shape(),
                                    convert->shape().element_type()));
   }
 
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
+                                   convert->operand(0)->shape(),
+                                   convert->shape().element_type()));
+  }
+
   Status HandleCopy(HloInstruction* copy) override {
     return CheckUnaryShape(copy);
   }
 
   Status HandleDot(HloInstruction* dot) override {
-    return CheckBinaryShape(dot);
+    TF_ASSIGN_OR_RETURN(const Shape expected,
+                        ShapeInference::InferDotOpShape(
+                            dot->operand(0)->shape(), dot->operand(1)->shape(),
+                            dot->dot_dimension_numbers()));
+    return CheckShape(dot, expected);
   }
 
   Status HandleConvolution(HloInstruction* convolution) override {
@@ -87,8 +93,12 @@ class ShapeVerifier : public DfsHloVisitor {
   }
 
   Status HandleCrossReplicaSum(HloInstruction* crs) override {
-    return CheckShape(crs, ShapeInference::InferCrossReplicaSumShape(
-                               crs->operand(0)->shape()));
+    std::vector<const Shape*> operand_shapes;
+    for (const HloInstruction* operand : crs->operands()) {
+      operand_shapes.push_back(&operand->shape());
+    }
+    return CheckShape(
+        crs, ShapeInference::InferCrossReplicaSumShape(operand_shapes));
   }
 
   Status HandleReducePrecision(HloInstruction* reduce_precision) override {
@@ -141,9 +151,6 @@ class ShapeVerifier : public DfsHloVisitor {
   }
 
   Status HandleBitcast(HloInstruction* bitcast) override {
-    // Bitcasts can be any shape, as long as the size matches the operand size.
-    TF_RET_CHECK(shape_size_fn_(bitcast->shape()) ==
-                 shape_size_fn_(bitcast->operand(0)->shape()));
     return tensorflow::Status::OK();
   }
 
@@ -263,6 +270,15 @@ class ShapeVerifier : public DfsHloVisitor {
                       xla_while->while_body()->ComputeProgramShape().result());
   }
 
+  Status HandleConditional(HloInstruction* conditional) override {
+    TF_RETURN_IF_ERROR(CheckShape(
+        conditional,
+        conditional->true_computation()->ComputeProgramShape().result()));
+    return CheckShape(
+        conditional,
+        conditional->false_computation()->ComputeProgramShape().result());
+  }
+
   Status HandlePad(HloInstruction* pad) override {
     return CheckShape(pad,
                       ShapeInference::InferPadShape(pad->operand(0)->shape(),
@@ -270,12 +286,40 @@ class ShapeVerifier : public DfsHloVisitor {
                                                     pad->padding_config()));
   }
 
-  Status HandleSend(HloInstruction*) override {
-    return tensorflow::Status::OK();
+  Status HandleSend(HloInstruction* send) override {
+    TF_RET_CHECK(send->users().size() == 1);
+    const HloInstruction* send_done = send->users().front();
+    TF_RET_CHECK(send_done->opcode() == HloOpcode::kSendDone);
+    TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
+    return CheckShape(
+        send, ShapeUtil::MakeTupleShape(
+                  {send->operand(0)->shape(), ShapeUtil::MakeShape(U32, {})}));
   }
 
-  Status HandleRecv(HloInstruction*) override {
-    return tensorflow::Status::OK();
+  Status HandleSendDone(HloInstruction* send_done) override {
+    TF_RET_CHECK(send_done->operands().size() == 1);
+    const HloInstruction* send = send_done->operand(0);
+    TF_RET_CHECK(send->opcode() == HloOpcode::kSend);
+    TF_RETURN_IF_ERROR(CheckSameChannel(send, send_done));
+    return CheckShape(send_done, ShapeUtil::MakeNil());
+  }
+
+  Status HandleRecv(HloInstruction* recv) override {
+    TF_RET_CHECK(recv->users().size() == 1);
+    const HloInstruction* recv_done = recv->users().front();
+    TF_RET_CHECK(recv_done->opcode() == HloOpcode::kRecvDone);
+    TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
+    return CheckShape(recv,
+                      ShapeUtil::MakeTupleShape(
+                          {recv_done->shape(), ShapeUtil::MakeShape(U32, {})}));
+  }
+
+  Status HandleRecvDone(HloInstruction* recv_done) override {
+    TF_RET_CHECK(recv_done->operands().size() == 1);
+    const HloInstruction* recv = recv_done->operand(0);
+    TF_RET_CHECK(recv->opcode() == HloOpcode::kRecv);
+    TF_RETURN_IF_ERROR(CheckSameChannel(recv, recv_done));
+    return CheckShape(recv_done, recv->shape().tuple_shapes(0));
   }
 
   Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override {
@@ -365,6 +409,19 @@ class ShapeVerifier : public DfsHloVisitor {
                           instruction->opcode(), instruction->operands()));
   }
 
+  // Checks if the given two instructions shares the same channel id.
+  Status CheckSameChannel(const HloInstruction* instr1,
+                          const HloInstruction* instr2) {
+    if (instr1->channel_id() != instr2->channel_id()) {
+      return FailedPrecondition(
+          "Expected to have the same channel id, actual channel ids are: %s "
+          "(%lld), %s (%lld)",
+          instr1->ToString().c_str(), instr1->channel_id(),
+          instr2->ToString().c_str(), instr2->channel_id());
+    }
+    return tensorflow::Status::OK();
+  }
+
   // Returns the size of a Shape in bytes.
   const std::function<int64(const Shape&)> shape_size_fn_;
 };
@@ -530,7 +587,7 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
         // or ComputationLowerer::Visit()
         TF_RET_CHECK(instruction->dimensions().size() ==
                      ShapeUtil::Rank(instruction->operand(0)->shape()))
-                << "Broadcast HLO has invalid number of dimensions.";
+            << "Broadcast HLO has invalid number of dimensions.";
       } else if (instruction->opcode() == HloOpcode::kWhile) {
         auto* while_cond = instruction->while_condition();
         auto* while_body = instruction->while_body();
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index d620f45d27eba706fbd7fc30d3b27b0d963475d4..b7c40fdeeb157fc74900bd9cf9d68a06a2cb1d56 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -68,12 +68,20 @@ string HumanReadableProfileBuilder::ToString() const {
   };
 
   float optimal_seconds_sum = 0.0;
+  int64 total_flops = 0.;
+  int64 total_transcendentals = 0.;
+  int64 total_bytes = 0;
   for (const auto& op : op_infos_) {
     optimal_seconds_sum += op.optimal_seconds;
+    total_flops += op.flop_count;
+    total_transcendentals += op.transcendental_count;
+    total_bytes += op.bytes_accessed;
   }
 
-  append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1, -1,
-             optimal_seconds_sum});
+  VLOG(1) << "Total floating point ops: " << total_flops;
+
+  append_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops,
+             total_transcendentals, total_bytes, optimal_seconds_sum});
 
   // Sort ops in decreasing order of cycles.
   std::vector<OpInfo> sorted_ops(op_infos_);
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 0d1b7bc109c56bc4290ede09284c6d20142bdb08..ba901b99e4f3c72c84c1ecdf4e19e58ad9ab6506 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -33,7 +33,9 @@ namespace xla {
   switch (instruction.opcode()) {
     // Cheap instructions.
     case HloOpcode::kAdd:
+    case HloOpcode::kAnd:
     case HloOpcode::kBitcast:
+    case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
@@ -53,15 +55,14 @@ namespace xla {
     case HloOpcode::kInfeed:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
-    case HloOpcode::kAnd:
-    case HloOpcode::kNot:
-    case HloOpcode::kOr:
     case HloOpcode::kLt:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
     case HloOpcode::kMultiply:
     case HloOpcode::kNe:
     case HloOpcode::kNegate:
+    case HloOpcode::kNot:
+    case HloOpcode::kOr:
     case HloOpcode::kOutfeed:
     case HloOpcode::kPad:
     case HloOpcode::kReal:
@@ -88,10 +89,11 @@ namespace xla {
 
     // Expensive instructions.
     case HloOpcode::kAtan2:
-    case HloOpcode::kBatchNormTraining:
-    case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kBatchNormInference:
+    case HloOpcode::kBatchNormTraining:
     case HloOpcode::kCall:
+    case HloOpcode::kConditional:
     case HloOpcode::kConvolution:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
@@ -103,17 +105,19 @@ namespace xla {
     case HloOpcode::kMap:
     case HloOpcode::kParameter:
     case HloOpcode::kPower:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kRemainder:
     case HloOpcode::kRng:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kSort:
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
-    case HloOpcode::kSend:
-    case HloOpcode::kRecv:
       return true;
   }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index b273f091f148ad2155067782a51adb41ae557797..2704a805a91b93c69b751cdb61305ea7780f0ef2 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -52,8 +52,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:inliner",
         "//tensorflow/compiler/xla/service:layout_assignment",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:while_loop_simplifier",
         "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
     ],
     alwayslink = True,  # Contains compiler registration
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 93ea2f736742eab06ee0d7e881ee7c51daee9878..dc63a2224d659fa427d4d1a30c5dc0f94d643b36 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/executable.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -56,6 +57,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
 
   pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
       false, [](const Shape&, const Shape&) { return false; });
+  pipeline.AddPass<WhileLoopSimplifier>();
   pipeline.AddPass<ReshapeMover>();
   pipeline.AddPass<HloConstantFolding>();
   pipeline.AddPass<HloCSE>(true);
@@ -67,13 +69,19 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   return pipeline.Run(hlo_module).status();
 }
 
-StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::Compile(
+StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
+    std::unique_ptr<HloModule> hlo_module,
+    se::StreamExecutor* /*stream_exec*/) {
+  VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
+  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
+  return std::move(hlo_module);
+}
+
+StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
     std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
   TF_RET_CHECK(stream_exec != nullptr);
 
-  VLOG(1) << "Generate graph " << hlo_module->name();
-
-  TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
+  VLOG(1) << "Run backend " << hlo_module->name();
 
   // Typically you would visit the HLO graph, building up a compiled equivalent
   // In this case we are using an HloEvaluator at execution time, so we don't
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h
index cfdc9b6256569b0137784b0d1db846a5f2339a5d..278cf5184227ae25518b1d46c0e16e4cce7bd1a8 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.h
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.h
@@ -43,8 +43,12 @@ class InterpreterCompiler : public Compiler {
   InterpreterCompiler() {}
   ~InterpreterCompiler() override {}
 
-  StatusOr<std::unique_ptr<Executable>> Compile(
-      std::unique_ptr<HloModule> hlo_modules,
+  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+      std::unique_ptr<HloModule> hlo_module,
+      perftools::gputools::StreamExecutor* stream_exec) override;
+
+  StatusOr<std::unique_ptr<Executable>> RunBackend(
+      std::unique_ptr<HloModule> hlo_module,
       perftools::gputools::StreamExecutor* stream_exec) override;
 
   StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index 86dee8462fd4fdda580ada892e244f19177fb3e5..9183a1d1bfb8c2f6e1933c004f9c9f5f9ad8eced 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -42,7 +42,8 @@ namespace sep = ::perftools::gputools::interpreter;
 
 InterpreterExecutable::InterpreterExecutable(
     std::unique_ptr<const HloModule> hlo_module)
-    : Executable(std::move(hlo_module)) {}
+    : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
+                 /*hlo_profile_index_map=*/nullptr) {}
 
 InterpreterExecutable::~InterpreterExecutable() {}
 
@@ -89,7 +90,7 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
 
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
-  HloComputation* computation = module().entry_computation();
+  const HloComputation* computation = module().entry_computation();
   if (computation->num_parameters() != arguments.size()) {
     return tensorflow::errors::Internal(
         "Mismatch between argument count and graph parameter count.");
@@ -156,10 +157,5 @@ StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
   return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
 }
 
-std::unique_ptr<HloCostAnalysis> InterpreterExecutable::CreateCostAnalysis()
-    const {
-  return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
-}
-
 }  // namespace interpreter
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index c69b0d036d1058a6b24ee609a9923895d3246eec..0e87eb90bff4b896fc4bc0efc4fa7b851631be6f 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -61,8 +61,6 @@ class InterpreterExecutable : public Executable {
 
   static int64 ShapeSizeBytes(const Shape& shape);
 
-  std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
-
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable);
 };
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 0bb3259ef43915067e614e72038387e8300ecc41..511de87b1be10741a4632d82cf726071c5c3fc12 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -100,9 +100,9 @@ bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
   return true;
 }
 
-bool InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status InterpreterExecutor::BlockHostUntilDoneWithStatus(Stream *stream) {
   AsExecutorStream(stream)->BlockUntilDone();
-  return true;
+  return port::Status::OK();
 }
 
 DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index c59b2ccb1505b78be0c459ac9311428d65cc7e44..d3753a6a65d64c3d77644367bbd82068d4cf3044 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -157,7 +157,7 @@ class InterpreterExecutor : public internal::StreamExecutorInterface {
   bool StartTimer(Stream *stream, Timer *timer) override;
   bool StopTimer(Stream *stream, Timer *timer) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDoneWithStatus(Stream *stream) override;
 
   int PlatformDeviceCount() override { return 1; }
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index 7eda7c2284c2457703fcfcd4226172e41dd4ae01..328afe42bad64713013f761a6819ae8a47a52e04 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -1303,7 +1303,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     TF_RET_CHECK(LayoutUtil::HasLayout(instruction->shape()));
   }
 
-  // Copy the root instrucion's result if the it does not match the result
+  // Copy the root instruction's result if the it does not match the result
   // layout constraint
   if (constraints.ResultLayout() != nullptr &&
       !constraints.ResultLayout()->MatchesLayoutInShape(
@@ -1328,6 +1328,20 @@ Status LayoutAssignment::RunOnComputation(
           << ")";
   VLOG(2) << "  ComputationLayout = " << computation_layout.ToString();
 
+  // Clear existing layouts of the instructions. All layouts must be assigned by
+  // the LayoutAssignment pass, except for Infeed, Outfeed, Parameters and the
+  // computation result. The latter two are specified in computation_layout, so
+  // we only need to keep the existing layouts for Infeed and Outfeed. Clearing
+  // the layouts here avoids hiding potential bugs in the layout assignment pass
+  // that may accidently use the existing layout.
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (instruction->opcode() == HloOpcode::kInfeed ||
+        instruction->opcode() == HloOpcode::kOutfeed) {
+      continue;
+    }
+    LayoutUtil::ClearLayout(instruction->mutable_shape());
+  }
+
   // Construct LayoutConstraints with all layout constraints of the computation.
   LayoutConstraints constraints(points_to_analysis, computation);
 
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index c39ff52230055ec322ecf77f8df8ebdea12cdb6c..d51c0d1dfb727801d6d2a8328eba60838373479f 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -131,10 +131,10 @@ TEST_F(LayoutAssignmentTest, FusionInstruction) {
   std::vector<std::initializer_list<int64>> minor_to_majors = {{0, 1}, {1, 0}};
   for (auto& minor_to_major : minor_to_majors) {
     auto builder = HloComputation::Builder(TestName());
-    auto constant_literal1 = test_utils::CreateR2LiteralWithLayout<float>(
-        {{1.0, 2.0}, {3.0, 4.0}}, minor_to_major);
-    auto constant_literal2 = test_utils::CreateR2LiteralWithLayout<float>(
-        {{5.0, 6.0}, {7.0, 8.0}}, minor_to_major);
+    auto constant_literal1 = Literal::CreateR2WithLayout<float>(
+        {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout(minor_to_major));
+    auto constant_literal2 = Literal::CreateR2WithLayout<float>(
+        {{5.0, 6.0}, {7.0, 8.0}}, LayoutUtil::MakeLayout(minor_to_major));
     Shape ashape = constant_literal1->shape();
 
     auto constant1 = builder.AddInstruction(
@@ -181,12 +181,12 @@ TEST_F(LayoutAssignmentTest, TupleLayout) {
   // Verify the layouts of a tuple are assigned properly (the element layouts
   // match their source).
   auto builder = HloComputation::Builder(TestName());
-  auto constant0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   {0, 1})));
-  auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   {1, 0})));
+  auto constant0 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant0, constant1}));
 
@@ -218,12 +218,12 @@ TEST_F(LayoutAssignmentTest, TupleLayout) {
 TEST_F(LayoutAssignmentTest, TupleSelect) {
   // Verify layouts of a select with tuple operands is assigned properly.
   auto builder = HloComputation::Builder(TestName());
-  auto constant0 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   {0, 1})));
-  auto constant1 = builder.AddInstruction(HloInstruction::CreateConstant(
-      test_utils::CreateR2LiteralWithLayout<float>({{1.0, 2.0}, {3.0, 4.0}},
-                                                   {1, 0})));
+  auto constant0 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1}))));
+  auto constant1 = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR2WithLayout<float>(
+          {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({1, 0}))));
   auto tuple0 = builder.AddInstruction(
       HloInstruction::CreateTuple({constant0, constant1}));
   auto tuple1 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc
index c27a8956a706febd1855854a2d0560754caf5c03..68c99256a246edcf43a8358f667fc4458b9b4fea 100644
--- a/tensorflow/compiler/xla/service/liveness_util.cc
+++ b/tensorflow/compiler/xla/service/liveness_util.cc
@@ -103,7 +103,7 @@ namespace {
 
 // Returns all uses of all aliases of 'instruction' at 'index' in 'uses'.
 // Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index)
-// where 'user' is a user of an alias of 'intruction' at 'index', and
+// where 'user' is a user of an alias of 'instruction' at 'index', and
 // 'operand_index' is the operand index at which the alias appears in the
 // operand list of 'user'.
 std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
@@ -215,7 +215,8 @@ bool CanShareOperandBufferWithUser(
       auto add_operand_it =
           std::find_if(add->operands().begin(), add->operands().end(),
                        [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kDot ||
+                         return operand->opcode() == HloOpcode::kConvolution ||
+                                operand->opcode() == HloOpcode::kDot ||
                                 (operand->opcode() == HloOpcode::kFusion &&
                                  operand->fusion_kind() ==
                                      HloInstruction::FusionKind::kTransposeDot);
@@ -242,6 +243,31 @@ bool CanShareOperandBufferWithUser(
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && operand_indices[0] == 0;
   }
+  if (user->opcode() == HloOpcode::kCall) {
+    // TODO(b/62548313): Remove when buffer assignment is module scoped and
+    // does not assign buffers to calls.
+    // Find called computation parameter associated with 'operand'.
+    const std::vector<int64> operand_indices = user->OperandIndices(operand);
+    if (operand_indices.size() > 1) {
+      return false;
+    }
+    CHECK_EQ(1, operand_indices.size());
+    auto* param = user->to_apply()->parameter_instruction(operand_indices[0]);
+    // Get all uses of 'operand' at 'index' in called computation.
+    auto param_uses = GetAllUsesOfInstructionAtIndex(param, operand_index,
+                                                     points_to_analysis);
+
+    // Return true iff:
+    // *) There exists exactly one use of 'operand' in called computation.
+    // *) The unique use is by the root instruction of called computation.
+    //    (Note: we check the root of the called computation, because the
+    //     root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    auto* callee_root = user->to_apply()->root_instruction();
+    return param_uses.size() == 1 && param_uses[0].first == callee_root &&
+           callee_root->IsElementwiseOnOperand(param_uses[0].second);
+  }
   // Check if 'user' is element-wise.
   return user->IsElementwise();
 }
@@ -294,7 +320,8 @@ bool CanShareOperandBufferWithUser(HloInstruction* operand,
       auto add_operand_it =
           std::find_if(add->operands().begin(), add->operands().end(),
                        [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kDot ||
+                         return operand->opcode() == HloOpcode::kConvolution ||
+                                operand->opcode() == HloOpcode::kDot ||
                                 (operand->opcode() == HloOpcode::kFusion &&
                                  operand->fusion_kind() ==
                                      HloInstruction::FusionKind::kTransposeDot);
@@ -320,6 +347,31 @@ bool CanShareOperandBufferWithUser(HloInstruction* operand,
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && operand_indices[0] == 0;
   }
+  if (user->opcode() == HloOpcode::kCall) {
+    // Get all uses of value defined by 'operand' at 'operand_index'.
+    const auto& uses =
+        dataflow.GetValueDefinedAt(operand, operand_index).uses();
+    // Return true iff:
+    // *) There exists two uses of 'operand'.
+    // *) One use is by 'user' (caller).
+    // *) One use is by root instruction of called computation (callee root).
+    //    (Note: we check the root of the called computation, because the
+    //     root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    const bool found_caller_use =
+        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
+          return use.instruction == user;
+        }) != uses.end();
+    auto* callee_root = user->to_apply()->root_instruction();
+    const bool found_elementwise_callee_use =
+        std::find_if(
+            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
+              return use.instruction == callee_root &&
+                     callee_root->IsElementwiseOnOperand(use.operand_number);
+            }) != uses.end();
+    return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
+  }
   // Check if 'user' is element-wise.
   return user->IsElementwise();
 }
diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc
index b5e15906d3c085f773eb46b543515a614e63c59a..2c2a02f6375343d67dfb155bbb03729ff6e490d2 100644
--- a/tensorflow/compiler/xla/service/liveness_util_test.cc
+++ b/tensorflow/compiler/xla/service/liveness_util_test.cc
@@ -277,8 +277,11 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) {
   auto b = builder.AddInstruction(HloInstruction::CreateConstant(
       Literal::CreateR2<float>({{2.0, 2.0}, {2.0, 2.0}})));
 
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b));
+      HloInstruction::CreateDot(data_shape, a, b, dot_dnums));
 
   auto one = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -312,8 +315,11 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) {
   auto b_t = builder.AddInstruction(
       HloInstruction::CreateTranspose(data_shape, b, {1, 0}));
 
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto dot = builder.AddInstruction(
-      HloInstruction::CreateBinary(data_shape, HloOpcode::kDot, a, b_t));
+      HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums));
 
   auto one = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
@@ -415,5 +421,44 @@ TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) {
       CanShareOperandBufferWithUser(data, {}, whil, {}, *dataflow_analysis_));
 }
 
+// Tests that Call can alias operand buffer if the only use of the operand
+// in the called computation is an elementwise instruction.
+TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
+  Shape shape = ShapeUtil::MakeShape(F32, {8});
+  // Build sub-computation with fusion root.
+  auto sub_builder = HloComputation::Builder(TestName() + "_sub");
+  auto sub_param = sub_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "sub_param"));
+  auto one = sub_builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto ones = sub_builder.AddInstruction(
+      HloInstruction::CreateBroadcast(shape, one, {1}));
+  auto add = sub_builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones));
+
+  module_ = CreateNewModule();
+  auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build());
+  sub_computation->CreateFusionInstruction({add, ones},
+                                           HloInstruction::FusionKind::kLoop);
+
+  // Build entry-computation with kCall which calls 'sub_computation'.
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+  auto reverse =
+      builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0}));
+  auto call = builder.AddInstruction(
+      HloInstruction::CreateCall(shape, {reverse}, sub_computation));
+  computation_ = module_->AddEntryComputation(builder.Build());
+
+  RunAnalysis();
+
+  EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {},
+                                            *points_to_analysis_));
+  EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {},
+                                            *dataflow_analysis_));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34f3419269abbc73cd0ddb13c723a8da38ab19ff
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_compiler.cc
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_compiler.h"
+
+namespace xla {
+StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
+    std::vector<std::unique_ptr<HloModule>> modules,
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+        stream_execs) {
+  std::vector<std::unique_ptr<Executable>> result;
+  for (size_t i = 0; i < modules.size(); i++) {
+    if (stream_execs[i].size() != 1) {
+      return Unimplemented(
+          "Model partitioning not implemented for the CPU/GPU compilers!");
+    }
+
+    TF_ASSIGN_OR_RETURN(
+        modules[i], RunHloPasses(std::move(modules[i]), stream_execs[i][0]));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                        RunBackend(std::move(modules[i]), stream_execs[i][0]));
+    result.push_back(std::move(executable));
+  }
+
+  return {std::move(result)};
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h
index b2e72871c10192c84349b117797c7bd7e6ee251a..c5393cef4f961c5d04c32d0d4291732b8ec702f1 100644
--- a/tensorflow/compiler/xla/service/llvm_compiler.h
+++ b/tensorflow/compiler/xla/service/llvm_compiler.h
@@ -57,6 +57,21 @@ class LLVMCompiler : public Compiler {
 
   void RemovePostOptimizationHook() { user_post_optimization_hook_ = nullptr; }
 
+  // Bring in
+  //   StatusOr<std::unique_ptr<Executable>> RunBackend(
+  //       std::unique_ptr<HloModule> module,
+  //       perftools::gputools::StreamExecutor* stream_exec)
+  //   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
+  //       std::unique_ptr<HloModule> module,
+  //       perftools::gputools::StreamExecutor* stream_exec)
+  using Compiler::RunBackend;
+  using Compiler::RunHloPasses;
+
+  StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
+      std::vector<std::unique_ptr<HloModule>> modules,
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>>
+          stream_execs) override;
+
  protected:
   ModuleHook user_pre_optimization_hook_;
   ModuleHook user_post_optimization_hook_;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 075d4a1ab5e5f39394ade393d21525ca3e97136e..d878061f724de1c82f8285b0f082d0be4d5778df 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -48,6 +48,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/core:lib",
         "@llvm//:core",
         "@llvm//:support",
@@ -155,6 +156,30 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "vector_support_library",
+    srcs = ["vector_support_library.cc"],
+    hdrs = ["vector_support_library.h"],
+    deps = [
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "@llvm//:core",
+    ],
+)
+
+cc_library(
+    name = "kernel_support_library",
+    srcs = ["kernel_support_library.cc"],
+    hdrs = ["kernel_support_library.h"],
+    deps = [
+        ":llvm_loop",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index bdddc232ef74dfa37e2d5cc780b0fe11e7bc8e76..21bca1d6beff5b2804531724b94b123d4523c173 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -83,7 +83,7 @@ void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
       if (std::find(parameter_instructions.begin(),
                     parameter_instructions.end(),
                     &hlo) != parameter_instructions.end()) {
-        array->AddInvariantLoad(llvm::MDNode::get(*context_, /*MDs=*/{}));
+        array->MarkInvariantOverWholeProgram(context_);
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index e3f98ac13e76f0df465066422ca7918a0f218b60..7224bd689842d89563b374f3db3d4e314be18764 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -256,10 +256,10 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata(
     llvm::Instruction* instruction) const {
   CHECK(llvm::isa<llvm::LoadInst>(instruction) ||
         llvm::isa<llvm::StoreInst>(instruction));
+  CHECK(!llvm::isa<llvm::StoreInst>(instruction) || !is_invariant_)
+      << "Trying to create a store to an invariant IRArray.";
 
   for (const auto& kind_md_pair : metadata_) {
-    CHECK(kind_md_pair.first != llvm::LLVMContext::MD_invariant_load ||
-          llvm::isa<llvm::LoadInst>(instruction));
     instruction->setMetadata(kind_md_pair.first, kind_md_pair.second);
   }
 }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 1ed7e99a829f5b0daa709913554d2300503ca33e..387d4629125cbb791840e943013188d14159908a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -229,9 +229,33 @@ class IrArray {
     AddMetadata(llvm::LLVMContext::MD_noalias, noalias);
   }
 
-  void AddInvariantLoad(llvm::MDNode* invariant_load) {
-    CHECK_NE(invariant_load, nullptr);
-    AddMetadata(llvm::LLVMContext::MD_invariant_load, invariant_load);
+  // Promises LLVM that the data pointed to by this IrArray never changes after
+  // it's first loaded.
+  //
+  // The temporal scope of this promise is the "whole program" from LLVM's point
+  // of view, but how this translates to HLOs differs between backends.
+  //
+  // In the single-threaded CPU backend, we emit one function that
+  // runs all the HLOs in sequence, so the whole program is the whole HLO
+  // module.
+  //
+  // In the GPU backend, we emit one GPU kernel per top-level HLO (i.e. per HLO
+  // in the entry computation).  From LLVM's perspective, launching a new kernel
+  // is like launching a new program, and so the whole program is one top-level
+  // HLO.  Since the scope of the promise is smaller than in the CPU backend, we
+  // can mark more things as invariant in the GPU backend.
+  //
+  // Marking loads as invariant is particularly helpful on GPUs because
+  // invariant loads can be lowered to PTX ld.global.nc (equivalent to CUDA's
+  // __ldg intrinsic).  These loads use a special cache, and can be
+  // significantly faster than regular loads.
+  void MarkInvariantOverWholeProgram(llvm::LLVMContext* context) {
+    if (is_invariant_) {
+      return;
+    }
+    is_invariant_ = true;
+    AddMetadata(llvm::LLVMContext::MD_invariant_load,
+                llvm::MDNode::get(*context, {}));
   }
 
   const std::map<int, llvm::MDNode*>& metadata() const { return metadata_; }
@@ -261,6 +285,8 @@ class IrArray {
   // loads/stores for this array.  They keys are the metadata kinds and the
   // values are the metadata nodes.
   std::map<int, llvm::MDNode*> metadata_;
+
+  bool is_invariant_ = false;
 };
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d68d699d7ef420bb644829125e46b5f565c93825
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -0,0 +1,111 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
+
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+void KernelSupportLibrary::For(
+    tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+    llvm::Value* step,
+    const std::function<void(llvm::Value*, bool)>& for_body_generator) {
+  If(ir_builder_->CreateICmpSLT(start, end), [&]() {
+    for_body_generator(start, /*is_first_iteration=*/true);
+    For(name, ir_builder_->CreateAdd(start, step), end, step,
+        [&](llvm::Value* iv) { for_body_generator(iv, false); });
+  });
+}
+
+void KernelSupportLibrary::For(
+    tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+    llvm::Value* step, bool peel_first_iteration,
+    const std::function<void(llvm::Value*, llvm::Value*)>& for_body_generator) {
+  if (peel_first_iteration) {
+    For(name, start, end, step, true,
+        [&](llvm::Value* indvar, bool is_first_iteration) {
+          for_body_generator(indvar, ir_builder_->getInt1(is_first_iteration));
+        });
+  } else {
+    std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
+        name, start, end, step, ir_builder_,
+        /*prevent_unrolling=*/prevent_unrolling_,
+        /*prevent_vectorization=*/prevent_vectorization_);
+    ir_builder_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
+    for_body_generator(loop->GetIndVarValue(),
+                       /*is_first_iteration=*/ir_builder_->CreateICmpEQ(
+                           loop->GetIndVarValue(), start));
+    llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), ir_builder_);
+  }
+}
+
+void KernelSupportLibrary::If(
+    llvm::Value* condition, const std::function<void()>& true_block_generator,
+    const std::function<void()>& false_block_generator) {
+  llvm_ir::LlvmIfData if_data =
+      llvm_ir::EmitIfThenElse(condition, "", ir_builder_);
+  ir_builder_->SetInsertPoint(&if_data.true_block->back());
+  true_block_generator();
+  ir_builder_->SetInsertPoint(&if_data.false_block->back());
+  false_block_generator();
+  llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
+}
+
+void KernelSupportLibrary::EmitAndCallOutlinedKernel(
+    bool enable_fast_math, bool optimize_for_size,
+    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+    KernelSupportLibrary::ArgumentVector arguments,
+    const std::function<void(KernelSupportLibrary::ArgumentVector)>&
+        kernel_body_generator) {
+  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+  llvm::Function* function =
+      module->getFunction(llvm_ir::AsStringRef(kernel_name));
+  if (!function) {
+    VLOG(2) << "Generating kernel for " << kernel_name;
+    std::vector<llvm::Type*> arg_types;
+    std::transform(arguments.begin(), arguments.end(),
+                   std::back_inserter(arg_types),
+                   [](llvm::Value* arg) { return arg->getType(); });
+
+    auto* function_type = llvm::FunctionType::get(
+        ir_builder->getVoidTy(), arg_types, /*isVarArg=*/false);
+
+    function = llvm_ir::CreateFunction(
+        function_type, llvm::GlobalValue::InternalLinkage,
+        /*enable_fast_math=*/enable_fast_math,
+        /*optimize_for_size=*/optimize_for_size, kernel_name, module);
+
+    llvm::IRBuilder<>::InsertPointGuard guard(*ir_builder);
+
+    auto* entry_bb =
+        llvm::BasicBlock::Create(ir_builder->getContext(), "entry", function);
+    auto* return_inst = llvm::ReturnInst::Create(ir_builder->getContext(),
+                                                 /*retVal=*/nullptr, entry_bb);
+    // Set the insert point to before return_inst.
+    ir_builder->SetInsertPoint(return_inst);
+
+    std::vector<llvm::Value*> arg_values;
+    std::transform(function->arg_begin(), function->arg_end(),
+                   std::back_inserter(arg_values), std::addressof<llvm::Value>);
+    kernel_body_generator(arg_values);
+  } else {
+    VLOG(3) << "Re-using kernel for " << kernel_name;
+  }
+
+  ir_builder->CreateCall(function, llvm_ir::AsArrayRef(arguments));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
new file mode 100644
index 0000000000000000000000000000000000000000..150a464c66961a0e68149bb4729d60cc4e363ba3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -0,0 +1,163 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
+
+#include <string>
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+// A thin wrapper around llvm_loop.h to make code generating structured control
+// flow more readable.
+class KernelSupportLibrary {
+ public:
+  // `ir_builder` is the llvm::IRBuilder instance used to generate LLVM IR.
+  // If `prevent_unrolling` is true then unrolling is explicitly disabled on
+  // every loop generated by this instance of KernelSupportLibrary.
+  explicit KernelSupportLibrary(llvm::IRBuilder<>* ir_builder,
+                                bool prevent_unrolling = true,
+                                bool prevent_vectorization = true)
+      : ir_builder_(ir_builder),
+        prevent_unrolling_(prevent_unrolling),
+        prevent_vectorization_(prevent_vectorization) {}
+
+  // Generates the following control flow structure:
+  //
+  //   if (`start` < `end`) {
+  //     `for_body_generator(/*ind_var=*/start, /*is_first_iteration=*/true)`;
+  //     for (i64 i = `start` + `step`; i s< `end`; i += `step`)
+  //       `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`;
+  //   }
+  void For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      llvm::Value* step,
+      const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
+          for_body_generator);
+
+  void For(
+      tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+      const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
+          for_body_generator) {
+    For(name, /*start=*/ir_builder_->getInt64(start),
+        /*end=*/ir_builder_->getInt64(end),
+        /*step=*/ir_builder_->getInt64(step), for_body_generator);
+  }
+
+  // Generates the following control flow structure if `peel_first_iteration` is
+  // true:
+  //
+  //   if (`start` < `end`) {
+  //     `for_body_generator(/*ind_var=*/start, /*is_first_iteration=*/,true)`;
+  //     for (i64 i = `start` + `step`; i s< `end`; i += `step`)
+  //       `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/,false)`;
+  //   }
+  //
+  // and the following if `peel_first_iteration` is false:
+  //
+  //   for (i64 i = `start`; i s< `end`; i += `step`)
+  //     `for_body_generator(/*ind_var=*/,i,
+  //                         /*is_first_iteration=*/,(i != `start`))`;
+  void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+           llvm::Value* step, bool peel_first_iteration,
+           const std::function<void(llvm::Value* ind_var,
+                                    llvm::Value* is_first_iteration)>&
+               for_body_generator);
+
+  void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+           int64 step, bool peel_first_iteration,
+           const std::function<void(llvm::Value* ind_var,
+                                    llvm::Value* is_first_iteration)>&
+               for_body_generator) {
+    For(name, /*start=*/start, /*end=*/end,
+        /*step=*/ir_builder_->getInt64(step), peel_first_iteration,
+        for_body_generator);
+  }
+
+  void For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      llvm::Value* step,
+      const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
+    For(name, start, end, step,
+        /*peel_first_iteration=*/false,
+        [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
+  }
+
+  void For(
+      tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+      const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
+    For(name, /*start=*/ir_builder_->getInt64(start),
+        /*end=*/ir_builder_->getInt64(end),
+        /*step=*/ir_builder_->getInt64(step), for_body_generator);
+  }
+
+  // Generates the following control flow structure:
+  //
+  //   if (`condition`)
+  //     `true_block_generator()`;
+  //   else
+  //      `false_block_generator()`;
+  void If(llvm::Value* condition,
+          const std::function<void()>& true_block_generator,
+          const std::function<void()>& false_block_generator = []() {});
+
+  using ArgumentVector = tensorflow::gtl::ArraySlice<llvm::Value*>;
+
+  // Generates the following control flow structure:
+  //
+  //  define @`kernel_name`(arg0, arg1, ... arg`arguments.size()`) {
+  //    kernel_body_generator({arg0, arg1, ... arg`arguments.size()`});
+  //  }
+  //
+  //  ...
+  //  call @`kernel_name`(arguments[0], arguments[1] ...)
+  //  ...
+  //
+  // If a function called `kernel_name` is already present in the module then
+  // that function is re-used.  In that sense we're using the llvm::Module as a
+  // cache of outlined kernels, keyed by function name.
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      ArgumentVector arguments,
+      const std::function<void(ArgumentVector)>& kernel_body_generator);
+
+  // Thin wrapper around the more general EmitAndCallOutlinedKernel above.
+  static void EmitAndCallOutlinedKernel(
+      bool enable_fast_math, bool optimize_for_size,
+      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+      const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
+          kernel_body_generator) {
+    EmitAndCallOutlinedKernel(
+        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+        {arg0, arg1, arg2}, [&](ArgumentVector args) {
+          kernel_body_generator(args[0], args[1], args[2]);
+        });
+  }
+
+ private:
+  llvm::IRBuilder<>* ir_builder_;
+  bool prevent_unrolling_;
+  bool prevent_vectorization_;
+};
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_CPU_KERNEL_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 83d35cb9efca0c27765045ce214e0e1060b18ed0..7b227ce294176cfbbf7308bbf65afe21814f3dea 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -34,21 +34,24 @@ namespace llvm_ir {
 
 ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
                  llvm::Value* start_index, llvm::Value* end_index,
-                 llvm::Value* step, bool prevent_unrolling)
+                 llvm::Value* step, bool prevent_unrolling,
+                 bool prevent_vectorization)
     : prefix_(prefix.ToString()),
       suffix_(suffix.ToString()),
       start_index_(start_index),
       end_index_(end_index),
       step_(step),
       insert_before_bb_(nullptr),
-      prevent_unrolling_(prevent_unrolling) {}
+      prevent_unrolling_(prevent_unrolling),
+      prevent_vectorization_(prevent_vectorization) {}
 
 /* static */ std::unique_ptr<ForLoop> ForLoop::EmitForLoop(
     tensorflow::StringPiece prefix, llvm::Value* start_index,
     llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
-    bool prevent_unrolling) {
-  std::unique_ptr<ForLoop> loop(new ForLoop(
-      prefix, /*suffix=*/"", start_index, end_index, step, prevent_unrolling));
+    bool prevent_unrolling, bool prevent_vectorization) {
+  std::unique_ptr<ForLoop> loop(new ForLoop(prefix, /*suffix=*/"", start_index,
+                                            end_index, step, prevent_unrolling,
+                                            prevent_vectorization));
   loop->Emit(ir_builder);
   return loop;
 }
@@ -127,14 +130,12 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
   ir_builder->CreateStore(indvar_inc, indvar_address);
   llvm::BranchInst* back_branch = ir_builder->CreateBr(header_bb_);
 
-  if (prevent_unrolling_) {
-    const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
-    llvm::LLVMContext* ctx = &back_branch->getContext();
-
+  std::vector<llvm::Metadata*> loop_metadata = GetLoopMetadata(ir_builder);
+  if (!loop_metadata.empty()) {
+    llvm::LLVMContext* ctx = &start_index_->getContext();
     auto temp_node = llvm::MDNode::getTemporary(*ctx, llvm::None);
-    auto no_unroll_node = llvm::MDNode::get(
-        *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollDisableMDName)});
-    auto loop_id = llvm::MDNode::get(*ctx, {temp_node.get(), no_unroll_node});
+    loop_metadata.insert(loop_metadata.begin(), temp_node.get());
+    auto loop_id = llvm::MDNode::get(*ctx, loop_metadata);
     loop_id->replaceOperandWith(0, loop_id);
     back_branch->setMetadata(llvm::LLVMContext::MD_loop, loop_id);
   }
@@ -143,6 +144,27 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
   ir_builder->SetInsertPoint(exit_bb_);
 }
 
+std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
+    llvm::IRBuilder<>* ir_builder) {
+  const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
+  const char* const kLlvmLoopVectorizeMDName = "llvm.loop.vectorize.enable";
+  llvm::LLVMContext* ctx = &start_index_->getContext();
+
+  std::vector<llvm::Metadata*> result;
+  if (prevent_unrolling_) {
+    result.push_back(llvm::MDNode::get(
+        *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollDisableMDName)}));
+  }
+
+  if (prevent_vectorization_) {
+    result.push_back(llvm::MDNode::get(
+        *ctx, {llvm::MDString::get(*ctx, kLlvmLoopVectorizeMDName),
+               llvm::ConstantAsMetadata::get(ir_builder->getFalse())}));
+  }
+
+  return result;
+}
+
 string ForLoop::GetQualifiedName(tensorflow::StringPiece name) {
   return llvm_ir::IrName(prefix_, llvm_ir::IrName(name, suffix_));
 }
@@ -156,23 +178,25 @@ llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name,
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                                               llvm::Value* start_index,
                                               llvm::Value* end_index,
-                                              bool prevent_unrolling) {
+                                              bool prevent_unrolling,
+                                              bool prevent_vectorization) {
   return AddLoop(suffix, start_index, end_index, ir_builder_->getInt64(1),
-                 prevent_unrolling);
+                 prevent_unrolling, prevent_vectorization);
 }
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                                               llvm::Value* start_index,
                                               llvm::Value* end_index,
                                               llvm::Value* stride,
-                                              bool prevent_unrolling) {
+                                              bool prevent_unrolling,
+                                              bool prevent_vectorization) {
   if (inner_loop_body_bb_ != nullptr) {
     // Create this loop inside the previous one.
     ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
   }
   std::unique_ptr<ForLoop> loop(new ForLoop(
       /*prefix=*/name_, suffix, start_index, end_index, stride,
-      prevent_unrolling));
+      prevent_unrolling, prevent_vectorization));
   loop->Emit(ir_builder_);
 
   if (outer_loop_preheader_bb_ == nullptr) {
@@ -191,20 +215,24 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                                               int64 end_index,
                                               tensorflow::StringPiece suffix,
-                                              bool prevent_unrolling) {
+                                              bool prevent_unrolling,
+                                              bool prevent_vectorization) {
   CHECK_LE(start_index, end_index);
   return AddLoop(suffix, ir_builder_->getInt64(start_index),
-                 ir_builder_->getInt64(end_index), prevent_unrolling);
+                 ir_builder_->getInt64(end_index), prevent_unrolling,
+                 prevent_vectorization);
 }
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                                               int64 end_index, int64 stride,
                                               tensorflow::StringPiece suffix,
-                                              bool prevent_unrolling) {
+                                              bool prevent_unrolling,
+                                              bool prevent_vectorization) {
   CHECK_LE(start_index, end_index);
   return AddLoop(suffix, ir_builder_->getInt64(start_index),
                  ir_builder_->getInt64(end_index),
-                 ir_builder_->getInt64(stride), prevent_unrolling);
+                 ir_builder_->getInt64(stride), prevent_unrolling,
+                 prevent_vectorization);
 }
 
 IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index 90f7c7df9e22d6404e9fdad2ce210506583bd427..20069ce5a28184a5a9216d1a3751d1cee547727d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -71,12 +71,10 @@ class ForLoop {
   //
   // If `prevent_unrolling` is true then emit metadata that directs LLVM to not
   // unroll the generated loop.
-  static std::unique_ptr<ForLoop> EmitForLoop(tensorflow::StringPiece prefix,
-                                              llvm::Value* start_index,
-                                              llvm::Value* end_index,
-                                              llvm::Value* step,
-                                              llvm::IRBuilder<>* ir_builder,
-                                              bool prevent_unrolling = false);
+  static std::unique_ptr<ForLoop> EmitForLoop(
+      tensorflow::StringPiece prefix, llvm::Value* start_index,
+      llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
+      bool prevent_unrolling = false, bool prevent_vectorization = false);
 
   // The names of the blocks follow LLVM's conventions. Control flow amongst the
   // blocks for the example C code looks like:
@@ -130,7 +128,7 @@ class ForLoop {
 
   ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
           llvm::Value* start_index, llvm::Value* end_index, llvm::Value* step,
-          bool prevent_unrolling);
+          bool prevent_unrolling, bool prevent_vectorization);
 
   // Emit the loop at the insert point of the builder.
   void Emit(llvm::IRBuilder<>* ir_builder);
@@ -142,6 +140,10 @@ class ForLoop {
   // they are set.
   string GetQualifiedName(tensorflow::StringPiece name);
 
+  // Return a list of metadata nodes that should be associated with the
+  // llvm::Loop for this `ForLoop`.
+  std::vector<llvm::Metadata*> GetLoopMetadata(llvm::IRBuilder<>* ir_builder);
+
   string prefix_;
   string suffix_;
   llvm::Value* start_index_;
@@ -160,6 +162,7 @@ class ForLoop {
   llvm::BasicBlock* exit_bb_;
   llvm::Value* indvar_;
   bool prevent_unrolling_;
+  bool prevent_vectorization_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ForLoop);
 };
@@ -185,24 +188,28 @@ class ForLoopNest {
   std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
                                    llvm::Value* start_index,
                                    llvm::Value* end_index, llvm::Value* stride,
-                                   bool prevent_unrolling = false);
+                                   bool prevent_unrolling = false,
+                                   bool prevent_vectorization = false);
 
   // Like the above, except that it defaults to a stride of one.
   std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
                                    llvm::Value* start_index,
                                    llvm::Value* end_index,
-                                   bool prevent_unrolling = false);
+                                   bool prevent_unrolling = false,
+                                   bool prevent_vectorization = false);
 
   // A convenient wrapper of the other flavor of AddLoop. The given start and
   // end index are constant.
   std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
                                    int64 stride, tensorflow::StringPiece suffix,
-                                   bool prevent_unrolling = false);
+                                   bool prevent_unrolling = false,
+                                   bool prevent_vectorization = false);
 
   // Like the above, except that it defaults to a stride of one.
   std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
                                    tensorflow::StringPiece suffix,
-                                   bool prevent_unrolling = false);
+                                   bool prevent_unrolling = false,
+                                   bool prevent_vectorization = false);
 
   // Add loops to iterate through the indices within the specified
   // shape. The returned index collects the induction variables of the
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 5dff4b5778970dd473c5f158b3828a850847d1ff..9a0c94b1c73c48682c1e868d4518b3797b01bbed 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "llvm/Target/TargetOptions.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -141,6 +142,13 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
       return llvm::Type::getInt8Ty(module->getContext());
     case S16:
     case U16:
+    case BF16:
+      // For BF16 we just need some type that is 16 bits wide so that it will
+      // take up the right amount of space in memory. LLVM does not have a BF16
+      // type (the LLVM half type is IEEE 16 bit floating point, not bfloat), so
+      // we can't map it directly to an LLVM type. We will not map a BF16
+      // addition to an addition on this type (int16) - this is just the type
+      // used for storage.
       return llvm::Type::getInt16Ty(module->getContext());
     case S32:
     case U32:
@@ -163,8 +171,9 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
         // z, and reinterpret_cast<cv T(&)[2]>(z)[1] shall designate the
         // imaginary part of z.
         return llvm::StructType::create(
-            "complex64", llvm::Type::getFloatTy(module->getContext()),
-            llvm::Type::getFloatTy(module->getContext()));
+            {llvm::Type::getFloatTy(module->getContext()),
+             llvm::Type::getFloatTy(module->getContext())},
+            "complex64", /*isPacked=*/true);
       }
       return cplx_t;
     }
@@ -178,6 +187,21 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
   }
 }
 
+int GetSizeInBits(llvm::Type* type) {
+  const llvm::StructType* struct_ty = llvm::dyn_cast<llvm::StructType>(type);
+  if (struct_ty) {
+    CHECK(struct_ty->isPacked());
+    int bits = 0;
+    for (auto element_type : struct_ty->elements()) {
+      bits += GetSizeInBits(element_type);
+    }
+    return bits;
+  }
+  int bits = type->getPrimitiveSizeInBits();
+  CHECK_GT(bits, 0) << "type is not sized";
+  return bits;
+}
+
 llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
   llvm::Type* result_type = PrimitiveTypeToIrType(shape.element_type(), module);
   if (ShapeUtil::IsTuple(shape)) {
@@ -263,6 +287,11 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<float>(*multi_index));
         break;
+      case BF16:
+        value = llvm::ConstantInt::get(
+            ir_element_type,
+            tensorflow::bit_cast<uint16>(literal.Get<bfloat16>(*multi_index)));
+        break;
       case F64:
         value = llvm::ConstantFP::get(ir_element_type,
                                       literal.Get<double>(*multi_index));
@@ -537,6 +566,14 @@ void SetToFirstInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder) {
   builder->SetInsertPoint(blk, blk->getFirstInsertionPt());
 }
 
+void SetToLastInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder) {
+  if (llvm::Instruction* terminator = blk->getTerminator()) {
+    builder->SetInsertPoint(terminator);
+  } else {
+    builder->SetInsertPoint(blk);
+  }
+}
+
 llvm::Value* CreateRor(llvm::Value* rotand, llvm::Value* rotor,
                        llvm::IRBuilder<>* builder) {
   auto size = rotand->getType()->getPrimitiveSizeInBits();
@@ -555,8 +592,9 @@ int64 ByteSizeOf(const Shape& shape, const llvm::DataLayout& data_layout) {
 llvm::FastMathFlags GetFastMathFlags(bool fast_math_enabled) {
   llvm::FastMathFlags flags;
   if (fast_math_enabled) {
-    // UnsafeAlgebra implies NoInfs, NoNaNs, NoSignedZeros, and AllowReciprocal.
-    flags.setUnsafeAlgebra();
+    // Fast implies AllowReassoc, NoInfs, NoNaNs, NoSignedZeros,
+    // AllowReciprocal, AllowContract, and ApproxFunc.
+    flags.setFast();
   }
   return flags;
 }
@@ -619,14 +657,27 @@ std::map<int, llvm::MDNode*> MergeMetadata(
   return result;
 }
 
+static string GetProcessUniqueIrFileName(tensorflow::StringPiece prefix) {
+  static tensorflow::mutex mu(tensorflow::LINKER_INITIALIZED);
+  static NameUniquer* uniquer = new NameUniquer(/*separator=*/"-");
+
+  tensorflow::mutex_lock lock(mu);
+  return uniquer->GetUniqueName(prefix);
+}
+
 Status DumpIRToDirectory(const string& directory_name,
                          const string& hlo_module_name,
                          const llvm::Module& llvm_module, bool optimized) {
-  string safe_file_name_base = SanitizeFileName(hlo_module_name);
+  // We can end up compiling different modules with the same name when using
+  // XlaJitCompiledCpuFunction::Compile.  Avoid overwriting IR files previously
+  // dumped from the same process in such cases.
+  string unique_and_safe_file_name = GetProcessUniqueIrFileName(
+      tensorflow::strings::StrCat("ir-", SanitizeFileName(hlo_module_name), "-",
+                                  optimized ? "with" : "no", "-opt"));
+
   string ir_file_name = tensorflow::io::JoinPath(
       directory_name,
-      tensorflow::strings::StrCat("ir-", safe_file_name_base, "-",
-                                  optimized ? "with" : "no", "-opt.ll"));
+      tensorflow::strings::StrCat(unique_and_safe_file_name, ".ll"));
 
   std::unique_ptr<tensorflow::WritableFile> f;
   TF_RETURN_IF_ERROR(
@@ -637,5 +688,32 @@ Status DumpIRToDirectory(const string& directory_name,
   return f->Close();
 }
 
+llvm::Function* CreateFunction(llvm::FunctionType* function_type,
+                               llvm::GlobalValue::LinkageTypes linkage,
+                               bool enable_fast_math, bool optimize_for_size,
+                               tensorflow::StringPiece name,
+                               llvm::Module* module) {
+  llvm::Function* function =
+      llvm::Function::Create(function_type, linkage, AsStringRef(name), module);
+  function->setCallingConv(llvm::CallingConv::C);
+  function->addFnAttr("no-frame-pointer-elim", "false");
+
+  if (enable_fast_math) {
+    function->addFnAttr("unsafe-fp-math", "true");
+    function->addFnAttr("no-infs-fp-math", "true");
+    function->addFnAttr("no-nans-fp-math", "true");
+    function->addFnAttr("no-signed-zeros-fp-math", "true");
+  }
+
+  // Add the optize attribute to the function if optimizing for size. This
+  // controls internal behavior of some optimization passes (e.g. loop
+  // unrolling).
+  if (optimize_for_size) {
+    function->addFnAttr(llvm::Attribute::OptimizeForSize);
+  }
+
+  return function;
+}
+
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 304192b58e9331c2544f973bf65299111122aea8..6bdc6a01a2b487df3dd80a02e67f5bcf62dead31 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -129,6 +129,9 @@ llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index,
 llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
                                   llvm::Module* module);
 
+// Returns the type size in bits. If "type" is a struct, it must be packed.
+int GetSizeInBits(llvm::Type* type);
+
 // Returns the LLVM type which represents the given XLA shape. For example,
 // if "shape" is [5 x [10 x f32]], the function returns [5 x [10 x float]].
 llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module);
@@ -243,6 +246,8 @@ llvm::Instruction* AddRangeMetadata(int64 lower, int64 upper,
 
 void SetToFirstInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder);
 
+void SetToLastInsertPoint(llvm::BasicBlock* blk, llvm::IRBuilder<>* builder);
+
 // Create a bitwise rotation of `rotand` by `rotor`.
 llvm::Value* CreateRor(llvm::Value* rotand, llvm::Value* rotor,
                        llvm::IRBuilder<>* builder);
@@ -276,6 +281,12 @@ Status DumpIRToDirectory(const string& directory_name,
                          const string& hlo_module_name,
                          const llvm::Module& llvm_module, bool optimized);
 
+llvm::Function* CreateFunction(llvm::FunctionType* function_type,
+                               llvm::GlobalValue::LinkageTypes linkage,
+                               bool enable_fast_math, bool optimize_for_size,
+                               tensorflow::StringPiece name,
+                               llvm::Module* module);
+
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.h b/tensorflow/compiler/xla/service/llvm_ir/ops.h
index 11e84d9cb5defbcb87a8f696d56c139686c960d8..f72f482e3128c61e53cc454e7da8b5795ba6f695 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ops.h
@@ -40,11 +40,24 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
 inline bool CanEmitFusedDynamicUpdateSliceInPlace(
     HloInstruction* fusion, const BufferAssignment& assignment) {
   CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
-  return fusion->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-         fusion->fused_expression_root()->opcode() ==
-             HloOpcode::kDynamicUpdateSlice &&
-         CanUpdateDynamicSliceInPlace(fusion->fused_expression_root(),
-                                      assignment);
+  HloInstruction* fused_root = fusion->fused_expression_root();
+  if (fused_root->opcode() != HloOpcode::kDynamicUpdateSlice ||
+      fusion->fusion_kind() != HloInstruction::FusionKind::kLoop) {
+    return false;
+  }
+  // Walk DynamicUpdateSlice operand(0) to fused parameter and get its
+  // associated operand. See if it shares an allocation with this operand.
+  HloInstruction* fusion_operand;
+  ShapeIndex index;
+  std::tie(fusion_operand, index) =
+      fused_root->mutable_operand(0)->LatestNonGteAncestorAndIndex();
+  if (fusion_operand->opcode() != HloOpcode::kParameter) {
+    return false;
+  }
+  auto* operand = fusion->operand(fusion_operand->parameter_number());
+  return assignment.HasAllocationAt(operand, index) &&
+         assignment.HasAllocationAt(fusion, {}) &&
+         assignment.SharesSliceAtIndex(fusion, {}, operand, index);
 }
 
 // Emits IR for running the given dynamic-update-slice op in-place -- that is,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59e82960787918d4747ad4dedf4bfb4f2fd40352
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.cc
@@ -0,0 +1,268 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h"
+
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
+                                           int64 vector_size,
+                                           llvm::IRBuilder<>* ir_builder,
+                                           std::string name)
+    : vector_size_(vector_size),
+      primitive_type_(primitive_type),
+      ir_builder_(ir_builder),
+      name_(std::move(name)) {
+  scalar_type_ = llvm_ir::PrimitiveTypeToIrType(
+      primitive_type, ir_builder_->GetInsertBlock()->getModule());
+  scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
+  vector_type_ = llvm::VectorType::get(scalar_type_, vector_size);
+  vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
+}
+
+llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  return MulInternal(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
+  if (scalar_type_->isFloatingPointTy()) {
+    return ir_builder()->CreateFMul(lhs, rhs, name());
+  } else {
+    return ir_builder()->CreateMul(lhs, rhs, name());
+  }
+}
+
+llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
+  CHECK(lhs->getType() == scalar_type() || lhs->getType() == vector_type());
+  return AddInternal(lhs, rhs);
+}
+
+llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs,
+                                               llvm::Value* rhs) {
+  if (scalar_type_->isFloatingPointTy()) {
+    return ir_builder()->CreateFAdd(lhs, rhs, name());
+  } else {
+    return ir_builder()->CreateAdd(lhs, rhs, name());
+  }
+}
+
+llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
+    llvm::Value* base_pointer, llvm::Value* offset_elements) {
+  if (base_pointer->getType() != scalar_pointer_type()) {
+    base_pointer = ir_builder()->CreateBitCast(base_pointer,
+                                               scalar_pointer_type(), name());
+  }
+  return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements},
+                                         name());
+}
+
+llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
+  if (pointer->getType() != vector_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name());
+  }
+  return ir_builder()->CreateAlignedLoad(
+      pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
+}
+
+llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
+  if (pointer->getType() != scalar_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+  }
+  return ir_builder()->CreateAlignedLoad(
+      pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
+}
+
+void VectorSupportLibrary::StoreVector(llvm::Value* value,
+                                       llvm::Value* pointer) {
+  if (pointer->getType() != vector_pointer_type()) {
+    pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
+  }
+  ir_builder()->CreateAlignedStore(
+      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+}
+
+void VectorSupportLibrary::StoreScalar(llvm::Value* value,
+                                       llvm::Value* pointer) {
+  if (pointer->getType() != scalar_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+  }
+  ir_builder()->CreateAlignedStore(
+      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+}
+
+llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
+  if (pointer->getType() != scalar_pointer_type()) {
+    pointer =
+        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+  }
+  return ir_builder()->CreateVectorSplat(
+      vector_size(), ir_builder()->CreateLoad(pointer), name());
+}
+
+llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask(vector_size(), nullptr);
+  for (unsigned i = vector_size(); i != 1; i >>= 1) {
+    // On every iteration, we shuffle half of the remaining lanes to the top
+    // half of shuffle, and add two old and the new vector.
+
+    for (unsigned j = 0; j < vector_size(); ++j) {
+      if (j < (i / 2)) {
+        mask[j] = ir_builder()->getInt32(i / 2 + j);
+      } else {
+        mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty());
+      }
+    }
+
+    llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector(
+        vector, llvm::UndefValue::get(vector_type()),
+        llvm::ConstantVector::get(mask), "");
+    vector = Add(vector, half_remaining_lanes);
+  }
+
+  return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0),
+                                            name());
+}
+
+llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
+                                                         llvm::Value* rhs) {
+  CHECK_EQ(lhs->getType(), vector_type());
+  CHECK_EQ(rhs->getType(), vector_type());
+  CHECK_EQ(vector_size() % 2, 0);
+
+  llvm::SmallVector<llvm::Constant*, 32> mask_a, mask_b;
+
+  // Adding the values shuffled using mask_a and mask_b gives us the
+  // AVX-style horizontal add we want.  The masks work as documented
+  // in https://llvm.org/docs/LangRef.html#shufflevector-instruction
+  //
+  // Here are the masks for vector_width() == 8:
+  //
+  //    index: |0 |1 |2 | 3 |4 |5 | 6 | 7
+  //   --------+--+--+--+---+--+--+---+---
+  //   mask_a: |0 |2 |8 |10 |4 |6 |12 |14
+  //   mask_b: |1 |3 |9 |11 |5 |7 |13 |16
+  //
+  // So, as an example, the value at lane 3 of the result vector is
+  // the result of adding lane 10 and lane 11 in the combined lhs++rhs
+  // vector, which are the lanes 2 and 3 in the rhs vector.
+  for (int i = 0; i < vector_size(); i += 2) {
+    int increment = i < vector_size() / 2 ? 0 : (vector_size() / 2);
+    mask_a.push_back(ir_builder()->getInt32(increment + i));
+    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+  }
+  for (int i = 0; i < vector_size(); i += 2) {
+    int increment = i < vector_size() / 2 ? (vector_size() / 2) : vector_size();
+    mask_a.push_back(ir_builder()->getInt32(increment + i));
+    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+  }
+
+  llvm::Value* shuffle_0 = ir_builder()->CreateShuffleVector(
+      lhs, rhs, llvm::ConstantVector::get(mask_a));
+  llvm::Value* shuffle_1 = ir_builder()->CreateShuffleVector(
+      lhs, rhs, llvm::ConstantVector::get(mask_b));
+
+  return Add(shuffle_0, shuffle_1);
+}
+
+llvm::Value* VectorSupportLibrary::ExtractLowHalf(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask;
+  for (int i = 0; i < vector_size() / 2; i++) {
+    mask.push_back(ir_builder()->getInt32(i));
+  }
+
+  return ir_builder()->CreateShuffleVector(vector,
+                                           llvm::UndefValue::get(vector_type()),
+                                           llvm::ConstantVector::get(mask));
+}
+
+llvm::Value* VectorSupportLibrary::ExtractHighHalf(llvm::Value* vector) {
+  llvm::SmallVector<llvm::Constant*, 32> mask;
+  for (int i = 0; i < vector_size() / 2; i++) {
+    mask.push_back(ir_builder()->getInt32(i + vector_size() / 2));
+  }
+
+  return ir_builder()->CreateShuffleVector(vector,
+                                           llvm::UndefValue::get(vector_type()),
+                                           llvm::ConstantVector::get(mask));
+}
+
+std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
+    std::vector<llvm::Value*> vectors) {
+  // TODO(sanjoy): Move this magic constant to TargetMachineFeatures.
+  const int kAvxVectorWidth = 8;
+  if (vector_size() == kAvxVectorWidth && vectors.size() == kAvxVectorWidth) {
+    return ComputeAvxOptimizedHorizontalSums(std::move(vectors));
+  }
+
+  std::vector<llvm::Value*> result;
+  std::transform(vectors.begin(), vectors.end(), std::back_inserter(result),
+                 [this](llvm::Value* vector) { return AddReduce(vector); });
+  return result;
+}
+
+std::vector<llvm::Value*>
+VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
+    std::vector<llvm::Value*> vectors) {
+  while (vectors.size() != 2) {
+    std::vector<llvm::Value*> new_vectors;
+    for (int i = 0; i < vectors.size(); i += 2) {
+      new_vectors.push_back(AvxStyleHorizontalAdd(vectors[i], vectors[i + 1]));
+    }
+
+    vectors = std::move(new_vectors);
+  }
+
+  llvm::Value* low =
+      AddInternal(ExtractLowHalf(vectors[0]), ExtractHighHalf(vectors[0]));
+  llvm::Value* high =
+      AddInternal(ExtractLowHalf(vectors[1]), ExtractHighHalf(vectors[1]));
+
+  std::vector<llvm::Value*> results;
+  for (int i = 0; i < 8; i++) {
+    llvm::Value* scalar_result = ir_builder()->CreateExtractElement(
+        i < 4 ? low : high, ir_builder()->getInt32(i % 4), name());
+    results.push_back(scalar_result);
+  }
+
+  return results;
+}
+
+llvm::Value* VectorSupportLibrary::GetZeroVector() {
+  return llvm::Constant::getNullValue(vector_type());
+}
+
+llvm::Value* VectorSupportLibrary::GetZeroScalar() {
+  return llvm::Constant::getNullValue(scalar_type());
+}
+
+LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
+    : ir_builder_(ir_builder) {
+  alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
+}
+
+llvm::Value* LlvmVariable::Get() const {
+  return ir_builder_->CreateLoad(alloca_);
+}
+
+void LlvmVariable::Set(llvm::Value* new_value) {
+  ir_builder_->CreateStore(new_value, alloca_);
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4c7a6a420a55db5760e67cf3725dc9cfe9e8b52
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/vector_support_library.h
@@ -0,0 +1,205 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
+
+#include <string>
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+// A thin wrapper around llvm_util.h to make code generating vector math flow
+// more readable.
+class VectorSupportLibrary {
+ public:
+  // This VectorSupportLibrary instance remembers `primitive_type` and
+  // `vector_size`, and these are implicitly used by the methods on this
+  // instance (i.e. LoadVector will load a vector of type <`vector_size` x
+  // `primitive_type`>).
+  VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size,
+                       llvm::IRBuilder<>* ir_builder, std::string name);
+
+  llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
+    return Mul(ir_builder()->getInt64(lhs), rhs);
+  }
+
+  llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
+    return Add(ir_builder()->getInt64(lhs), rhs);
+  }
+
+  llvm::Value* MulAdd(llvm::Value* a, llvm::Value* b, llvm::Value* c) {
+    return Add(c, Mul(a, b));
+  }
+
+  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
+                                    llvm::Value* offset_elements);
+  llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
+                                    int64 offset_elements) {
+    return ComputeOffsetPointer(base_pointer,
+                                ir_builder()->getInt64(offset_elements));
+  }
+
+  llvm::Value* LoadVector(llvm::Value* pointer);
+
+  llvm::Value* LoadVector(llvm::Value* base_pointer,
+                          llvm::Value* offset_elements) {
+    return LoadVector(ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) {
+    return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  llvm::Value* LoadScalar(llvm::Value* pointer);
+
+  llvm::Value* LoadScalar(llvm::Value* base_pointer,
+                          llvm::Value* offset_elements) {
+    return LoadScalar(ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) {
+    return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  void StoreVector(llvm::Value* value, llvm::Value* pointer);
+
+  void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
+                   llvm::Value* offset_elements) {
+    StoreVector(value, ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
+                   int64 offset_elements) {
+    StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  void StoreScalar(llvm::Value* value, llvm::Value* pointer);
+  void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
+                   llvm::Value* offset_elements) {
+    StoreScalar(value, ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+
+  void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
+                   int64 offset_elements) {
+    StoreScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  llvm::Value* LoadBroadcast(llvm::Value* pointer);
+  llvm::Value* LoadBroadcast(llvm::Value* base_pointer,
+                             llvm::Value* offset_elements) {
+    return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements));
+  }
+  llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) {
+    return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
+  }
+
+  // Compute the horizontal sum of each vector in `vectors`.  The i'th element
+  // in the result vector is the (scalar) horizontal sum of the i'th vector in
+  // `vectors`.
+  std::vector<llvm::Value*> ComputeHorizontalSums(
+      std::vector<llvm::Value*> vectors);
+
+  llvm::Value* GetZeroVector();
+  llvm::Value* GetZeroScalar();
+
+  llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
+  int64 vector_size() const { return vector_size_; }
+  llvm::Type* vector_type() const { return vector_type_; }
+  llvm::Type* vector_pointer_type() const { return vector_pointer_type_; }
+  llvm::Type* scalar_type() const { return scalar_type_; }
+  llvm::Type* scalar_pointer_type() const { return scalar_pointer_type_; }
+
+  const std::string& name() const { return name_; }
+
+ private:
+  llvm::Value* ExtractLowHalf(llvm::Value*);
+  llvm::Value* ExtractHighHalf(llvm::Value*);
+
+  llvm::Value* MulInternal(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* AddInternal(llvm::Value* lhs, llvm::Value* rhs);
+
+  llvm::Value* AddReduce(llvm::Value* vector);
+
+  // Perform an X86 AVX style horizontal add between `lhs` and `rhs`.  The
+  // resulting IR for an 8-float wide vector is expected to lower to a single
+  // vhaddps instruction on a CPU that supports vhaddps, and not be too bad in
+  // other cases.
+  //
+  // For a vector width of 8, the result vector is computed as:
+  //   Result[0] = Lhs[0] + Lhs[1]
+  //   Result[1] = Lhs[2] + Lhs[3]
+  //   Result[2] = Rhs[0] + Rhs[1]
+  //   Result[3] = Rhs[2] + Rhs[3]
+  //   Result[4] = Lhs[4] + Lhs[5]
+  //   Result[5] = Lhs[6] + Lhs[7]
+  //   Result[6] = Rhs[4] + Rhs[5]
+  //   Result[7] = Rhs[6] + Rhs[7]
+  llvm::Value* AvxStyleHorizontalAdd(llvm::Value* lhs, llvm::Value* rhs);
+
+  std::vector<llvm::Value*> ComputeAvxOptimizedHorizontalSums(
+      std::vector<llvm::Value*> vectors);
+
+  int64 vector_size_;
+  PrimitiveType primitive_type_;
+  llvm::IRBuilder<>* ir_builder_;
+  llvm::Type* vector_type_;
+  llvm::Type* vector_pointer_type_;
+  llvm::Type* scalar_type_;
+  llvm::Type* scalar_pointer_type_;
+  std::string name_;
+};
+
+// This wraps an alloca-backed stack variable which LLVM's SSA construction pass
+// can later convert to a SSA value.
+class LlvmVariable {
+ public:
+  LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
+
+  llvm::Value* Get() const;
+  void Set(llvm::Value* new_value);
+
+ private:
+  llvm::AllocaInst* alloca_;
+  llvm::IRBuilder<>* ir_builder_;
+};
+
+class VectorVariable : public LlvmVariable {
+ public:
+  VectorVariable(VectorSupportLibrary* vector_support,
+                 llvm::Value* initial_value)
+      : LlvmVariable(vector_support->vector_type(),
+                     vector_support->ir_builder()) {
+    Set(initial_value);
+  }
+};
+
+class ScalarVariable : public LlvmVariable {
+ public:
+  ScalarVariable(VectorSupportLibrary* vector_support,
+                 llvm::Value* initial_value)
+      : LlvmVariable(vector_support->scalar_type(),
+                     vector_support->ir_builder()) {
+    Set(initial_value);
+  }
+};
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_VECTOR_SUPPORT_LIBRARY_H_
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index d4d35da9d636e6e204f36850e7987327ab258696..06f43bd3cb2376d34a3104133c868c4f4e5cc730 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -68,26 +68,6 @@ LocalService::LocalService(const ServiceOptions& options,
                            std::unique_ptr<Backend> execute_backend)
     : Service(options, std::move(execute_backend)) {}
 
-namespace {
-// Returns the space required to allocate a shape. If
-// allocate_space_for_deep_copy the space includes all sub-buffers of
-// a tuple.
-int64 RequiredSpace(const Shape& shape, bool allocate_space_for_deep_copy,
-                    TransferManager* transfer_manager) {
-  int64 size = 0;
-  // TODO(b/33492279) remove once no devices represent result tuples as
-  // contiguous buffers.
-  if (allocate_space_for_deep_copy) {
-    ShapeUtil::ForEachSubshape(
-        shape, [&size, transfer_manager](const Shape& subshape,
-                                         const ShapeIndex& /*index*/) {
-          size += transfer_manager->GetByteSizeRequirement(subshape);
-        });
-  }
-  return size;
-}
-}  // namespace
-
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ComputationHandle& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
index b92017c6cbc43d78ab4e5b32f25f5980b8d4ae56..6aca6ba38572c5311797fbb91acbbcd6610a3410 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
@@ -23,6 +23,23 @@ limitations under the License.
 
 namespace xla {
 
+namespace {
+
+// Gather fusion instructions from 'instruction' into 'fusion_instructions'.
+void GatherFusionInstructions(
+    HloInstruction* instruction,
+    std::vector<HloInstruction*>* fusion_instructions) {
+  CHECK_EQ(HloOpcode::kFusion, instruction->opcode());
+  for (auto* fused : instruction->fused_instructions()) {
+    if (fused->opcode() == HloOpcode::kFusion) {
+      GatherFusionInstructions(fused, fusion_instructions);
+    }
+  }
+  fusion_instructions->push_back(instruction);
+}
+
+}  // namespace
+
 /* static */ StatusOr<std::unique_ptr<LogicalBufferAnalysis>>
 LogicalBufferAnalysis::Run(const HloModule* module) {
   std::unique_ptr<LogicalBufferAnalysis> analysis(
@@ -41,15 +58,19 @@ Status LogicalBufferAnalysis::Analyze() {
   // We filter out fusion computations, and get to them through fusion
   // instructions. This is because it's possible to have orphaned (unreachable)
   // fusion computations, and we don't want to try to assign buffers to those.
+  std::vector<HloInstruction*> fusion_instructions;
   for (auto* computation : module_->MakeNonfusionComputations()) {
     TF_RETURN_IF_ERROR(computation->Accept(this));
     for (auto* instruction : computation->instructions()) {
       if (instruction->opcode() != HloOpcode::kFusion) {
         continue;
       }
-      TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
+      GatherFusionInstructions(instruction, &fusion_instructions);
     }
   }
+  for (auto* instruction : fusion_instructions) {
+    TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
+  }
   return Status::OK();
 }
 
@@ -104,6 +125,21 @@ Status LogicalBufferAnalysis::HandleBitcast(HloInstruction*) {
   return Status::OK();
 }
 
+Status LogicalBufferAnalysis::HandleRecvDone(HloInstruction*) {
+  // RecvDone doesn't create a new buffer but rather aliases its input (Recv)
+  // tuple element at {0} to its output.
+  return Status::OK();
+}
+
+Status LogicalBufferAnalysis::HandleSend(HloInstruction* send) {
+  // Send creates new buffers for the top-level tuple and the context (tuple
+  // element at {1}). Tuple element at {0} is an alias of the Send operand, so
+  // we don't need to create a new Logical Buffer for that.
+  NewLogicalBuffer(send, /*index=*/{});
+  NewLogicalBuffer(send, /*index=*/{1});
+  return Status::OK();
+}
+
 Status LogicalBufferAnalysis::HandleTuple(HloInstruction* tuple) {
   // A Tuple instruction only creates the top-level buffer.
   NewLogicalBuffer(tuple, /*index=*/{});
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index a82e83ec5c3d2b0e011d85f3d03bea8fca870154..598d08b7203b25b194dfc3b3125ec58c96b2cd4c 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -60,6 +60,8 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleCopy(HloInstruction* copy) override;
+  Status HandleRecvDone(HloInstruction* recv_done) override;
+  Status HandleSend(HloInstruction* send) override;
   Status HandleSelect(HloInstruction* select) override;
 
   // A map from the buffer ID to the logical buffer
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index a0d08c288dbcc45e83a36ce7b094b04a9dbae532..7d8c05fffa4ab11d7dbf9956d2cb7ebd5bcdd3c4 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -17,12 +17,44 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
+namespace {
+
+bool IsAllowed(char character) {
+  auto c = static_cast<unsigned char>(character);
+  return (isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
+}
+
+}  // namespace
+
+NameUniquer::NameUniquer(const string& separator) {
+  CHECK(std::all_of(separator.begin(), separator.end(), IsAllowed))
+      << "separator should comprises allowed characters only";
+  separator_ = separator;
+}
+
+/*static*/ string NameUniquer::GetSanitizedName(const string& name) {
+  string result = name;
+  CHECK(!result.empty()) << "name should not be empty";
+  char c = static_cast<unsigned char>(result[0]);
+  if (!isalpha(c) && c != '_') {
+    result[0] = '_';
+  }
+  for (int i = 1; i < result.length(); i++) {
+    if (!IsAllowed(result[i])) {
+      result[i] = '_';
+    }
+  }
+  return result;
+}
+
 string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) {
   string root = prefix.empty() ? "name" : prefix.ToString();
+  root = GetSanitizedName(root);
 
   // Strip away numeric suffix (if any). Only recognize separator if it is in
   // the middle of the name.
diff --git a/tensorflow/compiler/xla/service/name_uniquer.h b/tensorflow/compiler/xla/service/name_uniquer.h
index ed379b52258463b960dea788721c2c4325ef0260..4139c2700b25e8600182a034a8ac6f4f041c12e6 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.h
+++ b/tensorflow/compiler/xla/service/name_uniquer.h
@@ -28,14 +28,21 @@ namespace xla {
 // Simple stateful class that helps generate "unique" names. To use it, simply
 // call GetUniqueName as many times as needed. The names returned by
 // GetUniqueName are guaranteed to be distinct for this instance of the class.
+// Note that the names will be sanitized to match regexp
+// "[a-zA-Z_][a-zA-Z0-9_.-]*".
 class NameUniquer {
  public:
-  explicit NameUniquer(const string& separator = "__")
-      : separator_(separator) {}
+  // The separator must contain allowed characters only: "[a-zA-Z0-9_.-]".
+  explicit NameUniquer(const string& separator = "__");
 
-  // Get a unique name in a string, with an optional prefix for convenience.
+  // Get a sanitized unique name in a string, with an optional prefix for
+  // convenience.
   string GetUniqueName(tensorflow::StringPiece prefix = "");
 
+  // Sanitizes and returns the name. Unallowed characters will be replaced with
+  // '_'. The result will match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*".
+  static string GetSanitizedName(const string& name);
+
  private:
   // The string to use to separate the prefix of the name from the uniquing
   // integer value.
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 9f0747a6e2175a968d8f3661ac51512009e86f29..4258cf16876ab46dce6df062ab701b1b1a4a7580 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -60,12 +60,30 @@ TEST_F(NameUniquerTest, NumericSuffixes) {
   EXPECT_EQ("bar", uniquer.GetUniqueName("bar.-1000"));
   EXPECT_EQ("bar.1", uniquer.GetUniqueName("bar.-2000"));
   EXPECT_EQ("bar.2", uniquer.GetUniqueName("bar.1"));
+}
+
+TEST_F(NameUniquerTest, Sanitize) {
+  NameUniquer uniquer("_");
+
+  EXPECT_EQ("foo", uniquer.GetUniqueName("foo"));
+  EXPECT_EQ("foo_1", uniquer.GetUniqueName("foo"));
+  EXPECT_EQ("foo.54", uniquer.GetUniqueName("foo.54"));
+  EXPECT_EQ("foo_54", uniquer.GetUniqueName("foo_54"));
+  EXPECT_EQ("foo_54.1", uniquer.GetUniqueName("foo_54.1"));
+  EXPECT_EQ("foo_55", uniquer.GetUniqueName("foo"));
+
+  // Invalid characters will be replaced with '_'.
+  EXPECT_EQ("bar", uniquer.GetUniqueName("bar<-1000"));
+  EXPECT_EQ("bar_1", uniquer.GetUniqueName("bar<-2000"));
+  EXPECT_EQ("bar_2", uniquer.GetUniqueName("bar_1"));
 
   // Separator is only recognized in the middle of the prefix.
-  EXPECT_EQ(".10", uniquer.GetUniqueName(".10"));
-  EXPECT_EQ(".10.1", uniquer.GetUniqueName(".10"));
-  EXPECT_EQ("foobar.", uniquer.GetUniqueName("foobar."));
-  EXPECT_EQ("foobar..1", uniquer.GetUniqueName("foobar."));
+  EXPECT_EQ("_10", uniquer.GetUniqueName(
+                       ".10"));  // the leading '.' is replaced with '_'.
+  EXPECT_EQ("_10_1", uniquer.GetUniqueName(".10"));
+  EXPECT_EQ("_10_2", uniquer.GetUniqueName("_10"));
+  EXPECT_EQ("foobar_", uniquer.GetUniqueName("foobar_"));
+  EXPECT_EQ("foobar__1", uniquer.GetUniqueName("foobar_"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index 3a1818de82d3fd305e2c6b3bd1f2cf8125806a75..aa974ee61a27de9c19e97d8a6eb48f9261ce4bd9 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -33,10 +33,32 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+using tensorflow::str_util::Lowercase;
+
 // Minimum supported CUDA compute capability is 3.5.
 constexpr int kMinCudaComputeCapabilityMajor = 3;
 constexpr int kMinCudaComputeCapabilityMinor = 5;
 
+// The name of the interpreter platform.
+constexpr char kInterpreter[] = "interpreter";
+
+namespace {
+
+string CanonicalPlatformName(const string& name) {
+  string platform_str = Lowercase(name);
+  // "cpu" and "host" mean the same thing.
+  if (platform_str == "cpu") {
+    platform_str = "host";
+  }
+  // "gpu" and "cuda" mean the same thing.
+  if (platform_str == "gpu") {
+    platform_str = "cuda";
+  }
+  return platform_str;
+}
+
+}  // namespace
+
 /* static */ StatusOr<std::vector<se::Platform*>>
 PlatformUtil::GetSupportedPlatforms() {
   se::MultiPlatformManager::PlatformMap platform_map;
@@ -78,7 +100,7 @@ PlatformUtil::GetSupportedPlatforms() {
   return platforms;
 }
 
-/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
+/* static */ StatusOr<se::Platform*> PlatformUtil::GetSolePlatform() {
   TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());
   if (platforms.empty()) {
     return NotFound("no platforms found");
@@ -87,13 +109,77 @@ PlatformUtil::GetSupportedPlatforms() {
   }
 
   // Multiple platforms present and we can't pick a reasonable default.
-  auto l = [](string* out, const se::Platform* p) { out->append(p->Name()); };
-  string platforms_string = tensorflow::str_util::Join(platforms, ", ", l);
+  string platforms_string = tensorflow::str_util::Join(
+      platforms, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
   return InvalidArgument(
       "must specify platform because more than one platform found: %s",
       platforms_string.c_str());
 }
 
+/* static */ StatusOr<se::Platform*> PlatformUtil::GetDefaultPlatform() {
+  TF_ASSIGN_OR_RETURN(auto platforms, GetSupportedPlatforms());
+  if (platforms.empty()) {
+    return NotFound("no platforms found");
+  } else if (platforms.size() == 1) {
+    return platforms[0];
+  } else if (platforms.size() == 2) {
+    for (int i = 0; i < 2; i++) {
+      if (Lowercase(platforms[i]->Name()) == kInterpreter &&
+          Lowercase(platforms[1 - i]->Name()) != kInterpreter) {
+        return platforms[1 - i];
+      }
+    }
+  }
+
+  // Multiple platforms present and we can't pick a reasonable default.
+  string platforms_string = tensorflow::str_util::Join(
+      platforms, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
+  return InvalidArgument(
+      "must specify platform because more than one platform (except for the "
+      "interpreter platform) found: %s",
+      platforms_string.c_str());
+}
+
+/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatform(
+    const string& platform_name) {
+  string platform_str = CanonicalPlatformName(platform_name);
+  TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
+  for (se::Platform* platform : platforms) {
+    if (Lowercase(platform->Name()) == platform_str) {
+      return platform;
+    }
+  }
+  return InvalidArgument("platform %s not found", platform_name.c_str());
+}
+
+/*static*/ StatusOr<se::Platform*> PlatformUtil::GetPlatformExceptFor(
+    const string& platform_name) {
+  string platform_str = CanonicalPlatformName(platform_name);
+
+  TF_ASSIGN_OR_RETURN(auto platforms, PlatformUtil::GetSupportedPlatforms());
+  std::vector<se::Platform*> matched;
+  for (se::Platform* platform : platforms) {
+    if (Lowercase(platform->Name()) != platform_name) {
+      matched.push_back(platform);
+    }
+  }
+  if (matched.empty()) {
+    return InvalidArgument("unable to find platform that is not %s",
+                           platform_name.c_str());
+  }
+  if (matched.size() == 1) {
+    return matched[0];
+  }
+  string matched_string = tensorflow::str_util::Join(
+      matched, ", ",
+      [](string* out, const se::Platform* p) { out->append(p->Name()); });
+  return InvalidArgument(
+      "found multiple platforms %s, but expected one platform except for %s",
+      matched_string.c_str(), platform_name.c_str());
+}
+
 // Returns whether the device underlying the given StreamExecutor is supported
 // by XLA.
 static bool IsDeviceSupported(se::StreamExecutor* executor) {
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index eac573703085aca2801885cd9abbe0022f1c029e..69188820a70707d9c9be10b20fb7de92ad4d9873 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -16,11 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PLATFORM_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_PLATFORM_UTIL_H_
 
+#include <string>
 #include <vector>
 
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace xla {
 
@@ -34,10 +37,27 @@ class PlatformUtil {
   static StatusOr<std::vector<perftools::gputools::Platform*>>
   GetSupportedPlatforms();
 
-  // Convenience function which returns the default supported platform. If
+  // Convenience function which returns the default supported platform for
+  // tests. If exactly one supported platform is present, then this platform is
+  // the default platform. If exactly two platforms are present and one of them
+  // is the interpreter platform, then the other platform is the default
+  // platform. Otherwise returns an error.
+  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+
+  // Convenience function which returns the sole supported platform. If
   // exactly one supported platform is present, then this platform is the
   // default platform. Otherwise returns an error.
-  static StatusOr<perftools::gputools::Platform*> GetDefaultPlatform();
+  static StatusOr<perftools::gputools::Platform*> GetSolePlatform();
+
+  // Returns the platform according to the given name. Returns error if there is
+  // no such platform.
+  static StatusOr<perftools::gputools::Platform*> GetPlatform(
+      const string& platform_name);
+
+  // Returns exactly one platform that does not have given name. Returns error
+  // if there is no such platform, or there are multiple such platforms.
+  static StatusOr<perftools::gputools::Platform*> GetPlatformExceptFor(
+      const string& platform_name);
 
   // Returns a vector of StreamExecutors for the given platform. The vector is
   // indexed by device ordinal (device numbering used by StreamExecutor). If an
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index bac33d8102e07766531a4ce6eac77aff4971bfef..fe6993db983ef66f5de5a8eee1ed277318a7f7ee 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -430,9 +430,12 @@ StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
                                           /*include_unreachable_instructions=*/
                                           true));
 
+  TF_ASSIGN_OR_RETURN(
+      module, backend->compiler()->RunHloPasses(std::move(module), executor));
+
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      backend->compiler()->Compile(std::move(module), executor));
+      backend->compiler()->RunBackend(std::move(module), executor));
 
   if (!other_directory_path.empty()) {
     executable->set_session_module(std::move(session_module));
@@ -490,14 +493,20 @@ Service::ExecuteParallelAndRegisterResult(
         std::vector<perftools::gputools::DeviceMemoryBase>>
         arguments,
     Backend* backend, tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
-    tensorflow::gtl::ArraySlice<string> result_tags) {
+    tensorflow::gtl::ArraySlice<string> result_tags,
+    ExecutionProfile* profile) {
   // Streams where the computation are launched, so we can wait on the streams
   // to complete.
   std::vector<Pool<se::Stream>::SmartPtr> streams;
+  std::vector<std::unique_ptr<perftools::gputools::Timer>> timers;
 
   // Global data handles for the computation results, one for each computation.
   std::vector<GlobalDataHandle> result_handles;
 
+  // Device ID to stream executor, populated only with devices that are being
+  // profiled.
+  std::map<int64, se::Stream*> index_to_profiled_streams;
+
   TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
                       backend->computation_placer()->AssignDevices(
                           options_.number_of_replicas(), executables.size()));
@@ -510,6 +519,21 @@ Service::ExecuteParallelAndRegisterResult(
                           backend->BorrowStream(replicas[replica]));
       streams.push_back(std::move(stream));
 
+      if (replica == 0 && profile != nullptr) {
+        timers.emplace_back(
+            new perftools::gputools::Timer(streams.back()->parent()));
+        streams.back()
+            ->InitTimer(timers.back().get())
+            .ThenStartTimer(timers.back().get());
+        CHECK(timers.front() != nullptr);
+      }
+
+      if (replica == 0 &&
+          executables[i]->module_config().debug_options().xla_hlo_profile() &&
+          executables[i]->hlo_profiling_enabled()) {
+        index_to_profiled_streams[i] = streams.back().get();
+      }
+
       // Set up run options.
       ExecutableRunOptions options;
       options.set_stream(streams.back().get());
@@ -526,6 +550,10 @@ Service::ExecuteParallelAndRegisterResult(
           perftools::gputools::DeviceMemoryBase result,
           executables[i]->ExecuteAsyncOnStream(&run_options, arguments[i]));
 
+      if (replica == 0 && profile != nullptr) {
+        streams.back()->ThenStopTimer(timers.back().get());
+      }
+
       // All replicas share the same device address for the result allocation,
       // so only one of the replicas need to register the result handle.
       if (replica == 0) {
@@ -543,6 +571,55 @@ Service::ExecuteParallelAndRegisterResult(
     }
   }
 
+  // For every stream that had profiling enabled, obtain and debug-dump the HLO
+  // profile.
+  for (auto& index_to_profiled_stream : index_to_profiled_streams) {
+    int64 device = index_to_profiled_stream.first;
+    se::Stream* stream = index_to_profiled_stream.second;
+    Executable* executable = executables[device];
+    const HloModule& module = executable->module();
+    HloExecutionProfile hlo_profile(&executable->hlo_profile_printer(),
+                                    &executable->hlo_profile_index_map());
+    TF_RETURN_IF_ERROR(
+        executable->PopulateExecutionProfile(&hlo_profile, stream->parent()));
+    XLA_LOG_LINES(
+        tensorflow::INFO,
+        hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));
+    hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute",
+                                         &hlo_profile);
+  }
+
+  if (profile != nullptr) {
+    CHECK(!timers.empty());
+    std::vector<uint64> timer_nanoseconds;
+    timer_nanoseconds.reserve(timers.size());
+    for (auto& timer : timers) {
+      timer_nanoseconds.push_back(timer->Nanoseconds());
+    }
+    uint64 nanoseconds =
+        *std::max_element(timer_nanoseconds.begin(), timer_nanoseconds.end());
+
+    // Merge in run-time profile information from execution_profile on the
+    // zeroth device.
+    profile->MergeFrom(executables[0]->execution_profile());
+
+    // Overall execution time (in nanoseconds) from the executor timer.
+    profile->set_compute_and_transfer_time_ns(nanoseconds);
+
+    // TODO(b/28123297): On GPU we end up including transfer time in
+    // the compute time this way. Instead, we should get the correct
+    // value by measuring it. Setting the field here at least lets
+    // benchmarks provide *some* value for GPU computations.
+    //
+    // TODO(b/28447609): The value in compute_and_transfer_time_ns is actually
+    // the compute time without the transfer time, so this way we get the
+    // correct compute time. We should instead have the correct value for
+    // compute_and_transfer_time and set compute_time to the compute time.
+    if (profile->compute_time_ns() == 0) {
+      profile->set_compute_time_ns(profile->compute_and_transfer_time_ns());
+    }
+  }
+
   return result_handles;
 }
 
@@ -589,6 +666,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
         result, executable->ExecuteOnStreamWrapper<se::DeviceMemoryBase>(
                     &run_options[0], profile, arguments));
   } else {
+    // TODO(b/69985541): Support profiling also on this path.
     std::vector<
         tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>>
         repeated_arguments(options_.number_of_replicas(), arguments);
@@ -715,14 +793,16 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
 
   // Execute the generated executables in parallel and return the device
   // handles for each computation's output.
+  ExecutionProfile profile;
   TF_ASSIGN_OR_RETURN(
       std::vector<GlobalDataHandle> outputs,
       ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
                                        execute_backend_.get(), device_handles,
-                                       computation_names));
+                                       computation_names, &profile));
   for (const GlobalDataHandle& output : outputs) {
     ExecuteResponse response;
     *response.mutable_output() = output;
+    *response.mutable_profile() = profile;
     *result->add_responses() = response;
   }
 
@@ -963,18 +1043,29 @@ tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg,
   return tensorflow::Status::OK();
 }
 
+namespace {
+
+// Creates a clone of the given shaped buffer with the given device ordinal. The
+// shape and DeviceMemoryBase values of the clone are identical to the original.
+std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
+    const ShapedBuffer& shaped_buffer, int device_ordinal) {
+  auto clone = MakeUnique<ShapedBuffer>(
+      shaped_buffer.shape(), shaped_buffer.platform(), device_ordinal);
+  ShapeUtil::ForEachSubshape(
+      shaped_buffer.shape(), [&clone, &shaped_buffer](const Shape& /*subshape*/,
+                                                      const ShapeIndex& index) {
+        clone->AddBufferAtIndex(shaped_buffer.buffer(index), index);
+      });
+  return clone;
+}
+
+}  // namespace
+
 tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
                                              TransferToServerResponse* result) {
   Literal literal = Literal(arg->literal());
   const Shape& shape = literal.shape();
 
-  if (ShapeUtil::IsTuple(shape) && options_.number_of_replicas() > 1) {
-    // TODO(b/32990684): Tuple transfers to host end up allocating further
-    // buffers - implement that correctly.
-    return Unimplemented(
-        "Tuple transfers to the device not supported with replication.");
-  }
-
   std::vector<se::StreamExecutor*> replicas;
   if (arg->has_device_handle()) {
     TF_ASSIGN_OR_RETURN(replicas,
@@ -984,24 +1075,45 @@ tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg,
         replicas, Replicas(*execute_backend_, SingleComputationDeviceHandle()));
   }
 
-  // Allocate memory on the device, using the stream executor. The size of the
-  // allocation is obtained by examining the shape of the literal passed from
-  // the client. An allocation handle is returned in the response.
-  int64 allocation_size =
-      execute_backend_->transfer_manager()->GetByteSizeRequirement(shape);
-
-  TF_ASSIGN_OR_RETURN(se::DeviceMemoryBase allocation,
-                      execute_backend_->memory_allocator()->Allocate(
-                          replicas[0]->device_ordinal(), allocation_size));
-
+  // All memory allocation is done on the first replica. The allocations in all
+  // other replicas mirror the firsts'.
+  int master_device_ordinal = replicas[0]->device_ordinal();
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> shaped_buffer,
+      ShapedBuffer::Allocate(
+          execute_backend_->transfer_manager()->HostShapeToDeviceShape(shape),
+          execute_backend_->memory_allocator(), master_device_ordinal,
+          [this](const Shape& shape) {
+            return execute_backend_->transfer_manager()->GetByteSizeRequirement(
+                shape);
+          }));
+
+  // The allocation tracker only keeps track of the top-level buffer of the
+  // shape so pass in the buffer at shape index {}.
+  // TODO(b/37515654): Allocation tracker should hold a ShapedBuffer.
   *result->mutable_data() = allocation_tracker_.Register(
-      execute_backend_.get(), replicas[0]->device_ordinal(), allocation, shape,
-      StrCat("TransferToServer literal of size ", allocation_size));
+      execute_backend_.get(), master_device_ordinal,
+      shaped_buffer->buffer(/*index=*/{}), shape,
+      StrCat("TransferToServer literal of shape ",
+             ShapeUtil::HumanString(shape)));
 
+  // Transfer the data to the replicas.
   for (se::StreamExecutor* executor : replicas) {
-    TF_RETURN_IF_ERROR(
-        execute_backend_->transfer_manager()->TransferLiteralToDevice(
-            executor, literal, &allocation));
+    if (executor->device_ordinal() == master_device_ordinal) {
+      TF_RETURN_IF_ERROR(
+          execute_backend_->transfer_manager()->TransferLiteralToDevice(
+              executor, literal, *shaped_buffer));
+    } else {
+      // The replica is not the master. Create an cloned shaped buffer with
+      // the replica's device ordinal. This is required because
+      // TransferLiteralToDevice verifies that the device ordinal of the shaped
+      // buffer matches that of the executor.
+      std::unique_ptr<ShapedBuffer> clone =
+          CloneShapedBufferOnDevice(*shaped_buffer, executor->device_ordinal());
+      TF_RETURN_IF_ERROR(
+          execute_backend_->transfer_manager()->TransferLiteralToDevice(
+              executor, literal, *clone));
+    }
   }
   return tensorflow::Status::OK();
 }
@@ -1082,8 +1194,9 @@ tensorflow::Status Service::IsConstant(const IsConstantRequest* arg,
     return InvalidArgument("computations may not be empty");
   }
 
-  TF_ASSIGN_OR_RETURN(bool is_constant,
-                      user_computation->IsConstant(arg->operand()));
+  TF_ASSIGN_OR_RETURN(
+      bool is_constant,
+      user_computation->IsConstant(arg->operand(), arg->num_parameters()));
 
   result->set_is_constant(is_constant);
   return tensorflow::Status::OK();
@@ -1101,8 +1214,9 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
     return InvalidArgument("computations may not be empty");
   }
 
-  TF_ASSIGN_OR_RETURN(bool is_constant,
-                      user_computation->IsConstant(arg->operand()));
+  TF_ASSIGN_OR_RETURN(
+      bool is_constant,
+      user_computation->IsConstant(arg->operand(), arg->parameters_size()));
   if (!is_constant) {
     return InvalidArgument("Operand to ComputeConstant depends on parameter.");
   }
@@ -1141,8 +1255,18 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
                                           /*include_unreachable_instructions=*/
                                           false));
 
+  std::vector<Literal> parameters(arg->parameters_size());
+  for (int64 i = 0; i < arg->parameters_size(); ++i) {
+    parameters[i] = Literal(arg->parameters(i));
+  }
+  std::vector<const Literal*> parameter_ptrs;
+  std::transform(parameters.begin(), parameters.end(),
+                 std::back_inserter(parameter_ptrs),
+                 [](const Literal& literal) { return &literal; });
+
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate(*module, {}));
+  TF_ASSIGN_OR_RETURN(auto result_literal,
+                      evaluator.Evaluate(*module, parameter_ptrs));
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
   if (arg->has_output_layout()) {
@@ -1266,6 +1390,17 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddConcatenateInstruction(arg->concatenate_request());
       break;
+    case OpRequest::kConditionalRequest: {
+      TF_ASSIGN_OR_RETURN(UserComputation * true_computation,
+                          computation_tracker_.Resolve(
+                              arg->conditional_request().true_computation()));
+      TF_ASSIGN_OR_RETURN(UserComputation * false_computation,
+                          computation_tracker_.Resolve(
+                              arg->conditional_request().false_computation()));
+      handle_status = computation->AddConditionalInstruction(
+          arg->conditional_request(), *true_computation, *false_computation);
+      break;
+    }
     case OpRequest::kConstantRequest:
       handle_status =
           computation->AddConstantInstruction(arg->constant_request());
@@ -1274,6 +1409,10 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddConvertInstruction(arg->convert_request());
       break;
+    case OpRequest::kBitcastConvertRequest:
+      handle_status = computation->AddBitcastConvertInstruction(
+          arg->bitcast_convert_request());
+      break;
     case OpRequest::kConvolveRequest:
       handle_status =
           computation->AddConvolveInstruction(arg->convolve_request());
@@ -1286,6 +1425,9 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status =
           computation->AddCustomCallInstruction(arg->custom_call_request());
       break;
+    case OpRequest::kDotRequest:
+      handle_status = computation->AddDotInstruction(arg->dot_request());
+      break;
     case OpRequest::kDynamicSliceRequest:
       handle_status =
           computation->AddDynamicSliceInstruction(arg->dynamic_slice_request());
@@ -1406,8 +1548,12 @@ tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) {
       handle_status = computation->AddRecvInstruction(arg->recv_request());
       break;
     }
+    case OpRequest::kFftRequest:
+      return Unimplemented("FftRequest not implemented in XLA service.");
+    case OpRequest::OP_NOT_SET:
+      return InvalidArgument("XLA service received OpRequest with OP_NOT_SET");
     default:
-      return InvalidArgument("Unsupported operation");
+      return InvalidArgument("Unsupported operation in XLA service");
   }
   TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status);
 
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 2452259f736054b5bf1f03fc5103d65eded7f398..47f4f0ade594089aa71717ef1e122886b0a6c7ac 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -272,8 +272,6 @@ class Service : public ServiceInterface {
 
   // Create a Hlo module config for the given program shape and arguments.
   // execution_options is optional; if not given a default is used.
-  // has_hybrid_result is used to initialize the same-named field in
-  // HloModuleConfig -- see that class for documentation.
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
@@ -327,7 +325,8 @@ class Service : public ServiceInterface {
           arguments,
       Backend* backend,
       tensorflow::gtl::ArraySlice<DeviceHandle> device_handles,
-      tensorflow::gtl::ArraySlice<string> result_tags);
+      tensorflow::gtl::ArraySlice<string> result_tags,
+      ExecutionProfile* profile);
 
   // Convenience function for adding a function to a user computation.
   template <typename RequestT, typename ResponseT>
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 791d17365b1d756714b5feb0439e6919d9f23edc..9c1b951d017569a6dc89bc6583c72b5e42f0c07c 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -29,8 +29,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -89,8 +91,6 @@ BinaryOperation OpcodeToBinaryOperation(HloOpcode opcode) {
       return BINOP_ATAN2;
     case HloOpcode::kComplex:
       return BINOP_COMPLEX;
-    case HloOpcode::kDot:
-      return BINOP_DOT;
     case HloOpcode::kMultiply:
       return BINOP_MUL;
     case HloOpcode::kAdd:
@@ -440,6 +440,37 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
 /* static */ StatusOr<Shape> ShapeInference::InferConvertShape(
     const Shape& operand_shape, PrimitiveType new_element_type) {
+  auto old_element_type = operand_shape.element_type();
+  if (primitive_util::IsComplexType(old_element_type) &&
+      !primitive_util::IsComplexType(new_element_type)) {
+    return Unimplemented(
+        "Unsupported conversion from complex to real type: %s => %s",
+        ShapeUtil::HumanString(operand_shape).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
+  if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) {
+    // Note: we may want to support tuple conversions via this operation in the
+    // future, by recursing into the tuple elements to check all sub-conversions
+    // are valid. For now we just reject them, though.
+    return InvalidArgument(
+        "cannot convert from or to tuple type; requested conversion: %s => %s",
+        ShapeUtil::HumanString(operand_shape).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
+
+  return ShapeUtil::ChangeElementType(operand_shape, new_element_type);
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferBitcastConvertShape(
+    const Shape& operand_shape, PrimitiveType new_element_type) {
+  auto old_element_type = operand_shape.element_type();
+  if (primitive_util::IsComplexType(old_element_type) !=
+      primitive_util::IsComplexType(new_element_type)) {
+    return Unimplemented(
+        "Unsupported conversion between real and complex types: %s => %s",
+        ShapeUtil::HumanString(operand_shape).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
   if (ShapeUtil::IsTuple(operand_shape) || new_element_type == TUPLE) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
@@ -449,6 +480,13 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
         ShapeUtil::HumanString(operand_shape).c_str(),
         PrimitiveType_Name(new_element_type).c_str());
   }
+  if (primitive_util::BitWidth(old_element_type) !=
+      primitive_util::BitWidth(new_element_type)) {
+    return InvalidArgument(
+        "cannot bitcast types with different bit-widths: %s => %s",
+        PrimitiveType_Name(old_element_type).c_str(),
+        PrimitiveType_Name(new_element_type).c_str());
+  }
 
   return ShapeUtil::ChangeElementType(operand_shape, new_element_type);
 }
@@ -510,8 +548,113 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::MakeShape(operand_shape.element_type(), dimensions);
 }
 
-/* static */ StatusOr<Shape> ShapeInference::InferDotOpShape(const Shape& lhs,
-                                                             const Shape& rhs) {
+// Current DotDimensionNumbers Requirements:
+//
+// Contracting Dimensions:
+// *) Exactly one contracting dimension on both lhs and rhs.
+// *) Contracting dimension size must be the same on both lhs and rhs.
+// *) Contracting dimension numbers do not need to be the same (i.e. transposes
+//    are passed on to emitter implementations).
+//
+// Batch Dimensions:
+// *) Same number of batch dimensions on both lhs and rhs.
+// *) Same batch dimension numbers (and sizes) on both lhs and rhs.
+// *) Batch dimension numbers must be ordered before contracting and
+//    non-contracting/non-batch dimension numbers.
+//
+// Non-Contracting-Non-Batch Dimensions:
+// *) Can be 0 (matrix-vector) or 1 (matrix-matrix).
+//
+
+namespace {
+
+Status ValidateDotDimensionNumbers(
+    const Shape& lhs, const Shape& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
+  // Check that dimension numbers are in range.
+  auto dims_in_range =
+      [](const int64 rank, tensorflow::gtl::ArraySlice<int64> contracting_dims,
+         tensorflow::gtl::ArraySlice<int64> batch_dims) -> bool {
+    auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; };
+    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
+                       in_range) &&
+           std::all_of(batch_dims.begin(), batch_dims.end(), in_range);
+  };
+
+  tensorflow::gtl::ArraySlice<int64> lhs_contracting_dimensions =
+      AsInt64Slice(dimension_numbers.lhs_contracting_dimensions());
+  tensorflow::gtl::ArraySlice<int64> rhs_contracting_dimensions =
+      AsInt64Slice(dimension_numbers.rhs_contracting_dimensions());
+  tensorflow::gtl::ArraySlice<int64> lhs_batch_dimensions =
+      AsInt64Slice(dimension_numbers.lhs_batch_dimensions());
+  tensorflow::gtl::ArraySlice<int64> rhs_batch_dimensions =
+      AsInt64Slice(dimension_numbers.rhs_batch_dimensions());
+
+  if (!dims_in_range(ShapeUtil::Rank(lhs), lhs_contracting_dimensions,
+                     lhs_batch_dimensions) ||
+      !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions,
+                     rhs_batch_dimensions)) {
+    return InvalidArgument("A dimension number is out of range in dot: %s",
+                           dimension_numbers.DebugString().c_str());
+  }
+
+  // Check that dimension numbers are unique.
+  auto dims_unique = [](tensorflow::gtl::ArraySlice<int64> contracting_dims,
+                        tensorflow::gtl::ArraySlice<int64> batch_dims) -> bool {
+    tensorflow::gtl::FlatSet<int64> dim_set;
+    auto is_unique = [&dim_set](int64 i) -> bool {
+      return dim_set.insert(i).second;
+    };
+    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
+                       is_unique) &&
+           std::all_of(batch_dims.begin(), batch_dims.end(), is_unique);
+  };
+
+  if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
+      !dims_unique(rhs_contracting_dimensions, rhs_batch_dimensions)) {
+    return InvalidArgument("A dimension number is not unique in dot: %s",
+                           dimension_numbers.DebugString().c_str());
+  }
+
+  // Check that the count of non-contracting-non-batch dimensions is in {0, 1}.
+  const int64 lhs_non_contracting_non_batch_dims =
+      ShapeUtil::Rank(lhs) -
+      dimension_numbers.lhs_contracting_dimensions_size() -
+      dimension_numbers.lhs_batch_dimensions_size();
+  const int64 rhs_non_contracting_non_batch_dims =
+      ShapeUtil::Rank(rhs) -
+      dimension_numbers.rhs_contracting_dimensions_size() -
+      dimension_numbers.rhs_batch_dimensions_size();
+  if (lhs_non_contracting_non_batch_dims < 0 ||
+      lhs_non_contracting_non_batch_dims > 1 ||
+      rhs_non_contracting_non_batch_dims < 0 ||
+      rhs_non_contracting_non_batch_dims > 1) {
+    return InvalidArgument(
+        "batch and contracting dimension number mismatch "
+        "with rank ");
+  }
+
+  // Check that batch dimension numbers are ordered before all others, and
+  // that they are monotonically increasing.
+  std::vector<int64> batch_dim_numbers(lhs_batch_dimensions.size());
+  std::iota(batch_dim_numbers.begin(), batch_dim_numbers.end(), 0);
+  if (!std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
+                  lhs_batch_dimensions.begin()) ||
+      !std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
+                  rhs_batch_dimensions.begin())) {
+    return InvalidArgument(
+        "batch dimension numbers must precede non-batch dimensions and be"
+        "monotonically increasing.");
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+/* static */ StatusOr<Shape> ShapeInference::InferDotOpShape(
+    const Shape& lhs, const Shape& rhs,
+    const DotDimensionNumbers& dimension_numbers) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of dot"));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of dot"));
 
@@ -531,37 +674,62 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return fail("element types do not match");
   }
 
-  if (ShapeUtil::Rank(lhs) < 1 || ShapeUtil::Rank(lhs) > 2 ||
-      ShapeUtil::Rank(rhs) < 1 || ShapeUtil::Rank(rhs) > 2) {
-    return fail("dot only supports rank 1 or 2");
+  if ((ShapeUtil::Rank(lhs) < 1) || (ShapeUtil::Rank(rhs) < 1)) {
+    return fail("dot only supports rank 1 or above.");
+  }
+
+  // Validate basic properties of dot dimension numbers.
+  TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers));
+
+  // Check that there is only one contracting dimension for both lhs and rhs.
+  if (dimension_numbers.lhs_contracting_dimensions_size() !=
+          dimension_numbers.rhs_contracting_dimensions_size() ||
+      dimension_numbers.lhs_contracting_dimensions_size() != 1) {
+    return fail("must specify one contracting dimension for both lhs and rhs.");
+  }
+
+  // Check that contracting dimension sizes match.
+  const int64 lhs_contracting_dimension =
+      dimension_numbers.lhs_contracting_dimensions(0);
+  const int64 rhs_contracting_dimension =
+      dimension_numbers.rhs_contracting_dimensions(0);
+  if (lhs.dimensions(lhs_contracting_dimension) !=
+      rhs.dimensions(rhs_contracting_dimension)) {
+    return fail("contracting dimension sizes do not match.");
   }
 
-  // Determine the index of the contracted dimensions for input tensors.
-  // dimensions -1 of lhs and dimension 0 of rhs are contracted.
-  int64 lhs_contracted_dimension = ShapeUtil::GetDimensionNumber(lhs, -1);
-  int64 rhs_contracted_dimension = 0;
+  // Check that number of batch dimensions match.
+  if (dimension_numbers.lhs_batch_dimensions_size() !=
+      dimension_numbers.rhs_batch_dimensions_size()) {
+    return fail("must the same number of batch dimensions for lhs and rhs.");
+  }
 
-  // Check if the contracted dimension sizes are the same.
-  if ((lhs_contracted_dimension < ShapeUtil::Rank(lhs) &&
-       rhs_contracted_dimension < ShapeUtil::Rank(rhs)) &&
-      lhs.dimensions(lhs_contracted_dimension) !=
-          rhs.dimensions(rhs_contracted_dimension)) {
-    return fail("contracted dimensions mismatch");
+  // Check that batch dimension numbers and sizes match.
+  for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) {
+    if (dimension_numbers.lhs_batch_dimensions(i) !=
+            dimension_numbers.rhs_batch_dimensions(i) ||
+        lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
+      return fail("batch dimension numbers and sizes must match for lhs/rhs.");
+    }
   }
 
   // The ranks of lhs and rhs are decremented by 1 respectively due to the
   // contraction, and added for the rank of the result. When an input tensor is
   // a scalar, its contribution to the rank of the result is 0.
   // Generate the result dimensions in order, rhs dimensions followed by lhs
-  // dimensions except the contracted dimensions.
+  // dimensions except the contracted and batch dimensions.
   std::vector<int64> dimensions;
+  std::unordered_set<int64> rhs_batch_dims(
+      dimension_numbers.rhs_batch_dimensions().begin(),
+      dimension_numbers.rhs_batch_dimensions().end());
   for (int64 i = 0; i < ShapeUtil::Rank(lhs); i++) {
-    if (i != lhs_contracted_dimension) {
+    if (i != lhs_contracting_dimension) {
       dimensions.push_back(lhs.dimensions(i));
     }
   }
   for (int64 i = 0; i < ShapeUtil::Rank(rhs); i++) {
-    if (i != rhs_contracted_dimension) {
+    if (i != rhs_contracting_dimension && rhs_batch_dims.count(i) == 0) {
       dimensions.push_back(rhs.dimensions(i));
     }
   }
@@ -770,11 +938,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
-  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(lhs, "lhs of binary operation"));
-  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(rhs, "rhs of binary operation"));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
+      lhs, tensorflow::strings::StrCat("lhs of binary operation ",
+                                       BinaryOperation_Name(operation))));
+  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
+      rhs, tensorflow::strings::StrCat("rhs of binary operation ",
+                                       BinaryOperation_Name(operation))));
   switch (operation) {
-    case BINOP_DOT:
-      return InferDotOpShape(lhs, rhs);
     case BINOP_MAX:
     case BINOP_MIN:
     case BINOP_SUB:
@@ -1402,7 +1572,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str());
   }
-  if (dnums.spatial_dimensions_size() !=
+  if (dnums.input_spatial_dimensions_size() !=
       dnums.kernel_spatial_dimensions_size()) {
     return InvalidArgument(
         "Both arguments to convolution must have same number of dimensions.\n"
@@ -1410,7 +1580,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         window.DebugString().c_str());
   }
 
-  const int num_spatial_dims = dnums.spatial_dimensions_size();
+  const int num_spatial_dims = dnums.input_spatial_dimensions_size();
   if (window.dimensions_size() != num_spatial_dims) {
     return InvalidArgument(
         "Window must have same number of dimensions as dimension numbers.\n"
@@ -1439,8 +1609,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   std::vector<int64> input_dnums(num_dims);
   input_dnums[0] = dnums.input_batch_dimension();
   input_dnums[1] = dnums.input_feature_dimension();
-  std::copy(dnums.spatial_dimensions().begin(),
-            dnums.spatial_dimensions().end(), input_dnums.begin() + 2);
+  std::copy(dnums.input_spatial_dimensions().begin(),
+            dnums.input_spatial_dimensions().end(), input_dnums.begin() + 2);
   std::sort(input_dnums.begin(), input_dnums.end());
 
   std::vector<int64> window_dnums(num_dims);
@@ -1450,12 +1620,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
             dnums.kernel_spatial_dimensions().end(), window_dnums.begin() + 2);
   std::sort(window_dnums.begin(), window_dnums.end());
 
+  std::vector<int64> output_dnums(num_dims);
+  output_dnums[0] = dnums.output_batch_dimension();
+  output_dnums[1] = dnums.output_feature_dimension();
+  std::copy(dnums.output_spatial_dimensions().begin(),
+            dnums.output_spatial_dimensions().end(), output_dnums.begin() + 2);
+  std::sort(output_dnums.begin(), output_dnums.end());
+
   std::vector<int64> expected_dnums(num_dims);
   std::iota(expected_dnums.begin(), expected_dnums.end(), 0);
 
   const auto in_range = [num_dims](int64 i) { return 0 <= i && i < num_dims; };
   if (!std::all_of(input_dnums.begin(), input_dnums.end(), in_range) ||
-      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range)) {
+      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range) ||
+      !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) {
     return InvalidArgument(
         "A dimension number is out of range in convolution: %s",
         dnums.DebugString().c_str());
@@ -1473,10 +1651,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         "once: %s",
         dnums.DebugString().c_str());
   }
+  if (output_dnums != expected_dnums) {
+    return InvalidArgument(
+        "Output dimensions of convolution must contain each dimension exactly "
+        "once: %s",
+        dnums.DebugString().c_str());
+  }
 
   std::vector<int64> input_spatial_dims(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    input_spatial_dims[i] = lhs.dimensions(dnums.spatial_dimensions(i));
+    input_spatial_dims[i] = lhs.dimensions(dnums.input_spatial_dimensions(i));
   }
   const int64 input_features = lhs.dimensions(dnums.input_feature_dimension());
   const int64 input_batch = lhs.dimensions(dnums.input_batch_dimension());
@@ -1524,17 +1708,27 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   dimensions[dnums.output_batch_dimension()] = input_batch;
   dimensions[dnums.output_feature_dimension()] = kernel_output_features;
   for (int i = 0; i < num_spatial_dims; ++i) {
-    dimensions[dnums.spatial_dimensions(i)] = window_output_shape.dimensions(i);
+    dimensions[dnums.output_spatial_dimensions(i)] =
+        window_output_shape.dimensions(i);
   }
 
   return ShapeUtil::MakeShape(lhs.element_type(), dimensions);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferCrossReplicaSumShape(
-    const Shape& operand) {
-  TF_RETURN_IF_ERROR(
-      ExpectNotTupleOrOpaque(operand, "operand of cross replica sum"));
-  return operand;
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
+  for (const Shape* operand_shape : operand_shapes) {
+    TF_RETURN_IF_ERROR(
+        ExpectNotTupleOrOpaque(*operand_shape, "operand of cross replica sum"));
+  }
+  if (operand_shapes.size() == 1) {
+    return *operand_shapes[0];
+  }
+  std::vector<Shape> operand_shape_values;
+  for (const Shape* operand_shape : operand_shapes) {
+    operand_shape_values.push_back(*operand_shape);
+  }
+  return ShapeUtil::MakeTupleShape(operand_shape_values);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferReduceShape(
@@ -1900,6 +2094,64 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   return init;
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferConditionalShape(
+    const Shape& predicate, const Shape& true_operand,
+    const Shape& false_operand, const ProgramShape& true_computation,
+    const ProgramShape& false_computation) {
+  if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
+    return InvalidArgument("predicate must be a boolean; got %s.",
+                           ShapeUtil::HumanString(predicate).c_str());
+  }
+
+  if (true_computation.parameters_size() != 1) {
+    return InvalidArgument("true_computation must take 1 argument; got %d.",
+                           true_computation.parameters_size());
+  }
+  if (!ShapeUtil::Compatible(true_computation.parameters(0), true_operand)) {
+    auto true_shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "true_operand: %s; true_computation: %s",
+          ShapeUtil::HumanString(true_operand).c_str(),
+          ShapeUtil::HumanString(true_computation).c_str());
+    };
+    return InvalidArgument(
+        "true_operand must match the shape of the only parameter of "
+        "true_computation: got %s.",
+        true_shape_string().c_str());
+  }
+
+  if (false_computation.parameters_size() != 1) {
+    return InvalidArgument("false_computation must take 1 argument; got %d.",
+                           false_computation.parameters_size());
+  }
+  if (!ShapeUtil::Compatible(false_computation.parameters(0), false_operand)) {
+    auto false_shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "false_operand: %s; false_computation: %s",
+          ShapeUtil::HumanString(false_operand).c_str(),
+          ShapeUtil::HumanString(false_computation).c_str());
+    };
+    return InvalidArgument(
+        "false_operand must match the shape of the only parameter of "
+        "false_computation: got %s.",
+        false_shape_string().c_str());
+  }
+  if (!ShapeUtil::Compatible(true_computation.result(),
+                             false_computation.result())) {
+    auto shape_string = [&]() {
+      return tensorflow::strings::Printf(
+          "true_computation result: %s; false_computation result: %s.",
+          ShapeUtil::HumanString(true_computation.result()).c_str(),
+          ShapeUtil::HumanString(false_computation.result()).c_str());
+    };
+    return InvalidArgument(
+        "the result of true_computation and false_computation must have the "
+        "same shape: got %s.",
+        shape_string().c_str());
+  }
+  return true_computation.result();
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferBroadcastShape(
     const Shape& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes) {
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(operand, "operand of broadcast"));
@@ -1943,7 +2195,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
       !std::is_permutation(dimensions.begin(), dimensions.end(),
                            indices.begin())) {
     return InvalidArgument(
-        "Reshape dimensions not a permutation of the operand dimensions.");
+        "Reshape dimensions [%s] are not a permutation of the operand "
+        "dimensions (operand shape is %s).",
+        tensorflow::str_util::Join(dimensions, ",").c_str(),
+        ShapeUtil::HumanString(operand).c_str());
   }
 
   return inferred_shape;
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index d5d497176d6c340d8c8f34cdacf6a9e32040c387..c06340d2d5df239642eb0af4836df64a898a1eaf 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -109,8 +109,10 @@ class ShapeInference {
       const Shape& lhs, const Shape& rhs, const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
-  // Infers the shape produced a cross replica sum with the given operand shape.
-  static StatusOr<Shape> InferCrossReplicaSumShape(const Shape& operand);
+  // Infers the shape produced a cross replica sum with the given operand
+  // shapes.
+  static StatusOr<Shape> InferCrossReplicaSumShape(
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
 
   // Infers the shape produced by applying the given reduction computation
   // shape to the given input operand shape.
@@ -178,6 +180,12 @@ class ShapeInference {
                                          const ProgramShape& body,
                                          const Shape& init);
 
+  // Infers the shape produced by a conditional operation.
+  static StatusOr<Shape> InferConditionalShape(
+      const Shape& predicate, const Shape& true_operand,
+      const Shape& false_operand, const ProgramShape& true_computation,
+      const ProgramShape& false_computation);
+
   // Infers the shape produced by a broadcast operation.
   static StatusOr<Shape> InferBroadcastShape(
       const Shape& operand, tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
@@ -204,6 +212,13 @@ class ShapeInference {
   static StatusOr<Shape> InferConvertShape(const Shape& operand_shape,
                                            PrimitiveType new_element_type);
 
+  // Helper that validates the given operand shape can be bitcast converted to
+  // the target output_shape via a bitcast convert instruction -- the
+  // requirement is that the shape is identical except for the element type and
+  // the element types have identical bit-widths.
+  static StatusOr<Shape> InferBitcastConvertShape(
+      const Shape& operand_shape, PrimitiveType new_element_type);
+
   // Helper that validates the input data type for a reduce-precision operation,
   // and returns the result shape.
   static StatusOr<Shape> InferReducePrecisionShape(const Shape& operand_shape,
@@ -222,11 +237,13 @@ class ShapeInference {
       tensorflow::gtl::ArraySlice<const Shape*> arg_shapes,
       const ProgramShape& to_apply);
 
- private:
   // Helper that infers the shape produced by performing a dot operation with
   // the given LHS and RHS shapes.
-  static StatusOr<Shape> InferDotOpShape(const Shape& lhs, const Shape& rhs);
+  static StatusOr<Shape> InferDotOpShape(
+      const Shape& lhs, const Shape& rhs,
+      const DotDimensionNumbers& dimension_numbers);
 
+ private:
   // Helper that infers the shape produced by performing an element-wise binary
   // operation with the given LHS and RHS shapes.
   // Note: By "element-wise" we mean operations that look at a single element in
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index d12f7bd1453890db3280e54719a6ce811006336d..99d87f3b550ae72befe254f23fad080dd210aaf4 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -395,8 +395,10 @@ TEST_F(ShapeInferenceTest, Convolve) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
@@ -437,8 +439,10 @@ TEST_F(ShapeInferenceTest, ConvolveWithWindowDilation) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 3});
@@ -480,8 +484,10 @@ TEST_F(ShapeInferenceTest, ConvolveWithBaseDilation) {
   dnums.set_output_batch_dimension(0);
   dnums.set_input_feature_dimension(1);
   dnums.set_output_feature_dimension(1);
-  dnums.add_spatial_dimensions(2);
-  dnums.add_spatial_dimensions(3);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(3);
+  dnums.add_output_spatial_dimensions(3);
 
   // Dimension order: x1, batch, feature, x0
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 12, 11, 4});
@@ -524,8 +530,10 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   dnums.set_output_batch_dimension(3);
   dnums.set_input_feature_dimension(2);
   dnums.set_output_feature_dimension(2);
-  dnums.add_spatial_dimensions(0);
-  dnums.add_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(0);
+  dnums.add_output_spatial_dimensions(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
   dnums.set_kernel_input_feature_dimension(0);  // duplicated with kernel_x0
   dnums.set_kernel_output_feature_dimension(3);
   dnums.add_kernel_spatial_dimensions(0);
@@ -890,8 +898,11 @@ TEST_F(ShapeInferenceTest, BroadcastScalar) {
 
 // scalar <dot> vector: error
 TEST_F(ShapeInferenceTest, ScalarDotVector) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, f32_, vector_32_, {});
+      ShapeInference::InferDotOpShape(f32_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
               HasSubstr("dot only supports rank"));
@@ -899,61 +910,199 @@ TEST_F(ShapeInferenceTest, ScalarDotVector) {
 
 // 3D <dot> 2D: error
 TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BINOP_DOT, ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status = ShapeInference::InferDotOpShape(
+      ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("dot only supports rank"));
+              HasSubstr("batch and contracting dimension number mismatch"));
 }
 
 // vector <dot> vector -> scalar
 TEST_F(ShapeInferenceTest, VectorDotVector) {
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, vector_64_, vector_64_, {});
+      ShapeInference::InferDotOpShape(vector_64_, vector_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie()));
   auto inferred_status_mismatch =
-      ShapeInference::InferBinaryOpShape(BINOP_DOT, vector_64_, vector_32_, {});
+      ShapeInference::InferDotOpShape(vector_64_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // matrix <dot> vector -> vector
 TEST_F(ShapeInferenceTest, MatrixDotVector) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, vector_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(matrix_32_64_, vector_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), vector_32_));
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, vector_32_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(matrix_32_64_, vector_32_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // vector <dot> matrix -> vector
 TEST_F(ShapeInferenceTest, VectorDotMatrix) {
-  auto inferred_status = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, vector_32_, matrix_32_64_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(vector_32_, matrix_32_64_, dot_dnums);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), vector_64_));
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, vector_64_, matrix_32_64_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(vector_64_, matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
 // matrix <dot> matrix -> matrix
 TEST_F(ShapeInferenceTest, MatrixDotMatrix) {
-  auto inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, matrix_64_48_, {});
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto inferred_status_match =
+      ShapeInference::InferDotOpShape(matrix_32_64_, matrix_64_48_, dot_dnums);
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(
       ShapeUtil::Equal(inferred_status_match.ValueOrDie(), matrix_32_48_))
       << "inferred: "
       << ShapeUtil::HumanString(inferred_status_match.ValueOrDie())
       << " expected: " << ShapeUtil::HumanString(matrix_64_48_);
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_DOT, matrix_32_64_, matrix_32_64_, {});
+  auto inferred_status_mismatch =
+      ShapeInference::InferDotOpShape(matrix_32_64_, matrix_32_64_, dot_dnums);
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
+// BatchMatMul with two batch dimensions and one contracting dimension.
+TEST_F(ShapeInferenceTest, DotGeneral) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {5, 2, 3, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {5, 2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(1);
+
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status_match =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_IS_OK(inferred_status_match.status());
+  ASSERT_TRUE(
+      ShapeUtil::Equal(inferred_status_match.ValueOrDie(), output_shape))
+      << "inferred: "
+      << ShapeUtil::HumanString(inferred_status_match.ValueOrDie())
+      << " expected: " << ShapeUtil::HumanString(output_shape);
+}
+
+// BatchMatMul with two contracting dimensions fails.
+TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("must specify one contracting dimension for both "
+                        "lhs and rhs"));
+}
+
+// BatchMatMul with different batch dimension sizes fails.
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimSizesFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("batch dimension numbers and sizes must match"));
+}
+
+// BatchMatMul with different batch dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("batch dimension numbers must precede non-batch"));
+}
+
+// BatchMatMul with out-of-range dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithContractingDimNumberOutOfRange) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("A dimension number is out of range"));
+}
+
+// BatchMatMul with non-unique dimension numbers fails.
+TEST_F(ShapeInferenceTest, DotWithContractingNonUniqueDimNumber) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  ASSERT_FALSE(inferred_status.ok());
+  ASSERT_THAT(inferred_status.status().error_message(),
+              HasSubstr("A dimension number is not unique"));
+}
+
 TEST_F(ShapeInferenceTest, BinOpBroadcastMatrixVector) {
   // Test variations of broadcasting a vector for a binary add with a
   // matrix.
@@ -1288,5 +1437,80 @@ TEST_F(ShapeInferenceTest, Transpose) {
                                     ShapeUtil::MakeShape(F32, {3, 4, 5, 2})));
 }
 
+TEST_F(ShapeInferenceTest, Conditional) {
+  auto inferred_status0 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_IS_OK(inferred_status0.status());
+  EXPECT_TRUE(ShapeUtil::Equal(f32_, inferred_status0.ValueOrDie()));
+
+  auto inferred_status1 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, vector_32_,
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_64_),
+      ShapeUtil::MakeProgramShape({vector_32_}, vector_64_));
+  EXPECT_IS_OK(inferred_status1.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_64_, inferred_status1.ValueOrDie()));
+
+  auto tuple_f32_v32 = ShapeUtil::MakeTupleShape({f32_, vector_32_});
+  auto inferred_status2 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, tuple_f32_v32,
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+      ShapeUtil::MakeProgramShape({tuple_f32_v32}, vector_32_));
+  EXPECT_IS_OK(inferred_status2.status());
+  EXPECT_TRUE(ShapeUtil::Equal(vector_32_, inferred_status2.ValueOrDie()));
+
+  auto inferred_status_error0 = ShapeInference::InferConditionalShape(
+      s32_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_FALSE(inferred_status_error0.ok());
+  EXPECT_THAT(inferred_status_error0.status().error_message(),
+              HasSubstr("predicate must be a boolean"));
+
+  auto inferred_status_error1 = ShapeInference::InferConditionalShape(
+      pred_, ShapeUtil::MakeTupleShape({f32_, vector_32_}), matrix_32_48_,
+      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_),
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error1.ok());
+  EXPECT_THAT(inferred_status_error1.status().error_message(),
+              HasSubstr("true_computation must take 1 argument"));
+
+  auto inferred_status_error2 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, f32_));
+  EXPECT_FALSE(inferred_status_error2.ok());
+  EXPECT_THAT(inferred_status_error2.status().error_message(),
+              HasSubstr("true_operand must match the shape of the only "
+                        "parameter of true_computation"));
+
+  auto inferred_status_error3 = ShapeInference::InferConditionalShape(
+      pred_, matrix_32_48_, ShapeUtil::MakeTupleShape({f32_, vector_32_}),
+      ShapeUtil::MakeProgramShape({matrix_32_48_}, vector_32_),
+      ShapeUtil::MakeProgramShape({f32_, vector_32_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error3.ok());
+  EXPECT_THAT(inferred_status_error3.status().error_message(),
+              HasSubstr("false_computation must take 1 argument"));
+
+  auto inferred_status_error4 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_));
+  EXPECT_FALSE(inferred_status_error4.ok());
+  EXPECT_THAT(inferred_status_error4.status().error_message(),
+              HasSubstr("false_operand must match the shape of the only "
+                        "parameter of false_computation"));
+
+  auto inferred_status_error5 = ShapeInference::InferConditionalShape(
+      pred_, vector_32_, vector_64_,
+      ShapeUtil::MakeProgramShape({vector_32_}, f32_),
+      ShapeUtil::MakeProgramShape({vector_64_}, vector_32_));
+  EXPECT_FALSE(inferred_status_error5.ok());
+  EXPECT_THAT(inferred_status_error5.status().error_message(),
+              HasSubstr("the result of true_computation and false_computation "
+                        "must have the same shape"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index a2a442eb1a33d976a114f68d112a7d8f3b540f4b..aa0a24a2833ec0b152f32f26f32e57ec6f7b5d14 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -21,17 +21,19 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace se = ::perftools::gputools;
 
 namespace xla {
 
+using ::tensorflow::strings::Appendf;
+
 /* static */ StatusOr<std::unique_ptr<ShapedBuffer>>
 ShapedBuffer::MakeArrayShapedBuffer(const Shape& shape,
                                     const se::Platform* platform,
@@ -49,6 +51,34 @@ ShapedBuffer::MakeArrayShapedBuffer(const Shape& shape,
   return std::move(shaped_buffer);
 }
 
+/* static */ StatusOr<std::unique_ptr<ShapedBuffer>> ShapedBuffer::Allocate(
+    const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
+    const std::function<int64(const Shape&)>& shape_size_fn) {
+  if (!LayoutUtil::HasLayout(shape)) {
+    return InvalidArgument("Shape must have a layout: %s",
+                           ShapeUtil::HumanStringWithLayout(shape).c_str());
+  }
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
+  auto shaped_buffer = WrapUnique(
+      new ShapedBuffer(shape, allocator->platform(), device_ordinal));
+
+  // Allocate an appropriate sized buffer for each element in the shape
+  // including the tuple pointer arrays.
+  for (auto& pair : shaped_buffer->shape_index_to_buffer_entry_) {
+    const ShapeIndex& index = pair.first;
+    size_t& buffer_entry = pair.second;
+    TF_ASSIGN_OR_RETURN(
+        se::DeviceMemoryBase memory_base,
+        allocator->Allocate(shaped_buffer->device_ordinal(),
+                            shape_size_fn(ShapeUtil::GetSubshape(
+                                shaped_buffer->shape(), index))));
+    shaped_buffer->buffers_.push_back(memory_base);
+    buffer_entry = shaped_buffer->buffers_.size() - 1;
+  }
+
+  return std::move(shaped_buffer);
+}
+
 ShapedBuffer::ShapedBuffer(const Shape& shape, const se::Platform* platform,
                            int device_ordinal)
     : shape_(shape),
@@ -63,6 +93,14 @@ void ShapedBuffer::clear() {
   }
 }
 
+void ShapedBuffer::AddBufferAtIndex(
+    const perftools::gputools::DeviceMemoryBase& buffer,
+    const ShapeIndex& shape_index) {
+  *mutable_shape_index_to_buffer_entry()->mutable_element(shape_index) =
+      buffers().size();
+  mutable_buffers()->push_back(buffer);
+}
+
 const se::DeviceMemoryBase& ShapedBuffer::buffer(
     const ShapeIndex& index) const {
   return buffers_[shape_index_to_buffer_entry_.element(index)];
@@ -72,67 +110,37 @@ se::DeviceMemoryBase* ShapedBuffer::mutable_buffer(const ShapeIndex& index) {
   return &buffers_[shape_index_to_buffer_entry_.element(index)];
 }
 
-/* static */ StatusOr<std::unique_ptr<ScopedShapedBuffer>>
-ScopedShapedBuffer::Allocate(const Shape& shape,
-                             DeviceMemoryAllocator* allocator,
-                             int device_ordinal) {
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Shape must have a layout: %s",
-                           ShapeUtil::HumanStringWithLayout(shape).c_str());
-  }
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
-  auto shaped_buffer =
-      WrapUnique(new ScopedShapedBuffer(shape, allocator, device_ordinal));
-
-  // Allocate an appropriate sized buffer for each element in the shape
-  // including the tuple pointer arrays. Gather tuple element addresses in
-  // 'element_addresses'. These will be written in the respective tuple's array
-  // of pointers on the device.
-  TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
-                      TransferManager::GetForPlatform(allocator->platform()));
-  ShapeTree<std::vector<se::DeviceMemoryBase>> element_addresses(shape);
-  for (auto& pair : shaped_buffer->shape_index_to_buffer_entry_) {
-    const ShapeIndex& index = pair.first;
-    size_t& buffer_entry = pair.second;
-    TF_ASSIGN_OR_RETURN(
-        se::DeviceMemoryBase memory_base,
-        shaped_buffer->allocator_->Allocate(
-            shaped_buffer->device_ordinal(),
-            transfer_manager->GetByteSizeRequirement(
-                ShapeUtil::GetSubshape(shaped_buffer->shape(), index))));
-    shaped_buffer->buffers_.push_back(memory_base);
-    buffer_entry = shaped_buffer->buffers_.size() - 1;
-
-    // If this is a tuple element, then push the address on to the
-    // vector of tuple element addresses.
-    if (!index.empty()) {
-      ShapeIndex parent_index = index;
-      parent_index.pop_back();
-      element_addresses.mutable_element(parent_index)->push_back(memory_base);
-    }
-  }
-
-  // Fill in the tuple pointer arrays with the addresses of their respective
-  // elements.
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
-                      allocator->platform()->ExecutorForDevice(
-                          shaped_buffer->device_ordinal()));
-  for (const auto& pair : element_addresses) {
-    const ShapeIndex& index = pair.first;
-    const std::vector<se::DeviceMemoryBase>& addresses = pair.second;
-    const Shape& subshape = ShapeUtil::GetSubshape(shape, index);
+string ShapedBuffer::ToString() const {
+  string s = "ShapedBuffer(" + platform_->Name() + "):\n";
+  ShapeUtil::ForEachSubshape(
+      shape(), [this, &s](const Shape& subshape, const ShapeIndex& index) {
+        string shape_str;
+        if (ShapeUtil::IsTuple(subshape)) {
+          shape_str = "tuple";
+        } else {
+          shape_str = ShapeUtil::HumanStringWithLayout(subshape);
+        }
+        const se::DeviceMemoryBase& memory = buffer(index);
+        Appendf(&s, "  %s%p (%lld bytes) : %s\n",
+                string(index.size() * 2, ' ').c_str(), memory.opaque(),
+                memory.size(), shape_str.c_str());
+      });
+  return s;
+}
 
-    if (addresses.empty()) {
-      TF_RET_CHECK(!ShapeUtil::IsTuple(subshape) ||
-                   ShapeUtil::TupleElementCount(subshape) == 0);
-      continue;
-    }
-    TF_RET_CHECK(ShapeUtil::IsTuple(subshape));
-    TF_RETURN_IF_ERROR(transfer_manager->WriteTuplePointersToDevice(
-        executor, addresses, subshape, shaped_buffer->mutable_buffer(index)));
-  }
+std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {
+  out << buffer.ToString();
+  return out;
+}
 
-  return std::move(shaped_buffer);
+/* static */ StatusOr<std::unique_ptr<ScopedShapedBuffer>>
+ScopedShapedBuffer::Allocate(
+    const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
+    const std::function<int64(const Shape&)>& shape_size_fn) {
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<ShapedBuffer> unscoped_buffer,
+      ShapedBuffer::Allocate(shape, allocator, device_ordinal, shape_size_fn));
+  return MakeScoped(unscoped_buffer.get(), allocator);
 }
 
 /* static */
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h
index e5ea06fb136fa714eab0f340f98b7191a4c5caa3..ca8bfff674d2fad0fc5731cb2dc30b60bcf11997 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.h
+++ b/tensorflow/compiler/xla/service/shaped_buffer.h
@@ -17,6 +17,8 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_SHAPED_BUFFER_H_
 
 #include <memory>
+#include <ostream>
+#include <string>
 
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -41,6 +43,14 @@ class ShapedBuffer {
       const Shape& shape, const perftools::gputools::Platform* platform,
       int device_ordinal, const perftools::gputools::DeviceMemoryBase& buffer);
 
+  // Return a newly allocated ShapedBuffer of an arbitrary shape. Array buffers
+  // (leaves in the shape) are allocated and uninitialized. Tuple buffers (if
+  // any) are allocated and initialized to the backend-specific representation
+  // of an array of pointers to the tuple elements.
+  static StatusOr<std::unique_ptr<ShapedBuffer>> Allocate(
+      const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
+      const std::function<int64(const Shape&)>& shape_size_fn);
+
   ShapedBuffer(const Shape& shape,
                const perftools::gputools::Platform* platform,
                int device_ordinal);
@@ -75,6 +85,12 @@ class ShapedBuffer {
   // Set all device memory pointers in the object to null.
   void clear();
 
+  // Adds a new buffer at the given shape index.
+  void AddBufferAtIndex(const perftools::gputools::DeviceMemoryBase& buffer,
+                        const ShapeIndex& shape_index);
+
+  string ToString() const;
+
  protected:
   // The shape of the device buffer with layout.
   const Shape shape_;
@@ -95,17 +111,17 @@ class ShapedBuffer {
   ShapeTree<size_t> shape_index_to_buffer_entry_;
 };
 
+std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer);
+
 // ShapedBuffer derived class which allocates all internal buffers on
 // construction and deallocates the memory when the object is
 // destructed.
 class ScopedShapedBuffer : public ShapedBuffer {
  public:
-  // Return a newly allocated ScopedShapedBuffer of an arbitrary shape. Array
-  // buffers (leaves in the shape) are allocated and uninitialized. Tuple
-  // buffers (if any) are allocated and initialized to the backend-specific
-  // representation of an array of pointers to the tuple elements.
+  // Identical to ShapedBuffer::Allocate.
   static StatusOr<std::unique_ptr<ScopedShapedBuffer>> Allocate(
-      const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal);
+      const Shape& shape, DeviceMemoryAllocator* allocator, int device_ordinal,
+      const std::function<int64(const Shape&)>& shape_size_fn);
 
   // Takes a ShapedBuffer and returns a ScopedShapedBuffer which manages the
   // deallocation of the device memory held in the shaped buffer. All device
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index 4da0a0d36841a6dfaed5c7eebdfb9e6980ad1090..d5f53ad56fb019d0ae7c27fc28706f05614ece68 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -28,12 +28,9 @@ limitations under the License.
 namespace se = ::perftools::gputools;
 
 namespace xla {
-
-/* static */ tensorflow::mutex*
-TransferManager::platform_transfer_manager_mutex() {
-  static tensorflow::mutex* m = new tensorflow::mutex;
-  return m;
-}
+/* static */ tensorflow::mutex
+    TransferManager::platform_transfer_manager_mutex_(
+        tensorflow::LINKER_INITIALIZED);
 
 /* static */ std::map<perftools::gputools::Platform::Id,
                       TransferManager::State>*
@@ -47,7 +44,7 @@ TransferManager::GetPlatformTransferManagers() {
     se::Platform::Id platform_id,
     TransferManagerCreationFunction creation_function) {
   tensorflow::mutex_lock lock(
-      *TransferManager::platform_transfer_manager_mutex());
+      TransferManager::platform_transfer_manager_mutex_);
   auto* managers = GetPlatformTransferManagers();
   CHECK(managers->find(platform_id) == managers->end());
   (*managers)[platform_id].creation_function = creation_function;
@@ -56,7 +53,7 @@ TransferManager::GetPlatformTransferManagers() {
 /* static */ StatusOr<TransferManager*> TransferManager::GetForPlatform(
     const se::Platform* platform) {
   tensorflow::mutex_lock lock(
-      *TransferManager::platform_transfer_manager_mutex());
+      TransferManager::platform_transfer_manager_mutex_);
   auto* managers = GetPlatformTransferManagers();
 
   auto it = managers->find(platform->id());
@@ -75,6 +72,39 @@ TransferManager::GetPlatformTransferManagers() {
   return it->second.manager.get();
 }
 
+Status TransferManager::WriteTupleIndexTables(
+    perftools::gputools::StreamExecutor* executor,
+    const ShapedBuffer& device_buffer) {
+  VLOG(2) << "Writing tuple index tables to ShapedBuffer rooted at "
+          << device_buffer.buffer(/*index=*/{}).opaque()
+          << "; shape: " << ShapeUtil::HumanString(device_buffer.shape());
+
+  TF_RET_CHECK(executor->device_ordinal() == device_buffer.device_ordinal());
+
+  return ShapeUtil::ForEachSubshapeWithStatus(
+      device_buffer.shape(),
+      [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
+        if (ShapeUtil::IsTuple(device_subshape)) {
+          se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
+          TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
+                       device_memory.size());
+
+          std::vector<se::DeviceMemoryBase> elements;
+          ShapeIndex element_index = index;
+          for (int64 i = 0; i < ShapeUtil::TupleElementCount(device_subshape);
+               ++i) {
+            element_index.push_back(i);
+            elements.push_back(device_buffer.buffer(element_index));
+            element_index.pop_back();
+          }
+          return WriteTuplePointersToDevice(executor, elements, device_subshape,
+                                            &device_memory);
+        }
+
+        return Status::OK();
+      });
+}
+
 Status TransferManager::TransferBufferFromDevice(
     se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
     int64 size, void* destination) {
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index f63d91604cf40edfae98b56a8bacdbded697ffc3..be9b769ac8cf3cf1fcfd13dfe9f1458e55a5323d 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -47,6 +48,8 @@ class TransferManager {
   // executor. device_shape is the shape, including layout, of the data on the
   // device, while literal_shape will be the shape for the literal. device_shape
   // and literal_shape must be compatible, but need not have the same layout.
+  // TODO(b/66694934): Remove TransferLiteral* methods which accept bare
+  // DeviceMemoryBase.
   virtual Status TransferLiteralFromDevice(
       perftools::gputools::StreamExecutor* executor,
       const perftools::gputools::DeviceMemoryBase& region,
@@ -59,6 +62,28 @@ class TransferManager {
       perftools::gputools::StreamExecutor* executor, const Literal& literal,
       perftools::gputools::DeviceMemoryBase* region) = 0;
 
+  // Returns the shape of the on-device representation for the given shape on
+  // the host. This is intended for use with ShapedBuffer where buffers are
+  // pre-allocated by the host, e.g. TransferLiteralToDevice, without the user
+  // needing to consider device-specific behaviors.
+  virtual Shape HostShapeToDeviceShape(const Shape& host_shape) const {
+    return host_shape;
+  }
+
+  // Transfers the data held in the given ShapedBuffer into the provided literal
+  // using the provided executor. literal_shape will be the shape for the
+  // literal. The shape of the ShapedBuffer and DeviceShape(literal_shape) must
+  // be compatible, but need not have the same layout.
+  virtual StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
+      perftools::gputools::StreamExecutor* executor,
+      const ShapedBuffer& device_buffer) = 0;
+
+  // Transfers the given literal into the previously allocated device memory
+  // represented by the given ShapedBuffer using the given executor.
+  virtual Status TransferLiteralToDevice(
+      perftools::gputools::StreamExecutor* executor, const Literal& literal,
+      const ShapedBuffer& device_buffer) = 0;
+
   // Transfers the given literal into the Infeed interface of the device,
   // using the given executor.
   virtual Status TransferLiteralToInfeed(
@@ -97,15 +122,11 @@ class TransferManager {
       const perftools::gputools::DeviceMemoryBase& source,
       const Shape& shape) = 0;
 
-  // Writes the given device-memory pointers in 'elements' to the given region
-  // to construct a tuple in the platform-specific tuple representation. This
-  // can handle nested tuples as well. In the nested case, the element
-  // DeviceMemoryBase points to another array of pointers on the device.
-  virtual Status WriteTuplePointersToDevice(
-      perftools::gputools::StreamExecutor* executor,
-      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
-          elements,
-      const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0;
+  // Given an allocated ShapedBuffer, constructs the tuple index table(s) in
+  // each buffer of the given ShapedBuffer corresponding to tuple shapes. If the
+  // ShapedBuffer is array-shaped this method does nothing.
+  Status WriteTupleIndexTables(perftools::gputools::StreamExecutor* executor,
+                               const ShapedBuffer& device_buffer);
 
   // Returns all buffer pointers that the tuple `source` refers to. Unlike
   // ShallowCopyTupleFromDevice, this function gather buffer pointers in nested
@@ -119,24 +140,7 @@ class TransferManager {
   // Determines the byte size requirement for the given shape on the underlying
   // architecture. This will be used to allocate an appropriately sized memory
   // region for a host-to-device transfer.
-  virtual int64 GetByteSizeRequirement(const Shape& shape) = 0;
-
-  // Transfer a memory block of the given size from the device source into the
-  // 'destination' buffer.
-  //
-  // size is the size to transfer to destination in bytes.
-  virtual Status TransferBufferFromDevice(
-      perftools::gputools::StreamExecutor* executor,
-      const perftools::gputools::DeviceMemoryBase& source, int64 size,
-      void* destination);
-
-  // Transfer a memory block of the given size from 'source' buffer to the given
-  // destination of the device.
-  //
-  // size is the size to transfer from source in bytes.
-  virtual Status TransferBufferToDevice(
-      perftools::gputools::StreamExecutor* executor, int64 size,
-      const void* source, perftools::gputools::DeviceMemoryBase* destination);
+  virtual int64 GetByteSizeRequirement(const Shape& shape) const = 0;
 
   typedef std::unique_ptr<TransferManager> (*TransferManagerCreationFunction)();
 
@@ -157,12 +161,37 @@ class TransferManager {
   static StatusOr<TransferManager*> GetForPlatform(
       const perftools::gputools::Platform* platform);
 
+ protected:
+  // Transfer a memory block of the given size from the device source into the
+  // 'destination' buffer.
+  //
+  // size is the size to transfer to destination in bytes.
+  virtual Status TransferBufferFromDevice(
+      perftools::gputools::StreamExecutor* executor,
+      const perftools::gputools::DeviceMemoryBase& source, int64 size,
+      void* destination);
+
+  // Transfer a memory block of the given size from 'source' buffer to the given
+  // destination of the device.
+  //
+  // size is the size to transfer from source in bytes.
+  virtual Status TransferBufferToDevice(
+      perftools::gputools::StreamExecutor* executor, int64 size,
+      const void* source, perftools::gputools::DeviceMemoryBase* destination);
+
+  // Writes the given device-memory pointers in 'elements' to the given region
+  // to construct a tuple in the platform-specific tuple representation. This
+  // can handle nested tuples as well. In the nested case, the element
+  // DeviceMemoryBase points to another array of pointers on the device.
+  virtual Status WriteTuplePointersToDevice(
+      perftools::gputools::StreamExecutor* executor,
+      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
+          elements,
+      const Shape& shape, perftools::gputools::DeviceMemoryBase* region) = 0;
+
  private:
-  // Routine that returns the mutex that guards the
-  // platform-to-transfer manager map.  Done as a routine to
-  // ensure correct initialization ordering, since RegisterTransferManager
-  // can be called during program initialization time.
-  static tensorflow::mutex* platform_transfer_manager_mutex();
+  // The mutex that guards the platform-to-transfer manager map.
+  static tensorflow::mutex platform_transfer_manager_mutex_;
 
   // State kept for each kind of TransferManager.  Registration functions
   // set up creation_function, and then we use that to lazily create
diff --git a/tensorflow/compiler/xla/service/transfer_manager_test.cc b/tensorflow/compiler/xla/service/transfer_manager_test.cc
deleted file mode 100644
index c25a0861e9b90bc0f2cde43933e14204aa4e3598..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/transfer_manager_test.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
-
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace se = ::perftools::gputools;
-
-namespace xla {
-
-namespace {
-
-class CpuTransferManagerTest : public ::testing::Test {
- protected:
-  CpuTransferManagerTest()
-      : transfer_manager_(se::host::kHostPlatformId,
-                          /*pointer_size=*/sizeof(void*)) {
-    se::Platform* platform =
-        se::MultiPlatformManager::PlatformWithId(se::host::kHostPlatformId)
-            .ValueOrDie();
-    stream_exec_ =
-        platform->GetExecutor(se::StreamExecutorConfig(/*ordinal=*/0))
-            .ValueOrDie();
-  }
-
-  ~CpuTransferManagerTest() override {}
-
-  se::StreamExecutor* stream_exec_;
-  GenericTransferManager transfer_manager_;
-};
-
-TEST_F(CpuTransferManagerTest, TransferR0U32ToDevice) {
-  std::vector<uint8> storage(sizeof(uint32), '\x00');
-  se::DeviceMemoryBase memptr(storage.data(), storage.size());
-  std::unique_ptr<Literal> literal = Literal::CreateR0<uint32>(42);
-  TF_CHECK_OK(transfer_manager_.TransferLiteralToDevice(stream_exec_, *literal,
-                                                        &memptr));
-
-  CHECK_EQ(42, *reinterpret_cast<uint32*>(&storage[0]));
-}
-
-TEST_F(CpuTransferManagerTest, TransferR1F32ToDevice) {
-  std::vector<uint8> storage(4 * sizeof(float), '\x00');
-  se::DeviceMemoryBase memptr(storage.data(), storage.size());
-  std::unique_ptr<Literal> literal =
-      Literal::CreateR1<float>({1.25f, 2.5f, -17.0f, -20.125f});
-  TF_CHECK_OK(transfer_manager_.TransferLiteralToDevice(stream_exec_, *literal,
-                                                        &memptr));
-
-  CHECK_EQ(1.25f, *reinterpret_cast<float*>(&storage[0]));
-  CHECK_EQ(2.5f, *reinterpret_cast<float*>(&storage[sizeof(float)]));
-  CHECK_EQ(-17.0f, *reinterpret_cast<float*>(&storage[2 * sizeof(float)]));
-  CHECK_EQ(-20.125f, *reinterpret_cast<float*>(&storage[3 * sizeof(float)]));
-}
-
-TEST_F(CpuTransferManagerTest, TransferR1U8ToDevice) {
-  std::vector<uint8> storage(16, '\x00');
-  se::DeviceMemoryBase memptr(storage.data(), storage.size());
-  const char* str = "0123456789abcdef";
-  std::unique_ptr<Literal> literal = Literal::CreateR1U8(str);
-  TF_CHECK_OK(transfer_manager_.TransferLiteralToDevice(stream_exec_, *literal,
-                                                        &memptr));
-
-  CHECK_EQ('0', storage[0]);
-  CHECK_EQ('8', storage[8]);
-  CHECK_EQ('f', storage[15]);
-}
-
-TEST_F(CpuTransferManagerTest, TransferR0U32FromDevice) {
-  std::vector<uint32> storage(1, 42);
-  se::DeviceMemoryBase memptr(storage.data(),
-                              storage.size() * sizeof(storage[0]));
-  Literal literal;
-  const Shape shape = ShapeUtil::MakeShape(U32, {});
-  TF_CHECK_OK(transfer_manager_.TransferLiteralFromDevice(
-      stream_exec_, memptr, shape, shape, &literal));
-
-  LiteralTestUtil::ExpectR0Equal<uint32>(42, literal);
-}
-
-TEST_F(CpuTransferManagerTest, TransferR1F32FromDevice) {
-  std::vector<float> storage{1.25f, 2.5f, -17.0f, -20.125f};
-  se::DeviceMemoryBase memptr(storage.data(),
-                              storage.size() * sizeof(storage[0]));
-  Literal literal;
-  const Shape shape = ShapeUtil::MakeShape(F32, {4});
-  TF_CHECK_OK(transfer_manager_.TransferLiteralFromDevice(
-      stream_exec_, memptr, shape, shape, &literal));
-
-  LiteralTestUtil::ExpectR1Equal<float>({1.25, 2.5, -17.0, -20.125}, literal);
-}
-
-TEST_F(CpuTransferManagerTest, TransferR1U8FromDevice) {
-  std::vector<uint8> storage{'k', 'l', 'm', 'n'};
-  se::DeviceMemoryBase memptr(storage.data(),
-                              storage.size() * sizeof(storage[0]));
-  Literal literal;
-  const Shape shape = ShapeUtil::MakeShape(U8, {4});
-  TF_CHECK_OK(transfer_manager_.TransferLiteralFromDevice(
-      stream_exec_, memptr, shape, shape, &literal));
-  CHECK_EQ("klmn", literal.u8s_string());
-}
-
-TEST_F(CpuTransferManagerTest, TransferBufferFromDevice) {
-  std::vector<uint64> storage{1, 5, 42};
-  int64 size = storage.size() * sizeof(storage[0]);
-  se::DeviceMemoryBase memptr(storage.data(), size);
-
-  std::vector<uint64> dest(3, 0);
-  TF_CHECK_OK(transfer_manager_.TransferBufferFromDevice(stream_exec_, memptr,
-                                                         size, dest.data()));
-  ASSERT_EQ(1, dest[0]);
-  ASSERT_EQ(5, dest[1]);
-  ASSERT_EQ(42, dest[2]);
-}
-
-TEST_F(CpuTransferManagerTest, TransferBufferToDevice) {
-  int64 size = 3 * sizeof(uint64);
-  std::vector<uint8> storage(size, 0);
-  se::DeviceMemoryBase memptr(storage.data(), size);
-
-  std::vector<uint64> dest{1, 5, 42};
-  TF_CHECK_OK(transfer_manager_.TransferBufferToDevice(stream_exec_, size,
-                                                       dest.data(), &memptr));
-  std::vector<uint64>* storage64 =
-      reinterpret_cast<std::vector<uint64>*>(&storage);
-  ASSERT_EQ(1, (*storage64)[0]);
-  ASSERT_EQ(5, (*storage64)[1]);
-  ASSERT_EQ(42, (*storage64)[2]);
-}
-
-// TODO(b/24679870): add similar tests for GPUs
-
-}  // namespace
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 8c2640adf52f10c387e7a9c09c0d73a09c054919..42b616f4c3446957eec13874eac74e80195f85a4 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -58,27 +58,11 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoConvolution(
     return {};
   }
 
-  const ConvolutionDimensionNumbers& dnums =
-      convolution.convolution_dimension_numbers();
-
   TransposeFolding::OperandIndices operand_set;
   for (int64 i = 0; i < convolution.operand_count(); ++i) {
     auto& operand = *convolution.operand(i);
     if (operand.opcode() == HloOpcode::kTranspose &&
         operand.user_count() == 1) {
-      const auto& transpose_dimensions = operand.dimensions();
-      // We can transpose the LHS so long as it doesn't move around spatial
-      // dimensions because ConvolutionDimensionNumbers doesn't have different
-      // fields for input and output spatial dimensions.
-      if (i == 0 &&
-          std::any_of(dnums.spatial_dimensions().begin(),
-                      dnums.spatial_dimensions().end(),
-                      [&](const int64 spatial_dimension) {
-                        return transpose_dimensions[spatial_dimension] !=
-                               spatial_dimension;
-                      })) {
-        continue;
-      }
       operand_set.push_back(i);
     }
   }
@@ -118,6 +102,10 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
   auto& convolution = *pair.first;
   auto& operand_indices = pair.second;
 
+  if (operand_indices.empty()) {
+    return false;
+  }
+
   const ConvolutionDimensionNumbers& dnums =
       convolution.convolution_dimension_numbers();
   ConvolutionDimensionNumbers new_dnums = dnums;
@@ -137,8 +125,9 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
         transpose_dimensions[dnums.input_batch_dimension()]);
     new_dnums.set_input_feature_dimension(
         transpose_dimensions[dnums.input_feature_dimension()]);
-    for (const auto& spatial_dimension : dnums.spatial_dimensions()) {
-      CHECK_EQ(spatial_dimension, transpose_dimensions[spatial_dimension]);
+    for (auto& input_spatial_dimension :
+         *new_dnums.mutable_input_spatial_dimensions()) {
+      input_spatial_dimension = transpose_dimensions[input_spatial_dimension];
     }
     new_lhs = &transpose_operand;
   } else {
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 00462f9be1e9beb2f2694060ebfaa70b0b9dd4a0..caa1a111ad880b9dee62c1c94e32e8275c196fbf 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -64,9 +64,12 @@ TEST_F(TransposeFoldingTest, FoldDotTranspose) {
   HloInstruction* transpose_y =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/x, /*rhs=*/transpose_y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
+                                /*rhs=*/transpose_y, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -104,9 +107,12 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
   HloInstruction* transpose1 =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {2, 3}), const1, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {1, 3}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/transpose0, /*rhs=*/transpose1));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
+      ShapeUtil::MakeShape(F32, {1, 3}),
+      /*lhs=*/transpose0, /*rhs=*/transpose1, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -169,9 +175,12 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) {
   HloInstruction* transpose_y =
       builder.AddInstruction(HloInstruction::CreateTranspose(
           ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0}));
-  HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-      ShapeUtil::MakeShape(F32, {2, 2}), /*opcode=*/HloOpcode::kDot,
-      /*lhs=*/x, /*rhs=*/transpose_y));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  HloInstruction* dot = builder.AddInstruction(
+      HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x,
+                                /*rhs=*/transpose_y, dot_dnums));
 
   HloModule module("test_module");
   HloComputation* entry_computation =
@@ -362,10 +371,82 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
   EXPECT_EQ(
       dnums.input_batch_dimension(),
       new_conv->convolution_dimension_numbers().input_feature_dimension());
-  EXPECT_EQ(dnums.spatial_dimensions(0),
-            new_conv->convolution_dimension_numbers().spatial_dimensions(0));
-  EXPECT_EQ(dnums.spatial_dimensions(1),
-            new_conv->convolution_dimension_numbers().spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(1));
+}
+
+// Test that a transpose of every dimension in the activations gets folded into
+// convolution.
+TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
+  auto builder = HloComputation::Builder("entry_computation");
+  HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {3, 2, 1, 1}),
+      /*name=*/"x"));
+  HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3, 1, 1}),
+      /*name=*/"y"));
+  HloInstruction* transpose_x =
+      builder.AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 3, 2}));
+  auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers();
+  Window window;
+  for (int i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_base_dilation(1);
+    dim->set_window_dilation(1);
+    dim->set_stride(1);
+    dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
+  }
+  StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
+      transpose_x->shape(), y->shape(), window, dnums);
+  EXPECT_IS_OK(conv_shape);
+  HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      conv_shape.ValueOrDie(), transpose_x, y, window, dnums));
+
+  HloModule module("test_module");
+  HloComputation* entry_computation =
+      module.AddEntryComputation(builder.Build(conv));
+  FoldTranspose(&module);
+
+  // Instructions after folding: x, y, and the convolution.
+  std::unordered_set<HloInstruction*> instruction_set(
+      entry_computation->instructions().begin(),
+      entry_computation->instructions().end());
+  EXPECT_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation.";
+  EXPECT_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation.";
+  EXPECT_EQ(1, instruction_set.size())
+      << "entry_computation should contain exactly 3 instructions.";
+  HloInstruction* new_conv = *instruction_set.begin();
+  EXPECT_EQ(HloOpcode::kConvolution, new_conv->opcode());
+  EXPECT_EQ(dnums.input_feature_dimension(),
+            new_conv->convolution_dimension_numbers().input_batch_dimension());
+  EXPECT_EQ(
+      dnums.input_batch_dimension(),
+      new_conv->convolution_dimension_numbers().input_feature_dimension());
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(1));
+  EXPECT_EQ(
+      dnums.input_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().input_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(0),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(0));
+  EXPECT_EQ(
+      dnums.output_spatial_dimensions(1),
+      new_conv->convolution_dimension_numbers().output_spatial_dimensions(1));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index df537bd7c15a1f15ed77ca9be6ce70fbfd2e63be..0c848566478a25d4862cb0698e029dacd71f7a6a 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -120,6 +120,23 @@ void PointsToSet::add_tuple_source(const ShapeIndex& index,
   tree_.mutable_element(index)->tuple_sources.insert(tuple);
 }
 
+namespace {
+
+// Gather fusion instructions from 'instruction' into 'fusion_instructions'.
+void GatherFusionInstructions(
+    HloInstruction* instruction,
+    std::vector<HloInstruction*>* fusion_instructions) {
+  CHECK_EQ(HloOpcode::kFusion, instruction->opcode());
+  for (auto* fused : instruction->fused_instructions()) {
+    if (fused->opcode() == HloOpcode::kFusion) {
+      GatherFusionInstructions(fused, fusion_instructions);
+    }
+  }
+  fusion_instructions->push_back(instruction);
+}
+
+}  // namespace
+
 /* static */ StatusOr<std::unique_ptr<TuplePointsToAnalysis>>
 TuplePointsToAnalysis::Run(const HloModule* module) {
   auto logical_buffer_analysis = LogicalBufferAnalysis::Run(module);
@@ -137,20 +154,23 @@ Status TuplePointsToAnalysis::Analyze() {
   logical_buffer_aliases_.resize(
       logical_buffer_analysis_->num_logical_buffers());
 
+  std::vector<HloInstruction*> fusion_instructions;
   for (auto* computation : module_->MakeNonfusionComputations()) {
     TF_RETURN_IF_ERROR(computation->Accept(this));
     TF_RETURN_IF_ERROR(
         PopulateDefinedBuffersAndAliases(computation->instructions()));
-    // Run points-to analysis on fusion instructions in 'computation'.
     for (auto* instruction : computation->instructions()) {
-      if (instruction->opcode() != HloOpcode::kFusion) {
-        continue;
+      if (instruction->opcode() == HloOpcode::kFusion) {
+        GatherFusionInstructions(instruction, &fusion_instructions);
       }
-      TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
-      TF_RETURN_IF_ERROR(
-          PopulateDefinedBuffersAndAliases(instruction->fused_instructions()));
     }
   }
+  // Run points-to analysis on fusion instructions in 'computation'.
+  for (auto* instruction : fusion_instructions) {
+    TF_RETURN_IF_ERROR(instruction->fused_expression_root()->Accept(this));
+    TF_RETURN_IF_ERROR(
+        PopulateDefinedBuffersAndAliases(instruction->fused_instructions()));
+  }
 
   XLA_VLOG_LINES(3, ToString());
 
@@ -253,6 +273,64 @@ Status TuplePointsToAnalysis::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) {
+  // RecvDone aliases its input (Recv) tuple element {0} to its output.
+  PointsToSet& points_to_set = CreateEmptyPointsToSet(recv_done);
+  const PointsToSet& operand_points_to_set =
+      GetPointsToSet(recv_done->operand(0));
+
+  // Recursively copy the points to set of the operand tuple {0}.
+  points_to_set.ForEachMutableElement(
+      [this, &points_to_set, &operand_points_to_set](
+          const ShapeIndex& index, PointsToSet::BufferList* buffers) {
+        ShapeIndex src_index({0});
+        for (auto element : index) {
+          src_index.push_back(element);
+        }
+        *buffers = operand_points_to_set.element(src_index);
+        for (auto& tuple_source :
+             operand_points_to_set.tuple_sources(src_index)) {
+          points_to_set.add_tuple_source(index, tuple_source);
+        }
+      });
+  return Status::OK();
+}
+
+Status TuplePointsToAnalysis::HandleSend(HloInstruction* send) {
+  // Send creates a tuple of {aliased operand, U32 context}.
+  PointsToSet& points_to_set = CreateEmptyPointsToSet(send);
+
+  // Creates the points to set for the tuple and its element at {1}.
+  auto top_buffer = points_to_set.mutable_element(ShapeIndex({}));
+  top_buffer->push_back(
+      &logical_buffer_analysis_->GetBuffer(send, ShapeIndex({})));
+  points_to_set.add_tuple_source({}, send);
+
+  auto context_buffer = points_to_set.mutable_element(ShapeIndex({1}));
+  context_buffer->push_back(
+      &logical_buffer_analysis_->GetBuffer(send, ShapeIndex({1})));
+
+  // Recursively copy the points to set of the operand to output tuple {0}.
+  const PointsToSet& operand_points_to_set = GetPointsToSet(send->operand(0));
+  operand_points_to_set.ForEachElement(
+      [&points_to_set, &operand_points_to_set](
+          const ShapeIndex& src_index,
+          const PointsToSet::BufferList& points_to) {
+        ShapeIndex target_index({0});
+        for (auto element : src_index) {
+          target_index.push_back(element);
+        }
+        *points_to_set.mutable_element(target_index) = points_to;
+
+        for (HloInstruction* tuple :
+             operand_points_to_set.tuple_sources(src_index)) {
+          points_to_set.add_tuple_source(target_index, tuple);
+        }
+      });
+
+  return Status::OK();
+}
+
 Status TuplePointsToAnalysis::HandleTuple(HloInstruction* tuple) {
   tensorflow::gtl::ArraySlice<HloInstruction*> operands(tuple->operands());
   PointsToSet& points_to_set = CreateEmptyPointsToSet(tuple);
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index e6157a1ed11b5df24458fe820a4e0e329eb86ae4..8928de107eed8c40bbe2130e26fe83ca3802d2f6 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -251,6 +251,8 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleCopy(HloInstruction* copy) override;
+  Status HandleRecvDone(HloInstruction* recv_done) override;
+  Status HandleSend(HloInstruction* send) override;
   Status HandleSelect(HloInstruction* select) override;
 
   string ToString() const;
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 694ed57fa24d59bd0a28c7bb9b67af8165e90363..dec446d4dac650ba43992f7870764eedc80cb2cf 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -313,6 +313,51 @@ TEST_F(TuplePointsToAnalysisTest, TupleCopy) {
       {constant1, constant2, copy});
 }
 
+TEST_F(TuplePointsToAnalysisTest, SendAndSendDone) {
+  // Send forwards its operand to the output tuple at {0}.
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto send = builder.AddInstruction(
+      HloInstruction::CreateSend(constant, /*channel_id=*/0));
+  auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_FALSE(points_to_analysis_->GetPointsToSet(send).IsAmbiguous());
+  EXPECT_TRUE(points_to_analysis_->GetPointsToSet(send).IsDistinct());
+  EXPECT_FALSE(points_to_analysis_->GetPointsToSet(send_done).IsAmbiguous());
+  EXPECT_TRUE(points_to_analysis_->GetPointsToSet(send_done).IsDistinct());
+
+  ExpectHasTopLevelBuffers(
+      points_to_analysis_->GetPointsToSet(send).element({}), {send});
+  ExpectHasTopLevelBuffers(
+      points_to_analysis_->GetPointsToSet(send).element({0}), {constant});
+  ExpectHasTopLevelBuffers(
+      points_to_analysis_->GetPointsToSet(send_done).CreateFlattenedSet(),
+      {send_done});
+  ExpectHasBufferAliases(constant, {}, {{constant, {}}, {send, {0}}});
+}
+
+TEST_F(TuplePointsToAnalysisTest, RecvAndRecvDone) {
+  // RecvDone forwards its operand tuple element at {0} to the output.
+  auto builder = HloComputation::Builder(TestName());
+  auto recv = builder.AddInstruction(HloInstruction::CreateRecv(
+      ShapeUtil::MakeShape(F32, {1, 2, 3}), /*channel_id=*/0));
+  auto recv_done = builder.AddInstruction(HloInstruction::CreateRecvDone(recv));
+
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  EXPECT_FALSE(points_to_analysis_->GetPointsToSet(recv).IsAmbiguous());
+  EXPECT_TRUE(points_to_analysis_->GetPointsToSet(recv).IsDistinct());
+  EXPECT_FALSE(points_to_analysis_->GetPointsToSet(recv_done).IsAmbiguous());
+  EXPECT_TRUE(points_to_analysis_->GetPointsToSet(recv_done).IsDistinct());
+
+  ExpectHasTopLevelBuffers(
+      points_to_analysis_->GetPointsToSet(recv).element({}), {recv});
+  ExpectHasBufferAliases(recv, {0}, {{recv, {0}}, {recv_done, {}}});
+}
+
 TEST_F(TuplePointsToAnalysisTest, TupleSelect) {
   // Select from two different tuples. This should create an ambiguous points to
   // set containing the union of both sides.
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 006c814996df9b209e6cd4d75bc04689c4e297c5..e6893c8133b17cac3ca381df58d417eef15b60c4 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -88,8 +88,6 @@ HloOpcode BinaryOperationToHloOpcode(BinaryOperation binop) {
       return HloOpcode::kAtan2;
     case BINOP_COMPLEX:
       return HloOpcode::kComplex;
-    case BINOP_DOT:
-      return HloOpcode::kDot;
     case BINOP_MUL:
       return HloOpcode::kMultiply;
     case BINOP_ADD:
@@ -765,6 +763,54 @@ StatusOr<ComputationDataHandle> UserComputation::AddWhileInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddConditionalInstruction(
+    const ConditionalRequest& conditional_request,
+    const UserComputation& true_computation,
+    const UserComputation& false_computation) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* pred,
+                      LookUpRequest(conditional_request.predicate()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* true_operand,
+                      LookUpRequest(conditional_request.true_operand()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* false_operand,
+                      LookUpRequest(conditional_request.false_operand()));
+
+  VersionedComputationHandle::Version true_computation_version =
+      true_computation.version();
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<const ProgramShape> true_computation_shape,
+      true_computation.ComputeProgramShape(true_computation_version));
+
+  VersionedComputationHandle::Version false_computation_version =
+      false_computation.version();
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<const ProgramShape> false_computation_shape,
+      false_computation.ComputeProgramShape(false_computation_version));
+
+  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
+                      ShapeInference::InferConditionalShape(
+                          pred->output_shape(), true_operand->output_shape(),
+                          false_operand->output_shape(),
+                          *true_computation_shape, *false_computation_shape));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = inferred_shape;
+  request.add_embedded_computation_versions(true_computation_version);
+  request.add_embedded_computation_versions(false_computation_version);
+  *request.mutable_request()->mutable_conditional_request() =
+      conditional_request;
+
+  VLOG(1) << "AddConditionalInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << conditional_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddBroadcastInstruction(
     const BroadcastRequest& broadcast_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -994,6 +1040,32 @@ StatusOr<ComputationDataHandle> UserComputation::AddConvertInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddBitcastConvertInstruction(
+    const ConvertRequest& convert_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
+                      LookUpRequest(convert_request.operand()));
+
+  TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape(
+                                           operand->output_shape(),
+                                           convert_request.new_element_type()));
+
+  ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = new_shape;
+  *request.mutable_request()->mutable_bitcast_convert_request() =
+      convert_request;
+
+  VLOG(1) << "AddBitcastConvertInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << convert_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddReducePrecisionInstruction(
     const ReducePrecisionRequest& reduce_precision_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1056,7 +1128,7 @@ StatusOr<ComputationDataHandle> UserComputation::AddCrossReplicaSumInstruction(
   TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
                       LookUpRequest(cross_replica_sum_request.operand()));
   TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
-                                       operand->output_shape()));
+                                       {&operand->output_shape()}));
 
   ComputationDataHandle handle = CreateComputationDataHandle();
 
@@ -1181,6 +1253,33 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
   return handle;
 }
 
+StatusOr<ComputationDataHandle> UserComputation::AddDotInstruction(
+    const DotRequest& dot_request) {
+  tensorflow::mutex_lock lock(mutex_);
+
+  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
+                      LookUpRequest(dot_request.lhs()));
+  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
+                      LookUpRequest(dot_request.rhs()));
+
+  TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape(
+                                       lhs->output_shape(), rhs->output_shape(),
+                                       dot_request.dimension_numbers()));
+
+  const ComputationDataHandle handle = CreateComputationDataHandle();
+
+  OperationRequest& request =
+      (*session_computation_.mutable_requests())[handle.handle()];
+  *request.mutable_output_handle() = handle;
+  *request.mutable_output_shape() = shape;
+  *request.mutable_request()->mutable_dot_request() = dot_request;
+
+  VLOG(1) << "AddDotInstruction (" << GetVersionedHandleInternal()
+          << "), data handle " << handle.handle() << ": "
+          << dot_request.ShortDebugString();
+  return handle;
+}
+
 StatusOr<ComputationDataHandle> UserComputation::AddUnaryInstruction(
     const UnaryOpRequest& unary_request) {
   tensorflow::mutex_lock lock(mutex_);
@@ -1482,14 +1581,15 @@ UserComputation::ComputeProgramShape(
 
 namespace {
 
-// A visitor which checks whether an operation is a compile-time constant. That
-// is, the operation does not depend on any parameter instructions. The visitor
-// walks the computation starting at a given operation and sets is_constant to
-// false iff a parameter or RNG operation is encountered.
-void ConstantVisitor(const SessionComputation& session_computation,
-                     const ComputationDataHandle& handle,
-                     std::set<int64>* visited, bool* is_constant) {
-  if (visited->count(handle.handle()) != 0 || !*is_constant) {
+// A visitor which checks whether an operation is pure functional meaning that
+// it doesn't depend on any parameter with an index higher then num_parameters.
+// The visitor walks the computation starting at a given operation and sets
+// is_functional to false iff a parameter or RNG operation is encountered.
+void PureFunctionalVisitor(const SessionComputation& session_computation,
+                           const ComputationDataHandle& handle,
+                           int64 num_parameters, std::set<int64>* visited,
+                           bool* is_functional) {
+  if (visited->count(handle.handle()) != 0 || !*is_functional) {
     return;
   }
 
@@ -1497,7 +1597,7 @@ void ConstantVisitor(const SessionComputation& session_computation,
       session_computation.requests().at(handle.handle());
   switch (request.request().op_case()) {
     case OpRequest::kRngRequest:
-      *is_constant = false;
+      *is_functional = false;
       break;
 
     case OpRequest::kConstantRequest:
@@ -1506,41 +1606,43 @@ void ConstantVisitor(const SessionComputation& session_computation,
     case OpRequest::kGetTupleElementRequest: {
       const GetTupleElementRequest& get_tuple_element_request =
           request.request().get_tuple_element_request();
-      ConstantVisitor(session_computation, get_tuple_element_request.operand(),
-                      visited, is_constant);
+      PureFunctionalVisitor(session_computation,
+                            get_tuple_element_request.operand(), num_parameters,
+                            visited, is_functional);
       break;
     }
 
     case OpRequest::kSliceRequest: {
       const SliceRequest& slice_request = request.request().slice_request();
-      ConstantVisitor(session_computation, slice_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, slice_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kDynamicSliceRequest: {
       const DynamicSliceRequest& dynamic_slice_request =
           request.request().dynamic_slice_request();
-      ConstantVisitor(session_computation, dynamic_slice_request.operand(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation,
-                      dynamic_slice_request.start_indices(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation,
+                            dynamic_slice_request.operand(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            dynamic_slice_request.start_indices(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kDynamicUpdateSliceRequest: {
       const DynamicUpdateSliceRequest& dynamic_update_slice_request =
           request.request().dynamic_update_slice_request();
-      ConstantVisitor(session_computation,
-                      dynamic_update_slice_request.operand(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation,
-                      dynamic_update_slice_request.update(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation,
-                      dynamic_update_slice_request.start_indices(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation,
+                            dynamic_update_slice_request.operand(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            dynamic_update_slice_request.update(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            dynamic_update_slice_request.start_indices(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
@@ -1549,7 +1651,8 @@ void ConstantVisitor(const SessionComputation& session_computation,
           request.request().concatenate_request();
       for (const ComputationDataHandle& handle :
            concatenate_request.operands()) {
-        ConstantVisitor(session_computation, handle, visited, is_constant);
+        PureFunctionalVisitor(session_computation, handle, num_parameters,
+                              visited, is_functional);
       }
       break;
     }
@@ -1557,61 +1660,72 @@ void ConstantVisitor(const SessionComputation& session_computation,
     case OpRequest::kConvolveRequest: {
       const ConvolveRequest& convolve_request =
           request.request().convolve_request();
-      ConstantVisitor(session_computation, convolve_request.lhs(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, convolve_request.rhs(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, convolve_request.lhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, convolve_request.rhs(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kCrossReplicaSumRequest: {
       // TODO(b/33009255): Implmement constant folding for cross replica sum.
-      *is_constant = false;
+      *is_functional = false;
       break;
     }
 
     case OpRequest::kInfeedRequest: {
-      *is_constant = false;
+      *is_functional = false;
       break;
     }
 
     case OpRequest::kOutfeedRequest: {
-      *is_constant = false;
+      *is_functional = false;
       break;
     }
 
     case OpRequest::kCallRequest: {
       const CallRequest& call_request = request.request().call_request();
       for (const ComputationDataHandle& handle : call_request.operands()) {
-        ConstantVisitor(session_computation, handle, visited, is_constant);
+        PureFunctionalVisitor(session_computation, handle, num_parameters,
+                              visited, is_functional);
       }
       // TODO(b/32495713): We aren't checking the to_apply computation itself,
       // so we conservatively say that computations containing the Call op
-      // cannot be constant.  We cannot set is_constant=false in other similar
+      // cannot be constant.  We cannot set is_functional=false in other similar
       // cases since we're already relying on IsConstant to return true.
-      *is_constant = false;
+      *is_functional = false;
       break;
     }
 
     case OpRequest::kCustomCallRequest: {
-      *is_constant = false;
+      *is_functional = false;
+      break;
+    }
+
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      PureFunctionalVisitor(session_computation, dot_request.lhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, dot_request.rhs(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kSendRequest: {
-      *is_constant = false;
+      *is_functional = false;
       break;
     }
 
     case OpRequest::kRecvRequest: {
-      *is_constant = false;
+      *is_functional = false;
       break;
     }
 
     case OpRequest::kMapRequest: {
       const MapRequest& map_request = request.request().map_request();
       for (const ComputationDataHandle& handle : map_request.operands()) {
-        ConstantVisitor(session_computation, handle, visited, is_constant);
+        PureFunctionalVisitor(session_computation, handle, num_parameters,
+                              visited, is_functional);
       }
       // TODO(b/32495713): We aren't checking the to_apply computation itself.
       break;
@@ -1619,10 +1733,10 @@ void ConstantVisitor(const SessionComputation& session_computation,
 
     case OpRequest::kReduceRequest: {
       const ReduceRequest& reduce_request = request.request().reduce_request();
-      ConstantVisitor(session_computation, reduce_request.operand(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, reduce_request.init_value(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, reduce_request.operand(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, reduce_request.init_value(),
+                            num_parameters, visited, is_functional);
       // TODO(b/32495713): We aren't checking the to_apply computation itself.
       break;
     }
@@ -1630,10 +1744,12 @@ void ConstantVisitor(const SessionComputation& session_computation,
     case OpRequest::kReduceWindowRequest: {
       const ReduceWindowRequest& reduce_window_request =
           request.request().reduce_window_request();
-      ConstantVisitor(session_computation, reduce_window_request.operand(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation, reduce_window_request.init_value(),
-                      visited, is_constant);
+      PureFunctionalVisitor(session_computation,
+                            reduce_window_request.operand(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            reduce_window_request.init_value(), num_parameters,
+                            visited, is_functional);
       // TODO(b/32495713): We aren't checking the to_apply computation itself.
       break;
     }
@@ -1641,13 +1757,15 @@ void ConstantVisitor(const SessionComputation& session_computation,
     case OpRequest::kSelectAndScatterRequest: {
       const SelectAndScatterRequest& select_and_scatter_request =
           request.request().select_and_scatter_request();
-      ConstantVisitor(session_computation, select_and_scatter_request.operand(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation, select_and_scatter_request.source(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation,
-                      select_and_scatter_request.init_value(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation,
+                            select_and_scatter_request.operand(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            select_and_scatter_request.source(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            select_and_scatter_request.init_value(),
+                            num_parameters, visited, is_functional);
       // TODO(b/32495713): We aren't checking the select and scatter
       // computations themselves.
       break;
@@ -1656,76 +1774,105 @@ void ConstantVisitor(const SessionComputation& session_computation,
     case OpRequest::kBroadcastRequest: {
       const BroadcastRequest& broadcast_request =
           request.request().broadcast_request();
-      ConstantVisitor(session_computation, broadcast_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, broadcast_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kReshapeRequest: {
       const ReshapeRequest& reshape_request =
           request.request().reshape_request();
-      ConstantVisitor(session_computation, reshape_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, reshape_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kReverseRequest: {
       const ReverseRequest& reverse_request =
           request.request().reverse_request();
-      ConstantVisitor(session_computation, reverse_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, reverse_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kPadRequest: {
       const PadRequest& pad_request = request.request().pad_request();
-      ConstantVisitor(session_computation, pad_request.operand(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, pad_request.padding_value(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, pad_request.operand(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, pad_request.padding_value(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kParameterRequest: {
-      *is_constant = false;
+      const ParameterRequest& parameter_request =
+          request.request().parameter_request();
+      if (parameter_request.parameter() >= num_parameters) {
+        *is_functional = false;
+      }
       break;
     }
 
     case OpRequest::kConvertRequest: {
       const ConvertRequest& convert_request =
           request.request().convert_request();
-      ConstantVisitor(session_computation, convert_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, convert_request.operand(),
+                            num_parameters, visited, is_functional);
+      break;
+    }
+
+    case OpRequest::kBitcastConvertRequest: {
+      const ConvertRequest& convert_request =
+          request.request().bitcast_convert_request();
+      PureFunctionalVisitor(session_computation, convert_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kWhileRequest: {
       const WhileRequest& while_request = request.request().while_request();
-      ConstantVisitor(session_computation, while_request.init(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, while_request.init(),
+                            num_parameters, visited, is_functional);
       // TODO(b/32495713): We aren't checking the condition and body
       // computations themselves.
-      *is_constant = false;
+      *is_functional = false;
+      break;
+    }
+
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.predicate(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.true_operand(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            conditional_request.false_operand(), num_parameters,
+                            visited, is_functional);
+      // TODO(b/32495713): We aren't checking the true and false computations
+      // themselves.
       break;
     }
 
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
-      ConstantVisitor(session_computation, ternary_op_request.lhs(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, ternary_op_request.rhs(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, ternary_op_request.ehs(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, ternary_op_request.lhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, ternary_op_request.rhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, ternary_op_request.ehs(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kTransposeRequest: {
       const TransposeRequest& transpose_request =
           request.request().transpose_request();
-      ConstantVisitor(session_computation, transpose_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, transpose_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
@@ -1734,7 +1881,8 @@ void ConstantVisitor(const SessionComputation& session_computation,
           request.request().variadic_op_request();
       for (const ComputationDataHandle& handle :
            variadic_op_request.operands()) {
-        ConstantVisitor(session_computation, handle, visited, is_constant);
+        PureFunctionalVisitor(session_computation, handle, num_parameters,
+                              visited, is_functional);
       }
       break;
     }
@@ -1742,67 +1890,74 @@ void ConstantVisitor(const SessionComputation& session_computation,
     case OpRequest::kUnaryOpRequest: {
       const UnaryOpRequest& unary_op_request =
           request.request().unary_op_request();
-      ConstantVisitor(session_computation, unary_op_request.operand(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, unary_op_request.operand(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kBatchNormTrainingRequest: {
       const BatchNormTrainingRequest& batch_norm_training_request =
           request.request().batch_norm_training_request();
-      ConstantVisitor(session_computation,
-                      batch_norm_training_request.operand(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, batch_norm_training_request.scale(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation, batch_norm_training_request.offset(),
-                      visited, is_constant);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_training_request.operand(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_training_request.scale(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_training_request.offset(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kBatchNormInferenceRequest: {
       const BatchNormInferenceRequest& batch_norm_inference_request =
           request.request().batch_norm_inference_request();
-      ConstantVisitor(session_computation,
-                      batch_norm_inference_request.operand(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, batch_norm_inference_request.scale(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation,
-                      batch_norm_inference_request.offset(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, batch_norm_inference_request.mean(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation,
-                      batch_norm_inference_request.variance(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_inference_request.operand(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_inference_request.scale(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_inference_request.offset(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_inference_request.mean(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_inference_request.variance(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kBatchNormGradRequest: {
       const BatchNormGradRequest& batch_norm_grad_request =
           request.request().batch_norm_grad_request();
-      ConstantVisitor(session_computation, batch_norm_grad_request.operand(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation, batch_norm_grad_request.scale(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation, batch_norm_grad_request.mean(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation, batch_norm_grad_request.variance(),
-                      visited, is_constant);
-      ConstantVisitor(session_computation,
-                      batch_norm_grad_request.grad_output(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_grad_request.operand(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_grad_request.scale(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation, batch_norm_grad_request.mean(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_grad_request.variance(), num_parameters,
+                            visited, is_functional);
+      PureFunctionalVisitor(session_computation,
+                            batch_norm_grad_request.grad_output(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
     case OpRequest::kBinaryOpRequest: {
       const BinaryOpRequest& binary_op_request =
           request.request().binary_op_request();
-      ConstantVisitor(session_computation, binary_op_request.lhs(), visited,
-                      is_constant);
-      ConstantVisitor(session_computation, binary_op_request.rhs(), visited,
-                      is_constant);
+      PureFunctionalVisitor(session_computation, binary_op_request.lhs(),
+                            num_parameters, visited, is_functional);
+      PureFunctionalVisitor(session_computation, binary_op_request.rhs(),
+                            num_parameters, visited, is_functional);
       break;
     }
 
@@ -1817,8 +1972,8 @@ void ConstantVisitor(const SessionComputation& session_computation,
 
 }  // namespace
 
-StatusOr<bool> UserComputation::IsConstant(
-    const ComputationDataHandle& handle) {
+StatusOr<bool> UserComputation::IsConstant(const ComputationDataHandle& handle,
+                                           int64 num_parameters) {
   tensorflow::mutex_lock lock(mutex_);
 
   // Verify that the handle is valid.
@@ -1829,7 +1984,8 @@ StatusOr<bool> UserComputation::IsConstant(
 
   bool is_constant = true;
   std::set<int64> visited;
-  ConstantVisitor(session_computation_, handle, &visited, &is_constant);
+  PureFunctionalVisitor(session_computation_, handle, num_parameters, &visited,
+                        &is_constant);
 
   return is_constant;
 }
@@ -1928,6 +2084,21 @@ UserComputation::GetEmbeddedComputations(
           break;
         }
 
+        case OpRequest::kConditionalRequest: {
+          CHECK_EQ(2, request.embedded_computation_versions_size());
+          const ConditionalRequest& conditional_request =
+              request.request().conditional_request();
+          const VersionedComputationHandle true_computation_versioned_handle = {
+              conditional_request.true_computation(),
+              request.embedded_computation_versions(0)};
+          computations.push_back(true_computation_versioned_handle);
+          const VersionedComputationHandle false_computation_versioned_handle =
+              {conditional_request.false_computation(),
+               request.embedded_computation_versions(1)};
+          computations.push_back(false_computation_versioned_handle);
+          break;
+        }
+
         default:
           // No embedded computation.
           break;
@@ -2014,6 +2185,16 @@ Status UserComputation::RemapEmbeddedComputations(
         TF_RETURN_IF_ERROR(update(while_request->mutable_body()));
         break;
       }
+      case OpRequest::kConditionalRequest: {
+        TF_RET_CHECK(2 == request.embedded_computation_versions_size());
+        ConditionalRequest* conditional_request =
+            request.mutable_request()->mutable_conditional_request();
+        TF_RETURN_IF_ERROR(
+            update(conditional_request->mutable_true_computation()));
+        TF_RETURN_IF_ERROR(
+            update(conditional_request->mutable_false_computation()));
+        break;
+      }
       default:
         // No embedded computation.
         TF_RET_CHECK(0 == request.embedded_computation_versions_size());
@@ -2347,12 +2528,28 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kBitcastConvertRequest: {
+      const ConvertRequest& convert_request =
+          request.request().bitcast_convert_request();
+      apply(convert_request.operand());
+      break;
+    }
+
     case OpRequest::kWhileRequest: {
       const WhileRequest& while_request = request.request().while_request();
       apply(while_request.init());
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      apply(conditional_request.predicate());
+      apply(conditional_request.true_operand());
+      apply(conditional_request.false_operand());
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -2389,6 +2586,13 @@ static void ForEachOperand(
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      apply(dot_request.rhs());
+      apply(dot_request.lhs());
+      break;
+    }
+
     case OpRequest::kUnaryOpRequest: {
       const UnaryOpRequest& unary_op_request =
           request.request().unary_op_request();
@@ -2515,6 +2719,7 @@ HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast(
   if (ShapeUtil::IsScalar(operand->shape())) {
     HloInstruction* broadcast = hlo_builder_.AddInstruction(
         HloInstruction::CreateBroadcast(broadcast_shape, operand, {}));
+    broadcast->set_metadata(operand->metadata());
     if (operand->has_sharding()) {
       broadcast->set_sharding(operand->sharding());
     }
@@ -2535,6 +2740,7 @@ HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast(
           ShapeUtil::MakeShape(operand->shape().element_type(),
                                reshaped_dimensions),
           operand));
+  reshaped_operand->set_metadata(operand->metadata());
   if (operand->has_sharding()) {
     reshaped_operand->set_sharding(operand->sharding());
   }
@@ -2542,6 +2748,7 @@ HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast(
   HloInstruction* broadcast =
       hlo_builder_.AddInstruction(HloInstruction::CreateBroadcast(
           broadcast_shape, reshaped_operand, broadcast_dimensions));
+  broadcast->set_metadata(operand->metadata());
   if (operand->has_sharding()) {
     broadcast->set_sharding(operand->sharding());
   }
@@ -2665,13 +2872,22 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kDotRequest: {
+      const DotRequest& dot_request = request.request().dot_request();
+      HloInstruction* lhs = lookup_instruction(dot_request.lhs());
+      HloInstruction* rhs = lookup_instruction(dot_request.rhs());
+      hlo_instruction = add_instruction(HloInstruction::CreateDot(
+          request.output_shape(), lhs, rhs, dot_request.dimension_numbers()));
+      break;
+    }
+
     case OpRequest::kCrossReplicaSumRequest: {
       const CrossReplicaSumRequest& cross_replica_sum_request =
           request.request().cross_replica_sum_request();
       HloInstruction* operand =
           lookup_instruction(cross_replica_sum_request.operand());
       hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum(
-          request.output_shape(), operand));
+          request.output_shape(), {operand}));
       break;
     }
 
@@ -2904,8 +3120,9 @@ void ComputationLowerer::Visit(
 
     case OpRequest::kRecvRequest: {
       const RecvRequest& recv_request = request.request().recv_request();
-      hlo_instruction = add_instruction(HloInstruction::CreateRecv(
+      HloInstruction* recv = add_instruction(HloInstruction::CreateRecv(
           request.output_shape(), recv_request.channel_handle().handle()));
+      hlo_instruction = add_instruction(HloInstruction::CreateRecvDone(recv));
       break;
     }
 
@@ -2927,6 +3144,15 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kBitcastConvertRequest: {
+      const ConvertRequest& convert_request =
+          request.request().bitcast_convert_request();
+      HloInstruction* operand = lookup_instruction(convert_request.operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateBitcastConvert(
+          request.output_shape(), operand));
+      break;
+    }
+
     case OpRequest::kWhileRequest: {
       const WhileRequest& while_request = request.request().while_request();
       CHECK_EQ(2, request.embedded_computation_versions_size());
@@ -2944,6 +3170,30 @@ void ComputationLowerer::Visit(
       break;
     }
 
+    case OpRequest::kConditionalRequest: {
+      const ConditionalRequest& conditional_request =
+          request.request().conditional_request();
+      CHECK_EQ(2, request.embedded_computation_versions_size());
+      VersionedComputationHandle::Version true_computation_version =
+          request.embedded_computation_versions(0);
+      HloComputation* true_computation = ResolveComputation(
+          conditional_request.true_computation(), true_computation_version);
+      VersionedComputationHandle::Version false_computation_version =
+          request.embedded_computation_versions(1);
+      HloComputation* false_computation = ResolveComputation(
+          conditional_request.false_computation(), false_computation_version);
+      HloInstruction* predicate =
+          lookup_instruction(conditional_request.predicate());
+      HloInstruction* true_operand =
+          lookup_instruction(conditional_request.true_operand());
+      HloInstruction* false_operand =
+          lookup_instruction(conditional_request.false_operand());
+      hlo_instruction = add_instruction(HloInstruction::CreateConditional(
+          request.output_shape(), predicate, true_operand, true_computation,
+          false_operand, false_computation));
+      break;
+    }
+
     case OpRequest::kTernaryOpRequest: {
       const TernaryOpRequest& ternary_op_request =
           request.request().ternary_op_request();
@@ -2951,6 +3201,25 @@ void ComputationLowerer::Visit(
       HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs());
       HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs());
       auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop());
+
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
+        if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
+          // lhs side is being implicitly broadcast. Change to explicit.
+          lhs =
+              ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape());
+        }
+
+        if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
+          rhs =
+              ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape());
+        }
+
+        if (!ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) {
+          ehs =
+              ImplicitBroadcastToExplicitBroadcast(ehs, request.output_shape());
+        }
+      }
+
       hlo_instruction = add_instruction(HloInstruction::CreateTernary(
           request.output_shape(), hlo_opcode, lhs, rhs, ehs));
       break;
@@ -3055,8 +3324,7 @@ void ComputationLowerer::Visit(
         lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
         rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
       }
-      if (debug_options_.xla_eliminate_hlo_implicit_broadcast() &&
-          binary_op_request.binop() != BINOP_DOT) {
+      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
         if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
           // lhs side is being implicitly broadcast. Change to explicit.
           lhs =
@@ -3097,8 +3365,9 @@ void ComputationLowerer::Visit(
     case OpRequest::kSendRequest: {
       const SendRequest& send_request = request.request().send_request();
       HloInstruction* operand = lookup_instruction(send_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateSend(
+      HloInstruction* send = add_instruction(HloInstruction::CreateSend(
           operand, send_request.channel_handle().handle()));
+      hlo_instruction = add_instruction(HloInstruction::CreateSendDone(send));
       break;
     }
 
@@ -3109,7 +3378,7 @@ void ComputationLowerer::Visit(
       LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
   }
   (*instructions)[handle.handle()] = hlo_instruction;
-}
+}  // NOLINT(readability/fn_size)
 
 }  // namespace
 
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
index dabf68e298ed2600d5248b7b8c7b1e014efedb14..8a78d520e19024f5e397d6e0c2f4e0523264e176 100644
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ b/tensorflow/compiler/xla/service/user_computation.h
@@ -70,7 +70,7 @@ class UserComputation {
 
   // Enqueues a pad instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddPadInstruction(
-      const PadRequest& parameter_request);
+      const PadRequest& pad_request);
 
   // Enqueues a tracing instruction onto this user computation.
   // Returns an error status if the operand cannot be resolved.
@@ -105,7 +105,7 @@ class UserComputation {
   // Enqueues a ternary instruction onto this user computation.
   // Returns an error status if the operand indices are out of bounds.
   StatusOr<ComputationDataHandle> AddTernaryInstruction(
-      const TernaryOpRequest& request);
+      const TernaryOpRequest& ternary_request);
 
   // Enqueues a variadic instruction onto this user computation.
   // Returns an error status if the operand indices are out of bounds.
@@ -153,6 +153,10 @@ class UserComputation {
   StatusOr<ComputationDataHandle> AddCustomCallInstruction(
       const CustomCallRequest& custom_call_request);
 
+  // Enqueues a dot instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddDotInstruction(
+      const DotRequest& dot_request);
+
   // Enqueues a broadcast instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddBroadcastInstruction(
       const BroadcastRequest& broadcast_request);
@@ -179,26 +183,30 @@ class UserComputation {
 
   // Enqueues a concatenate instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddConcatenateInstruction(
-      const ConcatenateRequest& slice_request);
+      const ConcatenateRequest& concatenate_request);
 
   // Enqueues a convert instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddConvertInstruction(
       const ConvertRequest& convert_request);
 
+  // Enqueues a bitcast element instruction onto this user computation.
+  StatusOr<ComputationDataHandle> AddBitcastConvertInstruction(
+      const ConvertRequest& convert_request);
+
   // Enqueues a reduce instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddReduceInstruction(
       const ReduceRequest& reduce_request,
-      const UserComputation& reduction_computation);
+      const UserComputation& to_apply_computation);
 
   // Enqueues a windowed reduce instruction onto this user computation.
   StatusOr<ComputationDataHandle> AddReduceWindowInstruction(
       const ReduceWindowRequest& reduce_window_request,
-      const UserComputation& reduction_computation);
+      const UserComputation& to_apply_computation);
 
   // Enqueues a select-and-scatter instruction onto this user
   // computation.
   StatusOr<ComputationDataHandle> AddSelectAndScatterInstruction(
-      const SelectAndScatterRequest& scatter_to_selected_window_element_request,
+      const SelectAndScatterRequest& select_and_scatter_request,
       const UserComputation& select_computation,
       const UserComputation& scatter_computation);
 
@@ -212,6 +220,12 @@ class UserComputation {
       const UserComputation& condition_computation,
       const UserComputation& body_computation);
 
+  // Enqueues a conditional instruction on this user computation.
+  StatusOr<ComputationDataHandle> AddConditionalInstruction(
+      const ConditionalRequest& conditional_request,
+      const UserComputation& true_computation,
+      const UserComputation& false_computation);
+
   // Enqueues a Send instruction onto this user computation.
   Status AddSendInstruction(const SendRequest& send_request);
 
@@ -250,9 +264,11 @@ class UserComputation {
   StatusOr<std::shared_ptr<const ProgramShape>> ComputeProgramShape(
       VersionedComputationHandle::Version version) const;
 
-  // Returns true if the given data handle does not depend on any
-  // parameters. That is, the value can be computed at compile time.
-  StatusOr<bool> IsConstant(const ComputationDataHandle& handle);
+  // Returns true if the given data handle does not depend on any parameter with
+  // index higher then num_parameters. That is, the value can be computed at
+  // compile time if we know the first num_parameters arguments.
+  StatusOr<bool> IsConstant(const ComputationDataHandle& handle,
+                            int64 num_parameters);
 
   // Returns the output shape of the operation indicated by the given handle.
   StatusOr<Shape> GetShape(const ComputationDataHandle& handle);
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
index 5afaf226ae0cce7e9afc966c6b4adf838aeebc91..e45673300b6c5f85be4153f2db821d8abbced7cd 100644
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ b/tensorflow/compiler/xla/service/user_computation_test.cc
@@ -334,50 +334,5 @@ TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
               operands[1]->opcode() == HloOpcode::kBroadcast);
 }
 
-TEST_F(UserComputationTest, SkipDotInEliminatingImplicitBroadcast) {
-  auto debug_options = DebugOptions();
-  debug_options.set_xla_eliminate_hlo_implicit_broadcast(true);
-
-  //  %a = Param({1, 3});
-  //  %b = Param({3, 1});
-  //  %dot = Dot(%a, %b);
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ParameterRequest a_request;
-  *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 3});
-  a_request.set_name("a");
-  a_request.set_parameter(0);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
-                          computation.AddParameterInstruction(a_request));
-
-  ParameterRequest b_request;
-  *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {3, 1});
-  b_request.set_name("b");
-  b_request.set_parameter(1);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
-                          computation.AddParameterInstruction(b_request));
-
-  BinaryOpRequest dot;
-  dot.set_binop(BINOP_DOT);
-  *dot.mutable_lhs() = a_handle;
-  *dot.mutable_rhs() = b_handle;
-  TF_ASSERT_OK(computation.AddBinaryInstruction(dot).status());
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
-
-  // Build the HLO computation.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                      debug_options));
-
-  EXPECT_EQ(3, hlo_computation->instruction_count());
-}
-
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2fd64a4d9f3dc343b2e44b5efa31aacc6085042
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -0,0 +1,644 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+#include "tensorflow/compiler/xla/service/call_inliner.h"
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace xla {
+
+using tensorflow::gtl::nullopt;
+using tensorflow::gtl::optional;
+
+// Finds and returns the non-constant operand in instr.
+//
+// CHECK-fails if instr doesn't have exactly one unique non-constant operand.
+static const HloInstruction* NonConstantOperand(const HloInstruction* instr) {
+  const HloInstruction* result = nullptr;
+  for (const HloInstruction* operand : instr->operands()) {
+    if (!operand->IsConstant()) {
+      if (result != nullptr) {
+        CHECK_EQ(result, operand);
+      }
+      result = operand;
+    }
+  }
+  CHECK_NE(result, nullptr);
+  return result;
+}
+
+// Determines whether the given instruction is a send/recv node, or has a
+// subcomputation which contains a send/recv node.
+static bool IsOrContainsSendOrRecv(const HloInstruction* instr);
+
+// Determines whether the given computation contains a send or recv node.
+static bool ContainsSendOrRecv(const HloComputation* comp) {
+  for (const auto* instr : comp->instructions()) {
+    if (IsOrContainsSendOrRecv(instr)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool IsOrContainsSendOrRecv(const HloInstruction* instr) {
+  if (instr->opcode() == HloOpcode::kSend ||
+      instr->opcode() == HloOpcode::kSendDone ||
+      instr->opcode() == HloOpcode::kRecv ||
+      instr->opcode() == HloOpcode::kRecvDone) {
+    return true;
+  }
+  for (const auto& subcomp : instr->called_computations()) {
+    if (ContainsSendOrRecv(subcomp)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// If all of instr's operands are either constants or have the form
+//   get-tuple-element(gte_operand, N)
+// for the same value N, returns N.  Otherwise, returns nullopt.
+static optional<int64> GetGTEOperandIndex(const HloInstruction* instr,
+                                          const HloInstruction* gte_operand) {
+  VLOG(2) << "GetGTEOperandIndex(" << instr->ToString() << ", "
+          << gte_operand->ToString() << ")";
+  optional<int64> tuple_idx;
+  for (const HloInstruction* operand : instr->operands()) {
+    if (operand->IsConstant()) {
+      continue;
+    }
+    if (operand->opcode() != HloOpcode::kGetTupleElement) {
+      VLOG(2) << "instr uses something other than gte(gte_operand): "
+              << operand->ToString();
+      return nullopt;
+    }
+    if (operand->operand(0) != gte_operand) {
+      VLOG(2) << "instr has gte whose operand is not gte_operand: "
+              << operand->ToString();
+      return nullopt;
+    }
+    if (tuple_idx && tuple_idx != operand->tuple_index()) {
+      VLOG(2) << "instr has operands with conflicting gte indices, "
+              << *tuple_idx << " vs " << operand->tuple_index();
+      return nullopt;
+    }
+
+    tuple_idx = operand->tuple_index();
+  }
+  return tuple_idx;
+}
+
+// Tries to get the tuple index of the induction variable of a while loop.
+//
+// Checks that the loop condition and root both plumb the induction variable
+// through the same tuple index, and that they both apply exactly one op to the
+// induction variable before  deciding whether to do another loop iteration (in
+// the loop condition's case) or packing the induction variable into the result
+// tuple (in the loop body's case).
+//
+// Specifically, checks that the loop condition has structure
+//
+//   root = op(constants, get-tuple-elem(param0, N), constants)
+//
+// and the loop body has the structure
+//
+//   inc = op(constants, get-tuple-elem(param0, N), constants)
+//   root = tuple(..., inc, ...)  // inc is N'th operand of tuple().
+//
+// If so, returns N.  Otherwise, returns nullopt.
+static optional<int64> GetLoopInductionVarTupleIdx(
+    const HloInstruction* while_op) {
+  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
+  VLOG(2) << "Finding induction variable for loop "
+          << while_op->ToShortString();
+
+  // The while_cond computation should have the form
+  //
+  //   while_cond_root =
+  //       op(constants, get-tuple-elem(while_cond_param, N), constants).
+  //
+  // If it does, set indvar_tuple_idx to N.
+  auto* while_cond = while_op->while_condition();
+  auto* while_cond_root = while_cond->root_instruction();
+  auto* while_cond_param = while_cond->parameter_instruction(0);
+  optional<int64> indvar_tuple_idx =
+      GetGTEOperandIndex(while_cond_root, while_cond_param);
+  if (!indvar_tuple_idx) {
+    VLOG(2) << "Induction variable not found in loop condition: "
+            << while_cond->root_instruction()->ToString();
+    return nullopt;
+  }
+
+  // The while_body computation should have the form
+  //
+  //   while_body_inc =
+  //       op(constants, get-tuple-elem(while_body_param, N), constants)
+  //   while_body_root = tuple(..., while_body_inc, ...)
+  //
+  // where while_body_inc is operand N of while_body_root.
+  auto* while_body = while_op->while_body();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_body_root->opcode() != HloOpcode::kTuple) {
+    VLOG(2) << "While body's root is not a tuple instruction: "
+            << while_body_root->ToString();
+    return nullopt;
+  }
+
+  auto* while_body_inc = while_body_root->operand(*indvar_tuple_idx);
+  auto* while_body_param = while_body->parameter_instruction(0);
+  optional<int64> while_body_indvar_tuple_idx =
+      GetGTEOperandIndex(while_body_inc, while_body_param);
+  if (!while_body_indvar_tuple_idx) {
+    VLOG(2)
+        << "Induction variable not found in while body increment instruction: "
+        << while_body_inc->ToString();
+    return nullopt;
+  }
+  if (while_body_indvar_tuple_idx != indvar_tuple_idx) {
+    VLOG(2) << "Tuple index of induction variable does not match between loop "
+               "condition ("
+            << *indvar_tuple_idx << ") and while body ("
+            << *while_body_indvar_tuple_idx << ")";
+    return nullopt;
+  }
+
+  // Finally, check that the while loop's initial value is a tuple with enough
+  // elements.
+  auto* while_init = while_op->operand(0);
+  if (while_init->opcode() != HloOpcode::kTuple) {
+    VLOG(2) << "While init expected to be a tuple: " << while_init->ToString();
+    return nullopt;
+  }
+
+  VLOG(2) << "Induction variable's tuple index: " << *indvar_tuple_idx;
+  return indvar_tuple_idx;
+}
+
+// Tries to determine the number of times the given loop executes.  Currently
+// simply returns 0, 1, or "can't tell" (nullopt).
+static optional<int64> GetLoopTripCount(HloInstruction* while_op) {
+  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
+  VLOG(2) << "Getting trip count for loop " << while_op->ToString();
+
+  // The loop's induction variable is found at
+  //
+  //   get-tuple-elem(comp->parameter_instruction(0), *indvar_tuple_idx),
+  //
+  // where comp is while_op->while_body() or while_op->while_condition().
+  optional<int64> indvar_tuple_idx = GetLoopInductionVarTupleIdx(while_op);
+  if (!indvar_tuple_idx) {
+    return nullopt;
+  }
+
+  VLOG(2) << "Induction variable is at index " << *indvar_tuple_idx
+          << " in input tuple.";
+
+  // Now that we know the index of the induction variable, we can we can try to
+  // compute how many times the loop executes.  Start by computing the induction
+  // variable's initial value.
+  HloEvaluator evaluator;
+  auto* while_init = while_op->mutable_operand(0);
+  auto* indvar_init = while_init->mutable_operand(*indvar_tuple_idx);
+  StatusOr<std::unique_ptr<Literal>> indvar_init_result =
+      evaluator.Evaluate(indvar_init);
+  if (!indvar_init_result.ok()) {
+    VLOG(2) << "Couldn't evaluate induction variable init: "
+            << indvar_init_result.status();
+    return nullopt;
+  }
+
+  // Evaluates the while loop's condition, returning either "true" (continue
+  // looping), "false" (stop looping), or nullopt (can't evaluate).
+  auto evaluate_while_cond = [&](const Literal& indvar) -> optional<bool> {
+    auto* while_cond = while_op->while_condition();
+    auto* while_cond_root = while_cond->root_instruction();
+    auto* while_cond_indvar = NonConstantOperand(while_cond_root);
+    StatusOr<std::unique_ptr<Literal>> result =
+        evaluator.EvaluateWithSubstitutions(while_cond_root,
+                                            {{while_cond_indvar, &indvar}});
+    if (!result.ok()) {
+      VLOG(2) << "Couldn't evaluate while cond: " << result.status();
+      return nullopt;
+    }
+    return result.ValueOrDie()->GetArraySlice<bool>() ==
+           tensorflow::gtl::ArraySlice<bool>{true};
+  };
+
+  // The initial value of the induction variable.
+  const Literal& indvar_iter0_val = *indvar_init_result.ValueOrDie();
+
+  // Evaluate whether the while condition is true when seeded with
+  // indvar_iter0_val.
+  optional<bool> while_cond_iter0_val = evaluate_while_cond(indvar_iter0_val);
+  if (while_cond_iter0_val == false) {
+    VLOG(2) << "Loop has static trip count of 0.";
+    return 0;
+  }
+
+  // Calculate the value of the induction variable after one iteration of the
+  // loop, and check whether the while condition is true with this new value.
+  auto* while_body = while_op->while_body();
+  auto* while_body_indvar_update =
+      while_body->root_instruction()->operand(*indvar_tuple_idx);
+  auto* while_body_indvar = NonConstantOperand(while_body_indvar_update);
+  StatusOr<std::unique_ptr<Literal>> indvar_iter1_result =
+      evaluator.EvaluateWithSubstitutions(
+          while_body_indvar_update, {{while_body_indvar, &indvar_iter0_val}});
+  if (!indvar_iter1_result.ok()) {
+    VLOG(2) << "Couldn't evaluate induction variable update: "
+            << indvar_iter1_result.status();
+    return nullopt;
+  }
+  const Literal& indvar_iter1_val = *indvar_iter1_result.ValueOrDie();
+  optional<bool> while_cond_iter1_val = evaluate_while_cond(indvar_iter1_val);
+  if (while_cond_iter1_val == false) {
+    VLOG(2) << "Determined that loop has static trip count of 1.";
+    return 1;
+  }
+
+  VLOG(2) << "Loop has unknown trip count >= 1.";
+  return nullopt;
+}
+
+// Tries to remove elements in a while loop's tuple that aren't used within the
+// loop.
+//
+// Specifically, if a loop is tuple-shaped, and there exists some element of
+// that tuple that is not used by the loop condition and is not used by the loop
+// body except to pass it to the next iteration of the loop, then we can remove
+// that element from the loop's tuples.
+static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
+  CHECK_EQ(while_op->opcode(), HloOpcode::kWhile);
+
+  // Don't try this transformation if the while loop isn't removable, since if
+  // it succeeds ultimately we're going to have to replace the old while loop
+  // with a new one.
+  if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
+    VLOG(2) << "Can't remove dead parameters from non-removable while op.";
+    return false;
+  }
+
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  HloInstruction* while_init = while_op->mutable_operand(0);
+  HloComputation* while_cond = while_op->while_condition();
+  HloComputation* while_body = while_op->while_body();
+  HloInstruction* while_body_root = while_body->root_instruction();
+
+  if (!ShapeUtil::IsTuple(while_init->shape())) {
+    VLOG(2) << "While op's carried value isn't tuple shaped.";
+    return false;
+  }
+
+  // Bail if param0 of while_cond or while_body has users which aren't of type
+  // get-tuple-element.
+  for (const HloInstruction* instr : {while_body->parameter_instruction(0),
+                                      while_cond->parameter_instruction(0)}) {
+    for (const HloInstruction* user : instr->users()) {
+      if (user->opcode() != HloOpcode::kGetTupleElement) {
+        VLOG(2) << "Cowardly refusing to analyze while loop with "
+                << instr->ToStringNoMetadata()
+                << " used by non-GTE instruction " << user->ToStringNoMetadata()
+                << " in computation " << instr->parent()->name();
+        return false;
+      }
+    }
+  }
+
+  const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape());
+  if (tuple_size == 0) {
+    VLOG(2) << "Can't remove elements from while loop's tuple -- it's already "
+               "empty.";
+    return false;
+  }
+
+  tensorflow::gtl::FlatSet<int64> used_tuple_indices;
+  for (HloComputation* comp : {while_body, while_cond}) {
+    // The HLO verifier ensures that while_input's shape matches while_init's
+    // shape, which we verified above is a tuple.
+    HloInstruction* while_input = comp->parameter_instruction(0);
+
+    for (const HloInstruction* user : while_input->users()) {
+      // This user doesn't count if it's only used by the while body's root, and
+      // the root places the tuple element into the same index of the tuple as
+      // it came from.  That just amounts to us carrying the variable through
+      // the loop.
+      //
+      // Careful: HloInstruction::operand_index returns the first index the
+      // operand appears in, but it may appear more than once!
+      if (user->user_count() == 1 && user->users().front() == while_body_root &&
+          while_body_root->operand_index(user) == user->tuple_index() &&
+          std::count(while_body_root->operands().begin(),
+                     while_body_root->operands().end(), user) == 1) {
+        continue;
+      }
+
+      used_tuple_indices.insert(user->tuple_index());
+      if (used_tuple_indices.size() == tuple_size) {
+        VLOG(2) << "Loop " << while_op->ToStringNoMetadata()
+                << " uses all of its inputs; no simplification possible.";
+        return false;
+      }
+    }
+  }
+
+  // If a tuple element is not passed unmodified from the while body's param0
+  // through to the while body's root, count that element as "used", since
+  // removing that element would be observable.
+  for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
+    if (used_tuple_indices.count(i)) {
+      continue;
+    }
+
+    auto* operand = while_body_root->operand(i);
+    if (operand->opcode() != HloOpcode::kGetTupleElement ||
+        operand->operand(0) != while_body->parameter_instruction(0) ||
+        operand->tuple_index() != i) {
+      VLOG(2) << "Tuple index " << i
+              << " is not passed through loop body unmodified.";
+      used_tuple_indices.insert(i);
+
+      if (used_tuple_indices.size() == tuple_size) {
+        VLOG(2) << "Loop " << while_op->ToStringNoMetadata()
+                << " uses all of its inputs; no simplification possible.";
+        return false;
+      }
+    }
+  }
+
+  // If we got here, used_tuple_indices.size() < tuple_size, meaning some
+  // elements of the loop's tuple aren't used by while_body or while_cond.
+  CHECK_LT(used_tuple_indices.size(), tuple_size);
+
+  VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size()
+          << " elements from tuple of " << while_op->ToStringNoMetadata();
+
+  // Build up maps from the old/new to the new/old tuple indices.
+  std::vector<int64> new_to_old_tuple_idx(used_tuple_indices.begin(),
+                                          used_tuple_indices.end());
+  std::sort(new_to_old_tuple_idx.begin(), new_to_old_tuple_idx.end());
+
+  tensorflow::gtl::FlatMap<int64, int64> old_to_new_tuple_idx;
+  for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) {
+    int64 old_idx = new_to_old_tuple_idx[new_idx];
+    old_to_new_tuple_idx[old_idx] = new_idx;
+    VLOG(2) << "Remapping tuple index " << old_idx << " to " << new_idx;
+  }
+
+  // Compute the shape of the while op after we remove the dead indices.
+  std::vector<Shape> new_while_tuple_elem_shapes;
+  new_while_tuple_elem_shapes.reserve(new_to_old_tuple_idx.size());
+  for (int64 old_idx : new_to_old_tuple_idx) {
+    new_while_tuple_elem_shapes.push_back(
+        while_init->shape().tuple_shapes(old_idx));
+  }
+  Shape new_while_shape =
+      ShapeUtil::MakeTupleShape(new_while_tuple_elem_shapes);
+
+  // Returns a map from elements in the computation to new instructions which
+  // replace the old instructions after we remove unused elements from the while
+  // tuple.
+  auto make_while_computation_replacements = [&](const HloComputation* comp) {
+    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+        replacements;
+
+    auto* param = comp->parameter_instruction(0);
+    replacements.emplace(param, HloInstruction::CreateParameter(
+                                    0, new_while_shape, param->name()));
+
+    // Materialize param's users, since we're about to add new ones below.
+    std::vector<HloInstruction*> materialized_users(param->users().begin(),
+                                                    param->users().end());
+    for (const auto* user : materialized_users) {
+      // The while body root is handled separately.
+      if (user == while_body_root) {
+        continue;
+      }
+      CHECK_EQ(user->opcode(), HloOpcode::kGetTupleElement)
+          << user->ToStringNoMetadata();
+
+      int64 old_idx = user->tuple_index();
+      auto new_idx_iter = old_to_new_tuple_idx.find(old_idx);
+      if (new_idx_iter != old_to_new_tuple_idx.end()) {
+        // This is a GTE of an index that survives.  Replace it.
+        replacements.emplace(
+            user, HloInstruction::CreateGetTupleElement(user->shape(), param,
+                                                        new_idx_iter->second));
+      } else {
+        // This is a GTE of an index that we've removed.  Remove it from the
+        // cloned computation.
+        CHECK(user->user_count() == 0 ||
+              user->user_count() == 1 &&
+                  user->users().front() == while_body_root)
+            << "Instruction " << user->ToStringNoMetadata()
+            << " should be unused (except by root of while body), but has "
+               "users: {"
+            << tensorflow::str_util::Join(
+                   user->users(), ", ",
+                   [](string* out, const HloInstruction* instr) {
+                     tensorflow::strings::StrAppend(
+                         out, instr->ToStringNoMetadata());
+                   })
+            << "}";
+
+        replacements.emplace(user, nullptr);
+      }
+    }
+    return replacements;
+  };
+
+  // Create the new while condition, body, and init value.
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacements(
+          make_while_computation_replacements(while_cond));
+
+  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      while_body_replacements = make_while_computation_replacements(while_body);
+  std::vector<HloInstruction*> new_while_body_root_elems;
+  new_while_body_root_elems.reserve(new_to_old_tuple_idx.size());
+  for (int64 old_idx : new_to_old_tuple_idx) {
+    new_while_body_root_elems.push_back(
+        while_body_root->mutable_operand(old_idx));
+  }
+  while_body_replacements.emplace(
+      while_body_root, HloInstruction::CreateTuple(new_while_body_root_elems));
+  std::unique_ptr<HloComputation> new_while_body =
+      while_body->CloneWithReplacements(std::move(while_body_replacements));
+
+  // Add a new while_init instruction that repackages the old while_init
+  // instruction's elements.  We rely on the AlgebraicSimplifier and DCE to
+  // clean this up in the common case where while_init is a tuple op.  (It's
+  // definitely tuple-shaped, but it's not necessarily a tuple op.)
+  std::vector<HloInstruction*> new_while_init_elems;
+  new_while_init_elems.reserve(new_to_old_tuple_idx.size());
+  for (int64 old_idx : new_to_old_tuple_idx) {
+    new_while_init_elems.push_back(
+        computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+            while_init->shape().tuple_shapes(old_idx), while_init, old_idx)));
+  }
+  auto* new_while_init = computation->AddInstruction(
+      HloInstruction::CreateTuple(new_while_init_elems));
+
+  // Create the new while op.
+  auto* new_while_op = computation->AddInstruction(HloInstruction::CreateWhile(
+      new_while_shape,
+      module->AddEmbeddedComputation(std::move(new_while_cond)),
+      module->AddEmbeddedComputation(std::move(new_while_body)),
+      new_while_init));
+
+  // Create a tuple op that recreates the output of the old while op.  That is,
+  // we transform to
+  //
+  //  new_while_init   while_init
+  //       |              |
+  //       V              |
+  //   new_while          |
+  //       |              |
+  //       -------|   |----
+  //              V   V
+  //            new_tuple
+  //                |
+  //                V
+  //    (orig. users of while op)
+  //
+  // The tuple simplifier will then simplify this if possible, removing
+  // new_tuple and while_init.
+  std::vector<HloInstruction*> new_tuple_elems;
+  for (int64 old_idx = 0; old_idx < tuple_size; ++old_idx) {
+    auto new_tuple_idx_it = old_to_new_tuple_idx.find(old_idx);
+    if (new_tuple_idx_it != old_to_new_tuple_idx.end()) {
+      int64 gte_idx = new_tuple_idx_it->second;
+      new_tuple_elems.push_back(
+          computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+              new_while_op->shape().tuple_shapes(gte_idx), new_while_op,
+              gte_idx)));
+    } else {
+      new_tuple_elems.push_back(
+          computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+              while_init->shape().tuple_shapes(old_idx), while_init, old_idx)));
+    }
+  }
+  HloInstruction* new_tuple =
+      computation->AddInstruction(HloInstruction::CreateTuple(new_tuple_elems));
+  TF_RETURN_IF_ERROR(while_op->ReplaceAllUsesWith(new_tuple));
+
+  return true;
+}
+
+// Tries to remove a while loop from the graph.
+//
+//  - Loops with trip count of 0 can be replaced by the loop's "init" value.
+//  - Loops with trip count of 1 can be replaced by the loop's body, with the
+//    loop itself removed.
+//
+// Returns true if it made a change to the graph.
+static StatusOr<bool> TryRemoveWhileLoop(HloInstruction* while_op) {
+  // Cowardly refuse to remove loops that are not removable.  In practice,
+  // this means that we can't remove loops that contain side-effecting
+  // instructions or have control predecessors/successors.
+  //
+  // This is not a fundamental limitation.  The control operands can be moved
+  // onto the new HLOs after simplification, and any side-effecting ops inside
+  // the loop aren't removed, just cloned and added back to the loop.
+  // Nevertheless our infrastructure sees loop simplification as removal of
+  // these nodes and currently doesn't allow it.
+  if (!while_op->parent()->IsRemovable(while_op) || while_op->HasSideEffect()) {
+    VLOG(2) << "Not attempting to remove while loop it is not removable: "
+            << while_op->ToShortString();
+    return false;
+  }
+
+  // Remove while loops with static trip count of 0.
+  optional<int64> trip_count = GetLoopTripCount(while_op);
+  if (trip_count && *trip_count == 0) {
+    // The loop never executes, so the value of the loop is the value of its
+    // "init" operand.
+    auto computation = while_op->parent();
+
+    // Remove while_op (i.e., call ReplaceInstruction rather than
+    // ReplaceUsesWithInstruction) so that if the algebraic simplifier is run in
+    // a loop without an intervening DCE, we don't try to re-remove the loop.
+    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(
+        while_op, while_op->mutable_operand(0)));
+    return true;
+  }
+
+  // Transform while loops with static trip count of 1 into a call op, then
+  // inline the call.
+  if (trip_count && *trip_count == 1) {
+    auto computation = while_op->parent();
+    auto call_op = computation->AddInstruction(HloInstruction::CreateCall(
+        while_op->shape(), while_op->operands(), while_op->while_body()));
+    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, call_op));
+    TF_RETURN_IF_ERROR(CallInliner::Inline(call_op));
+    return true;
+  }
+  return false;
+}
+
+StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
+  XLA_VLOG_LINES(3,
+                 "WhileLoopSimplifier::Run(), before:\n" + module->ToString());
+  bool changed = false;
+
+  // Gather all the while ops in our module.  We do this ahead of time so we
+  // don't have to worry about mutating the lists of computations or
+  // instructions while we iterate.
+  std::vector<HloInstruction*> while_ops;
+  for (auto* comp : module->computations()) {
+    for (auto* instr : comp->instructions()) {
+      if (instr->opcode() == HloOpcode::kWhile) {
+        while_ops.push_back(instr);
+      }
+    }
+  }
+
+  for (HloInstruction* while_op : while_ops) {
+    // We can't remove while loops that contain send/recv nodes, because we rely
+    // on the particular loop structure around the node matching on the send and
+    // recv sides.  Removing dead while params requires us to remove the loop
+    // and replace it with a new one, so we can't do that either.
+    if (ContainsSendOrRecv(while_op->while_body()) ||
+        ContainsSendOrRecv(while_op->while_condition())) {
+      VLOG(2) << "Not attempting to simplify while loop because it contains a "
+                 "send/recv node: "
+              << while_op->ToShortString();
+      continue;
+    }
+
+    StatusOr<bool> result = TryRemoveWhileLoop(while_op);
+    TF_RETURN_IF_ERROR(result.status());
+    if (result.ValueOrDie()) {
+      changed = true;
+      // Don't try to remove dead while params after successfully removing the
+      // while loop -- that would result in use-after-free nastiness.
+      continue;
+    }
+
+    result = TryRemoveDeadWhileParams(while_op);
+    TF_RETURN_IF_ERROR(result.status());
+    changed |= result.ValueOrDie();
+  }
+
+  XLA_VLOG_LINES(3,
+                 "WhileLoopSimplifier::Run(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.h b/tensorflow/compiler/xla/service/while_loop_simplifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..50dac32a4ab0a5de756c1ddf5e62c3560e54a079
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.h
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass that makes the following transformations on while loops:
+//
+//  - A while loop with static trip count of 0 is deleted.
+//  - A while loops with static trip count of 1 is replaced by its body (sans
+//    loop).
+//  - Elements of a while loop's tuple that the loop doesn't use are removed
+//    from the tuple.
+//
+class WhileLoopSimplifier : public HloPassInterface {
+ public:
+  ~WhileLoopSimplifier() override {}
+  tensorflow::StringPiece name() const override {
+    return "simplify-while-loops";
+  }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d99b31dc0037968bc88d5f22d53309a6a4546963
--- /dev/null
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -0,0 +1,422 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class WhileLoopSimplifierTest : public HloVerifiedTestBase {
+ public:
+  // Makes a computation that contains a loop that runs num_iters times.
+  HloComputation* MakeSimpleLoop(int num_iters, HloModule* module);
+
+  // Makes a computation which has one parameter, of the given shape, and always
+  // returns PRED[]{true}.  This is useful as a dummy loop condition.
+  HloComputation* MakeAlwaysTrueComputation(const Shape& param_shape,
+                                            HloModule* module);
+};
+
+HloComputation* WhileLoopSimplifierTest::MakeSimpleLoop(int num_iters,
+                                                        HloModule* module) {
+  HloComputation::Builder builder(TestName());
+
+  auto loop_iter_init = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
+  auto loop_data_init = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({0, 1, 2})));
+  auto loop_init = builder.AddInstruction(
+      HloInstruction::CreateTuple({loop_iter_init, loop_data_init}));
+
+  HloComputation* condition;
+  {
+    HloComputation::Builder cond_builder(TestName() + ".condition");
+    auto loop_var = cond_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    auto loop_induction_var =
+        cond_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+            ShapeUtil::MakeShape(S32, {}), loop_var, 0));
+    auto limit = cond_builder.AddInstruction(HloInstruction::CreateConstant(
+        Literal::CreateR0<int32>(42 + num_iters)));
+    cond_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, loop_induction_var,
+        limit));
+    condition = module->AddEmbeddedComputation(cond_builder.Build());
+  }
+
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    auto loop_var = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    auto loop_induction_var =
+        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+            ShapeUtil::MakeShape(S32, {}), loop_var, 0));
+    auto new_loop_induction_var =
+        body_builder.AddInstruction(HloInstruction::CreateBinary(
+            loop_induction_var->shape(), HloOpcode::kAdd, loop_induction_var,
+            body_builder.AddInstruction(
+                HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)))));
+    auto loop_data =
+        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+            loop_data_init->shape(), loop_var, 1));
+    auto new_loop_data =
+        body_builder.AddInstruction(HloInstruction::CreateBinary(
+            loop_data_init->shape(), HloOpcode::kMultiply, loop_data,
+            loop_data));
+    body_builder.AddInstruction(
+        HloInstruction::CreateTuple({new_loop_induction_var, new_loop_data}));
+    body = module->AddEmbeddedComputation(body_builder.Build());
+  }
+
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+
+  return module->AddEntryComputation(builder.Build());
+}
+
+HloComputation* WhileLoopSimplifierTest::MakeAlwaysTrueComputation(
+    const Shape& param_shape, HloModule* module) {
+  HloComputation::Builder builder(TestName() + ".always_true");
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(0, param_shape, "param"));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  return module->AddEmbeddedComputation(builder.Build());
+}
+
+TEST_F(WhileLoopSimplifierTest, WhileLoopWithZeroIterations) {
+  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/0, &module());
+  ASSERT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Tuple(op::Constant(), op::Constant()));
+}
+
+TEST_F(WhileLoopSimplifierTest, WhileLoopWithOneIteration) {
+  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+  ASSERT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction(),
+              op::Tuple(op::Add(), op::Multiply()));
+}
+
+TEST_F(WhileLoopSimplifierTest, WhileLoopWithTwoIterations) {
+  MakeSimpleLoop(/*num_iters=*/2, &module());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(WhileLoopSimplifierTest, WhileLoopWithControlDependency) {
+  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+  auto* while_op = computation->root_instruction();
+  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
+  auto* true_op = while_op->while_body()->AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<bool>(true)));
+  TF_ASSERT_OK(true_op->AddControlDependencyTo(
+      while_op->while_body()->root_instruction()));
+  ASSERT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+  EXPECT_THAT(computation->root_instruction()->control_predecessors(),
+              ElementsAre(op::Constant()))
+      << computation->ToString();
+}
+
+// Loops that contain send/recv nodes can't be simplified; the loop structure
+// around send/recv nodes must be preserved.
+TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsSend) {
+  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+  auto* while_op = computation->root_instruction();
+  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
+  auto* while_body = while_op->while_body();
+  auto* send = while_body->AddInstruction(HloInstruction::CreateSend(
+      while_body->AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<bool>(true))),
+      /*channel_id=*/0));
+  while_body->AddInstruction(HloInstruction::CreateSendDone(send));
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsRecv) {
+  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+  auto* while_op = computation->root_instruction();
+  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
+  auto* while_body = while_op->while_body();
+  auto* recv = while_body->AddInstruction(
+      HloInstruction::CreateRecv(ShapeUtil::MakeShape(F32, {1}),
+                                 /*channel_id=*/0));
+  while_body->AddInstruction(HloInstruction::CreateRecvDone(recv));
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// The limitation on not being able to simplify loops that contain infeeds (and
+// other non-removable instructions) isn't fundamental -- it just stems from the
+// fact that our infrastructure sees simplifying such a loop as tantamount to
+// removing the non-removable instruction.
+TEST_F(WhileLoopSimplifierTest, NotRemovedIfContainsNonRemovableInstruction) {
+  HloComputation* computation = MakeSimpleLoop(/*num_iters=*/1, &module());
+  auto* while_op = computation->root_instruction();
+  ASSERT_EQ(while_op->opcode(), HloOpcode::kWhile);
+  auto* while_body = while_op->while_body();
+  while_body->AddInstruction(
+      HloInstruction::CreateInfeed(ShapeUtil::MakeShape(F32, {1}), "config"));
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// Check that we don't crash when given a loop whose shape is not a tuple.
+TEST_F(WhileLoopSimplifierTest, IgnoreNonTupleShapedLoop) {
+  HloComputation::Builder builder(TestName());
+  auto loop_init = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
+
+  HloComputation* condition;
+  {
+    HloComputation::Builder cond_builder(TestName() + ".condition");
+    auto param = cond_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    cond_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param,
+        cond_builder.AddInstruction(
+            HloInstruction::CreateConstant(Literal::CreateR0<int32>(100)))));
+    condition = module().AddEmbeddedComputation(cond_builder.Build());
+  }
+
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    auto param = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    body_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(S32, {}), HloOpcode::kAdd, param,
+        body_builder.AddInstruction(
+            HloInstruction::CreateConstant(Literal::CreateR0<int32>(-1)))));
+    body = module().AddEmbeddedComputation(body_builder.Build());
+  }
+
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+
+  module().AddEntryComputation(builder.Build());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// Construct a loop where we swap the tuple elements in each iteration.
+// Although the tuple elements aren't used in the loop, we don't eliminate them,
+// because the swapping side-effect is visible to users of the loop.
+TEST_F(WhileLoopSimplifierTest, SwapTupleIndices) {
+  HloComputation::Builder builder(TestName());
+  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(1))),
+  }));
+
+  HloComputation* condition =
+      MakeAlwaysTrueComputation(loop_init->shape(), &module());
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    auto param = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+    body_builder.AddInstruction(HloInstruction::CreateTuple({
+        body_builder.AddInstruction(
+            HloInstruction::CreateGetTupleElement(scalar_s32, param, 1)),
+        body_builder.AddInstruction(
+            HloInstruction::CreateGetTupleElement(scalar_s32, param, 0)),
+    }));
+    body = module().AddEmbeddedComputation(body_builder.Build());
+  }
+
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+
+  module().AddEntryComputation(builder.Build());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// Construct a loop where we assign a constant to tuple element 0 in each
+// iteration.  We can't eliminate tuple element 0, even though we never use its
+// value.
+TEST_F(WhileLoopSimplifierTest, UnusedButModifiedTupleElement) {
+  HloComputation::Builder builder(TestName());
+  auto loop_init = builder.AddInstruction(
+      HloInstruction::CreateTuple({builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0)))}));
+
+  HloComputation* condition =
+      MakeAlwaysTrueComputation(loop_init->shape(), &module());
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    body_builder.AddInstruction(HloInstruction::CreateTuple({
+        body_builder.AddInstruction(
+            HloInstruction::CreateConstant(Literal::CreateR0<int32>(1))),
+    }));
+    body = module().AddEmbeddedComputation(body_builder.Build());
+  }
+
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+
+  module().AddEntryComputation(builder.Build());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// Nothing to simplify in a while loop whose tuple has 0 elements.
+TEST_F(WhileLoopSimplifierTest, EmptyTuple) {
+  HloComputation::Builder builder(TestName());
+  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({}));
+
+  HloComputation* condition =
+      MakeAlwaysTrueComputation(loop_init->shape(), &module());
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "loop_var"));
+    body_builder.AddInstruction(HloInstruction::CreateTuple({}));
+    body = module().AddEmbeddedComputation(body_builder.Build());
+  }
+
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+  module().AddEntryComputation(builder.Build());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// While loop where one tuple element is used twice in the body, and thus can't
+// be simplified away.
+TEST_F(WhileLoopSimplifierTest, ElemUsedTwice) {
+  HloComputation::Builder builder(TestName());
+  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(1))),
+  }));
+
+  HloComputation* condition =
+      MakeAlwaysTrueComputation(loop_init->shape(), &module());
+
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    auto* param = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_init->shape(), "param0"));
+    auto* gte0 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, /*index=*/0));
+    // get0 is used twice in the loop body's tuple.
+    body_builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte0}));
+    body = module().AddEmbeddedComputation(body_builder.Build());
+  }
+
+  builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+  module().AddEntryComputation(builder.Build());
+  EXPECT_FALSE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+}
+
+// This while loop has three tuple elements.  Element 0 is unused and should be
+// removed. Element 1 is used by the loop body, and element 2 is used by the
+// loop condition; these two should stay.
+TEST_F(WhileLoopSimplifierTest, RemoveUnusedOperand) {
+  HloComputation::Builder builder(TestName());
+  auto loop_init = builder.AddInstruction(HloInstruction::CreateTuple({
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
+  }));
+  auto loop_shape = loop_init->shape();
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+
+  HloComputation* condition;
+  {
+    HloComputation::Builder cond_builder(TestName() + ".loop_condition");
+    auto param = cond_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_shape, "param0"));
+    cond_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq,
+        cond_builder.AddInstruction(
+            HloInstruction::CreateConstant(Literal::CreateR0<int32>(0))),
+        cond_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+            scalar_s32, param, /*index=*/2))));
+    condition = module().AddEmbeddedComputation(cond_builder.Build());
+  }
+
+  HloComputation* body;
+  {
+    HloComputation::Builder body_builder(TestName() + ".body");
+    auto* param = body_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, loop_shape, "loop_var"));
+
+    auto* tuple0 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, /*index=*/0));
+    auto* tuple1 = body_builder.AddInstruction(HloInstruction::CreateBinary(
+        scalar_s32, HloOpcode::kAdd,
+        body_builder.AddInstruction(HloInstruction::CreateGetTupleElement(
+            scalar_s32, param, /*index=*/1)),
+        body_builder.AddInstruction(
+            HloInstruction::CreateConstant(Literal::CreateR0<int32>(1)))));
+    auto* tuple2 = body_builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_s32, param, /*index=*/2));
+    body_builder.AddInstruction(
+        HloInstruction::CreateTuple({tuple0, tuple1, tuple2}));
+
+    body = module().AddEmbeddedComputation(body_builder.Build());
+  }
+
+  auto* while_op = builder.AddInstruction(HloInstruction::CreateWhile(
+      loop_init->shape(), condition, body, loop_init));
+
+  module().AddEntryComputation(builder.Build());
+  EXPECT_TRUE(WhileLoopSimplifier().Run(&module()).ValueOrDie());
+
+  // We leave most of the checking to HloVerifiedTestBase, which runs the
+  // verifier on module() at the end of this test.
+  HloInstruction* new_while_op = *std::find_if(
+      module().entry_computation()->instructions().begin(),
+      module().entry_computation()->instructions().end(),
+      [&](const HloInstruction* instr) {
+        return instr != while_op && instr->opcode() == HloOpcode::kWhile;
+      });
+  EXPECT_TRUE(
+      ShapeUtil::Equal(new_while_op->shape(),
+                       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32})))
+      << ShapeUtil::HumanString(new_while_op->shape());
+  EXPECT_THAT(
+      new_while_op->while_body()->root_instruction(),
+      op::Tuple(
+          op::Add(op::GetTupleElement(op::Parameter(0), /*tuple_index=*/0),
+                  op::Constant()),
+          op::GetTupleElement(op::Parameter(0), /*tuple_index=*/1)));
+
+  EXPECT_THAT(new_while_op->while_condition()->root_instruction(),
+              op::Eq(op::Constant(),
+                     op::GetTupleElement(op::Parameter(0), /*tuple_index=*/1)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc
index 5bf9842a6ce7be747f58c10f302f85c6f82ac6f9..789eba5780d37e1fd4d80ec881855951c8bba0eb 100644
--- a/tensorflow/compiler/xla/shape_layout.cc
+++ b/tensorflow/compiler/xla/shape_layout.cc
@@ -32,13 +32,13 @@ tensorflow::Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* other_shape) const {
-  if (!ShapeUtil::Compatible(*other_shape, shape_)) {
+tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const {
+  if (!ShapeUtil::Compatible(*to_shape, shape_)) {
     return InvalidArgument("Shape %s is not compatible with shape %s",
-                           ShapeUtil::HumanString(*other_shape).c_str(),
+                           ShapeUtil::HumanString(*to_shape).c_str(),
                            ShapeUtil::HumanString(shape()).c_str());
   }
-  *other_shape = shape_;
+  *to_shape = shape_;
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h
index 92564660f21bf1b596c4b9ca04c07eaca27ed192..4c83750f3e6f3c735db66d8e0b86ae3f43e5ca11 100644
--- a/tensorflow/compiler/xla/shape_layout.h
+++ b/tensorflow/compiler/xla/shape_layout.h
@@ -38,18 +38,19 @@ class ShapeLayout {
   explicit ShapeLayout(const Shape& shape) : shape_(shape) {}
 
   // Assigns the layouts in this ShapeLayout to the Layout fields of the given
-  // shape. 'shape' and the shape of the ShapeLayout object must be compatible.
-  tensorflow::Status AssignLayoutToShape(Shape* shape) const;
+  // shape. 'to_shape' and the shape of the ShapeLayout object must be
+  // compatible.
+  tensorflow::Status AssignLayoutToShape(Shape* to_shape) const;
 
   // Returns true if the Layouts in this ShapeLayout match the layouts in the
   // given shape. Returns false otherwise. If the given shape is not compatible
   // with the ShapeLayout's shape, then false is returned.
   bool MatchesLayoutInShape(const Shape& shape) const;
 
-  // Copies the layout from the given shape into this ShapeLayout. 'shape' must
-  // be compatible with the ShapeLayout's shape, and 'shape' must have a layout
-  // (LayoutUtil::HasLayout).
-  tensorflow::Status CopyLayoutFromShape(const Shape& shape);
+  // Copies the layout from the given shape into this ShapeLayout. 'other_shape'
+  // must be compatible with the ShapeLayout's shape, and 'other_shape' must
+  // have a layout (LayoutUtil::HasLayout).
+  tensorflow::Status CopyLayoutFromShape(const Shape& other_shape);
 
   // Clears (Layout::Clear) all the Layouts stored in this object.
   void Clear();
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 64a36471b9f1b35517c29c01554e02c5d1035086..d752619bd65751779c24f061e44e206d66b01465 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -116,6 +116,7 @@ class ShapeTree {
   ShapeTree(const Shape* shape, const T& init_value);
 
   ShapeTree(const ShapeTree& other) { *this = other; }
+  ShapeTree(ShapeTree&&) = default;
 
   ShapeTree& operator=(const ShapeTree& other) {
     root_ = other.root_;
@@ -132,6 +133,8 @@ class ShapeTree {
     return *this;
   }
 
+  ShapeTree& operator=(ShapeTree&& other) = default;
+
   // Returns the data element associated with the array in the shape at the
   // given index (see ShapeUtil::GetSubshape for how indexes are defined).
   const T& element(const ShapeIndex& index) const;
@@ -152,28 +155,57 @@ class ShapeTree {
   using const_iterator = ShapeTreeIterator<T, /*is_const=*/true>;
 
   // begin/end for iterating over all nodes.
-  iterator begin() { return iterator(&root_, /*iterate_leaves_only=*/false); }
-  iterator end() { return iterator(nullptr, /*iterate_leaves_only=*/false); }
+  iterator begin() {
+    return iterator(&root_, /*iterate_leaves_only=*/false,
+                    /*reverse=*/false);
+  }
+  iterator end() {
+    return iterator(nullptr, /*iterate_leaves_only=*/false,
+                    /*reverse=*/false);
+  }
   const_iterator begin() const {
-    return const_iterator(&root_, /*iterate_leaves_only=*/false);
+    return const_iterator(&root_, /*iterate_leaves_only=*/false,
+                          /*reverse=*/false);
   }
   const_iterator end() const {
-    return const_iterator(nullptr, /*iterate_leaves_only=*/false);
+    return const_iterator(nullptr, /*iterate_leaves_only=*/false,
+                          /*reverse=*/false);
+  }
+
+  // rbegin/rend for iterating over all nodes in reverse.
+  iterator rbegin() {
+    return iterator(&root_, /*iterate_leaves_only=*/false,
+                    /*reverse=*/true);
+  }
+  iterator rend() {
+    return iterator(nullptr, /*iterate_leaves_only=*/false,
+                    /*reverse=*/true);
+  }
+  const_iterator rbegin() const {
+    return const_iterator(&root_, /*iterate_leaves_only=*/false,
+                          /*reverse=*/true);
+  }
+  const_iterator rend() const {
+    return const_iterator(nullptr, /*iterate_leaves_only=*/false,
+                          /*reverse=*/true);
   }
 
   // leaf_begin()/leaf_end() iterates over all leaf nodes (nodes with no
   // children).
   iterator leaf_begin() {
-    return iterator(&root_, /*iterate_leaves_only=*/true);
+    return iterator(&root_, /*iterate_leaves_only=*/true, /*reverse=*/false);
   }
   iterator leaf_end() {
-    return iterator(nullptr, /*iterate_leaves_only=*/true);
+    return iterator(nullptr, /*iterate_leaves_only=*/true,
+                    /*reverse=*/false);
   }
   const_iterator leaf_begin() const {
-    return const_iterator(&root_, /*iterate_leaves_only=*/true);
+    return const_iterator(&root_, /*iterate_leaves_only=*/true,
+                          /*reverse=*/false);
   }
   const_iterator leaf_end() const {
-    return const_iterator(nullptr, /*iterate_leaves_only=*/true);
+    return const_iterator(nullptr, /*iterate_leaves_only=*/true,
+                          /*reverse=*/false);
   }
   // range-based iterator for leaf_begin()/leaf_end().
   tensorflow::gtl::iterator_range<iterator> leaves() {
@@ -183,6 +215,22 @@ class ShapeTree {
     return tensorflow::gtl::make_range(leaf_begin(), leaf_end());
   }
 
+  iterator leaf_rbegin() {
+    return iterator(&root_, /*iterate_leaves_only=*/true, /*reverse=*/true);
+  }
+  iterator leaf_rend() {
+    return iterator(nullptr, /*iterate_leaves_only=*/true,
+                    /*reverse=*/true);
+  }
+  const_iterator leaf_rbegin() const {
+    return const_iterator(&root_, /*iterate_leaves_only=*/true,
+                          /*reverse=*/true);
+  }
+  const_iterator leaf_rend() const {
+    return const_iterator(nullptr, /*iterate_leaves_only=*/true,
+                          /*reverse=*/true);
+  }
+
   // Recursively traverses the shape and calls the given function at each
   // element. The function has the following arguments:
   //
@@ -190,7 +238,7 @@ class ShapeTree {
   //           (or compatible).
   //   index : the index of the element in the shape. See ShapeUtil::GetSubshape
   //           for definition of index.
-  //   data : The data value at this elemnt.
+  //   data : The data value at this element.
   template <typename Fn>
   void ForEachElement(const Fn& func) const;
 
@@ -277,42 +325,61 @@ class ShapeTreeIterator : public std::iterator<std::forward_iterator_tag,
   // Construct an iterator pointing at node. Node must either be the tree root
   // or nullptr (which is equivalent to end() and should not be dereferenced or
   // incremented). If iterate_leaves_only is true, the iterator will not include
-  // interior tree nodes, only leaves.
-  ShapeTreeIterator(NodeType* node, bool iterate_leaves_only)
-      : node_(node), iterate_leaves_only_(iterate_leaves_only) {
-    if (node_ && !node_->children.empty() && iterate_leaves_only) {
-      ++*this;
+  // interior tree nodes, only leaves. If reverse is true, the iterator will
+  // visit nodes in the reverse of pre-order traversal.
+  ShapeTreeIterator(NodeType* node, bool iterate_leaves_only, bool reverse)
+      : node_(node),
+        iterate_leaves_only_(iterate_leaves_only),
+        reverse_(reverse) {
+    if (node_) {
+      if (reverse_) {
+        while (!node_->children.empty()) {
+          const int child_index = node_->children.size() - 1;
+          stack_.push_back({node_, child_index});
+          node_ = node_->children[child_index].get();
+        }
+      } else {
+        if (!node_->children.empty() && iterate_leaves_only) {
+          ++*this;
+        }
+      }
     }
   }
   ShapeTreeIterator(const ShapeTreeIterator& other)
       : node_(other.node_),
         stack_(other.stack_),
-        iterate_leaves_only_(other.iterate_leaves_only_) {}
+        iterate_leaves_only_(other.iterate_leaves_only_),
+        reverse_(other.reverse_) {}
 
   ShapeTreeIterator& operator++() {
     CHECK_NE(nullptr, node_) << "walking off the end() of an iterator!";
-    // We're doing a pre-order walk, so if our current node has children take
-    // the first child.
-    if (!node_->children.empty()) {
-      stack_.push_back({node_, /*child-index=*/0});
-      node_ = node_->children[0].get();
-      if (node_->children.empty() || !iterate_leaves_only_) {
-        return *this;
-      } else {
-        // This is a non-leaf; tail-recurse.
-        return ++(*this);
+    if (reverse_) {
+      while (!stack_.empty()) {
+        node_ = stack_.back().first;
+        int64 next_child_index = stack_.back().second - 1;
+        stack_.pop_back();
+        if (next_child_index < 0) {
+          if (!iterate_leaves_only_) {
+            // All children are visited, yield <node_>.
+            return *this;
+          }
+        } else {
+          stack_.push_back({node_, next_child_index});
+          node_ = node_->children[next_child_index].get();
+          while (!node_->children.empty()) {
+            const int child_index = node_->children.size() - 1;
+            stack_.push_back({node_, child_index});
+            node_ = node_->children[child_index].get();
+          }
+          return *this;
+        }
       }
-    }
-    // Otherwise we are currently at a leaf. Walk back up until a node contains
-    // a child we haven't visited yet.
-    while (!stack_.empty()) {
-      node_ = stack_.back().first;
-      int64 next_child_index = stack_.back().second + 1;
-      stack_.pop_back();
-      if (node_->children.size() > next_child_index) {
-        stack_.push_back({node_, next_child_index});
-        node_ = node_->children[next_child_index].get();
-
+    } else {
+      // We're doing a pre-order walk, so if our current node has children take
+      // the first child.
+      if (!node_->children.empty()) {
+        stack_.push_back({node_, /*child-index=*/0});
+        node_ = node_->children[0].get();
         if (node_->children.empty() || !iterate_leaves_only_) {
           return *this;
         } else {
@@ -320,6 +387,24 @@ class ShapeTreeIterator : public std::iterator<std::forward_iterator_tag,
           return ++(*this);
         }
       }
+      // Otherwise we are currently at a leaf. Walk back up until a node
+      // contains a child we haven't visited yet.
+      while (!stack_.empty()) {
+        node_ = stack_.back().first;
+        int64 next_child_index = stack_.back().second + 1;
+        stack_.pop_back();
+        if (node_->children.size() > next_child_index) {
+          stack_.push_back({node_, next_child_index});
+          node_ = node_->children[next_child_index].get();
+
+          if (node_->children.empty() || !iterate_leaves_only_) {
+            return *this;
+          } else {
+            // This is a non-leaf; tail-recurse.
+            return ++(*this);
+          }
+        }
+      }
     }
     // We've walked off the end of the tree. Set node_ to nullptr to signify
     // end().
@@ -361,6 +446,8 @@ class ShapeTreeIterator : public std::iterator<std::forward_iterator_tag,
   std::vector<std::pair<NodeType*, int64>> stack_;
   // True if we should not include interior nodes in our walk.
   bool iterate_leaves_only_;
+  // True if we should yield the reverse of the pre-order traversal.
+  bool reverse_;
   // Placeholder for the current value. Ideally this wouldn't exist and would
   // just be an rvalue, but operator -> needs to return a pointer to something.
   // We cannot just use a plain old value_type as it contains a reference so
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index 7b4b5cb0fb5e1564ca12ac6e3b901e94ea4c8db6..4b6ab772811f4a6c6ffc1d10befc7122f883b8f9 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -456,6 +456,26 @@ TEST_F(ShapeTreeTest, IterateOrder) {
                                         {2, 1}}));
 }
 
+TEST_F(ShapeTreeTest, ReverseIterateOrder) {
+  ShapeTree<int> t(nested_tuple_shape_, 42);
+  std::vector<ShapeIndex> v;
+  for (auto it = t.rbegin(); it != t.rend(); ++it) {
+    v.push_back(it->first);
+  }
+  EXPECT_EQ(v, (std::vector<ShapeIndex>{
+                   {2, 1},
+                   {2, 0, 1},
+                   {2, 0, 0},
+                   {2, 0},
+                   {2},
+                   {1, 1},
+                   {1, 0},
+                   {1},
+                   {0},
+                   {},
+               }));
+}
+
 TEST_F(ShapeTreeTest, IterateOrderLeaves) {
   ShapeTree<int> t(nested_tuple_shape_, 42);
   std::vector<ShapeIndex> v;
@@ -466,5 +486,21 @@ TEST_F(ShapeTreeTest, IterateOrderLeaves) {
                    {0}, {1, 0}, {1, 1}, {2, 0, 0}, {2, 0, 1}, {2, 1}}));
 }
 
+TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) {
+  ShapeTree<int> t(nested_tuple_shape_, 42);
+  std::vector<ShapeIndex> v;
+  for (auto it = t.leaf_rbegin(); it != t.leaf_rend(); ++it) {
+    v.push_back(it->first);
+  }
+  EXPECT_EQ(v, (std::vector<ShapeIndex>{
+                   {2, 1},
+                   {2, 0, 1},
+                   {2, 0, 0},
+                   {1, 1},
+                   {1, 0},
+                   {0},
+               }));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index b5eb81dfc6a4117909dcb18fdbe61443b1a1eb95..fe5166643df573ab8cbbea56ac791bccf5b7a4a8 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <functional>
 #include <numeric>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -263,6 +264,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
     case S32:
     case S64:
     case F16:
+    case BF16:
     case F32:
     case F64:
       return true;
@@ -328,6 +330,14 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return MakeTupleShape(new_elements);
 }
 
+// Returns the shape of a real or imaginary component.
+/* static */ Shape ShapeUtil::ComplexComponentShape(
+    const Shape& complex_shape) {
+  CHECK(ElementIsComplex(complex_shape)) << HumanString(complex_shape);
+  return ChangeElementType(complex_shape, primitive_util::ComplexComponentType(
+                                              complex_shape.element_type()));
+}
+
 /* static */ bool ShapeUtil::ShapeIs(const Shape& shape,
                                      PrimitiveType element_type,
                                      std::initializer_list<int64> dimensions) {
@@ -395,6 +405,26 @@ const string& LowercasePrimitiveTypeName(PrimitiveType s) {
   static PrimitiveTypeNameGenerator* gen = new PrimitiveTypeNameGenerator();
   return gen->LowercaseName(s);
 }
+
+StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
+  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
+    static auto* map = new std::unordered_map<string, PrimitiveType>;
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i)) {
+        auto value = static_cast<PrimitiveType>(i);
+        (*map)[LowercasePrimitiveTypeName(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = name_to_type->find(name);
+  if (found == name_to_type->end()) {
+    return InvalidArgument("Invalid element type string: \"%s\".",
+                           name.c_str());
+  }
+  return found->second;
+}
+
 }  // namespace
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
@@ -499,17 +529,10 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                         comma_list_to_int64s(dimensions_string));
 
     // Extract the primitive element type.
-    PrimitiveType primitive_type = PRIMITIVE_TYPE_INVALID;
-    for (PrimitiveType i =
-             static_cast<PrimitiveType>(PRIMITIVE_TYPE_INVALID + 1);
-         i < TUPLE; i = static_cast<PrimitiveType>(i + 1)) {
-      if (tensorflow::str_util::Lowercase(PrimitiveType_Name(i)) ==
-          element_type_string) {
-        primitive_type = i;
-        break;
-      }
-    }
-    if (primitive_type == PRIMITIVE_TYPE_INVALID) {
+    TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type,
+                        StringToPrimitiveType(element_type_string));
+    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE ||
+        primitive_type == OPAQUE) {
       return InvalidArgument("Invalid element type string: \"%s\".",
                              element_type_string.c_str());
     }
@@ -552,6 +575,16 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return SameDimensions(lhs, rhs) && SameElementType(lhs, rhs);
 }
 
+/* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
+                                                           const Shape& rhs) {
+  if (lhs.element_type() == TUPLE) {
+    return rhs.element_type() == TUPLE &&
+           ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
+                           CompatibleIgnoringElementType);
+  }
+  return SameDimensions(lhs, rhs);
+}
+
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
                                            int64 dimension_number) {
   return shape.dimensions(GetDimensionNumber(shape, dimension_number));
@@ -591,6 +624,8 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
       return sizeof(uint32);
     case U64:
       return sizeof(uint64);
+    case BF16:
+      return sizeof(float) / 2;
     case F16:
       return sizeof(float) / 2;
     case F32:
@@ -681,9 +716,9 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   return LayoutUtil::ValidateLayoutInShape(shape);
 }
 
-/* static */ Shape ShapeUtil::ChangeElementType(const Shape& shape,
+/* static */ Shape ShapeUtil::ChangeElementType(const Shape& original,
                                                 PrimitiveType type) {
-  Shape new_shape = shape;
+  Shape new_shape = original;
   new_shape.set_element_type(type);
   return new_shape;
 }
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 8f8d4a73c9ecb3f4236f3877323ad1127bb0b9c2..666c7da697c7cbad4dc30a7b3feb2b2804562442 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -68,6 +68,9 @@ class ShapeIndex {
 
   const int64* data() const { return indices_.data(); }
 
+  int64 back() const { return indices_.back(); }
+  int64& back() { return indices_.back(); }
+
   const int64& operator[](size_t i) const { return indices_[i]; }
   int64& operator[](size_t i) { return indices_[i]; }
 
@@ -167,7 +170,7 @@ class ShapeUtil {
   // As above, but for program shapes, returns a string for the form:
   //
   // (param_name: f32[42x12], ...) -> f32[24x42]
-  static string HumanString(const ProgramShape& shape);
+  static string HumanString(const ProgramShape& program_shape);
 
   // Parses a ShapeUtil::HumanString-format shape string back into a shape
   // object.
@@ -187,6 +190,11 @@ class ShapeUtil {
   // compatibility.
   static bool Compatible(const Shape& lhs, const Shape& rhs);
 
+  // Returns true if the rank and dimension sizes are identical. Element type
+  // and layout are ignored. Tuple elements are compared recursively for
+  // compatibility.
+  static bool CompatibleIgnoringElementType(const Shape& lhs, const Shape& rhs);
+
   // Returns whether the lhs and rhs shapes are identical protobufs.
   static bool Equal(const Shape& lhs, const Shape& rhs);
 
@@ -343,6 +351,10 @@ class ShapeUtil {
   // shape. E.g. a tuple like (f32, s32, u32) would slice via 1,3 to (s32, u32).
   static Shape SliceTuple(const Shape& tuple, int64 start, int64 limit);
 
+  // Returns the shape of the real/imaginary components of the given complex
+  // shape.
+  static Shape ComplexComponentShape(const Shape& complex_shape);
+
   // Shorthand for testing whether a shape is of a given element type and
   // sequence of dimensions.
   //
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 0ba542ad1bec290c35c52a8dd5177893770310fd..4bce7ca51d0534cbcad6faac12818c5f3e94b29e 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -145,6 +145,7 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithSwappedElements) {
   Shape tuple2 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(PRED, {4, 5})});
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(tuple1, tuple2));
 }
 
 TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentPrimitiveType) {
@@ -153,6 +154,7 @@ TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentPrimitiveType) {
   Shape tuple2 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(PRED, {4, 5}), ShapeUtil::MakeShape(S32, {3, 2})});
   EXPECT_FALSE(ShapeUtil::Compatible(tuple1, tuple2));
+  EXPECT_TRUE(ShapeUtil::CompatibleIgnoringElementType(tuple1, tuple2));
 }
 
 TEST(ShapeUtilTest, IncompatibleTuplesWithDifferentDimensions) {
diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc
index 5fa2211ac66177514ac8ecabfa8791e7c8c014a2..f9d25945bc617507735fb6c4d011c39723497f69 100644
--- a/tensorflow/compiler/xla/statusor_test.cc
+++ b/tensorflow/compiler/xla/statusor_test.cc
@@ -32,26 +32,26 @@ namespace {
 class Base1 {
  public:
   virtual ~Base1() {}
-  int pad;
+  int pad_;
 };
 
 class Base2 {
  public:
   virtual ~Base2() {}
-  int yetotherpad;
+  int yetotherpad_;
 };
 
 class Derived : public Base1, public Base2 {
  public:
   ~Derived() override {}
-  int evenmorepad;
+  int evenmorepad_;
 };
 
 class CopyNoAssign {
  public:
-  explicit CopyNoAssign(int value) : foo(value) {}
-  CopyNoAssign(const CopyNoAssign& other) : foo(other.foo) {}
-  int foo;
+  explicit CopyNoAssign(int value) : foo_(value) {}
+  CopyNoAssign(const CopyNoAssign& other) : foo_(other.foo_) {}
+  int foo_;
 
  private:
   const CopyNoAssign& operator=(const CopyNoAssign&);
@@ -253,7 +253,7 @@ TEST(StatusOr, TestCopyCtorNonAssignable) {
   StatusOr<CopyNoAssign> original(value);
   StatusOr<CopyNoAssign> copy(original);
   EXPECT_EQ(copy.status(), original.status());
-  EXPECT_EQ(original.ValueOrDie().foo, copy.ValueOrDie().foo);
+  EXPECT_EQ(original.ValueOrDie().foo_, copy.ValueOrDie().foo_);
 }
 
 TEST(StatusOr, TestCopyCtorStatusOKConverting) {
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 4e1be24b61cc436b0baf62cc6e28ad8d13fe71ac..6af01ae80d9ac8cdf8e7ba5cff4c24ef1d31cf94 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -61,14 +61,19 @@ generate_backend_test_macros()
 
 cc_library(
     name = "test_utils",
-    testonly = True,
+    srcs = ["test_utils.cc"],
     hdrs = ["test_utils.h"],
     deps = [
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_headers_lib",
     ],
 )
 
@@ -100,7 +105,9 @@ cc_library(
     hdrs = ["hlo_test_base.h"],
     deps = [
         ":literal_test_util",
+        ":test_utils",
         "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -110,6 +117,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
@@ -427,6 +437,27 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conditional_test",
+    srcs = ["conditional_test.cc"],
+    # Currently, Conditional is supported only in CPU and GPU backends.
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
@@ -508,6 +539,7 @@ xla_test(
     name = "array_elementwise_ops_test",
     srcs = ["array_elementwise_ops_test.cc"],
     shard_count = 25,
+    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -766,6 +798,41 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "bfloat16_test",
+    srcs = ["bfloat16_test.cc"],
+    blacklisted_backends = [
+        "gpu",
+    ],
+    shard_count = 40,
+    deps = [
+        ":test_utils",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array4d",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:reference_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
@@ -1226,6 +1293,23 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "bitcast_convert_test",
+    srcs = ["bitcast_convert_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "compilation_cache_test",
     srcs = ["compilation_cache_test.cc"],
@@ -1290,6 +1374,7 @@ xla_test(
     srcs = ["client_test.cc"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1343,22 +1428,23 @@ xla_test(
     ],
 )
 
-xla_test(
+tf_cc_test(
     name = "llvm_compiler_test",
     srcs = ["llvm_compiler_test.cc"],
-    backends = [
-        "cpu",
-        "gpu",
-        "cpu_parallel",
-    ],
+    tags = ["requires-gpu-sm35"],
     deps = [
-        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:backend",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/compiler/xla/service:gpu_plugin",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:llvm_compiler",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
+        "//tensorflow/compiler/xla/service/gpu:gpu_compiler",
         "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/stream_executor",
         "@llvm//:core",
     ],
 )
@@ -1596,6 +1682,65 @@ tf_cc_test(
     ],
 )
 
+xla_test(
+    name = "transfer_manager_test",
+    srcs = ["transfer_manager_test.cc"],
+    deps = [
+        ":literal_test_util",
+        ":local_client_test_base",
+        ":xla_internal_test_main",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/compiler/xla/service:generic_transfer_manager",
+        "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+    ],
+)
+
+# A demo of textual IR based test.
+xla_test(
+    name = "sample_text_test",
+    srcs = ["sample_text_test.cc"],
+    # You can leave this empty if you want to test all supported backends.
+    backends = [
+        "cpu",
+        "gpu",
+    ],
+    deps = [
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
+# A demo of test that loads an hlo module from a file and compares results on gpu and cpu.
+tf_cc_test(
+    name = "sample_file_test",
+    srcs = ["sample_file_test.cc"],
+    data = ["isolated_convolution.hlo"],
+    tags = ["requires-gpu-sm35"],
+    deps = [
+        ":hlo_test_base",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/service:cpu_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:gpu_plugin",  # test backend
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 
 filegroup(
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index a62b13e04ff35b06846039d7665dfc8e4205eec2..c6e8b24d1211743d07878d388522feacf9c0e7f1 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -82,6 +82,25 @@ XLA_TEST_F(ArrayElementwiseOpTest, NegConstantS32) {
                              {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, NegConstantZeroElementC64) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>({});
+  auto result = builder.Neg(a);
+
+  ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, NegConstantC64) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>(
+      {{-2.5f, 1.0f}, {0.0f, 3.14f}, {2.25f, -1.0f}, {-10.0f, 0.0f}});
+  auto result = builder.Neg(a);
+
+  ComputeAndCompareR1<complex64>(
+      &builder, {{2.5f, -1.0f}, {0.0f, -3.14f}, {-2.25f, 1.0f}, {10.0f, 0.0f}},
+      {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, IsFiniteZeroElementF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({});
@@ -145,6 +164,28 @@ XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementF32s) {
   ComputeAndCompareR1<float>(&builder, {}, {}, error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>(
+      {{-2.5f, 0.0f}, {0.0f, 3.14f}, {2.25f, 0.0f}, {1.0f, -10.0f}});
+  auto b = builder.ConstantR1<complex64>(
+      {{100.0f, 0.0f}, {3.13f, 0.0f}, {2.75f, 1.0f}, {-2.0f, 10.5f}});
+  auto add = builder.Add(a, b);
+
+  ComputeAndCompareR1<complex64>(
+      &builder, {97.5f, {3.13f, 3.14f}, {5.0f, 1.0f}, {-1.0f, 0.5f}}, {},
+      error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, AddTwoConstantZeroElementC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>({});
+  auto b = builder.ConstantR1<complex64>({});
+  auto add = builder.Add(a, b);
+
+  ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
+}
+
 TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   const int count = GetParam();
   ComputationBuilder builder(client_, TestName());
@@ -222,6 +263,28 @@ XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementS32s) {
   ComputeAndCompareR1<int32>(&builder, {}, {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>(
+      {{-2.5f, 0.0f}, {0.0f, 3.14f}, {3.0f, 2.25f}});
+  auto b = builder.ConstantR1<complex64>(
+      {{0.0f, 10.0f}, {3.13f, 0.0f}, {2.75f, -0.25f}});
+  auto add = builder.Sub(a, b);
+
+  ComputeAndCompareR1<complex64>(
+      &builder, {{-2.5f, -10.0f}, {-3.13f, 3.14f}, {0.25f, 2.5f}}, {},
+      error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantZeroElementC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>({});
+  auto b = builder.ConstantR1<complex64>({});
+  auto add = builder.Sub(a, b);
+
+  ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f});
@@ -385,6 +448,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, DivU32s) {
   }
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>(
+      {{-2.5f, 1.0f}, {-25.5f, 0.0f}, {2.0f, -1.0f}});
+  auto b = builder.ConstantR1<complex64>(
+      {{10.0f, 0.0f}, {0.0f, 1.0f}, {2.0f, -1.0f}});
+  auto div = builder.Div(a, b);
+
+  ComputeAndCompareR1<complex64>(
+      &builder, {{-0.25f, 0.1f}, {0.0f, 25.5f}, {1.0f, 0.0f}}, {}, error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, DivTwoConstantZeroElementC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>({});
+  auto b = builder.ConstantR1<complex64>({});
+  auto div = builder.Div(a, b);
+
+  ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, RemF32s) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<float>(
@@ -496,6 +580,28 @@ XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantU32s) {
   ComputeAndCompareR1<uint32>(&builder, expected, {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>(
+      {{-2.5f, 0.0f}, {0.0f, 25.5f}, {2.0f, -10.0f}});
+  auto b = builder.ConstantR1<complex64>(
+      {{0.0f, 10.0f}, {5.0f, 1.0f}, {10.0f, -6.0f}});
+  auto add = builder.Mul(a, b);
+
+  ComputeAndCompareR1<complex64>(
+      &builder, {{0.0f, -25.0f}, {-25.5f, 127.5f}, {-40.0f, -112.0}}, {},
+      error_spec_);
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, MulTwoConstantZeroElementC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<complex64>({});
+  auto b = builder.ConstantR1<complex64>({});
+  auto add = builder.Mul(a, b);
+
+  ComputeAndCompareR1<complex64>(&builder, {}, {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, AndPredR1) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<bool>({false, false, true, true});
@@ -886,6 +992,53 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementS32s) {
   ComputeAndCompareR1<bool>(&builder, {}, {});
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, CompareEqC64s) {
+  SetFastMathDisabled(true);
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<complex64>({{-2.5f, 10.0f},
+                                            {1.0f, 25.5f},
+                                            {2.25f, -3.0f},
+                                            {NAN, 0.0f},
+                                            {1.0f, 6.0f}});
+  auto rhs = builder.ConstantR1<complex64>({{0.0f, 10.0f},
+                                            {1.0f, 5.0f},
+                                            {2.25f, -3.0f},
+                                            {10.0f, 0.0f},
+                                            {1.0f, NAN}});
+  auto compare = builder.Eq(lhs, rhs);
+
+  ComputeAndCompareR1<bool>(&builder, {false, false, true, false, false}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, CompareEqZeroElementC64s) {
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<complex64>({});
+  auto rhs = builder.ConstantR1<complex64>({});
+  auto compare = builder.Eq(lhs, rhs);
+
+  ComputeAndCompareR1<bool>(&builder, {}, {});
+}
+
+XLA_TEST_F(ArrayElementwiseOpTest, CompareNeC64s) {
+  // Disable fast-math because we're operating on NaNs.
+  SetFastMathDisabled(true);
+
+  ComputationBuilder builder(client_, TestName());
+  auto lhs = builder.ConstantR1<complex64>({{-2.5f, 10.0f},
+                                            {1.0f, 25.5f},
+                                            {2.25f, -3.0f},
+                                            {NAN, 0.0f},
+                                            {1.0f, 6.0f}});
+  auto rhs = builder.ConstantR1<complex64>({{0.0f, 10.0f},
+                                            {1.0f, 5.0f},
+                                            {2.25f, -3.0f},
+                                            {10.0f, 0.0f},
+                                            {1.0f, NAN}});
+  auto compare = builder.Ne(lhs, rhs);
+
+  ComputeAndCompareR1<bool>(&builder, {true, true, false, true, true}, {});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, CompareNeF32s) {
   // Disable fast-math because we're operating on NaNs.
   SetFastMathDisabled(true);
@@ -2027,7 +2180,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ne) {
 
   const string expected = R"(pred[2,2] {
   { 00 },
-  { 01 },
+  { 01 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2041,7 +2194,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Ge) {
 
   const string expected = R"(pred[2,4] {
   { 1100 },
-  { 0001 },
+  { 0001 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2055,7 +2208,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Gt) {
 
   const string expected = R"(pred[2,4] {
   { 0100 },
-  { 0000 },
+  { 0000 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2069,7 +2222,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Le) {
 
   const string expected = R"(pred[2,4] {
   { 1011 },
-  { 1111 },
+  { 1111 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -2083,7 +2236,7 @@ XLA_TEST_F(ArrayElementwiseOpTest, Compare1DTo2DS32Lt) {
 
   const string expected = R"(pred[2,4] {
   { 0011 },
-  { 1110 },
+  { 1110 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac3f3f4c9ddb03d003a44f5abd7a2e26c42f490d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cmath>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/computation.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/reference_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class Bfloat16Test : public ClientLibraryTestBase {
+ protected:
+  const ErrorSpec error_spec_{0.001, 0.001};
+};
+
+XLA_TEST_F(Bfloat16Test, ScalarOperation) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.0f));
+  auto y = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(1.0f));
+  builder.Add(x, y);
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(3.0f), {},
+                                error_spec_);
+}
+
+XLA_TEST_F(Bfloat16Test, LogOperation) {
+  ComputationBuilder builder(client_, TestName());
+  auto x = builder.ConstantR0<bfloat16>(static_cast<bfloat16>(4.0f));
+  builder.Log(x);
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(1.387f), {},
+                                error_spec_);
+}
+
+XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
+  ComputationBuilder builder(client_, TestName());
+  builder.Neg(builder.ConstantR0<bfloat16>(static_cast<bfloat16>(2.1f)));
+
+  ComputeAndCompareR0<bfloat16>(&builder, static_cast<bfloat16>(-2.1f), {},
+                                error_spec_);
+}
+
+XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<bfloat16>(
+      {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
+        {{static_cast<bfloat16>(3.f)}, {static_cast<bfloat16>(4.f)}}},
+       {{{static_cast<bfloat16>(5.f)}, {static_cast<bfloat16>(6.f)}},
+        {{static_cast<bfloat16>(7.f)}, {static_cast<bfloat16>(8.f)}}}});
+
+  auto scale = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(2.0f), static_cast<bfloat16>(3.0f)});
+
+  auto offset = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(1.0f), static_cast<bfloat16>(2.0f)});
+
+  auto tuple = builder.BatchNormTraining(operand, scale, offset,
+                                         /*epsilon=*/0.001, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<bfloat16>(
+           {{{{static_cast<bfloat16>(-1.7f)}, {static_cast<bfloat16>(-2.04f)}},
+             {{static_cast<bfloat16>(0.105f)}, {static_cast<bfloat16>(0.65f)}}},
+            {{{static_cast<bfloat16>(1.89f)}, {static_cast<bfloat16>(3.35f)}},
+             {{static_cast<bfloat16>(3.7f)}, {static_cast<bfloat16>(6.04f)}}}})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(4), static_cast<bfloat16>(5)})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(5), static_cast<bfloat16>(5)})
+           .get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01));
+}
+
+XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
+  const int kFeatureIndex = 2;
+  ComputationBuilder builder(client_, TestName());
+
+  auto operand = builder.ConstantR4FromArray4D<bfloat16>(
+      Array4D<bfloat16>(2, 2, 2, 1, static_cast<bfloat16>(0.0f)));
+
+  auto scale = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(1.0f), static_cast<bfloat16>(1.0f)});
+
+  auto mean = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(0.0f), static_cast<bfloat16>(0.0f)});
+
+  auto var = builder.ConstantR1<bfloat16>(
+      {static_cast<bfloat16>(1.0f), static_cast<bfloat16>(1.0f)});
+
+  auto grad_output = builder.ConstantR4FromArray4D<bfloat16>(
+      {{{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(2.f)}},
+        {{static_cast<bfloat16>(3.f)}, {static_cast<bfloat16>(4.f)}}},
+       {{{static_cast<bfloat16>(5.f)}, {static_cast<bfloat16>(6.f)}},
+        {{static_cast<bfloat16>(7.f)}, {static_cast<bfloat16>(8.f)}}}});
+
+  builder.BatchNormGrad(operand, scale, mean, var, grad_output,
+                        /*epsilon=*/0.0, kFeatureIndex);
+
+  auto expected = *Literal::MakeTuple(
+      {Literal::CreateR4<bfloat16>(
+           {{{{static_cast<bfloat16>(-3.f)}, {static_cast<bfloat16>(-3.f)}},
+             {{static_cast<bfloat16>(-1.f)}, {static_cast<bfloat16>(-1.f)}}},
+            {{{static_cast<bfloat16>(1.f)}, {static_cast<bfloat16>(1.f)}},
+             {{static_cast<bfloat16>(3.f)}, {static_cast<bfloat16>(3.f)}}}})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(0), static_cast<bfloat16>(0)})
+           .get(),
+       Literal::CreateR1<bfloat16>(
+           {static_cast<bfloat16>(16), static_cast<bfloat16>(20)})
+           .get()});
+
+  ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d94d65c1015fb54ada3fdfc95d0c31d0a0f158b
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class BitcastConvertTest : public ClientLibraryTestBase {
+ public:
+  explicit BitcastConvertTest(perftools::gputools::Platform* platform = nullptr)
+      : ClientLibraryTestBase(platform) {
+    mutable_debug_options()->add_xla_disable_hlo_passes("algsimp");
+    mutable_debug_options()->add_xla_disable_hlo_passes("inline");
+  }
+};
+
+TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<int32>({42, 64});
+  builder.BitcastConvertType(a, S32);
+
+  std::vector<int32> expected = {42, 64};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({42.0f, 64.0f});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {42.0f, 64.0f};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a =
+      builder.ConstantR1<int32>({0, static_cast<int32>(0x80000000), 0x3F800000,
+                                 static_cast<int32>(0xBF800000), 0x3F000000,
+                                 static_cast<int32>(0xBF000000)});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {0.0f, -0.0f, 1.0f, -1.0f, 0.5f, -0.5f};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<int32>({});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<float>({42.6, 64.4});
+  builder.BitcastConvertType(a, S32);
+
+  std::vector<int32> expected = {0x422a6666, 0x4280cccd};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertS32Extremes) {
+  ComputationBuilder builder(client_, TestName());
+  auto a = builder.ConstantR1<int32>(
+      {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
+  builder.BitcastConvertType(a, F32);
+
+  std::vector<float> expected = {-0.0f, NAN};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0, 0));
+}
+
+TEST_F(BitcastConvertTest, ConvertMapToS32) {
+  ComputationBuilder builder(client_, TestName());
+  auto b = builder.CreateSubBuilder("convert");
+  auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
+  b->BitcastConvertType(param, S32);
+  auto a = builder.ConstantR1<float>({42.0f, 64.0f});
+  builder.Map({a}, b->BuildAndNoteError(), {0});
+
+  std::vector<int32> expected = {0x42280000, 0x42800000};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(BitcastConvertTest, ConvertMapToF32) {
+  ComputationBuilder builder(client_, TestName());
+  auto b = builder.CreateSubBuilder("convert");
+  auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
+  b->BitcastConvertType(param, F32);
+  auto a = builder.ConstantR1<int32>({0x42280000, 0x42800000});
+  builder.Map({a}, b->BuildAndNoteError(), {0});
+
+  std::vector<float> expected = {42.0f, 64.0f};
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+// Regression test for b/31758660. When ReshapeMover transforms
+//   input -> reshape -> convert
+// to
+//   input -> convert -> reshape
+// the new convert should have the same element type as the old convert.
+TEST_F(BitcastConvertTest, ConvertReshape) {
+  ComputationBuilder builder(client_, TestName());
+  auto input = builder.ConstantR1<int32>({0x42280000});
+  auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
+  builder.BitcastConvertType(reshape, F32);
+
+  ComputeAndCompareR0<float>(&builder, 42.0f, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 36d10fff5400b78fa3ea9a03f6b9cd73059f1427..610302ac1256a57db6ed6e18016a4136973e3891 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -29,6 +29,7 @@ def xla_test(name,
              deps,
              xla_test_library_deps=[],
              backends=[],
+             blacklisted_backends=[],
              args=[],
              tags=[],
              copts=[],
@@ -92,17 +93,24 @@ def xla_test(name,
     backends: A list of backends to generate tests for. Supported
       values: "cpu", "cpu_parallel", "gpu". If this list is empty, the test will
       be generated for all supported backends.
+    blacklisted_backends: A list of backends to NOT generate tests for.
     args: Test arguments for the target.
     tags: Tags for the target.
-    backend_args: A dict mapping backend name to list of additional args to
-      use for that target.
+    copts: Additional copts to pass to the build.
+    data: Additional data to pass to the build.
     backend_tags: A dict mapping backend name to list of additional tags to
       use for that target.
+    backend_args: A dict mapping backend name to list of additional args to
+      use for that target.
+    **kwargs: Additional keyword arguments to pass to native.cc_test.
   """
   test_names = []
   if not backends:
     backends = all_backends
 
+  backends = [backend for backend in backends
+              if backend not in blacklisted_backends]
+
   native.cc_library(
       name="%s_lib" % name,
       srcs=srcs,
@@ -248,5 +256,6 @@ def generate_backend_test_macros(backends=[]):
         deps = [
             "//tensorflow/compiler/xla:types",
             "//tensorflow/core:lib",
+            "//tensorflow/core:regexp_internal",
             "//tensorflow/core:test",
         ])
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 065bce7e3146c93568bbce2b0e7e23ddddc4ea31..50bf185936808fbd9c49f7fbd5ab0c0b4a76504b 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -262,20 +262,39 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
                  expected.shape().element_type() == PRED)
         << ShapeUtil::HumanString(expected.shape());
   }
+  // We allow using a float expected literal for a bfloat16 output. In this
+  // case, we need to convert the expected literal to bfloat16.
+  const Literal* expected_ptr = &expected;
+  std::unique_ptr<Literal> converted_expected;
+  Shape layout_shape;
+  if (use_bfloat16_) {
+    converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
+    expected_ptr = converted_expected.get();
+    if (shape_with_layout != nullptr) {
+      layout_shape = *shape_with_layout;
+      ShapeUtil::ForEachMutableSubshape(
+          &layout_shape, [&](Shape* subshape, const ShapeIndex& /*index*/) {
+            if (subshape->element_type() == F32) {
+              subshape->set_element_type(BF16);
+            }
+          });
+      shape_with_layout = &layout_shape;
+    }
+  }
   auto expect_equal = [&](const Literal& actual, const string& error_message) {
-    LiteralTestUtil::ExpectEqual(expected, actual, error_message);
+    LiteralTestUtil::ExpectEqual(*expected_ptr, actual, error_message);
   };
   if (execution_options_.debug_options().xla_test_all_output_layouts()) {
     return ComputeAndCompareLiteralWithAllOutputLayouts(
-        computation, expected, arguments, expect_equal);
+        computation, *expected_ptr, arguments, expect_equal);
   }
   if (execution_options_.debug_options().xla_test_all_input_layouts()) {
     return ComputeAndCompareLiteralWithAllInputLayouts(
-        computation, expected, arguments, expect_equal, shape_with_layout);
+        computation, *expected_ptr, arguments, expect_equal, shape_with_layout);
   }
   TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
                                                       shape_with_layout));
-  LiteralTestUtil::ExpectEqual(expected, *actual);
+  LiteralTestUtil::ExpectEqual(*expected_ptr, *actual);
   return tensorflow::Status::OK();
 }
 
@@ -286,20 +305,39 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   TF_RET_CHECK(ShapeUtil::ElementIsFloating(expected.shape()) ||
                ShapeUtil::ElementIsComplex(expected.shape()));
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+  // We allow using a float expected literal for a bfloat16 output. In this
+  // case, we need to convert the expected literal to bfloat16.
+  const Literal* expected_ptr = &expected;
+  std::unique_ptr<Literal> converted_expected;
+  Shape layout_shape;
+  if (use_bfloat16_) {
+    converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected);
+    expected_ptr = converted_expected.get();
+    if (shape_with_layout != nullptr) {
+      layout_shape = *shape_with_layout;
+      ShapeUtil::ForEachMutableSubshape(
+          &layout_shape, [&](Shape* subshape, const ShapeIndex& /*index*/) {
+            if (subshape->element_type() == F32) {
+              subshape->set_element_type(BF16);
+            }
+          });
+      shape_with_layout = &layout_shape;
+    }
+  }
   auto expect_near = [&](const Literal& actual, const string& error_message) {
-    LiteralTestUtil::ExpectNear(expected, actual, error, error_message);
+    LiteralTestUtil::ExpectNear(*expected_ptr, actual, error, error_message);
   };
   if (execution_options_.debug_options().xla_test_all_output_layouts()) {
-    return ComputeAndCompareLiteralWithAllOutputLayouts(computation, expected,
-                                                        arguments, expect_near);
+    return ComputeAndCompareLiteralWithAllOutputLayouts(
+        computation, *expected_ptr, arguments, expect_near);
   }
   if (execution_options_.debug_options().xla_test_all_input_layouts()) {
     return ComputeAndCompareLiteralWithAllInputLayouts(
-        computation, expected, arguments, expect_near, shape_with_layout);
+        computation, *expected_ptr, arguments, expect_near, shape_with_layout);
   }
   TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments,
                                                       shape_with_layout));
-  LiteralTestUtil::ExpectNear(expected, *actual, error);
+  LiteralTestUtil::ExpectNear(*expected_ptr, *actual, error);
   return tensorflow::Status::OK();
 }
 
@@ -346,10 +384,67 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
   LiteralTestUtil::ExpectNearTuple(expected, *actual, error);
 }
 
+void ClientLibraryTestBase::ComputeAndCompare(
+    ComputationBuilder* builder, const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<Literal> arguments) {
+  auto status_or_data = ComputeValueAndReference(builder, operand, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*reference, *result);
+}
+
+void ClientLibraryTestBase::ComputeAndCompare(
+    ComputationBuilder* builder, const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<Literal> arguments, ErrorSpec error) {
+  auto status_or_data = ComputeValueAndReference(builder, operand, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectNear(*reference, *result, error);
+}
+
+StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+ClientLibraryTestBase::ComputeValueAndReference(
+    ComputationBuilder* builder, const ComputationDataHandle& operand,
+    tensorflow::gtl::ArraySlice<Literal> arguments) {
+  // Transfer the arguments to the executor service. We put the unique_ptr's
+  // into a vector to keep the data alive on the service until the end of this
+  // function.
+  std::vector<std::unique_ptr<GlobalData>> argument_data;
+  for (const auto& arg : arguments) {
+    TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg));
+    argument_data.push_back(std::move(data));
+  }
+
+  // Create raw pointers to the GlobalData for the rest of the call stack.
+  std::vector<GlobalData*> argument_data_ptr;
+  std::transform(
+      argument_data.begin(), argument_data.end(),
+      std::back_inserter(argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+
+  TF_ASSIGN_OR_RETURN(
+      auto reference,
+      builder->ComputeConstant(operand, /*output_layout=*/nullptr, arguments));
+  TF_ASSIGN_OR_RETURN(auto result,
+                      ExecuteAndTransfer(builder, argument_data_ptr));
+  return std::make_pair(std::move(reference), std::move(result));
+}
+
 Computation ClientLibraryTestBase::CreateScalarRelu() {
   ComputationBuilder builder(client_, "relu");
-  auto z_value = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "z_value");
-  auto zero = builder.ConstantR0<float>(0.0);
+  auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
+  auto z_value = builder.Parameter(0, shape, "z_value");
+  auto zero = use_bfloat16_
+                  ? builder.ConstantR0<bfloat16>(static_cast<bfloat16>(0.0f))
+                  : builder.ConstantR0<float>(0.0f);
   builder.Max(z_value, zero);
   auto computation_status = builder.Build();
   TF_CHECK_OK(computation_status.status());
@@ -358,8 +453,9 @@ Computation ClientLibraryTestBase::CreateScalarRelu() {
 
 Computation ClientLibraryTestBase::CreateScalarMax() {
   ComputationBuilder builder(client_, "max");
-  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
-  auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
+  auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
+  auto x = builder.Parameter(0, shape, "x");
+  auto y = builder.Parameter(1, shape, "y");
   builder.Max(x, y);
   auto computation_status = builder.Build();
   TF_CHECK_OK(computation_status.status());
@@ -368,11 +464,12 @@ Computation ClientLibraryTestBase::CreateScalarMax() {
 
 Computation ClientLibraryTestBase::CreateScalarReluSensitivity() {
   ComputationBuilder builder(client_, "relu_sensitivity");
-  auto activation =
-      builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "activation");
-  auto backprop =
-      builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "backprop");
-  auto zero = builder.ConstantR0<float>(0.0);
+  auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
+  auto activation = builder.Parameter(0, shape, "activation");
+  auto backprop = builder.Parameter(1, shape, "backprop");
+  auto zero = use_bfloat16_
+                  ? builder.ConstantR0<bfloat16>(static_cast<bfloat16>(0.0f))
+                  : builder.ConstantR0<float>(0.0f);
   auto activation_gtz = builder.Gt(activation, zero);
   builder.Select(activation_gtz, /*on_true=*/backprop, /*on_false=*/zero);
 
@@ -407,4 +504,27 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
   return array;
 }
 
+std::unique_ptr<GlobalData>
+ClientLibraryTestBase::CreateParameterAndTransferLiteral(
+    int64 parameter_number, const Literal& literal, const string& name,
+    ComputationBuilder* builder, ComputationDataHandle* data_handle) {
+  const Literal* param_literal = &literal;
+  std::unique_ptr<Literal> converted_literal;
+  if (use_bfloat16_) {
+    converted_literal = LiteralTestUtil::ConvertF32ToBF16(literal);
+    param_literal = converted_literal.get();
+  }
+  std::unique_ptr<GlobalData> data =
+      client_->TransferToServer(*param_literal).ConsumeValueOrDie();
+  *data_handle =
+      builder->Parameter(parameter_number, param_literal->shape(), name);
+  return data;
+}
+
+ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
+    const Literal& literal, ComputationBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 7cfc276ec19e3b177f87a08e716cb34b7676dd6b..4d0cf8bf71cf22d7c046bb22754a8d4e299ed9db 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -194,7 +194,17 @@ class ClientLibraryTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
   void ComputeAndCompareTuple(
       ComputationBuilder* builder, const Literal& expected,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec abs_error);
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
+  // Convenience method for running a built computation and comparing the result
+  // with the HloEvaluator.
+  void ComputeAndCompare(ComputationBuilder* builder,
+                         const ComputationDataHandle& operand,
+                         tensorflow::gtl::ArraySlice<Literal> arguments);
+  void ComputeAndCompare(ComputationBuilder* builder,
+                         const ComputationDataHandle& operand,
+                         tensorflow::gtl::ArraySlice<Literal> arguments,
+                         ErrorSpec error);
 
   // Create scalar operations for use in reductions.
   Computation CreateScalarRelu();
@@ -235,51 +245,102 @@ class ClientLibraryTestBase : public ::testing::Test {
       const int rows, const int cols, const int rows_padded,
       const int cols_padded);
 
-  // Create a parameter instruction that wraps a given value and then stores
+  // Creates a parameter instruction, transfers the literal for the parameter to
+  // server, then stores into "data_handle" the global handle for that
+  // parameter. When the use_bfloat16 flag is set but the literal has F32
+  // elements, the literal will be converted to BF16 before being transferred.
+  std::unique_ptr<GlobalData> CreateParameterAndTransferLiteral(
+      int64 parameter_number, const Literal& literal, const string& name,
+      ComputationBuilder* builder, ComputationDataHandle* data_handle);
+
+  // Creates a constant instruction with the given literal. When the
+  // use_bfloat16 flag is set but the literal has F32 elements, the elements
+  // will be converted to BF16s.
+  ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
+                                                  ComputationBuilder* builder);
+
+  // Creates a constant instruction with the given array. When the use_bfloat16
+  // flag is set but the array has float elements, the elements will be
+  // converted to bfloat16s.
+  template <typename NativeT>
+  ComputationDataHandle CreateConstantFromArray(const Array<NativeT>& array,
+                                                ComputationBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
+  // Same as CreateConstantFromArray, but for scalars.
+  template <typename NativeT>
+  ComputationDataHandle CreateConstantFromScalar(NativeT value,
+                                                 ComputationBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
+  // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR0Parameter(
       NativeT value, int64 parameter_number, const string& name,
       ComputationBuilder* builder, ComputationDataHandle* data_handle);
 
-  // Create a parameter instruction that wraps the given values and then stores
+  // Creates a parameter instruction that wraps the given values and then stores
   // into "data_handle" the global handle for that parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR1Parameter(
       tensorflow::gtl::ArraySlice<NativeT> values, int64 parameter_number,
       const string& name, ComputationBuilder* builder,
       ComputationDataHandle* data_handle);
 
-  // Create a parameter instruction that wraps the given constant array
+  // Creates a parameter instruction that wraps the given constant array
   // "array_2d" and then stores to "data_handle" the global handle for that
   // parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR2Parameter(
       const Array2D<NativeT>& array_2d, int64 parameter_number,
       const string& name, ComputationBuilder* builder,
       ComputationDataHandle* data_handle);
 
-  // Create a parameter instruction that wraps the given constant array
+  // Creates a parameter instruction that wraps the given constant array
   // "array_3d" and then stores to "data_handle" the global handle for that
   // parameter.
   //
   // "parameter_number" is the parameter number.
   // "name" is the name of the parameter instruction.
+  //
+  // When the use_bfloat16 flag is set but NativeT is float, the data will be
+  // converted to bfloat16.
   template <typename NativeT>
   std::unique_ptr<GlobalData> CreateR3Parameter(
       const Array3D<NativeT>& array_3d, int64 parameter_number,
       const string& name, ComputationBuilder* builder,
       ComputationDataHandle* data_handle);
 
+  // Getter and setter for the use_bfloat16 flag, which indicates whether to run
+  // tests with all float-type input/output converted to bfloat16.
+  bool use_bfloat16() const { return use_bfloat16_; }
+  void set_use_bfloat16(bool value) { use_bfloat16_ = value; }
+
+  // The float type used in this test, BF16 or F32 according to use_bfloat16.
+  PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
+
   Client* client_;
   ExecutionOptions execution_options_;
 
@@ -298,6 +359,17 @@ class ClientLibraryTestBase : public ::testing::Test {
       const std::function<void(const Literal& actual,
                                const string& error_message)>& verify_output,
       const Shape* output_with_layout = nullptr);
+
+  // Executes the computation and calculates the expected reference value using
+  // the HloEvaluator. Returns two literal in the order of (expected, actual).
+  StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+  ComputeValueAndReference(ComputationBuilder* builder,
+                           const ComputationDataHandle& operand,
+                           tensorflow::gtl::ArraySlice<Literal> arguments);
+
+  // Whether to run tests with all float-type input/output converted to
+  // bfloat16.
+  bool use_bfloat16_ = false;
 };
 
 template <typename NativeT>
@@ -315,8 +387,10 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
     ComputationBuilder* builder, NativeT expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
-                    std::is_same<NativeT, double>::value,
-                "Floating point type required when specifying an ErrorSpec");
+                    std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, complex64>::value,
+                "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR0<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
@@ -338,8 +412,10 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
     ComputationBuilder* builder, tensorflow::gtl::ArraySlice<NativeT> expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
-                    std::is_same<NativeT, double>::value,
-                "Floating point type required when specifying an ErrorSpec");
+                    std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
+                    std::is_same<NativeT, complex64>::value,
+                "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
       Literal::CreateR1<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, *expected_literal,
@@ -362,6 +438,7 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -386,6 +463,7 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -410,6 +488,7 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   static_assert(std::is_same<NativeT, float>::value ||
                     std::is_same<NativeT, double>::value ||
+                    std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, complex64>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   std::unique_ptr<Literal> expected_literal =
@@ -423,6 +502,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR0Parameter(
     NativeT value, int64 parameter_number, const string& name,
     ComputationBuilder* builder, ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR0(value);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -435,6 +517,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR1Parameter(
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR1(values);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -447,6 +532,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR2Parameter(
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR2FromArray2D(array_2d);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -459,6 +547,9 @@ std::unique_ptr<GlobalData> ClientLibraryTestBase::CreateR3Parameter(
     const string& name, ComputationBuilder* builder,
     ComputationDataHandle* data_handle) {
   std::unique_ptr<Literal> literal = Literal::CreateR3FromArray3D(array_3d);
+  if (use_bfloat16_ && literal->shape().element_type() == F32) {
+    literal = LiteralTestUtil::ConvertF32ToBF16(*literal);
+  }
   std::unique_ptr<GlobalData> data =
       client_->TransferToServer(*literal).ConsumeValueOrDie();
   *data_handle = builder->Parameter(parameter_number, literal->shape(), name);
@@ -469,8 +560,7 @@ template <typename NativeT>
 std::vector<NativeT> ClientLibraryTestBase::CreatePseudorandomR1(
     const int width, NativeT min_value, NativeT max_value, uint32 seed) {
   std::vector<NativeT> result(width);
-  test_utils::PseudorandomGenerator<NativeT> generator(min_value, max_value,
-                                                       seed);
+  PseudorandomGenerator<NativeT> generator(min_value, max_value, seed);
   for (int i = 0; i < width; ++i) {
     result[i] = generator.get();
   }
@@ -482,8 +572,7 @@ std::unique_ptr<Array2D<NativeT>> ClientLibraryTestBase::CreatePseudorandomR2(
     const int rows, const int cols, NativeT min_value, NativeT max_value,
     uint32 seed) {
   auto result = MakeUnique<Array2D<NativeT>>(rows, cols);
-  test_utils::PseudorandomGenerator<NativeT> generator(min_value, max_value,
-                                                       seed);
+  PseudorandomGenerator<NativeT> generator(min_value, max_value, seed);
   for (int y = 0; y < rows; ++y) {
     for (int x = 0; x < cols; ++x) {
       (*result)(y, x) = generator.get();
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 0853feeebd6f7a249cf767e1f8a63675d4bddd27..8853ed9e5780672d4006c326291767b8b5253f56 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -20,10 +20,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
@@ -42,26 +44,26 @@ TEST_F(ClientTest, ExecuteWithLayout) {
     for (const std::vector<int64>& transfer_layout : layouts) {
       b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
             b.ConstantR2<int32>({{10, 20}, {30, 40}}));
-      auto computation = b.Build();
-      ASSERT_TRUE(computation.ok()) << computation.status();
+      TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
 
       ExecutionOptions execution_options = execution_options_;
       *execution_options.mutable_shape_with_output_layout() =
           ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                          execute_layout);
-      std::unique_ptr<GlobalData> data =
-          client_->Execute(computation.ValueOrDie(), {}, &execution_options)
-              .ConsumeValueOrDie();
+      TF_ASSERT_OK_AND_ASSIGN(
+          std::unique_ptr<GlobalData> data,
+          client_->Execute(computation, {}, &execution_options));
 
       std::unique_ptr<Literal> expected_literal =
-          test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
-                                                       transfer_layout);
+          Literal::CreateR2WithLayout<int32>(
+              {{11, 22}, {33, 44}}, LayoutUtil::MakeLayout(transfer_layout));
 
-      auto computed = client_->Transfer(*data, &expected_literal->shape());
+      TF_ASSERT_OK_AND_ASSIGN(
+          auto computed, client_->Transfer(*data, &expected_literal->shape()));
 
-      LiteralTestUtil::AssertEqualShapesAndLayouts(
-          expected_literal->shape(), computed.ValueOrDie()->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
+                                                   computed->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
     }
   }
 }
@@ -72,8 +74,7 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
   b.Tuple({b.ConstantR2<int32>({{1, 2}, {3, 4}}),
            b.ConstantR2<int32>({{10, 20}, {30, 40}})});
 
-  auto computation = b.Build();
-  ASSERT_TRUE(computation.ok()) << computation.status();
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, b.Build());
 
   ExecutionOptions execution_options = execution_options_;
   // Create a result shape with one element column major and the other row
@@ -85,10 +86,9 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
            ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                           /*minor_to_major=*/{1, 0})});
 
-  auto result =
-      client_
-          ->ExecuteAndTransfer(computation.ValueOrDie(), {}, &execution_options)
-          .ConsumeValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      client_->ExecuteAndTransfer(computation, {}, &execution_options));
   LiteralTestUtil::ExpectR2Equal<int32>({{1, 2}, {3, 4}},
                                         result->tuple_literals(0));
   LiteralTestUtil::ExpectR2Equal<int32>({{10, 20}, {30, 40}},
@@ -107,5 +107,42 @@ TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
+TEST_F(ClientTest, DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
+  Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
+  Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> const_arg,
+      client_->TransferToServer(*Literal::CreateR2<int32>({{5, 6}, {7, 8}})));
+
+  ComputationBuilder b(client_, TestName() + ".add");
+  b.Add(b.Parameter(0, shape, "param_0"),
+        b.ConstantR2<int32>({{1, 2}, {3, 4}}));
+  TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build());
+
+  // We can't really test parallel execution on CPU since all of the cores in a
+  // CPU are presented as a single device.  So for now we test "parallel"
+  // execution on a single device.
+  std::vector<Client::ComputationInstance> computation_instances;
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
+                          client_->GetDeviceHandles(1));
+  ASSERT_EQ(devices.size(), 1);
+
+  ExecutionOptions options = execution_options_;
+  *options.add_device_handles() = devices[0];
+  computation_instances.push_back(Client::ComputationInstance(
+      add_with_one_arg, {const_arg.get()}, options, nullptr));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto results,
+                          client_->ExecuteParallel(computation_instances));
+  auto expected_result = Literal::CreateR2<int32>({{6, 8}, {10, 12}});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result_literal,
+      client_->Transfer(*results[0], &expected_result->shape()));
+
+  LiteralTestUtil::ExpectEqual(*expected_result, *result_literal);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/codegen_test_base.cc b/tensorflow/compiler/xla/tests/codegen_test_base.cc
index 43ea7f6019415a171123ee0315533b8a3b1ff984..e472408dcf7ed5fec74e886fd0092ce47ee2e7eb 100644
--- a/tensorflow/compiler/xla/tests/codegen_test_base.cc
+++ b/tensorflow/compiler/xla/tests/codegen_test_base.cc
@@ -19,8 +19,11 @@ namespace xla {
 
 StatusOr<std::unique_ptr<Executable>> CodegenTestBase::CompileToExecutable(
     std::unique_ptr<HloModule> hlo_module) {
-  return backend().compiler()->Compile(std::move(hlo_module),
-                                       backend().default_stream_executor());
+  TF_ASSIGN_OR_RETURN(hlo_module, backend().compiler()->RunHloPasses(
+                                      std::move(hlo_module),
+                                      backend().default_stream_executor()));
+  return backend().compiler()->RunBackend(std::move(hlo_module),
+                                          backend().default_stream_executor());
 }
 
 StatusOr<std::unique_ptr<AotCompilationResult>>
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index 707e439245c29a1ddf80bfd9205aa14b0d4765f6..0f780fa87ef98fd5c48726ef83fa8efc1e90fbf7 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -138,13 +138,13 @@ XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) {
   // layouts. Use these arrays as parameters to a simple computation. If the
   // layout of the array changes then computation should be recompiled (cache
   // miss).
-  auto rowmaj_array = test_utils::CreateR2LiteralWithLayout(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, /*minor_to_major=*/{1, 0});
+  auto rowmaj_array = Literal::CreateR2WithLayout(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({1, 0}));
   auto rowmaj_handle =
       client_->TransferToServer(*rowmaj_array).ConsumeValueOrDie();
 
-  auto colmaj_array = test_utils::CreateR2LiteralWithLayout(
-      {{1.0f, 2.0f}, {3.0f, 4.0f}}, /*minor_to_major=*/{0, 1});
+  auto colmaj_array = Literal::CreateR2WithLayout(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1}));
   auto colmaj_handle =
       client_->TransferToServer(*colmaj_array).ConsumeValueOrDie();
 
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index b2e9743af79d0e4658451e7a9522c338036851ba..5226a78386824a94572d3e5cc3329677108a910a 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -71,24 +71,27 @@ class ComputeConstantTest : public ::testing::Test {
 
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
       Client* client, const ComputationDataHandle& operand,
-      ComputationBuilder* builder, Layout* output_layout = nullptr) {
-    TF_ASSIGN_OR_RETURN(auto computed,
-                        builder->ComputeConstant(operand, output_layout));
+      ComputationBuilder* builder, Layout* output_layout = nullptr,
+      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
+    TF_ASSIGN_OR_RETURN(auto computed, builder->ComputeConstant(
+                                           operand, output_layout, parameters));
     return std::move(computed);
   }
 
   template <class Scalar>
-  StatusOr<Scalar> ComputeConstantScalar(Client* client,
-                                         const ComputationDataHandle& operand,
-                                         ComputationBuilder* builder) {
-    TF_ASSIGN_OR_RETURN(auto literal,
-                        ComputeConstantLiteral(client, operand, builder));
+  StatusOr<Scalar> ComputeConstantScalar(
+      Client* client, const ComputationDataHandle& operand,
+      ComputationBuilder* builder,
+      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
+    TF_ASSIGN_OR_RETURN(
+        auto literal,
+        ComputeConstantLiteral(client, operand, builder, nullptr, parameters));
     return literal->Get<Scalar>({});
   }
 
   bool IsConstant(const ComputationDataHandle& operand,
-                  ComputationBuilder* builder) {
-    StatusOr<bool> result = builder->IsConstant(operand);
+                  ComputationBuilder* builder, int64 num_parameters = 0) {
+    StatusOr<bool> result = builder->IsConstant(operand, num_parameters);
     EXPECT_TRUE(result.ok()) << result.status();
     return result.ok() ? result.ValueOrDie() : false;
   }
@@ -138,7 +141,25 @@ TEST_F(ComputeConstantTest, ScalarRng) {
   }
 }
 
-TEST_F(ComputeConstantTest, DirectParam) {
+TEST_F(ComputeConstantTest, Param) {
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    ComputationBuilder b(client, TestName());
+    auto param = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "lhs");
+    auto computation = b.Add(param, b.ConstantR0<float>(1.5f));
+
+    std::vector<Literal> arguments;
+    arguments.emplace_back(*Literal::CreateR0(42.5f));
+    EXPECT_TRUE(IsConstant(computation, &b, arguments.size()));
+
+    auto value =
+        ComputeConstantScalar<float>(client, computation, &b, arguments);
+    ASSERT_TRUE(value.ok()) << value.status();
+    EXPECT_EQ(value.ValueOrDie(), 44.0f);
+  }
+}
+
+TEST_F(ComputeConstantTest, DirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
     ComputationBuilder b(client, TestName());
@@ -152,7 +173,7 @@ TEST_F(ComputeConstantTest, DirectParam) {
   }
 }
 
-TEST_F(ComputeConstantTest, IndirectParam) {
+TEST_F(ComputeConstantTest, IndirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
     ComputationBuilder b(client, TestName());
@@ -243,8 +264,8 @@ XLA_TEST_F(ComputeConstantTest, Layout) {
       ASSERT_TRUE(computed.ok()) << computed.status();
 
       std::unique_ptr<Literal> expected_literal =
-          test_utils::CreateR2LiteralWithLayout<int32>({{11, 22}, {33, 44}},
-                                                       layout);
+          Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
+                                             LayoutUtil::MakeLayout(layout));
       LiteralTestUtil::AssertEqualShapesAndLayouts(
           expected_literal->shape(), computed.ValueOrDie()->shape());
       LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cbfacaea53952b02596eb3e84b13a5749335651d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -0,0 +1,238 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class ConditionalOpTest : public ClientLibraryTestBase {
+ protected:
+  Computation CreateR0F32ConstantComputation(float value) {
+    ComputationBuilder builder(client_, "Constant");
+    builder.Parameter(0, empty_tuple_, "tuple");
+    builder.ConstantR0<float>(value);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0F32IdentityComputation() {
+    ComputationBuilder builder(client_, "Identity");
+    builder.Parameter(0, r0f32_, "x");
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0F32CeilComputation() {
+    ComputationBuilder builder(client_, "Ceil");
+    auto param = builder.Parameter(0, r0f32_, "param");
+    builder.Ceil(param);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateR0F32FloorComputation() {
+    ComputationBuilder builder(client_, "Ceil");
+    auto param = builder.Parameter(0, r0f32_, "param");
+    builder.Floor(param);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateAddTupleComputation(const string& computation_name,
+                                        const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    builder.Add(x, y);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateAddR0Computation() {
+    return CreateAddTupleComputation("AddR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateAddR1Computation() {
+    return CreateAddTupleComputation("AddR1", tuple_2_r1s2f32_);
+  }
+
+  Computation CreateSubTupleComputation(const string& computation_name,
+                                        const Shape& tuple_shape) {
+    ComputationBuilder builder(client_, computation_name);
+    auto tuple = builder.Parameter(0, tuple_shape, "tuple");
+    auto x = builder.GetTupleElement(tuple, 0);
+    auto y = builder.GetTupleElement(tuple, 1);
+    builder.Sub(x, y);
+    auto build_status = builder.Build();
+    EXPECT_IS_OK(build_status.status());
+    return build_status.ConsumeValueOrDie();
+  }
+
+  Computation CreateSubR0Computation() {
+    return CreateSubTupleComputation("SubR0", tuple_2_r0f32_);
+  }
+
+  Computation CreateSubR1Computation() {
+    return CreateSubTupleComputation("SubR1", tuple_2_r1s2f32_);
+  }
+
+  Shape r0f32_ = ShapeUtil::MakeShape(F32, {});
+  Shape tuple_2_r0f32_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(F32, {})});
+  Shape tuple_2_r1s2f32_ = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeShape(F32, {2})});
+  Shape empty_tuple_ = ShapeUtil::MakeTupleShape({});
+  ErrorSpec error_spec_{0.001};
+};
+
+// Test true and false computations that do not take any parameters.
+XLA_TEST_F(ConditionalOpTest, Parameters0) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operands = builder.Tuple({});
+  auto true_computation = CreateR0F32ConstantComputation(56.0f);
+  auto false_computation = CreateR0F32ConstantComputation(12.0f);
+  auto result = builder.Conditional(pred, operands, true_computation, operands,
+                                    false_computation);
+
+  ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 1 parameter.
+XLA_TEST_F(ConditionalOpTest, Parameters1) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto identity = CreateR0F32IdentityComputation();
+  auto result =
+      builder.Conditional(pred, operand1, identity, operand2, identity);
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 parameters and predicate is
+// true.
+XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR0Computation(),
+                                    operands, CreateSubR0Computation());
+
+  ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 parameters and predicate is
+// false.
+XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR0Computation(),
+                                    operands, CreateSubR0Computation());
+
+  ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 array parameters and
+// predicate is true.
+XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
+  auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR1Computation(),
+                                    operands, CreateSubR1Computation());
+
+  ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
+}
+
+// Test true and false computations that take in 2 array parameters and
+// predicate is false.
+XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
+  auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
+  auto operands = builder.Tuple({operand1, operand2});
+  auto result = builder.Conditional(pred, operands, CreateAddR1Computation(),
+                                    operands, CreateSubR1Computation());
+
+  ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
+}
+
+// Test the case where one conditional is nested within another.
+XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
+  Shape r0bool = ShapeUtil::MakeShape(PRED, {});
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
+  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  auto param0 = inner_builder.Parameter(0, tuple_shape, "param0");
+  auto pred_cond = inner_builder.GetTupleElement(param0, 0);
+  auto true_operand = inner_builder.GetTupleElement(param0, 1);
+  auto false_operand = inner_builder.GetTupleElement(param0, 2);
+  inner_builder.Conditional(pred_cond, true_operand,
+                            CreateR0F32CeilComputation(), false_operand,
+                            CreateR0F32FloorComputation());
+  auto inner_builder_result = inner_builder.Build();
+
+  ComputationBuilder builder(client_, TestName());
+  auto pred1 = builder.ConstantR0<bool>(true);
+  auto pred2 = builder.ConstantR0<bool>(false);
+  auto operand1 = builder.ConstantR0<float>(1.1f);
+  auto operand2 = builder.ConstantR0<float>(12.2f);
+  auto operand3 = builder.ConstantR0<float>(43.3f);
+  auto tuple_operand = builder.Tuple({pred2, operand1, operand2});
+  builder.Conditional(pred1, tuple_operand,
+                      inner_builder_result.ConsumeValueOrDie(), operand3,
+                      CreateR0F32IdentityComputation());
+
+  ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
+}
+
+// Test a mismatch in the shape of the true operand and true computation.
+XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
+  ComputationBuilder builder(client_, TestName());
+  auto pred = builder.ConstantR0<bool>(true);
+  auto operand1 = builder.ConstantR0<float>(56.0f);
+  auto operand2 = builder.ConstantR0<float>(12.0f);
+  auto operands = builder.Tuple({operand1, operand2});
+  builder.Conditional(pred, operands, CreateAddR1Computation(), operands,
+                      CreateSubR0Computation());
+
+  auto result = builder.Build();
+  EXPECT_FALSE(result.ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("true_operand must match the shape of the "
+                                   "only parameter of true_computation"));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index b0a63bccbb93f226175beff2e30e2a243fdca1d3..896b34fb6e2762c14bd9ec2bf1ba13c548d4cf60 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -39,8 +39,8 @@ class ConvolutionDimensionNumbersTest : public ClientLibraryTestBase {};
 // Tests the convolution operation with invalid input dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 0, 2, 2, 3, 0, 1, 2,
-                                                     3);
+      ComputationBuilder::CreateConvDimensionNumbers(0, 2, 2, 3, 0, 1, 2, 3, 0,
+                                                     1, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("input are not unique"));
@@ -49,13 +49,23 @@ TEST_F(ConvolutionDimensionNumbersTest, InvalidInputDimensionNumbers) {
 // Tests the convolution operation with invalid weight dimension numbers.
 TEST_F(ConvolutionDimensionNumbersTest, InvalidWeightDimensionNumbers) {
   auto dimension_numbers_status =
-      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 0, 1, 2, 3, 2, 3, 2,
-                                                     3);
+      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 1, 2, 3, 0,
+                                                     2, 2, 3);
   ASSERT_FALSE(dimension_numbers_status.ok());
   ASSERT_THAT(dimension_numbers_status.status().error_message(),
               ::testing::HasSubstr("weight are not unique"));
 }
 
+// Tests the convolution operation with invalid output dimension numbers.
+TEST_F(ConvolutionDimensionNumbersTest, InvalidOutputDimensionNumbers) {
+  auto dimension_numbers_status =
+      ComputationBuilder::CreateConvDimensionNumbers(0, 1, 2, 3, 0, 2, 2, 3, 0,
+                                                     1, 2, 3);
+  ASSERT_FALSE(dimension_numbers_status.ok());
+  ASSERT_THAT(dimension_numbers_status.status().error_message(),
+              ::testing::HasSubstr("output are not unique"));
+}
+
 XLA_TEST_F(ConvolutionDimensionNumbersTest,
            TwoConvsWithDifferentDimensionNumbers) {
   auto input_array = MakeUnique<Array4D<float>>(2, 3, 5, 5);
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 0cc2e5fb7e655884f3334426a684dd3ce00d4052..2924c08615fa706bb19addf04bf58e1d5dd5a659 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -82,177 +82,127 @@ XLA_TEST_F(ConvolutionTest, ForwardPassConvolution_3x3x256_256_OutputZ_Iota) {
   ComputationBuilder builder(client_, TestName());
   auto lhs = builder.ConstantR4FromArray4D<float>(*alhs);
   auto rhs = builder.ConstantR4FromArray4D<float>(*arhs);
-  builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
+  auto conv = builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
 
-  std::unique_ptr<Array4D<float>> aexpected =
-      ReferenceUtil::ConvArray4D(*alhs, *arhs, {1, 1}, Padding::kValid);
-
-  ComputeAndCompareR4<float>(&builder, *aexpected, {}, error_spec_);
+  ComputeAndCompare(&builder, conv, {}, error_spec_);
 }
 
 TEST_F(ConvolutionTest, Convolve_1x1x1x2_1x1x1x2_Valid) {
   ComputationBuilder builder(client_, TestName());
-  {
-    Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
-    Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
-    auto input = builder.Parameter(0, input_shape, "input");
-    auto filter = builder.Parameter(1, filter_shape, "filter");
-    builder.Conv(input, filter, {1, 1}, Padding::kValid);
-  }
-
-  Array4D<float> input(1, 1, 1, 2);
-  input.FillWithYX(Array2D<float>({
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+
+  Array4D<float> input_data(1, 1, 1, 2);
+  input_data.FillWithYX(Array2D<float>({
       {1, 2},
   }));
-  Array4D<float> filter(1, 1, 1, 2);
-  filter.FillWithYX(Array2D<float>({
+  Array4D<float> filter_data(1, 1, 1, 2);
+  filter_data.FillWithYX(Array2D<float>({
       {5, 6},
   }));
 
-  std::unique_ptr<Array4D<float>> aexpected =
-      ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kValid);
-
-  auto input_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
-          .ConsumeValueOrDie();
-  auto filter_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
-          .ConsumeValueOrDie();
-
-  ComputeAndCompareR4<float>(&builder, *aexpected,
-                             {input_literal.get(), filter_literal.get()},
-                             error_spec_);
+  ComputeAndCompare(&builder, conv,
+                    {*Literal::CreateFromArray(input_data),
+                     *Literal::CreateFromArray(filter_data)},
+                    error_spec_);
 }
 
 // Tests valid padding for 2D convolution in raster space.
 TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Valid) {
   ComputationBuilder builder(client_, TestName());
-  {
-    Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
-    Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
-    auto input = builder.Parameter(0, input_shape, "input");
-    auto filter = builder.Parameter(1, filter_shape, "filter");
-    builder.Conv(input, filter, {1, 1}, Padding::kValid);
-  }
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
-  Array4D<float> input(1, 1, 4, 4);
+  Array4D<float> input_data(1, 1, 4, 4);
   // clang-format off
-  input.FillWithYX(Array2D<float>({
+  input_data.FillWithYX(Array2D<float>({
     {1,  2,  3,  4 },
     {5,  6,  7,  8 },
     {9,  10, 11, 12},
     {13, 14, 15, 16},
   }));
   // clang-format on
-  Array4D<float> filter(1, 1, 2, 2);
+  Array4D<float> filter_data(1, 1, 2, 2);
   // clang-format off
-  filter.FillWithYX(Array2D<float>({
+  filter_data.FillWithYX(Array2D<float>({
     {5, 6},
     {7, 8},
   }));
   // clang-format on
-
-  std::unique_ptr<Array4D<float>> aexpected =
-      ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kValid);
-
-  auto input_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
-          .ConsumeValueOrDie();
-  auto filter_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
-          .ConsumeValueOrDie();
-
-  ComputeAndCompareR4<float>(&builder, *aexpected,
-                             {input_literal.get(), filter_literal.get()},
-                             error_spec_);
+  ComputeAndCompare(&builder, conv,
+                    {*Literal::CreateFromArray(input_data),
+                     *Literal::CreateFromArray(filter_data)},
+                    error_spec_);
 }
 
 // Tests same padding for 2D convolution in raster space.
 TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x2x2_Same) {
   ComputationBuilder builder(client_, TestName());
-  {
-    Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
-    Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
-    auto input = builder.Parameter(0, input_shape, "input");
-    auto filter = builder.Parameter(1, filter_shape, "filter");
-    builder.Conv(input, filter, {1, 1}, Padding::kSame);
-  }
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 2, 2});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
-  Array4D<float> input(1, 1, 4, 4);
+  Array4D<float> input_data(1, 1, 4, 4);
   // clang-format off
-  input.FillWithYX(Array2D<float>({
+  input_data.FillWithYX(Array2D<float>({
     {1,  2,  3,  4 },
     {5,  6,  7,  8 },
     {9,  10, 11, 12},
     {13, 14, 15, 16},
   }));
   // clang-format on
-  Array4D<float> filter(1, 1, 2, 2);
+  Array4D<float> filter_data(1, 1, 2, 2);
   // clang-format off
-  filter.FillWithYX(Array2D<float>({
+  filter_data.FillWithYX(Array2D<float>({
     {5, 6},
     {7, 8},
   }));
   // clang-format on
-
-  std::unique_ptr<Array4D<float>> aexpected =
-      ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kSame);
-
-  auto input_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
-          .ConsumeValueOrDie();
-  auto filter_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
-          .ConsumeValueOrDie();
-
-  ComputeAndCompareR4<float>(&builder, *aexpected,
-                             {input_literal.get(), filter_literal.get()},
-                             error_spec_);
+  ComputeAndCompare(&builder, conv,
+                    {*Literal::CreateFromArray(input_data),
+                     *Literal::CreateFromArray(filter_data)},
+                    error_spec_);
 }
 
 // Tests same padding for 2D convolution in raster space with an odd sized
 // kernel.
 TEST_F(ConvolutionTest, Convolve_1x1x4x4_1x1x3x3_Same) {
   ComputationBuilder builder(client_, TestName());
-  {
-    Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
-    Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 3, 3});
-    auto input = builder.Parameter(0, input_shape, "input");
-    auto filter = builder.Parameter(1, filter_shape, "filter");
-    builder.Conv(input, filter, {1, 1}, Padding::kSame);
-  }
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 1, 3, 3});
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
-  Array4D<float> input(1, 1, 4, 4);
+  Array4D<float> input_data(1, 1, 4, 4);
   // clang-format off
-  input.FillWithYX(Array2D<float>({
+  input_data.FillWithYX(Array2D<float>({
     {1,  2,  3,  4 },
     {5,  6,  7,  8 },
     {9,  10, 11, 12},
     {13, 14, 15, 16},
   }));
   // clang-format on
-  Array4D<float> filter(1, 1, 3, 3);
+  Array4D<float> filter_data(1, 1, 3, 3);
   // clang-format off
-  filter.FillWithYX(Array2D<float>({
+  filter_data.FillWithYX(Array2D<float>({
     { 5,  6,  7},
     { 8,  9, 10},
     {11, 12, 13},
   }));
   // clang-format on
-
-  std::unique_ptr<Array4D<float>> aexpected =
-      ReferenceUtil::ConvArray4D(input, filter, {1, 1}, Padding::kSame);
-
-  auto input_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(input))
-          .ConsumeValueOrDie();
-  auto filter_literal =
-      client_->TransferToServer(*Literal::CreateR4FromArray4D(filter))
-          .ConsumeValueOrDie();
-
-  ComputeAndCompareR4<float>(&builder, *aexpected,
-                             {input_literal.get(), filter_literal.get()},
-                             error_spec_);
+  ComputeAndCompare(&builder, conv,
+                    {*Literal::CreateFromArray(input_data),
+                     *Literal::CreateFromArray(filter_data)},
+                    error_spec_);
 }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
@@ -420,9 +370,12 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
     ConvolutionDimensionNumbers dnums;
     dnums.set_input_batch_dimension(0);
     dnums.set_output_batch_dimension(0);
-    dnums.add_spatial_dimensions(1);
-    dnums.add_spatial_dimensions(2);
-    dnums.add_spatial_dimensions(3);
+    dnums.add_input_spatial_dimensions(1);
+    dnums.add_output_spatial_dimensions(1);
+    dnums.add_input_spatial_dimensions(2);
+    dnums.add_output_spatial_dimensions(2);
+    dnums.add_input_spatial_dimensions(3);
+    dnums.add_output_spatial_dimensions(3);
     dnums.set_input_feature_dimension(4);
     dnums.set_output_feature_dimension(4);
     dnums.add_kernel_spatial_dimensions(0);
@@ -473,8 +426,10 @@ XLA_TEST_F(ConvolutionTest, Convolve2D_1x3x3x5_3x3x5x5_Valid) {
     ConvolutionDimensionNumbers dnums;
     dnums.set_input_batch_dimension(0);
     dnums.set_output_batch_dimension(0);
-    dnums.add_spatial_dimensions(1);
-    dnums.add_spatial_dimensions(2);
+    dnums.add_input_spatial_dimensions(1);
+    dnums.add_output_spatial_dimensions(1);
+    dnums.add_input_spatial_dimensions(2);
+    dnums.add_output_spatial_dimensions(2);
     dnums.set_input_feature_dimension(3);
     dnums.set_output_feature_dimension(3);
     dnums.add_kernel_spatial_dimensions(0);
@@ -508,6 +463,54 @@ XLA_TEST_F(ConvolutionTest, Convolve2D_1x3x3x5_3x3x5x5_Valid) {
                            error_spec_);
 }
 
+// Test fixture to run convolution tests with and without convolution
+// canonicalization enabled.
+class ConvolveWithAndWithoutCanonicalization
+    : public ConvolutionTest,
+      public ::testing::WithParamInterface<bool> {};
+
+XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
+           DISABLED_ON_GPU(Convolve2D_NoSpatialDims)) {
+  if (GetParam()) {
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "convolution-canonicalization");
+  }
+  ComputationBuilder builder(client_, TestName());
+  Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29});
+  Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10});
+
+  auto input = builder.Parameter(0, input_shape, "input");
+  auto filter = builder.Parameter(1, filter_shape, "filter");
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_feature_dimension(0);
+  dnums.set_input_batch_dimension(1);
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_output_batch_dimension(0);
+  dnums.set_output_feature_dimension(1);
+  auto conv = builder.ConvWithGeneralDimensions(input, filter, {},
+                                                Padding::kValid, dnums);
+
+  Array2D<float> param0(4, 29);
+  param0.FillUnique();
+
+  Array2D<float> param1(4, 10);
+  param1.FillUnique();
+
+  Array2D<float> expected_result(29, 10);
+  expected_result.Fill(0);
+
+  ComputeAndCompare(
+      &builder, conv,
+      {*Literal::CreateFromArray(param0), *Literal::CreateFromArray(param1)},
+      error_spec_);
+}
+
+INSTANTIATE_TEST_CASE_P(ConvolveWithAndWithoutCanonicalization_Instantiation,
+                        ConvolveWithAndWithoutCanonicalization,
+                        ::testing::Values(true, false));
+
 struct Convolve1DTestParam {
   int64 input_feature;
   int64 output_feature;
@@ -540,7 +543,8 @@ XLA_TEST_P(Convolve1D1WindowTest, Convolve1D1Window) {
     ConvolutionDimensionNumbers dnums;
     dnums.set_input_batch_dimension(0);
     dnums.set_output_batch_dimension(0);
-    dnums.add_spatial_dimensions(1);
+    dnums.add_input_spatial_dimensions(1);
+    dnums.add_output_spatial_dimensions(1);
     dnums.set_input_feature_dimension(2);
     dnums.set_output_feature_dimension(2);
     dnums.add_kernel_spatial_dimensions(0);
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 9b36e3722b8f8a5d01c426425fdfb0c4b9ae3a16..9c1145def8c11f1222c63adf006102887d49f00d 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -320,9 +320,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter3x3in2x2Padded) {
   Array4D<float> input_array(1, 1, 2, 2, {1, 2, 3, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
 
-  const Array4D<float> filter_array(1, 1, 3, 3, {10000, 0, 1000,  // row 0
-                                                 0, 100, 0,       // row 1
-                                                 10, 0, 1});      // row 2
+  const Array4D<float> filter_array(1, 1, 3, 3,
+                                    {10000, 0, 1000,  // row 0
+                                     0, 100, 0,       // row 1
+                                     10, 0, 1});      // row 2
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
 
   builder.Conv(input, filter, {1, 1}, Padding::kSame);
@@ -472,7 +473,9 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x2Input3x1x2x2) {
   builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
   std::vector<float> expected_data = {
-      23, 33, 43,
+      23,
+      33,
+      43,
   };
   Array4D<float> expected(bs, 1, 1, 1, expected_data);
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
@@ -669,10 +672,11 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
   std::iota(input_data.begin(), input_data.end(), 1.0);
   Array4D<float> input_array(1, 1, 3, 4, input_data);
 
-  Array4D<float> filter_array(1, 1, 4, 3, {100, 10, 1,  //
-                                           200, 20, 2,  //
-                                           300, 30, 3,  //
-                                           400, 40, 4});
+  Array4D<float> filter_array(1, 1, 4, 3,
+                              {100, 10, 1,  //
+                               200, 20, 2,  //
+                               300, 30, 3,  //
+                               400, 40, 4});
   auto input = builder.ConstantR4FromArray4D<float>(input_array);
   auto filter = builder.ConstantR4FromArray4D<float>(filter_array);
   builder.ConvGeneralDilated(
@@ -681,9 +685,10 @@ XLA_TEST_F(ConvolutionVariantsTest, FlatLhsDilation) {
       /*rhs_dilation=*/{},
       ComputationBuilder::CreateDefaultConvDimensionNumbers());
 
-  Array4D<float> expected(1, 1, 3, 5, {204, 40, 406, 60, 608,       //
-                                       1518, 180, 1821, 210, 2124,  //
-                                       4146, 460, 4651, 510, 5156});
+  Array4D<float> expected(1, 1, 3, 5,
+                          {204, 40, 406, 60, 608,       //
+                           1518, 180, 1821, 210, 2124,  //
+                           4146, 460, 4651, 510, 5156});
   ComputeAndCompareR4<float>(&builder, expected, {}, error_spec_);
 }
 
@@ -926,7 +931,8 @@ XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x1x1_Filter16x16x1x1) {
   ComputeAndCompareR4<float>(&builder, *expected, {}, error_spec_);
 }
 
-XLA_TEST_F(ConvolutionVariantsTest, RandomData_Input16x16x16x16_Filter16x16x16x16) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           RandomData_Input16x16x16x16_Filter16x16x16x16) {
   constexpr int bs = 16;
   constexpr int iz = 16;
   constexpr int oz = 16;
@@ -976,8 +982,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x2x1x1Input1x2x3x1GeneralPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1018,8 +1026,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1GeneralPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1060,8 +1070,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x1x1Input1x2x3x1NoPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1099,8 +1111,10 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
   // NHWC input format.
   dnums.set_input_batch_dimension(0);
   dnums.set_output_batch_dimension(0);
-  dnums.add_spatial_dimensions(1);
-  dnums.add_spatial_dimensions(2);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
   dnums.set_input_feature_dimension(3);
   dnums.set_output_feature_dimension(3);
 
@@ -1131,7 +1145,8 @@ XLA_TEST_F(ConvolutionVariantsTest, Filter1x1x2x3Input1x2x3x2NoPadding) {
 //   Conv([1,2,3], Reverse([5,6]), padding_low=1)
 // into
 //   BackwardInputConv([1,2,3], [5,6], padding_low=0, padding_high=1)
-XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           BackwardInputLowPaddingLessThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
@@ -1149,7 +1164,8 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingLessThanHighPadding)
 //   Conv([1], Reverse([1,10,100]), padding_high=3, base_dilation=3)
 // into
 //   BackwardInputConv([1], [1,10,100], stride=3, padding=(2,1))
-XLA_TEST_F(ConvolutionVariantsTest, BackwardInputLowPaddingGreaterThanHighPadding) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           BackwardInputLowPaddingGreaterThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   auto gradients = builder.ConstantR4FromArray4D<float>(
@@ -1206,7 +1222,8 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputWithNegativePaddingHigh) {
   ComputeAndCompareR4<float>(&builder, {{{{12, 23, 30, 0}}}}, {}, error_spec_);
 }
 
-XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingLessThanHighPadding) {
+XLA_TEST_F(ConvolutionVariantsTest,
+           BackwardFilterLowPaddingLessThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,1,2,3,4,0,0
@@ -1230,7 +1247,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterLowPaddingLessThanHighPadding)
 }
 
 XLA_TEST_F(ConvolutionVariantsTest,
-       BackwardFilterLowPaddingGreaterThanHighPadding) {
+           BackwardFilterLowPaddingGreaterThanHighPadding) {
   ComputationBuilder builder(client_, TestName());
 
   // activations:      1,2,3,4  ---pad--> 0,0,1,2,3,4
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index cf089d748dcd4f5db637ff9087c5fbc504c82572..2058cd04a5e765e22be1733c835f07e237afbfbd 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -177,15 +177,15 @@ void DotOperationTest::TestSquareMatrixDot(bool lhs_row_major,
                                            bool rhs_row_major) {
   auto lhs_handle =
       client_
-          ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
+          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
               {{1.0, 2.0}, {3.0, -4.0}},
-              MinorToMajorForIsRowMajor(lhs_row_major)))
+              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major))))
           .ConsumeValueOrDie();
   auto rhs_handle =
       client_
-          ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
+          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
               {{1.0, 6.0}, {7.0, -4.0}},
-              MinorToMajorForIsRowMajor(rhs_row_major)))
+              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major))))
           .ConsumeValueOrDie();
 
   ComputationBuilder builder(client_, TestName());
@@ -277,10 +277,64 @@ XLA_TEST_F(DotOperationTest, MatrixDotF32_260_3_520_MinorToMajorFF) {
   TestMatrixDot(260, 3, 520, false, false);
 }
 
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x8) {
+  TestMatrixDot(1, 8, 8, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x130x8) {
+  TestMatrixDot(1, 130, 8, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x130) {
+  TestMatrixDot(1, 8, 130, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x290x130) {
+  TestMatrixDot(1, 290, 130, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_2x1x1) {
+  TestMatrixDot(2, 1, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_8x8x1) {
+  TestMatrixDot(8, 8, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x1x1) {
+  TestMatrixDot(16, 1, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_16x3x1) {
+  TestMatrixDot(16, 3, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_3x3x1) {
+  TestMatrixDot(3, 3, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_29x29x1) {
+  TestMatrixDot(29, 29, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x8x2) {
+  TestMatrixDot(1, 8, 2, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_1x2x8) {
+  TestMatrixDot(1, 2, 8, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1) {
+  TestMatrixDot(259, 258, 1, true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorDotF32_259x258x1_FT) {
+  TestMatrixDot(259, 258, 1, false, true);
+}
+
 XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFF) {
-  constexpr bool kLhsRowMajor = false;
-  constexpr bool kRhsRowMajor = false;
-  TestSquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+  TestSquareMatrixDot<float>(false, false);
 }
 
 XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorFT) {
@@ -291,10 +345,24 @@ XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTF) {
   TestSquareMatrixDot<float>(true, false);
 }
 
-TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
-  constexpr bool kLhsRowMajor = true;
-  constexpr bool kRhsRowMajor = true;
-  TestSquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+XLA_TEST_F(DotOperationTest, SquareMatrixDotF32MinorToMajorTT) {
+  TestSquareMatrixDot<float>(true, true);
+}
+
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFF) {
+  TestSquareMatrixDot<complex64>(false, false);
+}
+
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorFT) {
+  TestSquareMatrixDot<complex64>(false, true);
+}
+
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTF) {
+  TestSquareMatrixDot<complex64>(true, false);
+}
+
+XLA_TEST_F(DotOperationTest, SquareMatrixDotC64MinorToMajorTT) {
+  TestSquareMatrixDot<complex64>(true, true);
 }
 
 XLA_TEST_F(DotOperationTest, SquareMatrixDotF64) {
@@ -306,15 +374,15 @@ void DotOperationTest::TestNonsquareMatrixDot(bool lhs_row_major,
                                               bool rhs_row_major) {
   auto lhs_handle =
       client_
-          ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
+          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
               {{1.0, 2.0, 3.0}, {3.0, -4.0, -1.0}},
-              MinorToMajorForIsRowMajor(lhs_row_major)))
+              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(lhs_row_major))))
           .ConsumeValueOrDie();
   auto rhs_handle =
       client_
-          ->TransferToServer(*test_utils::CreateR2LiteralWithLayout<Element>(
+          ->TransferToServer(*Literal::CreateR2WithLayout<Element>(
               {{1.0, 6.0}, {2.0, 3.0}, {7.0, -4.0}},
-              MinorToMajorForIsRowMajor(rhs_row_major)))
+              LayoutUtil::MakeLayout(MinorToMajorForIsRowMajor(rhs_row_major))))
           .ConsumeValueOrDie();
 
   ComputationBuilder builder(client_, TestName());
@@ -330,35 +398,64 @@ void DotOperationTest::TestNonsquareMatrixDot(bool lhs_row_major,
 }
 
 XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorFF) {
-  constexpr bool kLhsRowMajor = false;
-  constexpr bool kRhsRowMajor = false;
-  TestNonsquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+  TestNonsquareMatrixDot<float>(false, false);
 }
 
 XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorFT) {
-  constexpr bool kLhsRowMajor = false;
-  constexpr bool kRhsRowMajor = true;
-  TestNonsquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+  TestNonsquareMatrixDot<float>(false, true);
 }
 
 XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorTF) {
-  constexpr bool kLhsRowMajor = true;
-  constexpr bool kRhsRowMajor = false;
-  TestNonsquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+  TestNonsquareMatrixDot<float>(true, false);
 }
 
 XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF32MajorToMinorTT) {
-  constexpr bool kLhsRowMajor = true;
-  constexpr bool kRhsRowMajor = true;
-  TestNonsquareMatrixDot<float>(kLhsRowMajor, kRhsRowMajor);
+  TestNonsquareMatrixDot<float>(true, true);
 }
 
 XLA_TEST_F(DotOperationTest, NonsquareMatrixDotF64) {
   TestNonsquareMatrixDot<double>();
 }
 
-XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64) {
-  TestNonsquareMatrixDot<complex64>();
+XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorFF) {
+  TestNonsquareMatrixDot<complex64>(false, false);
+}
+
+XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorFT) {
+  TestNonsquareMatrixDot<complex64>(false, true);
+}
+
+XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorTF) {
+  TestNonsquareMatrixDot<complex64>(true, false);
+}
+
+XLA_TEST_F(DotOperationTest, NonsquareMatrixDotC64MajorToMinorTT) {
+  TestNonsquareMatrixDot<complex64>(true, true);
+}
+
+XLA_TEST_F(DotOperationTest, MatrixVectorC64) {
+  auto lhs_handle =
+      client_
+          ->TransferToServer(*Literal::CreateR2WithLayout<complex64>(
+              {{1.0, 2.0, 3.0, -4.0}}, LayoutUtil::MakeLayout({1, 0})))
+          .ConsumeValueOrDie();
+  auto rhs_handle =
+      client_
+          ->TransferToServer(*Literal::CreateR2WithLayout<complex64>(
+              {{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}, {-4.0, 4.0}},
+              LayoutUtil::MakeLayout({1, 0})))
+          .ConsumeValueOrDie();
+
+  ComputationBuilder builder(client_, TestName());
+  auto prim_type = primitive_util::NativeToPrimitiveType<complex64>();
+  auto result = builder.Dot(
+      builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {1, 4}), "lhs"),
+      builder.Parameter(1, ShapeUtil::MakeShape(prim_type, {4, 2}), "rhs"));
+
+  Array2D<complex64> expected({{30.0, -2.0}});
+
+  ComputeAndCompareR2<complex64>(
+      &builder, expected, {lhs_handle.get(), rhs_handle.get()}, error_spec_);
 }
 
 XLA_TEST_F(DotOperationTest, ConcurrentMatMul) {
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 19252f50f25eee42e4e492b7f0e2ec3960c62126..8baaf39e3cf8fa7f6fa4a0224c1297f82e0d92aa 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -250,9 +250,6 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     // Slice at dimension boundaries.
     RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {5},
                          {0, 1, 2, 3, 4, 8, 9, 10});
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6},
-                         {0, 1, 2, 3, 4, 5, 8, 9});
     // Zero-sized update.
     RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {}, {2},
                          {0, 1, 2, 3, 4, 5, 6, 7});
@@ -269,9 +266,6 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     // Slice at dimension boundaries.
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 1},
                          {{1, 2, 3}, {4, 5, 6}, {7, 10, 11}});
-    // Slice at dimension boundaries, but with sizes that cause indices to wrap.
-    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2},
-                         {{1, 2, 3}, {4, 5, 6}, {7, 8, 10}});
     // Zero-sized update.
     RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{}}, {2, 1},
                          {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}});
@@ -289,10 +283,20 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     RunR3<IndexT, DataT>(
         {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}},
         {1, 1, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 13}, {11, 15}}});
+  }
+
+  template <typename IndexT, typename DataT>
+  void TestWrap() {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
+    RunR1<IndexT, DataT>({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6},
+                         {10, 1, 2, 3, 4, 5, 8, 9});
+    // R2 Shape: [3, 3]
+    RunR2<IndexT, DataT>({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2},
+                         {{1, 2, 3}, {4, 5, 6}, {11, 8, 10}});
+    // R3 Shape: [2, 3, 2]
     RunR3<IndexT, DataT>(
         {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}},
-        {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 13}}});
+        {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -425,6 +429,12 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3<int64, int64>(); }
 
 XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3<uint64, uint64>(); }
 
+XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap<int32, float>(); }
+
+XLA_TEST_F(DynamicUpdateSliceTest, Int64Wrap) { TestWrap<int64, int64>(); }
+
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64Wrap) { TestWrap<uint64, uint64>(); }
+
 XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) {
   // Slice at dimension start.
   RunR1<int32, bool>({false, false, true, true, false, true, true, false},
@@ -497,19 +507,13 @@ XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElements) {
   RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/2);
 }
 
-// TODO(b/34128753) CPU and GPU failed on 2016-01-06.  Appears not to handle
-// wrapping as expected.
-XLA_TEST_F(DynamicUpdateSliceTest,
-           DISABLED_ON_CPU(DISABLED_ON_GPU(R3ContiguousMultipleWrapping))) {
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrapping) {
   // Multiple element, wrapping.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous(operand_shape, /*index=*/3, /*size=*/2);
 }
 
-// TODO(b/34128753) CPU and GPU failed on 2016-01-06.  Appears not to handle
-// wrapping as expected.
-XLA_TEST_F(DynamicUpdateSliceTest,
-           DISABLED_ON_CPU(DISABLED_ON_GPU(R3ContiguousTooLarge))) {
+XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousTooLarge) {
   // Multiple element, update size larger than operand.
   std::vector<int32> operand_shape({4, 5, 2});
   RunR3Contiguous(operand_shape, /*index=*/5, /*size=*/2);
@@ -555,7 +559,11 @@ void BM_DynamicSlice(int num_iters) {
   auto computation = builder.Build().ConsumeValueOrDie();
 
   // Initialize and transfer parameter buffer.
-  auto buffer = ScopedShapedBuffer::Allocate(start_indices_shape, &allocator, 0)
+  auto shape_size_fn = [client](const Shape& shape) {
+    return client->backend().transfer_manager()->GetByteSizeRequirement(shape);
+  };
+  auto buffer = ScopedShapedBuffer::Allocate(start_indices_shape, &allocator, 0,
+                                             shape_size_fn)
                     .ConsumeValueOrDie();
 
   auto start_indices_literal = Literal::CreateR1<int32>({0, 1, 2, 3});
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index a8f6488996087b57e3121ce2c7de918070950c72..2686afccc216095345dbb7b43e916fbbe7c8ea39 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -770,8 +770,6 @@ void BM_ParallelFusion(int num_iters) {
   auto client =
       ClientLibrary::GetOrCreateLocalClient(client_options).ValueOrDie();
 
-  auto* transfer_manager =
-      TransferManager::GetForPlatform(platform).ValueOrDie();
   int device_ordinal = client->default_device_ordinal();
 
   // Computation shape parameters.
@@ -796,29 +794,23 @@ void BM_ParallelFusion(int num_iters) {
   auto computation = builder.Build().ConsumeValueOrDie();
 
   // Transfer literals to device.
-  auto buffer0 =
-      ScopedShapedBuffer::Allocate(shape0, &allocator, /*device_ordinal=*/0)
-          .ConsumeValueOrDie();
   auto param0_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param0_dim0, param0_dim1);
-  ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *param0_literal, buffer0->mutable_buffer({})));
-
-  auto buffer1 =
-      ScopedShapedBuffer::Allocate(shape1, &allocator, /*device_ordinal=*/0)
+  std::unique_ptr<ShapedBuffer> buffer0 =
+      client->LiteralToShapedBuffer(*param0_literal, device_ordinal)
           .ConsumeValueOrDie();
+
   auto param1_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param1_dim0, param1_dim1);
-  ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *param1_literal, buffer1->mutable_buffer({})));
-
-  auto buffer2 =
-      ScopedShapedBuffer::Allocate(shape2, &allocator, /*device_ordinal=*/0)
+  std::unique_ptr<ShapedBuffer> buffer1 =
+      client->LiteralToShapedBuffer(*param1_literal, device_ordinal)
           .ConsumeValueOrDie();
+
   auto param2_literal =
       Literal::CreateR2F32Linspace(1.0, 2.0, param2_dim0, param2_dim1);
-  ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      executors[device_ordinal], *param2_literal, buffer2->mutable_buffer({})));
+  std::unique_ptr<ShapedBuffer> buffer2 =
+      client->LiteralToShapedBuffer(*param2_literal, device_ordinal)
+          .ConsumeValueOrDie();
 
   // Build executable.
   std::unique_ptr<LocalExecutable> executable =
@@ -828,7 +820,7 @@ void BM_ParallelFusion(int num_iters) {
                     ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
-  se::Stream stream(executors[client->default_device_ordinal()]);
+  se::Stream stream(executors[device_ordinal]);
   stream.Init();
 
   // Initialize thread pool.
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index d73c05ff92578209143e0679558848160cae99bd..e7a18828db064f82cad2a15f797b557d2be1f88a 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -15,13 +15,22 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
+#include <memory>
 #include <set>
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -30,18 +39,72 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 
+namespace {
+
+using tensorflow::StringPiece;
+using tensorflow::gtl::ArraySlice;
+using tensorflow::gtl::optional;
+
+constexpr char kInterpreter[] = "interpreter";
+
+// Helper functions to get test and reference platforms.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
+se::Platform* GetTestPlatform() {
+  auto result = PlatformUtil::GetDefaultPlatform();
+  TF_CHECK_OK(result.status()) << "could not get test platform";
+  return result.ValueOrDie();
+}
+
+bool ProgramShapesEqual(const ProgramShape& lhs, const ProgramShape& rhs) {
+  if (lhs.parameters_size() != rhs.parameters_size()) {
+    return false;
+  }
+  for (int i = 0; i < lhs.parameters_size(); i++) {
+    if (!ShapeUtil::Equal(lhs.parameters(i), rhs.parameters(i))) {
+      return false;
+    }
+  }
+  return ShapeUtil::Equal(lhs.result(), rhs.result());
+}
+
+ProgramShape GetProgramShapeWithLayout(const HloModule& module) {
+  ProgramShape program_shape;
+  const auto* entry = module.entry_computation();
+  for (const auto* param : entry->parameter_instructions()) {
+    *program_shape.add_parameters() = param->shape();
+    *program_shape.add_parameter_names() = param->name();
+  }
+  *program_shape.mutable_result() = entry->root_instruction()->shape();
+  return program_shape;
+}
+
+}  // namespace
+
+HloTestBase::HloTestBase()
+    : HloTestBase(GetTestPlatform(), GetReferencePlatform()) {}
+
+HloTestBase::HloTestBase(se::Platform* test_platform,
+                         se::Platform* reference_platform)
+    : test_runner_(test_platform), reference_runner_(reference_platform) {}
+
 /* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule() {
   HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsForTest());
+  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
+                               config);
+}
 
+/*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
   auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
-
-  config.set_debug_options(debug_options);
-
-  return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
-                               config);
+  return debug_options;
 }
 
 StatusOr<perftools::gputools::DeviceMemoryBase> HloTestBase::Execute(
@@ -49,25 +112,168 @@ StatusOr<perftools::gputools::DeviceMemoryBase> HloTestBase::Execute(
     tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
         arguments,
     Shape* result_shape) {
-  return runner_.Execute(std::move(module), arguments, result_shape);
+  return test_runner_.Execute(std::move(module), arguments, result_shape);
 }
 
 se::DeviceMemoryBase HloTestBase::TransferToDevice(const Literal& literal) {
-  return runner_.TransferToDevice(literal).ValueOrDie();
+  return test_runner_.TransferToDevice(literal).ValueOrDie();
 }
 
 std::unique_ptr<Literal> HloTestBase::TransferFromDevice(
     const Shape& shape, se::DeviceMemoryBase device_base) {
-  return runner_.TransferFromDevice(shape, device_base).ValueOrDie();
+  return test_runner_.TransferFromDevice(shape, device_base).ValueOrDie();
 }
 
 std::unique_ptr<Literal> HloTestBase::ExecuteAndTransfer(
     std::unique_ptr<HloModule> module,
     tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
-  return runner_.ExecuteAndTransfer(std::move(module), arguments).ValueOrDie();
+  return test_runner_.ExecuteAndTransfer(std::move(module), arguments)
+      .ValueOrDie();
+}
+
+StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
+    const HloModule& test_module,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  std::unique_ptr<HloModule> reference_module = test_module.Clone();
+  const auto& program_shape = GetProgramShapeWithLayout(test_module);
+
+  if (reference_preprocessor != nullptr) {
+    reference_preprocessor(reference_module.get());
+    if (!ProgramShapesEqual(program_shape,
+                            GetProgramShapeWithLayout(*reference_module))) {
+      return InvalidArgument(
+          "reference preprocessor must not modify the program shape");
+    }
+  }
+  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
+                                     reference_module.get()));
+  return std::move(reference_module);
+}
+
+template <typename LiteralPtr>
+StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error, bool run_hlo_passes,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  static_assert(
+      std::is_same<Literal*, LiteralPtr>::value ||
+          std::is_same<std::unique_ptr<Literal>, LiteralPtr>::value,
+      "The LiteralPtr type only accepts Literal* or std::unique_ptr<Literal>.");
+  TF_RETURN_IF_ERROR(
+      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_ASSIGN_OR_RETURN(auto reference_module,
+                      MakeReferenceModule(*module, reference_preprocessor));
+
+  // Execute on two backends.
+  TF_ASSIGN_OR_RETURN(
+      auto test,
+      test_runner_.Execute(std::move(module), arguments, run_hlo_passes));
+  TF_ASSIGN_OR_RETURN(auto reference,
+                      reference_runner_.Execute(std::move(reference_module),
+                                                arguments, run_hlo_passes));
+  return LiteralTestUtil::NearOrEqual(/*expected=*/*reference, /*actual=*/*test,
+                                      error);
+}
+
+template <typename LiteralPtr>
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto result =
+      RunAndCompareInternal(std::move(module), arguments, error,
+                            /*run_hlo_passes=*/true, reference_preprocessor);
+  if (!result.ok()) {
+    return ::testing::AssertionFailure() << result.status();
+  }
+  return result.ValueOrDie();
+}
+
+template <typename LiteralPtr>
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    std::unique_ptr<HloModule> module, const ArraySlice<LiteralPtr> arguments,
+    const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto result =
+      RunAndCompareInternal(std::move(module), arguments, error,
+                            /*run_hlo_passes=*/false, reference_preprocessor);
+  if (!result.ok()) {
+    return ::testing::AssertionFailure() << result.status();
+  }
+  return result.ValueOrDie();
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    std::unique_ptr<HloModule> module, const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  const auto& fake_arguments =
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
+  return RunAndCompare<std::unique_ptr<Literal>>(
+      std::move(module), fake_arguments, error, reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    std::unique_ptr<HloModule> module, const optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  const auto& fake_arguments =
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
+  return RunAndCompareNoHloPasses<std::unique_ptr<Literal>>(
+      std::move(module), fake_arguments, error, reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompare(
+    const StringPiece hlo_string,
+    const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure() << "failed parsing hlo textual IR";
+  }
+  return RunAndCompare(module_or_status.ConsumeValueOrDie(), error,
+                       reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
+    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "failed reading hlo module from file";
+  }
+  return RunAndCompare(module_or_status.ConsumeValueOrDie(), error,
+                       reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPasses(
+    const StringPiece hlo_string,
+    const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure() << "failed parsing hlo textual IR";
+  }
+  return RunAndCompareNoHloPasses(module_or_status.ConsumeValueOrDie(), error,
+                                  reference_preprocessor);
+}
+
+::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
+    const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+    const std::function<void(HloModule*)>& reference_preprocessor) {
+  auto module_or_status =
+      HloRunner::ReadModule(filename, GetDebugOptionsForTest());
+  if (!module_or_status.ok()) {
+    return ::testing::AssertionFailure()
+           << "failed reading hlo module from file";
+  }
+  return RunAndCompareNoHloPasses(module_or_status.ConsumeValueOrDie(), error,
+                                  reference_preprocessor);
 }
 
-Backend& HloTestBase::backend() { return runner_.backend(); }
+Backend& HloTestBase::backend() { return test_runner_.backend(); }
 
 /* static */
 string HloTestBase::TestName() {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 7f068dce36be3546298de2f06bf6d33446d07ca2..3cbbb7aa247dda3e5b6589a2a6aa74cf074babe7 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -24,31 +24,74 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 
-// A base class for tests which build and run HLO code. This is a lower level of
-// abstraction than using the client interface and enables, for one, explicitly
-// building a graph of HLO instructions to run.
+// A base class for tests which build and/or run HLO code. The class includes
+// support for running an HLO module on two platforms and compare the results.
+// This is a lower level of abstraction than using the client interface and
+// enables, for one, explicitly building a graph of HLO instructions to run.
+//
+// This can also be used to write text/file-based test cases. Note that the test
+// target is responsible for linking the needed backends. A covenient way to do
+// this is to make it an xla_test: it will generate test targets linking with
+// the respective backends, which will be used as the test backend; the
+// interpreter backend is already linked with hlo_test_base so it will be the
+// default reference backend. For example, if you want to compare both cpu vs.
+// interpreter, and gpu vs. interpreter, you can:
+//
+//  xla_test (
+//    name = "sample_text_test",
+//    srcs = ["sample_text_test.cc"],
+//    backends = [
+//      "cpu",
+//      "gpu",
+//    ],
+//    deps = [
+//      "//third_party/tensorflow/compiler/xla/tests:hlo_test_base",
+//      ...
+//    ],
+//  )
+//
+// For a more detailed example, see "../tests/sample_text_test.cc".
 class HloTestBase : public ::testing::Test {
  protected:
-  HloTestBase() {}
+  // This uses the interpreter backend as the reference backend and
+  // automatically finds another supported backend as the test backend. If the
+  // interpreter is the only supported backend, it will be both the test backend
+  // and the reference backend.
+  HloTestBase();
+
+  // If your test doesn't use interpreter as the reference backend, you can use
+  // this constructor. Note that your test target is responsible for linking in
+  // both needed backends.
+  HloTestBase(::perftools::gputools::Platform* test_platform,
+              ::perftools::gputools::Platform* reference_platform);
 
   ~HloTestBase() override {}
 
   // Creates a new HLO module for a test. The module created will have
   // TestName() for its name; it will also automatically populate its debug
-  // options from command-line flags. It's recommended to use this method to
-  // create all HloModules for tests.
+  // options from command-line flags. If you want a fresh HloModule object and
+  // then add HloComputations to it, it's recommended to use this method in your
+  // tests.
   static std::unique_ptr<HloModule> CreateNewModule();
 
+  // Populates debug options from command-line flags and adjusts the options for
+  // testing. It is recommended to use this when you need to pass in
+  // DebugOptions, e.g. when creating a module from a string or a file.
+  static DebugOptions GetDebugOptionsForTest();
+
   // Executes the given module and returns a global data handle.
   StatusOr<perftools::gputools::DeviceMemoryBase> Execute(
       std::unique_ptr<HloModule> module,
@@ -71,6 +114,73 @@ class HloTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
           arguments);
 
+  // Executes the given hlo module on two backends and compares results.
+  //
+  // 'arguments': the input of the hlo module. The LiteralPtr type accepts
+  // Literal* or std::unique_ptr<Literal>.
+  //
+  // 'error': if has value, expects the results to be near (within the error
+  // bound). Otherwise, expects the results to be equal.
+  //
+  // 'reference_preprocessor': the module should be ready to run on the test
+  // backend, but it might need to be tailored so that it is able to run on the
+  // reference backend. Note that the program shape of the module must not be
+  // modified.
+  template <typename LiteralPtr>
+  ::testing::AssertionResult RunAndCompare(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Same as above, except that the module will be executed without Hlo
+  // optimization.
+  template <typename LiteralPtr>
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Executes an hlo module with fake inputs and compares the results.
+  ::testing::AssertionResult RunAndCompare(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Same as above, except that the module will be executed without Hlo
+  // optimization.
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
+  // Convenient wrappers for executing and comparing an hlo module with fake
+  // input. Module can be passed in directly, or parsed from an hlo_string,
+  // or loaded from a file.
+  ::testing::AssertionResult RunAndCompare(
+      const tensorflow::StringPiece hlo_string,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareFromFile(
+      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareNoHloPasses(
+      const tensorflow::StringPiece hlo_string,
+      const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunAndCompareNoHloPassesFromFile(
+      const string& filename, const tensorflow::gtl::optional<ErrorSpec>& error,
+      const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
+      TF_MUST_USE_RESULT;
+
   // Convenience method to force the layout of a given parameter in a module.
   // The layout of parameter number 'param_no' in the 'module' is set to
   // 'layout'.
@@ -101,12 +211,31 @@ class HloTestBase : public ::testing::Test {
 
   static string TestName();
 
-  // Returns the backend owned by the HloRunner.
+  // Returns the backend owned by the test runner.
   Backend& backend();
 
-  HloRunner runner_;
+  HloRunner test_runner_;
+  HloRunner reference_runner_;
 
   ErrorSpec error_spec_{0.0001};
+
+ private:
+  // Given the test module, makes a reference module that is ready to run on the
+  // reference platform. This assumes that the given module is ready to run on
+  // the test platform.
+  StatusOr<std::unique_ptr<HloModule>> MakeReferenceModule(
+      const HloModule& test_module,
+      const std::function<void(HloModule*)>& reference_preprocessor);
+
+  // Runs the module on two platforms with or without running hlo passes and
+  // compares the results. Returns whether the results are near or equal. If any
+  // error happens before the results are computed, returns the error status.
+  template <typename LiteralPtr>
+  StatusOr<::testing::AssertionResult> RunAndCompareInternal(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<LiteralPtr> arguments,
+      const tensorflow::gtl::optional<ErrorSpec>& error, bool run_hlo_passes,
+      const std::function<void(HloModule*)>& reference_preprocessor);
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/isolated_convolution.hlo b/tensorflow/compiler/xla/tests/isolated_convolution.hlo
new file mode 100644
index 0000000000000000000000000000000000000000..9452780930efbb1ecc13b35cd4ab53678d36c37f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/isolated_convolution.hlo
@@ -0,0 +1,8 @@
+HloModule convolution.167:
+
+ENTRY %convolution.167 (parameter.0: f32[16,28,28,128], parameter.1: f32[3,3,128,128]) -> f32[16,28,28,128] {
+  %parameter.0 = f32[16,28,28,128]{3,0,2,1} parameter(0)
+  %parameter.1 = f32[3,3,128,128]{3,2,1,0} parameter(1)
+  ROOT %convolution.167 = f32[16,28,28,128]{3,0,2,1} convolution(f32[16,28,28,128]{3,0,2,1} %parameter.0, f32[3,3,128,128]{3,2,1,0} %parameter.1), window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01oi->b01f
+}
+
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 95a52ecd2f5cfc97ec1ccba7d1b7ca6257a8267e..bf6631a4310d3504e4dfa8c46bf66125a94b9315 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -100,6 +100,58 @@ namespace xla {
   ASSERT_EQ(expected.ShortDebugString(), actual.ShortDebugString());
 }
 
+/* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertBF16ToF32(
+    const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<std::unique_ptr<Literal>> converted_elements;
+    for (const auto& element : literal.tuple_literals()) {
+      converted_elements.push_back(ConvertBF16ToF32(element));
+    }
+    return Literal::MakeTupleOwned(std::move(converted_elements));
+  }
+
+  if (literal.shape().element_type() != BF16) {
+    return MakeUnique<Literal>(literal);
+  }
+  Shape converted_shape = literal.shape();
+  converted_shape.set_element_type(F32);
+  auto converted = Literal::CreateFromShape(converted_shape);
+  if (!ShapeUtil::HasZeroElements(converted_shape)) {
+    std::vector<int64> index(converted_shape.dimensions_size(), 0);
+    do {
+      converted->Set<float>(index,
+                            static_cast<float>(literal.Get<bfloat16>(index)));
+    } while (IndexUtil::BumpIndices(converted_shape, &index));
+  }
+  return converted;
+}
+
+/* static */ std::unique_ptr<Literal> LiteralTestUtil::ConvertF32ToBF16(
+    const Literal& literal) {
+  if (ShapeUtil::IsTuple(literal.shape())) {
+    std::vector<std::unique_ptr<Literal>> converted_elements;
+    for (const auto& element : literal.tuple_literals()) {
+      converted_elements.push_back(ConvertF32ToBF16(element));
+    }
+    return Literal::MakeTupleOwned(std::move(converted_elements));
+  }
+
+  if (literal.shape().element_type() != F32) {
+    return MakeUnique<Literal>(literal);
+  }
+  Shape converted_shape = literal.shape();
+  converted_shape.set_element_type(BF16);
+  auto converted = Literal::CreateFromShape(converted_shape);
+  if (!ShapeUtil::HasZeroElements(converted_shape)) {
+    std::vector<int64> index(converted_shape.dimensions_size(), 0);
+    do {
+      converted->Set<bfloat16>(
+          index, static_cast<bfloat16>(literal.Get<float>(index)));
+    } while (IndexUtil::BumpIndices(converted_shape, &index));
+  }
+  return converted;
+}
+
 namespace {
 
 string Hostname() {
@@ -116,16 +168,18 @@ template <typename FloatT, typename UnsignedT>
 ::testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) {
   auto ulhs = tensorflow::bit_cast<UnsignedT>(lhs);
   auto urhs = tensorflow::bit_cast<UnsignedT>(rhs);
+  auto lhs_double = static_cast<double>(lhs);
+  auto rhs_double = static_cast<double>(rhs);
   if (ulhs != urhs) {
     return ::testing::AssertionFailure() << tensorflow::strings::Printf(
                "floating values are not bitwise-equal; and equality testing "
                "was requested: %s=%g=%a vs %s=%g=%a",
                tensorflow::strings::StrCat(tensorflow::strings::Hex(ulhs))
                    .c_str(),
-               lhs, lhs,
+               lhs_double, lhs_double,
                tensorflow::strings::StrCat(tensorflow::strings::Hex(urhs))
                    .c_str(),
-               rhs, rhs);
+               rhs_double, rhs_double);
   }
   return ::testing::AssertionSuccess();
 }
@@ -149,6 +203,10 @@ template <typename NativeT>
 // Specializations for floating types that do bitwise comparisons when equality
 // comparison is requested.
 template <>
+::testing::AssertionResult CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs) {
+  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs);
+}
+template <>
 ::testing::AssertionResult CompareEqual<float>(float lhs, float rhs) {
   return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs);
 }
@@ -238,6 +296,9 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
     case U64:
       match = ExpectLiteralsEqual<uint64>(expected, actual, &multi_index, 0);
       break;
+    case BF16:
+      match = ExpectLiteralsEqual<bfloat16>(expected, actual, &multi_index, 0);
+      break;
     case F32:
       match = ExpectLiteralsEqual<float>(expected, actual, &multi_index, 0);
       break;
@@ -272,23 +333,37 @@ bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual,
   return result;
 }
 
-/* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
-                                                    const Literal& actual) {
+/* static */ ::testing::AssertionResult LiteralTestUtil::EqualTuple(
+    const Literal& expected, const Literal& actual) {
   VLOG(1) << "expected: " << expected.ToString();
   VLOG(1) << "actual:   " << actual.ToString();
 
-  ASSERT_TRUE(ShapeUtil::IsTuple(expected.shape()));
-  ASSERT_TRUE(ShapeUtil::IsTuple(actual.shape()));
+  if (!ShapeUtil::IsTuple(expected.shape()) ||
+      !ShapeUtil::IsTuple(actual.shape())) {
+    return ::testing::AssertionFailure()
+           << "tuples expected shape = " << expected.shape().ShortDebugString()
+           << " actual shape = " << actual.shape().ShortDebugString();
+  }
   AssertEqualShapes(expected.shape(), actual.shape());
   for (uint64 i = 0; i < expected.tuple_literals_size(); ++i) {
     const auto& expected_element = expected.tuple_literals(i);
     const auto& actual_element = actual.tuple_literals(i);
     if (ShapeUtil::IsTuple(expected_element.shape())) {
-      ExpectEqualTuple(expected_element, actual_element);
+      auto ret = EqualTuple(expected_element, actual_element);
+      if (!ret) {
+        return ret;
+      }
     } else {
-      ExpectEqual(expected_element, actual_element);
+      return Equal(expected_element, actual_element);
     }
   }
+
+  return ::testing::AssertionSuccess();
+}
+
+/* static */ void LiteralTestUtil::ExpectEqualTuple(const Literal& expected,
+                                                    const Literal& actual) {
+  EXPECT_TRUE(EqualTuple(expected, actual));
 }
 
 namespace {
@@ -331,6 +406,9 @@ class NearComparator {
     multi_index_.resize(expected.shape().dimensions_size(), 0);
 
     switch (expected.shape().element_type()) {
+      case BF16:
+        ExpectLiteralsNear<bfloat16>(expected, actual, 0);
+        break;
       case F32:
         ExpectLiteralsNear<float>(expected, actual, 0);
         break;
@@ -516,6 +594,13 @@ void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
       << message;
 }
 
+template <>
+bool NearComparator::ExpectValuesNear<bfloat16>(bfloat16 expected,
+                                                bfloat16 actual) {
+  return ExpectValuesNear(static_cast<float>(expected),
+                          static_cast<float>(actual));
+}
+
 }  // namespace
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
@@ -544,8 +629,7 @@ void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
   if (!ShapeUtil::IsTuple(expected.shape()) ||
       !ShapeUtil::IsTuple(actual.shape())) {
     return ::testing::AssertionFailure()
-           << "tuples expected expected shape = "
-           << expected.shape().ShortDebugString()
+           << "tuples expected shape = " << expected.shape().ShortDebugString()
            << " actual shape = " << actual.shape().ShortDebugString();
   }
   AssertEqualShapes(expected.shape(), actual.shape());
@@ -579,6 +663,32 @@ void NearComparator::ExpectNear<complex64>(complex64 expected, complex64 actual,
   EXPECT_TRUE(NearTuple(expected, actual, error));
 }
 
+/*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual(
+    const Literal& expected, const Literal& actual,
+    const tensorflow::gtl::optional<ErrorSpec>& error) {
+  bool is_tuple = ShapeUtil::IsTuple(expected.shape());
+  if (error.has_value()) {
+    if (is_tuple) {
+      VLOG(1) << "Expects near tuple";
+      return NearTuple(expected, actual, *error);
+    }
+    VLOG(1) << "Expects near";
+    return Near(expected, actual, *error);
+  }
+  if (is_tuple) {
+    VLOG(1) << "Expects equal tuple";
+    return EqualTuple(expected, actual);
+  }
+  VLOG(1) << "Expects equal";
+  return Equal(expected, actual);
+}
+
+/*static*/ void LiteralTestUtil::ExpectNearOrEqual(
+    const Literal& expected, const Literal& actual,
+    const tensorflow::gtl::optional<ErrorSpec>& error) {
+  EXPECT_TRUE(NearOrEqual(expected, actual, error));
+}
+
 /* static */ string LiteralTestUtil::MultiIndexAsString(
     tensorflow::gtl::ArraySlice<int64> multi_index) {
   return tensorflow::strings::StrCat(
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 467d44b857b74d2a38e9b3f8a32a9b1d39a4a10d..f53553c70170bdcda717e72ffd791016effd0774 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -59,6 +60,16 @@ class LiteralTestUtil {
   static void AssertEqualShapesAndLayouts(const Shape& expected,
                                           const Shape& actual);
 
+  // If the given literal's data type is bfloat16, converts it to a float
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
+  static std::unique_ptr<Literal> ConvertBF16ToF32(const Literal& bf16_literal);
+
+  // If the given literal's data type is float, converts it to a bfloat16
+  // literal; otherwise, returns a copy of it. If the literal is a tuple,
+  // recursively converts its elements.
+  static std::unique_ptr<Literal> ConvertF32ToBF16(const Literal& f32_literal);
+
   // Asserts that the expected and actual literals are (bitwise) equal for all
   // elements in the literal. Also, asserts that the rank, dimensions sizes, and
   // primitive type are equal.
@@ -100,6 +111,10 @@ class LiteralTestUtil {
   static void ExpectR4EqualArray4D(const Array4D<NativeT>& expected,
                                    const Literal& actual);
 
+  // Returns whether the two tuples are equal.
+  static ::testing::AssertionResult EqualTuple(
+      const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT;
+
   // Expects that the values of the elements in the expected and actual tuples
   // are equal. Tuples are matched recursively.
   static void ExpectEqualTuple(const Literal& expected, const Literal& actual);
@@ -167,6 +182,19 @@ class LiteralTestUtil {
   static void ExpectNearTuple(const Literal& expected, const Literal& actual,
                               const ErrorSpec& error);
 
+  // If the error spec is given, returns whether the expected and the actual are
+  // within the error bound; otherwise, returns whether they are equal. Tuples
+  // will be compared recursively.
+  static ::testing::AssertionResult NearOrEqual(
+      const Literal& expected, const Literal& actual,
+      const tensorflow::gtl::optional<ErrorSpec>& error) TF_MUST_USE_RESULT;
+
+  // If the error spec is given, expects the expected and the actual to be near;
+  // otherwise, expects them to be equal. Tuples will be compared recursively.
+  static void ExpectNearOrEqual(
+      const Literal& expected, const Literal& actual,
+      const tensorflow::gtl::optional<ErrorSpec>& error);
+
   // Returns a multi-dimensional index as a string. For example: '{7, 8}' will
   // be returned for a 2-dimensional index with dimension 0 index equal to 7,
   // dimension 1 equal to 8.
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 458258e7ee1fee6964275c51ef38de5ff2ccd7b1..b5b95967ff9162301a092f3a57996e0f3f78658f 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -14,50 +14,147 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/llvm_compiler.h"
+#include "tensorflow/compiler/xla/service/backend.h"
+#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_compiler.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/stream_executor/stream_executor.h"
 
 namespace xla {
 namespace {
 
-class LLVMCompilerTest : public HloTestBase {};
-
-XLA_TEST_F(LLVMCompilerTest, CompilerHooks) {
-  int pre_opt_hook_call_count = 0;
-  int post_opt_hook_call_count = 0;
-
-  auto pre_opt_hook = [&pre_opt_hook_call_count](const llvm::Module &) {
-    ++pre_opt_hook_call_count;
-    return Status::OK();
-  };
-  auto post_opt_hook = [&post_opt_hook_call_count](const llvm::Module &) {
-    ++post_opt_hook_call_count;
-    return Status::OK();
-  };
-
-  // Create HLO module, and run the compiler.
-  auto builder = HloComputation::Builder(TestName());
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
-
-  auto hlo_module = CreateNewModule();
-  hlo_module->AddEntryComputation(builder.Build());
-
-  auto compiler = static_cast<LLVMCompiler *>(backend().compiler());
-  compiler->SetPreOptimizationHook(pre_opt_hook);
-  compiler->SetPostOptimizationHook(post_opt_hook);
-
-  ASSERT_TRUE(
-      compiler
-          ->Compile(std::move(hlo_module), backend().default_stream_executor())
-          .ok());
-
-  // Test that hooks were called.
-  EXPECT_EQ(1, pre_opt_hook_call_count);
-  EXPECT_EQ(1, post_opt_hook_call_count);
+class LLVMCompilerTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    Platform *platform = FindPlatform();
+    ASSERT_NE(platform, nullptr);
+
+    BackendOptions backend_options;
+    backend_options.set_platform(platform);
+    StatusOr<std::unique_ptr<Backend>> backend_or_status =
+        Backend::CreateBackend(backend_options);
+    ASSERT_IS_OK(backend_or_status.status());
+    backend_ = backend_or_status.ConsumeValueOrDie();
+  }
+
+  ~LLVMCompilerTest() override {}
+
+ protected:
+  using Platform = ::perftools::gputools::Platform;
+
+  explicit LLVMCompilerTest(string platform_name)
+      : platform_name_(std::move(platform_name)) {}
+
+  void TestCompilerHooks(LLVMCompiler *compiler) {
+    int pre_opt_hook_call_count = 0;
+    int post_opt_hook_call_count = 0;
+
+    auto pre_opt_hook = [&pre_opt_hook_call_count](const llvm::Module &) {
+      ++pre_opt_hook_call_count;
+      return Status::OK();
+    };
+    auto post_opt_hook = [&post_opt_hook_call_count](const llvm::Module &) {
+      ++post_opt_hook_call_count;
+      return Status::OK();
+    };
+
+    // Create HLO module, and run the compiler.
+    auto builder = HloComputation::Builder(TestName());
+    builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
+
+    auto hlo_module = CreateNewModule();
+    hlo_module->AddEntryComputation(builder.Build());
+
+    compiler->SetPreOptimizationHook(pre_opt_hook);
+    compiler->SetPostOptimizationHook(post_opt_hook);
+
+    ASSERT_TRUE(compiler
+                    ->RunBackend(std::move(hlo_module),
+                                 backend_->default_stream_executor())
+                    .ok());
+
+    // Test that hooks were called.
+    EXPECT_EQ(1, pre_opt_hook_call_count);
+    EXPECT_EQ(1, post_opt_hook_call_count);
+  }
+
+  void TestMultiModuleCompilation(LLVMCompiler *compiler) {
+    HloComputation::Builder builder(TestName());
+    builder.AddInstruction(
+        HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0)));
+
+    std::unique_ptr<HloModule> hlo_module = CreateNewModule();
+    hlo_module->AddEntryComputation(builder.Build());
+
+    std::vector<std::unique_ptr<HloModule>> modules;
+    modules.push_back(hlo_module->Clone());
+    modules.push_back(std::move(hlo_module));
+
+    std::vector<std::vector<perftools::gputools::StreamExecutor *>> executors;
+    executors.push_back({backend_->default_stream_executor()});
+    executors.push_back({backend_->default_stream_executor()});
+
+    EXPECT_IS_OK(compiler->Compile(std::move(modules), std::move(executors)));
+  }
+
+ private:
+  Platform *FindPlatform() {
+    for (Platform *platform :
+         PlatformUtil::GetSupportedPlatforms().ConsumeValueOrDie()) {
+      if (platform->Name() == platform_name_) {
+        return platform;
+      }
+    }
+    return nullptr;
+  }
+
+  string platform_name_;
+  std::unique_ptr<Backend> backend_;
+
+  static string TestName() {
+    return ::testing::UnitTest::GetInstance()->current_test_info()->name();
+  }
+
+  static std::unique_ptr<HloModule> CreateNewModule() {
+    HloModuleConfig config;
+    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
+                                 config);
+  }
+};
+
+class CpuCompilerTest : public LLVMCompilerTest {
+ public:
+  CpuCompilerTest() : LLVMCompilerTest("Host") {}
+};
+
+class GpuCompilerTest : public LLVMCompilerTest {
+ public:
+  GpuCompilerTest() : LLVMCompilerTest("CUDA") {}
+};
+
+TEST_F(CpuCompilerTest, HooksTest) {
+  cpu::CpuCompiler compiler;
+  TestCompilerHooks(&compiler);
+}
+
+TEST_F(GpuCompilerTest, HooksTest) {
+  gpu::GpuCompiler compiler;
+  TestCompilerHooks(&compiler);
 }
 
+TEST_F(CpuCompilerTest, MultiModuleCompilation) {
+  cpu::CpuCompiler compiler;
+  TestMultiModuleCompilation(&compiler);
+}
+
+TEST_F(GpuCompilerTest, MultModuleCompilation) {
+  gpu::GpuCompiler compiler;
+  TestMultiModuleCompilation(&compiler);
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 329b53012f58c8d084cc05f9a567a8aa432c4a3a..ad71d40197fe48b4343ee5f5f7f71b282a05cbf5 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -136,16 +136,14 @@ XLA_TEST_F(LocalClientExecuteTest, AddArraysWithDifferentInputLayouts) {
   auto computation = builder.Build().ConsumeValueOrDie();
 
   // Create x as a col-major array.
-  auto x_array = LiteralToShapedBuffer(
-      *test_utils::CreateR2LiteralWithLayout({{1.0f, 2.0f}, {3.0f, 4.0f}},
-                                             /*minor_to_major=*/{0, 1}));
+  auto x_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
+      {{1.0f, 2.0f}, {3.0f, 4.0f}}, LayoutUtil::MakeLayout({0, 1})));
   EXPECT_TRUE(LayoutUtil::Equal(x_array->shape().layout(),
                                 LayoutUtil::MakeLayout({0, 1})));
 
   // Create y as a row-major array.
-  auto y_array = LiteralToShapedBuffer(
-      *test_utils::CreateR2LiteralWithLayout({{10.0f, 20.0f}, {30.0f, 40.0f}},
-                                             /*minor_to_major=*/{1, 0}));
+  auto y_array = LiteralToShapedBuffer(*Literal::CreateR2WithLayout(
+      {{10.0f, 20.0f}, {30.0f, 40.0f}}, LayoutUtil::MakeLayout({1, 0})));
   EXPECT_TRUE(LayoutUtil::Equal(y_array->shape().layout(),
                                 LayoutUtil::MakeLayout({1, 0})));
 
@@ -876,11 +874,13 @@ XLA_TEST_F(LocalClientExecuteTest,
           tensorflow::ThreadOptions(), "execute_thread",
           [&] { ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); }));
 
-  ASSERT_IS_OK(local_client_->TransferToInfeed(
-      *Literal::CreateR1<float>({-5.0, 123.0, 42.0})));
+  ASSERT_IS_OK(local_client_->TransferToInfeedLocal(
+      *Literal::CreateR1<float>({-5.0, 123.0, 42.0}),
+      local_client_->default_device_ordinal()));
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
-                          local_client_->TransferFromOutfeed(&shape));
+                          local_client_->TransferFromOutfeedLocal(
+                              shape, local_client_->default_device_ordinal()));
 
   LiteralTestUtil::ExpectR1Equal<float>({-4.0, 125.0, 45.0}, *result);
 }
@@ -906,9 +906,12 @@ void BM_LocalClientOverhead(int num_iters) {
   builder.Add(x, x);
   auto computation = builder.Build().ConsumeValueOrDie();
 
-  auto buffer =
-      ScopedShapedBuffer::Allocate(shape, &allocator, /*device_ordinal=*/0)
-          .ConsumeValueOrDie();
+  auto shape_size_fn = [client](const Shape& shape) {
+    return client->backend().transfer_manager()->GetByteSizeRequirement(shape);
+  };
+  auto buffer = ScopedShapedBuffer::Allocate(
+                    shape, &allocator, /*device_ordinal=*/0, shape_size_fn)
+                    .ConsumeValueOrDie();
   auto literal = Literal::CreateR2<float>({{0, 0, 0}, {0, 0, 0}});
   ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
       executors[device_ordinal], *literal, buffer->mutable_buffer({})));
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index c11e1df0a7890a6c3aada5ff47494b42fdaf3b9d..062a9246e49598d5d03dce8c1f437138923449bf 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#define EIGEN_USE_THREADS
 
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
 
 #include <vector>
 
-#define EIGEN_USE_THREADS
-
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -136,29 +135,10 @@ std::unique_ptr<ScopedShapedBuffer> LocalClientTestBase::LiteralToShapedBuffer(
       .ConsumeValueOrDie();
 }
 
-void LocalClientTestBase::CopyShapedBufferToLiteral(
-    const ShapedBuffer& shaped_buffer, ShapeIndex* index, Literal* literal) {
-  const Shape& shape = ShapeUtil::GetSubshape(shaped_buffer.shape(), *index);
-  if (ShapeUtil::IsTuple(shape)) {
-    *literal->mutable_shape() = shape;
-    for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
-      Literal* element_literal = literal->add_tuple_literals();
-      index->push_back(i);
-      CopyShapedBufferToLiteral(shaped_buffer, index, element_literal);
-      index->pop_back();
-    }
-  } else {
-    ASSERT_IS_OK(transfer_manager_->TransferLiteralFromDevice(
-        stream_executor_, shaped_buffer.buffer(*index), shape, shape, literal));
-  }
-}
-
 std::unique_ptr<Literal> LocalClientTestBase::ShapedBufferToLiteral(
     const ShapedBuffer& shaped_buffer) {
-  auto literal = MakeUnique<Literal>();
-  ShapeIndex index;
-  CopyShapedBufferToLiteral(shaped_buffer, &index, literal.get());
-  return literal;
+  return local_client_->ShapedBufferToLiteral(shaped_buffer)
+      .ConsumeValueOrDie();
 }
 
 ExecutableBuildOptions LocalClientTestBase::DefaultExecutableBuildOptions()
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 3edfcb656ed8278d403103f0cfd820a10892476a..f0c73f04f6eb67b2e9cb5e111eccdc3818059b2b 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -93,10 +93,6 @@ class LocalClientTestBase : public ::testing::Test {
   std::unique_ptr<Literal> ShapedBufferToLiteral(
       const ShapedBuffer& shaped_buffer);
 
-  // Helper for converting a ShapedBuffer into a literal.
-  void CopyShapedBufferToLiteral(const ShapedBuffer& shaped_buffer,
-                                 ShapeIndex* index, Literal* literal);
-
   // Execute the given computation on the local client. With and without
   // options.
   StatusOr<std::unique_ptr<ScopedShapedBuffer>> ExecuteLocally(
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 2ef392508d14cf6dc14b2c979f07a79bc60d7426..2b0f7e6e80c48435ca55432a2afa3b6d69162625 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -405,13 +405,13 @@ TEST_F(MapTest, MapBinaryAdder) {
 // for Map that used to fail in shape inference (b/28989438).
 XLA_TEST_F(MapTest, AddWithMixedLayouts) {
   ComputationBuilder builder(client_, TestName());
-  std::unique_ptr<Literal> param0_literal =
-      test_utils::CreateR2LiteralWithLayout({{1, 2}, {3, 4}}, {1, 0});
+  std::unique_ptr<Literal> param0_literal = Literal::CreateR2WithLayout(
+      {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0}));
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
-  std::unique_ptr<Literal> param1_literal =
-      test_utils::CreateR2LiteralWithLayout({{10, 20}, {30, 40}}, {0, 1});
+  std::unique_ptr<Literal> param1_literal = Literal::CreateR2WithLayout(
+      {{10, 20}, {30, 40}}, LayoutUtil::MakeLayout({0, 1}));
   std::unique_ptr<GlobalData> param1_data =
       client_->TransferToServer(*param1_literal).ConsumeValueOrDie();
 
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 22d2b917a1d55f4f453e21c2d8fea38e32ff796b..89fa6ed9f7fe590f3ac872cce48a329b2894048a 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -76,8 +76,11 @@ class MultiOutputFusionTest : public HloTestBase {
         elem_shape2, HloOpcode::kAdd, broadcast, param1));
     HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary(
         elem_shape2, HloOpcode::kSubtract, param1, broadcast));
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(1);
+    dot_dnums.add_rhs_contracting_dimensions(0);
     HloInstruction* dot = builder.AddInstruction(
-        HloInstruction::CreateBinary(elem_shape2, HloOpcode::kDot, sub, add2));
+        HloInstruction::CreateDot(elem_shape2, sub, add2, dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
@@ -133,8 +136,11 @@ class MultiOutputFusionTest : public HloTestBase {
     HloInstruction* reshape =
         builder.AddInstruction(HloInstruction::CreateReshape(
             ShapeUtil::MakeShape(F32, {size, 1}), add));
-    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateBinary(
-        ShapeUtil::MakeShape(F32, {1}), HloOpcode::kDot, sub, reshape));
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(0);
+    dot_dnums.add_rhs_contracting_dimensions(0);
+    HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot(
+        ShapeUtil::MakeShape(F32, {1}), sub, reshape, dot_dnums));
     auto computation = hlo_module->AddEntryComputation(builder.Build(dot));
 
     if (manual_fusion) {
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index fda4389f479cdc7a659e4d7c8a2facba55e17e83..24c5daed3d09dc447ef92a4bc7e0d7185ec903ed 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -252,8 +252,8 @@ XLA_TEST_F(ParamsTest, HundredLargeR1Parameters) {
 }
 
 // Only run the 3,000-parameter tests in opt mode to avoid test timeouts.
-// Timeout last observed on 2017-09-12.
-#ifndef NDEBUG
+// Timeout last observed on 2017-11-20.
+#ifdef NDEBUG
 
 // TODO(b/65525254) Fails on GPU on 2017-09-10 because we try to reserve too
 // much space in parameter memory for the kernel.
@@ -334,6 +334,106 @@ XLA_TEST_F(ParamsTest, DISABLED_ON_CPU(DISABLED_ON_GPU(
   ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data);
 }
 
+// Test large number of parameters flowing into a while-loop.
+// Construct conceptually the following HLO graph:
+//
+// p0 = parameter(0)
+// p1 = parameter(1)
+// ...
+// pN = parameter(N)
+// result = while (false) {
+//   p0 += (1, 1);
+//   p1 += (1, 1);
+//   ...
+//   pN += (1, 1)
+// }
+// result = {p0, p1, ..., pN}
+//
+// TODO(b/70173746): Times out during compilation on GPU and CPU-parallel
+// backend as of 2017-12-03.
+XLA_TEST_F(ParamsTest, DISABLED_ON_CPU_PARALLEL(
+                           DISABLED_ON_GPU(ManyParametersIntoWhileLoop))) {
+  ComputationBuilder builder(client_, TestName());
+
+  std::vector<std::unique_ptr<GlobalData>> param_data_owner;
+  constexpr int kParamCount = 1900;
+  std::vector<ComputationDataHandle> params;
+  std::vector<Shape> parameter_shapes;
+  for (int i = 0; i < kParamCount; ++i) {
+    std::unique_ptr<Literal> literal = Literal::CreateR1<int32>({i, i});
+    param_data_owner.push_back(
+        std::move(client_->TransferToServer(*literal)).ValueOrDie());
+    ComputationDataHandle param =
+        builder.Parameter(i, literal->shape(), "param");
+    params.push_back(param);
+    parameter_shapes.push_back(literal->shape());
+  }
+
+  // Add bool parameter for the loop condition. Use a parameter HLO instead of a
+  // constant because DCE may eliminate the while-body otherwise.
+  std::unique_ptr<Literal> bool_literal = Literal::CreateR0<bool>(false);
+  param_data_owner.push_back(
+      std::move(client_->TransferToServer(*bool_literal)).ValueOrDie());
+  ComputationDataHandle bool_param =
+      builder.Parameter(kParamCount, bool_literal->shape(), "bool_param");
+  params.push_back(bool_param);
+  parameter_shapes.push_back(bool_literal->shape());
+
+  auto init = builder.Tuple(params);
+
+  // Create a computation for the condition: while(bool_param).
+  Shape while_shape = ShapeUtil::MakeTupleShape(parameter_shapes);
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto condition_parameter =
+        builder.Parameter(0, while_shape, "condition_parameter");
+    builder.GetTupleElement(condition_parameter, kParamCount);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add {1, 1} to the each tuple element.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto body_parameter = builder.Parameter(0, while_shape, "body_parameter");
+    std::vector<ComputationDataHandle> updates;
+    for (int i = 0; i < kParamCount; ++i) {
+      auto add = builder.Add(builder.GetTupleElement(body_parameter, i),
+                             builder.ConstantR1<int32>({1, 1}));
+      updates.push_back(add);
+    }
+    // Add bool parameter.
+    updates.push_back(builder.GetTupleElement(body_parameter, kParamCount));
+
+    builder.Tuple(updates);
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  auto loop = builder.While(condition, body, init);
+
+  std::vector<ComputationDataHandle> outputs;
+  for (int i = 0; i < kParamCount; ++i) {
+    outputs.push_back(builder.GetTupleElement(loop, i));
+  }
+  builder.Tuple(outputs);
+
+  std::vector<GlobalData*> param_data;
+  param_data.reserve(param_data_owner.size());
+  for (const std::unique_ptr<GlobalData>& data : param_data_owner) {
+    param_data.push_back(data.get());
+  }
+
+  std::vector<std::unique_ptr<Literal>> elements;
+  std::vector<const Literal*> ptrs;
+  for (int i = 0; i < kParamCount; ++i) {
+    elements.push_back(Literal::CreateR1<int32>({i, i}));
+    ptrs.push_back(elements.back().get());
+  }
+  ComputeAndCompareTuple(&builder, *Literal::MakeTuple(ptrs), param_data);
+}
+
 #endif
 
 XLA_TEST_F(ParamsTest,
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 3500e8dc28570fe216f53b746c3757e080aa689f..10e44b274a8a9f3ac28dc40d7b1938d24a9ee40c 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -90,7 +90,7 @@ TEST_F(PredTest, ConstantR2Pred) {
       builder.ConstantR2<bool>({{false, true, true}, {true, false, false}});
   const string expected = R"(pred[2,3] {
   { 011 },
-  { 100 },
+  { 100 }
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
@@ -119,7 +119,9 @@ TEST_F(PredTest, AnyR1VacuouslyFalse) {
 TEST_F(PredTest, AnyR2True) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2<bool>({
-      {false, false, false}, {false, false, false}, {false, false, true},
+      {false, false, false},
+      {false, false, false},
+      {false, false, true},
   });
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, true, {});
@@ -128,7 +130,9 @@ TEST_F(PredTest, AnyR2True) {
 TEST_F(PredTest, AnyR2False) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR2<bool>({
-      {false, false, false}, {false, false, false}, {false, false, false},
+      {false, false, false},
+      {false, false, false},
+      {false, false, false},
   });
   TF_ASSERT_OK(Any(a, &builder).status());
   ComputeAndCompareR0<bool>(&builder, false, {});
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 7bc3185c367f076c9a7d211c9799557e1a91d92f..b09ccdd679b6c8f628e40f78f58dbd1734926af6 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -352,15 +352,13 @@ XLA_TEST_F(ReduceTest, ReduceR2_111x50_01_To_R1) {
 XLA_TEST_F(ReduceTest, ReduceR2_1024x1024_To_R1) { RunR2ToR1Test(1024, 1024); }
 XLA_TEST_F(ReduceTest, ReduceR2_1000x1500_To_R1) { RunR2ToR1Test(1000, 1500); }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceAllOnesR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, AndReduceAllOnesR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count, 1);
   RunR1ToR0PredTest(/*and_reduce=*/true, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, AndReduceOnesAndZerosR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count);
   for (int i = 0; i < element_count; ++i) {
@@ -369,15 +367,13 @@ XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(AndReduceOnesAndZerosR1_10_Pred)) {
   RunR1ToR0PredTest(/*and_reduce=*/true, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceAllOnesR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, OrReduceAllOnesR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count, 1);
   RunR1ToR0PredTest(/*and_reduce=*/false, input);
 }
 
-// TODO(b/34969189): Invalid CAS generated on GPU.
-XLA_TEST_F(ReduceTest, DISABLED_ON_GPU(OrReduceOnesAndZerosR1_10_Pred)) {
+XLA_TEST_F(ReduceTest, OrReduceOnesAndZerosR1_10_Pred) {
   constexpr int element_count = 10;
   std::vector<int> input(element_count);
   for (int i = 0; i < element_count; ++i) {
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 6c9b62b48d8bb2ad93b2ce98839e5e52d8eaa8cc..b32df74312ed1b513bcdd161c1516c5a5a2f0faf 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -41,16 +41,40 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReduceWindowTest : public ClientLibraryTestBase {
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+// Tests both F32 and BF16.
+static std::array<bool, 2> use_bfloat16_params{false, true};
+#else
+// Only tests F32.
+static std::array<bool, 1> use_bfloat16_params{false};
+#endif
+
+class ReduceWindowTestBase : public ClientLibraryTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {}
+  ErrorSpec DefaultErrorSpec() const {
+    if (use_bfloat16()) {
+      return ErrorSpec(1e-1, 5e-2);
+    } else {
+      return ErrorSpec(1e-3, 1e-3);
+    }
+  }
+};
+
+class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
+                         public ReduceWindowTestBase {
+ public:
+  ReduceWindowTest() : builder_(client_, TestName()) {
+    set_use_bfloat16(GetParam());
+  }
 
   void ReduceWindowAdd(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(input, builder_.ConstantR0<float>(0.0f),
-                          CreateScalarAddComputation(F32, &builder_),
+    auto init =
+        CreateConstantFromLiteral(*Literal::CreateR0<float>(0.0f), &builder_);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarAddComputation(FloatType(), &builder_),
                           window_dimensions, window_strides, padding);
   }
 
@@ -58,30 +82,32 @@ class ReduceWindowTest : public ClientLibraryTestBase {
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(
-        input, builder_.ConstantLiteral(Literal::MinValue(F32)),
-        CreateScalarMax(), window_dimensions, window_strides, padding);
+    auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
+    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
+                          window_strides, padding);
   }
 
   void ReduceWindowMin(const ComputationDataHandle& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-    builder_.ReduceWindow(input,
-                          builder_.ConstantLiteral(Literal::MaxValue(F32)),
-                          CreateScalarMinComputation(F32, &builder_),
+    auto init = CreateConstantFromLiteral(Literal::MaxValue(F32), &builder_);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMinComputation(FloatType(), &builder_),
                           window_dimensions, window_strides, padding);
   }
 
   ComputationBuilder builder_;
 };
 
-TEST_F(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
-  const auto input = builder_.ConstantR1<float>({1, 1, 1, 1});
-  const auto init_value = builder_.ConstantR0<float>(0);
+TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({1, 1, 1, 1}), &builder_);
+  const auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(0), &builder_);
   TF_ASSERT_OK(builder_.first_error());
   builder_.ReduceWindow(input, init_value,
-                        CreateScalarAddComputation(F32, &builder_),
+                        CreateScalarAddComputation(FloatType(), &builder_),
                         /*window_dimensions=*/{1, 2},
                         /*window_strides=*/{1}, Padding::kValid);
   ASSERT_EQ(builder_.first_error().code(), tensorflow::error::INVALID_ARGUMENT)
@@ -90,79 +116,97 @@ TEST_F(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
               ::testing::HasSubstr("Want input dimensions size"));
 }
 
-TEST_F(ReduceWindowTest, Min3In5Stride2) {
-  const auto input = builder_.ConstantR1<float>({10000, 1000, 100, 10, 1});
+// Regression test for b/68964348.
+TEST_P(ReduceWindowTest, R0ReduceWindow) {
+  const auto input =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(42.0), &builder_);
+  const auto init =
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(1.0), &builder_);
+  builder_.ReduceWindow(input, init,
+                        CreateScalarAddComputation(FloatType(), &builder_),
+                        /*window_dimensions=*/{},
+                        /*window_strides=*/{}, Padding::kSame);
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR0<float>(43.0), {},
+                           ErrorSpec(0.00001));
+}
+
+TEST_P(ReduceWindowTest, Min3In5Stride2) {
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>({10000, 1000, 100, 10, 1}), &builder_);
   ReduceWindowMin(input, {3}, {2}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {100, 1}, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({100, 1}), {},
+                           ErrorSpec(0.00001));
 }
 
-XLA_TEST_F(ReduceWindowTest, ZeroElementSmall) {
+XLA_TEST_P(ReduceWindowTest, ZeroElementSmall) {
   Array4D<float> input_array(1, 0, 2, 1);
-
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 2, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1},
                                               {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, NonSquareSmall) {
+TEST_P(ReduceWindowTest, NonSquareSmall) {
   Array4D<float> input_array(1, 2, 2, 1);
-  input_array.FillRandom(2.f);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 2, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 2, 1},
                                               {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, MiddleDimsSmall) {
+TEST_P(ReduceWindowTest, MiddleDimsSmall) {
   Array4D<float> input_array(1, 3, 3, 1);
-  input_array.FillRandom(2.f);
-
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 1, 1}, {1, 2, 2, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 1, 1},
                                               {1, 2, 2, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, Along2ndMinorDim) {
+TEST_P(ReduceWindowTest, Along2ndMinorDim) {
   Array4D<float> input_array(3, 6, 7, 32);
-  input_array.FillRandom(2.f);
+  input_array.FillRandom(2.f, 2.f);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   // The parameters of this reduction mimic feature norm (e.g. LRN).
   int lrn_diameter = 7;  // diameter = 2*radius + 1 --> must be odd
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding);
 
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, lrn_diameter, 1}, {1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2Dims) {
+TEST_P(ReduceWindowTest, AmongMajor2Dims) {
   Array4D<float> input_array(4, 4, 6, 8);
   input_array.FillWithMinorDimNum();
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
 
   int win_len = 3;
   int win_stride = 1;
 
   Padding padding = Padding::kSame;
-  const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
   // Reduce only along the x and y dimensions, according to the win_len.
   ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
                   {win_stride, win_stride, 1, 1}, padding);
@@ -170,18 +214,20 @@ TEST_F(ReduceWindowTest, AmongMajor2Dims) {
   auto result = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
+TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
   Array4D<float> input_array(9, 12, 4, 89);
-  input_array.FillRandom(2.0f);
+  input_array.FillRandom(2.f, 2.f);
 
   int win_len = 3;
   int win_stride = 2;
 
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
 
   Padding padding = Padding::kSame;
   // Reduce only along the x and y dimensions, according to the win_len.
@@ -192,20 +238,21 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMediumSize) {
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
 // TODO(b/32173947): Test support for arbitrary-sized padding.
-TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
+TEST_P(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
   Array4D<float> input_array(9, 12, 4, 89);  // simulate Dim0IsMinor layout
-  input_array.FillRandom(2.0f);
+  input_array.FillRandom(2.f, 2.f);
 
   int64 rank = 4;
   int win_len = 3;
   int win_stride = 2;
 
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
 
   Padding padding = Padding::kSame;
   // Reduce only along the x and y dimensions, according to the win_len.
@@ -222,26 +269,28 @@ TEST_F(ReduceWindowTest, DISABLED_AmongMajor2DimsMediumSizeLargePadding) {
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x2) {
+XLA_TEST_P(ReduceWindowTest, Add1x1x2In2x1x2) {
   Array3D<float> input_array(2, 1, 2);
   input_array(0, 0, 0) = 1000;
   input_array(0, 0, 1) = 100;
   input_array(1, 0, 0) = 10;
   input_array(1, 0, 1) = 1;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 1}, Padding::kValid);
 
   Array3D<float> expected(2, 1, 1);
   expected(0, 0, 0) = 1100;
   expected(1, 0, 0) = 11;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(expected), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
+XLA_TEST_P(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
   Array3D<float> input_array(2, 1, 3);
   input_array(0, 0, 0) = 100;
   input_array(0, 0, 1) = 10;
@@ -249,17 +298,18 @@ XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3Stride1x1x2) {
   input_array(1, 0, 0) = 500;
   input_array(1, 0, 1) = 50;
   input_array(1, 0, 2) = 5;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 2}, Padding::kValid);
 
   Array3D<float> expected(2, 1, 1);
   expected(0, 0, 0) = 110;
   expected(1, 0, 0) = 550;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(expected), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
+XLA_TEST_P(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
   Array3D<float> input_array(2, 1, 3);
   input_array(0, 0, 0) = 100;
   input_array(0, 0, 1) = 10;
@@ -267,7 +317,7 @@ XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
   input_array(1, 0, 0) = 500;
   input_array(1, 0, 1) = 50;
   input_array(1, 0, 2) = 5;
-  auto input = builder_.ConstantR3FromArray3D<float>(input_array);
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   ReduceWindowAdd(input, {1, 1, 2}, {1, 1, 1}, Padding::kSame);
 
@@ -278,30 +328,34 @@ XLA_TEST_F(ReduceWindowTest, Add1x1x2In2x1x3SamePad) {
   expected(1, 0, 0) = 550;
   expected(1, 0, 1) = 55;
   expected(1, 0, 2) = 5;
-  ComputeAndCompareR3<float>(&builder_, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(expected), {},
+                           DefaultErrorSpec());
 }
 
 // Tests a reduction function that is not a simple add/min/max/etc.
-XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) {
+XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
   input_array(0, 0, 0, 0) = 1;
   input_array(0, 0, 1, 0) = 2;
   input_array(0, 1, 0, 0) = 3;
   input_array(0, 1, 1, 0) = 4;
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
-  const auto input = builder_.ConstantR4FromArray4D<float>(input_array);
   Padding padding = Padding::kValid;
-
-  const Shape scalar = ShapeUtil::MakeShape(F32, {});
+  const Shape scalar = ShapeUtil::MakeShape(FloatType(), {});
   auto b = builder_.CreateSubBuilder("unusual");
   auto lhs = b->Parameter(0, scalar, "lhs");
   auto rhs = b->Parameter(1, scalar, "rhs");
-  b->Min(b->Add(lhs, rhs), b->ConstantR0<float>(8.0f));
+  b->Min(b->Add(lhs, rhs),
+         CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
   Computation reduce_fn = b->BuildAndNoteError();
 
-  builder_.ReduceWindow(input, builder_.ConstantR0<float>(3.0f), reduce_fn,
-                        /*window_dimensions=*/{1, 1, 2, 1},
-                        /*window_strides=*/{1, 1, 1, 1}, padding);
+  builder_.ReduceWindow(
+      input,
+      CreateConstantFromLiteral(*Literal::CreateR0<float>(3.0f), &builder_),
+      reduce_fn,
+      /*window_dimensions=*/{1, 1, 2, 1},
+      /*window_strides=*/{1, 1, 1, 1}, padding);
 
   const auto reduce_func = [](float arg1, float arg2) {
     return std::min<float>(arg1 + arg2, 8.0f);
@@ -312,17 +366,19 @@ XLA_TEST_F(ReduceWindowTest, NonstandardReduceFunction) {
                                            /*window=*/{1, 1, 2, 1},
                                            /*stride=*/{1, 1, 1, 1}, padding);
 
-  ComputeAndCompareR4<float>(&builder_, *expected, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*expected), {},
+                           DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, R4UnitWindow) {
+TEST_P(ReduceWindowTest, R4UnitWindow) {
   Array4D<float> input_array(13, 12, 8, 15);
-  input_array.Fill(1.0f);
+  input_array.FillRandom(2.f, 2.f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {1, 1, 7, 1}, {1, 4, 1, 1}, padding);
@@ -330,15 +386,11 @@ TEST_F(ReduceWindowTest, R4UnitWindow) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(input_array, 0.0f, {1, 1, 7, 1},
                                               {1, 4, 1, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
-  auto b = HloComputation::Builder(TestName());
-
+XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) {
   std::vector<int64> input_dims(6, 8);
   auto shape = ShapeUtil::MakeShape(F32, input_dims);
 
@@ -348,56 +400,15 @@ XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
   };
   TF_EXPECT_OK(arg_literal->Populate<float>(generator));
 
-  auto input =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
-
-  auto init_value = b.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
-
-  HloComputation::Builder add_computation("add");
-  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto param_lhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-  auto param_rhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-  add_computation.AddInstruction(HloInstruction::CreateBinary(
-      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-
-  auto module = CreateNewModule();
-  auto add_func = module->AddEmbeddedComputation(add_computation.Build());
-
-  WindowDimension trivial_dim;
-  trivial_dim.set_size(1);
-  trivial_dim.set_stride(1);
-  trivial_dim.set_padding_low(0);
-  trivial_dim.set_padding_high(0);
-  trivial_dim.set_window_dilation(1);
-  trivial_dim.set_base_dilation(1);
-
-  WindowDimension active_dim;
-  active_dim.set_size(3);
-  active_dim.set_stride(1);
-  active_dim.set_padding_low(0);
-  active_dim.set_padding_high(0);
-  active_dim.set_window_dilation(1);
-  active_dim.set_base_dilation(1);
-
-  Window window;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-
-  // Non-monotonic output layout with minor dims trivial.
+  const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
+
+  Padding padding = Padding::kValid;
+  ReduceWindowAdd(input, {3, 1, 3, 3, 1, 1}, {1, 1, 1, 1, 1, 1}, padding);
+
   std::vector<int64> output_layout = {1, 5, 3, 2, 0, 4};
   std::vector<int64> output_dims = {6, 8, 6, 6, 8, 8};
   Shape result_shape =
       ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout);
-  b.AddInstruction(HloInstruction::CreateReduceWindow(
-      result_shape, input, init_value, window, add_func));
-
   std::unique_ptr<Literal> expected = Literal::CreateFromShape(result_shape);
   auto out_generator =
       [&](tensorflow::gtl::ArraySlice<int64> indexes) -> float {
@@ -405,82 +416,37 @@ XLA_TEST_F(HloTestBase, R6AddMultipleStrides) {
   };
   TF_EXPECT_OK(expected->Populate<float>(out_generator));
 
-  module->AddEntryComputation(b.Build());
-  auto actual = ExecuteAndTransfer(std::move(module), {});
-
-  LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(HloTestBase, R6Add) {
-  auto b = HloComputation::Builder(TestName());
-
+XLA_TEST_P(ReduceWindowTest, R6Add) {
   std::vector<int64> input_dims(6, 8);
+  auto shape = ShapeUtil::MakeShape(F32, input_dims);
+
   std::unique_ptr<Literal> arg_literal =
       Literal::CreateFullWithMonotonicDim0MajorLayout<float>(input_dims, 1.0f);
-  auto input =
-      b.AddInstruction(HloInstruction::CreateConstant(std::move(arg_literal)));
-
-  auto init_value = b.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
-
-  HloComputation::Builder add_computation("add");
-  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
-  auto param_lhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-  auto param_rhs = add_computation.AddInstruction(
-      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-  add_computation.AddInstruction(HloInstruction::CreateBinary(
-      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
-
-  auto module = CreateNewModule();
-  auto add_func = module->AddEmbeddedComputation(add_computation.Build());
-
-  WindowDimension trivial_dim;
-  trivial_dim.set_size(1);
-  trivial_dim.set_stride(1);
-  trivial_dim.set_padding_low(0);
-  trivial_dim.set_padding_high(0);
-  trivial_dim.set_window_dilation(1);
-  trivial_dim.set_base_dilation(1);
-
-  WindowDimension active_dim;
-  active_dim.set_size(3);
-  active_dim.set_stride(1);
-  active_dim.set_padding_low(0);
-  active_dim.set_padding_high(0);
-  active_dim.set_window_dilation(1);
-  active_dim.set_base_dilation(1);
-
-  Window window;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = active_dim;
-  *window.add_dimensions() = trivial_dim;
-  *window.add_dimensions() = trivial_dim;
-
-  Shape shape = ShapeUtil::MakeShape(F32, {8, 8, 6, 6, 8, 8});
-  b.AddInstruction(HloInstruction::CreateReduceWindow(shape, input, init_value,
-                                                      window, add_func));
+
+  const auto input = CreateConstantFromLiteral(*arg_literal, &builder_);
+
+  Padding padding = Padding::kValid;
+  ReduceWindowAdd(input, {1, 1, 3, 3, 1, 1}, {1, 1, 1, 1, 1, 1}, padding);
 
   std::vector<int64> output_dims = {8, 8, 6, 6, 8, 8};
   std::unique_ptr<Literal> expected =
       Literal::CreateFullWithMonotonicDim0MajorLayout<float>(output_dims, 9.0f);
 
-  module->AddEntryComputation(b.Build());
-  auto actual = ExecuteAndTransfer(std::move(module), {});
-
-  LiteralTestUtil::ExpectNear(*actual, *expected, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   Array4D<float> input_array(2, 1, 27, 119);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 1;
   int stride = 8;
@@ -490,20 +456,19 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorStride) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   Array4D<float> input_array(3, 2, 4, 64);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 3;
   int stride = 1;
@@ -513,20 +478,19 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorUnitStride) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
+XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   Array4D<float> input_array(1, 3, 12, 200);
   input_array.FillRandom(2.0f);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input =
-      builder_.Parameter(0, input_literal->shape(), "operand");
+  ComputationDataHandle input;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "parameter", &builder_, &input);
 
   int win_len = 8;
   int stride = 5;
@@ -536,13 +500,11 @@ XLA_TEST_F(ReduceWindowTest, R4SecondMinorWin) {
   auto res = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {1, 1, win_len, 1}, {1, 1, stride, 1}, padding);
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                          client_->TransferToServer(*input_literal));
-  ComputeAndCompareR4<float>(&builder_, *res, {input_data.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*res),
+                           {input_data.get()}, DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
+TEST_P(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
   Array4D<float> input_array(6, 4, 10, 130);
   input_array.FillRandom(2.0f);
 
@@ -551,7 +513,7 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
 
   Padding padding = Padding::kSame;
   const auto input_data_handle =
-      builder_.ConstantR4FromArray4D<float>(input_array);
+      CreateConstantFromArray(input_array, &builder_);
   // Reduce only along the x and y dimensions, according to the win_len.
   ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
                   {win_stride, win_stride, 1, 1}, padding);
@@ -559,36 +521,42 @@ TEST_F(ReduceWindowTest, AmongMajor2DimsMultipleMinor) {
   auto result = ReferenceUtil::ReduceWindow4DAdd(
       input_array, 0.0f, {win_len, win_len, 1, 1},
       {win_stride, win_stride, 1, 1}, padding);
-  ComputeAndCompareR4<float>(&builder_, *result, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add24In1152_NoOverlap) {
+XLA_TEST_P(ReduceWindowTest, Add24In1152_NoOverlap) {
   std::vector<float> input_vector(128 * 9, 1);
-  const auto input = builder_.ConstantR1<float>(input_vector);
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
   ReduceWindowAdd(input, {32}, {128}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {32, 32, 32, 32, 32, 32, 32, 32, 32},
-                             {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(
+      &builder_,
+      *Literal::CreateR1<float>({32, 32, 32, 32, 32, 32, 32, 32, 32}), {},
+      DefaultErrorSpec());
 }
 
-XLA_TEST_F(ReduceWindowTest, Add128In128Stride128) {
-  const auto input = builder_.ConstantR1<float>(
-      {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+XLA_TEST_P(ReduceWindowTest, Add128In128Stride128) {
+  std::vector<float> input_vector{
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  const auto input = CreateConstantFromLiteral(
+      *Literal::CreateR1<float>(input_vector), &builder_);
   ReduceWindowAdd(input, {128}, {128}, Padding::kValid);
-  ComputeAndCompareR1<float>(&builder_, {1088}, {}, ErrorSpec(0.0001));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateR1<float>({1088}), {},
+                           DefaultErrorSpec());
 }
 
 // Regression test for a bug that appeared in Inception (b/34784899).
-TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
+TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
   Array2D<float> input_array(14, 14, 1.0f);
-  ComputationDataHandle input =
-      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {14, 14});
+  const auto input = CreateConstantFromArray(input_array, &builder_);
 
   int win_len = 3;
   int stride = 1;
@@ -598,13 +566,14 @@ TEST_F(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
   auto res = ReferenceUtil::ReduceWindow2DAdd(
       input_array, 0.0f, {win_len, win_len}, {stride, stride}, padding);
 
-  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray<float>(*res),
+                           {}, DefaultErrorSpec());
 }
 
-TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
+TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input =
-      builder_.Broadcast(builder_.ConstantLiteral(Literal::One(F32)), {6, 4});
+  ComputationDataHandle input = builder_.Broadcast(
+      CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
   ReduceWindowAdd(input, {4, 2}, {3, 3}, padding);
@@ -612,9 +581,13 @@ TEST_F(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   auto res = ReferenceUtil::ReduceWindow2DAdd(input_array, 0.0f, {4, 2}, {3, 3},
                                               padding);
 
-  ComputeAndCompareR2<float>(&builder_, *res, {}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray<float>(*res),
+                           {}, DefaultErrorSpec());
 }
 
+INSTANTIATE_TEST_CASE_P(ReduceWindowTestInstance, ReduceWindowTest,
+                        ::testing::ValuesIn(use_bfloat16_params));
+
 enum Reducer { kAdd, kMax };
 
 struct R4ReduceWindowTestData {
@@ -628,30 +601,36 @@ struct R4ReduceWindowTestData {
 };
 
 string R4ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R4ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R4ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),            //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),    //
-      "__pad_low_", tensorflow::str_util::Join(data.param.pad_low, "x"),    //
-      "__pad_high_", tensorflow::str_util::Join(data.param.pad_high, "x"),  //
-      (data.param.reducer == kAdd) ? "add" : "max");
-  CHECK(data.param.reducer == kAdd || data.param.reducer == kMax);
+      tensorflow::str_util::Join(param.window_bounds, "x"),            //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),    //
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),    //
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),  //
+      (param.reducer == kAdd) ? "add" : "max");
+  CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   // Test names are not allowed to contain the '-' character.
   std::replace(str.begin(), str.end(), '-', 'n');
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R4ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R4ReduceWindowTestData> {
+class R4ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R4ReduceWindowTestData, bool>> {
  protected:
+  R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+
   void DoIt() {
     ComputationBuilder b(client_, TestName());
-    const auto& param = GetParam();
+    const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
 
@@ -660,23 +639,24 @@ class R4ReduceWindowTest
     input.FillIota(1);
     std::unique_ptr<Literal> input_literal =
         Literal::CreateR4FromArray4D(input);
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                            client_->TransferToServer(*input_literal));
+    ComputationDataHandle parameter;
+    auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                       &b, &parameter);
 
     std::vector<std::pair<int64, int64>> padding(4);
     for (int i = 0; i < 4; ++i) {
       padding[i] = {param.pad_low[i], param.pad_high[i]};
     }
 
-    auto parameter = b.Parameter(0, input_literal->shape(), "p0");
-    auto pad_value = b.ConstantR0<float>(kInitValue);
+    auto init_value =
+        CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
     CHECK(param.reducer == kAdd || param.reducer == kMax);
     auto computation = param.reducer == kAdd
-                           ? CreateScalarAddComputation(F32, &b)
-                           : CreateScalarMaxComputation(F32, &b);
+                           ? CreateScalarAddComputation(FloatType(), &b)
+                           : CreateScalarMaxComputation(FloatType(), &b);
     b.ReduceWindowWithGeneralPadding(
         /*operand=*/parameter,
-        /*init_value=*/pad_value,
+        /*init_value=*/init_value,
         /*computation=*/computation,
         /*window_dimensions=*/param.window_bounds,
         /*window_strides=*/param.strides,
@@ -694,8 +674,8 @@ class R4ReduceWindowTest
             /*window=*/param.window_bounds,
             /*stride=*/param.strides,
             /*padding=*/padding);
-    ComputeAndCompareR4<float>(&b, *expected, {input_arg.get()},
-                               ErrorSpec(1e-3, 1e-3));
+    ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                             {input_arg.get()}, DefaultErrorSpec());
   }
 };
 
@@ -824,9 +804,11 @@ const R4ReduceWindowTestData kR4ReduceWindowTestValues[] = {
                            /*reducer=*/kAdd},
 };
 
-INSTANTIATE_TEST_CASE_P(R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
-                        ::testing::ValuesIn(kR4ReduceWindowTestValues),
-                        R4ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowTestInstantiation, R4ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
 
 class R4ReduceWindowLargeTest : public R4ReduceWindowTest {};
 
@@ -849,10 +831,11 @@ const R4ReduceWindowTestData kR4ReduceWindowLargeTestValues[] = {
                            /*reducer=*/kAdd},
 };
 
-INSTANTIATE_TEST_CASE_P(R4ReduceWindowLargeTestInstantiation,
-                        R4ReduceWindowLargeTest,
-                        ::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
-                        R4ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R4ReduceWindowLargeTestInstantiation, R4ReduceWindowLargeTest,
+    ::testing::Combine(::testing::ValuesIn(kR4ReduceWindowLargeTestValues),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R4ReduceWindowTestDataToString);
 
 struct R2ReduceWindowTestData {
   int64 base_bounds[2];
@@ -900,26 +883,33 @@ struct R2ReduceWindowTestData {
 };
 
 string R2ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R2ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R2ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),  //
       "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
-      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
-      "__layout_", data.param.layout[0], "_", data.param.layout[1],           //
-      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+      tensorflow::str_util::Join(param.window_bounds, "x"),              //
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),      //
+      "__padding_", param.padding == Padding::kSame ? "same" : "valid",  //
+      "__layout_", param.layout[0], "_", param.layout[1],                //
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R2ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R2ReduceWindowTestData> {};
+class R2ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R2ReduceWindowTestData, bool>> {
+ protected:
+  R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+};
 
 TEST_P(R2ReduceWindowTest, Add) {
   ComputationBuilder b(client_, TestName());
-  const auto& param = GetParam();
+  const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd);
 
   const float kInitValue = 0.0f;
@@ -927,12 +917,15 @@ TEST_P(R2ReduceWindowTest, Add) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR2FromArray2DWithLayout(
           input, LayoutUtil::MakeLayout(param.layout));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                          client_->TransferToServer(*input_literal));
-  b.ReduceWindow(/*operand=*/
-                 b.Parameter(0, input_literal->shape(), "p0"),
-                 /*init_value=*/b.ConstantR0<float>(kInitValue),
-                 /*computation=*/CreateScalarAddComputation(F32, &b),
+
+  ComputationDataHandle parameter;
+  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                     &b, &parameter);
+  auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+  b.ReduceWindow(/*operand=*/parameter,
+                 /*init_value=*/init_value,
+                 /*computation=*/CreateScalarAddComputation(FloatType(), &b),
                  /*window_dimensions=*/param.window_bounds,
                  /*window_strides=*/param.strides, /*padding=*/param.padding);
 
@@ -940,90 +933,145 @@ TEST_P(R2ReduceWindowTest, Add) {
       /*operand=*/input, /*init=*/kInitValue, /*window=*/param.window_bounds,
       /*stride=*/param.strides, /*padding=*/param.padding);
 
-  ComputeAndCompareR2<float>(&b, *expected, {input_arg.get()},
-                             ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&b, *Literal::CreateFromArray(*expected),
+                           {input_arg.get()}, DefaultErrorSpec());
 }
 
-INSTANTIATE_TEST_CASE_P(R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
-                        ::testing::ValuesIn(kR2TestCases),
-                        R2ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R2ReduceWindowTestInstantiation, R2ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR2TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R2ReduceWindowTestDataToString);
 
 struct R1ReduceWindowTestData {
   int64 base_bounds[1];
   int64 window_bounds[1];
   int64 strides[1];
-  Padding padding;
+  int64 pad_low[1];
+  int64 pad_high[1];
   Reducer reducer;
 } kR1TestCases[] = {
     {/*base_bounds=*/{1}, /*window_bounds=*/{1},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({1}, {1}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({1}, {1}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{3},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {3}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {3}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{2},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {2}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {2}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{5}, /*window_bounds=*/{1},
      /*strides=*/{1},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+     /*pad_low=*/{xla::MakePadding({5}, {1}, {1}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({5}, {1}, {1}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kMax},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{4},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {4}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {4}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kMax},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{3},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {3}, Padding::kValid)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {3}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
-    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{30},
+    {/*base_bounds=*/{128 * 2},
+     /*window_bounds=*/{30},
      /*strides=*/{27},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-
-    {/*base_bounds=*/{128 * 17}, /*window_bounds=*/{7},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 2}, {30}, {27}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 2}, {30}, {27}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 17},
+     /*window_bounds=*/{7},
      /*strides=*/{64},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
-
-    {/*base_bounds=*/{128 * 2}, /*window_bounds=*/{32},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 17}, {7}, {64}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 17}, {7}, {64}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{128 * 2},
+     /*window_bounds=*/{32},
      /*strides=*/{56},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/
+     {xla::MakePadding({128 * 2}, {32}, {56}, Padding::kValid)[0].first},
+     /*pad_high=*/
+     {xla::MakePadding({128 * 2}, {32}, {56}, Padding::kValid)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{3}, /*window_bounds=*/{2},
      /*strides=*/{1},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({3}, {2}, {1}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({3}, {2}, {1}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{5}, /*window_bounds=*/{3},
      /*strides=*/{2},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({5}, {3}, {2}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({5}, {3}, {2}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
 
     {/*base_bounds=*/{16}, /*window_bounds=*/{4},
      /*strides=*/{3},
-     /*padding=*/Padding::kSame, /*reducer=*/Reducer::kAdd},
+     /*pad_low=*/{xla::MakePadding({16}, {4}, {3}, Padding::kSame)[0].first},
+     /*pad_high=*/{xla::MakePadding({16}, {4}, {3}, Padding::kSame)[0].second},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{5},
+     /*strides=*/{1},
+     /*pad_low=*/{0},
+     /*pad_high=*/{5},
+     /*reducer=*/Reducer::kAdd},
+
+    {/*base_bounds=*/{5}, /*window_bounds=*/{5},
+     /*strides=*/{1},
+     /*pad_low=*/{5},
+     /*pad_high=*/{0},
+     /*reducer=*/Reducer::kAdd},
 };
 
 string R1ReduceWindowTestDataToString(
-    const ::testing::TestParamInfo<R1ReduceWindowTestData>& data) {
+    const ::testing::TestParamInfo<
+        ::testing::tuple<R1ReduceWindowTestData, bool>>& data) {
+  const auto& param = ::testing::get<0>(data.param);
   string str = tensorflow::strings::StrCat(
-      "base_bounds_",
-      tensorflow::str_util::Join(data.param.base_bounds, "x"),  //
-      "__window_bounds_",
-      tensorflow::str_util::Join(data.param.window_bounds, "x"),              //
-      "__strides_", tensorflow::str_util::Join(data.param.strides, "x"),      //
-      "__padding_", data.param.padding == Padding::kSame ? "same" : "valid",  //
-      "__reducer_", data.param.reducer == kAdd ? "add" : "max");
+      "base_bounds_", tensorflow::str_util::Join(param.base_bounds, "x"),
+      "__window_bounds_", tensorflow::str_util::Join(param.window_bounds, "x"),
+      "__strides_", tensorflow::str_util::Join(param.strides, "x"),
+      "__pad_low_", tensorflow::str_util::Join(param.pad_low, "x"),
+      "__pad_high_", tensorflow::str_util::Join(param.pad_high, "x"),
+      "__reducer_", param.reducer == kAdd ? "add" : "max");
+  if (::testing::get<1>(data.param)) {
+    str = tensorflow::strings::StrCat(str, "_bfloat16");
+  }
   return str;
 }
 
-class R1ReduceWindowTest
-    : public ClientLibraryTestBase,
-      public ::testing::WithParamInterface<R1ReduceWindowTestData> {};
+class R1ReduceWindowTest : public ReduceWindowTestBase,
+                           public ::testing::WithParamInterface<
+                               ::testing::tuple<R1ReduceWindowTestData, bool>> {
+ protected:
+  R1ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
+};
 
 TEST_P(R1ReduceWindowTest, DoIt) {
   ComputationBuilder b(client_, TestName());
-  const auto& param = GetParam();
+  const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
   const float kInitValue = 0.0f;
@@ -1031,18 +1079,24 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_arg,
-                          client_->TransferToServer(*input_literal));
+  ComputationDataHandle parameter;
+  auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
+                                                     &b, &parameter);
+
+  std::vector<std::pair<int64, int64>> padding(1);
+  padding[0] = {param.pad_low[0], param.pad_high[0]};
 
   auto computation = param.reducer == kAdd
-                         ? CreateScalarAddComputation(F32, &b)
-                         : CreateScalarMaxComputation(F32, &b);
-  b.ReduceWindow(/*operand=*/
-                 b.Parameter(0, input_literal->shape(), "p0"),
-                 /*init_value=*/b.ConstantR0<float>(kInitValue),
-                 /*computation=*/computation,
-                 /*window_dimensions=*/param.window_bounds,
-                 /*window_strides=*/param.strides, /*padding=*/param.padding);
+                         ? CreateScalarAddComputation(FloatType(), &b)
+                         : CreateScalarMaxComputation(FloatType(), &b);
+  auto init_value =
+      CreateConstantFromLiteral(*Literal::CreateR0(kInitValue), &b);
+  b.ReduceWindowWithGeneralPadding(
+      /*operand=*/parameter,
+      /*init_value=*/init_value,
+      /*computation=*/computation,
+      /*window_dimensions=*/param.window_bounds,
+      /*window_strides=*/param.strides, /*padding=*/padding);
 
   auto reduce_func = param.reducer == kAdd
                          ? +[](float a, float b) { return a + b; }
@@ -1052,14 +1106,17 @@ TEST_P(R1ReduceWindowTest, DoIt) {
       /*init=*/kInitValue,
       /*reduce_func=*/reduce_func,
       /*window=*/param.window_bounds,
-      /*stride=*/param.strides, /*padding=*/param.padding);
+      /*stride=*/param.strides,
+      /*padding=*/padding);
 
-  ComputeAndCompareR1<float>(&b, tensorflow::gtl::ArraySlice<float>(*expected),
-                             {input_arg.get()}, ErrorSpec(1e-3, 1e-3));
+  ComputeAndCompareLiteral(&b, *Literal::CreateR1<float>(*expected),
+                           {input_arg.get()}, DefaultErrorSpec());
 }
 
-INSTANTIATE_TEST_CASE_P(R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
-                        ::testing::ValuesIn(kR1TestCases),
-                        R1ReduceWindowTestDataToString);
+INSTANTIATE_TEST_CASE_P(
+    R1ReduceWindowTestInstantiation, R1ReduceWindowTest,
+    ::testing::Combine(::testing::ValuesIn(kR1TestCases),
+                       ::testing::ValuesIn(use_bfloat16_params)),
+    R1ReduceWindowTestDataToString);
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 72c68f24a0a954deb0564e9a0e924edfaf5b5484..ddd50d7a5864d73de7916ce736bb7cd40c1c4dc9 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -41,326 +41,467 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class ReshapeTest : public ClientLibraryTestBase {
+// Use a bool parameter to indicate whether to use bfloat16.
+class ReshapeTest : public ::testing::WithParamInterface<bool>,
+                    public ClientLibraryTestBase {
  public:
+  ReshapeTest() { set_use_bfloat16(GetParam()); }
+
   ErrorSpec zero_error_spec_{0.0};
 };
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
-XLA_TEST_F(ReshapeTest, CollapseTrivial1x1) {
+XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0}});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  Array2D<float> input_array(1, 1);
+  input_array.Fill(1.0f);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, CollapseTrivialR1EmptyDims) {
+XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, CollapseTrivialR1OnlyDim) {
+XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0});
-  builder.Collapse(/*operand=*/a, /*dimensions=*/{0});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f}, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
+
+  auto expected_literal = Literal::CreateR1<float>({1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
-XLA_TEST_F(ReshapeTest, SingleElementArrayToScalar) {
+XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0}});
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1}, /*new_sizes=*/{});
+  Array2D<float> input_array(1, 1);
+  input_array.Fill(1.0f);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
+                                                 &builder, &parameter);
+  auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                                 /*new_sizes=*/{});
   auto new_shape = builder.GetShape(reshape).ConsumeValueOrDie();
 
-  ComputeAndCompareR0<float>(&builder, 1.0f, {}, zero_error_spec_);
+  auto expected_literal = Literal::CreateR0<float>(1.0f);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, ScalarToSingleElementArray) {
+XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  std::unique_ptr<GlobalData> param0_data =
-      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
-
-  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0");
-  a = builder.Neg(a);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
+                                                 &builder, &parameter);
+  auto a = builder.Neg(parameter);
   auto reshape =
       builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
-  ComputeAndCompareR1<float>(&builder, {-1.0f}, {param0_data.get()},
-                             zero_error_spec_);
+  auto expected_literal = Literal::CreateR1<float>({-1.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, Trivial0x3) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 3));
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
+  Array2D<float> input_array(0, 3);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // TODO(b/29185393): Make this work with the GPU backend. The GPU backend
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
-XLA_TEST_F(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
   ComputationBuilder builder(client_, TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  std::unique_ptr<GlobalData> param0_data =
-      client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
-
-  auto a = builder.Parameter(0, ShapeUtil::MakeShape(F32, {0, 3}), "param0");
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
-                             zero_error_spec_);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, Trivial3x0) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {}, {}, zero_error_spec_);
+  Array2D<float> input_array(3, 0);
+  auto input_literal = Literal::CreateR2FromArray2D(input_array);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses a 2-dimensional row vector to 1 dimension.
-XLA_TEST_F(ReshapeTest, Trivial1x3) {
+XLA_TEST_P(ReshapeTest, Trivial1x3) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0f, 2.0f, 3.0f}});
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f, 2.0f, 3.0f}, {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses a 2-dimensional column vector to 1 dimension.
-XLA_TEST_F(ReshapeTest, Trivial3x1) {
+XLA_TEST_P(ReshapeTest, Trivial3x1) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{0, 1});
-
-  ComputeAndCompareR1<float>(&builder, {1.0f, 2.0f, 3.0f}, {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
+  auto expected_literal = Literal::CreateR1<float>({1.0f, 2.0f, 3.0f});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Splits an empty vector into an empty matrix.
-XLA_TEST_F(ReshapeTest, R1ToR2_0_To_2x0) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(R1ToR2_0_To_2x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({});
-  auto result =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0}, /*new_sizes=*/{2, 0});
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateR1<float>({});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
+                  /*new_sizes=*/{2, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Splits a vector into a matrix.
-XLA_TEST_F(ReshapeTest, R1ToR2_6_To_2x3) {
+XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
-  auto result =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{0}, /*new_sizes=*/{2, 3});
-  Array2D<float> expected_2x3({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
-  ComputeAndCompareR2<float>(&builder, expected_2x3, {}, zero_error_spec_);
+  auto input_literal =
+      Literal::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0},
+                  /*new_sizes=*/{2, 3});
+  auto expected_literal =
+      Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Transposes a 2x0 array to a 0x2 array.
-XLA_TEST_F(ReshapeTest, Reshape0x2To2x0) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Reshape0x2To2x0)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 2));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(2, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 2));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional row vector to a column vector.
-XLA_TEST_F(ReshapeTest, ReshapeRowToCol) {
+XLA_TEST_P(ReshapeTest, ReshapeRowToCol) {
   ComputationBuilder builder(client_, TestName());
   auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*simple);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{3, 1});
+  auto input_literal = Literal::CreateFromArray(*simple);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{3, 1});
 
   auto expected = ReferenceUtil::TransposeArray2D(*simple);
-  ComputeAndCompareR2<float>(&builder, *expected, {}, zero_error_spec_);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional array.
-XLA_TEST_F(ReshapeTest, TransposeAsReshape) {
+XLA_TEST_P(ReshapeTest, TransposeAsReshape) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{3, 4});
-
-  auto expected3x4 = ReferenceUtil::TransposeArray2D(*a4x3);
-  ComputeAndCompareR2<float>(&builder, *expected3x4, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{3, 4});
+
+  auto expected = ReferenceUtil::TransposeArray2D(*a4x3);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Transposes a 0x4 array with ComputationBuilder::Trans.
-XLA_TEST_F(ReshapeTest, Transpose0x4) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Transpose0x4)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 4));
-  auto result = builder.Transpose(a, {1, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(4, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 4));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Transpose(parameter, {1, 0});
+  auto expected_literal = Literal::CreateR2<float>({{}, {}, {}, {}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Transposes a 2-dimensional array with ComputationBuilder::Trans.
-XLA_TEST_F(ReshapeTest, Transpose4x3) {
+XLA_TEST_P(ReshapeTest, Transpose4x3) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Transpose(a, {1, 0});
-
-  auto expected3x4 = ReferenceUtil::TransposeArray2D(*a4x3);
-  ComputeAndCompareR2<float>(&builder, *expected3x4, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Transpose(parameter, {1, 0});
+
+  auto expected = ReferenceUtil::TransposeArray2D(*a4x3);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
 // Reshapes an empty 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitNoShuffleZeroElements) {
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(6, 0));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 3, 0, 0});
-
-  ComputeAndCompareR4<float>(&builder, Array4D<float>(2, 3, 0, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(6, 0));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 3, 0, 0});
+  auto expected_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 0, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, ReshapeR4ToR2ZeroElements) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 3, 4, 0));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1, 2, 3},
-                                /*new_sizes=*/{24, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(24, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
+                  /*new_sizes=*/{24, 0});
+  auto expected_literal = Literal::CreateFromArray(Array2D<float>(24, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitNoShuffle) {
+XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1},
-                                /*new_sizes=*/{2, 6});
-
-  auto expected2x6 = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6);
-  ComputeAndCompareR2<float>(&builder, *expected2x6, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
+                  /*new_sizes=*/{2, 6});
+
+  auto expected = MakeLinspaceArray2D(1.0f, 12.0f, 2, 6);
+  auto expected_literal = Literal::CreateFromArray(*expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-// Reshapes a 2-dimensional array with dimensions that are not just a
-// rearrangement of the originals (split), and reorder the input (shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitAndShuffleZeroElements) {
+// TODO(b/29185393): Make this work with the GPU backend. The GPU backend
+// does not handle zero-sized shapes correctly. Failed last on 2017-11-30
+// with an incorrect result rank.
+//
+XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 6));
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{3, 0});
-
-  ComputeAndCompareR2<float>(&builder, Array2D<float>(3, 0), {},
-                             zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{3, 0});
+  auto expected_literal = Literal::CreateFromArray(Array2D<float>(3, 0));
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
-XLA_TEST_F(ReshapeTest, ReshapeSplitAndShuffle) {
+XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
   ComputationBuilder builder(client_, TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
-  auto a = builder.ConstantR2FromArray2D<float>(*a4x3);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{1, 0},
-                                /*new_sizes=*/{2, 6});
-
-  Array2D<float> expected2x6({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f},
-                              {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}});
-  ComputeAndCompareR2<float>(&builder, expected2x6, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(*a4x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
+                  /*new_sizes=*/{2, 6});
+  Array2D<float> expected({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f},
+                           {8.0f, 11.0f, 3.0f, 6.0f, 9.0f, 12.0f}});
+  auto expected_literal = Literal::CreateFromArray(expected);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // The following tests use the same input 3D array; they test the examples we
 // show for the Reshape operation in the operation_semantics document.
 // TODO(b/34503277): find a way to show this code in the documentation without
 // duplication on the TF documentation server.
-Array3D<int> v_array_for_doc_R3_tests({{{10, 11, 12}, {15, 16, 17}},
-                                       {{20, 21, 22}, {25, 26, 27}},
-                                       {{30, 31, 32}, {35, 36, 37}},
-                                       {{40, 41, 42}, {45, 46, 47}}});
-
-XLA_TEST_F(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{0, 1, 2},
-                                /*new_sizes=*/{24});
-  ComputeAndCompareR1<int>(&builder,
-                           {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
-                            30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47},
-                           {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{0, 1, 2},
-                                /*new_sizes=*/{8, 3});
-  Array2D<int> expected({{10, 11, 12},
-                         {15, 16, 17},
-                         {20, 21, 22},
-                         {25, 26, 27},
-                         {30, 31, 32},
-                         {35, 36, 37},
-                         {40, 41, 42},
-                         {45, 46, 47}});
-  ComputeAndCompareR2<int>(&builder, expected, {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{24});
-  ComputeAndCompareR1<int>(&builder,
-                           {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
-                            15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47},
-                           {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{8, 3});
-  Array2D<int> expected({{10, 20, 30},
-                         {40, 11, 21},
-                         {31, 41, 12},
-                         {22, 32, 42},
-                         {15, 25, 35},
-                         {45, 16, 26},
-                         {36, 46, 17},
-                         {27, 37, 47}});
-  ComputeAndCompareR2<int>(&builder, expected, {});
-}
-
-XLA_TEST_F(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
-  auto v = builder.ConstantR3FromArray3D<int>(v_array_for_doc_R3_tests);
-  auto result = builder.Reshape(/*operand=*/v, /*dimensions=*/{1, 2, 0},
-                                /*new_sizes=*/{2, 6, 2});
-  Array3D<int> expected(
+static Array3D<float> ArrayForDocR3Tests() {
+  return Array3D<float>({{{10, 11, 12}, {15, 16, 17}},
+                         {{20, 21, 22}, {25, 26, 27}},
+                         {{30, 31, 32}, {35, 36, 37}},
+                         {{40, 41, 42}, {45, 46, 47}}});
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
+                  /*new_sizes=*/{24});
+  auto expected_literal = Literal::CreateR1<float>(
+      {10, 11, 12, 15, 16, 17, 20, 21, 22, 25, 26, 27,
+       30, 31, 32, 35, 36, 37, 40, 41, 42, 45, 46, 47});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
+                  /*new_sizes=*/{8, 3});
+  auto expected_literal = Literal::CreateR2<float>({{10, 11, 12},
+                                                    {15, 16, 17},
+                                                    {20, 21, 22},
+                                                    {25, 26, 27},
+                                                    {30, 31, 32},
+                                                    {35, 36, 37},
+                                                    {40, 41, 42},
+                                                    {45, 46, 47}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{24});
+  auto expected_literal = Literal::CreateR1<float>(
+      {10, 20, 30, 40, 11, 21, 31, 41, 12, 22, 32, 42,
+       15, 25, 35, 45, 16, 26, 36, 46, 17, 27, 37, 47});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{8, 3});
+  auto expected_literal = Literal::CreateR2<float>({{10, 20, 30},
+                                                    {40, 11, 21},
+                                                    {31, 41, 12},
+                                                    {22, 32, 42},
+                                                    {15, 25, 35},
+                                                    {45, 16, 26},
+                                                    {36, 46, 17},
+                                                    {27, 37, 47}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
+}
+
+XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
+  ComputationBuilder builder(client_, TestName());
+  auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
+                  /*new_sizes=*/{2, 6, 2});
+  auto expected_literal = Literal::CreateR3<float>(
       {{{10, 20}, {30, 40}, {11, 21}, {31, 41}, {12, 22}, {32, 42}},
        {{15, 25}, {35, 45}, {16, 26}, {36, 46}, {17, 27}, {37, 47}}});
-  ComputeAndCompareR3<int>(&builder, expected, {});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Collapses the low dimensions of a 4D tensor to get a 2D matrix, without
@@ -378,23 +519,26 @@ XLA_TEST_F(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 // Then we collapse Z be collapsed so we just end up with planes:
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
-XLA_TEST_F(ReshapeTest, FullyConnectedCollapse) {
+XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
-  auto a = builder.ConstantR4FromArray4D<float>(t2x2x2x3);
-  auto result = builder.Collapse(/*operand=*/a, /*dimensions=*/{1, 2, 3});
-
-  Array2D<float> expected2x12(
+  auto input_literal = Literal::CreateFromArray(t2x2x2x3);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
+  auto expected_literal = Literal::CreateR2<float>(
       {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
        {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
         6.0f}});
-  ComputeAndCompareR2<float>(&builder, expected2x12, {}, zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // As above, but uses reshape directly.
-XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
+XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   ComputationBuilder builder(client_, TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
@@ -405,51 +549,67 @@ XLA_TEST_F(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 0, 1) = 5;
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
-  auto a = builder.ConstantR4FromArray4D<float>(t);
-  auto result = builder.Reshape(/*operand=*/a, /*dimensions=*/{0, 1, 2, 3},
-                                /*new_sizes=*/{2, 4});
-
-  Array2D<float> expected({{0, 1, 2, 3}, {4, 5, 6, 7}});
-  ComputeAndCompareR2<float>(&builder, expected, {}, zero_error_spec_);
+  auto input_literal = Literal::CreateFromArray(t);
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
+                  /*new_sizes=*/{2, 4});
+
+  auto expected_literal =
+      Literal::CreateR2<float>({{0, 1, 2, 3}, {4, 5, 6, 7}});
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Reshape various ranks to a scalar.
-XLA_TEST_F(ReshapeTest, ToScalar) {
+XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
     ComputationBuilder b(client_, TestName());
-    auto input = Literal::CreateR1<float>({83.0f});
+    auto input_literal = Literal::CreateR1<float>({83.0f});
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
-    *input->mutable_shape() = ShapeUtil::MakeShape(F32, ones);
-    b.Reshape(b.ConstantLiteral(*input), dimensions, {});
+    *input_literal->mutable_shape() = ShapeUtil::MakeShape(F32, ones);
+
+    ComputationDataHandle parameter;
+    auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                   &b, &parameter);
+    b.Reshape(parameter, dimensions, {});
 
-    ComputeAndCompareR0<float>(&b, 83.0f, {}, zero_error_spec_);
+    auto expected_literal = Literal::CreateR0<float>(83.0f);
+    ComputeAndCompareLiteral(&b, *expected_literal, {input.get()},
+                             zero_error_spec_);
   }
 }
 
-XLA_TEST_F(ReshapeTest, BadDimensions) {
+XLA_TEST_P(ReshapeTest, BadDimensions) {
   ComputationBuilder b(client_, TestName());
-  b.Reshape(b.ConstantR1<int32>({1}), {}, {});
-  EXPECT_THAT(ExecuteToString(&b, {}),
-              ::testing::HasSubstr("dimensions not a permutation"));
+  auto input_literal = Literal::CreateR1<float>({1.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
+                                                 &parameter);
+  b.Reshape(parameter, {}, {});
+  EXPECT_THAT(
+      ExecuteToString(&b, {}),
+      ::testing::HasSubstr("not a permutation of the operand dimensions"));
 }
 
-XLA_TEST_F(ReshapeTest, BadNewSizes) {
+XLA_TEST_P(ReshapeTest, BadNewSizes) {
   ComputationBuilder b(client_, TestName());
-  b.Reshape(b.ConstantR1<int32>({1, 2}), {1}, {});
+  auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
+                                                 &parameter);
+  b.Reshape(parameter, {1}, {});
   EXPECT_THAT(ExecuteToString(&b, {}),
               ::testing::HasSubstr("mismatched element counts"));
 }
 
-XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  const Shape parameter_shape = ShapeUtil::MakeShape(F32, {2, 2, 2, 2});
+XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, parameter_shape, "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
-
   // clang-format off
-  auto literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
+  auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
       {
         {0, 1},
@@ -473,8 +633,12 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  std::unique_ptr<GlobalData> input =
-      client_->TransferToServer(*literal).ConsumeValueOrDie();
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8});
+
   Array2D<float> expected_array({
       {0, 1, 2, 3, 100, 101, 102, 103},
       {222, 333, 444, 555, 666, 777, 888, 999},
@@ -483,72 +647,75 @@ XLA_TEST_F(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   Computation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
-      ShapeUtil::MakeShapeWithLayout(F32, {2, 8}, {1, 0});
+      ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
+                                     {1, 0});
   std::unique_ptr<Literal> actual =
       client_
           ->ExecuteAndTransfer(computation, {input.get()}, &execution_options)
           .ConsumeValueOrDie();
   std::unique_ptr<Literal> expected =
       Literal::CreateR2FromArray2D<float>(expected_array);
+  if (use_bfloat16()) {
+    expected = LiteralTestUtil::ConvertF32ToBF16(*expected);
+  }
   LiteralTestUtil::ExpectEqual(*expected, *actual);
 }
 
-XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
+XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
+  ComputationBuilder builder(client_, TestName());
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
 
   // clang-format off
-  Array4D<float> expected = {
+  auto expected_literal = Literal::CreateR4<float>({
     {{{0, 1, 2, 3}},
      {{4, 5, 6, 7}}},
     {{{100, 101, 102, 103}},
      {{104, 105, 106, 107}}},
     {{{200, 201, 202, 203}},
      {{204, 205, 206, 207}}}
-  };
+  });
   // clang-format on
-  ComputeAndCompareR4<float>(&builder, expected, {input_data.get()},
-                             zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
-XLA_TEST_F(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  std::unique_ptr<Literal> input = Literal::CreateR2<float>({
+XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
+  ComputationBuilder builder(client_, TestName());
+  std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
 
   // clang-format off
-  Array4D<float> expected = {
+  auto expected_literal = Literal::CreateR4<float>({
     {{{0, 100, 200, 1}},
      {{101, 201, 2, 102}}},
     {{{202, 3, 103, 203}},
      {{4, 104, 204, 5}}},
     {{{105, 205, 6, 106}},
      {{206, 7, 107, 207}}}
-  };
+  });
   // clang-format on
-  ComputeAndCompareR4<float>(&builder, expected, {input_data.get()},
-                             zero_error_spec_);
+  ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
+XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -558,12 +725,10 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape({2, 1}, {1, 0}, *input_literal);
@@ -571,7 +736,8 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
                            zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
+XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -581,12 +747,10 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape({4, 2}, {1, 0}, *input_literal);
@@ -595,7 +759,8 @@ XLA_TEST_F(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 }
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
-XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
+XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -605,12 +770,11 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 2, 1, 3}, /*new_sizes=*/{5, 60});
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
+                  /*new_sizes=*/{5, 60});
 
   Array2D<float> expected_array(5, 60);
   input.Each([&](tensorflow::gtl::ArraySlice<int64> indices, float* cell) {
@@ -618,10 +782,12 @@ XLA_TEST_F(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
         *cell;
   });
   auto expected = Literal::CreateR2FromArray2D(expected_array);
-  ComputeAndCompareLiteral(&builder, *expected, {input_data.get()});
+  ComputeAndCompareLiteral(&builder, *expected, {input_data.get()},
+                           zero_error_spec_);
 }
 
-XLA_TEST_F(ReshapeTest, NoopReshape) {
+XLA_TEST_P(ReshapeTest, NoopReshape) {
+  ComputationBuilder builder(client_, TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -631,18 +797,17 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
-  ComputationBuilder builder(client_, TestName());
-  auto input = builder.Parameter(0, input_literal->shape(), "input");
-  builder.Reshape(input, /*dimensions=*/{3, 0, 1, 2},
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
   Computation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
-      ShapeUtil::MakeShapeWithLayout(F32, {7, 2, 3, 5}, {2, 3, 0, 1});
+      ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {7, 2, 3, 5},
+                                     {2, 3, 0, 1});
   std::unique_ptr<Literal> output_literal =
       client_
           ->ExecuteAndTransfer(computation, {input_data.get()},
@@ -651,35 +816,45 @@ XLA_TEST_F(ReshapeTest, NoopReshape) {
 
   // Since the reshape is a no-op, verify that it does not change the underlying
   // data.
-  EXPECT_EQ(tensorflow::gtl::ArraySlice<float>(input_literal->f32s()),
-            tensorflow::gtl::ArraySlice<float>(output_literal->f32s()));
+  if (use_bfloat16()) {
+    auto expected = LiteralTestUtil::ConvertF32ToBF16(*input_literal);
+    EXPECT_EQ(tensorflow::gtl::ArraySlice<bfloat16>(expected->bf16s()),
+              tensorflow::gtl::ArraySlice<bfloat16>(output_literal->bf16s()));
+  } else {
+    EXPECT_EQ(tensorflow::gtl::ArraySlice<float>(input_literal->f32s()),
+              tensorflow::gtl::ArraySlice<float>(output_literal->f32s()));
+  }
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR4Reshape_Trivial) {
-  auto literal_1x2x3x4 = Literal::CreateR4(
+XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
+  ComputationBuilder builder(client_, TestName());
+  auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  auto input = builder.ConstantLiteral(*literal_1x2x3x4);
-  builder.Reshape(input, /*dimensions=*/{0, 1, 2, 3},
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
                   /*new_sizes=*/{1, 2, 3, 4});
 
-  ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {});
+  ComputeAndCompareLiteral(&builder, *literal_1x2x3x4, {input.get()});
 }
 
-XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
-  auto literal_1x2x3x4 = Literal::CreateR4(
+XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
+  auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
   ComputationBuilder builder(client_, TestName());
-  auto input = builder.ConstantLiteral(*literal_1x2x3x4);
-  builder.Reshape(input, /*dimensions=*/{1, 3, 2, 0},
+  ComputationDataHandle parameter;
+  auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
+                                                 &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
                   /*new_sizes=*/{2, 4, 3, 1});
 
   // clang-format off
-  auto expected_2x4x3x1 = Literal::CreateR4(
+  auto expected_2x4x3x1 = Literal::CreateR4<float>(
       {{{{1}, {5}, {9}},
         {{2}, {6}, {10}},
         {{3}, {7}, {11}},
@@ -690,10 +865,10 @@ XLA_TEST_F(ReshapeTest, R4ToR4Reshape) {
         {{16}, {20}, {24}}}});
   // clang-format on
 
-  ComputeAndCompareLiteral(&builder, *expected_2x4x3x1, {});
+  ComputeAndCompareLiteral(&builder, *expected_2x4x3x1, {input.get()});
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {2, 2, 2, 2};
@@ -705,12 +880,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -722,7 +897,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeSimple) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {1, 1, 250, 300};
@@ -734,12 +909,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -751,7 +926,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {5, 5, 1, 10};
@@ -763,12 +938,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -780,7 +955,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   // This happens in NN-Builder MNIST.
@@ -793,12 +968,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal)
@@ -810,7 +985,7 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
                            zero_error_spec_, &expected->shape());
 }
 
-XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
+XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   std::vector<int64> bounds = {3, 3, 1, 3};
@@ -822,12 +997,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  std::unique_ptr<GlobalData> input_data =
-      client_->TransferToServer(*input_literal).ConsumeValueOrDie();
-
   ComputationBuilder builder(client_, TestName());
-  auto a = builder.Parameter(0, input_literal->shape(), "a");
-  builder.Reshape(a, /*dimensions=*/{1, 0, 2, 3}, /*new_sizes=*/new_bounds);
+  ComputationDataHandle parameter;
+  auto input_data = CreateParameterAndTransferLiteral(
+      0, *input_literal, "input", &builder, &parameter);
+  builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
+                  /*new_sizes=*/new_bounds);
 
   std::unique_ptr<Literal> expected =
       LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal)
@@ -839,5 +1014,12 @@ XLA_TEST_F(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
                            zero_error_spec_, &expected->shape());
 }
 
+#ifdef XLA_BACKEND_SUPPORTS_BFLOAT16
+INSTANTIATE_TEST_CASE_P(ReshapeTestInstance, ReshapeTest, ::testing::Bool());
+#else
+INSTANTIATE_TEST_CASE_P(ReshapeTestInstance, ReshapeTest,
+                        ::testing::ValuesIn(std::vector<bool>{false}));
+#endif
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/sample_file_test.cc b/tensorflow/compiler/xla/tests/sample_file_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b104f4e37f77d47f56ff8183ee1de1cc22e44d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/sample_file_test.cc
@@ -0,0 +1,51 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This demonstrates how to use hlo_test_base to create a file based testcase
+// and compare results on gpu and cpu.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class SampleFileTest : public HloTestBase {
+ protected:
+  SampleFileTest()
+      : HloTestBase(
+            /*test_platform=*/PlatformUtil::GetPlatform("gpu").ValueOrDie(),
+            /*reference_platform=*/PlatformUtil::GetPlatform("cpu")
+                .ValueOrDie()) {}
+};
+
+TEST_F(SampleFileTest, Convolution) {
+  const string& filename = "compiler/xla/tests/isolated_convolution.hlo";
+  string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+  EXPECT_TRUE(RunAndCompareFromFile(
+      tensorflow::io::JoinPath(test_srcdir, filename), ErrorSpec{0.01}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/sample_text_test.cc b/tensorflow/compiler/xla/tests/sample_text_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4f2b74e3dc9e80f50454b28eb6f2502cef3e681
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/sample_text_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This demonstrates how to use hlo_test_base to create textual IR based
+// testcases.
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+using tensorflow::gtl::nullopt;
+
+class SampleTextTest : public HloTestBase {};
+
+TEST_F(SampleTextTest, Axpy) {
+  const string& hlo_string = R"(
+HloModule axpy_module:
+ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
+  %alpha = f32[] parameter(0)
+  %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={}
+  %x = f32[2,4]{1,0} parameter(1)
+  %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)
+  %y = f32[2,4]{1,0} parameter(2)
+  ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
+}
+)";
+  EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_string, ErrorSpec{0.0001}));
+}
+
+TEST_F(SampleTextTest, Tuple) {
+  const string& hlo_string = R"(
+HloModule TupleCreate_module:
+ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
+  %v1 = f32[] parameter(0)
+  %v2 = f32[3]{0} parameter(1)
+  %v3 = f32[2,3]{1,0} parameter(2)
+  ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3)
+}
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, nullopt));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index c21124750ad512cad69b1483e708613ee2857ac0..4db566f7841829359ea06fe25408048418c547ad 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -211,6 +212,13 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
+  const R1Spec& spec = data.param;
+  return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
+                                       spec.slice_start, spec.slice_limit,
+                                       spec.slice_stride);
+}
+
 XLA_TEST_P(SliceR1Test, DoIt_F32) { Run<float>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_F64) { Run<double>(GetParam()); }
@@ -223,30 +231,66 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
-INSTANTIATE_TEST_CASE_P(                          //
-    SliceR1TestInstantiation,                     //
-    SliceR1Test,                                  //
-    ::testing::Values(                            //
-        R1Spec{10, 0, 0, 1},                      //
-        R1Spec{10, 7, 7, 1},                      //
-        R1Spec{10, 2, 4, 1},                      //
-        R1Spec{10, 2, 4, 2},                      //
-        R1Spec{10, 0, 10, 1},                     //
-        R1Spec{1024, 1024 - 4, 1024, 1},          //
-        R1Spec{4096, 7, 7 + 1024, 1},             //
-        R1Spec{10, 0, 10, 2},                     //
-        R1Spec{10, 0, 10, 3},                     //
-        R1Spec{10, 0, 10, 4},                     //
-        R1Spec{10, 0, 10, 5},                     //
-        R1Spec{10, 0, 10, 10},                    //
-        R1Spec{500, 200, 400, 7},                 //
-        R1Spec{4096, 1, 4095, 3},                 //
-        R1Spec{2047, 1024 - 24, 1024 + 160, 31},  //
-        R1Spec{2047, 1, 2046, 3 * 128},           //
-        R1Spec{4096, 1024 + 3, 4095, 500},        //
-        R1Spec{8192, 0, 8192, 1024 * 3 + 400}     //
-        )                                         //
+// Tests for R1 slice ops.
+// The format for each testcase is {input size, start, limit, stride}.
+// clang-format off
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestInstantiation,
+    SliceR1Test,
+    ::testing::Values(
+        R1Spec{10, 0, 0, 1},
+        R1Spec{10, 7, 7, 1},
+        R1Spec{10, 0, 5, 1},
+        R1Spec{10, 3, 5, 1},
+        R1Spec{10, 0, 10, 1},
+        R1Spec{1024, 0, 5, 1},
+        R1Spec{1024, 3, 5, 1},
+        R1Spec{1024 + 17, 0, 5, 1},
+        R1Spec{1024 + 17, 3, 5, 1},
+        R1Spec{1024 + 17, 1024, 1024 + 6, 1},
+        R1Spec{1024 + 17, 1024 + 1, 1024 + 6, 1},
+        R1Spec{1024, 1024 - 4, 1024, 1},
+        R1Spec{4 * 1024, 7, 7 + 1024, 1},
+        R1Spec{4 * 1024, 0, 4 * 1024, 1},
+        R1Spec{4 * 1024, 1, 4 * 1024 - 1, 1},
+        R1Spec{4 * 1024, 1024, 3 * 1024, 1},
+        R1Spec{4 * 1024, 1024 + 1, 3 * 1024 - 1, 1},
+        R1Spec{16 * 1024, 0, 5, 1},
+        R1Spec{16 * 1024, 3, 5, 1},
+        R1Spec{16 * 1024 + 17, 0, 5, 1},
+        R1Spec{16 * 1024 + 17, 3, 5, 1},
+        R1Spec{16 * 1024 + 17, 16 * 1024, 16 * 1024 + 6, 1},
+        R1Spec{16 * 1024 + 17, 16 * 1024 + 1, 16 * 1024 + 6, 1},
+        R1Spec{16 * 1024, 4 * 1024 - 17, 8 * 1024 - 18, 1},
+        R1Spec{64 * 1024, 0, 64 * 1024, 1},
+        R1Spec{64 * 1024, 1, 64 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 1024, 63 * 1024, 1},
+        R1Spec{64 * 1024, 1024 + 1, 63 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 32 * 1024, 33 * 1024, 1},
+        R1Spec{64 * 1024, 32 * 1024 + 1, 33 * 1024 - 1, 1},
+        R1Spec{64 * 1024, 32 * 1024 - 17, 36 * 1024 - 18, 1},
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
+#endif
+        R1Spec{10, 2, 4, 2},
+        R1Spec{10, 0, 10, 2},
+        R1Spec{10, 0, 10, 3},
+        R1Spec{10, 0, 10, 4},
+        R1Spec{10, 0, 10, 5},
+        R1Spec{10, 0, 10, 10},
+        R1Spec{500, 200, 400, 7},
+        R1Spec{4096, 1, 4095, 3},
+        R1Spec{2047, 1024 - 24, 1024 + 160, 31},
+        R1Spec{2047, 1, 2046, 3 * 128},
+        R1Spec{4096, 1024 + 3, 4095, 500},
+        R1Spec{8192, 0, 8192, 1024 * 3 + 400}
+        ),
+    SliceR1TestDataToString
 );
+// clang-format on
 
 struct R2Spec {
   int64 input_dim0;
diff --git a/tensorflow/compiler/xla/tests/test_macros.cc b/tensorflow/compiler/xla/tests/test_macros.cc
index 173fb1b0008c9e6edaa1902a5eb3ca5f054a2a67..978a669bcab720bddec5c4bcd0144810ba3c8477 100644
--- a/tensorflow/compiler/xla/tests/test_macros.cc
+++ b/tensorflow/compiler/xla/tests/test_macros.cc
@@ -21,12 +21,13 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
 namespace {
 
 // Mapping from test name; i.e. MyTest.MyTestCase to platforms on which it is
-// disabled.
+// disabled - a sequence of regexps.
 using ManifestT = std::unordered_map<string, std::vector<string>>;
 
 ManifestT ReadManifest() {
@@ -66,9 +67,6 @@ ManifestT ReadManifest() {
 
 string PrependDisabledIfIndicated(const string& test_case_name,
                                   const string& test_name) {
-  // TODO(leary): this code reads the manifest for every test case instantiated
-  // in every file. Consider switching to a singleton or using a compile-time
-  // genrule instead.
   ManifestT manifest = ReadManifest();
 
   // First try full match: test_case_name.test_name
@@ -83,11 +81,13 @@ string PrependDisabledIfIndicated(const string& test_case_name,
     }
   }
 
+  // Expect a full match vs. one of the platform regexps to disable the test.
   const std::vector<string>& disabled_platforms = it->second;
   string platform_string = XLA_PLATFORM;
-  if (std::find(disabled_platforms.begin(), disabled_platforms.end(),
-                platform_string) != disabled_platforms.end()) {
-    return "DISABLED_" + test_name;
+  for (const auto& s : disabled_platforms) {
+    if (RE2::FullMatch(/*text=*/platform_string, /*re=*/s)) {
+      return "DISABLED_" + test_name;
+    }
   }
 
   // We didn't hit in the disabled manifest entries, so don't disable it.
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index 3878ac1013ef1459cbe3c92a48fc6149b6a4948e..28a2d0198a707cec1aa5e0fbed341ee9b2a927f7 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -66,8 +66,10 @@ limitations under the License.
 
 namespace xla {
 
-// Reads a disabled manifest file (and retains it as a singleton) to resolve
-// whether test cases should be disabled on a particular platform.
+// Reads a disabled manifest file to resolve whether test cases should be
+// disabled on a particular platform. For a test that should be disabled,
+// returns DISABLED_ prepended to its name; otherwise returns the test name
+// unmodified.
 string PrependDisabledIfIndicated(const string& test_case_name,
                                   const string& test_name);
 
@@ -96,7 +98,8 @@ string PrependDisabledIfIndicated(const string& test_case_name,
                                                     test_name)::test_info_ =  \
       ::testing::internal::MakeAndRegisterTestInfo(                           \
           #test_case_name,                                                    \
-          PrependDisabledIfIndicated(#test_case_name, #test_name).c_str(),    \
+          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)      \
+              .c_str(),                                                       \
           nullptr, nullptr,                                                   \
           ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
           parent_class::SetUpTestCase, parent_class::TearDownTestCase,        \
@@ -135,7 +138,8 @@ string PrependDisabledIfIndicated(const string& test_case_name,
               ::testing::internal::CodeLocation(__FILE__, __LINE__))           \
           ->AddTestPattern(                                                    \
               #test_case_name,                                                 \
-              PrependDisabledIfIndicated(#test_case_name, #test_name).c_str(), \
+              ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)   \
+                  .c_str(),                                                    \
               new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
                   test_case_name, test_name)>());                              \
       return 0;                                                                \
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..780b292d1a9b819f0f37e959cdec019f03b4a595
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -0,0 +1,259 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/service/transfer_manager.h"
+
+namespace xla {
+
+namespace {
+
+template <typename FloatT>
+void PopulateWithRandomFloatingPointData(Literal* literal) {
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<FloatT>());
+  std::minstd_rand0 engine;
+  std::uniform_real_distribution<FloatT> generator(0.0f, 1.0f);
+  TF_CHECK_OK(literal->Populate<FloatT>(
+      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+        return generator(engine);
+      }));
+}
+
+// The standard library does not have a case for bfloat16, unsurprisingly, so we
+// handle that one specially.
+template <>
+void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal) {
+  CHECK_EQ(literal->shape().element_type(), BF16);
+  std::minstd_rand0 engine;
+  std::uniform_real_distribution<float> generator(0.0f, 1.0f);
+  TF_CHECK_OK(literal->Populate<bfloat16>(
+      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+        return static_cast<bfloat16>(generator(engine));
+      }));
+}
+
+template <typename IntT>
+void PopulateWithRandomIntegralData(Literal* literal) {
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<IntT>());
+  std::minstd_rand0 engine;
+  std::uniform_int_distribution<IntT> generator(
+      std::numeric_limits<IntT>::lowest(), std::numeric_limits<IntT>::max());
+  TF_CHECK_OK(literal->Populate<IntT>(
+      [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+        return generator(engine);
+      }));
+}
+
+// Matches binary addition computations.
+bool LooksLikeSum(const HloComputation& computation) {
+  const HloInstruction* const root = computation.root_instruction();
+  return root->opcode() == HloOpcode::kAdd &&
+         computation.num_parameters() == 2 &&
+         root->operand(0)->opcode() == HloOpcode::kParameter &&
+         root->operand(1)->opcode() == HloOpcode::kParameter &&
+         root->operand(0) != root->operand(1);
+}
+
+// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
+// which requires an init_value of 0 rather than a random value.
+bool NeedsZeroInitValue(const HloUse& use) {
+  const HloInstruction* const instruction = use.instruction;
+  const HloOpcode opcode = instruction->opcode();
+  const int64 op_num = use.operand_number;
+  return (
+      ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
+       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
+       LooksLikeSum(*instruction->scatter())));
+}
+
+// Generate random values that are constrained to the input_shape minus the
+// output_shape so as not to produce wrapping slices, for instance.
+std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
+    const Shape& input_shape, const Shape& slice_shape) {
+  const int64 rank = ShapeUtil::Rank(input_shape);
+  std::vector<int32> start_indices(rank);
+  std::minstd_rand0 engine;
+  for (int i = 0; i < rank; ++i) {
+    const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
+                              ShapeUtil::GetDimension(slice_shape, i);
+    std::uniform_int_distribution<int32> generator(0, upper_bound);
+    start_indices[i] = generator(engine);
+  }
+  return Literal::CreateR1<int32>(start_indices);
+}
+
+// Use dataflow analysis on each parameter to see if there are uses that would
+// be problematic when generating input data.  Returns the list of instructions
+// that correspond to their uses.
+//
+// Should be paired with the CreateLiteralForConstrainedUses() function below.
+std::vector<HloInstruction*> FindConstrainedUses(
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+  std::vector<HloInstruction*> constrained_uses;
+  for (const auto& pair : dataflow.GetInstructionValueSet(&param)) {
+    const HloValue& value = dataflow.GetUniqueValueAt(&param, pair.first);
+    for (const HloUse& use : value.uses()) {
+      HloInstruction* instruction = use.instruction;
+      const HloOpcode opcode = instruction->opcode();
+      const int64 op_num = use.operand_number;
+      if ((opcode == HloOpcode::kDynamicSlice && op_num == 1) ||
+          (opcode == HloOpcode::kDynamicUpdateSlice && op_num == 2)) {
+        constrained_uses.push_back(instruction);
+      } else if (opcode == HloOpcode::kFusion) {
+        const HloInstruction* const to_analyze =
+            instruction->fused_parameter(op_num);
+        auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
+        constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
+                                fused_uses.end());
+      } else if (NeedsZeroInitValue(use)) {
+        constrained_uses.push_back(instruction);
+      }
+    }
+  }
+  return constrained_uses;
+}
+
+// Given a parameter, generate a random Literal to use as input if there exist
+// no constrained uses in the dataflow graph.  If such constraints exist,
+// generate a constrained literal (either bounded in the case of indices, or
+// zero in the case of init_values for reductions).
+StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
+    const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
+    const HloInstruction& param) {
+  const auto count = constrained_uses.size();
+  if (count > 1) {
+    return Unimplemented("multiple constrained uses not yet supported");
+  }
+
+  if (count == 0) {
+    return MakeFakeLiteral(param.shape());
+  }
+
+  const HloInstruction* const use = constrained_uses[0];
+  switch (use->opcode()) {
+    case HloOpcode::kDynamicSlice:
+    case HloOpcode::kDynamicUpdateSlice:
+      return MakeRandomNonwrappingSliceIndex(use->operand(0)->shape(),
+                                             use->shape());
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow:
+    case HloOpcode::kSelectAndScatter:
+      return Literal::CreateFromShape(param.shape());
+    default:
+      return Unimplemented("constrained use given; no equivalent literal");
+  }
+}
+
+// Given a module entry parameter, use the dataflow analysis to see if a
+// special case literal must be created, or if we can generate fake data.
+StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
+    const HloDataflowAnalysis& dataflow, const HloInstruction& param) {
+  const auto constrained_uses = FindConstrainedUses(dataflow, param);
+  return CreateLiteralForConstrainedUses(constrained_uses, param);
+}
+
+}  // namespace
+
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
+  if (ShapeUtil::IsTuple(shape)) {
+    std::vector<std::unique_ptr<Literal>> elements;
+    for (const Shape& element_shape : shape.tuple_shapes()) {
+      TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> element,
+                          MakeFakeLiteral(element_shape));
+      elements.push_back(std::move(element));
+    }
+    return Literal::MakeTupleOwned(std::move(elements));
+  }
+  std::unique_ptr<Literal> literal = Literal::CreateFromShape(shape);
+  switch (shape.element_type()) {
+    case BF16:
+      PopulateWithRandomFloatingPointData<bfloat16>(literal.get());
+      break;
+    case F32:
+      PopulateWithRandomFloatingPointData<float>(literal.get());
+      break;
+    case F64:
+      PopulateWithRandomFloatingPointData<double>(literal.get());
+      break;
+    case S8:
+      PopulateWithRandomIntegralData<int8>(literal.get());
+      break;
+    case U8:
+      PopulateWithRandomIntegralData<uint8>(literal.get());
+      break;
+    case S16:
+      PopulateWithRandomIntegralData<int16>(literal.get());
+      break;
+    case U16:
+      PopulateWithRandomIntegralData<uint16>(literal.get());
+      break;
+    case S32:
+      PopulateWithRandomIntegralData<int32>(literal.get());
+      break;
+    case U32:
+      PopulateWithRandomIntegralData<uint32>(literal.get());
+      break;
+    case S64:
+      PopulateWithRandomIntegralData<int64>(literal.get());
+      break;
+    case U64:
+      PopulateWithRandomIntegralData<uint64>(literal.get());
+      break;
+    case PRED: {
+      std::uniform_int_distribution<int> generator(0, 1);
+      std::minstd_rand0 engine;
+      TF_CHECK_OK(literal->Populate<bool>(
+          [&](tensorflow::gtl::ArraySlice<int64> /*indices*/) {
+            return generator(engine);
+          }));
+      break;
+    }
+    default:
+      return Unimplemented("Unsupported type for fake literal generation: %s",
+                           ShapeUtil::HumanString(shape).c_str());
+  }
+  return std::move(literal);
+}
+
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module) {
+  TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(module));
+  const auto params = module->entry_computation()->parameter_instructions();
+  std::vector<std::unique_ptr<Literal>> arguments(params.size());
+  for (int i = 0; i < params.size(); ++i) {
+    TF_ASSIGN_OR_RETURN(arguments[i],
+                        MakeConstrainedArgument(*dataflow, *params[i]));
+  }
+  return std::move(arguments);
+}
+
+Status VerifyHloModule(const perftools::gputools::Platform& platform,
+                       HloModule* const module) {
+  return HloVerifier(
+             std::bind(
+                 &TransferManager::GetByteSizeRequirement,
+                 TransferManager::GetForPlatform(&platform).ConsumeValueOrDie(),
+                 std::placeholders::_1))
+      .Run(module)
+      .status();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index f3a522b05ebae4f1f86d6d7ddbac6e1749d3e286..0fb024ffb074f1c90b75022bc7f5a8b58b03c0c2 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -23,12 +23,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/platform.h"
 
 namespace xla {
-namespace test_utils {
 
 // A class which generates pseudorandom numbers of a given type within a given
 // range. Not cryptographically secure and likely not perfectly evenly
@@ -53,63 +54,23 @@ class PseudorandomGenerator {
   std::mt19937 generator_;
 };
 
-// Convenience function for creating a rank-2 array with arbitrary layout.
-template <typename NativeT>
-std::unique_ptr<Literal> CreateR2LiteralWithLayout(
-    std::initializer_list<std::initializer_list<NativeT>> values,
-    tensorflow::gtl::ArraySlice<int64> minor_to_major) {
-  auto literal = MakeUnique<Literal>();
-  const int64 d0 = values.size();
-  const int64 d1 = values.begin()->size();
-  literal.get()->PopulateWithValue<NativeT>(0, {d0, d1});
-  *literal->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout(minor_to_major);
-  TF_CHECK_OK(ShapeUtil::ValidateShape(literal->shape()));
+// Generates fake data in a literal of the given shape, or returns an error
+// status if the element type is currently unhandled for fake data generation.
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
 
-  int64 dim0 = 0;
-  for (auto inner_list : values) {
-    int64 dim1 = 0;
-    for (auto value : inner_list) {
-      literal.get()->Set({dim0, dim1}, value);
-      ++dim1;
-    }
-    ++dim0;
-  }
-  return literal;
-}
+// Generates a vector of arguments containing fake data. The number, shape and
+// layout of the arguments is appropriate for given HLO module.
+//
+// Will handle special cases such as making sure that indices used for dynamic
+// slices are bounded, reduces that call adds use 0 as an init value, etc.
+StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
+    HloModule* const module);
 
-// Convenience function for creating a rank-3 array with arbitrary layout.
-template <typename NativeT>
-std::unique_ptr<Literal> CreateR3LiteralWithLayout(
-    std::initializer_list<std::initializer_list<std::initializer_list<NativeT>>>
-        values,
-    tensorflow::gtl::ArraySlice<int64> minor_to_major) {
-  auto literal = MakeUnique<Literal>();
-  const int64 d0 = values.size();
-  const int64 d1 = values.begin()->size();
-  const int64 d2 = values.begin()->begin()->size();
-  literal.get()->PopulateWithValue<NativeT>(0, {d0, d1, d2});
-  *literal->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout(minor_to_major);
-  TF_CHECK_OK(ShapeUtil::ValidateShape(literal->shape()));
-
-  int64 dim0 = 0;
-  for (auto inner_list : values) {
-    int64 dim1 = 0;
-    for (auto inner_inner_list : inner_list) {
-      int64 dim2 = 0;
-      for (auto value : inner_inner_list) {
-        literal.get()->Set({dim0, dim1, dim2}, value);
-        ++dim2;
-      }
-      ++dim1;
-    }
-    ++dim0;
-  }
-  return literal;
-}
+// Check that a given module satisfies various constraints before trying to
+// execute it.
+Status VerifyHloModule(const perftools::gputools::Platform& platform,
+                       HloModule* const module);
 
-}  // namespace test_utils
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_TESTS_TEST_UTILS_H_
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2a64749482e5f5a8c5d72034fb7a4eee07baf48
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -0,0 +1,215 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
+#include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class TransferManagerTest : public LocalClientTestBase {
+ protected:
+  TransferManagerTest()
+      : shape_size_fn_([this](const Shape& shape) {
+          return transfer_manager_->GetByteSizeRequirement(shape);
+        }) {}
+
+  ~TransferManagerTest() override = default;
+
+  std::unique_ptr<ScopedShapedBuffer> AllocateDeviceBuffer(const Shape& shape) {
+    return ScopedShapedBuffer::Allocate(
+               shape, GetOrCreateAllocator(local_client_->platform()),
+               /*device_ordinal=*/0, shape_size_fn_)
+        .ValueOrDie();
+  }
+
+ private:
+  std::function<int64(const Shape&)> shape_size_fn_;
+};
+
+XLA_TEST_F(TransferManagerTest, TransferR0U32) {
+  std::unique_ptr<Literal> literal = Literal::CreateR0<uint32>(42);
+  const Shape& shape = literal->shape();
+  auto device_buffer = AllocateDeviceBuffer(shape);
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectR0Equal<uint32>(42, *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferR1F32) {
+  std::unique_ptr<Literal> literal =
+      Literal::CreateR1<float>({1.25f, 2.5f, -17.0f, -20.125f});
+  const Shape& shape = literal->shape();
+  auto device_buffer = AllocateDeviceBuffer(shape);
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectR1Equal<float>({1.25f, 2.5f, -17.0f, -20.125f},
+                                        *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferR1LargeF32) {
+  std::vector<float> test_vector(1024 * 1024);
+  std::iota(test_vector.begin(), test_vector.end(), 0);
+  std::unique_ptr<Literal> literal = Literal::CreateR1<float>(test_vector);
+  const Shape& shape = literal->shape();
+  auto device_buffer = AllocateDeviceBuffer(shape);
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectR1Equal<float>(test_vector, *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferR1U8) {
+  const char* test_string = "0123456789abcdef";
+  std::unique_ptr<Literal> literal = Literal::CreateR1U8(test_string);
+  const Shape& shape = literal->shape();
+  auto device_buffer = AllocateDeviceBuffer(shape);
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  EXPECT_EQ(result->u8s_string(), test_string);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferR2F32) {
+  std::unique_ptr<Literal> literal =
+      Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}});
+  const Shape& shape = literal->shape();
+  auto device_buffer = AllocateDeviceBuffer(shape);
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result);
+}
+
+XLA_TEST_F(TransferManagerTest,
+           TransferR2F32AndChangeLayoutTransferringToDevice) {
+  std::unique_ptr<Literal> literal = Literal::CreateR2WithLayout<float>(
+      {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, LayoutUtil::MakeLayout({0, 1}));
+  const Shape ondevice_shape =
+      ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0});
+  auto device_buffer = AllocateDeviceBuffer(ondevice_shape);
+
+  // Round trip literal through device. Set the on-device layout to something
+  // different than the literal layout.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  EXPECT_FALSE(
+      LayoutUtil::Equal(result->shape().layout(), literal->shape().layout()));
+  LiteralTestUtil::ExpectR2Equal<float>(
+      {{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferTuple) {
+  std::unique_ptr<Literal> literal = Literal::MakeTuple(
+      {Literal::CreateR0<float>(123.0f).get(),
+       Literal::CreateR2<float>({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(),
+       Literal::CreateR1<float>({44.0f, -10.0f, 3333333.3f}).get()});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) {
+  std::unique_ptr<Literal> literal = Literal::MakeTuple({});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
+
+XLA_TEST_F(TransferManagerTest, TransferNestedTuple) {
+  std::unique_ptr<Literal> literal = Literal::MakeTuple(
+      {Literal::CreateR0<float>(123.0f).get(),
+       Literal::MakeTuple(
+           {Literal::CreateR2<float>({{1.0f, 2.0f}, {4.0f, 5.0f}}).get(),
+            Literal::CreateR1<float>({44.0f, -10.0f, 3333333.3f}).get()})
+           .get(),
+       Literal::CreateR1<float>({-10.0f, 123.0f}).get()});
+  auto device_buffer = AllocateDeviceBuffer(literal->shape());
+
+  // Round trip literal through device.
+  ASSERT_IS_OK(transfer_manager_->TransferLiteralToDevice(
+      stream_executor_, *literal, *device_buffer));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> result,
+                          transfer_manager_->TransferLiteralFromDevice(
+                              stream_executor_, *device_buffer));
+
+  LiteralTestUtil::ExpectEqual(*literal, *result);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 4920f17a7ed21d587c15b8deac550d5e5bb566c9..65489cfff19c8fecbdead8a7e295bf9cca56038f 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -180,7 +180,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
-XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
+// TODO(b/68395210): GPU does not tolerate ambiguous top-level buffers.
+XLA_TEST_F(TupleTest, DISABLED_ON_GPU(SelectBetweenPredTuples)) {
   ComputationBuilder b(client_, TestName());
   ComputationDataHandle v1, v2;
 
@@ -444,5 +445,61 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
   ComputeAndCompareR1<float>(&builder, expected, arguments, ErrorSpec(1e-5));
 }
 
+XLA_TEST_F(TupleTest, ComplexTuples) {
+  ComputationBuilder builder(client_, TestName());
+  {
+    Shape c64r0 = ShapeUtil::MakeShape(C64, {});
+    Shape c64r1 = ShapeUtil::MakeShape(C64, {2});
+    Shape c64r2 = ShapeUtil::MakeShape(C64, {3, 2});
+    Shape arg0_shape = ShapeUtil::MakeTupleShape(
+        {c64r0, ShapeUtil::MakeTupleShape({c64r1, c64r2})});
+    auto input0 = builder.Parameter(0, arg0_shape, "input0");
+    auto t0 = builder.GetTupleElement(input0, 0);
+    auto t1 = builder.GetTupleElement(input0, 1);
+    auto t10 = builder.GetTupleElement(t1, 0);
+    auto t11 = builder.GetTupleElement(t1, 1);
+    auto sum = builder.Add(builder.Add(t10, t11, {1}), t0);
+    auto input1 = builder.Parameter(1, c64r1, "input1");
+    auto prod = builder.Mul(input1, sum, {1});
+    builder.Tuple({builder.Tuple({prod, sum}),
+                   builder.ConstantR0<complex64>({123, 456})});
+  }
+
+  std::unique_ptr<GlobalData> arg0 =
+      client_
+          ->TransferToServer(*Literal::MakeTuple(
+              {Literal::CreateR0<complex64>({1, 2}).get(),
+               Literal::MakeTuple(
+                   {Literal::CreateR1<complex64>({{10, 20}, {30, 40}}).get(),
+                    Literal::CreateR2<complex64>(
+                        {{{100, 200}, {300, 400}},
+                         {{1000, 2000}, {3000, 4000}},
+                         {{10000, 20000}, {30000, 40000}}})
+                        .get()})
+                   .get()}))
+          .ConsumeValueOrDie();
+  std::unique_ptr<GlobalData> arg1 =
+      client_
+          ->TransferToServer(*Literal::CreateR1<complex64>({{1, 2}, {1, -2}}))
+          .ConsumeValueOrDie();
+  auto sum = Literal::CreateR2<complex64>({{{111, 222}, {331, 442}},
+                                           {{1011, 2022}, {3031, 4042}},
+                                           {{10011, 20022}, {30031, 40042}}});
+  auto prod = Literal::CreateFromShape(sum->shape());
+  ASSERT_TRUE(prod->Populate<complex64>(
+                      [&sum](tensorflow::gtl::ArraySlice<int64> indexes) {
+                        return sum->Get<complex64>(indexes) *
+                               (indexes[indexes.size() - 1] == 0
+                                    ? complex64(1, 2)
+                                    : complex64(1, -2));
+                      })
+                  .ok());
+  auto expected =
+      Literal::MakeTuple({Literal::MakeTuple({prod.get(), sum.get()}).get(),
+                          Literal::CreateR0<complex64>({123, 456}).get()});
+  ComputeAndCompareTuple(&builder, *expected, {arg0.get(), arg1.get()},
+                         error_spec_);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 71a1b0abee51ba2819daed23208b0da8d5107207..0b3430ee1ee515c2c98c64a947b7a7021c04f22b 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -357,6 +357,109 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
 }
 
+TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
+  std::vector<Shape> shape_elements = {
+      ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
+      ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for N iterations.
+  const int N = 2;
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Gt(builder.ConstantR0<int32>(N), iteration);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable and permute the weights.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto w1 = builder.GetTupleElement(prev, 1);
+    auto w2 = builder.GetTupleElement(prev, 2);
+    auto w3 = builder.GetTupleElement(prev, 3);
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
+       builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
+  auto result = builder.While(condition, body, init);
+  VLOG(2) << "result = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+
+  auto expected_counter = Literal::CreateR0<int32>(N);
+  auto expected_w1 = Literal::CreateR1<float>({1.0f, 1.0f, 1.0f});
+  auto expected_w2 = Literal::CreateR1<float>({2.0f, 2.0f, 2.0f});
+  auto expected_w3 = Literal::CreateR1<float>({3.0f, 3.0f, 3.0f});
+  auto expected = Literal::MakeTuple({expected_counter.get(), expected_w2.get(),
+                                      expected_w3.get(), expected_w1.get()});
+  VLOG(2) << "expected = " << ShapeUtil::HumanString(expected->shape());
+  ComputeAndCompareTuple(&builder, *expected, {}, ErrorSpec(0.0001));
+}
+
+TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
+  std::vector<Shape> shape_elements = {
+      ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(F32, {3}),
+      ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(F32, {3})};
+  Shape result_shape = ShapeUtil::MakeTupleShape(shape_elements);
+
+  // Create a computation for the condition.
+  // Repeat for N iterations.
+  const int N = 2;
+  Computation condition;
+  {
+    ComputationBuilder builder(client_, "condition");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    builder.Gt(builder.ConstantR0<int32>(N), iteration);
+    condition = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a computation for the body.
+  // Add 1 to the iteration variable permute the weights.
+  Computation body;
+  {
+    ComputationBuilder builder(client_, "body");
+    auto prev = builder.Parameter(0, result_shape, "prev");
+    auto iteration = builder.GetTupleElement(prev, 0);
+    auto w1 = builder.GetTupleElement(prev, 1);
+    auto w2 = builder.GetTupleElement(prev, 2);
+    auto w3 = builder.GetTupleElement(prev, 3);
+    auto result = builder.Tuple(
+        {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
+    body = builder.Build().ConsumeValueOrDie();
+  }
+
+  // Create a While node with computations for the condition and the body.
+  ComputationBuilder builder(client_, "while");
+  auto init = builder.Tuple(
+      {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
+       builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
+  auto xla_while = builder.While(condition, body, init);
+
+  auto add12 = builder.Add(builder.GetTupleElement(xla_while, 1),
+                           builder.GetTupleElement(xla_while, 2));
+  auto result = builder.Add(add12, builder.GetTupleElement(xla_while, 3));
+  VLOG(2) << "result = "
+          << ShapeUtil::HumanString(
+                 *builder.GetShape(result).ConsumeValueOrDie());
+  std::vector<float> expected = {6.f, 6.f, 6.f};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
 // Tests a while node when the result type T is a Tuple.
 //
 // tuple<int32, vector<float>> result(0, vector<float>(10, 0.0f));
@@ -808,8 +911,7 @@ TEST_F(WhileTest, WhileWithPrngScalarResult) {
   }
 }
 
-// TODO(b/34969189) Fails with bad AtomicCmpSwap on GPU on 2017-09-11.
-TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithTupleElement)) {
+TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   ComputationBuilder outer(client_, "outer");
@@ -845,8 +947,7 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithTupleElement)) {
                          ErrorSpec(1e-6));
 }
 
-// TODO(b/34969189) Fails with bad AtomicCmpSwap on GPU on 2017-09-11.
-TEST_F(WhileTest, DISABLED_ON_GPU(WhileThatSwapsParameterWithBroadcast)) {
+TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) {
   auto element_shape = ShapeUtil::MakeShape(F32, {2});
 
   ComputationBuilder outer(client_, "outer");
@@ -899,6 +1000,51 @@ TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) {
                              ErrorSpec(1e-6));
 }
 
+// Tests loop where the init value comes from two sources (constant and
+// parameter).
+//
+// int32 result = (0, 1);
+// while (result[0] + result[1] < 30) {
+//   result[0] = result[0] + 1;
+//   result[1] = result[1] + 1;
+// }
+TEST_F(WhileTest, WhileWithMixedTupleElements) {
+  auto result_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
+
+  ComputationBuilder outer(client_, "outer");
+  auto p =
+      outer.Tuple({outer.ConstantR0<int32>(0),
+                   outer.Parameter(0, ShapeUtil::MakeShape(S32, {}), "t")});
+
+  ComputationBuilder cond(client_, "cond");
+  auto params = cond.Parameter(0, result_shape, "prev");
+  auto cond_t = cond.Add(cond.GetTupleElement(params, 1),
+                         cond.GetTupleElement(params, 0));
+  cond.Lt(cond_t, cond.ConstantR0<int32>(30));
+
+  ComputationBuilder body(client_, "body");
+  auto body_t = body.Parameter(0, result_shape, "t");
+
+  auto tuple = body.Tuple(
+      {body.Add(body.GetTupleElement(params, 0), body.ConstantR0<int32>(1)),
+       body.Add(body.GetTupleElement(params, 1), body.ConstantR0<int32>(1))});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build());
+  outer.While(cond_computation, body_computation, p);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> parameter_data,
+      client_->TransferToServer(*Literal::CreateR0<int32>(1)));
+
+  auto add1 = Literal::CreateR0<int32>(15);
+  auto add2 = Literal::CreateR0<int32>(16);
+  auto expected = Literal::MakeTuple({add1.get(), add2.get()});
+  ComputeAndCompareTuple(&outer, *expected, {parameter_data.get()},
+                         ErrorSpec(1e-6));
+}
+
 // Tests nested while loops.
 //
 // int32 result = 0;
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 759921dce5acf3cd23a121776f3ab0731c9bb623..091fa0c3ec807a66449eca0bfbb141285b8eb532 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -88,6 +88,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:testing",
         "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index c84ca9fc833881ce49bcaad5dd85394145151912..97aacf6b39f83978e732060817cd93ede81ca782 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -34,9 +34,9 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
         "//tensorflow/core:regexp_internal",
     ],
@@ -48,6 +48,7 @@ cc_library(
     hdrs = ["hlo_parser.h"],
     deps = [
         ":hlo_lexer",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
@@ -64,6 +65,7 @@ tf_cc_test(
     srcs = ["hlo_parser_test.cc"],
     deps = [
         ":hlo_parser",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/tools/parser/README.md
index 2feaa49db86ea700cab0b794ec441b95ac03b468..6232967f5f04cbf316d985357ae84c28335531e2 100644
--- a/tensorflow/compiler/xla/tools/parser/README.md
+++ b/tensorflow/compiler/xla/tools/parser/README.md
@@ -43,14 +43,22 @@ operand
   : shape name
   ;
 
-extra_attributes
+attributes
   : /*empty*/
-  | ',' extra_attribute
-  | ',' extra_attribute extra_attributes
+  | ',' attribute
+  | ',' attribute attributes
   ;
-extra_attribute
+attribute
   : attribute_name attribute_value
   ;
+attribute_value
+  : kInt
+  | kName
+  | [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}                /*dim_labels_pattern*/
+  | [0-9]+(x[0-9]+)+                                    /*dxd_pattern*/
+  | [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*  /*pad_pattern*/
+  | '{' sub_attributes '}'
+  ;
 
 param_list
   : '(' param_list1 ')'
@@ -82,4 +90,25 @@ identifier
   : [a-zA-Z_][a-zA-Z0-9_.-]*
   ;
 
+/* literal is in the right hand side of a constant instruction. */
+literal
+  : tuple
+  | non_tuple
+  ;
+tuple
+  : shape '(' literal_list ')'
+  ;
+literal_list
+  : /*empty*/
+  : literal
+  | literal_list ',' literal
+  ;
+non_tuple
+  : rank01
+  | rank2345
+  ;
+rank2345
+  : shape nested_array
+  ;
+
 ```
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
index 486df6854016d2d796781d722e6a6a27273e1cf3..459d511e90d87537f3a3404b82df7b28b1fe08bd 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
@@ -122,7 +123,7 @@ TokKind HloLexer::LexToken() {
           current_ptr_++;
           return TokKind::kArrow;
         }
-        return LexDigitOrNegative();
+        return LexNumberOrPattern();
       case '=':
         return TokKind::kEqual;
       case ',':
@@ -143,22 +144,29 @@ TokKind HloLexer::LexToken() {
         return TokKind::kLparen;
       case ')':
         return TokKind::kRparen;
+      case '/':
+        return LexComment();
+      case '"':
+        return LexString();
     }
   }
 }
 
-// Lex a shape, name, keyword, or opcode.
+// Lex a shape, name, keyword, attribute name, the dim labels pattern, and
+// other identifiers.
+//
 // shape    ::= ([a-zA-Z0-9_]*[0-9]*)\[([0-9,]*)\](?:\s*{([0-9,]*)})?
 // name     ::= [a-zA-Z_][a-zA-Z0-9_.-]*:
 // keyword  ::= HloModule, ENTRY, ...
-// opcode   ::= add, greater-than, ...
 // attribute_name ::= condition, body, dimensions, ...
+// dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+// identifiers ::= other cases that match [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexIdentifier() {
   {
     auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
     // 'consumable' will be advanced iff its prefix matches the pattern.
     static LazyRE2 shape_pattern = {
-        R"(^(\w*\d*)\[([\d,]*)\](?:\s*{([\d,]*)})?)"};
+        R"(^(\w*\d*)\[([\d,]*)\](?:{([\d,]*)})?)"};
     if (RE2::Consume(&consumable, *shape_pattern)) {
       auto status_or_shape = ShapeUtil::ParseShapeString(
           StringPieceFromPointers(token_start_, consumable.begin()));
@@ -201,6 +209,8 @@ TokKind HloLexer::LexIdentifier() {
 
   KEYWORD(true);
   KEYWORD(false);
+  KEYWORD(inf);
+  KEYWORD(nan);
   KEYWORD(HloModule);
   KEYWORD(ENTRY);
   KEYWORD(ROOT);
@@ -209,15 +219,19 @@ TokKind HloLexer::LexIdentifier() {
 
 #undef KEYWORD
 
-  // See if this is an opcode.
-  auto opcode = StringToHloOpcode(identifier.ToString());
-  if (opcode.ok()) {
-    opcode_val_ = opcode.ValueOrDie();
-    return TokKind::kOpcode;
+  {
+    auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+    static LazyRE2 dim_labels_pattern = {
+        R"([0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,})"};
+    if (RE2::Consume(&consumable, *dim_labels_pattern)) {
+      current_ptr_ = consumable.begin();
+      str_val_.assign(token_start_, current_ptr_);
+      return TokKind::kDimLabels;
+    }
   }
 
-  current_ptr_ = token_start_ + 1;
-  return TokKind::kError;
+  str_val_ = identifier.ToString();
+  return TokKind::kIdent;
 }
 
 // Lex names after a % character.
@@ -236,14 +250,20 @@ TokKind HloLexer::LexPercent() {
   return TokKind::kError;
 }
 
-// Lex integer and floating-point values.
-// int             [-]?[0-9]+
-// fp with exp     [-]?([0-9]+|[0-9]+[.][0-9]*|[0-9]*[.][0-9]+)([eE][+-]?[0-9]+)
-// fp without exp  [-]?([0-9]+[.][0-9]*|[0-9]*[.][0-9]+)
-TokKind HloLexer::LexDigitOrNegative() {
+// Lex integer and floating-point values, -inf, and patterns for dim labels,
+// dxd (e.g. 1x2x3), and pad.
+//
+// fp with exp ::= [-]?([0-9]+|[0-9]+[.][0-9]*|[0-9]*[.][0-9]+)([eE][+-]?[0-9]+)
+// fp without exp ::= [-]?([0-9]+[.][0-9]*|[0-9]*[.][0-9]+)
+// dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+// dxd_pattern ::= [0-9]+(x[0-9]+)+
+// pad_pattern ::= [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+// int ::=  [-]?[0-9]+
+// negative inf ::= '-inf'
+TokKind HloLexer::LexNumberOrPattern() {
   auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
   static LazyRE2 float_pattern = {
-      R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|(\d+[.]\d*|\d*[.]\d+))"};
+      R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"};
   if (RE2::Consume(&consumable, *float_pattern)) {
     current_ptr_ = consumable.begin();
     tensorflow::strings::safe_strtod(string(token_start_, current_ptr_).c_str(),
@@ -251,6 +271,30 @@ TokKind HloLexer::LexDigitOrNegative() {
     return TokKind::kDecimal;
   }
 
+  static LazyRE2 dim_labels_pattern = {
+      R"([0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,})"};
+  static LazyRE2 dxd_pattern = {R"([0-9]+(x[0-9]+)+)"};
+  static LazyRE2 pad_pattern = {
+      R"([0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*)"};
+
+  if (RE2::Consume(&consumable, *dim_labels_pattern)) {
+    current_ptr_ = consumable.begin();
+    str_val_.assign(token_start_, current_ptr_);
+    return TokKind::kDimLabels;
+  }
+
+  if (RE2::Consume(&consumable, *dxd_pattern)) {
+    current_ptr_ = consumable.begin();
+    str_val_.assign(token_start_, current_ptr_);
+    return TokKind::kDxD;
+  }
+
+  if (RE2::Consume(&consumable, *pad_pattern)) {
+    current_ptr_ = consumable.begin();
+    str_val_.assign(token_start_, current_ptr_);
+    return TokKind::kPad;
+  }
+
   static LazyRE2 int_pattern = {R"([-]?\d+)"};
   if (RE2::Consume(&consumable, *int_pattern)) {
     current_ptr_ = consumable.begin();
@@ -259,23 +303,154 @@ TokKind HloLexer::LexDigitOrNegative() {
     return TokKind::kInt;
   }
 
+  static LazyRE2 neg_inf = {"-inf"};
+  if (RE2::Consume(&consumable, *neg_inf)) {
+    current_ptr_ = consumable.begin();
+    return TokKind::kNegInf;
+  }
+
   return TokKind::kError;
 }
 
-StringPiece HloLexer::GetCurrentLine() const {
-  const char* start = token_start_;
-  const char* end = current_ptr_;
-  if (!CanDereference(start) || !CanDereference(end)) {
-    return "LINE OUT OF RANGE";
+std::pair<unsigned, unsigned> HloLexer::GetLineAndColumn(LocTy location) const {
+  unsigned line_no = 1;
+  const char* start = buf_.begin();
+  const char* ptr = start;
+  if (line_no_cache_.last_query && CanDereference(line_no_cache_.last_query) &&
+      line_no_cache_.last_query <= location) {
+    ptr = line_no_cache_.last_query;
+    line_no = line_no_cache_.line_no_of_query;
+  }
+  for (; ptr != location; ptr++) {
+    if (*ptr == '\n') {
+      line_no++;
+    }
   }
-  while (start > buf_.begin() && *start != '\n') {
-    start--;
+
+  // Update the line number cache.
+  line_no_cache_.last_query = ptr;
+  line_no_cache_.line_no_of_query = line_no;
+  size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n');
+  if (line_offset == StringPiece::npos) {
+    line_offset = 0;
   }
-  while (end < buf_.end() && *end != '\n') {
-    end++;
+  return {line_no, ptr - start - line_offset};
+}
+
+StringPiece HloLexer::GetLine(LocTy loc) const {
+  if (!CanDereference(loc)) {
+    return "LINE OUT OF RANGE";
   }
+  size_t line_start =
+      StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n');
+  const char* start = line_start == StringPiece::npos
+                          ? buf_.begin()
+                          : buf_.begin() + line_start + 1;
+  size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n');
+  const char* end = line_end == StringPiece::npos ? buf_.end() : loc + line_end;
+
   return StringPieceFromPointers(start, end);
 }
 
+TokKind HloLexer::LexComment() {
+  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+  static LazyRE2 comment_pattern = {R"(\/\*.*?\*\/)"};
+  if (RE2::Consume(&consumable, *comment_pattern)) {
+    current_ptr_ = consumable.begin();
+    return TokKind::kComment;
+  }
+  return TokKind::kError;
+}
+
+// Lexes quoted string with escaping characters. If matched, the quoted string
+// will be unescaped and stored to str_val_.
+TokKind HloLexer::LexString() {
+  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+  static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"};
+  if (RE2::Consume(&consumable, *escaping_pattern)) {
+    current_ptr_ = consumable.begin();
+    StringPiece raw =
+        StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1);
+    string error;
+    if (!tensorflow::str_util::CUnescape(raw, &str_val_, &error)) {
+      LOG(ERROR) << "Failed unescaping string: " << raw << ". error: " << error;
+      return TokKind::kError;
+    }
+    return TokKind::kString;
+  }
+  return TokKind::kError;
+}
+
+string TokKindToString(TokKind kind) {
+  switch (kind) {
+    case TokKind::kEof:
+      return "kEof";
+    case TokKind::kError:
+      return "kError";
+    case TokKind::kEqual:
+      return "kEqaul";
+    case TokKind::kComma:
+      return "kComma";
+    case TokKind::kColon:
+      return "kColon";
+    case TokKind::kLsquare:
+      return "kLsquare";
+    case TokKind::kRsquare:
+      return "kRsquare";
+    case TokKind::kLbrace:
+      return "kLbrace";
+    case TokKind::kRbrace:
+      return "kRbrace";
+    case TokKind::kLparen:
+      return "kLparen";
+    case TokKind::kRparen:
+      return "kRparen";
+    case TokKind::kArrow:
+      return "kArrow";
+    case TokKind::kComment:
+      return "kComment";
+    case TokKind::kw_HloModule:
+      return "kw_HloModule";
+    case TokKind::kw_ENTRY:
+      return "kw_ENTRY";
+    case TokKind::kw_ROOT:
+      return "kw_ROOT";
+    case TokKind::kw_true:
+      return "kw_true";
+    case TokKind::kw_false:
+      return "kw_false";
+    case TokKind::kw_maximal:
+      return "kw_maximal";
+    case TokKind::kw_replicated:
+      return "kw_replicated";
+    case TokKind::kw_nan:
+      return "kw_nan";
+    case TokKind::kw_inf:
+      return "kw_inf";
+    case TokKind::kNegInf:
+      return "kNegInf";
+    case TokKind::kName:
+      return "kName";
+    case TokKind::kAttributeName:
+      return "kAttributeName";
+    case TokKind::kDimLabels:
+      return "kDimLabels";
+    case TokKind::kDxD:
+      return "kDxD";
+    case TokKind::kPad:
+      return "kPad";
+    case TokKind::kIdent:
+      return "kIdent";
+    case TokKind::kString:
+      return "kString";
+    case TokKind::kShape:
+      return "kShape";
+    case TokKind::kInt:
+      return "kInt";
+    case TokKind::kDecimal:
+      return "kDecimal";
+  }
+}
+
 }  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
index 433a3a3601e969de154d2f463f650f5f0b07a49f..27880b9b8afbfa58abfedc3b2cecd5236b78a6d6 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string>
 
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_token.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/logging.h"
@@ -37,11 +37,17 @@ class HloLexer {
   }
 
   TokKind Lex() { return current_kind_ = LexToken(); }
+
   TokKind GetKind() const { return current_kind_; }
   string GetStrVal() const {
     switch (GetKind()) {
       case TokKind::kName:
       case TokKind::kAttributeName:
+      case TokKind::kDimLabels:
+      case TokKind::kDxD:
+      case TokKind::kPad:
+      case TokKind::kString:
+      case TokKind::kIdent:
         return str_val_;
       default:
         LOG(FATAL) << "This token does not have string value";
@@ -51,10 +57,6 @@ class HloLexer {
     CHECK(GetKind() == TokKind::kShape);
     return shape_val_;
   }
-  HloOpcode GetOpcodeVal() const {
-    CHECK(GetKind() == TokKind::kOpcode);
-    return opcode_val_;
-  }
   int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
     return int64_val_;
@@ -64,8 +66,16 @@ class HloLexer {
     return decimal_val_;
   }
 
-  // Returns the line of text that is currently being lexed.
-  tensorflow::StringPiece GetCurrentLine() const;
+  typedef const char* LocTy;
+
+  // Returns the location of the current token.
+  LocTy GetLoc() const { return token_start_; }
+
+  // Returns the line and column of a location in the buffer.
+  std::pair<unsigned, unsigned> GetLineAndColumn(LocTy location) const;
+
+  // Returns the whole line given the location.
+  tensorflow::StringPiece GetLine(LocTy loc) const;
 
  private:
   // Returns the current character. If it's neither the end of input buffer nor
@@ -92,7 +102,9 @@ class HloLexer {
   TokKind LexPercent();
   TokKind LexShape();
   TokKind LexConstant();
-  TokKind LexDigitOrNegative();
+  TokKind LexNumberOrPattern();
+  TokKind LexComment();
+  TokKind LexString();
 
   const tensorflow::StringPiece buf_;
   const char* current_ptr_;
@@ -102,9 +114,15 @@ class HloLexer {
   TokKind current_kind_;
   string str_val_;
   Shape shape_val_;
-  HloOpcode opcode_val_;
   int64 int64_val_;
   double decimal_val_;
+
+  struct LineNoCacheTy {
+    const char* last_query;
+    unsigned line_no_of_query;
+  };
+  // This caches the line number of the previous query.
+  mutable LineNoCacheTy line_no_cache_{nullptr, 0};
 };
 
 }  // namespace tools
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 5dd8ec6636ecca6f34fff39f285454ee0764a8ad..457b6557836bb2767ce9d05c4494855a0944ca60 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -15,9 +15,13 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
 namespace tools {
@@ -25,12 +29,22 @@ namespace tools {
 namespace {
 
 using tensorflow::StringPiece;
+using tensorflow::gtl::optional;
+using tensorflow::str_util::Split;
+using tensorflow::str_util::SplitAndParseAsInts;
+using tensorflow::strings::Printf;
+using tensorflow::strings::StrAppend;
 using tensorflow::strings::StrCat;
 
+const double kF16max = 65504;
+
 // Parser for the HloModule::ToString() format text.
 class HloParser {
  public:
-  explicit HloParser(StringPiece str) : lexer_(str) {}
+  using LocTy = HloLexer::LocTy;
+
+  explicit HloParser(StringPiece str, const HloModuleConfig& config)
+      : lexer_(str), config_(config) {}
 
   // Runs the parser. Returns false if an error occurred.
   bool Run();
@@ -49,42 +63,146 @@ class HloParser {
   bool ParseInstructionList(HloComputation::Builder* builder,
                             string* root_name);
   bool ParseInstruction(HloComputation::Builder* builder, string* root_name);
-  bool ParseSharding(HloInstruction* instruction);
+  bool ParseControlPredecessors(HloInstruction* instruction);
   bool ParseLiteral(std::unique_ptr<Literal>* literal, const Shape& shape);
+  bool ParseTupleLiteral(std::unique_ptr<Literal>* literal, const Shape& shape);
+  bool ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
+                            const Shape& shape);
+  // Sets the sub-value of literal at the given index to the given value. The
+  // literal's shape must have the default layout.
+  bool SetValueInLiteral(int64 value, int64 linear_index, Literal* literal);
+  bool SetValueInLiteral(double value, int64 linear_index, Literal* literal);
+  bool SetValueInLiteral(bool value, int64 linear_index, Literal* literal);
+  template <typename LiteralNativeT, typename ParsedElemT>
+  bool SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
+                               Literal* literal);
+
   bool ParseOperands(std::vector<HloInstruction*>* operands);
-  // Fill parsed operands into 'operands' and expect a certain number of
+  // Fills parsed operands into 'operands' and expects a certain number of
   // operands.
   bool ParseOperands(std::vector<HloInstruction*>* operands,
                      const int expected_size);
 
-  template <typename T>
-  bool ParseExtraAttribute(T* value, const string& expected_attribute);
-  template <typename T>
-  bool ParseAttributeValue(T* value);
+  // Describes the start, limit, and stride on every dimension of the operand
+  // being sliced.
+  struct SliceRanges {
+    std::vector<int64> starts;
+    std::vector<int64> limits;
+    std::vector<int64> strides;
+  };
+
+  // Types of attributes.
+  enum class AttrTy {
+    kInt64,
+    kInt32,
+    kFloat,
+    kString,
+    kBracedInt64List,
+    kHloComputation,
+    kWindow,
+    kConvolutionDimensionNumbers,
+    kSharding,
+    kInstructionList,
+    kSliceRanges,
+    kPaddingConfig,
+    kMetadata,
+    kFusionKind,
+    kDistribution,
+  };
+
+  struct AttrConfig {
+    bool required;     // whether it's required or optional
+    AttrTy attr_type;  // what type it is
+    void* result;      // where to store the parsed result.
+  };
+
+  // attributes ::= (',' attribute)*
+  //
+  // Parses attributes given names and configs of the attributes. Each parsed
+  // result is passed back through the result pointer in corresponding
+  // AttrConfig. Note that the result pointer must point to a optional<T> typed
+  // variable which outlives this function. Returns false on error. You should
+  // not use the any of the results if this function failed.
+  //
+  // Example usage:
+  //
+  //  std::unordered_map<string, AttrConfig> attrs;
+  //  optional<int64> foo;
+  //  attrs["foo"] = {/*required=*/false, AttrTy::kInt64, &foo};
+  //  optional<Window> bar;
+  //  attrs["bar"] = {/*required=*/true, AttrTy::kWindow, &bar};
+  //  if (!ParseAttributes(attrs)) {
+  //    return false; // Do not use 'foo' 'bar' if failed.
+  //  }
+  //  // Do something with 'bar'.
+  //  if (foo) { // If attr foo is seen, do something with 'foo'. }
+  //
+  bool ParseAttributes(const std::unordered_map<string, AttrConfig>& attrs);
+
+  // sub_attributes ::= '{' (','? attribute)* '}'
+  //
+  // Usage is the same as ParseAttributes. See immediately above.
+  bool ParseSubAttributes(const std::unordered_map<string, AttrConfig>& attrs);
+
+  // Parses one attribute. If it has already been seen, return error. Returns
+  // true and adds to seen_attrs on success.
+  //
+  // Do not call this except in ParseAttributes or ParseSubAttributes.
+  bool ParseAttributeHelper(const std::unordered_map<string, AttrConfig>& attrs,
+                            std::unordered_set<string>* seen_attrs);
+
+  // Parses a name and finds the corresponding hlo computation.
+  bool ParseComputationName(HloComputation** value);
+  // Parses a list of names and finds the corresponding hlo instructions.
+  bool ParseInstructionNames(std::vector<HloInstruction*>* instructions);
+  bool ParseWindow(Window* window);
+  bool ParseConvolutionDimensionNumbers(ConvolutionDimensionNumbers* dnums);
+  bool ParsePaddingConfig(PaddingConfig* padding);
+  bool ParseMetadata(OpMetadata* metadata);
+  bool ParseSharding(OpSharding* sharding);
+  bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
+
+  // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3.
+  bool ParseDxD(const string& name, std::vector<int64>* result);
+  // Parses window's pad sub-attriute, e.g., pad=0_0x3x3.
+  bool ParseWindowPad(std::vector<std::vector<int64>>* pad);
+
+  bool ParseSliceRanges(SliceRanges* result);
+  bool ParseInt64List(const TokKind start, const TokKind end,
+                      const TokKind delim, std::vector<int64>* result);
 
   bool ParseParamList();
   bool ParseName(string* result);
   bool ParseAttributeName(string* result);
+  bool ParseString(string* result);
   bool ParseShape(Shape* result);
   bool ParseOpcode(HloOpcode* result);
+  bool ParseFusionKind(HloInstruction::FusionKind* result);
+  bool ParseRandomDistribution(RandomDistribution* result);
   bool ParseInt64(int64* result);
-  bool ParseDecimal(double* result);
+  bool ParseDouble(double* result);
   bool ParseBool(bool* result);
   bool ParseToken(TokKind kind, const string& msg);
 
   // Logs the current parsing line and the given message. Always returns false.
   bool TokenError(StringPiece msg);
+  bool Error(LocTy loc, StringPiece msg);
 
   // If the current token is 'kind', eats it (i.e. lexes the next token) and
   // returns true.
   bool EatIfPresent(TokKind kind);
+  // Parses a shape, and returns true if the result is compatible with the given
+  // shape.
+  bool EatShapeAndCheckCompatible(const Shape& shape);
 
   // Adds the instruction to the pool. Returns false and emits an error if the
   // instruction already exists.
-  bool AddInstruction(const string& name, HloInstruction* instruction);
+  bool AddInstruction(const string& name, HloInstruction* instruction,
+                      LocTy name_loc);
   // Adds the computation to the pool. Returns false and emits an error if the
   // computation already exists.
-  bool AddComputation(const string& name, HloComputation* computation);
+  bool AddComputation(const string& name, HloComputation* computation,
+                      LocTy name_loc);
 
   // The map from the instruction name to the instruction. This does not own the
   // instructions.
@@ -93,15 +211,29 @@ class HloParser {
 
   HloLexer lexer_;
   std::unique_ptr<HloModule> module_;
+  const HloModuleConfig config_;
   std::vector<string> error_;
 };
 
-bool HloParser::TokenError(StringPiece msg) {
-  error_.push_back(
-      StrCat("was parsing \"", lexer_.GetCurrentLine(), "\"; ", msg));
+bool HloParser::Error(LocTy loc, StringPiece msg) {
+  auto line_col = lexer_.GetLineAndColumn(loc);
+  const unsigned line = line_col.first;
+  const unsigned col = line_col.second;
+  std::vector<string> error_lines;
+  error_lines.push_back(
+      StrCat("was parsing ", line, ":", col, ": error: ", msg));
+  error_lines.push_back(lexer_.GetLine(loc).ToString());
+  error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^"));
+
+  error_.push_back(tensorflow::str_util::Join(error_lines, "\n"));
+  VLOG(1) << "Error: " << error_.back();
   return false;
 }
 
+bool HloParser::TokenError(StringPiece msg) {
+  return Error(lexer_.GetLoc(), msg);
+}
+
 bool HloParser::Run() {
   lexer_.Lex();
   return ParseHloModule();
@@ -120,7 +252,7 @@ bool HloParser::ParseHloModule() {
     return false;
   }
 
-  module_ = MakeUnique<HloModule>(name);
+  module_ = MakeUnique<HloModule>(name, config_);
 
   return ParseComputations();
 }
@@ -139,6 +271,7 @@ bool HloParser::ParseComputations() {
 bool HloParser::ParseComputation() {
   const bool is_entry_computation = EatIfPresent(TokKind::kw_ENTRY);
   string name;
+  LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name)) {
     return false;
   }
@@ -159,6 +292,7 @@ bool HloParser::ParseComputation() {
     LOG(FATAL) << "instruction " << root_name
                << " was marked as ROOT but the parser has not seen it before";
   }
+
   // Now root can be either an existing instruction or a nullptr. If it's a
   // nullptr, the implementation of Builder will set the last instruction as
   // root instruction.
@@ -166,7 +300,7 @@ bool HloParser::ParseComputation() {
       is_entry_computation
           ? module_->AddEntryComputation(builder->Build(root))
           : module_->AddEmbeddedComputation(builder->Build(root));
-  return AddComputation(name, computation);
+  return AddComputation(name, computation, name_loc);
 }
 
 // instruction_list ::= '{' instruction_list1 '}'
@@ -186,7 +320,7 @@ bool HloParser::ParseInstructionList(HloComputation::Builder* builder,
                     "expects '}' at the end of instruction list.");
 }
 
-// instruction ::= ('ROOT')? name '=' shape opcode operands (extra_attribute)*
+// instruction ::= ('ROOT')? name '=' shape opcode operands (attribute)*
 bool HloParser::ParseInstruction(HloComputation::Builder* builder,
                                  string* root_name) {
   string name;
@@ -194,6 +328,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   HloOpcode opcode;
   std::vector<HloInstruction*> operands;
   bool is_root = EatIfPresent(TokKind::kw_ROOT);
+
+  const LocTy name_loc = lexer_.GetLoc();
   if (!ParseName(&name) ||
       !ParseToken(TokKind::kEqual, "expects '=' in instruction") ||
       !ParseShape(&shape) || !ParseOpcode(&opcode)) {
@@ -202,6 +338,17 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   if (is_root) {
     *root_name = name;
   }
+
+  // Add optional attributes.
+  std::unordered_map<string, AttrConfig> attrs;
+  optional<OpSharding> sharding;
+  attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding};
+  optional<std::vector<HloInstruction*>> predecessors;
+  attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList,
+                                   &predecessors};
+  optional<OpMetadata> metadata;
+  attrs["metadata"] = {/*required=*/false, AttrTy::kMetadata, &metadata};
+
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
@@ -209,7 +356,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       if (!ParseToken(TokKind::kLparen,
                       "expects '(' before parameter number") ||
           !ParseInt64(&parameter_number) ||
-          !ParseToken(TokKind::kRparen, "expects ')' after parameter number")) {
+          !ParseToken(TokKind::kRparen, "expects ')' after parameter number") ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
@@ -221,7 +369,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       if (!ParseToken(TokKind::kLparen,
                       "expects '(' before constant literal") ||
           !ParseLiteral(&literal, shape) ||
-          !ParseToken(TokKind::kRparen, "expects ')' after constant literal")) {
+          !ParseToken(TokKind::kRparen, "expects ')' after constant literal") ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
@@ -247,7 +396,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kSin:
     case HloOpcode::kSort:
     case HloOpcode::kTanh: {
-      if (!ParseOperands(&operands, /*expected_size=*/1)) {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
@@ -277,7 +427,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical: {
-      if (!ParseOperands(&operands, /*expected_size=*/2)) {
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(HloInstruction::CreateBinary(
@@ -287,7 +438,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     // Ternary ops.
     case HloOpcode::kClamp:
     case HloOpcode::kSelect: {
-      if (!ParseOperands(&operands, /*expected_size=*/3)) {
+      if (!ParseOperands(&operands, /*expected_size=*/3) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(HloInstruction::CreateTernary(
@@ -296,23 +448,34 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     }
     // Other supported ops.
     case HloOpcode::kConvert: {
-      if (!ParseOperands(&operands, /*expected_size=*/1)) {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
           HloInstruction::CreateConvert(shape, operands[0]));
       break;
     }
+    case HloOpcode::kBitcastConvert: {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateBitcastConvert(shape, operands[0]));
+      break;
+    }
     case HloOpcode::kCrossReplicaSum: {
-      if (!ParseOperands(&operands, /*expected_size=*/1)) {
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateCrossReplicaSum(shape, operands[0]));
+          HloInstruction::CreateCrossReplicaSum(shape, operands));
       break;
     }
     case HloOpcode::kReshape: {
-      if (!ParseOperands(&operands, /*expected_size=*/1)) {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
@@ -320,7 +483,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kTuple: {
-      if (!ParseOperands(&operands)) {
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       instruction =
@@ -328,114 +491,452 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kWhile: {
-      HloComputation* condition;
-      HloComputation* body;
+      optional<HloComputation*> condition;
+      optional<HloComputation*> body;
+      attrs["condition"] = {/*required=*/true, AttrTy::kHloComputation,
+                            &condition};
+      attrs["body"] = {/*required=*/true, AttrTy::kHloComputation, &body};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
-          !ParseExtraAttribute(&condition,
-                               /*expected_attribute=*/"condition") ||
-          !ParseExtraAttribute(&body, /*expected_attribute=*/"body")) {
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(HloInstruction::CreateWhile(
-          shape, condition, body, /*init=*/operands[0]));
+          shape, *condition, *body, /*init=*/operands[0]));
       break;
     }
     case HloOpcode::kRecv: {
-      int64 channel_id;
+      optional<int64> channel_id;
+      attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
       if (!ParseOperands(&operands, /*expected_size=*/0) ||
-          !ParseExtraAttribute(&channel_id,
-                               /*expected_attribute=*/"channel_id")) {
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateRecv(shape, channel_id));
+          HloInstruction::CreateRecv(shape.tuple_shapes(0), *channel_id));
+      break;
+    }
+    case HloOpcode::kRecvDone: {
+      optional<int64> channel_id;
+      attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      if (channel_id != operands[0]->channel_id()) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateRecvDone(operands[0]));
       break;
     }
     case HloOpcode::kSend: {
-      int64 channel_id;
+      optional<int64> channel_id;
+      attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
-          !ParseExtraAttribute(&channel_id,
-                               /*expected_attribute=*/"channel_id")) {
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateSend(operands[0], channel_id));
+          HloInstruction::CreateSend(operands[0], *channel_id));
+      break;
+    }
+    case HloOpcode::kSendDone: {
+      optional<int64> channel_id;
+      attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      if (channel_id != operands[0]->channel_id()) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateSendDone(operands[0]));
       break;
     }
     case HloOpcode::kGetTupleElement: {
-      int64 index;
+      optional<int64> index;
+      attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
-          !ParseExtraAttribute(&index, /*expected_attribute=*/"index")) {
+          !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateGetTupleElement(shape, operands[0], index));
+          HloInstruction::CreateGetTupleElement(shape, operands[0], *index));
       break;
     }
     case HloOpcode::kCall: {
-      HloComputation* to_apply;
-      if (!ParseOperands(&operands) ||
-          !ParseExtraAttribute(&to_apply,
-                               /*expected_attribute=*/"to_apply")) {
+      optional<HloComputation*> to_apply;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &to_apply};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateCall(shape, operands, *to_apply));
+      break;
+    }
+    case HloOpcode::kReduceWindow: {
+      optional<HloComputation*> reduce_computation;
+      optional<Window> window;
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &reduce_computation};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      if (!window) {
+        window.emplace();
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateReduceWindow(
+          shape, /*operand=*/operands[0], /*init_value=*/operands[1], *window,
+          *reduce_computation));
+      break;
+    }
+    case HloOpcode::kConvolution: {
+      optional<Window> window;
+      optional<ConvolutionDimensionNumbers> dnums;
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
+      attrs["dim_labels"] = {/*required=*/true,
+                             AttrTy::kConvolutionDimensionNumbers, &dnums};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      if (!window) {
+        window.emplace();
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateConvolve(
+          shape, /*lhs=*/operands[0], /*rhs=*/operands[1], *window, *dnums));
+      break;
+    }
+    case HloOpcode::kBroadcast: {
+      optional<std::vector<int64>> broadcast_dimensions;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &broadcast_dimensions};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateBroadcast(
+          shape, operands[0], *broadcast_dimensions));
+      break;
+    }
+    case HloOpcode::kConcatenate: {
+      optional<std::vector<int64>> dimensions;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &dimensions};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
+          dimensions->size() != 1) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateConcatenate(
+          shape, operands, dimensions->at(0)));
+      break;
+    }
+    case HloOpcode::kMap: {
+      optional<HloComputation*> to_apply;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &to_apply};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateCall(shape, operands, to_apply));
+          HloInstruction::CreateMap(shape, operands, *to_apply));
       break;
     }
-    case HloOpcode::kBroadcast:
+    case HloOpcode::kReduce: {
+      optional<HloComputation*> reduce_computation;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &reduce_computation};
+      optional<std::vector<int64>> dimensions_to_reduce;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &dimensions_to_reduce};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateReduce(
+          shape, /*operand=*/operands[0], /*init_value=*/operands[1],
+          *dimensions_to_reduce, *reduce_computation));
+      break;
+    }
+    case HloOpcode::kReverse: {
+      optional<std::vector<int64>> dimensions;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &dimensions};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateReverse(shape, operands[0], *dimensions));
+      break;
+    }
+    case HloOpcode::kSelectAndScatter: {
+      optional<HloComputation*> select;
+      attrs["select"] = {/*required=*/true, AttrTy::kHloComputation, &select};
+      optional<HloComputation*> scatter;
+      attrs["scatter"] = {/*required=*/true, AttrTy::kHloComputation, &scatter};
+      optional<Window> window;
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
+      if (!ParseOperands(&operands, /*expected_size=*/3) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      if (!window) {
+        window.emplace();
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateSelectAndScatter(
+              shape, /*operand=*/operands[0], *select, *window,
+              /*source=*/operands[1], /*init_value=*/operands[2], *scatter));
+      break;
+    }
+    case HloOpcode::kSlice: {
+      optional<SliceRanges> slice_ranges;
+      attrs["slice"] = {/*required=*/true, AttrTy::kSliceRanges, &slice_ranges};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateSlice(
+          shape, operands[0], slice_ranges->starts, slice_ranges->limits,
+          slice_ranges->strides));
+      break;
+    }
+    case HloOpcode::kDynamicSlice: {
+      optional<std::vector<int64>> dynamic_slice_sizes;
+      attrs["dynamic_slice_sizes"] = {
+          /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice(
+          shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
+          *dynamic_slice_sizes));
+      break;
+    }
+    case HloOpcode::kDynamicUpdateSlice: {
+      if (!ParseOperands(&operands, /*expected_size=*/3) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+              shape, /*operand=*/operands[0], /*update=*/operands[1],
+              /*start_indices=*/operands[2]));
+      break;
+    }
+    case HloOpcode::kTranspose: {
+      optional<std::vector<int64>> dimensions;
+      attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
+                             &dimensions};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateTranspose(shape, operands[0], *dimensions));
+      break;
+    }
+    case HloOpcode::kBatchNormTraining: {
+      optional<float> epsilon;
+      attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
+      optional<int64> feature_index;
+      attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
+                                &feature_index};
+      if (!ParseOperands(&operands, /*expected_size=*/3) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateBatchNormTraining(
+              shape, /*operand=*/operands[0], /*scale=*/operands[1],
+              /*offset=*/operands[2], *epsilon, *feature_index));
+      break;
+    }
+    case HloOpcode::kBatchNormInference: {
+      optional<float> epsilon;
+      attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
+      optional<int64> feature_index;
+      attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
+                                &feature_index};
+      if (!ParseOperands(&operands, /*expected_size=*/5) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateBatchNormInference(
+              shape, /*operand=*/operands[0], /*scale=*/operands[1],
+              /*offset=*/operands[2], /*mean=*/operands[3],
+              /*variance=*/operands[4], *epsilon, *feature_index));
+      break;
+    }
+    case HloOpcode::kBatchNormGrad: {
+      optional<float> epsilon;
+      attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
+      optional<int64> feature_index;
+      attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
+                                &feature_index};
+      if (!ParseOperands(&operands, /*expected_size=*/5) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateBatchNormGrad(
+          shape, /*operand=*/operands[0], /*scale=*/operands[1],
+          /*mean=*/operands[2], /*variance=*/operands[3],
+          /*grad_output=*/operands[4], *epsilon, *feature_index));
+      break;
+    }
+    case HloOpcode::kPad: {
+      optional<PaddingConfig> padding;
+      attrs["padding"] = {/*required=*/true, AttrTy::kPaddingConfig, &padding};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreatePad(
+          shape, operands[0], /*padding_value=*/operands[1], *padding));
+      break;
+    }
+    case HloOpcode::kFusion: {
+      optional<HloComputation*> fusion_computation;
+      attrs["calls"] = {/*required=*/true, AttrTy::kHloComputation,
+                        &fusion_computation};
+      optional<HloInstruction::FusionKind> fusion_kind;
+      attrs["kind"] = {/*required=*/true, AttrTy::kFusionKind, &fusion_kind};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateFusion(
+          shape, *fusion_kind, operands, *fusion_computation));
+      break;
+    }
+    case HloOpcode::kInfeed: {
+      optional<string> config;
+      attrs["infeed_config"] = {/*required=*/false, AttrTy::kString, &config};
+      if (!ParseOperands(&operands, /*expected_size=*/0) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateInfeed(shape, config ? *config : ""));
+      break;
+    }
+    case HloOpcode::kOutfeed: {
+      optional<string> config;
+      attrs["outfeed_config"] = {/*required=*/false, AttrTy::kString, &config};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateOutfeed(
+          shape, operands[0], config ? *config : ""));
+      break;
+    }
+    case HloOpcode::kRng: {
+      optional<RandomDistribution> distribution;
+      attrs["distribution"] = {/*required=*/true, AttrTy::kDistribution,
+                               &distribution};
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateRng(shape, *distribution, operands));
+      break;
+    }
+    case HloOpcode::kReducePrecision: {
+      optional<int64> exponent_bits;
+      optional<int64> mantissa_bits;
+      attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
+                                &exponent_bits};
+      attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
+                                &mantissa_bits};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateReducePrecision(
+              shape, operands[0], static_cast<int>(*exponent_bits),
+              static_cast<int>(*mantissa_bits)));
+      break;
+    }
+    case HloOpcode::kConditional:
     case HloOpcode::kCustomCall:
-    case HloOpcode::kConcatenate:
-    case HloOpcode::kReducePrecision:
-    case HloOpcode::kConvolution:
-    case HloOpcode::kMap:
-    case HloOpcode::kPad:
-    case HloOpcode::kReduce:
-    case HloOpcode::kReduceWindow:
-    case HloOpcode::kSelectAndScatter:
-    case HloOpcode::kReverse:
-    case HloOpcode::kRng:
-    case HloOpcode::kSlice:
-    case HloOpcode::kDynamicSlice:
-    case HloOpcode::kDynamicUpdateSlice:
-    case HloOpcode::kTranspose:
-    case HloOpcode::kFusion:
-    case HloOpcode::kBatchNormTraining:
-    case HloOpcode::kBatchNormInference:
-    case HloOpcode::kInfeed:
-    case HloOpcode::kOutfeed:
-    case HloOpcode::kBatchNormGrad:
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
   }
-  // Parse "sharding=".
-  if (lexer_.GetKind() == TokKind::kComma) {
-    if (!ParseSharding(instruction)) {
-      return false;
+
+  // Add common attrs (sharding, control predecessors) to the instruction, if
+  // they were seen.
+  if (sharding) {
+    instruction->set_sharding(
+        HloSharding::FromProto(sharding.value()).ValueOrDie());
+  }
+  if (predecessors) {
+    for (auto* pre : *predecessors) {
+      Status status = pre->AddControlDependencyTo(instruction);
+      if (!status.ok()) {
+        return Error(name_loc, StrCat("error adding control dependency for: ",
+                                      name, " status: ", status.ToString()));
+      }
     }
   }
+  if (metadata) {
+    instruction->set_metadata(*metadata);
+  }
+  return AddInstruction(name, instruction, name_loc);
+}  // NOLINT(readability/fn_size)
 
-  return AddInstruction(name, instruction);
-}
-
-// ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape? ('devices=' ('['
-// dims ']')* device_list)? '}' dims ::= int_list device_list ::= int_list
-bool HloParser::ParseSharding(HloInstruction* instruction) {
-  if (!ParseToken(TokKind::kComma,
-                  "expects ',' in front of an extra attribute")) {
+// ::= '{' (single_sharding | tuple_sharding) '}'
+//
+// tuple_sharding ::= single_sharding* (',' single_sharding)*
+bool HloParser::ParseSharding(OpSharding* sharding) {
+  // A single sharding starts with '{' and is not followed by '{'.
+  // A tuple sharding starts with '{' and is followed by '{', or is '{''}' for
+  // an empty tuple.
+  if (!ParseToken(TokKind::kLbrace,
+                  "expected '{' to start sharding attribute")) {
     return false;
   }
-  string attribute_name;
-  if (!ParseAttributeName(&attribute_name) || attribute_name != "sharding") {
-    return TokenError("expects attribute name: sharding");
+
+  if (lexer_.GetKind() != TokKind::kLbrace &&
+      lexer_.GetKind() != TokKind::kRbrace) {
+    return ParseSingleSharding(sharding, /*lbrace_pre_lexed=*/true);
   }
 
-  if (!ParseToken(TokKind::kLbrace,
+  // Tuple sharding.
+  // Allow empty tuple shardings.
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    do {
+      if (!ParseSingleSharding(sharding->add_tuple_shardings(),
+                               /*lbrace_pre_lexed=*/false)) {
+        return false;
+      }
+    } while (EatIfPresent(TokKind::kComma));
+  }
+  sharding->set_type(OpSharding::Type::OpSharding_Type_TUPLE);
+
+  return ParseToken(TokKind::kRbrace, "expected '}' to end sharding attribute");
+}
+
+//  ::= '{' 'replicated'? 'maximal'? ('device=' int)? shape?
+//          ('devices=' ('[' dims ']')* device_list)? '}'
+// dims ::= int_list device_list ::= int_list
+bool HloParser::ParseSingleSharding(OpSharding* sharding,
+                                    bool lbrace_pre_lexed) {
+  if (!lbrace_pre_lexed &&
+      !ParseToken(TokKind::kLbrace,
                   "expected '{' to start sharding attribute")) {
     return false;
   }
 
+  LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
   std::vector<int64> devices;
@@ -501,83 +1002,370 @@ bool HloParser::ParseSharding(HloInstruction* instruction) {
     }
   }
 
-  OpSharding sharding;
   if (replicated) {
     if (!devices.empty()) {
-      return TokenError(
-          "replicated shardings should not have any devices assigned");
+      return Error(loc,
+                   "replicated shardings should not have any devices assigned");
     }
     if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError(
-          "replicated shardings should not have any tile shape set");
+      return Error(loc,
+                   "replicated shardings should not have any tile shape set");
     }
-    sharding.set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
+    sharding->set_type(OpSharding::Type::OpSharding_Type_REPLICATED);
   } else if (maximal) {
     if (devices.size() != 1) {
-      return TokenError(
-          "maximal shardings should have exactly one device assigned");
+      return Error(loc,
+                   "maximal shardings should have exactly one device assigned");
     }
     if (!ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError("maximal shardings should not have any tile shape set");
+      return Error(loc, "maximal shardings should not have any tile shape set");
     }
-    sharding.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
-    sharding.add_tile_assignment_devices(devices[0]);
+    sharding->set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
+    sharding->add_tile_assignment_devices(devices[0]);
   } else {
     if (devices.size() <= 1) {
-      return TokenError(
-          "non-maximal shardings must have more than one device assigned");
+      return Error(
+          loc, "non-maximal shardings must have more than one device assigned");
     }
     if (ShapeUtil::Equal(tile_shape, Shape())) {
-      return TokenError("non-maximal shardings should have a tile shape set");
+      return Error(loc, "non-maximal shardings should have a tile shape set");
     }
     if (tile_assignment_dimensions.empty()) {
-      return TokenError(
+      return Error(
+          loc,
           "non-maximal shardings must have a tile assignment list including "
           "dimensions");
     }
-    sharding.set_type(OpSharding::Type::OpSharding_Type_OTHER);
-    *sharding.mutable_tile_shape() = tile_shape;
+    sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
+    *sharding->mutable_tile_shape() = tile_shape;
     for (int64 dim : tile_assignment_dimensions) {
-      sharding.add_tile_assignment_dimensions(dim);
+      sharding->add_tile_assignment_dimensions(dim);
     }
     for (int64 device : devices) {
-      sharding.add_tile_assignment_devices(device);
+      sharding->add_tile_assignment_devices(device);
     }
   }
 
-  instruction->set_sharding(HloSharding::FromProto(sharding).ValueOrDie());
   lexer_.Lex();
   return true;
 }
 
-bool HloParser::ParseLiteral(std::unique_ptr<Literal>* literal,
-                             const Shape& shape) {
+// '{' name+ '}'
+bool HloParser::ParseInstructionNames(
+    std::vector<HloInstruction*>* instructions) {
+  if (!ParseToken(TokKind::kLbrace,
+                  "expects '{' at the beginning of instruction name list")) {
+    return false;
+  }
+  LocTy loc = lexer_.GetLoc();
+  do {
+    string name;
+    if (!ParseName(&name)) {
+      return Error(loc, "expects a instruction name");
+    }
+    HloInstruction* instr =
+        tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
+    if (!instr) {
+      return TokenError(
+          Printf("instruction '%s' is not defined", name.c_str()));
+    }
+    instructions->push_back(instr);
+  } while (EatIfPresent(TokKind::kComma));
+
+  return ParseToken(TokKind::kRbrace,
+                    "expects '}' at the end of instruction name list");
+}
+
+bool HloParser::SetValueInLiteral(int64 value, int64 linear_index,
+                                  Literal* literal) {
+  const Shape& shape = literal->shape();
   switch (shape.element_type()) {
-    case PRED:
-      bool b;
-      if (!ParseBool(&b)) {
-        return false;
-      }
-      *literal = Literal::CreateR0<bool>(b);
-      return true;
+    case S8:
+      return SetValueInLiteralHelper<int8>(value, linear_index, literal);
+    case S16:
+      return SetValueInLiteralHelper<int16>(value, linear_index, literal);
     case S32:
-      int64 i;
-      if (!ParseInt64(&i)) {
-        return false;
-      }
-      *literal = Literal::CreateR0<int32>(i);
-      return true;
+      return SetValueInLiteralHelper<int32>(value, linear_index, literal);
+    case S64:
+      return SetValueInLiteralHelper<int64>(value, linear_index, literal);
+    case U8:
+      return SetValueInLiteralHelper<uint8>(value, linear_index, literal);
+    case U16:
+      return SetValueInLiteralHelper<uint8>(value, linear_index, literal);
+    case U32:
+      return SetValueInLiteralHelper<uint32>(value, linear_index, literal);
+    case U64:
+      return SetValueInLiteralHelper<uint64>(value, linear_index, literal);
+    default:
+      LOG(FATAL) << "unknown integral primitive type "
+                 << PrimitiveType_Name(shape.element_type());
+  }
+}
+
+bool HloParser::SetValueInLiteral(double value, int64 linear_index,
+                                  Literal* literal) {
+  const Shape& shape = literal->shape();
+  switch (shape.element_type()) {
+    case F16:
+      return SetValueInLiteralHelper<half>(value, linear_index, literal);
+    case BF16:
+      return SetValueInLiteralHelper<bfloat16>(value, linear_index, literal);
     case F32:
-      double d;
-      if (!ParseDecimal(&d)) {
-        return false;
-      }
-      *literal = Literal::CreateR0<float>(d);
-      return true;
+      return SetValueInLiteralHelper<float>(value, linear_index, literal);
+    case F64:
+      return SetValueInLiteralHelper<double>(value, linear_index, literal);
+    default:
+      LOG(FATAL) << "unknown floating point primitive type "
+                 << PrimitiveType_Name(shape.element_type());
+  }
+}
+
+bool HloParser::SetValueInLiteral(bool value, int64 linear_index,
+                                  Literal* literal) {
+  const Shape& shape = literal->shape();
+  switch (shape.element_type()) {
+    case PRED:
+      return SetValueInLiteralHelper<bool>(value, linear_index, literal);
     default:
-      return TokenError(StrCat("unsupported constant in shape: ",
-                               ShapeUtil::HumanString(shape)));
+      LOG(FATAL) << PrimitiveType_Name(shape.element_type())
+                 << " is not PRED type";
+  }
+}
+
+template <typename LiteralNativeT, typename ParsedElemT>
+bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
+                                        Literal* literal) {
+  // Check that linear_index is in range.
+  if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
+    return TokenError(
+        StrCat("trys to set value ", value, " to a literal in shape ",
+               ShapeUtil::HumanString(literal->shape()), " at linear index ",
+               linear_index, ", but the index is out of range"));
+  }
+
+  if (std::isnan(value) ||
+      (std::numeric_limits<ParsedElemT>::has_infinity &&
+       (std::numeric_limits<ParsedElemT>::infinity() == value ||
+        -std::numeric_limits<ParsedElemT>::infinity() == value))) {
+    // Skip range checking for non-finite value.
+  } else if (literal->shape().element_type() == F16 ||
+             literal->shape().element_type() == BF16) {
+    if (value > kF16max || value < -kF16max) {
+      return TokenError(StrCat(
+          "value ", value, " is out of range for literal's primitive type ",
+          PrimitiveType_Name(literal->shape().element_type())));
+    }
+  } else if (value > static_cast<ParsedElemT>(
+                         std::numeric_limits<LiteralNativeT>::max()) ||
+             value < static_cast<ParsedElemT>(
+                         std::numeric_limits<LiteralNativeT>::lowest())) {
+    // Value is out of range for LiteralNativeT.
+    return TokenError(StrCat(
+        "value ", value, " is out of range for literal's primitive type ",
+        PrimitiveType_Name(literal->shape().element_type())));
   }
+
+  literal->GetMutableArraySlice<LiteralNativeT>().at(linear_index) =
+      static_cast<LiteralNativeT>(value);
+  return true;
+}
+
+bool HloParser::EatShapeAndCheckCompatible(const Shape& shape) {
+  Shape new_shape;
+  if (!ParseShape(&new_shape)) {
+    return TokenError(StrCat("expects shape ", ShapeUtil::HumanString(shape)));
+  }
+  if (!ShapeUtil::Compatible(shape, new_shape)) {
+    return TokenError(StrCat(
+        "expects shape ", ShapeUtil::HumanString(shape),
+        ", but sees a different shape: ", ShapeUtil::HumanString(new_shape)));
+  }
+  return true;
+}
+
+// literal
+//  ::= tuple
+//  ::= non_tuple
+bool HloParser::ParseLiteral(std::unique_ptr<Literal>* literal,
+                             const Shape& shape) {
+  return ShapeUtil::IsTuple(shape) ? ParseTupleLiteral(literal, shape)
+                                   : ParseNonTupleLiteral(literal, shape);
+}
+
+// tuple
+//  ::= shape '(' literal_list ')'
+// literal_list
+//  ::= /*empty*/
+//  ::= literal (',' literal)*
+bool HloParser::ParseTupleLiteral(std::unique_ptr<Literal>* literal,
+                                  const Shape& shape) {
+  if (!EatShapeAndCheckCompatible(shape)) {
+    return TokenError(StrCat("expects tuple constant in shape ",
+                             ShapeUtil::HumanString(shape)));
+  }
+  if (!ParseToken(TokKind::kLparen, "expects '(' in front of tuple elements")) {
+    return false;
+  }
+  std::vector<std::unique_ptr<Literal>> elements(
+      ShapeUtil::TupleElementCount(shape));
+
+  if (lexer_.GetKind() == TokKind::kRparen) {
+    // empty
+  } else {
+    // literal, (',' literal)*
+    for (int i = 0; i < elements.size(); i++) {
+      if (i > 0) {
+        ParseToken(TokKind::kComma, "exepcts ',' to separate tuple elements");
+      }
+      if (!ParseLiteral(&elements[i],
+                        ShapeUtil::GetTupleElementShape(shape, i))) {
+        return TokenError(StrCat("expects the ", i, "th element"));
+      }
+    }
+  }
+  *literal = Literal::MakeTupleOwned(std::move(elements));
+  return ParseToken(TokKind::kRparen,
+                    StrCat("expects ')' at the end of the tuple with ",
+                           ShapeUtil::TupleElementCount(shape), "elements"));
+}
+
+// non_tuple
+//   ::= rank01
+//   ::= rank2345
+// rank2345 ::= shape nested_array
+bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
+                                     const Shape& shape) {
+  const int64 rank = ShapeUtil::Rank(shape);
+  if (rank > 1 && !EatShapeAndCheckCompatible(shape)) {
+    return false;
+  }
+
+  // Create a literal with the given shape in default layout.
+  *literal = Literal::CreateFromDimensions(shape.element_type(),
+                                           AsInt64Slice(shape.dimensions()));
+  int64 nest_level = 0;
+  int64 linear_index = 0;
+  // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for
+  // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}},
+  // when we are parsing the 2nd '{' (right before '1'), we are seeing a
+  // sub-array of the dimension 0, so elems_seen_per_dim[0]++. When we are at
+  // the first '}' (right after '3'), it means the sub-array ends, and the
+  // sub-array is supposed to contain exactly 3 elements, so check if
+  // elems_seen_per_dim[1] is 3.
+  std::vector<int64> elems_seen_per_dim(rank);
+  auto get_index_str = [&elems_seen_per_dim](int dim) -> string {
+    std::vector<int64> elems_seen_until_dim(elems_seen_per_dim.begin(),
+                                            elems_seen_per_dim.begin() + dim);
+    return StrCat("[",
+                  tensorflow::str_util::Join(
+                      elems_seen_until_dim, ",",
+                      [](string* out, const int64& num_elems) {
+                        tensorflow::strings::StrAppend(out, num_elems - 1);
+                      }),
+                  "]");
+  };
+  do {
+    switch (lexer_.GetKind()) {
+      default:
+        return TokenError("unexpected token type in a literal");
+      case TokKind::kLbrace: {
+        nest_level++;
+        if (nest_level > rank) {
+          return TokenError(Printf(
+              "expects nested array in rank %lld, but sees larger", rank));
+        }
+        if (nest_level > 1) {
+          elems_seen_per_dim[nest_level - 2]++;
+          if (elems_seen_per_dim[nest_level - 2] >
+              shape.dimensions(nest_level - 2)) {
+            return TokenError(Printf(
+                "expects %lld elements in the %sth element, but sees more",
+                shape.dimensions(nest_level - 2),
+                get_index_str(nest_level - 2).c_str()));
+          }
+        }
+        lexer_.Lex();
+        break;
+      }
+      case TokKind::kRbrace: {
+        nest_level--;
+        if (elems_seen_per_dim[nest_level] != shape.dimensions(nest_level)) {
+          return TokenError(Printf(
+              "expects %lld elements in the %sth element, but sees %lld",
+              shape.dimensions(nest_level), get_index_str(nest_level).c_str(),
+              elems_seen_per_dim[nest_level]));
+        }
+        elems_seen_per_dim[nest_level] = 0;
+        lexer_.Lex();
+        break;
+      }
+      case TokKind::kComma:
+      case TokKind::kComment:
+        // Skip.
+        lexer_.Lex();
+        break;
+      case TokKind::kw_true:
+      case TokKind::kw_false:
+      case TokKind::kInt:
+      case TokKind::kDecimal:
+      case TokKind::kw_nan:
+      case TokKind::kw_inf:
+      case TokKind::kNegInf: {
+        if (rank > 0) {
+          if (nest_level != rank) {
+            return TokenError(
+                Printf("expects nested array in rank %lld, but sees %lld", rank,
+                       nest_level));
+          }
+          elems_seen_per_dim[rank - 1]++;
+          if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) {
+            return TokenError(
+                Printf("expects %lld elements on the minor-most dimension, but "
+                       "sees more",
+                       shape.dimensions(rank - 1)));
+          }
+        }
+        if (lexer_.GetKind() == TokKind::kw_true ||
+            lexer_.GetKind() == TokKind::kw_false) {
+          // TODO(congliu): bool type literals with rank >= 1 are actually
+          // printed in a compact form instead of "true" or "false". Fix that.
+          if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true,
+                                 linear_index++, literal->get())) {
+            return false;
+          }
+          lexer_.Lex();
+        } else if (primitive_util::IsIntegralType(shape.element_type())) {
+          LocTy loc = lexer_.GetLoc();
+          int64 value;
+          if (!ParseInt64(&value)) {
+            return Error(loc, StrCat("expects integer for primitive type: ",
+                                     PrimitiveType_Name(shape.element_type())));
+          }
+          if (!SetValueInLiteral(value, linear_index++, literal->get())) {
+            return false;
+          }
+        } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
+          LocTy loc = lexer_.GetLoc();
+          double value;
+          if (!ParseDouble(&value)) {
+            return Error(
+                loc, StrCat("expect floating point value for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
+          }
+          if (!SetValueInLiteral(value, linear_index++, literal->get())) {
+            return false;
+          }
+        } else {
+          return TokenError(StrCat("unsupported premitive type ",
+                                   PrimitiveType_Name(shape.element_type())));
+        }
+        break;
+      }
+    }  // end of switch
+  } while (nest_level > 0);
+
+  *literal = (*literal)->Relayout(shape.layout());
+  return true;
 }
 
 // operands ::= '(' operands1 ')'
@@ -594,6 +1382,7 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
     // empty
   } else {
     do {
+      LocTy loc = lexer_.GetLoc();
       Shape shape;
       string name;
       if (!ParseShape(&shape) || !ParseName(&name)) {
@@ -602,7 +1391,7 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
       HloInstruction* instruction =
           tensorflow::gtl::FindPtrOrNull(instruction_pool_, name);
       if (!instruction) {
-        return TokenError(StrCat("instruction does not exist: ", name));
+        return Error(loc, StrCat("instruction does not exist: ", name));
       }
       operands->push_back(instruction);
     } while (EatIfPresent(TokKind::kComma));
@@ -612,52 +1401,513 @@ bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands) {
 
 bool HloParser::ParseOperands(std::vector<HloInstruction*>* operands,
                               const int expected_size) {
+  LocTy loc = lexer_.GetLoc();
   if (!ParseOperands(operands)) {
     return false;
   }
   if (expected_size != operands->size()) {
-    return TokenError(StrCat("expects ", expected_size, " operands, but has ",
+    return Error(loc, StrCat("expects ", expected_size, " operands, but has ",
                              operands->size(), " operands"));
   }
   return true;
 }
 
-// extra_attribute ::= ',' attribute_name value
-template <typename T>
-bool HloParser::ParseExtraAttribute(T* value,
-                                    const string& expected_attribute) {
-  if (!ParseToken(TokKind::kComma,
-                  "expects ',' in front of an extra attribute")) {
+// sub_attributes ::= '{' (','? attribute)* '}'
+bool HloParser::ParseSubAttributes(
+    const std::unordered_map<string, AttrConfig>& attrs) {
+  LocTy loc = lexer_.GetLoc();
+  if (!ParseToken(TokKind::kLbrace, "expects '{' to start sub attributes")) {
     return false;
   }
-  string attribute_name;
-  if (!ParseAttributeName(&attribute_name) &&
-      attribute_name != expected_attribute) {
-    return TokenError(StrCat("expects attribute name: ", expected_attribute));
+  std::unordered_set<string> seen_attrs;
+  if (lexer_.GetKind() == TokKind::kRbrace) {
+    // empty
+  } else {
+    do {
+      EatIfPresent(TokKind::kComma);
+      if (!ParseAttributeHelper(attrs, &seen_attrs)) {
+        return false;
+      }
+    } while (lexer_.GetKind() != TokKind::kRbrace);
   }
-  if (!ParseAttributeValue(value)) {
-    return TokenError(
-        StrCat("expects value for attribute: ", expected_attribute));
+  // Check that all required attrs were seen.
+  for (const auto& attr_it : attrs) {
+    if (attr_it.second.required &&
+        seen_attrs.find(attr_it.first) == seen_attrs.end()) {
+      return Error(loc, Printf("sub-attribute %s is expected but not seen",
+                               attr_it.first.c_str()));
+    }
+  }
+  return ParseToken(TokKind::kRbrace, "expects '}' to end sub attributes");
+}
+
+// attributes ::= (',' attribute)*
+bool HloParser::ParseAttributes(
+    const std::unordered_map<string, AttrConfig>& attrs) {
+  LocTy loc = lexer_.GetLoc();
+  std::unordered_set<string> seen_attrs;
+  while (EatIfPresent(TokKind::kComma)) {
+    if (!ParseAttributeHelper(attrs, &seen_attrs)) {
+      return false;
+    }
+  }
+  // Check that all required attrs were seen.
+  for (const auto& attr_it : attrs) {
+    if (attr_it.second.required &&
+        seen_attrs.find(attr_it.first) == seen_attrs.end()) {
+      return Error(loc, Printf("attribute %s is expected but not seen",
+                               attr_it.first.c_str()));
+    }
+  }
+  return true;
+}
+
+bool HloParser::ParseAttributeHelper(
+    const std::unordered_map<string, AttrConfig>& attrs,
+    std::unordered_set<string>* seen_attrs) {
+  LocTy loc = lexer_.GetLoc();
+  string name;
+  if (!ParseAttributeName(&name)) {
+    return Error(loc, "error parsing attributes");
+  }
+  VLOG(1) << "Parsing attribute " << name;
+  if (!seen_attrs->insert(name).second) {
+    return Error(loc, Printf("attribute %s already exists", name.c_str()));
+  }
+  auto attr_it = attrs.find(name);
+  if (attr_it == attrs.end()) {
+    return Error(loc, Printf("unexpected attribute %s", name.c_str()));
+  }
+  AttrTy attr_type = attr_it->second.attr_type;
+  void* attr_out_ptr = attr_it->second.result;
+  bool success = [&] {
+    LocTy attr_loc = lexer_.GetLoc();
+    switch (attr_type) {
+      case AttrTy::kInt64: {
+        int64 result;
+        if (!ParseInt64(&result)) {
+          return false;
+        }
+        static_cast<optional<int64>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kInt32: {
+        int64 result;
+        if (!ParseInt64(&result)) {
+          return false;
+        }
+        if (result != static_cast<int32>(result)) {
+          return Error(attr_loc, "value out of range for int32");
+        }
+        static_cast<optional<int32>*>(attr_out_ptr)
+            ->emplace(static_cast<int32>(result));
+        return true;
+      }
+      case AttrTy::kFloat: {
+        double result;
+        if (!ParseDouble(&result)) {
+          return false;
+        }
+        if (result > std::numeric_limits<float>::max() ||
+            result < std::numeric_limits<float>::lowest()) {
+          return Error(attr_loc, "value out of range for float");
+        }
+        static_cast<optional<float>*>(attr_out_ptr)
+            ->emplace(static_cast<float>(result));
+        return true;
+      }
+      case AttrTy::kHloComputation: {
+        HloComputation* result;
+        if (!ParseComputationName(&result)) {
+          return false;
+        }
+        static_cast<optional<HloComputation*>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kWindow: {
+        Window result;
+        if (!ParseWindow(&result)) {
+          return false;
+        }
+        static_cast<optional<Window>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kConvolutionDimensionNumbers: {
+        ConvolutionDimensionNumbers result;
+        if (!ParseConvolutionDimensionNumbers(&result)) {
+          return false;
+        }
+        static_cast<optional<ConvolutionDimensionNumbers>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
+      case AttrTy::kSharding: {
+        OpSharding sharding;
+        if (!ParseSharding(&sharding)) {
+          return false;
+        }
+        static_cast<optional<OpSharding>*>(attr_out_ptr)->emplace(sharding);
+        return true;
+      }
+      case AttrTy::kInstructionList: {
+        std::vector<HloInstruction*> result;
+        if (!ParseInstructionNames(&result)) {
+          return false;
+        }
+        static_cast<optional<std::vector<HloInstruction*>>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
+      case AttrTy::kFusionKind: {
+        HloInstruction::FusionKind result;
+        if (!ParseFusionKind(&result)) {
+          return false;
+        }
+        static_cast<optional<HloInstruction::FusionKind>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
+      case AttrTy::kBracedInt64List: {
+        std::vector<int64> result;
+        if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
+                            &result)) {
+          return false;
+        }
+        static_cast<optional<std::vector<int64>>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
+      case AttrTy::kSliceRanges: {
+        SliceRanges result;
+        if (!ParseSliceRanges(&result)) {
+          return false;
+        }
+        static_cast<optional<SliceRanges>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kPaddingConfig: {
+        PaddingConfig result;
+        if (!ParsePaddingConfig(&result)) {
+          return false;
+        }
+        static_cast<optional<PaddingConfig>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kString: {
+        string result;
+        if (!ParseString(&result)) {
+          return false;
+        }
+        static_cast<optional<string>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kMetadata: {
+        OpMetadata result;
+        if (!ParseMetadata(&result)) {
+          return false;
+        }
+        static_cast<optional<OpMetadata>*>(attr_out_ptr)->emplace(result);
+        return true;
+      }
+      case AttrTy::kDistribution: {
+        RandomDistribution result;
+        if (!ParseRandomDistribution(&result)) {
+          return false;
+        }
+        static_cast<optional<RandomDistribution>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
+    }
+  }();
+  if (!success) {
+    return Error(loc, Printf("error parsing attribute %s", name.c_str()));
   }
   return true;
 }
 
-template <>
-bool HloParser::ParseAttributeValue<HloComputation*>(HloComputation** value) {
+bool HloParser::ParseComputationName(HloComputation** value) {
   string name;
+  LocTy loc = lexer_.GetLoc();
   if (!ParseName(&name)) {
-    return TokenError("expects computation name");
+    return Error(loc, "expects computation name");
   }
   *value = tensorflow::gtl::FindPtrOrNull(computation_pool_, name);
   if (*value == nullptr) {
-    return TokenError(StrCat("computation does not exist: ", name));
+    return Error(loc, StrCat("computation does not exist: ", name));
   }
   return true;
 }
 
-template <>
-bool HloParser::ParseAttributeValue<int64>(int64* value) {
-  return ParseInt64(value);
+// ::= '{' size stride? pad? lhs_dilate? rhs_dilate? '}'
+// The subattributes can appear in any order. 'size=' is required, others are
+// optional.
+bool HloParser::ParseWindow(Window* window) {
+  LocTy loc = lexer_.GetLoc();
+  if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) {
+    return false;
+  }
+
+  std::vector<int64> size;
+  std::vector<int64> stride;
+  std::vector<std::vector<int64>> pad;
+  std::vector<int64> lhs_dilate;
+  std::vector<int64> rhs_dilate;
+  std::vector<int64> rhs_reversal;
+  while (lexer_.GetKind() != TokKind::kRbrace) {
+    LocTy attr_loc = lexer_.GetLoc();
+    string field_name;
+    if (!ParseAttributeName(&field_name)) {
+      return Error(attr_loc, "expects sub-attributes in window");
+    }
+    bool ok = [&] {
+      if (field_name == "size") {
+        return ParseDxD("size", &size);
+      }
+      if (field_name == "stride") {
+        return ParseDxD("stride", &stride);
+      }
+      if (field_name == "lhs_dilate") {
+        return ParseDxD("lhs_dilate", &lhs_dilate);
+      }
+      if (field_name == "rhs_dilate") {
+        return ParseDxD("rls_dilate", &rhs_dilate);
+      }
+      if (field_name == "pad") {
+        return ParseWindowPad(&pad);
+      }
+      if (field_name == "rhs_reversal") {
+        return ParseDxD("rhs_reversal", &rhs_reversal);
+      }
+      return Error(loc, StrCat("unexpected attribute name: ", field_name));
+    }();
+    if (!ok) {
+      return false;
+    }
+  }
+
+  if (size.empty()) {
+    return Error(loc,
+                 "sub-attribute 'size=' is required in the window attribute");
+  }
+  if (!stride.empty() && stride.size() != size.size()) {
+    return Error(loc, "expects 'stride=' has the same size as 'size='");
+  }
+  if (!lhs_dilate.empty() && lhs_dilate.size() != size.size()) {
+    return Error(loc, "expects 'lhs_dilate=' has the same size as 'size='");
+  }
+  if (!rhs_dilate.empty() && rhs_dilate.size() != size.size()) {
+    return Error(loc, "expects 'rhs_dilate=' has the same size as 'size='");
+  }
+  if (!pad.empty() && pad.size() != size.size()) {
+    return Error(loc, "expects 'pad=' has the same size as 'size='");
+  }
+
+  for (int i = 0; i < size.size(); i++) {
+    window->add_dimensions()->set_size(size[i]);
+    if (!pad.empty()) {
+      window->mutable_dimensions(i)->set_padding_low(pad[i][0]);
+      window->mutable_dimensions(i)->set_padding_high(pad[i][1]);
+    }
+    // If some field is not present, it has the default value.
+    window->mutable_dimensions(i)->set_stride(stride.empty() ? 1 : stride[i]);
+    window->mutable_dimensions(i)->set_base_dilation(
+        lhs_dilate.empty() ? 1 : lhs_dilate[i]);
+    window->mutable_dimensions(i)->set_window_dilation(
+        rhs_dilate.empty() ? 1 : rhs_dilate[i]);
+    window->mutable_dimensions(i)->set_window_reversal(
+        rhs_reversal.empty() ? false : (rhs_reversal[i] == 1));
+  }
+  return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute");
+}
+
+// This is the inverse of HloInstruction::ConvolutionDimensionNumbersToString.
+// The string looks like "dim_labels=0bf_0io->0bf".
+bool HloParser::ParseConvolutionDimensionNumbers(
+    ConvolutionDimensionNumbers* dnums) {
+  if (lexer_.GetKind() != TokKind::kDimLabels) {
+    return TokenError("expects dim labels pattern, e.g., 'bf0_0io->0bf'");
+  }
+  string str = lexer_.GetStrVal();
+
+  // The str is expected to have 3 items, lhs, rhs, out, and it must looks like
+  // lhs_rhs->out, that is, the first separator is "_" and the second is "->".
+  // So we replace the "->" with "_" and then split on "_".
+  str = tensorflow::str_util::StringReplace(str, /*oldsub=*/"->",
+                                            /*newsub=*/"_",
+                                            /*replace_all=*/false);
+  std::vector<string> lhs_rhs_out = Split(str, "_");
+  if (lhs_rhs_out.size() != 3) {
+    LOG(FATAL) << "expects 3 items: lhs, rhs, and output dims, but sees "
+               << str;
+  }
+
+  const int64 rank = lhs_rhs_out[0].length();
+  if (rank != lhs_rhs_out[1].length() || rank != lhs_rhs_out[2].length()) {
+    return TokenError(
+        "convolution lhs, rhs, and output must have the same rank");
+  }
+  if (rank < 2) {
+    return TokenError("convolution rank must >=2");
+  }
+
+  auto is_unique = [](string str) -> bool {
+    std::sort(str.begin(), str.end());
+    return std::unique(str.begin(), str.end()) == str.end();
+  };
+
+  // lhs
+  {
+    const string& lhs = lhs_rhs_out[0];
+    if (!is_unique(lhs)) {
+      return TokenError(
+          StrCat("expects unique lhs dimension numbers, but sees ", lhs));
+    }
+    for (int i = 0; i < rank - 2; i++) {
+      dnums->add_input_spatial_dimensions(-1);
+    }
+    for (int i = 0; i < rank; i++) {
+      char c = lhs[i];
+      if (c == 'b') {
+        dnums->set_input_batch_dimension(i);
+      } else if (c == 'f') {
+        dnums->set_input_feature_dimension(i);
+      } else if (c < '0' + rank && c >= '0') {
+        dnums->set_input_spatial_dimensions(c - '0', i);
+      } else {
+        return TokenError(
+            Printf("expects [0-%lldbf] in lhs dimension numbers", rank - 1));
+      }
+    }
+  }
+  // rhs
+  {
+    const string& rhs = lhs_rhs_out[1];
+    if (!is_unique(rhs)) {
+      return TokenError(
+          StrCat("expects unique rhs dimension numbers, but sees ", rhs));
+    }
+    for (int i = 0; i < rank - 2; i++) {
+      dnums->add_kernel_spatial_dimensions(-1);
+    }
+    for (int i = 0; i < rank; i++) {
+      char c = rhs[i];
+      if (c == 'i') {
+        dnums->set_kernel_input_feature_dimension(i);
+      } else if (c == 'o') {
+        dnums->set_kernel_output_feature_dimension(i);
+      } else if (c < '0' + rank && c >= '0') {
+        dnums->set_kernel_spatial_dimensions(c - '0', i);
+      } else {
+        return TokenError(
+            Printf("expects [0-%lldio] in rhs dimension numbers", rank - 1));
+      }
+    }
+  }
+  // output
+  {
+    const string& out = lhs_rhs_out[2];
+    if (!is_unique(out)) {
+      return TokenError(
+          StrCat("expects unique output dimension numbers, but sees ", out));
+    }
+    for (int i = 0; i < rank - 2; i++) {
+      dnums->add_output_spatial_dimensions(-1);
+    }
+    for (int i = 0; i < rank; i++) {
+      char c = out[i];
+      if (c == 'b') {
+        dnums->set_output_batch_dimension(i);
+      } else if (c == 'f') {
+        dnums->set_output_feature_dimension(i);
+      } else if (c < '0' + rank && c >= '0') {
+        dnums->set_output_spatial_dimensions(c - '0', i);
+      } else {
+        return TokenError(
+            Printf("expects [0-%lldbf] in output dimension numbers", rank - 1));
+      }
+    }
+  }
+
+  lexer_.Lex();
+  return true;
+}
+
+// ::= '{' ranges '}'
+//   ::= /*empty*/
+//   ::= range (',' range)*
+// range ::= '[' start ':' limit (':' stride)? ']'
+//
+// The slice ranges are printed as:
+//
+//  {[dim0_start:dim0_limit:dim0stride], [dim1_start:dim1_limit], ...}
+//
+// This function extracts the starts, limits, and strides as 3 vectors to the
+// result. If stride is not present, stride is 1. For example, if the slice
+// ranges is printed as:
+//
+//  {[2:3:4], [5:6:7], [8:9]}
+//
+// The the parsed result will be:
+//
+//  {/*starts=*/{2, 5, 8}, /*limits=*/{3, 6, 9}, /*strides=*/{4, 7, 1}}
+//
+bool HloParser::ParseSliceRanges(SliceRanges* result) {
+  if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) {
+    return false;
+  }
+  std::vector<std::vector<int64>> ranges;
+  if (lexer_.GetKind() == TokKind::kRbrace) {
+    // empty
+    return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
+  }
+  do {
+    LocTy loc = lexer_.GetLoc();
+    ranges.emplace_back();
+    if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kColon,
+                        &ranges.back())) {
+      return false;
+    }
+    const auto& range = ranges.back();
+    if (range.size() != 2 && range.size() != 3) {
+      return Error(loc, Printf("expects [start:limit:step] or [start:limit], "
+                               "but sees %ld elements.",
+                               range.size()));
+    }
+  } while (EatIfPresent(TokKind::kComma));
+
+  for (const auto& range : ranges) {
+    result->starts.push_back(range[0]);
+    result->limits.push_back(range[1]);
+    result->strides.push_back(range.size() == 3 ? range[2] : 1);
+  }
+  return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
+}
+
+// int64list ::= start int64_elements end
+// int64_elements
+//   ::= /*empty*/
+//   ::= int64_val (delim int64_val)*
+bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
+                               const TokKind delim,
+                               std::vector<int64>* result) {
+  if (!ParseToken(start, StrCat("expects an int64 list starting with ",
+                                TokKindToString(start)))) {
+    return false;
+  }
+  if (lexer_.GetKind() == end) {
+    // empty
+  } else {
+    do {
+      int64 i;
+      if (!ParseInt64(&i)) {
+        return false;
+      }
+      result->push_back(i);
+    } while (EatIfPresent(delim));
+  }
+  return ParseToken(
+      end, StrCat("expects an int64 list to end with ", TokKindToString(end)));
 }
 
 // param_list ::= '(' param_list1 ')'
@@ -735,12 +1985,171 @@ bool HloParser::ParseAttributeName(string* result) {
   return true;
 }
 
+bool HloParser::ParseString(string* result) {
+  VLOG(1) << "ParseString";
+  if (lexer_.GetKind() != TokKind::kString) {
+    return TokenError("expects string");
+  }
+  *result = lexer_.GetStrVal();
+  lexer_.Lex();
+  return true;
+}
+
+bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
+  LocTy loc = lexer_.GetLoc();
+  if (!result->empty()) {
+    return Error(loc,
+                 Printf("sub-attribute '%s=' already exists", name.c_str()));
+  }
+  // 1D
+  if (lexer_.GetKind() == TokKind::kInt) {
+    int64 number;
+    if (!ParseInt64(&number)) {
+      return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str()));
+    }
+    result->push_back(number);
+    return true;
+  }
+  // 2D or higher.
+  if (lexer_.GetKind() == TokKind::kDxD) {
+    string str = lexer_.GetStrVal();
+    if (!SplitAndParseAsInts(str, 'x', result)) {
+      return Error(loc,
+                   Printf("expects sub-attribute '%s=ixj...'", name.c_str()));
+    }
+    lexer_.Lex();
+    return true;
+  }
+  return TokenError("expects token type kInt or kDxD");
+}
+
+bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
+  LocTy loc = lexer_.GetLoc();
+  if (!pad->empty()) {
+    return Error(loc, "sub-attribute 'pad=' already exists");
+  }
+  if (lexer_.GetKind() != TokKind::kPad) {
+    return TokenError("expects window pad pattern, e.g., '0_0x3_3'");
+  }
+  string str = lexer_.GetStrVal();
+  std::vector<string> padding_str = Split(str, 'x');
+  for (int i = 0; i < padding_str.size(); i++) {
+    std::vector<int64> low_high;
+    if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) ||
+        low_high.size() != 2) {
+      return Error(loc,
+                   "expects padding_low and padding_high separated by '_'");
+    }
+    pad->push_back(low_high);
+  }
+  lexer_.Lex();
+  return true;
+}
+
+// This is the inverse xla::ToString(PaddingConfig). The padding config string
+// looks like "0_0_0x3_3_1". The string is first separated by 'x', each
+// substring represents one PaddingConfigDimension. The substring is 3 (or 2)
+// numbers joined by '_'.
+bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
+  if (lexer_.GetKind() != TokKind::kPad) {
+    return TokenError("expects padding config, e.g., '0_0_0x3_3_1'");
+  }
+  LocTy loc = lexer_.GetLoc();
+  string str = lexer_.GetStrVal();
+  std::vector<string> padding_str = Split(str, 'x');
+  for (const auto& padding_dim_str : padding_str) {
+    std::vector<int64> padding_dim;
+    if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) ||
+        (padding_dim.size() != 2 && padding_dim.size() != 3)) {
+      return Error(loc,
+                   "expects padding config pattern like 'low_high_interior' or "
+                   "'low_high'");
+    }
+    auto* dim = padding->add_dimensions();
+    dim->set_edge_padding_low(padding_dim[0]);
+    dim->set_edge_padding_high(padding_dim[1]);
+    dim->set_interior_padding(padding_dim.size() == 3 ? padding_dim[2] : 0);
+  }
+  lexer_.Lex();
+  return true;
+}
+
+// '{' metadata_string '}'
+bool HloParser::ParseMetadata(OpMetadata* metadata) {
+  std::unordered_map<string, AttrConfig> attrs;
+  optional<string> op_type;
+  optional<string> op_name;
+  optional<string> source_file;
+  optional<int32> source_line;
+  attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type};
+  attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name};
+  attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file};
+  attrs["source_line"] = {/*required=*/false, AttrTy::kInt32, &source_line};
+  if (!ParseSubAttributes(attrs)) {
+    return false;
+  }
+  if (op_type) {
+    metadata->set_op_type(*op_type);
+  }
+  if (op_name) {
+    metadata->set_op_name(*op_name);
+  }
+  if (source_file) {
+    metadata->set_source_file(*source_file);
+  }
+  if (source_line) {
+    metadata->set_source_line(*source_line);
+  }
+  return true;
+}
+
 bool HloParser::ParseOpcode(HloOpcode* result) {
   VLOG(1) << "ParseOpcode";
-  if (lexer_.GetKind() != TokKind::kOpcode) {
+  if (lexer_.GetKind() != TokKind::kIdent) {
     return TokenError("expects opcode");
   }
-  *result = lexer_.GetOpcodeVal();
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToHloOpcode(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects opcode but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
+bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) {
+  VLOG(1) << "ParseFusionKind";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects fusion kind");
+  }
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToFusionKind(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects fusion kind but sees: %s, error: %s", val.c_str(),
+               status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
+  lexer_.Lex();
+  return true;
+}
+
+bool HloParser::ParseRandomDistribution(RandomDistribution* result) {
+  VLOG(1) << "ParseRandomDistribution";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects random distribution");
+  }
+  string val = lexer_.GetStrVal();
+  auto status_or_result = StringToRandomDistribution(val);
+  if (!status_or_result.ok()) {
+    return TokenError(
+        Printf("expects random distribution but sees: %s, error: %s",
+               val.c_str(), status_or_result.status().error_message().c_str()));
+  }
+  *result = status_or_result.ValueOrDie();
   lexer_.Lex();
   return true;
 }
@@ -755,7 +2164,7 @@ bool HloParser::ParseInt64(int64* result) {
   return true;
 }
 
-bool HloParser::ParseDecimal(double* result) {
+bool HloParser::ParseDouble(double* result) {
   switch (lexer_.GetKind()) {
     case TokKind::kDecimal:
       *result = lexer_.GetDecimalVal();
@@ -763,6 +2172,15 @@ bool HloParser::ParseDecimal(double* result) {
     case TokKind::kInt:
       *result = static_cast<double>(lexer_.GetInt64Val());
       break;
+    case TokKind::kw_nan:
+      *result = std::numeric_limits<double>::quiet_NaN();
+      break;
+    case TokKind::kw_inf:
+      *result = std::numeric_limits<double>::infinity();
+      break;
+    case TokKind::kNegInf:
+      *result = -std::numeric_limits<double>::infinity();
+      break;
     default:
       return TokenError("expects decimal or integer");
   }
@@ -781,6 +2199,7 @@ bool HloParser::ParseBool(bool* result) {
 }
 
 bool HloParser::ParseToken(TokKind kind, const string& msg) {
+  VLOG(1) << "ParseToken " << TokKindToString(kind) << " " << msg;
   if (lexer_.GetKind() != kind) {
     return TokenError(msg);
   }
@@ -796,33 +2215,39 @@ bool HloParser::EatIfPresent(TokKind kind) {
   return true;
 }
 
-bool HloParser::AddInstruction(const string& name,
-                               HloInstruction* instruction) {
+bool HloParser::AddInstruction(const string& name, HloInstruction* instruction,
+                               LocTy name_loc) {
   auto result = instruction_pool_.insert({name, instruction});
   if (!result.second) {
-    return TokenError(StrCat("instruction already exists: ", name));
+    return Error(name_loc, StrCat("instruction already exists: ", name));
   }
   return true;
 }
 
-bool HloParser::AddComputation(const string& name,
-                               HloComputation* computation) {
+bool HloParser::AddComputation(const string& name, HloComputation* computation,
+                               LocTy name_loc) {
   auto result = computation_pool_.insert({name, computation});
   if (!result.second) {
-    return TokenError(StrCat("computation already exists: ", name));
+    return Error(name_loc, StrCat("computation already exists: ", name));
   }
   return true;
 }
 
 }  // namespace
 
-StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str) {
-  HloParser parser(str);
+StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str,
+                                           const HloModuleConfig& config) {
+  HloParser parser(str, config);
   if (!parser.Run()) {
-    return InvalidArgument("Syntax error: %s", parser.GetError().c_str());
+    return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str());
   }
   return parser.ConsumeHloModule();
 }
 
+StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str) {
+  HloModuleConfig config;
+  return Parse(str, config);
+}
+
 }  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.h b/tensorflow/compiler/xla/tools/parser/hlo_parser.h
index 9aaf18ef20d769cd9ac6f0e48bc92f62292ba31a..2f97a2b9b19d0cdb64a2869913da62c55e14c1d5 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.h
@@ -28,7 +28,12 @@ namespace xla {
 namespace tools {
 
 // The api of the hlo parser. Given a string in the HloModule::ToString()
-// format, returns the parsed HloModule.
+// format, parses the string and creates a HloModule with the given config.
+StatusOr<std::unique_ptr<HloModule>> Parse(tensorflow::StringPiece str,
+                                           const HloModuleConfig& config);
+
+// The api of the hlo parser. Given a string in the HloModule::ToString()
+// format, parses the string and creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> Parse(tensorflow::StringPiece str);
 
 }  // namespace tools
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 5be4d6a2cb1b09355e09e25a40e8dc88bae01650..7eebc5dc93ffff1f5895e69023a4d81ab7279241 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -17,12 +17,16 @@ limitations under the License.
 
 #include <string>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace tools {
 namespace {
 
+using tensorflow::StringPiece;
+using tensorflow::strings::StrCat;
+
 struct TestData {
   string test_name;
   string module_string;
@@ -32,6 +36,10 @@ string TestDataToString(const ::testing::TestParamInfo<TestData>& data) {
   return data.param.test_name;
 }
 
+// For each string below, we check that:
+//  - we parse it to an HloModule successfully, and
+//  - the stringification of the resulting HloModule is equal to our original
+//    string.
 std::vector<TestData> CreateTestCases() {
   // clang-format off
   return std::vector<TestData>({
@@ -40,10 +48,11 @@ std::vector<TestData> CreateTestCases() {
 "AxpyParam",
 R"(HloModule axpy_module:
 
-ENTRY %axpy.v5 (alpha: f32[2,4], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
-  %alpha = f32[2,4]{1,0} parameter(0)
+ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
+  %alpha = f32[] parameter(0)
+  %broadcast = f32[2,4]{1,0} broadcast(f32[] %alpha), dimensions={}
   %x = f32[2,4]{1,0} parameter(1)
-  %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %alpha, f32[2,4]{1,0} %x)
+  %multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, f32[2,4]{1,0} %x)
   %y = f32[2,4]{1,0} parameter(2)
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
@@ -56,7 +65,7 @@ ENTRY %axpy.v5 (alpha: f32[2,4], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
 R"(HloModule constant_pred_module:
 
 ENTRY %constant_pred () -> pred[] {
-  ROOT %constant = pred[] constant(true)
+  ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}
 }
 
 )"
@@ -74,12 +83,80 @@ ENTRY %constant_s32 () -> s32[] {
 },
 // f32 constant, but the value is not a decimal
 {
-"ConstantF32", R"(HloModule ConstantF32_module:
+"ConstantF32",
+R"(HloModule ConstantF32_module:
 
 ENTRY %ConstantF32.v4 () -> f32[] {
   ROOT %constant = f32[] constant(42)
 }
 
+)"
+},
+// f32 constant, rank 1 empty array.
+{
+"ConstantF32R1Empty",
+R"(HloModule ConstantF32Empty_module:
+
+ENTRY %ConstantF32Empty.v4 () -> f32[0] {
+  ROOT %constant = f32[0]{0} constant({})
+}
+
+)"
+},
+// f32 constant, rank 4 empty array.
+{
+"ConstantF32R4Empty",
+R"(HloModule ConstantF32R4Empty_module:
+
+ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
+  ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } })
+}
+
+)"
+},
+// constant 4D
+{
+"Constant4D",
+R"(HloModule Small_3x2x1x1_module:
+
+ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
+  ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
+}
+
+)"
+},
+// non-finite constants: nan, inf, -inf
+{
+"ConstantNonFinite",
+R"(HloModule IsFiniteR1F32s_module:
+
+ENTRY %IsFiniteR1F32s.v2 () -> pred[6] {
+  %constant = f32[6]{0} constant({nan, 7, nan, -1, inf, -inf})
+  ROOT %is-finite = pred[6]{0} is-finite(f32[6]{0} %constant)
+}
+
+)"
+},
+// constant f16
+{
+"ConstantF16",
+R"(HloModule ConstantF16_module:
+
+ENTRY %ConstantF16.v4 () -> f16[] {
+  ROOT %constant = f16[] constant(500)
+}
+
+)"
+},
+// bf16
+{
+"BF16",
+R"(HloModule BF16:
+
+ENTRY %BF16.v4 () -> bf16[] {
+  ROOT %constant = bf16[] constant(500)
+}
+
 )"
 },
 // constant + constant
@@ -92,6 +169,17 @@ ENTRY %add_constants () -> f32[] {
   ROOT %add = f32[] add(f32[] %constant, f32[] %constant)
 }
 
+)"
+},
+// tuple constant
+{
+"TupleConstant",
+R"(HloModule TupleConstant_module:
+
+ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
+  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
+}
+
 )"
 },
 // v1 > v2 ? v1 : v2
@@ -103,7 +191,7 @@ ENTRY %SelectR1F32WithCmpR1F32sFromParamsSmall.v4 (v1: f32[4], v2: f32[4]) -> f3
   %v1 = f32[4]{0} parameter(0), sharding={maximal device=1}
   %v2 = f32[4]{0} parameter(1), sharding={maximal device=1}
   %greater-than = pred[4]{0} greater-than(f32[4]{0} %v1, f32[4]{0} %v2), sharding={replicated}
-  ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2)
+  ROOT %select = f32[4]{0} select(pred[4]{0} %greater-than, f32[4]{0} %v1, f32[4]{0} %v2), sharding={}
 }
 
 )"
@@ -131,6 +219,19 @@ ENTRY %TupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f
   ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3)
 }
 
+)"
+},
+{
+"ShardedTupleCreate",
+R"(HloModule ShardedTupleCreate_module:
+
+ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f32[3], f32[2,3]) {
+  %v1 = f32[] parameter(0)
+  %v2 = f32[3]{0} parameter(1)
+  %v3 = f32[2,3]{1,0} parameter(2)
+  ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3), sharding={{replicated}, {maximal device=0}, {replicated}}
+}
+
 )"
 },
 // int32 result = 0;
@@ -164,9 +265,11 @@ ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
 R"(HloModule TwoSendRecvBothWayRecvFist_module:
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %recv = f32[] recv(), channel_id=15, sharding={maximal device=1}
-  ROOT %constant = f32[] constant(2.1), sharding={maximal device=0}
-  %send = () send(f32[] %constant), channel_id=16, sharding={maximal device=0}
+  %recv = (f32[], u32[]) recv(), channel_id=15, sharding={maximal device=1}
+  ROOT %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15, sharding={maximal device=1}
+  %constant = f32[] constant(2.1), sharding={maximal device=0}
+  %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
+  %send-done = () send-done((f32[], u32[]) %send), channel_id=16, sharding={maximal device=0}
 }
 
 )"
@@ -176,11 +279,11 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 "GetTupleElement",
 R"(HloModule GetTupleElement_module:
 
-ENTRY %GetTupleElement.v4 () -> s32[] {
-  %constant = f32[] constant(1.23)
-  %constant.1 = s32[] constant(4)
-  %tuple = (f32[], s32[]) tuple(f32[] %constant, s32[] %constant.1)
-  ROOT %get-tuple-element = s32[] get-tuple-element((f32[], s32[]) %tuple), index=1, sharding={maximal device=0}
+ENTRY %GetTupleElement.v4 () -> s32[2,3] {
+  %constant = f32[3]{0} constant({1, 2, 3})
+  %constant.1 = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 4, 5, 6 } })
+  %tuple = (f32[3]{0}, s32[2,3]{1,0}) tuple(f32[3]{0} %constant, s32[2,3]{1,0} %constant.1)
+  ROOT %get-tuple-element = s32[2,3]{1,0} get-tuple-element((f32[3]{0}, s32[2,3]{1,0}) %tuple), index=1, sharding={maximal device=0}
 }
 
 )"
@@ -199,6 +302,407 @@ ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] {
   ROOT %call = f32[] call(f32[] %constant), to_apply=%Identity.v1
 }
 
+)"
+},
+// reduce window
+{
+"ReduceWindow",
+R"(HloModule R4UnitWindow_module:
+
+%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
+}
+
+ENTRY %R4UnitWindow.v3 (operand: f32[13,12,8,15]) -> f32[13,3,8,15] {
+  %operand = f32[13,12,8,15]{0,3,2,1} parameter(0)
+  %constant = f32[] constant(0)
+  ROOT %reduce-window = f32[13,3,8,15]{0,3,2,1} reduce-window(f32[13,12,8,15]{0,3,2,1} %operand, f32[] %constant), window={size=1x1x7x1 stride=1x4x1x1 pad=0_0x0_0x3_3x0_0}, to_apply=%add_F32.v3
+}
+
+)"
+},
+// reduce window on scalar
+{
+"ReduceWindowScalar",
+R"(HloModule reduce_window_scalar:
+
+%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
+}
+
+ENTRY %R4UnitWindowScalar () -> f32[] {
+  %constant = f32[] constant(42)
+  %constant.1 = f32[] constant(1)
+  ROOT %reduce-window = f32[] reduce-window(f32[] %constant, f32[] %constant.1), to_apply=%add_F32.v3
+}
+
+)"
+},
+// convolution
+{
+"Convolution",
+R"(HloModule Convolve1D1Window_0_module:
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
+  %input = f32[1,2,1]{2,1,0} parameter(0)
+  %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
+  %filter = f32[1,1,1]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f
+}
+
+)"
+},
+// convolution rank 2
+{
+"ConvolutionR2",
+R"(HloModule ConvolveR2_module:
+
+ENTRY %ConvolveR2.v3 (input: f32[1,2], filter: f32[1,1]) -> f32[1,2] {
+  %input = f32[1,2]{1,0} parameter(0)
+  %filter = f32[1,1]{1,0} parameter(1)
+  ROOT %convolution = f32[1,2]{0,1} convolution(f32[1,2]{1,0} %input, f32[1,1]{1,0} %filter), dim_labels=bf_io->bf
+}
+
+)"
+},
+// convolution backward
+{
+"ConvolutionBackward",
+R"(HloModule ConvolveBackward_module:
+
+ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f32[128,14,14,512] {
+  %input = f32[128,7,7,512]{0,3,2,1} parameter(0)
+  %filter = f32[3,3,512,512]{3,2,1,0} parameter(1)
+  ROOT %convolution-base-dilated = f32[128,14,14,512]{0,3,2,1} convolution(f32[128,7,7,512]{0,3,2,1} %input, f32[3,3,512,512]{3,2,1,0} %filter), window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, dim_labels=b01f_01oi->b01f
+}
+
+)"
+},
+// reverse(constant)
+{
+"Reverse4D",
+R"(HloModule Reverse4DFloatArrayOnDim01_module:
+
+ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
+  %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
+  ROOT %reverse = f32[4,3,2,1]{0,1,2,3} reverse(f32[4,3,2,1]{0,1,2,3} %constant), dimensions={0,1}
+}
+
+)"
+},
+// concat
+{
+"Concat",
+R"(HloModule Concat2x3With2x5_module:
+
+ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
+  %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } })
+  %constant.1 = f32[2,5]{1,0} constant(f32[2,5] { { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } })
+  ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1}
+}
+
+)"
+},
+// map
+{
+"Map",
+R"(HloModule MapBinaryAdder_module:
+
+%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
+}
+
+ENTRY %MapBinaryAdder.v3 (param0: f32[4], param1: f32[4]) -> f32[4] {
+  %param0 = f32[4]{0} parameter(0)
+  %param1 = f32[4]{0} parameter(1)
+  ROOT %map = f32[4]{0} map(f32[4]{0} %param0, f32[4]{0} %param1), to_apply=%add_F32.v3
+}
+
+)"
+},
+// reduce
+{
+"Reduce",
+R"(HloModule ReduceR3ToR2_module:
+
+%add_F32.v3 (lhs: f32[], rhs: f32[]) -> f32[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs, f32[] %rhs)
+}
+
+ENTRY %ReduceR3ToR2.v3 (input: f32[8,16,256]) -> f32[8,16] {
+  %input = f32[8,16,256]{2,1,0} parameter(0)
+  %constant = f32[] constant(0)
+  ROOT %reduce = f32[8,16]{1,0} reduce(f32[8,16,256]{2,1,0} %input, f32[] %constant), dimensions={2}, to_apply=%add_F32.v3
+}
+
+)"
+},
+// select and scatter
+{
+"SelectAndScatter",
+R"(HloModule R4F32OverlapSmall_module:
+
+%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs)
+}
+
+%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] {
+  %lhs.1 = f32[] parameter(0)
+  %rhs.1 = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1)
+}
+
+ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
+  %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
+  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
+  %constant.2 = f32[] constant(0)
+  ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3
+}
+
+)"
+},
+// select and scatter on scalar
+{
+"SelectAndScatterScalar",
+R"(HloModule select_and_scatter_scalar:
+
+%ge_F32.v3 (lhs: f32[], rhs: f32[]) -> pred[] {
+  %lhs = f32[] parameter(0)
+  %rhs = f32[] parameter(1)
+  ROOT %greater-than-or-equal-to = pred[] greater-than-or-equal-to(f32[] %lhs, f32[] %rhs)
+}
+
+%add_F32.v3 (lhs.1: f32[], rhs.1: f32[]) -> f32[] {
+  %lhs.1 = f32[] parameter(0)
+  %rhs.1 = f32[] parameter(1)
+  ROOT %add = f32[] add(f32[] %lhs.1, f32[] %rhs.1)
+}
+
+ENTRY %SelectAndScatterScalar () -> f32[] {
+  %constant = f32[] constant(42)
+  %constant.1 = f32[] constant(1)
+  %constant.2 = f32[] constant(2)
+  ROOT %select-and-scatter = f32[] select-and-scatter(f32[] %constant, f32[] %constant.1, f32[] %constant.2), select=%ge_F32.v3, scatter=%add_F32.v3
+}
+
+)"
+},
+// slice
+{
+"Slice",
+R"(HloModule slice_module:
+
+ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
+  %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
+  ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3:1], [0:3:1], [0:4:2], [0:4:1]}
+}
+
+)"
+},
+// slice, no stride
+{
+"SliceNoStride",
+R"(HloModule Slice3x3x3_To_1x3x3_F32_module:
+
+ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
+  %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
+  ROOT %slice = f32[1,3,3]{2,1,0} slice(f32[3,3,3]{2,1,0} %constant), slice={[0:1], [0:3], [0:3]}
+}
+
+)"
+},
+// slice R0
+{
+"SliceR0",
+R"(HloModule SliceR0_module:
+
+ENTRY %SliceR0.v2 () -> s32[] {
+  %constant = s32[] constant(1)
+  ROOT %slice = s32[] slice(s32[] %constant), slice={}
+}
+
+)"
+},
+// transpose
+{
+"Transpose",
+R"(HloModule Transpose_module:
+
+ENTRY %Transpose.v2 () -> s32[1,2,3] {
+  %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } })
+  ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2}
+}
+
+)"
+},
+// Dynamic slice
+{
+"DynamicSlice",
+R"(HloModule DynamicSlice_module:
+
+ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -> s32[2,2,258] {
+  %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
+  %constant = s32[1]{0} constant({0})
+  %start_index = s32[1]{0} parameter(1)
+  %concatenate = s32[3]{0} concatenate(s32[1]{0} %constant, s32[1]{0} %constant, s32[1]{0} %start_index), dimensions={0}
+  ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258}
+}
+
+)"
+},
+// Dynamic update slice
+{
+"DynamicUpdateSlice",
+R"(HloModule DynamicUpdateSlice_module:
+
+ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
+  %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+  %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+  %start_indices = s32[4]{0} parameter(2)
+  ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices)
+}
+
+)"
+},
+// batch norm training
+{
+"BatchNormTraining",
+R"(HloModule BasicTraining_module:
+
+ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
+  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
+  %constant.1 = f32[2]{0} constant({2, 3})
+  %constant.2 = f32[2]{0} constant({1, 2})
+  ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
+}
+
+)"
+},
+// batch norm inference
+{
+"BatchNormInference",
+R"(HloModule BatchNormInference_module:
+
+ENTRY %BatchNormInference.v6 (input: f32[2,2,2,2], offset: f32[2], scale: f32[2], mean: f32[2], variance: f32[2]) -> f32[2,2,2,2] {
+  %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
+  %offset = f32[2]{0} parameter(1)
+  %scale = f32[2]{0} parameter(2)
+  %mean = f32[2]{0} parameter(3)
+  %variance = f32[2]{0} parameter(4)
+  ROOT %batch-norm-inference = f32[2,2,2,2]{3,2,1,0} batch-norm-inference(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %offset, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance), epsilon=0.001, feature_index=0
+}
+
+)"
+},
+// batch norm grad
+{
+"BatchNormGrad",
+R"(HloModule BatchNormGrad_module:
+
+ENTRY %BatchNormGrad.v4 (input: f32[2,2,2,2], scale: f32[2], mean: f32[2], variance: f32[2], grad_output: f32[2,2,2,2]) -> (f32[2,2,2,2], f32[2], f32[2]) {
+  %input = f32[2,2,2,2]{3,2,1,0} parameter(0)
+  %scale = f32[2]{0} parameter(1)
+  %mean = f32[2]{0} parameter(2)
+  %variance = f32[2]{0} parameter(3)
+  %grad_output = f32[2,2,2,2]{3,2,1,0} parameter(4)
+  ROOT %batch-norm-grad = (f32[2,2,2,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-grad(f32[2,2,2,2]{3,2,1,0} %input, f32[2]{0} %scale, f32[2]{0} %mean, f32[2]{0} %variance, f32[2,2,2,2]{3,2,1,0} %grad_output), epsilon=0.001, feature_index=0
+}
+
+)"
+},
+// pad
+{
+"Pad",
+R"(HloModule Pad1DS3Array_module:
+
+ENTRY %Pad1DS3Array.v3 () -> f32[8] {
+  %constant = f32[3]{0} constant({1, 2, 3})
+  %constant.1 = f32[] constant(0.1)
+  ROOT %pad = f32[8]{0} pad(f32[3]{0} %constant, f32[] %constant.1), padding=3_1
+}
+
+)"
+},
+// pad has interior
+{
+"PadHasInterior",
+R"(HloModule PadHasInterior_module:
+
+ENTRY %PadHasInterior.v3 (input: f32[1,25,7,7]) -> f32[1,25,17,11] {
+  %input = f32[1,25,7,7]{3,2,1,0} parameter(0)
+  %constant = f32[] constant(-5.123)
+  ROOT %pad = f32[1,25,17,11]{3,2,1,0} pad(f32[1,25,7,7]{3,2,1,0} %input, f32[] %constant), padding=0_0_0x0_0_0x2_2_1x2_2_0
+}
+
+)"
+},
+// fusion
+{
+"Fusion",
+R"(HloModule fusion_module:
+
+%fused_computation (constant.param_0: f32[3,2,1,1], constant.1.param_1: f32[2]) -> f32[3,2,1,1] {
+  %constant.param_0 = f32[3,2,1,1]{3,2,1,0} parameter(0)
+  %constant.1.param_1 = f32[2]{0} parameter(1)
+  %broadcast = f32[3,2,1,1]{3,2,1,0} broadcast(f32[2]{0} %constant.1.param_1), dimensions={1}
+  ROOT %subtract = f32[3,2,1,1]{3,2,1,0} subtract(f32[3,2,1,1]{3,2,1,0} %constant.param_0, f32[3,2,1,1]{3,2,1,0} %broadcast)
+}
+
+ENTRY %fusion.v3 () -> f32[3,2,1,1] {
+  %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
+  %constant.1 = f32[2]{0} constant({3.14, 4.25})
+  ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation
+}
+
+)"
+},
+// infeed/outfeed
+{
+"InfeedOutfeed",
+R"(HloModule outfeed_module:
+
+ENTRY %InfeedToOutfeed () -> (u32[3], pred[]) {
+  %infeed = (u32[3]{0}, pred[]) infeed()
+  %outfeed = () outfeed((u32[3]{0}, pred[]) %infeed)
+  ROOT %infeed.1 = (u32[3]{0}, pred[]) infeed()
+  %outfeed.1 = () outfeed((u32[3]{0}, pred[]) %infeed.1)
+}
+
+)"
+},
+// Rng
+{
+"Rng",
+R"(HloModule rng_module:
+
+ENTRY %Rng () -> f32[8] {
+  %constant = f32[] constant(0)
+  %constant.1 = f32[] constant(1)
+  ROOT %rng = f32[8]{0} rng(f32[] %constant, f32[] %constant.1), distribution=rng_uniform
+}
+
+)"
+},
+// Reduce precision
+{
+"ReducePrevison",
+R"(HloModule reduce_precision:
+
+ENTRY %ReducePrecision () -> f32[1] {
+  %constant = f32[1]{0} constant({3.14159})
+  ROOT %reduce-precision = f32[1]{0} reduce-precision(f32[1]{0} %constant), exponent_bits=8, mantissa_bits=10
+}
+
 )"
 }
   });
@@ -208,15 +712,24 @@ ENTRY %CallR0F32IdentityScalar.v2 () -> f32[] {
 class HloParserTest : public ::testing::Test,
                       public ::testing::WithParamInterface<TestData> {
  protected:
-  void ExpectSuccess() {
+  static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
+    EXPECT_TRUE(StringPiece(s).contains(expected))
+        << "'" << s << "' does not contain '" << expected << "'";
+  }
+
+  // Expects "ToString(Parse(string)) == string", that is, parses the string,
+  // asserts that it succeeded, stringifies the parsed module, and checks that
+  // the it equals the original string.
+  void ExpectEqual() {
     const string& original = GetParam().module_string;
     auto result = Parse(original);
-    TF_EXPECT_OK(result.status());
-    EXPECT_EQ(original, result.ValueOrDie()->ToString());
+    TF_ASSERT_OK(result.status());
+    EXPECT_EQ(original,
+              result.ValueOrDie()->ToString(/*include_large_constants=*/true));
   }
 };
 
-TEST_P(HloParserTest, Run) { ExpectSuccess(); }
+TEST_P(HloParserTest, Run) { ExpectEqual(); }
 
 INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTest,
                         ::testing::ValuesIn(CreateTestCases()),
@@ -301,6 +814,63 @@ ENTRY %SelectScalarS32True.v4 () -> s32[] {
   // but the constant names will not be exactly the same.
 }
 
+TEST_F(HloParserTest, LiteralDimensionsMismatch_1) {
+  const string original = R"(HloModule some_2_module:
+
+ENTRY %some_2 () -> f32[2] {
+  ROOT %constant = f32[2]{0} constant({1,{2}})
+}
+
+)";
+  auto result = Parse(original);
+  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  ExpectHasSubstr(result.status().error_message(),
+                  "expects nested array in rank 1, but sees larger");
+}
+
+TEST_F(HloParserTest, LiteralDimensionsMismatch_2) {
+  const string original = R"(HloModule some_2x3_module:
+
+ENTRY %some_2x3 () -> f32[2,3] {
+  ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6})
+}
+
+)";
+  auto result = Parse(original);
+  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  ExpectHasSubstr(result.status().error_message(),
+                  "expects nested array in rank 2, but sees 1");
+}
+
+TEST_F(HloParserTest, LiteralDimensionsMismatch_3) {
+  const string original = R"(HloModule some_2x3x2_module:
+
+ENTRY %some_2x3x2 () -> f32[2,3,2] {
+  ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
+}
+
+)";
+  auto result = Parse(original);
+  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  ExpectHasSubstr(result.status().error_message(),
+                  "expects 3 elements in the [0]th element");
+}
+
+TEST_F(HloParserTest, ConstantF16Overflow) {
+  const string original =
+      R"(HloModule ConstantF16Overflow_module:
+
+ENTRY %ConstantF16Overflow.v4 () -> f16[] {
+  ROOT %constant = f16[] constant(-65505)
+}
+
+)";
+  auto result = Parse(original);
+  EXPECT_NE(tensorflow::Status::OK(), result.status());
+  ExpectHasSubstr(result.status().error_message(),
+                  "is out of range for literal's primitive type F16");
+}
+
 TEST_F(HloParserTest, ConstantWithExp) {
   const string original = R"(HloModule ConstantWithExp_module:
 
@@ -316,6 +886,130 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
   // printed as "300".
 }
 
+TEST_F(HloParserTest, AttibutesAnyOrder) {
+  const string original = R"(HloModule any_order_module:
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
+  %input = f32[1,2,1]{2,1,0} parameter(0)
+  %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
+  %filter = f32[1,1,1]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, dim_labels=b0f_0io->b0f, window={pad=1_1 size=2}
+}
+
+)";
+  TF_EXPECT_OK(Parse(original).status());
+}
+
+TEST_F(HloParserTest, InvalidDimLabels) {
+  string prefix = R"(HloModule invalid_dim_labels_module:
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
+  %input = f32[1,2,1]{2,1,0} parameter(0)
+  %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
+  %filter = f32[1,1,1]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1} )";
+  string suffix = R"(
+}
+
+)";
+
+  ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=00_01_10", suffix))
+                      .status()
+                      .error_message(),
+                  "expects dim labels pattern");
+
+  ExpectHasSubstr(Parse(StrCat(prefix, ",dim_labels=010_1100->010", suffix))
+                      .status()
+                      .error_message(),
+                  "must have the same rank");
+}
+
+TEST_F(HloParserTest, UnexpectedAttribute) {
+  const string original = R"(HloModule unexpected_attr_module:
+
+ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
+  %recv = (f32[], u32[]) recv(), channel_id=15
+  %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15
+  ROOT %constant = f32[] constant(2.1)
+  %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, calls=%recv
+  %send-done = () send-done((f32[], u32[]) %send), channel_id=16
+}
+
+)";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "unexpected attribute calls");
+}
+
+TEST_F(HloParserTest, MissingAttribute) {
+  const string original = R"(HloModule missing_attr_module:
+
+ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
+  %recv = (f32[], u32[]) recv(), channel_id=15
+  %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15
+  ROOT %constant = f32[] constant(-2.1)
+  %send = (f32[], u32[]) send(f32[] %constant)
+  %send-done = () send-done((f32[], u32[]) %send), channel_id=16
+}
+
+)";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "attribute channel_id is expected but not seen");
+}
+
+TEST_F(HloParserTest, PredecessorUndefined) {
+  const string original = R"(HloModule pre_not_found_module:
+
+ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
+  %recv = (f32[], u32[]) recv(), channel_id=15
+  %recv-done = f32[] recv-done((f32[], u32[]) %recv), channel_id=15
+  ROOT %constant = f32[] constant(2.1)
+  %send = (f32[], u32[]) send(f32[] %constant), channel_id=16, control-predecessors={%done}
+  %send-done = () send-done((f32[], u32[]) %send), channel_id=16
+}
+
+)";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "'done' is not defined");
+}
+
+TEST_F(HloParserTest, SliceAllowOmitStride1) {
+  const string original = R"(HloModule slice_module:
+
+ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
+  %p0 = f32[3,3,4,4]{3,2,1,0} parameter(0)
+  ROOT %slice = f32[3,3,2,4]{3,2,1,0} slice(f32[3,3,4,4]{3,2,1,0} %p0), slice={[0:3], [0:3], [0:4:2], [0:4]}
+}
+
+)";
+  TF_EXPECT_OK(Parse(original).status());
+}
+
+TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) {
+  const string original = R"(HloModule window_pad_module:
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2,1] {
+  %input = f32[1,2,1]{2,1,0} parameter(0)
+  %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input)
+  %filter = f32[1,1,1]{2,1,0} parameter(1)
+  ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), dim_labels=b0f_0io->b0f, window={pad=1_1_0 size=1}
+}
+
+)";
+  ExpectHasSubstr(Parse(original).status().error_message(),
+                  "expects padding_low and padding_high separated by '_'");
+}
+
+TEST_F(HloParserTest, CommaBetweenSubAttributes) {
+  const string original = R"(HloModule test_comma_module:
+
+ENTRY %test_comma.v4 () -> f32[] {
+  ROOT %constant = f32[] constant(-4.2), metadata={source_line=5, op_type="::const"}
+}
+
+)";
+  TF_EXPECT_OK(Parse(original).status());
+}
+
 }  // namespace
 }  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/tools/parser/hlo_token.h
index a40300e2bf0d3279967826be6bf74875f8320f11..7928bee5c2097f353b182095a555c334d7b69c95 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_token.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_token.h
@@ -16,6 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_
 #define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_
 
+#include <string>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/types.h"
+
 namespace xla {
 namespace tools {
 
@@ -36,7 +41,8 @@ enum class TokKind {
   kLparen,
   kRparen,  // (  )
 
-  kArrow,  // ->
+  kArrow,    // ->
+  kComment,  // /*xxx*/
 
   // Keywords
   kw_HloModule,
@@ -46,16 +52,26 @@ enum class TokKind {
   kw_false,
   kw_maximal,
   kw_replicated,
+  kw_nan,
+  kw_inf,
+
+  kNegInf,  // -inf
 
   // Typed tokens.
   kName,           // %foo
   kAttributeName,  // dimensions=
+  kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+  kDxD,            // [0-9]+(x[0-9]+)+
+  kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+  kIdent,          // other identifiers
+  kString,         // "abcd\"\n"
   kShape,          // f32[2,3]{1,0}
-  kOpcode,         // add
   kInt,            // 42
   kDecimal,        // 4.2
 };
 
+string TokKindToString(TokKind kind);
+
 }  // namespace tools
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 89b26b8916b67eeb38852c9e91314187fc8a7d48..a7dc5862057047f7c56faeb211cc0b13992caec7 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -58,18 +59,26 @@ namespace xla {
 namespace tools {
 namespace {
 
+// Command-line opts to this tool.  See main() for descriptions of these
+// fields.
+struct Options {
+  string fake_infeed_shape;
+  bool use_fake_data = false;
+  bool print_result = true;
+  int num_runs = 1;
+};
+
 // Invokes the given computation passing arbitrary data for every (unbound)
 // parameter if use_fake_data, Otherwise use recorded data if available.
 //
 // Similarly, infeeds fake data of shape fake_infeed_shape if it is provided;
 // otherwise, no infeed is performed.
 StatusOr<std::unique_ptr<Literal>> ReplayComputation(
-    const SessionModule& module, tensorflow::StringPiece fake_infeed_shape,
-    bool use_fake_data, Client* client) {
+    const SessionModule& module, Client* client, const Options& opts) {
   TF_ASSIGN_OR_RETURN(Computation computation, client->LoadSnapshot(module));
 
   std::vector<std::unique_ptr<GlobalData>> arguments;
-  if (use_fake_data) {
+  if (opts.use_fake_data) {
     arguments = MakeFakeArgumentsOrDie(computation, client);
   } else {  // use recorded data if available
     for (const auto& proto : module.arguments()) {
@@ -84,12 +93,12 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   // concurrent infeed occur via the fake_infeed_shape.
   tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
 
-  if (!fake_infeed_shape.empty()) {
+  if (!opts.fake_infeed_shape.empty()) {
     pool.emplace(tensorflow::Env::Default(), "infeed",
                  /*num_threads=*/1);
-    pool->Schedule([fake_infeed_shape, client]() {
+    pool->Schedule([opts, client]() {
       StatusOr<Shape> shape_status =
-          ShapeUtil::ParseShapeString(fake_infeed_shape);
+          ShapeUtil::ParseShapeString(opts.fake_infeed_shape);
       TF_CHECK_OK(shape_status.status());
       Shape shape = std::move(shape_status).ValueOrDie();
       StatusOr<std::unique_ptr<Literal>> data_status = MakeFakeLiteral(shape);
@@ -106,11 +115,32 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(
   for (auto& argument : arguments) {
     execute_arguments.push_back(argument.get());
   }
-  return client->ExecuteAndTransfer(computation, execute_arguments);
+
+  // Run the computation num_runs times, and return the result from the last
+  // execution.
+  std::unique_ptr<Literal> result;
+  for (int i = 0; i < opts.num_runs; ++i) {
+    ExecutionProfile profile;
+    if (opts.print_result) {
+      TF_ASSIGN_OR_RETURN(result, client->ExecuteAndTransfer(
+                                      computation, execute_arguments,
+                                      /*execution_options=*/nullptr, &profile));
+    } else {
+      // If we're not printing the result, execute the computation but don't
+      // bother retrieving the result.  This can be a significant speedup.
+      TF_RETURN_IF_ERROR(client
+                             ->Execute(computation, execute_arguments,
+                                       /*execution_options=*/nullptr, &profile)
+                             .status());
+    }
+    LOG(INFO) << "Execution took "
+              << static_cast<double>(profile.compute_time_ns()) / 1e9 << "s";
+  }
+
+  return std::move(result);
 }
 
-int RealMain(tensorflow::gtl::ArraySlice<char*> args,
-             tensorflow::StringPiece fake_infeed_shape, bool use_fake_data) {
+int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
   Client* client = ClientLibrary::LocalClientOrDie();
   tensorflow::Env* env = tensorflow::Env::Default();
   int exit_status = EXIT_SUCCESS;
@@ -118,21 +148,24 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args,
     SessionModule module;
     TF_CHECK_OK(tensorflow::ReadBinaryProto(env, arg, &module));
     StatusOr<std::unique_ptr<Literal>> result_status =
-        ReplayComputation(module, fake_infeed_shape, use_fake_data, client);
+        ReplayComputation(module, client, opts);
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
               result_status.status().ToString().c_str());
       exit_status = EXIT_FAILURE;
       continue;
     }
+
     std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
-    fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(),
-            ShapeUtil::HumanString(result->shape()).c_str(),
-            result->ToString().c_str());
-    if (module.has_result()) {
-      fprintf(stdout, "was %s:%s\n",
-              ShapeUtil::HumanString(module.result().shape()).c_str(),
-              Literal(module.result()).ToString().c_str());
+    if (result != nullptr) {
+      fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(),
+              ShapeUtil::HumanString(result->shape()).c_str(),
+              result->ToString().c_str());
+      if (module.has_result()) {
+        fprintf(stdout, "was %s:%s\n",
+                ShapeUtil::HumanString(module.result().shape()).c_str(),
+                Literal(module.result()).ToString().c_str());
+      }
     }
   }
   return exit_status;
@@ -143,13 +176,15 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args,
 }  // namespace xla
 
 int main(int argc, char** argv) {
-  // Flags
-  xla::string fake_infeed_shape;
-  bool use_fake_data = false;
+  xla::tools::Options opts;
   const std::vector<tensorflow::Flag> flag_list = {
-      tensorflow::Flag("use_fake_data", &use_fake_data,
+      tensorflow::Flag("use_fake_data", &opts.use_fake_data,
                        "Replay computation using fake data"),
-      tensorflow::Flag("fake_infeed_shape", &fake_infeed_shape,
+      tensorflow::Flag("print_result", &opts.print_result,
+                       "Print the result of the computation to stdout"),
+      tensorflow::Flag("num_runs", &opts.num_runs,
+                       "Number of times to run each computation"),
+      tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
@@ -161,5 +196,5 @@ int main(int argc, char** argv) {
 
   tensorflow::gtl::ArraySlice<char*> args(argv, argc);
   args.pop_front();  // Pop off the binary name, argv[0]
-  return xla::tools::RealMain(args, fake_infeed_shape, use_fake_data);
+  return xla::tools::RealMain(args, opts);
 }
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index 3b19ca321cad35aad18f7f498e08fd744ffbc371..9fa4297523bab0748863479be52dff1b7b523a8b 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <complex>
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
 
 #include <Eigen/Core>
@@ -32,6 +33,8 @@ using ::tensorflow::int16;
 using ::tensorflow::int32;
 using ::tensorflow::int64;
 
+using ::tensorflow::bfloat16;
+
 using ::tensorflow::uint8;
 using ::tensorflow::uint16;
 using ::tensorflow::uint32;
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 2624ef0252fd9482a600fe3aec07f7f328a86d69..fe5d29a6b655a89d559eb1214c2b8dd54d34094c 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -42,15 +42,15 @@ Status WithLogBacktrace(const Status& status) {
 
 }  // namespace
 
-ScopedLoggingTimer::ScopedLoggingTimer(const string& label, int32 vlog_level)
-    : label(label), vlog_level(vlog_level) {
-  if (VLOG_IS_ON(vlog_level)) {
+ScopedLoggingTimer::ScopedLoggingTimer(const string& label, bool enabled)
+    : enabled(enabled), label(label) {
+  if (enabled) {
     start_micros = tensorflow::Env::Default()->NowMicros();
   }
 }
 
 ScopedLoggingTimer::~ScopedLoggingTimer() {
-  if (VLOG_IS_ON(vlog_level)) {
+  if (enabled) {
     uint64 end_micros = tensorflow::Env::Default()->NowMicros();
     double secs = (end_micros - start_micros) / 1000000.0;
 
@@ -191,9 +191,9 @@ std::vector<int64> ComposePermutations(tensorflow::gtl::ArraySlice<int64> p1,
   return output;
 }
 
-bool IsIdentityPermutation(tensorflow::gtl::ArraySlice<int64> p) {
-  for (int64 i = 0; i < p.size(); ++i) {
-    if (p[i] != i) {
+bool IsIdentityPermutation(tensorflow::gtl::ArraySlice<int64> permutation) {
+  for (int64 i = 0; i < permutation.size(); ++i) {
+    if (permutation[i] != i) {
       return false;
     }
   }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index f58f57b44396c90a3820835a3d0ecc792aaa7cd0..b722095d1f38bf8a984c3ce9092a65f8e0baa911 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -50,13 +50,43 @@ using DimensionVector = tensorflow::gtl::InlinedVector<int64, kInlineRank>;
 // RAII timer that logs with a given label the wall clock time duration in human
 // readable form. This differs from base's ElapsedTimer primarily in that it
 // spits out the human-readable duration form.
+//
+// By default, the timing traces are only printed at VLOG(1) and above:
+//
+//   XLA_SCOPED_LOGGING_TIMER("fooing bar");  // nop if !VLOG_IS_ON(1).
+//
+// but you can control this via:
+//
+//   XLA_SCOPED_LOGGING_TIMER_LEVEL("fooing bar", 2);  // nop if !VLOG_IS_ON(2)
+//
+#define XLA_SCOPED_LOGGING_TIMER(label) \
+  XLA_SCOPED_LOGGING_TIMER_HELPER(label, 1, __COUNTER__)
+#define XLA_SCOPED_LOGGING_TIMER_LEVEL(label, level) \
+  XLA_SCOPED_LOGGING_TIMER_HELPER(label, level, __COUNTER__)
+
+// Helper for implementing macros above.  Do not use directly.
+//
+// Forces the evaluation of "counter", which we expect is equal to __COUNTER__.
+#define XLA_SCOPED_LOGGING_TIMER_HELPER(label, level, counter) \
+  XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)
+
+// Helper for macros above.  Don't use directly.
+#define XLA_SCOPED_LOGGING_TIMER_HELPER2(label, level, counter)      \
+  ::xla::ScopedLoggingTimer XLA_ScopedLoggingTimerInstance##counter( \
+      label, VLOG_IS_ON(level))
+
+// RAII timer for XLA_SCOPED_LOGGING_TIMER and XLA_SCOPED_LOGGING_TIMER_LEVEL
+// macros above.  Recommended usage is via the macros so you don't have to give
+// the timer a name or worry about calling VLOG_IS_ON yourself.
 struct ScopedLoggingTimer {
-  explicit ScopedLoggingTimer(const string& label, int32 vlog_level = 1);
+  // The timer does nothing if enabled is false.  This lets you pass in your
+  // file's VLOG_IS_ON value.
+  ScopedLoggingTimer(const string& label, bool enabled);
   ~ScopedLoggingTimer();
 
-  uint64 start_micros;
+  bool enabled;
   string label;
-  int32 vlog_level;
+  uint64 start_micros;
 };
 
 // Given a vector<T>, returns a MutableArraySlice<char> that points at its
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 23161873a0b722dfbea34507fefc38a7a02c023d..293f0781a203d092a7996d5548de1dbf5bf32e4c 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -26,8 +26,8 @@ namespace xla {
 namespace window_util {
 
 /* static */ string ToString(const WindowDimension& dim) {
-  using tensorflow::strings::StrCat;
   using tensorflow::strings::StrAppend;
+  using tensorflow::strings::StrCat;
   string str = StrCat("(size=", dim.size());
   if (dim.stride() != 1) {
     StrAppend(&str, ",stride=", dim.stride());
@@ -44,27 +44,30 @@ namespace window_util {
   if (dim.window_dilation() != 1) {
     StrAppend(&str, ",window_dilation=", dim.window_dilation());
   }
+  if (dim.window_reversal()) {
+    StrAppend(&str, ",window_reversal");
+  }
   StrAppend(&str, ")");
   return str;
 }
 
 string ToString(const Window& window) {
-  using tensorflow::strings::StrCat;
   using tensorflow::strings::StrAppend;
+  using tensorflow::strings::StrCat;
 
   string str;
-  const auto add_field = [&](
-      const char* heading,
-      std::function<string(const WindowDimension&)> format) {
-    StrAppend(&str, heading, "=");
-    const char* prefix = "";
-    for (const auto& window_dimension : window.dimensions()) {
-      StrAppend(&str, prefix, format(window_dimension));
-      prefix = "x";
-    }
-  };
-
-  add_field("window",
+  const auto add_field =
+      [&](const char* heading,
+          std::function<string(const WindowDimension&)> format) {
+        StrAppend(&str, heading, "=");
+        const char* prefix = "";
+        for (const auto& window_dimension : window.dimensions()) {
+          StrAppend(&str, prefix, format(window_dimension));
+          prefix = "x";
+        }
+      };
+
+  add_field("size",
             [](const WindowDimension& dim) { return StrCat(dim.size()); });
   if (HasStride(window)) {
     add_field(" stride",
@@ -85,6 +88,11 @@ string ToString(const Window& window) {
       return StrCat(dim.window_dilation());
     });
   }
+  if (HasWindowReversal(window)) {
+    add_field(" rhs_reversal", [](const WindowDimension& dim) {
+      return StrCat(dim.window_reversal() ? 1 : 0);
+    });
+  }
   return str;
 }
 
@@ -138,6 +146,15 @@ bool HasWindowDilation(const Window& window) {
   return false;
 }
 
+bool HasWindowReversal(const Window& window) {
+  for (const auto& dim : window.dimensions()) {
+    if (dim.window_reversal()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 bool HasDilation(const Window& window) {
   return HasBaseDilation(window) || HasWindowDilation(window);
 }
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 235cb2d59d451a25dc4f824ab488f8cef6b03bfb..125900dac0c5ab478b834c315b4a438c9238ef6d 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -39,6 +39,8 @@ bool HasBaseDilation(const Window& window);
 bool HasWindowDilation(const Window& window);
 bool HasDilation(const Window& window);
 
+bool HasWindowReversal(const Window& window);
+
 // Returns the new bound after dilation.
 //
 // If a window with the given bound in some dimension is dilated with the given
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 3fa5bcc1df4f0294582b6c74735fef08c87433eb..6b136d333bbf079efd314833f46fe3b98743fbac 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -17,3 +17,5 @@ def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
                    protoc="@protobuf_archive//:protoc",
                    testonly=testonly,
                    visibility=visibility,)
+
+ORC_JIT_MEMORY_MAPPER_TARGETS = []
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index ce3c3eee68ad7f7ebb42836e3cae14803f8650d7..127e5e81ac6d21945c7125ef913d236e8892758e 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -167,6 +167,14 @@ message DebugOptions {
   // computation will run 2! * 4! times.
   bool xla_test_all_input_layouts = 91;
 
+  // Assign colors based on sharding information when generating the Graphviz
+  // HLO graph.
+  bool xla_hlo_graph_sharding_color = 92;
+
+  // Prefix the name scopes of the TF graph exports with "devX" device
+  // assignments, if available.
+  bool xla_hlo_tfgraph_device_scopes = 93;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -361,6 +369,7 @@ message WaitForExecutionResponse {
 message IsConstantRequest {
   ComputationHandle computation = 1;
   ComputationDataHandle operand = 2;
+  int64 num_parameters = 3;
 }
 
 message IsConstantResponse {
@@ -371,6 +380,7 @@ message ComputeConstantRequest {
   ComputationHandle computation = 1;
   ComputationDataHandle operand = 2;
   Layout output_layout = 3;
+  repeated LiteralProto parameters = 4;
 }
 
 message ComputeConstantResponse {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 080e3c4267a2dca2b70c5cff51126cbf4b3e2881..215707634bc29263bc1ef472f498ac1bb1ca9181 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -46,6 +46,12 @@ enum PrimitiveType {
   // converted to f16 from f32 at arbirary points in the computation.
   F16 = 10;
   F32 = 11;
+
+  // Truncated 16 bit floating-point format. This is similar to IEEE's 16 bit
+  // floating-point format, but uses 1 bit for the sign, 8 bits for the exponent
+  // and 7 bits for the mantissa.
+  BF16 = 16;
+
   F64 = 12;
 
   // Complex values of fixed width.
@@ -63,6 +69,8 @@ enum PrimitiveType {
   // An opaque type used for passing context specific data to a custom
   // operation.
   OPAQUE = 14;
+
+  // Next = 17
 }
 
 // Describes the value held inside padding elements.
@@ -310,7 +318,10 @@ message LiteralProto {
   repeated double f64s = 9;
   repeated float c64s = 12;  // Stored as interleaved real, imag floats.
   repeated LiteralProto tuple_literals = 10;
-  bytes f16s = 11;  // Note: the F16s are encoded in little endian byte order
+  // The F16s and BF16s are encoded in little endian byte order
+  bytes f16s = 11;
+  bytes bf16s = 13;
+  // Next = 14
 }
 
 message WindowDimension {
@@ -346,6 +357,10 @@ message WindowDimension {
   // means no dilation. base_dilation - 1 no-op entries ("holes") are implicitly
   // placed between each base area element. See documentation for convolution.
   int64 base_dilation = 6;
+
+  // Window reversal means that this dimension was logically reversed before the
+  // operation.
+  bool window_reversal = 7;
 }
 
 // Describes the windowing in an operation such as convolution.
@@ -402,15 +417,9 @@ message ConvolutionDimensionNumbers {
   // The number of the dimension that represents features in the input.
   int64 input_feature_dimension = 8;
 
-  // The number of the dimension that represents batch in the output.
-  int64 output_batch_dimension = 9;
-
-  // The number of the dimension that represents features in the output.
-  int64 output_feature_dimension = 10;
-
   // The dimension numbers for the spatial dimensions that the window
-  // moves through in the input (lhs) and output.
-  repeated int64 spatial_dimensions = 5;
+  // moves through in the input.
+  repeated int64 input_spatial_dimensions = 11;
 
   // The number of the dimension that represents input features in the
   // convolutional kernel (rhs).
@@ -424,15 +433,41 @@ message ConvolutionDimensionNumbers {
   // moves through in the kernel (rhs). window.strides(0) is the
   // stride in the kernel_spatial_dimensions(0) dimension.
   repeated int64 kernel_spatial_dimensions = 6;
+
+  // The number of the dimension that represents batch in the output.
+  int64 output_batch_dimension = 9;
+
+  // The number of the dimension that represents features in the output.
+  int64 output_feature_dimension = 10;
+
+  // The dimension numbers for the spatial dimensions that the window
+  // moves through in the output.
+  repeated int64 output_spatial_dimensions = 12;
+
+  // Next = 13
 };
 
 message ConvolveRequest {
   ComputationDataHandle lhs = 2;
   ComputationDataHandle rhs = 3;  // This is the filter/kernel.
-  Window window = 4;              // Describes the filter/kenel.
+  Window window = 4;              // Describes the filter/kernel.
   ConvolutionDimensionNumbers dimension_numbers = 5;
 }
 
+enum FftType {
+  FFT = 0;    // Forward FFT; complex in, complex out.
+  IFFT = 1;   // Inverse FFT; complex in, complex out.
+  RFFT = 2;   // Forward real FFT; real in, fft_length / 2 + 1 complex out
+  IRFFT = 3;  // Inverse real FFT; fft_length / 2 + 1 complex in,
+              //                   fft_length real out
+}
+
+message FftRequest {
+  FftType fft_type = 1;
+  repeated int64 fft_length = 2;  // Multivalent for higher-order FFT.
+  ComputationDataHandle operand = 3;
+}
+
 message InfeedRequest {
   // The shape of the data returned by reading the device's infeed buffer.
   Shape shape = 2;
@@ -463,6 +498,23 @@ message CustomCallRequest {
   Shape shape = 4;
 }
 
+message DotDimensionNumbers {
+  // The dimension numbers that represent the 'lhs' contracting dimensions.
+  repeated int64 lhs_contracting_dimensions = 1;
+  // The dimension numbers that represent the 'rhs' contracting dimensions.
+  repeated int64 rhs_contracting_dimensions = 2;
+  // The dimension numbers that represent the 'lhs' batch dimensions.
+  repeated int64 lhs_batch_dimensions = 3;
+  // The dimension numbers that represent the 'rhs' batch dimensions.
+  repeated int64 rhs_batch_dimensions = 4;
+};
+
+message DotRequest {
+  ComputationDataHandle lhs = 2;
+  ComputationDataHandle rhs = 3;
+  DotDimensionNumbers dimension_numbers = 4;
+}
+
 message MapRequest {
   repeated ComputationDataHandle operands = 2;
   ComputationHandle to_apply = 3;
@@ -616,6 +668,14 @@ message ConcatenateRequest {
   int64 dimension = 3;
 }
 
+message ConditionalRequest {
+  ComputationDataHandle predicate = 2;
+  ComputationDataHandle true_operand = 3;
+  ComputationHandle true_computation = 4;
+  ComputationDataHandle false_operand = 5;
+  ComputationHandle false_computation = 6;
+}
+
 message WhileRequest {
   ComputationHandle condition = 2;
   ComputationHandle body = 3;
@@ -697,9 +757,6 @@ enum BinaryOperation {
   BINOP_LT = 9;
   BINOP_NE = 10;
 
-  // Dot product, matrix multiply.
-  BINOP_DOT = 12;
-
   // Element-wise maximum.
   BINOP_MAX = 14;
 
@@ -811,8 +868,10 @@ message OpSharding {
     REPLICATED = 0;
     // This sharding is maximal - one device runs the entire operation.
     MAXIMAL = 1;
-    // Neither of the above; tile_shape and tile_assignment are both used.
-    OTHER = 2;
+    // This sharding is a tuple - only the tuple_shardings field is valid.
+    TUPLE = 2;
+    // None of the above; tile_shape and tile_assignment are both used.
+    OTHER = 3;
   }
   Type type = 1;
   // The shape of the sharded tile.
@@ -824,6 +883,13 @@ message OpSharding {
   // Flattened list of device IDs. The order of flattening is the same as used
   // by IndexUtil::MultiToLinearIndex(tile_assignment_shape).
   repeated int64 tile_assignment_devices = 4;
+  // If type == TUPLE, the sub-shardings, one per leaf node in the tuple shape,
+  // in pre-order. The tuple shape could be nested; here we store just a
+  // flattened list of all leaves in the tuple shape. Note that the tuple shape
+  // is not stored here; shardings do not store the shapes to which they are
+  // applied, this is inferred from the instruction this sharding gets attached
+  // to.
+  repeated OpSharding tuple_shardings = 5;
 }
 
 message OpRequest {
@@ -841,6 +907,7 @@ message OpRequest {
     ConvolveRequest convolve_request = 8;
     CrossReplicaSumRequest cross_replica_sum_request = 9;
     CustomCallRequest custom_call_request = 10;
+    DotRequest dot_request = 43;
     DynamicSliceRequest dynamic_slice_request = 11;
     DynamicUpdateSliceRequest dynamic_update_slice_request = 12;
     GetTupleElementRequest get_tuple_element_request = 13;
@@ -868,7 +935,10 @@ message OpRequest {
     BatchNormTrainingRequest batch_norm_training_request = 35;
     BatchNormGradRequest batch_norm_grad_request = 37;
     BatchNormInferenceRequest batch_norm_inference_request = 38;
-    // Next: 41
+    FftRequest fft_request = 41;
+    ConvertRequest bitcast_convert_request = 42;
+    ConditionalRequest conditional_request = 44;
+    // Next: 45
   }
 }
 
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 2e9b96bb1d31f7c985df992c094784660d6e274c..604c41bf8acc910b47f8ee4a871d4740a2f1ba2f 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -9,7 +9,12 @@ load("//third_party/mpi:mpi.bzl", "if_mpi")
 
 py_library(
     name = "contrib_py",
-    srcs = glob(["**/*.py"]),
+    srcs = glob(
+        ["**/*.py"],
+        exclude = [
+            "**/*_test.py",
+        ],
+    ),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
@@ -51,17 +56,20 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
+        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
         "//tensorflow/contrib/memory_stats:memory_stats_py",
         "//tensorflow/contrib/meta_graph_transform",
         "//tensorflow/contrib/metrics:metrics_py",
+        "//tensorflow/contrib/model_pruning",
         "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/contrib/ndlstm",
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index a26fdb982c0f4d6d85b73912c194647a989d0ef6..08247c6b38a4df663ad28a6b4d3c41a1da41a020 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -51,9 +51,11 @@ from tensorflow.contrib import lookup
 from tensorflow.contrib import losses
 from tensorflow.contrib import memory_stats
 from tensorflow.contrib import metrics
+from tensorflow.contrib import model_pruning
 from tensorflow.contrib import nccl
 from tensorflow.contrib import nn
 from tensorflow.contrib import opt
+from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
@@ -78,6 +80,7 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
+from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.ndlstm import python as ndlstm
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
diff --git a/tensorflow/contrib/android/README.md b/tensorflow/contrib/android/README.md
index f49e5857fe5255c2459793cb1389052a2ff5f88f..c7c128bf14f03d3769ef08e83da61f6d2f91fbd2 100644
--- a/tensorflow/contrib/android/README.md
+++ b/tensorflow/contrib/android/README.md
@@ -15,9 +15,9 @@ For prebuilt libraries, see the
 page for a recent build.
 
 The TensorFlow Inference Interface is also available as a
-[JCenter package](https://bintray.com/google/tensorflow/tensorflow-android) and
-can be included quite simply in your android project with a couple of lines in
-the project's `build.gradle` file:
+[JCenter package](https://bintray.com/google/tensorflow/tensorflow)
+(see the tensorflow-android directory) and can be included quite simply in your
+android project with a couple of lines in the project's `build.gradle` file:
 
 ```
 allprojects {
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc
index 9e4d3290c3d99fab42f512f7144defde54f8ece8..380a652435ad089f46f3ca80e4fd43097fd96e10 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.cc
+++ b/tensorflow/contrib/android/asset_manager_filesystem.cc
@@ -97,7 +97,7 @@ class RandomAccessFileFromAsset : public RandomAccessFile {
     off64_t new_offset = AAsset_seek64(asset.get(), offset, SEEK_SET);
     off64_t length = AAsset_getLength64(asset.get());
     if (new_offset < 0) {
-      result->set(scratch, 0);
+      *result = StringPiece(scratch, 0);
       return errors::OutOfRange("Read after file end.");
     }
     const off64_t region_left =
@@ -106,7 +106,7 @@ class RandomAccessFileFromAsset : public RandomAccessFile {
     if (read < 0) {
       return errors::Internal("Error reading from asset.");
     }
-    result->set(scratch, region_left);
+    *result = StringPiece(scratch, region_left);
     return (region_left == to_read)
                ? Status::OK()
                : errors::OutOfRange("Read less bytes than requested.");
diff --git a/tensorflow/contrib/android/cmake/CMakeLists.txt b/tensorflow/contrib/android/cmake/CMakeLists.txt
index 25ada5ba27aa167e4aaf4cebd6517e3b80aa1058..a115d1610e2334a6626f29674f3dd195e3a3c648 100644
--- a/tensorflow/contrib/android/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/android/cmake/CMakeLists.txt
@@ -34,10 +34,12 @@ add_library(lib_tf STATIC IMPORTED )
 set_target_properties(lib_tf PROPERTIES IMPORTED_LOCATION
         ${PREBUILT_DIR}/lib/libtensorflow-core.a)
 # Change to compile flags should be replicated into bazel build file
+# TODO: Consider options other than -O2 for binary size.
+#       e.g. -Os for gcc, and -Oz for clang.
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIS_SLIM_BUILD \
                      -std=c++11 -fno-rtti -fno-exceptions \
                      -O2 -Wno-narrowing -fomit-frame-pointer \
-                     -mfpu=neon -mfloat-abi=softfp -fPIE \
+                     -mfpu=neon -mfloat-abi=softfp -fPIE -fPIC \
                      -ftemplate-depth=900 \
                      -DGOOGLE_PROTOBUF_NO_RTTI \
                      -DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER")
diff --git a/tensorflow/contrib/android/cmake/README.md b/tensorflow/contrib/android/cmake/README.md
index 6f19b657fe72064bd7b005b568540cd52a5e19e8..934b58c7242fc06064ee3c06bc8f4c2740bd24ef 100644
--- a/tensorflow/contrib/android/cmake/README.md
+++ b/tensorflow/contrib/android/cmake/README.md
@@ -14,7 +14,7 @@ Add TensorFlow-Android-Inference as a dependency of your Android application
 
 ```
 include ':TensorFlow-Android-Inference'
-findProject(":TensorFlow-Android-Inference").projectDir = 
+findProject(":TensorFlow-Android-Inference").projectDir =
             new File("${/path/to/tensorflow_repo}/contrib/android/cmake")
 ```
 
diff --git a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
index 1f423a7a5bf6a115dc627ddd6f5e98c074282585..dc5b9fb88742d78d0f40207b589e29451a6358dd 100644
--- a/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
+++ b/tensorflow/contrib/android/java/org/tensorflow/contrib/android/TensorFlowInferenceInterface.java
@@ -160,7 +160,7 @@ public class TensorFlowInferenceInterface {
       throw new RuntimeException("Failed to load model from the input stream", e);
     }
   }
-  
+
   /*
    * Construct a TensorFlowInferenceInterface with provided Graph
    *
@@ -168,7 +168,7 @@ public class TensorFlowInferenceInterface {
    */
   public TensorFlowInferenceInterface(Graph g) {
     prepareNativeRuntime();
-      
+
     // modelName is redundant here, here is for
     // avoiding error in initialization as modelName is marked final.
     this.modelName = "";
@@ -290,7 +290,7 @@ public class TensorFlowInferenceInterface {
    */
   public void feed(String inputName, boolean[] src, long... dims) {
     byte[] b = new byte[src.length];
-    
+
     for (int i = 0; i < src.length; i++) {
       b[i] = src[i] ? (byte) 1 : (byte) 0;
     }
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index 8b7df4a84c558f662405a28a42426583d5ab39cd..a111cfecb366fe245150cc71d2c43662d0d69090 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -82,6 +82,7 @@ cc_library(
 tf_cc_test(
     name = "adaptive_shared_batch_scheduler_test",
     srcs = ["adaptive_shared_batch_scheduler_test.cc"],
+    tags = ["manual"],  # b/69013768
     deps = [
         ":adaptive_shared_batch_scheduler",
         "//tensorflow/contrib/batching/test_util:fake_clock_env",
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
index a0606427a526ffc67e10d12a084eabc64564e4ab..9e32bee505640ea04edfeffea0a14d1937c3a2b1 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler.h
@@ -208,6 +208,8 @@ class ASBSQueue : public BatchScheduler<TaskType> {
   // place any more tasks in this batch.
   void ReleaseBatch(const ASBSBatch<TaskType>* batch);
 
+  size_t max_task_size() const override { return options_.max_batch_size; }
+
  private:
   std::shared_ptr<AdaptiveSharedBatchScheduler<TaskType>> scheduler_;
   const QueueOptions options_;
@@ -399,7 +401,7 @@ ASBSQueue<TaskType>::~ASBSQueue() {
 
 template <typename TaskType>
 Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
-  bool added_new_batch = false;
+  ASBSBatch<TaskType>* new_batch = nullptr;
   size_t size = (*task)->size();
   if (size > options_.max_batch_size) {
     return errors::InvalidArgument("Task size ", size,
@@ -418,15 +420,14 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
       current_batch_ = nullptr;
     }
     if (!current_batch_) {
-      added_new_batch = true;
       num_enqueued_batches_++;
-      current_batch_ =
+      current_batch_ = new_batch =
           new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros());
     }
     current_batch_->AddTask(std::move(*task));
     num_enqueued_tasks_++;
   }
-  if (added_new_batch) scheduler_->AddBatch(current_batch_);
+  if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
   return Status::OK();
 }
 
diff --git a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
index a07cd6d834fa28904bf7748b16972cca217503c1..e2aac54eebccaf53da9560591cfe909989774bab 100644
--- a/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/adaptive_shared_batch_scheduler_test.cc
@@ -186,6 +186,7 @@ TEST(AdaptiveSharedBatchSchedulerTest, ObeysQueueOptions) {
     queue_options.max_enqueued_batches = 2;
     TF_ASSERT_OK(
         scheduler->AddQueue(queue_options, queue_0_callback, &queue_0));
+    EXPECT_EQ(10, queue_0->max_task_size());
     queue_options.max_batch_size = 0;
     // Queue must have max_batch_size > 0.
     EXPECT_FALSE(
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler.h b/tensorflow/contrib/batching/basic_batch_scheduler.h
index 9d3805fbaf39978159dd2f4a754e6d41a07acf6a..91065db2499dffd2687a53bd6304d9b7593f7b3a 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler.h
+++ b/tensorflow/contrib/batching/basic_batch_scheduler.h
@@ -192,6 +192,10 @@ class BasicBatchScheduler : public BatchScheduler<TaskType> {
   size_t NumEnqueuedTasks() const override;
   size_t SchedulingCapacity() const override;
 
+  size_t max_task_size() const override {
+    return shared_scheduler_queue_->max_task_size();
+  }
+
  private:
   explicit BasicBatchScheduler(
       std::unique_ptr<BatchScheduler<TaskType>> shared_scheduler_queue);
diff --git a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc b/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
index e020301795c7dadee2815c0e0d727e53e5fb9e6e..187823151cf840dcf8058677fcf74d1beffc3bc2 100644
--- a/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/basic_batch_scheduler_test.cc
@@ -73,6 +73,7 @@ TEST(BasicBatchSchedulerTest, Basic) {
     std::unique_ptr<BasicBatchScheduler<FakeTask>> scheduler;
     TF_ASSERT_OK(
         BasicBatchScheduler<FakeTask>::Create(options, callback, &scheduler));
+    EXPECT_EQ(10, scheduler->max_task_size());
     EXPECT_EQ(0, scheduler->NumEnqueuedTasks());
     EXPECT_EQ(3 * 10, scheduler->SchedulingCapacity());
     TF_ASSERT_OK(ScheduleTask(3, scheduler.get()));
diff --git a/tensorflow/contrib/batching/batch_scheduler.h b/tensorflow/contrib/batching/batch_scheduler.h
index a5072f439abad3c5db79a514a7f2baff0b021b39..e18cf6c35059e4d720768e3b2c02b03727a6bac4 100644
--- a/tensorflow/contrib/batching/batch_scheduler.h
+++ b/tensorflow/contrib/batching/batch_scheduler.h
@@ -178,6 +178,10 @@ class BatchScheduler {
   // This method is useful for monitoring, or for guaranteeing a future slot in
   // the schedule (but being mindful about the caveats listed above).
   virtual size_t SchedulingCapacity() const = 0;
+
+  // Returns the maximum allowed size of tasks submitted to the scheduler. (This
+  // is typically equal to a configured maximum batch size.)
+  virtual size_t max_task_size() const = 0;
 };
 
 //////////
diff --git a/tensorflow/contrib/batching/kernels/batch_kernels.cc b/tensorflow/contrib/batching/kernels/batch_kernels.cc
index 3b7c538fcc42b2e8f100d374c273ee3ca3d6056b..6041d8c9b2ca14bd325d1e7ea562bc4bc27d6a51 100644
--- a/tensorflow/contrib/batching/kernels/batch_kernels.cc
+++ b/tensorflow/contrib/batching/kernels/batch_kernels.cc
@@ -461,7 +461,7 @@ class BatchResource : public ResourceBase {
     return Status::OK();
   }
 
-  // Looks up the batcher queue for 'queue_name'. If it did't previously exist,
+  // Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
   // creates it.
   Status LookupOrCreateBatcherQueue(const string& queue_name,
                                     BatcherQueue** queue) {
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler.h b/tensorflow/contrib/batching/shared_batch_scheduler.h
index 41a3f99137ade2552432fee62ddce17d064148a4..1d2158062e589db71b7df4c47af1b7851b41a036 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler.h
+++ b/tensorflow/contrib/batching/shared_batch_scheduler.h
@@ -248,6 +248,9 @@ class Queue {
   // BatchScheduler::SchedulingCapacity().
   size_t SchedulingCapacity() const;
 
+  // Returns the maximum allowed size of tasks submitted to the queue.
+  size_t max_task_size() const { return options_.max_batch_size; }
+
   // Called by a thread that is ready to process a batch, to request one from
   // this queue. Either returns a batch that is ready to be processed, or
   // nullptr if the queue declines to schedule a batch at this time. If it
@@ -338,6 +341,8 @@ class QueueHandle : public BatchScheduler<TaskType> {
   size_t NumEnqueuedTasks() const override;
   size_t SchedulingCapacity() const override;
 
+  size_t max_task_size() const override { return queue_->max_task_size(); }
+
  private:
   // The scheduler that owns 'queue_'.
   std::shared_ptr<SharedBatchScheduler<TaskType>> scheduler_;
diff --git a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
index 3e924ae5f13519b4fe9a3f4b510773ca2bddaf23..3ac79a8fdc47389816db8ca09f27846d1c4623c2 100644
--- a/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
+++ b/tensorflow/contrib/batching/shared_batch_scheduler_test.cc
@@ -429,6 +429,7 @@ TEST(SharedBatchSchedulerTest, ConstMethods) {
     queue_options.max_enqueued_batches = max_enqueued_batches;
     std::unique_ptr<BatchScheduler<FakeTask>> queue;
     TF_ASSERT_OK(scheduler->AddQueue(queue_options, callback, &queue));
+    EXPECT_EQ(2, queue->max_task_size());
     EXPECT_EQ(0, queue->NumEnqueuedTasks());
     EXPECT_EQ(max_enqueued_batches * 2, queue->SchedulingCapacity());
 
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index 8bb742d289a0836378a9a03c90d46293cfbfe75b..a262d4aecdbb69dfcd8b88bc0a09060500d6b1c9 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -3,12 +3,15 @@
 #   particularly useful for Bayesian inference.
 #   APIs here are meant to evolve over time.
 
+package(default_visibility = [
+    "//learning/brain/contrib/bayesflow:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
+
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 py_library(
@@ -16,9 +19,9 @@ py_library(
     srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:functional_ops",
@@ -29,12 +32,8 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
 )
 
@@ -101,61 +100,61 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "entropy_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/entropy_test.py"],
+    name = "layers_dense_variational_test",
+    size = "small",
+    srcs = ["python/kernel_tests/layers_dense_variational_test.py"],
     additional_deps = [
         ":bayesflow_py",
         "//third_party/py/numpy",
         "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
-        "//tensorflow/python:variables",
     ],
 )
 
 cuda_py_test(
-    name = "stochastic_variables_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/stochastic_variables_test.py"],
+    name = "monte_carlo_test",
+    size = "small",
+    srcs = ["python/kernel_tests/monte_carlo_test.py"],
     additional_deps = [
         ":bayesflow_py",
         "//third_party/py/numpy",
         "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
+        "//tensorflow/python:random_seed",
     ],
 )
 
 cuda_py_test(
-    name = "monte_carlo_test",
+    name = "halton_sequence_test",
     size = "small",
-    srcs = ["python/kernel_tests/monte_carlo_test.py"],
+    srcs = ["python/kernel_tests/halton_sequence_test.py"],
     additional_deps = [
         ":bayesflow_py",
         "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python/ops/distributions",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_seed",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
     ],
 )
 
@@ -181,84 +180,23 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "stochastic_graph_test",
+    name = "sgld_optimizer_test",
     size = "small",
-    srcs = ["python/kernel_tests/stochastic_graph_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "variational_inference_test",
-    size = "small",
-    srcs = ["python/kernel_tests/variational_inference_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "stochastic_tensor_test",
-    size = "small",
-    srcs = ["python/kernel_tests/stochastic_tensor_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "stochastic_gradient_estimators_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/stochastic_gradient_estimators_test.py"],
+    srcs = ["python/kernel_tests/sgld_optimizer_test.py"],
     additional_deps = [
         ":bayesflow_py",
         "//third_party/py/numpy",
         "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/python/ops/distributions",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "reinforce_simple_example",
-    size = "small",
-    srcs = ["examples/reinforce_simple/reinforce_simple_example.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_seed",
     ],
 )
 
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index 8b27fa76bd31a926558abe681d6e510c0a4997c1..95b9452b1ada60c44672f37800ced2133d2bd8b2 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -23,24 +23,30 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long
 from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence
 from tensorflow.contrib.bayesflow.python.ops import custom_grad
-from tensorflow.contrib.bayesflow.python.ops import entropy
+from tensorflow.contrib.bayesflow.python.ops import halton_sequence
 from tensorflow.contrib.bayesflow.python.ops import hmc
+from tensorflow.contrib.bayesflow.python.ops import layers
 from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
-from tensorflow.contrib.bayesflow.python.ops import stochastic_graph
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.contrib.bayesflow.python.ops import stochastic_variables
-from tensorflow.contrib.bayesflow.python.ops import variational_inference
+from tensorflow.contrib.bayesflow.python.ops import optimizers
 # pylint: enable=unused-import,line-too-long
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-_allowed_symbols = ['csiszar_divergence', 'custom_grad', 'entropy',
-                    'metropolis_hastings', 'monte_carlo', 'hmc', 'special_math',
-                    'stochastic_gradient_estimators', 'stochastic_graph',
-                    'stochastic_tensor', 'stochastic_variables',
-                    'variational_inference']
+_allowed_symbols = [
+    'csiszar_divergence',
+    'custom_grad',
+    'entropy',
+    'halton_sequence',
+    'hmc',
+    'layers',
+    'metropolis_hastings',
+    'monte_carlo',
+    'optimizers',
+    'special_math',
+    'stochastic_variables',
+    'variational_inference',
+]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/examples/reinforce_simple/reinforce_simple_example.py b/tensorflow/contrib/bayesflow/examples/reinforce_simple/reinforce_simple_example.py
deleted file mode 100644
index 2eb625487f4cd18bdec10ddbc0cf64cb8c8499b8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/examples/reinforce_simple/reinforce_simple_example.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Simple examples of the REINFORCE algorithm."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-distributions = tf.contrib.distributions
-sg = tf.contrib.bayesflow.stochastic_graph
-st = tf.contrib.bayesflow.stochastic_tensor
-
-
-def split_apply_merge(inp, partitions, fns):
-  """Split input according to partitions.  Pass results through fns and merge.
-
-  Args:
-    inp: the input vector
-    partitions: tensor of same length as input vector, having values 0, 1
-    fns: the two functions.
-
-  Returns:
-    the vector routed, where routed[i] = fns[partitions[i]](inp[i])
-  """
-  new_inputs = tf.dynamic_partition(inp, partitions, len(fns))
-  new_outputs = [fns[i](x) for i, x in enumerate(new_inputs)]
-  new_indices = tf.dynamic_partition(
-      tf.range(0, inp.get_shape()[0]), partitions, len(fns))
-  return tf.dynamic_stitch(new_indices, new_outputs)
-
-
-def plus_1(inputs):
-  return inputs + 1.0
-
-
-def minus_1(inputs):
-  return inputs - 1.0
-
-
-def build_split_apply_merge_model():
-  """Build the Split-Apply-Merge Model.
-
-  Route each value of input [-1, -1, 1, 1] through one of the
-  functions, plus_1, minus_1.  The decision for routing is made by
-  4 Bernoulli R.V.s whose parameters are determined by a neural network
-  applied to the input.  REINFORCE is used to update the NN parameters.
-
-  Returns:
-    The 3-tuple (route_selection, routing_loss, final_loss), where:
-
-      - route_selection is an int 4-vector
-      - routing_loss is a float 4-vector
-      - final_loss is a float scalar.
-  """
-  inputs = tf.constant([[-1.0], [-1.0], [1.0], [1.0]])
-  targets = tf.constant([[0.0], [0.0], [0.0], [0.0]])
-  paths = [plus_1, minus_1]
-  weights = tf.get_variable("w", [1, 2])
-  bias = tf.get_variable("b", [1, 1])
-  logits = tf.matmul(inputs, weights) + bias
-
-  # REINFORCE forward step
-  route_selection = st.StochasticTensor(
-      distributions.Categorical(logits=logits))
-
-  # Accessing route_selection as a Tensor below forces a sample of
-  # the Categorical distribution based on its logits.
-  # This is equivalent to calling route_selection.value().
-  #
-  # route_selection.value() returns an int32 4-vector with random
-  # values in {0, 1}
-  # COPY+ROUTE+PASTE
-  outputs = split_apply_merge(inputs, route_selection, paths)
-
-  # flatten routing_loss to a row vector (from a column vector)
-  routing_loss = tf.reshape(tf.square(outputs - targets), shape=[-1])
-
-  # Total loss: score function loss + routing loss.
-  # The score function loss (through `route_selection.loss(routing_loss)`)
-  # returns:
-  #  [stop_gradient(routing_loss) *
-  #   route_selection.log_pmf(stop_gradient(route_selection.value()))],
-  # where log_pmf has gradients going all the way back to weights and bias.
-  # In this case, the routing_loss depends on the variables only through
-  # "route_selection", which has a stop_gradient on it.  So the
-  # gradient of the loss really come through the score function
-  surrogate_loss = sg.surrogate_loss([routing_loss])
-  final_loss = tf.reduce_sum(surrogate_loss)
-
-  return (route_selection, routing_loss, final_loss)
-
-
-class REINFORCESimpleExample(tf.test.TestCase):
-
-  def testSplitApplyMerge(self):
-    # Repeatability.  SGD has a tendency to jump around, even here.
-    tf.set_random_seed(1)
-
-    with self.test_session() as sess:
-      # Use sampling to train REINFORCE
-      with st.value_type(st.SampleValue()):
-        (route_selection,
-         routing_loss,
-         final_loss) = build_split_apply_merge_model()
-
-      sgd = tf.train.GradientDescentOptimizer(1.0).minimize(final_loss)
-
-      tf.global_variables_initializer().run()
-
-      for i in range(10):
-        # Run loss and inference step.  This toy problem converges VERY quickly.
-        (routing_loss_v, final_loss_v, route_selection_v, _) = sess.run(
-            [routing_loss, final_loss, tf.identity(route_selection), sgd])
-        print(
-            "Iteration %d, routing loss: %s, final_loss: %s, "
-            "route selection: %s"
-            % (i, routing_loss_v, final_loss_v, route_selection_v))
-
-      self.assertAllEqual([0, 0, 1, 1], route_selection_v)
-      self.assertAllClose([0.0, 0.0, 0.0, 0.0], routing_loss_v)
-      self.assertAllClose(0.0, final_loss_v)
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
index 8c6a614beb194180d8b075526a5395aa65d354de..2e94b7206de4f7c40c89f083f3bfa2a22bb7b917 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/csiszar_divergence_test.py
@@ -759,7 +759,7 @@ class CsiszarVIMCOTest(test.TestCase):
   def _csiszar_vimco_helper_grad(self, logu, delta):
     """Finite difference approximation of `grad(csiszar_vimco_helper, logu)`."""
 
-    # This code actually estimates the sum of the Jacobiab because thats what
+    # This code actually estimates the sum of the Jacobiab because that's what
     # TF's `gradients` does.
     np_log_avg_u1, np_log_sooavg_u1 = self._csiszar_vimco_helper(
         logu[..., None] + np.diag([delta]*len(logu)))
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
deleted file mode 100644
index 0bd12b84d12a9c3219f6b24830b1b82db9716043..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Monte Carlo Ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import layers as layers_lib
-from tensorflow.contrib.bayesflow.python.ops import entropy_impl as entropy
-from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib
-from tensorflow.contrib.distributions.python.ops import mvn_tril as mvn_tril_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.distributions import kullback_leibler as kullback_leibler_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.ops.distributions import util as distribution_util
-from tensorflow.python.platform import test
-
-layers = layers_lib
-
-
-class NormalNoEntropy(normal_lib.Normal):  # pylint: disable=no-init
-  """Normal distribution without a `.entropy` method."""
-
-  def entropy(self):
-    return NotImplementedError('Entropy removed by gremlins')
-
-
-def get_train_op(scalar_loss, optimizer='SGD', learning_rate=1.0, decay=0.0):
-  global_step = variables.Variable(0)
-
-  def decay_fn(rate, t):
-    return rate * (1 + math_ops.to_float(t))**(-decay)
-
-  train_op = layers.optimize_loss(
-      scalar_loss,
-      global_step,
-      learning_rate,
-      optimizer,
-      learning_rate_decay_fn=decay_fn)
-  return train_op
-
-
-def _assert_monotonic_decreasing(array, atol=1e-5):
-  array = np.asarray(array)
-  _assert_monotonic_increasing(-array, atol=atol)
-
-
-def _assert_monotonic_increasing(array, atol=1e-5):
-  array = np.asarray(array)
-  diff = np.diff(array.ravel())
-  np.testing.assert_array_less(-1 * atol, diff)
-
-
-class ElboRatioTest(test.TestCase):
-  """Show sampling converges to true KL values."""
-
-  def setUp(self):
-    self._rng = np.random.RandomState(0)
-
-  def test_convergence_to_kl_using_sample_form_on_3dim_normal(self):
-    # Test that the sample mean KL is the same as analytic when we use samples
-    # to estimate every part of the KL divergence ratio.
-    vector_shape = (2, 3)
-    n_samples = 5000
-
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-      p = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-
-      # In this case, the log_ratio is the KL.
-      sample_kl = -1 * entropy.elbo_ratio(
-          log_p=p.log_prob,
-          q=q,
-          n=n_samples,
-          form=entropy.ELBOForms.sample,
-          seed=42)
-      actual_kl = kullback_leibler_lib.kl_divergence(q, p)
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertEqual((2,), sample_kl.get_shape())
-      self.assertAllClose(actual_kl.eval(), sample_kl.eval(), rtol=0.05)
-
-  def test_convergence_to_kl_using_analytic_entropy_form_on_3dim_normal(self):
-    # Test that the sample mean KL is the same as analytic when we use an
-    # analytic entropy combined with sampled cross-entropy.
-    n_samples = 5000
-
-    vector_shape = (2, 3)
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-      p = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-
-      # In this case, the log_ratio is the KL.
-      sample_kl = -1 * entropy.elbo_ratio(
-          log_p=p.log_prob,
-          q=q,
-          n=n_samples,
-          form=entropy.ELBOForms.analytic_entropy,
-          seed=42)
-      actual_kl = kullback_leibler_lib.kl_divergence(q, p)
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertEqual((2,), sample_kl.get_shape())
-      self.assertAllClose(actual_kl.eval(), sample_kl.eval(), rtol=0.1)
-
-  def test_sample_kl_zero_when_p_and_q_are_the_same_distribution(self):
-    n_samples = 50
-
-    vector_shape = (2, 3)
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-
-      # In this case, the log_ratio is the KL.
-      sample_kl = -1 * entropy.elbo_ratio(
-          log_p=q.log_prob,
-          q=q,
-          n=n_samples,
-          form=entropy.ELBOForms.sample,
-          seed=42)
-
-      self.assertEqual((2,), sample_kl.get_shape())
-      self.assertAllClose(np.zeros(2), sample_kl.eval())
-
-
-class EntropyShannonTest(test.TestCase):
-
-  def test_normal_entropy_default_form_uses_exact_entropy(self):
-    with self.test_session():
-      dist = normal_lib.Normal(loc=1.11, scale=2.22)
-      mc_entropy = entropy.entropy_shannon(dist, n=11)
-      exact_entropy = dist.entropy()
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval())
-
-  def test_normal_entropy_analytic_form_uses_exact_entropy(self):
-    with self.test_session():
-      dist = normal_lib.Normal(loc=1.11, scale=2.22)
-      mc_entropy = entropy.entropy_shannon(
-          dist, form=entropy.ELBOForms.analytic_entropy)
-      exact_entropy = dist.entropy()
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval())
-
-  def test_normal_entropy_sample_form_gets_approximate_answer(self):
-    # Tested by showing we get a good answer that is not exact.
-    with self.test_session():
-      dist = normal_lib.Normal(loc=1.11, scale=2.22)
-      mc_entropy = entropy.entropy_shannon(
-          dist, n=1000, form=entropy.ELBOForms.sample, seed=0)
-      exact_entropy = dist.entropy()
-
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval(), rtol=0.01)
-
-      # Make sure there is some error, proving we used samples
-      self.assertLess(0.0001, math_ops.abs(exact_entropy - mc_entropy).eval())
-
-  def test_default_entropy_falls_back_on_sample_if_analytic_not_available(self):
-    # Tested by showing we get a good answer that is not exact.
-    with self.test_session():
-      # NormalNoEntropy is like a Normal, but does not have .entropy method, so
-      # we are forced to fall back on sample entropy.
-      dist_no_entropy = NormalNoEntropy(loc=1.11, scale=2.22)
-      dist_yes_entropy = normal_lib.Normal(loc=1.11, scale=2.22)
-
-      mc_entropy = entropy.entropy_shannon(
-          dist_no_entropy, n=1000, form=entropy.ELBOForms.sample, seed=0)
-      exact_entropy = dist_yes_entropy.entropy()
-
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval(), rtol=0.01)
-
-      # Make sure there is some error, proving we used samples
-      self.assertLess(0.0001, math_ops.abs(exact_entropy - mc_entropy).eval())
-
-
-class RenyiRatioTest(test.TestCase):
-  """Show renyi_ratio is minimized when the distributions match."""
-
-  def setUp(self):
-    self._rng = np.random.RandomState(0)
-
-  def test_fitting_two_dimensional_normal_n_equals_1000(self):
-    # Minmizing Renyi divergence should allow us to make one normal match
-    # another one exactly.
-    n = 1000
-    mu_true = np.array([1.0, -1.0], dtype=np.float64)
-    chol_true = np.array([[2.0, 0.0], [0.5, 1.0]], dtype=np.float64)
-    with self.test_session() as sess:
-      target = mvn_tril_lib.MultivariateNormalTriL(mu_true, chol_true)
-
-      # Set up q distribution by defining mean/covariance as Variables
-      mu = variables.Variable(
-          np.zeros(mu_true.shape), dtype=mu_true.dtype, name='mu')
-      mat = variables.Variable(
-          np.zeros(chol_true.shape), dtype=chol_true.dtype, name='mat')
-      chol = distribution_util.matrix_diag_transform(
-          mat, transform=nn_ops.softplus)
-      q = mvn_tril_lib.MultivariateNormalTriL(mu, chol)
-      for alpha in [0.25, 0.75]:
-
-        negative_renyi_divergence = entropy.renyi_ratio(
-            log_p=target.log_prob, q=q, n=n, alpha=alpha, seed=0)
-        train_op = get_train_op(
-            math_ops.reduce_mean(-negative_renyi_divergence),
-            optimizer='SGD',
-            learning_rate=0.5,
-            decay=0.1)
-
-        variables.global_variables_initializer().run()
-        renyis = []
-        for step in range(1000):
-          sess.run(train_op)
-          if step in [1, 5, 100]:
-            renyis.append(negative_renyi_divergence.eval())
-
-        # This optimization should maximize the renyi divergence.
-        _assert_monotonic_increasing(renyis, atol=0)
-
-        # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-        # pass.
-        self.assertAllClose(target.loc.eval(), q.loc.eval(), rtol=0.06)
-        self.assertAllClose(target.scale.to_dense().eval(),
-                            q.scale.to_dense().eval(),
-                            rtol=0.1)
-
-  def test_divergence_between_identical_distributions_is_zero(self):
-    n = 1000
-    vector_shape = (2, 3)
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-      for alpha in [0.25, 0.75]:
-
-        negative_renyi_divergence = entropy.renyi_ratio(
-            log_p=q.log_prob, q=q, n=n, alpha=alpha, seed=0)
-
-        self.assertEqual((2,), negative_renyi_divergence.get_shape())
-        self.assertAllClose(np.zeros(2), negative_renyi_divergence.eval())
-
-
-class RenyiAlphaTest(test.TestCase):
-
-  def test_with_three_alphas(self):
-    with self.test_session():
-      for dtype in (dtypes.float32, dtypes.float64):
-        alpha_min = constant_op.constant(0.0, dtype=dtype)
-        alpha_max = 0.5
-        decay_time = 3
-
-        alpha_0 = entropy.renyi_alpha(
-            0, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-        alpha_1 = entropy.renyi_alpha(
-            1, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-        alpha_2 = entropy.renyi_alpha(
-            2, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-        alpha_3 = entropy.renyi_alpha(
-            3, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-
-        # Alpha should start at alpha_max.
-        self.assertAllClose(alpha_max, alpha_0.eval(), atol=1e-5)
-        # Alpha should finish at alpha_min.
-        self.assertAllClose(alpha_min.eval(), alpha_3.eval(), atol=1e-5)
-        # In between, alpha should be monotonically decreasing.
-        _assert_monotonic_decreasing(
-            [alpha_0.eval(), alpha_1.eval(), alpha_2.eval(), alpha_3.eval()])
-
-  def test_non_scalar_input_raises(self):
-    with self.test_session():
-      # Good values here
-      step = 0
-      alpha_min = 0.0
-      alpha_max = 0.5
-      decay_time = 3
-
-      # Use one bad value inside each check.
-      # The "bad" value is always the non-scalar one.
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            [step], decay_time, alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            step, [decay_time], alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            step, decay_time, alpha_min=[alpha_min], alpha_max=alpha_max).eval()
-
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            step, decay_time, alpha_min=alpha_min, alpha_max=[alpha_max]).eval()
-
-  def test_input_with_wrong_sign_raises(self):
-    with self.test_session():
-      # Good values here
-      step = 0
-      alpha_min = 0.0
-      alpha_max = 0.5
-      decay_time = 3
-
-      # Use one bad value inside each check.
-      # The "bad" value is always the non-scalar one.
-      with self.assertRaisesOpError('decay_time must be positive'):
-        entropy.renyi_alpha(
-            step, 0.0, alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-      with self.assertRaisesOpError('step must be non-negative'):
-        entropy.renyi_alpha(
-            -1, decay_time, alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a85862abfd744a86b9a38e10dbb5b985d0a0e94
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/halton_sequence_test.py
@@ -0,0 +1,131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for halton_sequence.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.bayesflow.python.ops import halton_sequence as halton
+from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+mc = monte_carlo_lib
+
+
+class HaltonSequenceTest(test.TestCase):
+
+  def test_known_values_small_bases(self):
+    with self.test_session():
+      # The first five elements of the Halton sequence with base 2 and 3
+      expected = np.array(((1. / 2, 1. / 3),
+                           (1. / 4, 2. / 3),
+                           (3. / 4, 1. / 9),
+                           (1. / 8, 4. / 9),
+                           (5. / 8, 7. / 9)), dtype=np.float32)
+      sample = halton.sample(2, num_samples=5)
+      self.assertAllClose(expected, sample.eval(), rtol=1e-6)
+
+  def test_sample_indices(self):
+    with self.test_session():
+      dim = 5
+      indices = math_ops.range(10, dtype=dtypes.int32)
+      sample_direct = halton.sample(dim, num_samples=10)
+      sample_from_indices = halton.sample(dim, sample_indices=indices)
+      self.assertAllClose(sample_direct.eval(), sample_from_indices.eval(),
+                          rtol=1e-6)
+
+  def test_dtypes_works_correctly(self):
+    with self.test_session():
+      dim = 3
+      sample_float32 = halton.sample(dim, num_samples=10, dtype=dtypes.float32)
+      sample_float64 = halton.sample(dim, num_samples=10, dtype=dtypes.float64)
+      self.assertEqual(sample_float32.eval().dtype, np.float32)
+      self.assertEqual(sample_float64.eval().dtype, np.float64)
+
+  def test_normal_integral_mean_and_var_correctly_estimated(self):
+    n = int(1000)
+    # This test is almost identical to the similarly named test in
+    # monte_carlo_test.py. The only difference is that we use the Halton
+    # samples instead of the random samples to evaluate the expectations.
+    # MC with pseudo random numbers converges at the rate of 1/ Sqrt(N)
+    # (N=number of samples). For QMC in low dimensions, the expected convergence
+    # rate is ~ 1/N. Hence we should only need 1e3 samples as compared to the
+    # 1e6 samples used in the pseudo-random monte carlo.
+    with self.test_session():
+      mu_p = array_ops.constant([-1.0, 1.0], dtype=dtypes.float64)
+      mu_q = array_ops.constant([0.0, 0.0], dtype=dtypes.float64)
+      sigma_p = array_ops.constant([0.5, 0.5], dtype=dtypes.float64)
+      sigma_q = array_ops.constant([1.0, 1.0], dtype=dtypes.float64)
+      p = normal_lib.Normal(loc=mu_p, scale=sigma_p)
+      q = normal_lib.Normal(loc=mu_q, scale=sigma_q)
+
+      cdf_sample = halton.sample(2, num_samples=n, dtype=dtypes.float64)
+      q_sample = q.quantile(cdf_sample)
+
+      # Compute E_p[X].
+      e_x = mc.expectation_importance_sampler(
+          f=lambda x: x, log_p=p.log_prob, sampling_dist_q=q, z=q_sample,
+          seed=42)
+
+      # Compute E_p[X^2].
+      e_x2 = mc.expectation_importance_sampler(
+          f=math_ops.square, log_p=p.log_prob, sampling_dist_q=q, z=q_sample,
+          seed=42)
+
+      stddev = math_ops.sqrt(e_x2 - math_ops.square(e_x))
+      # Keep the tolerance levels the same as in monte_carlo_test.py.
+      self.assertEqual(p.batch_shape, e_x.get_shape())
+      self.assertAllClose(p.mean().eval(), e_x.eval(), rtol=0.01)
+      self.assertAllClose(p.stddev().eval(), stddev.eval(), rtol=0.02)
+
+  def test_docstring_example(self):
+    # Produce the first 1000 members of the Halton sequence in 3 dimensions.
+    num_samples = 1000
+    dim = 3
+    with self.test_session():
+      sample = halton.sample(dim, num_samples=num_samples)
+
+      # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
+      # hypercube.
+      powers = math_ops.range(1.0, limit=dim + 1)
+      integral = math_ops.reduce_mean(
+          math_ops.reduce_prod(sample ** powers, axis=-1))
+      true_value = 1.0 / math_ops.reduce_prod(powers + 1.0)
+
+      # Produces a relative absolute error of 1.7%.
+      self.assertAllClose(integral.eval(), true_value.eval(), rtol=0.02)
+
+    # Now skip the first 1000 samples and recompute the integral with the next
+    # thousand samples. The sample_indices argument can be used to do this.
+
+      sample_indices = math_ops.range(start=1000, limit=1000 + num_samples,
+                                      dtype=dtypes.int32)
+      sample_leaped = halton.sample(dim, sample_indices=sample_indices)
+
+      integral_leaped = math_ops.reduce_mean(
+          math_ops.reduce_prod(sample_leaped ** powers, axis=-1))
+      self.assertAllClose(integral_leaped.eval(), true_value.eval(), rtol=0.001)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
index b1f108e5f01e4945ee83d8262f1d99877f0fe9f0..cbc66b6dc13db62c25952de6b6c13b2fdfe27f12 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/hmc_test.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for Hamiltonian Monte Carlo.
-"""
+"""Tests for Hamiltonian Monte Carlo."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -27,6 +26,7 @@ from tensorflow.contrib.bayesflow.python.ops import hmc
 
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -46,6 +46,9 @@ class HMCTest(test.TestCase):
     random_seed.set_random_seed(10003)
     np.random.seed(10003)
 
+  def assertAllFinite(self, x):
+    self.assertAllEqual(np.ones_like(x).astype(bool), np.isfinite(x))
+
   def _log_gamma_log_prob(self, x, event_dims=()):
     """Computes log-pdf of a log-gamma random variable.
 
@@ -345,5 +348,97 @@ class HMCTest(test.TestCase):
   def testAIS12(self):
     self._ais_gets_correct_log_normalizer_wrapper([1, 2])
 
+  def testNanRejection(self):
+    """Tests that an update that yields NaN potentials gets rejected.
+
+    We run HMC with a target distribution that returns NaN
+    log-likelihoods if any element of x < 0, and unit-scale
+    exponential log-likelihoods otherwise. The exponential potential
+    pushes x towards 0, ensuring that any reasonably large update will
+    push us over the edge into NaN territory.
+    """
+    def _unbounded_exponential_log_prob(x):
+      """An exponential distribution with log-likelihood NaN for x < 0."""
+      per_element_potentials = array_ops.where(x < 0,
+                                               np.nan * array_ops.ones_like(x),
+                                               -x)
+      return math_ops.reduce_sum(per_element_potentials)
+
+    with self.test_session() as sess:
+      initial_x = math_ops.linspace(0.01, 5, 10)
+      updated_x, acceptance_probs, _, _ = hmc.kernel(
+          2., 5, initial_x, _unbounded_exponential_log_prob, [0])
+      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
+          [initial_x, updated_x, acceptance_probs])
+
+      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
+      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
+      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
+
+      self.assertAllEqual(initial_x_val, updated_x_val)
+      self.assertEqual(acceptance_probs_val, 0.)
+
+  def testNanFromGradsDontPropagate(self):
+    """Test that update with NaN gradients does not cause NaN in results."""
+    def _nan_log_prob_with_nan_gradient(x):
+      return np.nan * math_ops.reduce_sum(x)
+
+    with self.test_session() as sess:
+      initial_x = math_ops.linspace(0.01, 5, 10)
+      updated_x, acceptance_probs, new_log_prob, new_grad = hmc.kernel(
+          2., 5, initial_x, _nan_log_prob_with_nan_gradient, [0])
+      initial_x_val, updated_x_val, acceptance_probs_val = sess.run(
+          [initial_x, updated_x, acceptance_probs])
+
+      logging.vlog(1, 'initial_x = {}'.format(initial_x_val))
+      logging.vlog(1, 'updated_x = {}'.format(updated_x_val))
+      logging.vlog(1, 'acceptance_probs = {}'.format(acceptance_probs_val))
+
+      self.assertAllEqual(initial_x_val, updated_x_val)
+      self.assertEqual(acceptance_probs_val, 0.)
+
+      self.assertAllFinite(
+          gradients_impl.gradients(updated_x, initial_x)[0].eval())
+      self.assertTrue(
+          gradients_impl.gradients(new_grad, initial_x)[0] is None)
+
+      # Gradients of the acceptance probs and new log prob are not finite.
+      _ = new_log_prob  # Prevent unused arg error.
+      # self.assertAllFinite(
+      #     gradients_impl.gradients(acceptance_probs, initial_x)[0].eval())
+      # self.assertAllFinite(
+      #     gradients_impl.gradients(new_log_prob, initial_x)[0].eval())
+
+  def testChainWorksIn64Bit(self):
+    def log_prob(x):
+      return - math_ops.reduce_sum(x * x, axis=-1)
+    states, acceptance_probs = hmc.chain(
+        n_iterations=10,
+        step_size=np.float64(0.01),
+        n_leapfrog_steps=10,
+        initial_x=np.zeros(5).astype(np.float64),
+        target_log_prob_fn=log_prob,
+        event_dims=[-1])
+    with self.test_session() as sess:
+      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
+    self.assertEqual(np.float64, states_.dtype)
+    self.assertEqual(np.float64, acceptance_probs_.dtype)
+
+  def testChainWorksIn16Bit(self):
+    def log_prob(x):
+      return - math_ops.reduce_sum(x * x, axis=-1)
+    states, acceptance_probs = hmc.chain(
+        n_iterations=10,
+        step_size=np.float16(0.01),
+        n_leapfrog_steps=10,
+        initial_x=np.zeros(5).astype(np.float16),
+        target_log_prob_fn=log_prob,
+        event_dims=[-1])
+    with self.test_session() as sess:
+      states_, acceptance_probs_ = sess.run([states, acceptance_probs])
+    self.assertEqual(np.float16, states_.dtype)
+    self.assertEqual(np.float16, acceptance_probs_.dtype)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..50358fd1c2b7635ffe2d08c5af3219bb0a11498b
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py
@@ -0,0 +1,304 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dense Bayesian layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.bayesflow.python.ops import layers_dense_variational_impl as prob_layers_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class Counter(object):
+  """Helper class to manage incrementing a counting `int`."""
+
+  def __init__(self):
+    self._value = -1
+
+  @property
+  def value(self):
+    return self._value
+
+  def __call__(self):
+    self._value += 1
+    return self._value
+
+
+class MockDistribution(normal_lib.Normal):
+  """Monitors DenseVariational calls to the underlying distribution."""
+
+  def __init__(self, result_sample, result_log_prob, loc=None, scale=None):
+    self.result_sample = result_sample
+    self.result_log_prob = result_log_prob
+    self.result_loc = loc
+    self.result_scale = scale
+    self.called_log_prob = Counter()
+    self.called_sample = Counter()
+    self.called_loc = Counter()
+    self.called_scale = Counter()
+
+  def log_prob(self, *args, **kwargs):
+    self.called_log_prob()
+    return self.result_log_prob
+
+  def sample(self, *args, **kwargs):
+    self.called_sample()
+    return self.result_sample
+
+  @property
+  def loc(self):
+    self.called_loc()
+    return self.result_loc
+
+  @property
+  def scale(self):
+    self.called_scale()
+    return self.result_scale
+
+
+class MockKLDivergence(object):
+  """Monitors DenseVariational calls to the divergence implementation."""
+
+  def __init__(self, result):
+    self.result = result
+    self.args = []
+    self.called = Counter()
+
+  def __call__(self, *args, **kwargs):
+    self.called()
+    self.args.append(args)
+    return self.result
+
+
+class DenseVariationalLocalReparametrization(test.TestCase):
+
+  def testKLPenaltyKernel(self):
+    with self.test_session():
+      dense_vi = prob_layers_lib.DenseVariational(units=2)
+      inputs = random_ops.random_uniform([2, 3], seed=1)
+
+      # No keys.
+      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(loss_keys), 0)
+      self.assertListEqual(dense_vi.losses, loss_keys)
+
+      _ = dense_vi(inputs)
+
+      # Yes keys.
+      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(loss_keys), 1)
+      self.assertListEqual(dense_vi.losses, loss_keys)
+
+  def testKLPenaltyBoth(self):
+    def _make_normal(dtype, *args):  # pylint: disable=unused-argument
+      return normal_lib.Normal(
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.))
+    with self.test_session():
+      dense_vi = prob_layers_lib.DenseVariational(
+          units=2,
+          bias_posterior_fn=prob_layers_lib.default_mean_field_normal_fn(),
+          bias_prior_fn=_make_normal)
+      inputs = random_ops.random_uniform([2, 3], seed=1)
+
+      # No keys.
+      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(loss_keys), 0)
+      self.assertListEqual(dense_vi.losses, loss_keys)
+
+      _ = dense_vi(inputs)
+
+      # Yes keys.
+      loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+      self.assertEqual(len(loss_keys), 2)
+      self.assertListEqual(dense_vi.losses, loss_keys)
+
+  def testVariationalNonLocal(self):
+    batch_size, in_size, out_size = 2, 3, 4
+    with self.test_session() as sess:
+      seed = Counter()
+      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
+
+      kernel_size = [in_size, out_size]
+      kernel_posterior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+      kernel_prior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+      kernel_divergence = MockKLDivergence(
+          result=random_ops.random_uniform(kernel_size, seed=seed()))
+
+      bias_size = [out_size]
+      bias_posterior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+      bias_prior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+      bias_divergence = MockKLDivergence(
+          result=random_ops.random_uniform(bias_size, seed=seed()))
+
+      expected_outputs = (
+          math_ops.matmul(inputs, kernel_posterior.result_sample) +
+          bias_posterior.result_sample)
+
+      dense_vi = prob_layers_lib.DenseVariational(
+          units=2,
+          kernel_use_local_reparameterization=False,
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          kernel_prior_fn=lambda *args: kernel_prior,
+          kernel_divergence_fn=kernel_divergence,
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          bias_prior_fn=lambda *args: bias_prior,
+          bias_divergence_fn=bias_divergence)
+
+      outputs = dense_vi(inputs)
+
+      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_, actual_kernel_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_posterior.result_sample, dense_vi.kernel.posterior_tensor,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, dense_vi.bias.posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
+
+      self.assertAllClose(
+          expected_kernel_, actual_kernel_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior, kernel_prior, kernel_posterior.result_sample]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior, bias_prior, bias_posterior.result_sample]],
+          bias_divergence.args)
+
+  def testVariationalLocal(self):
+    batch_size, in_size, out_size = 2, 3, 4
+    with self.test_session() as sess:
+      seed = Counter()
+      inputs = random_ops.random_uniform([batch_size, in_size], seed=seed())
+
+      kernel_size = [in_size, out_size]
+      kernel_posterior = MockDistribution(
+          loc=random_ops.random_uniform(kernel_size, seed=seed()),
+          scale=random_ops.random_uniform(kernel_size, seed=seed()),
+          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+      kernel_prior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()),
+          result_sample=random_ops.random_uniform(kernel_size, seed=seed()))
+      kernel_divergence = MockKLDivergence(
+          result=random_ops.random_uniform(kernel_size, seed=seed()))
+
+      bias_size = [out_size]
+      bias_posterior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+      bias_prior = MockDistribution(
+          result_log_prob=random_ops.random_uniform(bias_size, seed=seed()),
+          result_sample=random_ops.random_uniform(bias_size, seed=seed()))
+      bias_divergence = MockKLDivergence(
+          result=random_ops.random_uniform(bias_size, seed=seed()))
+
+      expected_kernel_posterior_affine = normal_lib.Normal(
+          loc=math_ops.matmul(inputs, kernel_posterior.result_loc),
+          scale=math_ops.matmul(
+              inputs**2., kernel_posterior.result_scale**2)**0.5)
+      expected_kernel_posterior_affine_tensor = (
+          expected_kernel_posterior_affine.sample(seed=42))
+      expected_outputs = (expected_kernel_posterior_affine_tensor +
+                          bias_posterior.result_sample)
+
+      dense_vi = prob_layers_lib.DenseVariational(
+          units=2,
+          kernel_use_local_reparameterization=True,
+          kernel_posterior_fn=lambda *args: kernel_posterior,
+          kernel_posterior_tensor_fn=lambda d: d.sample(seed=42),
+          kernel_prior_fn=lambda *args: kernel_prior,
+          kernel_divergence_fn=kernel_divergence,
+          bias_posterior_fn=lambda *args: bias_posterior,
+          bias_posterior_tensor_fn=lambda d: d.sample(seed=43),
+          bias_prior_fn=lambda *args: bias_prior,
+          bias_divergence_fn=bias_divergence)
+
+      outputs = dense_vi(inputs)
+
+      kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)
+
+      [
+          expected_outputs_, actual_outputs_,
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          expected_bias_, actual_bias_,
+          expected_bias_divergence_, actual_bias_divergence_,
+      ] = sess.run([
+          expected_outputs, outputs,
+          kernel_divergence.result, kl_penalty[0],
+          bias_posterior.result_sample, dense_vi.bias.posterior_tensor,
+          bias_divergence.result, kl_penalty[1],
+      ])
+
+      self.assertAllClose(
+          expected_bias_, actual_bias_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_outputs_, actual_outputs_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_kernel_divergence_, actual_kernel_divergence_,
+          rtol=1e-6, atol=0.)
+      self.assertAllClose(
+          expected_bias_divergence_, actual_bias_divergence_,
+          rtol=1e-6, atol=0.)
+
+      self.assertAllEqual(
+          [[kernel_posterior, kernel_prior, None]],
+          kernel_divergence.args)
+
+      self.assertAllEqual(
+          [[bias_posterior, bias_prior, bias_posterior.result_sample]],
+          bias_divergence.args)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..66793383fdd5c71f136900197a91be6966e2f8c7
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
@@ -0,0 +1,209 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for GradientDescent."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import math
+from tensorflow.contrib.bayesflow.python.ops.optimizers import SGLDOptimizer
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class SGLDOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        decay_rate = 0.53
+        sgd_op = SGLDOptimizer(
+            3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
+                                              (1 - decay_rate) * 0.1**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
+        grads_scaled = (0.5 * 0.01 / math.sqrt(
+            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
+
+  def testBasicMultiInstance(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        vara = variables.Variable([1.1, 2.1], dtype=dtype)
+        varb = variables.Variable([3.0, 4.0], dtype=dtype)
+        gradsa = constant_op.constant([0.1, 0.1], dtype=dtype)
+        gradsb = constant_op.constant([0.01, 0.01], dtype=dtype)
+        decay_rate = 0.5
+        sgd_optimizer = SGLDOptimizer(3.0, preconditioner_decay_rate=decay_rate)
+        sgd_op = sgd_optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        sgd_optimizer2 = SGLDOptimizer(
+            3.0, preconditioner_decay_rate=decay_rate)
+        sgd_op2 = sgd_optimizer2.apply_gradients(
+            zip([gradsa, gradsb], [vara, varb]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.1, 2.1], vara.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], varb.eval())
+
+        # Run 1 step of sgd
+        sgd_op.run()
+        sgd_op2.run()
+        # Validate updated params
+        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
+                                              (1 - decay_rate) * 0.1**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
+        self.assertAllCloseAccordingToType(
+            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], vara.eval())
+
+        grads_scaled = (0.5 * 0.01 / math.sqrt(
+            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], varb.eval())
+        self.assertNotEqual(sgd_optimizer.variable_scope,
+                            sgd_optimizer2.variable_scope)
+        self.assertNotEqual(sgd_optimizer.variable_scope.name,
+                            sgd_optimizer2.variable_scope.name)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        decay_rate = 0.5
+        sgd_op = SGLDOptimizer(
+            lrate, preconditioner_decay_rate=constant_op.constant(
+                decay_rate)).apply_gradients(
+                    zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
+                                              (1 - decay_rate) * 0.1**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
+        grads_scaled = (0.5 * 0.01 / math.sqrt(
+            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
+
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        opt = SGLDOptimizer(3.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        variables.global_variables_initializer().run()
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = variables.Variable([1.1, 2.1], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        decay_rate = 0.1
+        sgd_op = SGLDOptimizer(
+            3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
+                zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+
+        # Validate updated params and global_step
+        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
+                                              (1 - decay_rate) * 0.1**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
+        grads_scaled = (0.5 * 0.01 / math.sqrt(
+            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.1], [2.1]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        decay_rate = 0.9
+        sgd_op = SGLDOptimizer(
+            3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.1], [2.1]], var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
+                                              (1 - decay_rate) * 0.1**2 + 1e-8))
+        self.assertAllCloseAccordingToType([[1.1 - 3.0 * grads_scaled], [2.1]],
+                                           var0.eval())
+        grads_scaled = (0.5 * 0.01 / math.sqrt(
+            decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
+        self.assertAllCloseAccordingToType(
+            [[3.0 - 3.0 * 0], [4.0 - 3.0 * grads_scaled]], var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py
deleted file mode 100644
index 9b1f482b34967082d6ac44494123879fb8fb0ee3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from tensorflow.contrib import distributions
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-st = stochastic_tensor
-sge = stochastic_gradient_estimators
-dists = distributions
-
-
-def _vimco(loss):
-  """Python implementation of VIMCO."""
-  n = loss.shape[0]
-  log_loss = np.log(loss)
-  geometric_mean = []
-  for j in range(n):
-    geometric_mean.append(
-        np.exp(np.mean([log_loss[i, :] for i in range(n) if i != j], 0)))
-  geometric_mean = np.array(geometric_mean)
-
-  learning_signal = []
-  for j in range(n):
-    learning_signal.append(np.sum([loss[i, :] for i in range(n) if i != j], 0))
-  learning_signal = np.array(learning_signal)
-
-  local_learning_signal = np.log(1 / n * (learning_signal + geometric_mean))
-
-  # log_mean - local_learning_signal
-  log_mean = np.log(np.mean(loss, 0))
-  advantage = log_mean - local_learning_signal
-
-  return advantage
-
-
-class StochasticGradientEstimatorsTest(test.TestCase):
-
-  def setUp(self):
-    self._p = constant_op.constant(0.999999)
-    self._final_loss = constant_op.constant(3.2)
-
-  def _testScoreFunction(self, loss_fn, expected):
-    x = st.StochasticTensor(dists.Bernoulli(probs=self._p), loss_fn=loss_fn)
-    sf = x.loss(self._final_loss)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllClose(*sess.run([expected, sf]))
-
-  def testScoreFunction(self):
-    expected = math_ops.log(self._p) * self._final_loss
-    self._testScoreFunction(sge.score_function, expected)
-
-  def testScoreFunctionWithConstantBaseline(self):
-    b = constant_op.constant(9.8)
-    expected = math_ops.log(self._p) * (self._final_loss - b)
-    self._testScoreFunction(
-        sge.get_score_function_with_constant_baseline(b), expected)
-
-  def testScoreFunctionWithBaselineFn(self):
-    b = constant_op.constant(9.8)
-
-    def baseline_fn(stoch_tensor, loss):
-      self.assertTrue(isinstance(stoch_tensor, st.StochasticTensor))
-      self.assertTrue(isinstance(loss, ops.Tensor))
-      return b
-
-    expected = math_ops.log(self._p) * (self._final_loss - b)
-    self._testScoreFunction(
-        sge.get_score_function_with_baseline(baseline_fn), expected)
-
-  def testScoreFunctionWithMeanBaseline(self):
-    ema_decay = 0.8
-    num_steps = 6
-    x = st.StochasticTensor(
-        dists.Bernoulli(probs=self._p),
-        loss_fn=sge.get_score_function_with_baseline(
-            sge.get_mean_baseline(ema_decay)))
-    sf = x.loss(self._final_loss)
-
-    # Expected EMA value
-    ema = 0.
-    for _ in range(num_steps):
-      ema -= (1. - ema_decay) * (ema - self._final_loss)
-
-    # Baseline is EMA with bias correction
-    bias_correction = 1. - ema_decay**num_steps
-    baseline = ema / bias_correction
-    expected = math_ops.log(self._p) * (self._final_loss - baseline)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      for _ in range(num_steps - 1):
-        sess.run(sf)  # run to update EMA
-      self.assertAllClose(*sess.run([expected, sf]))
-
-  def testScoreFunctionWithAdvantageFn(self):
-    b = constant_op.constant(9.8)
-
-    def advantage_fn(stoch_tensor, loss):
-      self.assertTrue(isinstance(stoch_tensor, st.StochasticTensor))
-      self.assertTrue(isinstance(loss, ops.Tensor))
-      return loss - b
-
-    expected = math_ops.log(self._p) * (self._final_loss - b)
-    self._testScoreFunction(
-        sge.get_score_function_with_advantage(advantage_fn), expected)
-
-  def testVIMCOAdvantageFn(self):
-    # simple_loss: (3, 2) with 3 samples, batch size 2
-    simple_loss = np.array(
-        [[1.0, 1.5],
-         [1e-6, 1e4],
-         [2.0, 3.0]])
-    # random_loss: (100, 50, 64) with 100 samples, batch shape (50, 64)
-    random_loss = 100 * np.random.rand(100, 50, 64)
-
-    advantage_fn = sge.get_vimco_advantage_fn(have_log_loss=False)
-
-    with self.test_session() as sess:
-      for loss in [simple_loss, random_loss]:
-        expected = _vimco(loss)
-        loss_t = constant_op.constant(loss, dtype=dtypes.float32)
-        advantage_t = advantage_fn(None, loss_t)  # ST is not used
-        advantage = sess.run(advantage_t)
-        self.assertEqual(expected.shape, advantage_t.get_shape())
-        self.assertAllClose(expected, advantage, atol=5e-5)
-
-  def testVIMCOAdvantageGradients(self):
-    loss = np.log(
-        [[1.0, 1.5],
-         [1e-6, 1e4],
-         [2.0, 3.0]])
-    advantage_fn = sge.get_vimco_advantage_fn(have_log_loss=True)
-
-    with self.test_session():
-      loss_t = constant_op.constant(loss, dtype=dtypes.float64)
-      advantage_t = advantage_fn(None, loss_t)  # ST is not used
-      gradient_error = gradient_checker.compute_gradient_error(
-          loss_t,
-          loss_t.get_shape().as_list(),
-          advantage_t,
-          advantage_t.get_shape().as_list(),
-          x_init_value=loss)
-      self.assertLess(gradient_error, 1e-3)
-
-  def testVIMCOAdvantageWithSmallProbabilities(self):
-    theta_value = np.random.rand(10, 100000)
-    # Test with float16 dtype to ensure stability even in this extreme case.
-    theta = constant_op.constant(theta_value, dtype=dtypes.float16)
-    advantage_fn = sge.get_vimco_advantage_fn(have_log_loss=True)
-
-    with self.test_session() as sess:
-      log_loss = -math_ops.reduce_sum(theta, [1])
-      advantage_t = advantage_fn(None, log_loss)
-      grad_t = gradients_impl.gradients(advantage_t, theta)[0]
-      advantage, grad = sess.run((advantage_t, grad_t))
-      self.assertTrue(np.all(np.isfinite(advantage)))
-      self.assertTrue(np.all(np.isfinite(grad)))
-
-  def testScoreFunctionWithMeanBaselineHasUniqueVarScope(self):
-    ema_decay = 0.8
-    x = st.StochasticTensor(
-        dists.Bernoulli(probs=self._p),
-        loss_fn=sge.get_score_function_with_baseline(
-            sge.get_mean_baseline(ema_decay)))
-    y = st.StochasticTensor(
-        dists.Bernoulli(probs=self._p),
-        loss_fn=sge.get_score_function_with_baseline(
-            sge.get_mean_baseline(ema_decay)))
-    sf_x = x.loss(self._final_loss)
-    sf_y = y.loss(self._final_loss)
-    with self.test_session() as sess:
-      # Smoke test
-      sess.run(variables.global_variables_initializer())
-      sess.run([sf_x, sf_y])
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_graph_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_graph_test.py
deleted file mode 100644
index 44e27db03b18d0e6a789db676bea684c10dcfca7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_graph_test.py
+++ /dev/null
@@ -1,246 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.bayesflow.python.ops import stochastic_graph_impl
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-st = stochastic_tensor
-sg = stochastic_graph_impl
-distributions = distributions_lib
-
-
-class NormalNotParam(distributions.Normal):
-
-  @property
-  def reparameterization_type(self):
-    return distributions.NOT_REPARAMETERIZED
-
-
-class TestSurrogateLosses(test.TestCase):
-
-  def testPathwiseDerivativeDoesNotAddSurrogateLosses(self):
-    with self.test_session():
-      mu = [0.0, 0.1, 0.2]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(distributions.Normal(loc=mu, scale=sigma))
-        likelihood = st.StochasticTensor(
-            distributions.Normal(
-                loc=prior, scale=sigma))
-        self.assertEqual(
-            prior.distribution.reparameterization_type,
-            distributions.FULLY_REPARAMETERIZED)
-        self.assertEqual(
-            likelihood.distribution.reparameterization_type,
-            distributions.FULLY_REPARAMETERIZED)
-
-      loss = math_ops.square(array_ops.identity(likelihood) - [0.0, 0.1, 0.2])
-      sum_loss = math_ops.reduce_sum(loss)
-
-      surrogate_loss = sg.surrogate_loss([loss])
-      with self.assertRaisesRegexp(ValueError, "dimensionality 1 or greater"):
-        _ = sg.surrogate_loss([sum_loss])
-      surrogate_from_both = sg.surrogate_loss(
-          [loss, sum_loss * array_ops.ones_like(loss)])
-
-      # Pathwise derivative terms do not require add'l surrogate loss terms.
-      with self.test_session() as sess:
-        self.assertAllClose(*sess.run([loss, surrogate_loss]))
-        self.assertAllClose(*sess.run([(loss + sum_loss), surrogate_from_both]))
-
-  def _testSurrogateLoss(self, session, losses, expected_addl_terms, xs):
-    surrogate_loss = sg.surrogate_loss(losses)
-    expected_surrogate_loss = math_ops.add_n(losses + expected_addl_terms)
-    self.assertAllClose(*session.run([surrogate_loss, expected_surrogate_loss]))
-
-    # Test backprop
-    expected_grads = gradients_impl.gradients(ys=expected_surrogate_loss, xs=xs)
-    surrogate_grads = gradients_impl.gradients(ys=surrogate_loss, xs=xs)
-    self.assertEqual(len(expected_grads), len(surrogate_grads))
-    grad_values = session.run(expected_grads + surrogate_grads)
-    n_grad = len(expected_grads)
-    self.assertAllClose(grad_values[:n_grad], grad_values[n_grad:])
-
-  def testSurrogateLoss(self):
-    with self.test_session() as sess:
-      mu = constant_op.constant([0.0, 0.1, 0.2])
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-        likelihood = st.StochasticTensor(NormalNotParam(loc=prior, scale=sigma))
-        prior_2 = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-
-      loss = math_ops.square(array_ops.identity(likelihood) - mu)
-      part_loss = math_ops.square(array_ops.identity(prior) - mu)
-      sum_loss = math_ops.reduce_sum(loss)
-      loss_nodeps = math_ops.square(array_ops.identity(prior_2) - mu)
-
-      # For ground truth, use the stop-gradient versions of the losses
-      loss_nograd = array_ops.stop_gradient(loss)
-      loss_nodeps_nograd = array_ops.stop_gradient(loss_nodeps)
-      sum_loss_nograd = array_ops.stop_gradient(sum_loss)
-
-      # These score functions should ignore prior_2
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss],
-          expected_addl_terms=[
-              likelihood.distribution.log_prob(
-                  likelihood.value()) * loss_nograd,
-              prior.distribution.log_prob(prior.value()) * loss_nograd
-          ],
-          xs=[mu, sigma])
-
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss, part_loss],
-          expected_addl_terms=[
-              likelihood.distribution.log_prob(
-                  likelihood.value()) * loss_nograd,
-              (prior.distribution.log_prob(prior.value()) *
-               array_ops.stop_gradient(part_loss + loss))
-          ],
-          xs=[mu, sigma])
-
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[sum_loss * array_ops.ones_like(loss)],
-          expected_addl_terms=[(
-              likelihood.distribution.log_prob(likelihood.value()) *
-              sum_loss_nograd), prior.distribution.log_prob(prior.value()) *
-                               sum_loss_nograd],
-          xs=[mu, sigma])
-
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss, sum_loss * array_ops.ones_like(loss)],
-          expected_addl_terms=[(
-              likelihood.distribution.log_prob(likelihood.value()) *
-              array_ops.stop_gradient(loss + sum_loss)),
-                               (prior.distribution.log_prob(prior.value()) *
-                                array_ops.stop_gradient(loss + sum_loss))],
-          xs=[mu, sigma])
-
-      # These score functions should ignore prior and likelihood
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss_nodeps],
-          expected_addl_terms=[(prior_2.distribution.log_prob(prior_2.value()) *
-                                loss_nodeps_nograd)],
-          xs=[mu, sigma])
-
-      # These score functions should include all terms selectively
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss, loss_nodeps],
-          # We can't guarantee ordering of output losses in this case.
-          expected_addl_terms=[(
-              likelihood.distribution.log_prob(likelihood.value()) *
-              loss_nograd), prior.distribution.log_prob(prior.value()) *
-                               loss_nograd,
-                               (prior_2.distribution.log_prob(prior_2.value()) *
-                                loss_nodeps_nograd)],
-          xs=[mu, sigma])
-
-  def testNoSurrogateLoss(self):
-    with self.test_session():
-      mu = constant_op.constant([0.0, 0.1, 0.2])
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        dt = st.StochasticTensor(
-            NormalNotParam(
-                loc=mu, scale=sigma), loss_fn=None)
-        self.assertEqual(None, dt.loss(constant_op.constant([2.0])))
-
-  def testExplicitStochasticTensors(self):
-    with self.test_session() as sess:
-      mu = constant_op.constant([0.0, 0.1, 0.2])
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        dt1 = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-        dt2 = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-        loss = math_ops.square(array_ops.identity(dt1)) + 10. + dt2
-
-        sl_all = sg.surrogate_loss([loss])
-        sl_dt1 = sg.surrogate_loss([loss], stochastic_tensors=[dt1])
-        sl_dt2 = sg.surrogate_loss([loss], stochastic_tensors=[dt2])
-
-        dt1_term = dt1.distribution.log_prob(dt1) * loss
-        dt2_term = dt2.distribution.log_prob(dt2) * loss
-
-        self.assertAllClose(*sess.run(
-            [sl_all, sum([loss, dt1_term, dt2_term])]))
-        self.assertAllClose(*sess.run([sl_dt1, sum([loss, dt1_term])]))
-        self.assertAllClose(*sess.run([sl_dt2, sum([loss, dt2_term])]))
-
-
-class StochasticDependenciesMapTest(test.TestCase):
-
-  def testBuildsMapOfUpstreamNodes(self):
-    dt1 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    dt2 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    out1 = dt1.value() + 1.
-    out2 = dt2.value() + 2.
-    x = out1 + out2
-    y = out2 * 3.
-    dep_map = sg._stochastic_dependencies_map([x, y])
-    self.assertEqual(dep_map[dt1], set([x]))
-    self.assertEqual(dep_map[dt2], set([x, y]))
-
-  def testHandlesStackedStochasticNodes(self):
-    dt1 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    out1 = dt1.value() + 1.
-    dt2 = st.StochasticTensor(distributions.Normal(loc=out1, scale=1.))
-    x = dt2.value() + 2.
-    dt3 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    y = dt3.value() * 3.
-    dep_map = sg._stochastic_dependencies_map([x, y])
-    self.assertEqual(dep_map[dt1], set([x]))
-    self.assertEqual(dep_map[dt2], set([x]))
-    self.assertEqual(dep_map[dt3], set([y]))
-
-  def testTraversesControlInputs(self):
-    dt1 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    logits = dt1.value() * 3.
-    dt2 = st.StochasticTensor(distributions.Bernoulli(logits=logits))
-    dt3 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    x = dt3.value()
-    y = array_ops.ones((2, 2)) * 4.
-    z = array_ops.ones((2, 2)) * 3.
-    out = control_flow_ops.cond(
-        math_ops.cast(dt2, dtypes.bool), lambda: math_ops.add(x, y),
-        lambda: math_ops.square(z))
-    out += 5.
-    dep_map = sg._stochastic_dependencies_map([out])
-    self.assertEqual(dep_map[dt1], set([out]))
-    self.assertEqual(dep_map[dt2], set([out]))
-    self.assertEqual(dep_map[dt3], set([out]))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
deleted file mode 100644
index 6d0cff4678972719cb5c565bc409041e298beadb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.distributions import normal
-from tensorflow.python.platform import test
-
-sge = stochastic_gradient_estimators
-st = stochastic_tensor_impl
-
-
-class StochasticTensorTest(test.TestCase):
-
-  def testConstructionAndValue(self):
-    with self.test_session() as sess:
-      mu = [0.0, 0.1, 0.2]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      sigma2 = constant_op.constant([0.1, 0.2, 0.3])
-
-      prior_default = st.StochasticTensor(
-          normal.Normal(loc=mu, scale=sigma))
-      self.assertTrue(isinstance(prior_default.value_type, st.SampleValue))
-      prior_0 = st.StochasticTensor(
-          normal.Normal(loc=mu, scale=sigma),
-          dist_value_type=st.SampleValue())
-      self.assertTrue(isinstance(prior_0.value_type, st.SampleValue))
-
-      with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-        self.assertTrue(isinstance(prior.value_type, st.SampleValue))
-        likelihood = st.StochasticTensor(
-            normal.Normal(loc=prior, scale=sigma2))
-        self.assertTrue(isinstance(likelihood.value_type, st.SampleValue))
-
-      coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
-      self.assertEqual(coll, [prior_default, prior_0, prior, likelihood])
-
-      # Also works: tf.convert_to_tensor(prior)
-      prior_default = array_ops.identity(prior_default)
-      prior_0 = array_ops.identity(prior_0)
-      prior = array_ops.identity(prior)
-      likelihood = array_ops.identity(likelihood)
-
-      # Mostly a smoke test for now...
-      prior_0_val, prior_val, prior_default_val, _ = sess.run(
-          [prior_0, prior, prior_default, likelihood])
-
-      self.assertEqual(prior_0_val.shape, prior_val.shape)
-      self.assertEqual(prior_default_val.shape, prior_val.shape)
-      # These are different random samples from the same distribution,
-      # so the values should differ.
-      self.assertGreater(np.abs(prior_0_val - prior_val).sum(), 1e-6)
-      self.assertGreater(np.abs(prior_default_val - prior_val).sum(), 1e-6)
-
-  def testMeanValue(self):
-    with self.test_session() as sess:
-      mu = [0.0, -1.0, 1.0]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-
-      with st.value_type(st.MeanValue()):
-        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-        self.assertTrue(isinstance(prior.value_type, st.MeanValue))
-
-      prior_mean = prior.mean()
-      prior_value = prior.value()
-
-      prior_mean_val, prior_value_val = sess.run([prior_mean, prior_value])
-      self.assertAllEqual(prior_mean_val, mu)
-      self.assertAllEqual(prior_mean_val, prior_value_val)
-
-  def testSampleValueScalar(self):
-    with self.test_session() as sess:
-      mu = [[0.0, -1.0, 1.0], [0.0, -1.0, 1.0]]
-      sigma = constant_op.constant([[1.1, 1.2, 1.3], [1.1, 1.2, 1.3]])
-
-      with st.value_type(st.SampleValue()):
-        prior_single = st.StochasticTensor(
-            normal.Normal(loc=mu, scale=sigma))
-
-      prior_single_value = prior_single.value()
-      self.assertEqual(prior_single_value.get_shape(), (2, 3))
-
-      prior_single_value_val = sess.run([prior_single_value])[0]
-      self.assertEqual(prior_single_value_val.shape, (2, 3))
-
-      with st.value_type(st.SampleValue(1)):
-        prior_single = st.StochasticTensor(
-            normal.Normal(loc=mu, scale=sigma))
-        self.assertTrue(isinstance(prior_single.value_type, st.SampleValue))
-
-      prior_single_value = prior_single.value()
-      self.assertEqual(prior_single_value.get_shape(), (1, 2, 3))
-
-      prior_single_value_val = sess.run([prior_single_value])[0]
-      self.assertEqual(prior_single_value_val.shape, (1, 2, 3))
-
-      with st.value_type(st.SampleValue(2)):
-        prior_double = st.StochasticTensor(
-            normal.Normal(loc=mu, scale=sigma))
-
-      prior_double_value = prior_double.value()
-      self.assertEqual(prior_double_value.get_shape(), (2, 2, 3))
-
-      prior_double_value_val = sess.run([prior_double_value])[0]
-      self.assertEqual(prior_double_value_val.shape, (2, 2, 3))
-
-  def testDistributionEntropy(self):
-    with self.test_session() as sess:
-      mu = [0.0, -1.0, 1.0]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.MeanValue()):
-        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-        entropy = prior.entropy()
-        deep_entropy = prior.distribution.entropy()
-        expected_deep_entropy = normal.Normal(
-            loc=mu, scale=sigma).entropy()
-        entropies = sess.run([entropy, deep_entropy, expected_deep_entropy])
-        self.assertAllEqual(entropies[2], entropies[0])
-        self.assertAllEqual(entropies[1], entropies[0])
-
-  def testSurrogateLoss(self):
-    with self.test_session():
-      mu = [[3.0, -4.0, 5.0], [6.0, -7.0, 8.0]]
-      sigma = constant_op.constant(1.0)
-
-      # With default
-      with st.value_type(st.MeanValue(stop_gradient=True)):
-        dt = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-      loss = dt.loss([constant_op.constant(2.0)])
-      self.assertTrue(loss is not None)
-      self.assertAllClose(
-          dt.distribution.log_prob(mu).eval() * 2.0, loss.eval())
-
-      # With passed-in loss_fn.
-      dt = st.StochasticTensor(
-          normal.Normal(loc=mu, scale=sigma),
-          dist_value_type=st.MeanValue(stop_gradient=True),
-          loss_fn=sge.get_score_function_with_constant_baseline(
-              baseline=constant_op.constant(8.0)))
-      loss = dt.loss([constant_op.constant(2.0)])
-      self.assertTrue(loss is not None)
-      self.assertAllClose((dt.distribution.log_prob(mu) * (2.0 - 8.0)).eval(),
-                          loss.eval())
-
-
-class ValueTypeTest(test.TestCase):
-
-  def testValueType(self):
-    type_mean = st.MeanValue()
-    type_reshape = st.SampleValue()
-    type_full = st.SampleValue()
-    with st.value_type(type_mean):
-      self.assertEqual(st.get_current_value_type(), type_mean)
-      with st.value_type(type_reshape):
-        self.assertEqual(st.get_current_value_type(), type_reshape)
-      with st.value_type(type_full):
-        self.assertEqual(st.get_current_value_type(), type_full)
-      self.assertEqual(st.get_current_value_type(), type_mean)
-    with self.assertRaisesRegexp(ValueError, "No value type currently set"):
-      st.get_current_value_type()
-
-
-class ObservedStochasticTensorTest(test.TestCase):
-
-  def testConstructionAndValue(self):
-    with self.test_session() as sess:
-      mu = [0.0, 0.1, 0.2]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      obs = array_ops.zeros((2, 3))
-      z = st.ObservedStochasticTensor(
-          normal.Normal(loc=mu, scale=sigma), value=obs)
-      [obs_val, z_val] = sess.run([obs, z.value()])
-      self.assertAllEqual(obs_val, z_val)
-
-      coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
-      self.assertEqual(coll, [z])
-
-  def testConstructionWithUnknownShapes(self):
-    mu = array_ops.placeholder(dtypes.float32)
-    sigma = array_ops.placeholder(dtypes.float32)
-    obs = array_ops.placeholder(dtypes.float32)
-    z = st.ObservedStochasticTensor(
-        normal.Normal(loc=mu, scale=sigma), value=obs)
-
-    mu2 = array_ops.placeholder(dtypes.float32, shape=[None])
-    sigma2 = array_ops.placeholder(dtypes.float32, shape=[None])
-    obs2 = array_ops.placeholder(dtypes.float32, shape=[None, None])
-    z2 = st.ObservedStochasticTensor(
-        normal.Normal(loc=mu2, scale=sigma2), value=obs2)
-
-    coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
-    self.assertEqual(coll, [z, z2])
-
-  def testConstructionErrors(self):
-    mu = [0., 0.]
-    sigma = [1., 1.]
-    self.assertRaises(
-        ValueError,
-        st.ObservedStochasticTensor,
-        normal.Normal(loc=mu, scale=sigma),
-        value=array_ops.zeros((3,)))
-    self.assertRaises(
-        ValueError,
-        st.ObservedStochasticTensor,
-        normal.Normal(loc=mu, scale=sigma),
-        value=array_ops.zeros((3, 1)))
-    self.assertRaises(
-        ValueError,
-        st.ObservedStochasticTensor,
-        normal.Normal(loc=mu, scale=sigma),
-        value=array_ops.zeros((1, 2), dtype=dtypes.int32))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
deleted file mode 100644
index 9ee59a03ca76c6095e34b869d9b175e2c9223cd7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from tensorflow.contrib import distributions
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.contrib.bayesflow.python.ops import stochastic_variables
-from tensorflow.contrib.bayesflow.python.ops import variational_inference_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-sv = stochastic_variables
-st = stochastic_tensor
-vi = variational_inference_impl
-dist = distributions
-
-
-class StochasticVariablesTest(test.TestCase):
-
-  def testStochasticVariables(self):
-    shape = (10, 20)
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale)):
-      v = variable_scope.get_variable("sv", shape)
-
-    self.assertTrue(isinstance(v, st.StochasticTensor))
-    self.assertTrue(isinstance(v.distribution, dist.NormalWithSoftplusScale))
-
-    self.assertEqual(
-        {"stochastic_variables/sv_loc", "stochastic_variables/sv_scale"},
-        set([v.op.name for v in variables.global_variables()]))
-    self.assertEqual(
-        set(variables.trainable_variables()), set(variables.global_variables()))
-
-    v = ops.convert_to_tensor(v)
-    self.assertEqual(list(shape), v.get_shape().as_list())
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(shape, sess.run(v).shape)
-
-  def testStochasticVariablesWithConstantInitializer(self):
-    shape = (10, 20)
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale,
-            dist_kwargs={"validate_args": True},
-            param_initializers={
-                "loc": np.ones(shape) * 4.,
-                "scale": np.ones(shape) * 2.
-            })):
-      v = variable_scope.get_variable("sv")
-
-    for var in variables.global_variables():
-      if "loc" in var.name:
-        mu_var = var
-      if "scale" in var.name:
-        sigma_var = var
-
-    v = ops.convert_to_tensor(v)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(np.ones(shape) * 4., sess.run(mu_var))
-      self.assertAllEqual(np.ones(shape) * 2., sess.run(sigma_var))
-      self.assertEqual(shape, sess.run(v).shape)
-
-  def testStochasticVariablesWithCallableInitializer(self):
-    shape = (10, 20)
-
-    def sigma_init(shape, dtype, partition_info):
-      _ = partition_info
-      return array_ops.ones(shape, dtype=dtype) * 2.
-
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale,
-            dist_kwargs={"validate_args": True},
-            param_initializers={
-                "loc": np.ones(
-                    shape, dtype=np.float32) * 4.,
-                "scale": sigma_init
-            })):
-      v = variable_scope.get_variable("sv", shape)
-
-    for var in variables.global_variables():
-      if "loc" in var.name:
-        mu_var = var
-      if "scale" in var.name:
-        sigma_var = var
-
-    v = ops.convert_to_tensor(v)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(np.ones(shape) * 4., sess.run(mu_var))
-      self.assertAllEqual(np.ones(shape) * 2., sess.run(sigma_var))
-      self.assertEqual(shape, sess.run(v).shape)
-
-  def testStochasticVariablesWithPrior(self):
-    shape = (10, 20)
-    prior = dist.Normal(0., 1.)
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale, prior=prior)):
-      w = variable_scope.get_variable("weights", shape)
-
-    x = random_ops.random_uniform((8, 10))
-    y = math_ops.matmul(x, w)
-
-    prior_map = vi._find_variational_and_priors(y, None)
-    self.assertEqual(prior_map[w], prior)
-    elbo = vi.elbo(y, keep_batch_dim=False)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      sess.run(elbo)
-
-  def testStochasticVariablesWithCallablePriorInitializer(self):
-
-    def prior_init(shape, dtype):
-      return dist.Normal(
-          array_ops.zeros(shape, dtype), array_ops.ones(shape, dtype))
-
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale, prior=prior_init)):
-      w = variable_scope.get_variable("weights", (10, 20))
-
-    x = random_ops.random_uniform((8, 10))
-    y = math_ops.matmul(x, w)
-
-    prior_map = vi._find_variational_and_priors(y, None)
-    self.assertTrue(isinstance(prior_map[w], dist.Normal))
-    elbo = vi.elbo(y, keep_batch_dim=False)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      sess.run(elbo)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
deleted file mode 100644
index fff6b74b2efed27abd7b25cbe0e8e8b3904767e1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for variational inference."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib import layers
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.contrib.bayesflow.python.ops import variational_inference_impl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.distributions import kullback_leibler
-from tensorflow.python.ops.distributions import normal
-from tensorflow.python.platform import test
-
-st = stochastic_tensor
-vi = variational_inference_impl
-distributions = distributions_lib
-
-
-class NormalNoEntropy(distributions.Normal):
-
-  def entropy(self):
-    raise NotImplementedError("entropy not implemented")
-
-
-# For mini-VAE
-def inference_net(x, latent_size):
-  return layers.linear(x, latent_size)
-
-
-def generative_net(z, data_size):
-  return layers.linear(z, data_size)
-
-
-def mini_vae():
-  x = [[-6., 3., 6.], [-8., 4., 8.]]
-  prior = distributions.Normal(loc=0., scale=1.)
-  variational = st.StochasticTensor(
-      distributions.Normal(
-          loc=inference_net(x, 1), scale=1.))
-  vi.register_prior(variational, prior)
-  px = distributions.Normal(loc=generative_net(variational, 3), scale=1.)
-  log_likelihood = math_ops.reduce_sum(px.log_prob(x), 1)
-  log_likelihood = array_ops.expand_dims(log_likelihood, -1)
-  return x, prior, variational, px, log_likelihood
-
-
-class VariationalInferenceTest(test.TestCase):
-
-  def testDefaultVariationalAndPrior(self):
-    _, prior, variational, _, log_likelihood = mini_vae()
-    elbo = vi.elbo(log_likelihood)
-    expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
-        variational.distribution, prior)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(*sess.run([expected_elbo, elbo]))
-
-  def testExplicitVariationalAndPrior(self):
-    with self.test_session() as sess:
-      _, _, variational, _, log_likelihood = mini_vae()
-      prior = normal.Normal(loc=3., scale=2.)
-      elbo = vi.elbo(
-          log_likelihood, variational_with_prior={variational: prior})
-      expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
-          variational.distribution, prior)
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(*sess.run([expected_elbo, elbo]))
-
-  def testExplicitForms(self):
-    _, prior, variational, _, log_likelihood = mini_vae()
-
-    elbos = []
-    forms = vi.ELBOForms
-    for form in [
-        forms.default, forms.analytic_kl, forms.sample, forms.analytic_entropy
-    ]:
-      elbo = vi.elbo(
-          log_likelihood=log_likelihood,
-          variational_with_prior={variational: prior},
-          form=form)
-      elbos.append(elbo)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      log_likelihood_shape = array_ops.shape(log_likelihood).eval()
-      for elbo in elbos:
-        elbo.eval()
-        elbo_shape = array_ops.shape(elbo).eval()
-        self.assertAllEqual(log_likelihood_shape, elbo_shape)
-        self.assertEqual(elbo.dtype, log_likelihood.dtype)
-
-  def testDefaultsSampleKLWithoutAnalyticKLOrEntropy(self):
-    x = constant_op.constant([[-6., 3., 6.]])
-
-    prior = distributions.Bernoulli(0.5)
-    variational = st.StochasticTensor(
-        NormalNoEntropy(
-            loc=inference_net(x, 1), scale=1.))
-    vi.register_prior(variational, prior)
-    px = distributions.Normal(loc=generative_net(variational, 3), scale=1.)
-    log_likelihood = math_ops.reduce_sum(px.log_prob(x), 1)
-
-    # No analytic KL available between prior and variational distributions.
-    with self.assertRaisesRegexp(NotImplementedError, "No KL"):
-      distributions.kl_divergence(variational.distribution, prior)
-
-    elbo = vi.elbo(
-        variational_with_prior={variational: prior},
-        log_likelihood=log_likelihood)
-    expected_elbo = log_likelihood + prior.log_prob(
-        variational) - variational.distribution.log_prob(variational)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(*sess.run([expected_elbo, elbo]))
-
-  def testElboWithLogJoint(self):
-    with self.test_session() as sess:
-      _, prior, variational, _, log_likelihood = mini_vae()
-      log_joint = log_likelihood + prior.log_prob(variational)
-      elbo = vi.elbo_with_log_joint(log_joint)
-      sess.run(variables.global_variables_initializer())
-      elbo.eval()
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
deleted file mode 100644
index 4a7679fb436b91c9ae70daf85552099e5b710cbc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
+++ /dev/null
@@ -1,386 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Support for Entropy Ops. See ${python/contrib.bayesflow.entropy}.
-
-@@elbo_ratio
-@@entropy_shannon
-@@renyi_ratio
-@@renyi_alpha
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo
-from tensorflow.contrib.bayesflow.python.ops import variational_inference
-from tensorflow.contrib.bayesflow.python.ops.monte_carlo_impl import _get_samples as get_samples
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-
-
-# Make utility functions from monte_carlo available.
-# pylint: disable=protected-access
-_get_samples = get_samples
-_logspace_mean = monte_carlo._logspace_mean
-_sample_mean = monte_carlo._sample_mean
-
-# pylint: enable=protected-access
-
-__all__ = [
-    'elbo_ratio',
-    'entropy_shannon',
-    'renyi_ratio',
-    'renyi_alpha',
-]
-
-ELBOForms = variational_inference.ELBOForms  # pylint: disable=invalid-name
-
-
-def elbo_ratio(log_p,
-               q,
-               z=None,
-               n=None,
-               seed=None,
-               form=None,
-               name='elbo_ratio'):
-  r"""Estimate of the ratio appearing in the `ELBO` and `KL` divergence.
-
-  With `p(z) := exp{log_p(z)}`, this `Op` returns an approximation of
-
-  ```
-  E_q[ Log[p(Z) / q(Z)] ]
-  ```
-
-  The term `E_q[ Log[p(Z)] ]` is always computed as a sample mean.
-  The term `E_q[ Log[q(z)] ]` can be computed with samples, or an exact formula
-  if `q.entropy()` is defined.  This is controlled with the kwarg `form`.
-
-  This log-ratio appears in different contexts:
-
-  #### `KL[q || p]`
-
-  If `log_p(z) = Log[p(z)]` for distribution `p`, this `Op` approximates
-  the negative Kullback-Leibler divergence.
-
-  ```
-  elbo_ratio(log_p, q, n=100) = -1 * KL[q || p],
-  KL[q || p] = E[ Log[q(Z)] - Log[p(Z)] ]
-  ```
-
-  Note that if `p` is a `Distribution`, then
-  `distributions.kl_divergence(q, p)` may be defined and available as an
-  exact result.
-
-  #### ELBO
-
-  If `log_p(z) = Log[p(z, x)]` is the log joint of a distribution `p`, this is
-  the Evidence Lower BOund (ELBO):
-
-  ```
-  ELBO ~= E[ Log[p(Z, x)] - Log[q(Z)] ]
-        = Log[p(x)] - KL[q || p]
-       <= Log[p(x)]
-  ```
-
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
-
-  Args:
-    log_p:  Callable mapping samples from `q` to `Tensors` with
-      shape broadcastable to `q.batch_shape`.
-      For example, `log_p` works "just like" `q.log_prob`.
-    q:  `tf.contrib.distributions.Distribution`.
-    z:  `Tensor` of samples from `q`, produced by `q.sample(n)` for some `n`.
-    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
-    seed:  Python integer to seed the random number generator.
-    form:  Either `ELBOForms.analytic_entropy` (use formula for entropy of `q`)
-      or `ELBOForms.sample` (sample estimate of entropy), or `ELBOForms.default`
-      (attempt analytic entropy, fallback on sample).
-      Default value is `ELBOForms.default`.
-    name:  A name to give this `Op`.
-
-  Returns:
-    Scalar `Tensor` holding sample mean KL divergence.  `shape` is the batch
-      shape of `q`, and `dtype` is the same as `q`.
-
-  Raises:
-    ValueError:  If `form` is not handled by this function.
-  """
-  form = ELBOForms.default if form is None else form
-
-  with ops.name_scope(name, values=[n, z]):
-    z = _get_samples(q, z, n, seed)
-
-    entropy = entropy_shannon(q, z=z, form=form)
-
-    # If log_p(z) = Log[p(z)], cross entropy = -E_q[log(p(Z))]
-    negative_cross_entropy = _sample_mean(log_p(z))
-
-    return entropy + negative_cross_entropy
-
-
-def entropy_shannon(p,
-                    z=None,
-                    n=None,
-                    seed=None,
-                    form=None,
-                    name='entropy_shannon'):
-  r"""Monte Carlo or deterministic computation of Shannon's entropy.
-
-  Depending on the kwarg `form`, this `Op` returns either the analytic entropy
-  of the distribution `p`, or the sampled entropy:
-
-  ```
-  -n^{-1} sum_{i=1}^n p.log_prob(z_i),  where z_i ~ p,
-      \approx - E_p[ Log[p(Z)] ]
-      = Entropy[p]
-  ```
-
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
-
-  Args:
-    p:  `tf.contrib.distributions.Distribution`
-    z:  `Tensor` of samples from `p`, produced by `p.sample(n)` for some `n`.
-    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
-    seed:  Python integer to seed the random number generator.
-    form:  Either `ELBOForms.analytic_entropy` (use formula for entropy of `q`)
-      or `ELBOForms.sample` (sample estimate of entropy), or `ELBOForms.default`
-      (attempt analytic entropy, fallback on sample).
-      Default value is `ELBOForms.default`.
-    name:  A name to give this `Op`.
-
-  Returns:
-    A `Tensor` with same `dtype` as `p`, and shape equal to `p.batch_shape`.
-
-  Raises:
-    ValueError:  If `form` not handled by this function.
-    ValueError:  If `form` is `ELBOForms.analytic_entropy` and `n` was provided.
-  """
-  form = ELBOForms.default if form is None else form
-
-  if n is not None and form == ELBOForms.analytic_entropy:
-    raise ValueError('If form == ELBOForms.analytic_entropy, n must be None.')
-
-  with ops.name_scope(name, values=[n, z]):
-    # Entropy: -E_p[log(p(Z))].
-    entropy = None
-
-    # Try analytic path
-    if form in [ELBOForms.default, ELBOForms.analytic_entropy]:
-      try:
-        entropy = p.entropy()
-        logging.info('Using analytic entropy(p:%s)', p)
-      except NotImplementedError as e:
-        if form == ELBOForms.analytic_entropy:
-          raise e
-    elif form != ELBOForms.sample:
-      raise ValueError('ELBOForm not handled by this function: %s' % form)
-
-    # Sample path
-    if entropy is None:
-      logging.info('Using sampled entropy(p:%s)', p)
-      if z is None:
-        z = p.sample(n, seed=seed)
-      entropy = -monte_carlo.expectation(p.log_prob, z)
-
-    return entropy
-
-
-def renyi_ratio(log_p, q, alpha, z=None, n=None, seed=None, name='renyi_ratio'):
-  r"""Monte Carlo estimate of the ratio appearing in Renyi divergence.
-
-  This can be used to compute the Renyi (alpha) divergence, or a log evidence
-  approximation based on Renyi divergence.
-
-  #### Definition
-
-  With `z_i` iid samples from `q`, and `exp{log_p(z)} = p(z)`, this `Op` returns
-  the (biased for finite `n`) estimate:
-
-  ```
-  (1 - alpha)^{-1} Log[ n^{-1} sum_{i=1}^n ( p(z_i) / q(z_i) )^{1 - alpha},
-  \approx (1 - alpha)^{-1} Log[ E_q[ (p(Z) / q(Z))^{1 - alpha} ]  ]
-  ```
-
-  This ratio appears in different contexts:
-
-  #### Renyi divergence
-
-  If `log_p(z) = Log[p(z)]` is the log prob of a distribution, and
-  `alpha > 0`, `alpha != 1`, this `Op` approximates `-1` times Renyi divergence:
-
-  ```
-  # Choose reasonably high n to limit bias, see below.
-  renyi_ratio(log_p, q, alpha, n=100)
-                  \approx -1 * D_alpha[q || p],  where
-  D_alpha[q || p] := (1 - alpha)^{-1} Log E_q[(p(Z) / q(Z))^{1 - alpha}]
-  ```
-
-  The Renyi (or "alpha") divergence is non-negative and equal to zero iff
-  `q = p`.  Various limits of `alpha` lead to different special case results:
-
-  ```
-  alpha       D_alpha[q || p]
-  -----       ---------------
-  --> 0       Log[ int_{q > 0} p(z) dz ]
-  = 0.5,      -2 Log[1 - Hel^2[q || p]],  (\propto squared Hellinger distance)
-  --> 1       KL[q || p]
-  = 2         Log[ 1 + chi^2[q || p] ],   (\propto squared Chi-2 divergence)
-  --> infty   Log[ max_z{q(z) / p(z)} ],  (min description length principle).
-  ```
-
-  See "Renyi Divergence Variational Inference", by Li and Turner.
-
-  #### Log evidence approximation
-
-  If `log_p(z) = Log[p(z, x)]` is the log of the joint distribution `p`, this is
-  an alternative to the ELBO common in variational inference.
-
-  ```
-  L_alpha(q, p) = Log[p(x)] - D_alpha[q || p]
-  ```
-
-  If `q` and `p` have the same support, and `0 < a <= b < 1`, one can show
-  `ELBO <= D_b <= D_a <= Log[p(x)]`.  Thus, this `Op` allows a smooth
-  interpolation between the ELBO and the true evidence.
-
-  #### Stability notes
-
-  Note that when `1 - alpha` is not small, the ratio `(p(z) / q(z))^{1 - alpha}`
-  is subject to underflow/overflow issues.  For that reason, it is evaluated in
-  log-space after centering.  Nonetheless, infinite/NaN results may occur.  For
-  that reason, one may wish to shrink `alpha` gradually.  See the `Op`
-  `renyi_alpha`.  Using `float64` will also help.
-
-
-  #### Bias for finite sample size
-
-  Due to nonlinearity of the logarithm, for random variables `{X_1,...,X_n}`,
-  `E[ Log[sum_{i=1}^n X_i] ] != Log[ E[sum_{i=1}^n X_i] ]`.  As a result, this
-  estimate is biased for finite `n`.  For `alpha < 1`, it is non-decreasing
-  with `n` (in expectation).  For example, if `n = 1`, this estimator yields the
-  same result as `elbo_ratio`, and as `n` increases the expected value
-  of the estimator increases.
-
-  #### Call signature
-
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`
-
-  Args:
-    log_p:  Callable mapping samples from `q` to `Tensors` with
-      shape broadcastable to `q.batch_shape`.
-      For example, `log_p` works "just like" `q.log_prob`.
-    q: `tf.contrib.distributions.Distribution`.
-       `float64` `dtype` recommended.
-       `log_p` and `q` should be supported on the same set.
-    alpha:  `Tensor` with shape `q.batch_shape` and values not equal to 1.
-    z:  `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
-    n:  Integer `Tensor`.  The number of samples to use if `z` is not provided.
-      Note that this can be highly biased for small `n`, see docstring.
-    seed:  Python integer to seed the random number generator.
-    name:  A name to give this `Op`.
-
-  Returns:
-    renyi_result:  The scaled log of sample mean.  `Tensor` with `shape` equal
-      to batch shape of `q`, and `dtype` = `q.dtype`.
-  """
-  with ops.name_scope(name, values=[alpha, n, z]):
-    z = _get_samples(q, z, n, seed)
-
-    # Evaluate sample mean in logspace.  Note that _logspace_mean will compute
-    # (among other things) the mean of q.log_prob(z), which could also be
-    # obtained with q.entropy().  However, DON'T use analytic entropy, because
-    # that increases variance, and could result in NaN/Inf values of a sensitive
-    # term.
-
-    # log_values
-    # = (1 - alpha) * ( Log p - Log q )
-    log_values = (1. - alpha) * (log_p(z) - q.log_prob(z))
-
-    # log_mean_values
-    # = Log[ E[ values ] ]
-    # = Log[ E[ (p / q)^{1-alpha} ] ]
-    log_mean_values = _logspace_mean(log_values)
-
-    return log_mean_values / (1. - alpha)
-
-
-def renyi_alpha(step,
-                decay_time,
-                alpha_min,
-                alpha_max=0.99999,
-                name='renyi_alpha'):
-  r"""Exponentially decaying `Tensor` appropriate for Renyi ratios.
-
-  When minimizing the Renyi divergence for `0 <= alpha < 1` (or maximizing the
-  Renyi equivalent of elbo) in high dimensions, it is not uncommon to experience
-  `NaN` and `inf` values when `alpha` is far from `1`.
-
-  For that reason, it is often desirable to start the optimization with `alpha`
-  very close to 1, and reduce it to a final `alpha_min` according to some
-  schedule.  The user may even want to optimize using `elbo_ratio` for
-  some fixed time before switching to Renyi based methods.
-
-  This `Op` returns an `alpha` decaying exponentially with step:
-
-  ```
-  s(step) = (exp{step / decay_time} - 1) / (e - 1)
-  t(s) = max(0, min(s, 1)),  (smooth growth from 0 to 1)
-  alpha(t) = (1 - t) alpha_min + t alpha_max
-  ```
-
-  Args:
-    step:  Non-negative scalar `Tensor`.  Typically the global step or an
-      offset version thereof.
-    decay_time:  Positive scalar `Tensor`.
-    alpha_min:  `float` or `double` `Tensor`.
-      The minimal, final value of `alpha`, achieved when `step >= decay_time`
-    alpha_max:  `Tensor` of same `dtype` as `alpha_min`.
-      The maximal, beginning value of `alpha`, achieved when `step == 0`
-    name:  A name to give this `Op`.
-
-  Returns:
-    alpha:  A `Tensor` of same `dtype` as `alpha_min`.
-  """
-  with ops.name_scope(name, values=[step, decay_time, alpha_min, alpha_max]):
-    alpha_min = ops.convert_to_tensor(alpha_min, name='alpha_min')
-    dtype = alpha_min.dtype
-
-    alpha_max = ops.convert_to_tensor(alpha_max, dtype=dtype, name='alpha_max')
-    decay_time = math_ops.cast(decay_time, dtype)
-    step = math_ops.cast(step, dtype)
-
-    check_scalars = [
-        check_ops.assert_rank(step, 0, message='step must be scalar'),
-        check_ops.assert_rank(
-            decay_time, 0, message='decay_time must be scalar'),
-        check_ops.assert_rank(alpha_min, 0, message='alpha_min must be scalar'),
-        check_ops.assert_rank(alpha_max, 0, message='alpha_max must be scalar'),
-    ]
-    check_sign = [
-        check_ops.assert_non_negative(
-            step, message='step must be non-negative'),
-        check_ops.assert_positive(
-            decay_time, message='decay_time must be positive'),
-    ]
-
-    with ops.control_dependencies(check_scalars + check_sign):
-      theta = (math_ops.exp(step / decay_time) - 1.) / (math.e - 1.)
-      theta = math_ops.minimum(math_ops.maximum(theta, 0.), 1.)
-      return alpha_max * (1. - theta) + alpha_min * theta
diff --git a/tensorflow/contrib/bayesflow/python/ops/entropy.py b/tensorflow/contrib/bayesflow/python/ops/halton_sequence.py
similarity index 82%
rename from tensorflow/contrib/bayesflow/python/ops/entropy.py
rename to tensorflow/contrib/bayesflow/python/ops/halton_sequence.py
index a22e1c1d4e098439760267fca1374f986e45be8f..49d747d538f5a4aa3134d28ba00a651cb509fa41 100644
--- a/tensorflow/contrib/bayesflow/python/ops/entropy.py
+++ b/tensorflow/contrib/bayesflow/python/ops/halton_sequence.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Support for Entropy Ops. See ${python/contrib.bayesflow.entropy}."""
+"""Support for low discrepancy Halton sequences.
+
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,12 +22,12 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.entropy_impl import *
+from tensorflow.contrib.bayesflow.python.ops.halton_sequence_impl import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'ELBOForms', 'elbo_ratio', 'entropy_shannon', 'renyi_ratio', 'renyi_alpha'
+    'sample',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cabf18903b5f15002470acdfb8fdd3ec31a7413
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/halton_sequence_impl.py
@@ -0,0 +1,264 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Quasi Monte Carlo support: Halton sequence.
+
+@@sample
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+__all__ = [
+    'sample',
+]
+
+
+# The maximum dimension we support. This is limited by the number of primes
+# in the _PRIMES array.
+_MAX_DIMENSION = 1000
+
+
+def sample(dim, num_samples=None, sample_indices=None, dtype=None, name=None):
+  r"""Returns a sample from the `m` dimensional Halton sequence.
+
+  Warning: The sequence elements take values only between 0 and 1. Care must be
+  taken to appropriately transform the domain of a function if it differs from
+  the unit cube before evaluating integrals using Halton samples. It is also
+  important to remember that quasi-random numbers are not a replacement for
+  pseudo-random numbers in every context. Quasi random numbers are completely
+  deterministic and typically have significant negative autocorrelation (unless
+  randomized).
+
+  Computes the members of the low discrepancy Halton sequence in dimension
+  `dim`. The d-dimensional sequence takes values in the unit hypercube in d
+  dimensions. Currently, only dimensions up to 1000 are supported. The prime
+  base for the `k`-th axes is the k-th prime starting from 2. For example,
+  if dim = 3, then the bases will be [2, 3, 5] respectively and the first
+  element of the sequence will be: [0.5, 0.333, 0.2]. For a more complete
+  description of the Halton sequences see:
+  https://en.wikipedia.org/wiki/Halton_sequence. For low discrepancy sequences
+  and their applications see:
+  https://en.wikipedia.org/wiki/Low-discrepancy_sequence.
+
+  The user must supply either `num_samples` or `sample_indices` but not both.
+  The former is the number of samples to produce starting from the first
+  element. If `sample_indices` is given instead, the specified elements of
+  the sequence are generated. For example, sample_indices=tf.range(10) is
+  equivalent to specifying n=10.
+
+  Example Use:
+
+  ```python
+  bf = tf.contrib.bayesflow
+
+  # Produce the first 1000 members of the Halton sequence in 3 dimensions.
+  num_samples = 1000
+  dim = 3
+  sample = bf.halton_sequence.sample(dim, num_samples=num_samples)
+
+  # Evaluate the integral of x_1 * x_2^2 * x_3^3  over the three dimensional
+  # hypercube.
+  powers = tf.range(1.0, limit=dim + 1)
+  integral = tf.reduce_mean(tf.reduce_prod(sample ** powers, axis=-1))
+  true_value = 1.0 / tf.reduce_prod(powers + 1.0)
+  with tf.Session() as session:
+    values = session.run((integral, true_value))
+
+  # Produces a relative absolute error of 1.7%.
+  print ("Estimated: %f, True Value: %f" % values)
+
+  # Now skip the first 1000 samples and recompute the integral with the next
+  # thousand samples. The sample_indices argument can be used to do this.
+
+
+  sample_indices = tf.range(start=1000, limit=1000 + num_samples,
+                            dtype=tf.int32)
+  sample_leaped = halton.sample(dim, sample_indices=sample_indices)
+
+  integral_leaped = tf.reduce_mean(tf.reduce_prod(sample_leaped ** powers,
+                                                  axis=-1))
+  with tf.Session() as session:
+    values = session.run((integral_leaped, true_value))
+  # Now produces a relative absolute error of 0.05%.
+  print ("Leaped Estimated: %f, True Value: %f" % values)
+  ```
+
+  Args:
+    dim: Positive Python `int` representing each sample's `event_size.` Must
+      not be greater than 1000.
+    num_samples: (Optional) positive Python `int`. The number of samples to
+      generate. Either this parameter or sample_indices must be specified but
+      not both. If this parameter is None, then the behaviour is determined by
+      the `sample_indices`.
+    sample_indices: (Optional) `Tensor` of dtype int32 and rank 1. The elements
+      of the sequence to compute specified by their position in the sequence.
+      The entries index into the Halton sequence starting with 0 and hence,
+      must be whole numbers. For example, sample_indices=[0, 5, 6] will produce
+      the first, sixth and seventh elements of the sequence. If this parameter
+      is None, then the `num_samples` parameter must be specified which gives
+      the number of desired samples starting from the first sample.
+    dtype: (Optional) The dtype of the sample. One of `float32` or `float64`.
+      Default is `float32`.
+    name:  (Optional) Python `str` describing ops managed by this function. If
+    not supplied the name of this function is used.
+
+  Returns:
+    halton_elements: Elements of the Halton sequence. `Tensor` of supplied dtype
+    and `shape` `[num_samples, dim]` if `num_samples` was specified or shape
+    `[s, dim]` where s is the size of `sample_indices` if `sample_indices`
+    were specified.
+
+  Raises:
+    ValueError: if both `sample_indices` and `num_samples` were specified or
+    if dimension `dim` is less than 1 or greater than 1000.
+  """
+  if dim < 1 or dim > _MAX_DIMENSION:
+    raise ValueError(
+        'Dimension must be between 1 and {}. Supplied {}'.format(_MAX_DIMENSION,
+                                                                 dim))
+  if (num_samples is None) == (sample_indices is None):
+    raise ValueError('Either `num_samples` or `sample_indices` must be'
+                     ' specified but not both.')
+
+  dtype = dtype or dtypes.float32
+  if not dtype.is_floating:
+    raise ValueError('dtype must be of `float`-type')
+
+  with ops.name_scope(name, 'sample', values=[sample_indices]):
+    # Here and in the following, the shape layout is as follows:
+    # [sample dimension, event dimension, coefficient dimension].
+    # The coefficient dimension is an intermediate axes which will hold the
+    # weights of the starting integer when expressed in the (prime) base for
+    # an event dimension.
+    indices = _get_indices(num_samples, sample_indices, dtype)
+    radixes = array_ops.constant(_PRIMES[0:dim], dtype=dtype, shape=[dim, 1])
+
+    max_sizes_by_axes = _base_expansion_size(math_ops.reduce_max(indices),
+                                             radixes)
+
+    max_size = math_ops.reduce_max(max_sizes_by_axes)
+
+    # The powers of the radixes that we will need. Note that there is a bit
+    # of an excess here. Suppose we need the place value coefficients of 7
+    # in base 2 and 3. For 2, we will have 3 digits but we only need 2 digits
+    # for base 3. However, we can only create rectangular tensors so we
+    # store both expansions in a [2, 3] tensor. This leads to the problem that
+    # we might end up attempting to raise large numbers to large powers. For
+    # example, base 2 expansion of 1024 has 10 digits. If we were in 10
+    # dimensions, then the 10th prime (29) we will end up computing 29^10 even
+    # though we don't need it. We avoid this by setting the exponents for each
+    # axes to 0 beyond the maximum value needed for that dimension.
+    exponents_by_axes = array_ops.tile([math_ops.range(max_size)], [dim, 1])
+    weight_mask = exponents_by_axes > max_sizes_by_axes
+    capped_exponents = array_ops.where(
+        weight_mask, array_ops.zeros_like(exponents_by_axes), exponents_by_axes)
+    weights = radixes ** capped_exponents
+    coeffs = math_ops.floor_div(indices, weights)
+    coeffs *= 1 - math_ops.cast(weight_mask, dtype)
+    coeffs = (coeffs % radixes) / radixes
+    return math_ops.reduce_sum(coeffs / weights, axis=-1)
+
+
+def _get_indices(n, sample_indices, dtype, name=None):
+  """Generates starting points for the Halton sequence procedure.
+
+  The k'th element of the sequence is generated starting from a positive integer
+  which must be distinct for each `k`. It is conventional to choose the starting
+  point as `k` itself (or `k+1` if k is zero based). This function generates
+  the starting integers for the required elements and reshapes the result for
+  later use.
+
+  Args:
+    n: Positive `int`. The number of samples to generate. If this
+      parameter is supplied, then `sample_indices` should be None.
+    sample_indices: `Tensor` of dtype int32 and rank 1. The entries
+      index into the Halton sequence starting with 0 and hence, must be whole
+      numbers. For example, sample_indices=[0, 5, 6] will produce the first,
+      sixth and seventh elements of the sequence. If this parameter is not None
+      then `n` must be None.
+    dtype: The dtype of the sample. One of `float32` or `float64`.
+      Default is `float32`.
+    name: Python `str` name which describes ops created by this function.
+
+  Returns:
+    indices: `Tensor` of dtype `dtype` and shape = `[n, 1, 1]`.
+  """
+  with ops.name_scope(name, 'get_indices', [n, sample_indices]):
+    if sample_indices is None:
+      sample_indices = math_ops.range(n, dtype=dtype)
+    else:
+      sample_indices = math_ops.cast(sample_indices, dtype)
+
+    # Shift the indices so they are 1 based.
+    indices = sample_indices + 1
+
+    # Reshape to make space for the event dimension and the place value
+    # coefficients.
+    return array_ops.reshape(indices, [-1, 1, 1])
+
+
+def _base_expansion_size(num, bases):
+  """Computes the number of terms in the place value expansion.
+
+  Let num = a0 + a1 b + a2 b^2 + ... ak b^k be the place value expansion of
+  `num` in base b (ak <> 0). This function computes and returns `k` for each
+  base `b` specified in `bases`.
+
+  This can be inferred from the base `b` logarithm of `num` as follows:
+    $$k = Floor(log_b (num)) + 1  = Floor( log(num) / log(b)) + 1$$
+
+  Args:
+    num: Scalar `Tensor` of dtype either `float32` or `float64`. The number to
+      compute the base expansion size of.
+    bases: `Tensor` of the same dtype as num. The bases to compute the size
+      against.
+
+  Returns:
+    Tensor of same dtype and shape as `bases` containing the size of num when
+    written in that base.
+  """
+  return math_ops.floor(math_ops.log(num) / math_ops.log(bases)) + 1
+
+
+def _primes_less_than(n):
+  # Based on
+  # https://stackoverflow.com/questions/2068372/fastest-way-to-list-all-primes-below-n-in-python/3035188#3035188
+  """Returns sorted array of primes such that `2 <= prime < n`."""
+  small_primes = np.array((2, 3, 5))
+  if n <= 6:
+    return small_primes[small_primes < n]
+  sieve = np.ones(n // 3 + (n % 6 == 2), dtype=np.bool)
+  sieve[0] = False
+  m = int(n ** 0.5) // 3 + 1
+  for i in range(m):
+    if not sieve[i]:
+      continue
+    k = 3 * i + 1 | 1
+    sieve[k ** 2 // 3::2 * k] = False
+    sieve[(k ** 2 + 4 * k - 2 * k * (i & 1)) // 3::2 * k] = False
+  return np.r_[2, 3, 3 * np.nonzero(sieve)[0] + 1 | 1]
+
+_PRIMES = _primes_less_than(7919+1)
+
+assert len(_PRIMES) == _MAX_DIMENSION
diff --git a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
index 333dce929530adceb30dcb63653a5bd009c059e0..5685a942e98800a39ec718adc67bcfd43aeafd52 100644
--- a/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/hmc_impl.py
@@ -27,6 +27,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -174,9 +175,11 @@ def chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
 
     potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
     potential, grad = potential_and_grad(initial_x)
-    return functional_ops.scan(body, array_ops.zeros(n_iterations),
-                               (initial_x, array_ops.zeros(non_event_shape),
-                                -potential, -grad))[:2]
+    return functional_ops.scan(
+        body, array_ops.zeros(n_iterations, dtype=initial_x.dtype),
+        (initial_x,
+         array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
+         -potential, -grad))[:2]
 
 
 def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
@@ -298,8 +301,9 @@ def ais_chain(n_iterations, step_size, n_leapfrog_steps, initial_x,
       return updated_x, acceptance_probs, w
 
     x, acceptance_probs, w = functional_ops.scan(
-        _body, beta_series, (initial_x, array_ops.zeros(non_event_shape),
-                             array_ops.zeros(non_event_shape)))
+        _body, beta_series,
+        (initial_x, array_ops.zeros(non_event_shape, dtype=initial_x.dtype),
+         array_ops.zeros(non_event_shape, dtype=initial_x.dtype)))
   return w[-1], x[-1], acceptance_probs[-1]
 
 
@@ -446,9 +450,10 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
   """
   with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
     potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
+    x = ops.convert_to_tensor(x, name='x')
 
     x_shape = array_ops.shape(x)
-    m = random_ops.random_normal(x_shape)
+    m = random_ops.random_normal(x_shape, dtype=x.dtype)
 
     kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)
 
@@ -468,26 +473,33 @@ def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
 
     kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)
 
-    # TODO(mhoffman): It seems like there may be an opportunity for nans here.
-    # I'm delaying addressing this because we're going to refactor this part
-    # to use the more general Metropolis abstraction anyway.
-    acceptance_probs = math_ops.exp(math_ops.minimum(0., log_potential_0 -
-                                                     log_potential_1 +
-                                                     kinetic_0 - kinetic_1))
-    accepted = math_ops.cast(
-        random_ops.random_uniform(array_ops.shape(acceptance_probs)) <
-        acceptance_probs, np.float32)
-    new_log_prob = (-log_potential_0 * (1. - accepted) -
-                    log_potential_1 * accepted)
+    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
+    # Treat NaN as infinite energy (and therefore guaranteed rejection).
+    energy_change = array_ops.where(
+        math_ops.is_nan(energy_change),
+        array_ops.fill(array_ops.shape(energy_change),
+                       energy_change.dtype.as_numpy_dtype(np.inf)),
+        energy_change)
+    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
+    accepted = (
+        random_ops.random_uniform(
+            array_ops.shape(acceptance_probs), dtype=x.dtype)
+        < acceptance_probs)
+    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)
 
     # TODO(b/65738010): This should work, but it doesn't for now.
     # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
     reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                         keep_dims=True))
     accepted = array_ops.reshape(accepted, reduced_shape)
-    new_x = x * (1. - accepted) + new_x * accepted
-    new_grad = -grad_0 * (1. - accepted) - grad_1 * accepted
-
+    accepted = math_ops.logical_or(
+        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
+    new_x = array_ops.where(accepted, new_x, x)
+    new_grad = -array_ops.where(accepted, grad_1, grad_0)
+
+  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
+  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
+  # should be fixed.
   return new_x, acceptance_probs, new_log_prob, new_grad
 
 
@@ -525,6 +537,7 @@ def leapfrog_integrator(step_size, n_steps, initial_position, initial_momentum,
       Has shape matching `initial_position`.
 
   Example: Simple quadratic potential.
+
   ```python
   def potential_and_grad(position):
     return tf.reduce_sum(0.5 * tf.square(position)), position
@@ -600,6 +613,7 @@ def leapfrog_step(step_size, position, momentum, potential_and_grad, grad,
       Has shape matching `position`.
 
   Example: Simple quadratic potential.
+
   ```python
   def potential_and_grad(position):
     # Simple quadratic potential
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_inference.py b/tensorflow/contrib/bayesflow/python/ops/layers.py
similarity index 74%
rename from tensorflow/contrib/bayesflow/python/ops/variational_inference.py
rename to tensorflow/contrib/bayesflow/python/ops/layers.py
index 6316361da2accf39dfe2e77902eec06813ca7036..dcead38af826a12e776160bdb251ba021e6b953c 100644
--- a/tensorflow/contrib/bayesflow/python/ops/variational_inference.py
+++ b/tensorflow/contrib/bayesflow/python/ops/layers.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Variational inference.
+"""Probabilistic neural layers.
 
-See the ${@python/contrib.bayesflow.variational_inference} guide.
+See ${python/contrib.bayesflow.layers}.
 """
 
 from __future__ import absolute_import
@@ -23,12 +23,15 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.variational_inference_impl import *
+from tensorflow.contrib.bayesflow.python.ops.layers_dense_variational_impl import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    "elbo", "elbo_with_log_joint", "ELBOForms", "register_prior"
+    'DenseVariational',
+    'dense_variational',
+    'default_loc_scale_fn',
+    'default_mean_field_normal_fn',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b05ce0ffc1dd55ffb029b339a846a9aa5c877620
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py
@@ -0,0 +1,797 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Dense Bayesian layer using KL-divergence based variational inference.
+
+@@DenseVariational
+@@dense_variational
+
+@@default_loc_scale_fn
+@@default_mean_field_normal_fn
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base as layers_lib
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
+from tensorflow.python.ops.distributions import kullback_leibler as kl_lib
+from tensorflow.python.ops.distributions import normal as normal_lib
+
+
+__all__ = [
+    "DenseVariational",
+    "dense_variational",
+    "default_loc_scale_fn",
+    "default_mean_field_normal_fn",
+]
+
+
+def default_loc_scale_fn(
+    is_singular=False,
+    loc_initializer=init_ops.random_normal_initializer(stddev=0.1),
+    untransformed_scale_initializer=init_ops.random_normal_initializer(
+        mean=-3., stddev=0.1),
+    loc_regularizer=None,
+    untransformed_scale_regularizer=None,
+    loc_constraint=None,
+    untransformed_scale_constraint=None):
+  """Makes closure which creates `loc`, `scale` params from `tf.get_variable`.
+
+  This function produces a closure which produces `loc`, `scale` using
+  `tf.get_variable`. The closure accepts the following arguments:
+
+    dtype: Type of parameter's event.
+    shape: Python `list`-like representing the parameter's event shape.
+    name: Python `str` name prepended to any created (or existing)
+      `tf.Variable`s.
+    trainable: Python `bool` indicating all created `tf.Variable`s should be
+      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+      access existing) `tf.Variable`s.
+
+  Args:
+    is_singular: Python `bool` indicating if `scale is None`. Default: `False`.
+    loc_initializer: Initializer function for the `loc` parameters.
+      The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`.
+    untransformed_scale_initializer: Initializer function for the `scale`
+      parameters. Default value: `tf.random_normal_initializer(mean=-3.,
+      stddev=0.1)`. This implies the softplus transformed result has mean
+      approximately `0.05` and std. deviation approximately `0.005`.
+    loc_regularizer: Regularizer function for the `loc` parameters.
+      The default (`None`) is to use the `tf.get_variable` default.
+    untransformed_scale_regularizer: Regularizer function for the `scale`
+      parameters. The default (`None`) is to use the `tf.get_variable` default.
+    loc_constraint: An optional projection function to be applied to the
+      loc after being updated by an `Optimizer`. The function must take as input
+      the unprojected variable and must return the projected variable (which
+      must have the same shape). Constraints are not safe to use when doing
+      asynchronous distributed training.
+      The default (`None`) is to use the `tf.get_variable` default.
+    untransformed_scale_constraint: An optional projection function to be
+      applied to the `scale` parameters after being updated by an `Optimizer`
+      (e.g. used to implement norm constraints or value constraints). The
+      function must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are not
+      safe to use when doing asynchronous distributed training. The default
+      (`None`) is to use the `tf.get_variable` default.
+
+  Returns:
+    default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale`
+    parameters from args: `dtype, shape, name, trainable, add_variable_fn`.
+  """
+  def _fn(dtype, shape, name, trainable, add_variable_fn):
+    """Creates `loc`, `scale` parameters."""
+    loc = add_variable_fn(
+        name=name + "_loc",
+        shape=shape,
+        initializer=loc_initializer,
+        regularizer=loc_regularizer,
+        constraint=loc_constraint,
+        dtype=dtype,
+        trainable=trainable)
+    if is_singular:
+      return loc, None
+    untransformed_scale = add_variable_fn(
+        name=name + "_untransformed_scale",
+        shape=shape,
+        initializer=untransformed_scale_initializer,
+        regularizer=untransformed_scale_regularizer,
+        constraint=untransformed_scale_constraint,
+        dtype=dtype,
+        trainable=trainable)
+    scale = (np.finfo(dtype.as_numpy_dtype).eps +
+             nn_ops.softplus(untransformed_scale))
+    return loc, scale
+  return _fn
+
+
+def default_mean_field_normal_fn(
+    is_singular=False,
+    loc_initializer=None,
+    untransformed_scale_initializer=None,
+    loc_regularizer=None,
+    untransformed_scale_regularizer=None,
+    loc_constraint=None,
+    untransformed_scale_constraint=None):
+  """Creates a function to build Normal distributions with trainable params.
+
+  This function produces a closure which produces `tf.distributions.Normal`
+  parameterized by a loc` and `scale` each created using `tf.get_variable`. The
+  produced closure accepts the following arguments:
+
+    name: Python `str` name prepended to any created (or existing)
+      `tf.Variable`s.
+    shape: Python `list`-like representing the parameter's event shape.
+    dtype: Type of parameter's event.
+    trainable: Python `bool` indicating all created `tf.Variable`s should be
+      added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+    add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+      access existing) `tf.Variable`s.
+
+  Args:
+    is_singular: Python `bool` if `True`, forces the special case limit of
+      `scale->0`, i.e., a `Deterministic` distribution.
+    loc_initializer: Initializer function for the `loc` parameters.
+      If `None` (default), values are initialized using the default
+      initializer used by `tf.get_variable`.
+    untransformed_scale_initializer: Initializer function for the `scale`
+      parameters. If `None` (default), values are initialized using the default
+      initializer used by `tf.get_variable`.
+    loc_regularizer: Regularizer function for the `loc` parameters.
+    untransformed_scale_regularizer: Regularizer function for the `scale`
+      parameters.
+    loc_constraint: An optional projection function to be applied to the
+      loc after being updated by an `Optimizer`. The function must take as input
+      the unprojected variable and must return the projected variable (which
+      must have the same shape). Constraints are not safe to use when doing
+      asynchronous distributed training.
+    untransformed_scale_constraint: An optional projection function to be
+      applied to the `scale` parameters after being updated by an `Optimizer`
+      (e.g. used to implement norm constraints or value constraints). The
+      function must take as input the unprojected variable and must return the
+      projected variable (which must have the same shape). Constraints are not
+      safe to use when doing asynchronous distributed training.
+
+  Returns:
+    make_normal_fn: Python `callable` which creates a `tf.distributions.Normal`
+      using from args: `dtype, shape, name, trainable, add_variable_fn`.
+  """
+  loc_scale_fn_ = default_loc_scale_fn(
+      is_singular,
+      loc_initializer,
+      untransformed_scale_initializer,
+      loc_regularizer,
+      untransformed_scale_regularizer,
+      loc_constraint,
+      untransformed_scale_constraint)
+  def _fn(dtype, shape, name, trainable, add_variable_fn):
+    """Creates a batch of `Deterministic` or `Normal` distributions."""
+    loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn)
+    if scale is None:
+      return deterministic_lib.Deterministic(loc=loc)
+    return normal_lib.Normal(loc=loc, scale=scale)
+  return _fn
+
+
+class DenseVariational(layers_lib.Layer):
+  """Densely-connected variational class.
+
+  This layer implements the Bayesian variational inference analogue to:
+  `outputs = activation(matmul(inputs, kernel) + bias)`
+  by assuming the `kernel` and/or the `bias` are random variables.
+
+  The layer implements a stochastic dense calculation by making a Monte Carlo
+  approximation of a [variational Bayesian method based on KL divergence](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,
+
+  ```none
+  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
+              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
+             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
+              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
+             ~= m**-1 sum{ -log(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
+                 + KL[q(W|x), p(W)]
+  ```
+
+  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
+  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence`,
+  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
+  bound is sometimes referred to as the negative Evidence Lower BOund or
+  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
+  layer is appropriate to use when the final loss is a negative log-likelihood.
+
+  The Monte-Carlo sum portion is used for the feed-forward calculation of the
+  DNN. The KL divergence portion can be added to the final loss via:
+  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  random variables (which together comprise `W`).
+
+  Args:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+      When `True`, `kernel_posterior_fn` must create an instance of
+      `tf.distributions.Normal`.
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (`callable`).
+    activity_regularizer: Regularizer function for the output.
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+    kernel: `VariationalKernelParamater` instance containing all `kernel`
+      related properties and `callable`s.
+    bias: `VariationalParameter` instance containing all `kernel`
+      related properties and `callable`s.
+  """
+
+  def __init__(
+      self,
+      units,
+      activation=None,
+      activity_regularizer=None,
+      trainable=True,
+      kernel_use_local_reparameterization=True,
+      kernel_posterior_fn=default_mean_field_normal_fn(),
+      kernel_posterior_tensor_fn=lambda d: d.sample(),
+      kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+          loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+      kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
+      bias_posterior_tensor_fn=lambda d: d.sample(),
+      bias_prior_fn=None,
+      bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+      name=None,
+      **kwargs):
+    super(DenseVariational, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
+    self._units = units
+    self._activation = activation
+    self._input_spec = layers_lib.InputSpec(min_ndim=2)
+    self._kernel_use_local_reparameterization = (
+        kernel_use_local_reparameterization)
+    self._kernel = VariationalKernelParameter(
+        kernel_posterior_fn,
+        kernel_posterior_tensor_fn,
+        kernel_prior_fn,
+        kernel_divergence_fn)
+    self._bias = VariationalParameter(
+        bias_posterior_fn,
+        bias_posterior_tensor_fn,
+        bias_prior_fn,
+        bias_divergence_fn)
+
+  @property
+  def units(self):
+    return self._units
+
+  @property
+  def activation(self):
+    return self._activation
+
+  @property
+  def input_spec(self):
+    return self._input_spec
+
+  @input_spec.setter
+  def input_spec(self, value):
+    self._input_spec = value
+
+  @property
+  def kernel_use_local_reparameterization(self):
+    return self._kernel_use_local_reparameterization
+
+  @property
+  def kernel(self):
+    return self._kernel
+
+  @property
+  def bias(self):
+    return self._bias
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    in_size = input_shape.with_rank_at_least(2)[-1].value
+    if in_size is None:
+      raise ValueError("The last dimension of the inputs to `Dense` "
+                       "should be defined. Found `None`.")
+    self._input_spec = layers_lib.InputSpec(min_ndim=2, axes={-1: in_size})
+    dtype = dtypes.as_dtype(self.dtype)
+
+    # Must have a posterior kernel.
+    self.kernel.posterior = self.kernel.posterior_fn(
+        dtype, [in_size, self.units], "kernel_posterior",
+        self.trainable, self.add_variable)
+
+    if self.kernel.prior_fn is None:
+      self.kernel_prior = None
+    else:
+      self.kernel.prior = self.kernel.prior_fn(
+          dtype, [in_size, self.units], "kernel_prior",
+          self.trainable, self.add_variable)
+    self._built_kernel_divergence = False
+
+    if self.bias.posterior_fn is None:
+      self.bias.posterior = None
+    else:
+      self.bias.posterior = self.bias.posterior_fn(
+          dtype, [self.units], "bias_posterior",
+          self.trainable, self.add_variable)
+
+    if self.bias.prior_fn is None:
+      self.bias.prior = None
+    else:
+      self.bias.prior = self.bias.prior_fn(
+          dtype, [self.units], "bias_prior",
+          self.trainable, self.add_variable)
+    self._built_bias_divergence = False
+
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+
+    outputs = self._apply_variational_kernel(inputs)
+    outputs = self._apply_variational_bias(outputs)
+    if self.activation is not None:
+      outputs = self.activation(outputs)  # pylint: disable=not-callable
+    if not self._built_kernel_divergence:
+      self._apply_divergence(self.kernel, name="divergence_kernel")
+      self._built_kernel_divergence = True
+    if not self._built_bias_divergence:
+      self._apply_divergence(self.bias, name="divergence_bias")
+      self._built_bias_divergence = True
+    return outputs
+
+  def _apply_variational_kernel(self, inputs):
+    if not self.kernel_use_local_reparameterization:
+      self.kernel.posterior_tensor = self.kernel.posterior_tensor_fn(
+          self.kernel.posterior)
+      self.kernel.posterior_affine = None
+      self.kernel.posterior_affine_tensor = None
+      return self._matmul(inputs, self.kernel.posterior_tensor)
+    if not isinstance(self.kernel.posterior, normal_lib.Normal):
+      raise TypeError("`kernel_use_local_reparameterization=True` requires "
+                      "`kernel_posterior_fn` produce an instance of "
+                      "`tf.distributions.Normal` (saw: \"{}\").".format(
+                          type(self.kernel.posterior).__name__))
+    self.kernel.posterior_affine = normal_lib.Normal(
+        loc=self._matmul(inputs, self.kernel.posterior.loc),
+        scale=standard_ops.sqrt(self._matmul(
+            standard_ops.square(inputs),
+            standard_ops.square(self.kernel.posterior.scale))))
+    self.kernel.posterior_affine_tensor = (
+        self.kernel.posterior_tensor_fn(self.kernel.posterior_affine))
+    self.kernel.posterior_tensor = None
+    return self.kernel.posterior_affine_tensor
+
+  def _apply_variational_bias(self, inputs):
+    if self.bias.posterior is None:
+      self.bias.posterior_tensor = None
+      return inputs
+    self.bias.posterior_tensor = self.bias.posterior_tensor_fn(
+        self.bias.posterior)
+    return nn.bias_add(inputs, self.bias.posterior_tensor)
+
+  def _apply_divergence(self, param, name):
+    if (param.divergence_fn is None or
+        param.posterior is None or
+        param.prior is None):
+      param.divergence = None
+      return
+    param.divergence = standard_ops.identity(
+        param.divergence_fn(
+            param.posterior, param.prior, param.posterior_tensor),
+        name=name)
+    self.add_loss(param.divergence)
+
+  def _matmul(self, inputs, kernel):
+    if inputs.shape.ndims <= 2:
+      return standard_ops.matmul(inputs, kernel)
+    # To handle broadcasting, we must use `tensordot`.
+    return standard_ops.tensordot(inputs, kernel, axes=[[-1], [0]])
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          "The innermost dimension of input_shape must be defined, "
+          "but saw: {}".format(input_shape))
+    return input_shape[:-1].concatenate(self.units)
+
+
+def dense_variational(
+    inputs,
+    units,
+    activation=None,
+    activity_regularizer=None,
+    trainable=True,
+    kernel_use_local_reparameterization=True,
+    kernel_posterior_fn=default_mean_field_normal_fn(),
+    kernel_posterior_tensor_fn=lambda d: d.sample(),
+    kernel_prior_fn=lambda dtype, *args: normal_lib.Normal(  # pylint: disable=g-long-lambda
+        loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)),
+    kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    bias_posterior_fn=default_mean_field_normal_fn(is_singular=True),
+    bias_posterior_tensor_fn=lambda d: d.sample(),
+    bias_prior_fn=None,
+    bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p),
+    name=None,
+    reuse=None):
+  """Densely-connected variational layer.
+
+  This layer implements the Bayesian variational inference analogue to:
+  `outputs = activation(matmul(inputs, kernel) + bias)`
+  by assuming the `kernel` and/or the `bias` are random variables.
+
+  The layer implements a stochastic dense calculation by making a Monte Carlo
+  approximation of a [variational Bayesian method based on KL divergence](
+  https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e.,
+
+  ```none
+  -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw
+              = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw
+             <= E_q(W|x)[-log p(y,W|x) + log q(W|x)]       # Jensen's
+              = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)]
+             ~= m**-1 sum{ -log(y|x,w[j]) : w[j] ~ q(W|x), j=1..m }
+                 + KL[q(W|x), p(W)]
+  ```
+
+  where `W` denotes the (independent) `kernel` and `bias` random variables, `w`
+  is a random variate or outcome of `W`, `y` is the label, `x` is the evidence`,
+  and `~=` denotes an approximation which becomes exact as `m->inf`. The above
+  bound is sometimes referred to as the negative Evidence Lower BOund or
+  negative [ELBO](https://arxiv.org/abs/1601.00670). In context of a DNN, this
+  layer is appropriate to use when the final loss is a negative log-likelihood.
+
+  The Monte-Carlo sum portion is used for the feed-forward calculation of the
+  DNN. The KL divergence portion can be added to the final loss via:
+  `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`.
+
+  The arguments permit separate specification of the surrogate posterior
+  (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias`
+  random variables (which together comprise `W`).
+
+  Args:
+    inputs: Tensor input.
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (`callable`). Set it to None to maintain a
+      linear activation.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    kernel_use_local_reparameterization: Python `bool` indicating whether
+      `kernel` calculation should employ the Local Reparameterization Trick.
+      When `True`, `kernel_posterior_fn` must create an instance of
+      `tf.distributions.Normal`.
+    kernel_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `kernel` parameter. Default value:
+      `default_mean_field_normal_fn()`.
+    kernel_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    kernel_prior_fn: Python `callable` which creates `tf.distributions`
+      instance. See `default_mean_field_normal_fn` docstring for required
+      parameter signature.
+      Default value: `tf.distributions.Normal(loc=0., scale=1.)`.
+    kernel_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    bias_posterior_fn: Python `callable` which creates
+      `tf.distributions.Distribution` instance representing the surrogate
+      posterior of the `bias` parameter. Default value:
+      `default_mean_field_normal_fn(is_singular=True)` (which creates an
+      instance of `tf.distributions.Deterministic`).
+    bias_posterior_tensor_fn: Python `callable` which takes a
+      `tf.distributions.Distribution` instance and returns a representative
+      value. Default value: `lambda d: d.sample()`.
+    bias_prior_fn: Python `callable` which creates `tf.distributions` instance.
+      See `default_mean_field_normal_fn` docstring for required parameter
+      signature. Default value: `None` (no prior, no variational inference)
+    bias_divergence_fn: Python `callable` which takes the surrogate posterior
+      distribution, prior distribution and random variate sample(s) from the
+      surrogate posterior and computes or approximates the KL divergence. The
+      distributions are `tf.distributions.Distribution`-like instances and the
+      sample is a `Tensor`.
+    name: Python `str`, the name of the layer. Layers with the same name will
+      share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in
+      such cases.
+    reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous
+      layer by the same name.
+
+  Returns:
+    output: `Tensor` representing a the affine transformed input under a random
+      draw from the surrogate posterior distribution.
+  """
+  layer = DenseVariational(
+      units,
+      activation=activation,
+      activity_regularizer=activity_regularizer,
+      trainable=trainable,
+      kernel_use_local_reparameterization=(
+          kernel_use_local_reparameterization),
+      kernel_posterior_fn=kernel_posterior_fn,
+      kernel_posterior_tensor_fn=kernel_posterior_tensor_fn,
+      kernel_prior_fn=kernel_prior_fn,
+      kernel_divergence_fn=kernel_divergence_fn,
+      bias_posterior_fn=bias_posterior_fn,
+      bias_posterior_tensor_fn=bias_posterior_tensor_fn,
+      bias_prior_fn=bias_prior_fn,
+      bias_divergence_fn=bias_divergence_fn,
+      name=name,
+      dtype=inputs.dtype.base_dtype,
+      _scope=name,
+      _reuse=reuse)
+  return layer.apply(inputs)
+
+
+class NotSet(object):
+  """Helper to track whether a `VariationalParameter` value has been set."""
+  pass
+
+
+class VariationalParameter(object):
+  """Struct-like container of variational parameter properties.
+
+  A `VariationalParameter` is intitialized with Python `callable`s which set the
+  value of correspondingly named members. Corresponding values have "set once"
+  semantics, i.e., once set to any value they are immutable.
+  """
+
+  def __init__(
+      self,
+      posterior_fn,
+      posterior_tensor_fn,
+      prior_fn,
+      divergence_fn):
+    """Creates the `VariationalParameter` struct-like object.
+
+    Args:
+      posterior_fn: Python `callable` which creates a
+        `tf.distribution.Distribution` like object representing the posterior
+        distribution. See `VariationalParameter.posterior_fn` for `callable`'s
+        required parameters.
+      posterior_tensor_fn: Python `callable` which computes a `Tensor`
+        which represents the `posterior`.
+      prior_fn: Python `callable` which creates a
+        `tf.distribution.Distribution` like object representing the prior
+        distribution. See `VariationalParameter.prior_fn` for `callable`'s
+        required parameters.
+      divergence_fn: Python `callable` which computes the KL divergence from
+        `posterior` to `prior`. See `VariationalParameter.divergence_fn` for
+        required `callable`'s parameters.
+    """
+    self._posterior_fn = posterior_fn
+    self._posterior = NotSet()
+    self._posterior_tensor_fn = posterior_tensor_fn
+    self._posterior_tensor = NotSet()
+    self._prior_fn = prior_fn
+    self._prior = NotSet()
+    self._divergence_fn = divergence_fn
+    self._divergence = NotSet()
+    self._init_helper()
+
+  @property
+  def posterior_fn(self):
+    """`callable` which creates `tf.distributions.Distribution`-like posterior.
+
+    The `callable` must accept the following parameters:
+      name: Python `str` name prepended to any created (or existing)
+        `tf.Variable`s.
+      shape: Python `list`-like representing the parameter's event shape.
+      dtype: Type of parameter's event.
+      trainable: Python `bool` indicating all created `tf.Variable`s should be
+        added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+        access existing) `tf.Variable`s.
+
+    Returns:
+      posterior_fn: The Python `callable` specified in `__init__`.
+    """
+    return self._posterior_fn
+
+  @property
+  def posterior(self):
+    """`tf.distributions.Distribution`-like instance representing posterior."""
+    return self._posterior
+
+  @posterior.setter
+  def posterior(self, value):
+    """One-time setter of the `posterior` distribution."""
+    if not isinstance(self._posterior, NotSet):
+      raise ValueError("Cannot override already set attribute.")
+    self._posterior = value
+
+  @property
+  def posterior_tensor_fn(self):
+    """Creates `Tensor` representing the `posterior` distribution.
+
+    The `callable` must accept the following parameters:
+      posterior: `tf.distributions.Distribution`-like instance.
+
+    Returns:
+      posterior_tensor_fn: The Python `callable` specified in
+        `__init__`.
+    """
+    return self._posterior_tensor_fn
+
+  @property
+  def posterior_tensor(self):
+    """`Tensor` representing the `posterior` distribution."""
+    return self._posterior_tensor
+
+  @posterior_tensor.setter
+  def posterior_tensor(self, value):
+    """One-time setter of the `posterior_tensor`."""
+    if not isinstance(self._posterior_tensor, NotSet):
+      raise ValueError("Cannot override already set attribute.")
+    self._posterior_tensor = value
+
+  @property
+  def prior_fn(self):
+    """`callable` which creates `tf.distributions.Distribution`-like prior.
+
+    The `callable` must accept the following parameters:
+      name: Python `str` name prepended to any created (or existing)
+        `tf.Variable`s.
+      shape: Python `list`-like representing the parameter's event shape.
+      dtype: Type of parameter's event.
+      trainable: Python `bool` indicating all created `tf.Variable`s should be
+        added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`.
+      add_variable_fn: `tf.get_variable`-like `callable` used to create (or
+        access existing) `tf.Variable`s.
+
+    Returns:
+      prior_fn: The Python `callable` specified in `__init__`.
+    """
+    return self._prior_fn
+
+  @property
+  def prior(self):
+    """`tf.distributions.Distribution`-like instance representing posterior."""
+    return self._prior
+
+  @prior.setter
+  def prior(self, value):
+    """One-time setter of the `prior` distribution."""
+    if not isinstance(self._prior, NotSet):
+      raise ValueError("Cannot override already set attribute.")
+    self._prior = value
+
+  @property
+  def divergence_fn(self):
+    """`callable` which computes KL-divergence `Tensor` from posterior to prior.
+
+    The `callable` must accept the following parameters:
+      posterior: `tf.distributions.Distribution`-like instance.
+      prior: `tf.distributions.Distribution`-like instance.
+      posterior_tensor: `Tensor` representing value of posterior.
+
+    Returns:
+      divergence_fn: The Python `callable` specified in `__init__`.
+    """
+    return self._divergence_fn
+
+  @property
+  def divergence(self):
+    """`Tensor` representing KL-divergence from posterior to prior."""
+    return self._divergence
+
+  @divergence.setter
+  def divergence(self, value):
+    """One-time setter of the `divergence`."""
+    if not isinstance(self._divergence, NotSet):
+      raise ValueError("Cannot override already set attribute.")
+    self._divergence = value
+
+  def _init_helper(self):
+    pass
+
+
+class VariationalKernelParameter(VariationalParameter):
+  """Struct-like container of variational kernel properties.
+
+  A `VariationalKernelParameter` is intitialized with Python `callable`s which
+  set the value of correspondingly named members. Corresponding values have "set
+  once" semantics, i.e., once set to any value they are immutable.
+  """
+
+  @property
+  def posterior_affine(self):
+    """`tf.distributions.Distribution` affine transformed posterior."""
+    return self._posterior_affine
+
+  @posterior_affine.setter
+  def posterior_affine(self, value):
+    """One-time setter of `posterior_affine`."""
+    if not isinstance(self._posterior_affine, NotSet):
+      raise ValueError("Cannot override already set attribute.")
+    self._posterior_affine = value
+
+  @property
+  def posterior_affine_tensor(self):
+    """`Tensor` representing the `posterior_affine` distribution."""
+    return self._posterior_affine_tensor
+
+  @posterior_affine_tensor.setter
+  def posterior_affine_tensor(self, value):
+    """One-time setter of the `posterior_affine_tensor`."""
+    if not isinstance(self._posterior_affine_tensor, NotSet):
+      raise ValueError("Cannot override already set attribute.")
+    self._posterior_affine_tensor = value
+
+  def _init_helper(self):
+    self._posterior_affine = NotSet()
+    self._posterior_affine_tensor = NotSet()
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_graph.py b/tensorflow/contrib/bayesflow/python/ops/optimizers.py
similarity index 77%
rename from tensorflow/contrib/bayesflow/python/ops/stochastic_graph.py
rename to tensorflow/contrib/bayesflow/python/ops/optimizers.py
index b8e38b6f9bf86aef42627cf127a93ce2edd42451..ee32e6b5c3d9efaeaf73436638c5eea55f2cfc70 100644
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_graph.py
+++ b/tensorflow/contrib/bayesflow/python/ops/optimizers.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Support for Stochastic Computation Graphs.
+"""Probabilistic optimizer modules.
 
-See the @{$python/contrib.bayesflow.stochastic_graph} guide.
-
-@@surrogate_loss
+See ${python/contrib.bayesflow.optimizers}.
 """
 
 from __future__ import absolute_import
@@ -25,13 +23,12 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.stochastic_graph_impl import *
+from tensorflow.contrib.bayesflow.python.ops.sgld_optimizer import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util.all_util import remove_undocumented
 
-
 _allowed_symbols = [
-    "surrogate_loss"
+    'SGLDOptimizer',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d36ea7a2b51aa45cdc253992a2a58634c068987
--- /dev/null
+++ b/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
@@ -0,0 +1,216 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An optimizer module for stochastic gradient Langevin dynamics."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope as varscope_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class SGLDOptimizer(optimizer.Optimizer):
+  """An optimizer module for stochastic gradient Langevin dynamics.
+
+  This implements the preconditioned Stochastic Gradient Langevin Dynamics
+  optimizer [1]. The optimization variable is regarded as a sample from the
+  posterior under Stochastic Gradient Langevin Dynamics with noise rescaled in
+  each dimension according to RMSProp [2].
+
+  Note: If a prior is included in the loss, it should be scaled by
+  `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
+  in the data.  I.e., it should be divided by the `num_pseudo_batches` term
+  described below.
+
+  [1]: "Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural
+       Networks." Chunyuan Li, Changyou Chen, David Carlson, Lawrence Carin.
+       ArXiv:1512.07666, 2015. https://arxiv.org/abs/1512.07666
+  [2]: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+
+  Args:
+    learning_rate: Scalar `float`-like `Tensor`. The base learning rate for the
+      optimizer. Must be tuned to the specific function being minimized.
+    preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
+      decay rate of the rescaling of the preconditioner (RMSprop). (This is
+      "alpha" in [1]). Should be smaller than but nearly `1` to approximate
+      sampling from the posterior. (Default: `0.95`)
+    num_pseudo_batches: Scalar `int`-like `Tensor`. The effective number of
+      minibatches in the data set.  Trades off noise and prior with the SGD
+      likelihood term. Note: Assumes the loss is taken as the mean over a
+      minibatch. Otherwise if the sum was taken, divide this number by the
+      batch size.  (Default: `1`)
+    burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
+      gradient statistics to update the preconditioner before starting to draw
+      noisy samples. (Default: `25`)
+    diagonal_bias: Scalar `float`-like `Tensor`. Term added to the diagonal of
+      the preconditioner to prevent the preconditioner from degenerating.
+      (Default: `1e-8`)
+    name: Python `str` describing ops managed by this function.
+      (Default: `"SGLDOptimizer"`)
+    variable_scope: Variable scope used for calls to `tf.get_variable`.
+      If `None`, a new variable scope is created using name
+      `ops.get_default_graph().unique_name(name or default_name)`.
+
+  Raises:
+    InvalidArgumentError: If preconditioner_decay_rate is a `Tensor` not in
+      `(0,1]`.
+  """
+
+  def __init__(self,
+               learning_rate,
+               preconditioner_decay_rate=0.95,
+               num_pseudo_batches=1,
+               burnin=25,
+               diagonal_bias=1e-8,
+               name=None,
+               variable_scope=None):
+    default_name = 'SGLDOptimizer'
+    with ops.name_scope(name, default_name, [
+        learning_rate, preconditioner_decay_rate, num_pseudo_batches, burnin,
+        diagonal_bias
+    ]):
+      if variable_scope is None:
+        var_scope_name = ops.get_default_graph().unique_name(
+            name or default_name)
+        with varscope_ops.variable_scope(var_scope_name) as scope:
+          self._variable_scope = scope
+      else:
+        self._variable_scope = variable_scope
+
+      self._preconditioner_decay_rate = ops.convert_to_tensor(
+          preconditioner_decay_rate, name='preconditioner_decay_rate')
+      self._num_pseudo_batches = ops.convert_to_tensor(
+          num_pseudo_batches, name='num_pseudo_batches')
+      self._burnin = ops.convert_to_tensor(burnin, name='burnin')
+      self._diagonal_bias = ops.convert_to_tensor(
+          diagonal_bias, name='diagonal_bias')
+      self._learning_rate = ops.convert_to_tensor(
+          learning_rate, name='learning_rate')
+
+      with varscope_ops.variable_scope(self._variable_scope):
+        self._counter = varscope_ops.get_variable(
+            'counter', initializer=0, trainable=False)
+
+      self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._preconditioner_decay_rate,
+              message='`preconditioner_decay_rate` must be non-negative'),
+          check_ops.assert_less_equal(
+              self._preconditioner_decay_rate,
+              1.,
+              message='`preconditioner_decay_rate` must be at most 1.'),
+      ], self._preconditioner_decay_rate)
+
+      self._num_pseudo_batches = control_flow_ops.with_dependencies([
+          check_ops.assert_greater(
+              self._num_pseudo_batches,
+              0,
+              message='`num_pseudo_batches` must be greater than zero')
+      ], self._num_pseudo_batches)
+
+      self._burnin = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._burnin, message='`burnin` must be non-negative'),
+          check_ops.assert_integer(
+              self._burnin, message='`burnin` must be an integer')
+      ], self._burnin)
+
+      self._diagonal_bias = control_flow_ops.with_dependencies([
+          check_ops.assert_non_negative(
+              self._diagonal_bias,
+              message='`diagonal_bias` must be non-negative')
+      ], self._diagonal_bias)
+
+      super(SGLDOptimizer, self).__init__(use_locking=False,
+                                          name=name or default_name)
+
+  def _create_slots(self, var_list):
+    for v in var_list:
+      init_rms = init_ops.ones_initializer(dtype=v.dtype)
+      self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
+                                              v.dtype, 'rms', self._name)
+
+  def _prepare(self):
+    # We need to put the conversion and check here because a user will likely
+    # want to decay the learning rate dynamically.
+    self._learning_rate_tensor = control_flow_ops.with_dependencies([
+        check_ops.assert_non_negative(
+            self._learning_rate, message='`learning_rate` must be non-negative')
+    ], ops.convert_to_tensor(self._learning_rate, name='learning_rate_tensor'))
+    self._decay_tensor = ops.convert_to_tensor(
+        self._preconditioner_decay_rate, name='preconditioner_decay_rate')
+
+    super(SGLDOptimizer, self)._prepare()
+
+  def _apply_dense(self, grad, var):
+    rms = self.get_slot(var, 'rms')
+
+    with ops.control_dependencies([
+        self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
+                                                       var.dtype.base_dtype))]):
+      new_grad = self._apply_noisy_update(rms, grad)
+
+    return training_ops.apply_gradient_descent(
+        var,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        new_grad,
+        use_locking=self._use_locking).op
+
+  def _apply_sparse(self, grad, var):
+    rms = self.get_slot(var, 'rms')
+
+    with ops.control_dependencies([
+        self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
+                                                       var.dtype.base_dtype))]):
+      new_grad = self._apply_noisy_update(rms, grad)
+
+    return training_ops.apply_gradient_descent(
+        var,
+        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
+        new_grad,
+        use_locking=self._use_locking).op
+
+  @property
+  def variable_scope(self):
+    """Variable scope of all calls to `tf.get_variable`."""
+    return self._variable_scope
+
+  def _apply_noisy_update(self, mom, grad):
+    # Compute and apply the gradient update following
+    # preconditioned Langevin dynamics
+    stddev = array_ops.where(
+        array_ops.squeeze(self._counter > self._burnin),
+        math_ops.cast(math_ops.rsqrt(self._learning_rate), grad.dtype),
+        array_ops.zeros([], grad.dtype))
+
+    preconditioner = math_ops.rsqrt(
+        mom + math_ops.cast(self._diagonal_bias, grad.dtype))
+    return (
+        0.5 * preconditioner * grad * math_ops.cast(self._num_pseudo_batches,
+                                                    grad.dtype) +
+        random_ops.random_normal(array_ops.shape(grad), 1.0, dtype=grad.dtype) *
+        stddev * math_ops.sqrt(preconditioner))
+
+  def _update_momentum(self, mom, grad, decay):
+    # Keep an exponentially weighted moving average of squared gradients.
+    # Not thread safe
+    return mom.assign_add((1.0 - decay) * (math_ops.square(grad) - mom))
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
deleted file mode 100644
index 695310837e0f6a58842f45c28608f12fbe162c6e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Stochastic gradient estimators.
-
-These functions are meant to be used in conjunction with `StochasticTensor`
-(`loss_fn` parameter) and `surrogate_loss`.
-
-See Gradient Estimation Using Stochastic Computation Graphs
-(http://arxiv.org/abs/1506.05254) by Schulman et al., eq. 1 and section 4, for
-mathematical details.
-
-## Score function estimator
-
-The score function is an unbiased estimator of the gradient of `E_p(x)[f(x)]`,
-where `f(x)` can be considered to be a "loss" term. It is computed as
-`E_p(x)[f(x) grad(log p(x))]`. A constant `b`, referred to here as the
-"baseline", can be subtracted from `f(x)` without affecting the expectation. The
-term `(f(x) - b)` is referred to here as the "advantage".
-
-Note that the methods defined in this module actually compute the integrand of
-the score function, such that when taking the gradient, the true score function
-is computed.
-
-@@score_function
-@@get_score_function_with_baseline
-@@get_score_function_with_constant_baseline
-@@get_score_function_with_advantage
-
-## Baseline functions
-
-Baselines reduce the variance of Monte Carlo estimate of an expectation. The
-baseline for a stochastic node can be a function of all non-influenced nodes
-(see section 4 of Schulman et al., linked above). Baselines are also known as
-"control variates."
-
-In the context of a MC estimate of `E_p(x)[f(x) - b]`, baseline functions have
-the signature `(st, fx) => Tensor`, where `st` is a `StochasticTensor` backed by
-the distribution `p(x)` and `fx` is the influenced loss.
-
-@@get_mean_baseline
-
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import training
-from tensorflow.python.util.all_util import make_all
-
-
-def score_function(stochastic_tensor, value, loss, baseline=None,
-                   name="ScoreFunction"):
-  """Score function estimator.
-
-  Computes the integrand of the score function with a baseline:
-  `p.log_prob(value) * (loss - baseline)`.
-
-  It will add a `stop_gradient` to the advantage `(loss - baseline)`.
-
-  Args:
-    stochastic_tensor: `StochasticTensor` p(x).
-    value: `Tensor` x. Samples from p(x).
-    loss: `Tensor`.
-    baseline: `Tensor` broadcastable to `loss`.
-    name: name to prepend ops with.
-
-  Returns:
-    `Tensor` `p.log_prob(x) * (loss - b)`. Taking the gradient yields the score
-    function estimator.
-  """
-  with ops.name_scope(name, values=[value, loss, baseline]):
-    value = ops.convert_to_tensor(value)
-    loss = ops.convert_to_tensor(loss)
-    if baseline is not None:
-      baseline = ops.convert_to_tensor(baseline)
-      advantage = loss - baseline
-    else:
-      advantage = loss
-
-    advantage = array_ops.stop_gradient(advantage)
-    return stochastic_tensor.distribution.log_prob(value) * advantage
-
-
-def get_score_function_with_advantage(advantage_fn=None,
-                                      name="ScoreFunctionWithAdvantage"):
-  """Score function estimator with advantage function.
-
-  Args:
-    advantage_fn: callable that takes the `StochasticTensor` and the
-      downstream `loss` and returns a `Tensor` advantage
-      (e.g. `loss - baseline`).
-    name: name to prepend ops with.
-
-  Returns:
-    Callable score function estimator that takes the `StochasticTensor`, the
-    sampled `value`, and the downstream `loss`, and uses the provided advantage.
-  """
-
-  def score_function_with_advantage(stochastic_tensor, value, loss):
-    with ops.name_scope(name, values=[value, loss]):
-      advantage = advantage_fn(stochastic_tensor, loss)
-      advantage = array_ops.stop_gradient(advantage)
-      return stochastic_tensor.distribution.log_prob(value) * advantage
-
-  return score_function_with_advantage
-
-
-def get_score_function_with_constant_baseline(baseline, name="ScoreFunction"):
-  """Score function estimator with constant baseline.
-
-  Args:
-    baseline: `Tensor` to be subtracted from loss.
-    name: name to prepend ops with.
-
-  Returns:
-    Callable score function estimator that takes the `StochasticTensor`, the
-    sampled `value`, and the downstream `loss`, and subtracts the provided
-    `baseline` from the `loss`.
-  """
-
-  def score_function_with_constant_baseline(stochastic_tensor, value, loss):
-    return score_function(stochastic_tensor, value, loss, baseline, name)
-
-  return score_function_with_constant_baseline
-
-
-def get_score_function_with_baseline(baseline_fn=None, name="ScoreFunction"):
-  """Score function estimator with baseline function.
-
-  Args:
-    baseline_fn: callable that takes the `StochasticTensor` and the downstream
-      `loss` and returns a `Tensor` baseline to be subtracted from the `loss`.
-      If None, defaults to `get_mean_baseline`, which is an EMA of the loss.
-    name: name to prepend ops with.
-
-  Returns:
-    Callable score function estimator that takes the `StochasticTensor`, the
-    sampled `value`, and the downstream `loss`, and subtracts the provided
-    `baseline` from the `loss`.
-  """
-  if baseline_fn is None:
-    baseline_fn = get_mean_baseline()
-
-  def score_function_with_baseline(stochastic_tensor, value, loss):
-    with ops.name_scope(name):
-      b = baseline_fn(stochastic_tensor, loss)
-      return score_function(stochastic_tensor, value, loss, b)
-
-  return score_function_with_baseline
-
-
-def get_mean_baseline(ema_decay=0.99, name=None):
-  """ExponentialMovingAverage baseline.
-
-  Args:
-    ema_decay: decay rate for the ExponentialMovingAverage.
-    name: name for variable scope of the ExponentialMovingAverage.
-
-  Returns:
-    Callable baseline function that takes the `StochasticTensor` (unused) and
-    the downstream `loss`, and returns an EMA of the loss.
-  """
-
-  def mean_baseline(_, loss):
-    with vs.variable_scope(name, default_name="MeanBaseline"):
-      reduced_loss = math_ops.reduce_mean(loss)
-
-      ema = training.ExponentialMovingAverage(decay=ema_decay, zero_debias=True)
-      update_op = ema.apply([reduced_loss])
-
-      with ops.control_dependencies([update_op]):
-        # Using `identity` causes an op to be added in this context, which
-        # triggers the update. Removing the `identity` means nothing is updated.
-        baseline = array_ops.identity(ema.average(reduced_loss))
-
-      return baseline
-
-  return mean_baseline
-
-
-def get_vimco_advantage_fn(have_log_loss=False):
-  """VIMCO (Variational Inference for Monte Carlo Objectives) baseline.
-
-  Implements VIMCO baseline from the article of the same name:
-
-  https://arxiv.org/pdf/1602.06725v2.pdf
-
-  Given a `loss` tensor (containing non-negative probabilities or ratios),
-  calculates the advantage VIMCO advantage via Eq. 9 of the above paper.
-
-  The tensor `loss` should be shaped `[n, ...]`, with rank at least 1.  Here,
-  the first axis is considered the single sampling dimension and `n` must
-  be at least 2.  Specifically, the `StochasticTensor` is assumed to have
-  used the `SampleValue(n)` value type with `n > 1`.
-
-  Args:
-    have_log_loss: Python `Boolean`.  If `True`, the loss is assumed to be the
-      log loss.  If `False` (the default), it is assumed to be a nonnegative
-      probability or probability ratio.
-
-  Returns:
-    Callable baseline function that takes the `StochasticTensor` (unused) and
-    the downstream `loss`, and returns the VIMCO baseline for the loss.
-  """
-  def vimco_advantage_fn(_, loss, name=None):
-    """Internal VIMCO function.
-
-    Args:
-      _: ignored `StochasticTensor`.
-      loss: The loss `Tensor`.
-      name: Python string, the name scope to use.
-
-    Returns:
-      The advantage `Tensor`.
-    """
-    with ops.name_scope(name, "VIMCOAdvantage", values=[loss]):
-      loss = ops.convert_to_tensor(loss)
-      loss_shape = loss.get_shape()
-      loss_num_elements = loss_shape[0].value
-      n = math_ops.cast(
-          loss_num_elements or array_ops.shape(loss)[0], dtype=loss.dtype)
-
-      if have_log_loss:
-        log_loss = loss
-      else:
-        log_loss = math_ops.log(loss)
-
-      # Calculate L_hat, Eq. (4) -- stably
-      log_mean = math_ops.reduce_logsumexp(log_loss, [0]) - math_ops.log(n)
-
-      # expand_dims: Expand shape [a, b, c] to [a, 1, b, c]
-      log_loss_expanded = array_ops.expand_dims(log_loss, [1])
-
-      # divide: log_loss_sub with shape [a, a, b, c], where
-      #
-      #  log_loss_sub[i] = log_loss - log_loss[i]
-      #
-      #       = [ log_loss[j] - log_loss[i] for rows j = 0 ... i - 1     ]
-      #         [ zeros                                                  ]
-      #         [ log_loss[j] - log_loss[i] for rows j = i + 1 ... a - 1 ]
-      #
-      log_loss_sub = log_loss - log_loss_expanded
-
-      # reduce_sum: Sums each row across all the sub[i]'s; result is:
-      #   reduce_sum[j] = (n - 1) * log_loss[j] - (sum_{i != j} loss[i])
-      # divide by (n - 1) to get:
-      #   geometric_reduction[j] =
-      #     log_loss[j] - (sum_{i != j} log_loss[i]) / (n - 1)
-      geometric_reduction = math_ops.reduce_sum(log_loss_sub, [0]) / (n - 1)
-
-      # subtract this from the original log_loss to get the baseline:
-      #   geometric_mean[j] = exp((sum_{i != j} log_loss[i]) / (n - 1))
-      log_geometric_mean = log_loss - geometric_reduction
-
-      ## Equation (9)
-
-      # Calculate sum_{i != j} loss[i] -- via exp(reduce_logsumexp(.))
-      # reduce_logsumexp: log-sum-exp each row across all the
-      # -sub[i]'s, result is:
-      #
-      #  exp(reduce_logsumexp[j]) =
-      #    1 + sum_{i != j} exp(log_loss[i] - log_loss[j])
-      log_local_learning_reduction = math_ops.reduce_logsumexp(
-          -log_loss_sub, [0])
-
-      # convert local_learning_reduction to the sum-exp of the log-sum-exp
-      #  (local_learning_reduction[j] - 1) * exp(log_loss[j])
-      #    = sum_{i != j} exp(log_loss[i])
-      local_learning_log_sum = (
-          _logexpm1(log_local_learning_reduction) + log_loss)
-
-      # Add (logaddexp) the local learning signals (Eq. 9)
-      local_learning_signal = (
-          math_ops.reduce_logsumexp(
-              array_ops.stack((local_learning_log_sum, log_geometric_mean)),
-              [0])
-          - math_ops.log(n))
-
-      advantage = log_mean - local_learning_signal
-
-      return advantage
-
-  return vimco_advantage_fn
-
-
-def _logexpm1(x):
-  """Stably calculate log(exp(x)-1)."""
-  with ops.name_scope("logsumexp1"):
-    eps = np.finfo(x.dtype.as_numpy_dtype).eps
-    # Choose a small offset that makes gradient calculations stable for
-    # float16, float32, and float64.
-    safe_log = lambda y: math_ops.log(y + eps / 1e8)  # For gradient stability
-    return array_ops.where(
-        math_ops.abs(x) < eps,
-        safe_log(x) + x/2 + x*x/24,  # small x approximation to log(expm1(x))
-        safe_log(math_ops.exp(x) - 1))
-
-
-__all__ = make_all(__name__)
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_graph_impl.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_graph_impl.py
deleted file mode 100644
index b2338bca8c94e0c7c44182f3f6bba7d7e79595e1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_graph_impl.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes and helper functions for Stochastic Computation Graphs.
-
-## Stochastic Computation Graph Helper Functions
-
-@@surrogate_loss
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-
-
-def _upstream_stochastic_nodes(tensors):
-  """Map tensors to the stochastic tensors upstream of them.
-
-  Args:
-    tensors: a list of Tensors.
-
-  Returns:
-    A dict that maps the tensors passed in to the `StochasticTensor` objects
-    upstream of them.
-  """
-  reverse_map = _stochastic_dependencies_map(tensors)
-  upstream = collections.defaultdict(set)
-  for st, ts in reverse_map.items():
-    for t in ts:
-      upstream[t].add(st)
-  return upstream
-
-
-def _stochastic_dependencies_map(fixed_losses, stochastic_tensors=None):
-  """Map stochastic tensors to the fixed losses that depend on them.
-
-  Args:
-    fixed_losses: a list of `Tensor`s.
-    stochastic_tensors: a list of `StochasticTensor`s to map to fixed losses.
-      If `None`, all `StochasticTensor`s in the graph will be used.
-
-  Returns:
-    A dict `dependencies` that maps `StochasticTensor` objects to subsets of
-    `fixed_losses`.
-
-    If `loss in dependencies[st]`, for some `loss` in `fixed_losses` then there
-    is a direct path from `st.value()` to `loss` in the graph.
-  """
-  stoch_value_collection = stochastic_tensors or ops.get_collection(
-      stochastic_tensor_impl.STOCHASTIC_TENSOR_COLLECTION)
-
-  if not stoch_value_collection:
-    return {}
-
-  stoch_value_map = dict(
-      (node.value(), node) for node in stoch_value_collection)
-
-  # Step backwards through the graph to see which surrogate losses correspond
-  # to which fixed_losses.
-  #
-  # TODO(ebrevdo): Ensure that fixed_losses and stochastic values are in the
-  # same frame.
-  stoch_dependencies_map = collections.defaultdict(set)
-  for loss in fixed_losses:
-    boundary = set([loss])
-    while boundary:
-      edge = boundary.pop()
-      edge_stoch_node = stoch_value_map.get(edge, None)
-      if edge_stoch_node:
-        stoch_dependencies_map[edge_stoch_node].add(loss)
-      boundary.update(edge.op.inputs)
-
-  return stoch_dependencies_map
-
-
-def surrogate_loss(sample_losses,
-                   stochastic_tensors=None,
-                   name="SurrogateLoss"):
-  """Surrogate loss for stochastic graphs.
-
-  This function will call `loss_fn` on each `StochasticTensor`
-  upstream of `sample_losses`, passing the losses that it influenced.
-
-  Note that currently `surrogate_loss` does not work with `StochasticTensor`s
-  instantiated in `while_loop`s or other control structures.
-
-  Args:
-    sample_losses: a list or tuple of final losses. Each loss should be per
-      example in the batch (and possibly per sample); that is, it should have
-      dimensionality of 1 or greater. All losses should have the same shape.
-    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
-      If None, defaults to all `StochasticTensor`s in the graph upstream of
-      the `Tensor`s in `sample_losses`.
-    name: the name with which to prepend created ops.
-
-  Returns:
-    `Tensor` loss, which is the sum of `sample_losses` and the
-    `loss_fn`s returned by the `StochasticTensor`s.
-
-  Raises:
-    TypeError: if `sample_losses` is not a list or tuple, or if its elements
-      are not `Tensor`s.
-    ValueError: if any loss in `sample_losses` does not have dimensionality 1
-      or greater.
-  """
-  with ops.name_scope(name, values=sample_losses):
-    if not isinstance(sample_losses, (list, tuple)):
-      raise TypeError("sample_losses must be a list or tuple")
-    for loss in sample_losses:
-      if not isinstance(loss, ops.Tensor):
-        raise TypeError("loss is not a Tensor: %s" % loss)
-      ndims = loss.get_shape().ndims
-      if not (ndims is not None and ndims >= 1):
-        raise ValueError("loss must have dimensionality 1 or greater: %s" %
-                         loss)
-
-    stoch_dependencies_map = _stochastic_dependencies_map(
-        sample_losses, stochastic_tensors=stochastic_tensors)
-    if not stoch_dependencies_map:
-      logging.warn(
-          "No collection of Stochastic Tensors found for current graph.")
-      return math_ops.add_n(sample_losses)
-
-    # Iterate through all of the stochastic dependencies, adding
-    # surrogate terms where necessary.
-    sample_losses = [ops.convert_to_tensor(loss) for loss in sample_losses]
-    loss_terms = sample_losses
-    for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
-      dependent_losses = list(dependent_losses)
-
-      logging.info("Losses influenced by StochasticTensor %s: [%s]",
-                   stoch_node.name, ", ".join(
-                       [loss.name for loss in dependent_losses]))
-
-      # Sum up the downstream losses for this ST
-      influenced_loss = _add_n_or_sum(dependent_losses)
-
-      # Compute surrogate loss term
-      loss_term = stoch_node.loss(array_ops.stop_gradient(influenced_loss))
-      if loss_term is not None:
-        loss_terms.append(loss_term)
-
-    return _add_n_or_sum(loss_terms)
-
-
-def _add_n_or_sum(terms):
-  # add_n works for Tensors of the same dtype and shape
-  shape = terms[0].get_shape()
-  dtype = terms[0].dtype
-
-  if all(term.get_shape().is_fully_defined() and
-         term.get_shape().is_compatible_with(shape) and term.dtype == dtype
-         for term in terms):
-    return math_ops.add_n(terms)
-  else:
-    return sum(terms)
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
deleted file mode 100644
index ce5fdd98c69ca6b3482bfafa8859accdf8a78749..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
+++ /dev/null
@@ -1,477 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes and helper functions for creating Stochastic Tensors.
-
-`StochasticTensor` objects wrap `Distribution` objects.  Their
-values may be samples from the underlying distribution, or the distribution
-mean (as governed by `value_type`).  These objects provide a `loss`
-method for use when sampling from a non-reparameterized distribution.
-The `loss`method is used in conjunction with `stochastic_graph.surrogate_loss`
-to produce a single differentiable loss in stochastic graphs having
-both continuous and discrete stochastic nodes.
-
-## Stochastic Tensor Classes
-
-@@BaseStochasticTensor
-@@StochasticTensor
-
-## Stochastic Tensor Value Types
-
-@@MeanValue
-@@SampleValue
-
-@@value_type
-@@get_current_value_type
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import collections
-import contextlib
-import threading
-
-import six
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators as sge
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.distributions import distribution
-
-STOCHASTIC_TENSOR_COLLECTION = "_stochastic_tensor_collection_"
-
-
-@six.add_metaclass(abc.ABCMeta)
-class BaseStochasticTensor(object):
-  """Base Class for Tensor-like objects that emit stochastic values."""
-
-  def __init__(self):
-    # Add self to this graph's Stochsatic Tensor collection for
-    # purposes of later performing correct surrogate loss calculation.
-    ops.add_to_collection(STOCHASTIC_TENSOR_COLLECTION, self)
-
-  @abc.abstractproperty
-  def name(self):
-    pass
-
-  @abc.abstractproperty
-  def dtype(self):
-    pass
-
-  @abc.abstractproperty
-  def graph(self):
-    pass
-
-  @abc.abstractmethod
-  def value(self, name=None):
-    pass
-
-  @abc.abstractmethod
-  def loss(self, sample_loss):
-    """Returns the term to add to the surrogate loss.
-
-    This method is called by `surrogate_loss`.  The input `sample_loss` should
-    have already had `stop_gradient` applied to it.  This is because the
-    surrogate_loss usually provides a Monte Carlo sample term of the form
-    `differentiable_surrogate * sample_loss` where `sample_loss` is considered
-    constant with respect to the input for purposes of the gradient.
-
-    Args:
-      sample_loss: `Tensor`, sample loss downstream of this `StochasticTensor`.
-
-    Returns:
-      Either `None` or a `Tensor`.
-    """
-    raise NotImplementedError("surrogate_loss not implemented")
-
-  @staticmethod
-  def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
-    _ = name
-    if dtype and not dtype.is_compatible_with(v.dtype):
-      raise ValueError(
-          "Incompatible type conversion requested to type '%s' for variable "
-          "of type '%s'" % (dtype.name, v.dtype.name))
-    if as_ref:
-      raise ValueError("%s: Ref type is not supported." % v)
-    return v.value()
-
-
-# pylint: disable=protected-access
-ops.register_tensor_conversion_function(
-    BaseStochasticTensor, BaseStochasticTensor._tensor_conversion_function)
-
-# pylint: enable=protected-access
-
-
-class _StochasticValueType(object):
-  """Interface for the ValueType classes.
-
-  This is the base class for MeanValue, SampleValue, and their descendants.
-  """
-
-  def pushed_above(self, unused_value_type):
-    pass
-
-  def popped_above(self, unused_value_type):
-    pass
-
-  def declare_inputs(self, unused_stochastic_tensor, unused_inputs_dict):
-    pass
-
-  @abc.abstractproperty
-  def stop_gradient(self):
-    """Whether the value should be wrapped in stop_gradient.
-
-    StochasticTensors must respect this property.
-    """
-    pass
-
-
-class MeanValue(_StochasticValueType):
-
-  def __init__(self, stop_gradient=False):
-    self._stop_gradient = stop_gradient
-
-  @property
-  def stop_gradient(self):
-    return self._stop_gradient
-
-
-class SampleValue(_StochasticValueType):
-  """Draw samples, possibly adding new outer dimensions along the way.
-
-  This ValueType draws samples from StochasticTensors run within its
-  context, increasing the rank according to the requested shape.
-
-  Examples:
-
-  ```python
-  mu = tf.zeros((2,3))
-  sigma = tf.ones((2, 3))
-  with sg.value_type(sg.SampleValue()):
-    st = sg.StochasticTensor(
-      tf.contrib.distributions.Normal, mu=mu, sigma=sigma)
-  # draws 1 sample and does not reshape
-  assertEqual(st.value().get_shape(), (2, 3))
-  ```
-
-  ```python
-  mu = tf.zeros((2,3))
-  sigma = tf.ones((2, 3))
-  with sg.value_type(sg.SampleValue(4)):
-    st = sg.StochasticTensor(
-      tf.contrib.distributions.Normal, mu=mu, sigma=sigma)
-  # draws 4 samples each with shape (2, 3) and concatenates
-  assertEqual(st.value().get_shape(), (4, 2, 3))
-  ```
-  """
-
-  def __init__(self, shape=(), stop_gradient=False):
-    """Sample according to shape.
-
-    For the given StochasticTensor `st` using this value type,
-    the shape of `st.value()` will match that of
-    `st.distribution.sample(shape)`.
-
-    Args:
-      shape: A shape tuple or int32 tensor.  The sample shape.
-        Default is a scalar: take one sample and do not change the size.
-      stop_gradient: If `True`, StochasticTensors' values are wrapped in
-        `stop_gradient`, to avoid backpropagation through.
-    """
-    self._shape = shape
-    self._stop_gradient = stop_gradient
-
-  @property
-  def shape(self):
-    return self._shape
-
-  @property
-  def stop_gradient(self):
-    return self._stop_gradient
-
-
-# Keeps track of how a StochasticTensor's value should be accessed.
-# Used by value_type and get_current_value_type below.
-_STOCHASTIC_VALUE_STACK = collections.defaultdict(list)
-
-
-@contextlib.contextmanager
-def value_type(dist_value_type):
-  """Creates a value type context for any StochasticTensor created within.
-
-  Typical usage:
-
-  ```
-  with sg.value_type(sg.MeanValue(stop_gradients=True)):
-    st = sg.StochasticTensor(tf.contrib.distributions.Normal, mu=mu,
-                             sigma=sigma)
-  ```
-
-  In the example above, `st.value()` (or equivalently, `tf.identity(st)`) will
-  be the mean value of the Normal distribution, i.e., `mu` (possibly
-  broadcasted to the shape of `sigma`).  Furthermore, because the `MeanValue`
-  was marked with `stop_gradients=True`, this value will have been wrapped
-  in a `stop_gradients` call to disable any possible backpropagation.
-
-  Args:
-    dist_value_type: An instance of `MeanValue`, `SampleValue`, or
-      any other stochastic value type.
-
-  Yields:
-    A context for `StochasticTensor` objects that controls the
-    value created when they are initialized.
-
-  Raises:
-    TypeError: if `dist_value_type` is not an instance of a stochastic value
-      type.
-  """
-  if not isinstance(dist_value_type, _StochasticValueType):
-    raise TypeError("dist_value_type must be a Distribution Value Type")
-  thread_id = threading.current_thread().ident
-  stack = _STOCHASTIC_VALUE_STACK[thread_id]
-  if stack:
-    stack[-1].pushed_above(dist_value_type)
-  stack.append(dist_value_type)
-  yield
-  stack.pop()
-  if stack:
-    stack[-1].popped_above(dist_value_type)
-
-
-class NoValueTypeSetError(ValueError):
-  pass
-
-
-def get_current_value_type():
-  thread_id = threading.current_thread().ident
-  if not _STOCHASTIC_VALUE_STACK[thread_id]:
-    raise NoValueTypeSetError(
-        "No value type currently set for this thread (%s).  Did you forget to "
-        "wrap 'with stochastic_graph.value_type(...)'?" % thread_id)
-  return _STOCHASTIC_VALUE_STACK[thread_id][-1]
-
-
-class StochasticTensor(BaseStochasticTensor):
-  """StochasticTensor is a BaseStochasticTensor backed by a distribution."""
-
-  def __init__(self,
-               dist,
-               name="StochasticTensor",
-               dist_value_type=None,
-               loss_fn=sge.score_function):
-    """Construct a `StochasticTensor`.
-
-    `StochasticTensor` is backed by the `dist` distribution and its `value`
-    method will return the same value each time it is called. What `value` is
-    returned is controlled by the `dist_value_type` (defaults to
-    `SampleValue`).
-
-    Some distributions' sample functions are not differentiable (e.g. a sample
-    from a discrete distribution like a Bernoulli) and so to differentiate
-    wrt parameters upstream of the sample requires a gradient estimator like
-    the score function estimator. This is accomplished by passing a
-    differentiable `loss_fn` to the `StochasticTensor`, which
-    defaults to a function whose derivative is the score function estimator.
-    Calling `stochastic_graph.surrogate_loss(final_losses)` will call
-    `loss()` on every `StochasticTensor` upstream of final losses.
-
-    `loss()` will return None for `StochasticTensor`s backed by
-    reparameterized distributions; it will also return None if the value type is
-    `MeanValueType` or if `loss_fn=None`.
-
-    Args:
-      dist: an instance of `Distribution`.
-      name: a name for this `StochasticTensor` and its ops.
-      dist_value_type: a `_StochasticValueType`, which will determine what the
-          `value` of this `StochasticTensor` will be. If not provided, the
-          value type set with the `value_type` context manager will be used.
-      loss_fn: callable that takes
-          `(st, st.value(), influenced_loss)`, where
-          `st` is this `StochasticTensor`, and returns a `Tensor` loss. By
-          default, `loss_fn` is the `score_function`, or more precisely, the
-          integral of the score function, such that when the gradient is taken,
-          the score function results. See the `stochastic_gradient_estimators`
-          module for additional loss functions and baselines.
-
-    Raises:
-      TypeError: if `dist` is not an instance of `Distribution`.
-      TypeError: if `loss_fn` is not `callable`.
-    """
-    if not isinstance(dist, distribution.Distribution):
-      raise TypeError("dist must be an instance of Distribution")
-    if dist_value_type is None:
-      try:
-        self._value_type = get_current_value_type()
-      except NoValueTypeSetError:
-        self._value_type = SampleValue()
-    else:
-      # We want to enforce a value type here, but use the value_type()
-      # context manager to enforce some error checking.
-      with value_type(dist_value_type):
-        self._value_type = get_current_value_type()
-
-    if loss_fn is not None and not callable(loss_fn):
-      raise TypeError("loss_fn must be callable")
-    self._loss_fn = loss_fn
-
-    with ops.name_scope(name) as scope:
-      self._name = scope
-      self._dist = dist
-      self._value = self._create_value()
-
-    super(StochasticTensor, self).__init__()
-
-  @property
-  def value_type(self):
-    return self._value_type
-
-  @property
-  def distribution(self):
-    return self._dist
-
-  def _create_value(self):
-    """Create the value Tensor based on the value type, store as self._value."""
-
-    if isinstance(self._value_type, MeanValue):
-      value_tensor = self._dist.mean()
-    elif isinstance(self._value_type, SampleValue):
-      value_tensor = self._dist.sample(self._value_type.shape)
-    else:
-      raise TypeError("Unrecognized Distribution Value Type: %s",
-                      self._value_type)
-
-    if self._value_type.stop_gradient:
-      # stop_gradient is being enforced by the value type
-      return array_ops.stop_gradient(value_tensor)
-
-    if isinstance(self._value_type, MeanValue):
-      return value_tensor  # Using pathwise-derivative for this one.
-    if self._dist.reparameterization_type == distribution.FULLY_REPARAMETERIZED:
-      return value_tensor  # Using pathwise-derivative for this one.
-    else:
-      # Will have to perform some variant of score function
-      # estimation.  Call stop_gradient on the sampler just in case we
-      # may accidentally leak some gradient from it.
-      return array_ops.stop_gradient(value_tensor)
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def graph(self):
-    return self._value.graph
-
-  @property
-  def dtype(self):
-    return self._dist.dtype
-
-  def entropy(self, name="entropy"):
-    return self._dist.entropy(name=name)
-
-  def mean(self, name="mean"):
-    return self._dist.mean(name=name)
-
-  def value(self, name="value"):
-    return self._value
-
-  def loss(self, final_loss, name="Loss"):
-    # Return a loss based on final_loss and the distribution. Returns
-    # None if pathwise derivatives are supported, if the loss_fn
-    # was explicitly set to None, or if the value type is MeanValue.
-    if self._loss_fn is None:
-      return None
-
-    if (self._dist.reparameterization_type == distribution.FULLY_REPARAMETERIZED
-        and not self._value_type.stop_gradient):
-      # Can perform pathwise-derivative on this one; no additional loss needed.
-      return None
-
-    with ops.name_scope(self.name, values=[final_loss]):
-      with ops.name_scope(name):
-        if (self._value_type.stop_gradient or
-            isinstance(self._value_type, SampleValue)):
-          return self._loss_fn(self, self._value, final_loss)
-        elif isinstance(self._value_type, MeanValue):
-          return None  # MeanValue generally provides its own gradient
-        else:
-          raise TypeError("Unrecognized Distribution Value Type: %s",
-                          self._value_type)
-
-
-class ObservedStochasticTensor(StochasticTensor):
-  """A StochasticTensor with an observed value."""
-
-  # pylint: disable=super-init-not-called
-  def __init__(self, dist, value, name=None):
-    """Construct an `ObservedStochasticTensor`.
-
-    `ObservedStochasticTensor` is backed by distribution `dist` and uses the
-    provided value instead of using the current value type to draw a value from
-    the distribution. The provided value argument must be appropriately shaped
-    to have come from the distribution.
-
-    Args:
-      dist: an instance of `Distribution`.
-      value: a Tensor containing the observed value
-      name: a name for this `ObservedStochasticTensor` and its ops.
-
-    Raises:
-      TypeError: if `dist` is not an instance of `Distribution`.
-      ValueError: if `value` is not compatible with the distribution.
-    """
-    if not isinstance(dist, distribution.Distribution):
-      raise TypeError("dist must be an instance of Distribution")
-    with ops.name_scope(name, "ObservedStochasticTensor", [value]) as scope:
-      self._name = scope
-      self._dist = dist
-      dist_shape = self._dist.batch_shape.concatenate(
-          self._dist.event_shape)
-      value = ops.convert_to_tensor(value)
-      value_shape = value.get_shape()
-
-      if not value_shape.is_compatible_with(dist_shape):
-        if value_shape.ndims < dist_shape.ndims:
-          raise ValueError(
-              "Rank of observed value (%d) must be >= rank of a sample from the"
-              " distribution (%d)." % (value_shape.ndims, dist_shape.ndims))
-        sample_shape = value_shape[(value_shape.ndims - dist_shape.ndims):]
-        if not sample_shape.is_compatible_with(dist_shape):
-          raise ValueError(
-              "Shape of observed value %s is incompatible with the shape of a "
-              "sample from the distribution %s." % (value_shape, dist_shape))
-      if value.dtype != self._dist.dtype:
-        raise ValueError("Type of observed value (%s) does not match type of "
-                         "distribution (%s)." % (value.dtype, self._dist.dtype))
-      self._value = array_ops.identity(value)
-    # pylint: disable=non-parent-init-called
-    BaseStochasticTensor.__init__(self)
-
-  def loss(self, final_loss, name=None):
-    return None
-
-
-__all__ = [
-    "BaseStochasticTensor",
-    "StochasticTensor",
-    "ObservedStochasticTensor",
-    "MeanValue",
-    "SampleValue",
-    "value_type",
-    "get_current_value_type",
-]
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_variables.py b/tensorflow/contrib/bayesflow/python/ops/stochastic_variables.py
deleted file mode 100644
index e16dbec11a188d42615c4e63d9f93925a6df30a3..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_variables.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Custom `get_variable` for stochastic variables.
-
-@@get_stochastic_variable
-@@make_stochastic_variable_getter
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor as st
-from tensorflow.contrib.bayesflow.python.ops import variational_inference as vi
-
-
-def get_stochastic_variable(getter,
-                            name,
-                            shape=None,
-                            dist_cls=None,
-                            dist_kwargs=None,
-                            param_initializers=None,
-                            prior=None,
-                            **kwargs):
-  """Custom variable getter for stochastic variables.
-
-  `get_stochastic_variable` will create variables backing the parameters of a
-  distribution, defined by `dist_cls`, and return a `StochasticTensor` which
-  represents a sample from the backing distribution.
-
-  Meant to be passed as the `custom_getter` to a `variable_scope`. Use
-  `make_stochastic_variable_getter` to partially apply distribution-related
-  args.
-
-  Usage:
-
-  ```python
-
-  sv = tf.contrib.bayesflow.stochastic_variables
-  dist = tf.contrib.distributions
-
-  with tf.variable_scope('my_scope',
-                         custom_getter=sv.make_stochastic_variable_getter(
-                             dist_cls=dist.NormalWithSoftplusSigma
-                             param_initializers={
-                               "sigma": lambda shape, dtype, pi: (
-                                   tf.constant(0.5, dtype=dtype, shape=shape))
-                             })):
-    v = tf.get_variable('my_var', (10, 20))
-  ```
-
-  `v` is a `StochasticTensor`, which is a sample from a backing
-  `NormalWithSoftplusSigma` distribution. Underneath, 2 variables have been
-  created: `my_var_mu` and `my_var_sigma`. `my_var_sigma` has been appropriately
-  constrained to be positive by the `NormalWithSoftplusSigma` constructor, and
-  initialized to a value of 0.5, which results in a sigma of ~1 after the
-  softplus. The sample will have shape `(10, 20)`.
-
-  Args:
-    getter: original variable getter.
-    name: prefix for variable(s) backing distribution parameters.
-    shape: shape of the sample from the distribution (i.e. shape of the
-        returned `StochasticTensor`).
-    dist_cls: subclass of `Distribution` that implements `param_shapes`. Should
-        accept unconstrained parameters (e.g. `NormalWithSoftplusSigma` accepts
-        real-valued `sigma` and constrains it to be positive with `softplus`).
-    dist_kwargs: `dict` of kwargs to be forwarded to `dist_cls`.
-    param_initializers: `dict` from parameter name to initializer (see
-        `get_variable` for initializer docs). Will override `initializer` in
-        `kwargs`. `param_initializers` may contain initializers for only some of
-        the parameters. Those parameters that do not contain entries will be
-        initialized by `kwargs['initializer']`, if provided; otherwise, the
-        default initialization of `getter` will be used.
-    prior: instance of `Distribution` or a callable
-        `(TensorShape, dtype) => Distribution`. If provided, will be registered
-        as the prior for the `StochasticTensor` using
-        `variational_inference.register_prior`.
-    **kwargs: kwargs forwarded to `getter`.
-
-  Returns:
-    `StochasticTensor`, which represents a sample from the backing distribution.
-  """
-  param_initializers = param_initializers or {}
-  param_shapes = {}
-
-  if shape is not None:
-    param_shapes = dist_cls.param_static_shapes(shape)
-
-  param_names = set(list(param_shapes.keys()) + list(param_initializers.keys()))
-  params = {}
-  for param_name in param_names:
-    # For each parameter, its param_initializer is used, if provided. Otherwise,
-    # kwargs['initializer'] is used. If neither were provided, the default
-    # variable initialization in getter will be used (i.e. getter will be passed
-    # initializer=None.
-    original_initializer = kwargs.pop('initializer', None)
-    param_initializer = param_initializers.get(param_name, None)
-    if param_initializer is None:
-      param_initializer = original_initializer
-
-    if callable(param_initializer) or param_initializer is None:
-      param_shape = param_shapes.get(param_name, None)
-    else:
-      param_shape = None
-
-    params[param_name] = getter(
-        name + '_' + param_name,
-        shape=param_shape,
-        initializer=param_initializer,
-        **kwargs)
-
-  dist_kwargs = dist_kwargs or {}
-  dist_kwargs.update(params)
-  sample = st.StochasticTensor(dist_cls(**dist_kwargs))
-
-  if prior is not None:
-    if callable(prior):
-      sample_value = sample.value()
-      sample_value.get_shape().assert_is_fully_defined()
-      prior = prior(sample_value.get_shape(), sample_value.dtype)
-
-    vi.register_prior(sample, prior)
-
-  return sample
-
-
-def make_stochastic_variable_getter(dist_cls,
-                                    dist_kwargs=None,
-                                    param_initializers=None,
-                                    prior=None):
-  """`get_stochastic_variable` with args partially applied."""
-  return functools.partial(
-      get_stochastic_variable,
-      dist_cls=dist_cls,
-      dist_kwargs=dist_kwargs,
-      param_initializers=param_initializers,
-      prior=prior)
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py b/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
deleted file mode 100644
index 8d932a7c340e21da012d4ab93883735b13e01175..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
+++ /dev/null
@@ -1,327 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Variational inference.
-
-See the ${@python/contrib.bayesflow.variational_inference} guide.
-
-@@elbo
-@@elbo_with_log_joint
-@@ELBOForms
-@@register_prior
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_graph_impl as sg
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl as st
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import distribution
-from tensorflow.python.ops.distributions import kullback_leibler
-from tensorflow.python.platform import tf_logging as logging
-
-VI_PRIORS = "__vi_priors__"
-
-
-def register_prior(variational, prior):
-  """Associate a variational `StochasticTensor` with a `Distribution` prior.
-
-  This is a helper function used in conjunction with `elbo` that allows users
-  to specify the mapping between variational distributions and their priors
-  without having to pass in `variational_with_prior` explicitly.
-
-  Args:
-    variational: `StochasticTensor` q(Z). Approximating distribution.
-    prior: `Distribution` p(Z). Prior distribution.
-
-  Returns:
-    None
-
-  Raises:
-    ValueError: if variational is not a `StochasticTensor` or `prior` is not
-      a `Distribution`.
-  """
-  if not isinstance(variational, st.StochasticTensor):
-    raise TypeError("variational must be a StochasticTensor")
-  if not isinstance(prior, distribution.Distribution):
-    raise TypeError("prior must be a Distribution")
-  ops.add_to_collection(VI_PRIORS, (variational, prior))
-
-
-class _ELBOForm(object):
-  pass
-
-
-class ELBOForms(object):
-  """Constants to control the `elbo` calculation.
-
-  `analytic_kl` uses the analytic KL divergence between the
-  variational distribution(s) and the prior(s).
-
-  `analytic_entropy` uses the analytic entropy of the variational
-  distribution(s).
-
-  `sample` uses the sample KL or the sample entropy is the joint is provided.
-
-  See `elbo` for what is used with `default`.
-  """
-  default, analytic_kl, analytic_entropy, sample = (_ELBOForm()
-                                                    for _ in range(4))
-
-  @staticmethod
-  def check_form(form):
-    if form not in {
-        ELBOForms.default, ELBOForms.analytic_kl, ELBOForms.analytic_entropy,
-        ELBOForms.sample
-    }:
-      raise TypeError("form must be an ELBOForms constant")
-
-
-def elbo(log_likelihood,
-         variational_with_prior=None,
-         keep_batch_dim=True,
-         form=None,
-         name="ELBO"):
-  r"""Evidence Lower BOund. `log p(x) >= ELBO`.
-
-  Optimization objective for inference of hidden variables by variational
-  inference.
-
-  This function is meant to be used in conjunction with `StochasticTensor`.
-  The user should build out the inference network, using `StochasticTensor`s
-  as latent variables, and the generative network. `elbo` at minimum needs
-  `p(x|Z)` and assumes that all `StochasticTensor`s upstream of `p(x|Z)` are
-  the variational distributions. Use `register_prior` to register `Distribution`
-  priors for each `StochasticTensor`. Alternatively, pass in
-  `variational_with_prior` specifying all variational distributions and their
-  priors.
-
-  Mathematical details:
-
-  ```
-  log p(x) =  log \int p(x, Z) dZ
-           =  log \int \frac {q(Z)p(x, Z)}{q(Z)} dZ
-           =  log E_q[\frac {p(x, Z)}{q(Z)}]
-           >= E_q[log \frac {p(x, Z)}{q(Z)}] = L[q; p, x]  # ELBO
-
-  L[q; p, x] = E_q[log p(x|Z)p(Z)] - E_q[log q(Z)]
-             = E_q[log p(x|Z)p(Z)] + H[q]           (1)
-             = E_q[log p(x|Z)] - KL(q || p)         (2)
-
-  H - Entropy
-  KL - Kullback-Leibler divergence
-  ```
-
-  See section 2.2 of Stochastic Variational Inference by Hoffman et al. for
-  more, including the ELBO's equivalence to minimizing `KL(q(Z)||p(Z|x))`
-  in the fully Bayesian setting. https://arxiv.org/pdf/1206.7051.pdf.
-
-  `form` specifies which form of the ELBO is used. `form=ELBOForms.default`
-  tries, in order of preference: analytic KL, analytic entropy, sampling.
-
-  Multiple entries in the `variational_with_prior` dict implies a factorization.
-  e.g. `q(Z) = q(z1)q(z2)q(z3)`.
-
-  Args:
-    log_likelihood: `Tensor` log p(x|Z).
-    variational_with_prior: dict from `StochasticTensor` q(Z) to
-      `Distribution` p(Z). If `None`, defaults to all `StochasticTensor`
-      objects upstream of `log_likelihood` with priors registered with
-      `register_prior`.
-    keep_batch_dim: bool. Whether to keep the batch dimension when summing
-      entropy/KL term. When the sample is per data point, this should be True;
-      otherwise (e.g. in a Bayesian NN), this should be False.
-    form: ELBOForms constant. Controls how the ELBO is computed. Defaults to
-      ELBOForms.default.
-    name: name to prefix ops with.
-
-  Returns:
-    `Tensor` ELBO of the same type and shape as `log_likelihood`.
-
-  Raises:
-    TypeError: if variationals in `variational_with_prior` are not
-      `StochasticTensor`s or if priors are not `Distribution`s.
-    TypeError: if form is not a valid ELBOForms constant.
-    ValueError: if `variational_with_prior` is None and there are no
-      `StochasticTensor`s upstream of `log_likelihood`.
-    ValueError: if any variational does not have a prior passed or registered.
-  """
-  if form is None:
-    form = ELBOForms.default
-  with ops.name_scope(name):
-    model = ops.convert_to_tensor(log_likelihood)
-    variational_with_prior = _find_variational_and_priors(
-        model, variational_with_prior)
-    return _elbo(form, log_likelihood, None, variational_with_prior,
-                 keep_batch_dim)
-
-
-def elbo_with_log_joint(log_joint,
-                        variational=None,
-                        keep_batch_dim=True,
-                        form=None,
-                        name="ELBO"):
-  """Evidence Lower BOund. `log p(x) >= ELBO`.
-
-  This method is for models that have computed `p(x,Z)` instead of `p(x|Z)`.
-  See `elbo` for further details.
-
-  Because only the joint is specified, analytic KL is not available.
-
-  Args:
-    log_joint: `Tensor` log p(x, Z).
-    variational: list of `StochasticTensor` q(Z). If `None`, defaults to all
-      `StochasticTensor` objects upstream of `log_joint`.
-    keep_batch_dim: bool. Whether to keep the batch dimension when summing
-      entropy term. When the sample is per data point, this should be True;
-      otherwise (e.g. in a Bayesian NN), this should be False.
-    form: ELBOForms constant. Controls how the ELBO is computed. Defaults to
-      ELBOForms.default.
-    name: name to prefix ops with.
-
-  Returns:
-    `Tensor` ELBO of the same type and shape as `log_joint`.
-
-  Raises:
-    TypeError: if variationals in `variational` are not `StochasticTensor`s.
-    TypeError: if form is not a valid ELBOForms constant.
-    ValueError: if `variational` is None and there are no `StochasticTensor`s
-      upstream of `log_joint`.
-    ValueError: if form is ELBOForms.analytic_kl.
-  """
-  if form is None:
-    form = ELBOForms.default
-  if form == ELBOForms.analytic_kl:
-    raise ValueError("ELBOForms.analytic_kl is not available when using "
-                     "elbo_with_log_joint. Use elbo or a different form.")
-
-  with ops.name_scope(name):
-    model = ops.convert_to_tensor(log_joint)
-
-    variational_with_prior = None
-    if variational is not None:
-      variational_with_prior = dict(zip(variational, [None] * len(variational)))
-    variational_with_prior = _find_variational_and_priors(
-        model, variational_with_prior, require_prior=False)
-    return _elbo(form, None, log_joint, variational_with_prior, keep_batch_dim)
-
-
-def _elbo(form, log_likelihood, log_joint, variational_with_prior,
-          keep_batch_dim):
-  """Internal implementation of ELBO. Users should use `elbo`.
-
-  Args:
-    form: ELBOForms constant. Controls how the ELBO is computed.
-    log_likelihood: `Tensor` log p(x|Z).
-    log_joint: `Tensor` log p(x, Z).
-    variational_with_prior: `dict<StochasticTensor, Distribution>`, varational
-      distributions to prior distributions.
-    keep_batch_dim: bool. Whether to keep the batch dimension when reducing
-      the entropy/KL.
-
-  Returns:
-    ELBO `Tensor` with same shape and dtype as `log_likelihood`/`log_joint`.
-  """
-  ELBOForms.check_form(form)
-
-  # Order of preference
-  # 1. Analytic KL: log_likelihood - KL(q||p)
-  # 2. Analytic entropy: log_likelihood + log p(Z) + H[q], or log_joint + H[q]
-  # 3. Sample: log_likelihood - (log q(Z) - log p(Z)) =
-  #            log_likelihood + log p(Z) - log q(Z), or log_joint - q(Z)
-
-  def _reduce(val):
-    if keep_batch_dim:
-      return val
-    else:
-      return math_ops.reduce_sum(val)
-
-  kl_terms = []
-  entropy_terms = []
-  prior_terms = []
-  for q, z, p in [(qz.distribution, qz.value(), pz)
-                  for qz, pz in variational_with_prior.items()]:
-    # Analytic KL
-    kl = None
-    if log_joint is None and form in {ELBOForms.default, ELBOForms.analytic_kl}:
-      try:
-        kl = kullback_leibler.kl_divergence(q, p)
-        logging.info("Using analytic KL between q:%s, p:%s", q, p)
-      except NotImplementedError as e:
-        if form == ELBOForms.analytic_kl:
-          raise e
-    if kl is not None:
-      kl_terms.append(-1. * _reduce(kl))
-      continue
-
-    # Analytic entropy
-    entropy = None
-    if form in {ELBOForms.default, ELBOForms.analytic_entropy}:
-      try:
-        entropy = q.entropy()
-        logging.info("Using analytic entropy for q:%s", q)
-      except NotImplementedError as e:
-        if form == ELBOForms.analytic_entropy:
-          raise e
-    if entropy is not None:
-      entropy_terms.append(_reduce(entropy))
-      if log_likelihood is not None:
-        prior = p.log_prob(z)
-        prior_terms.append(_reduce(prior))
-      continue
-
-    # Sample
-    if form in {ELBOForms.default, ELBOForms.sample}:
-      entropy = -q.log_prob(z)
-      entropy_terms.append(_reduce(entropy))
-      if log_likelihood is not None:
-        prior = p.log_prob(z)
-        prior_terms.append(_reduce(prior))
-
-  first_term = log_joint if log_joint is not None else log_likelihood
-  return sum([first_term] + kl_terms + entropy_terms + prior_terms)
-
-
-def _find_variational_and_priors(model,
-                                 variational_with_prior,
-                                 require_prior=True):
-  """Find upstream StochasticTensors and match with registered priors."""
-  if variational_with_prior is None:
-    # pylint: disable=protected-access
-    upstreams = sg._upstream_stochastic_nodes([model])
-    # pylint: enable=protected-access
-    upstreams = list(upstreams[model])
-    if not upstreams:
-      raise ValueError("No upstream stochastic nodes found for tensor: %s",
-                       model)
-    prior_map = dict(ops.get_collection(VI_PRIORS))
-    variational_with_prior = {}
-    for q in upstreams:
-      if require_prior and (q not in prior_map or prior_map[q] is None):
-        raise ValueError("No prior specified for StochasticTensor: %s", q)
-      variational_with_prior[q] = prior_map.get(q)
-
-  if not all(
-      [isinstance(q, st.StochasticTensor) for q in variational_with_prior]):
-    raise TypeError("variationals must be StochasticTensors")
-  if not all([
-      p is None or isinstance(p, distribution.Distribution)
-      for p in variational_with_prior.values()
-  ]):
-    raise TypeError("priors must be Distribution objects")
-
-  return variational_with_prior
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 66a04d42e93331de74b6f3d41f83f071115c1097..7072f56420ac9e576b20b62c0aa67498857403a7 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -359,8 +359,8 @@ tf_custom_op_library(
     ],
     deps = [
         "//tensorflow/contrib/boosted_trees/lib:example_partitioner",
-        "//tensorflow/contrib/boosted_trees/lib:feature-column-handlers",
         "//tensorflow/contrib/boosted_trees/lib:models",
+        "//tensorflow/contrib/boosted_trees/lib:node-stats",
         "//tensorflow/contrib/boosted_trees/lib:utils",
         "//tensorflow/contrib/boosted_trees/lib:weighted_quantiles",
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
@@ -404,10 +404,12 @@ tf_kernel_library(
     name = "split_handler_ops_kernels",
     srcs = ["kernels/split_handler_ops.cc"],
     deps = [
-        "//tensorflow/contrib/boosted_trees/lib:feature-column-handlers",
+        "//tensorflow/contrib/boosted_trees/lib:node-stats",
         "//tensorflow/contrib/boosted_trees/proto:split_info_proto_cc",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index ef8dee91b6cc05c4c3dd5eb3c81de4fb65b473e3..6ebc7d7911df878ec91701db8b75feb9a27d18a2 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -33,6 +33,8 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.saved_model import loader as saved_model_loader
 from tensorflow.python.saved_model import tag_constants
 
+_SPARSE_FLOAT_FEATURE_NAME_TEMPLATE = "%s_%d"
+
 
 def make_custom_export_strategy(name,
                                 convert_fn,
@@ -147,13 +149,12 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           inequality_test.threshold.float_value = split.threshold
         elif node_type == "sparse_float_binary_split_default_left":
           split = gtflow_node.sparse_float_binary_split_default_left.split
-          node.default_direction = (
-              generic_tree_model_pb2.BinaryNode.LEFT)
-          # TODO(nponomareva): adjust this id assignement when we allow multi-
-          # column sparse tensors.
+          node.default_direction = (generic_tree_model_pb2.BinaryNode.LEFT)
           feature_id = split.feature_column + num_dense
           inequality_test = node.inequality_left_child_test
-          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.feature_id.id.value = (
+              _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE %
+              (sorted_feature_names[feature_id], split.dimension_id))
           inequality_test.type = (
               generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
           inequality_test.threshold.float_value = split.threshold
@@ -165,7 +166,9 @@ def convert_to_universal_format(dtec, sorted_feature_names,
           # column sparse tensors.
           feature_id = split.feature_column + num_dense
           inequality_test = node.inequality_left_child_test
-          inequality_test.feature_id.id.value = sorted_feature_names[feature_id]
+          inequality_test.feature_id.id.value = (
+              _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE %
+              (sorted_feature_names[feature_id], split.dimension_id))
           inequality_test.type = (
               generic_tree_model_pb2.InequalityTest.LESS_OR_EQUAL)
           inequality_test.threshold.float_value = split.threshold
@@ -201,10 +204,14 @@ def _get_feature_importances(dtec, feature_names, num_dense_floats,
         split_column = feature_names[split.feature_column]
       elif node_type == "sparse_float_binary_split_default_left":
         split = tree_node.sparse_float_binary_split_default_left.split
-        split_column = feature_names[split.feature_column + num_dense_floats]
+        split_column = _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE % (
+            feature_names[split.feature_column + num_dense_floats],
+            split.dimension_id)
       elif node_type == "sparse_float_binary_split_default_right":
         split = tree_node.sparse_float_binary_split_default_right.split
-        split_column = feature_names[split.feature_column + num_dense_floats]
+        split_column = _SPARSE_FLOAT_FEATURE_NAME_TEMPLATE % (
+            feature_names[split.feature_column + num_dense_floats],
+            split.dimension_id)
       elif node_type == "categorical_id_binary_split":
         split = tree_node.categorical_id_binary_split
         split_column = feature_names[split.feature_column + num_dense_floats +
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
index 4ed18b2d34c5af47826ab1c058f5d13797593bd4..492d9ca40c5cfa84e186020605429aacc02af6a6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy_test.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the conversion code from GTFlow format to Chauffeur."""
+"""Tests for the conversion code and for feature importances export.
+
+Tests that cover conversion from TFBT format to a tensorflow.contrib.
+decision_tree generic_tree_model format and feature importances export.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -95,10 +99,31 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
           }
         }
       }
+      nodes {
+        sparse_float_binary_split_default_right {
+          split {
+            feature_column: 1
+            dimension_id:3
+            threshold: -0.4
+            left_id: 7
+            right_id: 8
+          }
+        }
+        node_metadata {
+            gain: 3600
+        }
+      }
+      nodes {
+        leaf {
+          vector {
+            value: 0.36
+          }
+        }
+      }
       nodes {
         leaf {
           vector {
-            value: 0.3
+            value: 18
           }
         }
       }
@@ -108,17 +133,25 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
     """
     dtec = tree_config_pb2.DecisionTreeEnsembleConfig()
     text_format.Merge(dtec_str, dtec)
-    feature_columns = ["feature_b", "feature_a", "feature_d"]
+    feature_columns = [
+        "feature_b",
+        "feature_a",
+        "feature_a_m",
+        "feature_d",
+    ]
     return dtec, feature_columns
 
   def testConvertModel(self):
     dtec, feature_columns = self._make_trees()
+    # Assume 2 sparse float columns, one with 1 dimension, the second one with
+    # 5 dimensions.
     # The feature columns in the order they were added.
     out = custom_export_strategy.convert_to_universal_format(
-        dtec, feature_columns, 1, 1,
-        1)
+        dtec, feature_columns, 1, 2, 1)
+    # Features a and a_m are sparse float features, a_m is multidimensional.
     expected_tree = """
     features { key: "feature_a" }
+    features { key: "feature_a_m" }
     features { key: "feature_b" }
     features { key: "feature_d" }
     model {
@@ -169,7 +202,6 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                   }
                 }
               }
-
               nodes {
                 node_id {
                   value: 1
@@ -196,7 +228,7 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                   inequality_left_child_test {
                     feature_id {
                       id {
-                        value: "feature_a"
+                        value: "feature_a_0"
                       }
                     }
                     threshold {
@@ -259,14 +291,51 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
                 node_id {
                   value: 6
                 }
+                binary_node {
+                  left_child_id {
+                    value: 7
+                  }
+                  right_child_id {
+                    value: 8
+                  }
+                  default_direction: RIGHT
+                  inequality_left_child_test {
+                      feature_id {
+                        id {
+                          value: "feature_a_m_3"
+                        }
+                      }
+                      threshold {
+                        float_value: -0.4
+                      }
+                  }
+                }
+              }
+              nodes {
+                node_id {
+                  value: 7
+                }
                 leaf {
                   vector {
                     value {
-                      float_value: 0.03
+                      float_value: 0.036
                     }
                   }
                 }
               }
+              nodes {
+                node_id {
+                  value: 8
+                }
+                leaf {
+                  vector {
+                    value {
+                      float_value: 1.8
+                    }
+                  }
+                }
+              }
+
             }
           }
           submodel_id {
@@ -280,12 +349,15 @@ class ConvertModelTest(test_util.TensorFlowTestCase):
   def testFeatureImportance(self):
     dtec, feature_columns = self._make_trees()
     feature_importances = custom_export_strategy._get_feature_importances(
-        dtec, feature_columns, 1, 1, 1)
-    self.assertItemsEqual(["feature_b", "feature_a", "feature_d"],
-                          feature_importances.keys())
+        dtec, feature_columns, 1, 2, 1)
+    self.assertItemsEqual(
+        ["feature_b", "feature_a_0", "feature_a_m_3", "feature_d"],
+        feature_importances.keys())
     self.assertAlmostEqual(50.0, feature_importances["feature_b"], places=4)
-    self.assertAlmostEqual(50.0, feature_importances["feature_a"], places=4)
+    self.assertAlmostEqual(50.0, feature_importances["feature_a_0"], places=4)
     self.assertAlmostEqual(50.0, feature_importances["feature_d"], places=4)
+    self.assertAlmostEqual(
+        360.0, feature_importances["feature_a_m_3"], places=4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/examples/boston.py b/tensorflow/contrib/boosted_trees/examples/boston.py
index 2c0a3c4912b82aba88e2f8f1b97a227c894ee2ae..e9dbdb0fd784052eeb36ac1aa9342165ef2ac0a7 100644
--- a/tensorflow/contrib/boosted_trees/examples/boston.py
+++ b/tensorflow/contrib/boosted_trees/examples/boston.py
@@ -22,7 +22,7 @@ r"""Demonstrates a regression on Boston housing data.
 
   python tensorflow/contrib/boosted_trees/examples/boston.py \
   --batch_size=404 --output_dir="/tmp/boston" --depth=4 --learning_rate=0.1 \
-  --num_eval_steps=1 --num_trees=500 --l2=4 \
+  --num_eval_steps=1 --num_trees=500 --l2=0.001 \
   --vmodule=training_ops=1
 
   When training is done, mean squared error on eval data is reported.
@@ -37,8 +37,10 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import os
 import sys
 import tensorflow as tf
+from tensorflow.contrib.boosted_trees.estimator_batch import custom_export_strategy
 from tensorflow.contrib.boosted_trees.estimator_batch.estimator import GradientBoostedDecisionTreeRegressor
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column
@@ -51,22 +53,18 @@ _BOSTON_NUM_FEATURES = 13
 def _get_tfbt(output_dir, feature_cols):
   """Configures TF Boosted Trees estimator based on flags."""
   learner_config = learner_pb2.LearnerConfig()
-
   learner_config.learning_rate_tuner.fixed.learning_rate = FLAGS.learning_rate
   learner_config.regularization.l1 = 0.0
-  # Set the regularization per instance in such a way that
-  # regularization for the full training data is equal to l2 flag.
-  learner_config.regularization.l2 = FLAGS.l2 / FLAGS.batch_size
+  learner_config.regularization.l2 = FLAGS.l2
   learner_config.constraints.max_tree_depth = FLAGS.depth
-  learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
 
   run_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=300)
 
   # Create a TF Boosted trees regression estimator.
   estimator = GradientBoostedDecisionTreeRegressor(
       learner_config=learner_config,
-      # For the WHOLE_TREE strategy, set the examples_per_layer to be equal to
-      # batch size.
+      # This should be the number of examples. For large datasets it can be
+      # larger than the batch_size.
       examples_per_layer=FLAGS.batch_size,
       feature_columns=feature_cols,
       label_dimension=1,
@@ -77,6 +75,14 @@ def _get_tfbt(output_dir, feature_cols):
   return estimator
 
 
+def _convert_fn(dtec, sorted_feature_names, num_dense, num_sparse_float,
+                num_sparse_int, export_dir, unused_eval_result):
+  universal_format = custom_export_strategy.convert_to_universal_format(
+      dtec, sorted_feature_names, num_dense, num_sparse_float, num_sparse_int)
+  with tf.gfile.GFile(os.path.join(export_dir, "tree_proto"), "w") as f:
+    f.write(str(universal_format))
+
+
 def _make_experiment_fn(output_dir):
   """Creates experiment for gradient boosted decision trees."""
   (x_train, y_train), (x_test,
@@ -88,21 +94,31 @@ def _make_experiment_fn(output_dir):
       batch_size=FLAGS.batch_size,
       num_epochs=None,
       shuffle=True)
-
   eval_input_fn = tf.estimator.inputs.numpy_input_fn(
       x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   feature_columns = [
       feature_column.real_valued_column("x", dimension=_BOSTON_NUM_FEATURES)
   ]
-
+  feature_spec = tf.contrib.layers.create_feature_spec_for_parsing(
+      feature_columns)
+  serving_input_fn = tf.contrib.learn.utils.build_parsing_serving_input_fn(
+      feature_spec)
+  # An export strategy that outputs the feature importance and also exports
+  # the internal tree representation in another format.
+  export_strategy = custom_export_strategy.make_custom_export_strategy(
+      "exports",
+      convert_fn=_convert_fn,
+      feature_columns=feature_columns,
+      export_input_fn=serving_input_fn)
   return tf.contrib.learn.Experiment(
       estimator=_get_tfbt(output_dir, feature_columns),
       train_input_fn=train_input_fn,
       eval_input_fn=eval_input_fn,
       train_steps=None,
       eval_steps=FLAGS.num_eval_steps,
-      eval_metrics=None)
+      eval_metrics=None,
+      export_strategies=[export_strategy])
 
 
 def main(unused_argv):
diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
index 766982b4f2023310e6046619939f83bef63b0302..f8086b0c2bb93eae6af0336bbe33fc23f8fcde22 100644
--- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
@@ -63,19 +63,26 @@ const char* kPredictionsTensorName = "predictions";
 void CalculateTreesToInclude(
     const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
     const std::vector<int32>& trees_to_drop, const int32 num_trees,
-    const bool only_finalized, std::vector<int32>* trees_to_include) {
+    const bool only_finalized, const bool center_bias,
+    std::vector<int32>* trees_to_include) {
   trees_to_include->reserve(num_trees - trees_to_drop.size());
 
   int32 index = 0;
   // This assumes that trees_to_drop is a sorted list of tree ids.
   for (int32 tree = 0; tree < num_trees; ++tree) {
-    if ((!trees_to_drop.empty() && index < trees_to_drop.size() &&
-         trees_to_drop[index] == tree) ||
-        (only_finalized && config.tree_metadata_size() > 0 &&
-         !config.tree_metadata(tree).is_finalized())) {
+    // Skip the tree if tree is in the list of trees_to_drop.
+    if (!trees_to_drop.empty() && index < trees_to_drop.size() &&
+        trees_to_drop[index] == tree) {
       ++index;
       continue;
     }
+    // Or skip if the tree is not finalized and only_finalized is set,
+    // with the exception of centering bias.
+    if (only_finalized && !(center_bias && tree == 0) &&
+        config.tree_metadata_size() > 0 &&
+        !config.tree_metadata(tree).is_finalized()) {
+      continue;
+    }
     trees_to_include->push_back(tree);
   }
 }
@@ -250,7 +257,7 @@ class GradientTreesPredictionOp : public OpKernel {
     CalculateTreesToInclude(
         ensemble_resource->decision_tree_ensemble(), dropped_trees,
         ensemble_resource->decision_tree_ensemble().trees_size(),
-        only_finalized_trees_, &trees_to_include);
+        only_finalized_trees_, center_bias_, &trees_to_include);
 
     // Allocate output predictions matrix.
     Tensor* output_predictions_t = nullptr;
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index b08028eb635385357ba13b48d88157936978b6f1..8600c8c53caa5fd4274ba6730fc764d8315d680c 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -50,6 +50,7 @@ const char* const kAreBucketsReadyName = "are_buckets_ready";
 const char* const kNumSparseFeaturesName = "num_sparse_features";
 const char* const kSparseBucketsName = "sparse_buckets";
 const char* const kSparseValuesName = "sparse_values";
+const char* const kSparseIndicesName = "sparse_indices";
 const char* const kSparseStreamsStateName = "sparse_streams_state";
 const char* const kSparseSummariesName = "sparse_summaries";
 const char* const kSparseConfigName = "sparse_config";
@@ -85,9 +86,23 @@ std::vector<float> GetBuckets(const int32 feature,
   return buckets_vector;
 }
 
-void QuantizeFeatures(const string& output_name, const OpInputList& values_list,
-                      const OpInputList& buckets_list,
-                      OpKernelContext* const context) {
+int32 GetFeatureDimension(const int32 feature_index, const int64 instance,
+                          const OpInputList* const indices_list) {
+  if (indices_list != nullptr) {
+    // Sparse multidimensional.
+    return (*indices_list)[feature_index].matrix<int64>()(instance, 1);
+  }
+  // No indices, assume one-dimensional tensor.
+  return 0;
+}
+
+// Allows quantization for each of multiple dimensions of a sparse feature.
+void QuantizeFeatures(
+    const string& output_name, const OpInputList& values_list,
+    const OpInputList& buckets_list,
+    const OpInputList* const
+        indices_list /** Optional, provide for sparse features **/,
+    OpKernelContext* const context) {
   if (values_list.size() == 0) {
     return;
   }
@@ -100,10 +115,13 @@ void QuantizeFeatures(const string& output_name, const OpInputList& values_list,
     const int64 num_values = values_tensor.dim_size(0);
 
     Tensor* output_t = nullptr;
+    // Output will have bucket id and dimension of the features for that bucket.
     OP_REQUIRES_OK(
-        context, output_list.allocate(feature_index, TensorShape({num_values}),
-                                      &output_t));
-    TTypes<int32>::Vec output = output_t->vec<int32>();
+        context, output_list.allocate(feature_index,
+                                      TensorShape({num_values, 2}), &output_t));
+
+    auto output = output_t->matrix<int32>();
+
     const std::vector<float>& buckets_vector =
         GetBuckets(feature_index, buckets_list);
     auto flat_values = values_tensor.flat<float>();
@@ -116,7 +134,11 @@ void QuantizeFeatures(const string& output_name, const OpInputList& values_list,
       }
       const int32 bucket =
           static_cast<int32>(bucket_iter - buckets_vector.begin());
-      output(instance) = bucket;
+      // Bucket id.
+      output(instance, 0) = bucket;
+      // Dimension.
+      output(instance, 1) =
+          GetFeatureDimension(feature_index, instance, indices_list);
     }
   }
 }
@@ -851,6 +873,11 @@ class QuantilesOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->input_list(kSparseValuesName,
                                        &sparse_float_feature_values_list));
+
+    OpInputList sparse_float_indices_list;
+    OP_REQUIRES_OK(context, context->input_list(kSparseIndicesName,
+                                                &sparse_float_indices_list));
+
     OpInputList sparse_buckets_list;
     OP_REQUIRES_OK(
         context, context->input_list(kSparseBucketsName, &sparse_buckets_list));
@@ -865,10 +892,10 @@ class QuantilesOp : public OpKernel {
 
     // Quantize the feature values
     QuantizeFeatures(kDenseOutputTensorName, dense_float_features_list,
-                     dense_buckets_list, context);
+                     dense_buckets_list, nullptr, context);
 
     QuantizeFeatures(kSparseOutputTensorName, sparse_float_feature_values_list,
-                     sparse_buckets_list, context);
+                     sparse_buckets_list, &sparse_float_indices_list, context);
   }
 };
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 29635bb3c404e54f0561d9b9189270022f063cbe..18b4abd654ea3541d646a43ac901aca1a678446f 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -16,7 +16,7 @@
 #include <string>
 #include <vector>
 
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/split_info.pb.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -39,6 +39,10 @@ using boosted_trees::learner::stochastic::GradientStats;
 using boosted_trees::learner::stochastic::NodeStats;
 using boosted_trees::learner::LearnerConfig_MultiClassStrategy;
 
+namespace {
+const int32 DUMMY_FEATURE_DIMENSION = -1;
+}  // namespace
+
 class BaseBuildSplitOp : public OpKernel {
  public:
   explicit BaseBuildSplitOp(OpKernelConstruction* const context)
@@ -128,7 +132,7 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp {
 
     const Tensor* bucket_ids_t;
     OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
-    const auto& bucket_ids = bucket_ids_t->vec<int64>();
+    const auto& bucket_ids = bucket_ids_t->matrix<int64>();
 
     const Tensor* gradients_t;
     OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
@@ -219,7 +223,7 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp {
           split_info.mutable_split_node()->mutable_dense_float_binary_split();
       dense_split->set_feature_column(feature_column_group_id_);
       dense_split->set_threshold(
-          bucket_boundaries(bucket_ids(best_bucket_idx)));
+          bucket_boundaries(bucket_ids(best_bucket_idx, 0)));
 
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
@@ -262,7 +266,9 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
 
     const Tensor* bucket_ids_t;
     OP_REQUIRES_OK(context, context->input("bucket_ids", &bucket_ids_t));
-    const auto& bucket_ids = bucket_ids_t->vec<int64>();
+    const auto& bucket_ids_and_dimensions = bucket_ids_t->matrix<int64>();
+
+    const int32 tensor_elements = partition_ids.size();
 
     const Tensor* gradients_t;
     OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
@@ -273,24 +279,59 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
     int class_id;
     ReadClassId(context, &class_id);
 
-    // Find the number of unique partitions before we allocate the output.
-    std::vector<int32> partition_boundaries;
+    // For each partition (tree node), store starting index for each dimension.
+    PartitionAndDimensionBoundaries partition_boundaries;
+    // Stores indices in partition_boundaries for those partitions that are
+    // not empty (have at least one dimension and a bucket apart from catch-all
+    // bucket of -1 bucket id and dimension 0.
     std::vector<int32> non_empty_partitions;
-    for (int i = 0; i < partition_ids.size() - 1; ++i) {
+    bool non_empty_partition = false;
+
+    for (int i = 0; i < partition_ids.size(); ++i) {
       // Make sure the input is sorted by partition_ids;
-      CHECK_LE(partition_ids(i), partition_ids(i + 1));
-      if (i == 0 || partition_ids(i) != partition_ids(i - 1)) {
-        partition_boundaries.push_back(i);
-        // Some partitions might only have bias feature. We don't want to split
-        // those so check that the partition has at least 2 buckets.
-        if (partition_ids(i) == partition_ids(i + 1)) {
-          non_empty_partitions.push_back(partition_boundaries.size() - 1);
+      if (i > 0) {
+        CHECK_LE(partition_ids(i - 1), partition_ids(i))
+            << "Partition ids should be sorted. Not sorted for " << i;
+      }
+      const int32 dimension = bucket_ids_and_dimensions(i, 1);
+
+      if (i == 0 || (partition_ids(i) != partition_ids(i - 1))) {
+        if (i != 0) {
+          // Not the first entry, so partition has changed.
+          if (non_empty_partition) {
+            // Saves the id of a previous partition in a list of non empty
+            // partitions, since it was non empty (had more than just a bias
+            // bucket -1.
+            non_empty_partitions.push_back(partition_boundaries.size() - 1);
+          }
+          // Add dummy dimension to signify the end for the previous dimension.
+          partition_boundaries.back().emplace_back(DUMMY_FEATURE_DIMENSION, i);
         }
+        // Allocate for a new partition.
+        partition_boundaries.emplace_back();
+        // Save info about the first dimension for a new partition.
+        partition_boundaries.back().emplace_back(dimension, i);
+
+        // Each partition has dummy -1 bucket with all gradients and then info
+        // for all other dimensions -> if we have >1 elements for a partition,
+        // then it is not empty.
+        non_empty_partition = (i < partition_ids.size() - 1) &&
+                              (partition_ids(i) == partition_ids(i + 1));
+      } else if (bucket_ids_and_dimensions(i, 1) !=
+                 bucket_ids_and_dimensions(i - 1, 1)) {
+        // Dimension changed.
+        partition_boundaries.back().emplace_back(dimension, i);
       }
     }
-    if (partition_ids.size() > 0) {
-      partition_boundaries.push_back(partition_ids.size());
+    if (tensor_elements > 0) {
+      if (non_empty_partition) {
+        non_empty_partitions.push_back(partition_boundaries.size() - 1);
+      }
+      // Add dummy dimension to signify the end for the previous dimension.
+      partition_boundaries.back().emplace_back(DUMMY_FEATURE_DIMENSION,
+                                               partition_ids.size());
     }
+
     int num_elements = non_empty_partitions.size();
     Tensor* output_partition_ids_t = nullptr;
     OP_REQUIRES_OK(context,
@@ -314,73 +355,128 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
                                 &output_splits_t));
     tensorflow::TTypes<string>::Vec output_splits =
         output_splits_t->vec<string>();
+    // For each tree node that needs to be split.
     for (int root_idx = 0; root_idx < num_elements; ++root_idx) {
+      const auto& dimension_boundaries =
+          partition_boundaries[non_empty_partitions[root_idx]];
+
       float best_gain = std::numeric_limits<float>::lowest();
-      int start_index = partition_boundaries[non_empty_partitions[root_idx]];
-      int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1];
-      // First bucket ID in each partition should be the bias feature.
-      OP_REQUIRES(context, bucket_ids(start_index) == bias_feature_id_,
-                  errors::InvalidArgument("Bias feature ID missing."));
+      int32 best_dimension_idx = 0;
+      bool default_right = false;
+      int32 best_element_idx = 0;
+
+      NodeStats best_right_node_stats(0);
+      NodeStats best_left_node_stats(0);
+
+      // For each partition, the first bucket is dummy catch all.
+      int32 bias_start_index = dimension_boundaries[0].start_index;
+
+      OP_REQUIRES(
+          context,
+          bucket_ids_and_dimensions(bias_start_index, 0) == bias_feature_id_,
+          errors::InvalidArgument("Bias feature ID missing."));
+
+      // Dimension for bias feature is always 0
+      OP_REQUIRES(
+          context, bucket_ids_and_dimensions(bias_start_index, 1) == 0,
+          errors::InvalidArgument("Bias feature ID must be with dimension 0."));
+
       // For each root, we do two passes over the quantized feature buckets
       // accumulating gradients on one side and using the root aggregate
       // gradients to get the gradients for the other side.
       // Split gains are evaluated for each pass at every threshold and the best
       // split is picked.
-      GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index);
+      GradientStats root_gradient_stats(*gradients_t, *hessians_t,
+                                        bias_start_index);
       root_gradient_stats *= normalizer_ratio;
       NodeStats root_stats = ComputeNodeStats(root_gradient_stats);
-      GradientStats present_gradient_stats;
-      for (int64 bucket_idx = start_index + 1; bucket_idx < end_index;
-           ++bucket_idx) {
-        present_gradient_stats +=
-            GradientStats(*gradients_t, *hessians_t, bucket_idx);
-      }
-      present_gradient_stats *= normalizer_ratio;
-      int32 best_bucket_idx = 0;
-      NodeStats best_right_node_stats(0);
-      NodeStats best_left_node_stats(0);
-      GradientStats left_gradient_stats;
-      bool default_right = false;
-      for (int64 bucket_idx = start_index + 1; bucket_idx < end_index;
-           ++bucket_idx) {
-        GradientStats g(*gradients_t, *hessians_t, bucket_idx);
-        g *= normalizer_ratio;
-        left_gradient_stats += g;
-        // We have the sum of all present gradients. Use that to compute the
-        // backward pass gradients.
-        GradientStats right_gradient_stats =
-            present_gradient_stats - left_gradient_stats;
-        {
-          NodeStats left_stats_default_left =
-              ComputeNodeStats(root_gradient_stats - right_gradient_stats);
-          NodeStats right_stats_default_left =
-              ComputeNodeStats(right_gradient_stats);
-          if (left_stats_default_left.gain + right_stats_default_left.gain >
-              best_gain) {
-            best_gain =
-                left_stats_default_left.gain + right_stats_default_left.gain;
-            best_left_node_stats = left_stats_default_left;
-            best_right_node_stats = right_stats_default_left;
-            best_bucket_idx = bucket_idx;
-            default_right = false;
-          }
+
+      // Iterate through dimensions.
+      for (int j = 0; j < dimension_boundaries.size() - 1; ++j) {
+        const DimensionBoundary& dimension_and_start = dimension_boundaries[j];
+        const int32 dimension_id = dimension_and_start.dimension_id;
+
+        int start_index = dimension_and_start.start_index;
+        // Even for the last dimension, we always have additional dummy
+        // dimension that we can use to find the end index.
+        const int end_index =
+            partition_boundaries[non_empty_partitions[root_idx]][j + 1]
+                .start_index;
+        CHECK(bucket_ids_and_dimensions(start_index, 1) ==
+              bucket_ids_and_dimensions(end_index - 1, 1))
+            << "For bucket " << bucket_ids_and_dimensions(start_index, 0)
+            << " the dimension was "
+            << bucket_ids_and_dimensions(start_index, 1) << " and for "
+            << bucket_ids_and_dimensions(end_index - 1, 0) << " "
+            << bucket_ids_and_dimensions(end_index - 1, 1);
+        if (bucket_ids_and_dimensions(start_index, 0) == bias_feature_id_) {
+          // 0-dimension case which has a first bucket for catch all feature.
+          CHECK(bucket_ids_and_dimensions(start_index, 1) == 0)
+              << "Dimension of bias feature should be 0";
+          ++start_index;
         }
-        {
-          NodeStats left_stats_default_right =
-              ComputeNodeStats(left_gradient_stats);
-          NodeStats right_stats_default_right =
-              ComputeNodeStats(root_gradient_stats - left_gradient_stats);
-          if (left_stats_default_right.gain + right_stats_default_right.gain >
-              best_gain) {
-            best_gain =
-                left_stats_default_right.gain + right_stats_default_right.gain;
-            best_left_node_stats = left_stats_default_right;
-            best_right_node_stats = right_stats_default_right;
-            best_bucket_idx = bucket_idx;
-            default_right = true;
+
+        GradientStats present_gradient_stats;
+        for (int64 bucket_idx = start_index; bucket_idx < end_index;
+             ++bucket_idx) {
+          present_gradient_stats +=
+              GradientStats(*gradients_t, *hessians_t, bucket_idx);
+        }
+        present_gradient_stats *= normalizer_ratio;
+
+        GradientStats left_gradient_stats;
+        for (int64 element_idx = start_index; element_idx < end_index;
+             ++element_idx) {
+          // Check that bucket ids are sorted.
+          if (element_idx != start_index) {
+            CHECK(bucket_ids_and_dimensions(element_idx - 1, 0) <
+                  bucket_ids_and_dimensions(element_idx, 0))
+                << "Bucket ids must be sorted."
+                << ", problem on " << element_idx << " and dimension is " << j;
+          }
+
+          GradientStats g(*gradients_t, *hessians_t, element_idx);
+          g *= normalizer_ratio;
+          left_gradient_stats += g;
+          // We have the sum of all present gradients. Use that to compute the
+          // backward pass gradients.
+          GradientStats right_gradient_stats =
+              present_gradient_stats - left_gradient_stats;
+          {
+            NodeStats left_stats_default_left =
+                ComputeNodeStats(root_gradient_stats - right_gradient_stats);
+            NodeStats right_stats_default_left =
+                ComputeNodeStats(right_gradient_stats);
+            if (left_stats_default_left.gain + right_stats_default_left.gain >
+                best_gain) {
+              best_gain =
+                  left_stats_default_left.gain + right_stats_default_left.gain;
+              best_left_node_stats = left_stats_default_left;
+              best_right_node_stats = right_stats_default_left;
+              best_element_idx = element_idx;
+              default_right = false;
+              best_dimension_idx = dimension_id;
+            }
+          }
+          {
+            NodeStats left_stats_default_right =
+                ComputeNodeStats(left_gradient_stats);
+            NodeStats right_stats_default_right =
+                ComputeNodeStats(root_gradient_stats - left_gradient_stats);
+            if (left_stats_default_right.gain + right_stats_default_right.gain >
+                best_gain) {
+              best_gain = left_stats_default_right.gain +
+                          right_stats_default_right.gain;
+              best_left_node_stats = left_stats_default_right;
+              best_right_node_stats = right_stats_default_right;
+              best_element_idx = element_idx;
+              default_right = true;
+              best_dimension_idx = dimension_id;
+            }
           }
         }
       }
+
       SplitInfo split_info;
       boosted_trees::trees::DenseFloatBinarySplit* dense_split = nullptr;
       if (default_right) {
@@ -393,8 +489,13 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
                           ->mutable_split();
       }
       dense_split->set_feature_column(feature_column_group_id_);
-      dense_split->set_threshold(
-          bucket_boundaries(bucket_ids(best_bucket_idx)));
+      // Set the feature index for the best feature column.
+      const int64 best_dimension_id =
+          bucket_ids_and_dimensions(best_element_idx, 1);
+      const int32 best_bucket_id =
+          bucket_ids_and_dimensions(best_element_idx, 0);
+      dense_split->set_dimension_id(best_dimension_id);
+      dense_split->set_threshold(bucket_boundaries(best_bucket_id));
 
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
@@ -403,11 +504,23 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp {
       split_info.SerializeToString(&output_splits(root_idx));
       gains(root_idx) =
           best_gain - root_stats.gain - tree_complexity_regularization_;
-      output_partition_ids(root_idx) = partition_ids(start_index);
+      output_partition_ids(root_idx) = partition_ids(bias_start_index);
     }
   }
 
  private:
+  struct DimensionBoundary {
+    DimensionBoundary(const int32 dimension_id, const int32 start_index)
+        : dimension_id(dimension_id), start_index(start_index) {}
+
+    int32 dimension_id;
+    int32 start_index;
+  };
+
+  // For each partition, store start indices of feature column dimensions.
+  typedef std::vector<std::vector<DimensionBoundary>>
+      PartitionAndDimensionBoundaries;
+
   int64 bias_feature_id_;
 };
 REGISTER_KERNEL_BUILDER(Name("BuildSparseInequalitySplits").Device(DEVICE_CPU),
@@ -434,7 +547,7 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp {
 
     const Tensor* feature_ids_t;
     OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
-    const auto& feature_ids = feature_ids_t->vec<int64>();
+    const auto& feature_ids = feature_ids_t->matrix<int64>();
 
     const Tensor* gradients_t;
     OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
@@ -491,7 +604,7 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp {
       int start_index = partition_boundaries[non_empty_partitions[root_idx]];
       int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1];
       // First feature ID in each partition should be the bias feature.
-      OP_REQUIRES(context, feature_ids(start_index) == bias_feature_id_,
+      OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id_,
                   errors::InvalidArgument("Bias feature ID missing."));
       GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index);
       root_gradient_stats *= normalizer_ratio;
@@ -519,7 +632,7 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp {
       auto* equality_split = split_info.mutable_split_node()
                                  ->mutable_categorical_id_binary_split();
       equality_split->set_feature_column(feature_column_group_id_);
-      equality_split->set_feature_id(feature_ids(best_feature_idx));
+      equality_split->set_feature_id(feature_ids(best_feature_idx, 0));
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
       FillLeaf(class_id, best_left_node_stats, left_child);
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index cff75e71d93cb703d87bb09a4b32439e01d70f76..a9a229c8ae0c26bba5f0a684dad7e546298577bb 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -39,13 +39,14 @@ const char* const kStampTokenName = "stamp_token";
 const char* const kNextStampTokenName = "next_stamp_token";
 
 struct PartitionKey {
-  PartitionKey() : partition_id(-1), feature_id(-1) {}
+  PartitionKey() : partition_id(-1), feature_id(-1), dimension(-1) {}
 
-  PartitionKey(int32 p, int64 f) : partition_id(p), feature_id(f) {}
+  PartitionKey(int32 p, int64 f, int32 d)
+      : partition_id(p), feature_id(f), dimension(d) {}
 
   bool operator==(const PartitionKey& other) const {
-    return (feature_id == other.feature_id) &&
-           (partition_id == other.partition_id);
+    return (partition_id == other.partition_id) &&
+           (dimension == other.dimension) && (feature_id == other.feature_id);
   }
 
   // Compare for PartitionKey.
@@ -54,7 +55,11 @@ struct PartitionKey {
       if (a.partition_id < b.partition_id) {
         return true;
       }
-      if ((a.partition_id == b.partition_id) && (a.feature_id < b.feature_id)) {
+      if ((a.partition_id == b.partition_id) && (a.dimension < b.dimension)) {
+        return true;
+      }
+      if ((a.partition_id == b.partition_id) && (a.dimension == b.dimension) &&
+          (a.feature_id < b.feature_id)) {
         return true;
       }
       return false;
@@ -64,8 +69,11 @@ struct PartitionKey {
   // Tree partition defined by traversing the tree to the leaf.
   int32 partition_id;
 
-  // Feature Id within the feature column.
+  // Feature column id.
   int64 feature_id;
+
+  // Dimension within feature column.
+  int32 dimension;
 };
 
 template <typename GradientType, typename HessianType>
@@ -132,12 +140,12 @@ void SerializeScalarAccumulatorToOutput(
                                &partition_ids_t));
   auto partition_ids = partition_ids_t->vec<int32>();
 
+  // Feature ids tensor has ids of feature columns and their dimensions.
   Tensor* feature_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_feature_ids", TensorShape({num_slots}),
-                               &feature_ids_t));
-  auto feature_ids = feature_ids_t->vec<int64>();
+  OP_REQUIRES_OK(context, context->allocate_output("output_feature_ids",
+                                                   TensorShape({num_slots, 2}),
+                                                   &feature_ids_t));
+  auto feature_ids = feature_ids_t->matrix<int64>();
 
   Tensor* gradients_t = nullptr;
   OP_REQUIRES_OK(
@@ -155,7 +163,9 @@ void SerializeScalarAccumulatorToOutput(
   int i = 0;
   for (const auto& iter : accumulator_resource.values()) {
     partition_ids(i) = iter.first.partition_id;
-    feature_ids(i) = iter.first.feature_id;
+    feature_ids(i, 0) = iter.first.feature_id;
+    feature_ids(i, 1) = iter.first.dimension;
+
     gradients(i) = iter.second.first;
     hessians(i) = iter.second.second;
     ++i;
@@ -174,11 +184,10 @@ void SerializeTensorAccumulatorToOutput(
   auto partition_ids = partition_ids_t->vec<int32>();
 
   Tensor* feature_ids_t = nullptr;
-  OP_REQUIRES_OK(
-      context,
-      context->allocate_output("output_feature_ids", TensorShape({num_slots}),
-                               &feature_ids_t));
-  auto feature_ids = feature_ids_t->vec<int64>();
+  OP_REQUIRES_OK(context, context->allocate_output("output_feature_ids",
+                                                   TensorShape({num_slots, 2}),
+                                                   &feature_ids_t));
+  auto feature_ids = feature_ids_t->matrix<int64>();
 
   TensorShape gradient_shape = accumulator_resource.gradient_shape();
   int64 num_gradient_elements = gradient_shape.num_elements();
@@ -201,7 +210,9 @@ void SerializeTensorAccumulatorToOutput(
   int i = 0;
   for (const auto& iter : accumulator_resource.values()) {
     partition_ids(i) = iter.first.partition_id;
-    feature_ids(i) = iter.first.feature_id;
+    feature_ids(i, 0) = iter.first.feature_id;
+    feature_ids(i, 1) = iter.first.dimension;
+
     for (int j = 0; j < num_gradient_elements; ++j) {
       gradients(i, j) = iter.second.first[j];
     }
@@ -220,14 +231,16 @@ void AddToScalarAccumulator(
                                         1);
   const TensorShape& partition_ids_shape = partition_ids_t.shape();
   const auto& partition_ids = partition_ids_t.vec<int32>();
-  const auto& feature_ids = feature_ids_t.vec<int64>();
+  const auto& feature_ids_and_dimensions = feature_ids_t.matrix<int64>();
   const auto& gradients = gradients_t.vec<float>();
   const auto& hessians = hessians_t.vec<float>();
 
   int64 num_updates = partition_ids_shape.dim_size(0);
   auto stats_map = accumulator_resource->mutable_values();
   for (int64 i = 0; i < num_updates; ++i) {
-    const auto key = PartitionKey(partition_ids(i), feature_ids(i));
+    const auto key =
+        PartitionKey(partition_ids(i), feature_ids_and_dimensions(i, 0),
+                     feature_ids_and_dimensions(i, 1));
     auto itr = stats_map->find(key);
     if (itr != stats_map->end()) {
       itr->second.first += gradients(i);
@@ -263,7 +276,7 @@ void AddToTensorAccumulator(
 
   const TensorShape& partition_ids_shape = partition_ids_t.shape();
   const auto& partition_ids = partition_ids_t.vec<int32>();
-  const auto& feature_ids = feature_ids_t.vec<int64>();
+  const auto& feature_ids_and_dimensions = feature_ids_t.matrix<int64>();
   TensorShape gradients_shape = gradients_t.shape();
   const auto& gradients = gradients_t.flat_outer_dims<float>();
   TensorShape hessians_shape = hessians_t.shape();
@@ -288,7 +301,9 @@ void AddToTensorAccumulator(
   int64 num_updates = partition_ids_shape.dim_size(0);
   auto stats_map = accumulator_resource->mutable_values();
   for (int64 i = 0; i < num_updates; ++i) {
-    const auto key = PartitionKey(partition_ids(i), feature_ids(i));
+    const auto key =
+        PartitionKey(partition_ids(i), feature_ids_and_dimensions(i, 0),
+                     feature_ids_and_dimensions(i, 1));
     auto itr = stats_map->find(key);
     if (itr == stats_map->end()) {
       std::vector<float> new_gradients(gradients_shape.num_elements());
diff --git a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
index 4c56718f1bbc0b42c1f5454ddfafe6ccd8c35c2c..c77d90e243c304ec8e9a10a0b63401f9bd825c3e 100644
--- a/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/training_ops.cc
@@ -208,27 +208,19 @@ class CenterTreeEnsembleBiasOp : public OpKernel {
     int64 next_stamp_token = next_stamp_token_t->scalar<int64>()();
     CHECK(stamp_token != next_stamp_token);
 
+    // Update the ensemble stamp.
+    ensemble_resource->set_stamp(next_stamp_token);
+
     // Get the delta updates.
     const Tensor* delta_updates_t;
     OP_REQUIRES_OK(context, context->input("delta_updates", &delta_updates_t));
-    OP_REQUIRES(
-        context,
-        delta_updates_t->dim_size(0) + 1 == learner_config_.num_classes(),
-        errors::InvalidArgument(
-            "Delta updates size must be consistent with label dimensions."));
     auto delta_updates = delta_updates_t->vec<float>();
-
-    // Update the ensemble stamp.
-    ensemble_resource->set_stamp(next_stamp_token);
+    const int64 logits_dimension = delta_updates_t->dim_size(0);
 
     // Get the bias.
-    boosted_trees::trees::Leaf* const bias = RetrieveBias(ensemble_resource);
+    boosted_trees::trees::Leaf* const bias =
+        RetrieveBias(ensemble_resource, logits_dimension);
     CHECK(bias->has_vector());
-    OP_REQUIRES(
-        context,
-        bias->vector().value_size() + 1 == learner_config_.num_classes(),
-        errors::InvalidArgument(
-            "Bias vector size must be consistent with label dimensions."));
 
     // Update the bias.
     float total_delta = 0;
@@ -245,6 +237,7 @@ class CenterTreeEnsembleBiasOp : public OpKernel {
       VLOG(1) << "Continuing to center bias, delta=" << total_delta;
     } else {
       VLOG(1) << "Done centering bias, delta=" << total_delta;
+      ensemble_resource->LastTreeMetadata()->set_is_finalized(true);
     }
     Tensor* continue_centering_t = nullptr;
     OP_REQUIRES_OK(
@@ -256,7 +249,8 @@ class CenterTreeEnsembleBiasOp : public OpKernel {
  private:
   // Helper method to retrieve the bias from the tree ensemble.
   boosted_trees::trees::Leaf* RetrieveBias(
-      boosted_trees::models::DecisionTreeEnsembleResource* ensemble_resource) {
+      boosted_trees::models::DecisionTreeEnsembleResource* ensemble_resource,
+      int64 logits_dimension) {
     const int32 num_trees = ensemble_resource->num_trees();
     if (num_trees <= 0) {
       // Add a new bias leaf.
@@ -264,10 +258,9 @@ class CenterTreeEnsembleBiasOp : public OpKernel {
       boosted_trees::trees::DecisionTreeConfig* const tree_config =
           ensemble_resource->AddNewTree(1.0);
       auto* const leaf = tree_config->add_nodes()->mutable_leaf();
-      for (size_t idx = 0; idx + 1 < learner_config_.num_classes(); ++idx) {
+      for (size_t idx = 0; idx < logits_dimension; ++idx) {
         leaf->mutable_vector()->add_value(0.0);
       }
-      ensemble_resource->LastTreeMetadata()->set_is_finalized(true);
       return leaf;
     } else if (num_trees == 1) {
       // Confirms that the only tree is a bias and returns its leaf.
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 107ff0d295bee530c1711a97849fbd3c6cdb2f00..131bd48562a55a08981ac73277e93024db0d85d3 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -406,51 +406,9 @@ tf_cc_test(
 )
 
 # Learner/stochastic
-
-cc_library(
-    name = "feature-column-handlers",
-    srcs = [
-        "learner/stochastic/handlers/bias-feature-column-handler.cc",
-        "learner/stochastic/handlers/categorical-feature-column-handler.cc",
-        "learner/stochastic/handlers/dense-quantized-feature-column-handler.cc",
-        "learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc",
-    ],
-    hdrs = [
-        "learner/stochastic/handlers/bias-feature-column-handler.h",
-        "learner/stochastic/handlers/categorical-feature-column-handler.h",
-        "learner/stochastic/handlers/dense-quantized-feature-column-handler.h",
-        "learner/stochastic/handlers/feature-column-handler.h",
-        "learner/stochastic/handlers/sparse-quantized-feature-column-handler.h",
-    ],
-    deps = [
-        ":feature-split-candidate",
-        ":feature-stats-accumulator",
-        "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_cc_test(
-    name = "feature-column-handlers_test",
-    size = "small",
-    srcs = [
-        "learner/stochastic/handlers/bias-feature-column-handler_test.cc",
-        "learner/stochastic/handlers/categorical-feature-column-handler_test.cc",
-        "learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc",
-        "learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc",
-    ],
-    deps = [
-        ":feature-column-handlers",
-        "//tensorflow/core:tensor_testutil",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
 cc_library(
     name = "gradient-stats",
-    hdrs = ["learner/stochastic/stats/gradient-stats.h"],
+    hdrs = ["learner/common/stats/gradient-stats.h"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//third_party/eigen3",
@@ -459,7 +417,7 @@ cc_library(
 
 cc_library(
     name = "node-stats",
-    hdrs = ["learner/stochastic/stats/node-stats.h"],
+    hdrs = ["learner/common/stats/node-stats.h"],
     deps = [
         ":gradient-stats",
         "//tensorflow/contrib/boosted_trees/proto:learner_proto_cc",
@@ -471,7 +429,7 @@ cc_library(
 
 cc_library(
     name = "split-stats",
-    hdrs = ["learner/stochastic/stats/split-stats.h"],
+    hdrs = ["learner/common/stats/split-stats.h"],
     deps = [
         ":node-stats",
     ],
@@ -479,7 +437,7 @@ cc_library(
 
 cc_library(
     name = "feature-split-candidate",
-    hdrs = ["learner/stochastic/stats/feature-split-candidate.h"],
+    hdrs = ["learner/common/stats/feature-split-candidate.h"],
     deps = [
         ":split-stats",
         "//tensorflow/contrib/boosted_trees/proto:tree_config_proto_cc",
@@ -489,7 +447,7 @@ cc_library(
 tf_cc_test(
     name = "node-stats_test",
     size = "small",
-    srcs = ["learner/stochastic/stats/node-stats_test.cc"],
+    srcs = ["learner/common/stats/node-stats_test.cc"],
     deps = [
         ":node-stats",
         "//tensorflow/core:tensor_testutil",
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
index 83dad7e4b3301327bcbae5203e9d9330c9e0084d..9f78ab20242800fd8af7ad049d5970fbe26ec0ea 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
@@ -110,8 +110,8 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler):
 
     def not_active_inputs():
       return (constant_op.constant([], dtype=dtypes.int32),
-              constant_op.constant([], dtype=dtypes.int64), empty_gradients,
-              empty_hessians)
+              constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]),
+              empty_gradients, empty_hessians)
 
     def active_inputs():
       """The normal flow when the handler is active."""
@@ -154,7 +154,12 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler):
           [per_partition_hessians, filtered_hessians], 0)
       feature_ids = array_ops.concat(
           [bias_feature_ids, self._sparse_int_column.values], 0)
-      return partition_ids, feature_ids, filtered_gradients, filtered_hessians
+      # Dimension is always zero for sparse int features.
+      dimension_ids = array_ops.zeros_like(feature_ids, dtype=dtypes.int64)
+      feature_ids_and_dimensions = array_ops.stack(
+          [feature_ids, dimension_ids], axis=1)
+      return (partition_ids, feature_ids_and_dimensions, filtered_gradients,
+              filtered_hessians)
 
     partition_ids, feature_ids, gradients_out, hessians_out = (
         control_flow_ops.cond(is_active[0], active_inputs, not_active_inputs))
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 8c0a3f0d91e0fbd6b6ca02352c8b80b8485d029d..72e20aaa127cda592bd314786cddb925cc87a075 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -257,6 +257,7 @@ class DenseSplitHandler(InequalitySplitHandler):
         # Put quantile and stats accumulator flushing in the dependency path.
         are_splits_ready = control_flow_ops.with_dependencies(
             [flush_quantiles, partition_ids], are_splits_ready)
+
         partition_ids, gains, split_infos = (
             split_handler_ops.build_dense_inequality_splits(
                 num_minibatches=num_minibatches,
@@ -433,14 +434,15 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
   def ready_inputs_fn():
     """Branch to execute when quantiles are ready."""
     quantized_feature = quantile_ops.quantiles([float_column], [],
-                                               [quantile_buckets], [])
+                                               [quantile_buckets], [], [])
     quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
-    quantized_feature = array_ops.reshape(quantized_feature, [-1])
+    quantized_feature = array_ops.squeeze(quantized_feature)
     return (example_partition_ids, quantized_feature, gradients, hessians)
 
   def not_ready_inputs_fn():
-    return (constant_op.constant([], dtype=dtypes.int32), constant_op.constant(
-        [], dtype=dtypes.int64), empty_gradients, empty_hessians)
+    return (constant_op.constant([], dtype=dtypes.int32),
+            constant_op.constant([[]], dtype=dtypes.int64, shape=[1, 2]),
+            empty_gradients, empty_hessians)
 
   example_partition_ids, feature_ids, gradients, hessians = (
       control_flow_ops.cond(
@@ -461,10 +463,13 @@ def sparse_make_stats_update(
 
   def quantiles_ready():
     """The subgraph for when the quantiles are ready."""
-    quantized_feature = quantile_ops.quantiles([sparse_column_values], [],
-                                               [quantile_buckets], [])
-    quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
-    quantized_feature = array_ops.reshape(quantized_feature, [-1])
+    quantized_feature = quantile_ops.quantiles([], [sparse_column_values], [],
+                                               [quantile_buckets],
+                                               [sparse_column_indices])
+
+    quantized_feature = math_ops.cast(quantized_feature[1], dtypes.int64)
+    quantized_feature = array_ops.squeeze(quantized_feature)
+
     example_indices, _ = array_ops.split(
         sparse_column_indices, num_or_size_splits=2, axis=1)
     example_indices = array_ops.squeeze(example_indices, [1])
@@ -486,19 +491,25 @@ def sparse_make_stats_update(
     bias_feature_ids = array_ops.fill(
         array_ops.shape(unique_partitions), _BIAS_FEATURE_ID)
     bias_feature_ids = math_ops.cast(bias_feature_ids, dtypes.int64)
+    zeros = array_ops.zeros_like(bias_feature_ids)
+    bias_feature_ids = array_ops.stack([bias_feature_ids, zeros], axis=1)
+
     partition_ids = array_ops.concat(
         [unique_partitions, filtered_partition_ids], 0)
     filtered_gradients = array_ops.concat(
         [per_partition_gradients, filtered_gradients], 0)
     filtered_hessians = array_ops.concat(
         [per_partition_hessians, filtered_hessians], 0)
+
     bucket_ids = array_ops.concat([bias_feature_ids, quantized_feature], 0)
+
     return partition_ids, bucket_ids, filtered_gradients, filtered_hessians
 
   def quantiles_not_ready():
     """The subgraph for when the quantiles are not ready."""
-    return (constant_op.constant([], dtype=dtypes.int32), constant_op.constant(
-        [], dtype=dtypes.int64), empty_gradients, empty_hessians)
+    return (constant_op.constant([], dtype=dtypes.int32),
+            constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]),
+            empty_gradients, empty_hessians)
 
   empty_float = constant_op.constant([], dtype=dtypes.float32)
   handler_not_active = (constant_op.constant(
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
similarity index 90%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
index fe22691178213094b9affcdee06af98011f85bd2..339c2e0fded10e6a7b140da62e152e2868ffd164 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/feature-split-candidate.h
@@ -13,10 +13,10 @@
 // limitations under the License.
 //
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
 
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 
 namespace tensorflow {
@@ -58,4 +58,4 @@ struct FeatureSplitCandidate {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_FEATURE_SPLIT_CANDIDATE_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_FEATURE_SPLIT_CANDIDATE_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
similarity index 98%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
index dad64bf165a41bc4f32eea6b37e7afb569887a06..34e3ddb777242553d62035a51f1aec33d0f9ba54 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
 
 #include <math.h>
 
@@ -190,4 +190,4 @@ inline GradientStats operator-(const GradientStats& a, const GradientStats& b) {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_GRADIENT_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_GRADIENT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
similarity index 98%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
index 4e5f53874df2207ffa6664a33675f84ef055394b..642a183aec5c7e591579fa5ee91d45729bfb624d 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/Eigen/Eigenvalues"
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/gradient-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/gradient-stats.h"
 #include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
 #include "tensorflow/contrib/boosted_trees/proto/tree_config.pb.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -298,4 +298,4 @@ struct NodeStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_NODE_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_NODE_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
similarity index 99%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
index ecb7a04efb96248210d9af770c8377b7f6906598..f867e77d3ef0609774628b2a9c36ca52bcf2a957 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
 
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
similarity index 94%
rename from tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h
rename to tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
index f700cbced833543227de39f54c9ecbb03a7ce7c9..054ccd9a8cd0be0c48b14cca013f15677deba900 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/split-stats.h
+++ b/tensorflow/contrib/boosted_trees/lib/learner/common/stats/split-stats.h
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
 
 #include <string>
 
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/node-stats.h"
+#include "tensorflow/contrib/boosted_trees/lib/learner/common/stats/node-stats.h"
 
 namespace tensorflow {
 namespace boosted_trees {
@@ -81,4 +81,4 @@ struct SplitStats {
 }  // namespace boosted_trees
 }  // namespace tensorflow
 
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_STATS_SPLIT_STATS_H_
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_COMMON_STATS_SPLIT_STATS_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc
deleted file mode 100644
index b880cf2c47989b1434f17802befb7dd7c248b36f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-void BiasFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all examples and aggregate gradient stats for each sub-root.
-  for (int64 example_idx = 0; example_idx < batch_size_; ++example_idx) {
-    auto partition_id = example_partition_ids[example_idx];
-    gradient_stats_accumulator->AddStats(
-        slot_id_, class_id_, partition_id, kBiasFeatureId,
-        GradientStats(example_first_order_gradients,
-                      example_second_order_gradients, example_idx));
-  }
-}
-
-void BiasFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  split_candidates->clear();
-  split_candidates->reserve(roots.size());
-  boosted_trees::trees::TreeNode tree_node;
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    const NodeStats& root_node_stats = root_stats[root_idx];
-    tree_node.Clear();
-    root_node_stats.FillLeaf(class_id_, tree_node.mutable_leaf());
-    split_candidates->emplace_back(slot_id_, tree_node,
-                                   SplitStats(learner_config, root_node_stats));
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h
deleted file mode 100644
index 5c0f99185a63db33a391a98fa16f37bef99507c9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a bias feature column in the single class case.
-// This handler is useful even if we don't introduce a bias feature because
-// it allows us to aggregate stats per partition which in turn allows us
-// to compute node stats for each root to split.
-class BiasFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  BiasFeatureColumnHandler(const uint32 class_id, const uint32 slot_id,
-                           const int64 batch_size)
-      : FeatureColumnHandler(class_id, slot_id, batch_size) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
-  static constexpr auto kBiasFeatureId = 0;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_H_  // NOLINT
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc
deleted file mode 100644
index f4c7df7fabda1a38d7e6cca4c5c8bc81cb7551b1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler_test.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 7;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class BiasFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  BiasFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 1, 3}) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-
-    // Create handler.
-    handler_.reset(new BiasFeatureColumnHandler(kClassId, kSlotId, kBatchSize));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  std::unique_ptr<BiasFeatureColumnHandler> handler_;
-};
-
-TEST_F(BiasFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition.
-  // Partition 0.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(-0.3f, 0.19f),
-      accumulator.GetStats(kSlotId, kClassId, 0,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-  // Partition 1.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(1.2f, 0.2f),
-      accumulator.GetStats(kSlotId, kClassId, 1,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-  // Partition 2.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(0.0f, 0.0f),
-      accumulator.GetStats(kSlotId, kClassId, 2,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-  // Partition 3.
-  EXPECT_GRADIENT_STATS_EQ(
-      GradientStats(4.0f, 0.13f),
-      accumulator.GetStats(kSlotId, kClassId, 3,
-                           BiasFeatureColumnHandler::kBiasFeatureId));
-}
-
-TEST_F(BiasFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 3.
-  // Root 0 has zero gain and root 3 has the same gain as the leaf.
-  const std::vector<int32> roots = {0, 3};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(1), NodeStats(learner_config_, GradientStats(4.0f, 0.13f))};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect two candidate splits (one per root).
-  EXPECT_EQ(2, split_candidates.size());
-
-  // Verify first candidate for root 0, gain is expected to be the same as
-  // the left child since the root node gain is zero.
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0]);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kLeaf, tree_node0.node_case());
-  EXPECT_EQ(1, tree_node0.leaf().sparse_vector().index_size());
-  EXPECT_EQ(kClassId, tree_node0.leaf().sparse_vector().index(0));
-  EXPECT_EQ(1, tree_node0.leaf().sparse_vector().value_size());
-  EXPECT_EQ(root_stats[0].weight_contribution[0],
-            tree_node0.leaf().sparse_vector().value(0));
-
-  // Verify second candidate for root 3, gain is expected to be zero as
-  // the left child gain is equal to the parent gain.
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1]);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kLeaf, tree_node1.node_case());
-  EXPECT_EQ(1, tree_node1.leaf().sparse_vector().index_size());
-  EXPECT_EQ(kClassId, tree_node1.leaf().sparse_vector().index(0));
-  EXPECT_EQ(1, tree_node1.leaf().sparse_vector().value_size());
-  EXPECT_EQ(root_stats[1].weight_contribution[0],
-            tree_node1.leaf().sparse_vector().value(0));
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc
deleted file mode 100644
index 3a6c409f846c9ca0bd6b5101e96447642b949978..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h"
-
-#include "tensorflow/core/platform/macros.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-namespace {
-
-// Creates a categorical Id split node without assigning children.
-boosted_trees::trees::TreeNode CreateCategoricalIdNode(
-    const int32 feature_column, const int32 id) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_categorical_id_binary_split();
-  split->set_feature_column(feature_column);
-  split->set_feature_id(id);
-  return split_node;
-}
-
-}  // namespace
-
-void CategoricalFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all rows and aggregate gradient stats for each feature id.
-  const int64 num_rows = indices_.dimension(0);
-  for (int64 row_idx = 0; row_idx < num_rows; ++row_idx) {
-    auto example_idx = indices_(row_idx, 0);
-    auto feature_id = values_(row_idx);
-    const GradientStats norm_gradient_stats(example_first_order_gradients,
-                                            example_second_order_gradients,
-                                            example_idx);
-    auto partition_id = example_partition_ids[example_idx];
-    gradient_stats_accumulator->AddStats(slot_id_, class_id_, partition_id,
-                                         feature_id, norm_gradient_stats);
-  }
-}
-
-void CategoricalFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  // Build a reverse lookup of partition id to root idx.
-  std::unordered_map<int32, size_t> partition_id_to_root_idx;
-  partition_id_to_root_idx.reserve(roots.size());
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    partition_id_to_root_idx[roots[root_idx]] = root_idx;
-  }
-
-  // Initialize split candidates.
-  split_candidates->clear();
-  if (!roots.empty()) {
-    FeatureSplitCandidate empty_candidate(
-        root_stats[0].weight_contribution.size());
-    split_candidates->resize(roots.size(), empty_candidate);
-  }
-  for (auto& split_candidate : *split_candidates) {
-    split_candidate.split_stats.gain = std::numeric_limits<float>::lowest();
-  }
-
-  // Evaluate split candidates for every root as each is a separate
-  // logical partition over the examples.
-  // Then for each root, we evaluate every feature id as an equality split
-  // and pick the highest split gain.
-  for (const auto& entry :
-       gradient_stats_accumulator.GetFeatureStats(slot_id_)) {
-    DCHECK_EQ(entry.first.class_id, class_id_);
-
-    // Get partition id and root node stats.
-    const int32 partition_id = entry.first.partition_id;
-    auto root_idx_it = partition_id_to_root_idx.find(partition_id);
-    if (root_idx_it == partition_id_to_root_idx.end()) {
-      // Inactive partition.
-      continue;
-    }
-    size_t root_idx = root_idx_it->second;
-    const NodeStats& root_node_stats = root_stats[root_idx];
-
-    // Get gradient stats.
-    const auto& left_gradient_stats = entry.second;
-    auto right_gradient_stats =
-        root_node_stats.gradient_stats - left_gradient_stats;
-
-    // Get node stats.
-    NodeStats left_node_stats(learner_config, left_gradient_stats);
-    NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-    // Generate split candidate and update best split candidate for the
-    // current root if needed.
-    FeatureSplitCandidate split_candidate(
-        slot_id_,
-        CreateCategoricalIdNode(feature_column_, entry.first.feature_id),
-        SplitStats(learner_config, root_node_stats, left_node_stats,
-                   right_node_stats));
-    FeatureSplitCandidate& best_split_candidate = (*split_candidates)[root_idx];
-    if (TF_PREDICT_FALSE(best_split_candidate.tree_node.node_case() ==
-                         boosted_trees::trees::TreeNode::NODE_NOT_SET)) {
-      // Always replace candidates with no node set.
-      best_split_candidate = std::move(split_candidate);
-    } else if (TF_PREDICT_FALSE(split_candidate.split_stats.gain ==
-                                best_split_candidate.split_stats.gain)) {
-      // Tie break on feature id.
-      auto best_split_feature_id =
-          best_split_candidate.tree_node.categorical_id_binary_split()
-              .feature_id();
-      if (entry.first.feature_id < best_split_feature_id) {
-        best_split_candidate = std::move(split_candidate);
-      }
-    } else if (split_candidate.split_stats.gain >
-               best_split_candidate.split_stats.gain) {
-      best_split_candidate = std::move(split_candidate);
-    }
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h
deleted file mode 100644
index ef964ba716c6adf9cf9c291cca5f52f7a6efe26f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a categorical feature column in the single class case.
-class CategoricalFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  CategoricalFeatureColumnHandler(const int32 class_id, const int32 slot_id,
-                                  const int64 batch_size,
-                                  const int32 feature_column,
-                                  TTypes<int64>::ConstMatrix indices,
-                                  TTypes<int64>::ConstVec values)
-      : FeatureColumnHandler(class_id, slot_id, batch_size),
-        feature_column_(feature_column),
-        indices_(indices),
-        values_(values) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
- protected:
-  const int32 feature_column_;
-  TTypes<int64>::ConstMatrix indices_;
-  TTypes<int64>::ConstVec values_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_CATEGORICAL_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc
deleted file mode 100644
index ea82b3f086d24dc1f9ceb4783abd68be35b34b00..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler_test.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 7;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-const auto kFeatureColumn = 3;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class CategoricalFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  // The data looks like the following:
-  // Example |  Gradients    | Partition | Feature Id |
-  // i0      |  (0.2, 0.12)  |     0     |    1,2     |
-  // i1      |  (-0.5, 0.07) |     0     |            |
-  // i2      |  (1.2, 0.2)   |     0     |     2      |
-  // i3      |  (4.0, 0.13)  |     1     |     0      |
-  CategoricalFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 0, 1}),
-        indices_(test::AsTensor<int64>({0, 0, 0, 1, 2, 0, 3, 0}, {4, 2})),
-        values_(test::AsTensor<int64>({1, 2, 2, 0}, {4})) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-    // Create handler.
-    handler_.reset(new CategoricalFeatureColumnHandler(
-        kClassId, kSlotId, kBatchSize, kFeatureColumn, indices_.matrix<int64>(),
-        values_.vec<int64>()));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  const Tensor indices_;
-  const Tensor values_;
-  std::unique_ptr<FeatureColumnHandler> handler_;
-};
-
-TEST_F(CategoricalFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition and feature.
-  // Partition 0, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
-  // Partition 0, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f, 0.12f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
-  // Partition 0, Feature 2.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f + 1.2f, 0.12f + 0.2f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 2));
-
-  // Partition 1, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
-  // Partition 1, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
-  // Partition 1, Feature 2.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 2));
-}
-
-TEST_F(CategoricalFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 1.
-  // The root stats are derived from the per-partition total gradient stats.
-  const std::vector<int32> roots = {0, 1, 5};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
-      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect three candidate splits (one per root).
-  EXPECT_EQ(3, split_candidates.size());
-
-  // Verify candidate for root 0, the best split occurs when we route
-  // example i0, i2 left and i1 right.
-  const NodeStats expected_left_node0(learner_config_,
-                                      GradientStats(0.2f + 1.2f, 0.12f + 0.2f));
-  const NodeStats expected_right_node0(
-      learner_config_,
-      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
-                                         expected_left_node0,
-                                         expected_right_node0);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(
-      boosted_trees::trees::TreeNode::kCategoricalIdBinarySplitFieldNumber,
-      tree_node0.node_case());
-  const auto& split0 = tree_node0.categorical_id_binary_split();
-  EXPECT_EQ(2, split0.feature_id());
-  EXPECT_EQ(kFeatureColumn, split0.feature_column());
-
-  // Verify candidate for root 1, there's only one active feature here
-  // so zero gain is expected.
-  const NodeStats expected_left_node1(learner_config_,
-                                      root_stats[1].gradient_stats);
-  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
-                                         expected_left_node1,
-                                         expected_right_node1);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(
-      boosted_trees::trees::TreeNode::kCategoricalIdBinarySplitFieldNumber,
-      tree_node1.node_case());
-  const auto& split1 = tree_node1.categorical_id_binary_split();
-  EXPECT_EQ(0, split1.feature_id());
-  EXPECT_EQ(kFeatureColumn, split1.feature_column());
-
-  // Verify there are no candidate splits for root 5.
-  const auto& tree_node2 = split_candidates[2].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
-            tree_node2.node_case());
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc
deleted file mode 100644
index ca7bb71e7d0b0fc945ee29092b1e36022d4c0943..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-namespace {
-
-// Creates a dense split node without assigning children.
-boosted_trees::trees::TreeNode CreateDenseSplitNode(const int32 feature_column,
-                                                    const float threshold) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_dense_float_binary_split();
-  split->set_feature_column(feature_column);
-  split->set_threshold(threshold);
-  return split_node;
-}
-
-}  // namespace
-
-void DenseQuantizedFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all examples and aggregate gradient stats for each partition
-  // and quantized feature bucket.
-  for (int64 example_idx = 0; example_idx < batch_size_; ++example_idx) {
-    auto partition_id = example_partition_ids[example_idx];
-    auto feature_id = dense_quantized_values_(example_idx);
-    gradient_stats_accumulator->AddStats(
-        slot_id_, class_id_, partition_id, feature_id,
-        GradientStats(example_first_order_gradients,
-                      example_second_order_gradients, example_idx));
-  }
-}
-
-void DenseQuantizedFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  // Evaluate split candidates for every root as each is a separate
-  // logical partition over the examples.
-  // Then for each root, we do a forward-only pass over the quantized
-  // feature buckets accumulating gradients from left to right.
-  // Split gains are evaluated at every threshold and the best split is picked.
-  split_candidates->clear();
-  split_candidates->reserve(roots.size());
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    // Get partition Id and root node stats.
-    const int32 partition_id = roots[root_idx];
-    const NodeStats& root_node_stats = root_stats[root_idx];
-
-    // Forward left to right pass over quantiles.
-    GradientStats left_gradient_stats;
-    GradientStats right_gradient_stats(root_node_stats.gradient_stats);
-    FeatureSplitCandidate best_split_candidate(
-        root_node_stats.weight_contribution.size());
-    best_split_candidate.split_stats.gain =
-        std::numeric_limits<float>::lowest();
-    for (int bucket_id = 0; bucket_id < dense_quantiles_.size(); ++bucket_id) {
-      // Get gradient stats.
-      auto gradient_stats = gradient_stats_accumulator.GetStats(
-          slot_id_, class_id_, partition_id, bucket_id);
-      if (gradient_stats.IsZero()) {
-        continue;
-      }
-
-      // Update gradient stats.
-      left_gradient_stats += gradient_stats;
-      right_gradient_stats =
-          root_node_stats.gradient_stats - left_gradient_stats;
-
-      // Get node stats
-      NodeStats left_node_stats(learner_config, left_gradient_stats);
-      NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-      // Generate split candidate.
-      const float threshold = dense_quantiles_(bucket_id);
-      FeatureSplitCandidate split_candidate(
-          slot_id_, CreateDenseSplitNode(dense_feature_column_, threshold),
-          SplitStats(learner_config, root_node_stats, left_node_stats,
-                     right_node_stats));
-      if (split_candidate.split_stats.gain >
-          best_split_candidate.split_stats.gain) {
-        best_split_candidate = std::move(split_candidate);
-      }
-    }
-
-    // Add best candidate for partition.
-    split_candidates->push_back(std::move(best_split_candidate));
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h
deleted file mode 100644
index 0f3858e4d8c406e9ec3ae7079b241e94ef4aa35c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a dense quantized feature column in the single class case.
-class DenseQuantizedFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  DenseQuantizedFeatureColumnHandler(
-      const int32 class_id, const int32 slot_id, const int64 batch_size,
-      const int32 dense_feature_column, TTypes<float>::ConstVec dense_quantiles,
-      TTypes<int32>::ConstVec dense_quantized_values)
-      : FeatureColumnHandler(class_id, slot_id, batch_size),
-        dense_feature_column_(dense_feature_column),
-        dense_quantiles_(dense_quantiles),
-        dense_quantized_values_(dense_quantized_values) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
- protected:
-  const int32 dense_feature_column_;
-  TTypes<float>::ConstVec dense_quantiles_;
-  TTypes<int32>::ConstVec dense_quantized_values_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_DENSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc
deleted file mode 100644
index 1bc9d733ad3090f1cfc9547644061f54d7d2c8c6..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler_test.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 1;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-const auto kFeatureColumn = 2;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class DenseQuantizedFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  // The data looks like the following:
-  // Example |  Gradients    | Partition | Dense Quantile |
-  // i0      |  (0.2, 0.12)  | 0         | 1              |
-  // i1      |  (-0.5, 0.07) | 0         | 1              |
-  // i2      |  (1.2, 0.2)   | 0         | 0              |
-  // i3      |  (4.0, 0.13)  | 1         | 1              |
-  DenseQuantizedFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 0, 1}),
-        dense_quantiles_(test::AsTensor<float>({0.3f, 0.52f}, {2})),
-        dense_quantized_values_(test::AsTensor<int32>({1, 1, 0, 1}, {4})) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-    // Create handler.
-    handler_.reset(new DenseQuantizedFeatureColumnHandler(
-        kClassId, kSlotId, kBatchSize, kFeatureColumn,
-        dense_quantiles_.vec<float>(), dense_quantized_values_.vec<int32>()));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  const Tensor dense_quantiles_;
-  const Tensor dense_quantized_values_;
-  std::unique_ptr<FeatureColumnHandler> handler_;
-};
-
-TEST_F(DenseQuantizedFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition and feature.
-  // Partition 0, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(1.2f, 0.2f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
-  // Partition 0, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(-0.3f, 0.19f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
-  // Partition 1, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
-  // Partition 1, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
-}
-
-TEST_F(DenseQuantizedFeatureColumnHandlerTest, GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 1.
-  // The root stats are derived from the per-partition total gradient stats.
-  const std::vector<int32> roots = {0, 1, 5};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
-      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect three candidate splits (one per root).
-  EXPECT_EQ(3, split_candidates.size());
-
-  // Verify candidate for root 0, the best split occurs when we route
-  // example i2 left and i0, i1 right.
-  const NodeStats expected_left_node0(learner_config_,
-                                      GradientStats(1.2f, 0.2f));
-  const NodeStats expected_right_node0(
-      learner_config_,
-      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
-                                         expected_left_node0,
-                                         expected_right_node0);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kDenseFloatBinarySplit,
-            tree_node0.node_case());
-  const auto& split0 = tree_node0.dense_float_binary_split();
-  EXPECT_FLOAT_EQ(dense_quantiles_.vec<float>()(0), split0.threshold());
-  EXPECT_EQ(kFeatureColumn, split0.feature_column());
-
-  // Verify candidate for root 1, there's only one active bucket here
-  // so zero gain is expected.
-  const NodeStats expected_left_node1(learner_config_,
-                                      root_stats[1].gradient_stats);
-  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
-                                         expected_left_node1,
-                                         expected_right_node1);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kDenseFloatBinarySplit,
-            tree_node1.node_case());
-  const auto& split1 = tree_node1.dense_float_binary_split();
-  EXPECT_FLOAT_EQ(dense_quantiles_.vec<float>()(1), split1.threshold());
-  EXPECT_EQ(kFeatureColumn, split1.feature_column());
-
-  // Verify there are no candidate splits for root 5.
-  const auto& tree_node2 = split_candidates[2].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
-            tree_node2.node_case());
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h
deleted file mode 100644
index 8bd2092f9609cb684b89f70cab35a92789fb39a4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
-
-#include <vector>
-#include "tensorflow/contrib/boosted_trees/lib/learner/common/accumulators/feature-stats-accumulator.h"
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/stats/feature-split-candidate.h"
-#include "tensorflow/contrib/boosted_trees/proto/learner.pb.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/tensor_types.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler interface for feature columns. Each feature column type may
-// have its own handler which encapsulates the logic of aggregating gradient
-// stats as well as generating split candidates for each partition.
-// Handlers can be stateful and must be thread compatible.
-class FeatureColumnHandler {
- public:
-  FeatureColumnHandler(const int32 class_id, const int32 slot_id,
-                       const int64 batch_size)
-      : class_id_(class_id), slot_id_(slot_id), batch_size_(batch_size) {}
-
-  virtual ~FeatureColumnHandler() {}
-  FeatureColumnHandler(const FeatureColumnHandler& other) = delete;
-  FeatureColumnHandler& operator=(const FeatureColumnHandler& other) = delete;
-
-  // Aggregates example gradient stats for the feature column.
-  virtual void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const = 0;
-
-  // Generates feature column split candidates for the specified roots.
-  virtual void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const = 0;
-
-  // Accessors.
-  int32 class_id() const { return class_id_; }
-  int32 slot_id() const { return slot_id_; }
-  int64 batch_size() const { return batch_size_; }
-
- protected:
-  // The class Id.
-  const int32 class_id_;
-
-  // The slod Id for use as a unique Id across all feature columns.
-  const int32 slot_id_;
-
-  // Size of the batch of examples.
-  const int64 batch_size_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  //  THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc
deleted file mode 100644
index a0e9efbbc5030e8c2e25fafab98271337a2e582a..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-namespace {
-
-// Creates a sparse default right split node without assigning children.
-boosted_trees::trees::TreeNode CreateSparseSplitNodeDefaultRight(
-    int32 feature_column, float threshold) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_sparse_float_binary_split_default_right()
-                    ->mutable_split();
-  split->set_feature_column(feature_column);
-  split->set_threshold(threshold);
-  return split_node;
-}
-
-// Creates a sparse default left split node without assigning children.
-boosted_trees::trees::TreeNode CreateSparseSplitNodeDefaultLeft(
-    int32 feature_column, float threshold) {
-  boosted_trees::trees::TreeNode split_node;
-  auto* split = split_node.mutable_sparse_float_binary_split_default_left()
-                    ->mutable_split();
-  split->set_feature_column(feature_column);
-  split->set_threshold(threshold);
-  return split_node;
-}
-
-}  // namespace
-
-void SparseQuantizedFeatureColumnHandler::AggregateGradientStats(
-    const std::vector<int32>& example_partition_ids,
-    const Tensor& example_first_order_gradients,
-    const Tensor& example_second_order_gradients,
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-        gradient_stats_accumulator) const {
-  // Pass over all rows and aggregate gradient stats for each partition
-  // and quantized feature bucket.
-  const int64 num_rows = sparse_indices_.dimension(0);
-  for (int64 row_idx = 0; row_idx < num_rows; ++row_idx) {
-    auto example_idx = sparse_indices_(row_idx, 0);
-    auto partition_id = example_partition_ids[example_idx];
-    auto feature_id = sparse_quantized_values_(row_idx);
-    gradient_stats_accumulator->AddStats(
-        slot_id_, class_id_, partition_id, feature_id,
-        GradientStats(example_first_order_gradients,
-                      example_second_order_gradients, example_idx));
-  }
-}
-
-void SparseQuantizedFeatureColumnHandler::GenerateFeatureSplitCandidates(
-    const LearnerConfig& learner_config, const std::vector<int32>& roots,
-    const std::vector<NodeStats>& root_stats,
-    const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-        gradient_stats_accumulator,
-    std::vector<FeatureSplitCandidate>* split_candidates) const {
-  // Evaluate split candidates for every root as each is a separate
-  // logical partition over the examples.
-  // Then for each root, we do both a forward left to right pass and a backward
-  // right to left pass over the quantized feature buckets accumulating
-  // gradients on one side and using the root aggregate gradients to get the
-  // gradients for the other side. Split gains are evaluated for each pass at
-  // every threshold and the best split is picked.
-  split_candidates->clear();
-  split_candidates->reserve(roots.size());
-  for (size_t root_idx = 0; root_idx < roots.size(); ++root_idx) {
-    // Get partition Id and root node stats.
-    const int32 partition_id = roots[root_idx];
-    const NodeStats& root_node_stats = root_stats[root_idx];
-
-    // Forward pass with right default direction.
-    GradientStats left_gradient_stats;
-    GradientStats right_gradient_stats(root_node_stats.gradient_stats);
-    FeatureSplitCandidate best_split_candidate(
-        root_node_stats.weight_contribution.size());
-    best_split_candidate.split_stats.gain =
-        std::numeric_limits<float>::lowest();
-    for (int bucket_id = 0; bucket_id < sparse_quantiles_.size(); ++bucket_id) {
-      // Get gradient stats.
-      auto gradient_stats = gradient_stats_accumulator.GetStats(
-          slot_id_, class_id_, partition_id, bucket_id);
-      if (gradient_stats.IsZero()) {
-        continue;
-      }
-
-      // Update gradient stats.
-      left_gradient_stats += gradient_stats;
-      right_gradient_stats =
-          root_node_stats.gradient_stats - left_gradient_stats;
-
-      // Get node stats
-      NodeStats left_node_stats(learner_config, left_gradient_stats);
-      NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-      // Generate split candidate.
-      const float threshold = sparse_quantiles_(bucket_id);
-      FeatureSplitCandidate split_candidate(
-          slot_id_,
-          CreateSparseSplitNodeDefaultRight(sparse_feature_column_, threshold),
-          SplitStats(learner_config, root_node_stats, left_node_stats,
-                     right_node_stats));
-      if (split_candidate.split_stats.gain >
-          best_split_candidate.split_stats.gain) {
-        best_split_candidate = std::move(split_candidate);
-      }
-    }
-
-    // Determine if we need a backward pass by checking if the residual gradient
-    // after forward aggregation is almost the same as the aggregated gradient.
-    // for the current root. This helps avoid unnecessary computation as well
-    // as consistency due to floating point precision.
-    if (!right_gradient_stats.IsAlmostZero()) {
-      // Backward pass with left default direction.
-      right_gradient_stats = GradientStats();
-      left_gradient_stats = root_node_stats.gradient_stats;
-      for (int bucket_id = sparse_quantiles_.size() - 1; bucket_id > 0;
-           --bucket_id) {
-        // Get gradient stats.
-        auto gradient_stats = gradient_stats_accumulator.GetStats(
-            slot_id_, class_id_, partition_id, bucket_id);
-        if (gradient_stats.IsZero()) {
-          continue;
-        }
-
-        // Update gradient stats.
-        right_gradient_stats += gradient_stats;
-        left_gradient_stats = root_node_stats.gradient_stats - gradient_stats;
-
-        // Get node stats
-        NodeStats left_node_stats(learner_config, left_gradient_stats);
-        NodeStats right_node_stats(learner_config, right_gradient_stats);
-
-        // Generate split candidate.
-        const float threshold = sparse_quantiles_(bucket_id - 1);
-        FeatureSplitCandidate split_candidate(
-            slot_id_,
-            CreateSparseSplitNodeDefaultLeft(sparse_feature_column_, threshold),
-            SplitStats(learner_config, root_node_stats, left_node_stats,
-                       right_node_stats));
-        if (split_candidate.split_stats.gain >
-            best_split_candidate.split_stats.gain) {
-          best_split_candidate = std::move(split_candidate);
-        }
-      }
-    }
-
-    // Add best candidate for partition.
-    split_candidates->push_back(std::move(best_split_candidate));
-  }
-}
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h
deleted file mode 100644
index eb63e705471a65e8448bda38b2e31eb971d5c1bb..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/feature-column-handler.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-
-// Handler for a sparse quantized feature column in the single class case.
-class SparseQuantizedFeatureColumnHandler : public FeatureColumnHandler {
- public:
-  SparseQuantizedFeatureColumnHandler(
-      const int32 class_id, const int32 slot_id, const int64 batch_size,
-      const int32 sparse_feature_column,
-      TTypes<float>::ConstVec sparse_quantiles,
-      TTypes<int64>::ConstMatrix sparse_indices,
-      TTypes<int32>::ConstVec sparse_quantized_values)
-      : FeatureColumnHandler(class_id, slot_id, batch_size),
-        sparse_feature_column_(sparse_feature_column),
-        sparse_quantiles_(sparse_quantiles),
-        sparse_indices_(sparse_indices),
-        sparse_quantized_values_(sparse_quantized_values) {}
-
-  void AggregateGradientStats(
-      const std::vector<int32>& example_partition_ids,
-      const Tensor& example_first_order_gradients,
-      const Tensor& example_second_order_gradients,
-      FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>*
-          gradient_stats_accumulator) const override;
-
-  void GenerateFeatureSplitCandidates(
-      const LearnerConfig& learner_config, const std::vector<int32>& roots,
-      const std::vector<NodeStats>& root_stats,
-      const FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>&
-          gradient_stats_accumulator,
-      std::vector<FeatureSplitCandidate>* split_candidates) const override;
-
- protected:
-  const int32 sparse_feature_column_;
-  TTypes<float>::ConstVec sparse_quantiles_;
-  TTypes<int64>::ConstMatrix sparse_indices_;
-  TTypes<int32>::ConstVec sparse_quantized_values_;
-};
-
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
-
-#endif  //  THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_LEARNER_STOCHASTIC_HANDLERS_SPARSE_QUANTIZED_FEATURE_COLUMN_HANDLER_H_
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc b/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc
deleted file mode 100644
index 643d936ad23850e601bc5518d69c8637011f53c0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler_test.cc
+++ /dev/null
@@ -1,162 +0,0 @@
-// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// =============================================================================
-
-#include "tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.h"
-
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace boosted_trees {
-namespace learner {
-namespace stochastic {
-namespace {
-
-using boosted_trees::learner::LearnerConfig;
-
-const auto kClassId = 3;
-const auto kSlotId = 0;
-const auto kBatchSize = 4;
-const auto kFeatureColumn = 4;
-
-using FeatureStatsAccumulator =
-    FeatureStatsAccumulator<GradientStats, GradientStatsAccumulator>;
-
-class SparseQuantizedFeatureColumnHandlerTest : public ::testing::Test {
- protected:
-  // The data looks like the following:
-  // Example |  Gradients    | Partition | Sparse Quantile |
-  // i0      |  (0.2, 0.12)  | 0         | 1               |
-  // i1      |  (-0.5, 0.07) | 0         | N/A             |
-  // i2      |  (1.2, 0.2)   | 0         | 0               |
-  // i3      |  (4.0, 0.13)  | 1         | 1               |
-  SparseQuantizedFeatureColumnHandlerTest()
-      : example_first_order_gradients_(
-            test::AsTensor<float>({0.2f, -0.5f, 1.2f, 4.0f}, {4})),
-        example_second_order_gradients_(
-            test::AsTensor<float>({0.12f, 0.07f, 0.2f, 0.13f}, {4})),
-        example_partitions_({0, 0, 0, 1}),
-        sparse_quantiles_(test::AsTensor<float>({0.3f, 0.52f}, {2})),
-        sparse_indices_(test::AsTensor<int64>({0, 0, 2, 0, 3, 0}, {3, 2})),
-        sparse_quantized_values_(test::AsTensor<int32>({1, 0, 1}, {3})) {
-    // Set L2 regularization.
-    learner_config_.mutable_regularization()->set_l2(2.0f);
-    learner_config_.set_multi_class_strategy(LearnerConfig::TREE_PER_CLASS);
-    // Create handler.
-    handler_.reset(new SparseQuantizedFeatureColumnHandler(
-        kClassId, kSlotId, kBatchSize, kFeatureColumn,
-        sparse_quantiles_.vec<float>(), sparse_indices_.matrix<int64>(),
-        sparse_quantized_values_.vec<int32>()));
-  }
-
-  LearnerConfig learner_config_;
-  const Tensor example_first_order_gradients_;
-  const Tensor example_second_order_gradients_;
-  const std::vector<int32> example_partitions_;
-  const Tensor sparse_quantiles_;
-  const Tensor sparse_indices_;
-  const Tensor sparse_quantized_values_;
-  std::unique_ptr<FeatureColumnHandler> handler_;
-};
-
-TEST_F(SparseQuantizedFeatureColumnHandlerTest, AggregateGradientStats) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Check stats for each partition and feature.
-  // Partition 0, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(1.2f, 0.2f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 0));
-  // Partition 0, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.2f, 0.12f),
-                           accumulator.GetStats(kSlotId, kClassId, 0, 1));
-  // Partition 1, Feature 0.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(0.0f, 0.0f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 0));
-  // Partition 1, Feature 1.
-  EXPECT_GRADIENT_STATS_EQ(GradientStats(4.0f, 0.13f),
-                           accumulator.GetStats(kSlotId, kClassId, 1, 1));
-}
-
-TEST_F(SparseQuantizedFeatureColumnHandlerTest,
-       GenerateFeatureSplitCandidates) {
-  // Create handler.
-  FeatureStatsAccumulator accumulator(1);
-  handler_->AggregateGradientStats(
-      example_partitions_, example_first_order_gradients_,
-      example_second_order_gradients_, &accumulator);
-
-  // Get feature split candidates for two roots 0 and 1.
-  // The root stats are derived from the per-partition total gradient stats.
-  const std::vector<int32> roots = {0, 1, 9};
-  const std::vector<NodeStats>& root_stats = {
-      NodeStats(learner_config_, GradientStats(0.9f, 0.39f)),
-      NodeStats(learner_config_, GradientStats(4.0f, 0.13f)), NodeStats(1)};
-  std::vector<FeatureSplitCandidate> split_candidates;
-  handler_->GenerateFeatureSplitCandidates(learner_config_, roots, root_stats,
-                                           accumulator, &split_candidates);
-  // Expect three candidate splits (one per root).
-  EXPECT_EQ(3, split_candidates.size());
-
-  // Verify candidate for root 0, the best split occurs when we route
-  // example i0 and i2 to the left and i1 to the right (by default direction).
-  const NodeStats expected_left_node0(learner_config_,
-                                      GradientStats(0.2f + 1.2f, 0.12f + 0.2f));
-  const NodeStats expected_right_node0(
-      learner_config_,
-      root_stats[0].gradient_stats - expected_left_node0.gradient_stats);
-  const SplitStats expected_split_stats0(learner_config_, root_stats[0],
-                                         expected_left_node0,
-                                         expected_right_node0);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats0, split_candidates[0].split_stats);
-  const auto& tree_node0 = split_candidates[0].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kSparseFloatBinarySplitDefaultRight,
-            tree_node0.node_case());
-  const auto& split0 =
-      tree_node0.sparse_float_binary_split_default_right().split();
-  EXPECT_FLOAT_EQ(sparse_quantiles_.vec<float>()(1), split0.threshold());
-  EXPECT_EQ(kFeatureColumn, split0.feature_column());
-
-  // Verify candidate for root 1, there's only one active bucket here
-  // so zero gain is expected.
-  const NodeStats expected_left_node1(learner_config_,
-                                      root_stats[1].gradient_stats);
-  const NodeStats expected_right_node1(learner_config_, GradientStats(0, 0));
-  const SplitStats expected_split_stats1(learner_config_, root_stats[1],
-                                         expected_left_node1,
-                                         expected_right_node1);
-  EXPECT_SPLIT_STATS_EQ(expected_split_stats1, split_candidates[1].split_stats);
-  const auto& tree_node1 = split_candidates[1].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::kSparseFloatBinarySplitDefaultRight,
-            tree_node1.node_case());
-  const auto& split1 =
-      tree_node1.sparse_float_binary_split_default_right().split();
-  EXPECT_FLOAT_EQ(sparse_quantiles_.vec<float>()(1), split1.threshold());
-  EXPECT_EQ(kFeatureColumn, split1.feature_column());
-
-  // Verify there are no candidate splits for root 9.
-  const auto& tree_node2 = split_candidates[2].tree_node;
-  EXPECT_EQ(boosted_trees::trees::TreeNode::NODE_NOT_SET,
-            tree_node2.node_case());
-}
-
-}  // namespace
-}  // namespace stochastic
-}  // namespace learner
-}  // namespace boosted_trees
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
index 5e316538cefed30b2867252c9ebc4754216db329..70037d5bd8f446bdbbfcc468edb8a76c05e4fab7 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_buffer.h
@@ -33,9 +33,9 @@ template <typename ValueType, typename WeightType,
 class WeightedQuantilesBuffer {
  public:
   struct BufferEntry {
-    BufferEntry(const ValueType& v, const WeightType& w)
-        : value(v), weight(w) {}
-    BufferEntry() : value(0), weight(0) {}
+    BufferEntry(ValueType v, WeightType w)
+        : value(std::move(v)), weight(std::move(w)) {}
+    BufferEntry() : value(), weight(0) {}
 
     bool operator<(const BufferEntry& other) const {
       return kCompFn(value, other.value);
@@ -67,7 +67,7 @@ class WeightedQuantilesBuffer {
 
   // Push entry to buffer and maintain a compact representation within
   // pre-defined size limit.
-  void PushEntry(const ValueType& value, const WeightType& weight) {
+  void PushEntry(ValueType value, WeightType weight) {
     // Callers are expected to act on a full compacted buffer after the
     // PushEntry call returns.
     QCHECK(!IsFull()) << "Buffer already full: " << max_size_;
@@ -78,7 +78,7 @@ class WeightedQuantilesBuffer {
     }
 
     // Push back the entry to the buffer.
-    vec_.push_back(BufferEntry(value, weight));
+    vec_.push_back(BufferEntry(std::move(value), std::move(weight)));
   }
 
   // Returns a sorted vector view of the base buffer and clears the buffer.
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
index f8750e7191673274772fc869c198dd5fbbefbc49..0e5578693a7b90b16eada1127cad992612fb6dad 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc
@@ -52,13 +52,13 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
             example.sparse_float_features[split.feature_column()];
         // Feature id for the split when multivalent sparse float column, or 0
         // by default.
-        const int32 feature_id = split.feature_id();
+        const int32 dimension_id = split.dimension_id();
 
-        node_id =
-            !sparse_feature[feature_id].has_value() ||
-                    sparse_feature[feature_id].get_value() <= split.threshold()
-                ? split.left_id()
-                : split.right_id();
+        node_id = !sparse_feature[dimension_id].has_value() ||
+                          sparse_feature[dimension_id].get_value() <=
+                              split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
         break;
       }
       case TreeNode::kSparseFloatBinarySplitDefaultRight: {
@@ -68,12 +68,12 @@ int DecisionTree::Traverse(const DecisionTreeConfig& config,
             example.sparse_float_features[split.feature_column()];
         // Feature id for the split when multivalent sparse float column, or 0
         // by default.
-        const int32 feature_id = split.feature_id();
-        node_id =
-            sparse_feature[feature_id].has_value() &&
-                    sparse_feature[feature_id].get_value() <= split.threshold()
-                ? split.left_id()
-                : split.right_id();
+        const int32 dimension_id = split.dimension_id();
+        node_id = sparse_feature[dimension_id].has_value() &&
+                          sparse_feature[dimension_id].get_value() <=
+                              split.threshold()
+                      ? split.left_id()
+                      : split.right_id();
         break;
       }
       case TreeNode::kCategoricalIdBinarySplit: {
diff --git a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
index 93924d429c19aef51b6f1d85655de3798a76e3e0..58fe8e335af28fe811c1ee785578aa58d898335b 100644
--- a/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/trees/decision_tree_test.cc
@@ -190,7 +190,7 @@ TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
     tree_config.add_nodes()->mutable_leaf();
 
     // Split on first column
-    split_node->set_feature_id(0);
+    split_node->set_dimension_id(0);
     split_node->set_threshold(2.0f);
 
     // Both instances have this feature value.
@@ -199,7 +199,7 @@ TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
     EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *++example_it));
 
     // Split on second column
-    split_node->set_feature_id(1);
+    split_node->set_dimension_id(1);
     split_node->set_threshold(5.0f);
 
     // First instance does not have it (default right), second does have it.
@@ -208,7 +208,7 @@ TEST_F(DecisionTreeTest, TraverseSparseBinarySplit) {
     EXPECT_EQ(1, DecisionTree::Traverse(tree_config, 0, *++example_it));
 
     // Split on third column
-    split_node->set_feature_id(2);
+    split_node->set_dimension_id(2);
     split_node->set_threshold(3.0f);
     example_it = example_iterable.begin();
 
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index 7a550d6f7328765d8815a947885e47fa0b0a8f8b..badc629a118f768d5aa25ef1b94b8190e6910c7f 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -56,7 +56,7 @@ class BatchFeatures {
     *num_sparse_int_features = sparse_int_feature_columns_.size();
     if (*num_dense_float_features == 0 && *num_sparse_float_features == 0 &&
         *num_sparse_int_features == 0) {
-      return errors::FailedPrecondition("Not intialized yet.");
+      return errors::FailedPrecondition("Not initialized yet.");
     }
     return Status::OK();
   }
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/example.h b/tensorflow/contrib/boosted_trees/lib/utils/example.h
index e388cf332c3ff327f79ea57e3a0bccbbaa1b5e45..54f60e1dee49a4a40b84fcc6e042fac1858aa187 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/example.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/example.h
@@ -63,7 +63,7 @@ class SparseFloatFeatureColumn {
  public:
   void Reserve(const int32 size) {
     if (!single_dimensional_) {
-      mutlidimensional_values.Reserve(size);
+      multidimensional_values.Reserve(size);
     }
   }
 
@@ -76,7 +76,7 @@ class SparseFloatFeatureColumn {
       DCHECK_EQ(0, feature_idx);
       single_value_ = value;
     } else {
-      mutlidimensional_values.Add(feature_idx, value);
+      multidimensional_values.Add(feature_idx, value);
     }
     initialized_ = true;
   }
@@ -84,7 +84,7 @@ class SparseFloatFeatureColumn {
   void Clear() {
     single_dimensional_ = false;
     initialized_ = false;
-    mutlidimensional_values.Clear();
+    multidimensional_values.Clear();
   }
 
   OptionalValue<T> operator[](int feature_idx) const {
@@ -94,7 +94,7 @@ class SparseFloatFeatureColumn {
     if (single_dimensional_) {
       return OptionalValue<T>(single_value_);
     } else {
-      return mutlidimensional_values[feature_idx];
+      return multidimensional_values[feature_idx];
     }
   }
 
@@ -102,7 +102,7 @@ class SparseFloatFeatureColumn {
   bool single_dimensional_;
   bool initialized_;
   T single_value_;
-  SparseMultidimensionalValues<T> mutlidimensional_values;
+  SparseMultidimensionalValues<T> multidimensional_values;
 };
 
 // Holds data for one example and enables lookup by feature column.
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
index bc0a93db8c39abf737d11682088233e2fd88e868..ccee9530b6897924453461c13b1238402c0f6cfa 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
+++ b/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc
@@ -96,6 +96,10 @@ class IndicesRowIterator
     return (row_idx_ != other.row_idx_);
   }
 
+  bool operator<(const IndicesRowIterator& other) const {
+    return (row_idx_ < other.row_idx_);
+  }
+
   bool operator==(const IndicesRowIterator& other) const {
     QCHECK_EQ(iter_, other.iter_);
     return (row_idx_ == other.row_idx_);
diff --git a/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc b/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
index 82b8e8c1c272ca415b5841f5ba9433e00173f8fa..d66f645f62aba84261337eb37d6e3204930f8f15 100644
--- a/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
@@ -36,7 +36,7 @@ static Status ApplyGradientTreesPredictionShapeFn(InferenceContext* c) {
   c->set_output(0, {c->Matrix(InferenceContext::kUnknownDim,
                               reduce_dim ? learner_config.num_classes() - 1
                                          : learner_config.num_classes())});
-  c->set_output(1, {c->Vector(InferenceContext::kUnknownDim)});
+  c->set_output(1, {c->UnknownShape()});
   return Status::OK();
 }
 
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index 4ca73ef6e3301aadda48d5c971c31b57b7925614..1fa70bafddb0c94f47d006d5694bea941edaddf9 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -268,6 +268,7 @@ REGISTER_OP("Quantiles")
     .Input("sparse_values: num_sparse_features * float")
     .Input("dense_buckets: num_dense_features * float")
     .Input("sparse_buckets: num_sparse_features * float")
+    .Input("sparse_indices: num_sparse_features * int64")
     .Output("dense_quantiles: num_dense_features * int32")
     .Output("sparse_quantiles: num_sparse_features * int32")
     .Doc(R"doc(
@@ -280,10 +281,13 @@ dense_values: List of rank 1 tensors containing the dense values.
 sparse_values: List of rank 1 tensors containing the sparse feature values.
 dense_buckets: Quantile summary for each of the dense float tensor.
 sparse_buckets: Quantile summary for each of the sparse feature float tensor.
-dense_quantiles: Rank 1 tensors representing associated quantiles for each of
-dense float tensors.
-sparse_quantiles: Rank 1 tensors representing associated quantiles for each of
-the sparse feature tensors.
+sparse_indices: List of rank 2 tensors with indices for sparse float
+tensors.
+dense_quantiles: Rank 2 tensors representing associated quantiles for each of
+dense float tensors and the dimension.
+sparse_quantiles: Rank 2 tensors representing associated quantiles for each of
+the sparse feature tensors for each of sparse feature dimensions:
+[quantile id, dimension id].
 )doc");
 
 REGISTER_OP("BucketizeWithInputBoundaries")
diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
index 07cfd413bbd389053ff52ca65693445ef28e8ede..0d27ddaf3a1d540efee268c2bcca217077ff5871 100644
--- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc
@@ -47,9 +47,7 @@ REGISTER_OP("BuildDenseInequalitySplits")
       ShapeHandle partition_ids_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &partition_ids_shape));
       ShapeHandle bucket_ids_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bucket_ids_shape));
-      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                  c->Dim(bucket_ids_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &bucket_ids_shape));
       ShapeHandle gradients_shape;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &gradients_shape));
       TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
@@ -71,7 +69,7 @@ Find the split that has the best gain for the accumulated stats.
 num_minibatches: A scalar, the number of times per example gradients & hessians
     were accumulated. The stats are divided by this to get per example stats.
 partition_ids: A rank 1 tensor of partition IDs.
-bucket_ids: A rank 1 tensor of buckets IDs.
+bucket_ids: A rank 2 tensor of buckets IDs and dimensions.
 gradients: A rank 1 tensor of gradients.
 hessians: A rank 1 tensor of hessians.
 bucket_boundaries: A rank 1 tensor, thresholds that were used for bucketization.
@@ -108,9 +106,7 @@ REGISTER_OP("BuildSparseInequalitySplits")
       ShapeHandle partition_ids_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &partition_ids_shape));
       ShapeHandle bucket_ids_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bucket_ids_shape));
-      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                  c->Dim(bucket_ids_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &bucket_ids_shape));
       ShapeHandle gradients_shape;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &gradients_shape));
       TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
@@ -127,12 +123,13 @@ REGISTER_OP("BuildSparseInequalitySplits")
       return Status::OK();
     })
     .Doc(R"doc(
-Find the split that has the best gain for the accumulated stats.
+Find the split that has the best gain for the accumulated stats for a particular
+feature column.
 
 num_minibatches: A scalar, the number of times per example gradients & hessians
     were accumulated. The stats are divided by this to get per example stats.
-partition_ids: A rank 1 tensor of partition IDs.
-bucket_ids: A rank 1 tensor of buckets IDs.
+partition_ids: A rank 2 tensor of partition IDs for each dimension of feature column.
+bucket_ids: A rank 2 tensor of buckets IDs and dimensions.
 gradients: A rank 1 tensor of gradients.
 hessians: A rank 1 tensor of hessians.
 bucket_boundaries: A rank 1 tensor, thresholds that were used for bucketization.
@@ -168,9 +165,7 @@ REGISTER_OP("BuildCategoricalEqualitySplits")
       ShapeHandle partition_ids_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &partition_ids_shape));
       ShapeHandle bucket_ids_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bucket_ids_shape));
-      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                  c->Dim(bucket_ids_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &bucket_ids_shape));
       ShapeHandle gradients_shape;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &gradients_shape));
       TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
@@ -190,7 +185,7 @@ Find the split that has the best gain for the accumulated stats.
 num_minibatches: A scalar, the number of times per example gradients & hessians
     were accumulated. The stats are divided by this to get per example stats.
 partition_ids: A rank 1 tensor of partition IDs.
-feature_ids: A rank 1 tensor of feature IDs.
+feature_ids: A rank 2 tensor of feature IDs and dimensions.
 gradients: A rank 1 tensor of gradients.
 hessians: A rank 1 tensor of hessians.
 output_partition_ids: A rank 1 tensor, the partition IDs that we created splits
diff --git a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
index f988755de021034fc0d33529286dd3b508d746ed..0354f7853cbedf22d0a299273b4dbd225b3121ab 100644
--- a/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/stats_accumulator_ops.cc
@@ -73,9 +73,7 @@ REGISTER_OP("StatsAccumulatorScalarAdd")
                                        1, &partition_ids_shape));
         ShapeHandle feature_ids_shape;
         TF_RETURN_IF_ERROR(c->WithRank(
-            c->input(num_resource_handles * 2 + i + 1), 1, &feature_ids_shape));
-        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                    c->Dim(feature_ids_shape, 0), &unused_dim));
+            c->input(num_resource_handles * 2 + i + 1), 2, &feature_ids_shape));
         ShapeHandle gradients_shape;
         TF_RETURN_IF_ERROR(c->WithRank(
             c->input(num_resource_handles * 3 + i + 1), 1, &gradients_shape));
@@ -96,11 +94,11 @@ stamp_token: Stamp token for Read/Write operations.
              Any operation with a mismatching token will be dropped.
 stats_accumulator_handles: A list of handles to the stats accumulator.
 partition_ids: A list of vectors of partition_ids.
-feature_ids: A list of vectors of feature_ids.
+feature_ids: Rank 2 tensor of feature id and feature dimension ids.
 gradients: A list of vectors of gradients for each slot in
-    <partition_id, feature_id>.
+    <partition_id, feature_id, feature_dimension_id>.
 hessians: A list of vectors of hessians for each slot in
-    <partition_id, feature_id>.
+    <partition_id, feature_id, feature_dimension_id>.
 )doc");
 
 REGISTER_OP("StatsAccumulatorScalarFlush")
@@ -119,7 +117,7 @@ REGISTER_OP("StatsAccumulatorScalarFlush")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
       c->set_output(0, c->Scalar());
       c->set_output(1, c->Vector(c->UnknownDim()));
-      c->set_output(2, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->UnknownShape());
       c->set_output(3, c->Vector(c->UnknownDim()));
       c->set_output(4, c->Vector(c->UnknownDim()));
       return Status::OK();
@@ -134,7 +132,7 @@ next_stamp_token: Stamp token for the next iteration.
 num_updates: Number of times stats were added to this accumulator since last
     flush.
 output_partition_ids A vector of partition_ids for the slots.
-output_feature_ids: A vector of feature_ids for the slots.
+output_feature_ids: Rank 2 tensor of feature id and feature dimension ids.
 output_gradients: A vector of gradients, with a value for each slot
                   in <output_partition_id, output_feature_id>.
 output_hessians: A vector of hessians, with a value for each slot
@@ -161,9 +159,7 @@ REGISTER_OP("StatsAccumulatorScalarDeserialize")
       ShapeHandle partition_ids_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &partition_ids_shape));
       ShapeHandle feature_ids_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &feature_ids_shape));
-      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                  c->Dim(feature_ids_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &feature_ids_shape));
       ShapeHandle gradients_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 1, &gradients_shape));
       TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
@@ -183,9 +179,11 @@ stamp_token: Stamp token for Read/Write operations.
 num_updates: Number of times stats were added to this accumulator since last
     flush.
 partition_ids: A vector of partition_ids.
-feature_ids: A vector of feature_ids.
-gradients: A vector of gradients for each slot in <partition_id, feature_id>.
-hessians: A vector of hessians for each slot in <partition_id, feature_id>.
+feature_ids: Rank 2 tensor of feature id and feature dimension ids.
+gradients: A vector of gradients for each slot in <partition_id, feature_id,
+feature_dimension_id>.
+hessians: A vector of hessians for each slot in <partition_id, feature_id,
+feature_dimension_id>
 )doc");
 
 REGISTER_OP("StatsAccumulatorScalarSerialize")
@@ -204,7 +202,7 @@ REGISTER_OP("StatsAccumulatorScalarSerialize")
       // num_updates
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Vector(c->UnknownDim()));
-      c->set_output(3, c->Vector(c->UnknownDim()));
+      c->set_output(3, c->UnknownShape());
       c->set_output(4, c->Vector(c->UnknownDim()));
       c->set_output(5, c->Vector(c->UnknownDim()));
       return Status::OK();
@@ -217,7 +215,7 @@ stamp_token: The current stamp token for the resource.
 num_updates: Number of times stats were added to this accumulator since last
     flush.
 output_partition_ids A vector of partition_ids for the slots.
-output_feature_ids: A vector of feature_ids for the slots.
+output_feature_ids: Rank 2 tensor of feature id and feature dimension ids.
 output_gradients: A vector of gradients, with a value for each slot
                   in <output_partition_id, output_feature_id>.
 output_hessians: A vector of hessians, with a value for each slot
@@ -293,9 +291,7 @@ REGISTER_OP("StatsAccumulatorTensorAdd")
                                        1, &partition_ids_shape));
         ShapeHandle feature_ids_shape;
         TF_RETURN_IF_ERROR(c->WithRank(
-            c->input(num_resource_handles * 2 + i + 1), 1, &feature_ids_shape));
-        TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                    c->Dim(feature_ids_shape, 0), &unused_dim));
+            c->input(num_resource_handles * 2 + i + 1), 2, &feature_ids_shape));
         ShapeHandle gradients_shape;
         TF_RETURN_IF_ERROR(c->WithRankAtLeast(
             c->input(num_resource_handles * 3 + i + 1), 2, &gradients_shape));
@@ -316,11 +312,11 @@ stats_accumulator_handles: A list of handles to the stats accumulator.
 stamp_token: Stamp token for Read/Write operations.
              Any operation with a mismatching token will be dropped.
 partition_ids: A list of vectors of partition_ids.
-feature_ids: A list of vectors of feature_ids.
+feature_ids: Rank 2 tensor of feature id and feature dimension ids.
 gradients: A list of vectors of gradients for each slot in
-    <partition_id, feature_id>.
+    <partition_id, feature_id, feature_dimension_id>.
 hessians: A list of vectors of hessians for each slot in
-    <partition_id, feature_id>.
+    <partition_id, feature_id, feature_dimension_id>.
 )doc");
 
 REGISTER_OP("StatsAccumulatorTensorFlush")
@@ -340,7 +336,7 @@ REGISTER_OP("StatsAccumulatorTensorFlush")
       // num_updates
       c->set_output(0, c->Scalar());
       c->set_output(1, c->Vector(c->UnknownDim()));
-      c->set_output(2, c->Vector(c->UnknownDim()));
+      c->set_output(2, c->UnknownShape());
       c->set_output(3, c->UnknownShape());
       c->set_output(4, c->UnknownShape());
       return Status::OK();
@@ -355,11 +351,11 @@ next_stamp_token: Stamp token to be used for the next iteration.
 num_updates: Number of times stats were added to this accumulator since last
     flush.
 output_partition_ids: A vector of partition_ids for the slots.
-output_feature_ids: A vector of feature_ids for the slots.
+output_feature_ids: Rank 2 tensor of feature id and feature dimension ids.
 output_gradients: A tensor of gradients, first dimension matches slots
-                  in <partition_id, feature_id>.
+                  in <partition_id, feature_id, feature_dimension_id>.
 output_hessians: A tensor of hessians, first dimension matches slots
-                 in <partition_id, feature_id>.
+                 in <partition_id, feature_id, feature_dimension_id>>.
 )doc");
 
 REGISTER_OP("StatsAccumulatorTensorDeserialize")
@@ -382,9 +378,7 @@ REGISTER_OP("StatsAccumulatorTensorDeserialize")
       ShapeHandle partition_ids_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &partition_ids_shape));
       ShapeHandle feature_ids_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &feature_ids_shape));
-      TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
-                                  c->Dim(feature_ids_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &feature_ids_shape));
       ShapeHandle gradients_shape;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(5), 2, &gradients_shape));
       TF_RETURN_IF_ERROR(c->Merge(c->Dim(partition_ids_shape, 0),
@@ -405,9 +399,11 @@ stamp_token: Stamp token for Read/Write operations.
 num_updates: Number of times stats were added to this accumulator since last
     flush.
 partition_ids: A vector of partition_ids.
-feature_ids: A vector of feature_ids.
-gradients: A vector of gradients for each slot in <partition_id, feature_id>.
-hessians: A vector of hessians for each slot in <partition_id, feature_id>.
+feature_ids: Rank 2 tensor of feature id and feature dimension ids.
+gradients: A vector of gradients for each slot in <partition_id, feature_id,
+feature_dimension_id>
+hessians: A vector of hessians for each slot in <partition_id, feature_id,
+feature_dimension_id>.
 )doc");
 
 REGISTER_OP("StatsAccumulatorTensorSerialize")
@@ -426,7 +422,7 @@ REGISTER_OP("StatsAccumulatorTensorSerialize")
       // num_updates
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Vector(c->UnknownDim()));
-      c->set_output(3, c->Vector(c->UnknownDim()));
+      c->set_output(3, c->UnknownShape());
       c->set_output(4, c->UnknownShape());
       c->set_output(5, c->UnknownShape());
       return Status::OK();
@@ -440,11 +436,11 @@ stamp_token: Stamp token for Read/Write operations.
 num_updates: Number of times stats were added to this accumulator since last
     flush.
 output_partition_ids: A vector of partition_ids for the slots.
-output_feature_ids: A vector of feature_ids for the slots.
+output_feature_ids: Rank 2 tensor of feature id and feature dimension ids.
 output_gradients: A tensor of gradients, first dimension matches slots
-                  in <partition_id, feature_id>.
+                  in <partition_id, feature_id, feature_dimension_id>.
 output_hessians: A tensor of hessians, first dimension matches slots
-                 in <partition_id, feature_id>.
+                 in <partition_id, feature_id, feature_dimension_id>.
 )doc");
 
 REGISTER_OP("StatsAccumulatorTensorMakeSummary")
@@ -458,18 +454,20 @@ REGISTER_OP("StatsAccumulatorTensorMakeSummary")
     .Output("output_hessians: float")
     .Doc(R"doc(
 Summarizes the stats by summing the <gradients, hessians> that are for the same
-<partition_id, feature_id>.
+<partition_id, feature_id, feature_dimension_id>.
 
 partition_ids: A vector of partition_ids.
-feature_ids: A vector of feature_ids.
-gradients: A vector of gradients for each slot in <partition_id, feature_id>.
-hessians: A vector of hessians for each slot in <partition_id, feature_id>.
+feature_ids: Rank 2 tensor of feature id and feature dimension ids.
+gradients: A vector of gradients for each slot in <partition_id, feature_id,
+feature_dimension_id>.
+hessians: A vector of hessians for each slot in <partition_id, feature_id,
+feature_dimension_id>.
 output_partition_ids: A vector of partition_ids for the slots.
-output_feature_ids: A vector of feature_ids for the slots.
+output_feature_ids: A rank2 tensor of feature_ids and dimensions for the slots.
 output_gradients: A tensor of gradients, first dimension matches slots
-                  in <partition_id, feature_id>.
+                  in <partition_id, feature_id, feature_dimension_id>.
 output_hessians: A tensor of hessians, first dimension matches slots
-                 in <partition_id, feature_id>.
+                 in <partition_id, feature_id, feature_dimension_id>.
 )doc");
 }  // namespace boosted_trees
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/proto/tree_config.proto b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
index f14abf45a517ad7c4c6d7bb1ab88b7a1d47d6fb6..fc570c1083d01a65760a456c109dad93afd9f62a 100644
--- a/tensorflow/contrib/boosted_trees/proto/tree_config.proto
+++ b/tensorflow/contrib/boosted_trees/proto/tree_config.proto
@@ -53,9 +53,9 @@ message DenseFloatBinarySplit {
   // Float feature column and split threshold describing
   // the rule feature <= threshold.
   int32 feature_column = 1;
-  // If feature column is multivalent, this holds the index of the feature for
-  // the split. Defaults to 0.
-  int32 feature_id = 5;
+  // If feature column is multivalent, this holds the index of the dimensiong
+  // for the split. Defaults to 0.
+  int32 dimension_id = 5;
   float threshold = 2;
 
   // Node children indexing into a contiguous
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
index cf0958511350f82d548c56849f6179ae0f0215f5..c1acf351603dd80c2d14c7ee0a5b4c89706bc1bf 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/prediction_ops_test.py
@@ -75,7 +75,7 @@ def _append_multi_values_to_dense_leaf(leaf, w):
     leaf.vector.value.append(x)
 
 
-def _set_float_split(split, feat_col, thresh, l_id, r_id):
+def _set_float_split(split, feat_col, thresh, l_id, r_id, feature_dim_id=None):
   """Helper method for building tree float splits.
 
   Sets split feature column, threshold and children.
@@ -86,11 +86,14 @@ def _set_float_split(split, feat_col, thresh, l_id, r_id):
     thresh: threshold to split on forming rule x <= thresh.
     l_id: left child Id.
     r_id: right child Id.
+    feature_dim_id: dimension of the feature column to be used in the split.
   """
   split.feature_column = feat_col
   split.threshold = thresh
   split.left_id = l_id
   split.right_id = r_id
+  if feature_dim_id is not None:
+    split.dimension_id = feature_dim_id
 
 
 def _set_categorical_id_split(split, feat_col, feat_id, l_id, r_id):
@@ -116,12 +119,12 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
   def setUp(self):
     """Sets up the prediction tests.
 
-    Create a batch of two examples having one dense float, two sparse float and
-    one sparse int features.
+    Create a batch of two examples having one dense float, two sparse float
+    single valued, one sparse float multidimensionl and one sparse int features.
     The data looks like the following:
-    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 |
-    | 0        |  7     |    -3    |          |    9,1   |
-    | 1        | -2     |          | 4        |          |
+    | Instance | Dense0 | SparseF0 | SparseF1 | SparseI0 | SparseM
+    | 0        |  7     |    -3    |          |    9,1   | __, 5.0
+    | 1        | -2     |          | 4        |          |  3, ___
     """
     super(PredictionOpsTest, self).setUp()
     self._dense_float_tensor = np.array([[7.0], [-2.0]])
@@ -131,11 +134,37 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
     self._sparse_float_indices2 = np.array([[1, 0]])
     self._sparse_float_values2 = np.array([4.0])
     self._sparse_float_shape2 = np.array([2, 1])
+    # Multi dimensional sparse float
+    self._sparse_float_indices_m = np.array([[0, 1], [1, 0]])
+    self._sparse_float_values_m = np.array([5.0, 3.0])
+    self._sparse_float_shape_m = np.array([2, 2])
+
     self._sparse_int_indices1 = np.array([[0, 0], [0, 1]])
     self._sparse_int_values1 = np.array([9, 1])
     self._sparse_int_shape1 = np.array([2, 2])
     self._seed = 123
 
+  def _get_predictions(self,
+                       tree_ensemble_handle,
+                       learner_config,
+                       apply_dropout=False,
+                       apply_averaging=False,
+                       center_bias=False,
+                       reduce_dim=False):
+    return prediction_ops.gradient_trees_prediction(
+        tree_ensemble_handle,
+        self._seed, [self._dense_float_tensor],
+        [self._sparse_float_indices1, self._sparse_float_indices2],
+        [self._sparse_float_values1, self._sparse_float_values2],
+        [self._sparse_float_shape1, self._sparse_float_shape2],
+        [self._sparse_int_indices1], [self._sparse_int_values1],
+        [self._sparse_int_shape1],
+        learner_config=learner_config,
+        apply_dropout=apply_dropout,
+        apply_averaging=apply_averaging,
+        center_bias=center_bias,
+        reduce_dim=reduce_dim)
+
   def testEmptyEnsemble(self):
     with self.test_session():
       # Empty tree ensenble.
@@ -151,18 +180,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 2
 
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
       self.assertAllEqual([[0], [0]], result.eval())
       # Empty dropout.
@@ -187,18 +207,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 2
 
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
       self.assertAllClose([[-0.4], [-0.4]], result.eval())
 
@@ -226,18 +237,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 3
 
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
       self.assertAllClose([[-0.4, 0.9], [-0.4, 0.9]], result.eval())
 
@@ -279,14 +281,94 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 2
 
+      result, dropout_info = self._get_predictions(
+          tree_ensemble_handle,
+          learner_config=learner_config.SerializeToString(),
+          reduce_dim=True)
+
+      # The first example will get bias -0.4 from first tree and
+      # leaf 4 payload of -0.9 hence -1.3, the second example will
+      # get the same bias -0.4 and leaf 3 payload (sparse feature missing)
+      # of 1.2 hence 0.8.
+      self.assertAllClose([[-1.3], [0.8]], result.eval())
+
+      # Empty dropout.
+      self.assertAllEqual([[], []], dropout_info.eval())
+
+  def testFullEnsembleWithMultidimensionalSparseSingleClass(self):
+    with self.test_session():
+      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      # Bias tree.
+      tree1 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)
+
+      # Depth 3 tree.
+      tree2 = tree_ensemble_config.trees.add()
+      tree_ensemble_config.tree_metadata.add().is_finalized = True
+      # Use feature column 2 (sparse multidimensional), split on first value
+      # node 0.
+      _set_float_split(
+          tree2.nodes.add().sparse_float_binary_split_default_right.split,
+          2,
+          7.0,
+          1,
+          2,
+          feature_dim_id=0)
+      # Leafs split on second dimension of sparse multidimensional feature.
+      # Node 1.
+      _set_float_split(
+          tree2.nodes.add().sparse_float_binary_split_default_left.split,
+          2,
+          4.5,
+          3,
+          4,
+          feature_dim_id=1)
+      # Node 2.
+      _set_float_split(
+          tree2.nodes.add().sparse_float_binary_split_default_right.split,
+          2,
+          9,
+          5,
+          6,
+          feature_dim_id=1)
+
+      # Node 3.
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.6)
+      # Node 4.
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.3)
+
+      # Node 5.
+      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.1)
+      # Node 6.
+      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.8)
+
+      tree_ensemble_config.tree_weights.append(1.0)
+      tree_ensemble_config.tree_weights.append(1.0)
+
+      tree_ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=0,
+          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
+          name="full_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare learner config.
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.num_classes = 2
+
       result, dropout_info = prediction_ops.gradient_trees_prediction(
           tree_ensemble_handle,
           self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
+              self._sparse_float_indices1, self._sparse_float_indices2,
+              self._sparse_float_indices_m
+          ], [
+              self._sparse_float_values1, self._sparse_float_values2,
+              self._sparse_float_values_m
+          ], [
+              self._sparse_float_shape1, self._sparse_float_shape2,
+              self._sparse_float_shape_m
+          ], [self._sparse_int_indices1], [self._sparse_int_values1],
+          [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
           apply_dropout=False,
           apply_averaging=False,
@@ -294,10 +376,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
           reduce_dim=True)
 
       # The first example will get bias -0.4 from first tree and
-      # leaf 4 payload of -0.9 hence -1.3, the second example will
-      # get the same bias -0.4 and leaf 3 payload (sparse feature missing)
-      # of 1.2 hence 0.8.
-      self.assertAllClose([[-1.3], [0.8]], result.eval())
+      # leaf 5 payload of -0.1 hence -0.5, the second example will
+      # get the same bias -0.4 and leaf 3 payload (0.6) hence 0.2
+      self.assertAllClose([[-0.5], [0.2]], result.eval())
 
       # Empty dropout.
       self.assertAllEqual([[], []], dropout_info.eval())
@@ -337,19 +418,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 2
       learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
-
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
 
       # All the examples should get only the bias since the second tree is
@@ -394,19 +465,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 2
       learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
-
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
 
       # The first example will get bias -0.4 from first tree and
@@ -453,19 +514,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       # Prepare learner config.
       learner_config = learner_pb2.LearnerConfig()
       learner_config.num_classes = 2
-
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
 
       # The first example will get bias -0.4 from first tree and
@@ -512,18 +563,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config.multi_class_strategy = (
           learner_pb2.LearnerConfig.TREE_PER_CLASS)
 
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=True)
       # The first example will get bias class 1 -0.2 from first tree and
       # leaf 2 payload (sparse feature missing) of 0.5 hence [0.5, -0.2],
@@ -572,18 +614,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config.multi_class_strategy = (
           learner_pb2.LearnerConfig.FULL_HESSIAN)
 
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=False)
       # The first example will get bias class 1 -0.2 from first tree and
       # leaf 2 payload (sparse feature missing) of 0.5 hence [0.5, -0.2],
@@ -631,18 +664,9 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       learner_config.multi_class_strategy = (
           learner_pb2.LearnerConfig.FULL_HESSIAN)
 
-      result, dropout_info = prediction_ops.gradient_trees_prediction(
+      result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
-          apply_dropout=False,
-          apply_averaging=False,
-          center_bias=False,
           reduce_dim=False)
       # The first example will get bias class 1 -0.2 and -2 for class 2 from
       # first tree and leaf 2 payload (sparse feature missing) of 0.5 hence
@@ -653,26 +677,6 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       # Empty dropout.
       self.assertAllEqual([[], []], dropout_info.eval())
 
-  def _get_predictions(self,
-                       tree_ensemble_handle,
-                       learner_config,
-                       apply_dropout=False,
-                       apply_averaging=False,
-                       center_bias=False):
-    return prediction_ops.gradient_trees_prediction(
-        tree_ensemble_handle,
-        self._seed, [self._dense_float_tensor], [
-            self._sparse_float_indices1, self._sparse_float_indices2
-        ], [self._sparse_float_values1, self._sparse_float_values2],
-        [self._sparse_float_shape1,
-         self._sparse_float_shape2], [self._sparse_int_indices1],
-        [self._sparse_int_values1], [self._sparse_int_shape1],
-        learner_config=learner_config.SerializeToString(),
-        apply_dropout=apply_dropout,
-        apply_averaging=apply_averaging,
-        center_bias=center_bias,
-        reduce_dim=True)
-
   def testDropout(self):
     with self.test_session():
       # Empty tree ensenble.
@@ -699,10 +703,11 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
-          center_bias=False)
+          center_bias=False,
+          reduce_dim=True)
 
       # We expect approx 500 trees were dropped.
       dropout_info = dropout_info.eval()
@@ -719,10 +724,11 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       # Don't apply dropout.
       result_no_dropout, no_dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=False,
           apply_averaging=False,
-          center_bias=False)
+          center_bias=False,
+          reduce_dim=True)
 
       self.assertEqual(result.eval().size, result_no_dropout.eval().size)
       for i in range(result.eval().size):
@@ -760,17 +766,19 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
-          center_bias=False)
+          center_bias=False,
+          reduce_dim=True)
 
       result_center, dropout_info_center = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
-          center_bias=True)
+          center_bias=True,
+          reduce_dim=True)
 
       dropout_info = dropout_info.eval()
       dropout_info_center = dropout_info_center.eval()
@@ -830,17 +838,19 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
-          center_bias=False)
+          center_bias=False,
+          reduce_dim=True)
 
       result_center, dropout_info_center = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
-          center_bias=True)
+          center_bias=True,
+          reduce_dim=True)
 
       dropout_info = dropout_info.eval()
       dropout_info_center = dropout_info_center.eval()
@@ -888,28 +898,16 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
           name="empty")
       resources.initialize_resources(resources.shared_resources()).run()
 
-      _, dropout_info_1 = prediction_ops.gradient_trees_prediction(
+      _, dropout_info_1 = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
           center_bias=False,
           reduce_dim=True)
 
-      _, dropout_info_2 = prediction_ops.gradient_trees_prediction(
+      _, dropout_info_2 = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
@@ -919,12 +917,12 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       # Different seed.
       _, dropout_info_3 = prediction_ops.gradient_trees_prediction(
           tree_ensemble_handle,
-          112314, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
+          112314, [self._dense_float_tensor],
+          [self._sparse_float_indices1, self._sparse_float_indices2],
+          [self._sparse_float_values1, self._sparse_float_values2],
+          [self._sparse_float_shape1, self._sparse_float_shape2],
+          [self._sparse_int_indices1], [self._sparse_int_values1],
+          [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
@@ -932,14 +930,8 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
           reduce_dim=True)
 
       # First seed with centering bias.
-      _, dropout_info_4 = prediction_ops.gradient_trees_prediction(
+      _, dropout_info_4 = self._get_predictions(
           tree_ensemble_handle,
-          self._seed, [self._dense_float_tensor], [
-              self._sparse_float_indices1, self._sparse_float_indices2
-          ], [self._sparse_float_values1, self._sparse_float_values2],
-          [self._sparse_float_shape1,
-           self._sparse_float_shape2], [self._sparse_int_indices1],
-          [self._sparse_int_values1], [self._sparse_int_shape1],
           learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
@@ -983,17 +975,19 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       result, dropout_info = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=True,
           apply_averaging=False,
-          center_bias=False)
+          center_bias=False,
+          reduce_dim=True)
 
       result_no_dropout, _ = self._get_predictions(
           tree_ensemble_handle,
-          learner_config=learner_config,
+          learner_config=learner_config.SerializeToString(),
           apply_dropout=False,
           apply_averaging=False,
-          center_bias=False)
+          center_bias=False,
+          reduce_dim=True)
 
       self.assertAllEqual([[], []], dropout_info.eval())
       self.assertAllClose(result.eval(), result_no_dropout.eval())
@@ -1048,12 +1042,16 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 
       # Do averaging.
       result, dropout_info = self._get_predictions(
-          tree_ensemble_handle, learner_config, apply_averaging=True)
+          tree_ensemble_handle,
+          learner_config.SerializeToString(),
+          apply_averaging=True,
+          reduce_dim=True)
 
-      pattern_result, pattern_dropout_info = (self._get_predictions(
+      pattern_result, pattern_dropout_info = self._get_predictions(
           adjusted_tree_ensemble_handle,
-          learner_config_no_averaging,
-          apply_averaging=False))
+          learner_config_no_averaging.SerializeToString(),
+          apply_averaging=False,
+          reduce_dim=True)
 
       self.assertAllEqual(result.eval(), pattern_result.eval())
       self.assertAllEqual(dropout_info.eval(), pattern_dropout_info.eval())
@@ -1116,15 +1114,22 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       result_1, dropout_info_1 = self._get_predictions(
-          tree_ensemble_handle, learner_config_1, apply_averaging=True)
+          tree_ensemble_handle,
+          learner_config_1.SerializeToString(),
+          apply_averaging=True,
+          reduce_dim=True)
 
       result_2, dropout_info_2 = self._get_predictions(
-          tree_ensemble_handle, learner_config_2, apply_averaging=True)
+          tree_ensemble_handle,
+          learner_config_2.SerializeToString(),
+          apply_averaging=True,
+          reduce_dim=True)
 
       pattern_result, pattern_dropout_info = self._get_predictions(
           adjusted_tree_ensemble_handle,
-          learner_config_no_averaging,
-          apply_averaging=False)
+          learner_config_no_averaging.SerializeToString(),
+          apply_averaging=False,
+          reduce_dim=True)
 
       self.assertAllEqual(result_1.eval(), pattern_result.eval())
       self.assertAllEqual(result_2.eval(), pattern_result.eval())
@@ -1179,12 +1184,16 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       result, dropout_info = self._get_predictions(
-          tree_ensemble_handle, learner_config, apply_averaging=True)
+          tree_ensemble_handle,
+          learner_config.SerializeToString(),
+          apply_averaging=True,
+          reduce_dim=True)
 
-      pattern_result, pattern_dropout_info = (self._get_predictions(
+      pattern_result, pattern_dropout_info = self._get_predictions(
           adjusted_tree_ensemble_handle,
-          learner_config_no_averaging,
-          apply_averaging=False))
+          learner_config_no_averaging.SerializeToString(),
+          apply_averaging=False,
+          reduce_dim=True)
 
       self.assertAllEqual(result.eval(), pattern_result.eval())
       self.assertAllEqual(dropout_info.eval(), pattern_dropout_info.eval())
@@ -1224,10 +1233,6 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
           name="full_ensemble")
       resources.initialize_resources(resources.shared_resources()).run()
 
-      # Prepare learner config.
-      learner_config = learner_pb2.LearnerConfig()
-      learner_config.num_classes = 2
-
       result = prediction_ops.gradient_trees_partition_examples(
           tree_ensemble_handle, [self._dense_float_tensor], [
               self._sparse_float_indices1, self._sparse_float_indices2
@@ -1263,10 +1268,6 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
           name="full_ensemble")
       resources.initialize_resources(resources.shared_resources()).run()
 
-      # Prepare learner config.
-      learner_config = learner_pb2.LearnerConfig()
-      learner_config.num_classes = 2
-
       result = prediction_ops.gradient_trees_partition_examples(
           tree_ensemble_handle, [self._dense_float_tensor], [
               self._sparse_float_indices1, self._sparse_float_indices2
@@ -1302,10 +1303,6 @@ class PartitionExamplesOpsTest(test_util.TensorFlowTestCase):
           name="full_ensemble")
       resources.initialize_resources(resources.shared_resources()).run()
 
-      # Prepare learner config.
-      learner_config = learner_pb2.LearnerConfig()
-      learner_config.num_classes = 2
-
       result = prediction_ops.gradient_trees_partition_examples(
           tree_ensemble_handle, [self._dense_float_tensor], [
               self._sparse_float_indices1, self._sparse_float_indices2
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 1513c11c33d538dedabe10e4411bdd1373b16c7f..888d5c57ed33446c8b6f18d2d1e393647613d132 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -48,15 +48,16 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
   def testBasicQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
-    Create a batch of 6 examples having a dense and sparse features.
+    Create a batch of 6 examples having a dense and sparse features. SparseM is
+    a sparse multi-dimensional (multivalent) feature.
     The data looks like this
-    | Instance | instance weights | Dense 0  | Sparse 0
-    | 0        |     10           |   1      |
-    | 1        |     1            |   2      |    2
-    | 2        |     1            |   3      |    3
-    | 3        |     1            |   4      |    4
-    | 4        |     1            |   4      |    5
-    | 5        |     1            |   5      |    6
+    | Instance | instance weights | Dense 0  | Sparse 0 | SparseM
+    | 0        |     10           |   1      |          |   |   |
+    | 1        |     1            |   2      |    2     | 2 |   |
+    | 2        |     1            |   3      |    3     | 3 |   |
+    | 3        |     1            |   4      |    4     |   | 4 |
+    | 4        |     1            |   4      |    5     |   | 5 |
+    | 5        |     1            |   5      |    6     |   | 6 |
     """
 
     dense_float_tensor_0 = constant_op.constant(
@@ -66,20 +67,29 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
     sparse_values_0 = constant_op.constant(
         [2, 3, 4, 5, 6], dtype=dtypes.float32)
     sparse_shape_0 = constant_op.constant([6, 1], dtype=dtypes.int64)
+    # Multi-dimensional feature that should have the same quantiles as Sparse 0.
+    sparse_indices_m = constant_op.constant(
+        [[1, 1], [2, 0], [3, 1], [4, 1], [5, 1]], dtype=dtypes.int64)
+    sparse_values_m = constant_op.constant(
+        [2, 3, 4, 5, 6], dtype=dtypes.float32)
+    sparse_shape_m = constant_op.constant([6, 2], dtype=dtypes.int64)
+
     example_weights = constant_op.constant(
         [10, 1, 1, 1, 1, 1], dtype=dtypes.float32)
 
     with self.test_session():
       config = self._gen_config(0.33, 3)
       dense_buckets, sparse_buckets = quantile_ops.quantile_buckets(
-          [dense_float_tensor_0], [sparse_indices_0], [sparse_values_0],
-          [sparse_shape_0],
+          [dense_float_tensor_0], [sparse_indices_0, sparse_indices_m],
+          [sparse_values_0, sparse_values_m], [sparse_shape_0, sparse_shape_m],
           example_weights=example_weights,
           dense_config=[config],
-          sparse_config=[config])
+          sparse_config=[config, config])
 
       self.assertAllEqual([1, 3, 5], dense_buckets[0].eval())
       self.assertAllEqual([2, 4, 6.], sparse_buckets[0].eval())
+      # Multidimensional sparse.
+      self.assertAllEqual([2, 4, 6.], sparse_buckets[1].eval())
 
   def testStreamingQuantileBucketsWithVaryingBatch(self):
     """Sets up the quantile summary op test as follows.
@@ -214,10 +224,10 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
 
       sparse_indices_0 = constant_op.constant(
-          [[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]], dtype=dtypes.int64)
+          [[1, 0], [2, 1], [3, 0], [4, 2], [5, 0]], dtype=dtypes.int64)
       sparse_values_0 = constant_op.constant(
           [2.0, 3.0, 4.0, 5.0, 6.0], dtype=dtypes.float32)
-      sparse_shape_0 = constant_op.constant([6, 1], dtype=dtypes.int64)
+      sparse_shape_0 = constant_op.constant([6, 3], dtype=dtypes.int64)
       example_weights = constant_op.constant(
           [10, 1, 1, 1, 1, 1], dtype=dtypes.float32, shape=[6, 1])
       update = accumulator.add_summary(
@@ -349,19 +359,21 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
   def setUp(self):
     """Sets up the quantile op tests.
 
-    Create a batch of 4 examples having 2 dense and 3 sparse features.
+    Create a batch of 4 examples having 2 dense and 4 sparse features.
+    Forth sparse feature is multivalent (3 dimensional)
     The data looks like this
-    | Instance | Dense 0 | Dense 1 | Sparse 0 | Sparse 1 | Sparse 2
-    | 0        |   -0.1  |  -1     |   -2     |   0.1    |
-    | 1        |    0.4  |  -15    |   5.5    |          |   2
-    | 2        |    3.2  |  18     |   16     |   3      |
-    | 3        |    190  |  1000   |   17.5   |  -3      |   4
+    | Instance | Dense 0 | Dense 1 | Sparse 0 | Sparse 1 |Sparse 2| SparseM
+    | 0        |   -0.1  |  -1     |   -2     |   0.1    |        |_ ,1,_
+    | 1        |    0.4  |  -15    |   5.5    |          |  2     |2 ,_,_
+    | 2        |    3.2  |  18     |   16     |   3      |        |__,_,_
+    | 3        |    190  |  1000   |   17.5   |  -3      |  4     |1 ,8,1
     Quantiles are:
     Dense 0: (-inf,0.4], (0.4,5], (5, 190]
     Dense 1: (-inf, -9], (-9,15], (15, 1000)
     Sparse 0: (-inf, 5], (5,16], (16, 100]
     Sparse 1: (-inf, 2], (2, 5]
     Sparse 2: (-inf, 100]
+    SparseM: (-inf, 1], (1,2], (2,1000]
     """
     super(QuantilesOpTest, self).setUp()
     self._dense_float_tensor_0 = constant_op.constant(
@@ -369,18 +381,26 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
     self._dense_float_tensor_1 = constant_op.constant(
         [[-1], [-15], [18], [1000]], dtype=dtypes.float32)
     # Sparse feature 0
-    self._sparse_indices_0 = constant_op.constant([[0, 0], [1, 0], [2, 0],
-                                                   [3, 0]])
+    self._sparse_indices_0 = constant_op.constant(
+        [[0, 0], [1, 0], [2, 0], [3, 0]], dtype=dtypes.int64)
     self._sparse_values_0 = constant_op.constant([-2, 5.5, 16, 17.5])
     self._sparse_shape_0 = constant_op.constant([4, 1])
     # Sprase feature 1
-    self._sparse_indices_1 = constant_op.constant([[0, 0], [2, 0], [3, 0]])
+    self._sparse_indices_1 = constant_op.constant(
+        [[0, 0], [2, 0], [3, 0]], dtype=dtypes.int64)
     self._sparse_values_1 = constant_op.constant([0.1, 3, -3])
     self._sparse_shape_1 = constant_op.constant([4, 1])
     # Sprase feature 2
-    self._sparse_indices_2 = constant_op.constant([[1, 0], [3, 0]])
+    self._sparse_indices_2 = constant_op.constant(
+        [[1, 0], [3, 0]], dtype=dtypes.int64)
     self._sparse_values_2 = constant_op.constant([2, 4], dtype=dtypes.float32)
     self._sparse_shape_2 = constant_op.constant([4, 1])
+    # Sprase feature M
+    self._sparse_indices_m = constant_op.constant(
+        [[0, 1], [1, 0], [3, 0], [3, 1], [3, 2]], dtype=dtypes.int64)
+    self._sparse_values_m = constant_op.constant(
+        [1, 2, 1, 8, 1], dtype=dtypes.float32)
+    self._sparse_shape_m = constant_op.constant([4, 1])
     # Quantiles
     self._dense_thresholds_0 = [0.4, 5, 190]
     self._dense_thresholds_1 = [-9, 15, 1000]
@@ -388,52 +408,76 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
     self._sparse_thresholds_0 = [5, 16, 100]
     self._sparse_thresholds_1 = [2, 5]
     self._sparse_thresholds_2 = [100]
+    self._sparse_thresholds_m = [1, 2, 1000]
 
   def testDenseFeaturesOnly(self):
     with self.test_session():
       dense_quantiles, _ = quantile_ops.quantiles(
           [self._dense_float_tensor_0, self._dense_float_tensor_1], [],
-          [self._dense_thresholds_0, self._dense_thresholds_1], [])
+          [self._dense_thresholds_0, self._dense_thresholds_1], [], [])
 
       # Dense feature 0
-      self.assertAllEqual([0, 0, 1, 2], dense_quantiles[0].eval())
+      self.assertAllEqual([[0, 0], [0, 0], [1, 0], [2, 0]],
+                          dense_quantiles[0].eval())
       # Dense feature 1
-      self.assertAllEqual([1, 0, 2, 2], dense_quantiles[1].eval())
+      self.assertAllEqual([[1, 0], [0, 0], [2, 0], [2, 0]],
+                          dense_quantiles[1].eval())
 
   def testSparseFeaturesOnly(self):
     with self.test_session():
-      _, sparse_quantiles = quantile_ops.quantiles(
-          [],
-          [self._sparse_values_0, self._sparse_values_1, self._sparse_values_2],
-          [], [self._sparse_thresholds_0, self._sparse_thresholds_1,
-               self._sparse_thresholds_2])
-
+      _, sparse_quantiles = quantile_ops.quantiles([], [
+          self._sparse_values_0, self._sparse_values_1, self._sparse_values_2,
+          self._sparse_values_m
+      ], [], [
+          self._sparse_thresholds_0, self._sparse_thresholds_1,
+          self._sparse_thresholds_2, self._sparse_thresholds_m
+      ], [
+          self._sparse_indices_0, self._sparse_indices_1,
+          self._sparse_indices_2, self._sparse_indices_m
+      ])
+
+      self.assertAllEqual(4, len(sparse_quantiles))
       # Sparse feature 0
-      self.assertAllEqual([0, 1, 1, 2], sparse_quantiles[0].eval())
+      self.assertAllEqual([[0, 0], [1, 0], [1, 0], [2, 0]],
+                          sparse_quantiles[0].eval())
       # Sparse feature 1
-      self.assertAllEqual([0, 1, 0], sparse_quantiles[1].eval())
+      self.assertAllEqual([[0, 0], [1, 0], [0, 0]], sparse_quantiles[1].eval())
       # Sparse feature 2
-      self.assertAllEqual([0, 0], sparse_quantiles[2].eval())
+      self.assertAllEqual([[0, 0], [0, 0]], sparse_quantiles[2].eval())
+      # Multidimensional feature.
+      self.assertAllEqual([[0, 1], [1, 0], [0, 0], [2, 1], [0, 2]],
+                          sparse_quantiles[3].eval())
 
   def testDenseAndSparseFeatures(self):
     with self.test_session():
       dense_quantiles, sparse_quantiles = quantile_ops.quantiles(
-          [self._dense_float_tensor_0, self._dense_float_tensor_1],
-          [self._sparse_values_0, self._sparse_values_1, self._sparse_values_2],
-          [self._dense_thresholds_0, self._dense_thresholds_1],
-          [self._sparse_thresholds_0, self._sparse_thresholds_1,
-           self._sparse_thresholds_2])
+          [self._dense_float_tensor_0, self._dense_float_tensor_1], [
+              self._sparse_values_0, self._sparse_values_1,
+              self._sparse_values_2, self._sparse_values_m
+          ], [self._dense_thresholds_0, self._dense_thresholds_1], [
+              self._sparse_thresholds_0, self._sparse_thresholds_1,
+              self._sparse_thresholds_2, self._sparse_thresholds_m
+          ], [
+              self._sparse_indices_0, self._sparse_indices_1,
+              self._sparse_indices_2, self._sparse_indices_m
+          ])
 
       # Dense feature 0
-      self.assertAllEqual([0, 0, 1, 2], dense_quantiles[0].eval())
+      self.assertAllEqual([[0, 0], [0, 0], [1, 0], [2, 0]],
+                          dense_quantiles[0].eval())
       # Dense feature 1
-      self.assertAllEqual([1, 0, 2, 2], dense_quantiles[1].eval())
+      self.assertAllEqual([[1, 0], [0, 0], [2, 0], [2, 0]],
+                          dense_quantiles[1].eval())
       # Sparse feature 0
-      self.assertAllEqual([0, 1, 1, 2], sparse_quantiles[0].eval())
+      self.assertAllEqual([[0, 0], [1, 0], [1, 0], [2, 0]],
+                          sparse_quantiles[0].eval())
       # Sparse feature 1
-      self.assertAllEqual([0, 1, 0], sparse_quantiles[1].eval())
+      self.assertAllEqual([[0, 0], [1, 0], [0, 0]], sparse_quantiles[1].eval())
       # Sparse feature 2
-      self.assertAllEqual([0, 0], sparse_quantiles[2].eval())
+      self.assertAllEqual([[0, 0], [0, 0]], sparse_quantiles[2].eval())
+      # Multidimensional feature.
+      self.assertAllEqual([[0, 1], [1, 0], [0, 0], [2, 1], [0, 2]],
+                          sparse_quantiles[3].eval())
 
   def testBucketizeWithInputBoundaries(self):
     with self.test_session():
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
index edf088b5fa28d3e465d4e3d8ea7cf6745d48a91f..28834ef55bf8e1f32cc8f2380a4be3bf3824d8e1 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py
@@ -38,7 +38,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
       # (-0.3, 0.19) | 0         | 1              |
       # (4.0, 0.13)  | 1         | 1              |
       partition_ids = array_ops.constant([0, 0, 1], dtype=dtypes.int32)
-      bucket_ids = array_ops.constant([0, 1, 1], dtype=dtypes.int64)
+      bucket_ids = array_ops.constant(
+          [[0, 0], [1, 0], [1, 0]], dtype=dtypes.int64)
       gradients = array_ops.constant([2.4, -0.6, 8.0])
       hessians = array_ops.constant([0.4, 0.38, 0.26])
       bucket_boundaries = [0.3, 0.52]
@@ -109,7 +110,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     """Tests split handler op."""
     with self.test_session() as sess:
       partition_ids = array_ops.constant([0, 0, 1], dtype=dtypes.int32)
-      bucket_ids = array_ops.constant([0, 1, 1], dtype=dtypes.int64)
+      bucket_ids = array_ops.constant(
+          [[0, 0], [1, 0], [1, 0]], dtype=dtypes.int64)
       gradients = array_ops.constant([[2.4, 3.0], [-0.6, 0.1], [8.0, 1.0]])
       hessians = array_ops.constant([[[0.4, 1], [1, 1]], [[0.38, 1], [1, 1]],
                                      [[0.26, 1], [1, 1]]])
@@ -149,7 +151,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     """Tests empty inputs op."""
     with self.test_session() as sess:
       partition_ids = array_ops.constant([], dtype=dtypes.int32)
-      bucket_ids = array_ops.constant([], dtype=dtypes.int64)
+      bucket_ids = array_ops.constant([[]], dtype=dtypes.int64)
       gradients = array_ops.constant([])
       hessians = array_ops.constant([])
       bucket_boundaries = [0.3, 0.52]
@@ -185,7 +187,11 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
       # (4.0, 0.13)  | 1         | -1              |
       # (4.0, 0.13)  | 1         | 1               |
       partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
+      # We have only 1 dimension in our sparse feature column.
       bucket_ids = array_ops.constant([-1, 0, 1, -1, 1], dtype=dtypes.int64)
+      dimension_ids = array_ops.constant([0, 0, 0, 0, 0], dtype=dtypes.int64)
+      bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1)
+
       gradients = array_ops.constant([1.8, 2.4, 0.4, 8.0, 8.0])
       hessians = array_ops.constant([0.78, 0.4, 0.24, 0.26, 0.26])
       bucket_boundaries = array_ops.constant([0.3, 0.52])
@@ -207,6 +213,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
               multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
       partitions, gains, splits = (sess.run([partitions, gains, splits]))
     self.assertAllEqual([0, 1], partitions)
+    self.assertEqual(2, len(splits))
     # Check the split on partition 0.
     # -(0.2 + 1.2) / (0.12 + 0.2 + 2)
     expected_left_weight = -0.603448275862069
@@ -232,6 +239,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     self.assertAllClose([expected_right_weight], right_child.value)
 
     self.assertEqual(0, split_node.split.feature_column)
+    # Sparse is one dimensional.
+    self.assertEqual(0, split_node.split.dimension_id)
 
     self.assertAllClose(0.52, split_node.split.threshold)
 
@@ -253,14 +262,149 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
     self.assertAllClose([expected_right_weight], right_child.value)
 
     self.assertEqual(0, split_node.split.feature_column)
+    # Sparse is one dimensional.
+    self.assertEqual(0, split_node.split.dimension_id)
 
     self.assertAllClose(0.52, split_node.split.threshold)
 
+  def testMakeSparseSplitAllEmptyDimensions(self):
+    """Tests split handler op when all dimensions have only bias bucket id."""
+    with self.test_session() as sess:
+      # The data looks like the following after dividing by number of steps (2).
+      # Gradients    | Partition | Dimension | bucket ID       |
+      # (0.9, 0.39)  | 0         |    0      |  -1             |
+      # (4.0, 0.13)  | 1         |    0      |  -1             |
+      partition_ids = array_ops.constant([0, 1], dtype=dtypes.int32)
+      # We have only 1 dimension in our sparse feature column.
+      bucket_ids = array_ops.constant([[-1, 0], [-1, 0]], dtype=dtypes.int64)
+      gradients = array_ops.constant([1.8, 8.0])
+      hessians = array_ops.constant([0.78, 0.26])
+      bucket_boundaries = array_ops.constant([0.3, 0.52])
+      partitions, gains, splits = (
+          split_handler_ops.build_sparse_inequality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              bucket_ids=bucket_ids,
+              gradients=gradients,
+              hessians=hessians,
+              bucket_boundaries=bucket_boundaries,
+              l1_regularization=0,
+              l2_regularization=2,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              feature_column_group_id=0,
+              bias_feature_id=-1,
+              class_id=-1,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = (sess.run([partitions, gains, splits]))
+    self.assertEqual(0, len(partitions))
+    self.assertEqual(0, len(splits))
+
+  def testMakeSparseMultidimensionalSplit(self):
+    """Tests split handler op."""
+    with self.test_session() as sess:
+      # Num of steps is 2.
+      # The feature column is three dimensional.
+      # First dimension has bias bucket only, the second has bias bucket and
+      # two valid buckets, the third has just one bias bucket and one valid
+      # bucket.
+      # Gradients    | Partition | Dimension | bucket ID       |
+      # (0.9, 0.39)  |    0      |     0     |     -1          |
+      # (1.2, 0.2)   |    0      |     1     |      0          |
+      # (0.2, 0.12)  |    0      |     1     |      2          |
+      # (0.1, 0.1)   |    0      |     2     |      3          |
+      # Now second node - nothing interesting there, just one dimension.
+      # Second node has the same bucket ids for all dimensions.
+      # (4.0, 0.13)  |    1      |     0     |     -1          |
+      # (4.0, 0.13)  |    1      |     2     |      3          |
+
+      # Tree node ids.
+      partition_ids = array_ops.constant([0, 0, 0, 0, 1, 1], dtype=dtypes.int32)
+
+      dimension_ids = array_ops.constant([0, 1, 1, 2, 0, 2], dtype=dtypes.int64)
+      bucket_ids = array_ops.constant([-1, 0, 2, 3, -1, 3], dtype=dtypes.int64)
+      bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1)
+
+      gradients = array_ops.constant([1.8, 2.4, 0.4, 0.2, 8.0, 8.0])
+      hessians = array_ops.constant([0.78, 0.4, 0.24, 0.2, 0.26, 0.26])
+      bucket_boundaries = array_ops.constant([0.3, 0.52, 0.58, 0.6])
+      partitions, gains, splits = (
+          split_handler_ops.build_sparse_inequality_splits(
+              num_minibatches=2,
+              partition_ids=partition_ids,
+              bucket_ids=bucket_ids,
+              gradients=gradients,
+              hessians=hessians,
+              bucket_boundaries=bucket_boundaries,
+              l1_regularization=0,
+              l2_regularization=2,
+              tree_complexity_regularization=0,
+              min_node_weight=0,
+              feature_column_group_id=0,
+              bias_feature_id=-1,
+              class_id=-1,
+              multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS))
+      partitions, gains, splits = (sess.run([partitions, gains, splits]))
+    self.assertAllEqual([0, 1], partitions)
+    self.assertEqual(2, len(splits))
+    # Check the split on node 0 - it should split on second dimension
+    # -(0.2 + 1.2) / (0.12 + 0.2 + 2)
+    expected_left_weight = -0.603448275862069
+    # (0.2 + 1.2) ** 2 / (0.12 + 0.2 + 2)
+    expected_left_gain = 0.8448275862068965
+    # 0.5 / (0.07 + 2)
+    expected_right_weight = 0.24154589371980678
+    # 0.5 ** 2 / (0.07 + 2)
+    expected_right_gain = 0.12077294685990339
+    # (0.2 + 1.2 - 0.5) ** 2 /  (0.12 + 0.2 + 0.07 + 2)
+    expected_bias_gain = 0.3389121338912133
+
+    split_info = split_info_pb2.SplitInfo()
+    split_info.ParseFromString(splits[0])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_right
+    self.assertAllClose(
+        expected_left_gain + expected_right_gain - expected_bias_gain, gains[0])
+
+    self.assertAllClose([expected_left_weight], left_child.value)
+
+    self.assertAllClose([expected_right_weight], right_child.value)
+
+    self.assertEqual(0, split_node.split.feature_column)
+    # Split happened on second dimension.
+    self.assertEqual(1, split_node.split.dimension_id)
+
+    self.assertAllClose(0.58, split_node.split.threshold)
+
+    # Check the split on partition 1.
+    expected_left_weight = -1.8779342723004695
+    expected_right_weight = 0
+
+    # Verify candidate for partition 1, there's only one active bucket here
+    # so zero gain is expected.
+    split_info.ParseFromString(splits[1])
+    left_child = split_info.left_child.vector
+    right_child = split_info.right_child.vector
+    split_node = split_info.split_node.sparse_float_binary_split_default_left
+
+    self.assertAllClose(0.0, gains[1])
+
+    self.assertAllClose([expected_left_weight], left_child.value)
+
+    self.assertAllClose([expected_right_weight], right_child.value)
+
+    self.assertEqual(0, split_node.split.feature_column)
+    self.assertEqual(2, split_node.split.dimension_id)
+
+    self.assertAllClose(0.6, split_node.split.threshold)
+
   def testMakeMulticlassSparseSplit(self):
     """Tests split handler op."""
     with self.test_session() as sess:
       partition_ids = array_ops.constant([0, 0, 0, 1, 1], dtype=dtypes.int32)
-    bucket_ids = array_ops.constant([-1, 0, 1, -1, 1], dtype=dtypes.int64)
+    bucket_ids = array_ops.constant(
+        [[-1, 0], [0, 0], [1, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
     gradients = array_ops.constant([[1.8, 3.5], [2.4, 1.0], [0.4, 4.0],
                                     [8.0, 3.1], [8.0, 0.8]])
 
@@ -317,7 +461,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
       gradients = [1.8, 0.4, 2.8, 8.0, 8.0]
       hessians = [0.78, 0.24, 0.64, 0.26, 0.26]
       partition_ids = [0, 0, 0, 1, 1]
-      feature_ids = array_ops.constant([-1, 1, 2, -1, 1], dtype=dtypes.int64)
+      feature_ids = array_ops.constant(
+          [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
       partitions, gains, splits = (
           split_handler_ops.build_categorical_equality_splits(
               num_minibatches=2,
@@ -412,7 +557,8 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
       hessians = array_ops.constant(
           [hessian_0, hessian_1, hessian_2, hessian_3, hessian_4])
       partition_ids = [0, 0, 0, 1, 1]
-      feature_ids = array_ops.constant([-1, 1, 2, -1, 1], dtype=dtypes.int64)
+      feature_ids = array_ops.constant(
+          [[-1, 0], [1, 0], [2, 0], [-1, 0], [1, 0]], dtype=dtypes.int64)
       partitions, gains, splits = (
           split_handler_ops.build_categorical_equality_splits(
               num_minibatches=2,
@@ -449,7 +595,7 @@ class SplitHandlerOpsTest(test_util.TensorFlowTestCase):
       gradients = []
       hessians = []
       partition_ids = []
-      feature_ids = []
+      feature_ids = [[]]
       partitions, gains, splits = (
           split_handler_ops.build_categorical_equality_splits(
               num_minibatches=0,
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
index 0022d4ad52b0699e6706ad04435f09d0d1cd57c3..978bf530cd99ec6af74a49cb96ff98023d7a15cb 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py
@@ -38,22 +38,52 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 0]],
             gradients=[0.1, 0.3],
             hessians=[0.2, 0.4])
-        op2 = accumulator.add(0, [1], [2], [0.1], [0.2])
+        op2 = accumulator.add(0, [1], [[2, 0]], [0.1], [0.2])
 
       with ops.control_dependencies([op1, op2]):
-        num_updates, partition, feature, grads, hessians = accumulator.flush(
+        num_updates, partition, bucket_ids, grads, hessians = accumulator.flush(
             stamp_token=0, next_stamp_token=1)
-        num_updates, partition, feature, grads, hessians = sess.run(
-            [num_updates, partition, feature, grads, hessians])
+        num_updates, partition, bucket_ids, grads, hessians = sess.run(
+            [num_updates, partition, bucket_ids, grads, hessians])
 
-      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      result = _AccumulatorResultToDict(partition, bucket_ids, grads, hessians)
       self.assertEqual(num_updates, 2)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(1, 2)], [0.2, 0.4])
-      self.assertAllClose(result[(2, 3)], [0.3, 0.4])
+      # Key is partion, bucket, dimension
+      self.assertAllClose(result[(1, 2, 0)], [0.2, 0.4])
+      self.assertAllClose(result[(2, 3, 0)], [0.3, 0.4])
+
+  def testMultidimensionalAcculumator(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.scalar(),
+          hessian_shape=tensor_shape.scalar())
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2, 1],
+            feature_ids=[[2, 2], [3, 0], [2, 2]],
+            gradients=[0.1, 0.3, 0.8],
+            hessians=[0.2, 0.4, -9])
+        op2 = accumulator.add(0, [2, 1], [[3, 1], [2, 2]], [0.1, 1], [0.2, -1])
+
+      with ops.control_dependencies([op1, op2]):
+        num_updates, partition, bucket_ids, grads, hessians = accumulator.flush(
+            stamp_token=0, next_stamp_token=1)
+        num_updates, partition, bucket_ids, grads, hessians = sess.run(
+            [num_updates, partition, bucket_ids, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, bucket_ids, grads, hessians)
+      self.assertEqual(num_updates, 2)
+      self.assertEqual(len(result), 3)
+      # Key is partion, bucket, dimension.
+      self.assertAllClose(result[(1, 2, 2)], [1.9, -9.8])
+      self.assertAllClose(result[(2, 3, 0)], [0.3, 0.4])
+      self.assertAllClose(result[(2, 3, 1)], [0.1, 0.2])
 
   def testDropStaleUpdate(self):
     with self.test_session() as sess:
@@ -65,13 +95,13 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 0]],
             gradients=[0.1, 0.3],
             hessians=[0.2, 0.4])
         op2 = accumulator.add(
             stamp_token=-1,
             partition_ids=[1],
-            feature_ids=[2],
+            feature_ids=[[2, 0]],
             gradients=[0.1],
             hessians=[0.2])
 
@@ -84,8 +114,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       result = _AccumulatorResultToDict(partition, feature, grads, hessians)
       self.assertEqual(num_updates, 1)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(1, 2)], [0.1, 0.2])
-      self.assertAllClose(result[(2, 3)], [0.3, 0.4])
+      self.assertAllClose(result[(1, 2, 0)], [0.1, 0.2])
+      self.assertAllClose(result[(2, 3, 0)], [0.3, 0.4])
 
   def testSerialize(self):
     with self.test_session() as sess:
@@ -97,7 +127,7 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 0]],
             gradients=[0.1, 0.3],
             hessians=[0.2, 0.4])
 
@@ -123,8 +153,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
       self.assertEqual(num_updates, 1)
       self.assertEqual(num_updates_2, 1)
       self.assertEqual(len(result_1), 2)
-      self.assertAllClose(result_1[(1, 2)], [0.1, 0.2])
-      self.assertAllClose(result_1[(2, 3)], [0.3, 0.4])
+      self.assertAllClose(result_1[(1, 2, 0)], [0.1, 0.2])
+      self.assertAllClose(result_1[(2, 3, 0)], [0.3, 0.4])
       self.assertAllEqual(result_1, result_2)
       self.assertEqual(0, stamp_token)
 
@@ -139,18 +169,19 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 1]],
             gradients=[0.1, 0.3],
             hessians=[0.2, 0.4])
 
       with ops.control_dependencies([op1]):
-        deserialize = (accumulator.deserialize(
-            stamp_token=2,
-            num_updates=3,
-            partition_ids=[3, 4],
-            feature_ids=[5, 6],
-            gradients=[0.4, 0.5],
-            hessians=[0.6, 0.7]))
+        deserialize = (
+            accumulator.deserialize(
+                stamp_token=2,
+                num_updates=3,
+                partition_ids=[3, 4],
+                feature_ids=[[5, 0], [6, 2]],
+                gradients=[0.4, 0.5],
+                hessians=[0.6, 0.7]))
       with ops.control_dependencies([deserialize]):
         num_updates, partition, feature, grads, hessians = accumulator.flush(
             stamp_token=2, next_stamp_token=3)
@@ -161,8 +192,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
                                         hessians)
       self.assertEqual(num_updates, 3)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(3, 5)], [0.4, 0.6])
-      self.assertAllClose(result[(4, 6)], [0.5, 0.7])
+      self.assertAllClose(result[(3, 5, 0)], [0.4, 0.6])
+      self.assertAllClose(result[(4, 6, 2)], [0.5, 0.7])
 
   def testMakeSummary(self):
     with self.test_session() as sess:
@@ -172,15 +203,15 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase):
           hessian_shape=tensor_shape.scalar())
       partition, feature, grads, hessians = accumulator._make_summary(
           partition_ids=[1, 2, 1],
-          feature_ids=[2, 3, 2],
+          feature_ids=[[2, 0], [3, 1], [2, 0]],
           gradients=[0.1, 0.3, 0.1],
           hessians=[0.2, 0.4, 0.2])
       partition, feature, grads, hessians = sess.run(
           [partition, feature, grads, hessians])
       result = _AccumulatorResultToDict(partition, feature, grads, hessians)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(1, 2)], [0.2, 0.4])
-      self.assertAllClose(result[(2, 3)], [0.3, 0.4])
+      self.assertAllClose(result[(1, 2, 0)], [0.2, 0.4])
+      self.assertAllClose(result[(2, 3, 1)], [0.3, 0.4])
 
 
 class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
@@ -196,16 +227,54 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 0]],
+            # Two values for gradients,
+            gradients=[[0.1, 0.1], [0.2, 0.2]],
+            # A 2x2 matrix for each hessian.
+            hessians=[[[0.01, 0.02], [0.03, 0.04]], [[0.05, 0.06], [0.07,
+                                                                    0.08]]])
+        op2 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1],
+            feature_ids=[[2, 0]],
+            gradients=[[0.10, 0.11]],
+            hessians=[[[0.011, 0.022], [0.033, 0.044]]])
+
+      with ops.control_dependencies([op1, op2]):
+        num_updates, partition, feature, grads, hessians = accumulator.flush(
+            stamp_token=0, next_stamp_token=1)
+        num_updates, partition, feature, grads, hessians = sess.run(
+            [num_updates, partition, feature, grads, hessians])
+
+      result = _AccumulatorResultToDict(partition, feature, grads, hessians)
+      self.assertEqual(num_updates, 2)
+      self.assertEqual(len(result), 2)
+      self.assertAllClose(result[(1, 2, 0)][0], [0.20, 0.21])
+      self.assertAllClose(result[(1, 2, 0)][1],
+                          [[0.021, 0.042], [0.063, 0.084]])
+      self.assertAllClose(result[(2, 3, 0)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3, 0)][1], [[0.05, 0.06], [0.07, 0.08]])
+
+  def testMultidimensionalAcculumator(self):
+    with self.test_session() as sess:
+      accumulator = stats_accumulator_ops.StatsAccumulator(
+          stamp_token=0,
+          gradient_shape=tensor_shape.TensorShape([2]),
+          hessian_shape=tensor_shape.TensorShape([2, 2]))
+      with ops.control_dependencies([accumulator._create_op]):
+        op1 = accumulator.add(
+            stamp_token=0,
+            partition_ids=[1, 2],
+            feature_ids=[[2, 4], [3, 1]],
             # Two values for gradients,
             gradients=[[0.1, 0.1], [0.2, 0.2]],
             # A 2x2 matrix for each hessian.
-            hessians=[[[0.01, 0.02], [0.03, 0.04]],
-                      [[0.05, 0.06], [0.07, 0.08]]])
+            hessians=[[[0.01, 0.02], [0.03, 0.04]], [[0.05, 0.06], [0.07,
+                                                                    0.08]]])
         op2 = accumulator.add(
             stamp_token=0,
             partition_ids=[1],
-            feature_ids=[2],
+            feature_ids=[[2, 4]],
             gradients=[[0.10, 0.11]],
             hessians=[[[0.011, 0.022], [0.033, 0.044]]])
 
@@ -218,10 +287,11 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       result = _AccumulatorResultToDict(partition, feature, grads, hessians)
       self.assertEqual(num_updates, 2)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(1, 2)][0], [0.20, 0.21])
-      self.assertAllClose(result[(1, 2)][1], [[0.021, 0.042], [0.063, 0.084]])
-      self.assertAllClose(result[(2, 3)][0], [0.2, 0.2])
-      self.assertAllClose(result[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+      self.assertAllClose(result[(1, 2, 4)][0], [0.20, 0.21])
+      self.assertAllClose(result[(1, 2, 4)][1],
+                          [[0.021, 0.042], [0.063, 0.084]])
+      self.assertAllClose(result[(2, 3, 1)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3, 1)][1], [[0.05, 0.06], [0.07, 0.08]])
 
   def testDropStaleUpdate(self):
     with self.test_session() as sess:
@@ -233,16 +303,16 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 5], [3, 0]],
             # Two values for gradients,
             gradients=[[0.1, 0.1], [0.2, 0.2]],
             # A 2x2 matrix for each hessian.
-            hessians=[[[0.01, 0.02], [0.03, 0.04]],
-                      [[0.05, 0.06], [0.07, 0.08]]])
+            hessians=[[[0.01, 0.02], [0.03, 0.04]], [[0.05, 0.06], [0.07,
+                                                                    0.08]]])
         op2 = accumulator.add(
             stamp_token=-1,
             partition_ids=[1],
-            feature_ids=[2],
+            feature_ids=[[2, 5]],
             gradients=[[0.10, 0.11]],
             hessians=[[[0.011, 0.022], [0.033, 0.044]]])
 
@@ -255,10 +325,10 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       result = _AccumulatorResultToDict(partition, feature, grads, hessians)
       self.assertEqual(num_updates, 1)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(1, 2)][0], [0.1, 0.1])
-      self.assertAllClose(result[(1, 2)][1], [[0.01, 0.02], [0.03, 0.04]])
-      self.assertAllClose(result[(2, 3)][0], [0.2, 0.2])
-      self.assertAllClose(result[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+      self.assertAllClose(result[(1, 2, 5)][0], [0.1, 0.1])
+      self.assertAllClose(result[(1, 2, 5)][1], [[0.01, 0.02], [0.03, 0.04]])
+      self.assertAllClose(result[(2, 3, 0)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3, 0)][1], [[0.05, 0.06], [0.07, 0.08]])
 
   def testSerialize(self):
     with self.test_session() as sess:
@@ -270,12 +340,12 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 0]],
             # Two values for gradients,
             gradients=[[0.1, 0.1], [0.2, 0.2]],
             # A 2x2 matrix for each hessian.
-            hessians=[[[0.01, 0.02], [0.03, 0.04]],
-                      [[0.05, 0.06], [0.07, 0.08]]])
+            hessians=[[[0.01, 0.02], [0.03, 0.04]], [[0.05, 0.06], [0.07,
+                                                                    0.08]]])
 
       with ops.control_dependencies([op1]):
         (stamp_token, num_updates_1, partition_1, feature_1, grads_1,
@@ -300,15 +370,15 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
       self.assertEqual(num_updates_1, 1)
       self.assertEqual(num_updates_2, 1)
       self.assertEqual(len(result_1), 2)
-      self.assertAllClose(result_1[(1, 2)][0], [0.1, 0.1])
-      self.assertAllClose(result_1[(1, 2)][1], [[0.01, 0.02], [0.03, 0.04]])
-      self.assertAllClose(result_1[(2, 3)][0], [0.2, 0.2])
-      self.assertAllClose(result_1[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+      self.assertAllClose(result_1[(1, 2, 0)][0], [0.1, 0.1])
+      self.assertAllClose(result_1[(1, 2, 0)][1], [[0.01, 0.02], [0.03, 0.04]])
+      self.assertAllClose(result_1[(2, 3, 0)][0], [0.2, 0.2])
+      self.assertAllClose(result_1[(2, 3, 0)][1], [[0.05, 0.06], [0.07, 0.08]])
 
-      self.assertAllEqual(result_1[1, 2][0], result_2[1, 2][0])
-      self.assertAllEqual(result_1[1, 2][1], result_2[1, 2][1])
-      self.assertAllEqual(result_1[2, 3][0], result_2[2, 3][0])
-      self.assertAllEqual(result_1[2, 3][1], result_2[2, 3][1])
+      self.assertAllEqual(result_1[1, 2, 0][0], result_2[1, 2, 0][0])
+      self.assertAllEqual(result_1[1, 2, 0][1], result_2[1, 2, 0][1])
+      self.assertAllEqual(result_1[2, 3, 0][0], result_2[2, 3, 0][0])
+      self.assertAllEqual(result_1[2, 3, 0][1], result_2[2, 3, 0][1])
 
   def testDeserialize(self):
     with self.test_session() as sess:
@@ -321,19 +391,19 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
         op1 = accumulator.add(
             stamp_token=0,
             partition_ids=[1, 2],
-            feature_ids=[2, 3],
+            feature_ids=[[2, 0], [3, 0]],
             # Two values for gradients,
             gradients=[[0.1, 0.1], [0.2, 0.2]],
             # A 2x2 matrix for each hessian.
-            hessians=[[[0.01, 0.02], [0.03, 0.04]],
-                      [[0.05, 0.06], [0.07, 0.08]]])
+            hessians=[[[0.01, 0.02], [0.03, 0.04]], [[0.05, 0.06], [0.07,
+                                                                    0.08]]])
 
       with ops.control_dependencies([op1]):
         deserialize = accumulator.deserialize(
             stamp_token=2,
             num_updates=3,
             partition_ids=[3, 4],
-            feature_ids=[4, 5],
+            feature_ids=[[4, 0], [5, 0]],
             # Two values for gradients,
             gradients=[[0.3, 0.3], [0.5, 0.5]],
             # A 2x2 matrix for each hessian.
@@ -349,10 +419,10 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
                                         hessians)
       self.assertEqual(num_updates, 3)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(3, 4)][0], [0.3, 0.3])
-      self.assertAllClose(result[(3, 4)][1], [[0.03, 0.04], [0.05, 0.06]])
-      self.assertAllClose(result[(4, 5)][0], [0.5, 0.5])
-      self.assertAllClose(result[(4, 5)][1], [[0.07, 0.08], [0.09, 0.10]])
+      self.assertAllClose(result[(3, 4, 0)][0], [0.3, 0.3])
+      self.assertAllClose(result[(3, 4, 0)][1], [[0.03, 0.04], [0.05, 0.06]])
+      self.assertAllClose(result[(4, 5, 0)][0], [0.5, 0.5])
+      self.assertAllClose(result[(4, 5, 0)][1], [[0.07, 0.08], [0.09, 0.10]])
 
   def testMakeSummary(self):
     with self.test_session() as sess:
@@ -362,7 +432,7 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
           hessian_shape=tensor_shape.TensorShape([2, 2]))
       partition, feature, grads, hessians = accumulator._make_summary(
           partition_ids=[1, 2, 1],
-          feature_ids=[2, 3, 2],
+          feature_ids=[[2, 0], [3, 2], [2, 0]],
           # Two values for gradients,
           gradients=[[0.1, 0.1], [0.2, 0.2], [0.10, 0.11]],
           # A 2x2 matrix for each hessian.
@@ -373,15 +443,16 @@ class StatsAccumulatorTensorTest(test_util.TensorFlowTestCase):
 
       result = _AccumulatorResultToDict(partition, feature, grads, hessians)
       self.assertEqual(len(result), 2)
-      self.assertAllClose(result[(1, 2)][0], [0.20, 0.21])
-      self.assertAllClose(result[(1, 2)][1], [[0.021, 0.042], [0.063, 0.084]])
-      self.assertAllClose(result[(2, 3)][0], [0.2, 0.2])
-      self.assertAllClose(result[(2, 3)][1], [[0.05, 0.06], [0.07, 0.08]])
+      self.assertAllClose(result[(1, 2, 0)][0], [0.20, 0.21])
+      self.assertAllClose(result[(1, 2, 0)][1],
+                          [[0.021, 0.042], [0.063, 0.084]])
+      self.assertAllClose(result[(2, 3, 2)][0], [0.2, 0.2])
+      self.assertAllClose(result[(2, 3, 2)][1], [[0.05, 0.06], [0.07, 0.08]])
 
 
 def _AccumulatorResultToDict(partition, feature, grads, hessians):
   """Converts the inputs to a dictionary since the ordering changes."""
-  return {(partition[i], feature[i]): (grads[i], hessians[i])
+  return {(partition[i], feature[i, 0], feature[i, 1]): (grads[i], hessians[i])
           for i in range(len(partition))}
 
 
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
index f0413fee5a8249d15f2cdae095dc7fa2c76a22b8..c2e65b643df90e88aadb0bb9acaf692da35b1a16 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py
@@ -181,7 +181,6 @@ class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
         tree_weights: 1.0
         tree_metadata {
           num_layers_grown: 1
-          is_finalized: true
         }
         growing_metadata {
           num_trees_attempted: 1
@@ -189,7 +188,7 @@ class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
         }
       """
       self.assertEqual(new_stamp, 1)
-      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_trees, 0)
       self.assertEqual(stats.num_layers, 1)
       self.assertEqual(stats.active_tree, 1)
       self.assertEqual(stats.active_layer, 1)
@@ -231,7 +230,6 @@ class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
         tree_weights: 1.0
         tree_metadata {
           num_layers_grown: 1
-          is_finalized: true
         }
         growing_metadata {
           num_trees_attempted: 1
@@ -239,7 +237,7 @@ class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase):
         }
       """
       self.assertEqual(new_stamp, 2)
-      self.assertEqual(stats.num_trees, 1)
+      self.assertEqual(stats.num_trees, 0)
       self.assertEqual(stats.num_layers, 1)
       self.assertEqual(stats.active_tree, 1)
       self.assertEqual(stats.active_layer, 1)
diff --git a/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py b/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
index d1e6d98efbc588df3db7a8d8186c1135e09bbe57..58f0d36b0f78eeed6abcec1c4fa696f4ccffa615 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader
+from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import gradient_trees_partition_examples
+from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import gradient_trees_prediction
 # pylint: enable=unused-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import *
-# pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 7e8e15e7d8c89d1adaa472b1da7e8bb3c73ca17e..294e04002adac62fc123a3242a05a1b36f422433 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -45,6 +45,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
                init_stamp_token,
                epsilon,
                num_quantiles,
+               max_elements=None,
                name=None,
                container=None):
     """Creates a QuantileAccumulator object.
@@ -53,6 +54,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
       init_stamp_token: The initial value for the stamp token.
       epsilon: Error bound on the quantile computation.
       num_quantiles: Number of quantiles to produce from the final summary.
+      max_elements: Maximum number of elements added to the accumulator.
       name: the name to save the accumulator under.
       container: An optional `string`. Defaults to `""`
     """
@@ -67,6 +69,7 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
           self._quantile_accumulator_handle,
           init_stamp_token,
           epsilon=epsilon,
+          max_elements=max_elements,
           num_quantiles=num_quantiles)
       is_initialized_op = gen_quantile_ops.quantile_accumulator_is_initialized(
           self._quantile_accumulator_handle)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 5a917ca42897a263bf9f868393453ba232745e65..b95956dae2a62b28643cd31815c5f5650eca337b 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -208,7 +208,7 @@ def extract_features(features, feature_columns):
       if tensor.dtype == dtypes.float32:
         if len(tensor.shape) > 1 and tensor.shape[1] > 1:
           unstacked = array_ops.unstack(tensor, axis=1)
-          for i in xrange(len(unstacked)):
+          for i in range(len(unstacked)):
             dense_float_names.append(_FEATURE_NAME_TEMPLATE % (key, i))
             dense_floats.append(array_ops.reshape(unstacked[i], [-1, 1]))
         else:
@@ -322,9 +322,11 @@ class GradientBoostedDecisionTreeModel(object):
     self._feature_columns = feature_columns
     self._learner_config_serialized = learner_config.SerializeToString()
     self._attempted_trees = variables.Variable(
-        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False,
+        name="attempted_trees")
     self._finalized_trees = variables.Variable(
-        initial_value=array_ops.zeros([], dtypes.int64), trainable=False)
+        initial_value=array_ops.zeros([], dtypes.int64), trainable=False,
+        name="finalized_trees")
     if not features:
       raise ValueError("Features dictionary must be specified.")
     (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
@@ -494,7 +496,6 @@ class GradientBoostedDecisionTreeModel(object):
         gate_gradients=0,
         aggregation_method=None)[0]
     strategy = self._learner_config.multi_class_strategy
-    num_classes = self._learner_config.num_classes
 
     class_id = -1
     # Handle different multiclass strategies.
@@ -503,7 +504,7 @@ class GradientBoostedDecisionTreeModel(object):
       gradient_shape = tensor_shape.scalar()
       hessian_shape = tensor_shape.scalar()
 
-      if num_classes == 2:
+      if self._logits_dimension == 1:
         # We have only 1 score, gradients is of shape [batch, 1].
         hessians = gradients_impl.gradients(
             gradients,
@@ -522,7 +523,7 @@ class GradientBoostedDecisionTreeModel(object):
 
         # Choose the class for which the tree is built (one vs rest).
         class_id = math_ops.to_int32(
-            predictions_dict[NUM_TREES_ATTEMPTED] % num_classes)
+            predictions_dict[NUM_TREES_ATTEMPTED] % self._logits_dimension)
 
         # Use class id tensor to get the column with that index from gradients
         # and hessians.
@@ -532,14 +533,15 @@ class GradientBoostedDecisionTreeModel(object):
             _get_column_by_index(hessians, class_id))
     else:
       # Other multiclass strategies.
-      gradient_shape = tensor_shape.TensorShape([num_classes])
+      gradient_shape = tensor_shape.TensorShape([self._logits_dimension])
 
       if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
-        hessian_shape = tensor_shape.TensorShape(([num_classes, num_classes]))
+        hessian_shape = tensor_shape.TensorShape(
+            ([self._logits_dimension, self._logits_dimension]))
         hessian_list = self._full_hessian(gradients, predictions)
       else:
         # Diagonal hessian strategy.
-        hessian_shape = tensor_shape.TensorShape(([num_classes]))
+        hessian_shape = tensor_shape.TensorShape(([self._logits_dimension]))
         hessian_list = self._diagonal_hessian(gradients, predictions)
 
       squeezed_gradients = gradients
@@ -739,7 +741,7 @@ class GradientBoostedDecisionTreeModel(object):
     # Accumulate a step after updating stats.
     batch_size = math_ops.cast(array_ops.shape(labels)[0], dtypes.float32)
     with ops.control_dependencies(stats_update_ops):
-      add_step_op = steps_accumulator.add(ensemble_stamp, [0], [0],
+      add_step_op = steps_accumulator.add(ensemble_stamp, [0], [[0, 0]],
                                           [batch_size], [1.0])
 
     # Determine learning rate.
@@ -804,10 +806,10 @@ class GradientBoostedDecisionTreeModel(object):
     # compute the full hessian with a single call to gradients, but instead
     # must compute it row-by-row.
     gradients_list = array_ops.unstack(
-        grads, num=self._learner_config.num_classes, axis=1)
+        grads, num=self._logits_dimension, axis=1)
     hessian_rows = []
 
-    for row in range(self._learner_config.num_classes):
+    for row in range(self._logits_dimension):
       # If current row is i, K is number of classes,each row returns a tensor of
       # size batch_size x K representing for each example dx_i dx_1, dx_i dx_2
       # etc dx_i dx_K
@@ -830,7 +832,7 @@ class GradientBoostedDecisionTreeModel(object):
     diag_hessian_list = []
 
     gradients_list = array_ops.unstack(
-        grads, num=self._learner_config.num_classes, axis=1)
+        grads, num=self._logits_dimension, axis=1)
 
     for row, row_grads in enumerate(gradients_list):
       # If current row is i, K is number of classes,each row returns a tensor of
@@ -891,8 +893,10 @@ class GradientBoostedDecisionTreeModel(object):
       hess_sum = math_ops.reduce_sum(hess, 0)
 
       # Accumulate gradients and hessians.
-      partition_ids = math_ops.range(predictions.get_shape()[1])
-      feature_ids = array_ops.zeros_like(partition_ids, dtype=dtypes.int64)
+      partition_ids = math_ops.range(self._logits_dimension)
+      feature_ids = array_ops.zeros(
+          [self._logits_dimension, 2], dtype=dtypes.int64)
+
       add_stats_op = bias_stats_accumulator.add(
           ensemble_stamp, partition_ids, feature_ids, grads_sum, hess_sum)
       return control_flow_ops.group(*[add_stats_op], name="update_bias_stats")
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 16e24d97ddee0751e0b808b89080074c1b4baba7..dba51d4f527792d2a8dedc693f74c07119fd231d 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -912,8 +912,10 @@ class GbdtTest(test_util.TensorFlowTestCase):
       self.assertEqual(1,
                        len(output.trees[0].nodes[2].leaf.sparse_vector.index))
       self.assertEqual(3, output.trees[0].nodes[2].leaf.sparse_vector.index[0])
-      self.assertAlmostEqual(
-          0.893284678459, output.trees[0].nodes[2].leaf.sparse_vector.value[0])
+      self.assertAllClose(
+          0.893284678459,
+          output.trees[0].nodes[2].leaf.sparse_vector.value[0],
+          atol=1e-4, rtol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
index dde16426863b60e9df64da1ee6b36caec273bfd6..ccb8509c0347f9c9b6f1e8f4f620230aac9a6c2d 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses_test.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as np
 
 from tensorflow.contrib.boosted_trees.python.utils import losses
@@ -60,35 +58,27 @@ class LossesTest(test_util.TensorFlowTestCase):
       neg_loss = loss_for_negatives.eval()
       # For positive labels, points <= 0.3 get max loss of e.
       # For negative labels, these points have minimum loss of 1/e.
-      for i in range(2):
-        self.assertAlmostEqual(math.exp(1), pos_loss[i], places=4)
-        self.assertAlmostEqual(math.exp(-1), neg_loss[i], places=4)
+      self.assertAllClose(np.exp(np.ones([2, 1])), pos_loss[:2], atol=1e-4)
+      self.assertAllClose(np.exp(-np.ones([2, 1])), neg_loss[:2], atol=1e-4)
 
       # For positive lables, p oints with predictions 0.7 and larger get minimum
       # loss value of 1/e. For negative labels, these points are wrongly
       # classified and get loss e.
-      for i in range(6, 10):
-        self.assertAlmostEqual(math.exp(-1), pos_loss[i], places=4)
-        self.assertAlmostEqual(math.exp(1), neg_loss[i], places=4)
+      self.assertAllClose(np.exp(-np.ones([4, 1])), pos_loss[6:10], atol=1e-4)
+      self.assertAllClose(np.exp(np.ones([4, 1])), neg_loss[6:10], atol=1e-4)
 
       # Points in between 0.5-eps, 0..5+eps get loss exp(-label_m*y), where
       # y = 1/eps *x -1/(2eps), where x is the probability and label_m is either
       # 1 or -1 (for label of 0).
-      for i in range(2, 6):
-        self.assertAlmostEqual(
-            math.exp(-1.0 * (predictions_probs[i] * 1.0 / eps - 0.5 / eps)),
-            pos_loss[i],
-            places=4)
-        self.assertAlmostEqual(
-            math.exp(1.0 * (predictions_probs[i] * 1.0 / eps - 0.5 / eps)),
-            neg_loss[i],
-            places=4)
+      self.assertAllClose(
+          np.exp(-(predictions_probs[2:6] * 1.0 / eps - 0.5 / eps)),
+          pos_loss[2:6], atol=1e-4)
+      self.assertAllClose(
+          np.exp(predictions_probs[2:6] * 1.0 / eps - 0.5 / eps),
+          neg_loss[2:6], atol=1e-4)
 
   def test_per_example_squared_loss(self):
 
-    def _squared_loss(p, y):
-      return np.mean(1.0 * (p - y) * (p - y))
-
     labels = np.array([[0.123], [224.2], [-3], [2], [.3]], dtype=np.float32)
     weights = array_ops.ones([5, 1], dtypes.float32)
     predictions = np.array(
@@ -99,9 +89,8 @@ class LossesTest(test_util.TensorFlowTestCase):
                                                        predictions)
 
       loss = loss_tensor.eval()
-      for i in range(5):
-        self.assertAlmostEqual(
-            _squared_loss(labels[i], predictions[i]), loss[i], places=4)
+      self.assertAllClose(
+          np.square(labels[:5] - predictions[:5]), loss[:5], atol=1e-4)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index aa8f5ed12bc6f779e3c1a923b9225ec283189747..fe8bd072afd43a64fa62a65bd8900b5a98dbe761 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -60,9 +60,7 @@ tf_py_test(
     size = "small",
     srcs = ["python/ops/bigquery_reader_ops_test.py"],
     additional_deps = [
-        ":bigquery_reader_ops_op_lib",
         ":cloud_py",
-        "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
index b31b882fa19a7eaad304d6d423961234f9affef4..e9b79a066def566096d6c3f3745974423e3371d1 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
@@ -421,7 +421,7 @@ TEST_F(BigQueryTableAccessorTest, MultiplePagesTest) {
   TF_EXPECT_OK(accessor_->ReadRow(&row_id, &example));
   EXPECT_EQ(3, row_id);
   EXPECT_TRUE(accessor_->Done());
-  
+
   Example expected_example;
   ASSERT_TRUE(protobuf::TextFormat::ParseFromString(kTestExampleProtoWithNulls,
                                                     &expected_example));
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index d76ddf8c657b9b5d02bbdc4d6759053396dcd6d2..c74da9cabd6816bc9c7891e32937534cff2d677d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -80,16 +80,31 @@ class TPUClusterResolver(ClusterResolver):
         raise ImportError('googleapiclient must be installed before using the '
                           'TPU cluster resolver')
 
-      # TODO(b/67375680): Remove custom URL once TPU APIs are finalized
       self._service = discovery.build(
-          'tpu',
-          'v1',
-          credentials=self._credentials,
-          discoveryServiceUrl='https://storage.googleapis.com'
-                              '/tpu-api-definition/v1alpha1.json')
+          'tpu', 'v1alpha1',
+          credentials=self._credentials)
     else:
       self._service = service
 
+  def get_master(self):
+    """Get the ClusterSpec grpc master path.
+
+    This returns the grpc path (grpc://1.2.3.4:8470) of first instance in the
+    ClusterSpec returned by the cluster_spec function. This is suitable for use
+    for the `master` argument in tf.Session() when you are using one TPU.
+
+    Returns:
+      string, the grpc path of the first instance in the ClusterSpec.
+
+    Raises:
+      ValueError: If none of the TPUs specified exists.
+    """
+    job_tasks = self.cluster_spec().job_tasks(self._job_name)
+    if not job_tasks:
+      raise ValueError('No TPUs exists with the specified names exist.')
+
+    return 'grpc://' + job_tasks[0]
+
   def cluster_spec(self):
     """Returns a ClusterSpec object based on the latest TPU information.
 
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 5bd5cd1a8702840bd3eeb264ff19810fefa1fb62..db7419be06b58e1c5737f69f2c7fd9fee44b9d95 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -26,6 +26,28 @@ from tensorflow.python.training import server_lib
 mock = test.mock
 
 
+class MockRequestClass(object):
+
+  def __init__(self, name, tpu_map):
+    self._name = name
+    self._tpu_map = tpu_map
+
+  def execute(self):
+    if self._name in self._tpu_map:
+      return self._tpu_map[self._name]
+    else:
+      raise KeyError('Resource %s was not found' % self._name)
+
+
+class MockNodeClass(object):
+
+  def __init__(self, tpu_map):
+    self._tpu_map = tpu_map
+
+  def get(self, name):
+    return MockRequestClass(name, self._tpu_map)
+
+
 class TPUClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
@@ -56,11 +78,15 @@ class TPUClusterResolverTest(test.TestCase):
     if tpu_map is None:
       tpu_map = {}
 
-    def get_side_effect(name):
-      return tpu_map[name]
+    mock_locations = mock.MagicMock()
+    mock_locations.nodes.return_value = MockNodeClass(tpu_map)
+
+    mock_project = mock.MagicMock()
+    mock_project.locations.return_value = mock_locations
 
     mock_client = mock.MagicMock()
-    mock_client.projects.locations.nodes.get.side_effect = get_side_effect
+    mock_client.projects.return_value = mock_project
+
     return mock_client
 
   def testSimpleSuccessfulRetrieval(self):
@@ -109,3 +135,38 @@ class TPUClusterResolverTest(test.TestCase):
                              tasks { key: 1 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+  def testGetMasterMultipleEntries(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470'
+        },
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-2': {
+            'ipAddress': '10.4.5.6',
+            'port': '8470'
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu_names=['test-tpu-2', 'test-tpu-1'],
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+    self.assertEqual('grpc://10.4.5.6:8470', tpu_cluster_resolver.get_master())
+
+  def testGetMasterNoEntries(self):
+    tpu_map = {}
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu_names=[],
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+    with self.assertRaises(ValueError):
+      tpu_cluster_resolver.get_master()
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 8744fc492ff67064bff2097c99be5af8a739b60d..481caf6bb076fe823b3cce7a5b574b2e8d08de00 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -18,7 +18,6 @@ cmake_policy(SET CMP0022 NEW)
 
 # Options
 option(tensorflow_VERBOSE "Enable for verbose output" OFF)
-option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
 option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
 option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
 option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
@@ -35,12 +34,46 @@ option(tensorflow_OPTIMIZE_FOR_NATIVE_ARCH "Enable compiler optimizations for th
 option(tensorflow_WIN_CPU_SIMD_OPTIONS "Enables CPU SIMD instructions")
 option(tensorflow_ENABLE_SNAPPY_SUPPORT "Enable SNAPPY compression support" ON)
 
+# GPU, CUDA and cuDNN options
+option(tensorflow_ENABLE_GPU "Enable GPU support" OFF)
+option(tensorflow_CUDA_VERSION "CUDA version to build against" 9.0)
+option(tensorflow_CUDNN_VERSION "cuDNN version to build against" 7)
+
+if(HAIKU)
+	option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" OFF)
+else()
+	option(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE "Enable PIE support" ON)
+endif()
+
+
 if (NOT WIN32)
   # Threads: defines CMAKE_THREAD_LIBS_INIT and adds -pthread compile option
   # for targets that link ${CMAKE_THREAD_LIBS_INIT}.
   find_package (Threads)
+
+  option(tensorflow_PATH_STATIC_LIB "Additional library search path for libcudnn_static.a, libnccl_static.a, libculibos.a" /usr/local/cuda/lib64/)
+  option(tensorflow_CUDNN_INCLUDE "cudnn.h header install path" /usr/include/)
+  if (NOT tensorflow_CUDNN_INCLUDE)
+    # option's default value is OFF. Fill it with real default values
+    set(tensorflow_CUDNN_INCLUDE /usr/include)
+  endif (NOT tensorflow_CUDNN_INCLUDE)
+  option(tensorflow_PATH_CUDNN_STATIC_LIB "Override PATH_STATIC_LIB for libcudnn_static.a" ${tensorflow_PATH_STATIC_LIB})
+  option(tensorflow_PATH_NCCL_STATIC_LIB "Override PATH_STATIC_LIB for libnccl_static.a" ${tensorflow_PATH_STATIC_LIB})
+  option(tensorflow_CUDA_LIBRARY_PATH "Designate the default CUDA library paths" /usr/local/cuda/lib64)
+  if (NOT tensorflow_CUDA_LIBRARY_PATH)
+    # option's default value is OFF. Fill it with real default values
+    set(tensorflow_CUDA_LIBRARY_PATH /usr/local/cuda/lib64)
+  endif (NOT tensorflow_CUDA_LIBRARY_PATH)
 endif()
 
+if (WIN32)
+  set(BOOL_WIN32 ON)
+else (WIN32)
+  set(BOOL_WIN32 OFF)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+endif (WIN32)
+
 # [CLEANUP] Remove when done
 # For debugging
 function(SHOW_VARIABLES)
@@ -58,7 +91,12 @@ set (DOWNLOAD_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/downloads"
      CACHE PATH "Location where external projects will be downloaded.")
 mark_as_advanced(DOWNLOAD_LOCATION)
 
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+if (tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+	set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+else()
+	set(CMAKE_POSITION_INDEPENDENT_CODE OFF)
+endif()
+
 add_definitions(-DEIGEN_AVOID_STL_ARRAY)
 if(WIN32)
   add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC)
@@ -217,20 +255,35 @@ endif()
 if(UNIX)
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS})
 endif()
+if(HAIKU)
+  list(APPEND tensorflow_EXTERNAL_LIBRARIES network)
+endif()
 
 if (tensorflow_ENABLE_GPU)
+  if (NOT WIN32)
+    # Default install paths for cuda libraries in Linux
+    # In some Linux distros, find_package(CUDA) seems to require CMAKE_LIBRARY_PATH to include cuda-lib paths
+    list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}")
+    list(APPEND CMAKE_LIBRARY_PATH "${tensorflow_CUDA_LIBRARY_PATH}/stubs")
+  endif (NOT WIN32)
+
+  find_package(CUDA ${tensorflow_CUDA_VERSION} REQUIRED)
+
+  # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
+  # CUDA_NVCC_FLAGS and cuda_config.h below
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true)  # Flush denormals to zero
+  set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
+  include_directories(${CUDA_INCLUDE})
   if (WIN32)
-    find_package(CUDA 8.0 REQUIRED)
-
-    # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
-    # CUDA_NVCC_FLAGS and cuda_config.h below
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
-    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true)  # Flush denormals to zero
-    set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
-    include_directories(${CUDA_INCLUDE})
     add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
+  else (WIN32)
+    # Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.0, -D3.5, -D5.2" for cc, which incurs build breaks
+    add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2")
+  endif (WIN32)
 
+  if (WIN32)
     # add cudnn
     if(NOT CUDNN_HOME)
       set(CUDNN_HOME ${CUDA_TOOLKIT_TARGET_DIR})
@@ -238,18 +291,51 @@ if (tensorflow_ENABLE_GPU)
     include_directories(${CUDNN_HOME})
     set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
       ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${CUDNN_HOME}/lib/x64/cudnn.lib)
+  else (WIN32)
+    set(CUDNN_INCLUDE "${tensorflow_CUDNN_INCLUDE}")
 
-    # create cuda_config.h
-    FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
-      "#ifndef CUDA_CUDA_CONFIG_H_\n"
-      "#define CUDA_CUDA_CONFIG_H_\n"
-      "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
-      "#define TF_CUDA_VERSION \"64_80\"\n"
-      "#define TF_CUDNN_VERSION \"64_6\"\n"
-      "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
-      "#endif  // CUDA_CUDA_CONFIG_H_\n"
-    )
+    find_library(nccl_STATIC_LIBRARY NAMES libnccl_static.a PATHS ${tensorflow_PATH_NCCL_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    if (NOT nccl_STATIC_LIBRARY)
+      message(FATAL_ERROR "NCCL is required for GPU-build")
+    else (NOT nccl_STATIC_LIBRARY)
+      message("nccl-static: ${nccl_STATIC_LIBRARY}")
+      # something like /usr/lib64/libnccl_static.a
+    endif (NOT nccl_STATIC_LIBRARY)
+
+    find_library(cudnn_STATIC_LIBRARY NAMES libcudnn_static.a PATHS ${tensorflow_PATH_CUDNN_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    if (NOT cudnn_STATIC_LIBRARY)
+      message(FATAL_ERROR "CUDNN is required for GPU-build")
+    else (NOT cudnn_STATIC_LIBRARY)
+      message("cudnn-static: ${cudnn_STATIC_LIBRARY}")
+    endif (NOT cudnn_STATIC_LIBRARY)
+
+    find_library(culibos_STATIC_LIBRARY NAMES libculibos.a PATHS ${tensorflow_PATH_STATIC_LIB} ${CUDA_TOOLKIT_ROOT_DIR})
+    if (NOT culibos_STATIC_LIBRARY)
+      message(FATAL_ERROR "CULIBOS is required for GPU-build")
+    else (NOT culibos_STATIC_LIBRARY)
+      message("culibos-static: ${culibos_STATIC_LIBRARY}")
+    endif (NOT culibos_STATIC_LIBRARY)
+
+    include_directories(${CUDNN_INCLUDE})
+    set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_CUFFT_LIBRARIES}
+      ${CUDA_curand_LIBRARY} ${CUDA_cupti_LIBRARY} ${CUDA_cusolver_LIBRARY} ${cudnn_STATIC_LIBRARY} ${culibos_STATIC_LIBRARY} ${nccl_STATIC_LIBRARY})
+  endif (WIN32)
 
+  # Remove "." from CUDA version variable.
+  string(REPLACE "." "" short_CUDA_VER ${tensorflow_CUDA_VERSION})
+
+  # create cuda_config.h
+  FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
+    "#ifndef CUDA_CUDA_CONFIG_H_\n"
+    "#define CUDA_CUDA_CONFIG_H_\n"
+    "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
+    "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
+    "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
+    "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
+    "#endif  // CUDA_CUDA_CONFIG_H_\n"
+  )
+
+  if (WIN32)
     # tf assumes in various places header files to be in cuda/include. On windows the cuda sdk
     # installs them under cuda/version/include and to avoid that we need to change tf we copy a
     # few files to cuda/include
@@ -261,21 +347,36 @@ if (tensorflow_ENABLE_GPU)
       ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
       DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
     )
-    include_directories(${tensorflow_source_dir}/third_party/gpus)
-    # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
-    list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+  else(WIN32)
+    # Linux has slightly differnt install paths than Windows
+    FILE(COPY
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_INCLUDE}/cudnn.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h
+      ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
+      DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include
+    )
+  endif(WIN32)
 
-    # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
-    # in the default build is upgraded.
+  include_directories(${tensorflow_source_dir}/third_party/gpus)
+  # add cuda libraries to tensorflow_EXTERNAL_LIBRARIES
+  list(APPEND tensorflow_EXTERNAL_LIBRARIES ${CUDA_LIBRARIES})
+
+  # NOTE(mrry): Update these flags when the version of CUDA or cuDNN used
+  # in the default build is upgraded.
+  if(WIN32)
     set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
       msvcp_dll_name=msvcp140.dll
-      cudart_dll_name=cudart64_80.dll
-      cuda_version_number=8.0
+      cudart_dll_name=cudart64_${short_CUDA_VER}.dll
+      cuda_version_number=${tensorflow_CUDA_VERSION}
       nvcuda_dll_name=nvcuda.dll
-      cudnn_dll_name=cudnn64_6.dll
-      cudnn_version_number=6)
+      cudnn_dll_name=cudnn64_${tensorflow_CUDNN_VERSION}.dll
+      cudnn_version_number=${tensorflow_CUDNN_VERSION})
   else(WIN32)
-    message(FATAL_ERROR "CMake GPU build is currently only supported on Windows.")
+    set(tensorflow_BUILD_INFO_FLAGS --build_config cuda --key_value
+	    cuda_version_number=${tensorflow_CUDA_VERSION}
+	    cudnn_version_number=${tensorflow_CUDNN_VERSION})
   endif(WIN32)
 else(tensorflow_ENABLE_GPU)
   set(tensorflow_BUILD_INFO_FLAGS --build_config cpu --key_value
@@ -290,13 +391,7 @@ endif()
 
 # Let's get to work!
 include(tf_core_framework.cmake)
-# NOTE: Disabled until issue #3996 is fixed.
-# include(tf_stream_executor.cmake)
-if (tensorflow_ENABLE_GPU)
-  if (WIN32)
-    include(tf_stream_executor.cmake)
-  endif()
-endif()
+include(tf_stream_executor.cmake)
 
 include(tf_core_cpu.cmake)
 include(tf_core_ops.cmake)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index 4ddfec5960d2b759bacb376202cd8dab6ef2b024..4be733a2809f366a214fa2bb853bccffb10ecaba 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -19,23 +19,6 @@ for instructions on how to install a pre-built TensorFlow package on Windows.
 ### Current known limitations
 * It is not possible to load a custom Op library.
 * GCS file system is not supported.
-* The following Ops are not currently implemented:
- - Dequantize
- - QuantizeAndDequantize
- - QuantizedAvgPool
- - QuantizedBatchNomWithGlobalNormalization
- - QuantizedBiasAdd
- - QuantizedConcat
- - QuantizedConv2D
- - QuantizedMatmul
- - QuantizedMaxPoo
- - QuantizeDownAndShrinkRange
- - QuantizedRelu
- - QuantizedRelu6
- - QuantizedReshape
- - QuantizeV2
- - RequantizationRange
- - Requantize
 
 ## Building with CMake
 
diff --git a/tensorflow/contrib/cmake/external/boringssl.cmake b/tensorflow/contrib/cmake/external/boringssl.cmake
index dc27eadaca14361ffeffa6eadf6d4d97524de310..cca8444e2ae9952ea7c69a9392580ead715d363b 100644
--- a/tensorflow/contrib/cmake/external/boringssl.cmake
+++ b/tensorflow/contrib/cmake/external/boringssl.cmake
@@ -39,8 +39,12 @@ ExternalProject_Add(boringssl
     # BUILD_IN_SOURCE 1
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
+        if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+        	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+        else()
+        	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+        endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
 )
 
diff --git a/tensorflow/contrib/cmake/external/farmhash.cmake b/tensorflow/contrib/cmake/external/farmhash.cmake
index 96fade8b53273afdc379c7c13017e4917ee534f3..0cd0c1030c73d5218411f281d2b077af217e8275 100644
--- a/tensorflow/contrib/cmake/external/farmhash.cmake
+++ b/tensorflow/contrib/cmake/external/farmhash.cmake
@@ -15,8 +15,8 @@
 include (ExternalProject)
 
 set(farmhash_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/farmhash_archive ${CMAKE_CURRENT_BINARY_DIR}/external/farmhash_archive/util)
-set(farmhash_URL https://github.com/google/farmhash/archive/34c13ddfab0e35422f4c3979f360635a8c050260.zip)
-set(farmhash_HASH SHA256=e3d37a59101f38fd58fb799ed404d630f0eee18bfc2a2433910977cc8fea9c28)
+set(farmhash_URL https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz)
+set(farmhash_HASH SHA256=6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0)
 set(farmhash_BUILD ${CMAKE_CURRENT_BINARY_DIR}/farmhash/src/farmhash)
 set(farmhash_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/farmhash/install)
 set(farmhash_INCLUDES ${farmhash_BUILD})
diff --git a/tensorflow/contrib/cmake/external/fft2d.cmake b/tensorflow/contrib/cmake/external/fft2d.cmake
index a35c24e9e01101f837ba961c06429c981ddc4648..d3af2a46761c0f7f0b5db134af8400fc93f2f095 100644
--- a/tensorflow/contrib/cmake/external/fft2d.cmake
+++ b/tensorflow/contrib/cmake/external/fft2d.cmake
@@ -15,7 +15,7 @@
 
 include (ExternalProject)
 
-set(fft2d_URL http://www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz)
+set(fft2d_URL https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz)
 set(fft2d_HASH SHA256=52bb637c70b971958ec79c9c8752b1df5ff0218a4db4510e60826e0cb79b5296)
 set(fft2d_BUILD ${CMAKE_CURRENT_BINARY_DIR}/fft2d/)
 set(fft2d_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/fft2d/src)
diff --git a/tensorflow/contrib/cmake/external/gemmlowp.cmake b/tensorflow/contrib/cmake/external/gemmlowp.cmake
index 54a9e96ce58c5501217368b0d12089aa14696b71..a235442dc5c0a07e249653381436eeae81575883 100644
--- a/tensorflow/contrib/cmake/external/gemmlowp.cmake
+++ b/tensorflow/contrib/cmake/external/gemmlowp.cmake
@@ -14,8 +14,8 @@
 # ==============================================================================
 include (ExternalProject)
 
-set(gemmlowp_URL http://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.tar.gz)
-set(gemmlowp_HASH SHA256=861cc6d9d902861f54fd77e1ab79286477dcc559b2a283e75b9c22d37b61f6ae)
+set(gemmlowp_URL https://github.com/google/gemmlowp/archive/6a2a90822e8546fc2bfa7044de0faf1c1cb4862f.zip)
+set(gemmlowp_HASH SHA256=3447948d219f3270383766bbe08942888c0eb4e0ca6663c0e0548502ec5bb77d)
 set(gemmlowp_BUILD ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 set(gemmlowp_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/gemmlowp/src/gemmlowp)
 
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 464aad74c6c8623981338695af01b026dcc0e6e3..41ea0b48a4600d7ca2dd2f4a61c14ec0cc5b4734 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 781fd6f6ea03645a520cd5c675da67ab61f87e4b)
+set(GRPC_TAG 54e8f37e537794c2d814c1604c1282125f64f093)
 
 if(WIN32)
   set(grpc_STATIC_LIBRARIES
@@ -28,10 +28,11 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/libcares.a)
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
 
+add_definitions(-DGRPC_ARES=0)
+
 ExternalProject_Add(grpc
     PREFIX grpc
     DEPENDS protobuf zlib
@@ -39,9 +40,6 @@ ExternalProject_Add(grpc
     GIT_TAG ${GRPC_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
-    # TODO(jhseu): Remove this PATCH_COMMAND once grpc removes the dependency
-    # on "grpc" from the "grpc++_unsecure" rule.
-    PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/grpc/CMakeLists.txt ${GRPC_BUILD}
     BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
     COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
     INSTALL_COMMAND ""
diff --git a/tensorflow/contrib/cmake/external/jemalloc.cmake b/tensorflow/contrib/cmake/external/jemalloc.cmake
index e4737a1dd825409133cdfd8a54f20dac819c0d5b..198ba13e64e4b6df57c4325a0104b1a6745d173a 100644
--- a/tensorflow/contrib/cmake/external/jemalloc.cmake
+++ b/tensorflow/contrib/cmake/external/jemalloc.cmake
@@ -15,7 +15,7 @@
 include (ExternalProject)
 
 set(jemalloc_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc/include)
-set(jemalloc_URL https://github.com/jemalloc/jemalloc-cmake/archive/jemalloc-cmake.4.3.1.tar.gz)
+set(jemalloc_URL https://mirror.bazel.build/github.com/jemalloc/jemalloc-cmake/archive/jemalloc-cmake.4.3.1.tar.gz)
 set(jemalloc_HASH SHA256=f9be9a05fe906deb5c1c8ca818071a7d2e27d66fd87f5ba9a7bf3750bcedeaf0)
 set(jemalloc_BUILD ${CMAKE_CURRENT_BINARY_DIR}/jemalloc/src/jemalloc)
 
diff --git a/tensorflow/contrib/cmake/external/jsoncpp.cmake b/tensorflow/contrib/cmake/external/jsoncpp.cmake
index 5127d7e8f79abdda4516eb9f006e243b7438bc65..d2ae4c76e8cd175cdc3ba41fdf4e4009f8237309 100644
--- a/tensorflow/contrib/cmake/external/jsoncpp.cmake
+++ b/tensorflow/contrib/cmake/external/jsoncpp.cmake
@@ -42,8 +42,12 @@ ExternalProject_Add(jsoncpp
     BUILD_IN_SOURCE 1
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
+  	  if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+  	      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+  	  else()
+   	    	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+   	 endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
 )
 
diff --git a/tensorflow/contrib/cmake/external/lmdb.cmake b/tensorflow/contrib/cmake/external/lmdb.cmake
index 79971b7cfc3c72e4b6290ccb71d40a20d1180c01..e41384f023ca9fc4cba697917b491af5a9db92bc 100644
--- a/tensorflow/contrib/cmake/external/lmdb.cmake
+++ b/tensorflow/contrib/cmake/external/lmdb.cmake
@@ -29,10 +29,14 @@ ExternalProject_Add(lmdb
     INSTALL_DIR ${lmdb_INSTALL}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
+		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+		else()
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${lmdb_INSTALL}
-    -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
 )
 
 if(WIN32)
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 2c42377f5078d55e72e37eb5e880624bc09ddef0..05080060479b6240edb8ab9f65160b3dd182feb9 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 394e71f0ebeed6788ae6c84d42c1bedf6e1ee9f7)
+set(nsync_TAG 8502189abfa44c249c01c2cad64e6ed660a9a668)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/external/png.cmake b/tensorflow/contrib/cmake/external/png.cmake
index 2b2bd47d1c95ca886469c525191c27f22d416c29..aad6618f52f909096fd2388e867ef3a965d033cb 100644
--- a/tensorflow/contrib/cmake/external/png.cmake
+++ b/tensorflow/contrib/cmake/external/png.cmake
@@ -41,10 +41,14 @@ ExternalProject_Add(png
     INSTALL_DIR ${png_INSTALL}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
+		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+		else()
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DCMAKE_INSTALL_PREFIX:STRING=${png_INSTALL}
-	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
 	-DZLIB_ROOT:STRING=${ZLIB_INSTALL}
 )
 
diff --git a/tensorflow/contrib/cmake/external/protobuf.cmake b/tensorflow/contrib/cmake/external/protobuf.cmake
index 1e300e21df17eeee0abfc2becdab746fbfc62ff6..b53857a47bfbf797af02fe7f69474263119161cd 100644
--- a/tensorflow/contrib/cmake/external/protobuf.cmake
+++ b/tensorflow/contrib/cmake/external/protobuf.cmake
@@ -44,8 +44,12 @@ ExternalProject_Add(protobuf
         ${PROTOBUF_ADDITIONAL_CMAKE_OPTIONS}
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
+		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+		else()
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
 )
diff --git a/tensorflow/contrib/cmake/external/re2.cmake b/tensorflow/contrib/cmake/external/re2.cmake
index cb4ec9c2de3388ef918c75d842dab6e1f4ffee9b..d10f5959f71dd350e6e2bcb81be8882b203fb231 100644
--- a/tensorflow/contrib/cmake/external/re2.cmake
+++ b/tensorflow/contrib/cmake/external/re2.cmake
@@ -38,7 +38,12 @@ ExternalProject_Add(re2
     BUILD_IN_SOURCE 1
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
+		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+		else()
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_INSTALL_PREFIX:STRING=${re2_INSTALL}
-        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-)
\ No newline at end of file
+        -DRE2_BUILD_TESTING:BOOL=OFF
+)
diff --git a/tensorflow/contrib/cmake/external/snappy.cmake b/tensorflow/contrib/cmake/external/snappy.cmake
index 2d2451521c0f9127e2c76e6270694ac21fe8db93..926c271fd9ea6e2a30251aa408bd49859ae95070 100644
--- a/tensorflow/contrib/cmake/external/snappy.cmake
+++ b/tensorflow/contrib/cmake/external/snappy.cmake
@@ -40,11 +40,15 @@ ExternalProject_Add(snappy
     LOG_CONFIGURE ON
     LOG_BUILD ON
     CMAKE_CACHE_ARGS
+		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+		else()
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
         -DSNAPPY_BUILD_TESTS:BOOL=OFF
-        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
 )
 
 # actually enables snappy in the source code
-add_definitions(-DTF_USE_SNAPPY)
+add_definitions(-DTF_USE_SNAPPY)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/sqlite.cmake b/tensorflow/contrib/cmake/external/sqlite.cmake
index 6fa3a576998acef529942ccfab3a6a544795d712..785039a46983747557607562675349c150e064ad 100644
--- a/tensorflow/contrib/cmake/external/sqlite.cmake
+++ b/tensorflow/contrib/cmake/external/sqlite.cmake
@@ -15,7 +15,7 @@
 include (ExternalProject)
 
 set(sqlite_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/sqlite)
-set(sqlite_URL http://www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
+set(sqlite_URL https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip)
 set(sqlite_HASH SHA256=208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4)
 set(sqlite_BUILD ${CMAKE_CURRENT_BINARY_DIR}/sqlite/src/sqlite)
 set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
@@ -53,9 +53,13 @@ else()
         INSTALL_DIR ${sqlite_INSTALL}
         DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
         CMAKE_CACHE_ARGS
+			if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+				-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+			else()
+				-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+			endif()
             -DCMAKE_BUILD_TYPE:STRING=Release
             -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
-            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
             -DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
     )
 
diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake
index c8af611e1eaefdf135551940a66985a4d50b26ed..f10f84336e8b1c0a2c7de7ea1f8b8af7c21f8b51 100644
--- a/tensorflow/contrib/cmake/external/zlib.cmake
+++ b/tensorflow/contrib/cmake/external/zlib.cmake
@@ -42,9 +42,13 @@ ExternalProject_Add(zlib
     BUILD_IN_SOURCE 1
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     CMAKE_CACHE_ARGS
+		if(tensorflow_ENABLE_POSITION_INDEPENDENT_CODE)
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+		else()
+			-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=OFF
+		endif()
         -DCMAKE_BUILD_TYPE:STRING=Release
         -DCMAKE_INSTALL_PREFIX:STRING=${ZLIB_INSTALL}
-	-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
 )
 
 # put zlib includes in the directory where they are expected
diff --git a/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt b/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt
deleted file mode 100644
index 84722c5ca2a9f9253c7a76dd610dde615a176c07..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/cmake/patches/grpc/CMakeLists.txt
+++ /dev/null
@@ -1,14415 +0,0 @@
-# GRPC global cmake file
-# This currently builds C and C++ code.
-# This file has been automatically generated from a template file.
-# Please look at the templates directory instead.
-# This file can be regenerated from the template by running
-# tools/buildgen/generate_projects.sh
-#
-# Copyright 2015 gRPC authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-cmake_minimum_required(VERSION 2.8)
-
-set(PACKAGE_NAME      "grpc")
-set(PACKAGE_VERSION   "1.5.0-dev")
-set(PACKAGE_STRING    "${PACKAGE_NAME} ${PACKAGE_VERSION}")
-set(PACKAGE_TARNAME   "${PACKAGE_NAME}-${PACKAGE_VERSION}")
-set(PACKAGE_BUGREPORT "https://github.com/grpc/grpc/issues/")
-project(${PACKAGE_NAME} C CXX)
-
-set(gRPC_INSTALL_BINDIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables")
-set(gRPC_INSTALL_LIBDIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries")
-set(gRPC_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_PREFIX}/include" CACHE PATH "Installation directory for headers")
-set(gRPC_INSTALL_CMAKEDIR "${CMAKE_INSTALL_PREFIX}/lib/cmake/${PACKAGE_NAME}" CACHE PATH "Installation directory for cmake config files")
-
-# Options
-option(gRPC_BUILD_TESTS "Build tests" OFF)
-
-set(gRPC_INSTALL_default ON)
-if (NOT CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-  # Disable gRPC_INSTALL by default if building as a submodule
-  set(gRPC_INSTALL_default OFF)
-endif()
-set(gRPC_INSTALL ${gRPC_INSTALL_default} CACHE BOOL
-    "Generate installation target: gRPC_ZLIB_PROVIDER, gRPC_CARES_PROVIDER, gRPC_SSL_PROVIDER and gRPC_PROTOBUF_PROVIDER must all be \"package\"")
-
-set(gRPC_ZLIB_PROVIDER "module" CACHE STRING "Provider of zlib library")
-set_property(CACHE gRPC_ZLIB_PROVIDER PROPERTY STRINGS "module" "package")
-
-set(gRPC_CARES_PROVIDER "module" CACHE STRING "Provider of c-ares library")
-set_property(CACHE gRPC_CARES_PROVIDER PROPERTY STRINGS "module" "package")
-
-set(gRPC_SSL_PROVIDER "module" CACHE STRING "Provider of ssl library")
-set_property(CACHE gRPC_SSL_PROVIDER PROPERTY STRINGS "module" "package")
-
-set(gRPC_PROTOBUF_PROVIDER "module" CACHE STRING "Provider of protobuf library")
-set_property(CACHE gRPC_PROTOBUF_PROVIDER PROPERTY STRINGS "module" "package")
-
-set(gRPC_PROTOBUF_PACKAGE_TYPE "" CACHE STRING "Algorithm for searching protobuf package")
-set_property(CACHE gRPC_PROTOBUF_PACKAGE_TYPE PROPERTY STRINGS "CONFIG" "MODULE")
-
-set(gRPC_GFLAGS_PROVIDER "module" CACHE STRING "Provider of gflags library")
-set_property(CACHE gRPC_GFLAGS_PROVIDER PROPERTY STRINGS "module" "package")
-
-set(gRPC_BENCHMARK_PROVIDER "module" CACHE STRING "Provider of benchmark library")
-set_property(CACHE gRPC_BENCHMARK_PROVIDER PROPERTY STRINGS "module" "package")
-
-set(gRPC_USE_PROTO_LITE OFF CACHE BOOL "Use the protobuf-lite library")
-
-if(UNIX)
-  if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
-    set(_gRPC_PLATFORM_LINUX ON)
-  elseif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(_gRPC_PLATFORM_MAC ON)
-  else()
-    set(_gRPC_PLATFORM_POSIX ON)
-  endif()
-endif()
-if(WIN32)
-  set(_gRPC_PLATFORM_WINDOWS ON)
-endif()
-
-set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
-
-if (MSVC)
-  include(cmake/msvc_static_runtime.cmake)
-  add_definitions(-D_WIN32_WINNT=0x600 -D_SCL_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_WARNINGS -D_WINSOCK_DEPRECATED_NO_WARNINGS)
-  # needed to compile protobuf
-  add_definitions(/wd4065 /wd4506)
-  # TODO(jtattermusch): revisit C4267 occurrences throughout the code
-  add_definitions(/wd4267)
-endif()
-
-if (gRPC_USE_PROTO_LITE)
-  set(_gRPC_PROTOBUF_LIBRARY_NAME "libprotobuf-lite")
-  add_definitions("-DGRPC_USE_PROTO_LITE")
-else()
-  set(_gRPC_PROTOBUF_LIBRARY_NAME "libprotobuf")
-endif()
-
-if("${gRPC_ZLIB_PROVIDER}" STREQUAL "module")
-  if(NOT ZLIB_ROOT_DIR)
-    set(ZLIB_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)
-  endif()
-  set(ZLIB_INCLUDE_DIR "${ZLIB_ROOT_DIR}")
-  if(EXISTS "${ZLIB_ROOT_DIR}/CMakeLists.txt")
-      # TODO(jtattermusch): workaround for https://github.com/madler/zlib/issues/218
-      include_directories(${ZLIB_INCLUDE_DIR})
-
-      add_subdirectory(${ZLIB_ROOT_DIR} third_party/zlib)
-      if(TARGET zlibstatic)
-          set(_gRPC_ZLIB_LIBRARIES zlibstatic)
-      endif()
-  else()
-      message(WARNING "gRPC_ZLIB_PROVIDER is \"module\" but ZLIB_ROOT_DIR is wrong")
-  endif()
-  if(gRPC_INSTALL)
-    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_ZLIB_PROVIDER is \"module\"")
-    set(gRPC_INSTALL FALSE)
-  endif()
-elseif("${gRPC_ZLIB_PROVIDER}" STREQUAL "package")
-  find_package(ZLIB)
-  if(TARGET ZLIB::ZLIB)
-    set(_gRPC_ZLIB_LIBRARIES ZLIB::ZLIB)
-  endif()
-  set(_gRPC_FIND_ZLIB "if(NOT ZLIB_FOUND)\n  find_package(ZLIB)\nendif()")
-endif()
-
-if("${gRPC_CARES_PROVIDER}" STREQUAL "module")
-  if(NOT CARES_ROOT_DIR)
-    set(CARES_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/c-ares)
-  endif()
-  string(TOLOWER ${CMAKE_SYSTEM_NAME} CARES_SYSTEM_NAME)
-  set(CARES_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cares/cares")
-  set(CARES_BUILD_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cares")
-  set(CARES_PLATFORM_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cares/config_${CARES_SYSTEM_NAME}")
-  if(EXISTS "${CARES_ROOT_DIR}/CMakeLists.txt")
-    if("${CARES_SYSTEM_NAME}" MATCHES "windows")
-      add_definitions(-DCARES_STATICLIB=1)
-      add_definitions(-DWIN32_LEAN_AND_MEAN=1)
-    else()
-      add_definitions(-DHAVE_CONFIG_H=1)
-      add_definitions(-D_GNU_SOURCE=1)
-    endif()
-    add_subdirectory(src/c-ares third_party/cares)
-    if(TARGET cares)
-        set(_gRPC_CARES_LIBRARIES cares)
-    endif()
-  else()
-    message(WARNING "gRPC_CARES_PROVIDER is \"module\" but CARES_ROOT_DIR is wrong")
-  endif()
-  if(gRPC_INSTALL)
-    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_CARES_PROVIDER is \"module\"")
-    set(gRPC_INSTALL FALSE)
-  endif()
-elseif("${gRPC_CARES_PROVIDER}" STREQUAL "package")
-  find_package(c-ares CONFIG)
-  if(TARGET c-ares::cares)
-    set(_gRPC_CARES_LIBRARIES c-ares::cares)
-  endif()
-  set(_gRPC_FIND_CARES "if(NOT c-ares_FOUND)\n  find_package(c-ares CONFIG)\nendif()")
-endif()
-
-if("${gRPC_PROTOBUF_PROVIDER}" STREQUAL "module")
-  # Building the protobuf tests require gmock what is not part of a standard protobuf checkout.
-  # Disable them unless they are explicitly requested from the cmake command line (when we assume
-  # gmock is downloaded to the right location inside protobuf).
-  if(NOT protobuf_BUILD_TESTS)
-    set(protobuf_BUILD_TESTS OFF CACHE BOOL "Build protobuf tests")
-  endif()
-  # Disable building protobuf with zlib. Building protobuf with zlib breaks
-  # the build if zlib is not installed on the system.
-  if(NOT protobuf_WITH_ZLIB)
-    set(protobuf_WITH_ZLIB OFF CACHE BOOL "Build protobuf with zlib.")
-  endif()
-  if(NOT PROTOBUF_ROOT_DIR)
-    set(PROTOBUF_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
-  endif()
-  set(PROTOBUF_WELLKNOWN_IMPORT_DIR ${PROTOBUF_ROOT_DIR}/src)
-  if(EXISTS "${PROTOBUF_ROOT_DIR}/cmake/CMakeLists.txt")
-    set(protobuf_MSVC_STATIC_RUNTIME OFF CACHE BOOL "Link static runtime libraries")
-    add_subdirectory(${PROTOBUF_ROOT_DIR}/cmake third_party/protobuf)
-    if(TARGET ${_gRPC_PROTOBUF_LIBRARY_NAME})
-      set(_gRPC_PROTOBUF_LIBRARIES ${_gRPC_PROTOBUF_LIBRARY_NAME})
-    endif()
-    if(TARGET libprotoc)
-      set(_gRPC_PROTOBUF_PROTOC_LIBRARIES libprotoc)
-    endif()
-    if(TARGET protoc)
-      set(_gRPC_PROTOBUF_PROTOC protoc)
-    endif()
-  else()
-      message(WARNING "gRPC_PROTOBUF_PROVIDER is \"module\" but PROTOBUF_ROOT_DIR is wrong")
-  endif()
-  if(gRPC_INSTALL)
-    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_PROTOBUF_PROVIDER is \"module\"")
-    set(gRPC_INSTALL FALSE)
-  endif()
-elseif("${gRPC_PROTOBUF_PROVIDER}" STREQUAL "package")
-  find_package(Protobuf ${gRPC_PROTOBUF_PACKAGE_TYPE})
-  if(Protobuf_FOUND OR PROTOBUF_FOUND)
-    if(TARGET protobuf::${_gRPC_PROTOBUF_LIBRARY_NAME})
-      set(_gRPC_PROTOBUF_LIBRARIES protobuf::${_gRPC_PROTOBUF_LIBRARY_NAME})
-    else()
-      set(_gRPC_PROTOBUF_LIBRARIES ${PROTOBUF_LIBRARIES})
-    endif()
-    if(TARGET protobuf::libprotoc)
-      set(_gRPC_PROTOBUF_PROTOC_LIBRARIES protobuf::libprotoc)
-    else()
-      set(_gRPC_PROTOBUF_PROTOC_LIBRARIES ${PROTOBUF_PROTOC_LIBRARIES})
-    endif()
-    if(TARGET protobuf::protoc)
-      set(_gRPC_PROTOBUF_PROTOC protobuf::protoc)
-    else()
-      set(_gRPC_PROTOBUF_PROTOC ${PROTOBUF_PROTOC_EXECUTABLE})
-    endif()
-    set(_gRPC_FIND_PROTOBUF "if(NOT Protobuf_FOUND AND NOT PROTOBUF_FOUND)\n  find_package(Protobuf ${gRPC_PROTOBUF_PACKAGE_TYPE})\nendif()")
-  endif()
-  if(PROTOBUF_FOUND)
-    include_directories(${PROTOBUF_INCLUDE_DIRS})
-  endif()
-  set(PROTOBUF_WELLKNOWN_IMPORT_DIR /usr/local/include)
-endif()
-
-if("${gRPC_SSL_PROVIDER}" STREQUAL "module")
-  if(NOT BORINGSSL_ROOT_DIR)
-    set(BORINGSSL_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/boringssl)
-  endif()
-  if(EXISTS "${BORINGSSL_ROOT_DIR}/CMakeLists.txt")
-    set(OPENSSL_NO_ASM ON)  # make boringssl buildable with Visual Studio
-    add_subdirectory(${BORINGSSL_ROOT_DIR} third_party/boringssl)
-    if(TARGET ssl)
-      set(_gRPC_SSL_LIBRARIES ssl)
-    endif()
-  else()
-      message(WARNING "gRPC_SSL_PROVIDER is \"module\" but BORINGSSL_ROOT_DIR is wrong")
-  endif()
-  if(gRPC_INSTALL)
-    message(WARNING "gRPC_INSTALL will be forced to FALSE because gRPC_SSL_PROVIDER is \"module\"")
-    set(gRPC_INSTALL FALSE)
-  endif()
-elseif("${gRPC_SSL_PROVIDER}" STREQUAL "package")
-  find_package(OpenSSL)
-  if(TARGET OpenSSL::SSL)
-    set(_gRPC_SSL_LIBRARIES OpenSSL::SSL)
-  endif()
-  set(_gRPC_FIND_SSL "if(NOT OpenSSL_FOUND)\n  find_package(OpenSSL)\nendif()")
-endif()
-
-if("${gRPC_GFLAGS_PROVIDER}" STREQUAL "module")
-  if(NOT GFLAGS_ROOT_DIR)
-    set(GFLAGS_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
-  endif()
-  if(EXISTS "${GFLAGS_ROOT_DIR}/CMakeLists.txt")
-      add_subdirectory(${GFLAGS_ROOT_DIR} third_party/gflags)
-      if(TARGET gflags_static)
-          set(_gRPC_GFLAGS_LIBRARIES gflags_static)
-      endif()
-  else()
-      message(WARNING "gRPC_GFLAGS_PROVIDER is \"module\" but GFLAGS_ROOT_DIR is wrong")
-  endif()
-elseif("${gRPC_GFLAGS_PROVIDER}" STREQUAL "package")
-  find_package(gflags)
-  if(TARGET gflags::gflags)
-    set(_gRPC_GFLAGS_LIBRARIES gflags::gflags)
-  endif()
-  set(_gRPC_FIND_GFLAGS "if(NOT gflags_FOUND)\n  find_package(gflags)\nendif()")
-endif()
-
-if("${gRPC_BENCHMARK_PROVIDER}" STREQUAL "module")
-  if(NOT BENCHMARK_ROOT_DIR)
-    set(BENCHMARK_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/benchmark)
-  endif()
-  if(EXISTS "${BENCHMARK_ROOT_DIR}/CMakeLists.txt")
-      add_subdirectory(${BENCHMARK_ROOT_DIR} third_party/benchmark)
-      if(TARGET benchmark)
-          set(_gRPC_BENCHMARK_LIBRARIES benchmark)
-      endif()
-  else()
-      message(WARNING "gRPC_BENCHMARK_PROVIDER is \"module\" but BENCHMARK_ROOT_DIR is wrong")
-  endif()
-elseif("${gRPC_BENCHMARK_PROVIDER}" STREQUAL "package")
-  find_package(benchmark)
-  if(TARGET benchmark::benchmark)
-    set(_gRPC_BENCHMARK_LIBRARIES benchmark::benchmark)
-  endif()
-  set(_gRPC_FIND_BENCHMARK "if(NOT benchmark_FOUND)\n  find_package(benchmark)\nendif()")
-endif()
-
-if(NOT MSVC)
-  set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -std=c99")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-endif()
-
-if(_gRPC_PLATFORM_MAC)
-  set(_gRPC_ALLTARGETS_LIBRARIES ${CMAKE_DL_LIBS} m pthread)
-elseif(UNIX)
-  set(_gRPC_ALLTARGETS_LIBRARIES ${CMAKE_DL_LIBS} rt m pthread)
-endif()
-
-if(WIN32 AND MSVC)
-  set(_gRPC_BASELIB_LIBRARIES wsock32 ws2_32)
-endif()
-
-# Create directory for generated .proto files
-set(_gRPC_PROTO_GENS_DIR ${CMAKE_BINARY_DIR}/gens)
-file(MAKE_DIRECTORY ${_gRPC_PROTO_GENS_DIR})
-
-#  protobuf_generate_grpc_cpp
-#  --------------------------
-#
-#   Add custom commands to process ``.proto`` files to C++ using protoc and
-#   GRPC plugin::
-#
-#     protobuf_generate_grpc_cpp [<ARGN>...]
-#
-#   ``ARGN``
-#     ``.proto`` files
-#
-function(protobuf_generate_grpc_cpp)
-  if(NOT ARGN)
-    message(SEND_ERROR "Error: PROTOBUF_GENERATE_GRPC_CPP() called without any proto files")
-    return()
-  endif()
-
-  set(_protobuf_include_path -I . -I ${PROTOBUF_WELLKNOWN_IMPORT_DIR})
-  foreach(FIL ${ARGN})
-    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
-    get_filename_component(FIL_WE ${FIL} NAME_WE)
-    file(RELATIVE_PATH REL_FIL ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL})
-    get_filename_component(REL_DIR ${REL_FIL} DIRECTORY)
-    set(RELFIL_WE "${REL_DIR}/${FIL_WE}")
-
-    add_custom_command(
-      OUTPUT "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.cc"
-             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.h"
-             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}_mock.grpc.pb.h"
-             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.cc"
-             "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.h"
-      COMMAND $<TARGET_FILE:${_gRPC_PROTOBUF_PROTOC}>
-      ARGS --grpc_out=generate_mock_code=true:${_gRPC_PROTO_GENS_DIR}
-           --cpp_out=${_gRPC_PROTO_GENS_DIR}
-           --plugin=protoc-gen-grpc=$<TARGET_FILE:grpc_cpp_plugin>
-           ${_protobuf_include_path}
-           ${REL_FIL}
-      DEPENDS ${ABS_FIL} ${_gRPC_PROTOBUF_PROTOC} grpc_cpp_plugin
-      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-      COMMENT "Running gRPC C++ protocol buffer compiler on ${FIL}"
-      VERBATIM)
-
-      set_source_files_properties("${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.cc" "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.grpc.pb.h"  "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}_mock.grpc.pb.h" "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.cc" "${_gRPC_PROTO_GENS_DIR}/${RELFIL_WE}.pb.h" PROPERTIES GENERATED TRUE)
-  endforeach()
-endfunction()
-
-add_custom_target(plugins
-  DEPENDS
-  grpc_cpp_plugin
-  grpc_csharp_plugin
-  grpc_node_plugin
-  grpc_objective_c_plugin
-  grpc_php_plugin
-  grpc_python_plugin
-  grpc_ruby_plugin
-)
-
-add_custom_target(tools_c
-  DEPENDS
-  check_epollexclusive
-  gen_hpack_tables
-  gen_legal_metadata_characters
-  gen_percent_encoding_tables
-  grpc_create_jwt
-  grpc_print_google_default_creds_token
-  grpc_verify_jwt
-)
-
-add_custom_target(tools_cxx
-  DEPENDS
-)
-
-add_custom_target(tools
-  DEPENDS tools_c tools_cxx)
-
-if (gRPC_BUILD_TESTS)
-add_custom_target(buildtests_c)
-add_dependencies(buildtests_c alarm_test)
-add_dependencies(buildtests_c algorithm_test)
-add_dependencies(buildtests_c alloc_test)
-add_dependencies(buildtests_c alpn_test)
-add_dependencies(buildtests_c arena_test)
-add_dependencies(buildtests_c bad_server_response_test)
-add_dependencies(buildtests_c bdp_estimator_test)
-add_dependencies(buildtests_c bin_decoder_test)
-add_dependencies(buildtests_c bin_encoder_test)
-add_dependencies(buildtests_c census_context_test)
-add_dependencies(buildtests_c census_intrusive_hash_map_test)
-add_dependencies(buildtests_c census_resource_test)
-add_dependencies(buildtests_c census_trace_context_test)
-add_dependencies(buildtests_c channel_create_test)
-add_dependencies(buildtests_c chttp2_hpack_encoder_test)
-add_dependencies(buildtests_c chttp2_stream_map_test)
-add_dependencies(buildtests_c chttp2_varint_test)
-add_dependencies(buildtests_c combiner_test)
-add_dependencies(buildtests_c compression_test)
-add_dependencies(buildtests_c concurrent_connectivity_test)
-add_dependencies(buildtests_c connection_refused_test)
-add_dependencies(buildtests_c dns_resolver_connectivity_test)
-add_dependencies(buildtests_c dns_resolver_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c dualstack_socket_test)
-endif()
-add_dependencies(buildtests_c endpoint_pair_test)
-add_dependencies(buildtests_c error_test)
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c ev_epollsig_linux_test)
-endif()
-add_dependencies(buildtests_c fake_resolver_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c fd_conservation_posix_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c fd_posix_test)
-endif()
-add_dependencies(buildtests_c fling_client)
-add_dependencies(buildtests_c fling_server)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c fling_stream_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c fling_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c goaway_server_test)
-endif()
-add_dependencies(buildtests_c gpr_avl_test)
-add_dependencies(buildtests_c gpr_backoff_test)
-add_dependencies(buildtests_c gpr_cmdline_test)
-add_dependencies(buildtests_c gpr_cpu_test)
-add_dependencies(buildtests_c gpr_env_test)
-add_dependencies(buildtests_c gpr_histogram_test)
-add_dependencies(buildtests_c gpr_host_port_test)
-add_dependencies(buildtests_c gpr_log_test)
-add_dependencies(buildtests_c gpr_mpscq_test)
-add_dependencies(buildtests_c gpr_spinlock_test)
-add_dependencies(buildtests_c gpr_stack_lockfree_test)
-add_dependencies(buildtests_c gpr_string_test)
-add_dependencies(buildtests_c gpr_sync_test)
-add_dependencies(buildtests_c gpr_thd_test)
-add_dependencies(buildtests_c gpr_time_test)
-add_dependencies(buildtests_c gpr_tls_test)
-add_dependencies(buildtests_c gpr_useful_test)
-add_dependencies(buildtests_c grpc_auth_context_test)
-add_dependencies(buildtests_c grpc_b64_test)
-add_dependencies(buildtests_c grpc_byte_buffer_reader_test)
-add_dependencies(buildtests_c grpc_channel_args_test)
-add_dependencies(buildtests_c grpc_channel_stack_test)
-add_dependencies(buildtests_c grpc_completion_queue_test)
-add_dependencies(buildtests_c grpc_completion_queue_threading_test)
-add_dependencies(buildtests_c grpc_credentials_test)
-add_dependencies(buildtests_c grpc_fetch_oauth2)
-add_dependencies(buildtests_c grpc_invalid_channel_args_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c grpc_json_token_test)
-endif()
-add_dependencies(buildtests_c grpc_jwt_verifier_test)
-add_dependencies(buildtests_c grpc_security_connector_test)
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c handshake_client)
-endif()
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c handshake_server)
-endif()
-add_dependencies(buildtests_c hpack_parser_test)
-add_dependencies(buildtests_c hpack_table_test)
-add_dependencies(buildtests_c http_parser_test)
-add_dependencies(buildtests_c httpcli_format_request_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c httpcli_test)
-endif()
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c httpscli_test)
-endif()
-add_dependencies(buildtests_c init_test)
-add_dependencies(buildtests_c invalid_call_argument_test)
-add_dependencies(buildtests_c json_rewrite)
-add_dependencies(buildtests_c json_rewrite_test)
-add_dependencies(buildtests_c json_stream_error_test)
-add_dependencies(buildtests_c json_test)
-add_dependencies(buildtests_c lame_client_test)
-add_dependencies(buildtests_c lb_policies_test)
-add_dependencies(buildtests_c load_file_test)
-add_dependencies(buildtests_c memory_profile_client)
-add_dependencies(buildtests_c memory_profile_server)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c memory_profile_test)
-endif()
-add_dependencies(buildtests_c message_compress_test)
-add_dependencies(buildtests_c minimal_stack_is_minimal_test)
-add_dependencies(buildtests_c mlog_test)
-add_dependencies(buildtests_c multiple_server_queues_test)
-add_dependencies(buildtests_c murmur_hash_test)
-add_dependencies(buildtests_c no_server_test)
-add_dependencies(buildtests_c num_external_connectivity_watchers_test)
-add_dependencies(buildtests_c parse_address_test)
-add_dependencies(buildtests_c percent_encoding_test)
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c pollset_set_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c resolve_address_posix_test)
-endif()
-add_dependencies(buildtests_c resolve_address_test)
-add_dependencies(buildtests_c resource_quota_test)
-add_dependencies(buildtests_c secure_channel_create_test)
-add_dependencies(buildtests_c secure_endpoint_test)
-add_dependencies(buildtests_c sequential_connectivity_test)
-add_dependencies(buildtests_c server_chttp2_test)
-add_dependencies(buildtests_c server_test)
-add_dependencies(buildtests_c slice_buffer_test)
-add_dependencies(buildtests_c slice_hash_table_test)
-add_dependencies(buildtests_c slice_string_helpers_test)
-add_dependencies(buildtests_c slice_test)
-add_dependencies(buildtests_c sockaddr_resolver_test)
-add_dependencies(buildtests_c sockaddr_utils_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c socket_utils_test)
-endif()
-add_dependencies(buildtests_c status_conversion_test)
-add_dependencies(buildtests_c stream_compression_test)
-add_dependencies(buildtests_c stream_owned_slice_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c tcp_client_posix_test)
-endif()
-add_dependencies(buildtests_c tcp_client_uv_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c tcp_posix_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c tcp_server_posix_test)
-endif()
-add_dependencies(buildtests_c tcp_server_uv_test)
-add_dependencies(buildtests_c time_averaged_stats_test)
-add_dependencies(buildtests_c timeout_encoding_test)
-add_dependencies(buildtests_c timer_heap_test)
-add_dependencies(buildtests_c timer_list_test)
-add_dependencies(buildtests_c transport_connectivity_state_test)
-add_dependencies(buildtests_c transport_metadata_test)
-add_dependencies(buildtests_c transport_pid_controller_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c transport_security_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c udp_server_test)
-endif()
-add_dependencies(buildtests_c uri_parser_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c wakeup_fd_cv_test)
-endif()
-add_dependencies(buildtests_c public_headers_must_be_c89)
-add_dependencies(buildtests_c badreq_bad_client_test)
-add_dependencies(buildtests_c connection_prefix_bad_client_test)
-add_dependencies(buildtests_c head_of_line_blocking_bad_client_test)
-add_dependencies(buildtests_c headers_bad_client_test)
-add_dependencies(buildtests_c initial_settings_frame_bad_client_test)
-add_dependencies(buildtests_c large_metadata_bad_client_test)
-add_dependencies(buildtests_c server_registered_method_bad_client_test)
-add_dependencies(buildtests_c simple_request_bad_client_test)
-add_dependencies(buildtests_c unknown_frame_bad_client_test)
-add_dependencies(buildtests_c window_overflow_bad_client_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c bad_ssl_cert_server)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c bad_ssl_cert_test)
-endif()
-add_dependencies(buildtests_c h2_census_test)
-add_dependencies(buildtests_c h2_compress_test)
-add_dependencies(buildtests_c h2_fakesec_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c h2_fd_test)
-endif()
-add_dependencies(buildtests_c h2_full_test)
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c h2_full+pipe_test)
-endif()
-add_dependencies(buildtests_c h2_full+trace_test)
-add_dependencies(buildtests_c h2_full+workarounds_test)
-add_dependencies(buildtests_c h2_http_proxy_test)
-add_dependencies(buildtests_c h2_load_reporting_test)
-add_dependencies(buildtests_c h2_oauth2_test)
-add_dependencies(buildtests_c h2_proxy_test)
-add_dependencies(buildtests_c h2_sockpair_test)
-add_dependencies(buildtests_c h2_sockpair+trace_test)
-add_dependencies(buildtests_c h2_sockpair_1byte_test)
-add_dependencies(buildtests_c h2_ssl_test)
-add_dependencies(buildtests_c h2_ssl_cert_test)
-add_dependencies(buildtests_c h2_ssl_proxy_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c h2_uds_test)
-endif()
-add_dependencies(buildtests_c inproc_test)
-add_dependencies(buildtests_c h2_census_nosec_test)
-add_dependencies(buildtests_c h2_compress_nosec_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c h2_fd_nosec_test)
-endif()
-add_dependencies(buildtests_c h2_full_nosec_test)
-if(_gRPC_PLATFORM_LINUX)
-add_dependencies(buildtests_c h2_full+pipe_nosec_test)
-endif()
-add_dependencies(buildtests_c h2_full+trace_nosec_test)
-add_dependencies(buildtests_c h2_full+workarounds_nosec_test)
-add_dependencies(buildtests_c h2_http_proxy_nosec_test)
-add_dependencies(buildtests_c h2_load_reporting_nosec_test)
-add_dependencies(buildtests_c h2_proxy_nosec_test)
-add_dependencies(buildtests_c h2_sockpair_nosec_test)
-add_dependencies(buildtests_c h2_sockpair+trace_nosec_test)
-add_dependencies(buildtests_c h2_sockpair_1byte_nosec_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_c h2_uds_nosec_test)
-endif()
-add_dependencies(buildtests_c inproc_nosec_test)
-add_dependencies(buildtests_c api_fuzzer_one_entry)
-add_dependencies(buildtests_c client_fuzzer_one_entry)
-add_dependencies(buildtests_c hpack_parser_fuzzer_test_one_entry)
-add_dependencies(buildtests_c http_request_fuzzer_test_one_entry)
-add_dependencies(buildtests_c http_response_fuzzer_test_one_entry)
-add_dependencies(buildtests_c json_fuzzer_test_one_entry)
-add_dependencies(buildtests_c nanopb_fuzzer_response_test_one_entry)
-add_dependencies(buildtests_c nanopb_fuzzer_serverlist_test_one_entry)
-add_dependencies(buildtests_c percent_decode_fuzzer_one_entry)
-add_dependencies(buildtests_c percent_encode_fuzzer_one_entry)
-add_dependencies(buildtests_c server_fuzzer_one_entry)
-add_dependencies(buildtests_c ssl_server_fuzzer_one_entry)
-add_dependencies(buildtests_c uri_fuzzer_test_one_entry)
-
-add_custom_target(buildtests_cxx)
-add_dependencies(buildtests_cxx alarm_cpp_test)
-add_dependencies(buildtests_cxx async_end2end_test)
-add_dependencies(buildtests_cxx auth_property_iterator_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_arena)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_call_create)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_chttp2_hpack)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_chttp2_transport)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_closure)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_cq)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_cq_multiple_threads)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_error)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_fullstack_streaming_ping_pong)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_fullstack_streaming_pump)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_fullstack_trickle)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_fullstack_unary_ping_pong)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_metadata)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx bm_pollset)
-endif()
-add_dependencies(buildtests_cxx channel_arguments_test)
-add_dependencies(buildtests_cxx channel_filter_test)
-add_dependencies(buildtests_cxx cli_call_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx client_crash_test)
-endif()
-add_dependencies(buildtests_cxx client_crash_test_server)
-add_dependencies(buildtests_cxx client_lb_end2end_test)
-add_dependencies(buildtests_cxx codegen_test_full)
-add_dependencies(buildtests_cxx codegen_test_minimal)
-add_dependencies(buildtests_cxx credentials_test)
-add_dependencies(buildtests_cxx cxx_byte_buffer_test)
-add_dependencies(buildtests_cxx cxx_slice_test)
-add_dependencies(buildtests_cxx cxx_string_ref_test)
-add_dependencies(buildtests_cxx cxx_time_test)
-add_dependencies(buildtests_cxx end2end_test)
-add_dependencies(buildtests_cxx error_details_test)
-add_dependencies(buildtests_cxx filter_end2end_test)
-add_dependencies(buildtests_cxx generic_end2end_test)
-add_dependencies(buildtests_cxx golden_file_test)
-add_dependencies(buildtests_cxx grpc_cli)
-add_dependencies(buildtests_cxx grpc_tool_test)
-add_dependencies(buildtests_cxx grpclb_api_test)
-add_dependencies(buildtests_cxx grpclb_end2end_test)
-add_dependencies(buildtests_cxx grpclb_test)
-add_dependencies(buildtests_cxx health_service_end2end_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx http2_client)
-endif()
-add_dependencies(buildtests_cxx hybrid_end2end_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx interop_client)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx interop_server)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx interop_test)
-endif()
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx json_run_localhost)
-endif()
-add_dependencies(buildtests_cxx memory_test)
-add_dependencies(buildtests_cxx metrics_client)
-add_dependencies(buildtests_cxx mock_test)
-add_dependencies(buildtests_cxx noop-benchmark)
-add_dependencies(buildtests_cxx proto_server_reflection_test)
-add_dependencies(buildtests_cxx proto_utils_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx qps_interarrival_test)
-endif()
-add_dependencies(buildtests_cxx qps_json_driver)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx qps_openloop_test)
-endif()
-add_dependencies(buildtests_cxx qps_worker)
-add_dependencies(buildtests_cxx reconnect_interop_client)
-add_dependencies(buildtests_cxx reconnect_interop_server)
-add_dependencies(buildtests_cxx secure_auth_context_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx secure_sync_unary_ping_pong_test)
-endif()
-add_dependencies(buildtests_cxx server_builder_plugin_test)
-add_dependencies(buildtests_cxx server_builder_test)
-add_dependencies(buildtests_cxx server_context_test_spouse_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx server_crash_test)
-endif()
-add_dependencies(buildtests_cxx server_crash_test_client)
-add_dependencies(buildtests_cxx server_request_call_test)
-add_dependencies(buildtests_cxx shutdown_test)
-add_dependencies(buildtests_cxx status_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx streaming_throughput_test)
-endif()
-add_dependencies(buildtests_cxx stress_test)
-add_dependencies(buildtests_cxx thread_manager_test)
-add_dependencies(buildtests_cxx thread_stress_test)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-add_dependencies(buildtests_cxx writes_per_rpc_test)
-endif()
-
-add_custom_target(buildtests
-  DEPENDS buildtests_c buildtests_cxx)
-endif (gRPC_BUILD_TESTS)
-
-
-add_library(gpr
-  src/core/lib/profiling/basic_timers.c
-  src/core/lib/profiling/stap_timers.c
-  src/core/lib/support/alloc.c
-  src/core/lib/support/arena.c
-  src/core/lib/support/atm.c
-  src/core/lib/support/avl.c
-  src/core/lib/support/backoff.c
-  src/core/lib/support/cmdline.c
-  src/core/lib/support/cpu_iphone.c
-  src/core/lib/support/cpu_linux.c
-  src/core/lib/support/cpu_posix.c
-  src/core/lib/support/cpu_windows.c
-  src/core/lib/support/env_linux.c
-  src/core/lib/support/env_posix.c
-  src/core/lib/support/env_windows.c
-  src/core/lib/support/histogram.c
-  src/core/lib/support/host_port.c
-  src/core/lib/support/log.c
-  src/core/lib/support/log_android.c
-  src/core/lib/support/log_linux.c
-  src/core/lib/support/log_posix.c
-  src/core/lib/support/log_windows.c
-  src/core/lib/support/mpscq.c
-  src/core/lib/support/murmur_hash.c
-  src/core/lib/support/stack_lockfree.c
-  src/core/lib/support/string.c
-  src/core/lib/support/string_posix.c
-  src/core/lib/support/string_util_windows.c
-  src/core/lib/support/string_windows.c
-  src/core/lib/support/subprocess_posix.c
-  src/core/lib/support/subprocess_windows.c
-  src/core/lib/support/sync.c
-  src/core/lib/support/sync_posix.c
-  src/core/lib/support/sync_windows.c
-  src/core/lib/support/thd.c
-  src/core/lib/support/thd_posix.c
-  src/core/lib/support/thd_windows.c
-  src/core/lib/support/time.c
-  src/core/lib/support/time_posix.c
-  src/core/lib/support/time_precise.c
-  src/core/lib/support/time_windows.c
-  src/core/lib/support/tls_pthread.c
-  src/core/lib/support/tmpfile_msys.c
-  src/core/lib/support/tmpfile_posix.c
-  src/core/lib/support/tmpfile_windows.c
-  src/core/lib/support/wrap_memcpy.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(gpr PROPERTIES COMPILE_PDB_NAME "gpr"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/gpr.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(gpr
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-)
-
-foreach(_hdr
-  include/grpc/support/alloc.h
-  include/grpc/support/atm.h
-  include/grpc/support/atm_gcc_atomic.h
-  include/grpc/support/atm_gcc_sync.h
-  include/grpc/support/atm_windows.h
-  include/grpc/support/avl.h
-  include/grpc/support/cmdline.h
-  include/grpc/support/cpu.h
-  include/grpc/support/histogram.h
-  include/grpc/support/host_port.h
-  include/grpc/support/log.h
-  include/grpc/support/log_windows.h
-  include/grpc/support/port_platform.h
-  include/grpc/support/string_util.h
-  include/grpc/support/subprocess.h
-  include/grpc/support/sync.h
-  include/grpc/support/sync_generic.h
-  include/grpc/support/sync_posix.h
-  include/grpc/support/sync_windows.h
-  include/grpc/support/thd.h
-  include/grpc/support/time.h
-  include/grpc/support/tls.h
-  include/grpc/support/tls_gcc.h
-  include/grpc/support/tls_msvc.h
-  include/grpc/support/tls_pthread.h
-  include/grpc/support/useful.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS gpr EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(gpr_test_util
-  test/core/util/test_config.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(gpr_test_util PROPERTIES COMPILE_PDB_NAME "gpr_test_util"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/gpr_test_util.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(gpr_test_util
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_test_util
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc
-  src/core/lib/surface/init.c
-  src/core/lib/channel/channel_args.c
-  src/core/lib/channel/channel_stack.c
-  src/core/lib/channel/channel_stack_builder.c
-  src/core/lib/channel/connected_channel.c
-  src/core/lib/channel/handshaker.c
-  src/core/lib/channel/handshaker_factory.c
-  src/core/lib/channel/handshaker_registry.c
-  src/core/lib/compression/compression.c
-  src/core/lib/compression/message_compress.c
-  src/core/lib/compression/stream_compression.c
-  src/core/lib/http/format_request.c
-  src/core/lib/http/httpcli.c
-  src/core/lib/http/parser.c
-  src/core/lib/iomgr/closure.c
-  src/core/lib/iomgr/combiner.c
-  src/core/lib/iomgr/endpoint.c
-  src/core/lib/iomgr/endpoint_pair_posix.c
-  src/core/lib/iomgr/endpoint_pair_uv.c
-  src/core/lib/iomgr/endpoint_pair_windows.c
-  src/core/lib/iomgr/error.c
-  src/core/lib/iomgr/ev_epoll1_linux.c
-  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
-  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
-  src/core/lib/iomgr/ev_epollex_linux.c
-  src/core/lib/iomgr/ev_epollsig_linux.c
-  src/core/lib/iomgr/ev_poll_posix.c
-  src/core/lib/iomgr/ev_posix.c
-  src/core/lib/iomgr/ev_windows.c
-  src/core/lib/iomgr/exec_ctx.c
-  src/core/lib/iomgr/executor.c
-  src/core/lib/iomgr/iocp_windows.c
-  src/core/lib/iomgr/iomgr.c
-  src/core/lib/iomgr/iomgr_posix.c
-  src/core/lib/iomgr/iomgr_uv.c
-  src/core/lib/iomgr/iomgr_windows.c
-  src/core/lib/iomgr/is_epollexclusive_available.c
-  src/core/lib/iomgr/load_file.c
-  src/core/lib/iomgr/lockfree_event.c
-  src/core/lib/iomgr/network_status_tracker.c
-  src/core/lib/iomgr/polling_entity.c
-  src/core/lib/iomgr/pollset_set_uv.c
-  src/core/lib/iomgr/pollset_set_windows.c
-  src/core/lib/iomgr/pollset_uv.c
-  src/core/lib/iomgr/pollset_windows.c
-  src/core/lib/iomgr/resolve_address_posix.c
-  src/core/lib/iomgr/resolve_address_uv.c
-  src/core/lib/iomgr/resolve_address_windows.c
-  src/core/lib/iomgr/resource_quota.c
-  src/core/lib/iomgr/sockaddr_utils.c
-  src/core/lib/iomgr/socket_factory_posix.c
-  src/core/lib/iomgr/socket_mutator.c
-  src/core/lib/iomgr/socket_utils_common_posix.c
-  src/core/lib/iomgr/socket_utils_linux.c
-  src/core/lib/iomgr/socket_utils_posix.c
-  src/core/lib/iomgr/socket_utils_uv.c
-  src/core/lib/iomgr/socket_utils_windows.c
-  src/core/lib/iomgr/socket_windows.c
-  src/core/lib/iomgr/tcp_client_posix.c
-  src/core/lib/iomgr/tcp_client_uv.c
-  src/core/lib/iomgr/tcp_client_windows.c
-  src/core/lib/iomgr/tcp_posix.c
-  src/core/lib/iomgr/tcp_server_posix.c
-  src/core/lib/iomgr/tcp_server_utils_posix_common.c
-  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
-  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
-  src/core/lib/iomgr/tcp_server_uv.c
-  src/core/lib/iomgr/tcp_server_windows.c
-  src/core/lib/iomgr/tcp_uv.c
-  src/core/lib/iomgr/tcp_windows.c
-  src/core/lib/iomgr/time_averaged_stats.c
-  src/core/lib/iomgr/timer_generic.c
-  src/core/lib/iomgr/timer_heap.c
-  src/core/lib/iomgr/timer_manager.c
-  src/core/lib/iomgr/timer_uv.c
-  src/core/lib/iomgr/udp_server.c
-  src/core/lib/iomgr/unix_sockets_posix.c
-  src/core/lib/iomgr/unix_sockets_posix_noop.c
-  src/core/lib/iomgr/wakeup_fd_cv.c
-  src/core/lib/iomgr/wakeup_fd_eventfd.c
-  src/core/lib/iomgr/wakeup_fd_nospecial.c
-  src/core/lib/iomgr/wakeup_fd_pipe.c
-  src/core/lib/iomgr/wakeup_fd_posix.c
-  src/core/lib/json/json.c
-  src/core/lib/json/json_reader.c
-  src/core/lib/json/json_string.c
-  src/core/lib/json/json_writer.c
-  src/core/lib/slice/b64.c
-  src/core/lib/slice/percent_encoding.c
-  src/core/lib/slice/slice.c
-  src/core/lib/slice/slice_buffer.c
-  src/core/lib/slice/slice_hash_table.c
-  src/core/lib/slice/slice_intern.c
-  src/core/lib/slice/slice_string_helpers.c
-  src/core/lib/surface/alarm.c
-  src/core/lib/surface/api_trace.c
-  src/core/lib/surface/byte_buffer.c
-  src/core/lib/surface/byte_buffer_reader.c
-  src/core/lib/surface/call.c
-  src/core/lib/surface/call_details.c
-  src/core/lib/surface/call_log_batch.c
-  src/core/lib/surface/channel.c
-  src/core/lib/surface/channel_init.c
-  src/core/lib/surface/channel_ping.c
-  src/core/lib/surface/channel_stack_type.c
-  src/core/lib/surface/completion_queue.c
-  src/core/lib/surface/completion_queue_factory.c
-  src/core/lib/surface/event_string.c
-  src/core/lib/surface/lame_client.cc
-  src/core/lib/surface/metadata_array.c
-  src/core/lib/surface/server.c
-  src/core/lib/surface/validate_metadata.c
-  src/core/lib/surface/version.c
-  src/core/lib/transport/bdp_estimator.c
-  src/core/lib/transport/byte_stream.c
-  src/core/lib/transport/connectivity_state.c
-  src/core/lib/transport/error_utils.c
-  src/core/lib/transport/metadata.c
-  src/core/lib/transport/metadata_batch.c
-  src/core/lib/transport/pid_controller.c
-  src/core/lib/transport/service_config.c
-  src/core/lib/transport/static_metadata.c
-  src/core/lib/transport/status_conversion.c
-  src/core/lib/transport/timeout_encoding.c
-  src/core/lib/transport/transport.c
-  src/core/lib/transport/transport_op_string.c
-  src/core/lib/debug/trace.c
-  src/core/ext/transport/chttp2/server/secure/server_secure_chttp2.c
-  src/core/ext/transport/chttp2/transport/bin_decoder.c
-  src/core/ext/transport/chttp2/transport/bin_encoder.c
-  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
-  src/core/ext/transport/chttp2/transport/chttp2_transport.c
-  src/core/ext/transport/chttp2/transport/frame_data.c
-  src/core/ext/transport/chttp2/transport/frame_goaway.c
-  src/core/ext/transport/chttp2/transport/frame_ping.c
-  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
-  src/core/ext/transport/chttp2/transport/frame_settings.c
-  src/core/ext/transport/chttp2/transport/frame_window_update.c
-  src/core/ext/transport/chttp2/transport/hpack_encoder.c
-  src/core/ext/transport/chttp2/transport/hpack_parser.c
-  src/core/ext/transport/chttp2/transport/hpack_table.c
-  src/core/ext/transport/chttp2/transport/http2_settings.c
-  src/core/ext/transport/chttp2/transport/huffsyms.c
-  src/core/ext/transport/chttp2/transport/incoming_metadata.c
-  src/core/ext/transport/chttp2/transport/parsing.c
-  src/core/ext/transport/chttp2/transport/stream_lists.c
-  src/core/ext/transport/chttp2/transport/stream_map.c
-  src/core/ext/transport/chttp2/transport/varint.c
-  src/core/ext/transport/chttp2/transport/writing.c
-  src/core/ext/transport/chttp2/alpn/alpn.c
-  src/core/ext/filters/http/client/http_client_filter.c
-  src/core/ext/filters/http/http_filters_plugin.c
-  src/core/ext/filters/http/message_compress/message_compress_filter.c
-  src/core/ext/filters/http/server/http_server_filter.c
-  src/core/lib/http/httpcli_security_connector.c
-  src/core/lib/security/context/security_context.c
-  src/core/lib/security/credentials/composite/composite_credentials.c
-  src/core/lib/security/credentials/credentials.c
-  src/core/lib/security/credentials/credentials_metadata.c
-  src/core/lib/security/credentials/fake/fake_credentials.c
-  src/core/lib/security/credentials/google_default/credentials_generic.c
-  src/core/lib/security/credentials/google_default/google_default_credentials.c
-  src/core/lib/security/credentials/iam/iam_credentials.c
-  src/core/lib/security/credentials/jwt/json_token.c
-  src/core/lib/security/credentials/jwt/jwt_credentials.c
-  src/core/lib/security/credentials/jwt/jwt_verifier.c
-  src/core/lib/security/credentials/oauth2/oauth2_credentials.c
-  src/core/lib/security/credentials/plugin/plugin_credentials.c
-  src/core/lib/security/credentials/ssl/ssl_credentials.c
-  src/core/lib/security/transport/client_auth_filter.c
-  src/core/lib/security/transport/lb_targets_info.c
-  src/core/lib/security/transport/secure_endpoint.c
-  src/core/lib/security/transport/security_connector.c
-  src/core/lib/security/transport/security_handshaker.c
-  src/core/lib/security/transport/server_auth_filter.c
-  src/core/lib/security/transport/tsi_error.c
-  src/core/lib/security/util/json_util.c
-  src/core/lib/surface/init_secure.c
-  src/core/tsi/fake_transport_security.c
-  src/core/tsi/gts_transport_security.c
-  src/core/tsi/ssl_transport_security.c
-  src/core/tsi/transport_security.c
-  src/core/tsi/transport_security_adapter.c
-  src/core/ext/transport/chttp2/server/chttp2_server.c
-  src/core/ext/transport/chttp2/client/secure/secure_channel_create.c
-  src/core/ext/filters/client_channel/channel_connectivity.c
-  src/core/ext/filters/client_channel/client_channel.c
-  src/core/ext/filters/client_channel/client_channel_factory.c
-  src/core/ext/filters/client_channel/client_channel_plugin.c
-  src/core/ext/filters/client_channel/connector.c
-  src/core/ext/filters/client_channel/http_connect_handshaker.c
-  src/core/ext/filters/client_channel/http_proxy.c
-  src/core/ext/filters/client_channel/lb_policy.c
-  src/core/ext/filters/client_channel/lb_policy_factory.c
-  src/core/ext/filters/client_channel/lb_policy_registry.c
-  src/core/ext/filters/client_channel/parse_address.c
-  src/core/ext/filters/client_channel/proxy_mapper.c
-  src/core/ext/filters/client_channel/proxy_mapper_registry.c
-  src/core/ext/filters/client_channel/resolver.c
-  src/core/ext/filters/client_channel/resolver_factory.c
-  src/core/ext/filters/client_channel/resolver_registry.c
-  src/core/ext/filters/client_channel/retry_throttle.c
-  src/core/ext/filters/client_channel/subchannel.c
-  src/core/ext/filters/client_channel/subchannel_index.c
-  src/core/ext/filters/client_channel/uri_parser.c
-  src/core/ext/filters/deadline/deadline_filter.c
-  src/core/ext/transport/chttp2/client/chttp2_connector.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
-  src/core/ext/transport/chttp2/client/insecure/channel_create.c
-  src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c
-  src/core/ext/transport/inproc/inproc_plugin.c
-  src/core/ext/transport/inproc/inproc_transport.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_channel_secure.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_client_stats.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c
-  third_party/nanopb/pb_common.c
-  third_party/nanopb/pb_decode.c
-  third_party/nanopb/pb_encode.c
-  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
-  src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.c
-  src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_posix.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_fallback.c
-  src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.c
-  src/core/ext/filters/client_channel/resolver/sockaddr/sockaddr_resolver.c
-  src/core/ext/filters/load_reporting/load_reporting.c
-  src/core/ext/filters/load_reporting/load_reporting_filter.c
-  src/core/ext/census/base_resources.c
-  src/core/ext/census/context.c
-  src/core/ext/census/gen/census.pb.c
-  src/core/ext/census/gen/trace_context.pb.c
-  src/core/ext/census/grpc_context.c
-  src/core/ext/census/grpc_filter.c
-  src/core/ext/census/grpc_plugin.c
-  src/core/ext/census/initialize.c
-  src/core/ext/census/intrusive_hash_map.c
-  src/core/ext/census/mlog.c
-  src/core/ext/census/operation.c
-  src/core/ext/census/placeholders.c
-  src/core/ext/census/resource.c
-  src/core/ext/census/trace_context.c
-  src/core/ext/census/tracing.c
-  src/core/ext/filters/max_age/max_age_filter.c
-  src/core/ext/filters/message_size/message_size_filter.c
-  src/core/ext/filters/workarounds/workaround_cronet_compression_filter.c
-  src/core/ext/filters/workarounds/workaround_utils.c
-  src/core/plugin_registry/grpc_plugin_registry.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc PROPERTIES COMPILE_PDB_NAME "grpc"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ZLIB_LIBRARIES}
-  ${_gRPC_CARES_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-)
-
-foreach(_hdr
-  include/grpc/byte_buffer.h
-  include/grpc/byte_buffer_reader.h
-  include/grpc/compression.h
-  include/grpc/grpc.h
-  include/grpc/grpc_posix.h
-  include/grpc/grpc_security_constants.h
-  include/grpc/load_reporting.h
-  include/grpc/slice.h
-  include/grpc/slice_buffer.h
-  include/grpc/status.h
-  include/grpc/support/workaround_list.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-  include/grpc/grpc_security.h
-  include/grpc/census.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_library(grpc_cronet
-  src/core/lib/surface/init.c
-  src/core/lib/channel/channel_args.c
-  src/core/lib/channel/channel_stack.c
-  src/core/lib/channel/channel_stack_builder.c
-  src/core/lib/channel/connected_channel.c
-  src/core/lib/channel/handshaker.c
-  src/core/lib/channel/handshaker_factory.c
-  src/core/lib/channel/handshaker_registry.c
-  src/core/lib/compression/compression.c
-  src/core/lib/compression/message_compress.c
-  src/core/lib/compression/stream_compression.c
-  src/core/lib/http/format_request.c
-  src/core/lib/http/httpcli.c
-  src/core/lib/http/parser.c
-  src/core/lib/iomgr/closure.c
-  src/core/lib/iomgr/combiner.c
-  src/core/lib/iomgr/endpoint.c
-  src/core/lib/iomgr/endpoint_pair_posix.c
-  src/core/lib/iomgr/endpoint_pair_uv.c
-  src/core/lib/iomgr/endpoint_pair_windows.c
-  src/core/lib/iomgr/error.c
-  src/core/lib/iomgr/ev_epoll1_linux.c
-  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
-  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
-  src/core/lib/iomgr/ev_epollex_linux.c
-  src/core/lib/iomgr/ev_epollsig_linux.c
-  src/core/lib/iomgr/ev_poll_posix.c
-  src/core/lib/iomgr/ev_posix.c
-  src/core/lib/iomgr/ev_windows.c
-  src/core/lib/iomgr/exec_ctx.c
-  src/core/lib/iomgr/executor.c
-  src/core/lib/iomgr/iocp_windows.c
-  src/core/lib/iomgr/iomgr.c
-  src/core/lib/iomgr/iomgr_posix.c
-  src/core/lib/iomgr/iomgr_uv.c
-  src/core/lib/iomgr/iomgr_windows.c
-  src/core/lib/iomgr/is_epollexclusive_available.c
-  src/core/lib/iomgr/load_file.c
-  src/core/lib/iomgr/lockfree_event.c
-  src/core/lib/iomgr/network_status_tracker.c
-  src/core/lib/iomgr/polling_entity.c
-  src/core/lib/iomgr/pollset_set_uv.c
-  src/core/lib/iomgr/pollset_set_windows.c
-  src/core/lib/iomgr/pollset_uv.c
-  src/core/lib/iomgr/pollset_windows.c
-  src/core/lib/iomgr/resolve_address_posix.c
-  src/core/lib/iomgr/resolve_address_uv.c
-  src/core/lib/iomgr/resolve_address_windows.c
-  src/core/lib/iomgr/resource_quota.c
-  src/core/lib/iomgr/sockaddr_utils.c
-  src/core/lib/iomgr/socket_factory_posix.c
-  src/core/lib/iomgr/socket_mutator.c
-  src/core/lib/iomgr/socket_utils_common_posix.c
-  src/core/lib/iomgr/socket_utils_linux.c
-  src/core/lib/iomgr/socket_utils_posix.c
-  src/core/lib/iomgr/socket_utils_uv.c
-  src/core/lib/iomgr/socket_utils_windows.c
-  src/core/lib/iomgr/socket_windows.c
-  src/core/lib/iomgr/tcp_client_posix.c
-  src/core/lib/iomgr/tcp_client_uv.c
-  src/core/lib/iomgr/tcp_client_windows.c
-  src/core/lib/iomgr/tcp_posix.c
-  src/core/lib/iomgr/tcp_server_posix.c
-  src/core/lib/iomgr/tcp_server_utils_posix_common.c
-  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
-  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
-  src/core/lib/iomgr/tcp_server_uv.c
-  src/core/lib/iomgr/tcp_server_windows.c
-  src/core/lib/iomgr/tcp_uv.c
-  src/core/lib/iomgr/tcp_windows.c
-  src/core/lib/iomgr/time_averaged_stats.c
-  src/core/lib/iomgr/timer_generic.c
-  src/core/lib/iomgr/timer_heap.c
-  src/core/lib/iomgr/timer_manager.c
-  src/core/lib/iomgr/timer_uv.c
-  src/core/lib/iomgr/udp_server.c
-  src/core/lib/iomgr/unix_sockets_posix.c
-  src/core/lib/iomgr/unix_sockets_posix_noop.c
-  src/core/lib/iomgr/wakeup_fd_cv.c
-  src/core/lib/iomgr/wakeup_fd_eventfd.c
-  src/core/lib/iomgr/wakeup_fd_nospecial.c
-  src/core/lib/iomgr/wakeup_fd_pipe.c
-  src/core/lib/iomgr/wakeup_fd_posix.c
-  src/core/lib/json/json.c
-  src/core/lib/json/json_reader.c
-  src/core/lib/json/json_string.c
-  src/core/lib/json/json_writer.c
-  src/core/lib/slice/b64.c
-  src/core/lib/slice/percent_encoding.c
-  src/core/lib/slice/slice.c
-  src/core/lib/slice/slice_buffer.c
-  src/core/lib/slice/slice_hash_table.c
-  src/core/lib/slice/slice_intern.c
-  src/core/lib/slice/slice_string_helpers.c
-  src/core/lib/surface/alarm.c
-  src/core/lib/surface/api_trace.c
-  src/core/lib/surface/byte_buffer.c
-  src/core/lib/surface/byte_buffer_reader.c
-  src/core/lib/surface/call.c
-  src/core/lib/surface/call_details.c
-  src/core/lib/surface/call_log_batch.c
-  src/core/lib/surface/channel.c
-  src/core/lib/surface/channel_init.c
-  src/core/lib/surface/channel_ping.c
-  src/core/lib/surface/channel_stack_type.c
-  src/core/lib/surface/completion_queue.c
-  src/core/lib/surface/completion_queue_factory.c
-  src/core/lib/surface/event_string.c
-  src/core/lib/surface/lame_client.cc
-  src/core/lib/surface/metadata_array.c
-  src/core/lib/surface/server.c
-  src/core/lib/surface/validate_metadata.c
-  src/core/lib/surface/version.c
-  src/core/lib/transport/bdp_estimator.c
-  src/core/lib/transport/byte_stream.c
-  src/core/lib/transport/connectivity_state.c
-  src/core/lib/transport/error_utils.c
-  src/core/lib/transport/metadata.c
-  src/core/lib/transport/metadata_batch.c
-  src/core/lib/transport/pid_controller.c
-  src/core/lib/transport/service_config.c
-  src/core/lib/transport/static_metadata.c
-  src/core/lib/transport/status_conversion.c
-  src/core/lib/transport/timeout_encoding.c
-  src/core/lib/transport/transport.c
-  src/core/lib/transport/transport_op_string.c
-  src/core/lib/debug/trace.c
-  src/core/ext/transport/cronet/client/secure/cronet_channel_create.c
-  src/core/ext/transport/cronet/transport/cronet_api_dummy.c
-  src/core/ext/transport/cronet/transport/cronet_transport.c
-  src/core/ext/transport/chttp2/client/secure/secure_channel_create.c
-  src/core/ext/transport/chttp2/transport/bin_decoder.c
-  src/core/ext/transport/chttp2/transport/bin_encoder.c
-  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
-  src/core/ext/transport/chttp2/transport/chttp2_transport.c
-  src/core/ext/transport/chttp2/transport/frame_data.c
-  src/core/ext/transport/chttp2/transport/frame_goaway.c
-  src/core/ext/transport/chttp2/transport/frame_ping.c
-  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
-  src/core/ext/transport/chttp2/transport/frame_settings.c
-  src/core/ext/transport/chttp2/transport/frame_window_update.c
-  src/core/ext/transport/chttp2/transport/hpack_encoder.c
-  src/core/ext/transport/chttp2/transport/hpack_parser.c
-  src/core/ext/transport/chttp2/transport/hpack_table.c
-  src/core/ext/transport/chttp2/transport/http2_settings.c
-  src/core/ext/transport/chttp2/transport/huffsyms.c
-  src/core/ext/transport/chttp2/transport/incoming_metadata.c
-  src/core/ext/transport/chttp2/transport/parsing.c
-  src/core/ext/transport/chttp2/transport/stream_lists.c
-  src/core/ext/transport/chttp2/transport/stream_map.c
-  src/core/ext/transport/chttp2/transport/varint.c
-  src/core/ext/transport/chttp2/transport/writing.c
-  src/core/ext/transport/chttp2/alpn/alpn.c
-  src/core/ext/filters/http/client/http_client_filter.c
-  src/core/ext/filters/http/http_filters_plugin.c
-  src/core/ext/filters/http/message_compress/message_compress_filter.c
-  src/core/ext/filters/http/server/http_server_filter.c
-  src/core/ext/filters/client_channel/channel_connectivity.c
-  src/core/ext/filters/client_channel/client_channel.c
-  src/core/ext/filters/client_channel/client_channel_factory.c
-  src/core/ext/filters/client_channel/client_channel_plugin.c
-  src/core/ext/filters/client_channel/connector.c
-  src/core/ext/filters/client_channel/http_connect_handshaker.c
-  src/core/ext/filters/client_channel/http_proxy.c
-  src/core/ext/filters/client_channel/lb_policy.c
-  src/core/ext/filters/client_channel/lb_policy_factory.c
-  src/core/ext/filters/client_channel/lb_policy_registry.c
-  src/core/ext/filters/client_channel/parse_address.c
-  src/core/ext/filters/client_channel/proxy_mapper.c
-  src/core/ext/filters/client_channel/proxy_mapper_registry.c
-  src/core/ext/filters/client_channel/resolver.c
-  src/core/ext/filters/client_channel/resolver_factory.c
-  src/core/ext/filters/client_channel/resolver_registry.c
-  src/core/ext/filters/client_channel/retry_throttle.c
-  src/core/ext/filters/client_channel/subchannel.c
-  src/core/ext/filters/client_channel/subchannel_index.c
-  src/core/ext/filters/client_channel/uri_parser.c
-  src/core/ext/filters/deadline/deadline_filter.c
-  src/core/lib/http/httpcli_security_connector.c
-  src/core/lib/security/context/security_context.c
-  src/core/lib/security/credentials/composite/composite_credentials.c
-  src/core/lib/security/credentials/credentials.c
-  src/core/lib/security/credentials/credentials_metadata.c
-  src/core/lib/security/credentials/fake/fake_credentials.c
-  src/core/lib/security/credentials/google_default/credentials_generic.c
-  src/core/lib/security/credentials/google_default/google_default_credentials.c
-  src/core/lib/security/credentials/iam/iam_credentials.c
-  src/core/lib/security/credentials/jwt/json_token.c
-  src/core/lib/security/credentials/jwt/jwt_credentials.c
-  src/core/lib/security/credentials/jwt/jwt_verifier.c
-  src/core/lib/security/credentials/oauth2/oauth2_credentials.c
-  src/core/lib/security/credentials/plugin/plugin_credentials.c
-  src/core/lib/security/credentials/ssl/ssl_credentials.c
-  src/core/lib/security/transport/client_auth_filter.c
-  src/core/lib/security/transport/lb_targets_info.c
-  src/core/lib/security/transport/secure_endpoint.c
-  src/core/lib/security/transport/security_connector.c
-  src/core/lib/security/transport/security_handshaker.c
-  src/core/lib/security/transport/server_auth_filter.c
-  src/core/lib/security/transport/tsi_error.c
-  src/core/lib/security/util/json_util.c
-  src/core/lib/surface/init_secure.c
-  src/core/tsi/fake_transport_security.c
-  src/core/tsi/gts_transport_security.c
-  src/core/tsi/ssl_transport_security.c
-  src/core/tsi/transport_security.c
-  src/core/tsi/transport_security_adapter.c
-  src/core/ext/transport/chttp2/client/chttp2_connector.c
-  src/core/ext/filters/load_reporting/load_reporting.c
-  src/core/ext/filters/load_reporting/load_reporting_filter.c
-  src/core/plugin_registry/grpc_cronet_plugin_registry.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_cronet PROPERTIES COMPILE_PDB_NAME "grpc_cronet"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_cronet.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_cronet
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_cronet
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ZLIB_LIBRARIES}
-  ${_gRPC_CARES_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-)
-
-foreach(_hdr
-  include/grpc/byte_buffer.h
-  include/grpc/byte_buffer_reader.h
-  include/grpc/compression.h
-  include/grpc/grpc.h
-  include/grpc/grpc_posix.h
-  include/grpc/grpc_security_constants.h
-  include/grpc/load_reporting.h
-  include/grpc/slice.h
-  include/grpc/slice_buffer.h
-  include/grpc/status.h
-  include/grpc/support/workaround_list.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-  include/grpc/grpc_cronet.h
-  include/grpc/grpc_security.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_cronet EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc_test_util
-  test/core/end2end/data/client_certs.c
-  test/core/end2end/data/server1_cert.c
-  test/core/end2end/data/server1_key.c
-  test/core/end2end/data/test_root_cert.c
-  test/core/security/oauth2_utils.c
-  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
-  test/core/end2end/cq_verifier.c
-  test/core/end2end/fixtures/http_proxy_fixture.c
-  test/core/end2end/fixtures/proxy.c
-  test/core/iomgr/endpoint_tests.c
-  test/core/util/debugger_macros.c
-  test/core/util/grpc_profiler.c
-  test/core/util/memory_counters.c
-  test/core/util/mock_endpoint.c
-  test/core/util/parse_hexstring.c
-  test/core/util/passthru_endpoint.c
-  test/core/util/port.c
-  test/core/util/port_server_client.c
-  test/core/util/slice_splitter.c
-  test/core/util/trickle_endpoint.c
-  src/core/lib/channel/channel_args.c
-  src/core/lib/channel/channel_stack.c
-  src/core/lib/channel/channel_stack_builder.c
-  src/core/lib/channel/connected_channel.c
-  src/core/lib/channel/handshaker.c
-  src/core/lib/channel/handshaker_factory.c
-  src/core/lib/channel/handshaker_registry.c
-  src/core/lib/compression/compression.c
-  src/core/lib/compression/message_compress.c
-  src/core/lib/compression/stream_compression.c
-  src/core/lib/http/format_request.c
-  src/core/lib/http/httpcli.c
-  src/core/lib/http/parser.c
-  src/core/lib/iomgr/closure.c
-  src/core/lib/iomgr/combiner.c
-  src/core/lib/iomgr/endpoint.c
-  src/core/lib/iomgr/endpoint_pair_posix.c
-  src/core/lib/iomgr/endpoint_pair_uv.c
-  src/core/lib/iomgr/endpoint_pair_windows.c
-  src/core/lib/iomgr/error.c
-  src/core/lib/iomgr/ev_epoll1_linux.c
-  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
-  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
-  src/core/lib/iomgr/ev_epollex_linux.c
-  src/core/lib/iomgr/ev_epollsig_linux.c
-  src/core/lib/iomgr/ev_poll_posix.c
-  src/core/lib/iomgr/ev_posix.c
-  src/core/lib/iomgr/ev_windows.c
-  src/core/lib/iomgr/exec_ctx.c
-  src/core/lib/iomgr/executor.c
-  src/core/lib/iomgr/iocp_windows.c
-  src/core/lib/iomgr/iomgr.c
-  src/core/lib/iomgr/iomgr_posix.c
-  src/core/lib/iomgr/iomgr_uv.c
-  src/core/lib/iomgr/iomgr_windows.c
-  src/core/lib/iomgr/is_epollexclusive_available.c
-  src/core/lib/iomgr/load_file.c
-  src/core/lib/iomgr/lockfree_event.c
-  src/core/lib/iomgr/network_status_tracker.c
-  src/core/lib/iomgr/polling_entity.c
-  src/core/lib/iomgr/pollset_set_uv.c
-  src/core/lib/iomgr/pollset_set_windows.c
-  src/core/lib/iomgr/pollset_uv.c
-  src/core/lib/iomgr/pollset_windows.c
-  src/core/lib/iomgr/resolve_address_posix.c
-  src/core/lib/iomgr/resolve_address_uv.c
-  src/core/lib/iomgr/resolve_address_windows.c
-  src/core/lib/iomgr/resource_quota.c
-  src/core/lib/iomgr/sockaddr_utils.c
-  src/core/lib/iomgr/socket_factory_posix.c
-  src/core/lib/iomgr/socket_mutator.c
-  src/core/lib/iomgr/socket_utils_common_posix.c
-  src/core/lib/iomgr/socket_utils_linux.c
-  src/core/lib/iomgr/socket_utils_posix.c
-  src/core/lib/iomgr/socket_utils_uv.c
-  src/core/lib/iomgr/socket_utils_windows.c
-  src/core/lib/iomgr/socket_windows.c
-  src/core/lib/iomgr/tcp_client_posix.c
-  src/core/lib/iomgr/tcp_client_uv.c
-  src/core/lib/iomgr/tcp_client_windows.c
-  src/core/lib/iomgr/tcp_posix.c
-  src/core/lib/iomgr/tcp_server_posix.c
-  src/core/lib/iomgr/tcp_server_utils_posix_common.c
-  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
-  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
-  src/core/lib/iomgr/tcp_server_uv.c
-  src/core/lib/iomgr/tcp_server_windows.c
-  src/core/lib/iomgr/tcp_uv.c
-  src/core/lib/iomgr/tcp_windows.c
-  src/core/lib/iomgr/time_averaged_stats.c
-  src/core/lib/iomgr/timer_generic.c
-  src/core/lib/iomgr/timer_heap.c
-  src/core/lib/iomgr/timer_manager.c
-  src/core/lib/iomgr/timer_uv.c
-  src/core/lib/iomgr/udp_server.c
-  src/core/lib/iomgr/unix_sockets_posix.c
-  src/core/lib/iomgr/unix_sockets_posix_noop.c
-  src/core/lib/iomgr/wakeup_fd_cv.c
-  src/core/lib/iomgr/wakeup_fd_eventfd.c
-  src/core/lib/iomgr/wakeup_fd_nospecial.c
-  src/core/lib/iomgr/wakeup_fd_pipe.c
-  src/core/lib/iomgr/wakeup_fd_posix.c
-  src/core/lib/json/json.c
-  src/core/lib/json/json_reader.c
-  src/core/lib/json/json_string.c
-  src/core/lib/json/json_writer.c
-  src/core/lib/slice/b64.c
-  src/core/lib/slice/percent_encoding.c
-  src/core/lib/slice/slice.c
-  src/core/lib/slice/slice_buffer.c
-  src/core/lib/slice/slice_hash_table.c
-  src/core/lib/slice/slice_intern.c
-  src/core/lib/slice/slice_string_helpers.c
-  src/core/lib/surface/alarm.c
-  src/core/lib/surface/api_trace.c
-  src/core/lib/surface/byte_buffer.c
-  src/core/lib/surface/byte_buffer_reader.c
-  src/core/lib/surface/call.c
-  src/core/lib/surface/call_details.c
-  src/core/lib/surface/call_log_batch.c
-  src/core/lib/surface/channel.c
-  src/core/lib/surface/channel_init.c
-  src/core/lib/surface/channel_ping.c
-  src/core/lib/surface/channel_stack_type.c
-  src/core/lib/surface/completion_queue.c
-  src/core/lib/surface/completion_queue_factory.c
-  src/core/lib/surface/event_string.c
-  src/core/lib/surface/lame_client.cc
-  src/core/lib/surface/metadata_array.c
-  src/core/lib/surface/server.c
-  src/core/lib/surface/validate_metadata.c
-  src/core/lib/surface/version.c
-  src/core/lib/transport/bdp_estimator.c
-  src/core/lib/transport/byte_stream.c
-  src/core/lib/transport/connectivity_state.c
-  src/core/lib/transport/error_utils.c
-  src/core/lib/transport/metadata.c
-  src/core/lib/transport/metadata_batch.c
-  src/core/lib/transport/pid_controller.c
-  src/core/lib/transport/service_config.c
-  src/core/lib/transport/static_metadata.c
-  src/core/lib/transport/status_conversion.c
-  src/core/lib/transport/timeout_encoding.c
-  src/core/lib/transport/transport.c
-  src/core/lib/transport/transport_op_string.c
-  src/core/lib/debug/trace.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_test_util PROPERTIES COMPILE_PDB_NAME "grpc_test_util"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_test_util.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_test_util
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_test_util
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-  grpc
-)
-
-foreach(_hdr
-  include/grpc/byte_buffer.h
-  include/grpc/byte_buffer_reader.h
-  include/grpc/compression.h
-  include/grpc/grpc.h
-  include/grpc/grpc_posix.h
-  include/grpc/grpc_security_constants.h
-  include/grpc/load_reporting.h
-  include/grpc/slice.h
-  include/grpc/slice_buffer.h
-  include/grpc/status.h
-  include/grpc/support/workaround_list.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc_test_util_unsecure
-  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
-  test/core/end2end/cq_verifier.c
-  test/core/end2end/fixtures/http_proxy_fixture.c
-  test/core/end2end/fixtures/proxy.c
-  test/core/iomgr/endpoint_tests.c
-  test/core/util/debugger_macros.c
-  test/core/util/grpc_profiler.c
-  test/core/util/memory_counters.c
-  test/core/util/mock_endpoint.c
-  test/core/util/parse_hexstring.c
-  test/core/util/passthru_endpoint.c
-  test/core/util/port.c
-  test/core/util/port_server_client.c
-  test/core/util/slice_splitter.c
-  test/core/util/trickle_endpoint.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_test_util_unsecure PROPERTIES COMPILE_PDB_NAME "grpc_test_util_unsecure"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_test_util_unsecure.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_test_util_unsecure
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_test_util_unsecure
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-  gpr_test_util
-  grpc_unsecure
-  grpc
-)
-
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc_unsecure
-  src/core/lib/surface/init.c
-  src/core/lib/surface/init_unsecure.c
-  src/core/lib/channel/channel_args.c
-  src/core/lib/channel/channel_stack.c
-  src/core/lib/channel/channel_stack_builder.c
-  src/core/lib/channel/connected_channel.c
-  src/core/lib/channel/handshaker.c
-  src/core/lib/channel/handshaker_factory.c
-  src/core/lib/channel/handshaker_registry.c
-  src/core/lib/compression/compression.c
-  src/core/lib/compression/message_compress.c
-  src/core/lib/compression/stream_compression.c
-  src/core/lib/http/format_request.c
-  src/core/lib/http/httpcli.c
-  src/core/lib/http/parser.c
-  src/core/lib/iomgr/closure.c
-  src/core/lib/iomgr/combiner.c
-  src/core/lib/iomgr/endpoint.c
-  src/core/lib/iomgr/endpoint_pair_posix.c
-  src/core/lib/iomgr/endpoint_pair_uv.c
-  src/core/lib/iomgr/endpoint_pair_windows.c
-  src/core/lib/iomgr/error.c
-  src/core/lib/iomgr/ev_epoll1_linux.c
-  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
-  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
-  src/core/lib/iomgr/ev_epollex_linux.c
-  src/core/lib/iomgr/ev_epollsig_linux.c
-  src/core/lib/iomgr/ev_poll_posix.c
-  src/core/lib/iomgr/ev_posix.c
-  src/core/lib/iomgr/ev_windows.c
-  src/core/lib/iomgr/exec_ctx.c
-  src/core/lib/iomgr/executor.c
-  src/core/lib/iomgr/iocp_windows.c
-  src/core/lib/iomgr/iomgr.c
-  src/core/lib/iomgr/iomgr_posix.c
-  src/core/lib/iomgr/iomgr_uv.c
-  src/core/lib/iomgr/iomgr_windows.c
-  src/core/lib/iomgr/is_epollexclusive_available.c
-  src/core/lib/iomgr/load_file.c
-  src/core/lib/iomgr/lockfree_event.c
-  src/core/lib/iomgr/network_status_tracker.c
-  src/core/lib/iomgr/polling_entity.c
-  src/core/lib/iomgr/pollset_set_uv.c
-  src/core/lib/iomgr/pollset_set_windows.c
-  src/core/lib/iomgr/pollset_uv.c
-  src/core/lib/iomgr/pollset_windows.c
-  src/core/lib/iomgr/resolve_address_posix.c
-  src/core/lib/iomgr/resolve_address_uv.c
-  src/core/lib/iomgr/resolve_address_windows.c
-  src/core/lib/iomgr/resource_quota.c
-  src/core/lib/iomgr/sockaddr_utils.c
-  src/core/lib/iomgr/socket_factory_posix.c
-  src/core/lib/iomgr/socket_mutator.c
-  src/core/lib/iomgr/socket_utils_common_posix.c
-  src/core/lib/iomgr/socket_utils_linux.c
-  src/core/lib/iomgr/socket_utils_posix.c
-  src/core/lib/iomgr/socket_utils_uv.c
-  src/core/lib/iomgr/socket_utils_windows.c
-  src/core/lib/iomgr/socket_windows.c
-  src/core/lib/iomgr/tcp_client_posix.c
-  src/core/lib/iomgr/tcp_client_uv.c
-  src/core/lib/iomgr/tcp_client_windows.c
-  src/core/lib/iomgr/tcp_posix.c
-  src/core/lib/iomgr/tcp_server_posix.c
-  src/core/lib/iomgr/tcp_server_utils_posix_common.c
-  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
-  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
-  src/core/lib/iomgr/tcp_server_uv.c
-  src/core/lib/iomgr/tcp_server_windows.c
-  src/core/lib/iomgr/tcp_uv.c
-  src/core/lib/iomgr/tcp_windows.c
-  src/core/lib/iomgr/time_averaged_stats.c
-  src/core/lib/iomgr/timer_generic.c
-  src/core/lib/iomgr/timer_heap.c
-  src/core/lib/iomgr/timer_manager.c
-  src/core/lib/iomgr/timer_uv.c
-  src/core/lib/iomgr/udp_server.c
-  src/core/lib/iomgr/unix_sockets_posix.c
-  src/core/lib/iomgr/unix_sockets_posix_noop.c
-  src/core/lib/iomgr/wakeup_fd_cv.c
-  src/core/lib/iomgr/wakeup_fd_eventfd.c
-  src/core/lib/iomgr/wakeup_fd_nospecial.c
-  src/core/lib/iomgr/wakeup_fd_pipe.c
-  src/core/lib/iomgr/wakeup_fd_posix.c
-  src/core/lib/json/json.c
-  src/core/lib/json/json_reader.c
-  src/core/lib/json/json_string.c
-  src/core/lib/json/json_writer.c
-  src/core/lib/slice/b64.c
-  src/core/lib/slice/percent_encoding.c
-  src/core/lib/slice/slice.c
-  src/core/lib/slice/slice_buffer.c
-  src/core/lib/slice/slice_hash_table.c
-  src/core/lib/slice/slice_intern.c
-  src/core/lib/slice/slice_string_helpers.c
-  src/core/lib/surface/alarm.c
-  src/core/lib/surface/api_trace.c
-  src/core/lib/surface/byte_buffer.c
-  src/core/lib/surface/byte_buffer_reader.c
-  src/core/lib/surface/call.c
-  src/core/lib/surface/call_details.c
-  src/core/lib/surface/call_log_batch.c
-  src/core/lib/surface/channel.c
-  src/core/lib/surface/channel_init.c
-  src/core/lib/surface/channel_ping.c
-  src/core/lib/surface/channel_stack_type.c
-  src/core/lib/surface/completion_queue.c
-  src/core/lib/surface/completion_queue_factory.c
-  src/core/lib/surface/event_string.c
-  src/core/lib/surface/lame_client.cc
-  src/core/lib/surface/metadata_array.c
-  src/core/lib/surface/server.c
-  src/core/lib/surface/validate_metadata.c
-  src/core/lib/surface/version.c
-  src/core/lib/transport/bdp_estimator.c
-  src/core/lib/transport/byte_stream.c
-  src/core/lib/transport/connectivity_state.c
-  src/core/lib/transport/error_utils.c
-  src/core/lib/transport/metadata.c
-  src/core/lib/transport/metadata_batch.c
-  src/core/lib/transport/pid_controller.c
-  src/core/lib/transport/service_config.c
-  src/core/lib/transport/static_metadata.c
-  src/core/lib/transport/status_conversion.c
-  src/core/lib/transport/timeout_encoding.c
-  src/core/lib/transport/transport.c
-  src/core/lib/transport/transport_op_string.c
-  src/core/lib/debug/trace.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
-  src/core/ext/transport/chttp2/transport/bin_decoder.c
-  src/core/ext/transport/chttp2/transport/bin_encoder.c
-  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
-  src/core/ext/transport/chttp2/transport/chttp2_transport.c
-  src/core/ext/transport/chttp2/transport/frame_data.c
-  src/core/ext/transport/chttp2/transport/frame_goaway.c
-  src/core/ext/transport/chttp2/transport/frame_ping.c
-  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
-  src/core/ext/transport/chttp2/transport/frame_settings.c
-  src/core/ext/transport/chttp2/transport/frame_window_update.c
-  src/core/ext/transport/chttp2/transport/hpack_encoder.c
-  src/core/ext/transport/chttp2/transport/hpack_parser.c
-  src/core/ext/transport/chttp2/transport/hpack_table.c
-  src/core/ext/transport/chttp2/transport/http2_settings.c
-  src/core/ext/transport/chttp2/transport/huffsyms.c
-  src/core/ext/transport/chttp2/transport/incoming_metadata.c
-  src/core/ext/transport/chttp2/transport/parsing.c
-  src/core/ext/transport/chttp2/transport/stream_lists.c
-  src/core/ext/transport/chttp2/transport/stream_map.c
-  src/core/ext/transport/chttp2/transport/varint.c
-  src/core/ext/transport/chttp2/transport/writing.c
-  src/core/ext/transport/chttp2/alpn/alpn.c
-  src/core/ext/filters/http/client/http_client_filter.c
-  src/core/ext/filters/http/http_filters_plugin.c
-  src/core/ext/filters/http/message_compress/message_compress_filter.c
-  src/core/ext/filters/http/server/http_server_filter.c
-  src/core/ext/transport/chttp2/server/chttp2_server.c
-  src/core/ext/transport/chttp2/client/insecure/channel_create.c
-  src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c
-  src/core/ext/transport/chttp2/client/chttp2_connector.c
-  src/core/ext/filters/client_channel/channel_connectivity.c
-  src/core/ext/filters/client_channel/client_channel.c
-  src/core/ext/filters/client_channel/client_channel_factory.c
-  src/core/ext/filters/client_channel/client_channel_plugin.c
-  src/core/ext/filters/client_channel/connector.c
-  src/core/ext/filters/client_channel/http_connect_handshaker.c
-  src/core/ext/filters/client_channel/http_proxy.c
-  src/core/ext/filters/client_channel/lb_policy.c
-  src/core/ext/filters/client_channel/lb_policy_factory.c
-  src/core/ext/filters/client_channel/lb_policy_registry.c
-  src/core/ext/filters/client_channel/parse_address.c
-  src/core/ext/filters/client_channel/proxy_mapper.c
-  src/core/ext/filters/client_channel/proxy_mapper_registry.c
-  src/core/ext/filters/client_channel/resolver.c
-  src/core/ext/filters/client_channel/resolver_factory.c
-  src/core/ext/filters/client_channel/resolver_registry.c
-  src/core/ext/filters/client_channel/retry_throttle.c
-  src/core/ext/filters/client_channel/subchannel.c
-  src/core/ext/filters/client_channel/subchannel_index.c
-  src/core/ext/filters/client_channel/uri_parser.c
-  src/core/ext/filters/deadline/deadline_filter.c
-  src/core/ext/transport/inproc/inproc_plugin.c
-  src/core/ext/transport/inproc/inproc_transport.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/dns_resolver_ares.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_ev_driver_posix.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper.c
-  src/core/ext/filters/client_channel/resolver/dns/c_ares/grpc_ares_wrapper_fallback.c
-  src/core/ext/filters/client_channel/resolver/dns/native/dns_resolver.c
-  src/core/ext/filters/client_channel/resolver/sockaddr/sockaddr_resolver.c
-  src/core/ext/filters/client_channel/resolver/fake/fake_resolver.c
-  src/core/ext/filters/load_reporting/load_reporting.c
-  src/core/ext/filters/load_reporting/load_reporting_filter.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/client_load_reporting_filter.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_channel.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/grpclb_client_stats.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/load_balancer_api.c
-  src/core/ext/filters/client_channel/lb_policy/grpclb/proto/grpc/lb/v1/load_balancer.pb.c
-  third_party/nanopb/pb_common.c
-  third_party/nanopb/pb_decode.c
-  third_party/nanopb/pb_encode.c
-  src/core/ext/filters/client_channel/lb_policy/pick_first/pick_first.c
-  src/core/ext/filters/client_channel/lb_policy/round_robin/round_robin.c
-  src/core/ext/census/base_resources.c
-  src/core/ext/census/context.c
-  src/core/ext/census/gen/census.pb.c
-  src/core/ext/census/gen/trace_context.pb.c
-  src/core/ext/census/grpc_context.c
-  src/core/ext/census/grpc_filter.c
-  src/core/ext/census/grpc_plugin.c
-  src/core/ext/census/initialize.c
-  src/core/ext/census/intrusive_hash_map.c
-  src/core/ext/census/mlog.c
-  src/core/ext/census/operation.c
-  src/core/ext/census/placeholders.c
-  src/core/ext/census/resource.c
-  src/core/ext/census/trace_context.c
-  src/core/ext/census/tracing.c
-  src/core/ext/filters/max_age/max_age_filter.c
-  src/core/ext/filters/message_size/message_size_filter.c
-  src/core/ext/filters/workarounds/workaround_cronet_compression_filter.c
-  src/core/ext/filters/workarounds/workaround_utils.c
-  src/core/plugin_registry/grpc_unsecure_plugin_registry.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_unsecure PROPERTIES COMPILE_PDB_NAME "grpc_unsecure"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_unsecure.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_unsecure
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_unsecure
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_ZLIB_LIBRARIES}
-  ${_gRPC_CARES_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-)
-
-foreach(_hdr
-  include/grpc/byte_buffer.h
-  include/grpc/byte_buffer_reader.h
-  include/grpc/compression.h
-  include/grpc/grpc.h
-  include/grpc/grpc_posix.h
-  include/grpc/grpc_security_constants.h
-  include/grpc/load_reporting.h
-  include/grpc/slice.h
-  include/grpc/slice_buffer.h
-  include/grpc/status.h
-  include/grpc/support/workaround_list.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-  include/grpc/census.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_unsecure EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(reconnect_server
-  test/core/util/reconnect_server.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(reconnect_server PROPERTIES COMPILE_PDB_NAME "reconnect_server"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/reconnect_server.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(reconnect_server
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(reconnect_server
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  test_tcp_server
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(test_tcp_server
-  test/core/util/test_tcp_server.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(test_tcp_server PROPERTIES COMPILE_PDB_NAME "test_tcp_server"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/test_tcp_server.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(test_tcp_server
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(test_tcp_server
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc++
-  src/cpp/client/insecure_credentials.cc
-  src/cpp/client/secure_credentials.cc
-  src/cpp/common/auth_property_iterator.cc
-  src/cpp/common/secure_auth_context.cc
-  src/cpp/common/secure_channel_arguments.cc
-  src/cpp/common/secure_create_auth_context.cc
-  src/cpp/server/insecure_server_credentials.cc
-  src/cpp/server/secure_server_credentials.cc
-  src/cpp/client/channel_cc.cc
-  src/cpp/client/client_context.cc
-  src/cpp/client/create_channel.cc
-  src/cpp/client/create_channel_internal.cc
-  src/cpp/client/create_channel_posix.cc
-  src/cpp/client/credentials_cc.cc
-  src/cpp/client/generic_stub.cc
-  src/cpp/common/channel_arguments.cc
-  src/cpp/common/channel_filter.cc
-  src/cpp/common/completion_queue_cc.cc
-  src/cpp/common/core_codegen.cc
-  src/cpp/common/resource_quota_cc.cc
-  src/cpp/common/rpc_method.cc
-  src/cpp/common/version_cc.cc
-  src/cpp/server/async_generic_service.cc
-  src/cpp/server/channel_argument_option.cc
-  src/cpp/server/create_default_thread_pool.cc
-  src/cpp/server/dynamic_thread_pool.cc
-  src/cpp/server/health/default_health_check_service.cc
-  src/cpp/server/health/health.pb.c
-  src/cpp/server/health/health_check_service.cc
-  src/cpp/server/health/health_check_service_server_builder_option.cc
-  src/cpp/server/server_builder.cc
-  src/cpp/server/server_cc.cc
-  src/cpp/server/server_context.cc
-  src/cpp/server/server_credentials.cc
-  src/cpp/server/server_posix.cc
-  src/cpp/thread_manager/thread_manager.cc
-  src/cpp/util/byte_buffer_cc.cc
-  src/cpp/util/slice_cc.cc
-  src/cpp/util/status.cc
-  src/cpp/util/string_ref.cc
-  src/cpp/util/time_cc.cc
-  third_party/nanopb/pb_common.c
-  third_party/nanopb/pb_decode.c
-  third_party/nanopb/pb_encode.c
-  src/cpp/codegen/codegen_init.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++ PROPERTIES COMPILE_PDB_NAME "grpc++"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc++
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-)
-
-foreach(_hdr
-  include/grpc++/alarm.h
-  include/grpc++/channel.h
-  include/grpc++/client_context.h
-  include/grpc++/completion_queue.h
-  include/grpc++/create_channel.h
-  include/grpc++/create_channel_posix.h
-  include/grpc++/ext/health_check_service_server_builder_option.h
-  include/grpc++/generic/async_generic_service.h
-  include/grpc++/generic/generic_stub.h
-  include/grpc++/grpc++.h
-  include/grpc++/health_check_service_interface.h
-  include/grpc++/impl/call.h
-  include/grpc++/impl/channel_argument_option.h
-  include/grpc++/impl/client_unary_call.h
-  include/grpc++/impl/codegen/core_codegen.h
-  include/grpc++/impl/grpc_library.h
-  include/grpc++/impl/method_handler_impl.h
-  include/grpc++/impl/rpc_method.h
-  include/grpc++/impl/rpc_service_method.h
-  include/grpc++/impl/serialization_traits.h
-  include/grpc++/impl/server_builder_option.h
-  include/grpc++/impl/server_builder_plugin.h
-  include/grpc++/impl/server_initializer.h
-  include/grpc++/impl/service_type.h
-  include/grpc++/resource_quota.h
-  include/grpc++/security/auth_context.h
-  include/grpc++/security/auth_metadata_processor.h
-  include/grpc++/security/credentials.h
-  include/grpc++/security/server_credentials.h
-  include/grpc++/server.h
-  include/grpc++/server_builder.h
-  include/grpc++/server_context.h
-  include/grpc++/server_posix.h
-  include/grpc++/support/async_stream.h
-  include/grpc++/support/async_unary_call.h
-  include/grpc++/support/byte_buffer.h
-  include/grpc++/support/channel_arguments.h
-  include/grpc++/support/config.h
-  include/grpc++/support/slice.h
-  include/grpc++/support/status.h
-  include/grpc++/support/status_code_enum.h
-  include/grpc++/support/string_ref.h
-  include/grpc++/support/stub_options.h
-  include/grpc++/support/sync_stream.h
-  include/grpc++/support/time.h
-  include/grpc++/impl/codegen/async_stream.h
-  include/grpc++/impl/codegen/async_unary_call.h
-  include/grpc++/impl/codegen/call.h
-  include/grpc++/impl/codegen/call_hook.h
-  include/grpc++/impl/codegen/channel_interface.h
-  include/grpc++/impl/codegen/client_context.h
-  include/grpc++/impl/codegen/client_unary_call.h
-  include/grpc++/impl/codegen/completion_queue.h
-  include/grpc++/impl/codegen/completion_queue_tag.h
-  include/grpc++/impl/codegen/config.h
-  include/grpc++/impl/codegen/core_codegen_interface.h
-  include/grpc++/impl/codegen/create_auth_context.h
-  include/grpc++/impl/codegen/grpc_library.h
-  include/grpc++/impl/codegen/metadata_map.h
-  include/grpc++/impl/codegen/method_handler_impl.h
-  include/grpc++/impl/codegen/rpc_method.h
-  include/grpc++/impl/codegen/rpc_service_method.h
-  include/grpc++/impl/codegen/security/auth_context.h
-  include/grpc++/impl/codegen/serialization_traits.h
-  include/grpc++/impl/codegen/server_context.h
-  include/grpc++/impl/codegen/server_interface.h
-  include/grpc++/impl/codegen/service_type.h
-  include/grpc++/impl/codegen/slice.h
-  include/grpc++/impl/codegen/status.h
-  include/grpc++/impl/codegen/status_code_enum.h
-  include/grpc++/impl/codegen/string_ref.h
-  include/grpc++/impl/codegen/stub_options.h
-  include/grpc++/impl/codegen/sync_stream.h
-  include/grpc++/impl/codegen/time.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-  include/grpc++/impl/codegen/proto_utils.h
-  include/grpc++/impl/codegen/config_protobuf.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc++ EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_library(grpc++_cronet
-  src/cpp/client/cronet_credentials.cc
-  src/cpp/client/insecure_credentials.cc
-  src/cpp/common/insecure_create_auth_context.cc
-  src/cpp/server/insecure_server_credentials.cc
-  src/cpp/client/channel_cc.cc
-  src/cpp/client/client_context.cc
-  src/cpp/client/create_channel.cc
-  src/cpp/client/create_channel_internal.cc
-  src/cpp/client/create_channel_posix.cc
-  src/cpp/client/credentials_cc.cc
-  src/cpp/client/generic_stub.cc
-  src/cpp/common/channel_arguments.cc
-  src/cpp/common/channel_filter.cc
-  src/cpp/common/completion_queue_cc.cc
-  src/cpp/common/core_codegen.cc
-  src/cpp/common/resource_quota_cc.cc
-  src/cpp/common/rpc_method.cc
-  src/cpp/common/version_cc.cc
-  src/cpp/server/async_generic_service.cc
-  src/cpp/server/channel_argument_option.cc
-  src/cpp/server/create_default_thread_pool.cc
-  src/cpp/server/dynamic_thread_pool.cc
-  src/cpp/server/health/default_health_check_service.cc
-  src/cpp/server/health/health.pb.c
-  src/cpp/server/health/health_check_service.cc
-  src/cpp/server/health/health_check_service_server_builder_option.cc
-  src/cpp/server/server_builder.cc
-  src/cpp/server/server_cc.cc
-  src/cpp/server/server_context.cc
-  src/cpp/server/server_credentials.cc
-  src/cpp/server/server_posix.cc
-  src/cpp/thread_manager/thread_manager.cc
-  src/cpp/util/byte_buffer_cc.cc
-  src/cpp/util/slice_cc.cc
-  src/cpp/util/status.cc
-  src/cpp/util/string_ref.cc
-  src/cpp/util/time_cc.cc
-  third_party/nanopb/pb_common.c
-  third_party/nanopb/pb_decode.c
-  third_party/nanopb/pb_encode.c
-  src/cpp/codegen/codegen_init.cc
-  src/core/ext/transport/chttp2/client/insecure/channel_create.c
-  src/core/ext/transport/chttp2/client/insecure/channel_create_posix.c
-  src/core/ext/transport/chttp2/client/chttp2_connector.c
-  src/core/ext/transport/chttp2/transport/bin_decoder.c
-  src/core/ext/transport/chttp2/transport/bin_encoder.c
-  src/core/ext/transport/chttp2/transport/chttp2_plugin.c
-  src/core/ext/transport/chttp2/transport/chttp2_transport.c
-  src/core/ext/transport/chttp2/transport/frame_data.c
-  src/core/ext/transport/chttp2/transport/frame_goaway.c
-  src/core/ext/transport/chttp2/transport/frame_ping.c
-  src/core/ext/transport/chttp2/transport/frame_rst_stream.c
-  src/core/ext/transport/chttp2/transport/frame_settings.c
-  src/core/ext/transport/chttp2/transport/frame_window_update.c
-  src/core/ext/transport/chttp2/transport/hpack_encoder.c
-  src/core/ext/transport/chttp2/transport/hpack_parser.c
-  src/core/ext/transport/chttp2/transport/hpack_table.c
-  src/core/ext/transport/chttp2/transport/http2_settings.c
-  src/core/ext/transport/chttp2/transport/huffsyms.c
-  src/core/ext/transport/chttp2/transport/incoming_metadata.c
-  src/core/ext/transport/chttp2/transport/parsing.c
-  src/core/ext/transport/chttp2/transport/stream_lists.c
-  src/core/ext/transport/chttp2/transport/stream_map.c
-  src/core/ext/transport/chttp2/transport/varint.c
-  src/core/ext/transport/chttp2/transport/writing.c
-  src/core/lib/channel/channel_args.c
-  src/core/lib/channel/channel_stack.c
-  src/core/lib/channel/channel_stack_builder.c
-  src/core/lib/channel/connected_channel.c
-  src/core/lib/channel/handshaker.c
-  src/core/lib/channel/handshaker_factory.c
-  src/core/lib/channel/handshaker_registry.c
-  src/core/lib/compression/compression.c
-  src/core/lib/compression/message_compress.c
-  src/core/lib/compression/stream_compression.c
-  src/core/lib/http/format_request.c
-  src/core/lib/http/httpcli.c
-  src/core/lib/http/parser.c
-  src/core/lib/iomgr/closure.c
-  src/core/lib/iomgr/combiner.c
-  src/core/lib/iomgr/endpoint.c
-  src/core/lib/iomgr/endpoint_pair_posix.c
-  src/core/lib/iomgr/endpoint_pair_uv.c
-  src/core/lib/iomgr/endpoint_pair_windows.c
-  src/core/lib/iomgr/error.c
-  src/core/lib/iomgr/ev_epoll1_linux.c
-  src/core/lib/iomgr/ev_epoll_limited_pollers_linux.c
-  src/core/lib/iomgr/ev_epoll_thread_pool_linux.c
-  src/core/lib/iomgr/ev_epollex_linux.c
-  src/core/lib/iomgr/ev_epollsig_linux.c
-  src/core/lib/iomgr/ev_poll_posix.c
-  src/core/lib/iomgr/ev_posix.c
-  src/core/lib/iomgr/ev_windows.c
-  src/core/lib/iomgr/exec_ctx.c
-  src/core/lib/iomgr/executor.c
-  src/core/lib/iomgr/iocp_windows.c
-  src/core/lib/iomgr/iomgr.c
-  src/core/lib/iomgr/iomgr_posix.c
-  src/core/lib/iomgr/iomgr_uv.c
-  src/core/lib/iomgr/iomgr_windows.c
-  src/core/lib/iomgr/is_epollexclusive_available.c
-  src/core/lib/iomgr/load_file.c
-  src/core/lib/iomgr/lockfree_event.c
-  src/core/lib/iomgr/network_status_tracker.c
-  src/core/lib/iomgr/polling_entity.c
-  src/core/lib/iomgr/pollset_set_uv.c
-  src/core/lib/iomgr/pollset_set_windows.c
-  src/core/lib/iomgr/pollset_uv.c
-  src/core/lib/iomgr/pollset_windows.c
-  src/core/lib/iomgr/resolve_address_posix.c
-  src/core/lib/iomgr/resolve_address_uv.c
-  src/core/lib/iomgr/resolve_address_windows.c
-  src/core/lib/iomgr/resource_quota.c
-  src/core/lib/iomgr/sockaddr_utils.c
-  src/core/lib/iomgr/socket_factory_posix.c
-  src/core/lib/iomgr/socket_mutator.c
-  src/core/lib/iomgr/socket_utils_common_posix.c
-  src/core/lib/iomgr/socket_utils_linux.c
-  src/core/lib/iomgr/socket_utils_posix.c
-  src/core/lib/iomgr/socket_utils_uv.c
-  src/core/lib/iomgr/socket_utils_windows.c
-  src/core/lib/iomgr/socket_windows.c
-  src/core/lib/iomgr/tcp_client_posix.c
-  src/core/lib/iomgr/tcp_client_uv.c
-  src/core/lib/iomgr/tcp_client_windows.c
-  src/core/lib/iomgr/tcp_posix.c
-  src/core/lib/iomgr/tcp_server_posix.c
-  src/core/lib/iomgr/tcp_server_utils_posix_common.c
-  src/core/lib/iomgr/tcp_server_utils_posix_ifaddrs.c
-  src/core/lib/iomgr/tcp_server_utils_posix_noifaddrs.c
-  src/core/lib/iomgr/tcp_server_uv.c
-  src/core/lib/iomgr/tcp_server_windows.c
-  src/core/lib/iomgr/tcp_uv.c
-  src/core/lib/iomgr/tcp_windows.c
-  src/core/lib/iomgr/time_averaged_stats.c
-  src/core/lib/iomgr/timer_generic.c
-  src/core/lib/iomgr/timer_heap.c
-  src/core/lib/iomgr/timer_manager.c
-  src/core/lib/iomgr/timer_uv.c
-  src/core/lib/iomgr/udp_server.c
-  src/core/lib/iomgr/unix_sockets_posix.c
-  src/core/lib/iomgr/unix_sockets_posix_noop.c
-  src/core/lib/iomgr/wakeup_fd_cv.c
-  src/core/lib/iomgr/wakeup_fd_eventfd.c
-  src/core/lib/iomgr/wakeup_fd_nospecial.c
-  src/core/lib/iomgr/wakeup_fd_pipe.c
-  src/core/lib/iomgr/wakeup_fd_posix.c
-  src/core/lib/json/json.c
-  src/core/lib/json/json_reader.c
-  src/core/lib/json/json_string.c
-  src/core/lib/json/json_writer.c
-  src/core/lib/slice/b64.c
-  src/core/lib/slice/percent_encoding.c
-  src/core/lib/slice/slice.c
-  src/core/lib/slice/slice_buffer.c
-  src/core/lib/slice/slice_hash_table.c
-  src/core/lib/slice/slice_intern.c
-  src/core/lib/slice/slice_string_helpers.c
-  src/core/lib/surface/alarm.c
-  src/core/lib/surface/api_trace.c
-  src/core/lib/surface/byte_buffer.c
-  src/core/lib/surface/byte_buffer_reader.c
-  src/core/lib/surface/call.c
-  src/core/lib/surface/call_details.c
-  src/core/lib/surface/call_log_batch.c
-  src/core/lib/surface/channel.c
-  src/core/lib/surface/channel_init.c
-  src/core/lib/surface/channel_ping.c
-  src/core/lib/surface/channel_stack_type.c
-  src/core/lib/surface/completion_queue.c
-  src/core/lib/surface/completion_queue_factory.c
-  src/core/lib/surface/event_string.c
-  src/core/lib/surface/lame_client.cc
-  src/core/lib/surface/metadata_array.c
-  src/core/lib/surface/server.c
-  src/core/lib/surface/validate_metadata.c
-  src/core/lib/surface/version.c
-  src/core/lib/transport/bdp_estimator.c
-  src/core/lib/transport/byte_stream.c
-  src/core/lib/transport/connectivity_state.c
-  src/core/lib/transport/error_utils.c
-  src/core/lib/transport/metadata.c
-  src/core/lib/transport/metadata_batch.c
-  src/core/lib/transport/pid_controller.c
-  src/core/lib/transport/service_config.c
-  src/core/lib/transport/static_metadata.c
-  src/core/lib/transport/status_conversion.c
-  src/core/lib/transport/timeout_encoding.c
-  src/core/lib/transport/transport.c
-  src/core/lib/transport/transport_op_string.c
-  src/core/lib/debug/trace.c
-  src/core/ext/transport/chttp2/alpn/alpn.c
-  src/core/ext/filters/http/client/http_client_filter.c
-  src/core/ext/filters/http/http_filters_plugin.c
-  src/core/ext/filters/http/message_compress/message_compress_filter.c
-  src/core/ext/filters/http/server/http_server_filter.c
-  src/core/ext/filters/client_channel/channel_connectivity.c
-  src/core/ext/filters/client_channel/client_channel.c
-  src/core/ext/filters/client_channel/client_channel_factory.c
-  src/core/ext/filters/client_channel/client_channel_plugin.c
-  src/core/ext/filters/client_channel/connector.c
-  src/core/ext/filters/client_channel/http_connect_handshaker.c
-  src/core/ext/filters/client_channel/http_proxy.c
-  src/core/ext/filters/client_channel/lb_policy.c
-  src/core/ext/filters/client_channel/lb_policy_factory.c
-  src/core/ext/filters/client_channel/lb_policy_registry.c
-  src/core/ext/filters/client_channel/parse_address.c
-  src/core/ext/filters/client_channel/proxy_mapper.c
-  src/core/ext/filters/client_channel/proxy_mapper_registry.c
-  src/core/ext/filters/client_channel/resolver.c
-  src/core/ext/filters/client_channel/resolver_factory.c
-  src/core/ext/filters/client_channel/resolver_registry.c
-  src/core/ext/filters/client_channel/retry_throttle.c
-  src/core/ext/filters/client_channel/subchannel.c
-  src/core/ext/filters/client_channel/subchannel_index.c
-  src/core/ext/filters/client_channel/uri_parser.c
-  src/core/ext/filters/deadline/deadline_filter.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2.c
-  src/core/ext/transport/chttp2/server/insecure/server_chttp2_posix.c
-  src/core/ext/transport/chttp2/server/chttp2_server.c
-  src/core/ext/census/base_resources.c
-  src/core/ext/census/context.c
-  src/core/ext/census/gen/census.pb.c
-  src/core/ext/census/gen/trace_context.pb.c
-  src/core/ext/census/grpc_context.c
-  src/core/ext/census/grpc_filter.c
-  src/core/ext/census/grpc_plugin.c
-  src/core/ext/census/initialize.c
-  src/core/ext/census/intrusive_hash_map.c
-  src/core/ext/census/mlog.c
-  src/core/ext/census/operation.c
-  src/core/ext/census/placeholders.c
-  src/core/ext/census/resource.c
-  src/core/ext/census/trace_context.c
-  src/core/ext/census/tracing.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_cronet PROPERTIES COMPILE_PDB_NAME "grpc++_cronet"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_cronet.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc++_cronet
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_cronet
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-  grpc_cronet
-  grpc
-)
-
-foreach(_hdr
-  include/grpc++/alarm.h
-  include/grpc++/channel.h
-  include/grpc++/client_context.h
-  include/grpc++/completion_queue.h
-  include/grpc++/create_channel.h
-  include/grpc++/create_channel_posix.h
-  include/grpc++/ext/health_check_service_server_builder_option.h
-  include/grpc++/generic/async_generic_service.h
-  include/grpc++/generic/generic_stub.h
-  include/grpc++/grpc++.h
-  include/grpc++/health_check_service_interface.h
-  include/grpc++/impl/call.h
-  include/grpc++/impl/channel_argument_option.h
-  include/grpc++/impl/client_unary_call.h
-  include/grpc++/impl/codegen/core_codegen.h
-  include/grpc++/impl/grpc_library.h
-  include/grpc++/impl/method_handler_impl.h
-  include/grpc++/impl/rpc_method.h
-  include/grpc++/impl/rpc_service_method.h
-  include/grpc++/impl/serialization_traits.h
-  include/grpc++/impl/server_builder_option.h
-  include/grpc++/impl/server_builder_plugin.h
-  include/grpc++/impl/server_initializer.h
-  include/grpc++/impl/service_type.h
-  include/grpc++/resource_quota.h
-  include/grpc++/security/auth_context.h
-  include/grpc++/security/auth_metadata_processor.h
-  include/grpc++/security/credentials.h
-  include/grpc++/security/server_credentials.h
-  include/grpc++/server.h
-  include/grpc++/server_builder.h
-  include/grpc++/server_context.h
-  include/grpc++/server_posix.h
-  include/grpc++/support/async_stream.h
-  include/grpc++/support/async_unary_call.h
-  include/grpc++/support/byte_buffer.h
-  include/grpc++/support/channel_arguments.h
-  include/grpc++/support/config.h
-  include/grpc++/support/slice.h
-  include/grpc++/support/status.h
-  include/grpc++/support/status_code_enum.h
-  include/grpc++/support/string_ref.h
-  include/grpc++/support/stub_options.h
-  include/grpc++/support/sync_stream.h
-  include/grpc++/support/time.h
-  include/grpc++/impl/codegen/async_stream.h
-  include/grpc++/impl/codegen/async_unary_call.h
-  include/grpc++/impl/codegen/call.h
-  include/grpc++/impl/codegen/call_hook.h
-  include/grpc++/impl/codegen/channel_interface.h
-  include/grpc++/impl/codegen/client_context.h
-  include/grpc++/impl/codegen/client_unary_call.h
-  include/grpc++/impl/codegen/completion_queue.h
-  include/grpc++/impl/codegen/completion_queue_tag.h
-  include/grpc++/impl/codegen/config.h
-  include/grpc++/impl/codegen/core_codegen_interface.h
-  include/grpc++/impl/codegen/create_auth_context.h
-  include/grpc++/impl/codegen/grpc_library.h
-  include/grpc++/impl/codegen/metadata_map.h
-  include/grpc++/impl/codegen/method_handler_impl.h
-  include/grpc++/impl/codegen/rpc_method.h
-  include/grpc++/impl/codegen/rpc_service_method.h
-  include/grpc++/impl/codegen/security/auth_context.h
-  include/grpc++/impl/codegen/serialization_traits.h
-  include/grpc++/impl/codegen/server_context.h
-  include/grpc++/impl/codegen/server_interface.h
-  include/grpc++/impl/codegen/service_type.h
-  include/grpc++/impl/codegen/slice.h
-  include/grpc++/impl/codegen/status.h
-  include/grpc++/impl/codegen/status_code_enum.h
-  include/grpc++/impl/codegen/string_ref.h
-  include/grpc++/impl/codegen/stub_options.h
-  include/grpc++/impl/codegen/sync_stream.h
-  include/grpc++/impl/codegen/time.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-  include/grpc/byte_buffer.h
-  include/grpc/byte_buffer_reader.h
-  include/grpc/compression.h
-  include/grpc/grpc.h
-  include/grpc/grpc_posix.h
-  include/grpc/grpc_security_constants.h
-  include/grpc/load_reporting.h
-  include/grpc/slice.h
-  include/grpc/slice_buffer.h
-  include/grpc/status.h
-  include/grpc/support/workaround_list.h
-  include/grpc/census.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc++_cronet EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_library(grpc++_error_details
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/status/status.grpc.pb.h
-  src/cpp/util/error_details.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_error_details PROPERTIES COMPILE_PDB_NAME "grpc++_error_details"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_error_details.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/status/status.proto
-)
-
-target_include_directories(grpc++_error_details
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_error_details
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-)
-
-foreach(_hdr
-  include/grpc++/support/error_details.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc++_error_details EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc++_proto_reflection_desc_db
-  test/cpp/util/proto_reflection_descriptor_database.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.h
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_proto_reflection_desc_db PROPERTIES COMPILE_PDB_NAME "grpc++_proto_reflection_desc_db"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_proto_reflection_desc_db.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/reflection/v1alpha/reflection.proto
-)
-
-target_include_directories(grpc++_proto_reflection_desc_db
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_proto_reflection_desc_db
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-)
-
-foreach(_hdr
-  include/grpc++/impl/codegen/config_protobuf.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc++_reflection
-  src/cpp/ext/proto_server_reflection.cc
-  src/cpp/ext/proto_server_reflection_plugin.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.h
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_reflection PROPERTIES COMPILE_PDB_NAME "grpc++_reflection"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_reflection.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/reflection/v1alpha/reflection.proto
-)
-
-target_include_directories(grpc++_reflection
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_reflection
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-)
-
-foreach(_hdr
-  include/grpc++/ext/proto_server_reflection_plugin.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc++_reflection EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc++_test_config
-  test/cpp/util/test_config_cc.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_test_config PROPERTIES COMPILE_PDB_NAME "grpc++_test_config"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_test_config.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc++_test_config
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_test_config
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc++_test_util
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/health/v1/health.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_mock.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/duplicate/echo_duplicate.grpc.pb.h
-  test/cpp/end2end/test_service_impl.cc
-  test/cpp/util/byte_buffer_proto_helper.cc
-  test/cpp/util/create_test_channel.cc
-  test/cpp/util/string_ref_helper.cc
-  test/cpp/util/subprocess.cc
-  test/cpp/util/test_credentials_provider.cc
-  src/cpp/codegen/codegen_init.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_test_util PROPERTIES COMPILE_PDB_NAME "grpc++_test_util"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_test_util.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/health/v1/health.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo_messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/duplicate/echo_duplicate.proto
-)
-
-target_include_directories(grpc++_test_util
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_test_util
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc_test_util
-  grpc
-)
-
-foreach(_hdr
-  include/grpc++/impl/codegen/async_stream.h
-  include/grpc++/impl/codegen/async_unary_call.h
-  include/grpc++/impl/codegen/call.h
-  include/grpc++/impl/codegen/call_hook.h
-  include/grpc++/impl/codegen/channel_interface.h
-  include/grpc++/impl/codegen/client_context.h
-  include/grpc++/impl/codegen/client_unary_call.h
-  include/grpc++/impl/codegen/completion_queue.h
-  include/grpc++/impl/codegen/completion_queue_tag.h
-  include/grpc++/impl/codegen/config.h
-  include/grpc++/impl/codegen/core_codegen_interface.h
-  include/grpc++/impl/codegen/create_auth_context.h
-  include/grpc++/impl/codegen/grpc_library.h
-  include/grpc++/impl/codegen/metadata_map.h
-  include/grpc++/impl/codegen/method_handler_impl.h
-  include/grpc++/impl/codegen/rpc_method.h
-  include/grpc++/impl/codegen/rpc_service_method.h
-  include/grpc++/impl/codegen/security/auth_context.h
-  include/grpc++/impl/codegen/serialization_traits.h
-  include/grpc++/impl/codegen/server_context.h
-  include/grpc++/impl/codegen/server_interface.h
-  include/grpc++/impl/codegen/service_type.h
-  include/grpc++/impl/codegen/slice.h
-  include/grpc++/impl/codegen/status.h
-  include/grpc++/impl/codegen/status_code_enum.h
-  include/grpc++/impl/codegen/string_ref.h
-  include/grpc++/impl/codegen/stub_options.h
-  include/grpc++/impl/codegen/sync_stream.h
-  include/grpc++/impl/codegen/time.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-  include/grpc++/impl/codegen/proto_utils.h
-  include/grpc++/impl/codegen/config_protobuf.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc++_unsecure
-  src/cpp/client/insecure_credentials.cc
-  src/cpp/common/insecure_create_auth_context.cc
-  src/cpp/server/insecure_server_credentials.cc
-  src/cpp/client/channel_cc.cc
-  src/cpp/client/client_context.cc
-  src/cpp/client/create_channel.cc
-  src/cpp/client/create_channel_internal.cc
-  src/cpp/client/create_channel_posix.cc
-  src/cpp/client/credentials_cc.cc
-  src/cpp/client/generic_stub.cc
-  src/cpp/common/channel_arguments.cc
-  src/cpp/common/channel_filter.cc
-  src/cpp/common/completion_queue_cc.cc
-  src/cpp/common/core_codegen.cc
-  src/cpp/common/resource_quota_cc.cc
-  src/cpp/common/rpc_method.cc
-  src/cpp/common/version_cc.cc
-  src/cpp/server/async_generic_service.cc
-  src/cpp/server/channel_argument_option.cc
-  src/cpp/server/create_default_thread_pool.cc
-  src/cpp/server/dynamic_thread_pool.cc
-  src/cpp/server/health/default_health_check_service.cc
-  src/cpp/server/health/health.pb.c
-  src/cpp/server/health/health_check_service.cc
-  src/cpp/server/health/health_check_service_server_builder_option.cc
-  src/cpp/server/server_builder.cc
-  src/cpp/server/server_cc.cc
-  src/cpp/server/server_context.cc
-  src/cpp/server/server_credentials.cc
-  src/cpp/server/server_posix.cc
-  src/cpp/thread_manager/thread_manager.cc
-  src/cpp/util/byte_buffer_cc.cc
-  src/cpp/util/slice_cc.cc
-  src/cpp/util/status.cc
-  src/cpp/util/string_ref.cc
-  src/cpp/util/time_cc.cc
-  third_party/nanopb/pb_common.c
-  third_party/nanopb/pb_decode.c
-  third_party/nanopb/pb_encode.c
-  src/cpp/codegen/codegen_init.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc++_unsecure PROPERTIES COMPILE_PDB_NAME "grpc++_unsecure"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc++_unsecure.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc++_unsecure
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc++_unsecure
-  ${_gRPC_BASELIB_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-  grpc_unsecure
-)
-
-foreach(_hdr
-  include/grpc++/alarm.h
-  include/grpc++/channel.h
-  include/grpc++/client_context.h
-  include/grpc++/completion_queue.h
-  include/grpc++/create_channel.h
-  include/grpc++/create_channel_posix.h
-  include/grpc++/ext/health_check_service_server_builder_option.h
-  include/grpc++/generic/async_generic_service.h
-  include/grpc++/generic/generic_stub.h
-  include/grpc++/grpc++.h
-  include/grpc++/health_check_service_interface.h
-  include/grpc++/impl/call.h
-  include/grpc++/impl/channel_argument_option.h
-  include/grpc++/impl/client_unary_call.h
-  include/grpc++/impl/codegen/core_codegen.h
-  include/grpc++/impl/grpc_library.h
-  include/grpc++/impl/method_handler_impl.h
-  include/grpc++/impl/rpc_method.h
-  include/grpc++/impl/rpc_service_method.h
-  include/grpc++/impl/serialization_traits.h
-  include/grpc++/impl/server_builder_option.h
-  include/grpc++/impl/server_builder_plugin.h
-  include/grpc++/impl/server_initializer.h
-  include/grpc++/impl/service_type.h
-  include/grpc++/resource_quota.h
-  include/grpc++/security/auth_context.h
-  include/grpc++/security/auth_metadata_processor.h
-  include/grpc++/security/credentials.h
-  include/grpc++/security/server_credentials.h
-  include/grpc++/server.h
-  include/grpc++/server_builder.h
-  include/grpc++/server_context.h
-  include/grpc++/server_posix.h
-  include/grpc++/support/async_stream.h
-  include/grpc++/support/async_unary_call.h
-  include/grpc++/support/byte_buffer.h
-  include/grpc++/support/channel_arguments.h
-  include/grpc++/support/config.h
-  include/grpc++/support/slice.h
-  include/grpc++/support/status.h
-  include/grpc++/support/status_code_enum.h
-  include/grpc++/support/string_ref.h
-  include/grpc++/support/stub_options.h
-  include/grpc++/support/sync_stream.h
-  include/grpc++/support/time.h
-  include/grpc++/impl/codegen/async_stream.h
-  include/grpc++/impl/codegen/async_unary_call.h
-  include/grpc++/impl/codegen/call.h
-  include/grpc++/impl/codegen/call_hook.h
-  include/grpc++/impl/codegen/channel_interface.h
-  include/grpc++/impl/codegen/client_context.h
-  include/grpc++/impl/codegen/client_unary_call.h
-  include/grpc++/impl/codegen/completion_queue.h
-  include/grpc++/impl/codegen/completion_queue_tag.h
-  include/grpc++/impl/codegen/config.h
-  include/grpc++/impl/codegen/core_codegen_interface.h
-  include/grpc++/impl/codegen/create_auth_context.h
-  include/grpc++/impl/codegen/grpc_library.h
-  include/grpc++/impl/codegen/metadata_map.h
-  include/grpc++/impl/codegen/method_handler_impl.h
-  include/grpc++/impl/codegen/rpc_method.h
-  include/grpc++/impl/codegen/rpc_service_method.h
-  include/grpc++/impl/codegen/security/auth_context.h
-  include/grpc++/impl/codegen/serialization_traits.h
-  include/grpc++/impl/codegen/server_context.h
-  include/grpc++/impl/codegen/server_interface.h
-  include/grpc++/impl/codegen/service_type.h
-  include/grpc++/impl/codegen/slice.h
-  include/grpc++/impl/codegen/status.h
-  include/grpc++/impl/codegen/status_code_enum.h
-  include/grpc++/impl/codegen/string_ref.h
-  include/grpc++/impl/codegen/stub_options.h
-  include/grpc++/impl/codegen/sync_stream.h
-  include/grpc++/impl/codegen/time.h
-  include/grpc/impl/codegen/byte_buffer_reader.h
-  include/grpc/impl/codegen/compression_types.h
-  include/grpc/impl/codegen/connectivity_state.h
-  include/grpc/impl/codegen/exec_ctx_fwd.h
-  include/grpc/impl/codegen/grpc_types.h
-  include/grpc/impl/codegen/propagation_bits.h
-  include/grpc/impl/codegen/slice.h
-  include/grpc/impl/codegen/status.h
-  include/grpc/impl/codegen/atm.h
-  include/grpc/impl/codegen/atm_gcc_atomic.h
-  include/grpc/impl/codegen/atm_gcc_sync.h
-  include/grpc/impl/codegen/atm_windows.h
-  include/grpc/impl/codegen/gpr_slice.h
-  include/grpc/impl/codegen/gpr_types.h
-  include/grpc/impl/codegen/port_platform.h
-  include/grpc/impl/codegen/sync.h
-  include/grpc/impl/codegen/sync_generic.h
-  include/grpc/impl/codegen/sync_posix.h
-  include/grpc/impl/codegen/sync_windows.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc++_unsecure EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc_benchmark
-  test/cpp/microbenchmarks/helpers.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_benchmark PROPERTIES COMPILE_PDB_NAME "grpc_benchmark"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_benchmark.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_benchmark
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_benchmark
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  benchmark
-  grpc++
-  grpc_test_util
-  grpc
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(grpc_cli_libs
-  test/cpp/util/cli_call.cc
-  test/cpp/util/cli_credentials.cc
-  test/cpp/util/grpc_tool.cc
-  test/cpp/util/proto_file_parser.cc
-  test/cpp/util/service_describer.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/reflection/v1alpha/reflection.grpc.pb.h
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_cli_libs PROPERTIES COMPILE_PDB_NAME "grpc_cli_libs"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_cli_libs.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/reflection/v1alpha/reflection.proto
-)
-
-target_include_directories(grpc_cli_libs
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_cli_libs
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_proto_reflection_desc_db
-  grpc++
-  grpc
-)
-
-foreach(_hdr
-  include/grpc++/impl/codegen/config_protobuf.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc_plugin_support
-  src/compiler/cpp_generator.cc
-  src/compiler/csharp_generator.cc
-  src/compiler/node_generator.cc
-  src/compiler/objective_c_generator.cc
-  src/compiler/php_generator.cc
-  src/compiler/python_generator.cc
-  src/compiler/ruby_generator.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_plugin_support PROPERTIES COMPILE_PDB_NAME "grpc_plugin_support"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_plugin_support.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_plugin_support
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_plugin_support
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-)
-
-foreach(_hdr
-  include/grpc++/impl/codegen/config_protobuf.h
-)
-  string(REPLACE "include/" "" _path ${_hdr})
-  get_filename_component(_path ${_path} PATH)
-  install(FILES ${_hdr}
-    DESTINATION "${gRPC_INSTALL_INCLUDEDIR}/${_path}"
-  )
-endforeach()
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_plugin_support EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(http2_client_main
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
-  test/cpp/interop/http2_client.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(http2_client_main PROPERTIES COMPILE_PDB_NAME "http2_client_main"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/http2_client_main.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/empty.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/test.proto
-)
-
-target_include_directories(http2_client_main
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(http2_client_main
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  grpc++_test_config
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(interop_client_helper
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  test/cpp/interop/client_helper.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(interop_client_helper PROPERTIES COMPILE_PDB_NAME "interop_client_helper"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_client_helper.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-
-target_include_directories(interop_client_helper
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_client_helper
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(interop_client_main
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
-  test/cpp/interop/client.cc
-  test/cpp/interop/interop_client.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(interop_client_main PROPERTIES COMPILE_PDB_NAME "interop_client_main"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_client_main.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/empty.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/test.proto
-)
-
-target_include_directories(interop_client_main
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_client_main
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  interop_client_helper
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(interop_server_helper
-  test/cpp/interop/server_helper.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(interop_server_helper PROPERTIES COMPILE_PDB_NAME "interop_server_helper"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_server_helper.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(interop_server_helper
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_server_helper
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(interop_server_lib
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
-  test/cpp/interop/interop_server.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(interop_server_lib PROPERTIES COMPILE_PDB_NAME "interop_server_lib"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_server_lib.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/empty.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/test.proto
-)
-
-target_include_directories(interop_server_lib
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_server_lib
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  interop_server_helper
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(interop_server_main
-  test/cpp/interop/interop_server_bootstrap.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(interop_server_main PROPERTIES COMPILE_PDB_NAME "interop_server_main"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/interop_server_main.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(interop_server_main
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_server_main
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  interop_server_lib
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(qps
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.h
-  test/cpp/qps/benchmark_config.cc
-  test/cpp/qps/client_async.cc
-  test/cpp/qps/client_sync.cc
-  test/cpp/qps/driver.cc
-  test/cpp/qps/parse_json.cc
-  test/cpp/qps/qps_worker.cc
-  test/cpp/qps/report.cc
-  test/cpp/qps/server_async.cc
-  test/cpp/qps/server_sync.cc
-  test/cpp/qps/usage_timer.cc
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(qps PROPERTIES COMPILE_PDB_NAME "qps"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/qps.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/payloads.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/stats.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/control.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/services.proto
-)
-
-target_include_directories(qps
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(qps
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++_test_util
-  grpc++
-  grpc
-)
-
-
-endif (gRPC_BUILD_TESTS)
-
-add_library(grpc_csharp_ext SHARED
-  src/csharp/ext/grpc_csharp_ext.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(grpc_csharp_ext PROPERTIES COMPILE_PDB_NAME "grpc_csharp_ext"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/grpc_csharp_ext.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(grpc_csharp_ext
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_csharp_ext
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_csharp_ext EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_library(ares
-  third_party/cares/cares/ares__close_sockets.c
-  third_party/cares/cares/ares__get_hostent.c
-  third_party/cares/cares/ares__read_line.c
-  third_party/cares/cares/ares__timeval.c
-  third_party/cares/cares/ares_cancel.c
-  third_party/cares/cares/ares_create_query.c
-  third_party/cares/cares/ares_data.c
-  third_party/cares/cares/ares_destroy.c
-  third_party/cares/cares/ares_expand_name.c
-  third_party/cares/cares/ares_expand_string.c
-  third_party/cares/cares/ares_fds.c
-  third_party/cares/cares/ares_free_hostent.c
-  third_party/cares/cares/ares_free_string.c
-  third_party/cares/cares/ares_getenv.c
-  third_party/cares/cares/ares_gethostbyaddr.c
-  third_party/cares/cares/ares_gethostbyname.c
-  third_party/cares/cares/ares_getnameinfo.c
-  third_party/cares/cares/ares_getopt.c
-  third_party/cares/cares/ares_getsock.c
-  third_party/cares/cares/ares_init.c
-  third_party/cares/cares/ares_library_init.c
-  third_party/cares/cares/ares_llist.c
-  third_party/cares/cares/ares_mkquery.c
-  third_party/cares/cares/ares_nowarn.c
-  third_party/cares/cares/ares_options.c
-  third_party/cares/cares/ares_parse_a_reply.c
-  third_party/cares/cares/ares_parse_aaaa_reply.c
-  third_party/cares/cares/ares_parse_mx_reply.c
-  third_party/cares/cares/ares_parse_naptr_reply.c
-  third_party/cares/cares/ares_parse_ns_reply.c
-  third_party/cares/cares/ares_parse_ptr_reply.c
-  third_party/cares/cares/ares_parse_soa_reply.c
-  third_party/cares/cares/ares_parse_srv_reply.c
-  third_party/cares/cares/ares_parse_txt_reply.c
-  third_party/cares/cares/ares_platform.c
-  third_party/cares/cares/ares_process.c
-  third_party/cares/cares/ares_query.c
-  third_party/cares/cares/ares_search.c
-  third_party/cares/cares/ares_send.c
-  third_party/cares/cares/ares_strcasecmp.c
-  third_party/cares/cares/ares_strdup.c
-  third_party/cares/cares/ares_strerror.c
-  third_party/cares/cares/ares_timeout.c
-  third_party/cares/cares/ares_version.c
-  third_party/cares/cares/ares_writev.c
-  third_party/cares/cares/bitncmp.c
-  third_party/cares/cares/inet_net_pton.c
-  third_party/cares/cares/inet_ntop.c
-  third_party/cares/cares/windows_port.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(ares PROPERTIES COMPILE_PDB_NAME "ares"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ares.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(ares
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(ares
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(bad_client_test
-  test/core/bad_client/bad_client.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(bad_client_test PROPERTIES COMPILE_PDB_NAME "bad_client_test"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/bad_client_test.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(bad_client_test
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bad_client_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(bad_ssl_test_server
-  test/core/bad_ssl/server_common.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(bad_ssl_test_server PROPERTIES COMPILE_PDB_NAME "bad_ssl_test_server"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/bad_ssl_test_server.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(bad_ssl_test_server
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bad_ssl_test_server
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(end2end_tests
-  test/core/end2end/end2end_tests.c
-  test/core/end2end/end2end_test_utils.c
-  test/core/end2end/tests/authority_not_supported.c
-  test/core/end2end/tests/bad_hostname.c
-  test/core/end2end/tests/bad_ping.c
-  test/core/end2end/tests/binary_metadata.c
-  test/core/end2end/tests/call_creds.c
-  test/core/end2end/tests/cancel_after_accept.c
-  test/core/end2end/tests/cancel_after_client_done.c
-  test/core/end2end/tests/cancel_after_invoke.c
-  test/core/end2end/tests/cancel_after_round_trip.c
-  test/core/end2end/tests/cancel_before_invoke.c
-  test/core/end2end/tests/cancel_in_a_vacuum.c
-  test/core/end2end/tests/cancel_with_status.c
-  test/core/end2end/tests/compressed_payload.c
-  test/core/end2end/tests/connectivity.c
-  test/core/end2end/tests/default_host.c
-  test/core/end2end/tests/disappearing_server.c
-  test/core/end2end/tests/empty_batch.c
-  test/core/end2end/tests/filter_call_init_fails.c
-  test/core/end2end/tests/filter_causes_close.c
-  test/core/end2end/tests/filter_latency.c
-  test/core/end2end/tests/graceful_server_shutdown.c
-  test/core/end2end/tests/high_initial_seqno.c
-  test/core/end2end/tests/hpack_size.c
-  test/core/end2end/tests/idempotent_request.c
-  test/core/end2end/tests/invoke_large_request.c
-  test/core/end2end/tests/keepalive_timeout.c
-  test/core/end2end/tests/large_metadata.c
-  test/core/end2end/tests/load_reporting_hook.c
-  test/core/end2end/tests/max_concurrent_streams.c
-  test/core/end2end/tests/max_connection_age.c
-  test/core/end2end/tests/max_connection_idle.c
-  test/core/end2end/tests/max_message_length.c
-  test/core/end2end/tests/negative_deadline.c
-  test/core/end2end/tests/network_status_change.c
-  test/core/end2end/tests/no_logging.c
-  test/core/end2end/tests/no_op.c
-  test/core/end2end/tests/payload.c
-  test/core/end2end/tests/ping.c
-  test/core/end2end/tests/ping_pong_streaming.c
-  test/core/end2end/tests/proxy_auth.c
-  test/core/end2end/tests/registered_call.c
-  test/core/end2end/tests/request_with_flags.c
-  test/core/end2end/tests/request_with_payload.c
-  test/core/end2end/tests/resource_quota_server.c
-  test/core/end2end/tests/server_finishes_request.c
-  test/core/end2end/tests/shutdown_finishes_calls.c
-  test/core/end2end/tests/shutdown_finishes_tags.c
-  test/core/end2end/tests/simple_cacheable_request.c
-  test/core/end2end/tests/simple_delayed_request.c
-  test/core/end2end/tests/simple_metadata.c
-  test/core/end2end/tests/simple_request.c
-  test/core/end2end/tests/streaming_error_response.c
-  test/core/end2end/tests/trailing_metadata.c
-  test/core/end2end/tests/workaround_cronet_compression.c
-  test/core/end2end/tests/write_buffering.c
-  test/core/end2end/tests/write_buffering_at_end.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(end2end_tests PROPERTIES COMPILE_PDB_NAME "end2end_tests"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/end2end_tests.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(end2end_tests
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(end2end_tests
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_library(end2end_nosec_tests
-  test/core/end2end/end2end_nosec_tests.c
-  test/core/end2end/end2end_test_utils.c
-  test/core/end2end/tests/authority_not_supported.c
-  test/core/end2end/tests/bad_hostname.c
-  test/core/end2end/tests/bad_ping.c
-  test/core/end2end/tests/binary_metadata.c
-  test/core/end2end/tests/cancel_after_accept.c
-  test/core/end2end/tests/cancel_after_client_done.c
-  test/core/end2end/tests/cancel_after_invoke.c
-  test/core/end2end/tests/cancel_after_round_trip.c
-  test/core/end2end/tests/cancel_before_invoke.c
-  test/core/end2end/tests/cancel_in_a_vacuum.c
-  test/core/end2end/tests/cancel_with_status.c
-  test/core/end2end/tests/compressed_payload.c
-  test/core/end2end/tests/connectivity.c
-  test/core/end2end/tests/default_host.c
-  test/core/end2end/tests/disappearing_server.c
-  test/core/end2end/tests/empty_batch.c
-  test/core/end2end/tests/filter_call_init_fails.c
-  test/core/end2end/tests/filter_causes_close.c
-  test/core/end2end/tests/filter_latency.c
-  test/core/end2end/tests/graceful_server_shutdown.c
-  test/core/end2end/tests/high_initial_seqno.c
-  test/core/end2end/tests/hpack_size.c
-  test/core/end2end/tests/idempotent_request.c
-  test/core/end2end/tests/invoke_large_request.c
-  test/core/end2end/tests/keepalive_timeout.c
-  test/core/end2end/tests/large_metadata.c
-  test/core/end2end/tests/load_reporting_hook.c
-  test/core/end2end/tests/max_concurrent_streams.c
-  test/core/end2end/tests/max_connection_age.c
-  test/core/end2end/tests/max_connection_idle.c
-  test/core/end2end/tests/max_message_length.c
-  test/core/end2end/tests/negative_deadline.c
-  test/core/end2end/tests/network_status_change.c
-  test/core/end2end/tests/no_logging.c
-  test/core/end2end/tests/no_op.c
-  test/core/end2end/tests/payload.c
-  test/core/end2end/tests/ping.c
-  test/core/end2end/tests/ping_pong_streaming.c
-  test/core/end2end/tests/proxy_auth.c
-  test/core/end2end/tests/registered_call.c
-  test/core/end2end/tests/request_with_flags.c
-  test/core/end2end/tests/request_with_payload.c
-  test/core/end2end/tests/resource_quota_server.c
-  test/core/end2end/tests/server_finishes_request.c
-  test/core/end2end/tests/shutdown_finishes_calls.c
-  test/core/end2end/tests/shutdown_finishes_tags.c
-  test/core/end2end/tests/simple_cacheable_request.c
-  test/core/end2end/tests/simple_delayed_request.c
-  test/core/end2end/tests/simple_metadata.c
-  test/core/end2end/tests/simple_request.c
-  test/core/end2end/tests/streaming_error_response.c
-  test/core/end2end/tests/trailing_metadata.c
-  test/core/end2end/tests/workaround_cronet_compression.c
-  test/core/end2end/tests/write_buffering.c
-  test/core/end2end/tests/write_buffering_at_end.c
-)
-
-if(WIN32 AND MSVC)
-  set_target_properties(end2end_nosec_tests PROPERTIES COMPILE_PDB_NAME "end2end_nosec_tests"
-    COMPILE_PDB_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  )
-  if (gRPC_INSTALL)
-    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/end2end_nosec_tests.pdb
-      DESTINATION ${gRPC_INSTALL_LIBDIR} OPTIONAL
-    )
-  endif()
-endif()
-
-
-target_include_directories(end2end_nosec_tests
-  PUBLIC $<INSTALL_INTERFACE:include> $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${ZLIB_INCLUDE_DIR}
-  PRIVATE ${BENCHMARK}/include
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(end2end_nosec_tests
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-
-endif (gRPC_BUILD_TESTS)
-
-if (gRPC_BUILD_TESTS)
-
-add_executable(alarm_test
-  test/core/surface/alarm_test.c
-)
-
-
-target_include_directories(alarm_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(alarm_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(algorithm_test
-  test/core/compression/algorithm_test.c
-)
-
-
-target_include_directories(algorithm_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(algorithm_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(alloc_test
-  test/core/support/alloc_test.c
-)
-
-
-target_include_directories(alloc_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(alloc_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(alpn_test
-  test/core/transport/chttp2/alpn_test.c
-)
-
-
-target_include_directories(alpn_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(alpn_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(arena_test
-  test/core/support/arena_test.c
-)
-
-
-target_include_directories(arena_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(arena_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(bad_server_response_test
-  test/core/end2end/bad_server_response_test.c
-)
-
-
-target_include_directories(bad_server_response_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bad_server_response_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  test_tcp_server
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(bdp_estimator_test
-  test/core/transport/bdp_estimator_test.c
-)
-
-
-target_include_directories(bdp_estimator_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bdp_estimator_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(bin_decoder_test
-  test/core/transport/chttp2/bin_decoder_test.c
-)
-
-
-target_include_directories(bin_decoder_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bin_decoder_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(bin_encoder_test
-  test/core/transport/chttp2/bin_encoder_test.c
-)
-
-
-target_include_directories(bin_encoder_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bin_encoder_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(census_context_test
-  test/core/census/context_test.c
-)
-
-
-target_include_directories(census_context_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(census_context_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(census_intrusive_hash_map_test
-  test/core/census/intrusive_hash_map_test.c
-)
-
-
-target_include_directories(census_intrusive_hash_map_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(census_intrusive_hash_map_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(census_resource_test
-  test/core/census/resource_test.c
-)
-
-
-target_include_directories(census_resource_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(census_resource_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(census_trace_context_test
-  test/core/census/trace_context_test.c
-)
-
-
-target_include_directories(census_trace_context_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(census_trace_context_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(channel_create_test
-  test/core/surface/channel_create_test.c
-)
-
-
-target_include_directories(channel_create_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(channel_create_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-
-add_executable(check_epollexclusive
-  test/build/check_epollexclusive.c
-)
-
-
-target_include_directories(check_epollexclusive
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(check_epollexclusive
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS check_epollexclusive EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_executable(chttp2_hpack_encoder_test
-  test/core/transport/chttp2/hpack_encoder_test.c
-)
-
-
-target_include_directories(chttp2_hpack_encoder_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(chttp2_hpack_encoder_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(chttp2_stream_map_test
-  test/core/transport/chttp2/stream_map_test.c
-)
-
-
-target_include_directories(chttp2_stream_map_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(chttp2_stream_map_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(chttp2_varint_test
-  test/core/transport/chttp2/varint_test.c
-)
-
-
-target_include_directories(chttp2_varint_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(chttp2_varint_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(combiner_test
-  test/core/iomgr/combiner_test.c
-)
-
-
-target_include_directories(combiner_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(combiner_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(compression_test
-  test/core/compression/compression_test.c
-)
-
-
-target_include_directories(compression_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(compression_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(concurrent_connectivity_test
-  test/core/surface/concurrent_connectivity_test.c
-)
-
-
-target_include_directories(concurrent_connectivity_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(concurrent_connectivity_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(connection_refused_test
-  test/core/end2end/connection_refused_test.c
-)
-
-
-target_include_directories(connection_refused_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(connection_refused_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(dns_resolver_connectivity_test
-  test/core/client_channel/resolvers/dns_resolver_connectivity_test.c
-)
-
-
-target_include_directories(dns_resolver_connectivity_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(dns_resolver_connectivity_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(dns_resolver_test
-  test/core/client_channel/resolvers/dns_resolver_test.c
-)
-
-
-target_include_directories(dns_resolver_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(dns_resolver_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(dualstack_socket_test
-  test/core/end2end/dualstack_socket_test.c
-)
-
-
-target_include_directories(dualstack_socket_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(dualstack_socket_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(endpoint_pair_test
-  test/core/iomgr/endpoint_pair_test.c
-)
-
-
-target_include_directories(endpoint_pair_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(endpoint_pair_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(error_test
-  test/core/iomgr/error_test.c
-)
-
-
-target_include_directories(error_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(error_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(ev_epollsig_linux_test
-  test/core/iomgr/ev_epollsig_linux_test.c
-)
-
-
-target_include_directories(ev_epollsig_linux_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(ev_epollsig_linux_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(fake_resolver_test
-  test/core/client_channel/resolvers/fake_resolver_test.c
-)
-
-
-target_include_directories(fake_resolver_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fake_resolver_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(fd_conservation_posix_test
-  test/core/iomgr/fd_conservation_posix_test.c
-)
-
-
-target_include_directories(fd_conservation_posix_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fd_conservation_posix_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(fd_posix_test
-  test/core/iomgr/fd_posix_test.c
-)
-
-
-target_include_directories(fd_posix_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fd_posix_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(fling_client
-  test/core/fling/client.c
-)
-
-
-target_include_directories(fling_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fling_client
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(fling_server
-  test/core/fling/server.c
-)
-
-
-target_include_directories(fling_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fling_server
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(fling_stream_test
-  test/core/fling/fling_stream_test.c
-)
-
-
-target_include_directories(fling_stream_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fling_stream_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(fling_test
-  test/core/fling/fling_test.c
-)
-
-
-target_include_directories(fling_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(fling_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-
-add_executable(gen_hpack_tables
-  tools/codegen/core/gen_hpack_tables.c
-)
-
-
-target_include_directories(gen_hpack_tables
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gen_hpack_tables
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr
-  grpc
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS gen_hpack_tables EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(gen_legal_metadata_characters
-  tools/codegen/core/gen_legal_metadata_characters.c
-)
-
-
-target_include_directories(gen_legal_metadata_characters
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gen_legal_metadata_characters
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS gen_legal_metadata_characters EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(gen_percent_encoding_tables
-  tools/codegen/core/gen_percent_encoding_tables.c
-)
-
-
-target_include_directories(gen_percent_encoding_tables
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gen_percent_encoding_tables
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS gen_percent_encoding_tables EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(goaway_server_test
-  test/core/end2end/goaway_server_test.c
-)
-
-
-target_include_directories(goaway_server_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(goaway_server_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_avl_test
-  test/core/support/avl_test.c
-)
-
-
-target_include_directories(gpr_avl_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_avl_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_backoff_test
-  test/core/support/backoff_test.c
-)
-
-
-target_include_directories(gpr_backoff_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_backoff_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_cmdline_test
-  test/core/support/cmdline_test.c
-)
-
-
-target_include_directories(gpr_cmdline_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_cmdline_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_cpu_test
-  test/core/support/cpu_test.c
-)
-
-
-target_include_directories(gpr_cpu_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_cpu_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_env_test
-  test/core/support/env_test.c
-)
-
-
-target_include_directories(gpr_env_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_env_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_histogram_test
-  test/core/support/histogram_test.c
-)
-
-
-target_include_directories(gpr_histogram_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_histogram_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_host_port_test
-  test/core/support/host_port_test.c
-)
-
-
-target_include_directories(gpr_host_port_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_host_port_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_log_test
-  test/core/support/log_test.c
-)
-
-
-target_include_directories(gpr_log_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_log_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_mpscq_test
-  test/core/support/mpscq_test.c
-)
-
-
-target_include_directories(gpr_mpscq_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_mpscq_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_spinlock_test
-  test/core/support/spinlock_test.c
-)
-
-
-target_include_directories(gpr_spinlock_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_spinlock_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_stack_lockfree_test
-  test/core/support/stack_lockfree_test.c
-)
-
-
-target_include_directories(gpr_stack_lockfree_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_stack_lockfree_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_string_test
-  test/core/support/string_test.c
-)
-
-
-target_include_directories(gpr_string_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_string_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_sync_test
-  test/core/support/sync_test.c
-)
-
-
-target_include_directories(gpr_sync_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_sync_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_thd_test
-  test/core/support/thd_test.c
-)
-
-
-target_include_directories(gpr_thd_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_thd_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_time_test
-  test/core/support/time_test.c
-)
-
-
-target_include_directories(gpr_time_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_time_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_tls_test
-  test/core/support/tls_test.c
-)
-
-
-target_include_directories(gpr_tls_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_tls_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(gpr_useful_test
-  test/core/support/useful_test.c
-)
-
-
-target_include_directories(gpr_useful_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(gpr_useful_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_auth_context_test
-  test/core/security/auth_context_test.c
-)
-
-
-target_include_directories(grpc_auth_context_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_auth_context_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_b64_test
-  test/core/slice/b64_test.c
-)
-
-
-target_include_directories(grpc_b64_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_b64_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_byte_buffer_reader_test
-  test/core/surface/byte_buffer_reader_test.c
-)
-
-
-target_include_directories(grpc_byte_buffer_reader_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_byte_buffer_reader_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_channel_args_test
-  test/core/channel/channel_args_test.c
-)
-
-
-target_include_directories(grpc_channel_args_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_channel_args_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_channel_stack_test
-  test/core/channel/channel_stack_test.c
-)
-
-
-target_include_directories(grpc_channel_stack_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_channel_stack_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_completion_queue_test
-  test/core/surface/completion_queue_test.c
-)
-
-
-target_include_directories(grpc_completion_queue_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_completion_queue_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_completion_queue_threading_test
-  test/core/surface/completion_queue_threading_test.c
-)
-
-
-target_include_directories(grpc_completion_queue_threading_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_completion_queue_threading_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-
-add_executable(grpc_create_jwt
-  test/core/security/create_jwt.c
-)
-
-
-target_include_directories(grpc_create_jwt
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_create_jwt
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_create_jwt EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_credentials_test
-  test/core/security/credentials_test.c
-)
-
-
-target_include_directories(grpc_credentials_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_credentials_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_fetch_oauth2
-  test/core/security/fetch_oauth2.c
-)
-
-
-target_include_directories(grpc_fetch_oauth2
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_fetch_oauth2
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_invalid_channel_args_test
-  test/core/surface/invalid_channel_args_test.c
-)
-
-
-target_include_directories(grpc_invalid_channel_args_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_invalid_channel_args_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(grpc_json_token_test
-  test/core/security/json_token_test.c
-)
-
-
-target_include_directories(grpc_json_token_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_json_token_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_jwt_verifier_test
-  test/core/security/jwt_verifier_test.c
-)
-
-
-target_include_directories(grpc_jwt_verifier_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_jwt_verifier_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-
-add_executable(grpc_print_google_default_creds_token
-  test/core/security/print_google_default_creds_token.c
-)
-
-
-target_include_directories(grpc_print_google_default_creds_token
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_print_google_default_creds_token
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_print_google_default_creds_token EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_security_connector_test
-  test/core/security/security_connector_test.c
-)
-
-
-target_include_directories(grpc_security_connector_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_security_connector_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-
-add_executable(grpc_verify_jwt
-  test/core/security/verify_jwt.c
-)
-
-
-target_include_directories(grpc_verify_jwt
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(grpc_verify_jwt
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_verify_jwt EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(handshake_client
-  test/core/handshake/client_ssl.c
-)
-
-
-target_include_directories(handshake_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(handshake_client
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(handshake_server
-  test/core/handshake/server_ssl.c
-)
-
-
-target_include_directories(handshake_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(handshake_server
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(hpack_parser_test
-  test/core/transport/chttp2/hpack_parser_test.c
-)
-
-
-target_include_directories(hpack_parser_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(hpack_parser_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(hpack_table_test
-  test/core/transport/chttp2/hpack_table_test.c
-)
-
-
-target_include_directories(hpack_table_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(hpack_table_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(http_parser_test
-  test/core/http/parser_test.c
-)
-
-
-target_include_directories(http_parser_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(http_parser_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(httpcli_format_request_test
-  test/core/http/format_request_test.c
-)
-
-
-target_include_directories(httpcli_format_request_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(httpcli_format_request_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(httpcli_test
-  test/core/http/httpcli_test.c
-)
-
-
-target_include_directories(httpcli_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(httpcli_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(httpscli_test
-  test/core/http/httpscli_test.c
-)
-
-
-target_include_directories(httpscli_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(httpscli_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(init_test
-  test/core/surface/init_test.c
-)
-
-
-target_include_directories(init_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(init_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(invalid_call_argument_test
-  test/core/end2end/invalid_call_argument_test.c
-)
-
-
-target_include_directories(invalid_call_argument_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(invalid_call_argument_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(json_rewrite
-  test/core/json/json_rewrite.c
-)
-
-
-target_include_directories(json_rewrite
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(json_rewrite
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(json_rewrite_test
-  test/core/json/json_rewrite_test.c
-)
-
-
-target_include_directories(json_rewrite_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(json_rewrite_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(json_stream_error_test
-  test/core/json/json_stream_error_test.c
-)
-
-
-target_include_directories(json_stream_error_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(json_stream_error_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(json_test
-  test/core/json/json_test.c
-)
-
-
-target_include_directories(json_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(json_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(lame_client_test
-  test/core/surface/lame_client_test.c
-)
-
-
-target_include_directories(lame_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(lame_client_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(lb_policies_test
-  test/core/client_channel/lb_policies_test.c
-)
-
-
-target_include_directories(lb_policies_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(lb_policies_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(load_file_test
-  test/core/iomgr/load_file_test.c
-)
-
-
-target_include_directories(load_file_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(load_file_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(memory_profile_client
-  test/core/memory_usage/client.c
-)
-
-
-target_include_directories(memory_profile_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(memory_profile_client
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(memory_profile_server
-  test/core/memory_usage/server.c
-)
-
-
-target_include_directories(memory_profile_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(memory_profile_server
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(memory_profile_test
-  test/core/memory_usage/memory_usage_test.c
-)
-
-
-target_include_directories(memory_profile_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(memory_profile_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(message_compress_test
-  test/core/compression/message_compress_test.c
-)
-
-
-target_include_directories(message_compress_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(message_compress_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(minimal_stack_is_minimal_test
-  test/core/channel/minimal_stack_is_minimal_test.c
-)
-
-
-target_include_directories(minimal_stack_is_minimal_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(minimal_stack_is_minimal_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(mlog_test
-  test/core/census/mlog_test.c
-)
-
-
-target_include_directories(mlog_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(mlog_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(multiple_server_queues_test
-  test/core/end2end/multiple_server_queues_test.c
-)
-
-
-target_include_directories(multiple_server_queues_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(multiple_server_queues_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(murmur_hash_test
-  test/core/support/murmur_hash_test.c
-)
-
-
-target_include_directories(murmur_hash_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(murmur_hash_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(no_server_test
-  test/core/end2end/no_server_test.c
-)
-
-
-target_include_directories(no_server_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(no_server_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(num_external_connectivity_watchers_test
-  test/core/surface/num_external_connectivity_watchers_test.c
-)
-
-
-target_include_directories(num_external_connectivity_watchers_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(num_external_connectivity_watchers_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(parse_address_test
-  test/core/client_channel/parse_address_test.c
-)
-
-
-target_include_directories(parse_address_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(parse_address_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(percent_encoding_test
-  test/core/slice/percent_encoding_test.c
-)
-
-
-target_include_directories(percent_encoding_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(percent_encoding_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(pollset_set_test
-  test/core/iomgr/pollset_set_test.c
-)
-
-
-target_include_directories(pollset_set_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(pollset_set_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(resolve_address_posix_test
-  test/core/iomgr/resolve_address_posix_test.c
-)
-
-
-target_include_directories(resolve_address_posix_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(resolve_address_posix_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(resolve_address_test
-  test/core/iomgr/resolve_address_test.c
-)
-
-
-target_include_directories(resolve_address_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(resolve_address_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(resource_quota_test
-  test/core/iomgr/resource_quota_test.c
-)
-
-
-target_include_directories(resource_quota_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(resource_quota_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(secure_channel_create_test
-  test/core/surface/secure_channel_create_test.c
-)
-
-
-target_include_directories(secure_channel_create_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(secure_channel_create_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(secure_endpoint_test
-  test/core/security/secure_endpoint_test.c
-)
-
-
-target_include_directories(secure_endpoint_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(secure_endpoint_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(sequential_connectivity_test
-  test/core/surface/sequential_connectivity_test.c
-)
-
-
-target_include_directories(sequential_connectivity_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(sequential_connectivity_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_chttp2_test
-  test/core/surface/server_chttp2_test.c
-)
-
-
-target_include_directories(server_chttp2_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(server_chttp2_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_test
-  test/core/surface/server_test.c
-)
-
-
-target_include_directories(server_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(server_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(slice_buffer_test
-  test/core/slice/slice_buffer_test.c
-)
-
-
-target_include_directories(slice_buffer_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(slice_buffer_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(slice_hash_table_test
-  test/core/slice/slice_hash_table_test.c
-)
-
-
-target_include_directories(slice_hash_table_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(slice_hash_table_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(slice_string_helpers_test
-  test/core/slice/slice_string_helpers_test.c
-)
-
-
-target_include_directories(slice_string_helpers_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(slice_string_helpers_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(slice_test
-  test/core/slice/slice_test.c
-)
-
-
-target_include_directories(slice_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(slice_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(sockaddr_resolver_test
-  test/core/client_channel/resolvers/sockaddr_resolver_test.c
-)
-
-
-target_include_directories(sockaddr_resolver_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(sockaddr_resolver_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(sockaddr_utils_test
-  test/core/iomgr/sockaddr_utils_test.c
-)
-
-
-target_include_directories(sockaddr_utils_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(sockaddr_utils_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(socket_utils_test
-  test/core/iomgr/socket_utils_test.c
-)
-
-
-target_include_directories(socket_utils_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(socket_utils_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(status_conversion_test
-  test/core/transport/status_conversion_test.c
-)
-
-
-target_include_directories(status_conversion_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(status_conversion_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(stream_compression_test
-  test/core/compression/stream_compression_test.c
-)
-
-
-target_include_directories(stream_compression_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(stream_compression_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(stream_owned_slice_test
-  test/core/transport/stream_owned_slice_test.c
-)
-
-
-target_include_directories(stream_owned_slice_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(stream_owned_slice_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(tcp_client_posix_test
-  test/core/iomgr/tcp_client_posix_test.c
-)
-
-
-target_include_directories(tcp_client_posix_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(tcp_client_posix_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(tcp_client_uv_test
-  test/core/iomgr/tcp_client_uv_test.c
-)
-
-
-target_include_directories(tcp_client_uv_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(tcp_client_uv_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(tcp_posix_test
-  test/core/iomgr/tcp_posix_test.c
-)
-
-
-target_include_directories(tcp_posix_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(tcp_posix_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(tcp_server_posix_test
-  test/core/iomgr/tcp_server_posix_test.c
-)
-
-
-target_include_directories(tcp_server_posix_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(tcp_server_posix_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(tcp_server_uv_test
-  test/core/iomgr/tcp_server_uv_test.c
-)
-
-
-target_include_directories(tcp_server_uv_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(tcp_server_uv_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(time_averaged_stats_test
-  test/core/iomgr/time_averaged_stats_test.c
-)
-
-
-target_include_directories(time_averaged_stats_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(time_averaged_stats_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(timeout_encoding_test
-  test/core/transport/timeout_encoding_test.c
-)
-
-
-target_include_directories(timeout_encoding_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(timeout_encoding_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(timer_heap_test
-  test/core/iomgr/timer_heap_test.c
-)
-
-
-target_include_directories(timer_heap_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(timer_heap_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(timer_list_test
-  test/core/iomgr/timer_list_test.c
-)
-
-
-target_include_directories(timer_list_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(timer_list_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(transport_connectivity_state_test
-  test/core/transport/connectivity_state_test.c
-)
-
-
-target_include_directories(transport_connectivity_state_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(transport_connectivity_state_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(transport_metadata_test
-  test/core/transport/metadata_test.c
-)
-
-
-target_include_directories(transport_metadata_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(transport_metadata_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(transport_pid_controller_test
-  test/core/transport/pid_controller_test.c
-)
-
-
-target_include_directories(transport_pid_controller_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(transport_pid_controller_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(transport_security_test
-  test/core/tsi/transport_security_test.c
-)
-
-
-target_include_directories(transport_security_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(transport_security_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(udp_server_test
-  test/core/iomgr/udp_server_test.c
-)
-
-
-target_include_directories(udp_server_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(udp_server_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(uri_parser_test
-  test/core/client_channel/uri_parser_test.c
-)
-
-
-target_include_directories(uri_parser_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(uri_parser_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(wakeup_fd_cv_test
-  test/core/iomgr/wakeup_fd_cv_test.c
-)
-
-
-target_include_directories(wakeup_fd_cv_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(wakeup_fd_cv_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(alarm_cpp_test
-  test/cpp/common/alarm_cpp_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(alarm_cpp_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(alarm_cpp_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(async_end2end_test
-  test/cpp/end2end/async_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(async_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(async_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(auth_property_iterator_test
-  test/cpp/common/auth_property_iterator_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(auth_property_iterator_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(auth_property_iterator_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_arena
-  test/cpp/microbenchmarks/bm_arena.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_arena
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_arena
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_call_create
-  test/cpp/microbenchmarks/bm_call_create.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_call_create
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_call_create
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_chttp2_hpack
-  test/cpp/microbenchmarks/bm_chttp2_hpack.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_chttp2_hpack
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_chttp2_hpack
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_chttp2_transport
-  test/cpp/microbenchmarks/bm_chttp2_transport.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_chttp2_transport
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_chttp2_transport
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_closure
-  test/cpp/microbenchmarks/bm_closure.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_closure
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_closure
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_cq
-  test/cpp/microbenchmarks/bm_cq.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_cq
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_cq
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_cq_multiple_threads
-  test/cpp/microbenchmarks/bm_cq_multiple_threads.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_cq_multiple_threads
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_cq_multiple_threads
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_error
-  test/cpp/microbenchmarks/bm_error.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_error
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_error
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_fullstack_streaming_ping_pong
-  test/cpp/microbenchmarks/bm_fullstack_streaming_ping_pong.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_fullstack_streaming_ping_pong
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_fullstack_streaming_ping_pong
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_fullstack_streaming_pump
-  test/cpp/microbenchmarks/bm_fullstack_streaming_pump.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_fullstack_streaming_pump
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_fullstack_streaming_pump
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_fullstack_trickle
-  test/cpp/microbenchmarks/bm_fullstack_trickle.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_fullstack_trickle
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_fullstack_trickle
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_fullstack_unary_ping_pong
-  test/cpp/microbenchmarks/bm_fullstack_unary_ping_pong.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_fullstack_unary_ping_pong
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_fullstack_unary_ping_pong
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_metadata
-  test/cpp/microbenchmarks/bm_metadata.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_metadata
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_metadata
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bm_pollset
-  test/cpp/microbenchmarks/bm_pollset.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(bm_pollset
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(bm_pollset
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_benchmark
-  benchmark
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(channel_arguments_test
-  test/cpp/common/channel_arguments_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(channel_arguments_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(channel_arguments_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(channel_filter_test
-  test/cpp/common/channel_filter_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(channel_filter_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(channel_filter_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(cli_call_test
-  test/cpp/util/cli_call_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(cli_call_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(cli_call_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_cli_libs
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(client_crash_test
-  test/cpp/end2end/client_crash_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(client_crash_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(client_crash_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(client_crash_test_server
-  test/cpp/end2end/client_crash_test_server.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(client_crash_test_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(client_crash_test_server
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(client_lb_end2end_test
-  test/cpp/end2end/client_lb_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(client_lb_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(client_lb_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(codegen_test_full
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.h
-  test/cpp/codegen/codegen_test_full.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/control.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/payloads.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/services.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/stats.proto
-)
-
-target_include_directories(codegen_test_full
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(codegen_test_full
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(codegen_test_minimal
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/control.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/payloads.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/services.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/stats.grpc.pb.h
-  test/cpp/codegen/codegen_test_minimal.cc
-  src/cpp/codegen/codegen_init.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/control.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/payloads.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/services.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/stats.proto
-)
-
-target_include_directories(codegen_test_minimal
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(codegen_test_minimal
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(credentials_test
-  test/cpp/client/credentials_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(credentials_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(credentials_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(cxx_byte_buffer_test
-  test/cpp/util/byte_buffer_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(cxx_byte_buffer_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(cxx_byte_buffer_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(cxx_slice_test
-  test/cpp/util/slice_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(cxx_slice_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(cxx_slice_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(cxx_string_ref_test
-  test/cpp/util/string_ref_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(cxx_string_ref_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(cxx_string_ref_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(cxx_time_test
-  test/cpp/util/time_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(cxx_time_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(cxx_time_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(end2end_test
-  test/cpp/end2end/end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(error_details_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
-  test/cpp/util/error_details_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo_messages.proto
-)
-
-target_include_directories(error_details_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(error_details_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_error_details
-  grpc++
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(filter_end2end_test
-  test/cpp/end2end/filter_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(filter_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(filter_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(generic_end2end_test
-  test/cpp/end2end/generic_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(generic_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(generic_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(golden_file_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/compiler_test.grpc.pb.h
-  test/cpp/codegen/golden_file_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/compiler_test.proto
-)
-
-target_include_directories(golden_file_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(golden_file_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_cli
-  test/cpp/util/grpc_cli.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(grpc_cli
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_cli
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_cli_libs
-  grpc++_proto_reflection_desc_db
-  grpc++
-  grpc
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-
-add_executable(grpc_cpp_plugin
-  src/compiler/cpp_plugin.cc
-)
-
-
-target_include_directories(grpc_cpp_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_cpp_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_cpp_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(grpc_csharp_plugin
-  src/compiler/csharp_plugin.cc
-)
-
-
-target_include_directories(grpc_csharp_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_csharp_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_csharp_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(grpc_node_plugin
-  src/compiler/node_plugin.cc
-)
-
-
-target_include_directories(grpc_node_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_node_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_node_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(grpc_objective_c_plugin
-  src/compiler/objective_c_plugin.cc
-)
-
-
-target_include_directories(grpc_objective_c_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_objective_c_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_objective_c_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(grpc_php_plugin
-  src/compiler/php_plugin.cc
-)
-
-
-target_include_directories(grpc_php_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_php_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_php_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(grpc_python_plugin
-  src/compiler/python_plugin.cc
-)
-
-
-target_include_directories(grpc_python_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_python_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_python_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-
-add_executable(grpc_ruby_plugin
-  src/compiler/ruby_plugin.cc
-)
-
-
-target_include_directories(grpc_ruby_plugin
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_ruby_plugin
-  ${_gRPC_PROTOBUF_PROTOC_LIBRARIES}
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_plugin_support
-)
-
-
-if (gRPC_INSTALL)
-  install(TARGETS grpc_ruby_plugin EXPORT gRPCTargets
-    RUNTIME DESTINATION ${gRPC_INSTALL_BINDIR}
-    LIBRARY DESTINATION ${gRPC_INSTALL_LIBDIR}
-    ARCHIVE DESTINATION ${gRPC_INSTALL_LIBDIR}
-  )
-endif()
-
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpc_tool_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
-  test/cpp/util/grpc_tool_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo_messages.proto
-)
-
-target_include_directories(grpc_tool_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpc_tool_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_cli_libs
-  grpc++_proto_reflection_desc_db
-  grpc++_reflection
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpclb_api_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.h
-  test/cpp/grpclb/grpclb_api_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/lb/v1/load_balancer.proto
-)
-
-target_include_directories(grpclb_api_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpclb_api_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpclb_end2end_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.h
-  test/cpp/end2end/grpclb_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/lb/v1/load_balancer.proto
-)
-
-target_include_directories(grpclb_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpclb_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(grpclb_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/lb/v1/load_balancer.grpc.pb.h
-  test/cpp/grpclb/grpclb_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/lb/v1/load_balancer.proto
-)
-
-target_include_directories(grpclb_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(grpclb_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(health_service_end2end_test
-  test/cpp/end2end/health_service_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(health_service_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(health_service_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(http2_client
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(http2_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(http2_client
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  http2_client_main
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(hybrid_end2end_test
-  test/cpp/end2end/hybrid_end2end_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(hybrid_end2end_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(hybrid_end2end_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(interop_client
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(interop_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_client
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  interop_client_main
-  interop_client_helper
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(interop_server
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(interop_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_server
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  interop_server_main
-  interop_server_helper
-  interop_server_lib
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(interop_test
-  test/cpp/interop/interop_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(interop_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(interop_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(json_run_localhost
-  test/cpp/qps/json_run_localhost.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(json_run_localhost
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(json_run_localhost
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(memory_test
-  test/core/support/memory_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(memory_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(memory_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(metrics_client
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.h
-  test/cpp/interop/metrics_client.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/metrics.proto
-)
-
-target_include_directories(metrics_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(metrics_client
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(mock_test
-  test/cpp/end2end/mock_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(mock_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(mock_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(noop-benchmark
-  test/cpp/microbenchmarks/noop-benchmark.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(noop-benchmark
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(noop-benchmark
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  benchmark
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(proto_server_reflection_test
-  test/cpp/end2end/proto_server_reflection_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(proto_server_reflection_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(proto_server_reflection_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_proto_reflection_desc_db
-  grpc++_reflection
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(proto_utils_test
-  test/cpp/codegen/proto_utils_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(proto_utils_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(proto_utils_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(qps_interarrival_test
-  test/cpp/qps/qps_interarrival_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(qps_interarrival_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(qps_interarrival_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  qps
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(qps_json_driver
-  test/cpp/qps/qps_json_driver.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(qps_json_driver
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(qps_json_driver
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  qps
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(qps_openloop_test
-  test/cpp/qps/qps_openloop_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(qps_openloop_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(qps_openloop_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  qps
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(qps_worker
-  test/cpp/qps/worker.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(qps_worker
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(qps_worker
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  qps
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(reconnect_interop_client
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
-  test/cpp/interop/reconnect_interop_client.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/empty.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/test.proto
-)
-
-target_include_directories(reconnect_interop_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(reconnect_interop_client
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(reconnect_interop_server
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
-  test/cpp/interop/reconnect_interop_server.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/empty.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/test.proto
-)
-
-target_include_directories(reconnect_interop_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(reconnect_interop_server
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  reconnect_server
-  test_tcp_server
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(secure_auth_context_test
-  test/cpp/common/secure_auth_context_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(secure_auth_context_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(secure_auth_context_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(secure_sync_unary_ping_pong_test
-  test/cpp/qps/secure_sync_unary_ping_pong_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(secure_sync_unary_ping_pong_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(secure_sync_unary_ping_pong_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  qps
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_builder_plugin_test
-  test/cpp/end2end/server_builder_plugin_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(server_builder_plugin_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(server_builder_plugin_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_builder_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
-  test/cpp/server/server_builder_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo_messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo.proto
-)
-
-target_include_directories(server_builder_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(server_builder_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  gpr_test_util
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_context_test_spouse_test
-  test/cpp/test/server_context_test_spouse_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(server_context_test_spouse_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(server_context_test_spouse_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(server_crash_test
-  test/cpp/end2end/server_crash_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(server_crash_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(server_crash_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_crash_test_client
-  test/cpp/end2end/server_crash_test_client.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(server_crash_test_client
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(server_crash_test_client
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_request_call_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo_messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/echo.grpc.pb.h
-  test/cpp/server/server_request_call_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo_messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/echo.proto
-)
-
-target_include_directories(server_request_call_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(server_request_call_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  gpr_test_util
-  grpc++
-  grpc
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(shutdown_test
-  test/cpp/end2end/shutdown_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(shutdown_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(shutdown_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(status_test
-  test/cpp/util/status_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(status_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(status_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(streaming_throughput_test
-  test/cpp/end2end/streaming_throughput_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(streaming_throughput_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(streaming_throughput_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(stress_test
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/empty.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/messages.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/metrics.grpc.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.cc
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.pb.h
-  ${_gRPC_PROTO_GENS_DIR}/src/proto/grpc/testing/test.grpc.pb.h
-  test/cpp/interop/interop_client.cc
-  test/cpp/interop/stress_interop_client.cc
-  test/cpp/interop/stress_test.cc
-  test/cpp/util/metrics_server.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/empty.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/messages.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/metrics.proto
-)
-protobuf_generate_grpc_cpp(
-  src/proto/grpc/testing/test.proto
-)
-
-target_include_directories(stress_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(stress_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(thread_manager_test
-  test/cpp/thread_manager/thread_manager_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(thread_manager_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(thread_manager_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++
-  grpc
-  gpr
-  grpc++_test_config
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(thread_stress_test
-  test/cpp/end2end/thread_stress_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(thread_stress_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(thread_stress_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(writes_per_rpc_test
-  test/cpp/performance/writes_per_rpc_test.cc
-  third_party/googletest/googletest/src/gtest-all.cc
-  third_party/googletest/googlemock/src/gmock-all.cc
-)
-
-
-target_include_directories(writes_per_rpc_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-  PRIVATE third_party/googletest/googletest/include
-  PRIVATE third_party/googletest/googletest
-  PRIVATE third_party/googletest/googlemock/include
-  PRIVATE third_party/googletest/googlemock
-  PRIVATE ${_gRPC_PROTO_GENS_DIR}
-)
-
-target_link_libraries(writes_per_rpc_test
-  ${_gRPC_PROTOBUF_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc++_test_util
-  grpc_test_util
-  grpc++
-  grpc
-  gpr_test_util
-  gpr
-  ${_gRPC_GFLAGS_LIBRARIES}
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(public_headers_must_be_c89
-  test/core/surface/public_headers_must_be_c89.c
-)
-
-
-target_include_directories(public_headers_must_be_c89
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(public_headers_must_be_c89
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(badreq_bad_client_test
-  test/core/bad_client/tests/badreq.c
-)
-
-
-target_include_directories(badreq_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(badreq_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(connection_prefix_bad_client_test
-  test/core/bad_client/tests/connection_prefix.c
-)
-
-
-target_include_directories(connection_prefix_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(connection_prefix_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(head_of_line_blocking_bad_client_test
-  test/core/bad_client/tests/head_of_line_blocking.c
-)
-
-
-target_include_directories(head_of_line_blocking_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(head_of_line_blocking_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(headers_bad_client_test
-  test/core/bad_client/tests/headers.c
-)
-
-
-target_include_directories(headers_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(headers_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(initial_settings_frame_bad_client_test
-  test/core/bad_client/tests/initial_settings_frame.c
-)
-
-
-target_include_directories(initial_settings_frame_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(initial_settings_frame_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(large_metadata_bad_client_test
-  test/core/bad_client/tests/large_metadata.c
-)
-
-
-target_include_directories(large_metadata_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(large_metadata_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_registered_method_bad_client_test
-  test/core/bad_client/tests/server_registered_method.c
-)
-
-
-target_include_directories(server_registered_method_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(server_registered_method_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(simple_request_bad_client_test
-  test/core/bad_client/tests/simple_request.c
-)
-
-
-target_include_directories(simple_request_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(simple_request_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(unknown_frame_bad_client_test
-  test/core/bad_client/tests/unknown_frame.c
-)
-
-
-target_include_directories(unknown_frame_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(unknown_frame_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(window_overflow_bad_client_test
-  test/core/bad_client/tests/window_overflow.c
-)
-
-
-target_include_directories(window_overflow_bad_client_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(window_overflow_bad_client_test
-  ${_gRPC_SSL_LIBRARIES}
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_client_test
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bad_ssl_cert_server
-  test/core/bad_ssl/servers/cert.c
-)
-
-
-target_include_directories(bad_ssl_cert_server
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bad_ssl_cert_server
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  bad_ssl_test_server
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(bad_ssl_cert_test
-  test/core/bad_ssl/bad_ssl_test.c
-)
-
-
-target_include_directories(bad_ssl_cert_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(bad_ssl_cert_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_census_test
-  test/core/end2end/fixtures/h2_census.c
-)
-
-
-target_include_directories(h2_census_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_census_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_compress_test
-  test/core/end2end/fixtures/h2_compress.c
-)
-
-
-target_include_directories(h2_compress_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_compress_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_fakesec_test
-  test/core/end2end/fixtures/h2_fakesec.c
-)
-
-
-target_include_directories(h2_fakesec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_fakesec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(h2_fd_test
-  test/core/end2end/fixtures/h2_fd.c
-)
-
-
-target_include_directories(h2_fd_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_fd_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_full_test
-  test/core/end2end/fixtures/h2_full.c
-)
-
-
-target_include_directories(h2_full_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(h2_full+pipe_test
-  test/core/end2end/fixtures/h2_full+pipe.c
-)
-
-
-target_include_directories(h2_full+pipe_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full+pipe_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_full+trace_test
-  test/core/end2end/fixtures/h2_full+trace.c
-)
-
-
-target_include_directories(h2_full+trace_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full+trace_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_full+workarounds_test
-  test/core/end2end/fixtures/h2_full+workarounds.c
-)
-
-
-target_include_directories(h2_full+workarounds_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full+workarounds_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_http_proxy_test
-  test/core/end2end/fixtures/h2_http_proxy.c
-)
-
-
-target_include_directories(h2_http_proxy_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_http_proxy_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_load_reporting_test
-  test/core/end2end/fixtures/h2_load_reporting.c
-)
-
-
-target_include_directories(h2_load_reporting_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_load_reporting_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_oauth2_test
-  test/core/end2end/fixtures/h2_oauth2.c
-)
-
-
-target_include_directories(h2_oauth2_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_oauth2_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_proxy_test
-  test/core/end2end/fixtures/h2_proxy.c
-)
-
-
-target_include_directories(h2_proxy_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_proxy_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_sockpair_test
-  test/core/end2end/fixtures/h2_sockpair.c
-)
-
-
-target_include_directories(h2_sockpair_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_sockpair_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_sockpair+trace_test
-  test/core/end2end/fixtures/h2_sockpair+trace.c
-)
-
-
-target_include_directories(h2_sockpair+trace_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_sockpair+trace_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_sockpair_1byte_test
-  test/core/end2end/fixtures/h2_sockpair_1byte.c
-)
-
-
-target_include_directories(h2_sockpair_1byte_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_sockpair_1byte_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_ssl_test
-  test/core/end2end/fixtures/h2_ssl.c
-)
-
-
-target_include_directories(h2_ssl_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_ssl_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_ssl_cert_test
-  test/core/end2end/fixtures/h2_ssl_cert.c
-)
-
-
-target_include_directories(h2_ssl_cert_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_ssl_cert_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_ssl_proxy_test
-  test/core/end2end/fixtures/h2_ssl_proxy.c
-)
-
-
-target_include_directories(h2_ssl_proxy_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_ssl_proxy_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(h2_uds_test
-  test/core/end2end/fixtures/h2_uds.c
-)
-
-
-target_include_directories(h2_uds_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_uds_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(inproc_test
-  test/core/end2end/fixtures/inproc.c
-)
-
-
-target_include_directories(inproc_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(inproc_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_tests
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_census_nosec_test
-  test/core/end2end/fixtures/h2_census.c
-)
-
-
-target_include_directories(h2_census_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_census_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_compress_nosec_test
-  test/core/end2end/fixtures/h2_compress.c
-)
-
-
-target_include_directories(h2_compress_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_compress_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(h2_fd_nosec_test
-  test/core/end2end/fixtures/h2_fd.c
-)
-
-
-target_include_directories(h2_fd_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_fd_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_full_nosec_test
-  test/core/end2end/fixtures/h2_full.c
-)
-
-
-target_include_directories(h2_full_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX)
-
-add_executable(h2_full+pipe_nosec_test
-  test/core/end2end/fixtures/h2_full+pipe.c
-)
-
-
-target_include_directories(h2_full+pipe_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full+pipe_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_full+trace_nosec_test
-  test/core/end2end/fixtures/h2_full+trace.c
-)
-
-
-target_include_directories(h2_full+trace_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full+trace_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_full+workarounds_nosec_test
-  test/core/end2end/fixtures/h2_full+workarounds.c
-)
-
-
-target_include_directories(h2_full+workarounds_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_full+workarounds_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_http_proxy_nosec_test
-  test/core/end2end/fixtures/h2_http_proxy.c
-)
-
-
-target_include_directories(h2_http_proxy_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_http_proxy_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_load_reporting_nosec_test
-  test/core/end2end/fixtures/h2_load_reporting.c
-)
-
-
-target_include_directories(h2_load_reporting_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_load_reporting_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_proxy_nosec_test
-  test/core/end2end/fixtures/h2_proxy.c
-)
-
-
-target_include_directories(h2_proxy_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_proxy_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_sockpair_nosec_test
-  test/core/end2end/fixtures/h2_sockpair.c
-)
-
-
-target_include_directories(h2_sockpair_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_sockpair_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_sockpair+trace_nosec_test
-  test/core/end2end/fixtures/h2_sockpair+trace.c
-)
-
-
-target_include_directories(h2_sockpair+trace_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_sockpair+trace_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(h2_sockpair_1byte_nosec_test
-  test/core/end2end/fixtures/h2_sockpair_1byte.c
-)
-
-
-target_include_directories(h2_sockpair_1byte_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_sockpair_1byte_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-if(_gRPC_PLATFORM_LINUX OR _gRPC_PLATFORM_MAC OR _gRPC_PLATFORM_POSIX)
-
-add_executable(h2_uds_nosec_test
-  test/core/end2end/fixtures/h2_uds.c
-)
-
-
-target_include_directories(h2_uds_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(h2_uds_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif()
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(inproc_nosec_test
-  test/core/end2end/fixtures/inproc.c
-)
-
-
-target_include_directories(inproc_nosec_test
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(inproc_nosec_test
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  end2end_nosec_tests
-  grpc_test_util_unsecure
-  grpc_unsecure
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(api_fuzzer_one_entry
-  test/core/end2end/fuzzers/api_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(api_fuzzer_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(api_fuzzer_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(client_fuzzer_one_entry
-  test/core/end2end/fuzzers/client_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(client_fuzzer_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(client_fuzzer_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(hpack_parser_fuzzer_test_one_entry
-  test/core/transport/chttp2/hpack_parser_fuzzer_test.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(hpack_parser_fuzzer_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(hpack_parser_fuzzer_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(http_request_fuzzer_test_one_entry
-  test/core/http/request_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(http_request_fuzzer_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(http_request_fuzzer_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(http_response_fuzzer_test_one_entry
-  test/core/http/response_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(http_response_fuzzer_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(http_response_fuzzer_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(json_fuzzer_test_one_entry
-  test/core/json/fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(json_fuzzer_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(json_fuzzer_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(nanopb_fuzzer_response_test_one_entry
-  test/core/nanopb/fuzzer_response.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(nanopb_fuzzer_response_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(nanopb_fuzzer_response_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(nanopb_fuzzer_serverlist_test_one_entry
-  test/core/nanopb/fuzzer_serverlist.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(nanopb_fuzzer_serverlist_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(nanopb_fuzzer_serverlist_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(percent_decode_fuzzer_one_entry
-  test/core/slice/percent_decode_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(percent_decode_fuzzer_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(percent_decode_fuzzer_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(percent_encode_fuzzer_one_entry
-  test/core/slice/percent_encode_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(percent_encode_fuzzer_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(percent_encode_fuzzer_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(server_fuzzer_one_entry
-  test/core/end2end/fuzzers/server_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(server_fuzzer_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(server_fuzzer_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(ssl_server_fuzzer_one_entry
-  test/core/security/ssl_server_fuzzer.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(ssl_server_fuzzer_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(ssl_server_fuzzer_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-if (gRPC_BUILD_TESTS)
-
-add_executable(uri_fuzzer_test_one_entry
-  test/core/client_channel/uri_fuzzer_test.c
-  test/core/util/one_corpus_entry_fuzzer.c
-)
-
-
-target_include_directories(uri_fuzzer_test_one_entry
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
-  PRIVATE ${BORINGSSL_ROOT_DIR}/include
-  PRIVATE ${PROTOBUF_ROOT_DIR}/src
-  PRIVATE ${BENCHMARK_ROOT_DIR}/include
-  PRIVATE ${ZLIB_ROOT_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/zlib
-  PRIVATE ${CARES_BUILD_INCLUDE_DIR}
-  PRIVATE ${CARES_INCLUDE_DIR}
-  PRIVATE ${CARES_PLATFORM_INCLUDE_DIR}
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/cares/cares
-  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/third_party/gflags/include
-)
-
-target_link_libraries(uri_fuzzer_test_one_entry
-  ${_gRPC_ALLTARGETS_LIBRARIES}
-  grpc_test_util
-  grpc
-  gpr_test_util
-  gpr
-)
-
-endif (gRPC_BUILD_TESTS)
-
-
-
-
-
-
-
-if (gRPC_INSTALL)
-  install(EXPORT gRPCTargets
-    DESTINATION ${gRPC_INSTALL_CMAKEDIR}
-    NAMESPACE gRPC::
-  )
-endif()
-
-foreach(_config gRPCConfig gRPCConfigVersion)
-  configure_file(tools/cmake/${_config}.cmake.in
-    ${_config}.cmake @ONLY)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${_config}.cmake
-    DESTINATION ${gRPC_INSTALL_CMAKEDIR}
-  )
-endforeach()
diff --git a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
index fbd89bad079c5d7f6c2909ca643f4c175428e77f..aaae18a313dd082b428654091c9411600c981ec9 100644
--- a/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/patches/nsync/CMakeLists.txt
@@ -61,9 +61,15 @@ if ("${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
     )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "DarwinX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/macos")
+    include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
+    # Some versions of MacOS, such as Sierra, require _DARWIN_C_SOURCE
+    # when including certin C++ standard header files, such as <mutex>.
+    add_definitions ("-D_DARWIN_C_SOURCE")
     add_compile_options ("-std=c++11")
     set (NSYNC_OS_SRC
       ${NSYNC_OS_CPP_SRC}
+      "platform/posix/src/clock_gettime.c"
+      "platform/posix/src/nsync_semaphore_mutex.c"
     )
     set (NSYNC_TEST_OS_SRC
       "platform/posix/src/start_thread.c"
@@ -138,6 +144,10 @@ if (NOT "${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "DarwinX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/macos")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/clock_gettime.c"
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
     include_directories ("${PROJECT_SOURCE_DIR}/platform/posix")
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "LinuxX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/linux")
@@ -148,12 +158,21 @@ if (NOT "${NSYNC_LANGUAGE}X" STREQUAL "c++11X")
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "NetBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/netbsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "FreeBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/freebsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   elseif ("${CMAKE_SYSTEM_NAME}X" STREQUAL "OpenBSDX")
     include_directories ("${PROJECT_SOURCE_DIR}/platform/openbsd")
     set (NSYNC_POSIX ON)
+    set (NSYNC_OS_EXTRA_SRC
+      "platform/posix/src/nsync_semaphore_mutex.c"
+    )
   endif ()
 endif ()
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0fca690ef6bedc5a872498583dfd0cbb55e2143
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -0,0 +1,449 @@
+tensorflow
+tensorflow/core
+tensorflow/core/example
+tensorflow/core/framework
+tensorflow/core/lib
+tensorflow/core/lib/core
+tensorflow/core/protobuf
+tensorflow/core/util
+tensorflow/examples
+tensorflow/examples/tutorials
+tensorflow/examples/tutorials/mnist
+tensorflow/python
+tensorflow/python/client
+tensorflow/python/data
+tensorflow/python/data/ops
+tensorflow/python/data/util
+tensorflow/python/debug
+tensorflow/python/debug/cli
+tensorflow/python/debug/examples
+tensorflow/python/debug/lib
+tensorflow/python/debug/wrappers
+tensorflow/python/eager
+tensorflow/python/estimator
+tensorflow/python/estimator/canned
+tensorflow/python/estimator/export
+tensorflow/python/estimator/inputs
+tensorflow/python/estimator/inputs/queues
+tensorflow/python/feature_column
+tensorflow/python/framework
+tensorflow/python/grappler
+tensorflow/python/keras
+tensorflow/python/keras/activations
+tensorflow/python/keras/applications
+tensorflow/python/keras/applications/inception_resnet_v2
+tensorflow/python/keras/applications/inception_v3
+tensorflow/python/keras/applications/mobilenet
+tensorflow/python/keras/applications/resnet50
+tensorflow/python/keras/applications/vgg16
+tensorflow/python/keras/applications/vgg19
+tensorflow/python/keras/applications/xception
+tensorflow/python/keras/backend
+tensorflow/python/keras/callbacks
+tensorflow/python/keras/constraints
+tensorflow/python/keras/datasets
+tensorflow/python/keras/datasets/boston_housing
+tensorflow/python/keras/datasets/cifar10
+tensorflow/python/keras/datasets/cifar100
+tensorflow/python/keras/datasets/fashion_mnist
+tensorflow/python/keras/datasets/imdb
+tensorflow/python/keras/datasets/mnist
+tensorflow/python/keras/datasets/reuters
+tensorflow/python/keras/estimator
+tensorflow/python/keras/initializers
+tensorflow/python/keras/layers
+tensorflow/python/keras/losses
+tensorflow/python/keras/metrics
+tensorflow/python/keras/models
+tensorflow/python/keras/optimizers
+tensorflow/python/keras/preprocessing
+tensorflow/python/keras/preprocessing/image
+tensorflow/python/keras/preprocessing/sequence
+tensorflow/python/keras/preprocessing/text
+tensorflow/python/keras/regularizers
+tensorflow/python/keras/utils
+tensorflow/python/keras/wrappers
+tensorflow/python/keras/wrappers/scikit_learn
+tensorflow/python/keras/_impl
+tensorflow/python/keras/_impl/keras
+tensorflow/python/keras/_impl/keras/applications
+tensorflow/python/keras/_impl/keras/datasets
+tensorflow/python/keras/_impl/keras/engine
+tensorflow/python/keras/_impl/keras/layers
+tensorflow/python/keras/_impl/keras/preprocessing
+tensorflow/python/keras/_impl/keras/utils
+tensorflow/python/keras/_impl/keras/wrappers
+tensorflow/python/kernel_tests
+tensorflow/python/kernel_tests/distributions
+tensorflow/python/kernel_tests/linalg
+tensorflow/python/kernel_tests/random
+tensorflow/python/layers
+tensorflow/python/lib
+tensorflow/python/lib/core
+tensorflow/python/lib/io
+tensorflow/python/ops
+tensorflow/python/ops/distributions
+tensorflow/python/ops/linalg
+tensorflow/python/ops/losses
+tensorflow/python/platform
+tensorflow/python/platform/default
+tensorflow/python/platform/summary
+tensorflow/python/profiler/
+tensorflow/python/profiler/internal
+tensorflow/python/saved_model
+tensorflow/python/summary
+tensorflow/python/summary/writer
+tensorflow/python/tools
+tensorflow/python/training
+tensorflow/python/user_ops
+tensorflow/python/util
+tensorflow/python/util/protobuf
+tensorflow/tools
+tensorflow/tools/graph_transforms
+tensorflow/contrib
+tensorflow/contrib/all_reduce
+tensorflow/contrib/all_reduce/python
+tensorflow/contrib/android
+tensorflow/contrib/android/java
+tensorflow/contrib/android/java/org
+tensorflow/contrib/android/java/org/tensorflow
+tensorflow/contrib/android/java/org/tensorflow/contrib
+tensorflow/contrib/android/java/org/tensorflow/contrib/android
+tensorflow/contrib/android/jni
+tensorflow/contrib/batching
+tensorflow/contrib/batching/kernels
+tensorflow/contrib/batching/python
+tensorflow/contrib/batching/python/ops
+tensorflow/contrib/bayesflow
+tensorflow/contrib/bayesflow/examples
+tensorflow/contrib/bayesflow/examples/reinforce_simple
+tensorflow/contrib/bayesflow/python
+tensorflow/contrib/bayesflow/python/ops
+tensorflow/contrib/boosted_trees
+tensorflow/contrib/boosted_trees/estimator_batch
+tensorflow/contrib/boosted_trees/kernels
+tensorflow/contrib/boosted_trees/ops
+tensorflow/contrib/boosted_trees/proto
+tensorflow/contrib/boosted_trees/python
+tensorflow/contrib/boosted_trees/python/ops
+tensorflow/contrib/cloud
+tensorflow/contrib/cloud/kernels
+tensorflow/contrib/cloud/ops
+tensorflow/contrib/cloud/python
+tensorflow/contrib/cloud/python/ops
+tensorflow/contrib/cluster_resolver
+tensorflow/contrib/cluster_resolver/python
+tensorflow/contrib/cluster_resolver/python/training
+tensorflow/contrib/compiler
+tensorflow/contrib/copy_graph
+tensorflow/contrib/copy_graph/python
+tensorflow/contrib/copy_graph/python/util
+tensorflow/contrib/crf
+tensorflow/contrib/crf/python
+tensorflow/contrib/crf/python/ops
+tensorflow/contrib/cudnn_rnn
+tensorflow/contrib/cudnn_rnn/kernels
+tensorflow/contrib/cudnn_rnn/ops
+tensorflow/contrib/cudnn_rnn/python
+tensorflow/contrib/cudnn_rnn/python/layers
+tensorflow/contrib/cudnn_rnn/python/ops
+tensorflow/contrib/data
+tensorflow/contrib/data/kernels
+tensorflow/contrib/data/python
+tensorflow/contrib/data/python/kernel_tests
+tensorflow/contrib/data/python/ops
+tensorflow/contrib/decision_trees
+tensorflow/contrib/decision_trees/proto
+tensorflow/contrib/deprecated
+tensorflow/contrib/distributions
+tensorflow/contrib/distributions/python
+tensorflow/contrib/distributions/python/ops
+tensorflow/contrib/distributions/python/ops/bijectors
+tensorflow/contrib/eager
+tensorflow/contrib/eager/python
+tensorflow/contrib/estimator
+tensorflow/contrib/estimator/python
+tensorflow/contrib/estimator/python/estimator
+tensorflow/contrib/factorization
+tensorflow/contrib/factorization/examples
+tensorflow/contrib/factorization/kernels
+tensorflow/contrib/factorization/ops
+tensorflow/contrib/factorization/python
+tensorflow/contrib/factorization/python/ops
+tensorflow/contrib/ffmpeg
+tensorflow/contrib/ffmpeg/default
+tensorflow/contrib/framework
+tensorflow/contrib/framework/kernels
+tensorflow/contrib/framework/ops
+tensorflow/contrib/framework/python
+tensorflow/contrib/framework/python/framework
+tensorflow/contrib/framework/python/ops
+tensorflow/contrib/fused_conv
+tensorflow/contrib/fused_conv/kernels
+tensorflow/contrib/fused_conv/python
+tensorflow/contrib/fused_conv/python/ops
+tensorflow/contrib/gan
+tensorflow/contrib/gan/python
+tensorflow/contrib/gan/python/estimator
+tensorflow/contrib/gan/python/estimator/python
+tensorflow/contrib/gan/python/eval
+tensorflow/contrib/gan/python/eval/python
+tensorflow/contrib/gan/python/features
+tensorflow/contrib/gan/python/features/python
+tensorflow/contrib/gan/python/losses
+tensorflow/contrib/gan/python/losses/python
+tensorflow/contrib/graph_editor
+tensorflow/contrib/graph_editor/examples
+tensorflow/contrib/grid_rnn
+tensorflow/contrib/grid_rnn/python
+tensorflow/contrib/grid_rnn/python/ops
+tensorflow/contrib/hooks
+tensorflow/contrib/hooks/python
+tensorflow/contrib/image
+tensorflow/contrib/image/kernels
+tensorflow/contrib/image/ops
+tensorflow/contrib/image/python
+tensorflow/contrib/image/python/ops
+tensorflow/contrib/input_pipeline
+tensorflow/contrib/input_pipeline/kernels
+tensorflow/contrib/input_pipeline/ops
+tensorflow/contrib/input_pipeline/python
+tensorflow/contrib/input_pipeline/python/ops
+tensorflow/contrib/integrate
+tensorflow/contrib/integrate/python
+tensorflow/contrib/integrate/python/ops
+tensorflow/contrib/ios_examples
+tensorflow/contrib/ios_examples/benchmark
+tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj
+tensorflow/contrib/ios_examples/benchmark/data
+tensorflow/contrib/ios_examples/camera
+tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj
+tensorflow/contrib/ios_examples/camera/en.lproj
+tensorflow/contrib/ios_examples/simple
+tensorflow/contrib/ios_examples/simple/data
+tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj
+tensorflow/contrib/keras
+tensorflow/contrib/keras/api
+tensorflow/contrib/keras/api/keras
+tensorflow/contrib/keras/api/keras/activations
+tensorflow/contrib/keras/api/keras/applications
+tensorflow/contrib/keras/api/keras/applications/inception_v3
+tensorflow/contrib/keras/api/keras/applications/mobilenet
+tensorflow/contrib/keras/api/keras/applications/resnet50
+tensorflow/contrib/keras/api/keras/applications/vgg16
+tensorflow/contrib/keras/api/keras/applications/vgg19
+tensorflow/contrib/keras/api/keras/applications/xception
+tensorflow/contrib/keras/api/keras/backend
+tensorflow/contrib/keras/api/keras/callbacks
+tensorflow/contrib/keras/api/keras/constraints
+tensorflow/contrib/keras/api/keras/datasets
+tensorflow/contrib/keras/api/keras/datasets/boston_housing
+tensorflow/contrib/keras/api/keras/datasets/cifar10
+tensorflow/contrib/keras/api/keras/datasets/cifar100
+tensorflow/contrib/keras/api/keras/datasets/imdb
+tensorflow/contrib/keras/api/keras/datasets/mnist
+tensorflow/contrib/keras/api/keras/datasets/reuters
+tensorflow/contrib/keras/api/keras/initializers
+tensorflow/contrib/keras/api/keras/layers
+tensorflow/contrib/keras/api/keras/losses
+tensorflow/contrib/keras/api/keras/metrics
+tensorflow/contrib/keras/api/keras/models
+tensorflow/contrib/keras/api/keras/optimizers
+tensorflow/contrib/keras/api/keras/preprocessing
+tensorflow/contrib/keras/api/keras/preprocessing/image
+tensorflow/contrib/keras/api/keras/preprocessing/sequence
+tensorflow/contrib/keras/api/keras/preprocessing/text
+tensorflow/contrib/keras/api/keras/regularizers
+tensorflow/contrib/keras/api/keras/utils
+tensorflow/contrib/keras/api/keras/wrappers
+tensorflow/contrib/keras/api/keras/wrappers/scikit_learn
+tensorflow/contrib/kernel_methods
+tensorflow/contrib/kernel_methods/python
+tensorflow/contrib/kernel_methods/python/mappers
+tensorflow/contrib/kfac
+tensorflow/contrib/kfac/examples
+tensorflow/contrib/kfac/python
+tensorflow/contrib/kfac/python/ops
+tensorflow/contrib/labeled_tensor
+tensorflow/contrib/labeled_tensor/python
+tensorflow/contrib/labeled_tensor/python/ops
+tensorflow/contrib/layers
+tensorflow/contrib/layers/kernels
+tensorflow/contrib/layers/ops
+tensorflow/contrib/layers/python
+tensorflow/contrib/layers/python/layers
+tensorflow/contrib/layers/python/ops
+tensorflow/contrib/learn
+tensorflow/contrib/learn/python
+tensorflow/contrib/learn/python/learn
+tensorflow/contrib/learn/python/learn/dataframe
+tensorflow/contrib/learn/python/learn/dataframe/queues
+tensorflow/contrib/learn/python/learn/dataframe/transforms
+tensorflow/contrib/learn/python/learn/datasets
+tensorflow/contrib/learn/python/learn/datasets/data
+tensorflow/contrib/learn/python/learn/estimators
+tensorflow/contrib/learn/python/learn/learn_io
+tensorflow/contrib/learn/python/learn/ops
+tensorflow/contrib/learn/python/learn/preprocessing
+tensorflow/contrib/learn/python/learn/utils
+tensorflow/contrib/legacy_seq2seq
+tensorflow/contrib/legacy_seq2seq/python
+tensorflow/contrib/legacy_seq2seq/python/ops
+tensorflow/contrib/linalg
+tensorflow/contrib/linalg/python
+tensorflow/contrib/linalg/python/ops
+tensorflow/contrib/linear_optimizer
+tensorflow/contrib/linear_optimizer/kernels
+tensorflow/contrib/linear_optimizer/kernels/g3doc
+tensorflow/contrib/linear_optimizer/python
+tensorflow/contrib/linear_optimizer/python/ops
+tensorflow/contrib/lookup
+tensorflow/contrib/losses
+tensorflow/contrib/losses/python
+tensorflow/contrib/losses/python/losses
+tensorflow/contrib/losses/python/metric_learning
+tensorflow/contrib/makefile
+tensorflow/contrib/memory_stats
+tensorflow/contrib/memory_stats/kernels
+tensorflow/contrib/memory_stats/ops
+tensorflow/contrib/memory_stats/python
+tensorflow/contrib/memory_stats/python/ops
+tensorflow/contrib/meta_graph_transform
+tensorflow/contrib/metrics
+tensorflow/contrib/metrics/ops
+tensorflow/contrib/metrics/python
+tensorflow/contrib/metrics/python/metrics
+tensorflow/contrib/metrics/python/ops
+tensorflow/contrib/model_pruning
+tensorflow/contrib/model_pruning/examples
+tensorflow/contrib/model_pruning/examples/cifar10
+tensorflow/contrib/model_pruning/python
+tensorflow/contrib/model_pruning/python/layers
+tensorflow/contrib/nccl
+tensorflow/contrib/nccl/kernels
+tensorflow/contrib/nccl/ops
+tensorflow/contrib/nccl/python
+tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/ndlstm
+tensorflow/contrib/ndlstm/python
+tensorflow/contrib/nearest_neighbor/kernels
+tensorflow/contrib/nearest_neighbor/ops
+tensorflow/contrib/nearest_neighbor/python
+tensorflow/contrib/nearest_neighbor/python/ops
+tensorflow/contrib/nn
+tensorflow/contrib/nn/python
+tensorflow/contrib/nn/python/ops
+tensorflow/contrib/opt
+tensorflow/contrib/opt/python
+tensorflow/contrib/opt/python/training
+tensorflow/contrib/pi_examples
+tensorflow/contrib/pi_examples/camera
+tensorflow/contrib/pi_examples/label_image
+tensorflow/contrib/pi_examples/label_image/data
+tensorflow/contrib/periodic_resample
+tensorflow/contrib/periodic_resample/python
+tensorflow/contrib/periodic_resample/python/kernels
+tensorflow/contrib/periodic_resample/python/ops
+tensorflow/contrib/predictor
+tensorflow/contrib/quantization
+tensorflow/contrib/quantization/python
+tensorflow/contrib/quantize
+tensorflow/contrib/quantize/python
+tensorflow/contrib/receptive_field
+tensorflow/contrib/receptive_field/python
+tensorflow/contrib/reduce_slice_ops
+tensorflow/contrib/reduce_slice_ops/kernels
+tensorflow/contrib/reduce_slice_ops/ops
+tensorflow/contrib/reduce_slice_ops/python
+tensorflow/contrib/reduce_slice_ops/python/ops
+tensorflow/contrib/remote_fused_graph/pylib
+tensorflow/contrib/remote_fused_graph/pylib/python
+tensorflow/contrib/remote_fused_graph/pylib/python/ops
+tensorflow/contrib/resampler
+tensorflow/contrib/resampler/kernels
+tensorflow/contrib/resampler/ops
+tensorflow/contrib/resampler/python
+tensorflow/contrib/resampler/python/ops
+tensorflow/contrib/rnn
+tensorflow/contrib/rnn/kernels
+tensorflow/contrib/rnn/ops
+tensorflow/contrib/rnn/python
+tensorflow/contrib/rnn/python/kernel_tests
+tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/saved_model
+tensorflow/contrib/saved_model/python
+tensorflow/contrib/saved_model/python/saved_model
+tensorflow/contrib/seq2seq
+tensorflow/contrib/seq2seq/kernels
+tensorflow/contrib/seq2seq/ops
+tensorflow/contrib/seq2seq/python
+tensorflow/contrib/seq2seq/python/ops
+tensorflow/contrib/session_bundle
+tensorflow/contrib/session_bundle/example
+tensorflow/contrib/signal
+tensorflow/contrib/signal/python
+tensorflow/contrib/signal/python/ops
+tensorflow/contrib/slim
+tensorflow/contrib/slim/python
+tensorflow/contrib/slim/python/slim
+tensorflow/contrib/slim/python/slim/data
+tensorflow/contrib/slim/python/slim/nets
+tensorflow/contrib/solvers
+tensorflow/contrib/solvers/python
+tensorflow/contrib/solvers/python/ops
+tensorflow/contrib/sparsemax
+tensorflow/contrib/sparsemax/python
+tensorflow/contrib/sparsemax/python/ops
+tensorflow/contrib/specs
+tensorflow/contrib/specs/python
+tensorflow/contrib/staging
+tensorflow/contrib/stat_summarizer
+tensorflow/contrib/stat_summarizer/python
+tensorflow/contrib/stateless
+tensorflow/contrib/stateless/python
+tensorflow/contrib/summary
+tensorflow/contrib/tensorboard
+tensorflow/contrib/tensorboard/plugins
+tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensor_forest
+tensorflow/contrib/tensor_forest/client
+tensorflow/contrib/tensor_forest/core
+tensorflow/contrib/tensor_forest/core/ops
+tensorflow/contrib/tensor_forest/data
+tensorflow/contrib/tensor_forest/hybrid
+tensorflow/contrib/tensor_forest/hybrid/core
+tensorflow/contrib/tensor_forest/hybrid/core/ops
+tensorflow/contrib/tensor_forest/hybrid/ops
+tensorflow/contrib/tensor_forest/hybrid/python
+tensorflow/contrib/tensor_forest/hybrid/python/layers
+tensorflow/contrib/tensor_forest/hybrid/python/models
+tensorflow/contrib/tensor_forest/hybrid/python/ops
+tensorflow/contrib/tensor_forest/kernels
+tensorflow/contrib/tensor_forest/python
+tensorflow/contrib/tensor_forest/python/ops
+tensorflow/contrib/testing
+tensorflow/contrib/testing/python
+tensorflow/contrib/testing/python/framework
+tensorflow/contrib/text
+tensorflow/contrib/text/kernels
+tensorflow/contrib/text/ops
+tensorflow/contrib/text/python
+tensorflow/contrib/text/python/ops
+tensorflow/contrib/tfprof
+tensorflow/contrib/timeseries
+tensorflow/contrib/timeseries/examples
+tensorflow/contrib/timeseries/examples/data
+tensorflow/contrib/timeseries/python
+tensorflow/contrib/timeseries/python/timeseries
+tensorflow/contrib/timeseries/python/timeseries/state_space_models
+tensorflow/contrib/tpu
+tensorflow/contrib/tpu/ops
+tensorflow/contrib/tpu/profiler
+tensorflow/contrib/tpu/python
+tensorflow/contrib/tpu/python/ops
+tensorflow/contrib/tpu/python/profiler
+tensorflow/contrib/tpu/python/tpu
+tensorflow/contrib/training
+tensorflow/contrib/training/python
+tensorflow/contrib/training/python/training
+tensorflow/contrib/util
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8a9c406d8b118c10ddcaafb0e4fc242aa79cdb57
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -0,0 +1,19 @@
+tensorflow/core
+tensorflow/core/profiler
+tensorflow/python
+tensorflow/contrib/boosted_trees/proto
+tensorflow/contrib/cloud/kernels
+tensorflow/contrib/decision_trees/proto
+tensorflow/contrib/gdr
+tensorflow/contrib/lite/toco
+tensorflow/contrib/mpi
+tensorflow/contrib/mpi_collectives
+tensorflow/contrib/session_bundle
+tensorflow/contrib/tensor_forest/proto
+tensorflow/contrib/tensorboard/graph_explorer/proto
+tensorflow/contrib/tensorboard/plugins/projector
+tensorflow/contrib/tensorboard/plugins/trace
+tensorflow/contrib/tpu/proto
+tensorflow/contrib/tpu/profiler
+tensorflow/contrib/training/python/training
+tensorflow/contrib/verbs
diff --git a/tensorflow/contrib/cmake/python_protos_cc.txt b/tensorflow/contrib/cmake/python_protos_cc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d4a257b25c814a1464308d0e6ce3ce65d21f6a36
--- /dev/null
+++ b/tensorflow/contrib/cmake/python_protos_cc.txt
@@ -0,0 +1,5 @@
+tensorflow/core/profiler
+tensorflow/python
+tensorflow/contrib/session_bundle
+tensorflow/contrib/tensorboard
+tensorflow/contrib/training
diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake
index f3882e8cf76c6dad31371fc340de959c05411a2f..c6a15f2ca075c8de96786a580c7ddb89541df5bc 100644
--- a/tensorflow/contrib/cmake/tf_c.cmake
+++ b/tensorflow/contrib/cmake/tf_c.cmake
@@ -21,7 +21,6 @@ set(tf_c_srcs
     "${tensorflow_source_dir}/tensorflow/c/c_api_function.cc"
     "${tensorflow_source_dir}/tensorflow/c/eager/c_api.cc"
     "${tensorflow_source_dir}/tensorflow/c/eager/c_api.h"
-    "${tensorflow_source_dir}/tensorflow/c/eager/tape.cc"
     "${tensorflow_source_dir}/tensorflow/c/eager/tape.h"
     "${tensorflow_source_dir}/tensorflow/c/eager/runtime.cc"
     "${tensorflow_source_dir}/tensorflow/c/eager/runtime.h"
@@ -47,4 +46,5 @@ add_dependencies(
   tf_c_python_api
   tf_c
   tf_core_lib
+  tf_core_framework
   tf_protos_cc)
diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake
index a5f5ae5478f3ca82f428d494f2822d0c69064b98..6e2ac203f9a7f96cb14752a91483840a9eb6b451 100644
--- a/tensorflow/contrib/cmake/tf_cc_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake
@@ -83,7 +83,7 @@ foreach(tf_cc_op_lib_name ${tf_cc_op_lib_names})
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}_internal.h
                ${cc_ops_target_dir}/${tf_cc_op_lib_name}_internal.cc
-        COMMAND ${tf_cc_op_lib_name}_gen_cc ${cc_ops_target_dir}/${tf_cc_op_lib_name}.h ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc ${tensorflow_source_dir}/tensorflow/cc/ops/op_gen_overrides.pbtxt ${cc_ops_include_internal}
+        COMMAND ${tf_cc_op_lib_name}_gen_cc ${cc_ops_target_dir}/${tf_cc_op_lib_name}.h ${cc_ops_target_dir}/${tf_cc_op_lib_name}.cc ${tensorflow_source_dir}/tensorflow/cc/ops/op_gen_overrides.pbtxt ${cc_ops_include_internal} ${tensorflow_source_dir}/tensorflow/core/api_def/base_api
         DEPENDS ${tf_cc_op_lib_name}_gen_cc create_cc_ops_header_dir
     )
 
@@ -148,7 +148,11 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs})
 add_library(tf_cc OBJECT ${tf_cc_srcs})
 add_dependencies(tf_cc tf_cc_framework tf_cc_ops)
 
-set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+if (WIN32)
+  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib")
+else (WIN32)
+  set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so")
+endif (WIN32)
 add_custom_target(tf_extension_ops)
 
 function(AddUserOps)
@@ -164,15 +168,13 @@ function(AddUserOps)
   # create shared library from source and cuda obj
   add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib})
   target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib})
-  if(WIN32)
-    if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
-        # some ops call out to cuda directly; need to link libs for the cuda dlls
-        target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
-    endif()
-    if (_AT_DISTCOPY)
-        add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
-            COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
-    endif()
+  if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES)
+      # some ops call out to cuda directly; need to link libs for the cuda dlls
+      target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES})
+  endif()
+  if (_AT_DISTCOPY)
+      add_custom_command(TARGET ${_AT_TARGET} POST_BUILD
+          COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${_AT_TARGET}> ${_AT_DISTCOPY}/)
   endif()
   if (_AT_DEPENDS)
     add_dependencies(${_AT_TARGET} ${_AT_DEPENDS})
@@ -180,9 +182,19 @@ function(AddUserOps)
   # make sure TF_COMPILE_LIBRARY is not defined for this target
   get_target_property(target_compile_flags  ${_AT_TARGET} COMPILE_FLAGS)
   if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND")
-    set(target_compile_flags "/UTF_COMPILE_LIBRARY")
+    if (WIN32)
+      set(target_compile_flags "/UTF_COMPILE_LIBRARY")
+    else (WIN32)
+      # gcc uses UTF as default
+      set(target_compile_flags "-finput-charset=UTF-8")
+    endif (WIN32)
   else()
-    set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
+    if (WIN32)
+      set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY")
+    else (WIN32)
+      # gcc uses UTF as default
+      set(target_compile_flags "${target_compile_flags} -finput-charset=UTF-8")
+    endif (WIN32)
   endif()
   set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags})
   add_dependencies(tf_extension_ops ${_AT_TARGET})
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 5c01ca382fb9cc7a01a6f2b60a510c59f0aa7119..e4213ea2a47da2a7381cccd0504235ad62018d4e 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -63,7 +63,7 @@ if (tensorflow_ENABLE_GPU)
   file(GLOB_RECURSE tf_core_gpu_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu/cupti_wrapper.cc"
-    "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc"
+    "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.h"
     "${tensorflow_source_dir}/tensorflow/core/grappler/devices.cc"
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index c3dc8531bb9f0164f06841d9715f227202fdb7c9..5ec1a8d04fa41c6b36400fc0998af77592866150 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -211,7 +211,7 @@ if (NOT tensorflow_ENABLE_GPU)
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_gpu_srcs})
 else()
   file(GLOB tf_core_platform_srcs_exclude
-      "${tensorflow_source_dir}/tensorflow/core/platform/default/gpu_tracer.cc")
+      "${tensorflow_source_dir}/tensorflow/core/platform/default/device_tracer.cc")
   list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_srcs_exclude})
 endif()
 
@@ -301,6 +301,8 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.h"
     "${tensorflow_source_dir}/public/*.h"
 )
 
@@ -314,6 +316,7 @@ file(GLOB_RECURSE tf_core_framework_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*test*.cc"
 )
 
 list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index f978c8ccd5a454ca4a89de0ab5d757b566295c60..eb6bf567aa7dc2e87f3d5ce462a7680fc9850bbf 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -55,10 +55,6 @@ if(tensorflow_BUILD_CONTRIB_KERNELS)
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/sparse_column_iterable.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/utils/tensor_utils.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/common/partitioners/example_partitioner.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/bias-feature-column-handler.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/categorical-feature-column-handler.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/dense-quantized-feature-column-handler.cc"
-      "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/learner/stochastic/handlers/sparse-quantized-feature-column-handler.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/lib/trees/decision_tree.cc"
       "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/ops/model_ops.cc"
@@ -154,9 +150,6 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})
 if(WIN32)
   file(GLOB_RECURSE tf_core_kernels_windows_exclude_srcs
       # not working on windows yet
-      "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h"
-      "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc"
       "${tensorflow_source_dir}/tensorflow/core/kernels/neon/*"
       # not in core - those are loaded dynamically as dll
       "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc"
@@ -183,6 +176,7 @@ file(GLOB_RECURSE tf_core_gpu_kernels_srcs
     "${tensorflow_source_dir}/tensorflow/contrib/image/kernels/*.cu.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/*.cu.cc"
     "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/*.cu.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/resampler/kernels/*.cu.cc"
 )
 
 if(WIN32 AND tensorflow_ENABLE_GPU)
@@ -206,16 +200,16 @@ endif(WIN32 AND tensorflow_ENABLE_GPU)
 add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})
 add_dependencies(tf_core_kernels tf_core_cpu)
 
-if(WIN32)
+if (WIN32)
   target_compile_options(tf_core_kernels PRIVATE /MP)
-  if (tensorflow_ENABLE_GPU)
-    set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
-    set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
-    cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
-    set_target_properties(${tf_core_gpu_kernels_lib}
-                          PROPERTIES DEBUG_POSTFIX ""
-                          COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
-    )
-    add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
-  endif()
+endif (WIN32)
+if (tensorflow_ENABLE_GPU)
+  set_source_files_properties(${tf_core_gpu_kernels_srcs} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+  set(tf_core_gpu_kernels_lib tf_core_gpu_kernels)
+  cuda_add_library(${tf_core_gpu_kernels_lib} ${tf_core_gpu_kernels_srcs})
+  set_target_properties(${tf_core_gpu_kernels_lib}
+                        PROPERTIES DEBUG_POSTFIX ""
+                        COMPILE_FLAGS "${TF_REGULAR_CXX_FLAGS}"
+  )
+  add_dependencies(${tf_core_gpu_kernels_lib} tf_core_cpu)
 endif()
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index 4a61ed7a3548b1992ddc71acb8a7761e252296ea..e8c2cd347327843d10d13c1d24a800ff776aa8c1 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -92,6 +92,7 @@ GENERATE_CONTRIB_OP_LIBRARY(image_sirds "${tensorflow_source_dir}/tensorflow/con
 GENERATE_CONTRIB_OP_LIBRARY(layers_sparse_feature_cross "${tensorflow_source_dir}/tensorflow/contrib/layers/ops/sparse_feature_cross_op.cc")
 GENERATE_CONTRIB_OP_LIBRARY(memory_stats "${tensorflow_source_dir}/tensorflow/contrib/memory_stats/ops/memory_stats_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nccl "${tensorflow_source_dir}/tensorflow/contrib/nccl/ops/nccl_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(periodic_resample "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(nearest_neighbor "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/ops/nearest_neighbor_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(resampler "${tensorflow_source_dir}/tensorflow/contrib/resampler/ops/resampler_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(rnn_gru "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc")
diff --git a/tensorflow/contrib/cmake/tf_grappler.cmake b/tensorflow/contrib/cmake/tf_grappler.cmake
index a7841c98e83ec8c3eb91edfd9d639e169cb5f440..410490531a300c091afdd857d7f2d4e789a4c80e 100644
--- a/tensorflow/contrib/cmake/tf_grappler.cmake
+++ b/tensorflow/contrib/cmake/tf_grappler.cmake
@@ -23,7 +23,7 @@ file(GLOB tf_grappler_srcs
    "${tensorflow_source_dir}/tensorflow/python/grappler/model_analyzer.cc"
    "${tensorflow_source_dir}/tensorflow/python/grappler/model_analyzer.h"
  )
- 
+
 add_library(tf_grappler OBJECT ${tf_grappler_srcs})
 
 add_dependencies(tf_grappler tf_core_cpu)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_label_image_example.cmake b/tensorflow/contrib/cmake/tf_label_image_example.cmake
index 0d3a4699ebb102257e8a4a816652c90ffff42d92..7f2f60b0897f62d335416f4fcffd91c1e629cf28 100644
--- a/tensorflow/contrib/cmake/tf_label_image_example.cmake
+++ b/tensorflow/contrib/cmake/tf_label_image_example.cmake
@@ -34,3 +34,8 @@ target_link_libraries(tf_label_image_example PUBLIC
     ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
 )
+
+install(TARGETS tf_label_image_example
+        RUNTIME DESTINATION bin
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 277818b159062da4ba6efaacbe006da623c8619c..8db6929e31a1a5f5c793721f455a664bd6741b06 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -120,32 +120,34 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS HDRS ROOT_DIR)
   set(${HDRS} ${${HDRS}} PARENT_SCOPE)
 endfunction()
 
-file(GLOB_RECURSE tf_protos_python_srcs RELATIVE ${tensorflow_source_dir}
-    "${tensorflow_source_dir}/tensorflow/core/*.proto"
-    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/python/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/decision_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
-)
+FILE(READ python_protos.txt python_protos)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_protos "${python_protos}")
+STRING(REGEX REPLACE "\n" ";" python_protos "${python_protos}")
+
+foreach(python_proto ${python_protos})
+  file(GLOB_RECURSE tf_python_protos_src RELATIVE ${tensorflow_source_dir}
+      "${tensorflow_source_dir}/${python_proto}/*.proto"
+  )
+  list(APPEND tf_python_protos_srcs ${tf_python_protos_src})
+endforeach(python_proto)
+
 RELATIVE_PROTOBUF_GENERATE_PYTHON(
-    ${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_protos_python_srcs}
+    ${tensorflow_source_dir} PYTHON_PROTO_GENFILES ${tf_python_protos_srcs}
 )
 
-# NOTE(mrry): Avoid regenerating the tensorflow/core protos because this
-# can cause benign-but-failing-on-Windows-due-to-file-locking conflicts
-# when two rules attempt to generate the same file.
-file(GLOB_RECURSE tf_python_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
-    "${tensorflow_source_dir}/tensorflow/core/profiler/*.proto"
-    "${tensorflow_source_dir}/tensorflow/python/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/session_bundle/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/training/*.proto"
-)
+FILE(READ python_protos_cc.txt python_protos_cc)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_protos_cc "${python_protos_cc}")
+STRING(REGEX REPLACE "\n" ";" python_protos_cc "${python_protos_cc}")
+
+foreach(python_proto_cc ${python_protos_cc})
+  file(GLOB_RECURSE tf_python_protos_cc_src RELATIVE ${tensorflow_source_dir}
+      "${tensorflow_source_dir}/${python_proto_cc}/*.proto"
+  )
+  list(APPEND tf_python_protos_cc_srcs ${tf_python_protos_cc_src})
+endforeach(python_proto_cc)
+
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
     ${tensorflow_source_dir} ${tf_python_protos_cc_srcs}
 )
@@ -191,458 +193,28 @@ function(add_python_module MODULE_NAME)
     endif()
 endfunction()
 
-add_python_module("tensorflow")
-add_python_module("tensorflow/core")
-add_python_module("tensorflow/core/example")
-add_python_module("tensorflow/core/framework")
-add_python_module("tensorflow/core/lib")
-add_python_module("tensorflow/core/lib/core")
-add_python_module("tensorflow/core/protobuf")
-add_python_module("tensorflow/core/util")
-add_python_module("tensorflow/examples")
-add_python_module("tensorflow/examples/tutorials")
-add_python_module("tensorflow/examples/tutorials/mnist")
-add_python_module("tensorflow/python")
-add_python_module("tensorflow/python/client")
-add_python_module("tensorflow/python/data")
-add_python_module("tensorflow/python/data/ops")
-add_python_module("tensorflow/python/data/util")
-add_python_module("tensorflow/python/debug")
-add_python_module("tensorflow/python/debug/cli")
-add_python_module("tensorflow/python/debug/examples")
-add_python_module("tensorflow/python/debug/lib")
-add_python_module("tensorflow/python/debug/wrappers")
-add_python_module("tensorflow/python/eager")
-add_python_module("tensorflow/python/estimator")
-add_python_module("tensorflow/python/estimator/canned")
-add_python_module("tensorflow/python/estimator/export")
-add_python_module("tensorflow/python/estimator/inputs")
-add_python_module("tensorflow/python/estimator/inputs/queues")
-add_python_module("tensorflow/python/feature_column")
-add_python_module("tensorflow/python/framework")
-add_python_module("tensorflow/python/grappler")
-add_python_module("tensorflow/python/keras")
-add_python_module("tensorflow/python/keras/activations")
-add_python_module("tensorflow/python/keras/applications")
-add_python_module("tensorflow/python/keras/applications/inception_v3")
-add_python_module("tensorflow/python/keras/applications/mobilenet")
-add_python_module("tensorflow/python/keras/applications/resnet50")
-add_python_module("tensorflow/python/keras/applications/vgg16")
-add_python_module("tensorflow/python/keras/applications/vgg19")
-add_python_module("tensorflow/python/keras/applications/xception")
-add_python_module("tensorflow/python/keras/backend")
-add_python_module("tensorflow/python/keras/callbacks")
-add_python_module("tensorflow/python/keras/constraints")
-add_python_module("tensorflow/python/keras/datasets")
-add_python_module("tensorflow/python/keras/datasets/boston_housing")
-add_python_module("tensorflow/python/keras/datasets/cifar10")
-add_python_module("tensorflow/python/keras/datasets/cifar100")
-add_python_module("tensorflow/python/keras/datasets/imdb")
-add_python_module("tensorflow/python/keras/datasets/mnist")
-add_python_module("tensorflow/python/keras/datasets/reuters")
-add_python_module("tensorflow/python/keras/estimator")
-add_python_module("tensorflow/python/keras/initializers")
-add_python_module("tensorflow/python/keras/layers")
-add_python_module("tensorflow/python/keras/losses")
-add_python_module("tensorflow/python/keras/metrics")
-add_python_module("tensorflow/python/keras/models")
-add_python_module("tensorflow/python/keras/optimizers")
-add_python_module("tensorflow/python/keras/preprocessing")
-add_python_module("tensorflow/python/keras/preprocessing/image")
-add_python_module("tensorflow/python/keras/preprocessing/sequence")
-add_python_module("tensorflow/python/keras/preprocessing/text")
-add_python_module("tensorflow/python/keras/regularizers")
-add_python_module("tensorflow/python/keras/utils")
-add_python_module("tensorflow/python/keras/wrappers")
-add_python_module("tensorflow/python/keras/wrappers/scikit_learn")
-add_python_module("tensorflow/python/keras/_impl")
-add_python_module("tensorflow/python/keras/_impl/keras")
-add_python_module("tensorflow/python/keras/_impl/keras/applications")
-add_python_module("tensorflow/python/keras/_impl/keras/datasets")
-add_python_module("tensorflow/python/keras/_impl/keras/engine")
-add_python_module("tensorflow/python/keras/_impl/keras/layers")
-add_python_module("tensorflow/python/keras/_impl/keras/preprocessing")
-add_python_module("tensorflow/python/keras/_impl/keras/utils")
-add_python_module("tensorflow/python/keras/_impl/keras/wrappers")
-add_python_module("tensorflow/python/kernel_tests")
-add_python_module("tensorflow/python/kernel_tests/distributions")
-add_python_module("tensorflow/python/kernel_tests/linalg")
-add_python_module("tensorflow/python/layers")
-add_python_module("tensorflow/python/lib")
-add_python_module("tensorflow/python/lib/core")
-add_python_module("tensorflow/python/lib/io")
-add_python_module("tensorflow/python/ops")
-add_python_module("tensorflow/python/ops/distributions")
-add_python_module("tensorflow/python/ops/linalg")
-add_python_module("tensorflow/python/ops/losses")
-add_python_module("tensorflow/python/platform")
-add_python_module("tensorflow/python/platform/default")
-add_python_module("tensorflow/python/platform/summary")
-add_python_module("tensorflow/python/profiler/")
-add_python_module("tensorflow/python/profiler/internal")
-add_python_module("tensorflow/python/saved_model")
-add_python_module("tensorflow/python/summary")
-add_python_module("tensorflow/python/summary/writer")
-add_python_module("tensorflow/python/tools")
-add_python_module("tensorflow/python/training")
-add_python_module("tensorflow/python/user_ops")
-add_python_module("tensorflow/python/util")
-add_python_module("tensorflow/python/util/protobuf")
-add_python_module("tensorflow/tools")
-add_python_module("tensorflow/tools/graph_transforms")
-add_python_module("tensorflow/contrib")
-add_python_module("tensorflow/contrib/all_reduce")
-add_python_module("tensorflow/contrib/all_reduce/python")
-add_python_module("tensorflow/contrib/android")
-add_python_module("tensorflow/contrib/android/java")
-add_python_module("tensorflow/contrib/android/java/org")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow/contrib")
-add_python_module("tensorflow/contrib/android/java/org/tensorflow/contrib/android")
-add_python_module("tensorflow/contrib/android/jni")
-add_python_module("tensorflow/contrib/bayesflow")
-add_python_module("tensorflow/contrib/bayesflow/examples")
-add_python_module("tensorflow/contrib/bayesflow/examples/reinforce_simple")
-add_python_module("tensorflow/contrib/bayesflow/python")
-add_python_module("tensorflow/contrib/bayesflow/python/kernel_tests")
-add_python_module("tensorflow/contrib/bayesflow/python/ops")
-add_python_module("tensorflow/contrib/boosted_trees")
-add_python_module("tensorflow/contrib/boosted_trees/estimator_batch")
-add_python_module("tensorflow/contrib/boosted_trees/ops")
-add_python_module("tensorflow/contrib/boosted_trees/proto")
-add_python_module("tensorflow/contrib/boosted_trees/python")
-add_python_module("tensorflow/contrib/boosted_trees/python/kernel_tests")
-add_python_module("tensorflow/contrib/boosted_trees/python/ops")
-add_python_module("tensorflow/contrib/cloud")
-add_python_module("tensorflow/contrib/cloud/kernels")
-add_python_module("tensorflow/contrib/cloud/ops")
-add_python_module("tensorflow/contrib/cloud/python")
-add_python_module("tensorflow/contrib/cloud/python/ops")
-add_python_module("tensorflow/contrib/cluster_resolver")
-add_python_module("tensorflow/contrib/cluster_resolver/python")
-add_python_module("tensorflow/contrib/cluster_resolver/python/training")
-add_python_module("tensorflow/contrib/compiler")
-add_python_module("tensorflow/contrib/copy_graph")
-add_python_module("tensorflow/contrib/copy_graph/python")
-add_python_module("tensorflow/contrib/copy_graph/python/util")
-add_python_module("tensorflow/contrib/crf")
-add_python_module("tensorflow/contrib/crf/python")
-add_python_module("tensorflow/contrib/crf/python/kernel_tests")
-add_python_module("tensorflow/contrib/crf/python/ops")
-add_python_module("tensorflow/contrib/cudnn_rnn")
-add_python_module("tensorflow/contrib/cudnn_rnn/kernels")
-add_python_module("tensorflow/contrib/cudnn_rnn/ops")
-add_python_module("tensorflow/contrib/cudnn_rnn/python")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/cudnn_rnn/python/ops")
-add_python_module("tensorflow/contrib/data")
-add_python_module("tensorflow/contrib/data/python")
-add_python_module("tensorflow/contrib/data/python/kernel_tests")
-add_python_module("tensorflow/contrib/data/python/ops")
-add_python_module("tensorflow/contrib/decision_trees")
-add_python_module("tensorflow/contrib/decision_trees/proto")
-add_python_module("tensorflow/contrib/deprecated")
-add_python_module("tensorflow/contrib/distributions")
-add_python_module("tensorflow/contrib/distributions/python")
-add_python_module("tensorflow/contrib/distributions/python/kernel_tests")
-add_python_module("tensorflow/contrib/distributions/python/ops")
-add_python_module("tensorflow/contrib/distributions/python/ops/bijectors")
-add_python_module("tensorflow/contrib/eager")
-add_python_module("tensorflow/contrib/eager/python")
-add_python_module("tensorflow/contrib/estimator")
-add_python_module("tensorflow/contrib/estimator/python")
-add_python_module("tensorflow/contrib/estimator/python/estimator")
-add_python_module("tensorflow/contrib/factorization")
-add_python_module("tensorflow/contrib/factorization/examples")
-add_python_module("tensorflow/contrib/factorization/kernels")
-add_python_module("tensorflow/contrib/factorization/ops")
-add_python_module("tensorflow/contrib/factorization/python")
-add_python_module("tensorflow/contrib/factorization/python/kernel_tests")
-add_python_module("tensorflow/contrib/factorization/python/ops")
-add_python_module("tensorflow/contrib/ffmpeg")
-add_python_module("tensorflow/contrib/ffmpeg/default")
-add_python_module("tensorflow/contrib/ffmpeg/testdata")
-add_python_module("tensorflow/contrib/framework")
-add_python_module("tensorflow/contrib/framework/kernels")
-add_python_module("tensorflow/contrib/framework/ops")
-add_python_module("tensorflow/contrib/framework/python")
-add_python_module("tensorflow/contrib/framework/python/framework")
-add_python_module("tensorflow/contrib/framework/python/ops")
-add_python_module("tensorflow/contrib/gan")
-add_python_module("tensorflow/contrib/gan/python")
-add_python_module("tensorflow/contrib/gan/python/eval")
-add_python_module("tensorflow/contrib/gan/python/eval/python")
-add_python_module("tensorflow/contrib/gan/python/features")
-add_python_module("tensorflow/contrib/gan/python/features/python")
-add_python_module("tensorflow/contrib/gan/python/estimator")
-add_python_module("tensorflow/contrib/gan/python/estimator/python")
-add_python_module("tensorflow/contrib/gan/python/losses")
-add_python_module("tensorflow/contrib/gan/python/losses/python")
-add_python_module("tensorflow/contrib/graph_editor")
-add_python_module("tensorflow/contrib/graph_editor/examples")
-add_python_module("tensorflow/contrib/graph_editor/tests")
-add_python_module("tensorflow/contrib/grid_rnn")
-add_python_module("tensorflow/contrib/grid_rnn/python")
-add_python_module("tensorflow/contrib/grid_rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/grid_rnn/python/ops")
-add_python_module("tensorflow/contrib/hooks")
-add_python_module("tensorflow/contrib/image")
-add_python_module("tensorflow/contrib/image/ops")
-add_python_module("tensorflow/contrib/image/python")
-add_python_module("tensorflow/contrib/image/python/ops")
-add_python_module("tensorflow/contrib/input_pipeline")
-add_python_module("tensorflow/contrib/input_pipeline/ops")
-add_python_module("tensorflow/contrib/input_pipeline/python")
-add_python_module("tensorflow/contrib/input_pipeline/python/ops")
-add_python_module("tensorflow/contrib/integrate")
-add_python_module("tensorflow/contrib/integrate/python")
-add_python_module("tensorflow/contrib/integrate/python/ops")
-add_python_module("tensorflow/contrib/ios_examples")
-add_python_module("tensorflow/contrib/ios_examples/benchmark")
-add_python_module("tensorflow/contrib/ios_examples/benchmark/benchmark.xcodeproj")
-add_python_module("tensorflow/contrib/ios_examples/benchmark/data")
-add_python_module("tensorflow/contrib/ios_examples/camera")
-add_python_module("tensorflow/contrib/ios_examples/camera/camera_example.xcodeproj")
-add_python_module("tensorflow/contrib/ios_examples/camera/en.lproj")
-add_python_module("tensorflow/contrib/ios_examples/simple")
-add_python_module("tensorflow/contrib/ios_examples/simple/data")
-add_python_module("tensorflow/contrib/ios_examples/simple/tf_ios_makefile_example.xcodeproj")
-add_python_module("tensorflow/contrib/keras")
-add_python_module("tensorflow/contrib/keras/api")
-add_python_module("tensorflow/contrib/keras/api/keras")
-add_python_module("tensorflow/contrib/keras/api/keras/activations")
-add_python_module("tensorflow/contrib/keras/api/keras/applications")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/inception_v3")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/mobilenet")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/resnet50")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg16")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/vgg19")
-add_python_module("tensorflow/contrib/keras/api/keras/applications/xception")
-add_python_module("tensorflow/contrib/keras/api/keras/backend")
-add_python_module("tensorflow/contrib/keras/api/keras/callbacks")
-add_python_module("tensorflow/contrib/keras/api/keras/constraints")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/boston_housing")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar10")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/cifar100")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/imdb")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/mnist")
-add_python_module("tensorflow/contrib/keras/api/keras/datasets/reuters")
-add_python_module("tensorflow/contrib/keras/api/keras/initializers")
-add_python_module("tensorflow/contrib/keras/api/keras/layers")
-add_python_module("tensorflow/contrib/keras/api/keras/losses")
-add_python_module("tensorflow/contrib/keras/api/keras/metrics")
-add_python_module("tensorflow/contrib/keras/api/keras/models")
-add_python_module("tensorflow/contrib/keras/api/keras/optimizers")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/image")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/sequence")
-add_python_module("tensorflow/contrib/keras/api/keras/preprocessing/text")
-add_python_module("tensorflow/contrib/keras/api/keras/regularizers")
-add_python_module("tensorflow/contrib/keras/api/keras/utils")
-add_python_module("tensorflow/contrib/keras/api/keras/wrappers")
-add_python_module("tensorflow/contrib/keras/api/keras/wrappers/scikit_learn")
-add_python_module("tensorflow/contrib/keras/python")
-add_python_module("tensorflow/contrib/keras/python/keras")
-add_python_module("tensorflow/contrib/keras/python/keras/applications")
-add_python_module("tensorflow/contrib/keras/python/keras/datasets")
-add_python_module("tensorflow/contrib/keras/python/keras/engine")
-add_python_module("tensorflow/contrib/keras/python/keras/layers")
-add_python_module("tensorflow/contrib/keras/python/keras/preprocessing")
-add_python_module("tensorflow/contrib/keras/python/keras/utils")
-add_python_module("tensorflow/contrib/keras/python/keras/wrappers")
-add_python_module("tensorflow/contrib/kernel_methods")
-add_python_module("tensorflow/contrib/kernel_methods/python")
-add_python_module("tensorflow/contrib/kernel_methods/python/mappers")
-add_python_module("tensorflow/contrib/kfac")
-add_python_module("tensorflow/contrib/kfac/examples")
-add_python_module("tensorflow/contrib/kfac/python")
-add_python_module("tensorflow/contrib/kfac/python/ops")
-add_python_module("tensorflow/contrib/labeled_tensor")
-add_python_module("tensorflow/contrib/labeled_tensor/python")
-add_python_module("tensorflow/contrib/labeled_tensor/python/ops")
-add_python_module("tensorflow/contrib/layers")
-add_python_module("tensorflow/contrib/layers/kernels")
-add_python_module("tensorflow/contrib/layers/ops")
-add_python_module("tensorflow/contrib/layers/python")
-add_python_module("tensorflow/contrib/layers/python/kernel_tests")
-add_python_module("tensorflow/contrib/layers/python/layers")
-add_python_module("tensorflow/contrib/layers/python/ops")
-add_python_module("tensorflow/contrib/learn")
-add_python_module("tensorflow/contrib/learn/python")
-add_python_module("tensorflow/contrib/learn/python/learn")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe/queues")
-add_python_module("tensorflow/contrib/learn/python/learn/dataframe/transforms")
-add_python_module("tensorflow/contrib/learn/python/learn/datasets")
-add_python_module("tensorflow/contrib/learn/python/learn/datasets/data")
-add_python_module("tensorflow/contrib/learn/python/learn/estimators")
-add_python_module("tensorflow/contrib/learn/python/learn/learn_io")
-add_python_module("tensorflow/contrib/learn/python/learn/ops")
-add_python_module("tensorflow/contrib/learn/python/learn/preprocessing")
-add_python_module("tensorflow/contrib/learn/python/learn/preprocessing/tests")
-add_python_module("tensorflow/contrib/learn/python/learn/tests")
-add_python_module("tensorflow/contrib/learn/python/learn/tests/dataframe")
-add_python_module("tensorflow/contrib/learn/python/learn/utils")
-add_python_module("tensorflow/contrib/legacy_seq2seq")
-add_python_module("tensorflow/contrib/legacy_seq2seq/python")
-add_python_module("tensorflow/contrib/legacy_seq2seq/python/ops")
-add_python_module("tensorflow/contrib/linalg")
-add_python_module("tensorflow/contrib/linalg/python")
-add_python_module("tensorflow/contrib/linalg/python/ops")
-add_python_module("tensorflow/contrib/linalg/python/kernel_tests")
-add_python_module("tensorflow/contrib/linear_optimizer")
-add_python_module("tensorflow/contrib/linear_optimizer/kernels")
-add_python_module("tensorflow/contrib/linear_optimizer/kernels/g3doc")
-add_python_module("tensorflow/contrib/linear_optimizer/python")
-add_python_module("tensorflow/contrib/linear_optimizer/python/kernel_tests")
-add_python_module("tensorflow/contrib/linear_optimizer/python/ops")
-add_python_module("tensorflow/contrib/lookup")
-add_python_module("tensorflow/contrib/losses")
-add_python_module("tensorflow/contrib/losses/python")
-add_python_module("tensorflow/contrib/losses/python/losses")
-add_python_module("tensorflow/contrib/losses/python/metric_learning")
-add_python_module("tensorflow/contrib/makefile")
-add_python_module("tensorflow/contrib/makefile/test")
-add_python_module("tensorflow/contrib/memory_stats")
-add_python_module("tensorflow/contrib/memory_stats/kernels")
-add_python_module("tensorflow/contrib/memory_stats/ops")
-add_python_module("tensorflow/contrib/memory_stats/python")
-add_python_module("tensorflow/contrib/memory_stats/python/kernel_tests")
-add_python_module("tensorflow/contrib/memory_stats/python/ops")
-add_python_module("tensorflow/contrib/meta_graph_transform")
-add_python_module("tensorflow/contrib/metrics")
-add_python_module("tensorflow/contrib/metrics/kernels")
-add_python_module("tensorflow/contrib/metrics/ops")
-add_python_module("tensorflow/contrib/metrics/python")
-add_python_module("tensorflow/contrib/metrics/python/kernel_tests")
-add_python_module("tensorflow/contrib/metrics/python/metrics")
-add_python_module("tensorflow/contrib/metrics/python/ops")
-add_python_module("tensorflow/contrib/ndlstm")
-add_python_module("tensorflow/contrib/ndlstm/python")
-add_python_module("tensorflow/contrib/nn")
-add_python_module("tensorflow/contrib/nn/python")
-add_python_module("tensorflow/contrib/nn/python/ops")
-add_python_module("tensorflow/contrib/nccl")
-add_python_module("tensorflow/contrib/nccl/kernels")
-add_python_module("tensorflow/contrib/nccl/ops")
-add_python_module("tensorflow/contrib/nccl/python")
-add_python_module("tensorflow/contrib/nccl/python/ops")
-add_python_module("tensorflow/contrib/nearest_neighbor/kernels")
-add_python_module("tensorflow/contrib/nearest_neighbor/ops")
-add_python_module("tensorflow/contrib/nearest_neighbor/python")
-add_python_module("tensorflow/contrib/nearest_neighbor/python/kernel_tests")
-add_python_module("tensorflow/contrib/nearest_neighbor/python/ops")
-add_python_module("tensorflow/contrib/opt")
-add_python_module("tensorflow/contrib/opt/python")
-add_python_module("tensorflow/contrib/opt/python/training")
-add_python_module("tensorflow/contrib/pi_examples")
-add_python_module("tensorflow/contrib/pi_examples/camera")
-add_python_module("tensorflow/contrib/pi_examples/label_image")
-add_python_module("tensorflow/contrib/pi_examples/label_image/data")
-add_python_module("tensorflow/contrib/predictor")
-add_python_module("tensorflow/contrib/quantization")
-add_python_module("tensorflow/contrib/quantization/python")
-add_python_module("tensorflow/contrib/quantize")
-add_python_module("tensorflow/contrib/quantize/python")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python")
-add_python_module("tensorflow/contrib/remote_fused_graph/pylib/python/ops")
-add_python_module("tensorflow/contrib/resampler")
-add_python_module("tensorflow/contrib/resampler/kernels")
-add_python_module("tensorflow/contrib/resampler/ops")
-add_python_module("tensorflow/contrib/resampler/python")
-add_python_module("tensorflow/contrib/resampler/python/ops")
-add_python_module("tensorflow/contrib/rnn")
-add_python_module("tensorflow/contrib/rnn/kernels")
-add_python_module("tensorflow/contrib/rnn/ops")
-add_python_module("tensorflow/contrib/rnn/python")
-add_python_module("tensorflow/contrib/rnn/python/kernel_tests")
-add_python_module("tensorflow/contrib/rnn/python/ops")
-add_python_module("tensorflow/contrib/saved_model")
-add_python_module("tensorflow/contrib/saved_model/python")
-add_python_module("tensorflow/contrib/saved_model/python/saved_model")
-add_python_module("tensorflow/contrib/seq2seq")
-add_python_module("tensorflow/contrib/seq2seq/kernels")
-add_python_module("tensorflow/contrib/seq2seq/ops")
-add_python_module("tensorflow/contrib/seq2seq/python")
-add_python_module("tensorflow/contrib/seq2seq/python/kernel_tests")
-add_python_module("tensorflow/contrib/seq2seq/python/ops")
-add_python_module("tensorflow/contrib/session_bundle")
-add_python_module("tensorflow/contrib/session_bundle/example")
-add_python_module("tensorflow/contrib/session_bundle/testdata")
-add_python_module("tensorflow/contrib/signal")
-add_python_module("tensorflow/contrib/signal/python")
-add_python_module("tensorflow/contrib/signal/python/ops")
-add_python_module("tensorflow/contrib/slim")
-add_python_module("tensorflow/contrib/slim/python")
-add_python_module("tensorflow/contrib/slim/python/slim")
-add_python_module("tensorflow/contrib/slim/python/slim/data")
-add_python_module("tensorflow/contrib/slim/python/slim/nets")
-add_python_module("tensorflow/contrib/solvers")
-add_python_module("tensorflow/contrib/solvers/python")
-add_python_module("tensorflow/contrib/solvers/python/ops")
-add_python_module("tensorflow/contrib/sparsemax")
-add_python_module("tensorflow/contrib/sparsemax/python")
-add_python_module("tensorflow/contrib/sparsemax/python/ops")
-add_python_module("tensorflow/contrib/specs")
-add_python_module("tensorflow/contrib/specs/python")
-add_python_module("tensorflow/contrib/staging")
-add_python_module("tensorflow/contrib/stat_summarizer")
-add_python_module("tensorflow/contrib/stateless")
-add_python_module("tensorflow/contrib/tensorboard")
-add_python_module("tensorflow/contrib/tensorboard/plugins")
-add_python_module("tensorflow/contrib/tensorboard/plugins/projector")
-add_python_module("tensorflow/contrib/tensor_forest")
-add_python_module("tensorflow/contrib/tensor_forest/client")
-add_python_module("tensorflow/contrib/tensor_forest/core")
-add_python_module("tensorflow/contrib/tensor_forest/core/ops")
-add_python_module("tensorflow/contrib/tensor_forest/data")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/core")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/core/ops")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/ops")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/kernel_tests")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/layers")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/models")
-add_python_module("tensorflow/contrib/tensor_forest/hybrid/python/ops")
-add_python_module("tensorflow/contrib/tensor_forest/python")
-add_python_module("tensorflow/contrib/tensor_forest/python/kernel_tests")
-add_python_module("tensorflow/contrib/tensor_forest/python/ops")
-add_python_module("tensorflow/contrib/testing")
-add_python_module("tensorflow/contrib/testing/python")
-add_python_module("tensorflow/contrib/testing/python/framework")
-add_python_module("tensorflow/contrib/text")
-add_python_module("tensorflow/contrib/text/kernels")
-add_python_module("tensorflow/contrib/text/ops")
-add_python_module("tensorflow/contrib/text/python")
-add_python_module("tensorflow/contrib/text/python/ops")
-add_python_module("tensorflow/contrib/tfprof")
-add_python_module("tensorflow/contrib/timeseries")
-add_python_module("tensorflow/contrib/timeseries/examples")
-add_python_module("tensorflow/contrib/timeseries/examples/data")
-add_python_module("tensorflow/contrib/timeseries/python")
-add_python_module("tensorflow/contrib/timeseries/python/timeseries")
-add_python_module("tensorflow/contrib/timeseries/python/timeseries/state_space_models")
-add_python_module("tensorflow/contrib/tpu")
-add_python_module("tensorflow/contrib/tpu/ops")
-add_python_module("tensorflow/contrib/tpu/profiler")
-add_python_module("tensorflow/contrib/tpu/python")
-add_python_module("tensorflow/contrib/tpu/python/ops")
-add_python_module("tensorflow/contrib/tpu/python/profiler")
-add_python_module("tensorflow/contrib/tpu/python/tpu")
-add_python_module("tensorflow/contrib/training")
-add_python_module("tensorflow/contrib/training/python")
-add_python_module("tensorflow/contrib/training/python/training")
-add_python_module("tensorflow/contrib/util")
-add_python_module("tensorflow/contrib/reduce_slice_ops")
-add_python_module("tensorflow/contrib/reduce_slice_ops/kernels")
-add_python_module("tensorflow/contrib/reduce_slice_ops/ops")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python/kernel_tests")
-add_python_module("tensorflow/contrib/reduce_slice_ops/python/ops")
-add_python_module("tensorflow/contrib/summary")
+FILE(READ python_modules.txt python_modules)
+# Convert file contents into a CMake list (where each element in the list is one line of the file)
+STRING(REGEX REPLACE ";" "\\\\;" python_modules "${python_modules}")
+STRING(REGEX REPLACE "\n" ";" python_modules "${python_modules}")
+
+foreach(python_module ${python_modules})
+  add_python_module(${python_module})
+endforeach(python_module)
+
+add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite")
+add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python")
+add_custom_command(TARGET tf_python_touchup_modules PRE_BUILD
+    COMMAND ${CMAKE_COMMAND} -E touch
+    "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python/__init__.py")
+add_custom_command(
+    TARGET tf_python_copy_scripts_to_destination PRE_BUILD
+    COMMAND ${CMAKE_COMMAND} -E touch
+    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/lite/python/lite.py)
 
 # Generate the tensorflow.python.platform.build_info module.
 set(BUILD_INFO_PY "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/platform/build_info.py")
@@ -694,6 +266,9 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
       set(require_shape_fn 1)
     endif()
 
+    get_filename_component(GENERATE_PYTHON_OP_LIB_MKDIRPATH ${GENERATE_PYTHON_OP_LIB_DESTINATION} PATH)
+    file(MAKE_DIRECTORY ${GENERATE_PYTHON_OP_LIB_MKDIRPATH})
+
     # Create a C++ executable that links in the appropriate op
     # registrations and generates Python wrapper code based on the
     # registered ops.
@@ -714,7 +289,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
     # containing the wrappers.
     add_custom_command(
       OUTPUT ${GENERATE_PYTHON_OP_LIB_DESTINATION}
-      COMMAND ${tf_python_op_lib_name}_gen_python @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
+      COMMAND ${tf_python_op_lib_name}_gen_python ${tensorflow_source_dir}/tensorflow/core/api_def/base_api,${tensorflow_source_dir}/tensorflow/core/api_def/python_api @${tensorflow_source_dir}/tensorflow/python/ops/hidden_ops.txt ${require_shape_fn} > ${GENERATE_PYTHON_OP_LIB_DESTINATION}
       DEPENDS ${tf_python_op_lib_name}_gen_python
     )
 
@@ -722,6 +297,7 @@ function(GENERATE_PYTHON_OP_LIB tf_python_op_lib_name)
         ${GENERATE_PYTHON_OP_LIB_DESTINATION} PARENT_SCOPE)
 endfunction()
 
+GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
@@ -791,6 +367,9 @@ GENERATE_PYTHON_OP_LIB("contrib_memory_stats_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/memory_stats/ops/gen_memory_stats_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_nccl_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nccl/ops/gen_nccl_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_periodic_resample_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/gen_periodic_resample_op.py)
+
 GENERATE_PYTHON_OP_LIB("contrib_nearest_neighbor_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/nearest_neighbor/ops/gen_nearest_neighbor_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_resampler_ops"
@@ -863,6 +442,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.cc"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h"
     "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor.h"
@@ -873,6 +454,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/safe_ptr.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/safe_ptr.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/io/py_record_reader.h"
@@ -966,7 +549,7 @@ add_library(pywrap_tensorflow_internal SHARED
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
     $<TARGET_OBJECTS:tf_core_kernels>
-    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
     ${pywrap_tensorflow_deffile}
 )
@@ -989,6 +572,20 @@ target_link_libraries(pywrap_tensorflow_internal PRIVATE
 )
 
 if(WIN32)
+
+    # include contrib/periodic_resample as .so
+    #
+    set(tf_periodic_resample_srcs
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc"
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+       "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/ops/array_ops.cc"
+    )
+
+    AddUserOps(TARGET _periodic_resample_op
+        SOURCES "${tf_periodic_resample_srcs}"
+        DEPENDS pywrap_tensorflow_internal tf_python_ops
+        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/periodic_resample/python/ops/)
+
     # include contrib/nearest_neighbor as .so
     #
     set(tf_nearest_neighbor_srcs
@@ -1042,25 +639,23 @@ if(WIN32)
         DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/)
 endif(WIN32)
 
-if(WIN32)
-    # include contrib/seq2seq as .so
-    #
-    set(tf_beam_search_srcs
-        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
-        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
-        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
-    )
+# include contrib/seq2seq as .so
+#
+set(tf_beam_search_srcs
+    "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.cc"
+    "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
+    "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/ops/beam_search_ops.cc"
+)
 
-    set(tf_beam_search_gpu_srcs
-        "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
-    )
+set(tf_beam_search_gpu_srcs
+    "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc"
+)
 
-    AddUserOps(TARGET _beam_search_ops
-        SOURCES "${tf_beam_search_srcs}"
-        GPUSOURCES ${tf_beam_search_gpu_srcs}
-        DEPENDS pywrap_tensorflow_internal tf_python_ops
-        DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
-endif(WIN32)
+AddUserOps(TARGET _beam_search_ops
+    SOURCES "${tf_beam_search_srcs}"
+    GPUSOURCES ${tf_beam_search_gpu_srcs}
+    DEPENDS pywrap_tensorflow_internal tf_python_ops
+    DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 9bf45bab3041142206900bf96beeddefb3308ee4..571d2b0decb5e9afcec2314f9837546f0974e90d 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -45,7 +45,7 @@ if(WIN32)
       $<TARGET_FILE:tensorflow_static>
       $<TARGET_FILE:tf_protos_cc>
   )
-    
+
   set(tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/tensorflow.def")
   set_source_files_properties(${tensorflow_deffile} PROPERTIES GENERATED TRUE)
 
@@ -73,7 +73,7 @@ add_library(tensorflow SHARED
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<$<BOOL:${tensorflow_ENABLE_GRPC_SUPPORT}>:$<TARGET_OBJECTS:tf_core_distributed_runtime>>
     $<TARGET_OBJECTS:tf_core_kernels>
-    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
     ${tensorflow_deffile}
 )
@@ -94,3 +94,54 @@ endif()
 if(WIN32)
   add_dependencies(tensorflow tensorflow_static)
 endif(WIN32)
+
+target_include_directories(tensorflow PUBLIC 
+    $<INSTALL_INTERFACE:include/>
+    $<INSTALL_INTERFACE:include/external/nsync/public>)
+
+install(TARGETS tensorflow EXPORT tensorflow_export
+        RUNTIME DESTINATION bin
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
+        
+install(EXPORT tensorflow_export
+        FILE TensorflowConfig.cmake
+        DESTINATION lib/cmake)
+
+# install necessary headers
+# tensorflow headers
+install(DIRECTORY ${tensorflow_source_dir}/tensorflow/cc/
+        DESTINATION include/tensorflow/cc
+        FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/cc/
+        DESTINATION include/tensorflow/cc
+        FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${tensorflow_source_dir}/tensorflow/core/
+        DESTINATION include/tensorflow/core
+        FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tensorflow/core/
+        DESTINATION include/tensorflow/core
+        FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY ${tensorflow_source_dir}/tensorflow/stream_executor/
+        DESTINATION include/tensorflow/stream_executor
+        FILES_MATCHING PATTERN "*.h")
+# google protobuf headers
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/protobuf/src/protobuf/src/google/
+        DESTINATION include/google
+        FILES_MATCHING PATTERN "*.h")
+# nsync headers
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/
+        DESTINATION include/external/nsync
+        FILES_MATCHING PATTERN "*.h")
+# Eigen directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/Eigen/
+        DESTINATION include/Eigen)
+# external directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external/eigen_archive/
+        DESTINATION include/external/eigen_archive)
+# third_party eigen directory
+install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
+        DESTINATION include/third_party/eigen3)
+# unsupported Eigen directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
+        DESTINATION include/unsupported/Eigen)
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 3d84f1ebb9c1fa1b2f3ccdd8d5ae8eaf182f7715..91ca33f4c4d5f6c822f45b0676e6e46d2e4c2860 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -61,19 +61,22 @@ file(GLOB tf_stream_executor_srcs
     "${tensorflow_source_dir}/tensorflow/stream_executor/platform/default/*.h"
 )
 
-if (tensorflow_ENABLE_GPU)    
+if (tensorflow_ENABLE_GPU)
     file(GLOB tf_stream_executor_gpu_srcs
         "${tensorflow_source_dir}/tensorflow/stream_executor/cuda/*.cc"
     )
     list(APPEND tf_stream_executor_srcs ${tf_stream_executor_gpu_srcs})
-endif()    
+endif()
 
 #file(GLOB_RECURSE tf_stream_executor_test_srcs
 #    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.cc"
 #    "${tensorflow_source_dir}/tensorflow/stream_executor/*_test.h"
 #)
-#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs}) 
+#list(REMOVE_ITEM tf_stream_executor_srcs ${tf_stream_executor_test_srcs})
 
+if (NOT WIN32)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgomp")
+endif (NOT WIN32)
 add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})
 
 add_dependencies(tf_stream_executor
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 77d21249148cc900a1bb4fc2742956aee47734de..94ca4b00175dffb4461fca34c5ecd79ba79be778 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -139,12 +139,15 @@ if (tensorflow_BUILD_PYTHON_TESTS)
 
   file(GLOB_RECURSE tf_test_src_py
     ${tf_test_rnn_src_py}
+    "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/*.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/estimator/python/estimator/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/*.py"
     "${tensorflow_source_dir}/tensorflow/python/meta_graph_transform/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/ops/quantized_conv_ops_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/ops/quantized_ops_test.py"
     "${tensorflow_source_dir}/tensorflow/python/platform/build_info_test.py"
     "${tensorflow_source_dir}/tensorflow/python/profiler/*_test.py"
     "${tensorflow_source_dir}/tensorflow/python/profiler/internal/*_test.py"
@@ -153,7 +156,8 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/data/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/image/*_test.py"
-    "${tensorflow_source_dir}/tensorflow/contrib/keras/python/keras/integration_test.py"
+    "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/*_test.py"
+    "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/nearest_neighbor/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/seq2seq/python/kernel_tests/*_test.py"
     "${tensorflow_source_dir}/tensorflow/contrib/stateless/python/kernel_tests/*_test.py"
@@ -171,7 +175,6 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/contrib/graph_editor/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/bayesflow/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/framework/*_test.py"
-      "${tensorflow_source_dir}/tensorflow/contrib/keras/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/distributions/*_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/learn/*_test.py"
     )
@@ -179,17 +182,10 @@ if (tensorflow_BUILD_PYTHON_TESTS)
 
   # exclude the ones we don't want
   set(tf_test_src_py_exclude
-    # generally excluded
+    # Not a test.
     "${tensorflow_source_dir}/tensorflow/python/kernel_tests/__init__.py"
-
-    # Python source line inspection tests are flaky on Windows (b/36375074).
-    "${tensorflow_source_dir}/tensorflow/python/debug/cli/analyzer_cli_test.py"
-    "${tensorflow_source_dir}/tensorflow/python/debug/cli/profile_analyzer_cli_test.py"
-    # Windows does not have the curses library and uses readline.
-    "${tensorflow_source_dir}/tensorflow/python/debug/cli/curses_ui_test.py"
-    # TFDBG grpc:// mode is not yet available on Windows.
-    "${tensorflow_source_dir}/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py"
-    "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
+    # Flaky because of port collisions.
+    "${tensorflow_source_dir}/tensorflow/python/training/localhost_cluster_performance_test.py"
     # generally not working
     "${tensorflow_source_dir}/tensorflow/python/profiler/pprof_profiler_test.py"
     # flaky test
@@ -216,7 +212,15 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # TODO: failing tests.
       # Nothing critical in here but should get this list down to []
       # The failing list is grouped by failure source
-
+      # Python source line inspection tests are flaky on Windows (b/36375074).
+      "${tensorflow_source_dir}/tensorflow/python/debug/cli/analyzer_cli_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/debug/cli/profile_analyzer_cli_test.py"
+      # Windows does not have the curses library and uses readline.
+      "${tensorflow_source_dir}/tensorflow/python/debug/cli/curses_ui_test.py"
+      # TFDBG grpc:// mode is not yet available on Windows.
+      "${tensorflow_source_dir}/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/debug/lib/source_remote_test.py"
       # stl on windows handles overflows different
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/as_string_op_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/string_to_number_op_test.py"
@@ -225,6 +229,10 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # Numerical issues, calculations off.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/concat_op_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/wals_test.py"
+      "${tensorflow_source_dir}/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/backend_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py"
       # Float division by zero
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/benchmark_test.py"
       # Flaky, for unknown reasons. Cannot reproduce in terminal. Revisit once we can get stack traces.
@@ -233,11 +241,11 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/training/sync_replicas_optimizer_test.py"
       "${tensorflow_source_dir}/tensorflow/python/debug/lib/session_debug_grpc_test.py"
       "${tensorflow_source_dir}tensorflow/python/training/localhost_cluster_performance_test.py"
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"
       # Type error in testRemoteIteratorUsingRemoteCallOpDirectSessionGPUCPU.
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_test.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_test.py"
       # IteratorGetMax OutOfRangeError
@@ -261,9 +269,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/linalg_grad_test.py"  # cudaSolver handle creation fails.
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"  # depends on python/framework/test_ops
       # Dataset tests
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on windows
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on windows
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py"  # Segfaults on Windows.
-      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       # Broken tensorboard test due to cmake issues.
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/iterator_ops_cluster_test.py"  # Needs portpicker
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/sloppy_transformation_dataset_op_test.py"  # b/65430561
@@ -294,6 +302,9 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       # Test should only be run manually
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/reduction_ops_test_big.py"
       "${tensorflow_source_dir}/tensorflow/python/kernel_tests/svd_op_test.py"
+      # Depends on python/framework/test_ops
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/array_ops_test.py"
+      "${tensorflow_source_dir}/tensorflow/python/kernel_tests/control_flow_util_test.py"
   )
   endif()
   list(REMOVE_ITEM tf_test_src_py ${tf_test_src_py_exclude})
diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake
index 6ef95989630a39eaedaddda68f7da709e7d9ab03..cb58a2e7df85b2f214654eff5547c5788592f208 100644
--- a/tensorflow/contrib/cmake/tf_tools.cmake
+++ b/tensorflow/contrib/cmake/tf_tools.cmake
@@ -73,7 +73,7 @@ add_executable(${transform_graph}
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<TARGET_OBJECTS:tf_core_kernels>
-    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
@@ -95,7 +95,7 @@ add_executable(${summarize_graph}
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<TARGET_OBJECTS:tf_core_kernels>
-    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
@@ -117,7 +117,7 @@ add_executable(${compare_graphs}
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_tools_transform_graph_lib>
     $<TARGET_OBJECTS:tf_core_kernels>
-    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
@@ -138,7 +138,7 @@ add_executable(${benchmark_model}
     $<TARGET_OBJECTS:tf_core_ops>
     $<TARGET_OBJECTS:tf_core_direct_session>
     $<TARGET_OBJECTS:tf_core_kernels>
-    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>
+    $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<$<BOOL:${BOOL_WIN32}>:$<TARGET_OBJECTS:tf_core_kernels_cpu_only>>>
     $<$<BOOL:${tensorflow_ENABLE_GPU}>:$<TARGET_OBJECTS:tf_stream_executor>>
 )
 
@@ -147,3 +147,8 @@ target_link_libraries(${benchmark_model} PUBLIC
   ${tf_core_gpu_kernels_lib}
   ${tensorflow_EXTERNAL_LIBRARIES}
 )
+
+install(TARGETS ${transform_graph} ${summarize_graph} ${compare_graphs} ${benchmark_model}
+        RUNTIME DESTINATION bin
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
diff --git a/tensorflow/contrib/cmake/tf_tutorials.cmake b/tensorflow/contrib/cmake/tf_tutorials.cmake
index 858e7dda92e9e9f456d5fc56b563b2e3ec998520..e63fccc1810b348e543159681a73e7a9c1422c01 100644
--- a/tensorflow/contrib/cmake/tf_tutorials.cmake
+++ b/tensorflow/contrib/cmake/tf_tutorials.cmake
@@ -34,3 +34,8 @@ target_link_libraries(tf_tutorials_example_trainer PUBLIC
     ${tf_core_gpu_kernels_lib}
     ${tensorflow_EXTERNAL_LIBRARIES}
 )
+
+install(TARGETS tf_tutorials_example_trainer
+        RUNTIME DESTINATION bin
+        LIBRARY DESTINATION lib
+        ARCHIVE DESTINATION lib)
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 8c2528f548799f9facef740b0134ac56966b2b04..bae66ffd4289308f2cbfc730ec50d057b13923fb 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -19,7 +19,7 @@ from one graph to another. The copied elements are initialized inside a
 user-specified scope in the other graph. There are separate functions to
 copy ops and variables.
 There is also a function to retrive the copied version of an op from the
-first graph inside a scope in the second graph. 
+first graph inside a scope in the second graph.
 
 @@copy_op_to_graph
 @@copy_variable_to_graph
@@ -225,7 +225,7 @@ def copy_op_to_graph(org_instance, to_graph, variables,
                            new_original_op,
                            op_def)
     #Use Graph's hidden methods to add the op
-    to_graph._add_op(new_op)
+    to_graph._add_op(new_op)  # pylint: disable=protected-access
     to_graph._record_op_seen_by_control_dependencies(new_op)
     for device_function in reversed(to_graph._device_function_stack):
       new_op._set_device(device_function(new_op))
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 9174c5eb989908d5a318e228bf231686b5117798..b47fb426a193e0fcc075deafae3eaab698f18ec9 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -23,7 +23,6 @@ import itertools
 import numpy as np
 
 from tensorflow.contrib.crf.python.ops import crf
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -33,43 +32,58 @@ from tensorflow.python.platform import test
 class CrfTest(test.TestCase):
 
   def testCrfSequenceScore(self):
-    inputs = np.array(
-        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
-    tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
-    sequence_lengths = np.array(3, dtype=np.int32)
-    with self.test_session() as sess:
-      sequence_score = crf.crf_sequence_score(
-          inputs=array_ops.expand_dims(inputs, 0),
-          tag_indices=array_ops.expand_dims(tag_indices, 0),
-          sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-          transition_params=constant_op.constant(transition_params))
-      sequence_score = array_ops.squeeze(sequence_score, [0])
-      tf_sequence_score = sess.run(sequence_score)
-      expected_unary_score = sum(inputs[i][tag_indices[i]]
-                                 for i in range(sequence_lengths))
-      expected_binary_score = sum(
-          transition_params[tag_indices[i], tag_indices[i + 1]]
-          for i in range(sequence_lengths - 1))
-      expected_sequence_score = expected_unary_score + expected_binary_score
-      self.assertAllClose(tf_sequence_score, expected_sequence_score)
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[4, 5, -3]],
+                 dtype=np.float32),
+    ]
+    tag_indices_list = [
+        np.array([1, 2, 1, 0], dtype=np.int32),
+        np.array([1], dtype=np.int32)
+    ]
+    for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list,
+                                                     inputs_list,
+                                                     tag_indices_list):
+      with self.test_session() as sess:
+        sequence_score = crf.crf_sequence_score(
+            inputs=array_ops.expand_dims(inputs, 0),
+            tag_indices=array_ops.expand_dims(tag_indices, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            transition_params=constant_op.constant(transition_params))
+        sequence_score = array_ops.squeeze(sequence_score, [0])
+        tf_sequence_score = sess.run(sequence_score)
+        expected_unary_score = sum(inputs[i][tag_indices[i]]
+                                   for i in range(sequence_lengths))
+        expected_binary_score = sum(
+            transition_params[tag_indices[i], tag_indices[i + 1]]
+            for i in range(sequence_lengths - 1))
+        expected_sequence_score = expected_unary_score + expected_binary_score
+        self.assertAllClose(tf_sequence_score, expected_sequence_score)
 
   def testCrfUnaryScore(self):
     inputs = np.array(
         [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
-    tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
-    sequence_lengths = np.array(3, dtype=np.int32)
-    with self.test_session() as sess:
-      unary_score = crf.crf_unary_score(
-          tag_indices=array_ops.expand_dims(tag_indices, 0),
-          sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-          inputs=array_ops.expand_dims(inputs, 0))
-      unary_score = array_ops.squeeze(unary_score, [0])
-      tf_unary_score = sess.run(unary_score)
-      expected_unary_score = sum(inputs[i][tag_indices[i]]
-                                 for i in range(sequence_lengths))
-      self.assertAllClose(tf_unary_score, expected_unary_score)
+    for dtype in (np.int32, np.int64):
+      tag_indices = np.array([1, 2, 1, 0], dtype=dtype)
+      sequence_lengths = np.array(3, dtype=np.int32)
+      with self.test_session() as sess:
+        unary_score = crf.crf_unary_score(
+            tag_indices=array_ops.expand_dims(tag_indices, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            inputs=array_ops.expand_dims(inputs, 0))
+        unary_score = array_ops.squeeze(unary_score, [0])
+        tf_unary_score = sess.run(unary_score)
+        expected_unary_score = sum(inputs[i][tag_indices[i]]
+                                   for i in range(sequence_lengths))
+        self.assertAllClose(tf_unary_score, expected_unary_score)
 
   def testCrfBinaryScore(self):
     tag_indices = np.array([1, 2, 1, 0], dtype=np.int32)
@@ -89,38 +103,54 @@ class CrfTest(test.TestCase):
       self.assertAllClose(tf_binary_score, expected_binary_score)
 
   def testCrfLogNorm(self):
-    inputs = np.array(
-        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
-    num_words = inputs.shape[0]
-    num_tags = inputs.shape[1]
-    sequence_lengths = np.array(3, dtype=np.int32)
-    with self.test_session() as sess:
-      all_sequence_scores = []
-
-      # Compare the dynamic program with brute force computation.
-      for tag_indices in itertools.product(
-          range(num_tags), repeat=sequence_lengths):
-        tag_indices = list(tag_indices)
-        tag_indices.extend([0] * (num_words - sequence_lengths))
-        all_sequence_scores.append(
-            crf.crf_sequence_score(
-                inputs=array_ops.expand_dims(inputs, 0),
-                tag_indices=array_ops.expand_dims(tag_indices, 0),
-                sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-                transition_params=constant_op.constant(transition_params)))
-
-      brute_force_log_norm = math_ops.reduce_logsumexp(all_sequence_scores)
-      log_norm = crf.crf_log_norm(
-          inputs=array_ops.expand_dims(inputs, 0),
-          sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-          transition_params=constant_op.constant(transition_params))
-      log_norm = array_ops.squeeze(log_norm, [0])
-      tf_brute_force_log_norm, tf_log_norm = sess.run(
-          [brute_force_log_norm, log_norm])
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[3, -1, 3]],
+                 dtype=np.float32),
+    ]
+    tag_indices_list = [
+        np.array([1, 2, 1, 0], dtype=np.int32),
+        np.array([2], dtype=np.int32)
+    ]
+
+    for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list,
+                                                     inputs_list,
+                                                     tag_indices_list):
+      num_words = inputs.shape[0]
+      num_tags = inputs.shape[1]
+      with self.test_session() as sess:
+        all_sequence_scores = []
+
+        # Compare the dynamic program with brute force computation.
+        for tag_indices in itertools.product(
+            range(num_tags), repeat=sequence_lengths):
+          tag_indices = list(tag_indices)
+          tag_indices.extend([0] * (num_words - sequence_lengths))
+          all_sequence_scores.append(
+              crf.crf_sequence_score(
+                  inputs=array_ops.expand_dims(inputs, 0),
+                  tag_indices=array_ops.expand_dims(tag_indices, 0),
+                  sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+                  transition_params=constant_op.constant(transition_params)))
+
+        brute_force_log_norm = math_ops.reduce_logsumexp(all_sequence_scores)
+        log_norm = crf.crf_log_norm(
+            inputs=array_ops.expand_dims(inputs, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            transition_params=constant_op.constant(transition_params))
+        log_norm = array_ops.squeeze(log_norm, [0])
+        tf_brute_force_log_norm, tf_log_norm = sess.run(
+            [brute_force_log_norm, log_norm])
 
-      self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)
+        self.assertAllClose(tf_log_norm, tf_brute_force_log_norm)
 
   def testCrfLogLikelihood(self):
     inputs = np.array(
@@ -201,50 +231,66 @@ class CrfTest(test.TestCase):
                        expected_max_sequence[:sequence_lengths])
 
   def testCrfDecode(self):
-    inputs = np.array(
-        [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
-    sequence_lengths = np.array(3, dtype=np.int32)
-    num_words = inputs.shape[0]
-    num_tags = inputs.shape[1]
-
-    with self.test_session() as sess:
-      all_sequence_scores = []
-      all_sequences = []
-
-      # Compare the dynamic program with brute force computation.
-      for tag_indices in itertools.product(
-          range(num_tags), repeat=sequence_lengths):
-        tag_indices = list(tag_indices)
-        tag_indices.extend([0] * (num_words - sequence_lengths))
-        all_sequences.append(tag_indices)
-        sequence_score = crf.crf_sequence_score(
-            inputs=array_ops.expand_dims(inputs, 0),
-            tag_indices=array_ops.expand_dims(tag_indices, 0),
-            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
-            transition_params=constant_op.constant(transition_params))
-        sequence_score = array_ops.squeeze(sequence_score, [0])
-        all_sequence_scores.append(sequence_score)
-
-      tf_all_sequence_scores = sess.run(all_sequence_scores)
-
-      expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
-      expected_max_sequence = all_sequences[expected_max_sequence_index]
-      expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
-
-      actual_max_sequence, actual_max_score = crf.crf_decode(
-          array_ops.expand_dims(inputs, 0),
-          constant_op.constant(transition_params),
-          array_ops.expand_dims(sequence_lengths, 0))
-      actual_max_sequence = array_ops.squeeze(actual_max_sequence, [0])
-      actual_max_score = array_ops.squeeze(actual_max_score, [0])
-      tf_actual_max_sequence, tf_actual_max_score = sess.run(
-          [actual_max_sequence, actual_max_score])
-
-      self.assertAllClose(tf_actual_max_score, expected_max_score)
-      self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
-                       expected_max_sequence[:sequence_lengths])
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[-1, 2, 1]],
+                 dtype=np.float32),
+    ]
+    tag_indices_list = [
+        np.array([1, 2, 1, 0], dtype=np.int32),
+        np.array([2], dtype=np.int32)
+    ]
+
+    for sequence_lengths, inputs, tag_indices in zip(sequence_lengths_list,
+                                                     inputs_list,
+                                                     tag_indices_list):
+      num_words = inputs.shape[0]
+      num_tags = inputs.shape[1]
+
+      with self.test_session() as sess:
+        all_sequence_scores = []
+        all_sequences = []
+
+        # Compare the dynamic program with brute force computation.
+        for tag_indices in itertools.product(
+            range(num_tags), repeat=sequence_lengths):
+          tag_indices = list(tag_indices)
+          tag_indices.extend([0] * (num_words - sequence_lengths))
+          all_sequences.append(tag_indices)
+          sequence_score = crf.crf_sequence_score(
+              inputs=array_ops.expand_dims(inputs, 0),
+              tag_indices=array_ops.expand_dims(tag_indices, 0),
+              sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+              transition_params=constant_op.constant(transition_params))
+          sequence_score = array_ops.squeeze(sequence_score, [0])
+          all_sequence_scores.append(sequence_score)
+
+        tf_all_sequence_scores = sess.run(all_sequence_scores)
+
+        expected_max_sequence_index = np.argmax(tf_all_sequence_scores)
+        expected_max_sequence = all_sequences[expected_max_sequence_index]
+        expected_max_score = tf_all_sequence_scores[expected_max_sequence_index]
+
+        actual_max_sequence, actual_max_score = crf.crf_decode(
+            array_ops.expand_dims(inputs, 0),
+            constant_op.constant(transition_params),
+            array_ops.expand_dims(sequence_lengths, 0))
+        actual_max_sequence = array_ops.squeeze(actual_max_sequence, [0])
+        actual_max_score = array_ops.squeeze(actual_max_score, [0])
+        tf_actual_max_sequence, tf_actual_max_score = sess.run(
+            [actual_max_sequence, actual_max_score])
+
+        self.assertAllClose(tf_actual_max_score, expected_max_score)
+        self.assertEqual(list(tf_actual_max_sequence[:sequence_lengths]),
+                         expected_max_sequence[:sequence_lengths])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 7166e38b28365a6dbce9cf134f81b08a57c722de..7f5ae937b26f465076c6976429697c35924432e5 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -53,7 +53,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
@@ -101,12 +103,29 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
   Returns:
     sequence_scores: A [batch_size] vector of unnormalized sequence scores.
   """
-  # Compute the scores of the given tag sequence.
-  unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs)
-  binary_scores = crf_binary_score(tag_indices, sequence_lengths,
-                                   transition_params)
-  sequence_scores = unary_scores + binary_scores
-  return sequence_scores
+  # If max_seq_len is 1, we skip the score calculation and simply gather the
+  # unary potentials of the single tag.
+  def _single_seq_fn():
+    batch_size = array_ops.shape(inputs, out_type=tag_indices.dtype)[0]
+    example_inds = array_ops.reshape(
+        math_ops.range(batch_size, dtype=tag_indices.dtype), [-1, 1])
+    return array_ops.gather_nd(
+        array_ops.squeeze(inputs, [1]),
+        array_ops.concat([example_inds, tag_indices], axis=1))
+
+  def _multi_seq_fn():
+    # Compute the scores of the given tag sequence.
+    unary_scores = crf_unary_score(tag_indices, sequence_lengths, inputs)
+    binary_scores = crf_binary_score(tag_indices, sequence_lengths,
+                                     transition_params)
+    sequence_scores = unary_scores + binary_scores
+    return sequence_scores
+
+  return utils.smart_cond(
+      pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
+                          1),
+      fn1=_single_seq_fn,
+      fn2=_multi_seq_fn)
 
 
 def crf_log_norm(inputs, sequence_lengths, transition_params):
@@ -124,19 +143,32 @@ def crf_log_norm(inputs, sequence_lengths, transition_params):
   # algorithm.
   first_input = array_ops.slice(inputs, [0, 0, 0], [-1, 1, -1])
   first_input = array_ops.squeeze(first_input, [1])
-  rest_of_input = array_ops.slice(inputs, [0, 1, 0], [-1, -1, -1])
 
-  # Compute the alpha values in the forward algorithm in order to get the
-  # partition function.
-  forward_cell = CrfForwardRnnCell(transition_params)
-  _, alphas = rnn.dynamic_rnn(
-      cell=forward_cell,
-      inputs=rest_of_input,
-      sequence_length=sequence_lengths - 1,
-      initial_state=first_input,
-      dtype=dtypes.float32)
-  log_norm = math_ops.reduce_logsumexp(alphas, [1])
-  return log_norm
+  # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over
+  # the "initial state" (the unary potentials).
+  def _single_seq_fn():
+    return math_ops.reduce_logsumexp(first_input, [1])
+
+  def _multi_seq_fn():
+    """Forward computation of alpha values."""
+    rest_of_input = array_ops.slice(inputs, [0, 1, 0], [-1, -1, -1])
+
+    # Compute the alpha values in the forward algorithm in order to get the
+    # partition function.
+    forward_cell = CrfForwardRnnCell(transition_params)
+    _, alphas = rnn.dynamic_rnn(
+        cell=forward_cell,
+        inputs=rest_of_input,
+        sequence_length=sequence_lengths - 1,
+        initial_state=first_input,
+        dtype=dtypes.float32)
+    log_norm = math_ops.reduce_logsumexp(alphas, [1])
+    return log_norm
+
+  max_seq_len = array_ops.shape(inputs)[1]
+  return control_flow_ops.cond(pred=math_ops.equal(max_seq_len, 1),
+                               true_fn=_single_seq_fn,
+                               false_fn=_multi_seq_fn)
 
 
 def crf_log_likelihood(inputs,
@@ -193,6 +225,9 @@ def crf_unary_score(tag_indices, sequence_lengths, inputs):
   offsets = array_ops.expand_dims(
       math_ops.range(batch_size) * max_seq_len * num_tags, 1)
   offsets += array_ops.expand_dims(math_ops.range(max_seq_len) * num_tags, 0)
+  # Use int32 or int64 based on tag_indices' dtype.
+  if tag_indices.dtype == dtypes.int64:
+    offsets = math_ops.to_int64(offsets)
   flattened_tag_indices = array_ops.reshape(offsets + tag_indices, [-1])
 
   unary_scores = array_ops.reshape(
@@ -305,7 +340,7 @@ def viterbi_decode(score, transition_params):
 
   Returns:
     viterbi: A [seq_len] list of integers containing the highest scoring tag
-        indicies.
+        indices.
     viterbi_score: A float containing the score for the Viterbi sequence.
   """
   trellis = np.zeros_like(score)
@@ -360,8 +395,8 @@ class CrfDecodeForwardRnnCell(rnn_cell.RNNCell):
       scope: Unused variable scope of this cell.
 
     Returns:
-      backpointers: [batch_size, num_tags], containing backpointers.
-      new_state: [batch_size, num_tags], containing new score values.
+      backpointers: A [batch_size, num_tags] matrix of backpointers.
+      new_state: A [batch_size, num_tags] matrix of new score values.
     """
     # For simplicity, in shape comments, denote:
     # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
@@ -385,7 +420,7 @@ class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
     """Initialize the CrfDecodeBackwardRnnCell.
 
     Args:
-      num_tags
+      num_tags: An integer. The number of tags.
     """
     self._num_tags = num_tags
 
@@ -401,8 +436,9 @@ class CrfDecodeBackwardRnnCell(rnn_cell.RNNCell):
     """Build the CrfDecodeBackwardRnnCell.
 
     Args:
-      inputs: [batch_size, num_tags], backpointer of next step (in time order).
-      state: [batch_size, 1], next position's tag index.
+      inputs: A [batch_size, num_tags] matrix of
+            backpointer of next step (in time order).
+      state: A [batch_size, 1] matrix of tag index of next step.
       scope: Unused variable scope of this cell.
 
     Returns:
@@ -426,52 +462,71 @@ def crf_decode(potentials, transition_params, sequence_length):
   This is a function for tensor.
 
   Args:
-    potentials: A [batch_size, max_seq_len, num_tags] tensor, matrix of
+    potentials: A [batch_size, max_seq_len, num_tags] tensor of
               unary potentials.
-    transition_params: A [num_tags, num_tags] tensor, matrix of
+    transition_params: A [num_tags, num_tags] matrix of
               binary potentials.
-    sequence_length: A [batch_size] tensor, containing sequence lengths.
+    sequence_length: A [batch_size] vector of true sequence lengths.
 
   Returns:
-    decode_tags: A [batch_size, max_seq_len] tensor, with dtype tf.int32.
-                Contains the highest scoring tag indicies.
-    best_score: A [batch_size] tensor, containing the score of decode_tags.
+    decode_tags: A [batch_size, max_seq_len] matrix, with dtype `tf.int32`.
+                Contains the highest scoring tag indices.
+    best_score: A [batch_size] vector, containing the score of `decode_tags`.
   """
-  # For simplicity, in shape comments, denote:
-  # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
-  num_tags = potentials.get_shape()[2].value
-
-  # Computes forward decoding. Get last score and backpointers.
-  crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
-  initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
-  initial_state = array_ops.squeeze(initial_state, axis=[1])      # [B, O]
-  inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])   # [B, T-1, O]
-  backpointers, last_score = rnn.dynamic_rnn(
-      crf_fwd_cell,
-      inputs=inputs,
-      sequence_length=sequence_length - 1,
-      initial_state=initial_state,
-      time_major=False,
-      dtype=dtypes.int32)             # [B, T - 1, O], [B, O]
-  backpointers = gen_array_ops.reverse_sequence(
-      backpointers, sequence_length - 1, seq_dim=1)               # [B, T-1, O]
-
-  # Computes backward decoding. Extract tag indices from backpointers.
-  crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
-  initial_state = math_ops.cast(math_ops.argmax(last_score, axis=1),
-                                dtype=dtypes.int32)               # [B]
-  initial_state = array_ops.expand_dims(initial_state, axis=-1)   # [B, 1]
-  decode_tags, _ = rnn.dynamic_rnn(
-      crf_bwd_cell,
-      inputs=backpointers,
-      sequence_length=sequence_length - 1,
-      initial_state=initial_state,
-      time_major=False,
-      dtype=dtypes.int32)           # [B, T - 1, 1]
-  decode_tags = array_ops.squeeze(decode_tags, axis=[2])           # [B, T - 1]
-  decode_tags = array_ops.concat([initial_state, decode_tags], axis=1)  # [B, T]
-  decode_tags = gen_array_ops.reverse_sequence(
-      decode_tags, sequence_length, seq_dim=1)                     # [B, T]
-
-  best_score = math_ops.reduce_max(last_score, axis=1)             # [B]
-  return decode_tags, best_score
+  # If max_seq_len is 1, we skip the algorithm and simply return the argmax tag
+  # and the max activation.
+  def _single_seq_fn():
+    squeezed_potentials = array_ops.squeeze(potentials, [1])
+    decode_tags = array_ops.expand_dims(
+        math_ops.argmax(squeezed_potentials, axis=1), 1)
+    best_score = math_ops.reduce_max(squeezed_potentials, axis=1)
+    return math_ops.cast(decode_tags, dtype=dtypes.int32), best_score
+
+  def _multi_seq_fn():
+    """Decoding of highest scoring sequence."""
+
+    # For simplicity, in shape comments, denote:
+    # 'batch_size' by 'B', 'max_seq_len' by 'T' , 'num_tags' by 'O' (output).
+    num_tags = potentials.get_shape()[2].value
+
+    # Computes forward decoding. Get last score and backpointers.
+    crf_fwd_cell = CrfDecodeForwardRnnCell(transition_params)
+    initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1])
+    initial_state = array_ops.squeeze(initial_state, axis=[1])  # [B, O]
+    inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1])  # [B, T-1, O]
+    backpointers, last_score = rnn.dynamic_rnn(  # [B, T - 1, O], [B, O]
+        crf_fwd_cell,
+        inputs=inputs,
+        sequence_length=sequence_length - 1,
+        initial_state=initial_state,
+        time_major=False,
+        dtype=dtypes.int32)
+    backpointers = gen_array_ops.reverse_sequence(  # [B, T - 1, O]
+        backpointers, sequence_length - 1, seq_dim=1)
+
+    # Computes backward decoding. Extract tag indices from backpointers.
+    crf_bwd_cell = CrfDecodeBackwardRnnCell(num_tags)
+    initial_state = math_ops.cast(math_ops.argmax(last_score, axis=1),  # [B]
+                                  dtype=dtypes.int32)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)  # [B, 1]
+    decode_tags, _ = rnn.dynamic_rnn(  # [B, T - 1, 1]
+        crf_bwd_cell,
+        inputs=backpointers,
+        sequence_length=sequence_length - 1,
+        initial_state=initial_state,
+        time_major=False,
+        dtype=dtypes.int32)
+    decode_tags = array_ops.squeeze(decode_tags, axis=[2])  # [B, T - 1]
+    decode_tags = array_ops.concat([initial_state, decode_tags],   # [B, T]
+                                   axis=1)
+    decode_tags = gen_array_ops.reverse_sequence(  # [B, T]
+        decode_tags, sequence_length, seq_dim=1)
+
+    best_score = math_ops.reduce_max(last_score, axis=1)  # [B]
+    return decode_tags, best_score
+
+  return utils.smart_cond(
+      pred=math_ops.equal(
+          potentials.shape[1].value or array_ops.shape(potentials)[1], 1),
+      fn1=_single_seq_fn,
+      fn2=_multi_seq_fn)
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index f192f78b98174d4e1af2e91f90b6a285fe51b628..fce2c03e69bc4b8b0ac46b8e081a33c43c9d41ab 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -54,48 +54,13 @@ tf_gen_op_wrapper_py(
     deps = [":cudnn_rnn_ops_op_lib"],
 )
 
-tf_custom_op_py_library(
-    name = "cudnn_rnn_ops_py",
-    srcs = [
-        "__init__.py",
-        "python/ops/cudnn_rnn_ops.py",
-    ],
-    dso = [
-        ":python/ops/_cudnn_rnn_ops.so",
-    ],
-    kernels = [
-        ":cudnn_rnn_kernels",
-        ":cudnn_rnn_ops_op_lib",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":cudnn_rnn_ops",
-        "//tensorflow/contrib/rnn:rnn_py",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:common_shapes",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-    ],
-)
-
 tf_custom_op_py_library(
     name = "cudnn_rnn_py",
     srcs = [
         "__init__.py",
+        "python/layers/__init__.py",
         "python/layers/cudnn_rnn.py",
+        "python/ops/cudnn_rnn_ops.py",
     ],
     dso = [
         ":python/ops/_cudnn_rnn_ops.so",
@@ -108,7 +73,6 @@ tf_custom_op_py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":cudnn_rnn_ops",
-        ":cudnn_rnn_ops_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -129,7 +93,7 @@ cuda_py_test(
     size = "large",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
     additional_deps = [
-        ":cudnn_rnn_ops_py",
+        ":cudnn_rnn_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python/ops/losses:losses",
@@ -154,7 +118,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "cudnn_rnn_test",
-    size = "large",
+    size = "enormous",
     srcs = ["python/kernel_tests/cudnn_rnn_test.py"],
     additional_deps = [
         ":cudnn_rnn_py",
diff --git a/tensorflow/contrib/cudnn_rnn/__init__.py b/tensorflow/contrib/cudnn_rnn/__init__.py
index 87ba834770d8f707c5364ed7bb8db4aaaa21f286..5d8c6191f8db9f96532aa78e4790a4665d3b4877 100644
--- a/tensorflow/contrib/cudnn_rnn/__init__.py
+++ b/tensorflow/contrib/cudnn_rnn/__init__.py
@@ -30,15 +30,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleGRUCell
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleLSTMCell
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnGRU
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnGRUSaveable
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTM
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTMSaveable
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNRelu
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNReluSaveable
-from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNTanhSaveable
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.cudnn_rnn.python.layers import *
+# pylint: enable=unused-import,wildcard-import
 
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -56,4 +50,4 @@ _allowed_symbols = [
     "CudnnRNNTanhSaveable",
 ]
 
-remove_undocumented(__name__)
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
index 55fce0a916c9b057234d11d475b56322ce1e29d2..5d5f593d016a3bb9f7b5ea8f5cd40c29268dc4f5 100644
--- a/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/kernels/cudnn_rnn_ops.cc
@@ -577,6 +577,7 @@ class CudnnRNNParamsSizeOp<GPUDevice, T, Index> : public CudnnRNNKernelCommon {
                               .TypeConstraint<int32>("S"), \
                           CudnnRNNParamsSizeOp<GPUDevice, T, int32>);
 
+TF_CALL_half(REGISTER_GPU);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
@@ -711,6 +712,7 @@ class CudnnRNNParamsToCanonical<GPUDevice, T> : public CudnnRNNKernelCommon {
                               .HostMemory("input_size")     \
                               .TypeConstraint<T>("T"),      \
                           CudnnRNNParamsToCanonical<GPUDevice, T>);
+TF_CALL_half(REGISTER_GPU);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
@@ -757,7 +759,9 @@ class CudnnRNNCanonicalToParams<GPUDevice, T> : public CudnnRNNKernelCommon {
                               .HostMemory("input_size")     \
                               .TypeConstraint<T>("T"),      \
                           CudnnRNNCanonicalToParams<GPUDevice, T>);
-TF_CALL_float(REGISTER_GPU) TF_CALL_double(REGISTER_GPU);
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // Run the forward operation of the RNN model.
@@ -906,6 +910,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
       Name("CudnnRNN").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       CudnnRNNForwardOp<GPUDevice, T>);
 
+TF_CALL_half(REGISTER_GPU);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
@@ -1125,6 +1130,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
       Name("CudnnRNNBackprop").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
       CudnnRNNBackwardOp<GPUDevice, T>);
 
+TF_CALL_half(REGISTER_GPU);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
diff --git a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
index 2b297282b264a3777e0a981a1ecccabb0a3a2c4e..9e41e67857101534e8bfef8d5d0b8a45ed8f1f76 100644
--- a/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/contrib/cudnn_rnn/ops/cudnn_rnn_ops.cc
@@ -75,7 +75,7 @@ REGISTER_OP("CudnnRNNParamsSize")
     .Input("num_layers: int32")
     .Input("num_units: int32")
     .Input("input_size: int32")
-    .Attr("T: {float32, float64}")
+    .Attr("T: {float16, float32, float64}")
     .Attr("S: {int32, int64}")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
@@ -130,7 +130,7 @@ REGISTER_OP("CudnnRNN")
     .Output("output_h: T")
     .Output("output_c: T")
     .Output("reserve_space: T")
-    .Attr("T: {float32, float64}")
+    .Attr("T: {float16, float32, float64}")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
@@ -190,7 +190,7 @@ REGISTER_OP("CudnnRNNBackprop")
     .Output("input_h_backprop: T")
     .Output("input_c_backprop: T")
     .Output("params_backprop: T")
-    .Attr("T: {float32, float64}")
+    .Attr("T: {float16, float32, float64}")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
     .Attr(kRNNDirectionAttrs)
@@ -236,7 +236,7 @@ REGISTER_OP("CudnnRNNParamsToCanonical")
     .Input("params: T")
     .Output("weights: num_params * T")
     .Output("biases: num_params * T")
-    .Attr("T: {float32, float64}")
+    .Attr("T: {float16, float32, float64}")
     .Attr("num_params: int")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
@@ -279,7 +279,7 @@ REGISTER_OP("CudnnRNNCanonicalToParams")
     .Input("weights: num_params * T")
     .Input("biases: num_params * T")
     .Output("params: T")
-    .Attr("T: {float32, float64}")
+    .Attr("T: {float16, float32, float64}")
     .Attr("num_params: int")
     .Attr(kRNNModeAttrs)
     .Attr(kRNNInputModeAttrs)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index 9156087f338f0f59f102560d7538b1871c84e23e..5a667485beebe4bee7f051b5920920c72134987f 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -35,15 +35,11 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import rnn as rnn_lib
-from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
 CUDNN_RNN_UNIDIRECTION = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
@@ -123,45 +119,6 @@ def _CreateParamsSavable(params,
   return params_saveable
 
 
-def _BuildCudnnForward(rnn_mode,
-                       num_layers,
-                       num_units,
-                       input_data,
-                       is_training=False):
-  input_data_shape = input_data.get_shape().with_rank(3)
-  batch_size = input_data_shape[1].value
-  input_size = input_data_shape[2].value
-  model = _CreateModel(rnn_mode, num_layers, num_units, input_size)
-
-  # Set zero init input states
-  input_h = constant_op.constant(
-      np.zeros([num_layers, batch_size, num_units]), dtype=dtypes.float32)
-  has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-  if has_input_c:
-    input_c = constant_op.constant(
-        np.zeros([num_layers, batch_size, num_units]), dtype=dtypes.float32)
-
-  # Set rnn params
-  params_size_t = model.params_size()
-  params = variables.Variable(
-      random_ops.random_uniform([params_size_t]), validate_shape=False)
-  args = {
-      "input_data": input_data,
-      "input_h": input_h,
-      "params": params,
-      "is_training": is_training
-  }
-  if has_input_c:
-    args["input_c"] = input_c
-  # Build cell
-  output_tuple = model(**args)
-
-  # Create savable objects for params
-  _CreateParamsSavable(params, model)
-
-  return output_tuple, model
-
-
 def _MinLSTMParamSize(num_layers,
                       num_units,
                       input_size,
@@ -181,25 +138,6 @@ def _MinLSTMParamSize(num_layers,
     raise ValueError("%s direction is not supported.")
 
 
-def _CreateCudnnCompatibleCanonicalRNN(cudnn_model,
-                                       inputs,
-                                       scope=None):
-  model = cudnn_model.rnn_mode
-  if model not in (cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU):
-    raise ValueError("%s is not supported!" % model)
-
-  num_units = cudnn_model.num_units
-  num_layers = cudnn_model.num_layers
-  # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
-  if model == cudnn_rnn_ops.CUDNN_LSTM:
-    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
-  else:
-    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
-  cell = rnn_cell_impl.MultiRNNCell([single_cell() for _ in range(num_layers)])
-  return rnn_lib.dynamic_rnn(
-      cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope)
-
-
 class CudnnRNNTestSaveRestore(TensorFlowTestCase):
 
   def _CompareWeights(self, lhs, rhs):
@@ -436,143 +374,6 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
       self._testSaveRestoreOutput(rnn_mode, direction, dtype)
 
 
-class CudnnRNNTestCompatibleRnnCells(TensorFlowTestCase):
-
-  @unittest.skipUnless(test.is_built_with_cuda(),
-                       "Test only applicable when running on GPUs")
-  def testCudnnCompatibleRnnCells(self):
-    configs = [
-        {
-            "num_layers": 1,
-            "seq_length": 3,
-            "num_units": 4,
-            "input_size": 5,
-            "batch_size": 6,
-        },
-        {
-            "num_layers": 2,
-            "seq_length": 8,
-            "num_units": 4,
-            "input_size": 8,
-            "batch_size": 16,
-        },
-        {
-            "num_layers": 2,
-            "seq_length": 3,
-            "num_units": 4,
-            "input_size": 5,
-            "batch_size": 6,
-        },
-        {
-            "num_layers": 1,
-            "seq_length": 2,
-            "num_units": 2,
-            "input_size": 4,
-            "batch_size": 1,
-        },
-    ]
-    for rnn, cfg in itertools.product((cudnn_rnn_ops.CUDNN_LSTM,), configs):
-      self._testCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"],
-                                        cfg["num_units"], cfg["input_size"],
-                                        cfg["batch_size"], rnn)
-    # TODO(jamesqin): Add CudnnCompatibleGRUBlockCell.
-    for rnn, cfg in itertools.product((cudnn_rnn_ops.CUDNN_GRU,), configs):
-      self._testCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"],
-                                        cfg["num_units"], cfg["input_size"],
-                                        cfg["batch_size"], rnn)
-
-  def _testCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units,
-                                   input_size, batch_size, rnn_mode):
-    has_state_c = rnn_mode == cudnn_rnn_ops.CUDNN_LSTM
-    np.random.seed(0)
-    # Train graph
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(299)
-      input_data = array_ops.placeholder(
-          dtypes.float32, shape=[seq_length, batch_size, input_size])
-      output_tuple, cudnn_model = _BuildCudnnForward(
-          rnn_mode, num_layers, num_units, input_data, is_training=True)
-      target_output = array_ops.placeholder(dtype=dtypes.float32, shape=None)
-      total_sum = sum(map(math_ops.reduce_sum, output_tuple))
-
-      loss_op = losses.log_loss(labels=target_output, predictions=total_sum)
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1e-2)
-      train_op = optimizer.minimize(loss_op)
-
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-
-      # Train Cudnn model
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        # Train 128 steps
-        num_steps = 128
-        for _ in range(num_steps):
-          inputs = np.random.rand(seq_length, batch_size,
-                                  input_size).astype(np.float32)
-          targets = np.random.rand()
-          sess.run(
-              train_op, feed_dict={input_data: inputs,
-                                   target_output: targets})
-
-        save_path = os.path.join(self.get_temp_dir(),
-                                 ("cudnn-rnn-%s-test" % rnn_mode))
-        save_v = saver.save(sess, save_path)
-        self.assertEqual(save_path, save_v)
-
-    # cuDNN inference graph
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(299)
-      cudnn_inputs = array_ops.placeholder(
-          dtypes.float32, shape=[seq_length, batch_size, input_size])
-      (cudnn_output_tuple, cudnn_model) = _BuildCudnnForward(
-          rnn_mode, num_layers, num_units, cudnn_inputs, is_training=False)
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-
-      inference_input = np.random.rand(seq_length, batch_size,
-                                       input_size).astype(np.float32)
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        saver.restore(sess, save_path)
-
-        # Cudnn inference
-        cudnn_output = sess.run(
-            cudnn_output_tuple, feed_dict={cudnn_inputs: inference_input})
-
-    # Canonical RNN inference graph
-    with ops.Graph().as_default():
-      random_seed.set_random_seed(299)
-      cell_inputs = array_ops.placeholder(
-          dtypes.float32, shape=[seq_length, batch_size, input_size])
-      (output, states) = _CreateCudnnCompatibleCanonicalRNN(
-          cudnn_model, cell_inputs)
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        saver.restore(sess, save_path)
-
-        # BlockCell inference
-        output_v, states_v = sess.run(
-            [output, states], feed_dict={cell_inputs: inference_input})
-
-        # output across timestamps are packed into one tensor.
-        self.assertAllClose(cudnn_output[0], output_v, atol=1e-6, rtol=1e-6)
-
-        for i in range(num_layers):
-          if has_state_c:
-            # output_h
-            self.assertAllClose(
-                cudnn_output[1][i, :], states_v[i].h, atol=1e-6, rtol=1e-6)
-            # output_c
-            self.assertAllClose(
-                cudnn_output[2][i, :], states_v[i].c, atol=1e-6, rtol=1e-6)
-          else:
-            self.assertAllClose(
-                cudnn_output[1][i, :], states_v[i], atol=1e-6, rtol=1e-6)
-
-
 class CudnnRNNTestParamsSize(TensorFlowTestCase):
 
   def _testOneLSTMParamsSize(self, num_layers, num_units, input_size,
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 1ce8954bb09d7444a552d0ba6b3d9bb72cd919fd..e65394cba07574ed49398981f1cbd8bcb402e24f 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -17,8 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
+import collections
 import itertools
 import os
+import sys
 import unittest
 
 import numpy as np
@@ -49,6 +52,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
+
 CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM
 CUDNN_GRU = cudnn_rnn_ops.CUDNN_GRU
 CUDNN_RNN_RELU = cudnn_rnn_ops.CUDNN_RNN_RELU
@@ -78,9 +82,10 @@ class CudnnTestModel(object):
                dropout=0.,
                dtype=dtypes.float32,
                training=False,
+               seed=None,
                kernel_initializer=None,
                bias_initializer=None):
-    if dtype not in (dtypes.float32, dtypes.float64):
+    if dtype not in (dtypes.float16, dtypes.float32, dtypes.float64):
       raise ValueError("Invalid dtype: %s" % dtype)
     self._dtype = dtype
 
@@ -110,6 +115,7 @@ class CudnnTestModel(object):
         direction=direction,
         dropout=dropout,
         dtype=dtype,
+        seed=seed,
         kernel_initializer=kernel_initializer,
         bias_initializer=bias_initializer)
     self._rnn.build([None, None, input_size])
@@ -499,7 +505,7 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
 
   def _TestSaveRestoreHelper(self, rnn_mode):
     directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
-    dtype_list = [dtypes.float32, dtypes.float64]
+    dtype_list = [dtypes.float16, dtypes.float32, dtypes.float64]
     for direction, dtype in itertools.product(directions, dtype_list):
       self._TestSaveRestoreVariable(rnn_mode, direction, dtype)
       self._TestSaveRestoreTwoVariables(rnn_mode, direction, dtype)
@@ -722,19 +728,17 @@ class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
           outputs_v, output_state_v = sess.run(
               [outputs, output_state],
               feed_dict={cell_inputs: inference_input})
-          self.assertAllClose(cudnn_outputs_v, outputs_v, atol=1e-5, rtol=1e-5)
+          self.assertAllClose(cudnn_outputs_v, outputs_v, atol=2e-5, rtol=2e-5)
           (cudnn_output_h_v,) = cudnn_output_states_v
-          self.assertAllClose(cudnn_output_h_v, output_state_v, atol=1e-5,
-                              rtol=1e-5)
+          self.assertAllClose(cudnn_output_h_v, output_state_v, atol=2e-5,
+                              rtol=2e-5)
 
 
 class CudnnRNNTestParamsSize(TensorFlowTestCase):
 
   def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size,
-                            direction):
+                            dtype, direction):
     logging.info("Testing one lstm param size with config: %s", locals())
-    dtype = dtypes.float32
-
     model = CudnnTestModel(
         rnn_mode,
         num_layers,
@@ -767,13 +771,14 @@ class CudnnRNNTestParamsSize(TensorFlowTestCase):
         [3, 200, 400],
     ]
     directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+    dtype_list = [dtypes.float16, dtypes.float32, dtypes.float64]
     rnns = [CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH]
-    for (rnn, config, direction) in itertools.product(rnns, test_configs,
-                                                      directions):
+    for (rnn, config, dtype, direction) in itertools.product(
+        rnns, test_configs, dtype_list, directions):
       num_layers, num_units, input_size = config
       with ops.Graph().as_default():
         self._TestOpaqueParamsSize(rnn, num_layers, num_units, input_size,
-                                   direction)
+                                   dtype, direction)
 
 
 class CudnnRNNTestTraining(TensorFlowTestCase):
@@ -819,9 +824,63 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
       numeric_grad[i] = (y_pos - y_neg) / (2 * delta)
     return numeric_grad.reshape(x_shape)
 
+  def _GetShape(self, sess, inputs):
+    if not isinstance(inputs, collections.Iterable):
+      return sess.run(array_ops.shape(inputs))
+    else:
+      return sess.run([array_ops.shape(x) for x in inputs])
+
+  def _GradientCheckFp16(self, sess, y, xs, num_samples,
+                         tolerance=1e-6, delta=1e-4):
+    """Gradient check for Fp16.
+
+    Fp16 numerical gradients end up being zeros. Use a new way to check
+    gradients:
+
+    Given multi-variant function:
+    y = f(x1, x2, ... xn)
+    delta_y = f(x1 + delta_x1, x2+delta_x2, ..., xn+delta_xn) -
+              f(x1, x2, ..., xn)
+            = f'(x1) * delta_x1 + f'(x2) * delta_x2 + .. + f'(xn) * delta_xn
+    where:
+      delta_xi are very small disturbance.
+      f'(xi) is the gradient of y w.r.t xi.
+
+    The gradient check verifies the expected delta_y calculated by the above
+    equation is close to the actual delta_y.
+    Args:
+      sess: tf.Session object.
+      y: output tensor.
+      xs: a tensor or a list of input tensors.
+      num_samples: number of test samples to run.
+      tolerance: error tolerance.
+      delta: the order of magnititued of input disturbance to apply to calculate
+        the output change w.r.t inputs.
+    """
+    sym_grads = self._ComputeSymGrads(sess, y, xs)
+    xs_shapes = self._GetShape(sess, xs)
+
+    x_vals = [sess.run(x) for x in xs]
+    for _ in range(num_samples):
+      delta_xs = [delta * np.random.rand(*shape.tolist())
+                  for shape in xs_shapes]
+
+      feed_dict = {}
+      for x, x_val, delta_x in zip(xs, x_vals, delta_xs):
+        feed_dict[x] = x_val + delta_x
+      actual_delta_y = (float(sess.run(y, feed_dict=feed_dict)) -
+                        float(sess.run(y)))
+
+      expected_delta_y = 0.
+      for sym_grad, delta_x in zip(sym_grads, delta_xs):
+        expected_delta_y += np.dot(
+            sym_grad.astype(np.float32).flatten(),
+            delta_x.astype(np.float32).flatten())
+      self.assertAllClose(expected_delta_y, actual_delta_y,
+                          atol=tolerance, rtol=tolerance)
+
   def _GradientCheck(self, sess, y, xs, tolerance=1e-6, delta=1e-4):
-    sym_grads_t = gradients.gradients(y, xs)
-    sym_grads = sess.run(sym_grads_t)
+    sym_grads = self._ComputeSymGrads(sess, y, xs)
 
     num_grads = [self._ComputeNumericGrad(sess, y, x, delta) for x in xs]
     self.assertEqual(len(sym_grads), len(num_grads))
@@ -830,6 +889,10 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
       self.assertFalse(np.any(np.isnan(num)))
       self.assertAllClose(sym, num, atol=tolerance, rtol=tolerance)
 
+  def _ComputeSymGrads(self, sess, y, xs):
+    sym_grads_t = gradients.gradients(y, xs)
+    return sess.run(sym_grads_t)
+
   def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                              batch_size, seq_length, dir_count, dropout, dtype,
                              delta, tolerance):
@@ -838,6 +901,8 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
     logging.info("Training test with config: %s", locals())
     old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
     os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
+
+    np.random.seed(1234)
     random_seed.set_random_seed(5678)
     has_input_c = (rnn_mode == CUDNN_LSTM)
     direction = (CUDNN_RNN_UNIDIRECTION
@@ -879,12 +944,22 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
       all_inputs = [inputs, params]
       for s in initial_state:
         all_inputs.append(s)
-      self._GradientCheck(
-          sess, total_sum, all_inputs, tolerance=tolerance, delta=delta)
+      if dtype == dtypes.float16:
+        self._GradientCheckFp16(
+            sess, total_sum, all_inputs,
+            num_samples=FLAGS.grad_check_num_samples,
+            tolerance=tolerance, delta=delta)
+      else:
+        for _ in range(FLAGS.grad_check_num_samples):
+          # Each time choose a different set of inputs.
+          sess.run(variables.global_variables_initializer())
+          self._GradientCheck(
+              sess, total_sum, all_inputs,
+              tolerance=tolerance, delta=delta)
       os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
 
   def _TestSimpleTrainingHelper(self, rnn_mode, test_configs):
-    dropouts = [0., 0.5, 1.]
+    dropouts = [0, 0.5, 1.]
     for config, dropout in itertools.product(test_configs, dropouts):
       dtype = config.get("dtype", dtypes.float32)
       delta = config.get("delta", 1e-4)
@@ -895,11 +970,12 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
         self._TestOneSimpleTraining(rnn_mode, shape["num_layers"],
                                     shape["num_units"], shape["input_size"],
                                     shape["batch_size"], shape["seq_length"],
-                                    dir_count, dropout, dtype, delta, tolerance)
+                                    dir_count, dropout, dtype, delta,
+                                    tolerance)
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingLSTM64(self):
+  def testSimpleTrainingLSTMFp64(self):
     test_configs = [
         {
             "dtype": dtypes.float64,
@@ -917,7 +993,7 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingLSTM32(self):
+  def testSimpleTrainingLSTMFp32(self):
     test_configs = [
         {
             "dtype": dtypes.float32,
@@ -936,7 +1012,38 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingGRU64(self):
+  def testSimpleTrainingLSTMFp16(self):
+    test_configs = [
+        {
+            "dtype": dtypes.float16,
+            "delta": 1e-3,
+            "tolerance": 9e-2,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+            },
+        },
+        {
+            "dtype": dtypes.float16,
+            "delta": 1e-2,
+            "tolerance": 9e-2,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 6,
+                "input_size": 8,
+                "batch_size": 6,
+                "seq_length": 4,
+            },
+        },
+    ]
+    self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testSimpleTrainingGRUFp64(self):
     test_configs = [
         {
             "dtype": dtypes.float64,
@@ -954,7 +1061,7 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingGRU32(self):
+  def testSimpleTrainingGRUFp32(self):
     test_configs = [
         {
             "dtype": dtypes.float32,
@@ -973,7 +1080,26 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingRNNTanh64(self):
+  def testSimpleTrainingGRUFp16(self):
+    test_configs = [
+        {
+            "dtype": dtypes.float16,
+            "delta": 2e-3,
+            "tolerance": 6e-2,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+            },
+        },
+    ]
+    self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testSimpleTrainingRNNTanhFp64(self):
     test_configs = [
         {
             "dtype": dtypes.float64,
@@ -991,7 +1117,7 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingRNNTanh32(self):
+  def testSimpleTrainingRNNTanhFp32(self):
     test_configs = [
         {
             "dtype": dtypes.float32,
@@ -1010,7 +1136,26 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingRNNRelu64(self):
+  def testSimpleTrainingRNNTanhFp16(self):
+    test_configs = [
+        {
+            "dtype": dtypes.float16,
+            "delta": 1e-3,
+            "tolerance": 5e-2,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+            },
+        },
+    ]
+    self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testSimpleTrainingRNNReluFp64(self):
     test_configs = [
         {
             "dtype": dtypes.float64,
@@ -1028,10 +1173,29 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleTrainingRNNRelu32(self):
+  def testSimpleTrainingRNNReluFp32(self):
     test_configs = [
         {
             "dtype": dtypes.float32,
+            "delta": 1e-4,
+            "tolerance": 3e-1,
+            "shape": {
+                "num_layers": 2,
+                "num_units": 3,
+                "input_size": 4,
+                "batch_size": 3,
+                "seq_length": 4,
+            },
+        },
+    ]
+    self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def testSimpleTrainingRNNReluFp16(self):
+    test_configs = [
+        {
+            "dtype": dtypes.float16,
             "delta": 1e-3,
             "tolerance": 7e-2,
             "shape": {
@@ -1047,4 +1211,13 @@ class CudnnRNNTestTraining(TensorFlowTestCase):
 
 
 if __name__ == "__main__":
+  argv0 = sys.argv[0]
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      "--grad_check_num_samples",
+      type=int,
+      default=5,
+      help="Number of samples to run for gradient check.")
+  FLAGS, unparsed = parser.parse_known_args()
+  sys.argv = [argv0] + unparsed
   googletest.main()
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f09466b631f69d6234573dd5eafada650421c117
--- /dev/null
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""layers module with higher level CudnnRNN primitives."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import sys
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.cudnn_rnn.python.layers.cudnn_rnn import *
+# pylint: enable=unused-import,wildcard-import
+
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleGRUCell
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnCompatibleLSTMCell
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnGRUSaveable
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnLSTMSaveable
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNReluSaveable
+from tensorflow.contrib.cudnn_rnn.python.ops.cudnn_rnn_ops import CudnnRNNTanhSaveable
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 3d3f8a3be0554c709ce053106f754f27d8ed630a..37c61a71a3bdac4fadef58ba8c24b853fb3638ef 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 
+
 CUDNN_RNN_UNIDIRECTION = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
 CUDNN_RNN_BIDIRECTION = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
 CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM
@@ -45,6 +46,9 @@ CUDNN_INPUT_SKIP_MODE = cudnn_rnn_ops.CUDNN_INPUT_SKIP_MODE
 CUDNN_INPUT_AUTO_MODE = cudnn_rnn_ops.CUDNN_INPUT_AUTO_MODE
 
 
+__all__ = ["CudnnLSTM", "CudnnGRU", "CudnnRNNTanh", "CudnnRNNRelu"]
+
+
 class _CudnnRNN(base_layer.Layer):
   # pylint:disable=line-too-long
   """Abstract class for RNN layers with Cudnn implementation.
@@ -146,7 +150,6 @@ class _CudnnRNN(base_layer.Layer):
   # Custom SaveableObject class for the CudnnRNN class.
   _saveable_cls = None
 
-  # TODO(jamesqin): support float16 CuDNN RNN
   def __init__(self,
                num_layers,
                num_units,
@@ -177,7 +180,7 @@ class _CudnnRNN(base_layer.Layer):
           inputs of each layer. When set to 0, dropout is disabled.
       seed: the op seed used for initializing dropout. See @{tf.set_random_seed}
           for behavior.
-      dtype: tf.float32 or tf.float64
+      dtype: tf.float16, tf.float32 or tf.float64
       kernel_initializer: starting value to initialize the weight.
       bias_initializer: starting value to initialize the bias
         (default is all zeros).
@@ -192,8 +195,9 @@ class _CudnnRNN(base_layer.Layer):
     cudnn_rnn_ops.check_direction(direction)
     cudnn_rnn_ops.check_input_mode(input_mode)
 
-    if dtype not in [dtypes.float32, dtypes.float64]:
-      raise ValueError("Only support float32, float64, provided %s" % dtype)
+    if dtype not in [dtypes.float16, dtypes.float32, dtypes.float64]:
+      raise ValueError(
+          "Only support float16, float32, float64, provided %s" % dtype)
     # Layer self.dtype is type name, the original DType object is kept here.
     self._plain_dtype = dtype
     self._num_layers = num_layers
@@ -454,6 +458,8 @@ class _CudnnRNN(base_layer.Layer):
         weights=cu_weights,
         biases=cu_biases,
         input_mode=self._input_mode,
+        seed=self._seed,
+        dropout=self._dropout,
         direction=self._direction)
 
   def _forward(self, inputs, h, c, opaque_params, training):
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 7d658c746ee1ecd21cefca9c9e52f611869f6176..dcd3d4732a27ae4bec579ac12ac568dc4a53baaa 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -28,6 +28,7 @@ from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
@@ -54,6 +55,11 @@ CUDNN_INPUT_LINEAR_MODE = "linear_input"
 CUDNN_INPUT_SKIP_MODE = "skip_input"
 CUDNN_INPUT_AUTO_MODE = "auto_select"
 
+# pylint:disable=protected-access
+_BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME
+_WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME
+# pylint:enable=protected-access
+
 
 class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
   """Cudnn Compatible LSTMCell.
@@ -86,9 +92,9 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
   Cudnn compatible GRU (from Cudnn library user guide):
   ```python
   r_t = sigma(x_t * W_r + h_t-1 * R_h + b_Wr + b_Rr)  # reset gate
-  i_t = sigma(x_t * W_i + h_t-1 * R_i + b_Wi + b_Ru)  # update gate
+  u_t = sigma(x_t * W_u + h_t-1 * R_u + b_Wu + b_Ru)  # update gate
   h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_Rh) + b_Wh)  # new memory gate
-  h_t = (1 - i_t) .* h'_t + i_t .* h_t-1
+  h_t = (1 - u_t) .* h'_t + u_t .* h_t-1
   ```
 
   Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
@@ -99,9 +105,6 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
   ```python
   r .* (h * R) != (r .* h) * R
   ```
-
-  TODO(jamesqin): update the impl after Cudnn 7.1 when Nvidia would adopt the
-  canonical version compatible with other tf GRU cells.
   """
 
   def __init__(self, num_units, reuse=None, kernel_initializer=None):
@@ -111,33 +114,65 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
         reuse=reuse,
         kernel_initializer=kernel_initializer)
 
+  def build(self, inputs_shape):
+    if inputs_shape[1].value is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
+                       % inputs_shape)
+
+    input_depth = inputs_shape[1].value
+    self._gate_kernel = self.add_variable(
+        "gates/%s" % _WEIGHTS_VARIABLE_NAME,
+        shape=[input_depth + self._num_units, 2 * self._num_units],
+        initializer=self._kernel_initializer)
+    self._gate_bias = self.add_variable(
+        "gates/%s" % _BIAS_VARIABLE_NAME,
+        shape=[2 * self._num_units],
+        initializer=(
+            self._bias_initializer
+            if self._bias_initializer is not None
+            else init_ops.constant_initializer(1.0, dtype=self.dtype)))
+
+    self._candidate_input_kernel = self.add_variable(
+        "candidate/input_projection/%s" % _WEIGHTS_VARIABLE_NAME,
+        shape=[input_depth, self._num_units],
+        initializer=self._kernel_initializer)
+    self._candidate_hidden_kernel = self.add_variable(
+        "candidate/hidden_projection/%s" % _WEIGHTS_VARIABLE_NAME,
+        shape=[self._num_units, self._num_units],
+        initializer=self._kernel_initializer)
+
+    self._candidate_input_bias = self.add_variable(
+        "candidate/input_projection/%s" % _BIAS_VARIABLE_NAME,
+        shape=[self._num_units],
+        initializer=(
+            self._bias_initializer
+            if self._bias_initializer is not None
+            else init_ops.zeros_initializer(dtype=self.dtype)))
+    self._candidate_hidden_bias = self.add_variable(
+        "candidate/hidden_projection/%s" % _BIAS_VARIABLE_NAME,
+        shape=[self._num_units],
+        initializer=(
+            self._bias_initializer
+            if self._bias_initializer is not None
+            else init_ops.zeros_initializer(dtype=self.dtype)))
+
   def call(self, inputs, state):
     """Gated recurrent unit (GRU) with nunits cells."""
-    with vs.variable_scope("gates"):  # Reset gate and update gate.
-      # We start with bias of 1.0 to not reset and not update.
-      bias_ones = self._bias_initializer
-      if self._bias_initializer is None:
-        dtype = inputs.dtype
-        bias_ones = init_ops.constant_initializer(1.0, dtype=dtype)
-      # pylint: disable=protected-access
-      value = math_ops.sigmoid(
-          rnn_cell_impl._linear([inputs, state], 2 * self._num_units, True,
-                                bias_ones, self._kernel_initializer))
-      r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
-      # pylint: enable=protected-access
-    with vs.variable_scope("candidate"):
-      # pylint: disable=protected-access
-      with vs.variable_scope("input_projection"):
-        hi = rnn_cell_impl._linear(inputs, self._num_units, True,
-                                   self._bias_initializer,
-                                   self._kernel_initializer)
-      with vs.variable_scope("hidden_projection"):
-        hh = r * (rnn_cell_impl._linear(state, self._num_units, True,
-                                        self._bias_initializer,
-                                        self._kernel_initializer))
-      # pylint: enable=protected-access
-      c = self._activation(hi + hh)
-    new_h = u * state + (1 - u) * c
+    gate_inputs = math_ops.matmul(
+        array_ops.concat([inputs, state], 1), self._gate_kernel)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._gate_bias)
+
+    value = math_ops.sigmoid(gate_inputs)
+    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
+
+    candidate = nn_ops.bias_add(
+        math_ops.matmul(inputs, self._candidate_input_kernel),
+        self._candidate_input_bias)
+    candidate += r * nn_ops.bias_add(
+        math_ops.matmul(state, self._candidate_hidden_kernel),
+        self._candidate_hidden_bias)
+    candidate = self._activation(candidate)
+    new_h = (1-u) * candidate + u * state
     return new_h, new_h
 
 
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index eaede0e00ecf1986873d50709d135d3f4b3ac9cd..3b1c33063f1214b68f79560f50d56bf5d31c9560 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -17,8 +17,8 @@ py_library(
     deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:iterator_ops",
-        "//tensorflow/contrib/data/python/ops:prefetching_py",
         "//tensorflow/contrib/data/python/ops:readers",
+        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -27,12 +27,8 @@ py_library(
 
 tf_custom_op_library(
     name = "_prefetching_ops.so",
-    srcs = [
-        "ops/prefetching_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/data/kernels:prefetching_kernels",
-    ],
+    srcs = ["ops/prefetching_ops.cc"],
+    deps = ["//tensorflow/contrib/data/kernels:prefetching_kernels"],
 )
 
 tf_gen_op_libs(
@@ -42,7 +38,9 @@ tf_gen_op_libs(
 filegroup(
     name = "all_files",
     srcs = glob(
-        ["**/*"],
+        include = [
+            "**/*",
+        ],
         exclude = [
             "**/METADATA",
             "**/OWNERS",
diff --git a/tensorflow/contrib/data/README.md b/tensorflow/contrib/data/README.md
index 30e909111f460bb4d0ea5fcdefaf5bdedc93b9c0..848782e8d89b8670caf3b45de4912a7e0855c102 100644
--- a/tensorflow/contrib/data/README.md
+++ b/tensorflow/contrib/data/README.md
@@ -18,7 +18,7 @@ The arguments accepted by the `Dataset.map()` transformation have changed:
 
 * `dataset.map(..., num_threads=T)` is now `dataset.map(num_parallel_calls=T)`.
 * `dataset.map(..., output_buffer_size=B)` is now
-  `dataset.map(...).prefetch(B).
+  `dataset.map(...).prefetch(B)`.
 
 Some transformations have been removed from `tf.data.Dataset`, and you must
 instead apply them using `Dataset.apply()` transformation. The full list of
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 6c46acf20442c2cc435829afa57e8383b493d6af..c9ad091bd44d6e3a9368e182c3df9fc1c6e48071 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -17,12 +17,14 @@
 See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
 @@Dataset
+@@Counter
 @@Iterator
 @@TFRecordDataset
 @@FixedLengthRecordDataset
 @@TextLineDataset
 
 @@batch_and_drop_remainder
+@@padded_batch_and_drop_remainder
 @@dense_to_sparse_batch
 @@enumerate_dataset
 @@group_by_window
@@ -30,7 +32,9 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@make_saveable_from_iterator
 @@read_batch_features
 @@unbatch
+@@parallel_interleave
 @@rejection_resample
+@@scan
 @@sloppy_interleave
 
 @@get_single_element
@@ -44,12 +48,15 @@ from __future__ import print_function
 
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
+from tensorflow.contrib.data.python.ops.batching import padded_batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import unbatch
+from tensorflow.contrib.data.python.ops.counter import Counter
 from tensorflow.contrib.data.python.ops.dataset_ops import Dataset
 from tensorflow.contrib.data.python.ops.dataset_ops import get_single_element
 from tensorflow.contrib.data.python.ops.enumerate_ops import enumerate_dataset
 from tensorflow.contrib.data.python.ops.error_ops import ignore_errors
 from tensorflow.contrib.data.python.ops.grouping import group_by_window
+from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave
 from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
 from tensorflow.contrib.data.python.ops.readers import FixedLengthRecordDataset
@@ -58,6 +65,8 @@ from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.readers import TextLineDataset
 from tensorflow.contrib.data.python.ops.readers import TFRecordDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
+from tensorflow.contrib.data.python.ops.scan_ops import scan
+from tensorflow.contrib.data.python.ops.shuffle_ops import shuffle_and_repeat
 from tensorflow.python.data.ops.iterator_ops import Iterator
 # pylint: enable=unused-import
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 424eb198522ce3d11152c2f8da6a2a5d82432cec..9b6ad9329482815b666d11d1b32b245e3ea62b54 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -4,14 +4,16 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
     size = "small",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
@@ -95,8 +97,8 @@ py_test(
         "nomac",  # b/62040583
     ],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -108,18 +110,42 @@ py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
 )
 
+py_library(
+    name = "dataset_serialization_test",
+    testonly = 1,
+    srcs = [
+        "dataset_serialization_test_base.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:iterator_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "filter_dataset_op_test",
     size = "small",
     srcs = ["filter_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -131,21 +157,28 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "flat_map_dataset_op_test",
     size = "small",
     srcs = ["flat_map_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
+        ":dataset_serialization_test",
+        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:function",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
+        "//tensorflow/python:variable_scope",
     ],
+    grpc_enabled = True,
+    tags = ["no_pip"],
 )
 
 py_test(
@@ -157,6 +190,7 @@ py_test(
         "manual",  # b/67958761
     ],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
@@ -166,18 +200,18 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "iterator_ops_cluster_test",
     size = "small",
     srcs = ["iterator_ops_cluster_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -191,14 +225,19 @@ py_test(
         "//tensorflow/python:session",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_windows",
+        "oss_serial",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "iterator_ops_test",
     size = "small",
     srcs = ["iterator_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
+        "//third_party/py/numpy",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/core:protos_all_py",
@@ -220,8 +259,8 @@ py_test(
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python/data/ops:iterator_ops",
-        "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
 )
 
 py_test(
@@ -241,12 +280,13 @@ py_test(
 
 py_test(
     name = "map_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["map_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:transformation_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -255,20 +295,35 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//third_party/py/numpy",
     ],
 )
 
+py_test(
+    name = "prefetch_dataset_op_test",
+    size = "small",
+    srcs = ["prefetch_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "range_dataset_op_test",
     size = "small",
@@ -297,25 +352,22 @@ py_test(
 
 py_test(
     name = "reader_dataset_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["reader_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:readers",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
         "//tensorflow/python:lib",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
@@ -341,10 +393,12 @@ py_test(
 
 py_test(
     name = "sequence_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["sequence_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -368,16 +422,24 @@ py_test(
 
 py_test(
     name = "shuffle_dataset_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["shuffle_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:iterator_ops",
+        "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
     ],
@@ -397,21 +459,32 @@ py_test(
     ],
 )
 
+py_test(
+    name = "stats_dataset_ops_test",
+    size = "small",
+    srcs = ["stats_dataset_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/data/python/ops:dataset_ops",
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+    ],
+)
+
 py_test(
     name = "zip_dataset_op_test",
     size = "small",
     srcs = ["zip_dataset_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_pip"],
     deps = [
+        ":dataset_serialization_test",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
 )
@@ -421,20 +494,31 @@ py_test(
     size = "small",
     srcs = ["prefetching_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        "no_oss",  # b/68785503
+    ],
     deps = [
-        "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/data/python/ops:prefetching_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
 filegroup(
     name = "all_files",
     srcs = glob(
-        ["**/*"],
+        include = [
+            "**/*",
+        ],
         exclude = [
             "**/METADATA",
             "**/OWNERS",
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index add17ff8bcea0f228dc36ec6157fe95b9ce44d80..d975a0167fe2cc8ae81431a8687aaf8695119a98 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -21,6 +21,7 @@ import math
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -51,8 +52,9 @@ class BatchDatasetTest(test.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-                .repeat(count).batch(batch_size).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+        .repeat(count).batch(batch_size).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -68,7 +70,7 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
-            self.assertAllEqual(component[(i*14 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
                                 result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -83,12 +85,12 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
-            self.assertAllEqual(component[(i*8 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
       result = sess.run(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1)*8 + j) % 7]**2,
+          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
                               result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -102,14 +104,67 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
+        5).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
+        2).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      actual = sess.run(get_next)
+      expected = sparse_tensor.SparseTensorValue(
+          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
+                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
+          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+          dense_shape=[2, 5, 1])
+      self.assertTrue(sparse_tensor.is_sparse(actual))
+      self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testPaddedBatchDataset(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
     padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens)
-                .map(lambda x: array_ops.fill([x], x)).padded_batch(
-                    4,
-                    padded_shapes=padded_shape).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(seq_lens)
+        .map(lambda x: array_ops.fill([x], x)).padded_batch(
+            4, padded_shapes=padded_shape).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -117,35 +172,40 @@ class BatchDatasetTest(test.TestCase):
     with self.test_session() as sess:
       # Test with random sequence lengths, and max padding.
       random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [-1],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         padded_len = np.max(result)
         self.assertEqual((4, padded_len), result.shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Test with random sequence lengths, and constant padding.
-      sess.run(init_op, feed_dict={padded_shape: [25],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [25],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         self.assertEqual((4, 25), result.shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
       # Test correct handling of empty tensors.
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: [0, 0, 0, 0]})
+      sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
       result = sess.run(get_next)
       self.assertAllEqual([[], [], [], []], result)
       with self.assertRaises(errors.OutOfRangeError):
@@ -153,8 +213,7 @@ class BatchDatasetTest(test.TestCase):
 
       # Test error handling with constant sequence lengths, and
       # too-short padding.
-      sess.run(init_op, feed_dict={padded_shape: [5],
-                                   seq_lens: [6, 5, 5, 5]})
+      sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
       with self.assertRaises(errors.DataLossError):
         result = sess.run(get_next)
 
@@ -165,11 +224,13 @@ class BatchDatasetTest(test.TestCase):
     def fill_tuple(x):
       filled = array_ops.fill([x], x)
       return (filled, string_ops.as_string(filled))
-    iterator = (dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-                .padded_batch(
-                    4,
-                    padded_shapes=(padded_shape, padded_shape),
-                    padding_values=(-1, "<end>")).make_initializable_iterator())
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
+        .padded_batch(
+            4,
+            padded_shapes=(padded_shape, padded_shape),
+            padding_values=(-1, "<end>")).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -177,15 +238,18 @@ class BatchDatasetTest(test.TestCase):
     with self.test_session() as sess:
       # Test with random sequence lengths, and max padding.
       random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(init_op, feed_dict={padded_shape: [-1],
-                                   seq_lens: random_seq_lens})
+      sess.run(
+          init_op, feed_dict={
+              padded_shape: [-1],
+              seq_lens: random_seq_lens
+          })
       for i in range(8):
         result = sess.run(get_next)
         padded_len = np.max(result[0])
         self.assertEqual((4, padded_len), result[0].shape)
         self.assertEqual((4, padded_len), result[1].shape)
         for j in range(4):
-          seq_len = random_seq_lens[(i*4)+j]
+          seq_len = random_seq_lens[(i * 4) + j]
           self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
           self.assertAllEqual(result[0][j, seq_len:],
                               [-1] * (padded_len - seq_len))
@@ -219,20 +283,30 @@ class BatchDatasetTest(test.TestCase):
                        constant_op.constant([-1, -1], dtype=dtypes.int64),
                        constant_op.constant([37], dtype=dtypes.int64)))
 
-    for dataset in [dynamic_padding_from_tensor_shapes,
-                    dynamic_padding_from_lists,
-                    dynamic_padding_from_lists_with_minus_one,
-                    dynamic_padding_from_tensors]:
+    for dataset in [
+        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
+        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
+    ]:
       self.assertEqual([None, None], dataset.output_shapes[0].as_list())
       self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
       self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
 
+  def testPaddedBatchSparseError(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    with self.assertRaises(TypeError):
+      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
+
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .map(lambda x: array_ops.fill([x], x)).apply(
-                    batching.dense_to_sparse_batch(4, [12]))
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.fill([x], x)).apply(
+            batching.dense_to_sparse_batch(4,
+                                           [12])).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = sparse_tensor.SparseTensor(*iterator.get_next())
 
@@ -241,24 +315,26 @@ class BatchDatasetTest(test.TestCase):
 
       for start in range(0, len(components), 4):
         results = sess.run(get_next)
+        self.assertAllEqual([[i, j]
+                             for i, c in enumerate(components[start:start + 4])
+                             for j in range(c)], results.indices)
         self.assertAllEqual(
-            [[i, j] for i, c in enumerate(components[start:start+4])
-             for j in range(c)], results.indices)
-        self.assertAllEqual(
-            [c for c in components[start:start+4] for _ in range(c)],
+            [c for c in components[start:start + 4] for _ in range(c)],
             results.values)
-        self.assertAllEqual(
-            [min(4, len(components) - start), 12], results.dense_shape)
+        self.assertAllEqual([min(4,
+                                 len(components) - start), 12],
+                            results.dense_shape)
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
   def testDenseToSparseBatchDatasetWithUnknownShape(self):
     components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .map(lambda x: array_ops.fill([x, x], x)).apply(
-                    batching.dense_to_sparse_batch(
-                        4, [5, -1])).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .map(lambda x: array_ops.fill([x, x], x)).apply(
+            batching.dense_to_sparse_batch(
+                4, [5, -1])).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = sparse_tensor.SparseTensor(*iterator.get_next())
 
@@ -267,27 +343,30 @@ class BatchDatasetTest(test.TestCase):
 
       for start in range(0, len(components), 4):
         results = sess.run(get_next)
-        self.assertAllEqual(
-            [[i, j, z] for i, c in enumerate(components[start:start+4])
-             for j in range(c) for z in range(c)], results.indices)
-        self.assertAllEqual(
-            [c for c in components[start:start+4]
-             for _ in range(c) for _ in range(c)],
-            results.values)
-        self.assertAllEqual(
-            [min(4, len(components) - start),
-             5,
-             np.max(components[start:start+4])],
-            results.dense_shape)
+        self.assertAllEqual([[i, j, z]
+                             for i, c in enumerate(components[start:start + 4])
+                             for j in range(c)
+                             for z in range(c)], results.indices)
+        self.assertAllEqual([
+            c
+            for c in components[start:start + 4] for _ in range(c)
+            for _ in range(c)
+        ], results.values)
+        self.assertAllEqual([
+            min(4,
+                len(components) - start), 5,
+            np.max(components[start:start + 4])
+        ], results.dense_shape)
 
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
   def testDenseToSparseBatchDatasetWithInvalidShape(self):
     input_tensor = array_ops.constant([[1]])
-    iterator = (dataset_ops.Dataset.from_tensors(input_tensor)
-                .apply(batching.dense_to_sparse_batch(4, [-2]))
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensors(input_tensor).apply(
+            batching.dense_to_sparse_batch(4, [-2]))
+        .make_initializable_iterator())
     init_op = iterator.initializer
 
     with self.test_session() as sess:
@@ -297,8 +376,10 @@ class BatchDatasetTest(test.TestCase):
 
   def testDenseToSparseBatchDatasetShapeErrors(self):
     input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = (dataset_ops.Dataset.from_tensors(input_tensor).apply(
-        batching.dense_to_sparse_batch(4, [12])).make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensors(input_tensor).apply(
+            batching.dense_to_sparse_batch(4,
+                                           [12])).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = sparse_tensor.SparseTensor(*iterator.get_next())
 
@@ -355,8 +436,7 @@ class BatchDatasetTest(test.TestCase):
 
   def testUnbatchMultiElementTupleDataset(self):
     data = tuple([(math_ops.range(10 * i, 10 * i + 10),
-                   array_ops.fill([10], "hi"))
-                  for i in range(3)])
+                   array_ops.fill([10], "hi")) for i in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
     expected_types = ((dtypes.int32, dtypes.string),) * 3
     data = data.batch(2)
@@ -369,9 +449,7 @@ class BatchDatasetTest(test.TestCase):
 
     with self.test_session() as sess:
       for i in range(10):
-        self.assertEqual(((i, b"hi"),
-                          (10 + i, b"hi"),
-                          (20 + i, b"hi")),
+        self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
                          sess.run(op))
 
       with self.assertRaises(errors.OutOfRangeError):
@@ -384,9 +462,10 @@ class BatchDatasetTest(test.TestCase):
 
     batch_size = array_ops.placeholder(dtypes.int64, shape=[])
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
-        batching.batch_and_drop_remainder(batch_size))
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).apply(
+            batching.batch_and_drop_remainder(batch_size))
+        .make_initializable_iterator())
 
     next_element = iterator.get_next()
 
@@ -403,14 +482,85 @@ class BatchDatasetTest(test.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(next_element)
 
+  def testBatchAndDropRemainderSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(12).map(_sparse).apply(
+        batching.batch_and_drop_remainder(5)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testPaddedBatchAndDropRemainder(self):
+    els = []
+    for length in [3, 6, 9, 4, 12, 10, 2]:
+      els.append((np.array(length), np.arange(length) + 1,
+                  np.array(length * 2)))
+
+    dataset = dataset_ops.Dataset.from_tensors(els[0])
+    for el in els[1:]:
+      dataset = dataset.concatenate(dataset_ops.Dataset.from_tensors(el))
+
+    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = (
+        dataset.apply(
+            batching.padded_batch_and_drop_remainder(
+                batch_size, ([], [None], []))).make_initializable_iterator())
+
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for test_batch_size in [1, 3, 7, 10]:
+        sess.run(iterator.initializer, feed_dict={batch_size: test_batch_size})
+        num_batches = 7 // test_batch_size
+        for i in range(num_batches):
+          result = sess.run(next_element)
+          for component_idx, result_component in enumerate(result):
+            for j in range(test_batch_size):
+              data_idx = i * test_batch_size + j
+              comp = result_component[j]
+              unpadded = comp[comp > 0]
+              if np.isscalar(comp):
+                # The boolean mask indexing above adds a dim back. Rm it.
+                unpadded = unpadded[0]
+              self.assertAllEqual(els[data_idx][component_idx], unpadded)
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(next_element)
+
+  def testPaddedBatchAndDropRemainderSparseError(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    with self.assertRaises(TypeError):
+      _ = dataset_ops.Dataset.range(10).map(_map_fn).apply(
+          batching.padded_batch_and_drop_remainder(5))
+
   def testBatchAndDropRemainderShapeInference(self):
-    components = (array_ops.placeholder(dtypes.int32), (array_ops.placeholder(
-        dtypes.int32, shape=[None]), array_ops.placeholder(
-            dtypes.int32, shape=[20, 30])))
+    components = (array_ops.placeholder(dtypes.int32),
+                  (array_ops.placeholder(dtypes.int32, shape=[None]),
+                   array_ops.placeholder(dtypes.int32, shape=[20, 30])))
 
     # Test with a statically known batch size.
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
-        batching.batch_and_drop_remainder(128)))
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).apply(
+            batching.batch_and_drop_remainder(128)))
 
     self.assertIs(None, dataset.output_shapes[0].ndims)
     self.assertEqual([128], dataset.output_shapes[1][0].as_list())
@@ -419,14 +569,15 @@ class BatchDatasetTest(test.TestCase):
     # Test with a dynamic batch size: the static shape will be unknown, because
     # `batch_size` is a placeholder.
     batch_size = array_ops.placeholder(dtypes.int64)
-    dataset = (dataset_ops.Dataset.from_tensor_slices(components).apply(
-        batching.batch_and_drop_remainder(batch_size)))
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).apply(
+            batching.batch_and_drop_remainder(batch_size)))
 
     self.assertIs(None, dataset.output_shapes[0].ndims)
     self.assertEqual([None], dataset.output_shapes[1][0].as_list())
     self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list())
 
-  def testBatchAndMapDataset(self):
+  def _testBatchAndMapDatasetHelper(self, num_parallel_batches=1):
     """Test a dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset ->
     # RepeatDataset(count) -> BatchAndMapDataset(square_3, batch_size).
@@ -440,9 +591,13 @@ class BatchDatasetTest(test.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).repeat(count)
-                .apply(batching.map_and_batch(_map_fn, batch_size))
-                .make_initializable_iterator())
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
+            batching.map_and_batch(
+                map_func=_map_fn,
+                batch_size=batch_size,
+                num_parallel_batches=num_parallel_batches))
+        .make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -458,7 +613,7 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
-            self.assertAllEqual(component[(i*14 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
                                 result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
@@ -473,7 +628,7 @@ class BatchDatasetTest(test.TestCase):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
-            self.assertAllEqual(component[(i*8 + j) % 7]**2,
+            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
       # The last batch should fail with `OutOfRange`.
       with self.assertRaises(errors.OutOfRangeError):
@@ -488,14 +643,49 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
+  def testBatchAndMapDataset(self):
+    return self._testBatchAndMapDatasetHelper()
+
+  def testBatchAndMapDatasetWithParallelBatching(self):
+    # TODO(b/70299909): This test surfaces a bug in the `map_and_batch`
+    # transformation, which manifests as premature EOF. Fix it.
+    #
+    # return self._testBatchAndMapDatasetHelper(num_parallel_batches=10)
+    pass
+
+  def testMapAndBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).apply(
+        batching.map_and_batch(_sparse, 5)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testBatchAndMapDatasetFails(self):
     """Test a dataset that maps a TF function across its input elements."""
     dataset = dataset_ops.Dataset.from_tensors(
         array_ops.check_numerics(
             constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
     batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-                .make_initializable_iterator())
+    iterator = (
+        dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
+        .make_initializable_iterator())
     init_op = iterator.initializer
     with self.test_session() as sess:
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
@@ -503,6 +693,7 @@ class BatchDatasetTest(test.TestCase):
 
   def testBatchAndMapDatasetShapeMismatch(self):
     """Test a dataset that maps a TF function across its input elements."""
+
     def generator():
       yield [1]
       yield [2]
@@ -523,5 +714,63 @@ class BatchDatasetTest(test.TestCase):
                                    "number of elements does not match"):
         sess.run(get_next)
 
+
+class BatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def build_dataset(self, multiplier=15.0, tensor_slice_len=2, batch_size=2):
+    components = (
+        np.arange(tensor_slice_len),
+        np.array([[1, 2, 3]]) * np.arange(tensor_slice_len)[:, np.newaxis],
+        np.array(multiplier) * np.arange(tensor_slice_len))
+
+    return dataset_ops.Dataset.from_tensor_slices(components).batch(batch_size)
+
+  def testCore(self):
+    tensor_slice_len = 8
+    batch_size = 2
+    num_outputs = tensor_slice_len // batch_size
+    self.run_core_tests(
+        lambda: self.build_dataset(15.0, tensor_slice_len, batch_size),
+        lambda: self.build_dataset(20.0, tensor_slice_len, batch_size),
+        num_outputs)
+
+
+class PaddedBatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testPaddedBatch(self):
+
+    def build_dataset(seq_lens):
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          lambda x: array_ops.fill([x], x)).padded_batch(
+              4, padded_shapes=[-1])
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+  def testPaddedBatchNonDefaultPadding(self):
+
+    def build_dataset(seq_lens):
+
+      def fill_tuple(x):
+        filled = array_ops.fill([x], x)
+        return (filled, string_ops.as_string(filled))
+
+      padded_shape = [-1]
+      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+          fill_tuple).padded_batch(
+              4,
+              padded_shapes=(padded_shape, padded_shape),
+              padding_values=(-1, "<end>"))
+
+    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
+    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
+    self.run_core_tests(lambda: build_dataset(seq_lens1),
+                        lambda: build_dataset(seq_lens2), 8)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
index c3d6bfc097798530008f186cce68906b6af8fe47..55a1d3b95b212466b262ad3c26f1efd7ed0e067e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_constructor_op_test.py
@@ -17,14 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import threading
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import dataset_ops
-from tensorflow.contrib.data.python.ops import iterator_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.util import nest
@@ -32,16 +31,16 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
 
 
 class DatasetConstructorTest(test.TestCase):
 
-  def testTensorDataset(self):
+  def testFromTensors(self):
     """Test an dataset that represents a single tuple of tensors."""
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
 
@@ -61,7 +60,75 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDataset(self):
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testFromTensorsSparse(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0]),
+        dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorsMixed(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0]]),
+                      values=np.array([0]),
+                      dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape)
+        if sparse_tensor.is_sparse(c) else c.shape for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        if sparse_tensor.is_sparse(component):
+          self.assertSparseValuesEqual(component, result_component)
+        else:
+          self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlices(self):
     """Test an dataset that represents the slices from a tuple of tensors."""
     components = (
         np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
@@ -86,7 +153,127 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDatasetWithDict(self):
+  def testFromTensorSlicesSparse(self):
+    """Test an dataset that represents the slices from a tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(expected[i], results):
+          self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesMixed(self):
+    """Test an dataset that represents the slices from a tuple of tensors."""
+    components = (np.tile(np.array([[1], [2], [3]]), 20),
+                  np.tile(np.array([[12], [13], [14]]), 22),
+                  np.array([37.0, 38.0, 39.0]),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
+                      values=np.array([0, 0, 0]),
+                      dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape[1:])
+        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(
+            (zip(*components[:3])[i] + expected[i]), results):
+          if sparse_tensor.is_sparse(component):
+            self.assertSparseValuesEqual(component, result_component)
+          else:
+            self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesWithDict(self):
     components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
     iterator = (dataset_ops.Dataset.from_tensor_slices(components)
                 .make_initializable_iterator())
@@ -107,7 +294,7 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testSparseTensorSliceDataset(self):
+  def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
     iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
@@ -574,135 +761,63 @@ class DatasetConstructorTest(test.TestCase):
         new = batching._RestructuredDataset(dataset, new_types, new_shape_lists)
         # pylint: enable=protected-access
 
-  def _iterator_checkpoint_prefix(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
 
-  def _testSaveRestoreFromTensorsUtility(self, start, break_range, stop):
-    path = self._iterator_checkpoint_prefix()
-    step = 0
-    meta_filename = path + "-%d.meta" % step
+class DatasetConstructorSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+  def _build_tensor_dataset(self, variable_array):
+    components = (variable_array, np.array([1, 2, 3]), np.array(37.0))
 
-    with ops.Graph().as_default() as g:
-      iterator = (
-          dataset_ops.Dataset.from_tensors(components)
-          .make_initializable_iterator())
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      saveable = iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-      for t in nest.flatten(get_next):
-        ops.add_to_collection("get_next", t)
-      saver = saver_lib.Saver()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(start, break_range):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component, result_component)
-        saver.save(sess, path, step)
-
-    with ops.Graph().as_default() as g:
-      saver = saver_lib.import_meta_graph(meta_filename)
-      with self.test_session(graph=g) as sess:
-        get_next = nest.pack_sequence_as(("a", "b", "c"),
-                                         ops.get_collection("get_next"))
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for _ in range(break_range, stop):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    return dataset_ops.Dataset.from_tensors(components)
 
-  def testRestoreFromTensors(self):
-    self._testSaveRestoreFromTensorsUtility(0, 0, 1)
+  def testFromTensorsCore(self):
+    # Equal length components
+    arr = np.array(1)
+    num_outputs = 1
+    diff_arr = np.array(2)
+    self.run_core_tests(lambda: self._build_tensor_dataset(arr),
+                        lambda: self._build_tensor_dataset(diff_arr),
+                        num_outputs)
 
-  def testRestoreExhuatedIteratorFromTensors(self):
-    self._testSaveRestoreFromTensorsUtility(0, 1, 1)
+  def _build_tensor_slices_dataset(self, components):
+    return dataset_ops.Dataset.from_tensor_slices(components)
 
-  def _build_graph_tensor_slices(self, components):
-    iterator = dataset_ops.Dataset.from_tensor_slices(
-        components).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    saveable = iterator_ops.make_saveable_from_iterator(iterator)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    for t in nest.flatten(get_next):
-      ops.add_to_collection("get_next", t)
-    return init_op, get_next
-
-  def _testSaveRestoreFromTensorSlicesUtility(self, start, break_range, stop):
-    path = self._iterator_checkpoint_prefix()
-    step = 0
-    meta_filename = path + "-%d.meta" % step
-
-    components = (np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-        np.array([[12], [13], [14], [15]]), 22),
+  def testFromTensorSlicesCore(self):
+    # Equal length components
+    components = (np.tile(np.array([[1], [2], [3], [4]]), 20),
+                  np.tile(np.array([[12], [13], [14], [15]]), 22),
                   np.array([37.0, 38.0, 39.0, 40.0]))
 
-    with ops.Graph().as_default() as g:
-      init_op, get_next = self._build_graph_tensor_slices(components)
-      saver = saver_lib.Saver()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for i in range(start, break_range):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i], result_component)
-        saver.save(sess, path, step)
-
-    with ops.Graph().as_default() as g:
-      saver = saver_lib.import_meta_graph(meta_filename)
-      with self.test_session(graph=g) as sess:
-        get_next = nest.pack_sequence_as(("a", "b", "c"),
-                                         ops.get_collection("get_next"))
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for i in range(break_range, stop):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i], result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testRestoreFromTensorSlices(self):
-    self._testSaveRestoreFromTensorSlicesUtility(0, 4, 2)
-
-  def testRestoreExhaustedIteratorFromTensorSlices(self):
-    self._testSaveRestoreFromTensorSlicesUtility(0, 4, 4)
-
-  def tesRestoreFromTensorSlicesWithDict(self):
-
-    path = self._iterator_checkpoint_prefix()
-    step = 0
-    meta_filename = path + "-%d.meta" % step
-
-    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next = self._build_graph_tensor_slices(components)
-      saver = saver_lib.Saver()
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for i in range(2):
-          results = sess.run(get_next)
-          self.assertEqual(components["foo"][i], results["foo"])
-          self.assertEqual(components["bar"][i], results["bar"])
-        saver.save(sess, path, step)
-
-    with ops.Graph().as_default() as g:
-      saver = saver_lib.import_meta_graph(meta_filename)
-      with self.test_session(graph=g) as sess:
-        get_next = nest.pack_sequence_as(("a", "b"),
-                                         ops.get_collection("get_next"))
-        saver.restore(sess, saver_lib.latest_checkpoint(self.get_temp_dir()))
-        for i in range(2, 3):
-          results = sess.run(get_next)
-          self.assertEqual(components["foo"][i], results["foo"])
-          self.assertEqual(components["bar"][i], results["bar"])
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    diff_comp = (np.tile(np.array([[1], [2], [3], [4]]), 20),
+                 np.tile(np.array([[5], [6], [7], [8]]), 22),
+                 np.array([1.0, 2.0, 3.0, 4.0]))
+
+    dict_components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
+
+    self.run_core_tests(lambda: self._build_tensor_slices_dataset(components),
+                        lambda: self._build_tensor_slices_dataset(diff_comp), 4)
+    self.run_core_tests(
+        lambda: self._build_tensor_slices_dataset(dict_components), None, 3)
+
+  def _build_sparse_tensor_slice_dataset(self, slices):
+    indices = np.array(
+        [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))],
+        dtype=np.int64)
+    values = np.array([val for s in slices for val in s], dtype=np.float64)
+    dense_shape = np.array(
+        [len(slices), max(len(s) for s in slices) + 1], dtype=np.int64)
+    sparse_components = sparse_tensor.SparseTensor(indices, values, dense_shape)
+    return dataset_ops.Dataset.from_sparse_tensor_slices(sparse_components)
+
+  def testFromSparseTensorSlicesCore(self):
+    slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
+    diff_slices = [[1., 2.], [2.], [2., 3., 4.], [], [], []]
+
+    self.run_core_tests(
+        lambda: self._build_sparse_tensor_slice_dataset(slices),
+        lambda: self._build_sparse_tensor_slice_dataset(diff_slices),
+        9,
+        sparse_tensors=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf25cc60a1c0efc09bed6501fd2d6f4ccb07764b
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -0,0 +1,633 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for testing serializable datasets."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.util import nest
+
+
+class DatasetSerializationTestBase(test.TestCase):
+  """Base class for testing serializable datasets."""
+
+  def tearDown(self):
+    self._delete_ckpt()
+
+  def run_core_tests(self, ds_fn1, ds_fn2, num_outputs, sparse_tensors=False):
+    """Runs the core tests.
+
+    Args:
+      ds_fn1: 0-argument function that returns a Dataset.
+      ds_fn2: 0-argument function that returns a Dataset different from
+        ds_fn1. If None, verify_restore_in_modified_graph test is not run.
+      num_outputs: Total number of outputs expected from this Dataset.
+      sparse_tensors: Whether dataset is built from SparseTensor(s).
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_unused_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_fully_used_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_exhausted_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_init_before_restore(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_multiple_breaks(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_reset_restored_iterator(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    self.verify_restore_in_empty_graph(
+        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+    if ds_fn2:
+      self.verify_restore_in_modified_graph(
+          ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
+
+  def verify_unused_iterator(self,
+                             ds_fn,
+                             num_outputs,
+                             sparse_tensors=False,
+                             verify_exhausted=True):
+    """Verifies that saving and restoring an unused iterator works.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn, [0],
+        num_outputs,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+  def verify_fully_used_iterator(self, ds_fn, num_outputs,
+                                 sparse_tensors=False):
+    """Verifies that saving and restoring a fully used iterator works.
+
+    Note that this only checks saving and restoring an iterator from which
+    `num_outputs` items have been produced but does not check for an
+    exhausted iterator, i.e., one from which an OutOfRange error has been
+    returned.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn, [num_outputs], num_outputs, sparse_tensors=sparse_tensors)
+
+  def verify_exhausted_iterator(self, ds_fn, num_outputs, sparse_tensors=False):
+    """Verifies that saving and restoring an exhausted iterator works.
+
+    An exhausted iterator is one which has returned an OutOfRange error.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.gen_outputs(
+        ds_fn, [],
+        num_outputs,
+        verify_exhausted=True,
+        sparse_tensors=sparse_tensors)
+    actual = self.gen_outputs(
+        ds_fn, [],
+        0,
+        ckpt_saved=True,
+        verify_exhausted=True,
+        sparse_tensors=sparse_tensors)
+    self.assertEqual(len(actual), 0)
+
+  def verify_init_before_restore(self,
+                                 ds_fn,
+                                 num_outputs,
+                                 sparse_tensors=False,
+                                 verify_exhausted=True):
+    """Verifies that restoring into an already initilized iterator works.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn,
+        self.gen_break_points(num_outputs),
+        num_outputs,
+        init_before_restore=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+  def verify_multiple_breaks(self,
+                             ds_fn,
+                             num_outputs,
+                             num_breaks=10,
+                             sparse_tensors=False,
+                             verify_exhausted=True):
+    """Attempts to save/restore at multiple break points.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      num_breaks: The number of break points. These are uniformly spread in
+        [0, num_outputs] both inclusive.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    self.verify_run_with_breaks(
+        ds_fn,
+        self.gen_break_points(num_outputs, num_breaks),
+        num_outputs,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+  def verify_reset_restored_iterator(self,
+                                     ds_fn,
+                                     num_outputs,
+                                     break_point=None,
+                                     sparse_tensors=False,
+                                     verify_exhausted=True):
+    """Attempts to re-initialize a restored iterator.
+
+    This is useful when restoring a training checkpoint during validation.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Collect ground truth containing all outputs.
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Skip some items and save checkpoint.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Restore from checkpoint and then run init_op.
+    with ops.Graph().as_default() as g:
+      saver = self._import_meta_graph()
+      init_op, get_next_op = self._get_iterator_ops_from_collection(
+          ds_fn, sparse_tensors=sparse_tensors)
+      with self.test_session(graph=g) as sess:
+        self._restore(saver, sess)
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for _ in range(num_outputs):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+    self.match(expected, actual)
+
+  def verify_restore_in_modified_graph(self,
+                                       ds_fn1,
+                                       ds_fn2,
+                                       num_outputs,
+                                       break_point=None,
+                                       sparse_tensors=False,
+                                       verify_exhausted=True):
+    """Attempts to restore an iterator in a modified graph.
+
+    Builds an input pipeline using ds_fn1, runs it for `break_point` steps
+    and saves a checkpoint. Then builds a new graph using ds_fn2, restores
+    the checkpoint from ds_fn1 and verifies that the restore is successful.
+
+    Args:
+      ds_fn1: See `run_core_tests`.
+      ds_fn2: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Skip `break_point` items and store the remaining produced from ds_fn1
+    # in `expected`.
+    self.gen_outputs(
+        ds_fn1, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+    expected = self.gen_outputs(
+        ds_fn1, [],
+        num_outputs - break_point,
+        ckpt_saved=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Generate `break_point` items from ds_fn1 and save checkpoint.
+    self.gen_outputs(
+        ds_fn1, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Build graph for ds_fn2 but load checkpoint for ds_fn1.
+    with ops.Graph().as_default() as g:
+      _, get_next_op, saver = self._build_graph(
+          ds_fn2, sparse_tensors=sparse_tensors)
+      with self.test_session(graph=g) as sess:
+        self._restore(saver, sess)
+        for _ in range(num_outputs - break_point):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+    self.match(expected, actual)
+
+  def verify_restore_in_empty_graph(self,
+                                    ds_fn,
+                                    num_outputs,
+                                    break_point=None,
+                                    sparse_tensors=False,
+                                    verify_exhausted=True):
+    """Attempts to restore an iterator in an empty graph.
+
+    Builds an input pipeline using ds_fn, runs it for `break_point` steps
+    and saves a checkpoint. Then builds a new empty graph, restores
+    the checkpoint from ds_fn and verifies that the restore is successful.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    break_point = num_outputs // 2 if not break_point else break_point
+
+    # Skip `break_point` items and store the remaining produced from ds_fn
+    # in `expected`.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs - break_point,
+        ckpt_saved=True,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    # Generate `break_point` items from ds_fn and save checkpoint.
+    self.gen_outputs(
+        ds_fn, [],
+        break_point,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=False)
+
+    actual = []
+    # Build an empty graph but load checkpoint for ds_fn.
+    with ops.Graph().as_default() as g:
+      get_next_op, saver = self._build_empty_graph(
+          ds_fn, sparse_tensors=sparse_tensors)
+      with self.test_session(graph=g) as sess:
+        self._restore(saver, sess)
+        for _ in range(num_outputs - break_point):
+          actual.append(sess.run(get_next_op))
+        if verify_exhausted:
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+    self.match(expected, actual)
+
+  def verify_error_on_save(self,
+                           ds_fn,
+                           num_outputs,
+                           error,
+                           break_point=None,
+                           sparse_tensors=False):
+    """Attempts to save a non-saveable iterator.
+
+    Args:
+      ds_fn: See `run_core_tests`.
+      num_outputs: See `run_core_tests`.
+      error: Declared error when trying to save iterator.
+      break_point: Break point. Optional. Defaults to num_outputs/2.
+      sparse_tensors: See `run_core_tests`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+
+    break_point = num_outputs // 2 if not break_point else break_point
+    with ops.Graph().as_default() as g:
+      init_op, get_next_op, saver = self._build_graph(
+          ds_fn, sparse_tensors=sparse_tensors)
+      with self.test_session(graph=g) as sess:
+        sess.run(variables.global_variables_initializer())
+        sess.run(init_op)
+        for _ in range(break_point):
+          sess.run(get_next_op)
+        with self.assertRaises(error):
+          self._save(sess, saver)
+
+  def verify_run_with_breaks(self,
+                             ds_fn,
+                             break_points,
+                             num_outputs,
+                             init_before_restore=False,
+                             sparse_tensors=False,
+                             verify_exhausted=True):
+    """Verifies that ds_fn() produces the same outputs with and without breaks.
+
+    1. Builds a Dataset using `ds_fn` and produces `num_outputs` items from it
+       *without* stopping at break points.
+    2. Builds a Dataset using `ds_fn` and produces `num_outputs` items from it
+       with stopping at break points.
+
+    Deep matches outputs from 1 and 2.
+
+    Args:
+      ds_fn: See `gen_outputs`.
+      break_points: See `gen_outputs`.
+      num_outputs: See `gen_outputs`.
+      init_before_restore: See `gen_outputs`.
+      sparse_tensors: See `run_core_tests`.
+      verify_exhausted: See `gen_outputs`.
+
+    Raises:
+      AssertionError if any test fails.
+    """
+    expected = self.gen_outputs(
+        ds_fn, [],
+        num_outputs,
+        init_before_restore=init_before_restore,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    actual = self.gen_outputs(
+        ds_fn,
+        break_points,
+        num_outputs,
+        init_before_restore=init_before_restore,
+        sparse_tensors=sparse_tensors,
+        verify_exhausted=verify_exhausted)
+
+    self.match(expected, actual)
+
+  def gen_outputs(self,
+                  ds_fn,
+                  break_points,
+                  num_outputs,
+                  ckpt_saved=False,
+                  init_before_restore=False,
+                  sparse_tensors=False,
+                  verify_exhausted=True):
+    """Generates elements from input dataset while stopping at break points.
+
+    Produces `num_outputs` outputs and saves the state of the iterator in the
+    Saver checkpoint.
+
+    Args:
+      ds_fn: 0-argument function that returns the dataset.
+      break_points: A list of integers. For each `break_point` in
+        `break_points`, we produce outputs till `break_point` number of items
+        have been produced and then checkpoint the state. The current graph
+        and session are destroyed and a new graph and session are used to
+        produce outputs till next checkpoint or till `num_outputs` elements
+        have been produced. `break_point` must be <= `num_outputs`.
+      num_outputs: The total number of outputs to produce from the iterator.
+      ckpt_saved: Whether a checkpoint already exists. If False, we build the
+        graph from ds_fn.
+      init_before_restore: Whether init should be called before saver.restore.
+        This is just so that we can verify that restoring an already initialized
+        iterator works.
+      sparse_tensors:  Whether dataset is built from SparseTensor(s).
+      verify_exhausted: Whether to verify that the iterator has been exhausted
+        after producing `num_outputs` elements.
+
+    Returns:
+      A list of `num_outputs` items.
+    """
+    outputs = []
+
+    def get_ops():
+      if ckpt_saved:
+        saver = self._import_meta_graph()
+        init_op, get_next_op = self._get_iterator_ops_from_collection(
+            ds_fn, sparse_tensors=sparse_tensors)
+      else:
+        init_op, get_next_op, saver = self._build_graph(
+            ds_fn, sparse_tensors=sparse_tensors)
+      return init_op, get_next_op, saver
+
+    for i in range(len(break_points) + 1):
+      with ops.Graph().as_default() as g:
+        init_op, get_next_op, saver = get_ops()
+        with self.test_session(graph=g) as sess:
+          if ckpt_saved:
+            if init_before_restore:
+              sess.run(variables.global_variables_initializer())
+              sess.run(init_op)
+            self._restore(saver, sess)
+          else:
+            sess.run(variables.global_variables_initializer())
+            sess.run(init_op)
+          start = break_points[i - 1] if i > 0 else 0
+          end = break_points[i] if i < len(break_points) else num_outputs
+          num_iters = end - start
+          for _ in range(num_iters):
+            outputs.append(sess.run(get_next_op))
+          if i == len(break_points) and verify_exhausted:
+            with self.assertRaises(errors.OutOfRangeError):
+              sess.run(get_next_op)
+          self._save(sess, saver)
+          ckpt_saved = True
+
+    return outputs
+
+  def match(self, expected, actual):
+    """Matches nested structures.
+
+    Recursively matches shape and values of `expected` and `actual`.
+    Handles scalars, numpy arrays and other python sequence containers
+    e.g. list, dict.
+
+    Args:
+      expected: Nested structure 1.
+      actual: Nested structure 2.
+
+    Raises:
+      AssertionError if matching fails.
+    """
+    if isinstance(expected, np.ndarray):
+      expected = expected.tolist()
+    if isinstance(actual, np.ndarray):
+      actual = actual.tolist()
+    self.assertEqual(type(expected), type(actual))
+
+    if nest.is_sequence(expected):
+      self.assertEqual(len(expected), len(actual))
+      if isinstance(expected, dict):
+        for key1, key2 in zip(sorted(expected), sorted(actual)):
+          self.assertEqual(key1, key2)
+          self.match(expected[key1], actual[key2])
+      else:
+        for item1, item2 in zip(expected, actual):
+          self.match(item1, item2)
+    else:
+      self.assertEqual(expected, actual)
+
+  def does_not_match(self, expected, actual):
+    with self.assertRaises(AssertionError):
+      self.match(expected, actual)
+
+  def gen_break_points(self, num_outputs, num_samples=10):
+    """Generates `num_samples` breaks points in [0, num_outputs]."""
+    return np.linspace(0, num_outputs, num_samples, dtype=int)
+
+  def _build_graph(self, ds_fn, sparse_tensors=False):
+    iterator = ds_fn().make_initializable_iterator()
+
+    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    init_op = iterator.initializer
+    if sparse_tensors:
+      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    else:
+      get_next = iterator.get_next()
+    self._add_iterator_ops_to_collection(init_op, get_next, sparse_tensors)
+    saver = saver_lib.Saver(allow_empty=True)
+    return init_op, get_next, saver
+
+  def _build_empty_graph(self, ds_fn, sparse_tensors=False):
+    iterator = iterator_ops.Iterator.from_structure(
+        self._get_output_types(ds_fn), self._get_output_shapes(ds_fn))
+    saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    if sparse_tensors:
+      get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+    else:
+      get_next = iterator.get_next()
+    saver = saver_lib.Saver(allow_empty=True)
+    return get_next, saver
+
+  def _add_iterator_ops_to_collection(self,
+                                      init_op,
+                                      get_next,
+                                      sparse_tensors=False):
+    ops.add_to_collection("iterator_ops", init_op)
+    # `get_next` may be a tuple e.g. in TensorSliceDataset. Since Collections
+    # do not support tuples we flatten the tensors and restore the shape in
+    # `_get_iterator_ops_from_collection`.
+    if sparse_tensors:
+      ops.add_to_collection("iterator_ops", get_next.indices)
+      ops.add_to_collection("iterator_ops", get_next.values)
+      ops.add_to_collection("iterator_ops", get_next.dense_shape)
+    else:
+      for el in nest.flatten(get_next):
+        ops.add_to_collection("iterator_ops", el)
+
+  def _get_iterator_ops_from_collection(self, ds_fn, sparse_tensors=False):
+    all_ops = ops.get_collection("iterator_ops")
+    if sparse_tensors:
+      init_op, indices, values, dense_shape = all_ops
+      return init_op, sparse_tensor.SparseTensor(indices, values, dense_shape)
+    else:
+      return all_ops[0], nest.pack_sequence_as(
+          self._get_output_types(ds_fn), all_ops[1:])
+
+  def _get_output_types(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_types
+
+  def _get_output_shapes(self, ds_fn):
+    with ops.Graph().as_default():
+      return ds_fn().output_shapes
+
+  def _ckpt_path(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def _latest_ckpt(self):
+    return saver_lib.latest_checkpoint(self.get_temp_dir())
+
+  def _save(self, sess, saver):
+    saver.save(sess, self._ckpt_path())
+
+  def _restore(self, saver, sess):
+    saver.restore(sess, self._latest_ckpt())
+
+  def _import_meta_graph(self):
+    meta_file_path = self._ckpt_path() + ".meta"
+    return saver_lib.import_meta_graph(meta_file_path)
+
+  def _delete_ckpt(self):
+    # Remove all checkpoint files.
+    prefix = self._ckpt_path()
+    pattern = prefix + "*"
+    files = gfile.Glob(pattern)
+    map(gfile.Remove, files)
diff --git a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
index 00323da3110bb7f32b589f72e4e867f9c71e92ee..5921be2ae89ba1bbbb8d6e3a509cf49c65949544 100644
--- a/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/filter_dataset_op_test.py
@@ -19,9 +19,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
@@ -124,6 +126,74 @@ class FilterDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
+            lambda x, i: x).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(5):
+        actual = sess.run(get_next)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+class FilterDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_filter_range_graph(self, div):
+    return dataset_ops.Dataset.range(100).filter(
+        lambda x: math_ops.not_equal(math_ops.mod(x, div), 2))
+
+  def testFilterCore(self):
+    div = 3
+    num_outputs = np.sum([x % 3 is not 2 for x in range(100)])
+    self.run_core_tests(lambda: self._build_filter_range_graph(div),
+                        lambda: self._build_filter_range_graph(div * 2),
+                        num_outputs)
+
+  def _build_filter_dict_graph(self):
+    return dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
+            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
+                lambda d: d["foo"] + d["bar"])
+
+  def testFilterDictCore(self):
+    num_outputs = np.sum([(x**2) % 2 == 0 for x in range(10)])
+    self.run_core_tests(self._build_filter_dict_graph, None, num_outputs)
+
+  def _build_sparse_filter(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensor(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    return dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
+        lambda x, i: x)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
index 2a582ae6620ac8276d290c7b995588640e36929c..d4fbaa5cdcdd315aa0524134b48eb0515169722c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/flat_map_dataset_op_test.py
@@ -17,16 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
 import random
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import function
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -123,154 +129,101 @@ class FlatMapDatasetTest(test.TestCase):
         sess.run(get_next)
   # pylint: enable=g-long-lambda
 
+  def testSparse(self):
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
 
-class InterleaveDatasetTest(test.TestCase):
-
-  def _interleave(self, lists, cycle_length, block_length):
-    num_open = 0
-
-    # `all_iterators` acts as a queue of iterators over each element of `lists`.
-    all_iterators = [iter(l) for l in lists]
-
-    # `open_iterators` are the iterators whose elements are currently being
-    # interleaved.
-    open_iterators = []
-    for i in range(cycle_length):
-      if all_iterators:
-        open_iterators.append(all_iterators.pop(0))
-        num_open += 1
-      else:
-        open_iterators.append(None)
-
-    while num_open or all_iterators:
-      for i in range(cycle_length):
-        if open_iterators[i] is None:
-          if all_iterators:
-            open_iterators[i] = all_iterators.pop(0)
-            num_open += 1
-          else:
-            continue
-        for _ in range(block_length):
-          try:
-            yield next(open_iterators[i])
-          except StopIteration:
-            open_iterators[i] = None
-            num_open -= 1
-            break
-
-  def testPythonImplementation(self):
-    input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
-                   [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
-
-    # Cycle length 1 acts like `Dataset.flat_map()`.
-    expected_elements = itertools.chain(*input_lists)
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 1, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1.
-    expected_elements = [4, 5, 4, 5, 4, 5, 4,
-                         5, 5, 6, 6,  # NOTE(mrry): When we cycle back
-                                      # to a list and are already at
-                                      # the end of that list, we move
-                                      # on to the next element.
-                         4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 1)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > 1 and block length > 1.
-    expected_elements = [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6,
-                         4, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 2, 3)):
-      self.assertEqual(expected, produced)
-
-    # Cycle length > len(input_values).
-    expected_elements = [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6,
-                         4, 4, 5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-    for expected, produced in zip(
-        expected_elements, self._interleave(input_lists, 7, 2)):
-      self.assertEqual(expected, produced)
-
-  def testInterleaveDataset(self):
-    input_values = array_ops.placeholder(dtypes.int64, shape=[None])
-    cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
-    block_length = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_count = 2
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_values)
-        .repeat(repeat_count)
-        .interleave(lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
-                    cycle_length, block_length))
-    iterator = dataset.make_initializable_iterator()
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+        .make_initializable_iterator())
     init_op = iterator.initializer
-    next_element = iterator.get_next()
+    get_next = iterator.get_next()
 
     with self.test_session() as sess:
-      # Cycle length 1 acts like `Dataset.flat_map()`.
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 1, block_length: 3})
-
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 1, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-
-      # Cycle length > 1.
-      # expected: [4, 5, 4, 5, 4, 5, 4, 5, 5, 6, 6, 4, 6, 4, 6, 4, 6, 4, 6, 5,
-      #            6, 5, 6, 5, 6, 5, 6, 5]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 2, block_length: 1})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 1):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Cycle length > 1 and block length > 1.
-      # expected: [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 5,
-      #            5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 2, block_length: 3})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Cycle length > len(input_values) * repeat_count.
-      # expected: [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4,
-      #            5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
-      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
-                                   cycle_length: 7, block_length: 2})
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 7, 2):
-        self.assertEqual(expected_element, sess.run(next_element))
+      sess.run(init_op)
+      for i in range(10):
+        for j in range(2):
+          expected = [i, 0] if j % 2 == 0 else [0, -i]
+          self.assertAllEqual(expected, sess.run(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        sess.run(get_next)
 
-      # Empty input.
-      sess.run(init_op, feed_dict={input_values: [],
-                                   cycle_length: 2, block_length: 3})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
 
-      # Non-empty input leading to empty output.
-      sess.run(init_op, feed_dict={input_values: [0, 0, 0],
-                                   cycle_length: 2, block_length: 3})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-      # Mixture of non-empty and empty interleaved datasets.
-      sess.run(init_op, feed_dict={input_values: [4, 0, 6],
-                                   cycle_length: 2, block_length: 3})
-      for expected_element in self._interleave(
-          [[4] * 4, [], [6] * 6] * repeat_count, 2, 3):
-        self.assertEqual(expected_element, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+class FlatMapDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+    # Complicated way of saying range(start, start+25).
+    def build_ds(start):
+
+      def map_fn(x):
+        return dataset_ops.Dataset.range(x, x + 5)
+
+      return dataset_ops.Dataset.range(start, start + 5 * 5, 5).flat_map(map_fn)
+
+    self.run_core_tests(lambda: build_ds(0), lambda: build_ds(10), 25)
+
+  def testMapThenFlatMap(self):
+
+    def build_ds():
+
+      def flat_map_fn(_):
+
+        def map_fn(y):
+          return 10 * math_ops.to_int32(y)
+
+        return dataset_ops.Dataset.range(100).map(map_fn)
+
+      return dataset_ops.Dataset.range(5).flat_map(flat_map_fn)
+
+    self.run_core_tests(build_ds, None, 500)
+
+  def testCaptureDefunInMapFn(self):
+
+    def build_ds():
+
+      def map_fn(x):
+
+        @function.Defun(dtypes.int64)
+        def defun_fn(x):
+          return constant_op.constant(1000) + math_ops.to_int32(x)
+
+        return dataset_ops.Dataset.from_tensor_slices([defun_fn(x)])
+
+      return dataset_ops.Dataset.range(100).flat_map(map_fn)
+
+    self.run_core_tests(build_ds, None, 100)
+
+  def testDisallowVariableCapture(self):
+
+    def build_ds():
+      test_var = variable_scope.get_variable(
+          name="test_var", shape=(), use_resource=True)
+      return dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensor_slices([test_var]))
+
+    self.verify_error_on_save(build_ds, 5, errors.InvalidArgumentError)
+
+  def testDisallowCapturingStatefulOps(self):
+
+    def build_ds():
+
+      def flat_map_fn(_):
+
+        def map_fn(x):
+          return random_ops.random_uniform(
+              (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
+
+        return dataset_ops.Dataset.range(100).map(map_fn)
+
+      return dataset_ops.Dataset.range(5).flat_map(flat_map_fn)
+
+    self.verify_error_on_save(build_ds, 500, errors.InvalidArgumentError)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
index 0aa9ea88de82b0851b0236d9412039d6573ab291..e66ed3f7aa2a512813ef353d2d0744ae67005884 100644
--- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py
@@ -22,18 +22,236 @@ import math
 import threading
 import time
 
+import numpy as np
 from six.moves import zip_longest
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
+class InterleaveDatasetTest(test.TestCase):
+
+  def _interleave(self, lists, cycle_length, block_length):
+    num_open = 0
+
+    # `all_iterators` acts as a queue of iterators over each element of `lists`.
+    all_iterators = [iter(l) for l in lists]
+
+    # `open_iterators` are the iterators whose elements are currently being
+    # interleaved.
+    open_iterators = []
+    for i in range(cycle_length):
+      if all_iterators:
+        open_iterators.append(all_iterators.pop(0))
+        num_open += 1
+      else:
+        open_iterators.append(None)
+
+    while num_open or all_iterators:
+      for i in range(cycle_length):
+        if open_iterators[i] is None:
+          if all_iterators:
+            open_iterators[i] = all_iterators.pop(0)
+            num_open += 1
+          else:
+            continue
+        for _ in range(block_length):
+          try:
+            yield next(open_iterators[i])
+          except StopIteration:
+            open_iterators[i] = None
+            num_open -= 1
+            break
+
+  def testPythonImplementation(self):
+    input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
+                   [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
+
+    # Cycle length 1 acts like `Dataset.flat_map()`.
+    expected_elements = itertools.chain(*input_lists)
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 1, 1)):
+      self.assertEqual(expected, produced)
+
+    # Cycle length > 1.
+    expected_elements = [4, 5, 4, 5, 4, 5, 4,
+                         5, 5, 6, 6,  # NOTE(mrry): When we cycle back
+                                      # to a list and are already at
+                                      # the end of that list, we move
+                                      # on to the next element.
+                         4, 6, 4, 6, 4, 6, 4, 6, 5, 6, 5, 6, 5, 6, 5, 6, 5]
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 2, 1)):
+      self.assertEqual(expected, produced)
+
+    # Cycle length > 1 and block length > 1.
+    expected_elements = [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6,
+                         4, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 2, 3)):
+      self.assertEqual(expected, produced)
+
+    # Cycle length > len(input_values).
+    expected_elements = [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6,
+                         4, 4, 5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
+    for expected, produced in zip(
+        expected_elements, self._interleave(input_lists, 7, 2)):
+      self.assertEqual(expected, produced)
+
+  def testInterleaveDataset(self):
+    input_values = array_ops.placeholder(dtypes.int64, shape=[None])
+    cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
+    block_length = array_ops.placeholder(dtypes.int64, shape=[])
+
+    repeat_count = 2
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_values)
+        .repeat(repeat_count)
+        .interleave(lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+                    cycle_length, block_length))
+    iterator = dataset.make_initializable_iterator()
+    init_op = iterator.initializer
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Cycle length 1 acts like `Dataset.flat_map()`.
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 1, block_length: 3})
+
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 1, 3):
+        self.assertEqual(expected_element, sess.run(next_element))
+
+      # Cycle length > 1.
+      # expected: [4, 5, 4, 5, 4, 5, 4, 5, 5, 6, 6, 4, 6, 4, 6, 4, 6, 4, 6, 5,
+      #            6, 5, 6, 5, 6, 5, 6, 5]
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 2, block_length: 1})
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 1):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Cycle length > 1 and block length > 1.
+      # expected: [4, 4, 4, 5, 5, 5, 4, 5, 5, 6, 6, 6, 4, 4, 4, 6, 6, 6, 4, 5,
+      #            5, 5, 6, 6, 6, 5, 5, 6, 6, 6]
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 2, block_length: 3})
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 2, 3):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Cycle length > len(input_values) * repeat_count.
+      # expected: [4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4, 5, 5, 6, 6, 4, 4,
+      #            5, 5, 6, 6, 5, 6, 6, 5, 6, 6]
+      sess.run(init_op, feed_dict={input_values: [4, 5, 6],
+                                   cycle_length: 7, block_length: 2})
+      for expected_element in self._interleave(
+          [[4] * 4, [5] * 5, [6] * 6] * repeat_count, 7, 2):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Empty input.
+      sess.run(init_op, feed_dict={input_values: [],
+                                   cycle_length: 2, block_length: 3})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Non-empty input leading to empty output.
+      sess.run(init_op, feed_dict={input_values: [0, 0, 0],
+                                   cycle_length: 2, block_length: 3})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      # Mixture of non-empty and empty interleaved datasets.
+      sess.run(init_op, feed_dict={input_values: [4, 0, 6],
+                                   cycle_length: 2, block_length: 3})
+      for expected_element in self._interleave(
+          [[4] * 4, [], [6] * 6] * repeat_count, 2, 3):
+        self.assertEqual(expected_element, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+            _interleave_fn, cycle_length=1).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        for j in range(2):
+          expected = [i, 0] if j % 2 == 0 else [0, -i]
+          self.assertAllEqual(expected, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+class InterleaveDatasetSeriazationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_iterator_graph(self, input_values, cycle_length, block_length):
+    repeat_count = 2
+    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        repeat_count).interleave(
+            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+            cycle_length, block_length)
+
+  def testSerializationCore(self):
+    input_values = np.array([4, 5, 6], dtype=np.int64)
+    num_outputs = np.sum(input_values) * 2
+    # cycle_length > 1, block_length > 1
+    cycle_length = 2
+    block_length = 3
+    # pylint: disable=g-long-lambda
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length, block_length),
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length * 2, block_length * 1),
+        num_outputs)
+    # cycle_length = 1
+    cycle_length = 1
+    block_length = 3
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length, block_length),
+        None, num_outputs)
+    # block_length = 1
+    cycle_length = 2
+    block_length = 1
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(
+            input_values, cycle_length, block_length),
+        None, num_outputs)
+    # pylint: enable=g-long-lambda
+
+
 class ParallelInterleaveDatasetTest(test.TestCase):
 
   def setUp(self):
@@ -547,5 +765,31 @@ class ParallelInterleaveDatasetTest(test.TestCase):
   def testTooManyReadersSloppy(self):
     self._testTooManyReaders(sloppy=True)
 
+  def testSparse(self):
+    def _map_fn(i):
+      return sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
+    iterator = dataset.apply(
+        interleave_ops.parallel_interleave(
+            _interleave_fn, cycle_length=1)).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        for j in range(2):
+          expected = [i, 0] if j % 2 == 0 else [0, -i]
+          self.assertAllEqual(expected, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index 8a1d99499be702d91f87f65f443261b47ce5c5cd..e9a07da84a8c80c09ebd4dab0b1d69febe1c9790 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -20,15 +20,18 @@ from collections import namedtuple
 
 import os
 import threading
-from collections import namedtuple
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import error_ops
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -37,6 +40,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
@@ -616,6 +620,182 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(_sparse)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        actual = sess.run(get_next)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertSparseValuesEqual(actual, _sparse(i))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSparseChain(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def _check(i):
+      self.assertTrue(sparse_tensor.is_sparse(i))
+      return sparse_ops.sparse_concat(0, [i, i])
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        actual = sess.run(get_next)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testCaptureResourceInMapFn(self):
+
+    def _build_ds(iterator):
+
+      def _map_fn(x):
+        get_next = iterator.get_next()
+        return x * get_next
+
+      return dataset_ops.Dataset.range(10).map(_map_fn)
+
+    def _build_graph():
+      captured_iterator = dataset_ops.Dataset.range(
+          10).make_initializable_iterator()
+      ds = _build_ds(captured_iterator)
+      iterator = ds.make_initializable_iterator()
+      init_op = iterator.initializer
+      return captured_iterator.initializer, init_op
+
+    with ops.Graph().as_default() as g:
+      captured_init_op, init_op = _build_graph()
+      with self.test_session(graph=g) as sess:
+        sess.run(captured_init_op)
+        with self.assertRaises(errors.UnimplementedError):
+          # CapturedFunction does not support capturing IteratorResource.
+          sess.run(init_op)
+
+
+class MapDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def setUp(self):
+    self._tensor_slice_len = 7
+    self._num_epochs = 14
+    self._num_outputs = self._tensor_slice_len * self._num_epochs
+
+  def _build_ds(self, multiplier=37.0):
+    components = (np.arange(self._tensor_slice_len), np.array([[1, 2, 3]]) *
+                  np.arange(self._tensor_slice_len)[:, np.newaxis],
+                  np.array(multiplier) * np.arange(self._tensor_slice_len))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
+            .repeat(self._num_epochs))
+
+  def testSaveRestoreCore(self):
+    self.run_core_tests(
+        self._build_ds,
+        lambda: self._build_ds(multiplier=15.0),
+        self._num_outputs)
+
+  def testSaveStatefulFunction(self):
+
+    def _build_ds():
+
+      def _map_fn(x):
+        return random_ops.random_uniform(
+            (), 0, 10, dtype=dtypes.int32) * math_ops.to_int32(x)
+
+      return dataset_ops.Dataset.range(100).map(_map_fn)
+
+    self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
+
+  def testCaptureVariableInMapFn(self):
+
+    def _build_ds():
+      counter_var = variable_scope.get_variable(
+          "counter", (), dtypes.int32, use_resource=True)
+      return (dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+          lambda _: counter_var.assign_add(1)))
+
+    self.verify_error_on_save(_build_ds, 15, errors.InvalidArgumentError)
+
+  def testCaptureDefunInMapFn(self):
+    num_outputs = 100
+
+    def _build_ds():
+
+      @function.Defun(dtypes.int64)
+      def defun_fn(x):
+        return constant_op.constant(1000) + math_ops.to_int32(x)
+
+      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+
+    self.run_core_tests(_build_ds, None, num_outputs)
+
+  def testBuildDefunInMapFn(self):
+    num_outputs = 100
+
+    def _build_ds():
+
+      @function.Defun(dtypes.int64)
+      def defun_fn(x):
+
+        @function.Defun(dtypes.int32)
+        def defun_fn_deep(x):
+          return constant_op.constant(1000) + math_ops.to_int32(x)
+
+        return constant_op.constant(11000) + defun_fn_deep(math_ops.to_int32(x))
+
+      return dataset_ops.Dataset.range(num_outputs).map(defun_fn)
+
+    self.run_core_tests(_build_ds, None, num_outputs)
+
+
+class IgnoreErrorsSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, components):
+    return dataset_ops.Dataset.from_tensor_slices(components).map(
+        lambda x: array_ops.check_numerics(x, "message")).apply(
+            error_ops.ignore_errors())
+
+  def testIgnoreErrorsCore(self):
+    components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
+    diff_components = np.array([1., 2., 3., np.nan]).astype(np.float32)
+    num_outputs = 4
+    self.run_core_tests(lambda: self._build_ds(components),
+                        lambda: self._build_ds(diff_components), num_outputs)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py b/tensorflow/contrib/data/python/kernel_tests/prefetch_dataset_op_test.py
similarity index 51%
rename from tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
rename to tensorflow/contrib/data/python/kernel_tests/prefetch_dataset_op_test.py
index a640dfe7dfbcce96261589c7fc49107deaefdd54..3d120a3071ef730f21221e3291d8c84385b51aa3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_impl.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetch_dataset_op_test.py
@@ -12,37 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sigmoid bijector."""
-
+"""Tests for the experimental input pipeline ops."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Sigmoid",
-]
-
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
 
-class Sigmoid(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = 1 / (1 + exp(-X))`."""
 
-  def __init__(self, validate_args=False, name="sigmoid"):
-    super(Sigmoid, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+class PrefetchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-  def _forward(self, x):
-    return math_ops.sigmoid(x)
+  def build_dataset(self, seed):
+    return dataset_ops.Dataset.range(100).prefetch(10).shuffle(
+        buffer_size=10, seed=seed, reshuffle_each_iteration=False)
 
-  def _inverse(self, y):
-    return math_ops.log(y) - math_ops.log1p(-y)
+  def testCore(self):
+    num_outputs = 100
+    self.run_core_tests(lambda: self.build_dataset(10),
+                        lambda: self.build_dataset(20), num_outputs)
 
-  def _inverse_log_det_jacobian(self, y):
-    return -math_ops.log(y) - math_ops.log1p(-y)
 
-  def _forward_log_det_jacobian(self, x):
-    return -nn_ops.softplus(-x) - nn_ops.softplus(x)
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 539c6f215536f50a0b56f173a9240542faa2e643..dc3e38db59301bf1819999f479171af35930e9d2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
@@ -85,6 +86,9 @@ class StagingAreaOpsTest(test.TestCase):
       self._event.wait()
       elem = sess.run(prefetch_op)
       self.assertEqual(elem, [5.0])
+      sess.run(
+          resource_variable_ops.destroy_resource_op(
+              buffer_resource_handle, ignore_lookup_error=True))
 
   def testSameDeviceCPU(self):
     self._prefetch_fn_helper("same_device_cpu",
diff --git a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
index f59ac760dc83a504e563f055b91f1002cb0c80fc..8e6ad061a11752ab7b1ffc13c90b4fa52f67d6aa 100644
--- a/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/range_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.contrib.data.python.ops import counter
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.contrib.data.python.ops import enumerate_ops
 from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
@@ -194,6 +195,27 @@ class RangeDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testCounter(self):
+    """Test dataset construction using `count`."""
+    iterator = (counter.Counter(start=3, step=4)
+                .make_one_shot_iterator())
+    get_next = iterator.get_next()
+    self.assertEqual([], get_next.shape.as_list())
+    self.assertEqual(dtypes.int64, get_next.dtype)
+
+    negative_iterator = (counter.Counter(start=0, step=-1)
+                         .make_one_shot_iterator())
+    negative_get_next = negative_iterator.get_next()
+
+    with self.test_session() as sess:
+      self.assertEqual(3, sess.run(get_next))
+      self.assertEqual(3 + 4, sess.run(get_next))
+      self.assertEqual(3 + 2 * 4, sess.run(get_next))
+
+      self.assertEqual(0, sess.run(negative_get_next))
+      self.assertEqual(-1, sess.run(negative_get_next))
+      self.assertEqual(-2, sess.run(negative_get_next))
+
   def _iterator_checkpoint_prefix(self):
     return os.path.join(self.get_temp_dir(), "iterator")
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 3ae8f71d77fa6ecf08e42bedac702b8f75eec309..1c42a3d855bc16c21e385d7108c3106884ae4f5e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -21,7 +21,7 @@ import gzip
 import os
 import zlib
 
-from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
@@ -30,18 +30,14 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import compat
 
 
-class TextLineDatasetTest(test.TestCase):
+class TextLineDatasetTestBase(test.TestCase):
 
   def _lineText(self, f, l):
     return compat.as_bytes("%d: %d" % (f, l))
@@ -79,6 +75,9 @@ class TextLineDatasetTest(test.TestCase):
 
     return filenames
 
+
+class TextLineDatasetTest(TextLineDatasetTestBase):
+
   def _testTextLineDataset(self, compression_type=None):
     test_filenames = self._createFiles(
         2, 5, crlf=True, compression_type=compression_type)
@@ -165,282 +164,37 @@ class TextLineDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(iterator.get_next())
 
-  def _ckpt_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _latest_ckpt(self):
-    return saver_lib.latest_checkpoint(self.get_temp_dir())
-
-  def _save(self, saver, sess):
-    saver.save(sess, self._ckpt_path())
-
-  def _restore(self, saver, sess):
-    saver.restore(sess, self._latest_ckpt())
 
-  def _import_meta_graph(self):
-    meta_file_path = self._ckpt_path() + ".meta"
-    return saver_lib.import_meta_graph(meta_file_path)
+class TextLineDatasetSerializationTest(
+    TextLineDatasetTestBase,
+    dataset_serialization_test_base.DatasetSerializationTestBase):
 
-  def _build_graph(self,
-                   test_filenames,
-                   compression_type=None,
-                   build_saveable=True):
-    ds = readers.TextLineDataset(
+  def _build_iterator_graph(self, test_filenames, compression_type=None):
+    return readers.TextLineDataset(
         test_filenames, compression_type=compression_type, buffer_size=10)
-    iterator = ds.make_initializable_iterator()
-    if build_saveable:
-      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
-      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    ops.add_to_collection("iterator_ops", init_op)
-    ops.add_to_collection("iterator_ops", get_next)
-    saver = saver_lib.Saver(allow_empty=True)
-    return init_op, get_next, saver
-
-  def _testReadWithBreaks(self, breaks, num_files=5, lines_per_file=5):
-    """Tests reading from input pipeline with regular breaks.
-
-    At each break point the iterator state gets saved using Saver and reloaded
-    in a new Graph and session.
-
-    Args:
-      breaks: List of counts of records after reading which iterator state is
-        checkpointed. Must to in non-decreasing order.
-      num_files: Total number of files.
-      lines_per_file: Total number of lines per file.
-    """
+
+  def testTextLineCore(self):
     compression_types = [None, "GZIP", "ZLIB"]
+    num_files = 5
+    lines_per_file = 5
+    num_outputs = num_files * lines_per_file
     for compression_type in compression_types:
       test_filenames = self._createFiles(
           num_files,
           lines_per_file,
           crlf=True,
           compression_type=compression_type)
+      # pylint: disable=cell-var-from-loop
+      self.run_core_tests(
+          lambda: self._build_iterator_graph(test_filenames, compression_type),
+          lambda: self._build_iterator_graph(test_filenames), num_outputs)
+      # pylint: enable=cell-var-from-loop
 
-      # Collect ground truth.
-      total_records = num_files * lines_per_file
-      expected_records = []
-      with ops.Graph().as_default() as g:
-        init_op, get_next, saver = self._build_graph(
-            test_filenames, compression_type=compression_type)
-        with self.test_session(graph=g) as sess:
-          sess.run(init_op)
-          for _ in range(total_records):
-            expected_records.append(sess.run(get_next))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next)
-
-      # Simulate run with breaks.
-      actual_records = []
-      next_record_index = 0
-      load_from_ckpt = False
-      breaks.append(total_records)
-      for break_index in breaks:
-        with ops.Graph().as_default() as g:
-          if not load_from_ckpt:
-            init_op, get_next, saver = self._build_graph(
-                test_filenames, compression_type=compression_type)
-          else:
-            saver = self._import_meta_graph()
-            init_op, get_next = ops.get_collection("iterator_ops")
 
-          with self.test_session(graph=g) as sess:
-            if not load_from_ckpt:
-              sess.run(init_op)
-            else:
-              self._restore(saver, sess)
-            while next_record_index != break_index:
-              actual_records.append(sess.run(get_next))
-              next_record_index += 1
-            if break_index == total_records:
-              with self.assertRaises(errors.OutOfRangeError):
-                sess.run(get_next)
-            self._save(saver, sess)
-            load_from_ckpt = True
-      self.assertEqual(actual_records, expected_records)
-
-  def testSaveAtFileBoundary(self):
-    self._testReadWithBreaks([10])
-
-  def testSaveWithinFile(self):
-    self._testReadWithBreaks([12])
-
-  def testSaveUnusedIterator(self):
-    self._testReadWithBreaks([0])
-
-  def testSaveRestoreIdempotence(self):
-    # Attempt to save an iterator immediately after it has been
-    # restored.
-    self._testReadWithBreaks([0, 0])
-    self._testReadWithBreaks([10, 10])
-    self._testReadWithBreaks([12, 12])
-
-  def testMultipleBreaks(self):
-    self._testReadWithBreaks([0, 4, 20])
-
-  def testRestoreExhaustedIterator(self):
-    num_files = 2
-    lines_per_file = 5
-    test_filenames = self._createFiles(num_files, lines_per_file, crlf=True)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = self._build_graph(test_filenames)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(num_files * lines_per_file):
-          sess.run(get_next)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-        self._save(saver, sess)
-
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        saver = self._import_meta_graph()
-        self._restore(saver, sess)
-        _, get_next = ops.get_collection("iterator_ops")
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  def testInitThenRestore(self):
-    num_files = 5
-    lines_per_file = 5
-    total_records = num_files * lines_per_file
-    break_record = 8
-    test_filenames = self._createFiles(num_files, lines_per_file, crlf=True)
-
-    expected_records = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = self._build_graph(test_filenames)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_record):
-          sess.run(get_next)
-        self._save(saver, sess)
-        for _ in range(total_records - break_record):
-          expected_records.append(sess.run(get_next))
-
-    actual_records = []
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        saver = self._import_meta_graph()
-        init_op, get_next = ops.get_collection("iterator_ops")
-        sess.run(init_op)
-        self._restore(saver, sess)
-        for _ in range(total_records - break_record):
-          actual_records.append(sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-    self.assertEqual(actual_records, expected_records)
-
-  def testRestoreInModifiedGraph(self):
-    num_files = 5
-    lines_per_file = 5
-    total_records = num_files * lines_per_file
-    break_record = 8
-    test_filenames = self._createFiles(num_files, lines_per_file, crlf=True)
-
-    expected_records = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = self._build_graph(test_filenames)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_record):
-          sess.run(get_next)
-        self._save(saver, sess)
-        for _ in range(total_records - break_record):
-          expected_records.append(sess.run(get_next))
-
-    actual_records = []
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        init_op, get_next, saver = self._build_graph(
-            test_filenames, compression_type="GZIP")
-        self._restore(saver, sess)
-        for _ in range(total_records - break_record):
-          actual_records.append(sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-    self.assertEqual(actual_records, expected_records)
-
-  def testRestoreInModifiedGraphThenInit(self):
-    num_files = 5
-    lines_per_file = 5
-    total_records = num_files * lines_per_file
-    break_record = 8
-    test_filenames = self._createFiles(num_files, lines_per_file, crlf=True)
-
-    expected_records = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = self._build_graph(test_filenames)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_record):
-          expected_records.append(sess.run(get_next))
-        self._save(saver, sess)
-        for _ in range(total_records - break_record):
-          expected_records.append(sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-    # Test that calling the init_op overrides the restored iterator. The
-    # iterator for the old graph was build to read uncompressed files and
-    # would fail when trying to read the new files.
-    actual_records = []
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        test_filenames = self._createFiles(
-            num_files, lines_per_file, crlf=True, compression_type="GZIP")
-        init_op, get_next, saver = self._build_graph(
-            test_filenames, compression_type="GZIP")
-        self._restore(saver, sess)
-        sess.run(init_op)
-        for _ in range(total_records):
-          actual_records.append(sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-    self.assertEqual(actual_records, expected_records)
-
-  def testDoNotRestoreIterator(self):
-    num_files = 5
-    lines_per_file = 5
-    total_records = num_files * lines_per_file
-    break_record = 8
-    test_filenames = self._createFiles(num_files, lines_per_file, crlf=True)
-
-    expected_records = []
-    with ops.Graph().as_default() as g:
-      init_op, get_next, saver = self._build_graph(test_filenames)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        for _ in range(break_record):
-          expected_records.append(sess.run(get_next))
-        self._save(saver, sess)
-        for _ in range(total_records - break_record):
-          expected_records.append(sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-    actual_records = []
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        init_op, get_next, saver = self._build_graph(
-            test_filenames, build_saveable=False)
-        self._restore(saver, sess)
-        with self.assertRaises(errors.FailedPreconditionError):
-          sess.run(get_next)
-        sess.run(init_op)
-        for _ in range(total_records):
-          actual_records.append(sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-    self.assertEqual(actual_records, expected_records)
-
-
-class FixedLengthRecordReaderTest(test.TestCase):
+class FixedLengthRecordReaderTestBase(test.TestCase):
 
   def setUp(self):
-    super(FixedLengthRecordReaderTest, self).setUp()
+    super(FixedLengthRecordReaderTestBase, self).setUp()
     self._num_files = 2
     self._num_records = 7
     self._header_bytes = 5
@@ -462,6 +216,9 @@ class FixedLengthRecordReaderTest(test.TestCase):
         f.write(b"F" * self._footer_bytes)
     return filenames
 
+
+class FixedLengthRecordReaderTest(FixedLengthRecordReaderTestBase):
+
   def testFixedLengthRecordDataset(self):
     test_filenames = self._createFiles()
     filenames = array_ops.placeholder(dtypes.string, shape=[None])
@@ -547,304 +304,29 @@ class FixedLengthRecordReaderTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(iterator.get_next())
 
-  def _iterator_checkpoint_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _save_op(self, iterator_resource):
-    iterator_state_variant = gen_dataset_ops.serialize_iterator(
-        iterator_resource)
-    save_op = io_ops.write_file(
-        self._iterator_checkpoint_path(),
-        parsing_ops.serialize_tensor(iterator_state_variant))
-    return save_op
-
-  def _restore_op(self, iterator_resource):
-    iterator_state_variant = parsing_ops.parse_tensor(
-        io_ops.read_file(self._iterator_checkpoint_path()), dtypes.variant)
-    restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                      iterator_state_variant)
-    return restore_op
-
-  def _build_iterator_graph(self, num_epochs):
+
+class FixedLengthRecordDatasetSerializationTest(
+    FixedLengthRecordReaderTestBase,
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_iterator_graph(self, num_epochs, compression_type=None):
     filenames = self._createFiles()
-    dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-               .repeat(num_epochs))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next_op = iterator.get_next()
-    save_op = self._save_op(iterator._iterator_resource)
-    restore_op = self._restore_op(iterator._iterator_resource)
-    return init_op, get_next_op, save_op, restore_op
-
-  def _restore_iterator(self):
-    output_types = dtypes.string
-    output_shapes = tensor_shape.scalar()
-    iterator = iterator_ops.Iterator.from_structure(output_types, output_shapes)
-    get_next = iterator.get_next()
-    restore_op = self._restore_op(iterator._iterator_resource)
-    return restore_op, get_next
-
-  def testSaveRestore(self):
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testInitThenRestore(self):
-    # Note: Calling init_op before restore_op is redundant. This test just makes
-    # sure we do not fail if restore is called on an already initialized
-    # iterator resource.
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreInModifiedGraph(self):
-    num_epochs = 10
-    num_epochs_1 = 20
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs_1)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreWithoutBuildingDatasetGraph(self):
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      restore_op, get_next_op = self._restore_iterator()
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreUnusedIterator(self):
-    num_epochs = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        # Save unused iterator.
-        sess.run(save_op)
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        for _ in range(num_epochs * self._num_files * self._num_records):
-          sess.run(get_next_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreExhaustedIterator(self):
-    num_epochs = 10
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.test_session(graph=g) as sess:
-        sess.run(restore_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-
-class TFRecordDatasetTest(test.TestCase):
+    return readers.FixedLengthRecordDataset(
+        filenames, self._record_bytes, self._header_bytes,
+        self._footer_bytes).repeat(num_epochs)
+
+  def testFixedLengthRecordCore(self):
+    num_epochs = 5
+    num_outputs = num_epochs * self._num_files * self._num_records
+    self.run_core_tests(lambda: self._build_iterator_graph(num_epochs),
+                        lambda: self._build_iterator_graph(num_epochs * 2),
+                        num_outputs)
+
+
+class TFRecordDatasetTestBase(test.TestCase):
 
   def setUp(self):
-    super(TFRecordDatasetTest, self).setUp()
+    super(TFRecordDatasetTestBase, self).setUp()
     self._num_files = 2
     self._num_records = 7
 
@@ -880,6 +362,9 @@ class TFRecordDatasetTest(test.TestCase):
       writer.close()
     return filenames
 
+
+class TFRecordDatasetTest(TFRecordDatasetTestBase):
+
   def testReadOneEpoch(self):
     with self.test_session() as sess:
       # Basic test: read from file 0.
@@ -1001,6 +486,74 @@ class TFRecordDatasetTest(test.TestCase):
         sess.run(iterator.get_next())
 
 
+class TFRecordDatasetSerializationTest(
+    TFRecordDatasetTestBase,
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_iterator_graph(self,
+                            num_epochs,
+                            batch_size=1,
+                            compression_type=None,
+                            buffer_size=None):
+    filenames = self._createFiles()
+    if compression_type is "ZLIB":
+      zlib_files = []
+      for i, fn in enumerate(filenames):
+        with open(fn, "rb") as f:
+          cdata = zlib.compress(f.read())
+          zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
+          with open(zfn, "wb") as f:
+            f.write(cdata)
+          zlib_files.append(zfn)
+      filenames = zlib_files
+
+    elif compression_type is "GZIP":
+      gzip_files = []
+      for i, fn in enumerate(self.test_filenames):
+        with open(fn, "rb") as f:
+          gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
+          with gzip.GzipFile(gzfn, "wb") as gzf:
+            gzf.write(f.read())
+          gzip_files.append(gzfn)
+      filenames = gzip_files
+
+    return readers.TFRecordDataset(
+        filenames, compression_type,
+        buffer_size=buffer_size).repeat(num_epochs).batch(batch_size)
+
+  def testTFRecordWithoutBufferCore(self):
+    num_epochs = 5
+    batch_size = num_epochs
+    num_outputs = num_epochs * self._num_files * self._num_records // batch_size
+    # pylint: disable=g-long-lambda
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(num_epochs, batch_size,
+                                           buffer_size=0),
+        lambda: self._build_iterator_graph(num_epochs * 2, batch_size),
+        num_outputs)
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(num_epochs, buffer_size=0), None,
+        num_outputs * batch_size)
+    # pylint: enable=g-long-lambda
+
+  def testTFRecordWithBufferCore(self):
+    num_epochs = 5
+    num_outputs = num_epochs * self._num_files * self._num_records
+    self.run_core_tests(lambda: self._build_iterator_graph(num_epochs),
+                        lambda: self._build_iterator_graph(num_epochs * 2),
+                        num_outputs)
+
+  def testTFRecordWithCompressionCore(self):
+    num_epochs = 5
+    num_outputs = num_epochs * self._num_files * self._num_records
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(num_epochs, compression_type="ZLIB"),
+        lambda: self._build_iterator_graph(num_epochs * 2), num_outputs)
+    self.run_core_tests(
+        lambda: self._build_iterator_graph(num_epochs, compression_type="GZIP"),
+        lambda: self._build_iterator_graph(num_epochs * 2), num_outputs)
+
+
 class ReadBatchFeaturesTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 91615e9f6205cc95ff531b98683ff485964f714e..1a26da82e533ec01106ea10525c1cd96627c34fb 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -207,5 +208,82 @@ class SequenceDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class SequenceDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_skip_dataset(self, count):
+    components = (np.arange(10),)
+    return dataset_ops.Dataset.from_tensor_slices(components).skip(count)
+
+  def testSkipFewerThanInputs(self):
+    count = 4
+    num_outputs = 10 - count
+    self.run_core_tests(lambda: self._build_skip_dataset(count),
+                        lambda: self._build_skip_dataset(count + 2),
+                        num_outputs)
+
+  def testSkipVarious(self):
+    # Skip more than inputs
+    self.run_core_tests(lambda: self._build_skip_dataset(20), None, 0)
+    # Skip exactly the input size
+    self.run_core_tests(lambda: self._build_skip_dataset(10), None, 0)
+    self.run_core_tests(lambda: self._build_skip_dataset(-1), None, 0)
+    # Skip nothing
+    self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
+
+  def _build_take_dataset(self, count):
+    components = (np.arange(10),)
+    return dataset_ops.Dataset.from_tensor_slices(components).take(count)
+
+  def testTakeFewerThanInputs(self):
+    count = 4
+    self.run_core_tests(
+        lambda: self._build_take_dataset(count),
+        lambda: self._build_take_dataset(count + 2),
+        count,
+    )
+
+  def testTakeVarious(self):
+    # Take more than inputs
+    self.run_core_tests(lambda: self._build_take_dataset(20), None, 10)
+    # Take exactly the input size
+    self.run_core_tests(lambda: self._build_take_dataset(10), None, 10)
+    # Take all
+    self.run_core_tests(lambda: self._build_take_dataset(-1), None, 10)
+    # Take nothing
+    self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
+
+  def _build_repeat_dataset(self, count, take_count=3):
+    components = (np.arange(10),)
+    return dataset_ops.Dataset.from_tensor_slices(components).take(
+        take_count).repeat(count)
+
+  def testFiniteRepeat(self):
+    count = 10
+    self.run_core_tests(lambda: self._build_repeat_dataset(count),
+                        lambda: self._build_repeat_dataset(count + 2),
+                        3 * count)
+
+  def testEmptyRepeat(self):
+    self.run_core_tests(lambda: self._build_repeat_dataset(0), None, 0)
+
+  def testInfiniteRepeat(self):
+    self.verify_unused_iterator(
+        lambda: self._build_repeat_dataset(-1), 10, verify_exhausted=False)
+    self.verify_init_before_restore(
+        lambda: self._build_repeat_dataset(-1), 10, verify_exhausted=False)
+    self.verify_multiple_breaks(
+        lambda: self._build_repeat_dataset(-1), 20, verify_exhausted=False)
+    self.verify_reset_restored_iterator(
+        lambda: self._build_repeat_dataset(-1), 20, verify_exhausted=False)
+    self.verify_restore_in_modified_graph(
+        lambda: self._build_repeat_dataset(-1),
+        lambda: self._build_repeat_dataset(2),
+        20,
+        verify_exhausted=False)
+    # Test repeat empty dataset
+    self.run_core_tests(lambda: self._build_repeat_dataset(-1, 0), None, 0)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index e9ebaf4f21534fb43218d9579127b4aeb1dbd85e..ba1be0690ff3d72df9fe40980c0f5d53b33e41c5 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -18,16 +18,24 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import os
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import dataset_ops
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.ops import dataset_ops as contrib_dataset_ops
+from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.contrib.data.python.ops import shuffle_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
 
 
 class ShuffleDatasetTest(test.TestCase):
@@ -42,8 +50,9 @@ class ShuffleDatasetTest(test.TestCase):
     buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
     seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
 
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
+    repeat_dataset = (
+        contrib_dataset_ops.Dataset.from_tensor_slices(components)
+        .repeat(count_placeholder))
 
     shuffle_dataset = repeat_dataset.shuffle(buffer_size_placeholder,
                                              seed_placeholder)
@@ -134,8 +143,9 @@ class ShuffleDatasetTest(test.TestCase):
 
   def testDefaultArguments(self):
     components = [0, 1, 2, 3, 4]
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
-                .repeat().make_one_shot_iterator())
+    iterator = (
+        contrib_dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
+        .repeat().make_one_shot_iterator())
 
     get_next = iterator.get_next()
 
@@ -148,6 +158,401 @@ class ShuffleDatasetTest(test.TestCase):
     for i in range(5):
       self.assertEqual(10, counts[i])
 
+  def testSeedNoneSeed2NonNone(self):
+    with self.assertRaises(ValueError):
+      dataset_ops.ShuffleDataset(dataset_ops.Dataset.range(5),
+                                 buffer_size=1,
+                                 seed=None,
+                                 seed2=10)
+
+
+class ShuffleDatasetSerializationTest(test.TestCase):
+
+  def tearDown(self):
+    # Remove all checkpoint files.
+    prefix = self._ckpt_path()
+    pattern = prefix + "*"
+    files = gfile.Glob(pattern)
+    map(gfile.Remove, files)
+
+  def _build_graph(self,
+                   range_limit=10,
+                   num_repeats=5,
+                   buffer_size=5,
+                   seed=None,
+                   reshuffle_each_iteration=None,
+                   build_saveable=True):
+    iterator = dataset_ops.Dataset.range(range_limit).shuffle(
+        buffer_size,
+        seed=seed,
+        reshuffle_each_iteration=reshuffle_each_iteration).repeat(
+            num_repeats).make_initializable_iterator()
+    if build_saveable:
+      saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    ops.add_to_collection("iterator_ops", init_op)
+    ops.add_to_collection("iterator_ops", get_next)
+    saver = saver_lib.Saver(allow_empty=True)
+    return init_op, get_next, saver
+
+  def _ckpt_path(self):
+    return os.path.join(self.get_temp_dir(), "iterator")
+
+  def _latest_ckpt(self):
+    return saver_lib.latest_checkpoint(self.get_temp_dir())
+
+  def _save(self, sess, saver):
+    saver.save(sess, self._ckpt_path())
+
+  def _restore(self, saver, sess):
+    saver.restore(sess, self._latest_ckpt())
+
+  def _import_meta_graph(self):
+    meta_file_path = self._ckpt_path() + ".meta"
+    return saver_lib.import_meta_graph(meta_file_path)
+
+  def _testReadWithBreaks(self, break_points, init_before_restore=False):
+    seed = 55
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [1, 3, 8, 10, 25, 50]
+    reshuffle_each_iteration = False
+    for buffer_size in buffer_sizes:
+      expected = []
+      actual = []
+      # Generate the ground truth.
+      with ops.Graph().as_default() as g:
+        g.seed = 10
+        init_op, get_next_op, _ = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(num_outputs):
+            expected.append(sess.run(get_next_op))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+      # Run and checkpoint after first break_point.
+      with ops.Graph().as_default() as g:
+        g.seed = 10
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(break_points[0]):
+            actual.append(sess.run(get_next_op))
+          self._save(sess, saver)
+
+      # Load from checkpoint and continue running while stopping at each
+      # subsequent checkpoint.
+      for i in range(len(break_points)):
+        with ops.Graph().as_default() as g:
+          saver = self._import_meta_graph()
+          init_op, get_next_op = ops.get_collection("iterator_ops")
+          with self.test_session(graph=g) as sess:
+            if init_before_restore:
+              sess.run(init_op)
+            self._restore(saver, sess)
+            start = break_points[i]
+            end = break_points[
+                i + 1] if i < len(break_points) - 1 else num_outputs
+            for _ in range(end - start):
+              actual.append(sess.run(get_next_op))
+            self._save(sess, saver)
+            if end == num_outputs:
+              with self.assertRaises(errors.OutOfRangeError):
+                sess.run(get_next_op)
+      self.assertEqual(expected, actual)
+
+  def testSaveRestore(self):
+    self._testReadWithBreaks([8])  # rng buffer_size: 0
+    self._testReadWithBreaks([13])  # rng buffer_size: 1
+    self._testReadWithBreaks([18])  # rng buffer_size: 2
+    self._testReadWithBreaks([23])  # rng buffer_size: 3
+
+  def testSaveUnusedIterator(self):
+    self._testReadWithBreaks([0])
+
+  def testSaveFullyUsedIterator(self):
+    self._testReadWithBreaks([50])
+
+  def testMultipleBreaks(self):
+    self._testReadWithBreaks([0, 5, 9, 15, 25, 32])
+
+  def testIdempotence(self):
+    # Attempt to save iterator immediately after restoring.
+    self._testReadWithBreaks([1, 1, 5, 5, 5, 25, 32])
+
+  def testInitThenRestore(self):
+    self._testReadWithBreaks([0, 5, 9, 15, 25, 32], init_before_restore=True)
+
+  def testRestoreExhaustedIterator(self):
+    seed = 55
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [1, 3, 8, 10, 25, 50]
+    reshuffle_each_iteration = False
+    for buffer_size in buffer_sizes:
+      with ops.Graph().as_default() as g:
+        g.seed = 10
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(num_outputs):
+            sess.run(get_next_op)
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+          self._save(sess, saver)
+
+        with ops.Graph().as_default() as g:
+          saver = self._import_meta_graph()
+          init_op, get_next_op = ops.get_collection("iterator_ops")
+          with self.test_session(graph=g) as sess:
+            self._restore(saver, sess)
+            with self.assertRaises(errors.OutOfRangeError):
+              sess.run(get_next_op)
+
+  def testResetRestoredIterator(self):
+    seed = 55
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [1, 3, 8, 10, 25, 50]
+    reshuffle_each_iteration = False
+    for buffer_size in buffer_sizes:
+      with ops.Graph().as_default() as g:
+        g.seed = 10
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(num_outputs // 2):
+            sess.run(get_next_op)
+          self._save(sess, saver)
+
+        outputs = []
+        with ops.Graph().as_default() as g:
+          saver = self._import_meta_graph()
+          init_op, get_next_op = ops.get_collection("iterator_ops")
+          with self.test_session(graph=g) as sess:
+            self._restore(saver, sess)
+            sess.run(init_op)
+            for _ in range(num_outputs):
+              outputs.append(sess.run(get_next_op))
+            with self.assertRaises(errors.OutOfRangeError):
+              sess.run(get_next_op)
+        expected_outputs_sorted = sorted(
+            np.array([range(range_limit)
+                      for _ in range(num_repeats)]).flatten())
+        self.assertEqual(expected_outputs_sorted, sorted(outputs))
+
+  def testRestoreInModifiedGraph(self):
+    seed = 55
+    break_point = 25
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [3, 8, 10, 25, 50]
+    reshuffle_each_iteration = False
+    for buffer_size in buffer_sizes:
+      expected = []
+      actual_without_restore = []
+      actual = []
+      with ops.Graph().as_default() as g:
+        g.seed = 10
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(break_point):
+            expected.append(sess.run(get_next_op))
+          actual.extend(expected)
+          self._save(sess, saver)
+          for _ in range(num_outputs - break_point):
+            expected.append(sess.run(get_next_op))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+      with ops.Graph().as_default() as g:
+        g.seed = 20  # Different seed than previous graph for shuffle rngs.
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(num_outputs):
+            actual_without_restore.append(sess.run(get_next_op))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+      with ops.Graph().as_default() as g:
+        g.seed = 20  # Different seed than previous graph for shuffle rngs.
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          self._restore(saver, sess)
+          for _ in range(num_outputs - break_point):
+            actual.append(sess.run(get_next_op))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+
+      # Since the modified graph has a different random seed it produces a
+      # different order of examples.
+      self.assertNotEqual(expected, actual_without_restore)
+      self.assertEqual(sorted(expected), sorted(actual_without_restore))
+      self.assertEqual(expected, actual)
+
+  def testDoNotBuildSaveable(self):
+    seed = 55
+    break_point = 25
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [3, 8, 10, 25, 50]
+    reshuffle_each_iteration = False
+    for buffer_size in buffer_sizes:
+      actual = []
+      with ops.Graph().as_default() as g:
+        g.seed = 10
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration)
+        with self.test_session(graph=g) as sess:
+          sess.run(init_op)
+          for _ in range(break_point):
+            sess.run(get_next_op)
+          self._save(sess, saver)
+
+      with ops.Graph().as_default() as g:
+        g.seed = 20  # Different seed than previous graph for shuffle rngs.
+        init_op, get_next_op, saver = self._build_graph(
+            range_limit=range_limit,
+            num_repeats=num_repeats,
+            buffer_size=buffer_size,
+            seed=seed,
+            reshuffle_each_iteration=reshuffle_each_iteration,
+            build_saveable=False)
+        with self.test_session(graph=g) as sess:
+          # Since the SaveableObject was not added to Saver's list
+          # of saveables, iterator state is not restored by saver.restore().
+          self._restore(saver, sess)
+          with self.assertRaises(errors.FailedPreconditionError):
+            sess.run(get_next_op)
+          sess.run(init_op)
+          for _ in range(num_outputs):
+            actual.append(sess.run(get_next_op))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(get_next_op)
+      expected_outputs_sorted = sorted(
+          np.array([range(range_limit) for _ in range(num_repeats)]).flatten())
+      self.assertEqual(expected_outputs_sorted, sorted(actual))
+
+
+class ShuffleAndRepeatTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, seed, count=5):
+    return dataset_ops.Dataset.range(20).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
+
+  def testCorrectOutput(self):
+    output = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    self.assertSequenceEqual(
+        sorted(output), sorted(
+            np.array([range(20) for _ in range(5)]).flatten()))
+    for i in range(5):
+      self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
+
+  def testReshuffling(self):
+    # Check that the output orders of different epochs are indeed different.
+    output = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    for i in range(4):
+      epoch1 = output[i * 20:(i + 1) * 20]
+      epoch2 = output[(i + 1) * 20:(i + 2) * 20]
+      self.assertNotEqual(epoch1, epoch2)
+
+  def testSameOrderForSameSeeds(self):
+    output1 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    output2 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    self.assertEqual(output1, output2)
+
+  def testDifferentOrderForDifferentSeeds(self):
+    output1 = self.gen_outputs(lambda: self._build_ds(10), [], 100)
+    output2 = self.gen_outputs(lambda: self._build_ds(20), [], 100)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testCountNone(self):
+    output1 = self.gen_outputs(
+        lambda: self._build_ds(10, count=None), [], 100, verify_exhausted=False)
+    output2 = self.gen_outputs(
+        lambda: self._build_ds(20, count=None), [], 100, verify_exhausted=False)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testCountMinusOne(self):
+    output1 = self.gen_outputs(
+        lambda: self._build_ds(10, count=-1), [], 100, verify_exhausted=False)
+    output2 = self.gen_outputs(
+        lambda: self._build_ds(20, count=-1), [], 100, verify_exhausted=False)
+    self.assertNotEqual(output1, output2)
+    self.assertEqual(sorted(output1), sorted(output2))
+
+  def testInfiniteOutputs(self):
+    # Asserting that the iterator is exhausted after producing 100 items should
+    # fail.
+    with self.assertRaises(AssertionError):
+      self.gen_outputs(lambda: self._build_ds(10, count=None), [], 100)
+    with self.assertRaises(AssertionError):
+      self.gen_outputs(lambda: self._build_ds(10, count=-1), [], 100)
+
+
+class ShuffleAndRepeatSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_ds(self, seed):
+    return dataset_ops.Dataset.range(20).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=5, count=5, seed=seed))
+
+  def testCore(self):
+    self.run_core_tests(lambda: self._build_ds(10), lambda: self._build_ds(20),
+                        100)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f24d6b2f612cff662aa8a36085bc69a9ea1a290
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -0,0 +1,213 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline statistics gathering ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import stats_ops
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class StatsDatasetTest(test.TestCase):
+
+  def _assertSummaryHasCount(self, summary_str, tag, expected_value):
+    summary_proto = summary_pb2.Summary()
+    summary_proto.ParseFromString(summary_str)
+    for value in summary_proto.value:
+      if tag == value.tag:
+        self.assertEqual(expected_value, value.histo.num)
+        return
+    self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+
+  def _assertSummaryHasSum(self, summary_str, tag, expected_value):
+    summary_proto = summary_pb2.Summary()
+    summary_proto.ParseFromString(summary_str)
+    for value in summary_proto.value:
+      if tag == value.tag:
+        self.assertEqual(expected_value, value.histo.sum)
+        return
+    self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
+
+  def testBytesProduced(self):
+    dataset = dataset_ops.Dataset.range(100).map(
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
+            stats_ops.bytes_produced_stats("bytes_produced"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      expected_sum = 0.0
+      for i in range(100):
+        self.assertAllEqual(
+            np.array([i] * i, dtype=np.int64), sess.run(next_element))
+        summary_str = sess.run(summary_t)
+        self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
+        expected_sum += i * 8.0
+        self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      summary_str = sess.run(summary_t)
+      self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
+      self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
+
+  def testLatencyStats(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+
+  def testReinitialize(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run(stats_aggregator_subscriber)
+      for j in range(5):
+        sess.run(iterator.initializer)
+        for i in range(100):
+          self.assertEqual(i, sess.run(next_element))
+          self._assertSummaryHasCount(
+              sess.run(summary_t), "record_latency", float((j * 100) + i + 1))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(next_element)
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", (j + 1) * 100.0)
+
+  def testNoAggregatorRegistered(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testMultipleTags(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.latency_stats("record_latency_2"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(i + 1))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency_2", float(i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "record_latency_2", 100.0)
+
+  def testRepeatedTags(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency")).apply(
+            stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscriber = stats_aggregator.subscribe(iterator)
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator.initializer, stats_aggregator_subscriber])
+      for i in range(100):
+        self.assertEqual(i, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+
+  def testMultipleIteratorsSameAggregator(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator_0 = dataset.make_initializable_iterator()
+    iterator_1 = dataset.make_initializable_iterator()
+    stats_aggregator = stats_ops.StatsAggregator()
+    stats_aggregator_subscribers = [stats_aggregator.subscribe(iterator_0),
+                                    stats_aggregator.subscribe(iterator_1)]
+    next_element = iterator_0.get_next() + iterator_1.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run([iterator_0.initializer, iterator_1.initializer,
+                stats_aggregator_subscribers])
+      for i in range(100):
+        self.assertEqual(i * 2, sess.run(next_element))
+        self._assertSummaryHasCount(
+            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+
+  def testMultipleStatsAggregatorsSameIteratorFail(self):
+    dataset = dataset_ops.Dataset.range(100).apply(
+        stats_ops.latency_stats("record_latency"))
+    iterator = dataset.make_initializable_iterator()
+    stats_aggregator_0 = stats_ops.StatsAggregator()
+    stats_aggregator_1 = stats_ops.StatsAggregator()
+
+    with self.test_session() as sess:
+      sess.run(stats_aggregator_0.subscribe(iterator))
+      # TODO(mrry): Consider making this allowable (and also allowing
+      # aggregators to unsubscribe).
+      with self.assertRaises(errors.FailedPreconditionError):
+        sess.run(stats_aggregator_1.subscribe(iterator))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
index b0e72183019e4d53756542e2a2ef071111120dcd..5d34b0024c472d0393544ff3dad8acea7964345f 100644
--- a/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/zip_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
 from tensorflow.contrib.data.python.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -110,5 +111,31 @@ class ZipDatasetTest(test.TestCase):
         sess.run(get_next)
 
 
+class ZipDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def _build_dataset(self, arr):
+    components = [
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 22),
+        np.array(arr)
+    ]
+    datasets = [
+        dataset_ops.Dataset.from_tensor_slices(component)
+        for component in components
+    ]
+    return dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
+
+  def testCore(self):
+    # Equal length components
+    arr = [37.0, 38.0, 39.0, 40.0]
+    num_outputs = len(arr)
+    self.run_core_tests(lambda: self._build_dataset(arr), None, num_outputs)
+    # Variable length components
+    diff_size_arr = [1.0, 2.0]
+    self.run_core_tests(lambda: self._build_dataset(diff_size_arr),
+                        lambda: self._build_dataset(arr), 2)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 1b81cf5be9190ffab646192fb9a72fd3da7deee1..1f35ee056b7f897ce5e7488b205ecf5a05ef0268 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -14,11 +14,13 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 py_library(
     name = "dataset_ops",
     srcs = [
+        "counter.py",
         "dataset_ops.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":transformation_ops",
+        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
@@ -38,6 +40,25 @@ py_library(
     ],
 )
 
+py_library(
+    name = "random_ops",
+    srcs = [
+        "random_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
 py_library(
     name = "readers",
     srcs = [
@@ -60,6 +81,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "shuffle_ops",
+    srcs = [
+        "shuffle_ops.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_ops",
+        ":transformation_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_library(
     name = "transformation_ops",
     srcs = [
@@ -70,6 +104,7 @@ py_library(
         "interleave_ops.py",
         "resampling.py",
         "scan_ops.py",
+        "stats_ops.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -84,8 +119,10 @@ py_library(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
         "//third_party/py/numpy",
     ],
 )
@@ -117,14 +154,7 @@ tf_custom_op_py_library(
     deps = [
         ":prefetching_ops",
         "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index abc9212a87550745490b974d25a929a66287f785..63782d229e1535892686f202ca1f0833dee6ed80 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -103,6 +104,48 @@ def unbatch():
   return _apply_fn
 
 
+def filter_irregular_batches(batch_size):
+  """Transformation that filters out batches that are not of size batch_size."""
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    tensor_batch_size = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+
+    flattened = _RestructuredDataset(
+        dataset,
+        tuple(nest.flatten(dataset.output_types)),
+        output_classes=tuple(nest.flatten(dataset.output_classes)))
+
+    def _predicate(*xs):
+      """Return `True` if this element is a full batch."""
+      # Extract the dynamic batch size from the first component of the flattened
+      # batched element.
+      first_component = xs[0]
+      first_component_batch_size = array_ops.shape(
+          first_component, out_type=dtypes.int64)[0]
+
+      return math_ops.equal(first_component_batch_size, tensor_batch_size)
+
+    filtered = flattened.filter(_predicate)
+
+    maybe_constant_batch_size = tensor_util.constant_value(tensor_batch_size)
+
+    def _set_first_dimension(shape):
+      return shape.merge_with(
+          tensor_shape.vector(maybe_constant_batch_size).concatenate(shape[1:]))
+
+    known_shapes = nest.map_structure(_set_first_dimension,
+                                      dataset.output_shapes)
+    return _RestructuredDataset(
+        filtered,
+        dataset.output_types,
+        known_shapes,
+        output_classes=dataset.output_classes)
+
+  return _apply_fn
+
+
 def batch_and_drop_remainder(batch_size):
   """A batching transformation that omits the final small batch (if present).
 
@@ -135,34 +178,43 @@ def batch_and_drop_remainder(batch_size):
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    tensor_batch_size = ops.convert_to_tensor(
-        batch_size, dtype=dtypes.int64, name="batch_size")
+    batched = dataset.batch(batch_size)
+    return filter_irregular_batches(batch_size)(batched)
+
+  return _apply_fn
 
-    batched = dataset.batch(tensor_batch_size)
-    flattened = _RestructuredDataset(batched,
-                                     tuple(nest.flatten(batched.output_types)))
 
-    def _predicate(*xs):
-      """Return `True` if this element is a full batch."""
-      # Extract the dynamic batch size from the first component of the flattened
-      # batched element.
-      first_component = xs[0]
-      first_component_batch_size = array_ops.shape(
-          first_component, out_type=dtypes.int64)[0]
+def padded_batch_and_drop_remainder(batch_size,
+                                    padded_shapes,
+                                    padding_values=None):
+  """A batching and padding transformation that omits the final small batch.
 
-      return math_ops.equal(first_component_batch_size, tensor_batch_size)
+  Like @{tf.data.Dataset.padded_batch}, this transformation combines
+  consecutive elements of this dataset into batches. However, if the batch
+  size does not evenly divide the input dataset size, this transformation will
+  drop the final smaller element.
 
-    filtered = flattened.filter(_predicate)
+  See `@{tf.contrib.data.batch_and_drop_remainder}` for more details.
 
-    maybe_constant_batch_size = tensor_util.constant_value(tensor_batch_size)
+  Args:
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    padded_shapes: A nested structure of `tf.TensorShape` or
+      `tf.int64` vector tensor-like objects. See
+      @{tf.data.Dataset.padded_batch} for details.
+    padding_values: (Optional.) A nested structure of scalar-shaped
+      `tf.Tensor`. See @{tf.data.Dataset.padded_batch} for details.
 
-    def _set_first_dimension(shape):
-      return shape.merge_with(
-          tensor_shape.vector(maybe_constant_batch_size).concatenate(shape[1:]))
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}
+  """
 
-    known_shapes = nest.map_structure(_set_first_dimension,
-                                      batched.output_shapes)
-    return _RestructuredDataset(filtered, batched.output_types, known_shapes)
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    batched = dataset.padded_batch(
+        batch_size, padded_shapes=padded_shapes, padding_values=padding_values)
+    return filter_irregular_batches(batch_size)(batched)
 
   return _apply_fn
 
@@ -191,6 +243,10 @@ class DenseToSparseBatchDataset(dataset_ops.Dataset):
         output_shapes=self.output_shapes,
         output_types=self.output_types)
 
+  @property
+  def output_classes(self):
+    return (ops.Tensor, ops.Tensor, ops.Tensor)
+
   @property
   def output_shapes(self):
     num_elements = tensor_shape.Dimension(None)
@@ -206,7 +262,11 @@ class DenseToSparseBatchDataset(dataset_ops.Dataset):
 class _RestructuredDataset(dataset_ops.Dataset):
   """An internal helper for changing the structure and shape of a dataset."""
 
-  def __init__(self, dataset, output_types, output_shapes=None):
+  def __init__(self,
+               dataset,
+               output_types,
+               output_shapes=None,
+               output_classes=None):
     """Creates a new dataset with the given output types and shapes.
 
     The given `dataset` must have a structure that is convertible:
@@ -222,6 +282,8 @@ class _RestructuredDataset(dataset_ops.Dataset):
       output_types: A nested structure of `tf.DType` objects.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects.
         If omitted, the shapes will be inherited from `dataset`.
+      output_classes: (Optional.) A nested structure of class types.
+        If omitted, the class types will be inherited from `dataset`.
 
     Raises:
       ValueError: If either `output_types` or `output_shapes` is not compatible
@@ -261,10 +323,21 @@ class _RestructuredDataset(dataset_ops.Dataset):
                                                  output_shapes))
       self._output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      # Inherit class types from the original `dataset`.
+      self._output_classes = nest.pack_sequence_as(output_types,
+                                                   nest.flatten(
+                                                       dataset.output_classes))
+    else:
+      self._output_classes = output_classes
 
   def _as_variant_tensor(self):
     return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
 
+  @property
+  def output_classes(self):
+    return self._output_classes
+
   @property
   def output_types(self):
     return self._output_types
@@ -280,7 +353,6 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches):
     """See `Dataset.map()` for details."""
     super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
-
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_batches = ops.convert_to_tensor(
@@ -295,8 +367,10 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
         f=self._map_func,
         batch_size=self._batch_size,
         num_parallel_batches=self._num_parallel_batches,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     # pylint: enable=protected-access
 
   @property
diff --git a/tensorflow/contrib/data/python/ops/counter.py b/tensorflow/contrib/data/python/ops/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..63226fe78163c59025623a362d17c400fbe57c67
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/counter.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Counter Dataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import scan_ops
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+
+
+def Counter(start=0, step=1, dtype=dtypes.int64):
+  """Creates a `Dataset` of a `step`-separated count startin from `start`.
+
+  For example:
+
+  ```python
+  Dataset.count() == [0, 1, 2, ...)
+  Dataset.count(2) == [2, 3, ...)
+  Dataset.count(2, 5) == [2, 7, 12, ...)
+  Dataset.count(0, -1) == [0, -1, -2, ...)
+  Dataset.count(10, -1) == [10, 9, ...)
+  ```
+
+  Args:
+    start: starting value for count.
+    step: step size.
+    dtype: counter data type.
+
+  Returns:
+    A `Dataset` of scalar elements.
+  """
+  with ops.name_scope("counter"):
+    start = ops.convert_to_tensor(start, dtype=dtype, name="start")
+    step = ops.convert_to_tensor(step, dtype=dtype, name="step")
+    return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
+        scan_ops.scan(start, lambda state, _: (state + step, state)))
diff --git a/tensorflow/contrib/data/python/ops/dataset_ops.py b/tensorflow/contrib/data/python/ops/dataset_ops.py
index 45d6dbe7438957029b4d6b71e181cb1fc3596ecb..626a9e0edcea5928b1636c1a2a86e83657c966a5 100644
--- a/tensorflow/contrib/data/python/ops/dataset_ops.py
+++ b/tensorflow/contrib/data/python/ops/dataset_ops.py
@@ -21,7 +21,6 @@ from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import enumerate_ops
 from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import grouping
-
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.ops import gen_dataset_ops
@@ -48,6 +47,10 @@ class Dataset(dataset_ops.Dataset):
   def _as_variant_tensor(self):
     return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
 
+  @property
+  def output_classes(self):
+    return self._dataset.output_classes
+
   @property
   def output_shapes(self):
     return self._dataset.output_shapes
diff --git a/tensorflow/contrib/data/python/ops/error_ops.py b/tensorflow/contrib/data/python/ops/error_ops.py
index 238bb52b0205f9ab66f479f1b92e72ab6e38725b..aa629cba479102ee4244884e7c546615b28cf4e5 100644
--- a/tensorflow/contrib/data/python/ops/error_ops.py
+++ b/tensorflow/contrib/data/python/ops/error_ops.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.ops import gen_dataset_ops
 
 
@@ -62,8 +63,14 @@ class IgnoreErrorsDataset(dataset_ops.Dataset):
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignore_errors_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index 6df7b22fb69bb14c41a26bd630a825442f67ee23..ef91c56726e969053fdad667dda3e89430045652 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -87,15 +88,21 @@ def group_by_window(key_func,
 class _VariantDataset(dataset_ops.Dataset):
   """A Dataset wrapper for a tf.variant-typed function argument."""
 
-  def __init__(self, dataset_variant, output_types, output_shapes):
+  def __init__(self, dataset_variant, output_types, output_shapes,
+               output_classes):
     super(_VariantDataset, self).__init__()
     self._dataset_variant = dataset_variant
     self._output_types = output_types
     self._output_shapes = output_shapes
+    self._output_classes = output_classes
 
   def _as_variant_tensor(self):
     return self._dataset_variant
 
+  @property
+  def output_classes(self):
+    return self._output_classes
+
   @property
   def output_shapes(self):
     return self._output_shapes
@@ -137,13 +144,21 @@ class GroupByWindowDataset(dataset_ops.Dataset):
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping Defun for key_func."""
 
-    @function.Defun(*nest.flatten(input_dataset.output_types))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_key_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
+
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       # pylint: disable=protected-access
       if dataset_ops._should_unpack_args(nested_args):
         ret = key_func(*nested_args)
@@ -165,14 +180,15 @@ class GroupByWindowDataset(dataset_ops.Dataset):
     def tf_reduce_func(key, window_dataset_variant):
       """A wrapper for Defun that facilitates shape inference."""
       key.set_shape([])
-      window_dataset = _VariantDataset(window_dataset_variant,
-                                       input_dataset.output_types,
-                                       input_dataset.output_shapes)
+      window_dataset = _VariantDataset(
+          window_dataset_variant, input_dataset.output_types,
+          input_dataset.output_shapes, input_dataset.output_classes)
       if not isinstance(window_dataset, dataset_ops.Dataset):
         raise TypeError("`window_dataset` must return a `Dataset` object.")
       output_dataset = reduce_func(key, window_dataset)
       if not isinstance(output_dataset, dataset_ops.Dataset):
         raise TypeError("`reduce_func` must return a `Dataset` object.")
+      self._output_classes = output_dataset.output_classes
       self._output_types = output_dataset.output_types
       self._output_shapes = output_dataset.output_shapes
       return output_dataset._as_variant_tensor()  # pylint: disable=protected-access
@@ -180,6 +196,10 @@ class GroupByWindowDataset(dataset_ops.Dataset):
     self._reduce_func = tf_reduce_func
     self._reduce_func.add_to_graph(ops.get_default_graph())
 
+  @property
+  def output_classes(self):
+    return self._output_classes
+
   @property
   def output_shapes(self):
     return self._output_shapes
@@ -197,5 +217,7 @@ class GroupByWindowDataset(dataset_ops.Dataset):
         key_func=self._key_func,
         reduce_func=self._reduce_func,
         window_size_func=self._window_size_func,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py
index 74a919c1fff62cfa79b0877a3d081077ca6776f0..53324e06e7f1dc249388410f0e14e42336630cd1 100644
--- a/tensorflow/contrib/data/python/ops/interleave_ops.py
+++ b/tensorflow/contrib/data/python/ops/interleave_ops.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
@@ -35,16 +36,22 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
     super(ParallelInterleaveDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(*nest.flatten(input_dataset.output_types))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-
-      if nest.is_sequence(nested_args):
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
+      if dataset_ops._should_unpack_args(nested_args):  # pylint: disable=protected-access
         dataset = map_func(*nested_args)
       else:
         dataset = map_func(nested_args)
@@ -52,6 +59,7 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
       if not isinstance(dataset, dataset_ops.Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
 
@@ -75,8 +83,14 @@ class ParallelInterleaveDataset(dataset_ops.Dataset):
         self._block_length,
         self._sloppy,
         f=self._map_func,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/random_ops.py b/tensorflow/contrib/data/python/ops/random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d727165feabb101549567f28a2dfa07083de244
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/random_ops.py
@@ -0,0 +1,67 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Datasets for random number generators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class RandomDataset(dataset_ops.Dataset):
+  """A `Dataset` of pseudorandom values."""
+
+  def __init__(self, seed=None):
+    """A `Dataset` of pseudorandom values."""
+    super(RandomDataset, self).__init__()
+    seed, seed2 = random_seed.get_seed(seed)
+    if seed is None:
+      self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
+    else:
+      self._seed = ops.convert_to_tensor(seed, dtype=dtypes.int64, name="seed")
+    if seed2 is None:
+      self._seed2 = constant_op.constant(0, dtype=dtypes.int64, name="seed2")
+    else:
+      self._seed2 = ops.convert_to_tensor(
+          seed2, dtype=dtypes.int64, name="seed2")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.random_dataset(
+        seed=self._seed,
+        seed2=self._seed2,
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
+  @property
+  def output_shapes(self):
+    return tensor_shape.scalar()
+
+  @property
+  def output_types(self):
+    return dtypes.int64
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 2e1c3153ca78e20e2628e8754b9827b817f8c732..347e5edc7b0d479dfa260e8cec500ffaaba375be 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -23,7 +23,6 @@ from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import parsing_ops
@@ -156,8 +155,7 @@ def read_batch_features(file_pattern,
     features: A `dict` mapping feature keys to `FixedLenFeature` or
       `VarLenFeature` values. See `tf.parse_example`.
     reader: A function or class that can be called with a `filenames` tensor
-      and (optional) `reader_args` and returns a `Dataset` of serialized
-      Examples.
+      and (optional) `reader_args` and returns a `Dataset` of Examples.
     reader_args: Additional arguments to pass to the reader class.
     randomize_input: Whether the input should be randomized.
     num_epochs: Integer specifying the number of times to read through the
@@ -166,7 +164,7 @@ def read_batch_features(file_pattern,
       shuffling but would increase memory usage and startup time.
 
   Returns:
-    A dict from keys in features to Tensor or SparseTensor objects.
+    A dict from keys in features to `Tensor` or `SparseTensor` objects.
   """
   filenames = _get_file_names(file_pattern, randomize_input)
   if reader_args:
@@ -174,32 +172,17 @@ def read_batch_features(file_pattern,
   else:
     dataset = reader(filenames)
   if dataset.output_types == (dtypes.string, dtypes.string):
-    dataset = dataset.map(lambda unused_k, v: v)
-  elif dataset.output_types != dtypes.string:
-    raise TypeError("`reader` must be a dataset of `tf.string` values, "
-                    "or `(tf.string, tf.string)` key-value pairs.")
+    dataset = dataset.map(lambda _, v: v)
   if num_epochs != 1:
     dataset = dataset.repeat(num_epochs)
   if randomize_input:
     dataset = dataset.shuffle(capacity)
   dataset = dataset.batch(batch_size)
-  dataset = dataset.map(lambda x: _parse_example(x, features))
+  dataset = dataset.map(lambda x: parsing_ops.parse_example(x, features))
+  dataset = dataset.prefetch(1)
   iterator = dataset.make_one_shot_iterator()
   outputs = iterator.get_next()
-  index = 0
-  result = {}
-  for key in sorted(features.keys()):
-    feature = features[key]
-    if isinstance(feature, parsing_ops.FixedLenFeature):
-      result[key] = outputs[index]
-      index += 1
-    else:
-      result[key] = sparse_tensor_lib.SparseTensor(
-          indices=outputs[index],
-          values=outputs[index + 1],
-          dense_shape=outputs[index + 2])
-      index += 3
-  return result
+  return outputs
 
 
 def _get_file_names(file_pattern, randomize_input):
@@ -233,18 +216,6 @@ def _get_file_names(file_pattern, randomize_input):
   return file_names
 
 
-def _parse_example(serialized, features):
-  parsed = parsing_ops.parse_example(serialized, features)
-  result = []
-  for key in sorted(features.keys()):
-    val = parsed[key]
-    if isinstance(val, sparse_tensor_lib.SparseTensor):
-      result.extend([val.indices, val.values, val.dense_shape])
-    else:
-      result.append(val)
-  return tuple(result)
-
-
 class SqlDataset(contrib_dataset_ops.Dataset):
 
   def __init__(self, driver_name, data_source_name, query, output_types):
@@ -299,6 +270,10 @@ class _SqlDataset(dataset_ops.Dataset):
                                        nest.flatten(self.output_types),
                                        nest.flatten(self.output_shapes))
 
+  @property
+  def output_classes(self):
+    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
+
   @property
   def output_shapes(self):
     return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index 5acaed48a3d73e93706bdd0b5b2d614b0c565ab7..2744786e9eec4c9268ba854df6ea761339bb0b4e 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -21,6 +21,7 @@ import collections
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
@@ -43,6 +44,7 @@ class _ScanDataset(dataset_ops.Dataset):
     # Compute initial values for the state shapes and types based on
     # the initial state. These will be refined by running
     # `tf_scan_func` one or more times below.
+    # TODO(b/68937811): Allow the initial state to be a tf.SparseTensor.
     self._state_shapes = nest.pack_sequence_as(
         self._initial_state,
         [t.shape for t in nest.flatten(self._initial_state)])
@@ -51,6 +53,7 @@ class _ScanDataset(dataset_ops.Dataset):
         [t.dtype for t in nest.flatten(self._initial_state)])
 
     # Will be populated by calling `tf_scan_func`.
+    self._output_classes = None
     self._output_shapes = None
     self._output_types = None
 
@@ -65,14 +68,17 @@ class _ScanDataset(dataset_ops.Dataset):
       # Create a list in which `tf_scan_func` will store the s
       flat_new_state_shapes = []
 
-      @function.Defun(
-          *(flat_state_types + nest.flatten(input_dataset.output_types)))
+      @function.Defun(*(flat_state_types + nest.flatten(
+          sparse.as_dense_types(input_dataset.output_types,
+                                input_dataset.output_classes))))
       def tf_scan_func(*args):
         """A wrapper for Defun that facilitates shape inference."""
         # Pass in shape information from the state and input_dataset.
-        for arg, shape in zip(
-            args,
-            flat_state_shapes + nest.flatten(input_dataset.output_shapes)):
+        # TODO(b/69424092): Check that neither inputs nor outputs are sparse.
+        dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                              input_dataset.output_classes)
+        for arg, shape in zip(args,
+                              flat_state_shapes + nest.flatten(dense_shapes)):
           arg.set_shape(shape)
 
         pivot = len(flat_state_shapes)
@@ -106,6 +112,8 @@ class _ScanDataset(dataset_ops.Dataset):
                 "state. Expected %s; got %s." %
                 (self._state_types, nest.pack_sequence_as(
                     self._state_types, [t.dtype for t in flat_new_state])))
+        self._output_classes = nest.pack_sequence_as(
+            output_value, [ops.Tensor for _ in flat_output_value])
         self._output_types = nest.pack_sequence_as(
             output_value, [t.dtype for t in flat_output_value])
 
@@ -144,8 +152,14 @@ class _ScanDataset(dataset_ops.Dataset):
         nest.flatten(self._initial_state),
         self._scan_func.captured_inputs,
         f=self._scan_func,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/contrib/data/python/ops/shuffle_ops.py b/tensorflow/contrib/data/python/ops/shuffle_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..460732d65e4e652058ad821fbed45d365b4f41c1
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/shuffle_ops.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental shuffle ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.data.python.ops import random_ops
+from tensorflow.python.data.ops import dataset_ops
+
+
+def shuffle_and_repeat(buffer_size, count=None, seed=None):
+  """Shuffles and repeats a Dataset returning a new permutation for each epoch.
+
+  `dataset.apply(tf.contrib.data.shuffle_and_repeat(buffer_size, count))`
+
+  is equivalent to
+
+  `dataset.shuffle(buffer_size, reshuffle_each_iteration=True).repeat(count)`
+
+  The difference is that the latter dataset is not serializable. So,
+  if you need to checkpoint an input pipeline with reshuffling you must use
+  this implementation.
+
+  Args:
+    buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+      maximum number elements that will be buffered when prefetching.
+    count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      number of times the dataset should be repeated. The default behavior
+      (if `count` is `None` or `-1`) is for the dataset be repeated
+      indefinitely.
+    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+      random seed that will be used to create the distribution. See
+      @{tf.set_random_seed} for behavior.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.contrib.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):  # pylint: disable=missing-docstring
+    random_ds = random_ops.RandomDataset(seed).apply(
+        batching.batch_and_drop_remainder(2))
+    if count is not None and count is not -1:
+      random_ds = random_ds.take(count)
+
+    def map_fn(seeds):
+      return dataset_ops.ShuffleDataset(
+          input_dataset=dataset,
+          buffer_size=buffer_size,
+          seed=seeds[0],
+          reshuffle_each_iteration=False,
+          seed2=seeds[1])
+
+    return random_ds.flat_map(map_fn)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8875bd533ddc9e2c195646619dccf3aab5225e4
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -0,0 +1,177 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for gathering statistics from `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
+
+
+class StatsAggregator(object):
+  """A stateful resource that aggregates statistics from one or more iterators.
+
+  To record statistics, use one of the custom transformation functions defined
+  in this module when defining your @{tf.data.Dataset}. All statistics will be
+  aggregated by the `StatsAggregator` that is associated with a particular
+  iterator (see below). For example, to record the total number of bytes
+  produced by iterating over a dataset:
+
+  ```python
+  dataset = ...
+  dataset = dataset.apply(stats_ops.bytes_produced_stats("total_bytes"))
+  ```
+
+  To associate a `StatsAggregator` with a @{tf.data.Iterator} object, use
+  the following pattern:
+
+  ```python
+  dataset = ...
+  iterator = dataset.make_one_shot_iterator()
+  stats_aggregator = stats_ops.StatsAggregator()
+  set_op = stats_op.set_stats_aggregator_op(iterator, stats_aggregator)
+
+  with tf.Session() as sess:
+    # Running `set_op` will associate `iterator` with `stats_aggregator`.
+    sess.run(set_op)
+  ```
+
+  To get a protocol buffer summary of the currently aggregated statistics,
+  use the `StatsAggregator.get_summary()` tensor. The easiest way to do this
+  is to add the returned tensor to the @{tf.GraphKeys.SUMMARIES} collection,
+  so that the summaries will be included with any existing summaries.
+
+  ```python
+  stats_aggregator = stats_ops.StatsAggregator()
+  stats_summary = stats_aggregator.get_summary()
+  tf.add_to_collection(tf.GraphKeys.SUMMARIES, stats_summary)
+  ```
+
+  Note: This interface is experimental and expected to change. In particular,
+  we expect to add other implementations of `StatsAggregator` that provide
+  different ways of exporting statistics, and add more types of statistics.
+  """
+
+  def __init__(self):
+    """Creates a `StatsAggregator`."""
+    self._resource = gen_dataset_ops.stats_aggregator_handle()
+
+  def get_summary(self):
+    """Returns a string @{tf.Tensor} that summarizes the aggregated statistics.
+
+    The returned tensor will contain a serialized @{tf.summary.Summary} protocol
+    buffer, which can be used with the standard TensorBoard logging facilities.
+
+    Returns:
+      A scalar string @{tf.Tensor} that summarizes the aggregated statistics.
+    """
+    return gen_dataset_ops.stats_aggregator_summary(self._resource)
+
+  def subscribe(self, iterator):
+    """Returns a @{tf.Operation} to associate this aggregator with `iterator`.
+
+    Note: Each @{tf.data.Iterator} can be associated with at most one
+    `StatsAggregator`. After running the operation that this function
+    returns, all statistics recorded in the iteration of `iterator`
+    will be stored in `stats_aggregator`.
+
+    Args:
+      iterator: A @{tf.data.Iterator} object.
+
+    Returns:
+      A @{tf.Operation} that, when run, associates this aggregator with
+      `iterator`.
+    """
+    if not isinstance(iterator, iterator_ops.Iterator):
+      raise TypeError("`iterator` must be a `tf.data.Iterator` object.")
+    return gen_dataset_ops.iterator_set_stats_aggregator(
+        iterator._iterator_resource, self._resource)  # pylint: disable=protected-access
+
+
+def bytes_produced_stats(tag):
+  """Records the number of bytes produced by each element of the input dataset.
+
+  To consume the statistics, associate a `StatsAggregator` with an iterator
+  over the output dataset.
+
+  Args:
+    tag: String. All statistics recorded by the returned transformation will
+      be associated with the given `tag`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.contrib.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _StatsDataset(dataset, gen_dataset_ops.bytes_produced_stats_dataset,
+                         tag)
+
+  return _apply_fn
+
+
+def latency_stats(tag):
+  """Records the latency of producing each element of the input dataset.
+
+  To consume the statistics, associate a `StatsAggregator` with an iterator
+  over the output dataset.
+
+  Args:
+    tag: String. All statistics recorded by the returned transformation will
+      be associated with the given `tag`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.contrib.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _StatsDataset(dataset, gen_dataset_ops.latency_stats_dataset, tag)
+
+  return _apply_fn
+
+
+class _StatsDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and also records statistics."""
+
+  def __init__(self, input_dataset, op_function, tag):
+    super(_StatsDataset, self).__init__()
+    self._input_dataset = input_dataset
+    self._op_function = op_function
+    self._tag = ops.convert_to_tensor(tag, dtype=dtypes.string)
+
+  def _as_variant_tensor(self):
+    return self._op_function(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._tag,
+        output_shapes=nest.flatten(self.output_shapes),
+        output_types=nest.flatten(self.output_types))
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
diff --git a/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig b/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
index d3d201afd5761e7c5c136301c779222bedc68492..cafb9314caee1c4907786b8101e7c71bd7095306 100644
--- a/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
+++ b/tensorflow/contrib/decision_trees/proto/generic_tree_model_proto.swig
@@ -2,7 +2,7 @@
 
 %include "net/proto/swig/protofunc.swig"
 
-#ifndef MUST_USE_RESULT
+#ifndef ABSL_MUST_USE_RESULT
 #error Use this file only as a %include or %import after google.swig.
 #endif
 
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 4a4f3789016bed5db475da81b2448b682f158353..b2c641f8ab3ea23c5135042e4b1223d487ae8cbc 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -2,12 +2,15 @@
 #   Contains ops for statistical distributions (with pdf, cdf, sample, etc...).
 #   APIs here are meant to evolve over time.
 
+package(default_visibility = [
+    "//learning/brain/contrib/bayesflow:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
+
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-package(default_visibility = ["//tensorflow:__subpackages__"])
-
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 py_library(
@@ -137,6 +140,23 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "cauchy_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/cauchy_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "chi2_test",
     srcs = ["python/kernel_tests/chi2_test.py"],
@@ -184,6 +204,24 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "half_normal_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/half_normal_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "inverse_gamma_test",
     srcs = ["python/kernel_tests/inverse_gamma_test.py"],
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 16f6533e57347a5fe41b017c9855d216fba9da82..66827179e9fa1bea852f55246c263c4696cf3bdc 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops.binomial import *
+from tensorflow.contrib.distributions.python.ops.cauchy import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
 from tensorflow.contrib.distributions.python.ops.conditional_distribution import *
 from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
@@ -35,6 +36,7 @@ from tensorflow.contrib.distributions.python.ops.distribution_util import softpl
 from tensorflow.contrib.distributions.python.ops.distribution_util import tridiag
 from tensorflow.contrib.distributions.python.ops.estimator import *
 from tensorflow.contrib.distributions.python.ops.geometric import *
+from tensorflow.contrib.distributions.python.ops.half_normal import *
 from tensorflow.contrib.distributions.python.ops.independent import *
 from tensorflow.contrib.distributions.python.ops.inverse_gamma import *
 from tensorflow.contrib.distributions.python.ops.logistic import *
@@ -83,6 +85,7 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'bijectors',
+    'Cauchy',
     'ConditionalDistribution',
     'ConditionalTransformedDistribution',
     'FULLY_REPARAMETERIZED',
@@ -105,6 +108,7 @@ _allowed_symbols = [
     'Gamma',
     'GammaWithSoftplusConcentrationRate',
     'Geometric',
+    'HalfNormal',
     'Independent',
     'InverseGamma',
     'InverseGammaWithSoftplusConcentrationRate',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
index 25a9b6f5fe2ed6d218d6b44650fce17fa89c0664..288d9d8dd6f17cd6348d3d72aea4408e26913ebd 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/masked_autoregressive_test.py
@@ -22,9 +22,9 @@ import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import test_util
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import Invert
+from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import _gen_mask
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import masked_autoregressive_default_template
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import MaskedAutoregressiveFlow
-from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive_impl import _gen_mask
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
index 38b3a23c2d684a6f89b7c4be4a763c649bf4de15..49451446b56d290f130c5db90c13b94974d92dc9 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py
@@ -28,8 +28,19 @@ from tensorflow.python.ops.distributions.bijector_test_util import assert_biject
 from tensorflow.python.platform import test
 
 
-class ReshapeBijectorTest(test.TestCase):
-  """Tests correctness of the reshape transformation."""
+class _ReshapeBijectorTest(object):
+  """Base class for testing the reshape transformation.
+
+  Methods defined in this class call a method self.build_shapes() that
+  is implemented by subclasses defined below, returning respectively
+   ReshapeBijectorTestStatic: static shapes,
+   ReshapeBijectorTestDynamic: shape placeholders of known ndims, and
+   ReshapeBijectorTestDynamicNdims: shape placeholders of unspecified ndims,
+  so that each test in this base class is automatically run over all
+  three cases. The subclasses also implement assertRaisesError to test
+  for either Python exceptions (in the case of static shapes) or
+  TensorFlow op errors (dynamic shapes).
+  """
 
   def setUp(self):
     self._rng = np.random.RandomState(42)
@@ -40,9 +51,10 @@ class ReshapeBijectorTest(test.TestCase):
     expected_y = np.reshape(expected_x, [4, 6])
 
     with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([3, 2], [6,])
       bijector = Reshape(
-          event_shape_out=[6,],
-          event_shape_in=[3, 2],
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
           validate_args=True)
       (x_,
        y_,
@@ -52,66 +64,23 @@ class ReshapeBijectorTest(test.TestCase):
            bijector.forward(expected_x),
            bijector.forward_log_det_jacobian(expected_x),
            bijector.inverse_log_det_jacobian(expected_y),
-       ))
+       ), feed_dict=feed_dict)
       self.assertEqual("reshape", bijector.name)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
       self.assertAllClose(0., fldj_, rtol=1e-6, atol=0)
       self.assertAllClose(0., ildj_, rtol=1e-6, atol=0)
 
-  def testEventShapeDynamicNdims(self):
-    """Check forward/inverse shape methods with dynamic ndims."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_in_ph = array_ops.placeholder(dtype=dtypes.int32)
-
-    shape_out = tensor_shape.TensorShape([2, 3])
-    shape_out_ph = array_ops.placeholder(dtype=dtypes.int32)
-
-    bijector = Reshape(
-        event_shape_out=shape_out_ph,
-        event_shape_in=shape_in_ph, validate_args=True)
-
-    # using the _tensor methods, we should always get a fully-specified
-    # result since these are evaluated at graph runtime.
-    with self.test_session() as sess:
-      (shape_out_,
-       shape_in_) = sess.run((
-           bijector.forward_event_shape_tensor(shape_in),
-           bijector.inverse_event_shape_tensor(shape_out),
-       ), feed_dict={
-           shape_in_ph: shape_in,
-           shape_out_ph: shape_out,
-       })
-      self.assertAllEqual(shape_out, shape_out_)
-      self.assertAllEqual(shape_in, shape_in_)
-
-  def testEventShapeDynamic(self):
-    """Check shape methods with static ndims but dynamic shape."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_in_partial = tensor_shape.TensorShape([None,])
-    shape_in_ph = array_ops.placeholder(
-        shape=[1,], dtype=dtypes.int32)
-
-    shape_out = tensor_shape.TensorShape([2, 3])
-    shape_out_partial = tensor_shape.TensorShape([None, None])
-    shape_out_ph = array_ops.placeholder(
-        shape=[2,], dtype=dtypes.int32)
+  def testEventShapeTensor(self):
+    """Test event_shape_tensor methods when even ndims may be dynamic."""
 
+    shape_in_static = [2, 3]
+    shape_out_static = [6,]
+    shape_in, shape_out, feed_dict = self.build_shapes(shape_in_static,
+                                                       shape_out_static)
     bijector = Reshape(
-        event_shape_out=shape_out_ph,
-        event_shape_in=shape_in_ph,
-        validate_args=True)
-
-    # if event shapes are not statically available, should
-    # return partially-specified TensorShapes.
-    self.assertAllEqual(
-        bijector.forward_event_shape(shape_in).as_list(),
-        shape_out_partial.as_list())
-    self.assertAllEqual(
-        bijector.inverse_event_shape(shape_out).as_list(),
-        shape_in_partial.as_list())
+        event_shape_out=shape_out,
+        event_shape_in=shape_in, validate_args=True)
 
     # using the _tensor methods, we should always get a fully-specified
     # result since these are evaluated at graph runtime.
@@ -120,42 +89,9 @@ class ReshapeBijectorTest(test.TestCase):
        shape_in_) = sess.run((
            bijector.forward_event_shape_tensor(shape_in),
            bijector.inverse_event_shape_tensor(shape_out),
-       ), feed_dict={
-           shape_in_ph: shape_in,
-           shape_out_ph: shape_out,
-       })
-      self.assertAllEqual(shape_out, shape_out_)
-      self.assertAllEqual(shape_in, shape_in_)
-
-  def testEventShapeStatic(self):
-    """Check shape methods when shape is statically known."""
-
-    shape_in = tensor_shape.TensorShape([6,])
-    shape_out = tensor_shape.TensorShape([2, 3])
-
-    bijector_static = Reshape(
-        event_shape_out=shape_out,
-        event_shape_in=shape_in,
-        validate_args=True)
-
-    # test that forward_ and inverse_event_shape do sensible things
-    # when shapes are statically known.
-    self.assertEqual(
-        bijector_static.forward_event_shape(shape_in),
-        shape_out)
-    self.assertEqual(
-        bijector_static.inverse_event_shape(shape_out),
-        shape_in)
-
-    with self.test_session() as sess:
-      (shape_out_static_,
-       shape_in_static_,
-      ) = sess.run((
-          bijector_static.forward_event_shape_tensor(shape_in),
-          bijector_static.inverse_event_shape_tensor(shape_out),
-      ))
-      self.assertAllEqual(shape_out, shape_out_static_)
-      self.assertAllEqual(shape_in, shape_in_static_)
+       ), feed_dict=feed_dict)
+      self.assertAllEqual(shape_out_static, shape_out_)
+      self.assertAllEqual(shape_in_static, shape_in_)
 
   def testScalarReshape(self):
     """Test reshaping to and from a scalar shape ()."""
@@ -166,11 +102,11 @@ class ReshapeBijectorTest(test.TestCase):
     expected_x_scalar = np.random.randn(1,)
     expected_y_scalar = expected_x_scalar[0]
 
+    shape_in, shape_out, feed_dict = self.build_shapes([], [1,])
     with self.test_session() as sess:
       bijector = Reshape(
-          event_shape_out=[],
-          event_shape_in=[1,], validate_args=True)
-
+          event_shape_out=shape_in,
+          event_shape_in=shape_out, validate_args=True)
       (x_,
        y_,
        x_scalar_,
@@ -180,53 +116,178 @@ class ReshapeBijectorTest(test.TestCase):
           bijector.forward(expected_x),
           bijector.inverse(expected_y_scalar),
           bijector.forward(expected_x_scalar),
-      ))
+      ), feed_dict=feed_dict)
       self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_y_scalar, y_scalar_, rtol=1e-6, atol=0)
       self.assertAllClose(expected_x_scalar, x_scalar_, rtol=1e-6, atol=0)
 
-  def testRaisesOpError(self):
-    x1 = np.random.randn(4, 2, 3)
-    x2 = np.random.randn(4, 3, 2)
-    x3 = np.random.randn(4, 5, 1, 1)
+  def testMultipleUnspecifiedDimensionsOpError(self):
 
     with self.test_session() as sess:
-      shape_in_ph = array_ops.placeholder(shape=[2,], dtype=dtypes.int32)
-      shape_out_ph = array_ops.placeholder(shape=[3,], dtype=dtypes.int32)
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [4, -1, -1,])
       bijector = Reshape(
-          event_shape_out=shape_out_ph,
-          event_shape_in=shape_in_ph,
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
           validate_args=True)
 
-      with self.assertRaisesOpError(
+      with self.assertRaisesError(
+          "elements must have at most one `-1`."):
+        sess.run(bijector.forward_event_shape_tensor(shape_in),
+                 feed_dict=feed_dict)
+
+  def testInvalidDimensionsOpError(self):
+
+    with self.test_session() as sess:
+
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 2, -2,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(
+          "elements must be either positive integers or `-1`."):
+        sess.run(bijector.forward_event_shape_tensor(shape_in),
+                 feed_dict=feed_dict)
+
+  def testValidButNonMatchingInputOpError(self):
+    x = np.random.randn(4, 3, 2)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 6, 1,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      # Here we pass in a tensor (x) whose shape is compatible with
+      # the output shape, so tf.reshape will throw no error, but
+      # doesn't match the expected input shape.
+      with self.assertRaisesError(
           "Input `event_shape` does not match `event_shape_in`."):
-        sess.run(bijector.forward(x2),
-                 feed_dict={shape_out_ph: [1, 6, 1],
-                            shape_in_ph: [2, 3]})
+        sess.run(bijector.forward(x),
+                 feed_dict=feed_dict)
 
-      with self.assertRaisesOpError(
-          "event_shape_out entries must be positive."):
-        sess.run(bijector.forward(x1),
-                 feed_dict={shape_out_ph: [-1, -1, 6],
-                            shape_in_ph: [2, 3]})
+  def testValidButNonMatchingInputPartiallySpecifiedOpError(self):
+    x = np.random.randn(4, 3, 2)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([2, -1], [1, 6, 1,])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+
+      with self.assertRaisesError(
+          "Input `event_shape` does not match `event_shape_in`."):
+        sess.run(bijector.forward(x),
+                 feed_dict=feed_dict)
+
+  def testInputOutputMismatchOpError(self):
+    x1 = np.random.randn(4, 2, 3)
+    x2 = np.random.randn(4, 1, 1, 5)
+
+    with self.test_session() as sess:
+      shape_in, shape_out, fd_mismatched = self.build_shapes([2, 3],
+                                                             [1, 1, 5])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
 
       # test that *all* methods check basic assertions
-      fd_mismatched = {shape_out_ph: [1, 1, 5], shape_in_ph: [2, 3]}
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
+      with self.assertRaisesError(
+          "Input to reshape is a tensor with"):
         sess.run(bijector.forward(x1), feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.inverse(x3), feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.inverse_log_det_jacobian(x3),
-                 feed_dict=fd_mismatched)
-      with self.assertRaisesOpError(
-          "Input/output `event_size`s do not match."):
-        sess.run(bijector.forward_log_det_jacobian(x1),
-                 feed_dict=fd_mismatched)
+      with self.assertRaisesError(
+          "Input to reshape is a tensor with"):
+        sess.run(bijector.inverse(x2), feed_dict=fd_mismatched)
+
+  def testOneShapePartiallySpecified(self):
+    expected_x = np.random.randn(4, 6)
+    expected_y = np.reshape(expected_x, [4, 2, 3])
+
+    with self.test_session() as sess:
+      # one of input/output shapes is partially specified
+      shape_in, shape_out, feed_dict = self.build_shapes([-1,], [2, 3])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def testBothShapesPartiallySpecified(self):
+    expected_x = np.random.randn(4, 2, 3)
+    expected_y = np.reshape(expected_x, [4, 3, 2])
+    with self.test_session() as sess:
+      shape_in, shape_out, feed_dict = self.build_shapes([-1, 3], [-1, 2])
+      bijector = Reshape(
+          event_shape_out=shape_out,
+          event_shape_in=shape_in,
+          validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def testDefaultVectorShape(self):
+    expected_x = np.random.randn(4, 4)
+    expected_y = np.reshape(expected_x, [4, 2, 2])
+    with self.test_session() as sess:
+      _, shape_out, feed_dict = self.build_shapes([-1,], [-1, 2])
+      bijector = Reshape(shape_out,
+                         validate_args=True)
+      (x_,
+       y_,
+      ) = sess.run((
+          bijector.inverse(expected_y),
+          bijector.forward(expected_x),
+      ), feed_dict=feed_dict)
+      self.assertAllClose(expected_y, y_, rtol=1e-6, atol=0)
+      self.assertAllClose(expected_x, x_, rtol=1e-6, atol=0)
+
+  def build_shapes(self, *args, **kwargs):
+    raise NotImplementedError("Subclass failed to implement `build_shapes`.")
+
+
+class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_static = shape_in
+    shape_out_static = shape_out
+    feed_dict = {}
+    return shape_in_static, shape_out_static, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesRegexp(Exception, msg)
+
+  def testEventShape(self):
+    shape_in_static = tensor_shape.TensorShape([2, 3])
+    shape_out_static = tensor_shape.TensorShape([6,])
+    bijector = Reshape(
+        event_shape_out=shape_out_static,
+        event_shape_in=shape_in_static, validate_args=True)
+
+    # test that forward_ and inverse_event_shape do sensible things
+    # when shapes are statically known.
+    self.assertEqual(
+        bijector.forward_event_shape(shape_in_static),
+        shape_out_static)
+    self.assertEqual(
+        bijector.inverse_event_shape(shape_out_static),
+        shape_in_static)
 
   def testBijectiveAndFinite(self):
     x = np.random.randn(4, 2, 3)
@@ -238,5 +299,32 @@ class ReshapeBijectorTest(test.TestCase):
           validate_args=True)
       assert_bijective_and_finite(bijector, x, y, rtol=1e-6, atol=0)
 
+
+class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_ph = array_ops.placeholder(shape=(len(shape_in),),
+                                        dtype=dtypes.int32)
+    shape_out_ph = array_ops.placeholder(shape=(len(shape_out),),
+                                         dtype=dtypes.int32)
+    feed_dict = {shape_in_ph: shape_in, shape_out_ph: shape_out}
+    return shape_in_ph, shape_out_ph, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesOpError(msg)
+
+
+class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest):
+
+  def build_shapes(self, shape_in, shape_out):
+    shape_in_ph = array_ops.placeholder(shape=None, dtype=dtypes.int32)
+    shape_out_ph = array_ops.placeholder(shape=None, dtype=dtypes.int32)
+    feed_dict = {shape_in_ph: shape_in, shape_out_ph: shape_out}
+    return shape_in_ph, shape_out_ph, feed_dict
+
+  def assertRaisesError(self, msg):
+    return self.assertRaisesOpError(msg)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..73747db31c86b67eaad5aeab7d5e80191e12b333
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py
@@ -0,0 +1,438 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Cauchy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import cauchy as cauchy_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+
+stats = try_import("scipy.stats")
+
+
+class CauchyTest(test.TestCase):
+
+  def setUp(self):
+    self._rng = np.random.RandomState(123)
+
+  def assertAllFinite(self, tensor):
+    is_finite = np.isfinite(tensor.eval())
+    all_true = np.ones_like(is_finite, dtype=np.bool)
+    self.assertAllEqual(all_true, is_finite)
+
+  def _testParamShapes(self, sample_shape, expected):
+    with self.test_session():
+      param_shapes = cauchy_lib.Cauchy.param_shapes(sample_shape)
+      loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
+      self.assertAllEqual(expected, loc_shape.eval())
+      self.assertAllEqual(expected, scale_shape.eval())
+      loc = array_ops.zeros(loc_shape)
+      scale = array_ops.ones(scale_shape)
+      self.assertAllEqual(expected,
+                          array_ops.shape(
+                              cauchy_lib.Cauchy(loc, scale).sample()).eval())
+
+  def _testParamStaticShapes(self, sample_shape, expected):
+    param_shapes = cauchy_lib.Cauchy.param_static_shapes(sample_shape)
+    loc_shape, scale_shape = param_shapes["loc"], param_shapes["scale"]
+    self.assertEqual(expected, loc_shape)
+    self.assertEqual(expected, scale_shape)
+
+  def testParamShapes(self):
+    sample_shape = [10, 3, 4]
+    self._testParamShapes(sample_shape, sample_shape)
+    self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
+
+  def testParamStaticShapes(self):
+    sample_shape = [10, 3, 4]
+    self._testParamStaticShapes(sample_shape, sample_shape)
+    self._testParamStaticShapes(
+        tensor_shape.TensorShape(sample_shape), sample_shape)
+
+  def testCauchyLogPDF(self):
+    with self.test_session():
+      batch_size = 6
+      loc = constant_op.constant([3.0] * batch_size)
+      scale = constant_op.constant([np.sqrt(10.0)] * batch_size)
+      x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      log_pdf = cauchy.log_prob(x)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+                          log_pdf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
+
+      pdf = cauchy.prob(x)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape, pdf.eval().shape)
+
+      if not stats:
+        return
+      expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
+  def testCauchyLogPDFMultidimensional(self):
+    with self.test_session():
+      batch_size = 6
+      loc = constant_op.constant([[3.0, -3.0]] * batch_size)
+      scale = constant_op.constant(
+          [[np.sqrt(10.0), np.sqrt(15.0)]] * batch_size)
+      x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      log_pdf = cauchy.log_prob(x)
+      log_pdf_values = log_pdf.eval()
+      self.assertEqual(log_pdf.shape, (6, 2))
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), log_pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+                          log_pdf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, log_pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape, log_pdf.eval().shape)
+
+      pdf = cauchy.prob(x)
+      pdf_values = pdf.eval()
+      self.assertEqual(pdf.shape, (6, 2))
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), pdf_values.shape)
+      self.assertAllEqual(cauchy.batch_shape, pdf.shape)
+      self.assertAllEqual(cauchy.batch_shape, pdf_values.shape)
+
+      if not stats:
+        return
+      expected_log_pdf = stats.cauchy(loc.eval(), scale.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf_values)
+      self.assertAllClose(np.exp(expected_log_pdf), pdf_values)
+
+  def testCauchyCDF(self):
+    with self.test_session():
+      batch_size = 50
+      loc = self._rng.randn(batch_size)
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+      cdf = cauchy.cdf(x)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, cdf.shape)
+      self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
+      if not stats:
+        return
+      expected_cdf = stats.cauchy(loc, scale).cdf(x)
+      self.assertAllClose(expected_cdf, cdf.eval(), atol=0)
+
+  def testCauchySurvivalFunction(self):
+    with self.test_session():
+      batch_size = 50
+      loc = self._rng.randn(batch_size)
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      sf = cauchy.survival_function(x)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, sf.shape)
+      self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
+      if not stats:
+        return
+      expected_sf = stats.cauchy(loc, scale).sf(x)
+      self.assertAllClose(expected_sf, sf.eval(), atol=0)
+
+  def testCauchyLogCDF(self):
+    with self.test_session():
+      batch_size = 50
+      loc = self._rng.randn(batch_size)
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-100.0, 10.0, batch_size).astype(np.float64)
+
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      cdf = cauchy.log_cdf(x)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), cdf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, cdf.shape)
+      self.assertAllEqual(cauchy.batch_shape, cdf.eval().shape)
+
+      if not stats:
+        return
+      expected_cdf = stats.cauchy(loc, scale).logcdf(x)
+      self.assertAllClose(expected_cdf, cdf.eval(), atol=0, rtol=1e-5)
+
+  def testFiniteGradientAtDifficultPoints(self):
+    for dtype in [np.float32, np.float64]:
+      g = ops.Graph()
+      with g.as_default():
+        loc = variables.Variable(dtype(0.0))
+        scale = variables.Variable(dtype(1.0))
+        dist = cauchy_lib.Cauchy(loc=loc, scale=scale)
+        x = np.array([-100., -20., -5., 0., 5., 20., 100.]).astype(dtype)
+        for func in [
+            dist.cdf, dist.log_cdf, dist.survival_function,
+            dist.log_survival_function, dist.log_prob, dist.prob
+        ]:
+          value = func(x)
+          grads = gradients_impl.gradients(value, [loc, scale])
+          with self.test_session(graph=g):
+            variables.global_variables_initializer().run()
+            self.assertAllFinite(value)
+            self.assertAllFinite(grads[0])
+            self.assertAllFinite(grads[1])
+
+  def testCauchyLogSurvivalFunction(self):
+    with self.test_session():
+      batch_size = 50
+      loc = self._rng.randn(batch_size)
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-10.0, 100.0, batch_size).astype(np.float64)
+
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      sf = cauchy.log_survival_function(x)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), sf.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, sf.shape)
+      self.assertAllEqual(cauchy.batch_shape, sf.eval().shape)
+
+      if not stats:
+        return
+      expected_sf = stats.cauchy(loc, scale).logsf(x)
+      self.assertAllClose(expected_sf, sf.eval(), atol=0, rtol=1e-5)
+
+  def testCauchyEntropy(self):
+    with self.test_session():
+      loc = np.array([1.0, 1.0, 1.0])
+      scale = np.array([[1.0, 2.0, 3.0]])
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      entropy = cauchy.entropy()
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), entropy.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(),
+                          entropy.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, entropy.shape)
+      self.assertAllEqual(cauchy.batch_shape, entropy.eval().shape)
+
+      if not stats:
+        return
+      expected_entropy = stats.cauchy(loc, scale[0]).entropy().reshape((1, 3))
+      self.assertAllClose(expected_entropy, entropy.eval())
+
+  def testCauchyMode(self):
+    with self.test_session():
+      # Mu will be broadcast to [7, 7, 7].
+      loc = [7.]
+      scale = [11., 12., 13.]
+
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      self.assertAllEqual((3,), cauchy.mode().shape)
+      self.assertAllEqual([7., 7, 7], cauchy.mode().eval())
+
+  def testCauchyMean(self):
+    with self.test_session():
+      loc = [1., 2., 3.]
+      scale = [7.]
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      self.assertAllEqual((3,), cauchy.mean().shape)
+      self.assertAllEqual([np.nan] * 3, cauchy.mean().eval())
+
+  def testCauchyNanMean(self):
+    with self.test_session():
+      loc = [1., 2., 3.]
+      scale = [7.]
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
+
+      with self.assertRaises(ValueError):
+        cauchy.mean().eval()
+
+  def testCauchyQuantile(self):
+    with self.test_session():
+      batch_size = 50
+      loc = self._rng.randn(batch_size)
+      scale = self._rng.rand(batch_size) + 1.0
+      p = np.linspace(0.000001, 0.999999, batch_size).astype(np.float64)
+
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+      x = cauchy.quantile(p)
+
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.shape)
+      self.assertAllEqual(cauchy.batch_shape_tensor().eval(), x.eval().shape)
+      self.assertAllEqual(cauchy.batch_shape, x.shape)
+      self.assertAllEqual(cauchy.batch_shape, x.eval().shape)
+
+      if not stats:
+        return
+      expected_x = stats.cauchy(loc, scale).ppf(p)
+      self.assertAllClose(expected_x, x.eval(), atol=0.)
+
+  def testCauchyVariance(self):
+    with self.test_session():
+      # scale will be broadcast to [7, 7, 7]
+      loc = [1., 2., 3.]
+      scale = [7.]
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      self.assertAllEqual((3,), cauchy.variance().shape)
+      self.assertAllEqual([np.nan] * 3, cauchy.variance().eval())
+
+  def testCauchyNanVariance(self):
+    with self.test_session():
+      # scale will be broadcast to [7, 7, 7]
+      loc = [1., 2., 3.]
+      scale = [7.]
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
+
+      with self.assertRaises(ValueError):
+        cauchy.variance().eval()
+
+  def testCauchyStandardDeviation(self):
+    with self.test_session():
+      # scale will be broadcast to [7, 7, 7]
+      loc = [1., 2., 3.]
+      scale = [7.]
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      self.assertAllEqual((3,), cauchy.stddev().shape)
+      self.assertAllEqual([np.nan] * 3, cauchy.stddev().eval())
+
+  def testCauchyNanStandardDeviation(self):
+    with self.test_session():
+      # scale will be broadcast to [7, 7, 7]
+      loc = [1., 2., 3.]
+      scale = [7.]
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale, allow_nan_stats=False)
+
+      with self.assertRaises(ValueError):
+        cauchy.stddev().eval()
+
+  def testCauchySample(self):
+    with self.test_session():
+      loc = constant_op.constant(3.0)
+      scale = constant_op.constant(1.0)
+      loc_v = 3.0
+      n = constant_op.constant(100000)
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+      samples = cauchy.sample(n)
+      sample_values = samples.eval()
+
+      self.assertEqual(sample_values.shape, (100000,))
+      self.assertAllClose(np.median(sample_values), loc_v, atol=1e-1)
+
+      expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+          tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
+
+      self.assertAllEqual(expected_shape, samples.shape)
+      self.assertAllEqual(expected_shape, sample_values.shape)
+
+      expected_shape = (
+          tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
+
+      self.assertAllEqual(expected_shape, samples.shape)
+      self.assertAllEqual(expected_shape, sample_values.shape)
+
+  def testCauchySampleMultiDimensional(self):
+    with self.test_session():
+      batch_size = 2
+      loc = constant_op.constant([[3.0, -3.0]] * batch_size)
+      scale = constant_op.constant([[0.5, 1.0]] * batch_size)
+      loc_v = [3.0, -3.0]
+      n = constant_op.constant(100000)
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+      samples = cauchy.sample(n)
+      sample_values = samples.eval()
+      self.assertEqual(samples.shape, (100000, batch_size, 2))
+      self.assertAllClose(
+          np.median(sample_values[:, 0, 0]), loc_v[0], atol=1e-1)
+      self.assertAllClose(
+          np.median(sample_values[:, 0, 1]), loc_v[1], atol=1e-1)
+
+      expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+          tensor_shape.TensorShape(cauchy.batch_shape_tensor().eval()))
+      self.assertAllEqual(expected_shape, samples.shape)
+      self.assertAllEqual(expected_shape, sample_values.shape)
+
+      expected_shape = (
+          tensor_shape.TensorShape([n.eval()]).concatenate(cauchy.batch_shape))
+      self.assertAllEqual(expected_shape, samples.shape)
+      self.assertAllEqual(expected_shape, sample_values.shape)
+
+  def testCauchyNegativeLocFails(self):
+    with self.test_session():
+      cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True)
+      with self.assertRaisesOpError("Condition x > 0 did not hold"):
+        cauchy.mode().eval()
+
+  def testCauchyShape(self):
+    with self.test_session():
+      loc = constant_op.constant([-3.0] * 5)
+      scale = constant_op.constant(11.0)
+      cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+      self.assertEqual(cauchy.batch_shape_tensor().eval(), [5])
+      self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape([5]))
+      self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
+      self.assertEqual(cauchy.event_shape, tensor_shape.TensorShape([]))
+
+  def testCauchyShapeWithPlaceholders(self):
+    loc = array_ops.placeholder(dtype=dtypes.float32)
+    scale = array_ops.placeholder(dtype=dtypes.float32)
+    cauchy = cauchy_lib.Cauchy(loc=loc, scale=scale)
+
+    with self.test_session() as sess:
+      # get_batch_shape should return an "<unknown>" tensor.
+      self.assertEqual(cauchy.batch_shape, tensor_shape.TensorShape(None))
+      self.assertEqual(cauchy.event_shape, ())
+      self.assertAllEqual(cauchy.event_shape_tensor().eval(), [])
+      self.assertAllEqual(
+          sess.run(
+              cauchy.batch_shape_tensor(),
+              feed_dict={
+                  loc: 5.0,
+                  scale: [1.0, 2.0]
+              }), [2])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4e75660083dc2edd1759a3a54e221d9e8a268c3
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py
@@ -0,0 +1,320 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for initializers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import half_normal as hn_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  module = None
+  try:
+    module = importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+  return module
+
+stats = try_import("scipy.stats")
+
+
+class HalfNormalTest(test.TestCase):
+
+  def setUp(self):
+    self._rng = np.random.RandomState(123)
+
+  def assertAllFinite(self, tensor):
+    is_finite = np.isfinite(tensor.eval())
+    all_true = np.ones_like(is_finite, dtype=np.bool)
+    self.assertAllEqual(all_true, is_finite)
+
+  def _testParamShapes(self, sample_shape, expected):
+    with self.test_session():
+      param_shapes = hn_lib.HalfNormal.param_shapes(sample_shape)
+      scale_shape = param_shapes["scale"]
+      self.assertAllEqual(expected, scale_shape.eval())
+      scale = array_ops.ones(scale_shape)
+      self.assertAllEqual(
+          expected,
+          array_ops.shape(hn_lib.HalfNormal(scale).sample()).eval())
+
+  def _testParamStaticShapes(self, sample_shape, expected):
+    param_shapes = hn_lib.HalfNormal.param_static_shapes(sample_shape)
+    scale_shape = param_shapes["scale"]
+    self.assertEqual(expected, scale_shape)
+
+  def _testBatchShapes(self, dist, tensor):
+    self.assertAllEqual(dist.batch_shape_tensor().eval(), tensor.shape)
+    self.assertAllEqual(dist.batch_shape_tensor().eval(), tensor.eval().shape)
+    self.assertAllEqual(dist.batch_shape, tensor.shape)
+    self.assertAllEqual(dist.batch_shape, tensor.eval().shape)
+
+  def testParamShapes(self):
+    sample_shape = [10, 3, 4]
+    self._testParamShapes(sample_shape, sample_shape)
+    self._testParamShapes(constant_op.constant(sample_shape), sample_shape)
+
+  def testParamStaticShapes(self):
+    sample_shape = [10, 3, 4]
+    self._testParamStaticShapes(sample_shape, sample_shape)
+    self._testParamStaticShapes(
+        tensor_shape.TensorShape(sample_shape), sample_shape)
+
+  def testHalfNormalLogPDF(self):
+    with self.test_session():
+      batch_size = 6
+      scale = constant_op.constant([3.0] * batch_size)
+      x = np.array([-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], dtype=np.float32)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      log_pdf = halfnorm.log_prob(x)
+      self._testBatchShapes(halfnorm, log_pdf)
+
+      pdf = halfnorm.prob(x)
+      self._testBatchShapes(halfnorm, pdf)
+
+      if not stats:
+        return
+      expected_log_pdf = stats.halfnorm(scale=scale.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
+  def testHalfNormalLogPDFMultidimensional(self):
+    with self.test_session():
+      batch_size = 6
+      scale = constant_op.constant([[3.0, 1.0]] * batch_size)
+      x = np.array([[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0]], dtype=np.float32).T
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      log_pdf = halfnorm.log_prob(x)
+      self._testBatchShapes(halfnorm, log_pdf)
+
+      pdf = halfnorm.prob(x)
+      self._testBatchShapes(halfnorm, pdf)
+
+      if not stats:
+        return
+      expected_log_pdf = stats.halfnorm(scale=scale.eval()).logpdf(x)
+      self.assertAllClose(expected_log_pdf, log_pdf.eval())
+      self.assertAllClose(np.exp(expected_log_pdf), pdf.eval())
+
+  def testHalfNormalCDF(self):
+    with self.test_session():
+      batch_size = 50
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      cdf = halfnorm.cdf(x)
+      self._testBatchShapes(halfnorm, cdf)
+
+      log_cdf = halfnorm.log_cdf(x)
+      self._testBatchShapes(halfnorm, log_cdf)
+
+      if not stats:
+        return
+      expected_logcdf = stats.halfnorm(scale=scale).logcdf(x)
+      self.assertAllClose(expected_logcdf, log_cdf.eval(), atol=0)
+      self.assertAllClose(np.exp(expected_logcdf), cdf.eval(), atol=0)
+
+  def testHalfNormalSurvivalFunction(self):
+    with self.test_session():
+      batch_size = 50
+      scale = self._rng.rand(batch_size) + 1.0
+      x = np.linspace(-8.0, 8.0, batch_size).astype(np.float64)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      sf = halfnorm.survival_function(x)
+      self._testBatchShapes(halfnorm, sf)
+
+      log_sf = halfnorm.log_survival_function(x)
+      self._testBatchShapes(halfnorm, log_sf)
+
+      if not stats:
+        return
+      expected_logsf = stats.halfnorm(scale=scale).logsf(x)
+      self.assertAllClose(expected_logsf, log_sf.eval(), atol=0)
+      self.assertAllClose(np.exp(expected_logsf), sf.eval(), atol=0)
+
+  def testHalfNormalQuantile(self):
+    with self.test_session():
+      batch_size = 50
+      scale = self._rng.rand(batch_size) + 1.0
+      p = np.linspace(0., 1.0, batch_size).astype(np.float64)
+
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      x = halfnorm.quantile(p)
+      self._testBatchShapes(halfnorm, x)
+
+      if not stats:
+        return
+      expected_x = stats.halfnorm(scale=scale).ppf(p)
+      self.assertAllClose(expected_x, x.eval(), atol=0)
+
+  def testFiniteGradients(self):
+    for dtype in [np.float32, np.float64]:
+      g = ops.Graph()
+      with g.as_default():
+        scale = variables.Variable(dtype(3.0))
+        dist = hn_lib.HalfNormal(scale=scale)
+        x = np.array([0.01, 0.1, 1., 5., 10.]).astype(dtype)
+        for func in [
+            dist.cdf, dist.log_cdf, dist.survival_function,
+            dist.log_prob, dist.prob, dist.log_survival_function,
+        ]:
+          print(func.__name__)
+          value = func(x)
+          grads = gradients_impl.gradients(value, [scale])
+          with self.test_session(graph=g):
+            variables.global_variables_initializer().run()
+            self.assertAllFinite(value)
+            self.assertAllFinite(grads[0])
+
+  def testHalfNormalEntropy(self):
+    with self.test_session():
+      scale = np.array([[1.0, 2.0, 3.0]])
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      # See https://en.wikipedia.org/wiki/Half-normal_distribution for the
+      # entropy formula used here.
+      expected_entropy = 0.5 * np.log(np.pi * scale ** 2.0 / 2.0) + 0.5
+
+      entropy = halfnorm.entropy()
+      self._testBatchShapes(halfnorm, entropy)
+      self.assertAllClose(expected_entropy, entropy.eval())
+
+  def testHalfNormalMeanAndMode(self):
+    with self.test_session():
+      scale = np.array([11., 12., 13.])
+
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      expected_mean = scale * np.sqrt(2.0) / np.sqrt(np.pi)
+
+      self.assertAllEqual((3,), halfnorm.mean().eval().shape)
+      self.assertAllEqual(expected_mean, halfnorm.mean().eval())
+
+      self.assertAllEqual((3,), halfnorm.mode().eval().shape)
+      self.assertAllEqual([0., 0., 0.], halfnorm.mode().eval())
+
+  def testHalfNormalVariance(self):
+    with self.test_session():
+      scale = np.array([7., 7., 7.])
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      expected_variance = scale ** 2.0 * (1.0 - 2.0 / np.pi)
+
+      self.assertAllEqual((3,), halfnorm.variance().eval().shape)
+      self.assertAllEqual(expected_variance, halfnorm.variance().eval())
+
+  def testHalfNormalStandardDeviation(self):
+    with self.test_session():
+      scale = np.array([7., 7., 7.])
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+      expected_variance = scale ** 2.0 * (1.0 - 2.0 / np.pi)
+
+      self.assertAllEqual((3,), halfnorm.stddev().shape)
+      self.assertAllEqual(np.sqrt(expected_variance), halfnorm.stddev().eval())
+
+  def testHalfNormalSample(self):
+    with self.test_session():
+      scale = constant_op.constant(3.0)
+      n = constant_op.constant(100000)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      sample = halfnorm.sample(n)
+
+      self.assertEqual(sample.eval().shape, (100000,))
+      self.assertAllClose(sample.eval().mean(),
+                          3.0 * np.sqrt(2.0) / np.sqrt(np.pi), atol=1e-1)
+
+      expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+          tensor_shape.TensorShape(halfnorm.batch_shape_tensor().eval()))
+      self.assertAllEqual(expected_shape, sample.shape)
+      self.assertAllEqual(expected_shape, sample.eval().shape)
+
+      expected_shape_static = (tensor_shape.TensorShape(
+          [n.eval()]).concatenate(halfnorm.batch_shape))
+      self.assertAllEqual(expected_shape_static, sample.shape)
+      self.assertAllEqual(expected_shape_static, sample.eval().shape)
+
+  def testHalfNormalSampleMultiDimensional(self):
+    with self.test_session():
+      batch_size = 2
+      scale = constant_op.constant([[2.0, 3.0]] * batch_size)
+      n = constant_op.constant(100000)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      sample = halfnorm.sample(n)
+      self.assertEqual(sample.shape, (100000, batch_size, 2))
+      self.assertAllClose(sample.eval()[:, 0, 0].mean(),
+                          2.0 * np.sqrt(2.0) / np.sqrt(np.pi), atol=1e-1)
+      self.assertAllClose(sample.eval()[:, 0, 1].mean(),
+                          3.0 * np.sqrt(2.0) / np.sqrt(np.pi), atol=1e-1)
+
+      expected_shape = tensor_shape.TensorShape([n.eval()]).concatenate(
+          tensor_shape.TensorShape(halfnorm.batch_shape_tensor().eval()))
+      self.assertAllEqual(expected_shape, sample.shape)
+      self.assertAllEqual(expected_shape, sample.eval().shape)
+
+      expected_shape_static = (tensor_shape.TensorShape(
+          [n.eval()]).concatenate(halfnorm.batch_shape))
+      self.assertAllEqual(expected_shape_static, sample.shape)
+      self.assertAllEqual(expected_shape_static, sample.eval().shape)
+
+  def testNegativeSigmaFails(self):
+    with self.test_session():
+      halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G")
+      with self.assertRaisesOpError("Condition x > 0 did not hold"):
+        halfnorm.mean().eval()
+
+  def testHalfNormalShape(self):
+    with self.test_session():
+      scale = constant_op.constant([6.0] * 5)
+      halfnorm = hn_lib.HalfNormal(scale=scale)
+
+      self.assertEqual(halfnorm.batch_shape_tensor().eval(), [5])
+      self.assertEqual(halfnorm.batch_shape, tensor_shape.TensorShape([5]))
+      self.assertAllEqual(halfnorm.event_shape_tensor().eval(), [])
+      self.assertEqual(halfnorm.event_shape, tensor_shape.TensorShape([]))
+
+  def testHalfNormalShapeWithPlaceholders(self):
+    scale = array_ops.placeholder(dtype=dtypes.float32)
+    halfnorm = hn_lib.HalfNormal(scale=scale)
+
+    with self.test_session() as sess:
+      # get_batch_shape should return an "<unknown>" tensor.
+      self.assertEqual(halfnorm.batch_shape, tensor_shape.TensorShape(None))
+      self.assertEqual(halfnorm.event_shape, ())
+      self.assertAllEqual(halfnorm.event_shape_tensor().eval(), [])
+      self.assertAllEqual(
+          sess.run(halfnorm.batch_shape_tensor(),
+                   feed_dict={scale: [1.0, 2.0]}), [2])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
index ece6bc077d9e21502fdfd01300a9d3e9f2c9c380..ff6092fc260660b512e8123823c63e98a023af6d 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/mixture_same_family_test.py
@@ -45,6 +45,17 @@ class MixtureSameFamilyTest(test_util.VectorDistributionTestHelpers,
       self.assertEqual([4, 5], x.shape)
       self.assertEqual([4, 5], log_prob_x.shape)
 
+  def testSampleAndLogProbBatch(self):
+    with self.test_session():
+      gm = mixture_same_family_lib.MixtureSameFamily(
+          mixture_distribution=categorical_lib.Categorical(probs=[[0.3, 0.7]]),
+          components_distribution=normal_lib.Normal(
+              loc=[[-1., 1]], scale=[[0.1, 0.5]]))
+      x = gm.sample([4, 5], seed=42)
+      log_prob_x = gm.log_prob(x)
+      self.assertEqual([4, 5, 1], x.shape)
+      self.assertEqual([4, 5, 1], log_prob_x.shape)
+
   def testSampleAndLogProbShapesBroadcastMix(self):
     mix_probs = np.float32([.3, .7])
     bern_probs = np.float32([[.4, .6], [.25, .75]])
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
index 103d8e186221e879d1734a097114708429f725bd..cbaf74d3f66253ae5727e1ba579e2d49235b748e 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/transformed_distribution_test.py
@@ -200,6 +200,27 @@ class TransformedDistributionTest(test.TestCase):
       self.assertAllEqual([2], multi_logit_normal.event_shape)
       self.assertAllEqual([2], multi_logit_normal.event_shape_tensor().eval())
 
+  def testCastLogDetJacobian(self):
+    """Test log_prob when Jacobian and log_prob dtypes do not match."""
+
+    with self.test_session():
+      # Create an identity bijector whose jacobians have dtype int32
+      int_identity = bs.Inline(
+          forward_fn=array_ops.identity,
+          inverse_fn=array_ops.identity,
+          inverse_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          forward_log_det_jacobian_fn=lambda x: math_ops.cast(0, dtypes.int32),
+          is_constant_jacobian=True)
+      normal = self._cls()(
+          distribution=ds.Normal(loc=0., scale=1.),
+          bijector=int_identity,
+          validate_args=True)
+
+      y = normal.sample()
+      normal.log_prob(y).eval()
+      normal.prob(y).eval()
+      normal.entropy().eval()
+
   def testEntropy(self):
     with self.test_session():
       shift = np.array([[-1, 0, 1], [-1, -2, -3]], dtype=np.float32)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
index 6049419818e18c54209f0be95d41fcecf6627b7e..0fe9f6aa78fbe845b99d0668f075b0162ec2a9f7 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value.py
@@ -18,12 +18,117 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.absolute_value_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["AbsoluteValue"]
+__all__ = [
+    "AbsoluteValue",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class AbsoluteValue(bijector.Bijector):
+  """Computes `Y = g(X) = Abs(X)`, element-wise.
+
+  This non-injective bijector allows for transformations of scalar distributions
+  with the absolute value function, which maps `(-inf, inf)` to `[0, inf)`.
+
+  * For `y in (0, inf)`, `AbsoluteValue.inverse(y)` returns the set inverse
+    `{x in (-inf, inf) : |x| = y}` as a tuple, `-y, y`.
+  * `AbsoluteValue.inverse(0)` returns `0, 0`, which is not the set inverse
+    (the set inverse is the singleton `{0}`), but "works" in conjunction with
+    `TransformedDistribution` to produce a left semi-continuous pdf.
+  * For `y < 0`, `AbsoluteValue.inverse(y)` happily returns the
+    wrong thing, `-y, y`.  This is done for efficiency.  If
+    `validate_args == True`, `y < 0` will raise an exception.
+
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  abs = tfd.bijectors.AbsoluteValue()
+
+  abs.forward([-1., 0., 1.])
+  ==> [1., 0.,  1.]
+
+  abs.inverse(1.)
+  ==> [-1., 1.]
+
+  # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
+  abs.inverse_log_det_jacobian(1.)
+  ==> [0., 0.]
+
+  # Special case handling of 0.
+  abs.inverse(0.)
+  ==> [0., 0.]
+
+  abs.inverse_log_det_jacobian(0.)
+  ==> [0., 0.]
+  ```
+
+  """
+
+  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
+    """Instantiates the `AbsoluteValue` bijector.
+
+    Args:
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.  Currently only zero is
+        supported.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness, in particular whether inputs to `inverse` and
+        `inverse_log_det_jacobian` are non-negative.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError:  If `event_ndims` is not zero.
+    """
+    self._graph_parents = []
+    self._name = name
+
+    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+    event_ndims_const = tensor_util.constant_value(event_ndims)
+    if event_ndims_const is not None and event_ndims_const not in (0,):
+      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
+    else:
+      if validate_args:
+        event_ndims = control_flow_ops.with_dependencies(
+            [check_ops.assert_equal(
+                event_ndims, 0, message="event_ndims was not 0")],
+            event_ndims)
+
+    with self._name_scope("init"):
+      super(AbsoluteValue, self).__init__(
+          event_ndims=event_ndims,
+          validate_args=validate_args,
+          name=name)
+
+  def _forward(self, x):
+    return math_ops.abs(x)
+
+  def _inverse(self, y):
+    if self.validate_args:
+      y = control_flow_ops.with_dependencies(
+          [check_ops.assert_non_negative(y, message="Argument y was negative")],
+          y)
+    return -y, y
+
+  def _inverse_log_det_jacobian(self, y):
+    # If event_ndims = 2,
+    # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
+    # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
+    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
+    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
+    if self.validate_args:
+      zeros = control_flow_ops.with_dependencies(
+          [check_ops.assert_non_negative(y, message="Argument y was negative")],
+          zeros)
+    return zeros, zeros
+
+  @property
+  def _is_injective(self):
+    return False
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py
deleted file mode 100644
index b84502003ab6c0c4ffdda21eea162f441509e1fa..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/absolute_value_impl.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""AbsoluteValue bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "AbsoluteValue",
-]
-
-
-class AbsoluteValue(bijector.Bijector):
-  """Computes `Y = g(X) = Abs(X)`, element-wise.
-
-  This non-injective bijector allows for transformations of scalar distributions
-  with the absolute value function, which maps `(-inf, inf)` to `[0, inf)`.
-
-  * For `y in (0, inf)`, `AbsoluteValue.inverse(y)` returns the set inverse
-    `{x in (-inf, inf) : |x| = y}` as a tuple, `-y, y`.
-  * `AbsoluteValue.inverse(0)` returns `0, 0`, which is not the set inverse
-    (the set inverse is the singleton `{0}`), but "works" in conjunction with
-    `TransformedDistribution` to produce a left semi-continuous pdf.
-  * For `y < 0`, `AbsoluteValue.inverse(y)` happily returns the
-    wrong thing, `-y, y`.  This is done for efficiency.  If
-    `validate_args == True`, `y < 0` will raise an exception.
-
-
-  ```python
-  abs = ds.bijectors.AbsoluteValue()
-
-  abs.forward([-1., 0., 1.])
-  ==> [1., 0.,  1.]
-
-  abs.inverse(1.)
-  ==> [-1., 1.]
-
-  # The |dX/dY| is constant, == 1.  So Log|dX/dY| == 0.
-  abs.inverse_log_det_jacobian(1.)
-  ==> [0., 0.]
-
-  # Special case handling of 0.
-  abs.inverse(0.)
-  ==> [0., 0.]
-
-  abs.inverse_log_det_jacobian(0.)
-  ==> [0., 0.]
-  ```
-
-  """
-
-  def __init__(self, event_ndims=0, validate_args=False, name="absolute_value"):
-    """Instantiates the `AbsoluteValue` bijector.
-
-    Args:
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.  Currently only zero is
-        supported.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness, in particular whether inputs to `inverse` and
-        `inverse_log_det_jacobian` are non-negative.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError:  If `event_ndims` is not zero.
-    """
-    self._graph_parents = []
-    self._name = name
-
-    event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-    event_ndims_const = tensor_util.constant_value(event_ndims)
-    if event_ndims_const is not None and event_ndims_const not in (0,):
-      raise ValueError("event_ndims(%s) was not 0" % event_ndims_const)
-    else:
-      if validate_args:
-        event_ndims = control_flow_ops.with_dependencies(
-            [check_ops.assert_equal(
-                event_ndims, 0, message="event_ndims was not 0")],
-            event_ndims)
-
-    with self._name_scope("init"):
-      super(AbsoluteValue, self).__init__(
-          event_ndims=event_ndims,
-          validate_args=validate_args,
-          name=name)
-
-  def _forward(self, x):
-    return math_ops.abs(x)
-
-  def _inverse(self, y):
-    if self.validate_args:
-      y = control_flow_ops.with_dependencies(
-          [check_ops.assert_non_negative(y, message="Argument y was negative")],
-          y)
-    return -y, y
-
-  def _inverse_log_det_jacobian(self, y):
-    # If event_ndims = 2,
-    # F^{-1}(y) = (-y, y), so DF^{-1}(y) = (-1, 1),
-    # so Log|DF^{-1}(y)| = Log[1, 1] = [0, 0].
-    batch_shape = array_ops.shape(y)[:array_ops.rank(y) - self.event_ndims]
-    zeros = array_ops.zeros(batch_shape, dtype=y.dtype)
-    if self.validate_args:
-      zeros = control_flow_ops.with_dependencies(
-          [check_ops.assert_non_negative(y, message="Argument y was negative")],
-          zeros)
-    return zeros, zeros
-
-  @property
-  def _is_injective(self):
-    return False
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
index 940cceff04e77cfc2f7caae5a798d135f7601b95..05bb9c2f9bdf35e222c94db3491157893da64ebd 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine.py
@@ -18,12 +18,386 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.affine_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib import linalg
+from tensorflow.contrib.distributions.python.ops import distribution_util
+from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Affine"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Affine",
+]
+
+
+def _as_tensor(x, name):
+  """Convenience to convert to `Tensor` or leave as `None`."""
+  return None if x is None else ops.convert_to_tensor(x, name=name)
+
+
+class Affine(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
+
+  Here `scale = c * I + diag(D1) + tril(L) + V @ diag(D2) @ V.T`.
+
+  In TF parlance, the `scale` term is logically equivalent to:
+
+  ```python
+  scale = (
+    scale_identity_multiplier * tf.diag(tf.ones(d)) +
+    tf.diag(scale_diag) +
+    scale_tril +
+    scale_perturb_factor @ diag(scale_perturb_diag) @
+      tf.transpose([scale_perturb_factor])
+  )
+  ```
+
+  The `scale` term is applied without necessarily materializing constituent
+  matrices, i.e., the matmul is [matrix-free](
+  https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
+
+  Examples:
+
+  ```python
+  # Y = X
+  b = Affine()
+
+  # Y = X + shift
+  b = Affine(shift=[1., 2, 3])
+
+  # Y = 2 * I @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_identity_multiplier=2.)
+
+  # Y = tf.diag(d1) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_diag=[-1., 2, 1])         # Implicitly 3x3.
+
+  # Y = (I + v * v.T) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_perturb_factor=[[1., 0],
+                                   [0, 1],
+                                   [1, 1]])
+
+  # Y = (diag(d1) + v * diag(d2) * v.T) @ X.T + shift
+  b = Affine(shift=[1., 2, 3],
+             scale_diag=[1., 3, 3],          # Implicitly 3x3.
+             scale_perturb_diag=[2., 1],     # Implicitly 2x2.
+             scale_perturb_factor=[[1., 0],
+                                   [0, 1],
+                                   [1, 1]])
+
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale_identity_multiplier=None,
+               scale_diag=None,
+               scale_tril=None,
+               scale_perturb_factor=None,
+               scale_perturb_diag=None,
+               event_ndims=1,
+               validate_args=False,
+               name="affine"):
+    """Instantiates the `Affine` bijector.
+
+    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
+    giving the forward operation:
+
+    ```none
+    Y = g(X) = scale @ X + shift
+    ```
+
+    where the `scale` term is logically equivalent to:
+
+    ```python
+    scale = (
+      scale_identity_multiplier * tf.diag(tf.ones(d)) +
+      tf.diag(scale_diag) +
+      scale_tril +
+      scale_perturb_factor @ diag(scale_perturb_diag) @
+        tf.transpose([scale_perturb_factor])
+    )
+    ```
+
+    If none of `scale_identity_multiplier`, `scale_diag`, or `scale_tril` are
+    specified then `scale += IdentityMatrix`. Otherwise specifying a
+    `scale` argument has the semantics of `scale += Expand(arg)`, i.e.,
+    `scale_diag != None` means `scale += tf.diag(scale_diag)`.
+
+    Args:
+      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
+        applied.
+      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
+        scaling done to the identity matrix.
+        When `scale_identity_multiplier = scale_diag = scale_tril = None` then
+        `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
+        to `scale`.
+      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+        diagonal matrix.
+        When `None` no diagonal term is added to `scale`.
+      scale_tril: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
+        lower triangular matrix.
+        When `None` no `scale_tril` term is added to `scale`.
+        The upper triangular elements above the diagonal are ignored.
+      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
+        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
+        update is added to `scale`.
+      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
+        matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
+        represents an `r x r` diagonal matrix. When `None` low rank updates will
+        take the form `scale_perturb_factor * scale_perturb_factor.T`.
+      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `perturb_diag` is specified but not `perturb_factor`.
+      TypeError: if `shift` has different `dtype` from `scale` arguments.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+
+    # Ambiguous definition of low rank update.
+    if scale_perturb_diag is not None and scale_perturb_factor is None:
+      raise ValueError("When scale_perturb_diag is specified, "
+                       "scale_perturb_factor must be specified.")
+
+    # Special case, only handling a scaled identity matrix. We don't know its
+    # dimensions, so this is special cased.
+    # We don't check identity_multiplier, since below we set it to 1. if all
+    # other scale args are None.
+    self._is_only_identity_multiplier = (scale_tril is None and
+                                         scale_diag is None and
+                                         scale_perturb_factor is None)
+
+    with self._name_scope("init", values=[
+        shift, scale_identity_multiplier, scale_diag, scale_tril,
+        scale_perturb_diag, scale_perturb_factor]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims_const = tensor_util.constant_value(event_ndims)
+      if event_ndims_const is not None and event_ndims_const not in (0, 1):
+        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+
+      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
+        raise ValueError(
+            "If event_ndims == 0, the only scale argument you can pass is "
+            "scale_identity_multiplier.  All others operate on vectors.")
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      # When no args are specified, pretend the scale matrix is the identity
+      # matrix.
+      if (self._is_only_identity_multiplier and
+          scale_identity_multiplier is None):
+        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
+
+      # self._create_scale_operator returns a LinearOperator in all cases
+      # except if self._is_only_identity_multiplier; in which case it
+      # returns a scalar Tensor.
+      scale = self._create_scale_operator(
+          identity_multiplier=scale_identity_multiplier,
+          diag=scale_diag,
+          tril=scale_tril,
+          perturb_diag=scale_perturb_diag,
+          perturb_factor=scale_perturb_factor,
+          shift=shift,
+          validate_args=validate_args)
+
+      if scale.dtype is not None:
+        dtype = scale.dtype.base_dtype
+
+      if scale is not None and not self._is_only_identity_multiplier:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+      else:
+        # We won't need shape inference when scale is None or when scale is a
+        # scalar.
+        batch_ndims = 0
+      self._scale = scale
+      self._shaper = _DistributionShape(
+          batch_ndims=batch_ndims,
+          event_ndims=event_ndims,
+          validate_args=validate_args)
+      super(Affine, self).__init__(
+          event_ndims=event_ndims,
+          graph_parents=(
+              [event_ndims] +
+              [self._scale] if tensor_util.is_tensor(self._scale)
+              else self._scale.graph_parents +
+              [self._shift] if self._shift is not None else []),
+          is_constant_jacobian=True,
+          dtype=dtype,
+          validate_args=validate_args,
+          name=name)
+
+  def _create_scale_operator(self, identity_multiplier, diag, tril,
+                             perturb_diag, perturb_factor, shift,
+                             validate_args):
+    """Construct `scale` from various components.
+
+    Args:
+      identity_multiplier: floating point rank 0 `Tensor` representing a scaling
+        done to the identity matrix.
+      diag: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
+        diagonal matrix.
+      tril: Floating-point `Tensor` representing the diagonal matrix.
+        `scale_tril` has shape [N1, N2, ...  k], which represents a k x k lower
+        triangular matrix.
+      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
+        the low rank update.
+      perturb_factor: Floating-point `Tensor` representing factor matrix.
+      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+
+    Returns:
+      scale. In the case of scaling by a constant, scale is a
+      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
+
+    Raises:
+      ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
+    """
+    identity_multiplier = _as_tensor(identity_multiplier, "identity_multiplier")
+    diag = _as_tensor(diag, "diag")
+    tril = _as_tensor(tril, "tril")
+    perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
+    perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
+
+    # If possible, use the low rank update to infer the shape of
+    # the identity matrix, when scale represents a scaled identity matrix
+    # with a low rank update.
+    shape_hint = None
+    if perturb_factor is not None:
+      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
+
+    if self._is_only_identity_multiplier:
+      if validate_args:
+        return control_flow_ops.with_dependencies(
+            [check_ops.assert_none_equal(
+                identity_multiplier,
+                array_ops.zeros([], identity_multiplier.dtype),
+                ["identity_multiplier should be non-zero."])],
+            identity_multiplier)
+      return identity_multiplier
+
+    scale = distribution_util.make_tril_scale(
+        loc=shift,
+        scale_tril=tril,
+        scale_diag=diag,
+        scale_identity_multiplier=identity_multiplier,
+        validate_args=validate_args,
+        assert_positive=False,
+        shape_hint=shape_hint)
+
+    if perturb_factor is not None:
+      return linalg.LinearOperatorLowRankUpdate(
+          scale,
+          u=perturb_factor,
+          diag_update=perturb_diag,
+          is_diag_update_positive=perturb_diag is None,
+          is_non_singular=True,  # Implied by is_positive_definite=True.
+          is_self_adjoint=True,
+          is_positive_definite=True,
+          is_square=True)
+
+    return scale
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = x
+    if self._is_only_identity_multiplier:
+      y *= self._scale
+      if self.shift is not None:
+        return y + self.shift
+      return y
+    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        y, expand_batch_dim=False)
+    with ops.control_dependencies(self._maybe_check_scale() if
+                                  self.validate_args else []):
+      y = self.scale.matmul(y)
+    y = self._shaper.undo_make_batch_of_event_sample_matrices(
+        y, sample_shape, expand_batch_dim=False)
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = y
+    if self.shift is not None:
+      x -= self.shift
+    if self._is_only_identity_multiplier:
+      return x / self._scale
+
+    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+        x, expand_batch_dim=False)
+    # Solve fails if the op is singular so we may safely skip this assertion.
+    x = self.scale.solve(x)
+    x = self._shaper.undo_make_batch_of_event_sample_matrices(
+        x, sample_shape, expand_batch_dim=False)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(y)
+
+  def _forward_log_det_jacobian(self, x):
+    if self._is_only_identity_multiplier:
+      # We don't pad in this case and instead let the fldj be applied
+      # via broadcast.
+      event_size = distribution_util.pick_vector(
+          math_ops.equal(self._shaper.event_ndims, 0),
+          [1], array_ops.shape(x))[-1]
+      event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
+      return math_ops.log(math_ops.abs(self._scale)) * event_size
+    return self.scale.log_abs_determinant()
+
+  def _maybe_check_scale(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
deleted file mode 100644
index 05bb9c2f9bdf35e222c94db3491157893da64ebd..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_impl.py
+++ /dev/null
@@ -1,403 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Affine bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import linalg
-from tensorflow.contrib.distributions.python.ops import distribution_util
-from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Affine",
-]
-
-
-def _as_tensor(x, name):
-  """Convenience to convert to `Tensor` or leave as `None`."""
-  return None if x is None else ops.convert_to_tensor(x, name=name)
-
-
-class Affine(bijector.Bijector):
-  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
-
-  Here `scale = c * I + diag(D1) + tril(L) + V @ diag(D2) @ V.T`.
-
-  In TF parlance, the `scale` term is logically equivalent to:
-
-  ```python
-  scale = (
-    scale_identity_multiplier * tf.diag(tf.ones(d)) +
-    tf.diag(scale_diag) +
-    scale_tril +
-    scale_perturb_factor @ diag(scale_perturb_diag) @
-      tf.transpose([scale_perturb_factor])
-  )
-  ```
-
-  The `scale` term is applied without necessarily materializing constituent
-  matrices, i.e., the matmul is [matrix-free](
-  https://en.wikipedia.org/wiki/Matrix-free_methods) when possible.
-
-  Examples:
-
-  ```python
-  # Y = X
-  b = Affine()
-
-  # Y = X + shift
-  b = Affine(shift=[1., 2, 3])
-
-  # Y = 2 * I @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_identity_multiplier=2.)
-
-  # Y = tf.diag(d1) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_diag=[-1., 2, 1])         # Implicitly 3x3.
-
-  # Y = (I + v * v.T) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_perturb_factor=[[1., 0],
-                                   [0, 1],
-                                   [1, 1]])
-
-  # Y = (diag(d1) + v * diag(d2) * v.T) @ X.T + shift
-  b = Affine(shift=[1., 2, 3],
-             scale_diag=[1., 3, 3],          # Implicitly 3x3.
-             scale_perturb_diag=[2., 1],     # Implicitly 2x2.
-             scale_perturb_factor=[[1., 0],
-                                   [0, 1],
-                                   [1, 1]])
-
-  ```
-
-  """
-
-  def __init__(self,
-               shift=None,
-               scale_identity_multiplier=None,
-               scale_diag=None,
-               scale_tril=None,
-               scale_perturb_factor=None,
-               scale_perturb_diag=None,
-               event_ndims=1,
-               validate_args=False,
-               name="affine"):
-    """Instantiates the `Affine` bijector.
-
-    This `Bijector` is initialized with `shift` `Tensor` and `scale` arguments,
-    giving the forward operation:
-
-    ```none
-    Y = g(X) = scale @ X + shift
-    ```
-
-    where the `scale` term is logically equivalent to:
-
-    ```python
-    scale = (
-      scale_identity_multiplier * tf.diag(tf.ones(d)) +
-      tf.diag(scale_diag) +
-      scale_tril +
-      scale_perturb_factor @ diag(scale_perturb_diag) @
-        tf.transpose([scale_perturb_factor])
-    )
-    ```
-
-    If none of `scale_identity_multiplier`, `scale_diag`, or `scale_tril` are
-    specified then `scale += IdentityMatrix`. Otherwise specifying a
-    `scale` argument has the semantics of `scale += Expand(arg)`, i.e.,
-    `scale_diag != None` means `scale += tf.diag(scale_diag)`.
-
-    Args:
-      shift: Floating-point `Tensor`. If this is set to `None`, no shift is
-        applied.
-      scale_identity_multiplier: floating point rank 0 `Tensor` representing a
-        scaling done to the identity matrix.
-        When `scale_identity_multiplier = scale_diag = scale_tril = None` then
-        `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
-        to `scale`.
-      scale_diag: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
-        diagonal matrix.
-        When `None` no diagonal term is added to `scale`.
-      scale_tril: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k, k], which represents a k x k
-        lower triangular matrix.
-        When `None` no `scale_tril` term is added to `scale`.
-        The upper triangular elements above the diagonal are ignored.
-      scale_perturb_factor: Floating-point `Tensor` representing factor matrix
-        with last two dimensions of shape `(k, r)`. When `None`, no rank-r
-        update is added to `scale`.
-      scale_perturb_diag: Floating-point `Tensor` representing the diagonal
-        matrix. `scale_perturb_diag` has shape [N1, N2, ...  r], which
-        represents an `r x r` diagonal matrix. When `None` low rank updates will
-        take the form `scale_perturb_factor * scale_perturb_factor.T`.
-      event_ndims: Scalar `int` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `perturb_diag` is specified but not `perturb_factor`.
-      TypeError: if `shift` has different `dtype` from `scale` arguments.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-
-    # Ambiguous definition of low rank update.
-    if scale_perturb_diag is not None and scale_perturb_factor is None:
-      raise ValueError("When scale_perturb_diag is specified, "
-                       "scale_perturb_factor must be specified.")
-
-    # Special case, only handling a scaled identity matrix. We don't know its
-    # dimensions, so this is special cased.
-    # We don't check identity_multiplier, since below we set it to 1. if all
-    # other scale args are None.
-    self._is_only_identity_multiplier = (scale_tril is None and
-                                         scale_diag is None and
-                                         scale_perturb_factor is None)
-
-    with self._name_scope("init", values=[
-        shift, scale_identity_multiplier, scale_diag, scale_tril,
-        scale_perturb_diag, scale_perturb_factor]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims_const = tensor_util.constant_value(event_ndims)
-      if event_ndims_const is not None and event_ndims_const not in (0, 1):
-        raise ValueError("event_ndims(%s) was not 0 or 1" % event_ndims_const)
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-
-      if event_ndims_const == 0 and not self._is_only_identity_multiplier:
-        raise ValueError(
-            "If event_ndims == 0, the only scale argument you can pass is "
-            "scale_identity_multiplier.  All others operate on vectors.")
-
-      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
-      dtype = dtypes.float32
-
-      if shift is not None:
-        shift = ops.convert_to_tensor(shift, name="shift")
-        dtype = shift.dtype.base_dtype
-      self._shift = shift
-
-      # When no args are specified, pretend the scale matrix is the identity
-      # matrix.
-      if (self._is_only_identity_multiplier and
-          scale_identity_multiplier is None):
-        scale_identity_multiplier = ops.convert_to_tensor(1., dtype=dtype)
-
-      # self._create_scale_operator returns a LinearOperator in all cases
-      # except if self._is_only_identity_multiplier; in which case it
-      # returns a scalar Tensor.
-      scale = self._create_scale_operator(
-          identity_multiplier=scale_identity_multiplier,
-          diag=scale_diag,
-          tril=scale_tril,
-          perturb_diag=scale_perturb_diag,
-          perturb_factor=scale_perturb_factor,
-          shift=shift,
-          validate_args=validate_args)
-
-      if scale.dtype is not None:
-        dtype = scale.dtype.base_dtype
-
-      if scale is not None and not self._is_only_identity_multiplier:
-        if (shift is not None and
-            shift.dtype.base_dtype != scale.dtype.base_dtype):
-          raise TypeError(
-              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
-                  shift.dtype, scale.dtype))
-
-        if scale.tensor_rank is not None:
-          batch_ndims = scale.tensor_rank - 2
-        else:
-          batch_ndims = scale.tensor_rank_tensor() - 2
-      else:
-        # We won't need shape inference when scale is None or when scale is a
-        # scalar.
-        batch_ndims = 0
-      self._scale = scale
-      self._shaper = _DistributionShape(
-          batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
-          validate_args=validate_args)
-      super(Affine, self).__init__(
-          event_ndims=event_ndims,
-          graph_parents=(
-              [event_ndims] +
-              [self._scale] if tensor_util.is_tensor(self._scale)
-              else self._scale.graph_parents +
-              [self._shift] if self._shift is not None else []),
-          is_constant_jacobian=True,
-          dtype=dtype,
-          validate_args=validate_args,
-          name=name)
-
-  def _create_scale_operator(self, identity_multiplier, diag, tril,
-                             perturb_diag, perturb_factor, shift,
-                             validate_args):
-    """Construct `scale` from various components.
-
-    Args:
-      identity_multiplier: floating point rank 0 `Tensor` representing a scaling
-        done to the identity matrix.
-      diag: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
-        diagonal matrix.
-      tril: Floating-point `Tensor` representing the diagonal matrix.
-        `scale_tril` has shape [N1, N2, ...  k], which represents a k x k lower
-        triangular matrix.
-      perturb_diag: Floating-point `Tensor` representing the diagonal matrix of
-        the low rank update.
-      perturb_factor: Floating-point `Tensor` representing factor matrix.
-      shift: Floating-point `Tensor` representing `shift in `scale @ X + shift`.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-
-    Returns:
-      scale. In the case of scaling by a constant, scale is a
-      floating point `Tensor`. Otherwise, scale is a `LinearOperator`.
-
-    Raises:
-      ValueError: if all of `tril`, `diag` and `identity_multiplier` are `None`.
-    """
-    identity_multiplier = _as_tensor(identity_multiplier, "identity_multiplier")
-    diag = _as_tensor(diag, "diag")
-    tril = _as_tensor(tril, "tril")
-    perturb_diag = _as_tensor(perturb_diag, "perturb_diag")
-    perturb_factor = _as_tensor(perturb_factor, "perturb_factor")
-
-    # If possible, use the low rank update to infer the shape of
-    # the identity matrix, when scale represents a scaled identity matrix
-    # with a low rank update.
-    shape_hint = None
-    if perturb_factor is not None:
-      shape_hint = distribution_util.dimension_size(perturb_factor, axis=-2)
-
-    if self._is_only_identity_multiplier:
-      if validate_args:
-        return control_flow_ops.with_dependencies(
-            [check_ops.assert_none_equal(
-                identity_multiplier,
-                array_ops.zeros([], identity_multiplier.dtype),
-                ["identity_multiplier should be non-zero."])],
-            identity_multiplier)
-      return identity_multiplier
-
-    scale = distribution_util.make_tril_scale(
-        loc=shift,
-        scale_tril=tril,
-        scale_diag=diag,
-        scale_identity_multiplier=identity_multiplier,
-        validate_args=validate_args,
-        assert_positive=False,
-        shape_hint=shape_hint)
-
-    if perturb_factor is not None:
-      return linalg.LinearOperatorLowRankUpdate(
-          scale,
-          u=perturb_factor,
-          diag_update=perturb_diag,
-          is_diag_update_positive=perturb_diag is None,
-          is_non_singular=True,  # Implied by is_positive_definite=True.
-          is_self_adjoint=True,
-          is_positive_definite=True,
-          is_square=True)
-
-    return scale
-
-  @property
-  def shift(self):
-    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
-    return self._shift
-
-  @property
-  def scale(self):
-    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    return self._scale
-
-  def _forward(self, x):
-    y = x
-    if self._is_only_identity_multiplier:
-      y *= self._scale
-      if self.shift is not None:
-        return y + self.shift
-      return y
-    y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-        y, expand_batch_dim=False)
-    with ops.control_dependencies(self._maybe_check_scale() if
-                                  self.validate_args else []):
-      y = self.scale.matmul(y)
-    y = self._shaper.undo_make_batch_of_event_sample_matrices(
-        y, sample_shape, expand_batch_dim=False)
-    if self.shift is not None:
-      y += self.shift
-    return y
-
-  def _inverse(self, y):
-    x = y
-    if self.shift is not None:
-      x -= self.shift
-    if self._is_only_identity_multiplier:
-      return x / self._scale
-
-    x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-        x, expand_batch_dim=False)
-    # Solve fails if the op is singular so we may safely skip this assertion.
-    x = self.scale.solve(x)
-    x = self._shaper.undo_make_batch_of_event_sample_matrices(
-        x, sample_shape, expand_batch_dim=False)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):
-    if self._is_only_identity_multiplier:
-      # We don't pad in this case and instead let the fldj be applied
-      # via broadcast.
-      event_size = distribution_util.pick_vector(
-          math_ops.equal(self._shaper.event_ndims, 0),
-          [1], array_ops.shape(x))[-1]
-      event_size = math_ops.cast(event_size, dtype=self._scale.dtype)
-      return math_ops.log(math_ops.abs(self._scale)) * event_size
-    return self.scale.log_abs_determinant()
-
-  def _maybe_check_scale(self):
-    try:
-      return [self.scale.assert_non_singular()]
-    except NotImplementedError:
-      pass
-    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
index aca04a89df7c3ee09d5f7cc10f6779e33fa7aa66..89043b1410370074f11f2cfa59b6b6663fa62521 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator.py
@@ -18,12 +18,214 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.linalg import linear_operator
 
-_allowed_symbols = ["AffineLinearOperator"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "AffineLinearOperator",
+]
+
+
+class AffineLinearOperator(bijector.Bijector):
+  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
+
+  `shift` is a numeric `Tensor` and `scale` is a `LinearOperator`.
+
+  If `X` is a scalar then the forward transformation is: `scale * X + shift`
+  where `*` denotes the scalar product.
+
+  Note: we don't always simply transpose `X` (but write it this way for
+  brevity). Actually the input `X` undergoes the following transformation
+  before being premultiplied by `scale`:
+
+  1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e.,
+     `new_sample_shape = [1]`. Otherwise do nothing.
+  2. The sample shape is flattened to have one dimension, i.e.,
+     `new_sample_shape = [n]` where `n = tf.reduce_prod(old_sample_shape)`.
+  3. The sample dim is cyclically rotated left by 1, i.e.,
+     `new_shape = [B1,...,Bb, k, n]` where `n` is as above, `k` is the
+     event_shape, and `B1,...,Bb` are the batch shapes for each of `b` batch
+     dimensions.
+
+  (For more details see `shape.make_batch_of_event_sample_matrices`.)
+
+  The result of the above transformation is that `X` can be regarded as a batch
+  of matrices where each column is a draw from the distribution. After
+  premultiplying by `scale`, we take the inverse of this procedure. The input
+  `Y` also undergoes the same transformation before/after premultiplying by
+  `inv(scale)`.
+
+  Example Use:
+
+  ```python
+  linalg = tf.linalg
+
+  x = [1., 2, 3]
+
+  shift = [-1., 0., 1]
+  diag = [1., 2, 3]
+  scale = linalg.LinearOperatorDiag(diag)
+  affine = AffineLinearOperator(shift, scale)
+  # In this case, `forward` is equivalent to:
+  # y = scale @ x + shift
+  y = affine.forward(x)  # [0., 4, 10]
+
+  shift = [2., 3, 1]
+  tril = [[1., 0, 0],
+          [2, 1, 0],
+          [3, 2, 1]]
+  scale = linalg.LinearOperatorLowerTriangular(tril)
+  affine = AffineLinearOperator(shift, scale)
+  # In this case, `forward` is equivalent to:
+  # np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
+  y = affine.forward(x)  # [3., 7, 11]
+  ```
+
+  """
+
+  def __init__(self,
+               shift=None,
+               scale=None,
+               event_ndims=1,
+               validate_args=False,
+               name="affine_linear_operator"):
+    """Instantiates the `AffineLinearOperator` bijector.
+
+    Args:
+      shift: Floating-point `Tensor`.
+      scale:  Subclass of `LinearOperator`. Represents the (batch) positive
+        definite matrix `M` in `R^{k x k}`.
+      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution. Must be 0 or 1.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `event_ndims` is not 0 or 1.
+      TypeError: if `scale` is not a `LinearOperator`.
+      TypeError: if `shift.dtype` does not match `scale.dtype`.
+      ValueError: if not `scale.is_non_singular`.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    graph_parents = []
+    with self._name_scope("init", values=[shift]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      if tensor_util.constant_value(event_ndims) is not None:
+        event_ndims = tensor_util.constant_value(event_ndims)
+        if event_ndims not in (0, 1):
+          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
+      else:
+        if validate_args:
+          # Shape tool will catch if event_ndims is negative.
+          event_ndims = control_flow_ops.with_dependencies(
+              [check_ops.assert_less(
+                  event_ndims, 2, message="event_ndims must be 0 or 1")],
+              event_ndims)
+        graph_parents += [event_ndims]
+
+      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
+      dtype = dtypes.float32
+
+      if shift is not None:
+        shift = ops.convert_to_tensor(shift, name="shift")
+        graph_parents += [shift]
+        dtype = shift.dtype.base_dtype
+      self._shift = shift
+
+      if scale is not None:
+        if (shift is not None and
+            shift.dtype.base_dtype != scale.dtype.base_dtype):
+          raise TypeError(
+              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
+                  shift.dtype, scale.dtype))
+        if not isinstance(scale, linear_operator.LinearOperator):
+          raise TypeError("scale is not an instance of tf.LinearOperator")
+        if validate_args and not scale.is_non_singular:
+          raise ValueError("Scale matrix must be non-singular.")
+        graph_parents += scale.graph_parents
+        if scale.tensor_rank is not None:
+          batch_ndims = scale.tensor_rank - 2
+        else:
+          batch_ndims = scale.tensor_rank_tensor() - 2
+          graph_parents += [batch_ndims]
+        if scale.dtype is not None:
+          dtype = scale.dtype.base_dtype
+      else:
+        batch_ndims = 0  # We won't need shape inference when scale is None.
+      self._scale = scale
+      self._shaper = _DistributionShape(
+          batch_ndims=batch_ndims,
+          event_ndims=event_ndims,
+          validate_args=validate_args)
+      super(AffineLinearOperator, self).__init__(
+          event_ndims=event_ndims,
+          graph_parents=graph_parents,
+          is_constant_jacobian=True,
+          dtype=dtype,
+          validate_args=validate_args,
+          name=name)
+
+  @property
+  def shift(self):
+    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
+    return self._shift
+
+  @property
+  def scale(self):
+    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
+    return self._scale
+
+  def _forward(self, x):
+    y = x
+    if self.scale is not None:
+      y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+          y, expand_batch_dim=False)
+      with ops.control_dependencies(self._maybe_collect_assertions() if
+                                    self.validate_args else []):
+        y = self.scale.matmul(y)
+      y = self._shaper.undo_make_batch_of_event_sample_matrices(
+          y, sample_shape, expand_batch_dim=False)
+    if self.shift is not None:
+      y += self.shift
+    return y
+
+  def _inverse(self, y):
+    x = y
+    if self.shift is not None:
+      x -= self.shift
+    if self.scale is not None:
+      x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
+          x, expand_batch_dim=False)
+      # Solve fails if the op is singular so we may safely skip this assertion.
+      x = self.scale.solve(x)
+      x = self._shaper.undo_make_batch_of_event_sample_matrices(
+          x, sample_shape, expand_batch_dim=False)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(y)
+
+  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
+    if self.scale is None:
+      return constant_op.constant(0, dtype=x.dtype.base_dtype)
+    with ops.control_dependencies(self._maybe_collect_assertions() if
+                                  self.validate_args else []):
+      return self.scale.log_abs_determinant()
+
+  def _maybe_collect_assertions(self):
+    try:
+      return [self.scale.assert_non_singular()]
+    except NotImplementedError:
+      pass
+    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
deleted file mode 100644
index 89043b1410370074f11f2cfa59b6b6663fa62521..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/affine_linear_operator_impl.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""AffineLinearOperator bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.shape import _DistributionShape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.linalg import linear_operator
-
-
-__all__ = [
-    "AffineLinearOperator",
-]
-
-
-class AffineLinearOperator(bijector.Bijector):
-  """Compute `Y = g(X; shift, scale) = scale @ X + shift`.
-
-  `shift` is a numeric `Tensor` and `scale` is a `LinearOperator`.
-
-  If `X` is a scalar then the forward transformation is: `scale * X + shift`
-  where `*` denotes the scalar product.
-
-  Note: we don't always simply transpose `X` (but write it this way for
-  brevity). Actually the input `X` undergoes the following transformation
-  before being premultiplied by `scale`:
-
-  1. If there are no sample dims, we call `X = tf.expand_dims(X, 0)`, i.e.,
-     `new_sample_shape = [1]`. Otherwise do nothing.
-  2. The sample shape is flattened to have one dimension, i.e.,
-     `new_sample_shape = [n]` where `n = tf.reduce_prod(old_sample_shape)`.
-  3. The sample dim is cyclically rotated left by 1, i.e.,
-     `new_shape = [B1,...,Bb, k, n]` where `n` is as above, `k` is the
-     event_shape, and `B1,...,Bb` are the batch shapes for each of `b` batch
-     dimensions.
-
-  (For more details see `shape.make_batch_of_event_sample_matrices`.)
-
-  The result of the above transformation is that `X` can be regarded as a batch
-  of matrices where each column is a draw from the distribution. After
-  premultiplying by `scale`, we take the inverse of this procedure. The input
-  `Y` also undergoes the same transformation before/after premultiplying by
-  `inv(scale)`.
-
-  Example Use:
-
-  ```python
-  linalg = tf.linalg
-
-  x = [1., 2, 3]
-
-  shift = [-1., 0., 1]
-  diag = [1., 2, 3]
-  scale = linalg.LinearOperatorDiag(diag)
-  affine = AffineLinearOperator(shift, scale)
-  # In this case, `forward` is equivalent to:
-  # y = scale @ x + shift
-  y = affine.forward(x)  # [0., 4, 10]
-
-  shift = [2., 3, 1]
-  tril = [[1., 0, 0],
-          [2, 1, 0],
-          [3, 2, 1]]
-  scale = linalg.LinearOperatorLowerTriangular(tril)
-  affine = AffineLinearOperator(shift, scale)
-  # In this case, `forward` is equivalent to:
-  # np.squeeze(np.matmul(tril, np.expand_dims(x, -1)), -1) + shift
-  y = affine.forward(x)  # [3., 7, 11]
-  ```
-
-  """
-
-  def __init__(self,
-               shift=None,
-               scale=None,
-               event_ndims=1,
-               validate_args=False,
-               name="affine_linear_operator"):
-    """Instantiates the `AffineLinearOperator` bijector.
-
-    Args:
-      shift: Floating-point `Tensor`.
-      scale:  Subclass of `LinearOperator`. Represents the (batch) positive
-        definite matrix `M` in `R^{k x k}`.
-      event_ndims: Scalar `integer` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution. Must be 0 or 1.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `event_ndims` is not 0 or 1.
-      TypeError: if `scale` is not a `LinearOperator`.
-      TypeError: if `shift.dtype` does not match `scale.dtype`.
-      ValueError: if not `scale.is_non_singular`.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    graph_parents = []
-    with self._name_scope("init", values=[shift]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      if tensor_util.constant_value(event_ndims) is not None:
-        event_ndims = tensor_util.constant_value(event_ndims)
-        if event_ndims not in (0, 1):
-          raise ValueError("event_ndims({}) was not 0 or 1".format(event_ndims))
-      else:
-        if validate_args:
-          # Shape tool will catch if event_ndims is negative.
-          event_ndims = control_flow_ops.with_dependencies(
-              [check_ops.assert_less(
-                  event_ndims, 2, message="event_ndims must be 0 or 1")],
-              event_ndims)
-        graph_parents += [event_ndims]
-
-      # In the absence of `loc` and `scale`, we'll assume `dtype` is `float32`.
-      dtype = dtypes.float32
-
-      if shift is not None:
-        shift = ops.convert_to_tensor(shift, name="shift")
-        graph_parents += [shift]
-        dtype = shift.dtype.base_dtype
-      self._shift = shift
-
-      if scale is not None:
-        if (shift is not None and
-            shift.dtype.base_dtype != scale.dtype.base_dtype):
-          raise TypeError(
-              "shift.dtype({}) is incompatible with scale.dtype({}).".format(
-                  shift.dtype, scale.dtype))
-        if not isinstance(scale, linear_operator.LinearOperator):
-          raise TypeError("scale is not an instance of tf.LinearOperator")
-        if validate_args and not scale.is_non_singular:
-          raise ValueError("Scale matrix must be non-singular.")
-        graph_parents += scale.graph_parents
-        if scale.tensor_rank is not None:
-          batch_ndims = scale.tensor_rank - 2
-        else:
-          batch_ndims = scale.tensor_rank_tensor() - 2
-          graph_parents += [batch_ndims]
-        if scale.dtype is not None:
-          dtype = scale.dtype.base_dtype
-      else:
-        batch_ndims = 0  # We won't need shape inference when scale is None.
-      self._scale = scale
-      self._shaper = _DistributionShape(
-          batch_ndims=batch_ndims,
-          event_ndims=event_ndims,
-          validate_args=validate_args)
-      super(AffineLinearOperator, self).__init__(
-          event_ndims=event_ndims,
-          graph_parents=graph_parents,
-          is_constant_jacobian=True,
-          dtype=dtype,
-          validate_args=validate_args,
-          name=name)
-
-  @property
-  def shift(self):
-    """The `shift` `Tensor` in `Y = scale @ X + shift`."""
-    return self._shift
-
-  @property
-  def scale(self):
-    """The `scale` `LinearOperator` in `Y = scale @ X + shift`."""
-    return self._scale
-
-  def _forward(self, x):
-    y = x
-    if self.scale is not None:
-      y, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-          y, expand_batch_dim=False)
-      with ops.control_dependencies(self._maybe_collect_assertions() if
-                                    self.validate_args else []):
-        y = self.scale.matmul(y)
-      y = self._shaper.undo_make_batch_of_event_sample_matrices(
-          y, sample_shape, expand_batch_dim=False)
-    if self.shift is not None:
-      y += self.shift
-    return y
-
-  def _inverse(self, y):
-    x = y
-    if self.shift is not None:
-      x -= self.shift
-    if self.scale is not None:
-      x, sample_shape = self._shaper.make_batch_of_event_sample_matrices(
-          x, expand_batch_dim=False)
-      # Solve fails if the op is singular so we may safely skip this assertion.
-      x = self.scale.solve(x)
-      x = self._shaper.undo_make_batch_of_event_sample_matrices(
-          x, sample_shape, expand_batch_dim=False)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(y)
-
-  def _forward_log_det_jacobian(self, x):  # pylint: disable=unused-argument
-    if self.scale is None:
-      return constant_op.constant(0, dtype=x.dtype.base_dtype)
-    with ops.control_dependencies(self._maybe_collect_assertions() if
-                                  self.validate_args else []):
-      return self.scale.log_abs_determinant()
-
-  def _maybe_collect_assertions(self):
-    try:
-      return [self.scale.assert_non_singular()]
-    except NotImplementedError:
-      pass
-    return []
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
index 0db10fb75c8483a8209f39370362b05a03d047ca..3ce7c26213034c7345a20faa803c94a1bfa8d579 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py
@@ -18,12 +18,151 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.chain_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import itertools
 
-_allowed_symbols = ["Chain"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Chain",
+]
+
+
+class Chain(bijector.Bijector):
+  """Bijector which applies a sequence of bijectors.
+
+  Example Use:
+
+  ```python
+  chain = Chain([Exp(), Softplus()], name="one_plus_exp")
+  ```
+
+  Results in:
+
+  * Forward:
+
+   ```python
+   exp = Exp()
+   softplus = Softplus()
+   Chain([exp, softplus]).forward(x)
+   = exp.forward(softplus.forward(x))
+   = tf.exp(tf.log(1. + tf.exp(x)))
+   = 1. + tf.exp(x)
+   ```
+
+  * Inverse:
+
+   ```python
+   exp = Exp()
+   softplus = Softplus()
+   Chain([exp, softplus]).inverse(y)
+   = softplus.inverse(exp.inverse(y))
+   = tf.log(tf.exp(tf.log(y)) - 1.)
+   = tf.log(y - 1.)
+   ```
+
+  """
+
+  def __init__(self, bijectors=None, validate_args=False, name=None):
+    """Instantiates `Chain` bijector.
+
+    Args:
+      bijectors: Python `list` of bijector instances. An empty list makes this
+        bijector equivalent to the `Identity` bijector.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object. Default:
+        E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`.
+
+    Raises:
+      ValueError: if bijectors have different dtypes.
+    """
+    if bijectors is None:
+      bijectors = ()
+    self._bijectors = bijectors
+
+    for a_bijector in bijectors:
+      if not a_bijector._is_injective:  # pylint: disable=protected-access
+        raise NotImplementedError(
+            "Invert is not implemented for non-injective bijector ({})".format(
+                a_bijector.name))
+
+    dtype = list(set([b.dtype for b in bijectors]))
+    if len(dtype) > 2:
+      raise ValueError("incompatible dtypes: %s" % dtype)
+    elif len(dtype) == 2:
+      dtype = dtype[1] if dtype[0] is None else dtype[0]
+      event_ndims = bijectors[0].event_ndims
+    elif len(dtype) == 1:
+      dtype = dtype[0]
+      event_ndims = bijectors[0].event_ndims
+    else:
+      dtype = None
+      event_ndims = None
+
+    super(Chain, self).__init__(
+        graph_parents=list(itertools.chain.from_iterable(
+            b.graph_parents for b in bijectors)),
+        is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
+        validate_args=validate_args,
+        dtype=dtype,
+        event_ndims=event_ndims,
+        name=name or ("identity" if not bijectors else
+                      "_of_".join(["chain"] + [b.name for b in bijectors])))
+
+  @property
+  def bijectors(self):
+    return self._bijectors
+
+  def _shape_helper(self, func_name, input_shape, reverse):
+    new_shape = input_shape
+    for b in reversed(self.bijectors) if reverse else self.bijectors:
+      func = getattr(b, func_name, None)
+      if func is None:
+        raise ValueError("unable to call %s on bijector %s (%s)" %
+                         (func_name, b.name, func))
+      new_shape = func(new_shape)
+    return new_shape
+
+  def _forward_event_shape(self, input_shape):
+    return self._shape_helper("forward_event_shape", input_shape,
+                              reverse=True)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return self._shape_helper(
+        "forward_event_shape_tensor", input_shape, reverse=True)
+
+  def _inverse_event_shape(self, output_shape):
+    return self._shape_helper("inverse_event_shape", output_shape,
+                              reverse=False)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    return self._shape_helper("inverse_event_shape_tensor", output_shape,
+                              reverse=False)
+
+  def _inverse(self, y, **kwargs):
+    for b in self.bijectors:
+      y = b.inverse(y, **kwargs.get(b.name, {}))
+    return y
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    ildj = constant_op.constant(0., dtype=y.dtype,
+                                name="inverse_log_det_jacobian")
+    for b in self.bijectors:
+      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
+      y = b.inverse(y, **kwargs.get(b.name, {}))
+    return ildj
+
+  def _forward(self, x, **kwargs):
+    for b in reversed(self.bijectors):
+      x = b.forward(x, **kwargs.get(b.name, {}))
+    return x
+
+  def _forward_log_det_jacobian(self, x, **kwargs):
+    fldj = constant_op.constant(0., dtype=x.dtype,
+                                name="forward_log_det_jacobian")
+    for b in reversed(self.bijectors):
+      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
+      x = b.forward(x, **kwargs.get(b.name, {}))
+    return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
deleted file mode 100644
index 3ce7c26213034c7345a20faa803c94a1bfa8d579..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/chain_impl.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Chain bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import itertools
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Chain",
-]
-
-
-class Chain(bijector.Bijector):
-  """Bijector which applies a sequence of bijectors.
-
-  Example Use:
-
-  ```python
-  chain = Chain([Exp(), Softplus()], name="one_plus_exp")
-  ```
-
-  Results in:
-
-  * Forward:
-
-   ```python
-   exp = Exp()
-   softplus = Softplus()
-   Chain([exp, softplus]).forward(x)
-   = exp.forward(softplus.forward(x))
-   = tf.exp(tf.log(1. + tf.exp(x)))
-   = 1. + tf.exp(x)
-   ```
-
-  * Inverse:
-
-   ```python
-   exp = Exp()
-   softplus = Softplus()
-   Chain([exp, softplus]).inverse(y)
-   = softplus.inverse(exp.inverse(y))
-   = tf.log(tf.exp(tf.log(y)) - 1.)
-   = tf.log(y - 1.)
-   ```
-
-  """
-
-  def __init__(self, bijectors=None, validate_args=False, name=None):
-    """Instantiates `Chain` bijector.
-
-    Args:
-      bijectors: Python `list` of bijector instances. An empty list makes this
-        bijector equivalent to the `Identity` bijector.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object. Default:
-        E.g., `Chain([Exp(), Softplus()]).name == "chain_of_exp_of_softplus"`.
-
-    Raises:
-      ValueError: if bijectors have different dtypes.
-    """
-    if bijectors is None:
-      bijectors = ()
-    self._bijectors = bijectors
-
-    for a_bijector in bijectors:
-      if not a_bijector._is_injective:  # pylint: disable=protected-access
-        raise NotImplementedError(
-            "Invert is not implemented for non-injective bijector ({})".format(
-                a_bijector.name))
-
-    dtype = list(set([b.dtype for b in bijectors]))
-    if len(dtype) > 2:
-      raise ValueError("incompatible dtypes: %s" % dtype)
-    elif len(dtype) == 2:
-      dtype = dtype[1] if dtype[0] is None else dtype[0]
-      event_ndims = bijectors[0].event_ndims
-    elif len(dtype) == 1:
-      dtype = dtype[0]
-      event_ndims = bijectors[0].event_ndims
-    else:
-      dtype = None
-      event_ndims = None
-
-    super(Chain, self).__init__(
-        graph_parents=list(itertools.chain.from_iterable(
-            b.graph_parents for b in bijectors)),
-        is_constant_jacobian=all(b.is_constant_jacobian for b in bijectors),
-        validate_args=validate_args,
-        dtype=dtype,
-        event_ndims=event_ndims,
-        name=name or ("identity" if not bijectors else
-                      "_of_".join(["chain"] + [b.name for b in bijectors])))
-
-  @property
-  def bijectors(self):
-    return self._bijectors
-
-  def _shape_helper(self, func_name, input_shape, reverse):
-    new_shape = input_shape
-    for b in reversed(self.bijectors) if reverse else self.bijectors:
-      func = getattr(b, func_name, None)
-      if func is None:
-        raise ValueError("unable to call %s on bijector %s (%s)" %
-                         (func_name, b.name, func))
-      new_shape = func(new_shape)
-    return new_shape
-
-  def _forward_event_shape(self, input_shape):
-    return self._shape_helper("forward_event_shape", input_shape,
-                              reverse=True)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    return self._shape_helper(
-        "forward_event_shape_tensor", input_shape, reverse=True)
-
-  def _inverse_event_shape(self, output_shape):
-    return self._shape_helper("inverse_event_shape", output_shape,
-                              reverse=False)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    return self._shape_helper("inverse_event_shape_tensor", output_shape,
-                              reverse=False)
-
-  def _inverse(self, y, **kwargs):
-    for b in self.bijectors:
-      y = b.inverse(y, **kwargs.get(b.name, {}))
-    return y
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    ildj = constant_op.constant(0., dtype=y.dtype,
-                                name="inverse_log_det_jacobian")
-    for b in self.bijectors:
-      ildj += b.inverse_log_det_jacobian(y, **kwargs.get(b.name, {}))
-      y = b.inverse(y, **kwargs.get(b.name, {}))
-    return ildj
-
-  def _forward(self, x, **kwargs):
-    for b in reversed(self.bijectors):
-      x = b.forward(x, **kwargs.get(b.name, {}))
-    return x
-
-  def _forward_log_det_jacobian(self, x, **kwargs):
-    fldj = constant_op.constant(0., dtype=x.dtype,
-                                name="forward_log_det_jacobian")
-    for b in reversed(self.bijectors):
-      fldj += b.forward_log_det_jacobian(x, **kwargs.get(b.name, {}))
-      x = b.forward(x, **kwargs.get(b.name, {}))
-    return fldj
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
index 4686af8bc42a3232cb3a34f2cfcce8323c5896dd..cbd60f92a60612c6cf791b2c7708a3310c6e2b6b 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py
@@ -18,12 +18,219 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["CholeskyOuterProduct"]
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "CholeskyOuterProduct",
+]
+
+
+class CholeskyOuterProduct(bijector.Bijector):
+  """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
+
+  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
+
+  Note: the upper-triangular part of X is ignored (whether or not its zero).
+
+  The surjectivity of g as a map from  the set of n x n positive-diagonal
+  lower-triangular matrices to the set of SPD matrices follows immediately from
+  executing the Cholesky factorization algorithm on an SPD matrix A to produce a
+  positive-diagonal lower-triangular matrix L such that `A = L @ L.T`.
+
+  To prove the injectivity of g, suppose that L_1 and L_2 are lower-triangular
+  with positive diagonals and satisfy `A = L_1 @ L_1.T = L_2 @ L_2.T`. Then
+    `inv(L_1) @ A @ inv(L_1).T = [inv(L_1) @ L_2] @ [inv(L_1) @ L_2].T = I`.
+  Setting `L_3 := inv(L_1) @ L_2`, that L_3 is a positive-diagonal
+  lower-triangular matrix follows from `inv(L_1)` being positive-diagonal
+  lower-triangular (which follows from the diagonal of a triangular matrix being
+  its spectrum), and that the product of two positive-diagonal lower-triangular
+  matrices is another positive-diagonal lower-triangular matrix.
+
+  A simple inductive argument (proceding one column of L_3 at a time) shows
+  that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
+  diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
+
+  Examples:
+
+  ```python
+  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
+  # Result: [[1., 2], [2, 5]], i.e., x @ x.T
+
+  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
+  # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
+  ```
+
+  """
+
+  def __init__(self, event_ndims=2, validate_args=False,
+               name="cholesky_outer_product"):
+    """Instantiates the `CholeskyOuterProduct` bijector.
+
+    Args:
+      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
+        dimensions associated with a particular draw from the distribution. Must
+        be 0 or 2.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if event_ndims is neither 0 or 2.
+    """
+    self._graph_parents = []
+    self._name = name
+    with self._name_scope("init", values=[event_ndims]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims = tensor_util.constant_value(event_ndims)
+    if event_ndims is None or event_ndims not in [0, 2]:
+      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
+    self._static_event_ndims = event_ndims
+    super(CholeskyOuterProduct, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self._static_event_ndims == 0:
+      return math_ops.square(x)
+    if self.validate_args:
+      is_matrix = check_ops.assert_rank_at_least(x, 2)
+      shape = array_ops.shape(x)
+      is_square = check_ops.assert_equal(shape[-2], shape[-1])
+      x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
+    # For safety, explicitly zero-out the upper triangular part.
+    x = array_ops.matrix_band_part(x, -1, 0)
+    return math_ops.matmul(x, x, adjoint_b=True)
+
+  def _inverse(self, y):
+    return (math_ops.sqrt(y) if self._static_event_ndims == 0
+            else linalg_ops.cholesky(y))
+
+  def _inverse_log_det_jacobian(self, y):
+    return -self._forward_log_det_jacobian(x=self._inverse(y))
+
+  def _forward_log_det_jacobian(self, x):
+    # Let Y be a symmetric, positive definite matrix and write:
+    #   Y = X X.T
+    # where X is lower-triangular.
+    #
+    # Observe that,
+    #   dY[i,j]/dX[a,b]
+    #   = d/dX[a,b] { X[i,:] X[j,:] }
+    #   = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] }
+    #
+    # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is
+    # symmetric and X is lower-triangular, we need vectors of dimension:
+    #   d = p (p + 1) / 2
+    # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e.,
+    #   k = { i (i + 1) / 2 + j   i>=j
+    #       { undef               i<j
+    # and assume zero-based indexes. When k is undef, the element is dropped.
+    # Example:
+    #           j      k
+    #        0 1 2 3  /
+    #    0 [ 0 . . . ]
+    # i  1 [ 1 2 . . ]
+    #    2 [ 3 4 5 . ]
+    #    3 [ 6 7 8 9 ]
+    # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With
+    # slight abuse: k(i,j)=undef means the element is dropped.)
+    #
+    # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are
+    # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b.
+    # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since:
+    # (1) j<=i<a thus i,j!=a.
+    # (2) i=a>j  thus i,j!=a.
+    #
+    # Since the Jacobian is lower-triangular, we need only compute the product
+    # of diagonal elements:
+    #   d vec[Y] / d vec[X] @[k(i,j), k(i,j)]
+    #   = X[j,j] + I[i=j] X[i,j]
+    #   = 2 X[j,j].
+    # Since there is a 2 X[j,j] term for every lower-triangular element of X we
+    # conclude:
+    #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
+    if self._static_event_ndims == 0:
+      if self.validate_args:
+        is_positive = check_ops.assert_positive(
+            x, message="All elements must be positive.")
+        x = control_flow_ops.with_dependencies([is_positive], x)
+      return np.log(2.) + math_ops.log(x)
+
+    diag = array_ops.matrix_diag_part(x)
+
+    # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
+    # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the
+    # output is unchanged.
+    diag = self._make_columnar(diag)
+
+    if self.validate_args:
+      is_matrix = check_ops.assert_rank_at_least(
+          x, 2, message="Input must be a (batch of) matrix.")
+      shape = array_ops.shape(x)
+      is_square = check_ops.assert_equal(
+          shape[-2], shape[-1],
+          message="Input must be a (batch of) square matrix.")
+      # Assuming lower-triangular means we only need check diag>0.
+      is_positive_definite = check_ops.assert_positive(
+          diag, message="Input must be positive definite.")
+      x = control_flow_ops.with_dependencies(
+          [is_matrix, is_square, is_positive_definite], x)
+
+    # Create a vector equal to: [p, p-1, ..., 2, 1].
+    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
+      p_int = array_ops.shape(x)[-1]
+      p_float = math_ops.cast(p_int, dtype=x.dtype)
+    else:
+      p_int = x.get_shape()[-1].value
+      p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
+    exponents = math_ops.linspace(p_float, 1., p_int)
+
+    sum_weighted_log_diag = array_ops.squeeze(
+        math_ops.matmul(math_ops.log(diag),
+                        exponents[..., array_ops.newaxis]),
+        squeeze_dims=-1)
+    fldj = p_float * np.log(2.) + sum_weighted_log_diag
+
+    return fldj
+
+  def _make_columnar(self, x):
+    """Ensures non-scalar input has at least one column.
+
+    Example:
+      If `x = [1, 2, 3]` then the output is `[[1], [2], [3]]`.
+
+      If `x = [[1, 2, 3], [4, 5, 6]]` then the output is unchanged.
+
+      If `x = 1` then the output is unchanged.
+
+    Args:
+      x: `Tensor`.
+
+    Returns:
+      columnar_x: `Tensor` with at least two dimensions.
+    """
+    if x.get_shape().ndims is not None:
+      if x.get_shape().ndims == 1:
+        x = x[array_ops.newaxis, :]
+      return x
+    shape = array_ops.shape(x)
+    maybe_expanded_shape = array_ops.concat([
+        shape[:-1],
+        distribution_util.pick_vector(
+            math_ops.equal(array_ops.rank(x), 1),
+            [1], np.array([], dtype=np.int32)),
+        shape[-1:],
+    ], 0)
+    return array_ops.reshape(x, maybe_expanded_shape)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
deleted file mode 100644
index cbd60f92a60612c6cf791b2c7708a3310c6e2b6b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product_impl.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""CholeskyOuterProduct bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import linalg_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "CholeskyOuterProduct",
-]
-
-
-class CholeskyOuterProduct(bijector.Bijector):
-  """Compute `g(X) = X @ X.T`; X is lower-triangular, positive-diagonal matrix.
-
-  `event_ndims` must be 0 or 2, i.e., scalar or matrix.
-
-  Note: the upper-triangular part of X is ignored (whether or not its zero).
-
-  The surjectivity of g as a map from  the set of n x n positive-diagonal
-  lower-triangular matrices to the set of SPD matrices follows immediately from
-  executing the Cholesky factorization algorithm on an SPD matrix A to produce a
-  positive-diagonal lower-triangular matrix L such that `A = L @ L.T`.
-
-  To prove the injectivity of g, suppose that L_1 and L_2 are lower-triangular
-  with positive diagonals and satisfy `A = L_1 @ L_1.T = L_2 @ L_2.T`. Then
-    `inv(L_1) @ A @ inv(L_1).T = [inv(L_1) @ L_2] @ [inv(L_1) @ L_2].T = I`.
-  Setting `L_3 := inv(L_1) @ L_2`, that L_3 is a positive-diagonal
-  lower-triangular matrix follows from `inv(L_1)` being positive-diagonal
-  lower-triangular (which follows from the diagonal of a triangular matrix being
-  its spectrum), and that the product of two positive-diagonal lower-triangular
-  matrices is another positive-diagonal lower-triangular matrix.
-
-  A simple inductive argument (proceding one column of L_3 at a time) shows
-  that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive-
-  diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g.
-
-  Examples:
-
-  ```python
-  bijector.CholeskyOuterProduct(event_ndims=2).forward(x=[[1., 0], [2, 1]])
-  # Result: [[1., 2], [2, 5]], i.e., x @ x.T
-
-  bijector.CholeskyOuterProduct(event_ndims=2).inverse(y=[[1., 2], [2, 5]])
-  # Result: [[1., 0], [2, 1]], i.e., cholesky(y).
-  ```
-
-  """
-
-  def __init__(self, event_ndims=2, validate_args=False,
-               name="cholesky_outer_product"):
-    """Instantiates the `CholeskyOuterProduct` bijector.
-
-    Args:
-      event_ndims: `constant` `int32` scalar `Tensor` indicating the number of
-        dimensions associated with a particular draw from the distribution. Must
-        be 0 or 2.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if event_ndims is neither 0 or 2.
-    """
-    self._graph_parents = []
-    self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-    if event_ndims is None or event_ndims not in [0, 2]:
-      raise ValueError("`event_ndims` must be a TF constant which is 0 or 2")
-    self._static_event_ndims = event_ndims
-    super(CholeskyOuterProduct, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    if self._static_event_ndims == 0:
-      return math_ops.square(x)
-    if self.validate_args:
-      is_matrix = check_ops.assert_rank_at_least(x, 2)
-      shape = array_ops.shape(x)
-      is_square = check_ops.assert_equal(shape[-2], shape[-1])
-      x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
-    # For safety, explicitly zero-out the upper triangular part.
-    x = array_ops.matrix_band_part(x, -1, 0)
-    return math_ops.matmul(x, x, adjoint_b=True)
-
-  def _inverse(self, y):
-    return (math_ops.sqrt(y) if self._static_event_ndims == 0
-            else linalg_ops.cholesky(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    return -self._forward_log_det_jacobian(x=self._inverse(y))
-
-  def _forward_log_det_jacobian(self, x):
-    # Let Y be a symmetric, positive definite matrix and write:
-    #   Y = X X.T
-    # where X is lower-triangular.
-    #
-    # Observe that,
-    #   dY[i,j]/dX[a,b]
-    #   = d/dX[a,b] { X[i,:] X[j,:] }
-    #   = sum_{d=1}^p { I[i=a] I[d=b] X[j,d] + I[j=a] I[d=b] X[i,d] }
-    #
-    # To compute the Jacobian dX/dY we must represent X,Y as vectors. Since Y is
-    # symmetric and X is lower-triangular, we need vectors of dimension:
-    #   d = p (p + 1) / 2
-    # where X, Y are p x p matrices, p > 0. We use a row-major mapping, i.e.,
-    #   k = { i (i + 1) / 2 + j   i>=j
-    #       { undef               i<j
-    # and assume zero-based indexes. When k is undef, the element is dropped.
-    # Example:
-    #           j      k
-    #        0 1 2 3  /
-    #    0 [ 0 . . . ]
-    # i  1 [ 1 2 . . ]
-    #    2 [ 3 4 5 . ]
-    #    3 [ 6 7 8 9 ]
-    # Write vec[.] to indicate transforming a matrix to vector via k(i,j). (With
-    # slight abuse: k(i,j)=undef means the element is dropped.)
-    #
-    # We now show d vec[Y] / d vec[X] is lower triangular. Assuming both are
-    # defined, observe that k(i,j) < k(a,b) iff (1) i<a or (2) i=a and j<b.
-    # In both cases dvec[Y]/dvec[X]@[k(i,j),k(a,b)] = 0 since:
-    # (1) j<=i<a thus i,j!=a.
-    # (2) i=a>j  thus i,j!=a.
-    #
-    # Since the Jacobian is lower-triangular, we need only compute the product
-    # of diagonal elements:
-    #   d vec[Y] / d vec[X] @[k(i,j), k(i,j)]
-    #   = X[j,j] + I[i=j] X[i,j]
-    #   = 2 X[j,j].
-    # Since there is a 2 X[j,j] term for every lower-triangular element of X we
-    # conclude:
-    #   |Jac(d vec[Y]/d vec[X])| = 2^p prod_{j=0}^{p-1} X[j,j]^{p-j}.
-    if self._static_event_ndims == 0:
-      if self.validate_args:
-        is_positive = check_ops.assert_positive(
-            x, message="All elements must be positive.")
-        x = control_flow_ops.with_dependencies([is_positive], x)
-      return np.log(2.) + math_ops.log(x)
-
-    diag = array_ops.matrix_diag_part(x)
-
-    # We now ensure diag is columnar. Eg, if `diag = [1, 2, 3]` then the output
-    # is `[[1], [2], [3]]` and if `diag = [[1, 2, 3], [4, 5, 6]]` then the
-    # output is unchanged.
-    diag = self._make_columnar(diag)
-
-    if self.validate_args:
-      is_matrix = check_ops.assert_rank_at_least(
-          x, 2, message="Input must be a (batch of) matrix.")
-      shape = array_ops.shape(x)
-      is_square = check_ops.assert_equal(
-          shape[-2], shape[-1],
-          message="Input must be a (batch of) square matrix.")
-      # Assuming lower-triangular means we only need check diag>0.
-      is_positive_definite = check_ops.assert_positive(
-          diag, message="Input must be positive definite.")
-      x = control_flow_ops.with_dependencies(
-          [is_matrix, is_square, is_positive_definite], x)
-
-    # Create a vector equal to: [p, p-1, ..., 2, 1].
-    if x.get_shape().ndims is None or x.get_shape()[-1].value is None:
-      p_int = array_ops.shape(x)[-1]
-      p_float = math_ops.cast(p_int, dtype=x.dtype)
-    else:
-      p_int = x.get_shape()[-1].value
-      p_float = np.array(p_int, dtype=x.dtype.as_numpy_dtype)
-    exponents = math_ops.linspace(p_float, 1., p_int)
-
-    sum_weighted_log_diag = array_ops.squeeze(
-        math_ops.matmul(math_ops.log(diag),
-                        exponents[..., array_ops.newaxis]),
-        squeeze_dims=-1)
-    fldj = p_float * np.log(2.) + sum_weighted_log_diag
-
-    return fldj
-
-  def _make_columnar(self, x):
-    """Ensures non-scalar input has at least one column.
-
-    Example:
-      If `x = [1, 2, 3]` then the output is `[[1], [2], [3]]`.
-
-      If `x = [[1, 2, 3], [4, 5, 6]]` then the output is unchanged.
-
-      If `x = 1` then the output is unchanged.
-
-    Args:
-      x: `Tensor`.
-
-    Returns:
-      columnar_x: `Tensor` with at least two dimensions.
-    """
-    if x.get_shape().ndims is not None:
-      if x.get_shape().ndims == 1:
-        x = x[array_ops.newaxis, :]
-      return x
-    shape = array_ops.shape(x)
-    maybe_expanded_shape = array_ops.concat([
-        shape[:-1],
-        distribution_util.pick_vector(
-            math_ops.equal(array_ops.rank(x), 1),
-            [1], np.array([], dtype=np.int32)),
-        shape[-1:],
-    ], 0)
-    return array_ops.reshape(x, maybe_expanded_shape)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
index d254b635d28099a09a2054536f04ffee3a355b2f..ccb1f029277bc07011df7be047a075274f2b3a27 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector.py
@@ -18,12 +18,38 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-_allowed_symbols = ["ConditionalBijector"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = ["ConditionalBijector"]
+
+
+class ConditionalBijector(bijector.Bijector):
+  """Conditional Bijector is a Bijector that allows intrinsic conditioning."""
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def forward(self, x, name="forward", **condition_kwargs):
+    return self._call_forward(x, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def inverse(self, y, name="inverse", **condition_kwargs):
+    return self._call_inverse(y, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def inverse_log_det_jacobian(
+      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
+    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
+
+  @distribution_util.AppendDocstring(kwargs_dict={
+      "**condition_kwargs":
+      "Named arguments forwarded to subclass implementation."})
+  def forward_log_det_jacobian(
+      self, x, name="forward_log_det_jacobian", **condition_kwargs):
+    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
deleted file mode 100644
index ccb1f029277bc07011df7be047a075274f2b3a27..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/conditional_bijector_impl.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""ConditionalBijector base."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = ["ConditionalBijector"]
-
-
-class ConditionalBijector(bijector.Bijector):
-  """Conditional Bijector is a Bijector that allows intrinsic conditioning."""
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def forward(self, x, name="forward", **condition_kwargs):
-    return self._call_forward(x, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def inverse(self, y, name="inverse", **condition_kwargs):
-    return self._call_inverse(y, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def inverse_log_det_jacobian(
-      self, y, name="inverse_log_det_jacobian", **condition_kwargs):
-    return self._call_inverse_log_det_jacobian(y, name, **condition_kwargs)
-
-  @distribution_util.AppendDocstring(kwargs_dict={
-      "**condition_kwargs":
-      "Named arguments forwarded to subclass implementation."})
-  def forward_log_det_jacobian(
-      self, x, name="forward_log_det_jacobian", **condition_kwargs):
-    return self._call_forward_log_det_jacobian(x, name, **condition_kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
index 399d713098eb7223601beb9518dc51dd6160ad64..b1ff840d62a73c941a4d67dec73b5c9f4d5353f9 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/exp.py
@@ -18,12 +18,49 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.exp_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.bijectors import power_transform
 
-_allowed_symbols = ["Exp"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Exp",
+]
+
+
+class Exp(power_transform.PowerTransform):
+  """Compute `Y = g(X) = exp(X)`.
+
+    Example Use:
+
+    ```python
+    # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
+    # batch ndim and 2 event ndims (i.e., vector of matrices).
+    exp = Exp(event_ndims=2)
+    x = [[[1., 2],
+           [3, 4]],
+          [[5, 6],
+           [7, 8]]]
+    exp(x) == exp.forward(x)
+    log(x) == exp.inverse(x)
+    ```
+
+    Note: the exp(.) is applied element-wise but the Jacobian is a reduction
+    over the event space.
+  """
+
+  def __init__(self,
+               event_ndims=0,
+               validate_args=False,
+               name="exp"):
+    """Instantiates the `Exp` bijector.
+
+    Args:
+      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
+        associated with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    super(Exp, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py
deleted file mode 100644
index b1ff840d62a73c941a4d67dec73b5c9f4d5353f9..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/exp_impl.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Exp bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distributions.python.ops.bijectors import power_transform
-
-
-__all__ = [
-    "Exp",
-]
-
-
-class Exp(power_transform.PowerTransform):
-  """Compute `Y = g(X) = exp(X)`.
-
-    Example Use:
-
-    ```python
-    # Create the Y=g(X)=exp(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    exp = Exp(event_ndims=2)
-    x = [[[1., 2],
-           [3, 4]],
-          [[5, 6],
-           [7, 8]]]
-    exp(x) == exp.forward(x)
-    log(x) == exp.inverse(x)
-    ```
-
-    Note: the exp(.) is applied element-wise but the Jacobian is a reduction
-    over the event space.
-  """
-
-  def __init__(self,
-               event_ndims=0,
-               validate_args=False,
-               name="exp"):
-    """Instantiates the `Exp` bijector.
-
-    Args:
-      event_ndims: Scalar `int32` `Tensor` indicating the number of dimensions
-        associated with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    super(Exp, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
index cf37aa51115ed98ab263bc03bcb297a03432a7ae..67f39785563255be0fe154aca3cbcf01c6a01e73 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel.py
@@ -18,12 +18,107 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.gumbel_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Gumbel"]
+__all__ = [
+    "Gumbel",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class Gumbel(bijector.Bijector):
+  """Compute `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+
+  This bijector maps inputs from `[-inf, inf]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the
+  [Gumbel distribution](https://en.wikipedia.org/wiki/Gumbel_distribution):
+
+  ```none
+  Y ~ Gumbel(loc, scale)
+  pdf(y; loc, scale) = exp(
+    -( (y - loc) / scale + exp(- (y - loc) / scale) ) ) / scale
+  ```
+  """
+
+  def __init__(self,
+               loc=0.,
+               scale=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="gumbel"):
+    """Instantiates the `Gumbel` bijector.
+
+    Args:
+      loc: Float-like `Tensor` that is the same dtype and is
+        broadcastable with `scale`.
+        This is `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+      scale: Positive Float-like `Tensor` that is the same dtype and is
+        broadcastable with `loc`.
+        This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[loc, scale]):
+      self._loc = ops.convert_to_tensor(loc, name="loc")
+      self._scale = ops.convert_to_tensor(scale, name="scale")
+      check_ops.assert_same_float_dtype([self._loc, self._scale])
+      if validate_args:
+        self._scale = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._scale, message="Argument scale was not positive")
+        ], self._scale)
+
+    super(Gumbel, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def loc(self):
+    """The `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
+    return self._loc
+
+  @property
+  def scale(self):
+    """This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
+    return self._scale
+
+  def _forward(self, x):
+    z = (x - self.loc) / self.scale
+    return math_ops.exp(-math_ops.exp(-z))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return self.loc - self.scale * math_ops.log(-math_ops.log(y))
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    event_dims = self._event_dims_tensor(x)
+    z = (x - self.loc) / self.scale
+    return math_ops.reduce_sum(
+        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_positive = check_ops.assert_non_negative(
+        y, message="Inverse transformation input must be greater than 0.")
+    less_than_one = check_ops.assert_less_equal(
+        y,
+        constant_op.constant(1., y.dtype),
+        message="Inverse transformation input must be less than or equal to 1.")
+    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py
deleted file mode 100644
index 67f39785563255be0fe154aca3cbcf01c6a01e73..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/gumbel_impl.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Gumbel bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "Gumbel",
-]
-
-
-class Gumbel(bijector.Bijector):
-  """Compute `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-
-  This bijector maps inputs from `[-inf, inf]` to [0, 1]`. The inverse of the
-  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
-  random variable with the
-  [Gumbel distribution](https://en.wikipedia.org/wiki/Gumbel_distribution):
-
-  ```none
-  Y ~ Gumbel(loc, scale)
-  pdf(y; loc, scale) = exp(
-    -( (y - loc) / scale + exp(- (y - loc) / scale) ) ) / scale
-  ```
-  """
-
-  def __init__(self,
-               loc=0.,
-               scale=1.,
-               event_ndims=0,
-               validate_args=False,
-               name="gumbel"):
-    """Instantiates the `Gumbel` bijector.
-
-    Args:
-      loc: Float-like `Tensor` that is the same dtype and is
-        broadcastable with `scale`.
-        This is `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      scale: Positive Float-like `Tensor` that is the same dtype and is
-        broadcastable with `loc`.
-        This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[loc, scale]):
-      self._loc = ops.convert_to_tensor(loc, name="loc")
-      self._scale = ops.convert_to_tensor(scale, name="scale")
-      check_ops.assert_same_float_dtype([self._loc, self._scale])
-      if validate_args:
-        self._scale = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._scale, message="Argument scale was not positive")
-        ], self._scale)
-
-    super(Gumbel, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
-
-  @property
-  def loc(self):
-    """The `loc` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
-    return self._loc
-
-  @property
-  def scale(self):
-    """This is `scale` in `Y = g(X) = exp(-exp(-(X - loc) / scale))`."""
-    return self._scale
-
-  def _forward(self, x):
-    z = (x - self.loc) / self.scale
-    return math_ops.exp(-math_ops.exp(-z))
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    return self.loc - self.scale * math_ops.log(-math_ops.log(y))
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        math_ops.log(self.scale / (-math_ops.log(y) * y)), axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    event_dims = self._event_dims_tensor(x)
-    z = (x - self.loc) / self.scale
-    return math_ops.reduce_sum(
-        -z - math_ops.exp(-z) - math_ops.log(self.scale), axis=event_dims)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_positive = check_ops.assert_non_negative(
-        y, message="Inverse transformation input must be greater than 0.")
-    less_than_one = check_ops.assert_less_equal(
-        y,
-        constant_op.constant(1., y.dtype),
-        message="Inverse transformation input must be less than or equal to 1.")
-    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
index db10c3fc3a9135b4c408ada74622ba9b360f9ec1..fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/inline.py
@@ -18,12 +18,124 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.inline_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Inline"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Inline",
+]
+
+
+class Inline(bijector.Bijector):
+  """Bijector constructed from custom callables.
+
+  Example Use:
+
+  ```python
+  exp = Inline(
+    forward_fn=tf.exp,
+    inverse_fn=tf.log,
+    inverse_log_det_jacobian_fn=(
+      lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
+    name="exp")
+  ```
+
+  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
+  """
+
+  def __init__(self,
+               forward_fn=None,
+               inverse_fn=None,
+               inverse_log_det_jacobian_fn=None,
+               forward_log_det_jacobian_fn=None,
+               forward_event_shape_fn=None,
+               forward_event_shape_tensor_fn=None,
+               inverse_event_shape_fn=None,
+               inverse_event_shape_tensor_fn=None,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name="inline"):
+    """Creates a `Bijector` from callables.
+
+    Args:
+      forward_fn: Python callable implementing the forward transformation.
+      inverse_fn: Python callable implementing the inverse transformation.
+      inverse_log_det_jacobian_fn: Python callable implementing the
+        log o det o jacobian of the inverse transformation.
+      forward_log_det_jacobian_fn: Python callable implementing the
+        log o det o jacobian of the forward transformation.
+      forward_event_shape_fn: Python callable implementing non-identical
+        static event shape changes. Default: shape is assumed unchanged.
+      forward_event_shape_tensor_fn: Python callable implementing non-identical
+        event shape changes. Default: shape is assumed unchanged.
+      inverse_event_shape_fn: Python callable implementing non-identical
+        static event shape changes. Default: shape is assumed unchanged.
+      inverse_event_shape_tensor_fn: Python callable implementing non-identical
+        event shape changes. Default: shape is assumed unchanged.
+      is_constant_jacobian: Python `bool` indicating that the Jacobian is
+        constant for all input arguments.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+    super(Inline, self).__init__(
+        event_ndims=0,
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+    self._forward_fn = forward_fn
+    self._inverse_fn = inverse_fn
+    self._inverse_log_det_jacobian_fn = inverse_log_det_jacobian_fn
+    self._forward_log_det_jacobian_fn = forward_log_det_jacobian_fn
+    self._forward_event_shape_fn = forward_event_shape_fn
+    self._forward_event_shape_tensor_fn = forward_event_shape_tensor_fn
+    self._inverse_event_shape_fn = inverse_event_shape_fn
+    self._inverse_event_shape_tensor_fn = inverse_event_shape_tensor_fn
+
+  def _forward_event_shape(self, input_shape):
+    if self._forward_event_shape_fn is None:
+      # By default assume shape doesn't change.
+      return input_shape
+    return self._forward_event_shape_fn(input_shape)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    if self._forward_event_shape_tensor_fn is None:
+      # By default assume shape doesn't change.
+      return input_shape
+    return self._forward_event_shape_tensor_fn(input_shape)
+
+  def _inverse_event_shape(self, output_shape):
+    if self._inverse_event_shape_fn is None:
+      # By default assume shape doesn't change.
+      return output_shape
+    return self._inverse_event_shape_fn(output_shape)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    if self._inverse_event_shape_tensor_fn is None:
+      # By default assume shape doesn't change.
+      return output_shape
+    return self._inverse_event_shape_tensor_fn(output_shape)
+
+  def _forward(self, x, **kwargs):
+    if not callable(self._forward_fn):
+      raise NotImplementedError(
+          "forward_fn is not a callable function.")
+    return self._forward_fn(x, **kwargs)
+
+  def _inverse(self, y, **kwargs):
+    if not callable(self._inverse_fn):
+      raise NotImplementedError(
+          "inverse_fn is not a callable function.")
+    return self._inverse_fn(y, **kwargs)
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    if not callable(self._inverse_log_det_jacobian_fn):
+      raise NotImplementedError(
+          "inverse_log_det_jacobian_fn is not a callable function.")
+    return self._inverse_log_det_jacobian_fn(y, **kwargs)
+
+  def _forward_log_det_jacobian(self, y, **kwargs):
+    if not callable(self._forward_log_det_jacobian_fn):
+      raise NotImplementedError(
+          "forward_log_det_jacobian_fn is not a callable function.")
+    return self._forward_log_det_jacobian_fn(y, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
deleted file mode 100644
index fab1b22fbf92e7b92a5ec86ec62d66bec71a8c94..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/inline_impl.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Inline bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Inline",
-]
-
-
-class Inline(bijector.Bijector):
-  """Bijector constructed from custom callables.
-
-  Example Use:
-
-  ```python
-  exp = Inline(
-    forward_fn=tf.exp,
-    inverse_fn=tf.log,
-    inverse_log_det_jacobian_fn=(
-      lambda y: -tf.reduce_sum(tf.log(y), axis=-1)),
-    name="exp")
-  ```
-
-  The above example is equivalent to the `Bijector` `Exp(event_ndims=1)`.
-  """
-
-  def __init__(self,
-               forward_fn=None,
-               inverse_fn=None,
-               inverse_log_det_jacobian_fn=None,
-               forward_log_det_jacobian_fn=None,
-               forward_event_shape_fn=None,
-               forward_event_shape_tensor_fn=None,
-               inverse_event_shape_fn=None,
-               inverse_event_shape_tensor_fn=None,
-               is_constant_jacobian=False,
-               validate_args=False,
-               name="inline"):
-    """Creates a `Bijector` from callables.
-
-    Args:
-      forward_fn: Python callable implementing the forward transformation.
-      inverse_fn: Python callable implementing the inverse transformation.
-      inverse_log_det_jacobian_fn: Python callable implementing the
-        log o det o jacobian of the inverse transformation.
-      forward_log_det_jacobian_fn: Python callable implementing the
-        log o det o jacobian of the forward transformation.
-      forward_event_shape_fn: Python callable implementing non-identical
-        static event shape changes. Default: shape is assumed unchanged.
-      forward_event_shape_tensor_fn: Python callable implementing non-identical
-        event shape changes. Default: shape is assumed unchanged.
-      inverse_event_shape_fn: Python callable implementing non-identical
-        static event shape changes. Default: shape is assumed unchanged.
-      inverse_event_shape_tensor_fn: Python callable implementing non-identical
-        event shape changes. Default: shape is assumed unchanged.
-      is_constant_jacobian: Python `bool` indicating that the Jacobian is
-        constant for all input arguments.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-    super(Inline, self).__init__(
-        event_ndims=0,
-        is_constant_jacobian=is_constant_jacobian,
-        validate_args=validate_args,
-        name=name)
-    self._forward_fn = forward_fn
-    self._inverse_fn = inverse_fn
-    self._inverse_log_det_jacobian_fn = inverse_log_det_jacobian_fn
-    self._forward_log_det_jacobian_fn = forward_log_det_jacobian_fn
-    self._forward_event_shape_fn = forward_event_shape_fn
-    self._forward_event_shape_tensor_fn = forward_event_shape_tensor_fn
-    self._inverse_event_shape_fn = inverse_event_shape_fn
-    self._inverse_event_shape_tensor_fn = inverse_event_shape_tensor_fn
-
-  def _forward_event_shape(self, input_shape):
-    if self._forward_event_shape_fn is None:
-      # By default assume shape doesn't change.
-      return input_shape
-    return self._forward_event_shape_fn(input_shape)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    if self._forward_event_shape_tensor_fn is None:
-      # By default assume shape doesn't change.
-      return input_shape
-    return self._forward_event_shape_tensor_fn(input_shape)
-
-  def _inverse_event_shape(self, output_shape):
-    if self._inverse_event_shape_fn is None:
-      # By default assume shape doesn't change.
-      return output_shape
-    return self._inverse_event_shape_fn(output_shape)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    if self._inverse_event_shape_tensor_fn is None:
-      # By default assume shape doesn't change.
-      return output_shape
-    return self._inverse_event_shape_tensor_fn(output_shape)
-
-  def _forward(self, x, **kwargs):
-    if not callable(self._forward_fn):
-      raise NotImplementedError(
-          "forward_fn is not a callable function.")
-    return self._forward_fn(x, **kwargs)
-
-  def _inverse(self, y, **kwargs):
-    if not callable(self._inverse_fn):
-      raise NotImplementedError(
-          "inverse_fn is not a callable function.")
-    return self._inverse_fn(y, **kwargs)
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    if not callable(self._inverse_log_det_jacobian_fn):
-      raise NotImplementedError(
-          "inverse_log_det_jacobian_fn is not a callable function.")
-    return self._inverse_log_det_jacobian_fn(y, **kwargs)
-
-  def _forward_log_det_jacobian(self, y, **kwargs):
-    if not callable(self._forward_log_det_jacobian_fn):
-      raise NotImplementedError(
-          "forward_log_det_jacobian_fn is not a callable function.")
-    return self._forward_log_det_jacobian_fn(y, **kwargs)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
index c134e10109ce5065eb58de1d847e3c487258954c..2c603fe61f36dd27f4984fe6c13c11f2fb534321 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/invert.py
@@ -18,12 +18,85 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.invert_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-_allowed_symbols = ["Invert"]
+__all__ = [
+    "Invert",
+]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class Invert(bijector_lib.Bijector):
+  """Bijector which inverts another Bijector.
+
+  Example Use: [ExpGammaDistribution (see Background & Context)](
+  https://reference.wolfram.com/language/ref/ExpGammaDistribution.html)
+  models `Y=log(X)` where `X ~ Gamma`.
+
+  ```python
+  exp_gamma_distribution = TransformedDistribution(
+    distribution=Gamma(concentration=1., rate=2.),
+    bijector=bijector.Invert(bijector.Exp())
+  ```
+
+  """
+
+  def __init__(self, bijector, validate_args=False, name=None):
+    """Creates a `Bijector` which swaps the meaning of `inverse` and `forward`.
+
+    Note: An inverted bijector's `inverse_log_det_jacobian` is often more
+    efficient if the base bijector implements `_forward_log_det_jacobian`. If
+    `_forward_log_det_jacobian` is not implemented then the following code is
+    used:
+
+    ```python
+    y = self.inverse(x, **kwargs)
+    return -self.inverse_log_det_jacobian(y, **kwargs)
+    ```
+
+    Args:
+      bijector: Bijector instance.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+
+    if not bijector._is_injective:  # pylint: disable=protected-access
+      raise NotImplementedError(
+          "Invert is not implemented for non-injective bijectors.")
+
+    self._bijector = bijector
+    super(Invert, self).__init__(
+        event_ndims=bijector.event_ndims,
+        graph_parents=bijector.graph_parents,
+        is_constant_jacobian=bijector.is_constant_jacobian,
+        validate_args=validate_args,
+        dtype=bijector.dtype,
+        name=name or "_".join(["invert", bijector.name]))
+
+  def _forward_event_shape(self, input_shape):
+    return self.bijector._inverse_event_shape(input_shape)  # pylint: disable=protected-access
+
+  def _forward_event_shape_tensor(self, input_shape):
+    return self.bijector._inverse_event_shape_tensor(input_shape)  # pylint: disable=protected-access
+
+  def _inverse_event_shape(self, output_shape):
+    return self.bijector._forward_event_shape(output_shape)  # pylint: disable=protected-access
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    return self.bijector._forward_event_shape_tensor(output_shape)  # pylint: disable=protected-access
+
+  @property
+  def bijector(self):
+    return self._bijector
+
+  def _forward(self, x, **kwargs):
+    return self.bijector._inverse(x, **kwargs)  # pylint: disable=protected-access
+
+  def _inverse(self, y, **kwargs):
+    return self.bijector._forward(y, **kwargs)  # pylint: disable=protected-access
+
+  def _inverse_log_det_jacobian(self, y, **kwargs):
+    return self.bijector._forward_log_det_jacobian(y, **kwargs)  # pylint: disable=protected-access
+
+  def _forward_log_det_jacobian(self, x, **kwargs):
+    return self.bijector._inverse_log_det_jacobian(x, **kwargs)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
deleted file mode 100644
index 2c603fe61f36dd27f4984fe6c13c11f2fb534321..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/invert_impl.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Invert bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-__all__ = [
-    "Invert",
-]
-
-
-class Invert(bijector_lib.Bijector):
-  """Bijector which inverts another Bijector.
-
-  Example Use: [ExpGammaDistribution (see Background & Context)](
-  https://reference.wolfram.com/language/ref/ExpGammaDistribution.html)
-  models `Y=log(X)` where `X ~ Gamma`.
-
-  ```python
-  exp_gamma_distribution = TransformedDistribution(
-    distribution=Gamma(concentration=1., rate=2.),
-    bijector=bijector.Invert(bijector.Exp())
-  ```
-
-  """
-
-  def __init__(self, bijector, validate_args=False, name=None):
-    """Creates a `Bijector` which swaps the meaning of `inverse` and `forward`.
-
-    Note: An inverted bijector's `inverse_log_det_jacobian` is often more
-    efficient if the base bijector implements `_forward_log_det_jacobian`. If
-    `_forward_log_det_jacobian` is not implemented then the following code is
-    used:
-
-    ```python
-    y = self.inverse(x, **kwargs)
-    return -self.inverse_log_det_jacobian(y, **kwargs)
-    ```
-
-    Args:
-      bijector: Bijector instance.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-
-    if not bijector._is_injective:  # pylint: disable=protected-access
-      raise NotImplementedError(
-          "Invert is not implemented for non-injective bijectors.")
-
-    self._bijector = bijector
-    super(Invert, self).__init__(
-        event_ndims=bijector.event_ndims,
-        graph_parents=bijector.graph_parents,
-        is_constant_jacobian=bijector.is_constant_jacobian,
-        validate_args=validate_args,
-        dtype=bijector.dtype,
-        name=name or "_".join(["invert", bijector.name]))
-
-  def _forward_event_shape(self, input_shape):
-    return self.bijector._inverse_event_shape(input_shape)  # pylint: disable=protected-access
-
-  def _forward_event_shape_tensor(self, input_shape):
-    return self.bijector._inverse_event_shape_tensor(input_shape)  # pylint: disable=protected-access
-
-  def _inverse_event_shape(self, output_shape):
-    return self.bijector._forward_event_shape(output_shape)  # pylint: disable=protected-access
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    return self.bijector._forward_event_shape_tensor(output_shape)  # pylint: disable=protected-access
-
-  @property
-  def bijector(self):
-    return self._bijector
-
-  def _forward(self, x, **kwargs):
-    return self.bijector._inverse(x, **kwargs)  # pylint: disable=protected-access
-
-  def _inverse(self, y, **kwargs):
-    return self.bijector._forward(y, **kwargs)  # pylint: disable=protected-access
-
-  def _inverse_log_det_jacobian(self, y, **kwargs):
-    return self.bijector._forward_log_det_jacobian(y, **kwargs)  # pylint: disable=protected-access
-
-  def _forward_log_det_jacobian(self, x, **kwargs):
-    return self.bijector._inverse_log_det_jacobian(x, **kwargs)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
index 132dc570f94719b6c71fb269866c943774481b7e..06c7c61ec3dc3980e0d12a984739dca5a925ac9f 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py
@@ -18,16 +18,459 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = [
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import template as template_ops
+from tensorflow.python.ops import variable_scope as variable_scope_lib
+from tensorflow.python.ops.distributions import bijector as bijector_lib
+
+
+__all__ = [
     "MaskedAutoregressiveFlow",
-    "masked_dense",
     "masked_autoregressive_default_template",
+    "masked_dense",
 ]
 
-remove_undocumented(__name__, _allowed_symbols)
+
+class MaskedAutoregressiveFlow(bijector_lib.Bijector):
+  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
+
+  The affine autoregressive flow [1] provides a relatively simple framework for
+  user-specified (deep) architectures to learn a distribution over vector-valued
+  events. Regarding terminology,
+
+    "Autoregressive models decompose the joint density as a product of
+    conditionals, and model each conditional in turn. Normalizing flows
+    transform a base density (e.g. a standard Gaussian) into the target density
+    by an invertible transformation with tractable Jacobian." [1]
+
+  In other words, the "autoregressive property" is equivalent to the
+  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
+  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
+  this property by zeroing out weights in its `masked_dense` layers.
+
+  In the `tf.distributions` framework, a "normalizing flow" is implemented as a
+  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
+  is implemented using a `tf.while_loop` and a deep neural network (DNN) with
+  masked weights such that the autoregressive property is automatically met in
+  the `inverse`.
+
+  A `TransformedDistribution` using `MaskedAutoregressiveFlow(...)` uses the
+  (expensive) forward-mode calculation to draw samples and the (cheap)
+  reverse-mode calculation to compute log-probabilities. Conversely, a
+  `TransformedDistribution` using `Invert(MaskedAutoregressiveFlow(...))` uses
+  the (expensive) forward-mode calculation to compute log-probabilities and the
+  (cheap) reverse-mode calculation to compute samples.  See "Example Use"
+  [below] for more details.
+
+  Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
+  (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
+  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
+  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
+  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
+  [below] are possible.
+
+  For convenience, `masked_autoregressive_default_template` is offered as a
+  possible `shift_and_log_scale_fn` function. It implements the MADE
+  architecture [2]. MADE is a feed-forward network that computes a `shift` and
+  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
+  masked to ensure the autoregressive property. It is possible that this
+  architecture is suboptimal for your task. To build alternative networks,
+  either change the arguments to `masked_autoregressive_default_template`, use
+  the `masked_dense` function to roll-out your own, or use some other
+  architecture, e.g., using `tf.layers`.
+
+  Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
+  enforces the "autoregressive property".
+
+  Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
+  semantics, the forward transformation is,
+
+  ```python
+  def forward(x):
+    y = zeros_like(x)
+    event_size = x.shape[-1]
+    for _ in range(event_size):
+      shift, log_scale = shift_and_log_scale_fn(y)
+      y = x * math_ops.exp(log_scale) + shift
+    return y
+  ```
+
+  and the inverse transformation is,
+
+  ```python
+  def inverse(y):
+    shift, log_scale = shift_and_log_scale_fn(y)
+    return (y - shift) / math_ops.exp(log_scale)
+  ```
+
+  Notice that the `inverse` does not need a for-loop. This is because in the
+  forward pass each calculation of `shift` and `log_scale` is based on the `y`
+  calculated so far (not `x`). In the `inverse`, the `y` is fully known, thus is
+  equivalent to the scaling used in `forward` after `event_size` passes, i.e.,
+  the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
+  also proves the transform is bijective.)
+
+  #### Example Use
+
+  ```python
+  tfd = tf.contrib.distributions
+  tfb = tfd.bijectors
+
+  dims = 5
+
+  # A common choice for a normalizing flow is to use a Gaussian for the base
+  # distribution. (However, any continuous distribution would work.) E.g.,
+  maf = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.MaskedAutoregressiveFlow(
+          shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
+              hidden_layers=[512, 512])),
+      event_shape=[dims])
+
+  x = maf.sample()  # Expensive; uses `tf.while_loop`, no Bijector caching.
+  maf.log_prob(x)   # Almost free; uses Bijector caching.
+  maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
+
+  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
+  iaf = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.Invert(tfb.MaskedAutoregressiveFlow(
+          shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
+              hidden_layers=[512, 512]))),
+      event_shape=[dims])
+
+  x = iaf.sample()  # Cheap; no `tf.while_loop` despite no Bijector caching.
+  iaf.log_prob(x)   # Almost free; uses Bijector caching.
+  iaf.log_prob(0.)  # Expensive; uses `tf.while_loop`, no Bijector caching.
+
+  # In many (if not most) cases the default `shift_and_log_scale_fn` will be a
+  # poor choice. Here's an example of using a "shift only" version and with a
+  # different number/depth of hidden layers.
+  shift_only = True
+  maf_no_scale_hidden2 = tfd.TransformedDistribution(
+      distribution=tfd.Normal(loc=0., scale=1.),
+      bijector=tfb.MaskedAutoregressiveFlow(
+          tfb.masked_autoregressive_default_template(
+              hidden_layers=[32],
+              shift_only=shift_only),
+          is_constant_jacobian=shift_only),
+      event_shape=[dims])
+  ```
+
+  [1]: "Masked Autoregressive Flow for Density Estimation."
+       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
+       https://arxiv.org/abs/1705.07057
+
+  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  """
+
+  def __init__(self,
+               shift_and_log_scale_fn,
+               is_constant_jacobian=False,
+               validate_args=False,
+               name=None):
+    """Creates the MaskedAutoregressiveFlow bijector.
+
+    Args:
+      shift_and_log_scale_fn: Python `callable` which computes `shift` and
+        `log_scale` from both the forward domain (`x`) and the inverse domain
+        (`y`). Calculation must respect the "autoregressive property" (see class
+        docstring). Suggested default
+        `masked_autoregressive_default_template(hidden_layers=...)`.
+        Typically the function contains `tf.Variables` and is wrapped using
+        `tf.make_template`. Returning `None` for either (both) `shift`,
+        `log_scale` is equivalent to (but more efficient than) returning zero.
+      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
+        implementation assumes `log_scale` does not depend on the forward domain
+        (`x`) or inverse domain (`y`) values. (No validation is made;
+        `is_constant_jacobian=False` is always safe but possibly computationally
+        inefficient.)
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+    """
+    name = name or "masked_autoregressive_flow"
+    self._shift_and_log_scale_fn = shift_and_log_scale_fn
+    super(MaskedAutoregressiveFlow, self).__init__(
+        is_constant_jacobian=is_constant_jacobian,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    event_size = array_ops.shape(x)[-1]
+    y0 = array_ops.zeros_like(x, name="y0")
+    # call the template once to ensure creation
+    _ = self._shift_and_log_scale_fn(y0)
+    def _loop_body(index, y0):
+      """While-loop body for autoregression calculation."""
+      # Set caching device to avoid re-getting the tf.Variable for every while
+      # loop iteration.
+      with variable_scope_lib.variable_scope(
+          variable_scope_lib.get_variable_scope()) as vs:
+        if vs.caching_device is None:
+          vs.set_caching_device(lambda op: op.device)
+        shift, log_scale = self._shift_and_log_scale_fn(y0)
+      y = x
+      if log_scale is not None:
+        y *= math_ops.exp(log_scale)
+      if shift is not None:
+        y += shift
+      return index + 1, y
+    _, y = control_flow_ops.while_loop(
+        cond=lambda index, _: index < event_size,
+        body=_loop_body,
+        loop_vars=[0, y0])
+    return y
+
+  def _inverse(self, y):
+    shift, log_scale = self._shift_and_log_scale_fn(y)
+    x = y
+    if shift is not None:
+      x -= shift
+    if log_scale is not None:
+      x *= math_ops.exp(-log_scale)
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    _, log_scale = self._shift_and_log_scale_fn(y)
+    if log_scale is None:
+      return constant_op.constant(0., dtype=y.dtype, name="ildj")
+    return -math_ops.reduce_sum(log_scale, axis=-1)
+
+
+MASK_INCLUSIVE = "inclusive"
+MASK_EXCLUSIVE = "exclusive"
+
+
+def _gen_slices(num_blocks, n_in, n_out, mask_type=MASK_EXCLUSIVE):
+  """Generate the slices for building an autoregressive mask."""
+  # TODO(b/67594795): Better support of dynamic shape.
+  slices = []
+  col = 0
+  d_in = n_in // num_blocks
+  d_out = n_out // num_blocks
+  row = d_out if mask_type == MASK_EXCLUSIVE else 0
+  for _ in range(num_blocks):
+    row_slice = slice(row, None)
+    col_slice = slice(col, col + d_in)
+    slices.append([row_slice, col_slice])
+    col += d_in
+    row += d_out
+  return slices
+
+
+def _gen_mask(num_blocks,
+              n_in,
+              n_out,
+              mask_type=MASK_EXCLUSIVE,
+              dtype=dtypes.float32):
+  """Generate the mask for building an autoregressive dense layer."""
+  # TODO(b/67594795): Better support of dynamic shape.
+  mask = np.zeros([n_out, n_in], dtype=dtype.as_numpy_dtype())
+  slices = _gen_slices(num_blocks, n_in, n_out, mask_type=mask_type)
+  for [row_slice, col_slice] in slices:
+    mask[row_slice, col_slice] = 1
+  return mask
+
+
+def masked_dense(inputs,
+                 units,
+                 num_blocks=None,
+                 exclusive=False,
+                 kernel_initializer=None,
+                 reuse=None,
+                 name=None,
+                 *args,
+                 **kwargs):
+  """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
+
+  See [1] for detailed explanation.
+
+  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  Arguments:
+    inputs: Tensor input.
+    units: Python `int` scalar representing the dimensionality of the output
+      space.
+    num_blocks: Python `int` scalar representing the number of blocks for the
+      MADE masks.
+    exclusive: Python `bool` scalar representing whether to zero the diagonal of
+      the mask, used for the first layer of a MADE.
+    kernel_initializer: Initializer function for the weight matrix.
+      If `None` (default), weights are initialized using the
+      `tf.glorot_random_initializer`.
+    reuse: Python `bool` scalar representing whether to reuse the weights of a
+      previous layer by the same name.
+    name: Python `str` used to describe ops managed by this function.
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    Output tensor.
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+  # TODO(b/67594795): Better support of dynamic shape.
+  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
+  if input_depth is None:
+    raise NotImplementedError(
+        "Rightmost dimension must be known prior to graph execution.")
+
+  mask = _gen_mask(num_blocks, input_depth, units,
+                   MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T
+
+  if kernel_initializer is None:
+    kernel_initializer = init_ops.glorot_normal_initializer()
+
+  def masked_initializer(shape, dtype=None, partition_info=None):
+    return mask * kernel_initializer(shape, dtype, partition_info)
+
+  with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
+    layer = layers.Dense(
+        units,
+        kernel_initializer=masked_initializer,
+        kernel_constraint=lambda x: mask * x,
+        name=name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=name,
+        _reuse=reuse,
+        *args,
+        **kwargs)
+    return layer.apply(inputs)
+
+
+def masked_autoregressive_default_template(
+    hidden_layers,
+    shift_only=False,
+    activation=nn_ops.relu,
+    log_scale_min_clip=-5.,
+    log_scale_max_clip=3.,
+    log_scale_clip_gradient=False,
+    name=None,
+    *args,
+    **kwargs):
+  """Build the MADE Model [1].
+
+  This will be wrapped in a make_template to ensure the variables are only
+  created once. It takes the input and returns the `loc` ("mu" [1]) and
+  `log_scale` ("alpha" [1]) from the MADE network.
+
+  Warning: This function uses `masked_dense` to create randomly initialized
+  `tf.Variables`. It is presumed that these will be fit, just as you would any
+  other neural architecture which uses `tf.layers.dense`.
+
+  #### About Hidden Layers:
+
+  Each element of `hidden_layers` should be greater than the `input_depth`
+  (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
+  neural network). This is necessary to ensure the autoregressivity property.
+
+  #### About Clipping:
+
+  This function also optionally clips the `log_scale` (but possibly not its
+  gradient). This is useful because if `log_scale` is too small/large it might
+  underflow/overflow making it impossible for the `MaskedAutoregressiveFlow`
+  bijector to implement a bijection. Additionally, the `log_scale_clip_gradient`
+  `bool` indicates whether the gradient should also be clipped. The default does
+  not clip the gradient; this is useful because it still provides gradient
+  information (for fitting) yet solves the numerical stability problem. I.e.,
+  `log_scale_clip_gradient = False` means
+  `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
+  `grad[clip(x)] exp(clip(x))`.
+
+  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
+       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
+       https://arxiv.org/abs/1502.03509
+
+  Arguments:
+    hidden_layers: Python `list`-like of non-negative integer, scalars
+      indicating the number of units in each hidden layer. Default: `[512, 512].
+    shift_only: Python `bool` indicating if only the `shift` term shall be
+      computed. Default: `False`.
+    activation: Activation function (callable). Explicitly setting to `None`
+      implies a linear activation.
+    log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
+      same shape as `log_scale`. The minimum value to clip by. Default: -5.
+    log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
+      same shape as `log_scale`. The maximum value to clip by. Default: 3.
+    log_scale_clip_gradient: Python `bool` indicating that the gradient of
+      `tf.clip_by_value` should be preserved. Default: `False`.
+    name: A name for ops managed by this function. Default:
+      "masked_autoregressive_default_template".
+    *args: `tf.layers.dense` arguments.
+    **kwargs: `tf.layers.dense` keyword arguments.
+
+  Returns:
+    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
+    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
+
+  Raises:
+    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
+      graph execution.
+  """
+
+  with ops.name_scope(name, "masked_autoregressive_default_template",
+                      values=[log_scale_min_clip, log_scale_max_clip]):
+    def _fn(x):
+      """MADE parameterized via `masked_autoregressive_default_template`."""
+      # TODO(b/67594795): Better support of dynamic shape.
+      input_depth = x.shape.with_rank_at_least(1)[-1].value
+      if input_depth is None:
+        raise NotImplementedError(
+            "Rightmost dimension must be known prior to graph execution.")
+      input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined()
+                     else array_ops.shape(x))
+      for i, units in enumerate(hidden_layers):
+        x = masked_dense(
+            inputs=x,
+            units=units,
+            num_blocks=input_depth,
+            exclusive=True if i == 0 else False,
+            activation=activation,
+            *args,
+            **kwargs)
+      x = masked_dense(
+          inputs=x,
+          units=(1 if shift_only else 2) * input_depth,
+          num_blocks=input_depth,
+          activation=None,
+          *args,
+          **kwargs)
+      if shift_only:
+        x = array_ops.reshape(x, shape=input_shape)
+        return x, None
+      x = array_ops.reshape(
+          x, shape=array_ops.concat([input_shape, [2]], axis=0))
+      shift, log_scale = array_ops.unstack(x, num=2, axis=-1)
+      which_clip = (math_ops.clip_by_value if log_scale_clip_gradient
+                    else _clip_by_value_preserve_grad)
+      log_scale = which_clip(log_scale, log_scale_min_clip, log_scale_max_clip)
+      return shift, log_scale
+    return template_ops.make_template(
+        "masked_autoregressive_default_template", _fn)
+
+
+def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
+  """Clips input while leaving gradient unaltered."""
+  with ops.name_scope(name, "clip_by_value_preserve_grad",
+                      [x, clip_value_min, clip_value_max]):
+    clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+    return x + array_ops.stop_gradient(clip_x - x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py
deleted file mode 100644
index ae142883931274b594dbbafbe86bd71e75c621bc..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive_impl.py
+++ /dev/null
@@ -1,473 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""MaskedAutoregressiveFlow bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.layers import core as layers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import template as template_ops
-from tensorflow.python.ops import variable_scope as variable_scope_lib
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "MaskedAutoregressiveFlow",
-    "masked_autoregressive_default_template",
-    "masked_dense",
-]
-
-
-class MaskedAutoregressiveFlow(bijector_lib.Bijector):
-  """Affine MaskedAutoregressiveFlow bijector for vector-valued events.
-
-  The affine autoregressive flow [1] provides a relatively simple framework for
-  user-specified (deep) architectures to learn a distribution over vector-valued
-  events. Regarding terminology,
-
-    "Autoregressive models decompose the joint density as a product of
-    conditionals, and model each conditional in turn. Normalizing flows
-    transform a base density (e.g. a standard Gaussian) into the target density
-    by an invertible transformation with tractable Jacobian." [1]
-
-  In other words, the "autoregressive property" is equivalent to the
-  decomposition, `p(x) = prod{ p(x[i] | x[0:i]) : i=0, ..., d }`. The provided
-  `shift_and_log_scale_fn`, `masked_autoregressive_default_template`, achieves
-  this property by zeroing out weights in its `masked_dense` layers.
-
-  In the `tf.distributions` framework, a "normalizing flow" is implemented as a
-  `tf.distributions.bijectors.Bijector`. The `forward` "autoregression"
-  is implemented using a `tf.while_loop` and a deep neural network (DNN) with
-  masked weights such that the autoregressive property is automatically met in
-  the `inverse`.
-
-  A `TransformedDistribution` using `MaskedAutoregressiveFlow(...)` uses the
-  (expensive) forward-mode calculation to draw samples and the (cheap)
-  reverse-mode calculation to compute log-probabilities. Conversely, a
-  `TransformedDistribution` using `Invert(MaskedAutoregressiveFlow(...))` uses
-  the (expensive) forward-mode calculation to compute log-probabilities and the
-  (cheap) reverse-mode calculation to compute samples.  See "Example Use"
-  [below] for more details.
-
-  Given a `shift_and_log_scale_fn`, the forward and inverse transformations are
-  (a sequence of) affine transformations. A "valid" `shift_and_log_scale_fn`
-  must compute each `shift` (aka `loc` or "mu" [2]) and `log(scale)` (aka
-  "alpha" [2]) such that each are broadcastable with the arguments to `forward`
-  and `inverse`, i.e., such that the calculations in `forward`, `inverse`
-  [below] are possible.
-
-  For convenience, `masked_autoregressive_default_template` is offered as a
-  possible `shift_and_log_scale_fn` function. It implements the MADE
-  architecture [2]. MADE is a feed-forward network that computes a `shift` and
-  `log(scale)` using `masked_dense` layers in a deep neural network. Weights are
-  masked to ensure the autoregressive property. It is possible that this
-  architecture is suboptimal for your task. To build alternative networks,
-  either change the arguments to `masked_autoregressive_default_template`, use
-  the `masked_dense` function to roll-out your own, or use some other
-  architecture, e.g., using `tf.layers`.
-
-  Warning: no attempt is made to validate that the `shift_and_log_scale_fn`
-  enforces the "autoregressive property".
-
-  Assuming `shift_and_log_scale_fn` has valid shape and autoregressive
-  semantics, the forward transformation is,
-
-  ```python
-  def forward(x):
-    y = zeros_like(x)
-    event_size = x.shape[-1]
-    for _ in range(event_size):
-      shift, log_scale = shift_and_log_scale_fn(y)
-      y = x * math_ops.exp(log_scale) + shift
-    return y
-  ```
-
-  and the inverse transformation is,
-
-  ```python
-  def inverse(y):
-    shift, log_scale = shift_and_log_scale_fn(y)
-    return (y - shift) / math_ops.exp(log_scale)
-  ```
-
-  Notice that the `inverse` does not need a for-loop. This is because in the
-  forward pass each calculation of `shift` and `log_scale` is based on the `y`
-  calculated so far (not `x`). In the `inverse`, the `y` is fully known, thus is
-  equivalent to the scaling used in `forward` after `event_size` passes, i.e.,
-  the "last" `y` used to compute `shift`, `log_scale`. (Roughly speaking, this
-  also proves the transform is bijective.)
-
-  #### Example Use
-
-  ```python
-  ds = tf.contrib.distributions
-  bs = tf.contrib.distributions.bijectors
-
-  dims = 5
-
-  # A common choice for a normalizing flow is to use a Gaussian for the base
-  # distribution. (However, any continuous distribution would work.) E.g.,
-  maf = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.MaskedAutoregressiveFlow(
-          shift_and_log_scale_fn=bs.masked_autoregressive_default_template(
-              hidden_layers=[512, 512])),
-      event_shape=[dims])
-
-  x = maf.sample()  # Expensive; uses `tf.while_loop`, no Bijector caching.
-  maf.log_prob(x)   # Almost free; uses Bijector caching.
-  maf.log_prob(0.)  # Cheap; no `tf.while_loop` despite no Bijector caching.
-
-  # [1] also describes an "Inverse Autoregressive Flow", e.g.,
-  iaf = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.Invert(bs.MaskedAutoregressiveFlow(
-          shift_and_log_scale_fn=bs.masked_autoregressive_default_template(
-              hidden_layers=[512, 512]))),
-      event_shape=[dims])
-
-  x = iaf.sample()  # Cheap; no `tf.while_loop` despite no Bijector caching.
-  iaf.log_prob(x)   # Almost free; uses Bijector caching.
-  iaf.log_prob(0.)  # Expensive; uses `tf.while_loop`, no Bijector caching.
-
-  # In many (if not most) cases the default `shift_and_log_scale_fn` will be a
-  # poor choice. Here's an example of using a "shift only" version and with a
-  # different number/depth of hidden layers.
-  shift_only = True
-  maf_no_scale_hidden2 = ds.TransformedDistribution(
-      distribution=ds.Normal(loc=0., scale=1.),
-      bijector=bs.MaskedAutoregressiveFlow(
-          bs.masked_autoregressive_default_template(
-              hidden_layers=[32],
-              shift_only=shift_only),
-          is_constant_jacobian=shift_only),
-      event_shape=[dims])
-  ```
-
-  [1]: "Masked Autoregressive Flow for Density Estimation."
-       George Papamakarios, Theo Pavlakou, Iain Murray. Arxiv. 2017.
-       https://arxiv.org/abs/1705.07057
-
-  [2]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  """
-
-  def __init__(self,
-               shift_and_log_scale_fn,
-               is_constant_jacobian=False,
-               validate_args=False,
-               name=None):
-    """Creates the MaskedAutoregressiveFlow bijector.
-
-    Args:
-      shift_and_log_scale_fn: Python `callable` which computes `shift` and
-        `log_scale` from both the forward domain (`x`) and the inverse domain
-        (`y`). Calculation must respect the "autoregressive property" (see class
-        docstring). Suggested default
-        `masked_autoregressive_default_template(hidden_layers=...)`.
-        Typically the function contains `tf.Variables` and is wrapped using
-        `tf.make_template`. Returning `None` for either (both) `shift`,
-        `log_scale` is equivalent to (but more efficient than) returning zero.
-      is_constant_jacobian: Python `bool`. Default: `False`. When `True` the
-        implementation assumes `log_scale` does not depend on the forward domain
-        (`x`) or inverse domain (`y`) values. (No validation is made;
-        `is_constant_jacobian=False` is always safe but possibly computationally
-        inefficient.)
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-    """
-    name = name or "masked_autoregressive_flow"
-    self._shift_and_log_scale_fn = shift_and_log_scale_fn
-    super(MaskedAutoregressiveFlow, self).__init__(
-        is_constant_jacobian=is_constant_jacobian,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    event_size = array_ops.shape(x)[-1]
-    def _loop_body(index, y0):
-      """While-loop body for autoregression calculation."""
-      # Set caching device to avoid re-getting the tf.Variable for every while
-      # loop iteration.
-      with variable_scope_lib.variable_scope(
-          variable_scope_lib.get_variable_scope()) as vs:
-        if vs.caching_device is None:
-          vs.set_caching_device(lambda op: op.device)
-        shift, log_scale = self._shift_and_log_scale_fn(y0)
-      y = x
-      if log_scale is not None:
-        y *= math_ops.exp(log_scale)
-      if shift is not None:
-        y += shift
-      return index + 1, y
-    _, y = control_flow_ops.while_loop(
-        cond=lambda index, _: index < event_size,
-        body=_loop_body,
-        loop_vars=[0, array_ops.zeros_like(x, name="y0")])
-    return y
-
-  def _inverse(self, y):
-    shift, log_scale = self._shift_and_log_scale_fn(y)
-    x = y
-    if shift is not None:
-      x -= shift
-    if log_scale is not None:
-      x *= math_ops.exp(-log_scale)
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    _, log_scale = self._shift_and_log_scale_fn(y)
-    if log_scale is None:
-      return constant_op.constant(0., dtype=y.dtype, name="ildj")
-    return -math_ops.reduce_sum(log_scale, axis=-1)
-
-
-MASK_INCLUSIVE = "inclusive"
-MASK_EXCLUSIVE = "exclusive"
-
-
-def _gen_slices(num_blocks, n_in, n_out, mask_type=MASK_EXCLUSIVE):
-  """Generate the slices for building an autoregressive mask."""
-  # TODO(b/67594795): Better support of dynamic shape.
-  slices = []
-  col = 0
-  d_in = n_in // num_blocks
-  d_out = n_out // num_blocks
-  row = d_out if mask_type == MASK_EXCLUSIVE else 0
-  for _ in range(num_blocks):
-    row_slice = slice(row, None)
-    col_slice = slice(col, col + d_in)
-    slices.append([row_slice, col_slice])
-    col += d_in
-    row += d_out
-  return slices
-
-
-def _gen_mask(num_blocks,
-              n_in,
-              n_out,
-              mask_type=MASK_EXCLUSIVE,
-              dtype=dtypes.float32):
-  """Generate the mask for building an autoregressive dense layer."""
-  # TODO(b/67594795): Better support of dynamic shape.
-  mask = np.zeros([n_out, n_in], dtype=dtype.as_numpy_dtype())
-  slices = _gen_slices(num_blocks, n_in, n_out, mask_type=mask_type)
-  for [row_slice, col_slice] in slices:
-    mask[row_slice, col_slice] = 1
-  return mask
-
-
-def masked_dense(inputs,
-                 units,
-                 num_blocks=None,
-                 exclusive=False,
-                 kernel_initializer=None,
-                 reuse=None,
-                 name=None,
-                 *args,
-                 **kwargs):
-  """A autoregressively masked dense layer. Analogous to `tf.layers.dense`.
-
-  See [1] for detailed explanation.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
-    inputs: Tensor input.
-    units: Python `int` scalar representing the dimensionality of the output
-      space.
-    num_blocks: Python `int` scalar representing the number of blocks for the
-      MADE masks.
-    exclusive: Python `bool` scalar representing whether to zero the diagonal of
-      the mask, used for the first layer of a MADE.
-    kernel_initializer: Initializer function for the weight matrix.
-      If `None` (default), weights are initialized using the
-      `tf.glorot_random_initializer`.
-    reuse: Python `bool` scalar representing whether to reuse the weights of a
-      previous layer by the same name.
-    name: Python `str` used to describe ops managed by this function.
-    *args: `tf.layers.dense` arguments.
-    **kwargs: `tf.layers.dense` keyword arguments.
-
-  Returns:
-    Output tensor.
-
-  Raises:
-    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
-      graph execution.
-  """
-  # TODO(b/67594795): Better support of dynamic shape.
-  input_depth = inputs.shape.with_rank_at_least(1)[-1].value
-  if input_depth is None:
-    raise NotImplementedError(
-        "Rightmost dimension must be known prior to graph execution.")
-
-  mask = _gen_mask(num_blocks, input_depth, units,
-                   MASK_EXCLUSIVE if exclusive else MASK_INCLUSIVE).T
-
-  if kernel_initializer is None:
-    kernel_initializer = init_ops.glorot_normal_initializer()
-
-  def masked_initializer(shape, dtype=None, partition_info=None):
-    return mask * kernel_initializer(shape, dtype, partition_info)
-
-  with ops.name_scope(name, "masked_dense", [inputs, units, num_blocks]):
-    layer = layers.Dense(
-        units,
-        kernel_initializer=masked_initializer,
-        kernel_constraint=lambda x: mask * x,
-        name=name,
-        dtype=inputs.dtype.base_dtype,
-        _scope=name,
-        _reuse=reuse,
-        *args,
-        **kwargs)
-    return layer.apply(inputs)
-
-
-def masked_autoregressive_default_template(
-    hidden_layers,
-    shift_only=False,
-    activation=nn_ops.relu,
-    log_scale_min_clip=-5.,
-    log_scale_max_clip=3.,
-    log_scale_clip_gradient=False,
-    name=None,
-    *args,
-    **kwargs):
-  """Build the MADE Model [1].
-
-  This will be wrapped in a make_template to ensure the variables are only
-  created once. It takes the input and returns the `loc` ("mu" [1]) and
-  `log_scale` ("alpha" [1]) from the MADE network.
-
-  Warning: This function uses `masked_dense` to create randomly initialized
-  `tf.Variables`. It is presumed that these will be fit, just as you would any
-  other neural architecture which uses `tf.layers.dense`.
-
-  #### About Hidden Layers:
-
-  Each element of `hidden_layers` should be greater than the `input_depth`
-  (i.e., `input_depth = tf.shape(input)[-1]` where `input` is the input to the
-  neural network). This is necessary to ensure the autoregressivity property.
-
-  #### About Clipping:
-
-  This function also optionally clips the `log_scale` (but possibly not its
-  gradient). This is useful because if `log_scale` is too small/large it might
-  underflow/overflow making it impossible for the `MaskedAutoregressiveFlow`
-  bijector to implement a bijection. Additionally, the `log_scale_clip_gradient`
-  `bool` indicates whether the gradient should also be clipped. The default does
-  not clip the gradient; this is useful because it still provides gradient
-  information (for fitting) yet solves the numerical stability problem. I.e.,
-  `log_scale_clip_gradient = False` means
-  `grad[exp(clip(x))] = grad[x] exp(clip(x))` rather than the usual
-  `grad[clip(x)] exp(clip(x))`.
-
-  [1]: "MADE: Masked Autoencoder for Distribution Estimation."
-       Mathieu Germain, Karol Gregor, Iain Murray, Hugo Larochelle. ICML. 2015.
-       https://arxiv.org/abs/1502.03509
-
-  Arguments:
-    hidden_layers: Python `list`-like of non-negative integer, scalars
-      indicating the number of units in each hidden layer. Default: `[512, 512].
-    shift_only: Python `bool` indicating if only the `shift` term shall be
-      computed. Default: `False`.
-    activation: Activation function (callable). Explicitly setting to `None`
-      implies a linear activation.
-    log_scale_min_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
-      same shape as `log_scale`. The minimum value to clip by. Default: -5.
-    log_scale_max_clip: `float`-like scalar `Tensor`, or a `Tensor` with the
-      same shape as `log_scale`. The maximum value to clip by. Default: 3.
-    log_scale_clip_gradient: Python `bool` indicating that the gradient of
-      `tf.clip_by_value` should be preserved. Default: `False`.
-    name: A name for ops managed by this function. Default:
-      "masked_autoregressive_default_template".
-    *args: `tf.layers.dense` arguments.
-    **kwargs: `tf.layers.dense` keyword arguments.
-
-  Returns:
-    shift: `Float`-like `Tensor` of shift terms (the "mu" in [2]).
-    log_scale: `Float`-like `Tensor` of log(scale) terms (the "alpha" in [2]).
-
-  Raises:
-    NotImplementedError: if rightmost dimension of `inputs` is unknown prior to
-      graph execution.
-  """
-
-  with ops.name_scope(name, "masked_autoregressive_default_template",
-                      values=[log_scale_min_clip, log_scale_max_clip]):
-    def _fn(x):
-      """MADE parameterized via `masked_autoregressive_default_template`."""
-      # TODO(b/67594795): Better support of dynamic shape.
-      input_depth = x.shape.with_rank_at_least(1)[-1].value
-      if input_depth is None:
-        raise NotImplementedError(
-            "Rightmost dimension must be known prior to graph execution.")
-      input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined()
-                     else array_ops.shape(x))
-      for i, units in enumerate(hidden_layers):
-        x = masked_dense(
-            inputs=x,
-            units=units,
-            num_blocks=input_depth,
-            exclusive=True if i == 0 else False,
-            activation=activation,
-            *args,
-            **kwargs)
-      x = masked_dense(
-          inputs=x,
-          units=(1 if shift_only else 2) * input_depth,
-          num_blocks=input_depth,
-          activation=None,
-          *args,
-          **kwargs)
-      if shift_only:
-        x = array_ops.reshape(x, shape=input_shape)
-        return x, None
-      x = array_ops.reshape(
-          x, shape=array_ops.concat([input_shape, [2]], axis=0))
-      shift, log_scale = array_ops.unstack(x, num=2, axis=-1)
-      which_clip = (math_ops.clip_by_value if log_scale_clip_gradient
-                    else _clip_by_value_preserve_grad)
-      log_scale = which_clip(log_scale, log_scale_min_clip, log_scale_max_clip)
-      return shift, log_scale
-    return template_ops.make_template(
-        "masked_autoregressive_default_template", _fn)
-
-
-def _clip_by_value_preserve_grad(x, clip_value_min, clip_value_max, name=None):
-  """Clips input while leaving gradient unaltered."""
-  with ops.name_scope(name, "clip_by_value_preserve_grad",
-                      [x, clip_value_min, clip_value_max]):
-    clip_x = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-    return x + array_ops.stop_gradient(clip_x - x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
index a187ce22d686ee1203802ae2bfe64b0e1a3ea850..8654cc39d0c41ec4f1b85cd5fc4366ceaf4b224d 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/permute.py
@@ -12,18 +12,127 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Permute bijector."""
+"""Permutation bijectors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.permute_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["Permute"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Permute",
+]
+
+
+class Permute(bijector_lib.Bijector):
+  """Permutes the rightmost dimension of a `Tensor`.
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  reverse = tfd.bijectors.Permute(permutation=[2, 1, 0])
+
+  reverse.forward([-1., 0., 1.])
+  # ==> [1., 0., -1]
+
+  reverse.inverse([1., 0., -1])
+  # ==> [-1., 0., 1.]
+
+  reverse.forward_log_det_jacobian(any_value)
+  # ==> 0.
+
+  reverse.inverse_log_det_jacobian(any_value)
+  # ==> 0.
+  ```
+
+  Warning: `tf.estimator` may repeatedly build the graph thus
+  `Permute(np.random.permutation(event_size)).astype("int32"))` is not a
+  reliable parameterization (nor would it be even if using `tf.constant`). A
+  safe alternative is to use `tf.get_variable` to achieve "init once" behavior,
+  i.e.,
+
+  ```python
+  def init_once(x, name):
+    return tf.get_variable(name, initializer=x, trainable=False)
+
+  Permute(permutation=init_once(
+      np.random.permutation(event_size).astype("int32"),
+      name="permutation"))
+  ```
+
+  """
+
+  def __init__(self, permutation, validate_args=False, name=None):
+    """Creates the `Permute` bijector.
+
+    Args:
+      permutation: An `int`-like vector-shaped `Tensor` representing the
+        permutation to apply to the rightmost dimension of the transformed
+        `Tensor`.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      TypeError: if `not permutation.dtype.is_integer`.
+      ValueError: if `permutation` does not contain exactly one of each of
+        `{0, 1, ..., d}`.
+    """
+    with ops.name_scope(name, "permute", values=[permutation]):
+      permutation = ops.convert_to_tensor(
+          permutation,
+          name="permutation")
+      if not permutation.dtype.is_integer:
+        raise TypeError("permutation.dtype ({}) should be `int`-like.".format(
+            permutation.dtype.name))
+      p = tensor_util.constant_value(permutation)
+      if p is not None:
+        if set(p) != set(np.arange(p.size)):
+          raise ValueError("Permutation over `d` must contain exactly one of "
+                           "each of `{0, 1, ..., d}`.")
+      elif validate_args:
+        p, _ = nn_ops.top_k(-permutation,
+                            k=array_ops.shape(permutation)[-1],
+                            sorted=True)
+        permutation = control_flow_ops.with_dependencies([
+            check_ops.assert_equal(
+                -p, math_ops.range(array_ops.size(p)),
+                message=("Permutation over `d` must contain exactly one of "
+                         "each of `{0, 1, ..., d}`.")),
+        ], permutation)
+      self._permutation = permutation
+      super(Permute, self).__init__(
+          is_constant_jacobian=True,
+          validate_args=validate_args,
+          name=name or "permute")
+
+  @property
+  def permutation(self):
+    return self._permutation
+
+  def _forward(self, x):
+    return array_ops.gather(x, self.permutation, axis=-1)
+
+  def _inverse(self, y):
+    return array_ops.gather(
+        y,
+        array_ops.invert_permutation(self.permutation),
+        axis=-1)
+
+  def _inverse_log_det_jacobian(self, y):
+    return constant_op.constant(0., dtype=y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    return constant_op.constant(0., dtype=x.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py
deleted file mode 100644
index b1d8f2f41b28a88208a19824377f93882b767f03..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/permute_impl.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Permutation bijectors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "Permute",
-]
-
-
-class Permute(bijector_lib.Bijector):
-  """Permutes the rightmost dimension of a `Tensor`.
-
-  ```python
-  bs = tf.contrib.distributions.bijectors
-
-  reverse = bs.Permute(permutation=[2, 1, 0])
-
-  reverse.forward([-1., 0., 1.])
-  # ==> [1., 0., -1]
-
-  reverse.inverse([1., 0., -1])
-  # ==> [-1., 0., 1.]
-
-  reverse.forward_log_det_jacobian(any_value)
-  # ==> 0.
-
-  reverse.inverse_log_det_jacobian(any_value)
-  # ==> 0.
-  ```
-
-  Warning: `tf.estimator` may repeatedly build the graph thus
-  `Permute(np.random.permutation(event_size)).astype("int32"))` is not a
-  reliable parameterization (nor would it be even if using `tf.constant`). A
-  safe alternative is to use `tf.get_variable` to achieve "init once" behavior,
-  i.e.,
-
-  ```python
-  def init_once(x, name):
-    return tf.get_variable(name, initializer=x, trainable=False)
-
-  Permute(permutation=init_once(
-      np.random.permutation(event_size).astype("int32"),
-      name="permutation"))
-  ```
-
-  """
-
-  def __init__(self, permutation, validate_args=False, name=None):
-    """Creates the `Permute` bijector.
-
-    Args:
-      permutation: An `int`-like vector-shaped `Tensor` representing the
-        permutation to apply to the rightmost dimension of the transformed
-        `Tensor`.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-
-    Raises:
-      TypeError: if `not permutation.dtype.is_integer`.
-      ValueError: if `permutation` does not contain exactly one of each of
-        `{0, 1, ..., d}`.
-    """
-    with ops.name_scope(name, "permute", values=[permutation]):
-      permutation = ops.convert_to_tensor(
-          permutation,
-          name="permutation")
-      if not permutation.dtype.is_integer:
-        raise TypeError("permutation.dtype ({}) should be `int`-like.".format(
-            permutation.dtype.name))
-      p = tensor_util.constant_value(permutation)
-      if p is not None:
-        if set(p) != set(np.arange(p.size)):
-          raise ValueError("Permutation over `d` must contain exactly one of "
-                           "each of `{0, 1, ..., d}`.")
-      elif validate_args:
-        p, _ = nn_ops.top_k(-permutation,
-                            k=array_ops.shape(permutation)[-1],
-                            sorted=True)
-        permutation = control_flow_ops.with_dependencies([
-            check_ops.assert_equal(
-                -p, math_ops.range(array_ops.size(p)),
-                message=("Permutation over `d` must contain exactly one of "
-                         "each of `{0, 1, ..., d}`.")),
-        ], permutation)
-      self._permutation = permutation
-      super(Permute, self).__init__(
-          is_constant_jacobian=True,
-          validate_args=validate_args,
-          name=name or "permute")
-
-  @property
-  def permutation(self):
-    return self._permutation
-
-  def _forward(self, x):
-    return array_ops.gather(x, self.permutation, axis=-1)
-
-  def _inverse(self, y):
-    return array_ops.gather(
-        y,
-        array_ops.invert_permutation(self.permutation),
-        axis=-1)
-
-  def _inverse_log_det_jacobian(self, y):
-    return constant_op.constant(0., dtype=y.dtype)
-
-  def _forward_log_det_jacobian(self, x):
-    return constant_op.constant(0., dtype=x.dtype)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
index a83199549cd16101ab7b39b43d19a17bc66f03df..c37db61720d10949f294ff7b2e9778ba6efa57f0 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform.py
@@ -18,12 +18,110 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.power_transform_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["PowerTransform"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "PowerTransform",
+]
+
+
+class PowerTransform(bijector.Bijector):
+  """Compute `Y = g(X) = (1 + X * c)**(1 / c), X >= -1 / c`.
+
+  The [power transform](https://en.wikipedia.org/wiki/Power_transform) maps
+  inputs from `[0, inf]` to `[-1/c, inf]`; this is equivalent to the `inverse`
+  of this bijector.
+
+  This bijector is equivalent to the `Exp` bijector when `c=0`.
+  """
+
+  def __init__(self,
+               power=0.,
+               event_ndims=0,
+               validate_args=False,
+               name="power_transform"):
+    """Instantiates the `PowerTransform` bijector.
+
+    Args:
+      power: Python `float` scalar indicating the transform power, i.e.,
+        `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+
+    Raises:
+      ValueError: if `power < 0` or is not known statically.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[power]):
+      power = tensor_util.constant_value(
+          ops.convert_to_tensor(power, name="power"))
+    if power is None or power < 0:
+      raise ValueError("`power` must be a non-negative TF constant.")
+    self._power = power
+    super(PowerTransform, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def power(self):
+    """The `c` in: `Y = g(X) = (1 + X * c)**(1 / c)`."""
+    return self._power
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    if self.power == 0.:
+      return math_ops.exp(x)
+    # If large x accuracy is an issue, consider using:
+    # (1. + x * self.power)**(1. / self.power) when x >> 1.
+    return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    if self.power == 0.:
+      return math_ops.log(y)
+    # If large y accuracy is an issue, consider using:
+    # (y**self.power - 1.) / self.power when y >> 1.
+    return math_ops.expm1(math_ops.log(y) * self.power) / self.power
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return (self.power - 1.) * math_ops.reduce_sum(
+        math_ops.log(y), axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    event_dims = self._event_dims_tensor(x)
+    if self.power == 0.:
+      return math_ops.reduce_sum(x, axis=event_dims)
+    return (1. / self.power - 1.) * math_ops.reduce_sum(
+        math_ops.log1p(x * self.power),
+        axis=event_dims)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args or self.power == 0.:
+      return x
+    is_valid = check_ops.assert_non_negative(
+        1. + self.power * x,
+        message="Forward transformation input must be at least {}.".format(
+            -1. / self.power))
+    return control_flow_ops.with_dependencies([is_valid], x)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_valid = check_ops.assert_positive(
+        y, message="Inverse transformation input must be greater than 0.")
+    return control_flow_ops.with_dependencies([is_valid], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
deleted file mode 100644
index c37db61720d10949f294ff7b2e9778ba6efa57f0..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/power_transform_impl.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""PowerTransform bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "PowerTransform",
-]
-
-
-class PowerTransform(bijector.Bijector):
-  """Compute `Y = g(X) = (1 + X * c)**(1 / c), X >= -1 / c`.
-
-  The [power transform](https://en.wikipedia.org/wiki/Power_transform) maps
-  inputs from `[0, inf]` to `[-1/c, inf]`; this is equivalent to the `inverse`
-  of this bijector.
-
-  This bijector is equivalent to the `Exp` bijector when `c=0`.
-  """
-
-  def __init__(self,
-               power=0.,
-               event_ndims=0,
-               validate_args=False,
-               name="power_transform"):
-    """Instantiates the `PowerTransform` bijector.
-
-    Args:
-      power: Python `float` scalar indicating the transform power, i.e.,
-        `Y = g(X) = (1 + X * c)**(1 / c)` where `c` is the `power`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-
-    Raises:
-      ValueError: if `power < 0` or is not known statically.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[power]):
-      power = tensor_util.constant_value(
-          ops.convert_to_tensor(power, name="power"))
-    if power is None or power < 0:
-      raise ValueError("`power` must be a non-negative TF constant.")
-    self._power = power
-    super(PowerTransform, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  @property
-  def power(self):
-    """The `c` in: `Y = g(X) = (1 + X * c)**(1 / c)`."""
-    return self._power
-
-  def _forward(self, x):
-    x = self._maybe_assert_valid_x(x)
-    if self.power == 0.:
-      return math_ops.exp(x)
-    # If large x accuracy is an issue, consider using:
-    # (1. + x * self.power)**(1. / self.power) when x >> 1.
-    return math_ops.exp(math_ops.log1p(x * self.power) / self.power)
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    if self.power == 0.:
-      return math_ops.log(y)
-    # If large y accuracy is an issue, consider using:
-    # (y**self.power - 1.) / self.power when y >> 1.
-    return math_ops.expm1(math_ops.log(y) * self.power) / self.power
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return (self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log(y), axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    if self.power == 0.:
-      return math_ops.reduce_sum(x, axis=event_dims)
-    return (1. / self.power - 1.) * math_ops.reduce_sum(
-        math_ops.log1p(x * self.power),
-        axis=event_dims)
-
-  def _maybe_assert_valid_x(self, x):
-    if not self.validate_args or self.power == 0.:
-      return x
-    is_valid = check_ops.assert_non_negative(
-        1. + self.power * x,
-        message="Forward transformation input must be at least {}.".format(
-            -1. / self.power))
-    return control_flow_ops.with_dependencies([is_valid], x)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_valid = check_ops.assert_positive(
-        y, message="Inverse transformation input must be greater than 0.")
-    return control_flow_ops.with_dependencies([is_valid], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
index 8997f7ab6929745275edb38712a5bbb0a9b25ddb..55eca063126797d577653f0d6bcdfddf8192bdb5 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/reshape.py
@@ -12,18 +12,303 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Reshape bijector."""
+"""Reshape bijectors."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.reshape_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["Reshape"]
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector as bijector_lib
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "Reshape",
+]
+
+
+def _static_ndims_from_shape(shape):
+  return shape.shape.with_rank_at_least(1)[0].value
+
+
+def _ndims_from_shape(shape):
+  return array_ops.shape(shape)[0]
+
+
+class Reshape(bijector_lib.Bijector):
+  """Reshapes the `event_shape` of a `Tensor`.
+
+  The semantics generally follow that of `tf.reshape()`, with
+  a few differences:
+
+  * The user must provide both the input and output shape, so that
+    the transformation can be inverted. If an input shape is not
+    specified, the default assumes a vector-shaped input, i.e.,
+    event_shape_in = (-1,).
+  * The `Reshape` bijector automatically broadcasts over the leftmost
+    dimensions of its input (`sample_shape` and `batch_shape`); only
+    the rightmost `event_ndims_in` dimensions are reshaped. The
+    number of dimensions to reshape is inferred from the provided
+    `event_shape_in` (`event_ndims_in = len(event_shape_in)`).
+
+  Example usage:
+  ```python
+
+  tfd = tf.contrib.distributions
+
+  r = tfd.bijectors.Reshape(event_shape_out=[1, -1])
+
+  r.forward([3., 4.])    # shape [2]
+  # ==> [[3., 4.]]       # shape [1, 2]
+
+  r.forward([[1., 2.], [3., 4.]])  # shape [2, 2]
+  # ==> [[[1., 2.]],
+  #      [[3., 4.]]]   # shape [2, 1, 2]
+
+  r.inverse([[3., 4.]])  # shape [1,2]
+  # ==> [3., 4.]         # shape [2]
+
+  r.forward_log_det_jacobian(any_value)
+  # ==> 0.
+
+  r.inverse_log_det_jacobian(any_value)
+  # ==> 0.
+  ```
+
+  """
+
+  def __init__(self, event_shape_out, event_shape_in=(-1,),
+               validate_args=False, name=None):
+    """Creates a `Reshape` bijector.
+
+    Args:
+      event_shape_out: An `int`-like vector-shaped `Tensor`
+        representing the event shape of the transformed output.
+      event_shape_in: An optional `int`-like vector-shape `Tensor`
+        representing the event shape of the input. This is required in
+        order to define inverse operations; the default of (-1,)
+        assumes a vector-shaped input.
+      validate_args: Python `bool` indicating whether arguments should
+        be checked for correctness.
+      name: Python `str`, name given to ops managed by this object.
+
+    Raises:
+      TypeError: if either `event_shape_in` or `event_shape_out` has
+        non-integer `dtype`.
+      ValueError: if either of `event_shape_in` or `event_shape_out`
+       has non-vector shape (`rank > 1`), or if their sizes do not
+       match.
+    """
+    with ops.name_scope(name, "reshape",
+                        values=[event_shape_out, event_shape_in]):
+
+      event_shape_out = ops.convert_to_tensor(event_shape_out,
+                                              name="event_shape_out",
+                                              preferred_dtype=dtypes.int32)
+      event_shape_in = ops.convert_to_tensor(event_shape_in,
+                                             name="event_shape_in",
+                                             preferred_dtype=dtypes.int32)
+
+      assertions = []
+      assertions.extend(self._maybe_check_valid_shape(
+          event_shape_out, validate_args))
+      assertions.extend(self._maybe_check_valid_shape(
+          event_shape_in, validate_args))
+
+      self._assertions = assertions
+      self._event_shape_in = event_shape_in
+      self._event_shape_out = event_shape_out
+
+      super(Reshape, self).__init__(is_constant_jacobian=True,
+                                    validate_args=validate_args,
+                                    name=name or "reshape")
+
+  def _maybe_check_valid_shape(self, shape, validate_args):
+    """Check that a shape Tensor is int-type and otherwise sane."""
+    if not shape.dtype.is_integer:
+      raise TypeError("{} dtype ({}) should be `int`-like.".format(
+          shape.op.name, shape.dtype.name))
+
+    assertions = []
+
+    ndims = array_ops.rank(shape)
+    ndims_ = tensor_util.constant_value(ndims)
+    if ndims_ is not None and ndims_ > 1:
+      raise ValueError("`{}` rank ({}) should be <= 1.".format(
+          shape.op.name, ndims_))
+    elif validate_args:
+      assertions.append(check_ops.assert_less_equal(
+          ndims, 1, message="`{}` rank should be <= 1.".format(shape.op.name)))
+
+    shape_ = tensor_util.constant_value_as_shape(shape)
+    if shape_.is_fully_defined():
+      es = np.int32(shape_.as_list())
+      if sum(es == -1) > 1:
+        raise ValueError(
+            "`{}` must have at most one `-1` (given {})"
+            .format(shape.op.name, es))
+      if np.any(es < -1):
+        raise ValueError(
+            "`{}` elements must be either positive integers or `-1`"
+            "(given {})."
+            .format(shape.op.name, es))
+    elif validate_args:
+      assertions.extend([
+          check_ops.assert_less_equal(
+              math_ops.reduce_sum(
+                  math_ops.cast(math_ops.equal(shape, -1), dtypes.int32)),
+              1,
+              message="`{}` elements must have at most one `-1`."
+              .format(shape.op.name)),
+          check_ops.assert_greater_equal(
+              shape, -1,
+              message="`{}` elements must be either positive integers or `-1`."
+              .format(shape.op.name)),
+      ])
+    return assertions
+
+  def _reshape_helper(self, x, event_shape_in, event_shape_out):
+    """Reshape only the event_shape of an input `Tensor`."""
+
+    event_ndims_in_ = _static_ndims_from_shape(event_shape_in)
+    event_ndims_in = _ndims_from_shape(event_shape_in)
+    x_ndims_, x_ndims = x.shape.ndims, array_ops.rank(x)
+
+    assertions = []
+
+    # Ensure x.event_shape is compatible with event_shape_in.
+    if (event_ndims_in_ is not None
+        and x_ndims_ is not None
+        and x.shape.with_rank_at_least(event_ndims_in_)[
+            x_ndims_-event_ndims_in_:].is_fully_defined()):
+      x_event_shape_, x_event_shape = [  # pylint: disable=unbalanced-tuple-unpacking
+          np.int32(x.shape[x_ndims_-event_ndims_in_:])]*2
+    else:
+      x_event_shape_, x_event_shape = (
+          None, array_ops.shape(x)[x_ndims-event_ndims_in:])
+
+    event_shape_in_ = tensor_util.constant_value(event_shape_in)
+
+    if x_event_shape_ is not None and event_shape_in_ is not None:
+      # Compare the shape dimensions that are fully specified in the
+      # input (i.e., for which event_shape_in is not -1). If x_event_shape
+      # matches along all of these dimensions, it is compatible with
+      # the desired input shape and any further mismatches (i.e.,
+      # imcompatibility with the desired *output* shape) will be
+      # caught inside of array_ops.reshape() below.
+      x_event_shape_specified_ = x_event_shape_[event_shape_in_ >= 0]
+      event_shape_in_specified_ = event_shape_in_[event_shape_in_ >= 0]
+      if not np.equal(x_event_shape_specified_,
+                      event_shape_in_specified_).all():
+        raise ValueError(
+            "Input `event_shape` does not match `event_shape_in` ({} vs {}).".
+            format(x_event_shape_, event_shape_in_))
+    elif self.validate_args:
+      # Similarly to the static case, we compare the shape dimensions
+      # that are fully specified in the input. We extract these
+      # dimensions using boolean_mask(), which requires that the mask
+      # have known ndims. We can assume that shape Tensors always have
+      # ndims==1 (this assumption is verified inside of
+      # _maybe_check_valid_shape), so the reshape operation is just a
+      # no-op that formally encodes this fact to make boolean_mask()
+      # happy.
+      event_shape_mask = array_ops.reshape(event_shape_in >= 0, [-1])
+      x_event_shape_specified = array_ops.boolean_mask(x_event_shape,
+                                                       event_shape_mask)
+      event_shape_in_specified = array_ops.boolean_mask(event_shape_in,
+                                                        event_shape_mask)
+      assertions.append(check_ops.assert_equal(
+          x_event_shape_specified, event_shape_in_specified,
+          message="Input `event_shape` does not match `event_shape_in`."))
+
+    if assertions:
+      x = control_flow_ops.with_dependencies(assertions, x)
+
+    # get the parts of shape(x) that will not change
+    sample_and_batch_shape = array_ops.shape(x)
+
+    ndims = (x.shape.ndims if x.shape.ndims is not None
+             else array_ops.rank(x))
+    sample_and_batch_shape = sample_and_batch_shape[
+        :(ndims - math_ops.abs(event_ndims_in))]
+
+    if (event_ndims_in_ is not None
+        and x_ndims_ is not None
+        and event_ndims_in_ == x_ndims_):
+      # Hack to allow forward/inverse_event_shape to do shape
+      # inference by calling this helper method with a dummy Tensor of
+      # shape event_shape_in. In this special case,
+      # sample_and_batch_shape will be empty so we can preserve static
+      # shape information by avoiding the concat operation below
+      # (which would be a no-op).
+      new_shape = event_shape_out
+    else:
+      new_shape = array_ops.concat(
+          [sample_and_batch_shape, event_shape_out], axis=0)
+
+    return array_ops.reshape(x, new_shape)
+
+  def _forward(self, x):
+    with ops.control_dependencies(self._assertions):
+      return self._reshape_helper(x,
+                                  self._event_shape_in,
+                                  self._event_shape_out)
+
+  def _inverse(self, y):
+    with ops.control_dependencies(self._assertions):
+      return self._reshape_helper(y,
+                                  self._event_shape_out,
+                                  self._event_shape_in)
+
+  def _inverse_log_det_jacobian(self, y):
+    with ops.control_dependencies(self._assertions):
+      return constant_op.constant(0., dtype=y.dtype)
+
+  def _forward_log_det_jacobian(self, x):
+    with ops.control_dependencies(self._assertions):
+      return constant_op.constant(0., dtype=x.dtype)
+
+  def _forward_event_shape(self, input_shape):
+    # NOTE: this method and the other *_event_shape* methods
+    # compute shape by explicit transformation of a dummy
+    # variable. This approach is not generally recommended because it
+    # bloats the graph and could in general trigger side effects.
+    #
+    # In this particular case of the Reshape bijector, the
+    # forward and inverse transforms have no side effects, and we
+    # believe the reduction in code complexity from delegating the
+    # heavy lifting to tf.reshape() is worth the added graph ops.
+    # However, you should think hard before implementing this approach
+    # in other Bijectors; it is strongly preferred to compute
+    # shapes explicitly whenever it's feasible to do so.
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=input_shape)
+      dummy_reshaped = self.forward(dummy)
+      return dummy_reshaped.shape
+
+  def _inverse_event_shape(self, output_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=output_shape)
+      dummy_reshaped = self.inverse(dummy)
+      return dummy_reshaped.shape
+
+  def _forward_event_shape_tensor(self, input_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=input_shape)
+      dummy_reshaped = self.forward(dummy)
+      return array_ops.shape(dummy_reshaped)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    with ops.control_dependencies(self._assertions):
+      dummy = array_ops.zeros(dtype=dtypes.float32, shape=output_shape)
+      dummy_reshaped = self.inverse(dummy)
+      return array_ops.shape(dummy_reshaped)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py
deleted file mode 100644
index 93682639aa3be3b8f59a369dedb6ee773c468130..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/reshape_impl.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Reshape bijectors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector as bijector_lib
-
-
-__all__ = [
-    "Reshape",
-]
-
-
-class Reshape(bijector_lib.Bijector):
-  """Reshapes the `event_shape` of a `Tensor`.
-
-  The semantics generally follow that of `tf.reshape()`, with
-  a few differences:
-   * The user must provide both the input and output shape, so that
-     the transformation can be inverted.
-   * The `Reshape` bijector automatically broadcasts over the leftmost
-     dimensions of its input (`sample_shape` and `batch_shape`); only
-     the rightmost `event_ndims_in` dimensions are reshaped. The
-     number of dimensions to reshape is inferred from the provided
-     `event_shape_in` (`event_ndims_in = len(event_shape_in)`).
-   * The `Reshape` bijector does not currently support
-     partially-specified shapes, i.e., those with a dimension
-     implicitly specified by `-1`.
-
-  Example usage:
-  ```python
-
-  bs = tf.contrib.distributions.bijectors
-
-  reverse = bs.Reshape(event_shape_out=[1,2],
-                       event_shape_in=[2,])
-
-  reverse.forward([1., 2.])    # shape [2,]
-  # ==> [[1., 2.]]             # shape [1,2]
-
-  reverse.forward([[1., 2.], [3., 4.]])  # shape [2, 2]
-  # ==> [[[1., 2.]], [[3., 4.]]]         # shape [2, 1, 2]
-
-  reverse.inverse([[1., 2.]])  # shape [1,2]
-  # ==> [1., 2.]               # shape [2,]
-
-  reverse.forward_log_det_jacobian(any_value)
-  # ==> 0.
-
-  reverse.inverse_log_det_jacobian(any_value)
-  # ==> 0.
-  ```
-
-  """
-
-  def __init__(self, event_shape_out, event_shape_in,
-               validate_args=False, name=None):
-    """Creates a `Reshape` bijector.
-
-    Args:
-      event_shape_out: An `int`-like vector-shaped `Tensor`
-        representing the fully specified (no -1's) event shape of the
-        transformed output.
-      event_shape_in: An `int`-like vector-shaped `Tensor`
-        representing the fully specified (no -1's) event shape of the
-        input.
-      validate_args: Python `bool` indicating whether arguments should
-        be checked for correctness.
-      name: Python `str`, name given to ops managed by this object.
-
-    Raises:
-      TypeError: if either `event_shape_in` or `event_shape_out` has
-       non-vector shape (`rank > 1`), or non-integer `dtype`.
-      ValueError: if either `event_shape_in` or `event_shape_out`
-       contains non-positive entries, or if their sizes do not match
-       (`prod(event_shape_in)` != `prod(event_shape_out)`), or if
-       their dimensionality(s) cannot be statically inferred.
-    """
-    with ops.name_scope(name, "reshape",
-                        values=[event_shape_out, event_shape_in]):
-
-      event_shape_out = ops.convert_to_tensor(event_shape_out,
-                                              name="event_shape_out",
-                                              preferred_dtype=dtypes.int32)
-      event_shape_in = ops.convert_to_tensor(event_shape_in,
-                                             name="event_shape_in",
-                                             preferred_dtype=dtypes.int32)
-
-      # check that input shapes are positive integers
-      assertions = []
-      assertions += self._maybe_check_valid_shape(
-          event_shape_out, "event_shape_out",
-          validate_args=validate_args)
-      assertions += self._maybe_check_valid_shape(
-          event_shape_in, "event_shape_in", validate_args=validate_args)
-
-      # check that prod(event_shape_in) = prod(event_shape_out)
-      assertions += self._maybe_check_matching_sizes(
-          event_shape_in, event_shape_out, validate_args=validate_args)
-
-      self._assertions = assertions
-      self._event_shape_in = event_shape_in
-      self._event_shape_out = event_shape_out
-      self._event_shape_in_static = tensor_util.constant_value_as_shape(
-          event_shape_in)
-      self._event_shape_out_static = tensor_util.constant_value_as_shape(
-          event_shape_out)
-
-      super(Reshape, self).__init__(is_constant_jacobian=True,
-                                    validate_args=validate_args,
-                                    name=name or "reshape")
-
-  def _maybe_check_valid_shape(self, shape_tensor, label,
-                               validate_args=False):
-    """Check that a shape Tensor is int-type and positive."""
-
-    assertions = []
-
-    if not shape_tensor.dtype.is_integer:
-      raise TypeError("{} dtype ({}) should be `int`-like.".format(
-          label, shape_tensor.dtype.name))
-
-    shape_rank = tensor_util.constant_value(array_ops.rank(shape_tensor))
-    if shape_rank is not None and shape_rank > 1:
-      raise ValueError("{} rank should be <= 1.".format(label))
-
-    s = tensor_util.constant_value(shape_tensor)
-    if s is not None:
-      if (s <= 0).any():
-        raise ValueError("{} entries must be positive, but found {}".format(
-            label, s))
-    elif validate_args:
-      assertions.append(check_ops.assert_positive(
-          shape_tensor, message="{} entries must be positive".format(label)))
-
-    return assertions
-
-  def _maybe_check_matching_sizes(self, event_shape_in, event_shape_out,
-                                  validate_args=False):
-    """Check that prod(event_shape_in)==prod(event_shape_out)."""
-
-    def _get_size_from_shape(shape):
-      """Computes size from a shape `Tensor`, statically if possible."""
-      s = tensor_util.constant_value(shape)
-      if s is not None:
-        return [np.int32(np.prod(s))]*2
-      return None, math_ops.reduce_prod(shape, name="size")
-
-    # Ensure `event_shape_in` is compatible with `event_shape_out`.
-    event_size_in_, event_size_in = _get_size_from_shape(  # pylint: disable=unbalanced-tuple-unpacking
-        event_shape_in)
-    event_size_out_, event_size_out = _get_size_from_shape(  # pylint: disable=unbalanced-tuple-unpacking
-        event_shape_out)
-
-    assertions = []
-    if event_size_in_ is not None and event_size_out_ is not None:
-      if event_size_in_ != event_size_out_:
-        raise ValueError(
-            "Input `event_size` ({}) does not match output `event_size` ({}).".
-            format(event_size_in, event_size_out_))
-    elif validate_args:
-      assertions.append(check_ops.assert_equal(
-          event_size_in, event_size_out,
-          message="Input/output `event_size`s do not match."))
-
-    return assertions
-
-  def _reshape_helper(self, x, event_shape_in, event_shape_out):
-    """Reshape only the event_shape of an input `Tensor`."""
-
-    def _get_rank_from_shape(shape):
-      """Computes rank from a shape `Tensor`, statically if possible."""
-      # Uses fact that rank is "shape of shape".
-      ndims = shape.shape.with_rank_at_least(1)[0].value
-      if ndims is not None:
-        return ndims, ndims
-      return None, array_ops.shape(shape)[0]
-
-    event_ndims_in_, event_ndims_in = _get_rank_from_shape(event_shape_in)
-
-    assertions = []
-    # Ensure x.event_shape is compatible with event_shape_in.
-    if x.shape.ndims is not None:
-      x_ndims_, x_ndims = [x.shape.ndims]*2
-    else:
-      x_ndims_, x_ndims = None, array_ops.rank(x)
-
-    if (event_ndims_in_ is not None
-        and x_ndims_ is not None
-        and x.shape.with_rank_at_least(event_ndims_in_)[
-            x_ndims_-event_ndims_in_:].is_fully_defined()):
-      x_event_shape_, x_event_shape = [  # pylint: disable=unbalanced-tuple-unpacking
-          np.int32(x.shape[x_ndims_-event_ndims_in_:])]*2
-    else:
-      x_event_shape_, x_event_shape = (
-          None, array_ops.shape(x)[x_ndims-event_ndims_in:])
-
-    event_shape_in_ = tensor_util.constant_value(event_shape_in)
-
-    if x_event_shape_ is not None and event_shape_in_ is not None:
-      if not np.equal(x_event_shape_, event_shape_in_).all():
-        raise ValueError(
-            "Input `event_shape` ({}) does not match `event_shape_in` ({}).".
-            format(x_event_shape_, event_shape_in_))
-    elif self.validate_args:
-      assertions.append(check_ops.assert_equal(
-          x_event_shape, event_shape_in,
-          message="Input `event_shape` does not match `event_shape_in`."))
-
-    if assertions:
-      x = control_flow_ops.with_dependencies(assertions, x)
-
-    # get the parts of shape(x) that will not change
-    sample_and_batch_shape = array_ops.shape(x)
-
-    ndims = (x.shape.ndims if x.shape.ndims is not None
-             else array_ops.rank(x))
-    sample_and_batch_shape = sample_and_batch_shape[
-        :(ndims - math_ops.abs(event_ndims_in))]
-
-    new_shape = array_ops.concat(
-        [sample_and_batch_shape, event_shape_out], axis=0)
-
-    return array_ops.reshape(x, new_shape)
-
-  def _forward(self, x):
-    with ops.control_dependencies(self._assertions):
-      return self._reshape_helper(x,
-                                  self._event_shape_in,
-                                  self._event_shape_out)
-
-  def _inverse(self, y):
-    with ops.control_dependencies(self._assertions):
-      return self._reshape_helper(y,
-                                  self._event_shape_out,
-                                  self._event_shape_in)
-
-  def _inverse_log_det_jacobian(self, y):
-    with ops.control_dependencies(self._assertions):
-      return constant_op.constant(0., dtype=y.dtype)
-
-  def _forward_log_det_jacobian(self, x):
-    with ops.control_dependencies(self._assertions):
-      return constant_op.constant(0., dtype=x.dtype)
-
-  def _forward_event_shape(self, input_shape):
-    self._event_shape_in_static.assert_is_compatible_with(input_shape)
-    return self._event_shape_out_static
-
-  def _inverse_event_shape(self, output_shape):
-    self._event_shape_out_static.assert_is_compatible_with(output_shape)
-    return self._event_shape_in_static
-
-  def _forward_event_shape_tensor(self, input_shape):
-    input_assertions = self._maybe_check_valid_shape(
-        input_shape, "input event shape", validate_args=self.validate_args)
-    input_assertions += self._maybe_check_matching_sizes(
-        input_shape, self._event_shape_out,
-        validate_args=self.validate_args)
-
-    return control_flow_ops.with_dependencies(
-        input_assertions + self._assertions, self._event_shape_out)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-
-    output_assertions = self._maybe_check_valid_shape(
-        output_shape, "output event shape", validate_args=self.validate_args)
-    output_assertions += self._maybe_check_matching_sizes(
-        output_shape, self._event_shape_in, validate_args=self.validate_args)
-
-    return control_flow_ops.with_dependencies(
-        output_assertions + self._assertions, self._event_shape_in)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
index c20e76c0b7367369865faf973377201c8b8b17e6..a640dfe7dfbcce96261589c7fc49107deaefdd54 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid.py
@@ -18,12 +18,31 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Sigmoid"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Sigmoid",
+]
+
+
+class Sigmoid(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = 1 / (1 + exp(-X))`."""
+
+  def __init__(self, validate_args=False, name="sigmoid"):
+    super(Sigmoid, self).__init__(
+        event_ndims=0, validate_args=validate_args, name=name)
+
+  def _forward(self, x):
+    return math_ops.sigmoid(x)
+
+  def _inverse(self, y):
+    return math_ops.log(y) - math_ops.log1p(-y)
+
+  def _inverse_log_det_jacobian(self, y):
+    return -math_ops.log(y) - math_ops.log1p(-y)
+
+  def _forward_log_det_jacobian(self, x):
+    return -nn_ops.softplus(-x) - nn_ops.softplus(x)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
index 448125230d24066697624bce03fed71a2c2f00b1..223bc9d042c69be05b0e578835a31ed6e83c0c97 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered.py
@@ -18,12 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid_centered_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
 
-_allowed_symbols = ["SigmoidCentered"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "SigmoidCentered",
+]
+
+
+class SigmoidCentered(softmax_centered.SoftmaxCentered):
+  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
+
+  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
+
+  See `bijector.SoftmaxCentered` for more details.
+  """
+
+  def __init__(self, validate_args=False, name="sigmoid_centered"):
+    super(SigmoidCentered, self).__init__(
+        event_ndims=0, validate_args=validate_args, name=name)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
index b3cf03c24612f5c618c71c0a8615f272acdf2d10..3a75e4ae9495793901b0da91a5aa3982aab35852 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py
@@ -18,12 +18,162 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["SinhArcsinh"]
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "SinhArcsinh",
+]
+
+
+def _sqrtx2p1(x):
+  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
+  return array_ops.where(
+      math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1.,
+      math_ops.sqrt(x**2. + 1.),
+      # For large x, calculating x**2 can overflow. This can be alleviated by
+      # considering:
+      # sqrt(1 + x**2)
+      # = exp(0.5 log(1 + x**2))
+      # = exp(0.5 log(x**2 * (1 + x**-2)))
+      # = exp(log(x) + 0.5 * log(1 + x**-2))
+      # = |x| * exp(0.5 log(1 + x**-2))
+      # = |x| * sqrt(1 + x**-2)
+      # We omit the last term in this approximation.
+      # When |x| > 1 / sqrt(machineepsilon), the second term will be 1,
+      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
+      # and higher order gradients, since the first order derivative of
+      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
+      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
+      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
+      math_ops.abs(x))
+
+
+class SinhArcsinh(bijector.Bijector):
+  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
+
+  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
+  transformation is a
+  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
+  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
+
+  The `SinhArcsinh` transformation of the Normal is described in
+  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
+  This Bijector allows a similar transformation of any distribution supported on
+  `(-inf, inf)`.
+
+  #### Meaning of the parameters
+
+  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
+  * Positive (negative) `skewness` leads to positive (negative) skew.
+    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
+      "tilted" to the right.
+    * positive skew means positive values of `Y` become more likely, and
+      negative values become less likely.
+  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
+    * Fatter tails mean larger values of `|Y|` become more likely.
+    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
+      "flat" around `Y = 0`, and a very steep drop-off in the tails.
+    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
+      peaked at the mode with heavier tails.
+
+  To see the argument about the tails, note that for `|X| >> 1` and
+  `|X| >> (|skewness| * tailweight)**tailweight`, we have
+  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
+  """
+
+  def __init__(self,
+               skewness=None,
+               tailweight=None,
+               event_ndims=0,
+               validate_args=False,
+               name="SinhArcsinh"):
+    """Instantiates the `SinhArcsinh` bijector.
+
+    Args:
+      skewness:  Skewness parameter.  Float-type `Tensor`.  Default is `0`
+        of type `float32`.
+      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
+        `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[skewness, tailweight]):
+      tailweight = 1. if tailweight is None else tailweight
+      skewness = 0. if skewness is None else skewness
+      self._skewness = ops.convert_to_tensor(
+          skewness, name="skewness")
+      self._tailweight = ops.convert_to_tensor(
+          tailweight, name="tailweight", dtype=self._skewness.dtype)
+      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
+      if validate_args:
+        self._tailweight = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._tailweight,
+                message="Argument tailweight was not positive")
+        ], self._tailweight)
+    super(SinhArcsinh, self).__init__(
+        event_ndims=event_ndims, validate_args=validate_args, name=name)
+
+  @property
+  def skewness(self):
+    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._skewness
+
+  @property
+  def tailweight(self):
+    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
+    return self._tailweight
+
+  def _forward(self, x):
+    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
+
+  def _inverse(self, y):
+    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # x = sinh(arcsinh(y) / tailweight - skewness)
+    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
+    # dx/dy
+    # = cosh(arcsinh(y) / tailweight - skewness)
+    #     / (tailweight * sqrt(y**2 + 1))
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        # This is computed inside the log to avoid catastrophic cancellations
+        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
+        math_ops.log(math_ops.cosh(
+            math_ops.asinh(y) / self.tailweight - self.skewness)
+                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
+                     # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
+                     / _sqrtx2p1(y))
+        - math_ops.log(self.tailweight),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    # y = sinh((arcsinh(x) + skewness) * tailweight)
+    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
+    # dy/dx
+    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        # This is computed inside the log to avoid catastrophic cancellations
+        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
+        math_ops.log(math_ops.cosh(
+            (math_ops.asinh(x) + self.skewness) * self.tailweight)
+                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
+                     # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
+                     / _sqrtx2p1(x))
+        + math_ops.log(self.tailweight),
+        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
deleted file mode 100644
index 3a75e4ae9495793901b0da91a5aa3982aab35852..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh_impl.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SinhArcsinh bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-__all__ = [
-    "SinhArcsinh",
-]
-
-
-def _sqrtx2p1(x):
-  """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`."""
-  return array_ops.where(
-      math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1.,
-      math_ops.sqrt(x**2. + 1.),
-      # For large x, calculating x**2 can overflow. This can be alleviated by
-      # considering:
-      # sqrt(1 + x**2)
-      # = exp(0.5 log(1 + x**2))
-      # = exp(0.5 log(x**2 * (1 + x**-2)))
-      # = exp(log(x) + 0.5 * log(1 + x**-2))
-      # = |x| * exp(0.5 log(1 + x**-2))
-      # = |x| * sqrt(1 + x**-2)
-      # We omit the last term in this approximation.
-      # When |x| > 1 / sqrt(machineepsilon), the second term will be 1,
-      # due to sqrt(1 + x**-2) = 1. This is also true with the gradient term,
-      # and higher order gradients, since the first order derivative of
-      # sqrt(1 + x**-2) is -2 * x**-3 / (1 + x**-2) = -2 / (x**3 + x),
-      # and all nth-order derivatives will be O(x**-(n + 2)). This makes any
-      # gradient terms that contain any derivatives of sqrt(1 + x**-2) vanish.
-      math_ops.abs(x))
-
-
-class SinhArcsinh(bijector.Bijector):
-  """Compute `Y = g(X) = Sinh( (Arcsinh(X) + skewness) * tailweight )`.
-
-  For `skewness in (-inf, inf)` and `tailweight in (0, inf)`, this
-  transformation is a
-  diffeomorphism of the real line `(-inf, inf)`.  The inverse transform is
-  `X = g^{-1}(Y) = Sinh( ArcSinh(Y) / tailweight - skewness )`.
-
-  The `SinhArcsinh` transformation of the Normal is described in
-  [Sinh-arcsinh distributions](https://www.jstor.org/stable/27798865)
-  This Bijector allows a similar transformation of any distribution supported on
-  `(-inf, inf)`.
-
-  #### Meaning of the parameters
-
-  * If `skewness = 0` and `tailweight = 1`, this transform is the identity.
-  * Positive (negative) `skewness` leads to positive (negative) skew.
-    * positive skew means, for unimodal `X` centered at zero, the mode of `Y` is
-      "tilted" to the right.
-    * positive skew means positive values of `Y` become more likely, and
-      negative values become less likely.
-  * Larger (smaller) `tailweight` leads to fatter (thinner) tails.
-    * Fatter tails mean larger values of `|Y|` become more likely.
-    * If `X` is a unit Normal, `tailweight < 1` leads to a distribution that is
-      "flat" around `Y = 0`, and a very steep drop-off in the tails.
-    * If `X` is a unit Normal, `tailweight > 1` leads to a distribution more
-      peaked at the mode with heavier tails.
-
-  To see the argument about the tails, note that for `|X| >> 1` and
-  `|X| >> (|skewness| * tailweight)**tailweight`, we have
-  `Y approx 0.5 X**tailweight e**(sign(X) skewness * tailweight)`.
-  """
-
-  def __init__(self,
-               skewness=None,
-               tailweight=None,
-               event_ndims=0,
-               validate_args=False,
-               name="SinhArcsinh"):
-    """Instantiates the `SinhArcsinh` bijector.
-
-    Args:
-      skewness:  Skewness parameter.  Float-type `Tensor`.  Default is `0`
-        of type `float32`.
-      tailweight:  Tailweight parameter.  Positive `Tensor` of same `dtype` as
-        `skewness` and broadcastable `shape`.  Default is `1` of type `float32`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[skewness, tailweight]):
-      tailweight = 1. if tailweight is None else tailweight
-      skewness = 0. if skewness is None else skewness
-      self._skewness = ops.convert_to_tensor(
-          skewness, name="skewness")
-      self._tailweight = ops.convert_to_tensor(
-          tailweight, name="tailweight", dtype=self._skewness.dtype)
-      check_ops.assert_same_float_dtype([self._skewness, self._tailweight])
-      if validate_args:
-        self._tailweight = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._tailweight,
-                message="Argument tailweight was not positive")
-        ], self._tailweight)
-    super(SinhArcsinh, self).__init__(
-        event_ndims=event_ndims, validate_args=validate_args, name=name)
-
-  @property
-  def skewness(self):
-    """The `skewness` in: `Y  = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
-    return self._skewness
-
-  @property
-  def tailweight(self):
-    """The `tailweight` in: `Y = Sinh((Arcsinh(X) + skewness) * tailweight)`."""
-    return self._tailweight
-
-  def _forward(self, x):
-    return math_ops.sinh((math_ops.asinh(x) + self.skewness) * self.tailweight)
-
-  def _inverse(self, y):
-    return math_ops.sinh(math_ops.asinh(y) / self.tailweight - self.skewness)
-
-  def _inverse_log_det_jacobian(self, y):
-    # x = sinh(arcsinh(y) / tailweight - skewness)
-    # Using sinh' = cosh, arcsinh'(y) = 1 / sqrt(y**2 + 1),
-    # dx/dy
-    # = cosh(arcsinh(y) / tailweight - skewness)
-    #     / (tailweight * sqrt(y**2 + 1))
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(y) / tailweight) - skewness) and sqrt(x**2 + 1).
-        math_ops.log(math_ops.cosh(
-            math_ops.asinh(y) / self.tailweight - self.skewness)
-                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
-                     # where (arcsinh(x) / tailweight) - skewness ~= arcsinh(x).
-                     / _sqrtx2p1(y))
-        - math_ops.log(self.tailweight),
-        axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    # y = sinh((arcsinh(x) + skewness) * tailweight)
-    # Using sinh' = cosh, arcsinh'(x) = 1 / sqrt(x**2 + 1),
-    # dy/dx
-    # = cosh((arcsinh(x) + skewness) * tailweight) * tailweight / sqrt(x**2 + 1)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        # This is computed inside the log to avoid catastrophic cancellations
-        # from cosh((arcsinh(x) + skewness) * tailweight) and sqrt(x**2 + 1).
-        math_ops.log(math_ops.cosh(
-            (math_ops.asinh(x) + self.skewness) * self.tailweight)
-                     # TODO(srvasude): Consider using cosh(arcsinh(x)) in cases
-                     # where (arcsinh(x) + skewness) * tailweight ~= arcsinh(x).
-                     / _sqrtx2p1(x))
-        + math_ops.log(self.tailweight),
-        axis=event_dims)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index be6608f97880ae68e10b17c815bf2d8438293261..e4a1d3dde230724e74d5076c5bba079590b94a70 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -18,12 +18,232 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+import numpy as np
 
-_allowed_symbols = ["SoftmaxCentered"]
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
 
-remove_undocumented(__name__, _allowed_symbols)
+
+__all__ = [
+    "SoftmaxCentered",
+]
+
+
+class SoftmaxCentered(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = exp([X 0]) / sum(exp([X 0]))`.
+
+  To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a
+  bijection, the forward transformation appends a value to the input and the
+  inverse removes this coordinate. The appended coordinate represents a pivot,
+  e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
+  coordinate.
+
+  Because we append a coordinate, this bijector only supports `event_ndim in [0,
+  1]`, i.e., scalars and vectors.
+
+  Example Use:
+
+  ```python
+  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
+  # Result: [0.2, 0.3, 0.4, 0.1]
+  # Extra result: 0.1
+
+  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
+  # Result: tf.log([2, 3, 4])
+  # Extra coordinate removed.
+  ```
+
+  At first blush it may seem like the [Invariance of domain](
+  https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this
+  implementation is not a bijection. However, the appended dimension
+  makes the (forward) image non-open and the theorem does not directly apply.
+  """
+
+  def __init__(self,
+               event_ndims=0,
+               validate_args=False,
+               name="softmax_centered"):
+    self._graph_parents = []
+    self._name = name
+    with self._name_scope("init", values=[event_ndims]):
+      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
+      event_ndims = tensor_util.constant_value(event_ndims)
+      if event_ndims is None or event_ndims not in [0, 1]:
+        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
+    self._static_event_ndims = event_ndims
+    super(SoftmaxCentered, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward_event_shape(self, input_shape):
+    if input_shape.ndims is None:
+      return input_shape
+    if input_shape.ndims != self._static_event_ndims:
+      raise ValueError("input_shape.dims = %d != %d" %
+                       (input_shape.ndims, self._static_event_ndims))
+    if input_shape.ndims == 0:
+      return tensor_shape.TensorShape([2])
+    if input_shape.ndims == 1:
+      return tensor_shape.TensorShape(input_shape[0] + 1)
+    # Unreachable code:
+    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
+
+  def _forward_event_shape_tensor(self, input_shape):
+    ndims = array_ops.shape(input_shape)
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_zero_or_one = check_ops.assert_equal(
+          ndims, 0 if self._static_event_ndims == 0 else 1,
+          message="event_ndims must be 0 or 1")
+      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
+    if self._static_event_ndims == 0:
+      return ops.convert_to_tensor(
+          [2], dtype=dtypes.int32, name="output_shape")
+    return input_shape + 1
+
+  def _inverse_event_shape(self, output_shape):
+    if output_shape.ndims is None:
+      return output_shape
+    if output_shape.ndims != 1:
+      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
+    if self._static_event_ndims == 0:
+      return tensor_shape.TensorShape([])
+    return tensor_shape.TensorShape(output_shape[0] - 1)
+
+  def _inverse_event_shape_tensor(self, output_shape):
+    ndims = array_ops.shape(output_shape)[0]
+    if self.validate_args:
+      # It is not possible for a negative shape so we need only check <= 1.
+      is_one = check_ops.assert_equal(
+          ndims, 1, message="event_ndims must be 1")
+      ndims = control_flow_ops.with_dependencies([is_one], ndims)
+    if self._static_event_ndims == 0:
+      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
+    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
+
+  def _forward(self, x):
+    # Pad the last dim with a zeros vector. We need this because it lets us
+    # infer the scale in the inverse function.
+    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
+    ndims = _get_ndims(y)
+    y = array_ops.pad(y, paddings=array_ops.one_hot(indices=[-1, ndims - 1],
+                                                    depth=ndims,
+                                                    axis=0,
+                                                    dtype=dtypes.int32))
+    # Set shape hints.
+    if x.shape.ndims is not None:
+      shape = x.shape.as_list()
+      if self._static_event_ndims == 0:
+        shape += [2]
+      elif shape[-1] is not None:
+        shape[-1] += 1
+      shape = tensor_shape.TensorShape(shape)
+      y.shape.assert_is_compatible_with(shape)
+      y.set_shape(shape)
+
+    # Since we only support event_ndims in [0, 1] and we do padding, we always
+    # reduce over the last dimension, i.e., dim=-1 (which is the default).
+    return nn_ops.softmax(y)
+
+  def _inverse(self, y):
+    # To derive the inverse mapping note that:
+    #   y[i] = exp(x[i]) / normalization
+    # and
+    #   y[end] = 1 / normalization.
+    # Thus:
+    # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
+    #      = log(exp(x[i])/normalization) - log(y[end])
+    #      = log(y[i]) - log(y[end])
+    shape = (np.asarray(y.shape.as_list(), dtype=np.int32)
+             if y.shape.is_fully_defined()
+             else array_ops.shape(y, name="shape"))
+    ndims = _get_ndims(y)
+
+    # Do this first to make sure CSE catches that it'll happen again in
+    # _inverse_log_det_jacobian.
+    x = math_ops.log(y)
+
+    # We now extract the last coordinate of the rightmost dimension.
+    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
+    begin = array_ops.one_hot(indices=ndims-1,
+                              depth=ndims,
+                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
+                              dtype=shape.dtype)
+    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
+    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
+
+    # Here we slice out all but the last coordinate; see above for idea.
+    begin = array_ops.zeros_like(shape)
+    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
+    x = array_ops.strided_slice(x, begin, begin + size)
+
+    x += log_normalization
+
+    if self._static_event_ndims == 0:
+      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
+
+    # Set shape hints.
+    if y.shape.ndims is not None:
+      shape = y.shape.as_list()
+      if self._static_event_ndims == 0:
+        shape = shape[:-1]
+      elif shape[-1] is not None:
+        shape[-1] -= 1
+      shape = tensor_shape.TensorShape(shape)
+      x.shape.assert_is_compatible_with(shape)
+      x.set_shape(shape)
+
+    return x
+
+  def _inverse_log_det_jacobian(self, y):
+    # WLOG, consider the vector case:
+    #   x = log(y[:-1]) - log(y[-1])
+    # where,
+    #   y[-1] = 1 - sum(y[:-1]).
+    # We have:
+    #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
+    #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }   (1)
+    #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
+    #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
+    #                        det(diag(y[:-1])) }                     (2)
+    #                = 1 / { y[-1] prod(y[:-1]) }
+    #                = 1 / prod(y)
+    # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
+    #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector
+    #       docstring "Tip".
+    # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
+    return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
+
+  def _forward_log_det_jacobian(self, x):
+    if self._static_event_ndims == 0:
+      return x - 2. * nn_ops.softplus(x)
+    else:
+      # This code is similar to nn_ops.log_softmax but different because we have
+      # an implicit zero column to handle. I.e., instead of:
+      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
+      # we must do:
+      #   log_normalization = 1 + reduce_sum(exp(logits))
+      #   -log_normalization + reduce_sum(logits - log_normalization)
+      log_normalization = nn_ops.softplus(
+          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+      fldj = (-log_normalization +
+              math_ops.reduce_sum(x - log_normalization,
+                                  axis=-1,
+                                  keep_dims=True))
+      return array_ops.squeeze(fldj, squeeze_dims=-1)
+
+
+def _get_ndims(x):
+  """Returns `ndims`, statically if possible."""
+  if x.shape.ndims is not None:
+    return x.shape.ndims
+  return array_ops.rank(x, name="ndims")
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
deleted file mode 100644
index 8645cc1b6b04be75a419342591272f07a4a1711c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered_impl.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""SoftmaxCentered bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "SoftmaxCentered",
-]
-
-
-class SoftmaxCentered(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = exp([X 0]) / sum(exp([X 0]))`.
-
-  To implement [softmax](https://en.wikipedia.org/wiki/Softmax_function) as a
-  bijection, the forward transformation appends a value to the input and the
-  inverse removes this coordinate. The appended coordinate represents a pivot,
-  e.g., `softmax(x) = exp(x-c) / sum(exp(x-c))` where `c` is the implicit last
-  coordinate.
-
-  Because we append a coordinate, this bijector only supports `event_ndim in [0,
-  1]`, i.e., scalars and vectors.
-
-  Example Use:
-
-  ```python
-  bijector.SoftmaxCentered(event_ndims=1).forward(tf.log([2, 3, 4]))
-  # Result: [0.2, 0.3, 0.4, 0.1]
-  # Extra result: 0.1
-
-  bijector.SoftmaxCentered(event_ndims=1).inverse([0.2, 0.3, 0.4, 0.1])
-  # Result: tf.log([2, 3, 4])
-  # Extra coordinate removed.
-  ```
-
-  At first blush it may seem like the [Invariance of domain](
-  https://en.wikipedia.org/wiki/Invariance_of_domain) theorem implies this
-  implementation is not a bijection. However, the appended dimension
-  makes the (forward) image non-open and the theorem does not directly apply.
-  """
-
-  def __init__(self,
-               event_ndims=0,
-               validate_args=False,
-               name="softmax_centered"):
-    self._graph_parents = []
-    self._name = name
-    with self._name_scope("init", values=[event_ndims]):
-      event_ndims = ops.convert_to_tensor(event_ndims, name="event_ndims")
-      event_ndims = tensor_util.constant_value(event_ndims)
-      if event_ndims is None or event_ndims not in [0, 1]:
-        raise ValueError("`event_ndims` must be a TF constant which is 0 or 1")
-    self._static_event_ndims = event_ndims
-    super(SoftmaxCentered, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward_event_shape(self, input_shape):
-    if input_shape.ndims is None:
-      return input_shape
-    if input_shape.ndims != self._static_event_ndims:
-      raise ValueError("input_shape.dims = %d != %d" %
-                       (input_shape.ndims, self._static_event_ndims))
-    if input_shape.ndims == 0:
-      return tensor_shape.TensorShape([2])
-    if input_shape.ndims == 1:
-      return tensor_shape.TensorShape(input_shape[0] + 1)
-    # Unreachable code:
-    raise ValueError("event_ndims = %d must be 0 or 1" % input_shape.ndims)
-
-  def _forward_event_shape_tensor(self, input_shape):
-    ndims = array_ops.shape(input_shape)
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_zero_or_one = check_ops.assert_equal(
-          ndims, 0 if self._static_event_ndims == 0 else 1,
-          message="event_ndims must be 0 or 1")
-      ndims = control_flow_ops.with_dependencies([is_zero_or_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor(
-          [2], dtype=dtypes.int32, name="output_shape")
-    return input_shape + 1
-
-  def _inverse_event_shape(self, output_shape):
-    if output_shape.ndims is None:
-      return output_shape
-    if output_shape.ndims != 1:
-      raise ValueError("output_shape.ndims = %d != 1" % output_shape.ndims)
-    if self._static_event_ndims == 0:
-      return tensor_shape.TensorShape([])
-    return tensor_shape.TensorShape(output_shape[0] - 1)
-
-  def _inverse_event_shape_tensor(self, output_shape):
-    ndims = array_ops.shape(output_shape)[0]
-    if self.validate_args:
-      # It is not possible for a negative shape so we need only check <= 1.
-      is_one = check_ops.assert_equal(
-          ndims, 1, message="event_ndims must be 1")
-      ndims = control_flow_ops.with_dependencies([is_one], ndims)
-    if self._static_event_ndims == 0:
-      return ops.convert_to_tensor([], dtype=dtypes.int32, name="output_shape")
-    return array_ops.expand_dims(output_shape[0] - 1, dim=0)
-
-  def _forward(self, x):
-    # Pad the last dim with a zeros vector. We need this because it lets us
-    # infer the scale in the inverse function.
-    y = array_ops.expand_dims(x, dim=-1) if self._static_event_ndims == 0 else x
-    ndims = (y.get_shape().ndims if y.get_shape().ndims is not None
-             else array_ops.rank(y))
-    y = array_ops.pad(y,
-                      paddings=array_ops.concat(
-                          (array_ops.zeros(
-                              (ndims - 1, 2), dtype=dtypes.int32), [[0, 1]]),
-                          0))
-
-    # Set shape hints.
-    if x.get_shape().ndims is not None:
-      shape = x.get_shape().as_list()
-      if self._static_event_ndims == 0:
-        shape += [2]
-      elif shape[-1] is not None:
-        shape[-1] += 1
-      shape = tensor_shape.TensorShape(shape)
-      y.get_shape().assert_is_compatible_with(shape)
-      y.set_shape(shape)
-
-    # Since we only support event_ndims in [0, 1] and we do padding, we always
-    # reduce over the last dimension, i.e., dim=-1 (which is the default).
-    return nn_ops.softmax(y)
-
-  def _inverse(self, y):
-    # To derive the inverse mapping note that:
-    #   y[i] = exp(x[i]) / normalization
-    # and
-    #   y[end] = 1 / normalization.
-    # Thus:
-    # x[i] = log(exp(x[i])) - log(y[end]) - log(normalization)
-    #      = log(exp(x[i])/normalization) - log(y[end])
-    #      = log(y[i]) - log(y[end])
-    shape = (np.asarray(y.get_shape().as_list(), dtype=np.int32)
-             if y.get_shape().is_fully_defined()
-             else array_ops.shape(y, name="shape"))
-    ndims = y.get_shape().ndims or math_ops.rank(y, name="ndims")
-
-    # Do this first to make sure CSE catches that it'll happen again in
-    # _inverse_log_det_jacobian.
-    x = math_ops.log(y)
-
-    # We now extract the last coordinate of the rightmost dimension.
-    # Our trick is to slice from [0,0,...,shape[-1]-1] to shape[:-1]+[1].
-    begin = array_ops.one_hot(indices=ndims-1,
-                              depth=ndims,
-                              on_value=shape[-1]-np.array(1, dtype=shape.dtype),
-                              dtype=shape.dtype)
-    size = array_ops.concat([shape[:-1], np.asarray([1], dtype=shape.dtype)], 0)
-    log_normalization = -array_ops.strided_slice(x, begin, begin + size)
-
-    # Here we slice out all but the last coordinate; see above for idea.
-    begin = array_ops.zeros_like(shape)
-    size = array_ops.concat([shape[:-1], [shape[-1] - 1]], 0)
-    x = array_ops.strided_slice(x, begin, begin + size)
-
-    x += log_normalization
-
-    if self._static_event_ndims == 0:
-      x = array_ops.squeeze(x, squeeze_dims=[ndims-1])
-
-    # Set shape hints.
-    if y.get_shape().ndims is not None:
-      shape = y.get_shape().as_list()
-      if self._static_event_ndims == 0:
-        shape = shape[:-1]
-      elif shape[-1] is not None:
-        shape[-1] -= 1
-      shape = tensor_shape.TensorShape(shape)
-      x.get_shape().assert_is_compatible_with(shape)
-      x.set_shape(shape)
-
-    return x
-
-  def _inverse_log_det_jacobian(self, y):
-    # WLOG, consider the vector case:
-    #   x = log(y[:-1]) - log(y[-1])
-    # where,
-    #   y[-1] = 1 - sum(y[:-1]).
-    # We have:
-    #   det{ dX/dY } = det{ diag(1 ./ y[:-1]) + 1 / y[-1] }
-    #                = det{ inv{ diag(y[:-1]) - y[:-1]' y[:-1] } }   (1)
-    #                = 1 / det{ diag(y[:-1]) - y[:-1]' y[:-1] }
-    #                = 1 / { (1 + y[:-1]' inv(diag(y[:-1])) y[:-1]) *
-    #                        det(diag(y[:-1])) }                     (2)
-    #                = 1 / { y[-1] prod(y[:-1]) }
-    #                = 1 / prod(y)
-    # (1) - https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
-    #       or by noting that det{ dX/dY } = 1 / det{ dY/dX } from Bijector
-    #       docstring "Tip".
-    # (2) - https://en.wikipedia.org/wiki/Matrix_determinant_lemma
-    return -math_ops.reduce_sum(math_ops.log(y), axis=-1)
-
-  def _forward_log_det_jacobian(self, x):
-    if self._static_event_ndims == 0:
-      return x - 2. * nn_ops.softplus(x)
-    else:
-      # This code is similar to nn_ops.log_softmax but different because we have
-      # an implicit zero column to handle. I.e., instead of:
-      #   reduce_sum(logits - reduce_sum(exp(logits), dim))
-      # we must do:
-      #   log_normalization = 1 + reduce_sum(exp(logits))
-      #   -log_normalization + reduce_sum(logits - log_normalization)
-      log_normalization = nn_ops.softplus(
-          math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
-      fldj = (-log_normalization +
-              math_ops.reduce_sum(x - log_normalization,
-                                  axis=-1,
-                                  keep_dims=True))
-      return array_ops.squeeze(fldj, squeeze_dims=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
index 250a1144b53bb43271ff7ee494604d9bae6feda8..81957fcf78922fa15fd20a25d144071f431161ae 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softplus.py
@@ -18,12 +18,127 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.softplus_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as distribution_util
 
-_allowed_symbols = ["Softplus"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Softplus",
+]
+
+
+class Softplus(bijector.Bijector):
+  """Bijector which computes `Y = g(X) = Log[1 + exp(X)]`.
+
+  The softplus `Bijector` has the following two useful properties:
+
+  * The domain is the positive real numbers
+  * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
+    the `Exp` `Bijector`.
+
+  The optional nonzero `hinge_softness` parameter changes the transition at
+  zero.  With `hinge_softness = c`, the bijector is:
+
+    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
+
+  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
+  so the behavior for large `x` is the same as the standard softplus.
+
+  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
+  approaching `max(0, x)`.
+
+  * `c = 1` is the default.
+  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
+  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
+  * `c = 0` results in a non-bijective transformation and triggers an exception.
+
+    Example Use:
+
+    ```python
+    # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
+    # batch ndim and 2 event ndims (i.e., vector of matrices).
+    softplus = Softplus(event_ndims=2)
+    x = [[[1., 2],
+          [3, 4]],
+         [[5, 6],
+          [7, 8]]]
+    log(1 + exp(x)) == softplus.forward(x)
+    log(exp(x) - 1) == softplus.inverse(x)
+    ```
+
+    Note: log(.) and exp(.) are applied element-wise but the Jacobian is a
+    reduction over the event space.
+  """
+
+  @distribution_util.AppendDocstring(
+      kwargs_dict={
+          "hinge_softness": (
+              "Nonzero floating point `Tensor`.  Controls the softness of what "
+              "would otherwise be a kink at the origin.  Default is 1.0")})
+  def __init__(self,
+               event_ndims=0,
+               hinge_softness=None,
+               validate_args=False,
+               name="softplus"):
+    with ops.name_scope(name, values=[hinge_softness]):
+      if hinge_softness is not None:
+        self._hinge_softness = ops.convert_to_tensor(
+            hinge_softness, name="hinge_softness")
+      else:
+        self._hinge_softness = None
+      if validate_args:
+        nonzero_check = check_ops.assert_none_equal(
+            ops.convert_to_tensor(
+                0, dtype=self.hinge_softness.dtype),
+            self.hinge_softness,
+            message="hinge_softness must be non-zero")
+        self._hinge_softness = control_flow_ops.with_dependencies(
+            [nonzero_check], self.hinge_softness)
+
+    super(Softplus, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    if self.hinge_softness is None:
+      return nn_ops.softplus(x)
+    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
+    return hinge_softness * nn_ops.softplus(x / hinge_softness)
+
+  def _inverse(self, y):
+    if self.hinge_softness is None:
+      return distribution_util.softplus_inverse(y)
+    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
+    return hinge_softness * distribution_util.softplus_inverse(
+        y / hinge_softness)
+
+  def _inverse_log_det_jacobian(self, y):
+    # Could also do:
+    #   ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y),
+    #                              axis=event_dims)
+    # but the following is more numerically stable. Ie,
+    # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1]
+    # ==> dX/dY = exp{Y} / (exp{Y} - 1)
+    #           = 1 / (1 - exp{-Y}),
+    # which is the most stable for large Y > 0. For small Y, we use
+    # 1 - exp{-Y} approx Y.
+    if self.hinge_softness is not None:
+      y /= math_ops.cast(self.hinge_softness, y.dtype)
+    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
+                                axis=self._event_dims_tensor(y))
+
+  def _forward_log_det_jacobian(self, x):
+    if self.hinge_softness is not None:
+      x /= math_ops.cast(self.hinge_softness, x.dtype)
+    return -math_ops.reduce_sum(nn_ops.softplus(-x),
+                                axis=self._event_dims_tensor(x))
+
+  @property
+  def hinge_softness(self):
+    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
deleted file mode 100644
index 81957fcf78922fa15fd20a25d144071f431161ae..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softplus_impl.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Softplus bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops.distributions import bijector
-from tensorflow.python.ops.distributions import util as distribution_util
-
-
-__all__ = [
-    "Softplus",
-]
-
-
-class Softplus(bijector.Bijector):
-  """Bijector which computes `Y = g(X) = Log[1 + exp(X)]`.
-
-  The softplus `Bijector` has the following two useful properties:
-
-  * The domain is the positive real numbers
-  * `softplus(x) approx x`, for large `x`, so it does not overflow as easily as
-    the `Exp` `Bijector`.
-
-  The optional nonzero `hinge_softness` parameter changes the transition at
-  zero.  With `hinge_softness = c`, the bijector is:
-
-    ```f_c(x) := c * g(x / c) = c * Log[1 + exp(x / c)].```
-
-  For large `x >> 1`, `c * Log[1 + exp(x / c)] approx c * Log[exp(x / c)] = x`,
-  so the behavior for large `x` is the same as the standard softplus.
-
-  As `c > 0` approaches 0 from the right, `f_c(x)` becomes less and less soft,
-  approaching `max(0, x)`.
-
-  * `c = 1` is the default.
-  * `c > 0` but small means `f(x) approx ReLu(x) = max(0, x)`.
-  * `c < 0` flips sign and reflects around the `y-axis`: `f_{-c}(x) = -f_c(-x)`.
-  * `c = 0` results in a non-bijective transformation and triggers an exception.
-
-    Example Use:
-
-    ```python
-    # Create the Y=g(X)=softplus(X) transform which works only on Tensors with 1
-    # batch ndim and 2 event ndims (i.e., vector of matrices).
-    softplus = Softplus(event_ndims=2)
-    x = [[[1., 2],
-          [3, 4]],
-         [[5, 6],
-          [7, 8]]]
-    log(1 + exp(x)) == softplus.forward(x)
-    log(exp(x) - 1) == softplus.inverse(x)
-    ```
-
-    Note: log(.) and exp(.) are applied element-wise but the Jacobian is a
-    reduction over the event space.
-  """
-
-  @distribution_util.AppendDocstring(
-      kwargs_dict={
-          "hinge_softness": (
-              "Nonzero floating point `Tensor`.  Controls the softness of what "
-              "would otherwise be a kink at the origin.  Default is 1.0")})
-  def __init__(self,
-               event_ndims=0,
-               hinge_softness=None,
-               validate_args=False,
-               name="softplus"):
-    with ops.name_scope(name, values=[hinge_softness]):
-      if hinge_softness is not None:
-        self._hinge_softness = ops.convert_to_tensor(
-            hinge_softness, name="hinge_softness")
-      else:
-        self._hinge_softness = None
-      if validate_args:
-        nonzero_check = check_ops.assert_none_equal(
-            ops.convert_to_tensor(
-                0, dtype=self.hinge_softness.dtype),
-            self.hinge_softness,
-            message="hinge_softness must be non-zero")
-        self._hinge_softness = control_flow_ops.with_dependencies(
-            [nonzero_check], self.hinge_softness)
-
-    super(Softplus, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  def _forward(self, x):
-    if self.hinge_softness is None:
-      return nn_ops.softplus(x)
-    hinge_softness = math_ops.cast(self.hinge_softness, x.dtype)
-    return hinge_softness * nn_ops.softplus(x / hinge_softness)
-
-  def _inverse(self, y):
-    if self.hinge_softness is None:
-      return distribution_util.softplus_inverse(y)
-    hinge_softness = math_ops.cast(self.hinge_softness, y.dtype)
-    return hinge_softness * distribution_util.softplus_inverse(
-        y / hinge_softness)
-
-  def _inverse_log_det_jacobian(self, y):
-    # Could also do:
-    #   ildj = math_ops.reduce_sum(y - distribution_util.softplus_inverse(y),
-    #                              axis=event_dims)
-    # but the following is more numerically stable. Ie,
-    # Y = Log[1 + exp{X}] ==> X = Log[exp{Y} - 1]
-    # ==> dX/dY = exp{Y} / (exp{Y} - 1)
-    #           = 1 / (1 - exp{-Y}),
-    # which is the most stable for large Y > 0. For small Y, we use
-    # 1 - exp{-Y} approx Y.
-    if self.hinge_softness is not None:
-      y /= math_ops.cast(self.hinge_softness, y.dtype)
-    return -math_ops.reduce_sum(math_ops.log(-math_ops.expm1(-y)),
-                                axis=self._event_dims_tensor(y))
-
-  def _forward_log_det_jacobian(self, x):
-    if self.hinge_softness is not None:
-      x /= math_ops.cast(self.hinge_softness, x.dtype)
-    return -math_ops.reduce_sum(nn_ops.softplus(-x),
-                                axis=self._event_dims_tensor(x))
-
-  @property
-  def hinge_softness(self):
-    return self._hinge_softness
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
index d439f28884d8bd7f2b808317e10c5b5e44bfcfa2..00520bcda85e9527767e6342bf75f10667c264a8 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/weibull.py
@@ -18,12 +18,132 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.distributions.python.ops.bijectors.weibull_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
 
-_allowed_symbols = ["Weibull"]
 
-remove_undocumented(__name__, _allowed_symbols)
+__all__ = [
+    "Weibull",
+]
+
+
+class Weibull(bijector.Bijector):
+  """Compute `Y = g(X) = 1 - exp((-X / scale) ** concentration), X >= 0`.
+
+  This bijector maps inputs from `[0, inf]` to [0, 1]`. The inverse of the
+  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
+  random variable with the
+  [Weibull distribution](https://en.wikipedia.org/wiki/Weibull_distribution):
+
+  ```none
+  Y ~ Weibull(scale, concentration)
+  pdf(y; scale, concentration, y >= 0) = (scale / concentration) * (
+    scale / concentration) ** (concentration - 1) * exp(
+      -(y / scale) ** concentration)
+  ```
+  """
+
+  def __init__(self,
+               scale=1.,
+               concentration=1.,
+               event_ndims=0,
+               validate_args=False,
+               name="weibull"):
+    """Instantiates the `Weibull` bijector.
+
+    Args:
+      scale: Positive Float-type `Tensor` that is the same dtype and is
+        broadcastable with `concentration`.
+        This is `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
+      concentration: Positive Float-type `Tensor` that is the same dtype and is
+        broadcastable with `scale`.
+        This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
+      event_ndims: Python scalar indicating the number of dimensions associated
+        with a particular draw from the distribution.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    self._validate_args = validate_args
+    with self._name_scope("init", values=[scale, concentration]):
+      self._scale = ops.convert_to_tensor(scale, name="scale")
+      self._concentration = ops.convert_to_tensor(
+          concentration, name="concentration")
+      check_ops.assert_same_float_dtype([self._scale, self._concentration])
+      if validate_args:
+        self._scale = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._scale,
+                message="Argument scale was not positive")
+        ], self._scale)
+        self._concentration = control_flow_ops.with_dependencies([
+            check_ops.assert_positive(
+                self._concentration,
+                message="Argument concentration was not positive")
+        ], self._concentration)
+
+    super(Weibull, self).__init__(
+        event_ndims=event_ndims,
+        validate_args=validate_args,
+        name=name)
+
+  @property
+  def scale(self):
+    """The `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
+    return self._scale
+
+  @property
+  def concentration(self):
+    """The `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
+    return self._concentration
+
+  def _forward(self, x):
+    x = self._maybe_assert_valid_x(x)
+    return -math_ops.expm1(-((x / self.scale) ** self.concentration))
+
+  def _inverse(self, y):
+    y = self._maybe_assert_valid_y(y)
+    return self.scale * (-math_ops.log1p(-y)) ** (1 / self.concentration)
+
+  def _inverse_log_det_jacobian(self, y):
+    y = self._maybe_assert_valid_y(y)
+    event_dims = self._event_dims_tensor(y)
+    return math_ops.reduce_sum(
+        -math_ops.log1p(-y) +
+        (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
+        math_ops.log(self.scale / self.concentration),
+        axis=event_dims)
+
+  def _forward_log_det_jacobian(self, x):
+    x = self._maybe_assert_valid_x(x)
+    event_dims = self._event_dims_tensor(x)
+    return math_ops.reduce_sum(
+        -(x / self.scale) ** self.concentration +
+        (self.concentration - 1) * math_ops.log(x) +
+        math_ops.log(self.concentration) +
+        -self.concentration * math_ops.log(self.scale),
+        axis=event_dims)
+
+  def _maybe_assert_valid_x(self, x):
+    if not self.validate_args:
+      return x
+    is_valid = check_ops.assert_non_negative(
+        x,
+        message="Forward transformation input must be at least {}.".format(0))
+    return control_flow_ops.with_dependencies([is_valid], x)
+
+  def _maybe_assert_valid_y(self, y):
+    if not self.validate_args:
+      return y
+    is_positive = check_ops.assert_non_negative(
+        y, message="Inverse transformation input must be greater than 0.")
+    less_than_one = check_ops.assert_less_equal(
+        y, constant_op.constant(1., y.dtype),
+        message="Inverse transformation input must be less than or equal to 1.")
+    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py b/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py
deleted file mode 100644
index 00520bcda85e9527767e6342bf75f10667c264a8..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distributions/python/ops/bijectors/weibull_impl.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Weibull bijector."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import bijector
-
-
-__all__ = [
-    "Weibull",
-]
-
-
-class Weibull(bijector.Bijector):
-  """Compute `Y = g(X) = 1 - exp((-X / scale) ** concentration), X >= 0`.
-
-  This bijector maps inputs from `[0, inf]` to [0, 1]`. The inverse of the
-  bijector applied to a uniform random variable `X ~ U(0, 1) gives back a
-  random variable with the
-  [Weibull distribution](https://en.wikipedia.org/wiki/Weibull_distribution):
-
-  ```none
-  Y ~ Weibull(scale, concentration)
-  pdf(y; scale, concentration, y >= 0) = (scale / concentration) * (
-    scale / concentration) ** (concentration - 1) * exp(
-      -(y / scale) ** concentration)
-  ```
-  """
-
-  def __init__(self,
-               scale=1.,
-               concentration=1.,
-               event_ndims=0,
-               validate_args=False,
-               name="weibull"):
-    """Instantiates the `Weibull` bijector.
-
-    Args:
-      scale: Positive Float-type `Tensor` that is the same dtype and is
-        broadcastable with `concentration`.
-        This is `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      concentration: Positive Float-type `Tensor` that is the same dtype and is
-        broadcastable with `scale`.
-        This is `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`.
-      event_ndims: Python scalar indicating the number of dimensions associated
-        with a particular draw from the distribution.
-      validate_args: Python `bool` indicating whether arguments should be
-        checked for correctness.
-      name: Python `str` name given to ops managed by this object.
-    """
-    self._graph_parents = []
-    self._name = name
-    self._validate_args = validate_args
-    with self._name_scope("init", values=[scale, concentration]):
-      self._scale = ops.convert_to_tensor(scale, name="scale")
-      self._concentration = ops.convert_to_tensor(
-          concentration, name="concentration")
-      check_ops.assert_same_float_dtype([self._scale, self._concentration])
-      if validate_args:
-        self._scale = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._scale,
-                message="Argument scale was not positive")
-        ], self._scale)
-        self._concentration = control_flow_ops.with_dependencies([
-            check_ops.assert_positive(
-                self._concentration,
-                message="Argument concentration was not positive")
-        ], self._concentration)
-
-    super(Weibull, self).__init__(
-        event_ndims=event_ndims,
-        validate_args=validate_args,
-        name=name)
-
-  @property
-  def scale(self):
-    """The `l` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
-    return self._scale
-
-  @property
-  def concentration(self):
-    """The `k` in `Y = g(X) = 1 - exp((-x / l) ** k)`."""
-    return self._concentration
-
-  def _forward(self, x):
-    x = self._maybe_assert_valid_x(x)
-    return -math_ops.expm1(-((x / self.scale) ** self.concentration))
-
-  def _inverse(self, y):
-    y = self._maybe_assert_valid_y(y)
-    return self.scale * (-math_ops.log1p(-y)) ** (1 / self.concentration)
-
-  def _inverse_log_det_jacobian(self, y):
-    y = self._maybe_assert_valid_y(y)
-    event_dims = self._event_dims_tensor(y)
-    return math_ops.reduce_sum(
-        -math_ops.log1p(-y) +
-        (1 / self.concentration - 1) * math_ops.log(-math_ops.log1p(-y)) +
-        math_ops.log(self.scale / self.concentration),
-        axis=event_dims)
-
-  def _forward_log_det_jacobian(self, x):
-    x = self._maybe_assert_valid_x(x)
-    event_dims = self._event_dims_tensor(x)
-    return math_ops.reduce_sum(
-        -(x / self.scale) ** self.concentration +
-        (self.concentration - 1) * math_ops.log(x) +
-        math_ops.log(self.concentration) +
-        -self.concentration * math_ops.log(self.scale),
-        axis=event_dims)
-
-  def _maybe_assert_valid_x(self, x):
-    if not self.validate_args:
-      return x
-    is_valid = check_ops.assert_non_negative(
-        x,
-        message="Forward transformation input must be at least {}.".format(0))
-    return control_flow_ops.with_dependencies([is_valid], x)
-
-  def _maybe_assert_valid_y(self, y):
-    if not self.validate_args:
-      return y
-    is_positive = check_ops.assert_non_negative(
-        y, message="Inverse transformation input must be greater than 0.")
-    less_than_one = check_ops.assert_less_equal(
-        y, constant_op.constant(1., y.dtype),
-        message="Inverse transformation input must be less than or equal to 1.")
-    return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5d724a2a945ed8f9c159d8314327c6f994d1db
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/cauchy.py
@@ -0,0 +1,221 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Cauchy distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+
+__all__ = [
+    "Cauchy",
+]
+
+
+class Cauchy(distribution.Distribution):
+  """The Cauchy distribution with location `loc` and scale `scale`.
+
+  #### Mathematical details
+
+  The probability density function (pdf) is,
+
+  ```none
+  pdf(x; loc, scale) = 1 / (pi scale (1 + z**2))
+  z = (x - loc) / scale
+  ```
+  where `loc` is the location, and `scale` is the scale.
+
+  The Cauchy distribution is a member of the [location-scale family](
+  https://en.wikipedia.org/wiki/Location-scale_family), i.e.
+  `Y ~ Cauchy(loc, scale)` is equivalent to,
+
+  ```none
+  X ~ Cauchy(loc=0, scale=1)
+  Y = loc + scale * X
+  ```
+
+  #### Examples
+
+  Examples of initialization of one or a batch of distributions.
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  # Define a single scalar Cauchy distribution.
+  dist = tfd.Cauchy(loc=0., scale=3.)
+
+  # Evaluate the cdf at 1, returning a scalar.
+  dist.cdf(1.)
+
+  # Define a batch of two scalar valued Cauchy distributions.
+  dist = tfd.Cauchy(loc=[1, 2.], scale=[11, 22.])
+
+  # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
+  # returning a length two tensor.
+  dist.prob([0, 1.5])
+
+  # Get 3 samples, returning a 3 x 2 tensor.
+  dist.sample([3])
+
+  # Arguments are broadcast when possible.
+  # Define a batch of two scalar valued Cauchy distributions.
+  # Both have median 1, but different scales.
+  dist = tfd.Cauchy(loc=1., scale=[11, 22.])
+
+  # Evaluate the pdf of both distributions on the same point, 3.0,
+  # returning a length 2 tensor.
+  dist.prob(3.)
+  ```
+
+  """
+
+  def __init__(self,
+               loc,
+               scale,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="Cauchy"):
+    """Construct Cauchy distributions.
+
+    The parameters `loc` and `scale` must be shaped in a way that supports
+    broadcasting (e.g. `loc + scale` is a valid operation).
+
+    Args:
+      loc: Floating point tensor; the modes of the distribution(s).
+      scale: Floating point tensor; the locations of the distribution(s).
+        Must contain only positive values.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+
+    Raises:
+      TypeError: if `loc` and `scale` have different `dtype`.
+    """
+    parameters = locals()
+    with ops.name_scope(name, values=[loc, scale]):
+      with ops.control_dependencies([check_ops.assert_positive(scale)]
+                                    if validate_args else []):
+        self._loc = array_ops.identity(loc, name="loc")
+        self._scale = array_ops.identity(scale, name="scale")
+        check_ops.assert_same_float_dtype([self._loc, self._scale])
+    super(Cauchy, self).__init__(
+        dtype=self._scale.dtype,
+        reparameterization_type=distribution.FULLY_REPARAMETERIZED,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        parameters=parameters,
+        graph_parents=[self._loc, self._scale],
+        name=name)
+
+  @staticmethod
+  def _param_shapes(sample_shape):
+    return dict(
+        zip(("loc", "scale"),
+            ([ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)] * 2)))
+
+  @property
+  def loc(self):
+    """Distribution parameter for the mean."""
+    return self._loc
+
+  @property
+  def scale(self):
+    """Distribution parameter for standard deviation."""
+    return self._scale
+
+  def _batch_shape_tensor(self):
+    return array_ops.broadcast_dynamic_shape(
+        array_ops.shape(self.loc), array_ops.shape(self.scale))
+
+  def _batch_shape(self):
+    return array_ops.broadcast_static_shape(self.loc.shape, self.scale.shape)
+
+  def _event_shape_tensor(self):
+    return constant_op.constant([], dtype=dtypes.int32)
+
+  def _event_shape(self):
+    return tensor_shape.scalar()
+
+  def _sample_n(self, n, seed=None):
+    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+    probs = random_ops.random_uniform(
+        shape=shape, minval=0., maxval=1., dtype=self.dtype, seed=seed)
+    return self._quantile(probs)
+
+  def _log_prob(self, x):
+    return self._log_unnormalized_prob(x) - self._log_normalization()
+
+  def _cdf(self, x):
+    return math_ops.atan(self._z(x)) / np.pi + 0.5
+
+  def _log_cdf(self, x):
+    return math_ops.log1p(2 / np.pi * math_ops.atan(self._z(x))) - np.log(2)
+
+  def _log_unnormalized_prob(self, x):
+    return -math_ops.log1p(math_ops.square(self._z(x)))
+
+  def _log_normalization(self):
+    return np.log(np.pi) + math_ops.log(self.scale)
+
+  def _entropy(self):
+    h = np.log(4 * np.pi) + math_ops.log(self.scale)
+    return h * array_ops.ones_like(self.loc)
+
+  def _quantile(self, p):
+    return self.loc + self.scale * math_ops.tan(np.pi * (p - 0.5))
+
+  def _mode(self):
+    return self.loc * array_ops.ones_like(self.scale)
+
+  def _z(self, x):
+    """Standardize input `x`."""
+    with ops.name_scope("standardize", values=[x]):
+      return (x - self.loc) / self.scale
+
+  def _inv_z(self, z):
+    """Reconstruct input `x` from a its normalized version."""
+    with ops.name_scope("reconstruct", values=[z]):
+      return z * self.scale + self.loc
+
+  def _mean(self):
+    if self.allow_nan_stats:
+      return array_ops.fill(self.batch_shape_tensor(),
+                            self.dtype.as_numpy_dtype(np.nan))
+    else:
+      raise ValueError("`mean` is undefined for Cauchy distribution.")
+
+  def _stddev(self):
+    if self.allow_nan_stats:
+      return array_ops.fill(self.batch_shape_tensor(),
+                            self.dtype.as_numpy_dtype(np.nan))
+    else:
+      raise ValueError("`stddev` is undefined for Cauchy distribution.")
diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
index 599c855cda434d9249187d5d154d50a8a8c49a6c..1d4c5660d8d73b7b6a7e758fc834ccfddeb5c8ea 100644
--- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py
@@ -121,7 +121,7 @@ class ConditionalTransformedDistribution(
     log_prob = self.distribution.log_prob(x, **distribution_kwargs)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
-    return ildj + log_prob
+    return math_ops.cast(ildj, log_prob.dtype) + log_prob
 
   @distribution_util.AppendDocstring(kwargs_dict=_condition_kwargs_dict)
   def _prob(self, y, bijector_kwargs=None, distribution_kwargs=None):
@@ -143,7 +143,7 @@ class ConditionalTransformedDistribution(
     prob = self.distribution.prob(x, **distribution_kwargs)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
-    return math_ops.exp(ildj) * prob
+    return math_ops.exp(math_ops.cast(ildj, prob.dtype)) * prob
 
   @distribution_util.AppendDocstring(kwargs_dict=_condition_kwargs_dict)
   def _log_cdf(self, y, bijector_kwargs=None, distribution_kwargs=None):
diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py
index 850d08d1bd69ebc7661557d648e2bffe77e6a908..8049522e9f5dc26b244b7e710a9ae8b981efd6b6 100644
--- a/tensorflow/contrib/distributions/python/ops/deterministic.py
+++ b/tensorflow/contrib/distributions/python/ops/deterministic.py
@@ -290,8 +290,10 @@ class VectorDeterministic(_BaseDeterministic):
   #### Examples
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Initialize a single VectorDeterministic supported at [0., 2.] in R^2.
-  constant = tf.contrib.distributions.Deterministic([0., 2.])
+  constant = tfd.Deterministic([0., 2.])
   constant.prob([0., 2.])
   ==> 1.
   constant.prob([0., 3.])
@@ -299,7 +301,7 @@ class VectorDeterministic(_BaseDeterministic):
 
   # Initialize a [3] batch of constants on R^2.
   loc = [[0., 1.], [2., 3.], [4., 5.]]
-  constant = constant_lib.VectorDeterministic(loc)
+  constant = tfd.VectorDeterministic(loc)
   constant.prob([[0., 1.], [1.9, 3.], [3.99, 5.]])
   ==> [1., 0., 0.]
   ```
diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py
index ba8d3c639b397422f0f6210ba9f48650f0da1e3e..d0efaefb8e78ddf4436e9e5a112d2c1cdddaf3b5 100644
--- a/tensorflow/contrib/distributions/python/ops/gumbel.py
+++ b/tensorflow/contrib/distributions/python/ops/gumbel.py
@@ -62,15 +62,17 @@ class _Gumbel(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Gumbel distribution.
-  dist = tf.contrib.distributions.Gumbel(loc=0., scale=3.)
+  dist = tfd.Gumbel(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Gumbels.
   # The first has mean 1 and scale 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Gumbel(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Gumbel(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -85,7 +87,7 @@ class _Gumbel(distribution.Distribution):
   ```python
   # Define a batch of two scalar valued Logistics.
   # Both have mean 1, but different scales.
-  dist = tf.contrib.distributions.Gumbel(loc=1., scale=[11, 22.])
+  dist = tfd.Gumbel(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc0751a6e0b78cb3d79bd3478e740bb05cd26428
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/half_normal.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The Half Normal distribution class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import special_math
+
+
+__all__ = [
+    "HalfNormal",
+]
+
+
+class HalfNormal(distribution.Distribution):
+  """The Half Normal distribution with scale `scale`.
+
+  #### Mathematical details
+
+  The half normal is a transformation of a centered normal distribution.
+  If some random variable `X` has normal distribution,
+  ```none
+  X ~ Normal(0.0, scale)
+  Y = |X|
+  ```
+  Then `Y` will have half normal distribution. The probability density
+  function (pdf) is:
+
+  ```none
+  pdf(x; scale, x > 0) = sqrt(2) / (scale * sqrt(pi)) *
+    exp(- 1/2 * (x / scale) ** 2)
+  )
+  ```
+  Where `scale = sigma` is the standard deviation of the underlying normal
+  distribution.
+
+  #### Examples
+
+  Examples of initialization of one or a batch of distributions.
+
+  ```python
+  # Define a single scalar HalfNormal distribution.
+  dist = tf.contrib.distributions.HalfNormal(scale=3.0)
+
+  # Evaluate the cdf at 1, returning a scalar.
+  dist.cdf(1.)
+
+  # Define a batch of two scalar valued HalfNormals.
+  # The first has scale 11.0, the second 22.0
+  dist = tf.contrib.distributions.HalfNormal(scale=[11.0, 22.0])
+
+  # Evaluate the pdf of the first distribution on 1.0, and the second on 1.5,
+  # returning a length two tensor.
+  dist.prob([1.0, 1.5])
+
+  # Get 3 samples, returning a 3 x 2 tensor.
+  dist.sample([3])
+  ```
+
+  """
+
+  def __init__(self,
+               scale,
+               validate_args=False,
+               allow_nan_stats=True,
+               name="HalfNormal"):
+    """Construct HalfNormals with scale `scale`.
+
+    Args:
+      scale: Floating point tensor; the scales of the distribution(s).
+        Must contain only positive values.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`,
+        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+        indicate the result is undefined. When `False`, an exception is raised
+        if one or more of the statistic's batch members are undefined.
+      name: Python `str` name prefixed to Ops created by this class.
+    """
+    parameters = locals()
+    with ops.name_scope(name, values=[scale]):
+      with ops.control_dependencies([check_ops.assert_positive(scale)] if
+                                    validate_args else []):
+        self._scale = array_ops.identity(scale, name="scale")
+    super(HalfNormal, self).__init__(
+        dtype=self._scale.dtype,
+        reparameterization_type=distribution.FULLY_REPARAMETERIZED,
+        validate_args=validate_args,
+        allow_nan_stats=allow_nan_stats,
+        parameters=parameters,
+        graph_parents=[self._scale],
+        name=name)
+
+  @staticmethod
+  def _param_shapes(sample_shape):
+    return {"scale": ops.convert_to_tensor(sample_shape, dtype=dtypes.int32)}
+
+  @property
+  def scale(self):
+    """Distribution parameter for the scale."""
+    return self._scale
+
+  def _batch_shape_tensor(self):
+    return array_ops.shape(self.scale)
+
+  def _batch_shape(self):
+    return self.scale.shape
+
+  def _event_shape_tensor(self):
+    return constant_op.constant([], dtype=dtypes.int32)
+
+  def _event_shape(self):
+    return tensor_shape.scalar()
+
+  def _sample_n(self, n, seed=None):
+    shape = array_ops.concat([[n], self.batch_shape_tensor()], 0)
+    sampled = random_ops.random_normal(
+        shape=shape, mean=0., stddev=1., dtype=self.dtype, seed=seed)
+    return math_ops.abs(sampled * self.scale)
+
+  def _prob(self, x):
+    coeff = np.sqrt(2) / self.scale / np.sqrt(np.pi)
+    pdf = coeff * math_ops.exp(- 0.5 * (x / self.scale) ** 2)
+    return pdf * math_ops.cast(x >= 0, self.dtype)
+
+  def _cdf(self, x):
+    truncated_x = nn.relu(x)
+    return math_ops.erf(truncated_x / self.scale / np.sqrt(2.0))
+
+  def _entropy(self):
+    return 0.5 * math_ops.log(np.pi * self.scale ** 2.0 / 2.0) + 0.5
+
+  def _mean(self):
+    return self.scale * np.sqrt(2.0) / np.sqrt(np.pi)
+
+  def _quantile(self, p):
+    return np.sqrt(2.0) * self.scale * special_math.erfinv(p)
+
+  def _mode(self):
+    return array_ops.zeros(self.batch_shape_tensor())
+
+  def _variance(self):
+    return self.scale ** 2.0 * (1.0 - 2.0 / np.pi)
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index 6a74ca9a0ae1ad30081d21cc15a65be052a99e2a..cbce005013281ff3c58c94d525d5ce7a865d725a 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -68,11 +68,11 @@ class Independent(distribution_lib.Distribution):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Make independent distribution from a 2-batch Normal.
-  ind = ds.Independent(
-      distribution=ds.Normal(loc=[-1., 1], scale=[0.1, 0.5]),
+  ind = tfd.Independent(
+      distribution=tfd.Normal(loc=[-1., 1], scale=[0.1, 0.5]),
       reinterpreted_batch_ndims=1)
 
   # All batch dims have been "absorbed" into event dims.
@@ -80,8 +80,8 @@ class Independent(distribution_lib.Distribution):
   ind.event_shape  # ==> [2]
 
   # Make independent distribution from a 2-batch bivariate Normal.
-  ind = ds.Independent(
-      distribution=ds.MultivariateNormalDiag(
+  ind = tfd.Independent(
+      distribution=tfd.MultivariateNormalDiag(
           loc=[[-1., 1], [1, -1]],
           scale_identity_multiplier=[1., 0.5]),
       reinterpreted_batch_ndims=1)
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 956dee38a378813434656a28a69c89b6ec1e8b72..ee4d86867d48b20e97757bcec57d452085814b80 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -88,8 +88,9 @@ class InverseGamma(distribution.Distribution):
   #### Examples
 
   ```python
-  dist = InverseGamma(concentration=3.0, rate=2.0)
-  dist2 = InverseGamma(concentration=[3.0, 4.0], rate=[2.0, 3.0])
+  tfd = tf.contrib.distributions
+  dist = tfd.InverseGamma(concentration=3.0, rate=2.0)
+  dist2 = tfd.InverseGamma(concentration=[3.0, 4.0], rate=[2.0, 3.0])
   ```
 
   """
diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py
index 48794a48828fe796e233e968d8c755136ce166ad..473677f8d91b184e029f345bb05f5c5d63df7a40 100644
--- a/tensorflow/contrib/distributions/python/ops/logistic.py
+++ b/tensorflow/contrib/distributions/python/ops/logistic.py
@@ -60,15 +60,17 @@ class Logistic(distribution.Distribution):
   Examples of initialization of one or a batch of distributions.
 
   ```python
+  tfd = tf.contrib.distributions
+
   # Define a single scalar Logistic distribution.
-  dist = tf.contrib.distributions.Logistic(loc=0., scale=3.)
+  dist = tfd.Logistic(loc=0., scale=3.)
 
   # Evaluate the cdf at 1, returning a scalar.
   dist.cdf(1.)
 
   # Define a batch of two scalar valued Logistics.
   # The first has mean 1 and scale 11, the second 2 and 22.
-  dist = tf.contrib.distributions.Logistic(loc=[1, 2.], scale=[11, 22.])
+  dist = tfd.Logistic(loc=[1, 2.], scale=[11, 22.])
 
   # Evaluate the pdf of the first distribution on 0, and the second on 1.5,
   # returning a length two tensor.
@@ -76,14 +78,11 @@ class Logistic(distribution.Distribution):
 
   # Get 3 samples, returning a 3 x 2 tensor.
   dist.sample([3])
-  ```
 
-  Arguments are broadcast when possible.
-
-  ```python
+  # Arguments are broadcast when possible.
   # Define a batch of two scalar valued Logistics.
   # Both have mean 1, but different scales.
-  dist = tf.contrib.distributions.Logistic(loc=1., scale=[11, 22.])
+  dist = tfd.Logistic(loc=1., scale=[11, 22.])
 
   # Evaluate the pdf of both distributions on the same point, 3.0,
   # returning a length 2 tensor.
diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py
index e676931d9145e72907d990148ee2d180e0da0258..f2d492f5489a197157558ae727416b51db04793e 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture.py
@@ -49,13 +49,13 @@ class Mixture(distribution.Distribution):
 
   ```python
   # Create a mixture of two Gaussians:
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
   mix = 0.3
-  bimix_gauss = ds.Mixture(
-    cat=ds.Categorical(probs=[mix, 1.-mix]),
+  bimix_gauss = tfd.Mixture(
+    cat=tfd.Categorical(probs=[mix, 1.-mix]),
     components=[
-      ds.Normal(loc=-1., scale=0.1),
-      ds.Normal(loc=+1., scale=0.5),
+      tfd.Normal(loc=-1., scale=0.1),
+      tfd.Normal(loc=+1., scale=0.5),
   ])
 
   # Plot the PDF.
diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
index 5558ef0f255db684b229d129666634e50c625887..0ca236c3761f9d3a0fcc79ff9db792319108db0d 100644
--- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
+++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py
@@ -43,15 +43,14 @@ class MixtureSameFamily(distribution.Distribution):
   #### Examples
 
   ```python
-  import matplotlib.pyplot as plt
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   ### Create a mixture of two scalar Gaussians:
 
-  gm = ds.MixtureSameFamily(
-      mixture_distribution=ds.Categorical(
+  gm = tfd.MixtureSameFamily(
+      mixture_distribution=tfd.Categorical(
           probs=[0.3, 0.7]),
-      components_distribution=ds.Normal(
+      components_distribution=tfd.Normal(
         loc=[-1., 1],       # One for each component.
         scale=[0.1, 0.5]))  # And same here.
 
@@ -63,14 +62,15 @@ class MixtureSameFamily(distribution.Distribution):
 
   # Plot PDF.
   x = np.linspace(-2., 3., int(1e4), dtype=np.float32)
+  import matplotlib.pyplot as plt
   plt.plot(x, gm.prob(x).eval());
 
   ### Create a mixture of two Bivariate Gaussians:
 
-  gm = ds.MixtureSameFamily(
-      mixture_distribution=ds.Categorical(
+  gm = tfd.MixtureSameFamily(
+      mixture_distribution=tfd.Categorical(
           probs=[0.3, 0.7]),
-      components_distribution=ds.MultivariateNormalDiag(
+      components_distribution=tfd.MultivariateNormalDiag(
           loc=[[-1., 1],  # component 1
                [1, -1]],  # component 2
           scale_identity_multiplier=[.3, .6]))
@@ -320,13 +320,14 @@ class MixtureSameFamily(distribution.Distribution):
         return array_ops.shape(d.batch_shape_tensor())[0]
       dist_batch_ndims = _get_ndims(self)
       cat_batch_ndims = _get_ndims(self.mixture_distribution)
-      bnd = distribution_util.pick_vector(
+      pad_ndims = array_ops.where(
           self.mixture_distribution.is_scalar_batch(),
-          [dist_batch_ndims], [cat_batch_ndims])[0]
+          dist_batch_ndims,
+          dist_batch_ndims - cat_batch_ndims)
       s = array_ops.shape(x)
       x = array_ops.reshape(x, shape=array_ops.concat([
           s[:-1],
-          array_ops.ones([bnd], dtype=dtypes.int32),
+          array_ops.ones([pad_ndims], dtype=dtypes.int32),
           s[-1:],
           array_ops.ones([self._event_ndims], dtype=dtypes.int32),
       ], axis=0))
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
index 163cf75d990d5fe7ec1e3aaf0040fc71f61774a7..e862552880f4073c8fa8e90134d0633e7484b0bf 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py
@@ -84,10 +84,10 @@ class MultivariateNormalDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate Gaussian.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[1., -1],
       scale_diag=[1, 2.])
 
@@ -101,7 +101,7 @@ class MultivariateNormalDiag(
   mvn.prob([-1., 0]).eval()  # shape: []
 
   # Initialize a 3-batch, 2-variate scaled-identity Gaussian.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[1., -1],
       scale_identity_multiplier=[1, 2., 3])
 
@@ -119,7 +119,7 @@ class MultivariateNormalDiag(
   mvn.prob([-1., 0]).eval()  # shape: [3]
 
   # Initialize a 2-batch of 3-variate Gaussians.
-  mvn = ds.MultivariateNormalDiag(
+  mvn = tfd.MultivariateNormalDiag(
       loc=[[1., 2, 3],
            [11, 22, 33]]           # shape: [2, 3]
       scale_diag=[[1., 2, 3],
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
index 040bc230722194316b8a74627344e315a2578281..413e88f03ae0286c294f3404549a73e1a47dcff7 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py
@@ -86,7 +86,7 @@ class MultivariateNormalDiagPlusLowRank(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian with covariance `cov = S @ S.T`,
   # `S = diag(d) + U @ diag(m) @ U.T`. The perturbation, `U @ diag(m) @ U.T`, is
@@ -97,7 +97,7 @@ class MultivariateNormalDiagPlusLowRank(
        [-1, 1],
        [2, -0.5]]        # shape: [3, 2]
   m = [4., 5]            # shape: [2]
-  mvn = ds.MultivariateNormalDiagPlusLowRank(
+  mvn = tfd.MultivariateNormalDiagPlusLowRank(
       loc=mu
       scale_diag=d
       scale_perturb_factor=U,
@@ -118,7 +118,7 @@ class MultivariateNormalDiagPlusLowRank(
   m = [[0.1, 0.2],
        [0.4, 0.5]]         # shape: [b, r] = [2, 2]
 
-  mvn = ds.MultivariateNormalDiagPlusLowRank(
+  mvn = tfd.MultivariateNormalDiagPlusLowRank(
       loc=mu,
       scale_perturb_factor=U,
       scale_perturb_diag=m)
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
index f9952b2069d6dfd2593e6bd71ede0badf44cdf98..8e69dadfb42e8d885b3af552b1f093b2857a6aa3 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py
@@ -73,14 +73,14 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
   cov = [[ 0.36,  0.12,  0.06],
          [ 0.12,  0.29, -0.13],
          [ 0.06, -0.13,  0.26]]
-  mvn = ds.MultivariateNormalFullCovariance(
+  mvn = tfd.MultivariateNormalFullCovariance(
       loc=mu,
       covariance_matrix=cov)
 
@@ -100,7 +100,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL):
   mu = [[1., 2, 3],
         [11, 22, 33]]              # shape: [2, 3]
   covariance_matrix = ...  # shape: [2, 3, 3], symmetric, positive definite.
-  mvn = ds.MultivariateNormalFullCovariance(
+  mvn = tfd.MultivariateNormalFullCovariance(
       loc=mu,
       covariance=covariance_matrix)
 
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
index 300bdd5f6064a1cc9c336689ac4fae04338edb30..a7399792892f4c179c05168184d76ec95c168b51 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py
@@ -90,8 +90,7 @@ class MultivariateNormalLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
@@ -103,9 +102,9 @@ class MultivariateNormalLinearOperator(
   #      [ 0.2,  0.5,  0. ],
   #      [ 0.1, -0.3,  0.4]])
 
-  mvn = ds.MultivariateNormalLinearOperator(
+  mvn = tfd.MultivariateNormalLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorLowerTriangular(scale))
+      scale=tf.linalg.LinearOperatorLowerTriangular(scale))
 
   # Covariance agrees with cholesky(cov) parameterization.
   mvn.covariance().eval()
@@ -122,9 +121,9 @@ class MultivariateNormalLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  mvn = ds.MultivariateNormalLinearOperator(
+  mvn = tfd.MultivariateNormalLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[-0.9, 0, 0.1],
diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
index 260dcc18f513d5440d3d39368539274c03faa72a..6c7dc4ca7aaf5b3a20b072e9360d15528ad10556 100644
--- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py
+++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py
@@ -76,12 +76,13 @@ class MultivariateNormalTriL(
   ```
 
   Trainable (batch) lower-triangular matrices can be created with
-  `ds.matrix_diag_transform()` and/or `ds.fill_triangular()`
+  `tf.contrib.distributions.matrix_diag_transform()` and/or
+  `tf.contrib.distributions.fill_triangular()`
 
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate Gaussian.
   mu = [1., 2, 3]
@@ -92,7 +93,7 @@ class MultivariateNormalTriL(
   # ==> [[ 0.6,  0. ,  0. ],
   #      [ 0.2,  0.5,  0. ],
   #      [ 0.1, -0.3,  0.4]])
-  mvn = ds.MultivariateNormalTriL(
+  mvn = tfd.MultivariateNormalTriL(
       loc=mu,
       scale_tril=scale)
 
@@ -112,7 +113,7 @@ class MultivariateNormalTriL(
   mu = [[1., 2, 3],
         [11, 22, 33]]              # shape: [2, 3]
   tril = ...  # shape: [2, 3, 3], lower triangular, non-zero diagonal.
-  mvn = ds.MultivariateNormalTriL(
+  mvn = tfd.MultivariateNormalTriL(
       loc=mu,
       scale_tril=tril)
 
@@ -124,9 +125,9 @@ class MultivariateNormalTriL(
   # Instantiate a "learnable" MVN.
   dims = 4
   with tf.variable_scope("model"):
-    mvn = ds.MultivariateNormalTriL(
+    mvn = tfd.MultivariateNormalTriL(
         loc=tf.get_variable(shape=[dims], dtype=tf.float32, name="mu"),
-        scale_tril=ds.fill_triangular(
+        scale_tril=tfd.fill_triangular(
             tf.get_variable(shape=[dims * (dims + 1) / 2],
                             dtype=tf.float32, name="chol_Sigma")))
   ```
diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
index 8a95038a3c8eccf8a75fea79d0a62f9883b4f13a..2701c36fb53b1ae3fd736be3b1288e3dd40c739a 100644
--- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
+++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py
@@ -107,10 +107,11 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
+
   # Create two batches of PoissonLogNormalQuadratureCompounds, one with
   # prior `loc = 0.` and another with `loc = 1.` In both cases `scale = 1.`
-  pln = ds.PoissonLogNormalQuadratureCompound(
+  pln = tfd.PoissonLogNormalQuadratureCompound(
       loc=[0., -0.5],
       scale=1.,
       quadrature_grid_and_probs=(
@@ -292,7 +293,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution):
     # where,
     #
     # Z|v ~ interpolate_affine[v](distribution)
-    # V ~ mixture_distrubution
+    # V ~ mixture_distribution
     #
     # thus,
     #
diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
index b05f15771a3a94779ffddea8f16ad2fa4ea2fdd1..c4b8f055b7fbc3f0835b503eddd7617610326d8c 100644
--- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
+++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py
@@ -115,7 +115,7 @@ class SinhArcsinh(transformed_distribution.TransformedDistribution):
       tailweight:  Tailweight parameter. Default is `1.0` (unchanged tailweight)
       distribution: `tf.Distribution`-like instance. Distribution that is
         transformed to produce this distribution.
-        Default is `ds.Normal(0., 1.)`.
+        Default is `tf.distributions.Normal(0., 1.)`.
         Must be a scalar-batch, scalar-event distribution.  Typically
         `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
         a function of non-trainable parameters. WARNING: If you backprop through
diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
index 92043d6a08833888c36009261addca0d14949ea8..904724af429f3cb5835f6e05abcb574467ef6918 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py
@@ -188,8 +188,7 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Create two batches of VectorDiffeomixtures, one with mix_loc=[0.] and
   # another with mix_loc=[1]. In both cases, `K=2` and the affine
@@ -197,20 +196,20 @@ class VectorDiffeomixture(distribution_lib.Distribution):
   # k=0: loc=zeros(dims)  scale=LinearOperatorScaledIdentity
   # k=1: loc=[2.]*dims    scale=LinOpDiag
   dims = 5
-  vdm = ds.VectorDiffeomixture(
+  vdm = tfd.VectorDiffeomixture(
       mix_loc=[[0.], [1]],
       mix_scale=[1.],
-      distribution=ds.Normal(loc=0., scale=1.),
+      distribution=tfd.Normal(loc=0., scale=1.),
       loc=[
           None,  # Equivalent to `np.zeros(dims, dtype=np.float32)`.
           np.float32([2.]*dims),
       ],
       scale=[
-          la.LinearOperatorScaledIdentity(
+          tf.linalg.LinearOperatorScaledIdentity(
             num_rows=dims,
             multiplier=np.float32(1.1),
             is_positive_definite=True),
-          la.LinearOperatorDiag(
+          tf.linalg.LinearOperatorDiag(
             diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
             is_positive_definite=True),
       ],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
index 356d78b67a8107750f68f7f84d73d1231f5b2b03..526fe2d39aef9aed833b889de80e849c469435e7 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py
@@ -89,14 +89,13 @@ class VectorExponentialDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorExponential, supported on
   # {(x, y) in R^2 : x > 0, y > 0}.
 
   # The first component has pdf exp{-x}, the second 0.5 exp{-x / 2}
-  vex = ds.VectorExponentialDiag(scale_diag=[1., 2.])
+  vex = tfd.VectorExponentialDiag(scale_diag=[1., 2.])
 
   # Compute the pdf of an`R^2` observation; return a scalar.
   vex.prob([3., 4.]).eval()  # shape: []
@@ -107,7 +106,7 @@ class VectorExponentialDiag(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vex = ds.VectorExponentialDiag(loc, scale_diag)
+  vex = tfd.VectorExponentialDiag(loc, scale_diag)
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[1.9, 2.2, 3.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
index b313a851b381e5b3a057fd17e6c2ef4eb0fc34f1..9d5fd9ac4178a1ae29b1ce32f304b22fd3d234dc 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py
@@ -107,16 +107,15 @@ class VectorExponentialLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorExponential, supported on
   # {(x, y) in R^2 : x > 0, y > 0}.
   mat = [[1.0, 0.1],
          [0.1, 1.0]]
 
-  vex = ds.VectorExponentialLinearOperator(
-      scale=la.LinearOperatorFullMatrix(mat))
+  vex = tfd.VectorExponentialLinearOperator(
+      scale=tf.linalg.LinearOperatorFullMatrix(mat))
 
   # Compute the pdf of an`R^2` observation; return a scalar.
   vex.prob([1., 2.]).eval()  # shape: []
@@ -127,9 +126,9 @@ class VectorExponentialLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vex = ds.VectorExponentialLinearOperator(
+  vex = tfd.VectorExponentialLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[1.9, 2.2, 3.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
index 0e3867809a820f49cfa7f5282c47f786626481a6..8dd983b750d9b39775e570800006011f4968f7f3 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py
@@ -101,10 +101,10 @@ class VectorLaplaceDiag(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 2-variate VectorLaplace.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[1., -1],
       scale_diag=[1, 2.])
 
@@ -118,7 +118,7 @@ class VectorLaplaceDiag(
   vla.prob([-1., 0]).eval()  # shape: []
 
   # Initialize a 3-batch, 2-variate scaled-identity VectorLaplace.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[1., -1],
       scale_identity_multiplier=[1, 2., 3])
 
@@ -136,7 +136,7 @@ class VectorLaplaceDiag(
   vla.prob([-1., 0]).eval()  # shape: [3]
 
   # Initialize a 2-batch of 3-variate VectorLaplace's.
-  vla = ds.VectorLaplaceDiag(
+  vla = tfd.VectorLaplaceDiag(
       loc=[[1., 2, 3],
            [11, 22, 33]]           # shape: [2, 3]
       scale_diag=[[1., 2, 3],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
index c7abdbb4caf9bee4cbd5991eb5d652f20dd0f8d1..ec485c95c15da2794b67d2699d2bdd9db97bb6c4 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py
@@ -109,8 +109,7 @@ class VectorLaplaceLinearOperator(
   #### Examples
 
   ```python
-  ds = tf.contrib.distributions
-  la = tf.linalg
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate VectorLaplace with some desired covariance.
   mu = [1., 2, 3]
@@ -124,9 +123,9 @@ class VectorLaplaceLinearOperator(
   #      [ 0.1, -0.3,  0.4]])
 
   # Divide scale by sqrt(2) so that the final covariance will be what we want.
-  vla = ds.VectorLaplaceLinearOperator(
+  vla = tfd.VectorLaplaceLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorLowerTriangular(scale / tf.sqrt(2)))
+      scale=tf.linalg.LinearOperatorLowerTriangular(scale / tf.sqrt(2.)))
 
   # Covariance agrees with cholesky(cov) parameterization.
   vla.covariance().eval()
@@ -143,9 +142,9 @@ class VectorLaplaceLinearOperator(
   scale_diag = [[1., 2, 3],
                 [0.5, 1, 1.5]]     # shape: [2, 3]
 
-  vla = ds.VectorLaplaceLinearOperator(
+  vla = tfd.VectorLaplaceLinearOperator(
       loc=mu,
-      scale=la.LinearOperatorDiag(scale_diag))
+      scale=tf.linalg.LinearOperatorDiag(scale_diag))
 
   # Compute the pdf of two `R^3` observations; return a length-2 vector.
   x = [[-0.9, 0, 0.1],
diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
index 544a8710709a0afb56c6ae6f36d35de892e8e420..e1ccf116457a97261b9ce3965552764771d3bdd2 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py
@@ -143,7 +143,7 @@ class VectorSinhArcsinhDiag(transformed_distribution.TransformedDistribution):
         broadcastable with `event_shape`.
       distribution: `tf.Distribution`-like instance. Distribution from which `k`
         iid samples are used as input to transformation `F`.  Default is
-        `ds.Normal(0., 1.)`.
+        `tf.distributions.Normal(loc=0., scale=1.)`.
         Must be a scalar-batch, scalar-event distribution.  Typically
         `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
         a function of non-trainable parameters. WARNING: If you backprop through
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 29d41ab81c62d621c3c3533e1449341e9a085645..8c67647a618d22a58428d78865c4ebf7d98bdf9e 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -91,14 +91,14 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   Extra leading dimensions, if provided, allow for batches.
 
   ```python
-  ds = tf.contrib.distributions
+  tfd = tf.contrib.distributions
 
   # Initialize a single 3-variate vector Student's t-distribution.
   mu = [1., 2, 3]
   chol = [[1., 0, 0.],
           [1, 3, 0],
           [1, 2, 3]]
-  vt = ds.VectorStudentT(df=2, loc=mu, scale_tril=chol)
+  vt = tfd.VectorStudentT(df=2, loc=mu, scale_tril=chol)
 
   # Evaluate this on an observation in R^3, returning a scalar.
   vt.prob([-1., 0, 1])
@@ -107,7 +107,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   mu = [[1., 2, 3],
         [11, 22, 33]]
   chol = ...  # shape 2 x 3 x 3, lower triangular, positive diagonal.
-  vt = ds.VectorStudentT(loc=mu, scale_tril=chol)
+  vt = tfd.VectorStudentT(loc=mu, scale_tril=chol)
 
   # Evaluate this on a two observations, each in R^3, returning a length two
   # tensor.
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index ae4b07799f5c123b68529443a1765fbfbac05492..09242ee47ddd044dfc99e22d5b7751a989c86485 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,4 +1,4 @@
-# TensorFlow Eager Execution
+# Eager Execution
 
 > *WARNING*: This is a preview/pre-alpha version. The API and performance
 > characteristics are subject to change.
@@ -76,3 +76,6 @@ For an introduction to eager execution in TensorFlow, see:
 ## Changelog
 
 - 2017/10/31: Initial preview release.
+- 2017/12/01: Example of dynamic neural network:
+  [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
+  See [README.md](python/examples/spinn/README.md) for details.
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 2b84bc2e9b7453fac99ea2becc328ca854cf555d..fb667cd91bdb5296e6aacf1963981ce5cfd76be3 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -12,16 +12,16 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":datasets",
-        ":evaluator",
         ":metrics",
         ":network",
         ":saver",
-        ":summary_writer",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:numerics",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:core",
@@ -51,21 +51,22 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
+        "//tensorflow/contrib/data/python/ops:prefetching_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/eager:context",
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "datasets_test",
     srcs = ["datasets_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":datasets",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -103,37 +104,6 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "summary_writer",
-    srcs = ["summary_writer.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/summary:gen_summary_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
-cuda_py_test(
-    name = "summary_writer_test",
-    srcs = ["summary_writer_test.py"],
-    additional_deps = [
-        ":summary_writer",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 py_library(
     name = "metrics",
     srcs = [
@@ -165,11 +135,9 @@ py_test(
         ":metrics",
         "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/contrib/summary:summary_test_util",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:platform",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -219,8 +187,11 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers_base",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
     ],
 )
@@ -231,13 +202,17 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":network",
+        "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 98e6983658aed77277d87915ff26a8c676224503..b559cce6b12a809d671ce7855680063f02a4ac22 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -20,11 +20,15 @@ from __future__ import print_function
 
 import threading
 
+from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
 
@@ -32,12 +36,12 @@ _uid_counter = 0
 _uid_lock = threading.Lock()
 
 
-def _iterator_shared_name():
+def _generate_shared_name(prefix):
   with _uid_lock:
     global _uid_counter
     uid = _uid_counter
     _uid_counter += 1
-  return "eager_iterator_{}".format(uid)
+  return "{}_{}".format(prefix, uid)
 
 
 class Iterator(object):
@@ -72,11 +76,12 @@ class Iterator(object):
     with ops.device("/device:CPU:0"):
       ds_variant = dataset._as_variant_tensor()  # pylint: disable=protected-access
       self._output_types = dataset.output_types
+      self._output_shapes = dataset.output_shapes
       self._flat_output_types = nest.flatten(dataset.output_types)
       self._flat_output_shapes = nest.flatten(dataset.output_shapes)
       self._resource = gen_dataset_ops.iterator(
           container="",
-          shared_name=_iterator_shared_name(),
+          shared_name=_generate_shared_name("eager_iterator"),
           output_types=self._flat_output_types,
           output_shapes=self._flat_output_shapes)
       gen_dataset_ops.make_iterator(ds_variant, self._resource)
@@ -84,6 +89,35 @@ class Iterator(object):
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="/device:CPU:0")
     self._device = context.context().device_name
+    self._buffer_resource_handle = None
+    if not context.context().device_spec.device_type:
+      is_remote_device = False
+    else:
+      is_remote_device = context.context().device_spec.device_type != "CPU"
+    if is_remote_device:
+      with ops.device("/device:CPU:0"):
+        iter_string_handle = gen_dataset_ops.iterator_to_string_handle(
+            self._resource)
+
+        @function.Defun(dtypes.string)
+        def remote_fn(h):
+          remote_iterator = iterator_ops.Iterator.from_string_handle(
+              h, self._output_types, self._output_shapes)
+          return remote_iterator.get_next()
+
+        remote_fn.add_to_graph(None)
+        target = constant_op.constant("/device:CPU:0")
+      with ops.device(self._device):
+        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(
+            string_arg=iter_string_handle,
+            f=remote_fn,
+            target_device=target,
+            buffer_size=10,
+            thread_pool_size=1,
+            container="",
+            shared_name=_generate_shared_name("function_buffer_resource"))
+        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(
+            handle=self._buffer_resource_handle, handle_device=self._device)
 
   def __iter__(self):
     return self
@@ -93,20 +127,20 @@ class Iterator(object):
 
   def next(self):
     """Return the next tf.Tensor from the dataset."""
-    try:
-      # TODO(ashankar): Consider removing this ops.device() contextmanager
-      # and instead mimic ops placement in graphs: Operations on resource
-      # handles execute on the same device as where the resource is placed.
-      with ops.device("/device:CPU:0"):
-        ret = gen_dataset_ops.iterator_get_next(
-            self._resource,
-            output_types=self._flat_output_types,
-            output_shapes=self._flat_output_shapes)
-    except errors.OutOfRangeError:
-      raise StopIteration
-    # Copies tensors from CPU to the current device if necessary.
-    # TODO(rohanj): This should be replaced by the mechanism to have the
-    # runtime's threads copy tensors to the destination device.
     with ops.device(self._device):
-      ret = [array_ops.identity(x) for x in ret]
+      try:
+        if self._buffer_resource_handle is not None:
+          ret = prefetching_ops.function_buffering_resource_get_next(
+              function_buffer_resource=self._buffer_resource_handle,
+              output_types=self._flat_output_types)
+        else:
+          # TODO(ashankar): Consider removing this ops.device() contextmanager
+          # and instead mimic ops placement in graphs: Operations on resource
+          # handles execute on the same device as where the resource is placed.
+          ret = gen_dataset_ops.iterator_get_next(
+              self._resource,
+              output_types=self._flat_output_types,
+              output_shapes=self._flat_output_shapes)
+      except errors.OutOfRangeError:
+        raise StopIteration
       return nest.pack_sequence_as(self._output_types, ret)
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index bd0ab02ecf7ae6025e08dde1c3ddc634db9255c1..3faaeef5903615ea122800a6690117dde682e830 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -110,7 +110,7 @@ class Evaluator(object):
         return self._all_metric_results()
     else:
       def f():
-        with summary_ops.create_summary_file_writer(
+        with summary_ops.create_file_writer(
             summary_logdir).as_default(), summary_ops.always_record_summaries():
           return self._all_metric_results()
       if context.in_eager_mode():
diff --git a/tensorflow/contrib/eager/python/evaluator_test.py b/tensorflow/contrib/eager/python/evaluator_test.py
index 02f82cb216983accc7bc2dfa20cbb1ee0b8d8d26..7d2274db9b051e604266074651f4cbd331f20f48 100644
--- a/tensorflow/contrib/eager/python/evaluator_test.py
+++ b/tensorflow/contrib/eager/python/evaluator_test.py
@@ -87,7 +87,7 @@ class EvaluatorTest(test.TestCase):
 
     e.all_metric_results(logdir)
 
-    events = summary_test_util.events_from_file(logdir)
+    events = summary_test_util.events_from_logdir(logdir)
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].simple_value, 6.0)
 
@@ -136,7 +136,7 @@ class EvaluatorTest(test.TestCase):
       variables.global_variables_initializer().run()
       e.run_evaluation(init_op, call_op, results_op)
 
-    events = summary_test_util.events_from_file(logdir)
+    events = summary_test_util.events_from_logdir(logdir)
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].simple_value, 6.0)
 
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index aa21a6ab994acf929890ecebc07a86cf7ebf97db..6aef010a2139c4cd2ae19c008aa21d4e3592ca98 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -11,5 +11,6 @@ py_library(
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
         "//tensorflow/contrib/eager/python/examples/rnn_ptb",
+        "//tensorflow/contrib/eager/python/examples/spinn:data",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index d0130ebd118dbaff4f0161c8b2528764c6103e02..7bc5007c5655bed81b5600ee283c35bd332a1ebe 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -85,7 +85,7 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None):
   if logdir:
     # Support for TensorBoard summaries. Once training has started, use:
     #   tensorboard --logdir=<logdir>
-    summary_writer = tf.contrib.summary.create_summary_file_writer(logdir)
+    summary_writer = tf.contrib.summary.create_file_writer(logdir)
 
   # Training loop.
   for i, (xs, ys) in enumerate(tfe.Iterator(dataset)):
diff --git a/tensorflow/contrib/eager/python/examples/mnist/mnist.py b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
index ae01bac0b560e15f655c883da4ccc1944c07232c..bb121c7704b4772dde520ddc928a13c50ec8bb18 100644
--- a/tensorflow/contrib/eager/python/examples/mnist/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/mnist/mnist.py
@@ -190,10 +190,10 @@ def main(_):
   else:
     train_dir = None
     test_dir = None
-  summary_writer = tf.contrib.summary.create_summary_file_writer(
-      train_dir, flush_secs=10)
-  test_summary_writer = tf.contrib.summary.create_summary_file_writer(
-      test_dir, flush_secs=10, name='test')
+  summary_writer = tf.contrib.summary.create_file_writer(
+      train_dir, flush_millis=10000)
+  test_summary_writer = tf.contrib.summary.create_file_writer(
+      test_dir, flush_millis=10000, name='test')
   checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
 
   with tf.device(device):
@@ -211,7 +211,7 @@ def main(_):
         test(model, test_ds)
       all_variables = (
           model.variables
-          + tfe.get_optimizer_variables(optimizer)
+          + optimizer.variables()
           + [global_step])
       tfe.Saver(all_variables).save(
           checkpoint_prefix, global_step=global_step)
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb
index 01616f2e7dbab8084153e6554ce0e64c13f5d710..459f2f4a7d2afa153e77069bc3ce0c5360ddd7e2 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb
@@ -429,7 +429,9 @@
         "cpu_tensor = tf.random_normal([SIZE, SIZE])\n",
         "\n",
         "if is_gpu_available:\n",
-        "  gpu_tensor = cpu_tensor.gpu()"
+        "  gpu_tensor = cpu_tensor.gpu()\n",
+        "else:\n",
+        "  print(\"GPU not available.\")"
       ]
     },
     {
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb
index 3b7e2cd435e7f34cb950545a9fe5ee6eafefde7e..e6c7c117333e1e10aa571dae295e88747bd7d764 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb
@@ -383,7 +383,7 @@
         "\n",
         "`implicit_value_and_gradients()` returns a function that accepts the same inputs as the function passed in, and returns a tuple consisting of:\n",
         "\n",
-        "1. the value returned by the function passed in (in this case, the loss calculated by `calculate_linear_model_loss()`), and\n",
+        "1. the value returned by the function passed in (in this case, the loss calculated by `loss_fn()`), and\n",
         "1. a list of tuples consisting of:\n",
         "  1. The value of the gradient (a `tf.Tensor`) with respect to a given variable\n",
         "  1. The corresponding variable (`tf.Variable`)\n",
@@ -698,7 +698,7 @@
       "source": [
         "## Other Ways to Compute Gradients\n",
         "\n",
-        "Using our loss function as an example (`calculate_linear_model_loss()`), there are several other ways we could compute gradients:\n",
+        "Using our loss function as an example (`loss_fn()`), there are several other ways we could compute gradients:\n",
         "\n",
         "1. `tfe.implicit_gradients()`\n",
         "1. `tfe.gradients_function()`\n",
@@ -841,7 +841,7 @@
         "# tfe.implicit_value_and_gradients() demo\n",
         "value_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)\n",
         "\n",
-        "# Returns only gradients:\n",
+        "# Returns the value returned by the function passed in, gradients, and variables:\n",
         "value_gradients_fn(inputs, labels, wb)"
       ]
     }
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb
index ebcc7027c1d34c47a339a49ede1d80e58ad43780..0088da5c4b583dd13251de5839235de666fe8b78 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb
@@ -9,7 +9,7 @@
       "source": [
         "# Eager Execution Tutorial: Importing Data\n",
         "\n",
-        "This notebook demonstrates the use of the [`tf.contrib.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
+        "This notebook demonstrates the use of the [`tf.data.Dataset` API](https://www.tensorflow.org/programmers_guide/datasets) to build pipelines to feed data to your program. It covers:\n",
         "\n",
         "* Creating a `Dataset`.\n",
         "* Iteration over a `Dataset` with eager execution enabled.\n",
@@ -64,7 +64,7 @@
       "source": [
         "# Step 1: Create a source `Dataset`\n",
         "\n",
-        "Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
+        "Create a _source_ dataset using one of the factory functions like [`Dataset.from_tensors`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensors), [`Dataset.from_tensor_slices`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices) or using objects that read from files like [`TextLineDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TextLineDataset) or [`TFRecordDataset`](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset). See the [Programmer's Guide](https://www.google.com/url?sa=D\u0026q=https%3A%2F%2Fwww.tensorflow.org%2Fprogrammers_guide%2Fdatasets%23reading_input_data) for more information."
       ]
     },
     {
@@ -83,7 +83,7 @@
       },
       "outputs": [],
       "source": [
-        "ds_tensors = tf.contrib.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
+        "ds_tensors = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])\n",
         "\n",
         "# Create a CSV file\n",
         "import tempfile\n",
@@ -93,7 +93,7 @@
         "Line 2\n",
         "Line 3\n",
         "  \"\"\")\n",
-        "ds_file = tf.contrib.data.TextLineDataset(filename)\n"
+        "ds_file = tf.data.TextLineDataset(filename)\n"
       ]
     },
     {
@@ -105,7 +105,7 @@
       "source": [
         "# Step 2: Apply transformations\n",
         "\n",
-        "Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.contrib.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/contrib/data/Dataset) for details."
+        "Use the transformations functions like [`map`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map), [`batch`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch), [`shuffle`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#shuffle) etc. to apply transformations to the records of the dataset. See the [API documentation for `tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) for details."
       ]
     },
     {
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index 5759ca17facda2e94a35bcc7e2a54b80ff5ac858..536cad998d94e45187d30fce3be0d7a57178e0c1 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -39,5 +39,6 @@ cuda_py_test(
     tags = [
         "noasan",
         "nomsan",
+        "notsan",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/README.md b/tensorflow/contrib/eager/python/examples/resnet50/README.md
index f6c1defa4246d46447028f86c87c4ea9b39bb2ad..db023e6c976c8eda09ef0dee7eecb144678773c4 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/README.md
+++ b/tensorflow/contrib/eager/python/examples/resnet50/README.md
@@ -11,7 +11,18 @@ Contents:
 
 # Benchmarks
 
-Using a synthetic data.
+Using a synthetic data, run:
+
+```
+# Using eager execution
+python resnet50_test.py --benchmarks=.
+
+# Using graph execution
+python resnet50_graph_test.py --benchmarks=.
+```
+
+The above uses the model definition included with the TensorFlow pip
+package. To build (and run benchmarks) from source:
 
 ```
 # Using eager execution
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index 736a75332ff6403ea1b21387211df6b8fb6034f3..23317886e712323f4b520000e0fd372734fc53a1 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -73,7 +73,7 @@ class ResNet50GraphTest(tf.test.TestCase):
       tf.train.get_or_create_global_step()
       logdir = tempfile.mkdtemp()
       with tf.contrib.summary.always_record_summaries():
-        with tf.contrib.summary.create_summary_file_writer(
+        with tf.contrib.summary.create_file_writer(
             logdir, max_queue=0,
             name='t0').as_default():
           model = resnet50.ResNet50(data_format())
@@ -95,7 +95,7 @@ class ResNet50GraphTest(tf.test.TestCase):
         sess.run([train_op, tf.contrib.summary.all_summary_ops()],
                  feed_dict={images: np_images, labels: np_labels})
 
-      events = summary_test_util.events_from_file(logdir)
+      events = summary_test_util.events_from_logdir(logdir)
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'loss')
 
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
index d6389f2e385b3637b178d49fc56e8baf913eccaa..d8d8644dde10498e5fd480f92b69656fca1558dd 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py
@@ -95,7 +95,7 @@ class ResNet50Test(tf.test.TestCase):
     model = resnet50.ResNet50(data_format)
     tf.train.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with tf.contrib.summary.create_summary_file_writer(
+    with tf.contrib.summary.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), tf.contrib.summary.always_record_summaries():
       with tf.device(device):
@@ -103,7 +103,7 @@ class ResNet50Test(tf.test.TestCase):
         images, labels = random_batch(2)
         train_one_step(model, images, labels, optimizer)
         self.assertEqual(320, len(model.variables))
-    events = summary_test_util.events_from_file(logdir)
+    events = summary_test_util.events_from_logdir(logdir)
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].tag, 'loss')
 
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
index b657d31f35bafd6624ac7e4d6a6f6b2db362649d..f83eb5c476ed9f45d70849a0de6c0f20973682a5 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
@@ -11,6 +11,7 @@ py_binary(
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 318962c634e0d050b35da5efc405400380c1b759..40919f2d4cf511eb35fac954719286366aef6c7c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -247,10 +247,10 @@ def main(_):
 
   log_dir = os.path.join(FLAGS.dir, "summaries")
   tf.gfile.MakeDirs(log_dir)
-  train_summary_writer = tf.contrib.summary.create_summary_file_writer(
-      os.path.join(log_dir, "train"), flush_secs=10)
-  test_summary_writer = tf.contrib.summary.create_summary_file_writer(
-      os.path.join(log_dir, "eval"), flush_secs=10, name="eval")
+  train_summary_writer = tf.contrib.summary.create_file_writer(
+      os.path.join(log_dir, "train"), flush_millis=10000)
+  test_summary_writer = tf.contrib.summary.create_file_writer(
+      os.path.join(log_dir, "eval"), flush_millis=10000, name="eval")
 
   with tf.device(device):
     for epoch in range(FLAGS.num_epochs):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
index db2587bf2cb548ae37e58597691e96ae2c2e8177..4b4792cd49bf8bd4ad46a0371ef0d2f8a07ddd1c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
@@ -10,7 +10,9 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
         "//tensorflow/contrib/eager/python:tfe",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
index ea92d59e5863226a1bc28a07919518f209587cb5..743ebb68ee5bba5635899267cc4839828f7e4e2f 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/README.md
@@ -18,6 +18,18 @@ To run:
 
 Benchmarks (using synthetic data):
 
+```
+# Using eager execution
+python rnn_ptb_test.py --benchmarks=.
+
+# Using graph execution
+python rnn_ptb_graph_test.py --benchmarks=.
+```
+
+The above uses the model definition included with the TensorFlow pip
+package. To build (and run benchmarks) from source:
+
+
 ```
 # Using eager execution
 bazel run -c opt --config=cuda :rnn_ptb_test -- --benchmarks=.
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a1f8a759e2a556bc219f0aa13942f293c4f34cfa
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -0,0 +1,42 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "data",
+    srcs = ["data.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//third_party/py/numpy"],
+)
+
+py_test(
+    name = "data_test",
+    size = "small",
+    srcs = ["data_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":data",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+cuda_py_test(
+    name = "spinn_test",
+    size = "medium",
+    srcs = ["spinn_test.py"],
+    additional_deps = [
+        ":data",
+        "//third_party/examples/eager/spinn",
+        "//third_party/py/numpy",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/summary:summary_test_util",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = ["no_pip"],  # because spinn.py is under third_party/.
+)
diff --git a/tensorflow/contrib/eager/python/examples/spinn/README.md b/tensorflow/contrib/eager/python/examples/spinn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb0637df473e22e5d39ca1b0816464cb2b7c6435
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/README.md
@@ -0,0 +1,13 @@
+# SPINN: Dynamic neural network with TensorFlow eager execution
+
+This directory contains files supporting the
+[spinn.py model in third_party/examples/eager/spinn/](../../../../../../third_party/examples/eager/spinn/spinn.py),
+including
+
+- `data.py`: Utility library for loading and preprocessing the SNLI and GloVe
+  data.
+- `data_test.py` and `spinn_test.py`: Unit tests for the data and model modules.
+
+See the [README.md in third_party/examples/eager/spinn/](../../../../../../third_party/examples/eager/spinn/README.md)
+for detailed background, license and usage information regarding the SPINN code.
+
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data.py b/tensorflow/contrib/eager/python/examples/spinn/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6e046320f78541bef4e091e97f08fd51857af83
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/data.py
@@ -0,0 +1,350 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities of SNLI data and GloVe word vectors for SPINN model.
+
+See more details about the SNLI data set at:
+  https://nlp.stanford.edu/projects/snli/
+
+See more details about the GloVe pretrained word embeddings at:
+  https://nlp.stanford.edu/projects/glove/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import glob
+import math
+import os
+import random
+
+import numpy as np
+
+POSSIBLE_LABELS = ("entailment", "contradiction", "neutral")
+
+UNK_CODE = 0   # Code for unknown word tokens.
+PAD_CODE = 1   # Code for padding tokens.
+
+SHIFT_CODE = 3
+REDUCE_CODE = 2
+
+WORD_VECTOR_LEN = 300  # Embedding dimensions.
+
+LEFT_PAREN = "("
+RIGHT_PAREN = ")"
+PARENTHESES = (LEFT_PAREN, RIGHT_PAREN)
+
+
+def get_non_parenthesis_words(items):
+  """Get the non-parenthesis items from a SNLI parsed sentence.
+
+  Args:
+    items: Data items from a parsed SNLI setence, with parentheses. E.g.,
+      ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
+
+  Returns:
+    A list of non-parenthis word items, all converted to lower case. E.g.,
+      ["man", "wearing", "pass", ...
+  """
+  return [x.lower() for x in items if x not in PARENTHESES and x]
+
+
+def get_shift_reduce(items):
+  """Obtain shift-reduce vector from a list of items from the SNLI data.
+
+  Args:
+    items: Data items as a list of str, e.g.,
+       ["(", "Man", "(", "(", "(", "(", "(", "wearing", "pass", ")", ...
+
+  Returns:
+    A list of shift-reduce transitions, encoded as `SHIFT_CODE` for shift and
+      `REDUCE_CODE` for reduce. See code above for the values of `SHIFT_CODE`
+      and `REDUCE_CODE`.
+  """
+  trans = []
+  for item in items:
+    if item == LEFT_PAREN:
+      continue
+    elif item == RIGHT_PAREN:
+      trans.append(REDUCE_CODE)
+    else:
+      trans.append(SHIFT_CODE)
+  return trans
+
+
+def pad_and_reverse_word_ids(sentences):
+  """Pad a list of sentences to the common maximum length + 1.
+
+  Args:
+    sentences: A list of sentences as a list of list of integers. Each integer
+      is a word ID. Each list of integer corresponds to one sentence.
+
+  Returns:
+    A numpy.ndarray of shape (num_sentences, max_length + 1), wherein max_length
+      is the maximum sentence length (in # of words). Each sentence is reversed
+      and then padded with an extra one at head, as required by the model.
+  """
+  max_len = max(len(sent) for sent in sentences)
+  for sent in sentences:
+    if len(sent) < max_len:
+      sent.extend([PAD_CODE] * (max_len - len(sent)))
+  # Reverse in time order and pad an extra one.
+  sentences = np.fliplr(np.array(sentences, dtype=np.int64))
+  sentences = np.concatenate(
+      [np.ones([sentences.shape[0], 1], dtype=np.int64), sentences], axis=1)
+  return sentences
+
+
+def pad_transitions(sentences_transitions):
+  """Pad a list of shift-reduce transitions to the maximum length."""
+  max_len = max(len(transitions) for transitions in sentences_transitions)
+  for transitions in sentences_transitions:
+    if len(transitions) < max_len:
+      transitions.extend([PAD_CODE] * (max_len - len(transitions)))
+  return np.array(sentences_transitions, dtype=np.int64)
+
+
+def load_vocabulary(data_root):
+  """Load vocabulary from SNLI data files.
+
+  Args:
+    data_root: Root directory of the data. It is assumed that the SNLI data
+      files have been downloaded and extracted to the "snli/snli_1.0"
+      subdirectory of it.
+
+  Returns:
+    Vocabulary as a set of strings.
+
+  Raises:
+    ValueError: If SNLI data files cannot be found.
+  """
+  snli_path = os.path.join(data_root, "snli")
+  snli_glob_pattern = os.path.join(snli_path, "snli_1.0/snli_1.0_*.txt")
+  file_names = glob.glob(snli_glob_pattern)
+  if not file_names:
+    raise ValueError(
+        "Cannot find SNLI data files at %s. "
+        "Please download and extract SNLI data first." % snli_glob_pattern)
+
+  print("Loading vocabulary...")
+  vocab = set()
+  for file_name in file_names:
+    with open(os.path.join(snli_path, file_name), "rt") as f:
+      for i, line in enumerate(f):
+        if i == 0:
+          continue
+        items = line.split("\t")
+        premise_words = get_non_parenthesis_words(items[1].split(" "))
+        hypothesis_words = get_non_parenthesis_words(items[2].split(" "))
+        vocab.update(premise_words)
+        vocab.update(hypothesis_words)
+  return vocab
+
+
+def load_word_vectors(data_root, vocab):
+  """Load GloVe word vectors for words present in the vocabulary.
+
+  Args:
+    data_root: Data root directory. It is assumed that the GloVe file
+     has been downloaded and extracted at the "glove/" subdirectory of it.
+    vocab: A `set` of words, representing the vocabulary.
+
+  Returns:
+    1. word2index: A dict from lower-case word to row index in the embedding
+       matrix, i.e, `embed` below.
+    2. embed: The embedding matrix as a float32 numpy array. Its shape is
+       [vocabulary_size, WORD_VECTOR_LEN]. vocabulary_size is len(vocab).
+       WORD_VECTOR_LEN is the embedding dimension (300).
+
+  Raises:
+    ValueError: If GloVe embedding file cannot be found.
+  """
+  glove_path = os.path.join(data_root, "glove/glove.42B.300d.txt")
+  if not os.path.isfile(glove_path):
+    raise ValueError(
+        "Cannot find GloVe embedding file at %s. "
+        "Please download and extract GloVe embeddings first." % glove_path)
+
+  print("Loading word vectors...")
+
+  word2index = dict()
+  embed = []
+
+  embed.append([0] * WORD_VECTOR_LEN)  # <unk>
+  embed.append([0] * WORD_VECTOR_LEN)  # <pad>
+  word2index["<unk>"] = UNK_CODE
+  word2index["<pad>"] = PAD_CODE
+
+  with open(glove_path, "rt") as f:
+    for line in f:
+      items = line.split(" ")
+      word = items[0]
+      if word in vocab and word not in word2index:
+        word2index[word] = len(embed)
+        vector = np.array([float(item) for item in items[1:]])
+        assert (WORD_VECTOR_LEN,) == vector.shape
+        embed.append(vector)
+  embed = np.array(embed, dtype=np.float32)
+  return word2index, embed
+
+
+def calculate_bins(length2count, min_bin_size):
+  """Cacluate bin boundaries given a histogram of lengths and mininum bin size.
+
+  Args:
+    length2count: A `dict` mapping length to sentence count.
+    min_bin_size: Minimum bin size in terms of total number of sentence pairs
+      in the bin.
+
+  Returns:
+    A `list` representing the right bin boundaries, starting from the inclusive
+    right boundary of the first bin. For example, if the output is
+      [10, 20, 35],
+    it means there are three bins: [1, 10], [11, 20] and [21, 35].
+  """
+  bounds = []
+  lengths = sorted(length2count.keys())
+  cum_count = 0
+  for length in lengths:
+    cum_count += length2count[length]
+    if cum_count >= min_bin_size:
+      bounds.append(length)
+      cum_count = 0
+  if bounds[-1] != lengths[-1]:
+    bounds.append(lengths[-1])
+  return bounds
+
+
+class SnliData(object):
+  """A split of SNLI data."""
+
+  def __init__(self, data_file, word2index, sentence_len_limit=-1):
+    """SnliData constructor.
+
+    Args:
+      data_file: Full path to the data file, e.g.,
+        "/tmp/spinn-data/snli/snli_1.0/snli_1.0.train.txt"
+      word2index: A dict from lower-case word to row index in the embedding
+        matrix (see `load_word_vectors()` for details).
+      sentence_len_limit: Maximum allowed sentence length (# of words).
+        A value of <= 0 means unlimited. Sentences longer than this limit
+        are currently discarded, not truncated.
+    """
+
+    self._labels = []
+    self._premises = []
+    self._premise_transitions = []
+    self._hypotheses = []
+    self._hypothesis_transitions = []
+
+    with open(data_file, "rt") as f:
+      for i, line in enumerate(f):
+        if i == 0:
+          # Skip header line.
+          continue
+        items = line.split("\t")
+        if items[0] not in POSSIBLE_LABELS:
+          continue
+
+        premise_items = items[1].split(" ")
+        hypothesis_items = items[2].split(" ")
+        premise_words = get_non_parenthesis_words(premise_items)
+        hypothesis_words = get_non_parenthesis_words(hypothesis_items)
+
+        if (sentence_len_limit > 0 and
+            (len(premise_words) > sentence_len_limit or
+             len(hypothesis_words) > sentence_len_limit)):
+          # TODO(cais): Maybe truncate; do not discard.
+          continue
+
+        premise_ids = [
+            word2index.get(word, UNK_CODE) for word in premise_words]
+        hypothesis_ids = [
+            word2index.get(word, UNK_CODE) for word in hypothesis_words]
+
+        self._premises.append(premise_ids)
+        self._hypotheses.append(hypothesis_ids)
+        self._premise_transitions.append(get_shift_reduce(premise_items))
+        self._hypothesis_transitions.append(get_shift_reduce(hypothesis_items))
+        assert (len(self._premise_transitions[-1]) ==
+                2 * len(premise_words) - 1)
+        assert (len(self._hypothesis_transitions[-1]) ==
+                2 * len(hypothesis_words) - 1)
+
+        self._labels.append(POSSIBLE_LABELS.index(items[0]) + 1)
+
+    assert len(self._labels) == len(self._premises)
+    assert len(self._labels) == len(self._hypotheses)
+    assert len(self._labels) == len(self._premise_transitions)
+    assert len(self._labels) == len(self._hypothesis_transitions)
+
+  def num_batches(self, batch_size):
+    """Calculate number of batches given batch size."""
+    return int(math.ceil(len(self._labels) / batch_size))
+
+  def get_generator(self, batch_size):
+    """Obtain a generator for batched data.
+
+    All examples of this SnliData object are randomly shuffled, sorted
+    according to the maximum sentence length of the premise and hypothesis
+    sentences in the pair, and batched.
+
+    Args:
+      batch_size: Desired batch size.
+
+    Returns:
+      A generator for data batches. The generator yields a 5-tuple:
+        label: An array of the shape (batch_size,).
+        premise: An array of the shape (max_premise_len, batch_size), wherein
+          max_premise_len is the maximum length of the (padded) premise
+          sentence in the batch.
+        premise_transitions: An array of the shape (2 * max_premise_len -3,
+          batch_size).
+        hypothesis: Same as `premise`, but for hypothesis sentences.
+        hypothesis_transitions: Same as `premise_transitions`, but for
+          hypothesis sentences.
+      All the elements of the 5-tuple have dtype `int64`.
+    """
+    # Randomly shuffle examples.
+    zipped = list(zip(
+        self._labels, self._premises, self._premise_transitions,
+        self._hypotheses, self._hypothesis_transitions))
+    random.shuffle(zipped)
+    # Then sort the examples by maximum of the premise and hypothesis sentence
+    # lengths in the pair. During training, the batches are expected to be
+    # shuffled. So it is okay to leave them sorted by max length here.
+    (labels, premises, premise_transitions, hypotheses,
+     hypothesis_transitions) = zip(
+         *sorted(zipped, key=lambda x: max(len(x[1]), len(x[3]))))
+
+    def _generator():
+      begin = 0
+      while begin < len(labels):
+        # The sorting above and the batching here makes sure that sentences of
+        # similar max lengths are batched together, minimizing the inefficiency
+        # due to uneven max lengths. The sentences are batched differently in
+        # each call to get_generator() due to the shuffling before sotring
+        # above. The pad_and_reverse_word_ids() and pad_transitions() functions
+        # take care of any remaning unevenness of the max sentence lengths.
+        end = min(begin + batch_size, len(labels))
+        # Transpose, because the SPINN model requires time-major, instead of
+        # batch-major.
+        yield (labels[begin:end],
+               pad_and_reverse_word_ids(premises[begin:end]).T,
+               pad_transitions(premise_transitions[begin:end]).T,
+               pad_and_reverse_word_ids(hypotheses[begin:end]).T,
+               pad_transitions(hypothesis_transitions[begin:end]).T)
+        begin = end
+    return _generator
diff --git a/tensorflow/contrib/eager/python/examples/spinn/data_test.py b/tensorflow/contrib/eager/python/examples/spinn/data_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4f0b37c5099e45b7e3b258b258c0a203c36b3b7
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/data_test.py
@@ -0,0 +1,243 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for SPINN data module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import tensorflow as tf
+
+from tensorflow.contrib.eager.python.examples.spinn import data
+
+
+class DataTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(DataTest, self).setUp()
+    self._temp_data_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self._temp_data_dir)
+    super(DataTest, self).tearDown()
+
+  def testGenNonParenthesisWords(self):
+    seq_with_parse = (
+        "( Man ( ( ( ( ( wearing pass ) ( on ( a lanyard ) ) ) and "
+        ") ( standing ( in ( ( a crowd ) ( of people ) ) ) ) ) . ) )")
+    self.assertEqual(
+        ["man", "wearing", "pass", "on", "a", "lanyard", "and", "standing",
+         "in", "a", "crowd", "of", "people", "."],
+        data.get_non_parenthesis_words(seq_with_parse.split(" ")))
+
+  def testGetShiftReduce(self):
+    seq_with_parse = (
+        "( Man ( ( ( ( ( wearing pass ) ( on ( a lanyard ) ) ) and "
+        ") ( standing ( in ( ( a crowd ) ( of people ) ) ) ) ) . ) )")
+    self.assertEqual(
+        [3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 2, 3, 3, 2, 2, 2, 2, 2,
+         3, 2, 2], data.get_shift_reduce(seq_with_parse.split(" ")))
+
+  def testPadAndReverseWordIds(self):
+    id_sequences = [[0, 2, 3, 4, 5],
+                    [6, 7, 8],
+                    [9, 10, 11, 12, 13, 14, 15, 16]]
+    self.assertAllClose(
+        [[1, 1, 1, 1, 5, 4, 3, 2, 0],
+         [1, 1, 1, 1, 1, 1, 8, 7, 6],
+         [1, 16, 15, 14, 13, 12, 11, 10, 9]],
+        data.pad_and_reverse_word_ids(id_sequences))
+
+  def testPadTransitions(self):
+    unpadded = [[3, 3, 3, 2, 2, 2, 2],
+                [3, 3, 2, 2, 2]]
+    self.assertAllClose(
+        [[3, 3, 3, 2, 2, 2, 2],
+         [3, 3, 2, 2, 2, 1, 1]],
+        data.pad_transitions(unpadded))
+
+  def testCalculateBins(self):
+    length2count = {
+        1: 10,
+        2: 15,
+        3: 25,
+        4: 40,
+        5: 35,
+        6: 10}
+    self.assertEqual([2, 3, 4, 5, 6],
+                     data.calculate_bins(length2count, 20))
+    self.assertEqual([3, 4, 6], data.calculate_bins(length2count, 40))
+    self.assertEqual([4, 6], data.calculate_bins(length2count, 60))
+
+  def testLoadVoacbulary(self):
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    fake_dev_file = os.path.join(snli_1_0_dir, "snli_1.0_dev.txt")
+    os.makedirs(snli_1_0_dir)
+
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo baz ) . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+    with open(fake_dev_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Quux quuz ) ? )\t( ( Corge grault ) ! )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Quux quuz?\t.Corge grault!\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    self.assertSetEqual(
+        {".", "?", "!", "foo", "bar", "baz", "quux", "quuz", "corge", "grault"},
+        vocab)
+
+  def testLoadVoacbularyWithoutFileRaisesError(self):
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "snli"))
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "snli/snli_1.0"))
+    with self.assertRaisesRegexp(ValueError, "Cannot find SNLI data files at"):
+      data.load_vocabulary(self._temp_data_dir)
+
+  def testLoadWordVectors(self):
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", ",", "foo", "bar", "baz"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = {"foo", "bar", "baz", "qux", "."}
+    # Notice that "qux" is not present in `words`.
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    self.assertEqual(6, len(word2index))
+    self.assertEqual(0, word2index["<unk>"])
+    self.assertEqual(1, word2index["<pad>"])
+    self.assertEqual(2, word2index["."])
+    self.assertEqual(3, word2index["foo"])
+    self.assertEqual(4, word2index["bar"])
+    self.assertEqual(5, word2index["baz"])
+    self.assertEqual((6, data.WORD_VECTOR_LEN), embed.shape)
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[0, :])
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[1, :])
+    self.assertAllClose([0.0] * data.WORD_VECTOR_LEN, embed[2, :])
+    self.assertAllClose([0.2] * data.WORD_VECTOR_LEN, embed[3, :])
+    self.assertAllClose([0.3] * data.WORD_VECTOR_LEN, embed[4, :])
+    self.assertAllClose([0.4] * data.WORD_VECTOR_LEN, embed[5, :])
+
+  def testLoadWordVectorsWithoutFileRaisesError(self):
+    vocab = {"foo", "bar", "baz", "qux", "."}
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot find GloVe embedding file at"):
+      data.load_word_vectors(self._temp_data_dir, vocab)
+
+    os.makedirs(os.path.join(self._temp_data_dir, "glove"))
+    with self.assertRaisesRegexp(
+        ValueError, "Cannot find GloVe embedding file at"):
+      data.load_word_vectors(self._temp_data_dir, vocab)
+
+  def testSnliData(self):
+    """Unit test for SnliData objects."""
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+
+    # Four sentences in total.
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("contradiction\t( ( Bar foo ) . )\t( ( baz . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quux quuz ) . )\t( ( grault . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quuz quux ) . )\t( ( garply . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", "foo", "bar", "baz", "quux", "quuz", "grault", "garply"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, _ = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    train_data = data.SnliData(fake_train_file, word2index)
+    self.assertEqual(4, train_data.num_batches(1))
+    self.assertEqual(2, train_data.num_batches(2))
+    self.assertEqual(2, train_data.num_batches(3))
+    self.assertEqual(1, train_data.num_batches(4))
+
+    generator = train_data.get_generator(2)()
+    for i in range(2):
+      label, prem, prem_trans, hypo, hypo_trans = next(generator)
+      self.assertEqual(2, len(label))
+      self.assertEqual((4, 2), prem.shape)
+      self.assertEqual((5, 2), prem_trans.shape)
+      self.assertEqual((3, 2), hypo.shape)
+      self.assertEqual((3, 2), hypo_trans.shape)
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..84e25cf81a2223800c47994b26d000caddee6b01
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -0,0 +1,409 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import gc
+import glob
+import os
+import shutil
+import tempfile
+import time
+
+import numpy as np
+import tensorflow as tf
+
+# pylint: disable=g-bad-import-order
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.spinn import data
+from third_party.examples.eager.spinn import spinn
+from tensorflow.contrib.summary import summary_test_util
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+# pylint: enable=g-bad-import-order
+
+
+def _generate_synthetic_snli_data_batch(sequence_length,
+                                        batch_size,
+                                        vocab_size):
+  """Generate a fake batch of SNLI data for testing."""
+  with tf.device("cpu:0"):
+    labels = tf.random_uniform([batch_size], minval=1, maxval=4, dtype=tf.int64)
+    prem = tf.random_uniform(
+        (sequence_length, batch_size), maxval=vocab_size, dtype=tf.int64)
+    prem_trans = tf.constant(np.array(
+        [[3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3,
+          2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2,
+          3, 2, 2]] * batch_size, dtype=np.int64).T)
+    hypo = tf.random_uniform(
+        (sequence_length, batch_size), maxval=vocab_size, dtype=tf.int64)
+    hypo_trans = tf.constant(np.array(
+        [[3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 3,
+          2, 3, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2,
+          3, 2, 2]] * batch_size, dtype=np.int64).T)
+  if tfe.num_gpus():
+    labels = labels.gpu()
+    prem = prem.gpu()
+    prem_trans = prem_trans.gpu()
+    hypo = hypo.gpu()
+    hypo_trans = hypo_trans.gpu()
+  return labels, prem, prem_trans, hypo, hypo_trans
+
+
+def _test_spinn_config(d_embed, d_out, logdir=None):
+  config_tuple = collections.namedtuple(
+      "Config", ["d_hidden", "d_proj", "d_tracker", "predict",
+                 "embed_dropout", "mlp_dropout", "n_mlp_layers", "d_mlp",
+                 "d_out", "projection", "lr", "batch_size", "epochs",
+                 "force_cpu", "logdir", "log_every", "dev_every", "save_every",
+                 "lr_decay_every", "lr_decay_by"])
+  return config_tuple(
+      d_hidden=d_embed,
+      d_proj=d_embed * 2,
+      d_tracker=8,
+      predict=False,
+      embed_dropout=0.1,
+      mlp_dropout=0.1,
+      n_mlp_layers=2,
+      d_mlp=32,
+      d_out=d_out,
+      projection=True,
+      lr=2e-2,
+      batch_size=2,
+      epochs=10,
+      force_cpu=False,
+      logdir=logdir,
+      log_every=1,
+      dev_every=2,
+      save_every=2,
+      lr_decay_every=1,
+      lr_decay_by=0.75)
+
+
+class SpinnTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    super(SpinnTest, self).setUp()
+    self._test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
+    self._temp_data_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self._temp_data_dir)
+    super(SpinnTest, self).tearDown()
+
+  def testBundle(self):
+    with tf.device(self._test_device):
+      lstm_iter = [np.array([[0, 1], [2, 3]], dtype=np.float32),
+                   np.array([[0, -1], [-2, -3]], dtype=np.float32),
+                   np.array([[0, 2], [4, 6]], dtype=np.float32),
+                   np.array([[0, -2], [-4, -6]], dtype=np.float32)]
+      out = spinn._bundle(lstm_iter)
+
+      self.assertEqual(2, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual(tf.float32, out[1].dtype)
+      self.assertAllEqual(np.array([[0, 2, 0, -2, 0, 4, 0, -4]]).T,
+                          out[0].numpy())
+      self.assertAllEqual(np.array([[1, 3, -1, -3, 2, 6, -2, -6]]).T,
+                          out[1].numpy())
+
+  def testUnbunbdle(self):
+    with tf.device(self._test_device):
+      state = [np.array([[0, 1, 2], [3, 4, 5]], dtype=np.float32),
+               np.array([[0, -1, -2], [-3, -4, -5]], dtype=np.float32)]
+      out = spinn._unbundle(state)
+
+      self.assertEqual(2, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual(tf.float32, out[1].dtype)
+      self.assertAllEqual(np.array([[0, 1, 2, 0, -1, -2]]),
+                          out[0].numpy())
+      self.assertAllEqual(np.array([[3, 4, 5, -3, -4, -5]]),
+                          out[1].numpy())
+
+  def testReducer(self):
+    with tf.device(self._test_device):
+      batch_size = 3
+      size = 10
+      tracker_size = 8
+      reducer = spinn.Reducer(size, tracker_size=tracker_size)
+
+      left_in = []
+      right_in = []
+      tracking = []
+      for _ in range(batch_size):
+        left_in.append(tf.random_normal((1, size * 2)))
+        right_in.append(tf.random_normal((1, size * 2)))
+        tracking.append(tf.random_normal((1, tracker_size * 2)))
+
+      out = reducer(left_in, right_in, tracking=tracking)
+      self.assertEqual(batch_size, len(out))
+      self.assertEqual(tf.float32, out[0].dtype)
+      self.assertEqual((1, size * 2), out[0].shape)
+
+  def testReduceTreeLSTM(self):
+    with tf.device(self._test_device):
+      size = 10
+      tracker_size = 8
+      reducer = spinn.Reducer(size, tracker_size=tracker_size)
+
+      lstm_in = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+                          [0, -1, -2, -3, -4, -5, -6, -7, -8, -9]],
+                         dtype=np.float32)
+      c1 = np.array([[0, 1], [2, 3]], dtype=np.float32)
+      c2 = np.array([[0, -1], [-2, -3]], dtype=np.float32)
+
+      h, c = reducer._tree_lstm(c1, c2, lstm_in)
+      self.assertEqual(tf.float32, h.dtype)
+      self.assertEqual(tf.float32, c.dtype)
+      self.assertEqual((2, 2), h.shape)
+      self.assertEqual((2, 2), c.shape)
+
+  def testTracker(self):
+    with tf.device(self._test_device):
+      batch_size = 2
+      size = 10
+      tracker_size = 8
+      buffer_length = 18
+      stack_size = 3
+
+      tracker = spinn.Tracker(tracker_size, False)
+      tracker.reset_state()
+
+      # Create dummy inputs for testing.
+      bufs = []
+      buf = []
+      for _ in range(buffer_length):
+        buf.append(tf.random_normal((batch_size, size * 2)))
+      bufs.append(buf)
+      self.assertEqual(1, len(bufs))
+      self.assertEqual(buffer_length, len(bufs[0]))
+      self.assertEqual((batch_size, size * 2), bufs[0][0].shape)
+
+      stacks = []
+      stack = []
+      for _ in range(stack_size):
+        stack.append(tf.random_normal((batch_size, size * 2)))
+      stacks.append(stack)
+      self.assertEqual(1, len(stacks))
+      self.assertEqual(3, len(stacks[0]))
+      self.assertEqual((batch_size, size * 2), stacks[0][0].shape)
+
+      for _ in range(2):
+        out1, out2 = tracker(bufs, stacks)
+        self.assertIsNone(out2)
+        self.assertEqual(batch_size, len(out1))
+        self.assertEqual(tf.float32, out1[0].dtype)
+        self.assertEqual((1, tracker_size * 2), out1[0].shape)
+
+        self.assertEqual(tf.float32, tracker.state.c.dtype)
+        self.assertEqual((batch_size, tracker_size), tracker.state.c.shape)
+        self.assertEqual(tf.float32, tracker.state.h.dtype)
+        self.assertEqual((batch_size, tracker_size), tracker.state.h.shape)
+
+  def testSPINN(self):
+    with tf.device(self._test_device):
+      embedding_dims = 10
+      d_tracker = 8
+      sequence_length = 15
+      num_transitions = 27
+
+      config_tuple = collections.namedtuple(
+          "Config", ["d_hidden", "d_proj", "d_tracker", "predict"])
+      config = config_tuple(
+          embedding_dims, embedding_dims * 2, d_tracker, False)
+      s = spinn.SPINN(config)
+
+      # Create some fake data.
+      buffers = tf.random_normal((sequence_length, 1, config.d_proj))
+      transitions = tf.constant(
+          [[3], [3], [2], [3], [3], [3], [2], [2], [2], [3], [3], [3],
+           [2], [3], [3], [2], [2], [3], [3], [3], [2], [2], [2], [2],
+           [3], [2], [2]], dtype=tf.int64)
+      self.assertEqual(tf.int64, transitions.dtype)
+      self.assertEqual((num_transitions, 1), transitions.shape)
+
+      out = s(buffers, transitions, training=True)
+      self.assertEqual(tf.float32, out.dtype)
+      self.assertEqual((1, embedding_dims), out.shape)
+
+  def testSNLIClassifierAndTrainer(self):
+    with tf.device(self._test_device):
+      vocab_size = 40
+      batch_size = 2
+      d_embed = 10
+      sequence_length = 15
+      d_out = 4
+
+      config = _test_spinn_config(d_embed, d_out)
+
+      # Create fake embedding matrix.
+      embed = tf.random_normal((vocab_size, d_embed))
+
+      model = spinn.SNLIClassifier(config, embed)
+      trainer = spinn.SNLIClassifierTrainer(model, config.lr)
+
+      (labels, prem, prem_trans, hypo,
+       hypo_trans) = _generate_synthetic_snli_data_batch(sequence_length,
+                                                         batch_size,
+                                                         vocab_size)
+
+      # Invoke model under non-training mode.
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=False)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+
+      # Invoke model under training model.
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=True)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+
+      # Calculate loss.
+      loss1 = trainer.loss(labels, logits)
+      self.assertEqual(tf.float32, loss1.dtype)
+      self.assertEqual((), loss1.shape)
+
+      loss2, logits = trainer.train_batch(
+          labels, prem, prem_trans, hypo, hypo_trans)
+      self.assertEqual(tf.float32, loss2.dtype)
+      self.assertEqual((), loss2.shape)
+      self.assertEqual(tf.float32, logits.dtype)
+      self.assertEqual((batch_size, d_out), logits.shape)
+      # Training on the batch should have led to a change in the loss value.
+      self.assertNotEqual(loss1.numpy(), loss2.numpy())
+
+  def testTrainSpinn(self):
+    """Test with fake toy SNLI data and GloVe vectors."""
+
+    # 1. Create and load a fake SNLI data file and a fake GloVe embedding file.
+    snli_1_0_dir = os.path.join(self._temp_data_dir, "snli/snli_1.0")
+    fake_train_file = os.path.join(snli_1_0_dir, "snli_1.0_train.txt")
+    os.makedirs(snli_1_0_dir)
+
+    # Four sentences in total.
+    with open(fake_train_file, "wt") as f:
+      f.write("gold_label\tsentence1_binary_parse\tsentence2_binary_parse\t"
+              "sentence1_parse\tsentence2_parse\tsentence1\tsentence2\t"
+              "captionID\tpairID\tlabel1\tlabel2\tlabel3\tlabel4\tlabel5\n")
+      f.write("neutral\t( ( Foo bar ) . )\t( ( foo . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("contradiction\t( ( Bar foo ) . )\t( ( baz . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quux quuz ) . )\t( ( grault . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+      f.write("entailment\t( ( Quuz quux ) . )\t( ( garply . )\t"
+              "DummySentence1Parse\tDummySentence2Parse\t"
+              "Foo bar.\tfoo baz.\t"
+              "4705552913.jpg#2\t4705552913.jpg#2r1n\t"
+              "neutral\tentailment\tneutral\tneutral\tneutral\n")
+
+    glove_dir = os.path.join(self._temp_data_dir, "glove")
+    os.makedirs(glove_dir)
+    glove_file = os.path.join(glove_dir, "glove.42B.300d.txt")
+
+    words = [".", "foo", "bar", "baz", "quux", "quuz", "grault", "garply"]
+    with open(glove_file, "wt") as f:
+      for i, word in enumerate(words):
+        f.write("%s " % word)
+        for j in range(data.WORD_VECTOR_LEN):
+          f.write("%.5f" % (i * 0.1))
+          if j < data.WORD_VECTOR_LEN - 1:
+            f.write(" ")
+          else:
+            f.write("\n")
+
+    vocab = data.load_vocabulary(self._temp_data_dir)
+    word2index, embed = data.load_word_vectors(self._temp_data_dir, vocab)
+
+    train_data = data.SnliData(fake_train_file, word2index)
+    dev_data = data.SnliData(fake_train_file, word2index)
+    test_data = data.SnliData(fake_train_file, word2index)
+    print(embed)
+
+    # 2. Create a fake config.
+    config = _test_spinn_config(
+        data.WORD_VECTOR_LEN, 4,
+        logdir=os.path.join(self._temp_data_dir, "logdir"))
+
+    # 3. Test training of a SPINN model.
+    spinn.train_spinn(embed, train_data, dev_data, test_data, config)
+
+    # 4. Load train loss values from the summary files and verify that they
+    #    decrease with training.
+    summary_file = glob.glob(os.path.join(config.logdir, "events.out.*"))[0]
+    events = summary_test_util.events_from_file(summary_file)
+    train_losses = [event.summary.value[0].simple_value for event in events
+                    if event.summary.value
+                    and event.summary.value[0].tag == "train/loss"]
+    self.assertEqual(config.epochs, len(train_losses))
+    self.assertLess(train_losses[-1], train_losses[0])
+
+
+class EagerSpinnSNLIClassifierBenchmark(test.Benchmark):
+
+  def benchmarkEagerSpinnSNLIClassifier(self):
+    test_device = "gpu:0" if tfe.num_gpus() else "cpu:0"
+    with tf.device(test_device):
+      burn_in_iterations = 2
+      benchmark_iterations = 10
+
+      vocab_size = 1000
+      batch_size = 128
+      sequence_length = 15
+      d_embed = 200
+      d_out = 4
+
+      embed = tf.random_normal((vocab_size, d_embed))
+
+      config = _test_spinn_config(d_embed, d_out)
+      model = spinn.SNLIClassifier(config, embed)
+      trainer = spinn.SNLIClassifierTrainer(model, config.lr)
+
+      (labels, prem, prem_trans, hypo,
+       hypo_trans) = _generate_synthetic_snli_data_batch(sequence_length,
+                                                         batch_size,
+                                                         vocab_size)
+
+      for _ in range(burn_in_iterations):
+        trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
+
+      gc.collect()
+      start_time = time.time()
+      for _ in xrange(benchmark_iterations):
+        trainer.train_batch(labels, prem, prem_trans, hypo, hypo_trans)
+      wall_time = time.time() - start_time
+      # Named "examples"_per_sec to conform with other benchmarks.
+      extras = {"examples_per_sec": benchmark_iterations / wall_time}
+      self.report_benchmark(
+          name="Eager_SPINN_SNLIClassifier_Benchmark",
+          iters=benchmark_iterations,
+          wall_time=wall_time,
+          extras=extras)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index e76745a807cb10adf2aedc56e69cea0ceded3ad7..0095ffa0db99d46d25654d73504d0d7d41c18b6f 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -388,7 +388,7 @@ many arguments.
 
 In fact, eager execution encourages use of the [Keras](https://keras.io)-style
 "Layer" classes in the
-[`tf.layers`](https://www.tensorflow.org/versions/master/api_docs/python/tf/layers)
+[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
 module.
 
 Furthermore, you may want to apply more sophisticated techniques to compute
@@ -488,10 +488,10 @@ parameters of the model as arguments to the `loss` function.
 ### Using Keras and the Layers API
 
 [Keras](https://keras.io) is a popular API for defining model structures. The
-[`tf.keras.layers`](https://www.tensorflow.org/versions/master/api_docs/python/tf/keras/layers)
+[`tf.keras.layers`](https://www.tensorflow.org/api_docs/python/tf/keras/layers)
 module provides a set of building blocks for models and is implemented using the
 `tf.layers.Layer` subclasses in the
-[`tf.layers`](https://www.tensorflow.org/versions/master/api_docs/python/tf/layers)
+[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
 module. We encourage the use of these same building blocks when using
 TensorFlow's eager execution feature. For example, the very same linear
 regression model can be built using `tf.layers.Dense`:
@@ -608,9 +608,9 @@ it provides conveniences like keeping track of all model variables and methods
 to save and restore from checkpoints.
 
 Sub-classes of `tfe.Network` may register `Layer`s (like classes in
-[`tf.layers`](https://www.tensorflow.org/versions/master/api_docs/python/tf/layers),
+[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers),
 or [Keras
-layers](https://www.tensorflow.org/versions/master/api_docs/python/tf/keras/layers))
+layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers))
 using a call to `self.track_layer()` and define the computation in an
 implementation of `call()`.
 
@@ -704,7 +704,7 @@ with tfe.restore_variables_on_create(
                                     net(inp).numpy()))
       all_variables = (
           net.variables
-          + tfe.get_optimizer_variables(optimizer)
+          + optimizer.variables()
           + [global_step])
       # Save the checkpoint.
       tfe.Saver(all_variables).save(checkpoint_prefix, global_step=global_step)
@@ -757,7 +757,7 @@ For example, to record summaries once every 100 global steps, use:
 
 ```python
 tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_summary_file_writer(logdir)
+writer = tf.contrib.summary.create_file_writer(logdir)
 
 for _ in range(iterations):
   with writer.as_default():
@@ -800,7 +800,7 @@ example in
 
 The discussion above has been centered around the computation executed by your
 model. The
-[`tf.data`](https://www.tensorflow.org/versions/master/api_docs/python/tf/data)
+[`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data)
 module provides APIs to build complex input pipelines from simple, reusable
 pieces.
 
@@ -810,8 +810,7 @@ However, the process of iterating over elements of the dataset differs between
 eager execution and graph construction. When eager execution is enabled, the
 discussion on iterator creation using `make_one_shot_iterator()` and
 `get_next()` in the
-[Programmer's
-Guide](https://www.tensorflow.org/versions/master/programmers_guide/datasets) is
+[Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is
 *not* applicable. Instead, a more Pythonic `Iterator` class is available.
 
 For example:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 2ba653af4a2465a17a17ff4ff019e69476f6434e..2f8016ede3caee6dbb6fd8f5226f1464b5c3976b 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -73,7 +73,7 @@ class Metric(object):
   * `result()`: Computes and returns a final value for the metric
     from the variables in `self`.
 
-  Decendants may override `aggregate()`, but usually won't need to.  It
+  Descendants may override `aggregate()`, but usually won't need to.  It
   adds in the state from a list of metrics of the same type as `self`.
   (Default is to sum all the variables.) Note that users should not call
   `aggregate()`, it is for use by TensorFlow infrastructure.
@@ -223,8 +223,17 @@ class Metric(object):
     """***Only for use by descendants of Metric***."""
     if self._built:
       raise RuntimeError("Can't call add_variable() except in build().")
-    v = variable_scope.get_variable(name, shape, dtype, initializer,
-                                    trainable=False, use_resource=True)
+    collections = None if context.in_eager_mode() else [
+        ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
+    ]
+    v = variable_scope.get_variable(
+        name,
+        shape,
+        dtype,
+        initializer,
+        trainable=False,
+        collections=collections,
+        use_resource=True)
     self._vars.append(v)
     if context.in_eager_mode():
       self._initial_values[v] = v.value()
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index b945e97a0049441d356f41e4d19fe6f01836ec40..1055f4563cd4608189281450aed512fbf5f31de1 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.training import training_util
 
@@ -41,6 +42,17 @@ class MetricsTest(test.TestCase):
     self.assertEqual(dtypes.float64, m.dtype)
     self.assertEqual(dtypes.float64, m.result().dtype)
 
+  def testVariableCollections(self):
+    with context.graph_mode(), ops.Graph().as_default():
+      m = metrics.Mean()
+      m(1000)
+      self.assertEqual(
+          set(m.variables),
+          set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES)))
+      self.assertEqual(
+          set(m.variables),
+          set(ops.get_collection(ops.GraphKeys.METRIC_VARIABLES)))
+
   def testInitVariables(self):
     m = metrics.Mean()
     m([1, 10, 100, 1000])
@@ -55,12 +67,12 @@ class MetricsTest(test.TestCase):
     m([1, 10, 100])
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name="t0").as_default(), summary_ops.always_record_summaries():
       m.result()  # As a side-effect will write summaries.
 
-    events = summary_test_util.events_from_file(logdir)
+    events = summary_test_util.events_from_logdir(logdir)
     self.assertEqual(len(events), 2)
     self.assertEqual(events[1].summary.value[0].simple_value, 37.0)
 
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index 5b53a597f20a1cd0ba9be7f1d3a89e117cde66e8..e3c13cbd2e8ccd2ab79da74e0e97905c6ed5c02d 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -37,164 +37,98 @@ from tensorflow.python.training import training_util
 # functions in base.py which should be reused.
 
 
-_DeferredRestoration = collections.namedtuple(
+def _network_name_scope_naming(current_variable_scope):
+  """Name scope naming to match operation names to variable names.
 
-    "_DeferredRestoration",
-    [
-        # The map_func to use (either user-specified or the default).
-        "map_func",
-        # Boolean, True if the user specified an explicit map_func, for error
-        # messages.
-        "map_func_is_user",
-        # A mapping from checkpoint names to initial values of not-yet-created
-        # variables which should be restored. These values come from parsing a
-        # checkpoint.
-        "checkpointed_variables_to_restore",
-        # A mapping from checkpoint name to variable objects of variables which
-        # have already been restored, for error checking.
-        "restored_variables",
-        # The session to restore with (if in graph mode).
-        "session",
-        # Names of the Network where the restore was requested, for error
-        # messages.
-        "network_name",
-        "network_scope_name"
-    ])
+  Used in Networks and also applied to non-Network Layers which are added to
+  Networks before being built.
 
-
-def _default_naming_conflict_error_message(
-    mapped_name, first_variable, second_variable,
-    network_name, network_scope_name):
-  return (
-      ("The default checkpoint variable name mapping strategy for Network "
-       "'%s' resulted in a naming conflict. We attempted to strip off the "
-       "variable prefix for the Network ('%s'), but this resulted in two "
-       "variables named '%s' (originally '%s' and '%s'). This should only "
-       "happen when using variable sharing (i.e. the Network contains Networks "
-       "or Layers which were first added to another Network, and therefore "
-       "have that Network's variable prefix). One solution is to pass "
-       "`map_func=lambda n: n` to Network.save and Network.restore to use "
-       "fully qualified variable names in the checkpoint, although this will "
-       "require that the variable prefix of the Network being restored into "
-       "is also '%s'. You may alternatively write an arbitrary mapping.")
-      % (
-          network_name, network_scope_name, mapped_name,
-          first_variable._shared_name,
-          second_variable._shared_name, network_scope_name
-      ))
-
-
-def _restore_custom_map_func_error_message(
-    mapped_name, first_variable, second_variable,
-    network_name, network_scope_name):
-  return (
-      ("The map_func passed to Network.restore for the Network '%s' "
-       "resulted in two variables named '%s' (originally '%s' and '%s'). Since "
-       "this is also an error on Network.save, this Network was "
-       "probably not saved with this map_func. Note that map_func "
-       "always maps from full variable names to checkpoint names; "
-       "there is no need to specify an inverse mapping.\n\n"
-       "Try stripping less from the variable names, or renaming parts "
-       "of the Network. For reference, variables created by sub-Layers "
-       "of this Network are prefixed with '%s', but if they are "
-       "re-used after being added to another Network they will have "
-       "that Network's full variable prefix instead.") % (
-           network_name, mapped_name,
-           first_variable._shared_name,
-           second_variable._shared_name,
-           network_scope_name))
-
-
-def _make_custom_getter_for_deferred_restorations():
-  """Returns a custom getter which searches `deferred_restorations`.
-
-  Returns: A tuple of (_custom_getter, deferred_restorations)
-    _custom_getter: The getter which should be added to variable_scopes where
-      variables will be created.
-    deferred_restorations: A list for _DeferredRestoration objects. Typically
-      empty when the getter is set, and expanded as deferred restorations are
-      requested. All new deferred restorations should be appended to the end of
-      the list, where they will have priority over older deferred restorations.
+  Args:
+    current_variable_scope: A VariableScope object.
+  Returns:
+    A name scope name.
   """
-  deferred_restorations = []
-
-  def _custom_getter(getter, name, shape=None, dtype=None,
-                     initializer=None,
-                     *args, **kwargs):
-    """A custom getter which processes deferred restorations."""
-    # Iterate over restorations, newest first (newer restorations will take
-    # precedence over older restorations, just like with immediate restorations
-    # into existing variables).
-    delayed_restoration = None
-    found_value = False
-    value_to_restore = None
-    for delayed_restoration in reversed(
-        deferred_restorations):
-      checkpoint_name = delayed_restoration.map_func(name)
-      if (checkpoint_name
-          in delayed_restoration.checkpointed_variables_to_restore):
-        found_value = True
-        value_to_restore = (
-            delayed_restoration.checkpointed_variables_to_restore[
-                checkpoint_name])
-      if found_value:
-        break
-    # value_to_restore may be False because this variable is not in any
-    # checkpoint we are restoring, or None because we have explicitly set it to
-    # None when it was previously fetched. In either case, we don't need to
-    # set an initializer.
-    if found_value and value_to_restore is not None:
-      initializer = value_to_restore
-      shape = None
-    variable = getter(name, shape=shape, dtype=dtype, initializer=initializer,
-                      *args, **kwargs)
-    if found_value and value_to_restore is not None:
-      # Mark as already restored from this checkpoint.
-      delayed_restoration.checkpointed_variables_to_restore[
-          checkpoint_name] = None
-      if context.in_graph_mode():
-        delayed_restoration.session.run(variable.initializer)
-    if found_value:
-      # Error checking should run even if we've already restored a value.
-      if delayed_restoration.restored_variables.setdefault(
-          checkpoint_name, variable) is not variable:
-        # Naming conflict. We've tried to initialize two variables with the
-        # same value from the checkpoint.
-        if delayed_restoration.map_func_is_user:
-          raise ValueError(
-              _restore_custom_map_func_error_message(
-                  mapped_name=checkpoint_name,
-                  first_variable=delayed_restoration.restored_variables[
-                      checkpoint_name],
-                  second_variable=variable,
-                  network_name=delayed_restoration.network_name,
-                  network_scope_name=delayed_restoration.network_scope_name))
-        else:
-          raise ValueError(
-              _default_naming_conflict_error_message(
-                  mapped_name=checkpoint_name,
-                  first_variable=delayed_restoration.restored_variables[
-                      checkpoint_name],
-                  second_variable=variable,
-                  network_name=delayed_restoration.network_name,
-                  network_scope_name=delayed_restoration.network_scope_name))
-    return variable
-  return _custom_getter, deferred_restorations
+  return current_variable_scope.name + "/"
 
 
 class Network(base.Layer):
   """Represents the composition of a set of Layers.
 
-  TODO(josh11b,ashankar):
-  - Should "trainable" be changeable on the Network object?
-  - Do we allow add_variable in Network?
-  - Detect layers used in __call__ that weren't registered with track_layer.
-  - Convert inputs to __call__ to tensors.
-  - Prevent variables from being created after the first __call__?
-    (Think about restoring from a checkpoint).
+  `Network` implements the `Layer` interface and adds convenience methods for
+  managing sub-`Layer`s, such as listing variables.
+
+  `Layer`s (including other `Network`s) should be added via `track_layer`. They
+  can then be used when overriding the `Network.call` method:
+
+  ```python
+  class TwoLayerNetwork(tfe.Network):
+
+    def __init__(self, name):
+      super(TwoLayerNetwork, self).__init__(name=name)
+      self.layer_one = self.track_layer(tf.layers.Dense(16, input_shape=(8,)))
+      self.layer_two = self.track_layer(tf.layers.Dense(1, input_shape=(16,)))
+
+    def call(self, inputs):
+      return self.layer_two(self.layer_one(inputs))
+  ```
+
+  After constructing an object and calling the `Network`, a list of variables
+  created by tracked `Layer`s is available via `Network.variables`:
+
+  ```python
+  net = TwoLayerNetwork(name="net")
+  output = net(tf.ones([1, 8]))
+  print([v.name for v in net.variables])
+  ```
+
+  This example prints variable names, one kernel and one bias per
+  `tf.layers.Dense` layer:
+
+  ```
+  ['net/dense/kernel:0',
+   'net/dense/bias:0',
+   'net/dense_1/kernel:0',
+   'net/dense_1/bias:0']
+  ```
+
+  These variables can be passed to a `Saver` (`tf.train.Saver`, or
+  `tf.contrib.eager.Saver` when executing eagerly) to save or restore the
+  `Network`, typically alongside a global step and `tf.train.Optimizer`
+  variables when checkpointing during training.
+
+  Note that the semantics of calling a `Network` with graph execution (i.e. not
+  executing eagerly) may change slightly in the future. Currently stateful ops
+  are pruned from the graph unless they or something that depends on them is
+  executed in a session, but this behavior is not consistent with eager
+  execution (where stateful ops are executed eagerly). `Layer`s from `tf.layers`
+  do not depend on this pruning and so will not be affected, but `Network`s
+  which rely on stateful ops being added to the graph but not executed (e.g. via
+  custom `Layer`s which manage stateful ops) may break with this change.
   """
+  # TODO(josh11b,ashankar,allenl):
+  # - Should 'trainable' be changeable on the Network object?
+  # - Do we allow add_variable in Network?
+  # - Detect layers used in __call__ that weren't registered with track_layer.
+  # - Convert inputs to __call__ to tensors.
 
   def __init__(self, name=None):
+    """Configure the `Network`.
+
+    Args:
+      name: The name to use for this `Network`. If specified, it must be unique
+        in the context where this `Network` is first
+         (1) added to another `Network` (in which case it must not share a name
+           with other `Layers` added to that `Network`), or
+         (2) built/called (in which case no other 'top-level' `Network`s may
+          share this name).
+        If unspecified or None, the `Network` will be named using its class
+        name, with a number appended if necessary for uniqueness (e.g. MyNetwork
+        -> 'my_network_1').
+
+    Raises:
+      ValueError: If `name` is not valid. Note that some naming errors will
+        instead be raised when the `Network` is called.
+    """
     if isinstance(name, variable_scope.VariableScope):
       raise ValueError("VariableScopes are not valid Network names.")
     if name is not None and "/" in name:
@@ -210,8 +144,17 @@ class Network(base.Layer):
     self._owned_layers = {}
     # The scope to use if we end up without a parent.
     self._default_parent_variable_scope = variable_scope.get_variable_scope()
-    self._custom_getter, self._deferred_restorations = (
-        _make_custom_getter_for_deferred_restorations())
+    # Hold on to the variable scope counts from init to check whether a scope
+    # with the name we want was ever created in our parent scope. Without this
+    # check we might have name collisions if the parent scope on init gets
+    # closed before build is called.
+    self._variable_scope_counts_on_init = (
+        variable_scope._get_default_variable_store().variable_scopes_count)
+
+  def _name_scope_name(self, current_variable_scope):
+    """Overrides Layer op naming to match variable naming."""
+    return _network_name_scope_naming(
+        current_variable_scope=current_variable_scope)
 
   def _init_set_name(self, name):
     # Anonymous Networks (name=None) defer setting a final name until they are
@@ -227,18 +170,30 @@ class Network(base.Layer):
 
   def _finalize_name(self, parent_network):
     if not self._name:
-      if not parent_network:
-        name_uid_map = base._get_default_graph_uid_map()
-      else:
-        name_uid_map = parent_network._sub_layer_name_uids
       # Were were not passed a name explicitly (or it was blank), so this is an
       # anonymous Network. We make up a unique name.
       if parent_network:
         avoid_names = parent_network._owned_layers
+        name_uid_map = parent_network._sub_layer_name_uids
       else:
-        avoid_names = None
+        name_uid_map = base._get_default_graph_uid_map()
+        # Figure out which names we have to avoid based on which variable scope
+        # we're nested in.
+        strip_name = self._default_parent_variable_scope.name
+        if strip_name:
+          strip_name += "/"
+        def _strip_on_init_scope(name):
+          if name.startswith(strip_name):
+            return name[len(strip_name):]
+          else:
+            return None
+        avoid_names = set(
+            _strip_on_init_scope(name)
+            for name in self._variable_scope_counts_on_init.keys() if name)
       self._name, self._base_name = self._make_unique_name(
-          name_uid_map=name_uid_map, avoid_names=avoid_names)
+          name_uid_map=name_uid_map, avoid_names=avoid_names,
+          namespace=self._default_parent_variable_scope.name,
+          zero_based=True)
     if self._first_parent is None or (self._first_parent  # False = no parent
                                       and self._first_parent() is None):
       # Save a pointer to the parent Network so that we can later check that the
@@ -268,7 +223,13 @@ class Network(base.Layer):
         parent_scope = first_parent._scope
       else:
         parent_scope = self._default_parent_variable_scope
-      with variable_scope.variable_scope(parent_scope):
+      with variable_scope.variable_scope(parent_scope) as parent_vs:
+        expected_scope_name = parent_vs.name + "/" + self._name
+        if expected_scope_name in self._variable_scope_counts_on_init:
+          raise ValueError(
+              ("A Network named '%s' already exists (or a variable_scope was "
+               "created with this name). Names must be unique.") % (
+                   self._name,))
         # Make sure variables with this prefix will be unique.
         with variable_scope.variable_scope(
             None, use_resource=True, default_name=self._name) as scope:
@@ -285,25 +246,22 @@ class Network(base.Layer):
                  "created with this name). Names must be unique.") % (
                      self._name,))
           if (first_parent
-              and scope_prefix[:-1] != first_parent._scope.name):
+              and scope_prefix[:-1] != first_parent.scope_name):
             raise ValueError(
                 ("Network variable names must match a nesting of sub-Network "
                  "names. Expected prefix '%s' from parent network, but got "
                  "'%s' when attempting to create a variable_scope for Network "
                  "'%s'. Likely an explicit variable_scope was inserted into "
                  "the nesting.") % (
-                     first_parent._scope.name,
+                     first_parent.scope_name,
                      scope_prefix[:-1],
                      self._name))
           elif not first_parent and scope_prefix:
             # For the case when this Network is not nested inside any other
-            # Network, but is in a variable_scope. This is an error for now.
-            raise ValueError(
-                "Creating Networks inside named variable_scopes is currently "
-                "not supported (to ensure that variable names match the names "
-                "of Networks in which they were first created). To set "
-                "options, try `with tf.variable_scope(''):`. If this "
-                "limitation bothers you, please file a feature request.")
+            # Network, but is in a variable_scope. This Network's name takes on
+            # the full variable scope prefix.
+            self._name = scope_name
+
       for non_network_sublayer in self._non_network_sublayers:
         self._set_scope_for_nonnetwork_sublayer(non_network_sublayer)
 
@@ -321,7 +279,8 @@ class Network(base.Layer):
         raise ValueError(
             ("The parent of a Layer added to Network %s was garbage collected "
              "before the Layer was built. If this limitation bothers you "
-             "please, file a feature request.") % (self.name,))
+             "please file a feature request.") %
+            (self.name,))
       with variable_scope.variable_scope(parent_scope):
         # Horrid hack to make Layer variable names which are direct
         # sub-layers of Networks conform to the Network variable naming
@@ -330,6 +289,9 @@ class Network(base.Layer):
             None, use_resource=True,
             default_name=sublayer.name) as sub_scope:
           sublayer._scope = sub_scope
+          # Also switch op naming for this Layer to match Network conventions,
+          # i.e. op naming matching variable naming.
+          sublayer._name_scope_name = _network_name_scope_naming
 
   @base.Layer.name.getter
   def name(self):
@@ -384,7 +346,10 @@ class Network(base.Layer):
             # name, and we should respect it (subject to error checking).
             layer._name, layer._base_name = layer._make_unique_name(
                 name_uid_map=self._sub_layer_name_uids,
-                avoid_names=self._owned_layers)
+                avoid_names=self._owned_layers,
+                zero_based=True
+                # No namespace required, since we've specified our own UID map.
+            )
           layer._first_parent = weakref.ref(self)
         self._non_network_sublayers.append(layer)
     if (not layer.built
@@ -486,272 +451,30 @@ class Network(base.Layer):
         "at https://github.com/tensorflow/tensorflow/issues/new if this is "
         "important to you")
 
-  def _strip_variable_prefix(self, original_variable_name):
-    """The default map_func for saving or restoring variables.
-
-    Strips the variable prefix for the Network on which save/restore was called,
-    and leaves other variable names fully qualified in the checkpoint.
-
-    Args:
-      original_variable_name: The _shared_name of the variable (no :0
-        suffix) to map.
-    Returns:
-      The checkpoint name of the variable.
-    """
-    scope_name_with_slash = self.scope_name + "/"
-    if original_variable_name.startswith(scope_name_with_slash):
-      return original_variable_name[len(scope_name_with_slash):]
-    else:
-      return original_variable_name
-
-  def save(self, save_path, global_step=None, map_func=None):
-    """Save variables from the Network to a checkpoint.
+  def add_loss(self, losses, inputs=None):
+    raise RuntimeError(
+        "add_loss is not supported in Network class yet. Please file an issue "
+        "at https://github.com/tensorflow/tensorflow/issues/new if this is "
+        "important to you")
 
-    Args:
-      save_path: Either a checkpoint prefix or the name of a directory to save
-        the checkpoint in (in which case the checkpoint will be named based on
-        the Network name).
-      global_step: The global step to use when naming the checkpoint. If None
-        (default), we will first try to get the default global step. If that
-        fails because no default global step exists, then the checkpoint is
-        created without a global step suffix.
-      map_func: A function mapping fully qualified variable names
-        (e.g. 'my_network_1/dense_1/kernel') to names in the checkpoint. By
-        default (if `map_func=None`), the variable prefix for the network being
-        restored (`Network.scope_name + '/'`, e.g. 'my_network_1/') is stripped
-        and all other variable names (shared with other Networks) are left
-        unchanged.
-    Returns:
-      The checkpoint prefix for the saved checkpoint, which may be passed to
-      `Network.restore`.
-    Raises:
-      ValueError: If the Network has not yet been called, or if map_func results
-        in a name collision.
-    """
-    if not self.built:
-      raise ValueError(
-          "Attempt to save the Network before it was first called. This means "
-          "variables have not yet been created, so there is nothing to save.")
-    self._set_scope()  # scope_name should be available to map_funcs
-    if global_step is None:
-      global_step = training_util.get_global_step()
-    if os.path.isdir(save_path):
-      # If we were passed a directory, default to naming based on the Network
-      # name.
-      save_path = os.path.join(save_path, self.name)
-    user_map_func = map_func
-    if map_func is None:
-      map_func = self._strip_variable_prefix
-    variable_map = {}
-    for variable in self.variables:
-      mapped_name = map_func(variable._shared_name)
-      if variable_map.setdefault(mapped_name, variable) is not variable:
-        if user_map_func is None:
-          # Instead of erroring out, we could just re-try and silently use the
-          # full variable names in the checkpoint. This could be odd for deeply
-          # nested sub-Networks (since the full prefix from the nesting would
-          # get added), so for now we'll let the user deal with this case.
-          raise ValueError(_default_naming_conflict_error_message(
-              mapped_name=mapped_name,
-              first_variable=variable_map[mapped_name],
-              second_variable=variable,
-              network_name=self.name,
-              network_scope_name=self.scope_name))
-        else:
-          # The user passed their own problematic map_func.
-          raise ValueError(
-              ("The map_func passed to Network.save for the Network '%s' "
-               "resulted in two variables named '%s' ('%s' and '%s'). Try "
-               "stripping less from the variable names, or renaming parts of "
-               "the Network. For reference, variables created by sub-Layers of "
-               "this Network are prefixed with '%s', but if they are re-used "
-               "after being added to another Network, they will have that "
-               "Network's full variable prefix instead.") % (
-                   self.name, mapped_name,
-                   variable_map[mapped_name]._shared_name,
-                   variable._shared_name,
-                   self.scope_name))
-    if context.in_eager_mode():
-      sess = None
-    else:
-      sess = ops.get_default_session()
-    return saver_lib.Saver(variable_map).save(
-        sess=sess, save_path=save_path, write_meta_graph=False,
-        global_step=global_step)
+  @property
+  def losses(self):
+    """Gather losses from `Layer`s in the `Network`.
 
-  def _restore_existing_variables(self, save_path, map_func, user_map_func):
-    """Use a standard Saver to restore existing variables from a checkpoint.
+    Note that when executing eagerly, `Layer.losses` evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
 
-    Args:
-      save_path: The checkpoint prefix or directory to read from.
-      map_func: The function to use when mapping from variable names to
-        checkpoint names.
-      user_map_func: The original map_func passed by the user, for error
-        checking.
     Returns:
-      A dictionary mapping from checkpoint names to variable objects which have
-      been restored (for bookkeeping to avoid deferred restorations on these
-      variables).
-    Raises:
-      ValueError: If there is a name collision.
-    """
-    existing_variables_by_checkpoint_name = {}
-    for variable in self.variables:
-      checkpoint_name = map_func(variable._shared_name)
-      if existing_variables_by_checkpoint_name.setdefault(
-          checkpoint_name, variable) is not variable:
-        if user_map_func is None:
-          raise ValueError(_default_naming_conflict_error_message(
-              mapped_name=checkpoint_name,
-              first_variable=existing_variables_by_checkpoint_name[
-                  checkpoint_name],
-              second_variable=variable,
-              network_name=self.name,
-              network_scope_name=self.scope_name))
-        else:
-          raise ValueError(_restore_custom_map_func_error_message(
-              mapped_name=checkpoint_name,
-              first_variable=existing_variables_by_checkpoint_name[
-                  checkpoint_name],
-              second_variable=variable,
-              network_name=self.name,
-              network_scope_name=self.scope_name))
-    if existing_variables_by_checkpoint_name:
-      if context.in_eager_mode():
-        sess = None
-      else:
-        sess = ops.get_default_session()
-      saver_lib.Saver(var_list=existing_variables_by_checkpoint_name).restore(
-          sess=sess, save_path=save_path)
-    return existing_variables_by_checkpoint_name
-
-  def _set_restore_on_create(self, save_path, map_func, user_map_func,
-                             existing_variables_by_checkpoint_name):
-    """If necessary, request deferred restorations of variables."""
-    checkpoint_reader = checkpoint_utils.load_checkpoint(save_path)
-    checkpointed_variables_to_restore = {}
-    for checkpoint_name, _ in checkpoint_utils.list_variables(save_path):
-      if checkpoint_name in existing_variables_by_checkpoint_name:
-        # This variable was already created and restored.
-        continue
-      # Save the variable for later restoration in a custom getter.
-      checkpointed_variables_to_restore[checkpoint_name] = (
-          checkpoint_reader.get_tensor(checkpoint_name))
-    # Only set a deferred restoration if there are checkpoint variables which
-    # have not been assigned to existing variables. Note that this loses out on
-    # some opportunity for error checking, but avoids creating
-    # _DeferredRestoration objects once a Network has been built (so that
-    # restoring in a loop does not take increasing amounts of memory).
-    if checkpointed_variables_to_restore:
-      if context.in_eager_mode():
-        sess = None
-      else:
-        sess = ops.get_default_session()
-      # We need a name for error messages. If we haven't been added to another
-      # Network yet, we're top-level.
-      self._finalize_name(False)
-      self._set_scope()
-      # Save a record of this restoration for use in the custom getter.
-      deferred_restoration = _DeferredRestoration(
-          map_func=map_func,
-          map_func_is_user=(user_map_func is not None),
-          checkpointed_variables_to_restore=checkpointed_variables_to_restore,
-          restored_variables={},
-          session=sess,
-          network_name=self.name,
-          network_scope_name=self.scope_name)
-      self._deferred_restorations.append(deferred_restoration)
-      # Add the deferred registration to non-Network children, and request that
-      # Networks propagate the request to their children.
-      self._add_deferred_restoration(deferred_restoration)
-
-  def _add_deferred_restoration(self, deferred_restoration):
-    """Add a deferred restoration to this Network and all children.
-
-    Restorations which are requested later have higher priority, and the highest
-    priority matching restoration is applied to a variable when it is created.
-
-    Args:
-      deferred_restoration: A _DeferredRestoration object.
+      A list of tensors.
     """
-    # Networks don't create variables at the moment, so this append isn't
-    # strictly necessary. We could get by with only adding deferred restorations
-    # to non-Network Layers.
-    self._set_scope()
-    # We use set_custom_getter because it avoids recursively calling up the
-    # variable_scope tree. We've done the tree traversal ourselves and have
-    # added the request to each Layer which needs it.
-    self._scope.set_custom_getter(self._custom_getter)
-    self._deferred_restorations.append(deferred_restoration)
+    layer_losses = []
     for layer in self.layers:
-      if isinstance(layer, Network):
-        # For Networks, request that they propagate this deferred restoration
-        # to all of their children recursively.
-        layer._add_deferred_restoration(deferred_restoration)
-      else:
-        # For non-Network Layers, make sure they have a deferred restoration
-        # queue and a custom getter, then add our request to it.
-        if not hasattr(layer, "_custom_getter"):
-          assert not hasattr(layer, "_deferred_restorations")
-          layer._custom_getter, layer._deferred_restorations = (
-              _make_custom_getter_for_deferred_restorations())
-          self._set_scope_for_nonnetwork_sublayer(layer)
-          layer._scope.set_custom_getter(layer._custom_getter)
-        layer._deferred_restorations.append(deferred_restoration)
-
-  def restore(self, save_path, map_func=None):
-    """Restore the Network from a checkpoint.
-
-    If variables have already been created (typically when some or all of the
-    `Network` is built), they are assigned values from the checkpoint
-    immediately, overwriting any existing values (in graph mode the default
-    session is used for the assignments).
-
-    If there are checkpoint entries which do not correspond to any existing
-    variables in the `Network`, these values are saved for deferred restoration;
-    their initial values will be the checkpointed values once they are
-    created. Requests for multiple deferred restorations behave the same way as
-    immediate restorations, in that later requests will take priority over
-    earlier requests relevant to the same variable.
-
-    If this `Network` shares `Layer`s with another network, those `Layer`s will
-    also have their variables restored from the checkpoint.
-
-    Args:
-      save_path: The return value of `Network.save`, or a directory to search
-        for a checkpoint.
-      map_func: A function mapping fully qualified variable names
-        (e.g. 'my_network_1/dense_1/kernel') to names in the checkpoint. By
-        default (if `map_func=None`), the variable prefix for the network being
-        restored (`Network.scope_name + '/'`, e.g. 'my_network_1/') is stripped
-        and all other variable names (shared with other Networks) are left
-        unchanged. Note that this is the _same_ map_func as `Network.save`, not
-        an inverse mapping.
-    """
-    self._finalize_name(parent_network=False)
-    self._set_scope()  # scope_name should be available to map_funcs
-    if os.path.isdir(save_path):
-      # If we don't have a name yet, set no parent.
-      save_path = os.path.join(save_path, self.name)
-    user_map_func = map_func
-    if map_func is None:
-      map_func = self._strip_variable_prefix
-    # Step one is to restore any existing variables from the checkpoint.
-    existing_variables_by_checkpoint_name = self._restore_existing_variables(
-        save_path=save_path,
-        map_func=map_func,
-        user_map_func=user_map_func)
-    # Step two is to set a custom getter which restores variables on creation,
-    # for those variables which have not been added to sub-Layers yet.
-    self._set_restore_on_create(
-        save_path=save_path,
-        map_func=map_func,
-        user_map_func=user_map_func,
-        existing_variables_by_checkpoint_name=(
-            existing_variables_by_checkpoint_name))
+      layer_losses.extend(layer.losses)
+    return layer_losses
 
-  # TODO(josh11b): Support other Layer methods needed for graph mode, such as for
-  # losses and updates
+  # TODO(allenl): Support other Layer methods needed for graph mode, such as for
+  # updates
 
 
 class Sequential(Network):
@@ -799,3 +522,436 @@ class Sequential(Network):
         else:
           inputs = l(inputs)
     return inputs
+
+
+_DeferredRestoration = collections.namedtuple(
+
+    "_DeferredRestoration",
+    [
+        # The map_func to use (either user-specified or the default).
+        "map_func",
+        # Boolean, True if the user specified an explicit map_func, for error
+        # messages.
+        "map_func_is_user",
+        # A mapping from checkpoint names to initial values of not-yet-created
+        # variables which should be restored. These values come from parsing a
+        # checkpoint.
+        "checkpointed_variables_to_restore",
+        # A mapping from checkpoint name to variable objects of variables which
+        # have already been restored, for error checking.
+        "restored_variables",
+        # The session to restore with (if in graph mode).
+        "session",
+        # Names of the Network where the restore was requested, for error
+        # messages.
+        "network_name",
+        "network_scope_name"
+    ])
+
+
+def _default_naming_conflict_error_message(
+    mapped_name, first_variable, second_variable,
+    network_name, network_scope_name):
+  return (
+      ("The default checkpoint variable name mapping strategy for Network "
+       "'%s' resulted in a naming conflict. We attempted to strip off the "
+       "variable prefix for the Network ('%s'), but this resulted in two "
+       "variables named '%s' (originally '%s' and '%s'). This should only "
+       "happen when using variable sharing (i.e. the Network contains Networks "
+       "or Layers which were first added to another Network, and therefore "
+       "have that Network's variable prefix). One solution is to pass "
+       "`map_func=lambda n: n` to save and restore to use fully qualified "
+       "variable names in the checkpoint, although this will require that the "
+       "variable prefix of the Network being restored into is also '%s'. You "
+       "may alternatively write an arbitrary mapping.")
+      % (
+          network_name, network_scope_name, mapped_name,
+          first_variable._shared_name,
+          second_variable._shared_name, network_scope_name
+      ))
+
+
+def _restore_custom_map_func_error_message(
+    mapped_name, first_variable, second_variable,
+    network_name, network_scope_name):
+  return (
+      ("The map_func passed to restore_network_checkpoint for the Network '%s' "
+       "resulted in two variables named '%s' (originally '%s' and '%s'). Since "
+       "this is also an error when saving, this Network was "
+       "probably not saved with this map_func. Note that map_func "
+       "always maps from full variable names to checkpoint names; "
+       "there is no need to specify an inverse mapping.\n\n"
+       "Try stripping less from the variable names, or renaming parts "
+       "of the Network. For reference, variables created by sub-Layers "
+       "of this Network are prefixed with '%s', but if they are "
+       "re-used after being added to another Network they will have "
+       "that Network's full variable prefix instead.") % (
+           network_name, mapped_name,
+           first_variable._shared_name,
+           second_variable._shared_name,
+           network_scope_name))
+
+
+def _make_custom_getter_for_deferred_restorations():
+  """Returns a custom getter which searches `deferred_restorations`.
+
+  Returns: A tuple of (_custom_getter, deferred_restorations)
+    _custom_getter: The getter which should be added to variable_scopes where
+      variables will be created.
+    deferred_restorations: A list for _DeferredRestoration objects. Typically
+      empty when the getter is set, and expanded as deferred restorations are
+      requested. All new deferred restorations should be appended to the end of
+      the list, where they will have priority over older deferred restorations.
+  """
+  deferred_restorations = []
+
+  def _custom_getter(getter, name, shape=None, dtype=None,
+                     initializer=None,
+                     *args, **kwargs):
+    """A custom getter which processes deferred restorations."""
+    # Iterate over restorations, newest first (newer restorations will take
+    # precedence over older restorations, just like with immediate restorations
+    # into existing variables).
+    delayed_restoration = None
+    found_value = False
+    value_to_restore = None
+    for delayed_restoration in reversed(
+        deferred_restorations):
+      checkpoint_name = delayed_restoration.map_func(name)
+      if (checkpoint_name
+          in delayed_restoration.checkpointed_variables_to_restore):
+        found_value = True
+        value_to_restore = (
+            delayed_restoration.checkpointed_variables_to_restore[
+                checkpoint_name])
+      if found_value:
+        break
+    # value_to_restore may be False because this variable is not in any
+    # checkpoint we are restoring, or None because we have explicitly set it to
+    # None when it was previously fetched. In either case, we don't need to
+    # set an initializer.
+    if found_value and value_to_restore is not None:
+      initializer = value_to_restore
+      shape = None
+    variable = getter(name, shape=shape, dtype=dtype, initializer=initializer,
+                      *args, **kwargs)
+    if found_value and value_to_restore is not None:
+      # Mark as already restored from this checkpoint.
+      delayed_restoration.checkpointed_variables_to_restore[
+          checkpoint_name] = None
+      if context.in_graph_mode():
+        delayed_restoration.session.run(variable.initializer)
+    if found_value:
+      # Error checking should run even if we've already restored a value.
+      if delayed_restoration.restored_variables.setdefault(
+          checkpoint_name, variable) is not variable:
+        # Naming conflict. We've tried to initialize two variables with the
+        # same value from the checkpoint.
+        if delayed_restoration.map_func_is_user:
+          raise ValueError(
+              _restore_custom_map_func_error_message(
+                  mapped_name=checkpoint_name,
+                  first_variable=delayed_restoration.restored_variables[
+                      checkpoint_name],
+                  second_variable=variable,
+                  network_name=delayed_restoration.network_name,
+                  network_scope_name=delayed_restoration.network_scope_name))
+        else:
+          raise ValueError(
+              _default_naming_conflict_error_message(
+                  mapped_name=checkpoint_name,
+                  first_variable=delayed_restoration.restored_variables[
+                      checkpoint_name],
+                  second_variable=variable,
+                  network_name=delayed_restoration.network_name,
+                  network_scope_name=delayed_restoration.network_scope_name))
+    return variable
+  return _custom_getter, deferred_restorations
+
+
+def _make_prefix_stripping_map_fn(scope_name):
+  """Closure for stripping the scope name of a Network.
+
+  Implemented as a closure rather than a member function to avoid reference
+  cycles in deferred restorations (this function should not have a reference to
+  the Network which created it).
+
+  Args:
+    scope_name: The Network.scope_name to strip from variables.
+  Returns:
+    A scope_name-stripping default `map_fn` for the Network.
+  """
+
+  def _strip_variable_prefix(original_variable_name):
+    """The default map_func for saving or restoring variables.
+
+    Strips the variable prefix for the Network on which save/restore was called,
+    and leaves other variable names fully qualified in the checkpoint.
+
+    Args:
+      original_variable_name: The _shared_name of the variable (no :0
+        suffix) to map.
+    Returns:
+      The checkpoint name of the variable.
+    """
+    scope_name_with_slash = scope_name + "/"
+    if original_variable_name.startswith(scope_name_with_slash):
+      return original_variable_name[len(scope_name_with_slash):]
+    else:
+      return original_variable_name
+
+  return _strip_variable_prefix
+
+
+def save_network_checkpoint(
+    network, save_path, global_step=None, map_func=None):
+  """Save variables from the Network to a checkpoint.
+
+  Args:
+    network: A Network object to save.
+    save_path: Either a checkpoint prefix or the name of a directory to save
+      the checkpoint in (in which case the checkpoint will be named based on
+      the Network name).
+    global_step: The global step to use when naming the checkpoint. If None
+      (default), we will first try to get the default global step. If that
+      fails because no default global step exists, then the checkpoint is
+      created without a global step suffix.
+    map_func: A function mapping fully qualified variable names
+      (e.g. 'my_network_1/dense_1/kernel') to names in the checkpoint. By
+      default (if `map_func=None`), the variable prefix for the network being
+      restored (`Network.scope_name + '/'`, e.g. 'my_network_1/') is stripped
+      and all other variable names (shared with other Networks) are left
+      unchanged.
+  Returns:
+    The checkpoint prefix for the saved checkpoint, which may be passed to
+    `Network.restore`.
+  Raises:
+    ValueError: If the Network has not yet been called, or if map_func results
+      in a name collision.
+  """
+  if not network.built:
+    raise ValueError(
+        "Attempt to save the Network before it was first called. This means "
+        "variables have not yet been created, so there is nothing to save.")
+  network._set_scope()  # scope_name should be available to map_funcs
+  if global_step is None:
+    global_step = training_util.get_global_step()
+  if os.path.isdir(save_path):
+    # If we were passed a directory, default to naming based on the Network
+    # name.
+    save_path = os.path.join(save_path, network.name.replace("/", "_"))
+  user_map_func = map_func
+  if map_func is None:
+    map_func = _make_prefix_stripping_map_fn(network.scope_name)
+  variable_map = {}
+  for variable in network.variables:
+    mapped_name = map_func(variable._shared_name)
+    if variable_map.setdefault(mapped_name, variable) is not variable:
+      if user_map_func is None:
+        # Instead of erroring out, we could just re-try and silently use the
+        # full variable names in the checkpoint. This could be odd for deeply
+        # nested sub-Networks (since the full prefix from the nesting would
+        # get added), so for now we'll let the user deal with this case.
+        raise ValueError(_default_naming_conflict_error_message(
+            mapped_name=mapped_name,
+            first_variable=variable_map[mapped_name],
+            second_variable=variable,
+            network_name=network.name,
+            network_scope_name=network.scope_name))
+      else:
+        # The user passed their own problematic map_func.
+        raise ValueError(
+            ("The map_func passed to save_network_checkpoint for the Network "
+             "'%s' resulted in two variables named '%s' ('%s' and '%s'). Try "
+             "stripping less from the variable names, or renaming parts of "
+             "the Network. For reference, variables created by sub-Layers of "
+             "this Network are prefixed with '%s', but if they are re-used "
+             "after being added to another Network, they will have that "
+             "Network's full variable prefix instead.") % (
+                 network.name, mapped_name,
+                 variable_map[mapped_name]._shared_name,
+                 variable._shared_name,
+                 network.scope_name))
+  if context.in_eager_mode():
+    sess = None
+  else:
+    sess = ops.get_default_session()
+  return saver_lib.Saver(variable_map).save(
+      sess=sess, save_path=save_path, write_meta_graph=False,
+      global_step=global_step)
+
+
+def _add_deferred_restoration(layer, deferred_restoration):
+  """Add a deferred restoration to this Layer and all children.
+
+  Restorations which are requested later have higher priority, and the highest
+  priority matching restoration is applied to a variable when it is created.
+
+  Args:
+    layer: The Layer (may not be a Network) to operate on.
+    deferred_restoration: A _DeferredRestoration object.
+  """
+  # Networks don't create variables at the moment, so this append isn't strictly
+  # necessary. We could get by with only adding deferred restorations to
+  # non-Network Layers.
+  if isinstance(layer, Network):
+    layer._set_scope()
+  # Make sure this Layer has a deferred restoration queue and a custom getter,
+  # then add our request to it.
+  if not hasattr(layer, "_custom_getter"):
+    assert not hasattr(layer, "_deferred_restorations")
+    layer._custom_getter, layer._deferred_restorations = (
+        _make_custom_getter_for_deferred_restorations())
+  # We use set_custom_getter because it avoids recursively calling up the
+  # variable_scope tree. We've done the tree traversal ourselves and have added
+  # the request to each Layer which needs it.
+  layer._scope.set_custom_getter(layer._custom_getter)
+  layer._deferred_restorations.append(deferred_restoration)
+  if isinstance(layer, Network):
+    for sublayer in layer.layers:
+      if not isinstance(sublayer, Network):
+        layer._set_scope_for_nonnetwork_sublayer(sublayer)
+      _add_deferred_restoration(sublayer, deferred_restoration)
+
+
+def _restore_existing_variables(network, save_path, map_func, user_map_func):
+  """Use a standard Saver to restore existing variables from a checkpoint.
+
+  Args:
+    network: A Network object to restore.
+    save_path: The checkpoint prefix or directory to read from.
+    map_func: The function to use when mapping from variable names to
+      checkpoint names.
+    user_map_func: The original map_func passed by the user, for error
+      checking.
+  Returns:
+    A dictionary mapping from checkpoint names to variable objects which have
+    been restored (for bookkeeping to avoid deferred restorations on these
+    variables).
+  Raises:
+    ValueError: If there is a name collision.
+  """
+  existing_variables_by_checkpoint_name = {}
+  for variable in network.variables:
+    checkpoint_name = map_func(variable._shared_name)
+    if existing_variables_by_checkpoint_name.setdefault(
+        checkpoint_name, variable) is not variable:
+      if user_map_func is None:
+        raise ValueError(_default_naming_conflict_error_message(
+            mapped_name=checkpoint_name,
+            first_variable=existing_variables_by_checkpoint_name[
+                checkpoint_name],
+            second_variable=variable,
+            network_name=network.name,
+            network_scope_name=network.scope_name))
+      else:
+        raise ValueError(_restore_custom_map_func_error_message(
+            mapped_name=checkpoint_name,
+            first_variable=existing_variables_by_checkpoint_name[
+                checkpoint_name],
+            second_variable=variable,
+            network_name=network.name,
+            network_scope_name=network.scope_name))
+  if existing_variables_by_checkpoint_name:
+    if context.in_eager_mode():
+      sess = None
+    else:
+      sess = ops.get_default_session()
+    saver_lib.Saver(var_list=existing_variables_by_checkpoint_name).restore(
+        sess=sess, save_path=save_path)
+  return existing_variables_by_checkpoint_name
+
+
+def _set_restore_on_create(network, save_path, map_func, user_map_func,
+                           existing_variables_by_checkpoint_name):
+  """If necessary, request deferred restorations of variables."""
+  checkpoint_reader = checkpoint_utils.load_checkpoint(save_path)
+  checkpointed_variables_to_restore = {}
+  for checkpoint_name, _ in checkpoint_utils.list_variables(save_path):
+    if checkpoint_name in existing_variables_by_checkpoint_name:
+      # This variable was already created and restored.
+      continue
+    # Save the variable for later restoration in a custom getter.
+    checkpointed_variables_to_restore[checkpoint_name] = (
+        checkpoint_reader.get_tensor(checkpoint_name))
+  # Only set a deferred restoration if there are checkpoint variables which
+  # have not been assigned to existing variables. Note that this loses out on
+  # some opportunity for error checking, but avoids creating
+  # _DeferredRestoration objects once a Network has been built (so that
+  # restoring in a loop does not take increasing amounts of memory).
+  if checkpointed_variables_to_restore:
+    if context.in_eager_mode():
+      sess = None
+    else:
+      sess = ops.get_default_session()
+    # We need a name for error messages. If we haven't been added to another
+    # Network yet, we're top-level.
+    network._finalize_name(False)
+    network._set_scope()
+    # Save a record of this restoration for use in the custom getter.
+    deferred_restoration = _DeferredRestoration(
+        map_func=map_func,
+        map_func_is_user=(user_map_func is not None),
+        checkpointed_variables_to_restore=checkpointed_variables_to_restore,
+        restored_variables={},
+        session=sess,
+        network_name=network.name,
+        network_scope_name=network.scope_name)
+    # Add the deferred registration to non-Network children, and request that
+    # Networks propagate the request to their children.
+    _add_deferred_restoration(network, deferred_restoration)
+
+
+def restore_network_checkpoint(network, save_path, map_func=None):
+  """Restore the Network from a checkpoint.
+
+  If variables have already been created (typically when some or all of the
+  `Network` is built), they are assigned values from the checkpoint immediately,
+  overwriting any existing values (in graph mode the default session is used for
+  the assignments).
+
+  If there are checkpoint entries which do not correspond to any existing
+  variables in the `Network`, these values are saved for deferred restoration;
+  their initial values will be the checkpointed values once they are
+  created. Requests for multiple deferred restorations behave the same way as
+  immediate restorations, in that later requests will take priority over earlier
+  requests relevant to the same variable.
+
+  If this `Network` shares `Layer`s with another network, those `Layer`s will
+  also have their variables restored from the checkpoint.
+
+  Args:
+    network: A Network object to restore.
+    save_path: The return value of `tfe.save_network_checkpoint`, or a directory
+      to search for a checkpoint.
+    map_func: A function mapping fully qualified variable names
+      (e.g. 'my_network_1/dense_1/kernel') to names in the checkpoint. By
+      default (if `map_func=None`), the variable prefix for the network being
+      restored (`Network.scope_name + '/'`, e.g. 'my_network_1/') is stripped
+      and all other variable names (shared with other Networks) are left
+      unchanged. Note that this is the _same_ map_func as
+      `tfe.save_network_checkpoint`, not an inverse mapping.
+  """
+  network._finalize_name(parent_network=False)
+  network._set_scope()  # scope_name should be available to map_funcs
+  if os.path.isdir(save_path):
+    # If we don't have a name yet, set no parent.
+    save_path = os.path.join(save_path, network.name.replace("/", "_"))
+  user_map_func = map_func
+  if map_func is None:
+    map_func = _make_prefix_stripping_map_fn(network.scope_name)
+  # Step one is to restore any existing variables from the checkpoint.
+  existing_variables_by_checkpoint_name = _restore_existing_variables(
+      network=network,
+      save_path=save_path,
+      map_func=map_func,
+      user_map_func=user_map_func)
+  # Step two is to set a custom getter which restores variables on creation,
+  # for those variables which have not been added to sub-Layers yet.
+  _set_restore_on_create(
+      network=network,
+      save_path=save_path,
+      map_func=map_func,
+      user_map_func=user_map_func,
+      existing_variables_by_checkpoint_name=(
+          existing_variables_by_checkpoint_name))
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index c621f527c28306131bdba56d8427eaa787ba150b..3eb4f5f8b3954a7ed04d2ef1d4f119ad137e1e65 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -19,9 +19,13 @@ from __future__ import print_function
 import gc
 
 from tensorflow.contrib.eager.python import network
+from tensorflow.contrib.layers.python.layers import regularizers
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import math_ops
@@ -42,12 +46,28 @@ class MyNetwork(network.Network):
     return self.l1(x)
 
 
+class RegularizedNetwork(network.Network):
+
+  def __init__(self):
+    super(RegularizedNetwork, self).__init__()
+    self.l1 = self.track_layer(core.Dense(
+        1,
+        bias_regularizer=regularizers.l1_regularizer(2.0),
+        kernel_regularizer=regularizers.l1_regularizer(2.0)))
+    self.l2 = self.track_layer(core.Dense(
+        1,
+        bias_regularizer=regularizers.l1_regularizer(2.0)))
+
+  def call(self, values):
+    return self.l2(self.l1(values))
+
+
 class NetworkTest(test.TestCase):
 
   def _save_modify_load_network_built(self, net, global_step=None):
     checkpoint_directory = self.get_temp_dir()
-    checkpoint_path = net.save(
-        save_path=checkpoint_directory, global_step=global_step)
+    checkpoint_path = network.save_network_checkpoint(
+        network=net, save_path=checkpoint_directory, global_step=global_step)
     input_value = constant_op.constant([[42.0]])
     original_output = self.evaluate(net(input_value))
     for var in net.variables:
@@ -56,18 +76,18 @@ class NetworkTest(test.TestCase):
         self.evaluate(net(input_value)),
         original_output)
     # Either the returned explicit checkpoint path or the directory should work.
-    net.restore(save_path=checkpoint_directory)
+    network.restore_network_checkpoint(net, save_path=checkpoint_directory)
     self.assertAllEqual(
         original_output,
         self.evaluate(net(input_value)))
     for var in net.variables:
       self.evaluate(var.assign(var + 2.))
-    net.restore(save_path=checkpoint_path)
+    network.restore_network_checkpoint(net, save_path=checkpoint_path)
     self.assertAllEqual(
         original_output,
         self.evaluate(net(input_value)))
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testTrainableAttribute(self):
     net = network.Network()
     self.assertTrue(net.trainable)
@@ -75,7 +95,7 @@ class NetworkTest(test.TestCase):
       net.trainable = False
     self.assertTrue(net.trainable)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNetworkCall(self):
     net = MyNetwork(name="abcd")
     net(constant_op.constant([[2.0]]))  # Force variables to be created.
@@ -85,17 +105,36 @@ class NetworkTest(test.TestCase):
     result = net(constant_op.constant([[2.0]]))
     self.assertEqual(34.0, self.evaluate(result))
 
+  # TODO(akshayka): This test should be changed once an API for compiling
+  # `call` into a defun is implemented.
+  def testReplacingNetworkCallWithDefun(self):
+    net = MyNetwork(name="abcd")
+    x = constant_op.constant([[2.0]])
+    net(x)  # Force variables to be created.
+    self.evaluate(net.trainable_variables[0].assign([[17.0]]))
+
+    net.call = function.defun(net.call)
+    result = net(x)  # Build and execute the TensorFlow function
+    self.assertEqual(34.0, self.evaluate(result))
+
+    # Force the creation of another TensorFlow function by changing input shape
+    y = constant_op.constant([[1.0], [2.0]])
+    result = net(y)
+    self.assertAllEqual([[17.0], [34.0]], self.evaluate(result))
+
+  # TODO(allenl): This test creates garbage in some Python versions
   @test_util.run_in_graph_and_eager_modes()
   def testNetworkSaveRestoreAlreadyBuilt(self):
     net = MyNetwork(name="abcd")
     with self.assertRaisesRegexp(
         ValueError, "Attempt to save the Network before it was first called"):
-      net.save(self.get_temp_dir())
+      network.save_network_checkpoint(net, self.get_temp_dir())
     net(constant_op.constant([[2.0]]))
     self.evaluate(net.trainable_variables[0].assign([[17.0]]))
     self._save_modify_load_network_built(net, global_step=None)
     self._save_modify_load_network_built(net, global_step=10)
 
+  # TODO(allenl): This test creates garbage in some Python versions
   @test_util.run_in_graph_and_eager_modes()
   def testSaveRestoreDefaultGlobalStep(self):
     net = MyNetwork(name="abcd")
@@ -103,9 +142,10 @@ class NetworkTest(test.TestCase):
     self.evaluate(net.variables[0].assign([[3.]]))
     default_global_step = training_util.get_or_create_global_step()
     self.evaluate(default_global_step.assign(4242))
-    save_path = net.save(self.get_temp_dir())
+    save_path = network.save_network_checkpoint(net, self.get_temp_dir())
     self.assertIn("abcd-4242", save_path)
 
+  # TODO(allenl): This test creates garbage in some Python versions
   @test_util.run_in_graph_and_eager_modes()
   def testNetworkSaveAndRestoreIntoUnbuilt(self):
     save_dir = self.get_temp_dir()
@@ -113,16 +153,43 @@ class NetworkTest(test.TestCase):
     test_input = constant_op.constant([[2.0]])
     net1(test_input)
     self.evaluate(net1.trainable_variables[0].assign([[17.0]]))
-    save_path = net1.save(save_dir)
+    save_path = network.save_network_checkpoint(net1, save_dir)
     # With a pre-build restore we should have the same value.
     net2 = MyNetwork()
-    net2.restore(save_path)
+    network.restore_network_checkpoint(net2, save_path)
     self.assertAllEqual(self.evaluate(net1(test_input)),
                         self.evaluate(net2(test_input)))
     self.assertIsNot(net1.variables[0], net2.variables[0])
     self.assertAllEqual(self.evaluate(net1.variables[0]),
                         self.evaluate(net2.variables[0]))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testNetworkMatchesLayerVariableNames(self):
+    zero = constant_op.constant([[0.]])
+    layer_one = core.Dense(1, use_bias=False)
+    layer_one(zero)
+    layer_two = core.Dense(1, use_bias=False)
+    layer_two(zero)
+
+    class TwoLayerNet(network.Network):
+
+      def __init__(self, name=None):
+        super(TwoLayerNet, self).__init__(name=name)
+        self.first = self.track_layer(core.Dense(
+            1, use_bias=False))
+        self.second = self.track_layer(core.Dense(
+            1, use_bias=False))
+
+      def call(self, x):
+        return self.second(self.first(x))
+
+    net = TwoLayerNet()
+    net(zero)
+    self.assertEqual("two_layer_net/" + layer_one.variables[0].name,
+                     net.first.variables[0].name)
+    self.assertEqual("two_layer_net/" + layer_two.variables[0].name,
+                     net.second.variables[0].name)
+
   @test_util.run_in_graph_and_eager_modes()
   def testLoadIntoUnbuiltSharedLayer(self):
 
@@ -170,14 +237,15 @@ class NetworkTest(test.TestCase):
     # Re-map the variable names so that with default restore mapping we'll
     # attempt to restore into the unbuilt Layer.
     name_mapping = {
-        "checkpoint_creator/first_layer/kernel": "owner_1/first_layer/kernel",
+        "checkpoint_creator/first_layer/kernel": "owner/first_layer/kernel",
         "checkpoint_creator/second_layer/kernel": "second_layer/kernel",
     }
-    save_path = checkpoint_creator.save(
+    save_path = network.save_network_checkpoint(
+        checkpoint_creator,
         self.get_temp_dir(),
         map_func=lambda full_name: name_mapping[full_name])
     load_into = User(use_layer=first_owner.first)
-    load_into.restore(save_path)
+    network.restore_network_checkpoint(load_into, save_path)
     self.assertEqual(0, len(first_owner.variables))
     self.assertAllEqual(self.evaluate(checkpoint_creator(one)),
                         self.evaluate(load_into(one)))
@@ -193,12 +261,13 @@ class NetworkTest(test.TestCase):
     del first_owner
     gc.collect()
     def _restore_map_func(original_name):
-      if original_name.startswith("owner_1"):
-        return original_name.replace("owner_1", "owner_2")
+      if original_name.startswith("owner/"):
+        return original_name.replace("owner/", "owner_1/")
       else:
-        return "user_2/" + original_name
+        return "user_1/" + original_name
     with self.assertRaisesRegexp(ValueError, "garbage collected"):
-      load_into.restore(save_path, map_func=_restore_map_func)
+      network.restore_network_checkpoint(
+          load_into, save_path, map_func=_restore_map_func)
 
   @test_util.run_in_graph_and_eager_modes()
   def testRestoreIntoSubNetwork(self):
@@ -218,17 +287,18 @@ class NetworkTest(test.TestCase):
     whole_model_saver(one)
     self.evaluate(whole_model_saver.variables[0].assign([[15.]]))
     self.evaluate(whole_model_saver.variables[1].assign([[16.]]))
-    whole_model_checkpoint = whole_model_saver.save(self.get_temp_dir())
+    whole_model_checkpoint = network.save_network_checkpoint(
+        whole_model_saver, self.get_temp_dir())
 
     save_from = MyNetwork()
     save_from(one)
     self.evaluate(save_from.variables[0].assign([[5.]]))
-    checkpoint = save_from.save(self.get_temp_dir())
+    checkpoint = network.save_network_checkpoint(save_from, self.get_temp_dir())
     save_into_parent = Parent()
-    save_into_parent.restore(whole_model_checkpoint)
-    save_into_parent.first.restore(checkpoint)
-    save_into_parent.first.restore(checkpoint)  # deferred loading multiple
-                                                # times is fine
+    network.restore_network_checkpoint(save_into_parent, whole_model_checkpoint)
+    network.restore_network_checkpoint(save_into_parent.first, checkpoint)
+    # deferred loading multiple times is fine
+    network.restore_network_checkpoint(save_into_parent.first, checkpoint)
     save_into_parent(one)  # deferred loading
     self.assertAllEqual([[5.]], self.evaluate(save_into_parent.variables[0]))
     self.assertAllEqual([[16.]], self.evaluate(save_into_parent.variables[1]))
@@ -237,9 +307,9 @@ class NetworkTest(test.TestCase):
     # (deferred restoration should happen the same way non-deferred happens,
     # with later restorations overwriting older ones).
     save_into_parent = Parent()
-    save_into_parent.first.restore(checkpoint)  # deferred loading multiple
-                                                # times is fine
-    save_into_parent.restore(whole_model_checkpoint)
+    # deferred loading multiple times is fine
+    network.restore_network_checkpoint(save_into_parent.first, checkpoint)
+    network.restore_network_checkpoint(save_into_parent, whole_model_checkpoint)
     save_into_parent(one)  # deferred loading
     # We've overwritten the sub-Network restore.
     self.assertAllEqual([[15.]], self.evaluate(save_into_parent.variables[0]))
@@ -247,12 +317,12 @@ class NetworkTest(test.TestCase):
 
     self.evaluate(save_into_parent.variables[0].assign([[3.]]))
     self.evaluate(save_into_parent.variables[1].assign([[4.]]))
-    save_into_parent.second.restore(checkpoint)
+    network.restore_network_checkpoint(save_into_parent.second, checkpoint)
     self.assertAllEqual([[5.]], self.evaluate(save_into_parent.variables[1]))
     with self.assertRaisesRegexp(errors_impl.NotFoundError,
                                  "not found in checkpoint"):
       # The checkpoint is incompatible.
-      save_into_parent.restore(checkpoint)
+      network.restore_network_checkpoint(save_into_parent, checkpoint)
 
   @test_util.run_in_graph_and_eager_modes()
   def testCustomMapCollisionErrors(self):
@@ -274,31 +344,36 @@ class NetworkTest(test.TestCase):
     self.evaluate(make_checkpoint.variables[1].assign([[3.]]))
     with self.assertRaisesRegexp(
         ValueError,
-        "The map_func passed to Network.save for the Network 'parent_1' "
-        "resulted in two variables named 'foo'"):
-      make_checkpoint.save(self.get_temp_dir(), map_func=lambda n: "foo")
-    checkpoint = make_checkpoint.first.save(
-        self.get_temp_dir(), map_func=lambda n: "foo")
+        "The map_func passed to save_network_checkpoint for the Network "
+        "'parent' resulted in two variables named 'foo'"):
+      network.save_network_checkpoint(
+          make_checkpoint, self.get_temp_dir(), map_func=lambda n: "foo")
+    checkpoint = network.save_network_checkpoint(
+        network=make_checkpoint.first,
+        save_path=self.get_temp_dir(),
+        map_func=lambda n: "foo")
     loader = Parent()
-    loader.restore(checkpoint, map_func=lambda n: "foo")
+    network.restore_network_checkpoint(
+        loader, checkpoint, map_func=lambda n: "foo")
     with self.assertRaisesRegexp(
         ValueError,
-        ("The map_func passed to Network.restore for the Network"
-         " 'parent_2' resulted in two variables named 'foo'")):
+        ("The map_func passed to restore_network_checkpoint for the Network"
+         " 'parent_1' resulted in two variables named 'foo'")):
       loader(one)
     loader = Parent()
     loader(one)
     with self.assertRaisesRegexp(
         ValueError,
-        ("The map_func passed to Network.restore for the Network"
-         " 'parent_3' resulted in two variables named 'foo'")):
-      loader.restore(checkpoint, map_func=lambda n: "foo")
+        ("The map_func passed to restore_network_checkpoint for the Network"
+         " 'parent_2' resulted in two variables named 'foo'")):
+      network.restore_network_checkpoint(
+          loader, checkpoint, map_func=lambda n: "foo")
 
   @test_util.run_in_graph_and_eager_modes()
   def testDefaultMapCollisionErrors(self):
 
     one = constant_op.constant([[1.]])
-    first = core.Dense(1, name="dense_1", use_bias=False)
+    first = core.Dense(1, name="dense", use_bias=False)
     first(one)
 
     class Parent(network.Network):
@@ -319,8 +394,8 @@ class NetworkTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         ("The default checkpoint variable name mapping strategy for Network "
-         "'parent_1' resulted in a naming conflict.")):
-      make_checkpoint.save(self.get_temp_dir())
+         "'parent' resulted in a naming conflict.")):
+      network.save_network_checkpoint(make_checkpoint, self.get_temp_dir())
 
     class Compatible(network.Network):
 
@@ -334,14 +409,15 @@ class NetworkTest(test.TestCase):
     successful_checkpoint = Compatible()
     successful_checkpoint(one)
     self.evaluate(successful_checkpoint.variables[0].assign([[-1.]]))
-    checkpoint_path = successful_checkpoint.save(self.get_temp_dir())
+    checkpoint_path = network.save_network_checkpoint(
+        successful_checkpoint, self.get_temp_dir())
     load_checkpoint = Parent()
     load_checkpoint(one)
     with self.assertRaisesRegexp(
         ValueError,
         ("The default checkpoint variable name mapping strategy for Network "
-         "'parent_2' resulted in a naming conflict.")):
-      load_checkpoint.restore(checkpoint_path)
+         "'parent_1' resulted in a naming conflict.")):
+      network.restore_network_checkpoint(load_checkpoint, checkpoint_path)
 
   def testNoReferenceCyclesAfterCall(self):
 
@@ -377,25 +453,67 @@ class NetworkTest(test.TestCase):
     gc.set_debug(previous_gc_debug_flags)
     gc.enable()
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testAnonymousNoNameInitially(self):
     net = MyNetwork()
     with self.assertRaisesRegexp(ValueError, "does not yet have a final name"):
       net.name  # pylint: disable=pointless-statement
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testExplicitHasNameInitially(self):
     net = MyNetwork(name="abcd")
     self.assertEqual("abcd", net.name)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testUsingResourceVariables(self):
     net = MyNetwork()
     net(constant_op.constant([[0.]]))
     self.assertIsInstance(net.trainable_weights[0],
                           resource_variable_ops.ResourceVariable)
 
-  @test_util.run_in_graph_and_eager_modes()
+  def testGraphOpNames(self):
+    """Network operation names should match variable naming."""
+
+    def _check_op_prefixes(expected_prefix, checked_ops):
+      for operation in ops.get_default_graph().get_operations():
+        if operation.name == "ignore":
+          continue
+        if operation.name in checked_ops:
+          continue
+        checked_ops.add(operation.name)
+        self.assertStartsWith(expected_start=expected_prefix,
+                              actual=operation.name)
+        self.assertNotIn("my_network", operation.name[len(expected_prefix):])
+        self.assertNotIn("dense", operation.name[len(expected_prefix):])
+
+    with context.graph_mode():
+      net = MyNetwork()
+      zero = constant_op.constant([[0.]], name="ignore")
+      net(zero)
+      checked_ops = set()
+      _check_op_prefixes(expected_prefix="my_network/dense/",
+                         checked_ops=checked_ops)
+      net.net2 = net.track_layer(MyNetwork())
+      net.net2(zero)
+      _check_op_prefixes(expected_prefix="my_network/my_network/dense/",
+                         checked_ops=checked_ops)
+      MyNetwork()(zero)
+      _check_op_prefixes(expected_prefix="my_network_1/dense/",
+                         checked_ops=checked_ops)
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testVariableRegularizers(self):
+    net = RegularizedNetwork()
+    net(constant_op.constant([[1.]]))
+    self.evaluate(net.variables[0].assign([[2.]]))
+    self.evaluate(net.variables[1].assign([3.]))
+    self.evaluate(net.variables[2].assign([[-2.]]))
+    self.evaluate(net.variables[3].assign([4.]))
+    self.assertAllEqual([4., 6., 8.], self.evaluate(net.losses))
+    self.evaluate(net.variables[3].assign([5.]))
+    self.assertAllEqual([4., 6., 10.], self.evaluate(net.losses))
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testDuplicateNameError(self):
     one = constant_op.constant([[1.]])
     net = MyNetwork(name="foo")
@@ -405,21 +523,105 @@ class NetworkTest(test.TestCase):
       net1 = MyNetwork(name="foo")
       net1(one)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testWrappingInVariableScope(self):
+    one = constant_op.constant([[1.]])
+    # Naming happens in the order of first build rather than the order of
+    # construction, but for clarity they're the same here and construction is
+    # annotated.
+    outside_net_before = MyNetwork()  # name=my_network
+    outside_net_before(one)
+    captured_scope = variable_scope.get_variable_scope()
     with variable_scope.variable_scope("outside_scope"):
-      net = MyNetwork()
-      one = constant_op.constant([[1.]])
-      with self.assertRaisesRegexp(
-          ValueError,
-          ("Creating Networks inside named variable_scopes is currently not "
-           "supported")):
-        net(one)
-      # Alternatively, we could re-name the Network to match the variable_scope:
-      # self.assertEqual("outside_scope/my_network_1", net.name)
-      # self.assertStartsWith(
-      #     expected_start="outside_scope/my_network_1/dense/",
-      #     actual=net.trainable_weights[0].name)
+      net1 = MyNetwork()  # name=outside_scope/my_network
+      net1(one)
+      name_conflict1 = MyNetwork(name="name_conflict")  # fine, unique so far
+      name_conflict2 = MyNetwork(name="name_conflict")  # error on build
+      with variable_scope.variable_scope("inside_scope"):
+        # No issue here since the name is unique within its scope.
+        name_conflict3 = MyNetwork(name="name_conflict")
+      net2 = MyNetwork()  # name=outside_scope/my_network_2 to avoid the
+                          # variable_scope my_network_1 below.
+      vs_name_conflict = MyNetwork(name="vs_name_conflict")  # conflict below
+    with variable_scope.variable_scope("intervening_scope"):
+      with variable_scope.variable_scope(captured_scope):
+        with variable_scope.variable_scope("outside_scope"):
+          name_conflict4 = MyNetwork(name="name_conflict")  # error on build
+          with variable_scope.variable_scope("my_network_1"):
+            pass
+          with variable_scope.variable_scope("vs_name_conflict"):
+            pass
+          net3 = MyNetwork()  # name=outside_scope/my_network_4
+    name_conflict1(one)
+    with self.assertRaisesRegexp(
+        ValueError, "named 'name_conflict' already exists"):
+      name_conflict2(one)
+    name_conflict3(one)
+    net2(one)
+    with self.assertRaisesRegexp(
+        ValueError, "or a variable_scope was created with this name"):
+      vs_name_conflict(one)
+    with self.assertRaisesRegexp(
+        ValueError, "named 'name_conflict' already exists"):
+      name_conflict4(one)
+    self.assertEqual("outside_scope/name_conflict",
+                     name_conflict1.name)
+    self.assertStartsWith(
+        expected_start="outside_scope/name_conflict/dense/",
+        actual=name_conflict1.variables[0].name)
+    self.assertEqual("outside_scope/inside_scope/name_conflict",
+                     name_conflict3.name)
+    self.assertStartsWith(
+        expected_start="outside_scope/inside_scope/name_conflict/dense/",
+        actual=name_conflict3.variables[0].name)
+    self.assertEqual("outside_scope/my_network", net1.name)
+    self.assertStartsWith(
+        expected_start="outside_scope/my_network/dense/",
+        actual=net1.trainable_weights[0].name)
+    self.assertEqual("outside_scope/my_network_2", net2.name)
+    self.assertStartsWith(
+        expected_start="outside_scope/my_network_2/dense/",
+        actual=net2.trainable_weights[0].name)
+    net3(one)
+    self.assertEqual("outside_scope/my_network_3", net3.name)
+    self.assertStartsWith(
+        expected_start="outside_scope/my_network_3/dense/",
+        actual=net3.trainable_weights[0].name)
+    outside_net_after = MyNetwork()
+    outside_net_after(one)
+    self.assertEqual("my_network", outside_net_before.name)
+    self.assertStartsWith(
+        expected_start="my_network/dense/",
+        actual=outside_net_before.trainable_weights[0].name)
+    self.assertEqual("my_network_1", outside_net_after.name)
+    self.assertStartsWith(
+        expected_start="my_network_1/dense/",
+        actual=outside_net_after.trainable_weights[0].name)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testVariableScopeStripping(self):
+    with variable_scope.variable_scope("scope1"):
+      with variable_scope.variable_scope("scope2"):
+        net = MyNetwork()
+    net(constant_op.constant([[2.0]]))
+    self.evaluate(net.variables[0].assign([[42.]]))
+    self.assertEqual(net.name, "scope1/scope2/my_network")
+    self.assertStartsWith(
+        expected_start="scope1/scope2/my_network/dense/",
+        actual=net.trainable_weights[0].name)
+    save_path = network.save_network_checkpoint(net, self.get_temp_dir())
+    self.assertIn("scope1_scope2_my_network", save_path)
+    restore_net = MyNetwork()
+    # Delayed restoration
+    network.restore_network_checkpoint(restore_net, save_path)
+    restore_net(constant_op.constant([[1.0]]))
+    self.assertAllEqual([[42.]],
+                        self.evaluate(restore_net.variables[0]))
+    self.evaluate(restore_net.variables[0].assign([[-1.]]))
+    # Immediate restoration
+    network.restore_network_checkpoint(restore_net, save_path)
+    self.assertAllEqual([[42.]],
+                        self.evaluate(restore_net.variables[0]))
 
   @test_util.run_in_graph_and_eager_modes()
   def testLayerNamesRespected(self):
@@ -436,11 +638,11 @@ class NetworkTest(test.TestCase):
     one = constant_op.constant([[1.]])
     net = ParentNetwork()
     net(one)
-    self.assertStartsWith(expected_start="parent_network_1/explicit_name/",
+    self.assertStartsWith(expected_start="parent_network/explicit_name/",
                           actual=net.trainable_weights[0].name)
     self.assertEqual("explicit_name", net.first.name)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testWrappingInAnonymousVariableScope(self):
     # Named outside variable_scopes are not supported at the moment. However,
     # blank-named top level variable scopes do not change variable names, and so
@@ -455,20 +657,20 @@ class NetworkTest(test.TestCase):
       net(one)
     self.assertTrue(was_called[0])
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testReasonableSlashError(self):
     with self.assertRaisesRegexp(
         ValueError, "not allowed in Network names"):
       MyNetwork(name="slash/slash")
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNoVariableScopeNames(self):
     with self.assertRaisesRegexp(
         ValueError, "VariableScopes are not valid Network names"):
       with variable_scope.variable_scope("some_scope") as vs:
         MyNetwork(name=vs)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableScopeNameCollision(self):
     with variable_scope.variable_scope("abcd"):
       pass
@@ -478,7 +680,7 @@ class NetworkTest(test.TestCase):
       one = constant_op.constant([[1.]])
       net(one)
 
-  @test_util.run_in_graph_and_eager_modes()
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNetworkVariablesDoNotInterfere(self):
     core.Dense(1, use_bias=True)  # Should not interfere with naming.
     net1 = MyNetwork()
@@ -491,15 +693,15 @@ class NetworkTest(test.TestCase):
     # locally so that previous Layer consutrciton does not interfere with
     # variable naming (e.g. add a Layer construction before the Network,
     # suddenly your previously saved checkpoint is incompatible).
-    self.assertEqual("dense_1", net1.l1.name)
-    self.assertEqual("dense_1", net2.l1.name)
+    self.assertEqual("dense", net1.l1.name)
+    self.assertEqual("dense", net2.l1.name)
     self.evaluate(net1.trainable_weights[0].assign([[1.]]))
     self.evaluate(net2.trainable_weights[0].assign([[2.]]))
     self.assertEqual(2., self.evaluate(net2.trainable_weights[0]))
     self.assertEqual(1., self.evaluate(net1.trainable_weights[0]))
-    self.assertStartsWith(expected_start="my_network_1/dense_1/",
+    self.assertStartsWith(expected_start="my_network/dense/",
                           actual=net1.trainable_weights[0].name)
-    self.assertStartsWith(expected_start="my_network_2/dense_1/",
+    self.assertStartsWith(expected_start="my_network_1/dense/",
                           actual=net2.trainable_weights[0].name)
 
   @test_util.run_in_graph_and_eager_modes()
@@ -520,31 +722,31 @@ class NetworkTest(test.TestCase):
     one = constant_op.constant([[1.]])
     net = ParentNetwork()
     net(one)
-    self.assertStartsWith(expected_start="parent_network_1/my_network_1/dense",
+    self.assertStartsWith(expected_start="parent_network/my_network/dense",
                           actual=net.trainable_weights[0].name)
-    self.assertStartsWith(expected_start="parent_network_1/my_network_1/dense",
+    self.assertStartsWith(expected_start="parent_network/my_network/dense",
                           actual=net.first.trainable_weights[0].name)
-    self.assertStartsWith(expected_start="parent_network_1/my_network_2/dense",
+    self.assertStartsWith(expected_start="parent_network/my_network_1/dense",
                           actual=net.trainable_weights[1].name)
-    self.assertStartsWith(expected_start="parent_network_1/my_network_2/dense",
+    self.assertStartsWith(expected_start="parent_network/my_network_1/dense",
                           actual=net.second.trainable_weights[0].name)
-    self.assertEqual("parent_network_1", net.name)
-    self.assertEqual("my_network_1", net.first.name)
-    self.assertEqual("my_network_2", net.second.name)
+    self.assertEqual("parent_network", net.name)
+    self.assertEqual("my_network", net.first.name)
+    self.assertEqual("my_network_1", net.second.name)
 
     net2 = ParentNetwork()
     net2(one)
-    self.assertStartsWith(expected_start="parent_network_2/my_network_1/dense",
+    self.assertStartsWith(expected_start="parent_network_1/my_network/dense",
                           actual=net2.trainable_weights[0].name)
-    self.assertStartsWith(expected_start="parent_network_2/my_network_1/dense",
+    self.assertStartsWith(expected_start="parent_network_1/my_network/dense",
                           actual=net2.first.trainable_weights[0].name)
-    self.assertStartsWith(expected_start="parent_network_2/my_network_2/dense",
+    self.assertStartsWith(expected_start="parent_network_1/my_network_1/dense",
                           actual=net2.trainable_weights[1].name)
-    self.assertStartsWith(expected_start="parent_network_2/my_network_2/dense",
+    self.assertStartsWith(expected_start="parent_network_1/my_network_1/dense",
                           actual=net2.second.trainable_weights[0].name)
-    self.assertEqual("parent_network_2", net2.name)
-    self.assertEqual("my_network_1", net2.first.name)
-    self.assertEqual("my_network_2", net2.second.name)
+    self.assertEqual("parent_network_1", net2.name)
+    self.assertEqual("my_network", net2.first.name)
+    self.assertEqual("my_network_1", net2.second.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testNestableExplicit(self):
@@ -605,26 +807,26 @@ class NetworkTest(test.TestCase):
     one = constant_op.constant([[1.]])
     net = MixedLayerNetwork()
     net(one)
-    self.assertEqual("dense_1", net.first.name)
-    self.assertEqual("dense_2", net.second.name)
-    self.assertEqual("dense_3", net.third.name)
-    self.assertEqual("dense_4", net.fourth.name)
-    self.assertEqual("dense_5", net.fifth.name)
+    self.assertEqual("dense", net.first.name)
+    self.assertEqual("dense_1", net.second.name)
+    self.assertEqual("dense_2", net.third.name)
+    self.assertEqual("dense_3", net.fourth.name)
+    self.assertEqual("dense_4", net.fifth.name)
     # Note that this is _not_ the default naming behavior for Layers. Layers
     # which are added to Networks follow Network variable naming conventions
     # (i.e. variable names = network name unless variable sharing). Nested
     # Layers revert to Layer behavior.
-    self.assertStartsWith(expected_start="mixed_layer_network_1/dense_1/",
+    self.assertStartsWith(expected_start="mixed_layer_network/dense/",
                           actual=net.trainable_weights[0].name)
-    self.assertStartsWith(expected_start="mixed_layer_network_1/dense_2/",
+    self.assertStartsWith(expected_start="mixed_layer_network/dense_1/",
                           actual=net.trainable_weights[1].name)
-    self.assertStartsWith(expected_start="mixed_layer_network_1/dense_3/",
+    self.assertStartsWith(expected_start="mixed_layer_network/dense_2/",
                           actual=net.trainable_weights[2].name)
-    self.assertStartsWith(expected_start="mixed_layer_network_1/dense_4/",
+    self.assertStartsWith(expected_start="mixed_layer_network/dense_3/",
                           actual=net.trainable_weights[3].name)
-    self.assertStartsWith(expected_start="mixed_layer_network_1/dense_5/",
+    self.assertStartsWith(expected_start="mixed_layer_network/dense_4/",
                           actual=net.trainable_weights[4].name)
-    self.assertEqual("mixed_layer_network_1", net.name)
+    self.assertEqual("mixed_layer_network", net.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testNestableExplicitCollisions(self):
@@ -677,24 +879,24 @@ class NetworkTest(test.TestCase):
     net = ParentNetwork()
     net(one)
     self.assertStartsWith(
-        expected_start="parent_network_1/first_unique_child_name/dense_1/",
+        expected_start="parent_network/first_unique_child_name/dense/",
         actual=net.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="parent_network_1/second_unique_child_name/dense_1/",
+        expected_start="parent_network/second_unique_child_name/dense/",
         actual=net.trainable_weights[1].name)
-    self.assertEqual("parent_network_1", net.name)
+    self.assertEqual("parent_network", net.name)
     self.assertEqual("first_unique_child_name", net.first.name)
     self.assertEqual("second_unique_child_name", net.second.name)
 
     net2 = ParentNetwork()
     net2(one)
     self.assertStartsWith(
-        expected_start="parent_network_2/first_unique_child_name/dense",
+        expected_start="parent_network_1/first_unique_child_name/dense",
         actual=net2.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="parent_network_2/second_unique_child_name/dense",
+        expected_start="parent_network_1/second_unique_child_name/dense",
         actual=net2.trainable_weights[1].name)
-    self.assertEqual("parent_network_2", net2.name)
+    self.assertEqual("parent_network_1", net2.name)
     self.assertEqual("first_unique_child_name", net2.first.name)
     self.assertEqual("second_unique_child_name", net2.second.name)
 
@@ -752,15 +954,15 @@ class NetworkTest(test.TestCase):
     net2(one)
 
     self.assertStartsWith(
-        expected_start="first_parent_network_1/my_network_1/dense_1/",
+        expected_start="first_parent_network/my_network/dense/",
         actual=net2.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="second_parent_network_1/my_network_1/dense_1/",
+        expected_start="second_parent_network/my_network/dense/",
         actual=net2.trainable_weights[1].name)
-    self.assertEqual("second_parent_network_1", net2.name)
+    self.assertEqual("second_parent_network", net2.name)
     self.assertTrue(net2.first is net.first)
-    self.assertEqual("my_network_1", net2.first.name)
-    self.assertEqual("my_network_1", net2.second.name)
+    self.assertEqual("my_network", net2.first.name)
+    self.assertEqual("my_network", net2.second.name)
 
     # No name collision; the owned Network is added first and has a different
     # name than the shared Network.
@@ -778,15 +980,15 @@ class NetworkTest(test.TestCase):
     net3(one)
 
     self.assertStartsWith(
-        expected_start="third_parent_network_1/my_network_1/dense",
+        expected_start="third_parent_network/my_network/dense",
         actual=net3.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="first_parent_network_1/my_network_2/dense",
+        expected_start="first_parent_network/my_network_1/dense",
         actual=net3.trainable_weights[1].name)
-    self.assertEqual("third_parent_network_1", net3.name)
+    self.assertEqual("third_parent_network", net3.name)
     self.assertTrue(net3.second is net.second)
-    self.assertEqual("my_network_1", net3.first.name)
-    self.assertEqual("my_network_2", net3.second.name)
+    self.assertEqual("my_network", net3.first.name)
+    self.assertEqual("my_network_1", net3.second.name)
 
     # "Unavoidable" same-name Layer. The owned name is added first (fixed), then
     # a shared Network is added with the same name.
@@ -804,15 +1006,15 @@ class NetworkTest(test.TestCase):
     net4(one)
 
     self.assertStartsWith(
-        expected_start="fourth_parent_network_1/my_network_1/dense_1/",
+        expected_start="fourth_parent_network/my_network/dense/",
         actual=net4.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="first_parent_network_1/my_network_1/dense_1/",
+        expected_start="first_parent_network/my_network/dense/",
         actual=net4.trainable_weights[1].name)
-    self.assertEqual("fourth_parent_network_1", net4.name)
+    self.assertEqual("fourth_parent_network", net4.name)
     self.assertTrue(net4.second is net.first)
-    self.assertEqual("my_network_1", net4.first.name)
-    self.assertEqual("my_network_1", net4.second.name)
+    self.assertEqual("my_network", net4.first.name)
+    self.assertEqual("my_network", net4.second.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testRecursiveLayerRenaming(self):
@@ -843,28 +1045,28 @@ class NetworkTest(test.TestCase):
     net(one)
 
     self.assertStartsWith(
-        expected_start=("parent_network_1/network_with_layer_children_1/"
-                        "dense_1/"),
+        expected_start=("parent_network/network_with_layer_children/"
+                        "dense/"),
         actual=net.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start=("parent_network_1/network_with_layer_children_1/"
-                        "dense_2/"),
+        expected_start=("parent_network/network_with_layer_children/"
+                        "dense_1/"),
         actual=net.trainable_weights[1].name)
     self.assertStartsWith(
-        expected_start=("parent_network_1/network_with_layer_children_2/"
-                        "dense_1/"),
+        expected_start=("parent_network/network_with_layer_children_1/"
+                        "dense/"),
         actual=net.trainable_weights[2].name)
     self.assertStartsWith(
-        expected_start=("parent_network_1/network_with_layer_children_2/"
-                        "dense_2/"),
+        expected_start=("parent_network/network_with_layer_children_1/"
+                        "dense_1/"),
         actual=net.trainable_weights[3].name)
-    self.assertEqual("parent_network_1", net.name)
-    self.assertEqual("network_with_layer_children_1", net.first.name)
-    self.assertEqual("network_with_layer_children_2", net.second.name)
-    self.assertEqual("dense_1", net.first.first.name)
-    self.assertEqual("dense_2", net.first.second.name)
-    self.assertEqual("dense_1", net.second.first.name)
-    self.assertEqual("dense_2", net.second.second.name)
+    self.assertEqual("parent_network", net.name)
+    self.assertEqual("network_with_layer_children", net.first.name)
+    self.assertEqual("network_with_layer_children_1", net.second.name)
+    self.assertEqual("dense", net.first.first.name)
+    self.assertEqual("dense_1", net.first.second.name)
+    self.assertEqual("dense", net.second.first.name)
+    self.assertEqual("dense_1", net.second.second.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testCallInDifferentOrderThanConstruct(self):
@@ -898,23 +1100,23 @@ class NetworkTest(test.TestCase):
     net1(one)
 
     self.assertStartsWith(
-        expected_start="first_network_1/my_network_1/dense_1/",
+        expected_start="first_network/my_network/dense/",
         actual=net1.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="first_network_1/my_network_2/dense_1/",
+        expected_start="first_network/my_network_1/dense/",
         actual=net1.trainable_weights[1].name)
     self.assertStartsWith(
-        expected_start="first_network_1/my_network_1/dense_1/",
+        expected_start="first_network/my_network/dense/",
         actual=net2.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="second_network_1/my_network_1/dense_1/",
+        expected_start="second_network/my_network/dense/",
         actual=net2.trainable_weights[1].name)
     self.assertTrue(net1.trainable_weights[0] is net2.trainable_weights[0])
-    self.assertEqual("first_network_1", net1.name)
-    self.assertEqual("my_network_1", net1.first.name)
-    self.assertEqual("my_network_2", net1.second.name)
+    self.assertEqual("first_network", net1.name)
+    self.assertEqual("my_network", net1.first.name)
+    self.assertEqual("my_network_1", net1.second.name)
     self.assertTrue(net2.first is net1.first)
-    self.assertEqual("my_network_1", net2.second.name)
+    self.assertEqual("my_network", net2.second.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testLayerCallInDifferentOrderThanConstruct(self):
@@ -951,23 +1153,23 @@ class NetworkTest(test.TestCase):
     net1(one)
 
     self.assertStartsWith(
-        expected_start="first_network_1/dense_1/",
+        expected_start="first_network/dense/",
         actual=net1.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="first_network_1/dense_2/",
+        expected_start="first_network/dense_1/",
         actual=net1.trainable_weights[1].name)
     self.assertStartsWith(
-        expected_start="first_network_1/dense_1/",
+        expected_start="first_network/dense/",
         actual=net2.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="second_network_1/dense_1/",
+        expected_start="second_network/dense/",
         actual=net2.trainable_weights[1].name)
     self.assertTrue(net1.trainable_weights[0] is net2.trainable_weights[0])
-    self.assertEqual("first_network_1", net1.name)
-    self.assertEqual("dense_1", net1.first.name)
-    self.assertEqual("dense_2", net1.second.name)
+    self.assertEqual("first_network", net1.name)
+    self.assertEqual("dense", net1.first.name)
+    self.assertEqual("dense_1", net1.second.name)
     self.assertTrue(net2.first is net1.first)
-    self.assertEqual("dense_1", net2.second.name)
+    self.assertEqual("dense", net2.second.name)
 
   @test_util.run_in_graph_and_eager_modes()
   def testLayerAlreadyBuilt(self):
@@ -996,17 +1198,18 @@ class NetworkTest(test.TestCase):
                                     # do not match their layer names.
         actual=net.trainable_weights[0].name)
     self.assertStartsWith(
-        expected_start="first_network_1/dense_1/",
+        expected_start="first_network/dense/",
         actual=net.trainable_weights[1].name)
     self.assertTrue(
         net.trainable_weights[0] is shared_layer.trainable_weights[0])
-    self.assertEqual("first_network_1", net.name)
+    self.assertEqual("first_network", net.name)
     self.assertEqual("dense_3", net.first.name)
-    self.assertEqual("dense_1", net.second.name)
+    self.assertEqual("dense", net.second.name)
 
 
 class SequentialTest(test.TestCase):
 
+  @test_util.assert_no_garbage_created
   def testTwoLayers(self):
     # Create a sequential network with one layer.
     net = network.Sequential([core.Dense(1, use_bias=False)])
@@ -1028,6 +1231,7 @@ class SequentialTest(test.TestCase):
     l2.trainable_variables[0].assign([[11.0]])
     self.assertEqual(231.0, net(constant_op.constant([[7.0]])).numpy())
 
+  @test_util.assert_no_garbage_created
   def testFunctions(self):
     # Create a sequential network with one function.
     net = network.Sequential([nn_ops.relu])
@@ -1038,6 +1242,7 @@ class SequentialTest(test.TestCase):
     net.add(math_ops.negative)
     self.assertEqual(-2.0, net(two).numpy())
 
+  @test_util.assert_no_garbage_created
   def testTrainingLayer(self):
     net = network.Sequential([core.Dropout(0.99999)])
     two = constant_op.constant(2.0)
@@ -1051,6 +1256,7 @@ class SequentialTest(test.TestCase):
     # Should only fail spuriously 1 in 10^100 runs.
     self.fail("Didn't see dropout happen after 20 tries.")
 
+  @test_util.assert_no_garbage_created
   def testTrainingFunction(self):
     # Output depends on value of "training".
     def add_training(input_value, training=None):
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index e0a20d2485e831b1841991596b91429c6eaa2854..57b070ec6eeac00c77f199a846639d64c4957cd8 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -23,7 +23,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training import adam as _adam
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training import saver as _saver
 
@@ -171,20 +170,12 @@ class Saver(object):
 def get_optimizer_variables(optimizer):
   """Returns a list of variables for the given `tf.train.Optimizer`.
 
+  Equivalent to `optimizer.variables()`.
+
   Args:
     optimizer: An instance of `tf.train.Optimizer` which has created variables
       (typically after a call to `Optimizer.minimize`).
   Returns:
-    A list of variables which have been created by the `Optimizer`. Currently
-    returns all variables even if they were not created in the default graph,
-    but this behavior may change.
+    A list of variables which have been created by the `Optimizer`.
   """
-  variables = []
-  # pylint: disable=protected-access
-  for _, variable_dict in optimizer._slots.items():
-    for _, slot_for_variable in variable_dict.items():
-      variables.append(slot_for_variable)
-  if isinstance(optimizer, _adam.AdamOptimizer):
-    variables.append(optimizer._beta1_power)
-    variables.append(optimizer._beta2_power)
-  return variables
+  return optimizer.variables()
diff --git a/tensorflow/contrib/eager/python/summary_writer.py b/tensorflow/contrib/eager/python/summary_writer.py
deleted file mode 100644
index 5d8c41b545b3c9fd03af85f302ba05a394f085a4..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/summary_writer.py
+++ /dev/null
@@ -1,242 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TensorBoard Summary Writer for TensorFlow Eager Execution."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import uuid
-
-from tensorflow.contrib.summary import gen_summary_ops
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_op_util
-from tensorflow.python.ops import variable_scope
-
-
-def _maybe_cpu(v):
-  if isinstance(v, (ops.EagerTensor, ops.Tensor)):
-    return v.cpu()
-  else:
-    return v
-
-
-def _summary_writer_function(name, tensor, function, family=None):
-  def record():
-    with summary_op_util.summary_scope(
-        name, family, values=[tensor]) as (tag, scope):
-      function(tag, scope)
-      return True
-  return record
-
-
-class SummaryWriter(object):
-  """Writes summaries for TensorBoard, compatible with eager execution.
-
-  This class is the supported way of writing TensorBoard summaries under
-  eager execution.
-  """
-
-  _CPU_DEVICE = "cpu:0"
-
-  def __init__(self,
-               logdir,
-               max_queue=10,
-               flush_secs=120,
-               filename_suffix=""):
-    """Summary writer for TensorBoard, compatible with eager execution.
-
-    If necessary, multiple instances of `SummaryWriter` can be created, with
-    distinct `logdir`s and `name`s. Each `SummaryWriter` instance will retain
-    its independent `global_step` counter and data writing destination.
-
-    Example:
-    ```python
-    writer = tfe.SummaryWriter("my_model")
-
-    # ... Code that sets up the model and data batches ...
-
-    for _ in xrange(train_iters):
-      loss = model.train_batch(batch)
-      writer.scalar("loss", loss)
-      writer.step()
-    ```
-
-    Args:
-      logdir: Directory in which summary files will be written.
-      max_queue: Number of summary items to buffer before flushing to
-        filesystem. If 0, summaries will be flushed immediately.
-      flush_secs: Number of secondsbetween forced commits to disk.
-      filename_suffix: Suffix of the event protobuf files in which the summary
-        data are stored.
-
-    Raises:
-      ValueError: If this constructor is called not under eager execution.
-    """
-    # TODO(apassos, ashankar): Make this class and the underlying
-    # contrib.summary_ops compatible with graph model and remove this check.
-    if not context.in_eager_mode():
-      raise ValueError(
-          "Use of SummaryWriter is currently supported only with eager "
-          "execution enabled. File an issue at "
-          "https://github.com/tensorflow/tensorflow/issues/new to express "
-          "interest in fixing this.")
-
-    # TODO(cais): Consider adding name keyword argument, which if None or empty,
-    # will register the global global_step that training_util.get_global_step()
-    # can find.
-    with context.device(self._CPU_DEVICE):
-      self._name = uuid.uuid4().hex
-      self._global_step = 0
-      self._global_step_tensor = variable_scope.get_variable(
-          "global_step/summary_writer/" + self._name,
-          shape=[], dtype=dtypes.int64,
-          initializer=init_ops.zeros_initializer())
-      self._global_step_dirty = False
-      self._resource = gen_summary_ops.summary_writer(shared_name=self._name)
-      gen_summary_ops.create_summary_file_writer(
-          self._resource, logdir, max_queue, flush_secs, filename_suffix)
-      # Delete the resource when this object is deleted
-      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._resource, handle_device=self._CPU_DEVICE)
-
-  def step(self):
-    """Increment the global step counter of this SummaryWriter instance."""
-    self._global_step += 1
-    self._global_step_dirty = True
-
-  @property
-  def global_step(self):
-    """Obtain the current global_step value of this SummaryWriter instance.
-
-    Returns:
-      An `int` representing the current value of the global_step of this
-       `SummaryWriter` instance.
-    """
-    return self._global_step
-
-  def _update_global_step_tensor(self):
-    with context.device(self._CPU_DEVICE):
-      if self._global_step_dirty:
-        self._global_step_dirty = False
-        return state_ops.assign(self._global_step_tensor, self._global_step)
-      else:
-        return self._global_step_tensor
-
-  def generic(self, name, tensor, metadata, family=None):
-    """Write a generic-type summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as the series name in
-        TensorBoard.
-      tensor: A `Tensor` or compatible value type containing the value of the
-        summary.
-      metadata: Metadata about the summary.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_summary(
-            self._resource,
-            self._update_global_step_tensor(),
-            _maybe_cpu(tensor),
-            tag,
-            _maybe_cpu(metadata),
-            name=scope)
-
-  def scalar(self, name, tensor, family=None):
-    """Write a scalar summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as the series name in
-        TensorBoard.
-      tensor: A real numeric `Tensor` or compatible value type containing a
-        single value.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-
-    Returns:
-      A summary writer function for scalars.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_scalar_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), name=scope)
-
-  def histogram(self, name, tensor, family=None):
-    """Write a histogram summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as a series name in
-        TensorBoard.
-      tensor: A real numeric `Tensor` or compatible value type. Any shape.
-        Values to use to build the histogram.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_histogram_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), name=scope)
-
-  def image(self, name, tensor, bad_color=None, max_images=3, family=None):
-    """Write an image summary."""
-    with context.device(self._CPU_DEVICE):
-      if bad_color is None:
-        bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_image_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag, _maybe_cpu(tensor), bad_color_, max_images,
-            name=scope)
-
-  def audio(self, name, tensor, sample_rate, max_outputs, family=None):
-    """Write an audio summary.
-
-    Args:
-      name: A name for the generated node. Will also serve as a series name in
-        TensorBoard.
-      tensor: A 3-D `float32` `Tensor` of shape `[batch_size, frames, channels]`
-        or a 2-D `float32` `Tensor` of shape `[batch_size, frames]`, or
-        compatible value type.
-      sample_rate: A Scalar `float32` `Tensor` indicating the sample rate of the
-        signal in hertz.
-      max_outputs: Max number of batch elements to generate audio for.
-      family: Optional; if provided, used as the prefix of the summary tag name,
-        which controls the tab name used for display on Tensorboard.
-    """
-    with context.device(self._CPU_DEVICE):
-      with summary_op_util.summary_scope(
-          name, family, values=[tensor]) as (tag, scope):
-        gen_summary_ops.write_audio_summary(
-            self._resource, self._update_global_step_tensor(),
-            tag,
-            _maybe_cpu(tensor),
-            sample_rate=_maybe_cpu(sample_rate),
-            max_outputs=max_outputs,
-            name=scope)
diff --git a/tensorflow/contrib/eager/python/summary_writer_test.py b/tensorflow/contrib/eager/python/summary_writer_test.py
deleted file mode 100644
index 5ebb36d04fcba8f4558fa1c09716314af42f559f..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/eager/python/summary_writer_test.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Unit tests for eager execution SummaryWriter."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.contrib.eager.python import summary_writer
-from tensorflow.core.util import event_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.lib.io import tf_record
-from tensorflow.python.platform import gfile
-
-
-class SummaryWriterTest(test.TestCase):
-
-  def setUp(self):
-    super(SummaryWriterTest, self).setUp()
-    self._test_device = "gpu:0" if context.num_gpus() else "cpu:0"
-    self._tmp_logdir = tempfile.mkdtemp()
-    with context.device(self._test_device):
-      # Use max_queue=0 so that summaries are immediately flushed to filesystem,
-      # making testing easier.
-      self._writer = summary_writer.SummaryWriter(self._tmp_logdir, max_queue=0)
-
-  def tearDown(self):
-    if os.path.isdir(self._tmp_logdir):
-      shutil.rmtree(self._tmp_logdir)
-    super(SummaryWriterTest, self).tearDown()
-
-  def _readLastEvent(self, logdir=None):
-    if not logdir:
-      logdir = self._tmp_logdir
-    files = [f for f in gfile.ListDirectory(logdir)
-             if not gfile.IsDirectory(os.path.join(logdir, f))]
-    file_path = os.path.join(logdir, files[0])
-    records = list(tf_record.tf_record_iterator(file_path))
-    event = event_pb2.Event()
-    event.ParseFromString(records[-1])
-    return event
-
-  def testGlobalStep(self):
-    with context.device(self._test_device):
-      orig_step = self._writer.global_step
-      self._writer.step()
-      self.assertEqual(orig_step + 1, self._writer.global_step)
-      self.assertEqual(orig_step + 1, self._writer.global_step)
-      self._writer.step()
-      self._writer.step()
-      self.assertEqual(orig_step + 3, self._writer.global_step)
-
-  def testGenericSummary(self):
-    with context.device(self._test_device):
-      x = constant_op.constant(1337.0)
-      with context.device("cpu:0"):
-        metadata = constant_op.constant("foo")
-      self._writer.generic("x", x, metadata)
-      event = self._readLastEvent()
-      self.assertEqual("x", event.summary.value[0].tag)
-
-  def testScalarSummary(self):
-    with context.device(self._test_device):
-      x = constant_op.constant(1337.0)
-      self._writer.scalar("x", x)
-      event = self._readLastEvent()
-      self.assertTrue("x", event.summary.value[0].tag)
-      self.assertEqual(1337.0, event.summary.value[0].simple_value)
-
-  def testHistogramSummary(self):
-    with context.device(self._test_device):
-      y = constant_op.constant([1.0, 3.0, 3.0, 7.0])
-      self._writer.histogram("y", y)
-      event = self._readLastEvent()
-      self.assertEqual("y", event.summary.value[0].tag)
-      self.assertTrue(event.summary.value[0].histo)
-
-  def testImageSummary(self):
-    with context.device(self._test_device):
-      a = constant_op.constant([[10.0, 20.0], [-20.0, -10.0]])
-      self._writer.histogram("image1", a)
-      event = self._readLastEvent()
-      self.assertEqual("image1", event.summary.value[0].tag)
-      self.assertTrue(event.summary.value[0].image)
-
-  def testAudioSummary(self):
-    with context.device(self._test_device):
-      w = constant_op.constant(np.random.rand(3, 10, 2), dtype=dtypes.float32)
-      fs = constant_op.constant(44100.0, dtype=dtypes.float32)
-      max_outputs = 1
-      self._writer.audio("audio1", w, fs, max_outputs)
-      event = self._readLastEvent()
-      self.assertTrue(event.summary.value[0].audio)
-
-  def testTwoSummaryWritersGlobalStepsWorkWithoutCrosstalk(self):
-    tmp_logdir2 = os.path.join(self._tmp_logdir, "_writer2_")
-    writer2 = summary_writer.SummaryWriter(tmp_logdir2, max_queue=0)
-
-    self.assertEqual(0, writer2.global_step)
-    self._writer.step()
-    self.assertEqual(0, writer2.global_step)
-    writer2.step()
-    writer2.step()
-    writer2.step()
-    self.assertEqual(3, writer2.global_step)
-
-    x = constant_op.constant(1337.0)
-    writer_orig_step = self._writer.global_step
-    self._writer.step()
-    self._writer.scalar("x", x)
-
-    event = self._readLastEvent()
-    self.assertEqual(writer_orig_step + 1, event.step)
-
-    writer2.scalar("x", x)
-    event = self._readLastEvent(tmp_logdir2)
-    self.assertEqual(3, event.step)
-
-    self._writer.step()
-    self._writer.scalar("x", x)
-
-    event = self._readLastEvent()
-    self.assertEqual(writer_orig_step + 2, event.step)
-
-
-# TODO(cais): Add performance benchmark for SummaryWriter.
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index b6c687c82946ec62ccb90165791587dc335f13c7..770a7e3e7a01f3351c229b7fb53383240dd1f1c8 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -23,6 +23,7 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@list_devices
 @@num_gpus
 
+@@py_func
 @@defun
 @@implicit_gradients
 @@implicit_value_and_gradients
@@ -30,9 +31,6 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@value_and_gradients_function
 @@GradientTape
 
-@@enable_tracing
-@@flush_trace
-
 @@run
 @@enable_eager_execution
 
@@ -46,13 +44,16 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 @@seterr
 
 @@Iterator
-@@Network
 @@Saver
 @@restore_variables_on_create
 @@Variable
 @@get_optimizer_variables
 @@EagerVariableStore
 
+@@Network
+@@save_network_checkpoint
+@@restore_network_checkpoint
+
 @@in_eager_mode
 @@in_graph_mode
 
@@ -74,6 +75,8 @@ from __future__ import print_function
 from tensorflow.contrib.eager.python import metrics
 from tensorflow.contrib.eager.python.datasets import Iterator
 from tensorflow.contrib.eager.python.network import Network
+from tensorflow.contrib.eager.python.network import save_network_checkpoint
+from tensorflow.contrib.eager.python.network import restore_network_checkpoint
 from tensorflow.contrib.eager.python.saver import get_optimizer_variables
 from tensorflow.contrib.eager.python.saver import restore_variables_on_create
 from tensorflow.contrib.eager.python.saver import Saver
@@ -86,7 +89,6 @@ from tensorflow.python.eager.context import in_eager_mode
 from tensorflow.python.eager.context import in_graph_mode
 from tensorflow.python.eager.context import list_devices
 from tensorflow.python.eager.context import num_gpus
-from tensorflow.python.eager.core import enable_tracing
 from tensorflow.python.eager.custom_gradient import custom_gradient
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
@@ -100,8 +102,10 @@ from tensorflow.python.framework.test_util import IsolateTest
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
 from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.ops.variable_scope import EagerVariableStore
+from tensorflow.python.ops import script_ops
 from tensorflow.python.util.all_util import remove_undocumented
 
+py_func = script_ops.eager_py_func
 defun = function.defun
 implicit_gradients = backprop.implicit_grad
 implicit_value_and_gradients = backprop.implicit_val_and_grad
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index a0f83ac10555913b5be177f0f2b00b2b0e30494a..ba272d7e885434eb556cbafd3d9e64a50d21f9b2 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -7,6 +7,7 @@ package(
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 filegroup(
     name = "all_files",
@@ -26,10 +27,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dnn",
+        ":dnn_linear_combined",
         ":extenders",
         ":head",
+        ":linear",
         ":logit_fns",
         ":multi_head",
+        ":replicate_model_fn",
         "//tensorflow/python:util",
     ],
 )
@@ -71,6 +75,46 @@ py_test(
     ],
 )
 
+py_library(
+    name = "dnn_linear_combined",
+    srcs = ["python/estimator/dnn_linear_combined.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:nn",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:dnn_linear_combined",
+    ],
+)
+
+py_test(
+    name = "dnn_linear_combined_test",
+    size = "medium",
+    srcs = ["python/estimator/dnn_linear_combined_test.py"],
+    shard_count = 3,
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":dnn_linear_combined",
+        ":head",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python/estimator:dnn_testing_utils",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:linear_testing_utils",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "extenders",
     srcs = [
@@ -167,6 +211,42 @@ py_test(
     ],
 )
 
+py_library(
+    name = "linear",
+    srcs = ["python/estimator/linear.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:linear",
+    ],
+)
+
+py_test(
+    name = "linear_test",
+    size = "small",
+    srcs = ["python/estimator/linear_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":head",
+        ":linear",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:linear_testing_utils",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "logit_fns",
     srcs = [
@@ -202,10 +282,14 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:summary",
         "//tensorflow/python/estimator:head",
+        "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/saved_model:signature_constants",
         "@six_archive//:six",
@@ -233,3 +317,63 @@ py_test(
         "@six_archive//:six",
     ],
 )
+
+py_library(
+    name = "replicate_model_fn",
+    srcs = [
+        "python/estimator/replicate_model_fn.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:util",
+        "@six_archive//:six",
+    ],
+)
+
+cuda_py_test(
+    name = "replicate_model_fn_test",
+    size = "medium",
+    srcs = ["python/estimator/replicate_model_fn_test.py"],
+    additional_deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:dnn",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:optimizers",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        ":replicate_model_fn",
+    ],
+    tags = ["multi_gpu"],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index cf727264cd5116915f6bd7f285e470cbc2e2742a..28c1f8b1809d27db697365b7bb50441f7820d2b4 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -20,10 +20,13 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.contrib.estimator.python.estimator.dnn import *
+from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
 from tensorflow.contrib.estimator.python.estimator.extenders import *
 from tensorflow.contrib.estimator.python.estimator.head import *
+from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
+from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -38,9 +41,12 @@ _allowed_symbols = [
     'multi_label_head',
     'regression_head',
     'DNNEstimator',
+    'DNNLinearCombinedEstimator',
+    'LinearEstimator',
     'call_logit_fn',
     'dnn_logit_fn_builder',
     'linear_logit_fn_builder',
+    'replicate_model_fn',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccaf1128bf23af734f7a5722a4dd8c1f0304fab7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
@@ -0,0 +1,164 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow estimator for Linear and DNN joined training models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import dnn_linear_combined as dnn_linear_combined_lib
+from tensorflow.python.ops import nn
+
+
+class DNNLinearCombinedEstimator(estimator.Estimator):
+  """An estimator for TensorFlow Linear and DNN joined models with custom head.
+
+  Note: This estimator is also known as wide-n-deep.
+
+  Example:
+
+  ```python
+  numeric_feature = numeric_column(...)
+  categorical_column_a = categorical_column_with_hash_bucket(...)
+  categorical_column_b = categorical_column_with_hash_bucket(...)
+
+  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
+  categorical_feature_a_emb = embedding_column(
+      categorical_column=categorical_feature_a, ...)
+  categorical_feature_b_emb = embedding_column(
+      categorical_column=categorical_feature_b, ...)
+
+  estimator = DNNLinearCombinedEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3),
+      # wide settings
+      linear_feature_columns=[categorical_feature_a_x_categorical_feature_b],
+      linear_optimizer=tf.train.FtrlOptimizer(...),
+      # deep settings
+      dnn_feature_columns=[
+          categorical_feature_a_emb, categorical_feature_b_emb,
+          numeric_feature],
+      dnn_hidden_units=[1000, 500, 100],
+      dnn_optimizer=tf.train.ProximalAdagradOptimizer(...))
+
+  # To apply L1 and L2 regularization, you can set optimizers as follows:
+  tf.train.ProximalAdagradOptimizer(
+      learning_rate=0.1,
+      l1_regularization_strength=0.001,
+      l2_regularization_strength=0.001)
+  # It is same for FtrlOptimizer.
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train, steps=100)
+
+  def input_fn_eval: # returns x, y
+    pass
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    pass
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * for each `column` in `dnn_feature_columns` + `linear_feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss is calculated by using mean squared error.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               head,
+               model_dir=None,
+               linear_feature_columns=None,
+               linear_optimizer='Ftrl',
+               dnn_feature_columns=None,
+               dnn_optimizer='Adagrad',
+               dnn_hidden_units=None,
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a DNNLinearCombinedEstimator instance.
+
+    Args:
+      head: A `_Head` instance constructed with a method such as
+        `tf.contrib.estimator.multi_label_head`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      linear_feature_columns: An iterable containing all the feature columns
+        used by linear part of the model. All items in the set must be
+        instances of classes derived from `FeatureColumn`.
+      linear_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the linear part of the model. Defaults to FTRL optimizer.
+      dnn_feature_columns: An iterable containing all the feature columns used
+        by deep part of the model. All items in the set must be instances of
+        classes derived from `FeatureColumn`.
+      dnn_optimizer: An instance of `tf.Optimizer` used to apply gradients to
+        the deep part of the model. Defaults to Adagrad optimizer.
+      dnn_hidden_units: List of hidden units per layer. All layers are fully
+        connected.
+      dnn_activation_fn: Activation function applied to each layer. If None,
+        will use `tf.nn.relu`.
+      dnn_dropout: When not None, the probability we will drop out
+        a given coordinate.
+      input_layer_partitioner: Partitioner for input layer. Defaults to
+        `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: RunConfig object to configure the runtime settings.
+
+    Raises:
+      ValueError: If both linear_feature_columns and dnn_features_columns are
+        empty at the same time.
+    """
+    linear_feature_columns = linear_feature_columns or []
+    dnn_feature_columns = dnn_feature_columns or []
+    self._feature_columns = (
+        list(linear_feature_columns) + list(dnn_feature_columns))
+    if not self._feature_columns:
+      raise ValueError('Either linear_feature_columns or dnn_feature_columns '
+                       'must be defined.')
+
+    def _model_fn(features, labels, mode, config):
+      return dnn_linear_combined_lib._dnn_linear_combined_model_fn(  # pylint: disable=protected-access
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          linear_feature_columns=linear_feature_columns,
+          linear_optimizer=linear_optimizer,
+          dnn_feature_columns=dnn_feature_columns,
+          dnn_optimizer=dnn_optimizer,
+          dnn_hidden_units=dnn_hidden_units,
+          dnn_activation_fn=dnn_activation_fn,
+          dnn_dropout=dnn_dropout,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+
+    super(DNNLinearCombinedEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e4d34dc70ccaa4806ae8b8ed5001bd971ee7b4
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
@@ -0,0 +1,220 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dnn_linear_combined.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import dnn_linear_combined
+from tensorflow.contrib.estimator.python.estimator import head as head_lib
+from tensorflow.python.estimator.canned import dnn_testing_utils
+from tensorflow.python.estimator.canned import linear_testing_utils
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+
+
+def _dnn_only_estimator_fn(
+    hidden_units,
+    feature_columns,
+    model_dir=None,
+    label_dimension=1,
+    weight_column=None,
+    optimizer='Adagrad',
+    activation_fn=nn.relu,
+    dropout=None,
+    input_layer_partitioner=None,
+    config=None):
+  return dnn_linear_combined.DNNLinearCombinedEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension),
+      model_dir=model_dir,
+      dnn_feature_columns=feature_columns,
+      dnn_optimizer=optimizer,
+      dnn_hidden_units=hidden_units,
+      dnn_activation_fn=activation_fn,
+      dnn_dropout=dropout,
+      input_layer_partitioner=input_layer_partitioner,
+      config=config)
+
+
+class DNNOnlyEstimatorEvaluateTest(
+    dnn_testing_utils.BaseDNNRegressorEvaluateTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorEvaluateTest.__init__(
+        self, _dnn_only_estimator_fn)
+
+
+class DNNOnlyEstimatorPredictTest(
+    dnn_testing_utils.BaseDNNRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorPredictTest.__init__(
+        self, _dnn_only_estimator_fn)
+
+
+class DNNOnlyEstimatorTrainTest(
+    dnn_testing_utils.BaseDNNRegressorTrainTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    dnn_testing_utils.BaseDNNRegressorTrainTest.__init__(
+        self, _dnn_only_estimator_fn)
+
+
+def _linear_only_estimator_fn(
+    feature_columns,
+    model_dir=None,
+    label_dimension=1,
+    weight_column=None,
+    optimizer='Ftrl',
+    config=None,
+    partitioner=None):
+  return dnn_linear_combined.DNNLinearCombinedEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension),
+      model_dir=model_dir,
+      linear_feature_columns=feature_columns,
+      linear_optimizer=optimizer,
+      input_layer_partitioner=partitioner,
+      config=config)
+
+
+class LinearOnlyEstimatorEvaluateTest(
+    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
+        self, _linear_only_estimator_fn)
+
+
+class LinearOnlyEstimatorPredictTest(
+    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
+        self, _linear_only_estimator_fn)
+
+
+class LinearOnlyEstimatorTrainTest(
+    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
+        self, _linear_only_estimator_fn)
+
+
+class DNNLinearCombinedEstimatorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
+      label_dimension, batch_size):
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    est = dnn_linear_combined.DNNLinearCombinedEstimator(
+        head=head_lib.regression_head(label_dimension=label_dimension),
+        linear_feature_columns=linear_feature_columns,
+        dnn_feature_columns=dnn_feature_columns,
+        dnn_hidden_units=(2, 2),
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    # learn y = x
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=label_dimension,
+        label_dimension=label_dimension,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 189f098005b8926bfb30b723cc989cb854a5d77e..a9311a20f127d92f02a95b8b48082fc90850635a 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
@@ -48,7 +49,20 @@ def multi_class_head(n_classes,
 
   Uses `sparse_softmax_cross_entropy` loss.
 
-  This head expects to be fed integer labels specifying the class index.
+  The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`.
+  In many applications, the shape is `[batch_size, n_classes]`.
+
+  `labels` must be a dense `Tensor` with shape matching `logits`, namely
+  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
+  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
+  `labels` must be an integer `Tensor` with values specifying the class index.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
+
+  The loss is the weighted sum over the input dimensions. Namely, if the input
+  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
+  `batch_size`.
 
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
@@ -57,11 +71,11 @@ def multi_class_head(n_classes,
       `tf.feature_column.numeric_column` defining feature column representing
       weights. It is used to down weight or boost examples during training. It
       will be multiplied by the loss of the example.
-    label_vocabulary: A list of strings represents possible label values. If it
-      is not given, that means labels are already encoded as integer within
-      [0, n_classes). If given, labels must be string type and have any value in
-      `label_vocabulary`. Also there will be errors if vocabulary is not
-      provided and labels are string.
+    label_vocabulary: A list or tuple of strings representing possible label
+      values. If it is not given, that means labels are already encoded as an
+      integer within [0, n_classes). If given, labels must be of string type and
+      have any value in `label_vocabulary`. Note that errors will be raised if
+      `label_vocabulary` is not provided but labels are strings.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -84,7 +98,20 @@ def binary_classification_head(
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
 
-  This head expects to be fed float labels of shape `(batch_size, 1)`.
+  The head expects `logits` with shape `[D0, D1, ... DN, 1]`.
+  In many applications, the shape is `[batch_size, 1]`.
+
+  `labels` must be a dense `Tensor` with shape matching `logits`, namely
+  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
+  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
+  `labels` must be float `Tensor` with values in the interval `[0, 1]`.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
+
+  The loss is the weighted sum over the input dimensions. Namely, if the input
+  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
+  `batch_size`.
 
   Args:
     weight_column: A string or a `_NumericColumn` created by
@@ -96,11 +123,11 @@ def binary_classification_head(
       generated for each threshold value. This threshold is applied to the
       logistic values to determine the binary classification (i.e., above the
       threshold is `true`, below is `false`.
-    label_vocabulary: A list of strings represents possible label values. If it
-      is not given, that means labels are already encoded within [0, 1]. If
-      given, labels must be string type and have any value in
-      `label_vocabulary`. Also there will be errors if vocabulary is not
-      provided and labels are string.
+    label_vocabulary: A list or tuple of strings representing possible label
+      values. If it is not given, labels must be float with values within
+      [0, 1]. If given, labels must be string type and have any value in
+      `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
+      is not provided but labels are strings.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
@@ -120,9 +147,22 @@ def binary_classification_head(
 def regression_head(weight_column=None,
                     label_dimension=1,
                     name=None):
-  """Creates a `_Head` for regression using the mean squared loss.
+  """Creates a `_Head` for regression using the `mean_squared_error` loss.
 
-  Uses `mean_squared_error` loss.
+  The loss is the weighted sum over all input dimensions. Namely, if the input
+  labels have shape `[batch_size, label_dimension]`, the loss is the weighted
+  sum over both `batch_size` and `label_dimension`.
+
+  The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
+  In many applications, the shape is `[batch_size, label_dimension]`.
+
+  The `labels` shape must match `logits`, namely
+  `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
+  `[D0, D1, ... DN]` is also supported.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
+  `[D0, D1, ... DN, label_dimension]`.
 
   Args:
     weight_column: A string or a `_NumericColumn` created by
@@ -156,15 +196,29 @@ def multi_label_head(n_classes,
   or more associated labels, from a discrete set. This is distinct from
   `multi_class_head` which has exactly one label per example.
 
-  Uses `sigmoid_cross_entropy` loss averaged over classes. Expects labels as a
-  multi-hot tensor of shape `[batch_size, n_classes]`, or as an integer
-  `SparseTensor` of class indices.
+  Uses `sigmoid_cross_entropy` loss average over classes and weighted sum over
+  the batch. Namely, if the input logits have shape `[batch_size, n_classes]`,
+  the loss is the average over `n_classes` and the weighted sum over
+  `batch_size`.
+
+  The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`. In many
+  applications, the shape is `[batch_size, label_n_classes]`.
+
+  Labels can be:
+  * A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
+  * An integer `SparseTensor` of class indices. The `dense_shape` must be
+    `[D0, D1, ... DN, ?]` and the values within `[0, n_classes)`.
+  * If `label_vocabulary` is given, a string `SparseTensor`. The `dense_shape`
+    must be `[D0, D1, ... DN, ?]` and the values within `label_vocabulary`.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
 
   Also supports custom `loss_fn`. `loss_fn` takes `(labels, logits)` or
   `(labels, logits, features)` as arguments and returns unreduced loss with
-  shape `[batch_size, 1]`. `loss_fn` must support indicator `labels` with shape
-  `[batch_size, n_classes]`. Namely, the head applies `label_vocabulary` to the
-  input labels before passing them to `loss_fn`.
+  shape `[D0, D1, ... DN, 1]`. `loss_fn` must support indicator `labels` with
+  shape `[D0, D1, ... DN, n_classes]`. Namely, the head applies
+  `label_vocabulary` to the input labels before passing them to `loss_fn`.
 
   Args:
     n_classes: Number of classes, must be greater than 1 (for 1 class, use
@@ -172,7 +226,8 @@ def multi_label_head(n_classes,
     weight_column: A string or a `_NumericColumn` created by
       `tf.feature_column.numeric_column` defining feature column representing
       weights. It is used to down weight or boost examples during training. It
-      will be multiplied by the loss of the example.
+      will be multiplied by the loss of the example.  Per-class weighting is
+      not supported.
     thresholds: Iterable of floats in the range `(0, 1)`. Accuracy, precision
       and recall metrics are evaluated for each threshold value. The threshold
       is applied to the predicted probabilities, i.e. above the threshold is
@@ -190,7 +245,7 @@ def multi_label_head(n_classes,
     An instance of `_Head` for multi-label classification.
 
   Raises:
-    ValueError: if `n_classes` or `thresholds` is invalid.
+    ValueError: if `n_classes`, `thresholds`, or `loss_fn` is invalid.
   """
   thresholds = tuple(thresholds) if thresholds else tuple()
   if n_classes is None or n_classes < 2:
@@ -258,26 +313,36 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
             indices=labels.indices,
             values=label_ids_values,
             dense_shape=labels.dense_shape)
+        return math_ops.to_int64(
+            sparse_ops.sparse_to_indicator(label_ids, self._n_classes))
       else:
-        label_ids = labels
-      return math_ops.to_int64(
-          sparse_ops.sparse_to_indicator(label_ids, self._n_classes))
-    msg = ('labels shape must be [batch_size, {}]. '
-           'Given: ').format(self._n_classes)
-    labels_shape = array_ops.shape(labels)
-    check_rank_op = control_flow_ops.Assert(
-        math_ops.equal(array_ops.rank(labels), 2),
-        data=[msg, labels_shape])
-    check_label_dim = control_flow_ops.Assert(
-        math_ops.equal(labels_shape[-1], self._n_classes),
-        data=[msg, labels_shape])
-    with ops.control_dependencies([check_rank_op, check_label_dim]):
-      return array_ops.identity(labels)
+        err_msg = (
+            r'labels must be an integer SparseTensor with values in '
+            r'[0, {})'.format(self._n_classes))
+        assert_int = check_ops.assert_integer(
+            labels.values, message=err_msg)
+        assert_less = check_ops.assert_less(
+            labels.values,
+            ops.convert_to_tensor(self._n_classes, dtype=labels.dtype),
+            message=err_msg)
+        assert_greater = check_ops.assert_non_negative(
+            labels.values, message=err_msg)
+        with ops.control_dependencies(
+            [assert_int, assert_less, assert_greater]):
+          return math_ops.to_int64(
+              sparse_ops.sparse_to_indicator(labels, self._n_classes))
+    err_msg = (
+        r'labels must be an integer indicator Tensor with values in [0, 1]')
+    return head_lib._assert_range(labels, 2, message=err_msg)  # pylint:disable=protected-access,
 
   def create_loss(self, features, mode, logits, labels):
     """See `Head`."""
     del mode  # Unused for this head.
+    logits = ops.convert_to_tensor(logits)
     processed_labels = self._process_labels(labels)
+    processed_labels = head_lib._check_dense_labels_match_logits_and_reshape(  # pylint:disable=protected-access
+        labels=processed_labels, logits=logits,
+        expected_labels_dimension=self.logits_dimension)
     if self._loss_fn:
       unweighted_loss = _call_loss_fn(
           loss_fn=self._loss_fn, labels=processed_labels, logits=logits,
@@ -289,15 +354,23 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
       # Averages loss over classes.
       unweighted_loss = math_ops.reduce_mean(
           unweighted_loss, axis=-1, keep_dims=True)
-    return head_lib.LossAndLabels(
-        unweighted_loss=unweighted_loss,
+    weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
+        features=features, weight_column=self._weight_column, logits=logits)
+    weighted_sum_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+    # _weights() can return 1.
+    example_weight_sum = math_ops.reduce_sum(
+        weights * array_ops.ones_like(unweighted_loss))
+    return head_lib.LossSpec(
+        weighted_sum_loss=weighted_sum_loss,
+        example_weight_sum=example_weight_sum,
         processed_labels=processed_labels)
 
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
     """See `Head`."""
     with ops.name_scope(self._name, 'head'):
-      logits = head_lib._check_logits(logits, self.logits_dimension)  # pylint:disable=protected-access
+      logits = head_lib._check_logits_final_dim(logits, self.logits_dimension)  # pylint:disable=protected-access
 
       # Predict.
       pred_keys = prediction_keys.PredictionKeys
@@ -321,22 +394,24 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
                     export_output.PredictOutput(predictions))
             })
 
+      (weighted_sum_loss, example_weight_sum,
+       processed_labels) = self.create_loss(
+           features=features, mode=mode, logits=logits, labels=labels)
+
       # Eval.
-      unweighted_loss, processed_labels = self.create_loss(
-          features=features, mode=mode, logits=logits, labels=labels)
-      weights = head_lib._weights(features, self._weight_column)  # pylint:disable=protected-access
-      training_loss = losses.compute_weighted_loss(
-          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
       if mode == model_fn.ModeKeys.EVAL:
+        weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
+            features=features, weight_column=self._weight_column, logits=logits)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=weighted_sum_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=processed_labels,
                 probabilities=probabilities,
                 weights=weights,
-                unweighted_loss=unweighted_loss))
+                weighted_sum_loss=weighted_sum_loss,
+                example_weight_sum=example_weight_sum))
 
       # Train.
       if train_op_fn is None:
@@ -344,37 +419,43 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
     with ops.name_scope(''):
       summary.scalar(
           head_lib._summary_key(self._name, metric_keys.MetricKeys.LOSS),  # pylint:disable=protected-access
-          training_loss)
+          weighted_sum_loss)
       summary.scalar(
           head_lib._summary_key(  # pylint:disable=protected-access
               self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
+          weighted_sum_loss / example_weight_sum)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=weighted_sum_loss,
+        train_op=train_op_fn(weighted_sum_loss))
 
-  def _eval_metric_ops(self, labels, probabilities, weights, unweighted_loss):
+  def _eval_metric_ops(self, labels, probabilities, weights, weighted_sum_loss,
+                       example_weight_sum):
     """Returns a dict of metrics for eval_metric_ops."""
     with ops.name_scope(
-        None, 'metrics', [labels, probabilities, weights, unweighted_loss]):
+        None, 'metrics',
+        [labels, probabilities, weights, weighted_sum_loss, example_weight_sum
+        ]):
       keys = metric_keys.MetricKeys
       metric_ops = {
           # Estimator already adds a metric for loss.
           head_lib._summary_key(self._name, keys.LOSS_MEAN):  # pylint:disable=protected-access
               metrics_lib.mean(
-                  unweighted_loss, weights=weights, name=keys.LOSS_MEAN),
+                  # Both values and weights here are reduced, scalar Tensors.
+                  # values is the actual mean we want, but we pass the scalar
+                  # example_weight_sum in order to return the correct update_op
+                  # alongside the value_op for streaming metrics.
+                  values=(weighted_sum_loss / example_weight_sum),
+                  weights=example_weight_sum,
+                  name=keys.LOSS_MEAN),
           head_lib._summary_key(self._name, keys.AUC):  # pylint:disable=protected-access
-              metrics_lib.auc(
-                  labels=labels, predictions=probabilities, weights=weights,
-                  name=keys.AUC),
+              metrics_lib.auc(labels=labels, predictions=probabilities,
+                              weights=weights, name=keys.AUC),
           head_lib._summary_key(self._name, keys.AUC_PR):  # pylint:disable=protected-access
-              metrics_lib.auc(
-                  labels=labels, predictions=probabilities, weights=weights,
-                  curve='PR', name=keys.AUC_PR),
+              metrics_lib.auc(labels=labels, predictions=probabilities,
+                              weights=weights, curve='PR',
+                              name=keys.AUC_PR),
       }
       for threshold in self._thresholds:
         accuracy_key = keys.ACCURACY_AT_THRESHOLD % threshold
@@ -453,4 +534,3 @@ def _call_loss_fn(loss_fn, labels, logits, features):
           loss_shape])
   with ops.control_dependencies([check_shape_op]):
     return array_ops.identity(unweighted_loss)
-
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index db7d96d508649f93c23b55504088551747f15a26..d1cf9090048470181818c573647923c9f5824dfa 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -226,7 +226,7 @@ class MultiLabelHead(test.TestCase):
 
   def test_weight_should_not_impact_prediction(self):
     n_classes = 4
-    head = head_lib.multi_label_head(n_classes, weight_column='label_weights')
+    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
     self.assertEqual(n_classes, head.logits_dimension)
 
     logits = np.array(
@@ -237,7 +237,7 @@ class MultiLabelHead(test.TestCase):
     spec = head.create_estimator_spec(
         features={
             'x': np.array(((42,),), dtype=np.int32),
-            'label_weights': weights_2x1,
+            'example_weights': weights_2x1,
         },
         mode=model_fn.ModeKeys.PREDICT,
         logits=logits)
@@ -262,17 +262,17 @@ class MultiLabelHead(test.TestCase):
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
-    expected_unweighted_loss = _sigmoid_cross_entropy(
-        labels=labels, logits=logits)
-    actual_unweighted_loss, _ = head.create_loss(
+    expected_weighted_sum_loss = np.sum(
+        _sigmoid_cross_entropy(labels=labels, logits=logits))
+    actual_weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(
-          expected_unweighted_loss, actual_unweighted_loss.eval())
+      self.assertAllClose(expected_weighted_sum_loss,
+                          actual_weighted_sum_loss.eval())
 
   def test_eval_create_loss_large_logits(self):
     """Tests head.create_loss for eval mode and large logits."""
@@ -286,17 +286,19 @@ class MultiLabelHead(test.TestCase):
     # For large logits, this is approximated as:
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits
-    expected_unweighted_loss = np.array(
-        [[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32)
-    actual_unweighted_loss, _ = head.create_loss(
+    expected_weighted_sum_loss = np.sum(
+        np.array([[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32))
+    actual_weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unweighted_loss, actual_unweighted_loss.eval(), atol=1e-4)
+          expected_weighted_sum_loss,
+          actual_weighted_sum_loss.eval(),
+          atol=1e-4)
 
   def test_eval_create_loss_labels_wrong_shape(self):
     """Tests head.create_loss for eval mode when labels has the wrong shape."""
@@ -305,23 +307,26 @@ class MultiLabelHead(test.TestCase):
 
     logits = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
-    actual_unweighted_loss, _ = head.create_loss(
+    actual_weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
-          r'labels shape must be \[batch_size, 2\]\. Given: \] \[2 1\]'):
-        actual_unweighted_loss.eval(
-            {labels_placeholder: np.array([[1], [1]], dtype=np.int64)})
+          r'\[expected_labels_shape: \] \[2 2\] \[labels_shape: \] \[2 1\]'):
+        actual_weighted_sum_loss.eval({
+            labels_placeholder: np.array([[1], [1]], dtype=np.int64)
+        })
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
-          r'labels shape must be \[batch_size, 2\]\. Given: \] \[2\]'):
-        actual_unweighted_loss.eval(
-            {labels_placeholder: np.array([1, 1], dtype=np.int64)})
+          r'labels shape must be \[D0, D1, ... DN, 2\]\..*'
+          r'\[Received shape: \] \[2\]'):
+        actual_weighted_sum_loss.eval({
+            labels_placeholder: np.array([1, 1], dtype=np.int64)
+        })
 
   def test_eval_create_loss_loss_fn(self):
     """Tests head.create_loss for eval mode and custom loss_fn."""
@@ -339,14 +344,14 @@ class MultiLabelHead(test.TestCase):
         return constant_op.constant(loss)
     head = head_lib.multi_label_head(n_classes=2, loss_fn=_loss_fn)
 
-    actual_unweighted_loss, _ = head.create_loss(
+    actual_weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_input,
-        labels=labels_input)
+        labels=labels_input)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(loss, actual_unweighted_loss.eval())
+      self.assertAllClose(np.sum(loss), actual_weighted_sum_loss.eval())
 
   def test_eval_create_loss_loss_fn_wrong_shape(self):
     """Tests custom loss_fn that returns Tensor of unexpected shape."""
@@ -358,18 +363,18 @@ class MultiLabelHead(test.TestCase):
 
     logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
-    actual_unweighted_loss, _ = head.create_loss(
+    actual_weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42,),), dtype=np.int32)},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           r'loss_fn must return Tensor of shape \[batch_size, 1\]\. '
           r'Given: \] \[2\]'):
-        actual_unweighted_loss.eval()
+        actual_weighted_sum_loss.eval()
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -383,9 +388,11 @@ class MultiLabelHead(test.TestCase):
           logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
           labels=None)
 
-  def _test_eval(self, head, logits, labels, expected_loss, expected_metrics):
+  def _test_eval(
+      self, head, logits, labels, expected_loss, expected_metrics,
+      features=None):
     spec = head.create_estimator_spec(
-        features={'x': np.array(((42,),), dtype=np.int32)},
+        features=features or {},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
         labels=labels)
@@ -545,7 +552,7 @@ class MultiLabelHead(test.TestCase):
 
   def test_eval_with_weights(self):
     n_classes = 2
-    head = head_lib.multi_label_head(n_classes, weight_column='label_weights')
+    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
 
     logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
@@ -559,7 +566,7 @@ class MultiLabelHead(test.TestCase):
     spec = head.create_estimator_spec(
         features={
             'x': np.array([[41], [42]], dtype=np.int32),
-            'label_weights': np.array([[1.], [2.]], dtype=np.float32),
+            'example_weights': np.array([[1.], [2.]], dtype=np.float32),
         },
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
@@ -601,26 +608,39 @@ class MultiLabelHead(test.TestCase):
   def test_train_create_loss_large_logits(self):
     """Tests head.create_loss for train mode and large logits."""
     n_classes = 2
-    head = head_lib.multi_label_head(n_classes)
+    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
 
     logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
+    weights = np.array([[1.], [2.]], dtype=np.float32)
     # loss = labels * -log(sigmoid(logits)) +
     #        (1 - labels) * -log(1 - sigmoid(logits))
     # For large logits, this is approximated as:
     # loss = labels * (logits < 0) * (-logits) +
     #        (1 - labels) * (logits > 0) * logits
-    expected_unweighted_loss = np.array(
-        [[(10. + 10.) / 2.], [(15. + 0.) / 2.]], dtype=np.float32)
-    actual_unweighted_loss, _ = head.create_loss(
-        features={'x': np.array(((42,),), dtype=np.int32)},
+    expected_weighted_sum_loss = np.sum(
+        np.array(
+            [[1. * (10. + 10.) / 2.], [2. * (15. + 0.) / 2.]],
+            dtype=np.float32))
+    expected_example_weight_sum = 1. + 2.
+    actual_weighted_sum_loss, actual_example_weight_sum, _ = head.create_loss(
+        features={
+            'x': np.array(((42,),), dtype=np.int32),
+            'example_weights': weights
+        },
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels)
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unweighted_loss, actual_unweighted_loss.eval(), atol=1e-4)
+          expected_weighted_sum_loss,
+          actual_weighted_sum_loss.eval(),
+          atol=1e-4)
+      self.assertAllClose(
+          expected_example_weight_sum,
+          actual_example_weight_sum.eval(),
+          atol=1e-4)
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -638,6 +658,54 @@ class MultiLabelHead(test.TestCase):
           labels=None,
           train_op_fn=_no_op_train_fn)
 
+  def test_train_invalid_indicator_labels(self):
+    head = head_lib.multi_label_head(n_classes=2)
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    # The value 2 is outside the allowed range.
+    labels = np.array([[2, 0], [1, 1]], dtype=np.int64)
+    def _train_op_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'labels must be an integer indicator Tensor with values in '
+          r'\[0, 1\]'):
+        sess.run(spec.loss)
+
+  def test_train_invalid_sparse_labels(self):
+    head = head_lib.multi_label_head(n_classes=2)
+    logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
+    # The value 2 is outside the allowed range.
+    labels = sparse_tensor.SparseTensor(
+        values=[2, 0, 1],
+        indices=[[0, 0], [1, 0], [1, 1]],
+        dense_shape=[2, 2])
+    def _train_op_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'labels must be an integer SparseTensor with values in \[0, 2\)'):
+        sess.run(spec.loss)
+
   def _test_train(self, head, logits, labels, expected_loss):
     expected_train_result = 'my_train_op'
     def _train_op_fn(loss):
@@ -725,7 +793,7 @@ class MultiLabelHead(test.TestCase):
 
   def test_train_with_weights(self):
     n_classes = 2
-    head = head_lib.multi_label_head(n_classes, weight_column='label_weights')
+    head = head_lib.multi_label_head(n_classes, weight_column='example_weights')
 
     logits = np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)
     labels = np.array([[1, 0], [1, 1]], dtype=np.int64)
@@ -744,7 +812,7 @@ class MultiLabelHead(test.TestCase):
     spec = head.create_estimator_spec(
         features={
             'x': np.array([[41], [42]], dtype=np.int32),
-            'label_weights': np.array([[1.], [2.]], dtype=np.float32),
+            'example_weights': np.array([[1.], [2.]], dtype=np.float32),
         },
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -774,6 +842,153 @@ class MultiLabelHead(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 3,
       }, summary_str, tol)
 
+  def test_multi_dim_weighted_train_create_loss(self):
+    """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
+    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
+
+    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
+                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
+    labels = np.array([[[1, 0, 0], [1, 0, 0]],
+                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
+    #      = [[20/3, 10/3], [4, 8]]
+    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
+    expected_weighted_sum_loss = 39.6667
+    expected_example_weight_sum = np.sum(weights)
+    actual_weighted_sum_loss, actual_example_weight_sum, _ = head.create_loss(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    atol = 1.e-3
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(
+          expected_weighted_sum_loss, actual_weighted_sum_loss.eval(),
+          atol=atol)
+      self.assertAllClose(
+          expected_example_weight_sum, actual_example_weight_sum.eval(),
+          atol=atol)
+
+  def test_multi_dim_weighted_train(self):
+    """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
+    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
+
+    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
+                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
+    labels = np.array([[[1, 0, 0], [1, 0, 0]],
+                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
+    #      = [[20/3, 10/3], [4, 8]]
+    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
+    expected_loss = 39.6667
+    expected_train_result = 'my_train_op'
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=3)])
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+
+    atol = 1.e-3
+    with self.test_session() as sess:
+      _initialize_variables(self, monitored_session.Scaffold())
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, atol=atol)
+      self.assertEqual(
+          six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
+  def test_multi_dim_weights_wrong_inner_dim(self):
+    """Logits and labels of shape [2, 2, 3], weights [2, 1]."""
+    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
+
+    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
+                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
+    labels = np.array([[[1, 0, 0], [1, 0, 0]],
+                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
+    weights = np.array([[1.], [2.]], dtype=np.float32)
+    def _train_op_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
+        spec.loss.eval()
+
+  def test_multi_dim_weights_wrong_outer_dim(self):
+    """Logits and labels of shape [2, 2, 3], weights [2, 2, 3]."""
+    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
+
+    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
+                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
+    labels = np.array([[[1, 0, 0], [1, 0, 0]],
+                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
+    weights = np.array([[[1., 1., 1.], [1.5, 1.5, 1.5]],
+                        [[2., 2., 2.], [2.5, 2.5, 2.5]]], dtype=np.float32)
+    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    def _train_op_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights_placeholder},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 2 3\]'):
+        spec.loss.eval({weights_placeholder: weights})
+
+  def test_multi_dim_weighted_eval(self):
+    """Logits and labels of shape [2, 2, 3], weights [2, 2]."""
+    head = head_lib.multi_label_head(n_classes=3, weight_column='weights')
+
+    logits = np.array([[[-10., 10., -10.], [10., -10., 10.]],
+                       [[-12., 12., -12.], [12., -12., 12.]]], dtype=np.float32)
+    labels = np.array([[[1, 0, 0], [1, 0, 0]],
+                       [[0, 1, 1], [0, 1, 1]]], dtype=np.int64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = [[10 + 10 + 0, 0 + 0 + 10], [0 + 0 + 12, 12 + 12 + 0]] / 3
+    #      = [[20/3, 10/3], [4, 8]]
+    # weighted_sum_loss = 1*20/3 + 1.5*10/3 + 2*4 + 2.5*8 = 39.6667
+    expected_loss = 39.6667
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_loss / np.sum(weights),
+        # auc and auc_pr cannot be reliably calculated for only 4 samples, but
+        # this assert tests that the algorithm remains consistent.
+        keys.AUC: 0.4977,
+        keys.AUC_PR: 0.6645,
+    }
+    self._test_eval(
+        head=head,
+        features={'weights': weights},
+        logits=logits,
+        labels=labels,
+        expected_loss=expected_loss,
+        expected_metrics=expected_metrics)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/linear.py b/tensorflow/contrib/estimator/python/estimator/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf4abe83d54504d55de73b63f369cceaf149dd2
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/linear.py
@@ -0,0 +1,118 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Linear estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import linear as linear_lib
+
+
+class LinearEstimator(estimator.Estimator):
+  """An estimator for TensorFlow linear models with user-specified head.
+
+  Example:
+
+  ```python
+  categorical_column_a = categorical_column_with_hash_bucket(...)
+  categorical_column_b = categorical_column_with_hash_bucket(...)
+
+  categorical_feature_a_x_categorical_feature_b = crossed_column(...)
+
+  # Estimator using the default optimizer.
+  estimator = LinearEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3),
+      feature_columns=[categorical_column_a,
+                       categorical_feature_a_x_categorical_feature_b])
+
+  # Or estimator using the FTRL optimizer with regularization.
+  estimator = LinearEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3),
+      feature_columns=[categorical_column_a,
+                       categorical_feature_a_x_categorical_feature_b])
+      optimizer=tf.train.FtrlOptimizer(
+          learning_rate=0.1,
+          l1_regularization_strength=0.001
+      ))
+
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    ...
+  estimator.train(input_fn=input_fn_train, steps=100)
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    ...
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    ...
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+    `key=weight_column` whose value is a `Tensor`.
+  * for each `column` in `feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss and predicted output are determined by the specified head.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               head,
+               feature_columns,
+               model_dir=None,
+               optimizer='Ftrl',
+               config=None,
+               partitioner=None):
+    """Initializes a `LinearEstimator` instance.
+
+    Args:
+      head: A `_Head` instance constructed with a method such as
+        `tf.contrib.estimator.multi_label_head`.
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
+        to FTRL optimizer.
+      config: `RunConfig` object to configure the runtime settings.
+      partitioner: Optional. Partitioner for input layer.
+    """
+    def _model_fn(features, labels, mode, config):
+      return linear_lib._linear_model_fn(  # pylint: disable=protected-access
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          feature_columns=tuple(feature_columns or []),
+          optimizer=optimizer,
+          partitioner=partitioner,
+          config=config)
+    super(LinearEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63514eb688af48577f0a3b7ce9e7478309f2c30
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/linear_test.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for linear.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import head as head_lib
+from tensorflow.contrib.estimator.python.estimator import linear
+from tensorflow.python.estimator.canned import linear_testing_utils
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+
+
+def _linear_estimator_fn(
+    weight_column=None, label_dimension=1, *args, **kwargs):
+  """Returns a LinearEstimator that uses regression_head."""
+  return linear.LinearEstimator(
+      head=head_lib.regression_head(
+          weight_column=weight_column, label_dimension=label_dimension),
+      *args, **kwargs)
+
+
+class LinearEstimatorEvaluateTest(
+    linear_testing_utils.BaseLinearRegressorEvaluationTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorEvaluationTest.__init__(
+        self, _linear_estimator_fn)
+
+
+class LinearEstimatorPredictTest(
+    linear_testing_utils.BaseLinearRegressorPredictTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorPredictTest.__init__(
+        self, _linear_estimator_fn)
+
+
+class LinearEstimatorTrainTest(
+    linear_testing_utils.BaseLinearRegressorTrainingTest, test.TestCase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    test.TestCase.__init__(self, methodName)
+    linear_testing_utils.BaseLinearRegressorTrainingTest.__init__(
+        self, _linear_estimator_fn)
+
+
+class LinearEstimatorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, input_dimension,
+      label_dimension, batch_size):
+    feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))]
+    est = linear.LinearEstimator(
+        head=head_lib.regression_head(label_dimension=label_dimension),
+        feature_columns=feature_columns,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    # learn y = x
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=label_dimension,
+        label_dimension=label_dimension,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/logit_fns.py b/tensorflow/contrib/estimator/python/estimator/logit_fns.py
index 110ea0302e703fd3eecdfafea928d7ba04f07d8e..09c2862ccd3f90de4153a2095afc9c3d3f9476c1 100644
--- a/tensorflow/contrib/estimator/python/estimator/logit_fns.py
+++ b/tensorflow/contrib/estimator/python/estimator/logit_fns.py
@@ -39,6 +39,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.estimator import util
 from tensorflow.python.estimator.canned import dnn as dnn_core
 from tensorflow.python.estimator.canned import linear as linear_core
@@ -67,7 +69,8 @@ def call_logit_fn(logit_fn, features, mode, params, config):
     A logit Tensor, the output of logit_fn.
 
   Raises:
-    ValueError: if logit_fn does not return a Tensor.
+    ValueError: if logit_fn does not return a Tensor or a dictionary mapping
+      strings to Tensors.
   """
   logit_fn_args = util.fn_args(logit_fn)
   kwargs = {}
@@ -79,7 +82,15 @@ def call_logit_fn(logit_fn, features, mode, params, config):
     kwargs['config'] = config
   logit_fn_results = logit_fn(features=features, **kwargs)
 
-  if not isinstance(logit_fn_results, ops.Tensor):
-    raise ValueError('model_fn should return a Tensor.')
+  result_is_valid_dictionary = (
+      isinstance(logit_fn_results, dict) and
+      all([(isinstance(k, six.string_types) and isinstance(v, ops.Tensor))
+           for k, v in six.iteritems(logit_fn_results)]))
+  result_is_tensor = isinstance(logit_fn_results, ops.Tensor)
+
+  if not (result_is_valid_dictionary or result_is_tensor):
+    raise ValueError('logit_fn should return a Tensor or a dictionary mapping '
+                     'strings to Tensors.  logit_fn returned: %s' %
+                     logit_fn_results)
 
   return logit_fn_results
diff --git a/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py b/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py
index d75eada798dcdf929e4094258ecdc6ce394f847c..074ece6cca2865b9057ab5ce874a210d3d9ac2e0 100644
--- a/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/logit_fns_test.py
@@ -43,22 +43,53 @@ class LogitFnTest(test.TestCase):
     with session.Session():
       self.assertAllClose([[4., 5.]], logit_fn_result.eval())
 
-  def test_should_return_tensor(self):
+  def test_simple_call_multi_logit_fn(self):
+
+    def dummy_logit_fn(features):
+      return {u'head1': features['f1'], 'head2': features['f2']}
+
+    features = {
+        'f1': constant_op.constant([[2., 3.]]),
+        'f2': constant_op.constant([[4., 5.]])
+    }
+    logit_fn_result = logit_fns.call_logit_fn(dummy_logit_fn, features,
+                                              model_fn.ModeKeys.TRAIN,
+                                              'fake_params', 'fake_config')
+    with session.Session():
+      self.assertAllClose([[2., 3.]], logit_fn_result['head1'].eval())
+      self.assertAllClose([[4., 5.]], logit_fn_result['head2'].eval())
+
+  def test_invalid_logit_fn_results(self):
 
     def invalid_logit_fn(features, params):
-      return {
-          'tensor1': features['f1'] * params['input_multiplier'],
-          'tensor2': features['f2'] * params['input_multiplier']
-      }
+      return [
+          features['f1'] * params['input_multiplier'],
+          features['f2'] * params['input_multiplier']
+      ]
+
     features = {
         'f1': constant_op.constant([[2., 3.]]),
         'f2': constant_op.constant([[4., 5.]])
     }
     params = {'learning_rate': 0.001, 'input_multiplier': 2.0}
-    with self.assertRaisesRegexp(ValueError, 'model_fn should return a Tensor'):
+    with self.assertRaisesRegexp(
+        ValueError, 'logit_fn should return a Tensor or a dictionary mapping '
+                    'strings to Tensors'):
       logit_fns.call_logit_fn(invalid_logit_fn, features, 'fake_mode', params,
                               'fake_config')
 
+  def test_invalid_logit_fn_results_dict(self):
+
+    def invalid_logit_fn(features):
+      return {'head1': features['f1'], 'head2': features['f2']}
+
+    features = {'f1': constant_op.constant([[2., 3.]]), 'f2': 'some string'}
+    with self.assertRaisesRegexp(
+        ValueError, 'logit_fn should return a Tensor or a dictionary mapping '
+                    'strings to Tensors'):
+      logit_fns.call_logit_fn(invalid_logit_fn, features, 'fake_mode',
+                              'fake_params', 'fake_config')
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index 64b2a9dee83801b5d6d852a3485fc0cc81417ff0..f2a6eae03ec021e5c28d48b3887870d8a057e077 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -22,10 +22,14 @@ import six
 
 from tensorflow.python.estimator import model_fn
 from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import metric_keys
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.summary import summary
 
 
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
@@ -72,6 +76,23 @@ def multi_head(heads, head_weights=None):
   estimator.train(input_fn=input_fn, steps=100)
   ```
 
+  Also supports `logits` as a `Tensor` of shape
+  `[D0, D1, ... DN, logits_dimension]`. It will split the `Tensor` along the
+  last dimension and distribute it appropriately among the heads. E.g.:
+
+  ```python
+  def model_fn(features, labels, mode):
+    # Create simple heads and specify head name.
+    head1 = multi_class_head(n_classes=3, name='head1')
+    head2 = binary_classification_head(name='head2')
+    # Create multi-head from two simple heads.
+    head = multi_head([head1, head2])
+    # Create logits for the multihead.
+    logits = logit_fn(logits_dimension=head.logits_dimension)
+    # Return the merged EstimatorSpec
+    return head.create_estimator_spec(..., logits=logits, ...)
+  ```
+
   Args:
     heads: List or tuple of `_Head` instances. All heads must have `name`
       specified. The first head in the list is the default used at serving time.
@@ -161,14 +182,53 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
 
   def create_loss(self, features, mode, logits, labels):
     """See `Head`."""
-    # TODO(roumposg): Implement it.
-    raise NotImplementedError('create_loss not yet implemented for MultiHead.')
+    if isinstance(logits, dict):
+      logits_dict = logits
+    else:
+      logits_dict = self._split_logits(logits)
+    weighted_sum_losses = []
+    example_weight_sums = []
+    labels_by_head = {}
+    for head in self._heads:
+      (weighted_sum_loss,
+       example_weight_sum, processed_labels) = head.create_loss(
+           features, mode, logits_dict[head.name], labels[head.name])
+      weighted_sum_losses.append(weighted_sum_loss)
+      example_weight_sums.append(example_weight_sum)
+      labels_by_head[head.name] = processed_labels
+
+    weighted_sum_losses = tuple(weighted_sum_losses)
+    with ops.name_scope('merge_losses',
+                        values=weighted_sum_losses + (self._head_weights or
+                                                      tuple())):
+      if self._head_weights:
+        head_weighted_losses = []
+        head_weighted_example_weight_sums = []
+        for loss, example_weight_sum, weight in zip(weighted_sum_losses,
+                                                    example_weight_sums,
+                                                    self._head_weights):
+          head_weighted_losses.append(math_ops.multiply(loss, weight))
+          head_weighted_example_weight_sums.append(math_ops.multiply(
+              example_weight_sum, weight))
+        merged_weighted_sum_loss = math_ops.add_n(head_weighted_losses)
+        merged_example_weight_sum = math_ops.add_n(
+            head_weighted_example_weight_sums)
+      else:
+        merged_weighted_sum_loss = math_ops.add_n(weighted_sum_losses)
+        merged_example_weight_sum = math_ops.add_n(example_weight_sums)
+
+    return head_lib.LossSpec(
+        weighted_sum_loss=merged_weighted_sum_loss,
+        example_weight_sum=merged_example_weight_sum,
+        processed_labels=labels_by_head)
 
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
     """See `_Head`."""
-    if not isinstance(logits, dict):
-      raise ValueError('logits must be a dict. Given: {}'.format(logits))
+    if isinstance(logits, dict):
+      logits_dict = logits
+    else:
+      logits_dict = self._split_logits(logits)
     if labels and not isinstance(labels, dict):
       raise ValueError('labels must be a dict. Given: {}'.format(labels))
 
@@ -179,20 +239,42 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
           head.create_estimator_spec(
               features=features,
               mode=mode,
-              logits=logits[head_name],
+              logits=logits_dict[head_name],
               labels=labels[head_name] if labels else None,
               train_op_fn=_no_op_train_fn))
 
     if mode == model_fn.ModeKeys.TRAIN:
       if train_op_fn is None:
         raise ValueError('train_op_fn can not be None in TRAIN mode.')
-      return self._merge_train(all_estimator_spec, train_op_fn)
+      spec = self._merge_train(all_estimator_spec, train_op_fn)
+      with ops.name_scope(''):
+        summary.scalar(metric_keys.MetricKeys.LOSS, spec.loss)
+      return spec
     if mode == model_fn.ModeKeys.PREDICT:
       return self._merge_predict(all_estimator_spec)
     if mode == model_fn.ModeKeys.EVAL:
       return self._merge_eval(all_estimator_spec)
     raise ValueError('mode={} unrecognized'.format(mode))
 
+  def _split_logits(self, logits):
+    """Splits logits along the last dimension and returns a dict."""
+    logits_dict = {}
+    with ops.name_scope(None, 'split_logits', values=[logits]):
+      logits = ops.convert_to_tensor(logits)
+      batch_shape = array_ops.shape(logits)[:-1]
+      zeros_like_batch_shape = array_ops.zeros_like(batch_shape)
+      minus_ones_like_batch_shape = -1 * array_ops.ones_like(batch_shape)
+      begin_idx = 0
+      for head in self._heads:
+        begin_tensor = array_ops.concat(
+            [zeros_like_batch_shape, [begin_idx]], axis=0)
+        size_tensor = array_ops.concat(
+            [minus_ones_like_batch_shape, [head.logits_dimension]], axis=0)
+        logits_dict[head.name] = array_ops.slice(
+            logits, begin=begin_tensor, size=size_tensor)
+        begin_idx += head.logits_dimension
+    return logits_dict
+
   def _merge_train(self, all_estimator_spec, train_op_fn):
     """Merges list of `EstimatorSpec` for training.
 
@@ -261,14 +343,19 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
     predictions = {}
     metrics = {}
     losses = []
-    for head, spec in zip(self._heads, all_estimator_spec):
-      losses.append(spec.loss)
-      head_name = head.name
-      # Metric keys already contain head.name.
-      metrics.update(spec.eval_metric_ops or {})
-      for k, v in six.iteritems(spec.predictions):
-        predictions[(head_name, k)] = v
-    loss = _merge_losses(losses, self._head_weights)
+    with ops.name_scope('merge_eval'):
+      for head, spec in zip(self._heads, all_estimator_spec):
+        losses.append(spec.loss)
+        head_name = head.name
+        # Loss metric is not added by default.
+        loss_name = head_lib._summary_key(  # pylint:disable=protected-access
+            head_name, metric_keys.MetricKeys.LOSS)
+        metrics[loss_name] = metrics_lib.mean(spec.loss, name=loss_name)
+        # Metric keys already contain head.name.
+        metrics.update(spec.eval_metric_ops or {})
+        for k, v in six.iteritems(spec.predictions):
+          predictions[(head_name, k)] = v
+      loss = _merge_losses(losses, self._head_weights)
 
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.EVAL,
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 48027035cecffc3ce8aacf8ae917f5eb9e9b2473..68f2d5d1cd53456f7dd82222e171b3619052321a 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -106,7 +106,8 @@ class MultiHeadTest(test.TestCase):
     multi_head = multi_head_lib.multi_head([head1, head2])
     self.assertEqual('head1_head2', multi_head.name)
 
-  def test_predict_two_heads(self):
+  def test_predict_two_heads_logits_dict(self):
+    """Tests predict with logits as dict."""
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     head2 = head_lib.multi_label_head(n_classes=3, name='head2')
     multi_head = multi_head_lib.multi_head([head1, head2])
@@ -158,6 +159,111 @@ class MultiHeadTest(test.TestCase):
           expected_probabilities['head2'],
           sess.run(spec.export_outputs['head2'].scores))
 
+  def test_predict_two_heads_logits_tensor(self):
+    """Tests predict with logits as Tensor."""
+    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
+    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
+    multi_head = multi_head_lib.multi_head([head1, head2])
+
+    logits = np.array(
+        [[-1., 1., 2., -2., 2.], [-1.5, 1., -3., 2., -2.]], dtype=np.float32)
+    expected_logits1 = np.array([[-1., 1.], [-1.5, 1.]], dtype=np.float32)
+    expected_logits2 = np.array([[2., -2., 2.], [-3., 2., -2.]],
+                                dtype=np.float32)
+    expected_probabilities = {
+        'head1': _sigmoid(expected_logits1),
+        'head2': _sigmoid(expected_logits2),
+    }
+
+    spec = multi_head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    self.assertItemsEqual(
+        (_DEFAULT_SERVING_KEY, 'head1', 'classification/head1', 'predict/head1',
+         'head2', 'classification/head2', 'predict/head2'),
+        spec.export_outputs.keys())
+
+    # Assert predictions and export_outputs.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      predictions = sess.run(spec.predictions)
+      self.assertAllClose(
+          expected_logits1,
+          predictions[('head1', prediction_keys.PredictionKeys.LOGITS)])
+      self.assertAllClose(
+          expected_logits2,
+          predictions[('head2', prediction_keys.PredictionKeys.LOGITS)])
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          predictions[('head1', prediction_keys.PredictionKeys.PROBABILITIES)])
+      self.assertAllClose(
+          expected_probabilities['head2'],
+          predictions[('head2', prediction_keys.PredictionKeys.PROBABILITIES)])
+
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].scores))
+      self.assertAllClose(
+          expected_probabilities['head1'],
+          sess.run(spec.export_outputs['head1'].scores))
+      self.assertAllClose(
+          expected_probabilities['head2'],
+          sess.run(spec.export_outputs['head2'].scores))
+
+  def test_predict_two_heads_logits_tensor_multi_dim(self):
+    """Tests predict with multi-dimensional logits of shape [2, 2, 5]."""
+    head1 = head_lib.regression_head(label_dimension=2, name='head1')
+    head2 = head_lib.regression_head(label_dimension=3, name='head2')
+    multi_head = multi_head_lib.multi_head([head1, head2])
+
+    logits = np.array(
+        [[[-1., 1., 2., -2., 2.], [-1., 1., 2., -2., 2.]],
+         [[-1.5, 1., -3., 2., -2.], [-1.5, 1., -3., 2., -2.]]],
+        dtype=np.float32)
+    expected_logits1 = np.array(
+        [[[-1., 1.], [-1., 1.]],
+         [[-1.5, 1.], [-1.5, 1.]]],
+        dtype=np.float32)
+    expected_logits2 = np.array(
+        [[[2., -2., 2.], [2., -2., 2.]],
+         [[-3., 2., -2.], [-3., 2., -2.]]],
+        dtype=np.float32)
+
+    spec = multi_head.create_estimator_spec(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.PREDICT,
+        logits=logits)
+
+    self.assertItemsEqual(
+        (_DEFAULT_SERVING_KEY, 'head1', 'regression/head1', 'predict/head1',
+         'head2', 'regression/head2', 'predict/head2'),
+        spec.export_outputs.keys())
+
+    # Assert predictions and export_outputs.
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      self.assertIsNone(spec.scaffold.summary_op)
+      predictions = sess.run(spec.predictions)
+      self.assertAllClose(
+          expected_logits1,
+          predictions[('head1', prediction_keys.PredictionKeys.PREDICTIONS)])
+      self.assertAllClose(
+          expected_logits2,
+          predictions[('head2', prediction_keys.PredictionKeys.PREDICTIONS)])
+
+      self.assertAllClose(
+          expected_logits1,
+          sess.run(spec.export_outputs[_DEFAULT_SERVING_KEY].value))
+      self.assertAllClose(
+          expected_logits1,
+          sess.run(spec.export_outputs['head1'].value))
+      self.assertAllClose(
+          expected_logits2,
+          sess.run(spec.export_outputs['head2'].value))
+
   def test_eval_two_heads_with_weights(self):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
     head2 = head_lib.multi_label_head(n_classes=3, name='head2')
@@ -178,7 +284,7 @@ class MultiHeadTest(test.TestCase):
     #        (1 - labels) * (logits > 0) * logits =>
     # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
     # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
-    # Average over classes, weighted sum ober batch and heads.
+    # Average over classes, weighted sum over batch and heads.
     expected_loss_head1 = 17.5
     expected_loss_head2 = 30.0
     expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
@@ -191,6 +297,8 @@ class MultiHeadTest(test.TestCase):
 
     keys = metric_keys.MetricKeys
     expected_metrics = {
+        keys.LOSS + '/head1': expected_loss_head1,
+        keys.LOSS + '/head2': expected_loss_head2,
         # Average loss over examples.
         keys.LOSS_MEAN + '/head1': expected_loss_head1 / 2,
         keys.LOSS_MEAN + '/head2': expected_loss_head2 / 2,
@@ -231,18 +339,25 @@ class MultiHeadTest(test.TestCase):
 
     logits = {'head1': np.array([[-10., 10.], [-15., 10.]], dtype=np.float32)}
     labels = {'head1': np.array([[1, 0], [1, 1]], dtype=np.int64)}
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r'create_loss not yet implemented for MultiHead\.'):
-      multi_head.create_loss(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=logits,
-          labels=labels)
+    loss = multi_head.create_loss(
+        features={'x': np.array(((42,),), dtype=np.int32)},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)[0]
+    tol = 1e-3
+    with self.test_session():
+      # Unreduced loss of the head is [[(10 + 10) / 2], (15 + 0) / 2]
+      # (averaged over classes, sum-reduced over examples).
+      self.assertAllClose(17.5, loss.eval(), rtol=tol, atol=tol)
 
   def test_train_create_loss_two_heads_with_weights(self):
-    head1 = head_lib.multi_label_head(n_classes=2, name='head1')
-    head2 = head_lib.multi_label_head(n_classes=3, name='head2')
+    # Use different example weighting for each head weighting.
+    weights1 = np.array([[1.], [2.]], dtype=np.float32)
+    weights2 = np.array([[2.], [3.]])
+    head1 = head_lib.multi_label_head(n_classes=2, name='head1',
+                                      weight_column='weights1')
+    head2 = head_lib.multi_label_head(n_classes=3, name='head2',
+                                      weight_column='weights2')
     multi_head = multi_head_lib.multi_head(
         [head1, head2], head_weights=[1., 2.])
 
@@ -255,14 +370,105 @@ class MultiHeadTest(test.TestCase):
         'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
         'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
     }
-    with self.assertRaisesRegexp(
-        NotImplementedError,
-        r'create_loss not yet implemented for MultiHead\.'):
-      multi_head.create_loss(
-          features={'x': np.array(((42,),), dtype=np.int32)},
-          mode=model_fn.ModeKeys.TRAIN,
-          logits=logits,
-          labels=labels)
+    weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
+        features={
+            'x': np.array(((42,),), dtype=np.int32),
+            'weights1': weights1,
+            'weights2': weights2
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    tol = 1e-3
+    with self.test_session():
+      # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
+      # = [10, 7.5]
+      # weighted_sum_loss = 1 * 10 + 2 * 7.5 = 25
+      # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
+      # = [20, 10]
+      # weighted_sum_loss = 2 * 20 + 3 * 10 = 70
+      # head-weighted merge = 1 * 25 + 2 * 70 = 165
+      self.assertAllClose(165, weighted_sum_loss.eval(), rtol=tol, atol=tol)
+      # example_weight_sum = 1 * (1 + 2) + 2 * (2 + 3) = 13
+      self.assertAllClose(13., example_weight_sum.eval(), rtol=tol, atol=tol)
+
+  def test_train_create_loss_logits_tensor(self):
+    """Tests create_loss with logits Tensor."""
+    weights1 = np.array([[1.], [2.]], dtype=np.float32)
+    weights2 = np.array([[2.], [3.]])
+    head1 = head_lib.multi_label_head(n_classes=2, name='head1',
+                                      weight_column='weights1')
+    head2 = head_lib.multi_label_head(n_classes=3, name='head2',
+                                      weight_column='weights2')
+    multi_head = multi_head_lib.multi_head(
+        [head1, head2], head_weights=[1., 2.])
+
+    logits = np.array([[-10., 10., 20., -20., 20.],
+                       [-15., 10., -30., 20., -20.]], dtype=np.float32)
+    labels = {
+        'head1': np.array([[1, 0], [1, 1]], dtype=np.int64),
+        'head2': np.array([[0, 1, 0], [1, 1, 0]], dtype=np.int64),
+    }
+    weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
+        features={
+            'x': np.array(((42,),), dtype=np.int32),
+            'weights1': weights1,
+            'weights2': weights2
+        },
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    tol = 1e-3
+    with self.test_session():
+      # loss of the first head is [[(10 + 10) / 2], [(15 + 0) / 2]]
+      # = [10, 7.5]
+      # weighted_sum_loss = 1 * 10 + 2 * 7.5 = 25
+      # loss of the second head is [[(20 + 20 + 20) / 3], [(30 + 0 + 0) / 3]]
+      # = [20, 10]
+      # weighted_sum_loss = 2 * 20 + 3 * 10 = 70
+      # head-weighted merge = 1 * 25 + 2 * 70 = 165
+      self.assertAllClose(165, weighted_sum_loss.eval(), rtol=tol, atol=tol)
+      # example_weight_sum = 1 * (1 + 2) + 2 * (2 + 3) = 13
+      self.assertAllClose(13., example_weight_sum.eval(), rtol=tol, atol=tol)
+
+  def test_train_create_loss_logits_tensor_multi_dim(self):
+    """Tests create_loss with multi-dimensional logits of shape [2, 2, 5]."""
+    head1 = head_lib.regression_head(label_dimension=2, name='head1')
+    head2 = head_lib.regression_head(label_dimension=3, name='head2')
+    multi_head = multi_head_lib.multi_head([head1, head2])
+
+    logits = np.array(
+        [[[-1., 1., 2., -2., 2.], [-1., 1., 2., -2., 2.]],
+         [[-1.5, 1.5, -2., 2., -2.], [-1.5, 1.5, -2., 2., -2.]]],
+        dtype=np.float32)
+    labels = {
+        'head1': np.array([[[1., 0.], [1., 0.]],
+                           [[1.5, 1.5], [1.5, 1.5]]], dtype=np.float32),
+        'head2': np.array([[[0., 1., 0.], [0., 1., 0.]],
+                           [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
+    }
+    # Loss for the first head:
+    # loss1 = (1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
+    #         (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2
+    #       = 28
+    # Loss for the second head:
+    # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
+    #         (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
+    #       = 74
+    expected_weighted_sum_loss = 28. + 74.
+
+    weighted_sum_loss, example_weight_sum, _ = multi_head.create_loss(
+        features={},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    tol = 1e-3
+    with self.test_session():
+      self.assertAllClose(
+          expected_weighted_sum_loss, weighted_sum_loss.eval(),
+          rtol=tol, atol=tol)
+      self.assertAllClose(
+          2. * 2. * 5., example_weight_sum.eval(), rtol=tol, atol=tol)
 
   def test_train_one_head(self):
     head1 = head_lib.multi_label_head(n_classes=2, name='head1')
@@ -307,6 +513,7 @@ class MultiHeadTest(test.TestCase):
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
       _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
           metric_keys.MetricKeys.LOSS + '/head1': expected_loss,
           # Average loss over examples.
           metric_keys.MetricKeys.LOSS_MEAN + '/head1': expected_loss / 2,
@@ -332,7 +539,7 @@ class MultiHeadTest(test.TestCase):
     #        (1 - labels) * (logits > 0) * logits =>
     # head1: expected_unweighted_loss = [[10., 10.], [15., 0.]]
     # head2: expected_unweighted_loss = [[20., 20., 20.], [30., 0., 0]]
-    # Average over classes, weighted sum ober batch and heads.
+    # Average over classes, weighted sum over batch and heads.
     expected_loss_head1 = 17.5
     expected_loss_head2 = 30.0
     expected_loss = 1. * expected_loss_head1 + 2. * expected_loss_head2
@@ -367,6 +574,7 @@ class MultiHeadTest(test.TestCase):
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
       _assert_simple_summaries(self, {
+          metric_keys.MetricKeys.LOSS: expected_loss,
           metric_keys.MetricKeys.LOSS + '/head1': expected_loss_head1,
           metric_keys.MetricKeys.LOSS + '/head2': expected_loss_head2,
           # Average loss over examples.
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca3a2394ee227f2ab78e6d4d3d882f2b10954699
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py
@@ -0,0 +1,529 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to replicate model_fn's over local GPUs.
+
+This file contains util that allow to replicate `Estimator.model_fn` over
+GPUs.  Replicated version of a `model_fn` is returned that can subsequently
+be used with `Estimator`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import six
+
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.client import device_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator import util
+from tensorflow.python.estimator.export import export_output as export_output_lib
+from tensorflow.python.framework import device as framework_device
+from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import device_setter as device_setter_lib
+from tensorflow.python.training import training_util
+
+
+def replicate_model_fn(model_fn, optimizer_fn, devices=None):
+  """Replicate `Estimator.model_fn` over GPUs within a single host.
+
+  The given `model_fn` specifies a single forward pass of a model.  To replicate
+  such a model over GPUs, each GPU gets its own instance of the forward pass
+  (a.k.a. a tower).  The input features and labels get sharded into the chunks
+  that correspond to the number of GPUs.  Each tower computes its own loss based
+  on its input.  For each such loss, gradients are computed.  After that, the
+  available losses are summed to form aggregated loss.  The available
+  gradients are summed too.  Then, they update weights using the specified
+  optimizer.
+
+  If `devices` are `None`, then all available GPUs are going to be used for
+  replication.  If no GPUs are available, then the model is going to be
+  placed on the CPU.
+
+  Two modes of local replication over available GPUs are supported:
+    1)  If exactly 1 GPU is detected, then variables and operations are placed
+        onto GPU.
+    2)  If more than 1 GPU is detected, then variables are going to be placed on
+        the CPU.  Replicas of operations are placed on each individual GPU.
+
+  Here is an example of how one might use their `model_fn` to run over GPUs:
+    ```python
+       def optimizer_fn():
+         return tf.train.GradientDescentOptimizer(learning_rate=0.001)
+       ...
+       def model_fn(...):  # See `model_fn` in `Estimator`.
+         loss = ...
+         if mode == tf.estimator.ModeKeys.TRAIN:
+           #  See the section below on `EstimatorSpec.train_op`.
+           return EstimatorSpec(mode=mode, loss=loss, train_op=tf.noop())
+
+         #  No change for `ModeKeys.EVAL` or `ModeKeys.PREDICT`.
+         return EstimatorSpec(...)
+       ...
+       classifier = tf.estimator.Estimator(
+         model_fn=replicate_model_fn.replicate_model_fn(model_fn, optimizer_fn))
+    ```
+
+  On `EstimatorSpec.train_op`:
+  `model_fn` returns `EstimatorSpec.train_op` for
+  `tf.estimator.GraphKeys.TRAIN`. It is typically derived using an optimizer.
+  `replicate_model_fn` ignores the returned `EstimatorSpec.train_op`, so there
+  is no need to use an optimizer inside the user's `model_fn`.  The
+  `EstimatorSpec.loss` subgraph is going to be executed, while
+  `EstimatorSpec.train_op` isn't going to be executed. One could pass
+  `train_op=tf.noop()` to `EstimatorSpec`.
+
+  On sharding input features and labels:
+  Input features and labels are split for consumption by each tower. They are
+  split across the dimension 0.  Features and labels need to be batch major.
+
+  On reduction algorithms:
+  Certain algorithms were chosen for aggregating results of computations on
+  multiple towers:
+    - Losses from all towers are reduced using sum.
+    - Gradients are reduced using sum for each trainable variable.
+    - `eval_metrics_ops` are reduced per metric using `reduce_mean`.
+    - `EstimatorSpec.predictions` and `EstimatorSpec.export_outputs` are
+      reduced using concatenation.
+    - For all other fields of `EstimatorSpec` the values of the first tower
+      are taken.
+
+  On distribution of variables:
+  Variables are not duplicated between towers.  Instead, they are placed on a
+  single device as defined above and shared across towers.
+
+  Other current limitations:
+    - `predictions` are not supported for `ModeKeys.EVAL`.  That is required for
+      `tf.contrib.estimator.add_metrics`.
+
+  Args:
+    model_fn: `model_fn` as defined in `Estimator`.  See the section above about
+      the train_op argument of `EstimatorSpec`.
+    optimizer_fn: a function that returns an optimizer instance.  The function
+      may accept one `params` argument.  This is the `params` argument as
+      defined by `Estimator`.  See  the `Estimator` documentation for details.
+    devices: Optional list of devices to replicate the model across.  This
+      argument can be used to replice only on the subset of available GPUs.
+      If `None`, then all available GPUs are going to be used for replication.
+      If no GPUs are available, then the model is going to be placed on the CPU.
+
+  Returns:
+    A replicated version of the supplied `model_fn`. Returned function that
+      conforms to the requirements of `Estimator`'s `model_fn` and can be used
+      instead of the supplied `model_fn`.
+  """
+  return _replicate_model_fn_with_mode(
+      model_fn,
+      optimizer_fn,
+      devices,
+      # TODO(isaprykin): Query system configuration to choose modes other than
+      # `SHARED_LOCAL_PARAMETER_SERVER`, even though it is often appropriate.
+      mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER)
+
+
+class _VariableDistributionMode(object):
+  """Modes for variable distribution used for forcing a particular one.
+
+  Forcing a mode is meant for performance experimentation purposes rather than
+  for general use cases.
+  """
+
+  SHARED_LOCAL_PARAMETER_SERVER = 1
+  """Variables are placed on a single device and shared across all devices.
+
+  Two ways to achieve this distribution over available GPUs are supported:
+    1)  If exactly 1 GPU is detected, then variables and operations are placed
+        onto GPU.
+    2)  If more than 1 GPU is detected, then variables are going to be placed on
+        the CPU.  Replicas of operations are placed on each individual GPU.
+  """
+
+  SHARED_ROUND_ROBIN = 2
+  """Variables are placed on all devices in a round-robin fashion.
+
+  Every subsequent variable is placed on the next device.  There is only one
+  copy of each variable that is shared across all devices.
+  """
+
+
+def _replicate_model_fn_with_mode(
+    model_fn,
+    optimizer_fn,
+    devices=None,
+    mode=_VariableDistributionMode.SHARED_LOCAL_PARAMETER_SERVER):
+  """A version of `replicate_model_fn` that allows to specify a `mode`."""
+  if not devices:
+    devices = _get_local_devices('GPU') or _get_local_devices('CPU')
+
+  is_a_single_gpu_case = len(devices) == 1 and 'GPU' in devices[0]
+  consolidation_device = '/{}:0'.format('GPU'
+                                        if is_a_single_gpu_case else 'CPU')
+
+  ps_devices = [consolidation_device]
+  if mode == _VariableDistributionMode.SHARED_ROUND_ROBIN:
+    ps_devices = devices
+
+  tf_logging.info('Replicating the `model_fn` across {}.  Variables are going '
+                  'to be placed on {}.  Consolidation device is going to be {}.'
+                  .format(devices, ps_devices, consolidation_device))
+
+  def replicated_model_fn(features, labels, mode, params=None, config=None):
+    """Replicated version of `model_fn` to be used instead."""
+    feature_shards, label_shards = _split_batch(
+        features, labels, len(devices), device=consolidation_device)
+    tower_specs = _get_loss_towers(
+        model_fn=model_fn,
+        mode=mode,
+        features=feature_shards,
+        labels=label_shards,
+        params=params,
+        config=config,
+        devices=devices,
+        local_ps_devices=ps_devices)
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      train_op = _minimize_towers(tower_specs,
+                                  _call_optimizer_fn(optimizer_fn, params))
+      return _train_spec(
+          tower_specs, train_op, aggregation_device=consolidation_device)
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return _eval_spec(tower_specs, aggregation_device=consolidation_device)
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return _predict_spec(tower_specs, aggregation_device=consolidation_device)
+
+  return replicated_model_fn
+
+
+def _get_local_devices(device_type):
+  local_device_protos = device_lib.list_local_devices()
+  return [
+      device.name
+      for device in local_device_protos
+      if device.device_type == device_type
+  ]
+
+
+def _split_batch(features, labels, number_of_shards, device):
+  """Split input features and labes into batches."""
+
+  def split_dictionary(dictionary):
+    """Split a dictionary into shards."""
+    shards = [{} for _ in range(number_of_shards)]
+    for name, tensor in six.iteritems(dictionary):
+      if isinstance(tensor, sparse_tensor.SparseTensor):
+        for i, shard in enumerate(
+            sparse_ops.sparse_split(
+                sp_input=tensor, num_split=number_of_shards, axis=0)):
+          shards[i][name] = shard
+      else:
+        for i, shard in enumerate(array_ops.split(tensor, number_of_shards)):
+          shards[i][name] = shard
+    return shards
+
+  with ops_lib.name_scope('split_inputs'):
+    with ops_lib.device(device):
+      if isinstance(features, dict):
+        feature_shards = split_dictionary(features)
+      else:
+        feature_shards = array_ops.split(features, number_of_shards)
+
+      if labels is None:
+        label_shards = None
+      elif isinstance(labels, dict):
+        label_shards = split_dictionary(labels)
+      else:
+        label_shards = array_ops.split(labels, number_of_shards)
+  return feature_shards, label_shards
+
+
+_DEFAULT_NAME_SCOPE_PATTERN = 'tower_{}'
+
+
+def _get_loss_towers(model_fn,
+                     mode,
+                     features,
+                     labels,
+                     params,
+                     config,
+                     devices,
+                     local_ps_devices,
+                     name_scope_pattern=_DEFAULT_NAME_SCOPE_PATTERN):
+  """Replicate the loss computation across devices."""
+  tower_specs = []
+
+  model_fn_args = util.fn_args(model_fn)
+  optional_params = {}
+  if 'params' in model_fn_args:
+    optional_params['params'] = copy.deepcopy(params)
+  if 'config' in model_fn_args:
+    optional_params['config'] = copy.deepcopy(config)
+
+  # pylint: disable=protected-access
+  round_robin_strategy = device_setter_lib._RoundRobinStrategy(
+      num_tasks=len(local_ps_devices))
+  # pylint: enable=protected-access
+
+  for i, device in enumerate(devices):
+    is_the_first_tower = (i == 0)
+
+    device_setter = _local_device_setter(
+        worker_device=device,
+        ps_devices=local_ps_devices,
+        ps_strategy=round_robin_strategy)
+
+    # We would like to preserve the names of the variables and ops that the user
+    # might be relying on. Names without a prefix are going to resolve to
+    # variables and ops of the first tower.
+    name_scope = name_scope_pattern
+    if is_the_first_tower:
+      name_scope = ''
+
+    with variable_scope.variable_scope('', reuse=not is_the_first_tower):
+      with ops_lib.name_scope(name_scope.format(i)):
+        with ops_lib.device(device_setter):
+          labels_shard = None
+          if labels:
+            labels_shard = labels[i]
+
+          tower_specs.append(
+              model_fn(
+                  mode=mode,
+                  features=features[i],
+                  labels=labels_shard,
+                  **optional_params))
+  return tower_specs
+
+
+def _local_device_setter(worker_device, ps_devices, ps_strategy):
+  """A device setter that puts distributes Var/Ops to PS/workers."""
+  ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
+
+  def local_device_chooser(op):
+    current_device = framework_device.DeviceSpec.from_string(op.device or '')
+
+    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
+    if node_def.op in ps_ops:
+      ps_device_spec = framework_device.DeviceSpec.from_string(
+          '{}'.format(ps_devices[ps_strategy(op)]))
+
+      ps_device_spec.merge_from(current_device)
+      return ps_device_spec.to_string()
+    else:
+      worker_device_spec = framework_device.DeviceSpec.from_string(
+          worker_device or '')
+      worker_device_spec.merge_from(current_device)
+      return worker_device_spec.to_string()
+
+  return local_device_chooser
+
+
+def _minimize_towers(tower_specs, optimizer):
+  """Aggregate and apply gradients for computed losses."""
+  grad_lists = {}
+  for tower_spec in tower_specs:
+    with ops_lib.device(tower_spec.loss.device):
+      for grad, var in optimizer.compute_gradients(tower_spec.loss):
+        if grad is not None:
+          grad_lists.setdefault(var, []).append(grad)
+
+  aggregated_grads = []
+  with ops_lib.name_scope('gradient_aggregating'):
+    for var, grads in six.iteritems(grad_lists):
+      grad = _compute_sum_on_device(grads, var.device)
+      aggregated_grads.append((grad, var))
+
+  train_op = optimizer.apply_gradients(
+      aggregated_grads, global_step=training_util.get_global_step())
+
+  return train_op
+
+
+def _call_optimizer_fn(optimizer_fn, params):
+  arguments = {}
+  optimizer_fn_arguments = util.fn_args(optimizer_fn)
+  if 'params' in optimizer_fn_arguments:
+    arguments['params'] = params
+  return optimizer_fn(**arguments)
+
+
+def _compute_sum_on_device(values, device, name=None):
+  with ops_lib.device(device):
+    if isinstance(values[0], ops_lib.IndexedSlices):
+      if name:
+        raise ValueError('The name {} is not expected to be given to '
+                         'IndexedSlices {}'.format(name, values))
+
+      values_concat = array_ops.concat([v.values for v in values], axis=0)
+      indices_concat = array_ops.concat([v.indices for v in values], axis=0)
+      return ops_lib.IndexedSlices(values_concat, indices_concat,
+                                   values[0].dense_shape)
+    else:
+      return math_ops.add_n(values, name=name)
+
+
+def _train_spec(tower_specs,
+                train_op,
+                aggregation_device,
+                aggregated_loss_name='loss'):
+  """Populate replicated EstimatorSpec for `GraphKeys.TRAIN`."""
+  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec['mode'] = model_fn_lib.ModeKeys.TRAIN
+  estimator_spec['train_op'] = train_op
+  estimator_spec['loss'] = _compute_sum_on_device(
+      [spec.loss for spec in tower_specs], aggregation_device,
+      aggregated_loss_name)
+  return model_fn_lib.EstimatorSpec(**estimator_spec)
+
+
+def _eval_spec(tower_specs, aggregation_device, aggregated_loss_name='loss'):
+  """Populate replicated EstimatorSpec for `GraphKeys.EVAL`."""
+  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec['mode'] = model_fn_lib.ModeKeys.EVAL
+  estimator_spec['loss'] = _compute_sum_on_device(
+      [spec.loss for spec in tower_specs], aggregation_device,
+      aggregated_loss_name)
+
+  update_ops = []
+  for tower_spec in tower_specs:
+    for name, (_, update_op) in six.iteritems(tower_spec.eval_metric_ops):
+      update_ops.append(update_op)
+
+  with ops_lib.control_dependencies(update_ops):
+    reduced_update_op = _reduce_metric_variables(len(tower_specs))
+
+  eval_metric_ops = {}
+  for name, (metric_tensor, _) in six.iteritems(tower_specs[0].eval_metric_ops):
+    eval_metric_ops[name] = (metric_tensor, reduced_update_op)
+  estimator_spec['eval_metric_ops'] = eval_metric_ops
+  return model_fn_lib.EstimatorSpec(**estimator_spec)
+
+
+def _reduce_metric_variables(number_of_towers):
+  """Aggregate local variables used in metrics into the first tower."""
+  if number_of_towers == 1:
+    return control_flow_ops.no_op()
+
+  metric_variables = ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)
+  variables_per_tower = len(metric_variables) // number_of_towers
+
+  if len(metric_variables) % number_of_towers != 0:
+    raise ValueError(
+        'Different `EstimatorSpec.eval_metric_ops` across `model_fn()` calls.'
+        ' Expected {} local variables, but got {} instead.'.format(
+            variables_per_tower * number_of_towers, len(metric_variables)))
+
+  # `metric_variables` has the size of `variables_per_tower` x
+  #  number_of_towers.  Each tower is produced by calling the same model_fn.
+  #  First `variables_per_tower` correspond to the first tower.  Each such
+  #  variable has an replica at the `(variables_per_tower * i)` position, where
+  #  `i` is `[1.. number_of_towers]`.  We are going to add values from replicas
+  #  to each variable of the first tower.  We then zero out replica values, so
+  #  that `_reduce_metric_variables` operation is idempotent.  If a metric
+  #  is then computed based on local variables from the first tower, then the
+  #  resulting metric is an estimate for all `number_of_towers` towers.
+  ops = []
+  for i in range(0, variables_per_tower):
+    next_replica_id = i + variables_per_tower
+    replicas = [
+        metric_variables[replica_id]
+        for replica_id in range(next_replica_id, len(metric_variables),
+                                variables_per_tower)
+    ]  #  `replicas` doesn't contain the first-tower variable.
+
+    reduce_op = state_ops.assign_add(metric_variables[i],
+                                     math_ops.add_n(replicas))
+
+    with ops_lib.control_dependencies([reduce_op]):
+      for replica in replicas:
+        zeros_for_replica = array_ops.zeros(
+            array_ops.shape(replica), dtype=replica.dtype)
+        zero_out_replica_op = state_ops.assign(replica, zeros_for_replica)
+        ops.append(zero_out_replica_op)
+
+  return control_flow_ops.group(*ops)
+
+
+def _predict_spec(tower_specs, aggregation_device):
+  """Populate replicated EstimatorSpec for `GraphKeys.PREDICT`."""
+  estimator_spec = tower_specs[0]._asdict()
+  estimator_spec['mode'] = model_fn_lib.ModeKeys.PREDICT
+
+  with ops_lib.device(aggregation_device):
+    estimator_spec['predictions'] = _concat_tensor_dicts(
+        *[tower_spec.predictions for tower_spec in tower_specs])
+
+    export_outputs_dict = _dict_concat(
+        *[tower_spec.export_outputs for tower_spec in tower_specs])
+
+    export_outputs = {}
+    for name, export_output_list in six.iteritems(export_outputs_dict):
+      if isinstance(export_output_list[0], export_output_lib.PredictOutput):
+        export_outputs[name] = export_output_lib.PredictOutput(
+            outputs=_concat_tensor_dicts(*[
+                export_output.outputs for export_output in export_output_list
+            ]))
+      elif isinstance(export_output_list[0],
+                      export_output_lib.RegressionOutput):
+        export_outputs[name] = export_output_lib.RegressionOutput(
+            value=array_ops.concat(
+                [export_output.value for export_output in export_output_list],
+                axis=0))
+      elif isinstance(export_output_list[0],
+                      export_output_lib.ClassificationOutput):
+        scores = None
+        if export_output_list[0].scores is not None:
+          scores = array_ops.concat(
+              [export_output.scores for export_output in export_output_list],
+              axis=0)
+
+        classes = None
+        if export_output_list[0].classes is not None:
+          classes = array_ops.stack(
+              [export_output.classes for export_output in export_output_list],
+              axis=0)
+
+        export_outputs[name] = export_output_lib.ClassificationOutput(
+            scores=scores, classes=classes)
+
+  estimator_spec['export_outputs'] = export_outputs
+  return model_fn_lib.EstimatorSpec(**estimator_spec)
+
+
+def _concat_tensor_dicts(*tensor_dicts):
+  return {
+      name: array_ops.concat(tensors, axis=0, name=name)
+      for name, tensors in six.iteritems(_dict_concat(*tensor_dicts))
+  }
+
+
+def _dict_concat(*dicts):
+  list_dict = {}
+  for d in dicts:
+    if d is None:
+      continue
+
+    for k, v in six.iteritems(d):
+      list_dict.setdefault(k, []).append(v)
+  return list_dict
diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a83a1b84079f115f94be33297f0ab0e2e8f2f7e3
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn_test.py
@@ -0,0 +1,1087 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utilities that replicate `Estimator.model_fn` over GPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import shutil
+import tempfile
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import replicate_model_fn
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.canned import dnn
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import device_setter
+from tensorflow.python.training import gradient_descent
+
+
+# TODO(isaprykin):  Parametrize all the tests on
+#   replicate_model_fn._VariableDistributionMode when it's supported.
+class DNNClassifierIntegrationTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def test_complete_flow_with_public_version(self):
+    return self._complete_flow_with_mode(mode=None)
+
+  def test_complete_flow_with_mode_local_ps_server(self):
+    return self._complete_flow_with_mode(
+        replicate_model_fn._VariableDistributionMode.
+        SHARED_LOCAL_PARAMETER_SERVER)
+
+  def test_complete_flow_with_mode_round_robin(self):
+    return self._complete_flow_with_mode(
+        replicate_model_fn._VariableDistributionMode.SHARED_ROUND_ROBIN)
+
+  def _complete_flow_with_mode(self, mode):
+    n_classes = 3
+    input_dimension = 2
+    batch_size = 12
+
+    data = np.linspace(
+        0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
+    x_data = data.reshape(batch_size, input_dimension)
+    categorical_data = np.random.random_integers(
+        0, len(x_data), size=len(x_data))
+    y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': x_data,
+           'categories': categorical_data},
+        y=y_data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': x_data,
+           'categories': categorical_data},
+        y=y_data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': x_data,
+           'categories': categorical_data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,)),
+        feature_column.embedding_column(
+            feature_column.categorical_column_with_vocabulary_list(
+                'categories',
+                vocabulary_list=np.linspace(
+                    0., len(x_data), len(x_data), dtype=np.int64)), 1)
+    ]
+
+    estimator = dnn.DNNClassifier(
+        hidden_units=(2, 2),
+        feature_columns=feature_columns,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    def optimizer_fn():
+      return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)
+
+    if not mode:  # Use the public `replicate_model_fn`.
+      model_fn = replicate_model_fn.replicate_model_fn(
+          estimator.model_fn,
+          optimizer_fn,
+          devices=['/gpu:0', '/gpu:1', '/gpu:2'])
+    else:
+      model_fn = replicate_model_fn._replicate_model_fn_with_mode(
+          estimator.model_fn,
+          optimizer_fn,
+          devices=['/gpu:0', '/gpu:1', '/gpu:2'],
+          mode=mode)
+
+    estimator = estimator_lib.Estimator(
+        model_fn=model_fn,
+        model_dir=estimator.model_dir,
+        config=estimator.config,
+        params=estimator.params)
+
+    num_steps = 10
+    estimator.train(train_input_fn, steps=num_steps)
+
+    scores = estimator.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    predicted_proba = np.array([
+        x[prediction_keys.PredictionKeys.PROBABILITIES]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
+
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
+                                             serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def _as_label(self, data_in_float):
+    return np.rint(data_in_float).astype(np.int64)
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+
+class ReplicateModelTest(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = math_ops.multiply(features, c)
+
+    loss = None
+    if mode is not model_fn_lib.ModeKeys.PREDICT:
+      loss = losses.absolute_difference(
+          labels=labels,
+          predictions=predictions,
+          reduction=losses.Reduction.SUM)
+      loss = math_ops.reduce_sum(loss)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=loss,
+        eval_metric_ops=metrics,
+        predictions={'probabilities': predictions},
+        train_op=control_flow_ops.no_op())  # This train_op isn't actually used.
+
+  def optimizer_fn(self, params):
+    return gradient_descent.GradientDescentOptimizer(params['learning_rate'])
+
+  @property
+  def params(self):
+    params = {}
+    params['learning_rate'] = 1.0
+    return params
+
+  def test_train(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+      session.run(variables.global_variables_initializer())
+
+      # loss = feature * c - label
+      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      # loss' of c is 3.
+      # new value of c = 10 - learning rate * 3 = 7.0.
+      session.run(estimator_spec.train_op)
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(7.0, session.run(c))
+
+  def test_train_spec_with_optimizer_without_params(self):
+
+    def optimizer_fn_without_params():
+      return gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
+
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn,
+          optimizer_fn_without_params,
+          devices=['/gpu:0', '/gpu:1'])
+      # This call is going to fail if `replicated_model_fn` is still passing
+      # `params` inside `optimizer_fn`, even though the latter doesn't take any:
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+      del estimator_spec
+
+  def test_eval(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
+      session.run(variables.local_variables_initializer())
+      session.run(variables.global_variables_initializer())
+
+      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
+      auc, b = estimator_spec.eval_metric_ops['auc']
+
+      session.run([a, b])
+      accuracy = session.run(accuracy)
+      auc = session.run(auc)
+
+      # loss[i] = features[i] * 10 - labels[i].
+      # Accuracy is 0.0 (no match) in the first tower.
+      # Accuracy is 1.0 (match) in the second tower, since the feature
+      # times weight "c" happened to be equal to the label.
+      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02))
+
+      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
+      self.assertEqual(0, auc)
+      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
+
+  def test_predict(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
+      session.run(variables.global_variables_initializer())
+
+      self.assertAllClose({
+          'probabilities': np.array([[0.1], [0.02]])
+      }, session.run(estimator_spec.predictions))
+
+  def test_train_single_tower(self):
+    features = np.array([[1.0], [2.0]])
+    labels = np.array([[1.0], [2.0]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.TRAIN, self.params)
+      session.run(variables.global_variables_initializer())
+
+      # loss = feature * c - label
+      total_loss = (1.0 * 10 - 1.0) + (2.0 * 10 - 2.0)
+      self.assertEqual(total_loss, session.run(estimator_spec.loss))
+
+      # loss' of c is 3.
+      # new value of c = 10 - learning rate * 3 = 7.0.
+      session.run(estimator_spec.train_op)
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(7.0, session.run(c))
+
+  def test_eval_single_tower(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.EVAL, self.params)
+      session.run(variables.local_variables_initializer())
+      session.run(variables.global_variables_initializer())
+
+      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
+      auc, b = estimator_spec.eval_metric_ops['auc']
+
+      session.run([a, b])
+      accuracy = session.run(accuracy)
+      auc = session.run(auc)
+
+      # Accuracy is 0.0 (no match) in the first tower.
+      # Accuracy is 1.0 (match) in the second tower, since the feature
+      # times weight "c" happened to be equal to the label.
+      total_loss = ((0.01 * 10 - 0.01) + (0.002 * 10 - 0.02))
+
+      self.assertNear((0.0 + 1.0) / 2.0, accuracy, 0.01)
+      self.assertEqual(0, auc)
+      self.assertNear(total_loss, session.run(estimator_spec.loss), 0.01)
+
+  def test_predict_single_tower(self):
+    features = np.array([[0.01], [0.002]])
+    labels = np.array([[0.01], [0.02]])
+
+    with self.test_session() as session:
+      replicated_model_fn = replicate_model_fn.replicate_model_fn(
+          self.model_fn, self.optimizer_fn, devices=['/gpu:0'])
+      estimator_spec = replicated_model_fn(
+          features, labels, model_fn_lib.ModeKeys.PREDICT, self.params)
+      session.run(variables.global_variables_initializer())
+
+      self.assertAllClose({
+          'probabilities': np.array([[0.1], [0.02]])
+      }, session.run(estimator_spec.predictions))
+
+
+class GetLossTowersTest(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(0.25, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
+    labels = np.array([0.1, 0.2, 0.3, labels[0]])
+
+    loss = losses.absolute_difference(
+        labels=labels, predictions=predictions, reduction=losses.Reduction.SUM)
+
+    return model_fn_lib.EstimatorSpec(mode=mode, loss=math_ops.reduce_sum(loss))
+
+  def test_gradients_are_computed(self):
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          self.model_fn,
+          mode=None,
+          features=[[0.6], [1.6]],
+          labels=[[0.6], [0.6]],
+          params=None,
+          config=None,
+          devices=['/gpu:0', '/gpu:1'],
+          local_ps_devices=['/gpu:0'],
+          name_scope_pattern='test_tower_{}')
+      session.run(variables.global_variables_initializer())
+
+      self.assertEqual(len(tower_specs), 2)
+
+      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
+      self.assertEqual('Sum:0', tower_specs[0].loss.name)
+      self.assertEqual(1.0, session.run(tower_specs[0].loss))
+
+      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
+      self.assertEqual('test_tower_1/Sum:0', tower_specs[1].loss.name)
+      # The input batch for the second tower had a loss that is 1.0
+      # bigger: 0.6 vs 1.6.
+      self.assertEqual(2.0, session.run(tower_specs[1].loss))
+
+      self.assertEqual(1, len(variables.global_variables()))
+      self.assertEqual(1, len(variables.trainable_variables()))
+
+      with variable_scope.variable_scope('', reuse=True):
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual(0.25, session.run(c))
+
+  def test_variables_are_round_robined_correctly(self):
+    """Test that creates multiple variables and tests round-robin placement."""
+
+    def model_fn(mode, features, labels, params):
+      del params
+      for variable_name in ['a', 'b', 'c', 'd']:
+        c = variable_scope.get_variable(
+            variable_name,
+            initializer=constant_op.constant(0.25, dtype=dtypes.float64),
+            dtype=dtypes.float64)
+
+      predictions = math_ops.add(np.array([0.1, 0.2, 0.3, features[0]]), c)
+      labels = np.array([0.1, 0.2, 0.3, labels[0]])
+      loss = losses.absolute_difference(
+          labels=labels,
+          predictions=predictions,
+          reduction=losses.Reduction.SUM)
+      return model_fn_lib.EstimatorSpec(
+          mode=mode, loss=math_ops.reduce_sum(loss))
+
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          model_fn,
+          mode=None,
+          features=[[0.6], [1.6], [2.6]],
+          labels=[[0.6], [0.6], [2.6]],
+          params=None,
+          config=None,
+          devices=['/gpu:0', '/gpu:1', '/gpu:3'],
+          local_ps_devices=['/gpu:0', '/gpu:1', '/gpu:3'],
+          name_scope_pattern='test_tower_{}')
+      session.run(variables.global_variables_initializer())
+
+      self.assertEqual(len(tower_specs), 3)
+      self.assertEqual('/device:GPU:0', tower_specs[0].loss.device)
+      self.assertEqual('/device:GPU:1', tower_specs[1].loss.device)
+      self.assertEqual('/device:GPU:3', tower_specs[2].loss.device)
+
+      with variable_scope.variable_scope('', reuse=True):
+        a = variable_scope.get_variable('a', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', a.device)
+        b = variable_scope.get_variable('b', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:1', b.device)
+        c = variable_scope.get_variable('c', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:3', c.device)
+        d = variable_scope.get_variable('d', dtype=dtypes.float64)
+        self.assertEqual('/device:GPU:0', d.device)
+
+
+class SplitBatchTest(test_util.TensorFlowTestCase):
+
+  def evaluate_shards(self, first_list, second_list):
+    evaluate_items = lambda x: x.eval()
+    return list(map(evaluate_items, first_list)), list(
+        map(evaluate_items, second_list))
+
+  def test_simple_half_split(self):
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      features = [0.0, 1.0, 2.0, 3.0]
+      labels = [10.0, 11.0, 12.0, 13.0]
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      feature_shards, label_shards = self.evaluate_shards(
+          feature_shards, label_shards)
+
+      self.assertAllEqual([[0.0, 1.0], [2.0, 3.0]], feature_shards)
+      self.assertAllEqual([[10.0, 11.0], [12.0, 13.0]], label_shards)
+
+  def test_to_each_their_own(self):
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      features = [0.0, 1.0, 2.0, 3.0]
+      labels = [10.0, 11.0, 12.0, 13.0]
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 4, device='/gpu:0')
+
+      feature_shards, label_shards = self.evaluate_shards(
+          feature_shards, label_shards)
+
+      self.assertAllEqual([[0.0], [1.0], [2.0], [3.0]], feature_shards)
+      self.assertAllEqual([[10.0], [11.0], [12.0], [13.0]], label_shards)
+
+  def test_one_batch(self):
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      features = [0.0, 1.0, 2.0, 3.0]
+      labels = [10.0, 11.0, 12.0, 13.0]
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 1, device='/gpu:0')
+
+      feature_shards, label_shards = self.evaluate_shards(
+          feature_shards, label_shards)
+
+      self.assertAllEqual([[0.0, 1.0, 2.0, 3.0]], feature_shards)
+      self.assertAllEqual([[10.0, 11.0, 12.0, 13.0]], label_shards)
+
+  def test_half_split_in_dictionary(self):
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
+      labels = [10.0, 11.0, 12.0, 13.0]
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      self.assertAllEqual([0.0, 1.0], feature_shards[0]['first'].eval())
+      self.assertAllEqual([4.0, 5.0], feature_shards[0]['second'].eval())
+      self.assertAllEqual([2.0, 3.0], feature_shards[1]['first'].eval())
+      self.assertAllEqual([6.0, 7.0], feature_shards[1]['second'].eval())
+      self.assertAllEqual([10.0, 11.0], label_shards[0].eval())
+      self.assertAllEqual([12.0, 13.0], label_shards[1].eval())
+
+  def test_one_batch_in_dictionary(self):
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
+      labels = [10.0, 11.0, 12.0, 13.0]
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 1, device='/gpu:0')
+
+      self.assertAllEqual([0.0, 1.0, 2.0, 3.0],
+                          feature_shards[0]['first'].eval())
+      self.assertAllEqual([4.0, 5.0, 6.0, 7.0],
+                          feature_shards[0]['second'].eval())
+      self.assertAllEqual([10.0, 11.0, 12.0, 13.0], label_shards[0].eval())
+
+  def test_feature_and_label_dictionaries(self):
+    with self.test_session() as session:  # pylint: disable=unused-variable
+      features = {'first': [0.0, 1.0, 2.0, 3.0], 'second': [4.0, 5.0, 6.0, 7.0]}
+      labels = {'first': [10.0, 11.0], 'second': [12.0, 13.0]}
+
+      feature_shards, label_shards = replicate_model_fn._split_batch(
+          features, labels, 2, device='/gpu:0')
+
+      self.assertAllEqual([0.0, 1.0], feature_shards[0]['first'].eval())
+      self.assertAllEqual([4.0, 5.0], feature_shards[0]['second'].eval())
+      self.assertAllEqual([2.0, 3.0], feature_shards[1]['first'].eval())
+      self.assertAllEqual([6.0, 7.0], feature_shards[1]['second'].eval())
+      self.assertAllEqual([10.0], label_shards[0]['first'].eval())
+      self.assertAllEqual([12.0], label_shards[0]['second'].eval())
+      self.assertAllEqual([11], label_shards[1]['first'].eval())
+      self.assertAllEqual([13.0], label_shards[1]['second'].eval())
+
+
+class TrainSpecTest(test_util.TensorFlowTestCase):
+
+  expected_predictions = {}
+
+  def create_estimator_spec(self, loss):
+    return model_fn_lib.EstimatorSpec(
+        mode=model_fn_lib.ModeKeys.TRAIN,
+        loss=loss,
+        train_op=loss,  # Not used; currently required.
+        predictions=self.expected_predictions)
+
+  def create_constant_loss(self, loss_value):
+    return constant_op.constant(loss_value, dtype=dtypes.float64)
+
+  def test_example(self):
+    with self.test_session() as session:
+      tower_losses = list(map(self.create_constant_loss, [2, 4, 6]))
+      tower_specs = list(map(self.create_estimator_spec, tower_losses))
+
+      expected_train_op = tower_losses[1]
+
+      estimator_spec = replicate_model_fn._train_spec(
+          tower_specs, expected_train_op, aggregation_device='/gpu:0')
+
+      self.assertEqual(expected_train_op, estimator_spec.train_op)
+      self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
+      self.assertEqual(self.expected_predictions, estimator_spec.predictions)
+
+
+class EvalSpecTest(test_util.TensorFlowTestCase):
+
+  def create_estimator_spec(self, loss, metrics):
+    return model_fn_lib.EstimatorSpec(
+        mode=model_fn_lib.ModeKeys.EVAL, loss=loss, eval_metric_ops=metrics)
+
+  def create_constant_loss(self, loss_value):
+    return constant_op.constant(loss_value, dtype=dtypes.float64)
+
+  def create_eval_metrics(self, noise):
+    predictions = np.array([0.1, 0.2, 0.3, 0.6 + noise])
+    labels = np.array([0.1, 0.2, 0.3, 0.6])
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions),
+        'auc': metrics_lib.auc(labels, predictions)
+    }
+    return metrics
+
+  def test_example(self):
+    with self.test_session() as session:
+      tower_losses = map(self.create_constant_loss, [2, 4, 6])
+      tower_metrics = map(self.create_eval_metrics, [0, 0.2, 0.3])
+      tower_specs = [
+          self.create_estimator_spec(l, m)
+          for l, m in zip(tower_losses, tower_metrics)
+      ]
+      session.run(variables.local_variables_initializer())
+
+      estimator_spec = replicate_model_fn._eval_spec(
+          tower_specs, aggregation_device='/device:GPU:0')
+
+      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
+      auc, b = estimator_spec.eval_metric_ops['auc']
+
+      self.assertEqual('/device:CPU:0', accuracy.device)
+      self.assertEqual('/device:CPU:0', auc.device)
+
+      session.run([a, b])
+      accuracy, auc = session.run([accuracy, auc])
+
+      self.assertNear((12 - 2) / 12, accuracy, 0.01)
+      self.assertEqual(0, auc)
+      self.assertEqual(2 + 4 + 6, session.run(estimator_spec.loss))
+
+  def test_handles_single_tower(self):
+    with self.test_session() as session:
+      tower_losses = map(self.create_constant_loss, [5])
+      tower_metrics = map(self.create_eval_metrics, [0.2])
+      tower_specs = [
+          self.create_estimator_spec(l, m)
+          for l, m in zip(tower_losses, tower_metrics)
+      ]
+      session.run(variables.local_variables_initializer())
+
+      estimator_spec = replicate_model_fn._eval_spec(
+          tower_specs, aggregation_device='/device:GPU:0')
+
+      accuracy, a = estimator_spec.eval_metric_ops['accuracy']
+      auc, b = estimator_spec.eval_metric_ops['auc']
+
+      self.assertEqual('/device:CPU:0', accuracy.device)
+      self.assertEqual('/device:CPU:0', auc.device)
+
+      session.run([a, b])
+      accuracy = session.run(accuracy)
+      auc = session.run(auc)
+
+      self.assertNear((4 - 1) / 4, accuracy, 0.01)
+      self.assertEqual(0, auc)
+      self.assertEqual(5, session.run(estimator_spec.loss))
+
+
+class PredictSpecTest(test_util.TensorFlowTestCase):
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(0.25, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = math_ops.add(np.array([features[0], features[0]]), c)
+
+    return model_fn_lib.EstimatorSpec(
+        mode=model_fn_lib.ModeKeys.PREDICT,
+        predictions={
+            'probabilities': predictions
+        })
+
+  def test_example(self):
+    with self.test_session() as session:
+      tower_specs = replicate_model_fn._get_loss_towers(
+          self.model_fn,
+          mode=None,
+          features=[[0.1], [0.2]],
+          labels=[[], []],
+          params=None,
+          config=None,
+          devices=['/gpu:0', '/gpu:1'],
+          local_ps_devices=['/gpu:0'],
+      )
+      session.run(variables.global_variables_initializer())
+
+      estimator_spec = replicate_model_fn._predict_spec(
+          tower_specs, aggregation_device='/gpu:0')
+
+      self.assertEqual('/device:GPU:0',
+                       estimator_spec.predictions['probabilities'].device)
+      self.assertAllClose({
+          'probabilities': np.array([0.35, 0.35, 0.45, 0.45])
+      }, session.run(estimator_spec.predictions))
+
+
+class ReduceMetricVariablesTest(test_util.TensorFlowTestCase):
+
+  def create_metric_variable(self, initial_value, name):
+    return variable_scope.variable(
+        initial_value,
+        trainable=False,
+        collections=[ops_lib.GraphKeys.METRIC_VARIABLES],
+        validate_shape=True,
+        name=name)
+
+  def create_tower_metrics(self, tower_id):
+    with variable_scope.variable_scope('', reuse=(tower_id != 0)):
+      self.create_metric_variable(1.3 * (tower_id + 1), 'total')
+      self.create_metric_variable(2.3 * (tower_id + 1), 'count')
+      self.create_metric_variable(
+          np.array([3.3, 3.5, 3.7]) * (tower_id + 1), 'total')
+
+  def test_example(self):
+    with self.test_session() as session:
+      for tower_id in range(3):
+        self.create_tower_metrics(tower_id)
+
+      session.run(
+          variables.variables_initializer(
+              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
+
+      session.run(
+          replicate_model_fn._reduce_metric_variables(number_of_towers=3))
+
+      # 1st tower = 1.3, 2.3,  [3.3, 3.5, 3.7]
+      # 2nd tower = 2.6, 4.6,  [6.6, 7.0, 7.4]
+      # 3rd tower = 3.9, 6.9,  [9.9, 10.5, 11.1]
+      # Reduced =   7.8, 13.8, [19.8, 21.0, 22.2]
+      # Towers are accumulated in the first tower.
+      local_metrics = session.run(
+          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
+
+      self.assertNear(7.8, local_metrics[0], 0.01)
+      self.assertNear(13.8, local_metrics[1], 0.01)
+      self.assertAllClose([19.8, 21., 22.1], local_metrics[2], 0.01)
+      self.assertNear(0.0, local_metrics[3], 0.01)
+      self.assertNear(0.0, local_metrics[4], 0.01)
+      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[5], 0.01)
+      self.assertNear(0.0, local_metrics[6], 0.01)
+      self.assertNear(0.0, local_metrics[7], 0.01)
+      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
+
+  def test_reduce_is_idempotent(self):
+    with self.test_session() as session:
+      for tower_id in range(3):
+        self.create_tower_metrics(tower_id)
+
+      session.run(
+          variables.variables_initializer(
+              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
+
+      for _ in range(20):
+        session.run(
+            replicate_model_fn._reduce_metric_variables(number_of_towers=3))
+
+      local_metrics = session.run(
+          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
+
+      self.assertNear(7.8, local_metrics[0], 0.01)
+      self.assertNear(13.8, local_metrics[1], 0.01)
+      self.assertAllClose([19.8, 21., 22.1], local_metrics[2], 0.01)
+      self.assertNear(0.0, local_metrics[3], 0.01)
+      self.assertNear(0.0, local_metrics[4], 0.01)
+      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[5], 0.01)
+      self.assertNear(0.0, local_metrics[6], 0.01)
+      self.assertNear(0.0, local_metrics[7], 0.01)
+      self.assertAllClose([0.0, 0.0, 0.0], local_metrics[8], 0.01)
+
+  def test_handles_single_tower(self):
+    with self.test_session() as session:
+      self.create_tower_metrics(0)
+      session.run(
+          variables.variables_initializer(
+              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
+
+      session.run(
+          replicate_model_fn._reduce_metric_variables(number_of_towers=1))
+
+      local_metrics = session.run(
+          ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES))
+
+      self.assertNear(1.3, local_metrics[0], 0.01)
+      self.assertNear(2.3, local_metrics[1], 0.01)
+      self.assertAllClose([3.3, 3.5, 3.7], local_metrics[2], 0.01)
+
+  def test_doesnt_accept_uneven_number_of_variables(self):
+    with self.test_session() as session:
+      for tower_id in range(3):
+        self.create_tower_metrics(tower_id)
+      self.create_metric_variable(-1.0, 'oddball')
+
+      session.run(
+          variables.variables_initializer(
+              ops_lib.get_collection(ops_lib.GraphKeys.METRIC_VARIABLES)))
+
+      with self.assertRaisesRegexp(ValueError, ''):
+        session.run(
+            replicate_model_fn._reduce_metric_variables(number_of_towers=3))
+
+
+class MergeExportOutputsTest(test_util.TensorFlowTestCase):
+
+  def optimizer_fn(self):
+    return gradient_descent.GradientDescentOptimizer(1.0)
+
+  def model_fn(self, mode, features, labels, params):
+    c = variable_scope.get_variable(
+        'c',
+        initializer=constant_op.constant(10, dtype=dtypes.float64),
+        dtype=dtypes.float64)
+
+    predictions = {'probabilities': math_ops.multiply(features, c)}
+    loss = losses.absolute_difference(
+        labels=labels,
+        predictions=predictions['probabilities'],
+        reduction=losses.Reduction.SUM)
+
+    metrics = {
+        'accuracy': metrics_lib.accuracy(labels, predictions['probabilities']),
+        'auc': metrics_lib.auc(labels, predictions['probabilities'])
+    }
+    tensor_string_repr = str(features)
+    classes = constant_op.constant(
+        re.search('(split_inputs/split:[0-9])', tensor_string_repr).group(1),
+        dtype=dtypes.string)
+
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(predictions),
+        'classification_output':
+            export_output.ClassificationOutput(predictions['probabilities'],
+                                               classes),
+        'classification_scores':
+            export_output.ClassificationOutput(
+                scores=predictions['probabilities']),
+        'classification_classes':
+            export_output.ClassificationOutput(classes=classes),
+        'regression_output':
+            export_output.RegressionOutput(predictions['probabilities']),
+    }
+
+    return model_fn_lib.EstimatorSpec(
+        mode=mode,
+        loss=math_ops.reduce_sum(loss),
+        eval_metric_ops=metrics,
+        predictions=predictions,
+        train_op=loss,  # This train_op isn't actually used.
+        export_outputs=export_outputs)
+
+  def replicate_estimator_spec(self, session):
+    features = np.array([0.01, 0.002])
+    labels = np.array([0.01, 0.02])
+
+    replicated_model_fn = replicate_model_fn.replicate_model_fn(
+        self.model_fn, self.optimizer_fn, devices=['/gpu:0', '/gpu:1'])
+    estimator_spec = replicated_model_fn(features, labels,
+                                         model_fn_lib.ModeKeys.PREDICT, {})
+    session.run(variables.global_variables_initializer())
+    return estimator_spec
+
+  def test_merde_predict_output(self):
+    with self.test_session() as session:
+      estimator_spec = self.replicate_estimator_spec(session)
+      self.assertAllClose(
+          {
+              'probabilities': np.array([0.1, 0.02])
+          },
+          session.run(estimator_spec.export_outputs[
+              signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY].outputs))
+
+  def test_merge_classification_output_scores_classes(self):
+    with self.test_session() as session:
+      estimator_spec = self.replicate_estimator_spec(session)
+      self.assertAllClose(
+          [0.1, 0.02],
+          session.run(
+              estimator_spec.export_outputs['classification_output'].scores))
+      self.assertAllEqual(
+          [b'split_inputs/split:0', b'split_inputs/split:1'],
+          session.run(
+              estimator_spec.export_outputs['classification_output'].classes))
+
+  def test_merge_classification_output_scores(self):
+    with self.test_session() as session:
+      estimator_spec = self.replicate_estimator_spec(session)
+      self.assertAllClose(
+          [0.1, 0.02],
+          session.run(
+              estimator_spec.export_outputs['classification_scores'].scores))
+      self.assertEqual(
+          None, estimator_spec.export_outputs['classification_scores'].classes)
+
+  def test_merge_classification_output_classes(self):
+    with self.test_session() as session:
+      estimator_spec = self.replicate_estimator_spec(session)
+      self.assertAllEqual(
+          [b'split_inputs/split:0', b'split_inputs/split:1'],
+          session.run(
+              estimator_spec.export_outputs['classification_classes'].classes))
+      self.assertEqual(
+          None, estimator_spec.export_outputs['classification_classes'].scores)
+
+  def test_merge_regression_output(self):
+    with self.test_session() as session:
+      estimator_spec = self.replicate_estimator_spec(session)
+      self.assertAllClose(
+          [0.1, 0.02],
+          session.run(estimator_spec.export_outputs['regression_output'].value))
+
+
+class GetLocalDevicesTest(test_util.TensorFlowTestCase):
+
+  def test_there_is_at_least_a_cpu(self):
+    self.assertTrue(replicate_model_fn._get_local_devices('CPU'))
+
+  def test_there_is_no_xpu(self):
+    self.assertFalse(
+        replicate_model_fn._get_local_devices('XPU'))  # XPU doesn't exist.
+
+  def test_whether_there_is_a_gpu(self):
+    if test.is_gpu_available():
+      self.assertTrue(len(replicate_model_fn._get_local_devices('GPU')))
+
+
+class LocalDeviceSetterTest(test_util.TensorFlowTestCase):
+
+  def test_vars_are_on_ps_but_ops_are_on_workers(self):
+    ps_devices = ['/device:GPU:3']
+    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
+
+    local_device_setter = replicate_model_fn._local_device_setter(
+        ps_devices=ps_devices,
+        ps_strategy=round_robin,
+        worker_device='/device:GPU:2')
+
+    with ops_lib.device(local_device_setter):
+      a = variables.Variable(0.01)
+      self.assertEqual('/device:GPU:3', a.device)
+
+      b = variables.Variable(0.02)
+      self.assertEqual('/device:GPU:3', b.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:3', c.device)
+
+      a_op = array_ops.concat(a, axis=0)
+      self.assertEqual('/device:GPU:2', a_op.device)
+
+      b_op = array_ops.concat(b, axis=0)
+      self.assertEqual('/device:GPU:2', b_op.device)
+
+  def test_round_robin_placement(self):
+    ps_devices = [
+        '/device:GPU:0', '/device:GPU:1', '/device:GPU:3', '/device:GPU:4'
+    ]
+    round_robin = device_setter._RoundRobinStrategy(num_tasks=len(ps_devices))
+
+    local_device_setter = replicate_model_fn._local_device_setter(
+        ps_devices=ps_devices,
+        ps_strategy=round_robin,
+        worker_device='/device:GPU:2')
+
+    with ops_lib.device(local_device_setter):
+      a = variables.Variable(0.01)
+      self.assertEqual('/device:GPU:0', a.device)
+
+      b = variables.Variable(0.02)
+      self.assertEqual('/device:GPU:1', b.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:3', c.device)
+
+      a_op = array_ops.concat(a, axis=0)
+      self.assertEqual('/device:GPU:2', a_op.device)
+
+      b_op = array_ops.concat(b, axis=0)
+      self.assertEqual('/device:GPU:2', b_op.device)
+
+      c = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:4', c.device)
+
+      d = variables.Variable(0.03)
+      self.assertEqual('/device:GPU:0', d.device)
+
+      c_op = array_ops.concat(c, axis=0)
+      self.assertEqual('/device:GPU:2', c_op.device)
+
+
+class ComputeSumWithDevicePlacementTest(test_util.TensorFlowTestCase):
+
+  def test_vectors(self):
+    with self.test_session() as session:
+      total = replicate_model_fn._compute_sum_on_device(
+          [1.0, 2.0, 3.0, 4.0], device='/device:GPU:0', name='test_sum')
+
+      self.assertEqual('/device:GPU:0', total.device)
+      self.assertEqual('test_sum', total.op.name)
+      self.assertEqual(10.0, session.run(total))
+
+  def test_tensors(self):
+    with self.test_session() as session:
+      total = replicate_model_fn._compute_sum_on_device(
+          [[1.0, 2.0], [3.0, 4.0]], device='/device:GPU:0', name='test_sum')
+
+      self.assertEqual('/device:GPU:0', total.device)
+      self.assertEqual('test_sum', total.op.name)
+      self.assertAllEqual([4.0, 6.0], session.run(total))
+
+  def test_indexedslices(self):
+    with self.test_session() as session:
+      a = ops_lib.IndexedSlices(
+          constant_op.constant([1.0, 2.0]), [0, 1],
+          dense_shape=constant_op.constant([2]))
+      b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
+
+      total = replicate_model_fn._compute_sum_on_device(
+          [a, b], device='/device:GPU:0')
+
+      self.assertEqual('/device:GPU:0', total.device)
+      self.assertAllEqual([4.0, 6.0],
+                          session.run(ops_lib.convert_to_tensor(total)))
+
+  def test_indexedslices_higher_dimensions(self):
+    with self.test_session() as session:
+      a = ops_lib.IndexedSlices(
+          constant_op.constant([[1.0, 5.0], [2.0, 6.0]]), [0, 1],
+          dense_shape=constant_op.constant([2, 4]))
+      b = ops_lib.IndexedSlices(
+          constant_op.constant([[3.0, 7.0], [4.0, 8.0]]), [0, 1])
+
+      total = replicate_model_fn._compute_sum_on_device(
+          [a, b], device='/device:GPU:0')
+
+      self.assertEqual('/device:GPU:0', total.device)
+      self.assertAllEqual([[4.0, 12.0], [6.0, 14.0]],
+                          session.run(ops_lib.convert_to_tensor(total)))
+
+  def test_indexedslices_some_dont_overlap(self):
+    with self.test_session() as session:
+      a = ops_lib.IndexedSlices(
+          constant_op.constant([1.0, 2.0]), [0, 3],
+          dense_shape=constant_op.constant([4]))
+      b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
+
+      total = replicate_model_fn._compute_sum_on_device(
+          [a, b], device='/device:GPU:0')
+
+      self.assertEqual('/device:GPU:0', total.device)
+      self.assertAllEqual([4.0, 4.0, 0.0, 2.0],
+                          session.run(ops_lib.convert_to_tensor(total)))
+
+  def test_no_name_for_indexslices(self):
+    a = ops_lib.IndexedSlices(
+        constant_op.constant([1.0, 2.0]), [0, 1],
+        dense_shape=constant_op.constant([2]))
+    b = ops_lib.IndexedSlices(constant_op.constant([3.0, 4.0]), [0, 1])
+
+    with self.assertRaisesRegexp(ValueError, ''):
+      _ = replicate_model_fn._compute_sum_on_device(
+          [a, b], device='/device:GPU:0', name='cant_name_indexslices')
+
+
+class ConcatTensorDictsTest(test_util.TensorFlowTestCase):
+
+  def test_example(self):
+    tensor_dicts = [
+        {
+            'a': np.array([1.0, 2.0]),
+            'b': np.array([11.0]),
+            'c': np.array([21.0]),
+        },
+        {
+            'a': np.array([3.0]),
+            'b': np.array([12.0, 13.0]),
+        },
+        {
+            'b': np.array([14.0]),
+        },
+    ]
+
+    with self.test_session() as session:
+      self.assertAllClose({
+          'a': np.array([1.0, 2.0, 3.0]),
+          'b': np.array([11.0, 12.0, 13.0, 14.0]),
+          'c': np.array([21.0]),
+      }, session.run(replicate_model_fn._concat_tensor_dicts(*tensor_dicts)))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/factorization/python/ops/gmm.py b/tensorflow/contrib/factorization/python/ops/gmm.py
index 0d67e09f8151b48c97094b6b48f26e63443707ef..f72280c4ecf19e33278ffe74061f44bbb7b21709 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm.py
@@ -24,7 +24,7 @@ import numpy as np
 from tensorflow.contrib import framework
 from tensorflow.contrib.factorization.python.ops import gmm_ops
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
@@ -167,7 +167,7 @@ class GMM(estimator.Estimator):
                                      self._num_clusters, self._random_seed,
                                      self._covariance_type,
                                      self._params)
-      incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+      incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
       loss = math_ops.reduce_sum(losses)
       training_op = with_dependencies([training_op, incr_step], loss)
       training_hooks = [_InitializeClustersHook(
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 3976395d78e9188dd56d5b3b32fa8a3daf43c37d..4fe22ea26ec5f5a43f1c99d1fee518b1d326c5c9 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.factorization.python.ops import factorization_ops
-from tensorflow.contrib.framework.python.ops import variables as framework_variables
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.python.framework import dtypes
@@ -32,175 +31,81 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
 
 
 class _SweepHook(session_run_hook.SessionRunHook):
   """Keeps track of row/col sweeps, and runs prep ops before each sweep."""
 
-  def __init__(self, is_row_sweep_var, train_ops, num_rows, num_cols,
-               input_row_indices, input_col_indices, row_prep_ops,
-               col_prep_ops, init_op, completed_sweeps_var):
+  def __init__(self, is_row_sweep_var, is_sweep_done_var, init_op,
+               row_prep_ops, col_prep_ops, row_train_op, col_train_op,
+               switch_op):
     """Initializes SweepHook.
 
     Args:
       is_row_sweep_var: A Boolean tf.Variable, determines whether we are
         currently doing a row or column sweep. It is updated by the hook.
-      train_ops: A list of ops. The ops created by this hook will have
-        control dependencies on `train_ops`.
-      num_rows: int, the total number of rows to be processed.
-      num_cols: int, the total number of columns to be processed.
-      input_row_indices: A Tensor of type int64. The indices of the input rows
-        that are processed during the current sweep. All elements of
-        `input_row_indices` must be in [0, num_rows).
-      input_col_indices: A Tensor of type int64. The indices of the input
-        columns that are processed during the current sweep. All elements of
-        `input_col_indices` must be in [0, num_cols).
-      row_prep_ops: list of ops, to be run before the beginning of each row
-        sweep, in the given order.
-      col_prep_ops: list of ops, to be run before the beginning of each column
-        sweep, in the given order.
+      is_sweep_done_var: A Boolean tf.Variable, determines whether we are
+        starting a new sweep (this is used to determine when to run the prep ops
+        below).
       init_op: op to be run once before training. This is typically a local
         initialization op (such as cache initialization).
-      completed_sweeps_var: An integer tf.Variable, indicates the number of
-        completed sweeps. It is updated by the hook.
+      row_prep_ops: A list of TensorFlow ops, to be run before the beginning of
+        each row sweep (and during initialization), in the given order.
+      col_prep_ops: A list of TensorFlow ops, to be run before the beginning of
+        each column sweep (and during initialization), in the given order.
+      row_train_op: A TensorFlow op to be run during row sweeps.
+      col_train_op: A TensorFlow op to be run during column sweeps.
+      switch_op: A TensorFlow op to be run before each sweep.
     """
-    self._num_rows = num_rows
-    self._num_cols = num_cols
+    self._is_row_sweep_var = is_row_sweep_var
+    self._is_sweep_done_var = is_sweep_done_var
+    self._init_op = init_op
     self._row_prep_ops = row_prep_ops
     self._col_prep_ops = col_prep_ops
-    self._init_op = init_op
-    self._is_row_sweep_var = is_row_sweep_var
-    self._completed_sweeps_var = completed_sweeps_var
-    # Boolean variable that determines whether the init_ops have been run.
+    self._row_train_op = row_train_op
+    self._col_train_op = col_train_op
+    self._switch_op = switch_op
+    # Boolean variable that determines whether the init_op has been run.
     self._is_initialized = False
-    # Ops to run jointly with train_ops, responsible for updating
-    # `is_row_sweep_var` and incrementing the `global_step` and
-    # `completed_sweeps` counters.
-    self._update_op, self._is_sweep_done_var, self._switch_op = (
-        self._create_hook_ops(input_row_indices, input_col_indices, train_ops))
-
-  def _create_hook_ops(self, input_row_indices, input_col_indices, train_ops):
-    """Creates ops to update is_row_sweep_var, global_step and completed_sweeps.
-
-    Creates two boolean tensors `processed_rows` and `processed_cols`, which
-    keep track of which rows/cols have been processed during the current sweep.
-    Returns ops that should be run after each row / col update.
-      - When `self._is_row_sweep_var` is True, it sets
-        processed_rows[input_row_indices] to True.
-      - When `self._is_row_sweep_var` is False, it sets
-        processed_cols[input_col_indices] to True.
-
-    Args:
-      input_row_indices: A Tensor. The indices of the input rows that are
-        processed during the current sweep.
-      input_col_indices: A Tensor. The indices of the input columns that
-        are processed during the current sweep.
-      train_ops: A list of ops. The ops created by this function have control
-        dependencies on `train_ops`.
-
-    Returns:
-      A tuple consisting of:
-        update_op: An op to be run jointly with training. It updates the state
-          and increments counters (global step and completed sweeps).
-        is_sweep_done_var: A Boolean tf.Variable, specifies whether the sweep is
-          done, i.e. all rows (during a row sweep) or all columns (during a
-          column sweep) have been processed.
-        switch_op: An op to be run in `self.before_run` when the sweep is done.
-    """
-    processed_rows_init = array_ops.fill(dims=[self._num_rows], value=False)
-    with ops.colocate_with(processed_rows_init):
-      processed_rows = variable_scope.variable(
-          processed_rows_init,
-          collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-          trainable=False,
-          name="sweep_hook_processed_rows")
-    processed_cols_init = array_ops.fill(dims=[self._num_cols], value=False)
-    with ops.colocate_with(processed_cols_init):
-      processed_cols = variable_scope.variable(
-          processed_cols_init,
-          collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-          trainable=False,
-          name="sweep_hook_processed_cols")
-    switch_ops = control_flow_ops.group(
-        state_ops.assign(
-            self._is_row_sweep_var,
-            math_ops.logical_not(self._is_row_sweep_var)),
-        state_ops.assign(processed_rows, processed_rows_init),
-        state_ops.assign(processed_cols, processed_cols_init))
-    is_sweep_done_var = variable_scope.variable(
-        False,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        trainable=False,
-        name="is_sweep_done")
-
-    # After running the `train_ops`, updates `processed_rows` or
-    # `processed_cols` tensors, depending on whether this is a row or col sweep.
-    with ops.control_dependencies(train_ops):
-      with ops.colocate_with(processed_rows):
-        update_processed_rows = state_ops.scatter_update(
-            processed_rows,
-            input_row_indices,
-            math_ops.logical_and(
-                self._is_row_sweep_var,
-                array_ops.ones_like(input_row_indices, dtype=dtypes.bool)))
-      with ops.colocate_with(processed_cols):
-        update_processed_cols = state_ops.scatter_update(
-            processed_cols,
-            input_col_indices,
-            math_ops.logical_and(
-                math_ops.logical_not(self._is_row_sweep_var),
-                array_ops.ones_like(input_col_indices, dtype=dtypes.bool)))
-      update_processed_op = control_flow_ops.group(
-          update_processed_rows, update_processed_cols)
-
-      with ops.control_dependencies([update_processed_op]):
-        is_sweep_done = math_ops.logical_or(
-            math_ops.reduce_all(processed_rows),
-            math_ops.reduce_all(processed_cols))
-        # Increments global step.
-        global_step = framework_variables.get_global_step()
-        if global_step is not None:
-          global_step_incr_op = state_ops.assign_add(
-              global_step, 1, name="global_step_incr").op
-        else:
-          global_step_incr_op = control_flow_ops.no_op()
-        # Increments completed sweeps.
-        completed_sweeps_incr_op = state_ops.assign_add(
-            self._completed_sweeps_var,
-            math_ops.cast(is_sweep_done, dtypes.int32),
-            use_locking=True).op
-        update_ops = control_flow_ops.group(
-            global_step_incr_op,
-            completed_sweeps_incr_op,
-            state_ops.assign(is_sweep_done_var, is_sweep_done))
-
-    return update_ops, is_sweep_done_var, switch_ops
 
   def before_run(self, run_context):
     """Runs the appropriate prep ops, and requests running update ops."""
-    # Runs the appropriate init ops and prep ops.
     sess = run_context.session
     is_sweep_done = sess.run(self._is_sweep_done_var)
     if not self._is_initialized:
-      logging.info("SweepHook running cache init op.")
+      logging.info("SweepHook running init op.")
       sess.run(self._init_op)
     if is_sweep_done:
+      logging.info("SweepHook starting the next sweep.")
       sess.run(self._switch_op)
+    is_row_sweep = sess.run(self._is_row_sweep_var)
     if is_sweep_done or not self._is_initialized:
-      logging.info("SweepHook running sweep prep ops.")
-      row_sweep = sess.run(self._is_row_sweep_var)
-      prep_ops = self._row_prep_ops if row_sweep else self._col_prep_ops
+      logging.info("SweepHook running prep ops for the {} sweep.".format(
+          "row" if is_row_sweep else "col"))
+      prep_ops = self._row_prep_ops if is_row_sweep else self._col_prep_ops
       for prep_op in prep_ops:
         sess.run(prep_op)
-
     self._is_initialized = True
-
-    # Requests running `self._update_op` jointly with the training op.
     logging.info("Next fit step starting.")
-    return session_run_hook.SessionRunArgs(fetches=[self._update_op])
+    return session_run_hook.SessionRunArgs(
+        fetches=[self._row_train_op if is_row_sweep else self._col_train_op])
 
-  def after_run(self, run_context, run_values):
-    logging.info("Fit step done.")
+
+class _IncrementGlobalStepHook(session_run_hook.SessionRunHook):
+  """Hook that increments the global step."""
+
+  def __init__(self):
+    global_step = training_util.get_global_step()
+    if global_step:
+      self._global_step_incr_op = state_ops.assign_add(
+          global_step, 1, name="global_step_incr").op
+    else:
+      self._global_step_incr_op = None
+
+  def before_run(self, run_context):
+    if self._global_step_incr_op:
+      run_context.session.run(self._global_step_incr_op)
 
 
 class _StopAtSweepHook(session_run_hook.SessionRunHook):
@@ -246,6 +151,9 @@ def _wals_factorization_model_function(features, labels, mode, params):
 
   Returns:
     A ModelFnOps object.
+
+  Raises:
+    ValueError: If `mode` is not recognized.
   """
   assert labels is None
   use_factors_weights_cache = (params["use_factors_weights_cache_for_training"]
@@ -269,86 +177,145 @@ def _wals_factorization_model_function(features, labels, mode, params):
       use_gramian_cache=use_gramian_cache)
 
   # Get input rows and cols. We either update rows or columns depending on
-  # the value of row_sweep, which is maintained using a session hook
+  # the value of row_sweep, which is maintained using a session hook.
   input_rows = features[WALSMatrixFactorization.INPUT_ROWS]
   input_cols = features[WALSMatrixFactorization.INPUT_COLS]
-  input_row_indices, _ = array_ops.unique(input_rows.indices[:, 0])
-  input_col_indices, _ = array_ops.unique(input_cols.indices[:, 0])
-
-  # Train ops, controlled using the SweepHook
-  # We need to run the following ops:
-  # Before a row sweep:
-  #   row_update_prep_gramian_op
-  #   initialize_row_update_op
-  # During a row sweep:
-  #   update_row_factors_op
-  # Before a col sweep:
-  #   col_update_prep_gramian_op
-  #   initialize_col_update_op
-  # During a col sweep:
-  #   update_col_factors_op
-
-  is_row_sweep_var = variable_scope.variable(
-      True,
-      trainable=False,
-      name="is_row_sweep",
-      collections=[ops.GraphKeys.GLOBAL_VARIABLES])
-  completed_sweeps_var = variable_scope.variable(
-      0,
-      trainable=False,
-      name=WALSMatrixFactorization.COMPLETED_SWEEPS,
-      collections=[ops.GraphKeys.GLOBAL_VARIABLES])
-
-  # The row sweep is determined by is_row_sweep_var (controlled by the
-  # sweep_hook) in TRAIN mode, and manually in EVAL mode.
-  is_row_sweep = (features[WALSMatrixFactorization.PROJECT_ROW]
-                  if mode == model_fn.ModeKeys.EVAL else is_row_sweep_var)
-
-  def update_row_factors():
-    return model.update_row_factors(sp_input=input_rows, transpose_input=False)
-
-  def update_col_factors():
-    return model.update_col_factors(sp_input=input_cols, transpose_input=True)
-
-  (_, train_op,
-   unregularized_loss, regularization, sum_weights) = control_flow_ops.cond(
-       is_row_sweep, update_row_factors, update_col_factors)
-  loss = unregularized_loss + regularization
-  root_weighted_squared_error = math_ops.sqrt(unregularized_loss / sum_weights)
-
-  row_prep_ops = [
-      model.row_update_prep_gramian_op, model.initialize_row_update_op
-  ]
-  col_prep_ops = [
-      model.col_update_prep_gramian_op, model.initialize_col_update_op
-  ]
-  init_ops = [model.worker_init]
-
-  sweep_hook = _SweepHook(
-      is_row_sweep_var,
-      [train_op, loss],
-      params["num_rows"],
-      params["num_cols"],
-      input_row_indices,
-      input_col_indices,
-      row_prep_ops,
-      col_prep_ops,
-      init_ops,
-      completed_sweeps_var)
-  training_hooks = [sweep_hook]
-  if max_sweeps is not None:
-    training_hooks.append(_StopAtSweepHook(max_sweeps))
-
-  # The root weighted squared error =
-  #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
-  summary.scalar("loss", loss)  # the estimated total training loss
-  summary.scalar("root_weighted_squared_error", root_weighted_squared_error)
-  summary.scalar("completed_sweeps", completed_sweeps_var)
-
-  # Prediction ops (only return predictions in INFER mode)
-  predictions = {}
-  if mode == model_fn.ModeKeys.INFER:
-    project_row = features[WALSMatrixFactorization.PROJECT_ROW]
+
+  # TRAIN mode:
+  if mode == model_fn.ModeKeys.TRAIN:
+    # Training consists of the following ops (controlled using a SweepHook).
+    # Before a row sweep:
+    #   row_update_prep_gramian_op
+    #   initialize_row_update_op
+    # During a row sweep:
+    #   update_row_factors_op
+    # Before a col sweep:
+    #   col_update_prep_gramian_op
+    #   initialize_col_update_op
+    # During a col sweep:
+    #   update_col_factors_op
+
+    is_row_sweep_var = variable_scope.variable(
+        True,
+        trainable=False,
+        name="is_row_sweep",
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+    is_sweep_done_var = variable_scope.variable(
+        False,
+        trainable=False,
+        name="is_sweep_done",
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+    completed_sweeps_var = variable_scope.variable(
+        0,
+        trainable=False,
+        name=WALSMatrixFactorization.COMPLETED_SWEEPS,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+    loss_var = variable_scope.variable(
+        0.,
+        trainable=False,
+        name=WALSMatrixFactorization.LOSS,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+    # The root weighted squared error =
+    #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
+    rwse_var = variable_scope.variable(
+        0.,
+        trainable=False,
+        name=WALSMatrixFactorization.RWSE,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+    summary.scalar("loss", loss_var)
+    summary.scalar("root_weighted_squared_error", rwse_var)
+    summary.scalar("completed_sweeps", completed_sweeps_var)
+
+    def create_axis_ops(sp_input, num_items, update_fn, axis_name):
+      """Creates book-keeping and training ops for a given axis.
+
+      Args:
+        sp_input: A SparseTensor corresponding to the row or column batch.
+        num_items: An integer, the total number of items of this axis.
+        update_fn: A function that takes one argument (`sp_input`), and that
+        returns a tuple of
+          * new_factors: A flot Tensor of the factor values after update.
+          * update_op: a TensorFlow op which updates the factors.
+          * loss: A float Tensor, the unregularized loss.
+          * reg_loss: A float Tensor, the regularization loss.
+          * sum_weights: A float Tensor, the sum of factor weights.
+        axis_name: A string that specifies the name of the axis.
+
+      Returns:
+        A tuple consisting of:
+          * reset_processed_items_op: A TensorFlow op, to be run before the
+            beginning of any sweep. It marks all items as not-processed.
+          * axis_train_op: A Tensorflow op, to be run during this axis' sweeps.
+      """
+      processed_items_init = array_ops.fill(dims=[num_items], value=False)
+      with ops.colocate_with(processed_items_init):
+        processed_items = variable_scope.variable(
+            processed_items_init,
+            collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+            trainable=False,
+            name="processed_" + axis_name)
+      _, update_op, loss, reg, sum_weights = update_fn(sp_input)
+      input_indices = sp_input.indices[:, 0]
+      with ops.control_dependencies([
+          update_op,
+          state_ops.assign(loss_var, loss + reg),
+          state_ops.assign(rwse_var, math_ops.sqrt(loss / sum_weights))]):
+        with ops.colocate_with(processed_items):
+          update_processed_items = state_ops.scatter_update(
+              processed_items,
+              input_indices,
+              array_ops.ones_like(input_indices, dtype=dtypes.bool),
+              name="update_processed_{}_indices".format(axis_name))
+        with ops.control_dependencies([update_processed_items]):
+          is_sweep_done = math_ops.reduce_all(processed_items)
+          axis_train_op = control_flow_ops.group(
+              state_ops.assign(is_sweep_done_var, is_sweep_done),
+              state_ops.assign_add(
+                  completed_sweeps_var,
+                  math_ops.cast(is_sweep_done, dtypes.int32)),
+              name="{}_sweep_train_op".format(axis_name))
+      return processed_items.initializer, axis_train_op
+
+    reset_processed_rows_op, row_train_op = create_axis_ops(
+        input_rows,
+        params["num_rows"],
+        lambda x: model.update_row_factors(sp_input=x, transpose_input=False),
+        "rows")
+    reset_processed_cols_op, col_train_op = create_axis_ops(
+        input_cols,
+        params["num_cols"],
+        lambda x: model.update_col_factors(sp_input=x, transpose_input=True),
+        "cols")
+    switch_op = control_flow_ops.group(
+        state_ops.assign(
+            is_row_sweep_var, math_ops.logical_not(is_row_sweep_var)),
+        reset_processed_rows_op,
+        reset_processed_cols_op,
+        name="sweep_switch_op")
+    row_prep_ops = [
+        model.row_update_prep_gramian_op, model.initialize_row_update_op]
+    col_prep_ops = [
+        model.col_update_prep_gramian_op, model.initialize_col_update_op]
+    init_op = model.worker_init
+    sweep_hook = _SweepHook(
+        is_row_sweep_var, is_sweep_done_var, init_op,
+        row_prep_ops, col_prep_ops, row_train_op, col_train_op, switch_op)
+    global_step_hook = _IncrementGlobalStepHook()
+    training_hooks = [sweep_hook, global_step_hook]
+    if max_sweeps is not None:
+      training_hooks.append(_StopAtSweepHook(max_sweeps))
+
+    return model_fn.ModelFnOps(
+        mode=model_fn.ModeKeys.TRAIN,
+        predictions={},
+        loss=loss_var,
+        eval_metric_ops={},
+        train_op=control_flow_ops.no_op(),
+        training_hooks=training_hooks)
+
+  # INFER mode
+  elif mode == model_fn.ModeKeys.INFER:
     projection_weights = features.get(
         WALSMatrixFactorization.PROJECTION_WEIGHTS)
 
@@ -364,17 +331,45 @@ def _wals_factorization_model_function(features, labels, mode, params):
           projection_weights=projection_weights,
           transpose_input=True)
 
-    predictions[WALSMatrixFactorization.PROJECTION_RESULT] = (
-        control_flow_ops.cond(project_row, get_row_projection,
-                              get_col_projection))
+    predictions = {
+        WALSMatrixFactorization.PROJECTION_RESULT: control_flow_ops.cond(
+            features[WALSMatrixFactorization.PROJECT_ROW],
+            get_row_projection,
+            get_col_projection)
+    }
 
-  return model_fn.ModelFnOps(
-      mode=mode,
-      predictions=predictions,
-      loss=loss,
-      eval_metric_ops={},
-      train_op=train_op,
-      training_hooks=training_hooks)
+    return model_fn.ModelFnOps(
+        mode=model_fn.ModeKeys.INFER,
+        predictions=predictions,
+        loss=None,
+        eval_metric_ops={},
+        train_op=control_flow_ops.no_op(),
+        training_hooks=[])
+
+  # EVAL mode
+  elif mode == model_fn.ModeKeys.EVAL:
+    def get_row_loss():
+      _, _, loss, reg, _ = model.update_row_factors(
+          sp_input=input_rows, transpose_input=False)
+      return loss + reg
+    def get_col_loss():
+      _, _, loss, reg, _ = model.update_col_factors(
+          sp_input=input_cols, transpose_input=True)
+      return loss + reg
+    loss = control_flow_ops.cond(
+        features[WALSMatrixFactorization.PROJECT_ROW],
+        get_row_loss,
+        get_col_loss)
+    return model_fn.ModelFnOps(
+        mode=model_fn.ModeKeys.EVAL,
+        predictions={},
+        loss=loss,
+        eval_metric_ops={},
+        train_op=control_flow_ops.no_op(),
+        training_hooks=[])
+
+  else:
+    raise ValueError("mode=%s is not recognized." % str(mode))
 
 
 class WALSMatrixFactorization(estimator.Estimator):
@@ -452,6 +447,10 @@ class WALSMatrixFactorization(estimator.Estimator):
   PROJECTION_RESULT = "projection"
   # Name of the completed_sweeps variable
   COMPLETED_SWEEPS = "completed_sweeps"
+  # Name of the loss variable
+  LOSS = "WALS_loss"
+  # Name of the Root Weighted Squared Error variable
+  RWSE = "WALS_RWSE"
 
   def __init__(self,
                num_rows,
diff --git a/tensorflow/contrib/factorization/python/ops/wals_test.py b/tensorflow/contrib/factorization/python/ops/wals_test.py
index 8bd72b7025aad80e387171b93b9b264da3ed0f66..36b483c6d7a59bba78b7fa22aac0714e278f22cc 100644
--- a/tensorflow/contrib/factorization/python/ops/wals_test.py
+++ b/tensorflow/contrib/factorization/python/ops/wals_test.py
@@ -417,73 +417,67 @@ class WALSMatrixFactorizationUnsupportedTest(test.TestCase):
 
 class SweepHookTest(test.TestCase):
 
-  def setUp(self):
-    self._num_rows = 5
-    self._num_cols = 7
-    self._train_op = control_flow_ops.no_op()
-    self._row_prep_done = variables.Variable(False)
-    self._col_prep_done = variables.Variable(False)
-    self._init_done = variables.Variable(False)
-    self._row_prep_ops = [state_ops.assign(self._row_prep_done, True)]
-    self._col_prep_ops = [state_ops.assign(self._col_prep_done, True)]
-    self._init_ops = [state_ops.assign(self._init_done, True)]
-    self._input_row_indices_ph = array_ops.placeholder(dtypes.int64)
-    self._input_col_indices_ph = array_ops.placeholder(dtypes.int64)
-
   def test_sweeps(self):
-    def ind_feed(row_indices, col_indices):
-      return {
-          self._input_row_indices_ph: row_indices,
-          self._input_col_indices_ph: col_indices
-      }
+    is_row_sweep_var = variables.Variable(True)
+    is_sweep_done_var = variables.Variable(False)
+    init_done = variables.Variable(False)
+    row_prep_done = variables.Variable(False)
+    col_prep_done = variables.Variable(False)
+    row_train_done = variables.Variable(False)
+    col_train_done = variables.Variable(False)
+
+    init_op = state_ops.assign(init_done, True)
+    row_prep_op = state_ops.assign(row_prep_done, True)
+    col_prep_op = state_ops.assign(col_prep_done, True)
+    row_train_op = state_ops.assign(row_train_done, True)
+    col_train_op = state_ops.assign(col_train_done, True)
+    train_op = control_flow_ops.no_op()
+    switch_op = control_flow_ops.group(
+        state_ops.assign(is_sweep_done_var, False),
+        state_ops.assign(is_row_sweep_var,
+                         math_ops.logical_not(is_row_sweep_var)))
+    mark_sweep_done = state_ops.assign(is_sweep_done_var, True)
 
     with self.test_session() as sess:
-      is_row_sweep_var = variables.Variable(True)
-      completed_sweeps_var = variables.Variable(0)
       sweep_hook = wals_lib._SweepHook(
           is_row_sweep_var,
-          [self._train_op],
-          self._num_rows,
-          self._num_cols,
-          self._input_row_indices_ph,
-          self._input_col_indices_ph,
-          self._row_prep_ops,
-          self._col_prep_ops,
-          self._init_ops,
-          completed_sweeps_var)
+          is_sweep_done_var,
+          init_op,
+          [row_prep_op],
+          [col_prep_op],
+          row_train_op,
+          col_train_op,
+          switch_op)
       mon_sess = monitored_session._HookedSession(sess, [sweep_hook])
       sess.run([variables.global_variables_initializer()])
 
-      # Init ops should run before the first run. Row sweep not completed.
-      mon_sess.run(self._train_op, ind_feed([0, 1, 2], []))
-      self.assertTrue(sess.run(self._init_done),
-                      msg='init ops not run by the sweep_hook')
-      self.assertTrue(sess.run(self._row_prep_done),
-                      msg='row_prep not run by the sweep_hook')
-      self.assertTrue(sess.run(is_row_sweep_var),
-                      msg='Row sweep is not complete but is_row_sweep is '
-                      'False.')
-      # Row sweep completed.
-      mon_sess.run(self._train_op, ind_feed([3, 4], [0, 1, 2, 3, 4, 5, 6]))
-      self.assertTrue(sess.run(completed_sweeps_var) == 1,
-                      msg='Completed sweeps should be equal to 1.')
-      self.assertTrue(sess.run(sweep_hook._is_sweep_done_var),
-                      msg='Sweep is complete but is_sweep_done is False.')
-      # Col init ops should run. Col sweep not completed.
-      mon_sess.run(self._train_op, ind_feed([], [0, 1, 2, 3, 4]))
-      self.assertTrue(sess.run(self._col_prep_done),
-                      msg='col_prep not run by the sweep_hook')
-      self.assertFalse(sess.run(is_row_sweep_var),
-                       msg='Col sweep is not complete but is_row_sweep is '
-                       'True.')
-      self.assertFalse(sess.run(sweep_hook._is_sweep_done_var),
-                       msg='Sweep is not complete but is_sweep_done is True.')
-      # Col sweep completed.
-      mon_sess.run(self._train_op, ind_feed([], [4, 5, 6]))
-      self.assertTrue(sess.run(sweep_hook._is_sweep_done_var),
-                      msg='Sweep is complete but is_sweep_done is False.')
-      self.assertTrue(sess.run(completed_sweeps_var) == 2,
-                      msg='Completed sweeps should be equal to 2.')
+      # Row sweep.
+      mon_sess.run(train_op)
+      self.assertTrue(sess.run(init_done),
+                      msg='init op not run by the Sweephook')
+      self.assertTrue(sess.run(row_prep_done),
+                      msg='row_prep_op not run by the SweepHook')
+      self.assertTrue(sess.run(row_train_done),
+                      msg='row_train_op not run by the SweepHook')
+      self.assertTrue(
+          sess.run(is_row_sweep_var),
+          msg='Row sweep is not complete but is_row_sweep_var is False.')
+      # Col sweep.
+      mon_sess.run(mark_sweep_done)
+      mon_sess.run(train_op)
+      self.assertTrue(sess.run(col_prep_done),
+                      msg='col_prep_op not run by the SweepHook')
+      self.assertTrue(sess.run(col_train_done),
+                      msg='col_train_op not run by the SweepHook')
+      self.assertFalse(
+          sess.run(is_row_sweep_var),
+          msg='Col sweep is not complete but is_row_sweep_var is True.')
+      # Row sweep.
+      mon_sess.run(mark_sweep_done)
+      mon_sess.run(train_op)
+      self.assertTrue(
+          sess.run(is_row_sweep_var),
+          msg='Col sweep is complete but is_row_sweep_var is False.')
 
 
 class StopAtSweepHookTest(test.TestCase):
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index 7a5a4cb8c9499b950a3ad89be710e48474d5791e..eccce99071dc1477cf4f3bb152f3304b3b0fc35a 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -47,10 +47,25 @@ cc_library(
     alwayslink = 1,
 )
 
+cc_library(
+    name = "decode_video_op_cc",
+    srcs = ["decode_video_op.cc"],
+    copts = tf_copts(),
+    linkstatic = 1,
+    visibility = ["//visibility:private"],
+    deps = [
+        "//tensorflow/contrib/ffmpeg/default:ffmpeg_lib",
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_library(
     name = "ffmpeg.so",
     deps = [
         ":decode_audio_op_cc",
+        ":decode_video_op_cc",
         ":encode_audio_op_cc",
     ],
 )
@@ -59,6 +74,7 @@ cc_library(
     name = "ffmpeg_op_lib",
     deps = [
         ":decode_audio_op_cc",
+        ":decode_video_op_cc",
         ":encode_audio_op_cc",
     ],
 )
@@ -81,6 +97,15 @@ tf_gen_op_wrapper_py(
     ],
 )
 
+tf_gen_op_wrapper_py(
+    name = "decode_video_op_py",
+    require_shape_functions = True,
+    visibility = ["//visibility:private"],
+    deps = [
+        ":decode_video_op_cc",
+    ],
+)
+
 tf_py_test(
     name = "decode_audio_op_test",
     srcs = ["decode_audio_op_test.py"],
@@ -115,6 +140,27 @@ tf_py_test(
     tags = ["manual"],
 )
 
+tf_py_test(
+    name = "decode_video_op_test",
+    size = "small",
+    srcs = ["decode_video_op_test.py"],
+    additional_deps = [
+        ":ffmpeg_ops_py",
+        "@six_archive//:six",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:image_ops",
+    ],
+    data = [
+        ":test_data",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
 py_library(
     name = "ffmpeg_ops_py",
     srcs = [
@@ -126,6 +172,7 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         ":decode_audio_op_py",
+        ":decode_video_op_py",
         ":encode_audio_op_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index 2bcb7284e10991b19ee5607147371e8d505c7732..daba965a98893b992abdc598ec713f13020d6e91 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -26,9 +26,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio
+from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio
+from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['decode_audio', 'encode_audio']
+_allowed_symbols = ['decode_audio', 'encode_audio', 'decode_video']
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/ffmpeg/decode_audio_op.cc b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
index 4b1c8a337e10c7025ca06e2ed6e1b934716dc1d0..92fad70b1f9cc55e0690a3fbb35abcf56aa68f16 100644
--- a/tensorflow/contrib/ffmpeg/decode_audio_op.cc
+++ b/tensorflow/contrib/ffmpeg/decode_audio_op.cc
@@ -37,29 +37,6 @@ namespace {
 // https://www.ffmpeg.org/ffmpeg-formats.html
 const char* kValidFileFormats[] = {"mp3", "mp4", "ogg", "wav"};
 
-// Writes binary data to a file.
-Status WriteFile(const string& filename, tensorflow::StringPiece contents) {
-  Env& env = *Env::Default();
-  std::unique_ptr<WritableFile> file;
-  TF_RETURN_IF_ERROR(env.NewWritableFile(filename, &file));
-  TF_RETURN_IF_ERROR(file->Append(contents));
-  TF_RETURN_IF_ERROR(file->Close());
-  return Status::OK();
-}
-
-// Cleans up a file on destruction.
-class FileDeleter {
- public:
-  explicit FileDeleter(const string& filename) : filename_(filename) {}
-  ~FileDeleter() {
-    Env& env = *Env::Default();
-    env.DeleteFile(filename_).IgnoreError();
-  }
-
- private:
-  const string filename_;
-};
-
 /*
  * Decoding implementation, shared across V1 and V2 ops. Creates a new
  * output in the context.
@@ -69,7 +46,7 @@ void Decode(OpKernelContext* context,
             const string& file_format, const int32 samples_per_second,
             const int32 channel_count) {
   // Write the input data to a temp file.
-  const string temp_filename = GetTempFilename(file_format);
+  const string temp_filename = io::GetTempFilename(file_format);
   OP_REQUIRES_OK(context, WriteFile(temp_filename, file_contents));
   FileDeleter deleter(temp_filename);
 
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op.cc b/tensorflow/contrib/ffmpeg/decode_video_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d44032968d559bec14722902a4d47d22c46ea4aa
--- /dev/null
+++ b/tensorflow/contrib/ffmpeg/decode_video_op.cc
@@ -0,0 +1,118 @@
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include <stdlib.h>
+
+#include <cstdio>
+#include <set>
+
+#include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace ffmpeg {
+
+class DecodeVideoOp : public OpKernel {
+ public:
+  explicit DecodeVideoOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES(
+        context, context->num_inputs() == 1,
+        errors::InvalidArgument("DecodeVideo requires exactly 1 input."));
+    const Tensor& contents_tensor = context->input(0);
+
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(contents_tensor.shape()),
+                errors::InvalidArgument(
+                    "contents must be a rank-0 tensor but got shape ",
+                    contents_tensor.shape().DebugString()));
+    const tensorflow::StringPiece contents = contents_tensor.scalar<string>()();
+
+    // Write the input data to a temp file.
+    string extension;
+    const string temp_filename = io::GetTempFilename(extension);
+    OP_REQUIRES_OK(context, WriteFile(temp_filename, contents));
+    FileDeleter deleter(temp_filename);
+
+    uint32 width = 0;
+    uint32 height = 0;
+    uint32 frames = 0;
+
+    // Run FFmpeg on the data and verify results.
+    std::vector<uint8> output_data;
+    const Status result = ffmpeg::ReadVideoFile(temp_filename, &output_data,
+                                                &width, &height, &frames);
+    if (result.code() == error::Code::NOT_FOUND) {
+      OP_REQUIRES(
+          context, result.ok(),
+          errors::Unavailable("FFmpeg must be installed to run this op. FFmpeg "
+                              "can be found at http://www.ffmpeg.org."));
+    } else if (result.code() == error::UNKNOWN) {
+      LOG(ERROR) << "Ffmpeg failed with error '" << result.error_message()
+                 << "'. Returning empty tensor.";
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(context,
+                     context->allocate_output(0, TensorShape({0, 0}), &output));
+      return;
+    } else {
+      OP_REQUIRES_OK(context, result);
+    }
+    OP_REQUIRES(context, !output_data.empty(),
+                errors::Unknown("No output created by FFmpeg."));
+    OP_REQUIRES(
+        context, output_data.size() == (frames * height * width * 3),
+        errors::Unknown("Output created by FFmpeg [", output_data.size(),
+                        "] does not match description [", frames, ", ", height,
+                        ", ", width, ", 3]"));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(
+                       0, TensorShape({frames, height, width, 3}), &output));
+    auto output_flat = output->flat<uint8>();
+    std::copy_n(output_data.begin(), output_data.size(), &output_flat(0));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeVideo").Device(DEVICE_CPU), DecodeVideoOp);
+
+REGISTER_OP("DecodeVideo")
+    .Input("contents: string")
+    .Output("output: uint8")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->UnknownShapeOfRank(4));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Processes the contents of an audio file into a tensor using FFmpeg to decode
+the file.
+
+One row of the tensor is created for each channel in the audio file. Each
+channel contains audio samples starting at the beginning of the audio and
+having `1/samples_per_second` time between them. If the `channel_count` is
+different from the contents of the file, channels will be merged or created.
+
+contents: The binary audio file contents, as a string or rank-0 string
+    tensor.
+)doc");
+
+}  // namespace ffmpeg
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/ffmpeg/decode_video_op_test.py b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b43b6b8919223bd7731209d5423b142601396ea5
--- /dev/null
+++ b/tensorflow/contrib/ffmpeg/decode_video_op_test.py
@@ -0,0 +1,69 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for third_party.tensorflow.contrib.ffmpeg.decode_video_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os.path
+
+import six  # pylint: disable=unused-import
+
+from tensorflow.contrib import ffmpeg
+from tensorflow.python.ops import image_ops
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
+
+
+class DecodeVideoOpTest(test.TestCase):
+
+  def _loadFileAndTest(self, filename, width, height, frames, bmp_filename,
+                       index):
+    """Loads an video file and validates the output tensor.
+
+    Args:
+      filename: The filename of the input file.
+      width: The width of the video.
+      height: The height of the video.
+      frames: The frames of the video.
+      bmp_filename: The filename for the bmp file.
+      index: Index location inside the video.
+    """
+    with self.test_session():
+      path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
+                          filename)
+      with open(path, 'rb') as f:
+        contents = f.read()
+
+      bmp_path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
+                              bmp_filename)
+      with open(bmp_path, 'rb') as f:
+        bmp_contents = f.read()
+
+      image_op = image_ops.decode_bmp(bmp_contents)
+      image = image_op.eval()
+      self.assertEqual(image.shape, (height, width, 3))
+      video_op = ffmpeg.decode_video(contents)
+      video = video_op.eval()
+      self.assertEqual(video.shape, (frames, height, width, 3))
+      self.assertAllEqual(video[index, :, :, :], image)
+
+  def testMp4(self):
+    self._loadFileAndTest('small.mp4', 560, 320, 166, 'small_100.bmp', 99)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
index 545a4386d043af604a747b8b5a8103101812b177..1245f515fe84f02e8470dbf941243bcd9834f3d0 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib.cc
@@ -16,6 +16,7 @@
 #include "tensorflow/contrib/ffmpeg/ffmpeg_lib.h"
 
 #include <errno.h>
+#include <fcntl.h>
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -25,6 +26,7 @@
 #include <vector>
 
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
@@ -38,28 +40,45 @@ namespace {
 const char kFfmpegExecutable[] = "ffmpeg";
 const int32 kDefaultProbeSize = 5000000;  // 5MB
 
-std::vector<string> FfmpegCommandLine(const string& input_filename,
-                                      const string& output_filename,
-                                      const string& input_format_id,
-                                      int32 samples_per_second,
-                                      int32 channel_count) {
-  return {
-    "-nostats",  // No additional progress display.
-    "-nostdin",  // No interactive commands accepted.
-    "-f", input_format_id,  // eg: "mp3"
-    "-probesize", StrCat(kDefaultProbeSize),
-    "-i", input_filename,
-    "-loglevel", "info",  // Enable verbose logging to support debugging.
-    "-map_metadata", "-1",  // Copy global metadata from input to output.
-    "-vn",  // No video recording.
-    "-ac:a:0", StrCat(channel_count),
-    "-ar:a:0", StrCat(samples_per_second),
-    // Output set (in several ways) to signed 16-bit little-endian ints.
-    "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
-    "-sn",  // No subtitle recording.
-    "-y",  // Overwrite output file.
-    StrCat(output_filename)
-  };
+std::vector<string> FfmpegAudioCommandLine(const string& input_filename,
+                                           const string& output_filename,
+                                           const string& input_format_id,
+                                           int32 samples_per_second,
+                                           int32 channel_count) {
+  return {"-nostats",             // No additional progress display.
+          "-nostdin",             // No interactive commands accepted.
+          "-f", input_format_id,  // eg: "mp3"
+          "-probesize", StrCat(kDefaultProbeSize), "-i", input_filename,
+          "-loglevel", "info",  // Enable verbose logging to support debugging.
+          "-map_metadata", "-1",  // Copy global metadata from input to output.
+          "-vn",                  // No video recording.
+          "-ac:a:0", StrCat(channel_count), "-ar:a:0",
+          StrCat(samples_per_second),
+          // Output set (in several ways) to signed 16-bit little-endian ints.
+          "-codec:a:0", "pcm_s16le", "-sample_fmt", "s16", "-f", "s16le",
+          "-sn",  // No subtitle recording.
+          "-y",   // Overwrite output file.
+          StrCat(output_filename)};
+}
+
+std::vector<string> FfmpegVideoCommandLine(const string& input_filename,
+                                           const string& output_filename) {
+  return {"-nostats",  // No additional progress display.
+          "-nostdin",  // No interactive commands accepted.
+          "-i",
+          input_filename,
+          "-f",
+          "image2pipe",
+          "-probesize",
+          StrCat(kDefaultProbeSize),
+          "-loglevel",
+          "info",  // Enable verbose logging to support debugging.
+          "-vcodec",
+          "rawvideo",
+          "-pix_fmt",
+          "rgb24",
+          "-y",  // Overwrite output file.
+          StrCat(output_filename)};
 }
 
 // Is a named binary installed and executable by the current process?
@@ -106,7 +125,7 @@ bool IsBinaryInstalled(const string& binary_name) {
   ::execvp(kFfmpegExecutable, args_chars.data());
   // exec only returns on error.
   const int error = errno;
-  LOG(ERROR) << "FFmpeg could not be executed: " << error;
+  LOG(ERROR) << "FFmpeg could not be executed: " << strerror(error);
   ::_exit(error);
 }
 
@@ -198,52 +217,101 @@ string BuildWavFile(int32 samples_per_second, int32 channel_count,
   return data;
 }
 
-// Returns a unique number every time it is called.
-int64 UniqueId() {
-  static mutex mu(LINKER_INITIALIZED);
-  static int64 id = 0;
-  mutex_lock l(mu);
-  return ++id;
-}
-
-}  // namespace
-
-string GetTempFilename(const string& extension) {
-  for (const char* dir : std::vector<const char*>(
-           {getenv("TEST_TMPDIR"), getenv("TMPDIR"), getenv("TMP"), "/tmp"})) {
-    if (!dir || !dir[0]) {
+Status ReadInfoFile(const string& filename, uint32* width, uint32* height,
+                    uint32* frames) {
+  string data;
+  TF_QCHECK_OK(ReadFileToString(Env::Default(), filename, &data))
+      << "Could not read FFmpeg file: " << filename;
+  bool in_output = false;
+  bool in_mapping = false;
+  uint32 frames_value = 0;
+  uint32 height_value = 0;
+  uint32 width_value = 0;
+  for (const string& line : str_util::Split(data, '\n')) {
+    // Output starts with the first line of `Output #..`.
+    // Further processing output region starts next line so we could continue
+    // the loop.
+    if (!in_output && line.find("Output #") == 0) {
+      in_output = true;
+      in_mapping = false;
       continue;
     }
-    struct stat statbuf;
-    if (!stat(dir, &statbuf) && S_ISDIR(statbuf.st_mode)) {
-      // UniqueId is added here because mkstemps is not as thread safe as it
-      // looks. https://github.com/tensorflow/tensorflow/issues/5804 shows
-      // the problem.
-      string tmp_filepath = io::JoinPath(
-          dir,
-          StrCat("tmp_file_tensorflow_", UniqueId(), "_XXXXXX.", extension));
-      int fd = mkstemps(&tmp_filepath[0], extension.length() + 1);
-      if (fd < 0) {
-        LOG(FATAL) << "Failed to create temp file.";
-      } else {
-        close(fd);
-        return tmp_filepath;
+    // Stream mapping starts with the first line of `Stream mapping`, it also
+    // signals the end of Output section.
+    // Further processing of stream mapping region starts next line so we could
+    // continue the loop.
+    if (!in_mapping && line.find("Stream mapping:") == 0) {
+      in_output = false;
+      in_mapping = true;
+      continue;
+    }
+    if (in_output) {
+      // We only look for the first stream in output `Stream #0`.
+      // Once processed we will not further process output section.
+      if (line.find("    Stream #") == 0) {
+        size_t p = line.find(", rgb24, ", 24);
+        if (p != std::string::npos) {
+          string rgb24 = line.substr(p + 9, line.find(" ", p + 9));
+          rgb24 = rgb24.substr(0, rgb24.find(","));
+          string rgb24_width = rgb24.substr(0, rgb24.find("x"));
+          string rgb24_height = rgb24.substr(rgb24_width.length() + 1);
+          if (strings::safe_strtou32(rgb24_width, &width_value) &&
+              strings::safe_strtou32(rgb24_height, &height_value)) {
+            in_output = false;
+          }
+        }
+      }
+      continue;
+    }
+    if (in_mapping) {
+      // We only look for the first stream mapping to have the number of the
+      // frames.
+      // Once processed we will not further process stream mapping section.
+      if (line.find("frame=  ") == 0) {
+        string number = line.substr(8, line.find(" ", 8));
+        number = number.substr(0, number.find(" "));
+        if (strings::safe_strtou32(number, &frames_value)) {
+          in_mapping = false;
+        }
       }
+      continue;
     }
   }
-  LOG(FATAL) << "No temp directory found.";
+  if (frames_value == 0 || height_value == 0 || width_value == 0) {
+    return errors::Unknown("Not enough video info returned by FFmpeg [",
+                           frames_value, ", ", height_value, ", ", width_value,
+                           ", 3]");
+  }
+  *width = width_value;
+  *height = height_value;
+  *frames = frames_value;
+  return Status::OK();
 }
 
-Status ReadAudioFile(const string& filename,
-                     const string& audio_format_id,
-                     int32 samples_per_second,
-                     int32 channel_count,
+}  // namespace
+
+FileDeleter::~FileDeleter() {
+  Env& env = *Env::Default();
+  env.DeleteFile(filename_).IgnoreError();
+}
+
+Status WriteFile(const string& filename, StringPiece contents) {
+  Env& env = *Env::Default();
+  std::unique_ptr<WritableFile> file;
+  TF_RETURN_IF_ERROR(env.NewWritableFile(filename, &file));
+  TF_RETURN_IF_ERROR(file->Append(contents));
+  TF_RETURN_IF_ERROR(file->Close());
+  return Status::OK();
+}
+
+Status ReadAudioFile(const string& filename, const string& audio_format_id,
+                     int32 samples_per_second, int32 channel_count,
                      std::vector<float>* output_samples) {
   // Create an argument list.
-  string output_filename = GetTempFilename("raw");
+  string output_filename = io::GetTempFilename("raw");
   const std::vector<string> args =
-      FfmpegCommandLine(filename, output_filename, audio_format_id,
-                        samples_per_second, channel_count);
+      FfmpegAudioCommandLine(filename, output_filename, audio_format_id,
+                             samples_per_second, channel_count);
 
   // Unfortunately, it's impossible to differentiate an exec failure due to the
   // binary being missing and an error from the binary's execution. Therefore,
@@ -256,7 +324,8 @@ Status ReadAudioFile(const string& filename,
   // Execute ffmpeg and report errors.
   pid_t child_pid = ::fork();
   if (child_pid < 0) {
-    return Status(error::Code::UNKNOWN, StrCat("fork failed: ", errno));
+    return Status(error::Code::UNKNOWN,
+                  StrCat("fork failed: ", strerror(errno)));
   }
   if (child_pid == 0) {
     ExecuteFfmpeg(args);
@@ -285,5 +354,63 @@ Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second,
   return Status::OK();
 }
 
+Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
+                     uint32* width, uint32* height, uint32* frames) {
+  if (!IsBinaryInstalled(kFfmpegExecutable)) {
+    return Status(error::Code::NOT_FOUND, StrCat("FFmpeg could not be found."));
+  }
+
+  string output_filename = io::GetTempFilename("raw");
+  string stderr_filename = io::GetTempFilename("err");
+
+  // Create an argument list.
+  const std::vector<string> args =
+      FfmpegVideoCommandLine(filename, output_filename);
+
+  // Execute ffmpeg and report errors.
+  pid_t child_pid = ::fork();
+  if (child_pid < 0) {
+    return Status(error::Code::UNKNOWN,
+                  StrCat("fork failed: ", strerror(errno)));
+  }
+  if (child_pid == 0) {
+    const int fd =
+        open(stderr_filename.c_str(), O_RDWR | O_CREAT | O_APPEND, 0600);
+    if (fd < 0) {
+      const int error = errno;
+      LOG(ERROR) << "FFmpeg stderr file could not be created: "
+                 << strerror(error);
+      ::_exit(error);
+    }
+    close(STDERR_FILENO);
+    dup2(fd, STDERR_FILENO);
+    ExecuteFfmpeg(args);
+  } else {
+    int status_code;
+    if (::waitpid(child_pid, &status_code, 0) < 0) {
+      return Status(error::Code::UNKNOWN,
+                    StrCat("waitpid failed: ", strerror(errno)));
+    }
+    if (status_code) {
+      return Status(error::Code::UNKNOWN,
+                    StrCat("FFmpeg execution failed: ", status_code));
+    }
+
+    TF_QCHECK_OK(ReadInfoFile(stderr_filename, width, height, frames))
+        << "Could not read FFmpeg stderr file: " << stderr_filename;
+
+    string raw_data;
+    TF_QCHECK_OK(ReadFileToString(Env::Default(), output_filename, &raw_data))
+        << "Could not read FFmpeg output file: " << output_filename;
+    output_data->resize(raw_data.size());
+    std::copy_n(raw_data.data(), raw_data.size(), output_data->begin());
+
+    TF_QCHECK_OK(Env::Default()->DeleteFile(output_filename))
+        << output_filename;
+    TF_QCHECK_OK(Env::Default()->DeleteFile(stderr_filename))
+        << stderr_filename;
+    return Status::OK();
+  }
+}
 }  // namespace ffmpeg
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
index 2871c1462894c6a4ddef63e9178272df0d14824c..85b61b26163d87a10d4e316720b4f633e038bbec 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_test.cc
@@ -39,7 +39,7 @@ const char kTestMp3Filename[] =
 
 // Set to true via a command line flag iff the test is expected to have FFmpeg
 // installed.
-mutex mu;
+mutex mu(LINKER_INITIALIZED);
 bool should_ffmpeg_be_installed GUARDED_BY(mu) = false;
 
 string ParseTestFlags(int* argc, char** argv) {
diff --git a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
index 7176f3b550679555d5ab3b70f2b360a90eaee253..36fc71794b06e0f3cb86c40b325ce50e8999c667 100644
--- a/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
+++ b/tensorflow/contrib/ffmpeg/default/ffmpeg_lib_utility_test.cc
@@ -20,7 +20,10 @@
 #include <string>
 #include <vector>
 
+
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/test.h"
@@ -49,7 +52,7 @@ TEST(FfmpegLibTest, TestTempDirectoryThreading) {
     pool.Schedule([&mu, &temp_filenames, environment]() {
       std::array<string, kStringsPerItem> buffer;
       for (int32 j = 0; j < kStringsPerItem; ++j) {
-        buffer[j] = GetTempFilename("mp3");
+        buffer[j] = io::GetTempFilename("mp3");
         TF_QCHECK_OK(environment->DeleteFile(buffer[j]));
       }
       mutex_lock l(mu);
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
index f64007c81d74276d42c9d6ebd7c8f46cda6b7d72..c5ea1432bf8b61c87615074a93a45325371c4c87 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h
@@ -24,16 +24,24 @@
 namespace tensorflow {
 namespace ffmpeg {
 
-// Gets a temp filename in an appropriate location.
-string GetTempFilename(const string& extension);
+// Cleans up a file on destruction.
+class FileDeleter {
+ public:
+  explicit FileDeleter(const string& filename) : filename_(filename) {}
+  ~FileDeleter();
+
+ private:
+  const string filename_;
+};
+
+// Writes binary data to a file.
+Status WriteFile(const string& filename, tensorflow::StringPiece contents);
 
 // Reads an audio file using ffmpeg and converts it into an array of samples in
 // [-1.0, 1.0]. If there are multiple channels in the audio then each frame will
 // contain a separate sample for each channel. Frames are ordered by time.
-Status ReadAudioFile(const string& filename,
-                     const string& audio_format_id,
-                     int32 samples_per_second,
-                     int32 channel_count,
+Status ReadAudioFile(const string& filename, const string& audio_format_id,
+                     int32 samples_per_second, int32 channel_count,
                      std::vector<float>* output_samples);
 
 // Creates an audio file using ffmpeg in a specific format. The samples are in
@@ -45,6 +53,11 @@ Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second,
                        int32 samples_per_second, int32 channel_count,
                        const std::vector<float>& samples, string* output_data);
 
+// Reads an video file using ffmpeg adn converts it into a RGB24 in uint8
+// [frames, height, width, 3]. The w, h, and frames are obtained from ffmpeg.
+Status ReadVideoFile(const string& filename, std::vector<uint8>* output_data,
+                     uint32* width, uint32* height, uint32* frames);
+
 }  // namespace ffmpeg
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 18b0b8b812c908cff62a241aa59b3a53021123f4..08b5a6ea48c2d4959af68a2ee9d27d21c6245457 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -19,7 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
+from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
+from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
@@ -89,3 +91,19 @@ def encode_audio(audio, file_format=None, samples_per_second=None):
 
 
 ops.NotDifferentiable('EncodeAudio')
+
+
+def decode_video(contents):
+  """Create an op that decodes the contents of a video file.
+
+  Args:
+    contents: The binary contents of the video file to decode. This is a
+      scalar.
+
+  Returns:
+    A rank-4 `Tensor` that has `[frames, height, width, 3]` RGB as output.
+  """
+  return gen_decode_video_op_py.decode_video(contents)
+
+
+ops.NotDifferentiable('DecodeVideo')
diff --git a/tensorflow/contrib/ffmpeg/testdata/small.mp4 b/tensorflow/contrib/ffmpeg/testdata/small.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1fc478842f51e7519866f474a02ad605235bc6a6
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/small.mp4 differ
diff --git a/tensorflow/contrib/ffmpeg/testdata/small_100.bmp b/tensorflow/contrib/ffmpeg/testdata/small_100.bmp
new file mode 100644
index 0000000000000000000000000000000000000000..61f53a2a21c933037f004d6ae4319dc6065fb886
Binary files /dev/null and b/tensorflow/contrib/ffmpeg/testdata/small_100.bmp differ
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index 891425fd8cae6fbbf60d30cbd9137c049073456c..5b659ddaa1386736eb8cc05a203ed1827ccd160e 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -24,6 +24,7 @@ tf_custom_op_py_library(
         "python/framework/__init__.py",
         "python/framework/checkpoint_utils.py",
         "python/framework/experimental.py",
+        "python/framework/graph_util.py",
         "python/framework/tensor_util.py",
         "python/ops/__init__.py",
         "python/ops/accumulate_n_v2.py",
@@ -32,6 +33,7 @@ tf_custom_op_py_library(
         "python/ops/checkpoint_ops.py",
         "python/ops/ops.py",
         "python/ops/prettyprint_ops.py",
+        "python/ops/sort_ops.py",
         "python/ops/variables.py",
     ],
     dso = [
@@ -231,6 +233,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "graph_util_test",
+    srcs = ["python/framework/graph_util_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform",
+    ],
+)
+
 py_test(
     name = "tensor_util_test",
     srcs = ["python/framework/tensor_util_test.py"],
@@ -263,6 +276,7 @@ py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:partitioned_variables",
         "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -307,6 +321,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "sort_ops_test",
+    size = "medium",
+    srcs = ["python/ops/sort_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:random_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 8421ba7c0423c6ed274f92ba74930822d0171e05..4edc77f86ba786ca547b8d3842e2cf02833fbbac 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -65,6 +65,7 @@ See the @{$python/contrib.framework} guide.
 @@get_variable_full_name
 @@get_variables_to_restore
 @@get_variables
+@@global_variable
 @@local_variable
 @@model_variable
 @@variable
@@ -79,6 +80,8 @@ See the @{$python/contrib.framework} guide.
 @@load_embedding_initializer
 @@load_linear_multiclass_bias_initializer
 @@load_variable_slot_initializer
+
+@@sort
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/framework/python/framework/__init__.py b/tensorflow/contrib/framework/python/framework/__init__.py
index c8e6a4685498a4d89cef44f6a9a3acbe7557cb67..2d49771ab756359712a3ee0b23649c231678f952 100644
--- a/tensorflow/contrib/framework/python/framework/__init__.py
+++ b/tensorflow/contrib/framework/python/framework/__init__.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.contrib.framework.python.framework.checkpoint_utils import *
 from tensorflow.contrib.framework.python.framework.experimental import experimental
+from tensorflow.contrib.framework.python.framework.graph_util import *
 from tensorflow.contrib.framework.python.framework.tensor_util import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import decorator_utils
diff --git a/tensorflow/contrib/framework/python/framework/graph_util.py b/tensorflow/contrib/framework/python/framework/graph_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..a18ff2320d99726bb355ff6179fc97a070c2fec7
--- /dev/null
+++ b/tensorflow/contrib/framework/python/framework/graph_util.py
@@ -0,0 +1,154 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers to manipulate a tensor graph in python.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import copy
+import six
+
+# pylint: disable=unused-import
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework.graph_util_impl import _assert_nodes_are_present
+from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
+from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
+from tensorflow.python.framework.graph_util_impl import _node_name
+
+
+__all__ = ["fuse_op", "get_placeholders"]
+
+
+def fuse_op(graph_def, input_nodes, output_nodes, output_dtypes,
+            output_quantized, op_name, op_type):
+  """Fuse subgraph between input_nodes and output_nodes into a single custom op.
+
+  Args:
+    graph_def: A graph_pb2.GraphDef proto.
+    input_nodes: input nodes to the subgraph to be fused.
+    output_nodes: output nodes to the subgraph to be fused.
+    output_dtypes: A list of output datatypes for the custom op
+    output_quantized: A boolean flag that indicates if output is quantized
+    op_name: fused op name.
+    op_type: fused op type.
+  Returns:
+    The GraphDef of the new graph.
+
+  Raises:
+    TypeError: If 'graph_def' is not a graph_pb2.GraphDef proto.
+  """
+
+  if not isinstance(graph_def, graph_pb2.GraphDef):
+    raise TypeError("graph_def must be a graph_pb2.GraphDef proto.")
+
+  if isinstance(input_nodes, six.string_types):
+    raise TypeError("input_nodes must be a list.")
+
+  if isinstance(output_nodes, six.string_types):
+    raise TypeError("output_nodes must be a list.")
+
+  name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
+      graph_def)
+  _assert_nodes_are_present(name_to_node, input_nodes + output_nodes)
+
+  # Nodes upto and including input_nodes
+  reachable_by_input = _bfs_for_reachable_nodes(input_nodes, name_to_input_name)
+  # Nodes upto and including output_nodes
+  reachable_by_output = _bfs_for_reachable_nodes(output_nodes,
+                                                 name_to_input_name)
+
+  # Set of nodes in the list input_nodes
+  input_nodes_set = set(input_nodes)
+
+  # Set of nodes in the list output_nodes
+  output_nodes_set = set(output_nodes)
+
+  nodes_post_output = []
+  for node in graph_def.node:
+    n = _node_name(node.name)
+    if n in reachable_by_output:
+      if n not in reachable_by_input and n not in output_nodes_set:
+        # n is between input and output, i.e., part of the fused op
+        next_to_visit = [n]
+        while next_to_visit:
+          cur_node = next_to_visit[0]
+          del next_to_visit[0]
+          if cur_node in reachable_by_input and cur_node not in input_nodes_set:
+            raise TypeError("Node %s uses input %s not in input_nodes." %
+                            (n, cur_node))
+          if cur_node not in input_nodes_set:
+            next_to_visit += name_to_input_name[cur_node]
+    elif n not in reachable_by_input:
+      nodes_post_output.append(n)
+
+  # Add all nodes upto the input nodes
+  out = graph_pb2.GraphDef()
+  reachable_by_input_sorted = sorted(
+      list(reachable_by_input), key=lambda n: name_to_seq_num[n])
+  for node in reachable_by_input_sorted:
+    out.node.extend([copy.deepcopy(name_to_node[node])])
+
+  # Add the custom op
+  new_node = node_def_pb2.NodeDef()
+  for node in input_nodes:
+    new_node.input.append(node)
+  new_node.attr["_output_types"].list.type[:] = output_dtypes
+  new_node.attr["_output_quantized"].b = output_quantized
+  new_node.op = op_type
+  new_node.name = op_name
+  out.node.extend([new_node])
+
+  # Add the nodes in the output of the custom op
+  for index, n in enumerate(output_nodes):
+    assert len(name_to_node[n].input) == 1
+    new_node = copy.deepcopy(name_to_node[n])
+    del new_node.input[:]
+    new_node.input.append(op_name + (":" + str(index) if index != 0 else ""))
+    out.node.extend([new_node])
+
+  # Add the nodes post output_nodes
+  for n in nodes_post_output:
+    out.node.extend([copy.deepcopy(name_to_node[n])])
+
+  out.library.CopyFrom(graph_def.library)
+  out.versions.CopyFrom(graph_def.versions)
+  return out
+
+
+def get_placeholders(graph):
+  """Get placeholders of a graph.
+
+  Args:
+    graph: A tf.Graph.
+  Returns:
+    A list contains all placeholders of given graph.
+
+  Raises:
+    TypeError: If `graph` is not a tensorflow graph.
+  """
+
+  if not isinstance(graph, ops.Graph):
+    raise TypeError("Input graph needs to be a Graph: %s" % graph)
+
+  # For each placeholder() call, there is a corresponding
+  # operation of type 'Placeholder' registered to the graph.
+  # The return value (a Tensor) of placeholder() is the
+  # first output of this operation in fact.
+  operations = graph.get_operations()
+  result = [i.outputs[0] for i in operations if i.type == "Placeholder"]
+  return result
diff --git a/tensorflow/contrib/framework/python/framework/graph_util_test.py b/tensorflow/contrib/framework/python/framework/graph_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8a6d109e19211d271c2b15bac66ddacd38fe395
--- /dev/null
+++ b/tensorflow/contrib/framework/python/framework/graph_util_test.py
@@ -0,0 +1,99 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""@graph_util tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.framework.python.framework import graph_util
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+def GetNewNode(name, op, input_nodes):
+  new_node = node_def_pb2.NodeDef()
+  new_node.op = op
+  new_node.name = name
+  for node in input_nodes:
+    new_node.input.append(node)
+  return new_node
+
+
+class GraphUtilTest(test.TestCase):
+
+  def testGraphUtil(self):
+    graph_def = graph_pb2.GraphDef()
+    node_a = GetNewNode('A', 'Placeholder', [])
+    node_b = GetNewNode('B', 'Op1', ['A'])
+    node_c = GetNewNode('C', 'Op1', ['B'])
+    node_d = GetNewNode('D', 'Op1', ['C'])
+    node_e = GetNewNode('E', 'Op1', ['D'])
+    graph_def.node.extend([node_a, node_b, node_c, node_d, node_e])
+    fused_graph_def = graph_util.fuse_op(
+        graph_def, ['A'], ['D'], [types_pb2.DT_FLOAT], True, 'FusedOp', 'Op2')
+    self.assertEqual(len(fused_graph_def.node), 4)
+    self.assertEqual(fused_graph_def.node[0].name, 'A')
+    self.assertEqual(fused_graph_def.node[1].name, 'FusedOp')
+    self.assertEqual(fused_graph_def.node[1].input[0], 'A')
+    self.assertEqual(fused_graph_def.node[1].op, 'Op2')
+    self.assertEqual(fused_graph_def.node[1].attr['_output_quantized'].b, True)
+    self.assertEqual(fused_graph_def.node[1].attr['_output_types'].list.type,
+                     [types_pb2.DT_FLOAT])
+    self.assertEqual(fused_graph_def.node[2].name, 'D')
+    self.assertEqual(fused_graph_def.node[3].name, 'E')
+
+  def testGraphUtilArtificialDependencyInjection(self):
+    graph_def = graph_pb2.GraphDef()
+    node_a = GetNewNode('A', 'Placeholder', [])
+    node_a1 = GetNewNode('A1', 'Placeholder', [])
+    node_b = GetNewNode('B', 'Op1', ['A'])
+    node_c = GetNewNode('C', 'Op1', ['B'])
+    node_d = GetNewNode('D', 'Op1', ['C'])
+    node_e = GetNewNode('E', 'Op1', ['D'])
+    graph_def.node.extend([node_a, node_a1, node_b, node_c, node_d, node_e])
+    fused_graph_def = graph_util.fuse_op(graph_def, ['A', 'A1'], ['D'],
+                                         [types_pb2.DT_FLOAT], True, 'FusedOp',
+                                         'Op2')
+    self.assertEqual(len(fused_graph_def.node), 5)
+    self.assertEqual(fused_graph_def.node[0].name, 'A')
+    self.assertEqual(fused_graph_def.node[1].name, 'A1')
+    self.assertEqual(fused_graph_def.node[2].name, 'FusedOp')
+    self.assertEqual(fused_graph_def.node[2].input[0], 'A')
+    self.assertEqual(fused_graph_def.node[2].op, 'Op2')
+    self.assertEqual(fused_graph_def.node[2].attr['_output_quantized'].b, True)
+    self.assertEqual(fused_graph_def.node[2].attr['_output_types'].list.type,
+                     [types_pb2.DT_FLOAT])
+    self.assertEqual(fused_graph_def.node[3].name, 'D')
+    self.assertEqual(fused_graph_def.node[4].name, 'E')
+
+
+class GetPlaceholdersTest(test.TestCase):
+
+  def test_get_placeholders(self):
+    with ops.Graph().as_default() as g:
+      placeholders = [array_ops.placeholder(dtypes.float32) for _ in range(5)]
+      results = graph_util.get_placeholders(g)
+      self.assertEqual(
+          sorted(placeholders, key=lambda x: x._id),  # pylint: disable=protected-access
+          sorted(results, key=lambda x: x._id))  # pylint: disable=protected-access
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/framework/python/ops/__init__.py b/tensorflow/contrib/framework/python/ops/__init__.py
index edef37cf0c0719bf10a4c75c34adb30b9716cdcd..685bb94779762ce46ee342e7e0a182c54be64743 100644
--- a/tensorflow/contrib/framework/python/ops/__init__.py
+++ b/tensorflow/contrib/framework/python/ops/__init__.py
@@ -24,5 +24,6 @@ from tensorflow.contrib.framework.python.ops.arg_scope import *
 from tensorflow.contrib.framework.python.ops.checkpoint_ops import *
 from tensorflow.contrib.framework.python.ops.ops import *
 from tensorflow.contrib.framework.python.ops.prettyprint_ops import *
+from tensorflow.contrib.framework.python.ops.sort_ops import *
 from tensorflow.contrib.framework.python.ops.variables import *
 # pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
index a0667bd489213cf366e27114a91e8699ed9e7428..2375ee4f550616ff60d20b87b5773704d8fbbe1e 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2.py
@@ -48,7 +48,7 @@ def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
   tf.accumulate_n_v2([a, b, a])  # [[7, 4], [6, 14]]
 
   # Explicitly pass shape and type
-  tf.accumulate_n_v2([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)  
+  tf.accumulate_n_v2([a, b, a], shape=[2, 2], tensor_dtype=tf.int32)
                                                                    # [[7,  4],
                                                                    #  [6, 14]]
   ```
@@ -93,7 +93,7 @@ def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
   elif len(inputs) == 1 and name is not None:
     return array_ops.identity(inputs[0], name=name)
   elif context.in_eager_mode():
-    # TemporaryVariable not currently supported in eager mode; fall back 
+    # TemporaryVariable not currently supported in eager mode; fall back
     # onto AddN for now.
     # TODO(frreiss) remove this once the lifetime of eager variables gets
     # addressed
@@ -101,7 +101,7 @@ def accumulate_n_v2(inputs, shape=None, tensor_dtype=None, name=None):
   else:
     return gen_math_ops._accumulate_nv2(inputs, name=name, shape=shape)
 
-# The following code should eventually be merged into 
+# The following code should eventually be merged into
 # tensorflow/python/ops/math_grad.py
 @ops.RegisterGradient("AccumulateNV2")
 def _AddNGrad(op, grad):
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
index c2229bb8ad3d5b38321d16f150ed94175ab9bdbe..8f44698da851b48abf831e957c80fa1643a58bda 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_eager_test.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for new version of accumulate_n op that will eventually go into 
+"""Tests for new version of accumulate_n op that will eventually go into
 `ops.math_ops`.
 
-These test cases spefically exercise the `eager` APIs. They need to be in a 
+These test cases spefically exercise the `eager` APIs. They need to be in a
 separate file from the remaining tests because eager mode is currently something
 you can turn on but can't turn off for the lifetime of the current process."""
 from __future__ import absolute_import
@@ -64,7 +64,7 @@ class AccumulateNV2EagerTest(test_util.TensorFlowTestCase):
     np.random.seed(42)
     num_inputs = 3
     input_vars = [
-        resource_variable_ops.ResourceVariable(10.0 * np.random.random(), 
+        resource_variable_ops.ResourceVariable(10.0 * np.random.random(),
                                                name="t%d" % i)
         for i in range(0, num_inputs)
     ]
@@ -72,7 +72,7 @@ class AccumulateNV2EagerTest(test_util.TensorFlowTestCase):
     def fn(first, second, third):
       return av2.accumulate_n_v2([first, second, third])
 
-    grad_fn = backprop.gradients_function(fn)      
+    grad_fn = backprop.gradients_function(fn)
     grad = grad_fn(input_vars[0], input_vars[1], input_vars[2])
     self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
                         [elem.numpy() for elem in grad])
diff --git a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
index 3386e849d5cb8516ab3b1f6cb0429be3fc2fc960..b5e9f8df79262635bf579a6bf2260bc40c140c6f 100644
--- a/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
+++ b/tensorflow/contrib/framework/python/ops/accumulate_n_v2_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for new version of accumulate_n op that will eventually go into 
+"""Tests for new version of accumulate_n op that will eventually go into
 `ops.math_ops`."""
 from __future__ import absolute_import
 from __future__ import division
@@ -102,21 +102,21 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         a = variables.Variable(np.array([0.1,0.2]))
         b = variables.Variable(np.array([[0.3],[0.4]]))
-        tf_val = av2.accumulate_n_v2([a,b]) 
+        tf_val = av2.accumulate_n_v2([a,b])
 
   def testWrongType(self):
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
         b = variables.Variable(0.1, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32) 
+        tf_val = av2.accumulate_n_v2([a,b], tensor_dtype=np.int32)
 
   def testWrongTypeOneInput(self):
     # Scenario that used to trigger a bug, even when testWrongType() worked
     with self.test_session():
       with self.assertRaises(TypeError):
         a = variables.Variable(0.2, dtype=np.float32)
-        tf_val = av2.accumulate_n_v2([a], tensor_dtype=np.int32) 
+        tf_val = av2.accumulate_n_v2([a], tensor_dtype=np.int32)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f62f0ea7b9b561f235b9496ffda97a9f378d530
--- /dev/null
+++ b/tensorflow/contrib/framework/python/ops/sort_ops.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for sorting tensors.
+
+@@sort
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+
+
+def sort(values, axis=-1, direction='ASCENDING', name=None):
+  """Sorts a tensor.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+        axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+        `'DESCENDING'`).
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same dtype and shape as `values`, with the elements
+        sorted along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  with framework_ops.name_scope(name, 'sort'):
+    if direction not in _SORT_IMPL:
+      raise ValueError('%s should be one of %s' %
+                       (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
+    # Axis must be an integer, not a Tensor.
+    axis = framework_ops.convert_to_tensor(axis, name='axis')
+    axis_static = tensor_util.constant_value(axis)
+    if axis.shape.ndims != 0 or axis_static is None:
+      raise ValueError('axis must be a constant scalar')
+    axis_static = int(axis_static)  # Avoids NumPy casting error
+
+    values = framework_ops.convert_to_tensor(values, name='values')
+
+    return _SORT_IMPL[direction](values, axis_static)
+
+
+def _descending_sort(values, axis):
+  """Sorts values in reverse using `top_k`.
+
+  Args:
+    values: Tensor of numeric values.
+    axis: Index of the axis which values should be sorted along.
+
+  Returns:
+    The sorted values.
+  """
+  k = array_ops.shape(values)[axis]
+  rank = array_ops.rank(values)
+  # Fast path: sorting the last axis.
+  if axis == -1 or axis + 1 == values.get_shape().ndims:
+    return nn_ops.top_k(values, k)[0]
+
+  # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+  if axis < 0:
+    # Make axis a Tensor with the real axis index if needed.
+    axis += rank
+  transposition = array_ops.concat(
+      [
+          # Axes up to axis are unchanged.
+          math_ops.range(axis),
+          # Swap axis and rank - 1.
+          [rank - 1],
+          # Axes in [axis + 1, rank - 1) are unchanged.
+          math_ops.range(axis + 1, rank - 1),
+          # Swap axis and rank - 1.
+          [axis]
+      ],
+      axis=0)
+  top_k_input = array_ops.transpose(values, transposition)
+  values, unused_indices = nn_ops.top_k(top_k_input, k)
+  # transposition contains a single cycle of length 2 (swapping 2 elements),
+  # so it is an involution (it is its own inverse).
+  return array_ops.transpose(values, transposition)
+
+
+def _ascending_sort(values, axis):
+  # Negate the values to get the ascending order from descending sort.
+  values_or_indices = _descending_sort(-values, axis)
+  return -values_or_indices
+
+
+_SORT_IMPL = {
+    'ASCENDING': _ascending_sort,
+    'DESCENDING': _descending_sort,
+}
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08ae502f10d98ee14d8bea2f76b18bedb935cea
--- /dev/null
+++ b/tensorflow/contrib/framework/python/ops/sort_ops_test.py
@@ -0,0 +1,95 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the sort wrapper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.framework.python.ops import sort_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+class SortTest(test.TestCase):
+
+  def testRandom_lowDimensionality(self):
+    self._testRandom_lowDimensionality(negative_axis=False)
+
+  def testRandom_lowDimensionality_negative(self):
+    self._testRandom_lowDimensionality(negative_axis=True)
+
+  def _testRandom_lowDimensionality(self, negative_axis):
+    np.random.seed(42)
+    for _ in range(20):
+      rank = np.random.randint(1, 3)
+      shape = [np.random.randint(0, 20) for _ in range(rank)]
+      arr = np.random.random(shape)
+      sort_axis = np.random.choice(rank)
+      if negative_axis:
+        sort_axis = -1 - sort_axis
+      with self.test_session():
+        self.assertAllEqual(
+            np.sort(arr, axis=sort_axis),
+            sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
+
+  def testRandom_highDimensionality(self):
+    np.random.seed(100)
+    for _ in range(20):
+      rank = np.random.randint(5, 15)
+      shape = [np.random.randint(1, 4) for _ in range(rank)]
+      arr = np.random.random(shape)
+      sort_axis = np.random.choice(rank)
+      with self.test_session():
+        self.assertAllEqual(
+            np.sort(arr, axis=sort_axis),
+            sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
+
+  def testScalar(self):
+    # Create an empty scalar where the static shape is unknown.
+    zeros_length_1 = array_ops.zeros(
+        random_ops.random_uniform([1], minval=0, maxval=1, dtype=dtypes.int32),
+        dtype=dtypes.int32)
+    scalar = array_ops.zeros(zeros_length_1)
+
+    sort = sort_ops.sort(scalar)
+    with self.test_session():
+      with self.assertRaises(errors.InvalidArgumentError):
+        sort.eval()
+
+  def testNegativeOutOfBounds_staticShape(self):
+    arr = constant_op.constant([3, 4, 5])
+    with self.assertRaises(ValueError):
+      sort_ops.sort(arr, axis=-4)
+
+  def testDescending(self):
+    arr = np.random.random((10, 5, 5))
+    with self.test_session():
+      self.assertAllEqual(
+          np.sort(arr, axis=0)[::-1],
+          sort_ops.sort(
+              constant_op.constant(arr),
+              axis=0,
+              direction='DESCENDING').eval())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index 1bd9a14a7f3e17b30b811b3b73e5915c0dd1ec59..3f1ece4510578b5ac39849c577fffbb2a3be45a7 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -60,6 +60,7 @@ __all__ = ['add_model_variable',
            'get_variable_full_name',
            'get_variables_to_restore',
            'get_variables',
+           'global_variable',
            'local_variable',
            'model_variable',
            'variable',
@@ -147,20 +148,48 @@ def get_or_create_global_step(graph=None):
   return training_util.get_or_create_global_step(graph)
 
 
-def local_variable(initial_value, validate_shape=True, name=None):
-  """Create variable and add it to `GraphKeys.LOCAL_VARIABLES` collection.
+def local_variable(initial_value,
+                   validate_shape=True,
+                   name=None,
+                   use_resource=None):
+  """Create a variable with a value and add it to `GraphKeys.LOCAL_VARIABLES`.
 
   Args:
     initial_value: See variables.Variable.__init__.
     validate_shape: See variables.Variable.__init__.
     name: See variables.Variable.__init__.
+    use_resource: If `True` use a ResourceVariable instead of a Variable.
   Returns:
     New variable.
   """
   return variable_scope.variable(
       initial_value, trainable=False,
       collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      validate_shape=validate_shape, name=name)
+      validate_shape=validate_shape,
+      use_resource=use_resource,
+      name=name)
+
+
+def global_variable(initial_value,
+                    validate_shape=True,
+                    name=None,
+                    use_resource=None):
+  """Create a variable with a value and add it to `GraphKeys.GLOBAL_VARIABLES`.
+
+  Args:
+    initial_value: See variables.Variable.__init__.
+    validate_shape: See variables.Variable.__init__.
+    name: See variables.Variable.__init__.
+    use_resource: If `True` use a ResourceVariable instead of a Variable.
+  Returns:
+    New variable.
+  """
+  return variable_scope.variable(
+      initial_value, trainable=False,
+      collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+      validate_shape=validate_shape,
+      use_resource=use_resource,
+      name=name)
 
 
 @contrib_add_arg_scope
@@ -201,7 +230,7 @@ def variable(name, shape=None, dtype=None, initializer=None,
                      else [ops.GraphKeys.GLOBAL_VARIABLES])
 
   # Remove duplicates
-  collections = set(collections)
+  collections = list(set(collections))
   getter = variable_scope.get_variable
   if custom_getter is not None:
     getter = functools.partial(custom_getter,
@@ -412,7 +441,7 @@ def get_unique_variable(var_op_name):
   """
   candidates = get_variables(scope=var_op_name)
   if not candidates:
-    raise ValueError('Couldnt find variable %s' % var_op_name)
+    raise ValueError('Couldn\'t find variable %s' % var_op_name)
 
   for candidate in candidates:
     if candidate.op.name == var_op_name:
diff --git a/tensorflow/contrib/framework/python/ops/variables_test.py b/tensorflow/contrib/framework/python/ops/variables_test.py
index 6a74e4e8666e98ca3c97dc9ddd8a6c11613f708e..2f06df93acb0a4c0b36c68839ff531e3c22c5ee3 100644
--- a/tensorflow/contrib/framework/python/ops/variables_test.py
+++ b/tensorflow/contrib/framework/python/ops/variables_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
@@ -102,6 +103,82 @@ class LocalVariableTest(test.TestCase):
       sess.run(variables_lib.local_variables_initializer())
       self.assertAllEqual(a.eval(), [0] * 5)
 
+  def testResourceVariable(self):
+    a = variables_lib2.local_variable(0)
+    b = variables_lib2.local_variable(0, use_resource=True)
+    self.assertEqual(type(a), variables_lib.Variable)
+    self.assertEqual(type(b), resource_variable_ops.ResourceVariable)
+
+
+class GlobalVariableTest(test.TestCase):
+
+  def test_global_variable(self):
+    with self.test_session() as sess:
+      self.assertEquals([], variables_lib.global_variables())
+      value0 = 42
+      variables_lib2.global_variable(value0)
+      value1 = 43
+      variables_lib2.global_variable(value1)
+      variables = variables_lib.global_variables()
+      self.assertEquals(2, len(variables))
+      with self.assertRaisesOpError(
+          'Attempting to use uninitialized value Variable'):
+        sess.run(variables)
+      variables_lib.variables_initializer(variables).run()
+      self.assertAllEqual(set([value0, value1]), set(sess.run(variables)))
+
+  def testVariableNameAndShape(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable([1, 1, 1, 1, 1], name='a')
+        self.assertEquals(a.op.name, 'A/a')
+        self.assertListEqual(a.get_shape().as_list(), [5])
+        self.assertListEqual([a], variables_lib.global_variables())
+
+  def testGlobalVariableNotInLocalVariables(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable(0)
+        self.assertFalse(a in variables_lib.local_variables())
+        self.assertTrue(a in variables_lib.global_variables())
+
+  def testGlobalVariableInVariablesToRestore(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable(0)
+        self.assertFalse(a in variables_lib.local_variables())
+        self.assertTrue(a in variables_lib2.get_variables_to_restore())
+
+  def testGetVariablesReturnsThem(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        a = variables_lib2.global_variable(0)
+      with variable_scope.variable_scope('B'):
+        b = variables_lib2.global_variable(0)
+      self.assertEquals([a], variables_lib2.get_variables('A'))
+      self.assertEquals([b], variables_lib2.get_variables('B'))
+
+  def testGetLocalVariablesDontReturnsThem(self):
+    with self.test_session():
+      with variable_scope.variable_scope('A'):
+        variables_lib2.global_variable(0)
+      with variable_scope.variable_scope('B'):
+        variables_lib2.global_variable(0)
+      self.assertEquals([], variables_lib2.get_local_variables('A'))
+      self.assertEquals([], variables_lib2.get_local_variables('B'))
+
+  def testInitializedVariableValue(self):
+    with self.test_session() as sess:
+      a = variables_lib2.global_variable([0, 0, 0, 0, 0], name='a')
+      sess.run(variables_lib.global_variables_initializer())
+      self.assertAllEqual(a.eval(), [0] * 5)
+
+  def testResourceVariable(self):
+    a = variables_lib2.global_variable(0)
+    b = variables_lib2.global_variable(0, use_resource=True)
+    self.assertEqual(type(a), variables_lib.Variable)
+    self.assertEqual(type(b), resource_variable_ops.ResourceVariable)
+
 
 class GlobalStepTest(test.TestCase):
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 88306094ab9947c9c78b03c0013f6afc88316803..5fec69ea4361a97c79ddc3188469e7ffb327f6cc 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -493,6 +493,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
       {{conv_input_rows, conv_input_cols}},
       output_depth,
       {{filter_rows, filter_cols}},
+      // TODO(yangzihao): Add support for arbitrary dilations for fused conv.
+      {{1, 1}},  // dilation_rows, dilation_cols
       {{row_stride, col_stride}},
       {{padding_rows, padding_cols}},
       conv_input->dtype(),
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
index dc43af11580ce5fda74ee25da6c151a5b89c7aee..fa7a3c03aa35c756252b22a004be91fa24c10e41 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -30,11 +30,12 @@ class FusedConvParameters : public ConvParameters {
  public:
   FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                       int64 out_depths, const SpatialArray& filter,
-                      const SpatialArray& stride, const SpatialArray& padding,
-                      DataType dtype, int device_id, bool has_side_input,
+                      const SpatialArray& dilation, const SpatialArray& stride,
+                      const SpatialArray& padding, DataType dtype,
+                      int device_id, bool has_side_input,
                       ActivationMode activation_mode)
-      : ConvParameters(batch, in_depths, in, out_depths, filter, stride,
-                       padding, dtype, device_id),
+      : ConvParameters(batch, in_depths, in, out_depths, filter, dilation,
+                       stride, padding, dtype, device_id),
         activation_mode_(activation_mode),
         has_side_input_(has_side_input) {
     hash_code_ = Hash64Combine(hash_code_, has_side_input);
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
index 887ebc5a6c35379476fa1a643c866d38e2b25699..6a56237f67c844a3daa546eb02d64c9e2658f639 100644
--- a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -52,6 +52,7 @@ REGISTER_OP("FusedConv2DBiasActivation")
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Attr("filter_format: {'HWIO', 'OIHW', 'OIHW_VECT_I'} = 'HWIO'")
     .Attr("activation_mode: {'Relu'} = 'Relu'")
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       using shape_inference::ShapeHandle;
       using shape_inference::DimensionHandle;
@@ -151,6 +152,11 @@ REGISTER_OP("FusedConv2DBiasActivation")
                      kernel_height, kernel_width, input_channels % 4 ]`
     activation_mode: The activation applied to the output.
         Currently must be "Relu".
+    dilations: 1-D tensor of length 4.  The dilation factor for each dimension
+        of `input`. If set to k > 1, there will be k-1 skipped cells between
+        each filter element on that dimension. The dimension order is determined
+        by the value of `data_format`, see above for details. Dilations in the
+        batch and depth dimensions must be 1.
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 1418c87023af0dbff890f46e10f0140d5b89e4b7..a2e6fa51f1e1cea1d995204d84a620a991cfb7ba 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -116,6 +116,7 @@ py_library(
     deps = [
         ":clip_weights",
         ":conditioning_utils",
+        ":random_tensor_pool",
         ":virtual_batchnorm",
         "//tensorflow/python:util",
     ],
@@ -219,6 +220,37 @@ py_test(
     ],
 )
 
+py_library(
+    name = "random_tensor_pool",
+    srcs = [
+        "python/features/python/random_tensor_pool.py",
+        "python/features/python/random_tensor_pool_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_test(
+    name = "random_tensor_pool_test",
+    srcs = ["python/features/python/random_tensor_pool_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":random_tensor_pool",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "virtual_batchnorm",
     srcs = [
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 3ab84780705b35567169bd76fd3485ad355ba9d8..4bca0a1d62a2b404c6783c7cfe3b5c67cfc58221 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -8,7 +8,8 @@ explicitly model the distribution and without writing an explicit loss. For
 example, the generator could learn to draw samples from the distribution of
 natural images. For more details on this technique, see
 ['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by
-Goodfellow et al.
+Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
+introduction.
 
 #### Usage
 ```python
@@ -23,8 +24,8 @@ mix TFGAN, native TF, and other custom frameworks
 * Use already implemented [GAN losses and penalties](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/losses_impl.py) (ex Wasserstein loss, gradient penalty, mutual information penalty, etc)
 * [Monitor and visualize](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/summaries_impl.py) GAN progress during training, and [evaluate](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py) them
 * Use already-implemented [tricks](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/) to stabilize and improve training
-* Develop based on examples of common GAN setups
-* Use the TFGAN-backed tf.Learn Estimator to easily train a GAN model
+* Develop based on examples of [common GAN setups](https://github.com/tensorflow/models/tree/master/research/gan/)
+* Use the TFGAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
 * Improvements in TFGAN infrastructure will automatically benefit your TFGAN project
 * Stay up-to-date with research as we add more algorithms
 
@@ -51,7 +52,7 @@ network to evaluate your unconditional generative model. You can also use
 your own pretrained classifier for more specific performance numbers, or use
 other methods for evaluating conditional generative models.
 
-* examples (coming soon):
+* [examples](https://github.com/tensorflow/models/tree/master/research/gan/) and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb):
 See examples of how to use TFGAN to make GAN training easier, or use the more complicated examples to jumpstart your
 own project. These include unconditional and conditional GANs, InfoGANs,
 adversarial losses on existing networks, and image-to-image translation.
diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py
index dff361fdc42708ea69999c2def4721f9d49fcf14..f1946c7f925660eae3aaa650c437e03da1f33d6c 100644
--- a/tensorflow/contrib/gan/__init__.py
+++ b/tensorflow/contrib/gan/__init__.py
@@ -12,7 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN is a lightweight library for training and evaluating GANs.
+
+In addition to providing the infrastructure for easily training and evaluating
+GANS, this library contains modules for a TFGAN-backed Estimator,
+evaluation metrics, features (such as virtual batch normalization), and losses.
+Please see README.md for details and usage.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py
index 8c4a18228039cb4f2c06e0333f4b8408f1f631e9..c9f7bc61b25230e4159cf8cbc7c9cceead0aa706 100644
--- a/tensorflow/contrib/gan/python/estimator/__init__.py
+++ b/tensorflow/contrib/gan/python/estimator/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN estimator module.
+
+GANEstimator provides all the infrastructure support of a TensorFlow Estimator
+with the feature support of TFGAN.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index e89993991a389d68254a95aded2d771f4c2627be..9d14f391332fa95035bf96f8f37930af595634a9 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import enum
 
 from tensorflow.contrib.framework.python.ops import variables as variable_lib
@@ -29,6 +30,7 @@ from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_inspect as inspect
 
 
 __all__ = [
@@ -76,7 +78,7 @@ class GANEstimator(estimator.Estimator):
         return logits
 
       # Create GAN estimator.
-      gan_estimator = estimator.GANEstimator(
+      gan_estimator = tfgan.estimator.GANEstimator(
           model_dir,
           generator_fn=generator_fn,
           discriminator_fn=discriminator_fn,
@@ -105,6 +107,7 @@ class GANEstimator(estimator.Estimator):
                discriminator_loss_fn=None,
                generator_optimizer=None,
                discriminator_optimizer=None,
+               get_hooks_fn=None,
                add_summaries=None,
                use_loss_summaries=True,
                config=None):
@@ -116,7 +119,10 @@ class GANEstimator(estimator.Estimator):
         to continue training a previously saved model.
       generator_fn: A python function that takes a Tensor, Tensor list, or
         Tensor dictionary as inputs and returns the outputs of the GAN
-        generator. See `TFGAN` for more details and examples.
+        generator. See `TFGAN` for more details and examples. Additionally, if
+        it has an argument called `mode`, the Estimator's `mode` will be passed
+        in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
+        normalization.
       discriminator_fn: A python function that takes the output of
         `generator_fn` or real data in the GAN setup, and `generator_inputs`.
         Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
@@ -132,6 +138,10 @@ class GANEstimator(estimator.Estimator):
         work.
       discriminator_optimizer: Same as `generator_optimizer`, but for the
         discriminator updates.
+      get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a
+        list of hooks. These hooks are run on the generator and discriminator
+        train ops, and can be used to implement the GAN training scheme.
+        Defaults to `train.get_sequential_train_hooks()`.
       add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
@@ -146,7 +156,7 @@ class GANEstimator(estimator.Estimator):
               else discriminator_optimizer)
       gan_head = head_lib.gan_head(
           generator_loss_fn, discriminator_loss_fn, gopt, dopt,
-          use_loss_summaries)
+          use_loss_summaries, get_hooks_fn=get_hooks_fn)
       return _gan_model_fn(
           features, labels, mode, generator_fn, discriminator_fn, gan_head,
           add_summaries)
@@ -225,9 +235,12 @@ def _gan_model_fn(
       labels=None)
 
 
-def _make_train_gan_model(generator_fn, discriminator_fn, real_data,
-                          generator_inputs, generator_scope, add_summaries):
-  """Make a `GANModel` for training."""
+def _make_gan_model(generator_fn, discriminator_fn, real_data,
+                    generator_inputs, generator_scope, add_summaries, mode):
+  """Make a `GANModel`, and optionally pass in `mode`."""
+  # If `generator_fn` has an argument `mode`, pass mode to it.
+  if 'mode' in inspect.getargspec(generator_fn).args:
+    generator_fn = functools.partial(generator_fn, mode=mode)
   gan_model = tfgan_train.gan_model(
       generator_fn,
       discriminator_fn,
@@ -245,15 +258,28 @@ def _make_train_gan_model(generator_fn, discriminator_fn, real_data,
   return gan_model
 
 
+def _make_train_gan_model(generator_fn, discriminator_fn, real_data,
+                          generator_inputs, generator_scope, add_summaries):
+  """Make a `GANModel` for training."""
+  return _make_gan_model(generator_fn, discriminator_fn, real_data,
+                         generator_inputs, generator_scope, add_summaries,
+                         model_fn_lib.ModeKeys.TRAIN)
+
+
 def _make_eval_gan_model(generator_fn, discriminator_fn, real_data,
                          generator_inputs, generator_scope, add_summaries):
   """Make a `GANModel` for evaluation."""
-  return _make_train_gan_model(generator_fn, discriminator_fn, real_data,
-                               generator_inputs, generator_scope, add_summaries)
+  return _make_gan_model(generator_fn, discriminator_fn, real_data,
+                         generator_inputs, generator_scope, add_summaries,
+                         model_fn_lib.ModeKeys.EVAL)
 
 
 def _make_prediction_gan_model(generator_inputs, generator_fn, generator_scope):
   """Make a `GANModel` from just the generator."""
+  # If `generator_fn` has an argument `mode`, pass mode to it.
+  if 'mode' in inspect.getargspec(generator_fn).args:
+    generator_fn = functools.partial(generator_fn,
+                                     mode=model_fn_lib.ModeKeys.PREDICT)
   with variable_scope.variable_scope(generator_scope) as gen_scope:
     generator_inputs = tfgan_train._convert_tensor_or_l_or_d(generator_inputs)  # pylint:disable=protected-access
     generated_data = generator_fn(generator_inputs)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 1bfdce9ee94d4d05d5186cd999361662bc0e3f85..e752f0bcccda418b79d4fdabb27807394cbbb425 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -48,7 +48,8 @@ from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 
 
-def generator_fn(noise_dict):
+def generator_fn(noise_dict, mode):
+  del mode
   noise = noise_dict['x']
   return layers.fully_connected(noise, noise.shape[1].value)
 
@@ -90,7 +91,6 @@ def mock_head(testcase, expected_generator_inputs, expected_real_data,
         generator_var_names,
         set([x.name for x in gan_model.generator_variables]))
     testcase.assertEqual(generator_scope_name, gan_model.generator_scope.name)
-    testcase.assertEqual(generator_fn, gan_model.generator_fn)
     testcase.assertEqual(_or_none(expected_real_data), gan_model.real_data)
     # TODO(joelshor): Add check on `discriminator_real_outputs`.
     # TODO(joelshor): Add check on `discriminator_gen_outputs`.
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 204c646e194319c0e63599da0b2a4909ef270ef3..a21358c50bbdb4a1a929b0c5bc322cec4c9923b5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -71,7 +71,7 @@ class GANHead(head._Head):  # pylint: disable=protected-access
   def __init__(self, generator_loss_fn, discriminator_loss_fn,
                generator_optimizer, discriminator_optimizer,
                use_loss_summaries=True,
-               get_hooks_fn=tfgan_train.get_sequential_train_hooks(),
+               get_hooks_fn=None,
                name=None):
     """`Head` for GAN training.
 
@@ -86,10 +86,12 @@ class GANHead(head._Head):  # pylint: disable=protected-access
       use_loss_summaries: If `True`, add loss summaries. If `False`, does not.
         If `None`, uses defaults.
       get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list
-        of hooks.
+        of hooks. Defaults to `train.get_sequential_train_hooks()`
       name: name of the head. If provided, summary and metrics keys will be
         suffixed by `"/" + name`.
     """
+    if get_hooks_fn is None:
+      get_hooks_fn = tfgan_train.get_sequential_train_hooks()
     # TODO(joelshor): Validate inputs.
 
     if use_loss_summaries in [True, False]:
diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py
index bb8046187807d0cc584f7174eb9aac578855c110..7daf78bc5dcab87f6fa31a8334269d31e94576d4 100644
--- a/tensorflow/contrib/gan/python/eval/__init__.py
+++ b/tensorflow/contrib/gan/python/eval/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN evaluation module.
+
+This module supports techniques such as Inception Score, Frechet Inception
+distance, and Sliced Wasserstein distance.
+"""
 # pylint: disable=,wildcard-import,unused-import
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index d4c080cab3d82f6a69a293e84e1c08322bbb6f86..82293b575aefa198a618ae7286ca24ebabd6987d 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -57,8 +57,10 @@ __all__ = [
     'run_inception',
     'inception_score',
     'classifier_score',
+    'classifier_score_from_logits',
     'frechet_inception_distance',
     'frechet_classifier_distance',
+    'frechet_classifier_distance_from_activations',
     'INCEPTION_DEFAULT_IMAGE_SIZE',
 ]
 
@@ -130,10 +132,10 @@ def preprocess_image(
   with ops.name_scope(scope, 'preprocess', [images, height, width]):
     if not images.dtype.is_floating:
       images = math_ops.to_float(images)
-    images = (images - 128.0) / 128.0
     if is_single:
       images = array_ops.expand_dims(images, axis=0)
     resized = image_ops.resize_bilinear(images, [height, width])
+    resized = (resized - 128.0) / 128.0
     if is_single:
       resized = array_ops.squeeze(resized, axis=0)
     return resized
@@ -222,13 +224,13 @@ def run_inception(images,
     image_size: Required image width and height. See unit tests for the default
       values.
     input_tensor: Name of input Tensor.
-    output_tensor: Name of output Tensor. This function will compute activations
-      at the specified layer. Examples include INCEPTION_V3_OUTPUT and
-      INCEPTION_V3_FINAL_POOL which would result in this function computing
+    output_tensor: Name or list of output Tensors. This function will compute
+      activations at the specified layer. Examples include INCEPTION_V3_OUTPUT
+      and INCEPTION_V3_FINAL_POOL which would result in this function computing
       the final logits or the penultimate pooling layer.
 
   Returns:
-    Logits.
+    Tensor or Tensors corresponding to computed `output_tensor`.
 
   Raises:
     ValueError: If images are not the correct size.
@@ -244,8 +246,14 @@ def run_inception(images,
 
   activations = run_image_classifier(images, graph_def, input_tensor,
                                      output_tensor)
-  if array_ops.rank(activations) != 2:
-    activations = layers.flatten(activations)
+  if isinstance(activations, list):
+    for i, activation in enumerate(activations):
+      if array_ops.rank(activation) != 2:
+        activations[i] = layers.flatten(activation)
+  else:
+    if array_ops.rank(activations) != 2:
+      activations = layers.flatten(activations)
+
   return activations
 
 
@@ -257,23 +265,26 @@ def run_image_classifier(tensor, graph_def, input_tensor,
     tensor: An Input tensor.
     graph_def: A GraphDef proto.
     input_tensor: Name of input tensor in graph def.
-    output_tensor: Name of output tensor in graph def.
+    output_tensor: A tensor name or list of tensor names in graph def.
     scope: Name scope for classifier.
 
   Returns:
-    Classifier output. Shape depends on the classifier used, but is often
-    [batch, classes].
+    Classifier output if `output_tensor` is a string, or a list of outputs if
+    `output_tensor` is a list.
 
   Raises:
-    ValueError: If `image_size` is not `None`, and `tensor` are not the correct
-      size.
+    ValueError: If `input_tensor` or `output_tensor` aren't in the graph_def.
   """
   input_map = {input_tensor: tensor}
-  return_elements = [output_tensor]
-  classifier_output = importer.import_graph_def(
-      graph_def, input_map, return_elements, name=scope)[0]
+  is_singleton = isinstance(output_tensor, str)
+  if is_singleton:
+    output_tensor = [output_tensor]
+  classifier_outputs = importer.import_graph_def(
+      graph_def, input_map, output_tensor, name=scope)
+  if is_singleton:
+    classifier_outputs = classifier_outputs[0]
 
-  return classifier_output
+  return classifier_outputs
 
 
 def classifier_score(images, classifier_fn, num_batches=1):
@@ -297,7 +308,8 @@ def classifier_score(images, classifier_fn, num_batches=1):
       efficiently run them through the classifier network.
 
   Returns:
-    The classifier score. A floating-point scalar.
+    The classifier score. A floating-point scalar of the same type as the output
+    of `classifier_fn`.
   """
   generated_images_list = array_ops.split(
       images, num_or_size_splits=num_batches)
@@ -311,12 +323,36 @@ def classifier_score(images, classifier_fn, num_batches=1):
       swap_memory=True,
       name='RunClassifier')
   logits = array_ops.concat(array_ops.unstack(logits), 0)
+
+  return classifier_score_from_logits(logits)
+
+
+def classifier_score_from_logits(logits):
+  """Classifier score for evaluating a conditional generative model.
+
+  This is based on the Inception Score, but for an arbitrary classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1606.03498. In
+  summary, this function calculates
+
+  exp( E[ KL(p(y|x) || p(y)) ] )
+
+  which captures how different the network's classification prediction is from
+  the prior distribution over classes.
+
+  Args:
+    logits: A 2D Tensor of logits.
+
+  Returns:
+    The classifier score. A floating-point scalar of the same type as the output
+    of `logits`.
+  """
   logits.shape.assert_has_rank(2)
 
   # Use maximum precision for best results.
   logits_dtype = logits.dtype
   if logits_dtype != dtypes.float64:
-    logits = math_ops.cast(logits, dtypes.float64)
+    logits = math_ops.to_double(logits)
 
   p = nn_ops.softmax(logits)
   q = math_ops.reduce_mean(p, axis=0)
@@ -326,7 +362,7 @@ def classifier_score(images, classifier_fn, num_batches=1):
   final_score = math_ops.exp(log_score)
 
   if logits_dtype != dtypes.float64:
-    final_score = math_ops.cast(final_score, dtypes.float64)
+    final_score = math_ops.cast(final_score, logits_dtype)
   return final_score
 
 
@@ -415,7 +451,8 @@ def frechet_classifier_distance(real_images,
       efficiently run them through the classifier network.
 
   Returns:
-    The Frechet Inception distance. A floating-point scalar.
+    The Frechet Inception distance. A floating-point scalar of the same type
+    as the output of `classifier_fn`
   """
 
   real_images_list = array_ops.split(
@@ -440,20 +477,65 @@ def frechet_classifier_distance(real_images,
   # Ensure the activations have the right shapes.
   real_a = array_ops.concat(array_ops.unstack(real_a), 0)
   gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
-  real_a.shape.assert_has_rank(2)
-  gen_a.shape.assert_has_rank(2)
+
+  return frechet_classifier_distance_from_activations(real_a, gen_a)
+
+
+def frechet_classifier_distance_from_activations(
+    real_activations, generated_activations):
+  """Classifier distance for evaluating a generative model.
+
+  This is based on the Frechet Inception distance, but for an arbitrary
+  classifier.
+
+  This technique is described in detail in https://arxiv.org/abs/1706.08500.
+  Given two Gaussian distribution with means m and m_w and covariance matrices
+  C and C_w, this function calcuates
+
+  |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))
+
+  which captures how different the distributions of real images and generated
+  images (or more accurately, their visual features) are. Note that unlike the
+  Inception score, this is a true distance and utilizes information about real
+  world images.
+
+  Note that when computed using sample means and sample covariance matrices,
+  Frechet distance is biased. It is more biased for small sample sizes. (e.g.
+  even if the two distributions are the same, for a small sample size, the
+  expected Frechet distance is large). It is important to use the same
+  sample size to compute frechet classifier distance when comparing two
+  generative models.
+
+  Args:
+    real_activations: Real images to use to compute Frechet Inception distance.
+    generated_activations: Generated images to use to compute Frechet Inception
+      distance.
+
+  Returns:
+    The Frechet Inception distance. A floating-point scalar of the same type
+    as the output of the activations.
+  """
+  real_activations.shape.assert_has_rank(2)
+  generated_activations.shape.assert_has_rank(2)
+
+  activations_dtype = real_activations.dtype
+  if activations_dtype != dtypes.float64:
+    real_activations = math_ops.to_double(real_activations)
+    generated_activations = math_ops.to_double(generated_activations)
 
   # Compute mean and covariance matrices of activations.
-  m = math_ops.reduce_mean(real_a, 0)
-  m_v = math_ops.reduce_mean(gen_a, 0)
-  num_examples = math_ops.to_float(array_ops.shape(real_a)[0])
+  m = math_ops.reduce_mean(real_activations, 0)
+  m_v = math_ops.reduce_mean(generated_activations, 0)
+  num_examples = math_ops.to_double(array_ops.shape(real_activations)[0])
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
+  real_centered = real_activations - m
   sigma = math_ops.matmul(
-      real_a - m, real_a - m, transpose_a=True) / (num_examples - 1)
+      real_centered, real_centered, transpose_a=True) / (num_examples - 1)
 
+  gen_centered = generated_activations - m_v
   sigma_v = math_ops.matmul(
-      gen_a - m_v, gen_a - m_v, transpose_a=True) / (num_examples - 1)
+      gen_centered, gen_centered, transpose_a=True) / (num_examples - 1)
 
   # Find the Tr(sqrt(sigma sigma_v)) component of FID
   sqrt_trace_component = trace_sqrt_product(sigma, sigma_v)
@@ -467,6 +549,8 @@ def frechet_classifier_distance(real_images,
   # Next the distance between means.
   mean = math_ops.square(linalg_ops.norm(m - m_v))  # This uses the L2 norm.
   fid = trace + mean
+  if activations_dtype != dtypes.float64:
+    fid = math_ops.cast(fid, activations_dtype)
 
   return fid
 
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 81fa2fc0f126647d2f01a1f4fc695d714eba2c75..1e18c699ba93b5f524341c65d0a2db84556b65a2 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -190,6 +190,23 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
+  def test_run_inception_multiple_outputs(self):
+    """Test `run_inception` graph construction with multiple outputs."""
+    batch_size = 3
+    img = array_ops.ones([batch_size, 299, 299, 3])
+    logits, pool = _run_with_mock(
+        classifier_metrics.run_inception, img,
+        output_tensor=[classifier_metrics.INCEPTION_OUTPUT,
+                       classifier_metrics.INCEPTION_FINAL_POOL])
+
+    self.assertTrue(isinstance(logits, ops.Tensor))
+    self.assertTrue(isinstance(pool, ops.Tensor))
+    logits.shape.assert_is_compatible_with([batch_size, 1001])
+    pool.shape.assert_is_compatible_with([batch_size, 2048])
+
+    # Check that none of the model variables are trainable.
+    self.assertListEqual([], variables.trainable_variables())
+
   def test_inception_score_graph(self):
     """Test `inception_score` graph construction."""
     score = _run_with_mock(classifier_metrics.inception_score,
@@ -277,7 +294,7 @@ class ClassifierMetricsTest(test.TestCase):
 
     expected_fid = _expected_fid(test_pool_real_a, test_pool_gen_a)
 
-    self.assertAllClose(expected_fid, actual_fid, 0.01)
+    self.assertAllClose(expected_fid, actual_fid, 0.0001)
 
   def test_trace_sqrt_product_value(self):
     """Test that `trace_sqrt_product` gives the correct value."""
diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py
index 6d0972f8db418d6fcf517cc6f7e96093ae08a9e4..4816daf760143af9f1502873b123ffad8e5ec8ce 100644
--- a/tensorflow/contrib/gan/python/features/__init__.py
+++ b/tensorflow/contrib/gan/python/features/__init__.py
@@ -12,7 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN features module.
+
+This module includes support for virtual batch normalization, buffer replay,
+conditioning, etc.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,10 +26,12 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.gan.python.features.python import clip_weights
 from tensorflow.contrib.gan.python.features.python import conditioning_utils
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool
 from tensorflow.contrib.gan.python.features.python import virtual_batchnorm
 
 from tensorflow.contrib.gan.python.features.python.clip_weights import *
 from tensorflow.contrib.gan.python.features.python.conditioning_utils import *
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool import *
 from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -33,5 +39,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = clip_weights.__all__
 _allowed_symbols += conditioning_utils.__all__
+_allowed_symbols += random_tensor_pool.__all__
 _allowed_symbols += virtual_batchnorm.__all__
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
index 030e37ec679ec58e3b534fd3644ffe1d23173404..2b7bb5f14e7f3d1b3f913d3426efaaae19079ffb 100644
--- a/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
+++ b/tensorflow/contrib/gan/python/features/python/clip_weights_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tfgan.python.features.clip_weights."""
+"""Tests for features.clip_weights."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -31,17 +31,18 @@ class ClipWeightsTest(test.TestCase):
   """Tests for `discriminator_weight_clip`."""
 
   def setUp(self):
+    super(ClipWeightsTest, self).setUp()
     self.variables = [variables.Variable(2.0)]
     self.tuple = collections.namedtuple(
         'VarTuple', ['discriminator_variables'])(self.variables)
 
   def _test_weight_clipping_helper(self, use_tuple):
-    loss = self.variables[0] * 2.0
+    loss = self.variables[0]
     opt = training.GradientDescentOptimizer(1.0)
     if use_tuple:
-      opt_clip = clip_weights.weight_clip(opt, self.variables, 0.1)
+      opt_clip = clip_weights.clip_variables(opt, self.variables, 0.1)
     else:
-      opt_clip = clip_weights.discriminator_weight_clip(opt, self.tuple, 0.1)
+      opt_clip = clip_weights.clip_discriminator_weights(opt, self.tuple, 0.1)
 
     train_op1 = opt.minimize(loss, var_list=self.variables)
     train_op2 = opt_clip.minimize(loss, var_list=self.variables)
@@ -72,10 +73,14 @@ class ClipWeightsTest(test.TestCase):
         clip_weights.clip_discriminator_weights(opt, self.tuple, weight_clip=-1)
     else:
       with self.assertRaisesRegexp(ValueError, 'must be positive'):
-        clip_weights.clip_weights(opt, self.variables, weight_clip=-1)
+        clip_weights.clip_variables(opt, self.variables, weight_clip=-1)
 
   def test_incorrect_weight_clip_value_argsonly(self):
     self._test_incorrect_weight_clip_value_helper(False)
 
   def test_incorrect_weight_clip_value_tuple(self):
     self._test_incorrect_weight_clip_value_helper(True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca904971fa8cb0440d3e0c9060f13cc214c9eaad
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool.py
@@ -0,0 +1,35 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor pool stores values from an input tensor and returns a stored one.
+
+See the following papers for more details.
+1) `Learning from simulated and unsupervised images through adversarial
+    training` (https://arxiv.org/abs/1612.07828).
+2) `Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial
+    Networks` (https://arxiv.org/abs/1703.10593).
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.features.python import random_tensor_pool_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = random_tensor_pool_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d733b6ff9f6afc44e8a0d9364729de506fc36d2
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
@@ -0,0 +1,134 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor pool stores values from an input tensor and returns a stored one.
+
+We use this to keep a history of values created by a generator, such that
+a discriminator can randomly be trained on some older samples, not just the
+current one. This can help to not let the discriminator get too far ahead of the
+generator and also to keep the system from oscilating, if the discriminator
+forgets too fast what past samples from the generator looked like.
+
+See the following papers for more details.
+1) `Learning from simulated and unsupervised images through adversarial
+    training` (https://arxiv.org/abs/1612.07828).
+2) `Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial
+    Networks` (https://arxiv.org/abs/1703.10593).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import random_ops
+
+__all__ = [
+    'tensor_pool',
+]
+
+
+def _to_tuple(x):
+  if isinstance(x, (list, tuple)):
+    return tuple(x)
+  return (x,)
+
+
+def tensor_pool(input_values,
+                pool_size,
+                pooling_probability=0.5,
+                name='tensor_pool'):
+  """Queue storing input values and returning random previously stored ones.
+
+  Every time the returned `output_value` is evaluated, `input_value` is
+  evaluated and its value either directly returned (with
+  `1-pooling_probability`) or stored in the pool and a random one of the samples
+  currently in the pool is popped and returned. As long as the pool in not fully
+  filled, the input_value is always directly returned, as well as stored in the
+  pool. Note during inference / testing, it may be appropriate to set
+  `pool_size` = 0 or `pooling_probability` = 0.
+
+  Args:
+    input_values: A `Tensor`, or a list or tuple of `Tensor`s from which to read
+      values to be pooled.
+    pool_size: An integer specifying the maximum size of the pool.
+    pooling_probability: A float `Tensor` specifying the probability of getting
+      a value from the pool, as opposed to just the current input.
+    name: A string prefix for the name scope for all tensorflow ops.
+
+  Returns:
+    A `Tensor`, or a list or tuple of `Tensor`s (according to the type ofx
+    `input_values`) which is with given probability either the `input_values` or
+    a randomly chosen sample that was previously inserted in the pool.
+
+  Raises:
+    ValueError: If `pool_size` is negative.
+  """
+  pool_size = int(pool_size)
+  if pool_size < 0:
+    raise ValueError('`pool_size` is negative.')
+  elif pool_size == 0:
+    return input_values
+
+  original_input_values = input_values
+  input_values = _to_tuple(input_values)
+
+  with ops.name_scope(
+      '{}_pool_queue'.format(name),
+      values=input_values + (pooling_probability,)):
+    pool_queue = data_flow_ops.RandomShuffleQueue(
+        capacity=pool_size,
+        min_after_dequeue=0,
+        dtypes=[v.dtype for v in input_values],
+        shapes=None)
+
+    # In pseudeo code this code does the following:
+    # if not pool_full:
+    #   enqueue(input_values)
+    #   return input_values
+    # else
+    #   dequeue_values = dequeue_random_sample()
+    #   enqueue(input_values)
+    #   if rand() < pooling_probability:
+    #     return dequeue_values
+    #   else
+    #     return input_values
+
+    def _get_input_value_pooled():
+      enqueue_op = pool_queue.enqueue(input_values)
+      with ops.control_dependencies([enqueue_op]):
+        return tuple(array_ops.identity(v) for v in input_values)
+
+    def _get_random_pool_value_and_enqueue_input():
+      dequeue_values = _to_tuple(pool_queue.dequeue())
+      with ops.control_dependencies(dequeue_values):
+        enqueue_op = pool_queue.enqueue(input_values)
+        with ops.control_dependencies([enqueue_op]):
+          prob = random_ops.random_uniform(
+              (), dtype=dtypes.float32) < pooling_probability
+          return control_flow_ops.cond(prob, lambda: dequeue_values,
+                                       lambda: input_values)
+
+    output_values = _to_tuple(control_flow_ops.cond(
+        pool_queue.size() < pool_size, _get_input_value_pooled,
+        _get_random_pool_value_and_enqueue_input))
+
+  if isinstance(original_input_values, list):
+    return list(output_values)
+  elif isinstance(original_input_values, tuple):
+    return output_values
+  return output_values[0]
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef3a87ab34f9754099073eefcb3f1b1c97a3762
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.contrib.gan.python.features.random_tensor_pool."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import tensor_pool
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class TensorPoolTest(test.TestCase):
+
+  def test_pool_unknown_input_shape(self):
+    """Checks that `input_value` can have unknown shape."""
+    input_value = array_ops.placeholder(
+        dtype=dtypes.int32, shape=[None, None, 3])
+    output_value = tensor_pool(input_value, pool_size=10)
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(10):
+        session.run(output_value, {input_value: [[[i] * 3]]})
+        session.run(output_value, {input_value: [[[i] * 3] * 2]})
+        session.run(output_value, {input_value: [[[i] * 3] * 5] * 2})
+
+  def test_pool_sequence(self):
+    """Checks that values are pooled and returned maximally twice."""
+    input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    output_value = tensor_pool(input_value, pool_size=10)
+
+    with self.test_session(use_gpu=True) as session:
+      outs = []
+      for i in range(50):
+        out = session.run(output_value, {input_value: i})
+        outs.append(out)
+        self.assertLessEqual(out, i)
+
+      _, counts = np.unique(outs, return_counts=True)
+      # Check that each value is returned maximally twice.
+      self.assertTrue((counts <= 2).all())
+
+  def test_never_pool(self):
+    """Checks that setting `pooling_probability` to zero works."""
+    input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    output_value = tensor_pool(
+        input_value, pool_size=10, pooling_probability=0.0)
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(50):
+        out = session.run(output_value, {input_value: i})
+        self.assertEqual(out, i)
+
+  def test_pooling_probability(self):
+    """Checks that `pooling_probability` works."""
+    input_value = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    pool_size = 10
+    pooling_probability = 0.2
+    output_value = tensor_pool(
+        input_value,
+        pool_size=pool_size,
+        pooling_probability=pooling_probability)
+
+    with self.test_session(use_gpu=True) as session:
+      not_pooled = 0
+      total = 1000
+      for i in range(total):
+        out = session.run(output_value, {input_value: i})
+        if out == i:
+          not_pooled += 1
+      self.assertAllClose(
+          (not_pooled - pool_size) / (total - pool_size),
+          1 - pooling_probability,
+          atol=0.03)
+
+  def test_input_values_tuple(self):
+    """Checks that `input_values` can be a tuple."""
+    input_values = (array_ops.placeholder(dtype=dtypes.int32, shape=[]),
+                    array_ops.placeholder(dtype=dtypes.int32, shape=[]))
+    output_values = tensor_pool(input_values, pool_size=3)
+    self.assertEqual(len(output_values), len(input_values))
+
+    with self.test_session(use_gpu=True) as session:
+      for i in range(10):
+        outs = session.run(output_values, {
+            input_values[0]: i,
+            input_values[1]: i + 1
+        })
+        self.assertEqual(len(outs), len(input_values))
+        self.assertEqual(outs[1] - outs[0], 1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/losses/__init__.py b/tensorflow/contrib/gan/python/losses/__init__.py
index 290ff867a1e443f20a63e27fd97f53fed8a6cc11..d9bf8ebfdf65dfc76e4569dcaf26e0e51c7fc107 100644
--- a/tensorflow/contrib/gan/python/losses/__init__.py
+++ b/tensorflow/contrib/gan/python/losses/__init__.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN grouped API. Please see README.md for details and usage."""
+"""TFGAN losses and penalties.
+
+Losses can be used with individual arguments or with GANModel tuples.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index 48f5e8e47dbcd5d32c23806b967a0d1e7403d2f7..3d4e315ebd0bd52b3b5e3e4a8655df8bfe9cebe8 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -79,6 +79,7 @@ class InfoGANModel(
     collections.namedtuple('InfoGANModel', GANModel._fields + (
         'structured_generator_inputs',
         'predicted_distributions',
+        'discriminator_and_aux_fn',
     ))):
   """An InfoGANModel contains all the pieces needed for InfoGAN training.
 
@@ -91,6 +92,8 @@ class InfoGANModel(
     predicted_distributions: A list of tf.Distributions. Predicted by the
       recognizer, and used to evaluate the likelihood of the structured noise.
       List length should match `structured_generator_inputs`.
+    discriminator_and_aux_fn: The original discriminator function that returns
+      a tuple of (logits, `predicted_distributions`).
   """
 
 
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 06dd281489be7b12d9123ca83d926bc7b81f7e10..27c1a2245135299ac943bc2b2dd89dd10e52ea1b 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -58,6 +58,7 @@ __all__ = [
     'get_sequential_train_hooks',
     'get_joint_train_hooks',
     'get_sequential_train_steps',
+    'RunTrainOpsHook',
 ]
 
 
@@ -214,7 +215,8 @@ def infogan_model(
       disc_scope,
       lambda x, y: discriminator_fn(x, y)[0],  # conform to non-InfoGAN API
       structured_generator_inputs,
-      predicted_distributions)
+      predicted_distributions,
+      discriminator_fn)
 
 
 def acgan_model(
@@ -421,7 +423,7 @@ def gan_loss(
     ac_disc_loss = tfgan_losses.acgan_discriminator_loss(
         model, add_summaries=add_summaries)
     dis_loss += aux_cond_discriminator_weight * ac_disc_loss
-  # Gathers auxilliary losses.
+  # Gathers auxiliary losses.
   if model.generator_scope:
     gen_reg_loss = losses.get_regularization_loss(model.generator_scope.name)
   else:
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 6b27b6926102b6e5a7ff134ceed75c23459a6534..4d4ede706c51ec17d0ea5bd1854ea2cd79358bdb 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -145,14 +145,16 @@ def get_infogan_model():
   return namedtuples.InfoGANModel(
       *get_gan_model(),
       structured_generator_inputs=[constant_op.constant(0)],
-      predicted_distributions=[categorical.Categorical([1.0])])
+      predicted_distributions=[categorical.Categorical([1.0])],
+      discriminator_and_aux_fn=infogan_discriminator_model)
 
 
 def get_callable_infogan_model():
   return namedtuples.InfoGANModel(
       *get_callable_gan_model(),
       structured_generator_inputs=[constant_op.constant(0)],
-      predicted_distributions=[categorical.Categorical([1.0])])
+      predicted_distributions=[categorical.Categorical([1.0])],
+      discriminator_and_aux_fn=infogan_discriminator_model)
 
 
 def create_infogan_model():
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index a417dba87543d82526ab856e5b915ee47f496d46..bdbe6f0a72621e59562fe113da101ff5a2b8c06d 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -103,6 +103,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime:worker_interface",
diff --git a/tensorflow/contrib/graph_editor/util.py b/tensorflow/contrib/graph_editor/util.py
index 959905e9826fe439112078a32fef9a5f5b96e9ac..30bc33b9ee42ba78bc7307c67c0fc0af9f3356ef 100644
--- a/tensorflow/contrib/graph_editor/util.py
+++ b/tensorflow/contrib/graph_editor/util.py
@@ -93,6 +93,8 @@ class ListView(object):
 # TODO(fkp): very generic code, it should be moved in a more generic place.
 def is_iterable(obj):
   """Return true if the object is iterable."""
+  if isinstance(obj, tf_ops.Tensor):
+    return False
   try:
     _ = iter(obj)
   except Exception:  # pylint: disable=broad-except
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 157e97d237021d95c935a6be66aa57842b97125c..54502cfc6eecb9d064ffde9773e97d893a24133a 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -9,6 +9,7 @@ package(default_visibility = ["//visibility:public"])
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
     "tf_custom_op_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
@@ -106,10 +107,33 @@ tf_custom_op_library(
     name = "python/ops/_distort_image_ops.so",
     srcs = [
         "kernels/adjust_hsv_in_yiq_op.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
         "ops/distort_image_ops.cc",
     ],
+    gpu_srcs = [
+        "kernels/adjust_hsv_in_yiq_op_gpu.cu.cc",
+        "kernels/adjust_hsv_in_yiq_op.h",
+    ],
     deps = [
-        "@protobuf_archive//:protobuf",
+        "//tensorflow/core/kernels:gpu_util_hdrs",
+    ],
+)
+
+tf_cc_test(
+    name = "adjust_hsv_in_yiq_op_test",
+    size = "small",
+    srcs = [
+        "kernels/adjust_hsv_in_yiq_op.h",
+        "kernels/adjust_hsv_in_yiq_op_test.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+        "//third_party/eigen3",
     ],
 )
 
@@ -122,19 +146,6 @@ tf_gen_op_wrapper_py(
     deps = [":distort_image_ops_op_lib"],
 )
 
-cc_library(
-    name = "distort_image_ops_cc",
-    srcs = [
-        "kernels/adjust_hsv_in_yiq_op.cc",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
-    ],
-    alwayslink = 1,
-)
-
 py_library(
     name = "distort_image_py",
     srcs = [
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
index f4962ed69dc68d4bad06ef29d7a167e0ba8ae044..478b716d88321101c971789f36c0ff8ecd3f418e 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
@@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <cmath>
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
 #include <memory>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
@@ -36,10 +37,10 @@ class AdjustHsvInYiqOpBase : public OpKernel {
 
   struct ComputeOptions {
     const Tensor* input = nullptr;
+    Tensor* output = nullptr;
     const Tensor* delta_h = nullptr;
     const Tensor* scale_s = nullptr;
     const Tensor* scale_v = nullptr;
-    Tensor* output = nullptr;
     int64 channel_count = 0;
   };
 
@@ -65,7 +66,7 @@ class AdjustHsvInYiqOpBase : public OpKernel {
                                         scale_v.shape().DebugString()));
     auto channels = input.dim_size(input.dims() - 1);
     OP_REQUIRES(
-        context, channels == 3,
+        context, channels == kChannelSize,
         errors::InvalidArgument("input must have 3 channels but instead has ",
                                 channels, " channels."));
 
@@ -101,53 +102,21 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
     const Tensor* input = options.input;
     Tensor* output = options.output;
     const int64 channel_count = options.channel_count;
-    static const int kChannelSize = 3;
     auto input_data = input->shaped<float, 2>({channel_count, kChannelSize});
     const float delta_h = options.delta_h->scalar<float>()();
     const float scale_s = options.scale_s->scalar<float>()();
     const float scale_v = options.scale_v->scalar<float>()();
     auto output_data = output->shaped<float, 2>({channel_count, kChannelSize});
+    float tranformation_matrix[kChannelSize * kChannelSize] = {0};
+    internal::compute_tranformation_matrix<kChannelSize * kChannelSize>(
+        delta_h, scale_s, scale_v, tranformation_matrix);
     const int kCostPerChannel = 10;
     const DeviceBase::CpuWorkerThreads& worker_threads =
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, delta_h, scale_s, scale_v](
+          [channel_count, &input_data, &output_data, &tranformation_matrix](
               int64 start_channel, int64 end_channel) {
-            // Using approximate linear transfomation described in:
-            // https://beesbuzz.biz/code/hsv_color_transforms.php
-            /** Get the constants from sympy
-             from sympy import Matrix
-             from sympy.abc import u, w
-             # Projection matrix to YIQ. http://en.wikipedia.org/wiki/YIQ
-             tyiq = Matrix([[0.299, 0.587, 0.114],
-                            [0.596, -0.274, -0.322],
-                            [0.211, -0.523, 0.312]])
-             # Hue rotation matrix in YIQ space.
-             hue_proj = Matrix(3,3, [v, 0, 0, 0, vsu, -vsw, 0, vsw, vsu])
-             m = tyiq.inv() * hue_proj * tyiq
-             **/
-            // TODO(huangyp): directly compute the projection matrix from tyiq.
-            static const float t[kChannelSize][kChannelSize][kChannelSize] = {
-                {{.299, .701, .16862179492229},
-                 {.587, -.587, .329804745287403},
-                 {.114, -.114, -0.498426540209694}},
-                {{.299, -.299, -.327963394172371},
-                 {.587, .413, .0346106879248821},
-                 {.114, -.114, .293352706247489}},
-                {{.299, -.299, 1.24646136576682},
-                 {.587, -.587, -1.04322888291964},
-                 {.114, .886, -.203232482847173}}};
-            float m[kChannelSize][kChannelSize] = {{0.}};
-            float su = scale_s * std::cos(delta_h);
-            float sw = scale_s * std::sin(delta_h);
-            for (int q_index = 0; q_index < kChannelSize; q_index++) {
-              for (int p_index = 0; p_index < kChannelSize; p_index++) {
-                m[q_index][p_index] = scale_v * (t[q_index][p_index][0] +
-                                                 t[q_index][p_index][1] * su +
-                                                 t[q_index][p_index][2] * sw);
-              }
-            }
             // Applying projection matrix to input RGB vectors.
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
@@ -155,7 +124,9 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
               for (int q_index = 0; q_index < kChannelSize; q_index++) {
                 q[q_index] = 0;
                 for (int p_index = 0; p_index < kChannelSize; p_index++) {
-                  q[q_index] += m[q_index][p_index] * p[p_index];
+                  q[q_index] +=
+                      p[p_index] *
+                      tranformation_matrix[q_index + kChannelSize * p_index];
                 }
               }
               p += kChannelSize;
@@ -165,8 +136,33 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHsvInYiq").Device(DEVICE_CPU),
-                        AdjustHsvInYiqOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHsvInYiq").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustHsvInYiqOp<CPUDevice>);
+
+#if GOOGLE_CUDA
+template <>
+class AdjustHsvInYiqOp<GPUDevice> : public AdjustHsvInYiqOpBase {
+ public:
+  explicit AdjustHsvInYiqOp(OpKernelConstruction* context)
+      : AdjustHsvInYiqOpBase(context) {}
+
+  void DoCompute(OpKernelContext* ctx, const ComputeOptions& options) override {
+    const int64 number_of_elements = options.input->NumElements();
+    if (number_of_elements <= 0) {
+      return;
+    }
+    const float* delta_h = options.delta_h->flat<float>().data();
+    const float* scale_s = options.scale_s->flat<float>().data();
+    const float* scale_v = options.scale_v->flat<float>().data();
+    functor::AdjustHsvInYiqGPU()(ctx, options.channel_count, options.input,
+                                 delta_h, scale_s, scale_v, options.output);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHsvInYiq").Device(DEVICE_GPU).TypeConstraint<float>("T"),
+    AdjustHsvInYiqOp<GPUDevice>);
+#endif
 
-// TODO(huangyp): add the GPU kernel
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..194ae2ba47456cac66c01989a78ab4ce607d1295
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include <cmath>
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+
+static constexpr int kChannelSize = 3;
+
+namespace internal {
+
+template <int MATRIX_SIZE>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void compute_tranformation_matrix(
+    const float delta_h, const float scale_s, const float scale_v,
+    float* matrix) {
+  static_assert(MATRIX_SIZE == kChannelSize * kChannelSize,
+                "Size of matrix should be 9.");
+  // Projection matrix from RGB to YIQ. Numbers from wikipedia
+  // https://en.wikipedia.org/wiki/YIQ
+  Eigen::Matrix3f yiq;
+  /* clang-format off */
+  yiq << 0.299, 0.587, 0.114,
+         0.596, -0.274, -0.322,
+         0.211, -0.523, 0.312;
+  Eigen::Matrix3f yiq_inverse;
+  yiq_inverse << 1, 0.95617069, 0.62143257,
+                 1, -0.2726886, -0.64681324,
+                 1, -1.103744, 1.70062309;
+  /* clang-format on */
+  // Construct hsv linear transformation matrix in YIQ space.
+  // https://beesbuzz.biz/code/hsv_color_transforms.php
+  float vsu = scale_v * scale_s * std::cos(delta_h);
+  float vsw = scale_v * scale_s * std::sin(delta_h);
+  Eigen::Matrix3f hsv_transform;
+  /* clang-format off */
+  hsv_transform << scale_v, 0, 0,
+                   0, vsu, -vsw,
+                   0, vsw, vsu;
+  /* clang-format on */
+  // Compute final transformation matrix = inverse_yiq * hsv_transform * yiq
+  Eigen::Map<Eigen::Matrix<float, 3, 3, Eigen::ColMajor>> eigen_matrix(matrix);
+  eigen_matrix = yiq_inverse * hsv_transform * yiq;
+}
+}  // namespace internal
+
+#if GOOGLE_CUDA
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+struct AdjustHsvInYiqGPU {
+  void operator()(OpKernelContext* ctx, int channel_count,
+                  const Tensor* const input, const float* const delta_h,
+                  const float* const scale_s, const float* const scale_v,
+                  Tensor* const output);
+};
+
+}  // namespace functor
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_IMAGE_KERNELS_ADJUST_HSV_IN_YIQ_OP_H_
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b71ff9cd507faac66b3a33d3c02ec9b5901d814a
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
+#include "tensorflow/core/kernels/gpu_utils.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+namespace internal {
+
+__global__ void compute_tranformation_matrix_cuda(const float* const delta_h,
+                                                  const float* const scale_s,
+                                                  const float* const scale_v,
+                                                  float* const matrix,
+                                                  const int matrix_size) {
+  if (matrix_size == kChannelSize * kChannelSize) {
+    compute_tranformation_matrix<kChannelSize * kChannelSize>(
+        *delta_h, *scale_s, *scale_v, matrix);
+  }
+}
+}  // namespace internal
+
+namespace functor {
+
+void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
+                                   const Tensor* const input,
+                                   const float* const delta_h,
+                                   const float* const scale_s,
+                                   const float* const scale_v,
+                                   Tensor* const output) {
+  const uint64 m = channel_count;
+  const uint64 k = kChannelSize;
+  const uint64 n = kChannelSize;
+  auto* cu_stream = ctx->eigen_device<GPUDevice>().stream();
+  OP_REQUIRES(ctx, cu_stream, errors::Internal("No GPU stream available."));
+  Tensor tranformation_matrix;
+  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                          DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
+                          &tranformation_matrix));
+  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // with one thread. Improve its performance if necessary.
+  internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
+      delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
+      tranformation_matrix.flat<float>().size());
+  // Call cuBlas C = A * B directly.
+  auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+  auto a_ptr =
+      AsDeviceMemory(input->flat<float>().data(), input->flat<float>().size());
+  auto b_ptr = AsDeviceMemory(tranformation_matrix.flat<float>().data(),
+                              tranformation_matrix.flat<float>().size());
+  auto c_ptr = AsDeviceMemory(output->flat<float>().data(),
+                              output->flat<float>().size());
+  auto* stream = ctx->op_device_context()->stream();
+  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+  // TODO(huangyp): share/use autotune cublas algorithms in Matmul.op.
+  bool blas_launch_status =
+      stream
+          ->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f, b_ptr, n,
+                         a_ptr, k, 0.0f, &c_ptr, n)
+          .ok();
+  if (!blas_launch_status) {
+    ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
+                                    ", n=", n, ", k=", k));
+  }
+}
+}  // namespace functor
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cbbd277840133c9419f9ce3d945b7d099679dc0
--- /dev/null
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_test.cc
@@ -0,0 +1,48 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class AdjustHsvInYiqOpTest : public OpsTestBase {
+ protected:
+};
+
+TEST_F(AdjustHsvInYiqOpTest, IdentiyTransformMatrix) {
+  Tensor matrix(allocator(), DT_FLOAT, TensorShape({9}));
+  internal::compute_tranformation_matrix<9>(0.0, 1.0, 1.0,
+                                            matrix.flat<float>().data());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({9}));
+  test::FillValues<float>(&expected, {1, 0, 0, 0, 1, 0, 0, 0, 1});
+  test::ExpectClose(matrix, expected);
+}
+
+TEST_F(AdjustHsvInYiqOpTest, ScaleValueTransformMatrix) {
+  float scale_v = 2.3;
+  Tensor matrix(allocator(), DT_FLOAT, TensorShape({9}));
+  internal::compute_tranformation_matrix<9>(0.0, 1.0, scale_v,
+                                            matrix.flat<float>().data());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({9}));
+  test::FillValues<float>(&expected,
+                          {scale_v, 0, 0, 0, scale_v, 0, 0, 0, scale_v});
+  test::ExpectClose(matrix, expected);
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 2b6799213827537f77deda4e052bb7ec16f46343..f8b56ab1c5400694b3aa8d4a0c19c7769aa8cbce 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -40,7 +40,7 @@ REGISTER_OP("SingleImageRandomDotStereograms")
     .Doc(R"doc(
 Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 
-Given the 2-D tensor 'depth_values' with encoded Z values, this operation will 
+Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
 encode 3-D data witin the image.
@@ -68,14 +68,14 @@ with open('picture_out.png', 'wb') as f:
     f.write(png)
 ```
 
-depth_values: Z values of data to encode into 'output_data_window' window, 
+depth_values: Z values of data to encode into 'output_data_window' window,
   lower values are further away {0.0 floor(far), 1.0 ceiling(near) after normalization}, must be 2-D tensor
 hidden_surface_removal: Activate hidden surface removal
 convergence_dots_size: Black dot size in pixels to help view converge image, drawn on bottom of image
 dots_per_inch: Output device in dots/inch
 eye_separation: Separation between eyes in inches
 mu: Depth of field, Fraction of viewing distance (eg. 1/3 = .3333)
-normalize: Normalize input data to [0.0, 1.0] 
+normalize: Normalize input data to [0.0, 1.0]
 normalize_max: Fix MAX value for Normalization - if < MIN, autoscale
 normalize_min: Fix MIN value for Normalization - if > MAX, autoscale
 border_level: Value of border depth 0.0 {far} to 1.0 {near}
diff --git a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
index b85f19d29b79defa10493bdbaa4a1b237cb2a9ee..a495b58b7f6481d4cdedf73f23615d0390eb6a45 100644
--- a/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/distort_image_ops_test.py
@@ -172,7 +172,7 @@ class AdjustValueInYiqTest(test_util.TensorFlowTestCase):
           raise AssertionError('Invalid test style: %s' % (test_style))
         y_np = self._adjust_value_in_yiq_np(x_np, scale)
         y_tf = self._adjust_value_in_yiq_tf(x_np, scale)
-        self.assertAllClose(y_tf, y_np, rtol=2e-5, atol=1e-5)
+        self.assertAllClose(y_tf, y_np, rtol=2e-4, atol=1e-4)
 
   def test_invalid_shapes(self):
     x_np = np.random.rand(2, 3) * 255.
@@ -237,7 +237,7 @@ class AdjustSaturationInYiqTest(test_util.TensorFlowTestCase):
             raise AssertionError('Invalid test style: %s' % (test_style))
           y_baseline = self._adjust_saturation_in_yiq_np(x_np, scale)
           y_tf = self._adjust_saturation_in_yiq_tf(x_np, scale)
-          self.assertAllClose(y_tf, y_baseline, rtol=2e-5, atol=1e-5)
+          self.assertAllClose(y_tf, y_baseline, rtol=2e-4, atol=1e-4)
 
   def test_invalid_shapes(self):
     x_np = np.random.rand(2, 3) * 255.
@@ -291,6 +291,9 @@ class AdjustHueInYiqBenchmark(test.Benchmark):
   def benchmark_adjust_hue_in_yiqCpuAll(self):
     self._benchmark_adjust_hue_in_yiq('/cpu:0', None)
 
+  def benchmark_adjust_hue_in_yiq_gpu_all(self):
+    self._benchmark_adjust_hue_in_yiq(test.gpu_device_name(), None)
+
 
 class AdjustSaturationInYiqBenchmark(test.Benchmark):
 
@@ -333,6 +336,9 @@ class AdjustSaturationInYiqBenchmark(test.Benchmark):
   def benchmark_adjust_saturation_in_yiq_cpu_all(self):
     self._benchmark_adjust_saturation_in_yiq('/cpu:0', None)
 
+  def benchmark_adjust_saturation_in_yiq_gpu_all(self):
+    self._benchmark_adjust_saturation_in_yiq(test.gpu_device_name(), None)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index 011ddeaa9a1eebaa507c9e0d33f9546ff3497166..faedee6f87772016561671bacd87f88657eafffb 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -224,7 +224,8 @@ def transform(images, transforms, interpolation="NEAREST", name=None):
        `(x, y)` to a transformed *input* point
        `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) / k)`,
        where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared to
-       the transform mapping input points to output points.
+       the transform mapping input points to output points. Note that gradients
+       are not backpropagated into transformation parameters.
     interpolation: Interpolation mode. Supported values: "NEAREST", "BILINEAR".
 
   Returns:
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index 5cccf26028ca6bf269dbc67a33075351edecb407..bb766e59d2cee648042cc08be466796d9233ad66 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -68,7 +68,7 @@ def single_image_random_dot_stereograms(
   ```
 
   Args:
-    depth_values: A `Tensor`. Must be one of the following types: 
+    depth_values: A `Tensor`. Must be one of the following types:
       `float64`, `float32`, `int64`, `int32`.  Z values of data to encode
       into 'output_data_window' window, lower further away {0.0 floor(far),
       1.0 ceiling(near) after norm}, must be 2-D tensor
@@ -84,17 +84,17 @@ def single_image_random_dot_stereograms(
     mu: An optional `float`. Defaults to `0.3333`.
       Depth of field, Fraction of viewing distance (eg. 1/3 = 0.3333)
     normalize: An optional `bool`. Defaults to `True`.
-      Normalize input data to [0.0, 1.0] 
+      Normalize input data to [0.0, 1.0]
     normalize_max: An optional `float`. Defaults to `-100`.
       Fix MAX value for Normalization (0.0) - if < MIN, autoscale
     normalize_min: An optional `float`. Defaults to `100`.
       Fix MIN value for Normalization (0.0) - if > MAX, autoscale
     border_level: An optional `float`. Defaults to `0`.
-      Value of bord in depth 0.0 {far} to 1.0 {near} 
+      Value of bord in depth 0.0 {far} to 1.0 {near}
     number_colors: An optional `int`. Defaults to `256`. 2 (Black &
       White), 256 (grayscale), and Numbers > 256 (Full Color) are
       supported
-    output_image_shape: An optional `tf.TensorShape` or list of `ints`. 
+    output_image_shape: An optional `tf.TensorShape` or list of `ints`.
       Defaults to shape `[1024, 768, 1]`. Defines output shape of returned
       image in '[X,Y, Channels]' 1-grayscale, 3 color; channels will be
       updated to 3 if number_colors > 256
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index 5d86373a232d55cd281d06cfc0606f4224d8f669..95fba59e3c96ae3c69e0b154740785b0d2bcb3c9 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -16,6 +16,7 @@ py_test(
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
@@ -33,6 +34,7 @@ py_test(
         "//tensorflow/contrib/kfac/python/ops:fisher_factors",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
@@ -68,6 +70,7 @@ py_test(
     srcs = ["layer_collection_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/kfac/python/ops:fisher_blocks",
         "//tensorflow/contrib/kfac/python/ops:fisher_factors",
         "//tensorflow/contrib/kfac/python/ops:layer_collection",
         "//tensorflow/python:array_ops",
@@ -75,6 +78,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:variable_scope",
@@ -88,7 +92,6 @@ py_test(
     deps = [
         "//tensorflow/contrib/kfac/python/ops:kfac_optimizer",
         "//tensorflow/contrib/kfac/python/ops:layer_collection",
-        "//tensorflow/contrib/kfac/python/ops:loss_functions",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -139,6 +142,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:random_ops",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
index b52a7b52a7efd4292ad514c5a744c4da07082142..9b28c45c7263208d21b1514ae5f05b7e81e315a3 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.kfac.python.ops import estimator
 from tensorflow.contrib.kfac.python.ops import layer_collection as lc
 from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -33,6 +34,30 @@ from tensorflow.python.platform import test
 _ALL_ESTIMATION_MODES = ["gradients", "empirical", "curvature_prop", "exact"]
 
 
+class DeviceContextGeneratorTest(test.TestCase):
+
+  def testNoDevice(self):
+    device_context_generator = estimator._DeviceContextGenerator(None)
+    with ops.device("/device:CPU:0"):  # This is what will be used
+      with device_context_generator():  # Does nothing
+        a = constant_op.constant([2.0], name="a")
+    self.assertEqual("/device:CPU:0", a.op.device)
+
+  def testTwoDevices(self):
+    device_context_generator = estimator._DeviceContextGenerator(
+        ["/device:GPU:0", "/device:GPU:1"])
+    with ops.device("/device:CPU:0"):  # Will be over-ridden by the inner scopes
+      with device_context_generator():
+        a = constant_op.constant([2.0], name="a")
+      with device_context_generator():
+        b = constant_op.constant([2.0], name="b")
+      with device_context_generator():
+        c = constant_op.constant([2.0], name="c")
+    self.assertEqual("/device:GPU:0", a.op.device)
+    self.assertEqual("/device:GPU:1", b.op.device)
+    self.assertEqual("/device:GPU:0", c.op.device)
+
+
 class EstimatorTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
index dbf40fccc8257b1dec6cbd790adfa59161ab9049..2d9b28185ce0db32d5cd7d84737fdf96e2c98851 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py
@@ -40,13 +40,29 @@ def _make_psd(dim):
   return array_ops.constant(mat)
 
 
+class UtilsTest(test.TestCase):
+
+  def testComputePiTracenorm(self):
+    with ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      left_factor = array_ops.diag([1., 2., 0., 1.])
+      right_factor = array_ops.ones([2., 2.])
+
+      # pi is the sqrt of the left trace norm divided by the right trace norm
+      pi = fb._compute_pi_tracenorm(left_factor, right_factor)
+
+      pi_val = sess.run(pi)
+      self.assertEqual(1., pi_val)
+
+
 class FullFBTest(test.TestCase):
 
   def testFullFBInitSingleTensor(self):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params, 32)
+      block = fb.FullFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -54,7 +70,8 @@ class FullFBTest(test.TestCase):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params, 32)
+      block = fb.FullFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -62,7 +79,8 @@ class FullFBTest(test.TestCase):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params, 32)
+      block = fb.FullFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
 
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors(grads, 0.5)
@@ -71,7 +89,8 @@ class FullFBTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params, 32)
+      block = fb.FullFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors((grads,), 0.5)
 
@@ -88,7 +107,8 @@ class FullFBTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       params = array_ops.constant([[1.], [2.]])
-      block = fb.FullFB(lc.LayerCollection(), params, 32)
+      block = fb.FullFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
       grads = params**2
       block.instantiate_factors((grads,), 0.5)
 
@@ -105,7 +125,8 @@ class FullFBTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.FullFB(lc.LayerCollection(), params, 32)
+      block = fb.FullFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
       grads = (array_ops.constant([2., 3.]), array_ops.constant(4.))
       damping = 0.5
       block.instantiate_factors((grads,), damping)
@@ -131,7 +152,8 @@ class NaiveDiagonalFBTest(test.TestCase):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params, 32)
+      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -139,7 +161,8 @@ class NaiveDiagonalFBTest(test.TestCase):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params, 32)
+      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
 
       self.assertAllEqual(params, block.tensors_to_compute_grads())
 
@@ -147,7 +170,8 @@ class NaiveDiagonalFBTest(test.TestCase):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params, 32)
+      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
 
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors(grads, 0.5)
@@ -156,7 +180,8 @@ class NaiveDiagonalFBTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params, 32)
+      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       block.instantiate_factors((grads,), 0.5)
 
@@ -173,7 +198,8 @@ class NaiveDiagonalFBTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       params = array_ops.constant([[1.], [2.]])
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params, 32)
+      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
       grads = params**2
       block.instantiate_factors((grads,), 0.5)
 
@@ -189,7 +215,8 @@ class NaiveDiagonalFBTest(test.TestCase):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
       params = (array_ops.constant([1., 2.]), array_ops.constant(3.))
-      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params, 32)
+      block = fb.NaiveDiagonalFB(lc.LayerCollection(), params)
+      block.register_additional_minibatch(32)
       grads = (params[0]**2, math_ops.sqrt(params[1]))
       damping = 0.5
       block.instantiate_factors((grads,), damping)
@@ -289,8 +316,7 @@ class FullyConnectedDiagonalFB(test.TestCase):
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w,
-                               np.split(self.inputs, 2),
+        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
                                np.split(self.outputs, 2),
                                np.split(self.output_grads, 2)))
 
@@ -572,8 +598,7 @@ class ConvDiagonalFBTest(test.TestCase):
     multiply_result_big, multiply_inverse_result_big = self.runFisherBlockOps(
         self.w, [self.inputs], [self.outputs], [self.output_grads])
     multiply_result_small, multiply_inverse_result_small = (
-        self.runFisherBlockOps(self.w,
-                               np.split(self.inputs, 2),
+        self.runFisherBlockOps(self.w, np.split(self.inputs, 2),
                                np.split(self.outputs, 2),
                                np.split(self.output_grads, 2)))
 
@@ -596,8 +621,9 @@ class ConvDiagonalFBTest(test.TestCase):
         self.kernel_size, self.kernel_size, self.input_channels + 1,
         self.output_channels
     ])
-    expected_result = (expected_result[:, :, 0:-1, :], np.reshape(
-        expected_result[:, :, -1, :], [self.output_channels]))
+    expected_result = (expected_result[:, :, 0:-1, :],
+                       np.reshape(expected_result[:, :, -1, :],
+                                  [self.output_channels]))
 
     self.assertEqual(len(result), 2)
     self.assertAllClose(expected_result[0], result[0])
@@ -680,8 +706,8 @@ class ConvKFCBasicFBTest(test.TestCase):
       sess.run(block._input_factor.make_inverse_update_ops())
       sess.run(block._output_factor.make_inverse_update_ops())
 
-      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32), np.arange(
-          2, 4).reshape(2, 1).astype(np.float32))
+      vector = (np.arange(1, 15).reshape(7, 2).astype(np.float32),
+                np.arange(2, 4).reshape(2, 1).astype(np.float32))
       output = block.multiply_inverse((array_ops.constant(vector[0]),
                                        array_ops.constant(vector[1])))
 
@@ -764,11 +790,50 @@ class ConvKFCBasicFBTest(test.TestCase):
       self.assertAllClose(output_flat, explicit)
 
 
+class FullyConnectedSeriesFBTest(test.TestCase):
+
+  def testFullyConnectedSeriesFBInit(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([1., 2.])
+      outputs = array_ops.constant([3., 4.])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(), inputs=[inputs], outputs=[outputs])
+      self.assertAllEqual([outputs], block.tensors_to_compute_grads())
+
+  def testInstantiateFactorsHasBias(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([[1., 2.], [3., 4.]])
+      outputs = array_ops.constant([[3., 4.], [5., 6.]])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(),
+          inputs=[inputs],
+          outputs=[outputs],
+          has_bias=True)
+      grads = outputs**2
+      block.instantiate_factors(((grads,),), 0.5)
+
+  def testInstantiateFactorsNoBias(self):
+    with ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      inputs = array_ops.constant([[1., 2.], [3., 4.]])
+      outputs = array_ops.constant([[3., 4.], [5., 6.]])
+      block = fb.FullyConnectedSeriesFB(
+          lc.LayerCollection(),
+          inputs=[inputs],
+          outputs=[outputs],
+          has_bias=False)
+      grads = outputs**2
+      block.instantiate_factors(((grads,),), 0.5)
+
+
 def as_tensors(tensor_or_tuple):
   """Converts a potentially nested tuple of np.array to Tensors."""
   if isinstance(tensor_or_tuple, (tuple, list)):
     return tuple(as_tensors(t) for t in tensor_or_tuple)
   return ops.convert_to_tensor(tensor_or_tuple)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
index fbb3d219139a4bc05253841a89e73645ef37dddd..70e56db055078bd4399b03e4d4a877e34249cc5e 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py
@@ -22,6 +22,7 @@ import numpy as np
 import numpy.random as npr
 
 from tensorflow.contrib.kfac.python.ops import fisher_factors as ff
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.framework import random_seed
@@ -32,6 +33,25 @@ from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import test
 
 
+class MaybeColocateTest(test.TestCase):
+
+  def testFalse(self):
+    with tf_ops.Graph().as_default():
+      a = constant_op.constant([2.0], name='a')
+      with ff._maybe_colocate_with(a, False):
+        b = constant_op.constant(3.0, name='b')
+      self.assertEqual([b'loc:@a'], a.op.colocation_groups())
+      self.assertEqual([b'loc:@b'], b.op.colocation_groups())
+
+  def testTrue(self):
+    with tf_ops.Graph().as_default():
+      a = constant_op.constant([2.0], name='a')
+      with ff._maybe_colocate_with(a, True):
+        b = constant_op.constant(3.0, name='b')
+      self.assertEqual([b'loc:@a'], a.op.colocation_groups())
+      self.assertEqual([b'loc:@a'], b.op.colocation_groups())
+
+
 class FisherFactorTestingDummy(ff.FisherFactor):
   """Dummy class to test the non-abstract methods on ff.FisherFactor."""
 
@@ -47,12 +67,19 @@ class FisherFactorTestingDummy(ff.FisherFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return dtypes.float32
+
   def _compute_new_cov(self):
     raise NotImplementedError
 
   def instantiate_covariance(self):
     pass
 
+  def make_inverse_update_ops(self):
+    return []
+
 
 class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   """Dummy class to test the non-abstract methods on ff.InverseProvidingFactor.
@@ -74,6 +101,10 @@ class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return dtypes.float32
+
   def _compute_new_cov(self):
     raise NotImplementedError
 
@@ -101,7 +132,7 @@ class NumericalUtilsTest(test.TestCase):
 
       normalizer = 10.
       x = npr.randn(100, 3)
-      cov = ff._compute_cov(array_ops.constant(x), normalizer)
+      cov = ff._compute_cov(array_ops.constant(x), normalizer=normalizer)
       np_cov = np.dot(x.T, x) / normalizer
 
       self.assertAllClose(sess.run(cov), np_cov)
@@ -247,13 +278,13 @@ class InverseProvidingFactorTest(test.TestCase):
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
         factor.register_damped_inverse(1. / i)
       ops = factor.make_inverse_update_ops()
-      self.assertEqual(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD, len(ops))
+      self.assertEqual(1, len(ops))
 
       sess.run(tf_variables.global_variables_initializer())
       new_invs = []
+      sess.run(ops)
       for i in range(1, ff.EIGENVALUE_DECOMPOSITION_THRESHOLD + 1):
         # The inverse op will assign the damped inverse of cov to the inv var.
-        sess.run(ops[i - 1])
         new_invs.append(sess.run(factor._inverses_by_damping[1. / i]))
       # We want to see that the new invs are all different from each other.
       for i in range(len(new_invs)):
@@ -311,6 +342,16 @@ class FullFactorTest(test.TestCase):
       factor = ff.FullFactor((tensor,), 32)
       self.assertEqual([6, 6], factor.get_cov().get_shape().as_list())
 
+  def testFullFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.FullFactor((tensor,), 32)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([6, 6], cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -331,6 +372,16 @@ class NaiveDiagonalFactorTest(test.TestCase):
       factor = ff.NaiveDiagonalFactor((tensor,), 32)
       self.assertEqual([6, 1], factor.get_cov().get_shape().as_list())
 
+  def testNaiveDiagonalFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.NaiveDiagonalFactor((tensor,), 32)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([6, 1], cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOp(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -344,18 +395,25 @@ class NaiveDiagonalFactorTest(test.TestCase):
 
 class FullyConnectedKroneckerFactorTest(test.TestCase):
 
-  def _testFullyConnectedKroneckerFactorInit(self, has_bias, final_shape):
+  def _testFullyConnectedKroneckerFactorInit(self,
+                                             has_bias,
+                                             final_shape,
+                                             dtype=dtypes.float32_ref):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
-      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
       factor = ff.FullyConnectedKroneckerFactor((tensor,), has_bias=has_bias)
-      self.assertEqual(final_shape, factor.get_cov().get_shape().as_list())
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual(final_shape, cov.get_shape().as_list())
 
   def testFullyConnectedKroneckerFactorInitNoBias(self):
-    self._testFullyConnectedKroneckerFactorInit(False, [3, 3])
+    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
+      self._testFullyConnectedKroneckerFactorInit(False, [3, 3], dtype=dtype)
 
   def testFullyConnectedKroneckerFactorInitWithBias(self):
-    self._testFullyConnectedKroneckerFactorInit(True, [4, 4])
+    for dtype in (dtypes.float32_ref, dtypes.float64_ref):
+      self._testFullyConnectedKroneckerFactorInit(True, [4, 4], dtype=dtype)
 
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
@@ -398,6 +456,18 @@ class ConvInputKroneckerFactorTest(test.TestCase):
       self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
                        factor.get_cov().get_shape().as_list())
 
+  def testConvInputKroneckerFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      factor = ff.ConvInputKroneckerFactor(
+          tensor, (1, 2, 3, 4), 3, 2, has_bias=True)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([1 * 2 * 3 + 1, 1 * 2 * 3 + 1],
+                       cov.get_shape().as_list())
+
   def testMakeCovarianceUpdateOpWithBias(self):
     with tf_ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
@@ -433,6 +503,16 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       factor = ff.ConvOutputKroneckerFactor((tensor,))
       self.assertEqual([5, 5], factor.get_cov().get_shape().as_list())
 
+  def testConvOutputKroneckerFactorInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3, 4, 5), dtype=dtype, name='a/b/c')
+      factor = ff.ConvOutputKroneckerFactor((tensor,))
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([5, 5], cov.get_shape().as_list())
+
   def testConvOutputKroneckerFactorInitNotEnoughDims(self):
     with tf_ops.Graph().as_default():
       random_seed.set_random_seed(200)
@@ -451,5 +531,49 @@ class ConvOutputKroneckerFactorTest(test.TestCase):
       self.assertAllClose([[43, 46.5], [46.5, 51.5]], new_cov)
 
 
+class FullyConnectedMultiKFTest(test.TestCase):
+
+  def testFullyConnectedMultiKFInit(self):
+    with tf_ops.Graph().as_default():
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      self.assertEqual([3, 3], factor.get_cov().get_shape().as_list())
+
+  def testFullyConnectedMultiKFInitFloat64(self):
+    with tf_ops.Graph().as_default():
+      dtype = dtypes.float64_ref
+      random_seed.set_random_seed(200)
+      tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=False)
+      cov = factor.get_cov()
+      self.assertEqual(cov.dtype, dtype)
+      self.assertEqual([3, 3], cov.get_shape().as_list())
+
+  def testMakeCovarianceUpdateOpWithBias(self):
+    with tf_ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,), has_bias=True)
+
+      sess.run(tf_variables.global_variables_initializer())
+      new_cov = sess.run(factor.make_covariance_update_op(.5))
+      self.assertAllClose([[3, 3.5, 1], [3.5, 5.5, 1.5], [1, 1.5, 1]], new_cov)
+
+  def testMakeCovarianceUpdateOpNoBias(self):
+    with tf_ops.Graph().as_default(), self.test_session() as sess:
+      random_seed.set_random_seed(200)
+      tensor = array_ops.constant([[1., 2.], [3., 4.]], name='a/b/c')
+      tensor_list = [tensor]
+      factor = ff.FullyConnectedMultiKF((tensor_list,))
+
+      sess.run(tf_variables.global_variables_initializer())
+      new_cov = sess.run(factor.make_covariance_update_op(.5))
+      self.assertAllClose([[3, 3.5], [3.5, 5.5]], new_cov)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
index db7ab63c7d1166649acbe41851a5876d8af476db..b8ccbeadd0a9d69edb41fef50e3edb090457adf2 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/layer_collection_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.kfac.python.ops import fisher_blocks
 from tensorflow.contrib.kfac.python.ops import fisher_factors
 from tensorflow.contrib.kfac.python.ops import layer_collection
 from tensorflow.python.framework import dtypes
@@ -25,11 +26,27 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
+class MockFisherBlock(object):
+  """A fake FisherBlock."""
+
+  num_registered_minibatches = 2
+
+  def __init__(self, name='MockFisherBlock'):
+    self.name = name
+
+  def __eq__(self, other):
+    return isinstance(other, MockFisherBlock) and other.name == self.name
+
+  def __hash__(self):
+    return hash(self.name)
+
+
 class LayerParametersDictTest(test.TestCase):
 
   def testSetItem(self):
@@ -90,8 +107,10 @@ class LayerCollectionTest(test.TestCase):
           array_ops.constant(4), [1, 1, 1, 1], 'SAME',
           array_ops.ones((1, 1, 1, 1)), array_ops.constant(3))
       lc.register_conv2d(
-          array_ops.constant(4), [1, 1, 1, 1], 'SAME',
-          array_ops.ones((1, 1, 1, 1)), array_ops.constant(3),
+          array_ops.constant(4), [1, 1, 1, 1],
+          'SAME',
+          array_ops.ones((1, 1, 1, 1)),
+          array_ops.constant(3),
           approx=layer_collection.APPROX_DIAGONAL_NAME)
       lc.register_generic(
           array_ops.constant(5), 16, approx=layer_collection.APPROX_FULL_NAME)
@@ -107,10 +126,11 @@ class LayerCollectionTest(test.TestCase):
       random_seed.set_random_seed(200)
       lc = layer_collection.LayerCollection()
       key = array_ops.constant(1)
-      lc.register_fully_connected(key,
-                                  array_ops.constant(2), array_ops.constant(3))
-      with self.assertRaises(ValueError):
+      lc.register_fully_connected(key, array_ops.constant(2),
+                                  array_ops.constant(3))
+      with self.assertRaises(ValueError) as cm:
         lc.register_generic(key, 16)
+      self.assertIn('already in LayerCollection', str(cm.exception))
 
   def testRegisterSingleParamNotRegistered(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -125,16 +145,18 @@ class LayerCollectionTest(test.TestCase):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {x: '1'}
-    with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError) as cm:
       lc.register_block(x, 'foo')
+    self.assertIn('already in LayerCollection', str(cm.exception))
 
   def testRegisterSingleParamRegisteredInTuple(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
     y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, y): '1'}
-    lc.register_block(x, 'foo')
-    self.assertEqual(set(['1']), set(lc.get_blocks()))
+    with self.assertRaises(ValueError) as cm:
+      lc.register_block(x, 'foo')
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterTupleParamNotRegistered(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -154,8 +176,9 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, y): '1'}
 
-    with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError) as cm:
       lc.register_block((x, y), 'foo')
+    self.assertIn('already in LayerCollection', str(cm.exception))
 
   def testRegisterTupleParamRegisteredInSuperset(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -164,18 +187,20 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, y, z): '1'}
 
-    lc.register_block((x, y), 'foo')
-    self.assertEqual(set(['1']), set(lc.get_blocks()))
+    with self.assertRaises(ValueError) as cm:
+      lc.register_block((x, y), 'foo')
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterTupleParamSomeRegistered(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
     y = variable_scope.get_variable('y', initializer=array_ops.constant(1,))
     z = variable_scope.get_variable('z', initializer=array_ops.constant(1,))
     lc = layer_collection.LayerCollection()
-    lc.fisher_blocks = {x: '1', z: '2'}
+    lc.fisher_blocks = {x: MockFisherBlock('1'), z: MockFisherBlock('2')}
 
-    lc.register_block((x, y), 'foo')
-    self.assertEqual(set(['2', 'foo']), set(lc.get_blocks()))
+    with self.assertRaises(ValueError) as cm:
+      lc.register_block((x, y), MockFisherBlock('foo'))
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterTupleVarSomeRegisteredInOtherTuples(self):
     x = variable_scope.get_variable('x', initializer=array_ops.constant(1,))
@@ -185,8 +210,9 @@ class LayerCollectionTest(test.TestCase):
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {(x, z): '1', (z, w): '2'}
 
-    with self.assertRaises(ValueError):
+    with self.assertRaises(ValueError) as cm:
       lc.register_block((x, y), 'foo')
+    self.assertIn('was already registered', str(cm.exception))
 
   def testRegisterCategoricalPredictiveDistribution(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -406,6 +432,23 @@ class LayerCollectionTest(test.TestCase):
 
       self.ensureLayerReuseWorks(register_fn)
 
+  def testReuseWithInvalidRegistration(self):
+    """Invalid registrations shouldn't overwrite existing blocks."""
+    with ops.Graph().as_default():
+      inputs = array_ops.ones([2, 5, 5, 10])
+      outputs = array_ops.zeros([2, 5, 5, 3])
+      w = variable_scope.get_variable('w', [1, 1, 10, 3])
+      b = variable_scope.get_variable('b', [3])
+      lc = layer_collection.LayerCollection()
+      lc.register_fully_connected(w, inputs, outputs)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      with self.assertRaises(KeyError):
+        lc.register_fully_connected((w, b), inputs, outputs, reuse=True)
+      self.assertNotIn((w, b), lc.fisher_blocks)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 1)
+      lc.register_fully_connected(w, inputs, outputs, reuse=True)
+      self.assertEqual(lc.fisher_blocks[w].num_registered_minibatches, 2)
+
   def testMakeOrGetFactor(self):
     with ops.Graph().as_default():
       random_seed.set_random_seed(200)
@@ -438,11 +481,6 @@ class LayerCollectionTest(test.TestCase):
 
   def testGetUseCountMap(self):
     """Ensure get_use_count_map() sums 'num_registered_minibatches'."""
-
-    class MockFisherBlock(object):
-
-      num_registered_minibatches = 2
-
     lc = layer_collection.LayerCollection()
     lc.fisher_blocks = {
         'a': MockFisherBlock(),
@@ -452,6 +490,66 @@ class LayerCollectionTest(test.TestCase):
     use_count_map = lc.get_use_count_map()
     self.assertDictEqual({'a': 4, 'b': 2, 'c': 4}, use_count_map)
 
+  def testIdentifyLinkedParametersSomeRegisteredInOtherTuples(self):
+    x = variable_scope.get_variable('x', shape=())
+    y = variable_scope.get_variable('y', shape=())
+    z = variable_scope.get_variable('z', shape=())
+    lc = layer_collection.LayerCollection()
+    lc.define_linked_parameters((x, y))
+
+    with self.assertRaises(ValueError):
+      lc.define_linked_parameters((x, z))
+
+  def testIdentifySubsetPreviouslyRegisteredTensor(self):
+    x = variable_scope.get_variable('x', shape=())
+    y = variable_scope.get_variable('y', shape=())
+    lc = layer_collection.LayerCollection()
+    lc.define_linked_parameters((x, y))
+
+    with self.assertRaises(ValueError):
+      lc.define_linked_parameters(x)
+
+  def testSpecifyApproximation(self):
+    w_0 = variable_scope.get_variable('w_0', [10, 10])
+    w_1 = variable_scope.get_variable('w_1', [10, 10])
+
+    b_0 = variable_scope.get_variable('b_0', [10])
+    b_1 = variable_scope.get_variable('b_1', [10])
+
+    x_0 = array_ops.placeholder(dtypes.float32, shape=(32, 10))
+    x_1 = array_ops.placeholder(dtypes.float32, shape=(32, 10))
+
+    pre_bias_0 = math_ops.matmul(x_0, w_0)
+    pre_bias_1 = math_ops.matmul(x_1, w_1)
+
+    # Build the fully connected layers in the graph.
+    pre_bias_0 + b_0  # pylint: disable=pointless-statement
+    pre_bias_1 + b_1  # pylint: disable=pointless-statement
+
+    lc = layer_collection.LayerCollection()
+    lc.define_linked_parameters(
+        w_0, approximation=layer_collection.APPROX_DIAGONAL_NAME)
+    lc.define_linked_parameters(
+        w_1, approximation=layer_collection.APPROX_DIAGONAL_NAME)
+    lc.define_linked_parameters(
+        b_0, approximation=layer_collection.APPROX_FULL_NAME)
+    lc.define_linked_parameters(
+        b_1, approximation=layer_collection.APPROX_FULL_NAME)
+
+    lc.register_fully_connected(w_0, x_0, pre_bias_0)
+    lc.register_fully_connected(
+        w_1, x_1, pre_bias_1, approx=layer_collection.APPROX_KRONECKER_NAME)
+    self.assertIsInstance(lc.fisher_blocks[w_0],
+                          fisher_blocks.FullyConnectedDiagonalFB)
+    self.assertIsInstance(lc.fisher_blocks[w_1],
+                          fisher_blocks.FullyConnectedKFACBasicFB)
+
+    lc.register_generic(b_0, batch_size=1)
+    lc.register_generic(
+        b_1, batch_size=1, approx=layer_collection.APPROX_DIAGONAL_NAME)
+    self.assertIsInstance(lc.fisher_blocks[b_0], fisher_blocks.FullFB)
+    self.assertIsInstance(lc.fisher_blocks[b_1], fisher_blocks.NaiveDiagonalFB)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
index 87339cb059802ec8944d5d1ae4557ee34550cd60..39ce3e9337157c8206107bc40c489e44019743ab 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/loss_functions_test.py
@@ -24,6 +24,7 @@ from tensorflow.contrib.kfac.python.ops import loss_functions
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
@@ -96,6 +97,22 @@ class CategoricalLogitsNegativeLogProbLossTest(test.TestCase):
       # difficult to say if the output is correct or not...
       neg_log_prob = sess.run(neg_log_prob)
 
+  def testMultiMinibatchRegistration(self):
+    """Ensure this loss function supports registering multiple minibatches."""
+    with ops.Graph().as_default():
+      tower_logits = []
+      loss = None
+      num_towers = 5
+      for _ in range(num_towers):
+        logits = random_ops.random_uniform(shape=[2, 3])
+        tower_logits.append(logits)
+        if loss is None:
+          loss = loss_functions.CategoricalLogitsNegativeLogProbLoss(logits)
+        else:
+          loss.register_additional_minibatch(logits)
+      self.assertListEqual(loss.input_minibatches, tower_logits)
+      self.assertEqual(loss.num_registered_minibatches, num_towers)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
index 55fe38e3e9aab2dbd70a45cdc8fa0c208b036db0..d255a6e7160386d8eb6fca00765eea8a318f4eaa 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
+++ b/tensorflow/contrib/kfac/python/kernel_tests/utils_test.py
@@ -222,18 +222,6 @@ class UtilsTest(test.TestCase):
       self.assertAllClose(b, np.array([4., 5.]))
       self.assertAllClose(c, np.array([[6.], [7.], [8.], [9.]]))
 
-  def testComputePi(self):
-    with ops.Graph().as_default(), self.test_session() as sess:
-      random_seed.set_random_seed(200)
-      left_factor = array_ops.diag([1., 2., 0., 1.])
-      right_factor = array_ops.ones([2., 2.])
-
-      # pi is the sqrt of the left trace norm divided by the right trace norm
-      pi = utils.compute_pi(left_factor, right_factor)
-
-      pi_val = sess.run(pi)
-      self.assertEqual(1., pi_val)
-
   def testPosDefInvCholesky(self):
     with ops.Graph().as_default(), self.test_session() as sess:
       random_seed.set_random_seed(200)
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index de4b8920b849dbf2117657de6e7c26f94f4d0363..3d731c7bc206d6f168e9b8f29b66bf4f1dbe8542 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -38,6 +38,7 @@ py_library(
         ":utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:special_math_ops",
@@ -171,6 +172,7 @@ py_library(
     deps = [
         ":utils",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
         "//tensorflow/python:util",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index 6e2c9ecdce7ad9f98a5beb016770ad2b1e197b0a..5e1680967c184bf19f2a2578219db07a48264dc9 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -18,16 +18,53 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
+import contextlib
+import itertools
 
 import numpy as np
 
 from tensorflow.contrib.kfac.python.ops import utils
+from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.util import nest
 
 
+class _DeviceContextGenerator(object):
+  """Class for generating device contexts in a round-robin fashion."""
+
+  def __init__(self, devices):
+    """Creates a _DeviceContextGenerator object.
+
+    Example usage:
+
+    ```python
+    dcg = _DeviceContextGenerator(['/gpu:0', 'gpu:1'])
+    with dcg():
+      # All operations in this context will be placed on GPU 0
+      ...
+    with dcg():
+      # All operations in this context will be placed on GPU 1
+      ...
+    ```
+
+    Args:
+      devices: An iterable of device strings (or None). Successive calls to
+          __call__ will give contexts which place devices on these devices in
+          a round-robin fashion.
+    """
+    self._cycle = None if devices is None else itertools.cycle(devices)
+
+  @contextlib.contextmanager
+  def __call__(self):
+    """Returns a context manager specifying the default device."""
+    if self._cycle is None:
+      yield
+    else:
+      with tf_ops.device(next(self._cycle)):
+        yield
+
+
 class FisherEstimator(object):
   """Fisher estimator class supporting various approximations of the Fisher."""
 
@@ -36,7 +73,10 @@ class FisherEstimator(object):
                cov_ema_decay,
                damping,
                layer_collection,
-               estimation_mode="gradients"):
+               estimation_mode="gradients",
+               colocate_gradients_with_ops=False,
+               cov_devices=None,
+               inv_devices=None):
     """Create a FisherEstimator object.
 
     Args:
@@ -54,7 +94,7 @@ class FisherEstimator(object):
           blocks, kronecker factors, and losses associated with the
           graph.
       estimation_mode: The type of estimator to use for the Fishers.  Can be
-          'gradients', 'empirical', 'curvature_propagation', or 'exact'.
+          'gradients', 'empirical', 'curvature_prop', or 'exact'.
           (Default: 'gradients').  'gradients' is the basic estimation approach
           from the original K-FAC paper.  'empirical' computes the 'empirical'
           Fisher information matrix (which uses the data's distribution for the
@@ -69,6 +109,14 @@ class FisherEstimator(object):
           for each coordinate of the output instead of using 1/-1 vectors.  It
           is more expensive to compute than the other three options by a factor
           equal to the output dimension, roughly speaking.
+      colocate_gradients_with_ops: Whether we should request gradients be
+          colocated with their respective ops.
+      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
+      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
 
     Raises:
       ValueError: If no losses have been registered with layer_collection.
@@ -79,13 +127,19 @@ class FisherEstimator(object):
     self._estimation_mode = estimation_mode
     self._layers = layer_collection
     self._layers.create_subgraph()
-    self._check_registration(variables)
+    self._layers.check_registration(variables)
     self._gradient_fns = {
         "gradients": self._get_grads_lists_gradients,
         "empirical": self._get_grads_lists_empirical,
         "curvature_prop": self._get_grads_lists_curvature_prop,
         "exact": self._get_grads_lists_exact
     }
+    self._colocate_gradients_with_ops = colocate_gradients_with_ops
+    self._cov_device_context_generator = _DeviceContextGenerator(cov_devices)
+    if inv_devices == cov_devices:
+      self._inv_device_context_generator = self._cov_device_context_generator
+    else:
+      self._inv_device_context_generator = _DeviceContextGenerator(inv_devices)
     setup = self._setup(cov_ema_decay)
     self.cov_update_op, self.inv_update_op, self.inv_updates_dict = setup
 
@@ -148,49 +202,6 @@ class FisherEstimator(object):
     return self._apply_transformation(vecs_and_vars,
                                       lambda fb, vec: fb.multiply(vec))
 
-  def _check_registration(self, variables):
-    """Checks that all variable uses have been registered properly.
-
-    Args:
-      variables: List of variables.
-
-    Raises:
-      ValueError: If any registered variables are not included in the list.
-      ValueError: If any variable in the list is not registered.
-      ValueError: If any variable in the list is registered with the wrong
-          number of "uses" in the subgraph recorded (vs the number of times that
-          variable is actually used in the subgraph).
-    """
-    # Note that overlapping parameters (i.e. those that share variables) will
-    # be caught by layer_collection.LayerParametersDict during registration.
-
-    reg_use_map = self._layers.get_use_count_map()
-
-    error_messages = []
-
-    for var in variables:
-      total_uses = self._layers.subgraph.variable_uses(var)
-      reg_uses = reg_use_map[var]
-
-      if reg_uses == 0:
-        error_messages.append("Variable {} not registered.".format(var))
-      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
-        error_messages.append(
-            "Variable {} registered with wrong number of uses ({} "
-            "vs {} actual).".format(var, reg_uses, total_uses))
-
-    num_get_vars = len(reg_use_map)
-
-    if num_get_vars > len(variables):
-      error_messages.append("{} registered variables were not included in list."
-                            .format(num_get_vars - len(variables)))
-
-    if error_messages:
-      error_messages = [
-          "Found the following errors with variable registration:"
-      ] + error_messages
-      raise ValueError("\n\t".join(error_messages))
-
   def _setup(self, cov_ema_decay):
     """Sets up the various operations.
 
@@ -219,8 +230,13 @@ class FisherEstimator(object):
       raise ValueError("Unrecognized value {} for estimation_mode.".format(
           self._estimation_mode))
 
+    # TODO(b/68033310): This loop round-robins the "concat" operations which
+    # gather the inputs for the cov_updates. In future, we might do these
+    # computations locally then communicate the results, which would require a
+    # modification to this code.
     for grads_list, fb in zip(grads_lists, fisher_blocks_list):
-      fb.instantiate_factors(grads_list, self.damping)
+      with self._cov_device_context_generator():
+        fb.instantiate_factors(grads_list, self.damping)
 
     cov_updates = [
         factor.make_covariance_update_op(cov_ema_decay)
@@ -233,18 +249,23 @@ class FisherEstimator(object):
 
   def _get_all_inverse_update_ops(self):
     for factor in self._layers.get_factors():
-      for op in factor.make_inverse_update_ops():
-        yield op
+      with self._inv_device_context_generator():
+        for op in factor.make_inverse_update_ops():
+          yield op
 
   def _get_grads_lists_gradients(self, tensors):
-    grads_flat = gradients_impl.gradients(self._layers.total_sampled_loss(),
-                                          nest.flatten(tensors))
+    grads_flat = gradients_impl.gradients(
+        self._layers.total_sampled_loss(),
+        nest.flatten(tensors),
+        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
   def _get_grads_lists_empirical(self, tensors):
-    grads_flat = gradients_impl.gradients(self._layers.total_loss(),
-                                          nest.flatten(tensors))
+    grads_flat = gradients_impl.gradients(
+        self._layers.total_loss(),
+        nest.flatten(tensors),
+        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
@@ -262,11 +283,13 @@ class FisherEstimator(object):
     grads_flat = gradients_impl.gradients(
         nest.flatten(loss_inputs),
         nest.flatten(tensors),
-        grad_ys=nest.flatten(transformed_random_signs))
+        grad_ys=nest.flatten(transformed_random_signs),
+        colocate_gradients_with_ops=self._colocate_gradients_with_ops)
     grads_all = nest.pack_sequence_as(tensors, grads_flat)
     return tuple((grad,) for grad in grads_all)
 
   def _get_grads_lists_exact(self, tensors):
+    """No docstring required."""
     # Loop over all coordinates of all losses.
     grads_all = []
     for loss in self._layers.losses:
@@ -274,6 +297,9 @@ class FisherEstimator(object):
         transformed_one_hot = loss.multiply_fisher_factor_replicated_one_hot(
             index)
         grads_flat = gradients_impl.gradients(
-            loss.inputs, nest.flatten(tensors), grad_ys=transformed_one_hot)
+            loss.inputs,
+            nest.flatten(tensors),
+            grad_ys=transformed_one_hot,
+            colocate_gradients_with_ops=self._colocate_gradients_with_ops)
         grads_all.append(nest.pack_sequence_as(tensors, grads_flat))
     return zip(*grads_all)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index efffaaef8d56aed3a1cdbf2df1d8209d58b3502f..1ccb9e040f2bb6bcfd217886918abd40e3cc1cfb 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -38,6 +38,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import enum  # pylint: disable=g-bad-import-order
 
 import six
 
@@ -52,14 +53,54 @@ from tensorflow.python.ops import math_ops
 #   damping /= num_replications ** NORMALIZE_DAMPING_POWER
 NORMALIZE_DAMPING_POWER = 1.0
 
+# Methods for adjusting damping for FisherBlocks. See
+# _compute_pi_adjusted_damping() for details.
+PI_OFF_NAME = "off"
+PI_TRACENORM_NAME = "tracenorm"
+PI_TYPE = PI_TRACENORM_NAME
 
-def set_global_constants(normalize_damping_power=None):
+
+def set_global_constants(normalize_damping_power=None, pi_type=None):
   """Sets various global constants used by the classes in this module."""
   global NORMALIZE_DAMPING_POWER
+  global PI_TYPE
 
   if normalize_damping_power is not None:
     NORMALIZE_DAMPING_POWER = normalize_damping_power
 
+  if pi_type is not None:
+    PI_TYPE = pi_type
+
+
+def _compute_pi_tracenorm(left_cov, right_cov):
+  """Computes the scalar constant pi for Tikhonov regularization/damping.
+
+  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
+
+  Args:
+    left_cov: The left Kronecker factor "covariance".
+    right_cov: The right Kronecker factor "covariance".
+
+  Returns:
+    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
+  """
+  # Instead of dividing by the dim of the norm, we multiply by the dim of the
+  # other norm. This works out the same in the ratio.
+  left_norm = math_ops.trace(left_cov) * right_cov.shape.as_list()[0]
+  right_norm = math_ops.trace(right_cov) * left_cov.shape.as_list()[0]
+  return math_ops.sqrt(left_norm / right_norm)
+
+
+def _compute_pi_adjusted_damping(left_cov, right_cov, damping):
+
+  if PI_TYPE == PI_TRACENORM_NAME:
+    pi = _compute_pi_tracenorm(left_cov, right_cov)
+    return (damping * pi, damping / pi)
+
+  elif PI_TYPE == PI_OFF_NAME:
+    return (damping, damping)
+
 
 @six.add_metaclass(abc.ABCMeta)
 class FisherBlock(object):
@@ -133,16 +174,15 @@ class FullFB(FisherBlock):
   to any type of parameter in principle, but has very high variance.
   """
 
-  def __init__(self, layer_collection, params, batch_size):
+  def __init__(self, layer_collection, params):
     """Creates a FullFB block.
 
     Args:
       layer_collection: The collection of all layers in the K-FAC approximate
           Fisher information matrix to which this FisherBlock belongs.
       params: The parameters of this layer (Tensor or tuple of Tensors).
-      batch_size: The batch size, used in the covariance estimator.
     """
-    self._batch_size = batch_size
+    self._batch_sizes = []
     self._params = params
 
     super(FullFB, self).__init__(layer_collection)
@@ -154,7 +194,7 @@ class FullFB(FisherBlock):
     self._factor.register_damped_inverse(damping)
 
   def multiply_inverse(self, vector):
-    inverse = self._factor.get_inverse(self._damping)
+    inverse = self._factor.get_damped_inverse(self._damping)
     out_flat = math_ops.matmul(inverse, utils.tensors_to_column(vector))
     return utils.column_to_tensors(vector, out_flat)
 
@@ -172,9 +212,21 @@ class FullFB(FisherBlock):
   def tensors_to_compute_grads(self):
     return self._params
 
+  def register_additional_minibatch(self, batch_size):
+    """Register an additional minibatch.
+
+    Args:
+      batch_size: The batch size, used in the covariance estimator.
+    """
+    self._batch_sizes.append(batch_size)
+
   @property
   def num_registered_minibatches(self):
-    return 1  # Multiple minibatches not supported.
+    return len(self._batch_sizes)
+
+  @property
+  def _batch_size(self):
+    return math_ops.reduce_sum(self._batch_sizes)
 
 
 class NaiveDiagonalFB(FisherBlock):
@@ -186,17 +238,16 @@ class NaiveDiagonalFB(FisherBlock):
   to any type of parameter in principle, but has very high variance.
   """
 
-  def __init__(self, layer_collection, params, batch_size):
+  def __init__(self, layer_collection, params):
     """Creates a NaiveDiagonalFB block.
 
     Args:
       layer_collection: The collection of all layers in the K-FAC approximate
           Fisher information matrix to which this FisherBlock belongs.
       params: The parameters of this layer (Tensor or tuple of Tensors).
-      batch_size: The batch size, used in the covariance estimator.
     """
     self._params = params
-    self._batch_size = batch_size
+    self._batch_sizes = []
 
     super(NaiveDiagonalFB, self).__init__(layer_collection)
 
@@ -221,9 +272,21 @@ class NaiveDiagonalFB(FisherBlock):
   def tensors_to_compute_grads(self):
     return self._params
 
+  def register_additional_minibatch(self, batch_size):
+    """Register an additional minibatch.
+
+    Args:
+      batch_size: The batch size, used in the covariance estimator.
+    """
+    self._batch_sizes.append(batch_size)
+
   @property
   def num_registered_minibatches(self):
-    return 1  # Multiple minibatches not supported.
+    return len(self._batch_sizes)
+
+  @property
+  def _batch_size(self):
+    return math_ops.reduce_sum(self._batch_sizes)
 
 
 class FullyConnectedDiagonalFB(FisherBlock):
@@ -389,7 +452,7 @@ class ConvDiagonalFB(FisherBlock):
         (self._strides[1] * self._strides[2]))
 
     if NORMALIZE_DAMPING_POWER:
-      damping /= self._num_locations ** NORMALIZE_DAMPING_POWER
+      damping /= self._num_locations**NORMALIZE_DAMPING_POWER
     self._damping = damping
 
     self._factor = self._layer_collection.make_or_get_factor(
@@ -443,11 +506,10 @@ class KroneckerProductFB(FisherBlock):
     Args:
       damping: The base damping factor (float or Tensor) for the damped inverse.
     """
-    pi = utils.compute_pi(self._input_factor.get_cov(),
-                          self._output_factor.get_cov())
-
-    self._input_damping = math_ops.sqrt(damping) * pi
-    self._output_damping = math_ops.sqrt(damping) / pi
+    self._input_damping, self._output_damping = _compute_pi_adjusted_damping(
+        self._input_factor.get_cov(),
+        self._output_factor.get_cov(),
+        damping**0.5)
 
     self._input_factor.register_damped_inverse(self._input_damping)
     self._output_factor.register_damped_inverse(self._output_damping)
@@ -465,8 +527,9 @@ class KroneckerProductFB(FisherBlock):
     return 1.0
 
   def multiply_inverse(self, vector):
-    left_factor_inv = self._input_factor.get_inverse(self._input_damping)
-    right_factor_inv = self._output_factor.get_inverse(self._output_damping)
+    left_factor_inv = self._input_factor.get_damped_inverse(self._input_damping)
+    right_factor_inv = self._output_factor.get_damped_inverse(
+        self._output_damping)
     reshaped_vector = utils.layer_params_to_mat2d(vector)
     reshaped_out = math_ops.matmul(left_factor_inv,
                                    math_ops.matmul(reshaped_vector,
@@ -698,3 +761,260 @@ def _concat_along_batch_dim(tensor_list):
 def _num_conv_locations(input_shape, strides):
   """Returns the number of locations a Conv kernel is applied to."""
   return input_shape[1] * input_shape[2] // (strides[1] * strides[2])
+
+
+class FullyConnectedMultiIndepFB(KroneckerProductFB):
+  """FisherBlock for fully-connected layers that share parameters.
+  """
+
+  def __init__(self, layer_collection, inputs, outputs, has_bias=False):
+    """Creates a FullyConnectedMultiIndepFB block.
+
+    Args:
+      layer_collection: LayerCollection instance.
+      inputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
+        inputs_size].
+      outputs: list or tuple of Tensors. Each Tensor has shape [batch_size,
+        outputs_size].
+      has_bias: bool. If True, estimates Fisher with respect to a bias
+        parameter as well as the layer's parameters.
+    """
+
+    assert len(inputs) == len(outputs)
+    # We need to make sure inputs and outputs are tuples and not lists so that
+    # they get hashed by layer_collection.make_or_get_factor properly.
+    self._inputs = tuple(inputs)
+    self._outputs = tuple(outputs)
+    self._has_bias = has_bias
+    self._num_uses = len(inputs)
+
+    super(FullyConnectedMultiIndepFB, self).__init__(layer_collection)
+
+  @property
+  def num_registered_minibatches(self):
+    # TODO(b/69411207): Add support for registering additional minibatches.
+    return 1
+
+  def instantiate_factors(self, grads_list, damping):
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF,
+        ((self._inputs,), self._has_bias))
+
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+
+    if NORMALIZE_DAMPING_POWER:
+      damping /= self._num_uses**NORMALIZE_DAMPING_POWER
+
+    self._register_damped_input_and_output_inverses(damping)
+
+  @property
+  def _renorm_coeff(self):
+    return self._num_uses
+
+  def tensors_to_compute_grads(self):
+    return self._outputs
+
+  def num_inputs(self):
+    return len(self._inputs)
+
+
+class SeriesFBApproximation(enum.IntEnum):
+  """See FullyConnectedSeriesFB.__init__ for description and usage."""
+  option1 = 1
+  option2 = 2
+
+
+class FullyConnectedSeriesFB(FisherBlock):
+  """FisherBlock for fully-connected layers that share parameters across time.
+
+  See the following preprint for details:
+    https://openreview.net/pdf?id=HyMTkQZAb
+
+  See the end of the appendix of the paper for a pseudo-code of the
+  algorithm being implemented by multiply_inverse here.  Note that we are
+  using pre-computed versions of certain matrix-matrix products to speed
+  things up.  This is explicitly explained wherever it is done.
+  """
+
+  def __init__(self,
+               layer_collection,
+               inputs,
+               outputs,
+               has_bias=False,
+               option=SeriesFBApproximation.option2):
+    """Constructs a new `FullyConnectedSeriesFB`.
+
+    Args:
+      layer_collection: The collection of all layers in the K-FAC approximate
+        Fisher information matrix to which this FisherBlock belongs.
+      inputs: List of tensors of shape [batch_size, input_size].
+        Inputs to the layer.
+      outputs: List of tensors of shape [batch_size, input_size].
+        Outputs of the layer (before activations).
+      has_bias: Whether the layer includes a bias parameter.
+      option: A `SeriesFBApproximation` specifying the simplifying assumption
+        to be used in this block. `option1` approximates the cross-covariance
+        over time as a symmetric matrix, while `option2` makes
+        the assumption that training sequences are infinitely long. See section
+        3.5 of the paper for more details.
+    """
+
+    assert len(inputs) == len(outputs)
+    # We need to make sure inputs and outputs are tuples and not lists so that
+    # they get hashed by layer_collection.make_or_get_factor properly.
+    self._inputs = tuple(inputs)
+    self._outputs = tuple(outputs)
+    self._has_bias = has_bias
+    self._num_timesteps = len(inputs)
+    self._option = option
+
+    super(FullyConnectedSeriesFB, self).__init__(layer_collection)
+
+  @property
+  def num_registered_minibatches(self):
+    # TODO(b/69411207): Add support for registering additional minibatches.
+    return 1
+
+  def instantiate_factors(self, grads_list, damping):
+
+    self._input_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, ((self._inputs,), self._has_bias))
+
+    self._output_factor = self._layer_collection.make_or_get_factor(
+        fisher_factors.FullyConnectedMultiKF, (grads_list,))
+
+    if NORMALIZE_DAMPING_POWER:
+      damping /= self._num_timesteps**NORMALIZE_DAMPING_POWER
+
+    self._damping_input, self._damping_output = _compute_pi_adjusted_damping(
+        self._input_factor.get_cov(),
+        self._output_factor.get_cov(),
+        damping**0.5)
+
+    if self._option == SeriesFBApproximation.option1:
+      self._input_factor.register_option1quants(self._damping_input)
+      self._output_factor.register_option1quants(self._damping_output)
+    elif self._option == SeriesFBApproximation.option2:
+      self._input_factor.register_option2quants(self._damping_input)
+      self._output_factor.register_option2quants(self._damping_output)
+    else:
+      raise ValueError(
+          "Unrecognized FullyConnectedSeriesFB approximation: {}".format(
+              self._option))
+
+  def multiply_inverse(self, vector):
+    # pylint: disable=invalid-name
+
+    Z = utils.layer_params_to_mat2d(vector)
+
+    # Derivations were done for "batch_dim==1" case so we need to convert to
+    # that orientation:
+    Z = array_ops.transpose(Z)
+
+    if self._option == SeriesFBApproximation.option1:
+
+      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      L_A, psi_A = self._input_factor.get_option1quants(self._damping_input)
+      L_G, psi_G = self._output_factor.get_option1quants(self._damping_output)
+
+      def gamma(x):
+        # We are assuming that each case has the same number of time-steps.
+        # If this stops being the case one shouldn't simply replace this T
+        # with its average value.  Instead, one needs to go back to the
+        # definition of the gamma function from the paper.
+        T = self._num_timesteps
+        return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
+
+      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # Even though Y is Z-independent we are recomputing it from the psi's
+      # each since Y depends on both A and G quantities, and it is relatively
+      # cheap to compute.
+      Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
+
+      # Z = L_G^T * Z * L_A
+      # This is equivalent to the following computation from the original
+      # pseudo-code:
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # Z = U_G^T * Z * U_A
+      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
+
+      # Z = Z .* Y
+      Z *= Y
+
+      # Z = L_G * Z * L_A^T
+      # This is equivalent to the following computation from the original
+      # pseudo-code:
+      # Z = U_G * Z * U_A^T
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
+
+    elif self._option == SeriesFBApproximation.option2:
+
+      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
+      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      P_A, K_A, mu_A = self._input_factor.get_option2quants(self._damping_input)
+      P_G, K_G, mu_G = self._output_factor.get_option2quants(
+          self._damping_output)
+
+      # Our approach differs superficially from the pseudo-code in the paper
+      # in order to reduce the total number of matrix-matrix multiplies.
+      # In particular, the first three computations in the pseudo code are
+      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # Z = Z - hPsi_G^T * Z * hPsi_A
+      # Z = E_G^T * Z * E_A
+      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
+      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # the entire computation can be written as
+      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
+      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
+      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
+      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
+      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
+      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # This final expression is computed by the following two lines:
+      # Z = Z - P_G * Z * P_A^T
+      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
+      # Z = K_G^T * Z * K_A
+      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
+
+      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # Be careful with the outer product.  We don't want to accidentally
+      # make it an inner-product instead.
+      tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
+      # Prevent some numerical issues by setting any 0.0 eigs to 1.0
+      tmp += 1.0 * math_ops.cast(math_ops.equal(tmp, 0.0), dtype=tmp.dtype)
+      Z /= tmp
+
+      # We now perform the transpose/reverse version of the operations
+      # derived above, whose derivation from the original pseudo-code is
+      # analgous.
+      # Z = K_G * Z * K_A^T
+      Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
+
+      # Z = Z - P_G^T * Z * P_A
+      Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
+
+      # Z = normalize (1/E[T]) * Z
+      # Note that this normalization is done because we compute the statistics
+      # by averaging, not summing, over time. (And the gradient is presumably
+      # summed over time, not averaged, and thus their scales are different.)
+      Z /= math_ops.cast(self._num_timesteps, Z.dtype)
+
+    # Convert back to the "batch_dim==0" orientation.
+    Z = array_ops.transpose(Z)
+
+    return utils.mat2d_to_layer_params(vector, Z)
+
+    # pylint: enable=invalid-name
+
+  def multiply(self, vector):
+    raise NotImplementedError
+
+  def tensors_to_compute_grads(self):
+    return self._outputs
+
+  def num_inputs(self):
+    return len(self._inputs)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index 4e36813369e69de1d6f13ddb00566bda912244f6..5a6d1a93ff217c3922f45a047b4d548086ac5258 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import contextlib
 
 import numpy as np
 import six
@@ -26,6 +27,8 @@ import six
 from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
@@ -50,7 +53,22 @@ EIGENVALUE_DECOMPOSITION_THRESHOLD = 2
 EIGENVALUE_CLIPPING_THRESHOLD = 0.0
 
 
-def set_global_constants(init_covariances_at_zero=None, zero_debias=None,
+@contextlib.contextmanager
+def _maybe_colocate_with(op, colocate_cov_ops_with_inputs):
+  """Context to colocate with `op` if `colocate_cov_ops_with_inputs`."""
+  if colocate_cov_ops_with_inputs:
+    if isinstance(op, (list, tuple)):
+      with tf_ops.colocate_with(op[0]):
+        yield
+    else:
+      with tf_ops.colocate_with(op):
+        yield
+  else:
+    yield
+
+
+def set_global_constants(init_covariances_at_zero=None,
+                         zero_debias=None,
                          eigenvalue_decomposition_threshold=None,
                          eigenvalue_clipping_threshold=None):
   """Sets various global constants used by the classes in this module."""
@@ -85,7 +103,7 @@ def diagonal_covariance_initializer(shape, dtype, partition_info):  # pylint: di
   return array_ops.ones(shape, dtype)
 
 
-def _compute_cov(tensor, normalizer=None):
+def _compute_cov(tensor, tensor_right=None, normalizer=None):
   """Compute the empirical second moment of the rows of a 2D Tensor.
 
   This function is meant to be applied to random matrices for which the true row
@@ -93,6 +111,8 @@ def _compute_cov(tensor, normalizer=None):
 
   Args:
     tensor: A 2D Tensor.
+    tensor_right: An optional 2D Tensor. If provided, this function computes
+      the matrix product tensor^T * tensor_right instead of tensor^T * tensor.
     normalizer: optional scalar for the estimator (by default, the normalizer is
         the number of rows of tensor).
 
@@ -101,9 +121,14 @@ def _compute_cov(tensor, normalizer=None):
   """
   if normalizer is None:
     normalizer = array_ops.shape(tensor)[0]
-  cov = (math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
-      normalizer, tensor.dtype))
-  return (cov + array_ops.transpose(cov)) / math_ops.cast(2, cov.dtype)
+  if tensor_right is None:
+    cov = (
+        math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
+            normalizer, tensor.dtype))
+    return (cov + array_ops.transpose(cov)) / math_ops.cast(2.0, cov.dtype)
+  else:
+    return (math_ops.matmul(tensor, tensor_right, transpose_a=True) /
+            math_ops.cast(normalizer, tensor.dtype))
 
 
 def _append_homog(tensor):
@@ -119,7 +144,7 @@ def _append_homog(tensor):
   rank = len(tensor.shape.as_list())
   shape = array_ops.concat([array_ops.shape(tensor)[:-1], [1]], axis=0)
   ones = array_ops.ones(shape, dtype=tensor.dtype)
-  return array_ops.concat([tensor, ones], axis=rank-1)
+  return array_ops.concat([tensor, ones], axis=rank - 1)
 
 
 def scope_string_from_params(params):
@@ -157,8 +182,8 @@ def scope_string_from_params(params):
     elif isinstance(param, (tf_ops.Tensor, variables.Variable)):
       name_parts.append(scope_string_from_name(param))
     else:
-      raise ValueError(
-          "Encountered an unsupported param type {}".format(type(param)))
+      raise ValueError("Encountered an unsupported param type {}".format(
+          type(param)))
   return "_".join(name_parts)
 
 
@@ -209,6 +234,10 @@ class FisherFactor(object):
     """
     pass
 
+  @abc.abstractproperty
+  def _dtype(self):
+    pass
+
   @property
   def _cov_initializer(self):
     return covariance_initializer
@@ -220,7 +249,8 @@ class FisherFactor(object):
           "cov",
           initializer=self._cov_initializer,
           shape=self._cov_shape,
-          trainable=False)
+          trainable=False,
+          dtype=self._dtype)
 
   @abc.abstractmethod
   def _compute_new_cov(self, idx=0):
@@ -240,9 +270,10 @@ class FisherFactor(object):
     return moving_averages.assign_moving_average(
         self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
 
+  @abc.abstractmethod
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
-    return []
+    pass
 
   def get_cov(self):
     return self._cov
@@ -257,6 +288,13 @@ class InverseProvidingFactor(FisherFactor):
   _cov_shape properties.
   """
 
+  # TODO(b/69108481): This class (and its subclasses) should be refactored to
+  # serve the matrix quantities it computes as both (potentially stale)
+  # variables, updated by the inverse update ops, and fresh values stored in
+  # tensors that recomputed once every session.run() call.  Currently matpower
+  # and damp_inverse have the former behavior, while eigendecomposition has
+  # the latter.
+
   def __init__(self):
     self._inverses_by_damping = {}
     self._matpower_by_exp_and_damping = {}
@@ -267,6 +305,10 @@ class InverseProvidingFactor(FisherFactor):
   def register_damped_inverse(self, damping):
     """Registers a damped inverse needed by a FisherBlock.
 
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_inverse.
+
     Args:
       damping: The damping value (float or Tensor) for this factor.
     """
@@ -277,12 +319,17 @@ class InverseProvidingFactor(FisherFactor):
             "inv_damp{}".format(damping_string),
             initializer=inverse_initializer,
             shape=self._cov_shape,
-            trainable=False)
+            trainable=False,
+            dtype=self._dtype)
       self._inverses_by_damping[damping] = inv
 
   def register_matpower(self, exp, damping):
     """Registers a matrix power needed by a FisherBlock.
 
+    This creates a variable and signals make_inverse_update_ops to make the
+    corresponding update op.  The variable can be read via the method
+    get_matpower.
+
     Args:
       exp: The exponent (float or Tensor) to raise the matrix to.
       damping: The damping value (float or Tensor).
@@ -295,57 +342,78 @@ class InverseProvidingFactor(FisherFactor):
             "matpower_exp{}_damp{}".format(exp_string, damping_string),
             initializer=inverse_initializer,
             shape=self._cov_shape,
-            trainable=False)
+            trainable=False,
+            dtype=self._dtype)
       self._matpower_by_exp_and_damping[(exp, damping)] = matpower
 
   def register_eigendecomp(self):
-    """Registers that an eigendecomposition is needed by a FisherBlock."""
+    """Registers an eigendecomposition.
+
+    Unlike register_damp_inverse and register_matpower this doesn't create
+    any variables or inverse ops.  Instead it merely makes tensors containing
+    the eigendecomposition available to anyone that wants them.  They will be
+    recomputed (once) for each session.run() call (when they needed by some op).
+    """
     if not self._eigendecomp:
-      self._eigendecomp = linalg_ops.self_adjoint_eig(self._cov)
+      eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self._cov)
+
+      # The matrix self._cov is positive semidefinite by construction, but the
+      # numerical eigenvalues could be negative due to numerical errors, so here
+      # we clip them to be at least FLAGS.eigenvalue_clipping_threshold
+      clipped_eigenvalues = math_ops.maximum(eigenvalues,
+                                             EIGENVALUE_CLIPPING_THRESHOLD)
+      self._eigendecomp = (clipped_eigenvalues, eigenvectors)
 
   def make_inverse_update_ops(self):
     """Create and return update ops corresponding to registered computations."""
-    ops = super(InverseProvidingFactor, self).make_inverse_update_ops()
+    ops = []
 
     num_inverses = len(self._inverses_by_damping)
     matrix_power_registered = bool(self._matpower_by_exp_and_damping)
-    use_eig = (self._eigendecomp or matrix_power_registered or
-               num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
+    use_eig = (
+        self._eigendecomp or matrix_power_registered or
+        num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)
 
     if use_eig:
       self.register_eigendecomp()  # ensures self._eigendecomp is set
       eigenvalues, eigenvectors = self._eigendecomp  # pylint: disable=unpacking-non-sequence
 
-      # The matrix self._cov is positive semidefinite by construction, but the
-      # numerical eigenvalues could be negative due to numerical errors, so here
-      # we clip them to be at least EIGENVALUE_CLIPPING_THRESHOLD.
-      clipped_eigenvalues = math_ops.maximum(eigenvalues,
-                                             EIGENVALUE_CLIPPING_THRESHOLD)
-
       for damping, inv in self._inverses_by_damping.items():
         ops.append(
             inv.assign(
-                math_ops.matmul(eigenvectors / (clipped_eigenvalues + damping),
+                math_ops.matmul(eigenvectors / (eigenvalues + damping),
                                 array_ops.transpose(eigenvectors))))
 
       for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
         ops.append(
             matpower.assign(
-                math_ops.matmul(eigenvectors * (clipped_eigenvalues + damping)**
-                                exp, array_ops.transpose(eigenvectors))))
+                math_ops.matmul(eigenvectors *
+                                (eigenvalues + damping)**exp,
+                                array_ops.transpose(eigenvectors))))
+      # These ops share computation and should be run on a single device.
+      ops = [control_flow_ops.group(*ops)]
     else:
       for damping, inv in self._inverses_by_damping.items():
         ops.append(inv.assign(utils.posdef_inv(self._cov, damping)))
 
     return ops
 
-  def get_inverse(self, damping):
+  def get_damped_inverse(self, damping):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
     return self._inverses_by_damping[damping]
 
   def get_matpower(self, exp, damping):
+    # Note that this function returns a variable which gets updated by the
+    # inverse ops.  It may be stale / inconsistent with the latest value of
+    # get_cov().
     return self._matpower_by_exp_and_damping[(exp, damping)]
 
   def get_eigendecomp(self):
+    # Unlike get_inverse and get_matpower this doesn't retrieve a stored
+    # variable, but instead always computes a fresh version from the current
+    # value of get_cov().
     return self._eigendecomp
 
 
@@ -356,12 +424,21 @@ class FullFactor(InverseProvidingFactor):
   to any type of parameter in principle, but has very high variance.
   """
 
-  def __init__(self, params_grads, batch_size):
+  def __init__(self,
+               params_grads,
+               batch_size,
+               colocate_cov_ops_with_inputs=False):
     self._batch_size = batch_size
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     self._orig_params_grads_name = scope_string_from_params(
         [params_grads, self._batch_size])
-    self._params_grads_flat = tuple(
-        utils.tensors_to_column(params_grad) for params_grad in params_grads)
+    params_grads_flat = []
+    for params_grad in params_grads:
+      with _maybe_colocate_with(params_grad,
+                                self._colocate_cov_ops_with_inputs):
+        col = utils.tensors_to_column(params_grad)
+        params_grads_flat.append(col)
+    self._params_grads_flat = tuple(params_grads_flat)
     super(FullFactor, self).__init__()
 
   @property
@@ -377,11 +454,17 @@ class FullFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._params_grads_flat)
 
+  @property
+  def _dtype(self):
+    return self._params_grads_flat[0].dtype
+
   def _compute_new_cov(self, idx=0):
     # This will be a very basic rank 1 estimate
-    return ((self._params_grads_flat[idx] * array_ops.transpose(
-        self._params_grads_flat[idx])) / math_ops.cast(
-            self._batch_size, self._params_grads_flat[idx].dtype))
+    with _maybe_colocate_with(self._params_grads_flat[idx],
+                              self._colocate_cov_ops_with_inputs):
+      return ((self._params_grads_flat[idx] * array_ops.transpose(
+          self._params_grads_flat[idx])) / math_ops.cast(
+              self._batch_size, self._params_grads_flat[idx].dtype))
 
 
 class DiagonalFactor(FisherFactor):
@@ -394,6 +477,9 @@ class DiagonalFactor(FisherFactor):
   def _cov_initializer(self):
     return diagonal_covariance_initializer
 
+  def make_inverse_update_ops(self):
+    return []
+
 
 class NaiveDiagonalFactor(DiagonalFactor):
   """FisherFactor for a diagonal approximation of any type of param's Fisher.
@@ -402,10 +488,19 @@ class NaiveDiagonalFactor(DiagonalFactor):
   to any type of parameter in principle, but has very high variance.
   """
 
-  def __init__(self, params_grads, batch_size):
+  def __init__(self,
+               params_grads,
+               batch_size,
+               colocate_cov_ops_with_inputs=False):
     self._batch_size = batch_size
-    self._params_grads = tuple(
-        utils.tensors_to_column(params_grad) for params_grad in params_grads)
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
+    params_grads_flat = []
+    for params_grad in params_grads:
+      with _maybe_colocate_with(params_grad,
+                                self._colocate_cov_ops_with_inputs):
+        col = utils.tensors_to_column(params_grad)
+        params_grads_flat.append(col)
+    self._params_grads = tuple(params_grads_flat)
     self._orig_params_grads_name = scope_string_from_params(
         [self._params_grads, self._batch_size])
     super(NaiveDiagonalFactor, self).__init__()
@@ -422,9 +517,15 @@ class NaiveDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._params_grads)
 
+  @property
+  def _dtype(self):
+    return self._params_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    return (math_ops.square(self._params_grads[idx]) / math_ops.cast(
-        self._batch_size, self._params_grads[idx].dtype))
+    with _maybe_colocate_with(self._params_grads[idx],
+                              self._colocate_cov_ops_with_inputs):
+      return (math_ops.square(self._params_grads[idx]) / math_ops.cast(
+          self._batch_size, self._params_grads[idx].dtype))
 
 
 class FullyConnectedDiagonalFactor(DiagonalFactor):
@@ -440,7 +541,11 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
 
   # TODO(jamesmartens): add units tests for this class
 
-  def __init__(self, inputs, outputs_grads, has_bias=False):
+  def __init__(self,
+               inputs,
+               outputs_grads,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Instantiate FullyConnectedDiagonalFactor.
 
     Args:
@@ -449,18 +554,22 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
       outputs_grads: List of Tensors of shape [batch_size, output_size].
         Gradient of loss with respect to layer's preactivations.
       has_bias: bool. If True, append '1' to each input.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._outputs_grads = outputs_grads
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     self._batch_size = array_ops.shape(inputs)[0]
-    self._orig_tensors_name = scope_string_from_params((inputs,) +
-                                                       tuple(outputs_grads))
+    self._orig_tensors_name = scope_string_from_params(
+        (inputs,) + tuple(outputs_grads))
 
     # Note that we precompute the required operations on the inputs since the
     # inputs don't change with the 'idx' argument to _compute_new_cov.  (Only
     # the target entry of _outputs_grads changes with idx.)
-    if has_bias:
-      inputs = _append_homog(inputs)
-    self._squared_inputs = math_ops.square(inputs)
+    with _maybe_colocate_with(inputs, self._colocate_cov_ops_with_inputs):
+      if has_bias:
+        inputs = _append_homog(inputs)
+      self._squared_inputs = math_ops.square(inputs)
 
     super(FullyConnectedDiagonalFactor, self).__init__()
 
@@ -476,17 +585,23 @@ class FullyConnectedDiagonalFactor(DiagonalFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
     # The well-known special formula that uses the fact that the entry-wise
     # square of an outer product is the outer-product of the entry-wise squares.
     # The gradient is the outer product of the input and the output gradients,
     # so we just square both and then take their outer-product.
-    new_cov = math_ops.matmul(
-        self._squared_inputs,
-        math_ops.square(self._outputs_grads[idx]),
-        transpose_a=True)
-    new_cov /= math_ops.cast(self._batch_size, new_cov.dtype)
-    return new_cov
+    with _maybe_colocate_with(self._squared_inputs,
+                              self._colocate_cov_ops_with_inputs):
+      new_cov = math_ops.matmul(
+          self._squared_inputs,
+          math_ops.square(self._outputs_grads[idx]),
+          transpose_a=True)
+      new_cov /= math_ops.cast(self._batch_size, new_cov.dtype)
+      return new_cov
 
 
 class ConvDiagonalFactor(DiagonalFactor):
@@ -494,8 +609,14 @@ class ConvDiagonalFactor(DiagonalFactor):
 
   # TODO(jamesmartens): add units tests for this class
 
-  def __init__(self, inputs, outputs_grads, filter_shape, strides, padding,
-               has_bias=False):
+  def __init__(self,
+               inputs,
+               outputs_grads,
+               filter_shape,
+               strides,
+               padding,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Creates a ConvDiagonalFactor object.
 
     Args:
@@ -510,29 +631,36 @@ class ConvDiagonalFactor(DiagonalFactor):
       padding: The padding in this layer (1-D of Tensor length 4).
       has_bias: Python bool. If True, the layer is assumed to have a bias
         parameter in addition to its filter parameter.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._filter_shape = filter_shape
     self._has_bias = has_bias
     self._outputs_grads = outputs_grads
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
 
-    self._orig_tensors_name = scope_string_from_name((inputs,)
-                                                     + tuple(outputs_grads))
+    self._orig_tensors_name = scope_string_from_name(
+        (inputs,) + tuple(outputs_grads))
 
     # Note that we precompute the required operations on the inputs since the
     # inputs don't change with the 'idx' argument to _compute_new_cov.  (Only
     # the target entry of _outputs_grads changes with idx.)
-    filter_height, filter_width, _, _ = self._filter_shape
-    patches = array_ops.extract_image_patches(
-        inputs,
-        ksizes=[1, filter_height, filter_width, 1],
-        strides=strides,
-        rates=[1, 1, 1, 1],
-        padding=padding)
+    with _maybe_colocate_with(inputs, self._colocate_cov_ops_with_inputs):
+      filter_height, filter_width, _, _ = self._filter_shape
 
-    if has_bias:
-      patches = _append_homog(patches)
+      # TODO(b/64144716): there is potential here for a big savings in terms of
+      # memory use.
+      patches = array_ops.extract_image_patches(
+          inputs,
+          ksizes=[1, filter_height, filter_width, 1],
+          strides=strides,
+          rates=[1, 1, 1, 1],
+          padding=padding)
+
+      if has_bias:
+        patches = _append_homog(patches)
 
-    self._patches = patches
+      self._patches = patches
 
     super(ConvDiagonalFactor, self).__init__()
 
@@ -543,21 +671,29 @@ class ConvDiagonalFactor(DiagonalFactor):
   @property
   def _cov_shape(self):
     filter_height, filter_width, in_channels, out_channels = self._filter_shape
-    return [filter_height * filter_width * in_channels + self._has_bias,
-            out_channels]
+    return [
+        filter_height * filter_width * in_channels + self._has_bias,
+        out_channels
+    ]
 
   @property
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    outputs_grad = self._outputs_grads[idx]
-    batch_size = array_ops.shape(self._patches)[0]
+    with _maybe_colocate_with(self._outputs_grads[idx],
+                              self._colocate_cov_ops_with_inputs):
+      outputs_grad = self._outputs_grads[idx]
+      batch_size = array_ops.shape(self._patches)[0]
 
-    new_cov = self._convdiag_sum_of_squares(self._patches, outputs_grad)
-    new_cov /= math_ops.cast(batch_size, new_cov.dtype)
+      new_cov = self._convdiag_sum_of_squares(self._patches, outputs_grad)
+      new_cov /= math_ops.cast(batch_size, new_cov.dtype)
 
-    return new_cov
+      return new_cov
 
   def _convdiag_sum_of_squares(self, patches, outputs_grad):
     # This computes the sum of the squares of the per-training-case "gradients".
@@ -572,19 +708,24 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   """Kronecker factor for the input or output side of a fully-connected layer.
   """
 
-  def __init__(self, tensors, has_bias=False):
+  def __init__(self,
+               tensors,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Instantiate FullyConnectedKroneckerFactor.
 
     Args:
       tensors: List of Tensors of shape [batch_size, n]. Represents either a
         layer's inputs or its output's gradients.
-      has_bias: bool. If True, assume this factor is for the layer's inputs and
-        append '1' to each row.
+      has_bias: bool. If True, append '1' to each row.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     # The tensor argument is either a tensor of input activations or a tensor of
     # output pre-activation gradients.
     self._has_bias = has_bias
     self._tensors = tensors
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(FullyConnectedKroneckerFactor, self).__init__()
 
   @property
@@ -601,11 +742,17 @@ class FullyConnectedKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._tensors)
 
+  @property
+  def _dtype(self):
+    return self._tensors[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    tensor = self._tensors[idx]
-    if self._has_bias:
-      tensor = _append_homog(tensor)
-    return _compute_cov(tensor)
+    with _maybe_colocate_with(self._tensors[idx],
+                              self._colocate_cov_ops_with_inputs):
+      tensor = self._tensors[idx]
+      if self._has_bias:
+        tensor = _append_homog(tensor)
+      return _compute_cov(tensor)
 
 
 class ConvInputKroneckerFactor(InverseProvidingFactor):
@@ -618,7 +765,13 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   Section 3.1 Estimating the factors.
   """
 
-  def __init__(self, inputs, filter_shape, strides, padding, has_bias=False):
+  def __init__(self,
+               inputs,
+               filter_shape,
+               strides,
+               padding,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
     """Initializes ConvInputKroneckerFactor.
 
     Args:
@@ -630,12 +783,15 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
         width_stride, in_channel_stride].
       padding: str. Padding method for layer. "SAME" or "VALID".
       has_bias: bool. If True, append 1 to in_channel.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._filter_shape = filter_shape
     self._strides = strides
     self._padding = padding
     self._has_bias = has_bias
     self._inputs = inputs
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(ConvInputKroneckerFactor, self).__init__()
 
   @property
@@ -655,26 +811,34 @@ class ConvInputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return 1
 
+  @property
+  def _dtype(self):
+    return self._inputs.dtype
+
   def _compute_new_cov(self, idx=0):
     if idx != 0:
       raise ValueError("ConvInputKroneckerFactor only supports idx = 0")
 
     # TODO(jamesmartens): factor this patches stuff out into a utility function
-    filter_height, filter_width, in_channels, _ = self._filter_shape
-    patches = array_ops.extract_image_patches(
-        self._inputs,
-        ksizes=[1, filter_height, filter_width, 1],
-        strides=self._strides,
-        rates=[1, 1, 1, 1],
-        padding=self._padding)
+    with _maybe_colocate_with(self._inputs, self._colocate_cov_ops_with_inputs):
+      filter_height, filter_width, in_channels, _ = self._filter_shape
 
-    flatten_size = (filter_height * filter_width * in_channels)
-    patches_flat = array_ops.reshape(patches, [-1, flatten_size])
+      # TODO(b/64144716): there is potential here for a big savings in terms of
+      # memory use.
+      patches = array_ops.extract_image_patches(
+          self._inputs,
+          ksizes=[1, filter_height, filter_width, 1],
+          strides=self._strides,
+          rates=[1, 1, 1, 1],
+          padding=self._padding)
 
-    if self._has_bias:
-      patches_flat = _append_homog(patches_flat)
+      flatten_size = (filter_height * filter_width * in_channels)
+      patches_flat = array_ops.reshape(patches, [-1, flatten_size])
 
-    return _compute_cov(patches_flat)
+      if self._has_bias:
+        patches_flat = _append_homog(patches_flat)
+
+      return _compute_cov(patches_flat)
 
 
 class ConvOutputKroneckerFactor(InverseProvidingFactor):
@@ -688,15 +852,18 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   Section 3.1 Estimating the factors.
   """
 
-  def __init__(self, outputs_grads):
+  def __init__(self, outputs_grads, colocate_cov_ops_with_inputs=False):
     """Initializes ConvOutputKroneckerFactor.
 
     Args:
       outputs_grads: list of Tensors. Each Tensor is of shape
-        [batch_size, height, width, out_channels].
+          [batch_size, height, width, out_channels].
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+          their inputs.
     """
     self._out_channels = outputs_grads[0].shape.as_list()[3]
     self._outputs_grads = outputs_grads
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
     super(ConvOutputKroneckerFactor, self).__init__()
 
   @property
@@ -712,7 +879,286 @@ class ConvOutputKroneckerFactor(InverseProvidingFactor):
   def _num_sources(self):
     return len(self._outputs_grads)
 
+  @property
+  def _dtype(self):
+    return self._outputs_grads[0].dtype
+
   def _compute_new_cov(self, idx=0):
-    reshaped_tensor = array_ops.reshape(self._outputs_grads[idx],
-                                        [-1, self._out_channels])
-    return _compute_cov(reshaped_tensor)
+    with _maybe_colocate_with(self._outputs_grads[idx],
+                              self._colocate_cov_ops_with_inputs):
+      reshaped_tensor = array_ops.reshape(self._outputs_grads[idx],
+                                          [-1, self._out_channels])
+      return _compute_cov(reshaped_tensor)
+
+
+class FullyConnectedMultiKF(InverseProvidingFactor):
+  """Kronecker factor for a fully connected recurrent layer."""
+
+  def __init__(self,
+               tensor_lists,
+               has_bias=False,
+               colocate_cov_ops_with_inputs=False):
+    """Constructs a new `FullyConnectedMultiKF`.
+
+    Args:
+      tensor_lists: List of lists of Tensors of shape [batch_size, n].
+      has_bias: bool. If True, '1' is appended to each row.
+      colocate_cov_ops_with_inputs: Whether to colocate cov_update ops with
+        their inputs.
+    """
+
+    self._orig_tensors_name = scope_string_from_params(tensor_lists)
+    self._batch_size = array_ops.shape(tensor_lists[0][0])[0]
+    self._num_timesteps = len(tensor_lists[0])
+
+    tensors = tuple(
+        array_ops.concat(tensor_list, 0) for tensor_list in tensor_lists)
+    if has_bias:
+      tensors = tuple(_append_homog(tensor) for tensor in tensors)
+    self._tensors = tensors
+
+    self._cov_dt1 = None
+    self._option1quants_by_damping = {}
+    self._option2quants_by_damping = {}
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
+
+    super(FullyConnectedMultiKF, self).__init__()
+
+  @property
+  def _var_scope(self):
+    return "ff_fc_multi/" + self._orig_tensors_name
+
+  @property
+  def _num_sources(self):
+    return len(self._tensors)
+
+  @property
+  def _dtype(self):
+    return self._tensors[0].dtype
+
+  def make_covariance_update_op(self, ema_decay):
+    with _maybe_colocate_with(self._tensors,
+                              self._colocate_cov_ops_with_inputs):
+      op = super(FullyConnectedMultiKF,
+                 self).make_covariance_update_op(ema_decay)
+
+      if self._cov_dt1 is not None:
+        new_cov_dt1 = math_ops.add_n(
+            tuple(
+                self._compute_new_cov_dt1(idx)
+                for idx in range(self._num_sources)))
+        op2 = moving_averages.assign_moving_average(
+            self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
+
+        # TODO(b/69112164):
+        # It's important that _cov and _cov_dt1 remain consistent with each
+        # other while the inverse ops are happening. How can we ensure this?
+        # We will need to add explicit synchronization for this to
+        # work with asynchronous training.
+        op = control_flow_ops.group(op, op2)
+
+    return op
+
+  def _compute_new_cov(self, idx=0):
+    tensor = self._tensors[idx]
+    normalizer = self._num_timesteps * self._batch_size
+    return _compute_cov(tensor, normalizer=normalizer)
+
+  def _compute_new_cov_dt1(self, idx=0):
+    tensor = self._tensors[idx]
+    normalizer = self._num_timesteps * self._batch_size
+    tensor_present = tensor[:-self._batch_size, :]
+    tensor_future = tensor[self._batch_size:, :]
+    return _compute_cov(
+        tensor_future, tensor_right=tensor_present, normalizer=normalizer)
+
+  @property
+  def _cov_shape(self):
+    size = self._tensors[0].shape[1]
+    return [size, size]
+
+  @property
+  def _vec_shape(self):
+    size = self._tensors[0].shape[1]
+    return [size]
+
+  def get_option1quants(self, damping):
+    return self._option1quants_by_damping[damping]
+
+  def get_option2quants(self, damping):
+    return self._option2quants_by_damping[damping]
+
+  def get_cov_dt1(self):
+    assert self._cov_dt1 is not None
+    return self._cov_dt1
+
+  def register_cov_dt1(self):
+    """Create a variable representing temporal cross-covariance.
+
+    (This is technically the second moment, not covariance, since it's
+    not mean subtracted.)
+    """
+    if self._cov_dt1 is None:
+      with variable_scope.variable_scope(self._var_scope):
+        self._cov_dt1 = variable_scope.get_variable(
+            "cov_dt1",
+            initializer=init_ops.zeros_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+  def register_option1quants(self, damping):
+
+    self.register_eigendecomp()
+    self.register_cov_dt1()
+
+    if damping not in self._option1quants_by_damping:
+      # It's questionable as to whether we should initialize with stuff like
+      # this at all.  Ideally these values should never be used until they are
+      # updated at least once.
+      damping_string = scalar_or_tensor_to_string(damping)
+      with variable_scope.variable_scope(self._var_scope):
+        Lmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Lmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        psi = variable_scope.get_variable(
+            "psi_damp{}".format(damping_string),
+            initializer=init_ops.ones_initializer,
+            shape=self._vec_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+      self._option1quants_by_damping[damping] = (Lmat, psi)
+
+  def register_option2quants(self, damping):
+
+    self.register_eigendecomp()
+    self.register_cov_dt1()
+
+    if damping not in self._option2quants_by_damping:
+      # It's questionable as to whether we should initialize with stuff like
+      # this at all.  Ideally these values should never be used until they are
+      # updated at least once.
+      damping_string = scalar_or_tensor_to_string(damping)
+      with variable_scope.variable_scope(self._var_scope):
+        Pmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Lmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        Kmat = variable_scope.get_variable(  # pylint: disable=invalid-name
+            "Kmat_damp{}".format(damping_string),
+            initializer=inverse_initializer,
+            shape=self._cov_shape,
+            trainable=False,
+            dtype=self._dtype)
+        mu = variable_scope.get_variable(
+            "mu_damp{}".format(damping_string),
+            initializer=init_ops.ones_initializer,
+            shape=self._vec_shape,
+            trainable=False,
+            dtype=self._dtype)
+
+      self._option2quants_by_damping[damping] = (Pmat, Kmat, mu)
+
+  def make_inverse_update_ops(self):
+    """Create and return update ops corresponding to registered computations."""
+    # TODO(b/69918258): Add correctness tests for this method.
+    # pylint: disable=invalid-name
+
+    ops = super(FullyConnectedMultiKF, self).make_inverse_update_ops()
+
+    if (len(self._option1quants_by_damping) +
+        len(self._option2quants_by_damping)):
+
+      # Note that C0 and C1 are stand-ins for A0 and A1, or G0 and G1, from
+      # the pseudo-code in the original paper.  Because the computations for
+      # the A and G case are essentially the same they can both be performed by
+      # the same class (this one).
+
+      C1 = self.get_cov_dt1()
+
+      # Get the eigendecomposition of C0  (= self.get_cov())
+      eigen_e, eigen_V = self.get_eigendecomp()
+
+      # TODO(b/69678661): Note, there is an implicit assumption here that C1
+      # and C0 (as represented here by its eigen-decomp) are consistent.  This
+      # could fail to be the case if self._cov and self._cov_dt1 are not updated
+      # consistently, or are somehow read between or during the cov updates.
+      # Can this possibly happen?  Is there a way to prevent it?
+
+      for damping, (Lmat_var,
+                    psi_var) in self._option1quants_by_damping.items():
+
+        invsqrtC0 = math_ops.matmul(
+            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
+
+        # Might need to enforce symmetry lost due to numerical issues.
+        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
+
+        # The following line imposses the symmetry assumed by "Option 1" on C1.
+        # Stangely the code can work okay with this line commented out,
+        # depending on how psd_eig is defined.  I'm not sure why.
+        C1 = (C1 + array_ops.transpose(C1)) / 2.0
+
+        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means \hat{Psi})
+        hPsi = math_ops.matmul(math_ops.matmul(invsqrtC0, C1), invsqrtC0)
+
+        # Compute the decomposition U*diag(psi)*U^T = hPsi
+        psi, U = utils.posdef_eig(hPsi)
+
+        # L = C0^(-1/2) * U
+        Lmat = math_ops.matmul(invsqrtC0, U)
+
+        ops.append(Lmat_var.assign(Lmat))
+        ops.append(psi_var.assign(psi))
+
+      for damping, (Pmat_var, Kmat_var,
+                    mu_var) in self._option2quants_by_damping.items():
+
+        # compute C0^(-1/2)
+        invsqrtC0 = math_ops.matmul(
+            eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True)
+
+        # Might need to enforce symmetry lost due to numerical issues.
+        invsqrtC0 = (invsqrtC0 + array_ops.transpose(invsqrtC0)) / 2.0
+
+        # Compute the product C0^(-1/2) * C1
+        invsqrtC0C1 = math_ops.matmul(invsqrtC0, C1)
+
+        # hPsi = C0^(-1/2) * C1 * C0^(-1/2)  (hPsi means \hat{Psi})
+        hPsi = math_ops.matmul(invsqrtC0C1, invsqrtC0)
+
+        # Compute the decomposition E*diag(mu)*E^T = hPsi^T * hPsi
+        # Note that we using the notation mu instead of "m" for the eigenvalues.
+        # Instead of computing the product hPsi^T * hPsi and then doing an
+        # eigen-decomposition of this we just compute the SVD of hPsi and then
+        # square the singular values to get the eigenvalues. For a justification
+        # of this approach, see:
+        # https://en.wikipedia.org/wiki/Singular-value_decomposition#Relation_to_eigenvalue_decomposition
+        sqrtmu, _, E = linalg_ops.svd(hPsi)
+        mu = math_ops.square(sqrtmu)
+
+        # Mathematically, the eigenvalues should not should not exceed 1.0, but
+        # due to numerical issues, or possible issues with inconsistent
+        # values of C1 and (the eigen-decomposition of) C0 they might. So
+        # we enforce this condition.
+        mu = math_ops.minimum(mu, 1.0)
+
+        # P = (C0^(-1/2) * C1)^T * C0^(-1/2) = C_1^T * C_0^(-1)
+        Pmat = math_ops.matmul(invsqrtC0C1, invsqrtC0, transpose_a=True)
+
+        # K = C_0^(-1/2) * E
+        Kmat = math_ops.matmul(invsqrtC0, E)
+
+        ops.append(Pmat_var.assign(Pmat))
+        ops.append(Kmat_var.assign(Kmat))
+        ops.append(mu_var.assign(mu))
+
+    return [control_flow_ops.group(*ops)]
+
+    # pylint: enable=invalid-name
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 1806f5d8651e0b922fc30aed58d19de7faa5b265..ca42afe6fb2f5c7d7de8b5b087dc11be30a75d5e 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -26,7 +26,9 @@ from __future__ import print_function
 
 from collections import defaultdict
 from collections import OrderedDict
+from functools import partial
 
+import math
 import six
 
 from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb
@@ -35,20 +37,51 @@ from tensorflow.contrib.kfac.python.ops import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
-
 # Names for various approximations that can be requested for Fisher blocks.
 APPROX_KRONECKER_NAME = "kron"
 APPROX_DIAGONAL_NAME = "diagonal"
 APPROX_FULL_NAME = "full"
 
+_GENERIC_APPROX_TO_BLOCK_TYPES = {
+    APPROX_FULL_NAME: fb.FullFB,
+    APPROX_DIAGONAL_NAME: fb.NaiveDiagonalFB,
+}
+
+_FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_NAME: fb.FullyConnectedKFACBasicFB,
+    APPROX_DIAGONAL_NAME: fb.FullyConnectedDiagonalFB,
+}
+
+_CONV2D_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_NAME: fb.ConvKFCBasicFB,
+    APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
+}
+
+APPROX_KRONECKER_INDEP_NAME = "kron_indep"
+APPROX_KRONECKER_SERIES_1_NAME = "kron_series_1"
+APPROX_KRONECKER_SERIES_2_NAME = "kron_series_2"
+
+_FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES = {
+    APPROX_KRONECKER_INDEP_NAME: fb.FullyConnectedMultiIndepFB,
+    APPROX_KRONECKER_SERIES_1_NAME: partial(fb.FullyConnectedSeriesFB,
+                                            option=1),
+    APPROX_KRONECKER_SERIES_2_NAME: partial(fb.FullyConnectedSeriesFB,
+                                            option=2)
+}
+
 # Possible value for 'reuse' keyword argument. Sets 'reuse' to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
-# TODO(jamesmartens): need to add find_canonical_output back into this somewhere
+
+def ensure_sequence(obj):
+  """If `obj` isn't a tuple or list, return a tuple containing `obj`."""
+  if isinstance(obj, (tuple, list)):
+    return obj
+  else:
+    return (obj,)
 
 
 class LayerParametersDict(OrderedDict):
@@ -103,21 +136,27 @@ class LayerCollection(object):
     fisher_blocks: a LayersParamsDict (subclass of OrderedDict) mapping layer
         parameters (Tensors or tuples of Tensors) to FisherBlock instances.
     fisher_factors: an OrderedDict mapping tuples to FisherFactor instances.
-    generic_registrations: a list of variables registered via a generic layer
-        registration. Generic registrations handle any and all of the ways a
-        variable is used in the graph, which means we don't need to check
-        their registration when verifying the correctness of the graph.
     losses: a list of LossFunction objects. The loss to be optimized is their
         sum.
   """
 
-  def __init__(self, graph=None, name="LayerCollection"):
+  def __init__(self,
+               graph=None,
+               colocate_cov_ops_with_inputs=False,
+               name="LayerCollection"):
     self.fisher_blocks = LayerParametersDict()
     self.fisher_factors = OrderedDict()
-    self._generic_registrations = set()
+    self._linked_parameters = dict(
+    )  # dict mapping sets of variables to optionally specified approximations.
     self._graph = graph or ops.get_default_graph()
     self._loss_dict = {}  # {str: LossFunction}
     self._subgraph = None
+    self._default_generic_approximation = APPROX_FULL_NAME
+    self._default_fully_connected_approximation = APPROX_KRONECKER_NAME
+    self._default_convolution_2d_approximation = APPROX_KRONECKER_NAME
+    self._default_fully_connected_multi_approximation = (
+        APPROX_KRONECKER_SERIES_2_NAME)
+    self._colocate_cov_ops_with_inputs = colocate_cov_ops_with_inputs
 
     with variable_scope.variable_scope(None, default_name=name) as scope:
       self._var_scope = scope.name
@@ -127,113 +166,195 @@ class LayerCollection(object):
     """LossFunctions registered with this LayerCollection."""
     return list(self._loss_dict.values())
 
-  def register_block(self, layer_key, fisher_block):
-    """Validates and registers the layer_key associated with the fisher_block.
+  @property
+  def registered_variables(self):
+    """A tuple of all of the variables currently registered."""
+    tuple_of_tuples = (ensure_sequence(key) for key, block
+                       in six.iteritems(self.fisher_blocks))
+    flat_tuple = tuple(item for tuple_ in tuple_of_tuples for item in tuple_)
+    return flat_tuple
+
+  @property
+  def linked_parameters(self):
+    """Groups of parameters with an optionally specified approximation.
 
-    Validation consists of checking whether the key was already registered or
-    if any of the elements of layer_key (if it's a tuple) were already
-    registered as part of another tuple (throws an error if so). If any of the
-    elements were registered by themselves, or as part of tuples that are
-    subsets of this layer_key, those registrations are first removed.
-
-    If the layer_key is a subset of an existing registration, registration of
-    the new, smaller layer_key is skipped.
-
-    e.g. If registrations include {'a': foo, ('b', 'c'): bar}, then
-      - register_layer('a', baz) -> ValueError
-      - register_layer(('b', 'c', 'd'), baz) ->
-        {'a': foo, ('b', 'c', 'd'): baz}
-      - register_layer('b', baz) ->
-        {'a': foo, ('b', 'c'): bar} (No change)
-      - register_layer(('a', 'd'), baz) ->
-        {('a', 'd'): baz, ('b', 'c'): bar}
-      - register_layer(('b', 'd'), baz) -> ValueError
+    Linked parameters can be added using `define_linked_parameters`.
+    If an approximation is specified, then this approximation will be used
+    when registering a layer with exactly these parameters, unless an
+    approximation is specified when calling the registration function.
+
+    Returns:
+      A `dict` mapping tuples of parameters to an optional string.
+    """
+    return self._linked_parameters
+
+  @property
+  def default_generic_approximation(self):
+    return self._default_generic_approximation
+
+  def set_default_generic_approximation(self, value):
+    if value not in _GENERIC_APPROX_TO_BLOCK_TYPES:
+      raise ValueError(
+          "{} is not a valid approximation for generic variables.".format(
+              value))
+    self._default_generic_approximation = value
+
+  @property
+  def default_fully_connected_approximation(self):
+    return self._default_fully_connected_approximation
+
+  def set_default_fully_connected_approximation(self, value):
+    if value not in _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES:
+      raise ValueError(
+          "{} is not a valid approximation for fully connected layers.".format(
+              value))
+    self._default_fully_connected_approximation = value
+
+  @property
+  def default_conv2d_approximation(self):
+    return self._default_convolution_2d_approximation
+
+  def set_default_conv2d_approximation(self, value):
+    if value not in _CONV2D_APPROX_TO_BLOCK_TYPES:
+      raise ValueError(
+          "{} is not a valid approximation for 2d convolutional layers.".format(
+              value))
+    self._default_convolution_2d_approximation = value
+
+  @property
+  def default_fully_connected_multi_approximation(self):
+    return self._default_fully_connected_multi_approximation
+
+  def set_default_fully_connected_multi_approximation(self, value):
+    if value not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
+      raise ValueError("{} is not a valid approximation for a fully-connected "
+                       "multi layer.".format(value))
+    self._default_fully_connected_multi_approximation = value
+
+  def register_block(self, layer_key, fisher_block, reuse=VARIABLE_SCOPE):
+    """Validates and registers the layer_key associated with the fisher_block.
 
     Args:
-      layer_key: The key to check for in existing registrations and to register
-          if valid.
-      fisher_block: The associated fisher block.
+      layer_key: A variable or tuple of variables. The key to check for in
+          existing registrations and to register if valid.
+      fisher_block: The associated `FisherBlock`.
+      reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
+        or 'VARIABLE_SCOPE'.
 
     Raises:
-      ValueError: If the layer_key was already registered, or if a subset of the
-          layer_key has already been registered as part of a different tuple.
+      ValueError: If `layer_key` was already registered and reuse is `False`,
+        if `layer_key` was registered with a different block type, or if
+        `layer_key` shares any variables with but is not equal to a previously
+        registered key.
+      KeyError: If `reuse` is `True` but `layer_key` was not previously
+        registered.
+
+    Returns:
+      The `FisherBlock` registered under `layer_key`. If `layer_key` was already
+      registered, this will be the previously registered `FisherBlock`.
     """
+    if reuse is VARIABLE_SCOPE:
+      reuse = variable_scope.get_variable_scope().reuse
+
+    if reuse is True or (reuse is variable_scope.AUTO_REUSE and
+                         layer_key in self.fisher_blocks):
+      result = self.fisher_blocks[layer_key]
+      if type(result) != type(fisher_block):  # pylint: disable=unidiomatic-typecheck
+        raise ValueError(
+            "Attempted to register FisherBlock of type %s when existing "
+            "FisherBlock has type %s." % (type(fisher_block), type(result)))
+      return result
+    if reuse is False and layer_key in self.fisher_blocks:
+      raise ValueError("FisherBlock for %s is already in LayerCollection." %
+                       (layer_key,))
+
+    # Insert fisher_block into self.fisher_blocks.
     if layer_key in self.fisher_blocks:
       raise ValueError("Duplicate registration: {}".format(layer_key))
-    if isinstance(layer_key, (tuple, list)):
-      self._register_block_with_sequence_key(layer_key, fisher_block)
-    else:
-      self._register_block_with_nonsequence_key(layer_key, fisher_block)
-
-  def _register_block_with_sequence_key(self, layer_key, fisher_block):
-    """Validates and registers the layer_key if it's a sequence."""
-    inclusions = {
-        fisher_elt
-        for layer_elt in layer_key for fisher_elt in self.fisher_blocks
-        if self._equal_or_subset(layer_elt, fisher_elt)
+    # Raise an error if any variable in layer_key has been registered in any
+    # other blocks.
+    variable_to_block = {
+        var: (params, block)
+        for (params, block) in self.fisher_blocks.items()
+        for var in ensure_sequence(params)
     }
-
-    if not inclusions:
-      self.fisher_blocks[layer_key] = fisher_block
-      return
-
-    for key in inclusions:
-      fisher_block_key = key if isinstance(key, (tuple, list)) else (key,)
-      if set(layer_key).issubset(fisher_block_key):
-        logging.warning("Graph Registration Warning: tried to register "
-                        "a subset ({}) of an already registered tuple "
-                        "({}), skipping".format(layer_key, fisher_block_key))
-        return
-      if not set(fisher_block_key).issubset(layer_key):
+    for variable in ensure_sequence(layer_key):
+      if variable in variable_to_block:
+        prev_key, prev_block = variable_to_block[variable]
         raise ValueError(
-            "Inconsistent registration, expected new key to be a subset or "
-            "superset of the existing key: existing is {}, new is {}".format(
-                key, layer_key))
-      else:
-        self.fisher_blocks.pop(key)
-
+            "Attempted to register layer_key {} with block {}, but variable {}"
+            " was already registered in key {} with block {}.".format(
+                layer_key, fisher_block, variable, prev_key, prev_block))
     self.fisher_blocks[layer_key] = fisher_block
-
-  def _register_block_with_nonsequence_key(self, layer_key, fisher_block):
-    """Validates and registers the layer_key if it's not a sequence."""
-    inclusions = {
-        fisher_elt
-        for fisher_elt in self.fisher_blocks
-        if self._equal_or_subset(layer_key, fisher_elt)
-    }
-
-    if not inclusions:
-      self.fisher_blocks[layer_key] = fisher_block
-    else:
-      logging.warning("Graph Registration Warning: tried to register "
-                      "variable ({}) but a containing tuple was already "
-                      "registered ({}), skipping".format(layer_key, inclusions))
-
-  def _equal_or_subset(self, elt1, elt2):
-    """Checks if the elements are equal or one is contained in the other."""
-    return (elt1 == elt2 or (isinstance(elt1,
-                                        (tuple, list)) and elt2 in elt1) or
-            (isinstance(elt2, (tuple, list)) and elt1 in elt2))
+    return fisher_block
 
   def get_use_count_map(self):
     """Returns a dict of variables to their number of registrations."""
+    # TODO(b/70283403): Reimplement this in the old way, where each
+    # registration function would be responsible for incrementing the count.
+    # Also, this version has a bug: it won't do the right thing for generic
+    # registration for parameters that are shared.  i.e. it won't set the use
+    # count to infinity.
     vars_to_uses = defaultdict(int)
     for key, block in six.iteritems(self.fisher_blocks):
-      key = key if isinstance(key, (tuple, list)) else (key,)
+      n = (
+          block.num_inputs()*block.num_registered_minibatches if isinstance(
+              block, (fb.FullyConnectedSeriesFB, fb.FullyConnectedMultiIndepFB))
+          else block.num_registered_minibatches)
+      key = ensure_sequence(key)
       for k in key:
-        vars_to_uses[k] += block.num_registered_minibatches
+        vars_to_uses[k] += n
     return vars_to_uses
 
+  def check_registration(self, variables):
+    """Checks that all variable uses have been registered properly.
+
+    Args:
+      variables: List of variables.
+
+    Raises:
+      ValueError: If any registered variables are not included in the list.
+      ValueError: If any variable in the list is not registered.
+      ValueError: If any variable in the list is registered with the wrong
+          number of "uses" in the subgraph recorded (vs the number of times that
+          variable is actually used in the subgraph).
+    """
+    # Note that overlapping parameters (i.e. those that share variables) will
+    # be caught by layer_collection.LayerParametersDict during registration.
+
+    reg_use_map = self.get_use_count_map()
+
+    error_messages = []
+
+    for var in variables:
+      total_uses = self.subgraph.variable_uses(var)
+      reg_uses = reg_use_map[var]
+
+      if reg_uses == 0:
+        error_messages.append("Variable {} not registered.".format(var))
+      elif (not math.isinf(reg_uses)) and reg_uses != total_uses:
+        error_messages.append(
+            "Variable {} registered with wrong number of uses ({} "
+            "registrations vs {} uses).".format(var, reg_uses, total_uses))
+
+    num_get_vars = len(reg_use_map)
+
+    if num_get_vars > len(variables):
+      error_messages.append("{} registered variables were not included in list."
+                            .format(num_get_vars - len(variables)))
+
+    if error_messages:
+      error_messages = [
+          "Found the following errors with variable registration:"
+      ] + error_messages
+      raise ValueError("\n\t".join(error_messages))
+
   def get_blocks(self):
     return self.fisher_blocks.values()
 
   def get_factors(self):
     return self.fisher_factors.values()
 
-  @property
-  def generic_registrations(self):
-    return self._generic_registrations
-
   @property
   def graph(self):
     return self._graph
@@ -242,6 +363,49 @@ class LayerCollection(object):
   def subgraph(self):
     return self._subgraph
 
+  def define_linked_parameters(self, params, approximation=None):
+    """Identify a set of parameters that should be grouped together.
+
+    During automatic graph scanning, any matches containing variables that have
+    been identified as part of a linked group will be filtered out unless
+    the match parameters are exactly equal to the ones specified in the linked
+    group.
+
+    Args:
+      params: A variable, or a tuple or list of variables. The variables
+        to be linked.
+      approximation: Optional string specifying the type of approximation to use
+        for these variables. If unspecified, this layer collection's default
+        approximation for the layer type will be used.
+
+    Raises:
+      ValueError: If the parameters were already registered in a layer or
+        identified as part of an incompatible group.
+    """
+    params = frozenset(ensure_sequence(params))
+
+    # Check if any of the variables in 'params' is already in
+    # 'self.fisher_blocks.keys()'.
+    for registered_params, fisher_block in self.fisher_blocks.items():
+      registered_params_set = set(ensure_sequence(registered_params))
+      for variable in params:
+        if (variable in registered_params_set and
+            params != registered_params_set):
+          raise ValueError(
+              "Can't link parameters {}, variable {} was already registered in "
+              "group {} with layer {}".format(params, variable,
+                                              registered_params, fisher_block))
+
+    # Check if any of the variables in 'params' is already in
+    # 'self.linked_parameters'.
+    for variable in params:
+      for other_linked_params in self.linked_parameters:
+        if variable in other_linked_params:
+          raise ValueError("Can't link parameters {}, variable {} was already "
+                           "linked in group {}.".format(params, variable,
+                                                        other_linked_params))
+    self._linked_parameters[params] = approximation
+
   def create_subgraph(self):
     if not self.losses:
       raise ValueError("Must have at least one registered loss.")
@@ -255,11 +419,19 @@ class LayerCollection(object):
     return math_ops.add_n(
         tuple(loss.evaluate_on_sample() for loss in self.losses))
 
+  def _get_linked_approx(self, params):
+    """If params were linked, return their specified approximation."""
+    params_set = frozenset(ensure_sequence(params))
+    if params_set in self.linked_parameters:
+      return self.linked_parameters[params_set]
+    else:
+      return None
+
   def register_fully_connected(self,
                                params,
                                inputs,
                                outputs,
-                               approx=APPROX_KRONECKER_NAME,
+                               approx=None,
                                reuse=VARIABLE_SCOPE):
     """Registers a fully connnected layer.
 
@@ -268,11 +440,11 @@ class LayerCollection(object):
         this layer. Weight matrix should have shape [input_size, output_size].
         Bias should have shape [output_size].
       inputs: Tensor of shape [batch_size, input_size]. Inputs to layer.
-      outputs: Tensor of shape [batch_size, output_size]. Preactivations
+      outputs: Tensor of shape [batch_size, output_size]. Outputs
         produced by layer.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+      approx: str. One of "kron" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -280,35 +452,18 @@ class LayerCollection(object):
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
-    approx_to_block_types = {
-        APPROX_KRONECKER_NAME: fb.FullyConnectedKFACBasicFB,
-        APPROX_DIAGONAL_NAME: fb.FullyConnectedDiagonalFB,
-    }
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = self.default_fully_connected_approximation
 
-    if approx not in approx_to_block_types:
+    if approx not in _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES:
       raise ValueError("Bad value {} for approx.".format(approx))
 
-    block_type = approx_to_block_types[approx]
+    block_type = _FULLY_CONNECTED_APPROX_TO_BLOCK_TYPES[approx]
     has_bias = isinstance(params, (tuple, list))
 
-    if reuse == VARIABLE_SCOPE:
-      reuse = variable_scope.get_variable_scope().reuse
-
-    if reuse:
-      block = self.fisher_blocks.get(params, None)
-      if block is None:
-        raise KeyError(
-            "Reuse requested but no FisherBlock found for params {}.".format(
-                params))
-      if not isinstance(block, block_type):
-        raise ValueError(
-            "Requested block of type {} but block of type {} already exists "
-            "for params {}.".format(block_type, type(block), params))
-
-    else:
-      block = block_type(self, has_bias)
-      self.register_block(params, block)
-
+    block = self.register_block(params, block_type(self, has_bias), reuse=reuse)
     block.register_additional_minibatch(inputs, outputs)
 
   def register_conv2d(self,
@@ -317,7 +472,7 @@ class LayerCollection(object):
                       padding,
                       inputs,
                       outputs,
-                      approx=APPROX_KRONECKER_NAME,
+                      approx=None,
                       reuse=VARIABLE_SCOPE):
     """Registers a convolutional layer.
 
@@ -331,10 +486,10 @@ class LayerCollection(object):
       inputs: Tensor of shape [batch_size, height, width, in_channels]. Inputs
         to layer.
       outputs: Tensor of shape [batch_size, height, width, out_channels].
-        Preactivations produced by layer.
-      approx: str. One of APPROX_KRONECKER_NAME or APPROX_DIAGONAL_NAME.
+        Output produced by layer.
+      approx: str. One of "kron" or "diagonal".
       reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
-        create a new FisherBlock.  If VARIABLE_SCOPE, use
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse.
 
     Raises:
@@ -342,50 +497,93 @@ class LayerCollection(object):
       KeyError: If reuse == True but no FisherBlock found for 'params'.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
-    approx_to_block_types = {
-        APPROX_KRONECKER_NAME: fb.ConvKFCBasicFB,
-        APPROX_DIAGONAL_NAME: fb.ConvDiagonalFB,
-    }
 
-    if approx not in approx_to_block_types:
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = self.default_conv2d_approximation
+
+    if approx not in _CONV2D_APPROX_TO_BLOCK_TYPES:
       raise ValueError("Bad value {} for approx.".format(approx))
 
-    block_type = approx_to_block_types[approx]
+    block_type = _CONV2D_APPROX_TO_BLOCK_TYPES[approx]
+    block = self.register_block(
+        params, block_type(self, params, strides, padding), reuse=reuse)
+    block.register_additional_minibatch(inputs, outputs)
 
-    if reuse == VARIABLE_SCOPE:
-      reuse = variable_scope.get_variable_scope().reuse
+  def register_generic(self,
+                       params,
+                       batch_size,
+                       approx=None,
+                       reuse=VARIABLE_SCOPE):
+    """Registers a generic layer.
 
-    if reuse:
-      block = self.fisher_blocks.get(params, None)
-      if block is None:
-        raise KeyError(
-            "Reuse requested but no FisherBlock found for params {}.".format(
-                params))
-      if not isinstance(block, block_type):
-        raise ValueError(
-            "Requested block of type {} but block of type {} already exists "
-            "for params {}.".format(block_type, type(block), params))
+    Args:
+      params: Tensor or tuple of Tensors corresponding to the parameters.
+      batch_size: 0-D Tensor. Size of the minibatch.
+      approx: str. One of "full" or "diagonal".
+      reuse: bool or str.  If True, reuse an existing FisherBlock. If False,
+        create a new FisherBlock.  If "VARIABLE_SCOPE", use
+        tf.get_variable_scope().reuse.
 
-    else:
-      block = block_type(self, params, strides, padding)
-      self.register_block(params, block)
+    Raises:
+      ValueError: For improper value to 'approx'.
+      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: If reuse == True and FisherBlock found but of the wrong type.
+    """
 
-    block.register_additional_minibatch(inputs, outputs)
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = self.default_generic_approximation
 
-  def register_generic(self, params, batch_size, approx=APPROX_DIAGONAL_NAME):
-    params = params if isinstance(params, (tuple, list)) else (params,)
-    self._generic_registrations |= set(params)
-
-    # Generic registrations do not need special registration rules because we do
-    # not care about multiple generic registrations. Add them to the
-    # fisher_block dictionary manually rather than going through the logic in
-    # self.register_block.
-    if approx == APPROX_FULL_NAME:
-      self.fisher_blocks[params] = fb.FullFB(self, params, batch_size)
-    elif approx == APPROX_DIAGONAL_NAME:
-      self.fisher_blocks[params] = fb.NaiveDiagonalFB(self, params, batch_size)
-    else:
+    if approx not in _GENERIC_APPROX_TO_BLOCK_TYPES:
+      raise ValueError("Bad value {} for approx.".format(approx))
+
+    block_type = _GENERIC_APPROX_TO_BLOCK_TYPES[approx]
+    block = self.register_block(params, block_type(self, params), reuse=reuse)
+    block.register_additional_minibatch(batch_size)
+
+  def register_fully_connected_multi(self, params, inputs, outputs,
+                                     approx=None):
+    """Register fully connected layers with shared parameters.
+
+    This can handle general fully-connected layers with shared parameters, but
+    has specialized approximations to deal with the case where there is a
+    meaningful linear order to the share instances (such as in an RNN).
+
+    Args:
+      params: Tensor or 2-tuple of Tensors corresponding to weight and bias of
+        this layer. Weight matrix should have shape [input_size, output_size].
+        Bias should have shape [output_size].
+      inputs: A list of tensors, each of shape [batch_size, input_size]. Inputs
+        to layer. In the case of RNNs, one Tensor per time step.
+      outputs: A list of tensors, the same length as 'inputs', each of shape
+        [batch_size, output_size]. Outputs produced by layer. In the case of
+        RNNs, one Tensor per time step.
+      approx: str. One of "kron_indep", "kron_series_1", or "kron_series_2".
+
+    Raises:
+      ValueError: For improper value to 'approx'.
+    """
+    if approx is None:
+      approx = self._get_linked_approx(params)
+      if approx is None:
+        approx = self.default_fully_connected_multi_approximation
+    has_bias = isinstance(params, (tuple, list))
+
+    # TODO(b/70283649): something along the lines of find_canonical_output
+    # should be added back in here (and for the other block types, arguably).
+
+    if approx not in _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES:
       raise ValueError("Bad value {} for approx.".format(approx))
+    block_type = _FULLY_CONNECTED_MULTI_APPROX_TO_BLOCK_TYPES[approx]
+
+    # For now we don't support multiple minibatches for this type of layer, so
+    # we set reuse=False
+    self.register_block(params,
+                        block_type(self, inputs, outputs, has_bias=has_bias),
+                        reuse=False)
 
   def register_categorical_predictive_distribution(self,
                                                    logits,
@@ -410,10 +608,10 @@ class LayerCollection(object):
         tf.get_variable_scope().reuse.
 
     Raises:
-      ValueError: If reuse=True and name != None.
-      ValueError: If reuse=True and seed != None.
-      KeyError: If reuse=True and no existing LossFunction with 'name' found.
-      KeyError: If reuse=False and existing LossFunction with 'name' found.
+      ValueError: If reuse == True and name == None.
+      ValueError: If reuse == True and seed != None.
+      KeyError: If reuse == True and no existing LossFunction with 'name' found.
+      KeyError: If reuse == False and existing LossFunction with 'name' found.
     """
     name = name or self._graph.unique_name(
         "register_categorical_predictive_distribution")
@@ -522,11 +720,14 @@ class LayerCollection(object):
     try:
       hash(args)
     except TypeError:
-      raise TypeError((
-          "Unable to use (cls, args) = ({}, {}) as a key in "
-          "LayerCollection.fisher_factors. The pair cannot be hashed."
-      ).format(cls, args))
-
-    with variable_scope.variable_scope(self._var_scope):
-      return utils.setdefault(self.fisher_factors, (cls, args),
-                              lambda: cls(*args))
+      raise TypeError(
+          ("Unable to use (cls, args) = ({}, {}) as a key in "
+           "LayerCollection.fisher_factors. The pair cannot be hashed.").format(
+               cls, args))
+
+    key = cls, args
+    if key not in self.fisher_factors:
+      colo = self._colocate_cov_ops_with_inputs
+      with variable_scope.variable_scope(self._var_scope):
+        self.fisher_factors[key] = cls(*args, colocate_cov_ops_with_inputs=colo)
+    return self.fisher_factors[key]
diff --git a/tensorflow/contrib/kfac/python/ops/loss_functions.py b/tensorflow/contrib/kfac/python/ops/loss_functions.py
index 3cfde7f9ababab73980e93ea1dd65be1b559712b..e2e5bc3ffea3e52087c24802948bc8260e3b199a 100644
--- a/tensorflow/contrib/kfac/python/ops/loss_functions.py
+++ b/tensorflow/contrib/kfac/python/ops/loss_functions.py
@@ -56,6 +56,30 @@ class LossFunction(object):
     """The inputs to the loss function (excluding the targets)."""
     pass
 
+  @property
+  def input_minibatches(self):
+    """A `list` of inputs to the loss function, separated by minibatch.
+
+    Typically there will be one minibatch per tower in a multi-tower setup.
+    Returns a list consisting of `self.inputs` by default; `LossFunction`s
+    supporting registering multiple minibatches should override this method.
+
+    Returns:
+      A `list` of `Tensor`s representing
+    """
+    return [self.inputs]
+
+  @property
+  def num_registered_minibatches(self):
+    """Number of minibatches registered for this LossFunction.
+
+    Typically equal to the number of towers in a multi-tower setup.
+
+    Returns:
+      An `int` representing the number of registered minibatches.
+    """
+    return len(self.input_minibatches)
+
   def evaluate(self):
     """Evaluate the loss function on the targets."""
     if self.targets is not None:
@@ -75,7 +99,6 @@ class LossFunction(object):
     Returns:
       log probability of each target, summed across all targets.
     """
-
     pass
 
   @abc.abstractmethod
@@ -415,8 +438,8 @@ class NormalMeanNegativeLogProbLoss(DistributionNegativeLogProbLoss,
         array_ops.ones(array_ops.shape(self._mean)[:1], dtype=self._mean.dtype),
         axis=-1)
     output_slice = self._var**-0.5 * ones_slice
-    return insert_slice_in_zeros(output_slice, 1,
-                                 int(self._mean.shape[1]), index[0])
+    return insert_slice_in_zeros(output_slice, 1, int(self._mean.shape[1]),
+                                 index[0])
 
   @property
   def fisher_factor_inner_shape(self):
@@ -474,24 +497,23 @@ class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
 
   @property
   def _fisher_mean(self):
-    return 1./self._variance
+    return 1. / self._variance
 
   @property
   def _fisher_mean_factor(self):
-    return 1./self._scale
+    return 1. / self._scale
 
   @property
   def _fisher_var(self):
-    return 1./(2*math_ops.square(self._variance))
+    return 1. / (2 * math_ops.square(self._variance))
 
   @property
   def _fisher_var_factor(self):
-    return 1./(math_ops.sqrt(2.)*self._variance)
+    return 1. / (math_ops.sqrt(2.) * self._variance)
 
   def multiply_fisher(self, vecs):
     mean_vec, var_vec = vecs
-    return (self._fisher_mean * mean_vec,
-            self._fisher_var * var_vec)
+    return (self._fisher_mean * mean_vec, self._fisher_var * var_vec)
 
   def multiply_fisher_factor(self, vecs):
     mean_vec, var_vec = self._split(vecs)
@@ -511,8 +533,8 @@ class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
       # Index corresponds to mean parameter.
       mean_slice = self._fisher_mean_factor[:, index]
       mean_slice = array_ops.expand_dims(mean_slice, axis=-1)
-      mean_output = insert_slice_in_zeros(mean_slice, 1,
-                                          int(self._mean.shape[1]), index)
+      mean_output = insert_slice_in_zeros(mean_slice, 1, int(
+          self._mean.shape[1]), index)
       var_output = array_ops.zeros_like(mean_output)
     else:
       index -= int(self._mean.shape[-1])
@@ -527,13 +549,17 @@ class NormalMeanVarianceNegativeLogProbLoss(DistributionNegativeLogProbLoss):
 
   @property
   def fisher_factor_inner_shape(self):
-    return array_ops.concat([array_ops.shape(self._mean)[:-1],
-                             2*array_ops.shape(self._mean)[-1:]], axis=0)
+    return array_ops.concat(
+        [
+            array_ops.shape(self._mean)[:-1],
+            2 * array_ops.shape(self._mean)[-1:]
+        ],
+        axis=0)
 
   @property
   def fisher_factor_inner_static_shape(self):
     shape = self._mean.shape.as_list()
-    return tensor_shape.TensorShape(shape[-1:] + [2*shape[-1]])
+    return tensor_shape.TensorShape(shape[-1:] + [2 * shape[-1]])
 
   def multiply_hessian(self, vector):
     raise NotImplementedError()
@@ -605,6 +631,10 @@ class CategoricalLogitsNegativeLogProbLoss(DistributionNegativeLogProbLoss,
   def _logits(self):
     return array_ops.concat(self._logits_components, axis=0)
 
+  @property
+  def input_minibatches(self):
+    return self._logits_components
+
   @property
   def targets(self):
     if all(target is None for target in self._targets_components):
@@ -710,8 +740,8 @@ class MultiBernoulliNegativeLogProbLoss(DistributionNegativeLogProbLoss,
     assert len(index) == 1, "Length of index was {}".format(len(index))
     probs_slice = array_ops.expand_dims(self._probs[:, index[0]], -1)
     output_slice = math_ops.sqrt(probs_slice * (1 - probs_slice))
-    return insert_slice_in_zeros(output_slice, 1,
-                                 int(self._logits.shape[1]), index[0])
+    return insert_slice_in_zeros(output_slice, 1, int(self._logits.shape[1]),
+                                 index[0])
 
   @property
   def fisher_factor_inner_shape(self):
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index bfa15e0948c96477d9a79dece985bc4b6dafab6f..ecf7f3e4e5ab7d9c151f760fdab733bc3830e37b 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -35,16 +35,20 @@ from tensorflow.python.training import gradient_descent
 class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
   """The KFAC Optimizer (https://arxiv.org/abs/1503.05671)."""
 
-  def __init__(
-      self,
-      learning_rate,
-      cov_ema_decay,
-      damping,
-      layer_collection,
-      momentum=0.,
-      momentum_type="regular",
-      norm_constraint=None,
-      name="KFAC",):
+  def __init__(self,
+               learning_rate,
+               cov_ema_decay,
+               damping,
+               layer_collection,
+               var_list=None,
+               momentum=0.,
+               momentum_type="regular",
+               norm_constraint=None,
+               name="KFAC",
+               estimation_mode="gradients",
+               colocate_gradients_with_ops=False,
+               cov_devices=None,
+               inv_devices=None):
     """Initializes the KFAC optimizer with the given settings.
 
     Args:
@@ -63,6 +67,9 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           blocks, kronecker factors, and losses associated with the
           graph.  The layer_collection cannot be modified after KfacOptimizer's
           initialization.
+      var_list: Optional list or tuple of variables to train. Defaults to the
+          list of variables collected in the graph under the key
+          `GraphKeys.TRAINABLE_VARIABLES`.
       momentum: The momentum value for this optimizer. Only applies when
           momentum_type is 'regular' or 'adam'. (Default: 0)
       momentum_type: The type of momentum to use in this optimizer, one of
@@ -72,6 +79,18 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           specified value. May only be used with momentum type 'regular'.
           (Default: None)
       name: The name for this optimizer. (Default: 'KFAC')
+      estimation_mode: The type of estimator to use for the Fishers.  Can be
+          'gradients', 'empirical', 'curvature_propagation', or 'exact'.
+          (Default: 'gradients'). See the doc-string for FisherEstimator for
+          more a more detailed description of these options.
+      colocate_gradients_with_ops: Whether we should request gradients we
+          compute in the estimator be colocated with their respective ops.
+      cov_devices: Iterable of device strings (e.g. '/gpu:0'). Covariance
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
+      inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion
+          computations will be placed on these devices in a round-robin fashion.
+          Can be None, which means that no devices are specified.
 
     Raises:
       ValueError: If the momentum type is unsupported.
@@ -81,12 +100,19 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
           or 'adam'.
     """
 
-    # We may consider determining the set of variables some other way, but for
-    # now it's just all the trainable variables.
-    variables = tf_variables.trainable_variables()
+    variables = var_list
+    if variables is None:
+      variables = tf_variables.trainable_variables()
 
-    self._fisher_est = est.FisherEstimator(variables, cov_ema_decay, damping,
-                                           layer_collection)
+    self._fisher_est = est.FisherEstimator(
+        variables,
+        cov_ema_decay,
+        damping,
+        layer_collection,
+        estimation_mode=estimation_mode,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        cov_devices=cov_devices,
+        inv_devices=inv_devices)
 
     momentum_type = momentum_type.lower()
     legal_momentum_types = ["regular", "adam", "qmodel"]
@@ -101,7 +127,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       raise ValueError("Momentum must be unspecified if using a momentum_type "
                        "other than 'regular' or 'adam'.")
 
-    self._momentum = ops.convert_to_tensor(momentum, name="momentum")
+    self._momentum = momentum
     self._momentum_type = momentum_type
     self._norm_constraint = norm_constraint
 
@@ -125,16 +151,24 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     return self._fisher_est.damping
 
   def minimize(self, *args, **kwargs):
-
-    if "var_list" not in kwargs:
-      kwargs["var_list"] = tf_variables.trainable_variables()
-
+    kwargs["var_list"] = kwargs.get("var_list") or self.variables
     if set(kwargs["var_list"]) != set(self.variables):
       raise ValueError("var_list doesn't match with set of Fisher-estimating "
                        "variables.")
-
     return super(KfacOptimizer, self).minimize(*args, **kwargs)
 
+  def compute_gradients(self, *args, **kwargs):
+    # args[1] could be our var_list
+    if len(args) > 1:
+      var_list = args[1]
+    else:
+      kwargs["var_list"] = kwargs.get("var_list") or self.variables
+      var_list = kwargs["var_list"]
+    if set(var_list) != set(self.variables):
+      raise ValueError("var_list doesn't match with set of Fisher-estimating "
+                       "variables.")
+    return super(KfacOptimizer, self).compute_gradients(*args, **kwargs)
+
   def apply_gradients(self, grads_and_vars, *args, **kwargs):
     """Applies gradients to variables.
 
@@ -291,14 +325,17 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
         self._batch_size, dtype=fft_precon_grads[0].dtype)
 
     # compute the entries of the 2x2 matrix
-    m_11 = (_inner_product_list(fft_precon_grads, fft_precon_grads) / batch_size
-            + self.damping * _inner_product_list(precon_grads, precon_grads))
+    m_11 = (
+        _inner_product_list(fft_precon_grads, fft_precon_grads) / batch_size +
+        self.damping * _inner_product_list(precon_grads, precon_grads))
 
-    m_21 = (_inner_product_list(fft_prev_updates, fft_precon_grads) / batch_size
-            + self.damping * _inner_product_list(prev_updates, precon_grads))
+    m_21 = (
+        _inner_product_list(fft_prev_updates, fft_precon_grads) / batch_size +
+        self.damping * _inner_product_list(prev_updates, precon_grads))
 
-    m_22 = (_inner_product_list(fft_prev_updates, fft_prev_updates) / batch_size
-            + self.damping * _inner_product_list(prev_updates, prev_updates))
+    m_22 = (
+        _inner_product_list(fft_prev_updates, fft_prev_updates) / batch_size +
+        self.damping * _inner_product_list(prev_updates, prev_updates))
 
     def non_zero_prevupd_case():
       r"""Computes optimal (alpha, mu) given non-zero previous update.
@@ -384,8 +421,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       grads = list(grad for (grad, _) in grads_and_vars)
       variables = list(var for (_, var) in grads_and_vars)
       # previous updates are the negative velocities (up to scaling by LR)
-      prev_updates = list(-self._zeros_slot(var, "velocity", self._name)
-                          for var in variables)
+      prev_updates = list(
+          -self._zeros_slot(var, "velocity", self._name) for var in variables)
 
       # Compute optimal velocity update parameters according to quadratic model
       alpha, mu, _ = self._compute_qmodel_hyperparams(
diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py
index a7473481e44da0b09c047db9af29032918ea6cef..cec018e406bc51c07f5cafcc2c38efe7e9601618 100644
--- a/tensorflow/contrib/kfac/python/ops/utils.py
+++ b/tensorflow/contrib/kfac/python/ops/utils.py
@@ -28,9 +28,17 @@ from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 
-
 # Method used for inverting matrices.
 POSDEF_INV_METHOD = "cholesky"
+POSDEF_EIG_METHOD = "self_adjoint"
+
+
+def set_global_constants(posdef_inv_method=None):
+  """Sets various global constants used by the classes in this module."""
+  global POSDEF_INV_METHOD
+
+  if posdef_inv_method is not None:
+    POSDEF_INV_METHOD = posdef_inv_method
 
 
 class SequenceDict(object):
@@ -56,13 +64,6 @@ class SequenceDict(object):
     return list(self._dict.items())
 
 
-def setdefault(dct, key, thunk):
-  """Like dict.setdefault but delays evaluation of the value to be set."""
-  if key not in dct:
-    dct[key] = thunk()
-  return dct[key]
-
-
 def tensors_to_column(tensors):
   """Converts a tensor or list of tensors to a column vector.
 
@@ -161,33 +162,11 @@ def mat2d_to_layer_params(vector_template, mat2d):
     return array_ops.reshape(mat2d, vector_template.shape)
 
 
-def compute_pi(left_factor, right_factor):
-  """Computes the scalar constant pi for Tikhonov regularization/damping.
-
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
-  See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
-
-  Args:
-    left_factor: The left Kronecker factor Tensor.
-    right_factor: The right Kronecker factor Tensor.
-
-  Returns:
-    The computed scalar constant pi for these Kronecker Factors (as a Tensor).
-  """
-  # Instead of dividing by the dim of the norm, we multiply by the dim of the
-  # other norm. This works out the same in the ratio.
-  left_norm = math_ops.trace(left_factor) * right_factor.get_shape().as_list()[
-      0]
-  right_norm = math_ops.trace(right_factor) * left_factor.get_shape().as_list()[
-      0]
-  return math_ops.sqrt(left_norm / right_norm)
-
-
 def posdef_inv(tensor, damping):
   """Computes the inverse of tensor + damping * identity."""
   identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype)
   damping = math_ops.cast(damping, dtype=tensor.dtype)
-  return posdef_inv_funcs[POSDEF_INV_METHOD](tensor, identity, damping)
+  return posdef_inv_functions[POSDEF_INV_METHOD](tensor, identity, damping)
 
 
 def posdef_inv_matrix_inverse(tensor, identity, damping):
@@ -201,9 +180,44 @@ def posdef_inv_cholesky(tensor, identity, damping):
   return linalg_ops.cholesky_solve(chol, identity)
 
 
-posdef_inv_funcs = {
+def posdef_inv_eig(tensor, identity, damping):
+  """Computes inverse(tensor + damping * identity) with eigendecomposition."""
+  eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(
+      tensor + damping * identity)
+  return math_ops.matmul(
+      eigenvectors / eigenvalues, eigenvectors, transpose_b=True)
+
+
+posdef_inv_functions = {
     "matrix_inverse": posdef_inv_matrix_inverse,
     "cholesky": posdef_inv_cholesky,
+    "eig": posdef_inv_eig,
+}
+
+
+def posdef_eig(mat):
+  """Computes the eigendecomposition of a positive semidefinite matrix."""
+  return posdef_eig_functions[POSDEF_EIG_METHOD](mat)
+
+
+def posdef_eig_svd(mat):
+  """Computes the singular values and left singular vectors of a matrix."""
+  evals, evecs, _ = linalg_ops.svd(mat)
+
+  return evals, evecs
+
+
+def posdef_eig_self_adjoint(mat):
+  """Computes eigendecomposition using self_adjoint_eig."""
+  evals, evecs = linalg_ops.self_adjoint_eig(mat)
+  evals = math_ops.abs(evals)  # Should be equivalent to svd approach.
+
+  return evals, evecs
+
+
+posdef_eig_functions = {
+    "self_adjoint": posdef_eig_self_adjoint,
+    "svd": posdef_eig_svd,
 }
 
 
@@ -260,8 +274,8 @@ def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
   # generated by the first gradients_impl.gradients call.
 
   us = [array_ops.zeros_like(y) + float("nan") for y in ys]
-  dydxs = gradients_impl.gradients(ys, xs, grad_ys=us,
-                                   stop_gradients=stop_gradients)
+  dydxs = gradients_impl.gradients(
+      ys, xs, grad_ys=us, stop_gradients=stop_gradients)
 
   # Deal with strange types that gradients_impl.gradients returns but can't
   # deal with.
@@ -277,3 +291,6 @@ def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
   dysdx = gradients_impl.gradients(dydxs, us, grad_ys=grad_xs)
 
   return dysdx
+
+# TODO(b/69623235): Add a function for finding tensors that share gradients
+# to eliminate redundant fisher factor computations.
diff --git a/tensorflow/contrib/kfac/python/ops/utils_lib.py b/tensorflow/contrib/kfac/python/ops/utils_lib.py
index ddbb4485ce6967082f1844c6d798c078f1cc303b..8903c90fbce6a890aa419d89b3b79d75f69509fc 100644
--- a/tensorflow/contrib/kfac/python/ops/utils_lib.py
+++ b/tensorflow/contrib/kfac/python/ops/utils_lib.py
@@ -25,13 +25,11 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     "SequenceDict",
-    "setdefault",
     "tensors_to_column",
     "column_to_tensors",
     "kronecker_product",
     "layer_params_to_mat2d",
     "mat2d_to_layer_params",
-    "compute_pi",
     "posdef_inv",
     "posdef_inv_matrix_inverse",
     "posdef_inv_cholesky",
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 2f1f283811b6cb9e8bfb52ab2052afac1de700cb..852d06e1e3cc8f8deecd15b7436cd4e4a393ad66 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -61,6 +61,7 @@ tf_custom_op_py_library(
         "python/layers/normalization.py",
         "python/layers/optimizers.py",
         "python/layers/regularizers.py",
+        "python/layers/rev_block_lib.py",
         "python/layers/summaries.py",
         "python/layers/target_column.py",
         "python/layers/utils.py",
@@ -376,6 +377,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "rev_block_lib_test",
+    size = "small",
+    srcs = ["python/layers/rev_block_lib_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":layers_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index d309ba958ded86afdc1e4bba2ff471a5181cda4e..6c624929f20503054e0258aad8a843f4a201be64 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -42,6 +42,9 @@ See the @{$python/contrib.layers} guide.
 @@relu
 @@relu6
 @@repeat
+@@recompute_grad
+@@RevBlock
+@@rev_block
 @@safe_embedding_lookup_sparse
 @@scale_gradient
 @@separable_conv2d
diff --git a/tensorflow/contrib/layers/python/layers/__init__.py b/tensorflow/contrib/layers/python/layers/__init__.py
index 03337f9a5d11784316124442125bb498c4ce9603..f1ae2de68be33880a6fc09957f4d857973902b26 100644
--- a/tensorflow/contrib/layers/python/layers/__init__.py
+++ b/tensorflow/contrib/layers/python/layers/__init__.py
@@ -28,6 +28,7 @@ from tensorflow.contrib.layers.python.layers.layers import *
 from tensorflow.contrib.layers.python.layers.normalization import *
 from tensorflow.contrib.layers.python.layers.optimizers import *
 from tensorflow.contrib.layers.python.layers.regularizers import *
+from tensorflow.contrib.layers.python.layers.rev_block_lib import *
 from tensorflow.contrib.layers.python.layers.summaries import *
 from tensorflow.contrib.layers.python.layers.target_column import *
 from tensorflow.contrib.layers.python.ops.bucketization_op import *
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 226d933d85d91600e36ffb84212703e10455bfbb..092d418c3f232b364e2c6b4d25a4162626ba17f0 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -521,7 +521,7 @@ def sparse_column_with_integerized_feature(column_name,
 
   Args:
     column_name: A string defining sparse column name.
-    bucket_size: An int that is > 1. The number of buckets. It should be bigger
+    bucket_size: An int that is >= 1. The number of buckets. It should be bigger
       than maximum feature. In other words features in this column should be an
       int64 in range [0, bucket_size)
     combiner: A string specifying how to reduce if the sparse column is
@@ -539,7 +539,7 @@ def sparse_column_with_integerized_feature(column_name,
     An integerized _SparseColumn definition.
 
   Raises:
-    ValueError: bucket_size is not greater than 1.
+    ValueError: bucket_size is less than 1.
     ValueError: dtype is not integer.
   """
   return _SparseColumnIntegerized(
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops.py b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
index fa0047f05d893f6543ddb1680824a32469e13293..78affea44cbfb92523063968dbc1be98841854db 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops.py
@@ -97,10 +97,13 @@ def _input_from_feature_columns(columns_to_tensors,
                                 trainable,
                                 scope,
                                 output_rank,
-                                default_name):
+                                default_name,
+                                cols_to_outs=None):
   """Implementation of `input_from(_sequence)_feature_columns`."""
   columns_to_tensors = columns_to_tensors.copy()
   check_feature_columns(feature_columns)
+  if cols_to_outs is not None and not isinstance(cols_to_outs, dict):
+    raise ValueError('cols_to_outs must be a dict unless None')
   with variable_scope.variable_scope(scope,
                                      default_name=default_name,
                                      values=columns_to_tensors.values()):
@@ -144,6 +147,8 @@ def _input_from_feature_columns(columns_to_tensors,
           except ValueError as e:
             raise ValueError('Error creating input layer for column: {}.\n'
                              '{}, {}'.format(column.name, e, ee))
+        if cols_to_outs is not None:
+          cols_to_outs[column] = output_tensors[-1]
     return array_ops.concat(output_tensors, output_rank - 1)
 
 
@@ -151,7 +156,8 @@ def input_from_feature_columns(columns_to_tensors,
                                feature_columns,
                                weight_collections=None,
                                trainable=True,
-                               scope=None):
+                               scope=None,
+                               cols_to_outs=None):
   """A tf.contrib.layers style input layer builder based on FeatureColumns.
 
   Generally a single example in training data is described with feature columns.
@@ -196,6 +202,8 @@ def input_from_feature_columns(columns_to_tensors,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for variable_scope.
+    cols_to_outs: Optional dict from feature column to output tensor,
+      which is concatenated into the returned tensor.
 
   Returns:
     A Tensor which can be consumed by hidden layers in the neural network.
@@ -209,7 +217,8 @@ def input_from_feature_columns(columns_to_tensors,
                                      trainable,
                                      scope,
                                      output_rank=2,
-                                     default_name='input_from_feature_columns')
+                                     default_name='input_from_feature_columns',
+                                     cols_to_outs=cols_to_outs)
 
 
 @experimental
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index fbfa0e32de55edab3c90189ddfe05ab826ac9167..e6bbd86ab722c4e853a59f816bed8a8ac1fe9ede 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -607,6 +607,31 @@ class CreateInputLayersForDNNsTest(test.TestCase):
       # Verify cross compatibility: Core builder output should equal to contrib.
       self.assertAllEqual(output.eval().shape, output_core.eval().shape)
 
+  def testAllDNNColumnsWithColumnwiseOutputs(self):
+    sparse_column = feature_column.sparse_column_with_keys(
+        "ids", ["a", "b", "c", "unseen"])
+    real_valued_column = feature_column.real_valued_column("income", 2)
+    one_hot_column = feature_column.one_hot_column(sparse_column)
+    embedding_column = feature_column.embedding_column(sparse_column, 10)
+    features = {
+        "ids":
+            sparse_tensor.SparseTensor(
+                values=["c", "b", "a"],
+                indices=[[0, 0], [1, 0], [2, 0]],
+                dense_shape=[3, 1]),
+        "income":
+            constant_op.constant([[20.3, 10], [110.3, 0.4], [-3.0, 30.4]]),
+    }
+    columns = [one_hot_column, embedding_column, real_valued_column]
+    cols_to_outs = {}
+    feature_column_ops.input_from_feature_columns(
+        features, columns, cols_to_outs=cols_to_outs)
+    with self.test_session():
+      variables_lib.global_variables_initializer().run()
+      lookup_ops.tables_initializer().run()
+      for column in columns:
+        self.assertTrue(column in cols_to_outs)
+
   def testRealValuedColumn(self):
     real_valued = feature_column.real_valued_column("price")
     features = {"price": constant_op.constant([[20.], [110], [-3]])}
diff --git a/tensorflow/contrib/layers/python/layers/initializers.py b/tensorflow/contrib/layers/python/layers/initializers.py
index b12a882d9ae88f7cf4f920cfa5872e5de1c67290..51610f21b24f1d40f26630cc1e69ca723d130639 100644
--- a/tensorflow/contrib/layers/python/layers/initializers.py
+++ b/tensorflow/contrib/layers/python/layers/initializers.py
@@ -79,7 +79,8 @@ def variance_scaling_initializer(factor=2.0, mode='FAN_IN', uniform=False,
   ```
 
   * To get [Delving Deep into Rectifiers](
-     http://arxiv.org/pdf/1502.01852v1.pdf), use (Default):<br/>
+     http://arxiv.org/pdf/1502.01852v1.pdf) (also know as the "MSRA 
+     initialization"), use (Default):<br/>
     `factor=2.0 mode='FAN_IN' uniform=False`
   * To get [Convolutional Architecture for Fast Feature Embedding](
      http://arxiv.org/abs/1408.5093), use:<br/>
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index c429d53cdc9101486359a09d985a5649c649f3e2..0d25a09852544a7eb1ed5eb9c2f3402d9064d91a 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -198,23 +198,23 @@ def avg_pool3d(inputs,
     return utils.collect_named_outputs(outputs_collections, sc, outputs)
 
 
-def _fused_batch_norm(
-    inputs,
-    decay=0.999,
-    center=True,
-    scale=False,
-    epsilon=0.001,
-    activation_fn=None,
-    param_initializers=None,
-    updates_collections=ops.GraphKeys.UPDATE_OPS,
-    is_training=True,
-    reuse=None,
-    variables_collections=None,
-    outputs_collections=None,
-    trainable=True,
-    data_format=DATA_FORMAT_NHWC,
-    zero_debias_moving_mean=False,
-    scope=None):
+def _fused_batch_norm(inputs,
+                      decay=0.999,
+                      center=True,
+                      scale=False,
+                      epsilon=0.001,
+                      activation_fn=None,
+                      param_initializers=None,
+                      param_regularizers=None,
+                      updates_collections=ops.GraphKeys.UPDATE_OPS,
+                      is_training=True,
+                      reuse=None,
+                      variables_collections=None,
+                      outputs_collections=None,
+                      trainable=True,
+                      data_format=DATA_FORMAT_NHWC,
+                      zero_debias_moving_mean=False,
+                      scope=None):
   """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
     "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -257,6 +257,7 @@ def _fused_batch_norm(
       maintain a linear activation.
     param_initializers: Optional initializers for beta, gamma, moving mean and
       moving variance.
+    param_regularizers: Optional regularizer for beta and gamma.
     updates_collections: Collections to collect the update ops for computation.
       The updates_ops need to be executed with the train_op.
       If None, a control dependency would be added to make sure the updates are
@@ -285,7 +286,6 @@ def _fused_batch_norm(
     ValueError: If the rank of `inputs` is neither 2 or 4.
     ValueError: If rank or `C` dimension of `inputs` is undefined.
   """
-  # TODO(reedwm): Add support for fp16 inputs.
   if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
     raise ValueError('data_format has to be either NCHW or NHWC.')
   with variable_scope.variable_scope(
@@ -309,7 +309,6 @@ def _fused_batch_norm(
         new_shape = [-1, channels, 1, 1]
       inputs = array_ops.reshape(inputs, new_shape)
     inputs_shape = inputs.get_shape()
-    dtype = inputs.dtype.base_dtype
     if data_format == DATA_FORMAT_NHWC:
       params_shape = inputs_shape[-1:]
     else:
@@ -319,23 +318,30 @@ def _fused_batch_norm(
                        (inputs.name, params_shape))
 
     # Allocate parameters for the beta and gamma of the normalization.
-    trainable_beta = trainable and center
     beta_collections = utils.get_variable_collections(variables_collections,
                                                       'beta')
+    # Float32 required to avoid precision-loss when using fp16 input/output
+    variable_dtype = dtypes.float32
     if not param_initializers:
       param_initializers = {}
+    if not param_regularizers:
+      param_regularizers = {}
+    beta_regularizer = param_regularizers.get('beta')
+    gamma_regularizer = param_regularizers.get('gamma')
+
     if center:
       beta_initializer = param_initializers.get('beta',
                                                 init_ops.zeros_initializer())
       beta = variables.model_variable(
           'beta',
           shape=params_shape,
-          dtype=dtype,
+          dtype=variable_dtype,
           initializer=beta_initializer,
+          regularizer=beta_regularizer,
           collections=beta_collections,
-          trainable=trainable_beta)
+          trainable=trainable)
     else:
-      beta = array_ops.constant(0.0, shape=params_shape)
+      beta = array_ops.constant(0.0, dtype=variable_dtype, shape=params_shape)
 
     if scale:
       gamma_collections = utils.get_variable_collections(
@@ -345,12 +351,13 @@ def _fused_batch_norm(
       gamma = variables.model_variable(
           'gamma',
           shape=params_shape,
-          dtype=dtype,
+          dtype=variable_dtype,
           initializer=gamma_initializer,
+          regularizer=gamma_regularizer,
           collections=gamma_collections,
           trainable=trainable)
     else:
-      gamma = array_ops.constant(1.0, shape=params_shape)
+      gamma = array_ops.constant(1.0, dtype=variable_dtype, shape=params_shape)
 
     # Create moving_mean and moving_variance variables and add them to the
     # appropriate collections. We disable variable partitioning while creating
@@ -367,7 +374,7 @@ def _fused_batch_norm(
       moving_mean = variables.model_variable(
           'moving_mean',
           shape=params_shape,
-          dtype=dtype,
+          dtype=variable_dtype,
           initializer=moving_mean_initializer,
           trainable=False,
           collections=moving_mean_collections)
@@ -378,7 +385,7 @@ def _fused_batch_norm(
       moving_variance = variables.model_variable(
           'moving_variance',
           shape=params_shape,
-          dtype=dtype,
+          dtype=variable_dtype,
           initializer=moving_variance_initializer,
           trainable=False,
           collections=moving_variance_collections)
@@ -596,6 +603,7 @@ def batch_norm(inputs,
         epsilon=epsilon,
         activation_fn=activation_fn,
         param_initializers=param_initializers,
+        param_regularizers=param_regularizers,
         updates_collections=updates_collections,
         is_training=is_training,
         reuse=reuse,
@@ -1394,7 +1402,8 @@ def dropout(inputs,
             noise_shape=None,
             is_training=True,
             outputs_collections=None,
-            scope=None):
+            scope=None,
+            seed=None):
   """Returns a dropout op applied to the input.
 
   With probability `keep_prob`, outputs the input element scaled up by
@@ -1412,6 +1421,8 @@ def dropout(inputs,
       Otherwise, inputs is returned.
     outputs_collections: Collection to add the outputs.
     scope: Optional scope for name_scope.
+    seed: A Python integer. Used to create random seeds. See
+      @{tf.set_random_seed} for behavior.
 
   Returns:
     A tensor representing the output of the operation.
@@ -1421,6 +1432,7 @@ def dropout(inputs,
     inputs = ops.convert_to_tensor(inputs)
     layer = core_layers.Dropout(rate=1 - keep_prob,
                                 noise_shape=noise_shape,
+                                seed=seed,
                                 name=sc.name,
                                 _scope=sc)
     outputs = layer.apply(inputs, training=is_training)
@@ -2008,7 +2020,7 @@ def layer_norm(inputs,
 
   Given a tensor `inputs` of rank `R`, moments are calculated and normalization
   is performed over axes `begin_norm_axis ... R - 1`.  Scaling and centering,
-  if requested, is performed over axes `begin_shift_axis .. R - 1`.
+  if requested, is performed over axes `begin_params_axis .. R - 1`.
 
   By default, `begin_norm_axis = 1` and `begin_params_axis = -1`,
   meaning that normalization is performed over all but the first axis
@@ -2549,7 +2561,10 @@ def separable_convolution2d(
           regularizer=weights_regularizer,
           trainable=trainable,
           collections=weights_collections)
-      strides = [1, stride_h, stride_w, 1]
+      strides = [1, 1, stride_h,
+                 stride_w] if data_format.startswith('NC') else [
+                     1, stride_h, stride_w, 1
+                 ]
 
       outputs = nn.depthwise_conv2d(inputs, depthwise_weights, strides, padding,
                                     rate=utils.two_element_tuple(rate),
@@ -2639,51 +2654,52 @@ def spatial_softmax(features,
     ValueError: If unexpected data_format specified.
     ValueError: If num_channels dimension is unspecified.
   """
-  shape = array_ops.shape(features)
-  static_shape = features.shape
-  if data_format == DATA_FORMAT_NHWC:
-    height, width, num_channels = shape[1], shape[2], static_shape[3]
-  elif data_format == DATA_FORMAT_NCHW:
-    num_channels, height, width = static_shape[1], shape[2], shape[3]
-  else:
-    raise ValueError('data_format has to be either NCHW or NHWC.')
-  if num_channels.value is None:
-    raise ValueError('The num_channels dimension of the inputs to '
-                     '`spatial_softmax` should be defined. Found `None`.')
-
-  with ops.name_scope(name, 'spatial_softmax', [features]) as name:
-    # Create tensors for x and y coordinate values, scaled to range [-1, 1].
-    pos_x, pos_y = array_ops.meshgrid(math_ops.lin_space(-1., 1., num=height),
-                                      math_ops.lin_space(-1., 1., num=width),
-                                      indexing='ij')
-    pos_x = array_ops.reshape(pos_x, [height * width])
-    pos_y = array_ops.reshape(pos_y, [height * width])
-    if temperature is None:
-      temperature_collections = utils.get_variable_collections(
-          variables_collections, 'temperature')
-      temperature = variables.model_variable(
-          'temperature',
-          shape=(),
-          dtype=dtypes.float32,
-          initializer=init_ops.ones_initializer(),
-          collections=temperature_collections,
-          trainable=trainable)
-    if data_format == 'NCHW':
-      features = array_ops.reshape(features, [-1, height * width])
+  with variable_scope.variable_scope(name, 'spatial_softmax'):
+    shape = array_ops.shape(features)
+    static_shape = features.shape
+    if data_format == DATA_FORMAT_NHWC:
+      height, width, num_channels = shape[1], shape[2], static_shape[3]
+    elif data_format == DATA_FORMAT_NCHW:
+      num_channels, height, width = static_shape[1], shape[2], shape[3]
     else:
-      features = array_ops.reshape(
-          array_ops.transpose(features, [0, 3, 1, 2]), [-1, height * width])
-
-    softmax_attention = nn.softmax(features/temperature)
-    expected_x = math_ops.reduce_sum(
-        pos_x * softmax_attention, [1], keep_dims=True)
-    expected_y = math_ops.reduce_sum(
-        pos_y * softmax_attention, [1], keep_dims=True)
-    expected_xy = array_ops.concat([expected_x, expected_y], 1)
-    feature_keypoints = array_ops.reshape(
-        expected_xy, [-1, num_channels.value * 2])
-    feature_keypoints.set_shape([None, num_channels.value * 2])
-    return feature_keypoints
+      raise ValueError('data_format has to be either NCHW or NHWC.')
+    if num_channels.value is None:
+      raise ValueError('The num_channels dimension of the inputs to '
+                       '`spatial_softmax` should be defined. Found `None`.')
+
+    with ops.name_scope('spatial_softmax_op', 'spatial_softmax_op', [features]):
+      # Create tensors for x and y coordinate values, scaled to range [-1, 1].
+      pos_x, pos_y = array_ops.meshgrid(math_ops.lin_space(-1., 1., num=height),
+                                        math_ops.lin_space(-1., 1., num=width),
+                                        indexing='ij')
+      pos_x = array_ops.reshape(pos_x, [height * width])
+      pos_y = array_ops.reshape(pos_y, [height * width])
+      if temperature is None:
+        temperature_collections = utils.get_variable_collections(
+            variables_collections, 'temperature')
+        temperature = variables.model_variable(
+            'temperature',
+            shape=(),
+            dtype=dtypes.float32,
+            initializer=init_ops.ones_initializer(),
+            collections=temperature_collections,
+            trainable=trainable)
+      if data_format == 'NCHW':
+        features = array_ops.reshape(features, [-1, height * width])
+      else:
+        features = array_ops.reshape(
+            array_ops.transpose(features, [0, 3, 1, 2]), [-1, height * width])
+
+      softmax_attention = nn.softmax(features/temperature)
+      expected_x = math_ops.reduce_sum(
+          pos_x * softmax_attention, [1], keep_dims=True)
+      expected_y = math_ops.reduce_sum(
+          pos_y * softmax_attention, [1], keep_dims=True)
+      expected_xy = array_ops.concat([expected_x, expected_y], 1)
+      feature_keypoints = array_ops.reshape(
+          expected_xy, [-1, num_channels.value * 2])
+      feature_keypoints.set_shape([None, num_channels.value * 2])
+  return feature_keypoints
 
 
 def stack(inputs, layer, stack_args, **kwargs):
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 7c77e905f7432db4e42e7fda70aa72f32f40bb09..ae64b75d939ce0ffab300b01d3cfcb67a9d0da1c 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1345,11 +1345,20 @@ class DropoutTest(test.TestCase):
       num_elem_initial = math_ops.reduce_mean(math_ops.to_float(images > 0))
       output = _layers.dropout(images)
       num_elem = math_ops.reduce_mean(math_ops.to_float(output > 0))
-      sess.run(variables_lib.global_variables_initializer())
       num_elem, num_elem_initial = sess.run([num_elem, num_elem_initial])
       self.assertLess(num_elem, num_elem_initial / 2 + 0.1)
       self.assertGreater(num_elem, num_elem_initial / 2 - 0.1)
 
+  def testDropoutSeed(self):
+    """Test that providing the same seed produces the same result."""
+    height, width = 10, 10
+    with self.test_session() as sess:
+      images = random_ops.random_uniform(
+          (5, height, width, 3), seed=1, name='images')
+      output1 = _layers.dropout(images, seed=1)
+      output2 = _layers.dropout(images, seed=1)
+      self.assertAllEqual(*sess.run([output1, output2]))
+
   def testCreateDropoutNoTraining(self):
     height, width = 3, 3
     with self.test_session() as sess:
@@ -1358,7 +1367,6 @@ class DropoutTest(test.TestCase):
       num_elem_initial = math_ops.reduce_mean(math_ops.to_float(images > 0))
       output = _layers.dropout(images, is_training=False)
       num_elem = math_ops.reduce_mean(math_ops.to_float(output > 0))
-      sess.run(variables_lib.global_variables_initializer())
       num_elem, num_elem_initial = sess.run([num_elem, num_elem_initial])
       self.assertEqual(num_elem, num_elem_initial)
       outputs, inputs = sess.run([output, images])
@@ -1766,10 +1774,13 @@ class BatchNormTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'undefined'):
         _layers.batch_norm(inputs, data_format='NCHW')
 
-  def _testCreateOp(self, fused):
+  def _testCreateOp(self, fused, dtype=None):
+    if dtype is None:
+      dtype = dtypes.float32
     height, width = 3, 3
     with self.test_session():
-      images = np.random.uniform(size=(5, height, width, 3)).astype('f')
+      images = np.random.uniform(size=(5, height, width, 3)).astype(
+          dtype.as_numpy_dtype)
       output = _layers.batch_norm(images, fused=fused)
       expected_name = ('BatchNorm/FusedBatchNorm' if fused else
                        'BatchNorm/batchnorm')
@@ -1784,29 +1795,44 @@ class BatchNormTest(test.TestCase):
   def testCreateOpFused(self):
     self._testCreateOp(True)
 
-  def testCreateOpBetaRegularizer(self):
+  def testCreateOpFusedFloat16(self):
+    self._testCreateOp(True, dtypes.float16)
+
+  def _testCreateOpBetaRegularizer(self, fused=True):
     height, width = 3, 3
     with self.test_session():
       reg = lambda x: 0.1 * math_ops.reduce_sum(x)
       images = np.random.uniform(size=(5, height, width, 3)).astype('f')
-      _layers.batch_norm(images, param_regularizers={'beta': reg})
+      _layers.batch_norm(images, param_regularizers={'beta': reg}, fused=fused)
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
       beta_decay = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[0]
       self.assertEqual(beta_decay.op.name, 'BatchNorm/beta/Regularizer/mul')
 
-  def testCreateOpGammaRegularizer(self):
+  def testCreateOpBetaRegularizerFused(self):
+    self._testCreateOpBetaRegularizer(fused=True)
+
+  def testCreateOpBetaRegularizerNonFused(self):
+    self._testCreateOpBetaRegularizer(fused=False)
+
+  def _testCreateOpGammaRegularizer(self, fused=True):
     height, width = 3, 3
     with self.test_session():
       reg = lambda x: 0.1 * math_ops.reduce_sum(x)
       images = np.random.uniform(size=(5, height, width, 3)).astype('f')
       _layers.batch_norm(
-          images, param_regularizers={'gamma': reg}, scale=True)
+          images, param_regularizers={'gamma': reg}, scale=True, fused=fused)
       self.assertEqual(
           len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
       gamma_decay = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)[0]
       self.assertEqual(gamma_decay.op.name, 'BatchNorm/gamma/Regularizer/mul')
 
+  def testCreateOpGammaRegularizerFused(self):
+    self._testCreateOpGammaRegularizer(fused=True)
+
+  def testCreateOpGammaRegularizerNonFused(self):
+    self._testCreateOpGammaRegularizer(fused=False)
+
   def testCreateVariables(self):
     height, width = 3, 3
     with self.test_session():
@@ -2639,10 +2665,63 @@ class BatchNormTest(test.TestCase):
   def testBatchNormBeta(self):
     # Test case for 11673
     with self.test_session() as sess:
-      a = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
-      b = _layers.batch_norm(a, center=False, data_format='NCHW',
-                                       zero_debias_moving_mean=True)
+      a_32 = array_ops.placeholder(dtypes.float32, shape=(10, 10, 10, 10))
+      _layers.batch_norm(
+          a_32, center=False, data_format='NCHW', zero_debias_moving_mean=True)
+      a_16 = array_ops.placeholder(dtypes.float16, shape=(10, 10, 10, 10))
+      _layers.batch_norm(
+          a_16, center=False, data_format='NCHW', zero_debias_moving_mean=True)
+      sess.run(variables_lib.global_variables_initializer())
+
+  def testVariablesAreFloat32(self):
+    height, width = 3, 3
+    with self.test_session():
+      images = random_ops.random_uniform(
+          (5, height, width, 3), seed=1, dtype=dtypes.float16)
+      _layers.batch_norm(images, scale=True)
+      beta = variables.get_variables_by_name('beta')[0]
+      gamma = variables.get_variables_by_name('gamma')[0]
+      self.assertEqual(beta.dtype, dtypes.float32_ref)
+      self.assertEqual(gamma.dtype, dtypes.float32_ref)
+      moving_mean = variables.get_variables_by_name('moving_mean')[0]
+      moving_variance = variables.get_variables_by_name('moving_variance')[0]
+      self.assertEqual(moving_mean.dtype, dtypes.float32_ref)
+      self.assertEqual(moving_variance.dtype, dtypes.float32_ref)
+
+  def _runFusedBatchNorm(self, shape, dtype):
+    channels = shape[1]
+    images = np.arange(np.product(shape), dtype=dtype).reshape(shape)
+    beta = init_ops.constant_initializer(
+        np.arange(2, channels + 2, dtype=np.float32))
+    gamma = init_ops.constant_initializer(
+        np.arange(10, channels + 10, dtype=np.float32) * 2.0)
+    mean = init_ops.constant_initializer(
+        np.arange(3, channels + 3, dtype=np.float32) * 5.0)
+    variance = init_ops.constant_initializer(
+        np.arange(1, channels + 1, dtype=np.float32) * 4.0)
+    output = _layers.batch_norm(
+        images,
+        fused=True,
+        is_training=True,
+        scale=True,
+        epsilon=0.5,
+        param_initializers={
+            'beta': beta,
+            'gamma': gamma,
+            'moving_mean': mean,
+            'moving_variance': variance,
+        },
+        data_format='NCHW')
+    with self.test_session(use_gpu=True) as sess:
       sess.run(variables_lib.global_variables_initializer())
+      return sess.run(output)
+
+  def testFusedBatchNormFloat16MatchesFloat32(self):
+    if test.is_gpu_available(cuda_only=True):
+      shape = [5, 4, 2, 3]
+      res_32 = self._runFusedBatchNorm(shape, np.float32)
+      res_16 = self._runFusedBatchNorm(shape, np.float16)
+      self.assertAllClose(res_32, res_16, rtol=1e-3)
 
   def testAdjustmentCreated(self):
     # Tests that the adjustment is appropriately passed to and used by the core
@@ -3247,16 +3326,24 @@ class SeparableConv2dTest(test.TestCase):
           for model_variable in model_variables:
             self.assertEqual(trainable, model_variable in trainable_variables)
 
-  def testConvNCHW(self):
-    for num_filters, correct_output_filters in [(None, 6), (8, 8)]:
+  def testSepConvNCHW(self):
+    for num_filters, correct_output_filters in zip((None, 5), (6, 5)):
       with self.test_session():
-        batch, height, width = 4, 5, 6
+        batch, height, width = 4, 10, 12
+        kernel_dim, stride = 3, 2
         images = random_ops.random_uniform((batch, 3, height, width), seed=1)
         output = layers_lib.separable_conv2d(
-            images, num_filters, [3, 3], 2, padding='VALID', data_format='NCHW')
-        self.assertListEqual(
-            output.get_shape().as_list(), [batch, correct_output_filters,
-                                           height - 2, width - 2])
+            images,
+            num_outputs=num_filters,
+            kernel_size=[kernel_dim, kernel_dim],
+            depth_multiplier=2,
+            stride=stride,
+            padding='VALID',
+            data_format='NCHW')
+        self.assertListEqual(output.get_shape().as_list(), [
+            batch, correct_output_filters, (height - kernel_dim + 1) // stride,
+            (width - kernel_dim + 1) // stride
+        ])
 
 
 class ScaleGradientTests(test.TestCase):
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..123275e1fde047cd3772528641b2e3b09742fbdc
--- /dev/null
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -0,0 +1,583 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Reversible Residual Block.
+
+From
+[The Reversible Residual Network: Backpropagation Without Storing
+Activations](https://arxiv.org/abs/1707.04585).
+
+Also contains the @recompute_grad decorator, which recomputes the forward
+function on the backwards pass.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import re
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.framework.python import ops as contrib_framework_ops
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.layers import base
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+__all__ = ["rev_block", "RevBlock", "recompute_grad"]
+
+LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*")
+
+
+def _acc_grads(*lists_of_grads):
+  """Accumulates lists of gradients."""
+  acc_grads = []
+  for grads in zip(*lists_of_grads):
+    grads = [g for g in grads if g is not None]
+    if grads:
+      acc_grads.append(math_ops.add_n(grads))
+    else:
+      acc_grads.append(None)
+  return acc_grads
+
+
+def _rev_layer_forward(xs, f, g, f_side_input, g_side_input,
+                       gate_outputs=False):
+  """Forward for 1 reversible layer."""
+  x1, x2 = xs
+  y1 = x1 + (f(x2, f_side_input) if f_side_input else f(x2))
+  y2 = x2 + (g(y1, g_side_input) if g_side_input else g(y1))
+  if gate_outputs:
+    return control_flow_ops.tuple([y1, y2])
+  else:
+    return (y1, y2)
+
+
+def _rev_layer_backward(ys, grad_ys, f, g, f_vars, f_side_input, g_vars,
+                        g_side_input):
+  """Backprop for 1 layer."""
+  y1, y2 = ys
+  grad_y1, grad_y2 = grad_ys
+
+  # Reconstruct intermediates and inputs (x1, x2)
+  # stop_gradients required on fn inputs to prevent infinite recursion into this
+  # grad function on the calls to gradients.
+  y1_stop = array_ops.stop_gradient(y1)
+  g_side_input = [array_ops.stop_gradient(t) for t in g_side_input]
+  gy1 = g(y1_stop, g_side_input) if g_side_input else g(y1_stop)
+
+  x2 = y2 - gy1
+  x2_stop = array_ops.stop_gradient(x2)
+  f_side_input = [array_ops.stop_gradient(t) for t in f_side_input]
+  fx2 = f(x2_stop, f_side_input) if f_side_input else f(x2_stop)
+
+  x1 = y1 - fx2
+
+  # Compute gradients wrt to inputs
+  # dL/dy2 * dG(y1)/y1
+  grad_gy1_y2 = gradients_impl.gradients(gy1, y1_stop, grad_y2)[0]
+  grad_x1 = grad_y1 + grad_gy1_y2
+  grad_x2 = (
+      gradients_impl.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 +
+      gradients_impl.gradients(fx2, x2_stop, grad_gy1_y2)[0])
+
+  # Compute gradients wrt to vars and side inputs in f and g
+  grads1 = gradients_impl.gradients(gy1, g_vars + g_side_input, grad_y2)
+  grad_g_vars, grad_g_side = grads1[:len(g_vars)], grads1[len(g_vars):]
+  grads2 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_y1)
+  grad_f_y1, grad_f_side1 = grads2[:len(f_vars)], grads2[len(f_vars):]
+  grads3 = gradients_impl.gradients(fx2, f_vars + f_side_input, grad_gy1_y2)
+  grad_f_y2, grad_f_side2 = grads3[:len(f_vars)], grads3[len(f_vars):]
+  grad_f_vars = _acc_grads(grad_f_y1, grad_f_y2)
+
+  grad_f_side = _acc_grads(grad_f_side1, grad_f_side2)
+
+  # Put returns in a tuple to ensure a constant memory budget (i.e. don't want
+  # the subsequent layer to start computing and consuming memory based on a
+  # subset of these values).
+  outputs = ((x1, x2), (grad_x1, grad_x2), (grad_f_vars, grad_f_side),
+             (grad_g_vars, grad_g_side))
+  tupled = control_flow_ops.tuple(nest.flatten(outputs))
+  return nest.pack_sequence_as(outputs, tupled)
+
+
+def _rev_block_forward(x1,
+                       x2,
+                       f,
+                       g,
+                       num_layers=1,
+                       f_side_input=None,
+                       g_side_input=None,
+                       gate_outputs=False):
+  """Forward for a series of reversible layers."""
+  out = (x1, x2)
+  for i in xrange(num_layers):
+    out = _rev_layer_forward(
+        out, f[i], g[i], f_side_input, g_side_input, gate_outputs=gate_outputs)
+
+  y1, y2 = out
+  return y1, y2
+
+
+def _scope_wrap(fn, scope):
+
+  @functools.wraps(fn)
+  def wrap(*args, **kwargs):
+    with variable_scope.variable_scope(scope):
+      return fn(*args, **kwargs)
+
+  return wrap
+
+
+class RevBlock(base.Layer):
+  """Block of reversible layers. See rev_block."""
+
+  def __init__(self,
+               f,
+               g,
+               num_layers=1,
+               f_side_input=None,
+               g_side_input=None,
+               use_efficient_backprop=True,
+               name="revblock",
+               **kwargs):
+    super(RevBlock, self).__init__(name=name, **kwargs)
+
+    if isinstance(f, list):
+      assert len(f) == num_layers
+    else:
+      f = [f] * num_layers
+
+    if isinstance(g, list):
+      assert len(g) == num_layers
+    else:
+      g = [g] * num_layers
+
+    f = [_scope_wrap(fn, "revlayer_%d/f" % i) for i, fn in enumerate(f)]
+    g = [_scope_wrap(fn, "revlayer_%d/g" % i) for i, fn in enumerate(g)]
+
+    self.f = f
+    self.g = g
+
+    self.num_layers = num_layers
+    self.f_side_input = f_side_input or []
+    self.g_side_input = g_side_input or []
+
+    self._use_efficient_backprop = use_efficient_backprop
+
+  def call(self, inputs, forward=True):
+    vs = variable_scope.get_variable_scope()
+    vars_before = vs.global_variables()
+
+    if forward:
+      x1, x2 = inputs
+      out = self._forward(x1, x2)
+    else:
+      y1, y2 = inputs
+      out = self._backward(y1, y2)
+
+    # Add any created variables to the Layer's variable stores
+    new_vars = vs.global_variables()[len(vars_before):]
+    train_vars = vs.trainable_variables()
+    for new_var in new_vars:
+      if new_var in train_vars:
+        self._trainable_weights.append(new_var)
+      else:
+        self._non_trainable_weights.append(new_var)
+
+    return out
+
+  def forward(self, x1, x2):
+    return self.apply([x1, x2])
+
+  def backward(self, y1, y2):
+    return self.apply([y1, y2], forward=False)
+
+  def build(self, _):
+    logging.warn("RevBlock constructs its variables on first call, not on "
+                 "build.")
+    self.built = True
+
+  def _efficient_grad_fn(self, inputs, variables, ys, grad_ys):
+    """Custom gradient fn for a block of reversible residual layers."""
+    side_inputs = inputs[2:]
+    f_side_idxs = [None] * len(self.f_side_input)
+    g_side_idxs = [None] * len(self.g_side_input)
+    assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input)
+
+    for i, t in enumerate(side_inputs):
+      if t in self.f_side_input:
+        f_side_idxs[self.f_side_input.index(t)] = i
+      elif t in self.g_side_input:
+        g_side_idxs[self.g_side_input.index(t)] = i
+      else:
+        assert False
+
+    f_vars = [[] for _ in range(self.num_layers)]
+    g_vars = [[] for _ in range(self.num_layers)]
+    f_vars_idxs = [[] for _ in range(self.num_layers)]
+    g_vars_idxs = [[] for _ in range(self.num_layers)]
+
+    for i, t in enumerate(variables):
+      ref = _underlying_variable_ref(t)
+
+      # Use the name to identify the layer number and function (f or g)
+      regex = LAYER_RE.match(ref.name)
+      layer_no = int(regex.group(1))
+      fn_name = regex.group(2)
+      if fn_name == "f":
+        f_vars[layer_no].append(ref)
+        f_vars_idxs[layer_no].append(i)
+      else:
+        assert fn_name == "g"
+        g_vars[layer_no].append(ref)
+        g_vars_idxs[layer_no].append(i)
+
+    f_var_grads = []
+    g_var_grads = []
+    f_side_grads = []
+    g_side_grads = []
+
+    # Reverse variable containers to go backward
+    f_vars.reverse()
+    g_vars.reverse()
+    f = list(self.f)
+    g = list(self.g)
+    f.reverse()
+    g.reverse()
+
+    with variable_scope.variable_scope(self.scope_name, reuse=True):
+      for i in xrange(self.num_layers):
+        ys, grad_ys, f_ret, g_ret = _rev_layer_backward(
+            ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i],
+            self.g_side_input)
+
+        grad_f_vars, grad_f_side = f_ret
+        grad_g_vars, grad_g_side = g_ret
+        f_var_grads.append(grad_f_vars)
+        g_var_grads.append(grad_g_vars)
+        f_side_grads.append(grad_f_side)
+        g_side_grads.append(grad_g_side)
+
+    # Accumulate layer gradients for f_side_input and g_side_input
+    acc_f_side_grads = _acc_grads(*f_side_grads)
+    acc_g_side_grads = _acc_grads(*g_side_grads)
+
+    # Use the stored idxs to put gradients in the passed-in order.
+    side_input_grads = [None] * len(side_inputs)
+    variable_grads = [None] * len(variables)
+
+    # Variable gradients were collected in reverse layer order. Reverse to match
+    # idxs.
+    f_var_grads.reverse()
+    g_var_grads.reverse()
+    for idxs, grads in list(zip(f_vars_idxs, f_var_grads)) + list(
+        zip(g_vars_idxs, g_var_grads)):
+      for i, grad in zip(idxs, grads):
+        variable_grads[i] = grad
+
+    for i, grad in zip(f_side_idxs, acc_f_side_grads):
+      side_input_grads[i] = grad
+    for i, grad in zip(g_side_idxs, acc_g_side_grads):
+      side_input_grads[i] = grad
+
+    grad_x1, grad_x2 = grad_ys
+    return [grad_x1, grad_x2] + side_input_grads, variable_grads
+
+  def _forward(self, x1, x2):
+    """Run forward through the reversible layers."""
+
+    side_inputs = [self.f_side_input, self.g_side_input]
+    flat_side_inputs = nest.flatten(side_inputs)
+
+    custom_grad_fn = (
+        self._efficient_grad_fn if self._use_efficient_backprop else None)
+
+    @_fn_with_custom_grad(custom_grad_fn)
+    def _forward_wrap(x1_, x2_, *flat_side_inputs):
+      f_side, g_side = nest.pack_sequence_as(side_inputs, flat_side_inputs)
+      return _rev_block_forward(
+          x1_,
+          x2_,
+          self.f,
+          self.g,
+          num_layers=self.num_layers,
+          f_side_input=f_side,
+          g_side_input=g_side,
+          gate_outputs=self._use_efficient_backprop)
+
+    return _forward_wrap(x1, x2, *flat_side_inputs)
+
+  def _backward(self, y1, y2):
+    """Run backward through the reversible layers."""
+
+    f = list(self.f)
+    g = list(self.g)
+    f.reverse()
+    g.reverse()
+
+    for i in xrange(self.num_layers):
+      gy1 = g[i](y1, self.g_side_input) if self.g_side_input else g[i](y1)
+      x2 = y2 - gy1
+      fx2 = f[i](x2, self.f_side_input) if self.f_side_input else f[i](x2)
+      x1 = y1 - fx2
+
+      y1, y2 = x1, x2
+
+    return x1, x2
+
+
+def rev_block(x1,
+              x2,
+              f,
+              g,
+              num_layers=1,
+              f_side_input=None,
+              g_side_input=None,
+              is_training=True):
+  """A block of reversible residual layers.
+
+  A reversible residual layer is defined as:
+
+  ```
+  y1 = x1 + f(x2, f_side_input)
+  y2 = x2 + g(y1, g_side_input)
+  ```
+
+  A reversible residual block, defined here, is a series of reversible residual
+  layers.
+
+  Limitations:
+  * f and g must not close over any Tensors; all side inputs to f and g should
+    be passed in with f_side_input and g_side_input which will be forwarded to
+    f and g.
+  * f and g must not change the dimensionality of their inputs in order for the
+    addition in the equations above to work.
+
+  Args:
+    x1: a float Tensor.
+    x2: a float Tensor.
+    f: a function, (Tensor) -> (Tensor) (or list of such of length num_layers).
+      Should not change the shape of the Tensor. Can make calls to get_variable.
+      See f_side_input if there are side inputs.
+    g: a function, (Tensor) -> (Tensor) (or list of such of length num_layers).
+      Should not change the shape of the Tensor. Can make calls to get_variable.
+      See g_side_input if there are side inputs.
+    num_layers: int, number of reversible residual layers. Each layer will
+      apply f and g according to the equations above, with new variables in each
+      layer.
+    f_side_input: list of Tensors, side input to f. If not None, signature of f
+      should be (Tensor, list<Tensor>) -> (Tensor).
+    g_side_input: list of Tensors, side input to g. If not None, signature of g
+      should be (Tensor, list<Tensor>) -> (Tensor).
+    is_training: bool, whether to actually use the efficient backprop codepath.
+
+  Returns:
+    y1, y2: tuple of float Tensors.
+  """
+  block = RevBlock(
+      f=f,
+      g=g,
+      num_layers=num_layers,
+      f_side_input=f_side_input,
+      g_side_input=g_side_input,
+      use_efficient_backprop=is_training,
+      _reuse=variable_scope.get_variable_scope().reuse)
+  return block.forward(x1, x2)
+
+
+def recompute_grad(fn):
+  """Decorator that recomputes the function on the backwards pass.
+
+  Args:
+    fn: a function that takes Tensors (all as positional arguments) and returns
+      a tuple of Tensors.
+
+  Returns:
+    A wrapped fn that is identical to fn when called, but its activations will
+    be discarded and recomputed on the backwards pass (i.e. on a call to
+    tf.gradients).
+  """
+
+  @functools.wraps(fn)
+  def wrapped(*args):
+    return _recompute_grad(fn, args)
+
+  return wrapped
+
+
+def _recompute_grad(fn, args):
+  """See recompute_grad."""
+
+  cached_vs = []
+  cached_arg_scope = []
+
+  def grad_fn(inputs, variables, outputs, output_grads):
+    """Recompute outputs for gradient computation."""
+    del outputs
+    # Recompute outputs
+    with framework_ops.control_dependencies(output_grads):
+      with contrib_framework_ops.arg_scope(cached_arg_scope[0]):
+        with variable_scope.variable_scope(cached_vs[0], reuse=True):
+          outputs = fn(*inputs)
+
+    if not (isinstance(outputs, list) or isinstance(outputs, tuple)):
+      outputs = [outputs]
+    outputs = list(outputs)
+    grads = gradients_impl.gradients(outputs, inputs + variables, output_grads)
+    grad_inputs = grads[:len(inputs)]
+    grad_vars = grads[len(inputs):]
+    return grad_inputs, grad_vars
+
+  @_fn_with_custom_grad(grad_fn)
+  def fn_with_recompute(*args):
+    cached_vs.append(variable_scope.get_variable_scope())
+    # TODO(rsepassi): Rm conditional in TF 1.4
+    if hasattr(contrib_framework_ops, "current_arg_scope"):
+      cached_arg_scope.append(contrib_framework_ops.current_arg_scope())
+    else:
+      cached_arg_scope.append({})
+    return fn(*args)
+
+  return fn_with_recompute(*args)
+
+
+def _underlying_variable_ref(t):
+  """Find the underlying variable ref.
+
+  Traverses through Identity, ReadVariableOp, and Enter ops.
+  Stops when op type has Variable or VarHandle in name.
+
+  Args:
+    t: a Tensor
+
+  Returns:
+    a Tensor that is a variable ref, or None on error.
+  """
+  while t.op.type in ["Identity", "ReadVariableOp", "Enter"]:
+    t = t.op.inputs[0]
+
+  op_type = t.op.type
+  if "Variable" in op_type or "VarHandle" in op_type:
+    return t
+  else:
+    return None
+
+
+def _fn_with_custom_grad(grad_fn, use_global_vars=False):
+  """Decorator to create a subgraph with a custom gradient function.
+
+  The subgraph created by the decorated function is NOT put in a Defun and so
+  does not suffer from the limitations of the Defun (all subgraph ops on the
+  same device, no summaries).
+
+  Args:
+    grad_fn: function with signature
+      (inputs, variables, outputs, output_grads) -> (grad_inputs, grad_vars),
+      all of which are lists of Tensors.
+    use_global_vars: if True, variables will be the global variables created.
+      If False, will be the trainable variables.
+
+  Returns:
+    Decorator for function such that the gradient is defined by grad_fn.
+  """
+
+  def dec(fn):
+
+    @functools.wraps(fn)
+    def wrapped(*args):
+      return _fn_with_custom_grad_internal(
+          fn, args, grad_fn, use_global_vars=use_global_vars)
+
+    return wrapped
+
+  return dec
+
+
+def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
+  """Create a subgraph with a custom gradient.
+
+  Args:
+    fn: function that takes inputs as arguments and produces 1 or more Tensors.
+    inputs: list<Tensor>, will be passed as fn(*inputs).
+    grad_fn: function with signature
+      (inputs, vars, outputs, output_grads) -> (grad_inputs, grad_vars),
+      all of which are lists of Tensors.
+    use_global_vars: if True, variables will be the global variables created.
+      If False, will be the trainable variables.
+
+  Returns:
+    fn(*inputs)
+  """
+  vs = variable_scope.get_variable_scope()
+  get_vars_fn = (
+      vs.global_variables if use_global_vars else vs.trainable_variables)
+  len_before_vars = len(get_vars_fn())
+  inputs = list(inputs)
+  outputs = fn(*inputs)
+  train_vars = get_vars_fn()[len_before_vars:]
+
+  if grad_fn is None:
+    return outputs
+
+  if not (isinstance(outputs, tuple) or isinstance(outputs, list)):
+    outputs = [outputs]
+  outputs = list(outputs)
+
+  defun_inputs = [inputs, train_vars, outputs]
+
+  def custom_grad_fn(op, *dys):
+    """Custom grad fn applying grad_fn for identity Defun."""
+    fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as(
+        defun_inputs, list(op.inputs))
+    dys = list(dys)
+    assert len(fn_outputs) == len(outputs)
+    assert len(fn_outputs) == len(dys)
+
+    grad_inputs, grad_vars = grad_fn(fn_inputs, fn_vars, fn_outputs, dys)
+    grad_outputs = [None] * len(fn_outputs)
+    return tuple(grad_inputs + grad_vars + grad_outputs)
+
+  # The Defun takes as input the original inputs, the trainable variables
+  # created in fn, and the outputs. In the forward it passes through the
+  # outputs. In the backwards, it produces gradients for the original inputs
+  # and the trainable variables.
+  in_types = [t.dtype for t in inputs]
+  out_types = [t.dtype for t in outputs]
+  var_types = [t.dtype for t in train_vars]
+
+  # Get a unique name for the Defun
+  with framework_ops.name_scope("identity_custom_grad") as ns:
+    defun_name = ns
+
+  @function.Defun(
+      *(in_types + var_types + out_types),
+      func_name=defun_name,
+      python_grad_func=custom_grad_fn,
+      shape_func=lambda _: [t.get_shape() for t in outputs])
+  def identity(*args):
+    _, _, outs = nest.pack_sequence_as(defun_inputs, args)
+    return tuple([array_ops.identity(t) for t in outs])
+
+  flat_inputs = nest.flatten(defun_inputs)
+  id_out = identity(*flat_inputs)
+  return id_out
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbcbcd75114a522b95631e4e7e95c1641b0a9987
--- /dev/null
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -0,0 +1,364 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for RevBlock."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.layers.python.layers import layers
+from tensorflow.contrib.layers.python.layers import rev_block_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.layers import convolutional
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class RevBlockTest(test.TestCase):
+  CHANNELS = 8
+  NUM_LAYERS = 4
+  BATCH_SIZE = 16
+
+  def testForwardBackward(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    block = rev_block_lib.RevBlock(f, g, num_layers=3)
+    y1, y2 = block.forward(x1, x2)
+    x1_inv, x2_inv = block.backward(y1, y2)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      x1, x2, x1_inv, x2_inv = sess.run([x1, x2, x1_inv, x2_inv])
+
+      self.assertAllClose(x1, x1_inv)
+      self.assertAllClose(x2, x2_inv)
+
+  def testBackwardForward(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    y = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    y1, y2 = array_ops.split(y, 2, axis=-1)
+
+    block = rev_block_lib.RevBlock(f, g, num_layers=3)
+    x1, x2 = block.backward(y1, y2)
+    y1_inv, y2_inv = block.forward(x1, x2)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      y1, y2, y1_inv, y2_inv = sess.run([y1, y2, y1_inv, y2_inv])
+
+      self.assertAllClose(y1, y1_inv)
+      self.assertAllClose(y2, y2_inv)
+
+  def _testRevBlock(self,
+                    x=None,
+                    f=None,
+                    g=None,
+                    f_side_input=None,
+                    g_side_input=None):
+    random_seed.set_random_seed(1234)
+
+    if f is None:
+
+      def f(x):  # pylint: disable=function-redefined
+        return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    if g is None:
+
+      def g(x):  # pylint: disable=function-redefined
+        return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)
+
+    if f_side_input is None:
+      f_side_input = []
+
+    if g_side_input is None:
+      g_side_input = []
+
+    if x is None:
+      x = random_ops.random_uniform(
+          [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    with variable_scope.variable_scope("rev_test") as vs:
+      y1_rev, y2_rev = rev_block_lib.rev_block(
+          x1,
+          x2,
+          f,
+          g,
+          f_side_input=f_side_input,
+          g_side_input=g_side_input,
+          num_layers=self.NUM_LAYERS)
+      y_rev = array_ops.concat([y1_rev, y2_rev], axis=1)
+      fg_vars = vs.trainable_variables()
+
+    num_vars = len(variables.global_variables())
+    with variable_scope.variable_scope(vs, reuse=True):
+      y1, y2 = rev_block_lib.rev_block(
+          x1,
+          x2,
+          f,
+          g,
+          f_side_input=f_side_input,
+          g_side_input=g_side_input,
+          num_layers=self.NUM_LAYERS,
+          is_training=False)
+      y = array_ops.concat([y1, y2], axis=1)
+    # Ensure no new vars were created - full reuse
+    assert len(variables.global_variables()) == num_vars
+
+    loss_rev = math_ops.reduce_mean(y_rev + 10.)
+    loss = math_ops.reduce_mean(y + 10.)
+
+    wrt = [x] + f_side_input + g_side_input + fg_vars
+    grads_rev = gradients_impl.gradients(loss_rev, wrt)
+    grads = gradients_impl.gradients(loss, wrt)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      y_val, yd_val, gd_val, g_val = sess.run([y, y_rev, grads_rev, grads])
+      self.assertAllClose(y_val, yd_val)
+      for g1, g2 in zip(gd_val, g_val):
+        self.assertAllClose(g1, g2)
+
+  def testRevBlock(self):
+    self._testRevBlock()
+
+  def testSideInput(self):
+    f_side_input = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS // 2])
+
+    def f(x, side_input):
+      return core_layers.dense(
+          x, self.CHANNELS // 2, use_bias=True) + side_input[0]
+
+    self._testRevBlock(f=f, f_side_input=[f_side_input])
+
+  def testMultipleFns(self):
+
+    def f1(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    def f2(x):
+      return core_layers.dense(x, self.CHANNELS // 2, activation=nn_ops.relu)
+
+    self._testRevBlock(f=[f1, f2, f1, f2])
+
+  # TODO(rsepassi): Recent change to conv seems to have broken this test. Find
+  # out why.
+  def _testConvAndBatchNorm(self):
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, 10, self.CHANNELS], dtype=dtypes.float32)
+
+    def f(x):
+      x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
+      x = layers.batch_norm(x, is_training=True)
+      x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same")
+      x = layers.batch_norm(x, is_training=True)
+      return x
+
+    self._testRevBlock(x=x, f=f)
+
+  def testReuse(self):
+
+    def f(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    def g(x):
+      return core_layers.dense(x, self.CHANNELS // 2)
+
+    x = random_ops.random_uniform(
+        [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
+    x1, x2 = array_ops.split(x, 2, axis=-1)
+
+    with variable_scope.variable_scope("test"):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_before = len(variables.global_variables())
+
+    with variable_scope.variable_scope("test", reuse=True):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_after = len(variables.global_variables())
+    self.assertEqual(num_vars_before, num_vars_after)
+
+    loss = math_ops.reduce_mean(y1 + y2)
+    _ = gradients_impl.gradients(loss,
+                                 [x] + variables.trainable_variables())
+
+    with variable_scope.variable_scope("test", reuse=True):
+      y1, y2 = rev_block_lib.rev_block(x1, x2, f, g, num_layers=self.NUM_LAYERS)
+
+    num_vars_after = len(variables.global_variables())
+    self.assertEqual(num_vars_before, num_vars_after)
+
+
+class RecomputeTest(test.TestCase):
+
+  def testRecompute(self):
+
+    def layer(x, name=None):
+      with variable_scope.variable_scope(name, default_name="layer"):
+        x = layers.layer_norm(x)
+        x = convolutional.conv1d(
+            x,
+            10,
+            1,
+            use_bias=False,
+            kernel_initializer=init_ops.constant_initializer(42.42))
+        x = nn_ops.relu(x)
+        return x
+
+    def fn(x):
+      out = x
+      for _ in range(3):
+        out = layer(out)
+      return out
+
+    @rev_block_lib.recompute_grad
+    def fn_recompute(x):
+      return fn(x)
+
+    x = random_ops.random_uniform((3, 1, 3))
+    recompute_vars = None
+    with variable_scope.variable_scope("recompute") as vs:
+      out1 = math_ops.reduce_sum(fn_recompute(x))
+      recompute_vars = vs.trainable_variables()
+    reg_vars = None
+    with variable_scope.variable_scope("regular") as vs:
+      out2 = math_ops.reduce_sum(fn(x))
+      reg_vars = vs.trainable_variables()
+
+    grad1 = gradients_impl.gradients(out1, recompute_vars)
+    grad2 = gradients_impl.gradients(out2, reg_vars)
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      outs = sess.run([out1, out2, grad1, grad2])
+      self.assertAllClose(outs[0], outs[1])
+      for g1, g2 in zip(outs[2], outs[3]):
+        self.assertAllClose(g1, g2)
+
+
+class FnWithCustomGradTest(test.TestCase):
+
+  def testCorrectness(self):
+
+    w = random_ops.random_uniform([6, 10])
+
+    def fn(a, b, c):
+      return core_layers.dense(
+          a,
+          10,
+          use_bias=False,
+          kernel_initializer=lambda shape, dtype, partition_info: w
+      ) + math_ops.matmul(b, c)
+
+    def grad_fn(inputs, trainable_variables, outputs, grad_outputs):
+      outputs = outputs[0]
+      grad_outputs = grad_outputs[0]
+      grad_inputs = gradients_impl.gradients(
+          outputs, inputs, grad_ys=grad_outputs)
+      grad_vars = gradients_impl.gradients(
+          outputs, trainable_variables, grad_ys=grad_outputs)
+      return grad_inputs, grad_vars
+
+    custom_fn = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)
+
+    a = random_ops.random_uniform([11, 6])
+    b = random_ops.random_uniform([11, 7])
+    c = random_ops.random_uniform([7, 10])
+
+    out = fn(a, b, c)
+    custom_out = custom_fn(a, b, c)
+    self.assertEqual(out.get_shape().as_list(),
+                     custom_out.get_shape().as_list())
+
+    loss = math_ops.reduce_mean(out)
+    custom_loss = math_ops.reduce_mean(custom_out)
+
+    grads = gradients_impl.gradients(
+        loss, [a, b, c] + [variables.trainable_variables()[0]])
+    custom_grads = gradients_impl.gradients(
+        custom_loss, [a, b, c] + [variables.trainable_variables()[1]])
+
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      out_val, custom_out_val, grads_val, custom_grads_val = sess.run(
+          [out, custom_out, grads, custom_grads])
+      self.assertAllClose(out_val, custom_out_val)
+      for g1, g2 in zip(grads_val, custom_grads_val):
+        self.assertAllClose(g1, g2)
+
+  def testCustomGrad(self):
+
+    def fn(a, b, c):
+      return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c)
+
+    def grad_fn(inputs, trainable_variables, unused_outputs,
+                unused_grad_outputs):
+      grad_inputs = [
+          array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs)
+      ]
+      grad_vars = [
+          array_ops.ones_like(t) * (i + len(inputs) + 1.)
+          for i, t in enumerate(trainable_variables)
+      ]
+      return grad_inputs, grad_vars
+
+    a = random_ops.random_uniform([11, 6])
+    b = random_ops.random_uniform([11, 7])
+    c = random_ops.random_uniform([7, 10])
+    w = random_ops.random_uniform([6, 10])
+    out = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)(a, b, c)
+    loss = math_ops.reduce_mean(out)
+    grads = gradients_impl.gradients(
+        loss, [a, b, c, variables.trainable_variables()[0]])
+    expected_grads = [
+        array_ops.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w])
+    ]
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      g_val, eg_val = sess.run([grads, expected_grads])
+      for g1, g2 in zip(g_val, eg_val):
+        self.assertAllClose(g1, g2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index ac615b120c16d5d9a7798874653f8f00f8fd15b4..33f509ec121af6484411ab898fda37179511b708 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -10,7 +10,7 @@ package(default_visibility = [
     "//tensorflow:internal",
 ])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "py_test", "tf_py_test")
 
 py_library(
     name = "learn",
@@ -22,6 +22,8 @@ py_library(
         exclude = ["python/learn/**/*_test.py"],
     ),
     srcs_version = "PY2AND3",
+    # This library should not depend on sklearn, even though some of the code
+    # refers to it. (The code handles the presence of sklearn conditionally.)
     deps = [
         "//tensorflow/contrib/factorization:factorization_py",
         "//tensorflow/contrib/framework:framework_py",
@@ -55,6 +57,7 @@ py_library(
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
         "//tensorflow/python:nn",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:partitioned_variables",
@@ -76,6 +79,7 @@ py_library(
         "//tensorflow/python:weights_broadcast_ops",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:export_export",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:inputs",
         "//tensorflow/python/estimator:inputs_queues",
@@ -85,6 +89,7 @@ py_library(
         "//tensorflow/python/estimator:run_config",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:loader",
         "//tensorflow/python/saved_model:signature_constants",
@@ -131,6 +136,7 @@ py_test(
         "//tensorflow/contrib/learn/python/learn/datasets",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -148,17 +154,17 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "experiment_test",
     size = "medium",
     srcs = ["python/learn/experiment_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":learn",
+        "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
@@ -198,6 +204,7 @@ py_test(
         "//tensorflow/contrib/training:training_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform",
+        "//tensorflow/python/estimator:run_config",
     ],
 )
 
@@ -216,6 +223,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
@@ -278,6 +286,8 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:protos_all_py",
+        "//tensorflow/python:session",
+        "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
@@ -319,12 +329,12 @@ py_test(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn/python/learn/datasets",
-        "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
     ],
 )
@@ -363,10 +373,10 @@ py_test(
     deps = [
         ":learn",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/losses",
@@ -430,7 +440,6 @@ py_test(
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:functional_ops",
@@ -439,6 +448,7 @@ py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
@@ -450,6 +460,7 @@ py_test(
     size = "medium",
     srcs = ["python/learn/estimators/state_saving_rnn_estimator_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["noasan"],
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -575,10 +586,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/saved_model:signature_constants",
@@ -631,9 +642,9 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:session",
         "//third_party/py/numpy",
     ],
 )
@@ -704,12 +715,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_io_test",
     size = "small",
     srcs = ["python/learn/learn_io/graph_io_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":learn",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -721,9 +731,11 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
 )
 
 py_test(
@@ -770,11 +782,12 @@ py_test(
         "//tensorflow/contrib/session_bundle:exporter",
         "//tensorflow/contrib/session_bundle:manifest_proto_py_pb2",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -822,12 +835,9 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":learn",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
+        "//tensorflow/python:dtypes",
     ],
 )
 
@@ -855,7 +865,6 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:platform",
     ],
 )
diff --git a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
index 14750961efa30128708430fac038498de0a42118..ef5e620e8f08cffa7c2b945089aa5d150baefefc 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/composable_model_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import composable_model
@@ -55,7 +55,7 @@ def _base_model_fn(features, labels, mode, params):
     raise NotImplementedError
 
   def _train_op_fn(loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     assert global_step
     train_step = model.get_train_step(loss)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index cb15ef23e95d27c737d8ae08065b804bafd39a07..c17b41c0f767e19d9c3635a8f60347a49b297cfb 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -23,7 +23,7 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -189,7 +189,7 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
       """Returns the op to optimize the loss."""
       return optimizers.optimize_loss(
           loss=loss,
-          global_step=contrib_variables.get_global_step(),
+          global_step=training_util.get_global_step(),
           learning_rate=_LEARNING_RATE,
           optimizer=_get_optimizer(optimizer),
           gradient_multipliers=(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 788d2d0b1a58fad16712c968593b40de0d3979f0..05ed8b3409e68ae54e5ef89b3a1592a6f285565b 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -30,7 +30,6 @@ import six
 
 from google.protobuf import message
 from tensorflow.contrib import layers
-from tensorflow.contrib import metrics as metrics_lib
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_args
 from tensorflow.contrib.framework import list_variables
@@ -60,6 +59,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import resources
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -1230,7 +1230,7 @@ class Estimator(BaseEstimator):
 
     if metric_key.MetricKey.LOSS not in model_fn_ops.eval_metric_ops:
       model_fn_ops.eval_metric_ops[metric_key.MetricKey.LOSS] = (
-          metrics_lib.streaming_mean(model_fn_ops.loss))
+          metrics_lib.mean(model_fn_ops.loss))
     return model_fn_ops
 
   def _get_predict_ops(self, features):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
index 248c6c733ffca351c848ba07110ba89928634a23..9d7c1a099aa4be64ca0296fa5b870597dabec7b4 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_input_test.py
@@ -23,7 +23,7 @@ import tempfile
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
 from tensorflow.contrib.learn.python.learn import models
@@ -114,7 +114,7 @@ def linear_model_params_fn(features, labels, mode, params):
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
       loss,
-      variables.get_global_step(),
+      training_util.get_global_step(),
       optimizer='Adagrad',
       learning_rate=params['learning_rate'])
   return prediction, loss, train_op
@@ -129,7 +129,7 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -139,7 +139,7 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -150,7 +150,7 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index be2b0cb3ca959323b4de095ca072278f028be301..2a13a84627df35a68a4f04b25ab26ceecad0db0d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -32,7 +32,7 @@ from google.protobuf import text_format
 
 from tensorflow.contrib import learn
 from tensorflow.contrib import lookup
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import experiment
@@ -132,7 +132,7 @@ def linear_model_params_fn(features, labels, mode, params):
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
       loss,
-      variables.get_global_step(),
+      training_util.get_global_step(),
       optimizer='Adagrad',
       learning_rate=params['learning_rate'])
   return prediction, loss, train_op
@@ -147,7 +147,7 @@ def linear_model_fn(features, labels, mode):
     (_, features), = features.items()
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return prediction, loss, train_op
 
 
@@ -157,7 +157,7 @@ def linear_model_fn_with_model_fn_ops(features, labels, mode):
                   model_fn.ModeKeys.INFER)
   prediction, loss = (models.linear_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return model_fn.ModelFnOps(
       mode=mode, predictions=prediction, loss=loss, train_op=train_op)
 
@@ -168,7 +168,7 @@ def logistic_model_no_mode_fn(features, labels):
   labels = array_ops.one_hot(labels, 3, 1, 0)
   prediction, loss = (models.logistic_regression_zero_init(features, labels))
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return {
       'class': math_ops.argmax(prediction, 1),
       'prob': prediction
@@ -241,7 +241,7 @@ def _build_estimator_for_resource_export_test():
     const = constant_op.constant(-1, dtype=dtypes.int64)
     table = lookup.MutableHashTable(
         dtypes.string, dtypes.int64, const, name='LookupTableModel')
-    update_global_step = variables.get_global_step().assign_add(1)
+    update_global_step = training_util.get_global_step().assign_add(1)
     if mode in (model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL):
       key = constant_op.constant(['key'])
       value = constant_op.constant([42], dtype=dtypes.int64)
@@ -306,7 +306,7 @@ def _model_fn_ops(
         mode=mode,
         predictions=constant_op.constant(0.),
         loss=constant_op.constant(0.),
-        train_op=variables.get_global_step().assign_add(1))
+        train_op=training_util.get_global_step().assign_add(1))
 
 
 def _make_input_fn(features, labels):
@@ -389,7 +389,7 @@ class EstimatorModelFnTest(test.TestCase):
       self.assertEqual(expected_param, params)
       self.assertEqual(model_dir, expected_model_dir)
       return (constant_op.constant(0.), constant_op.constant(0.),
-              variables.get_global_step().assign_add(1))
+              training_util.get_global_step().assign_add(1))
     est = estimator.Estimator(model_fn=_argument_checker,
                               params=expected_param,
                               model_dir=expected_model_dir)
@@ -400,7 +400,7 @@ class EstimatorModelFnTest(test.TestCase):
     def _invalid_model_fn(features, labels):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         loss = 100.0 - w
       return None, loss, None
@@ -415,7 +415,7 @@ class EstimatorModelFnTest(test.TestCase):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
       loss = 100.0 - w
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         train_op = w.assign_add(loss / 100.0)
       predictions = loss
@@ -434,7 +434,7 @@ class EstimatorModelFnTest(test.TestCase):
       # pylint: disable=unused-argument
       w = variables_lib.Variable(42.0, 'weight')
       loss = 100.0 - w
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       with ops.control_dependencies([update_global_step]):
         train_op = w.assign_add(loss / 100.0)
       return None, loss, train_op
@@ -464,7 +464,7 @@ class EstimatorModelFnTest(test.TestCase):
           mode=mode,
           predictions=constant_op.constant(0.),
           loss=constant_op.constant(0.),
-          train_op=variables.get_global_step().assign_add(1),
+          train_op=training_util.get_global_step().assign_add(1),
           scaffold=monitored_session.Scaffold(init_fn=_init_fn))
 
     est = estimator.Estimator(model_fn=_model_fn_scaffold)
@@ -483,7 +483,7 @@ class EstimatorModelFnTest(test.TestCase):
           mode=mode,
           predictions=constant_op.constant([[1.]]),
           loss=constant_op.constant(0.),
-          train_op=variables.get_global_step().assign_add(1),
+          train_op=training_util.get_global_step().assign_add(1),
           scaffold=monitored_session.Scaffold(saver=self.mock_saver))
 
     def input_fn():
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
index 1d89dfb55b10b032cab7dcf434d396404d4eb83b..8131e0fde6fea5501cacc4714f53ed8d867ca70f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimators_test.py
@@ -22,7 +22,7 @@ import random
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python import learn
 from tensorflow.contrib.learn.python.learn import datasets
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -62,7 +62,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["transformed_x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
@@ -100,7 +100,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator = estimator_lib.Estimator(
@@ -139,7 +139,7 @@ class FeatureEngineeringFunctionTest(test.TestCase):
       _ = labels
       predictions = features["x"]
       loss = constant_op.constant([2.])
-      update_global_step = variables.get_global_step().assign_add(1)
+      update_global_step = training_util.get_global_step().assign_add(1)
       return predictions, loss, update_global_step
 
     estimator_with_fe_fn = estimator_lib.Estimator(
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 468d792a0dccf5cf046a41ed8e1600940a15ac37..bc0e6fc0091c9b5419ab526855b404eb4a927e97 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -119,7 +119,7 @@ class Head(object):
       update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
                                                   loss=model_fn_ops.loss, ...)
       hooks = [sync.make_session_run_hook(is_chief)]
-      ... upate train_op and hooks in ModelFnOps and return
+      ... update train_op and hooks in ModelFnOps and return
     ```
   """
   __metaclass__ = abc.ABCMeta
diff --git a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
index 992b804f59ecd88fedc2fba10d3079f93c4fe83d..8f9d6fc318a357853bdb8e3264f6691b410006b1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/kmeans.py
@@ -28,7 +28,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib.factorization.python.ops import clustering_ops
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators.model_fn import ModelFnOps
 from tensorflow.python.framework import ops
@@ -128,7 +128,7 @@ def _kmeans_clustering_model_fn(features, labels, mode, params, config):
        random_seed=params.get('random_seed'),
        kmeans_plus_plus_num_retries=params.get(
            'kmeans_plus_plus_num_retries')).training_graph()
-  incr_step = state_ops.assign_add(variables.get_global_step(), 1)
+  incr_step = state_ops.assign_add(training_util.get_global_step(), 1)
   loss = math_ops.reduce_sum(losses, name=KMeansClustering.LOSS_OP_NAME)
   summary.scalar('loss/raw', loss)
   training_op = with_dependencies([training_op, incr_step], loss)
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index f5445ad4e728dbd3904279573771de9454b5d17c..37aa8b339622415d082933cdf66d2472a4119b48 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -26,7 +26,7 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
@@ -170,7 +170,7 @@ def _linear_model_fn(features, labels, mode, params, config=None):
           weight_collections=[parent_scope])
 
     def _train_op_fn(loss):
-      global_step = contrib_variables.get_global_step()
+      global_step = training_util.get_global_step()
       my_vars = ops.get_collection(parent_scope)
       grads = gradients.gradients(loss, my_vars)
       if gradient_clip_norm:
@@ -252,7 +252,7 @@ def sdca_model_fn(features, labels, mode, params):
     _add_bias_column(feature_columns, features, bias, columns_to_variables)
 
   def _train_op_fn(unused_loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     sdca_model, train_op = optimizer.get_train_step(columns_to_variables,
                                                     weight_column_name,
                                                     loss_type, features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
index 93c62f87e8495f299a8c456574c7b40534186304..656d68b76888d9319c0b9be481f9b0478ac4314c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/logistic_regressor_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.datasets import base
 from tensorflow.contrib.learn.python.learn.estimators import logistic_regressor
@@ -57,7 +57,7 @@ def _logistic_regression_model_fn(features, labels, mode):
   predictions = math_ops.sigmoid(logits)
   loss = losses.sigmoid_cross_entropy(labels, logits)
   train_op = optimizers.optimize_loss(
-      loss, variables.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
+      loss, training_util.get_global_step(), optimizer='Adagrad', learning_rate=0.1)
   return predictions, loss, train_op
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
index 8be9c72adf1602826fabc650f350b57f72c886be..44e6c7c52dac524a22e9099e33e2aef82f8fe7ba 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py
@@ -23,7 +23,6 @@ import collections
 
 import six
 
-from tensorflow.contrib import framework as contrib_framework
 from tensorflow.contrib.framework import get_graph_from_inputs
 from tensorflow.contrib.learn.python.learn.estimators import constants
 from tensorflow.contrib.learn.python.learn.estimators import metric_key
@@ -32,6 +31,7 @@ from tensorflow.python.estimator import model_fn as core_model_fn_lib
 from tensorflow.python.estimator.export import export_output as core_export_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
@@ -156,11 +156,11 @@ class ModelFnOps(
     else:
       if isinstance(predictions, dict):
         predictions = {
-            k: contrib_framework.convert_to_tensor_or_sparse_tensor(v)
+            k: sparse_tensor.convert_to_tensor_or_sparse_tensor(v)
             for k, v in six.iteritems(predictions)
         }
       else:
-        predictions = contrib_framework.convert_to_tensor_or_sparse_tensor(
+        predictions = sparse_tensor.convert_to_tensor_or_sparse_tensor(
             predictions)
 
     # Validate eval_metric_ops
diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 307db76afe20a7743df16d169270a6f319497eb6..fc4bd1f461d7bfbfcfb78201d527959055342f0a 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -140,7 +140,8 @@ class Experiment(object):
                delay_workers_by_global_step=False,
                export_strategies=None,
                train_steps_per_iteration=None,
-               checkpoint_and_export=False):
+               checkpoint_and_export=False,
+               saving_listeners=None):
     """Constructor for `Experiment`.
 
     Creates an Experiment instance. None of the functions passed to this
@@ -200,6 +201,9 @@ class Experiment(object):
         `save_checkpoints_steps`. Also, this parameter leads to the creation of
         a default `CheckpointSaverHook` instead of a `ValidationMonitor`, so the
         provided `train_monitors` will need to be adjusted accordingly.
+      saving_listeners: list of `CheckpointSaverListener` objects. Used by
+        tf.estimator.Estimator for callbacks that run immediately before or
+        after checkpoint savings.
 
     Raises:
       ValueError: if `estimator` does not implement Estimator interface,
@@ -221,6 +225,9 @@ class Experiment(object):
         raise ValueError(
             "`estimator` must implement `tf.contrib.learn.Trainable`"
             "or `tf.estimator.`Estimator`.")
+      if saving_listeners is not None:
+        raise ValueError("`saving_listeners` must be `None` with "
+                         "`tf.contrib.learn.Estimator`.")
 
     if isinstance(estimator, tpu_estimator.TPUEstimator):
       logging.warn(
@@ -242,6 +249,7 @@ class Experiment(object):
     self._eval_delay_secs = eval_delay_secs
     self._continuous_eval_throttle_secs = continuous_eval_throttle_secs
     self._checkpoint_and_export = checkpoint_and_export
+    self._saving_listeners = saving_listeners
     # Using 1 on a non-cached file system requires a lot of overhead to
     # read the checkpoint state file. This is particular bad on GCS, so
     # we use a different default. This is a temporary band-aid, to be
@@ -362,9 +370,11 @@ class Experiment(object):
       logging.info("Waiting %d secs before starting training.", remaining)
       time.sleep(delay_secs)
 
-    return self._call_train(input_fn=self._train_input_fn,
-                            max_steps=self._train_steps,
-                            hooks=self._train_monitors + extra_hooks)
+    return self._call_train(
+        input_fn=self._train_input_fn,
+        max_steps=self._train_steps,
+        hooks=self._train_monitors + extra_hooks,
+        saving_listeners=self._saving_listeners)
 
   def evaluate(self, delay_secs=None, name=None):
     """Evaluate on the evaluation data.
@@ -712,9 +722,11 @@ class Experiment(object):
         break
 
       logging.info("Training model for %s steps", train_steps_per_iteration)
-      self._call_train(input_fn=self._train_input_fn,
-                       steps=train_steps_per_iteration,
-                       hooks=self._train_monitors)
+      self._call_train(
+          input_fn=self._train_input_fn,
+          steps=train_steps_per_iteration,
+          hooks=self._train_monitors,
+          saving_listeners=self._saving_listeners)
 
       logging.info("Evaluating model now.")
       eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
@@ -762,9 +774,11 @@ class Experiment(object):
     Returns:
       The result of the `evaluate` call to the `Estimator`.
     """
-    self._call_train(input_fn=self._train_input_fn,
-                     steps=1,
-                     hooks=self._train_monitors)
+    self._call_train(
+        input_fn=self._train_input_fn,
+        steps=1,
+        hooks=self._train_monitors,
+        saving_listeners=self._saving_listeners)
 
     eval_result = self._call_evaluate(input_fn=self._eval_input_fn,
                                       steps=1,
@@ -792,7 +806,8 @@ class Experiment(object):
     return server
 
   def _call_train(self, _sentinel=None,  # pylint: disable=invalid-name,
-                  input_fn=None, steps=None, hooks=None, max_steps=None):
+                  input_fn=None, steps=None, hooks=None, max_steps=None,
+                  saving_listeners=None):
     if _sentinel is not None:
       raise ValueError("_call_train should be called with keyword args only")
 
@@ -801,10 +816,12 @@ class Experiment(object):
     # safe to convert for both cases.
     hooks = monitors.replace_monitors_with_hooks(hooks, self._estimator)
     if self._core_estimator_used:
-      return self._estimator.train(input_fn=input_fn,
-                                   steps=steps,
-                                   max_steps=max_steps,
-                                   hooks=hooks)
+      return self._estimator.train(
+          input_fn=input_fn,
+          steps=steps,
+          max_steps=max_steps,
+          hooks=hooks,
+          saving_listeners=saving_listeners)
     else:
       return self._estimator.fit(input_fn=input_fn,
                                  steps=steps,
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index fe40d27c445d4f560c96fc9b50ceb0daed30ee93..c29c198d094090a59c8c7dd2949c3f069adf49d0 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -232,14 +232,19 @@ class ExperimentTest(test.TestCase):
 
   def test_train(self):
     for est in self._estimators_for_tests():
-      eval_metrics = 'eval_metrics' if not isinstance(
-          est, core_estimator.Estimator) else None
+      if isinstance(est, core_estimator.Estimator):
+        eval_metrics = None
+        saving_listeners = 'saving_listeners'
+      else:
+        eval_metrics = 'eval_metrics'
+        saving_listeners = None
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
           train_steps='train_steps',
           eval_input_fn='eval_input',
-          eval_metrics=eval_metrics)
+          eval_metrics=eval_metrics,
+          saving_listeners=saving_listeners)
       fit_args = ex.train(delay_secs=0)
       self.assertEqual(1, est.fit_count)
       self.assertIn(('max_steps', 'train_steps'), fit_args)
@@ -675,8 +680,12 @@ class ExperimentTest(test.TestCase):
 
   def test_continuous_train_and_eval(self):
     for est in self._estimators_for_tests(eval_dict={'global_step': 100}):
-      eval_metrics = 'eval_metrics' if not isinstance(
-          est, core_estimator.Estimator) else None
+      if isinstance(est, core_estimator.Estimator):
+        eval_metrics = None
+        saving_listeners = 'saving_listeners'
+      else:
+        eval_metrics = 'eval_metrics'
+        saving_listeners = None
       noop_hook = _NoopHook()
       export_strategy = saved_model_export_utils.make_export_strategy(
           est,
@@ -690,7 +699,8 @@ class ExperimentTest(test.TestCase):
           eval_hooks=[noop_hook],
           train_steps=100,
           eval_steps=100,
-          export_strategies=export_strategy)
+          export_strategies=export_strategy,
+          saving_listeners=saving_listeners)
       ex.continuous_train_and_eval()
       self.assertEqual(1, est.fit_count)
       self.assertEqual(1, est.eval_count)
@@ -742,9 +752,10 @@ class ExperimentTest(test.TestCase):
     ex.continuous_train_and_eval(continuous_eval_predicate_fn=predicate_fn)
     mock_estimator.train.assert_called_once_with(
         input_fn='train_input',
-        steps=int(total_steps/10),
+        steps=int(total_steps / 10),
         max_steps=test.mock.ANY,
-        hooks=test.mock.ANY)
+        hooks=test.mock.ANY,
+        saving_listeners=test.mock.ANY)
 
   def test_continuous_train_and_eval_with_steps_per_iteration_from_user(self):
     mock_estimator = test.mock.Mock(core_estimator.Estimator)
@@ -768,7 +779,8 @@ class ExperimentTest(test.TestCase):
         input_fn='train_input',
         steps=1234,
         max_steps=test.mock.ANY,
-        hooks=test.mock.ANY)
+        hooks=test.mock.ANY,
+        saving_listeners=test.mock.ANY)
 
   def test_continuous_train_and_eval_with_default_steps_per_iteration(self):
     mock_estimator = test.mock.Mock(core_estimator.Estimator)
@@ -791,7 +803,8 @@ class ExperimentTest(test.TestCase):
         input_fn='train_input',
         steps=1000,
         max_steps=test.mock.ANY,
-        hooks=test.mock.ANY)
+        hooks=test.mock.ANY,
+        saving_listeners=test.mock.ANY)
 
   def test_continuous_train_and_eval_with_invalid_predicate_fn(self):
     for est in self._estimators_for_tests():
@@ -857,11 +870,19 @@ class ExperimentTest(test.TestCase):
           est,
           None if isinstance(est, core_estimator.Estimator) else 'export_input',
           exports_to_keep=None)
+      if isinstance(est, core_estimator.Estimator):
+        eval_metrics = None
+        saving_listeners = 'saving_listeners'
+      else:
+        eval_metrics = 'eval_metrics'
+        saving_listeners = None
       ex = experiment.Experiment(
           est,
           train_input_fn='train_input',
           eval_input_fn='eval_input',
-          export_strategies=(exp_strategy,))
+          export_strategies=(exp_strategy,),
+          eval_metrics=eval_metrics,
+          saving_listeners=saving_listeners)
       ex.test()
       self.assertEqual(1, est.fit_count)
       self.assertEqual(1, est.eval_count)
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
index 4c50d40aaa9b3c5d94d0a66d08e8ab6173db427a..86fad4c5535a918d87e0741687cfebe3afaf9ddf 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/data_feeder.py
@@ -28,13 +28,13 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 # pylint: disable=g-multiple-import,g-bad-import-order
 from .pandas_io import HAS_PANDAS, extract_pandas_data, extract_pandas_matrix, extract_pandas_labels
 from .dask_io import HAS_DASK, extract_dask_data, extract_dask_labels
-
 # pylint: enable=g-multiple-import,g-bad-import-order
 
 
@@ -365,8 +365,14 @@ class DataFeeder(object):
     self.random_state = np.random.RandomState(
         42) if random_state is None else random_state
 
-    num_samples = list(self._x.values())[0].shape[
-        0] if x_is_dict else self._x.shape[0]
+    if x_is_dict:
+      num_samples = list(self._x.values())[0].shape[0]
+    elif tensor_util.is_tensor(self._x):
+      num_samples = self._x.shape[
+          0].value  # shape will be a Dimension, extract an int
+    else:
+      num_samples = self._x.shape[0]
+
     if self._shuffle:
       self.indices = self.random_state.permutation(num_samples)
     else:
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
index 4b34fc62849766370979bb2002d42ee03ea7161a..3a46c239688017f9204d2c6182a6f81cd325a417 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -280,14 +281,33 @@ def _get_file_names(file_pattern, randomize_input):
 
 def _get_examples(file_name_queue, reader, num_threads, read_batch_size,
                   filter_fn, parse_fn):
+  """Get example filenames matching.
+
+  Args:
+    file_name_queue: A queue implementation that dequeues elements in
+      first-in first-out order.
+    reader: A function or class that returns an object with
+      `read` method, (filename tensor) -> (example tensor).
+    num_threads: The number of threads enqueuing examples.
+    read_batch_size: An int or scalar `Tensor` specifying the number of
+      records to read at once.
+    filter_fn: Filtering function, takes both keys as well as an `Example`
+      Tensors and returns a boolean mask of the same shape as the input Tensors
+      to be applied for filtering. If `None`, no filtering is done.
+    parse_fn: Parsing function, takes `Example` Tensor returns parsed
+      representation. If `None`, no parsing is done.
+
+  Returns:
+    List of example file names matching `file_name_queue`.
+  """
   with ops.name_scope('read'):
     example_list = []
     for _ in range(num_threads):
-      if read_batch_size > 1:
-        keys, examples_proto = reader().read_up_to(file_name_queue,
-                                                   read_batch_size)
-      else:
-        keys, examples_proto = reader().read(file_name_queue)
+      keys, examples_proto = utils.smart_cond(
+          read_batch_size > 1,
+          lambda: reader().read_up_to(file_name_queue, read_batch_size),
+          lambda: reader().read(file_name_queue))
+
       if filter_fn:
         mask = filter_fn(keys, examples_proto)
         keys = array_ops.boolean_mask(keys, mask)
@@ -379,14 +399,15 @@ def _read_keyed_batch_examples_helper(file_pattern,
             capacity=1, dtypes=[dtypes.string], shapes=[[]])
         enqueue_op = file_name_queue.enqueue(
             input_pipeline_ops.seek_next(
-                file_names, shuffle=randomize_input, num_epochs=num_epochs,
+                file_names,
+                shuffle=randomize_input,
+                num_epochs=num_epochs,
                 seed=seed))
         queue_runner.add_queue_runner(
             queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
       else:
         file_name_queue = input_ops.string_input_producer(
-            constant_op.constant(
-                file_names, name='input'),
+            constant_op.constant(file_names, name='input'),
             shuffle=randomize_input,
             num_epochs=num_epochs,
             name=file_name_queue_scope,
@@ -496,7 +517,8 @@ def read_keyed_batch_features(file_pattern,
   """
 
   with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
-    if read_batch_size is None: read_batch_size = batch_size
+    if read_batch_size is None:
+      read_batch_size = batch_size
     keys, examples = read_keyed_batch_examples(
         file_pattern,
         batch_size,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
index 6f0fd9a2976d37d1c701a96f50c2b987562cb191..e11e8b698adc113486bbb45572c8129e964cc931 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/graph_io_test.py
@@ -204,8 +204,7 @@ class GraphIOTest(test.TestCase):
     shape = (0,)
     features = {
         "feature":
-            parsing_ops.FixedLenFeature(
-                shape=shape, dtype=dtypes_lib.float32)
+            parsing_ops.FixedLenFeature(shape=shape, dtype=dtypes_lib.float32)
     }
 
     with ops.Graph().as_default() as g, self.test_session(graph=g) as sess:
@@ -255,8 +254,8 @@ class GraphIOTest(test.TestCase):
       self.assertAllEqual((None,), inputs.get_shape().as_list())
       self.assertEqual("%s:1" % name, inputs.name)
       file_name_queue_name = "%s/file_name_queue" % name
-      file_name_queue_limit_name = ("%s/limit_epochs/epochs" %
-                                    file_name_queue_name)
+      file_name_queue_limit_name = (
+          "%s/limit_epochs/epochs" % file_name_queue_name)
       file_names_name = "%s/input" % file_name_queue_name
       example_queue_name = "%s/random_shuffle_queue" % name
       op_nodes = test_util.assert_ops_in_graph({
@@ -354,8 +353,8 @@ class GraphIOTest(test.TestCase):
     json_lines = [
         "".join([
             '{"features": { "feature": { "sequence": {',
-            '"bytes_list": { "value": ["', base64.b64encode(l).decode("ascii"),
-            '"]}}}}}\n'
+            '"bytes_list": { "value": ["',
+            base64.b64encode(l).decode("ascii"), '"]}}}}}\n'
         ]) for l in lines
     ]
     return self._create_temp_file("".join(json_lines))
@@ -823,6 +822,31 @@ class GraphIOTest(test.TestCase):
       coord.request_stop()
       coord.join(threads)
 
+  def test_read_keyed_batch_features_shared_queue(self):
+    batch_size = 17
+    shape = (0,)
+    fixed_feature = parsing_ops.FixedLenFeature(
+        shape=shape, dtype=dtypes_lib.float32)
+    feature = {"feature": fixed_feature}
+    reader = io_ops.TFRecordReader
+
+    _, queued_feature = graph_io.read_keyed_batch_features_shared_queue(
+        _VALID_FILE_PATTERN, batch_size, feature, reader)
+
+    with ops.Graph().as_default() as g, self.test_session(graph=g) as session:
+      features_result = graph_io.read_batch_features(
+          _VALID_FILE_PATTERN, batch_size, feature, reader)
+      session.run(variables.local_variables_initializer())
+
+    self.assertAllEqual(
+        queued_feature.get("feature").get_shape().as_list(),
+        features_result.get("feature").get_shape().as_list())
+
+  def test_get_file_names_errors(self):
+    # Raise bad file_pattern.
+    with self.assertRaises(ValueError):
+      graph_io._get_file_names([], True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/learn/python/learn/metric_spec.py b/tensorflow/contrib/learn/python/learn/metric_spec.py
index ed6683abedbb8ae76ba364405158eb52cbb6d762..6440bc204b8e339ff51311dcc87b36f556b94092 100644
--- a/tensorflow/contrib/learn/python/learn/metric_spec.py
+++ b/tensorflow/contrib/learn/python/learn/metric_spec.py
@@ -42,10 +42,8 @@ def _args(fn):
   """
   if hasattr(fn, 'func') and hasattr(fn, 'keywords'):
     # Handle functools.partial and similar objects.
-    return tuple([
-        arg for arg in tf_inspect.getargspec(fn.func).args
-        if arg not in set(fn.keywords.keys())
-    ])
+    return tuple(
+        [arg for arg in _args(fn.func) if arg not in set(fn.keywords.keys())])
   # Handle function.
   return tuple(tf_inspect.getargspec(fn).args)
 
diff --git a/tensorflow/contrib/learn/python/learn/utils/export.py b/tensorflow/contrib/learn/python/learn/utils/export.py
index 6af2287761299f6725f9547917101c18b0cc0164..cb34cb1d26b6812c7f3f39e9f965615de5a8ef07 100644
--- a/tensorflow/contrib/learn/python/learn/utils/export.py
+++ b/tensorflow/contrib/learn/python/learn/utils/export.py
@@ -20,7 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework import deprecated
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.session_bundle import exporter
 from tensorflow.contrib.session_bundle import gc
 from tensorflow.python.client import session as tf_session
@@ -78,7 +78,7 @@ def _export_graph(graph, saver, checkpoint_path, export_dir,
           default_graph_signature=default_graph_signature,
           named_graph_signatures=named_graph_signatures,
           assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))
-      return export.export(export_dir, contrib_variables.get_global_step(),
+      return export.export(export_dir, training_util.get_global_step(),
                            session, exports_to_keep=exports_to_keep)
 
 
@@ -295,7 +295,7 @@ def _export_estimator(estimator,
   checkpoint_path = (checkpoint_path or
                      tf_saver.latest_checkpoint(estimator._model_dir))
   with ops.Graph().as_default() as g:
-    contrib_variables.create_global_step(g)
+    training_util.create_global_step(g)
 
     if use_deprecated_input_fn:
       examples = array_ops.placeholder(dtype=dtypes.string,
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
index 49413092a6bae547ddd2cad272b1abb3af1de046..6ffd2a133995a6ff8b35540221fb5676bf5de19f 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py
@@ -33,6 +33,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import tempfile
 import time
 
 from tensorflow.contrib.layers.python.layers import feature_column
@@ -644,18 +645,22 @@ def make_best_model_export_strategy(serving_input_fn,
 
 # TODO(b/67013778): Revisit this approach when corresponding changes to
 # TF Core are finalized.
-def extend_export_strategy(base_export_strategy, post_export_fn,
-                           post_export_name):
+def extend_export_strategy(base_export_strategy,
+                           post_export_fn,
+                           post_export_name=None):
   """Extend ExportStrategy, calling post_export_fn after export.
 
   Args:
     base_export_strategy: An ExportStrategy that can be passed to the Experiment
       constructor.
     post_export_fn: A user-specified function to call after exporting the
-      SavedModel. Takes the export directory as an argument, and returns
-      a string path to a (potentially different) SavedModel.
+      SavedModel. Takes two arguments - the path to the SavedModel exported by
+      base_export_strategy and the directory where to export the SavedModel
+      modified by the post_export_fn. Returns the path to the exported
+      SavedModel.
     post_export_name: The directory name under the export base directory where
-      SavedModels generated by the post_export_fn will be written.
+      SavedModels generated by the post_export_fn will be written. If None, the
+      directory name of base_export_strategy is used.
 
   Returns:
     An ExportStrategy that can be passed to the Experiment constructor.
@@ -675,12 +680,24 @@ def extend_export_strategy(base_export_strategy, post_export_fn,
 
     Raises:
       ValueError: If `estimator` is a ${tf.estimator.Estimator} instance
-        and `default_output_alternative_key` was specified.
+        and `default_output_alternative_key` was specified or if post_export_fn
+        does not return a valid directory.
     """
-    export_dir = base_export_strategy.export(estimator, export_dir_base,
-                                             checkpoint_path)
-    if post_export_fn:
-      export_dir = post_export_fn(export_dir)
-    return export_dir
-
-  return export_strategy.ExportStrategy(post_export_name, export_fn)
+    tmp_base_export_dir = tempfile.mkdtemp()
+    tmp_base_export = base_export_strategy.export(
+        estimator, tmp_base_export_dir, checkpoint_path)
+    tmp_post_export_dir = tempfile.mkdtemp()
+    tmp_post_export = post_export_fn(tmp_base_export, tmp_post_export_dir)
+
+    if not tmp_post_export.startswith(tmp_post_export_dir):
+      raise ValueError('post_export_fn must return a sub-directory of {}'
+                       .format(tmp_post_export_dir))
+    export_relpath = os.path.relpath(tmp_post_export, tmp_post_export_dir)
+
+    gfile.Rename(
+        os.path.join(tmp_post_export_dir, export_relpath),
+        os.path.join(export_dir_base, export_relpath))
+    return os.path.join(export_dir_base, export_relpath)
+
+  name = post_export_name if post_export_name else base_export_strategy.name
+  return export_strategy.ExportStrategy(name, export_fn)
diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
index 27f17b54221ea442baafb382aa3fb034d1bb82e6..ec3a88003f01b3b62591c13472029601b11ba491 100644
--- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils_test.py
@@ -743,12 +743,19 @@ class SavedModelExportUtilsTest(test.TestCase):
                              None)
 
   def test_extend_export_strategy(self):
-    def _base_export_fn(unused_estimator, export_dir_base,
+
+    def _base_export_fn(unused_estimator,
+                        export_dir_base,
                         unused_checkpoint_path=None):
-      return export_dir_base + "/e1"
+      base_path = os.path.join(export_dir_base, "e1")
+      gfile.MkDir(base_path)
+      return base_path
 
-    def _post_export_fn(orig_path):
-      return orig_path + "/rewrite"
+    def _post_export_fn(orig_path, new_path):
+      assert orig_path.endswith("/e1")
+      post_export_path = os.path.join(new_path, "rewrite")
+      gfile.MkDir(post_export_path)
+      return post_export_path
 
     base_export_strategy = export_strategy_lib.ExportStrategy(
         "Servo", _base_export_fn)
@@ -758,9 +765,67 @@ class SavedModelExportUtilsTest(test.TestCase):
     self.assertEqual(final_export_strategy.name, "Servo2")
 
     test_estimator = TestEstimator()
-    final_path = final_export_strategy.export(test_estimator, "/path/to/orig",
-                                              "/path/to/checkpoint")
-    self.assertEqual("/path/to/orig/e1/rewrite", final_path)
+    tmpdir = tempfile.mkdtemp()
+    final_path = final_export_strategy.export(test_estimator, tmpdir,
+                                              os.path.join(
+                                                  tmpdir, "checkpoint"))
+    self.assertEqual(os.path.join(tmpdir, "rewrite"), final_path)
+
+  def test_extend_export_strategy_same_name(self):
+
+    def _base_export_fn(unused_estimator,
+                        export_dir_base,
+                        unused_checkpoint_path=None):
+      base_path = os.path.join(export_dir_base, "e1")
+      gfile.MkDir(base_path)
+      return base_path
+
+    def _post_export_fn(orig_path, new_path):
+      assert orig_path.endswith("/e1")
+      post_export_path = os.path.join(new_path, "rewrite")
+      gfile.MkDir(post_export_path)
+      return post_export_path
+
+    base_export_strategy = export_strategy_lib.ExportStrategy(
+        "Servo", _base_export_fn)
+
+    final_export_strategy = saved_model_export_utils.extend_export_strategy(
+        base_export_strategy, _post_export_fn)
+    self.assertEqual(final_export_strategy.name, "Servo")
+
+    test_estimator = TestEstimator()
+    tmpdir = tempfile.mkdtemp()
+    final_path = final_export_strategy.export(test_estimator, tmpdir,
+                                              os.path.join(
+                                                  tmpdir, "checkpoint"))
+    self.assertEqual(os.path.join(tmpdir, "rewrite"), final_path)
+
+  def test_extend_export_strategy_raises_error(self):
+
+    def _base_export_fn(unused_estimator,
+                        export_dir_base,
+                        unused_checkpoint_path=None):
+      base_path = os.path.join(export_dir_base, "e1")
+      gfile.MkDir(base_path)
+      return base_path
+
+    def _post_export_fn(unused_orig_path, unused_new_path):
+      return tempfile.mkdtemp()
+
+    base_export_strategy = export_strategy_lib.ExportStrategy(
+        "Servo", _base_export_fn)
+
+    final_export_strategy = saved_model_export_utils.extend_export_strategy(
+        base_export_strategy, _post_export_fn)
+
+    test_estimator = TestEstimator()
+    tmpdir = tempfile.mkdtemp()
+    with self.assertRaises(ValueError) as ve:
+      final_export_strategy.export(test_estimator, tmpdir,
+                                   os.path.join(tmpdir, "checkpoint"))
+
+    self.assertTrue(
+        "post_export_fn must return a sub-directory" in str(ve.exception))
 
 
 def _create_test_export_dir(export_dir_base):
diff --git a/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
index 8313aa355d6d40596b40c39f28b64f46c1bb5719..5e7b422e3cc368a22eb94ed470297ae78293c4eb 100644
--- a/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
+++ b/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py
@@ -76,7 +76,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
 # TODO(ebrevdo): Remove once _linear is fully deprecated.
-Linear = rnn_cell_impl._Linear  # pylint: disable=protected-access,invalid-name
+Linear = core_rnn_cell._Linear  # pylint: disable=protected-access,invalid-name
 
 
 def _extract_argmax_and_embed(embedding,
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 13f2f0f5021ea4dd339b671e20cb718f4db509f9..7526f3ae0dbdb3d6827e9d7f690090b8438e4f6e 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -238,10 +238,10 @@ class SdcaModel(object):
     with name_scope('sdca/prediction'):
       sparse_variables = self._convert_n_to_tensor(self._variables[
           'sparse_features_weights'])
-      result = 0.0
+      result_sparse = 0.0
       for sfc, sv in zip(examples['sparse_features'], sparse_variables):
         # TODO(sibyl-Aix6ihai): following does not take care of missing features.
-        result += math_ops.segment_sum(
+        result_sparse += math_ops.segment_sum(
             math_ops.multiply(
                 array_ops.gather(sv, sfc.feature_indices), sfc.feature_values),
             sfc.example_indices)
@@ -249,12 +249,14 @@ class SdcaModel(object):
       dense_variables = self._convert_n_to_tensor(self._variables[
           'dense_features_weights'])
 
+      result_dense = 0.0
       for i in range(len(dense_variables)):
-        result += math_ops.matmul(dense_features[i],
-                                  array_ops.expand_dims(dense_variables[i], -1))
+        result_dense += math_ops.matmul(dense_features[i],
+                                        array_ops.expand_dims(
+                                            dense_variables[i], -1))
 
     # Reshaping to allow shape inference at graph construction time.
-    return array_ops.reshape(result, [-1])
+    return array_ops.reshape(result_dense, [-1]) + result_sparse
 
   def predictions(self, examples):
     """Add operations to compute predictions by the model.
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
index 701fc1c0597d1de0b0189e86feafbd1c5bbdc818..05794a42c5f2d0eece6adab36fb5610078cece31 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib import layers
-from tensorflow.contrib.framework.python.ops import variables as contrib_variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
@@ -154,7 +154,7 @@ def sdca_model_fn(features, labels, mode, params, config=None):
     _add_bias_column(feature_columns, features, bias, columns_to_variables)
 
   def _train_op_fn(unused_loss):
-    global_step = contrib_variables.get_global_step()
+    global_step = training_util.get_global_step()
     sdca_model, train_op = optimizer.get_train_step(
         columns_to_variables, weight_column_name, loss_type, features, labels,
         global_step)
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3f1b0be1a73a3ff1da3452f4ee1a9125f9e26178
--- /dev/null
+++ b/tensorflow/contrib/lite/BUILD
@@ -0,0 +1,204 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+
+exports_files(glob([
+    "testdata/*.bin",
+    "models/testdata/*",
+]))
+
+config_setting(
+    name = "mips",
+    values = {
+        "cpu": "mips",
+    },
+)
+
+config_setting(
+    name = "mips64",
+    values = {
+        "cpu": "mips64",
+    },
+)
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "schema_fbs_version",
+    hdrs = ["version.h"],
+)
+
+# Main library. No ops are included here.
+# TODO(aselle): Resolve problems preventing C99 usage.
+cc_library(
+    name = "context",
+    srcs = ["context.c"],
+    hdrs = ["context.h"],
+)
+
+cc_library(
+    name = "builtin_op_data",
+    hdrs = [
+        "builtin_op_data.h",
+    ],
+)
+
+cc_library(
+    name = "string",
+    hdrs = [
+        "string.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib_platform",
+    ],
+)
+
+# TODO(ahentz): investigate dependency on gemm_support requiring usage of tf_copts.
+cc_library(
+    name = "framework",
+    srcs = [
+        "allocation.cc",
+        "error_reporter.cc",
+        "interpreter.cc",
+        "model.cc",
+        "nnapi_delegate.cc",
+        "optional_debug_tools.cc",
+        "simple_memory_arena.cc",
+    ],
+    hdrs = [
+        "allocation.h",
+        "context.h",
+        "error_reporter.h",
+        "interpreter.h",
+        "model.h",
+        "nnapi_delegate.h",
+        "optional_debug_tools.h",
+        "simple_memory_arena.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":builtin_op_data",
+        ":context",
+        ":schema_fbs_version",
+        "//tensorflow/contrib/lite/kernels:gemm_support",
+        "//tensorflow/contrib/lite/nnapi:nnapi_lib",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/core:lib_platform",
+    ],
+)
+
+cc_library(
+    name = "string_util",
+    srcs = ["string_util.cc"],
+    hdrs = ["string_util.h"],
+    deps = [
+        ":framework",
+        ":string",
+    ],
+)
+
+cc_test(
+    name = "string_util_test",
+    size = "small",
+    srcs = ["string_util_test.cc"],
+    deps = [
+        ":framework",
+        ":string_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test main interpreter
+cc_test(
+    name = "interpreter_test",
+    size = "small",
+    srcs = ["interpreter_test.cc"],
+    deps = [
+        ":framework",
+        ":string_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test arena allocator
+cc_test(
+    name = "simple_memory_arena_test",
+    size = "small",
+    srcs = ["simple_memory_arena_test.cc"],
+    deps = [
+        ":framework",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test model framework.
+cc_test(
+    name = "model_test",
+    size = "small",
+    srcs = ["model_test.cc"],
+    data = [
+        "testdata/0_subgraphs.bin",
+        "testdata/2_subgraphs.bin",
+        "testdata/empty_model.bin",
+        "testdata/test_model.bin",
+        "testdata/test_model_broken.bin",
+    ],
+    deps = [
+        ":framework",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test the C extension API code.
+cc_test(
+    name = "context_test",
+    size = "small",
+    srcs = ["context_test.cc"],
+    deps = [
+        ":framework",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Test the serialization of a model with optional tensors.
+
+# Model tests
+
+cc_library(
+    name = "models_test_utils",
+    testonly = 1,
+    hdrs = ["models/test_utils.h"],
+    deps = select({
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            "@com_google_absl//absl/strings",
+            "//tensorflow/core:test",
+        ],
+    }),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "downloads",
+            "examples",
+            "gen",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..78402727abdd2742ffff54bf59ca076d8b97b042
--- /dev/null
+++ b/tensorflow/contrib/lite/Makefile
@@ -0,0 +1,147 @@
+
+# Find where we're running from, so we can store generated files here.
+ifeq ($(origin MAKEFILE_DIR), undefined)
+	MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+endif
+
+# Try to figure out the host system
+HOST_OS :=
+ifeq ($(OS),Windows_NT)
+	HOST_OS = WINDOWS
+else
+	UNAME_S := $(shell uname -s)
+	ifeq ($(UNAME_S),Linux)
+	        HOST_OS := LINUX
+	endif
+	ifeq ($(UNAME_S),Darwin)
+		HOST_OS := OSX
+	endif
+endif
+
+ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
+
+# Where compiled objects are stored.
+OBJDIR := $(MAKEFILE_DIR)/gen/obj/
+BINDIR := $(MAKEFILE_DIR)/gen/bin/
+LIBDIR := $(MAKEFILE_DIR)/gen/lib/
+GENDIR := $(MAKEFILE_DIR)/gen/obj/
+
+# Settings for the host compiler.
+CXX := $(CC_PREFIX) gcc
+CXXFLAGS := --std=c++11 -O3 -DNDEBUG
+CC := $(CC_PREFIX) gcc
+CFLAGS :=
+LDOPTS :=
+LDOPTS += -L/usr/local/lib
+ARFLAGS := -r
+
+INCLUDES := \
+-I. \
+-I$(MAKEFILE_DIR)/../../../ \
+-I$(MAKEFILE_DIR)/downloads/ \
+-I$(MAKEFILE_DIR)/downloads/eigen \
+-I$(MAKEFILE_DIR)/downloads/gemmlowp \
+-I$(MAKEFILE_DIR)/downloads/neon_2_sse \
+-I$(MAKEFILE_DIR)/downloads/farmhash/src \
+-I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
+-I$(GENDIR)
+# This is at the end so any globally-installed frameworks like protobuf don't
+# override local versions in the source tree.
+INCLUDES += -I/usr/local/include
+
+LIBS := \
+-lstdc++ \
+-lpthread \
+-lm \
+-lz
+
+# If we're on Linux, also link in the dl library.
+ifeq ($(OS),LINUX)
+	LIBS += -ldl -lpthread
+endif
+
+include $(MAKEFILE_DIR)/ios_makefile.inc
+
+# This library is the main target for this makefile. It will contain a minimal
+# runtime that can be linked in to other programs.
+LIB_NAME := libtensorflow-lite.a
+LIB_PATH := $(LIBDIR)$(LIB_NAME)
+
+# A small example program that shows how to link against the library.
+BENCHMARK_PATH := $(BINDIR)benchmark_model
+
+BENCHMARK_SRCS := \
+tensorflow/contrib/lite/tools/benchmark_model.cc
+BENCHMARK_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(BENCHMARK_SRCS))))
+
+# What sources we want to compile, must be kept in sync with the main Bazel
+# build files.
+
+CORE_CC_ALL_SRCS := \
+$(wildcard tensorflow/contrib/lite/*.cc) \
+$(wildcard tensorflow/contrib/lite/kernels/*.cc) \
+$(wildcard tensorflow/contrib/lite/kernels/internal/*.cc) \
+$(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.cc) \
+$(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.cc) \
+$(wildcard tensorflow/contrib/lite/*.c) \
+$(wildcard tensorflow/contrib/lite/kernels/*.c) \
+$(wildcard tensorflow/contrib/lite/kernels/internal/*.c) \
+$(wildcard tensorflow/contrib/lite/kernels/internal/optimized/*.c) \
+$(wildcard tensorflow/contrib/lite/kernels/internal/reference/*.c) \
+$(wildcard tensorflow/contrib/lite/downloads/farmhash/src/farmhash.cc)
+# Remove any duplicates.
+CORE_CC_ALL_SRCS := $(sort $(CORE_CC_ALL_SRCS))
+CORE_CC_EXCLUDE_SRCS := \
+$(wildcard tensorflow/contrib/lite/*test.cc) \
+$(wildcard tensorflow/contrib/lite/*/*test.cc) \
+$(wildcard tensorflow/contrib/lite/*/*/*test.cc) \
+$(wildcard tensorflow/contrib/lite/*/*/*/*test.cc) \
+$(wildcard tensorflow/contrib/lite/kernels/test_util.cc) \
+$(BENCHMARK_SRCS)
+# Filter out all the excluded files.
+TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
+# File names of the intermediate files target compilation generates.
+TF_LITE_CC_OBJS := $(addprefix $(OBJDIR), \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(TF_LITE_CC_SRCS))))
+LIB_OBJS := $(TF_LITE_CC_OBJS)
+
+# For normal manually-created TensorFlow C++ source files.
+$(OBJDIR)%.o: %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# For normal manually-created TensorFlow C++ source files.
+$(OBJDIR)%.o: %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+# The target that's compiled if there's no command-line arguments.
+all: $(LIB_PATH) $(BENCHMARK_PATH)
+
+# Gathers together all the objects we've compiled into a single '.a' archive.
+$(LIB_PATH): $(LIB_OBJS)
+	@mkdir -p $(dir $@)
+	$(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS)
+
+$(BENCHMARK_PATH): $(BENCHMARK_OBJS) $(LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(BENCHMARK_PATH) $(BENCHMARK_OBJS) \
+	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+
+# Gets rid of all generated files.
+clean:
+	rm -rf $(MAKEFILE_DIR)/gen
+
+# Gets rid of target files only, leaving the host alone. Also leaves the lib
+# directory untouched deliberately, so we can persist multiple architectures
+# across builds for iOS and Android.
+cleantarget:
+	rm -rf $(OBJDIR)
+	rm -rf $(BINDIR)
+
+$(DEPDIR)/%.d: ;
+.PRECIOUS: $(DEPDIR)/%.d
+
+-include $(patsubst %,$(DEPDIR)/%.d,$(basename $(TF_CC_SRCS)))
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2fb40070cb25df16d32569ca764c181bf6333506
--- /dev/null
+++ b/tensorflow/contrib/lite/README.md
@@ -0,0 +1,222 @@
+# TensorFlow Lite
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
+
+TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
+
+![image](g3doc/TFLite-Architecture.jpg)
+# Getting Started with an Android Demo App
+
+This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using a quantized Mobilenet model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
+
+There are 3 ways to get the demo app to your device
+ - Download the prebuilt binary or
+ - Use Android Studio to build the application or
+ - Download the source code for TensorFlow Lite and the demo and build it using bazel
+
+## Description
+In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
+
+## Downloading the pre-built binary
+The fastest path to trying the demo, is to download the pre-built binary
+[TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
+
+Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
+
+## Building in Android Studio using TensorFlow Lite AAR from JCenter
+The simplest way to compile the demo app, and try out changes to the project code is to use AndroidStudio.
+
+ - Install the latest version of Android Studio 3 as specified [here](https://developer.android.com/studio/index.html).
+ - Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
+ - Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
+ - Click through installing all the Gradle extensions it requests.
+ - Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
+     - unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
+       `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
+ - Build and run the demo app
+
+## Building TensorFlow Lite and the demo app from source
+
+### Clone the TensorFlow repo
+- git clone
+  [https://github.com/tensorflow/tensorflow](https://github.com/tensorflow/tensorflow)
+
+### Install Bazel
+If bazel is not installed on your system, install it now by following [these directions](https://bazel.build/versions/master/docs/install.html)
+
+NOTE: Bazel does not fully support building Android on Windows yet. Full support for Gradle/CMake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
+
+### Install Android NDK and SDK
+Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and SDK must be installed on your system.
+ - Install the latest version of Bazel as per the instructions on the [Bazel website](https://bazel.build/versions/master/docs/install.html)
+ - The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The current recommended version is 14b, which can be found [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
+ - The Android SDK and build tools may be obtained [here](https://developer.android.com/tools/revisions/build-tools.html), or alternatively as part of [Android Studio](https://developer.android.com/studio/index.html). Build tools API >= 23 is required to build the TF Android demo (though it will run on API >= 21 devices).
+ - In the root of the TensorFlow repository update the `WORKSPACE` file with the `api_level` and location of the SDK and NDK. If you installed it with AndroidStudio the SDK path can be found in the SDK manager, and the default NDK path is:`{SDK path}/ndk-bundle.`
+
+```
+android_sdk_repository (
+    name = "androidsdk",
+    api_level = 23,
+    build_tools_version = "23.0.2",
+    path = "/home/xxxx/android-sdk-linux/",
+)
+
+android_ndk_repository(
+    name = "androidndk",
+    path = "/home/xxxx/android-ndk-r10e/",
+    api_level = 19,
+)
+```
+
+Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+
+### Build the source code
+Run bazel with the following command to build the demo.
+
+Build the demo app:
+
+```
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+```
+
+### Note
+
+Currently, we only support building the Android demo app within a Python 2
+environment (due to a Bazel bug).
+
+### More about the demo
+The demo is resizing each camera image frame to (224 width * 224 height) to match the  quantized Mobilenet model being used. The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch 224 * 224 is the width and height of the image 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. The Mobilenet model has 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The Mobilenet quantized model is bundled within the assets directory of the app.
+
+# iOS Demo App
+
+Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
+
+This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
+
+1.   Follow the Building section [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/ios.md#building) to build the universal iOS library for TensorFlow Lite.
+1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
+1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
+1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
+
+# TensorFlow Lite Quick Start
+
+## Step 1. Decide which GraphDef to use
+ Depending on the use case, the developer may choose to use one of the popular
+ open-sourced models such as InceptionV3 or MobileNets, re-train these models
+ with their own custom data set or even build their own custom model.
+
+### Using a pre-trained model
+
+[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) is a family of mobile-first computer vision models for [TensorFlow](https://www.tensorflow.org/) designed to effectively maximize accuracy while being mindful of the restricted resources for an on-device or embedded application. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as [Inception](https://arxiv.org/pdf/1602.07261.pdf), are used. Google provides 16 pre-trained [ImageNet](http://www.image-net.org/challenges/LSVRC/)  classification checkpoints for MobileNets for use in mobile projects of all sizes.
+
+[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model which achieves fairly high accuracy in recognizing general objects with 1000 classes, like "Zebra", "Dalmatian", and "Dishwasher". The model extracts general features from input images using a convolutional neural network and classifies them based on those features with fully-connected and softmax layers.
+
+[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)  is an on-device model which provides one-touch replies for an incoming text message by suggesting contextually relevant messages. The model is built specifically for memory constrained devices such as watches & phones and it has been successfully used to surface [Smart Replies on Android Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html). Note that this model only works on Android as of now.
+
+These pre-trained models can be downloaded from [here](g3doc/models.md).
+
+### Retrain Inception-V3 or MobileNet for a custom data set
+The above pre-trained models have been trained on the ImageNet data set, which consists of 1000 predefined classes. A model will need to be re-trained if these classes are not relevant or useful for a given use case. This technique is called transfer learning, which starts with a model that has been already trained on a problem and will then be retrained on a similar problem. Deep learning from scratch can take days, but transfer learning can be done fairly quickly. In order to do this, a developer will need to generate their custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) codelab walks through this process step-by-step. The retraining code supports retraining for both floating point and quantized inference.
+
+
+### Train a custom model
+A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow's Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. Please refer to [this document](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for details of supported operators and their usage. This
+set will continue to expand in future releases of Tensorflow Lite.
+
+
+## Step 2. Model format conversion
+
+The model generated in Step 1 is a standard Tensorflow model. After the completion of Step 1 a user should have a standard .pb or .pbtxt GraphDef file. If the application developer is using a pre-trained model (as defined in Step 1 above), they can download a ready to use, already converted model for use from [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/models.md). Models generated using retraining (aka transfer learning) or custom models will need to be converted using the steps mentioned below.
+
+A prerequisite to converting the model to the Tensorflow Lite format is to freeze the graph.
+
+Since we employ several formats, the following definitions may be useful:
+ - GraphDef (.pb) - a protobuf that represents the TensorFlow training and or computation graph. This contains operators, tensors, and variables definitions.
+
+ - CheckPoint (.ckpt) - Serialized variables from a TensorFlow graph. Note, this does not contain the graph structure, so alone it cannot typically be interpreted.
+
+ - FrozenGraphDef - a subclass of GraphDef that contains no variables. A GraphDef can be converted to a frozen graphdef by taking a checkpoint and a graphdef and converting every variable into a constant with the value looked up in the checkpoint.
+
+ - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
+
+ - TensorFlow lite model (.lite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
+
+### Freeze Graph
+To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
+
+The developer should know where the checkpoints folder is present or checkpoints can also be downloaded for a pre-trained model (Example: Here is a link to the [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
+
+Graph freezing can be done using the command below (and modifying the arguments appropriately)
+
+```
+bazel build tensorflow/python/tools:freeze_graph
+
+bazel-bin/tensorflow/python/tools/freeze_graph\
+    --input_graph=/tmp/mobilenet_v1_224.pb \
+    --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
+    --input_binary=true --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
+    --output_node_names=MobileNet/Predictions/Reshape_1
+```
+
+The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
+graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
+
+This frozen Graphdef is now ready to be converted to flatbuffer format (.lite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
+
+Here is a sample command line to convert the frozen Graphdef to '.lite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
+(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
+
+```
+bazel build tensorflow/contrib/lite/toco:toco
+
+bazel-bin/tensorflow/contrib/lite/toco/toco -- \
+  --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+  --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
+  --output_file=/tmp/mobilenet_v1_1.0_224.lite --inference_type=FLOAT \
+  --input_type=FLOAT --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
+```
+
+- The input_file argument should point to the frozen GraphDef file that holds the model architecture.
+- The output_file argument should point to where the TensorFlow Lite model file should be generated.
+- The input_type and inference_type arguments should be set to FLOAT, unless converted a [quantized](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/) model.
+- Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
+
+Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
+documentation [here](https://github.com/tensorflow/tensorflow/tree/mastertensorflow/contrib/lite/python:toco_from_protos target) A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
+
+```
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+out = tf.identity(val, name="out")
+with tf.Session() as sess:
+  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
+  open("converteds_model.tflite", "wb").write(tflite_model)
+
+```
+For detailed instructions on how to use the Tensorflow Optimizing Converter, please see [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
+
+You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
+
+## Step 3. Use the TensorFlow Lite model for inference in a mobile app
+
+After completion of Step 2 the developer should have a .lite model.
+
+### For Android
+Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+
+The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it's a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+
+Note that you'd need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
+
+### For iOS
+Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
+
+## Core ML support
+
+Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b322e027d48f4bf9f90d5b873c449d1ec31cc49
--- /dev/null
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cassert>
+#include <cstdarg>
+#include <cstdint>
+#include <cstring>
+
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/nnapi_delegate.h"
+
+namespace tflite {
+
+MMAPAllocation::MMAPAllocation(const char* filename,
+                               ErrorReporter* error_reporter)
+    : Allocation(error_reporter), mmapped_buffer_(MAP_FAILED) {
+  mmap_fd_ = open(filename, O_RDONLY);
+  if (mmap_fd_ == -1) {
+    error_reporter_->Report("Could not open '%s'.", filename);
+    return;
+  }
+  struct stat sb;
+  fstat(mmap_fd_, &sb);
+  buffer_size_bytes_ = sb.st_size;
+  mmapped_buffer_ =
+      mmap(nullptr, buffer_size_bytes_, PROT_READ, MAP_SHARED, mmap_fd_, 0);
+  if (mmapped_buffer_ == MAP_FAILED) {
+    error_reporter_->Report("Mmap of '%s' failed.", filename);
+    return;
+  }
+}
+
+MMAPAllocation::~MMAPAllocation() {
+  if (valid()) {
+    munmap(const_cast<void*>(mmapped_buffer_), buffer_size_bytes_);
+  }
+  if (mmap_fd_ != -1) close(mmap_fd_);
+}
+
+const void* MMAPAllocation::base() const { return mmapped_buffer_; }
+
+size_t MMAPAllocation::bytes() const { return buffer_size_bytes_; }
+
+bool MMAPAllocation::valid() const { return mmapped_buffer_ != MAP_FAILED; }
+
+FileCopyAllocation::FileCopyAllocation(const char* filename,
+                                       ErrorReporter* error_reporter)
+    : Allocation(error_reporter) {
+  // Obtain the file size, using an alternative method that is does not
+  // require fstat for more compatibility.
+  std::unique_ptr<FILE, decltype(&fclose)> file(fopen(filename, "rb"), fclose);
+  if (!file) {
+    error_reporter_->Report("Could not open '%s'.", filename);
+    return;
+  }
+  // TODO(ahentz): Why did you think using fseek here was better for finding
+  // the size?
+  struct stat sb;
+  if (fstat(fileno(file.get()), &sb) != 0) {
+    error_reporter_->Report("Failed to get file size of '%s'.", filename);
+    return;
+  }
+  buffer_size_bytes_ = sb.st_size;
+  std::unique_ptr<char[]> buffer(new char[buffer_size_bytes_]);
+  if (!buffer) {
+    error_reporter_->Report("Malloc of buffer to hold copy of '%s' failed.",
+                            filename);
+    return;
+  }
+  size_t bytes_read =
+      fread(buffer.get(), sizeof(char), buffer_size_bytes_, file.get());
+  if (bytes_read != buffer_size_bytes_) {
+    error_reporter_->Report("Read of '%s' failed (too few bytes read).",
+                            filename);
+    return;
+  }
+  copied_buffer_ = std::move(buffer);
+}
+
+FileCopyAllocation::~FileCopyAllocation() {}
+
+const void* FileCopyAllocation::base() const { return copied_buffer_.get(); }
+
+size_t FileCopyAllocation::bytes() const { return buffer_size_bytes_; }
+
+bool FileCopyAllocation::valid() const { return copied_buffer_ != nullptr; }
+
+MemoryAllocation::MemoryAllocation(const void* ptr, size_t num_bytes,
+                                   ErrorReporter* error_reporter)
+    : Allocation(error_reporter) {
+  buffer_ = ptr;
+  buffer_size_bytes_ = num_bytes;
+}
+
+MemoryAllocation::~MemoryAllocation() {}
+
+const void* MemoryAllocation::base() const { return buffer_; }
+
+size_t MemoryAllocation::bytes() const { return buffer_size_bytes_; }
+
+bool MemoryAllocation::valid() const { return buffer_ != nullptr; }
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/allocation.h b/tensorflow/contrib/lite/allocation.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee8a7ccd0b232f9e48095567fd4aefe94f595bc3
--- /dev/null
+++ b/tensorflow/contrib/lite/allocation.h
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Main abstraction controlling the tflite interpreter.
+// See context.h for the API for defining operations (TfLiteRegistration).
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/simple_memory_arena.h"
+
+namespace tflite {
+
+// A memory allocation handle. This could be a mmap or shared memory.
+class Allocation {
+ public:
+  Allocation(ErrorReporter* error_reporter) : error_reporter_(error_reporter) {}
+  virtual ~Allocation() {}
+
+  // Base pointer of this allocation
+  virtual const void* base() const = 0;
+  // Size in bytes of the allocation
+  virtual size_t bytes() const = 0;
+  // Whether the allocation is valid
+  virtual bool valid() const = 0;
+
+ protected:
+  ErrorReporter* error_reporter_;
+};
+
+class MMAPAllocation : public Allocation {
+ public:
+  MMAPAllocation(const char* filename, ErrorReporter* error_reporter);
+  virtual ~MMAPAllocation();
+  const void* base() const override;
+  size_t bytes() const override;
+  bool valid() const override;
+
+ protected:
+  // Data required for mmap.
+  int mmap_fd_ = -1;  // mmap file descriptor
+  const void* mmapped_buffer_;
+  size_t buffer_size_bytes_ = 0;
+};
+
+class FileCopyAllocation : public Allocation {
+ public:
+  FileCopyAllocation(const char* filename, ErrorReporter* error_reporter);
+  virtual ~FileCopyAllocation();
+  const void* base() const override;
+  size_t bytes() const override;
+  bool valid() const override;
+
+ private:
+  // Data required for mmap.
+  std::unique_ptr<const char[]> copied_buffer_;
+  size_t buffer_size_bytes_ = 0;
+};
+
+class MemoryAllocation : public Allocation {
+ public:
+  // Allocates memory with the pointer and the number of bytes of the memory.
+  // The pointer has to remain alive and unchanged until the destructor is
+  // called.
+  MemoryAllocation(const void* ptr, size_t num_bytes,
+                   ErrorReporter* error_reporter);
+  virtual ~MemoryAllocation();
+  const void* base() const override;
+  size_t bytes() const override;
+  bool valid() const override;
+
+ private:
+  const void* buffer_;
+  size_t buffer_size_bytes_ = 0;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ALLOCATION_H_
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..d1fcdce70a34393defce0f2d0f6d5bb53f21c45e
--- /dev/null
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -0,0 +1,235 @@
+"""Generate Flatbuffer binary from json."""
+
+def tflite_copts():
+  """Defines compile time flags."""
+  copts = [
+      "-DFARMHASH_NO_CXX_STRING",
+  ] + select({
+          "//tensorflow:android_arm64": [
+              "-std=c++11",
+              "-O3",
+          ],
+          "//tensorflow:android_arm": [
+              "-mfpu=neon",
+              "-mfloat-abi=softfp",
+              "-std=c++11",
+              "-O3",
+          ],
+          "//tensorflow:android_x86": [
+              "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
+          ],
+          "//tensorflow:ios_x86_64": [
+              "-msse4.1",
+          ],
+          "//conditions:default": [],
+  }) + select({
+      "//tensorflow:with_default_optimizations": [],
+      "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
+  })
+
+  return copts
+
+LINKER_SCRIPT = "//tensorflow/contrib/lite/java/src/main/native:version_script.lds"
+
+def tflite_linkopts_unstripped():
+  """Defines linker flags to reduce size of TFLite binary.
+
+     These are useful when trying to investigate the relative size of the
+     symbols in TFLite.
+
+  Returns:
+     a select object with proper linkopts
+  """
+  return select({
+      "//tensorflow:android": [
+          "-Wl,--no-export-dynamic", # Only inc syms referenced by dynamic obj.
+          "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export.
+          "-Wl,--gc-sections", # Eliminate unused code and data.
+          "-Wl,--as-needed", # Don't link unused libs.
+      ],
+      "//tensorflow/contrib/lite:mips": [],
+      "//tensorflow/contrib/lite:mips64": [],
+      "//conditions:default": [
+          "-Wl,--icf=all",  # Identical code folding.
+      ],
+  })
+
+def tflite_jni_linkopts_unstripped():
+  """Defines linker flags to reduce size of TFLite binary with JNI.
+
+     These are useful when trying to investigate the relative size of the
+     symbols in TFLite.
+
+  Returns:
+     a select object with proper linkopts
+  """
+  return select({
+      "//tensorflow:android": [
+          "-Wl,--gc-sections", # Eliminate unused code and data.
+          "-Wl,--as-needed", # Don't link unused libs.
+      ],
+      "//tensorflow/contrib/lite:mips": [],
+      "//tensorflow/contrib/lite:mips64": [],
+      "//conditions:default": [
+          "-Wl,--icf=all",  # Identical code folding.
+      ],
+  })
+
+def tflite_linkopts():
+  """Defines linker flags to reduce size of TFLite binary."""
+  return tflite_linkopts_unstripped() + select({
+      "//tensorflow:android": [
+          "-s",  # Omit symbol table.
+      ],
+      "//conditions:default": [],
+  })
+
+def tflite_jni_linkopts():
+  """Defines linker flags to reduce size of TFLite binary with JNI."""
+  return tflite_jni_linkopts_unstripped() + select({
+      "//tensorflow:android": [
+          "-s",  # Omit symbol table.
+          "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
+      ],
+      "//conditions:default": [],
+  })
+
+
+def tflite_jni_binary(name,
+                      copts=tflite_copts(),
+                      linkopts=tflite_jni_linkopts(),
+                      linkscript=LINKER_SCRIPT,
+                      linkshared=1,
+                      linkstatic=1,
+                      deps=[]):
+  """Builds a jni binary for TFLite."""
+  linkopts = linkopts + [
+      "-Wl,--version-script",  # Export only jni functions & classes.
+      linkscript,
+  ]
+  native.cc_binary(
+      name=name,
+      copts=copts,
+      linkshared=linkshared,
+      linkstatic=linkstatic,
+      deps= deps + [linkscript],
+      linkopts=linkopts)
+
+def tf_to_tflite(name, src, options, out):
+  """Convert a frozen tensorflow graphdef to TF Lite's flatbuffer.
+
+  Args:
+    name: Name of rule.
+    src: name of the input graphdef file.
+    options: options passed to TOCO.
+    out: name of the output flatbuffer file.
+  """
+
+  toco = "//tensorflow/contrib/lite/toco:toco"
+  native.genrule(
+      name = name,
+      srcs=[src, options],
+      outs=[out],
+      cmd = ("$(location %s) " +
+             "   --input_file=$(location %s) " +
+             "   --output_file=$(location %s) " +
+             "   --input_format=TENSORFLOW_GRAPHDEF" +
+             "   --output_format=TFLITE" +
+             "   `cat $(location %s)`")
+            % (toco, src, out, options),
+      tools= [toco],
+  )
+
+def tflite_to_json(name, src, out):
+  """Convert a TF Lite flatbuffer to JSON.
+
+  Args:
+    name: Name of rule.
+    src: name of the input flatbuffer file.
+    out: name of the output JSON file.
+  """
+
+  flatc = "@flatbuffers//:flatc"
+  schema = "//tensorflow/contrib/lite/schema:schema.fbs"
+  native.genrule(
+      name = name,
+      srcs = [schema, src],
+      outs = [out],
+      cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.bin &&"  +
+             "$(location %s) --raw-binary --strict-json -t" +
+             " -o /tmp $(location %s) -- $${TMP}.bin &&" +
+             "cp $${TMP}.json $(location %s)")
+            % (src, flatc, schema, out),
+      tools = [flatc],
+  )
+
+def json_to_tflite(name, src, out):
+  """Convert a JSON file to TF Lite's flatbuffer.
+
+  Args:
+    name: Name of rule.
+    src: name of the input JSON file.
+    out: name of the output flatbuffer file.
+  """
+
+  flatc = "@flatbuffers//:flatc"
+  schema = "//tensorflow/contrib/lite/schema:schema_fbs"
+  native.genrule(
+      name = name,
+      srcs = [schema, src],
+      outs = [out],
+      cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.json &&"  +
+             "$(location %s) --raw-binary --unknown-json --allow-non-utf8 -b" +
+             " -o /tmp $(location %s) $${TMP}.json &&" +
+             "cp $${TMP}.bin $(location %s)")
+      % (src, flatc, schema, out),
+      tools = [flatc],
+  )
+
+def gen_zipped_test_files(name, files):
+  """Generate a zip file of tests by using :generate_examples.
+
+  Args:
+    name: Name of output. We will produce "`name`_files" as a target.
+    files: A list of zip file basenames.
+  """
+  toco = "//tensorflow/contrib/lite/toco:toco"
+  out_files = []
+  for f in files:
+    out_file = name + "/" + f
+    out_files.append(out_file)
+    native.genrule(
+        name = name + "_" + f + ".files",
+        cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
+               + " --zip_to_output " + f +
+               " $(@D) zipped"),
+        outs = [out_file],
+        tools = [
+            ":generate_examples",
+            toco,
+        ],
+    )
+
+  native.filegroup(
+      name = name,
+      srcs = out_files,
+  )
+
+def gen_selected_ops(name, model):
+  """Generate the library that includes only used ops.
+
+  Args:
+    name: Name of the generated library.
+    model: TFLite model to interpret.
+  """
+  out = name + "_registration.cc"
+  tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
+  tflite_path = "//tensorflow/contrib/lite"
+  native.genrule(
+      name = name,
+      srcs = [model],
+      outs = [out],
+      cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s")
+      % (tool, model, out, tflite_path[2:]),
+      tools = [tool],
+  )
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
new file mode 100755
index 0000000000000000000000000000000000000000..cbc96e6edd4358f6666731caa4c208c77d9c6c54
--- /dev/null
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -0,0 +1,31 @@
+#!/bin/bash -x
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+
+lipo \
+tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
+tensorflow/contrib/lite/gen/lib/ios_i386/libtensorflow-lite.a \
+tensorflow/contrib/lite/gen/lib/ios_armv7/libtensorflow-lite.a \
+tensorflow/contrib/lite/gen/lib/ios_armv7s/libtensorflow-lite.a \
+tensorflow/contrib/lite/gen/lib/ios_arm64/libtensorflow-lite.a \
+-create \
+-output tensorflow/contrib/lite/gen/lib/libtensorflow-lite.a
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..93072bf90bd8a18d9011a74c2eec95d86dbdce8a
--- /dev/null
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -0,0 +1,164 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// TODO(aselle): Consider using "if this then that" for testing.
+
+// Possible padding types (for convolutions)
+typedef enum {
+  kTfLitePaddingUnknown = 0,
+  kTfLitePaddingSame,
+  kTfLitePaddingValid,
+} TfLitePadding;
+
+typedef struct {
+  int width;
+  int height;
+} TfLitePaddingValues;
+
+// Possible fused activation functions.
+// TODO(aselle): rename to TfLiteActivation
+typedef enum {
+  kTfLiteActNone = 0,
+  kTfLiteActRelu,
+  kTfLiteActRelu1,
+  kTfLiteActRelu6,
+  kTfLiteActTanh,
+  kTfLiteActSignBit,
+  kTfLiteActSigmoid,
+} TfLiteFusedActivation;
+
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  TfLiteFusedActivation activation;
+} TfLiteConvParams;
+
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  int filter_width;
+  int filter_height;
+  TfLiteFusedActivation activation;
+  struct {
+    TfLitePaddingValues padding;
+  } computed;
+} TfLitePoolParams;
+
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  int depth_multiplier;
+  TfLiteFusedActivation activation;
+} TfLiteDepthwiseConvParams;
+
+typedef struct {
+  int rank;
+  TfLiteFusedActivation activation;
+} TfLiteSVDFParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteRNNParams;
+
+typedef struct { TfLiteFusedActivation activation; } TfLiteFullyConnectedParams;
+
+typedef enum {
+  kTfLiteLshProjectionUnknown = 0,
+  kTfLiteLshProjectionSparse = 1,
+  kTfLiteLshProjectionDense = 2,
+} TfLiteLSHProjectionType;
+
+typedef struct { TfLiteLSHProjectionType type; } TfLiteLSHProjectionParams;
+
+typedef struct { float beta; } TfLiteSoftmaxParams;
+
+typedef struct {
+  int axis;
+  TfLiteFusedActivation activation;
+} TfLiteConcatenationParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteAddParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteMulParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+} TfLiteL2NormParams;
+
+typedef struct {
+  int radius;
+  float bias;
+  float alpha;
+  float beta;
+} TfLiteLocalResponseNormParams;
+
+typedef struct {
+  TfLiteFusedActivation activation;
+  float cell_clip;
+  float proj_clip;
+} TfLiteLSTMParams;
+
+typedef struct {
+  int new_height;
+  int new_width;
+} TfLiteResizeBilinearParams;
+
+typedef struct {
+  // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
+  // For now we will fix the maximum possible number of dimensions.
+  int shape[8];
+  int num_dimensions;
+} TfLiteReshapeParams;
+
+typedef struct {
+  int ngram_size;
+  int max_skip_size;
+  bool include_all_ngrams;
+} TfLiteSkipGramParams;
+
+typedef struct {
+  int block_size;
+} TfLiteSpaceToDepthParams;
+
+typedef enum {
+  kTfLiteCombinerTypeSum = 0,
+  kTfLiteCombinerTypeMean = 1,
+  kTfLiteCombinerTypeSqrtn = 2,
+} TfLiteCombinerType;
+
+typedef struct {
+  TfLiteCombinerType combiner;
+} TfLiteEmbeddingLookupSparseParams;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_BUILTIN_OP_DATA_H_
diff --git a/tensorflow/contrib/lite/context.c b/tensorflow/contrib/lite/context.c
new file mode 100644
index 0000000000000000000000000000000000000000..c09e838c5c2e50e0f4a38eaf66e55246fd9a6f7f
--- /dev/null
+++ b/tensorflow/contrib/lite/context.c
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/context.h"
+#include <stdio.h>
+#include <string.h>
+
+TfLiteIntArray* TfLiteIntArrayCreate(int size) {
+  TfLiteIntArray* ret =
+      (TfLiteIntArray*)malloc(sizeof(*ret) + sizeof(ret->data[0]) * size);
+  ret->size = size;
+  return ret;
+}
+
+void TfLiteIntArrayPrint(const char* s, TfLiteIntArray* a) {
+  printf("%s: length=%d [", s, a->size);
+  if (a->size) printf("%d", a->data[0]);
+  int i = 1;
+  for (; i < a->size; i++) {
+    printf(" %d", a->data[i]);
+  }
+  printf("]\n");
+}
+
+int TfLiteIntArrayEqual(TfLiteIntArray* a, TfLiteIntArray* b) {
+  if (a == b) return 1;
+  if (a == NULL || b == NULL) return 0;
+  if (a->size != b->size) return 0;
+  int i = 0;
+  for (; i < a->size; i++)
+    if (a->data[i] != b->data[i]) return 0;
+  return 1;
+}
+
+TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src) {
+  if (!src) return NULL;
+  TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size);
+  if (ret) {
+    memcpy(ret->data, src->data, src->size * sizeof(int));
+  }
+  return ret;
+}
+
+void TfLiteIntArrayFree(TfLiteIntArray* a) { free(a); }
+
+void TfLiteTensorFree(TfLiteTensor* t) {
+  if (t->allocation_type == kTfLiteDynamic && t->data.raw) {
+    free(t->data.raw);
+  }
+  if (t->dims) TfLiteIntArrayFree(t->dims);
+  t->data.raw = NULL;
+  t->dims = NULL;
+}
+
+void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
+                       TfLiteQuantizationParams quantization, char* buffer,
+                       size_t size, TfLiteAllocationType allocation_type,
+                       const void* allocation, TfLiteTensor* tensor) {
+  TfLiteTensorFree(tensor);
+  tensor->type = type;
+  tensor->name = name;
+  tensor->dims = dims;
+  tensor->params = quantization;
+  tensor->data.raw = buffer;
+  tensor->bytes = size;
+  tensor->allocation_type = allocation_type;
+  tensor->allocation = allocation;
+}
+
+void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
+  if (tensor->allocation_type != kTfLiteDynamic) {
+    return;
+  }
+  if (!tensor->data.raw) {
+    tensor->data.raw = malloc(num_bytes);
+  } else if (num_bytes > tensor->bytes) {
+    tensor->data.raw = realloc(tensor->data.raw, num_bytes);
+  }
+  tensor->bytes = num_bytes;
+}
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
new file mode 100644
index 0000000000000000000000000000000000000000..41257a53b145cbe7e252c9d4de6ea7ef654431b5
--- /dev/null
+++ b/tensorflow/contrib/lite/context.h
@@ -0,0 +1,298 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file defines a C API for implementing operations in tflite.
+// These operations can be defined using c++ but the interface between
+// the interpreter and the operations are C.
+//
+// Summary of abstractions
+// TF_LITE_ENSURE - Self-sufficient error checking
+// TfLiteStatus - Status reporting
+// TfLiteIntArray - stores tensor shapes (dims),
+// TfLiteContext - allows an op to access the tensors
+// TfLiteTensor - tensor (a multidimensional array)
+// TfLiteNode - a single node or operation
+// TfLiteRegistration - the implementation of a conceptual operation.
+//
+// Some abstractions in this file are created and managed by Interpreter.
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef enum { kTfLiteOk = 0, kTfLiteError = 1 } TfLiteStatus;
+
+#define kOptionalTensor (-1)
+
+// Fixed size list of integers. Used for dimensions and inputs/outputs tensor
+// indices
+typedef struct {
+  int size;
+// gcc 6.1+ have a bug where flexible members aren't properly handled
+// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
+    __GNUC_MINOR__ >= 1
+  int data[0];
+#else
+  int data[];
+#endif
+} TfLiteIntArray;
+
+// Create a array of a given `size` (uninitialized entries).
+// This returns a pointer, that you must free using TfLiteIntArrayFree().
+TfLiteIntArray* TfLiteIntArrayCreate(int size);
+
+// Check if two tensors are equal. Returns 1 if they are equal, 0 otherwise.
+int TfLiteIntArrayEqual(TfLiteIntArray* a, TfLiteIntArray* b);
+
+// Create a copy of an array passed as `src`.
+// You are expected to free memory with TfLiteIntArrayFree
+TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src);
+
+// Free memory of array `v`.
+void TfLiteIntArrayFree(TfLiteIntArray* v);
+
+// Since we must not depend on any libraries, define a minimal subset of
+// error macros while avoiding names that have pre-conceived meanings like
+// assert and check.
+
+// Check whether value is true, and if not return kTfLiteError from
+// the current function (and report the error string msg).
+#define TF_LITE_ENSURE_MSG(context, value, msg)            \
+  do {                                                     \
+    if (!(value)) {                                        \
+      (context)->ReportError((context), __FILE__ " " msg); \
+      return kTfLiteError;                                 \
+    }                                                      \
+  } while (0)
+
+// Check whether the value `a` is true, and if not return kTfLiteError from
+// the current function, while also reporting the location of the error.
+#define TF_LITE_ENSURE(context, a)                                          \
+  do {                                                                      \
+    if (!(a)) {                                                             \
+      (context)->ReportError((context), "%s:%d %s was not true.", __FILE__, \
+                             __LINE__, #a);                                 \
+      return kTfLiteError;                                                  \
+    }                                                                       \
+  } while (0)
+
+#define TF_LITE_ENSURE_STATUS(a) \
+  do {                           \
+    if ((a) != kTfLiteOk) {      \
+      return kTfLiteError;       \
+    }                            \
+  } while (0)
+
+// Check whether the value `a == b` is true, and if not return kTfLiteError from
+// the current function, while also reporting the location of the error.
+// `a` and `b` may be evaluated more than once, so no side effects or
+// extremely expensive computations should be done.
+#define TF_LITE_ENSURE_EQ(context, a, b)                                       \
+  do {                                                                         \
+    if ((a) != (b)) {                                                          \
+      (context)->ReportError((context), "%s:%d %s != %s (%d != %d)", __FILE__, \
+                             __LINE__, #a, #b, (a), (b));                      \
+      return kTfLiteError;                                                     \
+    }                                                                          \
+  } while (0)
+
+#define TF_LITE_ENSURE_OK(context, status) \
+  do {                                     \
+    if ((status) != kTfLiteOk) {           \
+      return status;                       \
+    }                                      \
+  } while (0)
+
+// Types supported by tensor
+typedef enum {
+  kTfLiteNoType = 0,
+  kTfLiteFloat32 = 1,
+  kTfLiteInt32 = 2,
+  kTfLiteUInt8 = 3,
+  kTfLiteInt64 = 4,
+  kTfLiteString = 5,
+} TfLiteType;
+
+// Parameters for asymmetric quantization. Quantized values can be converted
+// back to float using:
+//    real_value = scale * (quantized_value - zero_point);
+typedef struct {
+  float scale;
+  int32_t zero_point;
+} TfLiteQuantizationParams;
+
+// A union of points that points to memory for a given tensor.
+typedef union {
+  int* i32;
+  float* f;
+  char* raw;
+  const char* raw_const;
+  uint8_t* uint8;
+} TfLitePtrUnion;
+
+// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
+// data (or data externally allocated). kTfLiteArenaRw is arena allocated
+// data. kTfLiteDynamic is for tensors that are allocated during evaluation.
+typedef enum {
+  kTfLiteMemNone = 0,
+  kTfLiteMmapRo,
+  kTfLiteArenaRw,
+  kTfLiteArenaRwPersistent,
+  kTfLiteDynamic,
+} TfLiteAllocationType;
+
+// An tensor in the interpreter system which is a wrapper around a buffer of
+// data including a dimensionality (or NULL if not currently defined).
+typedef struct {
+  // The data type specification for data stored in `data`. This affects
+  // what member of `data` union should be used.
+  TfLiteType type;
+  // A union of data pointers. The appropriate type should be used for a typed
+  // tensor based on `type`.
+  TfLitePtrUnion data;
+  // A pointer to a structure representing the dimensionality interpretation
+  // that the buffer should have. NOTE: the product of elements of `dims`
+  // and the element datatype size should be equal to `bytes` below.
+  TfLiteIntArray* dims;
+  // Quantization information.
+  TfLiteQuantizationParams params;
+  // How memory is mapped
+  //  kTfLiteMmapRo: Memory mapped read only.
+  //  i.e. weights
+  //  kTfLiteArenaRw: Arena allocated read write memory
+  //  (i.e. temporaries, outputs).
+  TfLiteAllocationType allocation_type;
+  // The number of bytes required to store the data of this Tensor. I.e.
+  // (bytes of each element) * dims[0] * ... * dims[n-1].  For example, if
+  // type is kTfLiteFloat32 and dims = {3, 2} then
+  // bytes = sizeof(float) * 3 * 2 = 4 * 3 * 2 = 24.
+  size_t bytes;
+
+  // An opaque pointer to a tflite::MMapAllocation
+  const void* allocation;
+
+  // Null-terminated name of this tensor.
+  const char* name;
+} TfLiteTensor;
+
+// Free memory of tensor `t`;
+void TfLiteTensorFree(TfLiteTensor* t);
+
+// Set all of a tensor's fields (and free any previously allocated data).
+void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
+                       TfLiteQuantizationParams quantization, char* buffer,
+                       size_t size, TfLiteAllocationType allocation_type,
+                       const void* allocation, TfLiteTensor* tensor);
+
+// Resize the allocated data of a (dynamic) tensor.
+void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor);
+
+typedef struct TfLiteContext {
+  // Number of tensors in the context.
+  int tensors_size;
+  // An tensor of tensors in the interpreter context (of length `tensors_size`)
+  TfLiteTensor* tensors;
+
+  // opaque full context ptr (an opaque c++ data structure)
+  void* impl_;
+
+  // Request memory pointer be resized. Updates dimensions on the tensor.
+  // NOTE: ResizeTensor takes ownership of newSize.
+  TfLiteStatus (*ResizeTensor)(struct TfLiteContext*, TfLiteTensor* tensor,
+                               TfLiteIntArray* new_size);
+  // Request that a error be reported with format string msg.
+  void (*ReportError)(struct TfLiteContext*, const char* msg, ...);
+
+  // Add `tensors_to_add` tensors, preserving pre-existing Tensor entries.  If
+  // non-null, the value pointed to by `first_new_tensor_index` will be set to
+  // the index of the first new tensor.
+  TfLiteStatus (*AddTensors)(struct TfLiteContext*, int tensors_to_add,
+                             int* first_new_tensor_index);
+
+  // TODO(ahentz): we should create a more general mechanism for this sort of
+  // library-global objects.
+  void* gemm_context;
+} TfLiteContext;
+
+// A structure representing an instance of a node.
+// This structure only exhibits the inputs, outputs and user defined data, not
+// other features like the type.
+typedef struct {
+  // Inputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* inputs;
+
+  // Outputs to this node expressed as indices into the simulator's tensors.
+  TfLiteIntArray* outputs;
+
+  // Temporary tensors uses during the computations. This usually contains no
+  // tensors, but ops are allowed to change that if they need scratch space of
+  // any sort.
+  TfLiteIntArray* temporaries;
+
+  // Opaque data provided by the node implementer through `Registration.init`.
+  void* user_data;
+
+  // Opaque data provided to the node if the node is a builtin.
+  void* builtin_data;
+} TfLiteNode;
+
+typedef struct {
+  // Initializes the op from serialized data.
+  // If a built-in op:
+  //   `buffer` is the op's params data (TfLiteLSTMParams*).
+  //   `length` is zero.
+  // If custom op:
+  //   `buffer` is the op's `custom_options`.
+  //   `length` is the size of the buffer.
+  //
+  // Returns a type-punned (i.e. void*) opaque data (e.g. a primitive pointer
+  // or an instance of a struct).
+  //
+  // The returned pointer will be stored with the node in the `user_data` field,
+  // accessible within prepare and invoke functions below.
+  // NOTE: if the data is already in the desired format, simply implement this
+  // function to return `nullptr` and implement the free function to be a no-op.
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+
+  // The pointer `buffer` is the data previously returned by an init invocation.
+  void (*free)(TfLiteContext* context, void* buffer);
+
+  // prepare is called when the inputs this node depends on have been resized.
+  // context->ResizeTensor() can be called to request output tensors to be
+  // resized.
+  //
+  // Returns kTfLiteOk on success.
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+
+  // Execute the node (should read node->inputs and output to node->outputs).
+  // Returns kTfLiteOk on success.
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+
+  // Builtin codes. If this kernel refers to a builtin this is the code
+  // of the builtin. This is so we can do marshaling to other frameworks like
+  // NN API. Note, it is the responsibility of the registration binder to
+  // set this properly.
+  int32_t builtin_code;
+} TfLiteRegistration;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
diff --git a/tensorflow/contrib/lite/context_test.cc b/tensorflow/contrib/lite/context_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20d6f69a25e9f0bb4323cf5d067b8ebd37bb3c23
--- /dev/null
+++ b/tensorflow/contrib/lite/context_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/context.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+
+// NOTE: this tests only the TfLiteIntArray part of context.
+// most of context.h is provided in the context of using it with interpreter.h
+// and interpreter.cc, so interpreter_test.cc tests context structures more
+// thoroughly.
+
+TEST(IntArray, TestIntArrayCreate) {
+  TfLiteIntArray* a = TfLiteIntArrayCreate(0);
+  TfLiteIntArray* b = TfLiteIntArrayCreate(3);
+  TfLiteIntArrayFree(a);
+  TfLiteIntArrayFree(b);
+}
+
+TEST(IntArray, TestIntArrayCopy) {
+  TfLiteIntArray* a = TfLiteIntArrayCreate(2);
+  a->data[0] = 22;
+  a->data[1] = 24;
+  TfLiteIntArray* b = TfLiteIntArrayCopy(a);
+  ASSERT_NE(a, b);
+  ASSERT_EQ(a->size, b->size);
+  ASSERT_EQ(a->data[0], b->data[0]);
+  ASSERT_EQ(a->data[1], b->data[1]);
+  TfLiteIntArrayFree(a);
+  TfLiteIntArrayFree(b);
+}
+
+TEST(IntArray, TestIntArrayEqual) {
+  TfLiteIntArray* a = TfLiteIntArrayCreate(1);
+  a->data[0] = 1;
+  TfLiteIntArray* b = TfLiteIntArrayCreate(2);
+  b->data[0] = 5;
+  b->data[1] = 6;
+  TfLiteIntArray* c = TfLiteIntArrayCreate(2);
+  c->data[0] = 5;
+  c->data[1] = 6;
+  TfLiteIntArray* d = TfLiteIntArrayCreate(2);
+  d->data[0] = 6;
+  d->data[1] = 6;
+  ASSERT_FALSE(TfLiteIntArrayEqual(a, b));
+  ASSERT_TRUE(TfLiteIntArrayEqual(b, c));
+  ASSERT_TRUE(TfLiteIntArrayEqual(b, b));
+  ASSERT_FALSE(TfLiteIntArrayEqual(c, d));
+  TfLiteIntArrayFree(a);
+  TfLiteIntArrayFree(b);
+  TfLiteIntArrayFree(c);
+  TfLiteIntArrayFree(d);
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7fce1ba3461066e6dada95246781440258d844c1
--- /dev/null
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+DOWNLOADS_DIR=tensorflow/contrib/lite/downloads
+BZL_FILE_PATH=tensorflow/workspace.bzl
+
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
+ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
+NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
+FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
+FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/master.zip"
+MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
+QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
+
+# TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
+#                   so work around it by patching the source.
+replace_by_sed() {
+  local regex="${1}"
+  shift
+  # Detect the version of sed by the return value of "--version" flag. GNU-sed
+  # supports "--version" while BSD-sed doesn't.
+  if ! sed --version >/dev/null 2>&1; then
+    # BSD-sed.
+    sed -i '' -e "${regex}" "$@"
+  else
+    # GNU-sed.
+    sed -i -e "${regex}" "$@"
+  fi
+}
+
+download_and_extract() {
+  local usage="Usage: download_and_extract URL DIR"
+  local url="${1:?${usage}}"
+  local dir="${2:?${usage}}"
+  echo "downloading ${url}" >&2
+  mkdir -p "${dir}"
+  if [[ "${url}" == *gz ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *zip ]]; then
+    tempdir=$(mktemp -d)
+    tempdir2=$(mktemp -d)
+
+    curl -L ${url} > ${tempdir}/zipped.zip
+    unzip ${tempdir}/zipped.zip -d ${tempdir2}
+
+    # If the zip file contains nested directories, extract the files from the
+    # inner directory.
+    if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
+      # unzip has no strip components, so unzip to a temp dir, and move the
+      # files we want from the tempdir to destination.
+      cp -R ${tempdir2}/*/* ${dir}/
+    else
+      cp -R ${tempdir2}/* ${dir}/
+    fi
+    rm -rf ${tempdir2} ${tempdir}
+  fi
+
+  # Delete any potential BUILD files, which would interfere with Bazel builds.
+  find "${dir}" -type f -name '*BUILD' -delete
+}
+
+download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
+download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
+download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
+download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
+download_and_extract "${NEON_2_SSE_URL}" "${DOWNLOADS_DIR}/neon_2_sse"
+download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
+download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
+download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
+download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
+
+replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
+  "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+
+cp ${DOWNLOADS_DIR}/models/models/* tensorflow/contrib/lite/examples/ios/simple/data/
+cp ${DOWNLOADS_DIR}/quantized_models/* tensorflow/contrib/lite/examples/ios/camera/data/
+
+echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/lite/error_reporter.cc b/tensorflow/contrib/lite/error_reporter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ba5384a94dbf9de03fb2e4e2f63074525eafa2d
--- /dev/null
+++ b/tensorflow/contrib/lite/error_reporter.cc
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include <cstdarg>
+#include <cstdio>
+
+namespace tflite {
+
+ErrorReporter::~ErrorReporter() {}
+
+int ErrorReporter::Report(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  int code = Report(format, args);
+  va_end(args);
+  return code;
+}
+
+// TODO(aselle): Make the name of ReportError on context the same, so
+// we can use the ensure functions w/o a context and w/ a reporter.
+int ErrorReporter::ReportError(void*, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  int code = Report(format, args);
+  va_end(args);
+  return code;
+}
+
+int StderrReporter::Report(const char* format, va_list args) {
+  return vfprintf(stderr, format, args);
+}
+
+ErrorReporter* DefaultErrorReporter() {
+  static StderrReporter* error_reporter = new StderrReporter;
+  return error_reporter;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/error_reporter.h b/tensorflow/contrib/lite/error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..637d456ce7a754c7da34e551869e49b4efd18e3b
--- /dev/null
+++ b/tensorflow/contrib/lite/error_reporter.h
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
+
+#include <cstdarg>
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// A functor that reports error to supporting system. Invoked similar to
+// printf.
+//
+// Usage:
+//  ErrorReporter foo;
+//  foo.Report("test %d\n", 5);
+// or
+//  va_list args;
+//  foo.Report("test %d\n", args); // where args is va_list
+//
+// Sublclass ErrorReporter to provide another reporting destination.
+// For example, if you have a GUI program, you might redirect to a buffer
+// that drives a GUI error log box.
+class ErrorReporter {
+ public:
+  virtual ~ErrorReporter();
+  virtual int Report(const char* format, va_list args) = 0;
+  int Report(const char* format, ...);
+  int ReportError(void*, const char* format, ...);
+};
+
+// An error reporter that simplify writes the message to stderr.
+struct StderrReporter : public ErrorReporter {
+  int Report(const char* format, va_list args) override;
+};
+
+// Return the default error reporter (output to stderr).
+ErrorReporter* DefaultErrorReporter();
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_ERROR_REPORTER_H_
diff --git a/tensorflow/contrib/lite/examples/ios/camera/.gitignore b/tensorflow/contrib/lite/examples/ios/camera/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9e8962f4c63562dd95896833f563abfbfb578ccc
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/.gitignore
@@ -0,0 +1,2 @@
+/data/*.txt
+/data/*.tflite
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.h b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..55891c3ee18318037fd14fe4160c6f012aeaae66
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.h
@@ -0,0 +1,21 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <UIKit/UIKit.h>
+
+@interface CameraExampleAppDelegate : UIResponder<UIApplicationDelegate>
+
+@property(strong, nonatomic) UIWindow* window;
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.m b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.m
new file mode 100644
index 0000000000000000000000000000000000000000..128266d53f560f3009f6435939ab48ae1c117a3a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleAppDelegate.m
@@ -0,0 +1,44 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "CameraExampleAppDelegate.h"
+
+@implementation CameraExampleAppDelegate
+
+@synthesize window = _window;
+
+- (BOOL)application:(UIApplication *)application
+    didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
+  [self.window makeKeyAndVisible];
+  return YES;
+}
+
+- (void)applicationWillResignActive:(UIApplication *)application {
+  [[UIApplication sharedApplication] setIdleTimerDisabled:NO];
+}
+
+- (void)applicationDidEnterBackground:(UIApplication *)application {
+}
+
+- (void)applicationWillEnterForeground:(UIApplication *)application {
+}
+
+- (void)applicationDidBecomeActive:(UIApplication *)application {
+  [[UIApplication sharedApplication] setIdleTimerDisabled:YES];
+}
+
+- (void)applicationWillTerminate:(UIApplication *)application {
+}
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.h b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb5800e86d365b56f1b52147c3f9cc8d7211f8c3
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.h
@@ -0,0 +1,48 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <AVFoundation/AVFoundation.h>
+#import <UIKit/UIKit.h>
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+
+@interface CameraExampleViewController
+    : UIViewController<UIGestureRecognizerDelegate, AVCaptureVideoDataOutputSampleBufferDelegate> {
+  IBOutlet UIView* previewView;
+  AVCaptureVideoPreviewLayer* previewLayer;
+  AVCaptureVideoDataOutput* videoDataOutput;
+  dispatch_queue_t videoDataOutputQueue;
+  UIView* flashView;
+  BOOL isUsingFrontFacingCamera;
+  NSMutableDictionary* oldPredictionValues;
+  NSMutableArray* labelLayers;
+  AVCaptureSession* session;
+
+  std::vector<std::string> labels;
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+
+  double total_latency;
+  int total_count;
+}
+@property(strong, nonatomic) CATextLayer* predictionTextLayer;
+
+- (IBAction)takePicture:(id)sender;
+- (IBAction)switchCameras:(id)sender;
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
new file mode 100644
index 0000000000000000000000000000000000000000..10f31bb6f17242c9f7f70f0648ec643f99c5ac86
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -0,0 +1,510 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "CameraExampleViewController.h"
+#import <AssertMacros.h>
+#import <AssetsLibrary/AssetsLibrary.h>
+#import <CoreImage/CoreImage.h>
+#import <ImageIO/ImageIO.h>
+
+#include <sys/time.h>
+#include <fstream>
+#include <iostream>
+#include <queue>
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+
+#define LOG(x) std::cerr
+
+// If you have your own model, modify this to the file name, and make sure
+// you've added the file to your app resources too.
+static NSString* model_file_name = @"mobilenet_quant_v1_224";
+static NSString* model_file_type = @"tflite";
+
+// If you have your own model, point this to the labels file.
+static NSString* labels_file_name = @"labels";
+static NSString* labels_file_type = @"txt";
+
+// These dimensions need to match those the model was trained with.
+static const int wanted_input_width = 224;
+static const int wanted_input_height = 224;
+static const int wanted_input_channels = 3;
+
+static NSString* FilePathForResourceName(NSString* name, NSString* extension) {
+  NSString* file_path = [[NSBundle mainBundle] pathForResource:name ofType:extension];
+  if (file_path == NULL) {
+    LOG(FATAL) << "Couldn't find '" << [name UTF8String] << "." << [extension UTF8String]
+               << "' in bundle.";
+  }
+  return file_path;
+}
+
+static void LoadLabels(NSString* file_name, NSString* file_type,
+                       std::vector<std::string>* label_strings) {
+  NSString* labels_path = FilePathForResourceName(file_name, file_type);
+  if (!labels_path) {
+    LOG(ERROR) << "Failed to find model proto at" << [file_name UTF8String]
+               << [file_type UTF8String];
+  }
+  std::ifstream t;
+  t.open([labels_path UTF8String]);
+  std::string line;
+  while (t) {
+    std::getline(t, line);
+    label_strings->push_back(line);
+  }
+  t.close();
+}
+
+// Returns the top N confidence values over threshold in the provided vector,
+// sorted by confidence in descending order.
+static void GetTopN(const uint8_t* prediction, const int prediction_size, const int num_results,
+                    const float threshold, std::vector<std::pair<float, int>>* top_results) {
+  // Will contain top N results in ascending order.
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int>>,
+                      std::greater<std::pair<float, int>>>
+      top_result_pq;
+
+  const long count = prediction_size;
+  for (int i = 0; i < count; ++i) {
+    const float value = prediction[i] / 255.0;
+    // Only add it if it beats the threshold and has a chance at being in
+    // the top N.
+    if (value < threshold) {
+      continue;
+    }
+
+    top_result_pq.push(std::pair<float, int>(value, i));
+
+    // If at capacity, kick the smallest value out.
+    if (top_result_pq.size() > num_results) {
+      top_result_pq.pop();
+    }
+  }
+
+  // Copy to output vector and reverse into descending order.
+  while (!top_result_pq.empty()) {
+    top_results->push_back(top_result_pq.top());
+    top_result_pq.pop();
+  }
+  std::reverse(top_results->begin(), top_results->end());
+}
+
+@interface CameraExampleViewController (InternalMethods)
+- (void)setupAVCapture;
+- (void)teardownAVCapture;
+@end
+
+@implementation CameraExampleViewController
+
+- (void)setupAVCapture {
+  NSError* error = nil;
+
+  session = [AVCaptureSession new];
+  if ([[UIDevice currentDevice] userInterfaceIdiom] == UIUserInterfaceIdiomPhone)
+    [session setSessionPreset:AVCaptureSessionPreset640x480];
+  else
+    [session setSessionPreset:AVCaptureSessionPresetPhoto];
+
+  AVCaptureDevice* device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+  AVCaptureDeviceInput* deviceInput =
+      [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
+
+  if (error != nil) {
+    NSLog(@"Failed to initialize AVCaptureDeviceInput. Note: This app doesn't work with simulator");
+    assert(NO);
+  }
+
+  if ([session canAddInput:deviceInput]) [session addInput:deviceInput];
+
+  videoDataOutput = [AVCaptureVideoDataOutput new];
+
+  NSDictionary* rgbOutputSettings =
+      [NSDictionary dictionaryWithObject:[NSNumber numberWithInt:kCMPixelFormat_32BGRA]
+                                  forKey:(id)kCVPixelBufferPixelFormatTypeKey];
+  [videoDataOutput setVideoSettings:rgbOutputSettings];
+  [videoDataOutput setAlwaysDiscardsLateVideoFrames:YES];
+  videoDataOutputQueue = dispatch_queue_create("VideoDataOutputQueue", DISPATCH_QUEUE_SERIAL);
+  [videoDataOutput setSampleBufferDelegate:self queue:videoDataOutputQueue];
+
+  if ([session canAddOutput:videoDataOutput]) [session addOutput:videoDataOutput];
+  [[videoDataOutput connectionWithMediaType:AVMediaTypeVideo] setEnabled:YES];
+
+  previewLayer = [[AVCaptureVideoPreviewLayer alloc] initWithSession:session];
+  [previewLayer setBackgroundColor:[[UIColor blackColor] CGColor]];
+  [previewLayer setVideoGravity:AVLayerVideoGravityResizeAspect];
+  CALayer* rootLayer = [previewView layer];
+  [rootLayer setMasksToBounds:YES];
+  [previewLayer setFrame:[rootLayer bounds]];
+  [rootLayer addSublayer:previewLayer];
+  [session startRunning];
+
+  if (error) {
+    NSString* title = [NSString stringWithFormat:@"Failed with error %d", (int)[error code]];
+    UIAlertController* alertController =
+        [UIAlertController alertControllerWithTitle:title
+                                            message:[error localizedDescription]
+                                     preferredStyle:UIAlertControllerStyleAlert];
+    UIAlertAction* dismiss =
+        [UIAlertAction actionWithTitle:@"Dismiss" style:UIAlertActionStyleDefault handler:nil];
+    [alertController addAction:dismiss];
+    [self presentViewController:alertController animated:YES completion:nil];
+    [self teardownAVCapture];
+  }
+}
+
+- (void)teardownAVCapture {
+  [previewLayer removeFromSuperlayer];
+}
+
+- (AVCaptureVideoOrientation)avOrientationForDeviceOrientation:
+    (UIDeviceOrientation)deviceOrientation {
+  AVCaptureVideoOrientation result = (AVCaptureVideoOrientation)(deviceOrientation);
+  if (deviceOrientation == UIDeviceOrientationLandscapeLeft)
+    result = AVCaptureVideoOrientationLandscapeRight;
+  else if (deviceOrientation == UIDeviceOrientationLandscapeRight)
+    result = AVCaptureVideoOrientationLandscapeLeft;
+  return result;
+}
+
+- (IBAction)takePicture:(id)sender {
+  if ([session isRunning]) {
+    [session stopRunning];
+    [sender setTitle:@"Continue" forState:UIControlStateNormal];
+
+    flashView = [[UIView alloc] initWithFrame:[previewView frame]];
+    [flashView setBackgroundColor:[UIColor whiteColor]];
+    [flashView setAlpha:0.f];
+    [[[self view] window] addSubview:flashView];
+
+    [UIView animateWithDuration:.2f
+        animations:^{
+          [flashView setAlpha:1.f];
+        }
+        completion:^(BOOL finished) {
+          [UIView animateWithDuration:.2f
+              animations:^{
+                [flashView setAlpha:0.f];
+              }
+              completion:^(BOOL finished) {
+                [flashView removeFromSuperview];
+                flashView = nil;
+              }];
+        }];
+
+  } else {
+    [session startRunning];
+    [sender setTitle:@"Freeze Frame" forState:UIControlStateNormal];
+  }
+}
+
+- (void)captureOutput:(AVCaptureOutput*)captureOutput
+    didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
+           fromConnection:(AVCaptureConnection*)connection {
+  CVPixelBufferRef pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+  CFRetain(pixelBuffer);
+  [self runModelOnFrame:pixelBuffer];
+  CFRelease(pixelBuffer);
+}
+
+- (void)runModelOnFrame:(CVPixelBufferRef)pixelBuffer {
+  assert(pixelBuffer != NULL);
+
+  OSType sourcePixelFormat = CVPixelBufferGetPixelFormatType(pixelBuffer);
+  int doReverseChannels;
+  if (kCVPixelFormatType_32ARGB == sourcePixelFormat) {
+    doReverseChannels = 1;
+  } else if (kCVPixelFormatType_32BGRA == sourcePixelFormat) {
+    doReverseChannels = 0;
+  } else {
+    assert(false);  // Unknown source format
+  }
+
+  const int sourceRowBytes = (int)CVPixelBufferGetBytesPerRow(pixelBuffer);
+  const int image_width = (int)CVPixelBufferGetWidth(pixelBuffer);
+  const int fullHeight = (int)CVPixelBufferGetHeight(pixelBuffer);
+
+  CVPixelBufferLockFlags unlockFlags = kNilOptions;
+  CVPixelBufferLockBaseAddress(pixelBuffer, unlockFlags);
+
+  unsigned char* sourceBaseAddr = (unsigned char*)(CVPixelBufferGetBaseAddress(pixelBuffer));
+  int image_height;
+  unsigned char* sourceStartAddr;
+  if (fullHeight <= image_width) {
+    image_height = fullHeight;
+    sourceStartAddr = sourceBaseAddr;
+  } else {
+    image_height = image_width;
+    const int marginY = ((fullHeight - image_width) / 2);
+    sourceStartAddr = (sourceBaseAddr + (marginY * sourceRowBytes));
+  }
+  const int image_channels = 4;
+  assert(image_channels >= wanted_input_channels);
+  uint8_t* in = sourceStartAddr;
+
+  int input = interpreter->inputs()[0];
+
+  uint8_t* out = interpreter->typed_tensor<uint8_t>(input);
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = out + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = in + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+
+  double startTimestamp = [[NSDate new] timeIntervalSince1970];
+  if (interpreter->Invoke() != kTfLiteOk) {
+    LOG(FATAL) << "Failed to invoke!";
+  }
+  double endTimestamp = [[NSDate new] timeIntervalSince1970];
+  total_latency += (endTimestamp - startTimestamp);
+  total_count += 1;
+  NSLog(@"Time: %.4lf, avg: %.4lf, count: %d", endTimestamp - startTimestamp,
+        total_latency / total_count, total_count);
+
+  const int output_size = 1000;
+  const int kNumResults = 5;
+  const float kThreshold = 0.1f;
+
+  std::vector<std::pair<float, int>> top_results;
+
+  uint8_t* output = interpreter->typed_output_tensor<uint8_t>(0);
+  GetTopN(output, output_size, kNumResults, kThreshold, &top_results);
+
+  NSMutableDictionary* newValues = [NSMutableDictionary dictionary];
+  for (const auto& result : top_results) {
+    const float confidence = result.first;
+    const int index = result.second;
+    NSString* labelObject = [NSString stringWithUTF8String:labels[index].c_str()];
+    NSNumber* valueObject = [NSNumber numberWithFloat:confidence];
+    [newValues setObject:valueObject forKey:labelObject];
+  }
+  dispatch_async(dispatch_get_main_queue(), ^(void) {
+    [self setPredictionValues:newValues];
+  });
+
+  CVPixelBufferUnlockBaseAddress(pixelBuffer, unlockFlags);
+
+  CVPixelBufferUnlockBaseAddress(pixelBuffer, 0);
+}
+
+- (void)dealloc {
+  [self teardownAVCapture];
+}
+
+- (void)didReceiveMemoryWarning {
+  [super didReceiveMemoryWarning];
+}
+
+- (void)viewDidLoad {
+  [super viewDidLoad];
+  labelLayers = [[NSMutableArray alloc] init];
+  oldPredictionValues = [[NSMutableDictionary alloc] init];
+
+  NSString* graph_path = FilePathForResourceName(model_file_name, @"tflite");
+  model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+  if (!model) {
+    LOG(FATAL) << "Failed to mmap model " << graph_path;
+  }
+  LOG(INFO) << "Loaded model " << graph_path;
+  model->error_reporter();
+  LOG(INFO) << "resolved reporter";
+
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  LoadLabels(labels_file_name, labels_file_type, &labels);
+
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (!interpreter) {
+    LOG(FATAL) << "Failed to construct interpreter";
+  }
+  if (interpreter->AllocateTensors() != kTfLiteOk) {
+    LOG(FATAL) << "Failed to allocate tensors!";
+  }
+
+  [self setupAVCapture];
+}
+
+- (void)viewDidUnload {
+  [super viewDidUnload];
+}
+
+- (void)viewWillAppear:(BOOL)animated {
+  [super viewWillAppear:animated];
+}
+
+- (void)viewDidAppear:(BOOL)animated {
+  [super viewDidAppear:animated];
+}
+
+- (void)viewWillDisappear:(BOOL)animated {
+  [super viewWillDisappear:animated];
+}
+
+- (void)viewDidDisappear:(BOOL)animated {
+  [super viewDidDisappear:animated];
+}
+
+- (BOOL)shouldAutorotateToInterfaceOrientation:(UIInterfaceOrientation)interfaceOrientation {
+  return (interfaceOrientation == UIInterfaceOrientationPortrait);
+}
+
+- (BOOL)prefersStatusBarHidden {
+  return YES;
+}
+
+- (void)setPredictionValues:(NSDictionary*)newValues {
+  const float decayValue = 0.75f;
+  const float updateValue = 0.25f;
+  const float minimumThreshold = 0.01f;
+
+  NSMutableDictionary* decayedPredictionValues = [[NSMutableDictionary alloc] init];
+  for (NSString* label in oldPredictionValues) {
+    NSNumber* oldPredictionValueObject = [oldPredictionValues objectForKey:label];
+    const float oldPredictionValue = [oldPredictionValueObject floatValue];
+    const float decayedPredictionValue = (oldPredictionValue * decayValue);
+    if (decayedPredictionValue > minimumThreshold) {
+      NSNumber* decayedPredictionValueObject = [NSNumber numberWithFloat:decayedPredictionValue];
+      [decayedPredictionValues setObject:decayedPredictionValueObject forKey:label];
+    }
+  }
+  oldPredictionValues = decayedPredictionValues;
+
+  for (NSString* label in newValues) {
+    NSNumber* newPredictionValueObject = [newValues objectForKey:label];
+    NSNumber* oldPredictionValueObject = [oldPredictionValues objectForKey:label];
+    if (!oldPredictionValueObject) {
+      oldPredictionValueObject = [NSNumber numberWithFloat:0.0f];
+    }
+    const float newPredictionValue = [newPredictionValueObject floatValue];
+    const float oldPredictionValue = [oldPredictionValueObject floatValue];
+    const float updatedPredictionValue = (oldPredictionValue + (newPredictionValue * updateValue));
+    NSNumber* updatedPredictionValueObject = [NSNumber numberWithFloat:updatedPredictionValue];
+    [oldPredictionValues setObject:updatedPredictionValueObject forKey:label];
+  }
+  NSArray* candidateLabels = [NSMutableArray array];
+  for (NSString* label in oldPredictionValues) {
+    NSNumber* oldPredictionValueObject = [oldPredictionValues objectForKey:label];
+    const float oldPredictionValue = [oldPredictionValueObject floatValue];
+    if (oldPredictionValue > 0.05f) {
+      NSDictionary* entry = @{@"label" : label, @"value" : oldPredictionValueObject};
+      candidateLabels = [candidateLabels arrayByAddingObject:entry];
+    }
+  }
+  NSSortDescriptor* sort = [NSSortDescriptor sortDescriptorWithKey:@"value" ascending:NO];
+  NSArray* sortedLabels =
+      [candidateLabels sortedArrayUsingDescriptors:[NSArray arrayWithObject:sort]];
+
+  const float leftMargin = 10.0f;
+  const float topMargin = 10.0f;
+
+  const float valueWidth = 48.0f;
+  const float valueHeight = 18.0f;
+
+  const float labelWidth = 246.0f;
+  const float labelHeight = 18.0f;
+
+  const float labelMarginX = 5.0f;
+  const float labelMarginY = 5.0f;
+
+  [self removeAllLabelLayers];
+
+  int labelCount = 0;
+  for (NSDictionary* entry in sortedLabels) {
+    NSString* label = [entry objectForKey:@"label"];
+    NSNumber* valueObject = [entry objectForKey:@"value"];
+    const float value = [valueObject floatValue];
+    const float originY = topMargin + ((labelHeight + labelMarginY) * labelCount);
+    const int valuePercentage = (int)roundf(value * 100.0f);
+
+    const float valueOriginX = leftMargin;
+    NSString* valueText = [NSString stringWithFormat:@"%d%%", valuePercentage];
+
+    [self addLabelLayerWithText:valueText
+                        originX:valueOriginX
+                        originY:originY
+                          width:valueWidth
+                         height:valueHeight
+                      alignment:kCAAlignmentRight];
+
+    const float labelOriginX = (leftMargin + valueWidth + labelMarginX);
+
+    [self addLabelLayerWithText:[label capitalizedString]
+                        originX:labelOriginX
+                        originY:originY
+                          width:labelWidth
+                         height:labelHeight
+                      alignment:kCAAlignmentLeft];
+
+    labelCount += 1;
+    if (labelCount > 4) {
+      break;
+    }
+  }
+}
+
+- (void)removeAllLabelLayers {
+  for (CATextLayer* layer in labelLayers) {
+    [layer removeFromSuperlayer];
+  }
+  [labelLayers removeAllObjects];
+}
+
+- (void)addLabelLayerWithText:(NSString*)text
+                      originX:(float)originX
+                      originY:(float)originY
+                        width:(float)width
+                       height:(float)height
+                    alignment:(NSString*)alignment {
+  CFTypeRef font = (CFTypeRef) @"Menlo-Regular";
+  const float fontSize = 12.0;
+  const float marginSizeX = 5.0f;
+  const float marginSizeY = 2.0f;
+
+  const CGRect backgroundBounds = CGRectMake(originX, originY, width, height);
+  const CGRect textBounds = CGRectMake((originX + marginSizeX), (originY + marginSizeY),
+                                       (width - (marginSizeX * 2)), (height - (marginSizeY * 2)));
+
+  CATextLayer* background = [CATextLayer layer];
+  [background setBackgroundColor:[UIColor blackColor].CGColor];
+  [background setOpacity:0.5f];
+  [background setFrame:backgroundBounds];
+  background.cornerRadius = 5.0f;
+
+  [[self.view layer] addSublayer:background];
+  [labelLayers addObject:background];
+
+  CATextLayer* layer = [CATextLayer layer];
+  [layer setForegroundColor:[UIColor whiteColor].CGColor];
+  [layer setFrame:textBounds];
+  [layer setAlignmentMode:alignment];
+  [layer setWrapped:YES];
+  [layer setFont:font];
+  [layer setFontSize:fontSize];
+  layer.contentsScale = [[UIScreen mainScreen] scale];
+  [layer setString:text];
+
+  [[self.view layer] addSublayer:layer];
+  [labelLayers addObject:layer];
+}
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Info.plist b/tensorflow/contrib/lite/examples/ios/camera/Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..f3d96bab162a707df4df8655354af5a54d1e985e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/Info.plist
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleDisplayName</key>
+	<string>tflite_camera_example</string>
+	<key>CFBundleExecutable</key>
+	<string>${EXECUTABLE_NAME}</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>${PRODUCT_NAME}</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleVersion</key>
+	<string>1.0</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>NSCameraUsageDescription</key>
+	<string>Capture images to detect object</string>
+	<key>UIMainStoryboardFile</key>
+	<string>MainStoryboard_iPhone</string>
+	<key>UIRequiresFullScreen</key>
+	<true/>
+	<key>UIStatusBarHidden</key>
+	<true/>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+	</array>
+</dict>
+</plist>
diff --git a/tensorflow/contrib/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard b/tensorflow/contrib/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..0f10a22e415bd2519e90dd6bfac8b2ad6230caab
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/MainStoryboard_iPhone.storyboard
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="9531" systemVersion="15E65" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" initialViewController="2">
+    <dependencies>
+        <deployment identifier="iOS"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="9529"/>
+    </dependencies>
+    <scenes>
+        <!--Camera Example View Controller-->
+        <scene sceneID="5">
+            <objects>
+                <viewController id="2" customClass="CameraExampleViewController" sceneMemberID="viewController">
+                    <view key="view" contentMode="scaleToFill" id="3">
+                        <rect key="frame" x="0.0" y="0.0" width="320" height="568"/>
+                        <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                        <subviews>
+                            <view contentMode="scaleToFill" id="12">
+                                <rect key="frame" x="0.0" y="0.0" width="320" height="522"/>
+                                <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                                <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="calibratedWhite"/>
+                                <gestureRecognizers/>
+                            </view>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" id="iD8-yH-eWH">
+                                <rect key="frame" x="0.0" y="454" width="320" height="33"/>
+                                <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                <color key="backgroundColor" red="0.0" green="0.0" blue="0.0" alpha="1" colorSpace="calibratedRGB"/>
+                                <fontDescription key="fontDescription" name="Menlo-Regular" family="Menlo" pointSize="20"/>
+                                <state key="normal" title="Freeze Frame">
+                                    <color key="titleColor" white="1" alpha="1" colorSpace="calibratedWhite"/>
+                                    <color key="titleShadowColor" white="0.5" alpha="1" colorSpace="calibratedWhite"/>
+                                </state>
+                                <connections>
+                                    <action selector="takePicture:" destination="2" eventType="touchUpInside" id="BTy-7E-XUS"/>
+                                </connections>
+                            </button>
+                        </subviews>
+                        <color key="backgroundColor" red="0.0" green="0.0" blue="0.0" alpha="1" colorSpace="calibratedRGB"/>
+                    </view>
+                    <connections>
+                        <outlet property="previewView" destination="12" id="13"/>
+                    </connections>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="4" sceneMemberID="firstResponder"/>
+            </objects>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/contrib/lite/examples/ios/camera/Podfile b/tensorflow/contrib/lite/examples/ios/camera/Podfile
new file mode 100644
index 0000000000000000000000000000000000000000..4ae6fb6b94e4489f63506b05a2f348b7daafd3b7
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/Podfile
@@ -0,0 +1,5 @@
+platform :ios, '8.0'
+inhibit_all_warnings!
+
+target 'tflite_camera_example'
+       pod 'TensorFlow-experimental'
diff --git a/tensorflow/contrib/lite/examples/ios/camera/data/.gitignore b/tensorflow/contrib/lite/examples/ios/camera/data/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/contrib/lite/examples/ios/camera/main.mm b/tensorflow/contrib/lite/examples/ios/camera/main.mm
new file mode 100644
index 0000000000000000000000000000000000000000..1a9e542f7c9a5b09be6463437c3a8e4a5afeda6d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/main.mm
@@ -0,0 +1,28 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <UIKit/UIKit.h>
+
+#import "CameraExampleAppDelegate.h"
+
+int main(int argc, char* argv[]) {
+  int retVal = 0;
+
+  @autoreleasepool {
+    retVal =
+        UIApplicationMain(argc, argv, nil, NSStringFromClass([CameraExampleAppDelegate class]));
+  }
+  return retVal;
+}
diff --git a/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..c98183276bd60d2a0ad023ba26aad12572a02786
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -0,0 +1,419 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 46;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1C3C9DCA1ED3AB4200B8B5FA /* main.mm */; };
+		1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */; };
+		1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */; };
+		1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */; };
+		1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */; };
+		1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */; };
+		1CDB2D4E1ED3AA35007929E9 /* Info.plist in Resources */ = {isa = PBXBuildFile; fileRef = 1CDB2D4D1ED3AA35007929E9 /* Info.plist */; };
+		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
+		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
+		AC1F82691FBA3F930052BA77 /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */; };
+		ACA1A4CA1FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
+		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
+		1C3C9DCA1ED3AB4200B8B5FA /* main.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = main.mm; sourceTree = "<group>"; };
+		1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tflite_camera_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; path = MainStoryboard_iPhone.storyboard; sourceTree = "<group>"; };
+		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
+		1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreMedia.framework; path = System/Library/Frameworks/CoreMedia.framework; sourceTree = SDKROOT; };
+		1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
+		1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleAppDelegate.h; sourceTree = "<group>"; };
+		1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = CameraExampleAppDelegate.m; sourceTree = "<group>"; };
+		1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CameraExampleViewController.h; sourceTree = "<group>"; };
+		1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = CameraExampleViewController.mm; sourceTree = "<group>"; };
+		1CDB2D4D1ED3AA35007929E9 /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tflite_camera_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
+		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
+		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
+		AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
+		ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		1C564C0A1ED3A92E00087306 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				AC1F82691FBA3F930052BA77 /* libtensorflow-lite.a in Frameworks */,
+				1CB47D491ED3AD1700DF7666 /* AVFoundation.framework in Frameworks */,
+				1CA5EB931ED3ABFB00247A34 /* CoreMedia.framework in Frameworks */,
+				54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		24D7686C331131624F4454A0 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				AC1F82681FBA3F930052BA77 /* libtensorflow-lite.a */,
+				1CB47D481ED3AD1700DF7666 /* AVFoundation.framework */,
+				1CA5EB921ED3ABFB00247A34 /* CoreMedia.framework */,
+				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
+				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
+				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
+				3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		3E9FC355632FB928EA23BEED /* Pods */ = {
+			isa = PBXGroup;
+			children = (
+				3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */,
+				55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */,
+			);
+			name = Pods;
+			sourceTree = "<group>";
+		};
+		591157921CF4011C00C31E3A = {
+			isa = PBXGroup;
+			children = (
+				1C99111B1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard */,
+				1C3C9DCA1ED3AB4200B8B5FA /* main.mm */,
+				1CDB2D4D1ED3AA35007929E9 /* Info.plist */,
+				1CDB2D421ED3A9CD007929E9 /* CameraExampleAppDelegate.h */,
+				1CDB2D431ED3A9CD007929E9 /* CameraExampleAppDelegate.m */,
+				1CDB2D441ED3A9CD007929E9 /* CameraExampleViewController.h */,
+				1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */,
+				59A3CFF31CF4E68100C4259F /* data */,
+				5911579C1CF4011C00C31E3A /* Products */,
+				3E9FC355632FB928EA23BEED /* Pods */,
+				24D7686C331131624F4454A0 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		5911579C1CF4011C00C31E3A /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		59A3CFF31CF4E68100C4259F /* data */ = {
+			isa = PBXGroup;
+			children = (
+				ACA1A4C91FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite */,
+				AC1F82641FBA3CBD0052BA77 /* labels.txt */,
+			);
+			path = data;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		1C564C0C1ED3A92E00087306 /* tflite_camera_example */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example" */;
+			buildPhases = (
+				66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */,
+				1C564C091ED3A92E00087306 /* Sources */,
+				1C564C0A1ED3A92E00087306 /* Frameworks */,
+				1C564C0B1ED3A92E00087306 /* Resources */,
+				00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */,
+				5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = tflite_camera_example;
+			productName = tflite_camera_example;
+			productReference = 1C564C0D1ED3A92E00087306 /* tflite_camera_example.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		591157931CF4011C00C31E3A /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 0830;
+				LastUpgradeCheck = 0830;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					1C564C0C1ED3A92E00087306 = {
+						CreatedOnToolsVersion = 8.3.2;
+						DevelopmentTeam = EQHXZ8M8AV;
+						ProvisioningStyle = Automatic;
+					};
+				};
+			};
+			buildConfigurationList = 591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example" */;
+			compatibilityVersion = "Xcode 3.2";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 591157921CF4011C00C31E3A;
+			productRefGroup = 5911579C1CF4011C00C31E3A /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				1C564C0C1ED3A92E00087306 /* tflite_camera_example */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		1C564C0B1ED3A92E00087306 /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				ACA1A4CA1FBB6C28009B8D86 /* mobilenet_quant_v1_224.tflite in Resources */,
+				1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */,
+				1CDB2D4E1ED3AA35007929E9 /* Info.plist in Resources */,
+				AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXShellScriptBuildPhase section */
+		00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+			);
+			name = "[CP] Embed Pods Frameworks";
+			outputPaths = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-frameworks.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+			);
+			name = "[CP] Copy Pods Resources";
+			outputPaths = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-resources.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
+		66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+				"${PODS_PODFILE_DIR_PATH}/Podfile.lock",
+				"${PODS_ROOT}/Manifest.lock",
+			);
+			name = "[CP] Check Pods Manifest.lock";
+			outputPaths = (
+				"$(DERIVED_FILE_DIR)/Pods-tflite_camera_example-checkManifestLockResult.txt",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
+			showEnvVarsInLog = 0;
+		};
+/* End PBXShellScriptBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		1C564C091ED3A92E00087306 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */,
+				1CDB2D491ED3A9CD007929E9 /* CameraExampleAppDelegate.m in Sources */,
+				1C3C9DCC1ED3AB4200B8B5FA /* main.mm in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		1C564C361ED3A92E00087306 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			baseConfigurationReference = 3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				INFOPLIST_FILE = Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+				SWIFT_VERSION = 3.0;
+			};
+			name = Debug;
+		};
+		1C564C371ED3A92E00087306 /* Release */ = {
+			isa = XCBuildConfiguration;
+			baseConfigurationReference = 55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				INFOPLIST_FILE = Info.plist;
+				IPHONEOS_DEPLOYMENT_TARGET = 10.3;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.pf.tf-camera-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule";
+				SWIFT_VERSION = 3.0;
+			};
+			name = Release;
+		};
+		591157B01CF4011D00C31E3A /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = (
+					"$(inherited)",
+					../../../../../../,
+					../../../downloads/flatbuffers/include/,
+					../../../downloads/eigen/,
+					../../../downloads/,
+				);
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		591157B11CF4011D00C31E3A /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				HEADER_SEARCH_PATHS = (
+					"$(inherited)",
+					../../../../../../,
+					../../../downloads/flatbuffers/include/,
+					../../../downloads/eigen/,
+					../../../downloads/,
+				);
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		1C564C351ED3A92E00087306 /* Build configuration list for PBXNativeTarget "tflite_camera_example" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				1C564C361ED3A92E00087306 /* Debug */,
+				1C564C371ED3A92E00087306 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		591157961CF4011C00C31E3A /* Build configuration list for PBXProject "tflite_camera_example" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				591157B01CF4011D00C31E3A /* Debug */,
+				591157B11CF4011D00C31E3A /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 591157931CF4011C00C31E3A /* Project object */;
+}
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..94046d9728258901091f018fd0d081651145f400
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.h
@@ -0,0 +1,21 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <UIKit/UIKit.h>
+
+@interface AppDelegate : UIResponder<UIApplicationDelegate>
+
+@property(strong, nonatomic) UIWindow *window;
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
new file mode 100644
index 0000000000000000000000000000000000000000..d1215fa0bffd978b4aaadbd8bc13b07723703c9a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/AppDelegate.mm
@@ -0,0 +1,48 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "AppDelegate.h"
+
+#import "RunModelViewController.h"
+
+@implementation AppDelegate
+
+- (BOOL)application:(UIApplication *)application
+    didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
+
+  UITabBarController *bar = [[UITabBarController alloc] init];
+  [bar setViewControllers:@[ [[RunModelViewController alloc] init] ]];
+  bar.selectedIndex = 0;
+  self.window = [[UIWindow alloc] initWithFrame:[[UIScreen mainScreen] bounds]];
+  self.window.rootViewController = bar;
+  [self.window makeKeyAndVisible];
+  return YES;
+}
+
+- (void)applicationWillResignActive:(UIApplication *)application {
+}
+
+- (void)applicationDidEnterBackground:(UIApplication *)application {
+}
+
+- (void)applicationWillEnterForeground:(UIApplication *)application {
+}
+
+- (void)applicationDidBecomeActive:(UIApplication *)application {
+}
+
+- (void)applicationWillTerminate:(UIApplication *)application {
+}
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/Podfile b/tensorflow/contrib/lite/examples/ios/simple/Podfile
new file mode 100644
index 0000000000000000000000000000000000000000..1740ad64573a84fae6de0fcf284eb06afec67e25
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/Podfile
@@ -0,0 +1,5 @@
+platform :ios, '8.0'
+inhibit_all_warnings!
+
+target 'tf_simple_example'
+       pod 'TensorFlow-experimental'
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist b/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..1a3eaa8a2c18d1cd24dfd475d396b00ec4d86c9d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModel-Info.plist
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleDisplayName</key>
+	<string>tflite-simple-example</string>
+	<key>CFBundleExecutable</key>
+	<string>tf_simple_example</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>ios-app</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleSignature</key>
+	<string>????</string>
+	<key>CFBundleVersion</key>
+	<string>1.0</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>UILaunchStoryboardName</key>
+	<string>RunModelViewController</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+		<string>UIInterfaceOrientationLandscapeLeft</string>
+		<string>UIInterfaceOrientationLandscapeRight</string>
+	</array>
+</dict>
+</plist>
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4b358b4eb7f6ba109638405091b798d30bd1768
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.h
@@ -0,0 +1,24 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <UIKit/UIKit.h>
+
+@interface RunModelViewController : UIViewController
+
+- (IBAction)getUrl:(id)sender;
+
+@property(weak, nonatomic) IBOutlet UITextView *urlContentTextView;
+@property(weak, nonatomic) IBOutlet UITextField *urlTextField;
+
+@end
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
new file mode 100644
index 0000000000000000000000000000000000000000..0dafb1f61e19f46bb3b17f07c55e09f5813ed560
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.mm
@@ -0,0 +1,221 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "RunModelViewController.h"
+
+#include <pthread.h>
+#include <unistd.h>
+#include <fstream>
+#include <iostream>
+#include <queue>
+#include <sstream>
+#include <string>
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+
+#include "ios_image_load.h"
+
+#define LOG(x) std::cerr
+#define CHECK(x)                  \
+  if (!(x)) {                     \
+    LOG(ERROR) << #x << "failed"; \
+    exit(1);                      \
+  }
+
+NSString* RunInferenceOnImage();
+
+@interface RunModelViewController ()
+@end
+
+@implementation RunModelViewController {
+}
+
+- (IBAction)getUrl:(id)sender {
+  NSString* inference_result = RunInferenceOnImage();
+  self.urlContentTextView.text = inference_result;
+}
+
+@end
+
+// Returns the top N confidence values over threshold in the provided vector,
+// sorted by confidence in descending order.
+static void GetTopN(const float* prediction, const int prediction_size, const int num_results,
+                    const float threshold, std::vector<std::pair<float, int> >* top_results) {
+  // Will contain top N results in ascending order.
+  std::priority_queue<std::pair<float, int>, std::vector<std::pair<float, int> >,
+                      std::greater<std::pair<float, int> > >
+      top_result_pq;
+
+  const long count = prediction_size;
+  for (int i = 0; i < count; ++i) {
+    const float value = prediction[i];
+
+    // Only add it if it beats the threshold and has a chance at being in
+    // the top N.
+    if (value < threshold) {
+      continue;
+    }
+
+    top_result_pq.push(std::pair<float, int>(value, i));
+
+    // If at capacity, kick the smallest value out.
+    if (top_result_pq.size() > num_results) {
+      top_result_pq.pop();
+    }
+  }
+
+  // Copy to output vector and reverse into descending order.
+  while (!top_result_pq.empty()) {
+    top_results->push_back(top_result_pq.top());
+    top_result_pq.pop();
+  }
+  std::reverse(top_results->begin(), top_results->end());
+}
+
+NSString* FilePathForResourceName(NSString* name, NSString* extension) {
+  NSString* file_path = [[NSBundle mainBundle] pathForResource:name ofType:extension];
+  if (file_path == NULL) {
+    LOG(FATAL) << "Couldn't find '" << [name UTF8String] << "." << [extension UTF8String]
+               << "' in bundle.";
+  }
+  return file_path;
+}
+
+NSString* RunInferenceOnImage() {
+  std::string graph;
+  const int num_threads = 1;
+  std::string input_layer_type = "float";
+  std::vector<int> sizes = {1, 224, 224, 3};
+
+  NSString* graph_path = FilePathForResourceName(@"mobilenet_v1_1.0_224", @"tflite");
+
+  std::unique_ptr<tflite::FlatBufferModel> model(
+      tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]));
+  if (!model) {
+    LOG(FATAL) << "Failed to mmap model " << graph;
+  }
+  LOG(INFO) << "Loaded model " << graph;
+  model->error_reporter();
+  LOG(INFO) << "resolved reporter";
+
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+  tflite::MutableOpResolver resolver;
+  RegisterSelectedOps(&resolver);
+#else
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+#endif
+
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (!interpreter) {
+    LOG(FATAL) << "Failed to construct interpreter";
+  }
+
+  if (num_threads != -1) {
+    interpreter->SetNumThreads(num_threads);
+  }
+
+  int input = interpreter->inputs()[0];
+
+  if (input_layer_type != "string") {
+    interpreter->ResizeInputTensor(input, sizes);
+  }
+
+  if (interpreter->AllocateTensors() != kTfLiteOk) {
+    LOG(FATAL) << "Failed to allocate tensors!";
+  }
+
+  // Read the label list
+  NSString* labels_path = FilePathForResourceName(@"labels", @"txt");
+  std::vector<std::string> label_strings;
+  std::ifstream t;
+  t.open([labels_path UTF8String]);
+  std::string line;
+  while (t) {
+    std::getline(t, line);
+    label_strings.push_back(line);
+  }
+  t.close();
+
+  // Read the Grace Hopper image.
+  NSString* image_path = FilePathForResourceName(@"grace_hopper", @"jpg");
+  int image_width;
+  int image_height;
+  int image_channels;
+  std::vector<uint8_t> image_data =
+      LoadImageFromFile([image_path UTF8String], &image_width, &image_height, &image_channels);
+  const int wanted_width = 224;
+  const int wanted_height = 224;
+  const int wanted_channels = 3;
+  const float input_mean = 127.5f;
+  const float input_std = 127.5f;
+  assert(image_channels >= wanted_channels);
+  uint8_t* in = image_data.data();
+  float* out = interpreter->typed_tensor<float>(input);
+  for (int y = 0; y < wanted_height; ++y) {
+    const int in_y = (y * image_height) / wanted_height;
+    uint8_t* in_row = in + (in_y * image_width * image_channels);
+    float* out_row = out + (y * wanted_width * wanted_channels);
+    for (int x = 0; x < wanted_width; ++x) {
+      const int in_x = (x * image_width) / wanted_width;
+      uint8_t* in_pixel = in_row + (in_x * image_channels);
+      float* out_pixel = out_row + (x * wanted_channels);
+      for (int c = 0; c < wanted_channels; ++c) {
+        out_pixel[c] = (in_pixel[c] - input_mean) / input_std;
+      }
+    }
+  }
+
+  if (interpreter->Invoke() != kTfLiteOk) {
+    LOG(FATAL) << "Failed to invoke!";
+  }
+
+  float* output = interpreter->typed_output_tensor<float>(0);
+  const int output_size = 1000;
+  const int kNumResults = 5;
+  const float kThreshold = 0.1f;
+  std::vector<std::pair<float, int> > top_results;
+  GetTopN(output, output_size, kNumResults, kThreshold, &top_results);
+
+  std::stringstream ss;
+  ss.precision(3);
+  for (const auto& result : top_results) {
+    const float confidence = result.first;
+    const int index = result.second;
+
+    ss << index << " " << confidence << "  ";
+
+    // Write out the result as a string
+    if (index < label_strings.size()) {
+      // just for safety: theoretically, the output is under 1000 unless there
+      // is some numerical issues leading to a wrong prediction.
+      ss << label_strings[index];
+    } else {
+      ss << "Prediction: " << index;
+    }
+
+    ss << "\n";
+  }
+
+  LOG(INFO) << "Predictions: " << ss.str();
+
+  std::string predictions = ss.str();
+  NSString* result = @"";
+  result = [NSString stringWithFormat:@"%@ - %s", result, predictions.c_str()];
+
+  return result;
+}
diff --git a/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.xib b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.xib
new file mode 100644
index 0000000000000000000000000000000000000000..93f334b9850c6f5f22455b3d14a075c17a7c9171
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/RunModelViewController.xib
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.XIB" version="3.0" toolsVersion="9531" systemVersion="15D21" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES">
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="9529"/>
+    </dependencies>
+    <objects>
+        <placeholder placeholderIdentifier="IBFilesOwner" id="-1" userLabel="File's Owner" customClass="RunModelViewController">
+            <connections>
+                <outlet property="urlContentTextView" destination="quY-AK-ZCn" id="YjW-BO-1Ta"/>
+                <outlet property="urlTextField" destination="hPw-q5-vh5" id="wmc-b6-2CV"/>
+                <outlet property="view" destination="1" id="iHm-Rr-4wj"/>
+            </connections>
+        </placeholder>
+        <placeholder placeholderIdentifier="IBFirstResponder" id="-2" customClass="UIResponder"/>
+        <view contentMode="scaleToFill" id="1">
+            <rect key="frame" x="0.0" y="0.0" width="320" height="568"/>
+            <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+            <subviews>
+                <textView clipsSubviews="YES" contentMode="scaleToFill" fixedFrame="YES" editable="NO" text="The results of running the model will appear here." selectable="NO" translatesAutoresizingMaskIntoConstraints="NO" id="quY-AK-ZCn">
+                    <rect key="frame" x="40" y="99" width="240" height="168"/>
+                    <color key="backgroundColor" white="1" alpha="1" colorSpace="calibratedWhite"/>
+                    <fontDescription key="fontDescription" type="system" pointSize="14"/>
+                    <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
+                </textView>
+                <button opaque="NO" contentMode="scaleToFill" fixedFrame="YES" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="AAC-Bk-PCC">
+                    <rect key="frame" x="76" y="37" width="168" height="30"/>
+                    <color key="backgroundColor" white="0.33333333333333331" alpha="1" colorSpace="calibratedWhite"/>
+                    <state key="normal" title="Run Model">
+                        <color key="titleShadowColor" white="0.5" alpha="1" colorSpace="calibratedWhite"/>
+                    </state>
+                    <connections>
+                        <action selector="getUrl:" destination="-1" eventType="touchUpInside" id="mdP-nK-k9T"/>
+                    </connections>
+                </button>
+            </subviews>
+            <color key="backgroundColor" red="0.78314738357315861" green="0.79869981749999996" blue="0.56305065858222869" alpha="1" colorSpace="calibratedRGB"/>
+        </view>
+        <textField opaque="NO" clipsSubviews="YES" contentMode="scaleToFill" contentHorizontalAlignment="left" contentVerticalAlignment="center" text="http://localhost:8080" borderStyle="roundedRect" placeholder="Enter URL" minimumFontSize="17" id="hPw-q5-vh5">
+            <rect key="frame" x="0.0" y="0.0" width="280" height="30"/>
+            <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+            <fontDescription key="fontDescription" type="system" pointSize="14"/>
+            <textInputTraits key="textInputTraits"/>
+            <point key="canvasLocation" x="795" y="44"/>
+        </textField>
+    </objects>
+</document>
diff --git a/tensorflow/contrib/lite/examples/ios/simple/data/grace_hopper.jpg b/tensorflow/contrib/lite/examples/ios/simple/data/grace_hopper.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d2a427810f679db537236c5430873a81a62ef412
Binary files /dev/null and b/tensorflow/contrib/lite/examples/ios/simple/data/grace_hopper.jpg differ
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
new file mode 100644
index 0000000000000000000000000000000000000000..98934ce41d349b33d4fc010a39a956e52f3d5721
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.h
@@ -0,0 +1,23 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
+#define TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
+
+#include <vector>
+
+std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width,
+                                       int* out_height, int* out_channels);
+
+#endif  // TENSORFLOW_EXAMPLES_IOS_IOS_IMAGE_LOAD_H_
diff --git a/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
new file mode 100644
index 0000000000000000000000000000000000000000..cb0fe1a7650c572d3745066431f2759daa94ffc9
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/ios_image_load.mm
@@ -0,0 +1,82 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "ios_image_load.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#import <CoreImage/CoreImage.h>
+#import <ImageIO/ImageIO.h>
+
+std::vector<uint8_t> LoadImageFromFile(const char* file_name, int* out_width, int* out_height,
+                                       int* out_channels) {
+  FILE* file_handle = fopen(file_name, "rb");
+  fseek(file_handle, 0, SEEK_END);
+  const size_t bytes_in_file = ftell(file_handle);
+  fseek(file_handle, 0, SEEK_SET);
+  std::vector<uint8_t> file_data(bytes_in_file);
+  fread(file_data.data(), 1, bytes_in_file, file_handle);
+  fclose(file_handle);
+
+  CFDataRef file_data_ref =
+      CFDataCreateWithBytesNoCopy(NULL, file_data.data(), bytes_in_file, kCFAllocatorNull);
+  CGDataProviderRef image_provider = CGDataProviderCreateWithCFData(file_data_ref);
+
+  const char* suffix = strrchr(file_name, '.');
+  if (!suffix || suffix == file_name) {
+    suffix = "";
+  }
+  CGImageRef image;
+  if (strcasecmp(suffix, ".png") == 0) {
+    image = CGImageCreateWithPNGDataProvider(image_provider, NULL, true, kCGRenderingIntentDefault);
+  } else if ((strcasecmp(suffix, ".jpg") == 0) || (strcasecmp(suffix, ".jpeg") == 0)) {
+    image =
+        CGImageCreateWithJPEGDataProvider(image_provider, NULL, true, kCGRenderingIntentDefault);
+  } else {
+    CFRelease(image_provider);
+    CFRelease(file_data_ref);
+    fprintf(stderr, "Unknown suffix for file '%s'\n", file_name);
+    *out_width = 0;
+    *out_height = 0;
+    *out_channels = 0;
+    return std::vector<uint8_t>();
+  }
+
+  const int width = (int)CGImageGetWidth(image);
+  const int height = (int)CGImageGetHeight(image);
+  const int channels = 4;
+  CGColorSpaceRef color_space = CGColorSpaceCreateDeviceRGB();
+  const int bytes_per_row = (width * channels);
+  const int bytes_in_image = (bytes_per_row * height);
+  std::vector<uint8_t> result(bytes_in_image);
+  const int bits_per_component = 8;
+
+  CGContextRef context =
+      CGBitmapContextCreate(result.data(), width, height, bits_per_component, bytes_per_row,
+                            color_space, kCGImageAlphaPremultipliedLast | kCGBitmapByteOrder32Big);
+  CGColorSpaceRelease(color_space);
+  CGContextDrawImage(context, CGRectMake(0, 0, width, height), image);
+  CGContextRelease(context);
+  CFRelease(image);
+  CFRelease(image_provider);
+  CFRelease(file_data_ref);
+
+  *out_width = width;
+  *out_height = height;
+  *out_channels = channels;
+  return result;
+}
diff --git a/tensorflow/contrib/lite/examples/ios/simple/main.mm b/tensorflow/contrib/lite/examples/ios/simple/main.mm
new file mode 100644
index 0000000000000000000000000000000000000000..05cb55ddd7a230593863e64b351f6aac31a1b4d7
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/main.mm
@@ -0,0 +1,22 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <UIKit/UIKit.h>
+
+int main(int argc, char *argv[]) {
+  @autoreleasepool {
+    NSString *delegateClassName = @"AppDelegate";
+    return UIApplicationMain(argc, argv, nil, delegateClassName);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj b/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..9277c230b8cce1b5673a50d32d7640d52e2e8f9d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj/project.pbxproj
@@ -0,0 +1,359 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 46;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		1C0D734B1ECCC460008C1DAB /* CoreGraphics.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */; };
+		1CA45FFF1ECCC356002FA6A4 /* UIKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */; };
+		594C14AE1FB8F9B500EE8BFE /* libtensorflow-lite.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */; };
+		594C14B11FB9037100EE8BFE /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = 594C14AF1FB9037100EE8BFE /* labels.txt */; };
+		594C14B21FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = 594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */; };
+		59A3D0011CF4E68100C4259F /* AppDelegate.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFF21CF4E68100C4259F /* AppDelegate.mm */; };
+		59A3D0031CF4E68100C4259F /* grace_hopper.jpg in Resources */ = {isa = PBXBuildFile; fileRef = 59A3CFF51CF4E68100C4259F /* grace_hopper.jpg */; };
+		59A3D0081CF4E68100C4259F /* ios_image_load.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFFB1CF4E68100C4259F /* ios_image_load.mm */; };
+		59A3D0091CF4E68100C4259F /* main.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFFC1CF4E68100C4259F /* main.mm */; };
+		59A3D00B1CF4E68100C4259F /* RunModelViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 59A3CFFF1CF4E68100C4259F /* RunModelViewController.mm */; };
+		59A3D00C1CF4E68100C4259F /* RunModelViewController.xib in Resources */ = {isa = PBXBuildFile; fileRef = 59A3D0001CF4E68100C4259F /* RunModelViewController.xib */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		1C0D73481ECCC41B008C1DAB /* CoreImage.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreImage.framework; path = System/Library/Frameworks/CoreImage.framework; sourceTree = SDKROOT; };
+		1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreGraphics.framework; path = System/Library/Frameworks/CoreGraphics.framework; sourceTree = SDKROOT; };
+		1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = UIKit.framework; path = System/Library/Frameworks/UIKit.framework; sourceTree = SDKROOT; };
+		5911579B1CF4011C00C31E3A /* tf_simple_example.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = tf_simple_example.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "libtensorflow-lite.a"; path = "../../../gen/lib/libtensorflow-lite.a"; sourceTree = "<group>"; };
+		594C14AF1FB9037100EE8BFE /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
+		594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
+		59A3CFF11CF4E68100C4259F /* AppDelegate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
+		59A3CFF21CF4E68100C4259F /* AppDelegate.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = AppDelegate.mm; sourceTree = "<group>"; };
+		59A3CFF51CF4E68100C4259F /* grace_hopper.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = grace_hopper.jpg; sourceTree = "<group>"; };
+		59A3CFFA1CF4E68100C4259F /* ios_image_load.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ios_image_load.h; sourceTree = "<group>"; };
+		59A3CFFB1CF4E68100C4259F /* ios_image_load.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ios_image_load.mm; sourceTree = "<group>"; };
+		59A3CFFC1CF4E68100C4259F /* main.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = main.mm; sourceTree = "<group>"; };
+		59A3CFFD1CF4E68100C4259F /* RunModel-Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "RunModel-Info.plist"; sourceTree = "<group>"; };
+		59A3CFFE1CF4E68100C4259F /* RunModelViewController.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RunModelViewController.h; sourceTree = "<group>"; };
+		59A3CFFF1CF4E68100C4259F /* RunModelViewController.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = RunModelViewController.mm; sourceTree = "<group>"; };
+		59A3D0001CF4E68100C4259F /* RunModelViewController.xib */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.xib; path = RunModelViewController.xib; sourceTree = "<group>"; };
+		73DBC33C5DD9A526EE6D1EF2 /* libPods-tf_simple_example.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-tf_simple_example.a"; sourceTree = BUILT_PRODUCTS_DIR; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		591157981CF4011C00C31E3A /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				594C14AE1FB8F9B500EE8BFE /* libtensorflow-lite.a in Frameworks */,
+				1C0D734B1ECCC460008C1DAB /* CoreGraphics.framework in Frameworks */,
+				1CA45FFF1ECCC356002FA6A4 /* UIKit.framework in Frameworks */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		24D7686C331131624F4454A0 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				594C14AD1FB8F9B500EE8BFE /* libtensorflow-lite.a */,
+				1C0D734A1ECCC460008C1DAB /* CoreGraphics.framework */,
+				1C0D73481ECCC41B008C1DAB /* CoreImage.framework */,
+				1CA45FFE1ECCC356002FA6A4 /* UIKit.framework */,
+				73DBC33C5DD9A526EE6D1EF2 /* libPods-tf_simple_example.a */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
+		591157921CF4011C00C31E3A = {
+			isa = PBXGroup;
+			children = (
+				59A3CFF11CF4E68100C4259F /* AppDelegate.h */,
+				59A3CFF21CF4E68100C4259F /* AppDelegate.mm */,
+				59A3CFF31CF4E68100C4259F /* data */,
+				59A3CFFA1CF4E68100C4259F /* ios_image_load.h */,
+				59A3CFFB1CF4E68100C4259F /* ios_image_load.mm */,
+				59A3CFFC1CF4E68100C4259F /* main.mm */,
+				59A3CFFD1CF4E68100C4259F /* RunModel-Info.plist */,
+				59A3CFFE1CF4E68100C4259F /* RunModelViewController.h */,
+				59A3CFFF1CF4E68100C4259F /* RunModelViewController.mm */,
+				59A3D0001CF4E68100C4259F /* RunModelViewController.xib */,
+				5911579C1CF4011C00C31E3A /* Products */,
+				24D7686C331131624F4454A0 /* Frameworks */,
+			);
+			sourceTree = "<group>";
+		};
+		5911579C1CF4011C00C31E3A /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				5911579B1CF4011C00C31E3A /* tf_simple_example.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		59A3CFF31CF4E68100C4259F /* data */ = {
+			isa = PBXGroup;
+			children = (
+				59A3CFF51CF4E68100C4259F /* grace_hopper.jpg */,
+				594C14AF1FB9037100EE8BFE /* labels.txt */,
+				594C14B01FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite */,
+			);
+			path = data;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		5911579A1CF4011C00C31E3A /* tf_simple_example */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tf_simple_example" */;
+			buildPhases = (
+				591157971CF4011C00C31E3A /* Sources */,
+				591157981CF4011C00C31E3A /* Frameworks */,
+				591157991CF4011C00C31E3A /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = tf_simple_example;
+			productName = tf_ios_makefile_example;
+			productReference = 5911579B1CF4011C00C31E3A /* tf_simple_example.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		591157931CF4011C00C31E3A /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastUpgradeCheck = 0830;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					5911579A1CF4011C00C31E3A = {
+						CreatedOnToolsVersion = 7.2;
+						DevelopmentTeam = EQHXZ8M8AV;
+						ProvisioningStyle = Manual;
+					};
+				};
+			};
+			buildConfigurationList = 591157961CF4011C00C31E3A /* Build configuration list for PBXProject "simple" */;
+			compatibilityVersion = "Xcode 3.2";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 591157921CF4011C00C31E3A;
+			productRefGroup = 5911579C1CF4011C00C31E3A /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				5911579A1CF4011C00C31E3A /* tf_simple_example */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		591157991CF4011C00C31E3A /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				59A3D00C1CF4E68100C4259F /* RunModelViewController.xib in Resources */,
+				594C14B11FB9037100EE8BFE /* labels.txt in Resources */,
+				59A3D0031CF4E68100C4259F /* grace_hopper.jpg in Resources */,
+				594C14B21FB9037100EE8BFE /* mobilenet_v1_1.0_224.tflite in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		591157971CF4011C00C31E3A /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				59A3D0091CF4E68100C4259F /* main.mm in Sources */,
+				59A3D0011CF4E68100C4259F /* AppDelegate.mm in Sources */,
+				59A3D00B1CF4E68100C4259F /* RunModelViewController.mm in Sources */,
+				59A3D0081CF4E68100C4259F /* ios_image_load.mm in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		591157B01CF4011D00C31E3A /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		591157B11CF4011D00C31E3A /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				"CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				TARGETED_DEVICE_FAMILY = "1,2";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		591157B31CF4011D00C31E3A /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				CLANG_DEBUG_INFORMATION_LEVEL = default;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				DEVELOPMENT_TEAM = EQHXZ8M8AV;
+				ENABLE_BITCODE = NO;
+				GCC_ENABLE_CPP_EXCEPTIONS = YES;
+				GCC_ENABLE_CPP_RTTI = YES;
+				HEADER_SEARCH_PATHS = (
+					"$(inherited)",
+					../../../../../../,
+					../../../downloads/flatbuffers/include/,
+					../../../downloads/eigen/,
+					../../../downloads/,
+				);
+				INFOPLIST_FILE = "$(SRCROOT)/RunModel-Info.plist";
+				IPHONEOS_DEPLOYMENT_TARGET = 9.2;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
+				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
+				OTHER_LDFLAGS = "$(inherited)";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.google.tflite-simple-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				PROVISIONING_PROFILE = "1072bd47-ff19-4e5f-8107-d912748f83f1";
+				PROVISIONING_PROFILE_SPECIFIER = "Google Development";
+				SEPARATE_STRIP = NO;
+			};
+			name = Debug;
+		};
+		591157B41CF4011D00C31E3A /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				CLANG_DEBUG_INFORMATION_LEVEL = default;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				DEVELOPMENT_TEAM = "";
+				ENABLE_BITCODE = NO;
+				GCC_ENABLE_CPP_EXCEPTIONS = YES;
+				GCC_ENABLE_CPP_RTTI = YES;
+				HEADER_SEARCH_PATHS = (
+					"$(inherited)",
+					../../../../../../,
+					../../../downloads/flatbuffers/include/,
+					../../../downloads/eigen/,
+					../../../downloads/,
+				);
+				INFOPLIST_FILE = "$(SRCROOT)/RunModel-Info.plist";
+				IPHONEOS_DEPLOYMENT_TARGET = 9.2;
+				LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
+				LIBRARY_SEARCH_PATHS = ../../../gen/lib/;
+				ONLY_ACTIVE_ARCH = YES;
+				OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)";
+				OTHER_LDFLAGS = "$(inherited)";
+				PRODUCT_BUNDLE_IDENTIFIER = "com.google.tflite-simple-example";
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				PROVISIONING_PROFILE_SPECIFIER = "";
+				SEPARATE_STRIP = NO;
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		591157961CF4011C00C31E3A /* Build configuration list for PBXProject "simple" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				591157B01CF4011D00C31E3A /* Debug */,
+				591157B11CF4011D00C31E3A /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		591157B21CF4011D00C31E3A /* Build configuration list for PBXNativeTarget "tf_simple_example" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				591157B31CF4011D00C31E3A /* Debug */,
+				591157B41CF4011D00C31E3A /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 591157931CF4011C00C31E3A /* Project object */;
+}
diff --git a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg b/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc83946647c6a923a8a0bd3a041b42e4febe6a31
Binary files /dev/null and b/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg differ
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe208e47d1ac10995881e55c8596ae14ff4242df
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -0,0 +1,359 @@
+# TensorFlow Lite APIs
+
+TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
+the API design reflects a preference for performance over ease of use.
+TensorFlow Lite is designed for fast inference on small devices so it should be
+no surprise that the APIs try to avoid unnecessary copies at the expense of
+convenience.  Similarly, consistency with TensorFlow APIs was not an explicit
+goal and some variance is to be expected.
+
+## C++
+
+In order to run the inference model in TensorFlow Lite, one has to load the
+model into a `FlatBufferModel` object which then can be executed by an
+`Interpreter`.  The `FlatBufferModel` needs to remain valid for the whole
+lifetime of the `Interpreter`, and a single `FlatBufferModel` can be
+simultaneously used by more than one `Interpreter`. In concrete terms, the
+`FlatBufferModel` object must be created before any `Interpreter` objects that
+use it, and must be kept around until they have all been destroyed.
+
+The simplest usage of TensorFlow Lite will look like this:
+
+```c++
+tflite::FlatBufferModel model(path_to_model);
+tflite::ops::builtin::BuiltinOpResolver resolver;
+std::unique_ptr<tflite::Interpreter> interpreter;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+// Resize input tensors, if desired.
+interpreter->AllocateTensors();
+float* input = interpreter->typed_input_tensor<float>(0);
+// Fill `input`.
+interpreter->Invoke();
+float* output = interpreter->type_output_tensor<float>(0);
+```
+### Data Alignment
+
+TensorFlow Lite data is usually aligned to 32-bit boundaries. It is recommended
+that all data provided to TensorFlow Lite be aligned that way.
+
+### Error Reporting
+
+In many places TensorFlow Lite returns status information through
+`TfLiteStatus` objects:
+
+```c++
+typedef enum {
+  kTfLiteOk = 0,
+  kTfLiteError = 1
+} TfLiteStatus;
+
+```
+
+Failures can be easily verified with:
+```c++
+if (status != kTfLiteOk) {
+  // ... error handling here ...
+}
+```
+
+In order to obtain detailed error information an ErrorReporter must be
+provided:
+
+```c++
+class ErrorReporter {
+  virtual int Report(const char* format, va_list args) = 0;
+};
+```
+
+The `DefaultErrorReporter` takes care of reporting to `stderr`.
+
+### Loading a Model
+
+The `FlatBufferModel` class encapsulates a model and can be built in a couple of
+slightly different ways depending on where the model is stored:
+
+```c++
+class FlatBufferModel {
+  // Build a model based on a file. Return a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromFile(
+      const char* filename,
+      ErrorReporter* error_reporter);
+
+  // Build a model based on a pre-loaded flatbuffer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Return a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
+      const char* buffer,
+      size_t buffer_size,
+      ErrorReporter* error_reporter);
+};
+```
+
+Note that if TensorFlow Lite detects the presence of Android's NNAPI it will
+automatically try to use shared memory to store the FlatBufferModel.
+
+### Running a Model
+
+Running a model involves a few simple steps:
+
+  * Build an `Interpreter` based on an existing `FlatBufferModel`
+  * Optionally resize input tensors if the predefined sizes are not desired.
+  * Set input tensor values
+  * Invoke inference
+  * Read output tensor values
+
+The important parts of public interface of the `Interpreter` are provided
+below.  It should be noted that:
+
+  * Tensors are represented by integers, in order to avoid string comparisons
+    (and any fixed dependency on string libraries).
+  * An interpreter must not be accessed from concurrent threads
+  * Memory allocation for input and output tensors must be triggered
+    by calling AllocateTensors() right after resizing tensors.
+
+```c++
+class Interpreter {
+  Interpreter(ErrorReporter* error_reporter);
+
+  // Read only access to list of inputs.
+  const std::vector<int>& inputs() const;
+
+  // Read only access to list of outputs.
+  const std::vector<int>& outputs() const;
+
+  // Change the dimensionality of a given tensor.
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims);
+
+  // Returns status of success or failure.
+  TfLiteStatus AllocateTensors();
+
+  // Return a pointer into the data of a given input tensor.
+  template <class T>
+  T* typed_input_tensor(int index) {
+    return typed_tensor<T>(inputs_[index]);
+  }
+
+  // Return a pointer into the data of a given output tensor.
+  template <class T>
+  T* typed_output_tensor(int index) {
+    return typed_tensor<T>(outputs_[index]);
+  }
+
+  // Execute the model, populating output tensors.
+  TfLiteStatus Invoke();
+};
+```
+
+### Writing Custom Operators
+
+All TensorFlow Lite operators (both custom and builtin) are defined using a
+simple pure-C interface that consists of four functions:
+
+```c++
+typedef struct {
+  void* (*init)(TfLiteContext* context, const char* buffer, size_t length);
+  void (*free)(TfLiteContext* context, void* buffer);
+  TfLiteStatus (*prepare)(TfLiteContext* context, TfLiteNode* node);
+  TfLiteStatus (*invoke)(TfLiteContext* context, TfLiteNode* node);
+} TfLiteRegistration;
+```
+
+Refer to `context.h` for details on `TfLiteContext` and `TfLiteNode`. The
+former provides error reporting facilities and access to global objects,
+including all the tensors. The latter allows implementations to access their
+inputs and outputs.
+
+When the interpreter loads a model, it calls init() once for each node in the
+graph. A given `init()` will be called more than once if the op is used
+multiple times in the graph. For custom ops a configuration buffer will be
+provided, containing a flexbuffer that maps parameter names to their values.
+The buffer is empty for builtin ops because the interpreter has already parsed
+the op parameters. Kernel implementation that require state should initialize
+it here and transfer ownership to the caller.  For each `init()` call, there
+will be a corresponding call to `free()`, allowing implementations to dispose
+of the buffer they might have allocated in `init()`.
+
+Whenever the input tensors are resized the interpreter will go through the
+graph notifying implementations of the change. This gives them the chance to
+resize their internal buffer, check validity of input shapes and types, and
+recalculate output shapes. This is all done through `prepare()` and
+implementation can access their state using `node->user_data`.
+
+Finally, each time inference runs the interpreter traverses the graph calling
+`invoke()`, and here too the state is available as `node->user_data`.
+
+Custom ops can be implemented in exactly the same way as builtin ops, by
+defined those four functions and a global registration function that usually
+looks like this:
+
+```c++
+namespace tflite {
+namespace ops {
+namespace custom {
+  TfLiteRegistration* Register_MY_CUSTOM_OP() {
+    static TfLiteRegistration r = {my_custom_op::Init,
+                                   my_custom_op::Free,
+                                   my_custom_op::Prepare,
+                                   my_custom_op::Eval};
+    return &r;
+  }
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+```
+
+Note that registration is not automatic and an explicit call to
+`Register_MY_CUSTOM_OP` should be made somewhere. While the standard
+`:builtin_ops` takes care of the registration of builtins, custom ops will have
+to be collected in separated custom libraries.
+
+### Customizing the kernel library
+
+Behind the scenes the interpreter will load a library of kernels which will be
+assigned to execute each of the operators in the model. While the default
+library only contains builtin kernels, it is possible to replace it with a
+custom library.
+
+The interpreter uses an `OpResolver` to translate operator codes and names into
+actual code:
+
+```c++
+class OpResolver {
+  virtual TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const = 0;
+  virtual TfLiteRegistration* FindOp(const char* op) const = 0;
+  virtual void AddOp(tflite::BuiltinOperator op, TfLiteRegistration* registration) = 0;
+  virtual void AddOp(const char* op, TfLiteRegistration* registration) = 0;
+};
+```
+
+The regular usage will require the developer to use the `BuiltinOpResolver` and
+write:
+
+```c++
+tflite::ops::builtin::BuiltinOpResolver resolver;
+```
+
+They can then optionally register custom ops:
+
+```c++
+resolver.AddOp("MY_CUSTOM_OP", Register_MY_CUSTOM_OP());
+```
+
+before the resolver is passed to the `InterpreterBuilder`.
+
+If the set of builtin ops is deemed to be too large, a new `OpResolver` could
+be code-generated  based on a given subset of ops, possibly only the ones
+contained in a given model. This is the equivalent of TensorFlow's selective
+registration (and a simple version of it is available in the `tools`
+directory).
+
+## Java
+
+TensorFlow Lite's Java API supports on-device inference and is provided as an
+Android Studio Library that allows loading models, feeding inputs, and
+retrieving inference outputs.
+
+The simplest usage of Tensorflow Lite Java API looks like this:
+
+```java
+try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+  interpreter.run(input, output);
+}
+```
+
+### Loading a Model
+
+The `Interpreter.java` class drives model inference with TensorFlow Lite. In
+most of the cases, this is the only class an app developer will need.
+
+#### Initializing an `Interpreter` With a Model File
+
+The `Interpreter` can be initialized with a model file using the constructor:
+
+```java
+public Interpreter(@NotNull File modelFile);
+```
+
+or with a `MappedByteBuffer`:
+
+```java
+public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer);
+```
+
+In both cases a valid TensorFlow Lite must be provided or an
+`IllegalArgumentException` with be thrown. If a `MappedByteBuffer` is used to
+initialize an Interpreter, it should remain unchanged for the whole lifetime of
+the `Interpreter`.
+
+### Running a Model
+
+#### Supported Data Types
+
+To use TensorFlow Lite, the data types of the input and output tensors must be
+one of the following primitive types:
+
+*   `float`
+*   `int`
+*   `long`
+*   `byte`
+
+If other data types, including boxed types like `Integer` and `Float`, are used,
+an `IllegalArgumentException` will be thrown.
+
+#### Inputs
+
+Each input should be an array, a multi-dimensional array, or a `ByteBuffer` of
+the supported primitive types.
+
+The use of `ByteBuffer` is preferred since it allows the `Interpreter` to avoid
+unnecessary copies. Each `ByteBuffer` needs to be a direct byte buffer, and its
+order must be `ByteOrder.nativeOrder()`. After it is used for a model inference,
+it must remain unchanged until the model inference is finished.
+
+#### Outputs
+
+Each output should be an array, or a multi-dimensional array of the supported
+primitive types.
+
+#### Running Model Inference
+
+If a model takes only one input and returns only one output, the following will
+trigger an inference run:
+
+```java
+interpreter.run(input, output);
+```
+
+For models with multiple inputs, or multiple outputs, use:
+
+```java
+interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
+```
+
+where each entry in `inputs` corresponds to an input tensor and
+`map_of_indices_to_outputs` maps indices of output tensors to the
+corresponding output data. In both cases the tensor indices should correspond to
+the values given to the `TensorFlow Lite Optimized Converter` when the model was
+created. Be aware that the order of tensors in `input` must match the order
+given to the `TensorFlow Lite Optimized Converter`.
+
+The Java API also provides convenient functions for app developers to get the
+index of any model input or output using a tensor name:
+
+```java
+public int getInputIndex(String tensorName);
+public int getOutputIndex(String tensorName);
+```
+
+If tensorName is not a valid name in model, an `IllegalArgumentException` will
+be thrown.
+
+### Releasing Resources After Use
+
+An `Interpreter` owns resources. To avoid memory leak, the resources must be
+released after use by:
+
+```java
+interpreter.close();
+```
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
new file mode 100644
index 0000000000000000000000000000000000000000..204a489a93519309bb09238f1b2c8bbd4f1f19e4
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -0,0 +1,91 @@
+# How to use custom operators
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. However, it
+does support the use of user-provided implementations (as known as custom
+implementations) if the model contains an operator that is not supported.
+
+Let’s walk through this via an example. Assume we are using the `Sin` operator
+and that we are building a very simple model for a function `y = sin(x +
+offset)`, where `offset` is trainable.
+
+The code to train the TensorFlow model will be something like:
+
+```python
+offset = tf.get_variable("offset", [1,], tf.float32)
+x = tf.placeholder(tf.float32, shape=(None,))
+y = tf.sin(x + offset)
+y_ = tf.placeholder(tf.float32, shape=(None,))
+loss = tf.reduce_sum(tf.square(y - y_))
+optimizer = tf.train.GradientDescentOptimizer(0.001)
+train = optimizer.minimize(loss)
+```
+
+If you convert this model to Tensorflow Lite format using the TensorFlow Lite
+Optimizing Converter with `--allow_custom_ops` argument, and run it with the
+default interpreter, the interpreter will raise the following error messages:
+
+```
+Didn't find custom op for name 'Sin'
+Registration failed.
+```
+
+All we need to do to use the op in TensorFlow Lite is define two functions
+(`Prepare` and `Eval`), and construct a `TfLiteRegistration`. This code would
+look something like this:
+
+```cpp
+TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) {
+  using namespace tflite;
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+
+  int num_dims = NumDimensions(input);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(num_dims);
+  for (int i=0; i<num_dims; ++i) {
+    output_size->data[i] = input->dims->data[i];
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
+  using namespace tflite;
+  TfLiteTensor* input = GetInput(context, node,0);
+  TfLiteTensor* output = GetOutput(context, node,0);
+
+  float* input_data = input->data.f;
+  float* output_data = output->data.f;
+
+  size_t count = 1;
+  int num_dims = NumDimensions(input);
+  for (int i = 0; i < num_dims; ++i) {
+    count *= input->dims->data[i];
+  }
+
+  for (size_t i=0; i<count; ++i) {
+    output_data[i] = sin(input_data[i]);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteRegistration* Register_SIN() {
+  static TfLiteRegistration r = {nullptr, nullptr, SinResize, SinEval};
+  return &r;
+}
+```
+
+When initializing the OpResolver, add the custom op into the resolver, this will
+register the operator with Tensorflow Lite so that TensorFlow Lite can use the
+new implementation.
+
+```cpp
+tflite::ops::builtin::BuiltinOpResolver builtins;
+builtins.AddCustom("Sin", Register_SIN());
+```
+
+Note that a similar process as above can be followed for supporting for a set of
+operations instead of a single operator.
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce8b37fbf9b0db5dee60784e85a3cbf0326fddb6
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -0,0 +1,67 @@
+# TensorFlow Lite for iOS
+
+## Building
+
+To create a universal iOS library for TensorFlow Lite, you need to build it
+using Xcode's command line tools on a MacOS machine. If you have not already,
+you will need to install Xcode 8 or later and the tools using `xcode-select`:
+
+```bash
+xcode-select --install
+```
+
+If this is a new install, you will need to run XCode once to agree to the
+license before continuing.
+
+(You will also need to have [Homebrew](http://brew.sh/) installed.)
+
+Then install
+[automake](https://en.wikipedia.org/wiki/Automake)/[libtool](https://en.wikipedia.org/wiki/GNU_Libtool):
+
+```bash
+brew install automake
+brew install libtool
+```
+
+Then you need to run a shell script to download the dependencies you need:
+
+```bash
+tensorflow/contrib/lite/download_dependencies.sh
+```
+
+This will fetch copies of libraries and data from the web and install them in
+`tensorflow/contrib/lite/downloads`.
+
+With all of the dependencies set up, you can now build the library for all five
+supported architectures on iOS:
+
+```bash
+tensorflow/contrib/lite/build_ios_universal_lib.sh
+```
+
+Under the hood this uses a makefile in `tensorflow/contrib/lite` to build the
+different versions of the library, followed by a call to `lipo` to bundle them
+into a universal file containing armv7, armv7s, arm64, i386, and x86_64
+architectures. The resulting library is in
+`tensorflow/contrib/lite/gen/lib/libtensorflow-lite.a`.
+
+## Using in your own application
+
+You'll need to update various settings in your app to link against TensorFlow
+Lite. You can view them in the example project at
+`tensorflow/contrib/lite/examples/ios/simple/simple.xcodeproj` but here's a full
+rundown:
+
+-   You'll need to add the library at
+    `tensorflow/contrib/lite/gen/lib/libtensorflow-lite.a` to your linking build
+    stage, and in Search Paths add `tensorflow/contrib/lite/gen/lib` to the
+    Library Search Paths setting.
+
+-   The _Header Search_ paths needs to contain:
+
+    -   the root folder of tensorflow,
+    -   `tensorflow/contrib/lite/downloads`
+    -   `tensorflow/contrib/lite/downloads/flatbuffers/include`
+
+-   C++11 support (or later) should be enabled by setting `C++ Language Dialect`
+    to `GNU++11` (or `GNU++14`), and `C++ Standard Library` to `libc++`.
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
new file mode 100644
index 0000000000000000000000000000000000000000..5b393140d61544e6d6e40d4b6ee1872b22cc84b2
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -0,0 +1,22 @@
+#List of Hosted Models
+
+*   [Inception V3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
+*   [Inception V3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+*   [Mobilenet 0.25 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_128_float_2017_11_08.zip)
+*   [Mobilenet 0.25 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_160_float_2017_11_08.zip)
+*   [Mobilenet 0.25 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_192_float_2017_11_08.zip)
+*   [Mobilenet 0.25 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_224_float_2017_11_08.zip)
+*   [Mobilenet 0.50 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_128_float_2017_11_08.zip)
+*   [Mobilenet 0.50 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_160_float_2017_11_08.zip)
+*   [Mobilenet 0.50 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_192_float_2017_11_08.zip)
+*   [Mobilenet 0.50 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_224_float_2017_11_08.zip)
+*   [Mobilenet 0.75 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_128_float_2017_11_08.zip)
+*   [Mobilenet 0.75 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_160_float_2017_11_08.zip)
+*   [Mobilenet 0.75 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_192_float_2017_11_08.zip)
+*   [Mobilenet 0.75 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_224_float_2017_11_08.zip)
+*   [Mobilenet 1.0 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_128_float_2017_11_08.zip)
+*   [Mobilenet 1.0 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_160_float_2017_11_08.zip)
+*   [Mobilenet 1.0 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_192_float_2017_11_08.zip)
+*   [Mobilenet 1.0 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_float_2017_11_08.zip)
+*   [Mobilenet 1.0 224 Quant](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
+*   [Smart Reply 1.0 Android ](https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip)
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
new file mode 100644
index 0000000000000000000000000000000000000000..9ade04eb8c696d7e0e39a8104e02b6e5feec95eb
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -0,0 +1,417 @@
+# TensorFlow Compatibility Guide
+
+TensorFlow Lite supports a number of TensorFlow operations used in common
+inference models. As they are processed by the TensorFlow Lite Optimizing
+Converter, those operations may be elided or fused, before the supported
+operations are mapped to their TensorFlow Lite counterparts.
+
+Since the set of TensorFlow Lite operations is smaller than TensorFlow's, not
+every model is convertible. Even for supported operations, very specific usage
+patterns are sometimes expected, for performance reasons. We expect to expand
+the set of supported operations in future TensorFlow Lite releases.
+
+The best way to understand how to build a TensorFlow model that can be used with
+TensorFlow Lite is to carefully consider how operations are converted and
+optimized, along with the limitations imposed by this process.
+
+## Supported Types
+
+Most TensorFlow Lite operations target both floating-point (float32) and
+quantized (uint8) inference, but usually there is little or no support for other
+types like tf.float16 and strings.
+
+Apart from using different version of the operations, the other difference
+between floating-point and quantized models lies in the way they are converted.
+Quantized conversion expect the models to be annotated with "fake quantization"
+nodes that record the dynamic range of the tensors. Without that information TF
+Lite is not able to accurately quantize a model, which means that proper
+quantized training is necessary before conversion.
+
+## Data Format and Broadcasting
+
+At the moment TensorFlow Lite supports only TensorFlow's "NHWC" format, and
+broadcasting in operations like tf.add and tf.mul is generally not supported.
+
+## Compatible Operations
+
+The following TensorFlow operations are usually mapped to their TensorFlow Lite
+counterparts:
+
+*   [tf.matmul](https://www.tensorflow.org/api_docs/python/tf/matmul) - *as long
+    as the second argument is constant and transposition is not used*
+*   [tf.nn.avg_pool](https://www.tensorflow.org/api_docs/python/tf/nn/avg_pool)
+*   [tf.nn.conv2d](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d) -
+    *as long as the filter is constant*
+*   [tf.nn.depthwise_conv2d](https://www.tensorflow.org/api_docs/python/tf/nn/depthwise_conv2d) -
+    *as long as the filter is constant and rate is [1,1]*
+*   [tf.nn.l2_normalize](https://www.tensorflow.org/api_docs/python/tf/nn/l2_normalize) -
+    *as long as normalization is done along the last dimension*
+*   [tf.nn.local_response_normalization](https://www.tensorflow.org/api_docs/python/tf/nn/local_response_normalization)
+*   [tf.nn.max_pool](https://www.tensorflow.org/api_docs/python/tf/nn/max_pool)
+*   [tf.nn.softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) -
+    *as long as tensors are 2D and axis is the last dimension*
+*   [tf.reshape](https://www.tensorflow.org/api_docs/python/tf/reshape)
+*   [tf.sigmoid](https://www.tensorflow.org/api_docs/python/tf/sigmoid)
+*   [tf.space_to_depth](https://www.tensorflow.org/api_docs/python/tf/space_to_depth)
+
+## Straightforward Conversions, Constant-Folding and Fusing
+
+A number of TensorFlow operations can be processed by TensorFlow Lite even
+though they have no direct equivalent. This is the case for operations that can
+be simply removed from the graph (tf.identity), replaced by tensors
+(tf.placeholder), or fused into more complex operations (tf.nn.bias_add). Even
+some supported operations may sometimes be removed through one of these
+processes.
+
+Here is a list of TensorFlow operations that are usually removed from the graph:
+
+*   [tf.add](https://www.tensorflow.org/api_docs/python/tf/add)
+*   [tf.check_numerics](https://www.tensorflow.org/api_docs/python/tf/check_numerics)
+*   [tf.constant](https://www.tensorflow.org/api_docs/python/tf/constant)
+*   [tf.div](https://www.tensorflow.org/api_docs/python/tf/div)
+*   [tf.divide](https://www.tensorflow.org/api_docs/python/tf/divide)
+*   [tf.fake_quant_with_min_max_args](https://www.tensorflow.org/api_docs/python/tf/fake_quant_with_min_max_args)
+*   [tf.fake_quant_with_min_max_vars](https://www.tensorflow.org/api_docs/python/tf/fake_quant_with_min_max_vars)
+*   [tf.greater](https://www.tensorflow.org/api_docs/python/tf/greater)
+*   [tf.greater_equal](https://www.tensorflow.org/api_docs/python/tf/greater_equal)
+*   [tf.identity](https://www.tensorflow.org/api_docs/python/tf/identity)
+*   [tf.less](https://www.tensorflow.org/api_docs/python/tf/less)
+*   [tf.less_equal](https://www.tensorflow.org/api_docs/python/tf/less_equal)
+*   [tf.maximum](https://www.tensorflow.org/api_docs/python/tf/maximum)
+*   [tf.minimum](https://www.tensorflow.org/api_docs/python/tf/minimum)
+*   [tf.multiply](https://www.tensorflow.org/api_docs/python/tf/multiply)
+*   [tf.no_op](https://www.tensorflow.org/api_docs/python/tf/no_op)
+*   [tf.placeholder](https://www.tensorflow.org/api_docs/python/tf/placeholder)
+*   [tf.placeholder_with_default](https://www.tensorflow.org/api_docs/python/tf/placeholder_with_default)
+*   [tf.realdiv](https://www.tensorflow.org/api_docs/python/tf/realdiv)
+*   [tf.reduce_max](https://www.tensorflow.org/api_docs/python/tf/reduce_max)
+*   [tf.reduce_min](https://www.tensorflow.org/api_docs/python/tf/reduce_min)
+*   [tf.reduce_sum](https://www.tensorflow.org/api_docs/python/tf/reduce_sum)
+*   [tf.rsqrt](https://www.tensorflow.org/api_docs/python/tf/rsqrt)
+*   [tf.shape](https://www.tensorflow.org/api_docs/python/tf/shape)
+*   [tf.sqrt](https://www.tensorflow.org/api_docs/python/tf/sqrt)
+*   [tf.square](https://www.tensorflow.org/api_docs/python/tf/square)
+*   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze)
+*   [tf.subtract](https://www.tensorflow.org/api_docs/python/tf/subtract)
+*   [tf.tile](https://www.tensorflow.org/api_docs/python/tf/tile)
+*   [tf.nn.batch_norm_with_global_normalization](https://www.tensorflow.org/api_docs/python/tf/nn/batch_norm_with_global_normalization)
+*   [tf.nn.bias_add](https://www.tensorflow.org/api_docs/python/tf/nn/bias_add)
+*   [tf.nn.fused_batch_norm](https://www.tensorflow.org/api_docs/python/tf/nn/fused_batch_norm)
+*   [tf.nn.relu](https://www.tensorflow.org/api_docs/python/tf/nn/relu)
+*   [tf.nn.relu6](https://www.tensorflow.org/api_docs/python/tf/nn/relu6)
+
+Note that many of those operations don't have TensorFlow Lite equivalents and
+the corresponding model will not be convertible if they can't be elided or
+fused.
+
+## Unsupported Operations
+
+TensorFlow operation not listed above are likely unsupported. Notably, the
+following common ops are not supported at the moment:
+
+*   [tf.batch_to_space_nd](https://www.tensorflow.org/api_docs/python/tf/batch_to_space_nd)
+*   [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space)
+*   [tf.floor](https://www.tensorflow.org/api_docs/python/tf/floor)
+*   [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather)
+*   [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear)
+*   [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad)
+*   [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean)
+*   [tf.slice](https://www.tensorflow.org/api_docs/python/tf/slice)
+*   [tf.space_to_batch_nd](https://www.tensorflow.org/api_docs/python/tf/space_to_batch_nd)
+*   [tf.split](https://www.tensorflow.org/api_docs/python/tf/split)
+*   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice)
+*   [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh)
+
+## TensorFlow Lite Operations
+
+The following TensorFlow Lite operations are fully supported and used in place
+of the TensorFlow operations listed above:
+
+**ADD**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: elementwise sum of the input tensors
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+}
+```
+
+**AVERAGE_POOL_2D**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor where each entry is the mean of the input values in the
+     corresponding window.
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+  padding: SAME|VALID
+  stride_w,stride_h: stride of the sliding window
+  filter_width,filter_height: size of the sliding window
+}
+```
+
+**CONCATENATION**
+
+```
+Inputs {
+  0-N: any number of tensors
+}
+Outputs {
+  0: concatenation of the input tensors along the given axis.
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+  axis: dimension along which the concatenation is performed
+}
+```
+
+**CONV_2D**
+
+```
+Inputs {
+  0: 4D tensor
+  1: filter
+  2: bias (optional)
+}
+Outputs {
+  0: result of 2D convolution of the input tensor
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+  padding: SAME|VALID
+  stride_w,stride_h: stride of the filter window
+}
+```
+
+**DEPTHWISE_CONV_2D**
+
+```
+Inputs {
+  0: 4D tensor
+  1: filter
+  2: bias (optional)
+}
+Outputs {
+  0: result of a depthwise-2D convolution of the input tensor
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+  padding: SAME|VALID
+  stride_w,stride_h: stride of the filter window
+  depth_multiplier: relation between the last dimension of the input and output
+    tensors
+}
+```
+
+**FULLY_CONNECTED**
+
+```
+Inputs {
+  0: 4D tensor
+  1: filter
+  2: bias (optional)
+}
+Outputs {
+  0: output of a fully (densely) connected layer, which connects all
+     elements in the input tensor with each element in this tensor.
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+}
+```
+
+**L2_NORMALIZATION**
+
+```
+Inputs {
+  0: input tensor
+}
+Outputs {
+  0: normalized tensor (along the last dimension)
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+}
+```
+
+**L2_POOL_2D**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to tf.sqrt(tf.nn.ave_pool(tf.square(input))
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+  padding: SAME|VALID
+  stride_w,stride_h: stride of the sliding window
+  filter_width,filter_height: size of the sliding window
+}
+```
+
+**LOCAL_RESPONSE_NORMALIZATION**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to tf.nn.local_response_normalization
+}
+Options {
+  radius
+  bias
+  alpha
+  beta
+}
+```
+
+**LOGISTIC**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to 1 / (1 + exp(-input))
+}
+```
+
+**MAX_POOL_2D**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor where each entry is the maximum of the input values in the
+     corresponding window.
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+  padding: SAME|VALID
+  stride_w,stride_h: stride of the sliding window
+  filter_width,filter_height: size of the sliding window
+}
+```
+
+**MUL**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: elementwise multiplication of the input tensors
+}
+Options {
+  fused_activation_function:  NONE|RELU|RELU6
+}
+```
+
+**RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(0, min(input, 1)
+}
+```
+
+**RELU1**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(-1, min(input, 6)
+}
+```
+
+**RELU6**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(0, min(input, 6)
+}
+```
+
+**RESHAPE**
+
+```
+Inputs {
+  0: a tensor
+  1: ignored
+}
+Outputs {
+  0: a tensor with the same elements as the input but with the new shape
+}
+Options {
+  new_shape
+}
+```
+
+**SOFTMAX**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to exp(input) / tf.reduce_sum(exp(input * beta), dim),
+     where dim is always the last dimension of the input tensor.
+}
+Options {
+  beta
+}
+```
+
+**SPACE_TO_DEPTH**
+
+```
+Inputs {
+  0: a 4D tensor
+}
+Outputs {
+  0: a tensor rearranged using block_size. See tf.space_to_depth for details.
+}
+Options {
+  block_size
+}
+```
+
+And these are TensorFlow Lite operations that are present but not ready for
+custom models yet:
+
+*   CALL
+*   CONCAT_EMBEDDINGS
+*   CUSTOM
+*   EMBEDDING_LOOKUP
+*   EMBEDDING_LOOKUP_SPARSE
+*   HASHTABLE_LOOKUP
+*   LSH_PROJECTION
+*   LSTM
+*   RESIZE_BILINEAR
+*   RNN
+*   SKIP_GRAM
+*   SVDF
+*   TANH
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..954e236ac8f0c8c59a9d20d62e66b3aa1164ecc1
--- /dev/null
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -0,0 +1,567 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include <cassert>
+#include <cstdarg>
+#include <cstdint>
+#include <cstring>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/nnapi_delegate.h"
+
+namespace {
+
+// Memory allocation tuning
+constexpr const int kDefaultArenaAlignment = 64;
+constexpr const int kDefaultTensorAlignment = 4;
+// std::vector preallocation tuning.
+constexpr const int kSlotsToReserve = 128;
+
+}  // namespace
+
+namespace tflite {
+
+Interpreter::Interpreter(ErrorReporter* error_reporter)
+    : arena_(kDefaultArenaAlignment),
+      persistent_arena_(kDefaultArenaAlignment),
+      error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  context_.impl_ = static_cast<void*>(this);
+  context_.ResizeTensor = ResizeTensor;
+  context_.ReportError = ReportError;
+  context_.AddTensors = AddTensors;
+  context_.tensors = nullptr;
+  context_.tensors_size = 0;
+  context_.gemm_context = nullptr;
+  // Reserve some space for the tensors to avoid excessive resizing.
+  tensors_.reserve(kSlotsToReserve);
+  nodes_and_registration_.reserve(kSlotsToReserve);
+  next_allocate_node_id_ = 0;
+  UseNNAPI(false);
+}
+
+Interpreter::~Interpreter() {
+  for (auto& nodeAndReg : nodes_and_registration_) {
+    TfLiteNode& node = nodeAndReg.first;
+    TfLiteIntArrayFree(node.inputs);
+    TfLiteIntArrayFree(node.outputs);
+    TfLiteIntArrayFree(node.temporaries);
+    if (node.builtin_data) free(node.builtin_data);
+    OpFree(nodeAndReg.second, node.user_data);
+    node.builtin_data = nullptr;
+  }
+
+  for (int i = 0; i < context_.tensors_size; i++) {
+    TfLiteTensorFree(&context_.tensors[i]);
+  }
+}
+
+TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
+  TF_LITE_ENSURE_OK(&context_,
+                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
+  inputs_ = std::move(inputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::SetOutputs(std::vector<int> outputs) {
+  TF_LITE_ENSURE_OK(
+      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
+  outputs_ = std::move(outputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
+                                             const int* indices, int length) {
+  // Making sure kOptionalTensor is not re-defined to something other than -1.
+  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
+
+  for (int i = 0; i < length; i++) {
+    int index = indices[i];
+    if (index < kOptionalTensor || index >= context_.tensors_size) {
+      ReportError(&context_, "Invalid tensor index %d in %s\n", index, label);
+      consistent_ = false;
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
+                                        int dims_size, size_t* bytes) {
+  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
+  // MultiplyWithoutOverflow.
+  TF_LITE_ENSURE(&context_, bytes != nullptr);
+  size_t count = 1;
+  for (int k = 0; k < dims_size; k++) count *= dims[k];
+  switch (type) {
+    case kTfLiteFloat32:
+      *bytes = sizeof(float) * count;
+      break;
+    case kTfLiteInt32:
+      *bytes = sizeof(int32_t) * count;
+      break;
+    case kTfLiteUInt8:
+      *bytes = sizeof(uint8_t) * count;
+      break;
+    case kTfLiteInt64:
+      *bytes = sizeof(int64_t) * count;
+      break;
+    default:
+      ReportError(&context_,
+                  "Only float32, int32, int64, uint8 supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::AllocateTensorsWhoseSizesAreKnown() {
+  if (!consistent_) {
+    ReportError(&context_, "AllocateTensors() called on inconsistent model.");
+    return kTfLiteError;
+  }
+  if (next_allocate_node_id_ == nodes_and_registration_.size() && invokable_) {
+    return kTfLiteOk;
+  }
+  allocs_and_refcounts_.resize(context_.tensors_size);
+
+  int new_next_allocate_node_id = next_allocate_node_id_;
+  invokable_ = false;
+
+  // Allocate graph input nodes.
+  if (next_allocate_node_id_ == 0) {
+    for (int i = 0; i < inputs_.size(); ++i) {
+      int tensor_index = inputs_[i];
+      if (tensor_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor& tensor = context_.tensors[tensor_index];
+      if (tensor.allocation_type == kTfLiteArenaRw) {
+        TF_LITE_ENSURE_OK(
+            &context_,
+            arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes,
+                            &allocs_and_refcounts_[tensor_index].alloc));
+      }
+    }
+    // Add 1 to output tensors, so they will not get overwritten.
+    for (int i = 0; i < outputs_.size(); ++i) {
+      allocs_and_refcounts_[outputs_[i]].count++;
+    }
+  }
+
+  // Count references to node input tensors, and resize node-referenced tensors
+  // until we encounter a node that has a dynamic output tensor.
+  for (int k = next_allocate_node_id_; k < nodes_and_registration_.size();
+       k++) {
+    new_next_allocate_node_id++;
+    TfLiteNode& node = nodes_and_registration_[k].first;
+    const TfLiteRegistration& registration = nodes_and_registration_[k].second;
+    if (OpPrepare(registration, &node) == kTfLiteError) {
+      return kTfLiteError;
+    }
+
+    TfLiteIntArray* node_inputs = node.inputs;
+    for (int i = 0; i < node_inputs->size; ++i) {
+      int tensor_index = node_inputs->data[i];
+      if (tensor_index != kOptionalTensor) {
+        allocs_and_refcounts_[node_inputs->data[i]].count++;
+      }
+    }
+
+    // Discontinue if the node has dynamic outputs.
+    bool has_unallocated_dynamic_tensor = false;
+    TfLiteIntArray* node_outputs = node.outputs;
+    for (int i = 0; i < node_outputs->size; ++i) {
+      TfLiteTensor& tensor = context_.tensors[node_outputs->data[i]];
+      if (tensor.allocation_type == kTfLiteDynamic) {
+        has_unallocated_dynamic_tensor = true;
+        break;
+      }
+    }
+    if (has_unallocated_dynamic_tensor) {
+      break;
+    }
+  }
+
+  // Allocate graph persistent outputs, e.g. RNN cell states, etc.
+  for (int k = next_allocate_node_id_; k < new_next_allocate_node_id; k++) {
+    TfLiteNode& node = nodes_and_registration_[k].first;
+
+    // Go through output tensors and allocate the persistent ones first.
+    TfLiteIntArray* node_outputs = node.outputs;
+    for (int i = 0; i < node_outputs->size; ++i) {
+      int tensor_index = node_outputs->data[i];
+      TfLiteTensor& tensor = context_.tensors[tensor_index];
+      if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
+        TF_LITE_ENSURE_OK(&context_,
+                          persistent_arena_.Allocate(
+                              &context_, kDefaultTensorAlignment, tensor.bytes,
+                              &allocs_and_refcounts_[tensor_index].alloc));
+      }
+    }
+  }
+
+  // Go through the graph in execution order.
+  for (int k = next_allocate_node_id_; k < new_next_allocate_node_id; k++) {
+    TfLiteNode& node = nodes_and_registration_[k].first;
+
+    // First allocate output tensors.
+    TfLiteIntArray* node_outputs = node.outputs;
+    for (int i = 0; i < node_outputs->size; ++i) {
+      int tensor_index = node_outputs->data[i];
+      TfLiteTensor& tensor = context_.tensors[tensor_index];
+      if (tensor.allocation_type == kTfLiteArenaRw) {
+        TF_LITE_ENSURE_OK(
+            &context_,
+            arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes,
+                            &allocs_and_refcounts_[tensor_index].alloc));
+      }
+    }
+    // Then the temporaries, in two passes. First allocate them all, them
+    // deallocate them.
+    TfLiteIntArray* node_temporaries = node.temporaries;
+    for (int i = 0; i < node_temporaries->size; ++i) {
+      int tensor_index = node_temporaries->data[i];
+      TfLiteTensor& tensor = context_.tensors[tensor_index];
+      if (tensor.allocation_type == kTfLiteArenaRw) {
+        TF_LITE_ENSURE_OK(
+            &context_,
+            arena_.Allocate(&context_, kDefaultTensorAlignment, tensor.bytes,
+                            &allocs_and_refcounts_[tensor_index].alloc));
+      }
+    }
+    for (int i = 0; i < node_temporaries->size; ++i) {
+      int tensor_index = node_temporaries->data[i];
+      TfLiteTensor& tensor = context_.tensors[tensor_index];
+      allocs_and_refcounts_[tensor_index].count--;
+      if (tensor.allocation_type == kTfLiteArenaRw &&
+          allocs_and_refcounts_[tensor_index].count == 0) {
+        TF_LITE_ENSURE_OK(
+            &context_,
+            arena_.Deallocate(&context_,
+                              allocs_and_refcounts_[tensor_index].alloc));
+      }
+    }
+
+    // Then process the node's inputs.
+    TfLiteIntArray* node_inputs = node.inputs;
+    for (int i = 0; i < node_inputs->size; ++i) {
+      int tensor_index = node_inputs->data[i];
+      if (tensor_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor& tensor = context_.tensors[tensor_index];
+
+      // Decrease reference count and deallocate if not needed anymore.
+      allocs_and_refcounts_[tensor_index].count--;
+      if (tensor.allocation_type == kTfLiteArenaRw &&
+          allocs_and_refcounts_[tensor_index].count == 0) {
+        TF_LITE_ENSURE_OK(
+            &context_,
+            arena_.Deallocate(&context_,
+                              allocs_and_refcounts_[tensor_index].alloc));
+      }
+    }
+  }
+
+  // Resize the buffer and commit the arena.
+  TF_LITE_ENSURE_OK(&context_, arena_.Commit(&context_));
+  TF_LITE_ENSURE_OK(&context_, persistent_arena_.Commit(&context_));
+
+  // Rewire the tensors to use the underlying arena buffer.
+  for (int i = 0; i < context_.tensors_size; ++i) {
+    TfLiteTensor& tensor = context_.tensors[i];
+    if (tensor.allocation_type == kTfLiteArenaRw) {
+      TF_LITE_ENSURE_OK(
+          &context_,
+          arena_.ResolveAlloc(&context_, allocs_and_refcounts_[i].alloc,
+                              &tensor.data.raw));
+    }
+    if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
+      TF_LITE_ENSURE_OK(
+          &context_,
+          persistent_arena_.ResolveAlloc(
+              &context_, allocs_and_refcounts_[i].alloc, &tensor.data.raw));
+    }
+  }
+
+  invokable_ = true;
+  next_allocate_node_id_ = new_next_allocate_node_id;
+  return kTfLiteOk;
+}
+
+namespace {
+TfLiteIntArray* convertVectorToTfLiteIntArray(const std::vector<int>& x) {
+  TfLiteIntArray* lite = TfLiteIntArrayCreate(x.size());
+  for (size_t i = 0; i < x.size(); i++) lite->data[i] = x[i];
+  return lite;
+}
+}  // namespace
+
+TfLiteStatus Interpreter::AllocateTensors() {
+  next_allocate_node_id_ = 0;
+  TF_LITE_ENSURE_OK(&context_, arena_.Clear());
+  TF_LITE_ENSURE_OK(&context_, persistent_arena_.Clear());
+  allocs_and_refcounts_.clear();
+  return AllocateTensorsWhoseSizesAreKnown();
+}
+
+TfLiteStatus Interpreter::AddNodeWithParameters(
+    const std::vector<int>& inputs, const std::vector<int>& outputs,
+    const char* init_data, size_t init_data_size, void* builtin_data,
+    const TfLiteRegistration* registration, int* node_index) {
+  invokable_ = false;
+
+  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
+                                                              free);
+
+  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("node inputs", inputs.data(),
+                                                  inputs.size()));
+  TF_LITE_ENSURE_OK(
+      &context_,
+      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
+
+  if (node_index) *node_index = nodes_and_registration_.size();
+  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
+  auto& node_and_reg = nodes_and_registration_.back();
+  TfLiteNode& node = node_and_reg.first;
+  if (node.inputs) TfLiteIntArrayFree(node.inputs);
+  if (node.outputs) TfLiteIntArrayFree(node.outputs);
+  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
+
+  // NOTE, here we are not using move semantics yet, since our internal
+  // representation isn't std::vector, but in the future we would like to avoid
+  // copies, so we want the interface to take r-value references now.
+  node.inputs = convertVectorToTfLiteIntArray(inputs);
+  node.outputs = convertVectorToTfLiteIntArray(outputs);
+  node.temporaries = TfLiteIntArrayCreate(0);
+  if (init_data) {
+    node.user_data = OpInit(*registration, init_data, init_data_size);
+  } else {
+    node.user_data =
+        OpInit(*registration,
+               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
+  }
+  node.builtin_data = builtin_data_deleter.release();
+  node_and_reg.second = *registration;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
+                                            const std::vector<int>& dims) {
+  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
+  // checks by casting to unsigned for efficiency. Profile before doing this.
+
+  TF_LITE_ENSURE(&context_,
+                 tensor_index < context_.tensors_size && tensor_index >= 0);
+  invokable_ = false;
+  TfLiteIntArray* dims_lite = convertVectorToTfLiteIntArray(dims);
+  return ResizeTensorImpl(&context_.tensors[tensor_index], dims_lite);
+}
+
+TfLiteStatus Interpreter::Invoke() {
+  if (!consistent_) {
+    ReportError(&context_, "Invoke called on model that is not consistent.");
+    return kTfLiteError;
+  }
+  if (!invokable_) {
+    ReportError(&context_, "Invoke called on model that is not ready.");
+    return kTfLiteError;
+  }
+
+  TfLiteStatus status = kTfLiteOk;
+  if (nnapi_delegate_) {
+    if (AllocateTensorsWhoseSizesAreKnown() == kTfLiteError) {
+      return kTfLiteError;
+    }
+    if (next_allocate_node_id_ == nodes_and_registration_.size()) {
+      TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
+      return kTfLiteOk;
+    } else {
+      // TODO(aselle): In the future, we would like this to be an
+      // automatic tflite CPU fallback.
+      ReportError(&context_,
+                  "NNAPI was requested, but dependent sized tensors "
+                  "being used.\n");
+      return kTfLiteError;
+    }
+  }
+
+  for (int i = 0; i < nodes_and_registration_.size(); i++) {
+    // Ensure we have allocated up to this node. The point of this is to
+    // allocate as much as possible before running any evaluation, but
+    // dynamic shapes can prevent this from being possible.
+    if (i >= next_allocate_node_id_) {
+      if (AllocateTensorsWhoseSizesAreKnown() == kTfLiteError) {
+        return kTfLiteError;
+      }
+    }
+    TfLiteNode& node = nodes_and_registration_[i].first;
+    const TfLiteRegistration& registration = nodes_and_registration_[i].second;
+    if (OpInvoke(registration, &node) == kTfLiteError) {
+      status = kTfLiteError;
+    }
+  }
+  return status;
+}
+
+TfLiteStatus Interpreter::ResizeTensor(TfLiteContext* context,
+                                       TfLiteTensor* tensor,
+                                       TfLiteIntArray* new_size) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function ResizeTensorImpl
+  // (this function is static).
+  return static_cast<Interpreter*>(context->impl_)
+      ->ResizeTensorImpl(tensor, new_size);
+}
+
+void Interpreter::ReportErrorImpl(const char* format, va_list args) {
+  error_reporter_->Report(format, args);
+}
+
+void Interpreter::ReportError(TfLiteContext* context, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Interpreter*>(context->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
+                                     int* first_new_tensor_index) {
+  int base_index = tensors_.size();
+  if (first_new_tensor_index) *first_new_tensor_index = base_index;
+  tensors_.resize(tensors_.size() + tensors_to_add);
+  for (int i = base_index; i < tensors_.size(); i++) {
+    memset(&tensors_[i], 0, sizeof(tensors_[i]));
+  }
+  context_.tensors = tensors_.data();
+  context_.tensors_size = tensors_.size();
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add,
+                                     int* first_new_tensor_index) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function AddTensors
+  // (this function is static).
+  return static_cast<Interpreter*>(context->impl_)
+      ->AddTensors(tensors_to_add, first_new_tensor_index);
+}
+
+TfLiteStatus Interpreter::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantizationParams quantization,
+    const char* buffer, size_t bytes, const Allocation* allocation) {
+  TF_LITE_ENSURE(&context_,
+                 tensor_index < context_.tensors_size && tensor_index >= 0);
+  // For most tensors we know exactly how much memory is necessary so we can
+  // ensure the buffer is large enough. However, we need to skip string tensors
+  // because their sizes change with the contents of the individual strings.
+  if (type != kTfLiteString) {
+    size_t required_bytes;
+    TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims.data(), dims.size(),
+                                               &required_bytes));
+    TF_LITE_ENSURE_EQ(&context_, required_bytes, bytes);
+  }
+  invokable_ = false;
+  TfLiteTensorReset(type, name, convertVectorToTfLiteIntArray(dims),
+                    quantization, const_cast<char*>(buffer), bytes,
+                    kTfLiteMmapRo, allocation, &context_.tensors[tensor_index]);
+  return kTfLiteOk;
+}
+
+// Set description of inputs/outputs/data/fptrs for node `node_index`.
+// This variant assumes an external buffer has been allocated of size
+// bytes. The lifetime of buffer must be ensured to be greater or equal
+// to Interpreter.
+TfLiteStatus Interpreter::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantizationParams quantization) {
+  invokable_ = false;
+  TF_LITE_ENSURE(&context_,
+                 tensor_index < context_.tensors_size && tensor_index >= 0);
+  size_t required_bytes = 0;
+  if (type != kTfLiteString) {
+    // These types will be allocated in our arena so we need to record how
+    // many bytes we will need based on the dimensions. String tensors are
+    // allocated dynamically and we can't know ahead of time how much space
+    // they will require.
+    TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims.data(), dims.size(),
+                                               &required_bytes));
+  }
+  TfLiteTensorReset(type, name, convertVectorToTfLiteIntArray(dims),
+                    quantization,
+                    /*buffer=*/nullptr, required_bytes,
+                    type == kTfLiteString ? kTfLiteDynamic : kTfLiteArenaRw,
+                    nullptr, &context_.tensors[tensor_index]);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
+                                           TfLiteIntArray* new_size) {
+  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
+  if (tensor->allocation_type == kTfLiteArenaRw ||
+      tensor->allocation_type == kTfLiteDynamic) {
+    if (tensor->type != kTfLiteString) {
+      size_t bytesRequired;
+      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
+                                          new_size->size, &bytesRequired);
+      if (status != kTfLiteOk) {
+        TfLiteIntArrayFree(new_size);
+        return kTfLiteError;
+      }
+      tensor->bytes = bytesRequired;
+    }
+    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
+    tensor->dims = new_size;
+
+    if (tensor->allocation_type != kTfLiteDynamic) {
+      tensor->data.raw = nullptr;
+    }
+  } else {
+    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
+    // of fixed size.
+    TfLiteIntArrayFree(new_size);
+    ReportError(&context_, "Attempting to resize a fixed-size tensor.");
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+void Interpreter::UseNNAPI(bool enable) {
+  // TODO(aselle): This is a workaround for finding if NNAPI exists.
+  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
+  // prefixed.
+  if (!NNAPIExists()) enable = false;
+  if (!enable) {
+    nnapi_delegate_.reset();
+  } else if (!nnapi_delegate_) {
+    nnapi_delegate_.reset(new NNAPIDelegate);
+  }
+}
+
+void Interpreter::SetNumThreads(int num_threads) {
+  // TODO(ahentz): this forces us to link against gemmlowp even when the ops
+  // don't use it. We should implement some dynamic mechanism for this sort of
+  // library-specific initialization.
+  tflite::gemm_support::SetMaxNumThreads(&context_, num_threads);
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..65c61e44bee48535f884a3afaddc691972f5e04b
--- /dev/null
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -0,0 +1,374 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Main abstraction controlling the tflite interpreter.
+// See context.h for the API for defining operations (TfLiteRegistration).
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/simple_memory_arena.h"
+
+namespace tflite {
+
+// Map statically from a c++ type to a TfLiteType (used below for safe casts).
+template <class T>
+constexpr TfLiteType typeToTfLiteType() {
+  return kTfLiteNoType;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<int>() {
+  return kTfLiteInt32;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<int64_t>() {
+  return kTfLiteInt64;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<float>() {
+  return kTfLiteFloat32;
+}
+template <>
+constexpr TfLiteType typeToTfLiteType<unsigned char>() {
+  return kTfLiteUInt8;
+}
+
+struct ArenaAllocRefCount {
+  ArenaAllocRefCount() : alloc(), count(0) {}
+
+  ArenaAlloc alloc;
+  int count;
+};
+
+// Forward declare since NNAPIDelegate uses Interpreter.
+class NNAPIDelegate;
+
+// An interpreter for a graph of nodes that input and output from tensors.
+// Each node of the graph processes a set of input tensors and produces a
+// set of output Tensors. All inputs/output tensors are referenced by index.
+//
+// Usage:
+//
+// -- Create basic model
+// Interpreter foo(2, 1);
+// foo.SetTensorParametersReadWrite(0, ...);
+// foo.SetTensorParametersReadOnly(1, ...);
+// foo.SetNodeParameters(0, ...)
+//
+// -- Resize input array to 1 length.
+// foo.ResizeInputTensor(0, 1);
+// foo.AllocateTensors();
+// -- Install array data
+// foo.typed_tensor<float>(0)[0] = 3;
+// foo.Invoke();
+// foo.typed_tensor<float>(0)[0] = 4;
+// foo.Invoke();
+// -- Resize input array and set data.
+// foo.ResizeInputTensor(0, 2);
+// foo.AllocateTensors();
+// foo.typed_tensor<float>(0)[0] = 4;
+// foo.typed_tensor<float>(0)[1] = 8;
+// foo.Invoke();
+//
+
+class Interpreter {
+ public:
+  // Instantiate an interpreter. All errors associated with reading and
+  // processing this model will be forwarded to the error_reporter object.
+  //
+  // Note, if error_reporter is nullptr, then a default StderrReporter is
+  // used.
+  explicit Interpreter(ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  ~Interpreter();
+
+  Interpreter(const Interpreter&) = delete;
+  Interpreter& operator=(const Interpreter&) = delete;
+
+  // Functions to build interpreter
+
+  // Provide a list of tensor indexes that are inputs to the model.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetInputs(std::vector<int> inputs);
+
+  // Provide a list of tensor indexes that are outputs to the model
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetOutputs(std::vector<int> outputs);
+
+  // Adds a node with the given parameters and returns the index of the new
+  // node in `node_index` (optionally). Interpreter will take ownership of
+  // `builtin_data` and destroy it with `delete`. Ownership of 'init_data'
+  // remains with the caller.
+  TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
+                                     const std::vector<int>& outputs,
+                                     const char* init_data,
+                                     size_t init_data_size, void* builtin_data,
+                                     const TfLiteRegistration* registration,
+                                     int* node_index = nullptr);
+
+  // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
+  // The value pointed to by `first_new_tensor_index` will be set to the
+  // index of the first new tensor if `first_new_tensor_index` is non-null.
+  TfLiteStatus AddTensors(int tensors_to_add,
+                          int* first_new_tensor_index = nullptr);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantizationParams quantization);
+
+  // Functions to access tensor data
+
+  // Read only access to list of inputs.
+  const std::vector<int>& inputs() const { return inputs_; }
+
+  // Return the name of a given input. The given index must be between 0 and
+  // inputs().size().
+  const char* GetInputName(int index) const {
+    return context_.tensors[inputs_[index]].name;
+  }
+
+  // Read only access to list of outputs.
+  const std::vector<int>& outputs() const { return outputs_; }
+
+  // Return the name of a given output. The given index must be between 0 and
+  // outputs().size().
+  const char* GetOutputName(int index) const {
+    return context_.tensors[outputs_[index]].name;
+  }
+
+  // Return the number of tensors in the model.
+  int tensors_size() const { return context_.tensors_size; }
+
+  // Return the number of ops in the model.
+  int nodes_size() const { return nodes_and_registration_.size(); }
+
+  // Get a tensor data structure.
+  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
+  // read/write access to structure
+  TfLiteTensor* tensor(int tensor_index) {
+    if (tensor_index >= context_.tensors_size || tensor_index < 0)
+        return nullptr;
+    return &context_.tensors[tensor_index];
+  }
+
+  // Get a pointer to an operation and registration data structure if in bounds.
+  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
+  // read/write access to structure
+  const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
+      int node_index) {
+    if (node_index >= nodes_and_registration_.size() || node_index < 0)
+      return nullptr;
+    return &nodes_and_registration_[node_index];
+  }
+
+  // Perform a checked cast to the appropriate tensor type.
+  template <class T>
+  T* typed_tensor(int tensor_index) {
+    if (TfLiteTensor* tensor_ptr = tensor(tensor_index)) {
+      if (tensor_ptr->type == typeToTfLiteType<T>()) {
+        return reinterpret_cast<T*>(tensor_ptr->data.raw);
+      }
+    }
+    return nullptr;
+  }
+
+  // Return a pointer into the data of a given input tensor. The given index
+  // must be between 0 and inputs().size().
+  template <class T>
+  T* typed_input_tensor(int index) {
+    return typed_tensor<T>(inputs_[index]);
+  }
+
+  // Return a pointer into the data of a given output tensor. The given index
+  // must be between 0 and outputs().size().
+  template <class T>
+  T* typed_output_tensor(int index) {
+    return typed_tensor<T>(outputs_[index]);
+  }
+
+  // Change the dimensionality of a given tensor. Note, this is only acceptable
+  // for tensor indices that are inputs.
+  // Returns status of failure or success.
+  // TODO(aselle): Consider implementing ArraySlice equivalent to make this
+  //   more adept at accepting data without an extra copy. Use absl::ArraySlice
+  //   if our partners determine that dependency is acceptable.
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims);
+
+  // Update allocations for all tensors. This will redim dependent tensors using
+  // the input tensor dimensionality as given. This is relatively expensive.
+  // If you know that your sizes are not changing, you need not call this.
+
+  // Returns status of success or failure.
+  TfLiteStatus AllocateTensors();
+
+  // Invoke the interpreter (run the whole graph in dependency order).
+  //
+  // NOTE: It is possible that the interpreter is not in a ready state
+  // to evaluate (i.e. if a ResizeTensor() has been performed without an
+  // AllocateTensors().
+  // Returns status of success or failure.
+  TfLiteStatus Invoke();
+
+  // Enable or disable the NN API (true to enable)
+  void UseNNAPI(bool enable);
+
+  // Set the number of threads available to the interpreter.
+  void SetNumThreads(int num_threads);
+
+ private:
+  // Give 'op_reg' a chance to initialize itself using the contents of
+  // 'buffer'.
+  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
+               size_t length) {
+    if (op_reg.init == nullptr) return nullptr;
+    return op_reg.init(&context_, buffer, length);
+  }
+
+  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
+  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
+    if (op_reg.free == nullptr) return;
+    if (buffer) {
+      op_reg.free(&context_, buffer);
+    }
+  }
+
+  // Prepare the given 'node' for execution.
+  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.prepare == nullptr) return kTfLiteOk;
+    return op_reg.prepare(&context_, node);
+  }
+
+  // Invoke the operator represented by 'node'.
+  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.invoke == nullptr) return kTfLiteError;
+    return op_reg.invoke(&context_, node);
+  }
+
+  // Allocate tensors whose sizes are known in order of nodes. Discontinue when
+  // we encounter a node that has a dynamic output tensor.
+  TfLiteStatus AllocateTensorsWhoseSizesAreKnown();
+
+  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
+  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
+  // `context_` whenever this std::vector is reallocated. Currently this
+  // only happens in `AddTensors()`.
+  std::vector<TfLiteTensor> tensors_;
+
+  // Check if an array of tensor indices are valid with respect to the Tensor
+  // array.
+  // NOTE: this changes consistent_ to be false if indices are out of bounds.
+  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
+                                  int length);
+
+  // Compute the number of bytes required to represent a tensor with dimensions
+  // specified by the array dims (of length dims_size). Returns the status code
+  // and bytes.
+  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, int dims_size,
+                             size_t* bytes);
+
+  // Request an tensor be resized implementation.
+  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
+
+  // Report a detailed error string (will be printed to stderr).
+  // TODO(aselle): allow user of class to provide alternative destinations.
+  void ReportErrorImpl(const char* format, va_list args);
+
+  // Entry point for C node plugin API to request an tensor be resized.
+  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
+                                   TfLiteIntArray* new_size);
+  // Entry point for C node plugin API to report an error.
+  static void ReportError(TfLiteContext* context, const char* format, ...);
+
+  // Entry point for C node plugin API to add new tensors.
+  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
+                                 int* first_new_tensor_index);
+
+  // A pure C data structure used to communicate with the pure C plugin
+  // interface. To avoid copying tensor metadata, this is also the definitive
+  // structure to store tensors.
+  TfLiteContext context_;
+
+  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
+  // function pointers to actual implementation.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
+      nodes_and_registration_;
+
+  // Raw memory buffer that is allocated for all temporary and graph outputs.
+  // that are declared kTfLiteArenaRw.
+  SimpleMemoryArena arena_;
+
+  // Raw memory buffer that is allocated for persistent tensors that are
+  // declared as kTfLiteArenaRwPersistent.
+  SimpleMemoryArena persistent_arena_;
+
+  // Stores allocation and reference counts of all tensors.
+  std::vector<ArenaAllocRefCount> allocs_and_refcounts_;
+
+  // Whether the model is consistent. That is to say if the inputs and outputs
+  // of every node and the global inputs and outputs are valid indexes into
+  // the tensor array.
+  bool consistent_ = true;
+
+  // Whether the model is safe to invoke (if any errors occurred this
+  // will be false).
+  bool invokable_ = false;
+
+  // Array of indices representing the tensors that are inputs to the
+  // interpreter.
+  std::vector<int> inputs_;
+
+  // Array of indices representing the tensors that are outputs to the
+  // interpreter.
+  std::vector<int> outputs_;
+
+  // The error reporter delegate that tflite will forward queries errors to.
+  ErrorReporter* error_reporter_;
+
+  // Next node to allocate output tensors.
+  // During Invoke(), Interpreter will allocate input tensors first, which are
+  // known to be fixed size. Then it will allocate outputs from nodes as many
+  // as possible. When there is a node that produces dynamic sized tensor.
+  // Intepreter will stop allocating tensors, set the value of next allocate
+  // node id, and execute the node to generate the output tensor before continue
+  // to allocate successors. This process repeats until all nodes are executed.
+  // NOTE: this relies on the order of nodes that is in topological order.
+  int next_allocate_node_id_;
+
+  // Whether to delegate to NN API
+  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
+};
+
+}  // namespace tflite
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_INTERPRETER_H_
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..edff2109430c6e1ec6c481619ed7772237a3301d
--- /dev/null
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -0,0 +1,526 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace {
+
+// Make an interpreter that has no tensors and no nodes
+TEST(BasicInterpreter, ZeroInterpreter) {
+  Interpreter interpreter;
+  interpreter.SetInputs({});
+  interpreter.SetOutputs({});
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+}
+
+// Test various error conditions.
+TEST(BasicInterpreter, InvokeInvalidModel) {
+  Interpreter interpreter;
+  ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+}
+
+// Test size accesser functions.
+TEST(BasicInterpreter, TestSizeFunctions) {
+  Interpreter interpreter;
+  int base_index;
+  ASSERT_EQ(interpreter.nodes_size(), 0);
+  ASSERT_EQ(interpreter.tensors_size(), 0);
+  ASSERT_EQ(interpreter.AddTensors(2, &base_index), kTfLiteOk);
+  ASSERT_EQ(interpreter.tensors_size(), 2);
+  ASSERT_EQ(base_index, 0);
+  ASSERT_EQ(interpreter.AddTensors(3, &base_index), kTfLiteOk);
+  ASSERT_EQ(interpreter.tensors_size(), 5);
+  ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk);
+  ASSERT_EQ(interpreter.tensors_size(), 6);
+  ASSERT_EQ(base_index, 2);
+}
+
+// Test if invalid indices make a model inconsistent (and conversely if
+// valid indices keep a model consistent).
+TEST(BasicInterpreter, InconsistentModel) {
+  // Invalid inputs
+  {
+    Interpreter interpreter;
+    ASSERT_NE(interpreter.SetInputs({5}), kTfLiteOk);
+    ASSERT_NE(interpreter.AllocateTensors(), kTfLiteOk);
+    ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
+    ASSERT_EQ(interpreter.inputs(), std::vector<int>());
+  }
+  // Invalid outputs
+  {
+    Interpreter interpreter;
+    ASSERT_NE(interpreter.SetOutputs({5}), kTfLiteOk);
+    ASSERT_NE(interpreter.AllocateTensors(), kTfLiteOk);
+    ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
+    ASSERT_EQ(interpreter.outputs(), std::vector<int>());
+  }
+  // Invalid node inputs
+  {
+    Interpreter interpreter;
+    TfLiteRegistration registration = {nullptr, nullptr, nullptr, nullptr};
+    ASSERT_NE(interpreter.AddNodeWithParameters({3}, {0}, nullptr, 0, nullptr,
+                                                &registration),
+              kTfLiteOk);
+    ASSERT_NE(interpreter.AllocateTensors(), kTfLiteOk);
+    ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
+  }
+  // Valid inputs and outputs and a node with valid inputs and outputs
+  {
+    Interpreter interpreter;
+    ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+    TfLiteRegistration registration = {nullptr, nullptr, nullptr, nullptr};
+    ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+    ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk);
+    ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                                &registration),
+              kTfLiteOk);
+  }
+}
+
+// Make an interpreter that has one tensor but no ops
+TEST(BasicInterpreter, CheckAllocate) {
+  struct {
+    TfLiteType type;
+    size_t size;
+  } cases[] = {
+      {kTfLiteFloat32, sizeof(float)},
+      {kTfLiteInt32, sizeof(int32_t)},
+      {kTfLiteUInt8, sizeof(uint8_t)},
+      {kTfLiteInt64, sizeof(int64_t)},
+  };
+
+  for (auto test : cases) {
+    Interpreter interpreter;
+    ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+    interpreter.SetInputs({0, 1});
+    interpreter.SetOutputs({});
+    TfLiteQuantizationParams quant;
+
+    interpreter.SetTensorParametersReadWrite(0, test.type, "", {3}, quant);
+    interpreter.SetTensorParametersReadWrite(1, test.type, "", {4}, quant);
+    ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+    ASSERT_EQ(interpreter.tensor(0)->bytes, 3 * test.size);
+    ASSERT_NE(interpreter.tensor(0)->data.raw, nullptr);
+    ASSERT_EQ(interpreter.tensor(1)->bytes, 4 * test.size);
+    ASSERT_NE(interpreter.tensor(1)->data.raw, nullptr);
+  }
+}
+
+TEST(BasicInterpreter, CheckResize) {
+  const float floats[] = {-3., -4.};
+  const int32_t int32s[] = {-3, -4};
+  const uint8_t uint8s[] = {3, 4};
+  const int64_t int64s[] = {6, -7};
+
+  struct {
+    TfLiteType type;
+    size_t size;
+    const char* array;
+  } cases[] = {
+      {kTfLiteFloat32, sizeof(float), reinterpret_cast<const char*>(floats)},
+      {kTfLiteInt32, sizeof(int32_t), reinterpret_cast<const char*>(int32s)},
+      {kTfLiteUInt8, sizeof(uint8_t), reinterpret_cast<const char*>(uint8s)},
+      {kTfLiteInt64, sizeof(int64_t), reinterpret_cast<const char*>(int64s)},
+  };
+
+  for (auto test : cases) {
+    Interpreter interpreter;
+
+    ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+    interpreter.SetInputs({0, 1});
+    interpreter.SetOutputs({});
+    TfLiteQuantizationParams quant;
+
+    ASSERT_EQ(
+        interpreter.SetTensorParametersReadWrite(0, test.type, "", {3}, quant),
+        kTfLiteOk);
+    ASSERT_EQ(interpreter.SetTensorParametersReadOnly(
+                  1, test.type, "", {2}, quant, test.array, 2 * test.size),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+    ASSERT_EQ(interpreter.ResizeInputTensor(0, {1, 2}), kTfLiteOk);
+    // Resizing a mmapped tensor is not allowed and should produce error.
+    ASSERT_NE(interpreter.ResizeInputTensor(1, {3}), kTfLiteOk);
+    // Set the tensor to be mmapped but with a buffer size that is insufficient
+    // to match the dimensionality.
+    ASSERT_NE(interpreter.SetTensorParametersReadOnly(
+                  1, test.type, "", {2}, quant, test.array, 1 * test.size),
+              kTfLiteOk);
+    // Allocating should work since we should have our last correct array
+    // values in place.
+    ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  }
+}
+
+TEST(BasicInterpreter, CheckAlignment) {
+  struct {
+    TfLiteType type;
+  } cases[] = {
+      {kTfLiteFloat32},
+      {kTfLiteInt32},
+      {kTfLiteUInt8},
+      {kTfLiteInt64},
+  };
+
+  for (auto test : cases) {
+    Interpreter interpreter;
+
+    ASSERT_EQ(interpreter.AddTensors(4), kTfLiteOk);
+
+    for (int i = 0; i < 4; i++) {
+      TfLiteQuantizationParams quant;
+      interpreter.SetTensorParametersReadWrite(i, test.type, "", {2 * i + 1},
+                                               quant);
+    }
+    interpreter.AllocateTensors();
+    for (int i = 0; i < 4; i++) {
+      const TfLiteTensor& tensor = *interpreter.tensor(i);
+      ASSERT_EQ(reinterpret_cast<intptr_t>(tensor.data.raw) % 4, 0);
+    }
+  }
+}
+
+TEST(BasicInterpreter, CheckArenaAllocation) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(10), kTfLiteOk);
+
+  TfLiteQuantizationParams quant;
+  TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+  std::vector<int> sizes{2048, 4096, 1023, 2047, 1021,
+                         2047, 1023, 2046, 1021, 2048};
+  for (int i = 0; i < sizes.size(); ++i) {
+    interpreter.SetTensorParametersReadWrite(i, kTfLiteUInt8, "", {sizes[i]},
+                                             quant);
+  }
+  interpreter.SetInputs({0, 1});
+  interpreter.SetOutputs({9, 4});
+  interpreter.AddNodeWithParameters({0, 1}, {2, 3}, nullptr, 0, nullptr, &reg);
+  interpreter.AddNodeWithParameters({2, 1}, {4, 5}, nullptr, 0, nullptr, &reg);
+  interpreter.AddNodeWithParameters({4, 3}, {6, 7}, nullptr, 0, nullptr, &reg);
+  interpreter.AddNodeWithParameters({6, 5}, {8}, nullptr, 0, nullptr, &reg);
+  interpreter.AddNodeWithParameters({8, 7}, {9}, nullptr, 0, nullptr, &reg);
+
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.tensor(0)->data.raw, interpreter.tensor(4)->data.raw);
+  ASSERT_EQ(interpreter.tensor(1)->data.raw, interpreter.tensor(7)->data.raw);
+
+  ASSERT_LT(interpreter.tensor(4)->data.raw, interpreter.tensor(1)->data.raw);
+  ASSERT_LT(interpreter.tensor(6)->data.raw, interpreter.tensor(1)->data.raw);
+  ASSERT_LT(interpreter.tensor(0)->data.raw, interpreter.tensor(1)->data.raw);
+
+  ASSERT_LT(interpreter.tensor(0)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(1)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(2)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(4)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(6)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(7)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(8)->data.raw, interpreter.tensor(3)->data.raw);
+  ASSERT_LT(interpreter.tensor(9)->data.raw, interpreter.tensor(3)->data.raw);
+
+  ASSERT_LT(interpreter.tensor(0)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(1)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(2)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(3)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(4)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(6)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(7)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(8)->data.raw, interpreter.tensor(5)->data.raw);
+  ASSERT_LT(interpreter.tensor(9)->data.raw, interpreter.tensor(5)->data.raw);
+}
+
+TEST(BasicInterpreter, BufferAccess) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(
+                0, kTfLiteFloat32, "", {3}, TfLiteQuantizationParams()),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  // Verify we get a valid pointer.r
+  ASSERT_NE(interpreter.typed_tensor<float>(0), nullptr);
+  // Verify incorrect pointer will not returned.
+  ASSERT_EQ(interpreter.typed_tensor<int>(0), nullptr);
+  // Verify that raw c interface ptr matches safe interface.
+  ASSERT_EQ(interpreter.typed_tensor<float>(0), interpreter.tensor(0)->data.f);
+}
+
+TEST(BasicInterpreter, NoOpInterpreter) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(1), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({0}), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(
+                0, kTfLiteFloat32, "", {3}, TfLiteQuantizationParams()),
+            kTfLiteOk);
+
+  ASSERT_EQ(interpreter.ResizeInputTensor(interpreter.inputs()[0], {1, 2, 3}),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+}
+
+TEST(BasicInterpreter, OneOpInterpreter) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({1}), kTfLiteOk);
+
+  TfLiteQuantizationParams quantized;
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(0, kTfLiteFloat32, "in1",
+                                                     {3}, quantized),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(1, kTfLiteFloat32, "out0",
+                                                     {3}, quantized),
+            kTfLiteOk);
+
+  ASSERT_EQ(interpreter.GetInputName(0), "in1");
+  ASSERT_EQ(interpreter.GetOutputName(0), "out0");
+
+  TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+  reg.init = [](TfLiteContext* context, const char*, size_t) -> void* {
+    auto* first_new_tensor = new int;
+    context->AddTensors(context, 2, first_new_tensor);
+    return first_new_tensor;
+  };
+  reg.free = [](TfLiteContext* context, void* buffer) {
+    delete reinterpret_cast<int*>(buffer);
+  };
+  reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    auto* first_new_tensor = reinterpret_cast<int*>(node->user_data);
+
+    TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]];
+
+    TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, tensor1, newSize));
+
+    TfLiteIntArrayFree(node->temporaries);
+    node->temporaries = TfLiteIntArrayCreate(2);
+    for (int i = 0; i < 2; ++i) {
+      node->temporaries->data[i] = *(first_new_tensor) + i;
+    }
+
+    auto setup_temporary = [&](int id) {
+      TfLiteTensor* tmp = &context->tensors[id];
+      tmp->type = kTfLiteFloat32;
+      tmp->allocation_type = kTfLiteArenaRw;
+      return context->ResizeTensor(context, tmp,
+                                   TfLiteIntArrayCopy(tensor0->dims));
+    };
+    TF_LITE_ENSURE_STATUS(setup_temporary(node->temporaries->data[0]));
+    TF_LITE_ENSURE_STATUS(setup_temporary(node->temporaries->data[1]));
+
+    return kTfLiteOk;
+  };
+  reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+
+    auto populate = [&](int id) {
+      TfLiteTensor* t = &context->tensors[id];
+      int num = a0->dims->data[0];
+      for (int i = 0; i < num; i++) {
+        t->data.f[i] = a0->data.f[i];
+      }
+    };
+
+    populate(node->outputs->data[0]);
+    populate(node->temporaries->data[0]);
+    populate(node->temporaries->data[1]);
+    return kTfLiteOk;
+  };
+  ASSERT_EQ(
+      interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr, &reg),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter.ResizeInputTensor(0, {3}), kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+}
+
+// Forcefully divides tensor allocation in three steps: one before invocation
+// and two more at invocation time. This happens because we use string tensors
+// and their sizes can't be determined until invocation time.
+TEST(BasicInterpreter, ThreeStepAllocate) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(5), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({4}), kTfLiteOk);
+
+  TfLiteQuantizationParams quantized;
+  char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'A', 'B', 'C'};
+  // Read only string tensor.
+  ASSERT_EQ(interpreter.SetTensorParametersReadOnly(0, kTfLiteString, "", {1},
+                                                    quantized, data, 15),
+            kTfLiteOk);
+  // Read-write string tensor.
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(1, kTfLiteString, "", {1},
+                                                     quantized),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(2, kTfLiteInt32, "", {1},
+                                                     quantized),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(3, kTfLiteString, "", {1},
+                                                     quantized),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(4, kTfLiteInt32, "", {1},
+                                                     quantized),
+            kTfLiteOk);
+
+  // String-in String-out node.
+  TfLiteRegistration reg_copy = {nullptr, nullptr, nullptr, nullptr};
+  reg_copy.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+    DynamicBuffer buf;
+    StringRef str_ref = GetString(a0, 0);
+    buf.AddString(str_ref);
+    buf.WriteToTensor(a1);
+    return kTfLiteOk;
+  };
+
+  // String-in Int-out node.
+  TfLiteRegistration reg_len = {nullptr, nullptr, nullptr, nullptr};
+  reg_len.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+    TfLiteIntArray* outputSize = TfLiteIntArrayCreate(1);
+    outputSize->data[0] = 1;
+    return context->ResizeTensor(context, output, outputSize);
+  };
+  reg_len.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+    a1->data.i32[0] = a0->bytes;
+    return kTfLiteOk;
+  };
+
+  ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr,
+                                              &reg_copy),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AddNodeWithParameters({1}, {2}, nullptr, 0, nullptr,
+                                              &reg_len),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AddNodeWithParameters({0}, {3}, nullptr, 0, nullptr,
+                                              &reg_copy),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AddNodeWithParameters({3}, {4}, nullptr, 0, nullptr,
+                                              &reg_len),
+            kTfLiteOk);
+
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+
+  ASSERT_EQ(interpreter.tensor(0)->bytes, 15);
+  ASSERT_NE(interpreter.tensor(0)->data.raw, nullptr);
+  ASSERT_EQ(interpreter.tensor(1)->bytes, 15);
+  ASSERT_NE(interpreter.tensor(1)->data.raw, nullptr);
+  ASSERT_EQ(interpreter.tensor(3)->bytes, 15);
+  ASSERT_NE(interpreter.tensor(4)->data.raw, nullptr);
+  ASSERT_EQ(interpreter.tensor(2)->bytes, 4);
+  ASSERT_EQ(interpreter.tensor(2)->data.i32[0], 15);
+  ASSERT_EQ(interpreter.tensor(4)->bytes, 4);
+  ASSERT_EQ(interpreter.tensor(4)->data.i32[0], 15);
+}
+
+TEST(BasicInterpreter, AllocateTwice) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetInputs({0}), kTfLiteOk);
+  ASSERT_EQ(interpreter.SetOutputs({1}), kTfLiteOk);
+
+  TfLiteQuantizationParams quantized;
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3},
+                                                     quantized),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3},
+                                                     quantized),
+            kTfLiteOk);
+
+  TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+  reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* tensor0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* tensor1 = &context->tensors[node->outputs->data[0]];
+    TfLiteIntArray* newSize = TfLiteIntArrayCopy(tensor0->dims);
+    return context->ResizeTensor(context, tensor1, newSize);
+  };
+  reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+    TfLiteTensor* a0 = &context->tensors[node->inputs->data[0]];
+    TfLiteTensor* a1 = &context->tensors[node->outputs->data[0]];
+    int num = a0->dims->data[0];
+    for (int i = 0; i < num; i++) {
+      a1->data.f[i] = a0->data.f[i];
+    }
+    return kTfLiteOk;
+  };
+  ASSERT_EQ(
+      interpreter.AddNodeWithParameters({0}, {1}, nullptr, 0, nullptr, &reg),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter.ResizeInputTensor(0, {3}), kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+  char* old_tensor0_ptr = interpreter.tensor(0)->data.raw;
+  char* old_tensor1_ptr = interpreter.tensor(1)->data.raw;
+
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(interpreter.Invoke(), kTfLiteOk);
+  ASSERT_EQ(old_tensor0_ptr, interpreter.tensor(0)->data.raw);
+  ASSERT_EQ(old_tensor1_ptr, interpreter.tensor(1)->data.raw);
+}
+
+struct TestErrorReporter : public ErrorReporter {
+  int Report(const char* format, va_list args) override {
+    char buffer[1024];
+    int size = vsnprintf(buffer, sizeof(buffer), format, args);
+    all_reports += buffer;
+    calls++;
+    return size;
+  }
+  int calls = 0;
+  std::string all_reports;
+};
+
+TEST(BasicInterpreter, TestNullErrorReporter) {
+  TestErrorReporter reporter;
+  Interpreter interpreter;
+}
+
+TEST(BasicInterpreter, TestCustomErrorReporter) {
+  TestErrorReporter reporter;
+  Interpreter interpreter(&reporter);
+  ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
+  ASSERT_EQ(reporter.all_reports, "Invoke called on model that is not ready.");
+  ASSERT_EQ(reporter.calls, 1);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+#ifdef OS_LINUX
+  FLAGS_logtostderr = true;
+#endif
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/ios_makefile.inc b/tensorflow/contrib/lite/ios_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..bcff7ed9889e95c13294b6cf0d0f4788991a04df
--- /dev/null
+++ b/tensorflow/contrib/lite/ios_makefile.inc
@@ -0,0 +1,47 @@
+# Settings for iOS.
+ifeq ($(TARGET), IOS)
+        BUILD_FOR_IOS_SIMULATOR := false
+	ifeq ($(IOS_ARCH), x86_64)
+	     	BUILD_FOR_IOS_SIMULATOR := true
+	endif
+	ifeq ($(IOS_ARCH), i386)
+	     	BUILD_FOR_IOS_SIMULATOR := true
+	endif
+	ifeq ($(BUILD_FOR_IOS_SIMULATOR), true)
+		IPHONEOS_PLATFORM := $(shell xcrun --sdk iphonesimulator \
+			--show-sdk-platform-path)
+		IPHONEOS_SYSROOT := $(shell xcrun --sdk iphonesimulator \
+			--show-sdk-path)
+	else
+		IPHONEOS_PLATFORM := $(shell xcrun --sdk iphoneos --show-sdk-platform-path)
+		IPHONEOS_SYSROOT := $(shell xcrun --sdk iphoneos --show-sdk-path)
+	endif
+	IOS_SDK_VERSION := $(shell xcrun --sdk iphoneos --show-sdk-version)
+	MIN_SDK_VERSION := 9.0
+	# Override IOS_ARCH with armv7, armv7s, arm64, i386, or x86_64.
+	IOS_ARCH := x86_64
+	CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
+		-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+		-fembed-bitcode \
+		-Wno-c++11-narrowing \
+		-mno-thumb \
+		-fno-exceptions \
+		-isysroot \
+		${IPHONEOS_SYSROOT} \
+		-arch $(IOS_ARCH) \
+		-O3
+	CCFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
+		-fembed-bitcode \
+		-mno-thumb \
+		-isysroot \
+		${IPHONEOS_SYSROOT} \
+		-arch $(IOS_ARCH) \
+		-O3
+	LDFLAGS := -fembed-bitcode \
+		-miphoneos-version-min=${MIN_SDK_VERSION} \
+		-arch $(IOS_ARCH)
+	OBJDIR := $(OBJDIR)ios_$(IOS_ARCH)/
+	LIBDIR := $(LIBDIR)ios_$(IOS_ARCH)/
+	BINDIR := $(BINDIR)ios_$(IOS_ARCH)/
+	DEPDIR := $(DEPDIR)ios_$(IOS_ARCH)/
+endif
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1de28eb52ddb458df0be0a8f9ef453f7caf68654
--- /dev/null
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -0,0 +1,150 @@
+# Description:
+# TensorFlow Lite Java API.
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/java:build_defs.bzl", "JAVACOPTS")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_jni_binary")
+
+android_library(
+    name = "tensorflowlite",
+    srcs = glob(
+        [
+            "src/main/java/org/tensorflow/lite/*.java",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tflite_runtime",
+        "@javax_validation",
+    ],
+)
+
+android_library(
+    name = "tensorflowlite_java",
+    srcs = glob(
+        [
+            "src/main/java/org/tensorflow/lite/*.java",
+        ],
+    ),
+    visibility = ["//visibility:public"],
+    deps = [
+        "@javax_validation",
+    ],
+)
+
+java_library(
+    name = "tensorflowlitelib",
+    srcs = glob(
+        [
+            "src/main/java/org/tensorflow/lite/*.java",
+        ],
+    ),
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "@javax_validation",
+    ],
+)
+
+java_test(
+    name = "TensorFlowLiteTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.TensorFlowLiteTest",
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "DataTypeTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.DataTypeTest",
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "NativeInterpreterWrapperTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java"],
+    data = [
+        "src/testdata/add.bin",
+        "src/testdata/int32.bin",
+        "src/testdata/int64.bin",
+        "src/testdata/invalid_model.bin",
+        "src/testdata/uint8.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+java_test(
+    name = "TensorTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/lite/TensorTest.java"],
+    data = [
+        "src/testdata/add.bin",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.lite.TensorTest",
+    deps = [
+        ":tensorflowlitelib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
+filegroup(
+    name = "libtensorflowlite_jni",
+    srcs = select({
+        "//conditions:default": [":libtensorflowlite_jni.so"],
+    }),
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "tflite_runtime",
+    srcs = ["libtensorflowlite_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+tflite_jni_binary(
+    name = "libtensorflowlite_jni.so",
+    deps = [
+        "//tensorflow/contrib/lite/java/src/main/native",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/java/demo/.gitignore b/tensorflow/contrib/lite/java/demo/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..39fb081a42a86ccf8f9cf99dbccc8bdf7c828bce
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/.gitignore
@@ -0,0 +1,9 @@
+*.iml
+.gradle
+/local.properties
+/.idea/workspace.xml
+/.idea/libraries
+.DS_Store
+/build
+/captures
+.externalNativeBuild
diff --git a/tensorflow/contrib/lite/java/demo/README.md b/tensorflow/contrib/lite/java/demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e818f728ef208d30b0eeb27ffd7e3fa0c7c1a2d
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/README.md
@@ -0,0 +1,46 @@
+# TF Lite Android App
+
+## Building from Source with Bazel
+
+1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel):
+
+  1. [Install Bazel and Android Prerequisites](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites).
+     It's easiest with Android Studio.
+
+      - You'll need at least SDK version 23.
+      - Make sure to install the latest version of Bazel. Some distributions
+        ship with Bazel 0.5.4, which is too old.
+      - Bazel requires Android Build Tools `26.0.1` or higher.
+      - **Bazel is incompatible with NDK revisions 15 and above,** with revision
+        16 being a compile-breaking change. [Download an older version manually
+        instead of using the SDK Manager.](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install-bazel-and-android-prerequisites)
+      - You also need to install the Android Support Repository, available
+        through Android Studio under `Android SDK Manager -> SDK Tools ->
+        Android Support Repository`.
+
+  2. [Edit your `WORKSPACE`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#edit-workspace)
+     to add SDK and NDK targets.
+
+     NOTE: As long as you have the SDK and NDK installed, the `./configure`
+     script will create these rules for you. Answer "Yes" when the script asks
+     to automatically configure the `./WORKSPACE`.
+
+      - Make sure the `api_level` in `WORKSPACE` is set to an SDK version that
+        you have installed.
+      - By default, Android Studio will install the SDK to `~/Android/Sdk` and
+        the NDK to `~/Android/Sdk/ndk-bundle` (but the NDK should be a manual
+        download until Bazel supports NDK 16. See bullet points under (1)).
+
+2. Build the app with Bazel. The demo needs C++11:
+
+  ```shell
+  bazel build -c opt --cxxopt='--std=c++11' \
+    //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+  ```
+
+3. Install the demo on a
+   [debug-enabled device](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#install):
+
+  ```shell
+  adb install bazel-bin/tensorflow/contrib/lite/java/demo/app/src/main/TfLiteCameraDemo.apk
+  ```
diff --git a/tensorflow/contrib/lite/java/demo/app/build.gradle b/tensorflow/contrib/lite/java/demo/app/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..b76eaad8bb91224805d16b3d6f7c3274c9feb90c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/build.gradle
@@ -0,0 +1,58 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "android.example.com.tflitecamerademo"
+        minSdkVersion 15
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+
+        // Remove this block.
+        jackOptions {
+            enabled true
+        }
+    }
+    lintOptions {
+        abortOnError false
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    aaptOptions {
+        noCompress "tflite"
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+}
+
+repositories {
+    maven {
+        url 'https://google.bintray.com/tensorflow'
+    }
+}
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'com.android.support:appcompat-v7:25.2.0'
+    compile 'com.android.support.constraint:constraint-layout:1.0.2'
+    compile 'com.android.support:design:25.2.0'
+    compile 'com.android.support:support-annotations:25.3.1'
+    compile 'com.android.support:support-v13:25.2.0'
+
+    compile 'org.tensorflow:tensorflow-lite:+'
+
+    testCompile 'junit:junit:4.12'
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml b/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ba63dce5d9a7192a2c3c4c5561333d39a3ecc024
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="com.example.android.tflitecamerademo">
+
+    <uses-permission android:name="android.permission.CAMERA" />
+
+    <uses-feature android:name="android.hardware.camera" />
+    <uses-feature android:name="android.hardware.camera.autofocus" />
+
+    <uses-sdk android:minSdkVersion="21" />
+
+    <application android:allowBackup="true"
+        android:label="@string/app_name"
+        android:icon="@drawable/ic_launcher"
+        android:theme="@style/MaterialTheme">
+
+        <activity android:name="com.example.android.tflitecamerademo.CameraActivity"
+                  android:label="@string/app_name">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..654fa9d6d2799fc3cafa3e0e042cb2a5746bf2c5
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
@@ -0,0 +1,41 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+android_binary(
+    name = "TfLiteCameraDemo",
+    srcs = glob(["java/**/*.java"]),
+    assets = [
+        "@tflite_mobilenet//:labels.txt",
+        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+    ],
+    assets_dir = "",
+    custom_package = "com.example.android.tflitecamerademo",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    # In some platforms we don't have an Android SDK/NDK and this target
+    # can't be built. We need to prevent the build system from trying to
+    # use the target in that case.
+    tags = ["manual"],
+    deps = [
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dd0cd6c98ff878e9c41875cab74c12191cadb173
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt b/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe811239d8e2989de19fecabb1ebb0c9dddac514
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/assets/labels.txt
@@ -0,0 +1,1001 @@
+background
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java
new file mode 100644
index 0000000000000000000000000000000000000000..f2045906599218871b51a752dcbb3eeb23b8f085
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/AutoFitTextureView.java
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.content.Context;
+import android.util.AttributeSet;
+import android.view.TextureView;
+
+/** A {@link TextureView} that can be adjusted to a specified aspect ratio. */
+public class AutoFitTextureView extends TextureView {
+
+  private int mRatioWidth = 0;
+  private int mRatioHeight = 0;
+
+  public AutoFitTextureView(Context context) {
+    this(context, null);
+  }
+
+  public AutoFitTextureView(Context context, AttributeSet attrs) {
+    this(context, attrs, 0);
+  }
+
+  public AutoFitTextureView(Context context, AttributeSet attrs, int defStyle) {
+    super(context, attrs, defStyle);
+  }
+
+  /**
+   * Sets the aspect ratio for this view. The size of the view will be measured based on the ratio
+   * calculated from the parameters. Note that the actual sizes of parameters don't matter, that is,
+   * calling setAspectRatio(2, 3) and setAspectRatio(4, 6) make the same result.
+   *
+   * @param width Relative horizontal size
+   * @param height Relative vertical size
+   */
+  public void setAspectRatio(int width, int height) {
+    if (width < 0 || height < 0) {
+      throw new IllegalArgumentException("Size cannot be negative.");
+    }
+    mRatioWidth = width;
+    mRatioHeight = height;
+    requestLayout();
+  }
+
+  @Override
+  protected void onMeasure(int widthMeasureSpec, int heightMeasureSpec) {
+    super.onMeasure(widthMeasureSpec, heightMeasureSpec);
+    int width = MeasureSpec.getSize(widthMeasureSpec);
+    int height = MeasureSpec.getSize(heightMeasureSpec);
+    if (0 == mRatioWidth || 0 == mRatioHeight) {
+      setMeasuredDimension(width, height);
+    } else {
+      if (width < height * mRatioWidth / mRatioHeight) {
+        setMeasuredDimension(width, width * mRatioHeight / mRatioWidth);
+      } else {
+        setMeasuredDimension(height * mRatioWidth / mRatioHeight, height);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..74737a8b883d23684220dd32bbd7a9e8ab4b2123
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -0,0 +1,708 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.app.Activity;
+import android.app.AlertDialog;
+import android.app.Dialog;
+import android.app.DialogFragment;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.DialogInterface;
+import android.content.pm.PackageInfo;
+import android.content.pm.PackageManager;
+import android.content.res.Configuration;
+import android.graphics.Bitmap;
+import android.graphics.ImageFormat;
+import android.graphics.Matrix;
+import android.graphics.Point;
+import android.graphics.RectF;
+import android.graphics.SurfaceTexture;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCaptureSession;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraDevice;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.CaptureRequest;
+import android.hardware.camera2.CaptureResult;
+import android.hardware.camera2.TotalCaptureResult;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.ImageReader;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.support.annotation.NonNull;
+import android.support.v13.app.FragmentCompat;
+import android.support.v4.content.ContextCompat;
+import android.util.Log;
+import android.util.Size;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import android.widget.TextView;
+import android.widget.Toast;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+
+/** Basic fragments for the Camera. */
+public class Camera2BasicFragment extends Fragment
+    implements FragmentCompat.OnRequestPermissionsResultCallback {
+
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "TfLiteCameraDemo";
+
+  private static final String FRAGMENT_DIALOG = "dialog";
+
+  private static final String HANDLE_THREAD_NAME = "CameraBackground";
+
+  private static final int PERMISSIONS_REQUEST_CODE = 1;
+
+  private final Object lock = new Object();
+  private boolean runClassifier = false;
+  private boolean checkedPermissions = false;
+  private TextView textView;
+  private ImageClassifier classifier;
+
+  /** Max preview width that is guaranteed by Camera2 API */
+  private static final int MAX_PREVIEW_WIDTH = 1920;
+
+  /** Max preview height that is guaranteed by Camera2 API */
+  private static final int MAX_PREVIEW_HEIGHT = 1080;
+
+  /**
+   * {@link TextureView.SurfaceTextureListener} handles several lifecycle events on a {@link
+   * TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+
+        @Override
+        public void onSurfaceTextureAvailable(SurfaceTexture texture, int width, int height) {
+          openCamera(width, height);
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(SurfaceTexture texture, int width, int height) {
+          configureTransform(width, height);
+        }
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(SurfaceTexture texture) {}
+      };
+
+  /** ID of the current {@link CameraDevice}. */
+  private String cameraId;
+
+  /** An {@link AutoFitTextureView} for camera preview. */
+  private AutoFitTextureView textureView;
+
+  /** A {@link CameraCaptureSession } for camera preview. */
+  private CameraCaptureSession captureSession;
+
+  /** A reference to the opened {@link CameraDevice}. */
+  private CameraDevice cameraDevice;
+
+  /** The {@link android.util.Size} of camera preview. */
+  private Size previewSize;
+
+  /** {@link CameraDevice.StateCallback} is called when {@link CameraDevice} changes its state. */
+  private final CameraDevice.StateCallback stateCallback =
+      new CameraDevice.StateCallback() {
+
+        @Override
+        public void onOpened(@NonNull CameraDevice currentCameraDevice) {
+          // This method is called when the camera is opened.  We start camera preview here.
+          cameraOpenCloseLock.release();
+          cameraDevice = currentCameraDevice;
+          createCameraPreviewSession();
+        }
+
+        @Override
+        public void onDisconnected(@NonNull CameraDevice currentCameraDevice) {
+          cameraOpenCloseLock.release();
+          currentCameraDevice.close();
+          cameraDevice = null;
+        }
+
+        @Override
+        public void onError(@NonNull CameraDevice currentCameraDevice, int error) {
+          cameraOpenCloseLock.release();
+          currentCameraDevice.close();
+          cameraDevice = null;
+          Activity activity = getActivity();
+          if (null != activity) {
+            activity.finish();
+          }
+        }
+      };
+
+  /** An additional thread for running tasks that shouldn't block the UI. */
+  private HandlerThread backgroundThread;
+
+  /** A {@link Handler} for running tasks in the background. */
+  private Handler backgroundHandler;
+
+  /** An {@link ImageReader} that handles image capture. */
+  private ImageReader imageReader;
+
+  /** {@link CaptureRequest.Builder} for the camera preview */
+  private CaptureRequest.Builder previewRequestBuilder;
+
+  /** {@link CaptureRequest} generated by {@link #previewRequestBuilder} */
+  private CaptureRequest previewRequest;
+
+  /** A {@link Semaphore} to prevent the app from exiting before closing the camera. */
+  private Semaphore cameraOpenCloseLock = new Semaphore(1);
+
+  /** A {@link CameraCaptureSession.CaptureCallback} that handles events related to capture. */
+  private CameraCaptureSession.CaptureCallback captureCallback =
+      new CameraCaptureSession.CaptureCallback() {
+
+        @Override
+        public void onCaptureProgressed(
+            @NonNull CameraCaptureSession session,
+            @NonNull CaptureRequest request,
+            @NonNull CaptureResult partialResult) {}
+
+        @Override
+        public void onCaptureCompleted(
+            @NonNull CameraCaptureSession session,
+            @NonNull CaptureRequest request,
+            @NonNull TotalCaptureResult result) {}
+      };
+
+  /**
+   * Shows a {@link Toast} on the UI thread for the classification results.
+   *
+   * @param text The message to show
+   */
+  private void showToast(final String text) {
+    final Activity activity = getActivity();
+    if (activity != null) {
+      activity.runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              textView.setText(text);
+            }
+          });
+    }
+  }
+
+  /**
+   * Resizes image.
+   *
+   * Attempting to use too large a preview size could  exceed the camera bus' bandwidth limitation,
+   * resulting in gorgeous previews but the storage of garbage capture data.
+   *
+   * Given {@code choices} of {@code Size}s supported by a camera, choose the smallest one that is
+   * at least as large as the respective texture view size, and that is at most as large as the
+   * respective max size, and whose aspect ratio matches with the specified value. If such size
+   * doesn't exist, choose the largest one that is at most as large as the respective max size, and
+   * whose aspect ratio matches with the specified value.
+   *
+   * @param choices The list of sizes that the camera supports for the intended output class
+   * @param textureViewWidth The width of the texture view relative to sensor coordinate
+   * @param textureViewHeight The height of the texture view relative to sensor coordinate
+   * @param maxWidth The maximum width that can be chosen
+   * @param maxHeight The maximum height that can be chosen
+   * @param aspectRatio The aspect ratio
+   * @return The optimal {@code Size}, or an arbitrary one if none were big enough
+   */
+  private static Size chooseOptimalSize(
+      Size[] choices,
+      int textureViewWidth,
+      int textureViewHeight,
+      int maxWidth,
+      int maxHeight,
+      Size aspectRatio) {
+
+    // Collect the supported resolutions that are at least as big as the preview Surface
+    List<Size> bigEnough = new ArrayList<>();
+    // Collect the supported resolutions that are smaller than the preview Surface
+    List<Size> notBigEnough = new ArrayList<>();
+    int w = aspectRatio.getWidth();
+    int h = aspectRatio.getHeight();
+    for (Size option : choices) {
+      if (option.getWidth() <= maxWidth
+          && option.getHeight() <= maxHeight
+          && option.getHeight() == option.getWidth() * h / w) {
+        if (option.getWidth() >= textureViewWidth && option.getHeight() >= textureViewHeight) {
+          bigEnough.add(option);
+        } else {
+          notBigEnough.add(option);
+        }
+      }
+    }
+
+    // Pick the smallest of those big enough. If there is no one big enough, pick the
+    // largest of those not big enough.
+    if (bigEnough.size() > 0) {
+      return Collections.min(bigEnough, new CompareSizesByArea());
+    } else if (notBigEnough.size() > 0) {
+      return Collections.max(notBigEnough, new CompareSizesByArea());
+    } else {
+      Log.e(TAG, "Couldn't find any suitable preview size");
+      return choices[0];
+    }
+  }
+
+  public static Camera2BasicFragment newInstance() {
+    return new Camera2BasicFragment();
+  }
+
+  /** Layout the preview and buttons. */
+  @Override
+  public View onCreateView(
+      LayoutInflater inflater, ViewGroup container, Bundle savedInstanceState) {
+    return inflater.inflate(R.layout.fragment_camera2_basic, container, false);
+  }
+
+  /** Connect the buttons to their event handler. */
+  @Override
+  public void onViewCreated(final View view, Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+    textView = (TextView) view.findViewById(R.id.text);
+  }
+
+  /** Load the model and labels. */
+  @Override
+  public void onActivityCreated(Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+    try {
+      classifier = new ImageClassifier(getActivity());
+    } catch (IOException e) {
+      Log.e(TAG, "Failed to initialize an image classifier.");
+    }
+    startBackgroundThread();
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+    if (textureView.isAvailable()) {
+      openCamera(textureView.getWidth(), textureView.getHeight());
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    closeCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  @Override
+  public void onDestroy() {
+    classifier.close();
+    super.onDestroy();
+  }
+
+  /**
+   * Sets up member variables related to camera.
+   *
+   * @param width The width of available size for camera preview
+   * @param height The height of available size for camera preview
+   */
+  private void setUpCameraOutputs(int width, int height) {
+    Activity activity = getActivity();
+    CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      for (String cameraId : manager.getCameraIdList()) {
+        CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+        // We don't use a front facing camera in this sample.
+        Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
+        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
+          continue;
+        }
+
+        StreamConfigurationMap map =
+            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+        if (map == null) {
+          continue;
+        }
+
+        // // For still image captures, we use the largest available size.
+        Size largest =
+            Collections.max(
+                Arrays.asList(map.getOutputSizes(ImageFormat.JPEG)), new CompareSizesByArea());
+        imageReader =
+            ImageReader.newInstance(
+                largest.getWidth(), largest.getHeight(), ImageFormat.JPEG, /*maxImages*/ 2);
+
+        // Find out if we need to swap dimension to get the preview size relative to sensor
+        // coordinate.
+        int displayRotation = activity.getWindowManager().getDefaultDisplay().getRotation();
+        // noinspection ConstantConditions
+        /* Orientation of the camera sensor */
+        int sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
+        boolean swappedDimensions = false;
+        switch (displayRotation) {
+          case Surface.ROTATION_0:
+          case Surface.ROTATION_180:
+            if (sensorOrientation == 90 || sensorOrientation == 270) {
+              swappedDimensions = true;
+            }
+            break;
+          case Surface.ROTATION_90:
+          case Surface.ROTATION_270:
+            if (sensorOrientation == 0 || sensorOrientation == 180) {
+              swappedDimensions = true;
+            }
+            break;
+          default:
+            Log.e(TAG, "Display rotation is invalid: " + displayRotation);
+        }
+
+        Point displaySize = new Point();
+        activity.getWindowManager().getDefaultDisplay().getSize(displaySize);
+        int rotatedPreviewWidth = width;
+        int rotatedPreviewHeight = height;
+        int maxPreviewWidth = displaySize.x;
+        int maxPreviewHeight = displaySize.y;
+
+        if (swappedDimensions) {
+          rotatedPreviewWidth = height;
+          rotatedPreviewHeight = width;
+          maxPreviewWidth = displaySize.y;
+          maxPreviewHeight = displaySize.x;
+        }
+
+        if (maxPreviewWidth > MAX_PREVIEW_WIDTH) {
+          maxPreviewWidth = MAX_PREVIEW_WIDTH;
+        }
+
+        if (maxPreviewHeight > MAX_PREVIEW_HEIGHT) {
+          maxPreviewHeight = MAX_PREVIEW_HEIGHT;
+        }
+
+        previewSize =
+            chooseOptimalSize(
+                map.getOutputSizes(SurfaceTexture.class),
+                rotatedPreviewWidth,
+                rotatedPreviewHeight,
+                maxPreviewWidth,
+                maxPreviewHeight,
+                largest);
+
+        // We fit the aspect ratio of TextureView to the size of preview we picked.
+        int orientation = getResources().getConfiguration().orientation;
+        if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
+          textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
+        } else {
+          textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
+        }
+
+        this.cameraId = cameraId;
+        return;
+      }
+    } catch (CameraAccessException e) {
+      e.printStackTrace();
+    } catch (NullPointerException e) {
+      // Currently an NPE is thrown when the Camera2API is used but not supported on the
+      // device this code runs.
+      ErrorDialog.newInstance(getString(R.string.camera_error))
+          .show(getChildFragmentManager(), FRAGMENT_DIALOG);
+    }
+  }
+
+  private String[] getRequiredPermissions() {
+    Activity activity = getActivity();
+    try {
+      PackageInfo info =
+          activity
+              .getPackageManager()
+              .getPackageInfo(activity.getPackageName(), PackageManager.GET_PERMISSIONS);
+      String[] ps = info.requestedPermissions;
+      if (ps != null && ps.length > 0) {
+        return ps;
+      } else {
+        return new String[0];
+      }
+    } catch (Exception e) {
+      return new String[0];
+    }
+  }
+
+  /** Opens the camera specified by {@link Camera2BasicFragment#cameraId}. */
+  private void openCamera(int width, int height) {
+    if (!checkedPermissions && !allPermissionsGranted()) {
+      FragmentCompat.requestPermissions(this, getRequiredPermissions(), PERMISSIONS_REQUEST_CODE);
+      return;
+    } else {
+      checkedPermissions = true;
+    }
+    setUpCameraOutputs(width, height);
+    configureTransform(width, height);
+    Activity activity = getActivity();
+    CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      if (!cameraOpenCloseLock.tryAcquire(2500, TimeUnit.MILLISECONDS)) {
+        throw new RuntimeException("Time out waiting to lock camera opening.");
+      }
+      manager.openCamera(cameraId, stateCallback, backgroundHandler);
+    } catch (CameraAccessException e) {
+      e.printStackTrace();
+    } catch (InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
+    }
+  }
+
+  private boolean allPermissionsGranted() {
+    for (String permission : getRequiredPermissions()) {
+      if (ContextCompat.checkSelfPermission(getActivity(), permission)
+          != PackageManager.PERMISSION_GRANTED) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      int requestCode, @NonNull String[] permissions, @NonNull int[] grantResults) {
+    super.onRequestPermissionsResult(requestCode, permissions, grantResults);
+  }
+
+  /** Closes the current {@link CameraDevice}. */
+  private void closeCamera() {
+    try {
+      cameraOpenCloseLock.acquire();
+      if (null != captureSession) {
+        captureSession.close();
+        captureSession = null;
+      }
+      if (null != cameraDevice) {
+        cameraDevice.close();
+        cameraDevice = null;
+      }
+      if (null != imageReader) {
+        imageReader.close();
+        imageReader = null;
+      }
+    } catch (InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera closing.", e);
+    } finally {
+      cameraOpenCloseLock.release();
+    }
+  }
+
+  /** Starts a background thread and its {@link Handler}. */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread(HANDLE_THREAD_NAME);
+    backgroundThread.start();
+    backgroundHandler = new Handler(backgroundThread.getLooper());
+    synchronized (lock) {
+      runClassifier = true;
+    }
+    backgroundHandler.post(periodicClassify);
+  }
+
+  /** Stops the background thread and its {@link Handler}. */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+      backgroundHandler = null;
+      synchronized (lock) {
+        runClassifier = false;
+      }
+    } catch (InterruptedException e) {
+      e.printStackTrace();
+    }
+  }
+
+  /** Takes photos and classify them periodically. */
+  private Runnable periodicClassify =
+      new Runnable() {
+        @Override
+        public void run() {
+          synchronized (lock) {
+            if (runClassifier) {
+              classifyFrame();
+            }
+          }
+          backgroundHandler.post(periodicClassify);
+        }
+      };
+
+  /** Creates a new {@link CameraCaptureSession} for camera preview. */
+  private void createCameraPreviewSession() {
+    try {
+      SurfaceTexture texture = textureView.getSurfaceTexture();
+      assert texture != null;
+
+      // We configure the size of default buffer to be the size of camera preview we want.
+      texture.setDefaultBufferSize(previewSize.getWidth(), previewSize.getHeight());
+
+      // This is the output Surface we need to start preview.
+      Surface surface = new Surface(texture);
+
+      // We set up a CaptureRequest.Builder with the output Surface.
+      previewRequestBuilder = cameraDevice.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+      previewRequestBuilder.addTarget(surface);
+
+      // Here, we create a CameraCaptureSession for camera preview.
+      cameraDevice.createCaptureSession(
+          Arrays.asList(surface),
+          new CameraCaptureSession.StateCallback() {
+
+            @Override
+            public void onConfigured(@NonNull CameraCaptureSession cameraCaptureSession) {
+              // The camera is already closed
+              if (null == cameraDevice) {
+                return;
+              }
+
+              // When the session is ready, we start displaying the preview.
+              captureSession = cameraCaptureSession;
+              try {
+                // Auto focus should be continuous for camera preview.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AF_MODE,
+                    CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+
+                // Finally, we start displaying the camera preview.
+                previewRequest = previewRequestBuilder.build();
+                captureSession.setRepeatingRequest(
+                    previewRequest, captureCallback, backgroundHandler);
+              } catch (CameraAccessException e) {
+                e.printStackTrace();
+              }
+            }
+
+            @Override
+            public void onConfigureFailed(@NonNull CameraCaptureSession cameraCaptureSession) {
+              showToast("Failed");
+            }
+          },
+          null);
+    } catch (CameraAccessException e) {
+      e.printStackTrace();
+    }
+  }
+
+  /**
+   * Configures the necessary {@link android.graphics.Matrix} transformation to `textureView`. This
+   * method should be called after the camera preview size is determined in setUpCameraOutputs and
+   * also the size of `textureView` is fixed.
+   *
+   * @param viewWidth The width of `textureView`
+   * @param viewHeight The height of `textureView`
+   */
+  private void configureTransform(int viewWidth, int viewHeight) {
+    Activity activity = getActivity();
+    if (null == textureView || null == previewSize || null == activity) {
+      return;
+    }
+    int rotation = activity.getWindowManager().getDefaultDisplay().getRotation();
+    Matrix matrix = new Matrix();
+    RectF viewRect = new RectF(0, 0, viewWidth, viewHeight);
+    RectF bufferRect = new RectF(0, 0, previewSize.getHeight(), previewSize.getWidth());
+    float centerX = viewRect.centerX();
+    float centerY = viewRect.centerY();
+    if (Surface.ROTATION_90 == rotation || Surface.ROTATION_270 == rotation) {
+      bufferRect.offset(centerX - bufferRect.centerX(), centerY - bufferRect.centerY());
+      matrix.setRectToRect(viewRect, bufferRect, Matrix.ScaleToFit.FILL);
+      float scale =
+          Math.max(
+              (float) viewHeight / previewSize.getHeight(),
+              (float) viewWidth / previewSize.getWidth());
+      matrix.postScale(scale, scale, centerX, centerY);
+      matrix.postRotate(90 * (rotation - 2), centerX, centerY);
+    } else if (Surface.ROTATION_180 == rotation) {
+      matrix.postRotate(180, centerX, centerY);
+    }
+    textureView.setTransform(matrix);
+  }
+
+  /** Classifies a frame from the preview stream. */
+  private void classifyFrame() {
+    if (classifier == null || getActivity() == null || cameraDevice == null) {
+      showToast("Uninitialized Classifier or invalid context.");
+      return;
+    }
+    Bitmap bitmap =
+        textureView.getBitmap(ImageClassifier.DIM_IMG_SIZE_X, ImageClassifier.DIM_IMG_SIZE_Y);
+    String textToShow = classifier.classifyFrame(bitmap);
+    bitmap.recycle();
+    showToast(textToShow);
+  }
+
+  /** Compares two {@code Size}s based on their areas. */
+  private static class CompareSizesByArea implements Comparator<Size> {
+
+    @Override
+    public int compare(Size lhs, Size rhs) {
+      // We cast here to ensure the multiplications won't overflow
+      return Long.signum(
+          (long) lhs.getWidth() * lhs.getHeight() - (long) rhs.getWidth() * rhs.getHeight());
+    }
+  }
+
+  /** Shows an error message dialog. */
+  public static class ErrorDialog extends DialogFragment {
+
+    private static final String ARG_MESSAGE = "message";
+
+    public static ErrorDialog newInstance(String message) {
+      ErrorDialog dialog = new ErrorDialog();
+      Bundle args = new Bundle();
+      args.putString(ARG_MESSAGE, message);
+      dialog.setArguments(args);
+      return dialog;
+    }
+
+    @Override
+    public Dialog onCreateDialog(Bundle savedInstanceState) {
+      final Activity activity = getActivity();
+      return new AlertDialog.Builder(activity)
+          .setMessage(getArguments().getString(ARG_MESSAGE))
+          .setPositiveButton(
+              android.R.string.ok,
+              new DialogInterface.OnClickListener() {
+                @Override
+                public void onClick(DialogInterface dialogInterface, int i) {
+                  activity.finish();
+                }
+              })
+          .create();
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..e7161ddb26b379f9dcf6addefa585ccf6431c055
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/CameraActivity.java
@@ -0,0 +1,35 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.app.Activity;
+import android.os.Bundle;
+
+/** Main {@code Activity} class for the Camera app. */
+public class CameraActivity extends Activity {
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    setContentView(R.layout.activity_camera);
+    if (null == savedInstanceState) {
+      getFragmentManager()
+          .beginTransaction()
+          .replace(R.id.container, Camera2BasicFragment.newInstance())
+          .commit();
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..e7bad4637041d003c1e507d81c0c30404c587653
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.app.Activity;
+import android.content.res.AssetFileDescriptor;
+import android.graphics.Bitmap;
+import android.os.SystemClock;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import org.tensorflow.lite.Interpreter;
+
+/** Classifies images with Tensorflow Lite. */
+public class ImageClassifier {
+
+  /** Tag for the {@link Log}. */
+  private static final String TAG = "TfLiteCameraDemo";
+
+  /** Name of the model file stored in Assets. */
+  private static final String MODEL_PATH = "mobilenet_quant_v1_224.tflite";
+
+  /** Name of the label file stored in Assets. */
+  private static final String LABEL_PATH = "labels.txt";
+
+  /** Number of results to show in the UI. */
+  private static final int RESULTS_TO_SHOW = 3;
+
+  /** Dimensions of inputs. */
+  private static final int DIM_BATCH_SIZE = 1;
+
+  private static final int DIM_PIXEL_SIZE = 3;
+
+  static final int DIM_IMG_SIZE_X = 224;
+  static final int DIM_IMG_SIZE_Y = 224;
+
+  /* Preallocated buffers for storing image data in. */
+  private int[] intValues = new int[DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y];
+
+  /** An instance of the driver class to run model inference with Tensorflow Lite. */
+  private Interpreter tflite;
+
+  /** Labels corresponding to the output of the vision model. */
+  private List<String> labelList;
+
+  /** A ByteBuffer to hold image data, to be feed into Tensorflow Lite as inputs. */
+  private ByteBuffer imgData = null;
+
+  /** An array to hold inference results, to be feed into Tensorflow Lite as outputs. */
+  private byte[][] labelProbArray = null;
+
+  private PriorityQueue<Map.Entry<String, Float>> sortedLabels =
+      new PriorityQueue<>(
+          RESULTS_TO_SHOW,
+          new Comparator<Map.Entry<String, Float>>() {
+            @Override
+            public int compare(Map.Entry<String, Float> o1, Map.Entry<String, Float> o2) {
+              return (o1.getValue()).compareTo(o2.getValue());
+            }
+          });
+
+  /** Initializes an {@code ImageClassifier}. */
+  ImageClassifier(Activity activity) throws IOException {
+    tflite = new Interpreter(loadModelFile(activity));
+    labelList = loadLabelList(activity);
+    imgData =
+        ByteBuffer.allocateDirect(
+            DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
+    imgData.order(ByteOrder.nativeOrder());
+    labelProbArray = new byte[1][labelList.size()];
+    Log.d(TAG, "Created a Tensorflow Lite Image Classifier.");
+  }
+
+  /** Classifies a frame from the preview stream. */
+  String classifyFrame(Bitmap bitmap) {
+    if (tflite == null) {
+      Log.e(TAG, "Image classifier has not been initialized; Skipped.");
+      return "Uninitialized Classifier.";
+    }
+    convertBitmapToByteBuffer(bitmap);
+    // Here's where the magic happens!!!
+    long startTime = SystemClock.uptimeMillis();
+    tflite.run(imgData, labelProbArray);
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to run model inference: " + Long.toString(endTime - startTime));
+    String textToShow = printTopKLabels();
+    textToShow = Long.toString(endTime - startTime) + "ms" + textToShow;
+    return textToShow;
+  }
+
+  /** Closes tflite to release resources. */
+  public void close() {
+    tflite.close();
+    tflite = null;
+  }
+
+  /** Reads label list from Assets. */
+  private List<String> loadLabelList(Activity activity) throws IOException {
+    List<String> labelList = new ArrayList<String>();
+    BufferedReader reader =
+        new BufferedReader(new InputStreamReader(activity.getAssets().open(LABEL_PATH)));
+    String line;
+    while ((line = reader.readLine()) != null) {
+      labelList.add(line);
+    }
+    reader.close();
+    return labelList;
+  }
+
+  /** Memory-map the model file in Assets. */
+  private MappedByteBuffer loadModelFile(Activity activity) throws IOException {
+    AssetFileDescriptor fileDescriptor = activity.getAssets().openFd(MODEL_PATH);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  /** Writes Image data into a {@code ByteBuffer}. */
+  private void convertBitmapToByteBuffer(Bitmap bitmap) {
+    if (imgData == null) {
+      return;
+    }
+    imgData.rewind();
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+    // Convert the image to floating point.
+    int pixel = 0;
+    long startTime = SystemClock.uptimeMillis();
+    for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
+      for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
+        final int val = intValues[pixel++];
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
+  }
+
+  /** Prints top-K labels, to be shown in UI as the results. */
+  private String printTopKLabels() {
+    for (int i = 0; i < labelList.size(); ++i) {
+      sortedLabels.add(
+          new AbstractMap.SimpleEntry<>(labelList.get(i), (labelProbArray[0][i] & 0xff) / 255.0f));
+      if (sortedLabels.size() > RESULTS_TO_SHOW) {
+        sortedLabels.poll();
+      }
+    }
+    String textToShow = "";
+    final int size = sortedLabels.size();
+    for (int i = 0; i < size; ++i) {
+      Map.Entry<String, Float> label = sortedLabels.poll();
+      textToShow = "\n" + label.getKey() + ":" + Float.toString(label.getValue()) + textToShow;
+    }
+    return textToShow;
+  }
+}
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0a70008b10b98162b4710385e21ac65333f1231
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..c22509d8dfccae14d9470e3042a9ed5b469ca2c9
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png
new file mode 100644
index 0000000000000000000000000000000000000000..a84e3ef52c6dce90ccfa98f64db25fad7a8f0289
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-hdpi/tile.9.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..520c2dd100b092fad5987dc1b41575e1681b459c
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..d68af39186ca9cd2bc755cad8397467a11844a1d
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-mdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..1347b091983ebd9d3d58e29194b9335b6c138a2b
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e419b7ccd88651bd21dac36853a827fc4075b8
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd933333b71590608d91201aad29553f9b365b6a
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..342ce34e1663960d8d7050a9be57face3571d336
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/app/src/main/res/drawable-xxhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a84f1bbfa0cb48a3fc335c9bc4aa7d8e93d20e75
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true"
+        android:layout_alignParentStart="true"
+        android:layout_alignParentTop="true" />
+
+    <FrameLayout
+        android:id="@+id/control"
+        android:layout_width="match_parent"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true"
+        android:layout_alignParentEnd="true"
+        android:layout_alignParentTop="true"
+        android:layout_toRightOf="@id/texture"
+        android:background="@color/control_background"
+        android:orientation="horizontal">
+
+        <TextView android:id="@+id/text"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:paddingTop="20dp"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+
+
+    </FrameLayout>
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/activity_camera.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/activity_camera.xml
new file mode 100644
index 0000000000000000000000000000000000000000..286e549c6569cef4b7a9e46f9c73e6f43b6d3045
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/activity_camera.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:id="@+id/container"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#000"
+    tools:context="com.example.android.tflitecamerademo.CameraActivity" />
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
new file mode 100644
index 0000000000000000000000000000000000000000..15305c436e0d997af15a326ab4027ea713ed8098
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+    <com.example.android.tflitecamerademo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentStart="true"
+        android:layout_alignParentTop="true" />
+
+    <FrameLayout
+        android:id="@+id/control"
+        android:layout_width="match_parent"
+        android:layout_height="112dp"
+        android:layout_alignParentBottom="true"
+        android:layout_alignParentStart="true"
+        android:background="@color/control_background">
+
+        <TextView android:id="@+id/text"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:paddingLeft="80dp"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold" />
+
+    </FrameLayout>
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..22074a2bdbaf60efff64d98a0788ef797a966f80
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-dimens.xml
@@ -0,0 +1,24 @@
+<!--
+  Copyright 2013 The Android Open Source Project
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_huge</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..03d1974183dd645178c07d247d61b83d067806be
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-sw600dp/template-styles.xml
@@ -0,0 +1,25 @@
+<!--
+  Copyright 2013 The Android Open Source Project
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceLarge</item>
+        <item name="android:lineSpacingMultiplier">1.2</item>
+        <item name="android:shadowDy">-6.5</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v11/template-styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v11/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8c1ea66f28907ac211f355f4220ff4582cfb31eb
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v11/template-styles.xml
@@ -0,0 +1,22 @@
+<!--
+  Copyright 2013 The Android Open Source Project
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Holo.Light" />
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-colors.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8b6ec3f85dd98221822e17d808f8d00891714861
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-colors.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2013 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c778e4f98a30777440a68fbd1661bbccc7b3f6e0
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values-v21/base-template-styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2013 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Material.Light">
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..0a71dbd0e8010f5e3a176de1f7e8321331289f7c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2013 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+    <string name="app_name">TfLiteCameraDemo</string>
+    <string name="intro_message">
+        <![CDATA[
+
+
+            This sample demonstrates the basic use of TfLite API. Check the source code to see how
+            you can use TfLite for efficient, on-device inference with trained TensorFlow models.
+
+
+        ]]>
+    </string>
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/colors.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..4b75d2b2bda0f95166d0442ebae19cedcad162d8
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/colors.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2015 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <color name="control_background">#cc4285f4</color>
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a08ec3eb629250a727cec49a822375fe5569f455
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/strings.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <string name="picture">Picture</string>
+    <string name="description_info">Info</string>
+    <string name="request_permission">This sample needs camera permission.</string>
+    <string name="camera_error">This device doesn\'t support Camera2 API.</string>
+    <string name="toggle_turn_on">NN:On</string>
+    <string name="toggle_turn_off">NN:Off</string>
+    <string name="toggle">Use NNAPI</string>
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..3f3bdfb49480e779c108cd15da854ae82a118d52
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2014 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-dimens.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..39e710b5ca358c1ed04c4a95fc859861b318ae7e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-dimens.xml
@@ -0,0 +1,32 @@
+<!--
+  Copyright 2013 The Android Open Source Project
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Define standard dimensions to comply with Holo-style grids and rhythm. -->
+
+    <dimen name="margin_tiny">4dp</dimen>
+    <dimen name="margin_small">8dp</dimen>
+    <dimen name="margin_medium">16dp</dimen>
+    <dimen name="margin_large">32dp</dimen>
+    <dimen name="margin_huge">64dp</dimen>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_medium</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..6e7d593dd8b50a4deb3002c6073d34f9c6b8ffec
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/template-styles.xml
@@ -0,0 +1,42 @@
+<!--
+  Copyright 2013 The Android Open Source Project
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+
+    <style name="Theme.Base" parent="android:Theme.Light" />
+
+    <style name="Theme.Sample" parent="Theme.Base" />
+
+    <style name="AppTheme" parent="Theme.Sample" />
+    <!-- Widget styling -->
+
+    <style name="Widget" />
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceMedium</item>
+        <item name="android:lineSpacingMultiplier">1.1</item>
+    </style>
+
+    <style name="Widget.SampleMessageTile">
+        <item name="android:background">@drawable/tile</item>
+        <item name="android:shadowColor">#7F000000</item>
+        <item name="android:shadowDy">-3.5</item>
+        <item name="android:shadowRadius">2</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/java/demo/build.gradle b/tensorflow/contrib/lite/java/demo/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..b78a0b86c939620b6f05483ce45c4d3ef0ef595e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/build.gradle
@@ -0,0 +1,23 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+
+buildscript {
+    repositories {
+        jcenter()
+    }
+    dependencies {
+        classpath 'com.android.tools.build:gradle:2.3.1'
+
+        // NOTE: Do not place your application dependencies here; they belong
+        // in the individual module build.gradle files
+    }
+}
+
+allprojects {
+    repositories {
+        jcenter()
+    }
+}
+
+task clean(type: Delete) {
+    delete rootProject.buildDir
+}
diff --git a/tensorflow/contrib/lite/java/demo/gradle.properties b/tensorflow/contrib/lite/java/demo/gradle.properties
new file mode 100644
index 0000000000000000000000000000000000000000..aac7c9b4614ccfde6c721f24994cf30885a791d0
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/gradle.properties
@@ -0,0 +1,17 @@
+# Project-wide Gradle settings.
+
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx1536m
+
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
diff --git a/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.jar b/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..13372aef5e24af05341d49695ee84e5f9b594659
Binary files /dev/null and b/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000000000000000000000000000000000000..fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Thu Sep 28 09:01:41 PDT 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
diff --git a/tensorflow/contrib/lite/java/demo/gradlew b/tensorflow/contrib/lite/java/demo/gradlew
new file mode 100755
index 0000000000000000000000000000000000000000..9d82f78915133e1c35a6ea51252590fb38efac2f
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/gradlew
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+esac
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/tensorflow/contrib/lite/java/demo/gradlew.bat b/tensorflow/contrib/lite/java/demo/gradlew.bat
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b282aa6885fb573c106b3551f7275c5f17e8e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/gradlew.bat
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/tensorflow/contrib/lite/java/demo/settings.gradle b/tensorflow/contrib/lite/java/demo/settings.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..e7b4def49cb53d9aa04228dd3edb14c9e635e003
--- /dev/null
+++ b/tensorflow/contrib/lite/java/demo/settings.gradle
@@ -0,0 +1 @@
+include ':app'
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
new file mode 100644
index 0000000000000000000000000000000000000000..d63c299589d2e8ce1051a52d29b533ed126bbcf7
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+/** Type of elements in a {@link TfLiteTensor}. */
+enum DataType {
+  /** 32-bit single precision floating point. */
+  FLOAT32(1),
+
+  /** 32-bit signed integer. */
+  INT32(2),
+
+  /** 8-bit unsigned integer. */
+  UINT8(3),
+
+  /** 64-bit signed integer. */
+  INT64(4),
+
+  /** A {@link ByteBuffer}. */
+  BYTEBUFFER(999);
+
+  private final int value;
+
+  DataType(int value) {
+    this.value = value;
+  }
+
+  /** Corresponding value of the kTfLite* enum in the TensorFlow Lite CC API. */
+  int getNumber() {
+    return value;
+  }
+
+  /** Converts an integer to the corresponding type. */
+  static DataType fromNumber(int c) {
+    for (DataType t : values) {
+      if (t.value == c) {
+        return t;
+      }
+    }
+    throw new IllegalArgumentException(
+        "DataType " + c + " is not recognized in Java (version " + TensorFlowLite.version() + ")");
+  }
+
+  /** Returns byte size of the type. */
+  int elemByteSize() {
+    switch (this) {
+      case FLOAT32:
+        return 4;
+      case INT32:
+        return 4;
+      case UINT8:
+        return 1;
+      case INT64:
+        return 8;
+      case BYTEBUFFER:
+        return 1;
+    }
+    throw new IllegalArgumentException("DataType " + this + " is not supported yet");
+  }
+
+  // Cached to avoid copying it
+  private static final DataType[] values = values();
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
new file mode 100644
index 0000000000000000000000000000000000000000..dd883d69d2065236ee29012b9bde99972aefbcf7
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -0,0 +1,172 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import java.io.File;
+import java.nio.MappedByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+import javax.validation.constraints.NotNull;
+
+/**
+ * Driver class to drive model inference with TensorFlow Lite.
+ *
+ * <p>A {@code Interpreter} encapsulates a pre-trained TensorFlow Lite model, in which operations
+ * are executed for model inference.
+ *
+ * <p>For example, if a model takes only one input and returns only one output:
+ *
+ * <pre>{@code
+ * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+ *   interpreter.run(input, output);
+ * }
+ * }</pre>
+ *
+ * <p>If a model takes multiple inputs or outputs:
+ *
+ * <pre>{@code
+ * Object[] inputs = {input0, input1, ...};
+ * Map<Integer, Object> map_of_indices_to_outputs = new HashMap<>();
+ * float[][][] ith_output = new float[3][2][4];
+ * map_of_indices_to_outputs.put(i, ith_output);
+ * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+ *   interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
+ * }
+ * }</pre>
+ *
+ * <p>Orders of inputs and outputs are determined when converting TensorFlow model to TensorFlowLite
+ * model with Toco.
+ *
+ * <p><b>WARNING:</b>Instances of a {@code Interpreter} is <b>not</b> thread-safe. A {@code
+ * Interpreter} owns resources that <b>must</b> be explicitly freed by invoking {@link #close()}
+ */
+public final class Interpreter implements AutoCloseable {
+
+  /**
+   * Initializes a {@code Interpreter}
+   *
+   * @param modelFile: a File of a pre-trained TF Lite model.
+   */
+  public Interpreter(@NotNull File modelFile) {
+    if (modelFile == null) {
+      return;
+    }
+    wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath());
+  }
+
+  /**
+   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
+   *
+   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
+   * Interpreter}.
+   */
+  public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer) {
+    wrapper = new NativeInterpreterWrapper(mappedByteBuffer);
+  }
+
+  /**
+   * Runs model inference if the model takes only one input, and provides only one output.
+   *
+   * @param input an array or multidimensional array, or a {@link ByteBuffer} of primitive types
+   *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
+   *     input data. When {@link ByteBuffer} is used, its content should remain unchanged until
+   *     model inference is done.
+   * @param output a multidimensional array of output data.
+   */
+  public void run(@NotNull Object input, @NotNull Object output) {
+    Object[] inputs = {input};
+    Map<Integer, Object> outputs = new HashMap<>();
+    outputs.put(0, output);
+    runForMultipleInputsOutputs(inputs, outputs);
+  }
+
+  /**
+   * Runs model inference if the model takes multiple inputs, or returns multiple outputs.
+   *
+   * @param inputs an array of input data. The inputs should be in the same order as inputs of the
+   *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
+   *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
+   *     way to pass large input data. When {@link ByteBuffer} is used, its content should remain
+   *     unchanged until model inference is done.
+   * @param outputs a map mapping output indices to multidimensional arrays of output data. It only
+   *     needs to keep entries for the outputs to be used.
+   */
+  public void runForMultipleInputsOutputs(
+      @NotNull Object[] inputs, @NotNull Map<Integer, Object> outputs) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The Interpreter has already been closed.");
+    }
+    Tensor[] tensors = wrapper.run(inputs);
+    if (outputs == null || tensors == null || outputs.size() > tensors.length) {
+      throw new IllegalArgumentException("Outputs do not match with model outputs.");
+    }
+    final int size = tensors.length;
+    for (Integer idx : outputs.keySet()) {
+      if (idx == null || idx < 0 || idx >= size) {
+        throw new IllegalArgumentException(
+            String.format("Invalid index of output %d (should be in range [0, %d))", idx, size));
+      }
+      tensors[idx].copyTo(outputs.get(idx));
+    }
+  }
+
+  /**
+   * Resizes idx-th input of the native model to the given dims.
+   *
+   * <p>IllegalArgumentException will be thrown if it fails to resize.
+   */
+  public void resizeInput(int idx, @NotNull int[] dims) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The Interpreter has already been closed.");
+    }
+    wrapper.resizeInput(idx, dims);
+  }
+
+  /**
+   * Gets index of an input given the op name of the input.
+   *
+   * <p>IllegalArgumentException will be thrown if the op name does not exist in the model file used
+   * to initialize the {@link Interpreter}.
+   */
+  public int getInputIndex(String opName) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The Interpreter has already been closed.");
+    }
+    return wrapper.getInputIndex(opName);
+  }
+
+  /**
+   * Gets index of an output given the op name of the output.
+   *
+   * <p>IllegalArgumentException will be thrown if the op name does not exist in the model file used
+   * to initialize the {@link Interpreter}.
+   */
+  public int getOutputIndex(String opName) {
+    if (wrapper == null) {
+      throw new IllegalStateException("The Interpreter has already been closed.");
+    }
+    return wrapper.getOutputIndex(opName);
+  }
+
+  /** Release resources associated with the {@code Interpreter}. */
+  @Override
+  public void close() {
+    wrapper.close();
+    wrapper = null;
+  }
+
+  NativeInterpreterWrapper wrapper;
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
new file mode 100644
index 0000000000000000000000000000000000000000..1939a078ad8031b99620773c9b91335c4e8f7b22
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -0,0 +1,276 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import java.lang.reflect.Array;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A wrapper wraps native interpreter and controls model execution.
+ *
+ * <p><b>WARNING:</b> Resources consumed by the {@code NativeInterpreterWrapper} object must be
+ * explicitly freed by invoking the {@link #close()} method when the {@code
+ * NativeInterpreterWrapper} object is no longer needed.
+ */
+final class NativeInterpreterWrapper implements AutoCloseable {
+
+  NativeInterpreterWrapper(String modelPath) {
+    errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
+    modelHandle = createModel(modelPath, errorHandle);
+    interpreterHandle = createInterpreter(modelHandle);
+  }
+
+  /**
+   * Initializes a {@code NativeInterpreterWrapper} with a {@code MappedByteBuffer}. The
+   * MappedByteBuffer should not be modified after the construction of a {@code
+   * NativeInterpreterWrapper}.
+   */
+  NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer) {
+    modelByteBuffer = mappedByteBuffer;
+    errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
+    modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
+    interpreterHandle = createInterpreter(modelHandle);
+  }
+
+  /** Releases resources associated with this {@code NativeInterpreterWrapper}. */
+  @Override
+  public void close() {
+    delete(errorHandle, modelHandle, interpreterHandle);
+    errorHandle = 0;
+    modelHandle = 0;
+    interpreterHandle = 0;
+    modelByteBuffer = null;
+    inputsIndexes = null;
+    outputsIndexes = null;
+  }
+
+  /** Sets inputs, runs model inference and returns outputs. */
+  Tensor[] run(Object[] inputs) {
+    if (inputs == null || inputs.length == 0) {
+      throw new IllegalArgumentException("Invalid inputs. Inputs should not be null or empty.");
+    }
+    int[] dataTypes = new int[inputs.length];
+    Object[] sizes = new Object[inputs.length];
+    int[] numsOfBytes = new int[inputs.length];
+    for (int i = 0; i < inputs.length; ++i) {
+      DataType dataType = dataTypeOf(inputs[i]);
+      dataTypes[i] = dataType.getNumber();
+      if (dataType == DataType.BYTEBUFFER) {
+        ByteBuffer buffer = (ByteBuffer) inputs[i];
+        if (buffer.order() != ByteOrder.nativeOrder()) {
+          throw new IllegalArgumentException(
+              "Invalid ByteBuffer. It shoud use ByteOrder.nativeOrder().");
+        }
+        numsOfBytes[i] = buffer.limit();
+        sizes[i] = getInputDims(interpreterHandle, i, numsOfBytes[i]);
+      } else if (isNonEmptyArray(inputs[i])) {
+        int[] dims = shapeOf(inputs[i]);
+        sizes[i] = dims;
+        numsOfBytes[i] = dataType.elemByteSize() * numElements(dims);
+      } else {
+        throw new IllegalArgumentException(
+            String.format(
+                "%d-th element of the %d inputs is not an array or a ByteBuffer.",
+                i, inputs.length));
+      }
+    }
+    long[] outputsHandles =
+        run(interpreterHandle, errorHandle, sizes, dataTypes, numsOfBytes, inputs);
+    if (outputsHandles == null || outputsHandles.length == 0) {
+      throw new IllegalStateException("Interpreter has no outputs.");
+    }
+    Tensor[] outputs = new Tensor[outputsHandles.length];
+    for (int i = 0; i < outputsHandles.length; ++i) {
+      outputs[i] = Tensor.fromHandle(outputsHandles[i]);
+    }
+    return outputs;
+  }
+
+  /** Resizes dimensions of a specific input. */
+  void resizeInput(int idx, int[] dims) {
+    resizeInput(interpreterHandle, errorHandle, idx, dims);
+  }
+
+  void setUseNNAPI(boolean useNNAPI) {
+    useNNAPI(interpreterHandle, useNNAPI);
+  }
+
+  /** Gets index of an input given its name. */
+  int getInputIndex(String name) {
+    if (inputsIndexes == null) {
+      String[] names = getInputNames(interpreterHandle);
+      inputsIndexes = new HashMap<>();
+      if (names != null) {
+        for (int i = 0; i < names.length; ++i) {
+          inputsIndexes.put(names[i], i);
+        }
+      }
+    }
+    if (inputsIndexes.containsKey(name)) {
+      return inputsIndexes.get(name);
+    } else {
+      throw new IllegalArgumentException(
+          String.format(
+              "%s is not a valid name for any input. The indexes of the inputs are %s",
+              name, inputsIndexes.toString()));
+    }
+  }
+
+  /** Gets index of an output given its name. */
+  int getOutputIndex(String name) {
+    if (outputsIndexes == null) {
+      String[] names = getOutputNames(interpreterHandle);
+      outputsIndexes = new HashMap<>();
+      if (names != null) {
+        for (int i = 0; i < names.length; ++i) {
+          outputsIndexes.put(names[i], i);
+        }
+      }
+    }
+    if (outputsIndexes.containsKey(name)) {
+      return outputsIndexes.get(name);
+    } else {
+      throw new IllegalArgumentException(
+          String.format(
+              "%s is not a valid name for any output. The indexes of the outputs are %s",
+              name, outputsIndexes.toString()));
+    }
+  }
+
+  static int numElements(int[] shape) {
+    if (shape == null) {
+      return 0;
+    }
+    int n = 1;
+    for (int i = 0; i < shape.length; i++) {
+      n *= shape[i];
+    }
+    return n;
+  }
+
+  static boolean isNonEmptyArray(Object o) {
+    return (o != null && o.getClass().isArray() && Array.getLength(o) != 0);
+  }
+
+  /** Returns the type of the data. */
+  static DataType dataTypeOf(Object o) {
+    if (o != null) {
+      Class<?> c = o.getClass();
+      while (c.isArray()) {
+        c = c.getComponentType();
+      }
+      if (float.class.equals(c)) {
+        return DataType.FLOAT32;
+      } else if (int.class.equals(c)) {
+        return DataType.INT32;
+      } else if (byte.class.equals(c)) {
+        return DataType.UINT8;
+      } else if (long.class.equals(c)) {
+        return DataType.INT64;
+      } else if (ByteBuffer.class.isInstance(o)) {
+        return DataType.BYTEBUFFER;
+      }
+    }
+    throw new IllegalArgumentException("cannot resolve DataType of " + o.getClass().getName());
+  }
+
+  /** Returns the shape of an object as an int array. */
+  static int[] shapeOf(Object o) {
+    int size = numDimensions(o);
+    int[] dimensions = new int[size];
+    fillShape(o, 0, dimensions);
+    return dimensions;
+  }
+
+  static int numDimensions(Object o) {
+    if (o == null || !o.getClass().isArray()) {
+      return 0;
+    }
+    if (Array.getLength(o) == 0) {
+      throw new IllegalArgumentException("array lengths cannot be 0.");
+    }
+    return 1 + numDimensions(Array.get(o, 0));
+  }
+
+  static void fillShape(Object o, int dim, int[] shape) {
+    if (shape == null || dim == shape.length) {
+      return;
+    }
+    final int len = Array.getLength(o);
+    if (shape[dim] == 0) {
+      shape[dim] = len;
+    } else if (shape[dim] != len) {
+      throw new IllegalArgumentException(
+          String.format("mismatched lengths (%d and %d) in dimension %d", shape[dim], len, dim));
+    }
+    for (int i = 0; i < len; ++i) {
+      fillShape(Array.get(o, i), dim + 1, shape);
+    }
+  }
+
+  private static final int ERROR_BUFFER_SIZE = 512;
+
+  private long errorHandle;
+
+  private long interpreterHandle;
+
+  private long modelHandle;
+
+  private int inputSize;
+
+  private MappedByteBuffer modelByteBuffer;
+
+  private Map<String, Integer> inputsIndexes;
+
+  private Map<String, Integer> outputsIndexes;
+
+  private static native String[] getInputNames(long interpreterHandle);
+
+  private static native String[] getOutputNames(long interpreterHandle);
+
+  private static native void resizeInput(
+      long interpreterHandle, long errorHandle, int inputIdx, int[] dims);
+
+  private static native void useNNAPI(long interpreterHandle, boolean state);
+
+  private static native long createErrorReporter(int size);
+
+  private static native long createModel(String modelPathOrBuffer, long errorHandle);
+
+  private static native long createModelWithBuffer(MappedByteBuffer modelBuffer, long errorHandle);
+
+  private static native long createInterpreter(long modelHandle);
+
+  private static native long[] run(
+      long interpreterHandle,
+      long errorHandle,
+      Object[] sizes,
+      int[] dtypes,
+      int[] numsOfBytes,
+      Object[] values);
+
+  private static native void delete(long errorHandle, long modelHandle, long interpreterHandle);
+
+  private static native int[] getInputDims(long interpreterHandle, int inputIdx, int numBytes);
+
+  static {
+    TensorFlowLite.init();
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
new file mode 100644
index 0000000000000000000000000000000000000000..54ace6c63ce5bd1b38be744176d0378e3cc8a1d3
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import java.util.Arrays;
+
+/**
+ * A typed multi-dimensional array used in Tensorflow Lite.
+ *
+ * <p>The native handle of a {@code Tensor} belongs to {@code NativeInterpreterWrapper}, thus not
+ * needed to be closed here.
+ */
+final class Tensor {
+
+  static Tensor fromHandle(long nativeHandle) {
+    return new Tensor(nativeHandle);
+  }
+
+  /** Reads Tensor content into an array. */
+  <T> T copyTo(T dst) {
+    if (NativeInterpreterWrapper.dataTypeOf(dst) != dtype) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Cannot convert an TensorFlowLite tensor with type %s to a Java object of "
+                  + "type %s (which is compatible with the TensorFlowLite type %s)",
+              dtype, dst.getClass().getName(), NativeInterpreterWrapper.dataTypeOf(dst)));
+    }
+    int[] dstShape = NativeInterpreterWrapper.shapeOf(dst);
+    if (!Arrays.equals(dstShape, shapeCopy)) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Shape of output target %s does not match with the shape of the Tensor %s.",
+              Arrays.toString(dstShape), Arrays.toString(shapeCopy)));
+    }
+    readMultiDimensionalArray(nativeHandle, dst);
+    return dst;
+  }
+
+  final long nativeHandle;
+  final DataType dtype;
+  final int[] shapeCopy;
+
+  private Tensor(long nativeHandle) {
+    this.nativeHandle = nativeHandle;
+    this.dtype = DataType.fromNumber(dtype(nativeHandle));
+    this.shapeCopy = shape(nativeHandle);
+  }
+
+  private static native int dtype(long handle);
+
+  private static native int[] shape(long handle);
+
+  private static native void readMultiDimensionalArray(long handle, Object value);
+
+  static {
+    TensorFlowLite.init();
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
new file mode 100644
index 0000000000000000000000000000000000000000..711638a9f995ce270cd362b93a7bcfca990430dc
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/TensorFlowLite.java
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+/** Static utility methods loading the TensorFlowLite runtime. */
+public final class TensorFlowLite {
+
+  private static final String LIBNAME = "tensorflowlite_jni";
+
+  private TensorFlowLite() {}
+
+  /** Returns the version of the underlying TensorFlowLite runtime. */
+  public static native String version();
+
+  /**
+   * Load the TensorFlowLite runtime C library.
+   */
+  static boolean init() {
+    try {
+      System.loadLibrary(LIBNAME);
+      return true;
+    } catch (UnsatisfiedLinkError e) {
+      System.err.println("TensorFlowLite: failed to load native library: " + e.getMessage());
+      return false;
+    }
+  }
+
+  static {
+    init();
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/package-info.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/package-info.java
new file mode 100644
index 0000000000000000000000000000000000000000..68e6a0f57810f6d9675a5d1193601e43e172ab74
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/package-info.java
@@ -0,0 +1,17 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/** Defines classes to load and execute TensorFlowLite models. */
+package org.tensorflow.lite;
diff --git a/tensorflow/contrib/lite/java/src/main/native/BUILD b/tensorflow/contrib/lite/java/src/main/native/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..15806d57c8ed7a45d2db9b80e2aab8e22349ee3e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/BUILD
@@ -0,0 +1,108 @@
+# Description:
+# Java Native Interface (JNI) library intended for implementing the
+# TensorFlow Lite Java API using the TensorFlow Lite CC library.
+
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "native_framework_only",
+    srcs = [
+        "exception_jni.cc",
+        "nativeinterpreterwrapper_jni.cc",
+        "tensor_jni.cc",
+        "tensorflow_lite_jni.cc",
+    ] + select({
+        # The Android toolchain makes "jni.h" available in the include path.
+        # For non-Android toolchains, generate jni.h and jni_md.h.
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            ":jni.h",
+            ":jni_md.h",
+        ],
+    }),
+    hdrs = [
+        "exception_jni.h",
+        "nativeinterpreterwrapper_jni.h",
+        "tensor_jni.h",
+        "tensorflow_lite_jni.h",
+    ],
+    copts = tflite_copts(),
+    includes = select({
+        "//tensorflow:android": [],
+        "//conditions:default": ["."],
+    }),
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+    ],
+    alwayslink = 1,
+)
+
+# Silly rules to make
+# #include <jni.h>
+# in the source headers work
+# (in combination with the "includes" attribute of the tf_cuda_library rule
+# above. Not needed when using the Android toolchain).
+#
+# Inspired from:
+# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
+# but hopefully there is a simpler alternative to this.
+genrule(
+    name = "copy_jni_h",
+    srcs = ["@bazel_tools//tools/jdk:jni_header"],
+    outs = ["jni.h"],
+    cmd = "cp -f $< $@",
+)
+
+genrule(
+    name = "copy_jni_md_h",
+    srcs = select({
+        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
+        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
+    }),
+    outs = ["jni_md.h"],
+    cmd = "cp -f $< $@",
+)
+
+# This includes all ops. If you want a smaller binary, you should copy and
+# modify builtin_ops_jni.cc.  You should then link your binary against both
+# ":native_framework_only" and your own version of ":native_builtin_ops".
+cc_library(
+    name = "native",
+    srcs = [
+        "builtin_ops_jni.cc",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":native_framework_only",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
+    alwayslink = 1,
+)
+
+exports_files(
+    [
+        "version_script.lds",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/java/src/main/native/builtin_ops_jni.cc b/tensorflow/contrib/lite/java/src/main/native/builtin_ops_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cce356370fa770de3e44438f08470077fb07c04c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/builtin_ops_jni.cc
@@ -0,0 +1,29 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+
+namespace tflite {
+
+// The JNI code in interpreter_jni.cc expects a CreateOpResolver() function in
+// the tflite namespace. This one instantiates a BuiltinOpResolver, with all the
+// builtin ops. For smaller binary sizes users should avoid linking this in, and
+// should provide a custom make CreateOpResolver() instead.
+std::unique_ptr<OpResolver> CreateOpResolver() {  // NOLINT
+  return std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver>(
+      new tflite::ops::builtin::BuiltinOpResolver());
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1578c9e3ddd034ad9ce17c8c3ae6c942258e2a55
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.cc
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
+
+const char kIllegalArgumentException[] = "java/lang/IllegalArgumentException";
+const char kIllegalStateException[] = "java/lang/IllegalStateException";
+const char kNullPointerException[] = "java/lang/NullPointerException";
+const char kIndexOutOfBoundsException[] = "java/lang/IndexOutOfBoundsException";
+const char kUnsupportedOperationException[] =
+    "java/lang/UnsupportedOperationException";
+
+void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  const size_t max_msg_len = 512;
+  auto* message = static_cast<char*>(malloc(max_msg_len));
+  if (vsnprintf(message, max_msg_len, fmt, args) >= 0) {
+    env->ThrowNew(env->FindClass(clazz), message);
+  } else {
+    env->ThrowNew(env->FindClass(clazz), "");
+  }
+  free(message);
+  va_end(args);
+}
+
+BufferErrorReporter::BufferErrorReporter(JNIEnv* env, int limit) {
+  buffer_ = new char[limit];
+  if (!buffer_) {
+    throwException(env, kNullPointerException,
+                   "Malloc of BufferErrorReporter to hold %d char failed.",
+                   limit);
+    return;
+  }
+  start_idx_ = 0;
+  end_idx_ = limit - 1;
+}
+
+BufferErrorReporter::~BufferErrorReporter() { delete[] buffer_; }
+
+int BufferErrorReporter::Report(const char* format, va_list args) {
+  int size = 0;
+  if (start_idx_ < end_idx_) {
+    size = vsnprintf(buffer_ + start_idx_, end_idx_ - start_idx_, format, args);
+  }
+  start_idx_ += size;
+  return size;
+}
+
+const char* BufferErrorReporter::CachedErrorMessage() { return buffer_; }
diff --git a/tensorflow/contrib/lite/java/src/main/native/exception_jni.h b/tensorflow/contrib/lite/java/src/main/native/exception_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ffff052df73c5cb21bb6522d31dc615c38f7d1f
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/exception_jni.h
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_EXCEPTION_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_EXCEPTION_JNI_H_
+
+#include <jni.h>
+#include "tensorflow/contrib/lite/error_reporter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const char kIllegalArgumentException[];
+extern const char kIllegalStateException[];
+extern const char kNullPointerException[];
+extern const char kIndexOutOfBoundsException[];
+extern const char kUnsupportedOperationException[];
+
+void throwException(JNIEnv* env, const char* clazz, const char* fmt, ...);
+
+class BufferErrorReporter : public tflite::ErrorReporter {
+ public:
+  BufferErrorReporter(JNIEnv* env, int limit);
+  virtual ~BufferErrorReporter();
+  int Report(const char* format, va_list args) override;
+  const char* CachedErrorMessage();
+
+ private:
+  char* buffer_;
+  int start_idx_ = 0;
+  int end_idx_ = 0;
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_EXCEPTION_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc6462eb5466e14769f94c5103984f5201b4b8dc
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -0,0 +1,446 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h"
+
+namespace {
+
+const int kByteBufferValue = 999;
+const int kBufferSize = 256;
+
+tflite::Interpreter* convertLongToInterpreter(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Invalid handle to Interpreter.");
+    return nullptr;
+  }
+  return reinterpret_cast<tflite::Interpreter*>(handle);
+}
+
+tflite::FlatBufferModel* convertLongToModel(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException, "Invalid handle to model.");
+    return nullptr;
+  }
+  return reinterpret_cast<tflite::FlatBufferModel*>(handle);
+}
+
+BufferErrorReporter* convertLongToErrorReporter(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Invalid handle to ErrorReporter.");
+    return nullptr;
+  }
+  return reinterpret_cast<BufferErrorReporter*>(handle);
+}
+
+std::vector<int> convertJIntArrayToVector(JNIEnv* env, jintArray inputs) {
+  int size = static_cast<int>(env->GetArrayLength(inputs));
+  std::vector<int> outputs(size, 0);
+  jint* ptr = env->GetIntArrayElements(inputs, nullptr);
+  if (ptr == nullptr) {
+    throwException(env, kIllegalArgumentException,
+                   "Empty dimensions of input array.");
+    return {};
+  }
+  for (int i = 0; i < size; ++i) {
+    outputs[i] = ptr[i];
+  }
+  env->ReleaseIntArrayElements(inputs, ptr, JNI_ABORT);
+  return outputs;
+}
+
+bool isByteBuffer(jint data_type) { return data_type == kByteBufferValue; }
+
+TfLiteType resolveDataType(jint data_type) {
+  switch (data_type) {
+    case 1:
+      return kTfLiteFloat32;
+    case 2:
+      return kTfLiteInt32;
+    case 3:
+      return kTfLiteUInt8;
+    case 4:
+      return kTfLiteInt64;
+    default:
+      return kTfLiteNoType;
+  }
+}
+
+void printDims(char* buffer, int max_size, int* dims, int num_dims) {
+  if (max_size <= 0) return;
+  buffer[0] = '?';
+  int size = 1;
+  for (int i = 1; i < num_dims; ++i) {
+    if (max_size > size) {
+      int written_size =
+          snprintf(buffer + size, max_size - size, ",%d", dims[i]);
+      if (written_size < 0) return;
+      size += written_size;
+    }
+  }
+}
+
+TfLiteStatus checkInputs(JNIEnv* env, tflite::Interpreter* interpreter,
+                         const int input_size, jintArray data_types,
+                         jintArray nums_of_bytes, jobjectArray values,
+                         jobjectArray sizes) {
+  if (input_size != interpreter->inputs().size()) {
+    throwException(env, kIllegalArgumentException,
+                   "Expected num of inputs is %d but got %d",
+                   interpreter->inputs().size(), input_size);
+    return kTfLiteError;
+  }
+  if (input_size != env->GetArrayLength(data_types) ||
+      input_size != env->GetArrayLength(nums_of_bytes) ||
+      input_size != env->GetArrayLength(values)) {
+    throwException(env, kIllegalArgumentException,
+                   "Arrays in arguments should be of the same length, but got "
+                   "%d sizes, %d data_types, %d nums_of_bytes, and %d values",
+                   input_size, env->GetArrayLength(data_types),
+                   env->GetArrayLength(nums_of_bytes),
+                   env->GetArrayLength(values));
+    return kTfLiteError;
+  }
+  for (int i = 0; i < input_size; ++i) {
+    int input_idx = interpreter->inputs()[i];
+    TfLiteTensor* target = interpreter->tensor(input_idx);
+    jintArray dims =
+        static_cast<jintArray>(env->GetObjectArrayElement(sizes, i));
+    int num_dims = static_cast<int>(env->GetArrayLength(dims));
+    if (target->dims->size != num_dims) {
+      throwException(env, kIllegalArgumentException,
+                     "%d-th input should have %d dimensions, but found %d "
+                     "dimensions",
+                     i, target->dims->size, num_dims);
+      return kTfLiteError;
+    }
+    jint* ptr = env->GetIntArrayElements(dims, nullptr);
+    for (int j = 1; j < num_dims; ++j) {
+      if (target->dims->data[j] != ptr[j]) {
+        std::unique_ptr<char[]> expected_dims(new char[kBufferSize]);
+        std::unique_ptr<char[]> obtained_dims(new char[kBufferSize]);
+        printDims(expected_dims.get(), kBufferSize, target->dims->data,
+                  num_dims);
+        printDims(obtained_dims.get(), kBufferSize, ptr, num_dims);
+        throwException(env, kIllegalArgumentException,
+                       "%d-th input dimension should be [%s], but found [%s]",
+                       i, expected_dims.get(), obtained_dims.get());
+        env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT);
+        return kTfLiteError;
+      }
+    }
+    env->ReleaseIntArrayElements(dims, ptr, JNI_ABORT);
+    env->DeleteLocalRef(dims);
+    if (env->ExceptionCheck()) return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus resizeInputs(JNIEnv* env, tflite::Interpreter* interpreter,
+                          int input_size, jobjectArray sizes) {
+  for (int i = 0; i < input_size; ++i) {
+    int input_idx = interpreter->inputs()[i];
+    jintArray dims =
+        static_cast<jintArray>(env->GetObjectArrayElement(sizes, i));
+    TfLiteStatus status = interpreter->ResizeInputTensor(
+        input_idx, convertJIntArrayToVector(env, dims));
+    if (status != kTfLiteOk) {
+      return status;
+    }
+    env->DeleteLocalRef(dims);
+    if (env->ExceptionCheck()) return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus setInputs(JNIEnv* env, tflite::Interpreter* interpreter,
+                       int input_size, jintArray data_types,
+                       jintArray nums_of_bytes, jobjectArray values) {
+  jint* data_type = env->GetIntArrayElements(data_types, nullptr);
+  jint* num_bytes = env->GetIntArrayElements(nums_of_bytes, nullptr);
+  for (int i = 0; i < input_size; ++i) {
+    int input_idx = interpreter->inputs()[i];
+    TfLiteTensor* target = interpreter->tensor(input_idx);
+    jobject value = env->GetObjectArrayElement(values, i);
+    bool is_byte_buffer = isByteBuffer(data_type[i]);
+    if (is_byte_buffer) {
+      writeByteBuffer(env, value, &(target->data.raw),
+                      static_cast<int>(num_bytes[i]));
+    } else {
+      TfLiteType type = resolveDataType(data_type[i]);
+      if (type != target->type) {
+        throwException(env, kIllegalArgumentException,
+                       "DataType (%d) of input data does not match with the "
+                       "DataType (%d) of model inputs.",
+                       type, target->type);
+        return kTfLiteError;
+      }
+      writeMultiDimensionalArray(env, value, target->type, target->dims->size,
+                                 &(target->data.raw),
+                                 static_cast<int>(num_bytes[i]));
+    }
+    env->DeleteLocalRef(value);
+    if (env->ExceptionCheck()) return kTfLiteError;
+  }
+  env->ReleaseIntArrayElements(data_types, data_type, JNI_ABORT);
+  env->ReleaseIntArrayElements(nums_of_bytes, num_bytes, JNI_ABORT);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+JNIEXPORT jobjectArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env,
+                                                                jclass clazz,
+                                                                jlong handle) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return nullptr;
+  jclass string_class = env->FindClass("java/lang/String");
+  if (string_class == nullptr) {
+    throwException(env, kUnsupportedOperationException,
+                   "Can not find java/lang/String class to get input names.");
+    return nullptr;
+  }
+  size_t size = interpreter->inputs().size();
+  jobjectArray names = static_cast<jobjectArray>(
+      env->NewObjectArray(size, string_class, env->NewStringUTF("")));
+  for (int i = 0; i < size; ++i) {
+    env->SetObjectArrayElement(names, i,
+                               env->NewStringUTF(interpreter->GetInputName(i)));
+  }
+  return names;
+}
+
+JNIEXPORT jobjectArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
+                                                                 jclass clazz,
+                                                                 jlong handle) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return nullptr;
+  jclass string_class = env->FindClass("java/lang/String");
+  if (string_class == nullptr) {
+    throwException(env, kUnsupportedOperationException,
+                   "Can not find java/lang/String class to get output names.");
+    return nullptr;
+  }
+  size_t size = interpreter->outputs().size();
+  jobjectArray names = static_cast<jobjectArray>(
+      env->NewObjectArray(size, string_class, env->NewStringUTF("")));
+  for (int i = 0; i < size; ++i) {
+    env->SetObjectArrayElement(
+        names, i, env->NewStringUTF(interpreter->GetOutputName(i)));
+  }
+  return names;
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jboolean state) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->UseNNAPI(static_cast<bool>(state));
+}
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
+    JNIEnv* env, jclass clazz, jint size) {
+  BufferErrorReporter* error_reporter =
+      new BufferErrorReporter(env, static_cast<int>(size));
+  return reinterpret_cast<jlong>(error_reporter);
+}
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
+    JNIEnv* env, jclass clazz, jstring model_file, jlong error_handle) {
+  BufferErrorReporter* error_reporter =
+      convertLongToErrorReporter(env, error_handle);
+  if (error_reporter == nullptr) return 0;
+  const char* path = env->GetStringUTFChars(model_file, nullptr);
+  auto model = tflite::FlatBufferModel::BuildFromFile(path, error_reporter);
+  if (!model) {
+    throwException(env, kIllegalArgumentException,
+                   "Contents of %s does not encode a valid TensorFlowLite "
+                   "model: %s",
+                   path, error_reporter->CachedErrorMessage());
+    env->ReleaseStringUTFChars(model_file, path);
+    return 0;
+  }
+  env->ReleaseStringUTFChars(model_file, path);
+  return reinterpret_cast<jlong>(model.release());
+}
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
+    JNIEnv* env, jclass /*clazz*/, jobject model_buffer, jlong error_handle) {
+  BufferErrorReporter* error_reporter =
+      convertLongToErrorReporter(env, error_handle);
+  if (error_reporter == nullptr) return 0;
+  const char* buf =
+      static_cast<char*>(env->GetDirectBufferAddress(model_buffer));
+  jlong capacity = env->GetDirectBufferCapacity(model_buffer);
+  auto model = tflite::FlatBufferModel::BuildFromBuffer(
+      buf, static_cast<size_t>(capacity), error_reporter);
+  if (!model) {
+    throwException(env, kIllegalArgumentException,
+                   "MappedByteBuffer does not encode a valid TensorFlowLite "
+                   "model: %s",
+                   error_reporter->CachedErrorMessage());
+    return 0;
+  }
+  return reinterpret_cast<jlong>(model.release());
+}
+
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
+    JNIEnv* env, jclass clazz, jlong model_handle) {
+  tflite::FlatBufferModel* model = convertLongToModel(env, model_handle);
+  if (model == nullptr) return 0;
+  auto resolver = ::tflite::CreateOpResolver();
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  tflite::InterpreterBuilder(*model, *(resolver.get()))(&interpreter);
+  return reinterpret_cast<jlong>(interpreter.release());
+}
+
+// Sets inputs, runs inference, and returns outputs as long handles.
+JNIEXPORT jlongArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
+    jobjectArray sizes, jintArray data_types, jintArray nums_of_bytes,
+    jobjectArray values) {
+  tflite::Interpreter* interpreter =
+      convertLongToInterpreter(env, interpreter_handle);
+  if (interpreter == nullptr) return nullptr;
+  BufferErrorReporter* error_reporter =
+      convertLongToErrorReporter(env, error_handle);
+  if (error_reporter == nullptr) return nullptr;
+  const int input_size = env->GetArrayLength(sizes);
+  // validates inputs
+  TfLiteStatus status = checkInputs(env, interpreter, input_size, data_types,
+                                    nums_of_bytes, values, sizes);
+  if (status != kTfLiteOk) return nullptr;
+  // resizes inputs
+  status = resizeInputs(env, interpreter, input_size, sizes);
+  if (status != kTfLiteOk) {
+    throwException(env, kNullPointerException, "Can not resize the input: %s",
+                   error_reporter->CachedErrorMessage());
+    return nullptr;
+  }
+  // allocates memory
+  status = interpreter->AllocateTensors();
+  if (status != kTfLiteOk) {
+    throwException(env, kNullPointerException,
+                   "Can not allocate memory for the given inputs: %s",
+                   error_reporter->CachedErrorMessage());
+    return nullptr;
+  }
+  // sets inputs
+  status = setInputs(env, interpreter, input_size, data_types, nums_of_bytes,
+                     values);
+  if (status != kTfLiteOk) return nullptr;
+  // runs inference
+  if (interpreter->Invoke() != kTfLiteOk) {
+    throwException(env, kIllegalArgumentException,
+                   "Failed to run on the given Interpreter: %s",
+                   error_reporter->CachedErrorMessage());
+    return nullptr;
+  }
+  // returns outputs
+  const std::vector<int>& results = interpreter->outputs();
+  if (results.empty()) {
+    throwException(env, kIllegalArgumentException,
+                   "The Interpreter does not have any outputs.");
+    return nullptr;
+  }
+  jlongArray outputs = env->NewLongArray(results.size());
+  size_t size = results.size();
+  for (int i = 0; i < size; ++i) {
+    TfLiteTensor* source = interpreter->tensor(results[i]);
+    jlong output = reinterpret_cast<jlong>(source);
+    env->SetLongArrayRegion(outputs, i, 1, &output);
+  }
+  return outputs;
+}
+
+JNIEXPORT jintArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
+    JNIEnv* env, jclass clazz, jlong handle, jint input_idx, jint num_bytes) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return nullptr;
+  const int idx = static_cast<int>(input_idx);
+  if (input_idx >= interpreter->inputs().size()) {
+    throwException(env, kIllegalArgumentException,
+                   "Out of range: Failed to get %d-th input out of %d inputs",
+                   input_idx, interpreter->inputs().size());
+    return nullptr;
+  }
+  TfLiteTensor* target = interpreter->tensor(interpreter->inputs()[idx]);
+  int size = target->dims->size;
+  int expected_num_bytes = elementByteSize(target->type);
+  for (int i = 0; i < size; ++i) {
+    expected_num_bytes *= target->dims->data[i];
+  }
+  if (num_bytes != expected_num_bytes) {
+    throwException(env, kIllegalArgumentException,
+                   "Failed to get input dimensions. %d-th input should have"
+                   " %d bytes, but found %d bytes.",
+                   idx, expected_num_bytes, num_bytes);
+    return nullptr;
+  }
+  jintArray outputs = env->NewIntArray(size);
+  env->SetIntArrayRegion(outputs, 0, size, &(target->dims->data[0]));
+  return outputs;
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
+    jint input_idx, jintArray dims) {
+  BufferErrorReporter* error_reporter =
+      convertLongToErrorReporter(env, error_handle);
+  if (error_reporter == nullptr) return;
+  tflite::Interpreter* interpreter =
+      convertLongToInterpreter(env, interpreter_handle);
+  if (interpreter == nullptr) return;
+  const int idx = static_cast<int>(input_idx);
+  if (idx < 0 || idx >= interpreter->inputs().size()) {
+    throwException(env, kIllegalArgumentException,
+                   "Can not resize %d-th input for a model having %d inputs.",
+                   idx, interpreter->inputs().size());
+  }
+  TfLiteStatus status = interpreter->ResizeInputTensor(
+      interpreter->inputs()[idx], convertJIntArrayToVector(env, dims));
+  if (status != kTfLiteOk) {
+    throwException(env, kIllegalArgumentException,
+                   "Failed to resize %d-th input: %s", idx,
+                   error_reporter->CachedErrorMessage());
+  }
+}
+
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete(
+    JNIEnv* env, jclass clazz, jlong error_handle, jlong model_handle,
+    jlong interpreter_handle) {
+  if (interpreter_handle != 0) {
+    delete convertLongToInterpreter(env, interpreter_handle);
+  }
+  if (model_handle != 0) {
+    delete convertLongToModel(env, model_handle);
+  }
+  if (error_handle != 0) {
+    delete convertLongToErrorReporter(env, error_handle);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..430886b7cc04a356d1826843acc1bbebf4189bf7
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.h
@@ -0,0 +1,151 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_NATIVEINTERPRETERWRAPPER_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_NATIVEINTERPRETERWRAPPER_JNI_H_
+
+#include <jni.h>
+#include <stdio.h>
+#include <vector>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
+#include "tensorflow/contrib/lite/java/src/main/native/tensor_jni.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+// This is to be provided at link-time by a library.
+extern std::unique_ptr<OpResolver> CreateOpResolver();
+}  // namespace tflite
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (J)[Ljava/lang/Object;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputNames(JNIEnv* env,
+                                                                jclass clazz,
+                                                                jlong handle);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (J)[Ljava/lang/Object;
+ */
+JNIEXPORT jobjectArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputNames(JNIEnv* env,
+                                                                 jclass clazz,
+                                                                 jlong handle);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JZ)
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_useNNAPI(JNIEnv* env,
+                                                           jclass clazz,
+                                                           jlong handle,
+                                                           jboolean state);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (I)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createErrorReporter(
+    JNIEnv* env, jclass clazz, jint size);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (Ljava/lang/String;J)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createModel(
+    JNIEnv* env, jclass clazz, jstring model_file, jlong error_handle);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (Ljava/lang/Object;J)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createModelWithBuffer(
+    JNIEnv* env, jclass clazz, jobject model_buffer, jlong error_handle);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (J)J
+ */
+JNIEXPORT jlong JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
+    JNIEnv* env, jclass clazz, jlong model_handle);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JJ[Ljava/lang/Object;[I[I[Ljava/lang/Object;)[J
+ */
+JNIEXPORT jlongArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_run(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
+    jobjectArray sizes, jintArray data_types, jintArray nums_of_bytes,
+    jobjectArray values);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JII)[I
+ *
+ * It gets input dimensions if num_bytes matches number of bytes required by
+ * the input, else returns null and throws IllegalArgumentException.
+ */
+JNIEXPORT jintArray JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputDims(
+    JNIEnv* env, jclass clazz, jlong handle, jint input_idx, jint num_bytes);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JJI[I)
+ *
+ * It resizes dimensions of a input.
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_resizeInput(
+    JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle,
+    jint input_idx, jintArray dims);
+
+/*
+ *  Class:     org_tensorflow_lite_NativeInterpreterWrapper
+ *  Method:
+ *  Signature: (JJJ)
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete(
+    JNIEnv* env, jclass clazz, jlong error_handle, jlong model_handle,
+    jlong interpreter_handle);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_NATIVEINTERPRETERWRAPPER_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..65126e78a3003f8a69c69326124d613e878c0f9d
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
@@ -0,0 +1,242 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/java/src/main/native/tensor_jni.h"
+#include <cstring>
+#include <memory>
+#include "tensorflow/contrib/lite/java/src/main/native/exception_jni.h"
+
+namespace {
+
+TfLiteTensor* convertLongToTensor(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Invalid handle to TfLiteTensor.");
+    return nullptr;
+  }
+  return reinterpret_cast<TfLiteTensor*>(handle);
+}
+
+size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
+                                void* dst, size_t dst_size) {
+  jarray array = static_cast<jarray>(object);
+  const int num_elements = env->GetArrayLength(array);
+  size_t to_copy = num_elements * elementByteSize(type);
+  if (to_copy > dst_size) {
+    throwException(env, kIllegalStateException,
+                   "cannot write Java array of %d bytes to Tensor of %d bytes",
+                   to_copy, dst_size);
+    return 0;
+  }
+  switch (type) {
+    case kTfLiteFloat32: {
+      jfloatArray a = static_cast<jfloatArray>(array);
+      jfloat* values = env->GetFloatArrayElements(a, nullptr);
+      memcpy(dst, values, to_copy);
+      env->ReleaseFloatArrayElements(a, values, JNI_ABORT);
+      return to_copy;
+    }
+    case kTfLiteInt32: {
+      jintArray a = static_cast<jintArray>(array);
+      jint* values = env->GetIntArrayElements(a, nullptr);
+      memcpy(dst, values, to_copy);
+      env->ReleaseIntArrayElements(a, values, JNI_ABORT);
+      return to_copy;
+    }
+    case kTfLiteInt64: {
+      jlongArray a = static_cast<jlongArray>(array);
+      jlong* values = env->GetLongArrayElements(a, nullptr);
+      memcpy(dst, values, to_copy);
+      env->ReleaseLongArrayElements(a, values, JNI_ABORT);
+      return to_copy;
+    }
+    case kTfLiteUInt8: {
+      jbyteArray a = static_cast<jbyteArray>(array);
+      jbyte* values = env->GetByteArrayElements(a, nullptr);
+      memcpy(dst, values, to_copy);
+      env->ReleaseByteArrayElements(a, values, JNI_ABORT);
+      return to_copy;
+    }
+    default: {
+      throwException(env, kUnsupportedOperationException,
+                     "TensorFlowLite currently supports float (32 bits), "
+                     "int (32 bits), byte (8 bits), and long (64 bits), "
+                     "support for other types (DataType %d in this case) will "
+                     "be added in the future",
+                     kTfLiteFloat32, type);
+      return 0;
+    }
+  }
+}
+
+size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
+                               const void* src, size_t src_size, jarray dst) {
+  const int len = env->GetArrayLength(dst);
+  const size_t size = len * elementByteSize(data_type);
+  if (size > src_size) {
+    throwException(
+        env, kIllegalStateException,
+        "cannot fill a Java array of %d bytes with a Tensor of %d bytes", size,
+        src_size);
+    return 0;
+  }
+  switch (data_type) {
+    case kTfLiteFloat32: {
+      jfloatArray float_array = static_cast<jfloatArray>(dst);
+      env->SetFloatArrayRegion(float_array, 0, len,
+                               static_cast<const jfloat*>(src));
+      return size;
+    }
+    case kTfLiteInt32: {
+      jintArray int_array = static_cast<jintArray>(dst);
+      env->SetIntArrayRegion(int_array, 0, len, static_cast<const jint*>(src));
+      return size;
+    }
+    case kTfLiteInt64: {
+      jlongArray long_array = static_cast<jlongArray>(dst);
+      env->SetLongArrayRegion(long_array, 0, len,
+                              static_cast<const jlong*>(src));
+      return size;
+    }
+    case kTfLiteUInt8: {
+      jbyteArray byte_array = static_cast<jbyteArray>(dst);
+      env->SetByteArrayRegion(byte_array, 0, len,
+                              static_cast<const jbyte*>(src));
+      return size;
+    }
+    default: {
+      throwException(env, kIllegalStateException, "invalid DataType(%d)",
+                     data_type);
+    }
+  }
+  return 0;
+}
+
+size_t readMultiDimensionalArray(JNIEnv* env, TfLiteType data_type, char* src,
+                                 size_t src_size, int dims_left, jarray dst) {
+  if (dims_left == 1) {
+    return readOneDimensionalArray(env, data_type, src, src_size, dst);
+  } else {
+    jobjectArray ndarray = static_cast<jobjectArray>(dst);
+    int len = env->GetArrayLength(ndarray);
+    size_t size = 0;
+    for (int i = 0; i < len; ++i) {
+      jarray row = static_cast<jarray>(env->GetObjectArrayElement(ndarray, i));
+      size += readMultiDimensionalArray(env, data_type, src + size,
+                                        src_size - size, dims_left - 1, row);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return size;
+    }
+    return size;
+  }
+}
+
+}  // namespace
+
+size_t elementByteSize(TfLiteType data_type) {
+  // The code in this file makes the assumption that the
+  // TensorFlow TF_DataTypes and the Java primitive types
+  // have the same byte sizes. Validate that:
+  switch (data_type) {
+    case kTfLiteFloat32:
+      static_assert(sizeof(jfloat) == 4,
+                    "Java float not compatible with kTfLiteFloat");
+      return 4;
+    case kTfLiteInt32:
+      static_assert(sizeof(jint) == 4,
+                    "Java int not compatible with kTfLiteInt");
+      return 4;
+    case kTfLiteUInt8:
+      static_assert(sizeof(jbyte) == 1,
+                    "Java byte not compatible with kTfLiteUInt8");
+      return 1;
+    case kTfLiteInt64:
+      static_assert(sizeof(jlong) == 8,
+                    "Java long not compatible with kTfLiteInt64");
+      return 8;
+    default:
+      return 0;
+  }
+}
+
+size_t writeByteBuffer(JNIEnv* env, jobject object, char** dst, int dst_size) {
+  char* buf = static_cast<char*>(env->GetDirectBufferAddress(object));
+  if (!buf) {
+    throwException(env, kIllegalArgumentException,
+                   "Input ByteBuffer is not a direct buffer");
+    return 0;
+  }
+  *dst = buf;
+  return dst_size;
+}
+
+size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
+                                  int dims_left, char** dst, int dst_size) {
+  if (dims_left <= 1) {
+    return writeOneDimensionalArray(env, src, type, *dst, dst_size);
+  } else {
+    jobjectArray ndarray = static_cast<jobjectArray>(src);
+    int len = env->GetArrayLength(ndarray);
+    size_t sz = 0;
+    for (int i = 0; i < len; ++i) {
+      jobject row = env->GetObjectArrayElement(ndarray, i);
+      char* next_dst = *dst + sz;
+      sz += writeMultiDimensionalArray(env, row, type, dims_left - 1, &next_dst,
+                                       dst_size - sz);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return sz;
+    }
+    return sz;
+  }
+}
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
+                                                          jclass clazz,
+                                                          jlong handle,
+                                                          jobject value) {
+  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  if (tensor == nullptr) return;
+  int num_dims = tensor->dims->size;
+  if (num_dims == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "copyTo() is not meant for scalar Tensors.");
+    return;
+  }
+  readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes,
+                            num_dims, static_cast<jarray>(value));
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env,
+                                                             jclass clazz,
+                                                             jlong handle) {
+  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  if (tensor == nullptr) return 0;
+  return static_cast<jint>(tensor->type);
+}
+
+JNIEXPORT jintArray JNICALL
+Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env, jclass clazz, jlong handle) {
+  TfLiteTensor* tensor = convertLongToTensor(env, handle);
+  if (tensor == nullptr) return nullptr;
+  int num_dims = tensor->dims->size;
+  jintArray result = env->NewIntArray(num_dims);
+  jint* dims = env->GetIntArrayElements(result, nullptr);
+  for (int i = 0; i < num_dims; ++i) {
+    dims[i] = static_cast<jint>(tensor->dims->data[i]);
+  }
+  env->ReleaseIntArrayElements(result, dims, 0);
+  return result;
+}
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a4910dcc3a719fbb9f365dae693423de768349c
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_TENSOR_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_TENSOR_JNI_H_
+
+#include <jni.h>
+#include "tensorflow/contrib/lite/context.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/*
+ *  Class:     org_tensorflow_lite_TfLiteTensor
+ *  Method:
+ *  Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env,
+                                                             jclass clazz,
+                                                             jlong handle);
+
+/*
+ *  Class:     org_tensorflow_lite_TfLiteTensor
+ *  Method:
+ *  Signature: (J)[I
+ */
+JNIEXPORT jintArray JNICALL Java_org_tensorflow_lite_Tensor_shape(JNIEnv* env,
+                                                                  jclass clazz,
+                                                                  jlong handle);
+
+/*
+ *  Class:     org_tensorflow_lite_TfLiteTensor
+ *  Method:
+ *  Signature: (JLjava/lang/Object;)
+ */
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
+                                                          jclass clazz,
+                                                          jlong handle,
+                                                          jobject value);
+
+/*
+ * Finds the size of each data type.
+ */
+size_t elementByteSize(TfLiteType data_type);
+
+/*
+ * Writes data of a ByteBuffer into dest.
+ */
+size_t writeByteBuffer(JNIEnv* env, jobject object, char** dst, int dst_size);
+
+/*
+ * Writes a multi-dimensional array into dest.
+ */
+size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
+                                  int dims_left, char** dst, int dst_size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_TENSOR_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e7f2f56921b871a6ace2b6cb984fcd185a4d2ab
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.cc
@@ -0,0 +1,26 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdio.h>
+
+#include "tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h"
+#include "tensorflow/contrib/lite/version.h"
+
+JNIEXPORT jstring JNICALL
+Java_org_tensorflow_lite_TensorFlowLite_version(JNIEnv* env, jclass /*clazz*/) {
+  char buf[64];
+  snprintf(buf, sizeof(buf), "%d", TFLITE_SCHEMA_VERSION);
+  return env->NewStringUTF(buf);
+}
diff --git a/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h b/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
new file mode 100644
index 0000000000000000000000000000000000000000..65f8341149287f151f7e51fe04d9525bf119164e
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/tensorflow_lite_jni.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_JAVA_TENSORFLOW_LITE_JNI_H_
+#define TENSORFLOW_CONTRIB_LITE_JAVA_TENSORFLOW_LITE_JNI_H_
+
+#include <jni.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+/*
+ *  Class:     org_tensorflow_lite_TensorFlowLite
+ *  Method:    version
+ *  Signature: ()Ljava/lang/String;
+ */
+JNIEXPORT jstring JNICALL
+Java_org_tensorflow_lite_TensorFlowLite_version(JNIEnv*, jclass);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // TENSORFLOW_CONTRIB_LITE_JAVA_TENSORFLOW_LITE_JNI_H_
diff --git a/tensorflow/contrib/lite/java/src/main/native/version_script.lds b/tensorflow/contrib/lite/java/src/main/native/version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..38c93dda730550070f28b59297c5191a9615ed7b
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/main/native/version_script.lds
@@ -0,0 +1,11 @@
+VERS_1.0 {
+  # Export JNI symbols.
+  global:
+    Java_*;
+    JNI_OnLoad;
+    JNI_OnUnload;
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..cebc9442008e10e7674cf7b1dc58e633fef4ba39
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.DataType}. */
+@RunWith(JUnit4.class)
+public final class DataTypeTest {
+
+  @Test
+  public void testElemByteSize() {
+    assertThat(DataType.FLOAT32.elemByteSize()).isEqualTo(4);
+    assertThat(DataType.INT32.elemByteSize()).isEqualTo(4);
+    assertThat(DataType.UINT8.elemByteSize()).isEqualTo(1);
+    assertThat(DataType.INT64.elemByteSize()).isEqualTo(8);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..424b3de6c97672e310c54230a7ac1204f46d9ac8
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -0,0 +1,221 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.Map;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.Interpreter}. */
+@RunWith(JUnit4.class)
+public final class InterpreterTest {
+
+  private static final File MODEL_FILE =
+      new File("tensorflow/contrib/lite/java/src/testdata/add.bin");
+
+  private static final File MOBILENET_MODEL_FILE =
+      new File("tensorflow/contrib/lite/java/src/testdata/mobilenet.tflite.bin");
+
+  @Test
+  public void testInterpreter() throws Exception {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    assertThat(interpreter).isNotNull();
+    interpreter.close();
+  }
+
+  @Test
+  public void testRunWithMappedByteBufferModel() throws Exception {
+    Path path = MODEL_FILE.toPath();
+    FileChannel fileChannel =
+        (FileChannel) Files.newByteChannel(path, EnumSet.of(StandardOpenOption.READ));
+    MappedByteBuffer mappedByteBuffer =
+        fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size());
+    Interpreter interpreter = new Interpreter(mappedByteBuffer);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.close();
+    fileChannel.close();
+  }
+
+  @Test
+  public void testRun() {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    Float[] oneD = {1.23f, 6.54f, 7.81f};
+    Float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    Float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    Float[][][][] fourD = {threeD, threeD};
+    Float[][][][] parsedOutputs = new Float[2][8][8][3];
+    try {
+      interpreter.run(fourD, parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("cannot resolve DataType of [[[[Ljava.lang.Float;");
+    }
+    interpreter.close();
+  }
+
+  @Test
+  public void testRunWithBoxedInputs() {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.close();
+  }
+
+  @Test
+  public void testRunForMultipleInputsOutputs() {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    Map<Integer, Object> outputs = new HashMap<>();
+    outputs.put(0, parsedOutputs);
+    interpreter.runForMultipleInputsOutputs(inputs, outputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    interpreter.close();
+  }
+
+  @Test
+  public void testMobilenetRun() {
+    // Create a gray image.
+    float[][][][] img = new float[1][224][224][3];
+    for (int i = 0; i < 224; ++i) {
+      for (int j = 0; j < 224; ++j) {
+        img[0][i][j][0] = 0.5f;
+        img[0][i][j][1] = 0.5f;
+        img[0][i][j][2] = 0.5f;
+      }
+    }
+
+    // Allocate memory to receive the output values.
+    float[][] labels = new float[1][1001];
+
+    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    interpreter.run(img, labels);
+    interpreter.close();
+
+    assertThat(labels[0])
+        .usingExactEquality()
+        .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY});
+  }
+
+  @Test
+  public void testRunWithWrongInputType() {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    int[] oneD = {4, 3, 9};
+    int[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    int[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    int[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    try {
+      interpreter.run(fourD, parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "DataType (2) of input data does not match with the DataType (1) of model inputs.");
+    }
+    interpreter.close();
+  }
+
+  @Test
+  public void testRunWithWrongOutputType() {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    int[][][][] parsedOutputs = new int[2][8][8][3];
+    try {
+      interpreter.run(fourD, parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Cannot convert an TensorFlowLite tensor with type "
+                  + "FLOAT32 to a Java object of type [[[[I (which is compatible with the"
+                  + " TensorFlowLite type INT32)");
+    }
+    interpreter.close();
+  }
+
+  @Test
+  public void testGetInputIndex() {
+    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    try {
+      interpreter.getInputIndex("WrongInputName");
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "WrongInputName is not a valid name for any input. The indexes of the inputs"
+                  + " are {input=0}");
+    }
+    int index = interpreter.getInputIndex("input");
+    assertThat(index).isEqualTo(0);
+  }
+
+  @Test
+  public void testGetOutputIndex() {
+    Interpreter interpreter = new Interpreter(MOBILENET_MODEL_FILE);
+    try {
+      interpreter.getOutputIndex("WrongOutputName");
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "WrongOutputName is not a valid name for any output. The indexes of the outputs"
+                  + " are {MobilenetV1/Predictions/Softmax=0}");
+    }
+    int index = interpreter.getOutputIndex("MobilenetV1/Predictions/Softmax");
+    assertThat(index).isEqualTo(0);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..9a6894f49c0b7278511717d2671648c6d1763e00
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -0,0 +1,406 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.fail;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.NativeInterpreterWrapper}. */
+@RunWith(JUnit4.class)
+public final class NativeInterpreterWrapperTest {
+
+  private static final String FLOAT_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/add.bin";
+
+  private static final String INT_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/int32.bin";
+
+  private static final String LONG_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/int64.bin";
+
+  private static final String BYTE_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/uint8.bin";
+
+  private static final String INVALID_MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/invalid_model.bin";
+
+  @Test
+  public void testConstructor() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    assertThat(wrapper).isNotNull();
+    wrapper.close();
+  }
+
+  @Test
+  public void testConstructorWithInvalidModel() {
+    try {
+      NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INVALID_MODEL_PATH);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains("Model provided has model identifier ' is ', should be 'TFL3'");
+    }
+  }
+
+  @Test
+  public void testRunWithFloat() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, -6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    outputs[0].copyTo(parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, -19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithInt() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(INT_MODEL_PATH);
+    int[] oneD = {3, 7, -4};
+    int[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    int[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    int[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    int[][][][] parsedOutputs = new int[2][4][4][12];
+    outputs[0].copyTo(parsedOutputs);
+    int[] outputOneD = parsedOutputs[0][0][0];
+    int[] expected = {3, 7, -4, 3, 7, -4, 3, 7, -4, 3, 7, -4};
+    assertThat(outputOneD).isEqualTo(expected);
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithLong() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(LONG_MODEL_PATH);
+    long[] oneD = {-892834092L, 923423L, 2123918239018L};
+    long[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    long[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    long[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    long[][][][] parsedOutputs = new long[2][4][4][12];
+    outputs[0].copyTo(parsedOutputs);
+    long[] outputOneD = parsedOutputs[0][0][0];
+    long[] expected = {-892834092L, 923423L, 2123918239018L, -892834092L, 923423L, 2123918239018L,
+                       -892834092L, 923423L, 2123918239018L, -892834092L, 923423L, 2123918239018L};
+    assertThat(outputOneD).isEqualTo(expected);
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithByte() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(BYTE_MODEL_PATH);
+    byte[] oneD = {(byte) 0xe0, 0x4f, (byte) 0xd0};
+    byte[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    byte[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    byte[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    int[] inputDims = {2, 8, 8, 3};
+    wrapper.resizeInput(0, inputDims);
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    byte[][][][] parsedOutputs = new byte[2][4][4][12];
+    outputs[0].copyTo(parsedOutputs);
+    byte[] outputOneD = parsedOutputs[0][0][0];
+    byte[] expected = {(byte) 0xe0, 0x4f, (byte) 0xd0, (byte) 0xe0, 0x4f, (byte) 0xd0,
+                       (byte) 0xe0, 0x4f, (byte) 0xd0, (byte) 0xe0, 0x4f, (byte) 0xd0};
+    assertThat(outputOneD).isEqualTo(expected);
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithByteBufferHavingBytes() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(BYTE_MODEL_PATH);
+    ByteBuffer bbuf = ByteBuffer.allocateDirect(2 * 8 * 8 * 3);
+    bbuf.order(ByteOrder.nativeOrder());
+    bbuf.rewind();
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 8; ++j) {
+        for (int k = 0; k < 8; ++k) {
+          bbuf.put((byte) 0xe0);
+          bbuf.put((byte) 0x4f);
+          bbuf.put((byte) 0xd0);
+        }
+      }
+    }
+    Object[] inputs = {bbuf};
+    int[] inputDims = {2, 8, 8, 3};
+    wrapper.resizeInput(0, inputDims);
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    byte[][][][] parsedOutputs = new byte[2][4][4][12];
+    outputs[0].copyTo(parsedOutputs);
+    byte[] outputOneD = parsedOutputs[0][0][0];
+    byte[] expected = {
+      (byte) 0xe0, 0x4f, (byte) 0xd0, (byte) 0xe0, 0x4f, (byte) 0xd0,
+      (byte) 0xe0, 0x4f, (byte) 0xd0, (byte) 0xe0, 0x4f, (byte) 0xd0
+    };
+    assertThat(outputOneD).isEqualTo(expected);
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithByteBufferHavingFloats() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    ByteBuffer bbuf = ByteBuffer.allocateDirect(4 * 8 * 8 * 3 * 4);
+    bbuf.order(ByteOrder.nativeOrder());
+    bbuf.rewind();
+    for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 8; ++j) {
+        for (int k = 0; k < 8; ++k) {
+          bbuf.putFloat(1.23f);
+          bbuf.putFloat(-6.54f);
+          bbuf.putFloat(7.81f);
+        }
+      }
+    }
+    Object[] inputs = {bbuf};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Failed to get input dimensions. 0-th input should have 768 bytes, but found 3072 bytes");
+    }
+    int[] inputDims = {4, 8, 8, 3};
+    wrapper.resizeInput(0, inputDims);
+    Tensor[] outputs = wrapper.run(inputs);
+    assertThat(outputs.length).isEqualTo(1);
+    float[][][][] parsedOutputs = new float[4][8][8][3];
+    outputs[0].copyTo(parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, -19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithByteBufferHavingWrongSize() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(BYTE_MODEL_PATH);
+    ByteBuffer bbuf = ByteBuffer.allocateDirect(2 * 7 * 8 * 3);
+    bbuf.order(ByteOrder.nativeOrder());
+    Object[] inputs = {bbuf};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Failed to get input dimensions. 0-th input should have 192 bytes, but found 336 bytes.");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithWrongInputType() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    int[] oneD = {4, 3, 9};
+    int[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    int[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    int[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "DataType (2) of input data does not match with the DataType (1) of model inputs.");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunAfterClose() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    wrapper.close();
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("Invalid handle to Interpreter.");
+    }
+  }
+
+  @Test
+  public void testRunWithEmptyInputs() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    try {
+      Object[] inputs = {};
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains("Invalid inputs. Inputs should not be null or empty.");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithWrongInputSize() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD, fourD};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("Expected num of inputs is 1 but got 2");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithWrongInputNumOfDims() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    Object[] inputs = {threeD};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains("0-th input should have 4 dimensions, but found 3 dimensions");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithWrongInputDims() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(FLOAT_MODEL_PATH);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    try {
+      wrapper.run(inputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains("0-th input dimension should be [?,8,8,3], but found [?,8,7,3]");
+    }
+    wrapper.close();
+  }
+
+  @Test
+  public void testNumElements() {
+    int[] shape = {2, 3, 4};
+    int num = NativeInterpreterWrapper.numElements(shape);
+    assertThat(num).isEqualTo(24);
+    shape = null;
+    num = NativeInterpreterWrapper.numElements(shape);
+    assertThat(num).isEqualTo(0);
+  }
+
+  @Test
+  public void testIsNonEmtpyArray() {
+    assertThat(NativeInterpreterWrapper.isNonEmptyArray(null)).isFalse();
+    assertThat(NativeInterpreterWrapper.isNonEmptyArray(3.2)).isFalse();
+    int[] emptyArray = {};
+    assertThat(NativeInterpreterWrapper.isNonEmptyArray(emptyArray)).isFalse();
+    int[] validArray = {9, 5, 2, 1};
+    assertThat(NativeInterpreterWrapper.isNonEmptyArray(validArray)).isTrue();
+  }
+
+  @Test
+  public void testDataTypeOf() {
+    float[] testEmtpyArray = {};
+    DataType dataType = NativeInterpreterWrapper.dataTypeOf(testEmtpyArray);
+    assertThat(dataType).isEqualTo(DataType.FLOAT32);
+    float[] testFloatArray = {0.783f, 0.251f};
+    dataType = NativeInterpreterWrapper.dataTypeOf(testFloatArray);
+    assertThat(dataType).isEqualTo(DataType.FLOAT32);
+    float[][] testMultiDimArray = {testFloatArray, testFloatArray, testFloatArray};
+    dataType = NativeInterpreterWrapper.dataTypeOf(testFloatArray);
+    assertThat(dataType).isEqualTo(DataType.FLOAT32);
+    try {
+      double[] testDoubleArray = {0.783, 0.251};
+      NativeInterpreterWrapper.dataTypeOf(testDoubleArray);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("cannot resolve DataType of");
+    }
+    try {
+      Float[] testBoxedArray = {0.783f, 0.251f};
+      NativeInterpreterWrapper.dataTypeOf(testBoxedArray);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("cannot resolve DataType of [Ljava.lang.Float;");
+    }
+  }
+
+  @Test
+  public void testNumDimensions() {
+    int scalar = 1;
+    assertThat(NativeInterpreterWrapper.numDimensions(scalar)).isEqualTo(0);
+    int[][] array = {{2, 4}, {1, 9}};
+    assertThat(NativeInterpreterWrapper.numDimensions(array)).isEqualTo(2);
+    try {
+      int[] emptyArray = {};
+      NativeInterpreterWrapper.numDimensions(emptyArray);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("array lengths cannot be 0.");
+    }
+  }
+
+  @Test
+  public void testFillShape() {
+    int[][][] array = {{{23}, {14}, {87}}, {{12}, {42}, {31}}};
+    int num = NativeInterpreterWrapper.numDimensions(array);
+    int[] shape = new int[num];
+    NativeInterpreterWrapper.fillShape(array, 0, shape);
+    assertThat(num).isEqualTo(3);
+    assertThat(shape[0]).isEqualTo(2);
+    assertThat(shape[1]).isEqualTo(3);
+    assertThat(shape[2]).isEqualTo(1);
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..665c937cb60ad957c0030c01eb57899754c80bf8
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.TensorFlowLite}. */
+@RunWith(JUnit4.class)
+public final class TensorFlowLiteTest {
+
+  @Test
+  public void testVersion() {
+    assertThat(TensorFlowLite.version()).isEqualTo("3");
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..94b6632bb8dd7117bf4074da1939bd23ce732efd
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
@@ -0,0 +1,105 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+import static com.google.common.truth.Truth.assertThat;
+import static org.junit.Assert.fail;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+/** Unit tests for {@link org.tensorflow.lite.Tensor}. */
+@RunWith(JUnit4.class)
+public final class TensorTest {
+
+  private static final String MODEL_PATH =
+      "tensorflow/contrib/lite/java/src/testdata/add.bin";
+
+  private NativeInterpreterWrapper wrapper;
+  private long nativeHandle;
+
+  @Before
+  public void setUp() {
+    wrapper = new NativeInterpreterWrapper(MODEL_PATH);
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    Tensor[] outputs = wrapper.run(inputs);
+    nativeHandle = outputs[0].nativeHandle;
+  }
+
+  @After
+  public void tearDown() {
+    wrapper.close();
+  }
+
+  @Test
+  public void testFromHandle() throws Exception {
+    Tensor tensor = Tensor.fromHandle(nativeHandle);
+    assertThat(tensor).isNotNull();
+    int[] expectedShape = {2, 8, 8, 3};
+    assertThat(tensor.shapeCopy).isEqualTo(expectedShape);
+    assertThat(tensor.dtype).isEqualTo(DataType.FLOAT32);
+  }
+
+  @Test
+  public void testCopyTo() {
+    Tensor tensor = Tensor.fromHandle(nativeHandle);
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    tensor.copyTo(parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {3.69f, 19.62f, 23.43f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+  }
+
+  @Test
+  public void testCopyToWrongType() {
+    Tensor tensor = Tensor.fromHandle(nativeHandle);
+    int[][][][] parsedOutputs = new int[2][8][8][3];
+    try {
+      tensor.copyTo(parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Cannot convert an TensorFlowLite tensor with type "
+                  + "FLOAT32 to a Java object of type [[[[I (which is compatible with the TensorFlowLite "
+                  + "type INT32)");
+    }
+  }
+
+  @Test
+  public void testCopyToWrongShape() {
+    Tensor tensor = Tensor.fromHandle(nativeHandle);
+    float[][][][] parsedOutputs = new float[1][8][8][3];
+    try {
+      tensor.copyTo(parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Shape of output target [1, 8, 8, 3] does not match "
+                  + "with the shape of the Tensor [2, 8, 8, 3].");
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/java/src/testdata/add.bin b/tensorflow/contrib/lite/java/src/testdata/add.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aef0fe3d82c9d92dc444076d3b46e05af1923f46
Binary files /dev/null and b/tensorflow/contrib/lite/java/src/testdata/add.bin differ
diff --git a/tensorflow/contrib/lite/java/src/testdata/float32.bin b/tensorflow/contrib/lite/java/src/testdata/float32.bin
new file mode 100644
index 0000000000000000000000000000000000000000..30b1264ca152740e1607651ce6cbc2a548319bc3
Binary files /dev/null and b/tensorflow/contrib/lite/java/src/testdata/float32.bin differ
diff --git a/tensorflow/contrib/lite/java/src/testdata/int32.bin b/tensorflow/contrib/lite/java/src/testdata/int32.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f6f3cf607a249e096921b12d848c4055a37d1168
Binary files /dev/null and b/tensorflow/contrib/lite/java/src/testdata/int32.bin differ
diff --git a/tensorflow/contrib/lite/java/src/testdata/int64.bin b/tensorflow/contrib/lite/java/src/testdata/int64.bin
new file mode 100644
index 0000000000000000000000000000000000000000..c12aa41ca7be49b30db291a25156bd20cbab21a9
Binary files /dev/null and b/tensorflow/contrib/lite/java/src/testdata/int64.bin differ
diff --git a/tensorflow/contrib/lite/java/src/testdata/invalid_model.bin b/tensorflow/contrib/lite/java/src/testdata/invalid_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8156ac741cbc0aa32e6d867ad09b5e6be8451868
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/testdata/invalid_model.bin
@@ -0,0 +1 @@
+This is an invalid model.
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/java/src/testdata/uint8.bin b/tensorflow/contrib/lite/java/src/testdata/uint8.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f06c5cf58462ce56b012d163fb208329874f83ad
Binary files /dev/null and b/tensorflow/contrib/lite/java/src/testdata/uint8.bin differ
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2b4f37bc6cfe1dbc0c178a56b892f545e8ad4f3b
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
@@ -0,0 +1,30 @@
+# Description:
+# Internal helper function to test TF Lite API.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+android_library(
+    name = "testhelper",
+    srcs = glob(
+        [
+            "*.java",
+        ],
+    ),
+    deps = [
+        "//tensorflow/contrib/lite/java:tensorflowlite_java",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
new file mode 100644
index 0000000000000000000000000000000000000000..8660cabf709e6531a5667a16e5cf43a93c7135bd
--- /dev/null
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/TestHelper.java
@@ -0,0 +1,35 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite;
+
+/** A helper class for internal tests. */
+public class TestHelper {
+
+  /**
+   * Turns on/off NNAPI of an {@code Interpreter}.
+   *
+   * @param interpreter an instance of {@code Interpreter}. If it is not initialized, an {@code
+   *     IllegalArgumentException} will be thrown.
+   * @param useNNAPI a boolean value indicating to turn on or off NNAPI.
+   */
+  public static void setUseNNAPI(Interpreter interpreter, boolean useNNAPI) {
+    if (interpreter != null && interpreter.wrapper != null) {
+      interpreter.wrapper.setUseNNAPI(useNNAPI);
+    } else {
+      throw new IllegalArgumentException("Interpreter has not initialized; Failed to setUseNNAPI.");
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ad76e906064b30801b4c2484cfe180589241afe1
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -0,0 +1,409 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+tf_cc_test(
+    name = "optional_tensor_test",
+    size = "small",
+    srcs = ["optional_tensor_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "//tensorflow/core:lib",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "gemm_support",
+    srcs = [
+        "gemm_support.cc",
+    ],
+    hdrs = [
+        "gemm_support.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":op_macros",
+        "//tensorflow/contrib/lite:context",
+        "@gemmlowp//:gemmlowp",
+    ],
+)
+
+cc_library(
+    name = "activation_functor",
+    hdrs = [
+        "activation_functor.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+    ],
+)
+
+cc_library(
+    name = "op_macros",
+    hdrs = [
+        "op_macros.h",
+    ],
+)
+
+cc_library(
+    name = "builtin_ops",
+    srcs = [
+        "activations.cc",
+        "add.cc",
+        "basic_rnn.cc",
+        "concatenation.cc",
+        "conv.cc",
+        "depthwise_conv.cc",
+        "embedding_lookup.cc",
+        "embedding_lookup_sparse.cc",
+        "fully_connected.cc",
+        "hashtable_lookup.cc",
+        "kernel_util.cc",
+        "l2norm.cc",
+        "local_response_norm.cc",
+        "lsh_projection.cc",
+        "lstm.cc",
+        "mul.cc",
+        "pooling.cc",
+        "register.cc",
+        "reshape.cc",
+        "resize_bilinear.cc",
+        "skip_gram.cc",
+        "space_to_depth.cc",
+        "svdf.cc",
+    ],
+    hdrs = [
+        "kernel_util.h",
+        "padding.h",
+        "register.h",
+    ],
+    # Suppress warnings that are introduced by Eigen Tensor.
+    copts = tflite_copts() + [
+        "-Wno-error=reorder",
+    ] + select({
+        "//tensorflow:ios": ["-Wno-error=invalid-partial-specialization"],
+        "//conditions:default": [
+        ],
+    }),
+    deps = [
+        ":activation_functor",
+        ":op_macros",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:gemm_support",
+        "//tensorflow/contrib/lite/kernels/internal:optimized",
+        "//tensorflow/contrib/lite/kernels/internal:optimized_base",
+        "//tensorflow/contrib/lite/kernels/internal:quantization_util",
+        "//tensorflow/contrib/lite/kernels/internal:reference",
+        "//tensorflow/contrib/lite/kernels/internal:reference_base",
+        "//tensorflow/contrib/lite/kernels/internal:round",
+        "//tensorflow/contrib/lite/kernels/internal:tensor_utils",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+tf_cc_test(
+    name = "activations_test",
+    size = "small",
+    srcs = ["activations_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "add_test",
+    size = "small",
+    srcs = ["add_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "concatenation_test",
+    size = "small",
+    srcs = ["concatenation_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "conv_test",
+    size = "small",
+    srcs = ["conv_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "depthwise_conv_test",
+    size = "small",
+    srcs = ["depthwise_conv_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "basic_rnn_test",
+    size = "small",
+    srcs = ["basic_rnn_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "l2norm_test",
+    size = "small",
+    srcs = ["l2norm_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "mul_test",
+    size = "small",
+    srcs = ["mul_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "reshape_test",
+    size = "small",
+    srcs = ["reshape_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "resize_bilinear_test",
+    size = "small",
+    srcs = ["resize_bilinear_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "svdf_test",
+    size = "small",
+    srcs = ["svdf_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "embedding_lookup_test",
+    size = "small",
+    srcs = ["embedding_lookup_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "embedding_lookup_sparse_test",
+    size = "small",
+    srcs = ["embedding_lookup_sparse_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "fully_connected_test",
+    size = "small",
+    srcs = ["fully_connected_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "local_response_norm_test",
+    size = "small",
+    srcs = ["local_response_norm_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "pooling_test",
+    size = "small",
+    srcs = ["pooling_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "softmax_test",
+    size = "small",
+    srcs = ["softmax_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "//tensorflow/contrib/lite/kernels/internal:reference_base",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "lsh_projection_test",
+    size = "small",
+    srcs = ["lsh_projection_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "hashtable_lookup_test",
+    size = "small",
+    srcs = ["hashtable_lookup_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "lstm_test",
+    size = "small",
+    srcs = ["lstm_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "skip_gram_test",
+    size = "small",
+    srcs = ["skip_gram_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "space_to_depth_test",
+    size = "small",
+    srcs = ["space_to_depth_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/kernels/activation_functor.h b/tensorflow/contrib/lite/kernels/activation_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb3369e991a474315424423fe655ba214edabbc
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/activation_functor.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+
+// Dynamic (non-fused) activation functor. perhaps it is worth having
+// template instantiation?
+// TODO(aselle): Make this more efficient by pulling the switch to conv_eval
+// using template inlining.
+class ActivationFunctor {
+ public:
+  explicit ActivationFunctor(TfLiteFusedActivation act) : act_(act) {}
+
+  float operator()(float a) const {
+    switch (act_) {
+      case kTfLiteActNone:
+        return a;
+      case kTfLiteActRelu:
+        return a < 0.f ? 0.f : a;
+      case kTfLiteActRelu6:
+        return std::max(0.f, std::min(a, 6.f));
+      case kTfLiteActTanh:
+        return std::tanh(a);
+      case kTfLiteActSigmoid:
+        return 1.0f / (1.0f + std::exp(-a));
+      default:
+        // TODO(aselle): More informative fatal error!
+        exit(1);
+    }
+  }
+
+ private:
+  TfLiteFusedActivation act_;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_ACTIVATION_FUNCTOR_H_
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7ab60a33e5e2ff61bae5f4c6db85ab9c47a391bc
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -0,0 +1,389 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace activations {
+
+struct OpData {
+  int32_t input_multiplier = 0;
+  int input_left_shift = 0;
+  int32_t input_range_radius = 0;
+  int diff_min = 0;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to carry information from Prepare() to
+  // Eval().
+  return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+
+    static constexpr int kInputIntegerBits = 4;
+
+    const double input_real_multiplier =
+        input->params.scale *
+        static_cast<double>(1 << (31 - kInputIntegerBits));
+
+    QuantizeMultiplierGreaterThanOne(input_real_multiplier,
+                                     &data->input_multiplier,
+                                     &data->input_left_shift);
+    data->input_range_radius =
+        CalculateInputRadius(kInputIntegerBits, data->input_left_shift);
+  }
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  TF_LITE_ENSURE(context,
+                 NumDimensions(input) == 2 || NumDimensions(input) == 4);
+
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+
+    static const int kScaledDiffIntegerBits = 5;
+
+    tflite::PreprocessSoftmaxScaling(
+        params->beta, input->params.scale, kScaledDiffIntegerBits,
+        &data->input_multiplier, &data->input_left_shift);
+    data->diff_min = -1.0 * tflite::CalculateInputRadius(
+                                kScaledDiffIntegerBits, data->input_left_shift);
+  }
+
+  return context->ResizeTensor(context, output,
+                               TfLiteIntArrayCopy(input->dims));
+}
+
+TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      size_t elements = input->bytes / sizeof(float);
+      float* in = input->data.f;
+      float* in_end = in + elements;
+      float* out = output->data.f;
+      for (; in < in_end; in++, out++) *out = std::max(0.f, *in);
+      return kTfLiteOk;
+    }
+    break;
+    default:
+      context->ReportError(context, "Only float32 supported currently.");
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      size_t elements = input->bytes / sizeof(float);
+      float* in = input->data.f;
+      float* in_end = in + elements;
+      float* out = output->data.f;
+      for (; in < in_end; in++, out++) {
+        *out = std::min(std::max(-1.f, *in), 1.f);
+      }
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently.");
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      size_t elements = input->bytes / sizeof(float);
+      float* in = input->data.f;
+      float* in_end = in + elements;
+      float* out = output->data.f;
+      for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
+      return kTfLiteOk;
+    }
+    break;
+    default:
+      context->ReportError(context, "Only float32 supported currently.");
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      size_t elements = input->bytes / sizeof(float);
+      float* in = input->data.f;
+      float* in_end = in + elements;
+      float* out = output->data.f;
+      for (; in < in_end; in++, out++) *out = std::tanh(*in);
+      return kTfLiteOk;
+    }
+    break;
+    default:
+      context->ReportError(context, "Only float32 supported currently.");
+      return kTfLiteError;
+  }
+}
+
+// Sigmoid is also know as "Logistic".
+TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      size_t elements = input->bytes / sizeof(float);
+      float* in = input->data.f;
+      float* in_end = in + elements;
+      float* out = output->data.f;
+      for (; in < in_end; in++, out++) *out = 1.f / (1.f + std::exp(-*in));
+      break;
+    }
+    case kTfLiteUInt8: {
+      optimized_ops::Logistic(
+          GetTensorData<uint8_t>(input), GetTensorDims(input),
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift,
+          GetTensorData<uint8_t>(output), GetTensorDims(output));
+      break;
+    }
+    default:
+      context->ReportError(context, "Only float32 supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Takes a 2D tensor and perform softmax along the second dimension.
+void Softmax2DFloat(TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  float* in = input->data.f;
+  float* out = output->data.f;
+  TF_LITE_ASSERT(input_size > 0);
+
+  // For each batch
+  for (int b = 0; b < batch_size; b++) {
+    // Find the max coeff.
+    float max_coeff = in[0];
+    for (int i = 1; i < input_size; i++) {
+      if (in[i] > max_coeff) max_coeff = in[i];
+    }
+
+    // Compute the normalized sum of exps.
+    float exp_sum = 0.0;
+    for (int i = 0; i < input_size; i++) {
+      out[i] = std::exp((in[i] - max_coeff) * params->beta);
+      exp_sum += out[i];
+    }
+
+    // Divide by the sum of exps.
+    float reciprocal_sum_exp = 1.f / exp_sum;
+    for (int i = 0; i < input_size; i++) {
+      out[i] *= reciprocal_sum_exp;
+    }
+
+    // Advance in and out pointers for the next batch.
+    in += input_size;
+    out += input_size;
+  }
+}
+
+void Softmax2DQuantized(TfLiteTensor* input, TfLiteTensor* output,
+                        TfLiteSoftmaxParams* params, OpData* data) {
+  // TODO(ahentz): this is arguably a dirty trick. Since the implementation
+  // always traverses the last dimension of a 4D tensor, we will pretend our 2D
+  // tensor is 4D in a special way. We will convert a (X, Y) shape into a (X,
+  // 1, 1, Y) shape.
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  optimized_ops::Softmax(GetTensorData<uint8_t>(input),
+                         GetTensorDims({batch_size, 1, 1, input_size}),
+                         data->input_multiplier, data->input_left_shift,
+                         data->diff_min, GetTensorData<uint8_t>(output),
+                         GetTensorDims({batch_size, 1, 1, input_size}));
+}
+
+// Takes a 4D tensor and perform softmax along the forth dimension.
+void Softmax4DFloat(TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  optimized_ops::Softmax(GetTensorData<float>(input), GetTensorDims(input),
+                         params->beta, GetTensorData<float>(output),
+                         GetTensorDims(output));
+}
+
+void Softmax4DQuantized(TfLiteTensor* input, TfLiteTensor* output,
+                        TfLiteSoftmaxParams* params, OpData* data) {
+  optimized_ops::Softmax(GetTensorData<uint8_t>(input), GetTensorDims(input),
+                         data->input_multiplier, data->input_left_shift,
+                         data->diff_min, GetTensorData<uint8_t>(output),
+                         GetTensorDims(output));
+}
+
+TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+
+  // TODO(ahentz): consider an implementation that works for many (all?)
+  // dimensions.
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      if (NumDimensions(input) == 2) {
+        Softmax2DFloat(input, output, params);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 4) {
+        Softmax4DFloat(input, output, params);
+        return kTfLiteOk;
+      }
+      context->ReportError(context,
+                           "Only 2D and 4D tensors supported currently.");
+      return kTfLiteError;
+    }
+    case kTfLiteUInt8: {
+      if (NumDimensions(input) == 2) {
+        Softmax2DQuantized(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 4) {
+        Softmax4DQuantized(input, output, params, data);
+        return kTfLiteOk;
+      }
+      context->ReportError(context,
+                           "Only 2D and 4D tensors supported currently.");
+      return kTfLiteError;
+    }
+    default:
+      context->ReportError(context,
+                           "Only float32 and uint8_t supported currently.");
+      return kTfLiteError;
+  }
+}
+
+}  // namespace activations
+
+TfLiteRegistration* Register_RELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::ReluEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_RELU1() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::Relu1Eval};
+  return &r;
+}
+
+TfLiteRegistration* Register_RELU6() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::Relu6Eval};
+  return &r;
+}
+
+TfLiteRegistration* Register_TANH() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::TanhEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_LOGISTIC() {
+  static TfLiteRegistration r = {activations::Init, activations::Free,
+                                 activations::SigmoidPrepare,
+                                 activations::SigmoidEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_SOFTMAX() {
+  static TfLiteRegistration r = {activations::Init, activations::Free,
+                                 activations::SoftmaxPrepare,
+                                 activations::SoftmaxEval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33ca56e745c043efd12b851af14f273fb273d577
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -0,0 +1,323 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseActivationsOpModel : public SingleOpModel {
+ public:
+  // Most activations don't take any options, so this constructor works for
+  // them.
+  BaseActivationsOpModel(BuiltinOperator type, TensorData input) {
+    input_ = AddInput(input);
+    if (input.type == TensorType_UINT8) {
+      output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else {
+      output_ = AddOutput({input.type, {}});
+    }
+    SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  // A dedicated constructor for SOFTMAX, which does some options.
+  BaseActivationsOpModel(float softmax_beta, TensorData input) {
+    input_ = AddInput(input);
+    if (input.type == TensorType_UINT8) {
+      output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else {
+      output_ = AddOutput({input.type, {}});
+    }
+    SetBuiltinOp(BuiltinOperator_SOFTMAX, BuiltinOptions_SoftmaxOptions,
+                 CreateSoftmaxOptions(builder_, softmax_beta).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class FloatActivationsOpModel : public BaseActivationsOpModel {
+ public:
+  using BaseActivationsOpModel::BaseActivationsOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+// TODO(ahentz): I don't quite understand the tradeoffs in the quantized
+// implementation of sigmoid and software, but a tolerance of twice the output
+// scale seems reasonable. We might want to change this if we have a better
+// theoretical bound.
+const float kQuantizedTolerance = 2 * (1. / 256);
+
+class QuantizedActivationsOpModel : public BaseActivationsOpModel {
+ public:
+  using BaseActivationsOpModel::BaseActivationsOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+TEST(FloatActivationsOpTest, Relu) {
+  FloatActivationsOpModel m(BuiltinOperator_RELU,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 2, 4,   //
+                                 3, 0, 10, 1,  //
+                             }));
+}
+
+TEST(FloatActivationsOpTest, Relu1) {
+  FloatActivationsOpModel m(BuiltinOperator_RELU1,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0.0, -0.6, 0.2, -0.4,  //
+      0.3, -2.0, 1.1, -0.1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0, -0.6, 0.2, -0.4,  //
+                                 0.3, -1.0, 1.0, -0.1,  //
+                             }));
+}
+
+TEST(FloatActivationsOpTest, Relu6) {
+  FloatActivationsOpModel m(BuiltinOperator_RELU6,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 2, 4,  //
+                                 3, 0, 6, 1,  //
+                             }));
+}
+
+TEST(FloatActivationsOpTest, Tanh) {
+  FloatActivationsOpModel m(BuiltinOperator_TANH,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0, -0.9999877, 0.9640275, 0.999329,    //
+                                 0.99505475, -0.9640275, 1, 0.7615941,  //
+                             })));
+}
+
+TEST(FloatActivationsOpTest, Sigmoid) {
+  FloatActivationsOpModel m(BuiltinOperator_LOGISTIC,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0.5, 0.002473, 0.880797, 0.982014,       //
+                                 0.952574, 0.119203, 0.999955, 0.731059,  //
+                             })));
+}
+
+TEST(QuantizedActivationsOpTest, Sigmoid) {
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOGISTIC,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.5, 0.002473, 0.880797, 0.982014,       //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+}
+
+TEST(FloatActivationsOpTest, Softmax4D) {
+  FloatActivationsOpModel m(0.1,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 1, 4}});
+  m.SetInput({
+      0, -6, 2, 4,   // depth = 0
+      3, -2, 10, 1,  // depth = 1
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 .23463, .12877, .28658, .35003,  //
+                                 .22528, .13664, .45365, .18443,  //
+                             })));
+
+  // Same input, but a different shape.
+  FloatActivationsOpModel m2(0.1,
+                             /*input=*/{TensorType_FLOAT32, {4, 1, 1, 2}});
+  m2.SetInput({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                  0.645656, 0.354344,  //
+                                  0.450166, 0.549834,  //
+                                  0.622459, 0.377541,  //
+                                  0.710949, 0.28905,   //
+                              })));
+}
+
+TEST(QuantizedActivationsOpTest, Softmax4D) {
+  QuantizedActivationsOpModel m(
+      0.1,
+      /*input=*/{TensorType_UINT8, {1, 2, 1, 4}, -10, 10});
+  m.SetInput({
+      0, -6, 2, 4,   // depth = 0
+      3, -2, 10, 1,  // depth = 1
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(
+      0.1,
+      /*input=*/{TensorType_UINT8, {4, 1, 1, 2}, -10, 10});
+  m2.SetInput({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                             {
+                                                 0.645656, 0.354344,  //
+                                                 0.450166, 0.549834,  //
+                                                 0.622459, 0.377541,  //
+                                                 0.710949, 0.28905,   //
+                                             },
+                                             kQuantizedTolerance)));
+}
+
+TEST(FloatActivationsOpTest, Softmax2D) {
+  FloatActivationsOpModel m(0.1,
+                            /*input=*/{TensorType_FLOAT32, {2, 4}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 .23463, .12877, .28658, .35003,  //
+                                 .22528, .13664, .45365, .18443,  //
+                             })));
+
+  // Same input, but a different shape.
+  FloatActivationsOpModel m2(0.1,
+                             /*input=*/{TensorType_FLOAT32, {4, 2}});
+  m2.SetInput({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                  0.645656, 0.354344,  //
+                                  0.450166, 0.549834,  //
+                                  0.622459, 0.377541,  //
+                                  0.710949, 0.28905,   //
+                              })));
+}
+
+TEST(QuantizedActivationsOpTest, Softmax2D) {
+  QuantizedActivationsOpModel m(0.1,
+                                /*input=*/{TensorType_UINT8, {2, 4}, -10, 10});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(0.1,
+                                 /*input=*/{TensorType_UINT8, {4, 2}, -10, 10});
+  m2.SetInput({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                             {
+                                                 0.645656, 0.354344,  //
+                                                 0.450166, 0.549834,  //
+                                                 0.622459, 0.377541,  //
+                                                 0.710949, 0.28905,   //
+                                             },
+                                             kQuantizedTolerance)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e10a249abac3ba19cf107e055aa71d1eee00122
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -0,0 +1,184 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace add {
+
+// This file has three implementation of Add.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
+  for (int i = 0; i < NumDimensions(input1); ++i) {
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
+                      SizeOfDimension(input2, i));
+  }
+
+  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+void EvalAddFloat(TfLiteContext* context, TfLiteNode* node,
+                  TfLiteAddParams* params, TfLiteTensor* input1,
+                  TfLiteTensor* input2, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+#define TF_LITE_ADD(type)                                        \
+  type::Add(GetTensorData<float>(input1), GetTensorDims(input1), \
+            GetTensorData<float>(input2), GetTensorDims(input2), \
+            output_activation_min, output_activation_max,        \
+            GetTensorData<float>(output), GetTensorDims(output))
+    if (kernel_type == kReference) {
+      TF_LITE_ADD(reference_ops);
+    } else {
+      TF_LITE_ADD(optimized_ops);
+  }
+#undef TF_LITE_ADD
+}
+
+template <KernelType kernel_type>
+void EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
+                      TfLiteAddParams* params, TfLiteTensor* input1,
+                      TfLiteTensor* input2, TfLiteTensor* output) {
+  auto input1_offset = -input1->params.zero_point;
+  auto input2_offset = -input2->params.zero_point;
+  auto output_offset = output->params.zero_point;
+  const int left_shift = 20;
+  const double twice_max_input_scale =
+      2 * std::max(input1->params.scale, input2->params.scale);
+  const double real_input1_multiplier =
+      input1->params.scale / twice_max_input_scale;
+  const double real_input2_multiplier =
+      input2->params.scale / twice_max_input_scale;
+  const double real_output_multiplier =
+      twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+
+  int32 input1_multiplier;
+  int input1_shift;
+  QuantizeMultiplierSmallerThanOne(real_input1_multiplier, &input1_multiplier,
+                                   &input1_shift);
+  int32 input2_multiplier;
+  int input2_shift;
+  QuantizeMultiplierSmallerThanOne(real_input2_multiplier, &input2_multiplier,
+                                   &input2_shift);
+  int32 output_multiplier;
+  int output_shift;
+  QuantizeMultiplierSmallerThanOne(real_output_multiplier, &output_multiplier,
+                                   &output_shift);
+
+  int32 output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(params->activation, output,
+                                &output_activation_min, &output_activation_max);
+
+#define TF_LITE_ADD(type)                                                   \
+  type::BroadcastAdd(                                                       \
+      left_shift, GetTensorData<uint8_t>(input1), GetTensorDims(input1),    \
+      input1_offset, input1_multiplier, input1_shift,                       \
+      GetTensorData<uint8_t>(input2), GetTensorDims(input2), input2_offset, \
+      input2_multiplier, input2_shift, output_offset, output_multiplier,    \
+      output_shift, output_activation_min, output_activation_max,           \
+      GetTensorData<uint8_t>(output), GetTensorDims(output));
+
+  if (kernel_type == kReference) {
+    TF_LITE_ADD(reference_ops);
+  } else {
+    TF_LITE_ADD(optimized_ops);
+  }
+#undef TF_LITE_ADD
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalAddFloat<kernel_type>(context, node, params, input1, input2, output);
+  } else if (output->type == kTfLiteUInt8) {
+    EvalAddQuantized<kernel_type>(context, node, params, input1, input2,
+                                  output);
+  } else {
+    context->ReportError(context,
+                         "Inputs and outputs not all float|unit8 types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace add
+
+TfLiteRegistration* Register_ADD_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+                                 add::Eval<add::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_ADD_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+                                 add::Eval<add::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_ADD_NEON_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, add::Prepare,
+                                 add::Eval<add::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_ADD() {
+#ifdef USE_NEON
+  return Register_ADD_NEON_OPT();
+#else
+  return Register_ADD_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/add_test.cc b/tensorflow/contrib/lite/kernels/add_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ddf45bb576755d57d50c9e6e01bf50f15612c56d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/add_test.cc
@@ -0,0 +1,170 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseAddOpModel : public SingleOpModel {
+ public:
+  BaseAddOpModel(const TensorData& input, const TensorData& output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input);
+    input2_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
+                 CreateAddOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatAddOpModel : public BaseAddOpModel {
+ public:
+  using BaseAddOpModel::BaseAddOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedAddOpModel : public BaseAddOpModel {
+ public:
+  using BaseAddOpModel::BaseAddOpModel;
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// for quantized Add, the error shouldn't exceed 2*step
+float GetTolerance(int min, int max) {
+  float kQuantizedStep = (max - min) / 255.0;
+  float kQuantizedTolerance = 2.0 * kQuantizedStep;
+  return kQuantizedTolerance;
+}
+
+TEST(FloatAddOpModel, NoActivation) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
+}
+
+TEST(FloatAddOpModel, ActivationRELU1) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU1);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.0, 0.4, 1.0, 1.0}));
+}
+
+TEST(FloatAddOpModel, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(),
+                ElementsAreArray({-1.9, 0.4, 1.0, 1.3, 2.2, 2.1}))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {
+      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
+  std::vector<std::initializer_list<float>> inputs2 = {
+      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
+  std::vector<std::initializer_list<float>> results = {
+      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU1) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<std::initializer_list<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                                       {-0.8, 0.2, 0.7, 0.3}};
+  std::vector<std::initializer_list<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                                       {0.6, 0.4, -0.8, 0.5}};
+  std::vector<std::initializer_list<float>> results = {{-0.2, 0.6, 1.0, -0.1},
+                                                       {-0.2, 0.6, -0.1, 0.8}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                          {TensorType_UINT8, {}, -1.0, 1.0},
+                          ActivationFunctionType_RELU1);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                              results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
+  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
+                          {TensorType_UINT8, {}, -3.0, 3.0},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutput(),
+                ElementsAreArray(ArrayFloatNear({-1.9, 0.5, 1.0, 1.3, 2.2, 2.1},
+                                                kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3cee43c68b2a0af5a3fd84b33a980b74bb8f0cb4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace rnn {
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kRecurrentWeightsTensor = 2;
+constexpr int kBiasTensor = 3;
+constexpr int KHiddenStateTensor = 0;
+constexpr int kOutputTensor = 1;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* input_weights =
+      &context->tensors[node->inputs->data[kWeightsTensor]];
+  TfLiteTensor* recurrent_weights =
+      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int batch_size = input->dims->data[0];
+  const int num_units = input_weights->dims->data[0];
+  TF_LITE_ASSERT_EQ(input->dims->data[1], input_weights->dims->data[1]);
+  TF_LITE_ASSERT_EQ(input_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[0], bias->dims->data[0]);
+  TF_LITE_ASSERT_EQ(recurrent_weights->dims->data[1], bias->dims->data[0]);
+
+  TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[KHiddenStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+  // Resize state.
+  TfLiteIntArray* hidden_state_size_array = TfLiteIntArrayCreate(2);
+  hidden_state_size_array->data[0] = batch_size;
+  hidden_state_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, hidden_state,
+                                                   hidden_state_size_array));
+
+  // Mark hidden state as a persistent tensor.
+  hidden_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize output.
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+  output_size_array->data[0] = batch_size;
+  output_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, output,
+                                                   output_size_array));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteRNNParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* input_weights =
+      &context->tensors[node->inputs->data[kWeightsTensor]];
+  TfLiteTensor* recurrent_weights =
+      &context->tensors[node->inputs->data[kRecurrentWeightsTensor]];
+  TfLiteTensor* bias = &context->tensors[node->inputs->data[kBiasTensor]];
+  TfLiteTensor* hidden_state =
+      &context->tensors[node->outputs->data[KHiddenStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[kOutputTensor]];
+
+  // Initialize the pointer bias.
+  const float* bias_ptr = bias->data.f;
+
+  const int batch_size = input->dims->data[0];
+  const int num_units = input_weights->dims->data[0];
+  const int input_size = input->dims->data[1];
+  const int input_weights_stride = input_weights->dims->data[1];
+  const int recurrent_weights_stride = recurrent_weights->dims->data[1];
+
+  // For each batch
+  for (int b = 0; b < batch_size; b++) {
+    // Initialize the pointer to input, output and bias.
+    const float* input_ptr_batch = input->data.f + b * input_size;
+    float* output_ptr_batch = output->data.f + b * num_units;
+    float* hidden_state_ptr_batch = hidden_state->data.f + b * num_units;
+
+    // Initialize input_weights and recurrent_weights.
+    const float* input_weights_ptr = input_weights->data.f;
+    const float* recurrent_weights_ptr = recurrent_weights->data.f;
+
+    // Output = bias
+    for (int o = 0; o < num_units; o++) {
+      output_ptr_batch[o] = bias_ptr[o];
+    }
+
+    // Output += input * input_weights
+    for (int o = 0; o < num_units; o++) {
+      for (int i = 0; i < input_size; i++) {
+        output_ptr_batch[o] += input_ptr_batch[i] * input_weights_ptr[i];
+      }
+      input_weights_ptr += input_weights_stride;
+    }
+
+    // Output += recurrent_weights * hidden_state
+    for (int o = 0; o < num_units; o++) {
+      for (int h = 0; h < num_units; h++) {
+        output_ptr_batch[o] +=
+            hidden_state_ptr_batch[h] * recurrent_weights_ptr[h];
+      }
+      recurrent_weights_ptr += recurrent_weights_stride;
+    }
+
+    // Output = activation(Output) and update hidden_state
+    for (int o = 0; o < num_units; o++) {
+      output_ptr_batch[o] =
+          (ActivationFunctor(params->activation))(output_ptr_batch[o]);
+      hidden_state_ptr_batch[o] = output_ptr_batch[o];
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace rnn
+
+TfLiteRegistration* Register_RNN() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 rnn::Prepare, rnn::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/basic_rnn_test.cc b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ecccb985e91238f1183c8f94a2b5f468758ce55
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/basic_rnn_test.cc
@@ -0,0 +1,267 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite RNN op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float rnn_input[] = {
+    0.23689353,   0.285385,     0.037029743, -0.19858193,  -0.27569133,
+    0.43773448,   0.60379338,   0.35562468,  -0.69424844,  -0.93421471,
+    -0.87287879,  0.37144363,   -0.62476718, 0.23791671,   0.40060222,
+    0.1356622,    -0.99774903,  -0.98858172, -0.38952237,  -0.47685933,
+    0.31073618,   0.71511042,   -0.63767755, -0.31729108,  0.33468103,
+    0.75801885,   0.30660987,   -0.37354088, 0.77002847,   -0.62747043,
+    -0.68572164,  0.0069220066, 0.65791464,  0.35130811,   0.80834007,
+    -0.61777675,  -0.21095741,  0.41213346,  0.73784804,   0.094794154,
+    0.47791874,   0.86496925,   -0.53376222, 0.85315156,   0.10288584,
+    0.86684,      -0.011186242, 0.10513687,  0.87825835,   0.59929144,
+    0.62827742,   0.18899453,   0.31440187,  0.99059987,   0.87170351,
+    -0.35091716,  0.74861872,   0.17831337,  0.2755419,    0.51864719,
+    0.55084288,   0.58982027,   -0.47443086, 0.20875752,   -0.058871567,
+    -0.66609079,  0.59098077,   0.73017097,  0.74604273,   0.32882881,
+    -0.17503482,  0.22396147,   0.19379807,  0.29120302,   0.077113032,
+    -0.70331609,  0.15804303,   -0.93407321, 0.40182066,   0.036301374,
+    0.66521823,   0.0300982,    -0.7747041,  -0.02038002,  0.020698071,
+    -0.90300065,  0.62870288,   -0.23068321, 0.27531278,   -0.095755219,
+    -0.712036,    -0.17384434,  -0.50593495, -0.18646687,  -0.96508682,
+    0.43519354,   0.14744234,   0.62589407,  0.1653645,    -0.10651493,
+    -0.045277178, 0.99032974,   -0.88255352, -0.85147917,  0.28153265,
+    0.19455957,   -0.55479527,  -0.56042433, 0.26048636,   0.84702539,
+    0.47587705,   -0.074295521, -0.12287641, 0.70117295,   0.90532446,
+    0.89782166,   0.79817224,   0.53402734,  -0.33286154,  0.073485017,
+    -0.56172788,  -0.044897556, 0.89964068,  -0.067662835, 0.76863563,
+    0.93455386,   -0.6324693,   -0.083922029};
+
+static float rnn_golden_output[] = {
+    0.496726,   0,          0.965996,  0,         0.0584254, 0,
+    0,          0.12315,    0,         0,         0.612266,  0.456601,
+    0,          0.52286,    1.16099,   0.0291232,
+
+    0,          0,          0.524901,  0,         0,         0,
+    0,          1.02116,    0,         1.35762,   0,         0.356909,
+    0.436415,   0.0355727,  0,         0,
+
+    0,          0,          0,         0.262335,  0,         0,
+    0,          1.33992,    0,         2.9739,    0,         0,
+    1.31914,    2.66147,    0,         0,
+
+    0.942568,   0,          0,         0,         0.025507,  0,
+    0,          0,          0.321429,  0.569141,  1.25274,   1.57719,
+    0.8158,     1.21805,    0.586239,  0.25427,
+
+    1.04436,    0,          0.630725,  0,         0.133801,  0.210693,
+    0.363026,   0,          0.533426,  0,         1.25926,   0.722707,
+    0,          1.22031,    1.30117,   0.495867,
+
+    0.222187,   0,          0.72725,   0,         0.767003,  0,
+    0,          0.147835,   0,         0,         0,         0.608758,
+    0.469394,   0.00720298, 0.927537,  0,
+
+    0.856974,   0.424257,   0,         0,         0.937329,  0,
+    0,          0,          0.476425,  0,         0.566017,  0.418462,
+    0.141911,   0.996214,   1.13063,   0,
+
+    0.967899,   0,          0,         0,         0.0831304, 0,
+    0,          1.00378,    0,         0,         0,         1.44818,
+    1.01768,    0.943891,   0.502745,  0,
+
+    0.940135,   0,          0,         0,         0,         0,
+    0,          2.13243,    0,         0.71208,   0.123918,  1.53907,
+    1.30225,    1.59644,    0.70222,   0,
+
+    0.804329,   0,          0.430576,  0,         0.505872,  0.509603,
+    0.343448,   0,          0.107756,  0.614544,  1.44549,   1.52311,
+    0.0454298,  0.300267,   0.562784,  0.395095,
+
+    0.228154,   0,          0.675323,  0,         1.70536,   0.766217,
+    0,          0,          0,         0.735363,  0.0759267, 1.91017,
+    0.941888,   0,          0,         0,
+
+    0,          0,          1.5909,    0,         0,         0,
+    0,          0.5755,     0,         0.184687,  0,         1.56296,
+    0.625285,   0,          0,         0,
+
+    0,          0,          0.0857888, 0,         0,         0,
+    0,          0.488383,   0.252786,  0,         0,         0,
+    1.02817,    1.85665,    0,         0,
+
+    0.00981836, 0,          1.06371,   0,         0,         0,
+    0,          0,          0,         0.290445,  0.316406,  0,
+    0.304161,   1.25079,    0.0707152, 0,
+
+    0.986264,   0.309201,   0,         0,         0,         0,
+    0,          1.64896,    0.346248,  0,         0.918175,  0.78884,
+    0.524981,   1.92076,    2.07013,   0.333244,
+
+    0.415153,   0.210318,   0,         0,         0,         0,
+    0,          2.02616,    0,         0.728256,  0.84183,   0.0907453,
+    0.628881,   3.58099,    1.49974,   0
+};
+
+class RNNOpModel : public SingleOpModel {
+ public:
+  RNNOpModel(int batches, int units, int size)
+      : batches_(batches), units_(units), input_size_(size) {
+    input_ = AddInput(TensorType_FLOAT32);
+    weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_weights_ = AddInput(TensorType_FLOAT32);
+    bias_ = AddInput(TensorType_FLOAT32);
+    hidden_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_RNN, BuiltinOptions_RNNOptions,
+        CreateRNNOptions(builder_, ActivationFunctionType_RELU).Union());
+    BuildInterpreter({{batches_, input_size_},
+                      {units_, input_size_},
+                      {units_, units_},
+                      {units_}});
+  }
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetWeights(std::initializer_list<float> f) {
+    PopulateTensor(weights_, f);
+  }
+
+  void SetRecurrentWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  void ResetHiddenState() {
+    const int zero_buffer_size = units_ * batches_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(hidden_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+
+ private:
+  int input_;
+  int weights_;
+  int recurrent_weights_;
+  int bias_;
+  int hidden_state_;
+  int output_;
+
+  int batches_;
+  int units_;
+  int input_size_;
+};
+
+TEST(FullyConnectedOpTest, BlackBoxTest) {
+  RNNOpModel rnn(2, 16, 8);
+  rnn.SetWeights(
+      {0.461459,    0.153381,   0.529743,    -0.00371218, 0.676267,   -0.211346,
+       0.317493,    0.969689,   -0.343251,   0.186423,    0.398151,   0.152399,
+       0.448504,    0.317662,   0.523556,    -0.323514,   0.480877,   0.333113,
+       -0.757714,   -0.674487,  -0.643585,   0.217766,    -0.0251462, 0.79512,
+       -0.595574,   -0.422444,  0.371572,    -0.452178,   -0.556069,  -0.482188,
+       -0.685456,   -0.727851,  0.841829,    0.551535,    -0.232336,  0.729158,
+       -0.00294906, -0.69754,   0.766073,    -0.178424,   0.369513,   -0.423241,
+       0.548547,    -0.0152023, -0.757482,   -0.85491,    0.251331,   -0.989183,
+       0.306261,    -0.340716,  0.886103,    -0.0726757,  -0.723523,  -0.784303,
+       0.0354295,   0.566564,   -0.485469,   -0.620498,   0.832546,   0.697884,
+       -0.279115,   0.294415,   -0.584313,   0.548772,    0.0648819,  0.968726,
+       0.723834,    -0.0080452, -0.350386,   -0.272803,   0.115121,   -0.412644,
+       -0.824713,   -0.992843,  -0.592904,   -0.417893,   0.863791,   -0.423461,
+       -0.147601,   -0.770664,  -0.479006,   0.654782,    0.587314,   -0.639158,
+       0.816969,    -0.337228,  0.659878,    0.73107,     0.754768,   -0.337042,
+       0.0960841,   0.368357,   0.244191,    -0.817703,   -0.211223,  0.442012,
+       0.37225,     -0.623598,  -0.405423,   0.455101,    0.673656,   -0.145345,
+       -0.511346,   -0.901675,  -0.81252,    -0.127006,   0.809865,   -0.721884,
+       0.636255,    0.868989,   -0.347973,   -0.10179,    -0.777449,  0.917274,
+       0.819286,    0.206218,   -0.00785118, 0.167141,    0.45872,    0.972934,
+       -0.276798,   0.837861,   0.747958,    -0.0151566,  -0.330057,  -0.469077,
+       0.277308,    0.415818});
+
+  rnn.SetBias({0.065691948, -0.69055247, 0.1107955, -0.97084129, -0.23957068,
+               -0.23566568, -0.389184, 0.47481549, -0.4791103, 0.29931796,
+               0.10463274, 0.83918178, 0.37197268, 0.61957061, 0.3956964,
+               -0.37609905});
+
+  rnn.SetRecurrentWeights({0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           0.1});
+
+  rnn.ResetHiddenState();
+  const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
+                                  (rnn.input_size() * rnn.num_batches());
+
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    rnn.SetInput(0, batch_start, batch_end);
+    rnn.SetInput(rnn.input_size(), batch_start, batch_end);
+
+    rnn.Invoke();
+
+    float* golden_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_end = golden_start + rnn.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e7a1233dac0f3cd02dc386f9d194597f38ca3b8
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -0,0 +1,200 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace concatenation {
+
+// This file has two implementation of Concatenation.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
+  int axis = params->axis;
+  int num_inputs = node->inputs->size;
+
+  // The number of dimensions of the input tensors must match, and all
+  // dimensions except 'axis' must be equal.
+  TfLiteTensor* t0 = &context->tensors[node->inputs->data[0]];
+  TfLiteType input_type = t0->type;
+  TF_LITE_ENSURE(context, axis >= 0);
+  TF_LITE_ENSURE(context, axis < t0->dims->size);
+
+  // TODO(ahentz): These are limitations of our implementation that could be
+  // removed with a bit of effort.
+  TF_LITE_ENSURE(context, t0->dims->size <= 4);
+  TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
+  TF_LITE_ENSURE(context,
+                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8);
+
+  // Output dimensions will match input dimensions, except 'axis', which
+  // will be the sum of inputs
+  int sum_axis = t0->dims->data[axis];
+  for (int i = 1; i < num_inputs; ++i) {
+    TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
+    TF_LITE_ENSURE_EQ(context, t->dims->size, t0->dims->size);
+    TF_LITE_ENSURE_EQ(context, t->type, input_type);
+    if (input_type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, t->params.zero_point, t0->params.zero_point);
+      TF_LITE_ENSURE_EQ(context, t->params.scale, t0->params.scale);
+    }
+    for (int d = 0; d < t0->dims->size; ++d) {
+      if (d == axis) {
+        sum_axis += t->dims->data[axis];
+      } else {
+        TF_LITE_ENSURE_EQ(context, t->dims->data[d], t0->dims->data[d]);
+      }
+    }
+  }
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(t0->dims->size);
+  for (int d = 0; d < t0->dims->size; ++d) {
+    output_size->data[d] = (d == axis) ? sum_axis : t0->dims->data[d];
+  }
+
+  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+  TF_LITE_ENSURE_EQ(context, output->type, input_type);
+  if (input_type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                      t0->params.zero_point);
+    TF_LITE_ENSURE_EQ(context, output->params.scale, t0->params.scale);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+class VectorOfInputs {
+ public:
+  VectorOfInputs(const TfLiteContext& context, const TfLiteIntArray& inputs) {
+    int num_inputs = inputs.size;
+
+    all_data_.reserve(num_inputs);
+    all_dims_.reserve(num_inputs);
+    all_dims_ptr_.reserve(num_inputs);
+
+    for (int i = 0; i < num_inputs; ++i) {
+      TfLiteTensor* input = &context.tensors[inputs.data[i]];
+      all_data_.push_back(GetTensorData<T>(input));
+      all_dims_.push_back(GetTensorDims(input));
+    }
+
+    // Taking the pointer from inside a std::vector is only OK if the vector is
+    // never modified, so we populate all_dims in the previous loop and then we
+    // are free to grab iterators here.
+    for (int i = 0; i < num_inputs; ++i) {
+      all_dims_ptr_.push_back(&all_dims_[i]);
+    }
+  }
+  const T* const* data() const { return all_data_.data(); }
+  const Dims<4>* const* dims() const { return all_dims_ptr_.data(); }
+
+ private:
+  std::vector<T*> all_data_;
+  std::vector<Dims<4>> all_dims_;
+  std::vector<Dims<4>*> all_dims_ptr_;
+};
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data);
+
+  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+
+// TODO(ahentz): Creating 'all_inputs' below is not very efficient. We should
+// allocate and populate these during Prepare().
+// TODO(ycling): Activation function parameter is ignored. For now we dont have
+// a model with a Concatenation with fused activation function.
+#define TF_LITE_CONCATENATION(type, scalar)                                 \
+  VectorOfInputs<scalar> all_inputs(*context, *node->inputs);               \
+  type::Concatenation<FusedActivationFunctionType::kNone, scalar>(          \
+      RemapDim(NumDimensions(output), params->axis), all_inputs.data(),     \
+      all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
+      GetTensorDims(output))
+
+  switch (output->type) {  // Already know in/outtypes are same.
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_CONCATENATION(reference_ops, float);
+      } else {
+        TF_LITE_CONCATENATION(optimized_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_CONCATENATION(reference_ops, uint8_t);
+      } else {
+        TF_LITE_CONCATENATION(optimized_ops, uint8_t);
+      }
+      break;
+    default:
+      context->ReportError(context,
+                           "Only float32 and uint8 are currently supported.");
+      return kTfLiteError;
+  }
+
+#undef TF_LITE_CONCATENATION
+
+  return kTfLiteOk;
+}
+
+#undef TF_LITE_MACRO_DISPATCH
+
+}  // namespace concatenation
+
+TfLiteRegistration* Register_CONCATENATION_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, concatenation::Prepare,
+      concatenation::Eval<concatenation::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_CONCATENATION_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, concatenation::Prepare,
+      concatenation::Eval<concatenation::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_CONCATENATION() {
+  // TODO(ahentz): It turns out the two versions of Concatenation are almost
+  // identical, so we should consider removing one.
+  return Register_CONCATENATION_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..499856a93cbbfbf9aa1a326912e52ce32bbbdf83
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -0,0 +1,162 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseConcatenationOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): Also test different activation types, axis, input
+  // dimensions.
+  BaseConcatenationOpModel(const TensorData& input_template, int axis,
+                           int num_inputs) {
+    std::vector<std::vector<int>> all_input_shapes;
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template.shape);
+      AddInput(input_template);
+    }
+    output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min,
+                         input_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
+
+ protected:
+  int output_;
+};
+
+class ConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+  using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  void SetInput(int index, std::initializer_list<float> data) {
+    PopulateTensor(index, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+  using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  void SetInput(int index, std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(index, data);
+  }
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+TEST(ConcatenationOpTest, ThreeDimensionalOneInput) {
+  ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/1,
+                          /*num_inputs=*/1);
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
+}
+
+TEST(ConcatenationOpTest, OneTrivialInput) {
+  ConcatenationOpModel m0({TensorType_FLOAT32, {1}}, /*axis=*/0,
+                          /*num_inputs=*/1);
+  m0.SetInput(0, {5.0f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(), ::testing::ElementsAre(5));
+}
+
+TEST(ConcatenationOpTest, TwoDimensionalOneInput) {
+  ConcatenationOpModel m0({TensorType_FLOAT32, {2, 3}}, /*axis=*/0,
+                          /*num_inputs=*/1);
+  m0.SetInput(0, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(ConcatenationOpTest, TwoInputsTwoAxis) {
+  // We will concatenate two tensors along different dimensions.
+  auto tensor0 = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  auto tensor1 = {7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f};
+
+  ConcatenationOpModel m0({TensorType_FLOAT32, {2, 3}}, /*axis=*/0,
+                          /*num_inputs=*/2);
+  m0.SetInput(0, tensor0);
+  m0.SetInput(1, tensor1);
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+
+  ConcatenationOpModel m1({TensorType_FLOAT32, {2, 3}}, /*axis=*/1,
+                          /*num_inputs=*/2);
+  m1.SetInput(0, tensor0);
+  m1.SetInput(1, tensor1);
+  m1.Invoke();
+  EXPECT_THAT(m1.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
+TEST(ConcatenationOpTest, FourInputs) {
+  ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/2,
+                          /*num_inputs=*/4);
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(),
+              ElementsAreArray({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantized) {
+  QuantizedConcatenationOpModel m0({TensorType_UINT8, {2, 1, 2}, -12.7, 12.8},
+                                   /*axis=*/2,
+                                   /*num_inputs=*/4);
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c75c04baeac2ce53c6261d677dca8d72fafa0da5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -0,0 +1,425 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace conv {
+
+// This file has three implementation of Conv.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+struct OpData {
+  // IDs are the arbitrary identifiers used by TF Lite to identify and access
+  // memory buffers.
+  int im2col_id;
+  int hwcn_weights_id;
+
+  TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multipler plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  // Indexes are the offset to the memory buffer in the array used to keep track
+  // of the allocated temporaries.
+  int32_t im2col_index;
+  int32_t hwcn_weights_index;
+  bool need_hwcn_weights;
+  bool have_weights_been_transposed;
+  bool need_im2col;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to use as scratch space for im2col, and
+  // to carry information from Prepare() to Eval().
+  auto* data = new OpData;
+  context->AddTensors(context, 1, &data->im2col_id);
+  context->AddTensors(context, 1, &data->hwcn_weights_id);
+  gemm_support::IncrementUsageCounter(context);
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+// Naive implementation of transpose for floats. Could be optimized to be more
+// cache friendly, but for now it's a one-time cost on first run, and we would
+// prefer to remove the need to do this at all eventually.
+void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
+  const int rows = output->dims->data[1];
+  const int cols = output->dims->data[0];
+  const float* input_data = GetTensorData<float>(input);
+  float* output_data = GetTensorData<float>(output);
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      const float in_value = input_data[i * cols + j];
+      output_data[j * rows + i] = in_value;
+    }
+  }
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  bool hasBias = node->inputs->size == 3;
+  // Check number of inputs/outputs
+  TF_LITE_ENSURE(context, hasBias || node->inputs->size == 2);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+  // Check dimensionality of input, filter
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
+  TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
+  // Check input channels matching filter
+  TF_LITE_ENSURE_EQ(context, input->dims->data[3], filter->dims->data[3]);
+
+  // Check types. (We assume that UINT8 refers to quantized tensors)
+  TfLiteType data_type = input->type;
+  TF_LITE_ENSURE(context,
+                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+  TF_LITE_ENSURE_EQ(context, output->type, data_type);
+  TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+
+  TfLiteTensor* bias = nullptr;
+
+  // TODO(ahentz): At this point the optimized versions require 'bias'. We can
+  // either change that or document that convolution requires it.
+  TF_LITE_ENSURE(context, hasBias);
+
+  if (hasBias) {
+    bias = &context->tensors[node->inputs->data[2]];
+    if (data_type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+    } else {
+      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
+    }
+    TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, bias->dims->data[0], filter->dims->data[0]);
+  }
+
+  int channels_out = filter->dims->data[0];
+  int width = input->dims->data[2];
+  int height = input->dims->data[1];
+  int filter_width = filter->dims->data[2];
+  int filter_height = filter->dims->data[1];
+  int batches = input->dims->data[0];
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  auto padding = params->padding;
+  auto computeOutSize = [padding](int imageSize, int filterSize,
+                                  int stride) -> int {
+    return padding == kTfLitePaddingSame
+               ? (imageSize + stride - 1) / stride
+               : padding == kTfLitePaddingValid
+                     ? (imageSize - filterSize + stride) / stride
+                     : 0;
+  };
+
+  int outWidth = computeOutSize(width, filter_width, params->stride_width);
+  int outHeight = computeOutSize(height, filter_height, params->stride_height);
+
+  data->padding.height =
+      ComputePadding(params->stride_height, height, filter_height, outHeight);
+  data->padding.width =
+      ComputePadding(params->stride_width, width, filter_width, outWidth);
+
+  TF_LITE_ENSURE(context, hasBias);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+                                     &data->output_shift);
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = batches;
+  output_size->data[1] = outHeight;
+  output_size->data[2] = outWidth;
+  output_size->data[3] = channels_out;
+  auto output_status = context->ResizeTensor(context, output, output_size);
+
+  if (output_status != kTfLiteOk) return output_status;
+
+  // We don't always need to allocate im2col. It is only used in some versions
+  // of the optimized Conv. This test just mimics something that happens inside
+  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
+  data->need_im2col =
+      (params->stride_width != 1 || params->stride_height != 1 ||
+       filter_width != 1 || filter_height != 1);
+  // If we're using the optimized multithreaded EigenTensor implementation of
+  // convolution, it expects the filter weights to be transposed compared to
+  // the normal TF Lite buffer format. Typical TF Lite weights are
+  // [filter_count, filter_height, filter_width, input_depth], but for the float
+  // implementation we need them as [filter_height, filter_width, input_depth,
+  // filter_count]. We get to that format by transposing, and create a temporary
+  // buffer to store the results.
+  // This path is only used for float processing, so only create the buffer if
+  // we're running with that data type.
+  data->need_hwcn_weights = (data_type == kTfLiteFloat32);
+
+  int temporaries_count = 0;
+  if (data->need_im2col) {
+    data->im2col_index = temporaries_count;
+    ++temporaries_count;
+  }
+  if (data->need_hwcn_weights) {
+    data->hwcn_weights_index = temporaries_count;
+    ++temporaries_count;
+  }
+
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(temporaries_count);
+
+  if (data->need_im2col) {
+    node->temporaries->data[data->im2col_index] = data->im2col_id;
+
+    TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);
+
+    int input_depth = input->dims->data[3];
+    im2col_size->data[0] = output_size->data[0];
+    im2col_size->data[1] = output_size->data[1];
+    im2col_size->data[2] = output_size->data[2];
+    im2col_size->data[3] = input_depth * filter_height * filter_width;
+
+    TfLiteTensor* im2col =
+        &context->tensors[node->temporaries->data[data->im2col_index]];
+    im2col->type = data_type;
+    im2col->allocation_type = kTfLiteArenaRw;
+    auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
+    if (im2col_status != kTfLiteOk) return im2col_status;
+  }
+
+  if (data->need_hwcn_weights) {
+    node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
+    TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);
+
+    // Because we're treating the filter weights as a matrix when we do the
+    // transpose, we allocate the buffer with a two-dimensional shape, where one
+    // dimension is the number of elements in each filter, and the second is the
+    // total number of filters.
+    int input_depth = input->dims->data[3];
+    hwcn_weights_size->data[0] = (filter_height * filter_width * input_depth);
+    hwcn_weights_size->data[1] = channels_out;
+
+    TfLiteTensor* hwcn_weights =
+        &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
+    hwcn_weights->type = data_type;
+    hwcn_weights->allocation_type = kTfLiteDynamic;
+    // Make sure we release any previous allocations before we reallocate.
+    // TODO(petewarden): Persistent arenas would be a better fit for this, but
+    // they aren't fully implemented yet.
+    if (hwcn_weights->data.raw) {
+      free(hwcn_weights->data.raw);
+      hwcn_weights->data.raw = nullptr;
+    }
+    auto hwcn_weights_status =
+        context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
+    if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;
+    hwcn_weights->data.raw = static_cast<char*>(malloc(hwcn_weights->bytes));
+
+    // TODO(petewarden): If Resize() is called when the size hasn't actually
+    // changed, this will do extra redundant work.
+    data->have_weights_been_transposed = false;
+  }
+
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
+                   TfLiteTensor* filter, TfLiteTensor* bias,
+                   TfLiteTensor* im2col, TfLiteTensor* hwcn_weights,
+                   TfLiteTensor* output) {
+  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+  auto input_offset = -input->params.zero_point;
+  auto filter_offset = -filter->params.zero_point;
+  auto output_offset = output->params.zero_point;
+
+  if (kernel_type == kReference) {
+    reference_ops::Conv(
+        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+        params->stride_height, data->padding.width, data->padding.height,
+        output_offset, data->output_multiplier, data->output_shift,
+        data->output_activation_min, data->output_activation_max,
+        GetTensorData<uint8_t>(output), GetTensorDims(output),
+        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+  } else {
+    optimized_ops::Conv(
+        GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+        GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+        GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+        params->stride_height, data->padding.width, data->padding.height,
+        output_offset, data->output_multiplier, data->output_shift,
+        data->output_activation_min, data->output_activation_max,
+        GetTensorData<uint8_t>(output), GetTensorDims(output),
+        GetTensorData<uint8_t>(im2col), GetTensorDims(im2col), gemm_context);
+  }
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
+               TfLiteTensor* filter, TfLiteTensor* bias, TfLiteTensor* im2col,
+               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+
+  const float* filter_data;
+  if (data->need_hwcn_weights) {
+    filter_data = GetTensorData<float>(hwcn_weights);
+  } else {
+    filter_data = GetTensorData<float>(filter);
+  }
+
+  if (kernel_type == kReference) {
+    reference_ops::Conv(
+        GetTensorData<float>(input), GetTensorDims(input), filter_data,
+        GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
+        params->stride_width, params->stride_height, data->padding.width,
+        data->padding.height, output_activation_min, output_activation_max,
+        GetTensorData<float>(output), GetTensorDims(output),
+        GetTensorData<float>(im2col), GetTensorDims(im2col));
+  } else {
+    multithreaded_ops::Conv(
+        GetTensorData<float>(input), GetTensorDims(input), filter_data,
+        GetTensorDims(filter), GetTensorData<float>(bias), GetTensorDims(bias),
+        params->stride_width, params->stride_height, data->padding.width,
+        data->padding.height, params->padding, output_activation_min,
+        output_activation_max, GetTensorData<float>(output),
+        GetTensorDims(output), GetTensorData<float>(im2col),
+        GetTensorDims(im2col));
+  }
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
+  TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
+  TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
+  bool hasBias = node->inputs->size == 3;
+  TfLiteTensor* bias =
+      hasBias ? &context->tensors[node->inputs->data[2]] : nullptr;
+  TfLiteTensor* im2col =
+      data->need_im2col
+          ? &context->tensors[node->temporaries->data[data->im2col_index]]
+          : nullptr;
+  TfLiteTensor* hwcn_weights =
+      data->need_hwcn_weights
+          ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
+          : nullptr;
+
+  if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
+    TransposeFloatTensor(filter, hwcn_weights);
+    data->have_weights_been_transposed = true;
+  }
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/outtypes are same.
+    case kTfLiteFloat32:
+      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+                             im2col, hwcn_weights, output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
+                                 bias, im2col, hwcn_weights, output);
+      break;
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace conv
+
+TfLiteRegistration* Register_CONVOLUTION_REF() {
+  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+                                 conv::Eval<conv::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
+  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+                                 conv::Eval<conv::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_CONVOLUTION_NEON_OPT() {
+  static TfLiteRegistration r = {conv::Init, conv::Free, conv::Prepare,
+                                 conv::Eval<conv::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_CONV_2D() {
+#ifdef USE_NEON
+  return Register_CONVOLUTION_NEON_OPT();
+#else
+  return Register_CONVOLUTION_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/conv_test.cc b/tensorflow/contrib/lite/kernels/conv_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d0a81c3135625c07a3566f5f9a8e5401f0d4db7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/conv_test.cc
@@ -0,0 +1,440 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseConvolutionOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): Also test different activation types, bias, padding types,
+  // stride values.
+  BaseConvolutionOpModel(
+      const TensorData& input, const TensorData& filter,
+      const TensorData& output, int stride_width = 2, int stride_height = 2,
+      enum Padding padding = Padding_VALID,
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE) {
+    input_ = AddInput(input);
+    filter_ = AddInput(filter);
+
+    int bias_size = GetShape(filter_)[0];
+    if (input.type == TensorType_FLOAT32) {
+      bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+    } else {
+      // This is a quantized version. The scale of 'bias' depends on the scales
+      // of input and filter. Supposedly this is correctly set during quantized
+      // training.
+      auto bias_scale = GetScale(input_) * GetScale(filter_);
+      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+      bias_ = AddInput(bias);
+    }
+
+    output_ = AddOutput(output);
+    if (input.type != TensorType_FLOAT32) {
+      // The following is required by quantized inference. It is the unittest's
+      // responsibility to make sure the output scale falls into the correct
+      // range.
+      CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_));
+    }
+
+    SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions,
+                 CreateConv2DOptions(builder_, padding, stride_width,
+                                     stride_height, activation)
+                     .Union());
+
+    BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
+  }
+
+ protected:
+  int input_;
+  int filter_;
+  int bias_;
+  int output_;
+};
+
+class ConvolutionOpModel : public BaseConvolutionOpModel {
+ public:
+  using BaseConvolutionOpModel::BaseConvolutionOpModel;
+
+  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(ConvolutionOpTest, SimpleTestFloat32) {
+  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+                       {TensorType_FLOAT32, {3, 2, 2, 1}},
+                       {TensorType_FLOAT32, {}});
+
+  m.SetInput({
+      // First batch
+      1, 1, 1, 1,  // row = 1
+      2, 2, 2, 2,  // row = 2
+      // Second batch
+      1, 2, 3, 4,  // row = 1
+      1, 2, 3, 4,  // row = 2
+  });
+  m.SetFilter({
+      1, 2, 3, 4,    // first 2x2 filter
+      -1, 1, -1, 1,  // second 2x2 filter
+      -1, -1, 1, 1,  // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 18, 2, 5,  // first batch, left
+                                 18, 2, 5,  // first batch, right
+                                 17, 4, 3,  // second batch, left
+                                 37, 4, 3,  // second batch, right
+                             }));
+}
+
+TEST(ConvolutionOpTest, SimpleTestFloat32WithAnisotropicStrides) {
+  ConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 6, 1}},
+                       {TensorType_FLOAT32, {1, 2, 2, 1}},
+                       {TensorType_FLOAT32, {}},
+                       /*stride_width=*/3, /*stride_height=*/1);
+  m.SetInput({
+      3, 2, 1, -1, -2, -3,  //
+      4, 3, 2, -2, -3, -4,  //
+      5, 4, 3, -3, -4, -5,  //
+  });
+  m.SetFilter({
+      1, 2,  //
+      3, 4,  //
+  });
+  m.SetBias({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 30, -24,  //
+                                 40, -34,  //
+                             }));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedFloat32) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const Padding padding = Padding_SAME;
+  ConvolutionOpModel m(
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, stride_width, stride_height, padding);
+
+  // The image matrix is:
+  // |  1 |  2 |  3 |  4 |
+  // |  5 |  6 |  7 |  8 |
+  // |  9 | 10 | 11 | 12 |
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  // The filter matrix is:
+  // | 1 | 4 | 7 |
+  // | 2 | 5 | 8 |
+  // | 3 | 6 | 9 |
+  m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+  // No bias for this test.
+  m.SetBias({0});
+
+  m.Invoke();
+  // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+  // the input set to zero because we're using the 'SAME' padding mode.
+  // The calculations behind the expected output are:
+  // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
+  // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
+  // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
+  // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
+  // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
+  // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+  // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+  // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
+  // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
+  // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
+  // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
+  // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
+  // This means we should end up with this matrix:
+  // |  105  |  150  |  183  |   95  |
+  // |  235  |  312  |  357  |  178  |
+  // |  187  |  234  |  261  |  121  |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 312, 357,
+                                               178, 187, 234, 261, 121}));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedWithBiasFloat32) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const Padding padding = Padding_SAME;
+  ConvolutionOpModel m(
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, stride_width, stride_height, padding);
+
+  // The image matrix is:
+  // |  1 |  2 |  3 |  4 |
+  // |  5 |  6 |  7 |  8 |
+  // |  9 | 10 | 11 | 12 |
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  // The filter matrix is:
+  // | 1 | 4 | 7 |
+  // | 2 | 5 | 8 |
+  // | 3 | 6 | 9 |
+  m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+  // Bias is | 10 |.
+  m.SetBias({10});
+
+  m.Invoke();
+  // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+  // the input set to zero because we're using the 'SAME' padding mode.
+  // The calculations behind the expected output are:
+  // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)+10=115
+  // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)+10=160
+  // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)+10=193
+  // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)+10=105
+  // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)+10=245
+  // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)+10=322
+  // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)+10=367
+  // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)+10=188
+  // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)+10=197
+  // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)+10=244
+  // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)+10=271
+  // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)+10=131
+  // This means we should end up with this matrix:
+  // |  115  |  160  |  193  |  105  |
+  // |  245  |  322  |  367  |  188  |
+  // |  197  |  244  |  271  |  131  |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({115, 160, 193, 105, 245, 322,
+                                               367, 188, 197, 244, 271, 131}));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedWithReluFloat32) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const Padding padding = Padding_SAME;
+  ConvolutionOpModel m(
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, stride_width, stride_height, padding,
+      ActivationFunctionType_RELU);
+
+  // The image matrix is:
+  // |  1 |  2 |  3 |  4 |
+  // |  5 |  6 |  7 |  8 |
+  // |  9 | 10 | 11 | 12 |
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  // The filter matrix is:
+  // | 1 | 4 | 7 |
+  // | 2 | 5 | 8 |
+  // | 3 | 6 | 9 |
+  m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+  // Bias is | -200 |.
+  m.SetBias({-200});
+
+  m.Invoke();
+  // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+  // the input set to zero because we're using the 'SAME' padding mode.
+  // The calculations behind the expected output are:
+  // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)-200=-95
+  // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)-200=-50
+  // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)-200=-17
+  // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)-200=-105
+  // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)-200=35
+  // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)-200=112
+  // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)-200=157
+  // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)-200=-22
+  // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)-200=-13
+  // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)-200=34
+  // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)-200=61
+  // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)-200=-79
+  // All negative values are gated to zero by the Relu activation function.
+  // This means we should end up with this matrix:
+  // |   0 |   0 |   0 |   0 |
+  // |  35 | 112 | 157 |   0 |
+  // |   0 |  34 |  61 |   0 |
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 0, 0, 0, 35, 112, 157, 0, 0, 34, 61, 0}));
+}
+
+TEST(ConvolutionOpTest, HandCalculatedValidFloat32) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  const int filter_size = 3;
+  const int filter_count = 1;
+  const int stride_width = 1;
+  const int stride_height = 1;
+  const Padding padding = Padding_VALID;
+  ConvolutionOpModel m(
+      {TensorType_FLOAT32,
+       {image_batch_count, image_height, image_width, depth}},
+      {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}},
+      {TensorType_FLOAT32, {}}, stride_width, stride_height, padding);
+
+  // The image matrix is:
+  // |  1 |  2 |  3 |  4 |
+  // |  5 |  6 |  7 |  8 |
+  // |  9 | 10 | 11 | 12 |
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  // The filter matrix is:
+  // | 1 | 4 | 7 |
+  // | 2 | 5 | 8 |
+  // | 3 | 6 | 9 |
+  m.SetFilter({1, 4, 7, 2, 5, 8, 3, 6, 9});
+  // No bias for this test.
+  m.SetBias({0});
+
+  m.Invoke();
+  // We're sliding the 3x3 filter across the 3x4 image, with no accesses outside
+  // the input because we're using the 'VALID' padding mode, giving a 2x1
+  // output.
+  // The calculations behind the expected output are:
+  // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+  // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+  // This means we should end up with this matrix:
+  // |  312  |  357  |
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({312, 357}));
+}
+
+class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
+ public:
+  using BaseConvolutionOpModel::BaseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int32_t>(bias_, data);
+  }
+
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// In this tests we set the input and output scales so that the results
+// match exactly the 'non-quantized' version.
+TEST(ConvolutionOpTest, SimpleTestQuantized) {
+  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+                                {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
+                                {TensorType_UINT8, {}, -127, 128});
+  m.SetInput({
+      // First batch
+      1, 1, 1, 1,  // row = 1
+      2, 2, 2, 2,  // row = 2
+      // Second batch
+      1, 2, 3, 4,  // row = 1
+      1, 2, 3, 4,  // row = 2
+  });
+  m.SetFilter({
+      1, 2, 3, 4,    // first 2x2 filter
+      -1, 1, -1, 1,  // second 2x2 filter
+      -1, -1, 1, 1,  // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      18, 2, 5,  // first batch, left
+                      18, 2, 5,  // first batch, right
+                      17, 4, 3,  // second batch, left
+                      37, 4, 3,  // second batch, right
+                  },
+                  1e-5)));
+  // For good  measure, let's also verify the quantized values:
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 145, 129, 132,  //
+                                 145, 129, 132,  //
+                                 144, 131, 130,  //
+                                 164, 131, 130,  //
+                             }));
+}
+
+TEST(ConvolutionOpTest, SimpleTestQuantizedWithAnisotropicStrides) {
+  QuantizedConvolutionOpModel m({TensorType_UINT8, {1, 3, 6, 1}, -63.5, 64},
+                                {TensorType_UINT8, {1, 2, 2, 1}, -63.5, 64},
+                                {TensorType_UINT8, {}, -127, 128},
+                                /*stride_width=*/3, /*stride_height=*/1);
+  m.SetInput({
+      3, 2, 1, -1, -2, -3,  //
+      4, 3, 2, -2, -3, -4,  //
+      5, 4, 3, -3, -4, -5,  //
+  });
+  m.SetFilter({
+      1, 2,  //
+      3, 4,  //
+  });
+  m.SetBias({-1});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({
+                                            30, -24,  //
+                                            40, -34,  //
+                                        })));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 157, 103,  //
+                                 167, 93,   //
+                             }));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..15dbfe08c82befcf001b9ed9a053528b5606053e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -0,0 +1,289 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace depthwise_conv {
+
+constexpr int kInputTensor = 0;
+constexpr int kFilterTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+// This file has three implementation of DepthwiseConv.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+struct OpData {
+  TfLitePaddingValues padding;
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multipler plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to carry information from Prepare() to
+  // Eval().
+  return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  // TODO(ahentz): use could use GetOptionalInputTensor() here, but we need to
+  // decide whether we are OK with optional tensors being completely absent, as
+  // opposed to having -1 as their index.
+  bool hasBias = NumInputs(node) == 3;
+
+  TF_LITE_ENSURE(context, hasBias || NumInputs(node) == 2);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  TfLiteTensor* bias = nullptr;
+
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 4);
+
+  // The parameter 'depth_multiplier' is redundant, so we check here to make
+  // sure it is consistent with the given dimensions.
+  TF_LITE_ENSURE_EQ(context,
+                    params->depth_multiplier * SizeOfDimension(input, 3),
+                    SizeOfDimension(filter, 3));
+
+  const TfLiteType data_type = input->type;
+  TF_LITE_ENSURE(context,
+                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+  TF_LITE_ENSURE_EQ(context, output->type, data_type);
+  TF_LITE_ENSURE_EQ(context, filter->type, data_type);
+
+  if (hasBias) {
+    bias = GetInput(context, node, kBiasTensor);
+    if (data_type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
+      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
+    } else {
+      TF_LITE_ENSURE_EQ(context, bias->type, data_type);
+    }
+    TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 3),
+                      SizeOfDimension(bias, 0));
+  }
+
+  int channels_out = SizeOfDimension(filter, 3);
+  int width = SizeOfDimension(input, 2);
+  int height = SizeOfDimension(input, 1);
+  int filter_width = SizeOfDimension(filter, 2);
+  int filter_height = SizeOfDimension(filter, 1);
+  int batches = SizeOfDimension(input, 0);
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  auto padding = params->padding;
+  auto compute_out_size = [padding](int imageSize, int filterSize,
+                                    int stride) -> int {
+    return padding == kTfLitePaddingSame
+               ? (imageSize + stride - 1) / stride
+               : padding == kTfLitePaddingValid
+                     ? (imageSize - filterSize + stride) / stride
+                     : 0;
+  };
+
+  int out_width = compute_out_size(width, filter_width, params->stride_width);
+  int out_height =
+      compute_out_size(height, filter_height, params->stride_height);
+
+  data->padding.height =
+      ComputePadding(params->stride_height, height, filter_height, out_height);
+  data->padding.width =
+      ComputePadding(params->stride_width, width, filter_width, out_width);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  if (data_type != kTfLiteFloat32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+                                     &data->output_shift);
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+
+  TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
+  outputSize->data[0] = batches;
+  outputSize->data[1] = out_height;
+  outputSize->data[2] = out_width;
+  outputSize->data[3] = channels_out;
+  return context->ResizeTensor(context, output, outputSize);
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteDepthwiseConvParams* params, OpData* data,
+               TfLiteTensor* input, TfLiteTensor* filter, TfLiteTensor* bias,
+               TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+
+  void (*depthwise_conv)(const float*, const Dims<4>&, const float*,
+                         const Dims<4>&, const float*, const Dims<4>&, int, int,
+                         int, int, int, float, float, float*, const Dims<4>&);
+  if (kernel_type == kReference) {
+    depthwise_conv = &reference_ops::DepthwiseConv;
+  } else {
+    depthwise_conv = &optimized_ops::DepthwiseConv;
+  }
+
+  depthwise_conv(
+      GetTensorData<float>(input), GetTensorDims(input),
+      GetTensorData<float>(filter), GetTensorDims(filter),
+      GetTensorData<float>(bias), GetTensorDims(bias), params->stride_width,
+      params->stride_height, data->padding.width, data->padding.height,
+      params->depth_multiplier, output_activation_min, output_activation_max,
+      GetTensorData<float>(output), GetTensorDims(output));
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteDepthwiseConvParams* params, OpData* data,
+                   TfLiteTensor* input, TfLiteTensor* filter,
+                   TfLiteTensor* bias, TfLiteTensor* output) {
+  auto input_offset = -input->params.zero_point;
+  auto filter_offset = -filter->params.zero_point;
+  auto output_offset = output->params.zero_point;
+
+  void (*depthwise_conv)(const uint8*, const Dims<4>&, int32, const uint8*,
+                         const Dims<4>&, int32, const int32*, const Dims<4>&,
+                         int, int, int, int, int, int32, int32, int, int32,
+                         int32, uint8*, const Dims<4>&);
+  if (kernel_type == kReference) {
+    depthwise_conv = &reference_ops::DepthwiseConv;
+  } else {
+    depthwise_conv = &optimized_ops::DepthwiseConv;
+  }
+
+  depthwise_conv(
+      GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,
+      GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset,
+      GetTensorData<int32_t>(bias), GetTensorDims(bias), params->stride_width,
+      params->stride_height, data->padding.width, data->padding.height,
+      params->depth_multiplier, output_offset, data->output_multiplier,
+      data->output_shift, data->output_activation_min,
+      data->output_activation_max, GetTensorData<uint8_t>(output),
+      GetTensorDims(output));
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteDepthwiseConvParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* filter = GetInput(context, node, kFilterTensor);
+  TfLiteTensor* bias =
+      (NumInputs(node) == 3) ? GetInput(context, node, kBiasTensor) : nullptr;
+
+  // TODO(aselle): Consider whether float conv and quantized conv should be
+  // separate ops to avoid dispatch overhead here.
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
+                             output);
+      break;
+    case kTfLiteUInt8:
+      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
+                                 bias, output);
+      break;
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace depthwise_conv
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF() {
+  static TfLiteRegistration r = {
+      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+      depthwise_conv::Eval<depthwise_conv::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+      depthwise_conv::Eval<depthwise_conv::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_NEON_OPT() {
+  static TfLiteRegistration r = {
+      depthwise_conv::Init, depthwise_conv::Free, depthwise_conv::Prepare,
+      depthwise_conv::Eval<depthwise_conv::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D() {
+#ifdef USE_NEON
+  return Register_DEPTHWISE_CONVOLUTION_NEON_OPT();
+#else
+  return Register_DEPTHWISE_CONVOLUTION_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1439c8bce14ad127ed68dc54991aed8b8bb39383
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv_test.cc
@@ -0,0 +1,186 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): Also test different activation types, bias, padding types,
+  // stride values.
+  BaseDepthwiseConvolutionOpModel(const TensorData& input,
+                                  const TensorData& filter,
+                                  const TensorData& output) {
+    input_ = AddInput(input);
+    filter_ = AddInput(filter);
+
+    int bias_size = GetShape(filter_)[3];
+    if (input.type == TensorType_FLOAT32) {
+      bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
+    } else {
+      // This is a quantized version. The scale of 'bias' depends on the scales
+      // of input and filter. Supposedly this is correctly set during quantized
+      // training.
+      auto bias_scale = GetScale(input_) * GetScale(filter_);
+      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+      bias_ = AddInput(bias);
+    }
+
+    output_ = AddOutput(output);
+    if (input.type != TensorType_FLOAT32) {
+      // The following is required by quantized inference. It is the unittest's
+      // responsibility to make sure the output scale falls into the correct
+      // range.
+      CHECK_LT(GetScale(input_) * GetScale(filter_), GetScale(output_));
+    }
+
+    int input_depth = GetShape(input_)[3];
+    int output_depth = GetShape(filter_)[3];
+    int depth_mul = output_depth / input_depth;
+
+    SetBuiltinOp(
+        BuiltinOperator_DEPTHWISE_CONV_2D,
+        BuiltinOptions_DepthwiseConv2DOptions,
+        CreateDepthwiseConv2DOptions(builder_, Padding_VALID, 1, 1, depth_mul,
+                                     ActivationFunctionType_NONE)
+            .Union());
+
+    BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
+  }
+
+ protected:
+  int input_;
+  int filter_;
+  int bias_;
+  int output_;
+};
+
+class DepthwiseConvolutionOpModel : public BaseDepthwiseConvolutionOpModel {
+ public:
+  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(DepthwiseConvolutionOpTest, SimpleTest) {
+  DepthwiseConvolutionOpModel m({TensorType_FLOAT32, {1, 3, 2, 2}},
+                                {TensorType_FLOAT32, {1, 2, 2, 4}},
+                                {TensorType_FLOAT32, {}});
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 71, -34, 99, -20,  //
+                                 91, -26, 127, -4,  //
+                             }));
+}
+
+class QuantizedDepthwiseConvolutionOpModel
+    : public BaseDepthwiseConvolutionOpModel {
+ public:
+  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int32_t>(bias_, data);
+  }
+
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// In this test we set the input and output scales so that the results match
+// exactly the 'non-quantized' version.
+TEST(QuantizedDepthwiseConvolutionOpTest, SimpleTestQuantized) {
+  QuantizedDepthwiseConvolutionOpModel m(
+      {TensorType_UINT8, {1, 3, 2, 2}, -63.5, 64},
+      {TensorType_UINT8, {1, 2, 2, 4}, -63.5, 64},
+      {TensorType_UINT8, {}, -127, 128});
+
+  m.SetInput({
+      1, 2, 7, 8,    // column 1
+      3, 4, 9, 10,   // column 2
+      5, 6, 11, 12,  // column 3
+  });
+  m.SetFilter({
+      1, 2, 3, 4,        //
+      -9, 10, -11, 12,   //
+      5, 6, 7, 8,        //
+      13, -14, 15, -16,  //
+  });
+  m.SetBias({1, 2, 3, 4});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                            {
+                                                71, -34, 99, -20,  //
+                                                91, -26, 127, -4,  //
+                                            },
+                                            1e-5)));
+  // For good  measure, let's also verify the quantized values:
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 198, 93, 226, 107,   //
+                                 218, 101, 254, 123,  //
+                             }));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e8cb396d43a58f94b08eb8dd8b05d16fd74fd2f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
@@ -0,0 +1,104 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Ops that looks up items from matrix.
+//
+// Input:
+//     Tensor[0]: Row number to lookup, dim.size == 1, int32
+//     Tensor[1]: 2-dimensional matrix of multi-dimensional items
+//                dim.size >= 2, any data type.
+//                first dimension is row, second dimension is column.
+//
+// Output:
+//   Output.dim[0] == Tensor[0].dim[0], num of lookups
+//   Output.dim[1] == Tensor[1].dim[1],  num of items per row
+//   Each item in output is a raw bytes copy of corresponding item in input.
+//   When indices are out of bound, the ops will not succeed.
+//
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace embedding_lookup {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* lookup = GetInput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
+  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
+
+  TfLiteTensor* value = GetInput(context, node, 1);
+  TF_LITE_ENSURE(context, NumDimensions(value) >= 2);
+
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteIntArray* outputSize = TfLiteIntArrayCreate(NumDimensions(value));
+
+  outputSize->data[0] = SizeOfDimension(lookup, 0);
+  outputSize->data[1] = SizeOfDimension(value, 1);
+  for (int i = 2; i < NumDimensions(value); i++) {
+    outputSize->data[i] = SizeOfDimension(value, i);
+  }
+  return context->ResizeTensor(context, output, outputSize);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* lookup = GetInput(context, node, 0);
+  TfLiteTensor* value = GetInput(context, node, 1);
+
+  const int row_size = SizeOfDimension(value, 0);
+  const int row_bytes = value->bytes / row_size;
+
+  for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
+    int idx = lookup->data.i32[i];
+    if (idx >= row_size || idx < 0) {
+      context->ReportError(context, "Embedding Lookup: index out of bounds.");
+      return kTfLiteError;
+    } else {
+      memcpy(output->data.raw + i * row_bytes,
+             value->data.raw + idx * row_bytes, row_bytes);
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace embedding_lookup
+
+TfLiteRegistration* Register_EMBEDDING_LOOKUP() {
+  static TfLiteRegistration r = {nullptr, nullptr, embedding_lookup::Prepare,
+                                 embedding_lookup::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c770e7f71efe83eace3640c47e03e0c7ab19e20
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse.cc
@@ -0,0 +1,248 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Op that looks up items from a sparse tensor in an embedding matrix.
+// The sparse lookup tensor is represented by three individual tensors: lookup,
+// indices, and dense_shape. The representation assume that the corresponding
+// dense tensor would satisfy:
+//   * dense.shape = dense_shape
+//   * dense[tuple(indices[i])] = lookup[i]
+//
+// By convention, indices should be sorted.
+//
+// Options:
+//   combiner: The reduction op (SUM, MEAN, SQRTN).
+//     * SUM computes the weighted sum of the embedding results.
+//     * MEAN is the weighted sum divided by the total weight.
+//     * SQRTN is the weighted sum divided by the square root of the sum of the
+//       squares of the weights.
+//
+// Input:
+//     Tensor[0]: Ids to lookup, dim.size == 1, int32.
+//     Tensor[1]: Indices, int32.
+//     Tensor[2]: Dense shape, int32.
+//     Tensor[3]: Weights to use for aggregation, float.
+//     Tensor[4]: Params, a matrix of multi-dimensional items,
+//                dim.size >= 2, float.
+//
+// Output:
+//   A (dense) tensor representing the combined embeddings for the sparse ids.
+//   For each row in the sparse tensor represented by (lookup, indices, shape)
+//   the op looks up the embeddings for all ids in that row, multiplies them by
+//   the corresponding weight, and combines these embeddings as specified in the
+//   last dimension.
+//
+//   Output.dim = [l0, ... , ln-1, e1, ..., em]
+//   Where dense_shape == [l0, ..., ln] and Tensor[4].dim == [e0, e1, ..., em]
+//
+//   For instance, if params is a 10x20 matrix and ids, weights are:
+//
+//   [0, 0]: id 1, weight 2.0
+//   [0, 1]: id 3, weight 0.5
+//   [1, 0]: id 0, weight 1.0
+//   [2, 3]: id 1, weight 3.0
+//
+//   with combiner=MEAN, then the output will be a (3, 20) tensor where:
+//
+//   output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
+//   output[1, :] = (params[0, :] * 1.0) / 1.0
+//   output[2, :] = (params[1, :] * 3.0) / 3.0
+//
+//   When indices are out of bound, the op will not succeed.
+
+#include <algorithm>
+#include <cmath>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+namespace {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 5);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* ids = GetInput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(ids), 1);
+  TF_LITE_ENSURE_EQ(context, ids->type, kTfLiteInt32);
+
+  TfLiteTensor* indices = GetInput(context, node, 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(indices), 2);
+  TF_LITE_ENSURE_EQ(context, indices->type, kTfLiteInt32);
+
+  TfLiteTensor* shape = GetInput(context, node, 2);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(shape), 1);
+  TF_LITE_ENSURE_EQ(context, shape->type, kTfLiteInt32);
+
+  TfLiteTensor* weights = GetInput(context, node, 3);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 1);
+  TF_LITE_ENSURE_EQ(context, weights->type, kTfLiteFloat32);
+
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0),
+                    SizeOfDimension(ids, 0));
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0),
+                    SizeOfDimension(weights, 0));
+
+  TfLiteTensor* value = GetInput(context, node, 4);
+  TF_LITE_ENSURE(context, NumDimensions(value) >= 2);
+
+  // Mark the output as a dynamic tensor.
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  output->allocation_type = kTfLiteDynamic;
+
+  return kTfLiteOk;
+}
+
+void FinalizeAggregation(TfLiteCombinerType combiner, int num_elements,
+                         float current_total_weight,
+                         float current_squares_weight, int embedding_size,
+                         float* output) {
+  if (combiner != kTfLiteCombinerTypeSum && num_elements > 0) {
+    float multiplier = 1.0;
+    switch (combiner) {
+      case kTfLiteCombinerTypeMean:
+        multiplier = current_total_weight;
+        break;
+      case kTfLiteCombinerTypeSqrtn:
+        multiplier = std::sqrt(current_squares_weight);
+        break;
+      default:
+        break;
+    }
+    for (int k = 0; k < embedding_size; k++) {
+      output[k] /= multiplier;
+    }
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteEmbeddingLookupSparseParams*>(node->builtin_data);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* ids = GetInput(context, node, 0);
+  TfLiteTensor* indices = GetInput(context, node, 1);
+  TfLiteTensor* dense_shape = GetInput(context, node, 2);
+  TfLiteTensor* weights = GetInput(context, node, 3);
+  TfLiteTensor* value = GetInput(context, node, 4);
+
+  const int lookup_rank = SizeOfDimension(indices, 1);
+  const int embedding_rank = NumDimensions(value);
+  const int num_lookups = SizeOfDimension(ids, 0);
+  const int num_rows = SizeOfDimension(value, 0);
+
+  // The last dimension gets replaced by the embedding.
+  const int output_rank = (lookup_rank - 1) + (embedding_rank - 1);
+
+  // Make sure that the actual dense shape of the sparse tensor represented by
+  // (loopkup, indices, dense_shape) is consistent.
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(dense_shape, 0), lookup_rank);
+
+  // Resize output tensor.
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(output_rank);
+  int k = 0;
+  int embedding_size = 1;
+  int lookup_size = 1;
+  for (int i = 0; i < lookup_rank - 1; i++, k++) {
+    const int dim = dense_shape->data.i32[i];
+    lookup_size *= dim;
+    output_shape->data[k] = dim;
+  }
+  for (int i = 1; i < embedding_rank; i++, k++) {
+    const int dim = SizeOfDimension(value, i);
+    embedding_size *= dim;
+    output_shape->data[k] = dim;
+  }
+  TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_shape));
+  const int output_size = lookup_size * embedding_size;
+  TfLiteTensorRealloc(output_size * sizeof(float), output);
+
+  tensor_utils::ZeroVector(output->data.f, output_size);
+
+  // Keep track of the current bucket for aggregation/combination.
+  int current_output_offset = 0;
+  float current_total_weight = 0.0;
+  float current_squares_weight = 0.0;
+  int num_elements = 0;
+
+  for (int i = 0; i < num_lookups; i++) {
+    int idx = ids->data.i32[i];
+    if (idx >= num_rows || idx < 0) {
+      context->ReportError(context,
+                           "Embedding Lookup Sparse: index out of bounds.");
+      return kTfLiteError;
+    }
+
+    // Check where we need to aggregate.
+    const int example_indices_offset = i * lookup_rank;
+    int output_bucket = 0;
+    int stride = 1;
+    for (int k = (lookup_rank - 1) - 1; k >= 0; k--) {
+      output_bucket += indices->data.i32[example_indices_offset + k] * stride;
+      stride *= dense_shape->data.i32[k];
+    }
+    const int output_offset = output_bucket * embedding_size;
+
+    // If we are in a new aggregation bucket and the combiner is not the sum,
+    // go back and finalize the result of the previous bucket.
+    if (output_offset != current_output_offset) {
+      FinalizeAggregation(params->combiner, num_elements, current_total_weight,
+                          current_squares_weight, embedding_size,
+                          &output->data.f[current_output_offset]);
+
+      // Track next bucket.
+      num_elements = 0;
+      current_total_weight = 0.0;
+      current_squares_weight = 0.0;
+      current_output_offset = output_offset;
+    }
+
+    // Add element to aggregation.
+    ++num_elements;
+    const int example_embedding_offset = idx * embedding_size;
+    const float w = weights->data.f[i];
+    current_squares_weight += w * w;
+    current_total_weight += w;
+    for (int k = 0; k < embedding_size; k++) {
+      output->data.f[current_output_offset + k] +=
+          (value->data.f[example_embedding_offset + k] * w);
+    }
+  }
+
+  // Finalize last bucket.
+  FinalizeAggregation(params->combiner, num_elements, current_total_weight,
+                      current_squares_weight, embedding_size,
+                      &output->data.f[current_output_offset]);
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE() {
+  static TfLiteRegistration r = {nullptr, nullptr, Prepare, Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc5fffad9ceac1a9d23a4e91637a9ff92a8dda
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_sparse_test.cc
@@ -0,0 +1,164 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite sparse lookup op.
+
+#include <cmath>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class EmbeddingLookupSparseOpModel : public SingleOpModel {
+ public:
+  EmbeddingLookupSparseOpModel(CombinerType type,
+                               std::initializer_list<int> lookup_shape,
+                               std::initializer_list<int> indices_shape,
+                               std::initializer_list<int> dense_shape_shape,
+                               std::initializer_list<int> value_shape) {
+    lookup_ = AddInput(TensorType_INT32);
+    indices_ = AddInput(TensorType_INT32);
+    dense_shape_ = AddInput(TensorType_INT32);
+    weights_ = AddInput(TensorType_FLOAT32);
+    value_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+                 BuiltinOptions_EmbeddingLookupSparseOptions,
+                 CreateEmbeddingLookupSparseOptions(builder_, type).Union());
+    BuildInterpreter({lookup_shape, indices_shape, dense_shape_shape,
+                      lookup_shape, value_shape});
+  }
+
+  void SetInput(std::initializer_list<int> lookup_data,
+                std::initializer_list<int> indices_data,
+                std::initializer_list<int> dense_shape_data,
+                std::initializer_list<float> weights_data) {
+    PopulateTensor(lookup_, lookup_data);
+    PopulateTensor(indices_, indices_data);
+    PopulateTensor(dense_shape_, dense_shape_data);
+    PopulateTensor(weights_, weights_data);
+  }
+
+  void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) {
+    TfLiteTensor* tensor = interpreter_->tensor(value_);
+    int rows = tensor->dims->data[0];
+    int columns = tensor->dims->data[1];
+    int features = tensor->dims->data[2];
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < columns; j++) {
+        for (int k = 0; k < features; k++) {
+          tensor->data.f[(i * columns + j) * features + k] = function(i, j, k);
+        }
+      }
+    }
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int lookup_;
+  int weights_;
+  int indices_;
+  int dense_shape_;
+  int value_;
+  int output_;
+};
+
+TEST(EmbeddingLookupOpTest, SimpleTest) {
+  EmbeddingLookupSparseOpModel m(CombinerType_SUM, {3}, {3, 2}, {2}, {4, 3, 2});
+  m.SetInput({1, 3, 0}, {0, 0, 2, 0, 2, 1}, {3, 2}, {1.0, 2.0, 4.0});
+  m.Set3DWeightMatrix(
+      [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
+                  6.00, 6.06, 6.60, 6.66, 7.20, 7.26,  // 2 * Row 3 + 4 * Row 0
+              })));
+}
+
+TEST(EmbeddingLookupOpTest, SimpleTestMean) {
+  EmbeddingLookupSparseOpModel m(CombinerType_MEAN, {3}, {3, 2}, {2},
+                                 {4, 3, 2});
+  m.SetInput({1, 3, 0}, {0, 0, 2, 0, 2, 1}, {3, 2}, {1.0, 2.0, 4.0});
+  m.Set3DWeightMatrix(
+      [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // 2 * Row 3 + 4 * Row 0
+              })));
+}
+
+TEST(EmbeddingLookupOpTest, SimpleTestSqrtn) {
+  EmbeddingLookupSparseOpModel m(CombinerType_SQRTN, {3}, {3, 2}, {2},
+                                 {4, 3, 2});
+  m.SetInput({1, 3, 0}, {0, 0, 2, 0, 2, 1}, {3, 2}, {1.0, 2.0, 4.0});
+  m.Set3DWeightMatrix(
+      [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear({
+          1.00, 1.01, 1.10, 1.11, 1.20, 1.21,  // Row 1
+          0.00, 0.00, 0.00, 0.00, 0.00, 0.00,  // -
+          6.00f / std::sqrt(20.0f), 6.06f / std::sqrt(20.0f),
+          6.60f / std::sqrt(20.0f), 6.66f / std::sqrt(20.0f),
+          7.20f / std::sqrt(20.0f),
+          7.26f /
+              std::sqrt(
+                  20.0f),  // 2 * Row 3 + 4 * Row 0,  // 2 * Row 3 + 4 * Row 0
+      })));
+}
+
+TEST(EmbeddingLookupOpTest, Indices3DTest) {
+  EmbeddingLookupSparseOpModel m(CombinerType_SUM, {3}, {3, 3}, {3}, {4, 3, 2});
+  m.SetInput({1, 3, 0}, {0, 0, 0, 2, 0, 0, 2, 0, 1}, {3, 2, 2},
+             {1.0, 2.0, 4.0});
+  m.Set3DWeightMatrix(
+      [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.10, 1.11, 1.20, 1.21, 0.00, 0.00, 0.00,
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
+                  0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 6.00, 6.06, 6.60,
+                  6.66, 7.20, 7.26, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,
+              })));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b501878f196216a61568bfa36e6615f4dd07478
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Lookup op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class EmbeddingLookupOpModel : public SingleOpModel {
+ public:
+  EmbeddingLookupOpModel(std::initializer_list<int> index_shape,
+                         std::initializer_list<int> weight_shape) {
+    input_ = AddInput(TensorType_INT32);
+    weight_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP, BuiltinOptions_NONE, 0);
+    BuildInterpreter({index_shape, weight_shape});
+  }
+
+  void SetInput(std::initializer_list<int> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) {
+    TfLiteTensor* tensor = interpreter_->tensor(weight_);
+    int rows = tensor->dims->data[0];
+    int columns = tensor->dims->data[1];
+    int features = tensor->dims->data[2];
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < columns; j++) {
+        for (int k = 0; k < features; k++) {
+          tensor->data.f[(i * columns + j) * features + k] = function(i, j, k);
+        }
+      }
+    }
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int weight_;
+  int output_;
+};
+
+// TODO(ahentz): write more tests that exercise the details of the op, such as
+// lookup errors and variable input shapes.
+TEST(EmbeddingLookupOpTest, SimpleTest) {
+  EmbeddingLookupOpModel m({3}, {3, 2, 4});
+  m.PopulateTensor<int>(0, {1, 0, 2});
+  m.Set3DWeightMatrix(
+      [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                  0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                  2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+              })));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a77fe94e499078bc2f0660e8e49fd557ed0f625d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -0,0 +1,307 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace fully_connected {
+
+// This file has four implementations of FullyConnected
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+  kPie,  // Used by the PIE team
+};
+
+struct OpData {
+  // The scaling factor from input to output (aka the 'real multiplier') can
+  // be represented as a fixed point multipler plus a left shift.
+  int32_t output_multiplier;
+  int output_shift;
+  // The range of the fused activation layer. For example for kNone and
+  // uint8_t these would be 0 and 255.
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsTensor = 1;
+constexpr int kBiasTensor = 2;
+constexpr int kOutputTensor = 0;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to carry information from Prepare() to
+  // Eval().
+  gemm_support::IncrementUsageCounter(context);
+  return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  gemm_support::DecrementUsageCounter(context);
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 3);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  int input_size = 1;
+  for (int i = 0; i < input->dims->size; i++) {
+    input_size *= input->dims->data[i];
+  }
+
+  const int batch_size = input_size / filter->dims->data[1];
+  const int num_units = filter->dims->data[0];
+
+  TF_LITE_ASSERT_EQ(input_size, batch_size * filter->dims->data[1]);
+  if (bias) {
+    TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
+  }
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(bias), 1);
+
+  // Note that quantized inference requires that all tensors have their
+  // parameters set. This is usually done during quantized training.
+  TfLiteType data_type = input->type;
+  if (data_type != kTfLiteFloat32) {
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    QuantizeMultiplierSmallerThanOne(real_multiplier, &data->output_multiplier,
+                                     &data->output_shift);
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &data->output_activation_min,
+                                  &data->output_activation_max);
+  }
+
+  // Resize output.
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+  output_size_array->data[0] = batch_size;
+  output_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
+  return kTfLiteOk;
+}
+
+TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node,
+                     TfLiteFullyConnectedParams* params, OpData* data,
+                     TfLiteTensor* input, TfLiteTensor* filter,
+                     TfLiteTensor* bias, TfLiteTensor* output) {
+  int total_input_size = 1;
+  for (int i = 0; i < input->dims->size; i++) {
+    total_input_size *= input->dims->data[i];
+  }
+
+  int input_size = filter->dims->data[1];
+  const int batch_size = total_input_size / filter->dims->data[1];
+  const int num_units = filter->dims->data[0];
+
+  // Output = bias if bias tensor exists.
+  if (bias) {
+    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
+                                          output->data.f);
+  } else {
+    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
+  }
+
+  // Compute output += weight * input
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      filter->data.f, num_units, input_size, input->data.f, batch_size,
+      output->data.f, /*result_stride=*/1);
+
+  // Apply activation function
+  tensor_utils::ApplyActivationToVector(output->data.f, batch_size * num_units,
+                                        params->activation, output->data.f);
+
+  return kTfLiteOk;
+}
+
+#define TF_LITE_MACRO_DISPATCH(macro_name, params, target_namespace) \
+  if (params->activation == kTfLiteActNone) {                        \
+    macro_name(target_namespace, kNone);                             \
+  }                                                                  \
+  if (params->activation == kTfLiteActRelu) {                        \
+    macro_name(target_namespace, kRelu);                             \
+  }                                                                  \
+  if (params->activation == kTfLiteActRelu6) {                       \
+    macro_name(target_namespace, kRelu6);                            \
+  }
+
+template <KernelType kernel_type>
+TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                           TfLiteFullyConnectedParams* params, OpData* data,
+                           TfLiteTensor* input, TfLiteTensor* filter,
+                           TfLiteTensor* bias, TfLiteTensor* output) {
+  gemmlowp::GemmContext* gemm_context = gemm_support::GetFromContext(context);
+
+  int32_t input_offset = -input->params.zero_point;
+  int32_t filter_offset = -filter->params.zero_point;
+  int32_t output_offset = output->params.zero_point;
+#define TF_LITE_FULLY_CONNECTED(type)                                       \
+  type::FullyConnected(                                                     \
+      GetTensorData<uint8_t>(input), GetTensorDims(input), input_offset,    \
+      GetTensorData<uint8_t>(filter), GetTensorDims(filter), filter_offset, \
+      GetTensorData<int32_t>(bias), GetTensorDims(bias), output_offset,     \
+      data->output_multiplier, data->output_shift,                          \
+      data->output_activation_min, data->output_activation_max,             \
+      GetTensorData<uint8_t>(output), GetTensorDims(output), gemm_context)
+  if (kernel_type == kReference) {
+    TF_LITE_FULLY_CONNECTED(reference_ops);
+  } else if (kernel_type == kPie) {
+    // TODO(ahentz): we don't have a quantized version of the PIE kernels, so
+    // we just defer to the MINI ones.
+    TF_LITE_FULLY_CONNECTED(optimized_ops);
+  } else {
+    TF_LITE_FULLY_CONNECTED(optimized_ops);
+  }
+#undef TF_LITE_FULLY_CONNECTED
+
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                       TfLiteFullyConnectedParams* params, OpData* data,
+                       TfLiteTensor* input, TfLiteTensor* filter,
+                       TfLiteTensor* bias, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+#define TF_LITE_FULLY_CONNECTED(type)                                       \
+  type::FullyConnected(GetTensorData<float>(input), GetTensorDims(input),   \
+                       GetTensorData<float>(filter), GetTensorDims(filter), \
+                       GetTensorData<float>(bias), GetTensorDims(bias),     \
+                       output_activation_min, output_activation_max,        \
+                       GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_FULLY_CONNECTED(reference_ops);
+  } else if (kernel_type == kPie) {
+    return EvalPie(context, node, params, data, input, filter, bias, output);
+  } else {
+    TF_LITE_FULLY_CONNECTED(optimized_ops);
+  }
+#undef TF_LITE_FULLY_CONNECTED
+
+  return kTfLiteOk;
+}
+
+#undef TF_LITE_MACRO_DISPATCH
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* filter = GetInput(context, node, kWeightsTensor);
+  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      return EvalFloat<kernel_type>(context, node, params, data, input, filter,
+                                    bias, output);
+    case kTfLiteUInt8:
+      return EvalQuantized<kernel_type>(context, node, params, data, input,
+                                        filter, bias, output);
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace fully_connected
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF() {
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT() {
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      fully_connected::Init, fully_connected::Free, fully_connected::Prepare,
+      fully_connected::Eval<fully_connected::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE() {
+  static TfLiteRegistration r = {fully_connected::Init, fully_connected::Free,
+                                 fully_connected::Prepare,
+                                 fully_connected::Eval<fully_connected::kPie>};
+  return &r;
+}
+
+TfLiteRegistration* Register_FULLY_CONNECTED() {
+  // TODO(ahentz): We don't have a dedicated quantized version of the PIE
+  // kernel. For now, the quantized version just defer to the corresponding
+  // optimized MINI kernel. At some point we will allow different libraries to
+  // be built with different kernels, but for now we have to pick one here.
+  return Register_FULLY_CONNECTED_PIE();
+#ifdef USE_NEON
+  return Register_FULLY_CONNECTED_NEON_OPT();
+#else
+  return Register_FULLY_CONNECTED_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0f766c4f4580d7679275c0b63aa200410fcb5ad
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -0,0 +1,376 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite FULLY_CONNECTED op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+static float fully_connected_input[] = {
+    0.503691, 0.196961, 0.521017, 0.554248, 0.288678, 0.792476, 0.561653,
+    0.462230, 0.650736, 0.163132, 0.029658, 0.411544, 0.470539, 0.572390,
+    0.538755, 0.212030, 0.264309, 0.193908, 0.777480, 0.745661, 0.423314,
+    0.470804, 0.175501, 0.492225, 0.192743, 0.540183, 0.372514, 0.446550,
+    0.498173, 0.126472, 0.132706, 0.001864, 0.323433, 0.653723, 0.556112,
+    0.612111, 0.446199, 0.117765, 0.074341, 0.096935, 0.280897, 0.103999,
+    0.508479, 0.751437, 0.676389, 0.047234, 0.963467, 0.940698, 0.241142,
+    0.740947, 0.686359, 0.664456, 0.211751, 0.861860, 0.156681, 0.404494,
+    0.402043, 0.529195, 0.851044, 0.900216, 0.655667, 0.983750, 0.902081,
+    0.979100, 0.637473, 0.458193, 0.591211, 0.083671, 0.575958, 0.665552,
+    0.180606, 0.856856, 0.769551, 0.689086, 0.608293, 0.445940, 0.736320,
+    0.571760, 0.386637, 0.977461, 0.312707, 0.072996, 0.641918, 0.524458,
+    0.934856, 0.798598, 0.928951, 0.336899, 0.327793, 0.779995, 0.237115,
+    0.983460, 0.763746, 0.139196, 0.962560, 0.401218, 0.597389, 0.553771,
+    0.484890, 0.173347, 0.219322, 0.665496, 0.030203, 0.988873, 0.354582,
+    0.638496, 0.434813, 0.090902, 0.210256, 0.821450, 0.068363, 0.522962,
+    0.894446, 0.710280, 0.047420, 0.829302, 0.508879, 0.976371, 0.166202,
+    0.836672, 0.756367, 0.403317, 0.820132, 0.520112, 0.542513, 0.782691,
+    0.921330, 0.139902};
+
+static float fully_connected_golden_output[] = {
+    0,        0.0732134,   0,        0,          0,         0.280859,
+    0,        0.128927,    0,        0.0777251,  0,         0.270268,
+    0.271435, 0.0173503,   0.335465, 0.235562,
+
+    0,        0.0745866,   0,        0.051611,   0,         0.253876,
+    0,        0.0814873,   0,        0.104104,   0,         0.248529,
+    0.264194, 0,           0.302973, 0.166252,
+
+    0,        0.0170409,   0,        0.0509851,  0,         0.212834,
+    0,        0.0208326,   0,        0.129932,   0.203978,  0.103428,
+    0.298051, 0,           0.332233, 0.00445903,
+
+    0,        0.125246,    0,        0.0735336,  0,         0.0910256,
+    0,        0,           0,        0.18933,    0.378111,  0.0712443,
+    0.277298, 0.0123414,   0.267454, 0,
+
+    0,        0.14687,     0,        0.155495,   0.0300215, 0.147256,
+    0,        0,           0,        0.156412,   0.434914,  0.0461529,
+    0.246508, 0,           0.363138, 0,
+
+    0,        0,           0,        0.0212949,  0,         0.301708,
+    0,        0.35497,     0,        0.406223,   0.0260211, 0.049195,
+    0.197161, 0,           0.37316,  0,
+
+    0,        0.221783,    0,        0,          0.0116515, 0.281945,
+    0,        0,           0,        0,          0.285626,  0.181773,
+    0.296401, 0.170452,    0.367135, 0.142597,
+
+    0,        0,           0,        0,          0,         0.418886,
+    0,        0.291063,    0,        0.227541,   0.0424759, 0.27589,
+    0.398286, 0.177146,    0.40359,  0.121452,
+
+    0,        0.0834884,   0,        0,          0,         0.287441,
+    0,        0.0046838,   0,        0.0122087,  0,         0.217376,
+    0.140183, 0.0948412,   0.436677, 0.0589876,
+
+    0,        0.0289969,   0,        0.0921397,  0,         0.396802,
+    0,        0.0126157,   0,        0.0968433,  0,         0.172271,
+    0.173295, 0.0664741,   0.53645,  0.00915603,
+
+    0,        0,           0,        0,          0,         0.147942,
+    0,        0.263795,    0,        0.39782,    0,         0.382435,
+    0.561072, 0.0579847,   0.145712, 0.13508,
+
+    0,        0,           0,        0.16382,    0,         0.322294,
+    0,        0.163798,    0,        0.405211,   0.367953,  0.076852,
+    0.342473, 0.0834118,   0.377537, 0,
+
+    0,        0.206,       0,        0,          0,         0.375769,
+    0,        0,           0,        0,          0,         0.125165,
+    0,        0.105591,    0.52055,  0.0536445,
+
+    0,        0.259261,    0,        0,          0,         0.247707,
+    0,        0,           0,        0,          0,         0.215862,
+    0.149153, 0.224678,    0.359519, 0.129419,
+
+    0,        0.17611,     0,        0.280895,   0,         0.576484,
+    0,        0.000418848, 0,        0,          0,         0.151112,
+    0.211902, 0,           0.566341, 0.106305,
+
+    0,        0.0246284,   0,        0,          0,         0.196267,
+    0,        0.0248624,   0,        0.265635,   0,         0.436199,
+    0.408079, 0.134514,    0.328489, 0.411368};
+
+class BaseFullyConnectedOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): test different activation types too.
+  BaseFullyConnectedOpModel(int units, int batches, const TensorData& input,
+                            const TensorData& output = {TensorType_FLOAT32})
+      : batches_(batches), units_(units) {
+    int total_input_size = 1;
+    for (int i = 0; i < input.shape.size(); ++i) {
+      total_input_size *= input.shape[i];
+    }
+    input_size_ = total_input_size / batches_;
+
+    input_ = AddInput(input);
+    weights_ =
+        AddInput({input.type, {units_, input_size_}, input.min, input.max});
+
+    if (input.type == TensorType_FLOAT32) {
+      bias_ = AddInput({TensorType_FLOAT32, {units_}});
+    } else {
+      // This is a quantized version. The scale of 'bias' depends on the scales
+      // of input and filter. Supposedly this is correctly set during quantized
+      // training.
+      auto bias_scale = GetScale(input_) * GetScale(weights_);
+      TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
+      bias_ = AddInput(bias);
+    }
+
+    output_ = AddOutput(output);
+
+    SetBuiltinOp(
+        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
+        CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
+            .Union());
+    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+  }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+
+ protected:
+  int input_;
+  int weights_;
+  int bias_;
+  int output_;
+
+  int batches_;
+  int units_;
+  int input_size_;
+};
+
+class FloatFullyConnectedOpModel : public BaseFullyConnectedOpModel {
+ public:
+  using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
+
+  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
+
+  void SetWeights(std::initializer_list<float> f) {
+    PopulateTensor(weights_, f);
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
+ public:
+  using BaseFullyConnectedOpModel::BaseFullyConnectedOpModel;
+
+  void SetBias(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int32_t>(bias_, data);
+  }
+  void SetWeights(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(weights_, data);
+  }
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+// TODO(ahentz): add more small tests like this one, focused on making sure the
+// calculations are correct.
+TEST(FullyConnectedOpTest, SimpleTest) {
+  FloatFullyConnectedOpModel m(3, 2, {TensorType_FLOAT32, {2, 10}});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
+}
+
+TEST(FullyConnectedOpTest, SimpleTestQuantized) {
+  QuantizedFullyConnectedOpModel m(
+      3, 2,
+      /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_UINT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({
+                                            24, 25, 26,  //
+                                            58, 59, 60,  //
+                                        })));
+  EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
+}
+
+TEST(FullyConnectedOpTest, SimpleTest4DInput) {
+  // Note that it is not required that the first dimension be the number of
+  // batches. All we care is that the input can be evenly distributed in
+  // batches. In this case, we need the input to have multiples of '2'.
+  FloatFullyConnectedOpModel m(/*units=*/3,
+                               /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // first batch
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // second batch
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 24, 25, 26,  // first batch
+                                 58, 59, 60,  // second batch
+                             }));
+}
+
+TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
+  QuantizedFullyConnectedOpModel m(
+      3, 2,
+      /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
+      /*output=*/{TensorType_UINT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({
+                                            24, 25, 26,  //
+                                            58, 59, 60,  //
+                                        })));
+  EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
+}
+
+// TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
+// to debug errors and doesn't necessarily test all the important details.
+TEST(FullyConnectedOpTest, BlackBoxTest) {
+  FloatFullyConnectedOpModel m(16, 2, {TensorType_FLOAT32, {2, 8}});
+  m.SetWeights(
+      {0.091327,  0.103366,  -0.316505, -0.083120, 0.149366,  -0.196636,
+       -0.123672, 0.062800,  0.063031,  0.191670,  -0.062001, -0.061504,
+       -0.275581, 0.059388,  -0.118497, -0.079224, 0.109758,  0.008307,
+       -0.062657, -0.060962, -0.049782, -0.106719, -0.319482, -0.103650,
+       0.266455,  0.051517,  -0.123448, 0.322464,  0.043282,  -0.173782,
+       -0.190381, 0.002013,  0.096086,  0.131157,  0.031164,  0.100638,
+       -0.312191, -0.080923, -0.101318, -0.116614, 0.142238,  0.086540,
+       -0.139154, 0.174268,  -0.073161, 0.080072,  0.006874,  0.229382,
+       -0.104321, -0.176035, -0.208587, -0.001019, -0.162032, 0.080824,
+       -0.025021, 0.074460,  -0.252595, -0.161750, -0.136403, 0.008308,
+       0.005710,  0.096600,  0.289839,  0.218816,  -0.304651, -0.070958,
+       0.054598,  0.147113,  -0.139112, -0.072798, -0.163335, -0.167863,
+       -0.128762, -0.035780, 0.117262,  0.017177,  0.263335,  -0.176612,
+       0.262961,  -0.093654, -0.339283, 0.333071,  0.180827,  0.287583,
+       0.066350,  -0.197947, -0.114449, -0.236035, 0.103532,  -0.034284,
+       0.093299,  -0.145361, 0.054001,  0.250570,  0.157010,  -0.143480,
+       -0.139061, -0.048873, 0.067557,  0.139038,  0.324106,  0.227041,
+       0.037793,  -0.225747, -0.241619, 0.357835,  0.135762,  -0.306764,
+       -0.125982, 0.091916,  0.266587,  0.030135,  0.265148,  0.141627,
+       0.020120,  0.083815,  -0.124556, -0.100124, -0.048159, 0.181172,
+       0.302309,  -0.041084, 0.146334,  -0.061511, -0.232605, 0.281324,
+       0.145408,  -0.221897});
+  m.SetBias({-0.160594, 0.205770, -0.078307, -0.077984, 0.001937, 0.015860,
+             0.036810, 0.012346, 0.001028, 0.038551, 0.075415, 0.020804,
+             0.048478, -0.032270, 0.175688, -0.085662});
+
+  const int input_sequence_size = sizeof(fully_connected_input) /
+                                  sizeof(float) /
+                                  (m.input_size() * m.num_batches());
+  for (int i = 0; i < input_sequence_size; i++) {
+    // TODO(ahentz): This is what the original test was doing: two equal
+    // batches per invocation. We could instead use two different batches.
+    float* batch_start = fully_connected_input + i * m.input_size();
+    float* batch_end = batch_start + m.input_size();
+    m.SetInput(0, batch_start, batch_end);
+    m.SetInput(m.input_size(), batch_start, batch_end);
+
+    m.Invoke();
+
+    float* golden_start = fully_connected_golden_output + i * m.num_units();
+    float* golden_end = golden_start + m.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.cc b/tensorflow/contrib/lite/kernels/gemm_support.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb2b0aacf7ecc3ed5dbde5ccce7a46dcda0a93b3
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gemm_support.cc
@@ -0,0 +1,68 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/gemm_support.h"
+
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace gemm_support {
+
+struct RefCountedGemmContext {
+  gemmlowp::GemmContext* gemm_context_ = nullptr;
+  int num_references_ = 0;
+};
+
+void IncrementUsageCounter(TfLiteContext* context) {
+  auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context);
+  if (ptr == nullptr) {
+    ptr = new RefCountedGemmContext;
+    ptr->gemm_context_ = new gemmlowp::GemmContext();
+    ptr->num_references_ = 0;
+    context->gemm_context = ptr;
+  }
+  ptr->num_references_++;
+}
+
+void DecrementUsageCounter(TfLiteContext* context) {
+  auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context);
+  if (ptr == nullptr) {
+    TF_LITE_FATAL(
+        "Call to DecrementUsageCounter() not preceded by "
+        "IncrementUsageCounter()");
+  }
+  if (--ptr->num_references_ == 0) {
+    delete ptr->gemm_context_;
+    delete ptr;
+    context->gemm_context = nullptr;
+  }
+}
+
+gemmlowp::GemmContext* GetFromContext(TfLiteContext* context) {
+  auto* ptr = reinterpret_cast<RefCountedGemmContext*>(context->gemm_context);
+  if (ptr == nullptr) {
+    TF_LITE_FATAL(
+        "Call to GetFromContext() not preceded by IncrementUsageCounter()");
+  }
+  return ptr->gemm_context_;
+}
+
+void SetMaxNumThreads(TfLiteContext* context, int num_threads) {
+  IncrementUsageCounter(context);
+  GetFromContext(context)->set_max_num_threads(num_threads);
+  DecrementUsageCounter(context);
+}
+
+}  // namespace gemm_support
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/gemm_support.h b/tensorflow/contrib/lite/kernels/gemm_support.h
new file mode 100644
index 0000000000000000000000000000000000000000..b531959ffb143c774ee715743480b03ebfbdc114
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/gemm_support.h
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+namespace gemm_support {
+
+// Returns the GemmContext stored in 'context', allowing multiple ops to
+// share a single object, as long as they share a TfLiteContext. The caller
+// must ensure that this is called between IncrementUsageCounter() and
+// DecrementUsageCounter(). For example, in the implementation of an op:
+//   void* Init(TfLiteContext* context, const char*, size_t) {
+//     gemm_support::IncrementUsageCounter(context);
+//     return nullptr;
+//   }
+//   void Free(TfLiteContext* context, void*) {
+//     gemm_support::DecrementUsageCounter(context);
+//   }
+//   TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+//     auto* gemm_context = gemm_support::GetFromContext(context);
+//   }
+gemmlowp::GemmContext* GetFromContext(TfLiteContext* context);
+
+// Let the framework know that the GemmContext stored in 'context' will be used
+// by an op. If necessary a new GemmContext is created and placed in 'context'.
+void IncrementUsageCounter(TfLiteContext* context);
+
+// Let the framework know that the op stopped using the GemmContext stored in
+// 'context'. If there are no more usages the GemmContext will be deleted.
+void DecrementUsageCounter(TfLiteContext* context);
+
+// Set the maximum number threads available for gemmlowp operations.
+void SetMaxNumThreads(TfLiteContext* context, int num_threads);
+
+}  // namespace gemm_support
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_GEMM_SUPPORT_H_
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b82601d119b2e4946db6e3577300168c7e710b6
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
@@ -0,0 +1,155 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Op that looks up items from hashtable.
+//
+// Input:
+//     Tensor[0]: Hash key to lookup, dim.size == 1, int32
+//     Tensor[1]: Key of hashtable, dim.size == 1, int32
+//                *MUST* be sorted in ascending order.
+//     Tensor[2]: Value of hashtable, dim.size >= 1
+//                Tensor[1].Dim[0] == Tensor[2].Dim[0]
+//
+// Output:
+//   Output[0].dim[0] == Tensor[0].dim[0], num of lookups
+//   Each item in output is a raw bytes copy of corresponding item in input.
+//   When key does not exist in hashtable, the returned bytes are all 0s.
+//
+//   Output[1].dim = { Tensor[0].dim[0] }, num of lookups
+//   Each item indicates whether the corresponding lookup has a returned value.
+//   0 for missing key, 1 for found key.
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+namespace {
+
+int greater(const void* a, const void* b) {
+  return *static_cast<const int*>(a) - *static_cast<const int*>(b);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
+
+  TfLiteTensor* lookup = GetInput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
+  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
+
+  TfLiteTensor* key = GetInput(context, node, 1);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(key), 1);
+  TF_LITE_ENSURE_EQ(context, key->type, kTfLiteInt32);
+
+  TfLiteTensor* value = GetInput(context, node, 2);
+  TF_LITE_ENSURE(context, NumDimensions(value) >= 1);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(key, 0),
+                    SizeOfDimension(value, 0));
+  if (value->type == kTfLiteString) {
+    TF_LITE_ENSURE_EQ(context, NumDimensions(value), 1);
+  }
+
+  TfLiteTensor* hits = GetOutput(context, node, 1);
+  TF_LITE_ENSURE_EQ(context, hits->type, kTfLiteUInt8);
+  TfLiteIntArray* hitSize = TfLiteIntArrayCreate(1);
+  hitSize->data[0] = SizeOfDimension(lookup, 0);
+
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, value->type, output->type);
+
+  TfLiteStatus status = kTfLiteOk;
+  if (output->type != kTfLiteString) {
+    TfLiteIntArray* outputSize = TfLiteIntArrayCreate(NumDimensions(value));
+    outputSize->data[0] = SizeOfDimension(lookup, 0);
+    for (int i = 1; i < NumDimensions(value); i++) {
+      outputSize->data[i] = SizeOfDimension(value, i);
+    }
+    status = context->ResizeTensor(context, output, outputSize);
+  }
+  if (context->ResizeTensor(context, hits, hitSize) == kTfLiteError) {
+    status = kTfLiteError;
+  }
+  return status;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* hits = GetOutput(context, node, 1);
+  TfLiteTensor* lookup = GetInput(context, node, 0);
+  TfLiteTensor* key = GetInput(context, node, 1);
+  TfLiteTensor* value = GetInput(context, node, 2);
+
+  const int num_rows = SizeOfDimension(value, 0);
+  const int row_bytes = value->bytes / num_rows;
+  void* pointer = nullptr;
+  DynamicBuffer buf;
+
+  for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
+    int idx = -1;
+    pointer = bsearch(&(lookup->data.i32[i]), key->data.i32, num_rows,
+                      sizeof(int32_t), greater);
+    if (pointer != nullptr) {
+      idx = (reinterpret_cast<char*>(pointer) - (key->data.raw)) /
+            sizeof(int32_t);
+    }
+
+    if (idx >= num_rows || idx < 0) {
+      if (output->type == kTfLiteString) {
+        buf.AddString(nullptr, 0);
+      } else {
+        memset(output->data.raw + i * row_bytes, 0, row_bytes);
+      }
+      hits->data.uint8[i] = 0;
+    } else {
+      if (output->type == kTfLiteString) {
+        buf.AddString(GetString(value, idx));
+      } else {
+        memcpy(output->data.raw + i * row_bytes,
+               value->data.raw + idx * row_bytes, row_bytes);
+      }
+      hits->data.uint8[i] = 1;
+    }
+  }
+  if (output->type == kTfLiteString) {
+    buf.WriteToTensor(output);
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteRegistration* Register_HASHTABLE_LOOKUP() {
+  static TfLiteRegistration r = {nullptr, nullptr, Prepare, Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb6038f9009a3865661e7b4f075c3033166d0f91
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup_test.cc
@@ -0,0 +1,176 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite Lookup op.
+
+#include <iomanip>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class HashtableLookupOpModel : public SingleOpModel {
+ public:
+  HashtableLookupOpModel(std::initializer_list<int> lookup_shape,
+                         std::initializer_list<int> key_shape,
+                         std::initializer_list<int> value_shape,
+                         TensorType type) {
+    lookup_ = AddInput(TensorType_INT32);
+    key_ = AddInput(TensorType_INT32);
+    value_ = AddInput(type);
+    output_ = AddOutput(type);
+    hit_ = AddOutput(TensorType_UINT8);
+    SetBuiltinOp(BuiltinOperator_HASHTABLE_LOOKUP, BuiltinOptions_NONE, 0);
+    BuildInterpreter({lookup_shape, key_shape, value_shape});
+  }
+
+  void SetLookup(std::initializer_list<int> data) {
+    PopulateTensor<int>(lookup_, data);
+  }
+
+  void SetHashtableKey(std::initializer_list<int> data) {
+    PopulateTensor<int>(key_, data);
+  }
+
+  void SetHashtableValue(const std::vector<string>& content) {
+    PopulateStringTensor(value_, content);
+  }
+
+  void SetHashtableValue(const std::function<float(int)>& function) {
+    TfLiteTensor* tensor = interpreter_->tensor(value_);
+    int rows = tensor->dims->data[0];
+    for (int i = 0; i < rows; i++) {
+      tensor->data.f[i] = function(i);
+    }
+  }
+
+  void SetHashtableValue(const std::function<float(int, int)>& function) {
+    TfLiteTensor* tensor = interpreter_->tensor(value_);
+    int rows = tensor->dims->data[0];
+    int features = tensor->dims->data[1];
+    for (int i = 0; i < rows; i++) {
+      for (int j = 0; j < features; j++) {
+        tensor->data.f[i * features + j] = function(i, j);
+      }
+    }
+  }
+
+  std::vector<string> GetStringOutput() {
+    TfLiteTensor* output = interpreter_->tensor(output_);
+    int num = GetStringCount(output);
+    std::vector<string> result(num);
+    for (int i = 0; i < num; i++) {
+      auto ref = GetString(output, i);
+      result[i] = string(ref.str, ref.len);
+    }
+    return result;
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<uint8_t> GetHit() { return ExtractVector<uint8_t>(hit_); }
+
+ private:
+  int lookup_;
+  int key_;
+  int value_;
+  int output_;
+  int hit_;
+};
+
+// TODO(yichengfan): write more tests that exercise the details of the op,
+// such as lookup errors and variable input shapes.
+TEST(HashtableLookupOpTest, Test2DInput) {
+  HashtableLookupOpModel m({4}, {3}, {3, 2}, TensorType_FLOAT32);
+
+  m.SetLookup({1234, -292, -11, 0});
+  m.SetHashtableKey({-11, 0, 1234});
+  m.SetHashtableValue([](int i, int j) { return i + j / 10.0f; });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 2.0, 2.1,  // 2-nd item
+                                 0, 0,      // Not found
+                                 0.0, 0.1,  // 0-th item
+                                 1.0, 1.1,  // 1-st item
+                             })));
+  EXPECT_THAT(m.GetHit(), ElementsAreArray({
+                              1, 0, 1, 1,
+                          }));
+}
+
+TEST(HashtableLookupOpTest, Test1DInput) {
+  HashtableLookupOpModel m({4}, {3}, {3}, TensorType_FLOAT32);
+
+  m.SetLookup({1234, -292, -11, 0});
+  m.SetHashtableKey({-11, 0, 1234});
+  m.SetHashtableValue([](int i) { return i * i / 10.0f; });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0.4,  // 2-nd item
+                                 0,    // Not found
+                                 0.0,  // 0-th item
+                                 0.1,  // 1-st item
+                             })));
+  EXPECT_THAT(m.GetHit(), ElementsAreArray({
+                              1,
+                              0,
+                              1,
+                              1,
+                          }));
+}
+
+TEST(HashtableLookupOpTest, TestString) {
+  HashtableLookupOpModel m({4}, {3}, {3}, TensorType_STRING);
+
+  m.SetLookup({1234, -292, -11, 0});
+  m.SetHashtableKey({-11, 0, 1234});
+  m.SetHashtableValue({"Hello", "", "Hi"});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetStringOutput(), ElementsAreArray({
+                                       "Hi",     // 2-nd item
+                                       "",       // Not found
+                                       "Hello",  // 0-th item
+                                       "",       // 1-st item
+                                   }));
+  EXPECT_THAT(m.GetHit(), ElementsAreArray({
+                              1,
+                              0,
+                              1,
+                              1,
+                          }));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..288534099b9e090ce0c223a401b4152ca6ffb61f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -0,0 +1,359 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+
+tflite_deps_intel = [
+    "@arm_neon_2_x86_sse",
+]
+
+NEON_FLAGS_IF_APPLICABLE = select({
+    ":arm": [
+        "-O3",
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
+    ],
+    ":armeabi-v7a": [
+        "-O3",
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
+    ],
+    ":armv7a": [
+        "-O3",
+        "-mfpu=neon",
+        "-mfloat-abi=softfp",
+    ],
+    "//conditions:default": [
+        "-O3",
+    ],
+})
+
+cc_library(
+    name = "types",
+    srcs = [],
+    hdrs = [
+        "compatibility.h",
+        "types.h",
+    ],
+)
+
+config_setting(
+    name = "arm",
+    values = {
+        "cpu": "arm",
+    },
+)
+
+config_setting(
+    name = "arm64-v8a",
+    values = {
+        "cpu": "arm64-v8a",
+    },
+)
+
+config_setting(
+    name = "armv7a",
+    values = {
+        "cpu": "armv7a",
+    },
+)
+
+config_setting(
+    name = "armeabi-v7a",
+    values = {
+        "cpu": "armeabi-v7a",
+    },
+)
+
+config_setting(
+    name = "haswell",
+    values = {
+        "cpu": "haswell",
+    },
+)
+
+config_setting(
+    name = "ios_x86_64",
+    values = {
+        "cpu": "ios_x86_64",
+    },
+)
+
+config_setting(
+    name = "ios_armv7",
+    values = {
+        "cpu": "ios_armv7",
+    },
+)
+
+config_setting(
+    name = "ios_arm64",
+    values = {
+        "cpu": "ios_arm64",
+    },
+)
+
+config_setting(
+    name = "k8",
+    values = {
+        "cpu": "k8",
+    },
+)
+
+config_setting(
+    name = "x86",
+    values = {
+        "cpu": "x86",
+    },
+)
+
+config_setting(
+    name = "x86_64",
+    values = {
+        "cpu": "x86_64",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {
+        "cpu": "darwin",
+    },
+)
+
+cc_library(
+    name = "optimized_base",
+    srcs = [],
+    hdrs = [
+        "common.h",
+        "optimized/depthwiseconv_float.h",
+        "optimized/depthwiseconv_uint8.h",
+        "optimized/optimized_ops.h",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        ":types",
+        ":round",
+        "//third_party/eigen3",
+        "@gemmlowp//:gemmlowp",
+        "//tensorflow/contrib/lite:builtin_op_data",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "optimized",
+    hdrs = [
+        "optimized/eigen_spatial_convolutions.h",
+        "optimized/eigen_tensor_reduced_instantiations_oss.h",
+        "optimized/multithreaded_conv.h",
+        "tensor.h",
+    ],
+    deps = [
+        ":optimized_base",
+        ":types",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:context",
+        "//third_party/eigen3",
+    ],
+)
+
+cc_test(
+    name = "tensor_test",
+    srcs = ["tensor_test.cc"],
+    deps = [
+        ":reference",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "round",
+    srcs = [],
+    hdrs = ["round.h"],
+)
+
+cc_library(
+    name = "quantization_util",
+    srcs = ["quantization_util.cc"],
+    hdrs = [
+        "compatibility.h",
+        "quantization_util.h",
+    ],
+    deps = [":round"],
+)
+
+cc_test(
+    name = "quantization_util_test",
+    srcs = ["quantization_util_test.cc"],
+    deps = [
+        ":quantization_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "reference_base",
+    srcs = [],
+    hdrs = [
+        "common.h",
+        "reference/depthwiseconv_float.h",
+        "reference/depthwiseconv_uint8.h",
+        "reference/reference_ops.h",
+    ],
+    deps = [
+        ":round",
+        ":types",
+        "//third_party/eigen3",
+        "@gemmlowp//:gemmlowp",
+        "//tensorflow/contrib/lite:builtin_op_data",
+    ] + select({
+        ":haswell": tflite_deps_intel,
+        ":ios_x86_64": tflite_deps_intel,
+        ":k8": tflite_deps_intel,
+        ":x86": tflite_deps_intel,
+        ":x86_64": tflite_deps_intel,
+        ":darwin": tflite_deps_intel,
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "reference",
+    hdrs = ["tensor.h"],
+    deps = [
+        ":types",
+        "//tensorflow/contrib/lite:context",
+    ],
+)
+
+cc_library(
+    name = "portable_tensor_utils",
+    srcs = [
+        "reference/portable_tensor_utils.cc",
+    ],
+    hdrs = [
+        "reference/portable_tensor_utils.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite/kernels:activation_functor",
+        "//tensorflow/contrib/lite/kernels:op_macros",
+    ],
+)
+
+cc_library(
+    name = "neon_tensor_utils",
+    srcs = [
+        "optimized/neon_tensor_utils.cc",
+    ],
+    hdrs = [
+        "optimized/neon_tensor_utils.h",
+        "optimized/tensor_utils_impl.h",
+    ],
+    copts = NEON_FLAGS_IF_APPLICABLE,
+    deps = [
+        ":cpu_check",
+        ":portable_tensor_utils",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite/kernels:activation_functor",
+    ],
+)
+
+cc_library(
+    name = "tensor_utils",
+    srcs = [
+        "tensor_utils.cc",
+    ],
+    hdrs = [
+        "optimized/tensor_utils_impl.h",
+        "reference/portable_tensor_utils.h",
+        "tensor_utils.h",
+    ],
+    copts = NEON_FLAGS_IF_APPLICABLE,
+    deps = [
+        "//tensorflow/contrib/lite/kernels:activation_functor",
+        "//tensorflow/contrib/lite:builtin_op_data",
+    ] + select({
+        ":arm": [
+            ":neon_tensor_utils",
+        ],
+        ":arm64-v8a": [
+            ":neon_tensor_utils",
+        ],
+        ":armeabi-v7a": [
+            ":neon_tensor_utils",
+        ],
+        ":armv7a": [
+            ":neon_tensor_utils",
+        ],
+        ":ios_armv7": [
+            ":neon_tensor_utils",
+        ],
+        ":ios_arm64": [
+            ":neon_tensor_utils",
+        ],
+        "//conditions:default": [
+            ":portable_tensor_utils",
+        ],
+    }),
+)
+
+cc_test(
+    name = "tensor_utils_test",
+    srcs = ["tensor_utils_test.cc"],
+    copts = NEON_FLAGS_IF_APPLICABLE,
+    linkopts = select({
+        "//tensorflow:android": [
+            "-fPIE -pie",
+        ],
+        "//conditions:default": [],
+    }),
+    linkstatic = 1,
+    deps = [
+        ":tensor_utils",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "cpu_check",
+    hdrs = [
+        "optimized/cpu_check.h",
+    ],
+    deps = [
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "@androidndk//:cpufeatures",
+            ],
+            "//conditions:default": [],
+        },
+    ),
+)
+
+exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..28f19a250629aec4d03aa71df57d31d8a5014e9f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -0,0 +1,107 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
+
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif
+#endif
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined __GNUC__ && defined __SSE4_1__
+#define USE_NEON
+
+#define OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#pragma GCC diagnostic ignored "-Wattributes"
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wnarrowing"
+#pragma GCC diagnostic ignored "-Wsequence-point"
+
+#include "NEON_2_SSE.h"
+
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+inline void GetActivationMinMax(FusedActivationFunctionType ac,
+                                float* output_activation_min,
+                                float* output_activation_max) {
+  switch (ac) {
+    case FusedActivationFunctionType::kNone:
+      *output_activation_min = std::numeric_limits<float>::lowest();
+      *output_activation_max = std::numeric_limits<float>::max();
+      break;
+    case FusedActivationFunctionType::kRelu:
+      *output_activation_min = 0.f;
+      *output_activation_max = std::numeric_limits<float>::max();
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      *output_activation_min = -1.f;
+      *output_activation_max = 1.f;
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      *output_activation_min = 0.f;
+      *output_activation_max = 6.f;
+      break;
+  }
+}
+
+inline float ActivationFunctionWithMinMax(float x, float output_activation_min,
+                                          float output_activation_max) {
+  return std::min(std::max(x, output_activation_min), output_activation_max);
+}
+
+// Legacy function, left for compatibility only.
+template <FusedActivationFunctionType Ac>
+float ActivationFunction(float x) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  return ActivationFunctionWithMinMax(x, output_activation_min,
+                                      output_activation_max);
+}
+
+inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
+    int32 x, int32 quantized_multiplier, int right_shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return RoundingDivideByPOT(
+      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
+}
+
+inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
+    int32 x, int32 quantized_multiplier, int left_shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                           quantized_multiplier);
+}
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/compatibility.h b/tensorflow/contrib/lite/kernels/internal/compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..796a03566a4bf971294dd2375f590dfd20d600f7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/compatibility.h
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+
+#ifndef TFLITE_DCHECK
+#define TFLITE_DCHECK(condition) (condition) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_EQ
+#define TFLITE_DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_GE
+#define TFLITE_DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_GT
+#define TFLITE_DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_LE
+#define TFLITE_DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false)
+#endif
+
+#ifndef TFLITE_DCHECK_LT
+#define TFLITE_DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false)
+#endif
+
+// TODO(ahentz): Clean up: We should stick to the DCHECK versions.
+#ifndef TFLITE_CHECK
+#define TFLITE_CHECK(condition) (condition) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_EQ
+#define TFLITE_CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_GE
+#define TFLITE_CHECK_GE(x, y) ((x) >= (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_GT
+#define TFLITE_CHECK_GT(x, y) ((x) > (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_LE
+#define TFLITE_CHECK_LE(x, y) ((x) <= (y)) ? (void)0 : abort()
+#endif
+
+#ifndef TFLITE_CHECK_LT
+#define TFLITE_CHECK_LT(x, y) ((x) < (y)) ? (void)0 : abort()
+#endif
+
+// TODO(ahentz): Clean up.
+using uint8 = std::uint8_t;
+using int16 = std::int16_t;
+using uint16 = std::uint16_t;
+using int32 = std::int32_t;
+using uint32 = std::uint32_t;
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMPATIBILITY_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
new file mode 100644
index 0000000000000000000000000000000000000000..dea46cc12065ed34cf681916a46a55bd7a86f463
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
+#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
+
+namespace tflite {
+
+#ifdef __ANDROID__
+#include "ndk/sources/android/cpufeatures/cpu-features.h"
+
+// Runtime check for Neon support on Android.
+inline bool TestCPUFeatureNeon() {
+#ifdef __aarch64__
+  // ARM-64 always has NEON support.
+  return true;
+#else
+  static bool kUseAndroidNeon =
+      (android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
+       android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_ARMv7 &&
+       android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
+  return kUseAndroidNeon;
+#endif  // __aarch64__
+}
+
+#elif __ARM_NEON
+
+inline bool TestCPUFeatureNeon() {
+  return true;
+}
+
+#else
+
+inline bool TestCPUFeatureNeon() {
+  return false;
+}
+
+#endif
+
+}  // namespace tflite
+
+// NEON_OR_PORTABLE(SomeFunc, arcs) calls NeonSomeFunc(args) if Neon is both
+// enabled at build time and detected at runtime, or PortableSomeFunc(args)
+// otherwise.
+#ifdef __ARM_ARCH_5TE__
+// Neon isn't available at all on ARMv5.
+#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__)
+#else
+#define NEON_OR_PORTABLE(funcname, ...)              \
+  TestCPUFeatureNeon() ? Neon##funcname(__VA_ARGS__) \
+                       : Portable##funcname(__VA_ARGS__)
+#endif
+
+#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..da34c8aef94b1c69e661bd33fcb518e73034c4bd
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -0,0 +1,1060 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Implementation of float DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct FloatDepthwiseConvKernel {};
+
+#ifdef USE_NEON
+
+template <>
+struct FloatDepthwiseConvKernel<false, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      float32x4_t input[4];
+      for (int i = 0; i < 4; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 16;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
+      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
+      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
+      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 8;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<false, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    const float32x2_t filters = vld1_f32(filter_ptr);
+    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the inputs
+      float32x4_t input[4];
+      for (int i = 0; i < 4; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 16;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      input_ptr += 8;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      const float32x4_t input = vld1q_f32(input_ptr);
+      input_ptr += 4;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filters_dup2);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle 1 output pixel at a time
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float32x2_t input = vld1_f32(input_ptr);
+      input_ptr += 2;
+      // Load the accumulators from acc_buffer
+      float32x2_t acc = vld1_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmla_f32(acc, input, filters);
+      // Store the accumulators back to acc_buffer
+      vst1_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 16 input channels at a time.
+      for (; ic <= input_depth - 16; ic += 16) {
+        // Load the filters
+        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
+        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
+        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
+        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
+        local_filter_ptr += 16;
+        // Load the inputs
+        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
+        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
+        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
+        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
+        local_input_ptr += 16;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+        // Multiply-accumulate
+        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
+        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
+        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
+        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
+        // Store the accumulators back to acc_buffer
+        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+        acc_buffer_ptr += 16;
+      }
+      // Handle 4 input channels at a time.
+      for (; ic <= input_depth - 4; ic += 4) {
+        // Load the filters
+        float32x4_t filter;
+        filter = vld1q_f32(local_filter_ptr);
+        local_filter_ptr += 4;
+        // Load the inputs
+        float32x4_t input;
+        input = vld1q_f32(local_input_ptr);
+        local_input_ptr += 4;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc;
+        acc = vld1q_f32(acc_buffer_ptr);
+        // Multiply-accumulate
+        acc = vmlaq_f32(acc, input, filter);
+        // Store the accumulators back to acc_buffer
+        vst1q_f32(acc_buffer_ptr, acc);
+        acc_buffer_ptr += 4;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const float input_val = *local_input_ptr++;
+        const float filter_val = *local_filter_ptr++;
+        *acc_buffer_ptr++ += filter_val * input_val;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 2 input channels at a time.
+      for (; ic <= input_depth - 2; ic += 2) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        const float32x2_t input = vld1_f32(local_input_ptr);
+        local_input_ptr += 2;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
+        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
+        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
+        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the filters
+        float32x4_t filter[2];
+        for (int i = 0; i < 2; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 8;
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+// Note this implementation is very slow for input_depths < 8
+// (e.g. comparable to reference implementation) see, specializations for
+// input_depth=3 below.
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        float32x4x2_t input_dup2[2];
+        for (int i = 0; i < 2; i++) {
+          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
+          input_dup2[i] = vzipq_f32(input, input);
+        }
+        local_input_ptr += 8;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
+        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
+        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
+        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle 4 input channels at a time.
+      for (; ic <= input_depth - 4; ic += 4) {
+        // Load the filters
+        float32x2_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
+        }
+        local_filter_ptr += 8;
+        // Load the inputs
+        const float32x4_t input = vld1q_f32(local_input_ptr);
+        local_input_ptr += 4;
+        // Load the accumulators from acc_buffer
+        float32x2_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
+        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
+        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
+        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      // Handle 2 input channels at a time.
+      for (; ic <= input_depth - 2; ic += 2) {
+        // Load the filters
+        const float32x4_t filter = vld1q_f32(local_filter_ptr);
+        local_filter_ptr += 4;
+        // Load the inputs
+        const float32x2_t input = vld1_f32(local_input_ptr);
+        local_input_ptr += 2;
+        // Load the accumulators from acc_buffer
+        float32x2_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
+        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+        }
+        acc_buffer_ptr += 4;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Multiply-accumulate
+        for (int i = 0; i < 2; i++) {
+          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
+        }
+        local_filter_ptr += 2;
+        acc_buffer_ptr += 2;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x2_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1_f32(filter_ptr + 2 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x2_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+      }
+      // Multiply-accumulate for each input channel there 2 outputs
+      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+      }
+      acc_buffer_ptr += 6;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 3, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[3];
+    for (int i = 0; i < 3; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // NOTE: we only want 3 values, so we read it as two ops where
+      // the second op just duplicates the lane
+      const float32x2_t input01 = vld1_f32(input_ptr);
+      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[3];
+      for (int i = 0; i < 3; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate all outputs.
+      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
+      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
+      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 3; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 12;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float input_val = *input_ptr;
+      input_ptr += input_ptr_increment;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 1, 32> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
+    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
+    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      const float input_val = *input_ptr;
+      input_ptr += input_ptr_increment;
+      // Load the accumulators from acc_buffer
+      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
+      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
+      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
+      // Multiply-accumulate
+      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
+      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
+      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
+      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
+      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
+      acc_buffer_ptr += 32;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 0, 16> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const float* local_filter_ptr = filter_ptr;
+      const float* local_input_ptr = input_ptr;
+      for (int ic = 0; ic < input_depth; ic++) {
+        // Load the filters
+        float32x4_t filter[4];
+        for (int i = 0; i < 4; i++) {
+          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+        }
+        local_filter_ptr += 16;
+        // Load the inputs
+        const float input_val = *local_input_ptr++;
+        // Load the accumulators from acc_buffer
+        float32x4_t acc[4];
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        for (int i = 0; i < 4; i++) {
+          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 4; i++) {
+          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    // Load the filters
+    float32x4_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vld1q_f32(filter_ptr + 4 * i);
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vld1q_f32(input_ptr + 4 * i);
+      }
+      // Load the accumulators from acc_buffer
+      float32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    float32x2_t filter = vld1_f32(filter_ptr);
+    float32x4_t filter_x4 = vcombine_f32(filter, filter);
+    int outp = 0;
+
+    // Handle two output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the inputs
+      float32x2_t input_1 = vld1_f32(input_ptr);
+      input_ptr += input_ptr_increment;
+      float32x2_t input_2 = vld1_f32(input_ptr);
+      input_ptr += input_ptr_increment;
+      float32x4_t input = vcombine_f32(input_1, input_2);
+
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filter_x4);
+
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x2_t input = vld1_f32(input_ptr);
+      input_ptr += input_ptr_increment;
+
+      // Load the accumulators from acc_buffer
+      float32x2_t acc = vld1_f32(acc_buffer_ptr);
+
+      // Multiply-accumulate
+      acc = vmla_f32(acc, input, filter);
+
+      // Store the accumulators back to acc_buffer
+      vst1_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct FloatDepthwiseConvKernel<true, 4, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const float* input_ptr, int input_ptr_increment,
+                  const float* filter_ptr, float* acc_buffer_ptr) {
+    float32x4_t filter = vld1q_f32(filter_ptr);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs
+      float32x4_t input = vld1q_f32(input_ptr);
+      // Load the accumulators from acc_buffer
+      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+      // Multiply-accumulate
+      acc = vmlaq_f32(acc, input, filter);
+      // Store the accumulators back to acc_buffer
+      vst1q_f32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width,
+                                const float* input_data, int pad_width,
+                                int depth_multiplier, int filter_width,
+                                const float* filter_data,
+                                int out_x_buffer_start, int out_x_buffer_end,
+                                int output_depth, float* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+  // Sanity check parameters. This is important in particular to ensure
+  // that we keep the number of template instantiations minimal, so we don't
+  // increase binary size unnecessarily.
+  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+  static_assert(kFixedInputDepth || kAllowStrided, "");
+  TFLITE_DCHECK(stride == 1 || kAllowStrided);
+  if (kFixedInputDepth) {
+    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
+  }
+  if (kFixedDepthMultiplier) {
+    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+  }
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  const int input_ptr_increment = stride * input_depth;
+  const float* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    // For the current (filter_x, filter_y) point in the filter,
+    // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclampled = 0;
+    int out_x_loop_end_unclampled = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclampled =
+            (pad_width - filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + stride - 1) / stride;
+      }
+    } else {
+      out_x_loop_start_unclampled = pad_width - filter_x;
+      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
+    }
+    // The kernel will have to iterate on the segment of the
+    // output row that starts at out_x_loop_start and out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+
+    float* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const float* input_ptr = input_data + in_x_origin * input_depth;
+    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
+                             kFixedDepthMultiplier>::Run(num_output_pixels,
+                                                         input_depth,
+                                                         depth_multiplier,
+                                                         input_ptr,
+                                                         input_ptr_increment,
+                                                         filter_base_ptr,
+                                                         acc_buffer_ptr);
+    filter_base_ptr += output_depth;
+  }
+}
+
+// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
+inline void FloatDepthwiseConvAccumRowGeneric(
+    int stride, int input_depth, int input_width, const float* input_data,
+    int pad_width, int depth_multiplier, int filter_width,
+    const float* filter_data, int out_x_buffer_start, int out_x_buffer_end,
+    int output_depth, float* acc_buffer) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+  LOG(FATAL)
+      << "\n\n"
+      << "*****************************************************************\n"
+      << "* This tfmini inference code was about to use the slow generic\n"
+      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
+      << "* to be aware of that so that you will know why you get terrible\n"
+      << "* performance.\n"
+      << "*\n"
+      << "* If you would like to carry on with the slow code, compile\n"
+      << "* with this preprocessor token defined:\n"
+      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+      << "*\n"
+      << "* The right thing to do, if you care about performance, is to add\n"
+      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
+      << "* The relevant parameters defining your case are:\n"
+      << "* stride = " << stride << "\n"
+      << "* input_depth = " << input_depth << "\n"
+      << "* depth_multiplier = " << depth_multiplier << "\n"
+      << "*\n"
+      << "* Please do not hesitate to contact benoitjacob@ with this\n"
+      << "* information.\n"
+      << "*****************************************************************\n";
+#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+  const float* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    const int out_x_loop_start = std::max(
+        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end,
+                 (pad_width + input_width - filter_x + stride - 1) / stride);
+
+    float* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const float* input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+      const float* filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic) {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++) {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float* bias_data,
+                                       float* acc_buffer) {
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++) {
+    memcpy(acc_buffer + i * output_depth, bias_data,
+           sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          const float* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          float output_activation_min,
+                          float output_activation_max, float* output_data,
+                          const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+  static const int kAccBufferMaxSize = 2048;
+  float acc_buffer[kAccBufferMaxSize];
+  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
+                   kAccBufferActualSize);
+  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
+  row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+                                        FIXED_DEPTH_MULTIPLIER)           \
+  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
+      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
+    row_accum_func =                                                      \
+        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
+                                   FIXED_DEPTH_MULTIPLIER>;               \
+  }
+
+#ifdef USE_NEON
+  // We go over our list of kernels by decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+
+  // Next come the strided kernels: AllowStrided=true, fixed input depth.
+  // They are a bit less efficient, but allow stride!=1.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth,
+  // these are the least efficient but most general kernels.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
+
+#endif  // USE_NEON
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+  // No matching fast kernel found, use slow fallback.
+  if (!row_accum_func) {
+    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
+  }
+
+  // Now that we have determined row_accum_func, we can start work.
+  float* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      const int filter_y_start = std::max(0, -in_y_origin);
+      const int filter_y_end =
+          std::min(filter_height, input_height - in_y_origin);
+      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+           out_x_buffer_start += kOutputPixelsInAccBuffer) {
+        const int out_x_buffer_end = std::min(
+            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activation that share all but the
+        // 'depth'/'channel' coordinate. num_output_pixels is the number of
+        // output pixels that we will accumulate in this loop iteration.
+        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+        // Initialize our local accumulator with the bias values, so we don't
+        // have to add them later.
+        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+                                   acc_buffer);
+        // Accumulation loop. Most of the time should be spent in here.
+        for (int filter_y = filter_y_start; filter_y < filter_y_end;
+             ++filter_y) {
+          const int in_y = in_y_origin + filter_y;
+          row_accum_func(stride_width, input_depth, input_width,
+                         input_data + in_y * input_dims.strides[2] +
+                             b * input_dims.strides[3],
+                         pad_width, depth_multiplier, filter_width,
+                         filter_data + filter_y * filter_dims.strides[2],
+                         out_x_buffer_start, out_x_buffer_end, output_depth,
+                         acc_buffer);
+        }
+        // Finished accumulating. Now store to destination.
+        const int num_output_values = output_depth * num_output_pixels;
+        int i = 0;
+// TODO(benoitjacob) optimized code goes here
+#ifdef USE_NEON
+        // Handle 16 values at a time
+        for (; i <= num_output_values - 16; i += 16) {
+          float32x4_t acc[4];
+          for (int k = 0; k < 4; k++) {
+            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
+          }
+            for (int k = 0; k < 4; k++) {
+              acc[k] = vmaxq_f32(
+                  vdupq_n_f32(output_activation_min),
+                  vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+            }
+          for (int k = 0; k < 4; k++) {
+            vst1q_f32(output_ptr + 4 * k, acc[k]);
+          }
+          output_ptr += 16;
+        }
+        // Handle 4 values at a time
+        for (; i <= num_output_values - 4; i += 4) {
+          float32x4_t acc = vld1q_f32(acc_buffer + i);
+
+          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
+                          vminq_f32(vdupq_n_f32(output_activation_max), acc));
+
+          vst1q_f32(output_ptr, acc);
+          output_ptr += 4;
+        }
+#endif
+        // Handle leftover values, one by one. This is very slow.
+        for (; i < num_output_values; i++) {
+          float acc = acc_buffer[i];
+          acc = std::max(output_activation_min,
+                         std::min(output_activation_max, acc));
+
+          *output_ptr++ = acc;
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, float* output_data,
+                   const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+                bias_dims, stride_width, stride_height, pad_width, pad_height,
+                depth_multiplier, output_activation_min, output_activation_max,
+                output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   float* output_data, const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+                    bias_dims, stride, stride, pad_width, pad_height,
+                    depth_multiplier, output_data, output_dims);
+}
+
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..051ed2a2c44a04f0473dfd26637e53865a5a51ac
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -0,0 +1,1916 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Implementation of quantized DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct QuantizedDepthwiseConvKernel {};
+
+#ifdef USE_NEON
+template <>
+struct QuantizedDepthwiseConvKernel<true, 8, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8x2_t filter_u8;
+    filter_u8.val[0] = vld1_u8(filter_ptr);
+    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
+                            vdupq_n_s16(filter_offset));
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4x2_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+      }
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += input_ptr_increment;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
+                                  vget_low_s16(input_dup2.val[i]));
+        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
+                                  vget_high_s16(input_dup2.val[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8[2];
+      for (int i = 0; i < 2; i++) {
+        input_u8[i] = vld1_u8(input_ptr + 8 * i);
+      }
+      input_ptr += 16;
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+      }
+      for (int i = 0; i < 2; i++) {
+        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+      }
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
+      acc[1] =
+          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
+      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
+      acc[3] =
+          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 1 output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[2];
+      acc[0] = vld1q_s32(acc_buffer_ptr);
+      acc[1] = vld1q_s32(acc_buffer_ptr + 4);
+
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
+      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc[0]);
+      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
+                                   vget_low_s16(input_dup2.val[i]));
+        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
+                                   vget_high_s16(input_dup2.val[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x4x2_t input_dup2 = vzip_s16(input, input);
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
+      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+    }
+    int outp = 0;
+    // Handle two output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[8];
+      for (int i = 0; i < 8; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Multiply-accumulate.
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
+      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
+      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
+      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 8; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 32;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += 2;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += 2;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
+      // Multiply-accumulate
+      acc = vmlal_s16(acc, filter, input_dup2);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8[2];
+      for (int i = 0; i < 2; i++) {
+        input_u8[i] = vld1_u8(input_ptr + 8 * i);
+      }
+      input_ptr += 16;
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+      }
+      for (int i = 0; i < 2; i++) {
+        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+      }
+
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
+      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
+      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
+      // Store the accumulators back to acc_buffer.
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer.
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+    // Handle 1 output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x2_t acc = vld1_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += 2;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+      // Store the accumulators back to acc_buffer.
+      vst1_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 1, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Duplicate the input values, 2-fold
+      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x2_t acc = vld1_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      const uint32 input = *input_ptr++ + input_offset;
+
+      // Multiply-accumulate
+      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
+      // Store the accumulators back to acc_buffer
+      vst1_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 1, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 8 output pixels at a time.
+    for (; outp <= num_output_pixels - 8; outp += 8) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[8];
+      for (int i = 0; i < 8; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
+      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
+      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
+      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
+      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
+      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
+      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
+      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
+
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 8; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 32;
+    }
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
+      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
+      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
+
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      const uint32 input = *input_ptr++ + input_offset;
+
+      // Multiply-accumulate
+      acc = vmlal_n_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+    // Handle 4 output pixels at a time.
+    for (; outp <= num_output_pixels - 4; outp += 4) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Load the inputs, add input_offset.
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
+        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      }
+      input_ptr += 16;
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] =
+            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
+        acc[2 * i + 1] =
+            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc;
+      acc = vld1q_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Multiply-accumulate
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 4, 4> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
+      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+    }
+
+    int outp = 0;
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[8];
+      for (int i = 0; i < 8; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += 8;
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
+                              vget_low_s16(input), 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
+                              vget_low_s16(input), 1);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
+                              vget_low_s16(input), 2);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
+                              vget_low_s16(input), 3);
+      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
+                              vget_high_s16(input), 0);
+      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
+                              vget_high_s16(input), 1);
+      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
+                              vget_high_s16(input), 2);
+      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
+                              vget_high_s16(input), 3);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 8; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 32;
+    }
+    // Handle one output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+      input_ptr += 4;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate
+      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
+      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
+      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 3> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // We will have to duplicate bytes in a NEON register, 3-fold.
+    // We will do that by register-level table-look-up using VTBL instructions.
+    // Here we prepare the registers containing the table-lookup indices.
+    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
+                                                   {2, 3, 3, 3, 4, 4, 4, 5},
+                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
+    uint8x8_t dup3_indices[3];
+    for (int i = 0; i < 3; i++) {
+      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
+    }
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const uint8* local_filter_ptr = filter_ptr;
+      const uint8* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters, add filter_offset.
+        int16x8_t filter[3];
+        uint8x8x3_t filter_u8;
+        filter_u8.val[0] = vld1_u8(local_filter_ptr);
+        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
+        local_filter_ptr += 24;
+        for (int i = 0; i < 3; i++) {
+          const int16x8_t filter_s16 =
+              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+        }
+        // Load the inputs, duplicate 3-fold, add input_offset.
+        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+        local_input_ptr += 8;
+
+        uint8x8_t input_u8_dup3[3];
+        for (int i = 0; i < 3; i++) {
+          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
+        }
+        int16x8_t input_dup3[3];
+        for (int i = 0; i < 3; i++) {
+          const int16x8_t input_s16_dup3 =
+              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
+          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
+        }
+        // Load the accumulators from acc_buffer
+        int32x4x3_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
+        }
+        // Multiply-accumulate
+        for (int j = 0; j < 3; j++) {
+          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
+                                    vget_low_s16(filter[j]));
+          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
+                                    vget_high_s16(filter[j]));
+        }
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
+        }
+        acc_buffer_ptr += 24;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const int16 input_val = *local_input_ptr++ + input_offset;
+        for (int i = 0; i < 3; i++) {
+          const int16 filter_val = local_filter_ptr[i] + filter_offset;
+          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+        }
+        local_filter_ptr += 3;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 2> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const uint8* local_filter_ptr = filter_ptr;
+      const uint8* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters, add filter_offset.
+        int16x8_t filter[2];
+        uint8x8x2_t filter_u8;
+        filter_u8.val[0] = vld1_u8(local_filter_ptr);
+        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
+        local_filter_ptr += 16;
+        for (int i = 0; i < 2; i++) {
+          const int16x8_t filter_s16 =
+              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
+          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+        }
+        // Load the inputs, add input_offset, duplicate 2-fold.
+        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+        local_input_ptr += 8;
+        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+        // Load the accumulators from acc_buffer.
+        int32x4x2_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+        }
+        // Multiply-accumulate.
+        for (int j = 0; j < 2; j++) {
+          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
+                                    vget_low_s16(input_dup2.val[j]));
+          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
+                                    vget_high_s16(input_dup2.val[j]));
+        }
+        // Store the accumulators back to acc_buffer.
+        for (int i = 0; i < 2; i++) {
+          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+        }
+        acc_buffer_ptr += 16;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        // Load the inputs.
+        const int16 input_val = *local_input_ptr++ + input_offset;
+        for (int i = 0; i < 2; i++) {
+          const int16 filter_val = local_filter_ptr[i] + filter_offset;
+          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+        }
+        local_filter_ptr += 2;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 0, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      const uint8* local_filter_ptr = filter_ptr;
+      const uint8* local_input_ptr = input_ptr;
+      int ic = 0;
+      // Handle 16 input channels at a time.
+      for (; ic <= input_depth - 16; ic += 16) {
+        // Load the filters, add filter_offset.
+        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
+        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
+        local_filter_ptr += 16;
+        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+        // Load the inputs, add input_offset.
+        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
+        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
+        local_input_ptr += 16;
+        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+        // Load the accumulators from acc_buffer
+        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
+        acc_1 =
+            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
+        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
+        acc_3 =
+            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
+        // Store the accumulators back to acc_buffer
+        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+        acc_buffer_ptr += 16;
+      }
+      // Handle 8 input channels at a time.
+      for (; ic <= input_depth - 8; ic += 8) {
+        // Load the filters, add filter_offset.
+        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
+        local_filter_ptr += 8;
+        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+        const int16x8_t filter =
+            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+        // Load the inputs, add input_offset.
+        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
+        local_input_ptr += 8;
+        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+        // Load the accumulators from acc_buffer
+        int32x4_t acc[2];
+        for (int i = 0; i < 2; i++) {
+          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+        }
+        // Multiply-accumulate
+        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+        // Store the accumulators back to acc_buffer
+        for (int i = 0; i < 2; i++) {
+          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+        }
+        acc_buffer_ptr += 8;
+      }
+      // Handle one input channel at a time.
+      for (; ic < input_depth; ic++) {
+        const int16 input_val = *local_input_ptr++ + input_offset;
+        const int16 filter_val = *local_filter_ptr++ + filter_offset;
+        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+      }
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 16, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8[2];
+    for (int i = 0; i < 2; i++) {
+      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+    }
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+    }
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8[2];
+      for (int i = 0; i < 2; i++) {
+        input_u8[i] = vld1_u8(input_ptr + 8 * i);
+      }
+      input_ptr += input_ptr_increment;
+      int16x8_t input[2];
+      for (int i = 0; i < 2; i++) {
+        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
+      }
+      for (int i = 0; i < 2; i++) {
+        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+      }
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
+                                   vget_low_s16(filter[i]));
+        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
+                                   vget_high_s16(filter[i]));
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 8, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
+    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs, add input_offset.
+      const uint8x8_t input_u8 = vld1_u8(input_ptr);
+      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
+      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+      input_ptr += input_ptr_increment;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 16> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8[2];
+    for (int i = 0; i < 2; i++) {
+      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
+    }
+    int16x8_t filter[2];
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
+    }
+    for (int i = 0; i < 2; i++) {
+      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
+    }
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[4];
+      for (int i = 0; i < 4; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      for (int i = 0; i < 2; i++) {
+        acc[2 * i + 0] =
+            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
+        acc[2 * i + 1] =
+            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
+      }
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 4; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 16;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 32> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
+    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
+    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
+    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
+    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
+    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
+    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
+    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
+    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
+    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+      acc_buffer_ptr += 32;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 1, 8> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
+    const int16x8_t filter = vaddq_s16(
+        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      uint8 input_u8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16 input = static_cast<int16>(input_u8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc[2];
+      for (int i = 0; i < 2; i++) {
+        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+      }
+      // Multiply-accumulate
+      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+      // Store the accumulators back to acc_buffer
+      for (int i = 0; i < 2; i++) {
+        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+      }
+      acc_buffer_ptr += 8;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 2, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+
+    // Handle 2 output pixels at a time.
+    for (; outp <= num_output_pixels - 2; outp += 2) {
+      // Load the accumulators from acc_buffer.
+      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint16x4_t input_u16 = vdup_n_u16(0);
+      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
+                                input_u16, 0);
+      input_ptr += input_ptr_increment;
+      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
+                                input_u16, 1);
+      input_ptr += input_ptr_increment;
+      const int16x4_t input_s16 = vreinterpret_s16_u16(
+          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer.
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+
+    // Handle 1 output pixel at a time.
+    for (; outp < num_output_pixels; outp++) {
+      // Load the accumulators from acc_buffer.
+      int32x2_t acc = vld1_s32(acc_buffer_ptr);
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vdup_n_u8(0);
+      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+      input_ptr += input_ptr_increment;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+      // Multiply-accumulate.
+      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+      // Store the accumulators back to acc_buffer.
+      vst1_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 2;
+    }
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<true, 4, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    if (num_output_pixels <= 0) {
+      return;
+    }
+
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8 = vdup_n_u8(0);
+    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
+    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
+    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
+    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
+    const int16x4_t filter_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
+    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
+
+    int outp = 0;
+
+    // Handle one output pixel at a time until second to the last pixel. Second
+    // to the last because we read eight input pixels while only processing
+    // four.
+    for (; outp < num_output_pixels - 1; outp++) {
+      // Load the accumulators from acc_buffer
+      int32x4_t acc;
+      acc = vld1q_s32(acc_buffer_ptr);
+
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8 = vld1_u8(input_ptr);
+      input_ptr += input_ptr_increment;
+      const int16x4_t input_s16 =
+          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+      // Multiply-accumulate
+      acc = vmlal_s16(acc, filter, input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr, acc);
+      acc_buffer_ptr += 4;
+    }
+
+    // Handle the last output pixel.
+    // Load the accumulators from acc_buffer
+    int32x4_t acc;
+    acc = vld1q_s32(acc_buffer_ptr);
+
+    // Load the inputs, add input_offset.
+    uint8x8_t input_u8 = vdup_n_u8(0);
+    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
+    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
+    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
+    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
+    const int16x4_t input_s16 =
+        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
+    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+    // Multiply-accumulate
+    acc = vmlal_s16(acc, filter, input);
+    // Store the accumulators back to acc_buffer
+    vst1q_s32(acc_buffer_ptr, acc);
+  }
+};
+
+template <>
+struct QuantizedDepthwiseConvKernel<false, 12, 1> {
+  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+                  const uint8* input_ptr, int16 input_offset,
+                  int input_ptr_increment, const uint8* filter_ptr,
+                  int16 filter_offset, int32* acc_buffer_ptr) {
+    // Load the filters, add filter_offset.
+    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
+    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
+    int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
+    int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
+    filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
+    filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
+    int16x4_t filter_0 = vget_low_s16(filter_s16_0);
+    int16x4_t filter_1 = vget_high_s16(filter_s16_0);
+    int16x4_t filter_2 = vget_high_s16(filter_s16_1);
+
+    // Handle one output pixel at a time.
+    for (int outp = 0; outp < num_output_pixels; outp++) {
+      // Load the inputs, add input_offset.
+      uint8x8_t input_u8_0 = vld1_u8(input_ptr);
+      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
+      input_ptr += input_ptr_increment;
+      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
+      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
+      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+
+      // Multiply-accumulate
+      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
+      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
+      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
+
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+
+      acc_buffer_ptr += 12;
+    }
+  }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void QuantizedDepthwiseConvAccumRow(
+    int stride, int input_depth, int input_width, const uint8* input_data,
+    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
+    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
+    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
+#ifdef GEMMLOWP_PROFILING
+  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
+#endif
+  // Sanity check parameters. This is important in particular to ensure
+  // that we keep the number of template instantiations minimal, so we don't
+  // increase binary size unnecessarily.
+  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+  static_assert(kFixedInputDepth || kAllowStrided, "");
+  TFLITE_DCHECK(stride == 1 || kAllowStrided);
+  if (kFixedInputDepth) {
+    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
+  }
+  if (kFixedDepthMultiplier) {
+    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
+  }
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  const int input_ptr_increment = stride * input_depth;
+  const uint8* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    // For the current (filter_x, filter_y) point in the filter,
+    // compute the boundaries of the corresponding output row segment.
+    int out_x_loop_start_unclampled = 0;
+    int out_x_loop_end_unclampled = 0;
+    if (kAllowStrided) {
+      if (stride == 2) {
+        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + 1) / 2;
+      } else if (stride == 4) {
+        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + 3) / 4;
+      } else {
+        out_x_loop_start_unclampled =
+            (pad_width - filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled =
+            (pad_width + input_width - filter_x + stride - 1) / stride;
+      }
+    } else {
+      out_x_loop_start_unclampled = pad_width - filter_x;
+      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
+    }
+    // The kernel will have to iterate on the segment of the
+    // output row that starts at out_x_loop_start and out_x_loop_end.
+    const int out_x_loop_start =
+        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
+
+    int32* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const uint8* input_ptr = input_data + in_x_origin * input_depth;
+    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+    QuantizedDepthwiseConvKernel<
+        kAllowStrided, kFixedInputDepth,
+        kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
+                                    depth_multiplier, input_ptr, input_offset,
+                                    input_ptr_increment, filter_base_ptr,
+                                    filter_offset, acc_buffer_ptr);
+    filter_base_ptr += output_depth;
+  }
+}
+
+// generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
+inline void QuantizedDepthwiseConvAccumRowGeneric(
+    int stride, int input_depth, int input_width, const uint8* input_data,
+    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
+    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
+    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
+#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+  LOG(FATAL)
+      << "\n\n"
+      << "*****************************************************************\n"
+      << "* This tfmini inference code was about to use the slow generic\n"
+      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
+      << "* to be aware of that so that you will know why you get terrible\n"
+      << "* performance.\n"
+      << "*\n"
+      << "* If you would like to carry on with the slow code, compile\n"
+      << "* with this preprocessor token defined:\n"
+      << "* TFLITE_ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
+      << "*\n"
+      << "* The right thing to do, if you care about performance, is to add\n"
+      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
+      << "* The relevant parameters defining your case are:\n"
+      << "* stride = " << stride << "\n"
+      << "* input_depth = " << input_depth << "\n"
+      << "* depth_multiplier = " << depth_multiplier << "\n"
+      << "*\n"
+      << "* Please do not hesitate to contact benoitjacob@ with this\n"
+      << "* information.\n"
+      << "*****************************************************************\n";
+#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+  const uint8* filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+    const int out_x_loop_start = std::max(
+        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+        std::min(out_x_buffer_end,
+                 (pad_width + input_width - filter_x + stride - 1) / stride);
+
+    int32* acc_buffer_ptr =
+        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const uint8* input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
+      const uint8* filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic) {
+        const int16 input_val = *input_ptr++ + input_offset;
+        for (int m = 0; m < depth_multiplier; m++) {
+          const int16 filter_val = *filter_ptr++ + filter_offset;
+          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const int32* bias_data,
+                                       int32* acc_buffer) {
+  int i = 0;
+#ifdef USE_NEON
+  if (output_depth == 1) {
+    const int32x4_t b = vdupq_n_s32(bias_data[0]);
+    for (; i <= num_output_pixels - 16; i += 16) {
+      vst1q_s32(acc_buffer + i + 0, b);
+      vst1q_s32(acc_buffer + i + 4, b);
+      vst1q_s32(acc_buffer + i + 8, b);
+      vst1q_s32(acc_buffer + i + 12, b);
+    }
+    for (; i <= num_output_pixels - 4; i += 4) {
+      vst1q_s32(acc_buffer + i, b);
+    }
+  } else if (output_depth == 2) {
+    int32x4_t b = vdupq_n_s32(bias_data[0]);
+    b = vsetq_lane_s32(bias_data[1], b, 1);
+    b = vsetq_lane_s32(bias_data[1], b, 3);
+    for (; i <= num_output_pixels - 8; i += 8) {
+      vst1q_s32(acc_buffer + 2 * i + 0, b);
+      vst1q_s32(acc_buffer + 2 * i + 4, b);
+      vst1q_s32(acc_buffer + 2 * i + 8, b);
+      vst1q_s32(acc_buffer + 2 * i + 12, b);
+    }
+    for (; i <= num_output_pixels - 2; i += 2) {
+      vst1q_s32(acc_buffer + 2 * i, b);
+    }
+  } else if (output_depth == 4) {
+    const int32x4_t b = vld1q_s32(bias_data);
+    for (; i <= num_output_pixels - 4; i += 4) {
+      vst1q_s32(acc_buffer + 4 * i + 0, b);
+      vst1q_s32(acc_buffer + 4 * i + 4, b);
+      vst1q_s32(acc_buffer + 4 * i + 8, b);
+      vst1q_s32(acc_buffer + 4 * i + 12, b);
+    }
+    for (; i < num_output_pixels; i++) {
+      vst1q_s32(acc_buffer + 4 * i, b);
+    }
+  } else if (output_depth == 8) {
+    const int32x4_t b0 = vld1q_s32(bias_data);
+    const int32x4_t b1 = vld1q_s32(bias_data + 4);
+    for (; i <= num_output_pixels - 2; i += 2) {
+      vst1q_s32(acc_buffer + 8 * i + 0, b0);
+      vst1q_s32(acc_buffer + 8 * i + 4, b1);
+      vst1q_s32(acc_buffer + 8 * i + 8, b0);
+      vst1q_s32(acc_buffer + 8 * i + 12, b1);
+    }
+    for (; i < num_output_pixels; i++) {
+      vst1q_s32(acc_buffer + 8 * i + 0, b0);
+      vst1q_s32(acc_buffer + 8 * i + 4, b1);
+    }
+  } else if (output_depth == 16) {
+    const int32x4_t b0 = vld1q_s32(bias_data);
+    const int32x4_t b1 = vld1q_s32(bias_data + 4);
+    const int32x4_t b2 = vld1q_s32(bias_data + 8);
+    const int32x4_t b3 = vld1q_s32(bias_data + 12);
+    for (; i < num_output_pixels; i++) {
+      vst1q_s32(acc_buffer + 16 * i + 0, b0);
+      vst1q_s32(acc_buffer + 16 * i + 4, b1);
+      vst1q_s32(acc_buffer + 16 * i + 8, b2);
+      vst1q_s32(acc_buffer + 16 * i + 12, b3);
+    }
+  }
+#endif
+  for (; i < num_output_pixels; i++) {
+    memcpy(acc_buffer + i * output_depth, bias_data,
+           sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                          int32 input_offset, const uint8* filter_data,
+                          const Dims<4>& filter_dims, int32 filter_offset,
+                          const int32* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_shift, int32 output_activation_min,
+                          int32 output_activation_max, uint8* output_data,
+                          const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+  static const int kAccBufferMaxSize = 2048;
+  int32 acc_buffer[kAccBufferMaxSize];
+  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
+                   kAccBufferActualSize);
+  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
+  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+  row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
+                                        FIXED_DEPTH_MULTIPLIER)           \
+  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
+      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
+    row_accum_func =                                                      \
+        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
+                                       FIXED_DEPTH_MULTIPLIER>;           \
+  }
+
+#ifdef USE_NEON
+  // We go over our list of kernels by decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+  // Next come the strided kernels: AllowStrided=true, fixed input depth.
+  // They are a bit less efficient, but allow stride!=1.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth,
+  // these are the least efficient but most general kernels.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif  // USE_NEON
+
+  // No matching fast kernel found, use slow fallback.
+  if (!row_accum_func) {
+    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+  }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+  // Now that we have determined row_accum_func, we can start work.
+  uint8* output_ptr = output_data;
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      const int in_y_origin = (out_y * stride_height) - pad_height;
+      const int filter_y_start = std::max(0, -in_y_origin);
+      const int filter_y_end =
+          std::min(filter_height, input_height - in_y_origin);
+      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+           out_x_buffer_start += kOutputPixelsInAccBuffer) {
+        const int out_x_buffer_end = std::min(
+            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activation that share all but the
+        // 'depth'/'channel' coordinate. num_output_pixels is the number of
+        // output pixels that we will accumulate in this loop iteration.
+        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+        // Initialize our local accumulator with the bias values, so we don't
+        // have to add them later.
+        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
+                                   acc_buffer);
+        // Accumulation loop. Most of the time should be spent in here.
+        for (int filter_y = filter_y_start; filter_y < filter_y_end;
+             ++filter_y) {
+          const int in_y = in_y_origin + filter_y;
+          row_accum_func(
+              stride_width, input_depth, input_width,
+              input_data + in_y * input_dims.strides[2] +
+                  b * input_dims.strides[3],
+              input_offset, pad_width, depth_multiplier, filter_width,
+              filter_data + filter_y * filter_dims.strides[2], filter_offset,
+              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
+        }
+        // Finished accumulating int32 values. Now need to convert them to
+        // the final 8bit form and store them.
+        gemmlowp::ScopedProfilingLabel label("downquantize+store");
+        const int num_output_values = output_depth * num_output_pixels;
+        int i = 0;
+#ifdef USE_NEON
+        using gemmlowp::RoundingDivideByPOT;
+        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+        const int32x4_t output_activation_min_vec =
+            vdupq_n_s32(output_activation_min);
+        const int32x4_t output_activation_max_vec =
+            vdupq_n_s32(output_activation_max);
+        // Handle 16 values at once.
+        // This allows us to issue 4 mutually independent int32
+        // multiplications (vqrdmulh), which should alleviate most of their
+        // high latency.
+        for (; i <= num_output_values - 16; i += 16) {
+          int32x4_t acc[4];
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
+          }
+
+          // Fixed-point multiplication.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
+          }
+          for (int j = 0; j < 4; j++) {
+            acc[j] = RoundingDivideByPOT(acc[j], output_shift);
+          }
+          // Add the output offset.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vaddq_s32(acc[j], output_offset_vec);
+          }
+          // Apply the activation function.
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
+          }
+          for (int j = 0; j < 4; j++) {
+            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
+          }
+          // Saturating cast to uint8 and store to destination.
+          int16x4_t acc_s16[4];
+          for (int j = 0; j < 4; j++) {
+            acc_s16[j] = vqmovn_s32(acc[j]);
+          }
+          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
+          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
+          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
+          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
+          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
+          output_ptr += 16;
+        }
+        // Handle 8 values at once.
+        // Not as good as 16 (now we're only issuing 2 mutually independent
+        // vqrdmulh instructions, so we're probably paying for their high
+        // latency).
+        for (; i <= num_output_values - 8; i += 8) {
+          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
+          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
+          // Fixed-point multiplication.
+          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
+          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
+          // Rounding right shift.
+          acc0 = RoundingDivideByPOT(acc0, output_shift);
+          acc1 = RoundingDivideByPOT(acc1, output_shift);
+          // Add the output offset.
+          acc0 = vaddq_s32(acc0, output_offset_vec);
+          acc1 = vaddq_s32(acc1, output_offset_vec);
+          // Apply the activation function.
+          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
+          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
+          acc0 = vminq_s32(acc0, output_activation_max_vec);
+          acc1 = vminq_s32(acc1, output_activation_max_vec);
+          // Saturating cast to uint8 and store to destination.
+          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
+          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
+          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
+          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+          vst1_u8(output_ptr, res_u8);
+          output_ptr += 8;
+        }
+        // Handle 4 values at once. Now we're paying the full price of the
+        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
+        // (without any alignment) can only be done 1 byte at a time.
+        // Yet, that is still worth doing to minimize the amount of leftover
+        // that will have to go through the very slow scalar code.
+        for (; i <= num_output_values - 4; i += 4) {
+          int32x4_t acc = vld1q_s32(acc_buffer + i);
+          // Fixed-point multiplication.
+          acc = vqrdmulhq_n_s32(acc, output_multiplier);
+          // Rounding right shift.
+          acc = RoundingDivideByPOT(acc, output_shift);
+          // Add the output offset.
+          acc = vaddq_s32(acc, output_offset_vec);
+          // Apply the activation function.
+          acc = vmaxq_s32(acc, output_activation_min_vec);
+          acc = vminq_s32(acc, output_activation_max_vec);
+          // Saturating cast to uint8 and store to destination.
+          const int16x4_t acc_s16 = vqmovn_s32(acc);
+          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
+          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
+          vst1_lane_u8(output_ptr + 0, res_u8, 0);
+          vst1_lane_u8(output_ptr + 1, res_u8, 1);
+          vst1_lane_u8(output_ptr + 2, res_u8, 2);
+          vst1_lane_u8(output_ptr + 3, res_u8, 3);
+          output_ptr += 4;
+        }
+#endif  // USE_NEON
+
+        // Handle leftover values, one by one. This is very slow.
+        for (; i < num_output_values; i++) {
+          int32 acc = acc_buffer[i];
+          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
+              acc, output_multiplier, output_shift);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          *output_ptr++ = static_cast<uint8>(acc);
+        }
+      }
+    }
+  }
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, int32 output_offset,
+                   int32 output_multiplier, int output_shift,
+                   int32 output_activation_min, int32 output_activation_max,
+                   uint8* output_data, const Dims<4>& output_dims) {
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+                filter_offset, bias_data, bias_dims, stride_width,
+                stride_height, pad_width, pad_height, depth_multiplier,
+                output_offset, output_multiplier, output_shift,
+                output_activation_min, output_activation_max, output_data,
+                output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   int32 output_offset, int32 output_multiplier,
+                   int output_shift, int32 output_activation_min,
+                   int32 output_activation_max, uint8* output_data,
+                   const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
+                    filter_dims, filter_offset, bias_data, bias_dims, stride,
+                    stride, pad_width, pad_height, depth_multiplier,
+                    output_offset, output_multiplier, output_shift,
+                    output_activation_min, output_activation_max, output_data,
+                    output_dims);
+}
+
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
new file mode 100644
index 0000000000000000000000000000000000000000..8004c24a9914e216974539930853d0aadf61e324
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h
@@ -0,0 +1,231 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Copied from tensorflow/core/kernels/eigen_spatial_convolutions.h.
+// TODO(petewarden) - move this to a common location in Eigen itself.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
+
+#define EIGEN_USE_CUSTOM_THREAD_POOL
+#define EIGEN_USE_THREADS
+
+// NOTE: Eigen is slightly different internally and externally. We need to
+// hack the unsupported/Eigen/CXX11/Tensor header instantiation macros at
+// specific places, so we need two copies of the hacked file, one for
+// internal and one for external.
+// If you have trouble simply undef out the reducer macro e.g.
+// TFLITE_REDUCE_INSTANTIATIONS_GOOGLE, but be aware this will make
+// the binary much bigger!
+#define TFLITE_REDUCE_INSTANTIATIONS_OPEN_SOURCE
+#define Eigen EigenForTFLite
+#if defined(TFLITE_REDUCE_INSTANTIATIONS_GOOGLE)
+#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h"
+#elif defined(TFLITE_REDUCE_INSTANTIATIONS_OPEN_SOURCE)
+#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h"
+#else
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#endif
+
+
+namespace Eigen {
+
+/** SpatialConvolution
+ * \ingroup CXX11_NeuralNetworks_Module
+ *
+ * \brief Applies a 2D convolution over a multichannel input image.
+ *
+ * The input parameter is expected to be a tensor with a rank of 3 or more
+ * (channels, height, width, and optionally others)
+ * The kernel parameter is expected to be a 4D tensor (filters, channels,
+ * kernel_height, kernel_width)
+ * The input and the kernel must both be in col-major layout. The result will
+ * also be in col-major layout.
+ *
+ * If col_in_stride, row_in_stride > 1, then applies convolution with holes
+ * (aka atrous convolution), sampling every col_in_stride, row_in_stride input
+ * pixels.
+ *
+ * The result can be assigned to a tensor of rank equal to the rank of the
+ * input. The dimensions of the result will be filters, height, width (and
+ * others if applicable).
+ *
+ * It is possible to swap the order of the width and height dimensions provided
+ * that the same order is used in the input, the kernel, and the output.
+ *
+ */
+template <typename Input, typename Kernel>
+EIGEN_DEVICE_FUNC
+    EIGEN_ALWAYS_INLINE static const typename internal::conditional<
+        internal::traits<Input>::Layout == ColMajor,
+        TensorReshapingOp<
+            const DSizes<typename internal::traits<Input>::Index,
+                         internal::traits<Input>::NumDimensions>,
+            const TensorContractionOp<
+                const array<IndexPair<typename internal::traits<Input>::Index>,
+                            1>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const Kernel>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorImagePatchOp<Dynamic, Dynamic,
+                                             const Input> > > >,
+        TensorReshapingOp<
+            const DSizes<typename internal::traits<Input>::Index,
+                         internal::traits<Input>::NumDimensions>,
+            const TensorContractionOp<
+                const array<IndexPair<typename internal::traits<Input>::Index>,
+                            1>,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const TensorImagePatchOp<Dynamic, Dynamic, const Input> >,
+                const TensorReshapingOp<
+                    const DSizes<typename internal::traits<Input>::Index, 2>,
+                    const Kernel> > > >::type
+    SpatialConvolution(const Input& input, const Kernel& kernel,
+                       const DenseIndex row_stride = 1,
+                       const DenseIndex col_stride = 1,
+                       const PaddingType padding_type = PADDING_SAME,
+                       const DenseIndex row_in_stride = 1,
+                       const DenseIndex col_in_stride = 1) {
+  typedef typename internal::traits<Input>::Index TensorIndex;
+  TensorRef<Tensor<typename internal::traits<Input>::Scalar,
+                   internal::traits<Input>::NumDimensions,
+                   internal::traits<Input>::Layout, TensorIndex> >
+      in(input);
+  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar,
+                   internal::traits<Kernel>::NumDimensions,
+                   internal::traits<Kernel>::Layout, TensorIndex> >
+      kern(kernel);
+
+  EIGEN_STATIC_ASSERT(
+      internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
+      YOU_MADE_A_PROGRAMMING_MISTAKE);
+  const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
+
+  const int NumDims = internal::traits<Input>::NumDimensions;
+
+  // Number of filters to apply. This is the same as the output depth of the
+  // result
+  const TensorIndex kernelFilters =
+      isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
+  // Number of channels. This is the same as the input depth.
+  const TensorIndex kernelChannels =
+      isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
+  const TensorIndex kernelRows =
+      isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
+  const TensorIndex kernelCols =
+      isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
+
+  const DenseIndex kernelRowsEff =
+      kernelRows + (kernelRows - 1) * (row_in_stride - 1);
+  const DenseIndex kernelColsEff =
+      kernelCols + (kernelCols - 1) * (col_in_stride - 1);
+
+  array<IndexPair<TensorIndex>, 1> contract_dims;
+  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
+
+  const TensorIndex InputRows =
+      isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
+  const TensorIndex InputCols =
+      isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
+
+  TensorIndex out_height;
+  TensorIndex out_width;
+  switch (padding_type) {
+    case PADDING_VALID:
+      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) /
+                                static_cast<float>(row_stride));
+      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) /
+                               static_cast<float>(col_stride));
+      break;
+    case PADDING_SAME:
+      out_height = numext::ceil(InputRows / static_cast<float>(row_stride));
+      out_width = numext::ceil(InputCols / static_cast<float>(col_stride));
+      break;
+    default:
+      // Initialize unused variables to avoid a compiler warning
+      out_height = 0;
+      out_width = 0;
+      eigen_assert(false && "unexpected padding");
+  }
+
+  // Molds the output of the patch extraction code into a 2d tensor:
+  // - the first dimension (dims[0]): the patch values to be multiplied with the
+  // kernels
+  // - the second dimension (dims[1]): everything else
+  DSizes<TensorIndex, 2> pre_contract_dims;
+  if (isColMajor) {
+    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[1] = out_height * out_width;
+    for (int i = 3; i < NumDims; ++i) {
+      pre_contract_dims[1] *= in.dimension(i);
+    }
+  } else {
+    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
+    pre_contract_dims[0] = out_height * out_width;
+    for (int i = 0; i < NumDims - 3; ++i) {
+      pre_contract_dims[0] *= in.dimension(i);
+    }
+  }
+
+  // Molds the output of the contraction into the shape expected by the used
+  // (assuming this is ColMajor):
+  // - 1st dim: kernel filters
+  // - 2nd dim: output height
+  // - 3rd dim: output width
+  // - 4th dim and beyond: everything else including batch size
+  DSizes<TensorIndex, NumDims> post_contract_dims;
+  if (isColMajor) {
+    post_contract_dims[0] = kernelFilters;
+    post_contract_dims[1] = out_height;
+    post_contract_dims[2] = out_width;
+    for (int i = 3; i < NumDims; ++i) {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  } else {
+    post_contract_dims[NumDims - 1] = kernelFilters;
+    post_contract_dims[NumDims - 2] = out_height;
+    post_contract_dims[NumDims - 3] = out_width;
+    for (int i = 0; i < NumDims - 3; ++i) {
+      post_contract_dims[i] = in.dimension(i);
+    }
+  }
+
+  DSizes<TensorIndex, 2> kernel_dims;
+  if (isColMajor) {
+    kernel_dims[0] = kernelFilters;
+    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
+  } else {
+    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
+    kernel_dims[1] = kernelFilters;
+  }
+  // TODO(yangke): choose() is defined in TensorContraction.h -- consider
+  // moving it to somewhere more "common".
+  return
+      input
+          .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+                                 row_in_stride, col_in_stride, padding_type)
+          .reshape(pre_contract_dims)
+          .contract(kernel.reshape(kernel_dims), contract_dims)
+          .reshape(post_contract_dims);
+}
+
+}  // end namespace Eigen
+
+// clang-format on
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_SPATIAL_CONVOLUTIONS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f78f69360b1ebbfb08600c8bc427f1ba9d5244d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_GOOGLE_H_
+
+#define EIGEN_USE_CUSTOM_THREAD_POOL
+#define EIGEN_USE_THREADS
+
+// clang-format off
+
+#include <stdint.h>
+
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+#include <random>
+#include <atomic>
+#include <condition_variable>  // NOLINT(build/c++11)
+#include <mutex>  // NOLINT(build/c++11)
+#include <thread>  // NOLINT(build/c++11)
+#include <functional>
+
+#ifdef _WIN32
+#include <winbase.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+
+// Because some programs may link Eigen in through other frameworks with
+// different flags, we can run into multiple definition issues if we don't have
+// a private namespace for our versions. This is a nasty hack, but a similar
+// approach is used elsewhere to handle the problem, so it should be stable.
+#define Eigen EigenForTFLite
+
+#include "Eigen/src/Core/util/StaticAssert.h"
+#include "unsupported/Eigen/CXX11/Core"
+#include "unsupported/Eigen/SpecialFunctions"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "Eigen/Core"
+
+// Beware: the order of the include matters to some compilers. For example
+// TensorIndexList.h should be included before TensorDimensions.h in order to
+// use index lists to encode tensor dimensions when compiling with llvm.
+// We're defining this ourselves rather than using the Eigen Tensor header file
+// so that we can alter the macro definition of TENSOR_CONTRACTION_DISPATCH to
+// reduce binary size.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/ThreadPoolInterface.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorNonBlockingThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStats.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMappers.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+#undef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS)    \
+  if (this->m_lhs_inner_dim_contiguous &&                       \
+      this->m_rhs_inner_dim_contiguous &&                       \
+      !this->m_rhs_inner_dim_reordered) {                       \
+    METHOD<true, true, false, ALIGNMENT> ARGS;                  \
+  } else {                                                      \
+    eigen_assert(false && "Unsupported contraction formats");   \
+  }
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_H
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d5c316194df0b87ee7eecbdd04bd5ce9e2e40b5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is essentially unsupported/CXX11/Eigen/Tensor.h
+// TODO(petewarden) - move this to a common location in Eigen itself.
+
+// clang-format off
+
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
+
+
+#include "Eigen/Core"
+
+#if defined(EIGEN_USE_SYCL)
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+
+
+
+
+
+#ifdef _WIN32
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#include <windows.h>
+#else
+#include <stdint.h>
+#include <unistd.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
+#include <random>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+// #if defined(EIGEN_USE_LIBXSMM)
+// #include "libxsmm.h"
+// #endif
+
+#ifdef EIGEN_USE_THREADS
+#include "unsupported/Eigen/CXX11/ThreadPool"
+#endif
+
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "unsupported/Eigen/SpecialFunctions"
+#include "unsupported/Eigen/CXX11/src/util/CXX11Meta.h"
+#include "unsupported/Eigen/CXX11/src/util/MaxSizeVector.h"
+
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h"
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
+
+#undef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS)    \
+  if (this->m_lhs_inner_dim_contiguous &&                       \
+      this->m_rhs_inner_dim_contiguous &&                       \
+      !this->m_rhs_inner_dim_reordered) {                       \
+    METHOD<true, true, false, ALIGNMENT> ARGS;                  \
+  } else {                                                      \
+    eigen_assert(false && "Unsupported contraction formats");   \
+  }
+
+
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorScan.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/Tensor.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorMap.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorRef.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorIO.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_EIGEN_TENSOR_REDUCED_INSTANTIATIONS_OSS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3615f4658a1a70284cc9d386a868a87aa09819b
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/multithreaded_conv.h
@@ -0,0 +1,195 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
+
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace multithreaded_ops {
+
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
+ public:
+  explicit EigenThreadPoolWrapper(Eigen::ThreadPool* pool) : pool_(pool) {}
+  ~EigenThreadPoolWrapper() override {}
+
+  void Schedule(std::function<void()> fn) override {
+    pool_->Schedule(std::move(fn));
+  }
+  int NumThreads() const override { return pool_->NumThreads(); }
+  int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
+
+ private:
+  Eigen::ThreadPool* pool_ = nullptr;
+};
+
+// We have a single global threadpool for all convolution operations. This means
+// that inferences started from different threads may block each other, but
+// since the underlying resource of CPU cores should be consumed by the
+// operations anyway, it shouldn't affect overall performance.
+const Eigen::ThreadPoolDevice& GetThreadPoolDevice() {
+  const int thread_count = 4;
+  static Eigen::ThreadPool* tp = new Eigen::ThreadPool(thread_count);
+  static EigenThreadPoolWrapper* thread_pool_wrapper =
+      new EigenThreadPoolWrapper(tp);
+  static Eigen::ThreadPoolDevice* device =
+      new Eigen::ThreadPoolDevice(thread_pool_wrapper, thread_count);
+  return *device;
+}
+
+// Shorthands for the types we need when interfacing with the EigenTensor
+// library.
+typedef Eigen::TensorMap<
+    Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned>
+    EigenMatrix;
+typedef Eigen::TensorMap<
+    Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
+    Eigen::Aligned>
+    ConstEigenMatrix;
+
+typedef Eigen::TensorMap<
+    Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned>
+    EigenTensor;
+typedef Eigen::TensorMap<
+    Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
+    Eigen::Aligned>
+    ConstEigenTensor;
+
+// Utility functions we need for the EigenTensor API.
+template <typename Device, typename T>
+struct MatMulConvFunctor {
+  // Computes on device "d": out = in0 * in1, where * is matrix
+  // multiplication.
+  void operator()(
+      const Device& d, EigenMatrix out, ConstEigenMatrix in0,
+      ConstEigenMatrix in1,
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair) {
+    out.device(d) = in0.contract(in1, dim_pair);
+  }
+};
+
+template <class T>
+class EigenTensorConvFunctor {
+ private:
+  Eigen::PaddingType TfLitePadding2EigenPadding(TfLitePadding padding) {
+    switch (padding) {
+      case kTfLitePaddingValid:
+        return Eigen::PADDING_VALID;
+      case kTfLitePaddingSame:
+        return Eigen::PADDING_SAME;
+      case kTfLitePaddingUnknown:
+        assert(false);  // should never get here.
+        return Eigen::PADDING_VALID;
+    }
+    return Eigen::PADDING_SAME;  // Prevent compiler warning about missing
+                                 // return
+  }
+
+ public:
+  void operator()(const T* input_data, T* im2col_buffer, int input_batches,
+                  int input_height, int input_width, int input_depth,
+                  const T* filter_data, int filter_height, int filter_width,
+                  int filter_count, int stride_rows, int stride_cols,
+                  int pad_width, int pad_height, TfLitePadding padding,
+                  T* output_data, int output_height, int output_width) {
+    const Eigen::ThreadPoolDevice& device = GetThreadPoolDevice();
+
+    const bool is_1x1_kernel = (filter_height == 1 && filter_width == 1 &&
+                                stride_rows == 1 && stride_cols == 1);
+    if (is_1x1_kernel) {
+      // For 1x1 kernel, the 2D convolution is reduced to matrix
+      // multiplication.
+      const int conv_width = output_height * output_width;
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      EigenMatrix output(output_data, conv_width, filter_count);
+      ConstEigenMatrix input(input_data, conv_width, input_depth);
+      ConstEigenMatrix filter(filter_data, input_depth, filter_count);
+      MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
+                                                      filter, dim_pair);
+    } else if (filter_height == input_height && filter_width == input_width &&
+               pad_width == 0 && pad_height == 0) {
+      // If the input data and filter have the same height/width,
+      // the 2D convolution is reduced to matrix multiplication.
+      const int k =  // Length of reduction dimension.
+          filter_width * filter_height * input_depth;
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      EigenMatrix output(output_data, 1, filter_count);
+      ConstEigenMatrix input(input_data, 1, k);
+      ConstEigenMatrix filter(filter_data, k, filter_count);
+      MatMulConvFunctor<Eigen::ThreadPoolDevice, T>()(device, output, input,
+                                                      filter, dim_pair);
+    } else {
+      EigenTensor output(output_data, input_batches, output_height,
+                         output_width, filter_count);
+      ConstEigenTensor input(input_data, input_batches, input_height,
+                             input_width, input_depth);
+      ConstEigenTensor filter(filter_data, filter_height, filter_width,
+                              input_depth, filter_count);
+      output.device(device) =
+          Eigen::SpatialConvolution(input, filter, stride_cols, stride_rows,
+                                    TfLitePadding2EigenPadding(padding));
+    }
+  }
+};
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, TfLitePadding padding,
+                 float output_activation_min, float output_activation_max,
+                 float* output_data, const Dims<4>& output_dims,
+                 float* im2col_data, const Dims<4>& im2col_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  EigenTensorConvFunctor<float> conv_functor;
+  conv_functor(input_data, im2col_data, batches, input_height, input_width,
+               input_depth, filter_data, filter_height, filter_width,
+               output_depth, stride_height, stride_width, pad_height, pad_width,
+               padding, output_data, output_height, output_width);
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      bias_data, bias_dims, output_data, output_dims, output_activation_min,
+      output_activation_max);
+}
+
+}  // namespace multithreaded_ops
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_MULTITHREAD_CONV
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf0bdfb1fb875c4b54c55e25d4a17541507ecd4c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -0,0 +1,337 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+
+#ifdef USE_NEON
+
+#include <arm_neon.h>
+#define kFloatWeightsPerNeonLane 4
+
+namespace tflite {
+namespace tensor_utils {
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                             int m_cols, const float* vector,
+                                             int n_batch, float* result,
+                                             int result_stride) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
+
+  // The arrays used to cache the vector.
+  float32x4_t* vector_cache_float32x4 =
+      new float32x4_t[(m_cols / kFloatWeightsPerNeonLane) *
+                      sizeof(float32x4_t)];
+  const int kUnrollSize = 2;
+  for (int b = 0; b < n_batch; b++) {
+    float* result_in_batch = result + b * m_rows * result_stride;
+    const float* vector_in_batch = vector + b * m_cols;
+
+    const float* matrix_ptr0 = matrix;
+    // If there is only 1 row, we don't want to assign an illegal pointer.
+    const float* matrix_ptr1 = nullptr;
+    if (m_rows > 1) {
+      matrix_ptr1 = matrix + m_cols;
+    }
+
+    // Cahce the vector.
+    for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+      vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c);
+    }
+
+    // Main matrix by vector multiplication loop, which handles two rows of
+    // matrix by vector multiplication.
+    for (int r = 0; r < (m_rows & ~(kUnrollSize - 1)); r += kUnrollSize) {
+      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
+      float32x4_t acc1_32x4 = vmovq_n_f32(0.0);
+      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+        float32x4_t temp = vector_cache_float32x4[c >> 2];
+        // Load 4 float values from vector1 and vector2 and accumulator.
+        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
+        float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr1 + c);
+        // Vector multiply-accumulate 4 float
+        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
+        acc1_32x4 = vmlaq_f32(acc1_32x4, v1_f32x4, temp);
+      }
+      // Add the 4 intermediate sum values to get the final dot-prod value for
+      // this column.
+      *result_in_batch +=
+          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
+           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
+      *(result_in_batch + result_stride) +=
+          (vgetq_lane_f32(acc1_32x4, 0) + vgetq_lane_f32(acc1_32x4, 1) +
+           vgetq_lane_f32(acc1_32x4, 2) + vgetq_lane_f32(acc1_32x4, 3));
+      for (int c = postamble_start; c < m_cols; c++) {
+        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
+        *(result_in_batch + result_stride) +=
+            matrix_ptr1[c] * vector_in_batch[c];
+      }
+      matrix_ptr0 += kUnrollSize * m_cols;
+      matrix_ptr1 += kUnrollSize * m_cols;
+      result_in_batch += kUnrollSize * result_stride;
+    }
+    for (int r = (m_rows & ~(kUnrollSize - 1)); r < m_rows; r++) {
+      float32x4_t acc0_32x4 = vmovq_n_f32(0.0);
+      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) {
+        float32x4_t temp = vector_cache_float32x4[c >> 2];
+        // Load 4 float values from vector1 and vector2 and accumulator.
+        float32x4_t v0_f32x4 = vld1q_f32(matrix_ptr0 + c);
+        // Vector multiply-accumulate 4 float
+        acc0_32x4 = vmlaq_f32(acc0_32x4, v0_f32x4, temp);
+      }
+      // Add the 4 intermediate sum values to get the final dot-prod value for
+      // this column.
+      *result_in_batch +=
+          (vgetq_lane_f32(acc0_32x4, 0) + vgetq_lane_f32(acc0_32x4, 1) +
+           vgetq_lane_f32(acc0_32x4, 2) + vgetq_lane_f32(acc0_32x4, 3));
+      for (int c = postamble_start; c < m_cols; c++) {
+        *result_in_batch += matrix_ptr0[c] * vector_in_batch[c];
+      }
+      matrix_ptr0 += m_cols;
+      result_in_batch += result_stride;
+    }
+  }
+  delete[] vector_cache_float32x4;
+}
+
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                                  int v_size, float* result) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from vector1 and vector2.
+    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+    // Vector multiply 4 float
+    float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4);
+    // Save to result array.
+    vst1q_f32(&result[v], mul_32x4);
+  }
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] = vector1[v] * vector2[v];
+  }
+}
+
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+                                            const float* vector2, int v_size,
+                                            float* result) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from vector1 and vector2 and accumulator.
+    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+    float32x4_t acc_32x4 = vld1q_f32(result + v);
+    // Vector multiply-accumulate 4 float
+    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
+    // Save to result array.
+    vst1q_f32(&result[v], acc_32x4);
+  }
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] += vector1[v] * vector2[v];
+  }
+}
+
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                 int v_size,
+                                                 const float* batch_vector,
+                                                 int n_batch, float* result) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  // The arrays used to cache the vector.
+  float32x4_t* vector_cache_float32x4 =
+      new float32x4_t[(v_size / kFloatWeightsPerNeonLane) *
+                      sizeof(float32x4_t)];
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v);
+  }
+
+  float* result_ptr = result;
+  const float* batch_vector_ptr = batch_vector;
+  for (int b = 0; b < n_batch; b++) {
+    for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+      // Load from memory to vectors.
+      float32x4_t result_f32x4 = vld1q_f32(result_ptr + v);
+      float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v);
+      // Multiply-accumulate.
+      result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4,
+                               vector_cache_float32x4[v >> 2]);
+      // Store.
+      vst1q_f32(result_ptr + v, result_f32x4);
+    }
+    // Postamble loop
+    for (int v = postamble_start; v < v_size; v++) {
+      result_ptr[v] += vector[v] * batch_vector_ptr[v];
+    }
+    // Update the pointers.
+    result_ptr += v_size;
+    batch_vector_ptr += v_size;
+  }
+  delete[] vector_cache_float32x4;
+}
+
+void NeonSub1Vector(const float* vector, int v_size, float* result) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  float32x4_t one_f32x4 = vmovq_n_f32(1.0);
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from the current pointers of the input column and
+    // subtract from 1.
+    float32x4_t v_f32x4 = vld1q_f32(vector + v);
+    float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
+    // Save to output.
+    vst1q_f32(result + v, result_f32x4);
+  }
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] = 1.0f - vector[v];
+  }
+}
+
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+                    float* result) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  // Replicate abs_limit and -abs_limit in two vectors.
+  const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit);
+  const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit);
+
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load from memory to vector.
+    float32x4_t v_f32x4 = vld1q_f32(vector + v);
+    // Clip between abs_limit and -abs_limit.
+    float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4);
+    result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4);
+    // Save to output.
+    vst1q_f32(result + v, result_f32x4);
+  }
+  // Postamble loop.
+  for (int v = postamble_start; v < v_size; v++) {
+    result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v];
+    result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v];
+  }
+}
+
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                 int v_size) {
+  // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop, and we need to process sequentially. postamble_start shows
+  // the start index where this should happen.
+  const int postamble_start =
+      v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+  float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) {
+    // Load 4 float values from vector1 and vector2 and accumulator.
+    float32x4_t v1_f32x4 = vld1q_f32(vector1 + v);
+    float32x4_t v2_f32x4 = vld1q_f32(vector2 + v);
+    // Vector multiply-accumulate 4 float
+    acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4);
+  }
+
+  float result = (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+                  vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+  // Postamble loop.
+  for (int v = postamble_start; v < v_size; v++) {
+    result += vector1[v] * vector2[v];
+  }
+  return result;
+}
+
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+                                          const float* vector2, int v_size,
+                                          int n_batch, float* result,
+                                          int result_stride) {
+  float* result_ptr = result;
+  const float* vector1_ptr = vector1;
+  const float* vector2_ptr = vector2;
+  for (int b = 0; b < n_batch; b++) {
+    *result_ptr = NeonVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
+    vector1_ptr += v_size;
+    vector2_ptr += v_size;
+    result_ptr += result_stride;
+  }
+}
+
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+                            int output_size, int reduction_size) {
+  const float* input_vector_ptr = input_vector;
+  for (int o = 0; o < output_size; o++) {
+    // If reduction_size is not divisible by kWeightsPerNeonLane, we cannot use
+    // the main vectorized loop, and we need to process sequentially.
+    // postamble_start shows the start index where this should happen.
+    const int postamble_start =
+        reduction_size - (reduction_size & (kFloatWeightsPerNeonLane - 1));
+    float32x4_t sum_f32x4 = vmovq_n_f32(0.0);
+    for (int r = 0; r < postamble_start; r += kFloatWeightsPerNeonLane) {
+      float32x4_t v1_f32x4 = vld1q_f32(input_vector_ptr + r);
+      sum_f32x4 = vaddq_f32(sum_f32x4, v1_f32x4);
+    }
+    output_vector[o] +=
+        (vgetq_lane_f32(sum_f32x4, 0) + vgetq_lane_f32(sum_f32x4, 1) +
+         vgetq_lane_f32(sum_f32x4, 2) + vgetq_lane_f32(sum_f32x4, 3));
+    input_vector_ptr += postamble_start;
+
+    // Postamble loop.
+    for (int r = postamble_start; r < reduction_size; r++) {
+      output_vector[o] += *input_vector_ptr++;
+    }
+  }
+}
+
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value) {
+  // This variable keeps track of the next to the last index which is being
+  // copied to make sure we are not out of the vector boundary.
+  int last_index_copy = kFloatWeightsPerNeonLane;
+  int current_index_copy = 0;
+  while (last_index_copy < v_size) {
+    float32x4_t v_f32x4 = vld1q_f32(vector + current_index_copy + 1);
+    vst1q_f32(vector + current_index_copy, v_f32x4);
+    current_index_copy += kFloatWeightsPerNeonLane;
+    last_index_copy += kFloatWeightsPerNeonLane;
+  }
+  // Postamble loop.
+  for (int i = current_index_copy; i < v_size - 1; i++) {
+    vector[i] = vector[i + 1];
+  }
+  vector[v_size - 1] = shift_value;
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a4af87304eaf33489b38bd9b15ad9789e091d24
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -0,0 +1,113 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
+
+// TODO(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/cpu_check.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result,
+                                         int result_stride) {
+  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
+                   vector, n_batch, result, result_stride);
+}
+
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                              int v_size, float* result) {
+  NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2, int v_size,
+                                        float* result) {
+  NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size,
+                   result);
+}
+
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+                                             const float* batch_vector,
+                                             int n_batch, float* result) {
+  NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
+                   batch_vector, n_batch, result);
+}
+
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+                             int v_size) {
+  return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
+}
+
+void BatchVectorBatchVectorDotProduct(const float* vector1,
+                                      const float* vector2, int v_size,
+                                      int n_batch, float* result,
+                                      int result_stride) {
+  NEON_OR_PORTABLE(BatchVectorBatchVectorDotProduct, vector1, vector2, v_size,
+                   n_batch, result, result_stride);
+}
+
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+                             float* batch_vector) {
+  PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
+}
+
+void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
+  PortableApplySigmoidToVector(vector, v_size, result);
+}
+
+void ApplyActivationToVector(const float* vector, int v_size,
+                             TfLiteFusedActivation activation, float* result) {
+  PortableApplyActivationToVector(vector, v_size, activation, result);
+}
+
+void CopyVector(const float* vector, int v_size, float* result) {
+  PortableCopyVector(vector, v_size, result);
+}
+
+void Sub1Vector(const float* vector, int v_size, float* result) {
+  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
+}
+
+void ZeroVector(float* vector, int v_size) {
+  PortableZeroVector(vector, v_size);
+}
+
+float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
+
+void ClipVector(const float* vector, int v_size, float abs_limit,
+                float* result) {
+  NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result);
+}
+
+void VectorShiftLeft(float* vector, int v_size, float shift_value) {
+  NEON_OR_PORTABLE(VectorShiftLeft, vector, v_size, shift_value);
+}
+
+void ReductionSumVector(const float* input_vector, float* output_vector,
+                        int output_size, int reduction_size) {
+  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
+                   reduction_size);
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd565c16a1ee7226f83c19f0020beed75e401497
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -0,0 +1,3715 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
+
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <tuple>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+
+// Make a local VectorMap typedef allowing to map a float array
+// as a Eigen vector expression. The std::conditional here is to
+// construct the suitable Eigen type for the constness of the
+// data. Indeed, for const data, we need to produce
+//    Eigen::Map<const Eigen::Matrix<float, ...>>
+// and not the more straightforward
+//    Eigen::Map<Eigen::Matrix<const float, ...>>
+template <typename Scalar>
+using VectorMap = typename std::conditional<
+    std::is_const<Scalar>::value,
+    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
+                                   Eigen::Dynamic, 1>>,
+    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;
+
+template <typename Scalar, int N>
+VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
+  const int size = RequiredBufferSizeForDims(dims);
+  return VectorMap<Scalar>(data, size, 1);
+}
+
+// Make a local VectorMap typedef allowing to map a float array
+// as a Eigen matrix expression. The same explanation as for VectorMap
+// above also applies here.
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+    std::is_const<Scalar>::value,
+    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
+                                   Eigen::Dynamic, Eigen::Dynamic>>,
+    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
+                                                const Dims<N>& dims) {
+  const int rows = dims.sizes[0];
+  int cols = 1;
+  for (int d = 1; d < N; d++) {
+    cols *= dims.sizes[d];
+  }
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
+                                               const Dims<N>& dims) {
+  const int cols = dims.sizes[N - 1];
+  int rows = 1;
+  for (int d = 0; d < N - 1; d++) {
+    rows *= dims.sizes[d];
+  }
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar>
+using ArrayMap = typename std::conditional<
+    std::is_const<Scalar>::value,
+    Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type,
+                                  Eigen::Dynamic, Eigen::Dynamic>>,
+    Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar, int N>
+ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data,
+                                              const Dims<N>& dims) {
+  const int rows = dims.sizes[0];
+  int cols = 1;
+  for (int d = 1; d < N; d++) {
+    cols *= dims.sizes[d];
+  }
+  return ArrayMap<Scalar>(data, rows, cols);
+}
+
+// TODO(b/62193649): this function is only needed as long
+// as we have the --variable_batch hack.
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
+                                                   const Dims<N>& dims,
+                                                   int rows) {
+  int cols = 1;
+  bool matched_rows = false;
+  for (int d = 0; d < N; d++) {
+    cols *= dims.sizes[d];
+    if (cols == rows) {
+      matched_rows = true;
+      cols = 1;
+    }
+  }
+  TFLITE_DCHECK(matched_rows);
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+  // The "extent" of each dimension. Indices along dimension d must be in the
+  // half-open interval [0, extents[d]).
+  int extents[N];
+
+  // The number of *elements* (not bytes) between consecutive indices of each
+  // dimension.
+  int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// ELEMENT-WISE BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+                            int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+         i3 * desc.strides[3];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+//   both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+//   array0 to be referenced *at any index* in dimension d and still access the
+//   same slice.
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+                                                const Dims<N>& input1_dims,
+                                                NdArrayDesc<N>* desc0_out,
+                                                NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  // Copy dims to desc.
+  for (int i = 0; i < N; ++i) {
+    desc0_out->extents[i] = input0_dims.sizes[i];
+    desc0_out->strides[i] = input0_dims.strides[i];
+    desc1_out->extents[i] = input1_dims.sizes[i];
+    desc1_out->strides[i] = input1_dims.strides[i];
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = ArraySize(input0_dims, i);
+    const int extent1 = ArraySize(input1_dims, i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
+  for (int i = 0; i < 4; i++) {
+    if (dims1.sizes[i] != dims2.sizes[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void AddBiasAndEvalActivationFunction(const float* bias_data,
+                                             const Dims<4>& bias_dims,
+                                             float* array_data,
+                                             const Dims<4>& array_dims,
+                                             float output_activation_min,
+                                             float output_activation_max) {
+#ifdef USE_NEON
+  gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
+  const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
+  const int array_size = array_dims.sizes[3] * array_dims.strides[3];
+  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
+  float* array_ptr = array_data;
+  float* array_end_ptr = array_ptr + array_size;
+  const auto activation_min = vdupq_n_f32(output_activation_min);
+  const auto activation_max = vdupq_n_f32(output_activation_max);
+  for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
+    int i = 0;
+    for (; i <= bias_size - 16; i += 16) {
+      auto b0 = vld1q_f32(bias_data + i);
+      auto b1 = vld1q_f32(bias_data + i + 4);
+      auto b2 = vld1q_f32(bias_data + i + 8);
+      auto b3 = vld1q_f32(bias_data + i + 12);
+      auto a0 = vld1q_f32(array_ptr + i);
+      auto a1 = vld1q_f32(array_ptr + i + 4);
+      auto a2 = vld1q_f32(array_ptr + i + 8);
+      auto a3 = vld1q_f32(array_ptr + i + 12);
+      auto x0 = vaddq_f32(a0, b0);
+      auto x1 = vaddq_f32(a1, b1);
+      auto x2 = vaddq_f32(a2, b2);
+      auto x3 = vaddq_f32(a3, b3);
+      x0 = vmaxq_f32(activation_min, x0);
+      x1 = vmaxq_f32(activation_min, x1);
+      x2 = vmaxq_f32(activation_min, x2);
+      x3 = vmaxq_f32(activation_min, x3);
+      x0 = vminq_f32(activation_max, x0);
+      x1 = vminq_f32(activation_max, x1);
+      x2 = vminq_f32(activation_max, x2);
+      x3 = vminq_f32(activation_max, x3);
+      vst1q_f32(array_ptr + i, x0);
+      vst1q_f32(array_ptr + i + 4, x1);
+      vst1q_f32(array_ptr + i + 8, x2);
+      vst1q_f32(array_ptr + i + 12, x3);
+    }
+    for (; i <= bias_size - 4; i += 4) {
+      auto b = vld1q_f32(bias_data + i);
+      auto a = vld1q_f32(array_ptr + i);
+      auto x = vaddq_f32(a, b);
+      x = vmaxq_f32(activation_min, x);
+      x = vminq_f32(activation_max, x);
+      vst1q_f32(array_ptr + i, x);
+    }
+    for (; i < bias_size; i++) {
+      array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i],
+                                                  output_activation_min,
+                                                  output_activation_max);
+    }
+  }
+#else  // not NEON
+  gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
+  const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
+  const int array_size = array_dims.sizes[3] * array_dims.strides[3];
+  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
+  for (int array_offset = 0; array_offset < array_size;
+       array_offset += bias_size) {
+    for (int i = 0; i < bias_size; i++) {
+      array_data[array_offset + i] = ActivationFunctionWithMinMax(
+          array_data[array_offset + i] + bias_data[i], output_activation_min,
+          output_activation_max);
+    }
+  }
+#endif
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AddBiasAndEvalActivationFunction(const float* bias_data,
+                                      const Dims<4>& bias_dims,
+                                      float* array_data,
+                                      const Dims<4>& array_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data, array_dims,
+                                   output_activation_min,
+                                   output_activation_max);
+}
+
+template <typename Lhs, typename Rhs, typename Result>
+void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
+          Eigen::MatrixBase<Result>* result) {
+  if (rhs.cols() == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    result->col(0).noalias() = lhs * rhs.col(0);
+  } else {
+    gemmlowp::ScopedProfilingLabel label("GEMM");
+    result->noalias() = lhs * rhs;
+  }
+}
+
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                           const float* weights_data,
+                           const Dims<4>& weights_dims, const float* bias_data,
+                           const Dims<4>& bias_dims,
+                           float output_activation_min,
+                           float output_activation_max, float* output_data,
+                           const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected");
+  // TODO(b/62193649): this convoluted shape computation (determining
+  // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
+  // is because the current --variable_batch hack consists in overwriting the
+  // 3rd dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  // When that is fixed, this should become:
+  // const auto input_matrix_map =
+  //     MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  const int input_rows = ArraySize(weights_dims, 0);
+  const auto input_matrix_map =
+      MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows);
+  const auto filter_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
+                                   output_dims, output_activation_min,
+                                   output_activation_max);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                    const float* weights_data, const Dims<4>& weights_dims,
+                    const float* bias_data, const Dims<4>& bias_dims,
+                    float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+                 bias_dims, output_activation_min, output_activation_max,
+                 output_data, output_dims);
+}
+
+inline void preload_l1_stream(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
+}
+
+#ifdef USE_NEON
+inline void FullyConnectedAsGEMV(
+    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
+    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, uint8* output_data,
+    const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                       ArraySize(output_dims, 3),
+                   1);
+  const int input_size = input_dims.strides[3];
+  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  static constexpr int kPeel = 4;
+  for (int k = 0; k < input_size; k += 64) {
+    preload_l1_stream(input_data + k);
+  }
+  for (int k = 0; k < kPeel * input_size; k += 64) {
+    preload_l1_stream(filter_data + k);
+  }
+  TFLITE_DCHECK(!(output_size % kPeel));
+  const int32* bias_ptr = bias_data;
+  uint8* output_ptr = output_data;
+  for (int out = 0; out < output_size; out += kPeel) {
+    int32x4_t acc[kPeel];
+    for (int k = 0; k < kPeel; k++) {
+      acc[k] = vdupq_n_s32(0);
+    }
+    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
+    const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
+    int in = 0;
+    for (; in <= input_size - 16; in += 16) {
+      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
+      uint8x16_t filter_val_u8[kPeel];
+      for (int k = 0; k < kPeel; k++) {
+        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
+        filter_val_u8[k] = vld1q_u8(filter_ptr);
+        preload_l1_stream(filter_ptr + 64);
+      }
+      int16x8_t input_val[2];
+      const uint8x8_t low = vget_low_u8(input_val_u8);
+      const uint8x8_t high = vget_high_u8(input_val_u8);
+      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
+      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
+      int16x8_t filter_val[kPeel][2];
+      for (int k = 0; k < kPeel; k++) {
+        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
+        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
+        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
+        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
+        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
+        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
+      }
+      for (int p = 0; p < 2; p++) {
+        for (int k = 0; k < kPeel; k++) {
+          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
+                             vget_low_s16(input_val[p]));
+        }
+        for (int k = 0; k < kPeel; k++) {
+          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
+                             vget_high_s16(input_val[p]));
+        }
+      }
+    }
+    for (; in <= input_size - 8; in += 8) {
+      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
+      uint8x8_t filter_val_u8[kPeel];
+      for (int k = 0; k < kPeel; k++) {
+        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
+        filter_val_u8[k] = vld1_u8(filter_ptr);
+      }
+      int16x8_t input_val;
+      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+      input_val = vaddq_s16(input_val, input_offset_vec);
+      int16x8_t filter_val[kPeel];
+      for (int k = 0; k < kPeel; k++) {
+        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
+        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
+      }
+      for (int k = 0; k < kPeel; k++) {
+        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
+                           vget_low_s16(input_val));
+      }
+      for (int k = 0; k < kPeel; k++) {
+        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
+                           vget_high_s16(input_val));
+      }
+    }
+    if (in < input_size) {
+      int32 buf[4 * kPeel];
+      for (int k = 0; k < 4; k++) {
+        vst1q_s32(buf + 4 * k, acc[k]);
+      }
+      for (; in < input_size; in++) {
+        int lane = (in + 8 - input_size) % 4;
+        const int32 input_val = input_data[in] + input_offset;
+        for (int k = 0; k < kPeel; k++) {
+          int32 filter_val =
+              filter_data[in + (out + k) * input_size] + filter_offset;
+          buf[lane + 4 * k] += filter_val * input_val;
+        }
+      }
+      for (int k = 0; k < 4; k++) {
+        acc[k] = vld1q_s32(buf + 4 * k);
+      }
+    }
+
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc[kPeel];
+    for (int k = 0; k < kPeel; k++) {
+      pairwise_reduced_acc[k] =
+          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
+    }
+    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
+    reduced = vaddq_s32(reduced, bias_vec);
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, output_shift);
+    // Add the output offset.
+    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
+    reduced = vaddq_s32(reduced, output_offset_vec);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    // Narrow values down to 8 bit unsigned, saturating.
+    uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
+    // Apply the clamping from the activation function
+    res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
+    res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
+    // Store results to destination. Assumes 32bit alignment.
+    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
+                  vreinterpret_u32_u8(res8), 0);
+    output_ptr += kPeel;
+  }
+}
+#endif  // USE_NEON
+
+struct GemmlowpOutputPipeline {
+  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<
+      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+      gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
+      Pipeline;
+  static Pipeline Make(const int32* bias_data, int output_rows,
+                       int32 output_offset, int32 output_multiplier,
+                       int output_shift, int32 output_activation_min,
+                       int32 output_activation_max) {
+    ColVectorMap bias_vector(bias_data, output_rows);
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+        quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_offset;
+    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
+    quantize_down_stage.result_shift = output_shift;
+    gemmlowp::OutputStageClamp clamp_stage;
+    clamp_stage.min = output_activation_min;
+    clamp_stage.max = output_activation_max;
+    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           clamp_stage, saturating_cast_stage);
+  }
+};
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+#ifdef USE_NEON
+  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  if (batches == 1 && !(output_size % 4)) {
+    return FullyConnectedAsGEMV(
+        input_data, input_dims, input_offset, filter_data, filter_dims,
+        filter_offset, bias_data, bias_dims, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max, output_data,
+        output_dims);
+  }
+#endif  // USE_NEON
+  const int filter_rows = filter_dims.sizes[1];
+  const int filter_cols = filter_dims.sizes[0];
+  TFLITE_DCHECK_EQ(filter_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(filter_dims.sizes[3], 1);
+  const int output_rows = output_dims.sizes[0];
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, output_rows, filter_cols, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, filter_cols, batches, filter_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, batches, output_rows);
+  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_offset, const uint8* filter_data,
+                    const Dims<4>& filter_dims, int32 filter_offset,
+                    const int32* bias_data, const Dims<4>& bias_dims,
+                    int32 output_offset, int32 output_multiplier,
+                    int output_shift, int32 output_activation_min,
+                    int32 output_activation_max, uint8* output_data,
+                    const Dims<4>& output_dims,
+                    gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+                 filter_offset, bias_data, bias_dims, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data, output_dims, gemm_context);
+}
+
+template <typename T>
+inline void ExtractPatchIntoBufferColumn(
+    const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
+    int stride_width, int stride_height, int pad_width, int pad_height,
+    int in_width, int in_height, int in_depth, int single_buffer_length,
+    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) {
+  gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
+  // This chunk of code reshapes all the inputs corresponding to
+  // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
+  const int kwidth_times_indepth = kwidth * in_depth;
+  const int inwidth_times_indepth = in_width * in_depth;
+  const int ih_ungated_start = h * stride_height - pad_height;
+  const int ih_ungated_end = (ih_ungated_start + kheight);
+  const int ih_end = std::min(ih_ungated_end, in_height);
+  const int iw_ungated_start = w * stride_width - pad_width;
+  const int iw_ungated_end = (iw_ungated_start + kwidth);
+  const int iw_end = std::min(iw_ungated_end, in_width);
+  // If the patch is off the edge of the input image, skip writing those rows
+  // and columns from the patch into the output array.
+  const int h_offset = std::max(0, -ih_ungated_start);
+  const int w_offset = std::max(0, -iw_ungated_start);
+  const int ih_start = std::max(0, ih_ungated_start);
+  const int iw_start = std::max(0, iw_ungated_start);
+  const int single_row_num =
+      std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
+  const int output_row_offset = (buffer_id * single_buffer_length);
+  int out_offset =
+      output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
+  int in_offset = Offset(input_dims, 0, iw_start, ih_start, b);
+
+  // Express all of the calculations as padding around the input patch.
+  const int top_padding = h_offset;
+  const int bottom_padding = (ih_ungated_end - ih_end);
+  const int left_padding = w_offset;
+  const int right_padding = (iw_ungated_end - iw_end);
+  assert(single_row_num ==
+         ((kwidth - (left_padding + right_padding)) * in_depth));
+
+  // Write out zeroes to the elements representing the top rows of the input
+  // patch that are off the edge of the input image.
+  if (top_padding > 0) {
+    const int top_row_elements = (top_padding * kwidth * in_depth);
+    memset(conv_buffer_data + output_row_offset, byte_zero,
+           (top_row_elements * sizeof(T)));
+  }
+
+  // If the patch is on the interior of the input image horizontally, just copy
+  // over the rows sequentially, otherwise add zero padding at the start or end.
+  if ((left_padding == 0) && (right_padding == 0)) {
+    for (int ih = ih_start; ih < ih_end; ++ih) {
+      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+             single_row_num * sizeof(T));
+      out_offset += kwidth_times_indepth;
+      in_offset += inwidth_times_indepth;
+    }
+  } else {
+    for (int ih = ih_start; ih < ih_end; ++ih) {
+      if (left_padding > 0) {
+        const int left_start = (out_offset - (left_padding * in_depth));
+        memset(conv_buffer_data + left_start, byte_zero,
+               (left_padding * in_depth * sizeof(T)));
+      }
+      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
+             single_row_num * sizeof(T));
+      if (right_padding > 0) {
+        const int right_start = (out_offset + single_row_num);
+        memset(conv_buffer_data + right_start, byte_zero,
+               (right_padding * in_depth * sizeof(T)));
+      }
+      out_offset += kwidth_times_indepth;
+      in_offset += inwidth_times_indepth;
+    }
+  }
+
+  // If the bottom of the patch falls off the input image, pad the values
+  // representing those input rows with zeroes.
+  if (bottom_padding > 0) {
+    const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
+    const int bottom_start =
+        output_row_offset +
+        ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+    memset(conv_buffer_data + bottom_start, byte_zero,
+           (bottom_row_elements * sizeof(T)));
+  }
+}
+
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
+            int stride_height, int pad_width, int pad_height, int kheight,
+            int kwidth, uint8 byte_zero, T* output_data,
+            const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Im2col");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_height = ArraySize(input_dims, 2);
+  const int output_depth = ArraySize(output_dims, 0);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+
+  int buffer_id = 0;
+  // Loop over the output nodes.
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < output_height; ++h) {
+      for (int w = 0; w < output_width; ++w) {
+        ExtractPatchIntoBufferColumn(
+            input_dims, w, h, b, kheight, kwidth, stride_width, stride_height,
+            pad_width, pad_height, input_width, input_height, input_depth,
+            output_depth, buffer_id, input_data, output_data, byte_zero);
+        ++buffer_id;
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int kheight, int kwidth,
+            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
+  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+         kwidth, byte_zero, output_data, output_dims);
+}
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, float output_activation_min,
+                 float output_activation_max, float* output_data,
+                 const Dims<4>& output_dims, float* im2col_data,
+                 const Dims<4>& im2col_dims) {
+  (void)im2col_data;
+  (void)im2col_dims;
+  gemmlowp::ScopedProfilingLabel label("Conv");
+
+  const float* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
+           pad_height, filter_height, filter_width, 0, im2col_data,
+           im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    // TODO(aselle): We need to make sure to not send im2col if it is not
+    // needed.
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  const auto im2col_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims);
+  const auto filter_matrix_map =
+      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
+
+  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
+                                   output_dims, output_activation_min,
+                                   output_activation_max);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, pad_width, pad_height,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+           bias_dims, stride, stride, pad_width, pad_height, output_data,
+           output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("Conv/8bit");
+
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  const uint8* gemm_input_data = nullptr;
+  const Dims<4>* gemm_input_dims = nullptr;
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
+                           filter_width != 1 || filter_height != 1;
+  if (need_im2col) {
+    TFLITE_DCHECK(im2col_data);
+    const int input_zero_point = -input_offset;
+    TFLITE_DCHECK_GE(input_zero_point, 0);
+    TFLITE_DCHECK_LE(input_zero_point, 255);
+    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
+           pad_height, filter_height, filter_width, input_zero_point,
+           im2col_data, im2col_dims);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else {
+    TFLITE_DCHECK(!im2col_data);
+    gemm_input_data = input_data;
+    gemm_input_dims = &input_dims;
+  }
+
+  const int gemm_input_rows = gemm_input_dims->sizes[0];
+  const int gemm_input_cols = gemm_input_dims->sizes[1] *
+                              gemm_input_dims->sizes[2] *
+                              gemm_input_dims->sizes[3];
+  const int filter_rows = filter_dims.sizes[3];
+  const int filter_cols =
+      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+  const int output_rows = output_dims.sizes[0];
+  const int output_cols =
+      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, filter_rows, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      gemm_input_data, gemm_input_rows, gemm_input_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols);
+  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+          int32 input_offset, const uint8* filter_data,
+          const Dims<4>& filter_dims, int32 filter_offset,
+          const int32* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, int32 output_offset,
+          int32 output_multiplier, int output_shift,
+          int32 output_activation_min, int32 output_activation_max,
+          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+       pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+template <typename T>
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("DepthToSpace");
+
+  const int input_depth = ArraySize(input_dims, 0);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_height = ArraySize(input_dims, 2);
+
+  const int output_depth = ArraySize(output_dims, 0);
+  const int batch_size = ArraySize(output_dims, 3);
+
+  // Number of continuous values that we can copy in one interation.
+  const int stride = block_size * output_depth;
+
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
+      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+        const T* src = input_ptr;
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          memcpy(output_data, src, stride * sizeof(T));
+          output_data += stride;
+          src += input_depth;
+        }
+        input_ptr += stride;
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int kheight, int kwidth,
+            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
+  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight,
+         kwidth, byte_zero, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
+                const float* filter_data, const Dims<4>& filter_dims,
+                const float* bias_data, const Dims<4>& bias_dims,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("ConvAsGemm");
+
+  const auto input_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  const auto filter_matrix_map =
+      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
+  auto output_matrix_map =
+      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
+
+  AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
+                                       output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
+                int32 input_offset, const uint8* filter_data,
+                const Dims<4>& filter_dims, int32 filter_offset,
+                const int32* bias_data, const Dims<4>& bias_dims,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims,
+                gemmlowp::GemmContext* gemm_context) {
+  gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  const int input_rows = input_dims.sizes[0];
+  const int input_cols =
+      input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
+  const int filter_rows = filter_dims.sizes[3];
+  const int filter_cols =
+      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
+  const int output_rows = output_dims.sizes[0];
+  const int output_cols =
+      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
+  TFLITE_DCHECK_EQ(output_rows, filter_rows);
+  TFLITE_DCHECK_EQ(output_cols, input_cols);
+  TFLITE_DCHECK_EQ(filter_cols, input_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
+  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
+      filter_data, output_rows, filter_cols, filter_cols);
+  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
+      input_data, filter_cols, output_cols, filter_cols);
+  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
+      output_data, output_rows, output_cols, output_rows);
+  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
+      bias_data, output_rows, output_offset, output_multiplier, output_shift,
+      output_activation_min, output_activation_max);
+  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
+                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
+      input_offset, output_pipeline);
+}
+
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("SpaceToDepth");
+
+  const int output_depth = ArraySize(output_dims, 0);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+
+  const int input_depth = ArraySize(input_dims, 0);
+  const int batch_size = ArraySize(input_dims, 3);
+
+  // Number of continuous values that we can copy in one interation.
+  const int stride = block_size * input_depth;
+
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
+      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
+        T* dst = output_ptr;
+        for (int out_w = 0; out_w < output_width; ++out_w) {
+          memcpy(dst, input_data, stride * sizeof(T));
+          input_data += stride;
+          dst += output_depth;
+        }
+        output_ptr += stride;
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void NonGlobalBatchNormalization(
+    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
+    const Dims<4>& mean_dims, const float* multiplier_data,
+    const Dims<4>& multiplier_dims, const float* offset_data,
+    const Dims<4>& offset_dims, float* output_data,
+    const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
+                        offset_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
+                        offset_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+                        offset_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              (input_data[Offset(input_dims, c, x, y, b)] -
+               mean_data[Offset(mean_dims, c, x, y, 0)]) *
+                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
+              offset_data[Offset(offset_dims, c, x, y, 0)]);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void GlobalBatchNormalization(const float* input_data,
+                              const Dims<4>& input_dims, const float* mean_data,
+                              const Dims<4>& mean_dims,
+                              const float* multiplier_data,
+                              const Dims<4>& multiplier_dims,
+                              const float* offset_data,
+                              const Dims<4>& offset_dims, float* output_data,
+                              const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+                        offset_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              (input_data[Offset(input_dims, c, x, y, b)] -
+               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
+                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
+              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
+        }
+      }
+    }
+  }
+}
+
+inline void Relu(const float* input_data, const Dims<4>& input_dims,
+                 float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Relu (not fused)");
+
+  const auto input = MapAsVector(input_data, input_dims);
+  auto output = MapAsVector(output_data, output_dims);
+  output = input.cwiseMax(0.0f);
+}
+
+inline void Relu1(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float upper = 1;
+          const float lower = -1;
+          float clamped = val > upper ? upper : val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
+        }
+      }
+    }
+  }
+}
+
+inline void Relu6(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float upper = 6;
+          const float lower = 0;
+          float clamped = val > upper ? upper : val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void L2Normalization(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("L2Normalization");
+  static_assert(Ac == FusedActivationFunctionType::kNone, "");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        float squared_l2_norm = 0;
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          squared_l2_norm += val * val;
+        }
+        float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
+        }
+      }
+    }
+  }
+}
+
+inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
+                                          int* output_shift) {
+  *output_shift = 11;
+  while (input >= (1 << 29)) {
+    input /= 4;
+    ++*output_shift;
+  }
+  TFLITE_DCHECK_GT(input, 0);
+  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
+  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+  *output_shift -= left_shift_bit_pairs;
+  input <<= 2 * left_shift_bit_pairs;
+  TFLITE_DCHECK_GE(input, (1 << 27));
+  TFLITE_DCHECK_LT(input, (1 << 29));
+  using gemmlowp::FixedPoint;
+  using gemmlowp::Rescale;
+  using gemmlowp::SaturatingRoundingMultiplyByPOT;
+  // Using 3 integer bits gives us enough room for the internal arithmetic in
+  // this Newton-Raphson iteration.
+  using F3 = FixedPoint<int32, 3>;
+  using F0 = FixedPoint<int32, 0>;
+  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+  const F3 fixedpoint_half_input =
+      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+  const F3 fixedpoint_half_three =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+  // Newton-Raphson iteration
+  // Naive unoptimized starting guess: x = 1
+  F3 x = F3::One();
+  // Naive unoptimized number of iterations: 5
+  for (int i = 0; i < 5; i++) {
+    const F3 x3 = Rescale<3>(x * x * x);
+    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+  }
+  const F0 fixedpoint_half_sqrt_2 =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+  x = x * fixedpoint_half_sqrt_2;
+  *output_inv_sqrt = x.raw();
+  if (*output_shift < 0) {
+    *output_inv_sqrt <<= -*output_shift;
+    *output_shift = 0;
+  }
+}
+
+inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
+                            int32 input_zero_point, uint8* output_data,
+                            const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(batches, 1);
+  TFLITE_DCHECK_EQ(height, 1);
+  TFLITE_DCHECK_EQ(width, 1);
+  int32 square_l2_norm = 0;
+  for (int i = 0; i < depth; i++) {
+    int32 diff = input_data[i] - input_zero_point;
+    square_l2_norm += diff * diff;
+  }
+  int32 inv_l2norm_multiplier;
+  int inv_l2norm_shift;
+  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
+                                &inv_l2norm_shift);
+
+  for (int i = 0; i < depth; i++) {
+    int32 diff = input_data[i] - input_zero_point;
+    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
+        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+    int32 unclamped_output_val = 128 + rescaled_diff;
+    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
+    output_data[i] = static_cast<uint8>(output_val);
+  }
+}
+
+inline void Add(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add");
+  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+                                              output_dims, 3);
+  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+                                             output_dims, 2);
+  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+                                            output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+                                            output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  int i = 0;
+  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+#ifdef USE_NEON
+  const auto activation_min = vdupq_n_f32(output_activation_min);
+  const auto activation_max = vdupq_n_f32(output_activation_max);
+  for (; i <= size - 16; i += 16) {
+    auto a10 = vld1q_f32(input1_data + i);
+    auto a11 = vld1q_f32(input1_data + i + 4);
+    auto a12 = vld1q_f32(input1_data + i + 8);
+    auto a13 = vld1q_f32(input1_data + i + 12);
+    auto a20 = vld1q_f32(input2_data + i);
+    auto a21 = vld1q_f32(input2_data + i + 4);
+    auto a22 = vld1q_f32(input2_data + i + 8);
+    auto a23 = vld1q_f32(input2_data + i + 12);
+    auto x0 = vaddq_f32(a10, a20);
+    auto x1 = vaddq_f32(a11, a21);
+    auto x2 = vaddq_f32(a12, a22);
+    auto x3 = vaddq_f32(a13, a23);
+    x0 = vmaxq_f32(activation_min, x0);
+    x1 = vmaxq_f32(activation_min, x1);
+    x2 = vmaxq_f32(activation_min, x2);
+    x3 = vmaxq_f32(activation_min, x3);
+    x0 = vminq_f32(activation_max, x0);
+    x1 = vminq_f32(activation_max, x1);
+    x2 = vminq_f32(activation_max, x2);
+    x3 = vminq_f32(activation_max, x3);
+    vst1q_f32(output_data + i, x0);
+    vst1q_f32(output_data + i + 4, x1);
+    vst1q_f32(output_data + i + 8, x2);
+    vst1q_f32(output_data + i + 12, x3);
+  }
+  for (; i <= size - 4; i += 4) {
+    auto a1 = vld1q_f32(input1_data + i);
+    auto a2 = vld1q_f32(input2_data + i);
+    auto x = vaddq_f32(a1, a2);
+    x = vmaxq_f32(activation_min, x);
+    x = vminq_f32(activation_max, x);
+    vst1q_f32(output_data + i, x);
+  }
+#endif  // NEON
+
+  for (; i < size; i++) {
+    auto x = input1_data[i] + input2_data[i];
+    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
+                                                  output_activation_max);
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+                const Dims<4>& input1_dims, int32 input1_offset,
+                int32 input1_multiplier, int input1_shift,
+                const uint8* input2_data, const Dims<4>& input2_dims,
+                int32 input2_offset, int32 input2_multiplier, int input2_shift,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  gemmlowp::ScopedProfilingLabel label("Add/8bit");
+  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+                                              output_dims, 3);
+  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+                                             output_dims, 2);
+  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+                                            output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+                                            output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  int i = 0;
+  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+  TFLITE_DCHECK_GT(input1_offset, -256);
+  TFLITE_DCHECK_GT(input2_offset, -256);
+  TFLITE_DCHECK_LT(input1_offset, 256);
+  TFLITE_DCHECK_LT(input2_offset, 256);
+#ifdef USE_NEON
+  for (; i <= size - 8; i += 8) {
+    const auto input1_val_original = vld1_u8(input1_data + i);
+    const auto input2_val_original = vld1_u8(input2_data + i);
+    const auto input1_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+    const auto input2_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+    const auto input1_val =
+        vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
+    const auto input2_val =
+        vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
+    const auto input1_val_high = vget_high_s16(input1_val);
+    const auto input1_val_low = vget_low_s16(input1_val);
+    const auto input2_val_high = vget_high_s16(input2_val);
+    const auto input2_val_low = vget_low_s16(input2_val);
+    auto x11 = vmovl_s16(input1_val_low);
+    auto x12 = vmovl_s16(input1_val_high);
+    auto x21 = vmovl_s16(input2_val_low);
+    auto x22 = vmovl_s16(input2_val_high);
+    const auto left_shift_dup = vdupq_n_s32(left_shift);
+    x11 = vshlq_s32(x11, left_shift_dup);
+    x12 = vshlq_s32(x12, left_shift_dup);
+    x21 = vshlq_s32(x21, left_shift_dup);
+    x22 = vshlq_s32(x22, left_shift_dup);
+    x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
+    x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
+    x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
+    x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
+    const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
+    const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
+    x11 = vshlq_s32(x11, input1_shift_dup);
+    x12 = vshlq_s32(x12, input1_shift_dup);
+    x21 = vshlq_s32(x21, input2_shift_dup);
+    x22 = vshlq_s32(x22, input2_shift_dup);
+    auto s1 = vaddq_s32(x11, x21);
+    auto s2 = vaddq_s32(x12, x22);
+    s1 = vqrdmulhq_n_s32(s1, output_multiplier);
+    s2 = vqrdmulhq_n_s32(s2, output_multiplier);
+    using gemmlowp::RoundingDivideByPOT;
+    s1 = RoundingDivideByPOT(s1, output_shift);
+    s2 = RoundingDivideByPOT(s2, output_shift);
+    const auto s1_narrowed = vmovn_s32(s1);
+    const auto s2_narrowed = vmovn_s32(s2);
+    const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+                             vdupq_n_s16(output_offset));
+    vst1_u8(output_data + i, vqmovun_s16(s));
+  }
+#endif  // NEON
+
+  for (; i < size; i++) {
+    const int32 input1_val = input1_offset + input1_data[i];
+    const int32 input2_val = input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << left_shift);
+    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+        shifted_input1_val, input1_multiplier, input1_shift);
+    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
+        shifted_input2_val, input2_multiplier, input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
+                                 raw_sum, output_multiplier, output_shift) +
+                             output_offset;
+    const int32 clamped_output = std::min(
+        output_activation_max, std::max(output_activation_min, raw_output));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void Add(const int32* input1_data, const Dims<4>& input1_dims,
+         const int32* input2_data, const Dims<4>& input2_dims,
+         int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add/int32");
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto input2_map = MapAsVector(input2_data, input2_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  if (AreSameDims(input1_dims, input2_dims)) {
+    output_map.array() = input1_map.array() + input2_map.array();
+  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() + scalar;
+  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar + input2_map.array();
+  } else {
+    // Should not come here.
+    TFLITE_DCHECK(false);
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sum, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
+               input1_multiplier, input1_shift, input2_data, input2_dims,
+               input2_offset, input2_multiplier, input2_shift, output_offset,
+               output_multiplier, output_shift, output_activation_min,
+               output_activation_max, output_data, output_dims);
+}
+
+inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul");
+  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
+                                              output_dims, 3);
+  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
+                                             output_dims, 2);
+  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
+                                            output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
+                                            output_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+
+  int i = 0;
+  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
+#ifdef USE_NEON
+  const auto activation_min = vdupq_n_f32(output_activation_min);
+  const auto activation_max = vdupq_n_f32(output_activation_max);
+  for (; i <= size - 16; i += 16) {
+    auto a10 = vld1q_f32(input1_data + i);
+    auto a11 = vld1q_f32(input1_data + i + 4);
+    auto a12 = vld1q_f32(input1_data + i + 8);
+    auto a13 = vld1q_f32(input1_data + i + 12);
+    auto a20 = vld1q_f32(input2_data + i);
+    auto a21 = vld1q_f32(input2_data + i + 4);
+    auto a22 = vld1q_f32(input2_data + i + 8);
+    auto a23 = vld1q_f32(input2_data + i + 12);
+    auto x0 = vmulq_f32(a10, a20);
+    auto x1 = vmulq_f32(a11, a21);
+    auto x2 = vmulq_f32(a12, a22);
+    auto x3 = vmulq_f32(a13, a23);
+
+    x0 = vmaxq_f32(activation_min, x0);
+    x1 = vmaxq_f32(activation_min, x1);
+    x2 = vmaxq_f32(activation_min, x2);
+    x3 = vmaxq_f32(activation_min, x3);
+    x0 = vminq_f32(activation_max, x0);
+    x1 = vminq_f32(activation_max, x1);
+    x2 = vminq_f32(activation_max, x2);
+    x3 = vminq_f32(activation_max, x3);
+
+    vst1q_f32(output_data + i, x0);
+    vst1q_f32(output_data + i + 4, x1);
+    vst1q_f32(output_data + i + 8, x2);
+    vst1q_f32(output_data + i + 12, x3);
+  }
+  for (; i <= size - 4; i += 4) {
+    auto a1 = vld1q_f32(input1_data + i);
+    auto a2 = vld1q_f32(input2_data + i);
+    auto x = vmulq_f32(a1, a2);
+
+    x = vmaxq_f32(activation_min, x);
+    x = vminq_f32(activation_max, x);
+
+    vst1q_f32(output_data + i, x);
+  }
+#endif  // NEON
+
+  for (; i < size; i++) {
+    auto x = input1_data[i] * input2_data[i];
+    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
+                                                  output_activation_max);
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Mul(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+void Mul(const int32* input1_data, const Dims<4>& input1_dims,
+         const int32* input2_data, const Dims<4>& input2_dims,
+         int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mul/int32");
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto input2_map = MapAsVector(input2_data, input2_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  if (AreSameDims(input1_dims, input2_dims)) {
+    output_map.array() = input1_map.array() * input2_map.array();
+  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() * scalar;
+  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar * input2_map.array();
+  } else {
+    // Should not come here.
+    TFLITE_DCHECK(false);
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastMul is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 unclamped_result =
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  input1_val * input2_val, output_multiplier, output_shift);
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, unclamped_result));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void Concatenation(int concat_dim, const Scalar* const* input_data,
+                   const Dims<4>* const* input_dims, int inputs_count,
+                   Scalar* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  // for now we dont have a model with a Concatenation
+  // with fused activation function.
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  Scalar* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      memcpy(output_ptr, input_data[i] + k * copy_size,
+             copy_size * sizeof(Scalar));
+      output_ptr += copy_size;
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void DepthConcatenation(const Scalar* const* input_data,
+                        const Dims<4>* const* input_dims, int inputs_count,
+                        Scalar* output_data, const Dims<4>& output_dims) {
+  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
+                            output_data, output_dims);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+                     const float* prev_activ_data,
+                     const Dims<4>& prev_activ_dims, const float* weights_data,
+                     const Dims<4>& weights_dims, const float* bias_data,
+                     const Dims<4>& bias_dims, const float* prev_state_data,
+                     const Dims<4>& prev_state_dims, float* output_state_data,
+                     const Dims<4>& output_state_dims, float* output_activ_data,
+                     const Dims<4>& output_activ_dims, float* concat_temp_data,
+                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
+                     const Dims<4>& activ_temp_dims) {
+  gemmlowp::ScopedProfilingLabel label("LstmCell");
+  MatchingArraySize(  // batches
+      input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims,
+      3, output_activ_dims, 3);
+  MatchingArraySize(  // height
+      input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims,
+      2, output_activ_dims, 2);
+  MatchingArraySize(  // width
+      input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims,
+      1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+
+  // Concatenate prev_activ and input data together
+  std::vector<float const*> concat_input_arrays_data;
+  std::vector<Dims<4> const*> concat_input_arrays_dims;
+  concat_input_arrays_data.push_back(input_data);
+  concat_input_arrays_data.push_back(prev_activ_data);
+  concat_input_arrays_dims.push_back(&input_dims);
+  concat_input_arrays_dims.push_back(&prev_activ_dims);
+  Concatenation<FusedActivationFunctionType::kNone, float>(
+      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
+      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
+
+  // Fully connected
+  FullyConnected<FusedActivationFunctionType::kNone>(
+      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
+      bias_dims, activ_temp_data, activ_temp_dims);
+
+  // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
+  // operations.
+  ArrayMap<float> activ_temp_map =
+      MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims);
+  auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
+                                            activ_temp_map.cols());
+  auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
+                                           activ_temp_map.cols());
+  auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
+                                             activ_temp_map.cols());
+  auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
+                                             activ_temp_map.cols());
+  ArrayMap<const float> prev_state_map =
+      MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims);
+  ArrayMap<float> output_state_map =
+      MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims);
+  ArrayMap<float> output_activ_map =
+      MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims);
+
+  // Combined memory state and final output calculation
+  gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
+  output_state_map =
+      input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+          new_input_sm.tanh() +
+      forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+          prev_state_map;
+  output_activ_map =
+      output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
+      output_state_map.tanh();
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
+                     int outputs_count, Scalar* const* output_data,
+                     const Dims<4>* const* output_dims) {
+  gemmlowp::ScopedProfilingLabel label("TensorFlowSplit");
+  TFLITE_DCHECK_GE(outputs_count, 1);
+  for (int i = 0; i < outputs_count; i++) {
+    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
+    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
+    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
+  }
+  const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
+  const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
+  const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  // for now we dont have a model with a TensorFlowSplit
+  // with fused activation function.
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+  const int whb = width * height * batches;
+  const Scalar* input_ptr = input_data;
+  for (int k = 0; k < whb; k++) {
+    for (int i = 0; i < outputs_count; ++i) {
+      memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr,
+             output_dims[i]->sizes[0] * sizeof(Scalar));
+      input_ptr += output_dims[i]->sizes[0];
+    }
+  }
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width) {
+  return (b * height + h) * width + w;
+}
+
+inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
+                        int stride_width, int stride_height, int pad_width,
+                        int pad_height, int kwidth, int kheight,
+                        float output_activation_min,
+                        float output_activation_max, float* output_data,
+                        const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("AveragePool");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  // TODO(benoitjacob) make this a proper reference impl without Eigen!
+  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+  // TODO(benoitjacob) get rid of the dynamic memory allocation here!
+  Eigen::VectorXf out_count(out_mat.cols());
+  out_count.setZero();
+  // Prefill the output to 0.
+  out_mat.setZero();
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < input_height; ++h) {
+      for (int w = 0; w < input_width; ++w) {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + pad_height;
+        int wpad = w + pad_width;
+        int h_start =
+            (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise sum
+        for (int ph = h_start; ph < h_end; ++ph) {
+          for (int pw = w_start; pw < w_end; ++pw) {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) +=
+                in_mat.col(NodeOffset(b, h, w, input_height, input_width));
+            out_count(out_offset)++;
+          }
+        }
+      }
+    }
+  }
+  // Divide the output by the actual number of elements being averaged over
+  TFLITE_DCHECK_GT(out_count.minCoeff(), 0);
+  out_mat.array().rowwise() /= out_count.transpose().array();
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      for (int x = 0; x < output_width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  output_data[Offset(output_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int kwidth, int kheight, float* output_data,
+                 const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+              pad_height, kwidth, kheight, output_activation_min,
+              output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
+                 int pad_width, int pad_height, int filter_width,
+                 int filter_height, float* output_data,
+                 const Dims<4>& output_dims) {
+  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+                  filter_width, filter_height, output_data, output_dims);
+}
+
+inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+                        int stride_width, int stride_height, int pad_width,
+                        int pad_height, int filter_width, int filter_height,
+                        int32 output_activation_min,
+                        int32 output_activation_max, uint8* output_data,
+                        const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        const int filter_x_start = std::max(0, -in_x_origin);
+        const int filter_x_end =
+            std::min(filter_width, input_width - in_x_origin);
+        const int filter_y_start = std::max(0, -in_y_origin);
+        const int filter_y_end =
+            std::min(filter_height, input_height - in_y_origin);
+        const int filter_count =
+            (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+        // 1280 required by Inception v3
+        static constexpr int kAccBufferMaxSize = 2048;
+        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
+        uint16 acc[kAccBufferMaxSize];
+        memset(acc, 0, depth * sizeof(acc[0]));
+        const uint8* input_ptr =
+            input_data + input_dims.strides[1] * in_x_origin +
+            input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch;
+        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+          const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
+                                       filter_x_start * input_dims.strides[1];
+          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+            int channel = 0;
+#ifdef USE_NEON
+            for (; channel <= depth - 16; channel += 16) {
+              uint16x8_t acc_reg[2];
+              for (int i = 0; i < 2; i++) {
+                acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+              }
+              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
+              input_row_ptr += 16;
+              acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
+              acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
+              for (int i = 0; i < 2; i++) {
+                vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+              }
+            }
+            for (; channel <= depth - 8; channel += 8) {
+              uint16x8_t acc_reg = vld1q_u16(acc + channel);
+              uint8x8_t input_reg = vld1_u8(input_row_ptr);
+              input_row_ptr += 8;
+              acc_reg = vaddw_u8(acc_reg, input_reg);
+              vst1q_u16(acc + channel, acc_reg);
+            }
+#endif
+            for (; channel < depth; ++channel) {
+              acc[channel] += *input_row_ptr++;
+            }
+          }
+        }
+        uint8* output_ptr =
+            output_data + Offset(output_dims, 0, out_x, out_y, batch);
+        int channel = 0;
+#ifdef USE_NEON
+#define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                              \
+  if (filter_count == FILTER_COUNT) {                                  \
+    for (; channel <= depth - 8; channel += 8) {                       \
+      uint16 buf[8];                                                   \
+      for (int i = 0; i < 8; i++) {                                    \
+        buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
+      }                                                                \
+      uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));                     \
+      buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));          \
+      buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));          \
+      vst1_u8(output_ptr + channel, buf8);                             \
+    }                                                                  \
+  }
+        AVGPOOL_DIVIDING_BY(9)
+        AVGPOOL_DIVIDING_BY(15)
+#undef AVGPOOL_DIVIDING_BY
+        for (; channel <= depth - 8; channel += 8) {
+          uint16 buf[8];
+          for (int i = 0; i < 8; i++) {
+            buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+          }
+          uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+          buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));
+          buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));
+          vst1_u8(output_ptr + channel, buf8);
+        }
+#endif
+        for (; channel < depth; ++channel) {
+          uint16 a = (acc[channel] + filter_count / 2) / filter_count;
+          a = std::max<uint16>(a, output_activation_min);
+          a = std::min<uint16>(a, output_activation_max);
+          output_ptr[channel] = static_cast<uint8>(a);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int filter_width, int filter_height,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+              pad_height, filter_width, filter_height, output_activation_min,
+              output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+                 int pad_width, int pad_height, int filter_width,
+                 int filter_height, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims) {
+  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+                  filter_width, filter_height, output_activation_min,
+                  output_activation_max, output_data, output_dims);
+}
+
+inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
+                    int stride_width, int stride_height, int pad_width,
+                    int pad_height, int kwidth, int kheight,
+                    float output_activation_min, float output_activation_max,
+                    float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("MaxPool");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+  // Prefill the output to minimum representable float value
+  out_mat.setConstant(std::numeric_limits<float>::lowest());
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < input_height; ++h) {
+      for (int w = 0; w < input_width; ++w) {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + pad_height;
+        int wpad = w + pad_width;
+        int h_start =
+            (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise sum
+        for (int ph = h_start; ph < h_end; ++ph) {
+          for (int pw = w_start; pw < w_end; ++pw) {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) =
+                out_mat.col(out_offset)
+                    .cwiseMax(in_mat.col(
+                        NodeOffset(b, h, w, input_height, input_width)));
+          }
+        }
+      }
+    }
+  }
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      for (int x = 0; x < output_width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  output_data[Offset(output_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims,
+             int stride_width, int stride_height, int pad_width, int pad_height,
+             int kwidth, int kheight, float* output_data,
+             const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+          pad_height, kwidth, kheight, output_activation_min,
+          output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
+             int pad_width, int pad_height, int filter_width, int filter_height,
+             float* output_data, const Dims<4>& output_dims) {
+  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+              filter_width, filter_height, output_data, output_dims);
+}
+
+inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+                    int stride_width, int stride_height, int pad_width,
+                    int pad_height, int filter_width, int filter_height,
+                    int32 output_activation_min, int32 output_activation_max,
+                    uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        const int filter_x_start = std::max(0, -in_x_origin);
+        const int filter_x_end =
+            std::min(filter_width, input_width - in_x_origin);
+        const int filter_y_start = std::max(0, -in_y_origin);
+        const int filter_y_end =
+            std::min(filter_height, input_height - in_y_origin);
+        // 2048 required by Inception v3
+        static constexpr int kAccBufferMaxSize = 2048;
+        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
+        uint8 acc[kAccBufferMaxSize];
+        memset(acc, 0, depth * sizeof(acc[0]));
+        const uint8* input_ptr =
+            input_data + input_dims.strides[1] * in_x_origin +
+            input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch;
+        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+          const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
+                                       filter_x_start * input_dims.strides[1];
+          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+            int channel = 0;
+#ifdef USE_NEON
+            for (; channel <= depth - 16; channel += 16) {
+              uint8x16_t acc_reg = vld1q_u8(acc + channel);
+              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
+              input_row_ptr += 16;
+              acc_reg = vmaxq_u8(acc_reg, input_reg);
+              vst1q_u8(acc + channel, acc_reg);
+            }
+
+            for (; channel <= depth - 8; channel += 8) {
+              uint8x8_t acc_reg = vld1_u8(acc + channel);
+              uint8x8_t input_reg = vld1_u8(input_row_ptr);
+              input_row_ptr += 8;
+              acc_reg = vmax_u8(acc_reg, input_reg);
+              vst1_u8(acc + channel, acc_reg);
+            }
+#endif
+            for (; channel < depth; ++channel) {
+              acc[channel] = std::max(acc[channel], *input_row_ptr++);
+            }
+          }
+        }
+        uint8* output_ptr =
+            output_data + Offset(output_dims, 0, out_x, out_y, batch);
+        int channel = 0;
+#ifdef USE_NEON
+        for (; channel <= depth - 16; channel += 16) {
+          uint8x16_t a = vld1q_u8(acc + channel);
+          a = vminq_u8(a, vdupq_n_u8(output_activation_max));
+          a = vmaxq_u8(a, vdupq_n_u8(output_activation_min));
+          vst1q_u8(output_ptr + channel, a);
+        }
+        for (; channel <= depth - 8; channel += 8) {
+          uint8x8_t a = vld1_u8(acc + channel);
+          a = vmin_u8(a, vdup_n_u8(output_activation_max));
+          a = vmax_u8(a, vdup_n_u8(output_activation_min));
+          vst1_u8(output_ptr + channel, a);
+        }
+#endif
+        for (; channel < depth; ++channel) {
+          uint8 a = acc[channel];
+          a = std::max<uint8>(a, output_activation_min);
+          a = std::min<uint8>(a, output_activation_max);
+          output_ptr[channel] = static_cast<uint8>(a);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+             int stride_width, int stride_height, int pad_width, int pad_height,
+             int filter_width, int filter_height, int32 output_activation_min,
+             int32 output_activation_max, uint8* output_data,
+             const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+          pad_height, filter_width, filter_height, output_activation_min,
+          output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+             int pad_width, int pad_height, int filter_width, int filter_height,
+             int32 output_activation_min, int32 output_activation_max,
+             uint8* output_data, const Dims<4>& output_dims) {
+  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+              filter_width, filter_height, output_activation_min,
+              output_activation_max, output_data, output_dims);
+}
+
+inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int filter_width, int filter_height,
+                   float output_activation_min, float output_activation_max,
+                   float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("L2Pool");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  // Actually carry out L2 Pool. Code is written in forward mode: we go through
+  // the input values once, and write to all the pooled regions that it maps to.
+  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+  Eigen::VectorXf in_square(in_mat.rows());
+  Eigen::VectorXf out_count(out_mat.cols());
+  out_count.setZero();
+  // Prefill the output to 0.
+  out_mat.setZero();
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < input_height; ++h) {
+      for (int w = 0; w < input_width; ++w) {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        const int hpad = h + pad_height;
+        const int wpad = w + pad_width;
+        const int h_start = (hpad < filter_height)
+                                ? 0
+                                : (hpad - filter_height) / stride_height + 1;
+        const int h_end = std::min(hpad / stride_height + 1, output_height);
+        const int w_start = (wpad < filter_width)
+                                ? 0
+                                : (wpad - filter_width) / stride_width + 1;
+        const int w_end = std::min(wpad / stride_width + 1, output_width);
+        // pre-compute square
+        const int in_offset = w + input_width * (h + input_height * b);
+        in_square =
+            in_mat.col(in_offset).array() * in_mat.col(in_offset).array();
+        // compute elementwise sum of squares
+        for (int ph = h_start; ph < h_end; ++ph) {
+          for (int pw = w_start; pw < w_end; ++pw) {
+            const int out_offset = pw + output_width * (ph + output_height * b);
+            out_mat.col(out_offset) += in_square;
+            out_count(out_offset)++;
+          }
+        }
+      }
+    }
+  }
+
+  out_count = out_count.array().inverse();
+  out_mat =
+      (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt();
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims,
+            int stride_width, int stride_height, int pad_width, int pad_height,
+            int filter_width, int filter_height, float* output_data,
+            const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
+         pad_height, filter_width, filter_height, output_activation_min,
+         output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int filter_width, int filter_height,
+            float* output_data, const Dims<4>& output_dims) {
+  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+             filter_width, filter_height, output_data, output_dims);
+}
+
+inline void LocalResponseNormalization(const float* input_data,
+                                       const Dims<4>& input_dims, int range,
+                                       float bias, float alpha, float beta,
+                                       float* output_data,
+                                       const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LocalResponseNormalization");
+  /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
+  /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
+  /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+
+  // Carry out local response normalization, vector by vector.
+  // Since the data are stored column major, making row-wise operation
+  // probably not memory efficient anyway, we do an explicit for loop over
+  // the columns.
+  const int double_range = range * 2;
+  Eigen::VectorXf padded_square(data_in.rows() + double_range);
+  padded_square.setZero();
+  for (int r = 0; r < data_in.cols(); ++r) {
+    // Do local response normalization for data_in(:, r)
+    // first, compute the square and store them in buffer for repeated use
+    padded_square.block(range, 0, data_in.rows(), 1) =
+        data_in.col(r).cwiseProduct(data_in.col(r)) * alpha;
+    // Then, compute the scale and writes them to data_out
+    float accumulated_scale = 0;
+    for (int i = 0; i < double_range; ++i) {
+      accumulated_scale += padded_square(i);
+    }
+    for (int i = 0; i < data_in.rows(); ++i) {
+      accumulated_scale += padded_square(i + double_range);
+      data_out(i, r) = bias + accumulated_scale;
+      accumulated_scale -= padded_square(i);
+    }
+  }
+
+  // In a few cases, the pow computation could benefit from speedups.
+  if (beta == 1) {
+    data_out.array() = data_in.array() * data_out.array().inverse();
+  } else if (beta == 0.5) {
+    data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
+  } else {
+    data_out.array() = data_in.array() * data_out.array().pow(-beta);
+  }
+}
+
+inline void Softmax(const float* input_data, const Dims<4>& input_dims,
+                    float beta, float* output_data,
+                    const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Softmax");
+  /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
+  /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
+  /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
+  /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
+  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
+  // Compute the exponential first, removing the max coefficient for numerical
+  // stability.
+  out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta;
+  // We are separating out the exp function so that exp can be vectorized.
+  out_mat = out_mat.array().exp();
+  // Normalize to get the activations.
+  Eigen::Array<float, 1, Eigen::Dynamic> scale =
+      out_mat.array().colwise().sum().inverse();
+  out_mat.array().rowwise() *= scale;
+}
+
+inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_beta_multiplier, int32 input_beta_left_shift,
+                    int diff_min, uint8* output_data,
+                    const Dims<4>& output_dims) {
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  gemmlowp::ScopedProfilingLabel label("Softmax");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int x = 0; x < width; ++x) {
+      for (int y = 0; y < height; ++y) {
+        uint8 max_in_row = 0;
+        for (int c = 0; c < depth; ++c) {
+          max_in_row =
+              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
+        }
+
+        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+        for (int c = 0; c < depth; ++c) {
+          int32 input_diff =
+              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+              max_in_row;
+          if (input_diff >= diff_min) {
+            const int32 input_diff_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_diff, input_beta_multiplier, input_beta_left_shift);
+            const FixedPointScaledDiff scaled_diff_f8 =
+                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+            sum_of_exps =
+                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                  exp_on_negative_values(scaled_diff_f8));
+          }
+        }
+
+        int32 fixed_sum_of_exps = sum_of_exps.raw();
+        // TODO(starka): Use a NEON intrinsic like vclzq_u32 instead.
+        int headroom_plus_one =
+            __builtin_clz(static_cast<uint32>(fixed_sum_of_exps));
+        // This is the number of bits to the left of the binary point above 1.0.
+        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
+        // no later adjustment will be needed.
+        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+        int32 shifted_sum_minus_one = static_cast<int32>(
+            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+            (static_cast<uint32>(1) << 31));
+
+        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+            FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+        for (int c = 0; c < depth; ++c) {
+          int32 input_diff =
+              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+              max_in_row;
+          if (input_diff >= diff_min) {
+            const int32 input_diff_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_diff, input_beta_multiplier, input_beta_left_shift);
+            const FixedPointScaledDiff scaled_diff_f8 =
+                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+            output_data[Offset(output_dims, c, x, y, b)] =
+                std::max(std::min(unsat_output, 255), 0);
+
+          } else {
+            output_data[Offset(output_dims, c, x, y, b)] = 0;
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Logistic");
+  auto input_map = MapAsVector(input_data, input_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  output_map.array() =
+      input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>());
+}
+
+inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
+                     int32 input_zero_point, int32 input_range_radius,
+                     int32 input_multiplier, int input_left_shift,
+                     uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Logistic");
+  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
+  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
+  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
+  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int size = RequiredBufferSizeForDims(input_dims);
+
+  int c = 0;
+#ifdef USE_NEON
+  // Handle 16 values at a time
+  for (; c <= size - 16; c += 16) {
+    // Read input uint8 values, cast to int16 and subtract input_zero_point
+    uint8x16_t input_val_u8 = vld1q_u8(input_data + c);
+    int16x8_t input_val_centered_0 =
+        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_val_u8))),
+                  vdupq_n_s16(input_zero_point));
+    int16x8_t input_val_centered_1 =
+        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_val_u8))),
+                  vdupq_n_s16(input_zero_point));
+
+    // Prepare the bit masks that we will use at the end to implement the logic
+    // that was expressed in the scalar code with branching:
+    //   if (input_val_centered < -input_range_radius) {
+    //     output_val = 0;
+    //   } else if (input_val_centered > input_range_radius) {
+    //     output_val = 255;
+    //   } else {
+    //     ...
+    uint16x8_t mask_rightclamp_0 =
+        vcgtq_s16(input_val_centered_0, vdupq_n_s16(input_range_radius));
+    uint16x8_t mask_rightclamp_1 =
+        vcgtq_s16(input_val_centered_1, vdupq_n_s16(input_range_radius));
+    uint16x8_t mask_leftclamp_0 =
+        vcgeq_s16(input_val_centered_0, vdupq_n_s16(-input_range_radius));
+    uint16x8_t mask_leftclamp_1 =
+        vcgeq_s16(input_val_centered_1, vdupq_n_s16(-input_range_radius));
+    uint8x16_t mask_rightclamp = vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
+                                             vshrn_n_u16(mask_rightclamp_1, 8));
+    uint8x16_t mask_leftclamp = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
+                                            vshrn_n_u16(mask_leftclamp_1, 8));
+
+    // This performs what is expressed in the scalar code as
+    // const int32 input_val_rescaled =
+    //     MultiplyByQuantizedMultiplierGreaterThanOne(
+    //         input_val_centered, input_multiplier, input_left_shift);
+    int32x4_t input_val_rescaled_0 =
+        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_0)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_1 =
+        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_0)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_2 =
+        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_1)),
+                  vdupq_n_s32(input_left_shift));
+    int32x4_t input_val_rescaled_3 =
+        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_1)),
+                  vdupq_n_s32(input_left_shift));
+    input_val_rescaled_0 =
+        vqrdmulhq_n_s32(input_val_rescaled_0, input_multiplier);
+    input_val_rescaled_1 =
+        vqrdmulhq_n_s32(input_val_rescaled_1, input_multiplier);
+    input_val_rescaled_2 =
+        vqrdmulhq_n_s32(input_val_rescaled_2, input_multiplier);
+    input_val_rescaled_3 =
+        vqrdmulhq_n_s32(input_val_rescaled_3, input_multiplier);
+
+    // Invoke gemmlowp::logistic on FixedPoint wrapping int32x4_t
+    using FixedPoint4 = gemmlowp::FixedPoint<int32x4_t, 4>;
+    using FixedPoint0 = gemmlowp::FixedPoint<int32x4_t, 0>;
+    const FixedPoint4 input_val_f4_0 =
+        FixedPoint4::FromRaw(input_val_rescaled_0);
+    const FixedPoint4 input_val_f4_1 =
+        FixedPoint4::FromRaw(input_val_rescaled_1);
+    const FixedPoint4 input_val_f4_2 =
+        FixedPoint4::FromRaw(input_val_rescaled_2);
+    const FixedPoint4 input_val_f4_3 =
+        FixedPoint4::FromRaw(input_val_rescaled_3);
+    const FixedPoint0 output_val_f0_0 = gemmlowp::logistic(input_val_f4_0);
+    const FixedPoint0 output_val_f0_1 = gemmlowp::logistic(input_val_f4_1);
+    const FixedPoint0 output_val_f0_2 = gemmlowp::logistic(input_val_f4_2);
+    const FixedPoint0 output_val_f0_3 = gemmlowp::logistic(input_val_f4_3);
+
+    // Divide by 2^23 as in the scalar code
+    using gemmlowp::RoundingDivideByPOT;
+    int32x4_t output_val_s32_0 = RoundingDivideByPOT(output_val_f0_0.raw(), 23);
+    int32x4_t output_val_s32_1 = RoundingDivideByPOT(output_val_f0_1.raw(), 23);
+    int32x4_t output_val_s32_2 = RoundingDivideByPOT(output_val_f0_2.raw(), 23);
+    int32x4_t output_val_s32_3 = RoundingDivideByPOT(output_val_f0_3.raw(), 23);
+
+    // Cast output values to uint8, saturating
+    int16x8_t output_val_s16_0 = vcombine_s16(vqmovn_s32(output_val_s32_0),
+                                              vqmovn_s32(output_val_s32_1));
+    int16x8_t output_val_s16_1 = vcombine_s16(vqmovn_s32(output_val_s32_2),
+                                              vqmovn_s32(output_val_s32_3));
+    uint8x16_t output_val_u8 = vcombine_u8(vqmovun_s16(output_val_s16_0),
+                                           vqmovun_s16(output_val_s16_1));
+
+    // Perform the bit-masking with the bit masks computed at the beginning,
+    // see the comment there.
+    output_val_u8 = vorrq_u8(output_val_u8, mask_rightclamp);
+    output_val_u8 = vandq_u8(output_val_u8, mask_leftclamp);
+
+    // Store back to memory
+    vst1q_u8(output_data + c, output_val_u8);
+  }
+#endif
+  // Leftover loop: handle one value at a time with scalar code.
+  for (; c < size; ++c) {
+    const uint8 input_val_u8 = input_data[c];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered < -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered > input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
+      }
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
+    }
+    output_data[c] = output_val;
+  }
+}
+
+inline void Tanh(const float* input_data, const Dims<4>& input_dims,
+                 float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Tanh");
+  auto input_map = MapAsVector(input_data, input_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  output_map.array() = input_map.array().tanh();
+}
+
+inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 zero_point, double scale, float* output_data,
+                       const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Dequantize");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          int32 val = input_data[Offset(input_dims, c, x, y, b)];
+          float result = static_cast<float>(scale * (val - zero_point));
+          output_data[Offset(output_dims, c, x, y, b)] = result;
+        }
+      }
+    }
+  }
+}
+
+inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
+                      float rmin, float rmax, float* output_data,
+                      const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("FakeQuant");
+
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  TFLITE_DCHECK_LE(rmin, 0.);
+  TFLITE_DCHECK_GE(rmax, 0.);
+
+  // Determine quantization parameters: zero_point, scale.
+  using Integer = uint8;
+  const Integer qmin = std::numeric_limits<Integer>::min();
+  const Integer qmax = std::numeric_limits<Integer>::max();
+  const float qmin_float = qmin;
+  const float qmax_float = qmax;
+  int32 zero_point = 0;
+  float scale = 0.f;
+  // If rmin==rmax, both must be zero per the above assertion,
+  // so we are done.
+  if (rmin != rmax) {
+    // First determine the scale.
+    scale = (rmax - rmin) / (qmax_float - qmin_float);
+
+    // Zero-point computation.
+    // First the initial floating-point computation. The zero-point can be
+    // determined from solving an affine equation for any known pair
+    // (real value, corresponding quantized value).
+    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+    // The arithmetic error on the zero point computed from either pair
+    // will be roughly machine_epsilon * (sum of absolute values of terms)
+    // so we want to use the variant that adds the smaller terms.
+    const float zero_point_from_min = qmin_float - rmin / scale;
+    const float zero_point_from_max = qmax_float - rmax / scale;
+    const float zero_point_from_min_error =
+        std::abs(qmin_float) + std::abs(rmin / scale);
+    const float zero_point_from_max_error =
+        std::abs(qmax_float) + std::abs(rmax / scale);
+
+    const float zero_point_float =
+        zero_point_from_min_error < zero_point_from_max_error
+            ? zero_point_from_min
+            : zero_point_from_max;
+
+    // Now we need to nudge the zero point to be an integer
+    // (our zero points are integer, and this is motivated by the requirement
+    // to be able to represent the real value "0" exactly as a quantized value,
+    // which is required in multiple places, for example in Im2col with SAME
+    // padding).
+    if (zero_point_float < qmin_float) {
+      zero_point = qmin;
+    } else if (zero_point_float > qmax_float) {
+      zero_point = qmax;
+    } else {
+      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
+    }
+    // The zero point should always be in the range of quantized value,
+    // [qmin, qmax].
+    TFLITE_DCHECK_GE(zero_point, qmin);
+    TFLITE_DCHECK_LE(zero_point, qmax);
+  }
+
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
+          const float unclamped_quantized_val =
+              TfLiteRound(zero_point + src_val / scale);
+          const float quantized_val = std::min(
+              qmax_float, std::max(qmin_float, unclamped_quantized_val));
+          const float dst_val = scale * (quantized_val - zero_point);
+          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
+        }
+      }
+    }
+  }
+}
+
+template <typename SrcT, typename DstT>
+inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
+                 DstT* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Cast");
+  auto input_map = MapAsVector(input_data, input_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  output_map.array() = input_map.array().template cast<DstT>();
+}
+
+inline void Floor(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Floor");
+  auto input_map = MapAsVector(input_data, input_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  output_map.array() = Eigen::floor(input_map.array());
+}
+
+template <typename T>
+inline void Gather(const T* input_data, const Dims<4>& input_dims,
+                   int input_rank, const int32* coords_data,
+                   const Dims<4>& coords_dims, T* output_data,
+                   const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Gather");
+
+  TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
+  int stride = input_dims.strides[input_rank - 1];
+  T* out = output_data;
+
+  for (int i = 0; i < coords_dims.sizes[0]; i++) {
+    TFLITE_DCHECK_GE(coords_data[i], 0);
+    TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
+    const T* in = input_data + coords_data[i] * stride;
+    memcpy(out, in, sizeof(T) * stride);
+    out += stride;
+  }
+}
+
+#ifdef USE_NEON
+inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
+                                 float scale, float* output_ptr) {
+  int ic = 0;
+  // Handle 32 input channels at a time.
+  for (; ic <= depth - 32; ic += 32) {
+    float32x4x2_t input[4];
+    for (int i = 0; i < 4; i++) {
+      input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
+      input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
+    }
+    float32x4x2_t acc[4];
+    for (int i = 0; i < 4; i++) {
+      acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
+      acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
+    }
+    for (int i = 0; i < 4; i++) {
+      acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
+      acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
+    }
+    for (int i = 0; i < 4; i++) {
+      vst1q_f32(output_ptr, acc[i].val[0]);
+      vst1q_f32(output_ptr + 4, acc[i].val[1]);
+      output_ptr += 8;
+    }
+    input_ptr += 32;
+  }
+  // Handle 16 input channels at a time.
+  for (; ic <= depth - 16; ic += 16) {
+    float32x4x2_t input[2];
+    for (int i = 0; i < 2; i++) {
+      input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
+      input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
+    }
+    float32x4x2_t acc[2];
+    for (int i = 0; i < 2; i++) {
+      acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
+      acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
+    }
+    for (int i = 0; i < 2; i++) {
+      acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
+      acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
+    }
+    for (int i = 0; i < 2; i++) {
+      vst1q_f32(output_ptr, acc[i].val[0]);
+      vst1q_f32(output_ptr + 4, acc[i].val[1]);
+      output_ptr += 8;
+    }
+    input_ptr += 16;
+  }
+  // Handle 8 input channels at a time.
+  for (; ic <= depth - 8; ic += 8) {
+    float32x4x2_t input;
+    input.val[0] = vld1q_f32(input_ptr);
+    input.val[1] = vld1q_f32(input_ptr + 4);
+
+    float32x4x2_t acc;
+    acc.val[0] = vld1q_f32(output_ptr);
+    acc.val[1] = vld1q_f32(output_ptr + 4);
+    acc.val[0] = vmlaq_n_f32(acc.val[0], input.val[0], scale);
+    acc.val[1] = vmlaq_n_f32(acc.val[1], input.val[1], scale);
+
+    vst1q_f32(output_ptr, acc.val[0]);
+    vst1q_f32(output_ptr + 4, acc.val[1]);
+
+    input_ptr += 8;
+    output_ptr += 8;
+  }
+  // Handle 4 input channels at a time.
+  for (; ic <= depth - 4; ic += 4) {
+    float32x4_t input = vld1q_f32(input_ptr);
+    float32x4_t acc = vld1q_f32(output_ptr);
+
+    acc = vmlaq_n_f32(acc, input, scale);
+    vst1q_f32(output_ptr, acc);
+
+    input_ptr += 4;
+    output_ptr += 4;
+  }
+  // Handle 1 input channel at a time.
+  for (; ic < depth; ic++) {
+    *output_ptr += *input_ptr * scale;
+    output_ptr++;
+    input_ptr++;
+  }
+}
+#else
+inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
+                                 float scale, float* output_ptr) {
+  for (int32 i = 0; i < depth; i++) {
+    *output_ptr += *input_ptr * scale;
+    output_ptr++;
+    input_ptr++;
+  }
+}
+#endif
+
+inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
+                                    int32 x, int32 y, int32 depth, int32 batch,
+                                    const float* input_data,
+                                    const Dims<4>& input_dims,
+                                    float* output_data,
+                                    const Dims<4>& output_dims) {
+  const int32 input_width = ArraySize(input_dims, 1);
+  const int32 output_width = ArraySize(output_dims, 1);
+
+  const int32 input_x_offset = (x1 - x0) * depth;
+  const int32 input_y_offset = (y1 - y0) * depth * input_width;
+  const int32 output_x_offset = depth;
+  const int32 output_y_offset = depth * output_width;
+
+#ifdef USE_NEON
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(x1 >= x0);
+  TFLITE_DCHECK(y1 >= y0);
+
+  int ic = 0;
+  // Handle 8 input channels at a time.
+  for (; ic <= depth - 8; ic += 8) {
+    const float* input_ptr = nullptr;
+
+    float32x4x2_t x0y0;
+    input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
+    x0y0.val[0] = vld1q_f32(input_ptr);
+    x0y0.val[1] = vld1q_f32(input_ptr + 4);
+
+    float32x4x2_t x1y0;
+    input_ptr += input_x_offset;
+    x1y0.val[0] = vld1q_f32(input_ptr);
+    x1y0.val[1] = vld1q_f32(input_ptr + 4);
+
+    float32x4x2_t x0y1;
+    input_ptr += -input_x_offset + input_y_offset;
+    x0y1.val[0] = vld1q_f32(input_ptr);
+    x0y1.val[1] = vld1q_f32(input_ptr + 4);
+
+    float32x4x2_t x1y1;
+    input_ptr += input_x_offset;
+    x1y1.val[0] = vld1q_f32(input_ptr);
+    x1y1.val[1] = vld1q_f32(input_ptr + 4);
+
+    // Top left corner.
+    float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
+    vst1q_f32(output_ptr, x0y0.val[0]);
+    vst1q_f32(output_ptr + 4, x0y0.val[1]);
+
+    // Top right corner.
+    output_ptr += output_x_offset;
+    float32x4x2_t tr;
+    tr.val[0] = vaddq_f32(x0y0.val[0], x1y0.val[0]);
+    tr.val[1] = vaddq_f32(x0y0.val[1], x1y0.val[1]);
+    tr.val[0] = vmulq_n_f32(tr.val[0], 0.5f);
+    tr.val[1] = vmulq_n_f32(tr.val[1], 0.5f);
+
+    vst1q_f32(output_ptr, tr.val[0]);
+    vst1q_f32(output_ptr + 4, tr.val[1]);
+
+    // Bottom left corner.
+    output_ptr += -output_x_offset + output_y_offset;
+    float32x4x2_t bl;
+    bl.val[0] = vaddq_f32(x0y0.val[0], x0y1.val[0]);
+    bl.val[1] = vaddq_f32(x0y0.val[1], x0y1.val[1]);
+    bl.val[0] = vmulq_n_f32(bl.val[0], 0.5f);
+    bl.val[1] = vmulq_n_f32(bl.val[1], 0.5f);
+    vst1q_f32(output_ptr, bl.val[0]);
+    vst1q_f32(output_ptr + 4, bl.val[1]);
+
+    // Bottom right corner.
+    output_ptr += output_x_offset;
+    float32x4x2_t br;
+    br.val[0] = vaddq_f32(x1y0.val[0], x1y1.val[0]);
+    br.val[1] = vaddq_f32(x1y0.val[1], x1y1.val[1]);
+    br.val[0] = vmlaq_n_f32(bl.val[0], br.val[0], 0.5f);
+    br.val[1] = vmlaq_n_f32(bl.val[1], br.val[1], 0.5f);
+    br.val[0] = vmulq_n_f32(br.val[0], 0.5f);
+    br.val[1] = vmulq_n_f32(br.val[1], 0.5f);
+    vst1q_f32(output_ptr, br.val[0]);
+    vst1q_f32(output_ptr + 4, br.val[1]);
+  }
+  // Handle 4 input channels at a time.
+  for (; ic <= depth - 4; ic += 4) {
+    const float* input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
+    float32x4_t x0y0 = vld1q_f32(input_ptr);
+    float32x4_t x1y0 = vld1q_f32(input_ptr + input_x_offset);
+    float32x4_t x0y1 = vld1q_f32(input_ptr + input_y_offset);
+    float32x4_t x1y1 = vld1q_f32(input_ptr + input_x_offset + input_y_offset);
+
+    // Top left corner.
+    float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
+    vst1q_f32(output_ptr, x0y0);
+
+    // Top right corner.
+    output_ptr += output_x_offset;
+    float32x4_t tr = vaddq_f32(x0y0, x1y0);
+    tr = vmulq_n_f32(tr, 0.5f);
+    vst1q_f32(output_ptr, tr);
+
+    // Bottom left corner.
+    output_ptr += -output_x_offset + output_y_offset;
+    float32x4_t bl = vaddq_f32(x0y0, x0y1);
+    bl = vmulq_n_f32(bl, 0.5f);
+    vst1q_f32(output_ptr, bl);
+
+    // Bottom right corner.
+    output_ptr += output_x_offset;
+    float32x4_t br = vaddq_f32(x1y0, x1y1);
+    br = vmlaq_n_f32(bl, br, 0.5f);
+    br = vmulq_n_f32(br, 0.5f);
+    vst1q_f32(output_ptr, br);
+  }
+  // Handle one input channel at a time.
+  for (; ic < depth; ic++) {
+    const int32 input_offset = Offset(input_dims, ic, x0, y0, batch);
+
+    float x0y0 = input_data[input_offset];
+    float x1y0 = input_data[input_offset + input_x_offset];
+    float x0y1 = input_data[input_offset + input_y_offset];
+    float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+    // Top left corner.
+    const int32 output_offset = Offset(output_dims, ic, x, y, batch);
+    output_data[output_offset] = x0y0;
+
+    // Top right corner.
+    output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
+
+    // Bottom left corner.
+    float output = (x0y0 + x0y1) / 2;
+    output_data[output_offset + output_y_offset] = output;
+
+    // Bottom right corner.
+    output_data[output_offset + output_x_offset + output_y_offset] =
+        (output + ((x1y0 + x1y1) / 2)) / 2;
+  }
+#else
+  for (int ch = 0; ch < depth; ch++) {
+    const int32 input_offset = Offset(input_dims, ch, x0, y0, batch);
+
+    float x0y0 = input_data[input_offset];
+    float x1y0 = input_data[input_offset + input_x_offset];
+    float x0y1 = input_data[input_offset + input_y_offset];
+    float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+    // Top left corner.
+    const int32 output_offset = Offset(output_dims, ch, x, y, batch);
+    output_data[output_offset] = x0y0;
+
+    // Top right corner.
+    output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
+
+    // Bottom left corner.
+    float output = (x0y0 + x0y1) / 2;
+    output_data[output_offset + output_y_offset] = output;
+
+    // Bottom right corner.
+    output_data[output_offset + output_x_offset + output_y_offset] =
+        (output + ((x1y0 + x1y1) / 2)) / 2;
+  }
+#endif
+}
+
+inline void ResizeBilinear2x2(const float* input_data,
+                              const Dims<4>& input_dims, float* output_data,
+                              const Dims<4>& output_dims, int32 batches,
+                              int32 input_height, int32 input_width,
+                              int32 depth, int32 output_height,
+                              int32 output_width) {
+  for (int b = 0; b < batches; b++) {
+    for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) {
+      for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) {
+        int32 x1 = std::min(x0 + 1, input_width - 1);
+        int32 y1 = std::min(y0 + 1, input_height - 1);
+        ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_data,
+                                input_dims, output_data, output_dims);
+      }
+    }
+  }
+}
+
+inline void ResizeBilinearGeneric(const float* input_data,
+                                  const Dims<4>& input_dims, float* output_data,
+                                  const Dims<4>& output_dims, int32 batches,
+                                  int32 input_height, int32 input_width,
+                                  int32 depth, int32 output_height,
+                                  int32 output_width, float height_scale,
+                                  float width_scale) {
+  memset(output_data, 0,
+         batches * output_height * output_width * depth * sizeof(float));
+
+  int32 output_offset = 0;
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      float input_y = y * height_scale;
+      int32 y0 = static_cast<int32>(std::floor(input_y));
+      int32 y1 = std::min(y0 + 1, input_height - 1);
+      for (int x = 0; x < output_width; ++x) {
+        float input_x = x * width_scale;
+        int32 x0 = static_cast<int32>(input_x);
+        int32 x1 = std::min(x0 + 1, input_width - 1);
+        float* output_ptr = &output_data[output_offset];
+
+        // Run kernel on the 4 corners of the bilinear resize algorithm.
+        int32 input_offset = Offset(input_dims, 0, x0, y0, b);
+        float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
+        const float* input_ptr = &input_data[input_offset];
+        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+        input_offset = Offset(input_dims, 0, x1, y0, b);
+        scale = (1 - (input_y - y0)) * (input_x - x0);
+        input_ptr = &input_data[input_offset];
+        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+        input_offset = Offset(input_dims, 0, x0, y1, b);
+        scale = (input_y - y0) * (1 - (input_x - x0));
+        input_ptr = &input_data[input_offset];
+        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+        input_offset = Offset(input_dims, 0, x1, y1, b);
+        scale = (input_y - y0) * (input_x - x0);
+        input_ptr = &input_data[input_offset];
+        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+        output_offset += depth;
+      }
+    }
+  }
+}
+
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
+  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  int32 input_height = ArraySize(input_dims, 2);
+  int32 input_width = ArraySize(input_dims, 1);
+  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
+  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
+  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+
+  // Specialize for 2x2 upsample.
+  if (output_height == 2 * input_height && output_width == 2 * input_width) {
+    ResizeBilinear2x2(input_data, input_dims, output_data, output_dims, batches,
+                      input_height, input_width, depth, output_height,
+                      output_width);
+  } else {
+    float height_scale = static_cast<float>(input_height) / output_height;
+    float width_scale = static_cast<float>(input_width) / output_width;
+
+    ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims,
+                          batches, input_height, input_width, depth,
+                          output_height, output_width, height_scale,
+                          width_scale);
+  }
+}
+
+template <typename T>
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims,
+                           const int32* paddings_data,
+                           const Dims<4>& paddings_dims, T* output_data,
+                           const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
+
+  const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int input_batch_size = ArraySize(input_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  const int block_shape_height = block_shape_data[0];
+  const int block_shape_width = block_shape_data[1];
+  const int padding_top = paddings_data[0];
+  const int padding_left = paddings_data[2];
+
+  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
+    int input_batch = out_b % input_batch_size;
+    int shift_w = (out_b / input_batch_size) % block_shape_width;
+    int shift_h = (out_b / input_batch_size) / block_shape_width;
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      for (int out_w = 0; out_w < output_width; ++out_w) {
+        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
+        if (out_h * block_shape_height < padding_top ||
+            out_h * block_shape_height >= padding_top + input_height ||
+            out_w * block_shape_width < padding_left ||
+            out_w * block_shape_width >= padding_left + input_width) {
+          memset(out, 0, depth * sizeof(T));
+        } else {
+          const T* in =
+              input_data +
+              Offset(input_dims, 0,
+                     (out_w * block_shape_width + shift_w) - padding_left,
+                     (out_h * block_shape_height + shift_h) - padding_top,
+                     input_batch);
+          memcpy(out, in, depth * sizeof(T));
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims, T* output_data,
+                           const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
+
+  const int output_batch_size = ArraySize(output_dims, 3);
+  const int input_batch_size = ArraySize(input_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  const int block_shape_width = block_shape_data[1];
+  const int block_shape_height = block_shape_data[0];
+
+  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      for (int in_w = 0; in_w < input_width; ++in_w) {
+        int out_batch = in_batch % output_batch_size;
+        int out_w = in_w * block_shape_width +
+                    (in_batch / output_batch_size) % block_shape_width;
+        int out_h = in_h * block_shape_height +
+                    (in_batch / output_batch_size) / block_shape_width;
+        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
+        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
+        memcpy(out, in, depth * sizeof(T));
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Pad");
+  const int output_batch = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_depth = ArraySize(output_dims, 0);
+
+  const int left_b_padding = left_paddings[3];
+  const int left_h_padding = left_paddings[2];
+  const int left_w_padding = left_paddings[1];
+  const int left_d_padding = left_paddings[0];
+
+  const int right_b_padding = right_paddings[3];
+  const int right_h_padding = right_paddings[2];
+  const int right_w_padding = right_paddings[1];
+  const int right_d_padding = right_paddings[0];
+
+  const int input_depth = ArraySize(input_dims, 0);
+
+  if (left_b_padding != 0) {
+    memset(output_data, 0,
+           left_b_padding * output_height * output_width * output_depth *
+               sizeof(T));
+  }
+  for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
+       ++out_b) {
+    if (left_h_padding != 0) {
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+             left_h_padding * output_width * output_depth * sizeof(T));
+    }
+    for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
+         ++out_h) {
+      if (left_w_padding != 0) {
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+               left_w_padding * output_depth * sizeof(T));
+      }
+      for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
+           ++out_w) {
+        if (left_d_padding != 0) {
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
+                 left_d_padding * sizeof(T));
+        }
+
+        T* out = output_data +
+                 Offset(output_dims, left_d_padding, out_w, out_h, out_b);
+        const T* in =
+            input_data + Offset(input_dims, 0, out_w - left_w_padding,
+                                out_h - left_h_padding, out_b - left_b_padding);
+        memcpy(out, in, input_depth * sizeof(T));
+
+        if (right_d_padding != 0) {
+          memset(
+              output_data + Offset(output_dims, output_depth - right_d_padding,
+                                   out_w, out_h, out_b),
+              0, right_d_padding * sizeof(T));
+        }
+      }
+      if (right_w_padding != 0) {
+        memset(
+            output_data + Offset(output_dims, 0, output_width - right_w_padding,
+                                 out_h, out_b),
+            0, right_w_padding * output_depth * sizeof(T));
+      }
+    }
+    if (right_h_padding != 0) {
+      memset(output_data + Offset(output_dims, 0, 0,
+                                  output_height - right_h_padding, out_b),
+             0, right_h_padding * output_width * output_depth * sizeof(T));
+    }
+  }
+  if (right_b_padding != 0) {
+    memset(output_data +
+               Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
+           0,
+           right_b_padding * output_height * output_width * output_depth *
+               sizeof(T));
+  }
+}
+
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& starts,
+                         const std::vector<int>& stops,
+                         const std::vector<int>& strides, T* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("StridedSlice");
+  const int start_b = (begin_mask & 8) ? 0 : starts[3];
+  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
+  const int start_h = (begin_mask & 4) ? 0 : starts[2];
+  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
+  const int start_w = (begin_mask & 2) ? 0 : starts[1];
+  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
+  const int start_d = (begin_mask & 1) ? 0 : starts[0];
+  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+
+  T* out_ptr = output_data;
+  if (strides[0] == 0) {
+    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
+      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
+        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
+          const int len = stop_d - start_d;
+          memcpy(out_ptr,
+                 input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+                 len * sizeof(T));
+          out_ptr += len;
+        }
+      }
+    }
+  } else {
+    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
+      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
+        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
+          for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
+            *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& begin, const std::vector<int>& size,
+                  T* output_data, const Dims<4>& output_dims) {
+  // TODO(dkalenichenko): This op only supports 4D tensors.
+  TFLITE_DCHECK_EQ(begin.size(), 4);
+  TFLITE_DCHECK_EQ(size.size(), 4);
+  const int start_b = begin[3];
+  const int stop_b =
+      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
+  const int start_h = begin[2];
+  const int stop_h =
+      size[2] == -1 ? input_dims.sizes[2] - start_b : start_b + size[2];
+  const int start_w = begin[1];
+  const int stop_w =
+      size[1] == -1 ? input_dims.sizes[1] - start_b : start_b + size[1];
+  const int start_d = begin[0];
+  const int stop_d =
+      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+
+  T* out_ptr = output_data;
+  for (int in_b = start_b; in_b < stop_b; ++in_b) {
+    for (int in_h = start_h; in_h < stop_h; ++in_h) {
+      for (int in_w = start_w; in_w < stop_w; ++in_w) {
+        const int len = stop_d - start_d;
+        memcpy(out_ptr,
+               input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
+               len * sizeof(T));
+        out_ptr += len;
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Mean(const T* input_data, const Dims<4>& input_dims,
+                 const std::vector<int>& reduction_indices, T* output_data,
+                 const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Mean");
+  const int output_batch = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_depth = ArraySize(output_dims, 0);
+
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+
+  // The current implementation only supports simultaneous reduction over
+  // width and height.
+  TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
+  TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
+                (reduction_indices[0] == 2 && reduction_indices[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      float value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
+        }
+      }
+      output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
+          value / (input_width * input_height);
+    }
+  }
+}
+
+template <typename T>
+void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                         const T* input2_data, const Dims<4>& input2_dims,
+                         T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
+         const Dims<4>& input2_dims, T* output_data,
+         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Sub");
+
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto input2_map = MapAsVector(input2_data, input2_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  if (AreSameDims(input1_dims, input2_dims)) {
+    output_map.array() = input1_map.array() - input2_map.array();
+  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar - input2_map.array();
+  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() - scalar;
+  } else {
+    GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
+                        output_data, output_dims);
+  }
+}
+
+template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  auto min_value = input2_data[0];
+  output_map.array() = input1_map.array().min(min_value);
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum");
+  auto input1_map = MapAsVector(input1_data, input1_dims);
+  auto output_map = MapAsVector(output_data, output_dims);
+  auto max_value = input2_data[0];
+  output_map.array() = input1_map.array().max(max_value);
+}
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
+#pragma GCC diagnostic pop
+#endif
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8be99e82fb8721ced7a3e5da686b20ce241ea2d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -0,0 +1,138 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+#define TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
+
+// TDOD(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  //  defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  //  USE_NEON
+
+namespace tflite {
+namespace tensor_utils {
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                                 int m_rows, int m_cols,
+                                                 const float* vector,
+                                                 int n_batch, float* result,
+                                                 int result_stride);
+void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                             int m_cols, const float* vector,
+                                             int n_batch, float* result,
+                                             int result_stride);
+
+// Cwise product of two vectors.
+void PortableVectorVectorCwiseProduct(const float* vector1,
+                                      const float* vector2, int v_size,
+                                      float* result);
+void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                                  int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
+                                                const float* vector2,
+                                                int v_size, float* result);
+void NeonVectorVectorCwiseProductAccumulate(const float* vector1,
+                                            const float* vector2, int v_size,
+                                            float* result);
+
+// Dot product of two vectors.
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                     int v_size);
+float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                 int v_size);
+
+// Dot product of two batch vectors.
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
+                                              const float* vector2, int v_size,
+                                              int n_batch, float* result,
+                                              int result_stride);
+void NeonBatchVectorBatchVectorDotProduct(const float* vector1,
+                                          const float* vector2, int v_size,
+                                          int n_batch, float* result,
+                                          int result_stride);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                     int v_size,
+                                                     const float* batch_vector,
+                                                     int n_batch,
+                                                     float* result);
+void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                 int v_size,
+                                                 const float* batch_vector,
+                                                 int n_batch, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+void NeonSub1Vector(const float* vector, int v_size, float* result);
+
+// Clip elements of a vector using a abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+                        float* result);
+void NeonClipVector(const float* vector, int v_size, float abs_limit,
+                    float* result);
+
+// Batch vector initialization with another vector.
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+                                     int n_batch, float* batch_vector);
+
+// Apply sigmoid to elements of a vector.
+void PortableApplySigmoidToVector(const float* vector, int v_size,
+                                  float* result);
+
+// Apply activation function to elements of a vector.
+void PortableApplyActivationToVector(const float* vector, int v_size,
+                                     TfLiteFusedActivation activation,
+                                     float* result);
+
+// Copy vector to another vector.
+void PortableCopyVector(const float* vector, int v_size, float* result);
+
+// Fill vector with 0.f.
+void PortableZeroVector(float* vector, int v_size);
+
+// Limit a float input f between +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+// Shift left a vector in place with v_size size.
+void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
+void NeonVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void PortableReductionSumVector(const float* input_vector, float* output_vector,
+                                int output_size, int reduction_size);
+void NeonReductionSumVector(const float* input_vector, float* output_vector,
+                            int output_size, int reduction_size);
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // TF_LITE_KERNELS_INTERNAL_OPTIMIZED_TENSOR_UTILS_IMPL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98f2e365c5249a6c28673fc185ebec34cc2105b2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -0,0 +1,95 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+
+namespace tflite {
+
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* right_shift) {
+  TFLITE_CHECK(double_multiplier >= 0.);
+  TFLITE_CHECK(double_multiplier < 1.);
+  if (double_multiplier == 0.) {
+    *quantized_multiplier = 0;
+    *right_shift = 0;
+    return;
+  }
+  TFLITE_CHECK(double_multiplier > 0.);
+  const double q = std::frexp(double_multiplier, right_shift);
+  *right_shift *= -1;
+
+  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
+  TFLITE_CHECK(q_fixed <= (1ll << 31));
+  if (q_fixed == (1ll << 31)) {
+    q_fixed /= 2;
+    --*right_shift;
+  }
+  TFLITE_CHECK_GE(*right_shift, 0);
+  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift) {
+  TFLITE_CHECK(double_multiplier > 1.);
+  const double q = std::frexp(double_multiplier, left_shift);
+  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
+  TFLITE_CHECK(q_fixed <= (1ll << 31));
+  if (q_fixed == (1ll << 31)) {
+    q_fixed /= 2;
+    ++*left_shift;
+  }
+  TFLITE_CHECK_GE(*left_shift, 0);
+  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
+  *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+                              int input_integer_bits,
+                              int32_t* quantized_multiplier, int* left_shift) {
+  // If the overall multiplier (input and beta) is large, then exp() of an
+  // input difference of 1 scaled by this will be large.  In other words, we
+  // can cap the multiplier and know that, when it is used, the output will be
+  // (round to) zero wherever the input is not at the maximum value.
+
+  // If the overall scale is less than one, and input_integer_bits=0, then the
+  // result is double equivalent of Q0.31 (actually with more precision). Thus
+  // this generates a Q(input_integer_bits).(31-input_integer_bits)
+  // representation.
+  const double input_beta_real_multiplier = std::min(
+      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
+
+  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
+                                   quantized_multiplier, left_shift);
+}
+
+int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
+  const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
+                                    (1ll << (31 - input_integer_bits)) /
+                                    (1ll << input_left_shift);
+  // Tighten bound using floor.  Suppose that we could use the exact value.
+  // After scaling the difference, the result would be at the maximum.  Thus we
+  // must ensure that our value has lower magnitude.
+  return static_cast<int>(std::floor(max_input_rescaled));
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..efb7191c8deb2a23ea5473ab131d2b6537202765
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
+#define PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
+
+#include <cstdint>
+
+namespace tflite {
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Restricted to the case where the multiplier < 1 (and non-negative).
+void QuantizeMultiplierSmallerThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* right_shift);
+
+// Decompose a double multiplier into a Q0.31 int32 representation of its
+// significand, and shift representation of its exponent.
+//
+// Restricted to the case where the multiplier > 1.
+void QuantizeMultiplierGreaterThanOne(double double_multiplier,
+                                      int32_t* quantized_multiplier,
+                                      int* left_shift);
+
+// This first creates a multiplier in a double equivalent of
+// Q(input_integer_bits).(31-input_integer_bits) representation, with extra
+// precision in the double's fractional bits.  It then splits the result into
+// significand and exponent.
+void PreprocessSoftmaxScaling(double beta, double input_scale,
+                              int input_integer_bits,
+                              int32_t* quantized_multiplier, int* left_shift);
+
+// Calculate the largest input that will result in a within-bounds intermediate
+// result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
+// it must not overflow before we reduce the value by multiplication by the
+// input multiplier.  The negative radius is used as the minimum difference
+// in Softmax.
+int CalculateInputRadius(int input_integer_bits, int input_left_shift);
+
+}  // namespace tflite
+
+#endif  // PHOTOS_VISION_LEARNING_TENSORFLOW_MINI_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6f306e2cbae3c780b3d773638ba46cd2abf02f5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util_test.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+using ::testing::Pair;
+
+TEST(QuantizationUtilTest, QuantizeMultiplierSmallerThanOne) {
+  auto quantize = [](double d) {
+    int32_t q;
+    int s;
+    QuantizeMultiplierSmallerThanOne(d, &q, &s);
+    return std::pair<int32_t, int>{q, s};
+  };
+
+  EXPECT_DEATH(quantize(-0.1), "");
+  EXPECT_THAT(quantize(0.0), Pair(0, 0));
+  EXPECT_THAT(quantize(0.25), Pair(1073741824, 1));
+
+  // Around 0.5 we can see the change in exponent and how we try hard to
+  // void hitting max int32.
+  EXPECT_THAT(quantize(0.50 - 5e-9), Pair(2147483627, 1));
+  EXPECT_THAT(quantize(0.50 - 1e-10), Pair(1073741824, 0));
+  EXPECT_THAT(quantize(0.50), Pair(1073741824, 0));
+
+  EXPECT_THAT(quantize(0.75), Pair(1610612736, 0));
+  EXPECT_THAT(quantize(1 - 1e-9), Pair(2147483646, 0));
+
+  // If we get close enough to 1.0 it crashes and dies in one of two ways:
+  // Either the shift becomes negative or we trigger the 'less-than-one' CHECK.
+  EXPECT_DEATH(quantize(1 - 1e-15), "");
+  EXPECT_DEATH(quantize(1 - 1e-17), "");
+  EXPECT_DEATH(quantize(1.0), "");
+}
+
+TEST(QuantizationUtilTest, QuantizeMultiplierGreaterThanOne) {
+  auto quantize = [](double d) {
+    int32_t q;
+    int s;
+    QuantizeMultiplierGreaterThanOne(d, &q, &s);
+    return std::pair<int32_t, int>{q, s};
+  };
+
+  // If we are close enough to 1.0 it crashes.
+  EXPECT_DEATH(quantize(1 + 1e-16), "");
+
+  EXPECT_THAT(quantize(1 + 1e-11), Pair(1073741824, 1));
+  EXPECT_THAT(quantize(1.25), Pair(1342177280, 1));
+  EXPECT_THAT(quantize(1.50), Pair(1610612736, 1));
+  EXPECT_THAT(quantize(1.75), Pair(1879048192, 1));
+
+  // Around the powers of two we see the change in exponent. Also,
+  // we try hard to avoid hitting max int32.
+  EXPECT_THAT(quantize(2 - 1e-9), Pair(2147483647, 1));
+  EXPECT_THAT(quantize(2 - 1e-11), Pair(1073741824, 2));
+  EXPECT_THAT(quantize(2), Pair(1073741824, 2));
+}
+
+TEST(QuantizationUtilTest, PreprocessSoftmaxScaling) {
+  auto quantize = [](double beta, double scale, int integer_bits) {
+    int32_t q;
+    int s;
+    PreprocessSoftmaxScaling(beta, scale, integer_bits, &q, &s);
+    return std::pair<int32_t, int>{q, s};
+  };
+
+  // If beta * scale is greater than fits in the number of integer bits, the
+  // result is move near the maximum. Otherwise they quantize as expected.
+  // With 4 integer bits we can represent up to 16.0.
+  EXPECT_THAT(quantize(1.0, 16.0, 4), Pair(2147483647, 31));
+  EXPECT_THAT(quantize(1.0, 8.0, 4), Pair(1073741824, 31));
+  // But with 5 bits we can go further.
+  EXPECT_THAT(quantize(2.0, 16.0, 5), Pair(2147483647, 31));
+  EXPECT_THAT(quantize(2.0, 8.0, 5), Pair(1073741824, 31));
+}
+
+TEST(QuantizationUtilTest, CalculateInputRadius) {
+  EXPECT_EQ(CalculateInputRadius(4, 27), 15);
+  EXPECT_EQ(CalculateInputRadius(3, 27), 14);
+  EXPECT_EQ(CalculateInputRadius(3, 28), 7);
+  EXPECT_EQ(CalculateInputRadius(4, 2), 503316480);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e0f234545e43dd8b2412e065aaecad8325a1182
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_float.h
@@ -0,0 +1,115 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                          const float* filter_data, const Dims<4>& filter_dims,
+                          const float* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          float output_activation_min,
+                          float output_activation_max, float* output_data,
+                          const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int ic = 0; ic < input_depth; ++ic) {
+          for (int m = 0; m < depth_multiplier; m++) {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            float total = 0.f;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + filter_x;
+                const int in_y = in_y_origin + filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  float input_value =
+                      input_data[Offset(input_dims, ic, in_x, in_y, b)];
+                  float filter_value = filter_data[Offset(
+                      filter_dims, oc, filter_x, filter_y, 0)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+            float bias_value = 0.0f;
+            if (bias_data) {
+              bias_value = bias_data[Offset(bias_dims, oc, 0, 0, 0)];
+            }
+            output_data[Offset(output_dims, oc, out_x, out_y, b)] =
+                ActivationFunctionWithMinMax(total + bias_value,
+                                             output_activation_min,
+                                             output_activation_max);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, float* output_data,
+                   const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  DepthwiseConv(input_data, input_dims, filter_data, filter_dims, bias_data,
+                bias_dims, stride_width, stride_height, pad_width, pad_height,
+                depth_multiplier, output_activation_min, output_activation_max,
+                output_data, output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const float* input_data, const Dims<4>& input_dims,
+                   const float* filter_data, const Dims<4>& filter_dims,
+                   const float* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   float* output_data, const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+                    bias_dims, stride, stride, pad_width, pad_height,
+                    depth_multiplier, output_data, output_dims);
+}
+
+}  // end namespace reference_ops
+}  // end namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_FLOAT_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a80558b32f2858778460956cd9f57617674e21e
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -0,0 +1,138 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                          int32 input_offset, const uint8* filter_data,
+                          const Dims<4>& filter_dims, int32 filter_offset,
+                          const int32* bias_data, const Dims<4>& bias_dims,
+                          int stride_width, int stride_height, int pad_width,
+                          int pad_height, int depth_multiplier,
+                          int32 output_offset, int32 output_multiplier,
+                          int output_shift, int32 output_activation_min,
+                          int32 output_activation_max, uint8* output_data,
+                          const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int ic = 0; ic < input_depth; ++ic) {
+          for (int m = 0; m < depth_multiplier; m++) {
+            const int oc = m + ic * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + filter_x;
+                const int in_y = in_y_origin + filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  int32 input_val =
+                      input_data[Offset(input_dims, ic, in_x, in_y, b)];
+                  int32 filter_val = filter_data[Offset(filter_dims, oc,
+                                                        filter_x, filter_y, 0)];
+                  acc +=
+                      (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[Offset(bias_dims, oc, 0, 0, 0)];
+            }
+            acc = MultiplyByQuantizedMultiplierSmallerThanOne(
+                acc, output_multiplier, output_shift);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_dims, oc, out_x, out_y, b)] =
+                static_cast<uint8>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int depth_multiplier, int32 output_offset,
+                   int32 output_multiplier, int output_shift,
+                   int32 output_activation_min, int32 output_activation_max,
+                   uint8* output_data, const Dims<4>& output_dims) {
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
+                filter_offset, bias_data, bias_dims, stride_width,
+                stride_height, pad_width, pad_height, depth_multiplier,
+                output_offset, output_multiplier, output_shift,
+                output_activation_min, output_activation_max, output_data,
+                output_dims);
+}
+
+// Legacy, for compatibility with old checked-in code.
+template <FusedActivationFunctionType Ac>
+void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
+                   int32 input_offset, const uint8* filter_data,
+                   const Dims<4>& filter_dims, int32 filter_offset,
+                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
+                   int pad_width, int pad_height, int depth_multiplier,
+                   int32 output_offset, int32 output_multiplier,
+                   int output_shift, int32 output_activation_min,
+                   int32 output_activation_max, uint8* output_data,
+                   const Dims<4>& output_dims) {
+  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
+                    filter_dims, filter_offset, bias_data, bias_dims, stride,
+                    stride, pad_width, pad_height, depth_multiplier,
+                    output_offset, output_multiplier, output_shift,
+                    output_activation_min, output_activation_max, output_data,
+                    output_dims);
+}
+
+}  // end namespace reference_ops
+}  // end namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b0bccc9da5fa2ff9c3a9d430725b613435abf1
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -0,0 +1,165 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+float PortableClip(float f, float abs_limit) {
+  float result = (abs_limit < f) ? abs_limit : f;
+  result = (-abs_limit > result) ? -abs_limit : result;
+  return result;
+}
+
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                                 int m_rows, int m_cols,
+                                                 const float* vector,
+                                                 int n_batch, float* result,
+                                                 int result_stride) {
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    for (int r = 0; r < m_rows; r++) {
+      const float* vector_in_batch = vector + b * m_cols;
+      for (int c = 0; c < m_cols; c++) {
+        *result_in_batch += *matrix_ptr++ * *vector_in_batch++;
+      }
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void PortableVectorVectorCwiseProduct(const float* vector1,
+                                      const float* vector2, int v_size,
+                                      float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = *vector1++ * *vector2++;
+  }
+}
+
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                     int v_size) {
+  float result = 0.0;
+  for (int v = 0; v < v_size; v++) {
+    result += *vector1++ * *vector2++;
+  }
+  return result;
+}
+
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
+                                              const float* vector2, int v_size,
+                                              int n_batch, float* result,
+                                              int result_stride) {
+  float* result_ptr = result;
+  const float* vector1_ptr = vector1;
+  const float* vector2_ptr = vector2;
+  for (int b = 0; b < n_batch; b++) {
+    *result_ptr =
+        PortableVectorVectorDotProduct(vector1_ptr, vector2_ptr, v_size);
+    vector1_ptr += v_size;
+    vector2_ptr += v_size;
+    result_ptr += result_stride;
+  }
+}
+
+void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
+                                                const float* vector2,
+                                                int v_size, float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ += *vector1++ * *vector2++;
+  }
+}
+
+void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                     int v_size,
+                                                     const float* batch_vector,
+                                                     int n_batch,
+                                                     float* result) {
+  for (int b = 0; b < n_batch; b++) {
+    for (int v = 0; v < v_size; v++) {
+      *result++ += vector[v] * *batch_vector++;
+    }
+  }
+}
+
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+                                     int n_batch, float* batch_vector) {
+  for (int b = 0; b < n_batch; b++) {
+    memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
+  }
+}
+
+void PortableApplySigmoidToVector(const float* vector, int v_size,
+                                  float* result) {
+  auto sigmoid_func = ActivationFunctor(kTfLiteActSigmoid);
+  for (int v = 0; v < v_size; v++) {
+    *result++ = (sigmoid_func)(*vector++);
+  }
+}
+
+void PortableApplyActivationToVector(const float* vector, int v_size,
+                                     TfLiteFusedActivation activation,
+                                     float* result) {
+  auto activation_func = ActivationFunctor(activation);
+  for (int v = 0; v < v_size; v++) {
+    *result++ = (activation_func)(*vector++);
+  }
+}
+
+void PortableCopyVector(const float* vector, int v_size, float* result) {
+  memcpy(result, vector, v_size * sizeof(float));
+}
+
+void PortableSub1Vector(const float* vector, int v_size, float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = 1.0f - *vector++;
+  }
+}
+
+void PortableZeroVector(float* vector, int v_size) {
+  memset(vector, 0, v_size * sizeof(float));
+}
+
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+                        float* result) {
+  for (int v = 0; v < v_size; v++) {
+    *result++ = PortableClip(*vector++, abs_limit);
+  }
+}
+
+void PortableVectorShiftLeft(float* vector, int v_size, float shift_value) {
+  TF_LITE_ASSERT(v_size > 0);
+  for (int i = 0; i < v_size - 1; i++) {
+    vector[i] = vector[i + 1];
+  }
+  vector[v_size - 1] = shift_value;
+}
+
+void PortableReductionSumVector(const float* input_vector, float* output_vector,
+                                int output_size, int reduction_size) {
+  const float* input_vector_ptr = input_vector;
+  for (int o = 0; o < output_size; o++) {
+    for (int r = 0; r < reduction_size; r++) {
+      output_vector[o] += *input_vector_ptr++;
+    }
+  }
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2ab78000b81485f037c507933cd024e70f39850
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -0,0 +1,189 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
+
+// TDOD(ghodrat): Remove this header file and the dependency to internal data
+// structure.
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+// Limit a float input f betweeen +abs_limit and -abs_limit.
+float PortableClip(float f, float abs_limit);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector.
+void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
+                                                 int m_rows, int m_cols,
+                                                 const float* vector,
+                                                 int n_batch, float* result,
+                                                 int result_stride);
+
+// Cwise product of two vectors.
+void PortableVectorVectorCwiseProduct(const float* vector1,
+                                      const float* vector2, int v_size,
+                                      float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
+// assumption here is that result array is initialized to valid values.
+void PortableVectorVectorCwiseProductAccumulate(const float* vector1,
+                                                const float* vector2,
+                                                int v_size, float* result);
+
+// Dot product of two vectors.
+float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
+                                     int v_size);
+
+// Dot product of two batch vectors.
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1,
+                                              const float* vector2, int v_size,
+                                              int n_batch, float* result,
+                                              int result_stride);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector,
+                                                     int v_size,
+                                                     const float* batch_vector,
+                                                     int n_batch,
+                                                     float* result);
+
+// Batch vector initialization with another vector.
+void PortableVectorBatchVectorAssign(const float* vector, int v_size,
+                                     int n_batch, float* batch_vector);
+
+// Apply sigmoid to elements of a vector.
+void PortableApplySigmoidToVector(const float* vector, int v_size,
+                                  float* result);
+
+// Apply activation function to elements of a vector.
+void PortableApplyActivationToVector(const float* vector, int v_size,
+                                     TfLiteFusedActivation activation,
+                                     float* result);
+
+// Copy vector to another vector.
+void PortableCopyVector(const float* vector, int v_size, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void PortableSub1Vector(const float* vector, int v_size, float* result);
+
+// Fill vector with 0.f.
+void PortableZeroVector(float* vector, int v_size);
+
+// Clip elements of a vector using a abs_limit value.
+void PortableClipVector(const float* vector, int v_size, float abs_limit,
+                        float* result);
+
+// Shift left a vector in place with v_size size.
+void PortableVectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void PortableReductionSumVector(const float* input_vector, float* output_vector,
+                                int output_size, int reduction_size);
+
+float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); }
+
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result,
+                                         int result_stride) {
+  PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector,
+                                              n_batch, result, result_stride);
+}
+
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                              int v_size, float* result) {
+  PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
+}
+
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2, int v_size,
+                                        float* result) {
+  PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result);
+}
+
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+                                             const float* batch_vector,
+                                             int n_batch, float* result) {
+  PortableVectorBatchVectorCwiseProductAccumulate(vector, v_size, batch_vector,
+                                                  n_batch, result);
+}
+
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+                             int v_size) {
+  return PortableVectorVectorDotProduct(vector1, vector2, v_size);
+}
+
+void BatchVectorBatchVectorDotProduct(const float* vector1,
+                                      const float* vector2, int v_size,
+                                      int n_batch, float* result,
+                                      int result_stride) {
+  PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
+                                           result, result_stride);
+}
+
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+                             float* batch_vector) {
+  PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
+}
+
+void ApplySigmoidToVector(const float* vector, int v_size, float* result) {
+  PortableApplySigmoidToVector(vector, v_size, result);
+}
+
+void ApplyActivationToVector(const float* vector, int v_size,
+                             TfLiteFusedActivation activation, float* result) {
+  PortableApplyActivationToVector(vector, v_size, activation, result);
+}
+
+void CopyVector(const float* vector, int v_size, float* result) {
+  PortableCopyVector(vector, v_size, result);
+}
+
+void Sub1Vector(const float* vector, int v_size, float* result) {
+  PortableSub1Vector(vector, v_size, result);
+}
+
+void ZeroVector(float* vector, int v_size) {
+  PortableZeroVector(vector, v_size);
+}
+
+void ClipVector(const float* vector, int v_size, float abs_limit,
+                float* result) {
+  PortableClipVector(vector, v_size, abs_limit, result);
+}
+
+void VectorShiftLeft(float* vector, int v_size, float shift_value) {
+  PortableVectorShiftLeft(vector, v_size, shift_value);
+}
+
+void ReductionSumVector(const float* input_vector, float* output_vector,
+                        int output_size, int reduction_size) {
+  PortableReductionSumVector(input_vector, output_vector, output_size,
+                             reduction_size);
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9ca3d5c626dff4ea8ba52949e8fea8e9b43689f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -0,0 +1,2455 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "public/gemmlowp.h"
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_ops {
+
+inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
+    int32 x, int32 quantized_multiplier, int right_shift) {
+  using gemmlowp::RoundingDivideByPOT;
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return RoundingDivideByPOT(
+      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
+}
+
+inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
+    int32 x, int32 quantized_multiplier, int left_shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                           quantized_multiplier);
+}
+
+template <typename T>
+int CountLeadingZeros(T integer_input) {
+  static_assert(std::is_unsigned<T>::value,
+                "Only unsigned integer types handled.");
+  const T one_in_leading_positive = static_cast<T>(1)
+                                    << (std::numeric_limits<T>::digits - 1);
+  int leading_zeros = 0;
+  while (integer_input < one_in_leading_positive) {
+    integer_input <<= 1;
+    ++leading_zeros;
+  }
+  return leading_zeros;
+}
+
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+  // The "extent" of each dimension. Indices along dimension d must be in the
+  // half-open interval [0, extents[d]).
+  int extents[N];
+
+  // The number of *elements* (not bytes) between consecutive indices of each
+  // dimension.
+  int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// ELEMENT-WISE BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+                            int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+         i3 * desc.strides[3];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+//   both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+//   array0 to be referenced *at any index* in dimension d and still access the
+//   same slice.
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+                                                const Dims<N>& input1_dims,
+                                                NdArrayDesc<N>* desc0_out,
+                                                NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  // Copy dims to desc.
+  for (int i = 0; i < N; ++i) {
+    desc0_out->extents[i] = input0_dims.sizes[i];
+    desc0_out->strides[i] = input0_dims.strides[i];
+    desc1_out->extents[i] = input1_dims.sizes[i];
+    desc1_out->strides[i] = input1_dims.strides[i];
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = ArraySize(input0_dims, i);
+    const int extent1 = ArraySize(input1_dims, i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+inline void Conv(const float* input_data, const Dims<4>& input_dims,
+                 const float* filter_data, const Dims<4>& filter_dims,
+                 const float* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, float output_activation_min,
+                 float output_activation_max, float* output_data,
+                 const Dims<4>& output_dims, float* im2col_data,
+                 const Dims<4>& im2col_dims) {
+  (void)im2col_data;  // only used in optimized code.
+  (void)im2col_dims;  // only used in optimized code.
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
+  }
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          float total = 0.f;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + filter_x;
+                const int in_y = in_y_origin + filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  float input_value = input_data[Offset(input_dims, in_channel,
+                                                        in_x, in_y, batch)];
+                  float filter_value =
+                      filter_data[Offset(filter_dims, in_channel, filter_x,
+                                         filter_y, out_channel)];
+                  total += (input_value * filter_value);
+                }
+              }
+            }
+          }
+          float bias_value = 0.0f;
+          if (bias_data) {
+            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
+          }
+          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
+              ActivationFunctionWithMinMax(total + bias_value,
+                                           output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
+          int stride_height, int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
+       stride_width, stride_height, pad_width, pad_height,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const float* input_data, const Dims<4>& input_dims,
+          const float* filter_data, const Dims<4>& filter_dims,
+          const float* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, float* output_data,
+          const Dims<4>& output_dims, float* im2col_data,
+          const Dims<4>& im2col_dims) {
+  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
+           bias_dims, stride, stride, pad_width, pad_height, output_data,
+           output_dims, im2col_data, im2col_dims);
+}
+
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  (void)im2col_data;   // only used in optimized code.
+  (void)im2col_dims;   // only used in optimized code.
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int output_depth =
+      MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32 acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + filter_x;
+                const int in_y = in_y_origin + filter_y;
+                // If the location is outside the bounds of the input image,
+                // use zero as a default value.
+                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height)) {
+                  int32 input_val = input_data[Offset(input_dims, in_channel,
+                                                      in_x, in_y, batch)];
+                  int32 filter_val =
+                      filter_data[Offset(filter_dims, in_channel, filter_x,
+                                         filter_y, out_channel)];
+                  acc +=
+                      (filter_val + filter_offset) * (input_val + input_offset);
+                }
+              }
+            }
+          }
+          if (bias_data) {
+            acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
+          }
+          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
+              acc, output_multiplier, output_shift);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
+              static_cast<uint8>(acc);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
+                 int32 input_offset, const uint8* filter_data,
+                 const Dims<4>& filter_dims, int32 filter_offset,
+                 const int32* bias_data, const Dims<4>& bias_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int32 output_offset, int32 output_multiplier,
+                 int output_shift, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims, uint8* im2col_data,
+                 const Dims<4>& im2col_dims,
+                 gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
+       filter_offset, bias_data, bias_dims, stride_width, stride_height,
+       pad_width, pad_height, output_offset, output_multiplier, output_shift,
+       output_activation_min, output_activation_max, output_data, output_dims,
+       im2col_data, im2col_dims, gemm_context);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Conv(const uint8* input_data, const Dims<4>& input_dims,
+          int32 input_offset, const uint8* filter_data,
+          const Dims<4>& filter_dims, int32 filter_offset,
+          const int32* bias_data, const Dims<4>& bias_dims, int stride,
+          int pad_width, int pad_height, int32 output_offset,
+          int32 output_multiplier, int output_shift,
+          int32 output_activation_min, int32 output_activation_max,
+          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
+          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
+  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
+           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
+           pad_height, output_offset, output_multiplier, output_shift,
+           output_activation_min, output_activation_max, output_data,
+           output_dims, im2col_data, im2col_dims, gemm_context);
+}
+
+template <typename T>
+inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  const int input_depth = ArraySize(input_dims, 0);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_batch = ArraySize(input_dims, 3);
+
+  const int output_depth = ArraySize(output_dims, 0);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_batch = ArraySize(output_dims, 3);
+
+  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
+  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
+  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
+  TFLITE_DCHECK_EQ(input_batch, output_batch);
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      for (int out_w = 0; out_w < output_width; ++out_w) {
+        for (int out_d = 0; out_d < output_depth; ++out_d) {
+          const int in_d =
+              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
+                          output_depth;
+          const int in_w = out_w / block_size;
+          const int in_h = out_h / block_size;
+          const int in_b = out_b;
+
+          const int output_index =
+              Offset(output_dims, out_d, out_w, out_h, out_b);
+          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);
+
+          output_data[output_index] = input_data[input_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
+                         int block_size, T* output_data,
+                         const Dims<4>& output_dims) {
+  const int input_depth = ArraySize(input_dims, 0);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_batch = ArraySize(input_dims, 3);
+
+  const int output_depth = ArraySize(output_dims, 0);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_batch = ArraySize(output_dims, 3);
+
+  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
+  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
+  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
+  TFLITE_DCHECK_EQ(input_batch, output_batch);
+
+  for (int in_b = 0; in_b < input_batch; ++in_b) {
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      for (int in_w = 0; in_w < input_width; ++in_w) {
+        for (int in_d = 0; in_d < input_depth; ++in_d) {
+          const int out_d =
+              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
+                         input_depth;
+          const int out_w = in_w / block_size;
+          const int out_h = in_h / block_size;
+          const int out_b = in_b;
+
+          const int output_index =
+              Offset(output_dims, out_d, out_w, out_h, out_b);
+          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);
+
+          output_data[output_index] = input_data[input_index];
+        }
+      }
+    }
+  }
+}
+
+inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                           const float* weights_data,
+                           const Dims<4>& weights_dims, const float* bias_data,
+                           const Dims<4>& bias_dims,
+                           float output_activation_min,
+                           float output_activation_max, float* output_data,
+                           const Dims<4>& output_dims) {
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(weights_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      float total = 0.f;
+      for (int d = 0; d < accum_depth; ++d) {
+        total += input_data[b * accum_depth + d] *
+                 weights_data[out_c * accum_depth + d];
+      }
+      float bias_value = 0.0f;
+      if (bias_data) {
+        bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
+      }
+      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
+          total + bias_value, output_activation_min, output_activation_max);
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const float* input_data, const Dims<4>& input_dims,
+                    const float* weights_data, const Dims<4>& weights_dims,
+                    const float* bias_data, const Dims<4>& bias_dims,
+                    float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
+                 bias_dims, output_activation_min, output_activation_max,
+                 output_data, output_dims);
+}
+
+inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                           int32 input_offset, const uint8* filter_data,
+                           const Dims<4>& filter_dims, int32 filter_offset,
+                           const int32* bias_data, const Dims<4>& bias_dims,
+                           int32 output_offset, int32 output_multiplier,
+                           int output_shift, int32 output_activation_min,
+                           int32 output_activation_max, uint8* output_data,
+                           const Dims<4>& output_dims,
+                           gemmlowp::GemmContext* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  // TODO(benoitjacob): This really should be:
+  //     const int batches = ArraySize(output_dims, 1);
+  // but the current --variable_batch hack consists in overwriting the 3rd
+  // dimension with the runtime batch size, as we don't keep track for each
+  // array of which dimension is the batch dimension in it.
+  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                      ArraySize(output_dims, 3);
+  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
+  const int accum_depth = ArraySize(filter_dims, 0);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32 input_val = input_data[b * accum_depth + d];
+        int32 filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
+      }
+      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
+                                                        output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_offset, const uint8* filter_data,
+                    const Dims<4>& filter_dims, int32 filter_offset,
+                    const int32* bias_data, const Dims<4>& bias_dims,
+                    int32 output_offset, int32 output_multiplier,
+                    int output_shift, int32 output_activation_min,
+                    int32 output_activation_max, uint8* output_data,
+                    const Dims<4>& output_dims,
+                    gemmlowp::GemmContext* gemm_context) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
+                 filter_offset, bias_data, bias_dims, output_offset,
+                 output_multiplier, output_shift, output_activation_min,
+                 output_activation_max, output_data, output_dims, gemm_context);
+}
+
+template <FusedActivationFunctionType Ac>
+void NonGlobalBatchNormalization(
+    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
+    const Dims<4>& mean_dims, const float* multiplier_data,
+    const Dims<4>& multiplier_dims, const float* offset_data,
+    const Dims<4>& offset_dims, float* output_data,
+    const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
+                        offset_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
+                        offset_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+                        offset_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              (input_data[Offset(input_dims, c, x, y, b)] -
+               mean_data[Offset(mean_dims, c, x, y, 0)]) *
+                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
+              offset_data[Offset(offset_dims, c, x, y, 0)]);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void GlobalBatchNormalization(const float* input_data,
+                              const Dims<4>& input_dims, const float* mean_data,
+                              const Dims<4>& mean_dims,
+                              const float* multiplier_data,
+                              const Dims<4>& multiplier_dims,
+                              const float* offset_data,
+                              const Dims<4>& offset_dims, float* output_data,
+                              const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
+                        offset_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              (input_data[Offset(input_dims, c, x, y, b)] -
+               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
+                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
+              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
+        }
+      }
+    }
+  }
+}
+
+inline void Relu(const float* input_data, const Dims<4>& input_dims,
+                 float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float lower = 0;
+          float clamped = val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
+        }
+      }
+    }
+  }
+}
+
+inline void Relu1(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float upper = 1;
+          const float lower = -1;
+          float clamped = val > upper ? upper : val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
+        }
+      }
+    }
+  }
+}
+
+inline void Relu6(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          const float upper = 6;
+          const float lower = 0;
+          float clamped = val > upper ? upper : val < lower ? lower : val;
+          output_data[Offset(output_dims, c, x, y, b)] = clamped;
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void L2Normalization(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone, "");
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        float squared_l2_norm = 0;
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          squared_l2_norm += val * val;
+        }
+        float l2_norm = std::sqrt(squared_l2_norm);
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
+        }
+      }
+    }
+  }
+}
+
+inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
+                                          int* output_shift) {
+  *output_shift = 11;
+  while (input >= (1 << 29)) {
+    input /= 4;
+    ++*output_shift;
+  }
+  TFLITE_DCHECK_GT(input, 0);
+  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
+  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+  *output_shift -= left_shift_bit_pairs;
+  input <<= 2 * left_shift_bit_pairs;
+  TFLITE_DCHECK_GE(input, (1 << 27));
+  TFLITE_DCHECK_LT(input, (1 << 29));
+  using gemmlowp::FixedPoint;
+  using gemmlowp::Rescale;
+  using gemmlowp::SaturatingRoundingMultiplyByPOT;
+  // Using 3 integer bits gives us enough room for the internal arithmetic in
+  // this Newton-Raphson iteration.
+  using F3 = FixedPoint<int32, 3>;
+  using F0 = FixedPoint<int32, 0>;
+  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+  const F3 fixedpoint_half_input =
+      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+  const F3 fixedpoint_half_three =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+  // Newton-Raphson iteration
+  // Naive unoptimized starting guess: x = 1
+  F3 x = F3::One();
+  // Naive unoptimized number of iterations: 5
+  for (int i = 0; i < 5; i++) {
+    const F3 x3 = Rescale<3>(x * x * x);
+    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+  }
+  const F0 fixedpoint_half_sqrt_2 =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+  x = x * fixedpoint_half_sqrt_2;
+  *output_inv_sqrt = x.raw();
+  if (*output_shift < 0) {
+    *output_inv_sqrt <<= -*output_shift;
+    *output_shift = 0;
+  }
+}
+
+inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
+                            int32 input_zero_point, uint8* output_data,
+                            const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  TFLITE_DCHECK_EQ(batches, 1);
+  TFLITE_DCHECK_EQ(height, 1);
+  TFLITE_DCHECK_EQ(width, 1);
+  int32 square_l2_norm = 0;
+  for (int i = 0; i < depth; i++) {
+    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
+    square_l2_norm += diff * diff;
+  }
+  int32 inv_l2norm_multiplier;
+  int inv_l2norm_shift;
+  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
+                                &inv_l2norm_shift);
+
+  for (int i = 0; i < depth; i++) {
+    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
+    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
+        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+    int32 unclamped_output_val = 128 + rescaled_diff;
+    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
+    output_data[Offset(output_dims, i, 0, 0, 0)] =
+        static_cast<uint8>(output_val);
+  }
+}
+
+inline void Add(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] +
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+                const Dims<4>& input1_dims, int32 input1_offset,
+                int32 input1_multiplier, int input1_shift,
+                const uint8* input2_data, const Dims<4>& input2_dims,
+                int32 input2_offset, int32 input2_multiplier, int input2_shift,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sum, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <FusedActivationFunctionType Ac>
+void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
+                  const float* input2_data, const Dims<4>& input2_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 shifted_input1_val = input1_val * (1 << left_shift);
+          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+          const int32 scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input1_val, input1_multiplier, input1_shift);
+          const int32 scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  shifted_input2_val, input2_multiplier, input2_shift);
+          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32 raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  raw_sum, output_multiplier, output_shift) +
+              output_offset;
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, raw_output));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
+               input1_multiplier, input1_shift, input2_data, input2_dims,
+               input2_offset, input2_multiplier, input2_shift, output_offset,
+               output_multiplier, output_shift, output_activation_min,
+               output_activation_max, output_data, output_dims);
+}
+
+inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int batches =
+      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
+  const int height =
+      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
+  const int width =
+      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
+  const int depth =
+      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[Offset(input1_dims, c, x, y, b)] *
+                      input2_data[Offset(input2_dims, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Mul(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
+      output_activation_max, output_data, output_dims);
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+template <FusedActivationFunctionType Ac>
+void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
+                  const float* input2_data, const Dims<4>& input2_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
+
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest
+  // stride, typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for
+  // the best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          const int32 input1_val =
+              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+          const int32 input2_val =
+              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+          const int32 unclamped_result =
+              output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOne(
+                  input1_val * input2_val, output_multiplier, output_shift);
+          const int32 clamped_output =
+              std::min(output_activation_max,
+                       std::max(output_activation_min, unclamped_result));
+          output_data[Offset(output_dims, c, x, y, b)] =
+              static_cast<uint8>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
+                         int32 input1_offset, const uint8* input2_data,
+                         const Dims<4>& input2_dims, int32 input2_offset,
+                         int32 output_offset, int32 output_multiplier,
+                         int output_shift, int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
+               input2_dims, input2_offset, output_offset, output_multiplier,
+               output_shift, output_activation_min, output_activation_max,
+               output_data, output_dims);
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void Concatenation(int concat_dim, const Scalar* const* input_data,
+                   const Dims<4>* const* input_dims, int inputs_count,
+                   Scalar* output_data, const Dims<4>& output_dims) {
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  Scalar* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      memcpy(output_ptr, input_data[i] + k * copy_size,
+             copy_size * sizeof(Scalar));
+      output_ptr += copy_size;
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void DepthConcatenation(const Scalar* const* input_data,
+                        const Dims<4>* const* input_dims, int inputs_count,
+                        Scalar* output_data, const Dims<4>& output_dims) {
+  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
+                            output_data, output_dims);
+}
+
+inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
+                     const float* prev_activ_data,
+                     const Dims<4>& prev_activ_dims, const float* weights_data,
+                     const Dims<4>& weights_dims, const float* bias_data,
+                     const Dims<4>& bias_dims, const float* prev_state_data,
+                     const Dims<4>& prev_state_dims, float* output_state_data,
+                     const Dims<4>& output_state_dims, float* output_activ_data,
+                     const Dims<4>& output_activ_dims, float* concat_temp_data,
+                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
+                     const Dims<4>& activ_temp_dims) {
+  const int batches =
+      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
+                        output_state_dims, 3, output_activ_dims, 3);
+  const int height =
+      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
+                        output_state_dims, 2, output_activ_dims, 2);
+  const int width =
+      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
+                        output_state_dims, 1, output_activ_dims, 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
+  const int total_input_depth = prev_activ_depth + input_depth;
+  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
+  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
+                  1);
+  const int intern_activ_depth =
+      MatchingArraySize(weights_dims, 1, bias_dims, 0);
+  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
+  const int output_depth =
+      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
+                        output_state_dims, 0, output_activ_dims, 0);
+  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
+
+  // Concatenate prev_activ and input data together
+  std::vector<float const*> concat_input_arrays_data;
+  std::vector<Dims<4> const*> concat_input_arrays_dims;
+  concat_input_arrays_data.push_back(input_data);
+  concat_input_arrays_data.push_back(prev_activ_data);
+  concat_input_arrays_dims.push_back(&input_dims);
+  concat_input_arrays_dims.push_back(&prev_activ_dims);
+  Concatenation<FusedActivationFunctionType::kNone, float>(
+      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
+      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);
+
+  // Fully connected
+  FullyConnected<FusedActivationFunctionType::kNone>(
+      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
+      bias_dims, activ_temp_data, activ_temp_dims);
+
+  // Memory state update (the LSTM "guts")
+  for (int b = 0; b < batches; ++b) {
+    for (int w = 0; w < width; ++w) {
+      for (int h = 0; h < height; ++h) {
+        for (int c = 0; c < output_depth; ++c) {
+          const float input_gate =
+              1.f /
+              (1.f + std::exp(-activ_temp_data[Offset(
+                         activ_temp_dims, 0 * output_depth + c, w, h, b)]));
+          const float new_input = std::tanh(activ_temp_data[Offset(
+              activ_temp_dims, 1 * output_depth + c, w, h, b)]);
+          const float forget_gate =
+              1.f /
+              (1.f + std::exp(-activ_temp_data[Offset(
+                         activ_temp_dims, 2 * output_depth + c, w, h, b)]));
+          const float output_gate =
+              1.f /
+              (1.f + std::exp(-activ_temp_data[Offset(
+                         activ_temp_dims, 3 * output_depth + c, w, h, b)]));
+          const float new_state =
+              input_gate * new_input +
+              forget_gate *
+                  prev_state_data[Offset(prev_state_dims, c, w, h, b)];
+          output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state;
+          output_activ_data[Offset(output_activ_dims, c, w, h, b)] =
+              output_gate * std::tanh(new_state);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac, typename Scalar>
+void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
+                     int outputs_count, Scalar* const* output_data,
+                     const Dims<4>* const* output_dims) {
+  TFLITE_DCHECK_GE(outputs_count, 1);
+  for (int i = 0; i < outputs_count; i++) {
+    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
+    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
+    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
+  }
+  const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
+  const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
+  const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
+  // for now we dont have a model with a TensorFlowSplit
+  // with fused activation function.
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        int in_c = 0;
+        for (int i = 0; i < outputs_count; ++i) {
+          const int depth = ArraySize(*output_dims[i], 0);
+          for (int c = 0; c < depth; ++c) {
+            output_data[i][Offset(*output_dims[i], c, x, y, b)] =
+                input_data[Offset(input_dims, in_c, x, y, b)];
+            in_c++;
+          }
+        }
+        TFLITE_DCHECK(in_c == ArraySize(input_dims, 0));
+      }
+    }
+  }
+}
+
+// TODO(benoitjacob) make this a proper reference impl without Eigen!
+template <typename Scalar>
+using MatrixMap = typename std::conditional<
+    std::is_const<Scalar>::value,
+    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
+                                   Eigen::Dynamic, Eigen::Dynamic>>,
+    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
+                                                const Dims<N>& dims) {
+  const int rows = dims.sizes[0];
+  int cols = 1;
+  for (int d = 1; d < N; d++) {
+    cols *= dims.sizes[d];
+  }
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+template <typename Scalar, int N>
+MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
+                                               const Dims<N>& dims) {
+  const int cols = dims.sizes[N - 1];
+  int rows = 1;
+  for (int d = 0; d < N - 1; d++) {
+    rows *= dims.sizes[d];
+  }
+  return MatrixMap<Scalar>(data, rows, cols);
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width) {
+  return (b * height + h) * width + w;
+}
+
+inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
+                        int stride_width, int stride_height, int pad_width,
+                        int pad_height, int filter_width, int filter_height,
+                        float output_activation_min,
+                        float output_activation_max, float* output_data,
+                        const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(filter_height, input_height - in_y_origin);
+          float total = 0.f;
+          float filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              total +=
+                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
+              filter_count++;
+            }
+          }
+          const float average = total / filter_count;
+          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+              ActivationFunctionWithMinMax(average, output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int filter_width, int filter_height,
+                 float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+              pad_height, filter_width, filter_height, output_activation_min,
+              output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
+                 int pad_width, int pad_height, int filter_width,
+                 int filter_height, float* output_data,
+                 const Dims<4>& output_dims) {
+  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+                  filter_width, filter_height, output_data, output_dims);
+}
+
+inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+                        int stride_width, int stride_height, int pad_width,
+                        int pad_height, int filter_width, int filter_height,
+                        int32 output_activation_min,
+                        int32 output_activation_max, uint8* output_data,
+                        const Dims<4>& output_dims) {
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(filter_height, input_height - in_y_origin);
+          int32 acc = 0;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)];
+              filter_count++;
+            }
+          }
+          acc = (acc + filter_count / 2) / filter_count;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+              static_cast<uint8>(acc);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
+                 int stride_width, int stride_height, int pad_width,
+                 int pad_height, int filter_width, int filter_height,
+                 int32 output_activation_min, int32 output_activation_max,
+                 uint8* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
+              pad_height, filter_width, filter_height, output_activation_min,
+              output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+                 int pad_width, int pad_height, int filter_width,
+                 int filter_height, int32 output_activation_min,
+                 int32 output_activation_max, uint8* output_data,
+                 const Dims<4>& output_dims) {
+  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+                  filter_width, filter_height, output_activation_min,
+                  output_activation_max, output_data, output_dims);
+}
+
+inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
+                   int stride_width, int stride_height, int pad_width,
+                   int pad_height, int filter_width, int filter_height,
+                   float output_activation_min, float output_activation_max,
+                   float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(filter_height, input_height - in_y_origin);
+          float sum_squares = 0.f;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              const float val =
+                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
+              sum_squares += val * val;
+              filter_count++;
+            }
+          }
+          const float l2pool_result = std::sqrt(sum_squares / filter_count);
+          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+              ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims,
+            int stride_width, int stride_height, int pad_width, int pad_height,
+            int filter_width, int filter_height, float* output_data,
+            const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
+         pad_height, filter_width, filter_height, output_activation_min,
+         output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
+            int pad_width, int pad_height, int filter_width, int filter_height,
+            float* output_data, const Dims<4>& output_dims) {
+  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+             filter_width, filter_height, output_data, output_dims);
+}
+
+inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
+                    int stride_width, int stride_height, int pad_width,
+                    int pad_height, int filter_width, int filter_height,
+                    float output_activation_min, float output_activation_max,
+                    float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(filter_height, input_height - in_y_origin);
+          float max = std::numeric_limits<float>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
+            }
+          }
+          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+              ActivationFunctionWithMinMax(max, output_activation_min,
+                                           output_activation_max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims,
+             int stride_width, int stride_height, int pad_width, int pad_height,
+             int filter_width, int filter_height, float* output_data,
+             const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+          pad_height, filter_width, filter_height, output_activation_min,
+          output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
+             int pad_width, int pad_height, int filter_width, int filter_height,
+             float* output_data, const Dims<4>& output_dims) {
+  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+              filter_width, filter_height, output_data, output_dims);
+}
+
+inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+                    int stride_width, int stride_height, int pad_width,
+                    int pad_height, int filter_width, int filter_height,
+                    int32 output_activation_min, int32 output_activation_max,
+                    uint8* output_data, const Dims<4>& output_dims) {
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_GE(output_activation_min, 0);
+  TFLITE_DCHECK_LE(output_activation_max, 255);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(filter_height, input_height - in_y_origin);
+          uint8 max = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
+            }
+          }
+          max = std::max<uint8>(max, output_activation_min);
+          max = std::min<uint8>(max, output_activation_max);
+          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
+              static_cast<uint8>(max);
+        }
+      }
+    }
+  }
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+             int stride_width, int stride_height, int pad_width, int pad_height,
+             int filter_width, int filter_height, int32 output_activation_min,
+             int32 output_activation_max, uint8* output_data,
+             const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
+          pad_height, filter_width, filter_height, output_activation_min,
+          output_activation_max, output_data, output_dims);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
+             int pad_width, int pad_height, int filter_width, int filter_height,
+             int32 output_activation_min, int32 output_activation_max,
+             uint8* output_data, const Dims<4>& output_dims) {
+  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
+              filter_width, filter_height, output_activation_min,
+              output_activation_max, output_data, output_dims);
+}
+
+inline void LocalResponseNormalization(const float* input_data,
+                                       const Dims<4>& input_dims, int range,
+                                       float bias, float alpha, float beta,
+                                       float* output_data,
+                                       const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const int begin_input_c = std::max(0, c - range);
+          const int end_input_c = std::min(depth, c + range);
+          float accum = 0.f;
+          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
+            const float input_val =
+                input_data[Offset(input_dims, input_c, x, y, b)];
+            accum += input_val * input_val;
+          }
+          const float multiplier = std::pow(bias + alpha * accum, -beta);
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
+        }
+      }
+    }
+  }
+}
+
+inline void Softmax(const float* input_data, const Dims<4>& input_dims,
+                    float beta, float* output_data,
+                    const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        // Find max element value which we'll use to ensure numerical stability
+        // taking advantage of the following equality:
+        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+        float max = std::numeric_limits<float>::lowest();
+        for (int c = 0; c < depth; ++c) {
+          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
+        }
+
+        // Compute sum.
+        float sum = 0.f;
+        for (int c = 0; c < depth; ++c) {
+          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
+                          beta);
+        }
+
+        // Compute result.
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
+                       beta) /
+              sum;
+        }
+      }
+    }
+  }
+}
+
+inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
+                    int32 input_beta_multiplier, int32 input_beta_left_shift,
+                    int diff_min, uint8* output_data,
+                    const Dims<4>& output_dims) {
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int b = 0; b < batches; ++b) {
+    for (int x = 0; x < width; ++x) {
+      for (int y = 0; y < height; ++y) {
+        uint8 max_in_row = 0;
+        for (int c = 0; c < depth; ++c) {
+          max_in_row =
+              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
+        }
+
+        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+        for (int c = 0; c < depth; ++c) {
+          int32 input_diff =
+              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+              max_in_row;
+          if (input_diff >= diff_min) {
+            const int32 input_diff_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_diff, input_beta_multiplier, input_beta_left_shift);
+            const FixedPointScaledDiff scaled_diff_f8 =
+                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+            sum_of_exps =
+                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                  exp_on_negative_values(scaled_diff_f8));
+          }
+        }
+
+        int32 fixed_sum_of_exps = sum_of_exps.raw();
+        int headroom_plus_one =
+            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+        // This is the number of bits to the left of the binary point above 1.0.
+        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
+        // no later adjustment will be needed.
+        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+        int32 shifted_sum_minus_one = static_cast<int32>(
+            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+            (static_cast<uint32>(1) << 31));
+
+        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+            FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+        for (int c = 0; c < depth; ++c) {
+          int32 input_diff =
+              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
+              max_in_row;
+          if (input_diff >= diff_min) {
+            const int32 input_diff_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_diff, input_beta_multiplier, input_beta_left_shift);
+            const FixedPointScaledDiff scaled_diff_f8 =
+                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
+                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+
+          } else {
+            output_data[Offset(output_dims, c, x, y, b)] = 0;
+          }
+        }
+      }
+    }
+  }
+}
+
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          float result = 1.f / (1.f + std::exp(-val));
+          output_data[Offset(output_dims, c, x, y, b)] = result;
+        }
+      }
+    }
+  }
+}
+
+inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
+                     int32 input_zero_point, int32 input_range_radius,
+                     int32 input_multiplier, int input_left_shift,
+                     uint8* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
+          const int32 input_val_centered =
+              static_cast<int32>(input_val_u8) - input_zero_point;
+          uint8 output_val;
+          if (input_val_centered <= -input_range_radius) {
+            output_val = 0;
+          } else if (input_val_centered >= input_range_radius) {
+            output_val = 255;
+          } else {
+            const int32 input_val_rescaled =
+                MultiplyByQuantizedMultiplierGreaterThanOne(
+                    input_val_centered, input_multiplier, input_left_shift);
+            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+            const FixedPoint4 input_val_f4 =
+                FixedPoint4::FromRaw(input_val_rescaled);
+            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+            using gemmlowp::RoundingDivideByPOT;
+            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+            if (output_val_s32 == 256) {
+              output_val_s32 = 255;
+            }
+            TFLITE_DCHECK_GE(output_val_s32, 0);
+            TFLITE_DCHECK_LE(output_val_s32, 255);
+            output_val = static_cast<uint8>(output_val_s32);
+          }
+          output_data[Offset(output_dims, c, x, y, b)] = output_val;
+        }
+      }
+    }
+  }
+}
+
+inline void Tanh(const float* input_data, const Dims<4>& input_dims,
+                 float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          float val = input_data[Offset(input_dims, c, x, y, b)];
+          float result = std::tanh(val);
+          output_data[Offset(output_dims, c, x, y, b)] = result;
+        }
+      }
+    }
+  }
+}
+
+inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 zero_point, double scale, float* output_data,
+                       const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          int32 val = input_data[Offset(input_dims, c, x, y, b)];
+          float result = static_cast<float>(scale * (val - zero_point));
+          output_data[Offset(output_dims, c, x, y, b)] = result;
+        }
+      }
+    }
+  }
+}
+
+inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
+                      float rmin, float rmax, float* output_data,
+                      const Dims<4>& output_dims) {
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  TFLITE_DCHECK_LE(rmin, 0.);
+  TFLITE_DCHECK_GE(rmax, 0.);
+
+  // Determine quantization parameters: zero_point, scale.
+  using Integer = uint8;
+  const Integer qmin = std::numeric_limits<Integer>::min();
+  const Integer qmax = std::numeric_limits<Integer>::max();
+  const float qmin_float = qmin;
+  const float qmax_float = qmax;
+  int32 zero_point = 0;
+  float scale = 0.f;
+  // If rmin==rmax, both must be zero per the above assertion,
+  // so we are done.
+  if (rmin != rmax) {
+    // First determine the scale.
+    scale = (rmax - rmin) / (qmax_float - qmin_float);
+
+    // Zero-point computation.
+    // First the initial floating-point computation. The zero-point can be
+    // determined from solving an affine equation for any known pair
+    // (real value, corresponding quantized value).
+    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+    // The arithmetic error on the zero point computed from either pair
+    // will be roughly machine_epsilon * (sum of absolute values of terms)
+    // so we want to use the variant that adds the smaller terms.
+    const float zero_point_from_min = qmin_float - rmin / scale;
+    const float zero_point_from_max = qmax_float - rmax / scale;
+    const float zero_point_from_min_error =
+        std::abs(qmin_float) + std::abs(rmin / scale);
+    const float zero_point_from_max_error =
+        std::abs(qmax_float) + std::abs(rmax / scale);
+
+    const float zero_point_float =
+        zero_point_from_min_error < zero_point_from_max_error
+            ? zero_point_from_min
+            : zero_point_from_max;
+
+    // Now we need to nudge the zero point to be an integer
+    // (our zero points are integer, and this is motivated by the requirement
+    // to be able to represent the real value "0" exactly as a quantized value,
+    // which is required in multiple places, for example in Im2col with SAME
+    // padding).
+    if (zero_point_float < qmin_float) {
+      zero_point = qmin;
+    } else if (zero_point_float > qmax_float) {
+      zero_point = qmax;
+    } else {
+      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
+    }
+    // The zero point should always be in the range of quantized value,
+    // [qmin, qmax].
+    TFLITE_DCHECK_GE(zero_point, qmin);
+    TFLITE_DCHECK_LE(zero_point, qmax);
+  }
+
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
+          const float unclamped_quantized_val =
+              TfLiteRound(zero_point + src_val / scale);
+          const float quantized_val = std::min(
+              qmax_float, std::max(qmin_float, unclamped_quantized_val));
+          const float dst_val = scale * (quantized_val - zero_point);
+          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
+        }
+      }
+    }
+  }
+}
+
+template <typename SrcT, typename DstT>
+inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
+                 DstT* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          int offset = Offset(input_dims, c, x, y, b);
+          output_data[offset] = static_cast<DstT>(input_data[offset]);
+        }
+      }
+    }
+  }
+}
+
+inline void Floor(const float* input_data, const Dims<4>& input_dims,
+                  float* output_data, const Dims<4>& output_dims) {
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
+  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          int offset = Offset(input_dims, c, x, y, b);
+          output_data[offset] = std::floor(input_data[offset]);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Gather(const T* input_data, const Dims<4>& input_dims,
+                   int input_rank, const int32* coords_data,
+                   const Dims<4>& coords_dims, T* output_data,
+                   const Dims<4>& output_dims) {
+  TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
+  int stride = input_dims.strides[input_rank - 1];
+  T* out = output_data;
+
+  for (int i = 0; i < coords_dims.sizes[0]; i++) {
+    TFLITE_DCHECK_GE(coords_data[i], 0);
+    TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
+    const T* in = input_data + coords_data[i] * stride;
+    memcpy(out, in, sizeof(T) * stride);
+    out += stride;
+  }
+}
+
+inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_dims) {
+  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  int32 input_height = ArraySize(input_dims, 2);
+  int32 input_width = ArraySize(input_dims, 1);
+  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
+  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
+  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+  float height_scale = static_cast<float>(input_height) / output_height;
+  float width_scale = static_cast<float>(input_width) / output_width;
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      float input_y = y * height_scale;
+      int32 y0 = static_cast<int32>(std::floor(input_y));
+      int32 y1 = std::min(y0 + 1, input_height - 1);
+      for (int x = 0; x < output_width; ++x) {
+        float input_x = x * width_scale;
+        int32 x0 = static_cast<int32>(std::floor(input_x));
+        int32 x1 = std::min(x0 + 1, input_width - 1);
+        for (int c = 0; c < depth; ++c) {
+          float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
+                                    (1 - (input_y - y0)) *
+                                    (1 - (input_x - x0)) +
+                                input_data[Offset(input_dims, c, x0, y1, b)] *
+                                    (input_y - y0) * (1 - (input_x - x0)) +
+                                input_data[Offset(input_dims, c, x1, y0, b)] *
+                                    (1 - (input_y - y0)) * (input_x - x0) +
+                                input_data[Offset(input_dims, c, x1, y1, b)] *
+                                    (input_y - y0) * (input_x - x0);
+          output_data[Offset(output_dims, c, x, y, b)] = interpolation;
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims,
+                           const int32* paddings_data,
+                           const Dims<4>& paddings_dims, T* output_data,
+                           const Dims<4>& output_dims) {
+  const int output_batch_size = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int input_batch_size = ArraySize(input_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  const int block_shape_height = block_shape_data[0];
+  const int block_shape_width = block_shape_data[1];
+  const int padding_top = paddings_data[0];
+  const int padding_left = paddings_data[2];
+
+  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
+    int input_batch = out_b % input_batch_size;
+    int shift_w = (out_b / input_batch_size) % block_shape_width;
+    int shift_h = (out_b / input_batch_size) / block_shape_width;
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      for (int out_w = 0; out_w < output_width; ++out_w) {
+        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
+        if (out_h * block_shape_height < padding_top ||
+            out_h * block_shape_height >= padding_top + input_height ||
+            out_w * block_shape_width < padding_left ||
+            out_w * block_shape_width >= padding_left + input_width) {
+          memset(out, 0, depth * sizeof(T));
+        } else {
+          const T* in =
+              input_data +
+              Offset(input_dims, 0,
+                     (out_w * block_shape_width + shift_w) - padding_left,
+                     (out_h * block_shape_height + shift_h) - padding_top,
+                     input_batch);
+          memcpy(out, in, depth * sizeof(T));
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims, T* output_data,
+                           const Dims<4>& output_dims) {
+  const int output_batch_size = ArraySize(output_dims, 3);
+  const int input_batch_size = ArraySize(input_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int depth = ArraySize(input_dims, 0);
+  const int block_shape_width = block_shape_data[1];
+  const int block_shape_height = block_shape_data[0];
+
+  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
+    for (int in_h = 0; in_h < input_height; ++in_h) {
+      for (int in_w = 0; in_w < input_width; ++in_w) {
+        int out_batch = in_batch % output_batch_size;
+        int out_w = in_w * block_shape_width +
+                    (in_batch / output_batch_size) % block_shape_width;
+        int out_h = in_h * block_shape_height +
+                    (in_batch / output_batch_size) / block_shape_width;
+        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
+        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
+        memcpy(out, in, depth * sizeof(T));
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  const int output_batch = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_depth = ArraySize(output_dims, 0);
+
+  const int left_b_padding = left_paddings[3];
+  const int left_h_padding = left_paddings[2];
+  const int left_w_padding = left_paddings[1];
+  const int left_d_padding = left_paddings[0];
+
+  const int right_b_padding = right_paddings[3];
+  const int right_h_padding = right_paddings[2];
+  const int right_w_padding = right_paddings[1];
+  const int right_d_padding = right_paddings[0];
+
+  const T* in_ptr = input_data;
+  T* out_ptr = output_data;
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_h = 0; out_h < output_height; ++out_h) {
+      for (int out_w = 0; out_w < output_width; ++out_w) {
+        for (int out_d = 0; out_d < output_depth; ++out_d) {
+          if (out_b < left_b_padding ||
+              out_b >= output_batch - right_b_padding ||
+              out_h < left_h_padding ||
+              out_h >= output_height - right_h_padding ||
+              out_w < left_w_padding ||
+              out_w >= output_width - right_w_padding ||
+              out_d < left_d_padding ||
+              out_d >= output_depth - right_d_padding) {
+            *out_ptr++ = 0;
+          } else {
+            *out_ptr++ = *in_ptr++;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
+                         int begin_mask, int end_mask,
+                         const std::vector<int>& starts,
+                         const std::vector<int>& stops,
+                         const std::vector<int>& strides, T* output_data,
+                         const Dims<4>& output_dims) {
+  const int start_b = (begin_mask & 8) ? 0 : starts[3];
+  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
+  const int start_h = (begin_mask & 4) ? 0 : starts[2];
+  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
+  const int start_w = (begin_mask & 2) ? 0 : starts[1];
+  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
+  const int start_d = (begin_mask & 1) ? 0 : starts[0];
+  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];
+
+  T* out_ptr = output_data;
+  for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
+    for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
+      for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
+        for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
+          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Slice(const T* input_data, const Dims<4>& input_dims,
+                  const std::vector<int>& begin, const std::vector<int>& size,
+                  T* output_data, const Dims<4>& output_dims) {
+  // TODO(dkalenichenko): This op only supports 4D tensors.
+  TFLITE_DCHECK_EQ(begin.size(), 4);
+  TFLITE_DCHECK_EQ(size.size(), 4);
+  const int start_b = begin[3];
+  const int stop_b =
+      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
+  const int start_h = begin[2];
+  const int stop_h =
+      size[2] == -1 ? input_dims.sizes[2] - start_b : start_b + size[2];
+  const int start_w = begin[1];
+  const int stop_w =
+      size[1] == -1 ? input_dims.sizes[1] - start_b : start_b + size[1];
+  const int start_d = begin[0];
+  const int stop_d =
+      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];
+
+  T* out_ptr = output_data;
+  for (int in_b = start_b; in_b < stop_b; ++in_b) {
+    for (int in_h = start_h; in_h < stop_h; ++in_h) {
+      for (int in_w = start_w; in_w < stop_w; ++in_w) {
+        for (int in_d = start_d; in_d < stop_d; ++in_d) {
+          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void Mean(const T* input_data, const Dims<4>& input_dims,
+                 const std::vector<int>& reduction_indices, T* output_data,
+                 const Dims<4>& output_dims) {
+  const int output_batch = ArraySize(output_dims, 3);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  const int output_depth = ArraySize(output_dims, 0);
+
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+
+  // The current implementation only supports simultaneous reduction over
+  // width and height.
+  TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
+  TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
+                (reduction_indices[0] == 2 && reduction_indices[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      float value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
+        }
+      }
+      output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
+          value / (input_width * input_height);
+    }
+  }
+}
+
+template <typename T>
+void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
+         const Dims<4>& input2_dims, T* output_data,
+         const Dims<4>& output_dims) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
+  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
+  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
+  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+
+  auto min_value = input2_data[0];
+
+  for (int b = 0; b < batches; b++) {
+    for (int y = 0; y < input_height; y++) {
+      for (int x = 0; x < input_width; x++) {
+        for (int c = 0; c < depth; c++) {
+          int offset = Offset(input1_dims, c, x, y, b);
+          output_data[offset] =
+              input1_data[offset] > min_value ? min_value : input1_data[offset];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
+                       const T* input2_data, T* output_data,
+                       const Dims<4>& output_dims) {
+  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
+  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
+  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
+  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+
+  auto max_value = input2_data[0];
+
+  for (int b = 0; b < batches; b++) {
+    for (int y = 0; y < input_height; y++) {
+      for (int x = 0; x < input_width; x++) {
+        for (int c = 0; c < depth; c++) {
+          int offset = Offset(input1_dims, c, x, y, b);
+          output_data[offset] =
+              input1_data[offset] < max_value ? max_value : input1_data[offset];
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_ops
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/round.h b/tensorflow/contrib/lite/kernels/internal/round.h
new file mode 100644
index 0000000000000000000000000000000000000000..38525b0e208b852343849096ac68cbfc9ef3e389
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/round.h
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
+
+#include <cmath>
+
+namespace tflite {
+
+// TODO(aselle): See if we can do this only on jdk. Also mikecase, check
+// if you need this for java host build.
+#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+template <class T>
+inline float TfLiteRound(const float x) {
+  return ::round(x);
+}
+inline double TfLiteRound(const double x) { return ::round(x); }
+#else
+template <class T>
+inline T TfLiteRound(const T x) {
+  return std::round(x);
+}
+#endif
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_ROUND_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee4111e0416560d94d513c528971bdf3bf819662
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
+
+#include <vector>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace tflite {
+
+template <typename T>
+inline T* GetTensorData(TfLiteTensor* tensor);
+
+template <>
+inline float* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.f : nullptr;
+}
+
+template <>
+inline uint8_t* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.uint8 : nullptr;
+}
+
+template <>
+inline int32_t* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.i32 : nullptr;
+}
+
+template <>
+inline int64_t* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? reinterpret_cast<int64_t*>(tensor->data.raw)
+                           : nullptr;
+}
+
+inline int RemapDim(int max_dimensions, int d) {
+  return max_dimensions - d - 1;
+}
+
+// TODO(ahentz): the implementations in kernels/internal/ take a Dims<4> object
+// even if the original tensors were not 4D. We should consider rewriting them
+// to take a more generic 'shape' object.
+inline Dims<4> GetTensorDims(const int data[], const int size) {
+  Dims<4> d;
+  for (int i = 0; i < 4; ++i) {
+    int src = size - i - 1;
+    if (src >= 0) {
+      d.sizes[i] = data[src];
+    } else {
+      d.sizes[i] = 1;
+    }
+  }
+  d.strides[0] = 1;
+  for (int i = 1; i < 4; i++) {
+    d.strides[i] = d.strides[i - 1] * d.sizes[i - 1];
+  }
+  return d;
+}
+
+inline Dims<4> GetTensorDims(std::vector<int32_t> data) {
+  return GetTensorDims(data.data(), data.size());
+}
+
+inline Dims<4> GetTensorDims(const TfLiteTensor* tensor) {
+  if (tensor == nullptr) {
+    return Dims<4>();
+  }
+
+  auto* dims = tensor->dims;
+  return GetTensorDims(dims->data, dims->size);
+}
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf2068d320f65cf0195abbc181f4ef4ff8f20679
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+TEST(TensorTest, GetTensorDims4D) {
+  Dims<4> d = GetTensorDims({2, 3, 4, 5});
+  EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 2));
+  EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60));
+}
+
+TEST(TensorTest, GetTensorDims3D) {
+  Dims<4> d = GetTensorDims({3, 4, 5});
+  EXPECT_THAT(d.sizes, ElementsAre(5, 4, 3, 1));
+  EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 60));
+}
+
+TEST(TensorTest, GetTensorDims2D) {
+  Dims<4> d = GetTensorDims({4, 5});
+  EXPECT_THAT(d.sizes, ElementsAre(5, 4, 1, 1));
+  EXPECT_THAT(d.strides, ElementsAre(1, 5, 20, 20));
+}
+
+TEST(TensorTest, GetTensorDims1D) {
+  Dims<4> d = GetTensorDims({5});
+  EXPECT_THAT(d.sizes, ElementsAre(5, 1, 1, 1));
+  EXPECT_THAT(d.strides, ElementsAre(1, 5, 5, 5));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..904a97803a6a9ba369c1e64c711b12d19ffc10c4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.cc
@@ -0,0 +1,27 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  //  defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  //  USE_NEON
+
+#ifdef USE_NEON
+#include "tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h"
+#else
+#include "tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h"
+#endif  // USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e69ef5982f01e364d865684652d1dfecab6fee3
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+// Limit a float input f betweeen +abs_limit and -abs_limit.
+float Clip(float f, float abs_limit);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector using a stride value provided in result_stride. 'result_stride' shows
+// how the number of elements between consecutive result values. For example
+// result_stride = 1, will cause the output to look like this:
+// [O_1, 0_2, ... O_rows] in memory, but result_stride = 3, will cause it to be
+// arranged like this in memory: [O_1, x, x, 0_2, x, x, ..., O_rows]
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result,
+                                         int result_stride);
+
+// Cwise product of two vectors.
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                              int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC opertation, the
+// assumption here is that result array is initialized to valid values.
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2, int v_size,
+                                        float* result);
+
+// Dot product of two vectors.
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+                             int v_size);
+
+// Dot product of two batch vectors of size n_batch * v_size:
+// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
+//            x_2_1, x_2_2, ..., x_2_vsize,
+//            ...
+//            x_nbatch_1,..., x_nbatch_vsize]
+// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
+//            y_2_1, y_2_2, ..., y_2_vsize,
+//            ...
+//            y_nbatch_1,..., y_nbatch_vsize]
+// Then result will be a vector of n_batch size which will be saved with a
+// stride of result_stride in memory starting from 'result':
+// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
+//  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
+//  ...
+//  x_nbatch_1 * y_nbatch_1 + ... + x_nbatch_vsize * y_nbatch_vsize]
+void BatchVectorBatchVectorDotProduct(const float* vector1,
+                                      const float* vector2, int v_size,
+                                      int n_batch, float* result,
+                                      int result_stride);
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size,
+                                             const float* batch_vector,
+                                             int n_batch, float* result);
+
+// Batch vector initialization with another vector.
+void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch,
+                             float* batch_vector);
+
+// Apply sigmoid to elements of a vector.
+void ApplySigmoidToVector(const float* vector, int v_size, float* result);
+
+// Apply activation function to elements of a vector.
+void ApplyActivationToVector(const float* vector, int v_size,
+                             TfLiteFusedActivation activation, float* result);
+
+// Copy vector to another vector.
+void CopyVector(const float* vector, int v_size, float* result);
+
+// Compute "1.0f - elements of vector" (used in CIFG).
+void Sub1Vector(const float* vector, int v_size, float* result);
+
+// Fill vector with 0.f.
+void ZeroVector(float* vector, int v_size);
+
+// Clip elements of a vector using a abs_limit value.
+void ClipVector(const float* vector, int v_size, float abs_limit,
+                float* result);
+
+// Shift left a vector in place with v_size size.
+void VectorShiftLeft(float* vector, int v_size, float shift_value);
+
+// Reduce-sum on a float input vector:
+// input_vector: float pointer to input vector.
+// output_vector: float pointer to vector.
+// output_size: output vector size.
+// reduction_size: number of consecutive elements from input vector which are
+// added to get one element of output.
+void ReductionSumVector(const float* input_vector, float* output_vector,
+                        int output_size, int reduction_size);
+}  // namespace tensor_utils
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..588f1a428b8c84367d659c2c5bb59a411cd8bb34
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils_test.cc
@@ -0,0 +1,192 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include <gmock/gmock.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+
+namespace tflite {
+namespace tensor_utils {
+
+TEST(uKernels, ClipTest) {
+  constexpr int kVectorSize = 10;
+  constexpr float kAbsLimit = 2.0;
+  static float input[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
+                                     -2.5, 3.0,  -3.5, 4.0,  -4.5};
+  std::vector<float> output(kVectorSize);
+  ClipVector(input, kVectorSize, kAbsLimit, output.data());
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear(
+                  {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0})));
+}
+
+TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) {
+  constexpr int kRow = 3;
+  constexpr int kCol = 4;
+  constexpr int kBatch = 2;
+  static float matrix[kRow * kCol] = {1.0,  2.0,  3.0,  4.0,   //
+                                      -1.0, -2.0, -3.0, -4.0,  //
+                                      1.0,  -2.0, 3.0,  -4.0};
+  static float vector[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0,  //
+                                        2.0, -2.0, 2.0, -2.0};
+  std::vector<float> output(kRow * kBatch);
+  std::fill(output.begin(), output.end(), 3.0);
+  MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+                                      output.data(), /*result_stride=*/1);
+  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({1., 5., 13.,  //
+                                                       -1., 7., 23.})));
+
+  std::vector<float> output_with_stride2(kRow * kBatch * 2);
+  std::fill(output_with_stride2.begin(), output_with_stride2.end(), 3.0);
+  MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+                                      output_with_stride2.data(),
+                                      /*result_stride=*/2);
+  EXPECT_THAT(output_with_stride2,
+              ElementsAreArray(ArrayFloatNear({1., 3., 5., 3., 13., 3.,  //
+                                               -1., 3., 7., 3., 23., 3.})));
+}
+
+TEST(uKernels, VectorVectorCwiseProductTest) {
+  constexpr int kVectorSize = 10;
+  static float input1[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
+                                      -2.5, 3.0,  -3.5, 4.0,  -4.5};
+  static float input2[kVectorSize] = {0.1,  -0.1, 0.1,  -0.1, 0.1,
+                                      -0.1, 0.1,  -0.1, 0.1,  -0.1};
+  std::vector<float> output(kVectorSize);
+  VectorVectorCwiseProduct(input1, input2, kVectorSize, output.data());
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear(
+                  {0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45})));
+}
+
+TEST(uKernels, VectorVectorCwiseProductAccumulateTest) {
+  constexpr int kVectorSize = 10;
+  static float input1[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
+                                      -2.5, 3.0,  -3.5, 4.0,  -4.5};
+  static float input2[kVectorSize] = {0.1,  -0.1, 0.1,  -0.1, 0.1,
+                                      -0.1, 0.1,  -0.1, 0.1,  -0.1};
+  std::vector<float> output(kVectorSize);
+  std::fill(output.begin(), output.end(), 1.0);
+  VectorVectorCwiseProductAccumulate(input1, input2, kVectorSize,
+                                     output.data());
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear(
+                  {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45})));
+}
+
+TEST(uKernels, VectorBatchVectorAssignTest) {
+  constexpr int kVectorSize = 5;
+  constexpr int kBatchSize = 3;
+  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+  std::vector<float> output(kVectorSize * kBatchSize);
+  VectorBatchVectorAssign(input, kVectorSize, kBatchSize, output.data());
+  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
+                          {0.0, -0.5, 1.0, -1.5, 2.0, 0.0, -0.5, 1.0, -1.5, 2.0,
+                           0.0, -0.5, 1.0, -1.5, 2.0})));
+}
+
+TEST(uKernels, ApplySigmoidToVectorTest) {
+  constexpr int kVectorSize = 5;
+  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+  std::vector<float> output(kVectorSize);
+  ApplySigmoidToVector(input, kVectorSize, output.data());
+  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
+                          {0.5, 0.377541, 0.731059, 0.182426, 0.880797})));
+}
+
+TEST(uKernels, ApplyActivationToVectorTest) {
+  constexpr int kVectorSize = 5;
+  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+  std::vector<float> output(kVectorSize);
+  ApplyActivationToVector(input, kVectorSize, kTfLiteActRelu, output.data());
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear({0.0, 0.0, 1.0, 0.0, 2.0})));
+
+  ApplyActivationToVector(input, kVectorSize, kTfLiteActTanh, output.data());
+  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear(
+                          {0.0, -0.462117, 0.761594, -0.905148, 0.964028})));
+}
+
+TEST(uKernels, CopyVectorTest) {
+  constexpr int kVectorSize = 5;
+  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+  std::vector<float> output(kVectorSize);
+  CopyVector(input, kVectorSize, output.data());
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear({0.0, -0.5, 1.0, -1.5, 2.0})));
+}
+
+TEST(uKernels, Sub1VectorTest) {
+  constexpr int kVectorSize = 5;
+  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+  std::vector<float> output(kVectorSize);
+  Sub1Vector(input, kVectorSize, output.data());
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear({1.0, 1.5, 0.0, 2.5, -1.0})));
+}
+
+TEST(uKernels, ZeroVectorTest) {
+  constexpr int kVectorSize = 5;
+  std::vector<float> output(kVectorSize);
+  ZeroVector(output.data(), kVectorSize);
+  EXPECT_THAT(output,
+              ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0})));
+}
+
+TEST(uKernels, BatchVectorBatchVectorDotProductTest) {
+  constexpr int kVectorSize = 5;
+  constexpr int kBatch = 2;
+  static float input1[kVectorSize * kBatch] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
+                                               -2.5, 3.0,  -3.5, 4.0,  -4.5};
+  static float input2[kVectorSize * kBatch] = {0.1,  -0.1, 0.1,  -0.1, 0.1,
+                                               -0.1, 0.1,  -0.1, 0.1,  -0.1};
+  std::vector<float> output(kBatch);
+  BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch,
+                                   output.data(), /*result_stride=*/1);
+  EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({0.5, 1.75})));
+}
+
+TEST(uKernels, VectorShiftLeftTest) {
+  constexpr int kVectorSize = 5;
+  static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0};
+  std::vector<float> result(kVectorSize);
+  VectorShiftLeft(input, kVectorSize, 3.0);
+  result.assign(input, input + kVectorSize);
+  EXPECT_THAT(result,
+              ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0})));
+}
+
+TEST(uKernels, ReductionSumVectorTest) {
+  constexpr int kInputVectorSize = 10;
+  constexpr int kOutputVectorSize1 = 5;
+  constexpr int kReductionSize1 = 2;
+  static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0,
+                                          0.0, -0.5, 1.0, 1.0,  2.0};
+  std::vector<float> result1(kOutputVectorSize1);
+  ReductionSumVector(input, result1.data(), kOutputVectorSize1,
+                     kReductionSize1);
+  EXPECT_THAT(result1,
+              ElementsAreArray(ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0})));
+
+  constexpr int kOutputVectorSize2 = 2;
+  constexpr int kReductionSize2 = 5;
+  std::vector<float> result2(kOutputVectorSize2);
+  ReductionSumVector(input, result2.data(), kOutputVectorSize2,
+                     kReductionSize2);
+  EXPECT_THAT(result2, ElementsAreArray(ArrayFloatNear({1.0, 3.5})));
+}
+
+}  // namespace tensor_utils
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..07f1cb40045fff3ae47ed4efa6ec43b0cb88a0a7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -0,0 +1,81 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+
+namespace tflite {
+
+enum class FusedActivationFunctionType : uint8 { kNone, kRelu6, kRelu1, kRelu };
+
+template <int N>
+struct Dims {
+  int sizes[N];
+  int strides[N];
+};
+
+inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < dims.sizes[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < dims.sizes[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < dims.sizes[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < dims.sizes[3]);
+  return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] +
+         i3 * dims.strides[3];
+}
+
+// Get array size, DCHECKing that the dim index is in range.
+template <int N>
+int ArraySize(const Dims<N>& array, int index) {
+  TFLITE_DCHECK(index >= 0 && index < N);
+  return array.sizes[index];
+}
+
+// Get common array size, DCHECKing that they all agree.
+template <typename ArrayType1, typename ArrayType2>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+                      const ArrayType2& array2, int index2) {
+  TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+  return ArraySize(array1, index1);
+}
+
+template <typename ArrayType1, typename ArrayType2, typename... Args>
+int MatchingArraySize(const ArrayType1& array1, int index1,
+                      const ArrayType2& array2, int index2, Args... args) {
+  TFLITE_DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2));
+  return MatchingArraySize(array1, index1, args...);
+}
+
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  int max_offset = 0;
+  for (int i = 0; i < 4; i++) {
+    max_offset += (dims.sizes[i] - 1) * dims.strides[i];
+  }
+  return max_offset + 1;
+}
+
+template <int N>
+bool IsPackedWithoutStrides(const Dims<N>& dims) {
+  int expected_stride = 1;
+  for (int d = 0; d < N; d++) {
+    if (dims.strides[d] != expected_stride) return false;
+    expected_stride *= dims.sizes[d];
+  }
+  return true;
+}
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.cc b/tensorflow/contrib/lite/kernels/kernel_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0546c00cf977af5f722a802866448b0cb293b8d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util.cc
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include <algorithm>
+#include <cmath>
+#include "tensorflow/contrib/lite/kernels/internal/round.h"
+
+namespace tflite {
+
+TfLiteStatus GetQuantizedConvolutionMultipler(
+    TfLiteContext* context, TfLiteTensor* input, TfLiteTensor* filter,
+    TfLiteTensor* bias, TfLiteTensor* output, double* multiplier) {
+  const double input_product_scale = input->params.scale * filter->params.scale;
+  const double bias_scale = bias->params.scale;
+  const double output_scale = output->params.scale;
+
+  // TODO(ahentz): The following conditions must be guaranteed by the training
+  // pipeline.
+  TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <=
+                              1e-6 * std::min(input_product_scale, bias_scale));
+  TF_LITE_ENSURE(context, input_product_scale >= 0);
+  TF_LITE_ENSURE(context, input_product_scale < output_scale);
+
+  *multiplier = input_product_scale / output_scale;
+
+  return kTfLiteOk;
+}
+
+void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
+                                   TfLiteTensor* output, int32_t* act_min,
+                                   int32_t* act_max) {
+  const int32_t qmin = std::numeric_limits<uint8_t>::min();
+  const int32_t qmax = std::numeric_limits<uint8_t>::max();
+
+  const auto scale = output->params.scale;
+  const auto zero_point = output->params.zero_point;
+
+  auto quantize = [scale, zero_point](float f) {
+    return zero_point + static_cast<int32_t>(TfLiteRound(f / scale));
+  };
+
+  if (activation == kTfLiteActRelu) {
+    *act_min = std::max(qmin, quantize(0.0));
+    *act_max = qmax;
+  } else if (activation == kTfLiteActRelu6) {
+    *act_min = std::max(qmin, quantize(0.0));
+    *act_max = std::min(qmax, quantize(6.0));
+  } else if (activation == kTfLiteActRelu1) {
+    *act_min = std::max(qmin, quantize(-1.0));
+    *act_max = std::min(qmax, quantize(1.0));
+  } else {
+    *act_min = qmin;
+    *act_max = qmax;
+  }
+}
+
+void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
+                                   float* activation_min,
+                                   float* activation_max) {
+  if (activation == kTfLiteActRelu) {
+    *activation_min = 0.f;
+    *activation_max = std::numeric_limits<float>::max();
+  } else if (activation == kTfLiteActRelu6) {
+    *activation_min = 0.f;
+    *activation_max = 6.f;
+  } else if (activation == kTfLiteActRelu1) {
+    *activation_min = -1.f;
+    *activation_max = 1.f;
+  } else {
+    *activation_min = std::numeric_limits<float>::lowest();
+    *activation_max = std::numeric_limits<float>::max();
+  }
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/kernel_util.h b/tensorflow/contrib/lite/kernels/kernel_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..25556ae4567aca45b3bfe4ba02b1cb58331d239d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/kernel_util.h
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; }
+inline int SizeOfDimension(const TfLiteTensor* t, int dim) {
+  return t->dims->data[dim];
+}
+inline TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node,
+                              int index) {
+  return &context->tensors[node->inputs->data[index]];
+}
+inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node,
+                               int index) {
+  return &context->tensors[node->outputs->data[index]];
+}
+inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; }
+inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; }
+
+inline TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context,
+                                            const TfLiteNode* node, int index) {
+  const bool use_tensor = node->inputs->data[index] != kOptionalTensor;
+  if (use_tensor) {
+    return &context->tensors[node->inputs->data[index]];
+  }
+  return nullptr;
+}
+
+// Calculates the multiplication factor for a quantized convolution (or
+// quantized depthwise convolution) involving the given tensors. Returns an
+// error if the scales of the tensors are not compatible.
+TfLiteStatus GetQuantizedConvolutionMultipler(
+    TfLiteContext* context, TfLiteTensor* input, TfLiteTensor* filter,
+    TfLiteTensor* bias, TfLiteTensor* output, double* multiplier);
+
+// Calculates the useful range of an activation layer given its activation
+// tensor.
+void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
+                                   TfLiteTensor* output, int32_t* act_min,
+                                   int32_t* act_max);
+void CalculateActivationRangeFloat(TfLiteFusedActivation activation,
+                                   float* activation_min,
+                                   float* activation_max);
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_KERNEL_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f43aa372b6398a38e57dd38f3d7c7db2bd3aefc1
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -0,0 +1,112 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace l2norm {
+
+// This file has two implementation of L2Norm.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+  // TODO(ahentz): Our current implementations only support float32.
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  // TODO(ahentz): For some reason our implementations don't support
+  // activations.
+  TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  output_size->data[1] = input->dims->data[1];
+  output_size->data[2] = input->dims->data[2];
+  output_size->data[3] = input->dims->data[3];
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+#define TF_LITE_L2NORM(type)                                 \
+  type::L2Normalization<FusedActivationFunctionType::kNone>( \
+      GetTensorData<float>(input), GetTensorDims(input),     \
+      GetTensorData<float>(output), GetTensorDims(output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_L2NORM(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_L2NORM(optimized_ops);
+    }
+#undef TF_LITE_L2NORM
+  } else {
+    context->ReportError(context, "Inputs and outputs not all float types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace l2norm
+
+TfLiteRegistration* Register_L2NORM_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, l2norm::Prepare,
+                                 l2norm::Eval<l2norm::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_L2NORM_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, l2norm::Prepare,
+                                 l2norm::Eval<l2norm::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_L2_NORMALIZATION() {
+  return Register_L2NORM_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30e103f3303484c339ef98e6a68e0438291c102f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class L2NormOpModel : public SingleOpModel {
+ public:
+  L2NormOpModel(std::initializer_list<int> input_shape,
+                ActivationFunctionType activation_type) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
+                 CreateL2NormOptions(builder_, activation_type).Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(L2NormOpTest, SimpleTest) {
+  L2NormOpModel m({1, 1, 1, 6}, ActivationFunctionType_NONE);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm.cc b/tensorflow/contrib/lite/kernels/local_response_norm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1c70d0dfa0050dee3815aa15f5d16d2e7ddc721
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/local_response_norm.cc
@@ -0,0 +1,109 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace local_response_norm {
+
+// This file has two implementation of LocalResponseNorm.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  output_size->data[1] = input->dims->data[1];
+  output_size->data[2] = input->dims->data[2];
+  output_size->data[3] = input->dims->data[3];
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteLocalResponseNormParams*>(node->builtin_data);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+#define TF_LITE_LOCAL_RESPONSE_NORM(type)                                      \
+  type::LocalResponseNormalization(                                            \
+      GetTensorData<float>(input), GetTensorDims(input), params->radius,       \
+      params->bias, params->alpha, params->beta, GetTensorData<float>(output), \
+      GetTensorDims(output))
+    if (kernel_type == kReference) {
+      TF_LITE_LOCAL_RESPONSE_NORM(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_LOCAL_RESPONSE_NORM(optimized_ops);
+    }
+#undef TF_LITE_LOCAL_RESPONSE_NORM
+  } else {
+    context->ReportError(context, "Inputs and outputs not all float types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace local_response_norm
+
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORM_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, local_response_norm::Prepare,
+      local_response_norm::Eval<local_response_norm::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORM_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, local_response_norm::Prepare,
+      local_response_norm::Eval<local_response_norm::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION() {
+  return Register_LOCAL_RESPONSE_NORM_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/local_response_norm_test.cc b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d75ce258a04c820d8f82735988c01d0154ef36f2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/local_response_norm_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LocalResponseNormOpModel : public SingleOpModel {
+ public:
+  LocalResponseNormOpModel(std::initializer_list<int> input_shape, int radius,
+                           float bias, float alpha, float beta) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+                 BuiltinOptions_LocalResponseNormalizationOptions,
+                 CreateLocalResponseNormalizationOptions(builder_, radius, bias,
+                                                         alpha, beta)
+                     .Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(LocalResponseNormOpTest, SameAsL2Norm) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+                             /*alpha=*/1.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  // The result is every input divided by 2.
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05})));
+}
+
+TEST(LocalResponseNormOpTest, WithAlpha) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+                             /*alpha=*/4.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  // The result is every input divided by 3.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {-0.275, 0.15, 0.175, 0.3, -0.175, 0.025})));
+}
+
+TEST(LocalResponseNormOpTest, WithBias) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/9.0,
+                             /*alpha=*/4.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  // The result is every input divided by 5.
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02})));
+}
+
+TEST(LocalResponseNormOpTest, SmallRadius) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/2, /*bias=*/9.0,
+                             /*alpha=*/4.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266})));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f73b56ed9790b216adc788490faebaabd2bc756
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -0,0 +1,204 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// LSH Projection projects an input to a bit vector via locality senstive
+// hashing.
+//
+// Options:
+//   Sparse:
+//     Computed bit vector is considered to be sparse.
+//     Each output element is an int32 made up by multiple bits computed from
+// hash functions.
+//
+//   Dense:
+//     Computed bit vector is considered to be dense. Each output element is
+// either 0 or 1 that represents a bit.
+//
+// Input:
+//   Tensor[0]: Hash functions. Dim.size == 2, DataType: Float.
+//              Tensor[0].Dim[0]: Num of hash functions.
+//              Tensor[0].Dim[1]: Num of projected output bits generated by
+//                                each hash function.
+//   In sparse case, Tensor[0].Dim[1] + ceil( log2(Tensor[0].Dim[0] )) <= 32.
+//
+//   Tensor[1]: Input. Dim.size >= 1, No restriction on DataType.
+//   Tensor[2]: Optional, Weight. Dim.size == 1, DataType: Float.
+//              If not set, each element of input is considered to have same
+// weight of 1.0 Tensor[1].Dim[0] == Tensor[2].Dim[0]
+//
+// Output:
+//   Sparse:
+//     Output.Dim == { Tensor[0].Dim[0] }
+//     A tensor of int32 that represents hash signatures,
+//
+//     NOTE: To avoid collisions across hash functions, an offset value of
+//     k * (1 << Tensor[0].Dim[1]) will be added to each signature,
+//     k is the index of the hash function.
+//   Dense:
+//     Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
+//     A flattened tensor represents projected bit vectors.
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <memory>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include <farmhash.h>
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lsh_projection {
+
+TfLiteStatus Resize(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data);
+  TF_LITE_ENSURE(context, NumInputs(node) == 2 || NumInputs(node) == 3);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* hash = GetInput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(hash), 2);
+  // Support up to 32 bits.
+  TF_LITE_ENSURE(context, SizeOfDimension(hash, 1) <= 32);
+
+  TfLiteTensor* input = GetInput(context, node, 1);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+
+  if (NumInputs(node) == 3) {
+    TfLiteTensor* weight = GetInput(context, node, 2);
+    TF_LITE_ENSURE_EQ(context, NumDimensions(weight), 1);
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(weight, 0),
+                      SizeOfDimension(input, 0));
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteIntArray* outputSize = TfLiteIntArrayCreate(1);
+  switch (params->type) {
+    case kTfLiteLshProjectionSparse:
+      outputSize->data[0] = SizeOfDimension(hash, 0);
+      break;
+    case kTfLiteLshProjectionDense:
+      outputSize->data[0] = SizeOfDimension(hash, 0) * SizeOfDimension(hash, 1);
+      break;
+    default:
+      return kTfLiteError;
+  }
+  return context->ResizeTensor(context, output, outputSize);
+}
+
+// Compute sign bit of dot product of hash(seed, input) and weight.
+// NOTE: use float as seed, and convert it to double as a temporary solution
+//       to match the trained model. This is going to be changed once the new
+//       model is trained in an optimized method.
+//
+int RunningSignBit(const TfLiteTensor* input, const TfLiteTensor* weight,
+                   float seed) {
+  double score = 0.0;
+  int input_item_bytes = input->bytes / SizeOfDimension(input, 0);
+  char* input_ptr = input->data.raw;
+
+  const size_t seed_size = sizeof(float);
+  const size_t key_bytes = sizeof(float) + input_item_bytes;
+  std::unique_ptr<char[]> key(new char[key_bytes]);
+
+  for (int i = 0; i < SizeOfDimension(input, 0); ++i) {
+    // Create running hash id and value for current dimension.
+    memcpy(key.get(), &seed, seed_size);
+    memcpy(key.get() + seed_size, input_ptr, input_item_bytes);
+
+    int64_t hash_signature = ::util::Fingerprint64(key.get(), key_bytes);
+    double running_value = static_cast<double>(hash_signature);
+    input_ptr += input_item_bytes;
+    if (weight == nullptr) {
+      score += running_value;
+    } else {
+      score += weight->data.f[i] * running_value;
+    }
+  }
+
+  return (score > 0) ? 1 : 0;
+}
+
+void SparseLshProjection(const TfLiteTensor* hash, const TfLiteTensor* input,
+                         const TfLiteTensor* weight, int32_t* out_buf) {
+  int num_hash = SizeOfDimension(hash, 0);
+  int num_bits = SizeOfDimension(hash, 1);
+  for (int i = 0; i < num_hash; i++) {
+    int32_t hash_signature = 0;
+    for (int j = 0; j < num_bits; j++) {
+      float seed = hash->data.f[i * num_bits + j];
+      int bit = RunningSignBit(input, weight, seed);
+      hash_signature = (hash_signature << 1) | bit;
+    }
+    *out_buf++ = hash_signature + i * (1 << num_bits);
+  }
+}
+
+void DenseLshProjection(const TfLiteTensor* hash, const TfLiteTensor* input,
+                        const TfLiteTensor* weight, int32_t* out_buf) {
+  int num_hash = SizeOfDimension(hash, 0);
+  int num_bits = SizeOfDimension(hash, 1);
+  for (int i = 0; i < num_hash; i++) {
+    for (int j = 0; j < num_bits; j++) {
+      float seed = hash->data.f[i * num_bits + j];
+      int bit = RunningSignBit(input, weight, seed);
+      *out_buf++ = bit;
+    }
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data);
+
+  int32_t* out_buf = GetOutput(context, node, 0)->data.i32;
+  TfLiteTensor* hash = GetInput(context, node, 0);
+  TfLiteTensor* input = GetInput(context, node, 1);
+  TfLiteTensor* weight =
+      NumInputs(node) == 2 ? nullptr : GetInput(context, node, 2);
+
+  switch (params->type) {
+    case kTfLiteLshProjectionDense:
+      DenseLshProjection(hash, input, weight, out_buf);
+      break;
+    case kTfLiteLshProjectionSparse:
+      SparseLshProjection(hash, input, weight, out_buf);
+      break;
+    default:
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace lsh_projection
+
+TfLiteRegistration* Register_LSH_PROJECTION() {
+  static TfLiteRegistration r = {nullptr, nullptr, lsh_projection::Resize,
+                                 lsh_projection::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection_test.cc b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..414d728dfc153058ec878d3c766f58e86815cd3f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lsh_projection_test.cc
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+class LSHProjectionOpModel : public SingleOpModel {
+ public:
+  LSHProjectionOpModel(LSHProjectionType type,
+                       std::initializer_list<int> hash_shape,
+                       std::initializer_list<int> input_shape,
+                       std::initializer_list<int> weight_shape) {
+    hash_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(TensorType_INT32);
+    if (weight_shape.size() > 0) {
+      weight_ = AddInput(TensorType_FLOAT32);
+    }
+    output_ = AddOutput(TensorType_INT32);
+
+    SetBuiltinOp(BuiltinOperator_LSH_PROJECTION,
+                 BuiltinOptions_LSHProjectionOptions,
+                 CreateLSHProjectionOptions(builder_, type).Union());
+    if (weight_shape.size() > 0) {
+      BuildInterpreter({hash_shape, input_shape, weight_shape});
+    } else {
+      BuildInterpreter({hash_shape, input_shape});
+    }
+
+    output_size_ = 1;
+    for (int i : hash_shape) {
+      output_size_ *= i;
+      if (type == LSHProjectionType_SPARSE) {
+        break;
+      }
+    }
+  }
+  void SetInput(std::initializer_list<int> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetHash(std::initializer_list<float> data) {
+    PopulateTensor(hash_, data);
+  }
+
+  void SetWeight(std::initializer_list<float> f) { PopulateTensor(weight_, f); }
+
+  std::vector<int> GetOutput() { return ExtractVector<int>(output_); }
+
+ private:
+  int input_;
+  int hash_;
+  int weight_;
+  int output_;
+
+  int output_size_;
+};
+
+TEST(LSHProjectionOpTest2, Dense1DInputs) {
+  LSHProjectionOpModel m(LSHProjectionType_DENSE, {3, 2}, {5}, {5});
+
+  m.SetInput({12345, 54321, 67890, 9876, -12345678});
+  m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+  m.SetWeight({1.0, 1.0, 1.0, 1.0, 1.0});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(0, 0, 0, 1, 0, 0));
+}
+
+TEST(LSHProjectionOpTest2, Sparse1DInputs) {
+  LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5}, {});
+
+  m.SetInput({12345, 54321, 67890, 9876, -12345678});
+  m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 0, 4 + 1, 8 + 0));
+}
+
+TEST(LSHProjectionOpTest2, Sparse3DInputs) {
+  LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5, 2, 2}, {5});
+
+  m.SetInput({1234, 2345, 3456, 1234, 4567, 5678, 6789, 4567, 7891, 8912,
+              9123, 7890, -987, -876, -765, -987, -543, -432, -321, -543});
+  m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+  m.SetWeight({0.12, 0.34, 0.56, 0.67, 0.78});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 2, 4 + 1, 8 + 1));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c06264d845c24e71647b6fd2374734be32383ef
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -0,0 +1,515 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace lstm {
+
+// Input Tensors of size {n_batch, n_input}
+constexpr int kInputTensor = 0;
+
+// Input weight tensors of size: {n_cell, n_input}
+constexpr int kInputToInputWeightsTensor = 1;  // Optional
+constexpr int kInputToForgetWeightsTensor = 2;
+constexpr int kInputToCellWeightsTensor = 3;
+constexpr int kInputToOutputWeightsTensor = 4;
+
+// Recurrent weight tensors of size {n_cell, n_output}
+constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
+constexpr int kRecurrentToForgetWeightsTensor = 6;
+constexpr int kRecurrentToCellWeightsTensor = 7;
+constexpr int kRecurrentToOutputWeightsTensor = 8;
+
+// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
+constexpr int kCellToInputWeightsTensor = 9;    // Optional
+constexpr int kCellToForgetWeightsTensor = 10;  // Optional
+constexpr int kCellToOutputWeightsTensor = 11;  // Optional
+
+// Gates bias tensors of size {n_cell}
+constexpr int kInputGateBiasTensor = 12;  // Optional
+constexpr int kForgetGateBiasTensor = 13;
+constexpr int kCellGateBiasTensor = 14;
+constexpr int kOutputGateBiasTensor = 15;
+
+// Projection weight tensor of size {n_output, n_cell}
+constexpr int kProjectionWeightsTensor = 16;  // Optional
+// Projection bias tensor of size {n_output}
+constexpr int kProjectionBiasTensor = 17;  // Optional
+
+// Output tensors.
+constexpr int kScratchBufferTensor = 0;
+constexpr int kOutputStateTensor = 1;
+constexpr int kCellStateTensor = 2;
+constexpr int kOutputTensor = 3;
+
+// Check that input tensor dimensions matches with each other.
+TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
+                                        TfLiteNode* node, int n_input,
+                                        int n_output, int n_cell) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+
+  // Making sure clipping parameters have valid values.
+  // == 0 means no clipping
+  //  > 0 means clipping
+  TF_LITE_ENSURE(context, params->cell_clip >= 0);
+  TF_LITE_ENSURE(context, params->proj_clip >= 0);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  if (input_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
+    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
+  }
+
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
+
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  if (recurrent_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
+                      n_cell);
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
+                      n_output);
+  }
+
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
+                    n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
+                    n_output);
+
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
+                    n_output);
+
+  // We make sure the input-gate's parameters are either both present (regular
+  // LSTM) or not at all (CIFG-LSTM).
+  const bool cifg_weights_all_or_none =
+      ((input_to_input_weights != nullptr) &&
+       (recurrent_to_input_weights != nullptr)) ||
+      ((input_to_input_weights == nullptr) &&
+       (recurrent_to_input_weights == nullptr));
+  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  if (cell_to_input_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  if (cell_to_forget_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+  if (cell_to_output_weights) {
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
+  }
+
+  // Making sure the peephole weights are there all or none.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool peephole_weights_all_or_none =
+      ((cell_to_input_weights != nullptr || use_cifg) &&
+       (cell_to_forget_weights != nullptr) &&
+       (cell_to_output_weights != nullptr)) ||
+      ((cell_to_input_weights == nullptr) &&
+       (cell_to_forget_weights == nullptr) &&
+       (cell_to_output_weights == nullptr));
+  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
+
+  // Make sure the input gate bias is present only when not a CIFG-LSTM.
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
+  } else {
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
+  }
+
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  if (projection_weights) {
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
+    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
+  }
+
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+  if (projection_bias) {
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
+  }
+
+  // Making sure the projection tensors are consistent:
+  // 1) If projection weight is not present, then projection bias should not be
+  // present.
+  // 2) If projection weight is present, then projection bias is optional.
+  // TODO(ghodrat): make sure this is correct.
+  const bool projecton_tensors_consistent =
+      ((projection_weights != nullptr) || (projection_bias == nullptr));
+  TF_LITE_ENSURE(context, projecton_tensors_consistent == true);
+
+  return kTfLiteOk;
+}
+
+// Resize the output, state and scratch tensors based on the sizes of the input
+// tensors. Also check that the size of the input tensors match each other.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 18);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 4);
+
+  // Inferring batch size, number of outputs and number of cells from the
+  // input tensors.
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE(context, input->dims->size > 1);
+  const int n_batch = input->dims->data[0];
+  const int n_input = input->dims->data[1];
+
+  TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+  const int n_cell = input_to_output_weights->dims->data[0];
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
+
+  TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
+                    n_cell);
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Check that input tensor dimensions matches with each other.
+  CheckInputTensorDimensions(context, node, n_input, n_output, n_cell);
+
+  // Get the pointer to output, state and scratch buffer tensors.
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  // TODO(ghodrat): Modify this as soon as we have a finalized method for
+  // scratch buffers.
+  TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+
+  // Resize the output and output_state tensors.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
+  output_size->data[0] = n_batch;
+  output_size->data[1] = n_output;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size));
+
+  TfLiteIntArray* output_state_size = TfLiteIntArrayCreate(2);
+  output_state_size->data[0] = n_batch;
+  output_state_size->data[1] = n_output;
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, output_state, output_state_size));
+
+  // Resize the output, state and scratch buffer tensors.
+  TfLiteIntArray* cell_size = TfLiteIntArrayCreate(2);
+  cell_size->data[0] = n_batch;
+  cell_size->data[1] = n_cell;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, cell_state, cell_size));
+
+  // Mark state tensors as persistent tensors.
+  output_state->allocation_type = kTfLiteArenaRwPersistent;
+  cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (use_cifg) {
+    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+    scratch_buffer_size->data[0] = n_batch;
+    // Reserving space for Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 3;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                     scratch_buffer_size));
+  } else {
+    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+    scratch_buffer_size->data[0] = n_batch;
+    // Reserving space for Input, Cell, Forget, Output gates
+    scratch_buffer_size->data[1] = n_cell * 4;
+    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                     scratch_buffer_size));
+  }
+  return kTfLiteOk;
+}
+
+// The LSTM Op engine.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+
+  TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+
+  TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+
+  TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+
+  TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+
+  TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  const int n_batch = input->dims->data[0];
+  const int n_input = input->dims->data[1];
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existense of only one to the get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  // Index the scratch buffers pointers to the global scratch buffer.
+  TfLiteTensor* scratch_buffer = GetOutput(context, node, kScratchBufferTensor);
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias->data.f, n_cell,
+                                          n_batch, input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias->data.f, n_cell,
+                                        n_batch, forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias->data.f, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias->data.f, n_cell,
+                                        n_batch, output_gate_scratch);
+
+  // For each batch and cell: compute input_weight * input.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_input_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+        input_gate_scratch, /*result_stride=*/1);
+  }
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_forget_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+      forget_gate_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_cell_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+      cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      input_to_output_weights->data.f, n_cell, n_input, input->data.f, n_batch,
+      output_gate_scratch, /*result_stride=*/1);
+
+  // For each batch and cell: compute recurrent_weight * output_state.
+  if (!use_cifg) {
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_input_weights->data.f, n_cell, n_output,
+        output_state->data.f, n_batch, input_gate_scratch, /*result_stride=*/1);
+  }
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_forget_weights->data.f, n_cell, n_output,
+      output_state->data.f, n_batch, forget_gate_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_cell_weights->data.f, n_cell, n_output, output_state->data.f,
+      n_batch, cell_scratch, /*result_stride=*/1);
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      recurrent_to_output_weights->data.f, n_cell, n_output,
+      output_state->data.f, n_batch, output_gate_scratch, /*result_stride=*/1);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole) {
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          cell_to_input_weights->data.f, n_cell, cell_state->data.f, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_forget_weights->data.f, n_cell, cell_state->data.f, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch,
+                                         cell_state->data.f, n_batch * n_cell,
+                                         cell_state->data.f);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell,
+        cell_state->data.f);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state->data.f);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state->data.f, n_batch * n_cell,
+                             params->cell_clip, cell_state->data.f);
+  }
+
+  // For each batch and cell: update the output gate.
+  if (use_peephole) {
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        cell_to_output_weights->data.f, n_cell, cell_state->data.f, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state->data.f, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  // For each batch: update the projection and output_state.
+  const bool use_projection_weight = (projection_weights != nullptr);
+  const bool use_projection_bias = (projection_bias != nullptr);
+  if (use_projection_weight) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias->data.f, n_output,
+                                            n_batch, output->data.f);
+    } else {
+      tensor_utils::ZeroVector(output->data.f, n_batch * n_output);
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        projection_weights->data.f, n_output, n_cell, output_gate_scratch,
+        n_batch, output->data.f, /*result_stride=*/1);
+    if (params->proj_clip > 0.0) {
+      tensor_utils::ClipVector(output->data.f, n_batch * n_output,
+                               params->proj_clip, output->data.f);
+    }
+  } else {
+    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                             output->data.f);
+  }
+  tensor_utils::CopyVector(output->data.f, n_batch * n_output,
+                           output_state->data.f);
+
+  return kTfLiteOk;
+}
+
+}  // namespace lstm
+
+TfLiteRegistration* Register_LSTM() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 lstm::Prepare, lstm::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c068286b0d84bcb51ebb0e239350a42863de6523
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/lstm_test.cc
@@ -0,0 +1,1087 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite LSTM op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class LSTMOpModel : public SingleOpModel {
+ public:
+  LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg,
+              bool use_peephole, bool use_projection_weights,
+              bool use_projection_bias, float cell_clip, float proj_clip,
+              const std::vector<std::vector<int>>& input_shapes)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_cell_(n_cell),
+        n_output_(n_output) {
+    input_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      input_to_input_weights_ = AddNullInput();
+    } else {
+      input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        cell_to_input_weights_ = AddNullInput();
+      } else {
+        cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      }
+      cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+      cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    } else {
+      cell_to_input_weights_ = AddNullInput();
+      cell_to_forget_weights_ = AddNullInput();
+      cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      input_gate_bias_ = AddNullInput();
+    } else {
+      input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    cell_bias_ = AddInput(TensorType_FLOAT32);
+    output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      projection_weights_ = AddInput(TensorType_FLOAT32);
+      if (use_projection_bias) {
+        projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        projection_bias_ = AddNullInput();
+      }
+    } else {
+      projection_weights_ = AddNullInput();
+      projection_bias_ = AddNullInput();
+    }
+
+    scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+    // TODO(ghodrat): Modify these states when we have a permanent solution for
+    // persistent buffer.
+    output_state_ = AddOutput(TensorType_FLOAT32);
+    cell_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
+                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+                                   cell_clip, proj_clip)
+                     .Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInputToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_output_weights_, f);
+  }
+
+  void SetInputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(input_gate_bias_, f);
+  }
+
+  void SetForgetGateBias(std::initializer_list<float> f) {
+    PopulateTensor(forget_gate_bias_, f);
+  }
+
+  void SetCellBias(std::initializer_list<float> f) {
+    PopulateTensor(cell_bias_, f);
+  }
+
+  void SetOutputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(output_gate_bias_, f);
+  }
+
+  void SetProjectionWeights(std::initializer_list<float> f) {
+    PopulateTensor(projection_weights_, f);
+  }
+
+  void SetProjectionBias(std::initializer_list<float> f) {
+    PopulateTensor(projection_bias_, f);
+  }
+
+  void ResetOutputState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(output_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void ResetCellState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(cell_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int num_inputs() { return n_input_; }
+  int num_outputs() { return n_output_; }
+  int num_cells() { return n_cell_; }
+  int num_batches() { return n_batch_; }
+
+ private:
+  int input_;
+  int input_to_input_weights_;
+  int input_to_forget_weights_;
+  int input_to_cell_weights_;
+  int input_to_output_weights_;
+
+  int recurrent_to_input_weights_;
+  int recurrent_to_forget_weights_;
+  int recurrent_to_cell_weights_;
+  int recurrent_to_output_weights_;
+
+  int cell_to_input_weights_;
+  int cell_to_forget_weights_;
+  int cell_to_output_weights_;
+
+  int input_gate_bias_;
+  int forget_gate_bias_;
+  int cell_bias_;
+  int output_gate_bias_;
+
+  int projection_weights_;
+  int projection_bias_;
+
+  int output_;
+  int output_state_;
+  int cell_state_;
+  int scratch_buffer_;
+
+  int n_batch_;
+  int n_input_;
+  int n_cell_;
+  int n_output_;
+};
+
+TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+                   /*use_cifg=*/false, /*use_peephole=*/false,
+                   /*use_projection_weights=*/false,
+                   /*use_projection_bias=*/false,
+                   /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+                   {
+                       {n_batch, n_input},  // input tensor
+
+                       {n_cell, n_input},  // input_to_input_weight tensor
+                       {n_cell, n_input},  // input_to_forget_weight tensor
+                       {n_cell, n_input},  // input_to_cell_weight tensor
+                       {n_cell, n_input},  // input_to_output_weight tensor
+
+                       {n_cell, n_output},  // recurrent_to_input_weight tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+                       {0},  // cell_to_input_weight tensor
+                       {0},  // cell_to_forget_weight tensor
+                       {0},  // cell_to_output_weight tensor
+
+                       {n_cell},  // input_gate_bias tensor
+                       {n_cell},  // forget_gate_bias tensor
+                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // output_gate_bias tensor
+
+                       {0, 0},  // projection_weight tensor
+                       {0},     // projection_bias tensor
+                   });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_golden_output[] = {-0.02973187, 0.1229473,   0.20885126,
+                                       -0.15358765, -0.03716109, 0.12507336,
+                                       0.41193449,  -0.20860538, -0.15053082,
+                                       0.09120187,  0.24278517,  -0.12222792};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  const int input_sequence_size =
+      sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs());
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch0_start = lstm_input + i * lstm.num_inputs();
+    float* batch0_end = batch0_start + lstm.num_inputs();
+
+    lstm.SetInput(0, batch0_start, batch0_end);
+
+    lstm.Invoke();
+
+    float* golden_start = lstm_golden_output + i * lstm.num_outputs();
+    float* golden_end = golden_start + lstm.num_outputs();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+                   /*use_cifg=*/true, /*use_peephole=*/true,
+                   /*use_projection_weights=*/false,
+                   /*use_projection_bias=*/false,
+                   /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+                   {
+                       {n_batch, n_input},  // input tensor
+
+                       {0, 0},             // input_to_input_weight tensor
+                       {n_cell, n_input},  // input_to_forget_weight tensor
+                       {n_cell, n_input},  // input_to_cell_weight tensor
+                       {n_cell, n_input},  // input_to_output_weight tensor
+
+                       {0, 0},              // recurrent_to_input_weight tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+                       {0},       // cell_to_input_weight tensor
+                       {n_cell},  // cell_to_forget_weight tensor
+                       {n_cell},  // cell_to_output_weight tensor
+
+                       {0},       // input_gate_bias tensor
+                       {n_cell},  // forget_gate_bias tensor
+                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // output_gate_bias tensor
+
+                       {0, 0},  // projection_weight tensor
+                       {0},     // projection_bias tensor
+                   });
+
+  lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+                              0.04717243, 0.48944736, -0.38535351,
+                              -0.17212132});
+
+  lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+                                -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+                                0.33826375});
+
+  lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+                                -0.09426838, -0.44257352, 0.54939759,
+                                0.01533556, 0.42751634});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToCellWeights(
+      {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+       0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+       0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+       0.21193194});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+       0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+       -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+       -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+       0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+  lstm.SetCellToForgetWeights(
+      {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+  lstm.SetCellToOutputWeights(
+      {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585,
+                                       -0.05163646, -0.42312205, -0.01218222,
+                                       0.24201041,  -0.08124574, -0.358325,
+                                       -0.04621704, 0.21641694,  -0.06471302};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  const int input_sequence_size =
+      sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs());
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch0_start = lstm_input + i * lstm.num_inputs();
+    float* batch0_end = batch0_start + lstm.num_inputs();
+
+    lstm.SetInput(0, batch0_start, batch0_end);
+
+    lstm.Invoke();
+
+    float* golden_start = lstm_golden_output + i * lstm.num_outputs();
+    float* golden_end = golden_start + lstm.num_outputs();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+
+  LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+                   /*use_cifg=*/false, /*use_peephole=*/true,
+                   /*use_projection_weights=*/true,
+                   /*use_projection_bias=*/false,
+                   /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+                   {
+                       {n_batch, n_input},  // input tensor
+
+                       {n_cell, n_input},  // input_to_input_weight tensor
+                       {n_cell, n_input},  // input_to_forget_weight tensor
+                       {n_cell, n_input},  // input_to_cell_weight tensor
+                       {n_cell, n_input},  // input_to_output_weight tensor
+
+                       {n_cell, n_output},  // recurrent_to_input_weight tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+                       {n_cell},  // cell_to_input_weight tensor
+                       {n_cell},  // cell_to_forget_weight tensor
+                       {n_cell},  // cell_to_output_weight tensor
+
+                       {n_cell},  // input_gate_bias tensor
+                       {n_cell},  // forget_gate_bias tensor
+                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // output_gate_bias tensor
+
+                       {n_output, n_cell},  // projection_weight tensor
+                       {0},                 // projection_bias tensor
+                   });
+
+  lstm.SetInputToInputWeights(
+      {0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+       0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+       -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+       -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+       -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+       -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+       -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+       0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+       0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+       0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+       -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+       0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+       -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+       -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+       -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+       0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+       -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+       -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+       -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+       -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677});
+
+  lstm.SetInputToForgetWeights(
+      {-0.0018401089, -0.004852237,  0.03698424,   0.014181704,   0.028273236,
+       -0.016726194,  -0.05249759,   -0.10204261,  0.00861066,    -0.040979505,
+       -0.009899187,  0.01923892,    -0.028177269, -0.08535103,   -0.14585495,
+       0.10662567,    -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+       0.0030784295,  0.076784775,   0.07463696,   0.094531395,   0.0814421,
+       -0.12257899,   -0.033945758,  -0.031303465, 0.045630626,   0.06843887,
+       -0.13492945,   -0.012480007,  -0.0811829,   -0.07224499,   -0.09628791,
+       0.045100946,   0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+       0.06958324,    0.034257296,   0.0482646,    0.06267997,    0.052625068,
+       0.12784666,    0.07077897,    0.025725935,  0.04165009,    0.07241905,
+       0.018668644,   -0.037377294,  -0.06277783,  -0.08833636,   -0.040120605,
+       -0.011405586,  -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+       0.05483423,    0.11449111,    0.11289652,   0.10939839,    0.13396506,
+       -0.08402166,   -0.01901462,   -0.044678304, -0.07720565,   0.014350063,
+       -0.11757958,   -0.0652038,    -0.08185733,  -0.076754324,  -0.092614375,
+       0.10405491,    0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+       0.036881298,   0.02913376,    0.03420159,   0.05448447,    -0.054523353,
+       0.02582715,    0.02327355,    -0.011857179, -0.0011980024, -0.034641717,
+       -0.026125094,  -0.17582615,   -0.15923657,  -0.27486774,   -0.0006143371,
+       0.0001771948,  -8.470171e-05, 0.02651807,   0.045790765,   0.06956496});
+
+  lstm.SetInputToCellWeights(
+      {-0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+       -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+       -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+       -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+       -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+       0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+       -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+       0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+       -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+       -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+       -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+       0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+       0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+       0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+       -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+       -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+       -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+       -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+       -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+       -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+       0.05453865,    0.091149814,   0.06387331,    0.007518393,
+       0.055960953,   0.069779344,   0.046411168,   0.10509911,
+       0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+       0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+       0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042});
+
+  lstm.SetInputToOutputWeights(
+      {-0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+       -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+       0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+       -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+       -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+       0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+       -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+       -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+       -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+       -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+       0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+       0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+       0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+       -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+       0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+       0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+       -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+       0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+       -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+       -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956});
+
+  lstm.SetInputGateBias(
+      {0.02234832,  0.14757581,   0.18176508,  0.10380666,  0.053110216,
+       -0.06928846, -0.13942584,  -0.11816189, 0.19483899,  0.03652339,
+       -0.10250295, 0.036714908,  -0.18426876, 0.036065217, 0.21810818,
+       0.02383196,  -0.043370757, 0.08690144,  -0.04444982, 0.00030581196});
+
+  lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                          0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                          0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                          -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                          0.40694186,  0.06030037,   0.012413437, -0.06108739});
+
+  lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                    -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                    -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                    -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                    0.016178843,  0.1749513,    0.13975595,   0.92058027});
+
+  lstm.SetOutputGateBias(
+      {0.046159424,  -0.0012809046, 0.03563469,   0.12648113, 0.027195795,
+       0.35373217,   -0.018957434,  0.008907322,  -0.0762701, 0.12018895,
+       0.04216877,   0.0022856654,  0.040952638,  0.3147856,  0.08225149,
+       -0.057416286, -0.14995944,   -0.008040261, 0.13208859, 0.029760877});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+       -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+       -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+       -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+       0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+       0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+       -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+       0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+       -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+       0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+       -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+       0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+       -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+       0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+       -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+       -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+       -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+       -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+       -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+       0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+       0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+       0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+       0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+       0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+       -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+       -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+       0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+       -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+       -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+       -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+       -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+       -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+       -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+       0.0365468,      0.07590991,     0.08838724,    0.021681072,
+       -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+       0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+       -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+       -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+       0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+       -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+       -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+       0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+       -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+       0.015963363,    0.00871737,     0.060130805,   0.028611384,
+       0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+       0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+       0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+       0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+       0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+       -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+       -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+       -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+       -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+       -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+       0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+       0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+       -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+       0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+       0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+       0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+       -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+       -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+       0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+       -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+       -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+       0.06358255,     0.18531723,     0.07759293,    0.12006465,
+       0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+       -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+       -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+       0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+       0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+       0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+       0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+       -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+       -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+       -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+       -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+       0.026351685,    0.012641483,    0.07466548,    0.044301085,
+       -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+       -0.04106223,    -0.028126027,   0.028473156,   0.10467447});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+       0.14811787,    0.10826372,    0.09471067,     0.03987225,
+       -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+       0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+       0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+       -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+       -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+       0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+       -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+       -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+       0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+       -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+       -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+       -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+       0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+       0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+       -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+       0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+       0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+       -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+       -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+       0.060212336,   0.055259194,   0.06974018,     0.049454916,
+       -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+       0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+       -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+       0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+       -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+       0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+       0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+       0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+       -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+       -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+       -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+       0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+       0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+       0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+       0.052958444,   0.07558703,    0.04817258,     0.044462286,
+       -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+       0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+       0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+       -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+       -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+       -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+       0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+       0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+       0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+       0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+       -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+       -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+       0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+       -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+       -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+       -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+       -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+       0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+       -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+       -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+       0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+       -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+       0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+       0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+       0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+       0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+       0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+       0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+       -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+       0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+       -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+       -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+       0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+       -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+       -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+       0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+       0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+       0.014410365,   0.020995233,   0.17040324,     0.11511526,
+       0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+       -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+       -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+       0.007076659,   0.10964551,    0.0409152,      0.008275321,
+       -0.07283536,   0.07937492,    0.04192024,     -0.1075027});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+       0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+       0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+       -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+       0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+       0.08089997,     0.05143358,    0.038261272,   0.03339287,
+       -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+       -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+       -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+       -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+       0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+       -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+       -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+       0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+       0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+       0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+       -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+       0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+       0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+       -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+       0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+       0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+       0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+       -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+       0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+       -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+       0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+       -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+       0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+       -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+       0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+       0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+       -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+       0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+       -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+       0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+       -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+       -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+       -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+       -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+       0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+       0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+       -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+       0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+       0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+       0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+       -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+       0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+       0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+       0.02295182,     0.030739572,   0.056506045,   0.004612461,
+       0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+       -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+       0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+       -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+       0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+       -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+       -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+       -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+       -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+       0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+       0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+       -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+       -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+       -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+       -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+       -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+       0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+       0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+       -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+       0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+       0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+       -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+       -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+       0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+       -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+       -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+       0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+       -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+       -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+       -0.008799762,   0.056595087,   0.0022273948,  0.055752404});
+
+  lstm.SetRecurrentToOutputWeights({
+      0.025825322,   -0.05813119,  0.09495884,   -0.045984812,   -0.01255415,
+      -0.0026479573, -0.08196161,  -0.054914974, -0.0046604523,  -0.029587349,
+      -0.044576716,  -0.07480124,  -0.082868785, 0.023254942,    0.027502948,
+      -0.0039728214, -0.08683098,  -0.08116779,  -0.014675607,   -0.037924774,
+      -0.023314456,  -0.007401714, -0.09255757,  0.029460307,    -0.08829125,
+      -0.005139627,  -0.08989442,  -0.0555066,   0.13596267,     -0.025062224,
+      -0.048351806,  -0.03850004,  0.07266485,   -0.022414139,   0.05940088,
+      0.075114764,   0.09597592,   -0.010211725, -0.0049794707,  -0.011523867,
+      -0.025980417,  0.072999895,  0.11091378,   -0.081685916,   0.014416728,
+      0.043229222,   0.034178585,  -0.07530371,  0.035837382,    -0.085607,
+      -0.007721233,  -0.03287832,  -0.043848954, -0.06404588,    -0.06632928,
+      -0.073643476,  0.008214239,  -0.045984086, 0.039764922,    0.03474462,
+      0.060612556,   -0.080590084, 0.049127717,  0.04151091,     -0.030063879,
+      0.008801774,   -0.023021035, -0.019558564, 0.05158114,     -0.010947698,
+      -0.011825728,  0.0075720972, 0.0699727,    -0.0039981045,  0.069350146,
+      0.08799282,    0.016156472,  0.035502106,  0.11695009,     0.006217345,
+      0.13392477,    -0.037875112, 0.025745004,  0.08940699,     -0.00924166,
+      0.0046702605,  -0.036598757, -0.08811812,  0.10522024,     -0.032441203,
+      0.008176899,   -0.04454919,  0.07058152,   0.0067963637,   0.039206743,
+      0.03259838,    0.03725492,   -0.09515802,  0.013326398,    -0.052055415,
+      -0.025676316,  0.03198509,   -0.015951829, -0.058556724,   0.036879618,
+      0.043357447,   0.028362012,  -0.05908629,  0.0059240665,   -0.04995891,
+      -0.019187413,  0.0276265,    -0.01628143,  0.0025863599,   0.08800015,
+      0.035250366,   -0.022165963, -0.07328642,  -0.009415526,   -0.07455109,
+      0.11690406,    0.0363299,    0.07411125,   0.042103454,    -0.009660886,
+      0.019076364,   0.018299393,  -0.046004917, 0.08891175,     0.0431396,
+      -0.026327137,  -0.051502608, 0.08979574,   -0.051670972,   0.04940282,
+      -0.07491107,   -0.021240504, 0.022596184,  -0.034280192,   0.060163025,
+      -0.058211457,  -0.051837247, -0.01349775,  -0.04639988,    -0.035936575,
+      -0.011681591,  0.064818054,  0.0073146066, -0.021745546,   -0.043124277,
+      -0.06471268,   -0.07053354,  -0.029321948, -0.05330136,    0.016933719,
+      -0.053782392,  0.13747959,   -0.1361751,   -0.11569455,    0.0033329215,
+      0.05693899,    -0.053219706, 0.063698,     0.07977434,     -0.07924483,
+      0.06936997,    0.0034815092, -0.007305279, -0.037325785,   -0.07251102,
+      -0.033633437,  -0.08677009,  0.091591336,  -0.14165086,    0.021752775,
+      0.019683983,   0.0011612234, -0.058154266, 0.049996935,    0.0288841,
+      -0.0024567875, -0.14345716,  0.010955264,  -0.10234828,    0.1183656,
+      -0.0010731248, -0.023590032, -0.072285876, -0.0724771,     -0.026382286,
+      -0.0014920527, 0.042667855,  0.0018776858, 0.02986552,     0.009814309,
+      0.0733756,     0.12289186,   0.018043943,  -0.0458958,     0.049412545,
+      0.033632483,   0.05495232,   0.036686596,  -0.013781798,   -0.010036754,
+      0.02576849,    -0.08307328,  0.010112348,  0.042521734,    -0.05869831,
+      -0.071689695,  0.03876447,   -0.13275425,  -0.0352966,     -0.023077697,
+      0.10285965,    0.084736146,  0.15568255,   -0.00040734606, 0.027835453,
+      -0.10292561,   -0.032401145, 0.10053256,   -0.026142767,   -0.08271222,
+      -0.0030240538, -0.016368777, 0.1070414,    0.042672627,    0.013456989,
+      -0.0437609,    -0.022309763, 0.11576483,   0.04108048,     0.061026827,
+      -0.0190714,    -0.0869359,   0.037901703,  0.0610107,      0.07202949,
+      0.01675338,    0.086139716,  -0.08795751,  -0.014898893,   -0.023771819,
+      -0.01965048,   0.007955471,  -0.043740474, 0.03346837,     -0.10549954,
+      0.090567775,   0.042013682,  -0.03176985,  0.12569028,     -0.02421228,
+      -0.029526481,  0.023851605,  0.031539805,  0.05292009,     -0.02344001,
+      -0.07811758,   -0.08834428,  0.10094801,   0.16594367,     -0.06861939,
+      -0.021256343,  -0.041093912, -0.06669611,  0.035498552,    0.021757556,
+      -0.09302526,   -0.015403468, -0.06614931,  -0.051798206,   -0.013874718,
+      0.03630673,    0.010412845,  -0.08077351,  0.046185967,    0.0035662893,
+      0.03541868,    -0.094149634, -0.034814864, 0.003128424,    -0.020674974,
+      -0.03944324,   -0.008110165, -0.11113267,  0.08484226,     0.043586485,
+      0.040582247,   0.0968012,    -0.065249965, -0.028036479,   0.0050708856,
+      0.0017462453,  0.0326779,    0.041296225,  0.09164146,     -0.047743853,
+      -0.015952192,  -0.034451712, 0.084197424,  -0.05347844,    -0.11768019,
+      0.085926116,   -0.08251791,  -0.045081906, 0.0948852,      0.068401024,
+      0.024856757,   0.06978981,   -0.057309967, -0.012775832,   -0.0032452994,
+      0.01977615,    -0.041040014, -0.024264973, 0.063464895,    0.05431621,
+  });
+
+  lstm.SetCellToInputWeights(
+      {0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+       -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+       -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+       0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175});
+
+  lstm.SetCellToForgetWeights(
+      {-0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+       -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+       -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+       0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355});
+
+  lstm.SetCellToOutputWeights(
+      {0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+       -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+       -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+       0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733});
+
+  lstm.SetProjectionWeights(
+      {-0.009802181,  0.09401916,    0.0717386,     -0.13895074,  0.09641832,
+       0.060420845,   0.08539281,    0.054285463,   0.061395317,  0.034448683,
+       -0.042991187,  0.019801661,   -0.16840284,   -0.015726732, -0.23041931,
+       -0.024478018,  -0.10959692,   -0.013875541,  0.18600968,   -0.061274476,
+       0.0138165,     -0.08160894,   -0.07661644,   0.032372914,  0.16169067,
+       0.22465782,    -0.03993472,   -0.004017731,  0.08633481,   -0.28869787,
+       0.08682067,    0.17240396,    0.014975425,   0.056431185,  0.031037588,
+       0.16702051,    0.0077946745,  0.15140012,    0.29405436,   0.120285,
+       -0.188994,     -0.027265169,  0.043389652,   -0.022061434, 0.014777949,
+       -0.20203483,   0.094781205,   0.19100232,    0.13987629,   -0.036132768,
+       -0.06426278,   -0.05108664,   0.13221376,    0.009441198,  -0.16715929,
+       0.15859416,    -0.040437475,  0.050779544,   -0.022187516, 0.012166504,
+       0.027685808,   -0.07675938,   -0.0055694645, -0.09444123,  0.0046453946,
+       0.050794356,   0.10770313,    -0.20790008,   -0.07149004,  -0.11425117,
+       0.008225835,   -0.035802525,  0.14374903,    0.15262283,   0.048710253,
+       0.1847461,     -0.007487823,  0.11000021,    -0.09542012,  0.22619456,
+       -0.029149994,  0.08527916,    0.009043713,   0.0042746216, 0.016261552,
+       0.022461696,   0.12689082,    -0.043589946,  -0.12035478,  -0.08361797,
+       -0.050666027,  -0.1248618,    -0.1275799,    -0.071875185, 0.07377272,
+       0.09944291,    -0.18897448,   -0.1593054,    -0.06526116,  -0.040107165,
+       -0.004618631,  -0.067624845,  -0.007576253,  0.10727444,   0.041546922,
+       -0.20424393,   0.06907816,    0.050412357,   0.00724631,   0.039827548,
+       0.12449835,    0.10747581,    0.13708383,    0.09134148,   -0.12617786,
+       -0.06428341,   0.09956831,    0.1208086,     -0.14676677,  -0.0727722,
+       0.1126304,     0.010139365,   0.015571211,   -0.038128063, 0.022913318,
+       -0.042050496,  0.16842307,    -0.060597885,  0.10531834,   -0.06411776,
+       -0.07451711,   -0.03410368,   -0.13393489,   0.06534304,   0.003620307,
+       0.04490757,    0.05970546,    0.05197996,    0.02839995,   0.10434969,
+       -0.013699693,  -0.028353551,  -0.07260381,   0.047201227,  -0.024575593,
+       -0.036445823,  0.07155557,    0.009672501,   -0.02328883,  0.009533515,
+       -0.03606021,   -0.07421458,   -0.028082801,  -0.2678904,   -0.13221288,
+       0.18419984,    -0.13012612,   -0.014588381,  -0.035059117, -0.04824723,
+       0.07830115,    -0.056184657,  0.03277091,    0.025466874,  0.14494097,
+       -0.12522776,   -0.098633975,  -0.10766018,   -0.08317623,  0.08594209,
+       0.07749552,    0.039474737,   0.1776665,     -0.07409566,  -0.0477268,
+       0.29323658,    0.10801441,    0.1154011,     0.013952499,  0.10739139,
+       0.10708251,    -0.051456142,  0.0074137426,  -0.10430189,  0.10034707,
+       0.045594677,   0.0635285,     -0.0715442,    -0.089667566, -0.10811871,
+       0.00026344223, 0.08298446,    -0.009525053,  0.006585689,  -0.24567553,
+       -0.09450807,   0.09648481,    0.026996298,   -0.06419476,  -0.04752702,
+       -0.11063944,   -0.23441927,   -0.17608605,   -0.052156363, 0.067035615,
+       0.19271925,    -0.0032889997, -0.043264326,  0.09663576,   -0.057112187,
+       -0.10100678,   0.0628376,     0.04447668,    0.017961001,  -0.10094388,
+       -0.10190601,   0.18335468,    0.10494553,    -0.052095775, -0.0026118709,
+       0.10539724,    -0.04383912,   -0.042349473,  0.08438151,   -0.1947263,
+       0.02251204,    0.11216432,    -0.10307853,   0.17351969,   -0.039091777,
+       0.08066188,    -0.00561982,   0.12633002,    0.11335965,   -0.0088127935,
+       -0.019777594,  0.06864014,    -0.059751723,  0.016233567,  -0.06894641,
+       -0.28651384,   -0.004228674,  0.019708522,   -0.16305895,  -0.07468996,
+       -0.0855457,    0.099339016,   -0.07580735,   -0.13775392,  0.08434318,
+       0.08330512,    -0.12131499,   0.031935584,   0.09180414,   -0.08876437,
+       -0.08049874,   0.008753825,   0.03498998,    0.030215185,  0.03907079,
+       0.089751154,   0.029194152,   -0.03337423,   -0.019092513, 0.04331237,
+       0.04299654,    -0.036394123,  -0.12915532,   0.09793732,   0.07512415,
+       -0.11319543,   -0.032502122,  0.15661901,    0.07671967,   -0.005491124,
+       -0.19379048,   -0.218606,     0.21448623,    0.017840758,  0.1416943,
+       -0.07051762,   0.19488361,    0.02664691,    -0.18104725,  -0.09334311,
+       0.15026465,    -0.15493552,   -0.057762887,  -0.11604192,  -0.262013,
+       -0.01391798,   0.012185008,   0.11156489,    -0.07483202,  0.06693364,
+       -0.26151478,   0.046425626,   0.036540434,   -0.16435726,  0.17338543,
+       -0.21401681,   -0.11385144,   -0.08283257,   -0.069031075, 0.030635102,
+       0.010969227,   0.11109743,    0.010919218,   0.027526086,  0.13519906,
+       0.01891392,    -0.046839405,  -0.040167913,  0.017953383,  -0.09700955,
+       0.0061885654,  -0.07000971,   0.026893595,   -0.038844477, 0.14543656});
+
+  static float lstm_input[][20] = {
+      {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+       0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
+       0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
+       0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
+
+      {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+       0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
+       0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
+       0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
+
+  static float lstm_golden_output[][64] = {
+      {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+       -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
+       -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
+       -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
+       0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
+       -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
+       -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
+       0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
+       0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
+       0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
+       0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
+       -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
+       -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
+       0.0286833,   0.00824207,   0.0264887,   0.0305169},
+      {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+       -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
+       -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
+       0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
+       0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
+       -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
+       -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
+       0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
+       0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
+       0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
+       0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
+       -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
+       -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
+       0.0412031,    0.0118723,   0.0239643,   0.0394009}};
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  const int input_sequence_size =
+      sizeof(lstm_input[0]) / sizeof(float) / (lstm.num_inputs());
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
+    float* batch0_end = batch0_start + lstm.num_inputs();
+
+    lstm.SetInput(0, batch0_start, batch0_end);
+
+    float* batch1_start = lstm_input[1] + i * lstm.num_inputs();
+    float* batch1_end = batch1_start + lstm.num_inputs();
+    lstm.SetInput(lstm.num_inputs(), batch1_start, batch1_end);
+
+    lstm.Invoke();
+
+    float* golden_start_batch0 = lstm_golden_output[0] + i * lstm.num_outputs();
+    float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs();
+    float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs();
+    float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+    expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/mul.cc b/tensorflow/contrib/lite/kernels/mul.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81c73f2523186c2d4072d56bdc8980fcdbb588a3
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mul.cc
@@ -0,0 +1,167 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace mul {
+
+// This file has three implementation of Mul.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input1), NumDimensions(input2));
+  for (int i = 0; i < NumDimensions(input1); ++i) {
+    TF_LITE_ENSURE_EQ(context, SizeOfDimension(input1, i),
+                      SizeOfDimension(input2, i));
+  }
+
+  TF_LITE_ENSURE_EQ(context, input1->type, output->type);
+  TF_LITE_ENSURE_EQ(context, input2->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input1->dims);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+void EvalFloat(TfLiteContext* context, TfLiteNode* node,
+               TfLiteMulParams* params, TfLiteTensor* input1,
+               TfLiteTensor* input2, TfLiteTensor* output) {
+  float output_activation_min, output_activation_max;
+  CalculateActivationRangeFloat(params->activation, &output_activation_min,
+                                &output_activation_max);
+#define TF_LITE_MUL(type)                                        \
+  type::Mul(GetTensorData<float>(input1), GetTensorDims(input1), \
+            GetTensorData<float>(input2), GetTensorDims(input2), \
+            output_activation_min, output_activation_max,        \
+            GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_MUL(reference_ops);
+  } else {
+    TF_LITE_MUL(optimized_ops);
+  }
+#undef TF_LITE_MUL
+}
+
+template <KernelType kernel_type>
+void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                   TfLiteMulParams* params, TfLiteTensor* input1,
+                   TfLiteTensor* input2, TfLiteTensor* output) {
+  auto input1_offset = -input1->params.zero_point;
+  auto input2_offset = -input2->params.zero_point;
+  auto output_offset = output->params.zero_point;
+
+  int32_t output_multiplier;
+  int output_shift;
+
+  double real_multiplier =
+      input1->params.scale * input2->params.scale / output->params.scale;
+  QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier,
+                                   &output_shift);
+
+  int32 output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(params->activation, output,
+                                &output_activation_min, &output_activation_max);
+
+#define TF_LITE_MUL(type)                                                    \
+  type::BroadcastMul(GetTensorData<uint8_t>(input1), GetTensorDims(input1),  \
+                     input1_offset, GetTensorData<uint8_t>(input2),          \
+                     GetTensorDims(input2), input2_offset, output_offset,    \
+                     output_multiplier, output_shift, output_activation_min, \
+                     output_activation_max, GetTensorData<uint8_t>(output),  \
+                     GetTensorDims(output));
+  if (kernel_type == kReference) {
+    TF_LITE_MUL(reference_ops);
+  } else {
+    TF_LITE_MUL(optimized_ops);
+  }
+#undef TF_LITE_MUL
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteMulParams*>(node->builtin_data);
+
+  TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalFloat<kernel_type>(context, node, params, input1, input2, output);
+  } else if (output->type == kTfLiteUInt8) {
+    EvalQuantized<kernel_type>(context, node, params, input1, input2, output);
+  } else {
+    context->ReportError(context,
+                         "Mul only supports FLOAT32 and quantized UINT8 now.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace mul
+
+TfLiteRegistration* Register_MUL_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+                                 mul::Eval<mul::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MUL_GENERIC_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+                                 mul::Eval<mul::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MUL_NEON_OPT() {
+  static TfLiteRegistration r = {nullptr, nullptr, mul::Prepare,
+                                 mul::Eval<mul::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MUL() {
+#ifdef USE_NEON
+  return Register_MUL_NEON_OPT();
+#else
+  return Register_MUL_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/mul_test.cc b/tensorflow/contrib/lite/kernels/mul_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4255cfe18a043c55f3ce7292afdedb6e988a28a2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/mul_test.cc
@@ -0,0 +1,126 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseMulOpModel : public SingleOpModel {
+ public:
+  BaseMulOpModel(TensorData input, TensorData output,
+                 ActivationFunctionType activation_type) {
+    input1_ = AddInput(input);
+    input2_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MUL, BuiltinOptions_MulOptions,
+                 CreateMulOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatMulOpModel : public BaseMulOpModel {
+ public:
+  using BaseMulOpModel::BaseMulOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+// For quantized Mul, the error shouldn't exceed (2*step + step^2).
+// The param min=-1.0 & max=1.0 is used in the following tests.
+// The tolerance value is ~0.0157.
+const float kQuantizedStep = 2.0 / 255.0;
+const float kQuantizedTolerance =
+    2.0 * kQuantizedStep + kQuantizedStep * kQuantizedStep;
+
+class QuantizedMulOpModel : public BaseMulOpModel {
+ public:
+  using BaseMulOpModel::BaseMulOpModel;
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+TEST(FloatMulOpTest, NoActivation) {
+  FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 0.4})));
+}
+
+TEST(FloatMulOpTest, ActivationRELU1) {
+  FloatMulOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU1);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 1.0})));
+}
+
+TEST(FloatMulOpTest, VariousInputShapes) {
+  std::vector<std::initializer_list<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatMulOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                      {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5, 1.1, 0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({-0.2, 0.04, 0.21, 0.4, 1.21, 0.2})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedMulOpTest, NoActivation) {
+  QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
+                        {TensorType_UINT8, {}, -1.0, 1.0},
+                        ActivationFunctionType_NONE);
+  m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
+  m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
+                                              kQuantizedTolerance)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/op_macros.h b/tensorflow/contrib/lite/kernels/op_macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..7535afaf8ea52d855e2e4773e56ce2118a16447c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/op_macros.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
+
+#define TF_LITE_FATAL(msg)          \
+  do {                              \
+    fprintf(stderr, "%s\n", (msg)); \
+    exit(1);                        \
+  } while (0)
+#define TF_LITE_ASSERT(x)        \
+  do {                           \
+    if (!(x)) TF_LITE_FATAL(#x); \
+  } while (0)
+#define TF_LITE_ASSERT_EQ(x, y)                            \
+  do {                                                     \
+    if ((x) != (y)) TF_LITE_FATAL(#x " didn't equal " #y); \
+  } while (0)
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_OP_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/optional_tensor_test.cc b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17166715ca30ff3d8ba3d384110e403f8910e39d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/optional_tensor_test.cc
@@ -0,0 +1,340 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite LSTM op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+class LSTMOpModel : public SingleOpModel {
+ public:
+  LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg,
+              bool use_peephole, bool use_projection_weights,
+              bool use_projection_bias, float cell_clip, float proj_clip,
+              const std::vector<std::vector<int>>& input_shapes)
+      : n_batch_(n_batch),
+        n_input_(n_input),
+        n_cell_(n_cell),
+        n_output_(n_output) {
+    input_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      input_to_input_weights_ = AddNullInput();
+    } else {
+      input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_cifg) {
+      recurrent_to_input_weights_ = AddNullInput();
+    } else {
+      recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+    }
+
+    recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+
+    if (use_peephole) {
+      if (use_cifg) {
+        cell_to_input_weights_ = AddNullInput();
+      } else {
+        cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      }
+      cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
+      cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    } else {
+      cell_to_input_weights_ = AddNullInput();
+      cell_to_forget_weights_ = AddNullInput();
+      cell_to_output_weights_ = AddNullInput();
+    }
+
+    if (use_cifg) {
+      input_gate_bias_ = AddNullInput();
+    } else {
+      input_gate_bias_ = AddInput(TensorType_FLOAT32);
+    }
+    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
+    cell_bias_ = AddInput(TensorType_FLOAT32);
+    output_gate_bias_ = AddInput(TensorType_FLOAT32);
+
+    if (use_projection_weights) {
+      projection_weights_ = AddInput(TensorType_FLOAT32);
+      if (use_projection_bias) {
+        projection_bias_ = AddInput(TensorType_FLOAT32);
+      } else {
+        projection_bias_ = AddNullInput();
+      }
+    } else {
+      projection_weights_ = AddNullInput();
+      projection_bias_ = AddNullInput();
+    }
+
+    scratch_buffer_ = AddOutput(TensorType_FLOAT32);
+    // TODO(ghodrat): Modify these states when we have a permanent solution for
+    // persistent buffer.
+    output_state_ = AddOutput(TensorType_FLOAT32);
+    cell_state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
+                 CreateLSTMOptions(builder_, ActivationFunctionType_TANH,
+                                   cell_clip, proj_clip)
+                     .Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  void SetInputToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::initializer_list<float> f) {
+    PopulateTensor(cell_to_output_weights_, f);
+  }
+
+  void SetInputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(input_gate_bias_, f);
+  }
+
+  void SetForgetGateBias(std::initializer_list<float> f) {
+    PopulateTensor(forget_gate_bias_, f);
+  }
+
+  void SetCellBias(std::initializer_list<float> f) {
+    PopulateTensor(cell_bias_, f);
+  }
+
+  void SetOutputGateBias(std::initializer_list<float> f) {
+    PopulateTensor(output_gate_bias_, f);
+  }
+
+  void SetProjectionWeights(std::initializer_list<float> f) {
+    PopulateTensor(projection_weights_, f);
+  }
+
+  void SetProjectionBias(std::initializer_list<float> f) {
+    PopulateTensor(projection_bias_, f);
+  }
+
+  void ResetOutputState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(output_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void ResetCellState() {
+    const int zero_buffer_size = n_cell_ * n_batch_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(cell_state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  void Verify() {
+    auto model = tflite::UnPackModel(builder_.GetBufferPointer());
+    EXPECT_NE(model, nullptr);
+  }
+
+  int num_inputs() { return n_input_; }
+  int num_outputs() { return n_output_; }
+  int num_cells() { return n_cell_; }
+  int num_batches() { return n_batch_; }
+
+ private:
+  int input_;
+  int input_to_input_weights_;
+  int input_to_forget_weights_;
+  int input_to_cell_weights_;
+  int input_to_output_weights_;
+
+  int recurrent_to_input_weights_;
+  int recurrent_to_forget_weights_;
+  int recurrent_to_cell_weights_;
+  int recurrent_to_output_weights_;
+
+  int cell_to_input_weights_;
+  int cell_to_forget_weights_;
+  int cell_to_output_weights_;
+
+  int input_gate_bias_;
+  int forget_gate_bias_;
+  int cell_bias_;
+  int output_gate_bias_;
+
+  int projection_weights_;
+  int projection_bias_;
+
+  int output_;
+  int output_state_;
+  int cell_state_;
+  int scratch_buffer_;
+
+  int n_batch_;
+  int n_input_;
+  int n_cell_;
+  int n_output_;
+};
+
+
+TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  LSTMOpModel lstm(n_batch, n_input, n_cell, n_output,
+                   /*use_cifg=*/true, /*use_peephole=*/true,
+                   /*use_projection_weights=*/false,
+                   /*use_projection_bias=*/false,
+                   /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+                   {
+                       {n_batch, n_input},  // input tensor
+
+                       {0, 0},             // input_to_input_weight tensor
+                       {n_cell, n_input},  // input_to_forget_weight tensor
+                       {n_cell, n_input},  // input_to_cell_weight tensor
+                       {n_cell, n_input},  // input_to_output_weight tensor
+
+                       {0, 0},              // recurrent_to_input_weight tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+                       {0},       // cell_to_input_weight tensor
+                       {n_cell},  // cell_to_forget_weight tensor
+                       {n_cell},  // cell_to_output_weight tensor
+
+                       {0},       // input_gate_bias tensor
+                       {n_cell},  // forget_gate_bias tensor
+                       {n_cell},  // cell_bias tensor
+                       {n_cell},  // output_gate_bias tensor
+
+                       {0, 0},  // projection_weight tensor
+                       {0},     // projection_bias tensor
+                   });
+
+
+  lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
+                              0.04717243, 0.48944736, -0.38535351,
+                              -0.17212132});
+
+  lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
+                                -0.3633365, -0.22755712, 0.28253698, 0.24407166,
+                                0.33826375});
+
+  lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
+                                -0.09426838, -0.44257352, 0.54939759,
+                                0.01533556, 0.42751634});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToCellWeights(
+      {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
+       0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
+       0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
+       0.21193194});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
+       0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
+       -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
+       -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
+       0.50248802, 0.26114327, -0.43736315, 0.33149987});
+
+  lstm.SetCellToForgetWeights(
+      {0.47485286, -0.51955009, -0.24458408, 0.31544167});
+  lstm.SetCellToOutputWeights(
+      {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
+
+  // Verify the model by unpacking it.
+  lstm.Verify();
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/padding.h b/tensorflow/contrib/lite/kernels/padding.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a60274524c468ef29e522de5569e0d8354974c2
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/padding.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
+
+namespace tflite {
+
+inline int ComputePadding(int stride, int in_size, int filter_size,
+                          int out_size) {
+  int padding = ((out_size - 1) * stride + filter_size - in_size) / 2;
+  return padding > 0 ? padding : 0;
+}
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_PADDING_H_
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b79880110897a1438a589d97363fd861c61667e7
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -0,0 +1,355 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace pooling {
+
+// This file has two implementation of each pooling op.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+enum PoolType {
+  kAverage,
+  kMax,
+  kL2,
+};
+
+struct OpData {
+  TfLitePaddingValues padding;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  // This is a builtin op, so we don't use the contents in 'buffer', if any.
+  // Instead, we allocate a new object to carry information from Prepare() to
+  // Eval().
+  return new OpData;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+template <PoolType pool_type>
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  int batches = input->dims->data[0];
+  int height = input->dims->data[1];
+  int width = input->dims->data[2];
+  int channels_out = input->dims->data[3];
+
+  // Matching GetWindowedOutputSize in TensorFlow.
+  auto padding = params->padding;
+  auto computeOutSize = [padding](int imageSize, int filterSize,
+                                  int stride) -> int {
+    return padding == kTfLitePaddingSame
+               ? (imageSize + stride - 1) / stride
+               : padding == kTfLitePaddingValid
+                     ? (imageSize - filterSize + stride) / stride
+                     : 0;
+  };
+
+  int outWidth =
+      computeOutSize(width, params->filter_width, params->stride_width);
+  int outHeight =
+      computeOutSize(height, params->filter_height, params->stride_height);
+
+  data->padding.height = ComputePadding(params->stride_height, height,
+                                        params->filter_height, outHeight);
+  data->padding.width = ComputePadding(params->stride_width, width,
+                                       params->filter_width, outWidth);
+
+  if (input->type == kTfLiteUInt8) {
+    if (pool_type == kAverage || pool_type == kMax) {
+      TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale);
+      TF_LITE_ENSURE_EQ(context, input->params.zero_point,
+                        output->params.zero_point);
+    }
+    if (pool_type == kL2) {
+      // We currently don't have a quantized implementation of L2Pool
+      TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
+    }
+  }
+
+  TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
+  outputSize->data[0] = batches;
+  outputSize->data[1] = outHeight;
+  outputSize->data[2] = outWidth;
+  outputSize->data[3] = channels_out;
+  return context->ResizeTensor(context, output, outputSize);
+}
+
+template <KernelType kernel_type>
+void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node,
+                      TfLitePoolParams* params, OpData* data,
+                      TfLiteTensor* input, TfLiteTensor* output) {
+  float activation_min, activation_max;
+  CalculateActivationRangeFloat(params->activation, &activation_min,
+                                &activation_max);
+#define TF_LITE_AVERAGE_POOL(type)                                             \
+  type::AveragePool(                                                           \
+      GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \
+      params->stride_height, data->padding.width, data->padding.height,        \
+      params->filter_width, params->filter_height, activation_min,             \
+      activation_max, GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_AVERAGE_POOL(reference_ops);
+  } else {
+    TF_LITE_AVERAGE_POOL(optimized_ops);
+  }
+#undef TF_LITE_AVERAGE_POOL
+}
+
+template <KernelType kernel_type>
+void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                          TfLitePoolParams* params, OpData* data,
+                          TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeUint8(params->activation, output, &activation_min,
+                                &activation_max);
+#define TF_LITE_AVERAGE_POOL(type)                                       \
+  type::AveragePool(GetTensorData<uint8_t>(input), GetTensorDims(input), \
+                    params->stride_width, params->stride_height,         \
+                    data->padding.width, data->padding.height,           \
+                    params->filter_width, params->filter_height,         \
+                    activation_min, activation_max,                      \
+                    GetTensorData<uint8_t>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_AVERAGE_POOL(reference_ops);
+  } else {
+    TF_LITE_AVERAGE_POOL(optimized_ops);
+  }
+#undef TF_LITE_AVERAGE_POOL
+}
+
+template <KernelType kernel_type>
+void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
+                  TfLitePoolParams* params, OpData* data, TfLiteTensor* input,
+                  TfLiteTensor* output) {
+  float activation_min, activation_max;
+  CalculateActivationRangeFloat(params->activation, &activation_min,
+                                &activation_max);
+#define TF_LITE_MAX_POOL(type)                                                 \
+  type::MaxPool(                                                               \
+      GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \
+      params->stride_height, data->padding.width, data->padding.height,        \
+      params->filter_width, params->filter_height, activation_min,             \
+      activation_max, GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_MAX_POOL(reference_ops);
+  } else {
+    TF_LITE_MAX_POOL(optimized_ops);
+  }
+#undef TF_LITE_MAX_POOL
+}
+
+template <KernelType kernel_type>
+void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
+                      TfLitePoolParams* params, OpData* data,
+                      TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeUint8(params->activation, output, &activation_min,
+                                &activation_max);
+#define TF_LITE_MAX_POOL(type)                                               \
+  type::MaxPool(GetTensorData<uint8_t>(input), GetTensorDims(input),         \
+                params->stride_width, params->stride_height,                 \
+                data->padding.width, data->padding.height,                   \
+                params->filter_width, params->filter_height, activation_min, \
+                activation_max, GetTensorData<uint8_t>(output),              \
+                GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_MAX_POOL(reference_ops);
+  } else {
+    TF_LITE_MAX_POOL(optimized_ops);
+  }
+#undef TF_LITE_MAX_POOL
+}
+
+template <KernelType kernel_type>
+void L2EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                 TfLitePoolParams* params, OpData* data, TfLiteTensor* input,
+                 TfLiteTensor* output) {
+  float activation_min, activation_max;
+  CalculateActivationRangeFloat(params->activation, &activation_min,
+                                &activation_max);
+#define TF_LITE_L2_POOL(type)                                                  \
+  type::L2Pool(                                                                \
+      GetTensorData<float>(input), GetTensorDims(input), params->stride_width, \
+      params->stride_height, data->padding.width, data->padding.height,        \
+      params->filter_width, params->filter_height, activation_min,             \
+      activation_max, GetTensorData<float>(output), GetTensorDims(output))
+  if (kernel_type == kReference) {
+    TF_LITE_L2_POOL(reference_ops);
+  } else {
+    TF_LITE_L2_POOL(optimized_ops);
+  }
+#undef TF_LITE_L2_POOL
+}
+
+#undef TF_LITE_KERNEL_TYPE_DISPATCH
+
+template <KernelType kernel_type>
+TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      AverageEvalFloat<kernel_type>(context, node, params, data, input, output);
+      break;
+    case kTfLiteUInt8:
+      AverageEvalQuantized<kernel_type>(context, node, params, data, input,
+                                        output);
+      break;
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      MaxEvalFloat<kernel_type>(context, node, params, data, input, output);
+      break;
+    case kTfLiteUInt8:
+      MaxEvalQuantized<kernel_type>(context, node, params, data, input, output);
+      break;
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+template <KernelType kernel_type>
+TfLiteStatus L2Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      L2EvalFloat<kernel_type>(context, node, params, data, input, output);
+      break;
+    case kTfLiteUInt8:
+    // We don't have a quantized implementation, so just fall through to the
+    // 'default' case.
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace pooling
+
+TfLiteRegistration* Register_AVERAGE_POOL_REF() {
+  static TfLiteRegistration r = {pooling::Init, pooling::Free,
+                                 pooling::GenericPrepare<pooling::kAverage>,
+                                 pooling::AverageEval<pooling::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MAX_POOL_REF() {
+  static TfLiteRegistration r = {pooling::Init, pooling::Free,
+                                 pooling::GenericPrepare<pooling::kMax>,
+                                 pooling::MaxEval<pooling::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_L2_POOL_REF() {
+  static TfLiteRegistration r = {pooling::Init, pooling::Free,
+                                 pooling::GenericPrepare<pooling::kL2>,
+                                 pooling::L2Eval<pooling::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_AVERAGE_POOL_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      pooling::Init, pooling::Free, pooling::GenericPrepare<pooling::kAverage>,
+      pooling::AverageEval<pooling::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_MAX_POOL_GENERIC_OPT() {
+  static TfLiteRegistration r = {pooling::Init, pooling::Free,
+                                 pooling::GenericPrepare<pooling::kMax>,
+                                 pooling::MaxEval<pooling::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_L2_POOL_GENERIC_OPT() {
+  static TfLiteRegistration r = {pooling::Init, pooling::Free,
+                                 pooling::GenericPrepare<pooling::kL2>,
+                                 pooling::L2Eval<pooling::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_AVERAGE_POOL_2D() {
+  return Register_AVERAGE_POOL_GENERIC_OPT();
+}
+
+TfLiteRegistration* Register_MAX_POOL_2D() {
+  return Register_MAX_POOL_GENERIC_OPT();
+}
+
+TfLiteRegistration* Register_L2_POOL_2D() {
+  return Register_L2_POOL_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pooling_test.cc b/tensorflow/contrib/lite/kernels/pooling_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01c91b2ba905e249c36af19f175c68a7e7f17f6d
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pooling_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BasePoolingOpModel : public SingleOpModel {
+ public:
+  // TODO(ahentz): Also test different activation types, bias, padding types,
+  // stride values.
+  BasePoolingOpModel(BuiltinOperator type, const TensorData& input,
+                     int filter_width, int filter_height,
+                     const TensorData& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+
+    SetBuiltinOp(
+        type, BuiltinOptions_Pool2DOptions,
+        CreatePool2DOptions(builder_, Padding_VALID, 2, 2, filter_width,
+                            filter_height, ActivationFunctionType_NONE)
+            .Union());
+
+    BuildInterpreter({GetShape(input_)});
+  }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class FloatPoolingOpModel : public BasePoolingOpModel {
+ public:
+  using BasePoolingOpModel::BasePoolingOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class QuantizedPoolingOpModel : public BasePoolingOpModel {
+ public:
+  using BasePoolingOpModel::BasePoolingOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+TEST(FloatPoolingOpTest, AveragePool) {
+  FloatPoolingOpModel m(BuiltinOperator_AVERAGE_POOL_2D,
+                        /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
+                        /*filter_width=*/2, /*filter_height=*/2,
+                        /*output=*/{TensorType_FLOAT32, {}});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2.75, 5.75}));
+}
+
+TEST(QuantizedPoolingOpTest, AveragePool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({2.75, 5.75})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({44, 92}));
+}
+
+TEST(FloatPoolingOpTest, MaxPool) {
+  FloatPoolingOpModel m(BuiltinOperator_MAX_POOL_2D,
+                        /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
+                        /*filter_width=*/2, /*filter_height=*/2,
+                        /*output=*/{TensorType_FLOAT32, {}});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 10}));
+}
+
+TEST(QuantizedPoolingOpTest, MaxPool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({6, 10})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({96, 160}));
+}
+
+TEST(FloatPoolingOpTest, L2Pool) {
+  FloatPoolingOpModel m(BuiltinOperator_L2_POOL_2D,
+                        /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
+                        /*filter_width=*/2, /*filter_height=*/2,
+                        /*output=*/{TensorType_FLOAT32, {}});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca7a0dd1949a3a31d26be770a7df781cc5fe7533
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -0,0 +1,109 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_RELU();
+TfLiteRegistration* Register_RELU1();
+TfLiteRegistration* Register_RELU6();
+TfLiteRegistration* Register_TANH();
+TfLiteRegistration* Register_LOGISTIC();
+TfLiteRegistration* Register_AVERAGE_POOL_2D();
+TfLiteRegistration* Register_MAX_POOL_2D();
+TfLiteRegistration* Register_L2_POOL_2D();
+TfLiteRegistration* Register_CONV_2D();
+TfLiteRegistration* Register_DEPTHWISE_CONV_2D();
+TfLiteRegistration* Register_SVDF();
+TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
+TfLiteRegistration* Register_FULLY_CONNECTED();
+TfLiteRegistration* Register_LSH_PROJECTION();
+TfLiteRegistration* Register_HASHTABLE_LOOKUP();
+TfLiteRegistration* Register_SOFTMAX();
+TfLiteRegistration* Register_CONCATENATION();
+TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_MUL();
+TfLiteRegistration* Register_L2_NORMALIZATION();
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORMALIZATION();
+TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_RESHAPE();
+TfLiteRegistration* Register_RESIZE_BILINEAR();
+TfLiteRegistration* Register_SKIP_GRAM();
+TfLiteRegistration* Register_SPACE_TO_DEPTH();
+
+BuiltinOpResolver::BuiltinOpResolver() {
+  AddBuiltin(BuiltinOperator_RELU, Register_RELU());
+  AddBuiltin(BuiltinOperator_RELU1, Register_RELU1());
+  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
+  AddBuiltin(BuiltinOperator_TANH, Register_TANH());
+  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
+  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D());
+  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
+  AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
+  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+  AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D());
+  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
+  AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+             Register_EMBEDDING_LOOKUP_SPARSE());
+  AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED());
+  AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
+  AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
+  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
+  AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
+  AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+  AddBuiltin(BuiltinOperator_MUL, Register_MUL());
+  AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
+  AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+             Register_LOCAL_RESPONSE_NORMALIZATION());
+  AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+  AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
+  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+  AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
+  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
+}
+
+TfLiteRegistration* BuiltinOpResolver::FindOp(
+    tflite::BuiltinOperator op) const {
+  auto it = builtins_.find(op);
+  return it != builtins_.end() ? it->second : nullptr;
+}
+
+TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op) const {
+  auto it = custom_ops_.find(op);
+  return it != custom_ops_.end() ? it->second : nullptr;
+}
+
+void BuiltinOpResolver::AddBuiltin(tflite::BuiltinOperator op,
+                                   TfLiteRegistration* registration) {
+  registration->builtin_code = op;
+  builtins_.insert(std::make_pair(op, registration));
+}
+
+void BuiltinOpResolver::AddCustom(const char* name,
+                                  TfLiteRegistration* registration) {
+  registration->builtin_code = BuiltinOperator_CUSTOM;
+  custom_ops_.insert(std::make_pair(std::string(name), registration));
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/register.h b/tensorflow/contrib/lite/kernels/register.h
new file mode 100644
index 0000000000000000000000000000000000000000..28f5e0fcc80a14cf9fb6fb19b795d0c0d55e0df9
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/register.h
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_REGISTER_H_
+
+#include <unordered_map>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+class BuiltinOpResolver : public OpResolver {
+ public:
+  BuiltinOpResolver();
+  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override;
+  TfLiteRegistration* FindOp(const char* op) const override;
+  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration);
+  void AddCustom(const char* name, TfLiteRegistration* registration);
+
+ private:
+  struct BuiltinOperatorHasher {
+    size_t operator()(const tflite::BuiltinOperator& x) const {
+      return std::hash<size_t>()(static_cast<size_t>(x));
+    }
+  };
+  std::unordered_map<tflite::BuiltinOperator, TfLiteRegistration*,
+                     BuiltinOperatorHasher>
+      builtins_;
+  std::unordered_map<std::string, TfLiteRegistration*> custom_ops_;
+};
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_BUILTIN_KERNELS_H
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3e6ddc9f480e3863cac52157ae28b7329ee2088
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reshape {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
+
+  // TODO(ahentz): we are often given a tensor with the shape but we only pay
+  // attention to what the shape specified in 'params'.
+  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // Tensorflow's Reshape allows one of the shape components to have the
+  // special -1 value, meaning it will be calculated automatically based on the
+  // input. Here we calculate what that dimension should be so that the number
+  // of output elements in the same as the number of input elements.
+  int num_input_elements = 1;
+  for (int i = 0; i < NumDimensions(input); ++i) {
+    num_input_elements *= SizeOfDimension(input, i);
+  }
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
+  int num_output_elements = 1;
+  int strech_dim = -1;
+  for (int i = 0; i < params->num_dimensions; ++i) {
+    int value = params->shape[i];
+    if (value == -1) {
+      TF_LITE_ENSURE_EQ(context, strech_dim, -1);
+      strech_dim = i;
+    } else {
+      num_output_elements *= value;
+      output_size->data[i] = value;
+    }
+  }
+  if (strech_dim != -1) {
+    output_size->data[strech_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_size->data[strech_dim];
+  }
+
+  TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  memcpy(output->data.raw, input->data.raw, input->bytes);
+
+  return kTfLiteOk;
+}
+
+}  // namespace reshape
+
+TfLiteRegistration* Register_RESHAPE() {
+  static TfLiteRegistration r = {nullptr, nullptr, reshape::Prepare,
+                                 reshape::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/reshape_test.cc b/tensorflow/contrib/lite/kernels/reshape_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0fbcf6e6aa311d2cac491336ee54ccf58bbda8fd
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/reshape_test.cc
@@ -0,0 +1,89 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ReshapeOpModel : public SingleOpModel {
+ public:
+  ReshapeOpModel(std::initializer_list<int> input_shape,
+                 std::initializer_list<int> new_shape) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
+        CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
+            .Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(ReshapeOpTest, MismatchedDimensions) {
+  EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {2, 1}),
+               "num_input_elements != num_output_elements");
+}
+
+TEST(ReshapeOpTest, TooManyDimensions) {
+  EXPECT_DEATH(
+      ReshapeOpModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 2, 3, 4, 5, 6, 7, 8, 9}),
+      "Found too many dimensions");
+}
+
+TEST(ReshapeOpTest, TooManySpecialDimensions) {
+  EXPECT_DEATH(ReshapeOpModel({1, 2, 4, 1}, {-1, -1, 2, 4}),
+               "strech_dim != -1");
+}
+
+TEST(ReshapeOpTest, SimpleTest) {
+  ReshapeOpModel m({1, 2, 4, 1}, {2, 2, 2});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2}));
+}
+
+TEST(ReshapeOpTest, WithStretchDimension) {
+  ReshapeOpModel m({1, 2, 4, 1}, {2, 1, -1});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 4}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1613c9a89faa3579b913408cc09cdad7f942cb99
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace resize_bilinear {
+
+// This file has three implementation of RESIZE_BILINEAR.
+enum KernelType {
+  kReference,
+  kGenericOptimized,  // Neon-free
+  kNeonOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): Our current implementations rely on the inputs being 4D.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+  // TODO(ahentz): Our current implementations only support float32.
+  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  output_size->data[1] = params->new_height;
+  output_size->data[2] = params->new_width;
+  output_size->data[3] = input->dims->data[3];
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteResizeBilinearParams*>(node->builtin_data);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // We have to fake a tensor here, to satisfy ResizeBilinear().
+  int32 output_size_data[2] = {params->new_height, params->new_width};
+
+  if (output->type == kTfLiteFloat32) {
+#define TF_LITE_RESIZE_BILINEAR(type)                                     \
+  type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input), \
+                       output_size_data, GetTensorDims({1, 1, 1, 2}),     \
+                       GetTensorData<float>(output), GetTensorDims(output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_RESIZE_BILINEAR(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
+      TF_LITE_RESIZE_BILINEAR(optimized_ops);
+    }
+#undef TF_LITE_RESIZE_BILINEAR
+  } else {
+    context->ReportError(context, "Inputs and outputs not all float types.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace resize_bilinear
+
+TfLiteRegistration* Register_RESIZE_BILINEAR_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, resize_bilinear::Prepare,
+      resize_bilinear::Eval<resize_bilinear::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_BILINEAR_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, resize_bilinear::Prepare,
+      resize_bilinear::Eval<resize_bilinear::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_BILINEAR_NEON_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, resize_bilinear::Prepare,
+      resize_bilinear::Eval<resize_bilinear::kNeonOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_RESIZE_BILINEAR() {
+#ifdef USE_NEON
+  return Register_RESIZE_BILINEAR_NEON_OPT();
+#else
+  return Register_RESIZE_BILINEAR_GENERIC_OPT();
+#endif
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..314a71e210d9b5ea75bb137ef228273ef48f28b5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ResizeBilinearOpModel : public SingleOpModel {
+ public:
+  ResizeBilinearOpModel(std::initializer_list<int> input_shape, int new_height,
+                        int new_width) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_RESIZE_BILINEAR, BuiltinOptions_ResizeBilinearOptions,
+        CreateResizeBilinearOptions(builder_, new_height, new_width).Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(ResizeBilinearOpTest, HorizontalResize) {
+  ResizeBilinearOpModel m({1, 1, 2, 1}, 1, 3);
+  m.SetInput({3, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
+TEST(ResizeBilinearOpTest, VerticalResize) {
+  ResizeBilinearOpModel m({1, 2, 1, 1}, 3, 1);
+  m.SetInput({3, 9});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
+  ResizeBilinearOpModel m({1, 2, 2, 1}, 3, 3);
+  m.SetInput({
+      3, 6,  //
+      9, 12  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 3, 5, 6,    //
+                                 7, 9, 10,   //
+                                 9, 11, 12,  //
+                             })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
+  ResizeBilinearOpModel m({2, 2, 2, 1}, 3, 3);
+  m.SetInput({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 3, 5, 6,     //
+                                 7, 9, 10,    //
+                                 9, 11, 12,   //
+                                 4, 8, 10,    //
+                                 8, 12, 14,   //
+                                 10, 14, 16,  //
+                             })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
+  ResizeBilinearOpModel m({1, 2, 2, 2}, 3, 3);
+  m.SetInput({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 3, 4, 5, 8, 6, 10,      //
+                                 7, 8, 9, 12, 10, 14,    //
+                                 9, 10, 11, 14, 12, 16,  //
+                             })));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/skip_gram.cc b/tensorflow/contrib/lite/kernels/skip_gram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c90a15b3a2e79028128260e579f41742a46289f6
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/skip_gram.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Generate a list of skip grams from an input.
+//
+// Options:
+//   ngram_size: num of words for each output item.
+//   max_skip_size: max num of words to skip.
+//                  The op generates ngrams when it is 0.
+//   include_all_ngrams: include all ngrams with size up to ngram_size.
+//
+// Input:
+//   A string tensor to generate n-grams.
+//   Dim = {1}
+//
+// Output:
+//   A list of strings, each of which contains ngram_size words.
+//   Dim = {num_ngram}
+
+#include <ctype.h>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+namespace {
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TF_LITE_ENSURE_EQ(context, GetInput(context, node, 0)->type, kTfLiteString);
+  TF_LITE_ENSURE_EQ(context, GetOutput(context, node, 0)->type, kTfLiteString);
+  return kTfLiteOk;
+}
+
+bool ShouldIncludeCurrentNgram(const TfLiteSkipGramParams* params, int size) {
+  if (size <= 0) {
+    return false;
+  }
+  if (params->include_all_ngrams) {
+    return size <= params->ngram_size;
+  } else {
+    return size == params->ngram_size;
+  }
+}
+
+bool ShouldStepInRecursion(const TfLiteSkipGramParams* params,
+                           const std::vector<int>& stack, int stack_idx,
+                           int num_words) {
+  // If current stack size and next word enumeration are within valid range.
+  if (stack_idx < params->ngram_size && stack[stack_idx] + 1 < num_words) {
+    // If this stack is empty, step in for first word enumeration.
+    if (stack_idx == 0) {
+      return true;
+    }
+    // If next word enumeration are within the range of max_skip_size.
+    // NOTE: equivalent to
+    //   next_word_idx = stack[stack_idx] + 1
+    //   next_word_idx - stack[stack_idx-1] <= max_skip_size + 1
+    if (stack[stack_idx] - stack[stack_idx - 1] <= params->max_skip_size) {
+      return true;
+    }
+  }
+  return false;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSkipGramParams*>(node->builtin_data);
+
+  // Split sentence to words.
+  std::vector<StringRef> words;
+  tflite::StringRef strref = tflite::GetString(GetInput(context, node, 0), 0);
+  int prev_idx = 0;
+  for (int i = 1; i < strref.len; i++) {
+    if (isspace(*(strref.str + i))) {
+      if (i > prev_idx && !isspace(*(strref.str + prev_idx))) {
+        words.push_back({strref.str + prev_idx, i - prev_idx});
+      }
+      prev_idx = i + 1;
+    }
+  }
+  if (strref.len > prev_idx) {
+    words.push_back({strref.str + prev_idx, strref.len - prev_idx});
+  }
+
+  // Generate n-grams recursively.
+  tflite::DynamicBuffer buf;
+  if (words.size() < params->ngram_size) {
+    buf.WriteToTensor(GetOutput(context, node, 0));
+    return kTfLiteOk;
+  }
+
+  // Stack stores the index of word used to generate ngram.
+  // The size of stack is the size of ngram.
+  std::vector<int> stack(params->ngram_size, 0);
+  // Stack index that indicates which depth the recursion is operating at.
+  int stack_idx = 1;
+  int num_words = words.size();
+
+  while (stack_idx >= 0) {
+    if (ShouldStepInRecursion(params, stack, stack_idx, num_words)) {
+      // When current depth can fill with a new word
+      // and the new word is within the max range to skip,
+      // fill this word to stack, recurse into next depth.
+      stack[stack_idx]++;
+      stack_idx++;
+      if (stack_idx < params->ngram_size) {
+        stack[stack_idx] = stack[stack_idx - 1];
+      }
+    } else {
+      if (ShouldIncludeCurrentNgram(params, stack_idx)) {
+        // Add n-gram to tensor buffer when the stack has filled with enough
+        // words to generate the ngram.
+        std::vector<StringRef> gram(stack_idx);
+        for (int i = 0; i < stack_idx; i++) {
+          gram[i] = words[stack[i]];
+        }
+        buf.AddJoinedString(gram, ' ');
+      }
+      // When current depth cannot fill with a valid new word,
+      // and not in last depth to generate ngram,
+      // step back to previous depth to iterate to next possible word.
+      stack_idx--;
+    }
+  }
+
+  buf.WriteToTensor(GetOutput(context, node, 0));
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteRegistration* Register_SKIP_GRAM() {
+  static TfLiteRegistration r = {nullptr, nullptr, Prepare, Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/skip_gram_test.cc b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..185b64cb44969b57588ea5d0b40f55b6ddf8e11f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/skip_gram_test.cc
@@ -0,0 +1,257 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+static char kSentence[] = "The quick\t brown fox\n jumps over\n the lazy dog!";
+
+class SkipGramOp : public SingleOpModel {
+ public:
+  SkipGramOp(int ngram_size, int max_skip_size, bool include_all_ngrams) {
+    input_ = AddInput(TensorType_STRING);
+    output_ = AddOutput(TensorType_STRING);
+
+    SetBuiltinOp(BuiltinOperator_SKIP_GRAM, BuiltinOptions_SkipGramOptions,
+                 CreateSkipGramOptions(builder_, ngram_size, max_skip_size,
+                                       include_all_ngrams)
+                     .Union());
+    BuildInterpreter({{1}});
+  }
+  void SetInput(const string& content) {
+    PopulateStringTensor(input_, {content});
+  }
+
+  std::vector<string> GetOutput() {
+    std::vector<string> ans;
+    TfLiteTensor* tensor = interpreter_->tensor(output_);
+
+    int num = GetStringCount(tensor);
+    for (int i = 0; i < num; i++) {
+      StringRef strref = GetString(tensor, i);
+      ans.push_back(string(strref.str, strref.len));
+    }
+    return ans;
+  }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(SkipGramTest, TestUnigram) {
+  SkipGramOp m(1, 0, false);
+
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), testing::UnorderedElementsAreArray(
+                                 {"The", "quick", "brown", "fox", "jumps",
+                                  "over", "the", "lazy", "dog!"}));
+}
+
+TEST(SkipGramTest, TestBigram) {
+  SkipGramOp m(2, 0, false);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              testing::UnorderedElementsAreArray(
+                  {"The quick", "quick brown", "brown fox", "fox jumps",
+                   "jumps over", "over the", "the lazy", "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestAllBigram) {
+  SkipGramOp m(2, 0, true);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              testing::UnorderedElementsAreArray(
+                  {// Unigram
+                   "The", "quick", "brown", "fox", "jumps", "over", "the",
+                   "lazy", "dog!",
+                   //  Bigram
+                   "The quick", "quick brown", "brown fox", "fox jumps",
+                   "jumps over", "over the", "the lazy", "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestAllTrigram) {
+  SkipGramOp m(3, 0, true);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              testing::UnorderedElementsAreArray(
+                  {// Unigram
+                   "The", "quick", "brown", "fox", "jumps", "over", "the",
+                   "lazy", "dog!",
+                   // Bigram
+                   "The quick", "quick brown", "brown fox", "fox jumps",
+                   "jumps over", "over the", "the lazy", "lazy dog!",
+                   // Trigram
+                   "The quick brown", "quick brown fox", "brown fox jumps",
+                   "fox jumps over", "jumps over the", "over the lazy",
+                   "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip1Bigram) {
+  SkipGramOp m(2, 1, false);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput(),
+      testing::UnorderedElementsAreArray(
+          {"The quick", "The brown", "quick brown", "quick fox", "brown fox",
+           "brown jumps", "fox jumps", "fox over", "jumps over", "jumps the",
+           "over the", "over lazy", "the lazy", "the dog!", "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip2Bigram) {
+  SkipGramOp m(2, 2, false);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              testing::UnorderedElementsAreArray(
+                  {"The quick",  "The brown",   "The fox",    "quick brown",
+                   "quick fox",  "quick jumps", "brown fox",  "brown jumps",
+                   "brown over", "fox jumps",   "fox over",   "fox the",
+                   "jumps over", "jumps the",   "jumps lazy", "over the",
+                   "over lazy",  "over dog!",   "the lazy",   "the dog!",
+                   "lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip1Trigram) {
+  SkipGramOp m(3, 1, false);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              testing::UnorderedElementsAreArray(
+                  {"The quick brown", "The quick fox",    "The brown fox",
+                   "The brown jumps", "quick brown fox",  "quick brown jumps",
+                   "quick fox jumps", "quick fox over",   "brown fox jumps",
+                   "brown fox over",  "brown jumps over", "brown jumps the",
+                   "fox jumps over",  "fox jumps the",    "fox over the",
+                   "fox over lazy",   "jumps over the",   "jumps over lazy",
+                   "jumps the lazy",  "jumps the dog!",   "over the lazy",
+                   "over the dog!",   "over lazy dog!",   "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSkip2Trigram) {
+  SkipGramOp m(3, 2, false);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              testing::UnorderedElementsAreArray(
+                  {"The quick brown",  "The quick fox",     "The quick jumps",
+                   "The brown fox",    "The brown jumps",   "The brown over",
+                   "The fox jumps",    "The fox over",      "The fox the",
+                   "quick brown fox",  "quick brown jumps", "quick brown over",
+                   "quick fox jumps",  "quick fox over",    "quick fox the",
+                   "quick jumps over", "quick jumps the",   "quick jumps lazy",
+                   "brown fox jumps",  "brown fox over",    "brown fox the",
+                   "brown jumps over", "brown jumps the",   "brown jumps lazy",
+                   "brown over the",   "brown over lazy",   "brown over dog!",
+                   "fox jumps over",   "fox jumps the",     "fox jumps lazy",
+                   "fox over the",     "fox over lazy",     "fox over dog!",
+                   "fox the lazy",     "fox the dog!",      "jumps over the",
+                   "jumps over lazy",  "jumps over dog!",   "jumps the lazy",
+                   "jumps the dog!",   "jumps lazy dog!",   "over the lazy",
+                   "over the dog!",    "over lazy dog!",    "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestAllSkip2Trigram) {
+  SkipGramOp m(3, 2, true);
+  m.SetInput(kSentence);
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput(),
+      testing::UnorderedElementsAreArray(
+          {// Unigram
+           "The", "quick", "brown", "fox", "jumps", "over", "the", "lazy",
+           "dog!",
+           // Bigram
+           "The quick", "The brown", "The fox", "quick brown", "quick fox",
+           "quick jumps", "brown fox", "brown jumps", "brown over", "fox jumps",
+           "fox over", "fox the", "jumps over", "jumps the", "jumps lazy",
+           "over the", "over lazy", "over dog!", "the lazy", "the dog!",
+           "lazy dog!",
+           // Trigram
+           "The quick brown", "The quick fox", "The quick jumps",
+           "The brown fox", "The brown jumps", "The brown over",
+           "The fox jumps", "The fox over", "The fox the", "quick brown fox",
+           "quick brown jumps", "quick brown over", "quick fox jumps",
+           "quick fox over", "quick fox the", "quick jumps over",
+           "quick jumps the", "quick jumps lazy", "brown fox jumps",
+           "brown fox over", "brown fox the", "brown jumps over",
+           "brown jumps the", "brown jumps lazy", "brown over the",
+           "brown over lazy", "brown over dog!", "fox jumps over",
+           "fox jumps the", "fox jumps lazy", "fox over the", "fox over lazy",
+           "fox over dog!", "fox the lazy", "fox the dog!", "jumps over the",
+           "jumps over lazy", "jumps over dog!", "jumps the lazy",
+           "jumps the dog!", "jumps lazy dog!", "over the lazy",
+           "over the dog!", "over lazy dog!", "the lazy dog!"}));
+}
+
+TEST(SkipGramTest, TestSingleWord) {
+  SkipGramOp m(1, 1, false);
+  m.SetInput("Hi");
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAre("Hi"));
+}
+
+TEST(SkipGramTest, TestWordsLessThanGram) {
+  SkipGramOp m(3, 1, false);
+  m.SetInput("Hi hi");
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), std::vector<string>());
+}
+
+TEST(SkipGramTest, TestEmptyInput) {
+  SkipGramOp m(1, 1, false);
+  m.SetInput("");
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAre());
+}
+
+TEST(SkipGramTest, TestWhitespaceInput) {
+  SkipGramOp m(1, 1, false);
+  m.SetInput("    ");
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAre());
+}
+
+TEST(SkipGramTest, TestInputWithExtraSpace) {
+  SkipGramOp m(1, 1, false);
+  m.SetInput("   Hello   world    !  ");
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAre("Hello", "world", "!"));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/softmax_test.cc b/tensorflow/contrib/lite/kernels/softmax_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c5338ff0fd26337c9adc8e0b94a0a88edfde37f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/softmax_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite SOFTMAX op.
+
+#include <iomanip>
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+class SoftmaxOpModel : public SingleOpModel {
+ public:
+  SoftmaxOpModel(int batches, int size, float beta)
+      : batches_(batches), input_size_(size), beta_(beta) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_SOFTMAX, BuiltinOptions_SoftmaxOptions,
+                 CreateSoftmaxOptions(builder_, beta_).Union());
+    BuildInterpreter({{batches_, input_size_}});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+
+  int batches_;
+  int input_size_;
+  float beta_;
+};
+
+TEST(SoftmaxOpTest, SimpleTest) {
+  SoftmaxOpModel m(/*batches=*/2, /*size=*/5, /*beta=*/1.0);
+  m.SetInput({
+      1.0, 2.0, 3.0, 4.0, 5.0,       // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 0
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {0.011656231, 0.031684921, 0.086128544, 0.234121657, 0.636408647,
+           0.636408647, 0.234121657, 0.086128544, 0.031684921, 0.011656231},
+          1e-6)));
+}
+
+TEST(SoftmaxOpTest, CompareWithTFminiBetaEq1) {
+  const int batch_size = 2;
+  const int input_size = 5;
+  const float beta = 1.0;
+  static float input_buffer[] = {
+      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
+  };
+
+  SoftmaxOpModel m(batch_size, input_size, beta);
+
+  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+  m.Invoke();
+
+  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+  static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size},
+                                       {1, 0, 0, input_size}};
+  tflite::reference_ops::Softmax(input_buffer, input_dims, beta,
+                                 output_buffer.get(), input_dims);
+
+  std::vector<float> expected;
+  expected.insert(expected.end(), output_buffer.get(),
+                  output_buffer.get() + input_size * batch_size);
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+TEST(SoftmaxOpTest, CompareWithTFminiBetaNotEq1) {
+  const int batch_size = 2;
+  const int input_size = 5;
+  const float beta = 0.5;
+  static float input_buffer[] = {
+      1.0,  2.0,  3.0,  4.0,  5.0,   // b = 0
+      -1.0, -2.0, -3.0, -4.0, -5.0,  // b = 1
+  };
+
+  SoftmaxOpModel m(batch_size, input_size, beta);
+
+  m.SetInput(0, input_buffer, input_buffer + input_size * batch_size);
+
+  m.Invoke();
+
+  std::unique_ptr<float[]> output_buffer(new float[input_size * batch_size]);
+  static tflite::Dims<4> input_dims = {{input_size, 1, 1, batch_size},
+                                       {1, 0, 0, input_size}};
+  tflite::reference_ops::Softmax(input_buffer, input_dims, beta,
+                                 output_buffer.get(), input_dims);
+
+  std::vector<float> expected;
+  expected.insert(expected.end(), output_buffer.get(),
+                  output_buffer.get() + input_size * batch_size);
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(expected, 1e-6)));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth.cc b/tensorflow/contrib/lite/kernels/space_to_depth.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb2e509c9811b1469c4d3f676532edff570a6c4a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/space_to_depth.cc
@@ -0,0 +1,146 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace space_to_depth {
+
+// This file has two implementation of SpaceToDepth. Note that SpaceToDepth
+// only works on 4D tensors.
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteSpaceToDepthParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
+
+  auto data_type = output->type;
+  TF_LITE_ENSURE(context,
+                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 ||
+                     data_type == kTfLiteInt32 || data_type == kTfLiteInt64);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  const int block_size = params->block_size;
+  const int input_height = input->dims->data[1];
+  const int input_width = input->dims->data[2];
+  int output_height = input_height / block_size;
+  int output_width = input_width / block_size;
+
+  TF_LITE_ENSURE_EQ(context, input_height, output_height * block_size);
+  TF_LITE_ENSURE_EQ(context, input_width, output_width * block_size);
+
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
+  output_size->data[0] = input->dims->data[0];
+  output_size->data[1] = output_height;
+  output_size->data[2] = output_width;
+  output_size->data[3] = input->dims->data[3] * block_size * block_size;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <KernelType kernel_type>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params =
+      reinterpret_cast<TfLiteSpaceToDepthParams*>(node->builtin_data);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+#define TF_LITE_SPACE_TO_DEPTH(type, scalar)                                  \
+  type::SpaceToDepth<scalar>(                                                 \
+      GetTensorData<scalar>(input), GetTensorDims(input), params->block_size, \
+      GetTensorData<scalar>(output), GetTensorDims(output))
+  switch (input->type) {  // Already know in/out types are same.
+    case kTfLiteFloat32:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, float);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, float);
+      }
+      break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, uint8_t);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, uint8_t);
+      }
+      break;
+    case kTfLiteInt32:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, int32_t);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, int32_t);
+      }
+      break;
+    case kTfLiteInt64:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, int64_t);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, int64_t);
+      }
+      break;
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+#undef TF_LITE_SPACE_TO_DEPTH
+
+  return kTfLiteOk;
+}
+
+}  // namespace space_to_depth
+
+TfLiteRegistration* Register_SPACE_TO_DEPTH_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, space_to_depth::Prepare,
+      space_to_depth::Eval<space_to_depth::kReference>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SPACE_TO_DEPTH_GENERIC_OPT() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, space_to_depth::Prepare,
+      space_to_depth::Eval<space_to_depth::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_SPACE_TO_DEPTH() {
+  return Register_SPACE_TO_DEPTH_GENERIC_OPT();
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/space_to_depth_test.cc b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..997f354861a235fb511235e4d64544dc8c3ddb34
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/space_to_depth_test.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class SpaceToDepthOpModel : public SingleOpModel {
+ public:
+  SpaceToDepthOpModel(const TensorData& tensor_data, int block_size) {
+    input_ = AddInput(tensor_data);
+    output_ = AddOutput(tensor_data);
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_DEPTH,
+                 BuiltinOptions_SpaceToDepthOptions,
+                 CreateSpaceToDepthOptions(builder_, block_size).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(SpaceToDepthOpModel, BadBlockSize) {
+  EXPECT_DEATH(SpaceToDepthOpModel({TensorType_FLOAT32, {1, 2, 2, 1}}, 3),
+               "Cannot allocate tensors");
+}
+
+TEST(SpaceToDepthOpModel, Float32) {
+  SpaceToDepthOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, 2);
+  m.SetInput<float>({1.4, 2.3, 3.2, 4.1, 5.4, 6.3, 7.2, 8.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({1.4, 2.3, 3.2, 4.1, 5.4, 6.3, 7.2, 8.1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 8));
+}
+
+TEST(SpaceToDepthOpModel, Uint8) {
+  SpaceToDepthOpModel m({TensorType_UINT8, {1, 2, 2, 1}}, 2);
+  m.SetInput<uint8_t>({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({1, 2, 3, 4}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(SpaceToDepthOpModel, Int32) {
+  SpaceToDepthOpModel m({TensorType_INT32, {1, 2, 2, 3}}, 2);
+  m.SetInput<int32_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int32_t>(),
+              ElementsAreArray({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 12));
+}
+
+TEST(SpaceToDepthOpModel, Int64) {
+  SpaceToDepthOpModel m({TensorType_INT64, {1, 4, 4, 1}}, 2);
+  m.SetInput<int64_t>({1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray(
+                  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 2, 2, 4));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72f705fe4242b01c1516c99d3500484e8729fd9a
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -0,0 +1,222 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/activation_functor.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace svdf {
+
+constexpr int kInputTensor = 0;
+constexpr int kWeightsFeatureTensor = 1;
+constexpr int kWeightsTimeTensor = 2;
+constexpr int kBiasTensor = 3;
+constexpr int kStateTensor = 0;
+constexpr int KOutputTensor = 1;
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* scratch_tensor_index = new int;
+  context->AddTensors(context, 1, scratch_tensor_index);
+  return scratch_tensor_index;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<int*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
+  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+
+  // Check we have all the inputs and outputs we need.
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* weights_feature =
+      &context->tensors[node->inputs->data[kWeightsFeatureTensor]];
+  TfLiteTensor* weights_time =
+      &context->tensors[node->inputs->data[kWeightsTimeTensor]];
+
+  // Check all the parameters of tensor match within themselves and match the
+  // input configuration.
+  const int rank = params->rank;
+  const int batch_size = input->dims->data[0];
+  const int num_filters = weights_feature->dims->data[0];
+  TF_LITE_ASSERT_EQ(num_filters % rank, 0);
+  const int num_units = num_filters / rank;
+  const int memory_size = weights_time->dims->data[1];
+  TF_LITE_ASSERT_EQ(input->dims->data[1], weights_feature->dims->data[1]);
+  TF_LITE_ASSERT_EQ(weights_time->dims->data[0], num_filters);
+
+  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+  if (bias) {
+    TF_LITE_ASSERT_EQ(bias->dims->data[0], num_units);
+  }
+
+  TfLiteTensor* state = &context->tensors[node->outputs->data[kStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[KOutputTensor]];
+
+  // Resize state.
+  // For each batch, the state is a 2-D tensor: memory_size * num_filters
+  // The left most column is used to save current cycle activation.
+  // The right most column is used to save temporary output which will be
+  // reduced to num_units outputs.
+  TfLiteIntArray* state_size_array = TfLiteIntArrayCreate(2);
+  state_size_array->data[0] = batch_size;
+  state_size_array->data[1] = memory_size * num_filters;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, state, state_size_array));
+
+  // Mark state as a persistent tensor.
+  state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // Resize output.
+  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(2);
+  output_size_array->data[0] = batch_size;
+  output_size_array->data[1] = num_units;
+  TF_LITE_ENSURE_OK(context,
+                    context->ResizeTensor(context, output, output_size_array));
+
+  // Resize scratch.
+  TfLiteIntArrayFree(node->temporaries);
+  node->temporaries = TfLiteIntArrayCreate(1);
+  node->temporaries->data[0] = *scratch_tensor_index;
+
+  TfLiteIntArray* scratch_size_array = TfLiteIntArrayCreate(2);
+  scratch_size_array->data[0] = batch_size;
+  scratch_size_array->data[1] = num_filters;
+
+  TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
+  scratch_tensor->type = input->type;
+  scratch_tensor->allocation_type = kTfLiteArenaRw;
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor,
+                                                   scratch_size_array));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteSVDFParams*>(node->builtin_data);
+
+  TfLiteTensor* input = &context->tensors[node->inputs->data[kInputTensor]];
+  TfLiteTensor* weights_feature =
+      &context->tensors[node->inputs->data[kWeightsFeatureTensor]];
+  TfLiteTensor* weights_time =
+      &context->tensors[node->inputs->data[kWeightsTimeTensor]];
+
+  TfLiteTensor* state = &context->tensors[node->outputs->data[kStateTensor]];
+  TfLiteTensor* output = &context->tensors[node->outputs->data[KOutputTensor]];
+  TfLiteTensor* scratch = &context->tensors[node->temporaries->data[0]];
+
+  TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor);
+
+  const int rank = params->rank;
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  const int num_filters = weights_feature->dims->data[0];
+  const int num_units = num_filters / rank;
+  const int memory_size = weights_time->dims->data[1];
+
+  // Clear the activation (state left most column).
+  // TODO(ghodrat): Add a test which initialize state with invalid values in
+  // left most column and make sure it passes.
+  for (int b = 0; b < batch_size; b++) {
+    float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+    for (int c = 0; c < num_filters; c++) {
+      float* state_ptr = state_ptr_batch + c * memory_size;
+      state_ptr[memory_size - 1] = 0.0;
+    }
+  }
+
+  // Compute conv1d(inputs, weights_feature).
+  // The state left most column is used to save current cycle activation. This
+  // is achieved by starting at state->data.f[memory_size - 1] and having the
+  // stride equal to memory_size.
+  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      weights_feature->data.f, num_filters, input_size, input->data.f,
+      batch_size, &state->data.f[memory_size - 1], memory_size);
+
+  // Compute matmul(state, weights_time).
+  // The right most column is used to save temporary output (with the size of
+  // num_filters). This is achieved by starting at state->data.f and having the
+  // stride equal to memory_size.
+  for (int b = 0; b < batch_size; b++) {
+    float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+    float* scratch_ptr_batch = scratch->data.f + b * num_filters;
+    tensor_utils::BatchVectorBatchVectorDotProduct(
+        weights_time->data.f, state_ptr_batch, memory_size, num_filters,
+        scratch_ptr_batch, /*result_stride=*/1);
+  }
+
+  // Initialize output with bias if provided.
+  if (bias) {
+    tensor_utils::VectorBatchVectorAssign(bias->data.f, num_units, batch_size,
+                                          output->data.f);
+  } else {
+    tensor_utils::ZeroVector(output->data.f, batch_size * num_units);
+  }
+
+  // Reduction sum
+  for (int b = 0; b < batch_size; b++) {
+    float* output_ptr_batch = output->data.f + b * num_units;
+    float* scratch_ptr_batch = scratch->data.f + b * num_filters;
+    tensor_utils::ReductionSumVector(scratch_ptr_batch, output_ptr_batch,
+                                     num_units, rank);
+  }
+
+  // Apply activation.
+  for (int b = 0; b < batch_size; b++) {
+    float* output_ptr_batch = output->data.f + b * num_units;
+    tensor_utils::ApplyActivationToVector(output_ptr_batch, num_units,
+                                          params->activation, output_ptr_batch);
+  }
+
+  // Right shift the state.
+  for (int b = 0; b < batch_size; b++) {
+    float* state_ptr_batch = state->data.f + b * memory_size * num_filters;
+    for (int f = 0; f < num_filters; f++) {
+      tensor_utils::VectorShiftLeft(state_ptr_batch, memory_size,
+                                    /*shift_value=*/0.0);
+      state_ptr_batch += memory_size;
+    }
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace svdf
+
+TfLiteRegistration* Register_SVDF() {
+  static TfLiteRegistration r = {svdf::Init, svdf::Free, svdf::Prepare,
+                                 svdf::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/svdf_test.cc b/tensorflow/contrib/lite/kernels/svdf_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4de2ceaf053df31a4bc857fb250db416c071e80f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/svdf_test.cc
@@ -0,0 +1,312 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for TFLite SVDF op.
+
+#include <vector>
+#include <iomanip>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+static float svdf_input[] = {
+    0.12609188,  -0.46347019, -0.89598465,
+    0.35867718,  0.36897406,  0.73463392,
+
+    0.14278367,  -1.64410412, -0.75222826,
+    -0.57290924, 0.12729003,  0.7567004,
+
+    0.49837467,  0.19278903,  0.26584083,
+    0.17660543,  0.52949083,  -0.77931279,
+
+    -0.11186574, 0.13164264,  -0.05349274,
+    -0.72674477, -0.5683046,  0.55900657,
+
+    -0.68892461, 0.37783599,  0.18263303,
+    -0.63690937, 0.44483393,  -0.71817774,
+
+    -0.81299269, -0.86831826, 1.43940818,
+    -0.95760226, 1.82078898,  0.71135032,
+
+    -1.45006323, -0.82251364, -1.69082689,
+    -1.65087092, -1.89238167, 1.54172635,
+
+    0.03966608,  -0.24936394, -0.77526885,
+    2.06740379,  -1.51439476, 1.43768692,
+
+    0.11771342,  -0.23761693, -0.65898693,
+    0.31088525,  -1.55601168, -0.87661445,
+
+    -0.89477462, 1.67204106,  -0.53235275,
+    -0.6230064,  0.29819036,  1.06939757,
+};
+
+static float svdf_golden_output_rank_1[] = {
+    0.014899,    -0.0517661,  -0.143725,   -0.00271883,
+    -0.03004015, 0.09565311,  0.1587342,   0.00784263,
+
+    0.068281,    -0.162217,   -0.152268,   0.00323521,
+    0.01582633,  0.03858774,  -0.03001583, -0.02671271,
+
+    -0.0317821,  -0.0333089,  0.0609602,   0.0333759,
+    -0.01432795, 0.05524484,  0.1101355,   -0.02382665,
+
+    -0.00623099, -0.077701,   -0.391193,   -0.0136691,
+    -0.02333033, 0.02293761,  0.12338032,  0.04326871,
+
+    0.201551,    -0.164607,   -0.179462,   -0.0592739,
+    0.01064911,  -0.17503069, 0.07821996,  -0.00224009,
+
+    0.0886511,   -0.0875401,  -0.269283,   0.0281379,
+    -0.02282338, 0.09741908,  0.32973239,  0.12281385,
+
+    -0.201174,   -0.586145,   -0.628624,   -0.0330412,
+    0.24780814,  -0.39304617, -0.22473189, 0.02589256,
+
+    -0.0839096,  -0.299329,   0.108746,    0.109808,
+    0.10084175,  -0.06416984, 0.28936723,  0.0026358,
+
+    0.419114,    -0.237824,   -0.422627,   0.175115,
+    -0.2314795,  -0.18584411, -0.4228974,  -0.12928449,
+
+    0.36726,     -0.522303,   -0.456502,   -0.175475,
+    0.17012937,  -0.34447709, 0.38505614,  -0.28158101,
+};
+
+static float svdf_golden_output_rank_2[] = {
+    -0.09623547, -0.10193135, 0.11083051,  -0.0347917,
+    0.1141196,   0.12965347,  -0.12652366, 0.01007236,
+
+    -0.16396809, -0.21247184, 0.11259045,  -0.04156673,
+    0.10132131,  -0.06143532, -0.00924693, 0.10084561,
+
+    0.01257364,  0.0506071,   -0.19287863, -0.07162561,
+    -0.02033747, 0.22673416,  0.15487903,  0.02525555,
+
+    -0.1411963,  -0.37054959, 0.01774767,  0.05867489,
+    0.09607603,  -0.0141301,  -0.08995658, 0.12867066,
+
+    -0.27142537, -0.16955489, 0.18521598,  -0.12528358,
+    0.00331409,  0.11167502,  0.02218599,  -0.07309391,
+
+    0.09593632,  -0.28361851, -0.0773851,  0.17199151,
+    -0.00075242, 0.33691186,  -0.1536046,  0.16572715,
+
+    -0.27916506, -0.27626723, 0.42615682,  0.3225764,
+    -0.37472126, -0.55655634, -0.05013514, 0.289112,
+
+    -0.24418658, 0.07540751,  -0.1940318,  -0.08911639,
+    0.00732617,  0.46737891,  0.26449674,  0.24888524,
+
+    -0.17225097, -0.54660404, -0.38795233, 0.08389944,
+    0.07736043,  -0.28260678, 0.15666828,  1.14949894,
+
+    -0.57454878, -0.64704704, 0.73235172,  -0.34616736,
+    0.21120001,  -0.22927976, 0.02455296,  -0.35906726,
+};
+
+// Derived class of SingleOpModel, which is used to test SVDF TFLite op.
+class SVDFOpModel : public SingleOpModel {
+ public:
+  SVDFOpModel(int batches, int units, int input_size, int memory_size, int rank)
+      : batches_(batches),
+        units_(units),
+        input_size_(input_size),
+        memory_size_(memory_size),
+        rank_(rank) {
+    input_ = AddInput(TensorType_FLOAT32);
+    weights_feature_ = AddInput(TensorType_FLOAT32);
+    weights_time_ = AddInput(TensorType_FLOAT32);
+    bias_ = AddNullInput();
+    state_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(
+        BuiltinOperator_SVDF, BuiltinOptions_SVDFOptions,
+        CreateSVDFOptions(builder_, rank, ActivationFunctionType_NONE).Union());
+    BuildInterpreter({
+        {batches_, input_size_},        // Input tensor
+        {units_ * rank, input_size_},   // weights_feature tensor
+        {units_ * rank, memory_size_},  // weights_time tensor
+        {units_}                        // bias tensor
+    });
+  }
+
+  // Populates the weights_feature tensor.
+  void SetWeightsFeature(std::initializer_list<float> f) {
+    PopulateTensor(weights_feature_, f);
+  }
+
+  // Populates the weights_time tensor.
+  void SetWeightsTime(std::initializer_list<float> f) {
+    PopulateTensor(weights_time_, f);
+  }
+
+  // Populates the input tensor.
+  void SetInput(int offset, float* begin, float* end) {
+    PopulateTensor(input_, offset, begin, end);
+  }
+
+  // Resets the state of SVDF op by filling it with 0's.
+  void ResetState() {
+    const int zero_buffer_size = rank_ * units_ * batches_ * memory_size_;
+    std::unique_ptr<float[]> zero_buffer(new float[zero_buffer_size]);
+    memset(zero_buffer.get(), 0, zero_buffer_size * sizeof(float));
+    PopulateTensor(state_, 0, zero_buffer.get(),
+                   zero_buffer.get() + zero_buffer_size);
+  }
+
+  // Extracts the output tensor from the SVDF op.
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+  int input_size() { return input_size_; }
+  int num_units() { return units_; }
+  int num_batches() { return batches_; }
+
+ private:
+  int input_;
+  int weights_feature_;
+  int weights_time_;
+  int bias_;
+  int state_;
+  int output_;
+
+  int batches_;
+  int units_;
+  int input_size_;
+  int memory_size_;
+  int rank_;
+};
+
+TEST(SVDFOpTest, BlackBoxTestRank1) {
+  SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                   /*memory_size=*/10, /*rank=*/1);
+  svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
+                          0.22197971, 0.12416199, 0.27901134, 0.27557442,
+                          0.3905206, -0.36137494, -0.06634006, -0.10640851});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
+
+  svdf.ResetState();
+  const int svdf_num_batches = svdf.num_batches();
+  const int svdf_input_size = svdf.input_size();
+  const int svdf_num_units = svdf.num_units();
+  const int input_sequence_size =
+      sizeof(svdf_input) / sizeof(float) / (svdf_input_size * svdf_num_batches);
+  // Going over each input batch, setting the input tensor, invoking the SVDF op
+  // and checking the output with the expected golden values.
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = svdf_input + i * svdf_input_size * svdf_num_batches;
+    float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
+    svdf.SetInput(0, batch_start, batch_end);
+
+    svdf.Invoke();
+
+    float* golden_start =
+        svdf_golden_output_rank_1 + i * svdf_num_units * svdf_num_batches;
+    float* golden_end = golden_start + svdf_num_units * svdf_num_batches;
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+TEST(SVDFOpTest, BlackBoxTestRank2) {
+  SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                   /*memory_size=*/10, /*rank=*/2);
+  svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
+                          0.12416199,  0.15785322,  0.27901134,  0.3905206,
+                          0.21931258,  -0.36137494, -0.10640851, 0.31053296,
+                          -0.36118156, -0.0976817,  -0.36916667, 0.22197971,
+                          0.15294972,  0.38031587,  0.27557442,  0.39635518,
+                          -0.21580373, -0.06634006, -0.02702999, 0.27072677});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657,
+
+       -0.14884081, 0.19931212,  -0.36002168, 0.34663299,  -0.11405486,
+       0.12672701,  0.39463779,  -0.07886535, -0.06384811, 0.08249187,
+
+       -0.26816407, -0.19905911, 0.29211238,  0.31264046,  -0.28664589,
+       0.05698794,  0.11613581,  0.14078894,  0.02187902,  -0.21781836,
+
+       -0.15567942, 0.08693647,  -0.38256618, 0.36580828,  -0.22922277,
+       -0.0226903,  0.12878349,  -0.28122205, -0.10850525, -0.11955214,
+
+       0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
+       0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
+
+  svdf.ResetState();
+  const int svdf_num_batches = svdf.num_batches();
+  const int svdf_input_size = svdf.input_size();
+  const int svdf_num_units = svdf.num_units();
+  const int input_sequence_size =
+      sizeof(svdf_input) / sizeof(float) / (svdf_input_size * svdf_num_batches);
+  // Going over each input batch, setting the input tensor, invoking the SVDF op
+  // and checking the output with the expected golden values.
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = svdf_input + i * svdf_input_size * svdf_num_batches;
+    float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
+    svdf.SetInput(0, batch_start, batch_end);
+
+    svdf.Invoke();
+
+    float* golden_start =
+        svdf_golden_output_rank_2 + i * svdf_num_units * svdf_num_batches;
+    float* golden_end = golden_start + svdf_num_units * svdf_num_batches;
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(svdf.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f716ba8741fd469e7ee405ac300924b53c5c48e5
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -0,0 +1,183 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+
+#include "tensorflow/contrib/lite/version.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+
+using ::testing::FloatNear;
+using ::testing::Matcher;
+
+namespace {
+template <typename T>
+std::pair<float, int32_t> QuantizationParams(float f_min, float f_max) {
+  // These are required by many quantized operations.
+  CHECK_LE(f_min, 0);
+  CHECK_GE(f_max, 0);
+  T q_min = std::numeric_limits<T>::min();
+  T q_max = std::numeric_limits<T>::max();
+  float range = q_max - q_min;
+  float scale = (f_max - f_min) / range;
+  int32_t zero_point = std::min(
+      q_max,
+      std::max(q_min, static_cast<T>(std::round(q_min - f_min / scale))));
+  return {scale, zero_point};
+}
+}  // namespace
+
+std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values,
+                                           float max_abs_error) {
+  std::vector<Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+
+int SingleOpModel::AddTensor(TensorData t) {
+  int id = tensors_.size();
+
+  // This is slightly different depending on whether we are adding a
+  // quantized or a regular tensor.
+  bool is_quantized = (t.min != 0 || t.max != 0 || t.scale != 0);
+
+  flatbuffers::Offset<QuantizationParameters> q_params = 0;
+
+  if (is_quantized) {
+    if (t.min != 0 || t.max != 0) {
+      if (t.type == TensorType_UINT8) {
+        std::tie(t.scale, t.zero_point) =
+            QuantizationParams<uint8_t>(t.min, t.max);
+      } else if (t.type == TensorType_INT32) {
+        std::tie(t.scale, t.zero_point) =
+            QuantizationParams<int32_t>(t.min, t.max);
+      } else {
+        LOG(FATAL) << "No support for the requested quantized type";
+      }
+      t.min = 0;
+      t.max = 0;
+    }
+
+    q_params = CreateQuantizationParameters(
+        builder_, /*min=*/0, /*max=*/0, builder_.CreateVector<float>({t.scale}),
+        builder_.CreateVector<int64_t>({t.zero_point}));
+  }
+
+  tensors_.push_back(CreateTensor(builder_, builder_.CreateVector<int>({}),
+                                  t.type, /*buffer=*/0,
+                                  /*name=*/0, q_params));
+
+  tensor_data_[id] = t;
+
+  return id;
+}
+
+int SingleOpModel::AddInput(const TensorData& t) {
+  int id = AddTensor(t);
+  inputs_.push_back(id);
+  return id;
+}
+
+int SingleOpModel::AddNullInput() {
+  int id = kOptionalTensor;
+  inputs_.push_back(id);
+  return id;
+}
+
+int SingleOpModel::AddOutput(const TensorData& t) {
+  int id = AddTensor(t);
+  outputs_.push_back(id);
+  return id;
+}
+
+void SingleOpModel::SetBuiltinOp(BuiltinOperator type,
+                                 BuiltinOptions builtin_options_type,
+                                 flatbuffers::Offset<void> builtin_options) {
+  opcodes_.push_back(CreateOperatorCode(builder_, type, 0));
+  operators_.push_back(CreateOperator(
+      builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
+      builder_.CreateVector<int32_t>(outputs_), builtin_options_type,
+      builtin_options,
+      /*custom_options=*/0, CustomOptionsFormat_FLEXBUFFERS));
+}
+
+void SingleOpModel::SetCustomOp(
+    const string& name, const std::vector<uint8_t>& custom_option,
+    const std::function<TfLiteRegistration*()>& registeration) {
+  custom_registrations_[name] = registeration;
+  opcodes_.push_back(
+      CreateOperatorCodeDirect(builder_, BuiltinOperator_CUSTOM, name.data()));
+  operators_.push_back(CreateOperator(
+      builder_, /*opcode_index=*/0, builder_.CreateVector<int32_t>(inputs_),
+      builder_.CreateVector<int32_t>(outputs_), BuiltinOptions_NONE, 0,
+      builder_.CreateVector<uint8_t>(custom_option),
+      CustomOptionsFormat_FLEXBUFFERS));
+}
+
+void SingleOpModel::BuildInterpreter(
+    std::vector<std::vector<int>> input_shapes) {
+  auto opcodes = builder_.CreateVector(opcodes_);
+  auto operators = builder_.CreateVector(operators_);
+  auto tensors = builder_.CreateVector(tensors_);
+  auto inputs = builder_.CreateVector<int32_t>(inputs_);
+  auto outputs = builder_.CreateVector<int32_t>(outputs_);
+  // Create a single subgraph
+  std::vector<flatbuffers::Offset<SubGraph>> subgraphs;
+  auto subgraph = CreateSubGraph(builder_, tensors, inputs, outputs, operators);
+  subgraphs.push_back(subgraph);
+  auto subgraphs_flatbuffer = builder_.CreateVector(subgraphs);
+
+  std::vector<flatbuffers::Offset<Buffer>> buffers_vec;
+  auto buffers = builder_.CreateVector(buffers_vec);
+  auto description = builder_.CreateString("programmatic model");
+  builder_.Finish(CreateModel(builder_, TFLITE_SCHEMA_VERSION, opcodes,
+                              subgraphs_flatbuffer, description, buffers));
+
+  auto* model = GetModel(builder_.GetBufferPointer());
+
+  ops::builtin::BuiltinOpResolver builtins;
+  for (const auto& reg : custom_registrations_) {
+    builtins.AddCustom(reg.first.data(), reg.second());
+  }
+  InterpreterBuilder(model, builtins)(&interpreter_);
+
+  CHECK(interpreter_ != nullptr);
+
+  int i = 0;
+  for (const auto& shape : input_shapes) {
+    int input_idx = interpreter_->inputs()[i++];
+    if (input_idx == kOptionalTensor) continue;
+    CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk);
+  }
+  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
+      << "Cannot allocate tensors";
+}
+
+void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
+
+int32_t SingleOpModel::GetTensorSize(int index) const {
+  TfLiteTensor* t = interpreter_->tensor(index);
+  CHECK(t);
+  int total_size = 1;
+  for (int i = 0; i < t->dims->size; ++i) {
+    total_size *= t->dims->data[i];
+  }
+  return total_size;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..adcdeddbfc9d3b3313b09cd6310171160e0be645
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -0,0 +1,197 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
+
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tflite {
+
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<::testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5);
+
+template <typename T>
+inline std::vector<T> Quantize(const std::vector<float>& data, float scale,
+                               int32_t zero_point) {
+  std::vector<T> q;
+  for (float f : data) {
+    q.push_back(std::max(
+        std::numeric_limits<T>::min(),
+        std::min(std::numeric_limits<T>::max(),
+                 static_cast<T>(std::round(zero_point + (f / scale))))));
+  }
+  return q;
+}
+
+template <typename T>
+inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
+                                     int32_t zero_point) {
+  std::vector<float> f;
+  for (T q : data) {
+    f.push_back(scale * (q - zero_point));
+  }
+  return f;
+}
+
+// A test model that contains a single operator. All operator inputs and
+// output are external to the model, so the tests can directly access them.
+// Typical usage:
+//    SingleOpModel m;
+//    int a = m.AddInput({TensorType_FLOAT32, a_shape});
+//    int b = m.AddInput({TensorType_FLOAT32, b_shape});
+//    int c = m.AddOutput({TensorType_FLOAT32, {}});
+//    m.SetBuiltinOp(...);
+//    m.BuildInterpreter({GetShape(a), GetShape(b)});
+//    m.PopulateTensor(a, {...});
+//    m.PopulateTensor(b, {...});
+//    m.Invoke();
+//    EXPECT_THAT(m.ExtractVector<float>(c), ArrayFloatNear({...}));
+//
+
+// A helper struct to construct test tensors. This is particularly useful for
+// quantized tensor which must have their scale and zero_point defined before
+// the actual data is known. This mimics what happens in practice: quantization
+// parameters are calculate during training.
+struct TensorData {
+  TensorType type;
+  std::vector<int> shape;
+  float min;
+  float max;
+  float scale;
+  int32_t zero_point;
+};
+
+class SingleOpModel {
+ public:
+  SingleOpModel() {}
+  ~SingleOpModel() {}
+
+  // Copying or assignment is disallowed to simplify ownership semantics.
+  SingleOpModel(const SingleOpModel&) = delete;
+  SingleOpModel& operator=(const SingleOpModel&) = delete;
+
+  // Add a TensorType input tensor and return its index.
+  int AddInput(TensorType type) { return AddInput(TensorData{type}); }
+  int AddInput(const TensorData& t);
+
+  // Add a null input tensor (optional input) and return kOptionalTensor.
+  int AddNullInput();
+
+  // Add a TensorType output tensor and return its index.
+  int AddOutput(TensorType type) { return AddOutput(TensorData{type}); }
+  int AddOutput(const TensorData& t);
+
+  template <typename T>
+  void QuantizeAndPopulate(int index, std::initializer_list<float> data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto q = Quantize<T>(data, t->params.scale, t->params.zero_point);
+    PopulateTensor(index, 0, q.data(), q.data() + q.size());
+  }
+
+  const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
+
+  float GetScale(int id) { return tensor_data_.at(id).scale; }
+  int32_t GetZeroPoint(int id) { return tensor_data_.at(id).zero_point; }
+
+  // Define the operator in this model.
+  void SetBuiltinOp(BuiltinOperator type, BuiltinOptions builtin_options_type,
+                    flatbuffers::Offset<void> builtin_options);
+  void SetCustomOp(const string& name,
+                   const std::vector<uint8_t>& custom_option,
+                   const std::function<TfLiteRegistration*()>& registeration);
+
+  // Build the interpreter for this model. Also, resize and allocate all
+  // tensors given the shapes of the inputs.
+  void BuildInterpreter(std::vector<std::vector<int>> input_shapes);
+
+  void Invoke();
+
+  void PopulateStringTensor(int index, const std::vector<string>& content) {
+    auto tensor = interpreter_->tensor(index);
+    DynamicBuffer buf;
+    for (const string& s : content) {
+      buf.AddString(s.data(), s.length());
+    }
+    buf.WriteToTensor(tensor);
+  }
+
+  // Populate the tensor given its index.
+  template <typename T>
+  void PopulateTensor(int index, std::initializer_list<T> data) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    CHECK(v) << "No tensor with index '" << index << "'.";
+    for (T f : data) {
+      *v = f;
+      ++v;
+    }
+  }
+
+  // Partially populate the tensor, starting at the given offset.
+  template <typename T>
+  void PopulateTensor(int index, int offset, T* begin, T* end) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    memcpy(v + offset, begin, (end - begin) * sizeof(T));
+  }
+
+  // Return a vector with the flattened contents of a tensor.
+  template <typename T>
+  std::vector<T> ExtractVector(int index) {
+    T* v = interpreter_->typed_tensor<T>(index);
+    CHECK(v);
+    return std::vector<T>(v, v + GetTensorSize(index));
+  }
+
+  std::vector<int> GetTensorShape(int index) {
+    std::vector<int> result;
+    TfLiteTensor* t = interpreter_->tensor(index);
+    for (int i = 0; i < t->dims->size; ++i) {
+      result.push_back(t->dims->data[i]);
+    }
+    return result;
+  }
+
+ protected:
+  int32_t GetTensorSize(int index) const;
+
+  flatbuffers::FlatBufferBuilder builder_;
+  std::unique_ptr<tflite::Interpreter> interpreter_;
+
+ private:
+  int AddTensor(TensorData t);
+
+  std::map<int, TensorData> tensor_data_;
+  std::vector<int32_t> inputs_;
+  std::vector<int32_t> outputs_;
+  std::vector<flatbuffers::Offset<Tensor>> tensors_;
+  std::vector<flatbuffers::Offset<OperatorCode>> opcodes_;
+  std::vector<flatbuffers::Offset<Operator>> operators_;
+  std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_TEST_UTIL_H_
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54efad94afa73ccdfb3f26513e934c7eb5001400
--- /dev/null
+++ b/tensorflow/contrib/lite/model.cc
@@ -0,0 +1,700 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/nnapi_delegate.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace tflite {
+
+namespace {
+inline const tflite::Model* VerifyAndGetModel(const void* buf, size_t len) {
+  ::flatbuffers::Verifier verifier(static_cast<const uint8_t*>(buf), len);
+  if (VerifyModelBuffer(verifier)) {
+    return ::tflite::GetModel(buf);
+  } else {
+    return nullptr;
+  }
+}
+}  // namespace
+
+const char* kEmptyTensorName = "";
+
+std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
+    const char* filename, ErrorReporter* error_reporter) {
+  std::unique_ptr<FlatBufferModel> model;
+  model.reset(new FlatBufferModel(filename, /*mmap_file=*/true, error_reporter,
+                                  /*use_nnapi=*/true));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
+std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
+    const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
+  std::unique_ptr<FlatBufferModel> model;
+  model.reset(new FlatBufferModel(buffer, buffer_size, error_reporter));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
+std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
+    const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  std::unique_ptr<FlatBufferModel> model;
+  model.reset(new FlatBufferModel(model_spec, error_reporter));
+  if (!model->initialized()) model.reset();
+  return model;
+}
+
+FlatBufferModel::FlatBufferModel(const char* filename, bool mmap_file,
+                                 ErrorReporter* error_reporter, bool use_nnapi)
+    : error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  if (mmap_file) {
+    if (use_nnapi && NNAPIExists())
+      allocation_ = new NNAPIAllocation(filename, error_reporter);
+    else
+      allocation_ = new MMAPAllocation(filename, error_reporter);
+  } else {
+    allocation_ = new FileCopyAllocation(filename, error_reporter);
+  }
+  if (!allocation_->valid()) return;
+  if (!CheckModelIdentifier()) return;
+
+  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+}
+
+bool FlatBufferModel::CheckModelIdentifier() const {
+  if (!tflite::ModelBufferHasIdentifier(allocation_->base())) {
+    const char* ident = flatbuffers::GetBufferIdentifier(allocation_->base());
+    error_reporter_->Report(
+        "Model provided has model identifier '%c%c%c%c', should be '%s'\n",
+        ident[0], ident[1], ident[2], ident[3], tflite::ModelIdentifier());
+    return false;
+  }
+  return true;
+}
+
+FlatBufferModel::FlatBufferModel(const char* ptr, size_t num_bytes,
+                                 ErrorReporter* error_reporter)
+    : error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  allocation_ = new MemoryAllocation(ptr, num_bytes, error_reporter);
+  if (!allocation_->valid()) return;
+
+  model_ = VerifyAndGetModel(allocation_->base(), allocation_->bytes());
+}
+
+FlatBufferModel::FlatBufferModel(const Model* model,
+                                 ErrorReporter* error_reporter)
+    : error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {
+  model_ = model;
+}
+
+FlatBufferModel::~FlatBufferModel() { delete allocation_; }
+
+InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
+                                       const OpResolver& op_resolver)
+    : model_(model.GetModel()),
+      op_resolver_(op_resolver),
+      error_reporter_(model.error_reporter()),
+      allocation_(model.allocation()) {}
+
+InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
+                                       const OpResolver& op_resolver,
+                                       ErrorReporter* error_reporter)
+    : model_(model),
+      op_resolver_(op_resolver),
+      error_reporter_(error_reporter ? error_reporter
+                                     : DefaultErrorReporter()) {}
+
+TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
+  TfLiteStatus status = kTfLiteOk;
+  auto opcodes = model_->operator_codes();
+  for (const OperatorCode* opcode : *opcodes) {
+    TfLiteRegistration* registration = nullptr;
+
+    if (opcode->builtin_code() != BuiltinOperator_CUSTOM) {
+      auto x = opcode->builtin_code();
+      flatbuffer_op_index_to_registration_types_.push_back(x);
+      registration = op_resolver_.FindOp(x);
+      if (registration == nullptr) {
+        error_reporter_->Report("Didn't find op for builtin opcode '%s'\n",
+                                EnumNameBuiltinOperator(x));
+        status = kTfLiteError;
+      }
+    } else if (!opcode->custom_code()) {
+      error_reporter_->Report(
+          "Operator with builtin_code==0 has no custom_code.\n");
+      status = kTfLiteError;
+    } else {
+      const char* name = opcode->custom_code()->c_str();
+      registration = op_resolver_.FindOp(name);
+      flatbuffer_op_index_to_registration_types_.push_back(
+          BuiltinOperator_CUSTOM);
+      if (registration == nullptr) {
+        error_reporter_->Report("Didn't find custom op for name '%s'\n", name);
+        status = kTfLiteError;
+      }
+    }
+    flatbuffer_op_index_to_registration_.push_back(registration);
+  }
+  return status;
+}
+
+namespace {
+template <class T>
+std::vector<int> FlatBufferIntArrayToVector(T* flat_array) {
+  std::vector<int> ret(flat_array->Length());
+  for (int i = 0; i < flat_array->Length(); i++) {
+    ret[i] = flat_array->Get(i);
+  }
+  return ret;
+}
+
+// Allocate a structure using C malloc, but make sure the structure is a
+// POD structure that doesn't require constructors to run. The reason we do
+// this, is that Interpreter's C extension part will take ownership and wants
+// to use malloc() and free().
+template <class T>
+T* MallocPOD() {
+  static_assert(std::is_pod<T>::value, "Builtin data structure must be POD.");
+  return static_cast<T*>(malloc(sizeof(T)));
+}
+
+// Parse the appropriate data out of the op.
+//
+// This handles builtin data explicitly as there are flatbuffer schemas.
+//
+// Returns memory that must be feed.
+void* ParseOpData(const Operator* op, BuiltinOperator op_type,
+                  ErrorReporter* error_reporter) {
+  auto parse_padding = [](Padding padding) {
+    switch (padding) {
+      case Padding_SAME:
+        return kTfLitePaddingSame;
+      case Padding_VALID:
+        return kTfLitePaddingValid;
+    }
+    return kTfLitePaddingUnknown;
+  };
+  auto parse_activation = [](ActivationFunctionType activation) {
+    switch (activation) {
+      case ActivationFunctionType_NONE:
+        return kTfLiteActNone;
+      case ActivationFunctionType_RELU:
+        return kTfLiteActRelu;
+      case ActivationFunctionType_RELU1:
+        return kTfLiteActRelu1;
+      case ActivationFunctionType_RELU6:
+        return kTfLiteActRelu6;
+      case ActivationFunctionType_TANH:
+        return kTfLiteActTanh;
+      case ActivationFunctionType_SIGN_BIT:
+        return kTfLiteActSignBit;
+    }
+    return kTfLiteActNone;
+  };
+  auto parseLSHProjectionType = [](LSHProjectionType type) {
+    switch (type) {
+      case LSHProjectionType_SPARSE:
+        return kTfLiteLshProjectionSparse;
+      case LSHProjectionType_DENSE:
+        return kTfLiteLshProjectionDense;
+      default:
+        return kTfLiteLshProjectionUnknown;
+    }
+  };
+  auto parseCombinerType = [](CombinerType type) {
+    switch (type) {
+      case CombinerType_MEAN:
+        return kTfLiteCombinerTypeMean;
+      case CombinerType_SQRTN:
+        return kTfLiteCombinerTypeSqrtn;
+      case CombinerType_SUM:
+      default:
+        return kTfLiteCombinerTypeSum;
+    }
+  };
+
+  void* builtin_data = nullptr;
+  switch (op_type) {
+    case BuiltinOperator_CALL:
+      // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
+      // ok for now, since there is no call implementation either.
+      break;
+    case BuiltinOperator_CUSTOM:
+      break;
+    case BuiltinOperator_CONV_2D: {
+      TfLiteConvParams* params = MallocPOD<TfLiteConvParams>();
+      if (auto* conv_params = op->builtin_options_as_Conv2DOptions()) {
+        params->padding = parse_padding(conv_params->padding());
+        params->stride_width = conv_params->stride_w();
+        params->stride_height = conv_params->stride_h();
+        params->activation =
+            parse_activation(conv_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_TANH:
+    case BuiltinOperator_LOGISTIC:
+    case BuiltinOperator_RELU:
+    case BuiltinOperator_RELU1:
+    case BuiltinOperator_RELU6:
+    case BuiltinOperator_CONCAT_EMBEDDINGS:
+      break;
+    case BuiltinOperator_LSH_PROJECTION: {
+      TfLiteLSHProjectionParams* params =
+          MallocPOD<TfLiteLSHProjectionParams>();
+      if (auto* lshParams = op->builtin_options_as_LSHProjectionOptions()) {
+        params->type = parseLSHProjectionType(lshParams->type());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_AVERAGE_POOL_2D:
+    case BuiltinOperator_MAX_POOL_2D:
+    case BuiltinOperator_L2_POOL_2D: {
+      TfLitePoolParams* params = MallocPOD<TfLitePoolParams>();
+      if (auto* pool_params = op->builtin_options_as_Pool2DOptions()) {
+        params->padding = parse_padding(pool_params->padding());
+        params->stride_width = pool_params->stride_w();
+        params->stride_height = pool_params->stride_h();
+        params->filter_width = pool_params->filter_width();
+        params->filter_height = pool_params->filter_height();
+        params->activation =
+            parse_activation(pool_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_DEPTHWISE_CONV_2D: {
+      TfLiteDepthwiseConvParams* params =
+          MallocPOD<TfLiteDepthwiseConvParams>();
+      if (auto* conv_params = op->builtin_options_as_DepthwiseConv2DOptions()) {
+        params->padding = parse_padding(conv_params->padding());
+        params->stride_width = conv_params->stride_w();
+        params->stride_height = conv_params->stride_h();
+        params->depth_multiplier = conv_params->depth_multiplier();
+        params->activation =
+            parse_activation(conv_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_SVDF: {
+      TfLiteSVDFParams* params = MallocPOD<TfLiteSVDFParams>();
+      if (auto* svdf_params = op->builtin_options_as_SVDFOptions()) {
+        params->rank = svdf_params->rank();
+        params->activation =
+            parse_activation(svdf_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_RNN: {
+      TfLiteRNNParams* params = MallocPOD<TfLiteRNNParams>();
+      if (auto* rnn_params = op->builtin_options_as_RNNOptions()) {
+        params->activation =
+            parse_activation(rnn_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_EMBEDDING_LOOKUP:
+      // no-op.
+      break;
+    case BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: {
+      TfLiteEmbeddingLookupSparseParams* params =
+          MallocPOD<TfLiteEmbeddingLookupSparseParams>();
+      if (auto* embedding_params =
+              op->builtin_options_as_EmbeddingLookupSparseOptions()) {
+        params->combiner = parseCombinerType(embedding_params->combiner());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_FULLY_CONNECTED: {
+      TfLiteFullyConnectedParams* params =
+          MallocPOD<TfLiteFullyConnectedParams>();
+      if (auto* fully_connected_params =
+              op->builtin_options_as_FullyConnectedOptions()) {
+        params->activation = parse_activation(
+            fully_connected_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_HASHTABLE_LOOKUP:
+      // no-op.
+      break;
+    case BuiltinOperator_SOFTMAX: {
+      TfLiteSoftmaxParams* params = MallocPOD<TfLiteSoftmaxParams>();
+      if (auto* softmax_params = op->builtin_options_as_SoftmaxOptions()) {
+        params->beta = softmax_params->beta();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_CONCATENATION: {
+      TfLiteConcatenationParams* params =
+          MallocPOD<TfLiteConcatenationParams>();
+      if (auto* concatenation_params =
+              op->builtin_options_as_ConcatenationOptions()) {
+        params->activation =
+            parse_activation(concatenation_params->fused_activation_function());
+        params->axis = concatenation_params->axis();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MUL: {
+      auto* params = MallocPOD<TfLiteMulParams>();
+      if (auto* schema_params = op->builtin_options_as_MulOptions()) {
+        params->activation =
+            parse_activation(schema_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_ADD: {
+      auto* params = MallocPOD<TfLiteAddParams>();
+      if (auto* schema_params = op->builtin_options_as_AddOptions()) {
+        params->activation =
+            parse_activation(schema_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_L2_NORMALIZATION: {
+      auto* params = MallocPOD<TfLiteL2NormParams>();
+      if (auto* schema_params = op->builtin_options_as_L2NormOptions()) {
+        params->activation =
+            parse_activation(schema_params->fused_activation_function());
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: {
+      auto* params = MallocPOD<TfLiteLocalResponseNormParams>();
+      if (auto* schema_params =
+              op->builtin_options_as_LocalResponseNormalizationOptions()) {
+        params->radius = schema_params->radius();
+        params->bias = schema_params->bias();
+        params->alpha = schema_params->alpha();
+        params->beta = schema_params->beta();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_LSTM: {
+      TfLiteLSTMParams* params = MallocPOD<TfLiteLSTMParams>();
+      if (auto* lstm_params = op->builtin_options_as_LSTMOptions()) {
+        params->activation =
+            parse_activation(lstm_params->fused_activation_function());
+        params->cell_clip = lstm_params->cell_clip();
+        params->proj_clip = lstm_params->proj_clip();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_RESIZE_BILINEAR: {
+      auto* params = MallocPOD<TfLiteResizeBilinearParams>();
+      if (auto* schema_params =
+              op->builtin_options_as_ResizeBilinearOptions()) {
+        params->new_height = schema_params->new_height();
+        params->new_width = schema_params->new_width();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_RESHAPE: {
+      auto* params = MallocPOD<TfLiteReshapeParams>();
+      if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
+        auto* new_shape = schema_params->new_shape();
+        if (!new_shape) {
+          error_reporter->Report("No new_shape provided for Reshape\n");
+        } else {
+          params->num_dimensions = new_shape->Length();
+          if (params->num_dimensions > sizeof(params->shape) / sizeof(int)) {
+            error_reporter->Report(
+                "Found too many dimensions in Reshape's new_shape\n");
+          } else {
+            for (int i = 0; i < params->num_dimensions; ++i) {
+              params->shape[i] = new_shape->Get(i);
+            }
+          }
+        }
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_SKIP_GRAM: {
+      TfLiteSkipGramParams* params = MallocPOD<TfLiteSkipGramParams>();
+      if (auto* skip_gram_params = op->builtin_options_as_SkipGramOptions()) {
+        params->ngram_size = skip_gram_params->ngram_size();
+        params->max_skip_size = skip_gram_params->max_skip_size();
+        params->include_all_ngrams = skip_gram_params->include_all_ngrams();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_SPACE_TO_DEPTH: {
+      auto* params = MallocPOD<TfLiteSpaceToDepthParams>();
+      if (auto* schema_params = op->builtin_options_as_SpaceToDepthOptions()) {
+        params->block_size = schema_params->block_size();
+      }
+      builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+  }
+  return builtin_data;
+}
+
+}  // namespace
+
+TfLiteStatus InterpreterBuilder::ParseNodes(
+    const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
+    Interpreter* interpreter) {
+  TfLiteStatus status = kTfLiteOk;
+  for (int i = 0; i < operators->Length(); ++i) {
+    const auto* op = operators->Get(i);
+    int index = op->opcode_index();
+    if (index < 0 || index >= flatbuffer_op_index_to_registration_.size()) {
+      error_reporter_->Report("Missing registration for opcode_index %d\n",
+                              index);
+      status = kTfLiteError;
+      continue;
+    }
+    const TfLiteRegistration* reg =
+        flatbuffer_op_index_to_registration_[op->opcode_index()];
+    if (reg == nullptr) {
+      error_reporter_->Report("Skipping op for opcode_index %d\n", index);
+      status = kTfLiteError;
+      continue;
+    }
+
+    auto op_type =
+        flatbuffer_op_index_to_registration_types_[op->opcode_index()];
+    if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) {
+      error_reporter_->Report(
+          "Found builtin operator %s with custom options.\n",
+          EnumNameBuiltinOperator(op_type));
+    }
+    if (op->custom_options()) {
+      interpreter->AddNodeWithParameters(
+          FlatBufferIntArrayToVector(op->inputs()),
+          FlatBufferIntArrayToVector(op->outputs()),
+          reinterpret_cast<const char*>(op->custom_options()->data()),
+          op->custom_options()->size(), nullptr, reg);
+    } else {
+      interpreter->AddNodeWithParameters(
+          FlatBufferIntArrayToVector(op->inputs()),
+          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0,
+          ParseOpData(op, op_type, error_reporter_), reg);
+    }
+  }
+
+  return status;
+}
+
+TfLiteStatus InterpreterBuilder::ParseTensors(
+    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
+    const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
+    Interpreter* interpreter) {
+  TfLiteStatus status = kTfLiteOk;
+
+  // A little helper to get the names of inputs and outputs. Note that they
+  // must outlive the interpreter.
+  auto get_name = [](const tflite::Tensor* t) -> const char* {
+    auto name = t->name();
+    if (name) return name->c_str();
+    return kEmptyTensorName;
+  };
+
+  for (int i = 0; i < tensors->Length(); ++i) {
+    const auto* tensor = tensors->Get(i);
+    std::vector<int> dims = FlatBufferIntArrayToVector(tensor->shape());
+
+    TfLiteQuantizationParams quantization;
+    quantization.scale = 0;
+    quantization.zero_point = 0;
+    auto* q_params = tensor->quantization();
+    if (q_params) {
+      // Note that the schema could hold per-channel quantization parameters
+      // but we really only support one value for the whole tensor.
+      // TODO(aselle): This breaks as well if these are nullptr's.
+      // TODO(aselle): This assumes non per-channel quantization.
+      if (q_params->scale()) quantization.scale = q_params->scale()->Get(0);
+      if (q_params->zero_point())
+        quantization.zero_point = q_params->zero_point()->Get(0);
+    }
+
+    TfLiteType type;
+    switch (tensor->type()) {
+      case TensorType_FLOAT32:
+        type = kTfLiteFloat32;
+        break;
+      case TensorType_INT32:
+        type = kTfLiteInt32;
+        break;
+      case TensorType_UINT8:
+        type = kTfLiteUInt8;
+        break;
+      case TensorType_INT64:
+        type = kTfLiteInt64;
+        break;
+      case TensorType_STRING:
+        type = kTfLiteString;
+        break;
+      default:
+        // tensorType = ArrayType::NONE;
+        error_reporter_->Report("Unimplemented data type %s (%d) in tensor\n",
+                                EnumNameTensorType(tensor->type()),
+                                tensor->type());
+        status = kTfLiteError;
+        continue;
+    }
+    auto get_readonly_data = [&](const char** buffer_data,
+                                 size_t* buffer_size) {
+      // TODO(aselle): Check what happens if we have an unspecified size
+      // constant.
+      *buffer_data = nullptr;
+      if (tensor->buffer() == 0) return kTfLiteOk;
+      if (tensor->buffer() >= buffers->size()) {
+        error_reporter_->Report(
+            "Tensor %d specifies out of range buffer %d (only %d buffers).\n",
+            i, tensor->buffer(), buffers->size());
+        return kTfLiteError;
+      }
+      if (auto* buffer = (*buffers)[tensor->buffer()]) {
+        if (auto* array = buffer->data()) {
+          if (size_t size = array->size()) {
+            *buffer_size = size;
+            *buffer_data = reinterpret_cast<const char*>(array->data());
+            return kTfLiteOk;
+          }
+        }
+      }
+      return kTfLiteOk;
+    };
+    size_t buffer_size = 0;
+    const char* buffer_ptr;
+    TF_LITE_ENSURE_STATUS(get_readonly_data(&buffer_ptr, &buffer_size));
+
+    if (buffer_ptr) {
+      if (interpreter->SetTensorParametersReadOnly(
+              i, type, get_name(tensor), dims, quantization, buffer_ptr,
+              buffer_size, allocation_) != kTfLiteOk) {
+        error_reporter_->Report("Tensor %d is invalidly specified in schema.\n",
+                                i);
+        status = kTfLiteError;
+      }
+    } else {
+      if (interpreter->SetTensorParametersReadWrite(
+              i, type, get_name(tensor), dims, quantization) != kTfLiteOk) {
+        error_reporter_->Report("Tensor %d is invalidly specified in schema.\n",
+                                i);
+        status = kTfLiteError;
+      }
+    }
+  }
+
+  return status;
+}
+
+TfLiteStatus InterpreterBuilder::operator()(
+    std::unique_ptr<Interpreter>* interpreter) {
+  if (!interpreter) {
+    error_reporter_->Report(
+        "Null output pointer passed to InterpreterBuilder.");
+    return kTfLiteError;
+  }
+
+  // Safe exit by deleting partially created interpreter, to reduce verbosity
+  // on error conditions. Use by return cleanup_on_error();
+  auto cleanup_and_error = [&interpreter]() {
+    interpreter->reset();
+    return kTfLiteError;
+  };
+
+  if (!model_) {
+    error_reporter_->Report("Null pointer passed in as model.");
+    return cleanup_and_error();
+  }
+
+  if (model_->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter_->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model_->version(), TFLITE_SCHEMA_VERSION);
+    return cleanup_and_error();
+  }
+
+  if (BuildLocalIndexToRegistrationMapping() != kTfLiteOk) {
+    error_reporter_->Report("Registration failed.\n");
+    return cleanup_and_error();
+  }
+
+  // Flatbuffer model schemas define a list of opcodes independent of the graph.
+  // We first map those to registrations. This reduces string lookups for custom
+  // ops since we only do it once per custom op rather than once per custom op
+  // invocation in the model graph.
+  // Construct interpreter with correct number of tensors and operators.
+  auto* subgraphs = model_->subgraphs();
+  auto* buffers = model_->buffers();
+  if (subgraphs->size() != 1) {
+    error_reporter_->Report("Only 1 subgraph is currently supported.\n");
+    return cleanup_and_error();
+  }
+  const tflite::SubGraph* subgraph = (*subgraphs)[0];
+  auto operators = subgraph->operators();
+  auto tensors = subgraph->tensors();
+  if (!operators || !tensors || !buffers) {
+    error_reporter_->Report(
+        "Did not get operators, tensors, or buffers in input flat buffer.\n");
+    return cleanup_and_error();
+  }
+  interpreter->reset(new Interpreter(error_reporter_));
+  if ((**interpreter).AddTensors(tensors->Length()) != kTfLiteOk) {
+    return cleanup_and_error();
+  }
+
+  // Parse inputs/outputs
+  (**interpreter).SetInputs(FlatBufferIntArrayToVector(subgraph->inputs()));
+  (**interpreter).SetOutputs(FlatBufferIntArrayToVector(subgraph->outputs()));
+
+  // Finally setup nodes and tensors
+  if (ParseNodes(operators, interpreter->get()) != kTfLiteOk)
+    return cleanup_and_error();
+  if (ParseTensors(buffers, tensors, interpreter->get()) != kTfLiteOk)
+    return cleanup_and_error();
+
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0c96f7f0480cd3146f95a22957477809cf0096d
--- /dev/null
+++ b/tensorflow/contrib/lite/model.h
@@ -0,0 +1,176 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Deserialization infrastructure for tflite. Provides functionality
+// to go from a serialized tflite model in flatbuffer format to an
+// interpreter.
+//
+// using namespace tflite;
+// StderrReporter error_reporter;
+// auto model = FlatBufferModel::BuildFromFile("interesting_model.tflite",
+//                                             &error_reporter);
+// MyOpResolver resolver;  // You need to subclass OpResolver to provide
+//                         // implementations.
+// InterpreterBuilder builder(*model, resolver);
+// std::unique_ptr<Interpreter> interpreter;
+// if(builder(&interpreter) == kTfLiteOk) {
+//   .. run model inference with interpreter
+// }
+//
+// OpResolver must be defined to provide your kernel implementations to the
+// interpreter. This is environment specific and may consist of just the builtin
+// ops, or some custom operators you defined to extend tflite.
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
+
+#include <memory>
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+// An RAII object that represents a read-only tflite model, copied from disk,
+// or mmapped. This uses flatbuffers as the serialization format.
+class FlatBufferModel {
+ public:
+  // Builds a model based on a file. Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromFile(
+      const char* filename,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Builds a model based on a pre-loaded flatbuffer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
+      const char* buffer, size_t buffer_size,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Builds a model directly from a flatbuffer pointer. The caller retains
+  // ownership of the buffer and should keep it alive until the returned object
+  // is destroyed. Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> BuildFromModel(
+      const tflite::Model* model_spec,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Releases memory or unmaps mmaped meory.
+  ~FlatBufferModel();
+
+  // Copying or assignment is disallowed to simplify ownership semantics.
+  FlatBufferModel(const FlatBufferModel&) = delete;
+  FlatBufferModel& operator=(const FlatBufferModel&) = delete;
+
+  bool initialized() const { return model_ != nullptr; }
+  const tflite::Model* operator->() const { return model_; }
+  const tflite::Model* GetModel() const { return model_; }
+  ErrorReporter* error_reporter() const { return error_reporter_; }
+  const Allocation* allocation() const { return allocation_; }
+
+  // Returns true if the model identifier is correct (otherwise false and
+  // reports an error).
+  bool CheckModelIdentifier() const;
+
+ private:
+  // Loads a model from `filename`. If `mmap_file` is true then use mmap,
+  // otherwise make a copy of the model in a buffer.
+  //
+  // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
+  // used.
+  explicit FlatBufferModel(
+      const char* filename, bool mmap_file = true,
+      ErrorReporter* error_reporter = DefaultErrorReporter(),
+      bool use_nnapi = false);
+
+  // Loads a model from `ptr` and `num_bytes` of the model file. The `ptr` has
+  // to remain alive and unchanged until the end of this flatbuffermodel's
+  // lifetime.
+  //
+  // Note, if `error_reporter` is null, then a DefaultErrorReporter() will be
+  // used.
+  FlatBufferModel(const char* ptr, size_t num_bytes,
+                  ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Loads a model from Model flatbuffer. The `model` has to remain alive and
+  // unchanged until the end of this flatbuffermodel's lifetime.
+  FlatBufferModel(const Model* model, ErrorReporter* error_reporter);
+
+  // Flatbuffer traverser pointer. (Model* is a pointer that is within the
+  // allocated memory of the data allocated by allocation's internals.
+  const tflite::Model* model_ = nullptr;
+  ErrorReporter* error_reporter_;
+  Allocation* allocation_ = nullptr;
+};
+
+// Abstract interface that returns TfLiteRegistrations given op codes or custom
+// op names. This is the mechanism that ops being referenced in the flatbuffer
+// model are mapped to executable function pointers (TfLiteRegistrations).
+class OpResolver {
+ public:
+  // Finds the op registration for a builtin operator by enum code.
+  virtual TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const = 0;
+  // Finds the op registration of a custom operator by op name.
+  virtual TfLiteRegistration* FindOp(const char* op) const = 0;
+  virtual ~OpResolver() {}
+};
+
+// Build an interpreter capable of interpreting `model`.
+//
+// model: a scoped model whose lifetime must be at least as long as
+//   the interpreter. In principle multiple interpreters can be made from
+//   a single model.
+// op_resolver: An instance that implements the Resolver interface which maps
+//   custom op names and builtin op codes to op registrations.
+// reportError: a functor that is called to report errors that handles
+//   printf var arg semantics. The lifetime of the reportError object must
+//   be greater than or equal to the Interpreter created by operator().
+//
+// Returns a kTfLiteOk when successful and sets interpreter to a valid
+// Interpreter. Note: the user must ensure the model lifetime is at least as
+// long as interpreter's lifetime.
+class InterpreterBuilder {
+ public:
+  InterpreterBuilder(const FlatBufferModel& model,
+                     const OpResolver& op_resolver);
+  // Builds an interpreter given only the raw flatbuffer Model object (instead
+  // of a FlatBufferModel). Mostly used for testing.
+  // If `error_reporter` is null, then DefaultErrorReporter() is used.
+  InterpreterBuilder(const ::tflite::Model* model,
+                     const OpResolver& op_resolver,
+                     ErrorReporter* error_reporter = DefaultErrorReporter());
+  InterpreterBuilder(const InterpreterBuilder&) = delete;
+  InterpreterBuilder& operator=(const InterpreterBuilder&) = delete;
+  TfLiteStatus operator()(std::unique_ptr<Interpreter>* interpreter);
+
+ private:
+  TfLiteStatus BuildLocalIndexToRegistrationMapping();
+  TfLiteStatus ParseNodes(
+      const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
+      Interpreter* interpreter);
+  TfLiteStatus ParseTensors(
+      const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
+      const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
+      Interpreter* interpreter);
+
+  const ::tflite::Model* model_;
+  const OpResolver& op_resolver_;
+  ErrorReporter* error_reporter_;
+
+  std::vector<TfLiteRegistration*> flatbuffer_op_index_to_registration_;
+  std::vector<BuiltinOperator> flatbuffer_op_index_to_registration_types_;
+  const Allocation* allocation_ = nullptr;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5330c8f594593655b2a8776cf6b399c0d16cdc19
--- /dev/null
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -0,0 +1,290 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <string>
+
+#include "tensorflow/contrib/lite/model.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+// Comparison for TfLiteRegistration. Since TfLiteRegistration is a C object,
+// we must declare this in global namespace, so argument-dependent operator
+// lookup works.
+inline bool operator==(const TfLiteRegistration& a,
+                       const TfLiteRegistration& b) {
+  return a.invoke == b.invoke && a.init == b.init && a.prepare == b.prepare &&
+         a.free == b.free;
+}
+
+namespace tflite {
+
+// Provide a dummy operation that does nothing.
+namespace {
+void* dummy_init(TfLiteContext*, const char*, size_t) { return nullptr; }
+void dummy_free(TfLiteContext*, void*) {}
+TfLiteStatus dummy_resize(TfLiteContext*, TfLiteNode*) { return kTfLiteOk; }
+TfLiteStatus dummy_invoke(TfLiteContext*, TfLiteNode*) { return kTfLiteOk; }
+TfLiteRegistration dummy_reg = {dummy_init, dummy_free, dummy_resize,
+                                dummy_invoke};
+}  // namespace
+
+// Provide a trivial resolver that returns a constant value no matter what
+// op is asked for.
+class TrivialResolver : public OpResolver {
+ public:
+  explicit TrivialResolver(TfLiteRegistration* constant_return = nullptr)
+      : constant_return_(constant_return) {}
+  // Find the op registration of a custom operator by op name.
+  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override {
+    return constant_return_;
+  }
+  // Find the op registration of a custom operator by op name.
+  TfLiteRegistration* FindOp(const char* op) const override {
+    return constant_return_;
+  }
+
+ private:
+  TfLiteRegistration* constant_return_;
+};
+
+TEST(BasicFlatBufferModel, TestNonExistantFiles) {
+  ASSERT_TRUE(!FlatBufferModel::BuildFromFile("/tmp/tflite_model_1234"));
+}
+
+// Make sure a model with nothing in it loads properly.
+TEST(BasicFlatBufferModel, TestEmptyModelsAndNullDestination) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/empty_model.bin");
+  ASSERT_TRUE(model);
+  // Now try to build it into a model.
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model, TrivialResolver())(&interpreter),
+            kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+  ASSERT_NE(InterpreterBuilder(*model, TrivialResolver())(nullptr), kTfLiteOk);
+}
+
+// Make sure currently unsupported # of subgraphs are checked
+// TODO(aselle): Replace this test when multiple subgraphs are supported.
+TEST(BasicFlatBufferModel, TestZeroAndMultipleSubgraphs) {
+  auto m1 = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/0_subgraphs.bin");
+  ASSERT_TRUE(m1);
+  std::unique_ptr<Interpreter> interpreter1;
+  ASSERT_NE(InterpreterBuilder(*m1, TrivialResolver())(&interpreter1),
+            kTfLiteOk);
+
+  auto m2 = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/2_subgraphs.bin");
+  ASSERT_TRUE(m2);
+  std::unique_ptr<Interpreter> interpreter2;
+  ASSERT_NE(InterpreterBuilder(*m2, TrivialResolver())(&interpreter2),
+            kTfLiteOk);
+}
+
+// Test what happens if we cannot bind any of the ops.
+TEST(BasicFlatBufferModel, TestModelWithoutNullRegistrations) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin");
+  ASSERT_TRUE(model);
+  // Check that we get an error code and interpreter pointer is reset.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+  ASSERT_NE(InterpreterBuilder(*model, TrivialResolver(nullptr))(&interpreter),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter, nullptr);
+}
+
+// Make sure model is read to interpreter propelrly
+TEST(BasicFlatBufferModel, TestModelInInterpreter) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model.bin");
+  ASSERT_TRUE(model);
+  // Check that we get an error code and interpreter pointer is reset.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+  ASSERT_EQ(
+      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+  ASSERT_EQ(interpreter->tensors_size(), 4);
+  ASSERT_EQ(interpreter->nodes_size(), 2);
+  std::vector<int> inputs = {0, 1};
+  std::vector<int> outputs = {2, 3};
+  ASSERT_EQ(interpreter->inputs(), inputs);
+  ASSERT_EQ(interpreter->outputs(), outputs);
+
+  EXPECT_EQ(std::string(interpreter->GetInputName(0)), "input0");
+  EXPECT_EQ(std::string(interpreter->GetInputName(1)), "input1");
+  EXPECT_EQ(std::string(interpreter->GetOutputName(0)), "out1");
+  EXPECT_EQ(std::string(interpreter->GetOutputName(1)), "out2");
+
+  // Make sure all input tensors are correct
+  TfLiteTensor* i0 = interpreter->tensor(0);
+  ASSERT_EQ(i0->type, kTfLiteFloat32);
+  ASSERT_NE(i0->data.raw, nullptr);  // mmapped
+  ASSERT_EQ(i0->allocation_type, kTfLiteMmapRo);
+  TfLiteTensor* i1 = interpreter->tensor(1);
+  ASSERT_EQ(i1->type, kTfLiteFloat32);
+  ASSERT_EQ(i1->data.raw, nullptr);
+  ASSERT_EQ(i1->allocation_type, kTfLiteArenaRw);
+  TfLiteTensor* o0 = interpreter->tensor(2);
+  ASSERT_EQ(o0->type, kTfLiteFloat32);
+  ASSERT_EQ(o0->data.raw, nullptr);
+  ASSERT_EQ(o0->allocation_type, kTfLiteArenaRw);
+  TfLiteTensor* o1 = interpreter->tensor(3);
+  ASSERT_EQ(o1->type, kTfLiteFloat32);
+  ASSERT_EQ(o1->data.raw, nullptr);
+  ASSERT_EQ(o1->allocation_type, kTfLiteArenaRw);
+
+  // Check op 0 which has inputs {0, 1} outputs {2}.
+  {
+    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg0 =
+        interpreter->node_and_registration(0);
+    ASSERT_NE(node_and_reg0, nullptr);
+    const TfLiteNode& node0 = node_and_reg0->first;
+    const TfLiteRegistration& reg0 = node_and_reg0->second;
+    TfLiteIntArray* desired_inputs = TfLiteIntArrayCreate(2);
+    desired_inputs->data[0] = 0;
+    desired_inputs->data[1] = 1;
+    TfLiteIntArray* desired_outputs = TfLiteIntArrayCreate(1);
+    desired_outputs->data[0] = 2;
+    ASSERT_TRUE(TfLiteIntArrayEqual(node0.inputs, desired_inputs));
+    ASSERT_TRUE(TfLiteIntArrayEqual(node0.outputs, desired_outputs));
+    TfLiteIntArrayFree(desired_inputs);
+    TfLiteIntArrayFree(desired_outputs);
+    ASSERT_EQ(reg0, dummy_reg);
+  }
+
+  // Check op 1 which has inputs {2} outputs {3}.
+  {
+    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg1 =
+        interpreter->node_and_registration(1);
+    ASSERT_NE(node_and_reg1, nullptr);
+    const TfLiteNode& node1 = node_and_reg1->first;
+    const TfLiteRegistration& reg1 = node_and_reg1->second;
+    TfLiteIntArray* desired_inputs = TfLiteIntArrayCreate(1);
+    TfLiteIntArray* desired_outputs = TfLiteIntArrayCreate(1);
+    desired_inputs->data[0] = 2;
+    desired_outputs->data[0] = 3;
+    ASSERT_TRUE(TfLiteIntArrayEqual(node1.inputs, desired_inputs));
+    ASSERT_TRUE(TfLiteIntArrayEqual(node1.outputs, desired_outputs));
+    TfLiteIntArrayFree(desired_inputs);
+    TfLiteIntArrayFree(desired_outputs);
+    ASSERT_EQ(reg1, dummy_reg);
+  }
+}
+
+// This tests on a flatbuffer that defines a shape of 2 to be a memory mapped
+// buffer. But the buffer is provided to be only 1 element.
+TEST(BasicFlatBufferModel, TestBrokenMmap) {
+  ASSERT_FALSE(FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/test_model_broken.bin"));
+}
+
+TEST(BasicFlatBufferModel, TestNullModel) {
+  // Check that we get an error code and interpreter pointer is reset.
+  std::unique_ptr<Interpreter> interpreter(new Interpreter);
+  ASSERT_NE(
+      InterpreterBuilder(nullptr, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter.get(), nullptr);
+}
+
+struct TestErrorReporter : public ErrorReporter {
+  int Report(const char* format, va_list args) override {
+    calls++;
+    return 0;
+  }
+  int calls = 0;
+};
+
+// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
+// the Interpreter.
+TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
+  TestErrorReporter reporter;
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/empty_model.bin",
+      &reporter);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  TrivialResolver resolver;
+  InterpreterBuilder(*model, resolver)(&interpreter);
+  ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
+  ASSERT_EQ(reporter.calls, 1);
+}
+
+// This makes sure the ErrorReporter is marshalled from FlatBufferModel to
+// the Interpreter.
+TEST(BasicFlatBufferModel, TestNullErrorReporter) {
+  auto model = FlatBufferModel::BuildFromFile(
+      "tensorflow/contrib/lite/testdata/empty_model.bin", nullptr);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  TrivialResolver resolver;
+  InterpreterBuilder(*model, resolver)(&interpreter);
+  ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
+}
+
+// Test what happens if we cannot bind any of the ops.
+TEST(BasicFlatBufferModel, TestBuildModelFromCorruptedData) {
+  std::string corrupted_data = "123";
+  auto model = FlatBufferModel::BuildFromBuffer(corrupted_data.c_str(),
+                                                corrupted_data.length());
+  ASSERT_FALSE(model);
+}
+
+// Test that loading model directly from a Model flatbuffer works.
+TEST(BasicFlatBufferModel, TestBuildFromModel) {
+  TestErrorReporter reporter;
+  FileCopyAllocation model_allocation(
+      "tensorflow/contrib/lite/testdata/test_model.bin", &reporter);
+  ASSERT_TRUE(model_allocation.valid());
+  ::flatbuffers::Verifier verifier(
+      reinterpret_cast<const uint8_t*>(model_allocation.base()),
+      model_allocation.bytes());
+  ASSERT_TRUE(VerifyModelBuffer(verifier));
+  const Model* model_fb = ::tflite::GetModel(model_allocation.base());
+
+  auto model = FlatBufferModel::BuildFromModel(model_fb);
+  ASSERT_TRUE(model);
+
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(*model, TrivialResolver(&dummy_reg))(&interpreter),
+      kTfLiteOk);
+  ASSERT_NE(interpreter, nullptr);
+}
+
+// TODO(aselle): Add tests for serialization of builtin op data types.
+// These tests will occur with the evaluation tests of individual operators,
+// not here.
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..733c3f4c7fa0605f24a1e6b4c458e34310c079c4
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -0,0 +1,100 @@
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+
+licenses(["notice"])  # Apache 2.0
+
+gen_selected_ops(
+    name = "smartreply_ops",
+    model = "@tflite_smartreply//:smartreply.tflite",
+)
+
+cc_library(
+    name = "custom_ops",
+    srcs = [
+        "ops/extract_feature.cc",
+        "ops/normalize.cc",
+        "ops/predict.cc",
+        ":smartreply_ops",
+    ],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_library(
+    name = "predictor_lib",
+    srcs = ["predictor.cc"],
+    hdrs = ["predictor.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/tools:mutable_op_resolver",
+        "@com_google_absl//absl/strings",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+cc_test(
+    name = "extract_feature_op_test",
+    size = "small",
+    srcs = ["ops/extract_feature_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+        "@farmhash_archive//:farmhash",
+    ],
+)
+
+cc_test(
+    name = "normalize_op_test",
+    size = "small",
+    srcs = ["ops/normalize_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_test(
+    name = "predict_op_test",
+    size = "small",
+    srcs = ["ops/predict_test.cc"],
+    deps = [
+        ":custom_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..75ed9432c8fcdfd77a64d3c659e6336c977cdda2
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/AndroidManifest.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2017 The Android Open Source Project
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+  package="com.example.android.smartreply" >
+
+  <uses-sdk
+      android:minSdkVersion="15"
+      android:targetSdkVersion="24" />
+
+  <application android:label="TfLite SmartReply Demo">
+    <activity
+        android:name="com.example.android.smartreply.MainActivity"
+        android:configChanges="orientation|keyboardHidden|screenSize"
+        android:windowSoftInputMode="stateUnchanged|adjustPan"
+        android:label="TfLite SmartReply Demo"
+        android:screenOrientation="portrait" >
+      <intent-filter>
+        <action android:name="android.intent.action.MAIN" />
+        <category android:name="android.intent.category.LAUNCHER" />
+      </intent-filter>
+    </activity>
+  </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f8767b443a2aa64b666c3b6bfb7db30cc0be62ea
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/BUILD
@@ -0,0 +1,65 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite:build_def.bzl",
+    "tflite_copts",
+    "tflite_jni_binary",
+)
+
+filegroup(
+    name = "assets",
+    srcs = [
+        "@tflite_smartreply//:model_files",
+    ],
+)
+
+android_binary(
+    name = "SmartReplyDemo",
+    srcs = glob(["java/**/*.java"]),
+    assets = [":assets"],
+    assets_dir = "",
+    custom_package = "com.example.android.smartreply",
+    manifest = "AndroidManifest.xml",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = ["manual"],
+    deps = [
+        ":smartreply_runtime",
+        "@androidsdk//com.android.support:support-v13-25.2.0",
+        "@androidsdk//com.android.support:support-v4-25.2.0",
+    ],
+)
+
+cc_library(
+    name = "smartreply_runtime",
+    srcs = ["libsmartreply_jni.so"],
+    visibility = ["//visibility:public"],
+)
+
+tflite_jni_binary(
+    name = "libsmartreply_jni.so",
+    deps = [
+        ":smartreply_jni_lib",
+    ],
+)
+
+cc_library(
+    name = "smartreply_jni_lib",
+    srcs = [
+        "smartreply_jni.cc",
+    ],
+    copts = tflite_copts(),
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/models/smartreply:predictor_lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3c882ffc43fde577801428151a43b592e8faaed1
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/BUILD
@@ -0,0 +1,15 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(glob(["*"]))
+
+filegroup(
+    name = "assets_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a0a5b46b5f8d5fd6a0297c8056bb2fb9b6ad9ada
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/assets/backoff_response.txt
@@ -0,0 +1,16 @@
+Ok
+Yes
+No
+👍
+☺
+😟
+❤️
+Lol
+Thanks
+Got it
+Done
+Nice
+I don't know
+What?
+Why?
+What's up?
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..02fec9ae5e971ad756ae6c2b0149a6aacfa27cad
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/MainActivity.java
@@ -0,0 +1,99 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.app.Activity;
+import android.os.Bundle;
+import android.os.Handler;
+import android.util.Log;
+import android.view.View;
+import android.widget.Button;
+import android.widget.EditText;
+import android.widget.TextView;
+
+/**
+ * The main (and only) activity of this demo app. Displays a text box which updates as messages are
+ * received.
+ */
+public class MainActivity extends Activity {
+  private static final String TAG = "SmartReplyDemo";
+  private SmartReplyClient client;
+
+  private Button sendButton;
+  private TextView messageTextView;
+  private EditText messageInput;
+
+  private Handler handler;
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    Log.v(TAG, "onCreate");
+    setContentView(R.layout.main_activity);
+
+    client = new SmartReplyClient(getApplicationContext());
+    handler = new Handler();
+
+    sendButton = (Button) findViewById(R.id.send_button);
+    sendButton.setOnClickListener(
+        (View v) -> {
+          send(messageInput.getText().toString());
+        });
+
+    messageTextView = (TextView) findViewById(R.id.message_text);
+    messageInput = (EditText) findViewById(R.id.message_input);
+  }
+
+  @Override
+  protected void onStart() {
+    super.onStart();
+    Log.v(TAG, "onStart");
+    handler.post(
+        () -> {
+          client.loadModel();
+        });
+  }
+
+  @Override
+  protected void onStop() {
+    super.onStop();
+    Log.v(TAG, "onStop");
+    handler.post(
+        () -> {
+          client.unloadModel();
+        });
+  }
+
+  private void send(final String message) {
+    handler.post(
+        () -> {
+          messageTextView.append("Input: " + message + "\n");
+
+          SmartReply[] ans = client.predict(new String[] {message});
+          for (SmartReply reply : ans) {
+            appendMessage("Reply: " + reply.getText());
+          }
+          appendMessage("------");
+        });
+  }
+
+  private void appendMessage(final String message) {
+    handler.post(
+        () -> {
+          messageTextView.append(message + "\n");
+        });
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
new file mode 100644
index 0000000000000000000000000000000000000000..3357fd17c11f870d1b0998bb26ffa9abf149686b
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReply.java
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.support.annotation.Keep;
+
+/**
+ * SmartReply contains predicted message, and confidence.
+ *
+ * <p>NOTE: this class used by JNI, class name and constructor should not be obfuscated.
+ */
+@Keep
+public class SmartReply {
+
+  private final String text;
+  private final float score;
+
+  @Keep
+  public SmartReply(String text, float score) {
+    this.text = text;
+    this.score = score;
+  }
+
+  public String getText() {
+    return text;
+  }
+
+  public float getScore() {
+    return score;
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
new file mode 100644
index 0000000000000000000000000000000000000000..d5b1ac0ffbc47283aa0c1bf68c0a85ad6228cdcc
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.smartreply;
+
+import android.content.Context;
+import android.content.res.AssetFileDescriptor;
+import android.support.annotation.Keep;
+import android.support.annotation.WorkerThread;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.List;
+
+/** Interface to load TfLite model and provide predictions. */
+public class SmartReplyClient implements AutoCloseable {
+  private static final String TAG = "SmartReplyDemo";
+  private static final String MODEL_PATH = "smartreply.tflite";
+  private static final String BACKOFF_PATH = "backoff_response.txt";
+  private static final String JNI_LIB = "smartreply_jni";
+
+  private final Context context;
+  private long storage;
+  private MappedByteBuffer model;
+
+  private volatile boolean isLibraryLoaded;
+
+  public SmartReplyClient(Context context) {
+    this.context = context;
+  }
+
+  public boolean isLoaded() {
+    return storage != 0;
+  }
+
+  @WorkerThread
+  public synchronized void loadModel() {
+    if (!isLibraryLoaded) {
+      System.loadLibrary(JNI_LIB);
+      isLibraryLoaded = true;
+    }
+
+    try {
+      model = loadModelFile();
+      String[] backoff = loadBackoffList();
+      storage = loadJNI(model, backoff);
+    } catch (IOException e) {
+      Log.e(TAG, "Fail to load model", e);
+      return;
+    }
+  }
+
+  @WorkerThread
+  public synchronized SmartReply[] predict(String[] input) {
+    if (storage != 0) {
+      return predictJNI(storage, input);
+    } else {
+      return new SmartReply[] {};
+    }
+  }
+
+  @WorkerThread
+  public synchronized void unloadModel() {
+    close();
+  }
+
+  @Override
+  public synchronized void close() {
+    if (storage != 0) {
+      unloadJNI(storage);
+      storage = 0;
+    }
+  }
+
+  private MappedByteBuffer loadModelFile() throws IOException {
+    AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    try {
+      FileChannel fileChannel = inputStream.getChannel();
+      long startOffset = fileDescriptor.getStartOffset();
+      long declaredLength = fileDescriptor.getDeclaredLength();
+      return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+    } finally {
+      inputStream.close();
+    }
+  }
+
+  private String[] loadBackoffList() throws IOException {
+    List<String> labelList = new ArrayList<String>();
+    BufferedReader reader =
+        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)));
+    String line;
+    while ((line = reader.readLine()) != null) {
+      if (!line.isEmpty()) {
+        labelList.add(line);
+      }
+    }
+    reader.close();
+    String[] ans = new String[labelList.size()];
+    labelList.toArray(ans);
+    return ans;
+  }
+
+  @Keep
+  private native long loadJNI(MappedByteBuffer buffer, String[] backoff);
+
+  @Keep
+  private native SmartReply[] predictJNI(long storage, String[] text);
+
+  @Keep
+  private native void unloadJNI(long storage);
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
new file mode 100644
index 0000000000000000000000000000000000000000..23b4cadc007a4457d33b8c8fecf9b1e7b7436320
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/res/layout/main_activity.xml
@@ -0,0 +1,44 @@
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:orientation="vertical">
+
+    <LinearLayout
+        android:layout_width="fill_parent"
+        android:layout_height="0dp"
+        android:padding="5dip"
+        android:layout_weight="3">
+
+        <TextView
+            android:id="@+id/message_text"
+            android:layout_width="fill_parent"
+            android:layout_height="fill_parent"
+            android:scrollbars="vertical"
+            android:gravity="bottom"/>
+    </LinearLayout>
+
+    <LinearLayout
+        android:layout_width="fill_parent"
+        android:layout_height="0dp"
+        android:padding="5dip"
+        android:layout_weight="1">
+
+        <EditText
+            android:id="@+id/message_input"
+            android:layout_width="0dp"
+            android:layout_height="fill_parent"
+            android:layout_weight="6"
+            android:scrollbars="vertical"
+            android:hint="Enter Text"
+            android:gravity="top"
+            android:inputType="text"/>
+        <Button
+            android:id="@+id/send_button"
+            android:layout_width="0dp"
+            android:layout_height="fill_parent"
+            android:layout_weight="2"
+            android:text="Send" />
+    </LinearLayout>
+
+</LinearLayout>
diff --git a/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f158cc511a9bee0710aee13cd04f77b6f95fb868
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/demo/app/src/main/smartreply_jni.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+
+const char kIllegalStateException[] = "java/lang/IllegalStateException";
+
+using tflite::custom::smartreply::GetSegmentPredictions;
+using tflite::custom::smartreply::PredictorResponse;
+
+template <typename T>
+T CheckNotNull(JNIEnv* env, T&& t) {
+  if (t == nullptr) {
+    env->ThrowNew(env->FindClass(kIllegalStateException), "");
+    return nullptr;
+  }
+  return std::forward<T>(t);
+}
+
+std::vector<std::string> jniStringArrayToVector(JNIEnv* env,
+                                                jobjectArray string_array) {
+  int count = env->GetArrayLength(string_array);
+  std::vector<std::string> result;
+  for (int i = 0; i < count; i++) {
+    auto jstr =
+        reinterpret_cast<jstring>(env->GetObjectArrayElement(string_array, i));
+    const char* raw_str = env->GetStringUTFChars(jstr, JNI_FALSE);
+    result.emplace_back(std::string(raw_str));
+    env->ReleaseStringUTFChars(jstr, raw_str);
+  }
+  return result;
+}
+
+struct JNIStorage {
+  std::vector<std::string> backoff_list;
+  std::unique_ptr<::tflite::FlatBufferModel> model;
+};
+
+extern "C" JNIEXPORT jlong JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_loadJNI(
+    JNIEnv* env, jobject thiz, jobject model_buffer,
+    jobjectArray backoff_list) {
+  const char* buf =
+      static_cast<char*>(env->GetDirectBufferAddress(model_buffer));
+  jlong capacity = env->GetDirectBufferCapacity(model_buffer);
+
+  JNIStorage* storage = new JNIStorage;
+  storage->model = tflite::FlatBufferModel::BuildFromBuffer(
+      buf, static_cast<size_t>(capacity));
+  storage->backoff_list = jniStringArrayToVector(env, backoff_list);
+
+  if (!storage->model) {
+    delete storage;
+    env->ThrowNew(env->FindClass(kIllegalStateException), "");
+    return 0;
+  }
+  return reinterpret_cast<jlong>(storage);
+}
+
+extern "C" JNIEXPORT jobjectArray JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_predictJNI(
+    JNIEnv* env, jobject /*thiz*/, jlong storage_ptr, jobjectArray input_text) {
+  // Predict
+  if (storage_ptr == 0) {
+    return nullptr;
+  }
+  JNIStorage* storage = reinterpret_cast<JNIStorage*>(storage_ptr);
+  if (storage == nullptr) {
+    return nullptr;
+  }
+  std::vector<PredictorResponse> responses;
+  GetSegmentPredictions(jniStringArrayToVector(env, input_text),
+                        *storage->model, {storage->backoff_list}, &responses);
+
+  // Create a SmartReply[] to return back to Java
+  jclass smart_reply_class = CheckNotNull(
+      env, env->FindClass("com/example/android/smartreply/SmartReply"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jmethodID smart_reply_ctor = CheckNotNull(
+      env,
+      env->GetMethodID(smart_reply_class, "<init>", "(Ljava/lang/String;F)V"));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  jobjectArray array = CheckNotNull(
+      env, env->NewObjectArray(responses.size(), smart_reply_class, nullptr));
+  if (env->ExceptionCheck()) {
+    return nullptr;
+  }
+  for (int i = 0; i < responses.size(); i++) {
+    jstring text =
+        CheckNotNull(env, env->NewStringUTF(responses[i].GetText().data()));
+    if (env->ExceptionCheck()) {
+      return nullptr;
+    }
+    jobject reply = env->NewObject(smart_reply_class, smart_reply_ctor, text,
+                                   responses[i].GetScore());
+    env->SetObjectArrayElement(array, i, reply);
+  }
+  return array;
+}
+
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_android_smartreply_SmartReplyClient_unloadJNI(
+    JNIEnv* env, jobject thiz, jlong storage_ptr) {
+  if (storage_ptr != 0) {
+    JNIStorage* storage = reinterpret_cast<JNIStorage*>(storage_ptr);
+    delete storage;
+  }
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/g3doc/README.md b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cab5dcca43a31ec3cf824f00d6794ea9e66d9bf8
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/g3doc/README.md
@@ -0,0 +1,146 @@
+# Smart Reply Model
+
+## What is On-Device Smart Reply Model?
+
+Smart Replies are contextually relevant, one-touch responses that help the user
+to reply to an incoming text message (or email) efficiently and effortlessly.
+Smart Replies have been highly successful across several Google products
+including
+[Gmail](https://www.blog.google/products/gmail/save-time-with-smart-reply-in-gmail/),
+[Inbox](https://www.blog.google/products/gmail/computer-respond-to-this-email/)
+and
+[Allo](https://blog.google/products/allo/google-allo-smarter-messaging-app/).
+
+The On-device Smart Reply model is targeted towards text chat use cases. It has
+a completely different architecture from its cloud-based counterparts, and is
+built specifically for memory constraints devices such as phones & watches. It
+has been successfully used to provide [Smart Replies on Android
+Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+to all first- & third-party apps.
+
+The on-device model comes with several benefits. It is:
+
+*   **Faster**: The model resides on the device and does not require internet
+    connectivity. Thus, the inference is very fast and has an average latency of
+    only a few milliseconds.
+*   **Resource efficient**: The model has a small memory footprint on
+    the device.
+*   **Privacy-friendly**: The user data never leaves the device and this
+    eliminates any privacy restrictions.
+
+A caveat, though, is that the on-device model has lower triggering rate than its
+cloud counterparts (triggering rate is the percentage of times the model
+suggests a response for an incoming message).
+
+## When to use this Model?
+
+The On-Device Smart Reply model is aimed towards improving the messaging
+experience for day-to-day conversational chat messages. We recommend using this
+model for similar use cases. Some sample messages on which the model does well
+are provided in this [tsv
+file](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv)
+for reference. The file format is:
+
+```
+   {incoming_message  smart_reply1   [smart_reply2]   [smart_reply3]}
+```
+
+For the current model, we see a triggering rate of about 30-40% for messages
+which are similar to those provided in the tsv file above.
+
+In case the model does not trigger any response, the system falls back to
+suggesting replies from a fixed back-off set that was compiled from popular
+response intents observed in chat conversations. Some of the fallback responses
+are `Ok, Yes, No, 👍, ☺`.
+
+The model can only be used for inference at this time (i.e. it cannot be custom
+trained). If you are interested to know how the model was trained, please refer
+to this [blog
+post](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+and [research paper](https://arxiv.org/pdf/1708.00630).
+
+## How to use this Model?
+
+We have provided a pre-built demo APK that you can download, install and test on
+your phone ([demo APK
+here](http://download.tensorflow.org/deps/tflite/SmartReplyDemo.apk)).
+
+The On-Device Smart Reply demo App works in the following way:
+
+1.  Android app links to the JNI binary with a predictor library.
+
+2.  In the predictor library, `GetSegmentPredictions` is called with a list of input
+    strings.
+
+    2.1 The input string can be 1-3 most recent messages of the conversations in
+    form of string vector. The model will run on these input sentences and
+    provide Smart Replies corresponding to them.
+
+    2.2 The function performs some preprocessing on input data which includes:
+
+    *   Sentence splitting: The input message will be split into sentences if
+        message has more than one sentence. Eg: a message like “How are you?
+        Want to grab lunch?” will be broken down into 2 different sentences.
+    *   Normalization: The individual sentences will be normalized by converting
+        them into lower cases, removing unnecessary punctuations, etc. Eg: “how
+        are you????” will be converted to “how are you?” (refer for NORMALIZE op
+        for more details).
+
+        The input string content will be converted to tensors.
+
+    2.3 The function then runs the prediction model on the input tensors.
+
+    2.4 The function also performs some post-processing which includes
+    aggregating the model predictions for the input sentences from 2.2 and
+    returning the appropriate responses.
+
+3.  Finally, it gets response(s) from `std::vector<PredictorResponse>`, and
+    returns back to Android app. Responses are sorted in descending order of
+    confidence score.
+
+## Ops and Functionality Supported
+
+Following are the ops supported for using On-Device Smart Reply model:
+
+*   **NORMALIZE**
+
+    This is a custom op which normalizes the sentences by:
+
+    *   Converting all sentences into lower case.
+    *   Removing unnecessary punctuations (eg: “how are you????” → “how are
+        you?”).
+    *   Expanding sentences wherever necessary (eg: “ I’m home” → “I am home”).
+
+*   **SKIP_GRAM**
+
+    This is an op inside TensorFlow Lite that converts sentences into a list of
+    skip grams. The configurable parameters are `ngram_size` and
+    `max_skip_size`. For the model provided, the values for these parameters are
+    set to 3 & 2 respectively.
+
+*   **EXTRACT_FEATURES**
+
+    This is a custom op that hashes skip grams to features represented as
+    integers. Longer skip-grams are allocated higher weights.
+
+*   **LSH_PROJECTION**
+
+    This is an op inside TensorFlow Lite that projects input features to a
+    corresponding bit vector space using Locality Sensitive Hashing (LSH).
+
+*   **PREDICT**
+
+    This is a custom op that runs the input features through the projection
+    model (details [here](https://arxiv.org/pdf/1708.00630.pdf)), computes the
+    appropriate response labels along with weights for the projected features,
+    and aggregates the response labels and weights together.
+
+*   **HASHTABLE_LOOKUP**
+
+    This is a custom op that uses label id from predict op and looks up the
+    response text from the given label id.
+
+## Further Information
+
+*   Open source code
+    [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/smartreply/).
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f97a6486d6c11cf0184622f515fe5b1e096c6257
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Convert a list of strings to integers via hashing.
+// Input:
+//     Input[0]: A list of ngrams. string[num of input]
+//
+// Output:
+//     Output[0]: Hashed features. int32[num of input]
+//     Output[1]: Weights. float[num of input]
+
+#include <algorithm>
+#include <map>
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include <farmhash.h>
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+namespace extract {
+
+static const int kMaxDimension = 1000000;
+static const std::vector<string> kBlacklistNgram = {"<S>", "<E>", "<S> <E>"};
+
+bool Equals(const string& x, const tflite::StringRef& strref) {
+  if (strref.len != x.length()) {
+    return false;
+  }
+  if (strref.len > 0) {
+    int r = memcmp(strref.str, x.data(), strref.len);
+    return r == 0;
+  }
+  return true;
+}
+
+bool IsValidNgram(const tflite::StringRef& strref) {
+  for (const auto& s : kBlacklistNgram) {
+    if (Equals(s, strref)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteIntArray* outputSize1 = TfLiteIntArrayCreate(1);
+  TfLiteIntArray* outputSize2 = TfLiteIntArrayCreate(1);
+  TfLiteTensor* input = GetInput(context, node, 0);
+  int dim = input->dims->data[0];
+  if (dim == 0) {
+    // TFLite non-string output should have size greater than 0.
+    dim = 1;
+  }
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteString);
+  outputSize1->data[0] = dim;
+  outputSize2->data[0] = dim;
+  context->ResizeTensor(context, GetOutput(context, node, 0), outputSize1);
+  context->ResizeTensor(context, GetOutput(context, node, 1), outputSize2);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, 0);
+  int num_strings = tflite::GetStringCount(input);
+  TfLiteTensor* label = GetOutput(context, node, 0);
+  TfLiteTensor* weight = GetOutput(context, node, 1);
+
+  std::map<int64_t, int> feature_id_counts;
+  for (int i = 0; i < num_strings; i++) {
+    // Use fingerprint of feature name as id.
+    auto strref = tflite::GetString(input, i);
+    if (!IsValidNgram(strref)) {
+      label->data.i32[i] = 0;
+      weight->data.i32[i] = 0;
+      continue;
+    }
+
+    int64_t feature_id =
+        ::util::Fingerprint64(strref.str, strref.len) % kMaxDimension;
+    label->data.i32[i] = static_cast<int32_t>(feature_id);
+    weight->data.f[i] =
+        std::count(strref.str, strref.str + strref.len, ' ') + 1;
+  }
+  // Explicitly set an empty result to make preceding ops run.
+  if (num_strings == 0) {
+    label->data.i32[0] = 0;
+    weight->data.i32[0] = 0;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace extract
+
+TfLiteRegistration* Register_EXTRACT_FEATURES() {
+  static TfLiteRegistration r = {nullptr, nullptr, extract::Prepare,
+                                 extract::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/extract_feature_test.cc b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b8676bab6e81109b01809e7e332448b05a9fbb5
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/ops/extract_feature_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include <farmhash.h>
+
+namespace tflite {
+
+namespace ops {
+namespace custom {
+TfLiteRegistration* Register_EXTRACT_FEATURES();
+
+namespace {
+
+using ::testing::ElementsAre;
+
+class ExtractFeatureOpModel : public SingleOpModel {
+ public:
+  explicit ExtractFeatureOpModel(const std::vector<string>& input) {
+    input_ = AddInput(TensorType_STRING);
+    signature_ = AddOutput(TensorType_INT32);
+    weight_ = AddOutput(TensorType_FLOAT32);
+
+    SetCustomOp("ExtractFeatures", {}, Register_EXTRACT_FEATURES);
+    BuildInterpreter({{static_cast<int>(input.size())}});
+    PopulateStringTensor(input_, input);
+  }
+
+  std::vector<int> GetSignature() { return ExtractVector<int>(signature_); }
+  std::vector<float> GetWeight() { return ExtractVector<float>(weight_); }
+
+ private:
+  int input_;
+  int signature_;
+  int weight_;
+};
+
+int CalcFeature(const string& str) {
+  return ::util::Fingerprint64(str) % 1000000;
+}
+
+TEST(ExtractFeatureOpTest, RegularInput) {
+  ExtractFeatureOpModel m({"<S>", "<S> Hi", "Hi", "Hi !", "!", "! <E>", "<E>"});
+  m.Invoke();
+  EXPECT_THAT(m.GetSignature(),
+              ElementsAre(0, CalcFeature("<S> Hi"), CalcFeature("Hi"),
+                          CalcFeature("Hi !"), CalcFeature("!"),
+                          CalcFeature("! <E>"), 0));
+  EXPECT_THAT(m.GetWeight(), ElementsAre(0, 2, 1, 2, 1, 2, 0));
+}
+
+TEST(ExtractFeatureOpTest, OneInput) {
+  ExtractFeatureOpModel m({"Hi"});
+  m.Invoke();
+  EXPECT_THAT(m.GetSignature(), ElementsAre(CalcFeature("Hi")));
+  EXPECT_THAT(m.GetWeight(), ElementsAre(1));
+}
+
+TEST(ExtractFeatureOpTest, ZeroInput) {
+  ExtractFeatureOpModel m({});
+  m.Invoke();
+  EXPECT_THAT(m.GetSignature(), ElementsAre(0));
+  EXPECT_THAT(m.GetWeight(), ElementsAre(0));
+}
+
+TEST(ExtractFeatureOpTest, AllBlacklistInput) {
+  ExtractFeatureOpModel m({"<S>", "<E>"});
+  m.Invoke();
+  EXPECT_THAT(m.GetSignature(), ElementsAre(0, 0));
+  EXPECT_THAT(m.GetWeight(), ElementsAre(0, 0));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc b/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c55ac9f52f7293a8ba5baf17f2052e11a7422074
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/ops/normalize.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Normalize the string input.
+//
+// Input:
+//     Input[0]: One sentence. string[1]
+//
+// Output:
+//     Output[0]: Normalized sentence. string[1]
+//
+
+#include <algorithm>
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/strip.h"
+#include "re2/re2.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+namespace normalize {
+
+// Predictor transforms.
+const char kPunctuationsRegex[] = "[.*()\"]";
+
+const std::map<string, string>* kRegexTransforms =
+    new std::map<string, string>({
+        {"([^\\s]+)n't", "\\1 not"},
+        {"([^\\s]+)'nt", "\\1 not"},
+        {"([^\\s]+)'ll", "\\1 will"},
+        {"([^\\s]+)'re", "\\1 are"},
+        {"([^\\s]+)'ve", "\\1 have"},
+        {"i'm", "i am"},
+    });
+
+static const char kStartToken[] = "<S>";
+static const char kEndToken[] = "<E>";
+static const int32_t kMaxInputChars = 300;
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  tflite::StringRef input = tflite::GetString(GetInput(context, node, 0), 0);
+
+  string result(absl::AsciiStrToLower(absl::string_view(input.str, input.len)));
+  absl::StripAsciiWhitespace(&result);
+  // Do not remove commas, semi-colons or colons from the sentences as they can
+  // indicate the beginning of a new clause.
+  RE2::GlobalReplace(&result, kPunctuationsRegex, "");
+  RE2::GlobalReplace(&result, "\\s('t|'nt|n't|'d|'ll|'s|'m|'ve|'re)([\\s,;:/])",
+                     "\\1\\2");
+  RE2::GlobalReplace(&result, "\\s('t|'nt|n't|'d|'ll|'s|'m|'ve|'re)$", "\\1");
+  for (auto iter = kRegexTransforms->begin(); iter != kRegexTransforms->end();
+       iter++) {
+    RE2::GlobalReplace(&result, iter->first, iter->second);
+  }
+
+  // Treat questions & interjections as special cases.
+  RE2::GlobalReplace(&result, "([?])+", "\\1");
+  RE2::GlobalReplace(&result, "([!])+", "\\1");
+  RE2::GlobalReplace(&result, "([^?!]+)([?!])", "\\1 \\2 ");
+  RE2::GlobalReplace(&result, "([?!])([?!])", "\\1 \\2");
+
+  RE2::GlobalReplace(&result, "[\\s,:;\\-&'\"]+$", "");
+  RE2::GlobalReplace(&result, "^[\\s,:;\\-&'\"]+", "");
+  absl::StripAsciiWhitespace(&result);
+
+  // Add start and end token.
+  // Truncate input to maximum allowed size.
+  if (result.length() <= kMaxInputChars) {
+    absl::StrAppend(&result, " ", kEndToken);
+  } else {
+    result = result.substr(0, kMaxInputChars);
+  }
+  result = absl::StrCat(kStartToken, " ", result);
+
+  tflite::DynamicBuffer buf;
+  buf.AddString(result.data(), result.length());
+  buf.WriteToTensor(GetOutput(context, node, 0));
+  return kTfLiteOk;
+}
+
+}  // namespace normalize
+
+TfLiteRegistration* Register_NORMALIZE() {
+  static TfLiteRegistration r = {nullptr, nullptr, nullptr, normalize::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/normalize_test.cc b/tensorflow/contrib/lite/models/smartreply/ops/normalize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d35dba9a64a849d0321c3aa89d89f5bb61b0764
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/ops/normalize_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+
+namespace ops {
+namespace custom {
+TfLiteRegistration* Register_NORMALIZE();
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class NormalizeOpModel : public SingleOpModel {
+ public:
+  explicit NormalizeOpModel(const string& input) {
+    input_ = AddInput(TensorType_STRING);
+    output_ = AddOutput(TensorType_STRING);
+
+    SetCustomOp("Normalize", {}, Register_NORMALIZE);
+    BuildInterpreter({{static_cast<int>(input.size())}});
+    PopulateStringTensor(input_, {input});
+  }
+
+  std::vector<string> GetStringOutput() {
+    TfLiteTensor* output = interpreter_->tensor(output_);
+    int num = GetStringCount(output);
+    std::vector<string> result(num);
+    for (int i = 0; i < num; i++) {
+      auto ref = GetString(output, i);
+      result[i] = string(ref.str, ref.len);
+    }
+    return result;
+  }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(NormalizeOpTest, RegularInput) {
+  NormalizeOpModel m("I'm good; you're welcome");
+  m.Invoke();
+  EXPECT_THAT(m.GetStringOutput(),
+              ElementsAreArray({"<S> i am good; you are welcome <E>"}));
+}
+
+TEST(NormalizeOpTest, OneInput) {
+  NormalizeOpModel m("Hi!!!!");
+  m.Invoke();
+  EXPECT_THAT(m.GetStringOutput(), ElementsAreArray({"<S> hi ! <E>"}));
+}
+
+TEST(NormalizeOpTest, EmptyInput) {
+  NormalizeOpModel m("");
+  m.Invoke();
+  EXPECT_THAT(m.GetStringOutput(), ElementsAreArray({"<S>  <E>"}));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/predict.cc b/tensorflow/contrib/lite/models/smartreply/ops/predict.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7b23adb990cf10d4f0cd5b66cfa40eaa0cc46c41
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/ops/predict.cc
@@ -0,0 +1,174 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Lookup projected hash signatures in Predictor model,
+// output predicted labels and weights in decreasing order.
+//
+// Input:
+//     Input[0]: A list of hash signatures. int32[num of input]
+//     Input[1]: Hash signature keys in the model. int32[keys of model]
+//     Input[2]: Labels in the model. int32[keys of model, item per entry]
+//     Input[3]: Weights in the model. float[keys of model, item per entry]
+//
+// Output:
+//     Output[0]: Predicted labels. int32[num of output]
+//     Output[1]: Predicted weights. float[num of output]
+//
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+
+namespace predict {
+
+struct PredictOption {
+  int32_t num_output;
+  float weight_threshold;
+
+  static PredictOption* Cast(void* ptr) {
+    return reinterpret_cast<PredictOption*>(ptr);
+  }
+};
+
+bool WeightGreater(const std::pair<int32_t, float>& a,
+                   const std::pair<int32_t, float>& b) {
+  return a.second > b.second;
+}
+
+void* Init(TfLiteContext* context, const char* custom_option, size_t length) {
+  if (custom_option == nullptr || length != sizeof(PredictOption)) {
+    fprintf(stderr, "No Custom option set\n");
+    exit(1);
+  }
+  PredictOption* option = new PredictOption;
+  int offset = 0;
+  option->num_output =
+      *reinterpret_cast<const int32_t*>(custom_option + offset);
+  offset += sizeof(int32_t);
+  option->weight_threshold =
+      *reinterpret_cast<const float*>(custom_option + offset);
+  return reinterpret_cast<void*>(option);
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete PredictOption::Cast(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, 2);
+
+  TfLiteTensor* lookup = &context->tensors[node->inputs->data[0]];
+  TfLiteTensor* model_key = &context->tensors[node->inputs->data[1]];
+  TfLiteTensor* model_label = &context->tensors[node->inputs->data[2]];
+  TfLiteTensor* model_weight = &context->tensors[node->inputs->data[3]];
+  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, model_key->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, model_label->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, model_weight->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, lookup->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, model_key->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, model_label->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, model_weight->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, model_key->dims->data[0],
+                    model_label->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, model_key->dims->data[0],
+                    model_weight->dims->data[0]);
+  TF_LITE_ENSURE_EQ(context, model_label->dims->data[1],
+                    model_weight->dims->data[1]);
+
+  PredictOption* option = PredictOption::Cast(node->user_data);
+  TfLiteTensor* output_label = &context->tensors[node->outputs->data[0]];
+  TfLiteTensor* output_weight = &context->tensors[node->outputs->data[1]];
+  TF_LITE_ENSURE_EQ(context, output_label->type, kTfLiteInt32);
+  TF_LITE_ENSURE_EQ(context, output_weight->type, kTfLiteFloat32);
+
+  TfLiteIntArray* label_size = TfLiteIntArrayCreate(1);
+  label_size->data[0] = option->num_output;
+  TfLiteIntArray* weight_size = TfLiteIntArrayCreate(1);
+  weight_size->data[0] = option->num_output;
+  TfLiteStatus status =
+      context->ResizeTensor(context, output_label, label_size);
+  if (status != kTfLiteOk) {
+    return status;
+  }
+  return context->ResizeTensor(context, output_weight, weight_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* lookup = &context->tensors[node->inputs->data[0]];
+  TfLiteTensor* model_key = &context->tensors[node->inputs->data[1]];
+  TfLiteTensor* model_label = &context->tensors[node->inputs->data[2]];
+  TfLiteTensor* model_weight = &context->tensors[node->inputs->data[3]];
+
+  // Aggregate by key
+  std::unordered_map<int32_t, float> aggregation;
+  const int num_input = lookup->dims->data[0];
+  const int num_rows = model_key->dims->data[0];
+  const int items = model_label->dims->data[1];
+  int* model_key_end = model_key->data.i32 + num_rows;
+
+  for (int i = 0; i < num_input; i++) {
+    int* ptr = std::lower_bound(model_key->data.i32, model_key_end,
+                                lookup->data.i32[i]);
+    if (ptr != nullptr && ptr != model_key_end && *ptr == lookup->data.i32[i]) {
+      int idx = ptr - model_key->data.i32;
+      for (int j = 0; j < items; j++) {
+        aggregation[model_label->data.i32[idx * items + j]] +=
+            model_weight->data.f[idx * items + j] / num_input;
+      }
+    }
+  }
+
+  // Sort by value
+  std::vector<std::pair<int32_t, float>> sorted_labels(aggregation.begin(),
+                                                       aggregation.end());
+  std::sort(sorted_labels.begin(), sorted_labels.end(), WeightGreater);
+
+  PredictOption* option = PredictOption::Cast(node->user_data);
+  TfLiteTensor* output_label = &context->tensors[node->outputs->data[0]];
+  TfLiteTensor* output_weight = &context->tensors[node->outputs->data[1]];
+  for (int i = 0; i < output_label->dims->data[0]; i++) {
+    if (i >= sorted_labels.size() ||
+        sorted_labels[i].second < option->weight_threshold) {
+      // Set -1 to avoid lookup message with id 0, which is set for backoff.
+      output_label->data.i32[i] = -1;
+      output_weight->data.f[i] = 0.0f;
+    } else {
+      output_label->data.i32[i] = sorted_labels[i].first;
+      output_weight->data.f[i] = sorted_labels[i].second;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace predict
+
+TfLiteRegistration* Register_PREDICT() {
+  static TfLiteRegistration r = {predict::Init, predict::Free, predict::Prepare,
+                                 predict::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/smartreply/ops/predict_test.cc b/tensorflow/contrib/lite/models/smartreply/ops/predict_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e97c58cbd185023e59c21c93057fd0f094585bf9
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/ops/predict_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+
+namespace ops {
+namespace custom {
+TfLiteRegistration* Register_PREDICT();
+
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class PredictOpModel : public SingleOpModel {
+ public:
+  PredictOpModel(std::initializer_list<int> input_signature_shape,
+                 std::initializer_list<int> key_shape,
+                 std::initializer_list<int> labelweight_shape, int num_output,
+                 float threshold) {
+    input_signature_ = AddInput(TensorType_INT32);
+    model_key_ = AddInput(TensorType_INT32);
+    model_label_ = AddInput(TensorType_INT32);
+    model_weight_ = AddInput(TensorType_FLOAT32);
+    output_label_ = AddOutput(TensorType_INT32);
+    output_weight_ = AddOutput(TensorType_FLOAT32);
+
+    std::vector<uint8_t> predict_option;
+    writeInt32(num_output, &predict_option);
+    writeFloat32(threshold, &predict_option);
+    SetCustomOp("Predict", predict_option, Register_PREDICT);
+    BuildInterpreter({{input_signature_shape, key_shape, labelweight_shape,
+                       labelweight_shape}});
+  }
+
+  void SetInputSignature(std::initializer_list<int> data) {
+    PopulateTensor<int>(input_signature_, data);
+  }
+
+  void SetModelKey(std::initializer_list<int> data) {
+    PopulateTensor<int>(model_key_, data);
+  }
+
+  void SetModelLabel(std::initializer_list<int> data) {
+    PopulateTensor<int>(model_label_, data);
+  }
+
+  void SetModelWeight(std::initializer_list<float> data) {
+    PopulateTensor<float>(model_weight_, data);
+  }
+
+  std::vector<int> GetLabel() { return ExtractVector<int>(output_label_); }
+  std::vector<float> GetWeight() {
+    return ExtractVector<float>(output_weight_);
+  }
+
+  void writeFloat32(float value, std::vector<uint8_t>* data) {
+    union {
+      float v;
+      uint8_t r[4];
+    } float_to_raw;
+    float_to_raw.v = value;
+    for (unsigned char i : float_to_raw.r) {
+      data->push_back(i);
+    }
+  }
+
+  void writeInt32(int32_t value, std::vector<uint8_t>* data) {
+    union {
+      int32_t v;
+      uint8_t r[4];
+    } int32_to_raw;
+    int32_to_raw.v = value;
+    for (unsigned char i : int32_to_raw.r) {
+      data->push_back(i);
+    }
+  }
+
+ private:
+  int input_signature_;
+  int model_key_;
+  int model_label_;
+  int model_weight_;
+  int output_label_;
+  int output_weight_;
+};
+
+TEST(PredictOpTest, AllLabelsAreValid) {
+  PredictOpModel m({4}, {5}, {5, 2}, 2, 0.0001);
+  m.SetInputSignature({1, 3, 7, 9});
+  m.SetModelKey({1, 2, 4, 6, 7});
+  m.SetModelLabel({11, 12, 11, 12, 11, 12, 11, 12, 11, 12});
+  m.SetModelWeight({0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetLabel(), ElementsAreArray({12, 11}));
+  EXPECT_THAT(m.GetWeight(), ElementsAreArray(ArrayFloatNear({0.1, 0.05})));
+}
+
+TEST(PredictOpTest, MoreLabelsThanRequired) {
+  PredictOpModel m({4}, {5}, {5, 2}, 1, 0.0001);
+  m.SetInputSignature({1, 3, 7, 9});
+  m.SetModelKey({1, 2, 4, 6, 7});
+  m.SetModelLabel({11, 12, 11, 12, 11, 12, 11, 12, 11, 12});
+  m.SetModelWeight({0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetLabel(), ElementsAreArray({12}));
+  EXPECT_THAT(m.GetWeight(), ElementsAreArray(ArrayFloatNear({0.1})));
+}
+
+TEST(PredictOpTest, OneLabelDoesNotPassThreshold) {
+  PredictOpModel m({4}, {5}, {5, 2}, 2, 0.07);
+  m.SetInputSignature({1, 3, 7, 9});
+  m.SetModelKey({1, 2, 4, 6, 7});
+  m.SetModelLabel({11, 12, 11, 12, 11, 12, 11, 12, 11, 12});
+  m.SetModelWeight({0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetLabel(), ElementsAreArray({12, -1}));
+  EXPECT_THAT(m.GetWeight(), ElementsAreArray(ArrayFloatNear({0.1, 0})));
+}
+
+TEST(PredictOpTest, NoneLabelPassThreshold) {
+  PredictOpModel m({4}, {5}, {5, 2}, 2, 0.6);
+  m.SetInputSignature({1, 3, 7, 9});
+  m.SetModelKey({1, 2, 4, 6, 7});
+  m.SetModelLabel({11, 12, 11, 12, 11, 12, 11, 12, 11, 12});
+  m.SetModelWeight({0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetLabel(), ElementsAreArray({-1, -1}));
+  EXPECT_THAT(m.GetWeight(), ElementsAreArray(ArrayFloatNear({0, 0})));
+}
+
+TEST(PredictOpTest, OnlyOneLabelGenerated) {
+  PredictOpModel m({4}, {5}, {5, 2}, 2, 0.0001);
+  m.SetInputSignature({1, 3, 7, 9});
+  m.SetModelKey({1, 2, 4, 6, 7});
+  m.SetModelLabel({11, 0, 11, 0, 11, 0, 11, 0, 11, 0});
+  m.SetModelWeight({0.1, 0, 0.1, 0, 0.1, 0, 0.1, 0, 0.1, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetLabel(), ElementsAreArray({11, -1}));
+  EXPECT_THAT(m.GetWeight(), ElementsAreArray(ArrayFloatNear({0.05, 0})));
+}
+
+TEST(PredictOpTest, NoLabelGenerated) {
+  PredictOpModel m({4}, {5}, {5, 2}, 2, 0.0001);
+  m.SetInputSignature({5, 3, 7, 9});
+  m.SetModelKey({1, 2, 4, 6, 7});
+  m.SetModelLabel({11, 0, 11, 0, 11, 0, 11, 0, 0, 0});
+  m.SetModelWeight({0.1, 0, 0.1, 0, 0.1, 0, 0.1, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetLabel(), ElementsAreArray({-1, -1}));
+  EXPECT_THAT(m.GetWeight(), ElementsAreArray(ArrayFloatNear({0, 0})));
+}
+
+}  // namespace
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.cc b/tensorflow/contrib/lite/models/smartreply/predictor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6da5cc8eecc0920850f666b0992c4d9598c55b6c
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+
+#include "absl/strings/str_split.h"
+#include "re2/re2.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+
+void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
+
+namespace tflite {
+namespace custom {
+namespace smartreply {
+
+// Split sentence into segments (using punctuation).
+std::vector<std::string> SplitSentence(const std::string& input) {
+  string result(input);
+
+  RE2::GlobalReplace(&result, "([?.!,])+", " \\1");
+  RE2::GlobalReplace(&result, "([?.!,])+\\s+", "\\1\t");
+  RE2::GlobalReplace(&result, "[ ]+", " ");
+  RE2::GlobalReplace(&result, "\t+$", "");
+
+  return absl::StrSplit(result, '\t');
+}
+
+// Predict with TfLite model.
+void ExecuteTfLite(const std::string& sentence,
+                   ::tflite::Interpreter* interpreter,
+                   std::map<std::string, float>* response_map) {
+  {
+    TfLiteTensor* input = interpreter->tensor(interpreter->inputs()[0]);
+    tflite::DynamicBuffer buf;
+    buf.AddString(sentence.data(), sentence.length());
+    buf.WriteToTensor(input);
+    interpreter->AllocateTensors();
+
+    interpreter->Invoke();
+
+    TfLiteTensor* messages = interpreter->tensor(interpreter->outputs()[0]);
+    TfLiteTensor* confidence = interpreter->tensor(interpreter->outputs()[1]);
+
+    for (int i = 0; i < confidence->dims->data[0]; i++) {
+      float weight = confidence->data.f[i];
+      auto response_text = tflite::GetString(messages, i);
+      if (response_text.len > 0) {
+        (*response_map)[string(response_text.str, response_text.len)] += weight;
+      }
+    }
+  }
+}
+
+void GetSegmentPredictions(
+    const std::vector<std::string>& input,
+    const ::tflite::FlatBufferModel& model, const SmartReplyConfig& config,
+    std::vector<PredictorResponse>* predictor_responses) {
+  // Initialize interpreter
+  std::unique_ptr<::tflite::Interpreter> interpreter;
+  ::tflite::MutableOpResolver resolver;
+  RegisterSelectedOps(&resolver);
+  ::tflite::InterpreterBuilder(model, resolver)(&interpreter);
+
+  if (!model.initialized()) {
+    fprintf(stderr, "Failed to mmap model \n");
+    return;
+  }
+
+  // Execute Tflite Model
+  std::map<std::string, float> response_map;
+  std::vector<std::string> sentences;
+  for (const std::string& str : input) {
+    std::vector<std::string> splitted_str = SplitSentence(str);
+    sentences.insert(sentences.end(), splitted_str.begin(), splitted_str.end());
+  }
+  for (const auto& sentence : sentences) {
+    ExecuteTfLite(sentence, interpreter.get(), &response_map);
+  }
+
+  // Generate the result.
+  for (const auto& iter : response_map) {
+    PredictorResponse prediction(iter.first, iter.second);
+    predictor_responses->emplace_back(prediction);
+  }
+  std::sort(predictor_responses->begin(), predictor_responses->end(),
+            [](const PredictorResponse& a, const PredictorResponse& b) {
+              return a.GetScore() > b.GetScore();
+            });
+
+  // Add backoff response.
+  for (const string& backoff : config.backoff_responses) {
+    if (predictor_responses->size() >= config.num_response) {
+      break;
+    }
+    predictor_responses->push_back({backoff, config.backoff_confidence});
+  }
+}
+
+}  // namespace smartreply
+}  // namespace custom
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor.h b/tensorflow/contrib/lite/models/smartreply/predictor.h
new file mode 100644
index 0000000000000000000000000000000000000000..d17323a3f9a0ea80ad5e215b0a4700e625d0c590
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/predictor.h
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace custom {
+namespace smartreply {
+
+const int kDefaultNumResponse = 10;
+const float kDefaultBackoffConfidence = 1e-4;
+
+class PredictorResponse;
+struct SmartReplyConfig;
+
+// With a given string as input, predict the response with a Tflite model.
+// When config.backoff_response is not empty, predictor_responses will be filled
+// with messagees from backoff response.
+void GetSegmentPredictions(const std::vector<std::string>& input,
+                           const ::tflite::FlatBufferModel& model,
+                           const SmartReplyConfig& config,
+                           std::vector<PredictorResponse>* predictor_responses);
+
+// Data object used to hold a single predictor response.
+// It includes messages, and confidence.
+class PredictorResponse {
+ public:
+  PredictorResponse(const std::string& response_text, float score) {
+    response_text_ = response_text;
+    prediction_score_ = score;
+  }
+
+  // Accessor methods.
+  const std::string& GetText() const { return response_text_; }
+  float GetScore() const { return prediction_score_; }
+
+ private:
+  std::string response_text_ = "";
+  float prediction_score_ = 0.0;
+};
+
+// Configurations for SmartReply.
+struct SmartReplyConfig {
+  // Maximum responses to return.
+  int num_response;
+  // Default confidence for backoff responses.
+  float backoff_confidence;
+  // Backoff responses are used when predicted responses cannot fulfill the
+  // list.
+  const std::vector<std::string>& backoff_responses;
+
+  SmartReplyConfig(std::vector<std::string> backoff_responses)
+      : num_response(kDefaultNumResponse),
+        backoff_confidence(kDefaultBackoffConfidence),
+        backoff_responses(backoff_responses) {}
+};
+
+}  // namespace smartreply
+}  // namespace custom
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_SMARTREPLY_PREDICTOR_H_
diff --git a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97d3c650e21c3cb4bef1db09df93f4bf24f38ba5
--- /dev/null
+++ b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/models/smartreply/predictor.h"
+
+#include <fstream>
+#include <unordered_set>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+#include "tensorflow/contrib/lite/string_util.h"
+
+namespace tflite {
+namespace custom {
+namespace smartreply {
+namespace {
+
+const char kModelName[] = "smartreply_ondevice_model.bin";
+const char kSamples[] = "smartreply_samples.tsv";
+
+MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
+  bool has_expected_response = false;
+  for (const auto &item : *arg) {
+    const string &response = item.GetText();
+    if (expected_response.find(response) != expected_response.end()) {
+      has_expected_response = true;
+      break;
+    }
+  }
+  return has_expected_response;
+}
+
+class PredictorTest : public ::testing::Test {
+ protected:
+  PredictorTest() {
+    model_ = tflite::FlatBufferModel::BuildFromFile(
+        StrCat(TestDataPath(), "/", kModelName).c_str());
+    CHECK(model_);
+  }
+  ~PredictorTest() override {}
+
+  std::unique_ptr<::tflite::FlatBufferModel> model_;
+};
+
+TEST_F(PredictorTest, GetSegmentPredictions) {
+  std::vector<PredictorResponse> predictions;
+
+  GetSegmentPredictions({"Welcome"}, *model_, /*config=*/{{}}, &predictions);
+  EXPECT_GT(predictions.size(), 0);
+
+  float max = 0;
+  for (const auto &item : predictions) {
+    if (item.GetScore() > max) {
+      max = item.GetScore();
+    }
+  }
+
+  EXPECT_GT(max, 0.3);
+  EXPECT_THAT(
+      &predictions,
+      IncludeAnyResponesIn(std::unordered_set<string>({"Thanks very much"})));
+}
+
+TEST_F(PredictorTest, TestTwoSentences) {
+  std::vector<PredictorResponse> predictions;
+
+  GetSegmentPredictions({"Hello", "How are you?"}, *model_, /*config=*/{{}},
+                        &predictions);
+  EXPECT_GT(predictions.size(), 0);
+
+  float max = 0;
+  for (const auto &item : predictions) {
+    if (item.GetScore() > max) {
+      max = item.GetScore();
+    }
+  }
+
+  EXPECT_GT(max, 0.3);
+  EXPECT_THAT(&predictions, IncludeAnyResponesIn(std::unordered_set<string>(
+                                {"Hi, how are you doing?"})));
+}
+
+TEST_F(PredictorTest, TestBackoff) {
+  std::vector<PredictorResponse> predictions;
+
+  GetSegmentPredictions({"你好"}, *model_, /*config=*/{{}}, &predictions);
+  EXPECT_EQ(predictions.size(), 0);
+
+  // Backoff responses are returned in order.
+  GetSegmentPredictions({"你好"}, *model_, /*config=*/{{"Yes", "Ok"}},
+                        &predictions);
+  EXPECT_EQ(predictions.size(), 2);
+  EXPECT_EQ(predictions[0].GetText(), "Yes");
+  EXPECT_EQ(predictions[1].GetText(), "Ok");
+}
+
+TEST_F(PredictorTest, BatchTest) {
+  int total_items = 0;
+  int total_responses = 0;
+  int total_triggers = 0;
+
+  string line;
+  std::ifstream fin(StrCat(TestDataPath(), "/", kSamples));
+  while (std::getline(fin, line)) {
+    const std::vector<string> fields = absl::StrSplit(line, '\t');
+    if (fields.empty()) {
+      continue;
+    }
+
+    // Parse sample file and predict
+    const string &msg = fields[0];
+    std::vector<PredictorResponse> predictions;
+    GetSegmentPredictions({msg}, *model_, /*config=*/{{}}, &predictions);
+
+    // Validate response and generate stats.
+    total_items++;
+    total_responses += predictions.size();
+    if (!predictions.empty()) {
+      total_triggers++;
+    }
+    EXPECT_THAT(&predictions, IncludeAnyResponesIn(std::unordered_set<string>(
+                                  fields.begin() + 1, fields.end())));
+  }
+
+  EXPECT_EQ(total_triggers, total_items);
+  EXPECT_GE(total_responses, total_triggers);
+}
+
+}  // namespace
+}  // namespace smartreply
+}  // namespace custom
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc b/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf95b313f31c2f76046727353a9a7b0658dbf067
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_asr_am_model_test.cc
@@ -0,0 +1,127 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech ASR AM model using TFLite Ops.
+
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "base/logging.h"
+#include "file/base/path.h"
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+
+namespace tflite {
+namespace models {
+
+constexpr int kModelInputTensor = 0;
+constexpr int kLstmLayer1OutputStateTensor = 19;
+constexpr int kLstmLayer1CellStateTensor = 20;
+constexpr int kLstmLayer2OutputStateTensor = 40;
+constexpr int kLstmLayer2CellStateTensor = 41;
+constexpr int kLstmLayer3OutputStateTensor = 61;
+constexpr int kLstmLayer3CellStateTensor = 62;
+constexpr int kLstmLayer4OutputStateTensor = 82;
+constexpr int kLstmLayer4CellStateTensor = 83;
+constexpr int kLstmLayer5OutputStateTensor = 103;
+constexpr int kLstmLayer5CellStateTensor = 104;
+constexpr int kModelOutputTensor = 109;
+
+TEST(SpeechAsrAm, RandomIOTest) {
+  // Read the model.
+  string tflite_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_am_model.tflite");
+  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
+  CHECK(model) << "Failed to mmap model " << tflite_file_path;
+
+  // Initialize the interpreter.
+  ops::builtin::BuiltinOpResolver builtins;
+  std::unique_ptr<Interpreter> interpreter;
+  InterpreterBuilder(*model, builtins)(&interpreter);
+  CHECK(interpreter != nullptr);
+  interpreter->AllocateTensors();
+
+  // Load the input frames.
+  Frames input_frames;
+  const string input_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_am_model_in.csv");
+  ReadFrames(input_file_path, &input_frames);
+
+  // Load the golden output results.
+  Frames output_frames;
+  const string output_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_am_model_out.csv");
+  ReadFrames(output_file_path, &output_frames);
+
+  const int speech_batch_size =
+      interpreter->tensor(kModelInputTensor)->dims->data[0];
+  const int speech_input_size =
+      interpreter->tensor(kModelInputTensor)->dims->data[1];
+  const int speech_output_size =
+      interpreter->tensor(kModelOutputTensor)->dims->data[1];
+
+  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
+  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
+
+  // Clear the LSTM state for layers.
+  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer4OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer4OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer4CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer4CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer5OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer5OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer5CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer5CellStateTensor)->bytes);
+
+
+  for (int i = 0; i < input_frames.size(); i++) {
+    // Feed the input to model.
+    int frame_ptr = 0;
+    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
+      input_ptr[k] = input_frames[i][frame_ptr++];
+    }
+    // Run the model.
+    interpreter->Invoke();
+    // Validate the output.
+    for (int k = 0; k < speech_output_size; k++) {
+      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 5.2e-4);
+    }
+  }
+}
+
+}  // namespace models
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc b/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53f2b66da492f8fe56fa9e234f0951cf61c35037
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_asr_lm_model_test.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech ASR LM model using TFLite Ops.
+
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "base/logging.h"
+#include "file/base/path.h"
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+
+namespace tflite {
+namespace models {
+
+constexpr int kModelInput1Tensor = 0;
+constexpr int kModelInput2Tensor = 66;
+constexpr int kLstmLayer1OutputStateTensor = 21;
+constexpr int kLstmLayer1CellStateTensor = 22;
+constexpr int kLstmLayer2OutputStateTensor = 42;
+constexpr int kLstmLayer2CellStateTensor = 43;
+constexpr int kLstmLayer3OutputStateTensor = 63;
+constexpr int kLstmLayer3CellStateTensor = 64;
+constexpr int kModelOutputTensor = 75;
+
+static void ClearLstmStates(Interpreter* interpreter) {
+  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
+}
+
+TEST(SpeechAsrLm, EndToEndTest) {
+  // Read the model.
+  string tflite_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_lm_model.tflite");
+  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
+  CHECK(model) << "Failed to mmap model " << tflite_file_path;
+
+  // Initialize the interpreter.
+  ops::builtin::BuiltinOpResolver builtins;
+  std::unique_ptr<Interpreter> interpreter;
+  InterpreterBuilder(*model, builtins)(&interpreter);
+  CHECK(interpreter != nullptr);
+  interpreter->AllocateTensors();
+
+  // Load the input frames.
+  Frames input_frames;
+  const string input_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_lm_model_in.csv");
+  ReadFrames(input_file_path, &input_frames);
+
+  // Load the golden output results.
+  Frames output_frames;
+  const string output_file_path =
+      file::JoinPath(TestDataPath(), "speech_asr_lm_model_out.csv");
+  ReadFrames(output_file_path, &output_frames);
+
+  CHECK_EQ(interpreter->tensor(kModelInput1Tensor)->dims->size, 1);
+  const int input1_size =
+      interpreter->tensor(kModelInput1Tensor)->dims->data[0];
+  CHECK_EQ(input1_size, 1);
+  CHECK_EQ(interpreter->tensor(kModelInput2Tensor)->dims->size, 1);
+  const int output_size =
+      interpreter->tensor(kModelOutputTensor)->dims->data[0];
+  CHECK_EQ(output_size, 1);
+
+  int* input_lookup_ptr = interpreter->tensor(kModelInput1Tensor)->data.i32;
+  int* output_lookup_ptr = interpreter->tensor(kModelInput2Tensor)->data.i32;
+  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
+
+
+  for (int i = 0; i < input_frames.size(); i++) {
+    float output_score = 0.0f;
+    // Reset LSTM states for each sequence.
+    ClearLstmStates(interpreter.get());
+    // For subsequent inputs feed them sequentially, one-by-one.
+    for (int k = 1; k < input_frames[i].size(); k++) {
+      // Feed the inputs to model.
+      input_lookup_ptr[0] = static_cast<int32>(input_frames[i][k - 1]);
+      output_lookup_ptr[0] = static_cast<int32>(input_frames[i][k]);
+      // Run the model.
+      interpreter->Invoke();
+      // Sum up the outputs.
+      output_score += output_ptr[0];
+    }
+    // Validate the output.
+    ASSERT_NEAR(output_score, output_frames[i][0], 1.4e-5);
+  }
+}
+
+}  // namespace models
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_hotword_model_test.cc b/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f69cae8d2cb08678f9eec8c9b9d653cfce55bd2e
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_hotword_model_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech Hotword model using TFLite Ops.
+
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "base/logging.h"
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+
+namespace tflite {
+namespace models {
+
+void RunTest(int model_input_tensor, int svdf_layer_state_tensor,
+             int model_output_tensor, const string& model_name,
+             const string& golden_in_name, const string& golden_out_name) {
+  // Read the model.
+  string tflite_file_path = StrCat(TestDataPath(), "/", model_name);
+  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
+  CHECK(model) << "Failed to read model from file " << tflite_file_path;
+
+  // Initialize the interpreter.
+  ops::builtin::BuiltinOpResolver builtins;
+  std::unique_ptr<Interpreter> interpreter;
+  InterpreterBuilder(*model, builtins)(&interpreter);
+  CHECK(interpreter != nullptr);
+  interpreter->AllocateTensors();
+
+  // Reset the SVDF layer state.
+  memset(interpreter->tensor(svdf_layer_state_tensor)->data.raw, 0,
+         interpreter->tensor(svdf_layer_state_tensor)->bytes);
+
+  // Load the input frames.
+  Frames input_frames;
+  const string input_file_path = StrCat(TestDataPath(), "/", golden_in_name);
+  ReadFrames(input_file_path, &input_frames);
+
+  // Load the golden output results.
+  Frames output_frames;
+  const string output_file_path = StrCat(TestDataPath(), "/", golden_out_name);
+  ReadFrames(output_file_path, &output_frames);
+
+  const int speech_batch_size =
+      interpreter->tensor(model_input_tensor)->dims->data[0];
+  const int speech_input_size =
+      interpreter->tensor(model_input_tensor)->dims->data[1];
+  const int speech_output_size =
+      interpreter->tensor(model_output_tensor)->dims->data[1];
+  const int input_sequence_size =
+      input_frames[0].size() / (speech_input_size * speech_batch_size);
+  float* input_ptr = interpreter->tensor(model_input_tensor)->data.f;
+  float* output_ptr = interpreter->tensor(model_output_tensor)->data.f;
+
+  // The first layer (SVDF) input size is 40 (speech_input_size). Each speech
+  // input frames for this model is 1600 floats, which can be fed to input in a
+  // sequence of size 40 (input_sequence_size).
+  for (int i = 0; i < TestInputSize(input_frames); i++) {
+    int frame_ptr = 0;
+    for (int s = 0; s < input_sequence_size; s++) {
+      for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
+        input_ptr[k] = input_frames[i][frame_ptr++];
+      }
+      interpreter->Invoke();
+    }
+    // After the whole frame (1280 floats) is fed, we can check the output frame
+    // matches with the golden output frame.
+    for (int k = 0; k < speech_output_size; k++) {
+      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
+    }
+  }
+}
+
+TEST(SpeechHotword, OkGoogleTestRank1) {
+  constexpr int kModelInputTensor = 0;
+  constexpr int kSvdfLayerStateTensor = 4;
+  constexpr int kModelOutputTensor = 18;
+
+  RunTest(kModelInputTensor, kSvdfLayerStateTensor, kModelOutputTensor,
+          "speech_hotword_model_rank1.tflite", "speech_hotword_model_in.csv",
+          "speech_hotword_model_out_rank1.csv");
+}
+
+TEST(SpeechHotword, OkGoogleTestRank2) {
+  constexpr int kModelInputTensor = 17;
+  constexpr int kSvdfLayerStateTensor = 1;
+  constexpr int kModelOutputTensor = 18;
+  RunTest(kModelInputTensor, kSvdfLayerStateTensor, kModelOutputTensor,
+          "speech_hotword_model_rank2.tflite", "speech_hotword_model_in.csv",
+          "speech_hotword_model_out_rank2.csv");
+}
+
+}  // namespace models
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc b/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9da0fb1fc62360dcf584c4a08f99b0cef9964a0d
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech SpeakerId model using TFLite Ops.
+
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "base/logging.h"
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+
+void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
+
+namespace tflite {
+namespace models {
+
+constexpr int kModelInputTensor = 0;
+constexpr int kLstmLayer1OutputStateTensor = 19;
+constexpr int kLstmLayer1CellStateTensor = 20;
+constexpr int kLstmLayer2OutputStateTensor = 40;
+constexpr int kLstmLayer2CellStateTensor = 41;
+constexpr int kLstmLayer3OutputStateTensor = 61;
+constexpr int kLstmLayer3CellStateTensor = 62;
+constexpr int kModelOutputTensor = 66;
+
+TEST(SpeechSpeakerId, OkGoogleTest) {
+  // Read the model.
+  string tflite_file_path =
+      StrCat(TestDataPath(), "/", "speech_speakerid_model.tflite");
+  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
+  CHECK(model) << "Failed to read model from file " << tflite_file_path;
+
+  // Initialize the interpreter.
+  ::tflite::MutableOpResolver resolver;
+  RegisterSelectedOps(&resolver);
+  std::unique_ptr<Interpreter> interpreter;
+  InterpreterBuilder(*model, resolver)(&interpreter);
+  CHECK(interpreter != nullptr);
+  interpreter->AllocateTensors();
+
+  // Load the input frames.
+  Frames input_frames;
+  const string input_file_path =
+      StrCat(TestDataPath(), "/", "speech_speakerid_model_in.csv");
+  ReadFrames(input_file_path, &input_frames);
+
+  // Load the golden output results.
+  Frames output_frames;
+  const string output_file_path =
+      StrCat(TestDataPath(), "/", "speech_speakerid_model_out.csv");
+  ReadFrames(output_file_path, &output_frames);
+
+  const int speech_batch_size =
+      interpreter->tensor(kModelInputTensor)->dims->data[0];
+  const int speech_input_size =
+      interpreter->tensor(kModelInputTensor)->dims->data[1];
+  const int speech_output_size =
+      interpreter->tensor(kModelOutputTensor)->dims->data[1];
+
+  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
+  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
+
+  // Clear the LSTM state for layers.
+  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
+  for (int i = 0; i < input_frames.size(); i++) {
+    // Feed the input to model.
+    int frame_ptr = 0;
+    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
+      input_ptr[k] = input_frames[i][frame_ptr++];
+    }
+    // Run the model.
+    interpreter->Invoke();
+    // Validate the output.
+    for (int k = 0; k < speech_output_size; k++) {
+      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
+    }
+  }
+}
+
+}  // namespace models
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/speech_tts_model_test.cc b/tensorflow/contrib/lite/models/speech_tts_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88291776892f3186ca5bfc726e814f8d23d73b11
--- /dev/null
+++ b/tensorflow/contrib/lite/models/speech_tts_model_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Unit test for speech TTS model using TFLite Ops.
+
+#include <string.h>
+
+#include <memory>
+#include <string>
+
+#include "base/logging.h"
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/models/test_utils.h"
+
+namespace tflite {
+namespace models {
+
+constexpr int kModelInputTensor = 0;
+constexpr int kLstmLayer1OutputStateTensor = 25;
+constexpr int kLstmLayer1CellStateTensor = 26;
+constexpr int kLstmLayer2OutputStateTensor = 46;
+constexpr int kLstmLayer2CellStateTensor = 47;
+constexpr int kLstmLayer3OutputStateTensor = 67;
+constexpr int kLstmLayer3CellStateTensor = 68;
+constexpr int kRnnLayerHiddenStateTensor = 73;
+constexpr int kModelOutputTensor = 74;
+
+TEST(SpeechTTS, RandomIOTest) {
+  // Read the model.
+  string tflite_file_path =
+      StrCat(TestDataPath(), "/", "speech_tts_model.tflite");
+  auto model = FlatBufferModel::BuildFromFile(tflite_file_path.c_str());
+  CHECK(model) << "Failed to mmap model " << tflite_file_path;
+
+  // Initialize the interpreter.
+  ops::builtin::BuiltinOpResolver builtins;
+  std::unique_ptr<Interpreter> interpreter;
+  InterpreterBuilder(*model, builtins)(&interpreter);
+  CHECK(interpreter != nullptr);
+  interpreter->AllocateTensors();
+
+  // Load the input frames.
+  Frames input_frames;
+  const string input_file_path =
+      StrCat(TestDataPath(), "/", "speech_tts_model_in.csv");
+  ReadFrames(input_file_path, &input_frames);
+
+  // Load the golden output results.
+  Frames output_frames;
+  const string output_file_path =
+      StrCat(TestDataPath(), "/", "speech_tts_model_out.csv");
+  ReadFrames(output_file_path, &output_frames);
+
+  const int speech_batch_size =
+      interpreter->tensor(kModelInputTensor)->dims->data[0];
+  const int speech_input_size =
+      interpreter->tensor(kModelInputTensor)->dims->data[1];
+  const int speech_output_size =
+      interpreter->tensor(kModelOutputTensor)->dims->data[1];
+
+  float* input_ptr = interpreter->tensor(kModelInputTensor)->data.f;
+  float* output_ptr = interpreter->tensor(kModelOutputTensor)->data.f;
+
+  // Clear the LSTM state for layers.
+  memset(interpreter->tensor(kLstmLayer1OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer1CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer1CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer2OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer2CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer2CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kLstmLayer3OutputStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3OutputStateTensor)->bytes);
+  memset(interpreter->tensor(kLstmLayer3CellStateTensor)->data.raw, 0,
+         interpreter->tensor(kLstmLayer3CellStateTensor)->bytes);
+
+  memset(interpreter->tensor(kRnnLayerHiddenStateTensor)->data.raw, 0,
+         interpreter->tensor(kRnnLayerHiddenStateTensor)->bytes);
+
+  for (int i = 0; i < input_frames.size(); i++) {
+    // Feed the input to model.
+    int frame_ptr = 0;
+    for (int k = 0; k < speech_input_size * speech_batch_size; k++) {
+      input_ptr[k] = input_frames[i][frame_ptr++];
+    }
+    // Run the model.
+    interpreter->Invoke();
+    // Validate the output.
+    for (int k = 0; k < speech_output_size; k++) {
+      ASSERT_NEAR(output_ptr[k], output_frames[i][k], 1e-5);
+    }
+  }
+}
+
+}  // namespace models
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/models/test_utils.h b/tensorflow/contrib/lite/models/test_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e14c26a3544ed44f9395ff3b59a70551a1a6394
--- /dev/null
+++ b/tensorflow/contrib/lite/models/test_utils.h
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tflite {
+namespace models {
+using Frames = std::vector<std::vector<float>>;
+}  // namespace models
+}  // namespace tflite
+
+#ifndef __ANDROID__
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/platform/test.h"
+
+inline string TestDataPath() {
+  return string(StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
+                       "contrib/lite/models/testdata/"));
+}
+inline int TestInputSize(const tflite::models::Frames& input_frames) {
+  return input_frames.size();
+}
+#else
+inline string TestDataPath() {
+  return string("third_party/tensorflow/contrib/lite/models/testdata/");
+}
+
+inline int TestInputSize(const tflite::models::Frames& input_frames) {
+  // Android TAP is very slow, we only test the first 20 frames.
+  return 20;
+}
+#endif
+
+namespace tflite {
+namespace models {
+
+// Read float data from a comma-separated file:
+// Each line will be read into a float vector.
+// The return result will be a vector of float vectors.
+void ReadFrames(const string& csv_file_path, Frames* frames) {
+  std::ifstream csv_file(csv_file_path);
+  string line;
+  while (std::getline(csv_file, line, '\n')) {
+    std::vector<float> fields;
+    // Used by strtok_r internaly for successive calls on the same string.
+    char* save_ptr = nullptr;
+
+    // Tokenize the line.
+    char* next_token =
+        strtok_r(const_cast<char*>(line.c_str()), ",", &save_ptr);
+    while (next_token != nullptr) {
+      float f = strtod(next_token, nullptr);
+      fields.push_back(f);
+      next_token = strtok_r(nullptr, ",", &save_ptr);
+    }
+    frames->push_back(fields);
+  }
+  csv_file.close();
+}
+
+}  // namespace models
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_MODELS_TEST_UTILS_H_
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/README.md b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..46b24248f002b8a1a30a2ac614c3874dfd2207db
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/README.md
@@ -0,0 +1,123 @@
+## Speech Model Tests
+
+Sample test data has been provided for speech related models in Tensorflow Lite
+to help users working with speech models to verify and test their models.
+
+For the hotword, speaker-id and automatic speech recognition sample models, the
+architecture assumes that the models receive their input from a speech
+pre-processing module. The speech pre-processing module receives the audio
+signal and produces features for the encoder neural network and uses some
+typical signal processing algorithms, like FFT and spectral subtraction, and
+ultimately produces a log-mel filterbank (the log of the triangular mel filters
+applied to the power spectra). The text-to-speech model assumes that the inputs
+are linguistic features describing characteristics of phonemes, syllables,
+words, phrases, and sentence. The outputs are acoustic features including
+mel-cepstral coefficients, log fundamental frequency, and band aperiodicity.
+The pre-processing modules for these models are not provided in the open source
+version of TensorFlow Lite.
+
+The following sections describe the architecture of the sample models at a high
+level:
+
+### Hotword Model
+
+The hotword model is the neural network model we use for keyphrase/hotword
+spotting (i.e. "okgoogle" detection). It is the entry point for voice
+interaction (e.g. Google search app on Android devices or Google Home, etc.).
+The speech hotword model block diagram is shown in Figure below. It has an input
+size of 40 (float), an output size of 7 (float), one Svdf layer, and four fully
+connected layers with the corresponding parameters as shown in figure below.
+
+![hotword_model](hotword.svg "Hotword model")
+
+### Speaker-id Model
+
+The speaker-id model is the neural network model we use for speaker
+verification. It runs after the hotword triggers. The speech speaker-id model
+block diagram is shown in Figure below. It has an input size of 80 (float), an
+output size of 64 (float), three Lstm layers, and one fully connected layers
+with the corresponding parameters as shown in figure below.
+
+![speakerid_model](speakerid.svg "Speaker-id model")
+
+### Text-to-speech (TTS) Model
+
+The text-to-speech model is the neural network model used to generate speech
+from text. The speech text-to-speech model’s block diagram is shown
+in Figure below. It has and input size of 334 (float), an output size of 196
+(float), two fully connected layers, three Lstm layers, and one recurrent layer
+with the corresponding parameters as shown in the figure.
+
+![tts_model](tts.svg "TTS model")
+
+### Automatic Speech Recognizer (ASR) Acoustic Model (AM)
+
+The acoustic model for automatic speech recognition is the neural network model
+for matching phonemes to the input autio features. It generates posterior
+probabilities of phonemes from speech frontend features (log-mel filterbanks).
+It has an input size of 320 (float), an output size of 42 (float), five LSTM
+layers and one fully connected layers with a Softmax activation function, with
+the corresponding parameters as shown in the figure.
+
+![asr_am_model](asr_am.svg "ASR AM model")
+
+### Automatic Speech Recognizer (ASR) Language Model (LM)
+
+The language model for automatic speech recognition is the neural network model
+for predicting the probability of a word given previous words in a sentence.
+It generates posterior probabilities of the next word based from a sequence of
+words. The words are encoded as indices in a fixed size dictionary.
+The model has two inputs both of size one (integer): the current word index and
+next word index, an output size of one (float): the log probability. It consits
+of three embedding layer, three LSTM layers, followed by a multiplication, a
+fully connected layers and an addition.
+The corresponding parameters as shown in the figure.
+
+![asr_lm_model](asr_lm.svg "ASR LM model")
+
+## Speech models test input/output generation
+
+As mentioned above the input to models are generated from a pre-processing
+module (output of a log-mel filterbank, or linguistic features), and the outputs
+are generated by running the equivalent TensorFlow model by feeding them the
+same input.
+
+## Link to the open source code
+
+### Models:
+
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+
+[ASR AM
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
+
+### Test benches
+
+[Speech hotword model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_hotword_model_test.cc)
+
+[Speaker-id model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_speakerid_model_test.cc)
+
+[TTS model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_tts_model_test.cc)
+
+[ASR AM model
+test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/models/speech_terse_am_model_test.cc)
+
+## Android Support
+The models have been tested on Android phones, using the following tests:
+
+[Hotword] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/android/BUILD?rcl=172930882&l=25)
+
+[Speaker-id] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/android/BUILD?rcl=172930882&l=36)
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg b/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg
new file mode 100644
index 0000000000000000000000000000000000000000..9f841c219b1ff247231939106d0a6ba47bf6d305
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/asr_am.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 703.0 722.8005249343832" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l703.0 0l0 722.80054l-703.0 0l0 -722.80054z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l703.0 0l0 722.80054l-703.0 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m256.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m256.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m268.43954 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.6676636 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375732 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm6.8439026 0.28125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 102.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 102.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 128.94362l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944519 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 154.72487l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m249.80052 657.01575l180.00002 0l0 42.11023l-180.00002 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m249.80052 657.01575l180.00002 0l0 42.11023l-180.00002 0z" fill-rule="evenodd"></path><path fill="#000000" d="m266.3206 677.3107q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433289 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187653 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.797577 3.171875l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569824 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 161.01575l0 24.724411" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 161.01575l0 18.724411" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 179.74016l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 244.72906l0 25.29132" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 244.72906l0 19.291351" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 264.02042l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.00787 72.81108l0.09448242 29.196846" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.00787 72.81108l0.07507324 23.196877" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.4312 96.013306l1.6664124 4.5327225l1.6370544 -4.543419z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 526.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 526.4199l232.18896 0l0 42.11029l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m236.43524 553.33997l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.53659 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637146 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm15.328125 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.797577 3.171875l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569824 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 413.32974l0 24.125977" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 413.3297l0 18.126007" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 431.45572l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 329.01575l0 25.322845" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 329.01575l0 19.322845" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 348.3386l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 496.44235l0 29.984283" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 496.44238l0 23.984253" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.44275 520.42664l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 185.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 185.73694l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 212.65694l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 238.43819l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 270.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 270.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 296.94363l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.931427 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.199646 7.59375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 322.72488l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 354.33762l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 354.33762l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 381.2576l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm20.275177 0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm13.855896 8.78125q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 407.03885l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m223.0 437.45026l232.18896 0l0 58.992096l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m223.0 437.45026l232.18896 0l0 58.992096l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m265.82367 464.37024l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.915802 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm19.215271 7.5625q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375732 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016357 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.375702 -3.140625q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656982 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m259.9714 490.1515l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844452 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019806 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426636 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5042114 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281952 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.578827 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm18.210358 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -5.09375q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.656952 0q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m269.46194 594.54596l140.06299 0l0 42.11023l-140.06299 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m269.46194 594.54596l140.06299 0l0 42.11023l-140.06299 0z" fill-rule="evenodd"></path><path fill="#000000" d="m306.13754 617.09094l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm12.209198 -0.546875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.688232 4.921875l0 -8.546875l-1.484375 0l0 -1.3125l1.484375 0l0 -1.046875q0 -0.984375 0.171875 -1.46875q0.234375 -0.65625 0.84375 -1.046875q0.609375 -0.40625 1.703125 -0.40625q0.703125 0 1.5624695 0.15625l-0.25 1.46875q-0.5155945 -0.09375 -0.9843445 -0.09375q-0.765625 0 -1.078125 0.328125q-0.3125 0.3125 -0.3125 1.203125l0 0.90625l1.921875 0l0 1.3125l-1.921875 0l0 8.546875l-1.65625 0zm8.433289 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 1.5l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm21.978302 -1.21875q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm2.9694824 4.9375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m339.09448 568.5302l0.40945435 26.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.09448 568.5302l0.31503296 20.01648" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m337.758 588.5727l1.7229309 4.5115356l1.5801086 -4.5635376z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m339.49344 636.6562l0.31497192 20.346436" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m339.49344 636.6562l0.22210693 14.347168" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m338.064 651.02893l1.7217712 4.511963l1.5812988 -4.5631104z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg b/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg
new file mode 100644
index 0000000000000000000000000000000000000000..2662f772693197ed21197463175961bf9b65a1f4
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/asr_lm.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 742.6010498687664 753.6010498687664" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l742.6011 0l0 753.6011l-742.6011 0l0 -753.6011z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l742.6011 0l0 753.6011l-742.6011 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m136.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m136.0 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m153.6274 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.667679 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375717 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313217 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578842 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm7.355179 1.5l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.07463 -2.125l-8.968735 0l0 -1.5625l8.968735 0l0 1.5625zm0 4.125l-8.968735 0l0 -1.546875l8.968735 0l0 1.546875zm13.125153 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 180.96326l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 180.96326l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m151.01154 207.88326l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844467 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880356 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672592 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860092 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.95384216 1.609375l3.5937347 -5.125l-3.3281097 -4.734375l2.09375 0l1.5156097 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.5937347 3.890625l-2.015625 0zm16.26561 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.750732 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078857 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m145.15926 233.6645l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844467 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019821 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.95311 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.4218597 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.2812347 -1.375 3.3281097 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437347 0q0.09375 1.625 0.92185974 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.4843597 -2.703125l5.4999847 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78123474 0.765625 -0.85935974 2.046875zm9.578842 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m129.09448 653.0184l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m129.09448 653.0184l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m150.8024 673.31335q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433304 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578842 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270538 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313217 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578842 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm13.125153 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 239.95538l0 21.543304" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 239.95538l0 15.543304" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 255.49869l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 320.48557l0 21.543304" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 320.48557l0 15.543304" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 336.02887l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.00787 72.81108l0.09448242 25.732285" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.00787 72.81108l0.07246399 19.732315" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.4286 92.54946l1.668396 4.5320053l1.6350555 -4.544136z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 401.01575l0 19.40158" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 401.01575l0 13.401581" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 414.41733l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 261.49344l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 261.49344l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m145.82367 288.41342l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697052 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672607 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860077 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.9538574 1.609375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672577 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.156952 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m145.15926 314.19467l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844467 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019821 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.95311 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.4218597 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.2812347 -1.375 3.3281097 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437347 0q0.09375 1.625 0.92185974 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.4843597 -2.703125l5.4999847 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78123474 0.765625 -0.85935974 2.046875zm9.578842 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 342.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 342.02362l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m145.82367 368.94363l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844482 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.8803406 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.931427 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.199646 7.59375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672607 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860077 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.9538574 1.609375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.96875 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm16.672577 3.5625l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.156952 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" d="m145.15926 394.72488l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.844467 4.875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm5.603302 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 -6.734375l0 -1.9375l1.65625 0l0 1.9375l-1.65625 0zm-2.125 15.484375l0.3125 -1.421875q0.5 0.125 0.796875 0.125q0.515625 0 0.765625 -0.34375q0.25 -0.328125 0.25 -1.6875l0 -10.359375l1.65625 0l0 10.390625q0 1.828125 -0.46875 2.546875q-0.59375 0.921875 -2.0 0.921875q-0.671875 0 -1.3125 -0.171875zm13.019821 -7.0l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.887146 -2.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2541962 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.95311 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.4218597 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.2812347 -1.375 3.3281097 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.3437347 0q0.09375 1.625 0.92185974 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.4843597 -2.703125l5.4999847 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78123474 0.765625 -0.85935974 2.046875zm9.578842 -2.078125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm0 7.953125l0 -1.90625l1.90625 0l0 1.90625l-1.90625 0zm9.444733 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 618.4042l0 34.614197" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 618.4042l0 28.614197" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 647.0184l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m103.0 98.54593l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m103.0 98.54593l232.18896 0l0 58.992126l-232.18896 0z" fill-rule="evenodd"></path><path fill="#000000" d="m143.32318 125.46593l0 -13.59375l9.84375 0l0 1.59375l-8.046875 0l0 4.171875l7.53125 0l0 1.59375l-7.53125 0l0 4.625l8.359375 0l0 1.609375l-10.15625 0zm12.193573 0l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm17.087677 0l-1.546875 0l0 -13.59375l1.65625 0l0 4.84375q1.0625 -1.328125 2.703125 -1.328125q0.90625 0 1.71875 0.375q0.8125 0.359375 1.328125 1.03125q0.53125 0.65625 0.828125 1.59375q0.296875 0.9375 0.296875 2.0q0 2.53125 -1.25 3.921875q-1.25 1.375 -3.0 1.375q-1.75 0 -2.734375 -1.453125l0 1.234375zm-0.015625 -5.0q0 1.765625 0.46875 2.5625q0.796875 1.28125 2.140625 1.28125q1.09375 0 1.890625 -0.9375q0.796875 -0.953125 0.796875 -2.84375q0 -1.921875 -0.765625 -2.84375q-0.765625 -0.921875 -1.84375 -0.921875q-1.09375 0 -1.890625 0.953125q-0.796875 0.953125 -0.796875 2.75zm15.594467 1.828125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500717 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656967 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281967 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129196 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.078842 0.8125l1.609375 0.25q0.109375 0.75 0.578125 1.09375q0.609375 0.453125 1.6875 0.453125q1.171875 0 1.796875 -0.46875q0.625 -0.453125 0.859375 -1.28125q0.125 -0.515625 0.109375 -2.15625q-1.09375 1.296875 -2.71875 1.296875q-2.03125 0 -3.15625 -1.46875q-1.109375 -1.46875 -1.109375 -3.515625q0 -1.40625 0.515625 -2.59375q0.515625 -1.203125 1.484375 -1.84375q0.96875 -0.65625 2.265625 -0.65625q1.75 0 2.875 1.40625l0 -1.1875l1.546875 0l0 8.515625q0 2.3125 -0.46875 3.265625q-0.46875 0.96875 -1.484375 1.515625q-1.015625 0.5625 -2.5 0.5625q-1.765625 0 -2.859375 -0.796875q-1.078125 -0.796875 -1.03125 -2.390625zm1.375 -5.921875q0 1.953125 0.765625 2.84375q0.78125 0.890625 1.9375 0.890625q1.140625 0 1.921875 -0.890625q0.78125 -0.890625 0.78125 -2.78125q0 -1.8125 -0.8125 -2.71875q-0.796875 -0.921875 -1.921875 -0.921875q-1.109375 0 -1.890625 0.90625q-0.78125 0.890625 -0.78125 2.671875zm14.449646 5.109375l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm3.5510712 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656967 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.297607 4.921875l0 -13.59375l1.671875 0l0 7.75l3.953125 -4.015625l2.15625 0l-3.765625 3.65625l4.140625 6.203125l-2.0625 0l-3.25 -5.03125l-1.171875 1.125l0 3.90625l-1.671875 0zm15.765625 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.922577 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625z" fill-rule="nonzero"></path><path fill="#000000" d="m176.34024 151.46593q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm8.531967 0.8125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.625717 0.453125l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm5.860092 1.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.688217 0.328125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm9.719467 3.59375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.750717 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078827 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 157.53806l0 23.433075" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 157.53806l0 17.433075" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 174.97113l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m395.48425 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m395.48425 30.700842l166.01575 0l0 42.110237l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m413.11163 57.620842l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.667694 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.839569 -0.109375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm13.125122 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.6413574 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 411.97638l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 411.97638l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path fill="#000000" d="m402.72214 438.89636l0 -13.59375l9.84375 0l0 1.59375l-8.046875 0l0 4.171875l7.53125 0l0 1.59375l-7.53125 0l0 4.625l8.359375 0l0 1.609375l-10.15625 0zm12.193573 0l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm17.087677 0l-1.546875 0l0 -13.59375l1.65625 0l0 4.84375q1.0625 -1.328125 2.703125 -1.328125q0.90625 0 1.71875 0.375q0.8125 0.359375 1.328125 1.03125q0.53125 0.65625 0.828125 1.59375q0.296875 0.9375 0.296875 2.0q0 2.53125 -1.25 3.921875q-1.25 1.375 -3.0 1.375q-1.75 0 -2.734375 -1.453125l0 1.234375zm-0.015625 -5.0q0 1.765625 0.46875 2.5625q0.796875 1.28125 2.140625 1.28125q1.09375 0 1.890625 -0.9375q0.796875 -0.953125 0.796875 -2.84375q0 -1.921875 -0.765625 -2.84375q-0.765625 -0.921875 -1.84375 -0.921875q-1.09375 0 -1.890625 0.953125q-0.796875 0.953125 -0.796875 2.75zm15.594452 1.828125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656952 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281982 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129181 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.078857 0.8125l1.609375 0.25q0.109375 0.75 0.578125 1.09375q0.609375 0.453125 1.6875 0.453125q1.171875 0 1.796875 -0.46875q0.625 -0.453125 0.859375 -1.28125q0.125 -0.515625 0.109375 -2.15625q-1.09375 1.296875 -2.71875 1.296875q-2.03125 0 -3.15625 -1.46875q-1.109375 -1.46875 -1.109375 -3.515625q0 -1.40625 0.515625 -2.59375q0.515625 -1.203125 1.484375 -1.84375q0.96875 -0.65625 2.265625 -0.65625q1.75 0 2.875 1.40625l0 -1.1875l1.546875 0l0 8.515625q0 2.3125 -0.46875 3.265625q-0.46875 0.96875 -1.484375 1.515625q-1.015625 0.5625 -2.5 0.5625q-1.765625 0 -2.859375 -0.796875q-1.078125 -0.796875 -1.03125 -2.390625zm1.375 -5.921875q0 1.953125 0.765625 2.84375q0.78125 0.890625 1.9375 0.890625q1.140625 0 1.921875 -0.890625q0.78125 -0.890625 0.78125 -2.78125q0 -1.8125 -0.8125 -2.71875q-0.796875 -0.921875 -1.921875 -0.921875q-1.109375 0 -1.890625 0.90625q-0.78125 0.890625 -0.78125 2.671875zm14.449646 5.109375l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm3.551056 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.0312805 0 3.3125305 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.0781555 0.59375 -2.3750305 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625305 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.8281555 -0.9375 -2.0625305 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656952 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.297607 4.921875l0 -13.59375l1.671875 0l0 7.75l3.953125 -4.015625l2.15625 0l-3.765625 3.65625l4.140625 6.203125l-2.0625 0l-3.25 -5.03125l-1.171875 1.125l0 3.90625l-1.671875 0zm15.765625 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.9226074 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625z" fill-rule="nonzero"></path><path fill="#000000" d="m435.7392 464.89636q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm8.531952 0.8125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.625732 0.453125l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm5.860077 1.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.688232 0.328125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm9.719452 3.59375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm9.984375 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.1569214 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 567.8504l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 567.8504l179.99997 0l0 58.992126l-179.99997 0z" fill-rule="evenodd"></path><path fill="#000000" d="m402.72214 594.7704l0 -13.59375l9.84375 0l0 1.59375l-8.046875 0l0 4.171875l7.53125 0l0 1.59375l-7.53125 0l0 4.625l8.359375 0l0 1.609375l-10.15625 0zm12.193573 0l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm17.087677 0l-1.546875 0l0 -13.59375l1.65625 0l0 4.84375q1.0625 -1.328125 2.703125 -1.328125q0.90625 0 1.71875 0.375q0.8125 0.359375 1.328125 1.03125q0.53125 0.65625 0.828125 1.59375q0.296875 0.9375 0.296875 2.0q0 2.53125 -1.25 3.921875q-1.25 1.375 -3.0 1.375q-1.75 0 -2.734375 -1.453125l0 1.234375zm-0.015625 -5.0q0 1.765625 0.46875 2.5625q0.796875 1.28125 2.140625 1.28125q1.09375 0 1.890625 -0.9375q0.796875 -0.953125 0.796875 -2.84375q0 -1.921875 -0.765625 -2.84375q-0.765625 -0.921875 -1.84375 -0.921875q-1.09375 0 -1.890625 0.953125q-0.796875 0.953125 -0.796875 2.75zm15.594452 1.828125l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656952 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281982 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129181 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.078857 0.8125l1.609375 0.25q0.109375 0.75 0.578125 1.09375q0.609375 0.453125 1.6875 0.453125q1.171875 0 1.796875 -0.46875q0.625 -0.453125 0.859375 -1.28125q0.125 -0.515625 0.109375 -2.15625q-1.09375 1.296875 -2.71875 1.296875q-2.03125 0 -3.15625 -1.46875q-1.109375 -1.46875 -1.109375 -3.515625q0 -1.40625 0.515625 -2.59375q0.515625 -1.203125 1.484375 -1.84375q0.96875 -0.65625 2.265625 -0.65625q1.75 0 2.875 1.40625l0 -1.1875l1.546875 0l0 8.515625q0 2.3125 -0.46875 3.265625q-0.46875 0.96875 -1.484375 1.515625q-1.015625 0.5625 -2.5 0.5625q-1.765625 0 -2.859375 -0.796875q-1.078125 -0.796875 -1.03125 -2.390625zm1.375 -5.921875q0 1.953125 0.765625 2.84375q0.78125 0.890625 1.9375 0.890625q1.140625 0 1.921875 -0.890625q0.78125 -0.890625 0.78125 -2.78125q0 -1.8125 -0.8125 -2.71875q-0.796875 -0.921875 -1.921875 -0.921875q-1.109375 0 -1.890625 0.90625q-0.78125 0.890625 -0.78125 2.671875zm14.449646 5.109375l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm3.551056 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.0312805 0 3.3125305 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.0781555 0.59375 -2.3750305 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625305 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.8281555 -0.9375 -2.0625305 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm8.656952 0q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.297607 4.921875l0 -13.59375l1.671875 0l0 7.75l3.953125 -4.015625l2.15625 0l-3.765625 3.65625l4.140625 6.203125l-2.0625 0l-3.25 -5.03125l-1.171875 1.125l0 3.90625l-1.671875 0zm15.765625 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.9226074 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625z" fill-rule="nonzero"></path><path fill="#000000" d="m440.92703 620.7704q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm8.531982 0.8125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.625702 0.453125l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm5.8601074 1.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.688202 0.328125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm9.719482 3.59375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.6413574 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m129.09448 420.41733l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m129.09448 420.41733l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m147.40158 447.3373l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.837677 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913422 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm7.832321 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm4.129196 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm8.828842 4.875l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.191696 -11.6875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm10.566696 -3.609375l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm9.328125 2.390625q-0.9375 0.796875 -1.796875 1.125q-0.859375 0.3125 -1.84375 0.3125q-1.609375 0 -2.484375 -0.78125q-0.875 -0.796875 -0.875 -2.03125q0 -0.734375 0.328125 -1.328125q0.328125 -0.59375 0.859375 -0.953125q0.53125 -0.359375 1.203125 -0.546875q0.5 -0.140625 1.484375 -0.25q2.03125 -0.25 2.984375 -0.578125q0 -0.34375 0 -0.4375q0 -1.015625 -0.46875 -1.4375q-0.640625 -0.5625 -1.90625 -0.5625q-1.171875 0 -1.734375 0.40625q-0.5625 0.40625 -0.828125 1.46875l-1.640625 -0.234375q0.234375 -1.046875 0.734375 -1.6875q0.515625 -0.640625 1.46875 -0.984375q0.96875 -0.359375 2.25 -0.359375q1.265625 0 2.046875 0.296875q0.78125 0.296875 1.15625 0.75q0.375 0.453125 0.515625 1.140625q0.09375 0.421875 0.09375 1.53125l0 2.234375q0 2.328125 0.09375 2.953125q0.109375 0.609375 0.4375 1.171875l-1.75 0q-0.265625 -0.515625 -0.328125 -1.21875zm-0.140625 -3.71875q-0.90625 0.359375 -2.734375 0.625q-1.03125 0.140625 -1.453125 0.328125q-0.421875 0.1875 -0.65625 0.546875q-0.234375 0.359375 -0.234375 0.796875q0 0.671875 0.5 1.125q0.515625 0.4375 1.484375 0.4375q0.96875 0 1.71875 -0.421875q0.75 -0.4375 1.109375 -1.15625q0.265625 -0.578125 0.265625 -1.671875l0 -0.609375zm7.735092 3.4375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm18.746506 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141357 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm3.156952 5.609375l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m129.09448 576.29395l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m129.09448 576.29395l180.0 0l0 42.11023l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m171.36136 603.214l5.234375 -13.59375l1.9375 0l5.5625 13.59375l-2.046875 0l-1.59375 -4.125l-5.6875 0l-1.484375 4.125l-1.921875 0zm3.921875 -5.578125l4.609375 0l-1.40625 -3.78125q-0.65625 -1.703125 -0.96875 -2.8125q-0.265625 1.3125 -0.734375 2.59375l-1.5 4.0zm16.193573 5.578125l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.656967 4.921875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm9.281967 -6.765625l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm7.785446 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5426788 -10.1875l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.5041962 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm18.746521 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm8.853302 -4.0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 597.34644l-79.40158 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 597.34644l-73.40158 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m315.09186 595.6947l-4.538086 1.6517334l4.538086 1.6517334z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m388.49344 441.47244l-79.40158 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m388.49344 441.47244l-73.40158 0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m315.09186 439.8207l-4.538086 1.6517334l4.538086 1.6517334z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 462.52756l0 31.84253" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 462.52756l0 25.84253" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 488.3701l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m561.5 51.755962l31.99347 0l0 545.57477l-25.001343 0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m561.5 51.755962l31.99347 0l0 545.57477l-25.001343 0" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m478.49213 72.81108l0 339.1496" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m478.49213 72.81108l0 333.1496" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m476.8404 405.96066l1.6517334 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m590.00525 597.4094l-21.51184 -0.06298828" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m590.00525 597.4094l-15.511841 -0.045410156" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m574.4982 595.7123l-4.5429077 1.6384277l4.533264 1.6650391z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m109.09449 494.357l220.0 0l0 42.11023l-220.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m109.09449 494.357l220.0 0l0 42.11023l-220.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m126.81095 521.277l0 -13.59375l9.171867 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.7968674 0zm17.536598 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913422 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144821 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097946 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260712 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281967 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375717 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125717 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547592 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277054 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500717 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.637161 8.921875q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.572052 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.141327 1.984375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm0.9538574 1.609375l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm5.641327 4.0l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m219.09448 536.4672l0 39.811035" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m219.09448 536.4672l0 33.811035" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m217.44275 570.27826l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/hotword.svg b/tensorflow/contrib/lite/models/testdata/g3doc/hotword.svg
new file mode 100755
index 0000000000000000000000000000000000000000..36187aa32184ec60f3033625e660ab7364f1f48d
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/hotword.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m286.0 5.0l166.01575 0l0 41.984253l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m286.0 5.0l166.01575 0l0 41.984253l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m303.62738 31.919998l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.667694 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897827 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.3533325 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187653 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm5.016327 -1.921875q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m273.00787 70.23491l192.0 0l0 92.7874l-192.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m273.00787 70.23491l192.0 0l0 92.7874l-192.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m344.98923 92.77991l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.943573 4.375l-5.28125 -13.59375l1.953125 0l3.53125 9.875q0.4375 1.1875 0.71875 2.21875q0.3125 -1.109375 0.734375 -2.21875l3.671875 -9.875l1.84375 0l-5.328125 13.59375l-1.84375 0zm8.552948 0l0 -13.59375l4.6875 0q1.578125 0 2.421875 0.1875q1.15625 0.265625 1.984375 0.96875q1.078125 0.921875 1.609375 2.34375q0.53125 1.40625 0.53125 3.21875q0 1.546875 -0.359375 2.75q-0.359375 1.1875 -0.921875 1.984375q-0.5625 0.78125 -1.234375 1.234375q-0.671875 0.4375 -1.625 0.671875q-0.953125 0.234375 -2.1875 0.234375l-4.90625 0zm1.796875 -1.609375l2.90625 0q1.34375 0 2.109375 -0.25q0.765625 -0.25 1.21875 -0.703125q0.640625 -0.640625 1.0 -1.71875q0.359375 -1.078125 0.359375 -2.625q0 -2.125 -0.703125 -3.265625q-0.703125 -1.15625 -1.703125 -1.546875q-0.71875 -0.28125 -2.328125 -0.28125l-2.859375 0l0 10.390625zm11.769806 1.609375l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0z" fill-rule="nonzero"></path><path fill="#000000" d="m296.54065 119.15491l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm22.134552 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.110107 5.875l0 -9.859375l1.5 0l0 1.390625q0.453125 -0.71875 1.21875 -1.15625q0.78125 -0.453125 1.765625 -0.453125q1.09375 0 1.796875 0.453125q0.703125 0.453125 0.984375 1.28125q1.171875 -1.734375 3.046875 -1.734375q1.46875 0 2.25 0.8125q0.796875 0.8125 0.796875 2.5l0 6.765625l-1.671875 0l0 -6.203125q0 -1.0 -0.15625 -1.4375q-0.15625 -0.453125 -0.59375 -0.71875q-0.421875 -0.265625 -1.0 -0.265625q-1.03125 0 -1.71875 0.6875q-0.6875 0.6875 -0.6875 2.21875l0 5.71875l-1.671875 0l0 -6.40625q0 -1.109375 -0.40625 -1.65625q-0.40625 -0.5625 -1.34375 -0.5625q-0.703125 0 -1.3125 0.375q-0.59375 0.359375 -0.859375 1.078125q-0.265625 0.71875 -0.265625 2.0625l0 5.109375l-1.671875 0zm14.915802 -4.921875q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.266327 4.921875l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm6.150177 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm13.917694 -6.734375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm12.187653 3.875l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm5.016327 -1.921875q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375z" fill-rule="nonzero"></path><path fill="#000000" d="m326.25818 145.1549q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm7.915802 -4.0l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm5.0163574 -1.921875q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm13.199646 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm21.448914 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm7.891327 1.609375l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.750732 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078827 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m294.0 411.00525l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m294.0 411.00525l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m305.7563 437.92526l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.536621 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097931 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260864 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125702 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277039 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375z" fill-rule="nonzero"></path><path fill="#000000" d="m336.6339 463.92526q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.3376465 -5.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -1.953125l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm19.047607 -6.703125l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.059021 4.40625l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm15.277039 -11.8125l0 -1.609375l8.796875 0l0 1.296875q-1.296875 1.375 -2.578125 3.671875q-1.265625 2.296875 -1.96875 4.71875q-0.5 1.703125 -0.640625 3.734375l-1.71875 0q0.03125 -1.609375 0.625 -3.875q0.609375 -2.28125 1.734375 -4.390625q1.140625 -2.109375 2.40625 -3.546875l-6.65625 0zm11.813232 15.8125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m286.0 485.50656l166.01575 0l0 41.984222l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m286.0 485.50656l166.01575 0l0 41.984222l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m300.7158 505.80157q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.43332 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270691 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897888 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.3532715 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm6.953247 -7.9375l0 -1.609375l8.796875 0l0 1.296875q-1.296875 1.375 -2.578125 3.671875q-1.265625 2.296875 -1.96875 4.71875q-0.5 1.703125 -0.640625 3.734375l-1.71875 0q0.03125 -1.609375 0.625 -3.875q0.609375 -2.28125 1.734375 -4.390625q1.140625 -2.109375 2.40625 -3.546875l-6.65625 0zm11.813232 15.8125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m294.0 187.5l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m294.0 187.5l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m305.7563 214.42l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.536621 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097931 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260864 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125702 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277039 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375z" fill-rule="nonzero"></path><path fill="#000000" d="m321.0703 240.42q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm8.853302 -4.0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm7.891327 1.609375l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.750732 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.059021 4.40625l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm15.152039 -3.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.500732 -8.25l0 -1.609375l8.796875 0l0 1.296875q-1.296875 1.375 -2.578125 3.671875q-1.265625 2.296875 -1.96875 4.71875q-0.5 1.703125 -0.640625 3.734375l-1.71875 0q0.03125 -1.609375 0.625 -3.875q0.609375 -2.28125 1.734375 -4.390625q1.140625 -2.109375 2.40625 -3.546875l-6.65625 0zm12.828827 4.4375q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm10.235107 7.921875l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m294.0 262.00262l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m294.0 262.00262l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m305.7563 288.92264l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.536621 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097931 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260864 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125702 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277039 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375z" fill-rule="nonzero"></path><path fill="#000000" d="m326.25818 314.92264q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.556427 -7.5625l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm10.500702 -8.25l0 -1.609375l8.796875 0l0 1.296875q-1.296875 1.375 -2.578125 3.671875q-1.265625 2.296875 -1.96875 4.71875q-0.5 1.703125 -0.640625 3.734375l-1.71875 0q0.03125 -1.609375 0.625 -3.875q0.609375 -2.28125 1.734375 -4.390625q1.140625 -2.109375 2.40625 -3.546875l-6.65625 0zm12.828857 4.4375q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm13.215271 3.921875l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm23.933289 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -1.953125l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm19.047607 -6.703125l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078827 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m294.0 336.50394l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m294.0 336.50394l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m305.7563 363.42395l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.536621 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097931 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260864 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125702 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277039 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375z" fill-rule="nonzero"></path><path fill="#000000" d="m326.25818 389.42395q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.337677 -5.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944519 -1.953125l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm19.047607 -6.703125l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.059021 4.40625l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm23.933289 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm1.5944824 -1.953125l1.765625 -0.15625q0.1875 1.28125 0.890625 1.9375q0.71875 0.640625 1.71875 0.640625q1.203125 0 2.03125 -0.90625q0.84375 -0.90625 0.84375 -2.421875q0 -1.421875 -0.8125 -2.25q-0.796875 -0.828125 -2.09375 -0.828125q-0.796875 0 -1.453125 0.375q-0.640625 0.359375 -1.015625 0.953125l-1.578125 -0.203125l1.328125 -7.0l6.765625 0l0 1.609375l-5.4375 0l-0.734375 3.640625q1.234375 -0.84375 2.578125 -0.84375q1.78125 0 3.0 1.234375q1.234375 1.234375 1.234375 3.171875q0 1.84375 -1.078125 3.1875q-1.3125 1.65625 -3.578125 1.65625q-1.859375 0 -3.03125 -1.03125q-1.171875 -1.046875 -1.34375 -2.765625zm19.047607 -6.703125l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078827 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m369.00787 46.984253l0 23.244095" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m369.00787 46.984253l0 17.244095" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m367.35614 64.22835l1.6517334 4.5380936l1.6517334 -4.5380936z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m369.00787 163.02231l0 24.472443" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m369.00787 163.02231l0 18.472443" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m367.35614 181.49475l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m369.00787 246.50656l0 15.496063" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m369.00787 246.50656l0 9.496063" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m367.35614 256.00262l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m369.00787 320.99475l0 15.496063" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m369.00787 320.99475l0 9.496063" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m367.35614 330.4908l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m369.00787 395.49606l0 15.496063" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m369.00787 395.49606l0 9.496063" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m367.35614 404.99213l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m369.00787 470.0105l0 15.496063" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m369.00787 470.0105l0 9.496063" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m367.35614 479.50656l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/speakerid.svg b/tensorflow/contrib/lite/models/testdata/g3doc/speakerid.svg
new file mode 100755
index 0000000000000000000000000000000000000000..dbe4312c46408901c6290a7c4b4470378f403f1d
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/speakerid.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m287.0 39.0l166.01575 0l0 41.984253l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m287.0 39.0l166.01575 0l0 41.984253l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m304.62738 65.92l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.667694 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897827 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.3533325 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm9.406403 -3.5q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.672577 -2.78125q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm10.219452 10.703125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m295.0 111.0l150.01575 0l0 41.984253l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m295.0 111.0l150.01575 0l0 41.984253l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m307.1128 137.92l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844452 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880371 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm21.212646 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm5.1345825 -11.375q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.672577 -2.78125q0 -2.421875 0.5 -3.890625q0.5 -1.46875 1.46875 -2.265625q0.984375 -0.796875 2.46875 -0.796875q1.09375 0 1.921875 0.4375q0.828125 0.4375 1.359375 1.28125q0.546875 0.828125 0.84375 2.015625q0.3125 1.1875 0.3125 3.21875q0 2.390625 -0.5 3.859375q-0.484375 1.46875 -1.46875 2.28125q-0.96875 0.796875 -2.46875 0.796875q-1.96875 0 -3.078125 -1.40625q-1.359375 -1.703125 -1.359375 -5.53125zm1.71875 0q0 3.34375 0.78125 4.453125q0.796875 1.109375 1.9375 1.109375q1.15625 0 1.9375 -1.109375q0.78125 -1.125 0.78125 -4.453125q0 -3.359375 -0.78125 -4.46875q-0.78125 -1.109375 -1.953125 -1.109375q-1.15625 0 -1.828125 0.984375q-0.875 1.234375 -0.875 4.59375zm8.016327 6.703125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m295.0 183.0l150.01575 0l0 41.984253l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m295.0 183.0l150.01575 0l0 41.984253l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m307.1128 209.92l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844452 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880371 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697021 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.2283325 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.375702 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m295.0 255.0l150.01575 0l0 41.984253l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m295.0 255.0l150.01575 0l0 41.984253l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m307.1128 281.91998l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844452 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880371 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.9313965 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.199646 7.59375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.2283325 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.375702 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m295.0 327.0l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m295.0 327.0l150.01575 0l0 58.992126l-150.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m306.7563 353.91998l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.536621 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.097931 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.9260864 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125702 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547607 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277039 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500732 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375z" fill-rule="nonzero"></path><path fill="#000000" d="m342.8172 379.91998q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.578827 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m282.0 416.00787l177.19684 0l0 41.984253l-177.19684 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.0 416.00787l177.19684 0l0 41.984253l-177.19684 0z" fill-rule="evenodd"></path><path fill="#000000" d="m297.11847 436.3029q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.43332 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270691 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm15.500122 -6.390625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.578827 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m370.00787 80.98425l0 30.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m370.00787 80.98425l0 24.015747" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m368.35614 105.0l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m370.00787 152.98425l0 30.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m370.00787 152.98425l0 24.015747" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m368.35614 177.0l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m370.00787 224.98425l0 30.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m370.00787 224.98425l0 24.015747" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m368.35614 249.0l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m370.00787 296.98425l0 30.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m370.00787 296.98425l0 24.015747" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m368.35614 321.0l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m370.00787 385.99213l0.5984192 30.015747" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m370.00787 385.99213l0.47885132 24.016937" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m368.8353 410.042l1.7418518 4.5042725l1.5609436 -4.5701294z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/models/testdata/g3doc/tts.svg b/tensorflow/contrib/lite/models/testdata/g3doc/tts.svg
new file mode 100755
index 0000000000000000000000000000000000000000..9664b78f1603447746ef92c1245931f471e66998
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/g3doc/tts.svg
@@ -0,0 +1,4 @@
+<?xml version="1.0" standalone="yes"?>
+
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"></path></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m264.0 14.7l166.01575 0l0 41.984253l-166.01575 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m264.0 14.7l166.01575 0l0 41.984253l-166.01575 0z" fill-rule="evenodd"></path><path fill="#000000" d="m276.43954 41.62l0 -13.59375l1.8125 0l0 13.59375l-1.8125 0zm4.6676636 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375732 3.78125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313202 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.254181 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm6.8439026 0.28125l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.375702 0l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm15.719482 3.59375l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.578827 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m264.0 238.01575l168.0 0l0 41.984253l-168.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m264.0 238.01575l168.0 0l0 41.984253l-168.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m279.91705 264.93576l0 -13.593765l1.796875 0l0 11.98439l6.703125 0l0 1.609375l-8.5 0zm9.844452 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75001526 -0.46875 -1.6875153q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46876526 2.703125 0.96876526q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.000015l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.000015l-1.796875 0zm7.880371 0l0 -13.593765l2.71875 0l3.21875 9.625015q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.45314l2.421875 0l0 13.593765l-1.734375 0l0 -11.39064l-3.953125 11.39064l-1.625 0l-3.9375 -11.57814l0 11.57814l-1.734375 0zm21.212677 0l-1.671875 0l0 -10.64064q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.56251526 -1.765625 0.85939026l0 -1.6250153q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.656265zm12.918396 4.0q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.2343903q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.6718903q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm8.853302 -4.0l-1.671875 0l0 -10.64064q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.56251526 -1.765625 0.85939026l0 -1.6250153q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.656265zm12.860077 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625153 0.78125 -2.0156403q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.6093903q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm4.1726074 -5.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71876526 -0.5 -1.7031403q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.6562653q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.8281403q0 0.96875 0.609375 1.5781403q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.60939026 0.625 -1.4843903q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.2812653q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.031952 3.921875l3.59375 -5.125l-3.328125 -4.7343903l2.09375 0l1.515625 2.3125153q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.3281403l1.984375 0l-3.390625 4.6406403l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.26564l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.1406403q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.9531403 1.453125 -5.7343903q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.8593903q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.812515l1.359375 0l0 8.812515l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.1406403l-4.25 6.1406403l4.25 0zm6.578827 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.3906403 -0.890625 -2.6718903q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.0156403 0.71875 4.2343903q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m264.0 296.0l168.0 0l0 41.984253l-168.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m264.0 296.0l168.0 0l0 41.984253l-168.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m285.10492 322.91998l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844452 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880371 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm23.697021 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm10.434021 5.609375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.2283325 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.375702 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m264.0 358.1l168.0 0l0 41.984253l-168.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m264.0 358.1l168.0 0l0 41.984253l-168.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m285.10492 385.02l0 -13.59375l1.796875 0l0 11.984375l6.703125 0l0 1.609375l-8.5 0zm9.844452 -4.375l1.6875 -0.140625q0.125 1.015625 0.5625 1.671875q0.4375 0.65625 1.359375 1.0625q0.9375 0.40625 2.09375 0.40625q1.03125 0 1.8125 -0.3125q0.796875 -0.3125 1.1875 -0.84375q0.390625 -0.53125 0.390625 -1.15625q0 -0.640625 -0.375 -1.109375q-0.375 -0.484375 -1.234375 -0.8125q-0.546875 -0.21875 -2.421875 -0.65625q-1.875 -0.453125 -2.625 -0.859375q-0.96875 -0.515625 -1.453125 -1.265625q-0.46875 -0.75 -0.46875 -1.6875q0 -1.03125 0.578125 -1.921875q0.59375 -0.90625 1.703125 -1.359375q1.125 -0.46875 2.5 -0.46875q1.515625 0 2.671875 0.484375q1.15625 0.484375 1.765625 1.4375q0.625 0.9375 0.671875 2.140625l-1.71875 0.125q-0.140625 -1.28125 -0.953125 -1.9375q-0.796875 -0.671875 -2.359375 -0.671875q-1.625 0 -2.375 0.609375q-0.75 0.59375 -0.75 1.4375q0 0.734375 0.53125 1.203125q0.515625 0.46875 2.703125 0.96875q2.203125 0.5 3.015625 0.875q1.1875 0.546875 1.75 1.390625q0.578125 0.828125 0.578125 1.921875q0 1.09375 -0.625 2.0625q-0.625 0.953125 -1.796875 1.484375q-1.15625 0.53125 -2.609375 0.53125q-1.84375 0 -3.09375 -0.53125q-1.25 -0.546875 -1.96875 -1.625q-0.703125 -1.078125 -0.734375 -2.453125zm16.506073 4.375l0 -12.0l-4.46875 0l0 -1.59375l10.765625 0l0 1.59375l-4.5 0l0 12.0l-1.796875 0zm7.880371 0l0 -13.59375l2.71875 0l3.21875 9.625q0.4375 1.34375 0.640625 2.015625q0.234375 -0.75 0.734375 -2.1875l3.25 -9.453125l2.421875 0l0 13.59375l-1.734375 0l0 -11.390625l-3.953125 11.390625l-1.625 0l-3.9375 -11.578125l0 11.578125l-1.734375 0zm14.9313965 -3.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm19.199646 7.59375q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.2283325 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.375702 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm18.640625 -10.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875732 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm6.5788574 8.78125l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m264.0 78.7l168.0 0l0 58.992126l-168.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m264.0 78.7l168.0 0l0 58.992126l-168.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m279.56058 105.619995l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.53659 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm15.094482 4.921875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625z" fill-rule="nonzero"></path><path fill="#000000" d="m310.4336 131.62q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.5720825 -7.59375l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm10.375702 0l1.671875 -0.21875q0.28125 1.421875 0.96875 2.046875q0.703125 0.625 1.6875 0.625q1.1875 0 2.0 -0.8125q0.8125 -0.828125 0.8125 -2.03125q0 -1.140625 -0.765625 -1.890625q-0.75 -0.75 -1.90625 -0.75q-0.46875 0 -1.171875 0.1875l0.1875 -1.46875q0.15625 0.015625 0.265625 0.015625q1.0625 0 1.90625 -0.546875q0.859375 -0.5625 0.859375 -1.71875q0 -0.921875 -0.625 -1.515625q-0.609375 -0.609375 -1.59375 -0.609375q-0.96875 0 -1.625 0.609375q-0.640625 0.609375 -0.828125 1.84375l-1.671875 -0.296875q0.296875 -1.6875 1.375 -2.609375q1.09375 -0.921875 2.71875 -0.921875q1.109375 0 2.046875 0.484375q0.9375 0.46875 1.421875 1.296875q0.5 0.828125 0.5 1.75q0 0.890625 -0.46875 1.609375q-0.46875 0.71875 -1.40625 1.15625q1.21875 0.265625 1.875 1.15625q0.671875 0.875 0.671875 2.1875q0 1.78125 -1.296875 3.015625q-1.296875 1.234375 -3.28125 1.234375q-1.796875 0 -2.984375 -1.0625q-1.171875 -1.0625 -1.34375 -2.765625zm15.719452 3.59375l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm4.172577 -5.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm10.235077 7.921875l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m257.8 488.0l180.0 0l0 46.992126l-180.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m257.8 488.0l180.0 0l0 46.992126l-180.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m269.1322 508.29498q0 -3.390625 1.8125 -5.296875q1.828125 -1.921875 4.703125 -1.921875q1.875 0 3.390625 0.90625q1.515625 0.890625 2.296875 2.5q0.796875 1.609375 0.796875 3.65625q0 2.0625 -0.84375 3.703125q-0.828125 1.625 -2.359375 2.46875q-1.53125 0.84375 -3.296875 0.84375q-1.921875 0 -3.4375 -0.921875q-1.5 -0.9375 -2.28125 -2.53125q-0.78125 -1.609375 -0.78125 -3.40625zm1.859375 0.03125q0 2.453125 1.3125 3.875q1.328125 1.40625 3.3125 1.40625q2.03125 0 3.34375 -1.421875q1.3125 -1.4375 1.3125 -4.0625q0 -1.65625 -0.5625 -2.890625q-0.546875 -1.234375 -1.640625 -1.921875q-1.078125 -0.6875 -2.421875 -0.6875q-1.90625 0 -3.28125 1.3125q-1.375 1.3125 -1.375 4.390625zm19.433289 6.59375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.5788574 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm1.5270386 5.28125l0 -13.640625l1.53125 0l0 1.28125q0.53125 -0.75 1.203125 -1.125q0.6875 -0.375 1.640625 -0.375q1.265625 0 2.234375 0.65625q0.96875 0.640625 1.453125 1.828125q0.5 1.1875 0.5 2.59375q0 1.515625 -0.546875 2.734375q-0.546875 1.203125 -1.578125 1.84375q-1.03125 0.640625 -2.171875 0.640625q-0.84375 0 -1.515625 -0.34375q-0.65625 -0.359375 -1.078125 -0.890625l0 4.796875l-1.671875 0zm1.515625 -8.65625q0 1.90625 0.765625 2.8125q0.78125 0.90625 1.875 0.90625q1.109375 0 1.890625 -0.9375q0.796875 -0.9375 0.796875 -2.921875q0 -1.875 -0.78125 -2.8125q-0.765625 -0.9375 -1.84375 -0.9375q-1.0625 0 -1.890625 1.0q-0.8125 1.0 -0.8125 2.890625zm15.313232 4.875l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm7.578827 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm2.353302 -6.9375l1.65625 -0.265625q0.140625 1.0 0.765625 1.53125q0.640625 0.515625 1.78125 0.515625q1.15625 0 1.703125 -0.46875q0.5625 -0.46875 0.5625 -1.09375q0 -0.5625 -0.484375 -0.890625q-0.34375 -0.21875 -1.703125 -0.5625q-1.84375 -0.46875 -2.5625 -0.796875q-0.703125 -0.34375 -1.078125 -0.9375q-0.359375 -0.609375 -0.359375 -1.328125q0 -0.65625 0.296875 -1.21875q0.3125 -0.5625 0.828125 -0.9375q0.390625 -0.28125 1.0625 -0.484375q0.671875 -0.203125 1.4375 -0.203125q1.171875 0 2.046875 0.34375q0.875 0.328125 1.28125 0.90625q0.421875 0.5625 0.578125 1.515625l-1.625 0.21875q-0.109375 -0.75 -0.65625 -1.171875q-0.53125 -0.4375 -1.5 -0.4375q-1.15625 0 -1.640625 0.390625q-0.484375 0.375 -0.484375 0.875q0 0.328125 0.203125 0.59375q0.203125 0.265625 0.640625 0.4375q0.25 0.09375 1.46875 0.4375q1.765625 0.46875 2.46875 0.765625q0.703125 0.296875 1.09375 0.875q0.40625 0.578125 0.40625 1.4375q0 0.828125 -0.484375 1.578125q-0.484375 0.734375 -1.40625 1.140625q-0.921875 0.390625 -2.078125 0.390625q-1.921875 0 -2.9375 -0.796875q-1.0 -0.796875 -1.28125 -2.359375zm10.015625 -8.75l0 -1.90625l1.671875 0l0 1.90625l-1.671875 0zm0 11.6875l0 -9.859375l1.671875 0l0 9.859375l-1.671875 0zm3.2542114 0l0 -1.359375l6.265625 -7.1875q-1.0625 0.046875 -1.875 0.046875l-4.015625 0l0 -1.359375l8.046875 0l0 1.109375l-5.34375 6.25l-1.015625 1.140625q1.109375 -0.078125 2.09375 -0.078125l4.5625 0l0 1.4375l-8.71875 0zm16.953125 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm23.074646 -2.125l-8.96875 0l0 -1.5625l8.96875 0l0 1.5625zm0 4.125l-8.96875 0l0 -1.546875l8.96875 0l0 1.546875zm13.125122 3.875l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm4.3444824 -3.140625l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm11.953827 -1.125l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078857 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m348.0 280.0l0 16.0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m348.0 280.0l0 10.0" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.34827 290.0l1.6517334 4.538086l1.6517334 -4.538086z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m348.0 337.98425l0 20.125977" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m348.0 337.98425l0 14.125977" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.34827 352.11023l1.6517334 4.5381165l1.6517334 -4.5381165z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m347.00787 56.684254l1.0078735 22.015743" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m347.00787 56.68425l0.7334595 16.022026" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.09134 72.781815l1.857544 4.4578094l1.4424744 -4.6088867z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m348.0 400.08426l0.31497192 21.921265" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m348.0 400.08423l0.22875977 15.921875" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.5772 416.02985l1.7167358 4.5138855l1.5863647 -4.5613403z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m264.0 158.19606l168.0 0l0 58.992126l-168.0 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m264.0 158.19606l168.0 0l0 58.992126l-168.0 0z" fill-rule="evenodd"></path><path fill="#000000" d="m279.56058 185.11606l0 -13.59375l9.171875 0l0 1.59375l-7.375 0l0 4.21875l6.375 0l0 1.609375l-6.375 0l0 6.171875l-1.796875 0zm17.53659 0l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.8913574 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.144806 0l0 -13.59375l1.671875 0l0 13.59375l-1.671875 0zm4.0979614 3.796875l-0.171875 -1.5625q0.546875 0.140625 0.953125 0.140625q0.546875 0 0.875 -0.1875q0.34375 -0.1875 0.5625 -0.515625q0.15625 -0.25 0.5 -1.25q0.046875 -0.140625 0.15625 -0.40625l-3.734375 -9.875l1.796875 0l2.046875 5.71875q0.40625 1.078125 0.71875 2.28125q0.28125 -1.15625 0.6875 -2.25l2.09375 -5.75l1.671875 0l-3.75 10.03125q-0.59375 1.625 -0.9375 2.234375q-0.4375 0.828125 -1.015625 1.203125q-0.578125 0.390625 -1.375 0.390625q-0.484375 0 -1.078125 -0.203125zm19.328125 -8.5625l1.796875 0.453125q-0.5625 2.21875 -2.03125 3.390625q-1.46875 1.15625 -3.59375 1.15625q-2.203125 0 -3.578125 -0.890625q-1.375 -0.90625 -2.09375 -2.59375q-0.71875 -1.703125 -0.71875 -3.65625q0 -2.125 0.796875 -3.703125q0.8125 -1.578125 2.3125 -2.390625q1.5 -0.828125 3.296875 -0.828125q2.046875 0 3.4375 1.046875q1.390625 1.03125 1.9375 2.90625l-1.765625 0.421875q-0.46875 -1.484375 -1.375 -2.15625q-0.90625 -0.6875 -2.265625 -0.6875q-1.5625 0 -2.625 0.75q-1.046875 0.75 -1.484375 2.03125q-0.421875 1.265625 -0.421875 2.609375q0 1.734375 0.5 3.03125q0.515625 1.28125 1.578125 1.921875q1.078125 0.640625 2.3125 0.640625q1.515625 0 2.5625 -0.859375q1.046875 -0.875 1.421875 -2.59375zm2.926056 -0.15625q0 -2.734375 1.53125 -4.0625q1.265625 -1.09375 3.09375 -1.09375q2.03125 0 3.3125 1.34375q1.296875 1.328125 1.296875 3.671875q0 1.90625 -0.578125 3.0q-0.5625 1.078125 -1.65625 1.6875q-1.078125 0.59375 -2.375 0.59375q-2.0625 0 -3.34375 -1.328125q-1.28125 -1.328125 -1.28125 -3.8125zm1.71875 0q0 1.890625 0.828125 2.828125q0.828125 0.9375 2.078125 0.9375q1.25 0 2.0625 -0.9375q0.828125 -0.953125 0.828125 -2.890625q0 -1.828125 -0.828125 -2.765625q-0.828125 -0.9375 -2.0625 -0.9375q-1.25 0 -2.078125 0.9375q-0.828125 0.9375 -0.828125 2.828125zm9.281982 4.921875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm10.375702 0l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm17.125732 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm6.546875 2.109375l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm8.277069 -1.671875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.500702 5.875l0 -1.25q-0.9375 1.46875 -2.75 1.46875q-1.171875 0 -2.171875 -0.640625q-0.984375 -0.65625 -1.53125 -1.8125q-0.53125 -1.171875 -0.53125 -2.6875q0 -1.46875 0.484375 -2.671875q0.5 -1.203125 1.46875 -1.84375q0.984375 -0.640625 2.203125 -0.640625q0.890625 0 1.578125 0.375q0.703125 0.375 1.140625 0.984375l0 -4.875l1.65625 0l0 13.59375l-1.546875 0zm-5.28125 -4.921875q0 1.890625 0.796875 2.828125q0.8125 0.9375 1.890625 0.9375q1.09375 0 1.859375 -0.890625q0.765625 -0.890625 0.765625 -2.734375q0 -2.015625 -0.78125 -2.953125q-0.78125 -0.953125 -1.921875 -0.953125q-1.109375 0 -1.859375 0.90625q-0.75 0.90625 -0.75 2.859375zm17.578857 3.3125l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0z" fill-rule="nonzero"></path><path fill="#000000" d="m310.4336 211.11606q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm8.8533325 -4.0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860077 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm4.172577 -5.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm8.031982 3.921875l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm12.860107 -1.609375l0 1.609375l-8.984375 0q-0.015625 -0.609375 0.1875 -1.15625q0.34375 -0.921875 1.09375 -1.8125q0.765625 -0.890625 2.1875 -2.0625q2.21875 -1.8125 3.0 -2.875q0.78125 -1.0625 0.78125 -2.015625q0 -0.984375 -0.71875 -1.671875q-0.703125 -0.6875 -1.84375 -0.6875q-1.203125 0 -1.9375 0.734375q-0.71875 0.71875 -0.71875 2.0l-1.71875 -0.171875q0.171875 -1.921875 1.328125 -2.921875q1.15625 -1.015625 3.09375 -1.015625q1.953125 0 3.09375 1.09375q1.140625 1.078125 1.140625 2.6875q0 0.8125 -0.34375 1.609375q-0.328125 0.78125 -1.109375 1.65625q-0.765625 0.859375 -2.5625 2.390625q-1.5 1.265625 -1.9375 1.71875q-0.421875 0.4375 -0.703125 0.890625l6.671875 0zm4.172577 -5.765625q-1.046875 -0.375 -1.546875 -1.078125q-0.5 -0.71875 -0.5 -1.703125q0 -1.484375 1.0625 -2.484375q1.078125 -1.015625 2.84375 -1.015625q1.78125 0 2.859375 1.03125q1.09375 1.03125 1.09375 2.515625q0 0.953125 -0.5 1.65625q-0.484375 0.703125 -1.5 1.078125q1.25 0.40625 1.90625 1.3125q0.65625 0.90625 0.65625 2.171875q0 1.75 -1.234375 2.9375q-1.234375 1.1875 -3.25 1.1875q-2.015625 0 -3.25 -1.1875q-1.234375 -1.203125 -1.234375 -2.984375q0 -1.328125 0.671875 -2.21875q0.671875 -0.890625 1.921875 -1.21875zm-0.328125 -2.828125q0 0.96875 0.609375 1.578125q0.625 0.609375 1.625 0.609375q0.953125 0 1.5625 -0.609375q0.625 -0.609375 0.625 -1.484375q0 -0.921875 -0.640625 -1.546875q-0.625 -0.625 -1.578125 -0.625q-0.953125 0 -1.578125 0.609375q-0.625 0.609375 -0.625 1.46875zm-0.546875 6.28125q0 0.71875 0.328125 1.390625q0.34375 0.65625 1.015625 1.03125q0.671875 0.359375 1.4375 0.359375q1.203125 0 1.984375 -0.765625q0.78125 -0.78125 0.78125 -1.96875q0 -1.203125 -0.8125 -1.984375q-0.796875 -0.796875 -2.0 -0.796875q-1.1875 0 -1.96875 0.78125q-0.765625 0.78125 -0.765625 1.953125zm10.235077 7.921875l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m348.0 137.69212l0 20.503937" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m348.0 137.69212l0 14.503937" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.34827 152.19606l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m348.0 217.18819l0 20.818893" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m348.0 217.1882l0 14.818893" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.34827 232.0071l1.6517334 4.538101l1.6517334 -4.538101z" fill-rule="evenodd"></path><path fill="#000000" fill-opacity="0.0" d="m253.3 422.01575l190.01573 0l0 41.984253l-190.01573 0z" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="2.0" stroke-linejoin="round" stroke-linecap="butt" d="m253.3 422.01575l190.01573 0l0 41.984253l-190.01573 0z" fill-rule="evenodd"></path><path fill="#000000" d="m269.44388 448.93576l0 -13.59375l6.03125 0q1.8125 0 2.75 0.359375q0.953125 0.359375 1.515625 1.296875q0.5625 0.921875 0.5625 2.046875q0 1.453125 -0.9375 2.453125q-0.921875 0.984375 -2.890625 1.25q0.71875 0.34375 1.09375 0.671875q0.78125 0.734375 1.484375 1.8125l2.375 3.703125l-2.265625 0l-1.796875 -2.828125q-0.796875 -1.21875 -1.3125 -1.875q-0.5 -0.65625 -0.90625 -0.90625q-0.40625 -0.265625 -0.8125 -0.359375q-0.3125 -0.078125 -1.015625 -0.078125l-2.078125 0l0 6.046875l-1.796875 0zm1.796875 -7.59375l3.859375 0q1.234375 0 1.921875 -0.25q0.703125 -0.265625 1.0625 -0.828125q0.375 -0.5625 0.375 -1.21875q0 -0.96875 -0.703125 -1.578125q-0.703125 -0.625 -2.21875 -0.625l-4.296875 0l0 4.5zm18.176086 4.421875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm15.547577 2.265625l1.640625 0.21875q-0.265625 1.6875 -1.375 2.65625q-1.109375 0.953125 -2.734375 0.953125q-2.015625 0 -3.25 -1.3125q-1.21875 -1.328125 -1.21875 -3.796875q0 -1.59375 0.515625 -2.78125q0.53125 -1.203125 1.609375 -1.796875q1.09375 -0.609375 2.359375 -0.609375q1.609375 0 2.625 0.8125q1.015625 0.8125 1.3125 2.3125l-1.625 0.25q-0.234375 -1.0 -0.828125 -1.5q-0.59375 -0.5 -1.421875 -0.5q-1.265625 0 -2.0625 0.90625q-0.78125 0.90625 -0.78125 2.859375q0 1.984375 0.765625 2.890625q0.765625 0.890625 1.984375 0.890625q0.984375 0 1.640625 -0.59375q0.65625 -0.609375 0.84375 -1.859375zm9.34375 3.609375l0 -1.453125q-1.140625 1.671875 -3.125 1.671875q-0.859375 0 -1.625 -0.328125q-0.75 -0.34375 -1.125 -0.84375q-0.359375 -0.5 -0.515625 -1.234375q-0.09375 -0.5 -0.09375 -1.5625l0 -6.109375l1.671875 0l0 5.46875q0 1.3125 0.09375 1.765625q0.15625 0.65625 0.671875 1.03125q0.515625 0.375 1.265625 0.375q0.75 0 1.40625 -0.375q0.65625 -0.390625 0.921875 -1.046875q0.28125 -0.671875 0.28125 -1.9375l0 -5.28125l1.671875 0l0 9.859375l-1.5 0zm3.9069824 0l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm6.212677 0l0 -9.859375l1.5 0l0 1.5q0.578125 -1.046875 1.0625 -1.375q0.484375 -0.34375 1.078125 -0.34375q0.84375 0 1.71875 0.546875l-0.578125 1.546875q-0.609375 -0.359375 -1.234375 -0.359375q-0.546875 0 -0.984375 0.328125q-0.421875 0.328125 -0.609375 0.90625q-0.28125 0.890625 -0.28125 1.953125l0 5.15625l-1.671875 0zm12.978302 -3.171875l1.71875 0.21875q-0.40625 1.5 -1.515625 2.34375q-1.09375 0.828125 -2.8125 0.828125q-2.15625 0 -3.421875 -1.328125q-1.265625 -1.328125 -1.265625 -3.734375q0 -2.484375 1.265625 -3.859375q1.28125 -1.375 3.328125 -1.375q1.984375 0 3.234375 1.34375q1.25 1.34375 1.25 3.796875q0 0.140625 -0.015625 0.4375l-7.34375 0q0.09375 1.625 0.921875 2.484375q0.828125 0.859375 2.0625 0.859375q0.90625 0 1.546875 -0.46875q0.65625 -0.484375 1.046875 -1.546875zm-5.484375 -2.703125l5.5 0q-0.109375 -1.234375 -0.625 -1.859375q-0.796875 -0.96875 -2.078125 -0.96875q-1.140625 0 -1.9375 0.78125q-0.78125 0.765625 -0.859375 2.046875zm9.110077 5.875l0 -9.859375l1.5 0l0 1.40625q1.09375 -1.625 3.140625 -1.625q0.890625 0 1.640625 0.328125q0.75 0.3125 1.109375 0.84375q0.375 0.515625 0.53125 1.21875q0.09375 0.46875 0.09375 1.625l0 6.0625l-1.671875 0l0 -6.0q0 -1.015625 -0.203125 -1.515625q-0.1875 -0.515625 -0.6875 -0.8125q-0.5 -0.296875 -1.171875 -0.296875q-1.0625 0 -1.84375 0.671875q-0.765625 0.671875 -0.765625 2.578125l0 5.375l-1.671875 0zm14.031982 -1.5l0.234375 1.484375q-0.703125 0.140625 -1.265625 0.140625q-0.90625 0 -1.40625 -0.28125q-0.5 -0.296875 -0.703125 -0.75q-0.203125 -0.46875 -0.203125 -1.984375l0 -5.65625l-1.234375 0l0 -1.3125l1.234375 0l0 -2.4375l1.65625 -1.0l0 3.4375l1.6875 0l0 1.3125l-1.6875 0l0 5.75q0 0.71875 0.078125 0.921875q0.09375 0.203125 0.296875 0.328125q0.203125 0.125 0.578125 0.125q0.265625 0 0.734375 -0.078125zm9.897858 5.5q-1.375 -1.75 -2.328125 -4.078125q-0.953125 -2.34375 -0.953125 -4.84375q0 -2.21875 0.703125 -4.234375q0.84375 -2.34375 2.578125 -4.671875l1.203125 0q-1.125 1.921875 -1.484375 2.75q-0.5625 1.28125 -0.890625 2.671875q-0.40625 1.734375 -0.40625 3.484375q0 4.46875 2.78125 8.921875l-1.203125 0zm11.228302 -14.265625l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm13.875702 4.40625l0 -3.25l-5.90625 0l0 -1.53125l6.21875 -8.8125l1.359375 0l0 8.8125l1.84375 0l0 1.53125l-1.84375 0l0 3.25l-1.671875 0zm0 -4.78125l0 -6.140625l-4.25 6.140625l4.25 0zm4.3757324 4.78125l3.59375 -5.125l-3.328125 -4.734375l2.09375 0l1.515625 2.3125q0.421875 0.65625 0.671875 1.109375q0.421875 -0.609375 0.765625 -1.09375l1.65625 -2.328125l1.984375 0l-3.390625 4.640625l3.65625 5.21875l-2.046875 0l-2.03125 -3.0625l-0.53125 -0.828125l-2.59375 3.890625l-2.015625 0zm16.265625 0l-1.671875 0l0 -10.640625q-0.59375 0.578125 -1.578125 1.15625q-0.984375 0.5625 -1.765625 0.859375l0 -1.625q1.40625 -0.65625 2.453125 -1.59375q1.046875 -0.9375 1.484375 -1.8125l1.078125 0l0 13.65625zm4.344452 -3.140625l1.59375 -0.15625q0.203125 1.140625 0.78125 1.65625q0.578125 0.5 1.484375 0.5q0.765625 0 1.34375 -0.34375q0.578125 -0.359375 0.953125 -0.953125q0.375 -0.59375 0.625 -1.59375q0.25 -1.0 0.25 -2.03125q0 -0.109375 -0.015625 -0.34375q-0.5 0.796875 -1.375 1.296875q-0.859375 0.5 -1.875 0.5q-1.6875 0 -2.859375 -1.21875q-1.171875 -1.234375 -1.171875 -3.234375q0 -2.078125 1.21875 -3.328125q1.234375 -1.265625 3.0625 -1.265625q1.328125 0 2.421875 0.71875q1.109375 0.703125 1.671875 2.03125q0.578125 1.328125 0.578125 3.828125q0 2.609375 -0.578125 4.15625q-0.5625 1.546875 -1.6875 2.359375q-1.109375 0.796875 -2.609375 0.796875q-1.59375 0 -2.609375 -0.890625q-1.0 -0.890625 -1.203125 -2.484375zm6.828125 -6.0q0 -1.4375 -0.765625 -2.28125q-0.765625 -0.859375 -1.84375 -0.859375q-1.109375 0 -1.9375 0.921875q-0.828125 0.90625 -0.828125 2.34375q0 1.3125 0.78125 2.125q0.796875 0.796875 1.9375 0.796875q1.171875 0 1.90625 -0.796875q0.75 -0.8125 0.75 -2.25zm11.953857 -1.125l-1.65625 0.125q-0.21875 -0.984375 -0.640625 -1.421875q-0.671875 -0.71875 -1.65625 -0.71875q-0.8125 0 -1.40625 0.4375q-0.796875 0.578125 -1.25 1.6875q-0.453125 1.09375 -0.46875 3.140625q0.609375 -0.921875 1.46875 -1.359375q0.875 -0.453125 1.828125 -0.453125q1.671875 0 2.84375 1.234375q1.171875 1.234375 1.171875 3.171875q0 1.28125 -0.546875 2.390625q-0.546875 1.09375 -1.515625 1.6875q-0.96875 0.578125 -2.1875 0.578125q-2.09375 0 -3.40625 -1.53125q-1.3125 -1.546875 -1.3125 -5.0625q0 -3.953125 1.453125 -5.734375q1.265625 -1.5625 3.421875 -1.5625q1.609375 0 2.625 0.90625q1.03125 0.890625 1.234375 2.484375zm-6.8125 5.859375q0 0.859375 0.359375 1.65625q0.375 0.78125 1.03125 1.203125q0.65625 0.40625 1.375 0.40625q1.0625 0 1.8125 -0.84375q0.765625 -0.859375 0.765625 -2.328125q0 -1.40625 -0.75 -2.21875q-0.75 -0.8125 -1.890625 -0.8125q-1.125 0 -1.921875 0.8125q-0.78125 0.8125 -0.78125 2.125zm10.078827 8.40625l-1.1875 0q2.765625 -4.453125 2.765625 -8.921875q0 -1.734375 -0.390625 -3.453125q-0.328125 -1.390625 -0.890625 -2.671875q-0.359375 -0.84375 -1.484375 -2.78125l1.1875 0q1.75 2.328125 2.578125 4.671875q0.71875 2.015625 0.71875 4.234375q0 2.5 -0.96875 4.84375q-0.953125 2.328125 -2.328125 4.078125z" fill-rule="nonzero"></path><path fill="#000000" fill-opacity="0.0" d="m348.30786 464.0l-0.50393677 24.0" fill-rule="evenodd"></path><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m348.30786 464.0l-0.37799072 18.001312" fill-rule="evenodd"></path><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m346.2785 481.96664l1.5561218 4.5717773l1.7466431 -4.502411z" fill-rule="evenodd"></path></g></svg>
+
diff --git a/tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv b/tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv
new file mode 100644
index 0000000000000000000000000000000000000000..dfdc783106098ee2daade25830af384939501ac0
--- /dev/null
+++ b/tensorflow/contrib/lite/models/testdata/smartreply_samples.tsv
@@ -0,0 +1,50 @@
+any chance ur free tonight	Maybe not
+any updates?	No update yet
+anything i can do to help?	No, but thanks	No, but thank you	No, but thanks for asking
+be safe.	I will be	Will do my best	Thanks, I will
+congratulations	Thanks thanks	Congratulations
+cool, let me know when you have time	Cool	Yes very cool	Yeah, cool
+drive safe	Thank you, I will	Home now	I will thanks
+hang in there, you'll be okay	Doing my best	Of course we will
+happy birthday!	Hey, thanks
+happy new year!	Wish you the same	Thanks and same to you
+have a safe flight	Thanks, love you too	Safe travels
+hey	What is up?	How it going?	Can I help you?
+hey, got a sec?	What is up?	How it going?	Can I help you?
+how are you doing?	Great and you?	I am doing great
+how are you feeling	Feeling okay	A little better	Much much better
+how was your weekend?	It was real good
+how you doing	Okay and you
+hugs.	So sweet	Thanks sweetie	Take care of yourself
+i'm bored	Sorry to hear that	Join the club	No you are not
+i'm planning on coming next week. let me know if that works.	Works	Perfect, thanks
+i'm sick	Sorry to hear that
+i'm so happy for you	Thanks me too
+i'm so hungry	Haha me too
+i'm sorry	No I am sorry	Why sorry?	No worries love
+i'm sorry, i'm going to have to cancel.	No I am sorry	Why sorry?	No worries love
+is there anything i can do to help?	No, but thanks	No, but thanks for asking
+lunch?	Yes coming
+okay. lemme know as soon as you find out.	Any more questions?	It is done
+omg amazing	So amazing
+on my way	Okay see you soon	Cool, see you soon	Oh wow, ok
+oops, mistexted.	Oops	Haha, oh well	That was funny
+safe travels.	Thanks, love you too	Safe travels
+so sorry	So sorry
+sorry, i can't.	No worries at all	Sorry what?
+sorry, i can't do saturday	No worries at all
+thank you so much.	You are so welcome	You are so very welcome	You are most welcome
+thanks for coming	It was my pleasure
+thanks, this has been great.	Glad to help	So happy for you
+tomorrow would be ideal.	Yes it would
+tried calling	Try again?
+ugh, my flight is delayed.	Ugh indeed
+what are you guys up to tonight?	Nothing planned
+what day works best for you	Any day
+what do you want for dinner	Your call	Whatever is fine
+what time will you be home?	Not sure why
+where are you?!?	At my house
+wish you were here.	I wish the same	Me too honey
+you're amazing	You are too	You are amazing	I am
+you're marvelous	You are too
+you're the best.	I do my best	You are the best	Well, I try
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/nnapi/BUILD b/tensorflow/contrib/lite/nnapi/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..402f1e949b7bb576de4970a8ebb41541fcee1cb2
--- /dev/null
+++ b/tensorflow/contrib/lite/nnapi/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = [
+    "//visibility:public",
+])
+
+cc_library(
+    name = "nnapi_lib",
+    hdrs = [
+        "NeuralNetworksShim.h",
+    ],
+    linkopts = ["-ldl"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
new file mode 100644
index 0000000000000000000000000000000000000000..80668890786becd161f9fd07317970b199ddb044
--- /dev/null
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -0,0 +1,1916 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef NN_API_SHIM_H0
+#define NN_API_SHIM_H0
+
+#include <dlfcn.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// helpers
+
+#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__);
+#define LOAD_FUNCTION(name) \
+  static name##_fn fn = reinterpret_cast<name##_fn>(loadFunction(#name));
+#define EXECUTE_FUNCTION(...) \
+  if (fn != nullptr) {        \
+    fn(__VA_ARGS__);          \
+  }
+#define EXECUTE_FUNCTION_RETURN(...) return fn != nullptr ? fn(__VA_ARGS__) : 0;
+
+inline void* loadLibrary(const char* name) {
+  // TODO: change RTLD_LOCAL? Assumes there can be multiple instances of nn
+  // api RT
+  void* handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL);
+  if (handle == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", name);
+  }
+  return handle;
+}
+
+inline void* getLibraryHandle() {
+  static void* handle = loadLibrary("libneuralnetworks.so");
+  return handle;
+}
+
+inline void* loadFunction(const char* name) {
+  void* fn = nullptr;
+  if (getLibraryHandle() != nullptr) {
+    fn = dlsym(getLibraryHandle(), name);
+  }
+  if (fn == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open function %s", name);
+  }
+  return fn;
+}
+
+inline bool NNAPIExists() {
+  static bool nnapi_is_available = getLibraryHandle();
+  return nnapi_is_available;
+}
+
+// nn api types
+
+/**
+ * Operand types.
+ *
+ * The type of operands that can be added to a model.
+ *
+ * Although we define many types, most operators accept just a few
+ * types.  Most used are ANEURALNETWORKS_TENSOR_FLOAT32,
+ * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, and ANEURALNETWORKS_INT32.
+ */
+enum {
+  /** The following entries are used to declare scalars. */
+
+  /** A 32 bit floating point scalar value. */
+  ANEURALNETWORKS_FLOAT32 = 0,
+  /** A signed 32 bit integer scalar value. */
+  ANEURALNETWORKS_INT32 = 1,
+  /** An unsigned 32 bit integer scalar value. */
+  ANEURALNETWORKS_UINT32 = 2,
+
+  /** The following entries are used to declare tensors. */
+
+  /** A tensor of 32 bit floating point values. */
+  ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
+  /** A tensor of 32 bit integer values. */
+  ANEURALNETWORKS_TENSOR_INT32 = 4,
+  /** A tensor of 8 bit integers that represent real numbers.
+   *
+   * Attached to this tensor are two numbers that can be used to convert
+   * the 8 bit integer to the real value and vice versa.  These two numbers are:
+   * - scale: a 32 bit floating point value
+   * - zero_value: an 32 bit integer
+   *
+   * The formula is:
+   * real_value = (integer_value - zero_value) * scale.
+   */
+  ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
+};
+
+/**
+ * Operation types.
+ *
+ * The type of operations that can be added to a model.
+ */
+enum {
+  /** Adds two tensors, element-wise.
+   *
+   * Takes two input tensors of identical type and compatible dimensions. The
+   * output is the sum of both input tensors, optionally modified by an
+   * activation function.
+   *
+   * Two dimensions are compatible when:
+   *     1. they are equal, or
+   *     2. one of them is 1
+   *
+   * The size of the output is the maximum size along each dimension of the
+   * input operands. It starts with the trailing dimensions, and works its way
+   * forward.
+   *
+   * Example:
+   *
+   *     input1.dimension = {4, 1, 2}
+   *     input2.dimension = {5, 4, 3, 1}
+   *     output.dimension = {5, 4, 3, 2}
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: up to 4
+   *
+   * Inputs:
+   * * 0: A tensor.
+   * * 1: A tensor of the same type, and compatible dimensions as input0.
+   * * 2: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The sum, a tensor of the same type as input0.
+   */
+  ANEURALNETWORKS_ADD = 0,
+  /** Performs a 2-D average pooling operation.
+   *
+   * The output dimensions are functions of the filter dimensions, stride, and
+   * padding.
+   *
+   * The values in the output tensor are computed as:
+   *
+   *     output[batch, row, col, channel] =
+   *         sum_{i, j}(input[batch, row + i, col + j, channel]) / sum(1)
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
+   * input.
+   * * 1: An INT32 value, specifying the padding on the left, in the ‘width’
+   * dimension.
+   * * 2: An INT32 value, specifying the padding on the right,in the ‘width’
+   * dimension.
+   * * 3: An INT32 value, specifying the padding on the top, in the ‘height’
+   * dimension.
+   * * 4: An INT32 value, specifying the padding on the bottom, in the ‘height’
+   * dimension.
+   * * 5: An INT32 value, specifying the output stride in the ‘width’ dimension.
+   * * 6: An INT32 value, specifying the output stride in the ‘height’
+   * dimension.
+   * * 7: An INT32 value, specifying the filter width.
+   * * 8: An INT32 value, specifying the filter height.
+   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
+   * depth].
+   */
+  ANEURALNETWORKS_AVERAGE_POOL_2D = 1,
+  /** Concatenates the input tensors along the given dimension.
+   *
+   * The input tensors must have identical type and the same dimensions except
+   * the dimension along the concatenation axis.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4
+   *
+   * Inputs:
+   * 0 ~ n: The list on n input tensors, of shape [D0, D1, ..., Daxis(i), ...,
+   * Dm] n+1: An INT32 value, specifying the concatenation axis. n+2: An INT32
+   * value, and has to be one of the {@link FuseCode} values. Specifies the
+   * activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output, a tensor of the same type as the input tensors.
+   *      The output shape is [D0, D1, ..., sum(Daxis(i)), ..., Dm].
+   */
+  ANEURALNETWORKS_CONCATENATION = 2,
+  /** Performs an 2-D convolution operation.
+   *
+   * The CONV_2D op sweeps a 2-D filter that can mix channels together over a
+   * batch of images, applying the filter to each window of each image of the
+   * appropriate size.
+   *
+   * The output dimensions are functions of the filter dimensions, stride, and
+   * padding.
+   *
+   * The values in the output tensor are computed as:
+   *
+   *     output[batch, row, col, channel] =
+   *         sum_{i, j} (
+   *             input[batch, row + i, col + j, k] *
+   *             filter[channel, row + i, col + j, k] +
+   *             bias[channel]
+   *         )
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
+   * the input.
+   * * 1: A 4-D tensor, of shape [depth_out, filter_height, filter_width,
+   * depth_in], specifying the filter.
+   * * 2: A 1-D tensor, of shape [depth_out], specifying the bias.
+   *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32} type, the
+   * bias should also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input
+   * tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} type, the bias should
+   * be of {@link ANEURALNETWORKS_TENSOR_INT32}.
+   * * 3: An INT32 value, specifying the padding on the left, in the ‘width’
+   * dimension.
+   * * 4: An INT32 value, specifying the padding on the right,in the ‘width’
+   * dimension.
+   * * 5: An INT32 value, specifying the padding on the top, in the ‘height’
+   * dimension.
+   * * 6: An INT32 value, specifying the padding on the bottom, in the ‘height’
+   * dimension.
+   * * 7: An INT32 value, specifying the output stride in the ‘width’ dimension.
+   * * 8: An INT32 value, specifying the output stride in the ‘height’
+   * dimension.
+   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
+   * depth_out].
+   */
+  ANEURALNETWORKS_CONV_2D = 3,
+  /** Performs a depthwise 2-D convolution operation.
+   *
+   * Given an input tensor of shape [batches, height, width, depth_in] and a
+   * filter tensor of shape [depth_out, filter_height, filter_width, depth_in]
+   * containing in_channels convolutional filters of depth 1, DEPTHWISE_CONV
+   * applies a different filter to each input channel (expanding from 1 channel
+   * to channel_multiplier channels for each), then concatenates the results
+   * together.
+   *
+   * The output has depth_out = depth_in * depth_multiplier channels.
+   * The output dimensions are functions of the filter dimensions, stride, and
+   * padding.
+   *
+   * The values in the output tensor are computed as:
+   *
+   *     output[b, i, j, k * channel_multiplier + q] =
+   *         sum_{di, dj} (
+   *             input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+   *             filter[di, dj, k, q]
+   *         )
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
+   * the input.
+   * * 1: A 4-D tensor, of shape [depth_out, filter_height, filter_width,
+   * depth_in], specifying the filter.
+   * * 2: A 1-D tensor, of shape [depth_out], specifying the bias.
+   *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32} type, the
+   * bias should also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input
+   * tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} type, the bias should
+   * be of {@link ANEURALNETWORKS_TENSOR_INT32}.
+   * * 3: An INT32 value, specifying the padding on the left, in the ‘width’
+   * dimension.
+   * * 4: An INT32 value, specifying the padding on the right,in the ‘width’
+   * dimension.
+   * * 5: An INT32 value, specifying the padding on the top, in the ‘height’
+   * dimension.
+   * * 6: An INT32 value, specifying the padding on the bottom, in the ‘height’
+   * dimension.
+   * * 7: An INT32 value, specifying the output stride in the ‘width’ dimension.
+   * * 8: An INT32 value, specifying the output stride in the ‘height’
+   * dimension.
+   * * 9: An INT32 value, specifying the depthwise multiplier.
+   * * 10: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *       Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
+   * depth_out].
+   */
+  ANEURALNETWORKS_DEPTHWISE_CONV_2D = 4,
+  /** Rearranges data from depth into blocks of spatial data.
+   *
+   * More specifically, this op outputs a copy of the input tensor where values
+   * from the depth dimension are moved in spatial blocks to the height and
+   * width dimensions. The value block_size indicates the input block size and
+   * how the data is moved.
+   *
+   * Chunks of data of size block_size * block_size from depth are rearranged
+   * into non-overlapping blocks of size block_size x block_size.
+   *
+   * The width of the output tensor is input_depth * block_size, whereas the
+   * height is input_height * block_size. The depth of the input tensor must be
+   * divisible by block_size * block_size
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
+   * the input.
+   * * 1: An INT32 value, specifying the block_size. block_size must be >=1 and
+   *      block_size * block_size must be a divisor of the input depth.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batch, height*block_size,
+   * width*block_size, depth/(block_size*block_size)].
+   */
+  ANEURALNETWORKS_DEPTH_TO_SPACE = 5,
+  /** Dequantizes the input tensor.
+   *
+   * The formula is:
+   *
+   *     output = (input - zero_value) * scale.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4
+   *
+   * Inputs:
+   * * 0: A tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0, but with type
+   *      {@link ANEURALNETWORKS_TENSOR_FLOAT32}.
+   */
+  ANEURALNETWORKS_DEQUANTIZE = 6,
+
+  /**
+   * Looks up items from a given tensor.
+   *
+   * Each item in the output is a raw copy of the corresponding item in
+   * the input “values”. If the the given “lookup” indices are out of bounds,
+   * the op will fail and an error will be reported.
+   *
+   * Inputs:
+   * * 0: Values. An n-D tensor of any type X (where n >= 2). E.g., if n is 2,
+   *      then the shape would be [lookup_dimension, values_dimension], where
+   *      “lookup_dimension” corresponds to the indexing dimension in the lookup
+   *      table, and “values_dimension” to the contents.
+   * * 1: Lookups. An 1-D tensor of type T, of shape [lookup_size], where
+   *      “lookup_size” is the number of elements to look for, and each entry
+   *      corresponds to the first dimension of the “values” tensor.
+   *
+   * Output:
+   * * 0: A n-D tensor of type X and the same rank and shape as the “values”
+   *      tensor, except for the first dimension which has size “lookup_size”.
+   */
+  ANEURALNETWORKS_EMBEDDING_LOOKUP = 7,
+
+  /** Computes element-wise floor() on the input tensor.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: up to 4
+   *
+   * Inputs:
+   * * 0: A tensor.
+   *
+   * Outputs:
+   * * 0: The output, a tensor of the same type and dimensions as input0.
+   */
+  ANEURALNETWORKS_FLOOR = 8,
+  /** Denotes a fully (densely) connected layer, which connects all elements in
+   * the input tensor with each element in the output tensor.
+   *
+   * This layer implements the operation:
+   *
+   *     outputs = activation(inputs * weights’ + bias)
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the input. If rank is greater than 2, then it
+   * gets flattened to a 2-D Tensor. The 2-D Tensor is handled as if dimensions
+   * corresponded to shape [batch_size, input_size], where “batch_size”
+   * corresponds to the batching dimension, and “input_size” is the size of the
+   * input.
+   * * 1: A 2-D tensor, specifying the weights, of shape [num_units,
+   * input_size], where "num_units" corresponds to the number of output nodes.
+   * * 2: A 1-D tensor, of shape [num_units], specifying the bias.
+   *      For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32} type, the
+   * bias should also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input
+   * tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} type, the bias should
+   * be of {@link ANEURALNETWORKS_TENSOR_INT32}.
+   * * 3: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output tensor, of shape [batch_size, num_units].
+   */
+  ANEURALNETWORKS_FULLY_CONNECTED = 9,
+
+  /**
+   * Looks up values of a hash table with given keys.
+   *
+   * Inputs:
+   * * 0: Lookups. A 1-D int32 tensor with shape [ k ].
+   * * 1: Keys. A 1-D int32 tensor with shape [ n ], *MUST* be sorted in
+   *      ascending order.
+   * * 2: Values. A tensor with shape [ n … ].
+   *
+   * Outputs:
+   * * 0: Output. A tensor with shape [ k …].
+   * * 1: Hits. A uint8 tensor with shape [ k ] indicates whether the lookup
+   *      hits or not.
+   */
+  ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
+
+  /** Applies L2 normalization along the depth dimension.
+   *
+   * The values in the output tensor are computed as:
+   *
+   *     output[batch, row, col, channel] =
+   *         input[batch, row, col, channel] /
+   *         sqrt(sum_{c} pow(input[batch, row, col, c], 2))
+   *
+   * For x with more dimensions, independently normalizes each 1-D slice along
+   * dimension dim.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
+   * input.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
+   * depth].
+   */
+  ANEURALNETWORKS_L2_NORMALIZATION = 11,
+
+  /** Performs an 2-D L2 pooling operation.
+   *
+   * The output dimensions are functions of the filter dimensions, stride, and
+   * padding.
+   *
+   * The values in the output tensor are computed as:
+   *
+   *     output[batch, row, col, channel] =
+   *         sqrt(sum_{i, j} pow(input[batch, row + i, col + j, channel], 2) /
+   * sum(1))
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
+   * input.
+   * * 1: An INT32 value, specifying the padding on the left, in the ‘width’
+   * dimension.
+   * * 2: An INT32 value, specifying the padding on the right,in the ‘width’
+   * dimension.
+   * * 3: An INT32 value, specifying the padding on the top, in the ‘height’
+   * dimension.
+   * * 4: An INT32 value, specifying the padding on the bottom, in the ‘height’
+   * dimension.
+   * * 5: An INT32 value, specifying the output stride in the ‘width’ dimension.
+   * * 6: An INT32 value, specifying the output stride in the ‘height’
+   * dimension.
+   * * 7: An INT32 value, specifying the filter width.
+   * * 8: An INT32 value, specifying the filter height.
+   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
+   * depth].
+   */
+  ANEURALNETWORKS_L2_POOL_2D = 12,
+  /** Applies Local Response Normalization along the depth dimension.
+   *
+   * The 4-D input tensor is treated as a 3-D array of 1-D vectors (along the
+   * last dimension), and each vector is normalized independently. Within a
+   * given vector, each component is divided by the weighted, squared sum of
+   * inputs within depth_radius.
+   *
+   * The output is calculated using this formula:
+   *
+   *     sqr_sum[a, b, c, d] =
+   *         sum(pow(input[a, b, c, d - depth_radius : d + depth_radius + 1], 2)
+   *     output = input / pow((bias + alpha * sqr_sum), beta)
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
+   * input.
+   * * 1: An INT32 value, specifying the radius of the normalization window.
+   * * 2: A FLOAT32 value, specifying the bias, must not be zero.
+   * * 3: A FLOAT32 value, specifying the scale factor, alpha.
+   * * 4: A FLOAT32 value, specifying the exponent, beta.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION = 13,
+  /** Computes sigmoid activation on the input tensor element-wise.
+   *
+   * The output is calculated using this formula:
+   *
+   *     output = 1 / (1 + exp(-input))
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the input.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_LOGISTIC = 14,
+
+  /**
+   * Projects an input to a bit vector via locality senstive hashing.
+   *
+   * Inputs:
+   * * 0: Hash functions. Dim.size == 2, DataType: Float.
+   *            Tensor[0].Dim[0]: Number of hash functions.
+   *            Tensor[0].Dim[1]: Number of seeds per hash functions.
+   *            Tensor[0].Dim[1] <= 32 in sparse case.
+   *
+   * * 1: Input. Dim.size >= 1, no restriction on DataType.
+   * * 2: Weight. Optional. Dim.size == 1, DataType: Float.
+   *     If not set, each input element is considered to have the same weight of
+   *     1.0.
+   *     Tensor[1].Dim[0] == Tensor[2].Dim[0]
+   * * 3: Type:
+   *        Sparse: Value LSHProjectionType_SPARSE(=1).
+   *          Computed bit vector is considered to be sparse.
+   *          Each output element is an int32 made up of multiple bits computed
+   * from hash functions.
+   *
+   *        Dense: Value LSHProjectionType_DENSE(=2).
+   *          Computed bit vector is considered to be dense. Each output element
+   *          represents a bit and can take the value of either 0 or 1.
+   *
+   * Outputs:
+   * * 0: If the projection type is sparse:
+   *        Output.Dim == { Tensor[0].Dim[0] }
+   *        A tensor of int32 that represents hash signatures.
+   *      If the projection type is Dense:
+   *        Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
+   *        A flattened tensor that represents projected bit vectors.
+   */
+  ANEURALNETWORKS_LSH_PROJECTION = 15,
+
+  /**
+   * Long short-term memory unit (LSTM) recurrent network layer.
+   *
+   * The default non-peephole implementation is based on:
+   * http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf
+   * S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural
+   * Computation, 9(8):1735-1780, 1997.
+   *
+   * The peephole implementation is based on:
+   * https://research.google.com/pubs/archive/43905.pdf
+   * Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory
+   * recurrent neural network architectures for large scale acoustic modeling."
+   * INTERSPEECH, 2014.
+   *
+   * The coupling of input and forget gate (CIFG) is based on:
+   * http://arxiv.org/pdf/1503.04069.pdf
+   * Greff et al. "LSTM: A Search Space Odyssey"
+   *
+   * The class has the following independently optional inputs:
+   * * If input gate (if CIFG): “input_to_forget_weights”,
+   *   “recurrent_to_input_weights”, “cell_to_input_weights”, “input_gate_bias”.
+   * * If no peephole connections: “cell_to_input_weights”,
+   *   “cell_to_forget_weights”, “cell_to_output_weights”.
+   * * If no projection layer: “projection_weights” and “projection_bias”.
+   * * If no projection bias: “projection_bias”.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Inputs:
+   * * 0: Input.
+   *      A 2-D tensor of type T, of shape [batch_size, input_size], where
+   *      “batch_size” corresponds to the batching dimension, and “input_size”
+   *      is the size of the input.
+   * * 1: input_to_input_weights.
+   *      A 2-D tensor of type T, of shape [num_units, input_size], where
+   *      “num_units” corresponds to the number of cell units.
+   * * 2: input_to_forget_weights.
+   *      A 2-D tensor of type T, of shape [num_units, input_size].
+   * * 3: input_to_cell_weights.
+   *      A 2-D tensor of type T, of shape [num_units, input_size].
+   * * 4: input_to_output_weights.
+   *      A 2-D tensor of type T, of shape [num_units, input_size].
+   * * 5: recurrent_to_input_weights.
+   *      A 2-D tensor of type T, of shape [num_units, output_size], where
+   *      “output_size” corresponds to either the number of cell units (i.e.,
+   *      “num_units”), or the second dimension of the “projection_weights”, if
+   *      defined.
+   * * 6: recurrent_to_forget_weights.
+   *      A 2-D tensor of type T, of shape [num_units, output_size].
+   * * 7: recurrent_to_cell_weights.
+   *      A 2-D tensor of type T, of shape [num_units, output_size].
+   * * 8: recurrent_to_output_weights.
+   *      A 2-D tensor of type T, of shape [num_units, output_size].
+   * * 9: cell_to_input_weights.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 10:cell_to_forget_weights.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 11:cell_to_output_weights.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 12:input_gate_bias.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 13:forget_gate_bias.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 14:cell_bias.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 15:output_gate_bias.
+   *      A 1-D tensor of type T, of shape [num_units].
+   * * 16:projection_weights.
+   *      A 2-D tensor of type T, of shape [output_size, num_units].
+   * * 17:projection_bias.
+   *      A 1-D tensor of type T, of shape [output_size].
+   *
+   * Parameters:
+   * * 18:fused_activation_function.
+   *      An (optional) ActivationFunctionType indicating the activation
+   *      function.
+   *      If “NONE” is specified then it results in a linear activation.
+   * * 19:cell_clip.
+   *      A clipping threshold for the cell state, such that values are bound
+   *      within [-cell_clip, cell_clip]. If set to 0.0 then clipping is
+   *      disabled.
+   * * 20:proj_clip.
+   *      A clipping threshold for the output from the projection layer, such
+   *      that values are bound within [-proj_clip, proj_clip]. If set to 0.0
+   *      then clipping is disabled.
+   *
+   * Outputs:
+   * * 0: scratch_buffer.
+   *      A 3-D tensor of type T, of shape [batch_size, num_cell, 4].
+   * * 1: output_state.
+   *      A 2-D tensor of type T, of shape [batch_size, output_size].
+   * * 2: cell_state.
+   *      A 2-D tensor of type T, of shape [batch_size, num_units].
+   * * 3: output.
+   *      A 2-D tensor of type T, of shape [batch_size, output_size]. This is
+   *      effectively the same as the current “output_state” value.
+   */
+  ANEURALNETWORKS_LSTM = 16,
+
+  /** Performs an 2-D max pooling operation.
+   *
+   * The output dimensions are functions of the filter dimensions, stride, and
+   * padding.
+   *
+   * The values in the output tensor are computed as:
+   *
+   *     output[batch, row, col, channel] =
+   *         max_{i, j} (input[batch, row + i, col + j, channel])
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
+   * input.
+   * * 1: An INT32 value, specifying the padding on the left, in the ‘width’
+   * dimension.
+   * * 2: An INT32 value, specifying the padding on the right,in the ‘width’
+   * dimension.
+   * * 3: An INT32 value, specifying the padding on the top, in the ‘height’
+   * dimension.
+   * * 4: An INT32 value, specifying the padding on the bottom, in the ‘height’
+   * dimension.
+   * * 5: An INT32 value, specifying the output stride in the ‘width’ dimension.
+   * * 6: An INT32 value, specifying the output stride in the ‘height’
+   * dimension.
+   * * 7: An INT32 value, specifying the filter width.
+   * * 8: An INT32 value, specifying the filter height.
+   * * 9: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, out_height, out_width,
+   * depth].
+   */
+  ANEURALNETWORKS_MAX_POOL_2D = 17,
+
+  /** Multiplies two tensors, element-wise.
+   *
+   * Takes two input tensors of identical type and compatible dimensions. The
+   * output is the product of both input tensors, optionally modified by an
+   * activation function.
+   *
+   * Two dimensions are compatible when:
+   *     1. they are equal, or
+   *     2. one of them is 1
+   *
+   * The size of the resulting output is the maximum size along each dimension
+   * of the input operands. It starts with the trailing dimensions, and works
+   * its way forward.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: up to 4
+   *
+   * Inputs:
+   * * 0: A tensor.
+   * * 1: A tensor of the same type, and compatible dimensions as input0.
+   * * 2: An INT32 value, and has to be one of the {@link FuseCode} values.
+   *      Specifies the activation to invoke on the result of each addition.
+   *
+   * Outputs:
+   * * 0: The product, a tensor of the same type as input0.
+   */
+  ANEURALNETWORKS_MUL = 18,
+  /** Computes rectified linear activation on the input tensor element-wise.
+   *
+   * The output is calculated using this formula:
+   *
+   *     output = max(0, input)
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the input.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_RELU = 19,
+  /** Computes rectified linear 1 activation on the input tensor element-wise.
+   *
+   * The output is calculated using this formula:
+   *
+   *     output = min(1.f, max(-1.f, input))
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the input.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_RELU1 = 20,
+  /** Computes rectified linear 6 activation on the input tensor element-wise.
+   *
+   * The output is calculated using this formula:
+   *
+   *     output = min(6, max(0, input))
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the input.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_RELU6 = 21,
+  /** Reshapes a tensor.
+   *
+   * Given tensor, this operation returns a tensor that has the same values as
+   * tensor, but with a newly specified shape.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the tensor to be reshaped.
+   * * 1: A 1-D tensor of type {@link ANEURALNETWORKS_TENSOR_INT32}, defining
+   * the shape of the output tensor. The number of elements implied by shape
+   * must be the same as the number of elements in the input tensor.
+   *
+   * Outputs:
+   * * 0: The output tensor, of shape specified by the input shape.
+   */
+  ANEURALNETWORKS_RESHAPE = 22,
+  /** Resizes images to given size using the bilinear interpretation.
+   *
+   * Resized images will be distorted if their original aspect ratio is not the
+   * same as input.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying the
+   * input.
+   * * 1: An INT32 value, specifying the output width of the output tensor.
+   * * 2: An INT32 value, specifying the output height of the output tensor.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batches, new_height, new_width,
+   * depth].
+   */
+  ANEURALNETWORKS_RESIZE_BILINEAR = 23,
+
+  /**
+   * A basic recurrent neural network layer.
+   *
+   * This layer implements the operation:
+   * outputs = state = activation(inputs * input_weights + state *
+   * recurrent_weights + bias)
+   *
+   * Where:
+   * * “input_weights” is a weight matrix that multiplies the inputs;
+   * * “recurrent_weights” is a weight matrix that multiplies the current
+   *    “state” which itself is the output from the previous time step
+   *    computation;
+   * * “bias” is a bias vector (added to each output vector in the batch);
+   * * “activation” is the function passed as the “fused_activation_function”
+   *   argument (if not “NONE”).
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Inputs:
+   * * 0: input.
+   *      A 2-D tensor of type T, of shape [batch_size, input_size], where
+   *      “batch_size” corresponds to the batching dimension, and “input_size”
+   * is the size of the input.
+   * * 1: weights.
+   *      A 2-D tensor of type T, of shape [num_units, input_size], where
+   *      “num_units” corresponds to the number of units.
+   * * 2: recurrent_weights.
+   *      A 2-D tensor of type T, of shape [num_units, num_units], with columns
+   *      corresponding to the weights from each unit.
+   * * 3: bias.
+   *      A 1-D tensor of type T, of shape [num_units].
+   *
+   *    For FLOAT32 input tensor, bias must also be FLOAT32.
+   *    For UINT8 input tensor, bias must be INT32.
+   *
+   * Parameters
+   * * 4: fused_activation_function.
+   *      An (optional) ActivationFunctionType indicating the activation
+   *      function. If “NONE” is specified then it results in a linear
+   *      activation.
+   *
+   * * 5: Hidden state.
+   *      A 2-D tensor of type T, of shape [batch_size, num_units].
+   *
+   * Outputs:
+   * * 0: output.
+   *      A 2-D tensor of type T, of shape [batch_size, num_units]. This is
+   *      effectively the same as the current state value.
+   */
+  ANEURALNETWORKS_RNN = 24,
+
+  /** Computes the softmax activation on the input tensor element-wise, per
+   * batch, by normalizing the input vector so the maximum coefficient is zero.
+   *
+   * The output is calculated using this formula:
+   *
+   *     output[batch, i] =
+   *         exp((input[batch, i] - max(input[batch, :])) * beta) /
+   *         sum_{k}{exp((input[batch, k] - max(input[batch, :])) * beta)}
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 2 or 4.
+   *
+   * Inputs:
+   * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped.
+   * * 1: A FLOAT32 value, specifying the scaling factor for the exponent, beta.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_SOFTMAX = 25,
+
+  /** Rearranges blocks of spatial data, into depth.
+   *
+   * More specifically, this op outputs a copy of the input tensor where values
+   * from the height and width dimensions are moved to the depth dimension. The
+   * value block_size indicates the input block size and how the data is moved.
+   *
+   * Chunks of data of size block_size * block_size from depth are rearranged
+   * into non-overlapping blocks of size block_size x block_size.
+   *
+   * The depth of the output tensor is input_depth * block_size * block_size.
+   * The input tensor's height and width must be divisible by block_size.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}
+   *
+   * Supported tensor rank: 4, with "NHWC" data layout.
+   *
+   * Inputs:
+   * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], specifying
+   * the input.
+   * * 1: An INT32 value, specifying the block_size. block_size must be >=1 and
+   *      block_size must be a divisor of both the input height and width.
+   *
+   * Outputs:
+   * * 0: The output 4-D tensor, of shape [batch, height/block_size,
+   * width/block_size, depth*block_size*block_size].
+   */
+  ANEURALNETWORKS_SPACE_TO_DEPTH = 26,
+
+  /**
+   * SVDF op is a kind of stateful layer derived from the notion that a
+   * densely connected layer that's processing a sequence of input frames can
+   * be approximated by using a singular value decomposition of each of its
+   * nodes. The implementation is based on:
+   *
+   * https://research.google.com/pubs/archive/43813.pdf
+   *
+   * P. Nakkiran, R. Alvarez, R. Prabhavalkar, C. Parada.
+   * “Compressing Deep Neural Networks using a Rank-Constrained Topology”.
+   * INTERSPEECH, 2015.
+   *
+   * It processes the incoming input using a 2-stage filtering mechanism:
+   * * stage 1 performs filtering on the "features" dimension, whose outputs get
+   *   pushed into a memory of fixed-size memory_size.
+   * * stage 2 performs filtering on the "time" dimension of the memory_size
+   *   memoized outputs of stage 1.
+   *
+   * Specifically, for rank 1, this layer implements the operation:
+   *
+   *    memory = push(conv1d(inputs, weights_feature, feature_dim, "VALID"));
+   *    outputs = activation(memory * weights_time + bias);
+   *
+   * Where:
+   * * “weights_feature” is a weights matrix that processes the inputs (by
+   *   convolving the input with every “feature filter”), and whose outputs get
+   *   pushed, stacked in order, into the fixed-size “memory” (the oldest entry
+   *   gets dropped);
+   * * “weights_time” is a weights matrix that processes the “memory” (by a
+   *   batched matrix multiplication on the num_units);
+   * * “bias” is an optional bias vector (added to each output vector in the
+   *   batch); and
+   * * “activation” is the function passed as the “fused_activation_function”
+   *   argument (if not “NONE”).
+   *
+   * Each rank adds a dimension to the weights matrices by means of stacking
+   * the filters.
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Inputs:
+   * * 0: input.
+   *      A 2-D tensor of type T, of shape [batch_size, input_size], where
+   *      “batch_size” corresponds to the batching dimension, and “input_size”
+   * is the size of the input.
+   * * 1: weights_feature.
+   *      A 2-D tensor of type T, of shape [num_units, input_size], where
+   *      “num_units” corresponds to the number of units.
+   * * 2: weights_time.
+   *      A 2-D tensor of type T, of shape [num_units, memory_size], where
+   *      “memory_size” corresponds to the fixed-size of the memory.
+   * * 3: bias.
+   *      A optional 1-D tensor of type T, of shape [num_units].
+   *
+   *    For FLOAT32 input tensor, bias must also be FLOAT32.
+   *    For UINT8 input tensor, bias must be INT32.
+   *
+   * Parameters:
+   * * 4: rank.
+   *      The rank of the SVD approximation.
+   * * 5: fused_activation_function.
+   *      An (optional) ActivationFunctionType indicating the activation
+   * function. If “NONE” is specified then it results in a linear activation.
+   *
+   * Outputs:
+   * * 0: state.
+   *      A 2-D tensor of type T, of shape [batch_size, (memory_size - 1) *
+   * num_units * rank].
+   * * 1: output.
+   *      A 2-D tensor of type T, of shape [batch_size, num_units].
+   */
+  ANEURALNETWORKS_SVDF = 27,
+
+  /** Computes hyperbolic tangent of input tensor element-wise.
+   *
+   * The output is calculated using this formula:
+   *
+   *     output = tanh(input)
+   *
+   * Supported tensor types:
+   * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+   *
+   * Supported tensor rank: up to 4.
+   *
+   * Inputs:
+   * * 0: A tensor, specifying the input.
+   *
+   * Outputs:
+   * * 0: The output tensor of same shape as input0.
+   */
+  ANEURALNETWORKS_TANH = 28,
+};
+
+/**
+ * Fused activation function types.
+ *
+ */
+enum {
+  /** NO fused activation function. */
+  ANEURALNETWORKS_FUSED_NONE = 0,
+  /** Fused ReLU activation function. */
+  ANEURALNETWORKS_FUSED_RELU = 1,
+  /** Fused ReLU1 activation function. */
+  ANEURALNETWORKS_FUSED_RELU1 = 2,
+  /** Fused ReLU6 activation function. */
+  ANEURALNETWORKS_FUSED_RELU6 = 3,
+};
+
+/**
+ * Execution preferences.
+ */
+enum {
+  /**
+   * Prefer executing in a way that minimizes battery drain.
+   * This is desirable for compilations that will be executed often.
+   */
+  ANEURALNETWORKS_PREFER_LOW_POWER = 0,
+  /**
+   * Prefer returning a single answer as fast as possible, even if this causes
+   * more power consumption.
+   */
+  ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
+  /**
+   * Prefer maximizing the throughput of successive frames, for example when
+   * processing successive frames coming from the camera.
+   */
+  ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
+};
+
+/**
+ * Result codes.
+ */
+enum {
+  ANEURALNETWORKS_NO_ERROR = 0,
+  ANEURALNETWORKS_OUT_OF_MEMORY = 1,
+  ANEURALNETWORKS_INCOMPLETE = 2,
+  ANEURALNETWORKS_UNEXPECTED_NULL = 3,
+  ANEURALNETWORKS_BAD_DATA = 4,
+  ANEURALNETWORKS_OP_FAILED = 5,
+  ANEURALNETWORKS_UNMAPPABLE = 5,
+  ANEURALNETWORKS_BAD_STATE = 6,
+};
+
+/**
+ * ANeuralNetworksMemory is an opaque type that represents memory.
+ *
+ * This type is used to represent shared memory, memory mapped files,
+ * and similar memories.
+ *
+ * By using shared memory, a program can efficiently communicate to the
+ * runtime and drivers the tensors that define a model. See
+ * {@link ANeuralNetworksModel_setOperandValueFromMemory}. An application
+ * should typically create one shared memory object that contains every tensor
+ * needed to define a model. {@link ANeuralNetworksMemory_createFromFd} can be
+ * used to create shared memory from a file handle. {@link
+ * ANeuralNetworksMemory_createShared} can be used to directly created shared
+ * memory.
+ *
+ * Memory objects can also be used to specify the input and output arguments of
+ * an execution. See {@link ANeuralNetworksExecution_setInputFromMemory}
+ * and {@link ANeuralNetworksExecution_setOutputFromMemory}.
+ */
+typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
+
+/**
+ * ANeuralNetworksModel is an opaque type that contains a description of the
+ * mathematical operations that constitute the model.
+ *
+ * <p>The model will be built by calling<ul>
+ * <li>{@link ANeuralNetworksModel_create},</li>
+ * <li>{@link ANeuralNetworksModel_addOperation},</li>
+ * <li>{@link ANeuralNetworksModel_addOperand},</li>
+ * </ul>
+ *
+ * A model is completed by calling {@link ANeuralNetworksModel_finish}.
+ * A model is destroyed by calling {@link ANeuralNetworksModel_free}.
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies a model at a given time. It is however safe for more than one
+ * thread to use the model once {@link ANeuralNetworksModel_finish} has
+ * returned.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the model after calling {@link ANeuralNetworksModel_free}. This
+ * includes any compilation or execution object created using the model.</p>
+ */
+typedef struct ANeuralNetworksModel ANeuralNetworksModel;
+
+/**
+ * ANeuralNetworksCompilation is an opaque type that can be used to compile
+ * a machine learning model.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new compilation instance by calling the
+ *        {@link ANeuralNetworksCompilation_create} function.</li>
+ *    <li>Perform the compilation with {@link
+ * ANeuralNetworksCompilation_start}.</li> <li>Wait for the compilation to
+ * complete with {@link ANeuralNetworksCompilation_wait}.</li> <li>Use the
+ * compilation as many times as needed with {@link
+ * ANeuralNetworksExecution_create}.</li> <li>Destroy the compilation with
+ * {@link ANeuralNetworksCompilation_free} once all executions using the
+ * compilation have completed.</li></ul></p>
+ *
+ * <p>A compilation cannot be modified once {@link
+ * ANeuralNetworksCompilation_start} has been called on it.</p>
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies a compilation at a given time. It is however safe for more than one
+ * thread to use {@link ANeuralNetworksCompilation_wait} at the same time.
+ * It is also safe for multiple threads to use a compilation object once
+ * {@link ANeuralNetworksCompilation_wait} has completed.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the compilation after calling {@link
+ * ANeuralNetworksCompilation_free}. This includes any execution object created
+ * using the compilation.</p>
+ */
+typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
+
+/**
+ * ANeuralNetworksExecution is an opaque type that can be used to apply a
+ * machine learning model to a set of inputs.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new execution instance by calling the
+ *        {@link ANeuralNetworksExecution_create} function.</li>
+ *    <li>Associate data to the model inputs with
+ *        {@link ANeuralNetworksExecution_setInput} or
+ *        {@link ANeuralNetworksExecution_setInputFromMemory}.</li>
+ *    <li>Associate output buffers to the model outputs with
+ *        {@link ANeuralNetworksExecution_setOutput} or
+ *        {@link ANeuralNetworksExecution_setOutputFromMemory}.</li>
+ *    <li>Apply the model with {@link
+ * ANeuralNetworksExecution_startCompute}.</li> <li>Wait for the execution to
+ * complete with {@link ANeuralNetworksExecution_wait}.</li> <li>Destroy the
+ * execution with
+ *        {@link ANeuralNetworksExecution_free}.</li></ul></p>
+ *
+ * <p>An execution cannot be modified once {@link
+ * ANeuralNetworksExecution_start} has been called on it.</p>
+ *
+ * <p>An execution can be applied to a model with
+ * {@link ANeuralNetworksExecution_startCompute} only once. Create new
+ * executions to do new evaluations of the model.</p>
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies an execution at a given time. It is however safe for more than one
+ * thread to use {@link ANeuralNetworksExecution_wait} at the same time.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the request after calling {@link
+ * ANeuralNetworksRequest_free}.</p>
+ */
+typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
+
+/**
+ * ANeuralNetworksOperandType describes the type of an operand.
+ * This structure is used to describe both scalars and tensors.
+ */
+typedef struct ANeuralNetworksOperandType {
+  /** The data type, e.g ANEURALNETWORKS_INT8. */
+  int32_t type;
+  /** The number of dimensions. It should be 0 for scalars. */
+  uint32_t dimensionCount;
+  /** The dimensions of the tensor. It should be nullptr for scalars. */
+  const uint32_t* dimensions;
+  /** These two fields are only used for quantized tensors.
+   * They should be zero for scalars and non-fixed point tensors.
+   * The dequantized value of each entry is (value - offset) * scale.
+   */
+  float scale;
+  int32_t zeroPoint;
+} ANeuralNetworksOperandType;
+
+/**
+ * ANeuralNetworksEvent is an opaque type that represents an event
+ * that will be signaled once an execution completes.
+ */
+typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
+
+typedef int32_t ANeuralNetworksOperationType;
+
+// nn api function types
+
+typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
+    size_t size, int protect, int fd, size_t offset,
+    ANeuralNetworksMemory** memory);
+
+typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory);
+
+typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model);
+
+typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model);
+
+typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model);
+
+typedef int (*ANeuralNetworksCompilation_create_fn)(
+    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
+
+typedef void (*ANeuralNetworksCompilation_free_fn)(
+    ANeuralNetworksCompilation* compilation);
+
+typedef int (*ANeuralNetworksCompilation_setPreference_fn)(
+    ANeuralNetworksCompilation* compilation, int32_t preference);
+
+typedef int (*ANeuralNetworksCompilation_finish_fn)(
+    ANeuralNetworksCompilation* compilation);
+
+typedef int (*ANeuralNetworksModel_addOperand_fn)(
+    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
+
+typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
+    ANeuralNetworksModel* model, int32_t index, const void* buffer,
+    size_t length);
+
+typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksModel_addOperation_fn)(
+    ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
+    uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
+    const uint32_t* outputs);
+
+typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
+    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+    uint32_t outputCount, const uint32_t* outputs);
+
+typedef int (*ANeuralNetworksExecution_create_fn)(
+    ANeuralNetworksCompilation* compilation,
+    ANeuralNetworksExecution** execution);
+
+typedef void (*ANeuralNetworksExecution_free_fn)(
+    ANeuralNetworksExecution* execution);
+
+typedef int (*ANeuralNetworksExecution_setInput_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setOutput_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, void* buffer, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksExecution_startCompute_fn)(
+    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
+
+typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event);
+
+typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
+
+/**
+ * Creates a shared memory object from a file descriptor.
+ *
+ * The shared memory is backed by a file descriptor via mmap.
+ * See {@link ANeuralNetworksMemory} for a description on how to use
+ * this shared memory.
+ *
+ * @param size The requested size in bytes.
+ *             Must not be larger than the file size.
+ * @param prot The desired memory protection for the mapping.
+ *             It is either PROT_NONE or the bitwise OR of one or
+ *             more of the following flags: PROT_READ, PROT_WRITE.
+ * @param fd The requested file descriptor.
+ *           The file descriptor has to be mmap-able. The file
+ *           descriptor will be duplicated.
+ * @param offset The offset to the beginning of the file of the area to map.
+ *               The offset has to be aligned to a page size.
+ * @param memory The memory object to be created.
+ *               Set to NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+ */
+inline int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd,
+                                              size_t offset,
+                                              ANeuralNetworksMemory** memory) {
+  LOAD_FUNCTION(ANeuralNetworksMemory_createFromFd);
+  EXECUTE_FUNCTION_RETURN(size, protect, fd, offset, memory);
+}
+
+/**
+ * Delete a memory object.
+ *
+ * Destroys the object used by the run time to keep track of the memory.
+ * This will free the underlying actual memory if no other code has open
+ * handles to this memory.
+ *
+ * @param memory The memory object to be freed.
+ */
+inline void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) {
+  LOAD_FUNCTION(ANeuralNetworksMemory_free);
+  EXECUTE_FUNCTION(memory);
+}
+
+/**
+ * Create an empty {@link ANeuralNetworksModel}.
+ *
+ * <p>This only creates the object. Computation is performed once
+ * {@link ANeuralNetworksExecution_startCompute} is invoked.
+ *
+ * The model should be constructed with calls to
+ * {@link ANeuralNetworksModel_addOperation} and
+ * {@link ANeuralNetworksModel_addOperand}
+ *
+ * <p>{@link ANeuralNetworksModel_finish} should be called once the model
+ * has been fully constructed.</p>
+ *
+ * <p>{@link ANeuralNetworksModel_free} should be called once the model
+ * is no longer needed.</p>
+ *
+ * @param model The {@link ANeuralNetworksModel} to be created.
+ *              Set to NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_create(ANeuralNetworksModel** model) {
+  LOAD_FUNCTION(ANeuralNetworksModel_create);
+  EXECUTE_FUNCTION_RETURN(model);
+}
+
+/**
+ * Destroy a model.
+ *
+ * The model need not have been finished by a call to
+ * {@link ANeuralNetworksModel_finish}.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * @param model The model to be destroyed. Passing NULL is acceptable and
+ *              results in no operation.
+ */
+inline void ANeuralNetworksModel_free(ANeuralNetworksModel* model) {
+  LOAD_FUNCTION(ANeuralNetworksModel_free);
+  EXECUTE_FUNCTION(model);
+}
+
+/**
+ * Indicate that we have finished modifying a model. Required before
+ * calling {@link ANeuralNetworksCompilation_compile}.
+ *
+ * An application is responsible to make sure that no other thread uses
+ * the model at the same time.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * @param model The model to be finished.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) {
+  LOAD_FUNCTION(ANeuralNetworksModel_finish);
+  EXECUTE_FUNCTION_RETURN(model);
+}
+
+/**
+ * Add an operand to a model.
+ *
+ * The order in which the operands are added is important. The first one added
+ * to a model will have the index value 0, the second 1, etc. These indexes are
+ * used as operand identifiers in {@link ANeuralNetworksModel_addOperation},
+ * {@link ANeuralNetworksExecution_setInput},
+ * {@link ANeuralNetworksExecution_setInputFromMemory},
+ * {@link ANeuralNetworksExecution_setOutput},
+ * {@link ANeuralNetworksExecution_setOutputFromMemory} and
+ * {@link ANeuralNetworksExecution_setOperandValue}.
+ *
+ * To build a model that can accommodate inputs of various sizes, as you may
+ * want to do for a CNN, set the size of the dimensions that will vary at run
+ * time to 0. If you do so, provide the full dimensions when calling
+ * {@link ANeuralNetworksExecution_setInput} or {@link
+ * ANeuralNetworksExecution_setInputFromMemory}.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * @param model The model to be modified.
+ * @param type The {@link ANeuralNetworksOperandType} that describes the shape
+ * of the operand.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_addOperand(
+    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type) {
+  LOAD_FUNCTION(ANeuralNetworksModel_addOperand);
+  EXECUTE_FUNCTION_RETURN(model, type);
+}
+
+/**
+ * Sets an operand to a constant value.
+ *
+ * For scalar values, the content of buffer is copied into the model.
+ *
+ * For tensor values, a pointer to the buffer is stored within the model.
+ * The application is responsible for not changing the content of this region
+ * until all executions using this model have completed. As the data may
+ * be copied during processing, modifying the data after this call yields
+ * undefined results.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * @param model The model to be modified.
+ * @param index The index of the model operand we're setting.
+ * @param buffer A pointer to the data to use.
+ * @param length The size in bytes of the data value.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model,
+                                                int32_t index,
+                                                const void* buffer,
+                                                size_t length) {
+  LOAD_FUNCTION(ANeuralNetworksModel_setOperandValue);
+  EXECUTE_FUNCTION_RETURN(model, index, buffer, length);
+}
+
+/**
+ * Sets an operand to a value stored in a memory object.
+ *
+ * The content of the memory is not copied. A reference to that memory is stored
+ * inside the model. The application is responsible for not changing the content
+ * of the memory region until all executions using this model have completed.
+ * As the data may be copied during processing, modifying the data after this
+ * call yields undefined results.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * @param model The model to be modified.
+ * @param index The index of the model operand we're setting.
+ * @param buffer A pointer to the data to use.
+ * @param memory The memory containing the data.
+ * @param offset This specifies the location of the data within the memory.
+ *               The offset is in bytes from the start of memory.
+ * @param length The size in bytes of the data value.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_setOperandValueFromMemory(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
+  LOAD_FUNCTION(ANeuralNetworksModel_setOperandValueFromMemory);
+  EXECUTE_FUNCTION_RETURN(model, index, memory, offset, length);
+}
+
+/**
+ * Add an operation to a model.
+ *
+ * @param model The model to be modified.
+ * @param type The type of the operation.
+ * @param inputCount The number of entries in the inputs array.
+ * @param inputs An array of indexes identifying each operand.
+ * @param outputCount The number of entries in the outputs array.
+ * @param outputs An array of indexes identifying each operand.
+ *
+ * The operands specified by inputs and outputs must have been
+ * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model,
+                                             ANeuralNetworksOperationType type,
+                                             uint32_t inputCount,
+                                             const uint32_t* inputs,
+                                             uint32_t outputCount,
+                                             const uint32_t* outputs) {
+  LOAD_FUNCTION(ANeuralNetworksModel_addOperation);
+  EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount,
+                          outputs);
+}
+
+/**
+ * Specfifies which operands will be the model's inputs and outputs.
+ *
+ * An operand cannot be used for both input and output. Doing so will
+ * return an error.
+ *
+ * @param model The model to be modified.
+ * @param inputCount The number of entries in the inputs array.
+ * @param inputs An array of indexes identifying the input operands.
+ * @param outputCount The number of entries in the outputs array.
+ * @param outputs An array of indexes identifying the output operands.
+ *
+ * The operands specified by inputs and outputs must have been
+ * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+ *
+ * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+ * been called will return an error.
+ *
+ * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+ *
+ */
+inline int ANeuralNetworksModel_identifyInputsAndOutputs(
+    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+    uint32_t outputCount, const uint32_t* outputs) {
+  LOAD_FUNCTION(ANeuralNetworksModel_identifyInputsAndOutputs);
+  EXECUTE_FUNCTION_RETURN(model, inputCount, inputs, outputCount, outputs);
+}
+
+/**
+ * Create a {@link ANeuralNetworksCompilation} to compile the given model.
+ * This only creates the object. Compilation is only performed once
+ * {@link ANeuralNetworksCompilation_start} is invoked.
+ *
+ * <p>The provided model must outlive the compilation.</p>
+ *
+ * The model must already have been finished by a call to
+ * {@link ANeuralNetworksModel_finish}.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded
+ * usage.
+ *
+ * @param model The {@link ANeuralNetworksModel} to be compiled.
+ * @param compilation The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+ *         if the model is invalid.
+ */
+inline int ANeuralNetworksCompilation_create(
+    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_create);
+  EXECUTE_FUNCTION_RETURN(model, compilation);
+}
+
+/**
+ * Destroy a compilation.
+ *
+ * <p>If called on a compilation for which
+ * {@link ANeuralNetworksCompilation_start} has been called, the
+ * function will return immediately but will mark the compilation to be deleted
+ * once the compilation completes. The {@link ANeuralNetworksCompilation_wait}
+ * will return ERROR_DELETED.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded
+ * usage.
+ *
+ * @param compilation The compilation to be destroyed. Passing NULL is
+ * acceptable and results in no operation.
+ */
+inline void ANeuralNetworksCompilation_free(
+    ANeuralNetworksCompilation* compilation) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_free);
+  EXECUTE_FUNCTION(compilation);
+}
+
+/**
+ * Sets the execution preference.
+ *
+ * <p>Provides guidance to the runtime when trade-offs are possible.</p>
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded
+ * usage.
+ *
+ * @param compilation The compilation to be modified.
+ * @param preference Either {@link PREFER_LOW_POWER},
+ *                  {@link PREFER_SINGLE_FAST_ANSWER}, or
+ *                  {@link PREFER_SUSTAINED_SPEED}.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksCompilation_setPreference(
+    ANeuralNetworksCompilation* compilation, int32_t preference) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_setPreference);
+  EXECUTE_FUNCTION_RETURN(compilation, preference);
+}
+
+/**
+ * Waits until the compilation completes.
+ *
+ * More than one thread can wait on a compilation. When the compilation
+ * completes, all threads will be released.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded
+ * usage.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally.
+ */
+inline int ANeuralNetworksCompilation_finish(
+    ANeuralNetworksCompilation* compilation) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_finish);
+  EXECUTE_FUNCTION_RETURN(compilation);
+}
+/**
+ * Create a {@link ANeuralNetworksExecution} to apply the given compilation.
+ * This only creates the object. Computation is only performed once
+ * {@link ANeuralNetworksExecution_startCompute} is invoked.
+ *
+ * <p>The provided compilation must outlive the execution.</p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+ * @param execution The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+ *         if the compilation is invalid.
+ */
+inline int ANeuralNetworksExecution_create(
+    ANeuralNetworksCompilation* compilation,
+    ANeuralNetworksExecution** execution) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_create);
+  EXECUTE_FUNCTION_RETURN(compilation, execution);
+}
+
+/**
+ * Destroy an execution.
+ *
+ * <p>If called on an execution for which
+ * {@link ANeuralNetworksExecution_startCompute} has been called, the
+ * function will return immediately but will mark the execution to be deleted
+ * once the computation completes.   The {link ANeuralNetworksExecution_wait}
+ * will return ANEURALNETWORKS_ERROR_DELETED.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be destroyed. Passing NULL is acceptable
+ * and results in no operation.
+ */
+inline void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_free);
+  EXECUTE_FUNCTION(execution);
+}
+
+/**
+ * Associate a user buffer with an input of the model of the
+ * {@link ANeuralNetworksExecution}.
+ *
+ * <p>The provided buffer must outlive the execution.</p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be modified.
+ * @param index The index of the input argument we are setting. It is
+ *              an index into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link
+ * ANeuralNetworksModel_addOperand}.
+ * @param type The type of the operand. This should be used to specify the
+ *             dimensions that were set to 0 when the operand was added to the
+ *             model. All other properties of the type must be the same as
+ *             specified in the model. If the type is the same as specified
+ *             when the model was built, NULL can be passed.
+ * @param buffer The buffer containing the data.
+ * @param length The length in bytes of the buffer.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+ * the name is not recognized or the buffer is too small for the input.
+ */
+inline int ANeuralNetworksExecution_setInput(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const void* buffer, size_t length) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_setInput);
+  EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length);
+}
+
+/**
+ * Associate part of a memory object with an input of the model of the
+ * {@link ANeuralNetworksExecution}.
+ *
+ * <p>The provided memory must outlive the execution.</p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be modified.
+ * @param index The index of the input argument we are setting. It is
+ *              an index into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link
+ * ANeuralNetworksModel_addOperand}.
+ * @param type The type of the operand. This can be used to specify the
+ *             dimensions that were set to 0 when the operand was added to the
+ *             model. All other values must be the same as specified in the
+ *             model. If the type is the same as specified when the model
+ *             was built, NULL can be passed.
+ * @param memory The memory containing the data.
+ * @param offset This specifies the location of the data whithin the memory.
+ *               The offset is in bytes from the start of memory.
+ * @param length The size in bytes of the data value.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+ * the name is not recognized or the buffer is too small for the input.
+ */
+inline int ANeuralNetworksExecution_setInputFromMemory(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_setInputFromMemory);
+  EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length);
+}
+
+/**
+ * Associate a user buffer with an output of the model of the
+ * {@link ANeuralNetworksExecution}.
+ *
+ * <p>The provided buffer must outlive the execution.</p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be modified.
+ * @param index The index of the output argument we are setting. It is
+ *              an index into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link
+ * ANeuralNetworksModel_addOperand}.
+ * @param type The type of the operand. This can be used to specify the
+ *             dimensions that were set to 0 when the operand was added to the
+ *             model. All other values must be the same as specified in the
+ *             model. If the type is the same as specified when the model
+ *             was built, NULL can be passed.
+ * @param buffer The buffer where the data is to be written.
+ * @param length The length in bytes of the buffer.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+ * the name is not recognized or the buffer is too small for the output.
+ */
+inline int ANeuralNetworksExecution_setOutput(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, void* buffer, size_t length) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_setOutput);
+  EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length);
+}
+
+/**
+ * Associate part of a memory object with an output of the model of the
+ * {@link ANeuralNetworksExecution}.
+ *
+ * <p>The provided memory must outlive the execution.</p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be modified.
+ * @param index The index of the output argument we are setting. It is
+ *              an index into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link
+ * ANeuralNetworksModel_addOperand}.
+ * @param type The type of the operand. This can be used to specify the
+ *             dimensions that were set to 0 when the operand was added to the
+ *             model. All other values must be the same as specified in the
+ *             model. If the type is the same as specified when the model
+ *             was built, NULL can be passed.
+ * @param memory The memory where the data is to be stored.
+ * @param offset This specifies the location of the data whithin the memory.
+ *               The offset is in bytes from the start of memory.
+ * @param length The length in bytes of the data value.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+ * the name is not recognized or the buffer is too small for the output.
+ */
+inline int ANeuralNetworksExecution_setOutputFromMemory(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_setOutputFromMemory);
+  EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length);
+}
+
+/**
+ * Schedule evaluation of the execution.
+ *
+ * <p>Schedules evaluation of the execution. Once the model has been
+ * applied and the outputs are ready to be consumed, the execution will be
+ * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that signal.
+ * </p>
+ *
+ * Multiple executions can be scheduled and evaluated concurrently, and
+ * compilations can be performed concurrently with executions. The runtime makes
+ * no guarantee on the ordering of the completion of compilations and
+ * executions. If it's important to the application, the application should
+ * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and
+ * {@link ANeuralNetworksExecution_wait}.
+ *
+ * ANeuralNetworksExecution_wait must be called to recuperate the resources used
+ * by the execution.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @param execution The execution to be scheduled and executed.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksExecution_startCompute(
+    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_startCompute);
+  EXECUTE_FUNCTION_RETURN(execution, event);
+}
+
+/**
+ * Waits until the execution completes.
+ *
+ * More than one thread can wait on an event. When the execution completes,
+ * all threads will be released.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+ */
+inline int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) {
+  LOAD_FUNCTION(ANeuralNetworksEvent_wait);
+  EXECUTE_FUNCTION_RETURN(event);
+}
+
+/**
+ * Destroys the event.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ */
+inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
+  LOAD_FUNCTION(ANeuralNetworksEvent_free);
+  EXECUTE_FUNCTION(event);
+}
+
+/**/
+
+#endif  // NN_API_SHIM_H0
diff --git a/tensorflow/contrib/lite/nnapi/README.md b/tensorflow/contrib/lite/nnapi/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..913467d17687b291c850c5edbc01c11576d5d790
--- /dev/null
+++ b/tensorflow/contrib/lite/nnapi/README.md
@@ -0,0 +1,15 @@
+# Android Neural Network API
+
+The Android Neural Networks API (NNAPI) is an Android C API designed for running
+computationally intensive operators for machine learning on mobile devices.
+Tensorflow Lite is designed to use the NNAPI to perform hardware-accelerated
+inference operators on supported devices.
+Based on the app’s requirements and the hardware capabilities on a device, the
+NNAPI can distribute the computation workload across available on-device
+processors, including dedicated neural network hardware, graphics processing
+units (GPUs), and digital signal processors (DSPs).
+For devices that lack a specialized vendor driver, the NNAPI runtime relies on
+optimized code to execute requests on the CPU. For more information about the
+NNAPI, please refer to the [NNAPI documentation](https://developer.android.com/ndk/guides/neuralnetworks/index.html)
+
+
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05853e853c4378134f4240bf99ec25b9b4e39ce2
--- /dev/null
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -0,0 +1,394 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/nnapi_delegate.h"
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
+
+namespace tflite {
+
+// TODO(aselle): FATAL leaves resources hanging.
+void FATAL(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+  fflush(stderr);
+  exit(1);
+}
+
+// TODO(aselle): Change the error model to use status codes.
+#define CHECK_TFLITE_SUCCESS(x)                       \
+  if (x != kTfLiteOk) {                               \
+    FATAL("Aborting since tflite returned failure."); \
+  }
+
+#define CHECK_NN(x)                                   \
+  if (x != ANEURALNETWORKS_NO_ERROR) {                \
+    FATAL("Aborting since tflite returned failure."); \
+  }
+
+NNAPIAllocation::NNAPIAllocation(const char* filename,
+                                 ErrorReporter* error_reporter)
+    : MMAPAllocation(filename, error_reporter) {
+  if (mmapped_buffer_ != MAP_FAILED)
+    CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
+                                                mmap_fd_, 0, &handle_));
+}
+
+NNAPIAllocation::~NNAPIAllocation() {
+  if (handle_) {
+    ANeuralNetworksMemory_free(handle_);
+  }
+}
+
+NNAPIDelegate::~NNAPIDelegate() {
+  if (nn_model_) {
+    ANeuralNetworksModel_free(nn_model_);
+    nn_model_ = nullptr;
+    // TODO(aselle): Is this thread-safe and callable multiple times?
+  }
+  // ANeuralNetworksShutdown();
+}
+
+// Adds the tensors of the interpreter to the NN API model.
+// Returns the number of operands added.
+uint32_t addTensorOperands(tflite::Interpreter* interpreter,
+                           ANeuralNetworksModel* nn_model) {
+  uint32_t next_id = 0;
+  for (size_t i = 0; i < interpreter->tensors_size(); i++) {
+    int32_t nn_type = 0;
+    float scale = 1.0f;
+    int32_t zeroPoint = 0;
+    TfLiteTensor* tensor = interpreter->tensor(i);
+    switch (tensor->type) {
+      case kTfLiteNoType:
+        // Tensors added during initialization of Ops don't have a type yet and
+        // should not be registered with the NNAPI.
+        continue;
+      case kTfLiteFloat32:
+        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+        break;
+      case kTfLiteUInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      case kTfLiteInt32:
+        nn_type = ANEURALNETWORKS_TENSOR_INT32;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      default:
+        FATAL("Unsupported type.");
+    }
+    // TODO(aselle): Note, many of these are intermediate results. Do I need
+    // to ever specify these sizes. I am currently below doing setValue
+    // on all of them, but I shouldn't in the future.
+    // Answer(jeanluc): If all the operators can set the dimension correctly,
+    // you won't need to.
+    ANeuralNetworksOperandType operand_type{
+        nn_type, static_cast<uint32_t>(tensor->dims->size),
+        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+    CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+
+    // TODO(aselle): Based on Michael's suggestion, limiting this to read
+    // only memory
+    if (tensor->allocation_type == kTfLiteMmapRo) {
+      if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
+              static_cast<const Allocation*>(tensor->allocation))) {
+        CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory(
+            nn_model, i, alloc->memory(), alloc->offset(tensor->data.raw),
+            tensor->bytes));
+      } else {
+        CHECK_NN(ANeuralNetworksModel_setOperandValue(
+            nn_model, i, tensor->data.raw, tensor->bytes));
+      }
+    }
+    ++next_id;
+  }
+  return next_id;
+}
+
+// Adds the operations and their parameters to the NN API model.
+// 'next-id' is the operand ID of the next operand of the model.
+void AddOpsAndParams(tflite::Interpreter* interpreter,
+                     ANeuralNetworksModel* nn_model, uint32_t next_id) {
+  for (size_t i = 0; i < interpreter->nodes_size(); i++) {
+    const auto* node_and_registration = interpreter->node_and_registration(i);
+    const TfLiteNode& node = node_and_registration->first;
+    const TfLiteRegistration& registration = node_and_registration->second;
+    tflite::BuiltinOperator builtin =
+        static_cast<tflite::BuiltinOperator>(registration.builtin_code);
+
+    // Add the parameters.
+    std::vector<uint32_t> augmented_inputs(
+        node.inputs->data, node.inputs->data + node.inputs->size);
+
+    auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+                             &next_id](int value) {
+      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+                                                    sizeof(int32_t)))
+      augmented_inputs.push_back(next_id++);
+    };
+
+    auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+                               &next_id](float value) {
+      ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
+                                                    sizeof(float)))
+      augmented_inputs.push_back(next_id++);
+    };
+
+    auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
+
+    auto add_pooling_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
+      add_scalar_int32(builtin->padding);
+      add_scalar_int32(builtin->stride_width);
+      add_scalar_int32(builtin->stride_height);
+      add_scalar_int32(builtin->filter_width);
+      add_scalar_int32(builtin->filter_height);
+      add_scalar_int32(builtin->activation);
+    };
+
+    auto add_convolution_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteConvParams*>(data);
+      add_scalar_int32(builtin->padding);
+      add_scalar_int32(builtin->stride_width);
+      add_scalar_int32(builtin->stride_height);
+      add_scalar_int32(builtin->activation);
+    };
+
+    auto add_depthwise_conv_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data);
+      add_scalar_int32(builtin->padding);
+      add_scalar_int32(builtin->stride_width);
+      add_scalar_int32(builtin->stride_height);
+      add_scalar_int32(builtin->depth_multiplier);
+      add_scalar_int32(builtin->activation);
+    };
+
+    auto add_fully_connected_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data);
+      add_scalar_int32(builtin->activation);
+    };
+
+    auto add_concatenation_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data);
+      add_scalar_int32(builtin->axis);
+      if (builtin->activation != kTfLiteActNone) {
+        FATAL("Concatenation does not support fused activation in NNAPI");
+      }
+    };
+
+    auto add_softmax_params = [&add_scalar_float32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data);
+      add_scalar_float32(builtin->beta);
+    };
+
+    auto add_space_to_depth_params = [&add_scalar_int32](void* data) {
+      auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data);
+      add_scalar_int32(builtin->block_size);
+    };
+
+#if 0
+    auto add_reshape_params = [&](void* data) {
+      auto builtin = reinterpret_cast<TfLiteReshapeParams*>(data);
+      uint32_t tensor_size_shape = builtin->num_dimensions;
+      ANeuralNetworksOperandType operand_type{
+          ANEURALNETWORKS_TENSOR_INT32,
+          {static_cast<uint32_t>(1),
+           reinterpret_cast<uint32_t*>(&tensor_size_shape)},
+          0,
+          0};
+      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, builtin->shape,
+          sizeof(int) * builtin->num_dimensions));
+      augmented_inputs.push_back(next_id++);
+    };
+#endif
+
+    ANeuralNetworksOperationType nn_op_type;
+    switch (builtin) {
+      case tflite::BuiltinOperator_ADD:
+        nn_op_type = ANEURALNETWORKS_ADD;
+        add_add_params();
+        break;
+      case tflite::BuiltinOperator_AVERAGE_POOL_2D:
+        add_pooling_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D;
+        break;
+      case tflite::BuiltinOperator_MAX_POOL_2D:
+        add_pooling_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_MAX_POOL_2D;
+        break;
+      case tflite::BuiltinOperator_L2_POOL_2D:
+        add_pooling_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_L2_POOL_2D;
+        break;
+      case tflite::BuiltinOperator_CONV_2D:
+        add_convolution_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_CONV_2D;
+        break;
+      case tflite::BuiltinOperator_RELU:
+        nn_op_type = ANEURALNETWORKS_RELU;
+        break;
+      case tflite::BuiltinOperator_RELU6:
+        nn_op_type = ANEURALNETWORKS_RELU6;
+        break;
+      case tflite::BuiltinOperator_TANH:
+        nn_op_type = ANEURALNETWORKS_TANH;
+        break;
+      case tflite::BuiltinOperator_LOGISTIC:
+        nn_op_type = ANEURALNETWORKS_LOGISTIC;
+        break;
+      case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
+        add_depthwise_conv_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D;
+        break;
+      case tflite::BuiltinOperator_CONCATENATION:
+        add_concatenation_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_CONCATENATION;
+        break;
+      case tflite::BuiltinOperator_SOFTMAX:
+        add_softmax_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SOFTMAX;
+        break;
+      case tflite::BuiltinOperator_FULLY_CONNECTED:
+        add_fully_connected_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED;
+        break;
+      case tflite::BuiltinOperator_RESHAPE:
+        nn_op_type = ANEURALNETWORKS_RESHAPE;
+        // add_reshape_params(node.builtin_data);
+        break;
+      case tflite::BuiltinOperator_SPACE_TO_DEPTH:
+        add_space_to_depth_params(node.builtin_data);
+        nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH;
+        break;
+      case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
+      case tflite::BuiltinOperator_LSH_PROJECTION:
+      case tflite::BuiltinOperator_SVDF:
+      case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
+      case tflite::BuiltinOperator_RNN:
+      case tflite::BuiltinOperator_EMBEDDING_LOOKUP:
+      case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
+      case tflite::BuiltinOperator_LSTM:
+      case tflite::BuiltinOperator_L2_NORMALIZATION:
+      case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
+      case tflite::BuiltinOperator_MUL:
+      case tflite::BuiltinOperator_RESIZE_BILINEAR:
+      case tflite::BuiltinOperator_CALL:
+      case tflite::BuiltinOperator_SKIP_GRAM:
+      case tflite::BuiltinOperator_RELU1:
+        FATAL("Op code %d is currently not delegated to NNAPI", builtin);
+        nn_op_type = -1;  // set to invalid
+        break;
+      case tflite::BuiltinOperator_CUSTOM:
+        FATAL("Custom operations are not supported when using NNAPI.");
+        nn_op_type = -1;  // set to invalid
+        break;
+    }
+
+    // Add the operation.
+    CHECK_NN(ANeuralNetworksModel_addOperation(
+        nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
+        augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size),
+        reinterpret_cast<uint32_t*>(node.outputs->data)));
+  }
+}
+
+TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
+  // TODO(aselle): This is not correct. need to handle resize invalidation.
+  if (nn_model_ && nn_compiled_model_) return kTfLiteOk;
+
+  if (!nn_model_) {
+    CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+
+    uint32_t next_id = addTensorOperands(interpreter, nn_model_);
+    AddOpsAndParams(interpreter, nn_model_, next_id);
+    CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+        nn_model_, static_cast<uint32_t>(interpreter->inputs().size()),
+        reinterpret_cast<const uint32_t*>(interpreter->inputs().data()),
+        static_cast<uint32_t>(interpreter->outputs().size()),
+        reinterpret_cast<const uint32_t*>(interpreter->outputs().data())));
+    CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+  }
+  if (!nn_compiled_model_) {
+    CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
+    CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
+  if (!nn_model_) {
+    TF_LITE_ENSURE_STATUS(BuildGraph(interpreter));
+  }
+
+  ANeuralNetworksExecution* execution = nullptr;
+  CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+
+  // Currently perform deep copy of input buffer
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input = interpreter->inputs()[i];
+    // TODO(aselle): Is this what we want or do we want input instead?
+    // TODO(aselle): This should be called setInputValue maybe to be cons.
+    TfLiteTensor* tensor = interpreter->tensor(input);
+    CHECK_NN(ANeuralNetworksExecution_setInput(
+        execution, i, nullptr, tensor->data.raw, tensor->bytes));
+  }
+  // Tell nn api where to place final data.
+  for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+    int output = interpreter->outputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(output);
+    CHECK_NN(ANeuralNetworksExecution_setOutput(
+        execution, i, nullptr, tensor->data.raw, tensor->bytes));
+  }
+  // Currently use blocking compute.
+  ANeuralNetworksEvent* event = nullptr;
+  CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
+  CHECK_NN(ANeuralNetworksEvent_wait(event));
+  ANeuralNetworksEvent_free(event);
+  ANeuralNetworksExecution_free(execution);
+
+#if 0
+  printf("From the NN API:\n");
+  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  if (float* data =
+          interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+    size_t num = tensor->bytes / sizeof(float);
+    for (float* p = data; p < data + num; p++) {
+      printf(" %f", *p);
+    }
+    printf("\n");
+  }
+#endif
+
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/nnapi_delegate.h b/tensorflow/contrib/lite/nnapi_delegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..f29aa9e18e605ef0b5d246b2a672639c64391646
--- /dev/null
+++ b/tensorflow/contrib/lite/nnapi_delegate.h
@@ -0,0 +1,66 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
+
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
+
+class ANeuralNetworsModel;
+
+namespace tflite {
+
+class NNAPIAllocation : public MMAPAllocation {
+ public:
+  NNAPIAllocation(const char* filename, ErrorReporter* error_reporter);
+  ~NNAPIAllocation();
+
+  size_t offset(const void* ptr) const {
+    auto signed_offset = reinterpret_cast<const uint8_t*>(ptr) -
+                         reinterpret_cast<const uint8_t*>(mmapped_buffer_);
+
+    return static_cast<size_t>(signed_offset);
+  }
+
+  ANeuralNetworksMemory* memory() const { return handle_; }
+  bool valid() const override { return handle_ != nullptr; }
+
+ private:
+  mutable ANeuralNetworksMemory* handle_ = nullptr;
+};
+
+class NNAPIDelegate {
+ public:
+  ~NNAPIDelegate();
+
+  // Convert a tflite graph to NNAPI
+  TfLiteStatus BuildGraph(Interpreter* interpreter);
+
+  // Run
+  TfLiteStatus Invoke(Interpreter* interpreter);
+
+ private:
+  // The NN API model handle
+  ANeuralNetworksModel* nn_model_ = nullptr;
+  // The NN API compilation handle
+  ANeuralNetworksCompilation* nn_compiled_model_ = nullptr;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/optional_debug_tools.cc b/tensorflow/contrib/lite/optional_debug_tools.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f762e6688d0cc2a91417b9d82201446e3060a6f
--- /dev/null
+++ b/tensorflow/contrib/lite/optional_debug_tools.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/optional_debug_tools.h"
+
+namespace tflite {
+
+void PrintIntVector(const std::vector<int>& v) {
+  for (const auto& it : v) {
+    printf(" %d", it);
+  }
+  printf("\n");
+}
+
+void PrintTfLiteIntVector(const TfLiteIntArray* v) {
+  if (!v) {
+    printf(" (null)");
+    return;
+  }
+  for (int k = 0; k < v->size; k++) {
+    printf(" %d", v->data[k]);
+  }
+  printf("\n");
+}
+
+const char* TensorTypeName(TfLiteType type) {
+  switch (type) {
+    case kTfLiteNoType:
+      return "kTfLiteNoType";
+    case kTfLiteFloat32:
+      return "kTfLiteFloat32";
+    case kTfLiteInt32:
+      return "kTfLiteInt32";
+    case kTfLiteUInt8:
+      return "kTfLiteUInt8";
+    case kTfLiteInt64:
+      return "kTfLiteInt64";
+    case kTfLiteString:
+      return "kTfLiteString";
+  }
+  return "(invalid)";
+}
+
+const char* AllocTypeName(TfLiteAllocationType type) {
+  switch (type) {
+    case kTfLiteMemNone:
+      return "kTfLiteMemNone";
+    case kTfLiteMmapRo:
+      return "kTfLiteMmapRo";
+    case kTfLiteDynamic:
+      return "kTfLiteDynamic";
+    case kTfLiteArenaRw:
+      return "kTfLiteArenaRw";
+    case kTfLiteArenaRwPersistent:
+      return "kTfLiteArenaRwPersistent";
+  }
+  return "(invalid)";
+}
+
+// Prints a dump of what tensors and what nodes are in the interpreter.
+void PrintInterpreterState(Interpreter* interpreter) {
+  printf("Interpreter has %d tensors and %d nodes\n",
+         interpreter->tensors_size(), interpreter->nodes_size());
+  printf("Inputs:");
+  PrintIntVector(interpreter->inputs());
+  printf("Outputs:");
+  PrintIntVector(interpreter->outputs());
+  printf("\n");
+  for (int tensor_index = 0; tensor_index < interpreter->tensors_size();
+       tensor_index++) {
+    TfLiteTensor* tensor = interpreter->tensor(tensor_index);
+    printf("Tensor %3d %10s %15s %10zu bytes (%4.1f MB) ", tensor_index,
+           TensorTypeName(tensor->type), AllocTypeName(tensor->allocation_type),
+           tensor->bytes, float(tensor->bytes) / float(1 << 20));
+    PrintTfLiteIntVector(tensor->dims);
+    printf("\n");
+  }
+
+  for (int node_index = 0; node_index < interpreter->nodes_size();
+       node_index++) {
+    const std::pair<TfLiteNode, TfLiteRegistration>* node_and_reg =
+        interpreter->node_and_registration(node_index);
+    const TfLiteNode& node = node_and_reg->first;
+    const TfLiteRegistration& reg = node_and_reg->second;
+    printf("Node %3d Operator Builtin Code %3d\n", node_index,
+           reg.builtin_code);
+    printf("  Inputs:");
+    PrintTfLiteIntVector(node.inputs);
+    printf("  Outputs:");
+    PrintTfLiteIntVector(node.outputs);
+  }
+}
+
+// Prints a dump of what tensors and what nodes are in the interpreter.
+TfLiteStatus ValidateInterpreterState(const Interpreter* interpreter);
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/optional_debug_tools.h b/tensorflow/contrib/lite/optional_debug_tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..54d48760951c946d0493a86961348df25e53bd1f
--- /dev/null
+++ b/tensorflow/contrib/lite/optional_debug_tools.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Optional debugging functionality. For small sized binaries, these are not
+// needed.
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
+
+#include "tensorflow/contrib/lite/interpreter.h"
+
+namespace tflite {
+
+// Prints a dump of what tensors and what nodes are in the interpreter.
+void PrintInterpreterState(Interpreter* interpreter);
+
+// Prints a dump of what tensors and what nodes are in the interpreter.
+TfLiteStatus ValidateInterpreterState(const Interpreter* interpreter);
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_DEBUG_TOOLS_H_
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3d6a3ec0fd4c673f601254b19452bbf8b9454e27
--- /dev/null
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -0,0 +1,48 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "lite",
+    srcs = ["lite.py"],
+    # data = [
+    #     "//tensorflow/contrib/lite/toco/python:toco_from_protos",
+    # ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:model_flags_proto_py",
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
+        "//tensorflow/contrib/lite/toco/python:tensorflow_wrap_toco",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_test(
+    name = "lite_test",
+    srcs = ["lite_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_oss"],
+    deps = [
+        ":lite",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
new file mode 100644
index 0000000000000000000000000000000000000000..95309478a6f9791e3510736c45f9c5cfab88703b
--- /dev/null
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -0,0 +1,213 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Lite tooling helper functionality.
+
+EXPERIMENTAL: APIs here are unstable and likely to change without notice.
+
+@@toco_convert
+@@toco_convert_protos
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+import tempfile
+
+from tensorflow.contrib.lite.toco import model_flags_pb2 as _model_flags_pb2
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
+from tensorflow.contrib.lite.toco.python.tensorflow_wrap_toco import TocoConvert as _toco_convert_protos
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.platform import resource_loader as _resource_loader
+from tensorflow.python.util.all_util import remove_undocumented
+
+# Enum types from the protobuf promoted to the API
+FLOAT = _types_pb2.FLOAT
+INT32 = _types_pb2.INT32
+INT64 = _types_pb2.INT64
+STRING = _types_pb2.STRING
+QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
+TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
+TFLITE = _toco_flags_pb2.TFLITE
+GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
+
+# Currently the default mode of operation is to shell to another python process
+# to protect against crashes. However, it breaks some dependent targets because
+# it forces us to depend on an external py_binary. The experimental API doesn't
+# have that drawback.
+EXPERIMENTAL_USE_TOCO_API_DIRECTLY = False
+
+# Find the toco_from_protos binary using the resource loader if using from
+# bazel, otherwise we are in a pip where console_scripts already has
+# the toco_from_protos tool.
+if EXPERIMENTAL_USE_TOCO_API_DIRECTLY:
+  _toco_from_proto_bin = ""
+else:
+  _toco_from_proto_bin = _resource_loader.get_path_to_datafile(
+      "../toco/python/toco_from_protos")
+
+if _toco_from_proto_bin and not os.path.exists(_toco_from_proto_bin):
+  _toco_from_proto_bin = "toco_from_protos"
+
+
+def toco_convert_protos(model_flags_str, toco_flags_str, input_data_str):
+  """Convert `input_data_str` according to model and toco parameters.
+
+  Unless you know what you are doing consider using
+  the more friendly @{tf.contrib.lite.toco_convert}}.
+
+  Args:
+    model_flags_str: Serialized proto describing model properties, see
+      `toco/model_flags.proto`.
+    toco_flags_str: Serialized proto describing conversion properties, see
+      `toco/toco_flags.proto`.
+    input_data_str: Input data in serialized form (e.g. a graphdef is common)
+  Returns:
+    Converted model in serialized form (e.g. a TFLITE model is common).
+  Raises:
+    RuntimeError: When conversion fails, an exception is raised with the error
+      message embedded.
+  """
+  # TODO(aselle): When toco does not use fatal errors for failure, we can
+  # switch this on.
+  if not _toco_from_proto_bin:
+    return _toco_convert_protos(model_flags_str, toco_flags_str, input_data_str)
+
+  with tempfile.NamedTemporaryFile() as fp_toco, \
+           tempfile.NamedTemporaryFile() as fp_model, \
+           tempfile.NamedTemporaryFile() as fp_input, \
+           tempfile.NamedTemporaryFile() as fp_output:
+    fp_model.write(model_flags_str)
+    fp_toco.write(toco_flags_str)
+    fp_input.write(input_data_str)
+    fp_model.flush()
+    fp_toco.flush()
+    fp_input.flush()
+
+    cmd = [
+        _toco_from_proto_bin, fp_model.name, fp_toco.name, fp_input.name,
+        fp_output.name
+    ]
+    cmdline = " ".join(cmd)
+    proc = subprocess.Popen(
+        cmdline,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        close_fds=True)
+    stdout, stderr = proc.communicate()
+    exitcode = proc.returncode
+    if exitcode == 0:
+      stuff = fp_output.read()
+      return stuff
+    else:
+      raise RuntimeError("TOCO failed see console for info.\n%s\n%s\n" %
+                         (stdout, stderr))
+
+
+def _tensor_name(x):
+  return x.name.split(":")[0]
+
+
+def toco_convert(input_data,
+                 input_tensors,
+                 output_tensors,
+                 inference_type=FLOAT,
+                 input_format=TENSORFLOW_GRAPHDEF,
+                 output_format=TFLITE,
+                 quantized_input_stats=None,
+                 drop_control_dependency=True):
+  """Convert a model using TOCO from `input_format` to `output_format`.
+
+  Typically this is to convert from TensorFlow GraphDef to TFLite, in which
+  case the default `input_format` and `output_format` are sufficient.
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`).
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
+    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
+    output_format: Type of data to write (currently must be TFLITE or
+      GRAPHVIZ_DOT)
+    quantized_input_stats: For each member of input_tensors the mean and
+      std deviation of training data. Only needed if `inference_type` is
+      `QUANTIZED_UINT8`.
+    drop_control_dependency: Drops control dependencies silently. This is due
+      to tf lite not supporting control dependencies.
+
+  Returns:
+    The converted data. For example if tflite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    ValueError: If the input tensor type is unknown
+    RuntimeError: If TOCO fails to convert (in which case the runtime error's
+      error text will contain the TOCO error log)
+  """
+  toco = _toco_flags_pb2.TocoFlags()
+  toco.input_format = input_format
+  toco.output_format = output_format
+  toco.drop_control_dependency = drop_control_dependency
+  model = _model_flags_pb2.ModelFlags()
+  toco.inference_type = inference_type
+  for idx, input_tensor in enumerate(input_tensors):
+    if input_tensor.dtype == _dtypes.float32:
+      tflite_input_type = FLOAT
+    elif input_tensor.dtype == _dtypes.int32:
+      tflite_input_type = INT32
+    elif input_tensor.dtype == _dtypes.int64:
+      tflite_input_type = INT64
+    # TODO(aselle): Insert strings when they are available
+    else:
+      raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
+                                                         input_tensor.dtype))
+
+    input_array = model.input_arrays.add()
+
+    if inference_type == QUANTIZED_UINT8:
+      if tflite_input_type == FLOAT:
+        tflite_input_type = QUANTIZED_UINT8
+      input_array.mean, input_array.std = quantized_input_stats[idx]
+
+    input_array.name = _tensor_name(input_tensor)
+    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+    toco.inference_input_type = tflite_input_type
+
+  for output_tensor in output_tensors:
+    model.output_arrays.append(_tensor_name(output_tensor))
+
+  data = toco_convert_protos(model.SerializeToString(),
+                             toco.SerializeToString(),
+                             input_data.SerializeToString())
+  return data
+
+
+_allowed_symbols = [
+    "FLOAT",
+    "INT32",
+    "INT64",
+    "STRING",
+    "QUANTIZED_UINT8",
+    "TENSORFLOW_GRAPHDEF",
+    "TFLITE",
+    "GRAPHVIZ_DOT",
+    "EXPERIMENTAL_USE_TOCO_API_DIRECTLY",
+]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..da360aeb344ab9c4eb183d84e9b5f60ba715c6e8
--- /dev/null
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -0,0 +1,45 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Lite Python Interface: Sanity check."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.lite.python import lite
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class LiteTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3],
+                                      dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+    # Try running on valid graph
+    result = lite.toco_convert(sess.graph_def, [in_tensor], [out_tensor])
+    self.assertTrue(result)
+    # TODO(aselle): remove tests that fail.
+    # Try running on identity graph (known fail)
+    # with self.assertRaisesRegexp(RuntimeError, "!model->operators.empty()"):
+    #   result = lite.toco_convert(sess.graph_def, [in_tensor], [in_tensor])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..54167ddd9a5a003d0ff21e6627a1dbe94afa3e87
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -0,0 +1,82 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_binary(
+    name = "upgrade_schema",
+    srcs = [
+        "upgrade_schema.py",
+    ],
+    data = [
+        "schema_v0.fbs",
+        "schema_v1.fbs",
+        "schema_v2.fbs",
+        "schema_v3.fbs",
+        "@flatbuffers//:flatc",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_test(
+    name = "upgrade_schema_test",
+    size = "small",
+    srcs = ["upgrade_schema_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":upgrade_schema",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+exports_files([
+    "schema_v0.fbs",
+    "schema_v1.fbs",
+    "schema_v2.fbs",
+    "schema_v3.fbs",
+])
+
+load("//third_party/flatbuffers:build_defs.bzl", "flatbuffer_cc_library")
+
+# Generic schema for inference on device.
+flatbuffer_cc_library(
+    name = "schema_fbs",
+    srcs = ["schema.fbs"],
+)
+
+# Schema test to make sure we don't introduce backward incompatible changes
+# to schemas.
+cc_test(
+    name = "flatbuffer_compatibility_test",
+    size = "small",
+    srcs = ["flatbuffer_compatibility_test.cc"],
+    data = [
+        "schema.fbs",
+        "schema_v3.fbs",
+    ],
+    deps = [
+        "//tensorflow/core:lib_platform",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers//:flatc_library",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc b/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd46a06f7d173d87d04c2ff0910190ecd40a1954
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/flatbuffer_compatibility_test.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <fstream>
+#include <gtest/gtest.h>
+#include "flatbuffers/flatc.h"
+#include "tensorflow/core/platform/platform.h"
+
+#ifdef PLATFORM_GOOGLE
+#define TFLITE_TF_PREFIX "third_party/tensorflow/"
+#else
+#define TFLITE_TF_PREFIX "tensorflow/"
+#endif
+/// Load filename `name`
+bool LoadFileRaw(const char *name, std::string *buf) {
+  std::ifstream fp(name, std::ios::binary);
+  if (!fp) {
+    fprintf(stderr, "Failed to read '%s'\n", name);
+    return false;
+  }
+  std::string s((std::istreambuf_iterator<char>(fp)),
+                std::istreambuf_iterator<char>());
+  if (s.empty()) {
+    fprintf(stderr, "Read '%s' resulted in empty\n", name);
+    return false;
+  }
+  *buf = s;
+  return true;
+}
+
+bool ParseFile(flatbuffers::Parser *parser, const std::string &filename,
+               const std::string &contents) {
+  std::vector<const char *> include_directories;
+  auto local_include_directory = flatbuffers::StripFileName(filename);
+  include_directories.push_back(local_include_directory.c_str());
+  include_directories.push_back(nullptr);
+  if (!parser->Parse(contents.c_str(), include_directories.data(),
+                     filename.c_str())) {
+    fprintf(stderr, "Failed to parse flatbuffer schema '%s'\n",
+            contents.c_str());
+    return false;
+  }
+  return true;
+}
+
+// Checks to make sure current schema in current code does not cause an
+// incompatibility.
+TEST(SchemaTest, TestCompatibility) {
+  // Read file contents of schemas into strings
+  // TODO(aselle): Need a reliable way to load files.
+  std::string base_contents, current_contents;
+  const char *base_filename =
+      TFLITE_TF_PREFIX "contrib/lite/schema/schema_v3.fbs";
+  const char *current_filename =
+      TFLITE_TF_PREFIX "contrib/lite/schema/schema.fbs";
+
+  ASSERT_TRUE(LoadFileRaw(base_filename, &base_contents));
+  ASSERT_TRUE(LoadFileRaw(current_filename, &current_contents));
+  // Parse the schemas
+  flatbuffers::Parser base_parser, current_parser;
+  std::vector<const char *> include_directories;
+  ASSERT_TRUE(ParseFile(&base_parser, base_filename, base_contents));
+  ASSERT_TRUE(ParseFile(&current_parser, current_filename, current_contents));
+  // Check that the schemas conform and fail if they don't
+  auto err = current_parser.ConformTo(base_parser);
+  if (!err.empty()) {
+    fprintf(stderr,
+            "Schemas don't conform:\n%s\n"
+            "In other words some change you made means that new parsers can't"
+            "parse old files.\n",
+            err.c_str());
+    FAIL();
+  }
+}
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
new file mode 100644
index 0000000000000000000000000000000000000000..ddb2ab792c520eb245445532f534ebce8a9f1280
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -0,0 +1,346 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Revision History
+// Version 0: Initial version.
+// Version 1: Add subgraphs to schema.
+// Version 2: Rename operators to conform to NN API.
+// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers.
+
+namespace tflite;
+
+// This corresponds to the version.
+file_identifier "TFL3";
+// File extension of any written files.
+file_extension "tflite";
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+  FLOAT32 = 0,
+  FLOAT16 = 1,
+  INT32 = 2,
+  UINT8 = 3,
+  INT64 = 4,
+  STRING = 5,
+}
+
+// Parameters for converting a quantized tensor back to float. Given a
+// quantized value q, the corresponding float value f should be:
+//   f = scale * (q - zero_point)
+table QuantizationParameters {
+  min:[float];  // For importing back into tensorflow.
+  max:[float];  // For importing back into tensorflow.
+  scale:[float];
+  zero_point:[long];
+}
+
+table Tensor {
+  // The tensor shape. The meaning of each entry is operator-specific but
+  // builtin ops use: [batch size, number of channels, height, width] (That's
+  // Tensorflow's NCHW).
+  shape:[int];
+  type:TensorType;
+  // An index that refers to the buffers table at the root of the model. Or,
+  // if there is no data buffer associated (i.e. intermediate results), then
+  // this is 0 (which refers to an always existant empty buffer).
+  //
+  // The data_buffer itself is an opaque container, with the assumption that the
+  // target device is little-endian. In addition, all builtin operators assume
+  // the memory is ordered such that if `shape` is [4, 3, 2], then index
+  // [i, j, k] maps to data_buffer[i*3*2 + j*3 + k].
+  buffer:uint;
+  name:string;  // For debugging and importing back into tensorflow.
+  quantization:QuantizationParameters;  // Optional.
+}
+
+// A list of builtin operators. Builtin operators a slighlty faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+enum BuiltinOperator : byte {
+  ADD = 0,
+  AVERAGE_POOL_2D = 1,
+  CONCATENATION = 2,
+  CONV_2D = 3,
+  DEPTHWISE_CONV_2D = 4,
+  // DEPTH_TO_SPACE = 5,
+  // DEQUANTIZE = 6,
+  EMBEDDING_LOOKUP = 7,
+  // FLOOR = 8,
+  FULLY_CONNECTED = 9,
+  HASHTABLE_LOOKUP = 10,
+  L2_NORMALIZATION = 11,
+  L2_POOL_2D = 12,
+  LOCAL_RESPONSE_NORMALIZATION = 13,
+  LOGISTIC = 14,
+  LSH_PROJECTION = 15,
+  LSTM = 16,
+  MAX_POOL_2D = 17,
+  MUL = 18,
+  RELU = 19,
+  RELU1 = 20,
+  RELU6 = 21,
+  RESHAPE = 22,
+  RESIZE_BILINEAR = 23,
+  RNN = 24,
+  SOFTMAX = 25,
+  SPACE_TO_DEPTH = 26,
+  SVDF = 27,
+  TANH = 28,
+  // TODO(aselle): Consider rename to CONCATENATE_EMBEDDINGS
+  CONCAT_EMBEDDINGS = 29,
+  SKIP_GRAM = 30,
+  CALL = 31,
+  CUSTOM = 32,
+  EMBEDDING_LOOKUP_SPARSE = 33,
+}
+
+// Options for the builtin operators.
+union BuiltinOptions {
+  Conv2DOptions,
+  DepthwiseConv2DOptions,
+  ConcatEmbeddingsOptions,
+  LSHProjectionOptions,
+  Pool2DOptions,
+  SVDFOptions,
+  RNNOptions,
+  FullyConnectedOptions,
+  SoftmaxOptions,
+  ConcatenationOptions,
+  AddOptions,
+  L2NormOptions,
+  LocalResponseNormalizationOptions,
+  LSTMOptions,
+  ResizeBilinearOptions,
+  CallOptions,
+  ReshapeOptions,
+  SkipGramOptions,
+  SpaceToDepthOptions,
+  EmbeddingLookupSparseOptions,
+  MulOptions,
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+  NONE = 0,
+  RELU = 1,
+  RELU1 = 2,
+  RELU6 = 3,
+  TANH = 4,
+  SIGN_BIT = 5,
+}
+
+table Conv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table Pool2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  filter_width:int;
+  filter_height:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  depth_multiplier:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table ConcatEmbeddingsOptions {
+  num_channels:int;
+  num_columns_per_channel:[int];
+  embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+  UNKNOWN = 0,
+  SPARSE = 1,
+  DENSE = 2,
+}
+
+table LSHProjectionOptions {
+  type: LSHProjectionType;
+}
+
+table SVDFOptions {
+  rank:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow RNNCell.
+table RNNOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table MulOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormalizationOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+}
+
+table ResizeBilinearOptions {
+  new_height:int;
+  new_width:int;
+}
+
+// A call operation options
+table CallOptions {
+  // The subgraph index that needs to be called.
+  subgraph:uint;
+}
+
+table ReshapeOptions {
+  new_shape:[int];
+}
+
+table SkipGramOptions {
+  ngram_size: int;
+  max_skip_size: int;
+  include_all_ngrams: bool;
+}
+
+table SpaceToDepthOptions {
+  block_size: int;
+}
+
+enum CombinerType : byte {
+  SUM = 0,
+  MEAN = 1,
+  SQRTN = 2,
+}
+
+table EmbeddingLookupSparseOptions {
+  combiner:CombinerType;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  builtin_code:BuiltinOperator;
+  custom_code:string;
+}
+
+enum CustomOptionsFormat : byte {
+  FLEXBUFFERS = 0,
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operations is configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicate map lookups.
+  opcode_index:uint;
+
+  // Optional input and output tensors are indicated by -1.
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+  custom_options_format:CustomOptionsFormat;
+}
+
+// The root type, defining a model.
+table SubGraph {
+  // A list of all tensors used in this model.
+  tensors:[Tensor];
+
+  // Indices of the input tensors.
+  inputs:[int];
+
+  // Indices of the output tensors.
+  outputs:[int];
+
+  // All operators, in execution order.
+  operators:[Operator];
+
+  // Name of subgraph (used for debugging).
+  name:string;
+}
+
+// Table of raw data buffers (used for constant tensors). Referenced by tensors
+// by index.
+table Buffer {
+  data:[ubyte];
+}
+
+table Model {
+  // Version of the schema.
+  version:uint;
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All the subgraphs of the model. The 0th is assumed to be the main
+  // model.
+  subgraphs:[SubGraph];
+
+  // A description of the model.
+  description:string;
+
+  // Buffers of the model
+  buffers:[Buffer];
+
+}
+
+root_type Model;
+
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
new file mode 100755
index 0000000000000000000000000000000000000000..cbf10275f3111f167439c8a78307ea696fe78686
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -0,0 +1,5417 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// automatically generated by the FlatBuffers compiler, do not modify
+
+#ifndef FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
+#define FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
+
+#include "flatbuffers/flatbuffers.h"
+
+namespace tflite {
+
+struct QuantizationParameters;
+struct QuantizationParametersT;
+
+struct Tensor;
+struct TensorT;
+
+struct Conv2DOptions;
+struct Conv2DOptionsT;
+
+struct Pool2DOptions;
+struct Pool2DOptionsT;
+
+struct DepthwiseConv2DOptions;
+struct DepthwiseConv2DOptionsT;
+
+struct ConcatEmbeddingsOptions;
+struct ConcatEmbeddingsOptionsT;
+
+struct LSHProjectionOptions;
+struct LSHProjectionOptionsT;
+
+struct SVDFOptions;
+struct SVDFOptionsT;
+
+struct RNNOptions;
+struct RNNOptionsT;
+
+struct FullyConnectedOptions;
+struct FullyConnectedOptionsT;
+
+struct SoftmaxOptions;
+struct SoftmaxOptionsT;
+
+struct ConcatenationOptions;
+struct ConcatenationOptionsT;
+
+struct AddOptions;
+struct AddOptionsT;
+
+struct MulOptions;
+struct MulOptionsT;
+
+struct L2NormOptions;
+struct L2NormOptionsT;
+
+struct LocalResponseNormalizationOptions;
+struct LocalResponseNormalizationOptionsT;
+
+struct LSTMOptions;
+struct LSTMOptionsT;
+
+struct ResizeBilinearOptions;
+struct ResizeBilinearOptionsT;
+
+struct CallOptions;
+struct CallOptionsT;
+
+struct ReshapeOptions;
+struct ReshapeOptionsT;
+
+struct SkipGramOptions;
+struct SkipGramOptionsT;
+
+struct SpaceToDepthOptions;
+struct SpaceToDepthOptionsT;
+
+struct EmbeddingLookupSparseOptions;
+struct EmbeddingLookupSparseOptionsT;
+
+struct OperatorCode;
+struct OperatorCodeT;
+
+struct Operator;
+struct OperatorT;
+
+struct SubGraph;
+struct SubGraphT;
+
+struct Buffer;
+struct BufferT;
+
+struct Model;
+struct ModelT;
+
+enum TensorType {
+  TensorType_FLOAT32 = 0,
+  TensorType_FLOAT16 = 1,
+  TensorType_INT32 = 2,
+  TensorType_UINT8 = 3,
+  TensorType_INT64 = 4,
+  TensorType_STRING = 5,
+  TensorType_MIN = TensorType_FLOAT32,
+  TensorType_MAX = TensorType_STRING
+};
+
+inline TensorType (&EnumValuesTensorType())[6] {
+  static TensorType values[] = {TensorType_FLOAT32, TensorType_FLOAT16,
+                                TensorType_INT32,   TensorType_UINT8,
+                                TensorType_INT64,   TensorType_STRING};
+  return values;
+}
+
+inline const char **EnumNamesTensorType() {
+  static const char *names[] = {"FLOAT32", "FLOAT16", "INT32", "UINT8",
+                                "INT64",   "STRING",  nullptr};
+  return names;
+}
+
+inline const char *EnumNameTensorType(TensorType e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesTensorType()[index];
+}
+
+enum BuiltinOperator {
+  BuiltinOperator_ADD = 0,
+  BuiltinOperator_AVERAGE_POOL_2D = 1,
+  BuiltinOperator_CONCATENATION = 2,
+  BuiltinOperator_CONV_2D = 3,
+  BuiltinOperator_DEPTHWISE_CONV_2D = 4,
+  BuiltinOperator_EMBEDDING_LOOKUP = 7,
+  BuiltinOperator_FULLY_CONNECTED = 9,
+  BuiltinOperator_HASHTABLE_LOOKUP = 10,
+  BuiltinOperator_L2_NORMALIZATION = 11,
+  BuiltinOperator_L2_POOL_2D = 12,
+  BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION = 13,
+  BuiltinOperator_LOGISTIC = 14,
+  BuiltinOperator_LSH_PROJECTION = 15,
+  BuiltinOperator_LSTM = 16,
+  BuiltinOperator_MAX_POOL_2D = 17,
+  BuiltinOperator_MUL = 18,
+  BuiltinOperator_RELU = 19,
+  BuiltinOperator_RELU1 = 20,
+  BuiltinOperator_RELU6 = 21,
+  BuiltinOperator_RESHAPE = 22,
+  BuiltinOperator_RESIZE_BILINEAR = 23,
+  BuiltinOperator_RNN = 24,
+  BuiltinOperator_SOFTMAX = 25,
+  BuiltinOperator_SPACE_TO_DEPTH = 26,
+  BuiltinOperator_SVDF = 27,
+  BuiltinOperator_TANH = 28,
+  BuiltinOperator_CONCAT_EMBEDDINGS = 29,
+  BuiltinOperator_SKIP_GRAM = 30,
+  BuiltinOperator_CALL = 31,
+  BuiltinOperator_CUSTOM = 32,
+  BuiltinOperator_EMBEDDING_LOOKUP_SPARSE = 33,
+  BuiltinOperator_MIN = BuiltinOperator_ADD,
+  BuiltinOperator_MAX = BuiltinOperator_EMBEDDING_LOOKUP_SPARSE
+};
+
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[31] {
+  static BuiltinOperator values[] = {
+      BuiltinOperator_ADD,
+      BuiltinOperator_AVERAGE_POOL_2D,
+      BuiltinOperator_CONCATENATION,
+      BuiltinOperator_CONV_2D,
+      BuiltinOperator_DEPTHWISE_CONV_2D,
+      BuiltinOperator_EMBEDDING_LOOKUP,
+      BuiltinOperator_FULLY_CONNECTED,
+      BuiltinOperator_HASHTABLE_LOOKUP,
+      BuiltinOperator_L2_NORMALIZATION,
+      BuiltinOperator_L2_POOL_2D,
+      BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+      BuiltinOperator_LOGISTIC,
+      BuiltinOperator_LSH_PROJECTION,
+      BuiltinOperator_LSTM,
+      BuiltinOperator_MAX_POOL_2D,
+      BuiltinOperator_MUL,
+      BuiltinOperator_RELU,
+      BuiltinOperator_RELU1,
+      BuiltinOperator_RELU6,
+      BuiltinOperator_RESHAPE,
+      BuiltinOperator_RESIZE_BILINEAR,
+      BuiltinOperator_RNN,
+      BuiltinOperator_SOFTMAX,
+      BuiltinOperator_SPACE_TO_DEPTH,
+      BuiltinOperator_SVDF,
+      BuiltinOperator_TANH,
+      BuiltinOperator_CONCAT_EMBEDDINGS,
+      BuiltinOperator_SKIP_GRAM,
+      BuiltinOperator_CALL,
+      BuiltinOperator_CUSTOM,
+      BuiltinOperator_EMBEDDING_LOOKUP_SPARSE};
+  return values;
+}
+
+inline const char **EnumNamesBuiltinOperator() {
+  static const char *names[] = {"ADD",
+                                "AVERAGE_POOL_2D",
+                                "CONCATENATION",
+                                "CONV_2D",
+                                "DEPTHWISE_CONV_2D",
+                                "",
+                                "",
+                                "EMBEDDING_LOOKUP",
+                                "",
+                                "FULLY_CONNECTED",
+                                "HASHTABLE_LOOKUP",
+                                "L2_NORMALIZATION",
+                                "L2_POOL_2D",
+                                "LOCAL_RESPONSE_NORMALIZATION",
+                                "LOGISTIC",
+                                "LSH_PROJECTION",
+                                "LSTM",
+                                "MAX_POOL_2D",
+                                "MUL",
+                                "RELU",
+                                "RELU1",
+                                "RELU6",
+                                "RESHAPE",
+                                "RESIZE_BILINEAR",
+                                "RNN",
+                                "SOFTMAX",
+                                "SPACE_TO_DEPTH",
+                                "SVDF",
+                                "TANH",
+                                "CONCAT_EMBEDDINGS",
+                                "SKIP_GRAM",
+                                "CALL",
+                                "CUSTOM",
+                                "EMBEDDING_LOOKUP_SPARSE",
+                                nullptr};
+  return names;
+}
+
+inline const char *EnumNameBuiltinOperator(BuiltinOperator e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesBuiltinOperator()[index];
+}
+
+enum BuiltinOptions {
+  BuiltinOptions_NONE = 0,
+  BuiltinOptions_Conv2DOptions = 1,
+  BuiltinOptions_DepthwiseConv2DOptions = 2,
+  BuiltinOptions_ConcatEmbeddingsOptions = 3,
+  BuiltinOptions_LSHProjectionOptions = 4,
+  BuiltinOptions_Pool2DOptions = 5,
+  BuiltinOptions_SVDFOptions = 6,
+  BuiltinOptions_RNNOptions = 7,
+  BuiltinOptions_FullyConnectedOptions = 8,
+  BuiltinOptions_SoftmaxOptions = 9,
+  BuiltinOptions_ConcatenationOptions = 10,
+  BuiltinOptions_AddOptions = 11,
+  BuiltinOptions_L2NormOptions = 12,
+  BuiltinOptions_LocalResponseNormalizationOptions = 13,
+  BuiltinOptions_LSTMOptions = 14,
+  BuiltinOptions_ResizeBilinearOptions = 15,
+  BuiltinOptions_CallOptions = 16,
+  BuiltinOptions_ReshapeOptions = 17,
+  BuiltinOptions_SkipGramOptions = 18,
+  BuiltinOptions_SpaceToDepthOptions = 19,
+  BuiltinOptions_EmbeddingLookupSparseOptions = 20,
+  BuiltinOptions_MulOptions = 21,
+  BuiltinOptions_MIN = BuiltinOptions_NONE,
+  BuiltinOptions_MAX = BuiltinOptions_MulOptions
+};
+
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[22] {
+  static BuiltinOptions values[] = {
+      BuiltinOptions_NONE,
+      BuiltinOptions_Conv2DOptions,
+      BuiltinOptions_DepthwiseConv2DOptions,
+      BuiltinOptions_ConcatEmbeddingsOptions,
+      BuiltinOptions_LSHProjectionOptions,
+      BuiltinOptions_Pool2DOptions,
+      BuiltinOptions_SVDFOptions,
+      BuiltinOptions_RNNOptions,
+      BuiltinOptions_FullyConnectedOptions,
+      BuiltinOptions_SoftmaxOptions,
+      BuiltinOptions_ConcatenationOptions,
+      BuiltinOptions_AddOptions,
+      BuiltinOptions_L2NormOptions,
+      BuiltinOptions_LocalResponseNormalizationOptions,
+      BuiltinOptions_LSTMOptions,
+      BuiltinOptions_ResizeBilinearOptions,
+      BuiltinOptions_CallOptions,
+      BuiltinOptions_ReshapeOptions,
+      BuiltinOptions_SkipGramOptions,
+      BuiltinOptions_SpaceToDepthOptions,
+      BuiltinOptions_EmbeddingLookupSparseOptions,
+      BuiltinOptions_MulOptions};
+  return values;
+}
+
+inline const char **EnumNamesBuiltinOptions() {
+  static const char *names[] = {"NONE",
+                                "Conv2DOptions",
+                                "DepthwiseConv2DOptions",
+                                "ConcatEmbeddingsOptions",
+                                "LSHProjectionOptions",
+                                "Pool2DOptions",
+                                "SVDFOptions",
+                                "RNNOptions",
+                                "FullyConnectedOptions",
+                                "SoftmaxOptions",
+                                "ConcatenationOptions",
+                                "AddOptions",
+                                "L2NormOptions",
+                                "LocalResponseNormalizationOptions",
+                                "LSTMOptions",
+                                "ResizeBilinearOptions",
+                                "CallOptions",
+                                "ReshapeOptions",
+                                "SkipGramOptions",
+                                "SpaceToDepthOptions",
+                                "EmbeddingLookupSparseOptions",
+                                "MulOptions",
+                                nullptr};
+  return names;
+}
+
+inline const char *EnumNameBuiltinOptions(BuiltinOptions e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesBuiltinOptions()[index];
+}
+
+template <typename T>
+struct BuiltinOptionsTraits {
+  static const BuiltinOptions enum_value = BuiltinOptions_NONE;
+};
+
+template <>
+struct BuiltinOptionsTraits<Conv2DOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_Conv2DOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<DepthwiseConv2DOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_DepthwiseConv2DOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<ConcatEmbeddingsOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_ConcatEmbeddingsOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<LSHProjectionOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LSHProjectionOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<Pool2DOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_Pool2DOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SVDFOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SVDFOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<RNNOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RNNOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<FullyConnectedOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_FullyConnectedOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SoftmaxOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SoftmaxOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<ConcatenationOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ConcatenationOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<AddOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_AddOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<L2NormOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_L2NormOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<LocalResponseNormalizationOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_LocalResponseNormalizationOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<LSTMOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LSTMOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<ResizeBilinearOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ResizeBilinearOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<CallOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_CallOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<ReshapeOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ReshapeOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SkipGramOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SkipGramOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<SpaceToDepthOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SpaceToDepthOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<EmbeddingLookupSparseOptions> {
+  static const BuiltinOptions enum_value =
+      BuiltinOptions_EmbeddingLookupSparseOptions;
+};
+
+template <>
+struct BuiltinOptionsTraits<MulOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MulOptions;
+};
+
+struct BuiltinOptionsUnion {
+  BuiltinOptions type;
+  void *value;
+
+  BuiltinOptionsUnion() : type(BuiltinOptions_NONE), value(nullptr) {}
+  BuiltinOptionsUnion(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT
+      : type(BuiltinOptions_NONE),
+        value(nullptr) {
+    std::swap(type, u.type);
+    std::swap(value, u.value);
+  }
+  BuiltinOptionsUnion(const BuiltinOptionsUnion &) FLATBUFFERS_NOEXCEPT;
+  BuiltinOptionsUnion &operator=(const BuiltinOptionsUnion &u)
+      FLATBUFFERS_NOEXCEPT {
+    BuiltinOptionsUnion t(u);
+    std::swap(type, t.type);
+    std::swap(value, t.value);
+    return *this;
+  }
+  BuiltinOptionsUnion &operator=(BuiltinOptionsUnion &&u) FLATBUFFERS_NOEXCEPT {
+    std::swap(type, u.type);
+    std::swap(value, u.value);
+    return *this;
+  }
+  ~BuiltinOptionsUnion() { Reset(); }
+
+  void Reset();
+
+#ifndef FLATBUFFERS_CPP98_STL
+  template <typename T>
+  void Set(T &&val) {
+    Reset();
+    type = BuiltinOptionsTraits<typename T::TableType>::enum_value;
+    if (type != BuiltinOptions_NONE) {
+      value = new T(std::forward<T>(val));
+    }
+  }
+#endif  // FLATBUFFERS_CPP98_STL
+
+  static void *UnPack(const void *obj, BuiltinOptions type,
+                      const flatbuffers::resolver_function_t *resolver);
+  flatbuffers::Offset<void> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr) const;
+
+  Conv2DOptionsT *AsConv2DOptions() {
+    return type == BuiltinOptions_Conv2DOptions
+               ? reinterpret_cast<Conv2DOptionsT *>(value)
+               : nullptr;
+  }
+  const Conv2DOptionsT *AsConv2DOptions() const {
+    return type == BuiltinOptions_Conv2DOptions
+               ? reinterpret_cast<const Conv2DOptionsT *>(value)
+               : nullptr;
+  }
+  DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() {
+    return type == BuiltinOptions_DepthwiseConv2DOptions
+               ? reinterpret_cast<DepthwiseConv2DOptionsT *>(value)
+               : nullptr;
+  }
+  const DepthwiseConv2DOptionsT *AsDepthwiseConv2DOptions() const {
+    return type == BuiltinOptions_DepthwiseConv2DOptions
+               ? reinterpret_cast<const DepthwiseConv2DOptionsT *>(value)
+               : nullptr;
+  }
+  ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() {
+    return type == BuiltinOptions_ConcatEmbeddingsOptions
+               ? reinterpret_cast<ConcatEmbeddingsOptionsT *>(value)
+               : nullptr;
+  }
+  const ConcatEmbeddingsOptionsT *AsConcatEmbeddingsOptions() const {
+    return type == BuiltinOptions_ConcatEmbeddingsOptions
+               ? reinterpret_cast<const ConcatEmbeddingsOptionsT *>(value)
+               : nullptr;
+  }
+  LSHProjectionOptionsT *AsLSHProjectionOptions() {
+    return type == BuiltinOptions_LSHProjectionOptions
+               ? reinterpret_cast<LSHProjectionOptionsT *>(value)
+               : nullptr;
+  }
+  const LSHProjectionOptionsT *AsLSHProjectionOptions() const {
+    return type == BuiltinOptions_LSHProjectionOptions
+               ? reinterpret_cast<const LSHProjectionOptionsT *>(value)
+               : nullptr;
+  }
+  Pool2DOptionsT *AsPool2DOptions() {
+    return type == BuiltinOptions_Pool2DOptions
+               ? reinterpret_cast<Pool2DOptionsT *>(value)
+               : nullptr;
+  }
+  const Pool2DOptionsT *AsPool2DOptions() const {
+    return type == BuiltinOptions_Pool2DOptions
+               ? reinterpret_cast<const Pool2DOptionsT *>(value)
+               : nullptr;
+  }
+  SVDFOptionsT *AsSVDFOptions() {
+    return type == BuiltinOptions_SVDFOptions
+               ? reinterpret_cast<SVDFOptionsT *>(value)
+               : nullptr;
+  }
+  const SVDFOptionsT *AsSVDFOptions() const {
+    return type == BuiltinOptions_SVDFOptions
+               ? reinterpret_cast<const SVDFOptionsT *>(value)
+               : nullptr;
+  }
+  RNNOptionsT *AsRNNOptions() {
+    return type == BuiltinOptions_RNNOptions
+               ? reinterpret_cast<RNNOptionsT *>(value)
+               : nullptr;
+  }
+  const RNNOptionsT *AsRNNOptions() const {
+    return type == BuiltinOptions_RNNOptions
+               ? reinterpret_cast<const RNNOptionsT *>(value)
+               : nullptr;
+  }
+  FullyConnectedOptionsT *AsFullyConnectedOptions() {
+    return type == BuiltinOptions_FullyConnectedOptions
+               ? reinterpret_cast<FullyConnectedOptionsT *>(value)
+               : nullptr;
+  }
+  const FullyConnectedOptionsT *AsFullyConnectedOptions() const {
+    return type == BuiltinOptions_FullyConnectedOptions
+               ? reinterpret_cast<const FullyConnectedOptionsT *>(value)
+               : nullptr;
+  }
+  SoftmaxOptionsT *AsSoftmaxOptions() {
+    return type == BuiltinOptions_SoftmaxOptions
+               ? reinterpret_cast<SoftmaxOptionsT *>(value)
+               : nullptr;
+  }
+  const SoftmaxOptionsT *AsSoftmaxOptions() const {
+    return type == BuiltinOptions_SoftmaxOptions
+               ? reinterpret_cast<const SoftmaxOptionsT *>(value)
+               : nullptr;
+  }
+  ConcatenationOptionsT *AsConcatenationOptions() {
+    return type == BuiltinOptions_ConcatenationOptions
+               ? reinterpret_cast<ConcatenationOptionsT *>(value)
+               : nullptr;
+  }
+  const ConcatenationOptionsT *AsConcatenationOptions() const {
+    return type == BuiltinOptions_ConcatenationOptions
+               ? reinterpret_cast<const ConcatenationOptionsT *>(value)
+               : nullptr;
+  }
+  AddOptionsT *AsAddOptions() {
+    return type == BuiltinOptions_AddOptions
+               ? reinterpret_cast<AddOptionsT *>(value)
+               : nullptr;
+  }
+  const AddOptionsT *AsAddOptions() const {
+    return type == BuiltinOptions_AddOptions
+               ? reinterpret_cast<const AddOptionsT *>(value)
+               : nullptr;
+  }
+  L2NormOptionsT *AsL2NormOptions() {
+    return type == BuiltinOptions_L2NormOptions
+               ? reinterpret_cast<L2NormOptionsT *>(value)
+               : nullptr;
+  }
+  const L2NormOptionsT *AsL2NormOptions() const {
+    return type == BuiltinOptions_L2NormOptions
+               ? reinterpret_cast<const L2NormOptionsT *>(value)
+               : nullptr;
+  }
+  LocalResponseNormalizationOptionsT *AsLocalResponseNormalizationOptions() {
+    return type == BuiltinOptions_LocalResponseNormalizationOptions
+               ? reinterpret_cast<LocalResponseNormalizationOptionsT *>(value)
+               : nullptr;
+  }
+  const LocalResponseNormalizationOptionsT *
+  AsLocalResponseNormalizationOptions() const {
+    return type == BuiltinOptions_LocalResponseNormalizationOptions
+               ? reinterpret_cast<const LocalResponseNormalizationOptionsT *>(
+                     value)
+               : nullptr;
+  }
+  LSTMOptionsT *AsLSTMOptions() {
+    return type == BuiltinOptions_LSTMOptions
+               ? reinterpret_cast<LSTMOptionsT *>(value)
+               : nullptr;
+  }
+  const LSTMOptionsT *AsLSTMOptions() const {
+    return type == BuiltinOptions_LSTMOptions
+               ? reinterpret_cast<const LSTMOptionsT *>(value)
+               : nullptr;
+  }
+  ResizeBilinearOptionsT *AsResizeBilinearOptions() {
+    return type == BuiltinOptions_ResizeBilinearOptions
+               ? reinterpret_cast<ResizeBilinearOptionsT *>(value)
+               : nullptr;
+  }
+  const ResizeBilinearOptionsT *AsResizeBilinearOptions() const {
+    return type == BuiltinOptions_ResizeBilinearOptions
+               ? reinterpret_cast<const ResizeBilinearOptionsT *>(value)
+               : nullptr;
+  }
+  CallOptionsT *AsCallOptions() {
+    return type == BuiltinOptions_CallOptions
+               ? reinterpret_cast<CallOptionsT *>(value)
+               : nullptr;
+  }
+  const CallOptionsT *AsCallOptions() const {
+    return type == BuiltinOptions_CallOptions
+               ? reinterpret_cast<const CallOptionsT *>(value)
+               : nullptr;
+  }
+  ReshapeOptionsT *AsReshapeOptions() {
+    return type == BuiltinOptions_ReshapeOptions
+               ? reinterpret_cast<ReshapeOptionsT *>(value)
+               : nullptr;
+  }
+  const ReshapeOptionsT *AsReshapeOptions() const {
+    return type == BuiltinOptions_ReshapeOptions
+               ? reinterpret_cast<const ReshapeOptionsT *>(value)
+               : nullptr;
+  }
+  SkipGramOptionsT *AsSkipGramOptions() {
+    return type == BuiltinOptions_SkipGramOptions
+               ? reinterpret_cast<SkipGramOptionsT *>(value)
+               : nullptr;
+  }
+  const SkipGramOptionsT *AsSkipGramOptions() const {
+    return type == BuiltinOptions_SkipGramOptions
+               ? reinterpret_cast<const SkipGramOptionsT *>(value)
+               : nullptr;
+  }
+  SpaceToDepthOptionsT *AsSpaceToDepthOptions() {
+    return type == BuiltinOptions_SpaceToDepthOptions
+               ? reinterpret_cast<SpaceToDepthOptionsT *>(value)
+               : nullptr;
+  }
+  const SpaceToDepthOptionsT *AsSpaceToDepthOptions() const {
+    return type == BuiltinOptions_SpaceToDepthOptions
+               ? reinterpret_cast<const SpaceToDepthOptionsT *>(value)
+               : nullptr;
+  }
+  EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() {
+    return type == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? reinterpret_cast<EmbeddingLookupSparseOptionsT *>(value)
+               : nullptr;
+  }
+  const EmbeddingLookupSparseOptionsT *AsEmbeddingLookupSparseOptions() const {
+    return type == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? reinterpret_cast<const EmbeddingLookupSparseOptionsT *>(value)
+               : nullptr;
+  }
+  MulOptionsT *AsMulOptions() {
+    return type == BuiltinOptions_MulOptions
+               ? reinterpret_cast<MulOptionsT *>(value)
+               : nullptr;
+  }
+  const MulOptionsT *AsMulOptions() const {
+    return type == BuiltinOptions_MulOptions
+               ? reinterpret_cast<const MulOptionsT *>(value)
+               : nullptr;
+  }
+};
+
+bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj,
+                          BuiltinOptions type);
+bool VerifyBuiltinOptionsVector(
+    flatbuffers::Verifier &verifier,
+    const flatbuffers::Vector<flatbuffers::Offset<void>> *values,
+    const flatbuffers::Vector<uint8_t> *types);
+
+enum Padding {
+  Padding_SAME = 0,
+  Padding_VALID = 1,
+  Padding_MIN = Padding_SAME,
+  Padding_MAX = Padding_VALID
+};
+
+inline Padding (&EnumValuesPadding())[2] {
+  static Padding values[] = {Padding_SAME, Padding_VALID};
+  return values;
+}
+
+inline const char **EnumNamesPadding() {
+  static const char *names[] = {"SAME", "VALID", nullptr};
+  return names;
+}
+
+inline const char *EnumNamePadding(Padding e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesPadding()[index];
+}
+
+enum ActivationFunctionType {
+  ActivationFunctionType_NONE = 0,
+  ActivationFunctionType_RELU = 1,
+  ActivationFunctionType_RELU1 = 2,
+  ActivationFunctionType_RELU6 = 3,
+  ActivationFunctionType_TANH = 4,
+  ActivationFunctionType_SIGN_BIT = 5,
+  ActivationFunctionType_MIN = ActivationFunctionType_NONE,
+  ActivationFunctionType_MAX = ActivationFunctionType_SIGN_BIT
+};
+
+inline ActivationFunctionType (&EnumValuesActivationFunctionType())[6] {
+  static ActivationFunctionType values[] = {
+      ActivationFunctionType_NONE,  ActivationFunctionType_RELU,
+      ActivationFunctionType_RELU1, ActivationFunctionType_RELU6,
+      ActivationFunctionType_TANH,  ActivationFunctionType_SIGN_BIT};
+  return values;
+}
+
+inline const char **EnumNamesActivationFunctionType() {
+  static const char *names[] = {"NONE", "RELU",     "RELU1", "RELU6",
+                                "TANH", "SIGN_BIT", nullptr};
+  return names;
+}
+
+inline const char *EnumNameActivationFunctionType(ActivationFunctionType e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesActivationFunctionType()[index];
+}
+
+enum LSHProjectionType {
+  LSHProjectionType_UNKNOWN = 0,
+  LSHProjectionType_SPARSE = 1,
+  LSHProjectionType_DENSE = 2,
+  LSHProjectionType_MIN = LSHProjectionType_UNKNOWN,
+  LSHProjectionType_MAX = LSHProjectionType_DENSE
+};
+
+inline LSHProjectionType (&EnumValuesLSHProjectionType())[3] {
+  static LSHProjectionType values[] = {LSHProjectionType_UNKNOWN,
+                                       LSHProjectionType_SPARSE,
+                                       LSHProjectionType_DENSE};
+  return values;
+}
+
+inline const char **EnumNamesLSHProjectionType() {
+  static const char *names[] = {"UNKNOWN", "SPARSE", "DENSE", nullptr};
+  return names;
+}
+
+inline const char *EnumNameLSHProjectionType(LSHProjectionType e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesLSHProjectionType()[index];
+}
+
+enum CombinerType {
+  CombinerType_SUM = 0,
+  CombinerType_MEAN = 1,
+  CombinerType_SQRTN = 2,
+  CombinerType_MIN = CombinerType_SUM,
+  CombinerType_MAX = CombinerType_SQRTN
+};
+
+inline CombinerType (&EnumValuesCombinerType())[3] {
+  static CombinerType values[] = {CombinerType_SUM, CombinerType_MEAN,
+                                  CombinerType_SQRTN};
+  return values;
+}
+
+inline const char **EnumNamesCombinerType() {
+  static const char *names[] = {"SUM", "MEAN", "SQRTN", nullptr};
+  return names;
+}
+
+inline const char *EnumNameCombinerType(CombinerType e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesCombinerType()[index];
+}
+
+enum CustomOptionsFormat {
+  CustomOptionsFormat_FLEXBUFFERS = 0,
+  CustomOptionsFormat_MIN = CustomOptionsFormat_FLEXBUFFERS,
+  CustomOptionsFormat_MAX = CustomOptionsFormat_FLEXBUFFERS
+};
+
+inline CustomOptionsFormat (&EnumValuesCustomOptionsFormat())[1] {
+  static CustomOptionsFormat values[] = {CustomOptionsFormat_FLEXBUFFERS};
+  return values;
+}
+
+inline const char **EnumNamesCustomOptionsFormat() {
+  static const char *names[] = {"FLEXBUFFERS", nullptr};
+  return names;
+}
+
+inline const char *EnumNameCustomOptionsFormat(CustomOptionsFormat e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesCustomOptionsFormat()[index];
+}
+
+struct QuantizationParametersT : public flatbuffers::NativeTable {
+  typedef QuantizationParameters TableType;
+  std::vector<float> min;
+  std::vector<float> max;
+  std::vector<float> scale;
+  std::vector<int64_t> zero_point;
+  QuantizationParametersT() {}
+};
+
+struct QuantizationParameters FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef QuantizationParametersT NativeTableType;
+  enum { VT_MIN = 4, VT_MAX = 6, VT_SCALE = 8, VT_ZERO_POINT = 10 };
+  const flatbuffers::Vector<float> *min() const {
+    return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
+  }
+  const flatbuffers::Vector<float> *max() const {
+    return GetPointer<const flatbuffers::Vector<float> *>(VT_MAX);
+  }
+  const flatbuffers::Vector<float> *scale() const {
+    return GetPointer<const flatbuffers::Vector<float> *>(VT_SCALE);
+  }
+  const flatbuffers::Vector<int64_t> *zero_point() const {
+    return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_ZERO_POINT);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_MIN) &&
+           verifier.Verify(min()) && VerifyOffset(verifier, VT_MAX) &&
+           verifier.Verify(max()) && VerifyOffset(verifier, VT_SCALE) &&
+           verifier.Verify(scale()) && VerifyOffset(verifier, VT_ZERO_POINT) &&
+           verifier.Verify(zero_point()) && verifier.EndTable();
+  }
+  QuantizationParametersT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      QuantizationParametersT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<QuantizationParameters> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct QuantizationParametersBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_min(flatbuffers::Offset<flatbuffers::Vector<float>> min) {
+    fbb_.AddOffset(QuantizationParameters::VT_MIN, min);
+  }
+  void add_max(flatbuffers::Offset<flatbuffers::Vector<float>> max) {
+    fbb_.AddOffset(QuantizationParameters::VT_MAX, max);
+  }
+  void add_scale(flatbuffers::Offset<flatbuffers::Vector<float>> scale) {
+    fbb_.AddOffset(QuantizationParameters::VT_SCALE, scale);
+  }
+  void add_zero_point(
+      flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point) {
+    fbb_.AddOffset(QuantizationParameters::VT_ZERO_POINT, zero_point);
+  }
+  explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  QuantizationParametersBuilder &operator=(
+      const QuantizationParametersBuilder &);
+  flatbuffers::Offset<QuantizationParameters> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<QuantizationParameters>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<float>> min = 0,
+    flatbuffers::Offset<flatbuffers::Vector<float>> max = 0,
+    flatbuffers::Offset<flatbuffers::Vector<float>> scale = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0) {
+  QuantizationParametersBuilder builder_(_fbb);
+  builder_.add_zero_point(zero_point);
+  builder_.add_scale(scale);
+  builder_.add_max(max);
+  builder_.add_min(min);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<QuantizationParameters>
+CreateQuantizationParametersDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<float> *min = nullptr,
+    const std::vector<float> *max = nullptr,
+    const std::vector<float> *scale = nullptr,
+    const std::vector<int64_t> *zero_point = nullptr) {
+  return tflite::CreateQuantizationParameters(
+      _fbb, min ? _fbb.CreateVector<float>(*min) : 0,
+      max ? _fbb.CreateVector<float>(*max) : 0,
+      scale ? _fbb.CreateVector<float>(*scale) : 0,
+      zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0);
+}
+
+flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct TensorT : public flatbuffers::NativeTable {
+  typedef Tensor TableType;
+  std::vector<int32_t> shape;
+  TensorType type;
+  uint32_t buffer;
+  std::string name;
+  std::unique_ptr<QuantizationParametersT> quantization;
+  TensorT() : type(TensorType_FLOAT32), buffer(0) {}
+};
+
+struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef TensorT NativeTableType;
+  enum {
+    VT_SHAPE = 4,
+    VT_TYPE = 6,
+    VT_BUFFER = 8,
+    VT_NAME = 10,
+    VT_QUANTIZATION = 12
+  };
+  const flatbuffers::Vector<int32_t> *shape() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_SHAPE);
+  }
+  TensorType type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_TYPE, 0));
+  }
+  uint32_t buffer() const { return GetField<uint32_t>(VT_BUFFER, 0); }
+  const flatbuffers::String *name() const {
+    return GetPointer<const flatbuffers::String *>(VT_NAME);
+  }
+  const QuantizationParameters *quantization() const {
+    return GetPointer<const QuantizationParameters *>(VT_QUANTIZATION);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) &&
+           verifier.Verify(shape()) && VerifyField<int8_t>(verifier, VT_TYPE) &&
+           VerifyField<uint32_t>(verifier, VT_BUFFER) &&
+           VerifyOffset(verifier, VT_NAME) && verifier.Verify(name()) &&
+           VerifyOffset(verifier, VT_QUANTIZATION) &&
+           verifier.VerifyTable(quantization()) && verifier.EndTable();
+  }
+  TensorT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                 nullptr) const;
+  static flatbuffers::Offset<Tensor> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct TensorBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_shape(flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape) {
+    fbb_.AddOffset(Tensor::VT_SHAPE, shape);
+  }
+  void add_type(TensorType type) {
+    fbb_.AddElement<int8_t>(Tensor::VT_TYPE, static_cast<int8_t>(type), 0);
+  }
+  void add_buffer(uint32_t buffer) {
+    fbb_.AddElement<uint32_t>(Tensor::VT_BUFFER, buffer, 0);
+  }
+  void add_name(flatbuffers::Offset<flatbuffers::String> name) {
+    fbb_.AddOffset(Tensor::VT_NAME, name);
+  }
+  void add_quantization(
+      flatbuffers::Offset<QuantizationParameters> quantization) {
+    fbb_.AddOffset(Tensor::VT_QUANTIZATION, quantization);
+  }
+  explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  TensorBuilder &operator=(const TensorBuilder &);
+  flatbuffers::Offset<Tensor> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Tensor>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> shape = 0,
+    TensorType type = TensorType_FLOAT32, uint32_t buffer = 0,
+    flatbuffers::Offset<flatbuffers::String> name = 0,
+    flatbuffers::Offset<QuantizationParameters> quantization = 0) {
+  TensorBuilder builder_(_fbb);
+  builder_.add_quantization(quantization);
+  builder_.add_name(name);
+  builder_.add_buffer(buffer);
+  builder_.add_shape(shape);
+  builder_.add_type(type);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Tensor> CreateTensorDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int32_t> *shape = nullptr,
+    TensorType type = TensorType_FLOAT32, uint32_t buffer = 0,
+    const char *name = nullptr,
+    flatbuffers::Offset<QuantizationParameters> quantization = 0) {
+  return tflite::CreateTensor(
+      _fbb, shape ? _fbb.CreateVector<int32_t>(*shape) : 0, type, buffer,
+      name ? _fbb.CreateString(name) : 0, quantization);
+}
+
+flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct Conv2DOptionsT : public flatbuffers::NativeTable {
+  typedef Conv2DOptions TableType;
+  Padding padding;
+  int32_t stride_w;
+  int32_t stride_h;
+  ActivationFunctionType fused_activation_function;
+  Conv2DOptionsT()
+      : padding(Padding_SAME),
+        stride_w(0),
+        stride_h(0),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct Conv2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef Conv2DOptionsT NativeTableType;
+  enum {
+    VT_PADDING = 4,
+    VT_STRIDE_W = 6,
+    VT_STRIDE_H = 8,
+    VT_FUSED_ACTIVATION_FUNCTION = 10
+  };
+  Padding padding() const {
+    return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
+  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_PADDING) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  Conv2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      Conv2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Conv2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct Conv2DOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_padding(Padding padding) {
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
+  }
+  void add_stride_w(int32_t stride_w) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_STRIDE_W, stride_w, 0);
+  }
+  void add_stride_h(int32_t stride_h) {
+    fbb_.AddElement<int32_t>(Conv2DOptions::VT_STRIDE_H, stride_h, 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(Conv2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit Conv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  Conv2DOptionsBuilder &operator=(const Conv2DOptionsBuilder &);
+  flatbuffers::Offset<Conv2DOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Conv2DOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  Conv2DOptionsBuilder builder_(_fbb);
+  builder_.add_stride_h(stride_h);
+  builder_.add_stride_w(stride_w);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_padding(padding);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct Pool2DOptionsT : public flatbuffers::NativeTable {
+  typedef Pool2DOptions TableType;
+  Padding padding;
+  int32_t stride_w;
+  int32_t stride_h;
+  int32_t filter_width;
+  int32_t filter_height;
+  ActivationFunctionType fused_activation_function;
+  Pool2DOptionsT()
+      : padding(Padding_SAME),
+        stride_w(0),
+        stride_h(0),
+        filter_width(0),
+        filter_height(0),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct Pool2DOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef Pool2DOptionsT NativeTableType;
+  enum {
+    VT_PADDING = 4,
+    VT_STRIDE_W = 6,
+    VT_STRIDE_H = 8,
+    VT_FILTER_WIDTH = 10,
+    VT_FILTER_HEIGHT = 12,
+    VT_FUSED_ACTIVATION_FUNCTION = 14
+  };
+  Padding padding() const {
+    return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
+  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
+  int32_t filter_width() const { return GetField<int32_t>(VT_FILTER_WIDTH, 0); }
+  int32_t filter_height() const {
+    return GetField<int32_t>(VT_FILTER_HEIGHT, 0);
+  }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_PADDING) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
+           VerifyField<int32_t>(verifier, VT_FILTER_WIDTH) &&
+           VerifyField<int32_t>(verifier, VT_FILTER_HEIGHT) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  Pool2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      Pool2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Pool2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct Pool2DOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_padding(Padding padding) {
+    fbb_.AddElement<int8_t>(Pool2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
+  }
+  void add_stride_w(int32_t stride_w) {
+    fbb_.AddElement<int32_t>(Pool2DOptions::VT_STRIDE_W, stride_w, 0);
+  }
+  void add_stride_h(int32_t stride_h) {
+    fbb_.AddElement<int32_t>(Pool2DOptions::VT_STRIDE_H, stride_h, 0);
+  }
+  void add_filter_width(int32_t filter_width) {
+    fbb_.AddElement<int32_t>(Pool2DOptions::VT_FILTER_WIDTH, filter_width, 0);
+  }
+  void add_filter_height(int32_t filter_height) {
+    fbb_.AddElement<int32_t>(Pool2DOptions::VT_FILTER_HEIGHT, filter_height, 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(Pool2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit Pool2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  Pool2DOptionsBuilder &operator=(const Pool2DOptionsBuilder &);
+  flatbuffers::Offset<Pool2DOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Pool2DOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0, int32_t filter_width = 0,
+    int32_t filter_height = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  Pool2DOptionsBuilder builder_(_fbb);
+  builder_.add_filter_height(filter_height);
+  builder_.add_filter_width(filter_width);
+  builder_.add_stride_h(stride_h);
+  builder_.add_stride_w(stride_w);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_padding(padding);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct DepthwiseConv2DOptionsT : public flatbuffers::NativeTable {
+  typedef DepthwiseConv2DOptions TableType;
+  Padding padding;
+  int32_t stride_w;
+  int32_t stride_h;
+  int32_t depth_multiplier;
+  ActivationFunctionType fused_activation_function;
+  DepthwiseConv2DOptionsT()
+      : padding(Padding_SAME),
+        stride_w(0),
+        stride_h(0),
+        depth_multiplier(0),
+        fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct DepthwiseConv2DOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef DepthwiseConv2DOptionsT NativeTableType;
+  enum {
+    VT_PADDING = 4,
+    VT_STRIDE_W = 6,
+    VT_STRIDE_H = 8,
+    VT_DEPTH_MULTIPLIER = 10,
+    VT_FUSED_ACTIVATION_FUNCTION = 12
+  };
+  Padding padding() const {
+    return static_cast<Padding>(GetField<int8_t>(VT_PADDING, 0));
+  }
+  int32_t stride_w() const { return GetField<int32_t>(VT_STRIDE_W, 0); }
+  int32_t stride_h() const { return GetField<int32_t>(VT_STRIDE_H, 0); }
+  int32_t depth_multiplier() const {
+    return GetField<int32_t>(VT_DEPTH_MULTIPLIER, 0);
+  }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_PADDING) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_W) &&
+           VerifyField<int32_t>(verifier, VT_STRIDE_H) &&
+           VerifyField<int32_t>(verifier, VT_DEPTH_MULTIPLIER) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  DepthwiseConv2DOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      DepthwiseConv2DOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<DepthwiseConv2DOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct DepthwiseConv2DOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_padding(Padding padding) {
+    fbb_.AddElement<int8_t>(DepthwiseConv2DOptions::VT_PADDING,
+                            static_cast<int8_t>(padding), 0);
+  }
+  void add_stride_w(int32_t stride_w) {
+    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_STRIDE_W, stride_w, 0);
+  }
+  void add_stride_h(int32_t stride_h) {
+    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_STRIDE_H, stride_h, 0);
+  }
+  void add_depth_multiplier(int32_t depth_multiplier) {
+    fbb_.AddElement<int32_t>(DepthwiseConv2DOptions::VT_DEPTH_MULTIPLIER,
+                             depth_multiplier, 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(
+        DepthwiseConv2DOptions::VT_FUSED_ACTIVATION_FUNCTION,
+        static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit DepthwiseConv2DOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  DepthwiseConv2DOptionsBuilder &operator=(
+      const DepthwiseConv2DOptionsBuilder &);
+  flatbuffers::Offset<DepthwiseConv2DOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<DepthwiseConv2DOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, Padding padding = Padding_SAME,
+    int32_t stride_w = 0, int32_t stride_h = 0, int32_t depth_multiplier = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  DepthwiseConv2DOptionsBuilder builder_(_fbb);
+  builder_.add_depth_multiplier(depth_multiplier);
+  builder_.add_stride_h(stride_h);
+  builder_.add_stride_w(stride_w);
+  builder_.add_fused_activation_function(fused_activation_function);
+  builder_.add_padding(padding);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ConcatEmbeddingsOptionsT : public flatbuffers::NativeTable {
+  typedef ConcatEmbeddingsOptions TableType;
+  int32_t num_channels;
+  std::vector<int32_t> num_columns_per_channel;
+  std::vector<int32_t> embedding_dim_per_channel;
+  ConcatEmbeddingsOptionsT() : num_channels(0) {}
+};
+
+struct ConcatEmbeddingsOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef ConcatEmbeddingsOptionsT NativeTableType;
+  enum {
+    VT_NUM_CHANNELS = 4,
+    VT_NUM_COLUMNS_PER_CHANNEL = 6,
+    VT_EMBEDDING_DIM_PER_CHANNEL = 8
+  };
+  int32_t num_channels() const { return GetField<int32_t>(VT_NUM_CHANNELS, 0); }
+  const flatbuffers::Vector<int32_t> *num_columns_per_channel() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(
+        VT_NUM_COLUMNS_PER_CHANNEL);
+  }
+  const flatbuffers::Vector<int32_t> *embedding_dim_per_channel() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(
+        VT_EMBEDDING_DIM_PER_CHANNEL);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_NUM_CHANNELS) &&
+           VerifyOffset(verifier, VT_NUM_COLUMNS_PER_CHANNEL) &&
+           verifier.Verify(num_columns_per_channel()) &&
+           VerifyOffset(verifier, VT_EMBEDDING_DIM_PER_CHANNEL) &&
+           verifier.Verify(embedding_dim_per_channel()) && verifier.EndTable();
+  }
+  ConcatEmbeddingsOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ConcatEmbeddingsOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ConcatEmbeddingsOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ConcatEmbeddingsOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_num_channels(int32_t num_channels) {
+    fbb_.AddElement<int32_t>(ConcatEmbeddingsOptions::VT_NUM_CHANNELS,
+                             num_channels, 0);
+  }
+  void add_num_columns_per_channel(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+          num_columns_per_channel) {
+    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_NUM_COLUMNS_PER_CHANNEL,
+                   num_columns_per_channel);
+  }
+  void add_embedding_dim_per_channel(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+          embedding_dim_per_channel) {
+    fbb_.AddOffset(ConcatEmbeddingsOptions::VT_EMBEDDING_DIM_PER_CHANNEL,
+                   embedding_dim_per_channel);
+  }
+  explicit ConcatEmbeddingsOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ConcatEmbeddingsOptionsBuilder &operator=(
+      const ConcatEmbeddingsOptionsBuilder &);
+  flatbuffers::Offset<ConcatEmbeddingsOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ConcatEmbeddingsOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                              int32_t num_channels = 0,
+                              flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+                                  num_columns_per_channel = 0,
+                              flatbuffers::Offset<flatbuffers::Vector<int32_t>>
+                                  embedding_dim_per_channel = 0) {
+  ConcatEmbeddingsOptionsBuilder builder_(_fbb);
+  builder_.add_embedding_dim_per_channel(embedding_dim_per_channel);
+  builder_.add_num_columns_per_channel(num_columns_per_channel);
+  builder_.add_num_channels(num_channels);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t num_channels = 0,
+    const std::vector<int32_t> *num_columns_per_channel = nullptr,
+    const std::vector<int32_t> *embedding_dim_per_channel = nullptr) {
+  return tflite::CreateConcatEmbeddingsOptions(
+      _fbb, num_channels,
+      num_columns_per_channel
+          ? _fbb.CreateVector<int32_t>(*num_columns_per_channel)
+          : 0,
+      embedding_dim_per_channel
+          ? _fbb.CreateVector<int32_t>(*embedding_dim_per_channel)
+          : 0);
+}
+
+flatbuffers::Offset<ConcatEmbeddingsOptions> CreateConcatEmbeddingsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct LSHProjectionOptionsT : public flatbuffers::NativeTable {
+  typedef LSHProjectionOptions TableType;
+  LSHProjectionType type;
+  LSHProjectionOptionsT() : type(LSHProjectionType_UNKNOWN) {}
+};
+
+struct LSHProjectionOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef LSHProjectionOptionsT NativeTableType;
+  enum { VT_TYPE = 4 };
+  LSHProjectionType type() const {
+    return static_cast<LSHProjectionType>(GetField<int8_t>(VT_TYPE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_TYPE) && verifier.EndTable();
+  }
+  LSHProjectionOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LSHProjectionOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LSHProjectionOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LSHProjectionOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_type(LSHProjectionType type) {
+    fbb_.AddElement<int8_t>(LSHProjectionOptions::VT_TYPE,
+                            static_cast<int8_t>(type), 0);
+  }
+  explicit LSHProjectionOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LSHProjectionOptionsBuilder &operator=(const LSHProjectionOptionsBuilder &);
+  flatbuffers::Offset<LSHProjectionOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LSHProjectionOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    LSHProjectionType type = LSHProjectionType_UNKNOWN) {
+  LSHProjectionOptionsBuilder builder_(_fbb);
+  builder_.add_type(type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SVDFOptionsT : public flatbuffers::NativeTable {
+  typedef SVDFOptions TableType;
+  int32_t rank;
+  ActivationFunctionType fused_activation_function;
+  SVDFOptionsT()
+      : rank(0), fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct SVDFOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SVDFOptionsT NativeTableType;
+  enum { VT_RANK = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  int32_t rank() const { return GetField<int32_t>(VT_RANK, 0); }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_RANK) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  SVDFOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SVDFOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SVDFOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SVDFOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_rank(int32_t rank) {
+    fbb_.AddElement<int32_t>(SVDFOptions::VT_RANK, rank, 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(SVDFOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit SVDFOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SVDFOptionsBuilder &operator=(const SVDFOptionsBuilder &);
+  flatbuffers::Offset<SVDFOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SVDFOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t rank = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  SVDFOptionsBuilder builder_(_fbb);
+  builder_.add_rank(rank);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct RNNOptionsT : public flatbuffers::NativeTable {
+  typedef RNNOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  RNNOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct RNNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef RNNOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  RNNOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      RNNOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RNNOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct RNNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(RNNOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit RNNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  RNNOptionsBuilder &operator=(const RNNOptionsBuilder &);
+  flatbuffers::Offset<RNNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<RNNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  RNNOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct FullyConnectedOptionsT : public flatbuffers::NativeTable {
+  typedef FullyConnectedOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  FullyConnectedOptionsT()
+      : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct FullyConnectedOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef FullyConnectedOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  FullyConnectedOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      FullyConnectedOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<FullyConnectedOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct FullyConnectedOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(FullyConnectedOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit FullyConnectedOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  FullyConnectedOptionsBuilder &operator=(const FullyConnectedOptionsBuilder &);
+  flatbuffers::Offset<FullyConnectedOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<FullyConnectedOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  FullyConnectedOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SoftmaxOptionsT : public flatbuffers::NativeTable {
+  typedef SoftmaxOptions TableType;
+  float beta;
+  SoftmaxOptionsT() : beta(0.0f) {}
+};
+
+struct SoftmaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SoftmaxOptionsT NativeTableType;
+  enum { VT_BETA = 4 };
+  float beta() const { return GetField<float>(VT_BETA, 0.0f); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<float>(verifier, VT_BETA) && verifier.EndTable();
+  }
+  SoftmaxOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SoftmaxOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SoftmaxOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SoftmaxOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_beta(float beta) {
+    fbb_.AddElement<float>(SoftmaxOptions::VT_BETA, beta, 0.0f);
+  }
+  explicit SoftmaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SoftmaxOptionsBuilder &operator=(const SoftmaxOptionsBuilder &);
+  flatbuffers::Offset<SoftmaxOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SoftmaxOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, float beta = 0.0f) {
+  SoftmaxOptionsBuilder builder_(_fbb);
+  builder_.add_beta(beta);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ConcatenationOptionsT : public flatbuffers::NativeTable {
+  typedef ConcatenationOptions TableType;
+  int32_t axis;
+  ActivationFunctionType fused_activation_function;
+  ConcatenationOptionsT()
+      : axis(0), fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct ConcatenationOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef ConcatenationOptionsT NativeTableType;
+  enum { VT_AXIS = 4, VT_FUSED_ACTIVATION_FUNCTION = 6 };
+  int32_t axis() const { return GetField<int32_t>(VT_AXIS, 0); }
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_AXIS) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  ConcatenationOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ConcatenationOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ConcatenationOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ConcatenationOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_axis(int32_t axis) {
+    fbb_.AddElement<int32_t>(ConcatenationOptions::VT_AXIS, axis, 0);
+  }
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(ConcatenationOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit ConcatenationOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ConcatenationOptionsBuilder &operator=(const ConcatenationOptionsBuilder &);
+  flatbuffers::Offset<ConcatenationOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ConcatenationOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t axis = 0,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  ConcatenationOptionsBuilder builder_(_fbb);
+  builder_.add_axis(axis);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct AddOptionsT : public flatbuffers::NativeTable {
+  typedef AddOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  AddOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct AddOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef AddOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  AddOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      AddOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AddOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct AddOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(AddOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit AddOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  AddOptionsBuilder &operator=(const AddOptionsBuilder &);
+  flatbuffers::Offset<AddOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<AddOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  AddOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct MulOptionsT : public flatbuffers::NativeTable {
+  typedef MulOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  MulOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct MulOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MulOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  MulOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      MulOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MulOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MulOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(MulOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit MulOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MulOptionsBuilder &operator=(const MulOptionsBuilder &);
+  flatbuffers::Offset<MulOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MulOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  MulOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct L2NormOptionsT : public flatbuffers::NativeTable {
+  typedef L2NormOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  L2NormOptionsT() : fused_activation_function(ActivationFunctionType_NONE) {}
+};
+
+struct L2NormOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef L2NormOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           verifier.EndTable();
+  }
+  L2NormOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      L2NormOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<L2NormOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct L2NormOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(L2NormOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  explicit L2NormOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  L2NormOptionsBuilder &operator=(const L2NormOptionsBuilder &);
+  flatbuffers::Offset<L2NormOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<L2NormOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE) {
+  L2NormOptionsBuilder builder_(_fbb);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct LocalResponseNormalizationOptionsT : public flatbuffers::NativeTable {
+  typedef LocalResponseNormalizationOptions TableType;
+  int32_t radius;
+  float bias;
+  float alpha;
+  float beta;
+  LocalResponseNormalizationOptionsT()
+      : radius(0), bias(0.0f), alpha(0.0f), beta(0.0f) {}
+};
+
+struct LocalResponseNormalizationOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef LocalResponseNormalizationOptionsT NativeTableType;
+  enum { VT_RADIUS = 4, VT_BIAS = 6, VT_ALPHA = 8, VT_BETA = 10 };
+  int32_t radius() const { return GetField<int32_t>(VT_RADIUS, 0); }
+  float bias() const { return GetField<float>(VT_BIAS, 0.0f); }
+  float alpha() const { return GetField<float>(VT_ALPHA, 0.0f); }
+  float beta() const { return GetField<float>(VT_BETA, 0.0f); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_RADIUS) &&
+           VerifyField<float>(verifier, VT_BIAS) &&
+           VerifyField<float>(verifier, VT_ALPHA) &&
+           VerifyField<float>(verifier, VT_BETA) && verifier.EndTable();
+  }
+  LocalResponseNormalizationOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LocalResponseNormalizationOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LocalResponseNormalizationOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const LocalResponseNormalizationOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LocalResponseNormalizationOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_radius(int32_t radius) {
+    fbb_.AddElement<int32_t>(LocalResponseNormalizationOptions::VT_RADIUS,
+                             radius, 0);
+  }
+  void add_bias(float bias) {
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BIAS, bias,
+                           0.0f);
+  }
+  void add_alpha(float alpha) {
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_ALPHA, alpha,
+                           0.0f);
+  }
+  void add_beta(float beta) {
+    fbb_.AddElement<float>(LocalResponseNormalizationOptions::VT_BETA, beta,
+                           0.0f);
+  }
+  explicit LocalResponseNormalizationOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LocalResponseNormalizationOptionsBuilder &operator=(
+      const LocalResponseNormalizationOptionsBuilder &);
+  flatbuffers::Offset<LocalResponseNormalizationOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LocalResponseNormalizationOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                                        int32_t radius = 0, float bias = 0.0f,
+                                        float alpha = 0.0f, float beta = 0.0f) {
+  LocalResponseNormalizationOptionsBuilder builder_(_fbb);
+  builder_.add_beta(beta);
+  builder_.add_alpha(alpha);
+  builder_.add_bias(bias);
+  builder_.add_radius(radius);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct LSTMOptionsT : public flatbuffers::NativeTable {
+  typedef LSTMOptions TableType;
+  ActivationFunctionType fused_activation_function;
+  float cell_clip;
+  float proj_clip;
+  LSTMOptionsT()
+      : fused_activation_function(ActivationFunctionType_NONE),
+        cell_clip(0.0f),
+        proj_clip(0.0f) {}
+};
+
+struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LSTMOptionsT NativeTableType;
+  enum { VT_FUSED_ACTIVATION_FUNCTION = 4, VT_CELL_CLIP = 6, VT_PROJ_CLIP = 8 };
+  ActivationFunctionType fused_activation_function() const {
+    return static_cast<ActivationFunctionType>(
+        GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
+  }
+  float cell_clip() const { return GetField<float>(VT_CELL_CLIP, 0.0f); }
+  float proj_clip() const { return GetField<float>(VT_PROJ_CLIP, 0.0f); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
+           VerifyField<float>(verifier, VT_CELL_CLIP) &&
+           VerifyField<float>(verifier, VT_PROJ_CLIP) && verifier.EndTable();
+  }
+  LSTMOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      LSTMOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LSTMOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LSTMOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_fused_activation_function(
+      ActivationFunctionType fused_activation_function) {
+    fbb_.AddElement<int8_t>(LSTMOptions::VT_FUSED_ACTIVATION_FUNCTION,
+                            static_cast<int8_t>(fused_activation_function), 0);
+  }
+  void add_cell_clip(float cell_clip) {
+    fbb_.AddElement<float>(LSTMOptions::VT_CELL_CLIP, cell_clip, 0.0f);
+  }
+  void add_proj_clip(float proj_clip) {
+    fbb_.AddElement<float>(LSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
+  }
+  explicit LSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LSTMOptionsBuilder &operator=(const LSTMOptionsBuilder &);
+  flatbuffers::Offset<LSTMOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LSTMOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    ActivationFunctionType fused_activation_function =
+        ActivationFunctionType_NONE,
+    float cell_clip = 0.0f, float proj_clip = 0.0f) {
+  LSTMOptionsBuilder builder_(_fbb);
+  builder_.add_proj_clip(proj_clip);
+  builder_.add_cell_clip(cell_clip);
+  builder_.add_fused_activation_function(fused_activation_function);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ResizeBilinearOptionsT : public flatbuffers::NativeTable {
+  typedef ResizeBilinearOptions TableType;
+  int32_t new_height;
+  int32_t new_width;
+  ResizeBilinearOptionsT() : new_height(0), new_width(0) {}
+};
+
+struct ResizeBilinearOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef ResizeBilinearOptionsT NativeTableType;
+  enum { VT_NEW_HEIGHT = 4, VT_NEW_WIDTH = 6 };
+  int32_t new_height() const { return GetField<int32_t>(VT_NEW_HEIGHT, 0); }
+  int32_t new_width() const { return GetField<int32_t>(VT_NEW_WIDTH, 0); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_NEW_HEIGHT) &&
+           VerifyField<int32_t>(verifier, VT_NEW_WIDTH) && verifier.EndTable();
+  }
+  ResizeBilinearOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ResizeBilinearOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ResizeBilinearOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ResizeBilinearOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_new_height(int32_t new_height) {
+    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_HEIGHT, new_height,
+                             0);
+  }
+  void add_new_width(int32_t new_width) {
+    fbb_.AddElement<int32_t>(ResizeBilinearOptions::VT_NEW_WIDTH, new_width, 0);
+  }
+  explicit ResizeBilinearOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ResizeBilinearOptionsBuilder &operator=(const ResizeBilinearOptionsBuilder &);
+  flatbuffers::Offset<ResizeBilinearOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ResizeBilinearOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t new_height = 0,
+    int32_t new_width = 0) {
+  ResizeBilinearOptionsBuilder builder_(_fbb);
+  builder_.add_new_width(new_width);
+  builder_.add_new_height(new_height);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct CallOptionsT : public flatbuffers::NativeTable {
+  typedef CallOptions TableType;
+  uint32_t subgraph;
+  CallOptionsT() : subgraph(0) {}
+};
+
+struct CallOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CallOptionsT NativeTableType;
+  enum { VT_SUBGRAPH = 4 };
+  uint32_t subgraph() const { return GetField<uint32_t>(VT_SUBGRAPH, 0); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint32_t>(verifier, VT_SUBGRAPH) && verifier.EndTable();
+  }
+  CallOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      CallOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CallOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CallOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_subgraph(uint32_t subgraph) {
+    fbb_.AddElement<uint32_t>(CallOptions::VT_SUBGRAPH, subgraph, 0);
+  }
+  explicit CallOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CallOptionsBuilder &operator=(const CallOptionsBuilder &);
+  flatbuffers::Offset<CallOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CallOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t subgraph = 0) {
+  CallOptionsBuilder builder_(_fbb);
+  builder_.add_subgraph(subgraph);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ReshapeOptionsT : public flatbuffers::NativeTable {
+  typedef ReshapeOptions TableType;
+  std::vector<int32_t> new_shape;
+  ReshapeOptionsT() {}
+};
+
+struct ReshapeOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ReshapeOptionsT NativeTableType;
+  enum { VT_NEW_SHAPE = 4 };
+  const flatbuffers::Vector<int32_t> *new_shape() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_NEW_SHAPE);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NEW_SHAPE) &&
+           verifier.Verify(new_shape()) && verifier.EndTable();
+  }
+  ReshapeOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      ReshapeOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReshapeOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ReshapeOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_new_shape(
+      flatbuffers::Offset<flatbuffers::Vector<int32_t>> new_shape) {
+    fbb_.AddOffset(ReshapeOptions::VT_NEW_SHAPE, new_shape);
+  }
+  explicit ReshapeOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ReshapeOptionsBuilder &operator=(const ReshapeOptionsBuilder &);
+  flatbuffers::Offset<ReshapeOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ReshapeOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> new_shape = 0) {
+  ReshapeOptionsBuilder builder_(_fbb);
+  builder_.add_new_shape(new_shape);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptionsDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<int32_t> *new_shape = nullptr) {
+  return tflite::CreateReshapeOptions(
+      _fbb, new_shape ? _fbb.CreateVector<int32_t>(*new_shape) : 0);
+}
+
+flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SkipGramOptionsT : public flatbuffers::NativeTable {
+  typedef SkipGramOptions TableType;
+  int32_t ngram_size;
+  int32_t max_skip_size;
+  bool include_all_ngrams;
+  SkipGramOptionsT()
+      : ngram_size(0), max_skip_size(0), include_all_ngrams(false) {}
+};
+
+struct SkipGramOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SkipGramOptionsT NativeTableType;
+  enum { VT_NGRAM_SIZE = 4, VT_MAX_SKIP_SIZE = 6, VT_INCLUDE_ALL_NGRAMS = 8 };
+  int32_t ngram_size() const { return GetField<int32_t>(VT_NGRAM_SIZE, 0); }
+  int32_t max_skip_size() const {
+    return GetField<int32_t>(VT_MAX_SKIP_SIZE, 0);
+  }
+  bool include_all_ngrams() const {
+    return GetField<uint8_t>(VT_INCLUDE_ALL_NGRAMS, 0) != 0;
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_NGRAM_SIZE) &&
+           VerifyField<int32_t>(verifier, VT_MAX_SKIP_SIZE) &&
+           VerifyField<uint8_t>(verifier, VT_INCLUDE_ALL_NGRAMS) &&
+           verifier.EndTable();
+  }
+  SkipGramOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SkipGramOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SkipGramOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SkipGramOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_ngram_size(int32_t ngram_size) {
+    fbb_.AddElement<int32_t>(SkipGramOptions::VT_NGRAM_SIZE, ngram_size, 0);
+  }
+  void add_max_skip_size(int32_t max_skip_size) {
+    fbb_.AddElement<int32_t>(SkipGramOptions::VT_MAX_SKIP_SIZE, max_skip_size,
+                             0);
+  }
+  void add_include_all_ngrams(bool include_all_ngrams) {
+    fbb_.AddElement<uint8_t>(SkipGramOptions::VT_INCLUDE_ALL_NGRAMS,
+                             static_cast<uint8_t>(include_all_ngrams), 0);
+  }
+  explicit SkipGramOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SkipGramOptionsBuilder &operator=(const SkipGramOptionsBuilder &);
+  flatbuffers::Offset<SkipGramOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SkipGramOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t ngram_size = 0,
+    int32_t max_skip_size = 0, bool include_all_ngrams = false) {
+  SkipGramOptionsBuilder builder_(_fbb);
+  builder_.add_max_skip_size(max_skip_size);
+  builder_.add_ngram_size(ngram_size);
+  builder_.add_include_all_ngrams(include_all_ngrams);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SpaceToDepthOptionsT : public flatbuffers::NativeTable {
+  typedef SpaceToDepthOptions TableType;
+  int32_t block_size;
+  SpaceToDepthOptionsT() : block_size(0) {}
+};
+
+struct SpaceToDepthOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef SpaceToDepthOptionsT NativeTableType;
+  enum { VT_BLOCK_SIZE = 4 };
+  int32_t block_size() const { return GetField<int32_t>(VT_BLOCK_SIZE, 0); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_BLOCK_SIZE) && verifier.EndTable();
+  }
+  SpaceToDepthOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SpaceToDepthOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SpaceToDepthOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SpaceToDepthOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_block_size(int32_t block_size) {
+    fbb_.AddElement<int32_t>(SpaceToDepthOptions::VT_BLOCK_SIZE, block_size, 0);
+  }
+  explicit SpaceToDepthOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SpaceToDepthOptionsBuilder &operator=(const SpaceToDepthOptionsBuilder &);
+  flatbuffers::Offset<SpaceToDepthOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SpaceToDepthOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t block_size = 0) {
+  SpaceToDepthOptionsBuilder builder_(_fbb);
+  builder_.add_block_size(block_size);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct EmbeddingLookupSparseOptionsT : public flatbuffers::NativeTable {
+  typedef EmbeddingLookupSparseOptions TableType;
+  CombinerType combiner;
+  EmbeddingLookupSparseOptionsT() : combiner(CombinerType_SUM) {}
+};
+
+struct EmbeddingLookupSparseOptions FLATBUFFERS_FINAL_CLASS
+    : private flatbuffers::Table {
+  typedef EmbeddingLookupSparseOptionsT NativeTableType;
+  enum { VT_COMBINER = 4 };
+  CombinerType combiner() const {
+    return static_cast<CombinerType>(GetField<int8_t>(VT_COMBINER, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_COMBINER) && verifier.EndTable();
+  }
+  EmbeddingLookupSparseOptionsT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      EmbeddingLookupSparseOptionsT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<EmbeddingLookupSparseOptions> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb,
+      const EmbeddingLookupSparseOptionsT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct EmbeddingLookupSparseOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_combiner(CombinerType combiner) {
+    fbb_.AddElement<int8_t>(EmbeddingLookupSparseOptions::VT_COMBINER,
+                            static_cast<int8_t>(combiner), 0);
+  }
+  explicit EmbeddingLookupSparseOptionsBuilder(
+      flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  EmbeddingLookupSparseOptionsBuilder &operator=(
+      const EmbeddingLookupSparseOptionsBuilder &);
+  flatbuffers::Offset<EmbeddingLookupSparseOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<EmbeddingLookupSparseOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(flatbuffers::FlatBufferBuilder &_fbb,
+                                   CombinerType combiner = CombinerType_SUM) {
+  EmbeddingLookupSparseOptionsBuilder builder_(_fbb);
+  builder_.add_combiner(combiner);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct OperatorCodeT : public flatbuffers::NativeTable {
+  typedef OperatorCode TableType;
+  BuiltinOperator builtin_code;
+  std::string custom_code;
+  OperatorCodeT() : builtin_code(BuiltinOperator_ADD) {}
+};
+
+struct OperatorCode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef OperatorCodeT NativeTableType;
+  enum { VT_BUILTIN_CODE = 4, VT_CUSTOM_CODE = 6 };
+  BuiltinOperator builtin_code() const {
+    return static_cast<BuiltinOperator>(GetField<int8_t>(VT_BUILTIN_CODE, 0));
+  }
+  const flatbuffers::String *custom_code() const {
+    return GetPointer<const flatbuffers::String *>(VT_CUSTOM_CODE);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_BUILTIN_CODE) &&
+           VerifyOffset(verifier, VT_CUSTOM_CODE) &&
+           verifier.Verify(custom_code()) && verifier.EndTable();
+  }
+  OperatorCodeT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      OperatorCodeT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<OperatorCode> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct OperatorCodeBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_builtin_code(BuiltinOperator builtin_code) {
+    fbb_.AddElement<int8_t>(OperatorCode::VT_BUILTIN_CODE,
+                            static_cast<int8_t>(builtin_code), 0);
+  }
+  void add_custom_code(flatbuffers::Offset<flatbuffers::String> custom_code) {
+    fbb_.AddOffset(OperatorCode::VT_CUSTOM_CODE, custom_code);
+  }
+  explicit OperatorCodeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  OperatorCodeBuilder &operator=(const OperatorCodeBuilder &);
+  flatbuffers::Offset<OperatorCode> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<OperatorCode>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    BuiltinOperator builtin_code = BuiltinOperator_ADD,
+    flatbuffers::Offset<flatbuffers::String> custom_code = 0) {
+  OperatorCodeBuilder builder_(_fbb);
+  builder_.add_custom_code(custom_code);
+  builder_.add_builtin_code(builtin_code);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<OperatorCode> CreateOperatorCodeDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    BuiltinOperator builtin_code = BuiltinOperator_ADD,
+    const char *custom_code = nullptr) {
+  return tflite::CreateOperatorCode(
+      _fbb, builtin_code, custom_code ? _fbb.CreateString(custom_code) : 0);
+}
+
+flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct OperatorT : public flatbuffers::NativeTable {
+  typedef Operator TableType;
+  uint32_t opcode_index;
+  std::vector<int32_t> inputs;
+  std::vector<int32_t> outputs;
+  BuiltinOptionsUnion builtin_options;
+  std::vector<uint8_t> custom_options;
+  CustomOptionsFormat custom_options_format;
+  OperatorT()
+      : opcode_index(0),
+        custom_options_format(CustomOptionsFormat_FLEXBUFFERS) {}
+};
+
+struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef OperatorT NativeTableType;
+  enum {
+    VT_OPCODE_INDEX = 4,
+    VT_INPUTS = 6,
+    VT_OUTPUTS = 8,
+    VT_BUILTIN_OPTIONS_TYPE = 10,
+    VT_BUILTIN_OPTIONS = 12,
+    VT_CUSTOM_OPTIONS = 14,
+    VT_CUSTOM_OPTIONS_FORMAT = 16
+  };
+  uint32_t opcode_index() const {
+    return GetField<uint32_t>(VT_OPCODE_INDEX, 0);
+  }
+  const flatbuffers::Vector<int32_t> *inputs() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUTS);
+  }
+  const flatbuffers::Vector<int32_t> *outputs() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTS);
+  }
+  BuiltinOptions builtin_options_type() const {
+    return static_cast<BuiltinOptions>(
+        GetField<uint8_t>(VT_BUILTIN_OPTIONS_TYPE, 0));
+  }
+  const void *builtin_options() const {
+    return GetPointer<const void *>(VT_BUILTIN_OPTIONS);
+  }
+  template <typename T>
+  const T *builtin_options_as() const;
+  const Conv2DOptions *builtin_options_as_Conv2DOptions() const {
+    return builtin_options_type() == BuiltinOptions_Conv2DOptions
+               ? static_cast<const Conv2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const DepthwiseConv2DOptions *builtin_options_as_DepthwiseConv2DOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_DepthwiseConv2DOptions
+               ? static_cast<const DepthwiseConv2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ConcatEmbeddingsOptions *builtin_options_as_ConcatEmbeddingsOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_ConcatEmbeddingsOptions
+               ? static_cast<const ConcatEmbeddingsOptions *>(builtin_options())
+               : nullptr;
+  }
+  const LSHProjectionOptions *builtin_options_as_LSHProjectionOptions() const {
+    return builtin_options_type() == BuiltinOptions_LSHProjectionOptions
+               ? static_cast<const LSHProjectionOptions *>(builtin_options())
+               : nullptr;
+  }
+  const Pool2DOptions *builtin_options_as_Pool2DOptions() const {
+    return builtin_options_type() == BuiltinOptions_Pool2DOptions
+               ? static_cast<const Pool2DOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SVDFOptions *builtin_options_as_SVDFOptions() const {
+    return builtin_options_type() == BuiltinOptions_SVDFOptions
+               ? static_cast<const SVDFOptions *>(builtin_options())
+               : nullptr;
+  }
+  const RNNOptions *builtin_options_as_RNNOptions() const {
+    return builtin_options_type() == BuiltinOptions_RNNOptions
+               ? static_cast<const RNNOptions *>(builtin_options())
+               : nullptr;
+  }
+  const FullyConnectedOptions *builtin_options_as_FullyConnectedOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_FullyConnectedOptions
+               ? static_cast<const FullyConnectedOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SoftmaxOptions *builtin_options_as_SoftmaxOptions() const {
+    return builtin_options_type() == BuiltinOptions_SoftmaxOptions
+               ? static_cast<const SoftmaxOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ConcatenationOptions *builtin_options_as_ConcatenationOptions() const {
+    return builtin_options_type() == BuiltinOptions_ConcatenationOptions
+               ? static_cast<const ConcatenationOptions *>(builtin_options())
+               : nullptr;
+  }
+  const AddOptions *builtin_options_as_AddOptions() const {
+    return builtin_options_type() == BuiltinOptions_AddOptions
+               ? static_cast<const AddOptions *>(builtin_options())
+               : nullptr;
+  }
+  const L2NormOptions *builtin_options_as_L2NormOptions() const {
+    return builtin_options_type() == BuiltinOptions_L2NormOptions
+               ? static_cast<const L2NormOptions *>(builtin_options())
+               : nullptr;
+  }
+  const LocalResponseNormalizationOptions *
+  builtin_options_as_LocalResponseNormalizationOptions() const {
+    return builtin_options_type() ==
+                   BuiltinOptions_LocalResponseNormalizationOptions
+               ? static_cast<const LocalResponseNormalizationOptions *>(
+                     builtin_options())
+               : nullptr;
+  }
+  const LSTMOptions *builtin_options_as_LSTMOptions() const {
+    return builtin_options_type() == BuiltinOptions_LSTMOptions
+               ? static_cast<const LSTMOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ResizeBilinearOptions *builtin_options_as_ResizeBilinearOptions()
+      const {
+    return builtin_options_type() == BuiltinOptions_ResizeBilinearOptions
+               ? static_cast<const ResizeBilinearOptions *>(builtin_options())
+               : nullptr;
+  }
+  const CallOptions *builtin_options_as_CallOptions() const {
+    return builtin_options_type() == BuiltinOptions_CallOptions
+               ? static_cast<const CallOptions *>(builtin_options())
+               : nullptr;
+  }
+  const ReshapeOptions *builtin_options_as_ReshapeOptions() const {
+    return builtin_options_type() == BuiltinOptions_ReshapeOptions
+               ? static_cast<const ReshapeOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SkipGramOptions *builtin_options_as_SkipGramOptions() const {
+    return builtin_options_type() == BuiltinOptions_SkipGramOptions
+               ? static_cast<const SkipGramOptions *>(builtin_options())
+               : nullptr;
+  }
+  const SpaceToDepthOptions *builtin_options_as_SpaceToDepthOptions() const {
+    return builtin_options_type() == BuiltinOptions_SpaceToDepthOptions
+               ? static_cast<const SpaceToDepthOptions *>(builtin_options())
+               : nullptr;
+  }
+  const EmbeddingLookupSparseOptions *
+  builtin_options_as_EmbeddingLookupSparseOptions() const {
+    return builtin_options_type() == BuiltinOptions_EmbeddingLookupSparseOptions
+               ? static_cast<const EmbeddingLookupSparseOptions *>(
+                     builtin_options())
+               : nullptr;
+  }
+  const MulOptions *builtin_options_as_MulOptions() const {
+    return builtin_options_type() == BuiltinOptions_MulOptions
+               ? static_cast<const MulOptions *>(builtin_options())
+               : nullptr;
+  }
+  const flatbuffers::Vector<uint8_t> *custom_options() const {
+    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
+  }
+  CustomOptionsFormat custom_options_format() const {
+    return static_cast<CustomOptionsFormat>(
+        GetField<int8_t>(VT_CUSTOM_OPTIONS_FORMAT, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint32_t>(verifier, VT_OPCODE_INDEX) &&
+           VerifyOffset(verifier, VT_INPUTS) && verifier.Verify(inputs()) &&
+           VerifyOffset(verifier, VT_OUTPUTS) && verifier.Verify(outputs()) &&
+           VerifyField<uint8_t>(verifier, VT_BUILTIN_OPTIONS_TYPE) &&
+           VerifyOffset(verifier, VT_BUILTIN_OPTIONS) &&
+           VerifyBuiltinOptions(verifier, builtin_options(),
+                                builtin_options_type()) &&
+           VerifyOffset(verifier, VT_CUSTOM_OPTIONS) &&
+           verifier.Verify(custom_options()) &&
+           VerifyField<int8_t>(verifier, VT_CUSTOM_OPTIONS_FORMAT) &&
+           verifier.EndTable();
+  }
+  OperatorT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      OperatorT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<Operator> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+template <>
+inline const Conv2DOptions *Operator::builtin_options_as<Conv2DOptions>()
+    const {
+  return builtin_options_as_Conv2DOptions();
+}
+
+template <>
+inline const DepthwiseConv2DOptions *
+Operator::builtin_options_as<DepthwiseConv2DOptions>() const {
+  return builtin_options_as_DepthwiseConv2DOptions();
+}
+
+template <>
+inline const ConcatEmbeddingsOptions *
+Operator::builtin_options_as<ConcatEmbeddingsOptions>() const {
+  return builtin_options_as_ConcatEmbeddingsOptions();
+}
+
+template <>
+inline const LSHProjectionOptions *
+Operator::builtin_options_as<LSHProjectionOptions>() const {
+  return builtin_options_as_LSHProjectionOptions();
+}
+
+template <>
+inline const Pool2DOptions *Operator::builtin_options_as<Pool2DOptions>()
+    const {
+  return builtin_options_as_Pool2DOptions();
+}
+
+template <>
+inline const SVDFOptions *Operator::builtin_options_as<SVDFOptions>() const {
+  return builtin_options_as_SVDFOptions();
+}
+
+template <>
+inline const RNNOptions *Operator::builtin_options_as<RNNOptions>() const {
+  return builtin_options_as_RNNOptions();
+}
+
+template <>
+inline const FullyConnectedOptions *
+Operator::builtin_options_as<FullyConnectedOptions>() const {
+  return builtin_options_as_FullyConnectedOptions();
+}
+
+template <>
+inline const SoftmaxOptions *Operator::builtin_options_as<SoftmaxOptions>()
+    const {
+  return builtin_options_as_SoftmaxOptions();
+}
+
+template <>
+inline const ConcatenationOptions *
+Operator::builtin_options_as<ConcatenationOptions>() const {
+  return builtin_options_as_ConcatenationOptions();
+}
+
+template <>
+inline const AddOptions *Operator::builtin_options_as<AddOptions>() const {
+  return builtin_options_as_AddOptions();
+}
+
+template <>
+inline const L2NormOptions *Operator::builtin_options_as<L2NormOptions>()
+    const {
+  return builtin_options_as_L2NormOptions();
+}
+
+template <>
+inline const LocalResponseNormalizationOptions *
+Operator::builtin_options_as<LocalResponseNormalizationOptions>() const {
+  return builtin_options_as_LocalResponseNormalizationOptions();
+}
+
+template <>
+inline const LSTMOptions *Operator::builtin_options_as<LSTMOptions>() const {
+  return builtin_options_as_LSTMOptions();
+}
+
+template <>
+inline const ResizeBilinearOptions *
+Operator::builtin_options_as<ResizeBilinearOptions>() const {
+  return builtin_options_as_ResizeBilinearOptions();
+}
+
+template <>
+inline const CallOptions *Operator::builtin_options_as<CallOptions>() const {
+  return builtin_options_as_CallOptions();
+}
+
+template <>
+inline const ReshapeOptions *Operator::builtin_options_as<ReshapeOptions>()
+    const {
+  return builtin_options_as_ReshapeOptions();
+}
+
+template <>
+inline const SkipGramOptions *Operator::builtin_options_as<SkipGramOptions>()
+    const {
+  return builtin_options_as_SkipGramOptions();
+}
+
+template <>
+inline const SpaceToDepthOptions *
+Operator::builtin_options_as<SpaceToDepthOptions>() const {
+  return builtin_options_as_SpaceToDepthOptions();
+}
+
+template <>
+inline const EmbeddingLookupSparseOptions *
+Operator::builtin_options_as<EmbeddingLookupSparseOptions>() const {
+  return builtin_options_as_EmbeddingLookupSparseOptions();
+}
+
+template <>
+inline const MulOptions *Operator::builtin_options_as<MulOptions>() const {
+  return builtin_options_as_MulOptions();
+}
+
+struct OperatorBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_opcode_index(uint32_t opcode_index) {
+    fbb_.AddElement<uint32_t>(Operator::VT_OPCODE_INDEX, opcode_index, 0);
+  }
+  void add_inputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs) {
+    fbb_.AddOffset(Operator::VT_INPUTS, inputs);
+  }
+  void add_outputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs) {
+    fbb_.AddOffset(Operator::VT_OUTPUTS, outputs);
+  }
+  void add_builtin_options_type(BuiltinOptions builtin_options_type) {
+    fbb_.AddElement<uint8_t>(Operator::VT_BUILTIN_OPTIONS_TYPE,
+                             static_cast<uint8_t>(builtin_options_type), 0);
+  }
+  void add_builtin_options(flatbuffers::Offset<void> builtin_options) {
+    fbb_.AddOffset(Operator::VT_BUILTIN_OPTIONS, builtin_options);
+  }
+  void add_custom_options(
+      flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options) {
+    fbb_.AddOffset(Operator::VT_CUSTOM_OPTIONS, custom_options);
+  }
+  void add_custom_options_format(CustomOptionsFormat custom_options_format) {
+    fbb_.AddElement<int8_t>(Operator::VT_CUSTOM_OPTIONS_FORMAT,
+                            static_cast<int8_t>(custom_options_format), 0);
+  }
+  explicit OperatorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  OperatorBuilder &operator=(const OperatorBuilder &);
+  flatbuffers::Offset<Operator> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Operator>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs = 0,
+    BuiltinOptions builtin_options_type = BuiltinOptions_NONE,
+    flatbuffers::Offset<void> builtin_options = 0,
+    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom_options = 0,
+    CustomOptionsFormat custom_options_format =
+        CustomOptionsFormat_FLEXBUFFERS) {
+  OperatorBuilder builder_(_fbb);
+  builder_.add_custom_options(custom_options);
+  builder_.add_builtin_options(builtin_options);
+  builder_.add_outputs(outputs);
+  builder_.add_inputs(inputs);
+  builder_.add_opcode_index(opcode_index);
+  builder_.add_custom_options_format(custom_options_format);
+  builder_.add_builtin_options_type(builtin_options_type);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Operator> CreateOperatorDirect(
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t opcode_index = 0,
+    const std::vector<int32_t> *inputs = nullptr,
+    const std::vector<int32_t> *outputs = nullptr,
+    BuiltinOptions builtin_options_type = BuiltinOptions_NONE,
+    flatbuffers::Offset<void> builtin_options = 0,
+    const std::vector<uint8_t> *custom_options = nullptr,
+    CustomOptionsFormat custom_options_format =
+        CustomOptionsFormat_FLEXBUFFERS) {
+  return tflite::CreateOperator(
+      _fbb, opcode_index, inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
+      outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0, builtin_options_type,
+      builtin_options,
+      custom_options ? _fbb.CreateVector<uint8_t>(*custom_options) : 0,
+      custom_options_format);
+}
+
+flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SubGraphT : public flatbuffers::NativeTable {
+  typedef SubGraph TableType;
+  std::vector<std::unique_ptr<TensorT>> tensors;
+  std::vector<int32_t> inputs;
+  std::vector<int32_t> outputs;
+  std::vector<std::unique_ptr<OperatorT>> operators;
+  std::string name;
+  SubGraphT() {}
+};
+
+struct SubGraph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SubGraphT NativeTableType;
+  enum {
+    VT_TENSORS = 4,
+    VT_INPUTS = 6,
+    VT_OUTPUTS = 8,
+    VT_OPERATORS = 10,
+    VT_NAME = 12
+  };
+  const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *tensors() const {
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Tensor>> *>(
+        VT_TENSORS);
+  }
+  const flatbuffers::Vector<int32_t> *inputs() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUTS);
+  }
+  const flatbuffers::Vector<int32_t> *outputs() const {
+    return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_OUTPUTS);
+  }
+  const flatbuffers::Vector<flatbuffers::Offset<Operator>> *operators() const {
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<Operator>> *>(
+        VT_OPERATORS);
+  }
+  const flatbuffers::String *name() const {
+    return GetPointer<const flatbuffers::String *>(VT_NAME);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_TENSORS) &&
+           verifier.Verify(tensors()) &&
+           verifier.VerifyVectorOfTables(tensors()) &&
+           VerifyOffset(verifier, VT_INPUTS) && verifier.Verify(inputs()) &&
+           VerifyOffset(verifier, VT_OUTPUTS) && verifier.Verify(outputs()) &&
+           VerifyOffset(verifier, VT_OPERATORS) &&
+           verifier.Verify(operators()) &&
+           verifier.VerifyVectorOfTables(operators()) &&
+           VerifyOffset(verifier, VT_NAME) && verifier.Verify(name()) &&
+           verifier.EndTable();
+  }
+  SubGraphT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(
+      SubGraphT *_o,
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SubGraph> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SubGraphBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_tensors(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>>
+          tensors) {
+    fbb_.AddOffset(SubGraph::VT_TENSORS, tensors);
+  }
+  void add_inputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs) {
+    fbb_.AddOffset(SubGraph::VT_INPUTS, inputs);
+  }
+  void add_outputs(flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs) {
+    fbb_.AddOffset(SubGraph::VT_OUTPUTS, outputs);
+  }
+  void add_operators(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>>
+          operators) {
+    fbb_.AddOffset(SubGraph::VT_OPERATORS, operators);
+  }
+  void add_name(flatbuffers::Offset<flatbuffers::String> name) {
+    fbb_.AddOffset(SubGraph::VT_NAME, name);
+  }
+  explicit SubGraphBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SubGraphBuilder &operator=(const SubGraphBuilder &);
+  flatbuffers::Offset<SubGraph> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SubGraph>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Tensor>>>
+        tensors = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> inputs = 0,
+    flatbuffers::Offset<flatbuffers::Vector<int32_t>> outputs = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Operator>>>
+        operators = 0,
+    flatbuffers::Offset<flatbuffers::String> name = 0) {
+  SubGraphBuilder builder_(_fbb);
+  builder_.add_name(name);
+  builder_.add_operators(operators);
+  builder_.add_outputs(outputs);
+  builder_.add_inputs(inputs);
+  builder_.add_tensors(tensors);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<SubGraph> CreateSubGraphDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<flatbuffers::Offset<Tensor>> *tensors = nullptr,
+    const std::vector<int32_t> *inputs = nullptr,
+    const std::vector<int32_t> *outputs = nullptr,
+    const std::vector<flatbuffers::Offset<Operator>> *operators = nullptr,
+    const char *name = nullptr) {
+  return tflite::CreateSubGraph(
+      _fbb,
+      tensors ? _fbb.CreateVector<flatbuffers::Offset<Tensor>>(*tensors) : 0,
+      inputs ? _fbb.CreateVector<int32_t>(*inputs) : 0,
+      outputs ? _fbb.CreateVector<int32_t>(*outputs) : 0,
+      operators ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(*operators)
+                : 0,
+      name ? _fbb.CreateString(name) : 0);
+}
+
+flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct BufferT : public flatbuffers::NativeTable {
+  typedef Buffer TableType;
+  std::vector<uint8_t> data;
+  BufferT() {}
+};
+
+struct Buffer FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef BufferT NativeTableType;
+  enum { VT_DATA = 4 };
+  const flatbuffers::Vector<uint8_t> *data() const {
+    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_DATA);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) &&
+           verifier.Verify(data()) && verifier.EndTable();
+  }
+  BufferT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(BufferT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                 nullptr) const;
+  static flatbuffers::Offset<Buffer> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct BufferBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_data(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> data) {
+    fbb_.AddOffset(Buffer::VT_DATA, data);
+  }
+  explicit BufferBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  BufferBuilder &operator=(const BufferBuilder &);
+  flatbuffers::Offset<Buffer> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Buffer>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> data = 0) {
+  BufferBuilder builder_(_fbb);
+  builder_.add_data(data);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Buffer> CreateBufferDirect(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const std::vector<uint8_t> *data = nullptr) {
+  return tflite::CreateBuffer(_fbb,
+                              data ? _fbb.CreateVector<uint8_t>(*data) : 0);
+}
+
+flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ModelT : public flatbuffers::NativeTable {
+  typedef Model TableType;
+  uint32_t version;
+  std::vector<std::unique_ptr<OperatorCodeT>> operator_codes;
+  std::vector<std::unique_ptr<SubGraphT>> subgraphs;
+  std::string description;
+  std::vector<std::unique_ptr<BufferT>> buffers;
+  ModelT() : version(0) {}
+};
+
+struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ModelT NativeTableType;
+  enum {
+    VT_VERSION = 4,
+    VT_OPERATOR_CODES = 6,
+    VT_SUBGRAPHS = 8,
+    VT_DESCRIPTION = 10,
+    VT_BUFFERS = 12
+  };
+  uint32_t version() const { return GetField<uint32_t>(VT_VERSION, 0); }
+  const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *operator_codes()
+      const {
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<OperatorCode>> *>(
+        VT_OPERATOR_CODES);
+  }
+  const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *subgraphs() const {
+    return GetPointer<
+        const flatbuffers::Vector<flatbuffers::Offset<SubGraph>> *>(
+        VT_SUBGRAPHS);
+  }
+  const flatbuffers::String *description() const {
+    return GetPointer<const flatbuffers::String *>(VT_DESCRIPTION);
+  }
+  const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *buffers() const {
+    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<Buffer>> *>(
+        VT_BUFFERS);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint32_t>(verifier, VT_VERSION) &&
+           VerifyOffset(verifier, VT_OPERATOR_CODES) &&
+           verifier.Verify(operator_codes()) &&
+           verifier.VerifyVectorOfTables(operator_codes()) &&
+           VerifyOffset(verifier, VT_SUBGRAPHS) &&
+           verifier.Verify(subgraphs()) &&
+           verifier.VerifyVectorOfTables(subgraphs()) &&
+           VerifyOffset(verifier, VT_DESCRIPTION) &&
+           verifier.Verify(description()) &&
+           VerifyOffset(verifier, VT_BUFFERS) && verifier.Verify(buffers()) &&
+           verifier.VerifyVectorOfTables(buffers()) && verifier.EndTable();
+  }
+  ModelT *UnPack(
+      const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ModelT *_o, const flatbuffers::resolver_function_t *_resolver =
+                                nullptr) const;
+  static flatbuffers::Offset<Model> Pack(
+      flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+      const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ModelBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_version(uint32_t version) {
+    fbb_.AddElement<uint32_t>(Model::VT_VERSION, version, 0);
+  }
+  void add_operator_codes(
+      flatbuffers::Offset<
+          flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>>
+          operator_codes) {
+    fbb_.AddOffset(Model::VT_OPERATOR_CODES, operator_codes);
+  }
+  void add_subgraphs(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>>
+          subgraphs) {
+    fbb_.AddOffset(Model::VT_SUBGRAPHS, subgraphs);
+  }
+  void add_description(flatbuffers::Offset<flatbuffers::String> description) {
+    fbb_.AddOffset(Model::VT_DESCRIPTION, description);
+  }
+  void add_buffers(
+      flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+          buffers) {
+    fbb_.AddOffset(Model::VT_BUFFERS, buffers);
+  }
+  explicit ModelBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ModelBuilder &operator=(const ModelBuilder &);
+  flatbuffers::Offset<Model> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<Model>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<OperatorCode>>>
+        operator_codes = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<SubGraph>>>
+        subgraphs = 0,
+    flatbuffers::Offset<flatbuffers::String> description = 0,
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<Buffer>>>
+        buffers = 0) {
+  ModelBuilder builder_(_fbb);
+  builder_.add_buffers(buffers);
+  builder_.add_description(description);
+  builder_.add_subgraphs(subgraphs);
+  builder_.add_operator_codes(operator_codes);
+  builder_.add_version(version);
+  return builder_.Finish();
+}
+
+inline flatbuffers::Offset<Model> CreateModelDirect(
+    flatbuffers::FlatBufferBuilder &_fbb, uint32_t version = 0,
+    const std::vector<flatbuffers::Offset<OperatorCode>> *operator_codes =
+        nullptr,
+    const std::vector<flatbuffers::Offset<SubGraph>> *subgraphs = nullptr,
+    const char *description = nullptr,
+    const std::vector<flatbuffers::Offset<Buffer>> *buffers = nullptr) {
+  return tflite::CreateModel(
+      _fbb, version,
+      operator_codes ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(
+                           *operator_codes)
+                     : 0,
+      subgraphs ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(*subgraphs)
+                : 0,
+      description ? _fbb.CreateString(description) : 0,
+      buffers ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(*buffers) : 0);
+}
+
+flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+inline QuantizationParametersT *QuantizationParameters::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new QuantizationParametersT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void QuantizationParameters::UnPackTo(
+    QuantizationParametersT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = min();
+    if (_e) {
+      _o->min.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->min[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = max();
+    if (_e) {
+      _o->max.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->max[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = scale();
+    if (_e) {
+      _o->scale.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->scale[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = zero_point();
+    if (_e) {
+      _o->zero_point.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->zero_point[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateQuantizationParameters(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
+    flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const QuantizationParametersT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _min = _o->min.size() ? _fbb.CreateVector(_o->min) : 0;
+  auto _max = _o->max.size() ? _fbb.CreateVector(_o->max) : 0;
+  auto _scale = _o->scale.size() ? _fbb.CreateVector(_o->scale) : 0;
+  auto _zero_point =
+      _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
+  return tflite::CreateQuantizationParameters(_fbb, _min, _max, _scale,
+                                              _zero_point);
+}
+
+inline TensorT *Tensor::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new TensorT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Tensor::UnPackTo(
+    TensorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = shape();
+    if (_e) {
+      _o->shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->shape[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = type();
+    _o->type = _e;
+  };
+  {
+    auto _e = buffer();
+    _o->buffer = _e;
+  };
+  {
+    auto _e = name();
+    if (_e) _o->name = _e->str();
+  };
+  {
+    auto _e = quantization();
+    if (_e)
+      _o->quantization =
+          std::unique_ptr<QuantizationParametersT>(_e->UnPack(_resolver));
+  };
+}
+
+inline flatbuffers::Offset<Tensor> Tensor::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTensor(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Tensor> CreateTensor(
+    flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const TensorT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _shape = _o->shape.size() ? _fbb.CreateVector(_o->shape) : 0;
+  auto _type = _o->type;
+  auto _buffer = _o->buffer;
+  auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
+  auto _quantization = _o->quantization
+                           ? CreateQuantizationParameters(
+                                 _fbb, _o->quantization.get(), _rehasher)
+                           : 0;
+  return tflite::CreateTensor(_fbb, _shape, _type, _buffer, _name,
+                              _quantization);
+}
+
+inline Conv2DOptionsT *Conv2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new Conv2DOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Conv2DOptions::UnPackTo(
+    Conv2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<Conv2DOptions> Conv2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConv2DOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Conv2DOptions> CreateConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Conv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const Conv2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateConv2DOptions(_fbb, _padding, _stride_w, _stride_h,
+                                     _fused_activation_function);
+}
+
+inline Pool2DOptionsT *Pool2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new Pool2DOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Pool2DOptions::UnPackTo(
+    Pool2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = filter_width();
+    _o->filter_width = _e;
+  };
+  {
+    auto _e = filter_height();
+    _o->filter_height = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<Pool2DOptions> Pool2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreatePool2DOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Pool2DOptions> CreatePool2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const Pool2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const Pool2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _filter_width = _o->filter_width;
+  auto _filter_height = _o->filter_height;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreatePool2DOptions(_fbb, _padding, _stride_w, _stride_h,
+                                     _filter_width, _filter_height,
+                                     _fused_activation_function);
+}
+
+inline DepthwiseConv2DOptionsT *DepthwiseConv2DOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new DepthwiseConv2DOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void DepthwiseConv2DOptions::UnPackTo(
+    DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = padding();
+    _o->padding = _e;
+  };
+  {
+    auto _e = stride_w();
+    _o->stride_w = _e;
+  };
+  {
+    auto _e = stride_h();
+    _o->stride_h = _e;
+  };
+  {
+    auto _e = depth_multiplier();
+    _o->depth_multiplier = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<DepthwiseConv2DOptions> DepthwiseConv2DOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateDepthwiseConv2DOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<DepthwiseConv2DOptions> CreateDepthwiseConv2DOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const DepthwiseConv2DOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const DepthwiseConv2DOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _padding = _o->padding;
+  auto _stride_w = _o->stride_w;
+  auto _stride_h = _o->stride_h;
+  auto _depth_multiplier = _o->depth_multiplier;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateDepthwiseConv2DOptions(_fbb, _padding, _stride_w,
+                                              _stride_h, _depth_multiplier,
+                                              _fused_activation_function);
+}
+
+inline ConcatEmbeddingsOptionsT *ConcatEmbeddingsOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ConcatEmbeddingsOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ConcatEmbeddingsOptions::UnPackTo(
+    ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = num_channels();
+    _o->num_channels = _e;
+  };
+  {
+    auto _e = num_columns_per_channel();
+    if (_e) {
+      _o->num_columns_per_channel.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->num_columns_per_channel[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = embedding_dim_per_channel();
+    if (_e) {
+      _o->embedding_dim_per_channel.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->embedding_dim_per_channel[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+ConcatEmbeddingsOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConcatEmbeddingsOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ConcatEmbeddingsOptions>
+CreateConcatEmbeddingsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatEmbeddingsOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ConcatEmbeddingsOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _num_channels = _o->num_channels;
+  auto _num_columns_per_channel =
+      _o->num_columns_per_channel.size()
+          ? _fbb.CreateVector(_o->num_columns_per_channel)
+          : 0;
+  auto _embedding_dim_per_channel =
+      _o->embedding_dim_per_channel.size()
+          ? _fbb.CreateVector(_o->embedding_dim_per_channel)
+          : 0;
+  return tflite::CreateConcatEmbeddingsOptions(_fbb, _num_channels,
+                                               _num_columns_per_channel,
+                                               _embedding_dim_per_channel);
+}
+
+inline LSHProjectionOptionsT *LSHProjectionOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LSHProjectionOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LSHProjectionOptions::UnPackTo(
+    LSHProjectionOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = type();
+    _o->type = _e;
+  };
+}
+
+inline flatbuffers::Offset<LSHProjectionOptions> LSHProjectionOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLSHProjectionOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LSHProjectionOptions> CreateLSHProjectionOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSHProjectionOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LSHProjectionOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _type = _o->type;
+  return tflite::CreateLSHProjectionOptions(_fbb, _type);
+}
+
+inline SVDFOptionsT *SVDFOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SVDFOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SVDFOptions::UnPackTo(
+    SVDFOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = rank();
+    _o->rank = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<SVDFOptions> SVDFOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSVDFOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SVDFOptions> CreateSVDFOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SVDFOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SVDFOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _rank = _o->rank;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateSVDFOptions(_fbb, _rank, _fused_activation_function);
+}
+
+inline RNNOptionsT *RNNOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new RNNOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void RNNOptions::UnPackTo(
+    RNNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<RNNOptions> RNNOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRNNOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<RNNOptions> CreateRNNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const RNNOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const RNNOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateRNNOptions(_fbb, _fused_activation_function);
+}
+
+inline FullyConnectedOptionsT *FullyConnectedOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new FullyConnectedOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void FullyConnectedOptions::UnPackTo(
+    FullyConnectedOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<FullyConnectedOptions> FullyConnectedOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateFullyConnectedOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<FullyConnectedOptions> CreateFullyConnectedOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const FullyConnectedOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const FullyConnectedOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateFullyConnectedOptions(_fbb, _fused_activation_function);
+}
+
+inline SoftmaxOptionsT *SoftmaxOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SoftmaxOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SoftmaxOptions::UnPackTo(
+    SoftmaxOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = beta();
+    _o->beta = _e;
+  };
+}
+
+inline flatbuffers::Offset<SoftmaxOptions> SoftmaxOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSoftmaxOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SoftmaxOptions> CreateSoftmaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SoftmaxOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SoftmaxOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _beta = _o->beta;
+  return tflite::CreateSoftmaxOptions(_fbb, _beta);
+}
+
+inline ConcatenationOptionsT *ConcatenationOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ConcatenationOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ConcatenationOptions::UnPackTo(
+    ConcatenationOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = axis();
+    _o->axis = _e;
+  };
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<ConcatenationOptions> ConcatenationOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateConcatenationOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ConcatenationOptions> CreateConcatenationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ConcatenationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ConcatenationOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _axis = _o->axis;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateConcatenationOptions(_fbb, _axis,
+                                            _fused_activation_function);
+}
+
+inline AddOptionsT *AddOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new AddOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void AddOptions::UnPackTo(
+    AddOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<AddOptions> AddOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateAddOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<AddOptions> CreateAddOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const AddOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const AddOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateAddOptions(_fbb, _fused_activation_function);
+}
+
+inline MulOptionsT *MulOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MulOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void MulOptions::UnPackTo(
+    MulOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<MulOptions> MulOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMulOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<MulOptions> CreateMulOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const MulOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const MulOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateMulOptions(_fbb, _fused_activation_function);
+}
+
+inline L2NormOptionsT *L2NormOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new L2NormOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void L2NormOptions::UnPackTo(
+    L2NormOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+}
+
+inline flatbuffers::Offset<L2NormOptions> L2NormOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateL2NormOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<L2NormOptions> CreateL2NormOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const L2NormOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const L2NormOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  return tflite::CreateL2NormOptions(_fbb, _fused_activation_function);
+}
+
+inline LocalResponseNormalizationOptionsT *
+LocalResponseNormalizationOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LocalResponseNormalizationOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LocalResponseNormalizationOptions::UnPackTo(
+    LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = radius();
+    _o->radius = _e;
+  };
+  {
+    auto _e = bias();
+    _o->bias = _e;
+  };
+  {
+    auto _e = alpha();
+    _o->alpha = _e;
+  };
+  {
+    auto _e = beta();
+    _o->beta = _e;
+  };
+}
+
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+LocalResponseNormalizationOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLocalResponseNormalizationOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LocalResponseNormalizationOptions>
+CreateLocalResponseNormalizationOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const LocalResponseNormalizationOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LocalResponseNormalizationOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _radius = _o->radius;
+  auto _bias = _o->bias;
+  auto _alpha = _o->alpha;
+  auto _beta = _o->beta;
+  return tflite::CreateLocalResponseNormalizationOptions(_fbb, _radius, _bias,
+                                                         _alpha, _beta);
+}
+
+inline LSTMOptionsT *LSTMOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LSTMOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LSTMOptions::UnPackTo(
+    LSTMOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = fused_activation_function();
+    _o->fused_activation_function = _e;
+  };
+  {
+    auto _e = cell_clip();
+    _o->cell_clip = _e;
+  };
+  {
+    auto _e = proj_clip();
+    _o->proj_clip = _e;
+  };
+}
+
+inline flatbuffers::Offset<LSTMOptions> LSTMOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLSTMOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const LSTMOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _fused_activation_function = _o->fused_activation_function;
+  auto _cell_clip = _o->cell_clip;
+  auto _proj_clip = _o->proj_clip;
+  return tflite::CreateLSTMOptions(_fbb, _fused_activation_function, _cell_clip,
+                                   _proj_clip);
+}
+
+inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ResizeBilinearOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ResizeBilinearOptions::UnPackTo(
+    ResizeBilinearOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = new_height();
+    _o->new_height = _e;
+  };
+  {
+    auto _e = new_width();
+    _o->new_width = _e;
+  };
+}
+
+inline flatbuffers::Offset<ResizeBilinearOptions> ResizeBilinearOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateResizeBilinearOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ResizeBilinearOptions> CreateResizeBilinearOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ResizeBilinearOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ResizeBilinearOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _new_height = _o->new_height;
+  auto _new_width = _o->new_width;
+  return tflite::CreateResizeBilinearOptions(_fbb, _new_height, _new_width);
+}
+
+inline CallOptionsT *CallOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CallOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CallOptions::UnPackTo(
+    CallOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = subgraph();
+    _o->subgraph = _e;
+  };
+}
+
+inline flatbuffers::Offset<CallOptions> CallOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCallOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CallOptions> CreateCallOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const CallOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const CallOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _subgraph = _o->subgraph;
+  return tflite::CreateCallOptions(_fbb, _subgraph);
+}
+
+inline ReshapeOptionsT *ReshapeOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ReshapeOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ReshapeOptions::UnPackTo(
+    ReshapeOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = new_shape();
+    if (_e) {
+      _o->new_shape.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->new_shape[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<ReshapeOptions> ReshapeOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReshapeOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ReshapeOptions> CreateReshapeOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const ReshapeOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ReshapeOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _new_shape = _o->new_shape.size() ? _fbb.CreateVector(_o->new_shape) : 0;
+  return tflite::CreateReshapeOptions(_fbb, _new_shape);
+}
+
+inline SkipGramOptionsT *SkipGramOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SkipGramOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SkipGramOptions::UnPackTo(
+    SkipGramOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = ngram_size();
+    _o->ngram_size = _e;
+  };
+  {
+    auto _e = max_skip_size();
+    _o->max_skip_size = _e;
+  };
+  {
+    auto _e = include_all_ngrams();
+    _o->include_all_ngrams = _e;
+  };
+}
+
+inline flatbuffers::Offset<SkipGramOptions> SkipGramOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSkipGramOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SkipGramOptions> CreateSkipGramOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SkipGramOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SkipGramOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _ngram_size = _o->ngram_size;
+  auto _max_skip_size = _o->max_skip_size;
+  auto _include_all_ngrams = _o->include_all_ngrams;
+  return tflite::CreateSkipGramOptions(_fbb, _ngram_size, _max_skip_size,
+                                       _include_all_ngrams);
+}
+
+inline SpaceToDepthOptionsT *SpaceToDepthOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SpaceToDepthOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SpaceToDepthOptions::UnPackTo(
+    SpaceToDepthOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = block_size();
+    _o->block_size = _e;
+  };
+}
+
+inline flatbuffers::Offset<SpaceToDepthOptions> SpaceToDepthOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSpaceToDepthOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SpaceToDepthOptions> CreateSpaceToDepthOptions(
+    flatbuffers::FlatBufferBuilder &_fbb, const SpaceToDepthOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SpaceToDepthOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _block_size = _o->block_size;
+  return tflite::CreateSpaceToDepthOptions(_fbb, _block_size);
+}
+
+inline EmbeddingLookupSparseOptionsT *EmbeddingLookupSparseOptions::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new EmbeddingLookupSparseOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void EmbeddingLookupSparseOptions::UnPackTo(
+    EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = combiner();
+    _o->combiner = _e;
+  };
+}
+
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+EmbeddingLookupSparseOptions::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateEmbeddingLookupSparseOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<EmbeddingLookupSparseOptions>
+CreateEmbeddingLookupSparseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const EmbeddingLookupSparseOptionsT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const EmbeddingLookupSparseOptionsT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _combiner = _o->combiner;
+  return tflite::CreateEmbeddingLookupSparseOptions(_fbb, _combiner);
+}
+
+inline OperatorCodeT *OperatorCode::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new OperatorCodeT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void OperatorCode::UnPackTo(
+    OperatorCodeT *_o,
+    const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = builtin_code();
+    _o->builtin_code = _e;
+  };
+  {
+    auto _e = custom_code();
+    if (_e) _o->custom_code = _e->str();
+  };
+}
+
+inline flatbuffers::Offset<OperatorCode> OperatorCode::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateOperatorCode(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<OperatorCode> CreateOperatorCode(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorCodeT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const OperatorCodeT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _builtin_code = _o->builtin_code;
+  auto _custom_code =
+      _o->custom_code.empty() ? 0 : _fbb.CreateString(_o->custom_code);
+  return tflite::CreateOperatorCode(_fbb, _builtin_code, _custom_code);
+}
+
+inline OperatorT *Operator::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new OperatorT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Operator::UnPackTo(
+    OperatorT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = opcode_index();
+    _o->opcode_index = _e;
+  };
+  {
+    auto _e = inputs();
+    if (_e) {
+      _o->inputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->inputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = outputs();
+    if (_e) {
+      _o->outputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->outputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = builtin_options_type();
+    _o->builtin_options.type = _e;
+  };
+  {
+    auto _e = builtin_options();
+    if (_e)
+      _o->builtin_options.value =
+          BuiltinOptionsUnion::UnPack(_e, builtin_options_type(), _resolver);
+  };
+  {
+    auto _e = custom_options();
+    if (_e) {
+      _o->custom_options.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->custom_options[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = custom_options_format();
+    _o->custom_options_format = _e;
+  };
+}
+
+inline flatbuffers::Offset<Operator> Operator::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateOperator(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Operator> CreateOperator(
+    flatbuffers::FlatBufferBuilder &_fbb, const OperatorT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const OperatorT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _opcode_index = _o->opcode_index;
+  auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0;
+  auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0;
+  auto _builtin_options_type = _o->builtin_options.type;
+  auto _builtin_options = _o->builtin_options.Pack(_fbb);
+  auto _custom_options =
+      _o->custom_options.size() ? _fbb.CreateVector(_o->custom_options) : 0;
+  auto _custom_options_format = _o->custom_options_format;
+  return tflite::CreateOperator(_fbb, _opcode_index, _inputs, _outputs,
+                                _builtin_options_type, _builtin_options,
+                                _custom_options, _custom_options_format);
+}
+
+inline SubGraphT *SubGraph::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SubGraphT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SubGraph::UnPackTo(
+    SubGraphT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = tensors();
+    if (_e) {
+      _o->tensors.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->tensors[_i] =
+            std::unique_ptr<TensorT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = inputs();
+    if (_e) {
+      _o->inputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->inputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = outputs();
+    if (_e) {
+      _o->outputs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->outputs[_i] = _e->Get(_i);
+      }
+    }
+  };
+  {
+    auto _e = operators();
+    if (_e) {
+      _o->operators.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->operators[_i] =
+            std::unique_ptr<OperatorT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = name();
+    if (_e) _o->name = _e->str();
+  };
+}
+
+inline flatbuffers::Offset<SubGraph> SubGraph::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSubGraph(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SubGraph> CreateSubGraph(
+    flatbuffers::FlatBufferBuilder &_fbb, const SubGraphT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const SubGraphT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _tensors =
+      _o->tensors.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<Tensor>>(
+                _o->tensors.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateTensor(*__va->__fbb, __va->__o->tensors[i].get(),
+                                      __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  auto _inputs = _o->inputs.size() ? _fbb.CreateVector(_o->inputs) : 0;
+  auto _outputs = _o->outputs.size() ? _fbb.CreateVector(_o->outputs) : 0;
+  auto _operators = _o->operators.size()
+                        ? _fbb.CreateVector<flatbuffers::Offset<Operator>>(
+                              _o->operators.size(),
+                              [](size_t i, _VectorArgs *__va) {
+                                return CreateOperator(
+                                    *__va->__fbb, __va->__o->operators[i].get(),
+                                    __va->__rehasher);
+                              },
+                              &_va)
+                        : 0;
+  auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name);
+  return tflite::CreateSubGraph(_fbb, _tensors, _inputs, _outputs, _operators,
+                                _name);
+}
+
+inline BufferT *Buffer::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new BufferT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Buffer::UnPackTo(
+    BufferT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = data();
+    if (_e) {
+      _o->data.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->data[_i] = _e->Get(_i);
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<Buffer> Buffer::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateBuffer(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Buffer> CreateBuffer(
+    flatbuffers::FlatBufferBuilder &_fbb, const BufferT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const BufferT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _data = _o->data.size() ? _fbb.CreateVector(_o->data) : 0;
+  return tflite::CreateBuffer(_fbb, _data);
+}
+
+inline ModelT *Model::UnPack(
+    const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ModelT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void Model::UnPackTo(
+    ModelT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  {
+    auto _e = version();
+    _o->version = _e;
+  };
+  {
+    auto _e = operator_codes();
+    if (_e) {
+      _o->operator_codes.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->operator_codes[_i] =
+            std::unique_ptr<OperatorCodeT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = subgraphs();
+    if (_e) {
+      _o->subgraphs.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->subgraphs[_i] =
+            std::unique_ptr<SubGraphT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+  {
+    auto _e = description();
+    if (_e) _o->description = _e->str();
+  };
+  {
+    auto _e = buffers();
+    if (_e) {
+      _o->buffers.resize(_e->size());
+      for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) {
+        _o->buffers[_i] =
+            std::unique_ptr<BufferT>(_e->Get(_i)->UnPack(_resolver));
+      }
+    }
+  };
+}
+
+inline flatbuffers::Offset<Model> Model::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateModel(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<Model> CreateModel(
+    flatbuffers::FlatBufferBuilder &_fbb, const ModelT *_o,
+    const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs {
+    flatbuffers::FlatBufferBuilder *__fbb;
+    const ModelT *__o;
+    const flatbuffers::rehasher_function_t *__rehasher;
+  } _va = {&_fbb, _o, _rehasher};
+  (void)_va;
+  auto _version = _o->version;
+  auto _operator_codes =
+      _o->operator_codes.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<OperatorCode>>(
+                _o->operator_codes.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateOperatorCode(*__va->__fbb,
+                                            __va->__o->operator_codes[i].get(),
+                                            __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  auto _subgraphs = _o->subgraphs.size()
+                        ? _fbb.CreateVector<flatbuffers::Offset<SubGraph>>(
+                              _o->subgraphs.size(),
+                              [](size_t i, _VectorArgs *__va) {
+                                return CreateSubGraph(
+                                    *__va->__fbb, __va->__o->subgraphs[i].get(),
+                                    __va->__rehasher);
+                              },
+                              &_va)
+                        : 0;
+  auto _description =
+      _o->description.empty() ? 0 : _fbb.CreateString(_o->description);
+  auto _buffers =
+      _o->buffers.size()
+          ? _fbb.CreateVector<flatbuffers::Offset<Buffer>>(
+                _o->buffers.size(),
+                [](size_t i, _VectorArgs *__va) {
+                  return CreateBuffer(*__va->__fbb, __va->__o->buffers[i].get(),
+                                      __va->__rehasher);
+                },
+                &_va)
+          : 0;
+  return tflite::CreateModel(_fbb, _version, _operator_codes, _subgraphs,
+                             _description, _buffers);
+}
+
+inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier,
+                                 const void *obj, BuiltinOptions type) {
+  switch (type) {
+    case BuiltinOptions_NONE: {
+      return true;
+    }
+    case BuiltinOptions_Conv2DOptions: {
+      auto ptr = reinterpret_cast<const Conv2DOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_DepthwiseConv2DOptions: {
+      auto ptr = reinterpret_cast<const DepthwiseConv2DOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ConcatEmbeddingsOptions: {
+      auto ptr = reinterpret_cast<const ConcatEmbeddingsOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LSHProjectionOptions: {
+      auto ptr = reinterpret_cast<const LSHProjectionOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_Pool2DOptions: {
+      auto ptr = reinterpret_cast<const Pool2DOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SVDFOptions: {
+      auto ptr = reinterpret_cast<const SVDFOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_RNNOptions: {
+      auto ptr = reinterpret_cast<const RNNOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_FullyConnectedOptions: {
+      auto ptr = reinterpret_cast<const FullyConnectedOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SoftmaxOptions: {
+      auto ptr = reinterpret_cast<const SoftmaxOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ConcatenationOptions: {
+      auto ptr = reinterpret_cast<const ConcatenationOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_AddOptions: {
+      auto ptr = reinterpret_cast<const AddOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_L2NormOptions: {
+      auto ptr = reinterpret_cast<const L2NormOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LocalResponseNormalizationOptions: {
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_LSTMOptions: {
+      auto ptr = reinterpret_cast<const LSTMOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ResizeBilinearOptions: {
+      auto ptr = reinterpret_cast<const ResizeBilinearOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_CallOptions: {
+      auto ptr = reinterpret_cast<const CallOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ReshapeOptions: {
+      auto ptr = reinterpret_cast<const ReshapeOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SkipGramOptions: {
+      auto ptr = reinterpret_cast<const SkipGramOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SpaceToDepthOptions: {
+      auto ptr = reinterpret_cast<const SpaceToDepthOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_EmbeddingLookupSparseOptions: {
+      auto ptr = reinterpret_cast<const EmbeddingLookupSparseOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_MulOptions: {
+      auto ptr = reinterpret_cast<const MulOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default:
+      return false;
+  }
+}
+
+inline bool VerifyBuiltinOptionsVector(
+    flatbuffers::Verifier &verifier,
+    const flatbuffers::Vector<flatbuffers::Offset<void>> *values,
+    const flatbuffers::Vector<uint8_t> *types) {
+  if (values->size() != types->size()) return false;
+  for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+    if (!VerifyBuiltinOptions(verifier, values->Get(i),
+                              types->GetEnum<BuiltinOptions>(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline void *BuiltinOptionsUnion::UnPack(
+    const void *obj, BuiltinOptions type,
+    const flatbuffers::resolver_function_t *resolver) {
+  switch (type) {
+    case BuiltinOptions_Conv2DOptions: {
+      auto ptr = reinterpret_cast<const Conv2DOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_DepthwiseConv2DOptions: {
+      auto ptr = reinterpret_cast<const DepthwiseConv2DOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ConcatEmbeddingsOptions: {
+      auto ptr = reinterpret_cast<const ConcatEmbeddingsOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LSHProjectionOptions: {
+      auto ptr = reinterpret_cast<const LSHProjectionOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_Pool2DOptions: {
+      auto ptr = reinterpret_cast<const Pool2DOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SVDFOptions: {
+      auto ptr = reinterpret_cast<const SVDFOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_RNNOptions: {
+      auto ptr = reinterpret_cast<const RNNOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_FullyConnectedOptions: {
+      auto ptr = reinterpret_cast<const FullyConnectedOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SoftmaxOptions: {
+      auto ptr = reinterpret_cast<const SoftmaxOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ConcatenationOptions: {
+      auto ptr = reinterpret_cast<const ConcatenationOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_AddOptions: {
+      auto ptr = reinterpret_cast<const AddOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_L2NormOptions: {
+      auto ptr = reinterpret_cast<const L2NormOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LocalResponseNormalizationOptions: {
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_LSTMOptions: {
+      auto ptr = reinterpret_cast<const LSTMOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ResizeBilinearOptions: {
+      auto ptr = reinterpret_cast<const ResizeBilinearOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_CallOptions: {
+      auto ptr = reinterpret_cast<const CallOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ReshapeOptions: {
+      auto ptr = reinterpret_cast<const ReshapeOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SkipGramOptions: {
+      auto ptr = reinterpret_cast<const SkipGramOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SpaceToDepthOptions: {
+      auto ptr = reinterpret_cast<const SpaceToDepthOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_EmbeddingLookupSparseOptions: {
+      auto ptr = reinterpret_cast<const EmbeddingLookupSparseOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_MulOptions: {
+      auto ptr = reinterpret_cast<const MulOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    default:
+      return nullptr;
+  }
+}
+
+inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    const flatbuffers::rehasher_function_t *_rehasher) const {
+  switch (type) {
+    case BuiltinOptions_Conv2DOptions: {
+      auto ptr = reinterpret_cast<const Conv2DOptionsT *>(value);
+      return CreateConv2DOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_DepthwiseConv2DOptions: {
+      auto ptr = reinterpret_cast<const DepthwiseConv2DOptionsT *>(value);
+      return CreateDepthwiseConv2DOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ConcatEmbeddingsOptions: {
+      auto ptr = reinterpret_cast<const ConcatEmbeddingsOptionsT *>(value);
+      return CreateConcatEmbeddingsOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_LSHProjectionOptions: {
+      auto ptr = reinterpret_cast<const LSHProjectionOptionsT *>(value);
+      return CreateLSHProjectionOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_Pool2DOptions: {
+      auto ptr = reinterpret_cast<const Pool2DOptionsT *>(value);
+      return CreatePool2DOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SVDFOptions: {
+      auto ptr = reinterpret_cast<const SVDFOptionsT *>(value);
+      return CreateSVDFOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_RNNOptions: {
+      auto ptr = reinterpret_cast<const RNNOptionsT *>(value);
+      return CreateRNNOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_FullyConnectedOptions: {
+      auto ptr = reinterpret_cast<const FullyConnectedOptionsT *>(value);
+      return CreateFullyConnectedOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SoftmaxOptions: {
+      auto ptr = reinterpret_cast<const SoftmaxOptionsT *>(value);
+      return CreateSoftmaxOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ConcatenationOptions: {
+      auto ptr = reinterpret_cast<const ConcatenationOptionsT *>(value);
+      return CreateConcatenationOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_AddOptions: {
+      auto ptr = reinterpret_cast<const AddOptionsT *>(value);
+      return CreateAddOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_L2NormOptions: {
+      auto ptr = reinterpret_cast<const L2NormOptionsT *>(value);
+      return CreateL2NormOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_LocalResponseNormalizationOptions: {
+      auto ptr =
+          reinterpret_cast<const LocalResponseNormalizationOptionsT *>(value);
+      return CreateLocalResponseNormalizationOptions(_fbb, ptr, _rehasher)
+          .Union();
+    }
+    case BuiltinOptions_LSTMOptions: {
+      auto ptr = reinterpret_cast<const LSTMOptionsT *>(value);
+      return CreateLSTMOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ResizeBilinearOptions: {
+      auto ptr = reinterpret_cast<const ResizeBilinearOptionsT *>(value);
+      return CreateResizeBilinearOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_CallOptions: {
+      auto ptr = reinterpret_cast<const CallOptionsT *>(value);
+      return CreateCallOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ReshapeOptions: {
+      auto ptr = reinterpret_cast<const ReshapeOptionsT *>(value);
+      return CreateReshapeOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SkipGramOptions: {
+      auto ptr = reinterpret_cast<const SkipGramOptionsT *>(value);
+      return CreateSkipGramOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SpaceToDepthOptions: {
+      auto ptr = reinterpret_cast<const SpaceToDepthOptionsT *>(value);
+      return CreateSpaceToDepthOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_EmbeddingLookupSparseOptions: {
+      auto ptr = reinterpret_cast<const EmbeddingLookupSparseOptionsT *>(value);
+      return CreateEmbeddingLookupSparseOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_MulOptions: {
+      auto ptr = reinterpret_cast<const MulOptionsT *>(value);
+      return CreateMulOptions(_fbb, ptr, _rehasher).Union();
+    }
+    default:
+      return 0;
+  }
+}
+
+inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u)
+    FLATBUFFERS_NOEXCEPT : type(u.type),
+                           value(nullptr) {
+  switch (type) {
+    case BuiltinOptions_Conv2DOptions: {
+      value = new Conv2DOptionsT(*reinterpret_cast<Conv2DOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_DepthwiseConv2DOptions: {
+      value = new DepthwiseConv2DOptionsT(
+          *reinterpret_cast<DepthwiseConv2DOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ConcatEmbeddingsOptions: {
+      value = new ConcatEmbeddingsOptionsT(
+          *reinterpret_cast<ConcatEmbeddingsOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LSHProjectionOptions: {
+      value = new LSHProjectionOptionsT(
+          *reinterpret_cast<LSHProjectionOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_Pool2DOptions: {
+      value = new Pool2DOptionsT(*reinterpret_cast<Pool2DOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SVDFOptions: {
+      value = new SVDFOptionsT(*reinterpret_cast<SVDFOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_RNNOptions: {
+      value = new RNNOptionsT(*reinterpret_cast<RNNOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_FullyConnectedOptions: {
+      value = new FullyConnectedOptionsT(
+          *reinterpret_cast<FullyConnectedOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SoftmaxOptions: {
+      value =
+          new SoftmaxOptionsT(*reinterpret_cast<SoftmaxOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ConcatenationOptions: {
+      value = new ConcatenationOptionsT(
+          *reinterpret_cast<ConcatenationOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_AddOptions: {
+      value = new AddOptionsT(*reinterpret_cast<AddOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_L2NormOptions: {
+      value = new L2NormOptionsT(*reinterpret_cast<L2NormOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LocalResponseNormalizationOptions: {
+      value = new LocalResponseNormalizationOptionsT(
+          *reinterpret_cast<LocalResponseNormalizationOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_LSTMOptions: {
+      value = new LSTMOptionsT(*reinterpret_cast<LSTMOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ResizeBilinearOptions: {
+      value = new ResizeBilinearOptionsT(
+          *reinterpret_cast<ResizeBilinearOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_CallOptions: {
+      value = new CallOptionsT(*reinterpret_cast<CallOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ReshapeOptions: {
+      value =
+          new ReshapeOptionsT(*reinterpret_cast<ReshapeOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SkipGramOptions: {
+      value =
+          new SkipGramOptionsT(*reinterpret_cast<SkipGramOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SpaceToDepthOptions: {
+      value = new SpaceToDepthOptionsT(
+          *reinterpret_cast<SpaceToDepthOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_EmbeddingLookupSparseOptions: {
+      value = new EmbeddingLookupSparseOptionsT(
+          *reinterpret_cast<EmbeddingLookupSparseOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_MulOptions: {
+      value = new MulOptionsT(*reinterpret_cast<MulOptionsT *>(u.value));
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+inline void BuiltinOptionsUnion::Reset() {
+  switch (type) {
+    case BuiltinOptions_Conv2DOptions: {
+      auto ptr = reinterpret_cast<Conv2DOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_DepthwiseConv2DOptions: {
+      auto ptr = reinterpret_cast<DepthwiseConv2DOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ConcatEmbeddingsOptions: {
+      auto ptr = reinterpret_cast<ConcatEmbeddingsOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LSHProjectionOptions: {
+      auto ptr = reinterpret_cast<LSHProjectionOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_Pool2DOptions: {
+      auto ptr = reinterpret_cast<Pool2DOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SVDFOptions: {
+      auto ptr = reinterpret_cast<SVDFOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_RNNOptions: {
+      auto ptr = reinterpret_cast<RNNOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_FullyConnectedOptions: {
+      auto ptr = reinterpret_cast<FullyConnectedOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SoftmaxOptions: {
+      auto ptr = reinterpret_cast<SoftmaxOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ConcatenationOptions: {
+      auto ptr = reinterpret_cast<ConcatenationOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_AddOptions: {
+      auto ptr = reinterpret_cast<AddOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_L2NormOptions: {
+      auto ptr = reinterpret_cast<L2NormOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LocalResponseNormalizationOptions: {
+      auto ptr = reinterpret_cast<LocalResponseNormalizationOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_LSTMOptions: {
+      auto ptr = reinterpret_cast<LSTMOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ResizeBilinearOptions: {
+      auto ptr = reinterpret_cast<ResizeBilinearOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_CallOptions: {
+      auto ptr = reinterpret_cast<CallOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ReshapeOptions: {
+      auto ptr = reinterpret_cast<ReshapeOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SkipGramOptions: {
+      auto ptr = reinterpret_cast<SkipGramOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SpaceToDepthOptions: {
+      auto ptr = reinterpret_cast<SpaceToDepthOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_EmbeddingLookupSparseOptions: {
+      auto ptr = reinterpret_cast<EmbeddingLookupSparseOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_MulOptions: {
+      auto ptr = reinterpret_cast<MulOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    default:
+      break;
+  }
+  value = nullptr;
+  type = BuiltinOptions_NONE;
+}
+
+inline const tflite::Model *GetModel(const void *buf) {
+  return flatbuffers::GetRoot<tflite::Model>(buf);
+}
+
+inline const char *ModelIdentifier() { return "TFL3"; }
+
+inline bool ModelBufferHasIdentifier(const void *buf) {
+  return flatbuffers::BufferHasIdentifier(buf, ModelIdentifier());
+}
+
+inline bool VerifyModelBuffer(flatbuffers::Verifier &verifier) {
+  return verifier.VerifyBuffer<tflite::Model>(ModelIdentifier());
+}
+
+inline const char *ModelExtension() { return "tflite"; }
+
+inline void FinishModelBuffer(flatbuffers::FlatBufferBuilder &fbb,
+                              flatbuffers::Offset<tflite::Model> root) {
+  fbb.Finish(root, ModelIdentifier());
+}
+
+inline std::unique_ptr<ModelT> UnPackModel(
+    const void *buf, const flatbuffers::resolver_function_t *res = nullptr) {
+  return std::unique_ptr<ModelT>(GetModel(buf)->UnPack(res));
+}
+
+}  // namespace tflite
+
+#endif  // FLATBUFFERS_GENERATED_SCHEMA_TFLITE_H_
diff --git a/tensorflow/contrib/lite/schema/schema_v0.fbs b/tensorflow/contrib/lite/schema/schema_v0.fbs
new file mode 100644
index 0000000000000000000000000000000000000000..852ea988f3ddc749ef20238e1171059268441030
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/schema_v0.fbs
@@ -0,0 +1,247 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+namespace tflite;
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+  FLOAT32 = 0,
+  FLOAT16 = 1,
+  INT32 = 2,
+  UINT8 = 3,
+  INT64 = 4,
+}
+
+// Parameters for converting a quantized tensor back to float. Given a
+// quantized value q, the corresponding float value f should be:
+//   f = scale * (q - zero_point)
+table QuantizationParameters {
+  min:[float];  // For importing back into tensorflow.
+  max:[float];  // For importing back into tensorflow.
+  scale:[float];
+  zero_point:[long];
+}
+
+table Tensor {
+  // The tensor shape. The meaning of each entry is operator-specific but
+  // builtin ops use: [batch size, number of channels, height, width] (That's
+  // Tensorflow's NCHW).
+  shape:[int];
+  type:TensorType;
+  // The data_buffer is an opaque container, with the assumption that the
+  // target device is little-endian. In addition, all builtin operators assume
+  // the memory is ordered such that if `shape` is [4, 3, 2], then index
+  // [i, j, k] maps to data_buffer[i*4*3 + j*3 + k].
+  data_buffer:[ubyte];
+  name:string;  // For debugging and importing back into tensorflow.
+  quantization:QuantizationParameters;  // Optional.
+}
+
+// A list of builtin operators. Builtin operators a slighlty faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+enum BuiltinOperator : byte {
+  CUSTOM = 0,
+  CONVOLUTION = 1,
+  DEPTHWISE_CONVOLUTION = 2,
+  CONCAT_EMBEDDINGS = 3,
+  LSH_PROJECTION = 4,
+  TANH = 5,
+  RELU = 6,
+  AVERAGE_POOL = 7,
+  MAX_POOL = 8,
+  L2_POOL = 9,
+  SIGMOID = 10,
+  SVDF = 11,
+  BasicRNN = 12,
+  RELU6 = 13,
+  EMBEDDING_LOOKUP = 14,
+  FULLY_CONNECTED = 15,
+  HASHTABLE_LOOKUP = 16,
+  SOFTMAX = 17,
+  CONCATENATION = 18,
+  LSTM = 19,
+  ADD = 20,
+  L2NORM = 21,
+  LOCAL_RESPONSE_NORM = 22,
+  RESIZE_BILINEAR = 23,
+}
+
+// Options for the builtin operators.
+union BuiltinOptions {
+  ConvolutionOptions,
+  DepthwiseConvolutionOptions,
+  ConcatEmbeddingsOptions,
+  LSHProjectionOptions,
+  PoolOptions,
+  SVDFOptions,
+  BasicRNNOptions,
+  FullyConnectedOptions,
+  SoftmaxOptions,
+  ConcatenationOptions,
+  AddOptions,
+  L2NormOptions,
+  LocalResponseNormOptions,
+  LSTMOptions,
+  ResizeBilinearOptions,
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+  NONE = 0,
+  RELU = 1,
+  RELU1 = 2,
+  RELU6 = 3,
+  TANH = 4,
+  SIGN_BIT = 5,
+}
+
+table ConvolutionOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table PoolOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  filter_width:int;
+  filter_height:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConvolutionOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  depth_multiplier:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table ConcatEmbeddingsOptions {
+  num_channels:int;
+  num_columns_per_channel:[int];
+  embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+  UNKNOWN = 0,
+  SPARSE = 1,
+  DENSE = 2,
+}
+
+table LSHProjectionOptions {
+  type: LSHProjectionType;
+}
+
+table SVDFOptions {
+  rank:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow BasicRNNCell.
+table BasicRNNOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+}
+
+table ResizeBilinearOptions {
+  new_height:int;
+  new_width:int;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  builtin_code:BuiltinOperator;
+  custom_code:string;
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operations is configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicate map lookups.
+  opcode_index:int;
+
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+}
+
+// The root type, defining a model.
+table Model {
+  // A list of all tensors used in this model.
+  tensors:[Tensor];
+
+  // Indices of the input tensors.
+  inputs:[int];
+
+  // Indices of the output tensors.
+  outputs:[int];
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All operators, in execution order.
+  operators:[Operator];
+}
+
+root_type Model;
diff --git a/tensorflow/contrib/lite/schema/schema_v1.fbs b/tensorflow/contrib/lite/schema/schema_v1.fbs
new file mode 100644
index 0000000000000000000000000000000000000000..06cd9408edb710104faffe854cb13807f0c63bcc
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/schema_v1.fbs
@@ -0,0 +1,295 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Revision History
+// Version 0: Initial version.
+// Version 1: Add subgraphs to schema.
+
+namespace tflite;
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+  FLOAT32 = 0,
+  FLOAT16 = 1,
+  INT32 = 2,
+  UINT8 = 3,
+  INT64 = 4,
+  STRING = 5,
+}
+
+// Parameters for converting a quantized tensor back to float. Given a
+// quantized value q, the corresponding float value f should be:
+//   f = scale * (q - zero_point)
+table QuantizationParameters {
+  min:[float];  // For importing back into tensorflow.
+  max:[float];  // For importing back into tensorflow.
+  scale:[float];
+  zero_point:[long];
+}
+
+table Tensor {
+  // The tensor shape. The meaning of each entry is operator-specific but
+  // builtin ops use: [batch size, number of channels, height, width] (That's
+  // Tensorflow's NCHW).
+  shape:[int];
+  type:TensorType;
+  // The data_buffer is an opaque container, with the assumption that the
+  // target device is little-endian. In addition, all builtin operators assume
+  // the memory is ordered such that if `shape` is [4, 3, 2], then index
+  // [i, j, k] maps to data_buffer[i*3*2 + j*3 + k].
+  data_buffer:[ubyte];
+  name:string;  // For debugging and importing back into tensorflow.
+  quantization:QuantizationParameters;  // Optional.
+}
+
+// A list of builtin operators. Builtin operators a slighlty faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+enum BuiltinOperator : byte {
+  CUSTOM = 0,
+  CONVOLUTION = 1,
+  DEPTHWISE_CONVOLUTION = 2,
+  CONCAT_EMBEDDINGS = 3,
+  LSH_PROJECTION = 4,
+  TANH = 5,
+  RELU = 6,
+  AVERAGE_POOL = 7,
+  MAX_POOL = 8,
+  L2_POOL = 9,
+  SIGMOID = 10,
+  SVDF = 11,
+  BasicRNN = 12,
+  RELU6 = 13,
+  EMBEDDING_LOOKUP = 14,
+  FULLY_CONNECTED = 15,
+  HASHTABLE_LOOKUP = 16,
+  SOFTMAX = 17,
+  CONCATENATION = 18,
+  LSTM = 19,
+  ADD = 20,
+  L2NORM = 21,
+  LOCAL_RESPONSE_NORM = 22,
+  RESIZE_BILINEAR = 23,
+  CALL = 24,
+  RESHAPE = 25,
+  SKIP_GRAM = 26,
+  SPACE_TO_DEPTH = 27,
+}
+
+// Options for the builtin operators.
+union BuiltinOptions {
+  ConvolutionOptions,
+  DepthwiseConvolutionOptions,
+  ConcatEmbeddingsOptions,
+  LSHProjectionOptions,
+  PoolOptions,
+  SVDFOptions,
+  BasicRNNOptions,
+  FullyConnectedOptions,
+  SoftmaxOptions,
+  ConcatenationOptions,
+  AddOptions,
+  L2NormOptions,
+  LocalResponseNormOptions,
+  LSTMOptions,
+  ResizeBilinearOptions,
+  CallOptions,
+  ReshapeOptions,
+  SkipGramOptions,
+  SpaceToDepthOptions,
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+  NONE = 0,
+  RELU = 1,
+  RELU1 = 2,
+  RELU6 = 3,
+  TANH = 4,
+  SIGN_BIT = 5,
+}
+
+table ConvolutionOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table PoolOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  filter_width:int;
+  filter_height:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConvolutionOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  depth_multiplier:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table ConcatEmbeddingsOptions {
+  num_channels:int;
+  num_columns_per_channel:[int];
+  embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+  UNKNOWN = 0,
+  SPARSE = 1,
+  DENSE = 2,
+}
+
+table LSHProjectionOptions {
+  type: LSHProjectionType;
+}
+
+table SVDFOptions {
+  rank:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow BasicRNNCell.
+table BasicRNNOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+}
+
+table ResizeBilinearOptions {
+  new_height:int;
+  new_width:int;
+}
+
+// A call operation options
+table CallOptions {
+  // The subgraph index that needs to be called.
+  subgraph:int;
+}
+
+table ReshapeOptions {
+  new_shape:[int];
+}
+
+table SkipGramOptions {
+  ngram_size: int;
+  max_skip_size: int;
+  include_all_ngrams: bool;
+}
+
+table SpaceToDepthOptions {
+  block_size: int;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  builtin_code:BuiltinOperator;
+  custom_code:string;
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operations is configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicate map lookups.
+  opcode_index:int;
+
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+}
+
+// The root type, defining a model.
+table SubGraph {
+  // A list of all tensors used in this model.
+  tensors:[Tensor];
+
+  // Indices of the input tensors.
+  inputs:[int];
+
+  // Indices of the output tensors.
+  outputs:[int];
+
+  // All operators, in execution order.
+  operators:[Operator];
+
+  // Name of subgraph (used for debugging).
+  name:string;
+}
+
+table Model {
+  // Version of the schema.
+  version:int;
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All the subgraphs of the model. The 0th is assumed to be the main
+  // model.
+  subgraphs:[SubGraph];
+
+  // A description of the model.
+  description:string;
+}
+
+root_type Model;
diff --git a/tensorflow/contrib/lite/schema/schema_v2.fbs b/tensorflow/contrib/lite/schema/schema_v2.fbs
new file mode 100644
index 0000000000000000000000000000000000000000..96731c8aaebf69358c71c52738f045735e385aa0
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/schema_v2.fbs
@@ -0,0 +1,303 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Revision History
+// Version 0: Initial version.
+// Version 1: Add subgraphs to schema.
+// Version 2: Rename operators to conform to NN API.
+
+namespace tflite;
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+  FLOAT32 = 0,
+  FLOAT16 = 1,
+  INT32 = 2,
+  UINT8 = 3,
+  INT64 = 4,
+  STRING = 5,
+}
+
+// Parameters for converting a quantized tensor back to float. Given a
+// quantized value q, the corresponding float value f should be:
+//   f = scale * (q - zero_point)
+table QuantizationParameters {
+  min:[float];  // For importing back into tensorflow.
+  max:[float];  // For importing back into tensorflow.
+  scale:[float];
+  zero_point:[long];
+}
+
+table Tensor {
+  // The tensor shape. The meaning of each entry is operator-specific but
+  // builtin ops use: [batch size, number of channels, height, width] (That's
+  // Tensorflow's NCHW).
+  shape:[int];
+  type:TensorType;
+  // The data_buffer is an opaque container, with the assumption that the
+  // target device is little-endian. In addition, all builtin operators assume
+  // the memory is ordered such that if `shape` is [4, 3, 2], then index
+  // [i, j, k] maps to data_buffer[i*3*2 + j*3 + k].
+  data_buffer:[ubyte];
+  name:string;  // For debugging and importing back into tensorflow.
+  quantization:QuantizationParameters;  // Optional.
+}
+
+// A list of builtin operators. Builtin operators a slighlty faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+enum BuiltinOperator : byte {
+  ADD = 0,
+  AVERAGE_POOL_2D = 1,
+  CONCATENATION = 2,
+  CONV_2D = 3,
+  DEPTHWISE_CONV_2D = 4,
+  // DEPTH_TO_SPACE = 5,
+  // DEQUANTIZE = 6,
+  EMBEDDING_LOOKUP = 7,
+  // FLOOR = 8,
+  FULLY_CONNECTED = 9,
+  HASHTABLE_LOOKUP = 10,
+  L2_NORMALIZATION = 11,
+  L2_POOL_2D = 12,
+  LOCAL_RESPONSE_NORMALIZATION = 13,
+  LOGISTIC = 14,
+  LSH_PROJECTION = 15,
+  LSTM = 16,
+  MAX_POOL_2D = 17,
+  // MUL = 18,
+  RELU = 19,
+  // RELU1=20,
+  RELU6 = 21,
+  RESHAPE = 22,
+  RESIZE_BILINEAR = 23,
+  RNN = 24,
+  SOFTMAX = 25,
+  SPACE_TO_DEPTH = 26,
+  SVDF = 27,
+  TANH = 28,
+  // TODO(aselle): Consider rename to CONCATENATE_EMBEDDINGS
+  CONCAT_EMBEDDINGS = 29,
+  SKIP_GRAM = 30,
+  CALL = 31,
+  CUSTOM = 32,
+
+}
+
+// Options for the builtin operators.
+union BuiltinOptions {
+  Conv2DOptions,
+  DepthwiseConv2DOptions,
+  ConcatEmbeddingsOptions,
+  LSHProjectionOptions,
+  Pool2DOptions,
+  SVDFOptions,
+  RNNOptions,
+  FullyConnectedOptions,
+  SoftmaxOptions,
+  ConcatenationOptions,
+  AddOptions,
+  L2NormOptions,
+  LocalResponseNormalizationOptions,
+  LSTMOptions,
+  ResizeBilinearOptions,
+  CallOptions,
+  ReshapeOptions,
+  SkipGramOptions,
+  SpaceToDepthOptions,
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+  NONE = 0,
+  RELU = 1,
+  RELU1 = 2,
+  RELU6 = 3,
+  TANH = 4,
+  SIGN_BIT = 5,
+}
+
+table Conv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table Pool2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  filter_width:int;
+  filter_height:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  depth_multiplier:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table ConcatEmbeddingsOptions {
+  num_channels:int;
+  num_columns_per_channel:[int];
+  embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+  UNKNOWN = 0,
+  SPARSE = 1,
+  DENSE = 2,
+}
+
+table LSHProjectionOptions {
+  type: LSHProjectionType;
+}
+
+table SVDFOptions {
+  rank:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow RNNCell.
+table RNNOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormalizationOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+}
+
+table ResizeBilinearOptions {
+  new_height:int;
+  new_width:int;
+}
+
+// A call operation options
+table CallOptions {
+  // The subgraph index that needs to be called.
+  subgraph:int;
+}
+
+table ReshapeOptions {
+  new_shape:[int];
+}
+
+table SkipGramOptions {
+  ngram_size: int;
+  max_skip_size: int;
+  include_all_ngrams: bool;
+}
+
+table SpaceToDepthOptions {
+  block_size: int;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  builtin_code:BuiltinOperator;
+  custom_code:string;
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operations is configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicate map lookups.
+  opcode_index:int;
+
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+}
+
+// The root type, defining a model.
+table SubGraph {
+  // A list of all tensors used in this model.
+  tensors:[Tensor];
+
+  // Indices of the input tensors.
+  inputs:[int];
+
+  // Indices of the output tensors.
+  outputs:[int];
+
+  // All operators, in execution order.
+  operators:[Operator];
+
+  // Name of subgraph (used for debugging).
+  name:string;
+}
+
+table Model {
+  // Version of the schema.
+  version:int;
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All the subgraphs of the model. The 0th is assumed to be the main
+  // model.
+  subgraphs:[SubGraph];
+
+  // A description of the model.
+  description:string;
+}
+
+root_type Model;
diff --git a/tensorflow/contrib/lite/schema/schema_v3.fbs b/tensorflow/contrib/lite/schema/schema_v3.fbs
new file mode 100644
index 0000000000000000000000000000000000000000..cedefe08f35cbb5dd8aa5063de35a13c1b1ca298
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/schema_v3.fbs
@@ -0,0 +1,326 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Revision History
+// Version 0: Initial version.
+// Version 1: Add subgraphs to schema.
+// Version 2: Rename operators to conform to NN API.
+// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers.
+
+namespace tflite;
+
+// This corresponds to the version (4).
+file_identifier "TFL3";
+// File extension of any written files.
+file_extension "tflite";
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+  FLOAT32 = 0,
+  FLOAT16 = 1,
+  INT32 = 2,
+  UINT8 = 3,
+  INT64 = 4,
+  STRING = 5,
+}
+
+// Parameters for converting a quantized tensor back to float. Given a
+// quantized value q, the corresponding float value f should be:
+//   f = scale * (q - zero_point)
+table QuantizationParameters {
+  min:[float];  // For importing back into tensorflow.
+  max:[float];  // For importing back into tensorflow.
+  scale:[float];
+  zero_point:[long];
+}
+
+table Tensor {
+  // The tensor shape. The meaning of each entry is operator-specific but
+  // builtin ops use: [batch size, number of channels, height, width] (That's
+  // Tensorflow's NCHW).
+  shape:[int];
+  type:TensorType;
+  // An index that refers to the buffers table at the root of the model. Or,
+  // if there is no data buffer associated (i.e. intermediate results), then
+  // this is 0 (which refers to an always existant empty buffer).
+  //
+  // The data_buffer itself is an opaque container, with the assumption that the
+  // target device is little-endian. In addition, all builtin operators assume
+  // the memory is ordered such that if `shape` is [4, 3, 2], then index
+  // [i, j, k] maps to data_buffer[i*3*2 + j*3 + k].
+  buffer:uint;
+  name:string;  // For debugging and importing back into tensorflow.
+  quantization:QuantizationParameters;  // Optional.
+}
+
+// A list of builtin operators. Builtin operators a slighlty faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+enum BuiltinOperator : byte {
+  ADD = 0,
+  AVERAGE_POOL_2D = 1,
+  CONCATENATION = 2,
+  CONV_2D = 3,
+  DEPTHWISE_CONV_2D = 4,
+  // DEPTH_TO_SPACE = 5,
+  // DEQUANTIZE = 6,
+  EMBEDDING_LOOKUP = 7,
+  // FLOOR = 8,
+  FULLY_CONNECTED = 9,
+  HASHTABLE_LOOKUP = 10,
+  L2_NORMALIZATION = 11,
+  L2_POOL_2D = 12,
+  LOCAL_RESPONSE_NORMALIZATION = 13,
+  LOGISTIC = 14,
+  LSH_PROJECTION = 15,
+  LSTM = 16,
+  MAX_POOL_2D = 17,
+  // MUL = 18,
+  RELU = 19,
+  // RELU1=20,
+  RELU6 = 21,
+  RESHAPE = 22,
+  RESIZE_BILINEAR = 23,
+  RNN = 24,
+  SOFTMAX = 25,
+  SPACE_TO_DEPTH = 26,
+  SVDF = 27,
+  TANH = 28,
+  // TODO(aselle): Consider rename to CONCATENATE_EMBEDDINGS
+  CONCAT_EMBEDDINGS = 29,
+  SKIP_GRAM = 30,
+  CALL = 31,
+  CUSTOM = 32,
+
+}
+
+// Options for the builtin operators.
+union BuiltinOptions {
+  Conv2DOptions,
+  DepthwiseConv2DOptions,
+  ConcatEmbeddingsOptions,
+  LSHProjectionOptions,
+  Pool2DOptions,
+  SVDFOptions,
+  RNNOptions,
+  FullyConnectedOptions,
+  SoftmaxOptions,
+  ConcatenationOptions,
+  AddOptions,
+  L2NormOptions,
+  LocalResponseNormalizationOptions,
+  LSTMOptions,
+  ResizeBilinearOptions,
+  CallOptions,
+  ReshapeOptions,
+  SkipGramOptions,
+  SpaceToDepthOptions,
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+  NONE = 0,
+  RELU = 1,
+  RELU1 = 2,
+  RELU6 = 3,
+  TANH = 4,
+  SIGN_BIT = 5,
+}
+
+table Conv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table Pool2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  filter_width:int;
+  filter_height:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  depth_multiplier:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table ConcatEmbeddingsOptions {
+  num_channels:int;
+  num_columns_per_channel:[int];
+  embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+  UNKNOWN = 0,
+  SPARSE = 1,
+  DENSE = 2,
+}
+
+table LSHProjectionOptions {
+  type: LSHProjectionType;
+}
+
+table SVDFOptions {
+  rank:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow RNNCell.
+table RNNOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table SoftmaxOptions {
+  beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+  axis:int;
+  fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+  fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormalizationOptions {
+  radius:int;
+  bias:float;
+  alpha:float;
+  beta:float;
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+  fused_activation_function:ActivationFunctionType;
+  cell_clip: float; // Optional, 0.0 means no clipping
+  proj_clip: float; // Optional, 0.0 means no clipping
+}
+
+table ResizeBilinearOptions {
+  new_height:int;
+  new_width:int;
+}
+
+// A call operation options
+table CallOptions {
+  // The subgraph index that needs to be called.
+  subgraph:uint;
+}
+
+table ReshapeOptions {
+  new_shape:[int];
+}
+
+table SkipGramOptions {
+  ngram_size: int;
+  max_skip_size: int;
+  include_all_ngrams: bool;
+}
+
+table SpaceToDepthOptions {
+  block_size: int;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+  builtin_code:BuiltinOperator;
+  custom_code:string;
+}
+
+// An operator takes tensors as inputs and outputs. The type of operation being
+// performed is determined by an index into the list of valid OperatorCodes,
+// while the specifics of each operations is configured using builtin_options
+// or custom_options.
+table Operator {
+  // Index into the operator_codes array. Using an integer here avoids
+  // complicate map lookups.
+  opcode_index:uint;
+
+  inputs:[int];
+  outputs:[int];
+
+  builtin_options:BuiltinOptions;
+  custom_options:[ubyte];
+}
+
+// The root type, defining a model.
+table SubGraph {
+  // A list of all tensors used in this model.
+  tensors:[Tensor];
+
+  // Indices of the input tensors.
+  inputs:[int];
+
+  // Indices of the output tensors.
+  outputs:[int];
+
+  // All operators, in execution order.
+  operators:[Operator];
+
+  // Name of subgraph (used for debugging).
+  name:string;
+}
+
+// Table of raw data buffers (used for constant tensors). Referenced by tensors
+// by index.
+table Buffer {
+  data:[ubyte];
+}
+
+table Model {
+  // Version of the schema.
+  version:uint;
+
+  // A list of all operator codes used in this model. This is
+  // kept in order because operators carry an index into this
+  // vector.
+  operator_codes:[OperatorCode];
+
+  // All the subgraphs of the model. The 0th is assumed to be the main
+  // model.
+  subgraphs:[SubGraph];
+
+  // A description of the model.
+  description:string;
+
+  // Buffers of the model.
+  // NOTE: It is required that the first entry in here is always an empty
+  // buffer. This is so that the default buffer index of zero in Tensor
+  // will always refer to a valid empty buffer.
+  buffers:[Buffer];
+
+}
+
+root_type Model;
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema.py b/tensorflow/contrib/lite/schema/upgrade_schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..94f5730be5d991ae13fb019e4d035e23f76fe441
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/upgrade_schema.py
@@ -0,0 +1,348 @@
+# ==============================================================================
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Upgrade script to move from pre-release schema to new schema.
+
+Usage examples:
+
+bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.json out.json
+bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.bin out.bin
+bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.bin out.json
+bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.json out.bin
+bazel run tensorflow/contrib/lite/schema/upgrade_schema -- in.tflite out.tflite
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import contextlib
+import json
+import os
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import tensorflow as tf
+from tensorflow.python.platform import resource_loader
+
+parser = argparse.ArgumentParser(
+    description="Script to move TFLite models from pre-release schema to"
+    " new schema.")
+parser.add_argument(
+    "input",
+    type=str,
+    help="Input TensorFlow lite file in `.json`, `.bin` or `.tflite` format.")
+parser.add_argument(
+    "output",
+    type=str,
+    help="Output json or bin TensorFlow lite model compliant with"
+    "the new schema. Extension must be `.json`, `.bin` or `.tflite`.")
+
+
+# RAII Temporary Directory, because flatc doesn't allow direct use of tempfiles.
+@contextlib.contextmanager
+def TemporaryDirectoryResource():
+  temporary = tempfile.mkdtemp()
+  try:
+    yield temporary
+  finally:
+    shutil.rmtree(temporary)
+
+
+class Converter(object):
+  """Converts TensorFlow flatbuffer models from old to new version of schema.
+
+  This can convert between any version to the latest version. It uses
+  an incremental upgrade strategy to go from version to version.
+
+  Usage:
+    converter = Converter()
+    converter.Convert("a.tflite", "a.json")
+    converter.Convert("b.json", "b.tflite")
+  """
+
+  def __init__(self):
+    # TODO(aselle): make this work in the open source version with better
+    # path.
+    paths_to_try = [
+        "../../../../flatbuffers/flatc",  # not bazel
+        "../../../../external/flatbuffers/flatc"  # bazel
+    ]
+    for p in paths_to_try:
+      self._flatc_path = resource_loader.get_path_to_datafile(p)
+      if os.path.exists(self._flatc_path): break
+
+    def FindSchema(base_name):
+      return resource_loader.get_path_to_datafile("%s" % base_name)
+
+    # Supported schemas for upgrade.
+    self._schemas = [
+        (0, FindSchema("schema_v0.fbs"), True, self._Upgrade0To1),
+        (1, FindSchema("schema_v1.fbs"), True, self._Upgrade1To2),
+        (2, FindSchema("schema_v2.fbs"), True, self._Upgrade2To3),
+        (3, FindSchema("schema_v3.fbs"), False, None)  # Non-callable by design.
+    ]
+    # Ensure schemas are sorted, and extract latest version and upgrade
+    # dispatch function table.
+    self._schemas.sort()
+    self._new_version, self._new_schema = self._schemas[-1][:2]
+    self._upgrade_dispatch = dict(
+        (version, dispatch)
+        for version, unused1, unused2, dispatch in self._schemas)
+
+  def _Read(self, input_file, schema, raw_binary=False):
+    """Read a tflite model assuming the given flatbuffer schema.
+
+    If `input_file` is in bin, then we must use flatc to convert the schema
+    from binary to json.
+
+    Args:
+      input_file: a binary (flatbuffer) or json file to read from. Extension
+        must  be `.tflite`, `.bin`, or `.json` for FlatBuffer Binary or
+        FlatBuffer JSON.
+      schema: which schema to use for reading
+      raw_binary: whether to assume raw_binary (versions previous to v3)
+        that lacked file_identifier require this.
+
+    Raises:
+      RuntimeError: When flatc cannot be invoked.
+      ValueError: When the extension is not json or bin.
+
+    Returns:
+      A dictionary representing the read tflite model.
+    """
+    raw_binary = ["--raw-binary"] if raw_binary else []
+    with TemporaryDirectoryResource() as tempdir:
+      basename = os.path.basename(input_file)
+      basename_no_extension, extension = os.path.splitext(basename)
+      if extension in [".bin", ".tflite"]:
+        # Convert to json using flatc
+        returncode = subprocess.call([
+            self._flatc_path,
+            "-t",
+            "--strict-json",
+            "--defaults-json",
+        ] + raw_binary + ["-o", tempdir, schema, "--", input_file])
+        if returncode != 0:
+          raise RuntimeError("flatc failed to convert from binary to json.")
+        json_file = os.path.join(tempdir, basename_no_extension + ".json")
+        if not os.path.exists(json_file):
+          raise RuntimeError("Could not find %r" % json_file)
+      elif extension == ".json":
+        json_file = input_file
+      else:
+        raise ValueError("Invalid extension on input file %r" % input_file)
+      return json.load(open(json_file))
+
+  def _Write(self, data, output_file):
+    """Output a json or bin version of the flatbuffer model.
+
+    Args:
+      data: Dict representing the TensorFlow Lite model to write.
+      output_file: filename to write the converted flatbuffer to. (json,
+        tflite, or bin extension is required).
+    Raises:
+      ValueError: When the extension is not json or bin
+      RuntimeError: When flatc fails to convert json data to binary.
+    """
+    _, extension = os.path.splitext(output_file)
+    with TemporaryDirectoryResource() as tempdir:
+      if extension == ".json":
+        json.dump(data, open(output_file, "w"), sort_keys=True, indent=2)
+      elif extension in [".tflite", ".bin"]:
+        input_json = os.path.join(tempdir, "temp.json")
+        with open(input_json, "w") as fp:
+          json.dump(data, fp, sort_keys=True, indent=2)
+        returncode = subprocess.call([
+            self._flatc_path, "-b", "--defaults-json", "--strict-json", "-o",
+            tempdir, self._new_schema, input_json
+        ])
+        if returncode != 0:
+          raise RuntimeError("flatc failed to convert upgraded json to binary.")
+
+        shutil.copy(os.path.join(tempdir, "temp.tflite"), output_file)
+      else:
+        raise ValueError("Invalid extension on output file %r" % output_file)
+
+  def _Upgrade0To1(self, data):
+    """Upgrade data from Version 0 to Version 1.
+
+    Changes: Added subgraphs (which contains a subset of formally global
+    entries).
+
+    Args:
+      data: Dictionary representing the TensorFlow lite data to be upgraded.
+        This will be modified in-place to be an upgraded version.
+    """
+    subgraph = {}
+    for key_to_promote in ["tensors", "operators", "inputs", "outputs"]:
+      subgraph[key_to_promote] = data[key_to_promote]
+      del data[key_to_promote]
+    data["subgraphs"] = [subgraph]
+
+  def _Upgrade1To2(self, data):
+    """Upgrade data from Version 1 to Version 2.
+
+    Changes: Rename operators to Conform to NN API.
+
+    Args:
+      data: Dictionary representing the TensorFlow lite data to be upgraded.
+        This will be modified in-place to be an upgraded version.
+    Raises:
+      ValueError: Throws when model builtins are numeric rather than symbols.
+    """
+
+    def RemapOperator(opcode_name):
+      """Go from old schema op name to new schema op name.
+
+      Args:
+        opcode_name: String representing the ops (see :schema.fbs).
+      Returns:
+        Converted opcode_name from V1 to V2.
+      """
+      old_name_to_new_name = {
+          "CONVOLUTION": "CONV_2D",
+          "DEPTHWISE_CONVOLUTION": "DEPTHWISE_CONV_2D",
+          "AVERAGE_POOL": "AVERAGE_POOL_2D",
+          "MAX_POOL": "MAX_POOL_2D",
+          "L2_POOL": "L2_POOL_2D",
+          "SIGMOID": "LOGISTIC",
+          "L2NORM": "L2_NORMALIZATION",
+          "LOCAL_RESPONSE_NORM": "LOCAL_RESPONSE_NORMALIZATION",
+          "Basic_RNN": "RNN",
+      }
+
+      return (old_name_to_new_name[opcode_name]
+              if opcode_name in old_name_to_new_name else opcode_name)
+
+    def RemapOperatorType(operator_type):
+      """Remap operator structs from old names to new names.
+
+      Args:
+        operator_type: String representing the builtin operator data type
+          string.
+        (see :schema.fbs).
+      Returns:
+        Upgraded builtin operator data type as a string.
+      """
+      old_to_new = {
+          "PoolOptions": "Pool2DOptions",
+          "DepthwiseConvolutionOptions": "DepthwiseConv2DOptions",
+          "ConvolutionOptions": "Conv2DOptions",
+          "LocalResponseNormOptions": "LocalResponseNormalizationOptions",
+          "BasicRNNOptions": "RNNOptions",
+      }
+      return (old_to_new[operator_type]
+              if operator_type in old_to_new else operator_type)
+
+    for subgraph in data["subgraphs"]:
+      for ops in subgraph["operators"]:
+        ops["builtin_options_type"] = RemapOperatorType(
+            ops["builtin_options_type"])
+
+    # Upgrade the operator codes
+    for operator_code in data["operator_codes"]:
+      # Check if builtin_code is the appropriate string type
+      # use type("") instead of str or unicode. for py2and3
+      if not isinstance(operator_code["builtin_code"], type(u"")):
+        raise ValueError("builtin_code %r is non-string. this usually means"
+                         "your model has consistency problems." %
+                         (operator_code["builtin_code"]))
+      operator_code["builtin_code"] = (RemapOperator(
+          operator_code["builtin_code"]))
+
+  def _Upgrade2To3(self, data):
+    """Upgrade data from Version 2 to Version 3.
+
+    Changed actual read-only tensor data to be in a buffers table instead
+    of inline with the tensor.
+
+    Args:
+      data: Dictionary representing the TensorFlow lite data to be upgraded.
+        This will be modified in-place to be an upgraded version.
+    """
+    buffers = [{"data": []}]  # Start with 1 empty buffer
+    for subgraph in data["subgraphs"]:
+      if "tensors" not in subgraph:
+        continue
+      for tensor in subgraph["tensors"]:
+        if "data_buffer" not in tensor:
+          tensor["buffer"] = 0
+        else:
+          if tensor["data_buffer"]:
+            tensor[u"buffer"] = len(buffers)
+            buffers.append({"data": tensor["data_buffer"]})
+          else:
+            tensor["buffer"] = 0
+          del tensor["data_buffer"]
+    data["buffers"] = buffers
+
+  def _PerformUpgrade(self, data):
+    """Manipulate the `data` (parsed JSON) based on changes in format.
+
+    This incrementally will upgrade from version to version within data.
+
+    Args:
+      data: Dictionary representing the TensorFlow data. This will be upgraded
+        in place.
+    """
+    while data["version"] < self._new_version:
+      self._upgrade_dispatch[data["version"]](data)
+      data["version"] += 1
+
+  def Convert(self, input_file, output_file):
+    """Perform schema conversion from input_file to output_file.
+
+    Args:
+      input_file: Filename of TensorFlow Lite data to convert from. Must
+        be `.json` or `.bin` extension files for JSON or Binary forms of
+        the TensorFlow FlatBuffer schema.
+      output_file: Filename to write to. Extension also must be `.json`
+        or `.bin`.
+
+    Raises:
+      RuntimeError: Generated when none of the upgrader supported schemas
+        matche the `input_file` data.
+    """
+    # Read data in each schema (since they are incompatible). Version is
+    # always present. Use the read data that matches the version of the
+    # schema.
+    for version, schema, raw_binary, _ in self._schemas:
+      try:
+        data_candidate = self._Read(input_file, schema, raw_binary)
+      except RuntimeError:
+        continue  # Skip and hope another schema works
+      if "version" not in data_candidate:  # Assume version 1 if not present.
+        data_candidate["version"] = 1
+      elif data_candidate["version"] == 0:  # Version 0 doesn't exist in wild.
+        data_candidate["version"] = 1
+
+      if data_candidate["version"] == version:
+        self._PerformUpgrade(data_candidate)
+        self._Write(data_candidate, output_file)
+        return
+    raise RuntimeError("No schema that the converter understands worked with "
+                       "the data file you provided.")
+
+
+def main(argv):
+  del argv
+  Converter().Convert(FLAGS.input, FLAGS.output)
+
+
+if __name__ == "__main__":
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/lite/schema/upgrade_schema_test.py b/tensorflow/contrib/lite/schema/upgrade_schema_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5002e6f7576b6de533046aaad37fe06746d3644
--- /dev/null
+++ b/tensorflow/contrib/lite/schema/upgrade_schema_test.py
@@ -0,0 +1,322 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Testing for updating TensorFlow lite schema."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import tempfile
+from tensorflow.contrib.lite.schema import upgrade_schema as upgrade_schema_lib
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+
+EMPTY_TEST_SCHEMA_V1 = {
+    "version": 1,
+    "operator_codes": [],
+    "subgraphs": [],
+}
+
+EMPTY_TEST_SCHEMA_V3 = {
+    "version": 3,
+    "operator_codes": [],
+    "subgraphs": [],
+    "buffers": [{
+        "data": []
+    }]
+}
+
+TEST_SCHEMA_V0 = {
+    "operator_codes": [],
+    "tensors": [],
+    "inputs": [],
+    "outputs": [],
+    "operators": [],
+    "version": 0
+}
+
+TEST_SCHEMA_V3 = {
+    "operator_codes": [],
+    "buffers": [{
+        "data": []
+    }],
+    "subgraphs": [{
+        "tensors": [],
+        "inputs": [],
+        "outputs": [],
+        "operators": [],
+    }],
+    "version":
+        3
+}
+
+FULL_TEST_SCHEMA_V1 = {
+    "version":
+        1,
+    "operator_codes": [
+        {
+            "builtin_code": "CONVOLUTION"
+        },
+        {
+            "builtin_code": "DEPTHWISE_CONVOLUTION"
+        },
+        {
+            "builtin_code": "AVERAGE_POOL"
+        },
+        {
+            "builtin_code": "MAX_POOL"
+        },
+        {
+            "builtin_code": "L2_POOL"
+        },
+        {
+            "builtin_code": "SIGMOID"
+        },
+        {
+            "builtin_code": "L2NORM"
+        },
+        {
+            "builtin_code": "LOCAL_RESPONSE_NORM"
+        },
+        {
+            "builtin_code": "ADD"
+        },
+        {
+            "builtin_code": "Basic_RNN"
+        },
+    ],
+    "subgraphs": [{
+        "operators": [
+            {
+                "builtin_options_type": "PoolOptions"
+            },
+            {
+                "builtin_options_type": "DepthwiseConvolutionOptions"
+            },
+            {
+                "builtin_options_type": "ConvolutionOptions"
+            },
+            {
+                "builtin_options_type": "LocalResponseNormOptions"
+            },
+            {
+                "builtin_options_type": "BasicRNNOptions"
+            },
+        ],
+    }],
+    "description":
+        "",
+}
+
+FULL_TEST_SCHEMA_V3 = {
+    "version":
+        3,
+    "operator_codes": [
+        {
+            "builtin_code": "CONV_2D"
+        },
+        {
+            "builtin_code": "DEPTHWISE_CONV_2D"
+        },
+        {
+            "builtin_code": "AVERAGE_POOL_2D"
+        },
+        {
+            "builtin_code": "MAX_POOL_2D"
+        },
+        {
+            "builtin_code": "L2_POOL_2D"
+        },
+        {
+            "builtin_code": "LOGISTIC"
+        },
+        {
+            "builtin_code": "L2_NORMALIZATION"
+        },
+        {
+            "builtin_code": "LOCAL_RESPONSE_NORMALIZATION"
+        },
+        {
+            "builtin_code": "ADD"
+        },
+        {
+            "builtin_code": "RNN"
+        },
+    ],
+    "subgraphs": [{
+        "operators": [
+            {
+                "builtin_options_type": "Pool2DOptions"
+            },
+            {
+                "builtin_options_type": "DepthwiseConv2DOptions"
+            },
+            {
+                "builtin_options_type": "Conv2DOptions"
+            },
+            {
+                "builtin_options_type": "LocalResponseNormalizationOptions"
+            },
+            {
+                "builtin_options_type": "RNNOptions"
+            },
+        ],
+    }],
+    "description":
+        "",
+    "buffers": [{
+        "data": []
+    }]
+}
+
+BUFFER_TEST_V2 = {
+    "operator_codes": [],
+    "buffers": [],
+    "subgraphs": [{
+        "tensors": [
+            {
+                "data_buffer": [1, 2, 3, 4]
+            },
+            {
+                "data_buffer": [1, 2, 3, 4, 5, 6, 7, 8]
+            },
+            {
+                "data_buffer": []
+            },
+        ],
+        "inputs": [],
+        "outputs": [],
+        "operators": [],
+    }],
+    "version":
+        2
+}
+
+BUFFER_TEST_V3 = {
+    "operator_codes": [],
+    "subgraphs": [{
+        "tensors": [
+            {
+                "buffer": 1
+            },
+            {
+                "buffer": 2
+            },
+            {
+                "buffer": 0
+            },
+        ],
+        "inputs": [],
+        "outputs": [],
+        "operators": [],
+    }],
+    "buffers": [
+        {
+            "data": []
+        },
+        {
+            "data": [1, 2, 3, 4]
+        },
+        {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8]
+        },
+    ],
+    "version":
+        3
+}
+
+
+def JsonDumpAndFlush(data, fp):
+  """Write the dictionary `data` to a JSON file `fp` (and flush).
+
+  Args:
+    data: in a dictionary that is JSON serializable.
+    fp: File-like object
+  """
+  json.dump(data, fp)
+  fp.flush()
+
+
+class TestSchemaUpgrade(test_util.TensorFlowTestCase):
+
+  def testNonExistentFile(self):
+    converter = upgrade_schema_lib.Converter()
+    non_existent = tempfile.mktemp(suffix=".json")
+    with self.assertRaisesRegexp(IOError, "No such file or directory"):
+      converter.Convert(non_existent, non_existent)
+
+  def testInvalidExtension(self):
+    converter = upgrade_schema_lib.Converter()
+    invalid_extension = tempfile.mktemp(suffix=".foo")
+    with self.assertRaisesRegexp(ValueError, "Invalid extension on input"):
+      converter.Convert(invalid_extension, invalid_extension)
+    with tempfile.NamedTemporaryFile(suffix=".json", mode="w+") as in_json:
+      JsonDumpAndFlush(EMPTY_TEST_SCHEMA_V1, in_json)
+      with self.assertRaisesRegexp(ValueError, "Invalid extension on output"):
+        converter.Convert(in_json.name, invalid_extension)
+
+  def CheckConversion(self, data_old, data_expected):
+    """Given a data dictionary, test upgrading to current version.
+
+    Args:
+        data_old: TFLite model as a dictionary (arbitrary version).
+        data_expected: TFLite model as a dictionary (upgraded).
+    """
+    converter = upgrade_schema_lib.Converter()
+    with tempfile.NamedTemporaryFile(suffix=".json", mode="w+") as in_json, \
+            tempfile.NamedTemporaryFile(
+                suffix=".json", mode="w+") as out_json, \
+            tempfile.NamedTemporaryFile(
+                suffix=".bin", mode="w+b") as out_bin, \
+            tempfile.NamedTemporaryFile(
+                suffix=".tflite", mode="w+b") as out_tflite:
+      JsonDumpAndFlush(data_old, in_json)
+      # Test JSON output
+      converter.Convert(in_json.name, out_json.name)
+      # Test binary output
+      # Convert to .tflite  and then to .bin and check if binary is equal
+      converter.Convert(in_json.name, out_tflite.name)
+      converter.Convert(out_tflite.name, out_bin.name)
+      self.assertEqual(
+          open(out_bin.name, "rb").read(),
+          open(out_tflite.name, "rb").read())
+      # Test that conversion actually produced successful new json.
+      converted_schema = json.load(out_json)
+      self.assertEqual(converted_schema, data_expected)
+
+  def testAlreadyUpgraded(self):
+    """A file already at version 3 should stay at version 3."""
+    self.CheckConversion(EMPTY_TEST_SCHEMA_V3, EMPTY_TEST_SCHEMA_V3)
+    self.CheckConversion(TEST_SCHEMA_V3, TEST_SCHEMA_V3)
+    self.CheckConversion(BUFFER_TEST_V3, BUFFER_TEST_V3)
+
+  # Disable this while we have incorrectly versioned structures around.
+  # def testV0Upgrade_IntroducesSubgraphs(self):
+  #   """V0 did not have subgraphs; check to make sure they get introduced."""
+  #   self.CheckConversion(TEST_SCHEMA_V0, TEST_SCHEMA_V3)
+
+  def testV1Upgrade_RenameOps(self):
+    """V1 had many different names for ops; check to make sure they rename."""
+    self.CheckConversion(EMPTY_TEST_SCHEMA_V1, EMPTY_TEST_SCHEMA_V3)
+    self.CheckConversion(FULL_TEST_SCHEMA_V1, FULL_TEST_SCHEMA_V3)
+
+  def testV2Upgrade_CreateBuffers(self):
+    """V2 did not have buffers; check to make sure they are created."""
+    self.CheckConversion(BUFFER_TEST_V2, BUFFER_TEST_V3)
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/contrib/lite/simple_memory_arena.cc b/tensorflow/contrib/lite/simple_memory_arena.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4aab244989ca5300fbe74162e03deaac89af60ad
--- /dev/null
+++ b/tensorflow/contrib/lite/simple_memory_arena.cc
@@ -0,0 +1,136 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/simple_memory_arena.h"
+
+#include <cstring>
+#include <limits>
+#include <vector>
+
+namespace {
+
+template <typename T>
+T AlignTo(size_t alignment, T offset) {
+  return offset % alignment == 0 ? offset
+                                 : offset + (alignment - offset % alignment);
+}
+
+}  // namespace
+
+namespace tflite {
+
+TfLiteStatus SimpleMemoryArena::Allocate(TfLiteContext* context,
+                                         size_t alignment, size_t size,
+                                         ArenaAlloc* new_alloc) {
+  TF_LITE_ENSURE(context, alignment < arena_alignment_);
+
+  size_t current_top = 0;
+
+  if (!allocs_.empty()) {
+    auto last = allocs_.rbegin();
+    current_top = last->offset + last->size;
+  }
+
+  // If we don't find a better gap just allocate at the end of the buffer.
+  size_t best_offset = AlignTo(alignment, current_top);
+  size_t best_offset_fit = std::numeric_limits<size_t>::max();
+  auto best_insertion_it = allocs_.end();
+
+  // Go through the sorted allocs and look at the gaps between them.
+  size_t current_offset = 0;
+  for (auto it = allocs_.begin(); it != allocs_.end(); ++it) {
+    size_t aligned_current_offset = AlignTo(alignment, current_offset);
+    // If we found a gap larger than required size, and smaller than previous
+    // best fit, take it.
+    if (aligned_current_offset + size <= it->offset &&
+        it->offset - current_offset < best_offset_fit) {
+      best_offset = aligned_current_offset;
+      best_offset_fit = it->offset - current_offset;
+      best_insertion_it = it;
+    }
+    current_offset = it->offset + it->size;
+  }
+
+  // Update the required buffer size.
+  high_water_mark_ = std::max(high_water_mark_, best_offset + size);
+
+  new_alloc->offset = best_offset;
+  new_alloc->size = size;
+  allocs_.insert(best_insertion_it, *new_alloc);
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus SimpleMemoryArena::Deallocate(TfLiteContext* context,
+                                           const ArenaAlloc& alloc) {
+  int erased_allocs_count = 0;
+  auto it = allocs_.begin();
+  while (it != allocs_.end()) {
+    if (it->offset == alloc.offset) {
+      TF_LITE_ENSURE_EQ(context, it->size, alloc.size);
+      erased_allocs_count++;
+      it = allocs_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+  TF_LITE_ENSURE_EQ(context, erased_allocs_count, 1);
+  return kTfLiteOk;
+}
+
+TfLiteStatus SimpleMemoryArena::Commit(TfLiteContext* context) {
+  size_t required_size = RequiredBufferSize();
+  if (required_size > underlying_buffer_size_) {
+    char* new_alloc = new char[required_size];
+    char* new_underlying_buffer_aligned_ptr = reinterpret_cast<char*>(
+        AlignTo(arena_alignment_, reinterpret_cast<intptr_t>(new_alloc)));
+
+    // If the arena had been previously allocated, copy over the old memory.
+    // Since Alloc pointers are offset based, they will remain valid in the new
+    // memory block.
+    if (high_water_mark_ > 0 && underlying_buffer_size_ > 0) {
+      size_t copy_amount = std::min(
+          underlying_buffer_.get() + underlying_buffer_size_ -
+              underlying_buffer_aligned_ptr_,
+          new_alloc + required_size - new_underlying_buffer_aligned_ptr);
+      memcpy(new_underlying_buffer_aligned_ptr, underlying_buffer_aligned_ptr_,
+             copy_amount);
+    }
+
+    underlying_buffer_.reset(new_alloc);
+    underlying_buffer_size_ = required_size;
+    underlying_buffer_aligned_ptr_ = new_underlying_buffer_aligned_ptr;
+  }
+  commited_ = true;
+  return underlying_buffer_ != nullptr ? kTfLiteOk : kTfLiteError;
+}
+
+TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context,
+                                             const ArenaAlloc& alloc,
+                                             char** output_ptr) {
+  TF_LITE_ENSURE(context, commited_);
+  TF_LITE_ENSURE(context, output_ptr != nullptr);
+  *output_ptr = underlying_buffer_aligned_ptr_ + alloc.offset;
+  return kTfLiteOk;
+}
+
+TfLiteStatus SimpleMemoryArena::Clear() {
+  commited_ = false;
+  high_water_mark_ = 0;
+  allocs_.clear();
+  return kTfLiteOk;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/simple_memory_arena.h b/tensorflow/contrib/lite/simple_memory_arena.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d0b7f9ff79bf9fd8a60dbc057d63f44eeaa6396
--- /dev/null
+++ b/tensorflow/contrib/lite/simple_memory_arena.h
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
+
+#include <list>
+#include <memory>
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// This little structure holds the offset and the size for a dynamic memory
+// allocation in the memory arena. When the arena is commited and the
+// underlying buffer is set, the alloc can be resolved into an actual memory
+// pointer.
+struct ArenaAlloc {
+  ArenaAlloc() : offset(0), size(0) {}
+
+  size_t offset;
+  size_t size;
+
+  inline bool operator<(const ArenaAlloc& other) const {
+    return offset < other.offset;
+  }
+};
+
+// This small class is responsible for allocating, dealocating and reusing
+// dynamic memory from a common underlying buffer. The arena can be used in
+// scenarios when the pattern of memory allocations and dealocations is
+// repetitive, e.g. running NN inference in multiple iterations.
+class SimpleMemoryArena {
+ public:
+  explicit SimpleMemoryArena(size_t arena_alignment)
+      : commited_(false),
+        arena_alignment_(arena_alignment),
+        high_water_mark_(0),
+        underlying_buffer_size_(0),
+        allocs_() {}
+
+  TfLiteStatus Allocate(TfLiteContext* context, size_t alignment, size_t size,
+                        ArenaAlloc* new_alloc);
+
+  TfLiteStatus Deallocate(TfLiteContext* context, const ArenaAlloc& alloc);
+
+  inline size_t RequiredBufferSize() {
+    // Add in a small amount of padding to reduce the chance of resize events
+    // for small allocations.
+    size_t padding = arena_alignment_;
+    return arena_alignment_ + high_water_mark_ + padding;
+  }
+
+  TfLiteStatus Commit(TfLiteContext* context);
+
+  TfLiteStatus ResolveAlloc(TfLiteContext* context, const ArenaAlloc& alloc,
+                            char** output_ptr);
+
+  TfLiteStatus Clear();
+
+ private:
+  bool commited_;
+  size_t arena_alignment_;
+  size_t high_water_mark_;
+  std::unique_ptr<char[]> underlying_buffer_;
+  size_t underlying_buffer_size_;
+  char* underlying_buffer_aligned_ptr_;
+  // TODO(maciekc): add list iterator to the ArenaAlloc to lookup quickly.
+  std::list<ArenaAlloc> allocs_;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_SIMPLE_MEMORY_ARENA_H_
diff --git a/tensorflow/contrib/lite/simple_memory_arena_test.cc b/tensorflow/contrib/lite/simple_memory_arena_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4444f642eb75c563c57762d095e454ac63d836c6
--- /dev/null
+++ b/tensorflow/contrib/lite/simple_memory_arena_test.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/simple_memory_arena.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+TEST(SimpleMemoryArenaTest, BasicArenaOperations) {
+  TfLiteContext context;
+  SimpleMemoryArena arena(64);
+  ArenaAlloc allocs[6];
+
+  arena.Allocate(&context, 32, 2047, &allocs[0]);
+  arena.Allocate(&context, 32, 2047, &allocs[1]);
+  arena.Allocate(&context, 32, 2047, &allocs[2]);
+  arena.Deallocate(&context, allocs[0]);
+  arena.Allocate(&context, 32, 1023, &allocs[3]);
+  arena.Allocate(&context, 32, 2047, &allocs[4]);
+  arena.Deallocate(&context, allocs[1]);
+  arena.Allocate(&context, 32, 1023, &allocs[5]);
+
+  EXPECT_EQ(allocs[0].offset, 0);
+  EXPECT_EQ(allocs[1].offset, 2048);
+  EXPECT_EQ(allocs[2].offset, 4096);
+  EXPECT_EQ(allocs[3].offset, 0);
+  EXPECT_EQ(allocs[4].offset, 6144);
+  EXPECT_EQ(allocs[5].offset, 1024);
+}
+
+TEST(SimpleMemoryArenaTest, TestAfterClear) {
+  TfLiteContext context;
+  SimpleMemoryArena arena(64);
+  ArenaAlloc allocs[9];
+
+  arena.Allocate(&context, 32, 2047, &allocs[0]);
+  arena.Allocate(&context, 32, 2047, &allocs[1]);
+  arena.Allocate(&context, 32, 2047, &allocs[2]);
+  arena.Commit(&context);
+
+  EXPECT_EQ(allocs[0].offset, 0);
+  EXPECT_EQ(allocs[1].offset, 2048);
+  EXPECT_EQ(allocs[2].offset, 4096);
+
+  arena.Clear();
+
+  // Test with smaller allocs.
+  arena.Allocate(&context, 32, 1023, &allocs[3]);
+  arena.Allocate(&context, 32, 1023, &allocs[4]);
+  arena.Allocate(&context, 32, 1023, &allocs[5]);
+  arena.Commit(&context);
+
+  EXPECT_EQ(allocs[3].offset, 0);
+  EXPECT_EQ(allocs[4].offset, 1024);
+  EXPECT_EQ(allocs[5].offset, 2048);
+
+  arena.Clear();
+
+  // Test larger allocs which should require a reallocation.
+  arena.Allocate(&context, 32, 4095, &allocs[6]);
+  arena.Allocate(&context, 32, 4095, &allocs[7]);
+  arena.Allocate(&context, 32, 4095, &allocs[8]);
+  arena.Commit(&context);
+
+  EXPECT_EQ(allocs[6].offset, 0);
+  EXPECT_EQ(allocs[7].offset, 4096);
+  EXPECT_EQ(allocs[8].offset, 8192);
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/string.h b/tensorflow/contrib/lite/string.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f8f4e851ee69aa86b7f3eaec6383e17fa6a734c
--- /dev/null
+++ b/tensorflow/contrib/lite/string.h
@@ -0,0 +1,29 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Abstract string. We don't want even absl at this level.
+#ifndef _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
+#define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
+
+#include <string>
+
+namespace tflite {
+
+#ifndef HAS_GLOBAL_STRING
+using std::string;
+#endif
+
+}  // namespace tflite
+
+#endif  // _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_H_
diff --git a/tensorflow/contrib/lite/string_util.cc b/tensorflow/contrib/lite/string_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd41299d38361321503d421272426a9d1082c937
--- /dev/null
+++ b/tensorflow/contrib/lite/string_util.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/string_util.h"
+
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+
+namespace tflite {
+namespace {
+
+// Convenient method to get pointer to int32_t.
+int32_t* GetIntPtr(char* ptr) { return reinterpret_cast<int32_t*>(ptr); }
+}  // namespace
+
+void DynamicBuffer::AddString(const char* str, size_t len) {
+  data_.resize(data_.size() + len);
+  memcpy(data_.data() + offset_.back(), str, len);
+  offset_.push_back(offset_.back() + len);
+}
+
+void DynamicBuffer::AddString(const StringRef& string) {
+  AddString(string.str, string.len);
+}
+
+void DynamicBuffer::AddJoinedString(const std::vector<StringRef>& strings,
+                                    char separator) {
+  // Resize the data buffer.
+  int total_len = strings.size() - 1;
+  for (StringRef ref : strings) {
+    total_len += ref.len;
+  }
+  data_.resize(data_.size() + total_len);
+
+  int current_idx = 0;
+  for (StringRef ref : strings) {
+    char* dst = data_.data() + offset_.back() + current_idx;
+
+    // Fill separator if not first string.
+    if (current_idx != 0) {
+      *dst = separator;
+      ++dst;
+      ++current_idx;
+    }
+
+    // Fill content of the string.
+    memcpy(dst, ref.str, ref.len);
+    current_idx += ref.len;
+  }
+  offset_.push_back(offset_.back() + total_len);
+}
+
+void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
+  // Allocate sufficient memory to tensor buffer.
+  int32_t num_strings = offset_.size() - 1;
+  // Total bytes include:
+  //   * size of content (data_.size)
+  //   * offset of each tensor (sizeof(int32_t) * num_strings)
+  //   * length of whole buffer (int32_t)
+  //   * num of strings (int32_t).
+  int32_t bytes = data_.size()                            // size of content
+                  + sizeof(int32_t) * (num_strings + 2);  // size of header
+
+  // Output tensor will take over the ownership of tensor_buffer, and free it
+  // during Interpreter destruction.
+  char* tensor_buffer = static_cast<char*>(malloc(bytes));
+
+  // Set num of string
+  memcpy(tensor_buffer, &num_strings, sizeof(int32_t));
+
+  // Set offset of strings.
+  int32_t start = sizeof(int32_t) * (num_strings + 2);
+  for (int i = 0; i < offset_.size(); i++) {
+    int32_t offset = start + offset_[i];
+    memcpy(tensor_buffer + sizeof(int32_t) * (i + 1), &offset, sizeof(int32_t));
+  }
+
+  // Copy data of strings.
+  memcpy(tensor_buffer + start, data_.data(), data_.size());
+
+  // Set tensor content pointer to tensor_buffer, and release original data.
+  auto dims = TfLiteIntArrayCreate(1);
+  dims->data[0] = num_strings;
+  TfLiteTensorReset(tensor->type, tensor->name, dims, tensor->params,
+                    tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation,
+                    tensor);
+}
+
+int GetStringCount(const TfLiteTensor* tensor) {
+  // The first integers in the raw buffer is the number of strings.
+  return *GetIntPtr(tensor->data.raw);
+}
+
+StringRef GetString(const TfLiteTensor* tensor, int string_index) {
+  int32_t* offset =
+      GetIntPtr(tensor->data.raw + sizeof(int32_t) * (string_index + 1));
+  return {
+      tensor->data.raw + (*offset),
+      (*(offset + 1)) - (*offset),
+  };
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/string_util.h b/tensorflow/contrib/lite/string_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..12872d11232e2a32527d660be8acce3e09f00125
--- /dev/null
+++ b/tensorflow/contrib/lite/string_util.h
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Util methods to read and write String tensors.
+// String tensors are considered to be char tensor with protocol.
+//   [0, 3] 4 bytes: N, num of strings in the tensor in little endian.
+//   [(i+1)*4, (i+1)*4+3] 4 bytes: offset of i-th string in little endian.
+//   [(N+2)*4, (N+2)*4+3] 4 bytes: length of the whole char buffer.
+//   [offset(i), offset(i+1) - 1] : content of i-th string.
+// Example of a string tensor:
+// [
+//   2, 0, 0, 0,     # 2 strings.
+//   16, 0, 0, 0,    # 0-th string starts from index 12.
+//   18, 0, 0, 0,    # 1-st string starts from index 18.
+//   18, 0, 0, 0,    # total length of array.
+//   'A', 'B',       # 0-th string [16..17]: "AB"
+// ]                 # 1-th string, empty
+//
+// A typical usage:
+// In op.Eval(context, node):
+//   DynamicBuffer buf;
+//   # Add string "AB" to tensor, string is stored in dynamic buffer.
+//   buf.AddString("AB", 2);
+//   # Write content of DynamicBuffer to tensor in format of string tensor
+//   # described above.
+//   buf.WriteToTensor(tensor)
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
+
+#include <vector>
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+
+// Convenient structure to store string pointer and length.
+typedef struct {
+  char* str;
+  int len;
+} StringRef;
+
+// DynamicBuffer holds temporary buffer that will be used to create a dynamic
+// tensor. A typical usage is to initialize a DynamicBuffer object, fill in
+// content and call CreateStringTensor in op.Eval().
+class DynamicBuffer {
+ public:
+  DynamicBuffer() : offset_({0}) {}
+
+  // Add string to dynamic buffer by resizing the buffer and copying the data.
+  void AddString(const StringRef& string);
+
+  // Add string to dynamic buffer by resizing the buffer and copying the data.
+  void AddString(const char* str, size_t len);
+
+  // Join a list of string with separator, and add as a single string to the
+  // buffer.
+  void AddJoinedString(const std::vector<StringRef>& strings, char separator);
+
+  // Fill content into a string tensor.
+  void WriteToTensor(TfLiteTensor* tensor);
+
+ private:
+  // Data buffer to store contents of strings, not including headers.
+  std::vector<char> data_;
+  // Offset of the starting index of each string in data buffer.
+  std::vector<int32_t> offset_;
+};
+
+// Return num of strings in a String tensor.
+int GetStringCount(const TfLiteTensor* tensor);
+
+// Get String pointer and length of index-th string in tensor.
+// NOTE: This will not create a copy of string data.
+StringRef GetString(const TfLiteTensor* tensor, int string_index);
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_STRING_UTIL_H_
diff --git a/tensorflow/contrib/lite/string_util_test.cc b/tensorflow/contrib/lite/string_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d53fec7512f902fb277524100640f4a6a2aaf130
--- /dev/null
+++ b/tensorflow/contrib/lite/string_util_test.cc
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/string_util.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+
+TEST(StringUtil, TestStringUtil) {
+  Interpreter interpreter;
+  interpreter.AddTensors(3);
+
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+
+  TfLiteTensor* t1 = interpreter.tensor(1);
+  t1->type = kTfLiteString;
+  t1->allocation_type = kTfLiteDynamic;
+
+  char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'X', 'Y', 'Z'};
+
+  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, {}, data,
+                                          15);
+  TfLiteTensor* t2 = interpreter.tensor(2);
+  interpreter.AllocateTensors();
+
+  char s0[] = "ABC";
+  string s1 = "DEFG";
+  char s2[] = "";
+
+  // Write strings to tensors
+  DynamicBuffer buf0;
+  buf0.AddString(s0, 3);
+  DynamicBuffer buf1;
+  buf1.AddString(s1.data(), s1.length());
+  buf0.AddString(s2, 0);
+  buf0.WriteToTensor(t0);
+  buf1.WriteToTensor(t1);
+
+  // Read strings from tensors.
+  ASSERT_EQ(GetStringCount(t0), 2);
+  StringRef str_ref;
+  str_ref = GetString(t0, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "ABC");
+  str_ref = GetString(t0, 1);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "");
+  ASSERT_EQ(t0->bytes, 19);
+
+  ASSERT_EQ(GetStringCount(t1), 1);
+  str_ref = GetString(t1, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "DEFG");
+  ASSERT_EQ(t1->bytes, 16);
+
+  ASSERT_EQ(GetStringCount(t2), 1);
+  str_ref = GetString(t2, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "XYZ");
+  ASSERT_EQ(t2->bytes, 15);
+}
+
+TEST(StringUtil, TestAddJoinedString) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+
+  char s0[] = "ABC";
+  char s1[] = "DEFG";
+  char s2[] = "";
+  char s3[] = "XYZ";
+
+  DynamicBuffer buf;
+  buf.AddJoinedString({{s0, 3}, {s1, 4}, {s2, 0}, {s3, 3}}, ' ');
+  buf.WriteToTensor(t0);
+
+  ASSERT_EQ(GetStringCount(t0), 1);
+  StringRef str_ref;
+  str_ref = GetString(t0, 0);
+  ASSERT_EQ(string(str_ref.str, str_ref.len), "ABC DEFG  XYZ");
+  ASSERT_EQ(t0->bytes, 25);
+}
+
+TEST(StringUtil, TestEmptyList) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+  DynamicBuffer buf;
+  buf.WriteToTensor(t0);
+
+  ASSERT_EQ(GetStringCount(t0), 0);
+  ASSERT_EQ(t0->bytes, 8);
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/testdata/0_subgraphs.bin b/tensorflow/contrib/lite/testdata/0_subgraphs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5606898d7fd50aa25f7c4be692d2308bcea7c87d
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/0_subgraphs.bin differ
diff --git a/tensorflow/contrib/lite/testdata/2_subgraphs.bin b/tensorflow/contrib/lite/testdata/2_subgraphs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..07308ba62b2db533bb541c47872ba9f239e8b045
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/2_subgraphs.bin differ
diff --git a/tensorflow/contrib/lite/testdata/empty_model.bin b/tensorflow/contrib/lite/testdata/empty_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..1762ca39384971b072e8b8acd53f415b8c66d350
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/empty_model.bin differ
diff --git a/tensorflow/contrib/lite/testdata/multi_add.bin b/tensorflow/contrib/lite/testdata/multi_add.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e5048a32812bbf6522cfd164fe47804a1cdd160f
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/multi_add.bin differ
diff --git a/tensorflow/contrib/lite/testdata/multi_add.json b/tensorflow/contrib/lite/testdata/multi_add.json
new file mode 100644
index 0000000000000000000000000000000000000000..97b931dba8b1050ecf91939d1d9dcea5e0ea56fb
--- /dev/null
+++ b/tensorflow/contrib/lite/testdata/multi_add.json
@@ -0,0 +1,46 @@
+{
+  "version": 1,
+  "operator_codes": [
+    {
+      "builtin_code": "ADD"
+    }
+  ],
+  "subgraphs": [
+    {
+      "tensors": [
+        { "shape": [ 1, 8, 8, 3 ], "name": "a" },
+        { "shape": [ 1, 8, 8, 3 ], "name": "b" },
+        { "shape": [ 1, 8, 8, 3 ], "name": "c" },
+        { "shape": [ 1, 8, 8, 3 ], "name": "d" },
+        { "shape": [ 1, 8, 8, 3 ], "name": "i" },
+        { "shape": [ 1, 8, 8, 3 ], "name": "x" },
+        { "shape": [ 1, 8, 8, 3 ], "name": "y" }
+      ],
+      "inputs": [ 0, 1, 2, 3 ],
+      "outputs": [ 5, 6 ],
+      "operators": [
+        {
+          "inputs": [ 1, 2 ],
+          "outputs": [ 4 ],
+          "builtin_options_type": "AddOptions",
+          "builtin_options": {
+          }
+        },
+        {
+          "inputs": [ 0, 4 ],
+          "outputs": [ 5 ],
+          "builtin_options_type": "AddOptions",
+          "builtin_options": {
+          }
+        },
+        {
+          "inputs": [ 3, 4 ],
+          "outputs": [ 6 ],
+          "builtin_options_type": "AddOptions",
+          "builtin_options": {
+          }
+        }
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/contrib/lite/testdata/no_subgraphs.bin b/tensorflow/contrib/lite/testdata/no_subgraphs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5606898d7fd50aa25f7c4be692d2308bcea7c87d
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/no_subgraphs.bin differ
diff --git a/tensorflow/contrib/lite/testdata/test_model.bin b/tensorflow/contrib/lite/testdata/test_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2878b1f96e2d3e1932eda4cebfd750b3daf082ce
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/test_model.bin differ
diff --git a/tensorflow/contrib/lite/testdata/test_model_broken.bin b/tensorflow/contrib/lite/testdata/test_model_broken.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9fd050cd4a82a89c00aa3e1c6fac0e05223a285c
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/test_model_broken.bin differ
diff --git a/tensorflow/contrib/lite/testdata/test_model_broken.json b/tensorflow/contrib/lite/testdata/test_model_broken.json
new file mode 100644
index 0000000000000000000000000000000000000000..b701eb9a25f11013ea4090124cdd1d905040d65d
--- /dev/null
+++ b/tensorflow/contrib/lite/testdata/test_model_broken.json
@@ -0,0 +1,62 @@
+{
+  "subgraphs": [
+    {
+      "inputs": [0, 1],
+      "outputs": [2, 3],
+      "operators": [
+        {
+          "opcode_index": 0,
+          "inputs": [0,1],
+          "outputs": [2]
+        },
+        {
+          "opcode_index": 1,
+          "inputs": [2],
+          "outputs": [3]
+        }
+      ],
+      "tensors": [
+          {
+             "shape" : [
+                2
+             ],
+             "type" : "FLOAT32",
+             "name" : "input0",
+             "data_buffer" : [1,0,0,0]
+          },
+          {
+             "shape" : [
+                3
+             ],
+             "type" : "FLOAT32",
+             "name" : "input1",
+             "data_buffer" : []
+          },
+          {
+             "shape" : [
+                3
+             ],
+             "type" : "FLOAT32",
+             "name" : "out1",
+             "data_buffer" : []
+          },
+          {
+             "shape" : [
+                3
+             ],
+             "type" : "FLOAT32",
+             "name" : "out2",
+             "data_buffer" : []
+          }
+      ],
+    }
+  ],
+  "operator_codes": [
+    {
+      "builtin_code": 0
+    },
+    {
+      "custom_code": "testing_op"
+    }
+  ]
+}
diff --git a/tensorflow/contrib/lite/testdata/two_subgraphs.bin b/tensorflow/contrib/lite/testdata/two_subgraphs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..07308ba62b2db533bb541c47872ba9f239e8b045
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/two_subgraphs.bin differ
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3ff65dd381c42fea45183bd12d26a0257138b7eb
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -0,0 +1,220 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite:build_def.bzl",
+    "gen_zipped_test_files",
+)
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+gen_zipped_test_files(
+    name = "optest",
+    files = [
+        "add.zip",
+        "avg_pool.zip",
+        "concat.zip",
+        "constant.zip",
+        "control_dep.zip",
+        "conv.zip",
+        "depthwiseconv.zip",
+        "fully_connected.zip",
+        "fused_batch_norm.zip",
+        "global_batch_norm.zip",
+        "l2_pool.zip",
+        "l2norm.zip",
+        "local_response_norm.zip",
+        "max_pool.zip",
+        "mul.zip",
+        "relu.zip",
+        "relu1.zip",
+        "relu6.zip",
+        "reshape.zip",
+        "resize_bilinear.zip",
+        "sigmoid.zip",
+        "softmax.zip",
+        "space_to_depth.zip",
+    ],
+)
+
+py_binary(
+    name = "generate_examples",
+    srcs = ["generate_examples.py"],
+    data = [
+        "//tensorflow/contrib/lite/toco",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":generate_examples_report",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:graph_util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "generate_examples_report",
+    srcs = ["generate_examples_report.py"],
+    srcs_version = "PY2AND3",
+)
+
+cc_library(
+    name = "parse_testdata_lib",
+    srcs = ["parse_testdata.cc"],
+    hdrs = ["parse_testdata.h"],
+    deps = [
+        ":message",
+        ":split",
+        ":test_runner",
+        "//tensorflow/contrib/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "message",
+    srcs = ["message.cc"],
+    hdrs = ["message.h"],
+    deps = [":tokenize"],
+)
+
+cc_test(
+    name = "message_test",
+    srcs = ["message_test.cc"],
+    deps = [
+        ":message",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "split",
+    srcs = ["split.cc"],
+    hdrs = ["split.h"],
+    deps = [
+        "//tensorflow/contrib/lite:string",
+    ],
+)
+
+cc_test(
+    name = "split_test",
+    size = "small",
+    srcs = ["split_test.cc"],
+    deps = [
+        ":split",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "tflite_driver",
+    srcs = ["tflite_driver.cc"],
+    hdrs = ["tflite_driver.h"],
+    deps = [
+        ":split",
+        ":test_runner",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_test(
+    name = "tflite_driver_test",
+    size = "small",
+    srcs = ["tflite_driver_test.cc"],
+    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    deps = [
+        ":tflite_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "tokenize",
+    srcs = ["tokenize.cc"],
+    hdrs = ["tokenize.h"],
+    deps = [
+        "//tensorflow/contrib/lite:string",
+    ],
+)
+
+cc_test(
+    name = "tokenize_test",
+    srcs = ["tokenize_test.cc"],
+    deps = [
+        ":tokenize",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "test_runner",
+    hdrs = ["test_runner.h"],
+    deps = [
+        "//tensorflow/contrib/lite:string",
+    ],
+)
+
+cc_library(
+    name = "util",
+    testonly = 1,
+    hdrs = ["util.h"],
+)
+
+cc_test(
+    name = "test_runner_test",
+    srcs = ["test_runner_test.cc"],
+    deps = [
+        ":test_runner",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_binary(
+    name = "nnapi_example",
+    srcs = ["nnapi_example.cc"],
+    deps = [
+        ":parse_testdata_lib",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/nnapi:nnapi_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "generated_examples_zip_test",
+    size = "medium",
+    srcs = ["generated_examples_zip_test.cc"],
+    data = [":optest"],
+    shard_count = 10,
+    tags = ["no_oss"],
+    deps = [
+        ":parse_testdata_lib",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "@com_google_googletest//:gtest",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bca82ded038ded702effd46c0f4247e45a36524
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -0,0 +1,1195 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Generate a series of TensorFlow graphs that become tflite test cases.
+
+Usage:
+
+generate_examples <output directory> zipped
+
+bazel run //tensorflow/contrib/lite/testing:generate_examples
+    third_party/tensorflow/contrib/lite/testing/generated_examples zipped
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import itertools
+import os
+import re
+import sys
+import tempfile
+import traceback
+import zipfile
+import numpy as np
+from six import StringIO
+
+# TODO(aselle): Disable GPU for now
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+# pylint: disable=g-import-not-at-top
+import tensorflow as tf
+from google.protobuf import text_format
+# TODO(aselle): switch to TensorFlow's resource_loader
+from tensorflow.contrib.lite.testing import generate_examples_report as report_lib
+from tensorflow.python.framework import graph_util as tf_graph_util
+
+parser = argparse.ArgumentParser(description="Script to generate TFLite tests.")
+parser.add_argument("output_path",
+                    help="Directory where the outputs will be go.")
+# TODO(ahentz): remove this flag
+parser.add_argument("type", help="zipped")
+parser.add_argument("--zip_to_output",
+                    type=str,
+                    help="Particular zip to output.",
+                    required=False)
+parser.add_argument("--toco",
+                    type=str,
+                    help="Path to toco tool.",
+                    required=True)
+parser.add_argument(
+    "--known_bugs_are_errors",
+    action="store_true",
+    help=("If a particular model is affected by a known bug,"
+          " count it as a toco error."))
+parser.add_argument(
+    "--ignore_toco_errors",
+    action="store_true",
+    help="Raise an exception if any toco error is encountered.")
+parser.add_argument(
+    "--save_graphdefs",
+    action="store_true",
+    help="Include intermediate graphdefs in the output zip files.")
+
+
+RANDOM_SEED = 342
+TEST_INPUT_DEPTH = 3
+
+
+# A map from regular expression to bug number. Any test failure with label
+# matching the expression will be considered due to the corresponding bug.
+KNOWN_BUGS = {
+    # TOCO doesn't support scalars as input.
+    r"relu.*input_shape=\[\]": "67587484",
+    r"sigmoid.*input_shape=\[\]": "67645668",
+    # Concat doesn't work with a single input tensor
+    r"concat.*num_tensors=1": "67378344",
+    # Transposition in MatMul is not supported.
+    r"fully_connected.*transpose_.=True": "67586970",
+    # Softmax graphs are too complex.
+    r"softmax.*dim=0": "67749831",
+    r"softmax.*input_shape=\[1,3,4,3\]": "67749831",
+    # SpaceToDepth only supports float32.
+    r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
+}
+
+
+def toco_options(data_types,
+                 input_arrays,
+                 output_arrays,
+                 shapes,
+                 drop_control_dependency):
+  """Create TOCO options to process a model.
+
+  Args:
+    data_types: input and inference types used by TOCO.
+    input_arrays: names of the input tensors
+    output_arrays: name of the output tensors
+    shapes: shapes of the input tensors
+    drop_control_dependency: whether to ignore control dependency nodes.
+
+  Returns:
+    the options in a string.
+  """
+  shape_str = ":".join([",".join(str(y) for y in x) for x in shapes])
+  inference_type = "FLOAT"
+  # TODO(ahentz): if we get multi-input quantization to work we need this
+  # to change
+  if data_types[0] == "QUANTIZED_UINT8":
+    inference_type = "QUANTIZED_UINT8"
+  s = (" --input_types=%s" % ",".join(data_types) +
+       " --inference_type=%s" % inference_type +
+       " --input_format=TENSORFLOW_GRAPHDEF" + " --output_format=TFLITE" +
+       " --input_arrays=%s" % ",".join(input_arrays) +
+       " --input_shapes=%s" % shape_str +
+       " --output_arrays=%s" % ",".join(output_arrays))
+  if drop_control_dependency:
+    s += " --drop_control_dependency"
+  return s
+
+
+def write_toco_options(filename,
+                       data_types,
+                       input_arrays,
+                       output_arrays,
+                       shapes,
+                       drop_control_dependency=False):
+  """Create TOCO options to process a model.
+
+  Args:
+    filename: Filename to write the options to.
+    data_types: input and inference types used by TOCO.
+    input_arrays: names of the input tensors
+    output_arrays: names of the output tensors
+    shapes: shapes of the input tensors
+    drop_control_dependency: whether to ignore control dependency nodes.
+  """
+  with open(filename, "w") as fp:
+    fp.write(
+        toco_options(
+            data_types=data_types,
+            input_arrays=input_arrays,
+            output_arrays=output_arrays,
+            shapes=shapes,
+            drop_control_dependency=drop_control_dependency))
+
+
+def write_examples(fp, examples):
+  """Given a list `examples`, write a text format representation.
+
+  The file format is csv like with a simple repeated pattern. We would ike
+  to use proto here, but we can't yet due to interfacing with the Android
+  team using this format.
+
+  Args:
+    fp: File-like object to write to.
+    examples: Example dictionary consiting of keys "inputs" and "outputs"
+  """
+
+  def write_tensor(fp, x):
+    """Write tensor in file format supported by TFLITE example."""
+    fp.write("dtype,%s\n" % x.dtype)
+    fp.write("shape," + ",".join(map(str, x.shape)) + "\n")
+    # Output 9 digits after the point to ensure the precision is good enough.
+    values = ["{:.9f}".format(value) for value in list(x.flatten())]
+    fp.write("values," + ",".join(values) + "\n")
+
+  fp.write("test_cases,%d\n" % len(examples))
+  for example in examples:
+    fp.write("inputs,%d\n" % len(example["inputs"]))
+    for i in example["inputs"]:
+      write_tensor(fp, i)
+    fp.write("outputs,%d\n" % len(example["outputs"]))
+    for i in example["outputs"]:
+      write_tensor(fp, i)
+
+
+def write_test_cases(fp, model_name, examples):
+  """Given a dictionary of `examples`, write a text format representation.
+
+  The file format is protocol-buffer-like, even though we don't use proto due
+  to the needs of the Android team.
+
+  Args:
+    fp: File-like object to write to.
+    model_name: Filename where the model was written to, relative to filename.
+    examples: Example dictionary consiting of keys "inputs" and "outputs"
+  """
+
+  fp.write("load_model: %s\n" % os.path.basename(model_name))
+  for example in examples:
+    fp.write("reshape {\n")
+    for t in example["inputs"]:
+      fp.write("  input: \"" + ",".join(map(str, t.shape)) + "\"\n")
+    fp.write("}\n")
+    fp.write("invoke {\n")
+
+    for t in example["inputs"]:
+      values = ["{:.9f}".format(value) for value in list(t.flatten())]
+      fp.write("  input: \"" + ",".join(values) + "\"\n")
+    for t in example["outputs"]:
+      values = ["{:.9f}".format(value) for value in list(t.flatten())]
+      fp.write("  output: \"" + ",".join(values) + "\"\n")
+    fp.write("}\n")
+
+
+_TF_TYPE_INFO = {
+    tf.float32: (np.float32, "FLOAT"),
+    tf.float16: (np.float16, "FLOAT"),
+    tf.int32: (np.int32, "INT32"),
+    tf.uint8: (np.uint8, "QUANTIZED_UINT8"),
+    tf.int64: (np.int64, "INT64"),
+}
+
+
+def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
+  """Build tensor data spreading the range [min_value, max_value)."""
+
+  if dtype in _TF_TYPE_INFO:
+    dtype = _TF_TYPE_INFO[dtype][0]
+
+  if dtype in (tf.float32, tf.float16):
+    value = (max_value-min_value)*np.random.random_sample(shape)+min_value
+  elif dtype in (tf.int32, tf.uint8, tf.int64):
+    value = np.random.random_integers(min_value, max_value, shape)
+  return value.astype(dtype)
+
+
+def freeze_graph(session, outputs):
+  """Freeze the current graph.
+
+  Args:
+    session: Tensorflow sessions containing the graph
+    outputs: List of output tensors
+
+  Returns:
+    The frozen graph_def.
+  """
+  return tf_graph_util.convert_variables_to_constants(
+      session, session.graph.as_graph_def(), [x.op.name for x in outputs])
+
+
+def make_control_dep_tests(zip_path):
+  """Make a set of tests that use control dependencies."""
+
+  test_parameters = [{
+      "input_shape": [[], [1, 1, 1, 1], [1, 15, 14, 1], [3, 15, 14, 3]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    filter_value = tf.zeros((3, 3, TEST_INPUT_DEPTH, 8), tf.float32)
+    assert_op = tf.assert_greater_equal(input_tensor, input_tensor - 1)
+    with tf.control_dependencies([assert_op]):
+      out = tf.nn.conv2d(input_tensor, filter_value,
+                         strides=(1, 1, 1, 1), padding="SAME")
+      return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(tf.float32, parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    drop_control_dependency=True)
+
+
+def toco_convert(graph_def_str, input_tensors, output_tensors,
+                 drop_control_dependency=False):
+  """Convert a model's graph def into a tflite model.
+
+  NOTE: this currently shells out to the toco binary, but we would like
+  convert to Python API tooling in the future.
+
+  Args:
+    graph_def_str: Graph def proto in serialized string format.
+    input_tensors: List of input tensor tuples `(name, shape, type)`
+    output_tensors: List of output tensors (names)
+    drop_control_dependency: whether to ignore control dependency nodes.
+
+  Returns:
+    output tflite model, log_txt from conversion
+    or None, log_txt if it did not convert properly.
+  """
+  data_types = [_TF_TYPE_INFO[x[2]][1] for x in input_tensors]
+  opts = toco_options(
+      data_types=data_types,
+      input_arrays=[x[0] for x in input_tensors],
+      shapes=[x[1] for x in input_tensors],
+      output_arrays=output_tensors,
+      drop_control_dependency=drop_control_dependency)
+
+  with tempfile.NamedTemporaryFile() as graphdef_file, \
+       tempfile.NamedTemporaryFile() as output_file, \
+       tempfile.NamedTemporaryFile("w+") as stdout_file:
+    graphdef_file.write(graph_def_str)
+    graphdef_file.flush()
+
+    # TODO(aselle): Switch this to subprocess at some point.
+    cmd = ("%s --input_file=%s --output_file=%s %s > %s 2>&1" %
+           (bin_path, graphdef_file.name, output_file.name, opts,
+            stdout_file.name))
+    exit_code = os.system(cmd)
+    log = (
+        cmd + "exited with code %d" % exit_code + "\n------------------\n" +
+        stdout_file.read())
+    return (None if exit_code != 0 else output_file.read()), log
+
+
+def make_zip_of_tests(zip_path,
+                      test_parameters,
+                      make_graph,
+                      make_test_inputs,
+                      drop_control_dependency=False):
+  """Helper to make a zip file of a bunch of TensorFlow models.
+
+  This does a cartestian product of the dictionary of test_parameters and
+  calls make_graph() for each item in the cartestian product set.
+  If the graph is built successfully, then make_test_inputs() is called to
+  build expected input/output value pairs. The model is then converted to tflite
+  with toco, and the examples are serialized with the tflite model into a zip
+  file (2 files per item in the cartesian product set).
+
+  Args:
+    zip_path: Path of zip file to write
+    test_parameters: Dictionary mapping to lists for each parameter.
+      e.g. `{"strides": [[1,3,3,1], [1,2,2,1]], "foo": [1.2, 1.3]}`
+    make_graph: function that takes current parameters and returns tuple
+      `[input1, input2, ...], [output1, output2, ...]`
+    make_test_inputs: function taking `curr_params`, `session`, `input_tensors`,
+      `output_tensors` and returns tuple `(input_values, output_values)`.
+    drop_control_dependency: whether to ignore control dependency nodes.
+  Raises:
+    RuntimeError: if there are toco errors that can't be ignored.
+  """
+
+  # TODO(aselle): Make this allow multiple inputs outputs.
+  archive = zipfile.PyZipFile(zip_path, "w")
+  zip_manifest = []
+  convert_report = []
+  toco_errors = 0
+  for parameters in test_parameters:
+    keys = parameters.keys()
+    for curr in itertools.product(*parameters.values()):
+      label = zip_path.replace(".zip", "") + (",".join(
+          "%s=%r" % z for z in sorted(zip(keys, curr))).replace(" ", ""))
+      if label[0] == "/":
+        label = label[1:]
+      param_dict = dict(zip(keys, curr))
+
+      def build_example(label, param_dict_real):
+        """Build the model with parameter values set in param_dict_real.
+
+        Args:
+          label: Label of the model (i.e. the filename in the zip).
+          param_dict_real: Parameter dictionary (arguments to the factories
+            make_graph and make_test_inputs)
+        Returns:
+          (tflite_model_binary, report) where tflite_model_binary is the
+          serialized flatbuffer as a string and report is a dictionary with
+          keys `toco_log` (log of toco conversion), `tf_log` (log of tf
+          conversion), `toco` (a string of success status of the conversion),
+          `tf` (a string success status of the conversion).
+        """
+
+        np.random.seed(RANDOM_SEED)
+        report = {"toco": report_lib.NOTRUN, "tf": report_lib.FAILED}
+
+        # Build graph
+        report["tf_log"] = ""
+        report["toco_log"] = ""
+        tf.reset_default_graph()
+
+        with tf.device("/cpu:0"):
+          try:
+            inputs, outputs = make_graph(param_dict_real)
+          except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
+                  ValueError):
+            report["tf_log"] += traceback.format_exc()
+            return None, report
+
+        sess = tf.Session()
+        try:
+          baseline_inputs, baseline_outputs = (make_test_inputs(
+              param_dict_real, sess, inputs, outputs))
+        except (tf.errors.UnimplementedError, tf.errors.InvalidArgumentError,
+                ValueError):
+          report["tf_log"] += traceback.format_exc()
+          return None, report
+        report["toco"] = report_lib.FAILED
+        report["tf"] = report_lib.SUCCESS
+
+        # Convert graph to toco
+        tflite_model_binary, toco_log = toco_convert(
+            sess.graph_def.SerializeToString(),
+            [(input_tensor.name.split(":")[0], input_tensor.get_shape(),
+              input_tensor.dtype) for input_tensor in inputs],
+            [out.name.split(":")[0]
+             for out in outputs], drop_control_dependency)
+        report["toco"] = (report_lib.SUCCESS if tflite_model_binary is not None
+                          else report_lib.FAILED)
+        report["toco_log"] = toco_log
+
+        if FLAGS.save_graphdefs:
+          archive.writestr(label + ".pb",
+                           text_format.MessageToString(sess.graph_def),
+                           zipfile.ZIP_DEFLATED)
+
+        if tflite_model_binary:
+          archive.writestr(label + ".bin", tflite_model_binary,
+                           zipfile.ZIP_DEFLATED)
+          example = {"inputs": baseline_inputs, "outputs": baseline_outputs}
+
+          example_fp = StringIO()
+          write_examples(example_fp, [example])
+          archive.writestr(label + ".inputs",
+                           example_fp.getvalue(), zipfile.ZIP_DEFLATED)
+
+          example_fp2 = StringIO()
+          write_test_cases(example_fp2, label + ".bin", [example])
+          archive.writestr(label + "_tests.txt",
+                           example_fp2.getvalue(), zipfile.ZIP_DEFLATED)
+
+          zip_manifest.append(label + "\n")
+
+        return tflite_model_binary, report
+
+      _, report = build_example(label, param_dict)
+
+      if report["toco"] == report_lib.FAILED:
+        ignore_error = False
+        if not FLAGS.known_bugs_are_errors:
+          for pattern, bug_number in KNOWN_BUGS.items():
+            if re.search(pattern, label):
+              print("Ignored TOCO error due to bug %s" % bug_number)
+              ignore_error = True
+        if not ignore_error:
+          toco_errors += 1
+          print("-----------------\ntoco error!\n%s\n-----------------\n" %
+                report["toco_log"])
+
+      convert_report.append((param_dict, report))
+  report_io = StringIO()
+  report_lib.make_report_table(report_io, zip_path, convert_report)
+  archive.writestr("report.html", report_io.getvalue())
+
+  archive.writestr("manifest.txt", "".join(zip_manifest), zipfile.ZIP_DEFLATED)
+
+  # Log statistics of what succeeded
+  total_conversions = len(convert_report)
+  tf_success = sum(1 for x in convert_report
+                   if x[1]["tf"] == report_lib.SUCCESS)
+  toco_success = sum(1 for x in convert_report
+                     if x[1]["toco"] == report_lib.SUCCESS)
+  percent = 0
+  if tf_success > 0:
+    percent = float(toco_success) / float(tf_success) * 100.
+  tf.logging.info(("Archive %s Considered %d graphs, %d TF evaluated graphs "
+                   " and %d TOCO converted graphs (%.1f%%"), zip_path,
+                  total_conversions, tf_success, toco_success, percent)
+
+  if not FLAGS.ignore_toco_errors and toco_errors > 0:
+    raise RuntimeError(
+        "Found %d errors while generating toco models" % toco_errors)
+
+
+def make_pool_tests(pool_op_in):
+  """Make a set of tests to do average pooling.
+
+  Args:
+    pool_op_in: TensorFlow pooling operation to test  i.e. `tf.nn.avg_pool`.
+
+  Returns:
+    A function representing the true generator (after curried pool_op_in).
+  """
+
+  pool_op = pool_op_in
+
+  def f(zip_path):
+    """Actual function that generates examples.
+
+    Args:
+      zip_path: path to write zip to.
+    """
+
+    # Chose a set of parameters
+    test_parameters = [{
+        "ksize": [[2, 1, 1, 2], [1, 1, 1, 1], [1, 1, 2, 1], [1, 10, 11, 1]],
+        "strides": [[2, 1, 1, 2], [1, 1, 1, 1], [1, 1, 2, 1], [1, 10, 11, 1]],
+        # TODO(aselle): should add in a degenerate shape (e.g. [1, 0, 1, 1]).
+        "input_shape": [[], [1, 1, 1, 1], [1, 15, 14, 1], [3, 15, 14, 3]],
+        "padding": ["SAME", "VALID"],
+        "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+    }]
+
+    def build_graph(parameters):
+      input_tensor = tf.placeholder(
+          dtype=tf.float32, name="input", shape=parameters["input_shape"])
+      out = pool_op(
+          input_tensor,
+          ksize=parameters["ksize"],
+          strides=parameters["strides"],
+          data_format=parameters["data_format"],
+          padding=parameters["padding"])
+      return [input_tensor], [out]
+
+    def build_inputs(parameters, sess, inputs, outputs):
+      input_values = create_tensor_data(tf.float32, parameters["input_shape"])
+      return [input_values], sess.run(
+          outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  return f
+
+
+def make_relu_tests(zip_path):
+  """Make a set of tests to do relu."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[], [1], [2, 3], [1, 1, 1, 1], [1, 3, 4, 3],
+                      [3, 15, 14, 3], [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.relu(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_relu1_tests(zip_path):
+  """Make a set of tests to do relu1."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[], [1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3],
+                      [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    # Note that the following is not supported:
+    #   out = tf.maximum(-1.0, tf.minimum(input_tensor, 1.0))
+    out = tf.minimum(1.0, tf.maximum(input_tensor, -1.0))
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_relu6_tests(zip_path):
+  """Make a set of tests to do relu6."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[], [1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3],
+                      [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.relu(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+# This function tests various TensorFLow functions that generates Const op,
+# including `tf.ones`, `tf.zeros` and random functions.
+def make_constant_tests(zip_path):
+  """Make a set of tests to do constant ops."""
+
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32],
+      "input_shape": [[1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
+  }]
+
+  def build_graph(parameters):
+    # Since Toco & Tflite can't have a single constant op in the entire graph,
+    # this test adds a zero tesnor with a constant op tensor.
+    input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
+                            shape=parameters["input_shape"])
+    out = tf.ones(parameters["input_shape"], dtype=parameters["dtype"]) + input1
+    return [input1], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input1 = np.zeros(parameters["input_shape"],
+                      dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
+    return [input1], sess.run(outputs, feed_dict={inputs[0]: input1})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_add_tests(zip_path):
+  """Make a set of tests to do add with and without broadcast."""
+
+  # These parameters are split because we don't support broadcasting.
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32],
+      "input_shape_1": [[1, 3, 4, 3]],
+      "input_shape_2": [[1, 3, 4, 3]],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape_1": [[5]],
+      "input_shape_2": [[5]],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape_1": [[1, 3, 4, 3]],
+      "input_shape_2": [[3]],
+  }]
+
+  def build_graph(parameters):
+    input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
+                            shape=parameters["input_shape_1"])
+    input2 = tf.placeholder(dtype=parameters["dtype"], name="input2",
+                            shape=parameters["input_shape_2"])
+    out = tf.add(input1, input2)
+    return [input1, input2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input1 = create_tensor_data(parameters["dtype"],
+                                parameters["input_shape_1"])
+    input2 = create_tensor_data(parameters["dtype"],
+                                parameters["input_shape_2"])
+    return [input1, input2], sess.run(
+        outputs, feed_dict={
+            inputs[0]: input1,
+            inputs[1]: input2
+        })
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_mul_tests(zip_path):
+  """Make a set of tests to do mul with and without broadcast."""
+
+  # These parameters are split because we don't support broadcasting.
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32],
+      "input_shape_1": [[1, 3, 4, 3]],
+      "input_shape_2": [[1, 3, 4, 3]],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape_1": [[5]],
+      "input_shape_2": [[5]],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape_1": [[1, 3, 4, 3]],
+      "input_shape_2": [[3]],
+  }]
+
+  def build_graph(parameters):
+    input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
+                            shape=parameters["input_shape_1"])
+    input2 = tf.placeholder(dtype=parameters["dtype"], name="input2",
+                            shape=parameters["input_shape_2"])
+    out = tf.multiply(input1, input2)
+    return [input1, input2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input1 = create_tensor_data(parameters["dtype"],
+                                parameters["input_shape_1"])
+    input2 = create_tensor_data(parameters["dtype"],
+                                parameters["input_shape_2"])
+    return [input1, input2], sess.run(
+        outputs, feed_dict={inputs[0]: input1,
+                            inputs[1]: input2})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_global_batch_norm_tests(zip_path):
+  """Make a set of tests to do batch_norm_with_global_normalization."""
+
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "input_shape": [[1, 1, 6, 2], [3, 4, 5, 4]],
+      "epsilon": [0.1, 0.0001],
+      "scale_after": [True, False],
+  }]
+
+  def build_graph(parameters):
+    """Build the global batch norm testing graph."""
+    input_shape = parameters["input_shape"]
+    scale_shape = input_shape[3]
+
+    scale = create_tensor_data(parameters["dtype"], scale_shape)
+    offset = create_tensor_data(parameters["dtype"], scale_shape)
+    mean = create_tensor_data(parameters["dtype"], scale_shape)
+    variance = create_tensor_data(parameters["dtype"], scale_shape)
+
+    x = create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    x_norm = tf.nn.batch_norm_with_global_normalization(
+        x, mean, variance, scale, offset,
+        parameters["epsilon"], parameters["scale_after"])
+
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.add(input_tensor, x_norm)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_fused_batch_norm_tests(zip_path):
+  """Make a set of tests to do fused_batch_norm."""
+
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "input_shape": [[1, 1, 6, 2]],
+      "epsilon": [0.001, 0.1],
+  }]
+
+  def build_graph(parameters):
+    """Build the testing graph for fused batch normalization."""
+    input_shape = parameters["input_shape"]
+    scale_shape = input_shape[3]
+
+    scale = create_tensor_data(parameters["dtype"], scale_shape)
+    offset = create_tensor_data(parameters["dtype"], scale_shape)
+    mean = create_tensor_data(parameters["dtype"], scale_shape)
+    variance = create_tensor_data(parameters["dtype"], scale_shape)
+
+    x = create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    [x_norm, _, _] = tf.nn.fused_batch_norm(
+        x, scale, offset, mean, variance,
+        parameters["epsilon"], data_format="NHWC", is_training=False)
+
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.add(input_tensor, x_norm)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_conv_tests(zip_path):
+  """Make a set of tests to do convolution."""
+
+  test_parameters = [{
+      "input_shape": [[1, 3, 4, 3]],
+      "filter_shape": [[1, 1, 3, 2]],
+      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+      "padding": ["SAME", "VALID"],
+      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+  }, {
+      "input_shape": [[2, 14, 14, 2]],
+      "filter_shape": [[6, 6, 2, 2]],
+      "strides": [[1, 1, 1, 1], [1, 2, 3, 1]],
+      "padding": ["SAME", "VALID"],
+      "data_format": ["NHWC"],  # TODO(aselle): NCHW  would be good
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    filter_values = create_tensor_data(np.float32, parameters["filter_shape"])
+    out = tf.nn.conv2d(input_tensor, filter_values,
+                       strides=parameters["strides"],
+                       padding=parameters["padding"],
+                       data_format=parameters["data_format"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(np.float32, parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_depthwiseconv_tests(zip_path):
+  """Make a set of tests to do convolution."""
+
+  # Tensorflow only supports equal strides
+  test_parameters = [{
+      "input_shape": [[1, 3, 4, 3], [1, 10, 10, 3]],
+      "filter_size": [[1, 1], [1, 2], [3, 3]],
+      "strides": [[1, 1, 1, 1], [1, 3, 3, 1]],
+      "channel_multiplier": [1, 2],
+      "rate": [[1, 1]],
+      "padding": ["SAME", "VALID"],
+      "data_format": ["NHWC"],
+  }, {
+      "input_shape": [[1, 3, 4, 3]],
+      "filter_size": [[1, 1]],
+      "strides": [[1, 1, 2, 1]],  # TF needs [1, x, x, 1]
+      "channel_multiplier": [2],
+      "rate": [[2, 2]],   #  Only [1, 1] is supported
+      "padding": ["SAME"],
+      "data_format": ["NHWC"],
+  }]
+
+  def build_graph(parameters):
+    """Build a depthwise conv graph given `parameters`."""
+    input_shape = parameters["input_shape"]
+    filter_size = parameters["filter_size"]
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=input_shape)
+    filter_shape = filter_size + [
+        input_shape[3], parameters["channel_multiplier"]]
+    filter_values = create_tensor_data(np.float32, filter_shape)
+    out = tf.nn.depthwise_conv2d(
+        input_tensor, filter_values,
+        strides=parameters["strides"],
+        rate=parameters["rate"],
+        padding=parameters["padding"],
+        data_format=parameters["data_format"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(np.float32, parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_concatenation_tests(zip_path):
+  """Make a set of tests to do concatenatinon."""
+
+  test_parameters = [{
+      "base_shape": [[1, 3, 4, 3], [3, 4]],
+      "num_tensors": [1, 2, 3, 4, 5, 6],
+      "axis": [0, 1, 2, 3],
+  }]
+
+  def get_shape(parameters, delta):
+    """Return a tweaked version of 'base_shape'."""
+    axis = parameters["axis"]
+    shape = parameters["base_shape"][:]
+    if axis < len(shape):
+      shape[axis] += delta
+    return shape
+
+  def build_graph(parameters):
+    all_tensors = []
+    for n in range(0, parameters["num_tensors"]):
+      input_tensor = tf.placeholder(dtype=tf.float32, name=("input%d" % n),
+                                    shape=get_shape(parameters, n))
+      all_tensors.append(input_tensor)
+    out = tf.concat(all_tensors, parameters["axis"])
+    return all_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    all_values = []
+    for n in range(0, parameters["num_tensors"]):
+      input_values = create_tensor_data(np.float32,
+                                        get_shape(parameters, n))
+      all_values.append(input_values)
+    return all_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, all_values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_fully_connected_tests(zip_path):
+  """Make a set of tests to do fully_connected."""
+
+  test_parameters = [{
+      "shape1": [[3, 3]],
+      "shape2": [[3, 3]],
+      "transpose_a": [True, False],
+      "transpose_b": [True, False],
+  }, {
+      "shape1": [[4, 4], [1, 4], [4]],
+      "shape2": [[4, 4], [4, 1], [4]],
+      "transpose_a": [False],
+      "transpose_b": [False],
+  }, {
+      "shape1": [[40, 37]],
+      "shape2": [[37, 40]],
+      "transpose_a": [False],
+      "transpose_b": [False],
+
+  }]
+
+  def build_graph(parameters):
+    input_tensor1 = tf.placeholder(dtype=tf.float32, name="input1",
+                                   shape=parameters["shape1"])
+    input_tensor2 = create_tensor_data(np.float32, parameters["shape2"])
+    out = tf.matmul(input_tensor1, input_tensor2,
+                    transpose_a=parameters["transpose_a"],
+                    transpose_b=parameters["transpose_b"])
+    return [input_tensor1], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values1 = create_tensor_data(np.float32, shape=parameters["shape1"])
+    return [input_values1], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values1])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_l2norm_tests(zip_path):
+  """Make a set of tests to do l2norm."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[5, 7], [1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3],
+                      [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+      "dim": [0, 1, 2, 3, [2, 3], -2],
+      "epsilon": [None, 1e-12, 1e-3],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    if parameters["epsilon"]:
+      out = tf.nn.l2_normalize(
+          input_tensor, parameters["dim"], epsilon=parameters["epsilon"])
+    else:
+      out = tf.nn.l2_normalize(input_tensor, parameters["dim"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_local_response_norm_tests(zip_path):
+  """Make a set of tests to do local_response_norm."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[1, 1, 1, 1], [1, 3, 4, 3], [3, 15, 14, 3]],
+      "depth_radius": [None, 0, 1, 3, 4, 5],
+      "bias": [None, 0.1, 0.3, -0.1],
+      "alpha": [None, 1, 2, -3],
+      "beta": [None, 0.5, 0.25, 2],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.local_response_normalization(
+        input_tensor, depth_radius=parameters["depth_radius"],
+        bias=parameters["bias"], alpha=parameters["alpha"],
+        beta=parameters["beta"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_reshape_tests(zip_path):
+  """Make a set of tests to do reshape."""
+
+  # Alll shapes below are suitable for tensors with 420 elements.
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32],
+      "input_shape": [[3, 4, 5, 7], [4, 105], [21, 5, 2, 2], [420]],
+      "output_shape": [[15, 28], [420], [1, -1, 5, 7], [-1]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.reshape(input_tensor, shape=parameters["output_shape"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_resize_bilinear_tests(zip_path):
+  """Make a set of tests to do resize_bilinear."""
+
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32],
+      "input_shape": [[1, 3, 4, 3], [1, 10, 2, 1]],
+      "size": [[1, 1], [4, 3], [2, 2], [5, 6]],
+      "align_corners": [None, True, False],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.image.resize_bilinear(input_tensor, size=parameters["size"],
+                                   align_corners=parameters["align_corners"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_sigmoid_tests(zip_path):
+  """Make a set of tests to do sigmoid."""
+
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "input_shape": [[1, 3, 4, 3], [4], [], [1, 2, 3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.sigmoid(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_softmax_tests(zip_path):
+  """Make a set of tests to do softmax."""
+
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "input_shape": [[1, 3, 4, 3], [2, 3]],
+      "dim": [-1, 0],
+  }, {
+      "dtype": [tf.float32],
+      "input_shape": [[4, 7]],
+      "dim": [-1, 1],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.nn.softmax(input_tensor, dim=parameters["dim"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_space_to_depth_tests(zip_path):
+  """Make a set of tests to do space_to_depth."""
+
+  test_parameters = [{
+      "dtype": [tf.float32, tf.float16, tf.int32, tf.uint8, tf.int64],
+      "input_shape": [[2, 12, 24, 1]],
+      "block_size": [2, 3, 4],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
+                                  shape=parameters["input_shape"])
+    out = tf.space_to_depth(input_tensor, block_size=parameters["block_size"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(parameters["dtype"],
+                                      parameters["input_shape"])
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
+  """Given an input perform a sequence of TensorFlow ops to produce l2pool."""
+  return tf.sqrt(tf.nn.avg_pool(
+      tf.square(input_tensor), ksize=ksize, strides=strides,
+      padding=padding, data_format=data_format))
+
+
+# Toco binary path provided by the generate rule.
+bin_path = None
+
+
+def main(unused_args):
+  global bin_path
+  def mkdir_if_not_exist(x):
+    if not os.path.isdir(x):
+      os.mkdir(x)
+      if not os.path.isdir(x):
+        raise RuntimeError("Failed to create dir %r" % x)
+
+  if FLAGS.type == "zipped":
+    opstest_path = os.path.join(FLAGS.output_path)
+    mkdir_if_not_exist(opstest_path)
+    def _path(filename):
+      return os.path.join(opstest_path, filename)
+
+    dispatch = {
+        "control_dep.zip": make_control_dep_tests,
+        "add.zip": make_add_tests,
+        "conv.zip": make_conv_tests,
+        "constant.zip": make_constant_tests,
+        "depthwiseconv.zip": make_depthwiseconv_tests,
+        "concat.zip": make_concatenation_tests,
+        "fully_connected.zip": make_fully_connected_tests,
+        "global_batch_norm.zip": make_global_batch_norm_tests,
+        "fused_batch_norm.zip": make_fused_batch_norm_tests,
+        "l2norm.zip": make_l2norm_tests,
+        "local_response_norm.zip": make_local_response_norm_tests,
+        "mul.zip": make_mul_tests,
+        "relu.zip": make_relu_tests,
+        "relu1.zip": make_relu1_tests,
+        "relu6.zip": make_relu6_tests,
+        "l2_pool.zip": make_pool_tests(make_l2_pool),
+        "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
+        "max_pool.zip": make_pool_tests(tf.nn.max_pool),
+        "reshape.zip": make_reshape_tests,
+        "resize_bilinear.zip": make_resize_bilinear_tests,
+        "sigmoid.zip": make_sigmoid_tests,
+        "softmax.zip": make_softmax_tests,
+        "space_to_depth.zip": make_space_to_depth_tests,
+    }
+    out = FLAGS.zip_to_output
+    bin_path = FLAGS.toco
+    if out in dispatch:
+      dispatch[out](_path(out))
+    else:
+      raise RuntimeError("Invalid zip to output %r" % out)
+
+  else:
+    raise RuntimeError("Invalid argument for type of generation.")
+
+
+if __name__ == "__main__":
+  FLAGS, unparsed = parser.parse_known_args()
+
+  if unparsed:
+    print("Usage: %s <path out> zipped <zip file to generate>")
+  else:
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/lite/testing/generate_examples_report.py b/tensorflow/contrib/lite/testing/generate_examples_report.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bcf8cd86a182dca78af5e3ddcbffd748f5fdfce
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/generate_examples_report.py
@@ -0,0 +1,125 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Make HTML tables that report where TF and TOCO failed to convert models.
+
+This is primarily used by generate_examples.py. See it or
+`make_report_table` for more details on usage.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cgi
+import json
+
+FAILED = "FAILED"
+SUCCESS = "SUCCESS"
+NOTRUN = "NOTRUN"
+
+
+def make_report_table(fp, title, reports):
+  """Make an HTML report of the success/failure reports.
+
+  Args:
+    fp: File-like object in which to put the html.
+    title: "Title of the zip file this pertains to."
+    reports: a list of conversion attempts. (report_args, report_vals) i.e.
+      ({"shape": [1,2,3], "type": "tf.float32"},
+       {"tf": "SUCCESS", "toco": "FAILURE", "toco_log": "Unsupported type.",
+        "tf_log": ""})
+  """
+  # sort reports by if TOCO failure and then TF failure (reversed)
+  reports.sort(key=lambda x: x[1]["toco"], reverse=False)
+  reports.sort(key=lambda x: x[1]["tf"], reverse=True)
+  def result_cell(x, row, col):
+    """Produce a cell with the condition string `x`."""
+    s = cgi.escape(repr(x), quote=True)
+    color = "#44ff44" if x == SUCCESS else (
+        "#ff4444" if x == FAILED else "#eeeeee")
+    handler = "ShowLog(%d, %d)" % (row, col)
+    fp.write("<td style='background-color: %s' onclick='%s'>%s</td>\n" % (
+        color, handler, s))
+
+  fp.write("""<html>
+<head>
+<title>tflite report</title>
+<style>
+body { font-family: Arial; }
+th { background-color: #555555; color: #eeeeee; }
+td { vertical-align: top; }
+td.horiz {width: 50%;}
+pre { white-space: pre-wrap; word-break: keep-all; }
+table {width: 100%;}
+</style>
+</head>
+""")
+  # Write the log data to a javascript variable and also make a function
+  # in javascript to show the log when an item is clicked.
+  fp.write("<script> \n")
+  fp.write("""
+function ShowLog(row, col) {
+
+var log = document.getElementById("log");
+log.innerHTML = "<pre>" + data[row][col]  + "</pre>";
+}
+""")
+  fp.write("var data = \n")
+  fp.write(json.dumps([[cgi.escape(x[1]["tf_log"], quote=True),
+                        cgi.escape(x[1]["toco_log"], quote=True)]
+                       for x in reports]))
+  fp.write(";</script>\n")
+
+  # Write the main table and use onclick on the items that have log items.
+  fp.write("""
+<body>
+<h1>TOCO Conversion</h1>
+<h2>%s</h2>
+""" % title)
+
+  # Get a list of keys that are in any of the records.
+  param_keys = {}
+  for params, _ in reports:
+    for k in params.keys():
+      param_keys[k] = True
+
+  fp.write("<table>\n")
+  fp.write("<tr><td class='horiz'>\n")
+  fp.write("<div style='height:1000px; overflow:auto'>\n")
+  fp.write("<table>\n")
+  fp.write("<tr>\n")
+  for p in param_keys:
+    fp.write("<th>%s</th>\n" % cgi.escape(p, quote=True))
+  fp.write("<th>TensorFlow</th>\n")
+  fp.write("<th>TOCO</th>\n")
+  fp.write("</tr>\n")
+  for idx, (params, vals) in enumerate(reports):
+    fp.write("<tr>\n")
+    for p in param_keys:
+      fp.write("  <td>%s</td>\n" % cgi.escape(repr(params[p]), quote=True))
+
+    result_cell(vals["tf"], idx, 0)
+    result_cell(vals["toco"], idx, 1)
+    fp.write("</tr>\n")
+  fp.write("</table>\n")
+  fp.write("</div>\n")
+  fp.write("</td>\n")
+  fp.write("<td class='horiz' id='log'></td></tr>\n")
+  fp.write("</table>\n")
+  fp.write("<script>\n")
+  fp.write("</script>\n")
+  fp.write("""
+    </body>
+    </html>
+    """)
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b3266738c7870df00195d75cfbbf698ce94e040
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -0,0 +1,279 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <gtest/gtest.h>
+#include "re2/re2.h"
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace {
+bool FLAGS_ignore_known_bugs = true;
+}  // namespace
+
+namespace tflite {
+namespace testing {
+
+// TensorFlow system environment for file system called.
+tensorflow::Env* env = tensorflow::Env::Default();
+
+// List of tests that are expected to fail when
+//   --test_arg=--ignore_known_bugs=false
+// Key is a substring of the test name and value is a bug number.
+// TODO(ahentz): make sure we clean this list up frequently.
+std::map<string, string> kBrokenTests = {
+    // Add doesn't support broadcasting.
+    {R"(addd.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+    {R"(muld.*input_shape_1=\[1,3,4,3\],input_shape_2=\[3\])", "68500195"},
+
+    // Add only supports float32. (and "constant" tests use Add)
+    {R"(addd.*int32)", "68808744"},
+    {R"(constant.*int32)", "68808744"},
+    {R"(mul.*int32)", "68808744"},
+
+    // Toco or TFLite has a bug to deal with some constant functions with
+    // more than 1 element.
+    {R"(constant.*input_shape=\[(2|2,2,2,2)\])", "68721522"},
+
+    // L2Norm only supports 4D tensors.
+    {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.\])", "67963684"},
+    {R"(l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
+
+    // L2Norm only works for dim=-1.
+    {R"(l2normdim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(l2normdim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(l2normdim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(l2normdim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(l2normdim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(l2normdim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(l2normdim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(l2normdim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+
+    // ResizeBilinear looks completely incompatible with Tensorflow
+    {R"(resize_bilinear)", "67964336"},
+};
+
+// Allows test data to be unzipped into a temporary directory and makes
+// sure those temporary directories are removed later.
+class ZipEnvironment : public ::testing::Environment {
+ public:
+  ~ZipEnvironment() override {}
+
+  // Delete all temporary directories on teardown.
+  void TearDown() override {
+    for (const auto& dir : temporary_directories_) {
+      tensorflow::int64 undeleted_dirs, undeleted_files;
+      TF_CHECK_OK(
+          env->DeleteRecursively(dir, &undeleted_dirs, &undeleted_files));
+    }
+    temporary_directories_.clear();
+  }
+
+  // Unzip `zip` file into a new temporary directory  `out_dir`.
+  tensorflow::Status UnZip(const string& zip, string* out_dir) {
+    string dir;
+    TF_CHECK_OK(MakeTemporaryDirectory(&dir));
+    tensorflow::SubProcess proc;
+    string unzip_binary =
+        "/usr/bin/unzip";
+    proc.SetProgram(unzip_binary, {"unzip", "-d", dir, zip});
+    proc.SetChannelAction(tensorflow::CHAN_STDOUT, tensorflow::ACTION_PIPE);
+    proc.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
+    if (!proc.Start())
+      return tensorflow::Status(tensorflow::error::UNKNOWN,
+                                "unzip couldn't start");
+    string out, err;
+    int status = proc.Communicate(nullptr, &out, &err);
+    if (WEXITSTATUS(status) == 0) {
+      *out_dir = dir;
+      return tensorflow::Status::OK();
+    } else {
+      return tensorflow::Status(tensorflow::error::UNKNOWN, "unzip failed");
+    }
+  }
+
+ private:
+  // Make a temporary directory and return its name in `temporary`.
+  tensorflow::Status MakeTemporaryDirectory(string* temporary) {
+    if (env->LocalTempFilename(temporary)) {
+      TF_CHECK_OK(env->CreateDir(*temporary));
+      temporary_directories_.push_back(*temporary);
+      return tensorflow::Status::OK();
+    }
+    return tensorflow::Status(tensorflow::error::UNKNOWN,
+                              "make temporary directory failed");
+  }
+
+  std::vector<string> temporary_directories_;
+};
+
+// Return the singleton zip_environment.
+ZipEnvironment* zip_environment() {
+  static ZipEnvironment* env = new ZipEnvironment;
+  return env;
+}
+
+// Read the manifest.txt out of the unarchived zip file. Specifically
+// `original_file` is the original zip file for error messages. `dir` is
+// the temporary directory where the zip file has been unarchived and
+// `test_paths` is the list of test prefixes that were in the manifest.
+// Note, it is an error for a manifest to contain no tests.
+tensorflow::Status ReadManifest(const string& original_file, const string& dir,
+                                std::vector<string>* test_paths) {
+  // Read the newline delimited list of entries in the manifest.
+  std::ifstream manifest_fp(dir + "/manifest.txt");
+  string manifest((std::istreambuf_iterator<char>(manifest_fp)),
+                  std::istreambuf_iterator<char>());
+  size_t pos = 0;
+  int added = 0;
+  while (true) {
+    size_t end_pos = manifest.find("\n", pos);
+    if (end_pos == string::npos) break;
+    string filename = manifest.substr(pos, end_pos - pos);
+    test_paths->push_back(dir + "/" + filename);
+    pos = end_pos + 1;
+    added += 1;
+  }
+  if (!added) {
+    string message = "Test had no examples: " + original_file;
+    return tensorflow::Status(tensorflow::error::UNKNOWN, message.c_str());
+  }
+  return tensorflow::Status::OK();
+}
+
+// Get a list of tests from a zip file `zip_file_name`.
+std::vector<string> UnarchiveZipAndFindTestNames(const string& zip_file_name) {
+  string zip_file = ::tensorflow::testing::TensorFlowSrcRoot() +
+                    "/contrib/lite/testing/optest/" + zip_file_name;
+  string decompress_tmp_dir;
+  TF_CHECK_OK(zip_environment()->UnZip(zip_file, &decompress_tmp_dir));
+  std::vector<string> stuff;
+  TF_CHECK_OK(ReadManifest(zip_file, decompress_tmp_dir, &stuff));
+  return stuff;
+}
+
+class OpsTest : public ::testing::TestWithParam<string> {};
+
+TEST_P(OpsTest, RunStuff) {
+  string test_path = GetParam();
+  string tflite_file = test_path + ".bin";
+  string tflite_examples = test_path + ".inputs";
+  string test_name = test_path.substr(test_path.find_last_of('/'));
+
+  auto model = tflite::FlatBufferModel::BuildFromFile(tflite_file.c_str());
+  std::unique_ptr<tflite::Interpreter> interpreter;
+
+  tflite::ops::builtin::BuiltinOpResolver builtins;
+  ASSERT_EQ(tflite::InterpreterBuilder(*model, builtins)(&interpreter),
+            kTfLiteOk);
+
+  std::vector<tflite::testing::Example> examples;
+  ASSERT_EQ(tflite::testing::ParseExamples(tflite_examples.c_str(), &examples),
+            kTfLiteOk);
+
+  string bug_number;
+  for (const auto& p : kBrokenTests) {
+    if (RE2::PartialMatch(test_name, p.first)) {
+      bug_number = p.second;
+    }
+  }
+
+  for (const auto& example : examples) {
+    ASSERT_EQ(interpreter->inputs().size(), example.inputs.size());
+    auto result = [&]() {
+      TF_LITE_ENSURE_STATUS(FeedExample(interpreter.get(), example));
+      TF_LITE_ENSURE_STATUS(interpreter->Invoke());
+      TF_LITE_ENSURE_STATUS(CheckOutputs(interpreter.get(), example));
+      return kTfLiteOk;
+    }();
+
+    if (bug_number.empty()) {
+      ASSERT_EQ(result, kTfLiteOk);
+    } else {
+      if (FLAGS_ignore_known_bugs) {
+        ASSERT_EQ(result, kTfLiteError)
+            << "Not failing as expected due to http://b/" << bug_number;
+      } else {
+        ASSERT_EQ(result, kTfLiteOk)
+            << "Possibly due to http://b/" << bug_number;
+      }
+    }
+  }
+}
+
+// Instantiate a test. This assumes `zip_base`.zip is a declared data file
+// of this test.
+#define INSTANTIATE_TESTS(zip_base) \
+  INSTANTIATE_TEST_CASE_P(          \
+      zip_base, OpsTest,            \
+      ::testing::ValuesIn(UnarchiveZipAndFindTestNames(#zip_base ".zip")));
+
+INSTANTIATE_TESTS(add)
+INSTANTIATE_TESTS(avg_pool)
+INSTANTIATE_TESTS(concat)
+INSTANTIATE_TESTS(constant)
+INSTANTIATE_TESTS(control_dep)
+INSTANTIATE_TESTS(conv)
+INSTANTIATE_TESTS(depthwiseconv)
+INSTANTIATE_TESTS(fully_connected)
+INSTANTIATE_TESTS(fused_batch_norm)
+INSTANTIATE_TESTS(global_batch_norm)
+INSTANTIATE_TESTS(l2norm)
+INSTANTIATE_TESTS(l2_pool)
+INSTANTIATE_TESTS(local_response_norm)
+INSTANTIATE_TESTS(max_pool)
+INSTANTIATE_TESTS(mul)
+INSTANTIATE_TESTS(relu)
+INSTANTIATE_TESTS(relu1)
+INSTANTIATE_TESTS(relu6)
+INSTANTIATE_TESTS(reshape)
+INSTANTIATE_TESTS(resize_bilinear)
+INSTANTIATE_TESTS(sigmoid)
+INSTANTIATE_TESTS(softmax)
+INSTANTIATE_TESTS(space_to_depth)
+
+}  // namespace testing
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::AddGlobalTestEnvironment(tflite::testing::zip_environment());
+
+  std::vector<tensorflow::Flag> flags = {tensorflow::Flag(
+      "ignore_known_bugs", &FLAGS_ignore_known_bugs,
+      "If a particular model is affected by a known bug, the "
+      "corresponding test should expect the outputs to not match.")};
+  bool success = tensorflow::Flags::Parse(&argc, argv, flags);
+  if (!success || (argc == 2 && !strcmp(argv[1], "--helpfull"))) {
+    fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return 1;
+  }
+
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/testing/message.cc b/tensorflow/contrib/lite/testing/message.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03fae4bb86a30e692dbc7f38bede6154c3a9a303
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/message.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/message.h"
+
+#include <stack>
+
+#include "tensorflow/contrib/lite/testing/tokenize.h"
+
+namespace tflite {
+namespace testing {
+
+// A token processor that builds messages and forward calls to the current
+// message object. Place a new message at the top of the stack when it start
+// and remove it when it is finished.
+class MessageStack : public TokenProcessor {
+ public:
+  // Start a new MessageStack with the given first_node, which will be used to
+  // process freestanding fields and submessages.
+  explicit MessageStack(Message* first_node) {
+    nodes_.push(first_node);
+    valid_ = true;
+  }
+
+  void ConsumeToken(std::string* token) override {
+    if (!valid_) return;
+    Message* current_node = nodes_.top();
+    if (*token == "{") {
+      // This is the beginning of a new message, names after the previous token.
+      if (previous_token_.empty()) {
+        valid_ = false;
+        return;
+      }
+      nodes_.push(current_node ? current_node->AddChild(previous_token_)
+                               : nullptr);
+      previous_token_.clear();
+    } else if (*token == "}") {
+      // A message is being completed. There should be no previous token.  Note
+      // that the top-level message never closes, so we should always have at
+      // least one entry in the stack.
+      if (nodes_.size() == 1 || !previous_token_.empty()) {
+        valid_ = false;
+        return;
+      }
+      if (current_node) {
+        current_node->Finish();
+      }
+      nodes_.pop();
+    } else if (*token == ":") {
+      // We reached the end of the 'key' portion of a field. Store the token
+      // until we have the 'value' portion.
+      if (previous_token_.empty()) {
+        valid_ = false;
+        return;
+      }
+    } else {
+      if (previous_token_.empty()) {
+        previous_token_.swap(*token);
+      } else {
+        // This is the 'value' portion of a field. The previous token is the
+        // 'key'.
+        if (current_node) {
+          current_node->SetField(previous_token_, *token);
+        }
+        previous_token_.clear();
+      }
+    }
+  }
+
+  bool valid() const { return valid_; }
+
+ private:
+  std::stack<Message*> nodes_;
+  std::string previous_token_;
+  bool valid_;
+};
+
+bool Message::Read(std::istream* input, Message* message) {
+  MessageStack stack(message);
+  Tokenize(input, &stack);
+  return stack.valid();
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/message.h b/tensorflow/contrib/lite/testing/message.h
new file mode 100644
index 0000000000000000000000000000000000000000..78ef7e2cbe1c323753ac36f1be06a089e650aa37
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/message.h
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tflite {
+namespace testing {
+
+// A Message is a textual protobuf-like structure that looks like:
+//    tag {
+//      f : "values"
+//      child {
+//        a : 1
+//       }
+//    }
+// This class provides the framework for processing message but does not
+// associate any particular behavior to fields and submessage. In order
+// to properly parse a stream this class must be derived.
+class Message {
+ public:
+  // Reads a stream, tokenizes it and create a new message under the given
+  // top-level message. Returns true if the parsing succeeded.
+  static bool Read(std::istream* input, Message* message);
+
+  Message() {}
+  virtual ~Message() {}
+
+  // Called when a new field is found. For example, when:
+  //   f : "values"
+  // is found, it triggers:
+  //   SetField("f", "values");
+  virtual void SetField(const std::string& name, const std::string& value) {}
+
+  // Called when a submessage is started. For example, when:
+  //   child {
+  // is found, it triggers
+  //   AddChild("child");
+  // If nullptr is returned, the contents of the submessage will be ignored.
+  // Otherwise, the returned Message will be used to handle new fields and new
+  // submessages. The caller should not take ownership of the returned pointer.
+  virtual Message* AddChild(const std::string& name) { return nullptr; }
+
+  // Called when a submessage is completed, that is, whenever a '}' is found.
+  virtual void Finish() {}
+
+ protected:
+  // Takes ownership of the given pointer. Subclasses can use this method if
+  // they don't want to implement their own ownership semantics.
+  Message* Store(Message* n) {
+    children_.emplace_back(n);
+    return n;
+  }
+
+  // Returns a list of all owned submessages.
+  const std::vector<std::unique_ptr<Message>>& Children() const {
+    return children_;
+  }
+
+ private:
+  std::vector<std::unique_ptr<Message>> children_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_MESSAGE_H_
diff --git a/tensorflow/contrib/lite/testing/message_test.cc b/tensorflow/contrib/lite/testing/message_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fb6a49bd6f1ea88f1b48c03dfb08a54626bda2eb
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/message_test.cc
@@ -0,0 +1,121 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/message.h"
+
+#include <map>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+// A hierarchical, key-value store.
+class TestMessage : public Message {
+ public:
+  TestMessage() {}
+  explicit TestMessage(const std::string& text_to_parse) {
+    std::stringstream ss(text_to_parse);
+    finished_ = Message::Read(&ss, this);
+  }
+  void SetField(const std::string& name, const std::string& value) override {
+    fields_[name] = value;
+  }
+  Message* AddChild(const std::string& name) override {
+    TestMessage* m = new TestMessage;
+    m->name_ = name;
+    return Store(m);
+  }
+  void Finish() override { finished_ = true; }
+
+  int NumChildren() const { return Children().size(); }
+
+  const TestMessage* GetChild(int i) const {
+    return dynamic_cast<TestMessage*>(Children()[i].get());
+  }
+
+  int NumFields() const { return fields_.size(); }
+  const std::string& GetField(const std::string& key) const {
+    return fields_.at(key);
+  }
+
+  const std::string& name() const { return name_; }
+  bool finished() const { return finished_; }
+
+ protected:
+  std::string name_;
+  std::map<std::string, std::string> fields_;
+  bool finished_ = false;
+};
+
+TEST(MessageTest, Simple) {
+  TestMessage message("x{a:1 b:2} y{} z{c:3} d:4");
+  ASSERT_TRUE(message.finished());
+
+  ASSERT_EQ(message.NumFields(), 1);
+  EXPECT_EQ(message.GetField("d"), "4");
+
+  ASSERT_EQ(message.NumChildren(), 3);
+
+  auto* x = message.GetChild(0);
+  EXPECT_EQ(x->name(), "x");
+  ASSERT_EQ(x->NumFields(), 2);
+  EXPECT_EQ(x->GetField("a"), "1");
+  EXPECT_EQ(x->GetField("b"), "2");
+
+  auto* y = message.GetChild(1);
+  EXPECT_EQ(y->name(), "y");
+  ASSERT_EQ(y->NumFields(), 0);
+
+  auto* z = message.GetChild(2);
+  EXPECT_EQ(z->name(), "z");
+  ASSERT_EQ(z->NumFields(), 1);
+  EXPECT_EQ(z->GetField("c"), "3");
+}
+
+TEST(MessageTest, Unnamed) {
+  TestMessage message("x{c:3} {} y{d:4}");
+  ASSERT_FALSE(message.finished());
+  EXPECT_EQ(message.NumChildren(), 1);
+}
+
+TEST(MessageTest, TooManyBraces) {
+  TestMessage message("x{c:3} } y{d:4}");
+  ASSERT_FALSE(message.finished());
+  EXPECT_EQ(message.NumChildren(), 1);
+}
+
+TEST(MessageTest, LeftoverToken) {
+  TestMessage message("x{c:3} z{test} y{d:4}");
+  ASSERT_FALSE(message.finished());
+  EXPECT_EQ(message.NumChildren(), 2);
+}
+
+TEST(MessageTest, MissingKey) {
+  TestMessage message("x{c:3} z{:test} y{d:4}");
+  ASSERT_FALSE(message.finished());
+  EXPECT_EQ(message.NumChildren(), 2);
+}
+
+TEST(MessageTest, MissingValue) {
+  TestMessage message("x{c:3} z{test:} y{d:4}");
+  ASSERT_FALSE(message.finished());
+  EXPECT_EQ(message.NumChildren(), 2);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/nnapi_example.cc b/tensorflow/contrib/lite/testing/nnapi_example.cc
new file mode 100644
index 0000000000000000000000000000000000000000..74f6cfc3de5d209671c38595434a43128966bb0e
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/nnapi_example.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// NOTE: this is an example driver that converts a tflite model to TensorFlow.
+// This is an example that will be integrated more tightly into tflite in
+// the future.
+//
+// Usage: bazel run -c opt \
+// tensorflow/contrib/lite/nnapi:nnapi_example -- <filename>
+//
+#include <cstdarg>
+#include <cstdio>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+
+// TODO(aselle): FATAL leaves resources hanging.
+void FATAL(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+  fflush(stderr);
+  exit(1);
+}
+
+#define CHECK_TFLITE_SUCCESS(x)                       \
+  if (x != kTfLiteOk) {                               \
+    FATAL("Aborting since tflite returned failure."); \
+  }
+
+void Interpret(const char* filename, const char* examples_filename,
+               bool use_nnapi) {
+  // TODO(aselle): Resize of input image should go here
+  // ...
+  // For now I am allocating all tensors. This means I am fixed size.
+  // So I am not using the variable size ability yet.
+  fprintf(stderr, "example file %s\n", examples_filename);
+  std::vector<tflite::testing::Example> examples;
+  CHECK_TFLITE_SUCCESS(
+      tflite::testing::ParseExamples(examples_filename, &examples));
+
+  for (const tflite::testing::Example& example : examples) {
+    auto model = tflite::FlatBufferModel::BuildFromFile(filename);
+    if (!model) FATAL("Cannot read file %s\n", filename);
+    std::unique_ptr<tflite::Interpreter> interpreter;
+    tflite::ops::builtin::BuiltinOpResolver builtins;
+
+    CHECK_TFLITE_SUCCESS(
+        tflite::InterpreterBuilder(*model, builtins)(&interpreter));
+
+    printf("Use nnapi is set to: %d\n", use_nnapi);
+    interpreter->UseNNAPI(use_nnapi);
+    CHECK_TFLITE_SUCCESS(
+        tflite::testing::FeedExample(interpreter.get(), example));
+
+    {
+      TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+      if (float* data =
+              interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+        size_t num = tensor->bytes / sizeof(float);
+        for (float* p = data; p < data + num; p++) {
+          *p = 0;
+        }
+      }
+    }
+    interpreter->Invoke();
+
+    CHECK_TFLITE_SUCCESS(
+        tflite::testing::CheckOutputs(interpreter.get(), example));
+
+    printf("Result:\n");
+    TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+    if (float* data =
+            interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+      size_t num = tensor->bytes / sizeof(float);
+      for (float* p = data; p < data + num; p++) {
+        printf(" %f", *p);
+      }
+    }
+  }
+}
+
+int main(int argc, char* argv[]) {
+  bool use_nnapi = true;
+  if (argc == 4) {
+    use_nnapi = strcmp(argv[3], "1") == 0 ? true : false;
+  }
+  if (argc < 3) {
+    fprintf(stderr,
+            "Compiled " __DATE__ __TIME__
+            "\n"
+            "Usage!!!: %s <tflite model> <examples to test> "
+            "{ use nn api i.e. 0,1}\n",
+            argv[0]);
+    return 1;
+  }
+  Interpret(argv[1], argv[2], use_nnapi);
+  return 0;
+}
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.cc b/tensorflow/contrib/lite/testing/parse_testdata.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d745ed27158cdad55bdcd97162cb3dfa9e32c112
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/parse_testdata.cc
@@ -0,0 +1,335 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Parses tflite example input data.
+// Format is ASCII
+// TODO(aselle): Switch to protobuf, but the android team requested a simple
+// ASCII file.
+#include "tensorflow/contrib/lite/testing/parse_testdata.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <streambuf>
+
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/testing/message.h"
+#include "tensorflow/contrib/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+// Fatal error if parse error occurs
+#define PARSE_CHECK_EQ(filename, current_line, x, y)                         \
+  if ((x) != (y)) {                                                          \
+    fprintf(stderr, "Parse Error @ %s:%d\n  File %s\n  Line %d, %s != %s\n", \
+            __FILE__, __LINE__, filename, current_line + 1, #x, #y);         \
+    return kTfLiteError;                                                     \
+  }
+
+// Breakup a "," delimited line into a std::vector<std::string>.
+// This is extremely inefficient, and just used for testing code.
+// TODO(aselle): replace with absl when we use it.
+std::vector<std::string> ParseLine(const std::string& line) {
+  size_t pos = 0;
+  std::vector<std::string> elements;
+  while (true) {
+    size_t end = line.find(',', pos);
+    if (end == std::string::npos) {
+      elements.push_back(line.substr(pos));
+      break;
+    } else {
+      elements.push_back(line.substr(pos, end - pos));
+    }
+    pos = end + 1;
+  }
+  return elements;
+}
+
+}  // namespace
+
+// Given a `filename`, produce a vector of Examples corresopnding
+// to test cases that can be applied to a tflite model.
+TfLiteStatus ParseExamples(const char* filename,
+                           std::vector<Example>* examples) {
+  std::ifstream fp(filename);
+  if (!fp.good()) {
+    fprintf(stderr, "Could not read '%s'\n", filename);
+    return kTfLiteError;
+  }
+  std::string str((std::istreambuf_iterator<char>(fp)),
+                  std::istreambuf_iterator<char>());
+  size_t pos = 0;
+
+  // \n and , delimit parse a file.
+  std::vector<std::vector<std::string>> csv;
+  while (true) {
+    size_t end = str.find('\n', pos);
+
+    if (end == std::string::npos) {
+      csv.emplace_back(ParseLine(str.substr(pos)));
+      break;
+    }
+    csv.emplace_back(ParseLine(str.substr(pos, end - pos)));
+    pos = end + 1;
+  }
+
+  int current_line = 0;
+  PARSE_CHECK_EQ(filename, current_line, csv[0][0], "test_cases");
+  int example_count = std::stoi(csv[0][1]);
+  current_line++;
+
+  auto parse_tensor = [&filename, &current_line,
+                       &csv](FloatTensor* tensor_ptr) {
+    PARSE_CHECK_EQ(filename, current_line, csv[current_line][0], "dtype");
+    current_line++;
+    // parse shape
+    PARSE_CHECK_EQ(filename, current_line, csv[current_line][0], "shape");
+    size_t elements = 1;
+    FloatTensor& tensor = *tensor_ptr;
+
+    for (size_t i = 1; i < csv[current_line].size(); i++) {
+      const auto& shape_part_to_parse = csv[current_line][i];
+      if (shape_part_to_parse.empty()) {
+        // Case of a 0-dimensional shape
+        break;
+      }
+      int shape_part = std::stoi(shape_part_to_parse);
+      elements *= shape_part;
+      tensor.shape.push_back(shape_part);
+    }
+    current_line++;
+    // parse data
+    PARSE_CHECK_EQ(filename, current_line, csv[current_line].size() - 1,
+                   elements);
+    for (size_t i = 1; i < csv[current_line].size(); i++) {
+      tensor.flat_data.push_back(std::stof(csv[current_line][i]));
+    }
+    current_line++;
+
+    return kTfLiteOk;
+  };
+
+  for (int example_idx = 0; example_idx < example_count; example_idx++) {
+    Example example;
+    PARSE_CHECK_EQ(filename, current_line, csv[current_line][0], "inputs");
+    int inputs = std::stoi(csv[current_line][1]);
+    current_line++;
+    // parse dtype
+    for (int input_index = 0; input_index < inputs; input_index++) {
+      example.inputs.push_back(FloatTensor());
+      TF_LITE_ENSURE_STATUS(parse_tensor(&example.inputs.back()));
+    }
+
+    PARSE_CHECK_EQ(filename, current_line, csv[current_line][0], "outputs");
+    int outputs = std::stoi(csv[current_line][1]);
+    current_line++;
+    for (int input_index = 0; input_index < outputs; input_index++) {
+      example.outputs.push_back(FloatTensor());
+      TF_LITE_ENSURE_STATUS(parse_tensor(&example.outputs.back()));
+    }
+    examples->emplace_back(example);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus FeedExample(tflite::Interpreter* interpreter,
+                         const Example& example) {
+  // Resize inputs to match example & allocate.
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_index = interpreter->inputs()[i];
+
+    TF_LITE_ENSURE_STATUS(
+        interpreter->ResizeInputTensor(input_index, example.inputs[i].shape));
+  }
+  TF_LITE_ENSURE_STATUS(interpreter->AllocateTensors());
+  // Copy data into tensors.
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_index = interpreter->inputs()[i];
+    if (float* data = interpreter->typed_tensor<float>(input_index)) {
+      for (size_t idx = 0; idx < example.inputs[i].flat_data.size(); idx++) {
+        data[idx] = example.inputs[i].flat_data[idx];
+      }
+    } else if (int32_t* data =
+                   interpreter->typed_tensor<int32_t>(input_index)) {
+      for (size_t idx = 0; idx < example.inputs[i].flat_data.size(); idx++) {
+        data[idx] = example.inputs[i].flat_data[idx];
+      }
+    } else {
+      fprintf(stderr, "input[%zu] was not float or int data\n", i);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
+                          const Example& example) {
+  constexpr double kRelativeThreshold = 1e-2f;
+  constexpr double kAbsoluteThreshold = 1e-4f;
+
+  ErrorReporter* context = DefaultErrorReporter();
+  int model_outputs = interpreter->outputs().size();
+  TF_LITE_ENSURE_EQ(context, model_outputs, example.outputs.size());
+  for (size_t i = 0; i < interpreter->outputs().size(); i++) {
+    int output_index = interpreter->outputs()[i];
+    if (const float* data = interpreter->typed_tensor<float>(output_index)) {
+      for (size_t idx = 0; idx < example.outputs[i].flat_data.size(); idx++) {
+        float computed = data[idx];
+        float reference = example.outputs[0].flat_data[idx];
+        float diff = std::abs(computed - reference);
+        bool error_is_large = false;
+        // For very small numbers, try absolute error, otherwise go with
+        // relative.
+        if (std::abs(reference) < kRelativeThreshold) {
+          error_is_large = (diff > kAbsoluteThreshold);
+        } else {
+          error_is_large = (diff > kRelativeThreshold * std::abs(reference));
+        }
+        if (error_is_large) {
+          fprintf(stdout, "output[%zu][%zu] did not match %f vs reference %f\n",
+                  i, idx, data[idx], reference);
+          return kTfLiteError;
+        }
+      }
+      fprintf(stderr, "\n");
+    } else if (const int32_t* data =
+                   interpreter->typed_tensor<int32_t>(output_index)) {
+      for (size_t idx = 0; idx < example.outputs[i].flat_data.size(); idx++) {
+        int32_t computed = data[idx];
+        int32_t reference = example.outputs[0].flat_data[idx];
+        if (std::abs(computed - reference) > 0) {
+          fprintf(stderr, "output[%zu][%zu] did not match %d vs reference %f\n",
+                  i, idx, data[idx], example.outputs[0].flat_data[idx]);
+          return kTfLiteError;
+        }
+      }
+      fprintf(stderr, "\n");
+    } else {
+      fprintf(stderr, "output[%zu] was not float or int data\n", i);
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+// Process an 'invoke' message, triggering execution of the test runner, as
+// well as verification of outputs. An 'invoke' message looks like:
+//   invoke {
+//     id: xyz
+//     input: 1,2,1,1,1,2,3,4
+//     output: 4,5,6
+//   }
+class Invoke : public Message {
+ public:
+  explicit Invoke(TestRunner* test_runner) : test_runner_(test_runner) {
+    expected_inputs_ = test_runner->GetInputs();
+    expected_outputs_ = test_runner->GetOutputs();
+  }
+
+  void SetField(const std::string& name, const std::string& value) override {
+    if (name == "id") {
+      test_runner_->SetInvocationId(value);
+    } else if (name == "input") {
+      if (expected_inputs_.empty()) {
+        return test_runner_->Invalidate("Too many inputs");
+      }
+      test_runner_->SetInput(*expected_inputs_.begin(), value);
+      expected_inputs_.erase(expected_inputs_.begin());
+    } else if (name == "output") {
+      if (expected_outputs_.empty()) {
+        return test_runner_->Invalidate("Too many outputs");
+      }
+      test_runner_->SetExpectation(*expected_outputs_.begin(), value);
+      expected_outputs_.erase(expected_outputs_.begin());
+    }
+  }
+  void Finish() override {
+    test_runner_->Invoke();
+    test_runner_->CheckResults();
+  }
+
+ private:
+  std::vector<int> expected_inputs_;
+  std::vector<int> expected_outputs_;
+
+  TestRunner* test_runner_;
+};
+
+// Process an 'reshape' message, triggering resizing of the input tensors via
+// the test runner. A 'reshape' message looks like:
+//   reshape {
+//     input: 1,2,1,1,1,2,3,4
+//   }
+class Reshape : public Message {
+ public:
+  explicit Reshape(TestRunner* test_runner) : test_runner_(test_runner) {
+    expected_inputs_ = test_runner->GetInputs();
+  }
+
+  void SetField(const std::string& name, const std::string& value) override {
+    if (name == "input") {
+      if (expected_inputs_.empty()) {
+        return test_runner_->Invalidate("Too many inputs to reshape");
+      }
+      test_runner_->ReshapeTensor(*expected_inputs_.begin(), value);
+      expected_inputs_.erase(expected_inputs_.begin());
+    }
+  }
+
+ private:
+  std::vector<int> expected_inputs_;
+  TestRunner* test_runner_;
+};
+
+// This is the top-level message in a test file.
+class TestData : public Message {
+ public:
+  explicit TestData(TestRunner* test_runner) : test_runner_(test_runner) {}
+
+  void SetField(const std::string& name, const std::string& value) override {
+    if (name == "load_model") {
+      test_runner_->LoadModel(value);
+    } else if (name == "init_state") {
+      test_runner_->AllocateTensors();
+      for (int id : Split<int>(value, ",")) {
+        test_runner_->ResetTensor(id);
+      }
+    }
+  }
+  Message* AddChild(const std::string& s) override {
+    if (s == "invoke") {
+      test_runner_->AllocateTensors();
+      return Store(new Invoke(test_runner_));
+    } else if (s == "reshape") {
+      return Store(new Reshape(test_runner_));
+    }
+    return nullptr;
+  }
+
+ private:
+  TestRunner* test_runner_;
+};
+
+bool ParseAndRunTests(std::istream* input, TestRunner* test_runner) {
+  TestData test_data(test_runner);
+  Message::Read(input, &test_data);
+  return test_runner->IsValid() && test_runner->GetOverallSuccess();
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/parse_testdata.h b/tensorflow/contrib/lite/testing/parse_testdata.h
new file mode 100644
index 0000000000000000000000000000000000000000..90839fe24550b6c4a0a3a3f4115c479a71580bb0
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/parse_testdata.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
+
+#include <vector>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/testing/test_runner.h"
+
+namespace tflite {
+namespace testing {
+
+// Shape and data for a float tensor
+struct FloatTensor {
+  std::vector<int> shape;
+  std::vector<float> flat_data;
+};
+
+// A prescribed input, output example
+struct Example {
+  std::vector<FloatTensor> inputs;
+  std::vector<FloatTensor> outputs;
+};
+
+// Parses an example input and output file (used for unit tests)
+TfLiteStatus ParseExamples(const char* filename,
+                           std::vector<Example>* examples);
+
+// Inputs Tensors into a TensorFlow lite interpreter. Note, this will run
+// interpreter.AllocateTensors();
+TfLiteStatus FeedExample(tflite::Interpreter* interpreter, const Example&);
+
+// Check outputs against (already) evaluated result.
+TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter, const Example&);
+
+// Parses a test description and feeds the given test runner with data.
+// The input format is similar to an ASCII proto:
+//   // Loads model 'add.bin' from the TestRunner's model directory.
+//   load_model: "add.bin"
+//   // Changes the shape of inputs, provided in the same order they appear
+//   // in the model.
+//   reshape {
+//     input: "1,224,224,3"
+//     input: "1,3,4,1"
+//   }
+//   // Fills the given persistent tensors with zeros.
+//   init_state: 0,1,2,3
+//   // Invokes the interpreter with the given input and checks that it
+//   // produces the expected output. Inputs and outputs should be specified in
+//   // the order they appear in the model.
+//   invoke {
+//     input: "1,2,3,4,56"
+//     input: "0.1,0.2,0.3,4.3,56.4"
+//     output: "12,3,4,545,3"
+//     output: "0.01,0.02"
+//   }
+bool ParseAndRunTests(std::istream* input, TestRunner* test_runner);
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_NNAPI_PARSE_TESTDATA_H_
diff --git a/tensorflow/contrib/lite/testing/split.cc b/tensorflow/contrib/lite/testing/split.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5836f4ff049b70c00d22524a3bf3327074281f3a
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/split.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+std::vector<std::pair<size_t, size_t>> SplitToPos(const string& s,
+                                                  const string& delimiter) {
+  std::vector<std::pair<size_t, size_t>> fields;
+  if (delimiter.length() == 0) {
+    fields.emplace_back(0, s.length());
+    return fields;
+  }
+  size_t pos = 0;
+  size_t start = 0;
+  while ((pos = s.find(delimiter, start)) != string::npos) {
+    if (pos != start) {
+      fields.emplace_back(start, pos);
+    }
+    start = pos + delimiter.length();
+  }
+  if (start != s.length()) {
+    fields.emplace_back(start, s.length());
+  }
+  return fields;
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/split.h b/tensorflow/contrib/lite/testing/split.h
new file mode 100644
index 0000000000000000000000000000000000000000..24071442e8929f37443df1b98d22711b3024b87c
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/split.h
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
+
+#include <cstdlib>
+#include <string>
+#include <utility>
+#include <vector>
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Splits a string based on the given delimiter string. Each pair in the
+// returned vector has the start and past-the-end positions for each of the
+// parts of the original string. Empty fields are not represented in the
+// output.
+std::vector<std::pair<size_t, size_t>> SplitToPos(const string& s,
+                                                  const string& delimiter);
+
+// Splits the given string and converts each part to the given T.
+template <typename T>
+std::vector<T> Split(const string& s, const string& delimiter);
+
+template <>
+inline std::vector<string> Split(const string& s, const string& delimiter) {
+  std::vector<string> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(s.substr(p.first, p.second - p.first));
+  }
+  return fields;
+}
+
+template <>
+inline std::vector<int> Split(const string& s, const string& delimiter) {
+  std::vector<int> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(strtol(s.data() + p.first, nullptr, 10));
+  }
+  return fields;
+}
+
+template <>
+inline std::vector<float> Split(const string& s, const string& delimiter) {
+  std::vector<float> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(strtod(s.data() + p.first, nullptr));
+  }
+  return fields;
+}
+
+template <>
+inline std::vector<uint8_t> Split(const string& s, const string& delimiter) {
+  std::vector<uint8_t> fields;
+  for (const auto& p : SplitToPos(s, delimiter)) {
+    fields.push_back(strtol(s.data() + p.first, nullptr, 10));
+  }
+  return fields;
+}
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_SPLIT_H_
diff --git a/tensorflow/contrib/lite/testing/split_test.cc b/tensorflow/contrib/lite/testing/split_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d1e25d9c7dab50984928adfe0d7392675578662
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/split_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/split.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Pair;
+
+TEST(SplitTest, SplitToPos) {
+  EXPECT_THAT(SplitToPos("test;:1-2-3 ;: test", ";:"),
+              ElementsAre(Pair(0, 4), Pair(6, 12), Pair(14, 19)));
+  EXPECT_THAT(SplitToPos("test;:1-2-3 ;: test", ":"),
+              ElementsAre(Pair(0, 5), Pair(6, 13), Pair(14, 19)));
+  EXPECT_THAT(SplitToPos("test", ":"), ElementsAre(Pair(0, 4)));
+  EXPECT_THAT(SplitToPos("test ", ":"), ElementsAre(Pair(0, 5)));
+  EXPECT_THAT(SplitToPos("", ":"), ElementsAre());
+  EXPECT_THAT(SplitToPos("test ", ""), ElementsAre(Pair(0, 5)));
+  EXPECT_THAT(SplitToPos("::::", ":"), ElementsAre());
+}
+
+TEST(SplitTest, SplitString) {
+  EXPECT_THAT(Split<string>("A;B;C", ";"), ElementsAre("A", "B", "C"));
+}
+
+TEST(SplitTest, SplitFloat) {
+  EXPECT_THAT(Split<float>("1.0 B 1e-5", " "), ElementsAre(1.0, 0.0, 1e-5));
+}
+
+TEST(SplitTest, SplitInt) {
+  EXPECT_THAT(Split<int>("1,-1,258", ","), ElementsAre(1, -1, 258));
+}
+
+TEST(SplitTest, SplitUint8) {
+  EXPECT_THAT(Split<uint8_t>("1,-1,258", ","), ElementsAre(1, 255, 2));
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/contrib/lite/testing/test_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4b26949b57e0702ac5554afd766a6072af268a4
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/test_runner.h
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// This is the base class for processing test data. Each one of the virtual
+// methods must be implemented to forward the data to the appropriate executor
+// (e.g. TF Lite's interpreter, or the NNAPI).
+class TestRunner {
+ public:
+  TestRunner() {}
+  virtual ~TestRunner() {}
+
+  // Load the given model, as a path relative to SetModelBaseDir().
+  virtual void LoadModel(const string& bin_file_path) = 0;
+
+  // Return the list of input tensors in the loaded model.
+  virtual const std::vector<int>& GetInputs() = 0;
+
+  // Return the list of output tensors in the loaded model.
+  virtual const std::vector<int>& GetOutputs() = 0;
+
+  // Prepare for a run by resize the given tensor. The given 'id' is
+  // guaranteed to be one of the ids returned by GetInputs().
+  virtual void ReshapeTensor(int id, const string& csv_values) = 0;
+
+  // Reserve memory for all tensors.
+  virtual void AllocateTensors() = 0;
+
+  // Set the given tensor to some initial state, usually zero. This is
+  // used to reset persistent buffers in a model.
+  virtual void ResetTensor(int id) = 0;
+
+  // Define the contents of the given input tensor. The given 'id' is
+  // guaranteed to be one of the ids returned by GetInputs().
+  virtual void SetInput(int id, const string& csv_values) = 0;
+
+  // Define what should be expected for an output tensor after Invoke() runs.
+  // The given 'id' is guaranteed to be one of the ids returned by
+  // GetOutputs().
+  virtual void SetExpectation(int id, const string& csv_values) = 0;
+
+  // Run the model.
+  virtual void Invoke() = 0;
+
+  // Verify that the contents of all outputs conform to the existing
+  // expectations. Return true if there are no expectations or they are all
+  // satisfied.
+  virtual bool CheckResults() = 0;
+
+  // Set the base path for loading models.
+  void SetModelBaseDir(const string& path) {
+    model_base_dir_ = path;
+    if (path[path.length() - 1] != '/') {
+      model_base_dir_ += "/";
+    }
+  }
+
+  // Return the full path of a model.
+  string GetFullPath(const string& path) { return model_base_dir_ + path; }
+
+  // Give an id to the next invocation to make error reporting more meaningful.
+  void SetInvocationId(const string& id) { invocation_id_ = id; }
+  const string& GetInvocationId() const { return invocation_id_; }
+
+  // Invalidate the test runner, preventing it from executing any further.
+  void Invalidate(const string& error_message) {
+    error_message_ = error_message;
+  }
+  bool IsValid() const { return error_message_.empty(); }
+  const string& GetErrorMessage() const { return error_message_; }
+
+  // Handle the overall success of this test runner. This will be true if all
+  // invocations were successful.
+  void SetOverallSuccess(bool value) { overall_success_ = value; }
+  bool GetOverallSuccess() const { return overall_success_; }
+
+ protected:
+  // A helper to check of the given number of values is consistent with the
+  // number of bytes in a tensor of type T. When incompatibles sizes are found,
+  // the test runner is invalidated and false is returned.
+  template <typename T>
+  bool CheckSizes(size_t tensor_bytes, size_t num_values) {
+    size_t num_tensor_elements = tensor_bytes / sizeof(T);
+    if (num_tensor_elements != num_values) {
+      Invalidate("Expected '" + std::to_string(num_tensor_elements) +
+                 "' elements for a tensor, but only got '" +
+                 std::to_string(num_values) + "'");
+      return false;
+    }
+    return true;
+  }
+
+ private:
+  string model_base_dir_;
+  string invocation_id_;
+  bool overall_success_ = true;
+
+  string error_message_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TEST_RUNNER_H_
diff --git a/tensorflow/contrib/lite/testing/test_runner_test.cc b/tensorflow/contrib/lite/testing/test_runner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f712a5347a042990ae5adb9d44325dd683193168
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/test_runner_test.cc
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/test_runner.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+class ConcreteTestRunner : public TestRunner {
+ public:
+  void LoadModel(const string& bin_file_path) override {}
+  const std::vector<int>& GetInputs() override { return ids_; }
+  const std::vector<int>& GetOutputs() override { return ids_; }
+  void ReshapeTensor(int id, const string& csv_values) override {}
+  void AllocateTensors() override {}
+  void ResetTensor(int id) override {}
+  void SetInput(int id, const string& csv_values) override {}
+  void SetExpectation(int id, const string& csv_values) override {}
+  void Invoke() override {}
+  bool CheckResults() override { return true; }
+  bool CheckFloatSizes(size_t bytes, size_t values) {
+    return CheckSizes<float>(bytes, values);
+  }
+
+ private:
+  std::vector<int> ids_;
+};
+
+TEST(TestRunner, ModelPath) {
+  ConcreteTestRunner runner;
+  EXPECT_EQ(runner.GetFullPath("test.bin"), "test.bin");
+  runner.SetModelBaseDir("/tmp");
+  EXPECT_EQ(runner.GetFullPath("test.bin"), "/tmp/test.bin");
+}
+
+TEST(TestRunner, InvocationId) {
+  ConcreteTestRunner runner;
+  EXPECT_EQ(runner.GetInvocationId(), "");
+  runner.SetInvocationId("X");
+  EXPECT_EQ(runner.GetInvocationId(), "X");
+}
+
+TEST(TestRunner, Invalidation) {
+  ConcreteTestRunner runner;
+  EXPECT_TRUE(runner.IsValid());
+  EXPECT_EQ(runner.GetErrorMessage(), "");
+  runner.Invalidate("Some Error");
+  EXPECT_FALSE(runner.IsValid());
+  EXPECT_EQ(runner.GetErrorMessage(), "Some Error");
+}
+
+TEST(TestRunner, OverallSuccess) {
+  ConcreteTestRunner runner;
+  EXPECT_TRUE(runner.GetOverallSuccess());
+  runner.SetOverallSuccess(false);
+  EXPECT_FALSE(runner.GetOverallSuccess());
+}
+
+TEST(TestRunner, CheckSizes) {
+  ConcreteTestRunner runner;
+  EXPECT_TRUE(runner.CheckFloatSizes(16, 4));
+  EXPECT_FALSE(runner.CheckFloatSizes(16, 2));
+  EXPECT_EQ(runner.GetErrorMessage(),
+            "Expected '4' elements for a tensor, but only got '2'");
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf9df2ec264bcff7f836a70db37afe8a5ce01c28
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -0,0 +1,208 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+
+#include <iostream>
+
+#include "tensorflow/contrib/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+// Returns the value in the given position in a tensor.
+template <typename T>
+T Value(const TfLitePtrUnion& data, int index);
+template <>
+float Value(const TfLitePtrUnion& data, int index) {
+  return data.f[index];
+}
+template <>
+uint8_t Value(const TfLitePtrUnion& data, int index) {
+  return data.uint8[index];
+}
+
+template <typename T>
+void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
+  T* input_ptr = reinterpret_cast<T*>(data->raw);
+  for (const T& v : values) {
+    *input_ptr = v;
+    ++input_ptr;
+  }
+}
+
+}  // namespace
+
+class TfLiteDriver::Expectation {
+ public:
+  Expectation() { data_.raw = nullptr; }
+  ~Expectation() { delete[] data_.raw; }
+  template <typename T>
+  void SetData(const string& csv_values) {
+    const auto& values = testing::Split<T>(csv_values, ",");
+    data_.raw = new char[values.size() * sizeof(T)];
+    SetTensorData(values, &data_);
+  }
+
+  bool Check(bool verbose, const TfLiteTensor& tensor) {
+    switch (tensor.type) {
+      case kTfLiteFloat32:
+        return TypedCheck<float>(verbose, tensor);
+      case kTfLiteUInt8:
+        return TypedCheck<uint8_t>(verbose, tensor);
+      default:
+        return false;
+    }
+  }
+
+ private:
+  template <typename T>
+  bool TypedCheck(bool verbose, const TfLiteTensor& tensor) {
+    int tensor_size = tensor.bytes / sizeof(T);
+
+    bool good_output = true;
+    for (int i = 0; i < tensor_size; ++i) {
+      if (std::abs(Value<T>(data_, i) - Value<T>(tensor.data, i)) > 1e-5) {
+        good_output = false;
+        if (verbose) {
+          std::cerr << "  index " << i << ": " << Value<T>(data_, i)
+                    << " != " << Value<T>(tensor.data, i) << std::endl;
+        }
+      }
+    }
+    return good_output;
+  }
+
+  TfLitePtrUnion data_;
+};
+
+TfLiteDriver::TfLiteDriver(bool use_nnapi) : use_nnapi_(use_nnapi) {}
+TfLiteDriver::~TfLiteDriver() {}
+
+void TfLiteDriver::AllocateTensors() {
+  if (must_allocate_tensors_) {
+    if (interpreter_->AllocateTensors() != kTfLiteOk) {
+      std::cerr << "Failed to allocate tensors" << std::endl;
+      abort();
+    }
+    must_allocate_tensors_ = false;
+  }
+}
+
+void TfLiteDriver::LoadModel(const string& bin_file_path) {
+  if (!IsValid()) return;
+  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
+
+  model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
+  if (!model_) {
+    Invalidate("Failed to mmap model " + bin_file_path);
+    return;
+  }
+  ops::builtin::BuiltinOpResolver builtins;
+  InterpreterBuilder(*model_, builtins)(&interpreter_);
+  if (!interpreter_) {
+    Invalidate("Failed build interpreter");
+    return;
+  }
+
+  must_allocate_tensors_ = true;
+}
+
+void TfLiteDriver::ResetTensor(int id) {
+  if (!IsValid()) return;
+  auto* tensor = interpreter_->tensor(id);
+  memset(tensor->data.raw, 0, tensor->bytes);
+}
+
+void TfLiteDriver::ReshapeTensor(int id, const string& csv_values) {
+  if (!IsValid()) return;
+  if (interpreter_->ResizeInputTensor(
+          id, testing::Split<int>(csv_values, ",")) != kTfLiteOk) {
+    Invalidate("Failed to resize input tensor " + std::to_string(id));
+    return;
+  }
+  must_allocate_tensors_ = true;
+}
+
+void TfLiteDriver::SetInput(int id, const string& csv_values) {
+  if (!IsValid()) return;
+  auto* tensor = interpreter_->tensor(id);
+  switch (tensor->type) {
+    case kTfLiteFloat32: {
+      const auto& values = testing::Split<float>(csv_values, ",");
+      if (!CheckSizes<float>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    case kTfLiteUInt8: {
+      const auto& values = testing::Split<uint8_t>(csv_values, ",");
+      if (!CheckSizes<uint8_t>(tensor->bytes, values.size())) return;
+      SetTensorData(values, &tensor->data);
+      break;
+    }
+    default:
+      Invalidate("Unsupported tensor data type");
+      return;
+  }
+}
+
+void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
+  if (!IsValid()) return;
+  auto* tensor = interpreter_->tensor(id);
+  expected_output_[id].reset(new Expectation);
+  switch (tensor->type) {
+    case kTfLiteFloat32:
+      expected_output_[id]->SetData<float>(csv_values);
+      break;
+    case kTfLiteUInt8:
+      expected_output_[id]->SetData<uint8_t>(csv_values);
+      break;
+    default:
+      Invalidate("Unsupported tensor data type");
+      return;
+  }
+}
+
+void TfLiteDriver::Invoke() {
+  if (!IsValid()) return;
+  if (interpreter_->Invoke() != kTfLiteOk) {
+    Invalidate("Failed to invoke interpreter");
+  }
+}
+
+bool TfLiteDriver::CheckResults() {
+  if (!IsValid()) return false;
+  bool success = true;
+  for (const auto& p : expected_output_) {
+    int id = p.first;
+    auto* tensor = interpreter_->tensor(id);
+    if (!p.second->Check(/*verbose=*/false, *tensor)) {
+      // Do not invalidate anything here. Instead, simply output the
+      // differences and return false. Invalidating would prevent all
+      // subsequent invocations from running..
+      std::cerr << "There were errors in invocation '" << GetInvocationId()
+                << "', output tensor '" << id << "':" << std::endl;
+      p.second->Check(/*verbose=*/true, *tensor);
+      success = false;
+      SetOverallSuccess(false);
+    }
+  }
+  expected_output_.clear();
+  return success;
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.h b/tensorflow/contrib/lite/testing/tflite_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..4440d4285e948c3d1622c8de5c47ff3729c5847f
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tflite_driver.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
+
+#include <map>
+
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/testing/test_runner.h"
+
+namespace tflite {
+namespace testing {
+
+// A test runner that feeds inputs into TF Lite and verifies its outputs.
+class TfLiteDriver : public TestRunner {
+ public:
+  explicit TfLiteDriver(bool use_nnapi);
+  ~TfLiteDriver() override;
+
+  void LoadModel(const string& bin_file_path) override;
+  const std::vector<int>& GetInputs() override {
+    return interpreter_->inputs();
+  }
+  const std::vector<int>& GetOutputs() override {
+    return interpreter_->outputs();
+  }
+  void ReshapeTensor(int id, const string& csv_values) override;
+  void AllocateTensors() override;
+  void ResetTensor(int id) override;
+  void SetInput(int id, const string& csv_values) override;
+  void SetExpectation(int id, const string& csv_values) override;
+  void Invoke() override;
+  bool CheckResults() override;
+
+ private:
+  class Expectation;
+
+  bool use_nnapi_ = false;
+  std::unique_ptr<FlatBufferModel> model_;
+  std::unique_ptr<Interpreter> interpreter_;
+  std::map<int, std::unique_ptr<Expectation>> expected_output_;
+  bool must_allocate_tensors_ = true;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DRIVER_H_
diff --git a/tensorflow/contrib/lite/testing/tflite_driver_test.cc b/tensorflow/contrib/lite/testing/tflite_driver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37010c468f250fdf4ef958b23a38aa38b7a533db
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tflite_driver_test.cc
@@ -0,0 +1,61 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/tflite_driver.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+using ::testing::ElementsAre;
+
+TEST(TfliteDriverTest, SimpleTest) {
+  std::unique_ptr<TestRunner> runner(new TfLiteDriver(/*use_nnapi=*/false));
+
+  runner->SetModelBaseDir("tensorflow/contrib/lite");
+  runner->LoadModel("testdata/multi_add.bin");
+  ASSERT_TRUE(runner->IsValid());
+
+  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
+  ASSERT_THAT(runner->GetOutputs(), ElementsAre(5, 6));
+
+  for (int i : {0, 1, 2, 3}) {
+    runner->ReshapeTensor(i, "1,2,2,1");
+  }
+  ASSERT_TRUE(runner->IsValid());
+
+  runner->AllocateTensors();
+
+  runner->SetInput(0, "0.1,0.2,0.3,0.4");
+  runner->SetInput(1, "0.001,0.002,0.003,0.004");
+  runner->SetInput(2, "0.001,0.002,0.003,0.004");
+  runner->SetInput(3, "0.01,0.02,0.03,0.04");
+
+  runner->ResetTensor(2);
+
+  runner->SetExpectation(5, "0.101,0.202,0.303,0.404");
+  runner->SetExpectation(6, "0.011,0.022,0.033,0.044");
+
+  runner->Invoke();
+  ASSERT_TRUE(runner->IsValid());
+
+  ASSERT_TRUE(runner->CheckResults());
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tokenize.cc b/tensorflow/contrib/lite/testing/tokenize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e84ea475cae60b197a243953517f401f77e2e46
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tokenize.cc
@@ -0,0 +1,95 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/tokenize.h"
+#include <istream>
+#include <string>
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+void Tokenize(std::istream* input, TokenProcessor* processor) {
+  enum State { kBuildQuotedToken, kBuildToken, kIdle };
+
+  std::string current_token;
+  State state = kIdle;
+  auto start_token = [&](char c) {
+    state = kBuildToken;
+    current_token.clear();
+    current_token = c;
+  };
+  auto issue_token = [&]() {
+    state = kIdle;
+    processor->ConsumeToken(&current_token);
+    current_token.clear();
+  };
+  auto start_quoted_token = [&]() {
+    state = kBuildQuotedToken;
+    current_token.clear();
+  };
+  auto issue_quoted_token = [&]() {
+    state = kIdle;
+    processor->ConsumeToken(&current_token);
+    current_token.clear();
+  };
+  auto issue_delim = [&](char d) {
+    current_token = string(1, d);
+    processor->ConsumeToken(&current_token);
+    current_token.clear();
+  };
+  auto is_delim = [](char c) { return c == '{' || c == '}' || c == ':'; };
+  auto is_quote = [](char c) { return c == '"'; };
+
+  for (auto it = std::istreambuf_iterator<char>(*input);
+       it != std::istreambuf_iterator<char>(); ++it) {
+    switch (state) {
+      case kIdle:
+        if (is_delim(*it)) {
+          issue_delim(*it);
+        } else if (is_quote(*it)) {
+          start_quoted_token();
+        } else if (!isspace(*it)) {
+          start_token(*it);
+        }
+        break;
+      case kBuildToken:
+        if (is_delim(*it)) {
+          issue_token();
+          issue_delim(*it);
+        } else if (is_quote(*it)) {
+          issue_token();
+          start_quoted_token();
+        } else if (isspace(*it)) {
+          issue_token();
+        } else {
+          current_token += *it;
+        }
+        break;
+      case kBuildQuotedToken:
+        if (is_quote(*it)) {
+          issue_quoted_token();
+        } else {
+          current_token += *it;
+        }
+        break;
+    }
+  }
+  if (state != kIdle) {
+    issue_token();
+  }
+}
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/tokenize.h b/tensorflow/contrib/lite/testing/tokenize.h
new file mode 100644
index 0000000000000000000000000000000000000000..daccf0e84a450a0ffdf04a1eb8ff319878cfc808
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tokenize.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
+
+#include <istream>
+#include <string>
+
+namespace tflite {
+namespace testing {
+
+// Process tokens coming from Tokenize().
+class TokenProcessor {
+ public:
+  virtual ~TokenProcessor() {}
+  // Process a single token. The token won't be reused, so it is OK to call
+  // token.swap().
+  virtual void ConsumeToken(std::string* token) = 0;
+};
+
+// Tokenize a stream on whitespaces, colons and curly braces. Whitespaces are
+// removed from the tokens and double-quotes can be used to avoid that. Note
+// that there is no way to escape double-quotes, so there's no way to have a
+// double-quote inside a token.
+void Tokenize(std::istream* input, TokenProcessor* processor);
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_TOKENIZER_H_
diff --git a/tensorflow/contrib/lite/testing/tokenize_test.cc b/tensorflow/contrib/lite/testing/tokenize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80f44aacca7e90efb3a6c8967c7175eada35734b
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/tokenize_test.cc
@@ -0,0 +1,105 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/testing/tokenize.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+class TokenCollector : public TokenProcessor {
+ public:
+  void ConsumeToken(std::string* token) override { tokens_.push_back(*token); }
+  const std::vector<std::string>& Tokens() { return tokens_; }
+
+ private:
+  std::vector<std::string> tokens_;
+};
+
+std::vector<std::string> TokenizeString(const std::string& s) {
+  std::stringstream ss(s);
+  TokenCollector collector;
+  Tokenize(&ss, &collector);
+  return collector.Tokens();
+}
+
+TEST(TokenizeTest, TokenDetection) {
+  EXPECT_THAT(TokenizeString("x :1"), ElementsAre("x", ":", "1"));
+  EXPECT_THAT(TokenizeString("x:1"), ElementsAre("x", ":", "1"));
+  EXPECT_THAT(TokenizeString("x {1"), ElementsAre("x", "{", "1"));
+  EXPECT_THAT(TokenizeString("x{1"), ElementsAre("x", "{", "1"));
+  EXPECT_THAT(TokenizeString("x }1"), ElementsAre("x", "}", "1"));
+  EXPECT_THAT(TokenizeString("x}1"), ElementsAre("x", "}", "1"));
+  EXPECT_THAT(TokenizeString("x \"1"), ElementsAre("x", "1"));
+  EXPECT_THAT(TokenizeString("x\"1"), ElementsAre("x", "1"));
+}
+
+TEST(TokenizeTest, QuotedTokenDetection) {
+  EXPECT_THAT(TokenizeString("\"w:x{y}z\"1"), ElementsAre("w:x{y}z", "1"));
+  EXPECT_THAT(TokenizeString("\"w:x{y}z\"\"1\""), ElementsAre("w:x{y}z", "1"));
+}
+
+TEST(TokenizeTest, Delimiters) {
+  EXPECT_THAT(TokenizeString("}"), ElementsAre("}"));
+  EXPECT_THAT(TokenizeString("}}"), ElementsAre("}", "}"));
+  EXPECT_THAT(TokenizeString("{"), ElementsAre("{"));
+  EXPECT_THAT(TokenizeString("{{"), ElementsAre("{", "{"));
+  EXPECT_THAT(TokenizeString(":"), ElementsAre(":"));
+  EXPECT_THAT(TokenizeString("::"), ElementsAre(":", ":"));
+}
+
+TEST(TokenizeTest, CornerCases) {
+  EXPECT_THAT(TokenizeString("  i { b:a } "),
+              ElementsAre("i", "{", "b", ":", "a", "}"));
+  EXPECT_THAT(TokenizeString(" }"), ElementsAre("}"));
+  EXPECT_THAT(TokenizeString(" }  "), ElementsAre("}"));
+  EXPECT_THAT(TokenizeString(" {}  "), ElementsAre("{", "}"));
+  EXPECT_THAT(TokenizeString(" x{}  y{} "),
+              ElementsAre("x", "{", "}", "y", "{", "}"));
+  EXPECT_THAT(TokenizeString("x:1 y:2 "),
+              ElementsAre("x", ":", "1", "y", ":", "2"));
+  EXPECT_THAT(TokenizeString("x:\"1\" y:2 "),
+              ElementsAre("x", ":", "1", "y", ":", "2"));
+  EXPECT_THAT(TokenizeString("x:\"1, 2\" y:\"\" "),
+              ElementsAre("x", ":", "1, 2", "y", ":", ""));
+}
+
+TEST(TokenizeTest, NewLines) {
+  EXPECT_THAT(TokenizeString("x:\n1,\n 2 \n  y :\n3 \n"),
+              ElementsAre("x", ":", "1,", "2", "y", ":", "3"));
+}
+
+TEST(TokenizeTest, LongString) {
+  EXPECT_THAT(
+      TokenizeString("   i { b:a } input {"
+                     "a: \"1e-1, 2,3\" b:\"1,2,3\"\n c{ "
+                     "id:1 x{d{a:"
+                     "1}}} f:2 "
+                     "\n}\n t:1"),
+      ElementsAreArray({"i",  "{", "b",         ":", "a", "}",     "input", "{",
+                        "a",  ":", "1e-1, 2,3", "b", ":", "1,2,3", "c",     "{",
+                        "id", ":", "1",         "x", "{", "d",     "{",     "a",
+                        ":",  "1", "}",         "}", "}", "f",     ":",     "2",
+                        "}",  "t", ":",         "1"}));
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d4304f022187027950f58050ececae73dedffb6
--- /dev/null
+++ b/tensorflow/contrib/lite/testing/util.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
+
+namespace tflite {
+
+inline void LogToStderr() {
+#ifdef PLATFORM_GOOGLE
+  FLAGS_logtostderr = true;
+#endif
+}
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..78c036fa779cfdb72a1761fe23cbfa3e43f11182
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -0,0 +1,366 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_cc",
+    "tf_proto_library_py",
+)
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+)
+
+tf_proto_library_cc(
+    name = "types_proto",
+    srcs = ["types.proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_cc(
+    name = "toco_flags_proto",
+    srcs = ["toco_flags.proto"],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_cc(
+    name = "model_flags_proto",
+    srcs = ["model_flags.proto"],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_py(
+    name = "types_proto",
+    srcs = [
+        "types.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_py(
+    name = "toco_flags_proto",
+    srcs = [
+        "toco_flags.proto",
+    ],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_py(
+    name = "model_flags_proto",
+    srcs = [
+        "model_flags.proto",
+    ],
+    protodeps = [":types_proto"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "tensorflow_core_cc_protos_all",
+    deps = ["//tensorflow/core:protos_all_cc"],
+)
+
+cc_library(
+    name = "runtime",
+    hdrs = [
+        "runtime/common.h",
+        "runtime/types.h",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/contrib/lite/kernels/internal:reference_base",
+        "//tensorflow/contrib/lite/kernels/internal:types",
+    ],
+)
+
+# :model offers the core data structures representing a model (a.k.a. "graph")
+# for tooling purposes (not needed at inference runtime).
+# That includes the top-level Model structure, and the lower-level Operator,
+# Array, Buffer structures, etc.
+cc_library(
+    name = "model",
+    hdrs = [
+        "model.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_flags_proto_cc",
+        ":runtime",
+        ":toco_port",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "toco_graphviz_dump_options",
+    srcs = [
+        "toco_graphviz_dump_options.cc",
+    ],
+    hdrs = [
+        "toco_graphviz_dump_options.h",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "toco_cmdline_flags",
+    srcs = [
+        "toco_cmdline_flags.cc",
+    ],
+    hdrs = [
+        "toco_cmdline_flags.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_cmdline_flags",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":types_proto_cc",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "model_cmdline_flags",
+    srcs = [
+        "model_cmdline_flags.cc",
+    ],
+    hdrs = [
+        "args.h",
+        "model_cmdline_flags.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model_flags_proto_cc",
+        ":toco_graphviz_dump_options",
+        ":toco_port",
+        ":types_proto_cc",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "toco_port",
+    srcs = [
+        "toco_port.cc",
+    ],
+    hdrs = [
+        "format_port.h",
+        "toco_port.h",
+        "toco_types.h",
+    ],
+    deps = [
+        # Placeholder for internal file dependency.
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "graph_transformations",
+    srcs = [
+        "graph_transformations/convert_expanddims_to_reshape.cc",
+        "graph_transformations/convert_pure_conv_to_depthwise.cc",
+        "graph_transformations/create_im2col_arrays.cc",
+        "graph_transformations/dequantize.cc",
+        "graph_transformations/drop_fake_quant.cc",
+        "graph_transformations/drop_im2col_arrays.cc",
+        "graph_transformations/ensure_bias_vectors.cc",
+        "graph_transformations/fuse_activation_functions.cc",
+        "graph_transformations/fuse_binary_into_following_affine.cc",
+        "graph_transformations/fuse_binary_into_preceding_affine.cc",
+        "graph_transformations/graph_transformations.cc",
+        "graph_transformations/hardcode_min_max.cc",
+        "graph_transformations/identify_l2_normalization.cc",
+        "graph_transformations/identify_l2_pool.cc",
+        "graph_transformations/identify_lstm.cc",
+        "graph_transformations/identify_relu1.cc",
+        "graph_transformations/make_initial_dequantize_operator.cc",
+        "graph_transformations/propagate_array_data_types.cc",
+        "graph_transformations/propagate_fixed_sizes.cc",
+        "graph_transformations/quantize.cc",
+        "graph_transformations/read_fake_quant_min_max.cc",
+        "graph_transformations/remove_final_dequantize_op.cc",
+        "graph_transformations/remove_tensorflow_assert.cc",
+        "graph_transformations/remove_tensorflow_identity.cc",
+        "graph_transformations/remove_trivial_binary.cc",
+        "graph_transformations/remove_trivial_concatenation.cc",
+        "graph_transformations/remove_trivial_concatenation_input.cc",
+        "graph_transformations/remove_trivial_passthrough.cc",
+        "graph_transformations/remove_trivial_passthrough.h",
+        "graph_transformations/remove_trivial_quantized_activation_func.cc",
+        "graph_transformations/remove_trivial_reshape.cc",
+        "graph_transformations/remove_unused_op.cc",
+        "graph_transformations/resolve_batch_normalization.cc",
+        "graph_transformations/resolve_constant_binary.cc",
+        "graph_transformations/resolve_constant_concatenation.cc",
+        "graph_transformations/resolve_constant_fake_quant.cc",
+        "graph_transformations/resolve_constant_tensorflow_shape.cc",
+        "graph_transformations/resolve_constant_unary.cc",
+        "graph_transformations/resolve_mean_attributes.cc",
+        "graph_transformations/resolve_pad_attributes.cc",
+        "graph_transformations/resolve_reorder_axes.cc",
+        "graph_transformations/resolve_reshape_attributes.cc",
+        "graph_transformations/resolve_slice_attributes.cc",
+        "graph_transformations/resolve_strided_slice_attributes.cc",
+        "graph_transformations/resolve_tensorflow_concat.cc",
+        "graph_transformations/resolve_tensorflow_matmul.cc",
+        "graph_transformations/resolve_tensorflow_merge.cc",
+        "graph_transformations/resolve_tensorflow_squeeze.cc",
+        "graph_transformations/resolve_tensorflow_switch.cc",
+        "graph_transformations/resolve_tensorflow_tile.cc",
+        "graph_transformations/unfuse_activation_functions.cc",
+    ],
+    hdrs = [
+        "graph_transformations/graph_transformations.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_flags_proto_cc",
+        ":runtime",
+        ":toco_port",
+        ":tooling_util",
+        ":types_proto_cc",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+# :toco_tooling is the library providing the offline tooling functionality
+# exposed by the :toco command-line tool.
+cc_library(
+    name = "toco_tooling",
+    srcs = [
+        "allocate_transient_arrays.cc",
+        "export_tensorflow.cc",
+        "import_tensorflow.cc",
+        "tensorflow_util.cc",
+        "toco_tooling.cc",
+    ],
+    hdrs = [
+        "allocate_transient_arrays.h",
+        "export_tensorflow.h",
+        "import_tensorflow.h",
+        "tensorflow_util.h",
+        "toco_tooling.h",
+    ],
+    copts = select({
+        "//tensorflow:darwin": ["-DTOCO_SUPPORT_PORTABLE_PROTOS=0"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_transformations",
+        ":model",
+        ":model_flags_proto_cc",
+        ":types_proto_cc",
+        ":runtime",
+        ":toco_graphviz_dump_options",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":tooling_util",
+        "@protobuf_archive//:protobuf_headers",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/contrib/lite/toco/tensorflow_graph_matching:resolve_cluster",
+        "//tensorflow/contrib/lite/toco/tflite:export",
+        "//tensorflow/contrib/lite/toco/tflite:import",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ] + select({
+        # Placeholder for internal darwin rule.
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "tooling_util",
+    srcs = [
+        "dump_graphviz.cc",
+        "tooling_util.cc",
+    ],
+    hdrs = [
+        "dump_graphviz.h",
+        "tooling_util.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_flags_proto_cc",
+        ":runtime",
+        ":toco_flags_proto_cc",
+        ":toco_graphviz_dump_options",
+        ":toco_port",
+        ":types_proto_cc",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cc_test(
+    name = "tooling_util_test",
+    srcs = ["tooling_util_test.cc"],
+    deps = [
+        ":model",
+        ":tooling_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+# :toco is the main public command-line tool exposing the functionality
+# of the :toco_tooling library.
+tf_cc_binary(
+    name = "toco",
+    srcs = ["toco.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "toco_port_test",
+    srcs = ["toco_port_test.cc"],
+    data = [
+        "toco_port_test.cc",
+    ],
+    deps = [
+        ":toco_port",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/toco/README.md b/tensorflow/contrib/lite/toco/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..281b2ea5e4c5553ff7aa240cdef3cb9819f19b49
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/README.md
@@ -0,0 +1,26 @@
+# The TensorFlow Lite Optimizing Converter
+
+The TensorFlow Lite Optimizing Converter's most typical use is converting from the TensorFlow GraphDef to the TensorFlow Lite
+format, but it supports much more than that.
+
+## Usage documentation
+
+Usage information is given in these documents:
+
+*   [Command-line examples](g3doc/cmdline_examples.md)
+*   [Command-line reference](g3doc/cmdline_reference.md)
+*   [Python API](g3doc/python_api.md)
+
+## Design documentation
+
+Coming soon!
+
+## Where the converter fits in the TensorFlow landscape
+
+In the typical case, an application developer is using TensorFlow to design and
+train models, then uses TensorFlow's freeze_graph.py to generate a frozen
+inference graph, then uses the converter to convert that into a TensorFlow Lite flatbuffer file,
+then ships that file to client devices where the TensorFlow Lite interpreter handles them
+on-device. This is represented in the following diagram:
+
+![drawing](https://storage.googleapis.com/download.tensorflow.org/example_images/tensorflow_landscape.svg)
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62e7282d16aa9aa02d6ebe131ead569282518753
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -0,0 +1,319 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+
+// The life span of an array.
+struct ArrayLifespan {
+  // If true, the array is persistent state (as in a RNN). In that case,
+  // its allocation is permanent and the first_op, last_op members are
+  // unused. (The term 'transient' is a misnomer and we should think in
+  // terms of 'workspace' instead).
+  bool persistent = false;
+  // Index of the first op addressing that array. The array must be allocated
+  // just before executing this op.
+  std::size_t first_op = 0;
+  // Index of the last op addressing that array. We want to deallocate the array
+  // immediately after executing this op.
+  std::size_t last_op = 0;
+};
+
+bool StartsAt(const ArrayLifespan& lifespan, std::size_t op_index) {
+  return !lifespan.persistent && lifespan.first_op == op_index;
+}
+
+bool EndsAt(const ArrayLifespan& lifespan, std::size_t op_index) {
+  return !lifespan.persistent && lifespan.last_op == op_index;
+}
+
+// Helper function for ComputeArrayLifespans: updates one ArrayLifespan for
+// one array for one op.
+void UpdateArrayLifespan(
+    const string& array_name, std::size_t op_index,
+    std::unordered_map<string, ArrayLifespan>* array_lifespans) {
+  if (array_lifespans->count(array_name)) {
+    auto& lifespan = array_lifespans->at(array_name);
+    if (!lifespan.persistent) {
+      lifespan.first_op = std::min(lifespan.first_op, op_index);
+      lifespan.last_op = std::max(lifespan.last_op, op_index);
+    }
+  } else {
+    ArrayLifespan lifespan;
+    lifespan.first_op = op_index;
+    lifespan.last_op = op_index;
+    (*array_lifespans)[array_name] = lifespan;
+  }
+}
+
+// Computes the ArrayLifespan for each array.
+void ComputeArrayLifespans(
+    const Model& model,
+    std::unordered_map<string, ArrayLifespan>* array_lifespans) {
+  CHECK(array_lifespans->empty());
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    ArrayLifespan lifespan;
+    lifespan.persistent = true;
+    (*array_lifespans)[rnn_state.state_array()] = lifespan;
+  }
+  for (std::size_t op_index = 0; op_index < model.operators.size();
+       op_index++) {
+    const auto& op = model.operators[op_index];
+    for (const auto& input : op->inputs) {
+      UpdateArrayLifespan(input, op_index, array_lifespans);
+    }
+    for (const auto& output : op->outputs) {
+      UpdateArrayLifespan(output, op_index, array_lifespans);
+    }
+  }
+}
+
+inline bool operator==(const Alloc& a, const Alloc& b) {
+  CHECK(a.start != b.start || a.end == b.end);
+  return a.start == b.start;
+}
+
+// Helper to keep track of total allocation size and of currently live
+// allocations, and containing the core allocation routine.
+class Allocator {
+ public:
+  Allocator() : total_size_(0) {}
+
+  // Core allocation routine.
+  void Allocate(std::size_t size, Alloc* result) {
+    // Naive algorithm: pick the first gap between live allocations,
+    // that is wide enough for the new array.
+    std::size_t pos = 0;
+    for (const auto& a : live_allocs_) {
+      if (a.start >= pos + size) {
+        result->start = pos;
+        result->end = pos + size;
+        live_allocs_.insert(*result);
+        return;
+      }
+      pos = a.end;
+    }
+    // No sufficiently wide gap was found before an existing live allocation,
+    // so we allocate the new array at the end of the allocation space.
+    // We may then have to grow total_size_.
+    total_size_ = std::max(total_size_, pos + size);
+    result->start = pos;
+    result->end = pos + size;
+    live_allocs_.insert(*result);
+  }
+
+  void Deallocate(const Alloc& a) {
+    auto iter = std::lower_bound(live_allocs_.begin(), live_allocs_.end(), a);
+    CHECK(iter != live_allocs_.end());
+    CHECK(*iter == a);
+    live_allocs_.erase(iter);
+  }
+
+  std::size_t total_size() const { return total_size_; }
+
+ private:
+  std::size_t total_size_;
+  std::set<Alloc> live_allocs_;
+};
+
+// Returns the required transient allocation size (in bytes) for a given array,
+// or 0 if it's not a transient array.
+std::size_t TransientArraySize(const Model& model, const string& array_name,
+                               std::size_t transient_data_alignment) {
+  if (!IsAllocatableTransientArray(model, array_name)) {
+    return 0;
+  }
+  const auto& array = model.arrays.at(array_name);
+  CHECK(array->has_shape())
+      << "Array '" << array_name << "' doesn't have a shape";
+  if (array->data_type == ArrayDataType::kNone) {
+    // Catch a typical issue at the moment with RNN states
+    for (const auto& rnn_state : model.flags.rnn_states()) {
+      if (rnn_state.state_array() == array_name) {
+        LOG(FATAL)
+            << "A RNN state array, " << array_name << ", still does not "
+            << "have a known data type after all graph transformations have "
+            << "run. That's mostly a toco bug --- sorry. For now, you can "
+            << "work around this issue by adding manually_create:true in the "
+            << "--rnn_state description of this RNN state.";
+      }
+    }
+    LOG(FATAL) << "An array, " << array_name << ", still does not "
+               << "have a known data type after all graph transformations have "
+               << "run.";
+  }
+  const std::size_t elem_size = ElementSize(array->data_type);
+  const std::size_t raw_size =
+      elem_size * RequiredBufferSizeForShape(array->shape());
+  const std::size_t rounded_size =
+      RoundUpToNextMultipleOf(raw_size, transient_data_alignment);
+  return rounded_size;
+}
+
+// Allocates an array: call this for every array just before the first
+// op where it is used.
+void AllocateTransientArray(const Model& model, const string& array_name,
+                            Allocator* allocator,
+                            std::size_t transient_data_alignment) {
+  if (!IsAllocatableTransientArray(model, array_name)) {
+    return;
+  }
+  const std::size_t size =
+      TransientArraySize(model, array_name, transient_data_alignment);
+  const auto& array = model.arrays.at(array_name);
+  CHECK(!array->alloc);
+  allocator->Allocate(size, &array->GetOrCreateAlloc());
+}
+
+// Deallocates an array: call this for every array just after the last
+// op where it is used.
+void DeallocateTransientArray(const Model& model, const string& array_name,
+                              Allocator* allocator) {
+  if (!IsAllocatableTransientArray(model, array_name)) {
+    return;
+  }
+  const auto& array = model.arrays.at(array_name);
+  CHECK(!!array->alloc);
+  allocator->Deallocate(*array->alloc);
+}
+
+}  // namespace
+
+void AllocateTransientArrays(Model* model,
+                             std::size_t transient_data_alignment) {
+  // Precompute the lifespans for all arrays.
+  std::unordered_map<string, ArrayLifespan> array_lifespans;
+  ComputeArrayLifespans(*model, &array_lifespans);
+
+  // In case of variable batch, our convention will be to compute the
+  // allocations for batch==1, then let the inference code multiply all
+  // the offsets by the actual runtime batch size. Conveniently,
+  // the variable_batch and batch flags are mutually exclusive, and the default
+  // value of batch is 1, so we have nothing special to do here. Let us
+  // just guard this assumption with a CHECK:
+  bool batchless_input_shapes = true;
+  for (const auto& input_array : model->flags.input_arrays()) {
+    if (!input_array.has_shape() || input_array.shape().dims().empty() ||
+        input_array.shape().dims(0) != 1) {
+      batchless_input_shapes = false;
+      break;
+    }
+  }
+  CHECK(!model->flags.variable_batch() || batchless_input_shapes);
+
+  Allocator allocator;
+
+  // Construct a sorted map of array names, so that other layout engines can
+  // match exactly.
+  std::map<string, const Array*> ordered_arrays_map;
+  for (const auto& pair : model->arrays) {
+    ordered_arrays_map[pair.first] = pair.second.get();
+  }
+
+  // Allocate persistent arrays (like RNN states). For them, 'transient'
+  // is a misnormer, should read 'workspace'.
+  for (const auto& array_pair : ordered_arrays_map) {
+    const string& array_name = array_pair.first;
+    const auto& array_lifespan = array_lifespans.find(array_name)->second;
+    if (array_lifespan.persistent) {
+      AllocateTransientArray(*model, array_name, &allocator,
+                             transient_data_alignment);
+    }
+  }
+
+  for (std::size_t op_index = 0; op_index < model->operators.size();
+       op_index++) {
+    const auto& op = model->operators[op_index];
+    // Allocate those arrays whose lifespan starts exactly here.
+    for (const auto& input : op->inputs) {
+      if (StartsAt(array_lifespans[input], op_index)) {
+        AllocateTransientArray(*model, input, &allocator,
+                               transient_data_alignment);
+      }
+    }
+    for (const auto& output : op->outputs) {
+      if (StartsAt(array_lifespans[output], op_index)) {
+        AllocateTransientArray(*model, output, &allocator,
+                               transient_data_alignment);
+      }
+    }
+    // Deallocate those arrays whose lifespan ends exactly here.
+    for (const auto& input : op->inputs) {
+      if (EndsAt(array_lifespans[input], op_index)) {
+        DeallocateTransientArray(*model, input, &allocator);
+      }
+    }
+    for (const auto& output : op->outputs) {
+      if (EndsAt(array_lifespans[output], op_index)) {
+        DeallocateTransientArray(*model, output, &allocator);
+      }
+    }
+  }
+
+  // Just out of curiosity (not used in the actual allocation process)
+  // evaluate the optimal total allocated size.
+  // First, compute the size of persistent arrays.
+  std::size_t optimal_transient_alloc_size = 0;
+  std::size_t persistent_alloc_size = 0;
+  for (const auto& array_pair : ordered_arrays_map) {
+    const string& array_name = array_pair.first;
+    const auto& array_lifespan = array_lifespans.find(array_name)->second;
+    if (array_lifespan.persistent) {
+      persistent_alloc_size +=
+          TransientArraySize(*model, array_name, transient_data_alignment);
+    }
+  }
+  for (const auto& op : model->operators) {
+    // for each operator, compute the sum of the sizes of the array that must
+    // be live during the execution of this operator, plus the size of
+    // persistent arrays that must be live at all times.
+    std::size_t size = persistent_alloc_size;
+    for (const auto& input : op->inputs) {
+      if (!array_lifespans[input].persistent) {
+        size += TransientArraySize(*model, input, transient_data_alignment);
+      }
+    }
+    for (const auto& output : op->outputs) {
+      if (!array_lifespans[output].persistent) {
+        size += TransientArraySize(*model, output, transient_data_alignment);
+      }
+    }
+    // The optimal total size is the maximum of all operator-specific sizes.
+    optimal_transient_alloc_size = std::max(optimal_transient_alloc_size, size);
+  }
+
+  model->transient_data_size = allocator.total_size();
+  model->transient_data_alignment = transient_data_alignment;
+  CHECK_GE(model->transient_data_size, optimal_transient_alloc_size);
+  LOG(INFO) << "Total transient array allocated size: "
+            << model->transient_data_size << " bytes, "
+            << "theoretical optimal value: " << optimal_transient_alloc_size
+            << " bytes.";
+  CheckInvariants(*model);
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.h b/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
new file mode 100644
index 0000000000000000000000000000000000000000..12d0d0498f5224962f2775d4e3cb7d8e360cbe46
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.h
@@ -0,0 +1,44 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
+
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+// We align the allocated sizes to the next multiple of a cache line,
+// to get simple performance characteristics without side effects of
+// accesses to one buffer on accesses to another buffer.
+// That also takes care of data type alignment for any reasonable type
+// (no reasonable data type should have alignment greater than a cache line).
+// Here we make CPU-centric assumptions, in particular, we assume 64-byte cache
+// lines. Getting this wrong by a factor of 2x (if this ever changes) wouldn't
+// be terrible.
+// Embedded architectures may use a different value for alignment.
+constexpr std::size_t kDefaultTransientDataAlignment = 64;
+
+// Rounds up dividend to a value divisible by divisor.
+inline std::size_t RoundUpToNextMultipleOf(std::size_t dividend,
+                                           std::size_t divisor) {
+  return ((dividend + divisor - 1) / divisor) * divisor;
+}
+
+void AllocateTransientArrays(Model* model,
+                             std::size_t transient_data_alignment);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ALLOCATE_TRANSIENT_ARRAYS_H_
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
new file mode 100644
index 0000000000000000000000000000000000000000..5268902346f720be7ecd4980c696d4df8c3da173
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -0,0 +1,230 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This abstracts command line arguments in toco.
+// Arg<T> is a parseable type that can register a default value, be able to
+// parse itself, and keep track of whether it was specified.
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
+
+#include <functional>
+#include <unordered_map>
+#include <vector>
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+
+namespace toco {
+
+// Since std::vector<int32> is in the std namespace, and we are not allowed
+// to add ParseFlag/UnparseFlag to std, we introduce a simple wrapper type
+// to use as the flag type:
+struct IntList {
+  std::vector<int32> elements;
+};
+struct StringMapList {
+  std::vector<std::unordered_map<string, string>> elements;
+};
+
+// command_line_flags.h don't track whether or not a flag is specified. Arg
+// contains the value (which will be default if not specified) and also
+// whether the flag is specified.
+// TODO(aselle): consider putting doc string and ability to construct the
+// tensorflow argument into this, so declaration of parameters can be less
+// distributed.
+// Every template specialization of Arg is required to implement
+// default_value(), specified(), value(), parse(), bind().
+template <class T>
+class Arg final {
+ public:
+  explicit Arg(T default_ = T()) : value_(default_) {}
+  virtual ~Arg() {}
+
+  // Provide default_value() to arg list
+  T default_value() const { return value_; }
+  // Return true if the command line argument was specified on the command line.
+  bool specified() const { return specified_; }
+  // Const reference to parsed value.
+  const T& value() const { return value_; }
+
+  // Parsing callback for the tensorflow::Flags code
+  bool parse(T value_in) {
+    value_ = value_in;
+    specified_ = true;
+    return true;
+  }
+
+  // Bind the parse member function so tensorflow::Flags can call it.
+  std::function<bool(T)> bind() {
+    return std::bind(&Arg::parse, this, std::placeholders::_1);
+  }
+
+ private:
+  // Becomes true after parsing if the value was specified
+  bool specified_ = false;
+  // Value of the argument (initialized to the default in the constructor).
+  T value_;
+};
+
+template <>
+class Arg<toco::IntList> final {
+ public:
+  // Provide default_value() to arg list
+  string default_value() const { return ""; }
+  // Return true if the command line argument was specified on the command line.
+  bool specified() const { return specified_; }
+  // Bind the parse member function so tensorflow::Flags can call it.
+  bool parse(string text) {
+    parsed_value_.elements.clear();
+    specified_ = true;
+    // strings::Split("") produces {""}, but we need {} on empty input.
+    // TODO(aselle): Moved this from elsewhere, but ahentz recommends we could
+    // use absl::SplitLeadingDec32Values(text.c_str(), &parsed_values_.elements)
+    if (!text.empty()) {
+      int32 element;
+      for (absl::string_view part : absl::StrSplit(text, ',')) {
+        if (!SimpleAtoi(part, &element)) return false;
+        parsed_value_.elements.push_back(element);
+      }
+    }
+    return true;
+  }
+
+  std::function<bool(string)> bind() {
+    return std::bind(&Arg::parse, this, std::placeholders::_1);
+  }
+
+  const toco::IntList& value() const { return parsed_value_; }
+
+ private:
+  toco::IntList parsed_value_;
+  bool specified_ = false;
+};
+
+template <>
+class Arg<toco::StringMapList> final {
+ public:
+  // Provide default_value() to StringMapList
+  string default_value() const { return ""; }
+  // Return true if the command line argument was specified on the command line.
+  bool specified() const { return specified_; }
+  // Bind the parse member function so tensorflow::Flags can call it.
+
+  bool parse(string text) {
+    parsed_value_.elements.clear();
+    specified_ = true;
+
+    if (text.empty()) {
+      return true;
+    }
+
+#if defined(PLATFORM_GOOGLE)
+    std::vector<absl::string_view> outer_vector;
+    absl::string_view text_disposable_copy = text;
+    SplitStructuredLine(text_disposable_copy, ',', "{}", &outer_vector);
+    for (const absl::string_view& outer_member_stringpiece : outer_vector) {
+      string outer_member(outer_member_stringpiece);
+      if (outer_member.empty()) {
+        continue;
+      }
+      string outer_member_copy = outer_member;
+      absl::StripAsciiWhitespace(&outer_member);
+      if (!TryStripPrefixString(outer_member, "{", &outer_member)) return false;
+      if (!TryStripSuffixString(outer_member, "}", &outer_member)) return false;
+      const std::vector<string> inner_fields_vector =
+          strings::Split(outer_member, ',');
+
+      std::unordered_map<string, string> element;
+      for (const string& member_field : inner_fields_vector) {
+        std::vector<string> outer_member_key_value =
+            strings::Split(member_field, ':');
+        if (outer_member_key_value.size() != 2) return false;
+        string& key = outer_member_key_value[0];
+        string& value = outer_member_key_value[1];
+        absl::StripAsciiWhitespace(&key);
+        absl::StripAsciiWhitespace(&value);
+        if (element.count(key) != 0) return false;
+        element[key] = value;
+      }
+      parsed_value_.elements.push_back(element);
+    }
+    return true;
+#else
+    // TODO(aselle): Fix argument parsing when absl supports structuredline
+    fprintf(stderr, "%s:%d StringMapList arguments not supported\n", __FILE__,
+            __LINE__);
+    abort();
+#endif
+  }
+
+  std::function<bool(string)> bind() {
+    return std::bind(&Arg::parse, this, std::placeholders::_1);
+  }
+
+  const toco::StringMapList& value() const { return parsed_value_; }
+
+ private:
+  toco::StringMapList parsed_value_;
+  bool specified_ = false;
+};
+
+// Flags that describe a model. See model_cmdline_flags.cc for details.
+struct ParsedModelFlags {
+  Arg<string> input_array;
+  Arg<string> input_arrays;
+  Arg<string> output_array;
+  Arg<string> output_arrays;
+  Arg<string> input_shapes;
+  Arg<float> mean_value = Arg<float>(0.f);
+  Arg<string> mean_values;
+  Arg<float> std_value = Arg<float>(1.f);
+  Arg<string> std_values;
+  Arg<string> input_data_type;
+  Arg<string> input_data_types;
+  Arg<bool> variable_batch = Arg<bool>(false);
+  Arg<toco::IntList> input_shape;
+  Arg<toco::StringMapList> rnn_states;
+  Arg<toco::StringMapList> model_checks;
+  // Debugging output options.
+  // TODO(benoitjacob): these shouldn't be ModelFlags.
+  Arg<string> graphviz_first_array;
+  Arg<string> graphviz_last_array;
+  Arg<string> dump_graphviz;
+  Arg<bool> dump_graphviz_video = Arg<bool>(false);
+};
+
+// Flags that describe the operation you would like to do (what conversion
+// you want). See toco_cmdline_flags.cc for details.
+struct ParsedTocoFlags {
+  Arg<string> input_file;
+  Arg<string> output_file;
+  Arg<string> input_format;
+  Arg<string> output_format;
+  // TODO(aselle): command_line_flags  doesn't support doubles
+  Arg<float> default_ranges_min = Arg<float>(0.);
+  Arg<float> default_ranges_max = Arg<float>(0.);
+  Arg<string> inference_type;
+  Arg<string> inference_input_type;
+  Arg<bool> drop_fake_quant = Arg<bool>(false);
+  Arg<bool> reorder_across_fake_quant = Arg<bool>(false);
+  Arg<bool> allow_custom_ops = Arg<bool>(false);
+  // Deprecated flags
+  Arg<string> input_type;
+  Arg<string> input_types;
+  Arg<bool> drop_control_dependency = Arg<bool>(false);
+};
+
+}  // namespace toco
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_ARGS_H_
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1a7b26d91d946b9c338bd18bd3538bcb37173a6
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -0,0 +1,308 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/dump_graphviz.h"
+
+#include <memory>
+#include <set>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/strings/str_replace.h"
+#include "absl/strings/strip.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+using toco::port::AppendF;
+using toco::port::StringF;
+
+namespace toco {
+namespace {
+
+class Color {
+ public:
+  Color() {}
+  Color(uint8 r, uint8 g, uint8 b) : r_(r), g_(g), b_(b) {}
+  // Returns the string serialization of this color in graphviz format,
+  // for use as 'fillcolor' in boxes.
+  string FillColorString() const { return StringF("%.2X%.2X%.2X", r_, g_, b_); }
+  // Returns the serialization in graphviz format of a suitable color to use
+  // 'fontcolor' in the same boxes. It should black or white, whichever offers
+  // the better contrast from FillColorString().
+  string TextColorString() const {
+    // https://en.wikipedia.org/wiki/Relative_luminance
+    const float luminance = 0.2126f * r_ + 0.7152f * g_ + 0.0722f * b_;
+    const uint8 l = luminance > 128.f ? 0 : 255;
+    return StringF("%.2X%.2X%.2X", l, l, l);
+  }
+
+ private:
+  uint8 r_ = 0, g_ = 0, b_ = 0;
+};
+
+struct NodeProperties {
+  // The text to display inside the box for this node.
+  string label;
+  // The color to use for this node; will be used as 'fillcolor'
+  // for its box. See Color::FillColorString. A suitable, different
+  // color will be chosen for the 'fontcolor' for the inside text
+  // label, see Color::TextColorString.
+  Color color;
+};
+
+// All colors in this file are from:
+// https://material.io/guidelines/style/color.html
+
+Color GetColorForArray(const Model& model, const string& array_name) {
+  // Arrays involved in RNN back-edges have a different color
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    // RNN state, fed by a back-edge. Bold color.
+    if (array_name == rnn_state.state_array()) {
+      return Color(0x0F, 0x9D, 0x58);
+    }
+    // RNN back-edge source, feeding a RNN state.
+    // Light tone of the same color as RNN states.
+    if (array_name == rnn_state.back_edge_source_array()) {
+      return Color(0xB7, 0xE1, 0xCD);
+    }
+  }
+  // Constant parameter arrays have their own bold color
+  if (model.GetArray(array_name).buffer) {
+    return Color(0x42, 0x85, 0xF4);
+  }
+  // Remaining arrays are activations.
+  // We use gray colors for them because they are the majority
+  // of arrays so we want to highlight other arrays instead of them.
+  // First, we use a bolder gray for input/output arrays:
+  const auto& dump_options = *GraphVizDumpOptions::singleton();
+  if (IsInputArray(model, array_name) ||
+      array_name == dump_options.graphviz_first_array ||
+      array_name == dump_options.graphviz_last_array) {
+    return Color(0x9E, 0x9E, 0x9E);
+  }
+  for (const string& output_array : model.flags.output_arrays()) {
+    if (array_name == output_array) {
+      return Color(0x9E, 0x9E, 0x9E);
+    }
+  }
+  // Remaining arrays are intermediate activation arrays.
+  // Lighter tone of the same grey as for input/output arrays:
+  // We want these to be very discrete.
+  return Color(0xF5, 0xF5, 0xF5);
+}
+
+NodeProperties GetPropertiesForArray(const Model& model,
+                                     const string& array_name) {
+  NodeProperties node_properties;
+  node_properties.color = GetColorForArray(model, array_name);
+  node_properties.label = absl::StrReplaceAll(array_name, {{"/", "/\\n"}});
+
+  // Append array shape to the label.
+  auto& array = model.GetArray(array_name);
+
+  if (array.data_type == ArrayDataType::kFloat) {
+    AppendF(&node_properties.label, "\\nType: float");
+  } else if (array.data_type == ArrayDataType::kInt32) {
+    AppendF(&node_properties.label, "\\nType: int32");
+  } else if (array.data_type == ArrayDataType::kUint8) {
+    AppendF(&node_properties.label, "\\nType: uint8");
+  }
+
+  if (array.has_shape()) {
+    auto& array_shape = array.shape();
+    node_properties.label += "\\n[";
+    for (int id = 0; id < array_shape.dimensions_count(); id++) {
+      if (id == 0) {
+        AppendF(&node_properties.label, "%d", array_shape.dims(id));
+      } else {
+        AppendF(&node_properties.label, "x%d", array_shape.dims(id));
+      }
+    }
+    node_properties.label += "]";
+  }
+
+  if (array.minmax) {
+    AppendF(&node_properties.label, "\\nMinMax: [%.3g, %.3g]",
+            array.minmax->min, array.minmax->max);
+  }
+
+  if (array.quantization_params) {
+    AppendF(&node_properties.label, "\\nQuantization: %.3g * (x - %d)",
+            array.quantization_params->scale,
+            array.quantization_params->zero_point);
+  }
+
+  if (array.alloc) {
+    AppendF(&node_properties.label, "\\nTransient Alloc: [%d, %d)",
+            array.alloc->start, array.alloc->end);
+  }
+
+  return node_properties;
+}
+
+NodeProperties GetPropertiesForOperator(const Operator& op) {
+  NodeProperties node_properties;
+  if (op.type == OperatorType::kTensorFlowUnsupported) {
+    node_properties.label =
+        static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
+  } else {
+    node_properties.label =
+        string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
+  }
+  switch (op.fused_activation_function) {
+    case FusedActivationFunctionType::kRelu:
+      AppendF(&node_properties.label, "\\nReLU");
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      AppendF(&node_properties.label, "\\nReLU6");
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      AppendF(&node_properties.label, "\\nReLU1");
+      break;
+    default:
+      break;
+  }
+  // Additional information for some of the operators.
+  switch (op.type) {
+    case OperatorType::kConv: {
+      const auto& conv_op = static_cast<const ConvOperator&>(op);
+      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
+              conv_op.stride_height,
+              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
+      break;
+    }
+    case OperatorType::kDepthwiseConv: {
+      const auto& conv_op = static_cast<const DepthwiseConvOperator&>(op);
+      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
+              conv_op.stride_height,
+              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
+      break;
+    }
+    case OperatorType::kFullyConnected: {
+      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      break;
+    }
+    default:
+      node_properties.color = Color(0xDB, 0x44, 0x37);
+      break;
+  }
+
+  return node_properties;
+}
+
+std::vector<const Operator*> OperatorsToDump(const Model& model) {
+  const auto& dump_options = *GraphVizDumpOptions::singleton();
+  bool first_specified = !dump_options.graphviz_first_array.empty();
+  bool last_specified = !dump_options.graphviz_last_array.empty();
+  CHECK_EQ(first_specified, last_specified);
+  std::vector<const Operator*> ops_to_dump;
+  if (last_specified) {
+    // Return only the part of the graph between graphviz_first_array
+    // and graphviz_last_array.
+    CHECK(model.arrays.count(dump_options.graphviz_first_array));
+    CHECK(model.arrays.count(dump_options.graphviz_last_array));
+    std::unordered_set<string> arrays_already_produced;
+    std::vector<string> arrays_to_produce;
+    arrays_to_produce.push_back(dump_options.graphviz_last_array);
+    while (!arrays_to_produce.empty()) {
+      const string array = arrays_to_produce.back();
+      arrays_to_produce.pop_back();
+      CHECK(!arrays_already_produced.count(array));
+      arrays_already_produced.insert(array);
+      const Operator* op = GetOpWithOutput(model, array);
+      if (!op) {
+        continue;
+      }
+      ops_to_dump.push_back(op);
+      for (const string& input : op->inputs) {
+        if (arrays_already_produced.count(input) ||
+            input == dump_options.graphviz_first_array) {
+          continue;
+        }
+        arrays_to_produce.push_back(input);
+      }
+    }
+  } else {
+    // Return the whole graph.
+    for (const auto& op : model.operators) {
+      ops_to_dump.push_back(op.get());
+    }
+  }
+  return ops_to_dump;
+}
+
+}  // namespace
+
+void DumpGraphviz(const Model& model, string* output_file_contents) {
+  AppendF(output_file_contents, "digraph Computegraph {\n");
+
+  constexpr char kNodeFormat[] =
+      "\t \"%s\" [label=\"%s\", shape=%s, style=filled, fillcolor=\"#%s\", "
+      "fontcolor = \"#%sDD\"];\n";
+
+  constexpr char kEdgeFormat[] = "\t \"%s\" -> \"%s\";\n";
+
+  constexpr char kRNNBackEdgeFormat[] =
+      "\t \"%s\" -> \"%s\" [color=\"#0F9D58\"];\n";
+
+  std::vector<const Operator*> ops_to_dump = OperatorsToDump(model);
+  std::set<string> already_added_arrays;
+  for (int op_index = 0; op_index < ops_to_dump.size(); op_index++) {
+    const Operator& op = *ops_to_dump[op_index];
+    // Add node for operator.
+    auto op_properties = GetPropertiesForOperator(op);
+    string operator_id = StringF("op%05d", op_index);
+    AppendF(output_file_contents, kNodeFormat, operator_id, op_properties.label,
+            "box", op_properties.color.FillColorString().c_str(),
+            op_properties.color.TextColorString().c_str());
+    // Add nodes and edges for all inputs of the operator.
+    for (const auto& input : op.inputs) {
+      auto array_properties = GetPropertiesForArray(model, input);
+      if (!already_added_arrays.count(input)) {
+        AppendF(output_file_contents, kNodeFormat, input,
+                array_properties.label, "octagon",
+                array_properties.color.FillColorString().c_str(),
+                array_properties.color.TextColorString().c_str());
+      }
+      AppendF(output_file_contents, kEdgeFormat, input, operator_id);
+      already_added_arrays.insert(input);
+    }
+    // Add nodes and edges for all outputs of the operator.
+    for (const auto& output : op.outputs) {
+      auto array_properties = GetPropertiesForArray(model, output);
+      if (!already_added_arrays.count(output)) {
+        AppendF(output_file_contents, kNodeFormat, output,
+                array_properties.label, "octagon",
+                array_properties.color.FillColorString().c_str(),
+                array_properties.color.TextColorString().c_str());
+      }
+      AppendF(output_file_contents, kEdgeFormat, operator_id, output);
+      already_added_arrays.insert(output);
+    }
+  }
+
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    AppendF(output_file_contents, kRNNBackEdgeFormat,
+            rnn_state.back_edge_source_array(), rnn_state.state_array());
+  }
+
+  AppendF(output_file_contents, "}\n");
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.h b/tensorflow/contrib/lite/toco/dump_graphviz.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fb28e3de844b123a60e36bc23c7d2add8189962
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.h
@@ -0,0 +1,28 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
+
+#include <string>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+void DumpGraphviz(const Model& model, string* output_file_contents);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_DUMP_GRAPHVIZ_H_
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e18cf46c69badf4b7584f723a4ba39f2e0d8dd1d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -0,0 +1,1598 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "google/protobuf/map.h"
+#include "google/protobuf/text_format.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_util.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/logging.h"
+
+using tensorflow::DT_BOOL;
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT32;
+using tensorflow::DT_INT64;
+using tensorflow::DT_UINT8;
+using tensorflow::GraphDef;
+using tensorflow::TensorProto;
+
+namespace toco {
+namespace {
+
+// TensorFlow sometimes forbids what it calls "legacy scalars",
+// which are 1-D shapes where the unique shape size is 1.
+// See OpKernel::IsLegacyScalar and OpKernel::allow_legacy_scalars.
+// For that reason, we generally avoid creating legacy scalars,
+// by detecting the case where a 1-D shape would be of size 1 and
+// replacing that by a 0-D shape.
+// However, there is a special circumstance where we must not do that
+// and must unconditionally create a 1-D shape even if it is going to
+// be of size 1: that is the case of bias vectors, with BiasAdd nodes.
+// Indeed, TensorFlow requires bias vectors to be 1-D; in the case of
+// a depth of 1, that would be a legacy scalar, so in that case we
+// must go ahead and keep the shape 1-D, letting it be a legacy scalar.
+enum class LegacyScalarPolicy { kAvoidLegacyScalars, kDoCreateLegacyScalars };
+
+void ExportFloatArray(const Shape& input_shape, const float* input_data,
+                      TensorProto* output_tensor,
+                      LegacyScalarPolicy legacy_scalar_policy) {
+  output_tensor->set_dtype(DT_FLOAT);
+  const int input_flat_size = RequiredBufferSizeForShape(input_shape);
+  auto* shape = output_tensor->mutable_tensor_shape();
+
+  const int kDims = input_shape.dimensions_count();
+  if (legacy_scalar_policy == LegacyScalarPolicy::kDoCreateLegacyScalars ||
+      kDims > 1 || (kDims == 1 && input_shape.dims(0) > 1)) {
+    for (int i = 0; i < kDims; ++i) {
+      shape->add_dim()->set_size(input_shape.dims(i));
+    }
+  }
+  output_tensor->set_tensor_content(
+      string(reinterpret_cast<const char*>(input_data),
+             sizeof(*input_data) * input_flat_size));
+}
+
+void ExportFloatArray(AxesOrder input_axes_order, const Shape& input_shape,
+                      const float* input_data, AxesOrder output_axes_order,
+                      TensorProto* output_tensor,
+                      LegacyScalarPolicy legacy_scalar_policy) {
+  CHECK_EQ(AxesCount(output_axes_order), AxesCount(input_axes_order));
+  output_tensor->set_dtype(DT_FLOAT);
+  CHECK_EQ(input_shape.dimensions_count(), AxesCount(input_axes_order));
+  const int input_flat_size = RequiredBufferSizeForShape(input_shape);
+
+  Shape shuffled_shape;
+  ShuffleDims(input_shape, input_axes_order, output_axes_order,
+              &shuffled_shape);
+  std::vector<float> shuffled_data(input_flat_size);
+  ShuffleArray(input_shape, input_axes_order, output_axes_order, shuffled_shape,
+               input_data, shuffled_data.data());
+
+  ExportFloatArray(shuffled_shape, shuffled_data.data(), output_tensor,
+                   legacy_scalar_policy);
+}
+
+bool HasAlreadyExportedConst(const string& name,
+                             const GraphDef& tensorflow_graph) {
+  for (const auto& node : tensorflow_graph.node()) {
+    if (node.op() == "Const" && node.name() == name) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void ConvertFloatTensorConst(const string& name, const Shape& input_shape,
+                             const float* input_data,
+                             AxesOrder input_axes_order,
+                             AxesOrder output_axes_order,
+                             GraphDef* tensorflow_graph,
+                             LegacyScalarPolicy legacy_scalar_policy) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  ExportFloatArray(input_axes_order, input_shape, input_data, output_axes_order,
+                   tensor, legacy_scalar_policy);
+}
+
+void ConvertFloatTensorConst(const string& name, const Shape& input_shape,
+                             const float* input_data,
+                             AxesOrder input_axes_order,
+                             AxesOrder output_axes_order,
+                             GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  ExportFloatArray(input_axes_order, input_shape, input_data, output_axes_order,
+                   tensor, LegacyScalarPolicy::kAvoidLegacyScalars);
+}
+
+void ConvertFloatTensorConst(const Model& model, const string& name,
+                             AxesOrder input_axes_order,
+                             AxesOrder output_axes_order,
+                             GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  CHECK(model.arrays.count(name));
+  const auto& input_array = *model.arrays.at(name);
+  const auto& input_shape = input_array.shape();
+  CHECK(input_array.buffer);
+  CHECK(input_array.buffer->type == ArrayDataType::kFloat);
+  const float* input_data =
+      input_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+  ExportFloatArray(input_axes_order, input_shape, input_data, output_axes_order,
+                   tensor, LegacyScalarPolicy::kAvoidLegacyScalars);
+}
+
+void ConvertFloatTensorConst(const Model& model, const string& name,
+                             GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  CHECK(model.arrays.count(name));
+  const auto& input_array = *model.arrays.at(name);
+  const auto& input_shape = input_array.shape();
+  CHECK(input_array.buffer);
+  CHECK(input_array.buffer->type == ArrayDataType::kFloat);
+  const float* input_data =
+      input_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+  ExportFloatArray(input_shape, input_data, tensor,
+                   LegacyScalarPolicy::kAvoidLegacyScalars);
+}
+
+void ConvertIntTensorConst(const Model& model, const string& name,
+                           GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  CHECK(model.arrays.count(name));
+  const auto& array = *model.arrays.at(name);
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+  const auto& data = array.GetBuffer<ArrayDataType::kInt32>().data;
+  for (auto index : data) {
+    tensor->add_int_val(index);
+  }
+  const auto& array_shape = array.shape();
+  auto* shape = tensor->mutable_tensor_shape();
+  for (int i = 0; i < array_shape.dimensions_count(); i++) {
+    shape->add_dim()->set_size(array_shape.dims(i));
+  }
+}
+
+void CreateMatrixShapeTensorConst(const string& name, int rows, int cols,
+                                  GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+  const int32 data[2] = {cols, rows};
+  tensor->set_tensor_content(
+      string(reinterpret_cast<const char*>(data), sizeof(data)));
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(2);
+}
+
+void CreateDummyConcatDimTensorConst(const string& name, int dim,
+                                     GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+  tensor->add_int_val(dim);
+}
+
+void CreateReshapeShapeTensorConst(const string& name,
+                                   const std::vector<int32>& shape,
+                                   GraphDef* tensorflow_graph) {
+  if (HasAlreadyExportedConst(name, *tensorflow_graph)) {
+    return;
+  }
+  auto* const_op = tensorflow_graph->add_node();
+  const_op->set_op("Const");
+  const_op->set_name(name);
+  (*const_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*const_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+  for (auto s : shape) {
+    tensor->add_int_val(s);
+  }
+  // TensorFlow sometimes forbids what it calls "legacy scalars",
+  // which are shapes of size 1 where the unique shape size is 1.
+  // See OpKernel::IsLegacyScalar and OpKernel::allow_legacy_scalars.
+  if (shape.size() > 1) {
+    auto* tensor_shape = tensor->mutable_tensor_shape();
+    tensor_shape->add_dim()->set_size(shape.size());
+  }
+}
+
+string WalkUpToConstantArray(const Model& model, const string& name) {
+  const Array& original_array = model.GetArray(name);
+  if (original_array.buffer) {
+    return name;
+  }
+  const auto* op = GetOpWithOutput(model, name);
+  CHECK(op);
+  CHECK(op->type == OperatorType::kFakeQuant);
+  const string& input_of_fakequant_name = op->inputs[0];
+  const Array& input_of_fakequant = model.GetArray(input_of_fakequant_name);
+  CHECK(input_of_fakequant.buffer);
+  return input_of_fakequant_name;
+}
+
+void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  const bool has_bias = src_op.inputs.size() >= 3;
+  string conv_output = src_op.outputs[0];
+  if (has_bias) {
+    conv_output += "/conv";
+  }
+
+  auto* conv2d_op = tensorflow_graph->add_node();
+  conv2d_op->set_op("Conv2D");
+  conv2d_op->set_name(conv_output);
+  *conv2d_op->add_input() = src_op.inputs[0];
+  *conv2d_op->add_input() = src_op.inputs[1];
+  (*conv2d_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  const string& weights_array_name =
+      WalkUpToConstantArray(model, src_op.inputs[1]);
+  const auto& weights_array = model.GetArray(weights_array_name);
+  CHECK(weights_array.buffer->type == ArrayDataType::kFloat);
+  ConvertFloatTensorConst(model, weights_array_name, AxesOrder::kOHWI,
+                          AxesOrder::kHWIO, tensorflow_graph);
+  auto& strides = (*conv2d_op->mutable_attr())["strides"];
+  strides.mutable_list()->add_i(1);
+  strides.mutable_list()->add_i(src_op.stride_height);
+  strides.mutable_list()->add_i(src_op.stride_width);
+  strides.mutable_list()->add_i(1);
+  string padding;
+  if (src_op.padding.type == PaddingType::kSame) {
+    padding = "SAME";
+  } else if (src_op.padding.type == PaddingType::kValid) {
+    padding = "VALID";
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  (*conv2d_op->mutable_attr())["padding"].set_s(padding);
+
+  if (has_bias) {
+    auto* biasadd_op = tensorflow_graph->add_node();
+    biasadd_op->set_op("BiasAdd");
+    biasadd_op->set_name(src_op.outputs[0]);
+    biasadd_op->add_input(conv_output);
+    biasadd_op->add_input(src_op.inputs[2]);
+    (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
+    CHECK(model.arrays.count(src_op.inputs[2]));
+    const string& bias_array_name =
+        WalkUpToConstantArray(model, src_op.inputs[2]);
+    const auto& bias_array = model.GetArray(bias_array_name);
+    // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
+    Shape bias_shape_1d = bias_array.shape();
+    UnextendShape(&bias_shape_1d, 1);
+    CHECK(bias_array.buffer->type == ArrayDataType::kFloat);
+    const float* bias_data =
+        bias_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+    ConvertFloatTensorConst(bias_array_name, bias_shape_1d, bias_data,
+                            AxesOrder::kOneAxis, AxesOrder::kOneAxis,
+                            tensorflow_graph,
+                            LegacyScalarPolicy::kDoCreateLegacyScalars);
+  }
+}
+
+void ConvertDepthwiseConvOperator(const Model& model,
+                                  const DepthwiseConvOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  const bool has_bias = src_op.inputs.size() >= 3;
+  string conv_output = src_op.outputs[0];
+  if (has_bias) {
+    conv_output += "/conv";
+  }
+
+  auto* dc2d_op = tensorflow_graph->add_node();
+  dc2d_op->set_op("DepthwiseConv2dNative");
+  dc2d_op->set_name(conv_output);
+  *dc2d_op->add_input() = src_op.inputs[0];
+  *dc2d_op->add_input() = src_op.inputs[1];
+  (*dc2d_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  // Our internal DepthwiseConv weights are 1 x H x W x OutputDepth.
+  // We need to convert that to H x W x InputDepth x Multiplier.
+  // That's only a matter of constructing a Dims object; the actual
+  // array layout is the same.
+  CHECK(model.arrays.count(src_op.inputs[1]));
+  const string& src_weights_name =
+      WalkUpToConstantArray(model, src_op.inputs[1]);
+  const auto& src_weights_array = model.GetArray(src_weights_name);
+  const auto& src_weights_shape = src_weights_array.shape();
+  CHECK_EQ(src_weights_shape.dimensions_count(), 4);
+  const Shape dst_weights_shape =
+      Shape({src_weights_shape.dims(1), src_weights_shape.dims(2),
+             src_weights_shape.dims(3) / src_op.depth_multiplier,
+             src_op.depth_multiplier});
+  CHECK_EQ(src_weights_shape.dims(3) % src_op.depth_multiplier, 0);
+  CHECK(dst_weights_shape.dims(2) * dst_weights_shape.dims(3) ==
+        src_weights_shape.dims(3));
+  CHECK_EQ(src_weights_shape.dims(0), 1);
+
+  CHECK(src_weights_array.buffer->type == ArrayDataType::kFloat);
+  const float* src_weights_data =
+      src_weights_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+  ConvertFloatTensorConst(src_weights_name, dst_weights_shape, src_weights_data,
+                          AxesOrder::kHWIM, AxesOrder::kHWIM, tensorflow_graph);
+
+  auto& strides = (*dc2d_op->mutable_attr())["strides"];
+  strides.mutable_list()->add_i(1);
+  strides.mutable_list()->add_i(src_op.stride_height);
+  strides.mutable_list()->add_i(src_op.stride_width);
+  strides.mutable_list()->add_i(1);
+  string padding;
+  if (src_op.padding.type == PaddingType::kSame) {
+    padding = "SAME";
+  } else if (src_op.padding.type == PaddingType::kValid) {
+    padding = "VALID";
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  (*dc2d_op->mutable_attr())["padding"].set_s(padding);
+
+  if (has_bias) {
+    auto* biasadd_op = tensorflow_graph->add_node();
+    biasadd_op->set_op("BiasAdd");
+    biasadd_op->set_name(src_op.outputs[0]);
+    biasadd_op->add_input(conv_output);
+    biasadd_op->add_input(src_op.inputs[2]);
+    (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
+    CHECK(model.arrays.count(src_op.inputs[2]));
+    const string& bias_name = WalkUpToConstantArray(model, src_op.inputs[2]);
+    const auto& bias_array = model.GetArray(bias_name);
+    // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
+    Shape bias_shape_1d = bias_array.shape();
+    UnextendShape(&bias_shape_1d, 1);
+    CHECK(bias_array.buffer->type == ArrayDataType::kFloat);
+    const float* bias_data =
+        bias_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+    ConvertFloatTensorConst(bias_name, bias_shape_1d, bias_data,
+                            AxesOrder::kOneAxis, AxesOrder::kOneAxis,
+                            tensorflow_graph,
+                            LegacyScalarPolicy::kDoCreateLegacyScalars);
+  }
+}
+
+void ConvertDepthToSpaceOperator(const Model& model,
+                                 const DepthToSpaceOperator& src_op,
+                                 GraphDef* tensorflow_graph) {
+  auto* op = tensorflow_graph->add_node();
+  op->set_op("DepthToSpace");
+  op->set_name(src_op.outputs[0]);
+  *op->add_input() = src_op.inputs[0];
+  (*op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*op->mutable_attr())["block_size"].set_i(src_op.block_size);
+}
+
+void ConvertSpaceToDepthOperator(const Model& model,
+                                 const SpaceToDepthOperator& src_op,
+                                 GraphDef* tensorflow_graph) {
+  auto* op = tensorflow_graph->add_node();
+  op->set_op("SpaceToDepth");
+  op->set_name(src_op.outputs[0]);
+  *op->add_input() = src_op.inputs[0];
+  (*op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*op->mutable_attr())["block_size"].set_i(src_op.block_size);
+}
+
+void ConvertFullyConnectedOperator(const Model& model,
+                                   const FullyConnectedOperator& src_op,
+                                   GraphDef* tensorflow_graph) {
+  const string reshape_output = src_op.outputs[0] + "/reshape";
+  const string reshape_shape = src_op.outputs[0] + "/reshape/shape";
+  auto* reshape_op = tensorflow_graph->add_node();
+  reshape_op->set_op("Reshape");
+  reshape_op->set_name(reshape_output);
+  reshape_op->add_input(src_op.inputs[0]);
+  reshape_op->add_input(reshape_shape);
+  (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const bool has_bias = src_op.inputs.size() >= 3;
+  string matmul_output = src_op.outputs[0];
+  if (has_bias) {
+    matmul_output += "/matmul";
+  }
+
+  auto* matmul_op = tensorflow_graph->add_node();
+  matmul_op->set_op("MatMul");
+
+  matmul_op->set_name(matmul_output);
+  *matmul_op->add_input() = reshape_output;
+  *matmul_op->add_input() = src_op.inputs[1];
+  (*matmul_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*matmul_op->mutable_attr())["transpose_a"].set_b(false);
+  (*matmul_op->mutable_attr())["transpose_b"].set_b(false);
+  CHECK(model.arrays.count(src_op.inputs[1]));
+  const string& fc_weights_name =
+      WalkUpToConstantArray(model, src_op.inputs[1]);
+  const auto& fc_weights_array = *model.arrays.at(fc_weights_name);
+  const auto& fc_weights_shape = fc_weights_array.shape();
+  CHECK_EQ(fc_weights_shape.dimensions_count(), 2);
+  CreateMatrixShapeTensorConst(reshape_shape, fc_weights_shape.dims(1), -1,
+                               tensorflow_graph);
+
+  CHECK(fc_weights_array.buffer);
+  CHECK(fc_weights_array.buffer->type == ArrayDataType::kFloat);
+  const float* fc_weights_data =
+      fc_weights_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+  ConvertFloatTensorConst(fc_weights_name, fc_weights_shape, fc_weights_data,
+                          AxesOrder::kCR, AxesOrder::kRC, tensorflow_graph);
+
+  if (has_bias) {
+    auto* biasadd_op = tensorflow_graph->add_node();
+    biasadd_op->set_op("BiasAdd");
+    biasadd_op->set_name(src_op.outputs[0]);
+    biasadd_op->add_input(matmul_output);
+    biasadd_op->add_input(src_op.inputs[2]);
+    (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
+    CHECK(model.arrays.count(src_op.inputs[2]));
+    const auto& bias_array = *model.arrays.at(src_op.inputs[2]);
+    // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
+    Shape bias_shape_1d = bias_array.shape();
+    UnextendShape(&bias_shape_1d, 1);
+    CHECK(bias_array.buffer);
+    CHECK(bias_array.buffer->type == ArrayDataType::kFloat);
+    const float* bias_data =
+        bias_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+    ConvertFloatTensorConst(WalkUpToConstantArray(model, src_op.inputs[2]),
+                            bias_shape_1d, bias_data, AxesOrder::kOneAxis,
+                            AxesOrder::kOneAxis, tensorflow_graph,
+                            LegacyScalarPolicy::kDoCreateLegacyScalars);
+  }
+}
+
+void ConvertAddOperator(const Model& model, const AddOperator& src_op,
+                        GraphDef* tensorflow_graph) {
+  auto* add_op = tensorflow_graph->add_node();
+  add_op->set_op("Add");
+  add_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *add_op->add_input() = src_op.inputs[0];
+  *add_op->add_input() = src_op.inputs[1];
+  (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertMulOperator(const Model& model, const MulOperator& src_op,
+                        GraphDef* tensorflow_graph) {
+  auto* add_op = tensorflow_graph->add_node();
+  add_op->set_op("Mul");
+  add_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *add_op->add_input() = src_op.inputs[0];
+  *add_op->add_input() = src_op.inputs[1];
+  (*add_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertReluOperator(const ReluOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* relu_op = tensorflow_graph->add_node();
+  relu_op->set_op("Relu");
+  relu_op->set_name(src_op.outputs[0]);
+  *relu_op->add_input() = src_op.inputs[0];
+  (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertRelu1Operator(const Relu1Operator& src_op,
+                          GraphDef* tensorflow_graph) {
+  const string max_bounds = src_op.outputs[0] + "/max_bounds";
+  const string min_bounds = src_op.outputs[0] + "/min_bounds";
+  const string max_output = src_op.outputs[0] + "/max_output";
+
+  auto* max_bounds_const_op = tensorflow_graph->add_node();
+  max_bounds_const_op->set_op("Const");
+  max_bounds_const_op->set_name(max_bounds);
+  (*max_bounds_const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  auto* max_bounds_const_op_tensor =
+      (*max_bounds_const_op->mutable_attr())["value"].mutable_tensor();
+  max_bounds_const_op_tensor->set_dtype(DT_FLOAT);
+  max_bounds_const_op_tensor->add_float_val(-1.0f);
+
+  auto* min_bounds_const_op = tensorflow_graph->add_node();
+  min_bounds_const_op->set_op("Const");
+  min_bounds_const_op->set_name(min_bounds);
+  (*min_bounds_const_op->mutable_attr())["dtype"].set_type(DT_FLOAT);
+  auto* min_bounds_const_op_tensor =
+      (*min_bounds_const_op->mutable_attr())["value"].mutable_tensor();
+  min_bounds_const_op_tensor->set_dtype(DT_FLOAT);
+  min_bounds_const_op_tensor->add_float_val(1.0f);
+
+  auto* max_op = tensorflow_graph->add_node();
+  max_op->set_op("Maximum");
+  max_op->set_name(max_output);
+  *max_op->add_input() = src_op.inputs[0];
+  *max_op->add_input() = max_bounds;
+  (*max_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  auto* min_op = tensorflow_graph->add_node();
+  min_op->set_op("Minimum");
+  min_op->set_name(src_op.outputs[0]);
+  *min_op->add_input() = max_output;
+  *min_op->add_input() = min_bounds;
+  (*min_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertRelu6Operator(const Relu6Operator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* relu_op = tensorflow_graph->add_node();
+  relu_op->set_op("Relu6");
+  relu_op->set_name(src_op.outputs[0]);
+  *relu_op->add_input() = src_op.inputs[0];
+  (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertLogisticOperator(const LogisticOperator& src_op,
+                             GraphDef* tensorflow_graph) {
+  auto* relu_op = tensorflow_graph->add_node();
+  relu_op->set_op("Sigmoid");
+  relu_op->set_name(src_op.outputs[0]);
+  *relu_op->add_input() = src_op.inputs[0];
+  (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertTanhOperator(const TanhOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* tanh_op = tensorflow_graph->add_node();
+  tanh_op->set_op("Tanh");
+  tanh_op->set_name(src_op.outputs[0]);
+  *tanh_op->add_input() = src_op.inputs[0];
+  (*tanh_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertSoftmaxOperator(const Model& model, const SoftmaxOperator& src_op,
+                            GraphDef* tensorflow_graph) {
+  string softmax_input;
+  Operator* providing_op = GetOpWithOutput(model, src_op.inputs[0]);
+  if (providing_op->type == OperatorType::kTensorFlowReshape) {
+    softmax_input = src_op.inputs[0];
+  } else {
+    // Insert a reshape operator that reduces the dimensions down to the 2 that
+    // are required for TensorFlow Logits.
+    const string reshape_output = src_op.outputs[0] + "/softmax_insert_reshape";
+    const string softmax_size = src_op.outputs[0] + "/softmax_insert_size";
+    softmax_input = reshape_output;
+
+    auto* reshape_op = tensorflow_graph->add_node();
+    reshape_op->set_op("Reshape");
+    reshape_op->set_name(reshape_output);
+    *reshape_op->add_input() = src_op.inputs[0];
+    *reshape_op->add_input() = softmax_size;
+    (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+    const auto& input_shape = model.arrays.at(src_op.inputs[0])->shape();
+    int32 flattened_size = 1;
+    for (int i = 0; i < input_shape.dimensions_count() - 1; ++i) {
+      flattened_size *= input_shape.dims(i);
+    }
+    const std::vector<int32> shape_data = {
+        flattened_size, input_shape.dims(input_shape.dimensions_count() - 1)};
+    CreateReshapeShapeTensorConst(softmax_size, shape_data, tensorflow_graph);
+  }
+
+  auto* softmax_op = tensorflow_graph->add_node();
+  softmax_op->set_op("Softmax");
+  softmax_op->set_name(src_op.outputs[0]);
+  *softmax_op->add_input() = softmax_input;
+  // TensorFlow's Softmax doesn't seem to admit a 'beta' parameter
+  CHECK_EQ(src_op.beta, 1.f);
+  (*softmax_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertL2NormalizationOperator(const L2NormalizationOperator& src_op,
+                                    GraphDef* tensorflow_graph) {
+  const string square_output = src_op.outputs[0] + "/square";
+  const string sum_reduction_indices = src_op.outputs[0] + "/reduction_indices";
+  const string sum_output = src_op.outputs[0] + "/sum";
+  const string rsqrt_output = src_op.outputs[0] + "/rsqrt";
+  const string rsqrt_tiled_output = src_op.outputs[0] + "/rsqrt_tiled";
+
+  auto* sum_reduction_indices_op = tensorflow_graph->add_node();
+  sum_reduction_indices_op->set_op("Const");
+  sum_reduction_indices_op->set_name(sum_reduction_indices);
+  (*sum_reduction_indices_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* sum_reduction_indices_tensor =
+      (*sum_reduction_indices_op->mutable_attr())["value"].mutable_tensor();
+  sum_reduction_indices_tensor->set_dtype(DT_INT32);
+  auto* sum_reduction_indices_shape =
+      sum_reduction_indices_tensor->mutable_tensor_shape();
+  auto* sum_reduction_indices_dim = sum_reduction_indices_shape->add_dim();
+  sum_reduction_indices_dim->set_size(2);
+  sum_reduction_indices_tensor->add_int_val(0);
+  sum_reduction_indices_tensor->add_int_val(1);
+
+  auto* square_op = tensorflow_graph->add_node();
+  square_op->set_op("Square");
+  square_op->set_name(square_output);
+  *square_op->add_input() = src_op.inputs[0];
+  (*square_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  auto* sum_op = tensorflow_graph->add_node();
+  sum_op->set_op("Sum");
+  sum_op->set_name(sum_output);
+  *sum_op->add_input() = square_output;
+  *sum_op->add_input() = sum_reduction_indices;
+  (*sum_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  auto* rsqrt_op = tensorflow_graph->add_node();
+  rsqrt_op->set_op("Rsqrt");
+  rsqrt_op->set_name(rsqrt_output);
+  *rsqrt_op->add_input() = sum_output;
+  (*rsqrt_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  auto* mul_op = tensorflow_graph->add_node();
+  mul_op->set_op("Mul");
+  mul_op->set_name(src_op.outputs[0]);
+  *mul_op->add_input() = src_op.inputs[0];
+  *mul_op->add_input() = rsqrt_output;
+  (*mul_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertLocalResponseNormalizationOperator(
+    const LocalResponseNormalizationOperator& src_op,
+    GraphDef* tensorflow_graph) {
+  auto* lrn_op = tensorflow_graph->add_node();
+  lrn_op->set_op("LRN");
+  lrn_op->set_name(src_op.outputs[0]);
+  *lrn_op->add_input() = src_op.inputs[0];
+  (*lrn_op->mutable_attr())["depth_radius"].set_i(src_op.range);
+  (*lrn_op->mutable_attr())["bias"].set_f(src_op.bias);
+  (*lrn_op->mutable_attr())["alpha"].set_f(src_op.alpha);
+  (*lrn_op->mutable_attr())["beta"].set_f(src_op.beta);
+}
+
+void ConvertFakeQuantOperator(const FakeQuantOperator& src_op,
+                              GraphDef* tensorflow_graph) {
+  auto* fakequant_op = tensorflow_graph->add_node();
+  fakequant_op->set_op("FakeQuantWithMinMaxArgs");
+  fakequant_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *fakequant_op->add_input() = src_op.inputs[0];
+  CHECK(src_op.minmax);
+  (*fakequant_op->mutable_attr())["min"].set_f(src_op.minmax->min);
+  (*fakequant_op->mutable_attr())["max"].set_f(src_op.minmax->max);
+}
+
+void ConvertMaxPoolOperator(const MaxPoolOperator& src_op,
+                            GraphDef* tensorflow_graph) {
+  auto* maxpool_op = tensorflow_graph->add_node();
+  maxpool_op->set_op("MaxPool");
+  maxpool_op->set_name(src_op.outputs[0]);
+  *maxpool_op->add_input() = src_op.inputs[0];
+  auto& strides = (*maxpool_op->mutable_attr())["strides"];
+  strides.mutable_list()->add_i(1);
+  strides.mutable_list()->add_i(src_op.stride_height);
+  strides.mutable_list()->add_i(src_op.stride_width);
+  strides.mutable_list()->add_i(1);
+  string padding;
+  if (src_op.padding.type == PaddingType::kSame) {
+    padding = "SAME";
+  } else if (src_op.padding.type == PaddingType::kValid) {
+    padding = "VALID";
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  (*maxpool_op->mutable_attr())["padding"].set_s(padding);
+  (*maxpool_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  auto& ksize = (*maxpool_op->mutable_attr())["ksize"];
+  ksize.mutable_list()->add_i(1);
+  ksize.mutable_list()->add_i(src_op.kheight);
+  ksize.mutable_list()->add_i(src_op.kwidth);
+  ksize.mutable_list()->add_i(1);
+}
+
+void ConvertAveragePoolOperator(const AveragePoolOperator& src_op,
+                                GraphDef* tensorflow_graph) {
+  auto* avgpool_op = tensorflow_graph->add_node();
+  avgpool_op->set_op("AvgPool");
+  avgpool_op->set_name(src_op.outputs[0]);
+  *avgpool_op->add_input() = src_op.inputs[0];
+  auto& strides = (*avgpool_op->mutable_attr())["strides"];
+  strides.mutable_list()->add_i(1);
+  strides.mutable_list()->add_i(src_op.stride_height);
+  strides.mutable_list()->add_i(src_op.stride_width);
+  strides.mutable_list()->add_i(1);
+  string padding;
+  if (src_op.padding.type == PaddingType::kSame) {
+    padding = "SAME";
+  } else if (src_op.padding.type == PaddingType::kValid) {
+    padding = "VALID";
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  (*avgpool_op->mutable_attr())["padding"].set_s(padding);
+  (*avgpool_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  auto& ksize = (*avgpool_op->mutable_attr())["ksize"];
+  ksize.mutable_list()->add_i(1);
+  ksize.mutable_list()->add_i(src_op.kheight);
+  ksize.mutable_list()->add_i(src_op.kwidth);
+  ksize.mutable_list()->add_i(1);
+}
+
+void ConvertConcatenationOperator(const Model& model,
+                                  const ConcatenationOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  auto* dc_op = tensorflow_graph->add_node();
+  dc_op->set_op("ConcatV2");
+  dc_op->set_name(src_op.outputs[0]);
+  const string dummy_concat_dim = src_op.outputs[0] + "/concat_dim";
+  CreateDummyConcatDimTensorConst(dummy_concat_dim, src_op.concat_dim,
+                                  tensorflow_graph);
+  for (const auto& input : src_op.inputs) {
+    *dc_op->add_input() = input;
+  }
+  *dc_op->add_input() = dummy_concat_dim;
+  (*dc_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*dc_op->mutable_attr())["Tidx"].set_type(DT_INT32);
+  (*dc_op->mutable_attr())["N"].set_i(src_op.inputs.size());
+}
+
+void ConvertTensorFlowReshapeOperator(const Model& model,
+                                      const TensorFlowReshapeOperator& src_op,
+                                      GraphDef* tensorflow_graph) {
+  auto* reshape_op = tensorflow_graph->add_node();
+  reshape_op->set_op("Reshape");
+  reshape_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *reshape_op->add_input() = src_op.inputs[0];
+  *reshape_op->add_input() = src_op.inputs[1];
+  (*reshape_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  const auto& shape_array = model.GetArray(src_op.inputs[1]);
+  CHECK(shape_array.data_type == ArrayDataType::kInt32);
+  CHECK(shape_array.buffer != nullptr);
+  const auto& shape_data = shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CreateReshapeShapeTensorConst(src_op.inputs[1], shape_data, tensorflow_graph);
+}
+
+void ConvertL2PoolOperator(const L2PoolOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  const string square_output = src_op.outputs[0] + "/square";
+  const string avgpool_output = src_op.outputs[0] + "/avgpool";
+
+  auto* square_op = tensorflow_graph->add_node();
+  square_op->set_op("Square");
+  square_op->set_name(square_output);
+  *square_op->add_input() = src_op.inputs[0];
+  (*square_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  string padding;
+  if (src_op.padding.type == PaddingType::kSame) {
+    padding = "SAME";
+  } else if (src_op.padding.type == PaddingType::kValid) {
+    padding = "VALID";
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+
+  auto* avgpool_op = tensorflow_graph->add_node();
+  avgpool_op->set_op("AvgPool");
+  avgpool_op->set_name(avgpool_output);
+  *avgpool_op->add_input() = square_output;
+  auto& strides = (*avgpool_op->mutable_attr())["strides"];
+  strides.mutable_list()->add_i(1);
+  strides.mutable_list()->add_i(src_op.stride_height);
+  strides.mutable_list()->add_i(src_op.stride_width);
+  strides.mutable_list()->add_i(1);
+
+  (*avgpool_op->mutable_attr())["padding"].set_s(padding);
+  (*avgpool_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  auto& ksize = (*avgpool_op->mutable_attr())["ksize"];
+  ksize.mutable_list()->add_i(1);
+  ksize.mutable_list()->add_i(src_op.kheight);
+  ksize.mutable_list()->add_i(src_op.kwidth);
+  ksize.mutable_list()->add_i(1);
+
+  auto* sqrt_op = tensorflow_graph->add_node();
+  sqrt_op->set_op("Sqrt");
+  sqrt_op->set_name(src_op.outputs[0]);
+  *sqrt_op->add_input() = avgpool_output;
+  (*sqrt_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertSquareOperator(const TensorFlowSquareOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  auto* square_op = tensorflow_graph->add_node();
+  square_op->set_op("Square");
+  square_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *square_op->add_input() = src_op.inputs[0];
+  (*square_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertSqrtOperator(const TensorFlowSqrtOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* sqrt_op = tensorflow_graph->add_node();
+  sqrt_op->set_op("Sqrt");
+  sqrt_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *sqrt_op->add_input() = src_op.inputs[0];
+  (*sqrt_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertSplitOperator(const Model& model,
+                          const TensorFlowSplitOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* split_op = tensorflow_graph->add_node();
+  split_op->set_op("Split");
+  split_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *split_op->add_input() = input;
+  }
+  (*split_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*split_op->mutable_attr())["num_split"].set_i(src_op.num_split);
+  const auto& split_dim_array = model.GetArray(src_op.inputs[0]);
+  CHECK(split_dim_array.buffer);
+  CHECK(split_dim_array.data_type == ArrayDataType::kInt32);
+  const auto& split_dim_data =
+      split_dim_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(split_dim_data.size(), 1);
+  const int split_dim = split_dim_data[0];
+  CreateDummyConcatDimTensorConst(src_op.inputs[0], split_dim,
+                                  tensorflow_graph);
+}
+
+tensorflow::DataType GetTensorFlowDataType(const Model& model,
+                                           const string& array_name) {
+  auto& dtype = model.GetArray(array_name).data_type;
+  CHECK(dtype == ArrayDataType::kFloat || dtype == ArrayDataType::kInt32 ||
+        dtype == ArrayDataType::kUint8);
+  if (dtype == ArrayDataType::kFloat) {
+    return tensorflow::DT_FLOAT;
+  } else if (dtype == ArrayDataType::kInt32) {
+    return tensorflow::DT_INT32;
+  } else if (dtype == ArrayDataType::kUint8) {
+    return tensorflow::DT_UINT8;
+  } else {
+    LOG(FATAL) << "Wrong data type";
+  }
+}
+
+void ConvertCastOperator(const Model& model, const CastOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* cast_op = tensorflow_graph->add_node();
+  cast_op->set_op("Cast");
+  cast_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *cast_op->add_input() = src_op.inputs[0];
+
+  (*cast_op->mutable_attr())["DstT"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+  (*cast_op->mutable_attr())["SrcT"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+}
+
+void ConvertFloorOperator(const Model& model, const FloorOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* floor_op = tensorflow_graph->add_node();
+  floor_op->set_op("Floor");
+  floor_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *floor_op->add_input() = src_op.inputs[0];
+  (*floor_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  auto* gather_op = tensorflow_graph->add_node();
+  gather_op->set_op("Gather");
+  gather_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *gather_op->add_input() = src_op.inputs[0];
+  *gather_op->add_input() = src_op.inputs[1];
+
+  (*gather_op->mutable_attr())["Tindices"].set_type(DT_INT32);
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*gather_op->mutable_attr())["Tparams"].set_type(params_type);
+}
+
+void ConvertResizeBilinearOperator(const Model& model,
+                                   const ResizeBilinearOperator& src_op,
+                                   GraphDef* tensorflow_graph) {
+  auto* resize_op = tensorflow_graph->add_node();
+  resize_op->set_op("ResizeBilinear");
+  resize_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *resize_op->add_input() = src_op.inputs[0];
+  *resize_op->add_input() = src_op.inputs[1];
+  (*resize_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+namespace {
+// TODO(aselle): Remove when available in absl
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b) {
+  if (a.empty() || b.empty()) return absl::string_view();
+
+  const char* pa = a.data();
+  const char* pb = b.data();
+  string::difference_type count = 0;
+  const string::difference_type limit = std::min(a.size(), b.size());
+  while (count < limit && *pa == *pb) {
+    ++pa;
+    ++pb;
+    ++count;
+  }
+
+  return absl::string_view(a.data(), count);
+}
+}  // namespace
+
+void ConvertLstmCellOperator(const Model& model, const LstmCellOperator& src_op,
+                             GraphDef* tensorflow_graph) {
+  // Find the base name
+  const string base(
+      FindLongestCommonPrefix(src_op.outputs[LstmCellOperator::STATE_OUTPUT],
+                              src_op.outputs[LstmCellOperator::ACTIV_OUTPUT]));
+
+  // Concatenate inputs
+  const string concat_output = base + "basic_lstm_cell/concat";
+  // Op names have been chosen to match the tf.slim LSTM naming
+  // as closely as possible.
+  const int concat_dim =
+      model.arrays.at(src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT])
+          ->shape()
+          .dimensions_count() -
+      1;
+  // Note that DATA_INPUT may have extra size 1 dimensions, but TF concat
+  // works the same since the tensor has the same underlying data layout.
+  const string concat_dim_output = concat_output + "/concat_dim";
+  CreateDummyConcatDimTensorConst(concat_dim_output, concat_dim,
+                                  tensorflow_graph);
+  auto* concat_op = tensorflow_graph->add_node();
+  concat_op->set_op("ConcatV2");
+  concat_op->set_name(concat_output);
+  *concat_op->add_input() = src_op.inputs[LstmCellOperator::DATA_INPUT];
+  *concat_op->add_input() = src_op.inputs[LstmCellOperator::PREV_ACTIV_INPUT];
+  *concat_op->add_input() = concat_dim_output;
+  (*concat_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*concat_op->mutable_attr())["Tidx"].set_type(DT_INT32);
+  (*concat_op->mutable_attr())["N"].set_i(2);  // Number of inputs
+
+  // Write weights
+  const string weights_output = base + "weights";
+  CHECK(model.arrays.count(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]));
+  const auto& weights_array =
+      *model.arrays.at(src_op.inputs[LstmCellOperator::WEIGHTS_INPUT]);
+  // Convert 4D FullyConnected weights into 2D matrix
+  const auto& weights_shape = weights_array.shape();
+  CHECK_EQ(weights_shape.dimensions_count(), 2);
+  CHECK(weights_array.buffer);
+  CHECK(weights_array.buffer->type == ArrayDataType::kFloat);
+  const float* weights_data =
+      weights_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+  ConvertFloatTensorConst(weights_output, weights_shape, weights_data,
+                          AxesOrder::kCR, AxesOrder::kRC, tensorflow_graph);
+
+  // Fully connected matrix multiply
+  const string matmul_output = base + "MatMul";
+  auto* matmul_op = tensorflow_graph->add_node();
+  matmul_op->set_op("MatMul");
+  matmul_op->set_name(matmul_output);
+  *matmul_op->add_input() = concat_output;
+  *matmul_op->add_input() = weights_output;
+  (*matmul_op->mutable_attr())["transpose_a"].set_b(false);
+  (*matmul_op->mutable_attr())["transpose_b"].set_b(false);
+  (*matmul_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  // Write biases
+  const string biases_output = base + "biases";
+  CHECK(model.arrays.count(src_op.inputs[LstmCellOperator::BIASES_INPUT]));
+  const auto& bias_array =
+      *model.arrays.at(src_op.inputs[LstmCellOperator::BIASES_INPUT]);
+  // TODO(b/62904716) Bias arrays should be 1-D, and used directly.
+  Shape bias_shape_1d = bias_array.shape();
+  UnextendShape(&bias_shape_1d, 1);
+  CHECK(bias_array.buffer);
+  CHECK(bias_array.buffer->type == ArrayDataType::kFloat);
+  const float* bias_data =
+      bias_array.GetBuffer<ArrayDataType::kFloat>().data.data();
+  ConvertFloatTensorConst(biases_output, bias_shape_1d, bias_data,
+                          AxesOrder::kOneAxis, AxesOrder::kOneAxis,
+                          tensorflow_graph,
+                          LegacyScalarPolicy::kDoCreateLegacyScalars);
+
+  // Add biases
+  string biasadd_output = base + "BiasAdd";
+  auto* biasadd_op = tensorflow_graph->add_node();
+  biasadd_op->set_op("BiasAdd");
+  biasadd_op->set_name(biasadd_output);
+  biasadd_op->add_input(matmul_output);
+  biasadd_op->add_input(biases_output);
+  (*biasadd_op->mutable_attr())["data_format"].set_s("NHWC");
+  (*biasadd_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  // Split
+  string split_dim_output = base + "split/split_dim";
+  // The dimension is the same as the concatenation dimension
+  CreateDummyConcatDimTensorConst(split_dim_output, concat_dim,
+                                  tensorflow_graph);
+  string split_output = base + "split";
+  auto* split_op = tensorflow_graph->add_node();
+  split_op->set_op("Split");
+  split_op->set_name(split_output);
+  *split_op->add_input() = split_dim_output;
+  *split_op->add_input() = biasadd_output;
+  (*split_op->mutable_attr())["T"].set_type(DT_FLOAT);
+  (*split_op->mutable_attr())["num_split"].set_i(4);  // Split into four outputs
+
+  // Activation functions and memory computations
+  const string tanh_0_output = base + "Tanh";
+  auto* tanh_0_op = tensorflow_graph->add_node();
+  tanh_0_op->set_op("Tanh");
+  tanh_0_op->set_name(tanh_0_output);
+  *tanh_0_op->add_input() = split_output + ":1";
+  (*tanh_0_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string sigmoid_1_output = base + "Sigmoid_1";
+  auto* logistic_1_op = tensorflow_graph->add_node();
+  logistic_1_op->set_op("Sigmoid");
+  logistic_1_op->set_name(sigmoid_1_output);
+  *logistic_1_op->add_input() = split_output;
+  (*logistic_1_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string mul_1_output = base + "mul_1";
+  auto* mul_1_op = tensorflow_graph->add_node();
+  mul_1_op->set_op("Mul");
+  mul_1_op->set_name(mul_1_output);
+  *mul_1_op->add_input() = sigmoid_1_output;
+  *mul_1_op->add_input() = tanh_0_output;
+  (*mul_1_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string sigmoid_0_output = base + "Sigmoid";
+  auto* logistic_2_op = tensorflow_graph->add_node();
+  logistic_2_op->set_op("Sigmoid");
+  logistic_2_op->set_name(sigmoid_0_output);
+  *logistic_2_op->add_input() = split_output + ":2";
+  (*logistic_2_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string sigmoid_2_output = base + "Sigmoid_2";
+  auto* logistic_3_op = tensorflow_graph->add_node();
+  logistic_3_op->set_op("Sigmoid");
+  logistic_3_op->set_name(sigmoid_2_output);
+  *logistic_3_op->add_input() = split_output + ":3";
+  (*logistic_3_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string mul_0_output = base + "mul";
+  auto* mul_0_op = tensorflow_graph->add_node();
+  mul_0_op->set_op("Mul");
+  mul_0_op->set_name(mul_0_output);
+  *mul_0_op->add_input() = src_op.inputs[LstmCellOperator::PREV_STATE_INPUT];
+  *mul_0_op->add_input() = sigmoid_0_output;
+  (*mul_0_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string add_1_output = src_op.outputs[LstmCellOperator::STATE_OUTPUT];
+  auto* add_1_op = tensorflow_graph->add_node();
+  add_1_op->set_op("Add");
+  add_1_op->set_name(add_1_output);
+  *add_1_op->add_input() = mul_0_output;
+  *add_1_op->add_input() = mul_1_output;
+  (*add_1_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string tanh_1_output = base + "Tanh_1";
+  auto* tanh_1_op = tensorflow_graph->add_node();
+  tanh_1_op->set_op("Tanh");
+  tanh_1_op->set_name(tanh_1_output);
+  *tanh_1_op->add_input() = add_1_output;
+  (*tanh_1_op->mutable_attr())["T"].set_type(DT_FLOAT);
+
+  const string mul_2_output = src_op.outputs[LstmCellOperator::ACTIV_OUTPUT];
+  auto* mul_2_op = tensorflow_graph->add_node();
+  mul_2_op->set_op("Mul");
+  mul_2_op->set_name(mul_2_output);
+  *mul_2_op->add_input() = tanh_1_output;
+  *mul_2_op->add_input() = sigmoid_2_output;
+  (*mul_2_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
+void ConvertSpaceToBatchNDOperator(const Model& model,
+                                   const SpaceToBatchNDOperator& src_op,
+                                   GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("SpaceToBatchND");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 3);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+  *new_op->add_input() = src_op.inputs[2];
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+  (*new_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
+  (*new_op->mutable_attr())["Tpaddings"].set_type(DT_INT32);
+}
+
+void ConvertBatchToSpaceNDOperator(const Model& model,
+                                   const BatchToSpaceNDOperator& src_op,
+                                   GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("BatchToSpaceND");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 3);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+  *new_op->add_input() = src_op.inputs[2];
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+  (*new_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
+  (*new_op->mutable_attr())["Tcrops"].set_type(DT_INT32);
+}
+
+void ConvertPadOperator(const Model& model, const PadOperator& src_op,
+                        GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("Pad");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+
+  // Create the params tensor.
+  auto* params_op = tensorflow_graph->add_node();
+  params_op->set_op("Const");
+  params_op->set_name(src_op.inputs[1]);
+  (*params_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*params_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+
+  CHECK_EQ(src_op.left_padding.size(), src_op.right_padding.size());
+  for (int i = 0; i < src_op.left_padding.size(); ++i) {
+    tensor->add_int_val(src_op.left_padding[i]);
+    tensor->add_int_val(src_op.right_padding[i]);
+  }
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(src_op.left_padding.size());
+  shape->add_dim()->set_size(2);
+}
+
+void CreateSliceInput(const string& input_name, const std::vector<int>& values,
+                      GraphDef* tensorflow_graph) {
+  auto* params_op = tensorflow_graph->add_node();
+  params_op->set_op("Const");
+  params_op->set_name(input_name);
+  (*params_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*params_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+
+  for (int i = 0; i < values.size(); ++i) {
+    tensor->add_int_val(values[i]);
+  }
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(values.size());
+}
+
+void ConvertStridedSliceOperator(const Model& model,
+                                 const StridedSliceOperator& src_op,
+                                 GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("StridedSlice");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 4);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+  *new_op->add_input() = src_op.inputs[2];
+  *new_op->add_input() = src_op.inputs[3];
+
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+
+  (*new_op->mutable_attr())["Index"].set_type(DT_INT32);
+  (*new_op->mutable_attr())["begin_mask"].set_i(src_op.begin_mask);
+  (*new_op->mutable_attr())["ellipsis_mask"].set_i(src_op.ellipsis_mask);
+  (*new_op->mutable_attr())["end_mask"].set_i(src_op.end_mask);
+  (*new_op->mutable_attr())["new_axis_mask"].set_i(src_op.new_axis_mask);
+  (*new_op->mutable_attr())["shrink_axis_mask"].set_i(src_op.shrink_axis_mask);
+
+  // Create tensors for start/stop indices and strides.
+  CreateSliceInput(src_op.inputs[1], src_op.start_indices, tensorflow_graph);
+  CreateSliceInput(src_op.inputs[2], src_op.stop_indices, tensorflow_graph);
+  CreateSliceInput(src_op.inputs[3], src_op.strides, tensorflow_graph);
+}
+
+void ConvertSliceOperator(const Model& model, const SliceOperator& src_op,
+                          GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("Slice");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 3);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+  *new_op->add_input() = src_op.inputs[2];
+
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+  (*new_op->mutable_attr())["Index"].set_type(DT_INT32);
+
+  // Create tensors for begin and size inputs.
+  CreateSliceInput(src_op.inputs[1], src_op.begin, tensorflow_graph);
+  CreateSliceInput(src_op.inputs[2], src_op.size, tensorflow_graph);
+}
+
+void ConvertMeanOperator(const Model& model, const MeanOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("Mean");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *new_op->add_input() = src_op.inputs[0];
+  *new_op->add_input() = src_op.inputs[1];
+
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+
+  if (src_op.keep_dims) {
+    (*new_op->mutable_attr())["keep_dims"].set_b(true);
+  }
+
+  // Create the params tensor.
+  auto* params_op = tensorflow_graph->add_node();
+  params_op->set_op("Const");
+  params_op->set_name(src_op.inputs[1]);
+  (*params_op->mutable_attr())["dtype"].set_type(DT_INT32);
+  auto* tensor = (*params_op->mutable_attr())["value"].mutable_tensor();
+  tensor->set_dtype(DT_INT32);
+
+  for (int i = 0; i < src_op.reduction_indices.size(); ++i) {
+    tensor->add_int_val(src_op.reduction_indices[i]);
+  }
+  auto* shape = tensor->mutable_tensor_shape();
+  shape->add_dim()->set_size(src_op.reduction_indices.size());
+}
+
+void ConvertSqueezeOperator(const Model& model, const SqueezeOperator& src_op,
+                            GraphDef* tensorflow_graph) {
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("Squeeze");
+  new_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *new_op->add_input() = src_op.inputs[0];
+
+  const auto params_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(params_type);
+
+  auto& squeeze_dims = (*new_op->mutable_attr())["squeeze_dims"];
+  for (int i : src_op.squeeze_dims) {
+    squeeze_dims.mutable_list()->add_i(i);
+  }
+}
+
+void ConvertSubOperator(const Model& model, const SubOperator& src_op,
+                        GraphDef* tensorflow_graph) {
+  auto* sub_op = tensorflow_graph->add_node();
+  sub_op->set_op("Sub");
+  sub_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *sub_op->add_input() = src_op.inputs[0];
+  *sub_op->add_input() = src_op.inputs[1];
+  const auto data_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*sub_op->mutable_attr())["T"].set_type(data_type);
+}
+
+void ConvertTensorFlowMinimumOperator(const Model& model,
+                                      const TensorFlowMinimumOperator& src_op,
+                                      GraphDef* tensorflow_graph) {
+  auto* sub_op = tensorflow_graph->add_node();
+  sub_op->set_op("Minimum");
+  sub_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *sub_op->add_input() = src_op.inputs[0];
+  *sub_op->add_input() = src_op.inputs[1];
+  const auto data_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*sub_op->mutable_attr())["T"].set_type(data_type);
+}
+
+void ConvertTensorFlowMaximumOperator(const Model& model,
+                                      const TensorFlowMaximumOperator& src_op,
+                                      GraphDef* tensorflow_graph) {
+  auto* sub_op = tensorflow_graph->add_node();
+  sub_op->set_op("Maximum");
+  sub_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 2);
+  *sub_op->add_input() = src_op.inputs[0];
+  *sub_op->add_input() = src_op.inputs[1];
+  const auto data_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*sub_op->mutable_attr())["T"].set_type(data_type);
+}
+
+void ConvertOperator(const Model& model, const Operator& src_op,
+                     GraphDef* tensorflow_graph) {
+  if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
+    LOG(FATAL)
+        << "Unsupported: the input model has a fused activation function";
+  }
+
+  if (src_op.type == OperatorType::kConv) {
+    ConvertConvOperator(model, static_cast<const ConvOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kDepthwiseConv) {
+    ConvertDepthwiseConvOperator(
+        model, static_cast<const DepthwiseConvOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kDepthToSpace) {
+    ConvertDepthToSpaceOperator(
+        model, static_cast<const DepthToSpaceOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSpaceToDepth) {
+    ConvertSpaceToDepthOperator(
+        model, static_cast<const SpaceToDepthOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFullyConnected) {
+    ConvertFullyConnectedOperator(
+        model, static_cast<const FullyConnectedOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kAdd) {
+    ConvertAddOperator(model, static_cast<const AddOperator&>(src_op),
+                       tensorflow_graph);
+  } else if (src_op.type == OperatorType::kMul) {
+    ConvertMulOperator(model, static_cast<const MulOperator&>(src_op),
+                       tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRelu) {
+    ConvertReluOperator(static_cast<const ReluOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRelu1) {
+    ConvertRelu1Operator(static_cast<const Relu1Operator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRelu6) {
+    ConvertRelu6Operator(static_cast<const Relu6Operator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLogistic) {
+    ConvertLogisticOperator(static_cast<const LogisticOperator&>(src_op),
+                            tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTanh) {
+    ConvertTanhOperator(static_cast<const TanhOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kL2Normalization) {
+    ConvertL2NormalizationOperator(
+        static_cast<const L2NormalizationOperator&>(src_op), tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSoftmax) {
+    ConvertSoftmaxOperator(model, static_cast<const SoftmaxOperator&>(src_op),
+                           tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLocalResponseNormalization) {
+    ConvertLocalResponseNormalizationOperator(
+        static_cast<const LocalResponseNormalizationOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLstmCell) {
+    ConvertLstmCellOperator(model, static_cast<const LstmCellOperator&>(src_op),
+                            tensorflow_graph);
+  } else if (src_op.type == OperatorType::kMaxPool) {
+    ConvertMaxPoolOperator(static_cast<const MaxPoolOperator&>(src_op),
+                           tensorflow_graph);
+  } else if (src_op.type == OperatorType::kAveragePool) {
+    ConvertAveragePoolOperator(static_cast<const AveragePoolOperator&>(src_op),
+                               tensorflow_graph);
+  } else if (src_op.type == OperatorType::kConcatenation) {
+    ConvertConcatenationOperator(
+        model, static_cast<const ConcatenationOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowReshape) {
+    ConvertTensorFlowReshapeOperator(
+        model, static_cast<const TensorFlowReshapeOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kL2Pool) {
+    ConvertL2PoolOperator(static_cast<const L2PoolOperator&>(src_op),
+                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowSquare) {
+    ConvertSquareOperator(static_cast<const TensorFlowSquareOperator&>(src_op),
+                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowSqrt) {
+    ConvertSqrtOperator(static_cast<const TensorFlowSqrtOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowSplit) {
+    ConvertSplitOperator(model,
+                         static_cast<const TensorFlowSplitOperator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFakeQuant) {
+    ConvertFakeQuantOperator(static_cast<const FakeQuantOperator&>(src_op),
+                             tensorflow_graph);
+  } else if (src_op.type == OperatorType::kCast) {
+    ConvertCastOperator(model, static_cast<const CastOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kFloor) {
+    ConvertFloorOperator(model, static_cast<const FloorOperator&>(src_op),
+                         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kGather) {
+    ConvertGatherOperator(model, static_cast<const GatherOperator&>(src_op),
+                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kResizeBilinear) {
+    ConvertResizeBilinearOperator(
+        model, static_cast<const ResizeBilinearOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSpaceToBatchND) {
+    ConvertSpaceToBatchNDOperator(
+        model, static_cast<const SpaceToBatchNDOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kBatchToSpaceND) {
+    ConvertBatchToSpaceNDOperator(
+        model, static_cast<const BatchToSpaceNDOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kPad) {
+    ConvertPadOperator(model, static_cast<const PadOperator&>(src_op),
+                       tensorflow_graph);
+  } else if (src_op.type == OperatorType::kStridedSlice) {
+    ConvertStridedSliceOperator(
+        model, static_cast<const StridedSliceOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kMean) {
+    ConvertMeanOperator(model, static_cast<const MeanOperator&>(src_op),
+                        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSub) {
+    ConvertSubOperator(model, static_cast<const SubOperator&>(src_op),
+                       tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowMinimum) {
+    ConvertTensorFlowMinimumOperator(
+        model, static_cast<const TensorFlowMinimumOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowMaximum) {
+    ConvertTensorFlowMaximumOperator(
+        model, static_cast<const TensorFlowMaximumOperator&>(src_op),
+        tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSqueeze) {
+    ConvertSqueezeOperator(model, static_cast<const SqueezeOperator&>(src_op),
+                           tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSlice) {
+    ConvertSliceOperator(model, static_cast<const SliceOperator&>(src_op),
+                         tensorflow_graph);
+  } else {
+    LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
+  }
+}
+
+void AddPlaceholder(const string& name, ArrayDataType type,
+                    GraphDef* tensorflow_graph) {
+  auto* placeholder = tensorflow_graph->add_node();
+  placeholder->set_op("Placeholder");
+  switch (type) {
+    case ArrayDataType::kBool:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_BOOL);
+      break;
+    case ArrayDataType::kFloat:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_FLOAT);
+      break;
+    case ArrayDataType::kUint8:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_UINT8);
+      break;
+    case ArrayDataType::kInt32:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT32);
+      break;
+    case ArrayDataType::kInt64:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT64);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
+  }
+  placeholder->set_name(name);
+}
+
+void AddPlaceholderForRNNState(const Model& model, const string& name, int size,
+                               GraphDef* tensorflow_graph) {
+  auto* placeholder = tensorflow_graph->add_node();
+  placeholder->set_op("Placeholder");
+  placeholder->set_name(name);
+  (*placeholder->mutable_attr())["dtype"].set_type(DT_FLOAT);
+
+  auto* shape = (*placeholder->mutable_attr())["shape"].mutable_shape();
+  const auto& state_array = *model.arrays.at(name);
+  if (state_array.has_shape()) {
+    const auto& state_shape = state_array.shape();
+    const int kDims = state_shape.dimensions_count();
+    for (int i = 0; i < kDims; ++i) {
+      shape->add_dim()->set_size(state_shape.dims(i));
+    }
+  } else {
+    shape->add_dim()->set_size(1);
+    shape->add_dim()->set_size(size);
+  }
+}
+
+void ExportTensorFlowGraphDefImplementation(const Model& model,
+                                            GraphDef* tensorflow_graph) {
+  for (const auto& input_array : model.flags.input_arrays()) {
+    AddPlaceholder(input_array.name(),
+                   model.arrays.at(input_array.name())->data_type,
+                   tensorflow_graph);
+  }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    AddPlaceholderForRNNState(model, rnn_state.state_array(), rnn_state.size(),
+                              tensorflow_graph);
+  }
+  for (const auto& op : model.operators) {
+    ConvertOperator(model, *op, tensorflow_graph);
+  }
+  // Generically export arrays that haven't been exported already
+  // by the above operators export. It's important that this comes
+  // after, as some operators need to export arrays that they reference
+  // in a specific way, rather than in the generic way done below.
+  for (const auto& array_pair : model.arrays) {
+    const string& array_name = array_pair.first;
+    const auto& array = *array_pair.second;
+    if (array.buffer) {
+      switch (array.data_type) {
+        case ArrayDataType::kFloat:
+          ConvertFloatTensorConst(model, array_name, tensorflow_graph);
+          break;
+        case ArrayDataType::kInt32:
+          ConvertIntTensorConst(model, array_name, tensorflow_graph);
+          break;
+        default:
+          break;
+      }
+    }
+  }
+}
+}  // namespace
+
+void ExportTensorFlowGraphDef(const Model& model,
+                              string* output_file_contents) {
+  CHECK(output_file_contents->empty());
+  GraphDef tensorflow_graph;
+  ExportTensorFlowGraphDefImplementation(model, &tensorflow_graph);
+  LogDumpGraphDef(kLogLevelModelChanged, "AT EXPORT", tensorflow_graph);
+  CHECK(tensorflow_graph.SerializeToString(output_file_contents));
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.h b/tensorflow/contrib/lite/toco/export_tensorflow.h
new file mode 100644
index 0000000000000000000000000000000000000000..eca97745767387a04bcd2c8deb579928edf2497c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.h
@@ -0,0 +1,27 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
+
+#include <string>
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+void ExportTensorFlowGraphDef(const Model& model, string* output_file_contents);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_EXPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/format_port.h b/tensorflow/contrib/lite/toco/format_port.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e999001e0e35fb916b11db199dbf28572685f3d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/format_port.h
@@ -0,0 +1,77 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file is used to provide equivalents of internal util::format::FormatF
+// and util::format::AppendF. Unfortunately, type safety is not as good as a
+// a full C++ example.
+// TODO(aselle): When absl adds support for StrFormat, use that instead.
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
+
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace toco {
+namespace port {
+
+/// Identity (default case)
+template <class T>
+T IdentityOrConvertStringToRaw(T foo) {
+  return foo;
+}
+
+// Overloaded case where we return std::string.
+inline const char* IdentityOrConvertStringToRaw(const std::string& foo) {
+  return foo.c_str();
+}
+
+#if defined(PLATFORM_GOOGLE) && defined(HAS_GLOBAL_STRING)
+// Overloaded case where we return string.
+inline const char* IdentityOrConvertStringToRaw(const string& foo) {
+  return foo.c_str();
+}
+#endif  // PLATFORM_GOOGLE
+// Delegate to TensorFlow Appendf function until absl has an equivalent.
+template <typename... Args>
+inline void AppendFHelper(string* destination, const char* fmt,
+                          Args&&... args) {
+  tensorflow::strings::Appendf(destination, fmt, args...);
+}
+
+// Specialization for no argument format string (avoid security bug).
+inline void AppendFHelper(string* destination, const char* fmt) {
+  tensorflow::strings::Appendf(destination, "%s", fmt);
+}
+
+// Append formatted string (with format fmt and args args) to the string
+// pointed to by destination. fmt follows C printf semantics.
+// One departure is that %s can be driven by a std::string or string.
+template <typename... Args>
+inline void AppendF(string* destination, const char* fmt, Args&&... args) {
+  AppendFHelper(destination, fmt, IdentityOrConvertStringToRaw(args)...);
+}
+
+// Return formatted string (with format fmt and args args). fmt follows C printf
+// semantics. One departure is that %s can be driven by a std::string or string.
+template <typename... Args>
+inline string StringF(const char* fmt, Args&&... args) {
+  string result;
+  AppendFHelper(&result, fmt, IdentityOrConvertStringToRaw(args)...);
+  return result;
+}
+
+}  // namespace port
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_FORMAT_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e152f5ba887088c98055596f8245b82fbc86eaa
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -0,0 +1,490 @@
+# TensorFlow Lite Optimizing Converter command-line examples
+
+This page is a guide to using the TensorFlow Lite Optimizing Converter by
+looking at some example command lines. It is complemented by the following other
+documents:
+
+*   [README](../README.md)
+*   [Command-line reference](cmdline_reference.md)
+
+Table of contents:
+
+[TOC]
+
+## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference
+
+In this example, we look at the most common task: we have an ordinary TensorFlow
+GraphDef and want to convert it to a TensorFlow Lite flatbuffer to perform
+floating-point inference.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.lite \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=FLOAT \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+To explain each of these flags:
+
+*   `--input_format` and `--output_format` determine the formats of the input
+    and output files: here we are converting from `TENSORFLOW_GRAPHDEF` to
+    `TFLITE`.
+*   `--input_file` specifies the path of the input file, to be converted. When
+    `--input_format=TENSORFLOW_GRAPHDEF`, this file should be a
+    *[frozen](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)*
+    *inference* graph. Being frozen means in particular that the input file is
+    self-contained, and does not reference any external "checkpoint" file. An
+    *inference* graph is a version of a graph meant to be used for inference,
+    typically not the same graph file as was used for training a given model.
+*   `--output_file` specifies the destination to write the converted file to.
+*   `--input_array` specifies the input activations, that is, the input "tensor"
+    in the input TensorFlow GraphDef file. The array designated by
+    `--input_array` is the one that the user will have to provide the contents
+    of as input to the runtime inference code.
+*   `--output_array` specifies the output activations, that is, the output
+    "tensor" in the input TensorFlow GraphDef file. The runtime inference code
+    will store its results in the array designated by `--output_array`.
+*   `--input_shape` specifies the shape of the input array. It is currently
+    required, but the plan is for a future version to no longer require it,
+    allowing to defer the specification of the input shape until runtime. The
+    format of `input_shape` is always a comma-separated list of dimensions,
+    always in TensorFlow convention.
+*   `--inference_type` specifies what type of arithmetic the output file should
+    be relying on. It implies in particular the choice of type of the output
+    arrays in the output file.
+
+## Just optimize a TensorFlow GraphDef
+
+The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
+`--input_format` and `--output_format`. This means that conversion from and to
+any supported format is possible, and in particular, same-format "conversions"
+are possible, and effectively ask the converter to optimize and simplify a
+graph. Example:
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+Here we did not pass `--inference_type` because it is not considered applicable
+to the TensorFlow GraphDef format (as far as we are concerned, TensorFlow
+GraphDefs are technically always float, and the only flavor of "quantized"
+GraphDef that the converter deals with is "FakeQuantized" graphs that are still
+technically float graphs).
+
+Below in the section about passing arbitrary input/output arrays we give another
+example, using the converter to extract just a sub-graph from a TensorFlow
+GraphDef.
+
+## Convert a TensorFlow Lite flatbuffer back into TensorFlow GraphDef format
+
+As we mentioned that the converter supports file format conversions in any
+direction, let us just give an example of that:
+
+```
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/foo.lite \
+  --output_file=/tmp/foo.pb \
+  --input_format=TFLITE \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+## Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference
+
+Let us now look at a quantized model. As mentioned above, the only flavor of
+quantized TensorFlow GraphDefs that the converter is concerned with, is
+"FakeQuantized" models. These are technically float models, but with special
+`FakeQuant*` ops inserted at the boundaries of fused layers to record min-max
+range information allowing to generate a quantized inference workload that is
+able to reproduce exactly the specific quantization behavior that was used
+during training. Indeed, the whole point of quantized training is to allow for
+both training and inference to perform exactly the same arithmetic, so that the
+way that the training process about around quantization inaccuracy is
+effectively helping the quantized inference process to be more accurate.
+
+Given a quantized TensorFlow GraphDef, generating a quantized TensorFlow Lite
+flatbuffer is done like this:
+
+```
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/some_quantized_graph.pb \
+  --output_file=/tmp/foo.lite \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=QUANTIZED_UINT8 \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1 \
+  --mean_value=128 \
+  --std_value=127
+```
+
+Here, besides changing `--input_file` to point to a (fake-)quantized GraphDef,
+the only other changes are:
+
+*   To change `--inference_type` to `QUANTIZED_UINT8`. This effectively tells
+    the converter to generate an output file that performs quantized inference
+    on a quantized input.
+*   To pass `--mean_value` and `--std_value` flags to describe how the quantized
+    uint8 input array values are to be interpreted as the mathematical real
+    numbers that the graph is concerned with (keep in mind that even a
+    "fake-quantized" TensorFlow GraphDef is still technically a float graph).
+    The meaning of `--mean_value` and `--std_value` is explained in the
+    command-line reference; it suffices for now to say that they are a property
+    of each model.
+
+## Use dummy-quantization to try out quantized inference on a float graph
+
+Sometimes, one only has a plain float graph, and one is curious as to how much
+faster inference might run if one could perform quantized inference instead of
+float inference. Rather than requiring users to first invest in quantizing their
+graphs before they can evaluate a possible benefit, the converter allows to
+simply experiment with what we call "dummy quantization": provide some vaguely
+plausible values for the min-max ranges of values in all arrays that do not have
+min-max information, so that quantization can carry on, certainly producing
+inaccurate results (do not use that in production!) but with performance
+characteristics that should be identical to those of an actually quantized
+flavor of the model.
+
+In the present example, we have a model using Relu6 activation functions almost
+everywhere, so a reasonable guess is that most activation ranges should be
+contained in [0, 6] and roughly comparable to it.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.cc \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=QUANTIZED_UINT8 \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1 \
+  --default_ranges_min=0 \
+  --default_ranges_max=6 \
+  --mean_value=127.5 \
+  --std_value=127.5
+```
+
+## Multiple output arrays
+
+Some models have multiple outputs. Even in a model with only one output, you may
+want for the inference code to return the contents of other arrays as well, or
+to perform inference on a subgraph with multiple outputs (see the section below
+on specifying arbitrary arrays as input/output arrays).
+
+Either way, using `--output_arrays` instead of `--output_array` allows to
+specify a comma-separated list of output arrays.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.lite \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=FLOAT \
+  --input_shape=1,224,224,3 \
+  --input_array=input \
+  --output_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu
+```
+
+## Multiple input arrays
+
+Some models have multiple inputs; even in a model with a single input, you may
+want for the inference code to implement only a subgraph with multiple inputs
+(see the section below on specifying arbitrary arrays as input/output arrays).
+
+Either way, multiple input arrays are specified by using `--input_arrays`
+instead of `--input_array` to specify a comma-separated list of input arrays. In
+that case, one also needs to use `--input_shapes` instead of `--input_shape`.
+The syntax for `--input_shapes` is a bit trickier, since already the singular
+`--input_shape` was a comma-separated list of integers! Multiple input shapes
+are delimited by a colon (`:`) in `--input_shapes`.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.lite \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=FLOAT \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_array=InceptionV1/Logits/Predictions/Reshape_1
+```
+
+## Specifying arbitrary arrays in a graph as input or output arrays
+
+Any array in the input file can be specified as an input or output array. This
+allows to use the converter to extract a sub-graph out of the input graph file.
+The converter then automatically discards any part of the graph that is not
+needed for the subgraph identified by the specified input and output arrays.
+Another use case for specifying multiple output arrays is to get inference code
+to return the contents of some specified intermediate activations array, not
+just the output activations.
+
+In order to know which array you want to pass as `--input_arrays` /
+`--output_arrays`, it helps to have a visualization of the graph. See the
+section below on graph visualization. When using graph visualization for that
+purpose, make sure to use `--dump_graphviz=` to visualize exactly the graph as
+it is in the actual final form being exported to the output file.
+
+Note that the final representation of an on-device inference workload (say, in
+TensorFlow Lite flatbuffers format) tends to have coarser granularity than the
+very fine granularity of the TensorFlow GraphDef representation. For example,
+while a fully-connected layer is typically represented as at least four separate
+ops in TensorFlow GraphDef (Reshape, MatMul, BiasAdd, Relu...), it is typically
+represented as a single "fused" op (FullyConnected) in the converter's optimized
+representation and in the final on-device representation (e.g. in TensorFlow
+Lite flatbuffer format). As the level of granularity gets coarser, some
+intermediate arrays (say, the array between the MatMul and the BiasAdd in the
+TensorFlow GraphDef) are dropped. When specifying intermediate arrays as
+`--input_arrays` / `--output_arrays`, it is generally at least desirable (and
+often required) to specify arrays that are meant to survive in the final form of
+the graph, after fusing. These are typically the outputs of activation functions
+(since everything in each layer until the activation function tends to get
+fused).
+
+Here is an example of extracting just a sub-graph, namely just a single fused
+layer, out of a TensorFlow GraphDef, and exporting a TensorFlow GraphDef
+containing just that subgraph:
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+```
+
+## Logging
+
+### Standard logging
+
+The converter generates some informative log messages during processing. The
+easiest way to view them is to add `--logtostderr` to command lines. For the
+previous example, that gives:
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.lite \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=FLOAT \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1 \
+  --logtostderr
+```
+
+After some initialization messages, we get the following informative messages:
+
+```
+I1101 21:51:33.297475    5339 graph_transformations.cc:39] Before general graph transformations: 416 operators, 583 arrays (0 quantized)
+I1101 21:51:33.308972    5339 graph_transformations.cc:39] After general graph transformations pass 1: 31 operators, 89 arrays (0 quantized)
+I1101 21:51:33.309204    5339 graph_transformations.cc:39] Before dequantization graph transformations: 31 operators, 89 arrays (0 quantized)
+I1101 21:51:33.309368    5339 allocate_transient_arrays.cc:312] Total transient array allocated size: 1048576 bytes, theoretical optimal value: 786432 bytes.
+I1101 21:51:33.309484    5339 toco_tooling.cc:249] Estimated count of arithmetic ops: 0.099218 billion (note that a multiply-add is counted as 2 ops).
+```
+
+### Verbose logging
+
+For debugging purposes, the converter supports two levels of verbose logging,
+which can be set by passing a `--v=` flag:
+
+*   At `--v=1`, the converter generates text dumps of the graph at various
+    points during processing, as well as log messages about every graph
+    transformation that did take place, typically answering questions of the
+    form "why was my graph transformed in this way"?
+*   At `--v=2`, the converter additionally generates log messages about graph
+    transformations that were considered but not actually performed, typically
+    answering questions of the form "why was my graph NOT transformed when I
+    expected it would be?".
+
+### Graph "video" logging
+
+When `--dump_graphviz=` is used (see the section on Graph visualizations), one
+may additionally pass `--dump_graphviz_video`, which causes a graph
+visualization to be dumped after each individual graph transformations, often
+resulting in thousands of files. Typically, one would then bisect into these
+files to understand when a given change was introduced in the graph.
+
+## Graph visualizations
+
+The converter is able to export a graph to the GraphViz Dot format, for easy
+visualization. Combined with the converter's ability to transform the graph into
+a simpler, coarser-granularity representation, that makes it a very powerful
+visualization tool.
+
+There are two ways to get the converter to export a GraphViz Dot file,
+corresponding to two separate use cases. Understanding the difference between
+them is key to getting useful graph visualizations.
+
+### Using `--output_format=GRAPHVIZ_DOT`
+
+The first way to get a graphviz rendering is to pass
+`--output_format=GRAPHVIZ_DOT`, instead of the `--output_format` that you would
+otherwise use. This says: "I just want to get a plausible visualization of that
+graph". The upside is that it makes for very simple command lines, and makes the
+converter very lax about aspects of the graph or the command line that it would
+otherwise complain about. Example:
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.dot \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=GRAPHVIZ_DOT \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+The resulting `.dot` file can be rendered into a PDF as follows:
+
+```
+dot -Tpdf -O /tmp/foo.dot
+```
+
+And the resulting `.dot.pdf` can be viewed in any PDF viewer, but we suggest one
+with a good ability to pan and zoom across a very large page; Google Chrome does
+well in that respect.
+
+```
+google-chrome /tmp/foo.dot.pdf
+```
+
+Example PDF files are viewable online in the next section.
+
+### Using `--dump_graphviz=`
+
+The second way to get a graphviz rendering is to pass a `--dump_graphviz=` flag
+specifying a destination directory to dump GraphViz rendering to. Unlike the
+previous approach, this one allows you to keep your real command-line (with your
+real `--output_format` and other flags) unchanged, just appending a
+`--dump_graphviz=` flag to it. This says: "I want visualizations of the actual
+graph during this specific conversion process". Example:
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
+  --output_file=/tmp/foo.lite \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --inference_type=FLOAT \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1 \
+  --dump_graphviz=/tmp
+```
+
+This generates a few files in the destination directory, here `/tmp`. Most
+important are these two files:
+
+```
+/tmp/toco_AT_IMPORT.dot
+/tmp/toco_AFTER_TRANSFORMATIONS.dot
+```
+
+`toco_AT_IMPORT.dot` represents the graph as it was imported from
+`--input_file`, before any transformation was applied to it (besides some
+transformations that are applied immediately while importing). This tends to be
+a complex visualization with limited information, but is useful especially in
+situations where a conversion command fails (this file is generated even if the
+conversion subsequently fails).
+
+`toco_AFTER_TRANSFORMATIONS.dot` represents the graph after all transformations
+were applied to it, just before it was exported to the `--output_file`.
+Typically, this is a much smaller graph, and it conveys much more information
+about each node.
+
+Again, these can be rendered to PDFs:
+
+```
+dot -Tpdf -O /tmp/toco_*.dot
+```
+
+The resulting files can be seen here:
+
+*   [toco_AT_IMPORT.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AT_IMPORT.dot.pdf)
+*   [toco_AFTER_TRANSFORMATIONS.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AFTER_TRANSFORMATIONS.dot.pdf).
+
+### Legend for the graph visualizations
+
+*   Operators are red square boxes with the following hues of red:
+    *   Most operators are
+        <span style="background-color:#db4437;color:white;border:1px;border-style:solid;border-color:black;padding:1px">bright
+        red</span>.
+    *   Some typically heavy operators (e.g. Conv) are rendered in a
+        <span style="background-color:#c53929;color:white;border:1px;border-style:solid;border-color:black;padding:1px">darker
+        red</span>.
+*   Arrays are octogons with the following colors:
+    *   Constant arrays are
+        <span style="background-color:#4285f4;color:white;border:1px;border-style:solid;border-color:black;padding:1px">blue</span>.
+    *   Activation arrays are gray:
+        *   Internal (intermediate) activation arrays are
+            <span style="background-color:#f5f5f5;border:1px;border-style:solid;border-color:black;border:1px;border-style:solid;border-color:black;padding:1px">light
+            gray</span>.
+        *   Those activation arrays that are designated as `--input_arrays` or
+            `--output_arrays` are
+            <span style="background-color:#9e9e9e;border:1px;border-style:solid;border-color:black;padding:1px">dark
+            gray</span>.
+    *   RNN state arrays are green. Because of the way that the converter
+        represents RNN back-edges explicitly, each RNN state is represented by a
+        pair of green arrays:
+        *   The activation array that is the source of the RNN back-edge (i.e.
+            whose contents are copied into the RNN state array after having been
+            computed) is
+            <span style="background-color:#b7e1cd;border:1px;border-style:solid;border-color:black;padding:1px">light
+            green</span>.
+        *   The actual RNN state array is
+            <span style="background-color:#0f9d58;color:white;border:1px;border-style:solid;border-color:black;padding:1px">dark
+            green</span>. It is the destination of the RNN back-edge updating
+            it.
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
new file mode 100644
index 0000000000000000000000000000000000000000..4776741ab9273cf3b2ef0c63a6dbfdea5475b057
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -0,0 +1,255 @@
+# TensorFlow Lite Optimizing Converter command-line reference
+
+This page is complete reference of command-line flags. It is complemented by the
+following other documents:
+
+*   [README](../README.md)
+*   [Command-line examples](cmdline_examples.md)
+
+Table of contents:
+
+[TOC]
+
+## High-level overview
+
+A full list and detailed specification of all flags is given in the next
+section. For now we focus on a higher-level description of command lines:
+
+```
+toco \
+  --input_format=... \
+  --output_format=... \
+  --input_file=... \
+  --output_file=... \
+  [model flags...] \
+  [transformation flags...] \
+  [logging flags...]
+```
+
+In other words, the converter requires at least the following mandatory flags:
+`--input_format`, `--output_format`, `--input_file`, `--output_file`. Depending
+on the input and output formats, additional flags may be allowed or mandatory:
+
+*   *Model flags* provide additional information about the model stored in the
+    input file.
+    *   `--output_array` or `--output_arrays` specify which arrays in the input
+        file are to be considered the output activations.
+    *   `--input_array` or `--input_arrays` specify which arrays in the input
+        file are to be considered the input activations.
+    *   `--input_shape` or `--input_shapes` specify the shapes of the input
+        arrays.
+    *   `--input_data_type` or `--input_data_types` specify the data types of
+        input arrays, which can be used if the input file does not already
+        specify them.
+    *   `--mean_value` or `--mean_values`, and `--std_value` or `--std_values`,
+        give the dequantization parameters of the input arrays, for the case
+        when the output file will accept quantized input arrays.
+*   *Transformation flags* specify options of the transformations to be applied
+    to the graph, i.e. they specify requested properties that the output file
+    should have.
+    *   `--inference_type` specifies the type of real-numbers arrays in the
+        output file. This only affects arrays of real numbers and allows to
+        control their quantization or dequantization, effectively switching
+        between floating-point and quantized arithmetic for the inference
+        workload, as far as real numbers are concerned. Other data types are
+        unaffected (e.g. plain integers, and strings).
+    *   `--inference_input_type` is like `--inference_type` but specifically
+        controlling input arrays, separately from other arrays. If not
+        specified, then `--inference_type` is used. The use case for specifying
+        `--inference_input_type` is when one wants to perform floating-point
+        inference on a quantized input, as is common in image models operating
+        on bitmap image inputs.
+    *   Some transformation flags allow to carry on with quantization when the
+        input graph is not properly quantized: `--default_ranges_min`,
+        `--default_ranges_max`, `--drop_fake_quant`,
+        `--reorder_across_fake_quant`.
+*   *Logging flags* described below.
+
+## Command-line flags complete reference
+
+### Mandatory flags
+
+*   `--input_format`. Type: string. Specifies the format of the input file.
+    Allowed values:
+    *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Both
+        binary and text proto formats are allowed.
+    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
+*   `--output_format`. Type: string. Specifies the format of the output file.
+    Allowed values:
+    *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Always
+        produces a file in binary (not text) proto format.
+    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
+        *   Whether a float or quantized TensorFlow Lite file will be produced
+            depends on the `--inference_type` flag.
+    *   `GRAPHVIZ_DOT` &mdash; The GraphViz `.dot` format. This asks the
+        converter to generate a reasonable graphical representation of the graph
+        after simplification by a generic set of transformation.
+        *   A typical `dot` command line to view the resulting graph might look
+            like: `dot -Tpdf -O file.dot`.
+        *   Note that since passing this `--output_format` means losing the
+            information of which output format you actually care about, and
+            since the converter's transformations depend on the specific output
+            format, the resulting visualization may not fully reflect what you
+            would get on the actual output format that you are using. To avoid
+            that concern, and generally to get a visualization of exactly what
+            you get in your actual output format as opposed to just a merely
+            plausible visualization of a model, consider using `--dump_graphviz`
+            instead and keeping your true `--output_format`.
+*   `--input_file`. Type: string. Specifies the path of the input file. This may
+    be either an absolute or a relative path.
+*   `--output_file`. Type: string. Specifies the path of the output file.
+
+### Model flags
+
+*   `--output_array`. Type: string. Specifies a single array as the output
+    activations. Incompatible with `--output_arrays`.
+*   `--output_arrays`. Type: comma-separated list of strings. Specifies a list
+    of arrays as the output activations, for models with multiple outputs.
+    Incompatible with `--output_array`.
+*   `--input_array`. Type: string. Specifies a single array as the input
+    activations. Incompatible with `--input_arrays`.
+*   `--input_arrays`. Type: comma-separated list of strings. Specifies a list of
+    arrays as the input activations, for models with multiple inputs.
+    Incompatible with `--input_array`.
+
+When `--input_array` is used, the following flags are available to provide
+additional information about the single input array:
+
+*   `--input_shape`. Type: comma-separated list of integers. Specifies the shape
+    of the input array, in TensorFlow convention: starting with the outer-most
+    dimension (the dimension corresponding to the largest offset stride in the
+    array layout), ending with the inner-most dimension (the dimension along
+    which array entries are typically laid out contiguously in memory).
+    *   For example, a typical vision model might pass
+        `--input_shape=1,60,80,3`, meaning a batch size of 1 (no batching), an
+        input image height of 60, an input image width of 80, and an input image
+        depth of 3, for the typical case where the input image is a RGB bitmap
+        (3 channels, depth=3) stored by horizontal scanlines (so 'width' is the
+        next innermost dimension after 'depth').
+*   `--mean_value` and `--std_value`. Type: floating-point. The decimal point
+    character is always the dot (`.`) regardless of the locale. These specify
+    the (de-)quantization parameters of the input array, when it is quantized.
+    *   The meaning of mean_value and std_value is as follows: each quantized
+        value in the quantized input array will be interpreted as a mathematical
+        real number (i.e. as an input activation value) according to the
+        following formula:
+        *   `real_value = (quantized_input_value - mean_value) / std_value`.
+    *   When performing float inference (`--inference_type=FLOAT`) on a
+        quantized input, the quantized input would be immediately dequantized by
+        the inference code according to the above formula, before proceeding
+        with float inference.
+    *   When performing quantized inference
+        (`--inference_type=QUANTIZED_UINT8`), no dequantization is ever to be
+        performed by the inference code; however, the quantization parameters of
+        all arrays, including those of the input arrays as specified by
+        mean_value and std_value, all participate in the determination of the
+        fixed-point multipliers used in the quantized inference code.
+
+When `--input_arrays` is used, the following flags are available to provide
+additional information about the multiple input arrays:
+
+*   `--input_shapes`. Type: colon-separated list of comma-separated lists of
+    integers. Each comma-separated list of integer gives the shape of one of the
+    input arrays specified in `--input_arrays`, in the same order. See
+    `--input_shape` for details.
+    *   Example: `--input_arrays=foo,bar --input_shapes=2,3:4,5,6` means that
+        there are two input arrays. The first one, "foo", has shape [2,3]. The
+        second one, "bar", has shape [4,5,6].
+*   `--mean_values`, `--std_values`. Type: comma-separated lists of
+    floating-point numbers. Each number gives the corresponding value for one of
+    the input arrays specified in `--input_arrays`, in the same order. See
+    `--mean_value`, `--std_value` for details.
+
+### Transformation flags
+
+*   `--inference_type`. Type: string. Sets the type of real-number arrays in the
+    output file, that is, controls the representation (quantization) of real
+    numbers in the output file, except for input arrays, which are controlled by
+    `--inference_input_type`.
+
+    This flag only impacts real-number arrays. By "real-number" we mean float
+    arrays, and quantized arrays. This excludes plain integer arrays, strings
+    arrays, and every other data type.
+
+    For real-number arrays, the impact of this flag is to allow the output file
+    to choose a different real-numbers representation (quantization) from what
+    the input file used. For any other types of arrays, changing the data type
+    would not make sense.
+
+    Specifically:
+
+    *   If `FLOAT`, then real-numbers arrays will be of type float in the output
+        file. If they were quantized in the input file, then they get
+        dequantized.
+    *   If `QUANTIZED_UINT8`, then real-numbers arrays will be quantized as
+        uint8 in the output file. If they were float in the input file, then
+        they get quantized.
+    *   If not set, then all real-numbers arrays retain the same type in the
+        output file as they have in the input file.
+
+*   `--inference_input_type`. Type: string. Similar to inference_type, but
+    allows to control specifically the quantization of input arrays, separately
+    from other arrays.
+
+    If not set, then the value of `--inference_type` is implicitly used, i.e. by
+    default input arrays are quantized like other arrays.
+
+    Like `--inference_type`, this only affects real-number arrays. By
+    "real-number" we mean float arrays, and quantized arrays. This excludes
+    plain integer arrays, strings arrays, and every other data type.
+
+    The typical use for this flag is for vision models taking a bitmap as input,
+    typically with uint8 channels, yet still requiring floating-point inference.
+    For such image models, the uint8 input is quantized, i.e. the uint8 values
+    are interpreted as real numbers, and the quantization parameters used for
+    such input arrays are their `mean_value`, `std_value` parameters.
+
+*   `--default_ranges_min`, `--default_ranges_max`. Type: floating-point. The
+    decimal point character is always the dot (`.`) regardless of the locale.
+    These flags enable what is called "dummy quantization". If defined, their
+    effect is to define fallback (min, max) range values for all arrays that do
+    not have a properly specified (min, max) range in the input file, thus
+    allowing to proceed with quantization of non-quantized or
+    incorrectly-quantized input files. This enables easy performance prototyping
+    ("how fast would my model run if I quantized it?") but should never be used
+    in production as the resulting quantized arithmetic is inaccurate.
+
+*   `--drop_fake_quant`. Type: boolean. Default: false. Causes fake-quantization
+    nodes to be dropped from the graph. This may be used to recover a plain
+    float graph from a fake-quantized graph.
+
+*   `--reorder_across_fake_quant`. Type: boolean. Default: false. Normally,
+    fake-quantization nodes must be strict boundaries for graph transformations,
+    in order to ensure that quantized inference has the exact same arithmetic
+    behavior as quantized training --- which is the whole point of quantized
+    training and of FakeQuant nodes in the first place. However, that entails
+    subtle requirements on where exactly FakeQuant nodes must be placed in the
+    graph. Some quantized graphs have FakeQuant nodes at unexpected locations,
+    that prevent graph transformations that are necessary in order to generate a
+    well-formed quantized representation of these graphs. Such graphs should be
+    fixed, but as a temporary work-around, setting this
+    reorder_across_fake_quant flag allows the converter to perform necessary
+    graph transformaitons on them, at the cost of no longer faithfully matching
+    inference and training arithmetic.
+
+### Logging flags
+
+The following are standard Google logging flags:
+
+*   `--logtostderr` redirects Google logging to standard error, typically making
+    it visible in a terminal.
+*   `--v` sets verbose logging levels (for debugging purposes). Defined levels:
+    *   `--v=1`: log all graph transformations that did make a change on the
+        graph.
+    *   `--v=2`: log all graph transformations that did *not* make a change on
+        the graph.
+
+The following flags allow to generate graph visualizations of the actual graph
+at various points during transformations:
+
+*   `--dump_graphviz=/path` enables dumping of the graphs at various stages of
+    processing as GraphViz `.dot` files. Generally preferred over
+    `--output_format=GRAPHVIZ_DOT` as this allows you to keep your actually
+    relevant `--output_format`.
+*   `--dump_graphviz_video` enables dumping of the graph after every single
+    graph transformation (for debugging purposes).
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..440f9c367c25726e20aa8828e3050cd1dc1b230d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -0,0 +1,62 @@
+# TensorFlow Lite Optimizing Converter (TOCO) Python API reference
+
+## High-level overview
+
+While the TensorFlow Lite Optimizing Converter can be used from the command
+line, it is often convenient to use it as part of Python model build and
+training script. This is so that conversion can be part of your model
+development pipeline. This allows you to know early and often that you are
+designing a model that can be targeted to devices with mobile.
+
+## API
+
+In Python you can run `help(tf.contrib.lite)` to get documentation on functions.
+In particular, `tf.contrib.lite.toco_convert` presents a simple API and
+`tf.contrib.lite.toco_from_protos` allows more detailed control of TOCO using
+the protobuf interface to TOCO.
+
+## Example
+
+In particular, here we show creating a simple model and converting it to a
+TensorFlow Lite Model.
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+out = tf.identity(val, name="out")
+with tf.Session() as sess:
+  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
+  open("test.tflite", "wb").write(tflite_modeL)
+```
+
+**NOTE** Currently, the TOCO command will cause a fatal error to the Python
+interpreter when TOCO conversion fails. This will be remedied as soon as
+possible.
+
+## Example 2: Export with variables
+
+If a model has variables, they need to be turned into constants. This process is
+known as freezing, and it can actually be accomplished with
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+var = tf.get_variable("weights", dtype=tf.float32, shape=(1,64,64,3))
+val = img + var
+
+def canonical_name(x):
+  return x.name.split(":")[0]
+
+out = tf.identity(val, name="out")
+with tf.Session() as sess:
+  sess.run(tf.global_variables_initializer())
+  out_tensors = [out]
+  frozen_graphdef = tf.graph_util.convert_variables_to_constants(
+      sess, sess.graph_def, map(canonical_name, out_tensors))
+  tflite_model = tf.contrib.lite.toco_convert(
+      frozen_graphdef, [img], out_tensors)
+  open("converted_model.tflite", "wb").write(tflite_model)
+```
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3bde9b0169ddfb7fc37657122e2e8eb65ccbdf6d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_expanddims_to_reshape.cc
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertExpandDimsToReshape::Run(Model* model, std::size_t op_index) {
+  auto expand_it = model->operators.begin() + op_index;
+  if (expand_it->get()->type != OperatorType::kExpandDims) {
+    return false;
+  }
+  ExpandDimsOperator* expand_op =
+      static_cast<ExpandDimsOperator*>(expand_it->get());
+  CHECK_EQ(expand_op->inputs.size(), 2);
+  CHECK_EQ(expand_op->outputs.size(), 1);
+
+  const auto& input_array = *model->arrays[expand_op->inputs[0]];
+  if (!input_array.has_shape()) {
+    // Yield until input dims have been resolved.
+    return false;
+  }
+  if (input_array.shape().dimensions_count() == 0) {
+    // Input array cannot be 0-D.
+    // (Unsure if this is TF behavior, but was required to get a test to pass.)
+    return false;
+  }
+
+  const auto& axis_array = *model->arrays[expand_op->inputs[1]];
+  if (!axis_array.has_shape()) {
+    // Yield until input axis array shape has been resolved.
+    return false;
+  }
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1);
+  if (!axis_array.buffer) {
+    // Yield until the input axis array is constant
+    return false;
+  }
+  int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  std::vector<int> reshape_dims(input_array.shape().dims());
+  if (axis < 0) {
+    axis = reshape_dims.size();
+  }
+  reshape_dims.insert(reshape_dims.begin() + axis, 1);
+
+  // The input tensor has shape, and the axis input is constant. We can now
+  // replace ExpandDims with a Reshape.
+  auto* reshape_op = new TensorFlowReshapeOperator;
+
+  // Copy inputs
+  reshape_op->inputs.push_back(expand_op->inputs[0]);
+  reshape_op->outputs = expand_op->outputs;
+
+  // Create a new input array
+  string axis_array_name = expand_op->inputs[1];
+  string shape_array_name = toco::AvailableArrayName(*model, axis_array_name);
+  Array& shape_array = model->GetOrCreateArray(shape_array_name);
+  *(shape_array.mutable_shape()->mutable_dims()) = {
+      1, static_cast<int>(reshape_dims.size())};
+  reshape_op->inputs.push_back(shape_array_name);
+  shape_array.data_type = ArrayDataType::kInt32;
+  auto& shape_buffer = shape_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  shape_buffer.data = reshape_dims;
+
+  // Delete axis array if unused
+  if (IsDiscardableArray(*model, axis_array_name) &&
+      CountOpsWithInput(*model, axis_array_name) == 1 &&
+      !GetOpWithOutput(*model, axis_array_name)) {
+    model->arrays.erase(axis_array_name);
+  }
+
+  // Replace the operator in the graph.
+  const auto reshape_it = model->operators.emplace(expand_it, reshape_op);
+  expand_it = reshape_it + 1;
+  CHECK_EQ(expand_it->get(), expand_op);
+  model->operators.erase(expand_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf454c40c7b50d242d8a7e9eb6b7e579fb0da217
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -0,0 +1,98 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
+  auto conv_it = model->operators.begin() + op_index;
+  if (conv_it->get()->type != OperatorType::kConv) {
+    return false;
+  }
+  const auto* conv_op = static_cast<ConvOperator*>(conv_it->get());
+  if (conv_op->stride_width != conv_op->stride_height) {
+    return false;
+  }
+  auto& weights_array = model->GetArray(conv_op->inputs[1]);
+  if (!weights_array.buffer) {
+    // Yield until the weights are resolved as a constant array.
+    return false;
+  }
+  if (weights_array.data_type != ArrayDataType::kFloat) {
+    return false;
+  }
+  if (weights_array.shape().dims(3) != 1) {
+    // Not a pure convolution: Conv does accumulation across the depth
+    // dimension.
+    return false;
+  }
+  // At this point we know we have a pure conv. Rewrite it as DepthwiseConv.
+  AddMessageF(
+      "%s is purely convolutional (input/weights depth is 1), replacing it by "
+      "a DepthwiseConv.",
+      LogName(*conv_op));
+  auto* depthwiseconv_op = new DepthwiseConvOperator;
+  // Conv and DepthwiseConv take the same inputs
+  depthwiseconv_op->inputs = conv_op->inputs;
+  // Conv may have a 2nd output for im2col
+  depthwiseconv_op->outputs = {conv_op->outputs[0]};
+  if (conv_op->outputs.size() > 1) {
+    // delete the im2col array.
+    model->arrays.erase(conv_op->outputs[1]);
+  }
+  depthwiseconv_op->fused_activation_function =
+      conv_op->fused_activation_function;
+  // Let PropagateFixedSizes recompute fixed padding, just in case some day it
+  // may be different for Conv vs DepthwiseConv.
+  depthwiseconv_op->padding.type = conv_op->padding.type;
+  depthwiseconv_op->stride_height = conv_op->stride_height;
+  depthwiseconv_op->stride_width = conv_op->stride_width;
+  depthwiseconv_op->depth_multiplier = weights_array.shape().dims(0);
+  // Replace the operator in the graph.
+  const auto depthwiseconv_it =
+      model->operators.emplace(conv_it, depthwiseconv_op);
+  conv_it = depthwiseconv_it + 1;
+  CHECK_EQ(conv_it->get(), conv_op);
+  model->operators.erase(conv_it);
+  // Shuffle the weights.
+  const auto& weights_shape = weights_array.shape();
+  auto& weights_buffer =
+      weights_array.GetMutableBuffer<ArrayDataType::kFloat>();
+  const std::vector<float>& conv_weights_data = weights_buffer.data;
+  std::vector<float> depthwise_conv_weights_data(conv_weights_data.size());
+  const int depth = weights_shape.dims(0);
+  const int width = weights_shape.dims(1);
+  const int height = weights_shape.dims(2);
+  const int width_height = width * height;
+  for (int c = 0; c < depth; c++) {
+    for (int xy = 0; xy < width_height; xy++) {
+      depthwise_conv_weights_data[c + depth * xy] =
+          conv_weights_data[xy + width_height * c];
+    }
+  }
+  *weights_array.mutable_shape()->mutable_dims() = {1, width, height, depth};
+  weights_buffer.data = depthwise_conv_weights_data;
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1735b51e5b6ca517bad62bf55f0cc9f0c21ac440
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
@@ -0,0 +1,69 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool CreateIm2colArrays::Run(Model* model, std::size_t op_index) {
+  auto conv_it = model->operators.begin() + op_index;
+  if (conv_it->get()->type != OperatorType::kConv) {
+    return false;
+  }
+  auto* conv_op = static_cast<ConvOperator*>(conv_it->get());
+  if (conv_op->outputs.size() == 2) {
+    // We already have an im2col array
+    return false;
+  }
+  const auto& weights_array = *model->arrays[conv_op->inputs[1]];
+  if (!weights_array.has_shape()) {
+    // We need to yield until weights dims have been resolved, because
+    // from the weights dims we determine whether an im2col array is
+    // needed.
+    return false;
+  }
+  const auto& weights_shape = weights_array.shape();
+  const int kheight = weights_shape.dims(1);
+  const int kwidth = weights_shape.dims(2);
+  if (kwidth == 1 && kheight == 1 && conv_op->stride_width == 1 &&
+      conv_op->stride_height == 1) {
+    // 1x1 unstrided conv does not need an im2col array.
+    return false;
+  }
+
+  // Create the im2col array.
+  CHECK_EQ(conv_op->outputs.size(), 1);
+  const string& im2col_array_name =
+      AvailableArrayName(*model, conv_op->inputs[0] + "_im2col");
+  model->GetOrCreateArray(im2col_array_name);
+  conv_op->outputs.push_back(im2col_array_name);
+  AddMessageF(
+      "Created an im2col array for %s, with %dx%d kernel and stride_width=%d, "
+      "stride_height=%d",
+      LogName(*conv_op), kwidth, kheight, conv_op->stride_width,
+      conv_op->stride_height);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b89e3f5310cd7364294ad875cfcdf9c14660366b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/dequantize.cc
@@ -0,0 +1,223 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+template <ArrayDataType A>
+void DequantizeBuffer(Array* array) {
+  const auto old_data = array->GetBuffer<A>().data;
+  array->buffer = nullptr;
+  array->data_type = ArrayDataType::kFloat;
+  auto& new_data = array->GetMutableBuffer<ArrayDataType::kFloat>().data;
+  new_data.resize(old_data.size());
+  const auto& qparams = array->GetQuantizationParams();
+  for (int i = 0; i < old_data.size(); i++) {
+    new_data[i] = qparams.scale * (old_data[i] - qparams.zero_point);
+  }
+}
+
+std::vector<std::unique_ptr<Operator>>::iterator FindFirstOpWithInput(
+    Model* model, const string& array_name) {
+  for (auto it = model->operators.begin(); it != model->operators.end(); ++it) {
+    for (const auto& input : it->get()->inputs) {
+      if (input == array_name) {
+        return it;
+      }
+    }
+  }
+  return model->operators.end();
+}
+
+void ClearArrayQuantizationParams(const string& array_name, Model* model) {
+  auto* array = model->arrays.at(array_name).get();
+  CHECK(array->quantization_params);
+  for (auto& input_array : *model->flags.mutable_input_arrays()) {
+    if (input_array.name() == array_name) {
+      auto& qparams = *array->quantization_params;
+      const double new_std_value = 1. / qparams.scale;
+      const double new_mean_value = qparams.zero_point;
+      if (input_array.has_std_value()) {
+        CHECK_LE(std::abs(new_std_value - input_array.std_value()), 0.001);
+      } else {
+        input_array.set_std_value(new_std_value);
+      }
+      if (input_array.has_mean_value()) {
+        CHECK_LE(std::abs(new_mean_value - input_array.mean_value()), 0.001);
+      } else {
+        input_array.set_mean_value(new_mean_value);
+      }
+    }
+  }
+  array->quantization_params = nullptr;
+}
+
+bool DequantizeArray(const string& array_name,
+                     GraphTransformation* transformation, Model* model) {
+  auto* array = model->arrays.at(array_name).get();
+  if (!array->quantization_params) {
+    return false;
+  }
+  transformation->AddMessageF("Dequantizing array: %s", array_name);
+
+  // Dequantize any buffer
+  if (array->buffer) {
+    if (array->data_type == ArrayDataType::kUint8) {
+      DequantizeBuffer<ArrayDataType::kUint8>(array);
+    } else if (array->data_type == ArrayDataType::kInt32) {
+      DequantizeBuffer<ArrayDataType::kInt32>(array);
+    } else {
+      LOG(FATAL) << "Unhandled data type";
+    }
+    CHECK(array->data_type == ArrayDataType::kFloat);
+    CHECK(array->buffer->type == ArrayDataType::kFloat);
+
+    // Clear quantization params, officially makes this a non-quantized array.
+    ClearArrayQuantizationParams(array_name, model);
+    return true;
+  } else {
+    array->data_type = ArrayDataType::kFloat;
+  }
+
+  // Clear quantization params, officially makes this a non-quantized array.
+  ClearArrayQuantizationParams(array_name, model);
+
+  if (array->buffer) {
+    return true;
+  }
+
+  auto* op_outputting_array = GetOpWithOutput(*model, array_name);
+  if (op_outputting_array) {
+    if (op_outputting_array->type == OperatorType::kTensorFlowReshape) {
+      return true;
+    }
+  }
+
+  // If there was no minmax info, we can return now. Indeed,
+  // the below only serves to create a FakeQuant node, but some arrays are
+  // quantized without MinMax (see the CHECK above) and that corresponds to
+  // places where a FakeQuant node is actually not wanted, because the
+  // quantization params are meant to be inferred in another way (e.g. bias
+  // vector for a Conv op, see their special-casing in quantize.cc).
+  if (!array->minmax) {
+    return true;
+  }
+
+  // Determine whether to insert a FakeQuant before or after
+  // this array.
+  bool must_insert_fakequant_before = false;
+  bool must_insert_fakequant_after = false;
+  if (IsInputArray(*model, array_name)) {
+    must_insert_fakequant_after = true;
+  }
+  for (const string& output_array : model->flags.output_arrays()) {
+    if (array_name == output_array) {
+      must_insert_fakequant_before = true;
+    }
+  }
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (array_name == rnn_state.state_array()) {
+      must_insert_fakequant_after = true;
+    }
+    if (array_name == rnn_state.back_edge_source_array()) {
+      must_insert_fakequant_before = true;
+    }
+  }
+  CHECK(!(must_insert_fakequant_before && must_insert_fakequant_after));
+
+  // Create and insert the FakeQuant node
+  auto* fakequant_op = new FakeQuantOperator;
+  model->operators.emplace(FindFirstOpWithInput(model, array_name),
+                           fakequant_op);
+  const string& new_array_name = AvailableArrayName(*model, array_name);
+  auto& new_array = model->GetOrCreateArray(new_array_name);
+  new_array.data_type = ArrayDataType::kFloat;
+  new_array.copy_shape(array->shape());
+  new_array.GetOrCreateMinMax() = array->GetMinMax();
+  fakequant_op->minmax.reset(new MinMax);
+  *fakequant_op->minmax = array->GetMinMax();
+  if (must_insert_fakequant_before) {
+    for (const auto& op : model->operators) {
+      for (string& output : op->outputs) {
+        if (output == array_name) {
+          output = new_array_name;
+        }
+      }
+    }
+    fakequant_op->inputs = {new_array_name};
+    fakequant_op->outputs = {array_name};
+  } else {
+    for (const auto& op : model->operators) {
+      for (string& input : op->inputs) {
+        if (input == array_name) {
+          input = new_array_name;
+        }
+      }
+    }
+    fakequant_op->inputs = {array_name};
+    fakequant_op->outputs = {new_array_name};
+  }
+  return true;
+}
+
+}  // namespace
+
+bool Dequantize::Run(Model* model, std::size_t op_index) {
+  const auto op_it = model->operators.begin() + op_index;
+  auto* op = op_it->get();
+
+  if (op->type == OperatorType::kDequantize) {
+    auto& input_array = model->GetArray(op->inputs[0]);
+    if (input_array.data_type == ArrayDataType::kFloat) {
+      return false;
+    }
+    if (input_array.final_data_type != ArrayDataType::kFloat) {
+      return false;
+    }
+    input_array.data_type = ArrayDataType::kFloat;
+    input_array.quantization_params = nullptr;
+    auto& output_array = model->GetArray(op->outputs[0]);
+    output_array.data_type = ArrayDataType::kFloat;
+    output_array.quantization_params = nullptr;
+    return RemoveTrivialPassthroughOp(this, model, op_index);
+  }
+
+  std::vector<string> arrays;
+  for (const string& input : op->inputs) {
+    arrays.push_back(input);
+  }
+  for (const string& output : op->outputs) {
+    arrays.push_back(output);
+  }
+  bool changed = false;
+  for (const string& array : arrays) {
+    changed |= DequantizeArray(array, this, model);
+  }
+
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fea360740f4e645e1f00eaed42cbff48f430fe2a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/drop_fake_quant.cc
@@ -0,0 +1,56 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool DropFakeQuant::Run(Model* model, std::size_t op_index) {
+  const auto fakequant_it = model->operators.begin() + op_index;
+  auto* fakequant_base_op = fakequant_it->get();
+  if (fakequant_base_op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(fakequant_base_op);
+
+  if (!fakequant_op->minmax) {
+    return false;
+  }
+
+  const auto& output_array = model->GetArray(fakequant_op->outputs[0]);
+  if (!output_array.minmax) {
+    return false;
+  }
+
+  // Drop min/max inputs
+  for (int i = 1; i < fakequant_op->inputs.size(); i++) {
+    if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
+      model->arrays.erase(fakequant_op->inputs[i]);
+    }
+  }
+  fakequant_op->inputs.resize(1);
+
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3ed6663bcc80c5fc642a399b1e5c0cf3336973a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/drop_im2col_arrays.cc
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool DropIm2colArrays::Run(Model* model, std::size_t op_index) {
+  auto conv_it = model->operators.begin() + op_index;
+  if (conv_it->get()->type != OperatorType::kConv) {
+    return false;
+  }
+  auto* conv_op = static_cast<ConvOperator*>(conv_it->get());
+  if (conv_op->outputs.size() < 2) {
+    // Conv op does not have im2col.
+    return false;
+  }
+
+  // Drop the im2col array.
+  CHECK_EQ(conv_op->outputs.size(), 2);
+  model->arrays.erase(conv_op->outputs[1]);
+  conv_op->outputs.resize(1);
+  AddMessageF("Dropped an im2col array for %s", LogName(*conv_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..badefeca883b1e1d67f7de5276389c5e6e7f7cd3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_bias_vectors.cc
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool ProcessLinearOperator(Model* model, Operator* op) {
+  if (op->inputs.size() >= 3) {
+    return false;
+  }
+  const string& output_name = op->outputs[0];
+  const string& bias_name = AvailableArrayName(*model, output_name + "_bias");
+  op->inputs.push_back(bias_name);
+  DCHECK_EQ(op->inputs.size(), 3);
+  auto& bias_array = model->GetOrCreateArray(bias_name);
+  bias_array.data_type = ArrayDataType::kFloat;
+
+  return true;
+}
+}  // namespace
+
+bool EnsureBiasVectors::Run(Model* model, std::size_t op_index) {
+  auto* op = model->operators[op_index].get();
+  if (op->type == OperatorType::kConv ||
+      op->type == OperatorType::kDepthwiseConv ||
+      op->type == OperatorType::kFullyConnected) {
+    if (ProcessLinearOperator(model, op)) {
+      AddMessageF("Added bias vector to %s", LogName(*op));
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d129b5ecf2615434b8ff8387a04af9561fe617a4
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_activation_functions.cc
@@ -0,0 +1,99 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool FuseActivationFunctions::Run(Model* model, std::size_t op_index) {
+  const auto ac_it = model->operators.begin() + op_index;
+  const auto* ac_op = ac_it->get();
+
+  if (ac_op->type != OperatorType::kRelu6 &&
+      ac_op->type != OperatorType::kRelu1 &&
+      ac_op->type != OperatorType::kRelu) {
+    return false;
+  }
+
+  // Find the op producing the array passed to this activation function
+  Operator* op = GetOpWithOutput(*model, ac_op->inputs[0]);
+
+  if (!op) return false;
+
+  if (CountTrueOutputs(*model, *op) > 1) {
+    AddMessageF(
+        "Not fusing activation function into %s because it has more than one "
+        " consumed output",
+        LogName(*op));
+    return false;
+  }
+
+  CHECK_EQ(op->outputs[0], ac_op->inputs[0]);
+
+  int count_ops_consuming_output = CountOpsWithInput(*model, ac_op->inputs[0]);
+  DCHECK_GE(count_ops_consuming_output, 1);
+  if (count_ops_consuming_output > 1) {
+    AddMessageF(
+        "Not fusing activation function into %s because it is consumed by more "
+        "than 1 other operator",
+        LogName(*op));
+    return false;
+  }
+
+  if (op->fused_activation_function != FusedActivationFunctionType::kNone) {
+    AddMessageF(
+        "Not fusing activation function into %s because it already has a fused "
+        "activation function",
+        LogName(*op));
+    return false;
+  }
+
+  // TODO(dkalenichenko): Great many ops don't support activation function
+  // fusing. Switch to the whilelist approach instead.
+  if (op->type == OperatorType::kConcatenation ||
+      op->type == OperatorType::kSlice ||
+      op->type == OperatorType::kTensorFlowSplit) {
+    AddMessageF(
+        "Not fusing activation function because the %s op doesn't support it",
+        LogName(*op));
+    return false;
+  }
+
+  AddMessageF("Fusing activation function %s into the preceding %s",
+              LogName(*ac_op), LogName(*op));
+  if (ac_op->type == OperatorType::kRelu6) {
+    op->fused_activation_function = FusedActivationFunctionType::kRelu6;
+  } else if (ac_op->type == OperatorType::kRelu1) {
+    op->fused_activation_function = FusedActivationFunctionType::kRelu1;
+  } else if (ac_op->type == OperatorType::kRelu) {
+    op->fused_activation_function = FusedActivationFunctionType::kRelu;
+  } else {
+    LOG(FATAL) << "Unhandled activation function type";
+  }
+  model->arrays.erase(ac_op->inputs[0]);
+  op->outputs[0] = ac_op->outputs[0];
+  model->operators.erase(ac_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4619d8bbee2e52483a523277f421de5bfa155635
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -0,0 +1,300 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void FuseAddOrSubParamsIntoFollowingAffine(Model* model, Operator* following_op,
+                                           const Operator* add_or_sub_op,
+                                           int index_of_constant_input) {
+  CHECK(add_or_sub_op->type == OperatorType::kAdd ||
+        add_or_sub_op->type == OperatorType::kSub);
+  CHECK(index_of_constant_input == 0 || index_of_constant_input == 1);
+  // If the op is a subtraction, the constant input should be the right hand
+  // side.
+  // This should have been checked before this point.
+  CHECK(add_or_sub_op->type != OperatorType::kSub ||
+        index_of_constant_input == 1);
+  if (following_op->inputs.size() < 3) {
+    LOG(FATAL) << "Missing bias parameter";
+  }
+  const auto& weights = model->GetArray(following_op->inputs[1]);
+  auto& bias = model->GetArray(following_op->inputs[2]);
+  bias.minmax = nullptr;
+  const auto& operand =
+      model->GetArray(add_or_sub_op->inputs[index_of_constant_input]);
+  // We're only supporting the case of a scalar operand. Should have
+  // been checked earlier.
+  CHECK_EQ(RequiredBufferSizeForShape(operand.shape()), 1);
+
+  const float scalar_operand =
+      operand.GetBuffer<ArrayDataType::kFloat>().data[0];
+  // At this point we reduce the case of subtraction to that of addition
+  // by negating the operand.
+  float add_scalar_operand = 0.f;
+  if (add_or_sub_op->type == OperatorType::kAdd) {
+    add_scalar_operand = scalar_operand;
+  } else if (add_or_sub_op->type == OperatorType::kSub &&
+             index_of_constant_input == 1) {
+    add_scalar_operand = -scalar_operand;
+  } else {
+    LOG(FATAL) << "Should not get here";
+  }
+  // From here on we are fusing an addition. add_or_sub_op->type does not
+  // matter anymore.
+
+  const Shape& weights_shape = weights.shape();
+  const Shape& bias_shape = bias.shape();
+  const auto& weights_buffer = weights.GetBuffer<ArrayDataType::kFloat>();
+  const float* const weights_data = weights_buffer.data.data();
+  auto& bias_buffer = bias.GetMutableBuffer<ArrayDataType::kFloat>();
+  float* const bias_data = bias_buffer.data.data();
+
+  if (following_op->type == OperatorType::kConv ||
+      following_op->type == OperatorType::kFullyConnected) {
+    const int output_depth = weights_shape.dims(0);
+    // TODO(b/62904716): Bias array should become 1-D when padding removed.
+    CHECK_EQ(output_depth, bias_shape.dims(bias_shape.dimensions_count() - 1));
+    const int weights_size = RequiredBufferSizeForShape(weights_shape);
+    const int weights_per_depth = weights_size / output_depth;
+    CHECK_EQ(weights_size, weights_per_depth * output_depth);
+
+    for (int d = 0; d < output_depth; d++) {
+      float accumulation = 0;
+      for (int i = 0; i < weights_per_depth; i++) {
+        accumulation +=
+            add_scalar_operand * weights_data[d * weights_per_depth + i];
+      }
+      bias_data[d] += accumulation;
+    }
+  } else if (following_op->type == OperatorType::kDepthwiseConv) {
+    const int output_depth =
+        weights_shape.dims(weights_shape.dimensions_count() - 1);
+    const int weights_size = RequiredBufferSizeForShape(weights_shape);
+    const int weights_per_depth = weights_size / output_depth;
+    CHECK_EQ(weights_size, weights_per_depth * output_depth);
+
+    for (int c = 0; c < output_depth; c++) {
+      float accumulation = 0;
+      for (int k = 0; k < weights_per_depth; k++) {
+        accumulation += add_scalar_operand * weights_data[k * output_depth + c];
+      }
+      bias_data[c] += accumulation;
+    }
+  } else {
+    LOG(FATAL) << "Should not get here.";
+  }
+}
+
+void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
+                                           const Operator* mul_or_div_op,
+                                           int index_of_constant_input) {
+  CHECK(mul_or_div_op->type == OperatorType::kMul ||
+        mul_or_div_op->type == OperatorType::kDiv);
+  CHECK(index_of_constant_input == 0 || index_of_constant_input == 1);
+  // If the op is a division, the constant input should be the right hand side.
+  // This should have been checked before this point.
+  CHECK(mul_or_div_op->type != OperatorType::kDiv ||
+        index_of_constant_input == 1);
+  const auto& weights_name = following_op->inputs[1];
+  const auto& bias_name = following_op->inputs[2];
+  auto& weights = model->GetArray(weights_name);
+  DropMinMax(model, weights_name);
+  DropMinMax(model, bias_name);
+  const auto& operand =
+      model->GetArray(mul_or_div_op->inputs[index_of_constant_input]);
+  // We're only supporting the case of a scalar operand. Should have
+  // been checked earlier.
+  CHECK_EQ(RequiredBufferSizeForShape(operand.shape()), 1);
+
+  const float scalar_operand =
+      operand.GetBuffer<ArrayDataType::kFloat>().data[0];
+
+  float* weights_data =
+      weights.GetMutableBuffer<ArrayDataType::kFloat>().data.data();
+  const int weights_size = RequiredBufferSizeForShape(weights.shape());
+  for (int i = 0; i < weights_size; i++) {
+    if (mul_or_div_op->type == OperatorType::kMul) {
+      weights_data[i] *= scalar_operand;
+    } else if (mul_or_div_op->type == OperatorType::kDiv) {
+      weights_data[i] /= scalar_operand;
+    } else {
+      LOG(FATAL) << "Should not get here";
+    }
+  }
+}
+
+}  // namespace
+
+bool FuseBinaryIntoFollowingAffine::Run(Model* model, std::size_t op_index) {
+  const auto binary_it = model->operators.begin() + op_index;
+  auto* binary_op = binary_it->get();
+  if (binary_op->type != OperatorType::kAdd &&
+      binary_op->type != OperatorType::kMul &&
+      binary_op->type != OperatorType::kSub &&
+      binary_op->type != OperatorType::kDiv) {
+    return false;
+  }
+
+  CHECK_EQ(binary_op->inputs.size(), 2);
+
+  // We only can fuse an binary when the two operands break down as follows:
+  //   1. One operand is the (variable) output of a typical affine (linear plus
+  //   bias)
+  //      op of a finite list of possible types: at the moment Conv,
+  //      DepthwiseConv and
+  //      FullyConnected are supported.
+  //   2. The other operand is a constant param array.
+  const bool is_input_constant[2] = {
+      IsConstantParameterArray(*model, binary_op->inputs[0]),
+      IsConstantParameterArray(*model, binary_op->inputs[1]),
+  };
+  if (!is_input_constant[0] && !is_input_constant[1]) {
+    // Neither input is constant, so nothing we can fuse into a constant.
+    return false;
+  }
+  if (is_input_constant[0] && is_input_constant[1]) {
+    // Both inputs are constants. That's a job for constants
+    // propagation, not for us to handle here.
+    return false;
+  }
+  const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
+  const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
+  CHECK(is_input_constant[index_of_constant_input]);
+  CHECK(!is_input_constant[index_of_variable_input]);
+
+  // For division, we can only fuse if the denominator is constant.
+  if (binary_op->type == OperatorType::kDiv) {
+    if (index_of_constant_input != 1) {
+      AddMessageF("Not fusing %s because the denominator is not constant",
+                  LogName(*binary_op));
+      return false;
+    }
+  }
+
+  const auto& operand_shape =
+      model->GetArray(binary_op->inputs[index_of_constant_input]).shape();
+  for (const auto& dim : operand_shape.dims()) {
+    if (dim > 1) {
+      AddMessageF(
+          "Not fusing %s into the following affine op, because we only know "
+          "how to do so when the constant operand is a scalar",
+          LogName(*binary_op));
+      return false;
+    }
+  }
+
+  if (binary_op->fused_activation_function !=
+      FusedActivationFunctionType::kNone) {
+    AddMessageF("Not fusing %s because it has a fused activation function",
+                LogName(*binary_op));
+    return false;
+  }
+
+  Operator* following_op = GetOpWithInput(*model, binary_op->outputs[0]);
+
+  if (!following_op) {
+    AddMessageF(
+        "Not fusing %s because it is not consumed by exactly one other op",
+        LogName(*binary_op));
+    return false;
+  }
+
+  if (following_op->type != OperatorType::kConv &&
+      following_op->type != OperatorType::kFullyConnected &&
+      following_op->type != OperatorType::kDepthwiseConv) {
+    AddMessageF(
+        "Not fusing %s because the following %s is not of one of the supported "
+        "types",
+        LogName(*binary_op), LogName(*following_op));
+    return false;
+  }
+
+  if (following_op->inputs.size() < 3) {
+    AddMessageF(
+        "Not fusing %s because the following %s does not have a bias vector",
+        LogName(*following_op), LogName(*binary_op));
+    return false;
+  }
+
+  const auto& weights = model->GetArray(following_op->inputs[1]);
+  const auto& bias = model->GetArray(following_op->inputs[2]);
+  if (!weights.buffer || !bias.buffer) {
+    AddMessageF(
+        "Not fusing %s because the following %s has non-constant weights or "
+        "bias arrays",
+        LogName(*binary_op), LogName(*following_op));
+    return false;
+  }
+
+  // Try to fuse the binary params into the following op's params
+  if (binary_op->type == OperatorType::kAdd ||
+      binary_op->type == OperatorType::kSub) {
+    if (following_op->type == OperatorType::kConv) {
+      if (static_cast<ConvOperator*>(following_op)->padding.type !=
+          PaddingType::kValid) {
+        AddMessageF(
+            "Not fusing %s because the following %s does not use VALID padding",
+            LogName(*binary_op), LogName(*following_op));
+        return false;
+      }
+    }
+    if (following_op->type == OperatorType::kDepthwiseConv) {
+      if (static_cast<DepthwiseConvOperator*>(following_op)->padding.type !=
+          PaddingType::kValid) {
+        AddMessageF(
+            "Not fusing %s because the following %s does not use VALID padding",
+            LogName(*binary_op), LogName(*following_op));
+        return false;
+      }
+    }
+    FuseAddOrSubParamsIntoFollowingAffine(model, following_op, binary_op,
+                                          index_of_constant_input);
+  } else if (binary_op->type == OperatorType::kMul ||
+             binary_op->type == OperatorType::kDiv) {
+    FuseMulOrDivParamsIntoFollowingAffine(model, following_op, binary_op,
+                                          index_of_constant_input);
+  } else {
+    LOG(FATAL) << "should not get here";
+  }
+
+  AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
+              LogName(*following_op));
+
+  model->arrays.erase(binary_op->outputs[0]);
+  following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
+  const auto& old_constant_param_name =
+      binary_op->inputs[index_of_constant_input];
+  CHECK(IsConstantParameterArray(*model, old_constant_param_name));
+  if (CountOpsWithInput(*model, old_constant_param_name) == 1) {
+    model->arrays.erase(old_constant_param_name);
+  }
+  model->operators.erase(binary_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8948653ec38f5a5a6e92cfe9e6bafdbf1aa9a962
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/fuse_binary_into_preceding_affine.cc
@@ -0,0 +1,326 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void FuseAddOrSubParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
+                                           const Operator* add_or_sub_op,
+                                           int index_of_constant_input) {
+  CHECK(add_or_sub_op->type == OperatorType::kAdd ||
+        add_or_sub_op->type == OperatorType::kSub);
+  CHECK(index_of_constant_input == 0 || index_of_constant_input == 1);
+  if (preceding_op->inputs.size() < 3) {
+    LOG(FATAL) << "Missing bias parameter";
+  }
+  auto& bias = model->GetArray(preceding_op->inputs[2]);
+  bias.minmax = nullptr;
+  const auto& operand =
+      model->GetArray(add_or_sub_op->inputs[index_of_constant_input]);
+
+  const Shape& bias_shape = bias.shape();
+  const Shape& operand_shape = operand.shape();
+  auto& bias_buffer = bias.GetMutableBuffer<ArrayDataType::kFloat>();
+  float* const bias_data = bias_buffer.data.data();
+  const auto& operand_buffer = operand.GetBuffer<ArrayDataType::kFloat>();
+  const float* const operand_data = operand_buffer.data.data();
+
+  // TODO(b/62904716): Bias array should become 1-D when padding removed.
+  const int depth = bias_shape.dims(bias_shape.dimensions_count() - 1);
+  CHECK_EQ(depth, operand_shape.dims(operand_shape.dimensions_count() - 1));
+
+  enum class OpType { BiasPlusOperand, BiasMinusOperand, OperandMinusBias };
+
+  const OpType optype = (add_or_sub_op->type == OperatorType::kAdd)
+                            ? OpType::BiasPlusOperand
+                            : (index_of_constant_input == 1)
+                                  ? OpType::BiasMinusOperand
+                                  : OpType::OperandMinusBias;
+
+  for (int i = 0; i < depth; i++) {
+    float& bias_val = bias_data[i];
+    const float operand_val = operand_data[i];
+    if (optype == OpType::BiasPlusOperand) {
+      bias_val += operand_val;
+    } else if (optype == OpType::BiasMinusOperand) {
+      bias_val -= operand_val;
+    } else if (optype == OpType::OperandMinusBias) {
+      bias_val = operand_val - bias_val;
+    } else {
+      LOG(FATAL) << "Should not get here.";
+    }
+  }
+}
+
+void FuseMulOrDivParamsIntoPrecedingAffine(Model* model, Operator* preceding_op,
+                                           const Operator* mul_or_div_op,
+                                           int index_of_constant_input) {
+  CHECK(mul_or_div_op->type == OperatorType::kMul ||
+        mul_or_div_op->type == OperatorType::kDiv);
+  CHECK(index_of_constant_input == 0 || index_of_constant_input == 1);
+  // If the op is a division, the constant input should be the right hand side.
+  // This should have been checked before this point.
+  CHECK(mul_or_div_op->type != OperatorType::kDiv ||
+        index_of_constant_input == 1);
+  if (preceding_op->inputs.size() < 3) {
+    LOG(FATAL) << "Missing bias parameter";
+  }
+  const auto& weights_name = preceding_op->inputs[1];
+  const auto& bias_name = preceding_op->inputs[2];
+  auto& weights = model->GetArray(weights_name);
+  DropMinMax(model, weights_name);
+  auto& bias = model->GetArray(bias_name);
+  DropMinMax(model, bias_name);
+  const auto& operand =
+      model->GetArray(mul_or_div_op->inputs[index_of_constant_input]);
+
+  const Shape& weights_shape = weights.shape();
+  const Shape& bias_shape = bias.shape();
+  const Shape& operand_shape = operand.shape();
+  auto& weights_buffer = weights.GetMutableBuffer<ArrayDataType::kFloat>();
+  float* const weights_data = weights_buffer.data.data();
+  auto& bias_buffer = bias.GetMutableBuffer<ArrayDataType::kFloat>();
+  float* const bias_data = bias_buffer.data.data();
+  const auto& operand_buffer = operand.GetBuffer<ArrayDataType::kFloat>();
+  const float* const operand_data = operand_buffer.data.data();
+
+  // We support broadcasting the operand along the depth dimension,
+  // when the operand's depth is 1.
+  int operand_channel_increment = 0;
+  if (operand_shape.dimensions_count() >= 1 &&
+      operand_shape.dims(operand_shape.dimensions_count() - 1) ==
+          bias_shape.dims(bias_shape.dimensions_count() - 1)) {
+    operand_channel_increment = 1;
+  } else if (operand_shape.dimensions_count() == 0 ||
+             operand_shape.dims(operand_shape.dimensions_count() - 1) == 1) {
+    operand_channel_increment = 0;
+  } else {
+    LOG(FATAL) << "Operand shape mismatch.";
+  }
+
+  int output_depth;
+
+  if (preceding_op->type == OperatorType::kConv ||
+      preceding_op->type == OperatorType::kFullyConnected) {
+    output_depth = weights_shape.dims(0);
+  } else if (preceding_op->type == OperatorType::kDepthwiseConv) {
+    output_depth = weights_shape.dims(weights_shape.dimensions_count() - 1);
+  } else {
+    LOG(FATAL) << "Should not get here";
+  }
+
+  const int weights_size = RequiredBufferSizeForShape(weights_shape);
+  const int weights_per_depth = weights_size / output_depth;
+  CHECK_EQ(weights_size, weights_per_depth * output_depth);
+
+  int operand_channel = 0;
+  for (int c = 0; c < output_depth; c++) {
+    if (mul_or_div_op->type == OperatorType::kMul) {
+      bias_data[c] *= operand_data[operand_channel];
+    } else if (mul_or_div_op->type == OperatorType::kDiv) {
+      bias_data[c] /= operand_data[operand_channel];
+    } else {
+      LOG(FATAL) << "Should not get here";
+    }
+    if (preceding_op->type == OperatorType::kConv ||
+        preceding_op->type == OperatorType::kFullyConnected) {
+      for (int i = 0; i < weights_per_depth; i++) {
+        if (mul_or_div_op->type == OperatorType::kMul) {
+          weights_data[c * weights_per_depth + i] *=
+              operand_data[operand_channel];
+        } else if (mul_or_div_op->type == OperatorType::kDiv) {
+          weights_data[c * weights_per_depth + i] /=
+              operand_data[operand_channel];
+        } else {
+          LOG(FATAL) << "Should not get here";
+        }
+      }
+    } else if (preceding_op->type == OperatorType::kDepthwiseConv) {
+      for (int k = 0; k < weights_per_depth; k++) {
+        if (mul_or_div_op->type == OperatorType::kMul) {
+          weights_data[k * output_depth + c] *= operand_data[operand_channel];
+        } else if (mul_or_div_op->type == OperatorType::kDiv) {
+          weights_data[k * output_depth + c] /= operand_data[operand_channel];
+        } else {
+          LOG(FATAL) << "Should not get here";
+        }
+      }
+    } else {
+      LOG(FATAL) << "Should not get here";
+    }
+    operand_channel += operand_channel_increment;
+  }
+}
+}  // namespace
+
+bool FuseBinaryIntoPrecedingAffine::Run(Model* model, std::size_t op_index) {
+  const auto binary_it = model->operators.begin() + op_index;
+  const auto* binary_op = binary_it->get();
+  if (binary_op->type != OperatorType::kAdd &&
+      binary_op->type != OperatorType::kMul &&
+      binary_op->type != OperatorType::kSub &&
+      binary_op->type != OperatorType::kDiv) {
+    return false;
+  }
+
+  CHECK_EQ(binary_op->inputs.size(), 2);
+
+  // We only can fuse an binary when the two operands break down as follows:
+  //   1. One operand is the (variable) output of a typical affine (linear plus
+  //   bias)
+  //      op of a finite list of possible types: at the moment Conv,
+  //      DepthwiseConv and
+  //      FullyConnected are supported.
+  //   2. The other operand is a constant param array.
+  const bool is_input_constant[2] = {
+      IsConstantParameterArray(*model, binary_op->inputs[0]),
+      IsConstantParameterArray(*model, binary_op->inputs[1]),
+  };
+  if (!is_input_constant[0] && !is_input_constant[1]) {
+    // Neither input is constant, so nothing we can fuse into a constant.
+    return false;
+  }
+  if (is_input_constant[0] && is_input_constant[1]) {
+    // Both inputs are constants. That's a job for constants
+    // propagation, not for us to handle here.
+    return false;
+  }
+  const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
+  const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
+  CHECK(is_input_constant[index_of_constant_input]);
+  CHECK(!is_input_constant[index_of_variable_input]);
+
+  // For division, we can only fuse if the denominator is constant.
+  if (binary_op->type == OperatorType::kDiv) {
+    if (index_of_constant_input != 1) {
+      AddMessageF("Not fusing %s because the denominator is not constant",
+                  LogName(*binary_op));
+      return false;
+    }
+  }
+
+  Operator* preceding_op =
+      GetOpWithOutput(*model, binary_op->inputs[index_of_variable_input]);
+  if (!preceding_op) {
+    AddMessageF("Not fusing %s because it is not the output of another op",
+                LogName(*binary_op));
+    return false;
+  }
+
+  for (const string& output_array : model->flags.output_arrays()) {
+    if (preceding_op->outputs[0] == output_array) {
+      return false;
+    }
+  }
+
+  if (preceding_op->type != OperatorType::kConv &&
+      preceding_op->type != OperatorType::kFullyConnected &&
+      preceding_op->type != OperatorType::kDepthwiseConv) {
+    AddMessageF(
+        "Not fusing %s because the preceding %s is not of one of the supported "
+        "types",
+        LogName(*binary_op), LogName(*preceding_op));
+    return false;
+  }
+
+  if (preceding_op->fused_activation_function !=
+      FusedActivationFunctionType::kNone) {
+    AddMessageF(
+        "Not fusing %s because the preceding %s has a fused activation "
+        "function",
+        LogName(*binary_op), LogName(*preceding_op));
+    return false;
+  }
+
+  if (preceding_op->inputs.size() < 3) {
+    AddMessageF(
+        "Not fusing %s because the preceding %s does not have a bias vector",
+        LogName(*binary_op), LogName(*preceding_op));
+    return false;
+  }
+
+  const auto& weights = model->GetArray(preceding_op->inputs[1]);
+  const auto& bias = model->GetArray(preceding_op->inputs[2]);
+  if (binary_op->type == OperatorType::kAdd ||
+      binary_op->type == OperatorType::kSub) {
+    if (!bias.buffer) {
+      AddMessageF(
+          "Not fusing %s because the preceding %s has a non-constant bias "
+          "array",
+          LogName(*binary_op), LogName(*preceding_op));
+      return false;
+    }
+  } else {
+    if (!weights.buffer || !bias.buffer) {
+      AddMessageF(
+          "Not fusing %s because the preceding %s has non-constant weights or "
+          "bias arrays",
+          LogName(*binary_op), LogName(*preceding_op));
+      return false;
+    }
+  }
+
+  int count_ops_consuming_output =
+      CountOpsWithInput(*model, preceding_op->outputs[0]);
+  DCHECK_GE(count_ops_consuming_output, 1);
+  if (count_ops_consuming_output > 1) {
+    AddMessageF(
+        "Not fusing %s because the output of the preceding %s is consumed by "
+        "another op",
+        LogName(*binary_op), LogName(*preceding_op));
+    return false;
+  }
+
+  AddMessageF("Fusing %s into the preceding %s", LogName(*binary_op),
+              LogName(*preceding_op));
+
+  if (binary_op->type == OperatorType::kAdd ||
+      binary_op->type == OperatorType::kSub) {
+    FuseAddOrSubParamsIntoPrecedingAffine(model, preceding_op, binary_op,
+                                          index_of_constant_input);
+  } else if (binary_op->type == OperatorType::kMul ||
+             binary_op->type == OperatorType::kDiv) {
+    FuseMulOrDivParamsIntoPrecedingAffine(model, preceding_op, binary_op,
+                                          index_of_constant_input);
+  } else {
+    LOG(FATAL) << "should not get here";
+  }
+
+  model->arrays.erase(preceding_op->outputs[0]);
+  preceding_op->outputs[0] = binary_op->outputs[0];
+  preceding_op->fused_activation_function =
+      binary_op->fused_activation_function;
+  const auto& old_constant_param_name =
+      binary_op->inputs[index_of_constant_input];
+  CHECK(IsConstantParameterArray(*model, old_constant_param_name));
+  if (CountOpsWithInput(*model, old_constant_param_name) == 1) {
+    model->arrays.erase(old_constant_param_name);
+  }
+  model->operators.erase(binary_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a7611a6683206eb3a9f6779668158292274a7fe
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.cc
@@ -0,0 +1,200 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void PrintModelStats(const string& label, const Model& model) {
+  int quantized_arrays = 0;
+  for (const auto& array : model.arrays) {
+    if (array.second->quantization_params) {
+      quantized_arrays++;
+    }
+  }
+  LOG(INFO) << label << ": " << model.operators.size() << " operators, "
+            << model.arrays.size() << " arrays (" << quantized_arrays
+            << " quantized)";
+}
+
+// Some graphs have RNN back-edges that are discardable, having been
+// created typically by TensorFlow import rather than specified by the user.
+// Such graphs might have cycles (closed by RNN back-edges) that may be pruned.
+// Local graph transformations can't identify such global features,
+// so this function performs this global transformation.
+//
+// The other (and related) thing that is peculiar about RNN back-edges
+// is that they do not prevent the arrays that they touch, from being
+// pruned. Thus, they may refer to array names which no longer exist.
+// The intent is for that to result in the eventual pruning of such
+// 'dangling' RNN back-edges. We perform this pruning at the end of this
+// function, as the pruning of connected components done here may leave
+// more RNN back-edges dangling.
+void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
+  // Identify the set of arrays that are in 'useful' connected components
+  // of the graph, which means connected to output arrays.
+  std::unordered_set<string> useful_arrays;
+  for (const string& output_array : model->flags.output_arrays()) {
+    useful_arrays.insert(output_array);
+  }
+  bool found_new_useful_arrays;
+  do {
+    found_new_useful_arrays = false;
+    for (const auto& op : model->operators) {
+      bool op_touches_useful_arrays = false;
+      for (const string& output : op->outputs) {
+        op_touches_useful_arrays |= useful_arrays.count(output);
+      }
+      if (op_touches_useful_arrays) {
+        for (const string& input : op->inputs) {
+          found_new_useful_arrays |= !useful_arrays.count(input);
+          useful_arrays.insert(input);
+        }
+        for (const string& output : op->outputs) {
+          found_new_useful_arrays |= !useful_arrays.count(output);
+          useful_arrays.insert(output);
+        }
+      }
+    }
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      bool rnn_back_edge_touches_useful_arrays =
+          useful_arrays.count(rnn_state.state_array());
+      if (rnn_back_edge_touches_useful_arrays) {
+        found_new_useful_arrays |=
+            !useful_arrays.count(rnn_state.back_edge_source_array());
+        useful_arrays.insert(rnn_state.back_edge_source_array());
+      }
+    }
+  } while (found_new_useful_arrays);
+  // Erase arrays that aren't useful, and that are discardable.
+  for (auto it = model->arrays.begin(); it != model->arrays.end();) {
+    if (useful_arrays.count(it->first) ||
+        !IsDiscardableArray(*model, it->first)) {
+      ++it;
+    } else {
+      it = model->arrays.erase(it);
+    }
+  }
+  // Erase operators that do not produce a useful output array.
+  for (auto it = model->operators.begin(); it != model->operators.end();) {
+    // Only need to test the first output, as we simultaneously added all of
+    // an operator's outputs to the list of output arrays.
+    if (useful_arrays.count((*it)->outputs[0])) {
+      ++it;
+    } else {
+      for (const string& output : (*it)->outputs) {
+        CHECK(!useful_arrays.count(output));
+      }
+      it = model->operators.erase(it);
+    }
+  }
+  // Erase RNN back-edges that are 'dangling' i.e. that touch an array
+  // that no longer exists. This should only happen for discardable RNN
+  // back-edges.
+  std::vector<RnnState> rnn_states_to_keep;
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    const bool dangling =
+        !model->arrays.count(rnn_state.back_edge_source_array()) ||
+        !model->arrays.count(rnn_state.state_array());
+    if (dangling) {
+      CHECK(rnn_state.discardable());
+    } else {
+      rnn_states_to_keep.push_back(rnn_state);
+    }
+  }
+  model->flags.clear_rnn_states();
+  for (const auto& rnn_state : rnn_states_to_keep) {
+    *model->flags.add_rnn_states() = rnn_state;
+  }
+}
+
+bool GraphTransformationsPass(int increment, Model* model,
+                              const GraphTransformationsSet& transformations) {
+  CHECK(increment == 1 || increment == -1);
+  bool changed = false;
+  CHECK(!model->operators.empty());
+  int op_index = increment == 1 ? 0 : model->operators.size() - 1;
+  while (true) {
+    bool changed_now = false;
+    // Loop over all transformations at the current position in the graph.
+    for (const auto& transformation : transformations) {
+      CHECK(!changed_now);
+      CHECK(transformation->Messages().empty());
+      changed_now = transformation->Run(model, op_index);
+      if (changed_now) {
+        DumpGraphvizVideoFrame(*model);
+        CHECK(!model->operators.empty());
+        op_index = std::min<int>(op_index, model->operators.size() - 1);
+        // Uncomment for debugging
+        // CheckInvariants(*model);
+      }
+      const char* made_a_change_msg =
+          changed_now ? "made a change" : "did NOT make a change";
+      const int log_level =
+          changed_now ? kLogLevelModelChanged : kLogLevelModelUnchanged;
+      for (const string& message : transformation->Messages()) {
+        VLOG(log_level) << transformation->Name() << " " << made_a_change_msg
+                        << " at op_index=" << op_index << "/"
+                        << model->operators.size() - 1 << ": " << message;
+      }
+      transformation->ClearMessages();
+      if (changed_now) {
+        break;
+      }
+    }
+    if (changed_now) {
+      changed = true;
+    } else {
+      const int op_index_last =
+          increment == 1 ? model->operators.size() - 1 : 0;
+      if (op_index == op_index_last) {
+        break;
+      }
+      op_index += increment;
+    }
+  }
+  DiscardUselessConnectedComponentsAndRNNBackEdges(model);
+  return changed;
+}
+
+}  // namespace
+
+void RunGraphTransformations(Model* model, const string& msg,
+                             const GraphTransformationsSet& transformations) {
+  PrintModelStats(toco::port::StringF("Before %s", msg), *model);
+  int pass_index = 0;
+  while (GraphTransformationsPass((pass_index % 2) ? -1 : 1, model,
+                                  transformations)) {
+    pass_index++;
+    const auto& label =
+        toco::port::StringF("After %s pass %d", msg, pass_index);
+    PrintModelStats(label, *model);
+    CheckInvariants(*model);
+  }
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1dc41170c8fb8770d075813adf17f98263ac9a2
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -0,0 +1,187 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
+
+#include <cstddef>
+#include <initializer_list>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+
+namespace toco {
+
+class GraphTransformation {
+ public:
+  virtual bool Run(Model* model, std::size_t op_index) = 0;
+  virtual const char* Name() const = 0;
+  virtual ~GraphTransformation() {}
+  // Returns the list of messages that this graph transformation
+  // generated since ClearMessages() was called.
+  const std::vector<string>& Messages() const { return messages_; }
+  // Clears the list of messages; should be called after every
+  // run of this graph transformation.
+  void ClearMessages() { return messages_.clear(); }
+  // Adds a message; normally only called by the graph transformation
+  // itself during its run (this function could be protected).
+  template <typename... Args>
+  void AddMessageF(const char* format, const Args&... args) {
+    return messages_.push_back(toco::port::StringF(format, args...));
+  }
+
+ protected:
+  GraphTransformation() {}
+
+  // List of messages generated by this graph transformation.
+  std::vector<string> messages_;
+
+ private:
+  GraphTransformation(const GraphTransformation& other) = delete;
+  GraphTransformation(const GraphTransformation&& other) = delete;
+};
+
+class GraphTransformationsSet {
+ public:
+  // The choice of a container with fully-specified iteration order
+  // ensures that graph transformations are always run in the same order,
+  // which avoids having toco randomly fail or produce different results
+  // depending on the toolchain. Ideally success/results should be independent
+  // of the order in which graph transformations are run, but that's
+  // unfortunately not currently guaranteed to be the case.
+  using TransformationsContainer =
+      std::vector<std::unique_ptr<GraphTransformation>>;
+
+  GraphTransformationsSet() {}
+  GraphTransformationsSet(
+      const std::initializer_list<GraphTransformation*> transformations) {
+    for (GraphTransformation* t : transformations) {
+      Add(t);
+    }
+  }
+  void Add(GraphTransformation* transformation) {
+    const string& name = transformation->Name();
+    CHECK(!names_.count(name));
+    names_.insert(name);
+    transformations_.emplace_back(transformation);
+  }
+  TransformationsContainer::const_iterator begin() const {
+    return transformations_.begin();
+  }
+  TransformationsContainer::const_iterator end() const {
+    return transformations_.end();
+  }
+  bool empty() const { return transformations_.empty(); }
+
+ private:
+  GraphTransformationsSet(const GraphTransformationsSet& other) = delete;
+  GraphTransformationsSet(const GraphTransformationsSet&& other) = delete;
+  std::vector<std::unique_ptr<GraphTransformation>> transformations_;
+  // Names of transformations in the set. Only used to guard against dupes.
+  std::unordered_set<string> names_;
+};
+
+// Run the given list of graph transformations on the model.
+// The message is only for logging purposes.
+// The transformations is a rvalue reference, indicating that
+// nothing else will use these pointers. The user is supposed to
+// construct GraphTransformation objects by using 'new', pass us
+// the resulting raw pointers, and this RunGraphTransformations
+// takes care of delete'ing these pointers.
+void RunGraphTransformations(Model* model, const string& message,
+                             const GraphTransformationsSet& transformations);
+
+#define DECLARE_GRAPH_TRANSFORMATION(GTName)               \
+  class GTName : public GraphTransformation {              \
+   public:                                                 \
+    bool Run(Model* model, std::size_t op_index) override; \
+    const char* Name() const override { return #GTName; }  \
+  };
+
+// List of all graph transformations
+DECLARE_GRAPH_TRANSFORMATION(ConvertExpandDimsToReshape)
+DECLARE_GRAPH_TRANSFORMATION(ConvertPureConvToDepthwise)
+DECLARE_GRAPH_TRANSFORMATION(EnsureBiasVectors)
+DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoFollowingAffine)
+DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
+DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
+DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
+DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
+DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
+DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
+DECLARE_GRAPH_TRANSFORMATION(Quantize)
+DECLARE_GRAPH_TRANSFORMATION(RemoveFinalDequantizeOp)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowAssert)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialBinaryOperator)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialConcatenationInput)
+DECLARE_GRAPH_TRANSFORMATION(RemoveTrivialQuantizedActivationFunc)
+DECLARE_GRAPH_TRANSFORMATION(RemoveUnusedOp)
+DECLARE_GRAPH_TRANSFORMATION(ResolveBatchNormalization)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantBinaryOperator)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
+DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
+DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
+DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
+DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMerge)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSqueeze)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowSwitch)
+DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowTile)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantFakeQuant)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantConcatenation)
+DECLARE_GRAPH_TRANSFORMATION(DropFakeQuant)
+DECLARE_GRAPH_TRANSFORMATION(UnfuseActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(ResolvePadAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantTensorFlowShape)
+DECLARE_GRAPH_TRANSFORMATION(Dequantize)
+
+class ResolveReshapeAttributes : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "ResolveReshapeAttributes"; }
+};
+
+class RemoveTrivialReshape : public GraphTransformation {
+ public:
+  bool Run(Model* model, std::size_t op_index) override;
+  const char* Name() const override { return "RemoveTrivialReshape"; }
+  bool treat_expand_dims_as_trivial() const {
+    return treat_expand_dims_as_trivial_;
+  }
+  void set_treat_expand_dims_as_trivial(bool val) {
+    treat_expand_dims_as_trivial_ = val;
+  }
+
+ private:
+  bool treat_expand_dims_as_trivial_ = false;
+};
+
+#undef DECLARE_GRAPH_TRANSFORMATION
+
+}  // end namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_GRAPH_TRANSFORMATIONS_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cb26c8752c0d27a3d1138b9ad32e60f34177520
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -0,0 +1,230 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool HardcodeMinMaxForIm2colArray(Model* model, Operator* op) {
+  if (op->outputs.size() != 2) {
+    return false;
+  }
+  auto& im2col_array = model->GetArray(op->outputs[1]);
+  if (im2col_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!im2col_array.minmax);
+  auto& im2col_minmax = im2col_array.GetOrCreateMinMax();
+  im2col_minmax.min = input_minmax.min;
+  im2col_minmax.max = input_minmax.max;
+  return true;
+}
+
+bool HardcodeMinMaxForL2Normalization(Model* model, Operator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = input_minmax.min >= 0. ? 0. : -1.;
+  output_minmax.max = input_minmax.max <= 0. ? 0. : 1.;
+  return true;
+}
+
+bool HardcodeMinMaxForConcatenation(Model* model, Operator* op) {
+  // Do not early return if the output already has min/max:
+  // we may still need to adjust the inputs min/max.
+  bool has_minmax = false;
+  double overall_min = std::numeric_limits<double>::infinity();
+  double overall_max = -std::numeric_limits<double>::infinity();
+  for (const auto& input : op->inputs) {
+    if (model->GetArray(input).minmax) {
+      has_minmax = true;
+      const auto* minmax = model->GetArray(input).minmax.get();
+      if (minmax) {
+        overall_min = std::min(overall_min, minmax->min);
+        overall_max = std::max(overall_max, minmax->max);
+      }
+    }
+  }
+  auto& output = model->GetArray(op->outputs[0]);
+  if (output.minmax) {
+    has_minmax = true;
+    const auto* minmax = model->GetArray(op->outputs[0]).minmax.get();
+    if (minmax) {
+      overall_min = std::min(overall_min, minmax->min);
+      overall_max = std::max(overall_max, minmax->max);
+    }
+  }
+  if (!has_minmax) {
+    return false;
+  }
+  MinMax overall_minmax;
+  overall_minmax.min = overall_min;
+  overall_minmax.max = overall_max;
+  bool changed = false;
+  for (const auto& input : op->inputs) {
+    auto& array = model->GetArray(input);
+    if (!array.minmax) {
+      changed = true;
+    } else if (!(overall_minmax == array.GetMinMax())) {
+      changed = true;
+      LOG(WARNING)
+          << "Tweaking the MinMax of array " << input << ", which is "
+          << "an input to " << LogName(*op) << ", because we want all inputs "
+          << "and outputs of a Concatenation operator to have the same MinMax "
+          << "so that it can be implemented as a pure byte-copy, no "
+             "arithmetic.";
+    }
+    array.GetOrCreateMinMax() = overall_minmax;
+  }
+  if (!output.minmax) {
+    changed = true;
+  } else if (!(overall_minmax == output.GetMinMax())) {
+    changed = true;
+    LOG(WARNING)
+        << "Tweaking the MinMax of the output array of " << LogName(*op)
+        << ", because we want all inputs "
+        << "and outputs of a Concatenation operator to have the same MinMax "
+        << "so that it can be implemented as a pure byte-copy, no arithmetic.";
+  }
+  output.GetOrCreateMinMax() = overall_minmax;
+
+  return changed;
+}
+
+// The output of average or max pooling is within the same range as its input.
+bool HardcodeMinMaxForAverageOrMaxPool(Model* model, Operator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = std::min(input_minmax.min, 0.);
+  output_minmax.max = std::max(input_minmax.max, 0.);
+  return true;
+}
+
+bool HardcodeMinMaxForReshapeOrSqueeze(Model* model, Operator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  const auto& input_minmax = input_array.GetMinMax();
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = input_minmax.min;
+  output_minmax.max = input_minmax.max;
+  return true;
+}
+
+bool HardcodeMinMaxForOutput(Model* model, Operator* op, double min,
+                             double max) {
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.minmax) {
+    return false;
+  }
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  if (!input_array.minmax) {
+    return false;
+  }
+  CHECK(!output_array.minmax);
+  auto& output_minmax = output_array.GetOrCreateMinMax();
+  output_minmax.min = min;
+  output_minmax.max = max;
+  return true;
+}
+}  // namespace
+
+bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  bool changed = false;
+  switch (op->type) {
+    case OperatorType::kConv:
+      changed = HardcodeMinMaxForIm2colArray(model, op);
+      break;
+
+    case OperatorType::kL2Normalization:
+      changed = HardcodeMinMaxForL2Normalization(model, op);
+      break;
+
+    case OperatorType::kConcatenation:
+      changed = HardcodeMinMaxForConcatenation(model, op);
+      break;
+
+    case OperatorType::kAveragePool:
+    case OperatorType::kMaxPool:
+      changed = HardcodeMinMaxForAverageOrMaxPool(model, op);
+      break;
+
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+      changed = HardcodeMinMaxForReshapeOrSqueeze(model, op);
+      break;
+
+    case OperatorType::kLogistic:
+      // We hardcode quantization_params to: zero_point=0, scale=1/256.
+      // This choice of minmax is the one that is equivalent to that.
+      changed = HardcodeMinMaxForOutput(model, op, 0, 255. / 256.);
+      break;
+
+    case OperatorType::kSoftmax:
+      // We hardcode quantization_params to: zero_point=0, scale=1/256.
+      // This choice of minmax is the one that is equivalent to that.
+      changed = HardcodeMinMaxForOutput(model, op, 0, 255. / 256.);
+      break;
+
+    default:
+      break;
+  }
+  if (changed) {
+    AddMessageF("Hardcoded min-max through %s", LogName(*op));
+  }
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01b75e37c691d48fabf8832af04543be3f5eb3bc
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -0,0 +1,170 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cmath>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator* op) {
+  auto it = model->operators.begin();
+  for (; it != model->operators.end(); ++it) {
+    if (it->get() == op) {
+      break;
+    }
+  }
+  return it;
+}
+}  // namespace
+
+bool IdentifyL2Normalization::Run(Model* model, std::size_t op_index) {
+  const auto div_it = model->operators.begin() + op_index;
+  const auto* div_or_mul_op = div_it->get();
+  OperatorType expected_op_type_producing_div_or_mul_input;
+  if (div_or_mul_op->type == OperatorType::kDiv) {
+    expected_op_type_producing_div_or_mul_input = OperatorType::kTensorFlowSqrt;
+  } else if (div_or_mul_op->type == OperatorType::kMul) {
+    expected_op_type_producing_div_or_mul_input =
+        OperatorType::kTensorFlowRsqrt;
+  } else {
+    return false;
+  }
+  CHECK_EQ(div_or_mul_op->inputs.size(), 2);
+  Operator* op_producing_div_or_mul_input[2] = {
+      GetOpWithOutput(*model, div_or_mul_op->inputs[0]),
+      GetOpWithOutput(*model, div_or_mul_op->inputs[1]),
+  };
+  if (!op_producing_div_or_mul_input[1] ||
+      op_producing_div_or_mul_input[1]->type !=
+          expected_op_type_producing_div_or_mul_input) {
+    return false;
+  }
+  Operator* sqrt_or_rsqrt_op = op_producing_div_or_mul_input[1];
+  CHECK_EQ(sqrt_or_rsqrt_op->inputs.size(), 1);
+  Operator* op_producing_sqrt_or_rsqrt_input =
+      GetOpWithOutput(*model, sqrt_or_rsqrt_op->inputs[0]);
+  if (!op_producing_sqrt_or_rsqrt_input) {
+    return false;
+  }
+
+  // There may be an Add or a Maximum here, adding or clamping to a "small"
+  // constant scalar.
+  // Reported bug: b/29395854
+  Operator* add_op = nullptr;
+  Operator* op_producing_add_input = nullptr;
+  if (op_producing_sqrt_or_rsqrt_input->type == OperatorType::kAdd ||
+      op_producing_sqrt_or_rsqrt_input->type ==
+          OperatorType::kTensorFlowMaximum) {
+    add_op = op_producing_sqrt_or_rsqrt_input;
+    bool add_can_be_removed = false;
+    CHECK_EQ(op_producing_sqrt_or_rsqrt_input->inputs.size(), 2);
+    for (int i = 0; i < 2; i++) {
+      const auto& input_array =
+          model->GetArray(op_producing_sqrt_or_rsqrt_input->inputs[i]);
+      if (!input_array.buffer) {
+        continue;
+      }
+      if (input_array.buffer->type != ArrayDataType::kFloat) {
+        continue;
+      }
+      if (RequiredBufferSizeForShape(input_array.shape()) != 1) {
+        continue;
+      }
+      const auto& input_float_data =
+          input_array.GetBuffer<ArrayDataType::kFloat>().data;
+      if (std::abs(input_float_data[0]) > 1e-3f) {
+        continue;
+      }
+      add_can_be_removed = true;
+      op_producing_add_input = GetOpWithOutput(*model, add_op->inputs[1 - i]);
+      break;
+    }
+    if (!add_can_be_removed) {
+      AddMessageF(
+          "Giving up trying to identify L2Normalization subgraph "
+          " because the operator producing the input to the square root, %s,"
+          ", does not match the expected pattern",
+          LogName(*op_producing_sqrt_or_rsqrt_input));
+      return false;
+    }
+  }
+
+  Operator* sum_op =
+      add_op ? op_producing_add_input : op_producing_sqrt_or_rsqrt_input;
+  if (sum_op->type != OperatorType::kTensorFlowSum) {
+    AddMessageF(
+        "Giving up trying to identify L2Normalization subgraph: "
+        "expected Sum op, got %s",
+        LogName(*sum_op));
+    return false;
+  }
+
+  Operator* square_op = GetOpWithOutput(*model, sum_op->inputs[0]);
+  if (square_op->type != OperatorType::kTensorFlowSquare) {
+    AddMessageF(
+        "Giving up trying to identify L2Normalization subgraph: "
+        "expected Square op, got %s",
+        LogName(*square_op));
+    return false;
+  }
+
+  CHECK_EQ(square_op->inputs.size(), 1);
+
+  if (square_op->inputs[0] != div_or_mul_op->inputs[0]) {
+    AddMessageF(
+        "Giving up trying to identify L2Normalization subgraph: %s does not "
+        "take the same input as the Mul/Div node",
+        LogName(*square_op));
+    return false;
+  }
+
+  // Create and emplace the new L2Normalization
+  auto* l2norm_op = new L2NormalizationOperator;
+  l2norm_op->inputs = {div_or_mul_op->inputs[0]};
+  l2norm_op->outputs = div_or_mul_op->outputs;
+  model->operators.emplace(div_it, l2norm_op);
+
+  AddMessageF("Creating %s replacing equivalent subgraph", LogName(*l2norm_op));
+
+  // Erase the subgraph that is now replaced by L2Normalization
+  model->operators.erase(FindOperator(model, square_op));
+  model->arrays.erase(sum_op->inputs[0]);
+  if (sum_op->inputs.size() > 1) {
+    model->arrays.erase(sum_op->inputs[1]);
+  }
+  model->operators.erase(FindOperator(model, sum_op));
+  if (add_op) {
+    model->arrays.erase(add_op->inputs[0]);
+    model->arrays.erase(add_op->inputs[1]);
+    model->operators.erase(FindOperator(model, add_op));
+  }
+  model->arrays.erase(sqrt_or_rsqrt_op->inputs[0]);
+  model->operators.erase(FindOperator(model, sqrt_or_rsqrt_op));
+  model->arrays.erase(div_or_mul_op->inputs[1]);
+  model->operators.erase(FindOperator(model, div_or_mul_op));
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1865416fc2226d663dfd51a5c0a0e2129caf485c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_l2_pool.cc
@@ -0,0 +1,106 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator* op) {
+  auto it = model->operators.begin();
+  for (; it != model->operators.end(); ++it) {
+    if (it->get() == op) {
+      break;
+    }
+  }
+  return it;
+}
+}  // namespace
+
+bool IdentifyL2Pool::Run(Model* model, std::size_t op_index) {
+  const auto sqrt_it = model->operators.begin() + op_index;
+  const auto* sqrt_op = sqrt_it->get();
+  if (sqrt_op->type != OperatorType::kTensorFlowSqrt) {
+    return false;
+  }
+
+  CHECK_EQ(sqrt_op->inputs.size(), 1);
+  CHECK_EQ(sqrt_op->outputs.size(), 1);
+
+  const AveragePoolOperator* avpool_op;
+  const Operator* square_op;
+
+  Operator* prev_to_sqrt_op = GetOpWithOutput(*model, sqrt_op->inputs[0]);
+  if (prev_to_sqrt_op->type != OperatorType::kAveragePool) {
+    AddMessageF(
+        "Giving up trying to identify L2Pool subgraph: "
+        "expected AveragePool op, got %s",
+        LogName(*prev_to_sqrt_op));
+    return false;
+  }
+
+  avpool_op = static_cast<const AveragePoolOperator*>(prev_to_sqrt_op);
+  CHECK_EQ(avpool_op->inputs.size(), 1);
+
+  square_op = GetOpWithOutput(*model, avpool_op->inputs[0]);
+  CHECK_EQ(square_op->inputs.size(), 1);
+  if (square_op->type != OperatorType::kTensorFlowSquare) {
+    AddMessageF(
+        "Giving up trying to identify L2Pool subgraph: "
+        "expected Square op, got %s",
+        LogName(*square_op));
+    return false;
+  }
+
+  // Create and emplace L2Pool node.
+  auto* l2pool_op = new L2PoolOperator;
+
+  l2pool_op->inputs = {square_op->inputs[0]};
+  l2pool_op->outputs = sqrt_op->outputs;
+
+  l2pool_op->padding.type = avpool_op->padding.type;
+  // Note that we do not setup avpool_op->padding.fixed here.  This is done by
+  // the PropagateFixedSizes graph transformation.
+
+  l2pool_op->stride_height = avpool_op->stride_height;
+  l2pool_op->stride_width = avpool_op->stride_width;
+  l2pool_op->kheight = avpool_op->kheight;
+  l2pool_op->kwidth = avpool_op->kwidth;
+  model->operators.emplace(sqrt_it, l2pool_op);
+
+  AddMessageF("Creating %s replacing equivalent subgraph", LogName(*l2pool_op));
+
+  // Erase intermediate arrays, keeping input to square op.
+  model->arrays.erase(avpool_op->inputs[0]);
+  model->arrays.erase(sqrt_op->inputs[0]);
+
+  // Erase three operators being replaced.
+  model->operators.erase(FindOperator(model, square_op));
+  model->operators.erase(FindOperator(model, avpool_op));
+  model->operators.erase(FindOperator(model, sqrt_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..082820fddcf137238867239bbc4d4eed8158e307
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm.cc
@@ -0,0 +1,396 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator& op) {
+  auto it = model->operators.begin();
+  for (; it != model->operators.end(); ++it) {
+    if (it->get() == &op) {
+      break;
+    }
+  }
+  return it;
+}
+
+bool GetStateArrayForBackEdge(const Model& model,
+                              const string& back_edge_source_array,
+                              string* state_array = nullptr) {
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    if (back_edge_source_array == rnn_state.back_edge_source_array()) {
+      // Found LSTM cell output
+      if (state_array) {
+        *state_array = rnn_state.state_array();
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns true if the given operator has exactly 1 input, and is connected to
+// the given op_type.
+// We use kNone to indicate an input unattached to an operator output. Usually
+// these are the static input arrays.
+bool MatchOperatorInputs(const Operator& op, const Model& model,
+                         OperatorType op_type, Operator** connected_op) {
+  // Check for required number of inputs
+  if (op.inputs.size() != 1) {
+    return false;
+  }
+
+  // Check if first input is disconnected/connected to an operator
+  Operator* x = GetOpWithOutput(model, op.inputs[0]);
+  if ((op_type == OperatorType::kNone) && (x != nullptr)) {
+    return false;
+  }
+  if ((op_type != OperatorType::kNone) && (x == nullptr)) {
+    return false;
+  }
+
+  // Check that first operator, if connected, is of correct type
+  if ((x != nullptr) && (x->type != op_type)) {
+    return false;
+  }
+
+  // Successfully matched. Optionally return matching input operators.
+  if (connected_op) {
+    *connected_op = x;
+  }
+
+  return true;
+}
+
+// Returns true if the given operator has exactly 2 inputs, which are connected
+// to the given op_types.
+// We use kNone to indicate an input unattached to an operator output. Usually
+// these are the static input arrays.
+bool MatchOperatorInputs(const Operator& op, const Model& model,
+                         OperatorType a_op_type, Operator** a_op,
+                         OperatorType b_op_type, Operator** b_op) {
+  // Check for required number of inputs
+  if (op.inputs.size() != 2) {
+    return false;
+  }
+
+  // Check if first input is disconnected/connected to an operator
+  Operator* x = GetOpWithOutput(model, op.inputs[0]);
+  if ((a_op_type == OperatorType::kNone) && (x != nullptr)) {
+    return false;
+  }
+  if ((a_op_type != OperatorType::kNone) && (x == nullptr)) {
+    return false;
+  }
+
+  // Check that first operator, if connected, is of correct type
+  if ((x != nullptr) && (x->type != a_op_type)) {
+    return false;
+  }
+
+  // Check if second input is disconnected/connected to an operator
+  Operator* y = GetOpWithOutput(model, op.inputs[1]);
+  if ((b_op_type == OperatorType::kNone) && (y != nullptr)) {
+    return false;
+  }
+  if ((b_op_type != OperatorType::kNone) && (y == nullptr)) {
+    return false;
+  }
+
+  // Check that second operator, if connected, is of correct type
+  if ((y != nullptr) && (y->type != b_op_type)) {
+    return false;
+  }
+
+  // Successfully matched. Optionally return matching input operators.
+  if (a_op != nullptr) {
+    *a_op = x;
+  }
+  if (b_op != nullptr) {
+    *b_op = y;
+  }
+  return true;
+}
+
+// Returns true if the given operator has exactly 3 inputs, which are connected
+// to the given op_types.
+// We use kNone to indicate an input unattached to an operator output. Usually
+// these are the static input arrays.
+bool MatchOperatorInputs(const Operator& op, const Model& model,
+                         OperatorType a_op_type, Operator** a_op,
+                         OperatorType b_op_type, Operator** b_op,
+                         OperatorType c_op_type, Operator** c_op) {
+  // Check for required number of inputs
+  if (op.inputs.size() != 3) {
+    return false;
+  }
+
+  // Check if first input is disconnected/connected to an operator
+  Operator* x = GetOpWithOutput(model, op.inputs[0]);
+  if ((a_op_type == OperatorType::kNone) && (x != nullptr)) {
+    return false;
+  }
+  if ((a_op_type != OperatorType::kNone) && (x == nullptr)) {
+    return false;
+  }
+
+  // Check that first operator, if connected, is of correct type
+  if ((x != nullptr) && (x->type != a_op_type)) {
+    return false;
+  }
+
+  // Check if second input is disconnected/connected to an operator
+  Operator* y = GetOpWithOutput(model, op.inputs[1]);
+  if ((b_op_type == OperatorType::kNone) && (y != nullptr)) {
+    return false;
+  }
+  if ((b_op_type != OperatorType::kNone) && (y == nullptr)) {
+    return false;
+  }
+
+  // Check that second operator, if connected, is of correct type
+  if ((y != nullptr) && (y->type != b_op_type)) {
+    return false;
+  }
+
+  // Check if third input is disconnected/connected to an operator
+  Operator* z = GetOpWithOutput(model, op.inputs[2]);
+  if ((c_op_type == OperatorType::kNone) && (z != nullptr)) {
+    return false;
+  }
+  if ((c_op_type != OperatorType::kNone) && (z == nullptr)) {
+    return false;
+  }
+
+  // Check that third operator, if connected, is of correct type
+  if ((z != nullptr) && (z->type != c_op_type)) {
+    return false;
+  }
+
+  // Successfully matched. Optionally return matching input operators.
+  if (a_op != nullptr) {
+    *a_op = x;
+  }
+  if (b_op != nullptr) {
+    *b_op = y;
+  }
+  if (c_op != nullptr) {
+    *c_op = z;
+  }
+  return true;
+}
+
+absl::string_view FindLongestCommonPrefix(absl::string_view a,
+                                          absl::string_view b) {
+  if (a.empty() || b.empty()) return absl::string_view();
+
+  const char* pa = a.data();
+  const char* pb = b.data();
+  size_t count = 0;
+  const ssize_t limit = std::min(a.size(), b.size());
+  while (count < limit && *pa == *pb) {
+    ++pa;
+    ++pb;
+    ++count;
+  }
+
+  return absl::string_view(a.data(), count);
+}
+
+}  // namespace
+
+bool IdentifyLstmCell::Run(Model* model, std::size_t op_index) {
+  // This LSTM cell identification method is not invariant to commutation of
+  // commutative operator inputs. For example, if input[0] and input[1] of the
+  // final output multiplication were swapped, this method would not identify it
+  // as an LSTM cell. This is OK in most cases, because
+  // tf.rnn.contrib.BasicLSTMCell always generates LSTM cells the same way.
+
+  // Final output multiply
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_output_mul = op_it->get();
+  if (final_output_mul->type != OperatorType::kMul) {
+    return false;
+  }
+  Operator *state_output_tanh, *fc_output_sig;
+  if (!MatchOperatorInputs(*final_output_mul, *model, OperatorType::kTanh,
+                           &state_output_tanh, OperatorType::kLogistic,
+                           &fc_output_sig)) {
+    return false;
+  }
+
+  // State output TanH
+  // (We don't count an operator as ID'd until we verify it has the correct
+  // operator types feeding into it.)
+  Operator* state_combine_add;
+  if (!MatchOperatorInputs(*state_output_tanh, *model, OperatorType::kAdd,
+                           &state_combine_add)) {
+    return false;
+  }
+  string prev_state;
+  if (!GetStateArrayForBackEdge(*model, state_output_tanh->inputs[0],
+                                &prev_state)) {
+    return false;
+  }
+
+  // State forget & remember addition
+  Operator *state_forget_mul, *state_remember_mul;
+  if (!MatchOperatorInputs(*state_combine_add, *model, OperatorType::kMul,
+                           &state_forget_mul, OperatorType::kMul,
+                           &state_remember_mul)) {
+    return false;
+  }
+  if (state_forget_mul->inputs[0] != prev_state) {
+    return false;
+  }
+
+  // State forget gate
+  Operator* state_forget_sig;
+  if (!MatchOperatorInputs(*state_forget_mul, *model, OperatorType::kNone,
+                           nullptr, OperatorType::kLogistic,
+                           &state_forget_sig)) {
+    return false;
+  }
+
+  // State remember gate
+  Operator *state_remember_sig, *state_info_tanh;
+  if (!MatchOperatorInputs(*state_remember_mul, *model, OperatorType::kLogistic,
+                           &state_remember_sig, OperatorType::kTanh,
+                           &state_info_tanh)) {
+    return false;
+  }
+
+  // State remember "information" activation function
+  Operator* fc_output_split;
+  if (!MatchOperatorInputs(*state_info_tanh, *model,
+                           OperatorType::kTensorFlowSplit, &fc_output_split)) {
+    return false;
+  }
+  // State remember gate activation function
+  Operator* tmp;
+  if (!MatchOperatorInputs(*state_remember_sig, *model,
+                           OperatorType::kTensorFlowSplit, &tmp) ||
+      (tmp != fc_output_split)) {
+    return false;
+  }
+  // State forget gate activation function
+  if (!MatchOperatorInputs(*state_forget_sig, *model,
+                           OperatorType::kTensorFlowSplit, &tmp) ||
+      (tmp != fc_output_split)) {
+    return false;
+  }
+  // Fully connected output activation function
+  if (!MatchOperatorInputs(*fc_output_sig, *model,
+                           OperatorType::kTensorFlowSplit, &tmp) ||
+      (tmp != fc_output_split)) {
+    return false;
+  }
+  // Fully connected output split
+  Operator* fully_connected;
+  if (!MatchOperatorInputs(*fc_output_split, *model, OperatorType::kNone,
+                           nullptr, OperatorType::kFullyConnected,
+                           &fully_connected)) {
+    return false;
+  }
+
+  // Fully connected op
+  Operator* concat_inputs;
+  if (!MatchOperatorInputs(*fully_connected, *model,
+                           OperatorType::kConcatenation, &concat_inputs,
+                           OperatorType::kNone, nullptr, OperatorType::kNone,
+                           nullptr)) {
+    return false;
+  }
+
+  // Emplace a new LSTM cell operator
+  auto* lstm_cell_op = new LstmCellOperator;
+  lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
+  lstm_cell_op->inputs[LstmCellOperator::DATA_INPUT] = concat_inputs->inputs[0];
+  lstm_cell_op->inputs[LstmCellOperator::PREV_ACTIV_INPUT] =
+      concat_inputs->inputs[1];
+  lstm_cell_op->inputs[LstmCellOperator::WEIGHTS_INPUT] =
+      fully_connected->inputs[1];
+  lstm_cell_op->inputs[LstmCellOperator::BIASES_INPUT] =
+      fully_connected->inputs[2];
+  lstm_cell_op->inputs[LstmCellOperator::PREV_STATE_INPUT] = prev_state;
+  lstm_cell_op->outputs.resize(LstmCellOperator::NUM_OUTPUTS);
+  lstm_cell_op->outputs[LstmCellOperator::STATE_OUTPUT] =
+      state_output_tanh->inputs[0];
+  lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT] =
+      final_output_mul->outputs[0];
+  model->operators.emplace(op_it, lstm_cell_op);
+  AddMessageF("Creating %s replacing equivalent subgraph",
+              LogName(*lstm_cell_op));
+
+  // Create temp arrays used internally during runtime.
+  const string base_name(FindLongestCommonPrefix(
+      lstm_cell_op->outputs[LstmCellOperator::STATE_OUTPUT],
+      lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT]));
+  const string& concat_temp_array_name =
+      AvailableArrayName(*model, base_name + "concat_temp");
+  model->GetOrCreateArray(concat_temp_array_name);
+  lstm_cell_op->outputs[LstmCellOperator::CONCAT_TEMP] = concat_temp_array_name;
+  const string& activ_temp_array_name =
+      AvailableArrayName(*model, base_name + "activ_temp");
+  model->GetOrCreateArray(activ_temp_array_name);
+  lstm_cell_op->outputs[LstmCellOperator::ACTIV_TEMP] = activ_temp_array_name;
+  AddMessageF("Created temp outputs %s and %s on operator %s",
+              concat_temp_array_name, activ_temp_array_name,
+              LogName(*lstm_cell_op));
+
+  // Delete arrays and operators replaced by the LSTM cell operator. Order is
+  // important - DeleteArrayIfUnused() only succeeds if dependent operators
+  // have been removed first. Start at the output and work towards the input.
+  model->operators.erase(FindOperator(model, *final_output_mul));
+  DeleteArrayIfUnused(state_output_tanh->outputs[0], model);
+  DeleteArrayIfUnused(fc_output_sig->outputs[0], model);
+  model->operators.erase(FindOperator(model, *state_output_tanh));
+  model->operators.erase(FindOperator(model, *fc_output_sig));
+  model->operators.erase(FindOperator(model, *state_combine_add));
+  DeleteArrayIfUnused(state_forget_mul->outputs[0], model);
+  DeleteArrayIfUnused(state_remember_mul->outputs[0], model);
+  model->operators.erase(FindOperator(model, *state_forget_mul));
+  model->operators.erase(FindOperator(model, *state_remember_mul));
+  DeleteArrayIfUnused(state_forget_sig->outputs[0], model);
+  DeleteArrayIfUnused(state_info_tanh->outputs[0], model);
+  DeleteArrayIfUnused(state_remember_sig->outputs[0], model);
+  model->operators.erase(FindOperator(model, *state_forget_sig));
+  model->operators.erase(FindOperator(model, *state_info_tanh));
+  model->operators.erase(FindOperator(model, *state_remember_sig));
+  DeleteArrayIfUnused(fc_output_split->outputs[0], model);
+  DeleteArrayIfUnused(fc_output_split->outputs[1], model);
+  DeleteArrayIfUnused(fc_output_split->outputs[2], model);
+  DeleteArrayIfUnused(fc_output_split->outputs[3], model);
+  string dims_array = fc_output_split->inputs[0];
+  model->operators.erase(FindOperator(model, *fc_output_split));
+  DeleteArrayIfUnused(dims_array, model);
+  DeleteArrayIfUnused(fully_connected->outputs[0], model);
+  model->operators.erase(FindOperator(model, *fully_connected));
+  DeleteArrayIfUnused(concat_inputs->outputs[0], model);
+  model->operators.erase(FindOperator(model, *concat_inputs));
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfc77024e7e56038878570c9d3a462715a53ae3f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_relu1.cc
@@ -0,0 +1,103 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator* op) {
+  auto it = model->operators.begin();
+  for (; it != model->operators.end(); ++it) {
+    if (it->get() == op) {
+      break;
+    }
+  }
+  return it;
+}
+
+bool CheckArrayIsScalarFloat(Model* model, const std::string& name, float val) {
+  const auto& op_array = model->GetArray(name);
+  if (!op_array.buffer || op_array.buffer->type != ArrayDataType::kFloat ||
+      RequiredBufferSizeForShape(op_array.shape()) != 1) {
+    return false;
+  }
+  const auto& op_data = op_array.GetBuffer<ArrayDataType::kFloat>().data;
+  return op_data[0] == val;
+}
+
+// Returns index of scalar input when there is exactly one scalar, -1 otherwise
+int GetSingleScalarInputIndexOfBinaryOp(Model* model, const Operator* op,
+                                        float val) {
+  bool input0_is_scalar = CheckArrayIsScalarFloat(model, op->inputs[0], val);
+  bool input1_is_scalar = CheckArrayIsScalarFloat(model, op->inputs[1], val);
+  return input0_is_scalar == input1_is_scalar ? -1 : input0_is_scalar ? 0 : 1;
+}
+}  // namespace
+
+bool IdentifyRelu1::Run(Model* model, std::size_t op_index) {
+  const auto maximum_it = model->operators.begin() + op_index;
+  const auto* maximum_op = maximum_it->get();
+  if (maximum_op->type != OperatorType::kTensorFlowMaximum) {
+    return false;
+  }
+  CHECK_EQ(maximum_op->inputs.size(), 2);
+  if (maximum_op->outputs.size() != 1) {
+    return false;
+  }
+  int scalar_input_index =
+      GetSingleScalarInputIndexOfBinaryOp(model, maximum_op, -1.0f);
+  if (scalar_input_index == -1) {
+    return false;
+  }
+  const auto* minimum_op = GetOpWithInput(*model, maximum_op->outputs[0]);
+  if (!minimum_op || minimum_op->type != OperatorType::kTensorFlowMinimum) {
+    return false;
+  }
+  if (GetSingleScalarInputIndexOfBinaryOp(model, minimum_op, 1.0f) == -1) {
+    return false;
+  }
+  CHECK_EQ(minimum_op->inputs.size(), 2);
+
+  // Create and emplace Relu1 node
+  auto* relu1_op = new Relu1Operator;
+  relu1_op->inputs = {maximum_op->inputs[!scalar_input_index]};
+  relu1_op->outputs = minimum_op->outputs;
+  model->operators.emplace(maximum_it, relu1_op);
+
+  AddMessageF("Creating %s replacing equivalent subgraph", LogName(*relu1_op));
+
+  // Erase Maximum scalar input & operator
+  model->arrays.erase(maximum_op->inputs[scalar_input_index]);
+  model->operators.erase(FindOperator(model, maximum_op));
+
+  // Erase Minimum inputs & operator
+  model->arrays.erase(minimum_op->inputs[0]);
+  model->arrays.erase(minimum_op->inputs[1]);
+  model->operators.erase(FindOperator(model, minimum_op));
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d83603e9a2c59ae74a5e5fda5b11178740336bfb
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -0,0 +1,120 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+// This inserts an operator whose output is a float array (name:
+// flags.input_array()).  It has to wait for any existing operators that
+// generate this output to be removed by graph transformations.  Note that there
+// may be more than one operator that takes the input_array as their input, and
+// that some of these may be removed by graph transformations.
+bool AddDequantizeOperatorToInput(const string& input_name, const Operator* op,
+                                  GraphTransformation* transformation,
+                                  Model* model) {
+  // An operator with the required output may be a dequantize operator already
+  // created.  Alternatively it may be an operator that needs to be removed
+  // because it is unused, in which case we wait for RemoveUnusedOp to do its
+  // work.
+  if (GetOpWithOutput(*model, input_name)) {
+    return false;
+  }
+
+  // We only apply for the first operator if there is more than one.  This is
+  // not strictly necessary for ordering correctness, since we insert the
+  // dequant operator at the beginning of the op sequence, but it makes the
+  // insertion more predictable (eg forward vs backwards operator sweep).
+  if (CountOpsWithInput(*model, input_name) > 1) {
+    if (op != GetFirstOpWithInput(*model, input_name)) {
+      return false;
+    }
+  }
+
+  auto& input_array = model->GetArray(input_name);
+  if (input_array.data_type != ArrayDataType::kFloat) {
+    return false;
+  }
+
+  if (input_array.final_data_type == input_array.data_type ||
+      input_array.final_data_type == ArrayDataType::kNone) {
+    return false;
+  }
+
+  const auto& dequantized_input_name =
+      AvailableArrayName(*model, input_name + "_dequantized");
+  for (auto& other_op : model->operators) {
+    for (string& other_op_input : other_op->inputs) {
+      if (other_op_input == input_name) {
+        other_op_input = dequantized_input_name;
+      }
+    }
+  }
+
+  auto& dequantized_input_array =
+      model->GetOrCreateArray(dequantized_input_name);
+  auto* image_input_op = new DequantizeOperator;
+  image_input_op->inputs = {input_name};
+  image_input_op->outputs = {dequantized_input_name};
+  model->operators.emplace(model->operators.begin(), image_input_op);
+
+  CHECK(input_array.final_data_type == ArrayDataType::kUint8);
+  input_array.data_type = ArrayDataType::kUint8;
+  dequantized_input_array.data_type = ArrayDataType::kFloat;
+  const auto& input_minmax = input_array.GetMinMax();
+  auto& dequantized_input_minmax = dequantized_input_array.GetOrCreateMinMax();
+  dequantized_input_minmax = input_minmax;
+  auto& input_qparams = input_array.GetOrCreateQuantizationParams();
+  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
+      model->flags, input_minmax, &input_qparams);
+
+  transformation->AddMessageF(
+      "Created %s"
+      " to handle quantized input image data, taking over existing"
+      " mean_value and std_value flags. Cleared those flags.",
+      LogName(*image_input_op));
+
+  return true;
+}
+
+bool MakeInitialDequantizeOperator::Run(Model* model, std::size_t op_index) {
+  // This is effectively a transformation applied to edges.  We iterate over the
+  // specified node (op) and proceed for input edges.
+  const auto it = model->operators.begin() + op_index;
+  const auto* op = it->get();
+  bool change_made = false;
+  for (auto& input : op->inputs) {
+    for (auto& input_array : *model->flags.mutable_input_arrays()) {
+      if (input_array.name() == input) {
+        if (AddDequantizeOperatorToInput(input_array.name(), op, this, model)) {
+          change_made = true;
+          input_array.clear_mean_value();
+          input_array.clear_std_value();
+        }
+      }
+    }
+  }
+  return change_made;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d92bcbccd6693870879ccb871cc0a7eb7359a0c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -0,0 +1,106 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+void SetDataTypeForAllOutputs(Model* model, Operator* op,
+                              ArrayDataType data_type) {
+  for (const auto& output : op->outputs) {
+    model->arrays[output]->data_type = data_type;
+  }
+}
+}  // namespace
+
+bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+
+  // If the data type of some input is unknown, we need to yield.
+  for (const auto& input : op->inputs) {
+    if (model->arrays[input]->data_type == ArrayDataType::kNone) {
+      return false;
+    }
+  }
+  // Record data types of output before processing, so we can see at the
+  // end if we changed anything, and return the correct boolean value.
+  std::unordered_map<string, ArrayDataType> old_output_data_types;
+  for (const auto& output : op->outputs) {
+    old_output_data_types[output] = model->arrays[output]->data_type;
+  }
+  // Do the actual output data types propagation.
+  if (op->type == OperatorType::kDequantize ||
+      op->type == OperatorType::kResizeBilinear) {
+    // These operators unconditionally produce float outputs
+    SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
+  } else if (op->type == OperatorType::kTensorFlowLess ||
+             op->type == OperatorType::kTensorFlowLessEqual ||
+             op->type == OperatorType::kTensorFlowGreater ||
+             op->type == OperatorType::kTensorFlowGreaterEqual) {
+    // These operators unconditionally produce bool outputs
+    SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
+  } else if (op->type == OperatorType::kRank ||
+             op->type == OperatorType::kTensorFlowShape) {
+    // These operators are assumed to produce int32 outputs.
+    SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
+  } else if (op->type == OperatorType::kTensorFlowSplit ||
+             op->type == OperatorType::kTensorFlowConcat ||
+             op->type == OperatorType::kFill) {
+    // These operators produce an output with the same type as their 2nd input
+    CHECK_GE(op->inputs.size(), 2);
+    const ArrayDataType data_type = model->arrays[op->inputs[1]]->data_type;
+    SetDataTypeForAllOutputs(model, op, data_type);
+  } else if (op->type == OperatorType::kCast) {
+    // Data type of the Cast op is specified.
+    CHECK_EQ(op->outputs.size(), 1);
+    auto* cast_op = static_cast<CastOperator*>(op);
+    model->arrays[op->outputs[0]]->data_type = cast_op->dst_data_type;
+  } else if (op->type == OperatorType::kTensorFlowUnsupported) {
+    auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
+    if (unsupported_op->output_data_types.size() != op->outputs.size()) {
+      return false;
+    }
+    for (int i = 0; i < unsupported_op->output_data_types.size(); ++i) {
+      auto output = op->outputs[i];
+      auto data_type = unsupported_op->output_data_types[i];
+      model->arrays[output]->data_type = data_type;
+    }
+  } else if (op->type == OperatorType::kExpandDims) {
+    // Yield on ExpandDim until it is converted to Reshape
+    return false;
+  } else {
+    // These operators produce outputs with the same type as their 1st input
+    CHECK_GT(op->inputs.size(), 0);
+    const ArrayDataType data_type = model->arrays[op->inputs[0]]->data_type;
+    SetDataTypeForAllOutputs(model, op, data_type);
+  }
+  // Return true if any output data type changed, false if none changed.
+  for (const auto& output : op->outputs) {
+    if (old_output_data_types[output] != model->arrays[output]->data_type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4530806ede36a49e1b1f5e22d7c45c89aa4cb312
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -0,0 +1,1141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
+                      int kheight, int stride_width, int stride_height,
+                      PaddingType padding_type, Shape* output_shape,
+                      FixedPadding* fixed_padding) {
+  const int input_width = input_shape.dims(2);
+  const int input_height = input_shape.dims(1);
+  const int batch = input_shape.dims(0);
+
+  int output_height = 0;
+  int output_width = 0;
+  if (padding_type == PaddingType::kValid) {
+    output_height = (input_height + stride_height - kheight) / stride_height;
+    output_width = (input_width + stride_width - kwidth) / stride_width;
+  } else if (padding_type == PaddingType::kSame) {
+    output_height = (input_height + stride_height - 1) / stride_height;
+    output_width = (input_width + stride_width - 1) / stride_width;
+  } else {
+    LOG(FATAL) << "Only supporting SAME or VALID padding";
+  }
+
+  fixed_padding->height = std::max(
+      0, ((output_height - 1) * stride_height + kheight - input_height) / 2);
+  fixed_padding->width = std::max(
+      0, ((output_width - 1) * stride_width + kwidth - input_width) / 2);
+
+  // Actually had to debug a situation where those were negative due to bad
+  // propagation of placeholder -1 sizes in TensorFlowReshape.
+  CHECK_GT(output_width, 0);
+  CHECK_GT(output_height, 0);
+  output_shape->ReplaceDims({batch, output_height, output_width, output_depth});
+}
+
+void ComputeBinaryOperatorOutputSize(const Shape& input_shape1,
+                                     const Shape& input_shape2,
+                                     Array* output_array) {
+  const int size1 = RequiredBufferSizeForShape(input_shape1);
+  const int size2 = RequiredBufferSizeForShape(input_shape2);
+  if (size1 > size2) {
+    output_array->copy_shape(input_shape1);
+  } else if (size2 > size1) {
+    output_array->copy_shape(input_shape2);
+  } else {
+    CHECK_EQ(size1, size2);
+    const int dims1 = input_shape1.dimensions_count();
+    const int dims2 = input_shape2.dimensions_count();
+    if (dims1 >= dims2) {
+      output_array->copy_shape(input_shape1);
+    } else {
+      output_array->copy_shape(input_shape2);
+    }
+  }
+  CHECK(output_array->has_shape());
+}
+
+int GetOutputDepthFromWeights(const Model& model, const Operator& op) {
+  const string& weights_name = op.inputs[1];
+  const auto& weights_shape = model.arrays.at(weights_name)->shape();
+  if (op.type == OperatorType::kConv ||
+      op.type == OperatorType::kFullyConnected) {
+    return weights_shape.dims(0);
+  } else if (op.type == OperatorType::kDepthwiseConv) {
+    return weights_shape.dims(3);
+  } else {
+    LOG(FATAL) << "Unhandled operator type";
+  }
+}
+
+bool EnsureBiasVectorShape(Model* model, Operator* op) {
+  const string& weights_name = op->inputs[1];
+  const auto& weights_array = *model->arrays[weights_name];
+  // Yield until weights shape has been resolved.
+  if (!weights_array.has_shape()) {
+    return false;
+  }
+
+  if (op->inputs.size() < 3) {
+    return false;
+  }
+  auto& bias_array = *model->arrays[op->inputs[2]];
+  if (bias_array.has_shape()) {
+    return true;
+  }
+
+  const int output_depth = GetOutputDepthFromWeights(*model, *op);
+  bias_array.copy_shape(Shape({output_depth}));
+
+  auto& float_buffer = bias_array.GetMutableBuffer<ArrayDataType::kFloat>();
+  float_buffer.data.resize(output_depth, 0);
+
+  return true;
+}
+
+void ProcessConvOperator(Model* model, ConvOperator* op) {
+  if (!EnsureBiasVectorShape(model, op)) {
+    return;
+  }
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+
+  const auto& weights_array = *model->arrays[op->inputs[1]];
+  // Yield until weights dims have been resolved.
+  if (!weights_array.has_shape()) {
+    return;
+  }
+  const auto& weights_shape = weights_array.shape();
+  CHECK_EQ(weights_shape.dimensions_count(), 4);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  const int output_depth = weights_shape.dims(0);
+  const int kheight = weights_shape.dims(1);
+  const int kwidth = weights_shape.dims(2);
+  ComputeConvSizes(input_shape, output_depth, kwidth, kheight, op->stride_width,
+                   op->stride_height, op->padding.type,
+                   output_array.mutable_shape(),
+                   &op->padding.GetOrCreateFixedPadding());
+  CHECK_EQ(output_array.shape().dimensions_count(), 4);
+
+  // Set im2col array dimensions if there is one.
+  if (op->outputs.size() == 2) {
+    const auto& output_shape = output_array.shape();
+    const int input_depth = weights_shape.dims(3);
+    auto& im2col_array = *model->arrays[op->outputs[1]];
+    im2col_array.copy_shape(Shape{output_shape.dims(0), output_shape.dims(1),
+                                  output_shape.dims(2),
+                                  input_depth * kheight * kwidth});
+  }
+}
+
+void ProcessDepthwiseConvOperator(Model* model, DepthwiseConvOperator* op) {
+  if (!EnsureBiasVectorShape(model, op)) {
+    return;
+  }
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+
+  const auto& weights_array = *model->arrays[op->inputs[1]];
+  // Yield until weights dims have been resolved.
+  if (!weights_array.has_shape()) {
+    return;
+  }
+  const auto& weights_shape = weights_array.shape();
+  CHECK_EQ(weights_shape.dimensions_count(), 4);
+
+  const string& output_name = op->outputs[0];
+  const int input_depth = input_shape.dims(3);
+  const int output_depth = weights_shape.dims(3);
+  // TensorFlow doesn't define the depth_multiplier value on DepthwiseConv ops,
+  // instead it has to be inferred from the weights dims. However, once we are
+  // here, weights dims have already been converted to our own internal format,
+  // where the multiplier is no longer readily apparent. So instead we get it
+  // as the quotient of output and input depths. We only want to do that when
+  // depth_multiplier had the zero value: any other value should be checked
+  // as done by the next if() below.
+  if (!op->depth_multiplier) {
+    op->depth_multiplier = output_depth / input_depth;
+  }
+  QCHECK_EQ(output_depth, input_depth * op->depth_multiplier)
+      << "input/output depths and depth_multiplier don't match";
+
+  const int kheight = weights_shape.dims(1);
+  const int kwidth = weights_shape.dims(2);
+  ComputeConvSizes(input_shape, output_depth, kwidth, kheight, op->stride_width,
+                   op->stride_height, op->padding.type,
+                   model->GetArray(output_name).mutable_shape(),
+                   &op->padding.GetOrCreateFixedPadding());
+}
+
+void ProcessDepthToSpaceOperator(Model* model, DepthToSpaceOperator* op) {
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+
+  const string& output_name = op->outputs[0];
+  const int block_size = op->block_size;
+  CHECK_NE(block_size, 0) << "Invalid block_size in " << output_name;
+  const int batch = input_shape.dims(0);
+  const int height = input_shape.dims(1);
+  const int width = input_shape.dims(2);
+  const int depth = input_shape.dims(3);
+  QCHECK_EQ(depth % (block_size * block_size), 0);
+
+  model->GetArray(output_name)
+      .copy_shape(Shape({batch, height * block_size, width * block_size,
+                         depth / block_size / block_size}));
+}
+
+void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+
+  const string& output_name = op->outputs[0];
+  const int block_size = op->block_size;
+  CHECK_NE(block_size, 0) << "Invalid block_size in " << output_name;
+  const int batch = input_shape.dims(0);
+  const int height = input_shape.dims(1);
+  const int width = input_shape.dims(2);
+  const int depth = input_shape.dims(3);
+  QCHECK_EQ(width % block_size, 0);
+  QCHECK_EQ(height % block_size, 0);
+
+  model->GetArray(output_name)
+      .copy_shape(Shape({batch, height / block_size, width / block_size,
+                         depth * block_size * block_size}));
+}
+
+void ProcessFullyConnectedOperator(Model* model, FullyConnectedOperator* op) {
+  if (!EnsureBiasVectorShape(model, op)) {
+    return;
+  }
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_GE(input_shape.dimensions_count(), 1);
+
+  const auto& weights_array = *model->arrays[op->inputs[1]];
+  // Yield until weights dims have been resolved.
+  if (!weights_array.has_shape()) {
+    return;
+  }
+  const auto& weights_shape = weights_array.shape();
+
+  const int weights_output_depth = weights_shape.dims(0);
+  CHECK_EQ(weights_shape.dimensions_count(), 2);
+
+  const int input_overall_size = RequiredBufferSizeForShape(input_shape);
+  const int matmul_repeats = input_overall_size / weights_shape.dims(1);
+  CHECK_EQ(matmul_repeats * weights_shape.dims(1), input_overall_size);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  output_array.copy_shape(Shape({matmul_repeats, weights_output_depth}));
+}
+
+void ProcessTensorFlowReshapeOperator(Model* model,
+                                      TensorFlowReshapeOperator* op) {
+  auto& output_array = *model->arrays[op->outputs[0]];
+  // Bail if we already have output dims
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+
+  const string& shape_name = op->inputs[1];
+  auto& shape_array = model->GetArray(shape_name);
+  // Yield until the shape is resolved as a constant array
+  if (!shape_array.buffer) {
+    return;
+  }
+  CHECK(shape_array.data_type == ArrayDataType::kInt32);
+  // shape_data is the raw array of ints describing the shape
+  // in the TensorFlow node. We intentionally make a copy here, rather than
+  // modify wildcards in-place below, because in some graphs, the same shape
+  // array with a wildcard may be referenced from multiple Reshape nodes, where
+  // the wildcard needs to resolved to distinct values.
+  std::vector<int32> shape_data =
+      shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  // The Reshape shape may have a wildcard dim, encoded as -1.
+  bool has_wildcard = false;
+  int wildcard_index = 0;
+  int product_non_wildcard_dims = 1;
+  for (int i = 0; i < shape_data.size(); i++) {
+    if (shape_data[i] == -1) {
+      CHECK(!has_wildcard);
+      has_wildcard = true;
+      wildcard_index = i;
+    } else {
+      product_non_wildcard_dims *= shape_data[i];
+    }
+  }
+  const int input_flat_size = RequiredBufferSizeForShape(input_shape);
+  if (has_wildcard) {
+    shape_data[wildcard_index] = input_flat_size / product_non_wildcard_dims;
+  }
+  auto& output_shape = *output_array.mutable_shape();
+  *output_shape.mutable_dims() = shape_data;
+  const int output_flat_size = RequiredBufferSizeForShape(output_shape);
+  CHECK_EQ(output_flat_size, input_flat_size);
+}
+
+void ProcessSimpleOperator(Model* model, Operator* op) {
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  const string& output_name = op->outputs[0];
+  auto& output_array = *model->arrays[output_name];
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  output_array.copy_shape(input_array.shape());
+}
+
+void ProcessSimpleBinaryOperator(Model* model, Operator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const auto& input0_array = *model->arrays[op->inputs[0]];
+  const auto& input1_array = *model->arrays[op->inputs[1]];
+  // Yield until input dims have been resolved.
+  if (!input0_array.has_shape() || !input1_array.has_shape()) {
+    return;
+  }
+  const string& output_name = op->outputs[0];
+  auto& output_array = *model->arrays[output_name];
+  ComputeBinaryOperatorOutputSize(input0_array.shape(), input1_array.shape(),
+                                  &output_array);
+}
+
+bool KeepDims(const Operator& op) {
+  switch (op.type) {
+    case OperatorType::kTensorFlowMin:
+      return static_cast<const TensorFlowMinOperator&>(op).keep_dims;
+    case OperatorType::kTensorFlowMax:
+      return static_cast<const TensorFlowMaxOperator&>(op).keep_dims;
+    case OperatorType::kTensorFlowSum:
+      return static_cast<const TensorFlowSumOperator&>(op).keep_dims;
+    case OperatorType::kMean:
+      return static_cast<const MeanOperator&>(op).keep_dims;
+    default:
+      LOG(FATAL) << "Not a reduction operator!";
+      return false;
+  }
+}
+
+void ProcessTensorFlowReductionOperator(Model* model, Operator* op) {
+  CHECK_LE(op->inputs.size(), 2);
+  auto& output_array = *model->arrays[op->outputs[0]];
+  if (output_array.has_shape()) {
+    return;
+  }
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const bool keep_dims = KeepDims(*op);
+  if (op->inputs.size() == 2) {
+    // There is a reduction_indices input.
+    const auto& reduction_array = *model->arrays[op->inputs[1]];
+    if (!reduction_array.buffer) {
+      return;
+    }
+    CHECK(reduction_array.buffer->type == ArrayDataType::kInt32);
+    const auto& reduction_array_vals =
+        reduction_array.GetBuffer<ArrayDataType::kInt32>().data;
+    auto& output_dims = *output_array.mutable_shape()->mutable_dims();
+    output_dims.clear();
+    for (int i = 0; i < input_shape.dimensions_count(); i++) {
+      bool is_reduction_dim = false;
+      for (int r : reduction_array_vals) {
+        if (i == r) {
+          is_reduction_dim = true;
+        }
+      }
+      if (!is_reduction_dim) {
+        output_dims.push_back(input_shape.dims(i));
+      } else if (keep_dims) {
+        output_dims.push_back(1);
+      }
+    }
+  } else {
+    // No reduction_indices means complete reduction to a single scalar.
+    if (keep_dims) {
+      output_array.copy_shape(input_shape);
+    } else {
+      output_array.copy_shape(Shape({}));
+    }
+  }
+}
+
+void ProcessSliceOperator(Model* model, SliceOperator* op) {
+  CHECK_EQ(op->inputs.size(), 3);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  // Yield until the Slice params have been resolved.
+  if (op->begin.empty()) return;
+
+  // Yield until input dims have been resolved.
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  if (!input_array.has_shape()) return;
+  const Shape& input_shape = input_array.shape();
+
+  auto& output_array = *model->arrays[op->outputs[0]];
+  if (output_array.has_shape()) return;
+
+  CHECK_EQ(input_shape.dims().size(), op->size.size());
+  CHECK_EQ(op->begin.size(), op->size.size());
+
+  std::vector<int> output_dims;
+  for (int i = 0; i < op->begin.size(); ++i) {
+    int size = op->size[i];
+    if (size == -1) {
+      size = input_array.shape().dims(i) - op->begin[i];
+    }
+    output_dims.push_back(size);
+  }
+
+  *output_array.mutable_shape()->mutable_dims() = output_dims;
+}
+
+void ProcessReorderAxesOperator(Model* model, ReorderAxesOperator* op) {
+  const string& input_name = op->inputs[0];
+  const auto& input_array = *model->arrays[input_name];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const string& output_name = op->outputs[0];
+  Shape* output_shape = model->GetArray(output_name).mutable_shape();
+  ShuffleDims(input_shape, op->input_axes_order, op->output_axes_order,
+              output_shape);
+}
+
+void ProcessConcatenationOperator(Model* model, ConcatenationOperator* op) {
+  // Yield until input dims have been resolved.
+  for (const auto& input_name : op->inputs) {
+    auto& input_array = *model->arrays[input_name];
+    if (!input_array.has_shape()) {
+      return;
+    }
+  }
+  auto& output_array = model->GetArray(op->outputs[0]);
+  // Use 0 input as basis for output dimensions.
+  const auto& first_input_array = *model->arrays[op->inputs[0]];
+  output_array.copy_shape(first_input_array.shape());
+  // Determine the concat size, and enfore that all inputs have
+  // the same dimensions count.
+  int concat_size = 0;
+  for (const auto& input_name : op->inputs) {
+    auto& input_array = *model->arrays[input_name];
+    CHECK(input_array.has_shape());
+    if (input_array.shape().dimensions_count() == 0) {
+      continue;
+    }
+    CHECK_EQ(input_array.shape().dimensions_count(),
+             output_array.shape().dimensions_count());
+    const std::vector<int>& input_dims = input_array.shape().dims();
+    CHECK_LT(op->concat_dim, input_dims.size());
+    concat_size += input_dims[op->concat_dim];
+  }
+  // Write out the concat_size on the output array shape.
+  auto& output_shape = *output_array.mutable_shape();
+  auto& output_dims = *output_shape.mutable_dims();
+  CHECK_LT(op->concat_dim, output_shape.dimensions_count());
+  output_dims[op->concat_dim] = concat_size;
+}
+
+void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const string& input_name = op->inputs[1];
+  const auto& input_array = *model->arrays[input_name];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const Shape& input_shape = input_array.shape();
+
+  // This code is slightly suspect.  The TensorFlow docs say that the axis
+  // selection defaults to 0, but we are splitting across the final axis.
+  const int input_dims_count = input_shape.dimensions_count();
+  const int input_depth = input_shape.dims(input_dims_count - 1);
+  CHECK_EQ(input_depth % op->num_split, 0);
+  const int split_depth = input_depth / op->num_split;
+
+  Shape output_shape = input_shape;
+  (*output_shape.mutable_dims())[input_dims_count - 1] = split_depth;
+
+  CHECK_EQ(op->outputs.size(), op->num_split);
+  for (const auto& output : op->outputs) {
+    model->arrays[output]->copy_shape(output_shape);
+  }
+}
+
+void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
+  const string& input_name = op->inputs[0];
+  const auto& input_array = *model->arrays[input_name];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+  const string& output_name = op->outputs[0];
+  const int output_depth = input_shape.dims(3);
+  ComputeConvSizes(input_shape, output_depth, op->kwidth, op->kheight,
+                   op->stride_width, op->stride_height, op->padding.type,
+                   model->GetArray(output_name).mutable_shape(),
+                   &op->padding.GetOrCreateFixedPadding());
+}
+
+void ProcessMaxPoolOperator(Model* model, MaxPoolOperator* op) {
+  const string& input_name = op->inputs[0];
+  const auto& input_array = *model->arrays[input_name];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+  const string& output_name = op->outputs[0];
+  const int output_depth = input_shape.dims(3);
+  ComputeConvSizes(input_shape, output_depth, op->kwidth, op->kheight,
+                   op->stride_width, op->stride_height, op->padding.type,
+                   model->GetArray(output_name).mutable_shape(),
+                   &op->padding.GetOrCreateFixedPadding());
+}
+
+void ProcessL2PoolOperator(Model* model, L2PoolOperator* op) {
+  const string& input_name = op->inputs[0];
+  const auto& input_array = *model->arrays[input_name];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  if (input_shape.dimensions_count() < 4) {
+    LOG(FATAL) << "missing dimensions for " << input_name;
+  }
+  const string& output_name = op->outputs[0];
+  const int output_depth = input_shape.dims(3);
+  ComputeConvSizes(input_shape, output_depth, op->kwidth, op->kheight,
+                   op->stride_width, op->stride_height, op->padding.type,
+                   model->GetArray(output_name).mutable_shape(),
+                   &op->padding.GetOrCreateFixedPadding());
+}
+
+void ProcessResizeBilinearOperator(Model* model, ResizeBilinearOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  if (!model->arrays[op->inputs[0]]->has_shape() ||
+      !model->arrays[op->inputs[1]]->has_shape()) {
+    return;
+  }
+  const auto& input_data_shape = model->arrays[op->inputs[0]]->shape();
+
+  const string& output_size_name = op->inputs[1];
+  const auto& output_size_array = *model->arrays[output_size_name];
+  CHECK(output_size_array.data_type == ArrayDataType::kInt32);
+  CHECK(output_size_array.has_shape());
+  const auto& output_size_shape = output_size_array.shape();
+  CHECK_EQ(output_size_shape.dimensions_count(), 1);
+  CHECK_EQ(output_size_shape.dims(0), 2);
+  std::vector<int32> output_shape =
+      output_size_array.GetBuffer<ArrayDataType::kInt32>().data;
+  model->arrays[op->outputs[0]]->copy_shape(
+      Shape({input_data_shape.dims(0), output_shape[0], output_shape[1],
+             input_data_shape.dims(3)}));
+}
+
+void ProcessLstmCellOperator(Model* model, LstmCellOperator* op) {
+  // I/O arrays should be allocated on creation of op.
+  QCHECK_EQ(op->inputs.size(), LstmCellOperator::NUM_INPUTS);
+  QCHECK_EQ(op->outputs.size(), LstmCellOperator::NUM_OUTPUTS);
+
+  const auto& input_array =
+      *model->arrays[op->inputs[LstmCellOperator::DATA_INPUT]];
+  // Yield until all input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_GE(input_shape.dimensions_count(), 2);
+
+  const auto& prev_activ_array =
+      *model->arrays[op->inputs[LstmCellOperator::PREV_ACTIV_INPUT]];
+  // Yield until all input dims have been resolved.
+  if (!prev_activ_array.has_shape()) {
+    return;
+  }
+  const auto& prev_activ_shape = prev_activ_array.shape();
+  CHECK_GE(prev_activ_shape.dimensions_count(), 2);
+
+  const auto& weights_array =
+      *model->arrays[op->inputs[LstmCellOperator::WEIGHTS_INPUT]];
+  // Yield until weights dims have been resolved.
+  if (!weights_array.has_shape()) {
+    return;
+  }
+  const auto& weights_shape = weights_array.shape();
+  CHECK_EQ(weights_shape.dimensions_count(), 2);
+
+  const auto& bias_array =
+      *model->arrays[op->inputs[LstmCellOperator::BIASES_INPUT]];
+  // Yield until bias dims have been resolved.
+  if (!bias_array.has_shape()) {
+    return;
+  }
+  const auto& bias_shape = bias_array.shape();
+  CHECK_GE(bias_shape.dimensions_count(), 1);
+
+  const auto& prev_state_array =
+      *model->arrays[op->inputs[LstmCellOperator::PREV_STATE_INPUT]];
+  // Yield until all input dims have been resolved.
+  if (!prev_state_array.has_shape()) {
+    return;
+  }
+  const auto& prev_state_shape = prev_state_array.shape();
+  CHECK_GE(prev_state_shape.dimensions_count(), 2);
+
+  const int fc_output_depth = weights_shape.dims(0);
+  CHECK_EQ(fc_output_depth, bias_shape.dims(0));
+  CHECK_EQ(fc_output_depth % 4, 0);
+  const int depth = fc_output_depth / 4;
+
+  const int input_depth = input_shape.dims(input_shape.dimensions_count() - 1);
+  const int fc_input_depth = weights_shape.dims(1);
+  CHECK_EQ(input_depth + depth, fc_input_depth);
+  Shape output_shape(input_shape);
+  (*output_shape.mutable_dims())[output_shape.dimensions_count() - 1] = depth;
+
+  // Set output dimensions
+  model->GetArray(op->outputs[LstmCellOperator::STATE_OUTPUT])
+      .copy_shape(output_shape);
+  model->GetArray(op->outputs[LstmCellOperator::ACTIV_OUTPUT])
+      .copy_shape(output_shape);
+
+  Shape concat_temp_shape(input_shape);
+  (*concat_temp_shape
+        .mutable_dims())[concat_temp_shape.dimensions_count() - 1] =
+      fc_input_depth;
+  model->GetArray(op->outputs[LstmCellOperator::CONCAT_TEMP])
+      .copy_shape(concat_temp_shape);
+
+  Shape activ_temp_shape(input_shape);
+  (*activ_temp_shape.mutable_dims())[activ_temp_shape.dimensions_count() - 1] =
+      fc_output_depth;
+  model->GetArray(op->outputs[LstmCellOperator::ACTIV_TEMP])
+      .copy_shape(activ_temp_shape);
+}
+
+void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  if (input_shape.dimensions_count() != 4) {
+    // This method only handles input dimensions of 4
+    return;
+  }
+  const auto input_height = input_shape.dims(1);
+  const auto input_width = input_shape.dims(2);
+
+  const auto& block_shape_array = *model->arrays[op->inputs[1]];
+  const auto& paddings_array = *model->arrays[op->inputs[2]];
+  const auto& block_shape_array_shape = block_shape_array.shape();
+  const auto& paddings_array_shape = paddings_array.shape();
+  QCHECK_EQ(block_shape_array_shape.dimensions_count(), 1);
+  QCHECK_EQ(paddings_array_shape.dimensions_count(), 2);
+
+  // We only support two dimensions.
+  QCHECK_EQ(block_shape_array_shape.dims(0), 2);
+  if (!block_shape_array.buffer) {
+    return;
+  }
+  QCHECK(block_shape_array.data_type == ArrayDataType::kInt32);
+  const auto& block_shape_data =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  auto block_height = block_shape_data[0];
+  auto block_width = block_shape_data[1];
+
+  QCHECK_EQ(paddings_array_shape.dims(0), 2);  // Number of block dimensions
+  QCHECK_EQ(paddings_array_shape.dims(1), 2);  // Two parameters per dimension.
+  if (!paddings_array.buffer) {
+    return;
+  }
+  QCHECK(paddings_array.data_type == ArrayDataType::kInt32);
+  const auto& paddings_data =
+      paddings_array.GetBuffer<ArrayDataType::kInt32>().data;
+  int height_with_paddings = input_height + paddings_data[0] + paddings_data[1];
+  int width_with_paddings = input_width + paddings_data[2] + paddings_data[3];
+  QCHECK_EQ(height_with_paddings % block_height, 0);
+  QCHECK_EQ(width_with_paddings % block_width, 0);
+  int output_height = height_with_paddings / block_height;
+  int output_width = width_with_paddings / block_width;
+
+  model->arrays[op->outputs[0]]->copy_shape(
+      Shape({input_shape.dims(0) * block_height * block_width, output_height,
+             output_width, input_shape.dims(3)}));
+}
+
+void ProcessBatchToSpaceNDOperator(Model* model, BatchToSpaceNDOperator* op) {
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  CHECK_EQ(input_shape.dimensions_count(), 4);
+  const auto input_height = input_shape.dims(1);
+  const auto input_width = input_shape.dims(2);
+
+  const auto& block_shape_array = *model->arrays[op->inputs[1]];
+  const auto& crops_array = *model->arrays[op->inputs[2]];
+  const auto& block_shape_array_shape = block_shape_array.shape();
+  const auto& crops_array_shape = crops_array.shape();
+  QCHECK_EQ(block_shape_array_shape.dimensions_count(), 1);
+  QCHECK_EQ(crops_array_shape.dimensions_count(), 2);
+
+  // We only support two dimensions.
+  QCHECK_EQ(block_shape_array_shape.dims(0), 2);
+  if (!block_shape_array.buffer) {
+    return;
+  }
+  QCHECK(block_shape_array.data_type == ArrayDataType::kInt32);
+  const auto& block_shape_data =
+      block_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  auto block_height = block_shape_data[0];
+  auto block_width = block_shape_data[1];
+
+  QCHECK_EQ(crops_array_shape.dims(0), 2);  // Number of block dimensions
+  QCHECK_EQ(crops_array_shape.dims(1), 2);  // Two parameters per dimension.
+  if (!crops_array.buffer) {
+    return;
+  }
+  QCHECK(crops_array.data_type == ArrayDataType::kInt32);
+  const auto& crops_data = crops_array.GetBuffer<ArrayDataType::kInt32>().data;
+  // We don't support crops now.
+  QCHECK_EQ(crops_data[0], 0);
+  QCHECK_EQ(crops_data[1], 0);
+  QCHECK_EQ(crops_data[2], 0);
+  QCHECK_EQ(crops_data[3], 0);
+
+  QCHECK_EQ(input_shape.dims(0) % (block_height * block_width), 0);
+
+  int output_height = input_height * block_height;
+  int output_width = input_width * block_width;
+
+  model->arrays[op->outputs[0]]->copy_shape(
+      Shape({input_shape.dims(0) / (block_height * block_width), output_height,
+             output_width, input_shape.dims(3)}));
+}
+
+void ProcessGatherOperator(Model* model, GatherOperator* op) {
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  const auto& indices_array = *model->arrays[op->inputs[1]];
+  auto& output_array = *model->arrays[op->outputs[0]];
+
+  // Bail if we already know the output shape.
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape() || !indices_array.has_shape()) {
+    return;
+  }
+
+  const auto& input_shape = input_array.shape();
+  const auto& indices_shape = indices_array.shape();
+  QCHECK_GE(input_shape.dimensions_count(), 1);
+  op->input_rank = input_shape.dimensions_count();
+
+  // We only support 1-D indices.
+  QCHECK_EQ(indices_shape.dimensions_count(), 1);
+
+  // Copy the input dimensions to the output except for dimension 0,
+  // where the dimension of indices_shape is used.
+  auto output_dims = output_array.mutable_shape()->mutable_dims();
+  output_dims->push_back(indices_shape.dims(0));
+  for (int dim = 1; dim < input_shape.dimensions_count(); dim++) {
+    output_dims->push_back(input_shape.dims(dim));
+  }
+}
+
+void ProcessPadOperator(Model* model, PadOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) return;
+
+  if (op->left_padding.empty()) return;
+  CHECK_EQ(op->left_padding.size(), op->right_padding.size());
+
+  auto& output_array = *model->arrays[op->outputs[0]];
+  if (output_array.has_shape()) return;
+
+  Shape output_shape = input_array.shape();
+  std::vector<int>& dims = *output_shape.mutable_dims();
+  CHECK_EQ(op->left_padding.size(), dims.size());
+
+  for (int i = 0; i < op->left_padding.size(); ++i) {
+    dims[i] += op->left_padding[i] + op->right_padding[i];
+  }
+
+  output_array.copy_shape(output_shape);
+}
+
+void ProcessStridedSliceOperator(Model* model, StridedSliceOperator* op) {
+  CHECK_EQ(op->inputs.size(), 4);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) return;
+
+  if (op->start_indices.empty()) return;
+  CHECK_EQ(op->start_indices.size(), op->stop_indices.size());
+  CHECK_EQ(op->start_indices.size(), op->strides.size());
+
+  auto& output_array = *model->arrays[op->outputs[0]];
+  if (output_array.has_shape()) return;
+
+  Shape output_shape = input_array.shape();
+  std::vector<int>& dims = *output_shape.mutable_dims();
+  CHECK_EQ(op->start_indices.size(), dims.size());
+
+  for (int i = 0; i < op->start_indices.size(); ++i) {
+    const int mask = 1 << i;
+    const int start = (op->begin_mask & mask) ? 0 : op->start_indices[i];
+    const int stop = (op->end_mask & mask) ? input_array.shape().dims()[i]
+                                           : op->stop_indices[i];
+    dims[i] = (stop - start) / op->strides[i];
+  }
+
+  output_array.copy_shape(output_shape);
+}
+
+void ProcessSqueezeOperator(Model* model, SqueezeOperator* op) {
+  CHECK_EQ(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  const auto& input_array = *model->arrays[op->inputs[0]];
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) return;
+
+  auto& output_array = *model->arrays[op->outputs[0]];
+  if (output_array.has_shape()) return;
+
+  const std::vector<int>& input_dims = input_array.shape().dims();
+  std::vector<int> output_dims;
+
+  for (int i = 0; i < input_dims.size(); ++i) {
+    if (input_dims[i] != 1 ||
+        (!op->squeeze_dims.empty() &&
+         std::find(op->squeeze_dims.begin(), op->squeeze_dims.end(), i) ==
+             op->squeeze_dims.end())) {
+      output_dims.push_back(input_dims[i]);
+    }
+  }
+  *output_array.mutable_shape()->mutable_dims() = output_dims;
+}
+
+void ProcessSvdfOperator(Model* model, SvdfOperator* op) {
+  CHECK(op->inputs.size() == 3 || op->inputs.size() == 4);
+  const auto& input_array = *model->arrays[op->inputs[0]];
+  if (!input_array.has_shape()) return;
+
+  auto& weights_feature_array = *model->arrays[op->inputs[1]];
+  if (!weights_feature_array.has_shape()) return;
+
+  const auto& weights_time_array = *model->arrays[op->inputs[2]];
+  if (!weights_time_array.has_shape()) return;
+
+  const bool has_bias = (op->inputs.size() == 4);
+  if (has_bias) {
+    const auto& bias_array = *model->arrays[op->inputs[3]];
+    if (!bias_array.has_shape()) return;
+  }
+
+  const int batch_size = input_array.shape().dims()[0];
+  const int num_units = weights_feature_array.shape().dims()[0];
+  const int memory_size = weights_time_array.shape().dims()[1];
+
+  auto& state_array = model->GetArray(op->outputs[0]);
+  state_array.mutable_shape()->ReplaceDims(
+      {batch_size, memory_size * num_units});
+
+  auto& output_array = model->GetArray(op->outputs[1]);
+  output_array.mutable_shape()->ReplaceDims({batch_size, num_units});
+}
+}  // namespace
+
+bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  std::unordered_map<string, std::vector<int>> old_output_dims;
+  for (const auto& output : op->outputs) {
+    if (model->arrays[output]->has_shape()) {
+      old_output_dims[output] = model->arrays[output]->shape().dims();
+    }
+  }
+
+  switch (op->type) {
+    case OperatorType::kBatchNormalization:
+    case OperatorType::kL2Normalization:
+    case OperatorType::kDequantize:
+    case OperatorType::kRelu:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu6:
+    case OperatorType::kSoftmax:
+    case OperatorType::kLogistic:
+    case OperatorType::kTanh:
+    case OperatorType::kLocalResponseNormalization:
+    case OperatorType::kTensorFlowIdentity:
+    case OperatorType::kFakeQuant:
+    case OperatorType::kTensorFlowRsqrt:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+    case OperatorType::kTensorFlowAll:
+    case OperatorType::kTensorFlowAssert:
+    case OperatorType::kCast:
+    case OperatorType::kFloor:
+      ProcessSimpleOperator(model, op);
+      break;
+    case OperatorType::kGather:
+      ProcessGatherOperator(model, static_cast<GatherOperator*>(op));
+      break;
+
+    case OperatorType::kAdd:
+    case OperatorType::kSub:
+    case OperatorType::kMul:
+    case OperatorType::kDiv:
+    case OperatorType::kFloorDiv:
+    case OperatorType::kFloorMod:
+    case OperatorType::kTensorFlowLess:
+    case OperatorType::kTensorFlowLessEqual:
+    case OperatorType::kTensorFlowGreater:
+    case OperatorType::kTensorFlowMaximum:
+    case OperatorType::kTensorFlowMinimum:
+    case OperatorType::kTensorFlowGreaterEqual:
+      ProcessSimpleBinaryOperator(model, op);
+      break;
+    case OperatorType::kConv:
+      ProcessConvOperator(model, static_cast<ConvOperator*>(op));
+      break;
+    case OperatorType::kTransposeConv:
+      // Unimplemented, hopefully another graph transformation will drop it or
+      // rewrite it.
+      break;
+    case OperatorType::kDepthwiseConv:
+      ProcessDepthwiseConvOperator(model,
+                                   static_cast<DepthwiseConvOperator*>(op));
+      break;
+    case OperatorType::kDepthToSpace:
+      ProcessDepthToSpaceOperator(model,
+                                  static_cast<DepthToSpaceOperator*>(op));
+      break;
+    case OperatorType::kSpaceToDepth:
+      ProcessSpaceToDepthOperator(model,
+                                  static_cast<SpaceToDepthOperator*>(op));
+      break;
+    case OperatorType::kFullyConnected:
+      ProcessFullyConnectedOperator(model,
+                                    static_cast<FullyConnectedOperator*>(op));
+      break;
+    case OperatorType::kTensorFlowReshape:
+      ProcessTensorFlowReshapeOperator(
+          model, static_cast<TensorFlowReshapeOperator*>(op));
+      break;
+    case OperatorType::kAveragePool:
+      ProcessAveragePoolOperator(model, static_cast<AveragePoolOperator*>(op));
+      break;
+    case OperatorType::kMaxPool:
+      ProcessMaxPoolOperator(model, static_cast<MaxPoolOperator*>(op));
+      break;
+    case OperatorType::kL2Pool:
+      ProcessL2PoolOperator(model, static_cast<L2PoolOperator*>(op));
+      break;
+    case OperatorType::kTensorFlowMin:
+    case OperatorType::kTensorFlowMax:
+    case OperatorType::kTensorFlowSum:
+    case OperatorType::kMean:
+      ProcessTensorFlowReductionOperator(model, op);
+      break;
+
+    case OperatorType::kSlice:
+      ProcessSliceOperator(model, static_cast<SliceOperator*>(op));
+      break;
+
+    case OperatorType::kTensorFlowTile:
+      // We don't currently implement the propagation of fixed sizes through
+      // a TensorFlow Tile.
+      //
+      // Fortunately, we don't need to: so far, we have only dealt with Tile
+      // or Slice ops in subgraphs that are identified as L2Normalization.
+      // See IdentifyL2Normalization.
+      break;
+    case OperatorType::kTensorFlowSwitch:
+      // We can't know the sizes of the outputs until we have resolved the
+      // predicate, and once we have resolved the predicate, the whole
+      // Switch node will get resolved away.
+      // See ResolveTensorFlowSwitch.
+      break;
+    case OperatorType::kTensorFlowMerge:
+      // No need to bother resolving TensorFlow Merge ops: other graph
+      // transformations will remove them anyway.
+      // See ResolveTensorFlowMerge.
+      break;
+    case OperatorType::kTensorFlowSplit:
+      ProcessTensorFlowSplitOperator(model,
+                                     static_cast<TensorFlowSplitOperator*>(op));
+      break;
+    case OperatorType::kSqueeze:
+      ProcessSqueezeOperator(model, static_cast<SqueezeOperator*>(op));
+      break;
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kTensorFlowConcatV2:
+      // Unimplemented, hopefully another graph transformation will
+      // drop it or rewrite it. Concretely, either ResolveTensorFlowConcat
+      // will resolve this node to a DepthConcatenation, or else we have
+      // a more general non-depth concatenation that will hopefully be dropped,
+      // or else at the moment we will abort.
+      break;
+    case OperatorType::kExpandDims:
+    case OperatorType::kFill:
+    case OperatorType::kRange:
+    case OperatorType::kRank:
+    case OperatorType::kTensorFlowShape:
+    case OperatorType::kStack:
+    case OperatorType::kTranspose:
+      // Unimplemented. Hopefully another graph transformation will drop it or
+      // rewrite it.
+      break;
+    case OperatorType::kReorderAxes:
+      ProcessReorderAxesOperator(model, static_cast<ReorderAxesOperator*>(op));
+      break;
+    case OperatorType::kConcatenation:
+      ProcessConcatenationOperator(model,
+                                   static_cast<ConcatenationOperator*>(op));
+      break;
+    case OperatorType::kResizeBilinear:
+      ProcessResizeBilinearOperator(model,
+                                    static_cast<ResizeBilinearOperator*>(op));
+      break;
+    case OperatorType::kLstmCell:
+      ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
+      break;
+    case OperatorType::kTensorFlowMatMul:
+      // MatMul operators are converted to FullyConnected, after which their
+      // shapes are propagated.
+      break;
+    case OperatorType::kSpaceToBatchND:
+      ProcessSpaceToBatchNDOperator(model,
+                                    static_cast<SpaceToBatchNDOperator*>(op));
+      break;
+    case OperatorType::kBatchToSpaceND:
+      ProcessBatchToSpaceNDOperator(model,
+                                    static_cast<BatchToSpaceNDOperator*>(op));
+      break;
+    case OperatorType::kPad:
+      ProcessPadOperator(model, static_cast<PadOperator*>(op));
+      break;
+    case OperatorType::kStridedSlice:
+      ProcessStridedSliceOperator(model,
+                                  static_cast<StridedSliceOperator*>(op));
+      break;
+    case OperatorType::kTensorFlowUnsupported:
+      break;
+    case OperatorType::kSvdf:
+      ProcessSvdfOperator(model, static_cast<SvdfOperator*>(op));
+      break;
+    default:
+      // Unimplemented, another graph transformation should drop it.
+      LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
+  }
+
+  // Return true if any output dim changed, false if none changed.
+  // Assumption: no transformation clears an output shape, they only add shapes.
+  for (const auto& output : op->outputs) {
+    if (model->arrays[output]->has_shape() &&
+        (old_output_dims[output] != model->arrays[output]->shape().dims())) {
+      AddMessageF("Set shape of %s to [%s]", output,
+                  absl::StrJoin(model->arrays[output]->shape().dims(), ","));
+      return true;
+    }
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d33597d38144278dfca66edbdd9b3da68fbaa32c
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -0,0 +1,468 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool SupportsQuantization(const Operator& op) {
+  auto type = op.type;
+  if (type == OperatorType::kTensorFlowUnsupported) {
+    auto* unsupported = static_cast<const TensorFlowUnsupportedOperator*>(&op);
+    return unsupported->quantized;
+  }
+  return type == OperatorType::kConv || type == OperatorType::kDepthwiseConv ||
+         type == OperatorType::kFullyConnected ||
+         type == OperatorType::kConcatenation ||
+         type == OperatorType::kL2Normalization || type == OperatorType::kAdd ||
+         type == OperatorType::kAveragePool || type == OperatorType::kMaxPool ||
+         type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
+         type == OperatorType::kSqueeze ||
+         type == OperatorType::kTensorFlowReshape ||
+         type == OperatorType::kMul || type == OperatorType::kSpaceToDepth ||
+         type == OperatorType::kDepthToSpace;
+}
+
+template <ArrayDataType A>
+std::unique_ptr<GenericBuffer> QuantizeBuffer(
+    const GenericBuffer& buffer,
+    const QuantizationParams& quantization_params) {
+  const auto inverse_scale = 1. / quantization_params.scale;
+  CHECK(buffer.type == ArrayDataType::kFloat);
+  const auto& float_buffer =
+      static_cast<const Buffer<ArrayDataType::kFloat>&>(buffer);
+  auto* quantized_buffer = new Buffer<A>;
+  quantized_buffer->data.resize(float_buffer.data.size());
+  const auto qmin = static_cast<int32>(std::numeric_limits<DataType<A>>::min());
+  const auto qmax = static_cast<int32>(std::numeric_limits<DataType<A>>::max());
+  for (std::size_t i = 0; i < float_buffer.data.size(); i++) {
+    const float src_val = float_buffer.data[i];
+    double scaled_val;  // Astonishingly, using 'float' degrades accuracy just
+                        // enough to make a few tests fail!
+    if (quantization_params.scale == 0) {
+      CHECK_EQ(src_val, 0) << "The quantization scale for this array is 0, "
+                           << "so all its values should be 0.";
+      scaled_val = quantization_params.zero_point;
+    } else {
+      scaled_val = quantization_params.zero_point + inverse_scale * src_val;
+    }
+    const auto rounded_val = static_cast<int32>(std::round(scaled_val));
+    const auto clamped_val = std::min(qmax, std::max(qmin, rounded_val));
+    quantized_buffer->data[i] = static_cast<DataType<A>>(clamped_val);
+  }
+  return std::unique_ptr<GenericBuffer>(quantized_buffer);
+}
+
+template <ArrayDataType A>
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name,
+                   const QuantizationParams& quantization_params) {
+  auto& array = model->GetArray(name);
+  CHECK(array.data_type == ArrayDataType::kFloat);
+  CHECK(!array.quantization_params);
+  array.GetOrCreateQuantizationParams() = quantization_params;
+  if (array.buffer) {
+    array.buffer = QuantizeBuffer<A>(*array.buffer, quantization_params);
+  }
+  array.data_type = A;
+  transformation->AddMessageF("Quantized array %s", name);
+}
+
+void QuantizeArray(GraphTransformation* transformation, Model* model,
+                   const string& name, ArrayDataType quantized_data_type,
+                   const QuantizationParams& quantization_params) {
+  switch (quantized_data_type) {
+    case ArrayDataType::kUint8:
+      return QuantizeArray<ArrayDataType::kUint8>(transformation, model, name,
+                                                  quantization_params);
+    case ArrayDataType::kInt32:
+      return QuantizeArray<ArrayDataType::kInt32>(transformation, model, name,
+                                                  quantization_params);
+    default:
+      LOG(FATAL) << "Unhandled case.";
+  }
+}
+
+const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
+  auto& array = model->GetArray(array_name);
+  // Normally we should have a MinMax recorded on this Array,
+  // so we just use it.
+  if (array.minmax != nullptr) {
+    return *array.minmax;
+  }
+
+  // We don't have a MinMax. That's bad news: we need
+  // the graph to provide MinMax info for all arrays in order
+  // for inference to reproduce faithfully the same quantization
+  // error as the training process had.
+  //
+  // But we still want to support a fallback for constant arrays,
+  // just using the plain min and max computed from array elements.
+  // We should hopefully never rely on that in production, as that
+  // will not give very good accuracy as that typically won't be
+  // exactly what the training process used. But it will be useful
+  // to allow easily trying out quantization even if the graph
+  // lacks some minmax information.
+  if (array.buffer != nullptr) {
+    LOG(WARNING)
+        << "Constant array " << array_name
+        << " lacks MinMax information. To make up for that, we will now compute"
+        << " the MinMax from actual array elements. That will result in"
+        << " quantization parameters that probably do not match whichever "
+           "arithmetic"
+        << " was used during training, and thus will probably be a cause of "
+           "poor"
+        << " inference accuracy.";
+    CHECK(array.buffer->type == ArrayDataType::kFloat);
+    const auto& data = array.GetBuffer<ArrayDataType::kFloat>().data;
+    // We always want [min, max] to contain 0.
+    float min = 0.f;
+    float max = 0.f;
+    for (auto val : data) {
+      min = std::min(min, val);
+      max = std::max(max, val);
+    }
+    auto& minmax = array.GetOrCreateMinMax();
+    minmax.min = min;
+    minmax.max = max;
+    return minmax;
+  }
+
+  LOG(FATAL) << "Array " << array_name
+             << " does not have MinMax information, "
+                "and is not a constant array. Cannot "
+                "proceed with quantization.";
+}
+
+bool ChooseQuantizationForOperatorInput(
+    GraphTransformation* transformation, Model* model, const Operator& op,
+    std::size_t input_index, ArrayDataType* quantized_data_type,
+    QuantizationParams* quantization_params) {
+  const auto& input = op.inputs[input_index];
+  auto& array = model->GetArray(input);
+  if (array.data_type != ArrayDataType::kFloat) {
+    return false;
+  }
+  if (op.type == OperatorType::kConv ||
+      op.type == OperatorType::kDepthwiseConv ||
+      op.type == OperatorType::kFullyConnected) {
+    if (input_index == 2) {
+      // Quantization of bias vector.
+      // We need both of the mandatory inputs (input activations and weights) to
+      // have
+      // been already quantized.
+      const auto& input_activations = model->GetArray(op.inputs[0]);
+      const auto& input_weights = model->GetArray(op.inputs[1]);
+      if (!input_activations.quantization_params ||
+          !input_weights.quantization_params) {
+        return false;
+      }
+      const auto input_activations_scale =
+          input_activations.quantization_params->scale;
+      const auto input_weights_scale = input_weights.quantization_params->scale;
+      quantization_params->scale =
+          input_activations_scale * input_weights_scale;
+      quantization_params->zero_point = 0;
+      *quantized_data_type = ArrayDataType::kInt32;
+      transformation->AddMessageF(
+          "Input array %s is a bias vector. Choosing quantization params "
+          "accordingly.",
+          input);
+      return true;
+    }
+  }
+
+  const MinMax& minmax = GetOrComputeMinMax(model, input);
+  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
+                                                         quantization_params);
+  transformation->AddMessageF(
+      "For input array %s with min=%g"
+      ", max=%g"
+      ", chose to quantize as uint8 with zero_point=%d"
+      ", scale=%g",
+      input, minmax.min, minmax.max, quantization_params->zero_point,
+      quantization_params->scale);
+  *quantized_data_type = ArrayDataType::kUint8;
+  return true;
+}
+
+bool IsExactlyRepresentable(double real_value, ArrayDataType data_type,
+                            const QuantizationParams& quantization_params) {
+  const double scaled_value =
+      quantization_params.zero_point + real_value / quantization_params.scale;
+  const double fractional_scaled_value =
+      scaled_value - std::round(scaled_value);
+  if (std::abs(fractional_scaled_value) > 1e-12) {
+    return false;
+  }
+  const double rounded_scaled_value = std::round(scaled_value);
+  if (data_type == ArrayDataType::kUint8) {
+    if (rounded_scaled_value < 0 || rounded_scaled_value > 255) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ChooseHardcodedQuantizationForOperatorOutput(
+    const Operator& op, ArrayDataType* quantized_data_type,
+    QuantizationParams* quantization_params) {
+  if (op.type == OperatorType::kL2Normalization) {
+    // L2Normalization has range: [-1, 1].
+    // 0 should be exactly representable, as values will typically be centered
+    // around 0, with many values near 0.
+    *quantized_data_type = ArrayDataType::kUint8;
+    quantization_params->zero_point = 128;
+    quantization_params->scale = 1. / 128.;
+    CHECK(
+        IsExactlyRepresentable(0., *quantized_data_type, *quantization_params));
+    return true;
+  }
+  if ((op.type == OperatorType::kLogistic) ||
+      (op.type == OperatorType::kSoftmax)) {
+    // Logistic and Softmax have range: [0, 1].
+    //
+    // For Logistic, 0.5 should be exactly representable, as implementations
+    // will typically exploit the symmetry logistic(-x) = 1 - logistic(x), and
+    // the glueing of the two halves of the graph will only be seamless if we
+    // are accurately representing logistic(0) == 0.5.
+    *quantized_data_type = ArrayDataType::kUint8;
+    quantization_params->zero_point = 0;
+    quantization_params->scale = 1. / 256.;
+    CHECK(IsExactlyRepresentable(0.5, *quantized_data_type,
+                                 *quantization_params));
+    return true;
+  }
+  return false;
+}
+
+bool ChooseQuantizationForOperatorOutput(
+    GraphTransformation* transformation, Model* model, const Operator& op,
+    std::size_t output_index, ArrayDataType* quantized_data_type,
+    QuantizationParams* quantization_params) {
+  const auto& output = op.outputs[output_index];
+  auto& array = model->GetArray(output);
+  if (array.data_type != ArrayDataType::kFloat) {
+    return false;
+  }
+  if (ChooseHardcodedQuantizationForOperatorOutput(op, quantized_data_type,
+                                                   quantization_params)) {
+    transformation->AddMessageF(
+        "Output array %s is produced by a %s operator. Choosing fixed "
+        "quantization params accordingly.",
+        output, OperatorTypeName(op.type));
+    return true;
+  }
+  if ((op.type == OperatorType::kDepthToSpace) ||
+      (op.type == OperatorType::kSpaceToDepth)) {
+    // DepthToSpace and SpaceToDepth should preserve the quantization parameters
+    // of the input array, as these are simple reshape operations.
+    const auto& input_quantization_params =
+        model->GetArray(op.inputs[0]).GetQuantizationParams();
+    *quantized_data_type = ArrayDataType::kUint8;
+    quantization_params->zero_point = input_quantization_params.zero_point;
+    quantization_params->scale = input_quantization_params.scale;
+
+    transformation->AddMessageF(
+        "Output array %s is produced by a %s operator. Copying quantization "
+        "params from input array.",
+        output, OperatorTypeName(op.type));
+    return true;
+  }
+  const MinMax& minmax = GetOrComputeMinMax(model, output);
+  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(model->flags, minmax,
+                                                         quantization_params);
+  *quantized_data_type = ArrayDataType::kUint8;
+  transformation->AddMessageF(
+      "For output array %s with min=%g, max=%g"
+      ", chose to quantize as uint8 with zero_point=%d"
+      ", scale=%g",
+      output, minmax.min, minmax.max, quantization_params->zero_point,
+      quantization_params->scale);
+
+  return true;
+}
+}  // namespace
+
+bool Quantize::Run(Model* model, std::size_t op_index) {
+  // Our general "quantization" graph transformation consists in replacing
+  //   QuantizedInputArrays[] ->
+  //     DequantizeOperators[] ->
+  //       FloatInputArrays[] ->
+  //         Operator ->
+  //           FloatOutputArray
+  // by
+  //   QuantizedInputArrays[] ->
+  //     Operator ->
+  //       QuantizedOutputArray ->
+  //         DequantizeOperator ->
+  //           FloatOutputArray
+  //
+  // In other words, this is pushing Dequantize operators to the right of
+  // other operators.
+  //
+
+  auto& op = *model->operators[op_index];
+  if (op.type == OperatorType::kDequantize ||
+      op.type == OperatorType::kFakeQuant) {
+    return false;
+  }
+
+  // Our assumption here is that the input arrays are already quantized -
+  // that is typically the case in models operating on an input bitmap
+  // image, and MakeInitialDequantizeOp should have already resolved
+  // the handling of the input image as an initial Dequantize op.
+  //
+  // Thus we are building around the assumption that the graph always starts
+  // with a quantized input array, and only after some Dequantize op do we have
+  // float arrays. The problem of quantizing the graph thus becomes a problem of
+  // pushing Dequantize ops to the right of other ops.
+  //
+  // Let us just guard this assumption by the following assertion:
+  for (const auto& input : op.inputs) {
+    if (IsInputArray(*model, input)) {
+      const auto& input_array = model->GetArray(input);
+      CHECK(input_array.quantization_params);
+    }
+  }
+  if (!SupportsQuantization(op)) {
+    LOG(FATAL) << "Unimplemented: this graph contains an operator of type "
+               << HelpfulOperatorTypeName(op)
+               << " for which the quantized form is not yet implemented. "
+                  "Sorry, and patches welcome (that's a relatively fun patch "
+                  "to write, mostly providing the actual quantized arithmetic "
+                  "code for this op).";
+  }
+
+  for (const auto& input : op.inputs) {
+    const auto& array = model->GetArray(input);
+    if (array.data_type == ArrayDataType::kFloat) {
+      if (!array.minmax && !array.buffer) {
+        LOG(ERROR) << "Can't quantize input array " << input
+                   << " because it lacks min/max info";
+        return false;
+      }
+      const auto* other_op = GetOpWithOutput(*model, input);
+      if (other_op && other_op->type != OperatorType::kDequantize) {
+        AddMessageF(
+            "Not quantizing %s for now, because its input array %s is not "
+            "produced by a Dequantize op, "
+            "which means that we should yield and let other ops "
+            "get quantized first",
+            LogName(op), input);
+        return false;
+      }
+    }
+  }
+
+  bool changed = false;
+
+  // Quantize inputs, remove any Dequantize op on the inputs side
+  for (std::size_t input_index = 0; input_index < op.inputs.size();
+       input_index++) {
+    ArrayDataType quantized_data_type;
+    QuantizationParams quantization_params;
+    if (ChooseQuantizationForOperatorInput(this, model, op, input_index,
+                                           &quantized_data_type,
+                                           &quantization_params)) {
+      changed = true;
+      const auto& input = op.inputs[input_index];
+      if (IsConstantParameterArray(*model, input)) {
+        QuantizeArray(this, model, input, quantized_data_type,
+                      quantization_params);
+      } else {
+        auto dequantize_it = FindOpWithOutput(*model, input);
+        CHECK(dequantize_it != model->operators.end());
+        auto* dequantize_op = dequantize_it->get();
+        CHECK(dequantize_op->type == OperatorType::kDequantize);
+        op.inputs[input_index] = dequantize_op->inputs[0];
+        // Check if the output of that Dequantize op was not used by any
+        // other operator. We will then erase that Dequantize op.
+        if (!CountOpsWithInput(*model, dequantize_op->outputs[0])) {
+          // If any of the model's output_arrays was pointing to the
+          // Dequantize op's output, let it point to the Dequantize op's
+          // input instead.
+          for (int i = 0; i < model->flags.output_arrays_size(); i++) {
+            if (model->flags.output_arrays(i) == dequantize_op->outputs[0]) {
+              model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+            }
+          }
+          model->arrays.erase(dequantize_op->outputs[0]);
+          model->operators.erase(dequantize_it);
+        }
+      }
+    }
+  }
+
+  // Quantize outputs, add Dequantize ops as needed on the outputs side
+  for (std::size_t output_index = 0; output_index < op.outputs.size();
+       output_index++) {
+    ArrayDataType quantized_data_type;
+    QuantizationParams quantization_params;
+    if (ChooseQuantizationForOperatorOutput(this, model, op, output_index,
+                                            &quantized_data_type,
+                                            &quantization_params)) {
+      changed = true;
+      const auto& output = op.outputs[output_index];
+      QuantizeArray(this, model, output, quantized_data_type,
+                    quantization_params);
+      const auto& dequantized_output =
+          AvailableArrayName(*model, output + "_dequantized");
+      const auto& output_array = model->GetArray(output);
+      const auto& output_minmax = output_array.GetMinMax();
+      auto& dequantized_output_array =
+          model->GetOrCreateArray(dequantized_output);
+      dequantized_output_array.data_type = ArrayDataType::kFloat;
+      auto& dequantized_output_minmax =
+          dequantized_output_array.GetOrCreateMinMax();
+      dequantized_output_minmax.min = output_minmax.min;
+      dequantized_output_minmax.max = output_minmax.max;
+      for (const auto& other_op : model->operators) {
+        for (auto& other_op_input : other_op->inputs) {
+          if (other_op_input == output) {
+            other_op_input = dequantized_output;
+          }
+        }
+      }
+      auto* dequantize_op = new DequantizeOperator;
+      dequantize_op->inputs = {output};
+      dequantize_op->outputs = {dequantized_output};
+      for (int i = 0; i < model->flags.output_arrays_size(); i++) {
+        if (model->flags.output_arrays(i) == output) {
+          model->flags.set_output_arrays(i, dequantized_output);
+        }
+      }
+      const auto op_it = FindOp(*model, &op);
+      model->operators.emplace(op_it + 1, dequantize_op);
+    }
+  }
+
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..371ced388a8111c18ada32cf31a784809479291d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -0,0 +1,105 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool ApplyMinMaxToArray(GraphTransformation* transformation, Model* model,
+                        const MinMax& minmax, const string& array_name) {
+  auto& annotated_array = model->GetArray(array_name);
+  if (annotated_array.minmax) {
+    return false;
+  }
+  annotated_array.GetOrCreateMinMax() = minmax;
+  transformation->AddMessageF(
+      "Read min/max annotation for array %s: min=%g, max=%g", array_name,
+      minmax.min, minmax.max);
+  return true;
+}
+
+}  // end namespace
+
+bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
+  const auto fakequant_it = model->operators.begin() + op_index;
+  auto* fakequant_base_op = fakequant_it->get();
+  if (fakequant_base_op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+  auto* fakequant_op = static_cast<FakeQuantOperator*>(fakequant_base_op);
+
+  bool changed = false;
+
+  if (!fakequant_op->minmax) {
+    CHECK_EQ(fakequant_op->inputs.size(), 3);
+    // We need to yield until the min and max parameters have been
+    // resolved to constant arrays.
+    for (int i = 1; i <= 2; i++) {
+      if (!IsConstantParameterArray(*model, fakequant_op->inputs[1])) {
+        return false;
+      }
+    }
+
+    // Obtain the final min/max values
+    const auto& min_array = model->GetArray(fakequant_op->inputs[1]);
+    const auto& max_array = model->GetArray(fakequant_op->inputs[2]);
+    CHECK_EQ(RequiredBufferSizeForShape(min_array.shape()), 1);
+    CHECK_EQ(RequiredBufferSizeForShape(max_array.shape()), 1);
+    fakequant_op->minmax.reset(new MinMax);
+    MinMax& minmax = *fakequant_op->minmax;
+    minmax.min = min_array.GetBuffer<ArrayDataType::kFloat>().data[0];
+    minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
+    // We always want [min, max] to contain 0.
+    minmax.min = std::min(minmax.min, 0.);
+    minmax.max = std::max(minmax.max, 0.);
+
+    // We won't use the input arrays that provided these min and max
+    // values, anymore. Delete them unless they are used by something
+    // else.
+    for (int i = 1; i <= 2; i++) {
+      if (CountOpsWithInput(*model, fakequant_op->inputs[i]) == 1) {
+        model->arrays.erase(fakequant_op->inputs[i]);
+      }
+    }
+    fakequant_op->inputs.resize(1);
+    changed = true;
+  }
+
+  // At this point, this FakeQuantOperator should have a MinMax
+  // attached to it, and should only have 1 input (it should not have
+  // 2nd and 3rd input arrays giving min and max anymore).
+  CHECK(fakequant_op->minmax);
+  CHECK_EQ(1, fakequant_op->inputs.size());
+
+  const MinMax& minmax = *fakequant_op->minmax;
+
+  // Record the MinMax info on the input and output arrays
+  changed |= ApplyMinMaxToArray(this, model, minmax, fakequant_op->inputs[0]);
+  changed |= ApplyMinMaxToArray(this, model, minmax, fakequant_op->outputs[0]);
+
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3992e7d1ef71edd4040e626d5848d2fd9bb3dab6
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_final_dequantize_op.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveFinalDequantizeOp::Run(Model* model, std::size_t op_index) {
+  const auto dequantize_it = model->operators.begin() + op_index;
+  const auto* dequantize_op = dequantize_it->get();
+  if (dequantize_op->type != OperatorType::kDequantize) {
+    return false;
+  }
+  const auto& output = dequantize_op->outputs[0];
+  // We can remove any dequantize op whose output is not consumed by
+  // any op. This is not necessarily equivalent to the output being
+  // one of the model's output arrays, as some intermediate array
+  // in the middle of the graph might be designated as an output
+  // array.
+  if (CountOpsWithInput(*model, output)) {
+    return false;
+  }
+
+  // If one of the model's output arrays was actually the Dequantize op's
+  // output, then we need to update it to point to the Dequantize op's input.
+  for (int i = 0; i < model->flags.output_arrays_size(); i++) {
+    if (output == model->flags.output_arrays(i)) {
+      model->flags.set_output_arrays(i, dequantize_op->inputs[0]);
+    }
+  }
+
+  // Remove the node and its output array.
+  AddMessageF("Removed final %s", LogName(*dequantize_op));
+  model->arrays.erase(output);
+  model->operators.erase(dequantize_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_assert.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_assert.cc
new file mode 100644
index 0000000000000000000000000000000000000000..35a0c465327f352863350e7a8af714d16b7be393
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_assert.cc
@@ -0,0 +1,60 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveTensorFlowAssert::Run(Model* model, std::size_t op_index) {
+  const auto assert_it = model->operators.begin() + op_index;
+  const auto* assert_op = assert_it->get();
+  if (assert_op->type != OperatorType::kTensorFlowAssert) {
+    return false;
+  }
+
+  bool changed = false;
+  // Remove any other node's dependency on this assert node
+  for (const auto& op : model->operators) {
+    auto it = op->inputs.begin();
+    while (it != op->inputs.end()) {
+      if (*it == assert_op->outputs[0]) {
+        op->inputs.erase(it);
+        changed = true;
+      } else {
+        ++it;
+      }
+    }
+  }
+  CHECK(!CountOpsWithInput(*model, assert_op->outputs[0]));
+
+  if (changed) {
+    AddMessageF(
+        "Prepared for the removal of %s by removing any other op's dependency "
+        "on it",
+        LogName(*assert_op));
+  }
+
+  // That's it. We can stop here, no need to duplicate the work that
+  // RemoveUnusedOp will do removing this now-unused node.
+  return changed;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_identity.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_identity.cc
new file mode 100644
index 0000000000000000000000000000000000000000..404269bbfd9312bbbab32489783d9e4217ecbd89
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_tensorflow_identity.cc
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveTensorFlowIdentity::Run(Model* model, std::size_t op_index) {
+  const auto passthru_it = model->operators.begin() + op_index;
+  const auto* passthru_op = passthru_it->get();
+  if (passthru_op->type != OperatorType::kTensorFlowIdentity) {
+    return false;
+  }
+
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6add443f2d62fd06e8c0d17e03bc78c5d74732a1
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_binary.cc
@@ -0,0 +1,113 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+template <typename Scalar>
+bool AreAllBufferElementsEqualTo(const std::vector<Scalar>& buffer_data,
+                                 Scalar value) {
+  for (auto x : buffer_data) {
+    if (x != value) {
+      return false;
+    }
+  }
+  return true;
+}
+}  // namespace
+
+// A binary operator is called trivial when exactly one of its operands is
+// a constant and is such that the binary operation is equivalent to
+// the identity operation on its other input.
+// For example, an Add operator is trivial if
+// one of its operands is constant 0, a Mul operator is trivial
+// if one of its operands is constant 1, etc.
+bool RemoveTrivialBinaryOperator::Run(Model* model, std::size_t op_index) {
+  const auto binary_it = model->operators.begin() + op_index;
+  auto* binary_op = binary_it->get();
+  if (binary_op->type != OperatorType::kAdd &&
+      binary_op->type != OperatorType::kMul &&
+      binary_op->type != OperatorType::kSub &&
+      binary_op->type != OperatorType::kDiv) {
+    return false;
+  }
+
+  CHECK_EQ(binary_op->inputs.size(), 2);
+
+  // This graph transformation is only concerned with the case
+  // when one input is constant and the other is not constant.
+  const bool is_input_constant[2] = {
+      IsConstantParameterArray(*model, binary_op->inputs[0]),
+      IsConstantParameterArray(*model, binary_op->inputs[1]),
+  };
+  if (!is_input_constant[0] && !is_input_constant[1]) {
+    // Neither input is constant, so nothing we can resolve here.
+    return false;
+  }
+  if (is_input_constant[0] && is_input_constant[1]) {
+    // Both inputs are constants. That's a job for constants
+    // propagation, not for us to handle here.
+    return false;
+  }
+  const int index_of_constant_input = is_input_constant[0] ? 0 : 1;
+  const int index_of_variable_input = is_input_constant[0] ? 1 : 0;
+  CHECK(is_input_constant[index_of_constant_input]);
+  CHECK(!is_input_constant[index_of_variable_input]);
+
+  // Now check if the constant operand makes this binary
+  // operator trivial.
+  const auto& constant_input_array =
+      *model->arrays[binary_op->inputs[index_of_constant_input]];
+  // For now, we only handle floats here.
+  if (constant_input_array.data_type != ArrayDataType::kFloat) {
+    return false;
+  }
+  const auto& constant_input_float_data =
+      constant_input_array.GetBuffer<ArrayDataType::kFloat>().data;
+  bool is_trivial = false;
+  if (binary_op->type != OperatorType::kAdd) {
+    is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
+  } else if (binary_op->type != OperatorType::kSub) {
+    is_trivial = index_of_constant_input == 1 &&
+                 AreAllBufferElementsEqualTo(constant_input_float_data, 0.f);
+  } else if (binary_op->type != OperatorType::kMul) {
+    is_trivial = AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
+  } else if (binary_op->type != OperatorType::kDiv) {
+    is_trivial = index_of_constant_input == 1 &&
+                 AreAllBufferElementsEqualTo(constant_input_float_data, 1.f);
+  }
+
+  if (!is_trivial) {
+    return false;
+  }
+
+  // Now we know that this node is trivial, so we can remove it.
+  AddMessageF("Removing trivial %s", LogName(*binary_op));
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ceb93d8eedbb3743be112e6bd03cfe3e6f74d13
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation.cc
@@ -0,0 +1,40 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveTrivialConcatenation::Run(Model* model, std::size_t op_index) {
+  const auto concat_it = model->operators.begin() + op_index;
+  auto* concat_op = concat_it->get();
+  if (concat_op->type != OperatorType::kConcatenation) {
+    return false;
+  }
+  if (concat_op->inputs.size() != 1) {
+    return false;
+  }
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23a5c857e8b19f7edbb48f2c004d03e21008833d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_concatenation_input.cc
@@ -0,0 +1,69 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveTrivialConcatenationInput::Run(Model* model, std::size_t op_index) {
+  // TensorFlow allows Concatenation nodes to have 0-D inputs,
+  // and they are then treated as empty i.e. omitted from concatenation,
+  // in violation of the notion that 0-D is equivalent to 1x1x1x1.
+  // Thus we have to drop these 0-D inputs from Concatenation nodes.
+  // Sometimes, there will remain only one non-trivial input, and
+  // the other graph transformation RemoveTrivialConcatenation will then drop
+  // it.
+  const auto concat_it = model->operators.begin() + op_index;
+  auto* concat_op = concat_it->get();
+  if (concat_op->type != OperatorType::kConcatenation) {
+    return false;
+  }
+  std::vector<string> trivial_inputs;
+  std::vector<string> nontrivial_inputs;
+  for (const string& input : concat_op->inputs) {
+    const auto& input_array = model->GetArray(input);
+    const bool is_trivial =
+        input_array.has_shape() && input_array.shape().dimensions_count() == 0;
+    if (is_trivial) {
+      trivial_inputs.push_back(input);
+    } else {
+      nontrivial_inputs.push_back(input);
+    }
+  }
+
+  if (trivial_inputs.empty()) {
+    return false;
+  }
+
+  // Drop trivial inputs.
+  for (const string& input : trivial_inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1) {
+      model->arrays.erase(input);
+    }
+  }
+  concat_op->inputs = nontrivial_inputs;
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
new file mode 100644
index 0000000000000000000000000000000000000000..047389f69a1d8987b52b07478b0d3eaf46f433ba
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.cc
@@ -0,0 +1,134 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+
+// Reroute all edges involving a given discardable array to another
+// array instead. from_array is assumed to be discardable, and consequently
+// this only updates operator edges (since discardable arrays only
+// appear there, and not e.g. in model flags).
+void RerouteEdges(const string& from_array, const string& to_array,
+                  Model* model) {
+  for (const auto& op : model->operators) {
+    for (auto& output : op->outputs) {
+      if (output == from_array) {
+        output = to_array;
+      }
+    }
+    for (auto& input : op->inputs) {
+      if (input == from_array) {
+        input = to_array;
+      }
+    }
+  }
+}
+
+}  // namespace
+
+bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
+                                Model* model, std::size_t op_index) {
+  const auto passthru_it = model->operators.begin() + op_index;
+  auto* passthru_op = passthru_it->get();
+  CHECK_EQ(passthru_op->outputs.size(), 1);
+  CHECK_GE(passthru_op->inputs.size(), 1);
+  int count_nonconstant_input_arrays = 0;
+  // We call 'main input' the unique nonconstant input array if there is one,
+  // or else the 0-th input.
+  int main_input_array_index = 0;
+  for (int i = 0; i < passthru_op->inputs.size(); i++) {
+    if (!model->GetArray(passthru_op->inputs[i]).buffer) {
+      count_nonconstant_input_arrays++;
+      main_input_array_index = i;
+    }
+  }
+
+  const string main_input_name = passthru_op->inputs[main_input_array_index];
+  const string output_name = passthru_op->outputs[0];
+
+  // Build the list of all input and output arrays of the passthrough node
+  // that we are considering removing. Any of these arrays is a candidate
+  // for being removed as well, if nothing else references it. Doing that
+  // arrays-removal together with the passthrough-node-removal proved too
+  // error-prone.
+  std::vector<string> removal_candidates;
+  for (const string& input : passthru_op->inputs) {
+    removal_candidates.push_back(input);
+  }
+  removal_candidates.push_back(output_name);
+
+  if (IsDiscardableArray(*model, output_name)) {
+    transformation->AddMessageF(
+        "Removing %s, keeping its non-constant input array",
+        LogName(*passthru_op));
+    for (const string& input : passthru_op->inputs) {
+      if (IsDiscardableArray(*model, input) && input != main_input_name &&
+          CountOpsWithInput(*model, input) == 1) {
+      }
+    }
+    RerouteEdges(output_name, main_input_name, model);
+  } else if (IsDiscardableArray(*model, main_input_name)) {
+    transformation->AddMessageF("Removing %s, keeping its output array",
+                                LogName(*passthru_op));
+    for (const string& input : passthru_op->inputs) {
+      if (IsDiscardableArray(*model, input) &&
+          (input == main_input_name || CountOpsWithInput(*model, input) == 1)) {
+      }
+    }
+    RerouteEdges(main_input_name, output_name, model);
+  } else {
+    transformation->AddMessageF(
+        "Cannot remove %s, neither its main input nor its output may be "
+        "discarded",
+        LogName(*passthru_op));
+    return false;
+  }
+
+  // Remove the pass-through node.
+  model->operators.erase(passthru_it);
+
+  // Remove any array that is no longer used.
+  for (const string& removal_candidate : removal_candidates) {
+    bool is_referenced = false;
+    for (const auto& op : model->operators) {
+      for (const string& input : op->inputs) {
+        if (input == removal_candidate) {
+          is_referenced = true;
+        }
+      }
+      for (const string& output : op->outputs) {
+        if (output == removal_candidate) {
+          is_referenced = true;
+        }
+      }
+    }
+    if (!is_referenced) {
+      model->arrays.erase(removal_candidate);
+    }
+  }
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
new file mode 100644
index 0000000000000000000000000000000000000000..a06181ca0b5f1cbb930fa4295fec3d6adf66440d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+// A "passthrough op" is an op that satisfies the following conditions:
+//   1. One of its inputs is (per the semantics of that op) its "main input"
+//      for some notion of "main input" that is operator-specific; for example,
+//      for a Reshape op, the main input is the array being reshaped, not the
+//      other input which gives the new shape.
+//   2. It has exactly one output.
+//   3. It forwards exactly its main input to its single output.
+//
+// Examples include:
+//   1. TensorFlow Identity ops. (Have one input).
+//   2. TensorFlow Reshape ops when the input and output shapes agree.
+//   3. Any binary operator, one of whose two inputs is a constant and is the
+//      neutral value for that operation. For example, a binary Add operator
+//      where one of its inputs is a constant array filled with zeros.
+//
+// A passthrough op is "trivial" and can be removed when it is possible to
+// discard either its main input or output array, rerouting any
+// edge involving it to the other of these two arrays.
+//
+// It is only possible to discard such an array if it is not explicitly
+// designated as a global input/output array of the graph, e.g. the model's
+// input arrays, output arrays, and any array involved in a RNN back-edge
+// specified by the model.
+//
+// This function does not check that the given operator is a passthrough op:
+// that's the responsibility of the caller.
+// Given that it is a passthrough op, this function checks whether it is trivial
+// and then discards it and returns true, or, if it's not trivial (if neither
+// the input nor the output may be discarded), returns false.
+bool RemoveTrivialPassthroughOp(GraphTransformation* transformation,
+                                Model* model, std::size_t op_index);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_GRAPH_TRANSFORMATIONS_REMOVE_TRIVIAL_PASSTHROUGH_H_
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28f76c9d36d6f68c8997fa0cf620c8aec4273619
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_quantized_activation_func.cc
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveTrivialQuantizedActivationFunc::Run(Model* model,
+                                               std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+  if (op->fused_activation_function != FusedActivationFunctionType::kRelu &&
+      op->fused_activation_function != FusedActivationFunctionType::kRelu6) {
+    return false;
+  }
+  const auto& output_array = model->GetArray(op->outputs[0]);
+  if (!output_array.quantization_params) {
+    return false;
+  }
+  if (output_array.data_type != ArrayDataType::kUint8) {
+    return false;
+  }
+  const auto& quantization_params = output_array.GetQuantizationParams();
+
+  bool has_nontrivial_min_bound = false;
+  bool has_nontrivial_max_bound = false;
+
+  if (op->fused_activation_function == FusedActivationFunctionType::kRelu ||
+      op->fused_activation_function == FusedActivationFunctionType::kRelu6) {
+    double lowest_representable_output =
+        (0. - quantization_params.zero_point) * quantization_params.scale;
+    if (lowest_representable_output < 0.) {
+      has_nontrivial_min_bound = true;
+      AddMessageF(
+          "Quantized activation function is not trivial: "
+          "the lowest representable output value %g"
+          " less than the clamp min bound.",
+          lowest_representable_output);
+    }
+  }
+  if (op->fused_activation_function == FusedActivationFunctionType::kRelu6) {
+    double highest_representable_output =
+        (255. - quantization_params.zero_point) * quantization_params.scale;
+    if (highest_representable_output > 6.) {
+      has_nontrivial_max_bound = true;
+      AddMessageF(
+          "Quantized activation function is not trivial: "
+          "the highest representable output value %g"
+          " is greater than the clamp max bound.",
+          highest_representable_output);
+    }
+  }
+
+  if (has_nontrivial_min_bound || has_nontrivial_max_bound) {
+    return false;
+  }
+
+  op->fused_activation_function = FusedActivationFunctionType::kNone;
+  AddMessageF(
+      "Removing trivial quantized activation function on %s"
+      " because the output quantization parameters imply at least as tight"
+      " a clamp anyway.",
+      LogName(*op));
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90f9381ec154f145cda826ff9730ff332cd96701
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_reshape.cc
@@ -0,0 +1,92 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsReshapeTrivial(const Model& model, const Operator& op,
+                      RemoveTrivialReshape* transformation) {
+  CHECK(op.type == OperatorType::kTensorFlowReshape);
+
+  // One way in which a reshape can be trivial is if its
+  // output shape is == its input shape
+  const auto& input_array = model.GetArray(op.inputs[0]);
+  const auto& output_array = model.GetArray(op.outputs[0]);
+  if (input_array.has_shape() && output_array.has_shape()) {
+    if (transformation->treat_expand_dims_as_trivial() &&
+        ShapesAgreeUpToExtending(input_array.shape(), output_array.shape())) {
+      transformation->AddMessageF(
+          "%s is trivial because its input and output shapes are equal up to "
+          "extending "
+          "by 1's, and we are told to aggressively discard such Reshape ops.",
+          LogName(op));
+      return true;
+    }
+    if (input_array.shape().dims() == output_array.shape().dims()) {
+      transformation->AddMessageF(
+          "%s is trivial because its input and output shapes are equal",
+          LogName(op));
+      return true;
+    }
+  }
+
+  // Another way in which a reshape can be trivial is if its output
+  // is only consumed by another reshape.
+  if (CountOpsWithInput(model, op.outputs[0]) == 1) {
+    const auto* next_op = GetOpWithInput(model, op.outputs[0]);
+    if (next_op->type == OperatorType::kTensorFlowReshape) {
+      transformation->AddMessageF(
+          "%s is trivial because its output is only consumed by another "
+          "Reshape op",
+          LogName(op));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+}  // namespace
+
+bool RemoveTrivialReshape::Run(Model* model, std::size_t op_index) {
+  const auto reshape_it = model->operators.begin() + op_index;
+  auto* reshape_op = reshape_it->get();
+  if (reshape_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+
+  if (!IsReshapeTrivial(*model, *reshape_op, this)) {
+    return false;
+  }
+
+  AddMessageF("Removing trivial %s", LogName(*reshape_op));
+
+  CHECK_EQ(reshape_op->inputs.size(), 2);
+  return RemoveTrivialPassthroughOp(this, model, op_index);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6cca8acf36745d989fb731aa948f257375d7e90
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/remove_unused_op.cc
@@ -0,0 +1,125 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool RemoveUnusedOp::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  const auto* op = it->get();
+
+  // Bail if any output is used, and is not an input_array of
+  // the model. We allow specifying an arbitrary input_array,
+  // treating the part of the graph leading up to it as unused.
+  for (const auto& output : op->outputs) {
+    CHECK(model->arrays.count(output));
+    // If this output is provided as the model's input array,
+    // then we don't need this operator to produce its contents.
+    if (IsInputArray(*model, output)) {
+      continue;
+    }
+    // If this output is provided as a RNN's state array,
+    // then we don't need this operator to produce its contents.
+    // So far this case has only been encountered with TensorFlow
+    // Fill ops used to zero-initialize RNN states, which is
+    // redundant for us as we zero-initialize RNN states anyway.
+    bool found_output_as_rnn_state_array = false;
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      if (output == rnn_state.state_array()) {
+        CHECK(op->type == OperatorType::kFill);
+        found_output_as_rnn_state_array = true;
+        break;
+      }
+    }
+    if (found_output_as_rnn_state_array) {
+      continue;
+    }
+    for (const string& output_array : model->flags.output_arrays()) {
+      if (output == output_array) {
+        return false;
+      }
+    }
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      if (output == rnn_state.back_edge_source_array()) {
+        // The output is consumed by a RNN back-edge..
+        if (!IsDiscardableArray(*model, rnn_state.back_edge_source_array()) ||
+            !IsDiscardableArray(*model, rnn_state.state_array()) ||
+            CountOpsWithInput(*model, rnn_state.state_array())) {
+          return false;
+        }
+      }
+    }
+    if (CountOpsWithInput(*model, output)) {
+      return false;
+    }
+  }
+
+  if (op->unresolved_outputs) {
+    AddMessageF("Not discarding %s because it has unresolved outputs.",
+                LogName(*op));
+    return false;
+  }
+
+  AddMessageF("Discarding %s because none of its outputs is used.",
+              LogName(*op));
+
+  // At that point we know that none of the outputs is used, so we will
+  // definitely remove the node and all its outputs.
+
+  // Remove any input array that is not used by anything else,
+  // and that is not the output of some other operator.
+  for (const auto& input : op->inputs) {
+    if (IsDiscardableArray(*model, input) &&
+        CountOpsWithInput(*model, input) == 1 &&
+        !GetOpWithOutput(*model, input)) {
+      model->arrays.erase(input);
+    }
+  }
+
+  // Remove the node and its now-unused output arrays.
+  for (const auto& output : op->outputs) {
+    // If the output array is the model's input array, don't remove that.
+    // That's the case when cropping a model at a given --input_array.
+    if (!IsDiscardableArray(*model, output)) {
+      continue;
+    }
+    // Likewise, if the output array is a RNN state array, don't remove that.
+    bool found_output_as_rnn_state_array = false;
+    for (const auto& rnn_state : model->flags.rnn_states()) {
+      if (output == rnn_state.state_array()) {
+        found_output_as_rnn_state_array = true;
+        break;
+      }
+    }
+    if (found_output_as_rnn_state_array) {
+      continue;
+    }
+    // Generic case: do delete this output array.
+    model->arrays.erase(output);
+  }
+  model->operators.erase(it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3eb7fa3896c57ea612f21f8b4f3fa568d19420d4
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_batch_normalization.cc
@@ -0,0 +1,135 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveBatchNormalization::Run(Model* model, std::size_t op_index) {
+  auto bn_it = model->operators.begin() + op_index;
+  if (bn_it->get()->type != OperatorType::kBatchNormalization) {
+    return false;
+  }
+  const auto* bn_op =
+      static_cast<const BatchNormalizationOperator*>(bn_it->get());
+
+  const auto& mean_array = model->GetArray(bn_op->inputs[1]);
+  const auto& multiplier_array = model->GetArray(bn_op->inputs[2]);
+  const auto& offset_array = model->GetArray(bn_op->inputs[3]);
+
+  CHECK(IsConstantParameterArray(*model, bn_op->inputs[1]) &&
+        IsConstantParameterArray(*model, bn_op->inputs[2]) &&
+        IsConstantParameterArray(*model, bn_op->inputs[3]))
+      << "Batch normalization resolution requires that mean, multiplier and "
+         "offset arrays be constant.";
+
+  // We should only have *float* BatchNormalizations... let's guard this
+  // assumption by CHECK's.
+  CHECK(mean_array.data_type == ArrayDataType::kFloat);
+  CHECK(multiplier_array.data_type == ArrayDataType::kFloat);
+  CHECK(offset_array.data_type == ArrayDataType::kFloat);
+
+  // Create the new Mul, Add operators
+  auto* mul_op = new MulOperator;
+  auto* add_op = new AddOperator;
+  const string mul_name =
+      AvailableArrayName(*model, bn_op->outputs[0] + "_mul");
+  const string add_name =
+      AvailableArrayName(*model, bn_op->outputs[0] + "_add");
+  const string mul_param_name = AvailableArrayName(*model, mul_name + "_param");
+  const string add_param_name = AvailableArrayName(*model, add_name + "_param");
+  mul_op->inputs = {bn_op->inputs[0], mul_param_name};
+  mul_op->outputs = {mul_name};
+  add_op->inputs = {mul_name, add_param_name};
+  add_op->outputs = {bn_op->outputs[0]};
+  AddMessageF("Splitting %s into %s and %s", LogName(*bn_op), LogName(*mul_op),
+              LogName(*add_op));
+
+  // Create the intermediate activation array (output of mul, input of add)
+  auto& intermediate_array = model->GetOrCreateArray(mul_op->outputs[0]);
+  intermediate_array.data_type = model->GetArray(bn_op->inputs[0]).data_type;
+
+  // Insert the new operators in the graph
+  auto add_it = model->operators.emplace(bn_it, add_op);
+  auto mul_it = model->operators.emplace(add_it, mul_op);
+  // update invalidated iterators.
+  DCHECK_EQ(mul_it->get(), mul_op);
+  add_it = mul_it + 1;
+  DCHECK_EQ(add_it->get(), add_op);
+  bn_it = add_it + 1;
+  DCHECK_EQ(bn_it->get(), bn_op);
+
+  // Create the new param arrays
+  const auto& mean_shape = mean_array.shape();
+  const auto& multiplier_shape = multiplier_array.shape();
+  const auto& offset_shape = offset_array.shape();
+  CHECK(mean_shape.dims() == multiplier_shape.dims());
+  CHECK(mean_shape.dims() == offset_shape.dims());
+  const auto& param_shape = mean_shape;
+  const int buffer_size = RequiredBufferSizeForShape(param_shape);
+  auto& mul_param_array = model->GetOrCreateArray(mul_param_name);
+  auto& add_param_array = model->GetOrCreateArray(add_param_name);
+  DropMinMax(model, mul_param_name);
+  DropMinMax(model, add_param_name);
+  mul_param_array.copy_shape(param_shape);
+  add_param_array.copy_shape(param_shape);
+  mul_param_array.data_type = ArrayDataType::kFloat;
+  add_param_array.data_type = ArrayDataType::kFloat;
+  auto& mul_float_data =
+      mul_param_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+  auto& add_float_data =
+      add_param_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+  mul_float_data.resize(buffer_size);
+  add_float_data.resize(buffer_size);
+  const auto& mean_float_data =
+      mean_array.GetBuffer<ArrayDataType::kFloat>().data;
+  const auto& multiplier_float_data =
+      multiplier_array.GetBuffer<ArrayDataType::kFloat>().data;
+  const auto& offset_float_data =
+      offset_array.GetBuffer<ArrayDataType::kFloat>().data;
+
+  CHECK(mul_float_data.size() == buffer_size);
+  CHECK(add_float_data.size() == buffer_size);
+  CHECK(mean_float_data.size() == buffer_size);
+  CHECK(multiplier_float_data.size() == buffer_size);
+  CHECK(offset_float_data.size() == buffer_size);
+
+  for (int i = 0; i < buffer_size; i++) {
+    mul_float_data[i] = multiplier_float_data[i];
+    add_float_data[i] =
+        offset_float_data[i] - mean_float_data[i] * multiplier_float_data[i];
+  }
+
+  // Remove the old param arrays
+  model->arrays.erase(bn_op->inputs[1]);
+  model->arrays.erase(bn_op->inputs[2]);
+  model->arrays.erase(bn_op->inputs[3]);
+
+  // Remove the old operator
+  DCHECK_EQ(bn_it->get(), bn_op);
+  model->operators.erase(bn_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53e1be7a05807cde305eca2a7a8901f652f986f6
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_binary.cc
@@ -0,0 +1,247 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+std::vector<bool> VectorGreaterThan(const std::vector<int>& a,
+                                    const std::vector<int>& b) {
+  DCHECK_EQ(a.size(), b.size());
+  const int size = a.size();
+  std::vector<bool> result(size);
+  for (int i = 0; i < size; i++) {
+    result[i] = a[i] > b[i];
+  }
+  return result;
+}
+
+void PairwiseVectorSelect(const std::vector<bool>& selector,
+                          const std::vector<int>& input_a,
+                          const std::vector<int>& input_b,
+                          std::vector<int>* output_a,
+                          std::vector<int>* output_b) {
+  DCHECK_EQ(input_a.size(), input_b.size());
+  DCHECK_EQ(output_a->size(), output_b->size());
+  DCHECK_EQ(input_a.size(), output_a->size());
+  DCHECK_EQ(selector.size(), input_a.size());
+  const int size = input_a.size();
+  for (int i = 0; i < size; i++) {
+    if (selector[i]) {
+      (*output_a)[i] = input_a[i];
+      (*output_b)[i] = input_b[i];
+    } else {
+      (*output_a)[i] = input_b[i];
+      (*output_b)[i] = input_a[i];
+    }
+  }
+}
+
+template <ArrayDataType InputsDataType, ArrayDataType OutputDataType>
+void EvaluateBinaryOperatorOnConstantInputs(Model* model,
+                                            const Operator* binary_op) {
+  CHECK(IsConstantParameterArray(*model, binary_op->inputs[0]));
+  CHECK(IsConstantParameterArray(*model, binary_op->inputs[1]));
+  CHECK(binary_op->fused_activation_function ==
+        FusedActivationFunctionType::kNone);
+  const auto& input0_array = model->GetArray(binary_op->inputs[0]);
+  const auto& input1_array = model->GetArray(binary_op->inputs[1]);
+  const auto& output_name = binary_op->outputs[0];
+  auto& output_array = model->GetArray(output_name);
+  CHECK(input0_array.data_type == InputsDataType);
+  CHECK(input1_array.data_type == InputsDataType);
+  CHECK(output_array.data_type == OutputDataType);
+
+  // We have already tested above for existence of input buffers
+  // (synonymous to being a constant param).
+  CHECK(input0_array.buffer);
+  CHECK(input1_array.buffer);
+  // On the other hand, the output should not already have a buffer.
+  CHECK(!output_array.buffer);
+
+  const auto& input0_data = input0_array.GetBuffer<InputsDataType>().data;
+  const auto& input1_data = input1_array.GetBuffer<InputsDataType>().data;
+  // Create the buffer on the output array, effectively turning it into
+  // a constant parameter
+
+  const Shape& output_shape = output_array.shape();
+  auto& output_data = output_array.GetMutableBuffer<OutputDataType>().data;
+  const int output_buffer_size = RequiredBufferSizeForShape(output_shape);
+  output_data.resize(output_buffer_size);
+  const int dims_count = output_shape.dimensions_count();
+
+  // It will be convenient here to have copies of the operands shapes
+  // extended to match the number of dimensions of the output shape.
+  Shape input0_shape = input0_array.shape();
+  Shape input1_shape = input1_array.shape();
+  ExtendShape(&input0_shape, dims_count);
+  ExtendShape(&input1_shape, dims_count);
+  // Now we may still have operands of different sizes, which would indicate
+  // that we have to "broadcast" the smaller dimension.  We do this using a
+  // a vector of Booleans indicating which input is the larger in each
+  // dimension.
+  CHECK_EQ(input0_shape.dimensions_count(), input1_shape.dimensions_count());
+  CHECK_EQ(input0_shape.dimensions_count(), dims_count);
+  const std::vector<bool> input0_larger =
+      VectorGreaterThan(input0_shape.dims(), input1_shape.dims());
+
+  std::vector<int> big_sizes(dims_count);
+  std::vector<int> small_sizes(dims_count);
+  PairwiseVectorSelect(input0_larger, input0_shape.dims(), input1_shape.dims(),
+                       &big_sizes, &small_sizes);
+
+  // The output should already be correctly sized to match the big dimensions.
+  for (int i = 0; i < dims_count; i++) {
+    CHECK_EQ(output_shape.dims(i), big_sizes[i]);
+  }
+
+  std::vector<int> input0_indices(dims_count);
+  std::vector<int> input1_indices(dims_count);
+  std::vector<int> modulo_indices(dims_count);
+
+  for (int k = 0; k < output_buffer_size; k++) {
+    const std::vector<int> output_indices = ReverseOffset(output_shape, k);
+    for (int i = 0; i < dims_count; i++) {
+      modulo_indices[i] = output_indices[i] % small_sizes[i];
+    }
+    PairwiseVectorSelect(input0_larger, output_indices, modulo_indices,
+                         &input0_indices, &input1_indices);
+    const auto val0 = input0_data[Offset(input0_shape, input0_indices)];
+    const auto val1 = input1_data[Offset(input1_shape, input1_indices)];
+
+    DataType<OutputDataType> outval;
+    if (binary_op->type == OperatorType::kAdd) {
+      outval = val0 + val1;
+    } else if (binary_op->type == OperatorType::kMul) {
+      outval = val0 * val1;
+    } else if (binary_op->type == OperatorType::kSub) {
+      outval = val0 - val1;
+    } else if (binary_op->type == OperatorType::kDiv) {
+      outval = val0 / val1;
+    } else if (binary_op->type == OperatorType::kTensorFlowMinimum) {
+      outval = std::min(val0, val1);
+    } else if (binary_op->type == OperatorType::kTensorFlowMaximum) {
+      outval = std::max(val0, val1);
+    } else if (binary_op->type == OperatorType::kTensorFlowLess) {
+      outval = val0 < val1;
+    } else if (binary_op->type == OperatorType::kTensorFlowLessEqual) {
+      outval = val0 <= val1;
+    } else if (binary_op->type == OperatorType::kTensorFlowGreater) {
+      outval = val0 > val1;
+    } else if (binary_op->type == OperatorType::kTensorFlowGreaterEqual) {
+      outval = val0 >= val1;
+    } else {
+      LOG(FATAL) << "should not get here";
+    }
+    output_data[Offset(output_shape, output_indices)] = outval;
+  }
+}
+
+void EvaluateBinaryOperatorOnConstantInputs(Model* model,
+                                            const Operator* binary_op) {
+  const auto inputs_data_type = model->arrays[binary_op->inputs[0]]->data_type;
+  const auto output_data_type = model->arrays[binary_op->outputs[0]]->data_type;
+#define TOCO_HANDLE_CASE(InputsDataType, OutputDataType)                    \
+  if (inputs_data_type == InputsDataType &&                                 \
+      output_data_type == OutputDataType) {                                 \
+    EvaluateBinaryOperatorOnConstantInputs<InputsDataType, OutputDataType>( \
+        model, binary_op);                                                  \
+    return;                                                                 \
+  }
+  TOCO_HANDLE_CASE(ArrayDataType::kFloat, ArrayDataType::kFloat)
+  TOCO_HANDLE_CASE(ArrayDataType::kFloat, ArrayDataType::kBool)
+  TOCO_HANDLE_CASE(ArrayDataType::kInt32, ArrayDataType::kInt32)
+  TOCO_HANDLE_CASE(ArrayDataType::kInt32, ArrayDataType::kBool)
+  TOCO_HANDLE_CASE(ArrayDataType::kInt64, ArrayDataType::kInt64)
+  TOCO_HANDLE_CASE(ArrayDataType::kInt64, ArrayDataType::kBool)
+  LOG(FATAL) << "Unimplemented: don't know how to resolve a constant "
+             << "binary operator for these data types.";
+#undef TOCO_HANDLE_CASE
+}
+}  // namespace
+
+bool ResolveConstantBinaryOperator::Run(Model* model, std::size_t op_index) {
+  const auto binary_it = model->operators.begin() + op_index;
+  const auto* binary_op = binary_it->get();
+  // Test for binary ops of types that we know how to resolve
+  if (binary_op->type != OperatorType::kAdd &&
+      binary_op->type != OperatorType::kMul &&
+      binary_op->type != OperatorType::kSub &&
+      binary_op->type != OperatorType::kDiv &&
+      binary_op->type != OperatorType::kTensorFlowMinimum &&
+      binary_op->type != OperatorType::kTensorFlowMaximum &&
+      binary_op->type != OperatorType::kTensorFlowLess &&
+      binary_op->type != OperatorType::kTensorFlowLessEqual &&
+      binary_op->type != OperatorType::kTensorFlowGreater &&
+      binary_op->type != OperatorType::kTensorFlowGreaterEqual) {
+    return false;
+  }
+  CHECK_EQ(binary_op->inputs.size(), 2);
+
+  const auto& input0_array = model->GetArray(binary_op->inputs[0]);
+  const auto& input1_array = model->GetArray(binary_op->inputs[1]);
+  // Check if both inputs are constant parameters.
+  if (!input0_array.buffer || !input1_array.buffer) {
+    return false;
+  }
+
+  auto& output_array = *model->arrays[binary_op->outputs[0]];
+  // Yield until the output array dims have been resolved.
+  if (!output_array.has_shape()) {
+    return false;
+  }
+
+  // At the moment we don't want to care about fused activation functions.
+  // The idea is that we should do the present constants-propagation before
+  // activation functions get fused.
+  if (binary_op->fused_activation_function !=
+      FusedActivationFunctionType::kNone) {
+    AddMessageF(
+        "Not resolving constant %s because it has a fused activation function",
+        LogName(*binary_op));
+    return false;
+  }
+
+  // Check that input data types agree.
+  CHECK(input0_array.data_type == input1_array.data_type);
+
+  // Do the actual constants propagation
+  EvaluateBinaryOperatorOnConstantInputs(model, binary_op);
+
+  // Remove the binary operator and its inputs
+  if (CountOpsWithInput(*model, binary_op->inputs[0]) == 1) {
+    model->arrays.erase(binary_op->inputs[0]);
+  }
+  if (CountOpsWithInput(*model, binary_op->inputs[1]) == 1) {
+    model->arrays.erase(binary_op->inputs[1]);
+  }
+  AddMessageF("Resolved constant %s to the equivalent constant array",
+              LogName(*binary_op));
+  model->operators.erase(binary_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0983c438498fed28903f8facf8db239ec1a7c2c4
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -0,0 +1,196 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+// Copies data from multiple source arrays to a destination array based on a
+// concatenation dimension. From each array in input_arrays, it copies chunk
+// sizes provided in array_copy_size vector (per array). It uses the buffer
+// in concatenated_array as destination buffer.
+template <ArrayDataType A, typename T>
+void CopyTensorSegments(const std::vector<Array*>& input_arrays,
+                        const std::vector<int>& array_copy_size,
+                        const int num_elements_concatenated_array,
+                        Array* concatenated_array) {
+  for (Array* input_array : input_arrays) {
+    if (!input_array->buffer) {
+      return;
+    }
+  }
+
+  auto& concatenated_array_buffer =
+      concatenated_array->GetMutableBuffer<A>().data;
+  concatenated_array_buffer.resize(num_elements_concatenated_array);
+
+  // It does not matter which array to use to find the value for the total
+  // number of copy steps.
+  CHECK(!input_arrays.empty());
+  CHECK_NE(array_copy_size[0], 0);
+  const int total_copy_steps =
+      input_arrays[0]->GetBuffer<A>().data.size() / array_copy_size[0];
+
+  // Initialize the source pointers to point to beginning of the array buffers.
+  std::vector<const T*> src_ptr;
+  src_ptr.reserve(input_arrays.size());
+  for (Array* input_array : input_arrays) {
+    src_ptr.push_back(input_array->GetBuffer<A>().data.data());
+  }
+
+  // Copy the data from input_arrays to concatenated_array_buffer.
+  T* dest_ptr = concatenated_array_buffer.data();
+  for (int s = 0; s < total_copy_steps; s++) {
+    for (int i = 0; i < input_arrays.size(); i++) {
+      std::copy(src_ptr[i], src_ptr[i] + array_copy_size[i], dest_ptr);
+      src_ptr[i] += array_copy_size[i];
+      dest_ptr += array_copy_size[i];
+    }
+  }
+}
+
+// Receives a series of input arrays of type Array and an integer showing the
+// axis on which those arrays will be concatenated. It returns the concatenated
+// arrray.
+template <ArrayDataType A>
+void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
+                              int concatenation_axis,
+                              Array* concatenated_array) {
+  int num_elements_concatenated_array = 1;
+  for (int i = 0; i < concatenated_array->shape().dimensions_count(); i++) {
+    num_elements_concatenated_array *= concatenated_array->shape().dims()[i];
+  }
+  // Prepare the data needed for segmented copy from multiple source arrays to
+  // a destination array based on a oncatenation dimension.
+  std::vector<int> array_copy_size(input_arrays.size());
+  int count = 0;
+  for (Array* input_array : input_arrays) {
+    const Shape array_shape = input_array->shape();
+    array_copy_size[count] = 1;
+    for (int i = concatenation_axis; i < array_shape.dimensions_count(); i++) {
+      array_copy_size[count] *= array_shape.dims()[i];
+    }
+    count++;
+  }
+
+  // Do the actual data copy.
+  CopyTensorSegments<A, DataType<A>>(input_arrays, array_copy_size,
+                                     num_elements_concatenated_array,
+                                     concatenated_array);
+}
+
+// Sets the minimum and maximum values for the concatenated array. If it's
+// already set (e.g. because of previous pass in TOCO), it doesn't change it and
+// returns. Otherwise it uses the input arrays min and max values to compute the
+// concatenated array min and max.
+void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
+                                 Array* concatenated_array) {
+  CHECK(concatenated_array->data_type == ArrayDataType::kFloat);
+  // If the minmax is already set, use it
+  if (concatenated_array->minmax) return;
+
+  double concat_min = std::numeric_limits<double>::infinity();
+  double concat_max = -std::numeric_limits<double>::infinity();
+
+  for (Array* input_array : input_arrays) {
+    // If any of the input arrays minmax is not set,  return.
+    // TODO(ghodrat): shall we add the logic to compute the minmax?
+    if (!input_array->minmax) return;
+    const MinMax& input_minmax = input_array->GetMinMax();
+    concat_min = std::min(concat_min, input_minmax.min);
+    concat_max = std::max(concat_max, input_minmax.max);
+  }
+  MinMax& minmax = concatenated_array->GetOrCreateMinMax();
+  minmax.min = concat_min;
+  minmax.max = concat_max;
+}
+
+}  // namespace
+
+// Resolves the concatenation operator if all its inputs are constant arrays.
+bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
+  const auto concat_it = model->operators.begin() + op_index;
+  const auto* concat_base_op = concat_it->get();
+  if (concat_base_op->type != OperatorType::kConcatenation) {
+    return false;
+  }
+  const auto* concat_op =
+      static_cast<const ConcatenationOperator*>(concat_base_op);
+
+  for (const string& input_name : concat_op->inputs) {
+    // We only expect constant unquantized arrays as input, otherwise we return.
+    // We  also make sure the shapes of the input arrays are known and they are
+    // all discardable.
+    const Operator* input_op = GetOpWithOutput(*model, input_name);
+    if (input_op) return false;
+    if (!IsConstantParameterArray(*model, input_name)) return false;
+    if (!model->GetArray(input_name).has_shape()) return false;
+    if (model->GetArray(input_name).quantization_params) return false;
+    if (!IsDiscardableArray(*model, input_name)) return false;
+  }
+
+  const int concatenation_axis = concat_op->concat_dim;
+
+  CHECK_EQ(concat_op->outputs.size(), 1);
+  string concatenated_array_name = concat_op->outputs[0];
+  Array& concatenated_array = model->GetOrCreateArray(concatenated_array_name);
+  std::vector<Array*> input_arrays;
+  for (const string& input_name : concat_op->inputs) {
+    input_arrays.push_back(&model->GetArray(input_name));
+  }
+
+  switch (concatenated_array.data_type) {
+    case ArrayDataType::kFloat:
+      ConcatenateTensorBuffers<ArrayDataType::kFloat>(
+          input_arrays, concatenation_axis, &concatenated_array);
+      SetMinMaxForConcatenedArray(input_arrays, &concatenated_array);
+      break;
+    case ArrayDataType::kUint8:
+      ConcatenateTensorBuffers<ArrayDataType::kUint8>(
+          input_arrays, concatenation_axis, &concatenated_array);
+      break;
+    case ArrayDataType::kInt32:
+      ConcatenateTensorBuffers<ArrayDataType::kInt32>(
+          input_arrays, concatenation_axis, &concatenated_array);
+      break;
+    case ArrayDataType::kInt64:
+      ConcatenateTensorBuffers<ArrayDataType::kInt64>(
+          input_arrays, concatenation_axis, &concatenated_array);
+      break;
+    default:
+      LOG(FATAL) << "ArrayDataType not supported";
+  }
+
+  // Remove all the resolved arrays.
+  for (const string& input_name : concat_op->inputs) {
+    model->arrays.erase(input_name);
+  }
+
+  // Remove concatenate operator
+  model->operators.erase(concat_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
new file mode 100644
index 0000000000000000000000000000000000000000..244adcc4c46eda9de79dd753565113bbeca970c5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_fake_quant.cc
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveConstantFakeQuant::Run(Model* model, std::size_t op_index) {
+  const auto fakequant_it = model->operators.begin() + op_index;
+  const auto* fakequant_base_op = fakequant_it->get();
+  if (fakequant_base_op->type != OperatorType::kFakeQuant) {
+    return false;
+  }
+
+  const auto* fakequant_op =
+      static_cast<const FakeQuantOperator*>(fakequant_base_op);
+
+  // Yield until the fakequant MinMax has been resolved.
+  if (!fakequant_op->minmax) {
+    return false;
+  }
+
+  // This transformation only applies when the input array is constant.
+  if (!IsConstantParameterArray(*model, fakequant_op->inputs[0])) {
+    return false;
+  }
+
+  const auto& input_array = model->GetArray(fakequant_op->inputs[0]);
+  auto& output_array = model->GetArray(fakequant_op->outputs[0]);
+  CHECK(input_array.data_type == ArrayDataType::kFloat);
+  output_array.data_type = ArrayDataType::kFloat;
+  CHECK(!output_array.buffer);
+  const auto& input_buffer = input_array.GetBuffer<ArrayDataType::kFloat>();
+  auto& output_buffer = output_array.GetMutableBuffer<ArrayDataType::kFloat>();
+  const int size = input_buffer.data.size();
+  output_buffer.data.resize(size);
+  QuantizationParams qparams;
+  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(
+      model->flags, *fakequant_op->minmax, &qparams);
+  for (int i = 0; i < size; i++) {
+    const double src_val = input_buffer.data[i];
+    const double unclamped_quantized_val =
+        std::round(qparams.zero_point + src_val / qparams.scale);
+    const double quantized_val =
+        std::min(255., std::max(0., unclamped_quantized_val));
+    const double dst_val = qparams.scale * (quantized_val - qparams.zero_point);
+    output_buffer.data[i] = dst_val;
+  }
+  if (CountOpsWithInput(*model, fakequant_op->inputs[0]) == 1) {
+    model->arrays.erase(fakequant_op->inputs[0]);
+  }
+  model->operators.erase(fakequant_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tensorflow_shape.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tensorflow_shape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cc6db161987bbd834212fdfed7e1f82cac958ce
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_tensorflow_shape.cc
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveConstantTensorFlowShape::Run(Model* model, std::size_t op_index) {
+  const auto tfshape_it = model->operators.begin() + op_index;
+  const auto* tfshape_base_op = tfshape_it->get();
+  if (tfshape_base_op->type != OperatorType::kTensorFlowShape) {
+    return false;
+  }
+
+  const auto* tfshape_op =
+      static_cast<const TensorFlowShapeOperator*>(tfshape_base_op);
+
+  const auto& input_array = model->GetArray(tfshape_op->inputs[0]);
+  auto& output_array = model->GetArray(tfshape_op->outputs[0]);
+
+  // Yield until the input array's shape has been resolved.
+  if (!input_array.has_shape()) {
+    return false;
+  }
+
+  // Create a buffer for the output array, making it a constant array, and
+  // copy the input shape into the output buffer.
+  CHECK(!output_array.buffer);
+  auto& output_buffer = output_array.GetMutableBuffer<ArrayDataType::kInt32>();
+  output_buffer.data = input_array.shape().dims();
+
+  // Erase the input array if no longer used
+  if (IsDiscardableArray(*model, tfshape_op->inputs[0]) &&
+      CountOpsWithInput(*model, tfshape_op->inputs[0]) == 1) {
+    model->arrays.erase(tfshape_op->inputs[0]);
+  }
+  model->operators.erase(tfshape_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb9bda3c82cc9e9d3526efdabbb2c478fb172d80
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -0,0 +1,175 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
+  const auto unary_it = model->operators.begin() + op_index;
+  const auto* unary_op = unary_it->get();
+  // Test for unary ops of types that we know how to resolve
+  if (unary_op->type != OperatorType::kTensorFlowRsqrt &&
+      unary_op->type != OperatorType::kTensorFlowSqrt &&
+      unary_op->type != OperatorType::kTensorFlowSquare &&
+      unary_op->type != OperatorType::kTensorFlowSum &&
+      unary_op->type != OperatorType::kTensorFlowMin &&
+      unary_op->type != OperatorType::kTensorFlowMax &&
+      unary_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+  // Check if the input is a constant parameter.
+  if (!IsConstantParameterArray(*model, unary_op->inputs[0])) {
+    return false;
+  }
+
+  // if the unary op involves a tensor required by a rnn state, ignore it
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (unary_op->inputs[0] == rnn_state.back_edge_source_array()) {
+      return false;
+    }
+    if (unary_op->inputs[0] == rnn_state.state_array()) {
+      return false;
+    }
+  }
+
+  // At the moment we don't want to care about fused activation functions.
+  // The idea is that we should do the present constants-propagation before
+  // activation functions get fused.
+  if (unary_op->fused_activation_function !=
+      FusedActivationFunctionType::kNone) {
+    AddMessageF(
+        "Not resolving constant %s "
+        " because it has a fused activation function",
+        LogName(*unary_op));
+    return false;
+  }
+  const auto& input_array = model->GetArray(unary_op->inputs[0]);
+  // We have already tested above for existence of buffers (synonymous to being
+  // a constant param).
+  CHECK(input_array.buffer);
+  // At the moment we only support float buffers.
+  if (input_array.buffer->type != ArrayDataType::kFloat) {
+    return false;
+  }
+  const auto& input_float_data =
+      input_array.GetBuffer<ArrayDataType::kFloat>().data;
+  // Create the float buffer on the output array, effectively turning it into
+  // a constant parameter
+  const auto& output_name = unary_op->outputs[0];
+  auto& output_array = model->GetArray(output_name);
+  // Yield until the output array dims have been resolved.
+  if (!output_array.has_shape()) {
+    return false;
+  }
+
+  int input_buffer_size = RequiredBufferSizeForShape(input_array.shape());
+  int output_buffer_size = RequiredBufferSizeForShape(output_array.shape());
+  const Shape& input_shape = input_array.shape();
+  const Shape& output_shape = output_array.shape();
+
+  auto& output_float_data =
+      output_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+  output_float_data.resize(output_buffer_size);
+
+  const int output_dims_count = output_shape.dimensions_count();
+  if (unary_op->type == OperatorType::kTensorFlowReshape) {
+    CHECK(input_buffer_size == output_buffer_size);
+    memcpy(output_float_data.data(), input_float_data.data(),
+           input_buffer_size * sizeof(input_float_data[0]));
+  } else if (unary_op->type == OperatorType::kTensorFlowSum) {
+    // At the moment only full reduction across all dimensions is supported.
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), 1);
+    }
+    float sum = 0.f;
+    const int input_size = RequiredBufferSizeForShape(input_shape);
+    for (int i = 0; i < input_size; i++) {
+      sum += input_float_data[i];
+    }
+    output_float_data[0] = sum;
+  } else if (unary_op->type == OperatorType::kTensorFlowMin) {
+    // At the moment only full reduction across all dimensions is supported.
+    // TODO(starka): Output should not be padded.
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), 1);
+    }
+    float min = input_float_data[0];
+    const int input_size = RequiredBufferSizeForShape(input_shape);
+    for (int i = 0; i < input_size; i++) {
+      min = std::min(min, input_float_data[i]);
+    }
+    output_float_data[0] = min;
+  } else if (unary_op->type == OperatorType::kTensorFlowMax) {
+    // At the moment only full reduction across all dimensions is supported.
+    // TODO(starka): Output should not be padded.
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), 1);
+    }
+    float max = input_float_data[0];
+    const int input_size = RequiredBufferSizeForShape(input_shape);
+    for (int i = 0; i < input_size; i++) {
+      max = std::max(max, input_float_data[i]);
+    }
+    output_float_data[0] = max;
+  } else if (unary_op->type == OperatorType::kTensorFlowRsqrt ||
+             unary_op->type == OperatorType::kTensorFlowSqrt ||
+             unary_op->type == OperatorType::kTensorFlowSquare) {
+    // Element-wise ops. Should have perfectly matching sizes here.
+    const int input_size = RequiredBufferSizeForShape(input_shape);
+    for (int i = 0; i < output_dims_count; i++) {
+      CHECK_EQ(output_shape.dims(i), input_shape.dims(i));
+    }
+
+    for (int i = 0; i < input_size; i++) {
+      const float val = input_float_data[i];
+      float outval = 0.f;
+      if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
+        outval = 1.0f / std::sqrt(val);
+      } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
+        outval = std::sqrt(val);
+      } else if (unary_op->type == OperatorType::kTensorFlowSquare) {
+        outval = val * val;
+      } else {
+        LOG(FATAL) << "should not get here.";
+      }
+      output_float_data[i] = outval;
+    }
+  } else {
+    LOG(FATAL) << "should not get here.";
+  }
+  for (const auto& input : unary_op->inputs) {
+    if (CountOpsWithInput(*model, input) == 1) {
+      model->arrays.erase(input);
+    }
+  }
+  AddMessageF("Resolved constant %s to the equivalent constant array",
+              LogName(*unary_op));
+  model->operators.erase(unary_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d6ac331befe2e8b8edeec8dfafb305a7e0e9236
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_mean_attributes.cc
@@ -0,0 +1,57 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveMeanAttributes::Run(Model* model, std::size_t op_index) {
+  auto* mean_op = model->operators[op_index].get();
+  if (mean_op->type != OperatorType::kMean) return false;
+  auto* op = static_cast<MeanOperator*>(mean_op);
+
+  if (!op->reduction_indices.empty()) {
+    // Attributes already resolved
+    return false;
+  }
+  if (op->inputs.size() != 2) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+
+  const auto& indices_array = *model->arrays[op->inputs[1]];
+  if (!indices_array.has_shape()) return false;
+
+  // We only support simultaneous reduction over width and height.
+  std::vector<int> reduction_indices =
+      indices_array.GetBuffer<ArrayDataType::kInt32>().data;
+  if (reduction_indices.size() != 2) {
+    return false;
+  }
+  if (!((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
+        (reduction_indices[0] == 2 && reduction_indices[1] == 1))) {
+    return false;
+  }
+
+  op->reduction_indices = reduction_indices;
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5f5869c625f419a825f6bd652a04eca1bce4a6f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_pad_attributes.cc
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolvePadAttributes::Run(Model* model, std::size_t op_index) {
+  const auto pad_it = model->operators.begin() + op_index;
+  auto* pad_op = pad_it->get();
+  if (pad_op->type != OperatorType::kPad) return false;
+
+  auto* op = static_cast<PadOperator*>(pad_op);
+  if (!op->left_padding.empty()) return false;
+
+  CHECK_EQ(op->inputs.size(), 2);
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+
+  const auto& array = *model->arrays[op->inputs[1]];
+  if (!array.has_shape()) return false;
+
+  const std::vector<int>& dims = array.shape().dims();
+  CHECK_EQ(dims.size(), 2);
+
+  std::vector<int> buffer = array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  for (int i = 0; i < dims[0]; ++i) {
+    op->left_padding.push_back(buffer[i * 2]);
+    op->right_padding.push_back(buffer[i * 2 + 1]);
+  }
+
+  // TODO(dkalenichenko): Delete the extra input?
+
+  return true;
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8fa7b83bedc0da99c3a5a60f38586f712eeb3c4e
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reorder_axes.cc
@@ -0,0 +1,93 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveReorderAxes::Run(Model* model, std::size_t op_index) {
+  auto reorder_it = model->operators.begin() + op_index;
+  auto* reorder_op = static_cast<ReorderAxesOperator*>(reorder_it->get());
+  if (reorder_op->type != OperatorType::kReorderAxes) {
+    return false;
+  }
+  const auto& input_array_name = reorder_op->inputs[0];
+  const auto& output_array_name = reorder_op->outputs[0];
+  auto& input_array = model->GetArray(input_array_name);
+  auto& output_array = model->GetArray(output_array_name);
+  string constant_input_array_name = input_array_name;
+  if (!input_array.buffer) {
+    const auto* op_producing_input = GetOpWithOutput(*model, input_array_name);
+    if (op_producing_input &&
+        op_producing_input->type == OperatorType::kFakeQuant) {
+      constant_input_array_name = op_producing_input->inputs[0];
+    }
+  }
+  auto& constant_input_array = model->GetArray(constant_input_array_name);
+  if (!constant_input_array.buffer) {
+    return false;
+  }
+  // Yield until output dims have been resolved.
+  if (!output_array.has_shape()) {
+    return false;
+  }
+  // Reorder the input array dims and buffer data
+  CHECK(constant_input_array.buffer->type == ArrayDataType::kFloat);
+  CHECK(!output_array.buffer);
+  auto& input_data =
+      constant_input_array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+  std::vector<float> reordered_data;
+  reordered_data.resize(RequiredBufferSizeForShape(output_array.shape()));
+  const auto input_axes_order = reorder_op->input_axes_order;
+  const auto output_axes_order = reorder_op->output_axes_order;
+  // TODO(b/62904716) Shapes should be used directly.
+  Shape input_shape = constant_input_array.shape();
+  Shape output_shape = output_array.shape();
+  if (AxesCount(input_axes_order) == 2) {
+    UnextendShape(&input_shape, 2);
+    UnextendShape(&output_shape, 2);
+  }
+  ShuffleArray(input_shape, input_axes_order, output_axes_order, output_shape,
+               input_data.data(), reordered_data.data());
+  input_data = reordered_data;
+  input_array.copy_shape(output_array.shape());
+  constant_input_array.copy_shape(output_array.shape());
+
+  // Update the edges of the graph to point to the input array
+  for (const auto& other_op : model->operators) {
+    for (auto& input : other_op->inputs) {
+      if (input == output_array_name) {
+        input = input_array_name;
+      }
+    }
+  }
+
+  AddMessageF("Reordered axes for array %s", input_array_name);
+
+  // Remove the op and output array.
+  model->arrays.erase(output_array_name);
+  model->operators.erase(reorder_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bed2a85bd262c49913f22e522d260c4dc6510246
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_reshape_attributes.cc
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveReshapeAttributes::Run(Model* model, std::size_t op_index) {
+  const auto reshape_it = model->operators.begin() + op_index;
+  auto* reshape_op = reshape_it->get();
+  if (reshape_op->type != OperatorType::kTensorFlowReshape) {
+    return false;
+  }
+
+  auto* op = static_cast<TensorFlowReshapeOperator*>(reshape_op);
+
+  if (!op->shape.empty()) return false;
+
+  if (IsConstantParameterArray(*model, reshape_op->inputs[1])) {
+    const auto& constant_input_array = *model->arrays[reshape_op->inputs[1]];
+    op->shape = constant_input_array.GetBuffer<ArrayDataType::kInt32>().data;
+  }
+
+  if (op->shape.empty()) return false;
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d0a2ec8f6c1f532f23873062534a37e07fff72b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_slice_attributes.cc
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveSliceAttributes::Run(Model* model, std::size_t op_index) {
+  const auto slice_it = model->operators.begin() + op_index;
+  auto* slice_op = slice_it->get();
+  if (slice_op->type != OperatorType::kSlice) return false;
+
+  auto* op = static_cast<SliceOperator*>(slice_op);
+  if (!op->begin.empty()) return false;
+
+  CHECK_EQ(op->inputs.size(), 3);
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
+
+  const auto& begin_array = *model->arrays[op->inputs[1]];
+  if (!begin_array.has_shape()) return false;
+
+  const auto& size_array = *model->arrays[op->inputs[2]];
+  if (!size_array.has_shape()) return false;
+
+  op->begin = begin_array.GetBuffer<ArrayDataType::kInt32>().data;
+  op->size = size_array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  // TODO(dkalenichenko): Delete the extra inputs?
+
+  return true;
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97946182ef07b0c3d826cafa95b2bb47fbaf0125
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_strided_slice_attributes.cc
@@ -0,0 +1,61 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveStridedSliceAttributes::Run(Model* model, std::size_t op_index) {
+  const auto slice_it = model->operators.begin() + op_index;
+  auto* slice_op = slice_it->get();
+  if (slice_op->type != OperatorType::kStridedSlice) return false;
+
+  auto* op = static_cast<StridedSliceOperator*>(slice_op);
+  if (!op->start_indices.empty()) return false;
+
+  CHECK_EQ(op->inputs.size(), 4);
+  if (!IsConstantParameterArray(*model, op->inputs[1])) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[2])) return false;
+  if (!IsConstantParameterArray(*model, op->inputs[3])) return false;
+
+  const auto& start_array = *model->arrays[op->inputs[1]];
+  if (!start_array.has_shape()) return false;
+  if (toco::RequiredBufferSizeForShape(start_array.shape()) != 4) {
+    // Only 4D arrays are supported for now.
+    return false;
+  }
+
+  const auto& stop_array = *model->arrays[op->inputs[2]];
+  if (!stop_array.has_shape()) return false;
+
+  const auto& stride_array = *model->arrays[op->inputs[3]];
+  if (!stride_array.has_shape()) return false;
+
+  op->start_indices = start_array.GetBuffer<ArrayDataType::kInt32>().data;
+  op->stop_indices = stop_array.GetBuffer<ArrayDataType::kInt32>().data;
+  op->strides = stride_array.GetBuffer<ArrayDataType::kInt32>().data;
+
+  // TODO(dkalenichenko): Delete the extra inputs?
+
+  return true;
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b482f5cf51f7bde67e76792439203487402b75ce
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_concat.cc
@@ -0,0 +1,86 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveTensorFlowConcat::Run(Model* model, std::size_t op_index) {
+  auto concat_it = model->operators.begin() + op_index;
+  const auto* tf_concat_op = concat_it->get();
+  if (tf_concat_op->type != OperatorType::kTensorFlowConcat &&
+      tf_concat_op->type != OperatorType::kTensorFlowConcatV2) {
+    return false;
+  }
+
+  CHECK_GE(tf_concat_op->inputs.size(), 2);
+  // TensorFlow Concat and ConcatV2 nodes only differ by the ordering
+  // of inputs: in Concat, the concat_dim is the first input, while in
+  // ConcatV2, it is the last input.
+  std::size_t concat_dim_pos = 0;
+  if (tf_concat_op->type == OperatorType::kTensorFlowConcatV2) {
+    concat_dim_pos = tf_concat_op->inputs.size() - 1;
+  }
+  const string concat_dim_name = tf_concat_op->inputs[concat_dim_pos];
+  std::vector<string> concat_input_names;
+  for (std::size_t i = 0; i < tf_concat_op->inputs.size(); i++) {
+    if (i != concat_dim_pos) {
+      concat_input_names.push_back(tf_concat_op->inputs[i]);
+    }
+  }
+  // If the concat_dim array hasn't been resolved to a constant yet,
+  // we need to yield.
+  const auto& concat_dim_array = model->GetArray(concat_dim_name);
+  if (!concat_dim_array.buffer) {
+    AddMessageF("Waiting for the concat_dim of %s to be resolved to a constant",
+                LogName(*tf_concat_op));
+    return false;
+  }
+
+  CHECK(concat_dim_array.data_type == ArrayDataType::kInt32);
+  const auto& concat_dim_data =
+      concat_dim_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(concat_dim_data.size(), 1);
+  const int concat_dim = concat_dim_data[0];
+
+  // Create the Concatenation op replacing the TensorFlowConcat op.
+  auto* concatenation_op = new ConcatenationOperator;
+  concatenation_op->concat_dim = concat_dim;
+  concatenation_op->inputs = concat_input_names;
+  concatenation_op->outputs = {tf_concat_op->outputs[0]};
+  auto depth_concat_it = model->operators.emplace(concat_it, concatenation_op);
+  CHECK_EQ(depth_concat_it->get(), concatenation_op);
+  // Update invalidated iterator
+  concat_it = depth_concat_it + 1;
+  CHECK_EQ(concat_it->get(), tf_concat_op);
+
+  // Remove the concat_dim array if it is not used by anything else.
+  if (CountOpsWithInput(*model, concat_dim_name) == 1) {
+    model->arrays.erase(concat_dim_name);
+  }
+  // Remove the TensorFlowConcat op
+  model->operators.erase(concat_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bea7487051a58344a56a3186a05d0fdceebc8727
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -0,0 +1,106 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
+  auto matmul_it = model->operators.begin() + op_index;
+  if (matmul_it->get()->type != OperatorType::kTensorFlowMatMul) {
+    return false;
+  }
+  const auto* matmul_op = matmul_it->get();
+
+  // Find the op producing the array passed to this MatMul
+  auto previous_op_it = model->operators.begin();
+  bool found = false;
+  for (; previous_op_it != model->operators.end(); ++previous_op_it) {
+    for (const auto& output : (*previous_op_it)->outputs) {
+      if (output == matmul_op->inputs[0]) {
+        found = true;
+        break;
+      }
+    }
+    if (found) {
+      break;
+    }
+  }
+  Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
+
+  // construct the new FullyConnectedOperator
+  auto* fc_op = new FullyConnectedOperator;
+  fc_op->outputs = matmul_op->outputs;
+
+  // insert the newly constructed FullyConnectedOperator
+  auto fc_it = model->operators.emplace(matmul_it, fc_op);
+
+  // refresh invalidated iterator
+  matmul_it = fc_it + 1;
+  DCHECK_EQ(matmul_it->get(), matmul_op);
+
+  // The way that TensorFlow encodes FullyConnected ops is as a pair
+  // (Reshape, MatMul), so we want to remove the Reshape op and rewrite the
+  // MatMul
+  // op as a FullyConnected. However, TensorFlow skips the Reshape ops if the
+  // input doesn't need reshaping, so we can't just match (Reshape, MatMul)
+  // pairs.
+  if (previous_op && previous_op->type == OperatorType::kTensorFlowReshape) {
+    AddMessageF("Combining %s and %s into %s", LogName(*previous_op),
+                LogName(*matmul_op), LogName(*fc_op));
+    const auto& previous_op_output = previous_op->outputs[0];
+    if (CountOpsWithInput(*model, previous_op_output) == 1) {
+      model->arrays.erase(previous_op_output);
+    }
+    CHECK_EQ(previous_op->inputs.size(), 2);
+    fc_op->inputs = {previous_op->inputs[0], matmul_op->inputs[1]};
+    // Only remove Reshape node if no other node uses its output.
+    if (CountOpsWithInput(*model, previous_op_output) == 1) {
+      const auto& previous_op_shape = previous_op->inputs[1];
+      if (CountOpsWithInput(*model, previous_op_shape) == 1 &&
+          !GetOpWithOutput(*model, previous_op_shape)) {
+        model->arrays.erase(previous_op_shape);
+      }
+      model->operators.erase(previous_op_it);
+    }
+
+    // We may have just invalidated matmul_it, so let's refresh it now.
+    matmul_it = model->operators.begin();
+    for (; matmul_it != model->operators.end(); ++matmul_it) {
+      if (matmul_it->get() == matmul_op) {
+        break;
+      }
+    }
+    CHECK(matmul_it != model->operators.end());
+    CHECK(matmul_it->get() == matmul_op);
+  } else {
+    AddMessageF("Replacing %s by a FullyConnected operator",
+                LogName(*matmul_op));
+    fc_op->inputs = {matmul_op->inputs[0], matmul_op->inputs[1]};
+  }
+
+  // erase the MatMul operator
+  model->operators.erase(matmul_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfa5ce0716523adbfb0a76e89ce3b202f0595763
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_merge.cc
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveTensorFlowMerge::Run(Model* model, std::size_t op_index) {
+  const auto merge_it = model->operators.begin() + op_index;
+  const auto* merge_op = merge_it->get();
+  if (merge_op->type != OperatorType::kTensorFlowMerge) {
+    return false;
+  }
+
+  // We need to yield until this Merge node has only 1 input, which will mean
+  // that that is the selected input. Other graph transformations on other nodes
+  // such as ResolveTensorFlowSwitch, will take care of trimming the
+  // non-selected inputs, so that at some point there will be only 1 input left.
+  if (merge_op->inputs.size() > 1) {
+    AddMessageF("Waiting for %s to be resolved", LogName(*merge_op));
+    return false;
+  }
+
+  // Now that the merge node has 1 input exactly, it is the same as an Identity
+  // node and can be resolved trivially.
+  CHECK_EQ(merge_op->inputs.size(), 1);
+
+  // Update the edges of the graph ahead of removing the node.
+  for (const auto& other_op : model->operators) {
+    for (auto& input : other_op->inputs) {
+      if (input == merge_op->outputs[0]) {
+        input = merge_op->inputs[0];
+      }
+    }
+  }
+
+  // Remove the node and its output array.
+  AddMessageF("Removing already-resolved %s", LogName(*merge_op));
+  model->arrays.erase(merge_op->outputs[0]);
+  model->operators.erase(merge_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_squeeze.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_squeeze.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d3f42b5ec4cab29189c12043d12ea687d684832
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_squeeze.cc
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveTensorFlowSqueeze::Run(Model* model, std::size_t op_index) {
+  const auto squeeze_it = model->operators.begin() + op_index;
+  const auto* squeeze_op = squeeze_it->get();
+  if (squeeze_op->type != OperatorType::kSqueeze) {
+    return false;
+  }
+
+  CHECK_EQ(squeeze_op->inputs.size(), 1);
+  CHECK_EQ(squeeze_op->outputs.size(), 1);
+
+  // If the output is consumed by a reshape op, it's a trivial squeeze.
+  if (CountOpsWithInput(*model, squeeze_op->outputs[0]) == 1) {
+    const auto* next_op = GetOpWithInput(*model, squeeze_op->outputs[0]);
+    if (next_op->type == OperatorType::kTensorFlowReshape) {
+      AddMessageF(
+          "%s is trivial because its output is only consumed by a "
+          "Reshape op",
+          LogName(*squeeze_op));
+
+      return RemoveTrivialPassthroughOp(this, model, op_index);
+    }
+  }
+
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55adfca03739deb35cbeb50c67222768f8a02164
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_switch.cc
@@ -0,0 +1,123 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool ResolveTensorFlowSwitch::Run(Model* model, std::size_t op_index) {
+  const auto switch_it = model->operators.begin() + op_index;
+  const auto* switch_op = switch_it->get();
+  if (switch_op->type != OperatorType::kTensorFlowSwitch) {
+    return false;
+  }
+
+  CHECK_EQ(switch_op->inputs.size(), 2);
+  CHECK_EQ(switch_op->outputs.size(), 2);
+  const string& predicate_name = switch_op->inputs[1];
+  // If the predicate array hasn't been resolved to a constant yet,
+  // we need to yield.
+  if (!IsConstantParameterArray(*model, predicate_name)) {
+    AddMessageF(
+        "Waiting for the boolean predicate of %s to be resolved to a constant",
+        LogName(*switch_op));
+    return false;
+  }
+
+  // The predicate should be boolean, and should consist of a single value.
+  const auto& predicate_array = model->GetArray(predicate_name);
+  CHECK(predicate_array.data_type == ArrayDataType::kBool);
+  for (const auto& dim : predicate_array.shape().dims()) {
+    CHECK_EQ(dim, 1);
+  }
+
+  // Obtain the predicate boolean value.
+  const auto& predicate_data =
+      predicate_array.GetBuffer<ArrayDataType::kBool>().data;
+  CHECK_EQ(predicate_data.size(), 1);
+  const bool predicate_value = predicate_data[0];
+
+  // From the TensorFlow docs on .switch() in
+  // third_party/tensorflow/python/ops/control_flow_ops.py
+  //
+  //    If `pred` is false, the `data` input is forwared to the first output.
+  //    Otherwise, the data goes to the second output.
+  //
+  // Note that this comment used to say the opposite and was recently fixed:
+  // https://github.com/tensorflow/tensorflow/commit/bc456e361d49d1d89a74b80060c70efb51fd7d87#diff-76ab9dafbe12c20ddc3769c6b108986c
+  const int selected_output_index = predicate_value ? 1 : 0;
+  const int nonselected_output_index = predicate_value ? 0 : 1;
+
+  // Update the edges of the graph ahead of removing the node:
+  // edges that were pointing to the selected output, should instead
+  // point to the input of the Switch node.
+  for (const auto& other_op : model->operators) {
+    for (auto& input : other_op->inputs) {
+      if (input == switch_op->outputs[selected_output_index]) {
+        input = switch_op->inputs[0];
+      }
+    }
+  }
+
+  // There remains to handle the edges that were pointing to the nonselected
+  // output. We will just discard those edges. Concretely, at the moment,
+  // our only examples of graphs with Switch nodes have them feeding into Merge
+  // nodes, so what we're saying here is that we'll make the convention,
+  // in our toco internal representation, that Merge nodes with only 1 input
+  // are Merge nodes that have been resolved already and should be have as
+  // Identity nodes, simply forwarding their input.
+  //
+  for (const auto& other_op : model->operators) {
+    auto input_it = other_op->inputs.begin();
+    while (input_it != other_op->inputs.end()) {
+      if (*input_it == switch_op->outputs[nonselected_output_index]) {
+        // Let us guard our assumption that only Merge nodes consume the outputs
+        // of Switch nodes:
+        CHECK(other_op->type == OperatorType::kTensorFlowMerge);
+        input_it = other_op->inputs.erase(input_it);
+      } else {
+        ++input_it;
+      }
+    }
+  }
+
+  // Remove the output arrays if they are now unused.
+  for (int i = 0; i < 2; i++) {
+    if (!GetOpWithInput(*model, switch_op->outputs[i])) {
+      model->arrays.erase(switch_op->outputs[i]);
+    }
+  }
+  // Remove input arrays if they are only used by the switch itself and aren't
+  // the output of another op (will get handled by RemoveUnusedOp in that case).
+  for (const auto& input : switch_op->inputs) {
+    if (CountOpsWithInput(*model, input) == 1 &&
+        !GetOpWithOutput(*model, input)) {
+      model->arrays.erase(input);
+    }
+  }
+  // Remove the switch node itself.
+  AddMessageF("Removing already-resolved %s", LogName(*switch_op));
+  model->operators.erase(switch_it);
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f7e7c42a26b60c96573be6653babb78fdb5fd73
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_tile.cc
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+void RemoveTileOperator(Model* model, Operator* tile_op, Operator* binary_op,
+                        int operand_index) {
+  CHECK(tile_op->type == OperatorType::kTensorFlowTile);
+  CHECK_EQ(binary_op->inputs.size(), 2);
+  CHECK_EQ(tile_op->inputs.size(), 2);
+  const string tile_multiplier_array = tile_op->inputs[1];
+  const string tile_output_array = tile_op->outputs[0];
+  binary_op->inputs[operand_index] = tile_op->inputs[0];
+  auto tile_it = model->operators.begin();
+  for (; tile_it != model->operators.end(); ++tile_it) {
+    if (tile_it->get() == tile_op) {
+      break;
+    }
+  }
+  CHECK(tile_it != model->operators.end());
+  CHECK(tile_it->get() == tile_op);
+  model->operators.erase(tile_it);
+  if (!CountOpsWithInput(*model, tile_multiplier_array) &&
+      !GetOpWithOutput(*model, tile_multiplier_array)) {
+    model->arrays.erase(tile_multiplier_array);
+  }
+  if (!CountOpsWithInput(*model, tile_output_array)) {
+    model->arrays.erase(tile_output_array);
+  }
+}
+}  // namespace
+
+bool ResolveTensorFlowTile::Run(Model* model, std::size_t op_index) {
+  const auto binary_it = model->operators.begin() + op_index;
+  auto* binary_op = binary_it->get();
+  // Test for binary ops of types that we know how to resolve
+  if (binary_op->inputs.size() != 2) {
+    return false;
+  }
+  if (binary_op->type != OperatorType::kAdd &&
+      binary_op->type != OperatorType::kMul &&
+      binary_op->type != OperatorType::kSub &&
+      binary_op->type != OperatorType::kDiv) {
+    return false;
+  }
+
+  Operator* const op[2] = {
+      GetOpWithOutput(*model, binary_op->inputs[0]),
+      GetOpWithOutput(*model, binary_op->inputs[1]),
+  };
+
+  // In the unlikely case where both operands are Tile, we can't infer the
+  // output
+  // size without the Tile nodes, so we have to bail out.
+  if (op[0] && op[0]->type == OperatorType::kTensorFlowTile && op[1] &&
+      op[1]->type == OperatorType::kTensorFlowTile) {
+    return false;
+  }
+
+  for (int i = 0; i < 2; i++) {
+    if (op[i] && op[i]->type == OperatorType::kTensorFlowTile) {
+      // We can only remove a Tile operator is no other op than the present
+      // binary op was consuming its tiled output.
+      if (CountOpsWithInput(*model, binary_op->inputs[i]) == 1) {
+        AddMessageF("Removing %s", LogName(*op[i]));
+        RemoveTileOperator(model, op[i], binary_op, i);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..893149878293c9ef2740effe331d3b6c51b49983
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -0,0 +1,31 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+tf_cc_test(
+    name = "resolve_constant_concatenation_test",
+    srcs = ["resolve_constant_concatenation_test.cc"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:graph_transformations",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6705ad305ac85f7098f40469ebc54fc6fa1b3ab
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
@@ -0,0 +1,221 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+//#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5) {
+  std::vector<testing::Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(testing::FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+}  // namespace
+
+// The following 3 tests make sure the concatenation operation on different axis
+// values match TensorFlow results listed below:
+//
+// x0 = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
+// x1 = [[[10, 11], [12, 13]], [[14, 15], [16, 17]]]
+// x2 = [[[20, 21], [22, 23]], [[24, 25], [26, 27]]]
+// x3 = [[[30, 31], [32, 33]], [[34, 35], [36, 37]]]
+//
+// ConcatAtAxis0 test:
+// t0 = tf.concat([x0, x1, x2, x3], 0)
+// [[[ 0  1]
+//   [ 2  3]]
+//
+//  [[ 4  5]
+//   [ 6  7]]
+//
+//  [[10 11]
+//   [12 13]]
+//
+//  [[14 15]
+//   [16 17]]
+//
+//  [[20 21]
+//   [22 23]]
+//
+//  [[24 25]
+//   [26 27]]
+//
+//  [[30 31]
+//   [32 33]]
+//
+//  [[34 35]
+//   [36 37]]]
+//
+// ConcatAtAxis1 test:
+// t1 = tf.concat([x0, x1, x2, x3], 1)
+// [[[ 0  1]
+//   [ 2  3]
+//   [10 11]
+//   [12 13]
+//   [20 21]
+//   [22 23]
+//   [30 31]
+//   [32 33]]
+//
+//  [[ 4  5]
+//   [ 6  7]
+//   [14 15]
+//   [16 17]
+//   [24 25]
+//   [26 27]
+//   [34 35]
+//   [36 37]]]
+//
+// ConcatAtAxis2 test:
+// t2 = tf.concat([x0, x1, x2, x3], 2)
+// [[[ 0  1 10 11 20 21 30 31]
+//   [ 2  3 12 13 22 23 32 33]]
+//
+//  [[ 4  5 14 15 24 25 34 35]
+//   [ 6  7 16 17 26 27 36 37]]]
+
+class ResolveConstantConcatenationTest : public ::testing::Test {
+ protected:
+  ResolveConstantConcatenationTest() {}
+
+  // Prepare a hypothetical TOCO model with one Concatenation operator in it
+  // together with 4 arrays as its inputs.
+  // It receives the dimension of concatenation as input.
+  void PrepareModel(Model* model, int concat_dim) {
+    std::vector<string> concat_input_names = {"array0", "array1", "array2",
+                                              "array3"};
+
+    const int kDim = 3;
+    const int kElementPerDim = 2;
+    const int kBufSize = 8;
+    const int kNumArrays = 4;
+    static float in_buf[kNumArrays][kBufSize] = {
+        {0., 1., 2., 3., 4., 5., 6., 7.},
+        {10., 11., 12., 13., 14., 15., 16., 17.},
+        {20., 21., 22., 23., 24., 25., 26., 27.},
+        {30., 31., 32., 33., 34., 35., 36., 37.}};
+    int cnt = 0;
+    for (const string& concat_input_name : concat_input_names) {
+      Array& in_array = model->GetOrCreateArray(concat_input_name);
+      in_array.data_type = ArrayDataType::kFloat;
+
+      // Initialize shape for the input  array.
+      Shape* in_array_shape = in_array.mutable_shape();
+      std::vector<int>* in_array_shape_dim = in_array_shape->mutable_dims();
+      for (int i = 0; i < kDim; i++) {
+        in_array_shape_dim->push_back(kElementPerDim);
+      }
+      auto& in_array_buffer =
+          in_array.GetMutableBuffer<toco::ArrayDataType::kFloat>();
+      in_array_buffer.data.resize(kBufSize);
+      float* buf_ptr =
+          in_array.GetMutableBuffer<toco::ArrayDataType::kFloat>().data.data();
+      std::copy(in_buf[cnt], in_buf[cnt] + kBufSize, buf_ptr);
+      cnt++;
+    }
+    auto* concatenation_op = new ConcatenationOperator;
+    concatenation_op->concat_dim = concat_dim;
+    concatenation_op->inputs = concat_input_names;
+    concatenation_op->outputs = {"concat_op_outputs"};
+    Array& out_array = model->GetOrCreateArray(concatenation_op->outputs[0]);
+    out_array.data_type = ArrayDataType::kFloat;
+    Shape* out_array_shape = out_array.mutable_shape();
+    std::vector<int>* out_array_shape_dim = out_array_shape->mutable_dims();
+    out_array_shape_dim->resize(kDim);
+    for (int i = 0; i < kDim; i++) {
+      if (i == concat_dim) {
+        (*out_array_shape_dim)[i] = kNumArrays * kElementPerDim;
+      } else {
+        (*out_array_shape_dim)[i] = kElementPerDim;
+      }
+    }
+    model->operators.push_back(std::unique_ptr<Operator>(concatenation_op));
+  }
+};
+
+TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis0) {
+  Model model;
+  const int concat_dim = 0;
+  PrepareModel(&model, concat_dim);
+
+  GraphTransformationsSet graph_transformation_set;
+  graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
+  EXPECT_THAT(model.arrays.size(), 5);
+  (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
+  EXPECT_THAT(model.arrays.size(), 1);
+
+  auto& concatenated_array = (*model.arrays.begin()).second;
+  EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear(
+                  {0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  10., 11., 12.,
+                   13., 14., 15., 16., 17., 20., 21., 22., 23., 24., 25.,
+                   26., 27., 30., 31., 32., 33., 34., 35., 36., 37.})));
+}
+
+TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis1) {
+  Model model;
+  const int concat_dim = 1;
+  PrepareModel(&model, concat_dim);
+
+  GraphTransformationsSet graph_transformation_set;
+  graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
+  EXPECT_THAT(model.arrays.size(), 5);
+  (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
+  EXPECT_THAT(model.arrays.size(), 1);
+
+  auto& concatenated_array = (*model.arrays.begin()).second;
+  EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear(
+                  {0.,  1.,  2.,  3.,  10., 11., 12., 13., 20., 21., 22.,
+                   23., 30., 31., 32., 33., 4.,  5.,  6.,  7.,  14., 15.,
+                   16., 17., 24., 25., 26., 27., 34., 35., 36., 37.})));
+}
+
+TEST_F(ResolveConstantConcatenationTest, ConcatAtAxis2) {
+  Model model;
+  const int concat_dim = 2;
+  PrepareModel(&model, concat_dim);
+
+  GraphTransformationsSet graph_transformation_set;
+  graph_transformation_set.Add(new toco::ResolveConstantConcatenation);
+  EXPECT_THAT(model.arrays.size(), 5);
+  (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0);
+  EXPECT_THAT(model.arrays.size(), 1);
+
+  auto& concatenated_array = (*model.arrays.begin()).second;
+  EXPECT_THAT(concatenated_array->GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear(
+                  {0.,  1.,  10., 11., 20., 21., 30., 31., 2.,  3.,  12.,
+                   13., 22., 23., 32., 33., 4.,  5.,  14., 15., 24., 25.,
+                   34., 35., 6.,  7.,  16., 17., 26., 27., 36., 37.})));
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e273343df9f3e5ade8f23a2fbd868bcab72c62e
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unfuse_activation_functions.cc
@@ -0,0 +1,73 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+bool UnfuseActivationFunctions::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* op = it->get();
+
+  // If a conv operation has an im2col array, yield: it should be dropped first.
+  if ((op->type == OperatorType::kConv) && (op->outputs.size() == 2)) {
+    return false;
+  }
+
+  Operator* ac_op = nullptr;
+  switch (op->fused_activation_function) {
+    case FusedActivationFunctionType::kRelu:
+      ac_op = new ReluOperator;
+      break;
+    case FusedActivationFunctionType::kRelu6:
+      ac_op = new Relu6Operator;
+      break;
+    case FusedActivationFunctionType::kRelu1:
+      ac_op = new Relu1Operator;
+      break;
+    default:
+      return false;
+  }
+
+  // At this point we know that the op has a fused activation function. At the
+  // moment that only happens with ops having a single output, may be
+  // relaxed in the future.
+  CHECK_EQ(op->outputs.size(), 1);
+
+  // Emplace unfused activation function, drop the fused one.
+  model->operators.emplace(it + 1, ac_op);
+  op->fused_activation_function = FusedActivationFunctionType::kNone;
+
+  // Wire up arrays, constructing a new intermediate array to connect the
+  // op to its new unfused activation function.
+  ac_op->outputs = op->outputs;
+  const string& tmp_array_name =
+      AvailableArrayName(*model, op->outputs[0] + "_unfused");
+  CHECK(!model->arrays.count(tmp_array_name));
+  model->GetOrCreateArray(tmp_array_name);
+  ac_op->inputs = {tmp_array_name};
+  op->outputs = {tmp_array_name};
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34d38f1fcb212a5fcf21c4347c128748e5d801a0
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -0,0 +1,1869 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/import_tensorflow.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "google/protobuf/map.h"
+#include "google/protobuf/text_format.h"
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/strip.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_util.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+using tensorflow::AttrValue;
+using tensorflow::DT_BOOL;
+using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT32;
+using tensorflow::DT_INT64;
+using tensorflow::DT_UINT8;
+using tensorflow::GraphDef;
+using tensorflow::NodeDef;
+using tensorflow::TensorProto;
+using tensorflow::TensorShapeProto;
+
+namespace toco {
+namespace {
+bool HasAttr(const NodeDef& node, const string& attr_name) {
+  return node.attr().count(attr_name) > 0;
+}
+
+const string& GetStringAttr(const NodeDef& node, const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kS);
+  return attr.s();
+}
+
+int GetIntAttr(const NodeDef& node, const string& attr_name) {
+  CHECK(HasAttr(node, attr_name)) << attr_name << " not found in:\n"
+                                  << node.DebugString();
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kI);
+  return attr.i();
+}
+
+float GetFloatAttr(const NodeDef& node, const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kF);
+  return attr.f();
+}
+
+bool GetBoolAttr(const NodeDef& node, const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kB);
+  return attr.b();
+}
+
+tensorflow::DataType GetDataTypeAttr(const NodeDef& node,
+                                     const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kType);
+  return attr.type();
+}
+
+const TensorShapeProto& GetShapeAttr(const NodeDef& node,
+                                     const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kShape);
+  return attr.shape();
+}
+
+const TensorProto& GetTensorAttr(const NodeDef& node, const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kTensor);
+  return attr.tensor();
+}
+
+const AttrValue::ListValue& GetListAttr(const NodeDef& node,
+                                        const string& attr_name) {
+  CHECK(HasAttr(node, attr_name));
+  const auto& attr = node.attr().at(attr_name);
+  CHECK_EQ(attr.value_case(), AttrValue::kList);
+  return attr.list();
+}
+
+ArrayDataType ConvertDataType(tensorflow::DataType dtype) {
+  if (dtype == DT_UINT8)
+    return ArrayDataType::kUint8;
+  else if (dtype == DT_FLOAT)
+    return ArrayDataType::kFloat;
+  else if (dtype == DT_BOOL)
+    return ArrayDataType::kBool;
+  else if (dtype == DT_INT32)
+    return ArrayDataType::kInt32;
+  else if (dtype == DT_INT64)
+    return ArrayDataType::kInt64;
+  else
+    LOG(INFO) << "Unsupported data type in placehoder op: " << dtype;
+  return ArrayDataType::kNone;
+}
+
+void ImportShape(const TFLITE_PROTO_NS::RepeatedPtrField<
+                     tensorflow::TensorShapeProto_Dim>& input_dims,
+                 Shape* shape) {
+  std::vector<int> input_dims_only_sizes;
+  for (auto& d : input_dims) {
+    if (d.size() == 0) {
+      // Some TensorFlow shapes contain a 0 dim, effectively making
+      // them of flat size 0 even though they have other nonzero dims.
+      // This breaks our invariant, that array dims can't be 0.
+      // For now, tweaking this to record a 0-D shape instead.
+      input_dims_only_sizes.clear();
+      break;
+    }
+    input_dims_only_sizes.push_back(d.size());
+  }
+  *shape->mutable_dims() = input_dims_only_sizes;
+}
+
+void ImportFloatArray(const TensorProto& input_tensor, Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_FLOAT);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  ImportShape(input_shape.dim(), output_array->mutable_shape());
+  int input_flat_size = 1;
+  for (int k = 0; k < input_shape.dim_size(); k++) {
+    input_flat_size *= input_shape.dim(k).size();
+  }
+  auto& output_float_data =
+      output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
+  output_float_data.resize(input_flat_size);
+  if (input_tensor.float_val_size()) {
+    for (int i = 0; i < input_tensor.float_val_size(); i++) {
+      output_float_data[i] = input_tensor.float_val(i);
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(float)) {
+    toco::port::CopyToBuffer(input_tensor.tensor_content(),
+                             reinterpret_cast<char*>(output_float_data.data()));
+  } else {
+    LOG(FATAL) << "Neither input_content nor float_val have the right "
+                  "dimensions for this float tensor.";
+  }
+}
+
+void ImportInt32Array(const TensorProto& input_tensor, Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_INT32);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  ImportShape(input_shape.dim(), output_array->mutable_shape());
+  int input_flat_size = 1;
+  for (int k = 0; k < input_shape.dim_size(); k++) {
+    input_flat_size *= input_shape.dim(k).size();
+  }
+  auto& output_int_data =
+      output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
+  output_int_data.resize(input_flat_size);
+  if (input_tensor.int_val_size()) {
+    for (int i = 0; i < input_tensor.int_val_size(); i++) {
+      output_int_data[i] = input_tensor.int_val(i);
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(int32)) {
+    toco::port::CopyToBuffer(input_tensor.tensor_content(),
+                             reinterpret_cast<char*>(output_int_data.data()));
+  } else {
+    LOG(FATAL) << "Neither input_content nor int_val have the right "
+                  "dimensions for this int32 tensor.";
+  }
+}
+
+void ImportInt64Array(const TensorProto& input_tensor, Array* output_array) {
+  CHECK_EQ(input_tensor.dtype(), DT_INT64);
+  const auto& input_shape = input_tensor.tensor_shape();
+  CHECK_LE(input_shape.dim_size(), 4);
+  ImportShape(input_shape.dim(), output_array->mutable_shape());
+  int input_flat_size = 1;
+  for (int k = 0; k < input_shape.dim_size(); k++) {
+    input_flat_size *= input_shape.dim(k).size();
+  }
+  auto& output_int_data =
+      output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
+  output_int_data.resize(input_flat_size);
+  if (input_tensor.int64_val_size()) {
+    for (int i = 0; i < input_tensor.int64_val_size(); i++) {
+      output_int_data[i] = input_tensor.int64_val(i);
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(int64)) {
+    toco::port::CopyToBuffer(input_tensor.tensor_content(),
+                             reinterpret_cast<char*>(output_int_data.data()));
+  } else {
+    LOG(FATAL) << "Neither input_content nor int64_val have the right "
+                  "dimensions for this int64 tensor.";
+  }
+}
+
+// Count the number of inputs of a given node. If
+// `tf_import_flags.drop_control_dependency` is true, count the number of
+// non-control-dependency inputs.
+int GetInputsCount(const NodeDef& node,
+                   const TensorFlowImportFlags& tf_import_flags) {
+  if (tf_import_flags.drop_control_dependency) {
+    for (size_t i = 0; i < node.input_size(); ++i) {
+      if (node.input(i)[0] == '^') {
+        return i;
+      }
+    }
+    return node.input_size();
+  } else {
+    return node.input_size();
+  }
+}
+
+void ConvertConstOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Const");
+  const auto& tensor = GetTensorAttr(node, "value");
+  const auto dtype = GetDataTypeAttr(node, "dtype");
+
+  auto& array = model->GetOrCreateArray(node.name());
+  array.data_type = dtype == DT_FLOAT
+                        ? ArrayDataType::kFloat
+                        : dtype == DT_INT32
+                              ? ArrayDataType::kInt32
+                              : dtype == DT_INT64 ? ArrayDataType::kInt64
+                                                  : ArrayDataType::kNone;
+  if (dtype == DT_FLOAT) {
+    ImportFloatArray(tensor, &array);
+  } else if (dtype == DT_INT32) {
+    ImportInt32Array(tensor, &array);
+  } else if (dtype == DT_INT64) {
+    ImportInt64Array(tensor, &array);
+  } else {
+    // do nothing, silently ignore the Const data. For example, there are consts
+    // of string type. We just make a dummy buffer to indicate that this array
+    // does not rely on external input.
+    array.GetMutableBuffer<ArrayDataType::kNone>();
+  }
+}
+
+void ConvertConvOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Conv2D");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+
+  // We only support NHWC, which is the default data_format.
+  // So if data_format is not defined, we're all good.
+  if (node.attr().count("data_format")) {
+    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
+  }
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+
+  const auto& input_name = node.input(0);
+  const auto& weights_name = node.input(1);
+  const auto& reordered_weights_name = weights_name + "_reordered";
+  // Check if a ReorderAxesOperator was already created for these weights
+  // (that happens when multiple layers share the same weights).
+  const Operator* existing_reorder =
+      GetOpWithOutput(*model, reordered_weights_name);
+  if (existing_reorder) {
+    // Check that it is safe to rely on the _reordered naming of the output
+    // array!
+    CHECK(existing_reorder->type == OperatorType::kReorderAxes);
+  } else {
+    // Create a new ReorderAxesOperator
+    auto* reorder = new ReorderAxesOperator;
+    reorder->inputs = {weights_name};
+    reorder->outputs = {reordered_weights_name};
+    reorder->input_axes_order = AxesOrder::kHWIO;
+    reorder->output_axes_order = AxesOrder::kOHWI;
+    model->operators.emplace_back(reorder);
+  }
+  auto* conv = new ConvOperator;
+  conv->inputs = {input_name, reordered_weights_name};
+  conv->outputs = {node.name()};
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  CHECK_EQ(strides.i(3), 1);
+  conv->stride_height = strides.i(1);
+  conv->stride_width = strides.i(2);
+  const auto& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    conv->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    conv->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  model->operators.emplace_back(conv);
+}
+
+void ConvertDepthwiseConvOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
+  CHECK_EQ(node.op(), "DepthwiseConv2dNative");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+
+  // We only support NHWC, which is the default data_format.
+  // So if data_format is not defined, we're all good.
+  if (node.attr().count("data_format")) {
+    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
+  }
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+
+  const auto& input_name = node.input(0);
+  const auto& weights_name = node.input(1);
+  const auto& reordered_weights_name = weights_name + "_reordered";
+  // Check if a ReorderAxesOperator was already created for these weights
+  // (that happens when multiple layers share the same weights).
+  const Operator* existing_reorder =
+      GetOpWithOutput(*model, reordered_weights_name);
+  if (existing_reorder) {
+    // Check that it is safe to rely on the _reordered naming of the output
+    // array!
+    CHECK(existing_reorder->type == OperatorType::kReorderAxes);
+  } else {
+    // Create a new ReorderAxesOperator
+    auto* reorder = new ReorderAxesOperator;
+    reorder->inputs = {weights_name};
+    reorder->outputs = {reordered_weights_name};
+    reorder->input_axes_order = AxesOrder::kHWIM;
+    reorder->output_axes_order = AxesOrder::k1HWO;
+    model->operators.emplace_back(reorder);
+  }
+  auto* conv = new DepthwiseConvOperator;
+  conv->inputs = {input_name, reordered_weights_name};
+  conv->outputs = {node.name()};
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  CHECK_EQ(strides.i(3), 1);
+  conv->stride_height = strides.i(1);
+  conv->stride_width = strides.i(2);
+  const auto& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    conv->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    conv->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  model->operators.emplace_back(conv);
+}
+
+void ConvertDepthToSpaceOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
+  CHECK_EQ(node.op(), "DepthToSpace");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  auto* op = new DepthToSpaceOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  op->block_size = GetIntAttr(node, "block_size");
+  QCHECK_GE(op->block_size, 2);
+  model->operators.emplace_back(op);
+}
+
+void ConvertSpaceToDepthOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
+  CHECK_EQ(node.op(), "SpaceToDepth");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  auto* op = new SpaceToDepthOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  op->block_size = GetIntAttr(node, "block_size");
+  QCHECK_GE(op->block_size, 2);
+  model->operators.emplace_back(op);
+}
+
+void ConvertBiasAddOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "BiasAdd");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  const auto& input_name = node.input(0);
+  const auto& bias_name = node.input(1);
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  auto* biasadd = new AddOperator;
+  biasadd->inputs.push_back(input_name);
+  biasadd->inputs.push_back(bias_name);
+  biasadd->outputs.push_back(node.name());
+  model->operators.emplace_back(biasadd);
+}
+
+void ConvertReluOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Relu");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  auto* relu = new ReluOperator;
+  relu->inputs.push_back(input_name);
+  relu->outputs.push_back(node.name());
+  model->operators.emplace_back(relu);
+}
+
+void ConvertRelu6Operator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Relu6");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  auto* op = new Relu6Operator;
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertLogisticOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK_EQ(node.op(), "Sigmoid");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  auto* op = new LogisticOperator;
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertTanhOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Tanh");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  auto* op = new TanhOperator;
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertDivOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK(node.op() == "Div" || node.op() == "RealDiv");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new DivOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertIdentityOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK(node.op() == "Identity" || node.op() == "CheckNumerics" ||
+        node.op() == "PlaceholderWithDefault" || node.op() == "StopGradient");
+  auto* op = new TensorFlowIdentityOperator;
+  // Amazingly, some TensorFlow graphs (at least rajeev_lstm.pb) have
+  // identity nodes with multiple inputs, but the other inputs seem
+  // to be gratuitous (in the case of rajeev_lstm.pb, these are
+  // enumerating the LSTM state arrays). We will just ignore extra
+  // inputs beyond the first input.
+  CHECK_GE(node.input_size(), 1);
+  const auto& input_name = node.input(0);
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFakeQuantWithMinMaxArgs(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "FakeQuantWithMinMaxArgs");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new FakeQuantOperator;
+  op->inputs.push_back(node.input(0));
+  op->minmax.reset(new MinMax);
+  auto& minmax = *op->minmax;
+  minmax.min = GetFloatAttr(node, "min");
+  minmax.max = GetFloatAttr(node, "max");
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFakeQuantWithMinMaxVars(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "FakeQuantWithMinMaxVars");
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK(num_inputs == 3 || num_inputs == 4);
+  auto* op = new FakeQuantOperator;
+  for (int i = 0; i < 3; i++) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRsqrtOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Rsqrt");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new TensorFlowRsqrtOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSqrtOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Sqrt");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new TensorFlowSqrtOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSqueezeOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "Squeeze");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new SqueezeOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+
+  const auto& squeeze_dims = GetListAttr(node, "squeeze_dims");
+  for (int i = 0; i < squeeze_dims.i_size(); ++i) {
+    op->squeeze_dims.push_back(squeeze_dims.i(i));
+  }
+
+  model->operators.emplace_back(op);
+}
+
+void ConvertSquareOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "Square");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new TensorFlowSquareOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertAddOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Add");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new AddOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertMulOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Mul");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new MulOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSubOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Sub");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new SubOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSumOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Sum");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowSumOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
+}
+
+void ConvertTileOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Tile");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowTileOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSliceOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Slice");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  auto* op = new SliceOperator;
+  for (int i = 0; i < 3; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertPadOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Pad");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new PadOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertShapeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Shape");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new TensorFlowShapeOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSplitOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Split");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowSplitOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  const int num_split = GetIntAttr(node, "num_split");
+  op->outputs.push_back(node.name());
+  for (int i = 1; i < num_split; i++) {
+    op->outputs.push_back(absl::StrCat(node.name(), ":", i));
+  }
+  op->num_split = num_split;
+  model->operators.emplace_back(op);
+}
+
+void ConvertMergeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Merge");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowMergeOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertSwitchOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "Switch");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowSwitchOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  // Switch operators have two outputs: "name" and "name:1".
+  op->outputs.push_back(node.name() + ":1");
+  model->operators.emplace_back(op);
+}
+void ConvertSoftmaxOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "Softmax");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  auto* softmax = new SoftmaxOperator;
+  softmax->inputs.push_back(input_name);
+  softmax->outputs.push_back(node.name());
+  // TensorFlow's Softmax doesn't seem to admit a 'beta' parameter.
+  CHECK(!node.attr().count("beta"));  // Stab in the dark, just in case.
+  softmax->beta = 1.f;
+  model->operators.emplace_back(softmax);
+}
+
+void ConvertLRNOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "LRN");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  auto* lrn = new LocalResponseNormalizationOperator;
+  lrn->inputs.push_back(input_name);
+  lrn->outputs.push_back(node.name());
+  lrn->range = GetIntAttr(node, "depth_radius");
+  lrn->bias = GetFloatAttr(node, "bias");
+  lrn->alpha = GetFloatAttr(node, "alpha");
+  lrn->beta = GetFloatAttr(node, "beta");
+  model->operators.emplace_back(lrn);
+}
+
+void ConvertMaxPoolOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "MaxPool");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  // We only support NHWC, which is the default data_format.
+  // So if data_format is not defined, we're all good.
+  if (node.attr().count("data_format")) {
+    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
+  }
+  if (HasAttr(node, "T")) {
+    CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  } else {
+    LOG(WARNING) << "Found MaxPool operator missing 'T' attribute";
+  }
+  auto* maxpool = new MaxPoolOperator;
+  maxpool->inputs.push_back(input_name);
+  maxpool->outputs.push_back(node.name());
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  CHECK_EQ(strides.i(3), 1);
+  maxpool->stride_height = strides.i(1);
+  maxpool->stride_width = strides.i(2);
+  const auto& ksize = GetListAttr(node, "ksize");
+  CHECK_EQ(ksize.i_size(), 4);
+  CHECK_EQ(ksize.i(0), 1);
+  CHECK_EQ(ksize.i(3), 1);
+  maxpool->kheight = ksize.i(1);
+  maxpool->kwidth = ksize.i(2);
+  const auto& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    maxpool->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    maxpool->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  model->operators.emplace_back(maxpool);
+}
+
+void ConvertAvgPoolOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "AvgPool");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto& input_name = node.input(0);
+  // We only support NHWC, which is the default data_format.
+  // So if data_format is not defined, we're all good.
+  if (node.attr().count("data_format")) {
+    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
+  }
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  auto* avgpool = new AveragePoolOperator;
+  avgpool->inputs.push_back(input_name);
+  avgpool->outputs.push_back(node.name());
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  CHECK_EQ(strides.i(3), 1);
+  avgpool->stride_height = strides.i(1);
+  avgpool->stride_width = strides.i(2);
+  const auto& ksize = GetListAttr(node, "ksize");
+  CHECK_EQ(ksize.i_size(), 4);
+  CHECK_EQ(ksize.i(0), 1);
+  CHECK_EQ(ksize.i(3), 1);
+  avgpool->kheight = ksize.i(1);
+  avgpool->kwidth = ksize.i(2);
+  const auto& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    avgpool->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    avgpool->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+  }
+  model->operators.emplace_back(avgpool);
+}
+
+void ConvertReshapeOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "Reshape");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowReshapeOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertMatMulOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "MatMul");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  // Transpose flags should be easy to support, but we don't have a
+  // GraphDef with them to test on at the moment.
+  CHECK_EQ(GetBoolAttr(node, "transpose_a"), false);
+  CHECK_EQ(GetBoolAttr(node, "transpose_b"), false);
+  const auto& input_name = node.input(0);
+  const auto& weights_name = node.input(1);
+  const auto& reordered_weights_name = weights_name + "_reordered";
+  // Check if a ReorderAxesOperator was already created for these weights
+  // (that happens when multiple layers share the same weights).
+  const Operator* existing_reorder =
+      GetOpWithOutput(*model, reordered_weights_name);
+  if (existing_reorder) {
+    // Check that it is safe to rely on the _reordered naming of the output
+    // array!
+    CHECK(existing_reorder->type == OperatorType::kReorderAxes);
+  } else {
+    // Create a new ReorderAxesOperator
+    auto* reorder = new ReorderAxesOperator;
+    reorder->inputs = {weights_name};
+    reorder->outputs = {reordered_weights_name};
+    reorder->input_axes_order = AxesOrder::kRC;
+    reorder->output_axes_order = AxesOrder::kCR;
+    model->operators.emplace_back(reorder);
+  }
+  auto* matmul = new TensorFlowMatMulOperator;
+  matmul->inputs = {input_name, reordered_weights_name};
+  matmul->outputs = {node.name()};
+  model->operators.emplace_back(matmul);
+}
+
+void ConvertConcatOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  Operator* op = nullptr;
+  if (node.op() == "Concat") {
+    op = new TensorFlowConcatOperator;
+  } else if (node.op() == "ConcatV2") {
+    op = new TensorFlowConcatV2Operator;
+  } else {
+    LOG(FATAL) << "Expected Concat or ConcatV2";
+  }
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK_GE(num_inputs, 2);
+  CHECK_EQ(num_inputs, 1 + GetIntAttr(node, "N"));
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertAllOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "All");
+  auto* op = new TensorFlowAllOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertAssertOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "Assert");
+  auto* op = new TensorFlowAssertOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertLessOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Less");
+  auto* op = new TensorFlowLessOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertLessEqualOperator(const NodeDef& node,
+                              const TensorFlowImportFlags& tf_import_flags,
+                              Model* model) {
+  CHECK_EQ(node.op(), "LessEqual");
+  auto* op = new TensorFlowLessEqualOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertGreaterOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "Greater");
+  auto* op = new TensorFlowGreaterOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertGreaterEqualOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
+  CHECK_EQ(node.op(), "GreaterEqual");
+  auto* op = new TensorFlowGreaterEqualOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertMaxOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Max");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowMaxOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
+}
+
+void ConvertMinOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Min");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowMinOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
+}
+
+void ConvertMaximumOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "Maximum");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowMaximumOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertMinimumOperator(const NodeDef& node,
+                            const TensorFlowImportFlags& tf_import_flags,
+                            Model* model) {
+  CHECK_EQ(node.op(), "Minimum");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TensorFlowMinimumOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertUnsupportedOperator(const NodeDef& node,
+                                const TensorFlowImportFlags& tf_import_flags,
+                                Model* model) {
+  LOG(INFO) << "Converting unsupported operation: " << node.op();
+  auto* op = new TensorFlowUnsupportedOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  op->tensorflow_op = node.op();
+  node.SerializeToString(&op->tensorflow_node_def);
+  model->operators.emplace_back(op);
+  if (HasAttr(node, "_output_quantized")) {
+    op->quantized = GetBoolAttr(node, "_output_quantized");
+  }
+  if (HasAttr(node, "_output_types")) {
+    const auto& output_types = GetListAttr(node, "_output_types");
+    for (int i = 0; i < output_types.type_size(); ++i) {
+      op->output_data_types.push_back(ConvertDataType(output_types.type(i)));
+    }
+  }
+}
+
+void ConvertStridedSliceOperator(const NodeDef& node,
+                                 const TensorFlowImportFlags& tf_import_flags,
+                                 Model* model) {
+  CHECK_EQ(node.op(), "StridedSlice");
+  CHECK_EQ(node.input_size(), 4);
+
+  // Only a subset of the full TF op functionality is supported now.
+  if (  // No 64-bit indices.
+      GetDataTypeAttr(node, "Index") != DT_INT32 ||
+      // No dimensionality changes.
+      GetIntAttr(node, "new_axis_mask") != 0 ||
+      GetIntAttr(node, "shrink_axis_mask") != 0 ||
+      // No sparse indices.
+      GetIntAttr(node, "ellipsis_mask") != 0 ||
+      // Only 4D tensors are supported.
+      GetIntAttr(node, "begin_mask") > 15 ||
+      GetIntAttr(node, "end_mask") > 15) {
+    ConvertUnsupportedOperator(node, tf_import_flags, model);
+    return;
+  }
+
+  auto* op = new StridedSliceOperator;
+  for (const auto& input : node.input()) {
+    op->inputs.push_back(input);
+  }
+  op->outputs.push_back(node.name());
+
+  op->begin_mask = GetIntAttr(node, "begin_mask");
+  op->ellipsis_mask = GetIntAttr(node, "ellipsis_mask");
+  op->end_mask = GetIntAttr(node, "end_mask");
+  op->new_axis_mask = GetIntAttr(node, "new_axis_mask");
+  op->shrink_axis_mask = GetIntAttr(node, "shrink_axis_mask");
+  model->operators.emplace_back(op);
+}
+
+void ConvertPlaceholderOperator(const NodeDef& node,
+                                const TensorFlowImportFlags& tf_import_flags,
+                                Model* model) {
+  CHECK(node.op() == "Placeholder" || node.op() == "LegacyFedInput");
+  if (node.op() == "Placeholder") {
+    CHECK_EQ(GetInputsCount(node, tf_import_flags), 0);
+  }
+  auto& array = model->GetOrCreateArray(node.name());
+  if (node.attr().count("dtype")) {
+    array.data_type = ConvertDataType(GetDataTypeAttr(node, "dtype"));
+  }
+  if (node.attr().count("shape")) {
+    const auto& shape = GetShapeAttr(node, "shape");
+    auto num_dims = shape.dim_size();
+    bool has_wildcard = false;
+    for (std::size_t i = 0; i < num_dims; i++) {
+      if (shape.dim(i).size() == -1) {
+        has_wildcard = true;
+      }
+    }
+    // TODO(b/62716978): This logic needs to be revisted.  During dims
+    // refactoring it is an interim fix.
+    if (num_dims > 0 && !has_wildcard) {
+      auto& dst_array_dims = *array.mutable_shape()->mutable_dims();
+      dst_array_dims.resize(num_dims);
+      for (std::size_t i = 0; i < num_dims; i++) {
+        dst_array_dims[i] = shape.dim(i).size();
+      }
+    }
+  }
+}
+
+void ConvertNoOpOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {}
+
+void ConvertCastOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Cast");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto tf_src_dtype = GetDataTypeAttr(node, "SrcT");
+  const auto tf_dst_dtype = GetDataTypeAttr(node, "DstT");
+  auto* op = new CastOperator;
+  op->src_data_type = ConvertDataType(tf_src_dtype);
+  op->dst_data_type = ConvertDataType(tf_dst_dtype);
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Floor");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  const auto data_type = GetDataTypeAttr(node, "T");
+  CHECK(data_type == DT_FLOAT);
+  auto* op = new FloorOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertGatherOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CHECK_EQ(node.op(), "Gather");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
+  CHECK(indices_data_type == DT_INT32);
+  auto* op = new GatherOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertResizeBilinearOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
+  CHECK_EQ(node.op(), "ResizeBilinear");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new ResizeBilinearOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertBatchNormWithGlobalNormalizationOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "BatchNormWithGlobalNormalization");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 5);
+
+  // TODO(ahentz): to really match tensorflow we need to add variance_epsilon
+  // to the input, before feeding it into TensorFlowRsqrtOperator.
+  // CHECK_EQ(GetFloatAttr(node, "variance_epsilon"), 0.001f);
+
+  string multiplier = node.name() + "_mul";
+  if (GetBoolAttr(node, "scale_after_normalization")) {
+    // Create graph:
+    //   v -> RSQRT ->
+    //                 MUL  -> multiplier
+    //   gamma  ----->
+    string rsqrt = node.name() + "_rsqrt";
+
+    auto* rsqrt_op = new TensorFlowRsqrtOperator;
+    rsqrt_op->inputs.push_back(node.input(2));
+    rsqrt_op->outputs.push_back(rsqrt);
+    model->operators.emplace_back(rsqrt_op);
+
+    auto* mul_op = new MulOperator;
+    mul_op->inputs.push_back(rsqrt);
+    mul_op->inputs.push_back(node.input(4));
+    mul_op->outputs.push_back(multiplier);
+    model->operators.emplace_back(mul_op);
+  } else {
+    // Create graph:
+    //   v -> RSQRT -> multiplier
+    auto* rsqrt_op = new TensorFlowRsqrtOperator;
+    rsqrt_op->inputs.push_back(node.input(2));
+    rsqrt_op->outputs.push_back(multiplier);
+    model->operators.emplace_back(rsqrt_op);
+  }
+
+  auto* op = new BatchNormalizationOperator;
+  op->global_normalization = true;
+
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(multiplier);
+  op->inputs.push_back(node.input(3));
+  op->outputs.push_back(node.name());
+
+  model->operators.emplace_back(op);
+}
+
+void ConvertFusedBatchNormOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
+  CHECK_EQ(node.op(), "FusedBatchNorm");
+  CHECK_EQ(node.input_size(), 5);
+
+  // Declare shortcuts for the inputs.
+  const string& gamma_input = node.input(1);
+  const string& beta_input = node.input(2);
+  const string& moving_mean_input = node.input(3);
+  const string& moving_variance_input = node.input(4);
+
+  // Create an array holding the epsilon value (typically, 0.001).
+  const string epsilon_array_name = node.name() + "_epsilon_array";
+  auto& epsilon_array = model->GetOrCreateArray(epsilon_array_name);
+  epsilon_array.data_type = ArrayDataType::kFloat;
+  *epsilon_array.mutable_shape()->mutable_dims() = {1};
+  epsilon_array.GetMutableBuffer<ArrayDataType::kFloat>().data.push_back(
+      GetFloatAttr(node, "epsilon"));
+
+  // Add epsilon to the moving variance.
+  const string epsilon_add_op_name = node.name() + "_epsilon";
+  auto* epsilon_add_op = new AddOperator;
+  epsilon_add_op->inputs.push_back(moving_variance_input);
+  epsilon_add_op->inputs.push_back(epsilon_array_name);
+  epsilon_add_op->outputs.push_back(epsilon_add_op_name);
+  model->operators.emplace_back(epsilon_add_op);
+
+  // Take the inverse square root of the (variance + epsilon).
+  const string rsqrt_op_name = node.name() + "_rsqrt";
+  auto* rsqrt_op = new TensorFlowRsqrtOperator;
+  rsqrt_op->inputs.push_back(epsilon_add_op_name);
+  rsqrt_op->outputs.push_back(rsqrt_op_name);
+  model->operators.emplace_back(rsqrt_op);
+
+  // Multiply the result by gamma.
+  const string multiplier = node.name() + "_mul";
+  auto* mul_op = new MulOperator;
+  mul_op->inputs.push_back(rsqrt_op_name);
+  mul_op->inputs.push_back(gamma_input);
+  mul_op->outputs.push_back(multiplier);
+  model->operators.emplace_back(mul_op);
+
+  // Now we have all required inputs for the BatchNormalizationOperator.
+  auto* op = new BatchNormalizationOperator;
+  op->global_normalization = true;
+
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(moving_mean_input);
+  op->inputs.push_back(multiplier);
+  op->inputs.push_back(beta_input);
+  op->outputs.push_back(node.name());
+
+  model->operators.emplace_back(op);
+}
+
+void ConvertSpaceToBatchNDOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
+  CHECK_EQ(node.op(), "SpaceToBatchND");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  CHECK_EQ(GetDataTypeAttr(node, "Tblock_shape"), DT_INT32);
+  CHECK_EQ(GetDataTypeAttr(node, "Tpaddings"), DT_INT32);
+  auto* op = new SpaceToBatchNDOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertBatchToSpaceNDOperator(const NodeDef& node,
+                                   const TensorFlowImportFlags& tf_import_flags,
+                                   Model* model) {
+  CHECK_EQ(node.op(), "BatchToSpaceND");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  CHECK_EQ(GetDataTypeAttr(node, "Tblock_shape"), DT_INT32);
+  CHECK_EQ(GetDataTypeAttr(node, "Tcrops"), DT_INT32);
+  auto* op = new BatchToSpaceNDOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertMeanOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Mean");
+  CHECK_EQ(node.input_size(), 2);
+  auto* op = new MeanOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  if (HasAttr(node, "keep_dims")) {
+    op->keep_dims = GetBoolAttr(node, "keep_dims");
+  }
+}
+
+void ConvertSvdfOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Svdf");
+  bool has_bias = (node.input_size() == 4);
+  auto* op = new SvdfOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  if (has_bias) {
+    op->inputs.push_back(node.input(3));
+  }
+  op->outputs.push_back(node.name() + "_state");
+  op->outputs.push_back(node.name());
+  if (node.attr().at("ActivationFunction").s() == "Relu") {
+    op->fused_activation_function = FusedActivationFunctionType::kRelu;
+  } else {
+    op->fused_activation_function = FusedActivationFunctionType::kNone;
+  }
+  op->rank = node.attr().at("Rank").i();
+  model->operators.emplace_back(op);
+}
+
+// This is just bare bones support to get the shapes to propagate.
+void ConvertTransposeConvOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
+  CHECK_EQ(node.op(), "Conv2DBackpropInput");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  auto* op = new TransposeConvOperator;
+  op->inputs.push_back(node.input(2));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  const auto& strides = GetListAttr(node, "strides");
+  CHECK_EQ(strides.i_size(), 4);
+  CHECK_EQ(strides.i(0), 1);
+  op->stride_height = strides.i(1);
+  op->stride_width = strides.i(2);
+  CHECK_EQ(strides.i(3), 1);
+  auto const& padding = GetStringAttr(node, "padding");
+  if (padding == "SAME") {
+    op->padding.type = PaddingType::kSame;
+  } else if (padding == "VALID") {
+    op->padding.type = PaddingType::kValid;
+  } else {
+    LOG(FATAL) << "Only SAME and VALID padding supported on "
+                  "Conv2DBackpropInput nodes.";
+  }
+  model->operators.emplace_back(op);
+}
+
+void ConvertExpandDimsOperator(const NodeDef& node,
+                               const TensorFlowImportFlags& tf_import_flags,
+                               Model* model) {
+  CHECK_EQ(node.op(), "ExpandDims");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new ExpandDimsOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFillOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Fill");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new FillOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorDivOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK_EQ(node.op(), "FloorDiv");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new FloorDivOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertFloorModOperator(const NodeDef& node,
+                             const TensorFlowImportFlags& tf_import_flags,
+                             Model* model) {
+  CHECK(node.op() == "FloorMod");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new FloorModOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRangeOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "Range");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 3);
+  auto* op = new RangeOperator;
+  if (HasAttr(node, "Tidx")) {
+    const auto dtype = toco::GetDataTypeAttr(node, "Tidx");
+    CHECK(dtype == DT_UINT8 || dtype == DT_INT32 || dtype == DT_INT64 ||
+          dtype == DT_FLOAT);
+    op->dtype = ConvertDataType(dtype);
+  }
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertRankOperator(const NodeDef& node,
+                         const TensorFlowImportFlags& tf_import_flags,
+                         Model* model) {
+  CHECK_EQ(node.op(), "Rank");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 1);
+  auto* op = new RankOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertStackOperator(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK((node.op() == "Stack") || (node.op() == "Pack"));
+  auto* op = new StackOperator;
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK_GE(num_inputs, 1);
+  CHECK_EQ(num_inputs, GetIntAttr(node, "N"));
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  // Both "Stack" and "Pack" have the "axis" attribute.
+  op->axis = GetIntAttr(node, "axis");
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+void ConvertTransposeOperator(const NodeDef& node,
+                              const TensorFlowImportFlags& tf_import_flags,
+                              Model* model) {
+  CHECK_EQ(node.op(), "Transpose");
+  CHECK_EQ(GetInputsCount(node, tf_import_flags), 2);
+  auto* op = new TransposeOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+}
+
+// Some TensorFlow ops only occur in graph cycles, representing
+// control flow. We do not currently support control flow, so we wouldn't
+// be able to fully support such graphs, including performing inference,
+// anyway. However, rather than erroring out early on graphs being cyclic,
+// it helps to at least support these just enough to allow getting a
+// graph visualization. This is not trivial, as we require graphs to be
+// acyclic aside from RNN back-edges. The solution is to special-case
+// such ops as RNN back-edges, which is technically incorrect (does not
+// allow representing the op's semantics) but good enough to get a
+// graph visualization.
+void ConvertOperatorSpecialCasedAsRNNBackEdge(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  // At the moment, the only type of operator special-cased in this way is
+  // NextIteration, occuring only in control-flow cycles.
+  CHECK_EQ(node.op(), "NextIteration");
+  CHECK_EQ(node.input_size(), 1);
+  auto* rnn_state = model->flags.add_rnn_states();
+  // This RNN state is not explicitly created by the user, so it's
+  // OK for some later graph transformation to discard it.
+  rnn_state->set_discardable(true);
+  rnn_state->set_state_array(node.name());
+  rnn_state->set_back_edge_source_array(node.input(0));
+}
+
+void StripCaretFromArrayNames(Model* model) {
+  for (auto& op : model->operators) {
+    for (auto& input : op->inputs) {
+      input = string(absl::StripPrefix(input, "^"));
+    }
+    for (auto& output : op->outputs) {
+      output = string(absl::StripPrefix(output, "^"));
+    }
+  }
+  for (auto& array : model->arrays) {
+    if (absl::StartsWith(array.first, "^")) {
+      LOG(FATAL) << "What?";
+    }
+  }
+}
+
+void StripZeroOutputIndexFromInputs(NodeDef* node) {
+  for (auto& input : *node->mutable_input()) {
+    input = string(absl::StripSuffix(input, ":0"));
+  }
+}
+
+// In TensorFlow GraphDef, when a node has multiple outputs, they are named
+// name:0, name:1, ...
+// where 'name' is the node's name(). Just 'name' is an equivalent shorthand
+// form for name:0.
+// A TensorFlow GraphDef does not explicitly list all the outputs of each node
+// (unlike inputs), it being implied by the node's name and operator type
+// (the latter implies the number of outputs).
+// This makes it non-trivial for us to reconstruct the list of all arrays
+// present in the graph and, for each operator, the list of its outputs.
+// We do that by taking advantage of the fact that
+// at least each node lists explicitly its inputs, so after we've loaded
+// all nodes, we can use that information.
+void AddExtraOutputs(Model* model) {
+  // Construct the list of all arrays consumed by anything in the graph.
+  std::vector<string> consumed_arrays;
+  // Add arrays consumed by an op.
+  for (const auto& consumer_op : model->operators) {
+    for (const string& input : consumer_op->inputs) {
+      consumed_arrays.push_back(input);
+    }
+  }
+  // Add global outputs of the model.
+  for (const string& output_array : model->flags.output_arrays()) {
+    consumed_arrays.push_back(output_array);
+  }
+  // Add arrays consumed by a RNN back-edge.
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    consumed_arrays.push_back(rnn_state.back_edge_source_array());
+  }
+  // Now add operator outputs so that all arrays that are consumed,
+  // are produced.
+  for (const string& consumed_array : consumed_arrays) {
+    // Split the consumed array name into the form name:output_index.
+    const std::vector<string>& split = absl::StrSplit(consumed_array, ':');
+    // If not of the form name:output_index, then this is not an additional
+    // output of a node with multiple outputs, so nothing to do here.
+    if (split.size() != 2) {
+      continue;
+    }
+    int output_index = 0;
+    if (!absl::SimpleAtoi(split[1], &output_index)) {
+      continue;
+    }
+    // Each op is initially recorded as producing at least the array that
+    // has its name. We use that to identify the producer node.
+    auto* producer_op = GetOpWithOutput(*model, split[0]);
+    if (!producer_op) {
+      continue;
+    }
+    // Add extra outputs to that producer node, all the way to the
+    // output_index.
+    while (producer_op->outputs.size() <= output_index) {
+      using toco::port::StringF;
+      producer_op->outputs.push_back(
+          StringF("%s:%d", split[0], producer_op->outputs.size()));
+    }
+  }
+}
+
+bool InlineAllFunctions(GraphDef* graphdef) {
+  if (graphdef->library().function().empty()) {
+    VLOG(kLogLevelModelUnchanged) << "No functions to inline.";
+    return false;
+  }
+
+  // Override "_noinline" attribute on all functions
+  GraphDef graphdef_copy(*graphdef);
+  for (auto& function :
+       (*graphdef_copy.mutable_library()->mutable_function())) {
+    auto* attributes = function.mutable_attr();
+    if (attributes->count(tensorflow::kNoInlineAttr) != 0) {
+      (*attributes)[tensorflow::kNoInlineAttr].set_b(false);
+    }
+  }
+
+  // Construct minimum resources needed to use ExpandInlineFunctions().
+  tensorflow::SessionOptions options;
+  auto* device_count = options.config.mutable_device_count();
+  device_count->insert({"CPU", 1});
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
+      options, "/job:localhost/replica:0/task:0", &devices));
+
+  tensorflow::FunctionLibraryDefinition fld(tensorflow::OpRegistry::Global(),
+                                            graphdef_copy.library());
+  tensorflow::DeviceMgr device_mgr(devices);
+  tensorflow::OptimizerOptions o_opts;
+  tensorflow::ProcessFunctionLibraryRuntime pflr(
+      &device_mgr, tensorflow::Env::Default(), TF_GRAPH_DEF_VERSION, &fld,
+      o_opts, nullptr);
+  tensorflow::FunctionLibraryRuntime* flr;
+  flr = pflr.GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+
+  tensorflow::Graph graph(fld);
+  tensorflow::GraphConstructorOptions gc_opts;
+  const auto& tf_convert_status =
+      tensorflow::ConvertGraphDefToGraph(gc_opts, graphdef_copy, &graph);
+  if (!tf_convert_status.ok()) {
+    LOG(ERROR) << "tensorflow::ConvertGraphDefToGraph failed with status: "
+               << tf_convert_status.ToString();
+    return false;
+  }
+
+  // Iterate over the graph until there are no more nodes to be inlined.
+  bool graph_modified = false;
+  while (tensorflow::ExpandInlineFunctions(flr, &graph)) {
+    graph_modified = true;
+  }
+
+  // Output inlined graph
+  if (graph_modified) {
+    LOG(INFO) << "Found and inlined TensorFlow functions.";
+    graph.ToGraphDef(graphdef);
+  }
+  return graph_modified;
+}
+}  // namespace
+
+std::unique_ptr<Model> ImportTensorFlowGraphDef(
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const GraphDef& tf_graph) {
+  LogDumpGraphDef(kLogLevelModelChanged, "AT IMPORT", tf_graph);
+
+  GraphDef inlined_graph(tf_graph);
+  if (InlineAllFunctions(&inlined_graph)) {
+    LogDumpGraphDef(kLogLevelModelChanged, "AFTER INLINING", inlined_graph);
+  }
+
+  // Check input and output specification.
+  for (const auto& specified_input_array : model_flags.input_arrays()) {
+    CHECK(!absl::EndsWith(specified_input_array.name(), ":0"))
+        << "Unsupported explicit zero output index: "
+        << specified_input_array.name();
+  }
+  for (const string& specified_output_array : model_flags.output_arrays()) {
+    CHECK(!absl::EndsWith(specified_output_array, ":0"))
+        << "Unsupported explicit zero output index: " << specified_output_array;
+  }
+
+  Model* model = new Model;
+
+  for (auto node : inlined_graph.node()) {
+    StripZeroOutputIndexFromInputs(&node);
+    if (node.op() == "Const") {
+      ConvertConstOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Conv2D") {
+      ConvertConvOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Conv2DBackpropInput") {
+      ConvertTransposeConvOperator(node, tf_import_flags, model);
+    } else if (node.op() == "DepthwiseConv2dNative") {
+      ConvertDepthwiseConvOperator(node, tf_import_flags, model);
+    } else if (node.op() == "DepthToSpace") {
+      ConvertDepthToSpaceOperator(node, tf_import_flags, model);
+    } else if (node.op() == "SpaceToDepth") {
+      ConvertSpaceToDepthOperator(node, tf_import_flags, model);
+    } else if (node.op() == "BiasAdd") {
+      ConvertBiasAddOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Relu") {
+      ConvertReluOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Relu6") {
+      ConvertRelu6Operator(node, tf_import_flags, model);
+    } else if (node.op() == "Sigmoid") {
+      ConvertLogisticOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Tanh") {
+      ConvertTanhOperator(node, tf_import_flags, model);
+    } else if (node.op() == "MaxPool") {
+      ConvertMaxPoolOperator(node, tf_import_flags, model);
+    } else if (node.op() == "AvgPool") {
+      ConvertAvgPoolOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Reshape") {
+      ConvertReshapeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "MatMul") {
+      ConvertMatMulOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Div" || node.op() == "RealDiv") {
+      ConvertDivOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Identity" || node.op() == "CheckNumerics" ||
+               node.op() == "StopGradient") {
+      ConvertIdentityOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FakeQuantWithMinMaxVars") {
+      ConvertFakeQuantWithMinMaxVars(node, tf_import_flags, model);
+    } else if (node.op() == "FakeQuantWithMinMaxArgs") {
+      ConvertFakeQuantWithMinMaxArgs(node, tf_import_flags, model);
+    } else if (node.op() == "Rsqrt") {
+      ConvertRsqrtOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Squeeze") {
+      ConvertSqueezeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Sqrt") {
+      ConvertSqrtOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Square") {
+      ConvertSquareOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Add") {
+      ConvertAddOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Mul") {
+      ConvertMulOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Sub") {
+      ConvertSubOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Sum") {
+      ConvertSumOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Tile") {
+      ConvertTileOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Concat" || node.op() == "ConcatV2") {
+      ConvertConcatOperator(node, tf_import_flags, model);
+    } else if (node.op() == "LRN") {
+      ConvertLRNOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Softmax") {
+      ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "All") {
+      ConvertAllOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Assert") {
+      ConvertAssertOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Less") {
+      ConvertLessOperator(node, tf_import_flags, model);
+    } else if (node.op() == "LessEqual") {
+      ConvertLessEqualOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Greater") {
+      ConvertGreaterOperator(node, tf_import_flags, model);
+    } else if (node.op() == "GreaterEqual") {
+      ConvertGreaterEqualOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Max") {
+      ConvertMaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Min") {
+      ConvertMinOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Maximum") {
+      ConvertMaximumOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Minimum") {
+      ConvertMinimumOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Merge") {
+      ConvertMergeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Pad") {
+      ConvertPadOperator(node, tf_import_flags, model);
+    } else if (node.op() == "StridedSlice") {
+      ConvertStridedSliceOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Shape") {
+      ConvertShapeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Slice") {
+      ConvertSliceOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Split") {
+      ConvertSplitOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Switch") {
+      ConvertSwitchOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Placeholder") {
+      ConvertPlaceholderOperator(node, tf_import_flags, model);
+    } else if (node.op() == "PlaceholderWithDefault") {
+      ConvertIdentityOperator(node, tf_import_flags, model);
+    } else if (node.op() == "LegacyFedInput") {
+      ConvertPlaceholderOperator(node, tf_import_flags, model);
+    } else if (node.op() == "NoOp") {
+      ConvertNoOpOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Cast") {
+      ConvertCastOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Floor") {
+      ConvertFloorOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Gather") {
+      ConvertGatherOperator(node, tf_import_flags, model);
+    } else if (node.op() == "ResizeBilinear") {
+      ConvertResizeBilinearOperator(node, tf_import_flags, model);
+    } else if (node.op() == "BatchNormWithGlobalNormalization") {
+      ConvertBatchNormWithGlobalNormalizationOperator(node, tf_import_flags,
+                                                      model);
+    } else if (node.op() == "FusedBatchNorm") {
+      ConvertFusedBatchNormOperator(node, tf_import_flags, model);
+    } else if (node.op() == "SpaceToBatchND") {
+      ConvertSpaceToBatchNDOperator(node, tf_import_flags, model);
+    } else if (node.op() == "BatchToSpaceND") {
+      ConvertBatchToSpaceNDOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Mean") {
+      ConvertMeanOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Svdf") {
+      ConvertSvdfOperator(node, tf_import_flags, model);
+    } else if (node.op() == "NextIteration") {
+      ConvertOperatorSpecialCasedAsRNNBackEdge(node, tf_import_flags, model);
+    } else if (node.op() == "ExpandDims") {
+      ConvertExpandDimsOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Fill") {
+      ConvertFillOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FloorDiv") {
+      ConvertFloorDivOperator(node, tf_import_flags, model);
+    } else if (node.op() == "FloorMod") {
+      ConvertFloorModOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Range") {
+      ConvertRangeOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Rank") {
+      ConvertRankOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Stack" || node.op() == "Pack") {
+      ConvertStackOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Transpose") {
+      ConvertTransposeOperator(node, tf_import_flags, model);
+    } else {
+      ConvertUnsupportedOperator(node, tf_import_flags, model);
+    }
+  }
+
+  ResolveModelFlags(model_flags, model);
+
+  StripCaretFromArrayNames(model);
+  AddExtraOutputs(model);
+  FixNoMissingArray(model);
+  FixNoOrphanedArray(model);
+  FixOperatorOrdering(model);
+  CheckInvariants(*model);
+
+  // if rnn state arrays are constant, make them transient
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    model->GetArray(rnn_state.state_array()).buffer = nullptr;
+  }
+
+  return std::unique_ptr<Model>(model);
+}
+
+std::unique_ptr<Model> ImportTensorFlowGraphDef(
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const string& input_file_contents) {
+  std::unique_ptr<GraphDef> tf_graph(new GraphDef);
+  CHECK(ParseFromStringEitherTextOrBinary(input_file_contents, tf_graph.get()));
+
+  std::unique_ptr<GraphDef> pruned_graph =
+      MaybeReplaceCompositeSubgraph(*tf_graph);
+  if (pruned_graph) {
+    tf_graph = std::move(pruned_graph);
+  }
+  return ImportTensorFlowGraphDef(model_flags, tf_import_flags, *tf_graph);
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.h b/tensorflow/contrib/lite/toco/import_tensorflow.h
new file mode 100644
index 0000000000000000000000000000000000000000..312e3b8f17cfaa012bf25696937f97d396802bb2
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.h
@@ -0,0 +1,42 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
+
+#include <memory>
+#include <string>
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace toco {
+
+struct TensorFlowImportFlags {
+  // If true, control dependencies will be dropped immediately
+  // during the import of the TensorFlow GraphDef.
+  bool drop_control_dependency = false;
+};
+
+std::unique_ptr<Model> ImportTensorFlowGraphDef(
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const tensorflow::GraphDef& graph_def);
+
+std::unique_ptr<Model> ImportTensorFlowGraphDef(
+    const ModelFlags& model_flags, const TensorFlowImportFlags& tf_import_flags,
+    const string& input_file_contents);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_IMPORT_TENSORFLOW_H_
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
new file mode 100644
index 0000000000000000000000000000000000000000..57911b1e89e41354225e732d40b57b043df31848
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -0,0 +1,1513 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
+
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+enum class OperatorType {
+  kNone,
+  // General-purpose neural network operators.
+  kAdd,
+  kAveragePool,
+  kBatchNormalization,
+  kConv,
+  kConcatenation,
+  kDepthwiseConv,
+  kDepthToSpace,
+  kSpaceToDepth,
+  kDequantize,
+  kDiv,
+  kExpandDims,
+  kFill,
+  kFloorDiv,
+  kFloorMod,
+  kFullyConnected,
+  kL2Normalization,
+  kL2Pool,
+  kLstmCell,
+  kLocalResponseNormalization,
+  kLogistic,
+  kMaxPool,
+  kFakeQuant,
+  kMul,
+  kRange,
+  kRank,
+  kRelu,
+  kRelu1,
+  kRelu6,
+  kSoftmax,
+  kSub,
+  kTanh,
+  kTransposeConv,
+  kCast,
+  kFloor,
+  kGather,
+  kResizeBilinear,
+  kSpaceToBatchND,
+  kStack,
+  kBatchToSpaceND,
+  kPad,
+  kStridedSlice,
+  kSlice,
+  kSqueeze,
+  kMean,
+  // The SVDF Op is a decomposition of a densely connected Op into
+  // low rank filters. For details:
+  // https://research.google.com/pubs/pub43813.html
+  kSvdf,
+  // Special operators used for importing TensorFlow nodes.
+  // The general intent is to have some graph transformation either
+  // drop them or rewrite them as general-purpose operators.
+  kTensorFlowAll,
+  kTensorFlowAssert,
+  kTensorFlowConcat,
+  kTensorFlowConcatV2,
+  kTensorFlowGreater,
+  kTensorFlowGreaterEqual,
+  kTensorFlowIdentity,
+  kTensorFlowLess,
+  kTensorFlowLessEqual,
+  kTensorFlowMax,
+  kTensorFlowMaximum,
+  kTensorFlowMin,
+  kTensorFlowMinimum,
+  kTensorFlowMatMul,
+  kTensorFlowMerge,
+  kTensorFlowReshape,
+  kTensorFlowRsqrt,
+  kTensorFlowShape,
+  kTensorFlowSplit,
+  kTensorFlowSqrt,
+  kTensorFlowSquare,
+  kTensorFlowSum,
+  kTensorFlowSwitch,
+  kTensorFlowTile,
+  kTranspose,
+  // An unsupported TF operation. It's only needed to be able to represent TF
+  // graph internally and is expected to be dropped by graph transformations.
+  kTensorFlowUnsupported,
+  // Finally, TensorFlow uses different conventions for axes ordering,
+  // see AxesOrder, and this cannot always be resolved at the time of importing
+  // nodes, as TensorFlow parameters may be constant-expression subgraphs
+  // instead of being given as plain constant arrays. So we need to insert
+  // special nodes in the graph to shuffle axes.
+  kReorderAxes,
+};
+
+// Helper to deal with TensorFlow arrays using a different ordering of
+// dimensions
+// ("axes") than our own.
+// TODO(benoitjacob): Ultimately, we shouldn't have any "ordering" of axes,
+// we should have associative arrays mapping symbolic axes identifiers (like
+// "output_depth") to dimensions. We would then not need this anymore.
+enum class AxesOrder {
+  kOneAxis,  // one-dimensional array, one unique axis.
+  kCR,       // column-major matrix storage order. Our standard.
+  kRC,       // row-major matrix storage order. TensorFlow default.
+  kOHWI,     // Our standard for conv weights
+  kHWIO,     // TensorFlow conv weights
+  k1HWO,     // Our standard for DepthwiseConv weights
+  kHWIM,     // TensorFlow DepthwiseConv weights
+  kNHWC,     // TensorFlow activations
+};
+
+// The type of the scalars in an array.
+// Note that that does not by itself tell whether the values in the array are
+// real (are literally interpreted as real numbers) or quantized (only acquire
+// a meaning as real numbers in conjunction with QuantizationParams).
+//
+// In practice though:
+//   float values are always real
+//   uint8 values are always quantized
+//   int32 values are either real or quantized (depending on whether
+//   QuantizationParams are present).
+//   other types are unused at the moment.
+//
+// kNone means that we don't know the data type yet, or that we don't care
+// because we'll be dropping the array anyway (e.g. some exotic array types
+// may be involved only in debug-only subgraphs that we may not be interested
+// in actually supporting).
+enum class ArrayDataType { kNone, kBool, kFloat, kUint8, kInt32, kInt64 };
+
+// Compile-time logic to map ArrayDataType to the corresponding C++ scalar type
+template <ArrayDataType A>
+struct DataTypeImpl {};
+template <>
+struct DataTypeImpl<ArrayDataType::kNone> {
+  typedef int Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kBool> {
+  typedef bool Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kFloat> {
+  typedef float Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kUint8> {
+  typedef uint8 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kInt32> {
+  typedef int32 Type;
+};
+template <>
+struct DataTypeImpl<ArrayDataType::kInt64> {
+  typedef int64 Type;
+};
+
+template <ArrayDataType A>
+using DataType = typename DataTypeImpl<A>::Type;
+
+// Base class for type-specific buffer types.
+struct GenericBuffer {
+  // Non-default-constructible: only ArrayDataType-specific subclass
+  // objects may be constructed.
+  GenericBuffer() = delete;
+  // Non-copyable-or-movable: we should only store pointers-to-Buffer
+  // in containers, not Operators themselves, so there should be no
+  // copy or move.
+  GenericBuffer(const GenericBuffer&) = delete;
+  GenericBuffer(const GenericBuffer&&) = delete;
+
+  // We need a virtual destructor so we can store pointers-to-Buffer
+  // in containers and have the containers call the right subclass destructor.
+  virtual ~GenericBuffer() {}
+
+  const ArrayDataType type;
+
+ protected:
+  // Constructor used by subclasses for specific ArrayDataType's.
+  explicit GenericBuffer(ArrayDataType t) : type(t) {}
+};
+
+// Type-specific buffer, containing type-specific storage.
+template <ArrayDataType A>
+struct Buffer : GenericBuffer {
+  Buffer() : GenericBuffer(A) {}
+
+  std::vector<DataType<A>> data;
+};
+
+// Base class for all operator classes.
+struct Operator {
+  // Non-default-constructible: only OperatorType-specific subclass
+  // objects may be constructed.
+  Operator() = delete;
+  // Non-copyable-or-movable: we should only store pointers-to-Operator
+  // in containers, not Operators themselves, so there should be no
+  // copy or move.
+  Operator(const Operator&) = delete;
+  Operator(const Operator&&) = delete;
+
+  // We need a virtual destructor so we can store pointers-to-Operator
+  // in containers and have the containers call the right subclass destructor.
+  virtual ~Operator() {}
+
+  // The specific type of operator. Corresponds 1:1 to subclasses.
+  const OperatorType type;
+
+  // The activation function that may be fused into this operator,
+  // or None if no activation function is fused.
+  FusedActivationFunctionType fused_activation_function;
+
+  // Input arrays: either activation arrays or constant array parameters.
+  // We refer to them by their name, not by their address; the mapping of
+  // names to addresses is given by the Model, which owns both Operator's and
+  // Array's. Thus, an Operator on its own doesn't contain much information,
+  // it is meant to be used in conjunction with the Model that owns it.
+  std::vector<string> inputs;
+
+  // Output activation arrays. Same comments as for inputs apply here too.
+  std::vector<string> outputs;
+
+  // If true, the array has more outputs than are listed in the 'outputs'
+  // member. These need to be resolved by some graph transformation.
+  // This flag is only here to indicate that an operator should not be
+  // discarded as unused, even if from its 'outputs' member alone it
+  // looks unused.
+  bool unresolved_outputs = false;
+
+ protected:
+  // Constructor used by subclasses for specific OperatorType's.
+  explicit Operator(OperatorType t)
+      : type(t),
+        fused_activation_function(FusedActivationFunctionType::kNone) {}
+};
+
+// Padding types for Conv-like operators. This is how padding is typically
+// specified in model files. But for inference, we will need to resolve this
+// to a FixedPadding, see below.
+enum class PaddingType { kNone, kSame, kValid };
+
+// Padding as resolved for a specific layer shape, as needed for inference.
+// For a given layer shape, a given padding type will resolve to a choice of
+// a number of padding rows and columns, which we call the padding height and
+// width respectively.
+struct FixedPadding {
+  int width = 0;
+  int height = 0;
+};
+
+// "Universal" padding struct containing both a generic PaddingType (as
+// represented in a model file), and a FixedPadding (as needed for inference).
+// The latter is resolved during the PropagateFixedSizes pass.
+struct Padding {
+  FixedPadding& GetOrCreateFixedPadding() {
+    if (!fixed) {
+      FixedPadding* ptr = new FixedPadding;
+      fixed = std::unique_ptr<FixedPadding>(ptr);
+    }
+    return *fixed;
+  }
+
+  Padding() : type(PaddingType::kNone) {}
+  PaddingType type;
+  std::unique_ptr<FixedPadding> fixed;
+};
+
+// "Convolutional" layer, as represented in model files.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the Conv weights
+//   inputs[2]: optional: the bias vector, specifying the biases for each output
+//   channel.
+//
+// Outputs:
+//   outputs[0]: required: the output activations array
+//   outputs[1]: optional: the intermediate array of im2col-replicated input
+//                         activations. Present when targeting implementations
+//                         of Conv layers as Im2col+GEMM.
+//
+// TensorFlow equivalent: Conv2D
+struct ConvOperator : Operator {
+  ConvOperator() : Operator(OperatorType::kConv) {}
+  Padding padding;
+  int stride_width = 0;
+  int stride_height = 0;
+  // A dilation_rate of 0 is invalid and this field is an optional attribute.
+  // Thus initializing it to 1 to allow default conv behavior when the
+  // attribute is not present.
+  int dilation_rate = 1;
+};
+
+// Depthwise-separable convolution operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the DepthwiseConv weights
+//   inputs[2]: optional: the bias vector, specifying the biases for each output
+//   channel.
+//
+// TensorFlow equivalent: DepthwiseConv2dNative
+struct DepthwiseConvOperator : Operator {
+  DepthwiseConvOperator() : Operator(OperatorType::kDepthwiseConv) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int depth_multiplier = 0;
+};
+
+// Depth-to-space transform operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//
+// TensorFlow equivalent: DepthToSpace
+struct DepthToSpaceOperator : Operator {
+  DepthToSpaceOperator() : Operator(OperatorType::kDepthToSpace) {}
+  int block_size = 0;
+};
+
+// Space-to-depth transform operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//
+// TensorFlow equivalent: SpaceToDepth
+struct SpaceToDepthOperator : Operator {
+  SpaceToDepthOperator() : Operator(OperatorType::kSpaceToDepth) {}
+  int block_size = 0;
+};
+
+// Fully-connected operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the FullyConnected weights
+//   inputs[2]: optional: the bias vector, specifying the biases for each output
+//   channel.
+//
+// TensorFlow equivalent: a pair consisting of a Reshape node reshaping the
+// input activations as a matrix, followed by a MatMul node.
+struct FullyConnectedOperator : Operator {
+  FullyConnectedOperator() : Operator(OperatorType::kFullyConnected) {}
+};
+
+// Dequantization operator, converting a quantized array of integers with
+// quantization parameters specifying how these integers correspond to real
+// numbers
+// (see QuantizationParams) to an output activations array of floating-point
+// values.
+//
+// In floating-point image models, there is typically a Dequantization operator
+// at the very beginning, converting the input image RGB data, consisting of
+// uint8 integer values, to floating-point input activations. That is where
+// image model parameters such as "mean_value" and "std_value" are typically
+// handled.
+//
+// This is the only operator type that converts from quantized to
+// floating-point,
+// and there is at the moment no operator type at all to convert from
+// floating-point
+// to quantized. Every other operator does either float->float or
+// quantized->quantized.
+//
+// Inputs:
+//   inputs[0]: required: the input quantized activations array
+//
+// TensorFlow equivalent: Dequantize
+struct DequantizeOperator : Operator {
+  DequantizeOperator() : Operator(OperatorType::kDequantize) {}
+};
+
+// Batch-normalization operator.
+//
+// We only support batch-normalization using pre-learned moments, so this is
+// just
+// computing (input - mean) * multiplier + offset. As such, this can be
+// expressed as a combination of Add and Mul nodes, and indeed this is how
+// we break it down during tooling for the purpose of fusing it into
+// other operators.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the learned mean array
+//   inputs[2]: required: the learned multiplier array
+//   inputs[3]: required: the learned offset array
+//
+// TensorFlow equivalent: a combination of Add and Mul nodes
+struct BatchNormalizationOperator : Operator {
+  BatchNormalizationOperator()
+      : Operator(OperatorType::kBatchNormalization),
+        global_normalization(false) {}
+  bool global_normalization;
+};
+
+// L2-normalization operator.
+//
+// Inputs:
+//   inputs[0]: required: the input activations array
+//
+// TensorFlow equivalent: none. In TensorFlow, L2 normalization is implemented
+// by a sub-graph of operators implementing L2-normalization
+// from lower-level arithmetic nodes; during tooling, we identify such
+// sub-graphs
+// and replace them by L2NormalizationOperator's. See IdentifyL2Normalization.
+struct L2NormalizationOperator : Operator {
+  L2NormalizationOperator() : Operator(OperatorType::kL2Normalization) {}
+};
+
+// LSTM Cell operator.
+//
+// Inputs:
+//   inputs[0]: required: the input data array
+//   inputs[1]: required: the previous output activations array
+//   inputs[2]: required: the learned weights array
+//   inputs[3]: required: the learned biases array
+//   inputs[4]: required: the previous output state
+//   outputs[0]: required: the output activations array
+//   outputs[1]: required: the new state array
+//
+// TensorFlow equivalent: none. In TensorFlow, an LSTM is implemented
+// with a sub-graph of lower-level arithmetic nodes; during tooling, we identify
+// such sub-graphs and replace them with LstmCells. See IdentifyLstmCell().
+struct LstmCellOperator : Operator {
+  enum Inputs {
+    DATA_INPUT = 0,
+    PREV_ACTIV_INPUT = 1,
+    WEIGHTS_INPUT = 2,
+    BIASES_INPUT = 3,
+    PREV_STATE_INPUT = 4,
+    NUM_INPUTS = 5
+  };
+  enum Outputs {
+    ACTIV_OUTPUT = 0,
+    STATE_OUTPUT = 1,
+    CONCAT_TEMP = 2,
+    ACTIV_TEMP = 3,
+    NUM_OUTPUTS = 4
+  };
+  LstmCellOperator() : Operator(OperatorType::kLstmCell) {}
+};
+
+// Element-wise multiplication operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Mul
+struct MulOperator : Operator {
+  MulOperator() : Operator(OperatorType::kMul) {}
+};
+
+// Element-wise Relu operator:
+//   x -> max(0, x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Relu
+struct ReluOperator : Operator {
+  ReluOperator() : Operator(OperatorType::kRelu) {}
+};
+
+// Element-wise Relu1 operator:
+//   x -> min(max(x, -1), 1)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: none. We can construct the operator with Minimum
+// and Maximum operations
+struct Relu1Operator : Operator {
+  Relu1Operator() : Operator(OperatorType::kRelu1) {}
+};
+
+// Element-wise Relu6 operator:
+//   x -> max(0, min(6, x))
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Relu6
+struct Relu6Operator : Operator {
+  Relu6Operator() : Operator(OperatorType::kRelu6) {}
+};
+
+// Element-wise Logistic operator:
+//   x -> Logistic(x) = 1 / (1 + exp(-x))
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sigmoid
+struct LogisticOperator : Operator {
+  LogisticOperator() : Operator(OperatorType::kLogistic) {}
+};
+
+// Element-wise Tanh operator:
+//   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Tanh
+struct TanhOperator : Operator {
+  TanhOperator() : Operator(OperatorType::kTanh) {}
+};
+
+// Element-wise addition operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Add
+struct AddOperator : Operator {
+  AddOperator() : Operator(OperatorType::kAdd) {}
+};
+
+// Concatenation operator: concatenates its inputs
+// along the concat_dim dimension.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to concatenate.
+//
+// TensorFlow equivalent: Concat.
+struct ConcatenationOperator : Operator {
+  ConcatenationOperator() : Operator(OperatorType::kConcatenation) {}
+  int concat_dim = 0;
+};
+
+// Reordering dimensions. Used only during tooling to transform graphs from
+// the TensorFlow format.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: none. This is only useful to convert between formats.
+struct ReorderAxesOperator : Operator {
+  ReorderAxesOperator() : Operator(OperatorType::kReorderAxes) {}
+  AxesOrder input_axes_order;
+  AxesOrder output_axes_order;
+};
+
+// Average-pooling operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: AveragePool
+struct AveragePoolOperator : Operator {
+  AveragePoolOperator() : Operator(OperatorType::kAveragePool) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int kheight = 0;
+  int kwidth = 0;
+};
+
+// Local response normalization operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: LRN
+struct LocalResponseNormalizationOperator : Operator {
+  LocalResponseNormalizationOperator()
+      : Operator(OperatorType::kLocalResponseNormalization) {}
+
+  int range = 0;
+  float bias = 0.f;
+  float alpha = 0.f;
+  float beta = 0.f;
+};
+
+// Max-pooling operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: MaxPool
+struct MaxPoolOperator : Operator {
+  MaxPoolOperator() : Operator(OperatorType::kMaxPool) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int kheight = 0;
+  int kwidth = 0;
+};
+
+// L2-pooling operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: none. Can be shimmed by squaring+avgpool+sqrt.
+struct L2PoolOperator : Operator {
+  L2PoolOperator() : Operator(OperatorType::kL2Pool) {}
+  Padding padding;
+  int stride_height = 0;
+  int stride_width = 0;
+  int kheight = 0;
+  int kwidth = 0;
+};
+
+// The expected [min, max] range of values in a given array.
+// Used for quantization only.
+// This information typically comes from special nodes found in quantized
+// models,
+// see FakeQuantOperator, and is used during quantization to resolve
+// actual quantization parameters (see QuantizationParams).
+struct MinMax {
+  double min = 0.;
+  double max = 0.;
+};
+
+inline bool operator==(const MinMax& m1, const MinMax& m2) {
+  return m1.min == m2.min && m1.max == m2.max;
+}
+
+// Fake-quantization operator. This does two things:
+//   - Annotate its input and output arrays with MinMax information,
+//   - Arithmetic-wise, this operator rounds incoming activation values
+//     to the nearest representable value on the scale of 256
+//     values from the min to the max value dictated by its MinMax info.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: optional: the 'min' value, if it has not yet been resolved
+//              to a constant.
+//   inputs[2]: optional: the 'max' value, if it has not yet been resolved
+//              to a constant.
+//
+// TensorFlow equivalent: FakeQuantWithMinMaxVars, FakeQuantWithMinMaxArgs.
+struct FakeQuantOperator : Operator {
+  FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
+  std::unique_ptr<MinMax> minmax;
+};
+
+// Element-wise division operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Div
+struct DivOperator : Operator {
+  DivOperator() : Operator(OperatorType::kDiv) {}
+};
+
+// Element-wise identity (x->x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Identity
+struct TensorFlowIdentityOperator : Operator {
+  TensorFlowIdentityOperator() : Operator(OperatorType::kTensorFlowIdentity) {}
+};
+
+// General matrix multiplication operator. We don't want to support general
+// matrix multiplication at inference time, so we resolve it during tooling
+// to more specific operator types, namely, FullyConnected.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side matrix
+//   inputs[1]: required: the right-hand side matrix
+//
+// TensorFlow equivalent: MatMul
+struct TensorFlowMatMulOperator : Operator {
+  TensorFlowMatMulOperator() : Operator(OperatorType::kTensorFlowMatMul) {}
+};
+
+// Padding operator. Pads a tensor with zeros.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the padding array
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of
+// `input` in that dimension.
+//
+// TensorFlow equivalent: Pad
+struct PadOperator : Operator {
+  PadOperator() : Operator(OperatorType::kPad) {}
+
+  std::vector<int> left_padding;
+  std::vector<int> right_padding;
+};
+
+// Strided slice operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: StridedSlice
+struct StridedSliceOperator : Operator {
+  StridedSliceOperator() : Operator(OperatorType::kStridedSlice) {}
+
+  std::vector<int> start_indices;
+  std::vector<int> stop_indices;
+  std::vector<int> strides;
+
+  int begin_mask;
+  int ellipsis_mask;
+  int end_mask;
+  int new_axis_mask;
+  int shrink_axis_mask;
+};
+
+// Reshaping operator, reshaping its input array to a two-dimensional shape
+// (a "matrix"). This is used in the TensorFlow format, in conjunction with
+// MatMul nodes, to implement fully-connected layers.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Reshape --- except that we only support a special case
+// here, where the output shape is a matrix (2D) shape.
+struct TensorFlowReshapeOperator : Operator {
+  TensorFlowReshapeOperator() : Operator(OperatorType::kTensorFlowReshape) {}
+  std::vector<int> shape;
+};
+
+// Removes dimensions of size 1 from the shape of a tensor.
+// https://www.tensorflow.org/api_docs/python/tf/squeeze
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Squeeze
+struct SqueezeOperator : Operator {
+  SqueezeOperator() : Operator(OperatorType::kSqueeze) {}
+
+  std::vector<int> squeeze_dims;
+};
+
+// Inputs:
+//   inputs[0]: required: the input activations array
+//   inputs[1]: required: the Conv weights
+//   channel.
+//
+// Outputs:
+//   outputs[0]: required: the output activations array
+//
+// TensorFlow equivalent: Conv2DBackpropInput
+struct TransposeConvOperator : Operator {
+  TransposeConvOperator() : Operator(OperatorType::kTransposeConv) {}
+  Padding padding;
+  int stride_width = 0;
+  int stride_height = 0;
+};
+
+// Given a tensor input, this operation inserts a dimension of 1 at the
+// dimension index axis of input's shape. The dimension index axis starts at
+// zero; if you specify a negative number for axis it is counted backward from
+// the end.
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//   inputs[1]: required: 0-D (scalar). Specifies the dimension index at which
+//   to expand the shape of input
+//
+// TensorFlow equivalent: ExpandDims
+struct ExpandDimsOperator : Operator {
+  ExpandDimsOperator() : Operator(OperatorType::kExpandDims) {}
+};
+
+// Ceates a tensor of shape dims and fills it with the given scalar value.
+// Output type will be the same as the given scalar value.
+//
+// Inputs:
+//   inputs[0]: required: 1-D (int32) - the shape of the output tensor
+//   inputs[1]: required: 0-D (scalar) - value to fill the tensor with
+//
+// TensorFlow equivalent: Fill
+struct FillOperator : Operator {
+  FillOperator() : Operator(OperatorType::kFill) {}
+};
+
+// Element-wise floor division operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorDiv
+struct FloorDivOperator : Operator {
+  FloorDivOperator() : Operator(OperatorType::kFloorDiv) {}
+};
+
+// Element-wise floor mod operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: FloorMod
+struct FloorModOperator : Operator {
+  FloorModOperator() : Operator(OperatorType::kFloorMod) {}
+};
+
+// Creates a sequence of numbers that begins at start and extends by increments
+// of delta up to but not including limit.
+//
+// The dtype of the resulting tensor is inferred from the inputs unless it is
+// provided explicitly.
+//
+// Inputs:
+//   inputs[0]: required: the start
+//   inputs[1]: required: the limit
+//   inputs[2]: required: the delta
+//
+// TensorFlow equivalent: Range
+struct RangeOperator : Operator {
+  RangeOperator() : Operator(OperatorType::kRange) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+};
+
+// Rank operator. Extracts the rank of the tensor.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// This operation outputs a 0-D integer tensor representing the rank of
+// the input.
+//
+// TensorFlow equivalent: Rank.  We currently assume that the output is int32
+// and not int64.  The output type could be stored herein.
+struct RankOperator : Operator {
+  RankOperator() : Operator(OperatorType::kRank) {}
+};
+
+// Element-wise reciprocal-square-root (x^-0.5) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Rsqrt
+struct TensorFlowRsqrtOperator : Operator {
+  TensorFlowRsqrtOperator() : Operator(OperatorType::kTensorFlowRsqrt) {}
+};
+
+// Stacks a list of rank-R tensors into one rank-(R+1) tensor.
+//
+// Packs the list of tensors in values into a tensor with rank one higher than
+// each tensor in values, by packing them along the axis dimension. Given a list
+// of length N of tensors of shape (A, B, C);.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to merge.
+//
+// TensorFlow equivalent: Stack or Pack
+struct StackOperator : Operator {
+  StackOperator() : Operator(OperatorType::kStack) {}
+  int axis = 0;
+};
+
+// Shape operator. Extracts the shape of the tensor.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// This operation outputs a 1-D integer tensor representing the shape of
+// the input.
+//
+// TensorFlow equivalent: Shape.  We currently assume that the output is int32
+// and not int64.  The output type could be stored herein.
+struct TensorFlowShapeOperator : Operator {
+  TensorFlowShapeOperator() : Operator(OperatorType::kTensorFlowShape) {}
+};
+
+// Element-wise square-root (x^0.5) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sqrt
+struct TensorFlowSqrtOperator : Operator {
+  TensorFlowSqrtOperator() : Operator(OperatorType::kTensorFlowSqrt) {}
+};
+
+// Element-wise square (x*x) operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Square
+struct TensorFlowSquareOperator : Operator {
+  TensorFlowSquareOperator() : Operator(OperatorType::kTensorFlowSquare) {}
+};
+
+// Transposes a tensor.
+//
+// By default, this operation performs a regular matrix transpose on 2-D input
+// tensors.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Transpose
+struct TransposeOperator : Operator {
+  TransposeOperator() : Operator(OperatorType::kTranspose) {}
+};
+
+// Element-wise subtraction operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Sub
+struct SubOperator : Operator {
+  SubOperator() : Operator(OperatorType::kSub) {}
+};
+
+// Global sum reduction: computes the sum of all of entries in the input array.
+// Thus the output is "0-dimensional": it consists of a single scalar value.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Sum --- except that we only support the special case
+// of global reduction across all dimensions.
+struct TensorFlowSumOperator : Operator {
+  TensorFlowSumOperator() : Operator(OperatorType::kTensorFlowSum) {}
+  bool keep_dims = false;
+};
+
+// TensorFlow Tile equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+struct TensorFlowTileOperator : Operator {
+  TensorFlowTileOperator() : Operator(OperatorType::kTensorFlowTile) {}
+};
+
+// TensorFlow Slice equivalent. Refer to TensorFlow documentation for details.
+struct SliceOperator : Operator {
+  SliceOperator() : Operator(OperatorType::kSlice) {}
+
+  std::vector<int> begin;
+  std::vector<int> size;
+};
+
+// TensorFlow Split equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+struct TensorFlowSplitOperator : Operator {
+  TensorFlowSplitOperator() : Operator(OperatorType::kTensorFlowSplit) {}
+  int num_split = 0;
+};
+
+// TensorFlow Concat equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Concretely, once the concat dim becomes known, if it is the depth
+// dimension then we can change this op into a DepthConcatenation op.
+// Otherwise, we hope for some other graph transformation to drop this node.
+struct TensorFlowConcatOperator : Operator {
+  TensorFlowConcatOperator() : Operator(OperatorType::kTensorFlowConcat) {}
+};
+
+// TensorFlow ConcatV2 equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Concretely, once the concat dim becomes known, if it is the depth
+// dimension then we can change this op into a DepthConcatenation op.
+// Otherwise, we hope for some other graph transformation to drop this node.
+struct TensorFlowConcatV2Operator : Operator {
+  TensorFlowConcatV2Operator() : Operator(OperatorType::kTensorFlowConcatV2) {}
+};
+
+// TensorFlow Merge equivalent. Refer to TensorFlow documentation for details.
+//
+// Inputs: this operator accepts any number >= 1 of inputs.
+//   inputs[i]: the i-th array to merge.
+//
+// It is expected that graph transformations will drop all but exactly one
+// of the inputs, at which point the Merge node will be equivalent to an
+// Identity node forwarding the remaining input.
+//
+// Note: We do not currently support runtime control flow: we only support
+// control flow that can be resolved at tooling time (independently of input
+// activations).
+struct TensorFlowMergeOperator : Operator {
+  TensorFlowMergeOperator() : Operator(OperatorType::kTensorFlowMerge) {}
+};
+
+// TensorFlow Switch equivalent. Refer to TensorFlow documentation for details.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the boolean predicate, given as an array of size 1
+//     and of type kBool, will determine which output gets selected.
+//
+// Outputs: a TensorFlow Switch node always has exactly two outputs. Depending
+// on the boolean value that the input predicate resolves to (see note below),
+// one or the other of the outputs will be 'selected': the input array will be
+// forwarded to the 'selected output' as if by a Identity node, while the other
+// output will be discarded, and any graph edge connecting that discarded output
+// will be dropped. The rule for selecting outputs is as follows:
+//   outputs[0] will be selected if the input predicate resolves to 'true'.
+//   outputs[1] will be selected if the input predicate resolves to 'false'.
+//
+// Note: We do not currently support runtime control flow: we only support
+// control flow that can be resolved at tooling time (independently of input
+// activations).
+struct TensorFlowSwitchOperator : Operator {
+  TensorFlowSwitchOperator() : Operator(OperatorType::kTensorFlowSwitch) {}
+};
+
+// TensorFlow All equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowAllOperator : Operator {
+  TensorFlowAllOperator() : Operator(OperatorType::kTensorFlowAll) {}
+};
+
+// TensorFlow Assert equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, we just drop Assert nodes.
+struct TensorFlowAssertOperator : Operator {
+  TensorFlowAssertOperator() : Operator(OperatorType::kTensorFlowAssert) {}
+};
+
+// TensorFlow Less equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowLessOperator : Operator {
+  TensorFlowLessOperator() : Operator(OperatorType::kTensorFlowLess) {}
+};
+
+// TensorFlow LessEqual equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowLessEqualOperator : Operator {
+  TensorFlowLessEqualOperator()
+      : Operator(OperatorType::kTensorFlowLessEqual) {}
+};
+
+// TensorFlow Less equivalent. Refer to TensorFlow documentation for details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowGreaterOperator : Operator {
+  TensorFlowGreaterOperator() : Operator(OperatorType::kTensorFlowGreater) {}
+};
+
+// TensorFlow GreaterEqual equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowGreaterEqualOperator : Operator {
+  TensorFlowGreaterEqualOperator()
+      : Operator(OperatorType::kTensorFlowGreaterEqual) {}
+};
+
+// Global max reduction: computes the max of all of entries in the input array.
+// Thus the output is "0-dimensional": it consists of a single scalar value.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Max --- except that we only support the special case
+// of global reduction across all dimensions.
+struct TensorFlowMaxOperator : Operator {
+  TensorFlowMaxOperator() : Operator(OperatorType::kTensorFlowMax) {}
+  bool keep_dims = false;
+};
+
+// Global min reduction: computes the min of all of entries in the input array.
+// Thus the output is "0-dimensional": it consists of a single scalar value.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Min --- except that we only support the special case
+// of global reduction across all dimensions.
+struct TensorFlowMinOperator : Operator {
+  TensorFlowMinOperator() : Operator(OperatorType::kTensorFlowMin) {}
+  bool keep_dims = false;
+};
+
+// Element-wise maximum operator. Currently it only supports scalar as
+// the second operand.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Maximum
+struct TensorFlowMaximumOperator : Operator {
+  TensorFlowMaximumOperator() : Operator(OperatorType::kTensorFlowMaximum) {}
+};
+
+// Element-wise minimum operator. Currently it only supports scalar as
+// the second operand.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: Minimum
+struct TensorFlowMinimumOperator : Operator {
+  TensorFlowMinimumOperator() : Operator(OperatorType::kTensorFlowMinimum) {}
+};
+
+// General TF operation, unsupported by tf.mini. Expected to be dropped by
+// graph transformations.
+struct TensorFlowUnsupportedOperator : Operator {
+  TensorFlowUnsupportedOperator()
+      : Operator(OperatorType::kTensorFlowUnsupported) {}
+
+  // The original TF operation type. Used for diagnostic purposes.
+  string tensorflow_op;
+  // A serialized tensorflow::NodeDef string.
+  string tensorflow_node_def;
+  // A boolean indicating if the unsupported op should be treated as quantized.
+  bool quantized = false;
+  // Output data types
+  std::vector<ArrayDataType> output_data_types;
+};
+
+// Softmax activation function.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Softmax
+struct SoftmaxOperator : Operator {
+  SoftmaxOperator() : Operator(OperatorType::kSoftmax) {}
+  float beta = 0.f;
+};
+
+// Cast operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Cast
+struct CastOperator : Operator {
+  CastOperator() : Operator(OperatorType::kCast) {}
+  ArrayDataType src_data_type = ArrayDataType::kNone;
+  ArrayDataType dst_data_type = ArrayDataType::kNone;
+};
+
+// Floor operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Floor
+struct FloorOperator : Operator {
+  FloorOperator() : Operator(OperatorType::kFloor) {}
+};
+
+// Gather operator. It gathers slices from params according to indices.
+// Only 1-D indices are supported at the moment.
+//
+// Inputs:
+//   inputs[0]: required: the params array
+//   inputs[1]: required: the indices to gather
+//
+// TensorFlow equivalent: Gather
+struct GatherOperator : Operator {
+  GatherOperator() : Operator(OperatorType::kGather) {}
+  int input_rank;
+};
+
+// ResizeBilinear operator. It resizes input images with bilinear interpolation.
+// It does not support align_corners at the moment.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the new image size
+//
+// TensorFlow equivalent: ResizeBilinear
+struct ResizeBilinearOperator : Operator {
+  ResizeBilinearOperator() : Operator(OperatorType::kResizeBilinear) {}
+};
+
+// SpaceToBatchND operator. It divides spatial dimensions into a grid of
+// blocks and interleaves these blocks with the batch dimension. Currently,
+// only 2-d blocks are supported.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the block shape
+//   inputs[2]: required: the paddings
+//
+// TensorFlow equivalent: SpaceToBatchND
+struct SpaceToBatchNDOperator : Operator {
+  SpaceToBatchNDOperator() : Operator(OperatorType::kSpaceToBatchND) {}
+};
+
+// BatchToSpaceND operator. Rearranges data from batch into blocks of
+// spatial data. Currently, only 2-d blocks are supported. Cropping is not
+// supported, either, and the crops array should be all zero.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: the block shape
+//   inputs[2]: required: the crops
+//
+// TensorFlow equivalent: BatchToSpaceND
+struct BatchToSpaceNDOperator : Operator {
+  BatchToSpaceNDOperator() : Operator(OperatorType::kBatchToSpaceND) {}
+};
+
+// Mean operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Mean
+struct MeanOperator : Operator {
+  MeanOperator() : Operator(OperatorType::kMean) {}
+
+  std::vector<int> reduction_indices;
+  bool keep_dims = false;
+};
+
+// Svdf operator:
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//   inputs[1]: required: weights_feature
+//   inputs[2]: required: weights_time
+//   inputs[3]: optional: bias
+struct SvdfOperator : Operator {
+  SvdfOperator() : Operator(OperatorType::kSvdf) {}
+  int rank;
+};
+
+// Alloc's are used for transient arrays only. An Alloc specifies which interval
+// of the "transient_data" workspace buffer passed to inference functions, is to
+// be used for the transient array at hand. The 'start' and 'end' values are
+// offsets from the start of the workspace buffer, expressed in bytes.
+struct Alloc {
+  int start = 0;
+  int end = 0;
+};
+
+inline bool operator<(const Alloc& a, const Alloc& b) {
+  return a.start < b.start;
+}
+
+// Quantization parameters, determining the mapping of quantized values
+// to real values (i.e. determining how quantized values are mathematically
+// interpreted).
+//
+// The correspondence is as follows:
+//
+//   real_value = scale * (quantized_value - zero_point);
+//
+// In other words, zero_point designates which quantized value corresponds to
+// the real 0 value, and scale designates the difference between the real values
+// corresponding to consecutive quantized values differing by 1.
+struct QuantizationParams {
+  int32 zero_point = 0;
+  double scale = 0.;
+};
+
+class Shape {
+ public:
+  // For Shape, we stick to half-way encapsulation for now:
+  // we hide the raw dims_ member, but expose it raw by accessors
+  // because from some brainstorming, it's not at all easy to
+  // anticipate which flavor of more hermetic encapsulation would
+  // actually buy us future-proof-ness without being needlessly
+  // cumbersome.
+  Shape() {}
+  Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
+
+  void ReplaceDims(std::initializer_list<int> dim_list) {
+    dims_ = std::vector<int>(dim_list);
+  }
+
+  const std::vector<int>& dims() const { return dims_; }
+  std::vector<int>* mutable_dims() { return &dims_; }
+  const int dimensions_count() const { return dims_.size(); }
+
+  // We still have that one convenience accessor to avoid
+  // the awkward double bracket issue:  shape.dims()[i].
+  int dims(int i) const { return dims_[i]; }
+
+  bool operator==(const Shape& comp) const {
+    return (this->dims_ == comp.dims());
+  }
+
+  bool operator!=(const Shape& comp) const { return !((*this) == comp); }
+
+ private:
+  std::vector<int> dims_;
+};
+
+// Array represents an array (either a constant parameter array or an
+// activations array) in a Model.
+struct Array {
+  template <ArrayDataType A>
+  const Buffer<A>& GetBuffer() const {
+    DCHECK(buffer);
+    DCHECK(buffer->type == A);
+    return *static_cast<const Buffer<A>*>(buffer.get());
+  }
+  template <ArrayDataType A>
+  Buffer<A>& GetMutableBuffer() {
+    if (!buffer) {
+      Buffer<A>* ptr = new Buffer<A>;
+      buffer = std::unique_ptr<GenericBuffer>(ptr);
+    }
+    DCHECK(buffer);
+    DCHECK(buffer->type == A);
+    return *static_cast<Buffer<A>*>(buffer.get());
+  }
+  Alloc& GetOrCreateAlloc() {
+    if (!alloc) {
+      alloc = std::unique_ptr<Alloc>(new Alloc);
+    }
+    return *alloc;
+  }
+  MinMax& GetOrCreateMinMax() {
+    if (!minmax) {
+      minmax = std::unique_ptr<MinMax>(new MinMax);
+    }
+    return *minmax;
+  }
+  MinMax& GetMinMax() const {
+    DCHECK(minmax);
+    return *minmax;
+  }
+  QuantizationParams& GetOrCreateQuantizationParams() {
+    if (!quantization_params) {
+      quantization_params =
+          std::unique_ptr<QuantizationParams>(new QuantizationParams);
+    }
+    return *quantization_params;
+  }
+  QuantizationParams& GetQuantizationParams() const {
+    DCHECK(quantization_params);
+    return *quantization_params;
+  }
+
+  // The data type of the actual elements of this array, that is:
+  //  - If there is a buffer (see 'buffer' member), it must be of the same
+  //    type.
+  //  - If there is no buffer, meaning that this is a runtime (i.e. activations)
+  //    array, then this specifies the type of elements that there will be
+  //    at runtime.
+  //
+  // Note that this only specifies the storage type of elements; this does
+  // not specify whether these are to be treated as 'real' or 'quantized'
+  // values.
+  // That is decided by whether the 'quantization_params' member is null.
+  ArrayDataType data_type = ArrayDataType::kNone;
+  // The final value that data_type should have at the end of graph
+  // transformations
+  ArrayDataType final_data_type = ArrayDataType::kNone;
+  // The dimensions of this array --- this specifies both sizes and strides
+  // (the storage layout).
+  //
+  // Issues with shape handling that remain include:
+  //   - No way to distinguish between 0-dimensional dims and missing dims.
+  //   - No way to describe dims that may be runtime-variable.
+  //   - Addressing of dims by integer index differs in different graph formats
+  //     (TensorFlow vs. other frameworks vs. what we have informally grown
+  //     within toco).
+  //     This is currently quite messy; see ReorderAxesOperator which is how we
+  //     bridge some of these discrepancies at the moment. This is overdue for
+  //     a redesign; I'm thinking that it would be nice to have more flexible
+  //     dims that allow mapping 1:1, cleanly, dims as they are in various
+  //     formats,
+  //     then explicitly convert between different conventions.
+
+  // Proto-style accessors
+  bool has_shape() const { return array_shape != nullptr; }
+  const Shape& shape() const {
+    CHECK(has_shape());
+    return *array_shape;
+  }
+  Shape* mutable_shape() {
+    if (!array_shape) {
+      array_shape.reset(new Shape);
+    }
+    return array_shape.get();
+  }
+  void copy_shape(const Shape& src_shape) { *mutable_shape() = src_shape; }
+  void clear_shape() { array_shape = nullptr; }
+
+  // The constant buffer backing this array. This is non-null if and only if
+  // this is a constant parameter array. Conversely, this is null for
+  // activations arrays.
+  //
+  // Note that this buffer is pure storage. In the case of quantized values,
+  // it only stores the quantized values, it does not know by itself about the
+  // quantization parameters necessary to interprete these values, that is
+  // in the separate 'quantization_params' field. In fact, this 'buffer' field
+  // does no even know whether values are quantized. It only has a data_type,
+  // which must equal the 'data_type' member here, and which only describes
+  // the storage type of element, does not tell whether they are quantized i.e.
+  // whether they are to be interpreted with quantization_params.
+  std::unique_ptr<GenericBuffer> buffer;
+  // Only for activation arrays (i.e. when 'buffer' is null).
+  // Only for code generation.
+  //
+  // Describes the allocation of this array within the workspace buffer
+  // allocated
+  // for all transient arrays.
+  std::unique_ptr<Alloc> alloc;
+  // Describes the [min, max] range of values
+  // to be assumed when determining quantization_params.
+  //
+  // Only used for quantization. In fact, only used for determining
+  // quantization_params.
+  //
+  // Used for both constant arrays (those having a 'buffer') and non-constant
+  // arrays (activations). Indeed, it is important to use the same min-max range
+  // as was used during training, even if that min-max range is slightly wrong
+  // w.r.t. actual buffer elements. Doing otherwise would defeat the point of
+  // re-training for quantization.
+  std::unique_ptr<MinMax> minmax;
+  // Quantization parameters. The non-null-ness of this pointer is what
+  // defines whether this array is quantized or not.
+  //
+  // If this is non-null, then these quantization parameters are to be used
+  // to assign a meaning as real numbers to the elements of this array.
+  std::unique_ptr<QuantizationParams> quantization_params;
+
+ private:
+  std::unique_ptr<Shape> array_shape;
+};
+
+// Our Model struct, represents an entire model (our "top-level" struct).
+// Owns everything.
+struct Model {
+  Array& GetArray(const string& name) const {
+    DCHECK(arrays.count(name));
+    return *arrays.at(name);
+  }
+  Array& GetOrCreateArray(const string& name) {
+    if (!arrays.count(name)) {
+      Array* ptr = new Array;
+      arrays[name] = std::unique_ptr<Array>(ptr);
+    }
+    Array& result = GetArray(name);
+    return result;
+  }
+
+  // The list of operators. Notice how it's a list of unique_ptr's, implying
+  // that the Model is what owns Operator's and keeps them alive.
+  std::vector<std::unique_ptr<Operator>> operators;
+  // The associative array mapping names to Array's.
+  // Notice how it's a container of unique_ptr's, implying
+  // that the Model is what owns Array's and keeps them alive.
+  // The Operator's refer to these Array's by their name strings, not by their
+  // addresses. See Operator::inputs, Operator::outputs.
+  std::unordered_map<string, std::unique_ptr<Array>> arrays;
+  // Generic flags, a place where we combine information passed to us via
+  // command-line parameters (e.g. --input_width=N) with information that
+  // we may or may not find in the input model file.
+  ModelFlags flags;
+  // For code-generation only: required size of the transient_data buffer
+  std::size_t transient_data_size = 0;
+  // For code-generation only: required alignment of the transient_data buffer
+  std::size_t transient_data_alignment = 0;
+};
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_H_
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29802da9fe4078d931b0fcd094610be63e2a10d3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -0,0 +1,399 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/strings/strip.h"
+#include "tensorflow/contrib/lite/toco/args.h"
+#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+// "batch" flag only exists internally
+#ifdef PLATFORM_GOOGLE
+#include "base/commandlineflags.h"
+#endif
+
+namespace toco {
+
+bool ParseModelFlagsFromCommandLineFlags(
+    int* argc, char* argv[], string* msg,
+    ParsedModelFlags* parsed_model_flags_ptr) {
+  ParsedModelFlags& parsed_flags = *parsed_model_flags_ptr;
+  using tensorflow::Flag;
+  std::vector<tensorflow::Flag> flags = {
+      Flag("input_array", parsed_flags.input_array.bind(),
+           parsed_flags.input_array.default_value(),
+           "Deprecated: use --input_arrays instead. Name of the input array. "
+           "If not specified, will try to read "
+           "that information from the input file."),
+      Flag("input_arrays", parsed_flags.input_arrays.bind(),
+           parsed_flags.input_arrays.default_value(),
+           "Names of the output arrays, comma-separated. If not specified, "
+           "will try to read that information from the input file."),
+      Flag("output_array", parsed_flags.output_array.bind(),
+           parsed_flags.output_array.default_value(),
+           "Deprecated: use --output_arrays instead. Name of the output array, "
+           "when specifying a unique output array. "
+           "If not specified, will try to read that information from the "
+           "input file."),
+      Flag("output_arrays", parsed_flags.output_arrays.bind(),
+           parsed_flags.output_arrays.default_value(),
+           "Names of the output arrays, comma-separated. "
+           "If not specified, will try to read "
+           "that information from the input file."),
+      Flag("input_shape", parsed_flags.input_shape.bind(),
+           parsed_flags.input_shape.default_value(),
+           "Deprecated: use --input_shapes instead. Input array shape. For "
+           "many models the shape takes the form "
+           "batch size, input array height, input array width, input array "
+           "depth."),
+      Flag("input_shapes", parsed_flags.input_shapes.bind(),
+           parsed_flags.input_shapes.default_value(),
+           "Shapes corresponding to --input_arrays, colon-separated. For "
+           "many models each shape takes the form batch size, input array "
+           "height, input array width, input array depth."),
+      Flag("input_data_type", parsed_flags.input_data_type.bind(),
+           parsed_flags.input_data_type.default_value(),
+           "Deprecated: use --input_data_types instead. Input array type, if "
+           "not already provided in the graph. "
+           "Typically needs to be specified when passing arbitrary arrays "
+           "to --input_array."),
+      Flag("input_data_types", parsed_flags.input_data_types.bind(),
+           parsed_flags.input_data_types.default_value(),
+           "Input arrays types, comma-separated, if not already provided in "
+           "the graph. "
+           "Typically needs to be specified when passing arbitrary arrays "
+           "to --input_arrays."),
+      Flag("mean_value", parsed_flags.mean_value.bind(),
+           parsed_flags.mean_value.default_value(),
+           "Deprecated: use --mean_values instead. mean_value parameter for "
+           "image models, used to compute input "
+           "activations from input pixel data."),
+      Flag("mean_values", parsed_flags.mean_values.bind(),
+           parsed_flags.mean_values.default_value(),
+           "mean_values parameter for image models, comma-separated list of "
+           "doubles, used to compute input activations from input pixel "
+           "data. Each entry in the list should match an entry in "
+           "--input_arrays."),
+      Flag("std_value", parsed_flags.std_value.bind(),
+           parsed_flags.std_value.default_value(),
+           "Deprecated: use --std_values instead. std_value parameter for "
+           "image models, used to compute input "
+           "activations from input pixel data."),
+      Flag("std_values", parsed_flags.std_values.bind(),
+           parsed_flags.std_values.default_value(),
+           "std_value parameter for image models, comma-separated list of "
+           "doubles, used to compute input activations from input pixel "
+           "data. Each entry in the list should match an entry in "
+           "--input_arrays."),
+      Flag("variable_batch", parsed_flags.variable_batch.bind(),
+           parsed_flags.variable_batch.default_value(),
+           "If true, the model accepts an arbitrary batch size. Mutually "
+           "exclusive "
+           "with the 'batch' field: at most one of these two fields can be "
+           "set."),
+      Flag("rnn_states", parsed_flags.rnn_states.bind(),
+           parsed_flags.rnn_states.default_value(), ""),
+      Flag("model_checks", parsed_flags.model_checks.bind(),
+           parsed_flags.model_checks.default_value(),
+           "A list of model checks to be applied to verify the form of the "
+           "model.  Applied after the graph transformations after import."),
+      Flag("graphviz_first_array", parsed_flags.graphviz_first_array.bind(),
+           parsed_flags.graphviz_first_array.default_value(),
+           "If set, defines the start of the sub-graph to be dumped to "
+           "GraphViz."),
+      Flag(
+          "graphviz_last_array", parsed_flags.graphviz_last_array.bind(),
+          parsed_flags.graphviz_last_array.default_value(),
+          "If set, defines the end of the sub-graph to be dumped to GraphViz."),
+      Flag("dump_graphviz", parsed_flags.dump_graphviz.bind(),
+           parsed_flags.dump_graphviz.default_value(),
+           "Dump graphviz during LogDump call. If string is non-empty then "
+           "it defines path to dump, otherwise will skip dumping."),
+      Flag("dump_graphviz_video", parsed_flags.dump_graphviz_video.bind(),
+           parsed_flags.dump_graphviz_video.default_value(),
+           "If true, will dump graphviz at each "
+           "graph transformation, which may be used to generate a video."),
+  };
+  bool asked_for_help =
+      *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
+  if (asked_for_help) {
+    *msg += tensorflow::Flags::Usage(argv[0], flags);
+    return false;
+  } else {
+    if (!tensorflow::Flags::Parse(argc, argv, flags)) return false;
+  }
+  auto& dump_options = *GraphVizDumpOptions::singleton();
+  dump_options.graphviz_first_array = parsed_flags.graphviz_first_array.value();
+  dump_options.graphviz_last_array = parsed_flags.graphviz_last_array.value();
+  dump_options.dump_graphviz_video = parsed_flags.dump_graphviz_video.value();
+  dump_options.dump_graphviz = parsed_flags.dump_graphviz.value();
+
+  return true;
+}
+
+void ReadModelFlagsFromCommandLineFlags(
+    const ParsedModelFlags& parsed_model_flags, ModelFlags* model_flags) {
+  toco::port::CheckInitGoogleIsDone("InitGoogle is not done yet");
+
+// "batch" flag only exists internally
+#ifdef PLATFORM_GOOGLE
+  CHECK(!((base::SpecifiedOnCommandLine("batch") &&
+           parsed_model_flags.variable_batch.specified())))
+      << "The --batch and --variable_batch flags are mutually exclusive.";
+#endif
+  CHECK(!(parsed_model_flags.output_array.specified() &&
+          parsed_model_flags.output_arrays.specified()))
+      << "The --output_array and --vs flags are mutually exclusive.";
+
+  if (parsed_model_flags.output_array.specified()) {
+    model_flags->add_output_arrays(parsed_model_flags.output_array.value());
+  }
+
+  if (parsed_model_flags.output_arrays.specified()) {
+    std::vector<string> output_arrays =
+        absl::StrSplit(parsed_model_flags.output_arrays.value(), ',');
+    for (const string& output_array : output_arrays) {
+      model_flags->add_output_arrays(output_array);
+    }
+  }
+
+  const bool uses_single_input_flags =
+      parsed_model_flags.input_array.specified() ||
+      parsed_model_flags.mean_value.specified() ||
+      parsed_model_flags.std_value.specified() ||
+      parsed_model_flags.input_shape.specified();
+
+  const bool uses_multi_input_flags =
+      parsed_model_flags.input_arrays.specified() ||
+      parsed_model_flags.mean_values.specified() ||
+      parsed_model_flags.std_values.specified() ||
+      parsed_model_flags.input_shapes.specified();
+
+  QCHECK(!(uses_single_input_flags && uses_multi_input_flags))
+      << "Use either the singular-form input flags (--input_array, "
+         "--input_shape, --mean_value, --std_value) or the plural form input "
+         "flags (--input_arrays, --input_shapes, --mean_values, --std_values), "
+         "but not both forms within the same command line.";
+
+  if (parsed_model_flags.input_array.specified()) {
+    QCHECK(uses_single_input_flags);
+    model_flags->add_input_arrays()->set_name(
+        parsed_model_flags.input_array.value());
+  }
+  if (parsed_model_flags.input_arrays.specified()) {
+    QCHECK(uses_multi_input_flags);
+    for (const auto& input_array :
+         absl::StrSplit(parsed_model_flags.input_arrays.value(), ',')) {
+      model_flags->add_input_arrays()->set_name(string(input_array));
+    }
+  }
+  if (parsed_model_flags.mean_value.specified()) {
+    QCHECK(uses_single_input_flags);
+    model_flags->mutable_input_arrays(0)->set_mean_value(
+        parsed_model_flags.mean_value.value());
+  }
+  if (parsed_model_flags.mean_values.specified()) {
+    QCHECK(uses_multi_input_flags);
+    std::vector<string> mean_values =
+        absl::StrSplit(parsed_model_flags.mean_values.value(), ',');
+    QCHECK(mean_values.size() == model_flags->input_arrays_size());
+    for (int i = 0; i < mean_values.size(); ++i) {
+      char* last = nullptr;
+      model_flags->mutable_input_arrays(i)->set_mean_value(
+          strtod(mean_values[i].data(), &last));
+      CHECK(last != mean_values[i].data());
+    }
+  }
+  if (parsed_model_flags.std_value.specified()) {
+    QCHECK(uses_single_input_flags);
+    model_flags->mutable_input_arrays(0)->set_std_value(
+        parsed_model_flags.std_value.value());
+  }
+  if (parsed_model_flags.std_values.specified()) {
+    QCHECK(uses_multi_input_flags);
+    std::vector<string> std_values =
+        absl::StrSplit(parsed_model_flags.std_values.value(), ',');
+    QCHECK(std_values.size() == model_flags->input_arrays_size());
+    for (int i = 0; i < std_values.size(); ++i) {
+      char* last = nullptr;
+      model_flags->mutable_input_arrays(i)->set_std_value(
+          strtod(std_values[i].data(), &last));
+      CHECK(last != std_values[i].data());
+    }
+  }
+  if (parsed_model_flags.input_data_type.specified()) {
+    QCHECK(uses_single_input_flags);
+    IODataType type;
+    QCHECK(IODataType_Parse(parsed_model_flags.input_data_type.value(), &type));
+    model_flags->mutable_input_arrays(0)->set_data_type(type);
+  }
+  if (parsed_model_flags.input_data_types.specified()) {
+    QCHECK(uses_multi_input_flags);
+    std::vector<string> input_data_types =
+        absl::StrSplit(parsed_model_flags.input_data_types.value(), ',');
+    QCHECK(input_data_types.size() == model_flags->input_arrays_size());
+    for (int i = 0; i < input_data_types.size(); ++i) {
+      IODataType type;
+      QCHECK(IODataType_Parse(input_data_types[i], &type));
+      model_flags->mutable_input_arrays(i)->set_data_type(type);
+    }
+  }
+  if (parsed_model_flags.input_shape.specified()) {
+    QCHECK(uses_single_input_flags);
+    if (model_flags->input_arrays().empty()) {
+      model_flags->add_input_arrays();
+    }
+    auto* shape = model_flags->mutable_input_arrays(0)->mutable_shape();
+    shape->clear_dims();
+    const IntList& list = parsed_model_flags.input_shape.value();
+    for (auto& dim : list.elements) {
+      shape->add_dims(dim);
+    }
+  }
+  if (parsed_model_flags.input_shapes.specified()) {
+    QCHECK(uses_multi_input_flags);
+    std::vector<string> input_shapes =
+        absl::StrSplit(parsed_model_flags.input_shapes.value(), ':');
+    QCHECK(input_shapes.size() == model_flags->input_arrays_size());
+    for (int i = 0; i < input_shapes.size(); ++i) {
+      auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape();
+      shape->clear_dims();
+      for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
+        int size;
+        CHECK(absl::SimpleAtoi(dim_str, &size))
+            << "Failed to parse input_shape: " << input_shapes[i];
+        shape->add_dims(size);
+      }
+    }
+  }
+
+#define READ_MODEL_FLAG(name)                                   \
+  do {                                                          \
+    if (parsed_model_flags.name.specified()) {                  \
+      model_flags->set_##name(parsed_model_flags.name.value()); \
+    }                                                           \
+  } while (false)
+
+  READ_MODEL_FLAG(variable_batch);
+
+#undef READ_MODEL_FLAG
+
+  for (const auto& element : parsed_model_flags.rnn_states.value().elements) {
+    auto* rnn_state_proto = model_flags->add_rnn_states();
+    for (const auto& kv_pair : element) {
+      const string& key = kv_pair.first;
+      const string& value = kv_pair.second;
+      if (key == "state_array") {
+        rnn_state_proto->set_state_array(value);
+      } else if (key == "back_edge_source_array") {
+        rnn_state_proto->set_back_edge_source_array(value);
+      } else if (key == "size") {
+        int32 size = 0;
+        CHECK(absl::SimpleAtoi(value, &size));
+        CHECK_GT(size, 0);
+        rnn_state_proto->set_size(size);
+      } else if (key == "manually_create") {
+        CHECK_EQ(absl::AsciiStrToLower(value), "true");
+        rnn_state_proto->set_manually_create(true);
+      } else {
+        LOG(FATAL) << "Unknown key '" << key << "' in --rnn_states";
+      }
+    }
+    CHECK(rnn_state_proto->has_state_array() &&
+          rnn_state_proto->has_back_edge_source_array() &&
+          rnn_state_proto->has_size())
+        << "--rnn_states must include state_array, back_edge_source_array and "
+           "size.";
+  }
+
+  for (const auto& element : parsed_model_flags.model_checks.value().elements) {
+    auto* model_check_proto = model_flags->add_model_checks();
+    for (const auto& kv_pair : element) {
+      const string& key = kv_pair.first;
+      const string& value = kv_pair.second;
+      if (key == "count_type") {
+        model_check_proto->set_count_type(value);
+      } else if (key == "count_min") {
+        int32 count = 0;
+        CHECK(absl::SimpleAtoi(value, &count));
+        CHECK_GE(count, -1);
+        model_check_proto->set_count_min(count);
+      } else if (key == "count_max") {
+        int32 count = 0;
+        CHECK(absl::SimpleAtoi(value, &count));
+        CHECK_GE(count, -1);
+        model_check_proto->set_count_max(count);
+      } else {
+        LOG(FATAL) << "Unknown key '" << key << "' in --model_checks";
+      }
+    }
+  }
+
+  CheckInputArraysAreNotOutputArrays(*model_flags);
+}
+
+ParsedModelFlags* UncheckedGlobalParsedModelFlags(bool must_already_exist) {
+  static auto* flags = [must_already_exist]() {
+    if (must_already_exist) {
+      fprintf(stderr, __FILE__
+              ":"
+              "GlobalParsedModelFlags() used without initialization\n");
+      fflush(stderr);
+      abort();
+    }
+    return new toco::ParsedModelFlags;
+  }();
+  return flags;
+}
+
+ParsedModelFlags* GlobalParsedModelFlags() {
+  return UncheckedGlobalParsedModelFlags(true);
+}
+
+void ParseModelFlagsOrDie(int* argc, char* argv[]) {
+  // TODO(aselle): in the future allow Google version to use
+  // flags, and only use this mechanism for open source
+  auto* flags = UncheckedGlobalParsedModelFlags(false);
+  string msg;
+  bool model_success =
+      toco::ParseModelFlagsFromCommandLineFlags(argc, argv, &msg, flags);
+  if (!model_success || !msg.empty()) {
+    // Log in non-standard way since this happens pre InitGoogle.
+    fprintf(stderr, "%s", msg.c_str());
+    fflush(stderr);
+    abort();
+  }
+}
+
+void CheckInputArraysAreNotOutputArrays(const ModelFlags& model_flags) {
+  for (const auto& input_array : model_flags.input_arrays()) {
+    for (const string& output_array : model_flags.output_arrays()) {
+      QCHECK_NE(input_array.name(), output_array)
+          << "The array " << output_array
+          << " is listed in both --input_arrays and --output_arrays.";
+    }
+  }
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.h b/tensorflow/contrib/lite/toco/model_cmdline_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..61bcde234e56a0fcc8079ca0f8941f9ac2c9804a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/args.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/types.pb.h"
+
+namespace toco {
+// Parse and remove arguments for models (in toco). Returns true if parsing
+// is successful. msg has the usage string if there was an error or
+// "--help" was specified
+bool ParseModelFlagsFromCommandLineFlags(
+    int* argc, char* argv[], string* msg,
+    ParsedModelFlags* parsed_model_flags_ptr);
+// Populate the ModelFlags proto with model data.
+void ReadModelFlagsFromCommandLineFlags(
+    const ParsedModelFlags& parsed_model_flags, ModelFlags* model_flags);
+// Parse the global model flags to a static
+void ParseModelFlagsOrDie(int* argc, char* argv[]);
+// Get the global parsed model flags
+ParsedModelFlags* GlobalParsedModelFlags();
+
+void CheckInputArraysAreNotOutputArrays(const ModelFlags& model_flags);
+
+}  // namespace toco
+
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_MODEL_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
new file mode 100644
index 0000000000000000000000000000000000000000..05c48bc369bd5cd023b290dab24140a765c8f715
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -0,0 +1,150 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+syntax = "proto2";
+import "tensorflow/contrib/lite/toco/types.proto";
+
+package toco;
+
+message InputArrayShape {
+  repeated int32 dims = 2;
+}
+
+// Next ID to USE: 7.
+message InputArray {
+  // Name of the input arrays, i.e. the arrays from which input activations
+  // will be read.
+  optional string name = 1;
+
+  // Shape of the input.  For many applications the dimensions are {batch,
+  // height, width, depth}.  Often the batch is left "unspecified" by providing
+  // a value of -1.
+  //
+  // The last dimension is typically called 'depth' or 'channels'. For example,
+  // for an image model taking RGB images as input, this would have the value 3.
+  optional InputArrayShape shape = 6;
+
+  // mean_value and std_value parameters control the interpretation of raw input
+  // activation values (elements of the input array) as real numbers. The
+  // mapping is given by:
+  //
+  //    real_value = (raw_input_value - mean_value) / std_value
+  //
+  // In particular, the defaults (mean_value=0, std_value=1) yield
+  // real_value = raw_input_value. Often, non-default values are used in image
+  // models. For example, an image model taking uint8 image channel values as
+  // its raw inputs, in [0, 255] range, may use mean_value=128, std_value=128 to
+  // map them into the interval [-1, 1).
+  //
+  // Note: this matches exactly the meaning of mean_value and std_value in
+  // (TensorFlow via LegacyFedInput).
+  optional float mean_value = 3;
+  optional float std_value = 4 [default = 1.];
+
+  // Data type of the input.
+  //
+  // In many graphs, the input arrays already have defined data types,
+  // e.g. Placeholder nodes in a TensorFlow GraphDef have a dtype attribute.
+  // In those cases, it is not needed to specify this data_type flag.
+  // The purpose of this flag is only to define the data type of input
+  // arrays whose type isn't defined in the input graph file. For example,
+  // when specifying an arbitrary (not Placeholder) --input_array into
+  // a TensorFlow GraphDef.
+  //
+  // When this data_type is quantized (e.g. QUANTIZED_UINT8), the
+  // corresponding quantization parameters are the mean_value, std_value
+  // fields.
+  //
+  // It is also important to understand the nuance between this data_type
+  // flag and the inference_input_type in TocoFlags. The basic difference
+  // is that this data_type (like all ModelFlags) describes a property
+  // of the input graph, while inference_input_type (like all TocoFlags)
+  // describes an aspect of the toco transformation process and thus of
+  // the output file. The types of input arrays may be different between
+  // the input and output files if quantization or dequantization occurred.
+  // Such differences can only occur for real-number data i.e. only
+  // between FLOAT and quantized types (e.g. QUANTIZED_UINT8).
+  optional IODataType data_type = 5;
+}
+
+message RnnState {
+  optional string state_array = 1;
+  optional string back_edge_source_array = 2;
+  optional bool discardable = 5;
+  // TODO(benoitjacob): drop the 'size' field. Should be redundant with
+  // --input_shapes and shapes propagation.
+  optional int32 size = 3;
+  // TODO(benoitjacob): manually_create is a temporary hack:
+  // due to discrepancies between the current toco dims tracking and
+  // TensorFlow shapes, for some models we need to manually create RNN state
+  // arrays with a specified shape.
+  // Maybe we should actually implement back-edges as operators of their own,
+  // which would remove the need for much special-casing, including here,
+  // we could probably consistently let PropagateFixedSizes handle state
+  // arrays.
+  // TODO(benoitjacob): should really drop manually_create now.
+  optional bool manually_create = 4;
+}
+
+// ModelFlags encodes properties of a model that, depending on the file
+// format, may or may not be recorded in the model file. The purpose of
+// representing these properties in ModelFlags is to allow passing them
+// separately from the input model file, for instance as command-line
+// parameters, so that we can offer a single uniform interface that can
+// handle files from different input formats.
+//
+// For each of these properties, and each supported file format, we
+// detail in comments below whether the property exists in the given file
+// format.
+//
+// Obsolete flags that have been removed:
+//   optional int32 input_depth = 3;
+//   optional int32 input_width = 4;
+//   optional int32 input_height = 5;
+//   optional int32 batch = 6 [ default = 1];
+//   optional float mean_value = 7;
+//   optional float std_value = 8 [default = 1.];
+//   optional int32 input_dims = 11 [ default = 4];
+//   repeated int32 input_shape = 13;
+//
+// Next ID to USE: 16.
+message ModelFlags {
+  // Information about the input arrays, i.e. the arrays from which input
+  // activations will be read.
+  repeated InputArray input_arrays = 1;
+
+  // Name of the output arrays, i.e. the arrays into which output activations
+  // will be written.
+  repeated string output_arrays = 2;
+
+  // If true, the model accepts an arbitrary batch size. Mutually exclusive with
+  // the 'batch' field: at most one of these two fields can be set.
+  optional bool variable_batch = 10;
+
+  repeated RnnState rnn_states = 12;
+
+  // Checks applied to the model, typically after toco's comprehensive
+  // graph transformations.
+  // Next ID to USE: 4.
+  message ModelCheck {
+    // Use the name of a type of operator to check its counts.
+    // Use "Total" for overall operator counts.
+    // Use "Arrays" for overall array counts.
+    optional string count_type = 1 [default = "None"];
+    // A count of zero is a meaningful check, so negative used to mean disable.
+    optional int32 count_min = 2 [default = -1];
+    // If count_max < count_min, then count_min is only allowed value.
+    optional int32 count_max = 3 [default = -1];
+  }
+  repeated ModelCheck model_checks = 14;
+}
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..17115047d2ef93cce7004926c2b1a4bfa58f6243
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -0,0 +1,77 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+cc_library(
+    name = "toco_python_api",
+    srcs = ["toco_python_api.cc"],
+    hdrs = ["toco_python_api.h"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:model_flags_proto_cc",
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_cc",
+        "//tensorflow/contrib/lite/toco:toco_port",
+        "//tensorflow/contrib/lite/toco:toco_tooling",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "tensorflow_wrap_toco",
+    srcs = ["toco.i"],
+    deps = [
+        ":toco_python_api",
+        "//tensorflow/contrib/lite/toco:model_flags_proto_cc",
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_cc",
+        "//util/python:python_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+py_binary(
+    name = "toco_from_protos",
+    srcs = ["toco_from_protos.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tensorflow_wrap_toco",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_binary(
+    name = "toco_wrapper",
+    srcs = ["toco_wrapper.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+tf_py_test(
+    name = "toco_from_protos_test",
+    srcs = ["toco_from_protos_test.py"],
+    additional_deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/lite/toco:model_flags_proto_py",
+        "//tensorflow/contrib/lite/toco:toco_flags_proto_py",
+    ],
+    data = [
+        ":toco_from_protos",
+    ],
+    tags = ["no_pip"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/toco/python/toco.i b/tensorflow/contrib/lite/toco/python/toco.i
new file mode 100644
index 0000000000000000000000000000000000000000..3787cba4a371f1893d877daadcfe31e59eb5b3f6
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/toco.i
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "std_string.i"
+
+%{
+#include "tensorflow/contrib/lite/toco/python/toco_python_api.h"
+%}
+
+namespace toco {
+
+// Convert a model represented in `input_contents`. `model_flags_proto`
+// describes model parameters. `toco_flags_proto` describes conversion
+// parameters (see relevant .protos for more information). Returns a string
+// representing the contents of the converted model.
+PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
+                        PyObject* toco_flags_proto_txt_raw,
+                        PyObject* input_contents_txt_raw);
+
+} // namespace toco
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos.py b/tensorflow/contrib/lite/toco/python/toco_from_protos.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0b032083b2347424b9fd85ab2440e18c0f68e91
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos.py
@@ -0,0 +1,63 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python console command to invoke TOCO from serialized protos."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import sys
+from tensorflow.contrib.lite.toco.python import tensorflow_wrap_toco
+from tensorflow.python.platform import app
+
+FLAGS = None
+
+
+def execute(unused_args):
+  model_str = open(FLAGS.model_proto_file, "rb").read()
+  toco_str = open(FLAGS.toco_proto_file, "rb").read()
+  input_str = open(FLAGS.model_input_file, "rb").read()
+
+  output_str = tensorflow_wrap_toco.TocoConvert(model_str, toco_str, input_str)
+  open(FLAGS.model_output_file, "wb").write(output_str)
+  sys.exit(0)
+
+
+def main():
+  global FLAGS
+  parser = argparse.ArgumentParser(
+      description="Invoke toco using protos as input.")
+  parser.add_argument(
+      "model_proto_file",
+      type=str,
+      help="File containing serialized proto that describes the model.")
+  parser.add_argument(
+      "toco_proto_file",
+      type=str,
+      help="File containing serialized proto describing how TOCO should run.")
+  parser.add_argument(
+      "model_input_file", type=str, help="Input model is read from this file.")
+  parser.add_argument(
+      "model_output_file",
+      type=str,
+      help="Result of applying TOCO conversion is written here.")
+
+  FLAGS, unparsed = parser.parse_known_args()
+
+  app.run(main=execute, argv=[sys.argv[0]] + unparsed)
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c35b6f99259b762aa83d92d21512169a7ab50b70
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -0,0 +1,97 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+import tensorflow as tf
+from tensorflow.contrib.lite.toco import model_flags_pb2
+from tensorflow.contrib.lite.toco import toco_flags_pb2
+from tensorflow.contrib.lite.toco import types_pb2
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import resource_loader
+
+
+def TensorName(x):
+  """Get the canonical (non foo:0 name)."""
+  return x.name.split(":")[0]
+
+
+class TocoFromProtosTest(googletest.TestCase):
+
+  def _run(self, sess, in_tensor, out_tensor, should_succeed):
+    """Use toco binary to check conversion from graphdef to tflite.
+
+    Args:
+      sess: Active TensorFlow session containing graph.
+      in_tensor: TensorFlow tensor to use as input.
+      out_tensor: TensorFlow tensor to use as output.
+      should_succeed: Whether this is a valid conversion.
+    """
+    # Build all protos and extract graphdef
+    graph_def = sess.graph_def
+    toco_flags = toco_flags_pb2.TocoFlags()
+    toco_flags.input_format = toco_flags_pb2.TENSORFLOW_GRAPHDEF
+    toco_flags.output_format = toco_flags_pb2.TFLITE
+    toco_flags.inference_input_type = types_pb2.FLOAT
+    toco_flags.inference_type = types_pb2.FLOAT
+    model_flags = model_flags_pb2.ModelFlags()
+    input_array = model_flags.input_arrays.add()
+    input_array.name = TensorName(in_tensor)
+    input_array.shape.dims.extend(map(int, in_tensor.get_shape()))
+    model_flags.output_arrays.append(TensorName(out_tensor))
+    # Shell out to run toco (in case it crashes)
+    with tempfile.NamedTemporaryFile() as fp_toco, \
+           tempfile.NamedTemporaryFile() as fp_model, \
+           tempfile.NamedTemporaryFile() as fp_input, \
+           tempfile.NamedTemporaryFile() as fp_output:
+      fp_model.write(model_flags.SerializeToString())
+      fp_toco.write(toco_flags.SerializeToString())
+      fp_input.write(graph_def.SerializeToString())
+      fp_model.flush()
+      fp_toco.flush()
+      fp_input.flush()
+      tflite_bin = resource_loader.get_path_to_datafile("toco_from_protos")
+      cmdline = " ".join([
+          tflite_bin, fp_model.name, fp_toco.name, fp_input.name, fp_output.name
+      ])
+      exitcode = os.system(cmdline)
+      if exitcode == 0:
+        stuff = fp_output.read()
+        self.assertEqual(stuff is not None, should_succeed)
+      else:
+        self.assertFalse(should_succeed)
+
+  def test_toco(self):
+    """Run a couple of TensorFlow graphs against TOCO through the python bin."""
+    with tf.Session() as sess:
+      img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+      val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+      out = tf.identity(val, name="out")
+      out2 = tf.sin(val, name="out2")
+      # This is a valid mdoel
+      self._run(sess, img, out, True)
+      # This uses an invalid function.
+      # TODO(aselle): Check to make sure a warning is included.
+      self._run(sess, img, out2, True)
+      # This is an identity graph, which doesn't work
+      self._run(sess, img, img, False)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a5e483f3f1676ebed3244bd6f7eb610fad21557
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -0,0 +1,85 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/python/toco_python_api.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_tooling.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+
+namespace toco {
+
+#if PY_MAJOR_VERSION >= 3
+#define TOCO_PY_TO_CPPSTRING PyBytes_AsStringAndSize
+#define TOCO_FROM_CPPSTRING_TO_PY PyBytes_FromStringAndSize
+#else
+#define TOCO_PY_TO_CPPSTRING PyString_AsStringAndSize
+#define TOCO_FROM_CPPSTRING_TO_PY PyString_FromStringAndSize
+#endif
+
+// NOTE(aselle): We are using raw PyObject's here because we want to make
+// sure we input and output bytes rather than unicode strings for Python3.
+PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
+                      PyObject* toco_flags_proto_txt_raw,
+                      PyObject* input_contents_txt_raw) {
+  // Use Python C API to validate and convert arguments. In py3 (bytes),
+  // in py2 (str).
+  auto ConvertArg = [&](PyObject* obj, bool* error) {
+    char* buf;
+    Py_ssize_t len;
+    if (TOCO_PY_TO_CPPSTRING(obj, &buf, &len) == -1) {
+      *error = true;
+      return std::string();
+    } else {
+      *error = false;
+      return std::string(buf, len);
+    }
+  };
+
+  bool error;
+  std::string model_flags_proto_txt =
+      ConvertArg(model_flags_proto_txt_raw, &error);
+  if (error) return nullptr;
+  std::string toco_flags_proto_txt =
+      ConvertArg(toco_flags_proto_txt_raw, &error);
+  if (error) return nullptr;
+  std::string input_contents_txt = ConvertArg(input_contents_txt_raw, &error);
+  if (error) return nullptr;
+
+  // Use toco to produce new outputs
+  toco::ModelFlags model_flags;
+  if (!model_flags.ParseFromString(model_flags_proto_txt)) {
+    LOG(FATAL) << "Model proto failed to parse." << std::endl;
+  }
+  toco::TocoFlags toco_flags;
+  if (!toco_flags.ParseFromString(toco_flags_proto_txt)) {
+    LOG(FATAL) << "Toco proto failed to parse." << std::endl;
+  }
+  std::unique_ptr<toco::Model> model =
+      toco::Import(toco_flags, model_flags, input_contents_txt);
+  toco::Transform(toco_flags, model.get());
+  string output_file_contents_txt;
+  Export(toco_flags, *model, &output_file_contents_txt);
+
+  // Convert arguments back to byte (py3) or str (py2)
+  return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
+                                   output_file_contents_txt.size());
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.h b/tensorflow/contrib/lite/toco/python/toco_python_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc378353f79945f4fbb72305899b2b604be785ad
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+#define _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
+
+#include <string>
+#include <Python.h>
+
+namespace toco {
+
+// Convert a model represented in `input_contents`. `model_flags_proto`
+// describes model parameters. `toco_flags_proto` describes conversion
+// parameters (see relevant .protos for more information). Returns a string
+// representing the contents of the converted model.
+PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
+                      PyObject* toco_flags_proto_txt_raw,
+                      PyObject* input_contents_txt_raw);
+
+}  // namespace toco
+
+#endif  // _THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_PYTHON_TOCO_PYTHON_API_H_
diff --git a/tensorflow/contrib/lite/toco/python/toco_wrapper.py b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..e39b5f22c7c8ffafaf72129be6f54090e6761dc3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
@@ -0,0 +1,35 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper for runninmg toco binary embedded in pip site-package.
+
+NOTE: this mainly exists since PIP setup.py cannot install binaries to bin/.
+It can only install Python "console-scripts." This will work as a console
+script. See tools/pip_package/setup.py (search for CONSOLE_SCRIPTS).
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import tensorflow as tf
+
+
+def main():
+  # Pip installs the binary in aux-bin off of main site-package install.
+  # Just find it and exec, passing all arguments in the process.
+  # TODO(aselle): it is unfortunate to use all of tensorflow to lookup binary.
+  binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
+  os.execvp(binary, sys.argv)
diff --git a/tensorflow/contrib/lite/toco/runtime/common.h b/tensorflow/contrib/lite/toco/runtime/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd55544f57f9a266514e878edd8f1f7dec1cb7b7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/runtime/common.h
@@ -0,0 +1,26 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
+
+#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#ifdef GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
+#define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#endif
+#endif
+
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_COMMON_H_
diff --git a/tensorflow/contrib/lite/toco/runtime/types.h b/tensorflow/contrib/lite/toco/runtime/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..df63b2d59ea2a98f1ec9009614c18791e8822c14
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/runtime/types.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
+
+#include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
+#include "tensorflow/contrib/lite/kernels/internal/types.h"
+
+namespace toco {
+
+// TODO(ahentz): These are just stopgaps for now, untils we move all
+// the code over to tflite.
+using tflite::Dims;
+using tflite::FusedActivationFunctionType;
+using tflite::RequiredBufferSizeForDims;
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_RUNTIME_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0c1a1141fca91e7d27fe48ffae4f834ae92a1e08
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
@@ -0,0 +1,102 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "cluster_utils",
+    srcs = [
+        "cluster_utils.cc",
+    ],
+    hdrs = [
+        "cluster_utils.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite/toco:toco_port",
+    ],
+)
+
+cc_library(
+    name = "cluster",
+    srcs = [
+        "cluster.cc",
+    ],
+    hdrs = [
+        "cluster.h",
+    ],
+    deps = [
+        ":cluster_utils",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "resolve_svdf",
+    srcs = [
+        "resolve_svdf.cc",
+    ],
+    hdrs = [
+        "resolve_svdf.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cluster",
+        ":cluster_utils",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:toco_port",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cc_test(
+    name = "resolve_svdf_test",
+    srcs = ["resolve_svdf_test.cc"],
+    deps = [
+        ":cluster",
+        ":cluster_utils",
+        ":resolve_cluster",
+        ":resolve_svdf",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "resolve_cluster",
+    srcs = [
+        "resolve_cluster.cc",
+    ],
+    hdrs = [
+        "resolve_cluster.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cluster",
+        ":cluster_utils",
+        ":resolve_svdf",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98a130ea39c45c2c8259c87779532a312433c5a7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.cc
@@ -0,0 +1,52 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+
+namespace toco {
+
+void Cluster::SetGraphDefInfo(const tensorflow::GraphDef* graph_def) {
+  graph_def_ = graph_def;
+  for (const tensorflow::NodeDef& node : graph_def_->node()) {
+    if (StrContains(node.name(), name_)) {
+      nodes_.push_back(&node);
+    }
+  }
+}
+
+bool Cluster::FindClusterInputsAndOutputs() {
+  // For every node N in the graph:
+  // If N belongs to this cluster C, then each of N's inputs that are not part
+  // of C are then inputs of C.
+  // If N does not belong to cluster C, then each of N's inputs that belong to C
+  // are then outputs of C.
+  for (const tensorflow::NodeDef& node : graph_def_->node()) {
+    if (StrContains(node.name(), name_)) {
+      for (int i = 0; i < node.input_size(); i++) {
+        if (!StrContains(node.input(i), name_)) {
+          inputs_.push_back(node.input(i));
+        }
+      }
+    } else {
+      for (int i = 0; i < node.input_size(); i++) {
+        if (StrContains(node.input(i), name_)) {
+          outputs_.push_back(node.input(i));
+        }
+      }
+    }
+  }
+  return (!inputs_.empty()) && (!outputs_.empty());
+}
+
+}  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
new file mode 100644
index 0000000000000000000000000000000000000000..18ff73ac3936cc973ce16ca88e6a94055fabcf7a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h
@@ -0,0 +1,101 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+
+namespace toco {
+
+// The base class for Cluster. A cluster is group of nodes all related to each
+// other because their name match a given "pattern", which shows they all belong
+// to a composite op supported in TFLite. The nodes in a cluster will be
+// collapsed into a single composite op node plus a series of constant nodes
+// holding the input parameters to that node. The nodes in a cluster are assumed
+// to be using the same device. By changing the "pattern" we can have different
+// subclasses of the base Cluster class.
+class Cluster {
+ public:
+  virtual ~Cluster() {}
+
+  virtual void CreateNodes() = 0;
+
+  // Save the following info from the original GraphDef this cluster is from:
+  // 1- a pointer to the GraphDef
+  // 2- All the nodes in GraphDef which belong to this cluster.
+  void SetGraphDefInfo(const tensorflow::GraphDef* graph_def);
+
+  const string& GetName() const { return name_; }
+
+  const std::vector<std::unique_ptr<tensorflow::NodeDef>>& GetNewNodes() const {
+    return new_nodes_;
+  }
+
+  const std::vector<const tensorflow::NodeDef*>& GetNodes() { return nodes_; }
+
+  void SetName(const string& name) { name_ = name; }
+
+  void SetDevice(const string& device) { device_ = device; }
+
+  // Find the input(s) and output(s) of this Cluster.
+  bool FindClusterInputsAndOutputs();
+
+ protected:
+  string name_;
+  string device_;
+  std::vector<string> inputs_;
+  std::vector<string> outputs_;
+
+  // Used to hold the pointers to nodes which are in this cluster. These nodes
+  // are pointing to the nodes in graph_def_.
+  std::vector<const tensorflow::NodeDef*> nodes_;
+
+  // Used to cache the newly generated nodes: like the nodes created by
+  // collapsing Const nodes, or the nodes which is used to show the composite
+  // op.
+  std::vector<std::unique_ptr<tensorflow::NodeDef>> new_nodes_;
+
+  const tensorflow::GraphDef* graph_def_; /*Not owned*/
+};
+
+// A factory interface for cluster class.
+// It defines a virtual function interface which is responsible for creating
+// a cluster. Each cluster factory is responsible to pack a cluster of nodes
+// into a cluster using a name-based pattern matching approach.
+class ClusterFactoryInterface {
+ public:
+  virtual ~ClusterFactoryInterface() {}
+
+  // Creates a cluster of nodes using a name-based pattern matching approach. It
+  // uses a node as a seed and if its name matches a certain pattern, then it
+  // builds the cluster around that node.
+  virtual std::unique_ptr<Cluster> CreateCluster(
+      const tensorflow::NodeDef& node,
+      const tensorflow::GraphDef& graph_def) const = 0;
+};
+
+}  // end namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTER_H
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..14c3cd6487841d6d79b583d9245c130585324d9d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.cc
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+namespace toco {
+
+bool StrContains(const string& x, const string& search_pattern) {
+  return x.find(search_pattern) != string::npos;
+}
+
+void Transpose2DTensor(const float* tensor, int row, int col,
+                       float* transposed_tensor) {
+  float* result = transposed_tensor;
+  for (int r = 0; r < row; ++r) {
+    for (int c = 0; c < col; ++c) {
+      *(result + c * row) = *tensor++;
+    }
+    ++result;
+  }
+}
+
+}  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..a15e480e7007c21045dbc77052dc1ab70c2c5861
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h
@@ -0,0 +1,33 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTERUTILS_H
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTERUTILS_H
+
+#include <string>
+
+namespace toco {
+
+// Check if string x includes string search_pattern.
+bool StrContains(const string& x, const string& search_pattern);
+
+// Transpose a 2D tensor of size row * col pointed by "tensor" and return the
+// results in "transposed_tensor". "transposed_tensor" must be pre-allocated
+// by the same size as "tensor".
+void Transpose2DTensor(const float* tensor, int row, int col,
+                       float* transposed_tensor);
+
+}  // end namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_CLUSTERUTILS_H
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fddf6cc83686632033f31496ec42b33e2ea15f20
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.cc
@@ -0,0 +1,151 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace toco {
+
+using tensorflow::GraphDef;
+using tensorflow::NodeDef;
+
+void AddNodeToGraph(const NodeDef& node,
+                    const std::vector<string>& cluster_names, GraphDef* graph) {
+  NodeDef* new_node = graph->add_node();
+  new_node->set_op(node.op());
+  new_node->set_name(node.name());
+  new_node->set_device(node.device());
+  // If the inputs are coming from a node which belongs to another cluster, then
+  // those inputs are renamed to the source cluster name. Otherwise the original
+  // input name is used.
+  for (const string& node_input : node.input()) {
+    bool input_from_cluster = false;
+    for (const string& cluster_name : cluster_names) {
+      if (StrContains(node_input, cluster_name) &&
+          !StrContains(node.name(), cluster_name)) {
+        new_node->add_input(cluster_name);
+        input_from_cluster = true;
+        break;
+      }
+    }
+    if (!input_from_cluster) {
+      new_node->add_input(node_input);
+    }
+  }
+  for (const auto& attr : node.attr()) {
+    (*new_node->mutable_attr())[attr.first] = attr.second;
+  }
+}
+
+bool FindCluster(const ClusterFactoryInterface& cluster_factory,
+                 const GraphDef& graph_def,
+                 std::unordered_map<string, bool>* is_node_in_cluster,
+                 std::vector<std::unique_ptr<Cluster>>* clusters) {
+  for (const NodeDef& node : graph_def.node()) {
+    // If the node is not assigned to any cluster, then we check if it belong to
+    // the cluster_factory.
+    bool node_in_cluster = (*is_node_in_cluster)[node.name()];
+    if (!node_in_cluster) {
+      std::unique_ptr<Cluster> cluster =
+          cluster_factory.CreateCluster(node, graph_def);
+      if (cluster) {
+        // Label all the nodes in is_node_in_cluster which are in this cluster
+        // as belonged to this cluster.
+        for (const NodeDef* cluster_node : cluster->GetNodes()) {
+          (*is_node_in_cluster)[cluster_node->name()] = true;
+        }
+        clusters->push_back(std::move(cluster));
+      }
+    }
+  }
+  return (!clusters->empty());
+}
+
+std::unique_ptr<GraphDef> MaybeResolveClusters(
+    const GraphDef& graph_def,
+    const std::vector<ClusterFactoryInterface*>& cluster_factories) {
+  std::unique_ptr<GraphDef> pruned_graph(new GraphDef);
+  // The structure to keep track of which cluster each node is assigned to, and
+  // to initialize them to all un-assigned,
+  std::unordered_map<string, bool> is_node_in_cluster;
+  for (const NodeDef& node : graph_def.node()) {
+    is_node_in_cluster[node.name()] = false;
+  }
+
+  std::vector<string> cluster_names;
+  std::vector<std::unique_ptr<Cluster>> all_clusters;
+  // Find the clusters for all available cluster factories.
+  for (const ClusterFactoryInterface* cluster_factory : cluster_factories) {
+    std::vector<std::unique_ptr<Cluster>> clusters;
+    if (FindCluster(*cluster_factory, graph_def, &is_node_in_cluster,
+                    &clusters)) {
+      for (auto itr = clusters.begin(); itr != clusters.end(); ++itr) {
+        cluster_names.push_back((*itr)->GetName());
+        (*itr)->CreateNodes();
+        all_clusters.push_back(std::move(*itr));
+      }
+    }
+  }
+
+  for (const std::unique_ptr<Cluster>& cluster : all_clusters) {
+    for (const std::unique_ptr<tensorflow::NodeDef>& src_node :
+         cluster->GetNewNodes()) {
+      // Add it to the output GraphDef.
+      AddNodeToGraph(*src_node, cluster_names, pruned_graph.get());
+    }
+  }
+
+  // Add any node which is not part of a cluster.
+  for (const NodeDef& node : graph_def.node()) {
+    bool node_in_cluster = is_node_in_cluster[node.name()];
+    if (!node_in_cluster) {
+      AddNodeToGraph(node, cluster_names, pruned_graph.get());
+    }
+  }
+
+  if (pruned_graph->node_size() == 0) {
+    return nullptr;
+  } else {
+    return pruned_graph;
+  }
+}
+
+std::unique_ptr<GraphDef> MaybeReplaceCompositeSubgraph(
+    const GraphDef& tf_graph) {
+  SvdfClusterFactory svdf_cluster_factory;
+
+  std::vector<ClusterFactoryInterface*> cluster_factories;
+  cluster_factories.push_back(&svdf_cluster_factory);
+
+  std::unique_ptr<GraphDef> pruned_graph =
+      MaybeResolveClusters(tf_graph, cluster_factories);
+
+  // Copy function definitions
+  *(pruned_graph->mutable_library()) = tf_graph.library();
+  return pruned_graph;
+}
+
+}  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d33dd1885ed9bbc938d4020d13e2b3deb0047f3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h
@@ -0,0 +1,63 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace toco {
+
+// Given a graph info and a list of cluster classes (cluster_factories), it
+// partitions the graph to clusters, and then collapses each cluster into their
+// corresponding composite ops. It generates a new graph using the newly
+// generated composite ops. Each cluster factory is responsible to recognize a
+// cluster of nodes into a cluster using a name-based pattern matching approach.
+std::unique_ptr<tensorflow::GraphDef> MaybeResolveClusters(
+    const tensorflow::GraphDef& graph_def,
+    const std::vector<ClusterFactoryInterface*>& cluster_factories);
+
+// Adds a node to a given graph. The added node will be a copy of a given source
+// node, except for the inputs. If the inputs are coming from a node which
+// belongs to another cluster, then those inputs are renamed to the source
+// cluster name.
+void AddNodeToGraph(const tensorflow::NodeDef& node,
+                    const std::vector<string>& cluster_names,
+                    tensorflow::GraphDef* graph);
+
+// Given a graph and a cluster class, it finds all the nodes which belong to a
+// given class factory, encapsulate them inside a cluster of the given type and
+// returns a vector of those clusters. It also labels the nodes in that graph if
+// they belong to the generated clusters.
+bool FindCluster(const ClusterFactoryInterface& cluster_factory,
+                 const tensorflow::GraphDef& graph_def,
+                 std::unordered_map<string, bool>* is_node_in_cluster,
+                 std::vector<std::unique_ptr<Cluster>>* clusters);
+
+// Receives a graph and generates another graph by replacing the cluster of
+// nodes which matches a given composite op. Each composite op is represented
+// using a class factory.
+std::unique_ptr<tensorflow::GraphDef> MaybeReplaceCompositeSubgraph(
+    const tensorflow::GraphDef& tf_graph);
+
+}  // end namespace toco
+
+#endif  // CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_CLUSTER_H
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6a099817c7b88c7dcd9c3e4e8b131c2a25cffcd
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.cc
@@ -0,0 +1,285 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+
+#include <ctype.h>
+#include <stddef.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "google/protobuf/map.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/platform/logging.h"
+
+using tensorflow::GraphDef;
+using tensorflow::NodeDef;
+
+namespace toco {
+
+namespace {
+
+// Receives a vector of cluster nodes and returns only those which are array
+// partitions (of type 'Const' and have the pattern 'part_<.*>' in their name.
+// Since these nodes are connected to a Concatenate node, it makes sure the
+// axis value input of the Concatenate operator is 0.
+void FilterPartitionedConstNodes(
+    const string& const_pattern,
+    const std::vector<const NodeDef*>& cluster_nodes,
+    std::vector<const NodeDef*>* const_node_parts) {
+  for (const NodeDef* node : cluster_nodes) {
+    string node_name_to_upper = node->name();
+    std::transform(node_name_to_upper.begin(), node_name_to_upper.end(),
+                   node_name_to_upper.begin(), ::toupper);
+    if (StrContains(node->name(), const_pattern) && node->op() == "Const") {
+      if (StrContains(node_name_to_upper, "/PART_")) {
+        const_node_parts->push_back(node);
+      } else if (StrContains(node->name(), "AXIS") &&
+                 StrContains(node->name(), "CONCAT")) {
+        // For now only supporting Concatenate on Axix 0
+        const auto& value_attr = node->attr().at("value");
+        const tensorflow::TensorProto& tensor = value_attr.tensor();
+        CHECK_EQ(tensor.int_val(0), 0);
+      }
+    }
+  }
+  sort(const_node_parts->begin(), const_node_parts->end(),
+       [](const NodeDef* a, const NodeDef* b) {
+         return (a->name().compare(b->name()) < 0 &&
+                 (a->name().size() < b->name().size()));
+       });
+}
+
+}  // namespace
+
+// SvdfCluster methods
+
+int SvdfCluster::InferFilterRank() {
+  for (const NodeDef* node : nodes_) {
+    if (StrContains(node->name(), "Reshape/shape")) {
+      const auto& value_attr = node->attr().at("value");
+      const tensorflow::TensorProto& tensor = value_attr.tensor();
+      std::vector<int32> shape_values(
+          tensor.tensor_content().size() / sizeof(int), 0);
+      port::CopyToBuffer(tensor.tensor_content(),
+                         reinterpret_cast<char*>(shape_values.data()));
+      CHECK_EQ(shape_values.size(), 3);
+      // shape_value array is arranged as:
+      // [num_units, rank, -1]
+      CHECK_EQ(shape_values[2], -1);
+      return shape_values[1];
+    }
+  }
+  return -1;
+}
+
+void SvdfCluster::CreateNodes() {
+  for (const string& const_pattern : const_node_patterns_) {
+    CreateConstNode(const_pattern);
+  }
+  std::unique_ptr<tensorflow::NodeDef> svdf_node(new NodeDef);
+  svdf_node->set_op("Svdf");
+  svdf_node->set_name(name_);
+  svdf_node->set_device(device_);
+
+  // Add the main input.
+  svdf_node->add_input(inputs_[0]);
+
+  // Add the rest of the inputs to Svdf cell: weights and bias.
+  CHECK(new_nodes_.size() == 3 || new_nodes_.size() == 2);
+  string* weights_feature_input = svdf_node->add_input();
+  string* weights_time_input = svdf_node->add_input();
+  string* bias_input;
+  if (new_nodes_.size() == 3) {
+    bias_input = svdf_node->add_input();
+  }
+  for (const std::unique_ptr<tensorflow::NodeDef>& node : new_nodes_) {
+    const string node_name = node->name();
+    if (StrContains(node_name, "SVDF_weights_feature")) {
+      *weights_feature_input = node_name;
+    } else if (StrContains(node_name, "SVDF_weights_time")) {
+      *weights_time_input = node_name;
+    } else if (StrContains(node_name, "SVDF_bias")) {
+      CHECK(bias_input) << "Bias input cannot be provided when there are only "
+                           "two Const input nodes!";
+      *bias_input = node_name;
+    } else {
+      // Unexpected input for Svdf op.
+      LOG(FATAL) << "Unexpected input node for SVDF op! Accepted inputs are: "
+                    "weights_feature, weights_time and bias.";
+    }
+  }
+  const int rank = InferFilterRank();
+  CHECK_GT(rank, 0);
+
+  // Add Svdf activation and rank.
+  string activation_function =
+      StrContains(outputs_[0], "Relu") ? "Relu" : "None";
+  (*svdf_node->mutable_attr())["ActivationFunction"].set_s(activation_function);
+  (*svdf_node->mutable_attr())["Rank"].set_i(rank);
+
+  // Finally add it to the list of the newly created nodes.
+  new_nodes_.push_back(std::move(svdf_node));
+}
+
+void SvdfCluster::CreateConstNode(const string& const_pattern) {
+  // Find the nodes with pattern like: "const_pattern"/part_xxx of type Const.
+  std::vector<const NodeDef*> const_node_parts;
+  FilterPartitionedConstNodes(const_pattern, nodes_, &const_node_parts);
+
+  if (const_node_parts.empty()) return;
+
+  bool transpose_tensor_value =
+      StrContains(const_pattern, "SVDF_weights_feature");
+
+  // Merge them if necessary.
+  std::unique_ptr<tensorflow::NodeDef> merged_node(new NodeDef);
+  MaybeMergeConstNodes(const_node_parts, transpose_tensor_value, merged_node);
+  new_nodes_.push_back(std::move(merged_node));
+}
+
+void SvdfCluster::MaybeMergeConstNodes(
+    const std::vector<const NodeDef*>& const_node_parts,
+    bool transpose_tensor_value,
+    const std::unique_ptr<tensorflow::NodeDef>& merged_node) {
+  merged_node->set_name(const_node_parts[0]->name());
+  merged_node->set_op("Const");
+  merged_node->set_device(const_node_parts[0]->device());
+  (*merged_node->mutable_attr())["dtype"].set_type(
+      const_node_parts[0]->attr().at("dtype").type());
+
+  // Figuring out Value attribute for the merged node.
+  // Assuming the partitioning is done on Axis 0.
+  // The attributes which are inferred:
+  // * Shape and dimensions
+  // * Float content values
+
+  // Inferring shape and dimension
+  int dim0_size = 0;
+  int dim1_size = 1;
+  tensorflow::TensorProto* allocated_tensor =
+      (*merged_node->mutable_attr())["value"].mutable_tensor();
+  tensorflow::TensorShapeProto* allocated_tensor_shape =
+      allocated_tensor->mutable_tensor_shape();
+  auto tensor_shape_dim0 = allocated_tensor_shape->add_dim();
+  int allocated_content_flat_size = 0;
+  for (int i = 0; i < const_node_parts.size(); i++) {
+    const auto& value_attr = const_node_parts[i]->attr().at("value");
+    const tensorflow::TensorProto& tensor = value_attr.tensor();
+    if (i == 0) {
+      allocated_tensor->set_dtype(tensor.dtype());
+    } else {
+      CHECK_EQ(allocated_tensor->dtype(), tensor.dtype());
+    }
+    allocated_content_flat_size += tensor.tensor_content().size();
+    CHECK(tensor.has_tensor_shape());
+    const tensorflow::TensorShapeProto shape = tensor.tensor_shape();
+    dim0_size += shape.dim(0).size();
+    for (int d = 1; d < shape.dim_size(); d++) {
+      if (i == 0) {
+        allocated_tensor_shape->add_dim()->set_size(shape.dim(d).size());
+        allocated_tensor_shape->set_unknown_rank(shape.unknown_rank());
+        dim1_size *= shape.dim(d).size();
+      } else {
+        CHECK_EQ(shape.dim(d).size(), allocated_tensor_shape->dim(d).size());
+        CHECK_EQ(allocated_tensor_shape->unknown_rank(), shape.unknown_rank());
+      }
+    }
+  }
+
+  // Copying the float content from each array partition.
+  std::unique_ptr<char[]> allocated_content(
+      new char[allocated_content_flat_size]);
+  char* content_ptr = allocated_content.get();
+  for (int i = 0; i < const_node_parts.size(); i++) {
+    const auto& value_attr = const_node_parts[i]->attr().at("value");
+    const tensorflow::TensorProto& tensor = value_attr.tensor();
+    port::CopyToBuffer(tensor.tensor_content(), content_ptr);
+    content_ptr += tensor.tensor_content().size();
+  }
+
+  // Transpose the tensor if needed.
+  if (transpose_tensor_value) {
+    // We use dimension 0 to show the row size for the tensor.
+    // We use multiplication of the rest of dimension size to for the col size
+    // of the tensor.
+    std::unique_ptr<float[]> transposed_tensor(
+        new float[dim0_size * dim1_size]);
+    Transpose2DTensor(reinterpret_cast<float*>(allocated_content.get()),
+                      dim0_size, dim1_size, transposed_tensor.get());
+    allocated_tensor_shape->clear_dim();
+    allocated_tensor_shape->add_dim()->set_size(dim1_size);
+    allocated_tensor_shape->add_dim()->set_size(dim0_size);
+
+    // Set the tensor attributes.
+    allocated_tensor->set_tensor_content(
+        string(reinterpret_cast<const char*>(transposed_tensor.get()),
+               allocated_content_flat_size));
+  } else {
+    tensor_shape_dim0->set_size(dim0_size);
+
+    // Set the tensor attributes.
+    allocated_tensor->set_tensor_content(
+        string(reinterpret_cast<const char*>(allocated_content.get()),
+               allocated_content_flat_size));
+  }
+}
+
+// SvdfClusterFactory methods
+
+std::unique_ptr<Cluster> SvdfClusterFactory::CreateCluster(
+    const NodeDef& node, const GraphDef& graph_def) const {
+  std::vector<string> node_patterns = {"SVDF_weights_feature",
+                                       "SVDF_weights_time", "SVDF_bias"};
+
+  string node_name_to_upper = node.name();
+  std::transform(node_name_to_upper.begin(), node_name_to_upper.end(),
+                 node_name_to_upper.begin(), ::toupper);
+  std::unique_ptr<SvdfCluster> cluster = nullptr;
+  if (node_name_to_upper.find("SVDF", 0) != string::npos) {
+    size_t weights_pos = node.name().find(node_patterns[0]);
+    if (weights_pos != string::npos) {
+      // Assuming the node name has a pattern like:
+      // "SOMESTRING1/CELLNAME/SEARCH_PATTERN/SOMESTRING2", we use
+      // CELLNAME as the cluster name.
+      size_t cell_pos = node.name().rfind("/", weights_pos - 2) + 1;
+      string cell_name =
+          node.name().substr(cell_pos, weights_pos - cell_pos - 1);
+      cluster = std::unique_ptr<SvdfCluster>(new SvdfCluster);
+      cluster->SetName(cell_name);
+      cluster->SetDevice(node.device());
+      cluster->SetGraphDefInfo(&graph_def);
+      CHECK(cluster->FindClusterInputsAndOutputs());
+
+      for (const string& const_pattern : node_patterns) {
+        cluster->AddConstNodePattern(const_pattern);
+      }
+    }
+  }
+  return std::move(cluster);
+}
+
+}  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4c6c341178e3acfc7bf5a4b8bf322f947ba088b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H
+#define TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace toco {
+
+class SvdfCluster : public Cluster {
+ public:
+  // For this cluster, it collapses all the nodes in nodes_ into a composite op
+  // and it returns all the newly generated ops in new_nodes_.
+  void CreateNodes() override;
+
+  // A helper function to set the pattern of Const nodes which CreateNodes()
+  // should handle specially.
+  void AddConstNodePattern(const string& const_pattern) {
+    const_node_patterns_.push_back(const_pattern);
+  }
+
+  virtual ~SvdfCluster() {}
+
+ private:
+  // The main function which is used to create Const nodes for this cluster.
+  // These Const nodes are the inputs to the composite op generated for this
+  // cluster.
+  void CreateConstNode(const string& const_pattern);
+
+  // Receives a vector of Const nodes, merge them (if necessary) and returns
+  // only one Const node holding all the arrays contents. It transposes it if
+  // needed.
+  void MaybeMergeConstNodes(
+      const std::vector<const tensorflow::NodeDef*>& const_node_parts,
+      bool transpose_tensor_value,
+      const std::unique_ptr<tensorflow::NodeDef>& merged_node);
+
+  // Infer the value of Svdf filter rank, by looking up a reshape operator which
+  // is used for 'output' which reshapes output from [num_filters, batch, 1]
+  // shape to [num_units, rank, batch] shape. The 2nd shape element is rank.
+  int InferFilterRank();
+
+  std::vector<string> const_node_patterns_;
+};
+
+class SvdfClusterFactory : public ClusterFactoryInterface {
+ public:
+  // Creates a cluster of nodes using a name-based pattern matching approach. It
+  // uses a node as a seed and if its name matches a certain pattern, then it
+  // builds the cluster around that node.
+  // This factory expects nodes which have "SVDF_weights_feature" and
+  // "SVDF_weights_time" pattern in their names (and optionally "SVDF_bias")
+  // and it creates an SVDF Op from them.
+  std::unique_ptr<Cluster> CreateCluster(
+      const tensorflow::NodeDef& node,
+      const tensorflow::GraphDef& graph_def) const;
+};
+
+}  // end namespace toco
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_GRAPH_MATCHING_RESOLVE_SVDF_H
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..664e828c19dca1117b81113f723416541f48d621
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf_test.cc
@@ -0,0 +1,212 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_svdf.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/cluster_utils.h"
+#include "tensorflow/contrib/lite/toco/tensorflow_graph_matching/resolve_cluster.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/logging.h"
+
+using tensorflow::GraphDef;
+using tensorflow::NodeDef;
+
+namespace toco {
+
+class ResolveSvdfTest : public ::testing::Test {
+ public:
+  ResolveSvdfTest() {
+    AddNewNode("Input1", "Const", {});
+    AddNewNode("Svdf1/SVDF_weights_feature/part_0", "Const", {},
+               {0.1, 0.2, 0.3});
+    AddNewNode("Svdf1/SVDF_weights_feature/part_0/read", "Identity",
+               {"Svdf1/SVDF_weights_feature/part_0"});
+    AddNewNode("Svdf1/SVDF_weights_time/part_0", "Const", {}, {0.1, 0.2, 0.3});
+    AddNewNode("Svdf1/SVDF_weights_time/part_0/read", "Identity",
+               {"Svdf1/SVDF_weights_time/part_0"});
+
+    AddNewNode("Svdf1/f1", "SVDF_F1",
+               {"Input1", "Svdf1/SVDF_weights_feature/part_0/read"});
+    AddNewNode("Svdf1/f2", "SVDF_F2",
+               {"Svdf1/SVDF_weights_time/part_0/read", "Svdf1/f1"});
+    AddNewNode("Svdf1/Relu", "Relu", {"Svdf1/f2"});
+    AddShapeNode("Svdf1/Reshape/shape", {10, 1, -1});
+    AddNewNode("Output1", "Const", {"Svdf1/Relu"});
+
+    AddNewNode("Input2", "Const", {});
+    AddNewNode("Svdf2/SVDF_weights_feature/part_0", "Const", {},
+               {0.1, 0.2, 0.3});
+    AddNewNode("Svdf2/SVDF_weights_feature/part_0/read", "Identity",
+               {"Svdf2/SVDF_weights_feature/part_0"});
+    AddNewNode("Svdf2/SVDF_weights_time/part_0", "Const", {}, {0.1, 0.2, 0.3});
+    AddNewNode("Svdf2/SVDF_weights_time/part_0/read", "Identity",
+               {"Svdf2/SVDF_weights_time/part_0"});
+
+    AddNewNode("Svdf2/f1", "SVDF_F1",
+               {"Input1", "Svdf2/SVDF_weights_feature/part_0/read"});
+    AddNewNode("Svdf2/f2", "SVDF_F2",
+               {"Svdf2/SVDF_weights_time/part_0/read", "Svdf2/f1"});
+    AddNewNode("Svdf2/Relu", "Relu", {"Svdf2/f2"});
+    AddShapeNode("Svdf2/Reshape/shape", {10, 2, -1});
+    AddNewNode("Output2", "Const", {"Svdf2/Relu"});
+  }
+
+  ~ResolveSvdfTest() override {}
+
+ protected:
+  void AddNewNode(const string& name, const string& op,
+                  const std::vector<string>& inputs) {
+    NodeDef* node = graph_.add_node();
+    node->set_name(name);
+    node->set_op(op);
+    node->set_device("");
+    for (int i = 0; i < inputs.size(); i++) {
+      node->add_input();
+      node->set_input(i, inputs[i]);
+    }
+  }
+
+  void AddNewNode(const string& name, const string& op,
+                  const std::vector<string>& inputs,
+                  const std::vector<float>& values) {
+    NodeDef* node = graph_.add_node();
+    node->set_name(name);
+    node->set_op(op);
+    node->set_device("");
+    for (int i = 0; i < inputs.size(); i++) {
+      node->add_input();
+      node->set_input(i, inputs[i]);
+    }
+    // Add the float vector as an attribute to the node.
+    (*node->mutable_attr())["dtype"].set_type(tensorflow::DT_FLOAT);
+    tensorflow::TensorProto* allocated_tensor = new tensorflow::TensorProto;
+    tensorflow::TensorShapeProto* allocated_tesnor_shape =
+        new tensorflow::TensorShapeProto;
+    auto tensor_shape_dim0 = allocated_tesnor_shape->add_dim();
+    tensor_shape_dim0->set_size(values.size());
+    allocated_tensor->set_allocated_tensor_shape(allocated_tesnor_shape);
+    allocated_tensor->set_tensor_content(
+        string(reinterpret_cast<const char*>(values.data()),
+               values.size() * sizeof(float)));
+    (*node->mutable_attr())["value"].set_allocated_tensor(allocated_tensor);
+  }
+
+  void AddShapeNode(const string& name, const std::vector<int>& values) {
+    NodeDef* node = graph_.add_node();
+    node->set_name(name);
+    node->set_op("Const");
+    node->set_device("");
+    // Add the float vector as an attribute to the node.
+    (*node->mutable_attr())["dtype"].set_type(tensorflow::DT_INT32);
+    tensorflow::TensorProto* allocated_tensor = new tensorflow::TensorProto;
+    tensorflow::TensorShapeProto* allocated_tesnor_shape =
+        new tensorflow::TensorShapeProto;
+    auto tensor_shape_dim0 = allocated_tesnor_shape->add_dim();
+    tensor_shape_dim0->set_size(values.size());
+    allocated_tensor->set_allocated_tensor_shape(allocated_tesnor_shape);
+    allocated_tensor->set_tensor_content(
+        string(reinterpret_cast<const char*>(values.data()),
+               values.size() * sizeof(int)));
+    (*node->mutable_attr())["value"].set_allocated_tensor(allocated_tensor);
+  }
+
+  GraphDef graph_;
+  SvdfClusterFactory svdf_cluster_factory_;
+  std::vector<std::unique_ptr<Cluster>> clusters_;
+};
+
+TEST_F(ResolveSvdfTest, TestTranspose2DTensor) {
+  static float matrix[] = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.};
+  static float expected_transposed_matrix[] = {1., 5., 9.,  2., 6., 10.,
+                                               3., 7., 11., 4., 8., 12.};
+  float* transposed_matrix = new float[12];
+  Transpose2DTensor(matrix, 3, 4, transposed_matrix);
+
+  std::vector<float> actual;
+  actual.insert(
+      actual.end(), transposed_matrix,
+      transposed_matrix + sizeof(expected_transposed_matrix) / sizeof(float));
+  std::vector<float> expected;
+  expected.insert(expected.end(), expected_transposed_matrix,
+                  expected_transposed_matrix +
+                      sizeof(expected_transposed_matrix) / sizeof(float));
+  delete[] transposed_matrix;
+}
+
+TEST_F(ResolveSvdfTest, TestResolveSvdfFlow) {
+  std::unordered_map<string, bool> is_node_in_cluster;
+  for (const NodeDef& node : graph_.node()) {
+    is_node_in_cluster[node.name()] = false;
+  }
+
+  std::vector<string> cluster_names;
+  CHECK(FindCluster(svdf_cluster_factory_, graph_, &is_node_in_cluster,
+                    &clusters_));
+
+  for (const std::unique_ptr<Cluster>& cluster : clusters_) {
+    cluster_names.push_back(cluster->GetName());
+    cluster->CreateNodes();
+  }
+
+  EXPECT_THAT(cluster_names,
+              testing::UnorderedElementsAreArray({"Svdf1", "Svdf2"}));
+
+  std::vector<string> new_node_names;
+  std::vector<float> content_array(3);
+  for (const std::unique_ptr<Cluster>& cluster : clusters_) {
+    // After CreateNodes in each cluster we have three nodes: Svdf,
+    // weights_feature and weights_time.
+    CHECK_EQ(cluster->GetNewNodes().size(), 3);
+    for (const std::unique_ptr<tensorflow::NodeDef>& node :
+         cluster->GetNewNodes()) {
+      new_node_names.push_back(node->name());
+      if (node->op() == "Const") {
+        CHECK_EQ(node->attr().at("dtype").type(), tensorflow::DT_FLOAT);
+        toco::port::CopyToBuffer(
+            node->attr().at("value").tensor().tensor_content(),
+            reinterpret_cast<char*>(content_array.data()));
+        EXPECT_THAT(content_array,
+                    testing::UnorderedElementsAreArray({0.1, 0.2, 0.3}));
+      } else {
+        // Checking the Svdf node attributes (rank and activation type) are
+        // correct.
+        if (node->name() == "Svdf1") {
+          CHECK_EQ(node->attr().at("Rank").i(), 1);
+        } else if (node->name() == "Svdf2") {
+          CHECK_EQ(node->attr().at("Rank").i(), 2);
+        }
+        CHECK_EQ(node->attr().at("ActivationFunction").s(), "Relu");
+      }
+    }
+  }
+  EXPECT_THAT(new_node_names, testing::UnorderedElementsAreArray(
+                                  {"Svdf2/SVDF_weights_feature/part_0",
+                                   "Svdf2/SVDF_weights_time/part_0", "Svdf2",
+                                   "Svdf1/SVDF_weights_feature/part_0",
+                                   "Svdf1/SVDF_weights_time/part_0", "Svdf1"}));
+}
+
+}  // end namespace toco
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.cc b/tensorflow/contrib/lite/toco/tensorflow_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82e2800ca2f5bb017f91b5bf43d8d3cd05e97b83
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.cc
@@ -0,0 +1,197 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tensorflow_util.h"
+
+#include <string.h>
+#include <memory>
+#include <set>
+
+#ifdef GOOGLE_PLATFORM
+#include "file/logging/log_lines.h"
+#endif
+#include "google/protobuf/map.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+using tensorflow::AttrValue;
+using tensorflow::GraphDef;
+
+void LogDumpGraphDef(int log_level, const string& message,
+                     const GraphDef& tf_graph) {
+  if (!VLOG_IS_ON(log_level)) {
+    return;
+  }
+  std::set<string> ops;
+  for (const auto& node : tf_graph.node()) {
+    ops.insert(node.op());
+  }
+  string dump;
+  toco::port::AppendF(&dump, R"MSG(
+BEGIN DUMP OF TENSORFLOW GRAPHDEF (%s)
+There are %d nodes.
+There are %zu different op types:
+)MSG", message, tf_graph.node_size(), ops.size());
+  for (const auto& op : ops) {
+    toco::port::AppendF(&dump, "  %s\n", op);
+  }
+  dump.append(R"MSG(
+PROTO DUMP
+)MSG");
+  for (const auto& node : tf_graph.node()) {
+    toco::port::AppendF(&dump, R"MSG(
+BEGIN NODE: name = %s
+  op = %s
+  inputs = [
+)MSG", node.name(), node.op());
+    for (const auto& input : node.input()) {
+      toco::port::AppendF(&dump, "    %s\n", input);
+    }
+    dump.append("  ]\n");
+    for (const auto& attr : node.attr()) {
+      toco::port::AppendF(&dump, "  ATTR: name = %s\n", attr.first);
+      if (attr.second.value_case() == AttrValue::kFunc) {
+        dump.append("    func\n");
+      } else if (attr.second.value_case() == AttrValue::kPlaceholder) {
+        toco::port::AppendF(&dump, "    placeholder: %s\n",
+                            attr.second.placeholder());
+      } else if (attr.second.value_case() == AttrValue::kS) {
+        dump.append("    string:\n");
+        dump.append(R"MSG(
+      BEGIN EMBEDDED STRING
+)MSG");
+        const auto& lines = absl::StrSplit(attr.second.s(), '\n');
+        for (const auto& line : lines) {
+          toco::port::AppendF(&dump, "      %s\n", line);
+        }
+        dump.append(R"MSG(
+      END EMBEDDED STRING
+)MSG");
+      } else if (attr.second.value_case() == AttrValue::kI) {
+        toco::port::AppendF(&dump, "    int: %lld\n", attr.second.i());
+      } else if (attr.second.value_case() == AttrValue::kF) {
+        toco::port::AppendF(&dump, "    float: %g\n", attr.second.f());
+      } else if (attr.second.value_case() == AttrValue::kB) {
+        toco::port::AppendF(&dump, "    bool: %s\n",
+                            attr.second.b() ? "true" : "false");
+      } else if (attr.second.value_case() == AttrValue::kType) {
+        toco::port::AppendF(&dump, "    type: %s\n",
+                            tensorflow::DataType_Name(attr.second.type()));
+      } else if (attr.second.value_case() == AttrValue::kShape) {
+        dump.append("    shape: [ ");
+        const auto& shape = attr.second.shape();
+        for (int i = 0; i < shape.dim_size(); i++) {
+          toco::port::AppendF(&dump, "%lld ", shape.dim(i).size());
+        }
+        dump.append("]\n");
+      } else if (attr.second.value_case() == AttrValue::kTensor) {
+        const auto& tensor = attr.second.tensor();
+        dump.append("    TENSOR:\n");
+        toco::port::AppendF(&dump, "      type: %s\n",
+                            tensorflow::DataType_Name(tensor.dtype()));
+        const auto& shape = tensor.tensor_shape();
+        dump.append("      shape: [ ");
+        for (int i = 0; i < shape.dim_size(); i++) {
+          toco::port::AppendF(&dump, "%lld ", shape.dim(i).size());
+        }
+        dump.append("]\n");
+        if (!tensor.tensor_content().empty()) {
+          toco::port::AppendF(&dump, "      tensor_content: %zu bytes\n",
+                              tensor.tensor_content().size());
+        }
+        if (tensor.dtype() == tensorflow::DT_INT32) {
+          CHECK_EQ(0, tensor.tensor_content().size() % sizeof(int32));
+          const int size = tensor.tensor_content().size() / sizeof(int32);
+          std::vector<int32> data(size);
+          toco::port::CopyToBuffer(tensor.tensor_content(),
+                                   reinterpret_cast<char*>(data.data()));
+          const int kMaxValsToPrint = 4;
+          dump.append("        tensor_content as ints: [ ");
+          for (int i = 0; i < kMaxValsToPrint && i < size; i++) {
+            toco::port::AppendF(&dump, "%d ", data[i]);
+          }
+          if (size > kMaxValsToPrint) {
+            dump.append("... ");
+          }
+          dump.append("]\n");
+        }
+        if (tensor.dtype() == tensorflow::DT_FLOAT) {
+          CHECK_EQ(0, tensor.tensor_content().size() % sizeof(float));
+          const int size = tensor.tensor_content().size() / sizeof(float);
+          std::vector<float> data(size);
+          toco::port::CopyToBuffer(tensor.tensor_content(),
+                                   reinterpret_cast<char*>(data.data()));
+          const int kMaxValsToPrint = 4;
+          dump.append("        tensor_content as floats: [ ");
+          for (int i = 0; i < kMaxValsToPrint && i < size; i++) {
+            toco::port::AppendF(&dump, "%g ", data[i]);
+          }
+          if (size > kMaxValsToPrint) {
+            dump.append("... ");
+          }
+          dump.append("]\n");
+        }
+        if (tensor.int_val_size()) {
+          toco::port::AppendF(&dump, "      int_val: %d ints: [ ",
+                              tensor.int_val_size());
+          const int kMaxValsToPrint = 4;
+          for (int i = 0; i < kMaxValsToPrint && i < tensor.int_val_size();
+               i++) {
+            toco::port::AppendF(&dump, "%d ", tensor.int_val(i));
+          }
+          if (tensor.int_val_size() > kMaxValsToPrint) {
+            dump.append("... ");
+          }
+          dump.append("]\n");
+        }
+        if (tensor.float_val_size()) {
+          toco::port::AppendF(&dump, "      float_val: %d floats: [ ",
+                              tensor.float_val_size());
+          const int kMaxValsToPrint = 4;
+          for (int i = 0; i < kMaxValsToPrint && i < tensor.float_val_size();
+               i++) {
+            toco::port::AppendF(&dump, "%g ", tensor.float_val(i));
+          }
+          if (tensor.float_val_size() > kMaxValsToPrint) {
+            dump.append("... ");
+          }
+          dump.append("]\n");
+        }
+        if (tensor.string_val_size()) {
+          toco::port::AppendF(&dump, "      string_val: %d strings\n",
+                              tensor.string_val_size());
+        }
+      } else if (attr.second.value_case() == AttrValue::kList) {
+        dump.append("  LIST\n");
+      }
+    }
+    dump.append("END NODE\n");
+  }
+  toco::port::AppendF(&dump, "END DUMP OF TENSORFLOW GRAPHDEF (%s)\n", message);
+#if defined(GOOGLE_PLATFORM)
+  VLOG_LINES(log_level, dump);
+#else
+  VLOG(log_level) << dump;
+#endif
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tensorflow_util.h b/tensorflow/contrib/lite/toco/tensorflow_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..152b4f7a727a88f721f1a63299ea4fa709bb5d52
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tensorflow_util.h
@@ -0,0 +1,32 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+
+namespace toco {
+
+void LogDumpGraphDef(int log_level, const string& message,
+                     const tensorflow::GraphDef& tf_graph);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TENSORFLOW_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..332253a092aff812fb18601862c66bc0423599c2
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -0,0 +1,148 @@
+package(
+    # To suppress build cleaner error about inclusion of schema_generate.h.
+    features = ["-layering_check"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "operator",
+    srcs = [
+        "operator.cc",
+    ],
+    hdrs = [
+        "builtin_operator.h",
+        "custom_operator.h",
+        "operator.h",
+        "simple_operator.h",
+    ],
+    deps = [
+        ":types",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers//:flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "operator_test",
+    srcs = [
+        "operator_test.cc",
+    ],
+    deps = [
+        ":operator",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_googletest//:gtest_main",
+        "@flatbuffers//:flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "types",
+    srcs = [
+        "types.cc",
+    ],
+    hdrs = [
+        "types.h",
+    ],
+    deps = [
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/toco:model",
+    ],
+)
+
+tf_cc_test(
+    name = "types_test",
+    srcs = [
+        "types_test.cc",
+    ],
+    deps = [
+        ":types",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "export",
+    srcs = [
+        "export.cc",
+    ],
+    hdrs = [
+        "export.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":operator",
+        ":types",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@com_google_absl//absl/strings",
+        "@flatbuffers//:flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "export_test",
+    srcs = [
+        "export_test.cc",
+    ],
+    deps = [
+        ":export",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "import",
+    srcs = [
+        "import.cc",
+    ],
+    hdrs = [
+        "import.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":operator",
+        ":types",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "//tensorflow/contrib/lite/toco:model",
+        "@flatbuffers//:flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "import_test",
+    srcs = [
+        "import_test.cc",
+    ],
+    deps = [
+        ":import",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest_main",
+        "@flatbuffers//:flatbuffers",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/toco/tflite/builtin_operator.h b/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..93cc79ddb64fbc46a97a47ecdc155a8aabf5c3ef
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/builtin_operator.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
+
+#include "absl/memory/memory.h"
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+
+namespace toco {
+
+namespace tflite {
+
+// Builtin operators have special TF Lite objects describing their options.
+// This class has the boilerplate code for creating those.
+//
+// Template arguments:
+//   - T1 must derive from ::toco::Operator.
+//   - T2 must be one of TF Lite's objects defining Builtin Options, such as
+//     ::tflite::Conv2DOptions.
+template <typename T1, typename T2, ::tflite::BuiltinOptions TfLiteEnum>
+class BuiltinOperator : public BaseOperator {
+ public:
+  using TocoOperator = T1;
+  using TfLiteOptions = T2;
+
+  BuiltinOperator(::tflite::BuiltinOperator op, OperatorType type)
+      : BaseOperator(::tflite::EnumNameBuiltinOperator(op), type) {}
+
+  // Build the configuration object in the given flatbuffer builder. Return
+  // its offset.
+  virtual flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const = 0;
+
+  // Read options from the TF Lite object and set the corresponding values in
+  // the tf.mini operator.
+  virtual void ReadOptions(const TfLiteOptions& opt,
+                           TocoOperator* op) const = 0;
+
+  Options Serialize(const Operator& op,
+                    flatbuffers::FlatBufferBuilder* builder) const override {
+    auto options = WriteOptions(static_cast<const TocoOperator&>(op), builder);
+    return Options::Builtin(TfLiteEnum, options.Union());
+  }
+
+  std::unique_ptr<Operator> Deserialize(
+      const BuiltinOptions* builtin_options,
+      const CustomOptions* custom_options) const override {
+    auto op = absl::make_unique<TocoOperator>();
+    auto* options = static_cast<const TfLiteOptions*>(builtin_options);
+    if (options) {
+      ReadOptions(*options, op.get());
+    }
+    return std::unique_ptr<Operator>(op.release());
+  }
+};
+
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_BUILTIN_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/custom_operator.h b/tensorflow/contrib/lite/toco/tflite/custom_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a4bfac7d4f684043d2a9ce8fc2c78dd738f4b69
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/custom_operator.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
+
+#include "flatbuffers/flexbuffers.h"
+#include "absl/memory/memory.h"
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+
+namespace toco {
+
+namespace tflite {
+
+// Custom operators have a generic byte buffer describing their options. This
+// class provides the boilerplate code for populating those options using
+// flexbuffers. Note that most of toco's operators will likely be supported
+// as builtin operators in TF Lite.
+//
+// Template argument T must derive from ::toco::Operator.
+template <typename T>
+class CustomOperator : public BaseOperator {
+ public:
+  using TocoOperator = T;
+  using BaseOperator::BaseOperator;
+
+  // Populate the given flexbuffer with options obtained from the tf.mini
+  // operator.
+  virtual void WriteOptions(const TocoOperator& op,
+                            flexbuffers::Builder* fbb) const {}
+
+  // Set options in the given tf.mini operator using values from the flexbuffer
+  // map.
+  virtual void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const {}
+
+  Options Serialize(const Operator& op,
+                    flatbuffers::FlatBufferBuilder* builder) const override {
+    flexbuffers::Builder fbb;
+    fbb.Map(
+        [&]() { WriteOptions(static_cast<const TocoOperator&>(op), &fbb); });
+    fbb.Finish();
+    return Options::Custom(builder->CreateVector(fbb.GetBuffer()));
+  }
+
+  std::unique_ptr<Operator> Deserialize(
+      const BuiltinOptions* builtin_options,
+      const CustomOptions* custom_options) const override {
+    auto op = absl::make_unique<TocoOperator>();
+    if (custom_options) {
+      auto flexbuffer_map =
+          flexbuffers::GetRoot(custom_options->data(), custom_options->size())
+              .AsMap();
+      ReadOptions(flexbuffer_map, op.get());
+    }
+    return std::unique_ptr<Operator>(op.release());
+  }
+};
+
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_CUSTOM_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bec694a23377c7c70684000069e9c08ee446b6c0
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -0,0 +1,331 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/export.h"
+
+#include "flatbuffers/flexbuffers.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace toco {
+
+namespace tflite {
+
+using ::tflite::Buffer;
+using ::tflite::BuiltinOperator;
+using ::tflite::BuiltinOperator_CUSTOM;
+using ::tflite::BuiltinOperator_MAX;
+using ::tflite::BuiltinOperator_MIN;
+using ::tflite::CreateBuffer;
+using ::tflite::CreateModel;
+using ::tflite::CreateOperator;
+using ::tflite::CreateTensor;
+using ::tflite::Operator;
+using ::tflite::OperatorCode;
+using ::tflite::SubGraph;
+using ::tflite::Tensor;
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+namespace {
+
+details::OperatorKey GetOperatorKey(const ::toco::Operator& op) {
+  string custom_code;
+  if (op.type == OperatorType::kTensorFlowUnsupported) {
+    const TensorFlowUnsupportedOperator& unsupported_op =
+        static_cast<const TensorFlowUnsupportedOperator&>(op);
+    custom_code = unsupported_op.tensorflow_op;
+  }
+  return details::OperatorKey(op.type, custom_code);
+}
+
+}  // Anonymous namespace.
+
+namespace details {
+
+void LoadTensorsMap(const Model& model, TensorsMap* tensors_map) {
+  // First find a list of unique array names.
+  std::set<string> names;
+  for (const auto& array_pair : model.arrays) {
+    names.insert(array_pair.first);
+  }
+
+  // Now assign indices to them and fill in the map.
+  int index = 0;
+  for (const auto& name : names) {
+    (*tensors_map)[name] = index;
+    ++index;
+  }
+}
+
+void LoadOperatorsMap(const Model& model, OperatorsMap* operators_map) {
+  // First find a list of unique operator types.
+  std::set<OperatorKey> keys;
+  for (const auto& op : model.operators) {
+    keys.insert(GetOperatorKey(*op));
+  }
+  // Now assign indices to them and fill in the map.
+  int index = 0;
+  for (const auto& key : keys) {
+    (*operators_map)[key] = index;
+    ++index;
+  }
+}
+}  // namespace details
+
+Offset<Vector<Offset<Tensor>>> ExportTensors(
+    const Model& model, const details::TensorsMap& tensors_map,
+    FlatBufferBuilder* builder, std::vector<const Array*>* buffers_to_write) {
+  // In the end we will need to produce a vector sorted by the indices of the
+  // tensors in the tensors_map.
+  std::map<int, Offset<Tensor>> ordered_tensors;
+
+  for (const auto& array_pair : model.arrays) {
+    const string& tensor_name = array_pair.first;
+    const toco::Array& array = *array_pair.second;
+
+    int buffer_index = buffers_to_write->size();
+    auto type = DataType::Serialize(array.data_type);
+    buffers_to_write->push_back(&array);
+
+    std::vector<int> shape;
+    if (array.has_shape()) {
+      for (int d : array.shape().dims()) {
+        shape.push_back(d);
+      }
+    }
+
+    Offset<Vector<float>> min;
+    Offset<Vector<float>> max;
+    Offset<Vector<float>> scale;
+    Offset<Vector<int64_t>> zero_point;
+    if (array.minmax) {
+      min = builder->CreateVector(
+          std::vector<float>{static_cast<float>(array.minmax->min)});
+      max = builder->CreateVector(
+          std::vector<float>{static_cast<float>(array.minmax->max)});
+    }
+    if (array.quantization_params) {
+      scale = builder->CreateVector(std::vector<float>{
+          static_cast<float>(array.quantization_params->scale)});
+      zero_point = builder->CreateVector(
+          std::vector<int64_t>{array.quantization_params->zero_point});
+    }
+    auto q_param = ::tflite::CreateQuantizationParameters(*builder, min, max,
+                                                          scale, zero_point);
+
+    int index = tensors_map.at(tensor_name);
+    ordered_tensors[index] =
+        CreateTensor(*builder, builder->CreateVector(shape), type, buffer_index,
+                     builder->CreateString(tensor_name), q_param);
+  }
+
+  std::vector<Offset<Tensor>> tensor_vector;
+  tensor_vector.reserve(ordered_tensors.size());
+  for (const auto& tensor : ordered_tensors) {
+    tensor_vector.push_back(tensor.second);
+  }
+
+  return builder->CreateVector(tensor_vector);
+}
+
+Offset<Vector<int32_t>> ExportInputTensors(
+    const Model& model, const details::TensorsMap& tensors_map,
+    FlatBufferBuilder* builder) {
+  std::vector<int32_t> inputs;
+  for (const auto& input : model.flags.input_arrays()) {
+    inputs.push_back(tensors_map.at(input.name()));
+  }
+  return builder->CreateVector<int32_t>(inputs);
+}
+
+Offset<Vector<int32_t>> ExportOutputTensors(
+    const Model& model, const details::TensorsMap& tensors_map,
+    FlatBufferBuilder* builder) {
+  std::vector<int32_t> outputs;
+  for (const string& output : model.flags.output_arrays()) {
+    outputs.push_back(tensors_map.at(output));
+  }
+  return builder->CreateVector<int32_t>(outputs);
+}
+
+Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
+    const Model& model,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    const details::OperatorsMap& operators_map, FlatBufferBuilder* builder,
+    std::set<string>* error_summary) {
+  // Map from operator name to TF Lite enum value, for all builtins.
+  std::map<string, BuiltinOperator> builtin_ops;
+  for (int i = BuiltinOperator_MIN; i <= BuiltinOperator_MAX; ++i) {
+    BuiltinOperator op = static_cast<BuiltinOperator>(i);
+    string name = EnumNameBuiltinOperator(op);
+    if (op != BuiltinOperator_CUSTOM && !name.empty()) {
+      builtin_ops[name] = op;
+    }
+  }
+
+  // We will need to produce a vector of codes in the same order as they
+  // appear in the operators_map.
+  std::map<int, Offset<OperatorCode>> ordered_opcodes;
+
+  for (const auto& op : model.operators) {
+    const details::OperatorKey operator_key = GetOperatorKey(*op);
+    int op_index = operators_map.at(operator_key);
+
+    string name = HelpfulOperatorTypeName(*op);
+    bool is_builtin = false;
+    if (ops_by_type.count(op->type) != 0) {
+      name = ops_by_type.at(op->type)->name();
+      is_builtin = (builtin_ops.count(name) > 0);
+    }
+
+    if (is_builtin) {
+      ordered_opcodes[op_index] =
+          CreateOperatorCode(*builder, builtin_ops[name], 0);
+    } else {
+      // This could be a kTensorFlowUnsupported, in which case we should be
+      // able to retrieve the original Tensorflow name from the OperatorKey, or
+      // this could be a proper TOCO operator that is completely unknown to TF
+      // Lite.
+      if (!operator_key.custom_code.empty()) {
+        name = operator_key.custom_code;
+      }
+      // Either way, this is an operator that is not supported by TF Lite,
+      // so we output it as a custom op and add it to the error summary.
+      if (error_summary) {
+        error_summary->insert(name);
+      }
+      ordered_opcodes[op_index] = CreateOperatorCode(
+          *builder, BuiltinOperator_CUSTOM, builder->CreateString(name));
+    }
+  }
+
+  std::vector<Offset<OperatorCode>> opcode_vector;
+  opcode_vector.reserve(ordered_opcodes.size());
+  for (const auto& opcode : ordered_opcodes) {
+    opcode_vector.push_back(opcode.second);
+  }
+
+  return builder->CreateVector(opcode_vector);
+}
+
+Offset<Vector<Offset<Operator>>> ExportOperators(
+    const Model& model,
+    const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
+    const details::OperatorsMap& operators_map,
+    const details::TensorsMap& tensors_map, FlatBufferBuilder* builder) {
+  // The operators are in execution order, so we just follow tf.mini order.
+  std::vector<Offset<Operator>> op_vector;
+  for (const auto& op : model.operators) {
+    std::vector<int32_t> inputs;
+    for (const string& input : op->inputs) {
+      inputs.push_back(tensors_map.at(input));
+    }
+
+    std::vector<int32_t> outputs;
+    for (const string& output : op->outputs) {
+      outputs.push_back(tensors_map.at(output));
+    }
+
+    int op_index = operators_map.at(GetOperatorKey(*op));
+
+    // This is a custom op unless we can find it in ops_by_type, and even then
+    // it could be a custom op (such as kTensorFlowUnsupported).
+
+    auto options = Options::Custom(0);
+    if (ops_by_type.count(op->type) != 0) {
+      options = ops_by_type.at(op->type)->Serialize(*op, builder);
+    }
+    // The only supported CustomOptionFormat is FLEXBUFFERS now.
+    op_vector.push_back(CreateOperator(
+        *builder, op_index, builder->CreateVector(inputs),
+        builder->CreateVector(outputs), options.type, options.builtin,
+        options.custom, ::tflite::CustomOptionsFormat_FLEXBUFFERS));
+  }
+
+  return builder->CreateVector(op_vector);
+}
+
+Offset<Vector<Offset<Buffer>>> ExportBuffers(
+    const Model& model, const std::vector<const Array*>& buffers_to_write,
+    FlatBufferBuilder* builder) {
+  std::vector<Offset<Buffer>> buffer_vector;
+  size_t index = 0;
+  for (const Array* array_ptr : buffers_to_write) {
+    const Array& array = *array_ptr;
+    Offset<Vector<uint8_t>> data_buffer = DataBuffer::Serialize(array, builder);
+    buffer_vector.push_back(CreateBuffer(*builder, data_buffer));
+    index++;
+  }
+  return builder->CreateVector(buffer_vector);
+}
+
+void Export(const Model& model, bool allow_custom_ops,
+            string* output_file_contents) {
+  flatbuffers::FlatBufferBuilder builder(/*initial_size=*/10240);
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+
+  details::TensorsMap tensors_map;
+  details::LoadTensorsMap(model, &tensors_map);
+
+  details::OperatorsMap operators_map;
+  details::LoadOperatorsMap(model, &operators_map);
+
+  std::vector<const Array*> buffers_to_write;
+  Array empty_array;
+  buffers_to_write.push_back(&empty_array);
+
+  auto tensors = ExportTensors(model, tensors_map, &builder, &buffers_to_write);
+  auto inputs = ExportInputTensors(model, tensors_map, &builder);
+  auto outputs = ExportOutputTensors(model, tensors_map, &builder);
+
+  std::set<string> error_summary;
+  auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
+                                      &builder, &error_summary);
+  if (!allow_custom_ops && !error_summary.empty()) {
+    LOG(QFATAL) << "Some of the operators in the model are not supported by "
+                   "the standard TensorFlow Lite runtime. If you have a custom "
+                   "implementation for them you can disable this error with "
+                   "--allow_custom_ops. Here is a list of operators for which "
+                   "you will need custom implementations: "
+                << absl::StrJoin(error_summary, ", ") << ".";
+  }
+
+  auto ops =
+      ExportOperators(model, ops_by_type, operators_map, tensors_map, &builder);
+
+  // TODO(aselle): add support to toco for multiple subgraphs.
+  auto subgraph = CreateSubGraph(builder, tensors, inputs, outputs, ops);
+  std::vector<flatbuffers::Offset<SubGraph>> subgraphs = {subgraph};
+
+  auto buffers = ExportBuffers(model, buffers_to_write, &builder);
+  auto description = builder.CreateString("TOCO Converted.");
+  auto new_model_location =
+      CreateModel(builder, TFLITE_SCHEMA_VERSION, op_codes,
+                  builder.CreateVector(subgraphs), description, buffers);
+  ::tflite::FinishModelBuffer(builder, new_model_location);
+  const uint8_t* buffer = builder.GetBufferPointer();
+  int size = builder.GetSize();
+  *output_file_contents = string(reinterpret_cast<const char*>(buffer), size);
+}
+
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
new file mode 100644
index 0000000000000000000000000000000000000000..44012b7126e17d730ea248551dea2414ad0072d9
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
+
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+namespace tflite {
+
+// Transform the given tf.mini model into a TF Lite flatbuffer and deposit the
+// result in the given string.
+void Export(const Model& model, bool allow_custom_ops,
+            string* output_file_contents);
+// This if backward-compatibility.
+inline void Export(const Model& model, string* output_file_contents) {
+  Export(model, true, output_file_contents);
+}
+
+namespace details {
+
+// A maps from tensor name to its final position in the TF Lite buffer.
+using TensorsMap = std::unordered_map<string, int>;
+
+// A key to identify an operator.
+// Only when `type` is `kTensorFlowUnsupported`, `custom_code` is filled to
+// identify which operation is used.
+struct OperatorKey {
+  OperatorKey(OperatorType type, const std::string& custom_code)
+      : type(type), custom_code(custom_code) {}
+  const OperatorType type;
+  const std::string custom_code;
+
+  bool operator<(const OperatorKey& other) const {
+    if (type < other.type) return true;
+    if (type > other.type) return false;
+    return custom_code < other.custom_code;
+  }
+
+  bool operator==(const OperatorKey& other) const {
+    return type == other.type && custom_code == other.custom_code;
+  }
+
+  struct Hash {
+    std::size_t operator()(const OperatorKey& key) const {
+      return std::hash<size_t>()(static_cast<size_t>(key.type)) ^
+             std::hash<std::string>()(key.custom_code);
+    }
+  };
+};
+
+// A maps from operator type to its final position in the TF Lite buffer.
+using OperatorsMap = std::unordered_map<OperatorKey, int, OperatorKey::Hash>;
+
+void LoadTensorsMap(const Model& model, TensorsMap* tensors_map);
+void LoadOperatorsMap(const Model& model, OperatorsMap* operators_map);
+
+}  // namespace details
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_EXPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4c4612d62c4eb5b14898eb8846314246ecbb815
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -0,0 +1,111 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/export.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+
+namespace toco {
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+class ExportTest : public ::testing::Test {
+ protected:
+  // This is a very simplistic model. We are not interested in testing all the
+  // details here, since tf.mini's testing framework will be exercising all the
+  // conversions multiple times, and the conversion of operators is tested by
+  // separate unittests.
+  void BuildTestModel() {
+    input_model_.GetOrCreateArray("tensor_one");
+    input_model_.GetOrCreateArray("tensor_two");
+    {
+      auto* op = new ConvOperator;
+      op->padding.type = PaddingType::kSame;
+      input_model_.operators.emplace_back(op);
+    }
+    input_model_.operators.emplace_back(new AddOperator);
+    {
+      auto* op = new TensorFlowUnsupportedOperator;
+      op->tensorflow_op = "MyCrazyOp";
+      input_model_.operators.emplace_back(op);
+    }
+    // Note that Sub is not know to TF Lite, so it gets exported as a custom
+    // op (and no options).
+    input_model_.operators.emplace_back(new SubOperator);
+  }
+
+  Model input_model_;
+};
+
+TEST_F(ExportTest, LoadTensorsMap) {
+  BuildTestModel();
+
+  details::TensorsMap tensors;
+  details::LoadTensorsMap(input_model_, &tensors);
+  EXPECT_EQ(0, tensors["tensor_one"]);
+  EXPECT_EQ(1, tensors["tensor_two"]);
+}
+
+TEST_F(ExportTest, LoadOperatorsMap) {
+  BuildTestModel();
+
+  details::OperatorsMap operators;
+  details::LoadOperatorsMap(input_model_, &operators);
+  EXPECT_EQ(0, operators[details::OperatorKey(OperatorType::kAdd, "")]);
+  EXPECT_EQ(1, operators[details::OperatorKey(OperatorType::kConv, "")]);
+  EXPECT_EQ(2, operators[details::OperatorKey(OperatorType::kSub, "")]);
+  EXPECT_EQ(3, operators[details::OperatorKey(
+                   OperatorType::kTensorFlowUnsupported, "MyCrazyOp")]);
+}
+
+TEST_F(ExportTest, Export) {
+  BuildTestModel();
+
+  string result;
+  Export(input_model_, true, &result);
+
+  auto* model = ::tflite::GetModel(result.data());
+
+  std::vector<string> names;
+  for (const ::tflite::OperatorCode* opcode : *model->operator_codes()) {
+    if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
+      names.push_back(string("builtin:") + ::tflite::EnumNameBuiltinOperator(
+                                               opcode->builtin_code()));
+    } else {
+      names.push_back(string("custom:") + opcode->custom_code()->c_str());
+    }
+  }
+
+  EXPECT_THAT(names, ElementsAre("builtin:ADD", "builtin:CONV_2D", "custom:Sub",
+                                 "custom:MyCrazyOp"));
+
+  std::vector<uint32_t> indices;
+  auto operators = (*model->subgraphs())[0]->operators();
+  EXPECT_EQ(operators->Length(), 4);
+  for (const auto* op : *operators) {
+    indices.push_back(op->opcode_index());
+  }
+
+  EXPECT_THAT(indices, ElementsAre(1, 0, 3, 2));
+}
+
+// TODO(ahentz): tests for tensors, inputs, outpus, opcodes and operators.
+
+}  // namespace
+}  // namespace tflite
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bbf201fd288140d990b8f739adcd9244e1196072
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -0,0 +1,183 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/import.h"
+
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/types.h"
+
+namespace toco {
+
+namespace tflite {
+
+namespace details {
+void LoadTensorsTable(const ::tflite::Model& input_model,
+                      TensorsTable* tensors_table) {
+  // TODO(aselle): add support to toco for multiple subgraphs.
+  auto tensors = (*input_model.subgraphs())[0]->tensors();
+  if (!tensors) return;
+  for (const auto* tensor : *tensors) {
+    tensors_table->push_back(tensor->name()->c_str());
+  }
+}
+
+void LoadOperatorsTable(const ::tflite::Model& input_model,
+                        OperatorsTable* operators_table) {
+  auto opcodes = input_model.operator_codes();
+  if (!opcodes) return;
+  for (const auto* opcode : *opcodes) {
+    if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
+      operators_table->push_back(
+          EnumNameBuiltinOperator(opcode->builtin_code()));
+    } else {
+      operators_table->push_back(opcode->custom_code()->c_str());
+    }
+  }
+}
+}  // namespace details
+
+void ImportTensors(const ::tflite::Model& input_model, Model* model) {
+  auto tensors = (*input_model.subgraphs())[0]->tensors();
+  auto* buffers = input_model.buffers();
+  // auto tensors = input_model.tensors();
+  if (!tensors) return;
+  for (const auto* input_tensor : *tensors) {
+    Array& array = model->GetOrCreateArray(input_tensor->name()->c_str());
+    array.data_type = DataType::Deserialize(input_tensor->type());
+    int buffer_index = input_tensor->buffer();
+    auto* buffer = buffers->Get(buffer_index);
+    DataBuffer::Deserialize(*input_tensor, *buffer, &array);
+
+    auto shape = input_tensor->shape();
+    if (shape) {
+      for (int i = 0; i < shape->Length(); ++i) {
+        auto d = shape->Get(i);
+        array.mutable_shape()->mutable_dims()->push_back(d);
+      }
+    }
+
+    auto quantization = input_tensor->quantization();
+    if (quantization) {
+      // Note that tf.mini only supports a single quantization parameters for
+      // the whole array.
+      if (quantization->min() && quantization->max()) {
+        CHECK_EQ(1, quantization->min()->Length());
+        CHECK_EQ(1, quantization->max()->Length());
+        MinMax& minmax = array.GetOrCreateMinMax();
+        minmax.min = quantization->min()->Get(0);
+        minmax.max = quantization->max()->Get(0);
+      }
+      if (quantization->scale() && quantization->zero_point()) {
+        CHECK_EQ(1, quantization->scale()->Length());
+        CHECK_EQ(1, quantization->zero_point()->Length());
+        QuantizationParams& q = array.GetOrCreateQuantizationParams();
+        q.scale = quantization->scale()->Get(0);
+        q.zero_point = quantization->zero_point()->Get(0);
+      }
+    }
+  }
+}
+
+void ImportOperators(
+    const ::tflite::Model& input_model,
+    const std::map<string, std::unique_ptr<BaseOperator>>& ops_by_name,
+    const details::TensorsTable& tensors_table,
+    const details::OperatorsTable& operators_table, Model* model) {
+  // TODO(aselle): add support for multiple subgraphs.
+  auto ops = (*input_model.subgraphs())[0]->operators();
+
+  if (!ops) return;
+  for (const auto* input_op : *ops) {
+    int index = input_op->opcode_index();
+    if (index < 0 || index > operators_table.size()) {
+      LOG(FATAL) << "Index " << index << " must be between zero and "
+                 << operators_table.size();
+    }
+    string opname = operators_table.at(index);
+    if (ops_by_name.count(opname) == 0) {
+      LOG(FATAL) << "Op '" << opname << "' not supported";
+    }
+
+    auto new_op = ops_by_name.at(opname)->Deserialize(
+        input_op->builtin_options(), input_op->custom_options());
+    model->operators.emplace_back(new_op.release());
+    auto* op = model->operators.back().get();
+
+    auto inputs = input_op->inputs();
+    for (int i = 0; i < inputs->Length(); i++) {
+      auto input_index = inputs->Get(i);
+      const string& input_name = tensors_table.at(input_index);
+      op->inputs.push_back(input_name);
+    }
+    auto outputs = input_op->outputs();
+    for (int i = 0; i < outputs->Length(); i++) {
+      auto output_index = outputs->Get(i);
+      const string& output_name = tensors_table.at(output_index);
+      op->outputs.push_back(output_name);
+    }
+  }
+}
+
+void ImportIOTensors(const ::tflite::Model& input_model,
+                     const details::TensorsTable& tensors_table, Model* model) {
+  auto inputs = (*input_model.subgraphs())[0]->inputs();
+  if (inputs) {
+    for (int input : *inputs) {
+      const string& input_name = tensors_table.at(input);
+      model->flags.add_input_arrays()->set_name(input_name);
+    }
+  }
+
+  auto outputs = (*input_model.subgraphs())[0]->outputs();
+  if (outputs) {
+    for (int output : *outputs) {
+      const string& output_name = tensors_table.at(output);
+      model->flags.add_output_arrays(output_name);
+    }
+  }
+}
+
+std::unique_ptr<Model> Import(const ModelFlags& model_flags,
+                              const string& input_file_contents) {
+  const ::tflite::Model* input_model =
+      ::tflite::GetModel(input_file_contents.data());
+
+  // Full list of all known operators.
+  const auto ops_by_name = BuildOperatorByNameMap();
+
+  if (input_model->subgraphs()->size() != 1) {
+    LOG(FATAL) << "# of subgraphs in tflite should be exactly 1 for now.";
+  }
+  std::unique_ptr<Model> model;
+  model.reset(new Model);
+
+  details::TensorsTable tensors_table;
+  details::LoadTensorsTable(*input_model, &tensors_table);
+
+  details::OperatorsTable operators_table;
+  details::LoadOperatorsTable(*input_model, &operators_table);
+
+  ImportTensors(*input_model, model.get());
+  ImportOperators(*input_model, ops_by_name, tensors_table, operators_table,
+                  model.get());
+  ImportIOTensors(*input_model, tensors_table, model.get());
+
+  return model;
+}
+
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/import.h b/tensorflow/contrib/lite/toco/tflite/import.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c27a2843c47814ad46c8f1bbd77b7afcb324375
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/import.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
+
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+namespace tflite {
+
+// Parse the given string as TF Lite flatbuffer and return a new tf.mini model.
+std::unique_ptr<Model> Import(const ModelFlags &model_flags,
+                              const string &input_file_contents);
+
+namespace details {
+
+// The names of all tensors found in a TF Lite model.
+using TensorsTable = std::vector<string>;
+
+// The names of all operators found in TF Lite model. If the operator is
+// builtin, the string representation of the corresponding enum value is used
+// as name.
+using OperatorsTable = std::vector<string>;
+
+void LoadTensorsTable(const ::tflite::Model &input_model,
+                      TensorsTable *tensors_table);
+void LoadOperatorsTable(const ::tflite::Model &input_model,
+                        OperatorsTable *operators_table);
+
+}  // namespace details
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_IMPORT_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/import_test.cc b/tensorflow/contrib/lite/toco/tflite/import_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..309fa6d7f688ba1dd99a7e6eeda14d513a9e49d4
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/import_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/import.h"
+
+#include "flatbuffers/flexbuffers.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/version.h"
+
+namespace toco {
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+
+class ImportTest : public ::testing::Test {
+ protected:
+  template <typename T>
+  flatbuffers::Offset<flatbuffers::Vector<unsigned char>> CreateDataVector(
+      const std::vector<T>& data) {
+    return builder_.CreateVector(reinterpret_cast<const uint8_t*>(data.data()),
+                                 sizeof(T) * data.size());
+  }
+  // This is a very simplistic model. We are not interested in testing all the
+  // details here, since tf.mini's testing framework will be exercising all the
+  // conversions multiple times, and the conversion of operators is tested by
+  // separate unittests.
+  void BuildTestModel() {
+    // The tensors
+    auto q = ::tflite::CreateQuantizationParameters(
+        builder_,
+        /*min=*/builder_.CreateVector<float>({0.1f}),
+        /*max=*/builder_.CreateVector<float>({0.2f}),
+        /*scale=*/builder_.CreateVector<float>({0.3f}),
+        /*zero_point=*/builder_.CreateVector<int64_t>({100ll}));
+    auto buf0 = ::tflite::CreateBuffer(builder_, CreateDataVector<float>({}));
+    auto buf1 =
+        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({1.0f, 2.0f}));
+    auto buf2 =
+        ::tflite::CreateBuffer(builder_, CreateDataVector<float>({3.0f}));
+    auto buffers = builder_.CreateVector(
+        std::vector<flatbuffers::Offset<::tflite::Buffer>>({buf0, buf1, buf2}));
+    auto t1 = ::tflite::CreateTensor(builder_,
+                                     builder_.CreateVector<int>({1, 2, 3, 4}),
+                                     ::tflite::TensorType_FLOAT32, 1,
+                                     builder_.CreateString("tensor_one"), q);
+    auto t2 =
+        ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({2, 1}),
+                               ::tflite::TensorType_FLOAT32, 2,
+                               builder_.CreateString("tensor_two"), q);
+    auto tensors = builder_.CreateVector(
+        std::vector<flatbuffers::Offset<::tflite::Tensor>>({t1, t2}));
+
+    // The operator codes.
+    auto c1 =
+        ::tflite::CreateOperatorCode(builder_, ::tflite::BuiltinOperator_CUSTOM,
+                                     builder_.CreateString("custom_op_one"));
+    auto c2 = ::tflite::CreateOperatorCode(
+        builder_, ::tflite::BuiltinOperator_CONV_2D, 0);
+    auto opcodes = builder_.CreateVector(
+        std::vector<flatbuffers::Offset<::tflite::OperatorCode>>({c1, c2}));
+
+    auto subgraph = ::tflite::CreateSubGraph(builder_, tensors, 0, 0, 0);
+    std::vector<flatbuffers::Offset<::tflite::SubGraph>> subgraph_vector(
+        {subgraph});
+    auto subgraphs = builder_.CreateVector(subgraph_vector);
+    auto s = builder_.CreateString("");
+    builder_.Finish(::tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
+                                          opcodes, subgraphs, s, buffers));
+
+    input_model_ = ::tflite::GetModel(builder_.GetBufferPointer());
+  }
+  string InputModelAsString() {
+    return string(reinterpret_cast<char*>(builder_.GetBufferPointer()),
+                  builder_.GetSize());
+  }
+  flatbuffers::FlatBufferBuilder builder_;
+  // const uint8_t* buffer_ = nullptr;
+  const ::tflite::Model* input_model_ = nullptr;
+};
+
+TEST_F(ImportTest, LoadTensorsTable) {
+  BuildTestModel();
+
+  details::TensorsTable tensors;
+  details::LoadTensorsTable(*input_model_, &tensors);
+  EXPECT_THAT(tensors, ElementsAre("tensor_one", "tensor_two"));
+}
+
+TEST_F(ImportTest, LoadOperatorsTable) {
+  BuildTestModel();
+
+  details::OperatorsTable operators;
+  details::LoadOperatorsTable(*input_model_, &operators);
+  EXPECT_THAT(operators, ElementsAre("custom_op_one", "CONV_2D"));
+}
+
+TEST_F(ImportTest, Tensors) {
+  BuildTestModel();
+
+  auto model = Import(ModelFlags(), InputModelAsString());
+
+  ASSERT_GT(model->arrays.count("tensor_one"), 0);
+  Array& a1 = model->GetArray("tensor_one");
+  EXPECT_EQ(ArrayDataType::kFloat, a1.data_type);
+  EXPECT_THAT(a1.GetBuffer<ArrayDataType::kFloat>().data,
+              ElementsAre(1.0f, 2.0f));
+  ASSERT_TRUE(a1.has_shape());
+  EXPECT_THAT(a1.shape().dims(), ElementsAre(1, 2, 3, 4));
+
+  const auto& mm = a1.minmax;
+  ASSERT_TRUE(mm.get());
+  EXPECT_FLOAT_EQ(0.1, mm->min);
+  EXPECT_FLOAT_EQ(0.2, mm->max);
+
+  const auto& q = a1.quantization_params;
+  ASSERT_TRUE(q.get());
+  EXPECT_FLOAT_EQ(0.3, q->scale);
+  EXPECT_EQ(100, q->zero_point);
+}
+
+// TODO(ahentz): still need tests for Operators and IOTensors.
+
+}  // namespace
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a33500ddcda67d97e68158ce40d8d7e086a27cc
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -0,0 +1,627 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+
+#include "tensorflow/contrib/lite/toco/tflite/builtin_operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/custom_operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/simple_operator.h"
+#include "tensorflow/contrib/lite/toco/tflite/types.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace toco {
+
+namespace tflite {
+
+class AveragePool
+    : public BuiltinOperator<AveragePoolOperator, ::tflite::Pool2DOptions,
+                             ::tflite::BuiltinOptions_Pool2DOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreatePool2DOptions(*builder, padding, op.stride_width,
+                                         op.stride_height, op.kwidth,
+                                         op.kheight, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->kwidth = options.filter_width();
+    op->kheight = options.filter_height();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Convolution
+    : public BuiltinOperator<ConvOperator, ::tflite::Conv2DOptions,
+                             ::tflite::BuiltinOptions_Conv2DOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateConv2DOptions(*builder, padding, op.stride_width,
+                                         op.stride_height, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class DepthwiseConvolution
+    : public BuiltinOperator<DepthwiseConvOperator,
+                             ::tflite::DepthwiseConv2DOptions,
+                             ::tflite::BuiltinOptions_DepthwiseConv2DOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateDepthwiseConv2DOptions(
+        *builder, padding, op.stride_width, op.stride_height,
+        op.depth_multiplier, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->depth_multiplier = options.depth_multiplier();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
+                                   ::tflite::BuiltinOptions_AddOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateAddOptions(*builder, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Cast : public CustomOperator<CastOperator> {
+ public:
+  using CustomOperator::CustomOperator;
+  void WriteOptions(const TocoOperator& op,
+                    flexbuffers::Builder* fbb) const override {
+    fbb->Int("src_data_type", DataType::Serialize(op.src_data_type));
+    fbb->Int("dst_data_type", DataType::Serialize(op.dst_data_type));
+  }
+  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
+    op->src_data_type = DataType::Deserialize(m["src_data_type"].AsInt64());
+    op->dst_data_type = DataType::Deserialize(m["dst_data_type"].AsInt64());
+  }
+};
+
+class Concatenation
+    : public BuiltinOperator<ConcatenationOperator,
+                             ::tflite::ConcatenationOptions,
+                             ::tflite::BuiltinOptions_ConcatenationOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateConcatenationOptions(*builder, op.concat_dim);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->concat_dim = options.axis();
+  }
+};
+
+class DepthToSpace : public CustomOperator<DepthToSpaceOperator> {
+ public:
+  using CustomOperator::CustomOperator;
+  void WriteOptions(const TocoOperator& op,
+                    flexbuffers::Builder* fbb) const override {
+    fbb->Int("block_size", op.block_size);
+  }
+  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
+    op->block_size = m["block_size"].AsInt64();
+  }
+};
+
+class FakeQuant : public CustomOperator<FakeQuantOperator> {
+ public:
+  using CustomOperator::CustomOperator;
+  void WriteOptions(const TocoOperator& op,
+                    flexbuffers::Builder* fbb) const override {
+    fbb->Float("min", op.minmax->min);
+    fbb->Float("max", op.minmax->max);
+  }
+  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
+    auto* minmax = new MinMax;
+    minmax->min = m["min"].AsFloat();
+    minmax->max = m["max"].AsFloat();
+    op->minmax.reset(minmax);
+  }
+};
+
+class FullyConnected
+    : public BuiltinOperator<FullyConnectedOperator,
+                             ::tflite::FullyConnectedOptions,
+                             ::tflite::BuiltinOptions_FullyConnectedOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateFullyConnectedOptions(*builder, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
+                                    ::tflite::BuiltinOptions_SVDFOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateSVDFOptions(*builder, op.rank, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+    op->rank = options.rank();
+  }
+};
+
+class L2Normalization
+    : public BuiltinOperator<L2NormalizationOperator, ::tflite::L2NormOptions,
+                             ::tflite::BuiltinOptions_L2NormOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateL2NormOptions(*builder, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class L2Pool : public BuiltinOperator<L2PoolOperator, ::tflite::Pool2DOptions,
+                                      ::tflite::BuiltinOptions_Pool2DOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreatePool2DOptions(*builder, padding, op.stride_width,
+                                         op.stride_height, op.kwidth,
+                                         op.kheight, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->kwidth = options.filter_width();
+    op->kheight = options.filter_height();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class LocalResponseNormalization
+    : public BuiltinOperator<
+          LocalResponseNormalizationOperator,
+          ::tflite::LocalResponseNormalizationOptions,
+          ::tflite::BuiltinOptions_LocalResponseNormalizationOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateLocalResponseNormalizationOptions(
+        *builder, op.range, op.bias, op.alpha, op.beta);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->range = options.radius();
+    op->bias = options.bias();
+    op->alpha = options.alpha();
+    op->beta = options.beta();
+  }
+};
+
+class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
+                                       ::tflite::BuiltinOptions_Pool2DOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto padding = Padding::Serialize(op.padding.type);
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreatePool2DOptions(*builder, padding, op.stride_width,
+                                         op.stride_height, op.kwidth,
+                                         op.kheight, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->padding.type = Padding::Deserialize(options.padding());
+    op->stride_width = options.stride_w();
+    op->stride_height = options.stride_h();
+    op->kwidth = options.filter_width();
+    op->kheight = options.filter_height();
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
+                                   ::tflite::BuiltinOptions_MulOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    auto activation_function =
+        ActivationFunction::Serialize(op.fused_activation_function);
+    return ::tflite::CreateMulOptions(*builder, activation_function);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->fused_activation_function =
+        ActivationFunction::Deserialize(options.fused_activation_function());
+  }
+};
+
+class Reshape
+    : public BuiltinOperator<TensorFlowReshapeOperator,
+                             ::tflite::ReshapeOptions,
+                             ::tflite::BuiltinOptions_ReshapeOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateReshapeOptions(*builder,
+                                          builder->CreateVector(op.shape));
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->shape.insert(op->shape.end(), options.new_shape()->begin(),
+                     options.new_shape()->end());
+  }
+};
+
+class Softmax
+    : public BuiltinOperator<SoftmaxOperator, ::tflite::SoftmaxOptions,
+                             ::tflite::BuiltinOptions_SoftmaxOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSoftmaxOptions(*builder, op.beta);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->beta = options.beta();
+  }
+};
+
+class SpaceToDepth
+    : public BuiltinOperator<SpaceToDepthOperator,
+                             ::tflite::SpaceToDepthOptions,
+                             ::tflite::BuiltinOptions_SpaceToDepthOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSpaceToDepthOptions(*builder, op.block_size);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->block_size = options.block_size();
+  }
+};
+
+class Split : public CustomOperator<TensorFlowSplitOperator> {
+ public:
+  using CustomOperator::CustomOperator;
+  void WriteOptions(const TocoOperator& op,
+                    flexbuffers::Builder* fbb) const override {
+    fbb->Int("num_split", op.num_split);
+  }
+  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
+    op->num_split = m["num_split"].AsInt64();
+  }
+};
+
+class TensorFlowUnsupported : public BaseOperator {
+ public:
+  using BaseOperator::BaseOperator;
+
+  Options Serialize(const Operator& op,
+                    flatbuffers::FlatBufferBuilder* builder) const override {
+    auto fbb =
+        WriteOptions(static_cast<const TensorFlowUnsupportedOperator&>(op));
+    if (fbb) {
+      return Options::Custom(builder->CreateVector(fbb->GetBuffer()));
+    } else {
+      return Options::Custom(0);
+    }
+  }
+
+  std::unique_ptr<Operator> Deserialize(
+      const BuiltinOptions* builtin_options,
+      const CustomOptions* custom_options) const override {
+    auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
+    if (custom_options) {
+      auto flexbuffer_map =
+          flexbuffers::GetRoot(custom_options->data(), custom_options->size())
+              .AsMap();
+      ReadOptions(flexbuffer_map, op.get());
+    }
+    return std::unique_ptr<Operator>(op.release());
+  }
+
+  std::unique_ptr<flexbuffers::Builder> WriteOptions(
+      const TensorFlowUnsupportedOperator& op) const {
+    auto fbb = absl::make_unique<flexbuffers::Builder>();
+
+    ::tensorflow::NodeDef node_def;
+    if (!node_def.ParseFromString(op.tensorflow_node_def)) {
+      LOG(ERROR) << "Failed to parse TensorFlow NodeDef";
+      return std::unique_ptr<flexbuffers::Builder>();
+    }
+
+    bool has_valid_attr = false;
+    size_t map_start = fbb->StartMap();
+    for (const auto& pair : node_def.attr()) {
+      const char* key = pair.first.c_str();
+      const auto& attr = pair.second;
+      switch (attr.value_case()) {
+        case ::tensorflow::AttrValue::kS:
+          fbb->String(key, attr.s());
+          has_valid_attr = true;
+          break;
+        case ::tensorflow::AttrValue::kI:
+          fbb->Int(key, attr.i());
+          has_valid_attr = true;
+          break;
+        case ::tensorflow::AttrValue::kF:
+          fbb->Float(key, attr.f());
+          has_valid_attr = true;
+          break;
+        case ::tensorflow::AttrValue::kB:
+          fbb->Bool(key, attr.b());
+          has_valid_attr = true;
+          break;
+        default:
+          LOG(WARNING) << "Ignoring unsupported attribute type with key '"
+                       << key << "'";
+          break;
+      }
+    }
+    if (!has_valid_attr) {
+      return std::unique_ptr<flexbuffers::Builder>();
+    }
+    fbb->EndMap(map_start);
+    fbb->Finish();
+    return std::unique_ptr<flexbuffers::Builder>(fbb.release());
+  }
+
+  void ReadOptions(const flexbuffers::Map& m,
+                   TensorFlowUnsupportedOperator* op) const {
+    ::tensorflow::NodeDef node_def;
+    auto attr = node_def.mutable_attr();
+
+    const auto& keys = m.Keys();
+    for (size_t i = 0; i < keys.size(); ++i) {
+      const auto key = keys[i].AsKey();
+      const auto& value = m[key];
+      switch (value.GetType()) {
+        case flexbuffers::TYPE_STRING:
+          (*attr)[key].set_s(value.AsString().c_str());
+          break;
+        case flexbuffers::TYPE_INT:
+          (*attr)[key].set_i(value.AsInt64());
+          break;
+        case flexbuffers::TYPE_FLOAT:
+          (*attr)[key].set_f(value.AsFloat());
+          break;
+        case flexbuffers::TYPE_BOOL:
+          (*attr)[key].set_b(value.AsBool());
+          break;
+        default:
+          LOG(WARNING) << "Ignoring unsupported attribute type with key '"
+                       << key << "'";
+          break;
+      }
+    }
+    node_def.SerializeToString(&op->tensorflow_node_def);
+  }
+};
+
+namespace {
+// Build a vector containing all the known operators.
+std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
+  std::vector<std::unique_ptr<BaseOperator>> ops;
+
+  // Builtin Operators.
+  ops.emplace_back(new Add(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
+  ops.emplace_back(new AveragePool(::tflite::BuiltinOperator_AVERAGE_POOL_2D,
+                                   OperatorType::kAveragePool));
+  ops.emplace_back(new Concatenation(::tflite::BuiltinOperator_CONCATENATION,
+                                     OperatorType::kConcatenation));
+  ops.emplace_back(
+      new Convolution(::tflite::BuiltinOperator_CONV_2D, OperatorType::kConv));
+  ops.emplace_back(
+      new DepthwiseConvolution(::tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
+                               OperatorType::kDepthwiseConv));
+  ops.emplace_back(new FullyConnected(::tflite::BuiltinOperator_FULLY_CONNECTED,
+                                      OperatorType::kFullyConnected));
+  ops.emplace_back(
+      new L2Normalization(::tflite::BuiltinOperator_L2_NORMALIZATION,
+                          OperatorType::kL2Normalization));
+  ops.emplace_back(
+      new L2Pool(::tflite::BuiltinOperator_L2_POOL_2D, OperatorType::kL2Pool));
+  ops.emplace_back(new LocalResponseNormalization(
+      ::tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+      OperatorType::kLocalResponseNormalization));
+  ops.emplace_back(new MaxPool(::tflite::BuiltinOperator_MAX_POOL_2D,
+                               OperatorType::kMaxPool));
+  ops.emplace_back(new Mul(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
+  ops.emplace_back(new Reshape(::tflite::BuiltinOperator_RESHAPE,
+                               OperatorType::kTensorFlowReshape));
+  ops.emplace_back(
+      new Softmax(::tflite::BuiltinOperator_SOFTMAX, OperatorType::kSoftmax));
+  ops.emplace_back(new SpaceToDepth(::tflite::BuiltinOperator_SPACE_TO_DEPTH,
+                                    OperatorType::kSpaceToDepth));
+  ops.emplace_back(
+      new Svdf(::tflite::BuiltinOperator_SVDF, OperatorType::kSvdf));
+
+  // Custom Operators.
+  ops.emplace_back(new Cast("CAST", OperatorType::kCast));
+  ops.emplace_back(
+      new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
+  ops.emplace_back(new FakeQuant("FAKE_QUANT", OperatorType::kFakeQuant));
+  ops.emplace_back(new Split("SPLIT", OperatorType::kTensorFlowSplit));
+  ops.emplace_back(new TensorFlowUnsupported(
+      "TENSORFLOW_UNSUPPORTED", OperatorType::kTensorFlowUnsupported));
+
+  // There operators are supported by Toco, but not by TF Lite, and has no
+  // attributes.
+  ops.emplace_back(new SimpleOperator<TensorFlowRsqrtOperator>(
+      "RSQRT", OperatorType::kTensorFlowRsqrt));
+  ops.emplace_back(
+      new SimpleOperator<TensorFlowRsqrtOperator>("DIV", OperatorType::kDiv));
+
+  // Simple Operators.
+  ops.emplace_back(new SimpleOperator<DequantizeOperator>(
+      "DEQUANTIZE", OperatorType::kDequantize));
+  ops.emplace_back(
+      new SimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor));
+  ops.emplace_back(
+      new SimpleOperator<GatherOperator>("GATHER", OperatorType::kGather));
+  ops.emplace_back(
+      new SimpleOperator<ReluOperator>("RELU", OperatorType::kRelu));
+  ops.emplace_back(
+      new SimpleOperator<Relu1Operator>("RELU1", OperatorType::kRelu1));
+  ops.emplace_back(
+      new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
+  ops.emplace_back(new SimpleOperator<ResizeBilinearOperator>(
+      "RESIZE_BILINEAR", OperatorType::kResizeBilinear));
+  ops.emplace_back(new SimpleOperator<LogisticOperator>(
+      "LOGISTIC", OperatorType::kLogistic));
+  ops.emplace_back(
+      new SimpleOperator<TanhOperator>("TANH", OperatorType::kTanh));
+
+  return ops;
+}
+}  // namespace
+
+std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap() {
+  std::map<OperatorType, std::unique_ptr<BaseOperator>> result;
+
+  std::vector<std::unique_ptr<BaseOperator>> ops = BuildOperatorList();
+  for (auto& op : ops) {
+    result[op->type()] = std::move(op);
+  }
+
+  return result;
+}
+
+std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap() {
+  std::map<string, std::unique_ptr<BaseOperator>> result;
+
+  std::vector<std::unique_ptr<BaseOperator>> ops = BuildOperatorList();
+  for (auto& op : ops) {
+    result[op->name()] = std::move(op);
+  }
+
+  return result;
+}
+
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.h b/tensorflow/contrib/lite/toco/tflite/operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..37df302d4697c78e0349bcd30e0e1adc540066bc
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/operator.h
@@ -0,0 +1,89 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
+
+#include "flatbuffers/flatbuffers.h"
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+namespace tflite {
+
+class BaseOperator;
+
+// Return a map contained all knwo TF Lite Operators, keyed by their names.
+std::map<string, std::unique_ptr<BaseOperator>> BuildOperatorByNameMap();
+
+// Return a map contained all knwo TF Lite Operators, keyed by the type of
+// their tf.mini counterparts.
+std::map<OperatorType, std::unique_ptr<BaseOperator>> BuildOperatorByTypeMap();
+
+// These are the flatbuffer types for custom and builtin options.
+using CustomOptions = flatbuffers::Vector<uint8_t>;
+using BuiltinOptions = void;
+
+// A simple wrapper around the flatbuffer objects used to describe options that
+// configure operators.
+struct Options {
+  // Build custom options.
+  static Options Custom(flatbuffers::Offset<CustomOptions> offset) {
+    return {::tflite::BuiltinOptions_NONE, 0, offset};
+  }
+
+  // Build builtin options of the given type.
+  static Options Builtin(::tflite::BuiltinOptions type,
+                         flatbuffers::Offset<BuiltinOptions> offset) {
+    return {type, offset, 0};
+  }
+
+  ::tflite::BuiltinOptions type;
+  flatbuffers::Offset<BuiltinOptions> builtin;
+  flatbuffers::Offset<CustomOptions> custom;
+};
+
+// A BaseOperator encapsulates the relationship between operators in tf.mini
+// and TF lite, and provides methods for converting between those two formats.
+class BaseOperator {
+ public:
+  // Build an operator with the given TF Lite name and tf.mini type.
+  BaseOperator(const string& name, OperatorType type)
+      : name_(name), type_(type) {}
+  virtual ~BaseOperator() = default;
+
+  string name() const { return name_; }
+  OperatorType type() const { return type_; }
+
+  // Given a tf.mini operator, create the corresponding flatbuffer options and
+  // return their offsets.
+  virtual Options Serialize(const Operator& op,
+                            flatbuffers::FlatBufferBuilder* builder) const = 0;
+
+  // Read TF Lite options and create the appropriate tf.mini operator.
+  virtual std::unique_ptr<Operator> Deserialize(
+      const BuiltinOptions* builtin_options,
+      const CustomOptions* custom_options) const = 0;
+
+ private:
+  string name_;
+  OperatorType type_;
+};
+
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e77c56d8aaa88d5c801ae246e1ee63e40b6f955
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -0,0 +1,372 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+
+#include "flatbuffers/flexbuffers.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace toco {
+
+namespace tflite {
+namespace {
+
+class OperatorTest : public ::testing::Test {
+ protected:
+  // Return the operator for the given name and type.
+  const BaseOperator& GetOperator(const string& name, OperatorType type) {
+    using OpsByName = std::map<string, std::unique_ptr<BaseOperator>>;
+    using OpsByType = std::map<OperatorType, std::unique_ptr<BaseOperator>>;
+
+    static auto* by_name = new OpsByName(BuildOperatorByNameMap());
+    static auto* by_type = new OpsByType(BuildOperatorByTypeMap());
+
+    // Make sure the two maps were consitently built.
+    CHECK(by_name->count(name)) << "No operator for '" << name << "'.";
+    BaseOperator* op1 = by_name->at(name).get();
+    CHECK(op1->type() == type) << "while verifying '" << name << "'.";
+
+    CHECK(by_type->count(type))
+        << "No operator for '" << OperatorTypeName(type) << "'.";
+    BaseOperator* op2 = by_type->at(type).get();
+    CHECK(op2->name() == name)
+        << "while verifying '" << OperatorTypeName(type) << "'.";
+
+    return *op1;
+  }
+
+  // Use the given BaseOperator to serialize the tf.mini operator into a set of
+  // TF Lite options. Proceed to deserialize the options back into a new
+  // tf.mini operator, which is then returned. If `options` is given, it will
+  // be populated with the serialized options.
+  template <typename T>
+  std::unique_ptr<T> SerializeAndDeserialize(const BaseOperator& op,
+                                             const T& toco_op,
+                                             Options* options = nullptr) {
+    flatbuffers::FlatBufferBuilder builder;
+    Options input_options = op.Serialize(toco_op, &builder);
+
+    if (options) {
+      *options = input_options;
+    }
+
+    builder.Finish(CreateOperator(builder, 0, 0, 0, input_options.type,
+                                  input_options.builtin, input_options.custom,
+                                  ::tflite::CustomOptionsFormat_FLEXBUFFERS));
+    auto* output_options =
+        flatbuffers::GetRoot<::tflite::Operator>(builder.GetBufferPointer());
+    auto new_toco_op = op.Deserialize(output_options->builtin_options(),
+                                      output_options->custom_options());
+
+    CHECK(dynamic_cast<T*>(new_toco_op.get()))
+        << "Cannot cast " << HelpfulOperatorTypeName(*new_toco_op) << " to "
+        << HelpfulOperatorTypeName(toco_op);
+
+    return std::unique_ptr<T>(dynamic_cast<T*>(new_toco_op.release()));
+  }
+
+  // Verify serialization and deserialization of simple operators (those
+  // that don't have any configuration parameters).
+  template <typename T>
+  void CheckSimpleOperator(const string& name, OperatorType type) {
+    Options options;
+    auto output_toco_op =
+        SerializeAndDeserialize(GetOperator(name, type), T(), &options);
+
+    ASSERT_EQ(0, options.builtin.o);
+    ASSERT_EQ(0, options.custom.o);
+    ASSERT_EQ(::tflite::BuiltinOptions_NONE, options.type);
+
+    ASSERT_NE(nullptr, output_toco_op.get());
+  }
+};
+
+TEST_F(OperatorTest, SimpleOperators) {
+  CheckSimpleOperator<DequantizeOperator>("DEQUANTIZE",
+                                          OperatorType::kDequantize);
+  CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
+  CheckSimpleOperator<GatherOperator>("GATHER", OperatorType::kGather);
+  CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
+  CheckSimpleOperator<Relu1Operator>("RELU1", OperatorType::kRelu1);
+  CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
+  CheckSimpleOperator<ResizeBilinearOperator>("RESIZE_BILINEAR",
+                                              OperatorType::kResizeBilinear);
+  CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
+  CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
+}
+
+TEST_F(OperatorTest, BuiltinAdd) {
+  AddOperator op;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("ADD", OperatorType::kAdd), op);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+}
+
+TEST_F(OperatorTest, CustomCast) {
+  CastOperator op;
+  op.src_data_type = ArrayDataType::kFloat;
+  op.dst_data_type = ArrayDataType::kUint8;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("CAST", OperatorType::kCast), op);
+  EXPECT_EQ(op.src_data_type, output_toco_op->src_data_type);
+  EXPECT_EQ(op.dst_data_type, output_toco_op->dst_data_type);
+}
+
+TEST_F(OperatorTest, CustomConcatenation) {
+  ConcatenationOperator op;
+  op.concat_dim = 123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("CONCATENATION", OperatorType::kConcatenation), op);
+  EXPECT_EQ(op.concat_dim, output_toco_op->concat_dim);
+}
+
+TEST_F(OperatorTest, CustomDepthToSpace) {
+  DepthToSpaceOperator op;
+  op.block_size = 123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("DEPTH_TO_SPACE", OperatorType::kDepthToSpace), op);
+  EXPECT_EQ(op.block_size, output_toco_op->block_size);
+}
+
+TEST_F(OperatorTest, CustomFakeQuant) {
+  FakeQuantOperator op;
+  auto* minmax = new MinMax;
+  minmax->min = -10;
+  minmax->max = 200;
+  op.minmax.reset(minmax);
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("FAKE_QUANT", OperatorType::kFakeQuant), op);
+  EXPECT_EQ(op.minmax->min, output_toco_op->minmax->min);
+  EXPECT_EQ(op.minmax->max, output_toco_op->minmax->max);
+}
+
+TEST_F(OperatorTest, CustomFullyConnected) {
+  FullyConnectedOperator op;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("FULLY_CONNECTED", OperatorType::kFullyConnected), op);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+}
+
+TEST_F(OperatorTest, BuiltinL2Pool) {
+  L2PoolOperator op;
+  op.stride_width = 123;
+  op.stride_height = 124;
+  op.padding.type = PaddingType::kValid;
+  op.kwidth = 480;
+  op.kheight = 1080;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("L2_POOL_2D", OperatorType::kL2Pool), op);
+  EXPECT_EQ(op.stride_width, output_toco_op->stride_width);
+  EXPECT_EQ(op.stride_height, output_toco_op->stride_height);
+  EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
+  EXPECT_EQ(op.kwidth, output_toco_op->kwidth);
+  EXPECT_EQ(op.kheight, output_toco_op->kheight);
+}
+
+TEST_F(OperatorTest, BuiltinLocalResponseNormalization) {
+  LocalResponseNormalizationOperator op;
+  op.range = 123;
+  op.bias = 1.23;
+  op.alpha = 12.3;
+  op.beta = .123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("LOCAL_RESPONSE_NORMALIZATION",
+                  OperatorType::kLocalResponseNormalization),
+      op);
+  EXPECT_EQ(op.range, output_toco_op->range);
+  EXPECT_EQ(op.bias, output_toco_op->bias);
+  EXPECT_EQ(op.alpha, output_toco_op->alpha);
+  EXPECT_EQ(op.beta, output_toco_op->beta);
+}
+
+TEST_F(OperatorTest, BuiltinMaxPool) {
+  MaxPoolOperator op;
+  op.stride_width = 123;
+  op.stride_height = 124;
+  op.padding.type = PaddingType::kValid;
+  op.kwidth = 480;
+  op.kheight = 1080;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("MAX_POOL_2D", OperatorType::kMaxPool), op);
+  EXPECT_EQ(op.stride_width, output_toco_op->stride_width);
+  EXPECT_EQ(op.stride_height, output_toco_op->stride_height);
+  EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
+  EXPECT_EQ(op.kwidth, output_toco_op->kwidth);
+  EXPECT_EQ(op.kheight, output_toco_op->kheight);
+}
+
+TEST_F(OperatorTest, BuiltinReshape) {
+  TensorFlowReshapeOperator op;
+  op.shape = {1, 2, 4, 5, 8};
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("RESHAPE", OperatorType::kTensorFlowReshape), op);
+  EXPECT_EQ(op.shape, output_toco_op->shape);
+}
+
+TEST_F(OperatorTest, CustomSoftmax) {
+  SoftmaxOperator op;
+  op.beta = 123.1;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SOFTMAX", OperatorType::kSoftmax), op);
+  EXPECT_EQ(op.beta, output_toco_op->beta);
+}
+
+TEST_F(OperatorTest, BuiltinSpaceToDepth) {
+  SpaceToDepthOperator op;
+  op.block_size = 123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SPACE_TO_DEPTH", OperatorType::kSpaceToDepth), op);
+  EXPECT_EQ(op.block_size, output_toco_op->block_size);
+}
+
+TEST_F(OperatorTest, CustomSplit) {
+  TensorFlowSplitOperator op;
+  op.num_split = 123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SPLIT", OperatorType::kTensorFlowSplit), op);
+  EXPECT_EQ(op.num_split, output_toco_op->num_split);
+}
+
+TEST_F(OperatorTest, BuiltinAveragePool) {
+  AveragePoolOperator op;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  op.stride_width = 123;
+  op.stride_height = 124;
+  op.padding.type = PaddingType::kValid;
+  op.kwidth = 480;
+  op.kheight = 1080;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("AVERAGE_POOL_2D", OperatorType::kAveragePool), op);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+  EXPECT_EQ(op.stride_width, output_toco_op->stride_width);
+  EXPECT_EQ(op.stride_height, output_toco_op->stride_height);
+  EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
+  EXPECT_EQ(op.kwidth, output_toco_op->kwidth);
+  EXPECT_EQ(op.kheight, output_toco_op->kheight);
+}
+
+TEST_F(OperatorTest, BuiltinConvolution) {
+  ConvOperator op;
+  op.stride_width = 123;
+  op.stride_height = 124;
+  op.padding.type = PaddingType::kValid;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("CONV_2D", OperatorType::kConv), op);
+  EXPECT_EQ(op.stride_width, output_toco_op->stride_width);
+  EXPECT_EQ(op.stride_height, output_toco_op->stride_height);
+  EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+}
+
+TEST_F(OperatorTest, BuiltinDepthwiseConvolution) {
+  DepthwiseConvOperator op;
+  op.stride_width = 123;
+  op.stride_height = 124;
+  op.padding.type = PaddingType::kValid;
+  op.depth_multiplier = 6;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("DEPTHWISE_CONV_2D", OperatorType::kDepthwiseConv), op);
+  EXPECT_EQ(op.stride_width, output_toco_op->stride_width);
+  EXPECT_EQ(op.stride_height, output_toco_op->stride_height);
+  EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
+  EXPECT_EQ(op.depth_multiplier, output_toco_op->depth_multiplier);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+}
+
+TEST_F(OperatorTest, BuiltinL2Norm) {
+  L2NormalizationOperator op;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("L2_NORMALIZATION", OperatorType::kL2Normalization), op);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+}
+
+TEST_F(OperatorTest, BuiltinMul) {
+  MulOperator op;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu6;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("MUL", OperatorType::kMul), op);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+}
+
+TEST_F(OperatorTest, Svdf) {
+  SvdfOperator op;
+  op.fused_activation_function = FusedActivationFunctionType::kRelu;
+  op.rank = 1;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("SVDF", OperatorType::kSvdf), op);
+  EXPECT_EQ(op.fused_activation_function,
+            output_toco_op->fused_activation_function);
+  EXPECT_EQ(op.rank, output_toco_op->rank);
+}
+
+TEST_F(OperatorTest, TensorFlowUnsupported) {
+  TensorFlowUnsupportedOperator op;
+  op.tensorflow_op = "MyCustomUnsupportedOp";
+
+  ::tensorflow::NodeDef node_def;
+  auto attr = node_def.mutable_attr();
+  (*attr)["float_attr"].set_f(2.0);
+  (*attr)["str_attr"].set_s("Hello World");
+  (*attr)["int_attr"].set_i(17);
+  (*attr)["bool_attr"].set_b(true);
+  node_def.SerializeToString(&op.tensorflow_node_def);
+
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("TENSORFLOW_UNSUPPORTED",
+                                          OperatorType::kTensorFlowUnsupported),
+                              op);
+
+  ::tensorflow::NodeDef output_node_def;
+  output_node_def.ParseFromString(output_toco_op->tensorflow_node_def);
+  const auto& output_attr = output_node_def.attr();
+  EXPECT_EQ(2.0, output_attr.at("float_attr").f());
+  EXPECT_EQ("Hello World", output_attr.at("str_attr").s());
+  EXPECT_EQ(17, output_attr.at("int_attr").i());
+  EXPECT_EQ(true, output_attr.at("bool_attr").b());
+}
+
+TEST_F(OperatorTest, TensorFlowUnsupportedWithoutAttr) {
+  TensorFlowUnsupportedOperator op;
+  op.tensorflow_op = "MyCustomUnsupportedOp";
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("TENSORFLOW_UNSUPPORTED",
+                                          OperatorType::kTensorFlowUnsupported),
+                              op);
+
+  ::tensorflow::NodeDef output_node_def;
+  output_node_def.ParseFromString(output_toco_op->tensorflow_node_def);
+  EXPECT_TRUE(output_node_def.attr().empty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/simple_operator.h b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..992b98bacafecb080e792ae87a2940977482eed6
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/simple_operator.h
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
+
+#include "tensorflow/contrib/lite/toco/tflite/operator.h"
+
+namespace toco {
+
+namespace tflite {
+
+// Simple operators don't have any configuration options and can be trivially
+// serialized and deserialized. Note that most of toco's operators will
+// likely be supported as builtin operators in TF Lite.  Simple (and custom)
+// operators are mostly a convenience for the times when tf.mini supports more
+// operators than TF Lite.
+//
+// Template argument T must derive from ::toco::Operator.
+template <typename T>
+class SimpleOperator : public BaseOperator {
+ public:
+  using BaseOperator::BaseOperator;
+  Options Serialize(const Operator& op,
+                    flatbuffers::FlatBufferBuilder* builder) const override {
+    return Options();
+  }
+  std::unique_ptr<Operator> Deserialize(
+      const BuiltinOptions* builtin_options,
+      const CustomOptions* custom_options) const override {
+    return std::unique_ptr<Operator>(new T);
+  }
+};
+
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_SIMPLE_OPERATOR_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types.cc b/tensorflow/contrib/lite/toco/tflite/types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b4dbfae2477d629624a70bf7c6e93606c937605
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/types.cc
@@ -0,0 +1,165 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/types.h"
+
+namespace toco {
+
+namespace tflite {
+
+namespace {
+template <ArrayDataType T>
+DataBuffer::FlatBufferOffset CopyBuffer(
+    const Array& array, flatbuffers::FlatBufferBuilder* builder) {
+  using NativeT = ::toco::DataType<T>;
+  const auto& src_data = array.GetBuffer<T>().data;
+  const uint8_t* dst_data = reinterpret_cast<const uint8_t*>(src_data.data());
+  auto size = src_data.size() * sizeof(NativeT);
+  return builder->CreateVector(dst_data, size);
+}
+
+template <ArrayDataType T>
+void CopyBuffer(const ::tflite::Buffer& buffer, Array* array) {
+  using NativeT = ::toco::DataType<T>;
+  auto* src_buffer = buffer.data();
+  const NativeT* src_data =
+      reinterpret_cast<const NativeT*>(src_buffer->data());
+  int num_items = src_buffer->size() / sizeof(NativeT);
+
+  std::vector<NativeT>* dst_data = &array->GetMutableBuffer<T>().data;
+  for (int i = 0; i < num_items; ++i) {
+    dst_data->push_back(*src_data);
+    ++src_data;
+  }
+}
+}  // namespace
+
+::tflite::TensorType DataType::Serialize(ArrayDataType array_data_type) {
+  switch (array_data_type) {
+    case ArrayDataType::kFloat:
+      return ::tflite::TensorType_FLOAT32;
+    case ArrayDataType::kInt32:
+      return ::tflite::TensorType_INT32;
+    case ArrayDataType::kUint8:
+      return ::tflite::TensorType_UINT8;
+    default:
+      // FLOAT32 is filled for unknown data types.
+      // TODO(ycling): Implement type inference in TF Lite interpreter.
+      return ::tflite::TensorType_FLOAT32;
+  }
+}
+
+ArrayDataType DataType::Deserialize(int tensor_type) {
+  switch (::tflite::TensorType(tensor_type)) {
+    case ::tflite::TensorType_FLOAT32:
+      return ArrayDataType::kFloat;
+    case ::tflite::TensorType_INT32:
+      return ArrayDataType::kInt32;
+    case ::tflite::TensorType_UINT8:
+      return ArrayDataType::kUint8;
+    default:
+      LOG(FATAL) << "Unhandled tensor type '" << tensor_type << "'.";
+  }
+}
+
+flatbuffers::Offset<flatbuffers::Vector<uint8_t>> DataBuffer::Serialize(
+    const Array& array, flatbuffers::FlatBufferBuilder* builder) {
+  if (!array.buffer) return 0;  // an empty buffer, usually an output.
+
+  switch (array.data_type) {
+    case ArrayDataType::kFloat:
+      return CopyBuffer<ArrayDataType::kFloat>(array, builder);
+    case ArrayDataType::kInt32:
+      return CopyBuffer<ArrayDataType::kInt32>(array, builder);
+    case ArrayDataType::kUint8:
+      return CopyBuffer<ArrayDataType::kUint8>(array, builder);
+    default:
+      LOG(FATAL) << "Unhandled array data type.";
+  }
+}
+
+void DataBuffer::Deserialize(const ::tflite::Tensor& tensor,
+                             const ::tflite::Buffer& buffer, Array* array) {
+  if (tensor.buffer() == 0) return;      // an empty buffer, usually an output.
+  if (buffer.data() == nullptr) return;  // a non-defined buffer.
+
+  switch (tensor.type()) {
+    case ::tflite::TensorType_FLOAT32:
+      return CopyBuffer<ArrayDataType::kFloat>(buffer, array);
+    case ::tflite::TensorType_INT32:
+      return CopyBuffer<ArrayDataType::kInt32>(buffer, array);
+    case ::tflite::TensorType_UINT8:
+      return CopyBuffer<ArrayDataType::kUint8>(buffer, array);
+    default:
+      LOG(FATAL) << "Unhandled tensor type.";
+  }
+}
+
+::tflite::Padding Padding::Serialize(PaddingType padding_type) {
+  switch (padding_type) {
+    case PaddingType::kSame:
+      return ::tflite::Padding_SAME;
+    case PaddingType::kValid:
+      return ::tflite::Padding_VALID;
+    default:
+      LOG(FATAL) << "Unhandled padding type.";
+  }
+}
+
+PaddingType Padding::Deserialize(int padding) {
+  switch (::tflite::Padding(padding)) {
+    case ::tflite::Padding_SAME:
+      return PaddingType::kSame;
+    case ::tflite::Padding_VALID:
+      return PaddingType::kValid;
+    default:
+      LOG(FATAL) << "Unhandled padding.";
+  }
+}
+
+::tflite::ActivationFunctionType ActivationFunction::Serialize(
+    FusedActivationFunctionType faf_type) {
+  switch (faf_type) {
+    case FusedActivationFunctionType::kNone:
+      return ::tflite::ActivationFunctionType_NONE;
+    case FusedActivationFunctionType::kRelu:
+      return ::tflite::ActivationFunctionType_RELU;
+    case FusedActivationFunctionType::kRelu6:
+      return ::tflite::ActivationFunctionType_RELU6;
+    case FusedActivationFunctionType::kRelu1:
+      return ::tflite::ActivationFunctionType_RELU1;
+    default:
+      LOG(FATAL) << "Unhandled fused activation function type.";
+  }
+}
+
+FusedActivationFunctionType ActivationFunction::Deserialize(
+    int activation_function) {
+  switch (::tflite::ActivationFunctionType(activation_function)) {
+    case ::tflite::ActivationFunctionType_NONE:
+      return FusedActivationFunctionType::kNone;
+    case ::tflite::ActivationFunctionType_RELU:
+      return FusedActivationFunctionType::kRelu;
+    case ::tflite::ActivationFunctionType_RELU6:
+      return FusedActivationFunctionType::kRelu6;
+    case ::tflite::ActivationFunctionType_RELU1:
+      return FusedActivationFunctionType::kRelu1;
+    default:
+      LOG(FATAL) << "Unhandled fused activation function type.";
+  }
+}
+
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tflite/types.h b/tensorflow/contrib/lite/toco/tflite/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7c51405107d954fa259809b72f56af193e344fb
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/types.h
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
+
+#include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+
+namespace toco {
+
+namespace tflite {
+
+struct DataType {
+  static ::tflite::TensorType Serialize(ArrayDataType array_data_type);
+  static ArrayDataType Deserialize(int tensor_type);
+};
+
+struct DataBuffer {
+  using FlatBufferOffset = flatbuffers::Offset<flatbuffers::Vector<uint8_t>>;
+
+  // Build the flatbuffer representation of a toco's Array and return the
+  // corresponding offset into the flatbuffer. Note that data from the array
+  // will be copied into the flatbuffer.
+  static FlatBufferOffset Serialize(const Array& array,
+                                    flatbuffers::FlatBufferBuilder* builder);
+  // Copy data from the given tensor into toco's Array.
+  static void Deserialize(const ::tflite::Tensor& tensor,
+                          const ::tflite::Buffer& buffer, Array* array);
+};
+
+struct Padding {
+  static ::tflite::Padding Serialize(PaddingType padding_type);
+  static PaddingType Deserialize(int padding);
+};
+
+struct ActivationFunction {
+  static ::tflite::ActivationFunctionType Serialize(
+      FusedActivationFunctionType faf_type);
+  static FusedActivationFunctionType Deserialize(int activation_function);
+};
+
+}  // namespace tflite
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TFLITE_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tflite/types_test.cc b/tensorflow/contrib/lite/toco/tflite/types_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..174b78f3e632fde8dc6ea0ed83ed7a67fa12c16a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tflite/types_test.cc
@@ -0,0 +1,191 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tflite/types.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+
+namespace tflite {
+namespace {
+
+using flatbuffers::FlatBufferBuilder;
+using flatbuffers::Offset;
+using flatbuffers::Vector;
+
+// These are types that exist in TF Mini but don't have a correspondence
+// in TF Lite.
+static const ArrayDataType kUnsupportedTocoTypes[] = {
+    ArrayDataType::kNone, ArrayDataType::kBool, ArrayDataType::kInt64};
+
+// These are TF Lite types for which there is no correspondence in TF Mini.
+static const ::tflite::TensorType kUnsupportedTfLiteTypes[] = {
+    ::tflite::TensorType_FLOAT16};
+
+// A little helper to match flatbuffer offsets.
+MATCHER_P(HasOffset, value, "") { return arg.o == value; }
+
+// Helper function that creates an array, writes it into a flatbuffer, and then
+// reads it back in.
+template <ArrayDataType T>
+Array ToFlatBufferAndBack(std::initializer_list<::toco::DataType<T>> items) {
+  // NOTE: This test does not construct the full buffers list. Since
+  // Deserialize normally takes a buffer, we need to synthesize one and provide
+  // an index that is non-zero so the buffer is not assumed to be emtpy.
+  Array src;
+  src.data_type = T;
+  src.GetMutableBuffer<T>().data = items;
+
+  Array result;
+  flatbuffers::FlatBufferBuilder builder;
+  builder.Finish(CreateTensor(builder, 0, DataType::Serialize(T),
+                              /*buffer*/ 1));  // Can't use 0 which means empty.
+  flatbuffers::FlatBufferBuilder buffer_builder;
+  Offset<Vector<uint8_t>> data_buffer =
+      DataBuffer::Serialize(src, &buffer_builder);
+  buffer_builder.Finish(::tflite::CreateBuffer(buffer_builder, data_buffer));
+
+  auto* tensor =
+      flatbuffers::GetRoot<::tflite::Tensor>(builder.GetBufferPointer());
+  auto* buffer =
+      flatbuffers::GetRoot<::tflite::Buffer>(buffer_builder.GetBufferPointer());
+  DataBuffer::Deserialize(*tensor, *buffer, &result);
+  return result;
+}
+
+TEST(DataType, SupportedTypes) {
+  std::vector<std::pair<ArrayDataType, ::tflite::TensorType>> testdata = {
+      {ArrayDataType::kUint8, ::tflite::TensorType_UINT8},
+      {ArrayDataType::kInt32, ::tflite::TensorType_INT32},
+      {ArrayDataType::kFloat, ::tflite::TensorType_FLOAT32}};
+  for (auto x : testdata) {
+    EXPECT_EQ(x.second, DataType::Serialize(x.first));
+    EXPECT_EQ(x.first, DataType::Deserialize(x.second));
+  }
+}
+
+TEST(DataType, UnsupportedTypes) {
+  for (::tflite::TensorType t : kUnsupportedTfLiteTypes) {
+    EXPECT_DEATH(DataType::Deserialize(t), "Unhandled tensor type.");
+  }
+
+  // Unsupported types are all serialized as FLOAT32 currently.
+  for (ArrayDataType t : kUnsupportedTocoTypes) {
+    EXPECT_EQ(::tflite::TensorType_FLOAT32, DataType::Serialize(t));
+  }
+}
+
+TEST(DataBuffer, EmptyBuffers) {
+  flatbuffers::FlatBufferBuilder builder;
+  Array array;
+  EXPECT_THAT(DataBuffer::Serialize(array, &builder), HasOffset(0));
+
+  builder.Finish(::tflite::CreateTensor(builder));
+  auto* tensor =
+      flatbuffers::GetRoot<::tflite::Tensor>(builder.GetBufferPointer());
+  flatbuffers::FlatBufferBuilder buffer_builder;
+  Offset<Vector<uint8_t>> v = buffer_builder.CreateVector<uint8_t>({});
+  buffer_builder.Finish(::tflite::CreateBuffer(buffer_builder, v));
+  auto* buffer =
+      flatbuffers::GetRoot<::tflite::Buffer>(buffer_builder.GetBufferPointer());
+
+  DataBuffer::Deserialize(*tensor, *buffer, &array);
+  EXPECT_EQ(nullptr, array.buffer);
+}
+
+TEST(DataBuffer, UnsupportedTypes) {
+  for (ArrayDataType t : kUnsupportedTocoTypes) {
+    flatbuffers::FlatBufferBuilder builder;
+    Array array;
+    array.data_type = t;
+    array.GetMutableBuffer<ArrayDataType::kFloat>();  // This is OK.
+    EXPECT_DEATH(DataBuffer::Serialize(array, &builder),
+                 "Unhandled array data type.");
+  }
+
+  for (::tflite::TensorType t : kUnsupportedTfLiteTypes) {
+    flatbuffers::FlatBufferBuilder builder;
+    builder.Finish(::tflite::CreateTensor(builder, 0, t, /*buffer*/ 1));
+    flatbuffers::FlatBufferBuilder buffer_builder;
+    Offset<Vector<uint8_t>> v = buffer_builder.CreateVector<uint8_t>({1});
+    buffer_builder.Finish(::tflite::CreateBuffer(buffer_builder, v));
+    auto* buffer = flatbuffers::GetRoot<::tflite::Buffer>(
+        buffer_builder.GetBufferPointer());
+    auto* tensor =
+        flatbuffers::GetRoot<::tflite::Tensor>(builder.GetBufferPointer());
+    Array array;
+    EXPECT_DEATH(DataBuffer::Deserialize(*tensor, *buffer, &array),
+                 "Unhandled tensor type.");
+  }
+}
+
+TEST(DataBuffer, Float) {
+  Array recovered = ToFlatBufferAndBack<ArrayDataType::kFloat>({1.0f, 2.0f});
+  EXPECT_THAT(recovered.GetBuffer<ArrayDataType::kFloat>().data,
+              ::testing::ElementsAre(1.0f, 2.0f));
+}
+
+TEST(DataBuffer, Uint8) {
+  Array recovered = ToFlatBufferAndBack<ArrayDataType::kUint8>({127, 244});
+  EXPECT_THAT(recovered.GetBuffer<ArrayDataType::kUint8>().data,
+              ::testing::ElementsAre(127, 244));
+}
+
+TEST(DataBuffer, Int32) {
+  Array recovered = ToFlatBufferAndBack<ArrayDataType::kInt32>({1, 1 << 30});
+  EXPECT_THAT(recovered.GetBuffer<ArrayDataType::kInt32>().data,
+              ::testing::ElementsAre(1, 1 << 30));
+}
+
+TEST(Padding, All) {
+  EXPECT_EQ(::tflite::Padding_SAME, Padding::Serialize(PaddingType::kSame));
+  EXPECT_EQ(PaddingType::kSame, Padding::Deserialize(::tflite::Padding_SAME));
+
+  EXPECT_EQ(::tflite::Padding_VALID, Padding::Serialize(PaddingType::kValid));
+  EXPECT_EQ(PaddingType::kValid, Padding::Deserialize(::tflite::Padding_VALID));
+
+  EXPECT_DEATH(Padding::Serialize(static_cast<PaddingType>(10000)),
+               "Unhandled padding type.");
+  EXPECT_DEATH(Padding::Deserialize(10000), "Unhandled padding.");
+}
+
+TEST(ActivationFunction, All) {
+  std::vector<
+      std::pair<FusedActivationFunctionType, ::tflite::ActivationFunctionType>>
+      testdata = {{FusedActivationFunctionType::kNone,
+                   ::tflite::ActivationFunctionType_NONE},
+                  {FusedActivationFunctionType::kRelu,
+                   ::tflite::ActivationFunctionType_RELU},
+                  {FusedActivationFunctionType::kRelu6,
+                   ::tflite::ActivationFunctionType_RELU6},
+                  {FusedActivationFunctionType::kRelu1,
+                   ::tflite::ActivationFunctionType_RELU1}};
+  for (auto x : testdata) {
+    EXPECT_EQ(x.second, ActivationFunction::Serialize(x.first));
+    EXPECT_EQ(x.first, ActivationFunction::Deserialize(x.second));
+  }
+
+  EXPECT_DEATH(ActivationFunction::Serialize(
+                   static_cast<FusedActivationFunctionType>(10000)),
+               "Unhandled fused activation function type.");
+  EXPECT_DEATH(ActivationFunction::Deserialize(10000),
+               "Unhandled fused activation function type.");
+}
+
+}  // namespace
+}  // namespace tflite
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco.cc b/tensorflow/contrib/lite/toco/toco.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f01ec0ec6102494f36cca0265b79e90355661271
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco.cc
@@ -0,0 +1,119 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_tooling.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/core/platform/logging.h"
+
+#ifndef CHECK_OK
+#define CHECK_OK(val) CHECK_EQ((val).ok(), true)
+#define QCHECK_OK(val) QCHECK_EQ((val).ok(), true)
+#endif
+
+namespace toco {
+namespace {
+
+#define QCHECK_REQUIRE_TOCO_FLAG(arg) \
+  QCHECK(parsed_toco_flags.arg.specified()) << "Missing required flag: " #arg;
+
+void CheckFilePermissions(const ParsedTocoFlags& parsed_toco_flags,
+                          const ParsedModelFlags& parsed_model_flags,
+                          const TocoFlags& toco_flags) {
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet");
+
+  QCHECK_REQUIRE_TOCO_FLAG(input_file)
+  QCHECK_OK(port::file::Exists(parsed_toco_flags.input_file.value(),
+                               port::file::Defaults()))
+      << "Specified input_file does not exist: "
+      << parsed_toco_flags.input_file.value();
+  QCHECK_OK(port::file::Readable(parsed_toco_flags.input_file.value(),
+                                 port::file::Defaults()))
+      << "Specified input_file exists, but is not readable: "
+      << parsed_toco_flags.input_file.value();
+
+  QCHECK_REQUIRE_TOCO_FLAG(output_file);
+  QCHECK_OK(port::file::Writable(parsed_toco_flags.output_file.value()))
+      << "parsed_toco_flags.input_file.value() output_file is not writable: "
+      << parsed_toco_flags.output_file.value();
+}
+
+void ToolMain(const ParsedTocoFlags& parsed_toco_flags,
+              const ParsedModelFlags& parsed_model_flags) {
+  ModelFlags model_flags;
+  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
+
+  TocoFlags toco_flags;
+  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
+
+  CheckFilePermissions(parsed_toco_flags, parsed_model_flags, toco_flags);
+
+  string input_file_contents;
+  CHECK_OK(port::file::GetContents(parsed_toco_flags.input_file.value(),
+                                   &input_file_contents,
+                                   port::file::Defaults()));
+  std::unique_ptr<Model> model =
+      Import(toco_flags, model_flags, input_file_contents);
+  Transform(toco_flags, model.get());
+  string output_file_contents;
+  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+         &output_file_contents);
+  CHECK_OK(port::file::SetContents(parsed_toco_flags.output_file.value(),
+                                   output_file_contents,
+                                   port::file::Defaults()));
+}
+
+}  // namespace
+}  // namespace toco
+
+int main(int argc, char** argv) {
+  toco::string msg;
+  toco::ParsedTocoFlags parsed_toco_flags;
+  toco::ParsedModelFlags parsed_model_flags;
+
+  // If no args were specified, give a help string to be helpful.
+  int* effective_argc = &argc;
+  char** effective_argv = argv;
+  if (argc == 1) {
+    // No arguments, so manufacture help argv.
+    static int dummy_argc = 2;
+    static char* dummy_argv[] = {argv[0], const_cast<char*>("--help")};
+    effective_argc = &dummy_argc;
+    effective_argv = dummy_argv;
+  }
+
+  // Parse toco flags and command flags in sequence, each one strips off args,
+  // giving InitGoogle a chance to handle all remaining arguments.
+  bool toco_success = toco::ParseTocoFlagsFromCommandLineFlags(
+      effective_argc, effective_argv, &msg, &parsed_toco_flags);
+  bool model_success = toco::ParseModelFlagsFromCommandLineFlags(
+      effective_argc, effective_argv, &msg, &parsed_model_flags);
+  if (!toco_success || !model_success || !msg.empty()) {
+    fprintf(stderr, "%s", msg.c_str());
+    fflush(stderr);
+    return 1;
+  }
+  toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
+  toco::ToolMain(parsed_toco_flags, parsed_model_flags);
+}
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8281f3a5725283d472e5e1a36e4d904b4dc1c49
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -0,0 +1,212 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/strip.h"
+#include "tensorflow/contrib/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace toco {
+
+bool ParseTocoFlagsFromCommandLineFlags(
+    int* argc, char* argv[], string* msg,
+    ParsedTocoFlags* parsed_toco_flags_ptr) {
+  using tensorflow::Flag;
+  ParsedTocoFlags& parsed_flags = *parsed_toco_flags_ptr;
+  std::vector<tensorflow::Flag> flags = {
+      Flag("input_file", parsed_flags.input_file.bind(),
+           parsed_flags.input_file.default_value(),
+           "Input file (model of any supported format). For Protobuf "
+           "formats, both text and binary are supported regardless of file "
+           "extension."),
+      Flag("output_file", parsed_flags.output_file.bind(),
+           parsed_flags.output_file.default_value(),
+           "Output file. "
+           "For Protobuf formats, the binary format will be used."),
+      Flag("input_format", parsed_flags.input_format.bind(),
+           parsed_flags.input_format.default_value(),
+           "Input file format. One of: tensorflow_graphdef, "),
+      Flag("output_format", parsed_flags.output_format.bind(),
+           parsed_flags.output_format.default_value(), "Output file format."),
+      Flag("default_ranges_min", parsed_flags.default_ranges_min.bind(),
+           parsed_flags.default_ranges_min.default_value(),
+           "If defined, will be used as the default value for the min bound "
+           "of min/max ranges used for quantization."),
+      Flag("default_ranges_max", parsed_flags.default_ranges_max.bind(),
+           parsed_flags.default_ranges_max.default_value(),
+           "If defined, will be used as the default value for the max bound "
+           "of min/max ranges used for quantization."),
+      Flag("inference_type", parsed_flags.inference_type.bind(),
+           parsed_flags.inference_type.default_value(),
+           "Target data type of arrays in the output file (for input_arrays, "
+           "this may be overridden by inference_input_type)."),
+      Flag("inference_input_type", parsed_flags.inference_input_type.bind(),
+           parsed_flags.inference_input_type.default_value(),
+           "Target data type of input arrays. If not specified, inference_type "
+           "is used."),
+      Flag("input_type", parsed_flags.input_type.bind(),
+           parsed_flags.input_type.default_value(),
+           "Deprecated ambiguous flag that set both --input_data_types and "
+           "--inference_input_type."),
+      Flag("input_types", parsed_flags.input_types.bind(),
+           parsed_flags.input_types.default_value(),
+           "Deprecated ambiguous flag that set both --input_data_types and "
+           "--inference_input_type. Was meant to be a "
+           "comma-separated list, but this was deprecated before "
+           "multiple-input-types was ever properly supported."),
+
+      Flag("drop_fake_quant", parsed_flags.drop_fake_quant.bind(),
+           parsed_flags.drop_fake_quant.default_value(),
+           "Ignore and discard FakeQuant nodes. For instance, that can be used "
+           "to "
+           "generate plain float code without fake-quantization from a "
+           "quantized "
+           "graph."),
+      Flag(
+          "reorder_across_fake_quant",
+          parsed_flags.reorder_across_fake_quant.bind(),
+          parsed_flags.reorder_across_fake_quant.default_value(),
+          "Normally, FakeQuant nodes must be strict boundaries for graph "
+          "transformations, in order to ensure that quantized inference has "
+          "the "
+          "exact same arithmetic behavior as quantized training --- which is "
+          "the "
+          "whole point of quantized training and of FakeQuant nodes in the "
+          "first "
+          "place. However, that entails subtle requirements on where exactly "
+          "FakeQuant nodes must be placed in the graph. Some quantized graphs "
+          "have FakeQuant nodes at unexpected locations, that prevent graph "
+          "transformations that are necessary in order to generate inference "
+          "code for these graphs. Such graphs should be fixed, but as a "
+          "temporary work-around, setting this reorder_across_fake_quant flag "
+          "allows toco to perform necessary graph transformaitons on them, "
+          "at the cost of no longer faithfully matching inference and training "
+          "arithmetic."),
+      Flag("allow_custom_ops", parsed_flags.allow_custom_ops.bind(),
+           parsed_flags.allow_custom_ops.default_value(),
+           "If true, allow TOCO to create TF Lite Custom operators for all the"
+           "unsupported Tensorflow ops."),
+      Flag(
+          "drop_control_dependency",
+          parsed_flags.drop_control_dependency.bind(),
+          parsed_flags.drop_control_dependency.default_value(),
+          "If true, ignore control dependency requirements in input TensorFlow "
+          "GraphDef. Otherwise an error will be raised upon control dependency "
+          "inputs."),
+  };
+  bool asked_for_help =
+      *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
+  if (asked_for_help) {
+    *msg += tensorflow::Flags::Usage(argv[0], flags);
+    return false;
+  } else {
+    return tensorflow::Flags::Parse(argc, argv, flags);
+  }
+}
+
+void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
+                                       TocoFlags* toco_flags) {
+  namespace port = toco::port;
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet");
+
+  enum class FlagRequirement { kNone, kMustBeSpecified, kMustNotBeSpecified };
+
+#define ENFORCE_FLAG_REQUIREMENT(name, requirement)                          \
+  do {                                                                       \
+    if (requirement == FlagRequirement::kMustBeSpecified) {                  \
+      QCHECK(parsed_toco_flags.name.specified())                             \
+          << "Missing required flag: " << #name;                             \
+    }                                                                        \
+    if (requirement == FlagRequirement::kMustNotBeSpecified) {               \
+      QCHECK(!parsed_toco_flags.name.specified())                            \
+          << "Given other flags, this flag should not have been specified: " \
+          << #name;                                                          \
+    }                                                                        \
+  } while (false)
+#define READ_TOCO_FLAG(name, requirement)                     \
+  ENFORCE_FLAG_REQUIREMENT(name, requirement);                \
+  do {                                                        \
+    if (parsed_toco_flags.name.specified()) {                 \
+      toco_flags->set_##name(parsed_toco_flags.name.value()); \
+    }                                                         \
+  } while (false)
+
+#define PARSE_TOCO_FLAG(Type, name, requirement)               \
+  ENFORCE_FLAG_REQUIREMENT(name, requirement);                 \
+  do {                                                         \
+    if (parsed_toco_flags.name.specified()) {                  \
+      Type x;                                                  \
+      QCHECK(Type##_Parse(parsed_toco_flags.name.value(), &x)) \
+          << "Unrecognized " << #Type << " value "             \
+          << parsed_toco_flags.name.value();                   \
+      toco_flags->set_##name(x);                               \
+    }                                                          \
+  } while (false)
+
+  PARSE_TOCO_FLAG(FileFormat, input_format, FlagRequirement::kMustBeSpecified);
+  PARSE_TOCO_FLAG(FileFormat, output_format, FlagRequirement::kMustBeSpecified);
+  PARSE_TOCO_FLAG(IODataType, inference_type, FlagRequirement::kNone);
+  PARSE_TOCO_FLAG(IODataType, inference_input_type, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_ranges_min, FlagRequirement::kNone);
+  READ_TOCO_FLAG(default_ranges_max, FlagRequirement::kNone);
+  READ_TOCO_FLAG(drop_fake_quant, FlagRequirement::kNone);
+  READ_TOCO_FLAG(reorder_across_fake_quant, FlagRequirement::kNone);
+  READ_TOCO_FLAG(allow_custom_ops, FlagRequirement::kNone);
+  READ_TOCO_FLAG(drop_control_dependency, FlagRequirement::kNone);
+
+  // Deprecated flag handling.
+  if (parsed_toco_flags.input_type.specified()) {
+    LOG(WARNING)
+        << "--input_type is deprecated. It was an ambiguous flag that set both "
+           "--input_data_types and --inference_input_type. If you are trying "
+           "to complement the input file with information about the type of "
+           "input arrays, use --input_data_type. If you are trying to control "
+           "the quantization/dequantization of real-numbers input arrays in "
+           "the output file, use --inference_input_type.";
+    toco::IODataType input_type;
+    QCHECK(toco::IODataType_Parse(parsed_toco_flags.input_type.value(),
+                                  &input_type));
+    toco_flags->set_inference_input_type(input_type);
+  }
+  if (parsed_toco_flags.input_types.specified()) {
+    LOG(WARNING)
+        << "--input_types is deprecated. It was an ambiguous flag that set "
+           "both --input_data_types and --inference_input_type. If you are "
+           "trying to complement the input file with information about the "
+           "type of input arrays, use --input_data_type. If you are trying to "
+           "control the quantization/dequantization of real-numbers input "
+           "arrays in the output file, use --inference_input_type.";
+    std::vector<string> input_types =
+        absl::StrSplit(parsed_toco_flags.input_types.value(), ',');
+    QCHECK(!input_types.empty());
+    for (int i = 1; i < input_types.size(); i++) {
+      QCHECK_EQ(input_types[i], input_types[0]);
+    }
+    toco::IODataType input_type;
+    QCHECK(toco::IODataType_Parse(input_types[0], &input_type));
+    toco_flags->set_inference_input_type(input_type);
+  }
+
+#undef READ_TOCO_FLAG
+#undef PARSE_TOCO_FLAG
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.h b/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba35ca8d5d23f07d843ae6fa2099cc7e15b1e9a3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.h
@@ -0,0 +1,36 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
+
+#include <string>
+#include <vector>
+#include "tensorflow/contrib/lite/toco/args.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/types.pb.h"
+
+namespace toco {
+// Parse and remove arguments handled from toco. Returns true if parsing
+// is successful. msg has the usage string if there was an error or
+// "--help" was specified
+bool ParseTocoFlagsFromCommandLineFlags(int* argc, char* argv[], string* msg,
+                                        ParsedTocoFlags* parsed_toco_flags_ptr);
+// Populate the TocoFlags proto with parsed_toco_flags data.
+void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
+                                       TocoFlags* toco_flags);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_CMDLINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
new file mode 100644
index 0000000000000000000000000000000000000000..3b9d7e22570b66aef2c9fc819e5ab4ec38e179f5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -0,0 +1,139 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+syntax = "proto2";
+import "tensorflow/contrib/lite/toco/types.proto";
+
+package toco;
+
+// Supported I/O file formats. Some formats may be input-only or output-only.
+enum FileFormat {
+  FILE_FORMAT_UNKNOWN = 0;
+
+  // GraphDef, third_party/tensorflow/core/framework/graph.proto
+  TENSORFLOW_GRAPHDEF = 1;
+
+  // Tensorflow's mobile inference model.
+  // third_party/tensorflow/contrib/tflite/schema.fbs
+  TFLITE = 2;
+
+  // GraphViz
+  // Export-only.
+  GRAPHVIZ_DOT = 3;
+}
+
+// TocoFlags encodes extra parameters that drive tooling operations, that
+// are not normally encoded in model files and in general may not be thought
+// of as properties of models, instead describing how models are to be
+// processed in the context of the present tooling job.
+// Next Id: 13
+message TocoFlags {
+  // Input file format
+  optional FileFormat input_format = 1;
+
+  // Output file format
+  optional FileFormat output_format = 2;
+
+  // Similar to inference_type, but allows to control specifically the
+  // quantization of input arrays, separately from other arrays.
+  //
+  // If not set, then the value of inference_type is implicitly used, i.e.
+  // by default input arrays are quantized like other arrays.
+  //
+  // Like inference_type, this only affects real-number arrays. By "real-number"
+  // we mean float arrays, and quantized arrays. This excludes plain
+  // integer arrays, strings arrays, and every other data type.
+  //
+  // The typical use for this flag is for vision models taking a bitmap
+  // as input, typically with uint8 channels, yet still requiring floating-point
+  // inference. For such image models, the uint8 input is quantized, i.e.
+  // the uint8 values are interpreted as real numbers, and the quantization
+  // parameters used for such input arrays are their mean_value, std_value
+  // parameters.
+  optional IODataType inference_input_type = 11;
+
+  // Sets the type of real-number arrays in the output file, that is, controls
+  // the representation (quantization) of real numbers in the output file,
+  // except for input arrays, which are controlled by inference_input_type.
+  //
+  // NOTE: this flag only impacts real-number arrays. By "real-number"
+  // we mean float arrays, and quantized arrays. This excludes plain
+  // integer arrays, strings arrays, and every other data type.
+  //
+  // For real-number arrays, the impact of this flag is to allow the output
+  // file to choose a different real-numbers representation (quantization)
+  // from what the input file used. For any other types of arrays, changing
+  // the data type would not make sense.
+  //
+  // Specifically:
+  //    - If FLOAT, then real-numbers arrays will be of type float in
+  //      the output file. If they were quantized in the input file, then
+  //      they get dequantized.
+  //    - If QUANTIZED_UINT8, then real-numbers arrays will be quantized
+  //      as uint8 in the output file. If they were float in the input file,
+  //      then they get quantized.
+  //    - If not set, then all real-numbers arrays retain the same type in the
+  //      output file as they have in the input file.
+  //
+  optional IODataType inference_type = 4;
+
+  // default_ranges_min and default_ranges_max are helpers to experiment
+  // with quantization of models. Normally, quantization requires the input
+  // model to have (min, max) range information for every activations array.
+  // This is needed in order to know how to quantize arrays and still achieve
+  // satisfactory accuracy. However, in some circumstances one would just like
+  // to estimate the performance of quantized inference, without caring about
+  // accuracy. That is what default_ranges_min and default_ranges_max are for:
+  // when specified, they will be used as default (min, max) range boundaries
+  // for all activation arrays that lack (min, max) range information, thus
+  // allowing for quantization to proceed.
+  //
+  // It should be clear from the above explanation that these parameters are
+  // for experimentation purposes only and should not be used in production:
+  // they make it easy to quantize models, but the resulting quantized model
+  // will be inaccurate.
+  optional float default_ranges_min = 5;
+  optional float default_ranges_max = 6;
+
+  // Ignore and discard FakeQuant nodes. For instance, that can be used to
+  // generate plain float code without fake-quantization from a quantized
+  // graph.
+  optional bool drop_fake_quant = 7;
+
+  // Normally, FakeQuant nodes must be strict boundaries for graph
+  // transformations, in order to ensure that quantized inference has the
+  // exact same arithmetic behavior as quantized training --- which is the
+  // whole point of quantized training and of FakeQuant nodes in the first
+  // place. However, that entails subtle requirements on where exactly
+  // FakeQuant nodes must be placed in the graph. Some quantized graphs
+  // have FakeQuant nodes at unexpected locations, that prevent graph
+  // transformations that are necessary in order to generate inference
+  // code for these graphs. Such graphs should be fixed, but as a
+  // temporary work-around, setting this reorder_across_fake_quant flag
+  // allows toco to perform necessary graph transformaitons on them,
+  // at the cost of no longer faithfully matching inference and training
+  // arithmetic.
+  optional bool reorder_across_fake_quant = 8;
+
+  // If true, allow TOCO to create TF Lite Custom operators for all the
+  // unsupported Tensorflow ops.
+  optional bool allow_custom_ops = 10;
+
+  // Applies only to the case when the input format is TENSORFLOW_GRAPHDEF.
+  // If true, then control dependencies will be immediately dropped during
+  // import.
+  // If not set, the default behavior is as follows:
+  //    - Default to false if the output format is TENSORFLOW_GRAPHDEF.
+  //    - Default to true in all other cases.
+  optional bool drop_control_dependency = 12;
+}
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.cc b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e98e7081de4388e5425f0eea9f6bb5f5cdafcd7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.cc
@@ -0,0 +1,22 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
+
+namespace toco {
+GraphVizDumpOptions* GraphVizDumpOptions::singleton() {
+  static auto* ptr = new GraphVizDumpOptions;
+  return ptr;
+}
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae0541f62b61581e3ba183725a85fe51c54116dc
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
+
+#include <string>
+
+namespace toco {
+
+// Global data for determining whether to output graph viz format from toco.
+struct GraphVizDumpOptions {
+  std::string graphviz_first_array;
+  std::string graphviz_last_array;
+  std::string dump_graphviz;
+  bool dump_graphviz_video = false;
+
+  static GraphVizDumpOptions* singleton();
+};
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_GRAPHVIZ_DUMP_OPTIONS_H_
diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1c8696cd06a30bfe8661bb70aa4f2d6d175aac3
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -0,0 +1,227 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstring>
+
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace port {
+void CopyToBuffer(const string& src, char* dest) {
+  memcpy(dest, src.data(), src.size());
+}
+
+#ifdef PLATFORM_GOOGLE
+void CopyToBuffer(const Cord& src, char* dest) { src.CopyToArray(dest); }
+#endif
+}  // namespace port
+}  // namespace toco
+
+#if defined(PLATFORM_GOOGLE) && !defined(__APPLE__) && !defined(__ANDROID__)
+
+// Wrap Google file operations.
+
+#include "base/init_google.h"
+#include "file/base/file.h"
+#include "file/base/filesystem.h"
+#include "file/base/helpers.h"
+#include "file/base/options.h"
+#include "file/base/path.h"
+
+namespace toco {
+namespace port {
+
+void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
+  ::InitGoogle(usage, argc, argv, remove_flags);
+}
+
+void CheckInitGoogleIsDone(const char* message) {
+  ::CheckInitGoogleIsDone(message);
+}
+
+namespace file {
+
+// Conversion to our wrapper Status.
+Status ToStatus(const ::util::Status& uts) {
+  return Status(uts.ok(), uts.error_message());
+}
+
+// Conversion to our wrapper Options.
+toco::port::file::Options ToOptions(const ::file::Options& options) {
+  CHECK_EQ(&options, &::file::Defaults());
+  return Options();
+}
+
+Status Writable(const string& filename) {
+  File* f = nullptr;
+  const auto status = ::file::Open(filename, "w", &f, ::file::Defaults());
+  if (f) {
+    QCHECK_OK(f->Close(::file::Defaults()));
+  }
+  return ToStatus(status);
+}
+
+Status Readable(const string& filename, const file::Options& options) {
+  return ToStatus(::file::Readable(filename, ::file::Defaults()));
+}
+
+Status Exists(const string& filename, const file::Options& options) {
+  auto status = ::file::Exists(filename, ::file::Defaults());
+  return ToStatus(status);
+}
+
+Status GetContents(const string& filename, string* contents,
+                   const file::Options& options) {
+  return ToStatus(::file::GetContents(filename, contents, ::file::Defaults()));
+}
+
+Status SetContents(const string& filename, const string& contents,
+                   const file::Options& options) {
+  return ToStatus(::file::SetContents(filename, contents, ::file::Defaults()));
+}
+
+string JoinPath(const string& a, const string& b) {
+  return ::file::JoinPath(a, b);
+}
+
+}  // namespace file
+}  // namespace port
+}  // namespace toco
+
+#else  // (__APPLE__ || __ANDROID__)
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <cstdio>
+
+#if defined(PLATFORM_GOOGLE)
+#include "base/commandlineflags.h"
+#endif
+
+namespace toco {
+namespace port {
+
+static bool port_initialized = false;
+
+void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
+  if (!port_initialized) {
+#if defined(PLATFORM_GOOGLE)
+    ParseCommandLineFlags(argc, argv, remove_flags);
+#endif
+    port_initialized = true;
+  }
+}
+
+void CheckInitGoogleIsDone(const char* message) {
+  CHECK(port_initialized) << message;
+}
+
+namespace file {
+
+Status Writable(const string& filename) {
+  FILE* f = fopen(filename.c_str(), "w");
+  if (f) {
+    fclose(f);
+    return Status(true, "");
+  }
+  return Status(false, "not writable");
+}
+
+Status Readable(const string& filename, const file::Options& options) {
+  FILE* f = fopen(filename.c_str(), "r");
+  if (f) {
+    fclose(f);
+    return Status(true, "");
+  }
+  return Status(false, "not readable");
+}
+
+Status Exists(const string& filename, const file::Options& options) {
+  struct stat statbuf;
+  int ret = stat(filename.c_str(), &statbuf);
+  return Status(ret != -1, "");
+}
+
+Status GetContents(const string& path, string* output,
+                   const file::Options& options) {
+  output->clear();
+
+  int fd = open(path.c_str(), O_RDONLY);
+  if (fd == -1) {
+    return Status(false, "can't open() for read");
+  }
+
+  // Direct read, for speed.
+  const int kBufSize = 1 << 16;
+  char buffer[kBufSize];
+  while (true) {
+    int size = read(fd, buffer, kBufSize);
+    if (size == 0) {
+      // Done.
+      close(fd);
+      return Status(true, "");
+    } else if (size == -1) {
+      // Error.
+      close(fd);
+      return Status(false, "error during read()");
+    } else {
+      output->append(buffer, size);
+    }
+  }
+
+  CHECK(0);
+  return Status(false, "internal error");
+}
+
+Status SetContents(const string& filename, const string& contents,
+                   const file::Options& options) {
+  int fd = open(filename.c_str(), O_WRONLY | O_CREAT, 0664);
+  if (fd == -1) {
+    return Status(false, "can't open() for write");
+  }
+
+  size_t i = 0;
+  while (i < contents.size()) {
+    size_t to_write = contents.size() - i;
+    ssize_t written = write(fd, &contents[i], to_write);
+    if (written == -1) {
+      close(fd);
+      return Status(false, "write() error");
+    }
+    i += written;
+  }
+  close(fd);
+
+  return Status(true, "");
+}
+
+string JoinPath(const string& base, const string& filename) {
+  if (base.empty()) return filename;
+  string base_fixed = base;
+  if (!base_fixed.empty() && base_fixed.back() == '/') base_fixed.pop_back();
+  string filename_fixed = filename;
+  if (!filename_fixed.empty() && filename_fixed.front() == '/')
+    filename_fixed.erase(0, 1);
+  return base_fixed + "/" + filename_fixed;
+}
+
+}  // namespace file
+}  // namespace port
+}  // namespace toco
+
+#endif  // (__APPLE || __ANDROID__)
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5cb7a11e7c46d02d398ff937d46e52368e88098
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
+
+// Portability layer for toco tool. Mainly, abstract filesystem access so we
+// can build and use on google internal environments and on OSX.
+
+#include <string>
+#include "tensorflow/contrib/lite/toco/format_port.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/platform.h"
+#if defined(PLATFORM_GOOGLE)
+#include "absl/strings/cord.h"
+#endif  // PLATFORM_GOOGLE
+
+#ifdef PLATFORM_GOOGLE
+#define TFLITE_PROTO_NS proto2
+#else
+#define TFLITE_PROTO_NS google::protobuf
+#endif
+
+namespace toco {
+namespace port {
+
+class Status {
+ public:
+  Status() {}
+
+  Status(bool ok, const string& message) : ok_(ok), message_(message) {}
+
+  bool ok() const { return ok_; }
+
+  const string error_message() const { return message_; }
+
+ private:
+  bool ok_ = false;
+  string message_;
+};
+
+void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags);
+void CheckInitGoogleIsDone(const char* message);
+
+namespace file {
+class Options {};
+inline Options Defaults() {
+  Options o;
+  return o;
+}
+Status GetContents(const string& filename, string* contents,
+                   const Options& options);
+Status SetContents(const string& filename, const string& contents,
+                   const Options& options);
+string JoinPath(const string& base, const string& filename);
+Status Writable(const string& filename);
+Status Readable(const string& filename, const Options& options);
+Status Exists(const string& filename, const Options& options);
+}  // namespace file
+
+// Copy `src` string to `dest`. User must ensure `dest` has enough space.
+#if defined(PLATFORM_GOOGLE)
+void CopyToBuffer(const ::Cord& src, char* dest);
+#endif  // PLATFORM_GOOGLE
+void CopyToBuffer(const string& src, char* dest);
+}  // namespace port
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_PORT_H_
diff --git a/tensorflow/contrib/lite/toco/toco_port_test.cc b/tensorflow/contrib/lite/toco/toco_port_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..650a617aebc053e789f41a56f9bb7fb514740f9a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_port_test.cc
@@ -0,0 +1,58 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/toco_types.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+namespace port {
+namespace {
+
+#ifdef PLATFORM_GOOGLE
+#define TFLITE_PREFIX "third_party/tensorflow/contrib/lite/"
+#else
+#define TFLITE_PREFIX "tensorflow/contrib/lite/"
+#endif
+
+TEST(TocoPortTest, Exists) {
+  EXPECT_TRUE(
+      file::Exists(TFLITE_PREFIX "toco/toco_port_test.cc", file::Defaults())
+          .ok());
+
+  EXPECT_FALSE(
+      file::Exists("non-existent_file_asldjflasdjf", file::Defaults()).ok());
+}
+
+TEST(TocoPortTest, Readable) {
+  EXPECT_TRUE(
+      file::Readable(TFLITE_PREFIX "toco/toco_port_test.cc", file::Defaults())
+          .ok());
+
+  EXPECT_FALSE(
+      file::Readable("non-existent_file_asldjflasdjf", file::Defaults()).ok());
+}
+
+TEST(TocoPortTest, JoinPath) {
+  EXPECT_EQ("part1/part2", file::JoinPath("part1", "part2"));
+  EXPECT_EQ("part1/part2", file::JoinPath("part1/", "part2"));
+  EXPECT_EQ("part1/part2", file::JoinPath("part1", "/part2"));
+  EXPECT_EQ("part1/part2", file::JoinPath("part1/", "/part2"));
+}
+
+}  // namespace
+}  // namespace port
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a93fade6affe6c78922778678f52b11754d6c11d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -0,0 +1,287 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/toco_tooling.h"
+
+#include <cstdlib>
+#include <memory>
+#include <set>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/contrib/lite/toco/allocate_transient_arrays.h"
+#include "tensorflow/contrib/lite/toco/dump_graphviz.h"
+#include "tensorflow/contrib/lite/toco/export_tensorflow.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/import_tensorflow.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tflite/export.h"
+#include "tensorflow/contrib/lite/toco/tflite/import.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+// CHECK-fails if the model contains a kTensorFlowUnsupported operation.
+void CheckUnsupportedOperations(const Model& model) {
+  std::set<string> unsupported_ops;
+  for (auto& op : model.operators) {
+    if (op->type == OperatorType::kTensorFlowUnsupported) {
+      unsupported_ops.insert(
+          static_cast<const TensorFlowUnsupportedOperator*>(op.get())
+              ->tensorflow_op);
+    }
+  }
+  QCHECK(unsupported_ops.empty())
+      << "These unsupported ops were not removed by graph transformations: "
+      << absl::StrJoin(unsupported_ops, ", ");
+}
+
+void MakeGeneralGraphTransformationsSet(
+    GraphTransformationsSet* transformations) {
+  CHECK(transformations->empty());
+  transformations->Add(new ConvertExpandDimsToReshape);
+  transformations->Add(new ResolveReshapeAttributes);
+  transformations->Add(new PropagateArrayDataTypes);
+  transformations->Add(new PropagateFixedSizes);
+  transformations->Add(new RemoveTensorFlowAssert);
+  transformations->Add(new RemoveTensorFlowIdentity);
+  transformations->Add(new RemoveTrivialConcatenation);
+  transformations->Add(new RemoveTrivialConcatenationInput);
+  transformations->Add(new RemoveUnusedOp);
+  transformations->Add(new EnsureBiasVectors);
+  transformations->Add(new ResolveReorderAxes);
+  transformations->Add(new ResolveTensorFlowMatMul);
+  transformations->Add(new FuseBinaryIntoPrecedingAffine);
+  transformations->Add(new FuseBinaryIntoFollowingAffine);
+  transformations->Add(new ResolveBatchNormalization);
+  transformations->Add(new ResolveConstantBinaryOperator);
+  transformations->Add(new ResolveConstantUnaryOperator);
+  transformations->Add(new ResolveTensorFlowMerge);
+  transformations->Add(new ResolveTensorFlowSqueeze);
+  transformations->Add(new ResolveTensorFlowSwitch);
+  transformations->Add(new ResolveTensorFlowTile);
+  transformations->Add(new ResolveTensorFlowConcat);
+  transformations->Add(new IdentifyL2Normalization);
+  transformations->Add(new IdentifyL2Pool);
+  transformations->Add(new IdentifyRelu1);
+  transformations->Add(new RemoveTrivialBinaryOperator);
+  transformations->Add(new ReadFakeQuantMinMax);
+  transformations->Add(new ResolvePadAttributes);
+  transformations->Add(new ResolveStridedSliceAttributes);
+  transformations->Add(new ResolveSliceAttributes);
+  transformations->Add(new ResolveMeanAttributes);
+  transformations->Add(new ResolveConstantTensorFlowShape);
+  transformations->Add(new MakeInitialDequantizeOperator);
+}
+
+bool SupportsQuantization(FileFormat format) {
+  return (format == GRAPHVIZ_DOT || format == TFLITE);
+  ;
+}
+
+bool SupportsFusedActivationFunction(FileFormat format) {
+  return (format == GRAPHVIZ_DOT || format == TFLITE);
+}
+
+bool SupportsLstmCell(FileFormat format) {
+  return (format == TENSORFLOW_GRAPHDEF || format == GRAPHVIZ_DOT);
+}
+
+bool SupportsPreallocatedWorkspace(FileFormat format) {
+  return (format == TFLITE);
+}
+
+bool IsRealValued(toco::ArrayDataType type) {
+  return static_cast<bool>(type == toco::ArrayDataType::kFloat ||
+                           type == toco::ArrayDataType::kUint8);
+}
+
+void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
+  const FileFormat output_format = toco_flags.output_format();
+  ArrayDataType type;
+  if (toco_flags.has_inference_input_type()) {
+    type = ConvertIODataTypeToArrayDataType(toco_flags.inference_input_type());
+  } else if (toco_flags.has_inference_type()) {
+    type = ConvertIODataTypeToArrayDataType(toco_flags.inference_type());
+  } else if (!SupportsQuantization(output_format)) {
+    // Data type is implicitly float for non-quantized formats
+    type = ArrayDataType::kFloat;
+  } else {
+    // Nothing to do. Data types stay as-is.
+    return;
+  }
+
+  for (int i = 0; i < model->flags.input_arrays_size(); i++) {
+    string const& array_name = model->flags.input_arrays(i).name();
+    auto* array = model->arrays[array_name].get();
+    // Note that the notion of changing data types only applies to real-numbers
+    // arrays (see the documentation for inference_input_type).
+    // TODO(benoitjacob) this is assuming that uint8 arrays are quantized,
+    // i.e. represent real numbers by means of quantization parameters,
+    // and not plain integer uint8 input arrays.
+    if (!IsRealValued(array->data_type)) {
+      // Ignore non-real data types.
+      continue;
+    }
+
+    array->final_data_type = type;
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
+                              const ModelFlags& model_flags,
+                              const string& input_file_contents) {
+  std::unique_ptr<Model> model;
+  switch (toco_flags.input_format()) {
+    case TENSORFLOW_GRAPHDEF: {
+      TensorFlowImportFlags tf_import_flags;
+      tf_import_flags.drop_control_dependency =
+          toco_flags.has_drop_control_dependency()
+              ? toco_flags.drop_control_dependency()
+              : (toco_flags.output_format() != TENSORFLOW_GRAPHDEF);
+      model = ImportTensorFlowGraphDef(model_flags, tf_import_flags,
+                                       input_file_contents);
+      break;
+    }
+    case TFLITE:
+      model = toco::tflite::Import(model_flags, input_file_contents);
+      ResolveModelFlags(model_flags, model.get());
+      CheckInvariants(*model);
+      break;
+    default:
+      LOG(FATAL) << "Unhandled input_format";
+  }
+
+  LogDump(kLogLevelModelChanged, "AT IMPORT", *model);
+
+  return model;
+}
+
+void Transform(const TocoFlags& toco_flags, Model* model) {
+  const FileFormat output_format = toco_flags.output_format();
+  const IODataType inference_type = toco_flags.inference_type();
+
+  const bool quantize_output =
+      SupportsQuantization(output_format) && inference_type == QUANTIZED_UINT8;
+
+  if (quantize_output) {
+    QCHECK_NE(toco_flags.inference_input_type(), FLOAT)
+        << "Quantized inference is not allowed with float inputs.";
+  }
+
+  SetFinalDataTypeOnInputs(toco_flags, model);
+
+  GraphTransformationsSet transformations;
+  MakeGeneralGraphTransformationsSet(&transformations);
+  auto* remove_trivial_reshape = new RemoveTrivialReshape;
+  transformations.Add(remove_trivial_reshape);
+  if (SupportsFusedActivationFunction(output_format)) {
+    transformations.Add(new FuseActivationFunctions);
+  } else {
+    transformations.Add(new UnfuseActivationFunctions);
+  }
+  if (output_format != TENSORFLOW_GRAPHDEF) {
+    transformations.Add(new ResolveConstantFakeQuant);
+  }
+  if (toco_flags.drop_fake_quant()) {
+    transformations.Add(new DropFakeQuant);
+  } else {
+    // See the doc for --reorder_across_fake_quant: that flag is needed to
+    // support some existing models, e.g. WordLens, that have FakeQuant
+    // nodes in the wrong places.
+    // TODO(benoitjacob): drop special casing when we can.
+    if ((quantize_output && toco_flags.reorder_across_fake_quant())) {
+      transformations.Add(new DropFakeQuant);
+    }
+  }
+  transformations.Add(new ConvertPureConvToDepthwise);
+  // TFLite export does not yet support fused LSTM cell.
+  if (SupportsLstmCell(output_format)) {
+    transformations.Add(new IdentifyLstmCell);
+  }
+  transformations.Add(new ResolveConstantConcatenation);
+  RunGraphTransformations(model, "general graph transformations",
+                          transformations);
+  if (quantize_output) {
+    RunGraphTransformations(model, "pre-quantization graph transformations",
+                            {new HardcodeMinMax, new DropFakeQuant});
+  }
+
+  if (quantize_output) {
+    if (toco_flags.has_default_ranges_min() &&
+        toco_flags.has_default_ranges_max()) {
+      UseDefaultMinMaxRangeValues(model, toco_flags.default_ranges_min(),
+                                  toco_flags.default_ranges_max());
+    }
+    CheckIsReadyForQuantization(*model);
+    RunGraphTransformations(
+        model, "quantization graph transformations",
+        {new Quantize, new RemoveTrivialQuantizedActivationFunc,
+         new RemoveFinalDequantizeOp});
+  } else {
+    GraphTransformationsSet dequantization_transformations{new Dequantize};
+    // Dequantize creates FakeQuant nodes. We may want to discard
+    // those immediately.
+    if (toco_flags.drop_fake_quant()) {
+      dequantization_transformations.Add(new DropFakeQuant);
+    }
+
+    RunGraphTransformations(model, "dequantization graph transformations",
+                            dequantization_transformations);
+  }
+
+  LogDump(kLogLevelModelChanged, "AFTER TRANSFORMATIONS", *model);
+
+  if (output_format != GRAPHVIZ_DOT && output_format != TFLITE) {
+    // By now there shouldn't be any unsupported ops when exporting to
+    // TensorFlow GraphDef.
+    CheckUnsupportedOperations(*model);
+  }
+
+  if (SupportsPreallocatedWorkspace(output_format)) {
+    AllocateTransientArrays(model, kDefaultTransientDataAlignment);
+    LogDump(kLogLevelModelChanged, "AFTER ALLOCATION", *model);
+  }
+
+  CheckModelCounts(*model);
+  CheckFinalDataTypesSatisfied(*model);
+
+  int64 ops_count;
+  if (EstimateArithmeticOpsCount(*model, &ops_count)) {
+    LOG(INFO) << "Estimated count of arithmetic ops: " << 1e-9 * ops_count
+              << " billion (note that a multiply-add is counted as 2 ops).";
+  }
+}
+
+void Export(const TocoFlags& toco_flags, const Model& model,
+            bool allow_custom_ops, string* output_file_contents) {
+  switch (toco_flags.output_format()) {
+    case TENSORFLOW_GRAPHDEF:
+      ExportTensorFlowGraphDef(model, output_file_contents);
+      break;
+    case TFLITE:
+      toco::tflite::Export(model, allow_custom_ops, output_file_contents);
+      break;
+    case GRAPHVIZ_DOT:
+      DumpGraphviz(model, output_file_contents);
+      break;
+    default:
+      LOG(FATAL) << "Unhandled output_format";
+  }
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.h b/tensorflow/contrib/lite/toco/toco_tooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c5a93a21170ba773b1160eb2e1261f85cdd70e5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_tooling.h
@@ -0,0 +1,50 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+
+namespace toco {
+
+// Imports the input file into a Model object.
+std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
+                              const ModelFlags& model_flags,
+                              const string& input_file_contents);
+
+// Transforms a Model. The resulting Model is ready to be passed
+// to Export with the exact same toco_flags.
+void Transform(const TocoFlags& toco_flags, Model* model);
+
+// Exports the Model, which must be of the 'lowered' form returned by
+// Transform, to a file of the format given by
+// toco_flags.output_format().
+void Export(const TocoFlags& toco_flags, const Model& model,
+            bool allow_custom_ops, string* output_file_contents);
+
+// This if for backward-compatibility with internal tools.
+inline void Export(const TocoFlags& toco_flags, const Model& model,
+                   string* output_file_contents) {
+  Export(toco_flags, model, true, output_file_contents);
+}
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOCO_TOOLING_H_
diff --git a/tensorflow/contrib/lite/toco/toco_types.h b/tensorflow/contrib/lite/toco/toco_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad42497ada6cb0dbda673bf3aad406c9fedfb078
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/toco_types.h
@@ -0,0 +1,45 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
+
+#include <string>
+#include "tensorflow/core/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE) || defined(GOOGLE_INTEGRAL_TYPES)
+#include "tensorflow/core/platform/google/integral_types.h"
+#else
+#include "tensorflow/core/platform/default/integral_types.h"
+#endif
+
+namespace toco {
+#ifdef PLATFORM_GOOGLE
+using ::string;
+#else
+using std::string;
+#endif
+
+using tensorflow::int16;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::int8;
+using tensorflow::uint16;
+using tensorflow::uint32;
+using tensorflow::uint64;
+using tensorflow::uint8;
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TYPES_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21b85c86cc12bed897fead2b4bbd7d1f6879bb1f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -0,0 +1,1591 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+#include <functional>
+#include <iterator>
+#include <set>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_replace.h"
+#include "tensorflow/contrib/lite/toco/dump_graphviz.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+string LogName(const Operator& op) {
+  const string& opname = HelpfulOperatorTypeName(op);
+  if (op.outputs.empty()) {
+    return toco::port::StringF("{%s operator}", opname);
+  } else {
+    return toco::port::StringF("{%s operator with output %s}", opname,
+                               op.outputs[0]);
+  }
+}
+
+bool IsInputArray(const Model& model, const string& name) {
+  for (const auto& input_array : model.flags.input_arrays()) {
+    if (input_array.name() == name) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsArrayConsumed(const Model& model, const string& name) {
+  if (GetOpWithInput(model, name)) {
+    return true;
+  }
+  for (const string& model_output : model.flags.output_arrays()) {
+    if (model_output == name) {
+      return true;
+    }
+  }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    if (rnn_state.back_edge_source_array() == name) {
+      return true;
+    }
+  }
+  return false;
+}
+
+int CountTrueOutputs(const Model& model, const Operator& op) {
+  int count = 0;
+  for (const string& output : op.outputs) {
+    if (IsArrayConsumed(model, output)) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+int CountOpsWithInput(const Model& model, const string& array_name) {
+  int count = 0;
+  for (const auto& op : model.operators) {
+    for (auto& input : op->inputs) {
+      if (input == array_name) {
+        count++;
+      }
+    }
+  }
+  return count;
+}
+
+bool DeleteArrayIfUnused(const string& array_name, Model* model) {
+  if (CountOpsWithInput(*model, array_name) == 0) {
+    model->arrays.erase(array_name);
+    return true;
+  }
+  return false;
+}
+
+std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
+    const Model& model, const string& array_name) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    for (auto& output : it->get()->outputs) {
+      if (output == array_name) {
+        return it;
+      }
+    }
+  }
+  return model.operators.end();
+}
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOpWithOutput(
+    Model& model, const string& array_name) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    for (auto& output : it->get()->outputs) {
+      if (output == array_name) {
+        return it;
+      }
+    }
+  }
+  return model.operators.end();
+}
+
+Operator* GetOpWithOutput(const Model& model, const string& array_name) {
+  auto it = FindOpWithOutput(model, array_name);
+  return it == model.operators.end() ? nullptr : it->get();
+}
+
+// GetFirstOpWithInput assumes that this finds the first op.
+std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
+    const Model& model, const string& array_name) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    for (auto& input : it->get()->inputs) {
+      if (input == array_name) {
+        return it;
+      }
+    }
+  }
+  return model.operators.end();
+}
+
+std::vector<std::unique_ptr<Operator>>::const_iterator FindOp(
+    const Model& model, const Operator* op) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    if (it->get() == op) {
+      return it;
+    }
+  }
+  return model.operators.end();
+}
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOp(Model& model,
+                                                        const Operator* op) {
+  for (auto it = model.operators.begin(); it != model.operators.end(); ++it) {
+    if (it->get() == op) {
+      return it;
+    }
+  }
+  return model.operators.end();
+}
+
+Operator* GetOpWithInput(const Model& model, const string& array_name) {
+  auto it = FindOpWithInput(model, array_name);
+  return it == model.operators.end() ? nullptr : it->get();
+}
+
+Operator* GetFirstOpWithInput(const Model& model, const string& array_name) {
+  auto it = FindOpWithInput(model, array_name);
+  return it == model.operators.end() ? nullptr : it->get();
+}
+
+string FormatArraysList(const Model& model, const std::vector<string>& list) {
+  if (list.empty()) {
+    return "[]";
+  }
+  string result = "";
+  if (list.size() > 1) {
+    result += "[ ";
+  }
+  for (std::size_t i = 0; i < list.size(); i++) {
+    if (i > 0) {
+      result += ", ";
+    }
+    result += list[i];
+  }
+  if (list.size() > 1) {
+    result += " ]";
+  }
+  return result;
+}
+
+const char* OperatorTypeName(OperatorType type) {
+  switch (type) {
+#define HANDLE_OPERATORTYPENAME_CASE(c) \
+  case OperatorType::k##c:              \
+    return #c;
+    HANDLE_OPERATORTYPENAME_CASE(Add)
+    HANDLE_OPERATORTYPENAME_CASE(AveragePool)
+    HANDLE_OPERATORTYPENAME_CASE(BatchNormalization)
+    HANDLE_OPERATORTYPENAME_CASE(Conv)
+    HANDLE_OPERATORTYPENAME_CASE(Concatenation)
+    HANDLE_OPERATORTYPENAME_CASE(DepthwiseConv)
+    HANDLE_OPERATORTYPENAME_CASE(DepthToSpace)
+    HANDLE_OPERATORTYPENAME_CASE(SpaceToDepth)
+    HANDLE_OPERATORTYPENAME_CASE(FullyConnected)
+    HANDLE_OPERATORTYPENAME_CASE(Dequantize)
+    HANDLE_OPERATORTYPENAME_CASE(L2Normalization)
+    HANDLE_OPERATORTYPENAME_CASE(LocalResponseNormalization)
+    HANDLE_OPERATORTYPENAME_CASE(Logistic)
+    HANDLE_OPERATORTYPENAME_CASE(LstmCell)
+    HANDLE_OPERATORTYPENAME_CASE(MaxPool)
+    HANDLE_OPERATORTYPENAME_CASE(L2Pool)
+    HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
+    HANDLE_OPERATORTYPENAME_CASE(Mul)
+    HANDLE_OPERATORTYPENAME_CASE(Relu)
+    HANDLE_OPERATORTYPENAME_CASE(Relu1)
+    HANDLE_OPERATORTYPENAME_CASE(Relu6)
+    HANDLE_OPERATORTYPENAME_CASE(ReorderAxes)
+    HANDLE_OPERATORTYPENAME_CASE(Softmax)
+    HANDLE_OPERATORTYPENAME_CASE(Div)
+    HANDLE_OPERATORTYPENAME_CASE(Tanh)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowAll)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowAssert)
+    HANDLE_OPERATORTYPENAME_CASE(ExpandDims)
+    HANDLE_OPERATORTYPENAME_CASE(Fill)
+    HANDLE_OPERATORTYPENAME_CASE(FloorMod)
+    HANDLE_OPERATORTYPENAME_CASE(FloorDiv)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowGreater)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowGreaterEqual)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowIdentity)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowLess)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowLessEqual)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowMatMul)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowMax)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowMaximum)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowMerge)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowMin)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowMinimum)
+    HANDLE_OPERATORTYPENAME_CASE(Pad)
+    HANDLE_OPERATORTYPENAME_CASE(StridedSlice)
+    HANDLE_OPERATORTYPENAME_CASE(Stack)
+    HANDLE_OPERATORTYPENAME_CASE(Range)
+    HANDLE_OPERATORTYPENAME_CASE(Rank)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowReshape)
+    HANDLE_OPERATORTYPENAME_CASE(Squeeze)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowRsqrt)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowShape)
+    HANDLE_OPERATORTYPENAME_CASE(Slice)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowSplit)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowSqrt)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowSquare)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowSwitch)
+    HANDLE_OPERATORTYPENAME_CASE(Sub)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowSum)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowTile)
+    HANDLE_OPERATORTYPENAME_CASE(Transpose)
+    HANDLE_OPERATORTYPENAME_CASE(TransposeConv)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowConcat)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowConcatV2)
+    HANDLE_OPERATORTYPENAME_CASE(Cast)
+    HANDLE_OPERATORTYPENAME_CASE(Floor)
+    HANDLE_OPERATORTYPENAME_CASE(Gather)
+    HANDLE_OPERATORTYPENAME_CASE(ResizeBilinear)
+    HANDLE_OPERATORTYPENAME_CASE(SpaceToBatchND)
+    HANDLE_OPERATORTYPENAME_CASE(BatchToSpaceND)
+    HANDLE_OPERATORTYPENAME_CASE(Mean)
+    HANDLE_OPERATORTYPENAME_CASE(Svdf)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowUnsupported)
+    default:
+      LOG(FATAL) << "Unhandled op type";
+#undef HANDLE_OPERATORTYPENAME_CASE
+  }
+}
+
+string HelpfulOperatorTypeName(const Operator& op) {
+  if (op.type == OperatorType::kTensorFlowUnsupported) {
+    return toco::port::StringF(
+        "(Unsupported TensorFlow op: %s)",
+        static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op);
+  }
+  return OperatorTypeName(op.type);
+}
+
+void LogSummary(int log_level, const Model& model) {
+  VLOG(log_level) << "Operators summary (" << model.operators.size()
+                  << " operators): ";
+  std::unordered_multiset<OperatorType> ops_by_type;
+  for (const auto& op : model.operators) {
+    ops_by_type.insert(op->type);
+  }
+  auto it = ops_by_type.begin();
+  while (it != ops_by_type.end()) {
+    int count = ops_by_type.count(*it);
+    VLOG(log_level) << "    " << OperatorTypeName(*it) << ": " << count;
+    std::advance(it, count);
+  }
+}
+
+void LogArray(int log_level, const Model& model, const string& name) {
+  const auto& array = model.GetArray(name);
+  VLOG(log_level) << "Array: " << name;
+  switch (array.data_type) {
+    case ArrayDataType::kNone:
+      VLOG(log_level) << "  Data type:";
+      break;
+    case ArrayDataType::kFloat:
+      VLOG(log_level) << "  Data type: kFloat";
+      break;
+    case ArrayDataType::kInt32:
+      VLOG(log_level) << "  Data type: kInt32";
+      break;
+    case ArrayDataType::kUint8:
+      VLOG(log_level) << "  Data type: kUint8";
+      break;
+    default:
+      VLOG(log_level) << "  Data type: other (numerical value: "
+                      << static_cast<int>(array.data_type) << ")";
+      break;
+  }
+  switch (array.final_data_type) {
+    case ArrayDataType::kNone:
+      VLOG(log_level) << "  Final type:";
+      break;
+    case ArrayDataType::kFloat:
+      VLOG(log_level) << "  Final type: kFloat";
+      break;
+    case ArrayDataType::kInt32:
+      VLOG(log_level) << "  Final type: kInt32";
+      break;
+    case ArrayDataType::kUint8:
+      VLOG(log_level) << "  Final type: kUint8";
+      break;
+    default:
+      VLOG(log_level) << "  Final type: other (numerical value: "
+                      << static_cast<int>(array.data_type) << ")";
+      break;
+  }
+  if (array.buffer) {
+    VLOG(log_level) << "  Constant Buffer";
+  }
+  if (array.alloc) {
+    VLOG(log_level) << "  Transient Alloc";
+  }
+  if (array.has_shape()) {
+    const Shape& array_shape = array.shape();
+    if (array_shape.dimensions_count() == 0) {
+      VLOG(log_level) << "  (Zero dimensions)";
+    } else {
+      string message = "  Dims: ";
+      bool first = true;
+      for (const int dim : array_shape.dims()) {
+        if (!first) {
+          message += ", ";
+        }
+        first = false;
+        toco::port::AppendF(&message, "%d", dim);
+      }
+      VLOG(log_level) << message;
+    }
+  }
+  if (array.minmax) {
+    VLOG(log_level) << "  MinMax: " << array.minmax->min << " .. "
+                    << array.minmax->max;
+  }
+  if (array.quantization_params) {
+    VLOG(log_level) << "  QuantizationParams: zero_point="
+                    << array.quantization_params->zero_point
+                    << ", scale=" << array.quantization_params->scale;
+  }
+}
+
+void DumpGraphvizVideoFrame(const Model& model) {
+  namespace port = toco::port;
+
+  const auto& dump_options = *GraphVizDumpOptions::singleton();
+  if (!dump_options.dump_graphviz_video) {
+    return;
+  }
+  CHECK(!dump_options.dump_graphviz.empty());
+  // TODO(benoitjacob): the static data here means that this function
+  // is stateful, not reentrant, and effectively leaks memory till exit
+  // (since dump_hashes can only grow in size). It also means that it
+  // really only is intended to be called for a single model during the
+  // process' lifetime. So it's not great design at all. The overriding
+  // design aspect here is to make the video-dumping code as unintrusive
+  // and self-contained as possible. Eventually, we'll want to have that
+  // cleaned-up, but that will require some form of general statefulness
+  // in toco (some kind of 'tooling state' data structure) that does
+  // not exist at present, and would be premature to design here just for
+  // this new video-dumping feature.
+  static int dump_id = 0;
+  static std::unordered_set<std::size_t> dump_hashes;
+  string graphviz_dump;
+  DumpGraphviz(model, &graphviz_dump);
+  std::size_t hash = std::hash<string>{}(graphviz_dump);
+  if (!dump_hashes.count(hash)) {
+    dump_hashes.insert(hash);
+    CHECK(port::file::SetContents(
+              port::file::JoinPath(
+                  dump_options.dump_graphviz,
+                  toco::port::StringF("toco_video_%05d.dot", dump_id)),
+              graphviz_dump, port::file::Defaults())
+              .ok());
+    dump_id++;
+  }
+}
+
+void LogDump(int log_level, const string& message, const Model& model) {
+  namespace port = toco::port;
+  const auto& dump_options = *GraphVizDumpOptions::singleton();
+
+  DumpGraphvizVideoFrame(model);
+  if (!dump_options.dump_graphviz.empty()) {
+    string graphviz_dump;
+
+    DumpGraphviz(model, &graphviz_dump);
+    CHECK(port::file::SetContents(
+              port::file::JoinPath(
+                  dump_options.dump_graphviz,
+                  absl::StrCat("toco_",
+                               absl::StrReplaceAll(message, {{" ", "_"}}),
+                               ".dot")),
+              graphviz_dump, port::file::Defaults())
+              .ok());
+  }
+
+  if (!VLOG_IS_ON(log_level)) {
+    return;
+  }
+  VLOG(log_level) << "BEGIN DUMP OF TOCO MODEL (" << message << ")";
+  LogSummary(log_level, model);
+  std::unordered_set<string> already_printed_arrays;
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      if (!already_printed_arrays.count(input)) {
+        already_printed_arrays.insert(input);
+        LogArray(log_level, model, input);
+      }
+    }
+    VLOG(log_level) << HelpfulOperatorTypeName(*op) << " : ";
+    VLOG(log_level) << "  " << FormatArraysList(model, op->inputs) << " -> "
+                    << FormatArraysList(model, op->outputs);
+    if (op->fused_activation_function != FusedActivationFunctionType::kNone) {
+      VLOG(log_level) << "    (with fused activation function)";
+    }
+    for (const auto& output : op->outputs) {
+      if (!already_printed_arrays.count(output)) {
+        already_printed_arrays.insert(output);
+        LogArray(log_level, model, output);
+      }
+    }
+  }
+  VLOG(log_level) << "END DUMP OF TOCO MODEL (" << message << ")";
+}
+
+// Note remaining raw-array extension in ProcessTensorFlowReshapeOperator().
+void ExtendShape(Shape* shape, int new_shape_size) {
+  CHECK_GE(new_shape_size, shape->dimensions_count());
+  const int size_increase = new_shape_size - shape->dimensions_count();
+  auto* shape_dims = shape->mutable_dims();
+  shape_dims->insert(shape_dims->begin(), size_increase, 1);
+}
+
+// TODO(b/62904716) Remove along with remaining uses.
+void UnextendShape(Shape* shape, int new_shape_size) {
+  CHECK_LE(new_shape_size, shape->dimensions_count());
+  const int size_reduction = shape->dimensions_count() - new_shape_size;
+  for (int i = 0; i < size_reduction; i++) {
+    CHECK_EQ(shape->dims(i), 1);
+  }
+  std::vector<int>& shape_dims = *shape->mutable_dims();
+  shape_dims.erase(shape_dims.begin(), shape_dims.begin() + size_reduction);
+}
+
+void CheckShapeDimensions(const Shape& shape) {
+  for (int i = 0; i < shape.dimensions_count(); ++i) {
+    CHECK_GE(shape.dims()[i], 1) << "shape has dimension 0 at index << " << i
+                                 << ". shape = " << ShapeToString(shape);
+  }
+}
+
+bool ShapesAgreeUpToBroadcasting(const Shape& shape0, const Shape& shape1) {
+  CheckShapeDimensions(shape0);
+  CheckShapeDimensions(shape1);
+
+  const Shape* longer = &shape0;
+  const Shape* shorter = &shape1;
+  if (shape1.dimensions_count() > shape0.dimensions_count()) {
+    longer = &shape1;
+    shorter = &shape0;
+  }
+
+  // Walk dimensions back to front until we run out of dimensions in the shorter
+  // shape.
+  int longer_index = longer->dimensions_count() - 1;
+  int shorter_index = shorter->dimensions_count() - 1;
+  while (shorter_index >= 0) {
+    const int d_long = longer->dims(longer_index);
+    const int d_short = shorter->dims(shorter_index);
+    // Broadcasting fails if the dimensions are different *and* neither is 1.
+    if ((d_long != d_short) && (d_long != 1) && (d_short != 1)) {
+      return false;
+    }
+    longer_index--;
+    shorter_index--;
+  }
+  return true;
+}
+
+bool ShapesAgreeUpToExtending(const Shape& shape0, const Shape& shape1) {
+  CheckShapeDimensions(shape0);
+  CheckShapeDimensions(shape1);
+
+  const Shape* longer = &shape0;
+  const Shape* shorter = &shape1;
+  if (shape1.dimensions_count() > shape0.dimensions_count()) {
+    longer = &shape1;
+    shorter = &shape0;
+  }
+
+  // Walk dimensions back to front until we run out of dimensions in the shorter
+  // shape.
+  int longer_index = longer->dimensions_count() - 1;
+  int shorter_index = shorter->dimensions_count() - 1;
+  while (shorter_index >= 0) {
+    const int d_long = longer->dims(longer_index);
+    const int d_short = shorter->dims(shorter_index);
+    // Extending fails if the dimensions are different.
+    if (d_long != d_short) {
+      return false;
+    }
+    longer_index--;
+    shorter_index--;
+  }
+
+  // The remaining dimensions in the longer shape must be 1.
+  while (longer_index >= 0) {
+    const int d_long = longer->dims(longer_index);
+    if (d_long != 1) {
+      return false;
+    }
+    longer_index--;
+  }
+
+  return true;
+}
+
+int RequiredBufferSizeForShape(const Shape& shape) {
+  int max_offset = 1;
+  for (const auto& dim : shape.dims()) {
+    CHECK_GE(dim, 1);
+    max_offset *= dim;
+  }
+  return max_offset;
+}
+
+bool IsConstantParameterArray(const Model& model, const string& name) {
+  if (!model.arrays.count(name)) {
+    return false;
+  }
+
+  return !!model.arrays.at(name)->buffer;
+}
+
+void CheckNoMissingArray(const Model& model) {
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      CHECK(model.arrays.count(input));
+    }
+    for (const auto& output : op->outputs) {
+      CHECK(model.arrays.count(output));
+    }
+  }
+  for (const auto& input_array : model.flags.input_arrays()) {
+    CHECK(model.arrays.count(input_array.name()))
+        << "Input array not found: " << input_array.name();
+  }
+  for (const string& output_array : model.flags.output_arrays()) {
+    CHECK(model.arrays.count(output_array))
+        << "Output array not found: " << output_array;
+  }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    if (!rnn_state.discardable()) {
+      CHECK(model.arrays.count(rnn_state.state_array()));
+      CHECK(model.arrays.count(rnn_state.back_edge_source_array()));
+    }
+  }
+}
+
+void FixNoMissingArray(Model* model) {
+  for (const auto& op : model->operators) {
+    for (const auto& input : op->inputs) {
+      if (!model->arrays.count(input)) {
+        model->GetOrCreateArray(input);
+      }
+    }
+    for (const auto& output : op->outputs) {
+      if (!model->arrays.count(output)) {
+        model->GetOrCreateArray(output);
+      }
+    }
+  }
+  for (const string& output_array : model->flags.output_arrays()) {
+    if (!model->arrays.count(output_array)) {
+      model->GetOrCreateArray(output_array);
+    }
+  }
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    model->GetOrCreateArray(rnn_state.state_array());
+    model->GetOrCreateArray(rnn_state.back_edge_source_array());
+  }
+}
+
+void CheckNoOrphanedArray(const Model& model) {
+  std::unordered_set<string> arrays_without_known_use;
+  for (const auto& array : model.arrays) {
+    if (IsDiscardableArray(model, array.first)) {
+      arrays_without_known_use.insert(array.first);
+    }
+  }
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      arrays_without_known_use.erase(input);
+    }
+    for (const auto& output : op->outputs) {
+      arrays_without_known_use.erase(output);
+    }
+  }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    arrays_without_known_use.erase(rnn_state.state_array());
+    arrays_without_known_use.erase(rnn_state.back_edge_source_array());
+  }
+  if (!arrays_without_known_use.empty()) {
+    for (const auto& array : arrays_without_known_use) {
+      LOG(INFO) << "Error: Orphaned array: " << array;
+    }
+  }
+  CHECK(arrays_without_known_use.empty());
+}
+
+void FixNoOrphanedArray(Model* model) {
+  std::unordered_set<string> arrays_without_known_use;
+  for (const auto& array : model->arrays) {
+    arrays_without_known_use.insert(array.first);
+  }
+  for (const auto& op : model->operators) {
+    for (const auto& input : op->inputs) {
+      arrays_without_known_use.erase(input);
+    }
+    for (const auto& output : op->outputs) {
+      arrays_without_known_use.erase(output);
+    }
+  }
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    arrays_without_known_use.erase(rnn_state.state_array());
+    arrays_without_known_use.erase(rnn_state.back_edge_source_array());
+  }
+  for (const auto& array : arrays_without_known_use) {
+    if (IsDiscardableArray(*model, array)) {
+      model->arrays.erase(array);
+    }
+  }
+}
+
+void CheckArrayFieldsConsistent(const Model& model) {
+  for (const auto& array_entry : model.arrays) {
+    const auto& array = array_entry.second;
+    if (array->has_shape()) {
+      for (int d : array->shape().dims()) {
+        CHECK_GE(d, 1);
+      }
+    }
+    // It's OK to have a buffer or an alloc, but not both.
+    // (Since allocs are for transient arrays without a buffer).
+    CHECK(!array->buffer || !array->alloc);
+    // If there is a buffer, its type should be consistent with data_type.
+    if (array->buffer) {
+      CHECK(array->buffer->type == array->data_type);
+    }
+  }
+}
+
+void CheckOperatorOrdering(const Model& model) {
+  std::unordered_set<string> arrays_behind_us;
+  for (const auto& array_entry : model.arrays) {
+    if (!GetOpWithOutput(model, array_entry.first)) {
+      arrays_behind_us.insert(array_entry.first);
+    }
+  }
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      if (!IsConstantParameterArray(model, input)) {
+        CHECK(arrays_behind_us.count(input));
+      }
+    }
+    for (const auto& output : op->outputs) {
+      CHECK(!arrays_behind_us.count(output));
+      arrays_behind_us.insert(output);
+    }
+  }
+  for (const string& output_array : model.flags.output_arrays()) {
+    CHECK(arrays_behind_us.count(output_array));
+  }
+}
+
+void FixOperatorOrdering(Model* model) {
+  std::unordered_set<string> arrays_behind_us;
+  for (const auto& array_entry : model->arrays) {
+    if (!GetOpWithOutput(*model, array_entry.first)) {
+      arrays_behind_us.insert(array_entry.first);
+    }
+  }
+  std::vector<std::unique_ptr<Operator>> old_operators;
+  std::swap(old_operators, model->operators);
+  std::set<std::size_t> remaining;
+  for (std::size_t i = 0; i < old_operators.size(); i++) {
+    remaining.insert(i);
+  }
+  std::unordered_map<string, string> reason_why_leftover;
+  while (true) {
+    bool inserted_something = false;
+    for (auto i : remaining) {
+      bool can_insert = true;
+      auto& op = old_operators[i];
+      CHECK(op.get());
+      for (const auto& input : op->inputs) {
+        if (!IsConstantParameterArray(*model, input) &&
+            !arrays_behind_us.count(input)) {
+          for (const string& output : op->outputs) {
+            reason_why_leftover[output] = input;
+          }
+          can_insert = false;
+          break;
+        }
+      }
+      if (can_insert) {
+        model->operators.emplace_back(nullptr);
+        for (const auto& output : op->outputs) {
+          arrays_behind_us.insert(output);
+        }
+        std::swap(op, model->operators.back());
+        remaining.erase(i);
+        inserted_something = true;
+        break;
+      }
+    }
+    if (!inserted_something) {
+      break;
+    }
+  }
+  if (!remaining.empty()) {
+    LOG(ERROR)
+        << "No viable ordering of operators was found. "
+        << "Here is a 'backtrace' of at least one part of the graph that is "
+        << "problematic. It starts with the first operator that has as "
+        << "problematic input array, and then walks back the graph to "
+        << "the operator that produced that input array, etc., until we find "
+        << "the root cause:";
+    LOG(ERROR) << "BEGIN TRACE OF OPERATOR WITH BAD INPUT";
+    LOG(ERROR) << "Here is the first-encountered operator with a bad input: ";
+    const Operator* bad_op = old_operators[*remaining.begin()].get();
+    std::unordered_set<string> bad_inputs_already_traced;
+    // The following while(true) loop should always end with a LOG(FATAL).
+    while (true) {
+      LOG(ERROR) << HelpfulOperatorTypeName(*bad_op) << " : "
+                 << FormatArraysList(*model, bad_op->inputs) << " -> "
+                 << FormatArraysList(*model, bad_op->outputs);
+      bool found_bad_output = false;
+      string bad_output;
+      for (const string& output : bad_op->outputs) {
+        if (reason_why_leftover.count(output)) {
+          found_bad_output = true;
+          bad_output = output;
+          break;
+        }
+      }
+      CHECK(found_bad_output);
+      const string& bad_input = reason_why_leftover[bad_output];
+      LOG(ERROR) << "The bad input here is: " << bad_input;
+      if (bad_inputs_already_traced.count(bad_input)) {
+        LOG(FATAL)
+            << "Cycle found! We already encountered that "
+            << "input array, " << bad_input << ", earlier in the "
+            << "above trace! We expect graphs to be acyclic, even "
+            << "RNNs. Let us know if some graph actually needs to have "
+            << "cycles, but first, please check if it really is "
+            << "an *inference* graph. *Training* graphs are out-of-scope "
+            << "for toco.";
+      }
+      bad_inputs_already_traced.insert(bad_input);
+      bad_op = nullptr;
+      for (auto i : remaining) {
+        const Operator* op = old_operators[i].get();
+        for (const string& output : op->outputs) {
+          if (bad_input == output) {
+            bad_op = op;
+            break;
+          }
+        }
+        if (bad_op) {
+          break;
+        }
+      }
+      if (!bad_op) {
+        LOG(ERROR) << "And that's the root cause: "
+                   << "that array, " << bad_input << ", isn't produced by any "
+                   << "operator, or provided in any other way.";
+        LOG(ERROR) << "END TRACE OF OPERATOR WITH BAD INPUT";
+        LOG(FATAL) << "(The above was a multi-line fatal error)";
+      }
+      LOG(ERROR) << "And that array is the output of the following operator:";
+    }
+  }
+  CHECK(remaining.empty())
+      << "Should never get here! In case of bad graph, "
+      << "the above code should have generated a FATAL error already!";
+}
+
+void CheckInvariants(const Model& model) {
+  CheckNoMissingArray(model);
+  CheckNoOrphanedArray(model);
+  CheckArrayFieldsConsistent(model);
+  CheckOperatorOrdering(model);
+}
+
+void CheckCountInRange(const ::toco::ModelFlags::ModelCheck& model_check,
+                       const int count, const string& count_description) {
+  if (model_check.count_min() >= 0) {
+    CHECK_GE(count, model_check.count_min())
+        << "Mismatch in " << count_description << ": count  was " << count
+        << ", but the specified "
+        << (model_check.count_max() > model_check.count_min() ? "minimum"
+                                                              : "value")
+        << " was " << model_check.count_min() << ".";
+  }
+  if (model_check.count_max() > model_check.count_min()) {
+    CHECK_LE(count, model_check.count_max())
+        << "Mismatch in " << count_description << ": count  was " << count
+        << ", but the specified maximum was " << model_check.count_max() << ".";
+  }
+}
+
+void CheckModelCounts(const Model& model) {
+  std::unordered_multiset<OperatorType> ops_by_type;
+  std::unordered_map<string, OperatorType> op_type_by_name;
+  if (model.flags.model_checks_size() == 0) {
+    return;
+  }
+
+  for (const auto& op : model.operators) {
+    ops_by_type.insert(op->type);
+    op_type_by_name[OperatorTypeName(op->type)] = op->type;
+  }
+  for (const auto& model_check : model.flags.model_checks()) {
+    string count_type = model_check.count_type();
+    if (count_type == "None") {
+      continue;
+    } else if (count_type == "Arrays") {
+      CheckCountInRange(model_check, model.arrays.size(), "count of arrays");
+    } else if (count_type == "Total") {
+      CheckCountInRange(model_check, model.operators.size(),
+                        "count of all operator instances");
+    } else {
+      // The check type is not itself checked against the set of valid
+      // operators, mainly because the enum set cannot be iterated in C++.
+      const int found_count =
+          op_type_by_name.count(count_type) > 0
+              ? ops_by_type.count(op_type_by_name[count_type])
+              : 0;
+      CheckCountInRange(model_check, found_count,
+                        "count of instances of " + count_type + " operator");
+    }
+  }
+}
+
+void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
+                   std::vector<int>* out_dims) {
+  CHECK(out_dims->empty());
+  if (num_dims == 1) {
+    CHECK_EQ(batch, 1);
+    *out_dims = {depth};
+  } else if (num_dims == 2) {
+    *out_dims = {batch, depth};
+  } else if (num_dims == 3) {
+    CHECK_EQ(batch, 1);
+    *out_dims = {height, width, depth};
+  } else if (num_dims == 4) {
+    *out_dims = {batch, height, width, depth};
+  } else {
+    LOG(FATAL) << "Should not get here: " << num_dims;
+  }
+}
+
+void CreateOrCheckRnnStateArray(const string& name, int size, Model* model) {
+  int batch = 1;
+  int num_dims = -1;
+  for (const auto& input_array : model->flags.input_arrays()) {
+    // Pick 'num_dims' and 'batch' from the first input_arrays, unless we find
+    // a better match by name.
+    if (input_array.name() == name || num_dims == -1) {
+      num_dims = input_array.shape().dims_size();
+      if (num_dims > 0) {
+        batch = input_array.shape().dims(0);
+      }
+    }
+  }
+  Array& array = model->GetOrCreateArray(name);
+  if (array.has_shape()) {
+    num_dims = array.shape().dimensions_count();
+  }
+  std::vector<int> dims;
+  MakeArrayDims(num_dims, batch, 1, 1, size, &dims);
+  CHECK(array.data_type == ArrayDataType::kFloat ||
+        array.data_type == ArrayDataType::kNone);
+  array.data_type = ArrayDataType::kFloat;
+  if (!array.has_shape()) {
+    Shape* shape = array.mutable_shape();
+    *shape->mutable_dims() = dims;
+  }
+}
+
+void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
+  // Merge info about input_arrays from model_flags into model->flags
+  for (const auto& specified_input_array : model_flags.input_arrays()) {
+    toco::InputArray* dst_input_array = nullptr;
+    for (int i = 0; i < model->flags.input_arrays_size(); i++) {
+      toco::InputArray* candidate_dst_input_array =
+          model->flags.mutable_input_arrays(i);
+      if (candidate_dst_input_array->name() == specified_input_array.name()) {
+        // specified_input_array from model_flags maps to dst_input_array
+        // in model->flags
+        dst_input_array = candidate_dst_input_array;
+        break;
+      }
+    }
+    if (!dst_input_array) {
+      // specified_input_array from model_flags is not found in model->flags.
+      // Match a name-less specified input array when there can be no ambiguity
+      // as there is only 1 input array.
+      if (model->flags.input_arrays_size() == 1 &&
+          model_flags.input_arrays_size() == 1 &&
+          !specified_input_array.has_name()) {
+        dst_input_array = model->flags.mutable_input_arrays(0);
+      }
+    }
+    if (!dst_input_array) {
+      // Still no match, so create a new input array to copy
+      // specified_input_array into.
+      dst_input_array = model->flags.add_input_arrays();
+      dst_input_array->set_name(specified_input_array.name());
+    }
+
+#define RESOLVE_MODEL_FLAG(field_name)                                       \
+  if (specified_input_array.has_##field_name()) {                            \
+    if (dst_input_array->has_##field_name()) {                               \
+      QCHECK_EQ(dst_input_array->field_name(),                               \
+                specified_input_array.field_name())                          \
+          << "For input array '" << dst_input_array->name() << "', "         \
+          << "specified " #field_name " flag with value: "                   \
+          << specified_input_array.field_name()                              \
+          << " does not agree with already defined " #field_name             \
+             " of this model, with value: "                                  \
+          << specified_input_array.field_name();                             \
+    } else {                                                                 \
+      dst_input_array->set_##field_name(specified_input_array.field_name()); \
+    }                                                                        \
+  }
+    RESOLVE_MODEL_FLAG(std_value);
+    RESOLVE_MODEL_FLAG(mean_value);
+#undef RESOLVE_MODEL_FLAG
+
+    if (specified_input_array.has_shape()) {
+      if (dst_input_array->has_shape()) {
+        QCHECK_EQ(specified_input_array.shape().dims_size(),
+                  dst_input_array->shape().dims_size())
+            << "For input array '" << specified_input_array.name() << "', "
+            << "size of specified input shape flag with size: "
+            << specified_input_array.shape().dims_size()
+            << " does not agree with already defined input shape"
+               " of this model, with size: "
+            << dst_input_array->shape().dims_size();
+        // We treat the first dimension as a special case, since it is often
+        // a batch size and the input_shape flag is effectively overriding
+        // the model.
+        for (int i = 1; i < specified_input_array.shape().dims_size(); i++) {
+          QCHECK_EQ(specified_input_array.shape().dims(i),
+                    dst_input_array->shape().dims(i))
+              << "At dimension number " << i << " of input array "
+              << specified_input_array.name() << ", the specified shape's "
+              << "dimension flag with dimension: "
+              << specified_input_array.shape().dims(i)
+              << " does not agree with already defined shape"
+              << " of this model, with dimension: "
+              << dst_input_array->shape().dims(i);
+        }
+      } else {
+        *dst_input_array->mutable_shape() = specified_input_array.shape();
+      }
+    }
+
+    if (specified_input_array.has_data_type()) {
+      QCHECK(!dst_input_array->has_data_type());
+      dst_input_array->set_data_type(specified_input_array.data_type());
+    }
+  }
+
+  if (model_flags.output_arrays_size() > 0) {
+    model->flags.mutable_output_arrays()->CopyFrom(model_flags.output_arrays());
+  }
+
+#define RESOLVE_MODEL_FLAG(name)                                           \
+  if (model_flags.has_##name()) {                                          \
+    if (model->flags.has_##name()) {                                       \
+      QCHECK_EQ(model_flags.name(), model->flags.name())                   \
+          << "Specified " #name " flag with value: " << model_flags.name() \
+          << " does not agree with already defined " #name                 \
+             " of this model, with value: "                                \
+          << model->flags.name();                                          \
+    } else {                                                               \
+      model->flags.set_##name(model_flags.name());                         \
+    }                                                                      \
+  }
+
+  RESOLVE_MODEL_FLAG(variable_batch)
+
+#undef RESOLVE_MODEL_FLAG
+
+  if (!model_flags.rnn_states().empty()) {
+    model->flags.mutable_rnn_states()->CopyFrom(model_flags.rnn_states());
+  }
+
+  if (model->flags.model_checks_size() == 0) {
+    model->flags.mutable_model_checks()->CopyFrom(model_flags.model_checks());
+  }
+
+  QCHECK_GT(model->flags.output_arrays_size(), 0)
+      << "This model does not define output arrays, so a "
+         "--output_arrays flag must be given on the command-line.";
+
+  for (const auto& input_array_proto : model->flags.input_arrays()) {
+    auto& input_array = model->GetOrCreateArray(input_array_proto.name());
+    if (input_array_proto.has_data_type()) {
+      const ArrayDataType specified_type =
+          ConvertIODataTypeToArrayDataType(input_array_proto.data_type());
+      QCHECK(specified_type != ArrayDataType::kNone);
+      if (input_array.data_type != ArrayDataType::kNone) {
+        QCHECK(specified_type == input_array.data_type)
+            << "For input array " << input_array_proto.name()
+            << " the specified input data type "
+            << IODataType_Name(input_array_proto.data_type())
+            << " conflicts with the existing type.";
+      }
+      input_array.data_type = specified_type;
+    }
+
+    if (input_array.data_type == ArrayDataType::kNone) {
+      // We start out with a float input array;
+      // that may get replaced by a uint8 array later, by
+      // MakeInitialDequantizeOp.
+      input_array.data_type = ArrayDataType::kFloat;
+    }
+
+    // Compare/merge the model->flags describing the input_shape with
+    // the actual input array's shape.
+    if (!input_array.has_shape()) {
+      if (input_array_proto.has_shape()) {
+        auto& input_array_dims = *input_array.mutable_shape()->mutable_dims();
+        for (auto dim : input_array_proto.shape().dims()) {
+          CHECK_GE(dim, 1);
+          input_array_dims.push_back(dim);
+        }
+      }
+    } else {
+      const auto& input_array_dims =
+          *input_array.mutable_shape()->mutable_dims();
+      CHECK_EQ(input_array_dims.size(), input_array_proto.shape().dims_size());
+      for (int i = 0; i < input_array_dims.size(); i++) {
+        CHECK_EQ(input_array_dims[i], input_array_proto.shape().dims(i));
+      }
+    }
+
+    const float mean_value = input_array_proto.mean_value();
+    const float std_value = input_array_proto.std_value();
+    MinMax input_minmax;
+    input_minmax.min = (0.f - mean_value) / std_value;
+    input_minmax.max = (255.f - mean_value) / std_value;
+    if (input_array.minmax) {
+      if (input_array_proto.has_mean_value() ||
+          input_array_proto.has_std_value()) {
+        CHECK(input_minmax == *input_array.minmax)
+            << input_minmax.min << ", " << input_minmax.max
+            << " != " << input_array.minmax->min << ", "
+            << input_array.minmax->max;
+      }
+    } else {
+      input_array.GetOrCreateMinMax() = input_minmax;
+    }
+  }
+  // Creation of the RNN state arrays
+  for (const auto& rnn_state : model->flags.rnn_states()) {
+    if (!rnn_state.manually_create()) {
+      continue;
+    }
+    CreateOrCheckRnnStateArray(rnn_state.state_array(), rnn_state.size(),
+                               model);
+  }
+
+  for (const auto& input_array : model->flags.input_arrays()) {
+    if (input_array.has_shape()) {
+      CHECK(input_array.shape().dims_size());
+    }
+  }
+}
+
+void CheckIsReadyForQuantization(const Model& model) {
+  for (const auto& op : model.operators) {
+    for (const auto& input : op->inputs) {
+      const auto& input_array = model.GetArray(input);
+      if (input_array.data_type != ArrayDataType::kFloat) {
+        // The array is not floats, no quantization needed.
+        continue;
+      }
+      if (input_array.minmax) {
+        // The array has minmax, we're good.
+        continue;
+      }
+      if (input_array.buffer) {
+        // The array has a constant buffer, so we can
+        // fall back to computing the minmax from actual array entries
+        // (with a WARNING about possible accuracy implications).
+        continue;
+      }
+      LOG(FATAL)
+          << "Array " << input << ", which is an input to the "
+          << HelpfulOperatorTypeName(*op) << " operator producing the output "
+          << "array " << op->outputs[0] << ", is lacking min/max data, "
+          << "which is necessary for quantization. Either target a "
+          << "non-quantized output format, or change the input graph to "
+          << "contain min/max information, or pass --default_ranges_min= and "
+          << "--default_ranges_max= if you do not care about the accuracy of "
+          << "results.";
+    }
+  }
+}
+
+void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
+                                 double default_ranges_max) {
+  for (const auto& op : model->operators) {
+    for (const auto& input : op->inputs) {
+      auto& input_array = model->GetArray(input);
+      if (!input_array.minmax && !input_array.buffer) {
+        auto& minmax = input_array.GetOrCreateMinMax();
+        minmax.min = default_ranges_min;
+        minmax.max = default_ranges_max;
+      }
+    }
+    for (const auto& output : op->outputs) {
+      auto& output_array = model->GetArray(output);
+      if (!output_array.minmax && !output_array.buffer) {
+        auto& minmax = output_array.GetOrCreateMinMax();
+        minmax.min = default_ranges_min;
+        minmax.max = default_ranges_max;
+      }
+    }
+  }
+}
+
+int ElementSize(ArrayDataType data_type) {
+  switch (data_type) {
+    case ArrayDataType::kFloat:
+      return 4;
+    case ArrayDataType::kInt32:
+      return 4;
+    case ArrayDataType::kUint8:
+      return 1;
+    default:
+      LOG(FATAL) << "Should not get here.";
+      return 0;
+  }
+}
+
+void DropMinMax(Model* model, const string& array_name) {
+  auto& array = model->GetArray(array_name);
+  if (!!array.minmax) {
+    LOG(WARNING) << "Dropping MinMax information in array " << array_name
+                 << ". Expect inaccuracy in quantized inference.";
+    array.minmax = nullptr;
+  }
+}
+
+bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
+  // The model's input and output arrays are externally allocated.
+  // They are not transient arrays.
+  if (IsInputArray(model, array_name)) {
+    return false;
+  }
+  for (const string& output_array : model.flags.output_arrays()) {
+    if (array_name == output_array) {
+      return false;
+    }
+  }
+  const auto& array = model.arrays.at(array_name);
+  // An array with a constant buffer isn't a transient array.
+  if (!!array->buffer) {
+    return false;
+  }
+  // An array without shape isn't allocatable.
+  if (!array->has_shape()) {
+    return false;
+  }
+  return true;
+}
+
+string AvailableArrayName(const Model& model, const string& name) {
+  if (!model.arrays.count(name)) {
+    return name;
+  }
+  const int kNumSuffixesToTry = 1000;
+  for (int i = 0; i < kNumSuffixesToTry; i++) {
+    const string& name_with_suffix = toco::port::StringF("%s_%d", name, i);
+    if (!model.arrays.count(name_with_suffix)) {
+      return name_with_suffix;
+    }
+  }
+  LOG(FATAL) << "Could not find an available array name starting with " << name
+             << ". Tried " << kNumSuffixesToTry << " suffixes, all were taken!";
+  return "";
+}
+
+string ShapeToString(const Shape& shape) {
+  if (shape.dimensions_count() == 0) {
+    return "[]";
+  }
+
+  return absl::StrCat("[ ", absl::StrJoin(shape.dims(), ", "), " ]");
+}
+
+void PrintArrayShape(Model* model, const string& name) {
+  if (!model->arrays[name]->has_shape()) {
+    LOG(INFO) << name << " has no shape";
+    return;
+  }
+  LOG(INFO) << name
+            << " has shape: " << ShapeToString(model->arrays[name]->shape());
+}
+
+bool IsArrayFullyConnectedWeights(const Model& model, const string& name) {
+  bool is_fc_weights = false;
+  bool is_something_else = false;
+  for (const auto& op : model.operators) {
+    for (int input_index = 0; input_index < op->inputs.size(); input_index++) {
+      if (op->inputs[input_index] == name) {
+        if (op->type == OperatorType::kFullyConnected && input_index == 1) {
+          is_fc_weights = true;
+        } else {
+          is_something_else = true;
+        }
+      }
+    }
+  }
+  CHECK(!(is_fc_weights && is_something_else));
+  return is_fc_weights;
+}
+
+bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
+  int64 total = 0;
+  for (const auto& op : model.operators) {
+    switch (op->type) {
+      case OperatorType::kFullyConnected:
+      case OperatorType::kConv:
+      case OperatorType::kDepthwiseConv: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        const auto& weights_array = model.GetArray(op->inputs[1]);
+        if (!output_array.has_shape() || !weights_array.has_shape()) {
+          return false;
+        }
+        int cols = 1;
+        for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
+          cols *= output_array.shape().dims(i);
+        }
+        const int64 cost_per_col =
+            2 * RequiredBufferSizeForShape(weights_array.shape());
+        total += cost_per_col * cols;
+        if (op->inputs.size() > 2) {
+          // There is a bias vector. One more op per output value.
+          total += RequiredBufferSizeForShape(output_array.shape());
+        }
+        break;
+      }
+      case OperatorType::kAdd:
+      case OperatorType::kSub:
+      case OperatorType::kMul: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        total += RequiredBufferSizeForShape(output_array.shape());
+        break;
+      }
+      case OperatorType::kLogistic:
+      case OperatorType::kSoftmax:
+      case OperatorType::kTanh: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        // As a very rough ballpark, the cost of evaluating a math function
+        // such as tanh or logistic is about 32 multiplications, and about as
+        // many additions/subtractions. (Just a power-of-two order-of-magnitude
+        // from looking at actual implementations that we use in runtime/ code).
+        total += 64 * RequiredBufferSizeForShape(output_array.shape());
+        break;
+      }
+      case OperatorType::kMaxPool: {
+        const auto& maxpool = *static_cast<const MaxPoolOperator*>(op.get());
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        total += RequiredBufferSizeForShape(output_array.shape()) *
+                 maxpool.kheight * maxpool.kwidth;
+        break;
+      }
+      case OperatorType::kAveragePool: {
+        const auto& avgpool =
+            *static_cast<const AveragePoolOperator*>(op.get());
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        total += RequiredBufferSizeForShape(output_array.shape()) *
+                 avgpool.kheight * avgpool.kwidth;
+        break;
+      }
+      case OperatorType::kL2Pool: {
+        const auto* maxpool = static_cast<const MaxPoolOperator*>(op.get());
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        // The sum of squares requires (kheight*kwidth) multiply-adds,
+        // and then there is the sqrt which we ballpark at 32 ops.
+        const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
+        total +=
+            RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
+        break;
+      }
+      case OperatorType::kL2Normalization: {
+        const auto& output_array = model.GetArray(op->outputs[0]);
+        if (!output_array.has_shape()) {
+          return false;
+        }
+        // Computing the squared L2 norm is N multiply-adds so 2N ops,
+        // then the single inverse-sqrt is negligible, then we multiply each
+        // value by the resulting multiplier, so an extra N ops. Total 3N ops.
+        total += 3 * RequiredBufferSizeForShape(output_array.shape());
+        break;
+      }
+      default:
+        break;
+    }
+  }
+  *result = total;
+  return true;
+}
+
+namespace {
+
+void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
+                     std::vector<int>* shuffle) {
+  CHECK_EQ(AxesCount(input_axes_order), AxesCount(output_axes_order));
+  shuffle->resize(4);
+  for (int i = 0; i < 4; i++) {
+    (*shuffle)[i] = i;
+  }
+  if (input_axes_order == output_axes_order) {
+    // nothing to do
+  } else if (AxesCount(input_axes_order) == 2) {
+    shuffle->resize(2);
+    (*shuffle)[0] = 1;
+    (*shuffle)[1] = 0;
+  } else if (input_axes_order == AxesOrder::kOHWI &&
+             output_axes_order == AxesOrder::kHWIO) {
+    // 3210 <- 3210
+    // HWIO <- OHWI
+    (*shuffle)[0] = 1;
+    (*shuffle)[1] = 2;
+    (*shuffle)[2] = 3;
+    (*shuffle)[3] = 0;
+  } else if (input_axes_order == AxesOrder::kHWIO &&
+             output_axes_order == AxesOrder::kOHWI) {
+    // 3210 <- 3210
+    // OHWI <- HWIO
+    (*shuffle)[0] = 3;
+    (*shuffle)[1] = 0;
+    (*shuffle)[2] = 1;
+    (*shuffle)[3] = 2;
+  } else {
+    LOG(FATAL) << "Bad shuffle";
+  }
+}
+
+// Extend shuffle is designed to match ExtendShape, which pads the shape with
+// unit dimensions at the beginning.
+void ExtendShuffle(const std::vector<int>& input_shuffle, int newdim,
+                   std::vector<int>* extended_shuffle) {
+  *extended_shuffle = input_shuffle;
+  CHECK(newdim >= input_shuffle.size());
+  const int pad_size = newdim - input_shuffle.size();
+  extended_shuffle->resize(newdim);
+  for (int i = 0; i < pad_size; i++) {
+    (*extended_shuffle)[i] = i;
+  }
+  for (int i = pad_size; i < newdim; i++) {
+    (*extended_shuffle)[i] = input_shuffle[i - pad_size] + pad_size;
+  }
+}
+
+}  // end anonymous namespace
+
+void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
+                 AxesOrder output_axes_order, Shape* output_shape) {
+  if (input_axes_order == AxesOrder::kHWIM &&
+      output_axes_order == AxesOrder::k1HWO) {
+    // This special case isn't just a permutation, the IM pair of dims get
+    // merged into the 3 dim, so we have to special-case it.
+    *output_shape = Shape({1, input_shape.dims(0), input_shape.dims(1),
+                           input_shape.dims(3) * input_shape.dims(2)});
+  } else {
+    std::vector<int> shuffle;
+    GetShuffleShape(input_axes_order, output_axes_order, &shuffle);
+    std::vector<int>* output_dims = output_shape->mutable_dims();
+    output_dims->resize(input_shape.dimensions_count());
+    for (int i = 0; i < input_shape.dimensions_count(); i++) {
+      (*output_dims)[i] = input_shape.dims(shuffle[i]);
+    }
+  }
+}
+
+void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
+                  AxesOrder output_axes_order, const Shape& output_shape,
+                  const float* input_data, float* output_data) {
+  if (input_axes_order == AxesOrder::kHWIM &&
+      output_axes_order == AxesOrder::k1HWO) {
+    // This special case isn't just a permutation, the IM pair of dims get
+    // merged into the O dim, so we have to special-case it. Fortunately,
+    // as far as array shuffling is concerned, it's just the identity
+    // transformation.
+    memcpy(output_data, input_data,
+           RequiredBufferSizeForShape(input_shape) * sizeof(output_data[0]));
+    return;
+  }
+  CHECK(input_shape.dimensions_count() == output_shape.dimensions_count());
+  const int dim = input_shape.dimensions_count();
+  CHECK_LE(dim, 4);
+  std::vector<int> shuffle;
+  GetShuffleShape(input_axes_order, output_axes_order, &shuffle);
+  CHECK(shuffle.size() >= dim);
+  for (int i = 0; i < dim; i++) {
+    CHECK(shuffle[i] >= 0 && shuffle[i] < dim);
+    CHECK(input_shape.dims(shuffle[i]) == output_shape.dims(i));
+  }
+  Shape extended_input_shape = input_shape;
+  ExtendShape(&extended_input_shape, 4);
+  Shape extended_output_shape = output_shape;
+  ExtendShape(&extended_output_shape, 4);
+  std::vector<int> extended_shuffle;
+  ExtendShuffle(shuffle, 4, &extended_shuffle);
+
+  const std::vector<int>& extended_input_dims = extended_input_shape.dims();
+  const std::vector<int>& extended_output_dims = extended_output_shape.dims();
+
+  // TODO(starka): Rework to handle different numbers of dimensions.
+  int input_strides[4];
+  input_strides[3] = 1;
+  input_strides[2] = extended_input_dims[3];
+  input_strides[1] = input_strides[2] * extended_input_dims[2];
+  input_strides[0] = input_strides[1] * extended_input_dims[1];
+  const int input_stride_0 = input_strides[extended_shuffle[3]];
+  const int input_stride_1 = input_strides[extended_shuffle[2]];
+  const int input_stride_2 = input_strides[extended_shuffle[1]];
+  const int input_stride_3 = input_strides[extended_shuffle[0]];
+
+  const int output_size_0 = extended_output_dims[3];
+  const int output_size_1 = extended_output_dims[2];
+  const int output_size_2 = extended_output_dims[1];
+  const int output_size_3 = extended_output_dims[0];
+  const int output_stride_0 = 1;
+  const int output_stride_1 = output_size_0;
+  const int output_stride_2 = output_stride_1 * output_size_1;
+  const int output_stride_3 = output_stride_2 * output_size_2;
+
+  for (int i3 = 0; i3 < output_size_3; i3++) {
+    const float* const input_ptr_3 = input_data + i3 * input_stride_3;
+    float* const output_ptr_3 = output_data + i3 * output_stride_3;
+    for (int i2 = 0; i2 < output_size_2; i2++) {
+      const float* const input_ptr_2 = input_ptr_3 + i2 * input_stride_2;
+      float* const output_ptr_2 = output_ptr_3 + i2 * output_stride_2;
+      for (int i1 = 0; i1 < output_size_1; i1++) {
+        const float* input_ptr = input_ptr_2 + i1 * input_stride_1;
+        float* output_ptr = output_ptr_2 + i1 * output_stride_1;
+        float* const output_ptr_end =
+            output_ptr + output_size_0 * output_stride_0;
+        while (output_ptr != output_ptr_end) {
+          *output_ptr = *input_ptr;
+          input_ptr += input_stride_0;
+          output_ptr += output_stride_0;
+        }
+      }
+    }
+  }
+}
+
+int AxesCount(AxesOrder axes_order) {
+  switch (axes_order) {
+    case AxesOrder::kOneAxis:
+      return 1;
+    case AxesOrder::kRC:
+      return 2;
+    case AxesOrder::kCR:
+      return 2;
+    case AxesOrder::kHWIO:
+      return 4;
+    case AxesOrder::kOHWI:
+      return 4;
+    case AxesOrder::kHWIM:
+      return 4;
+    case AxesOrder::k1HWO:
+      return 4;
+    case AxesOrder::kNHWC:
+      return 4;
+    default:
+      LOG(FATAL) << "Bad AxesOrder";
+      return 0;
+  }
+}
+
+bool IsDiscardableArray(const Model& model, const string& array_name) {
+  for (const auto& input_array : model.flags.input_arrays()) {
+    if (array_name == input_array.name()) {
+      return false;
+    }
+  }
+  for (const string& output_array : model.flags.output_arrays()) {
+    if (array_name == output_array) {
+      return false;
+    }
+  }
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    if (!rnn_state.discardable()) {
+      if (array_name == rnn_state.state_array()) {
+        return false;
+      }
+      if (array_name == rnn_state.back_edge_source_array()) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void CheckFinalDataTypesSatisfied(const Model& model) {
+  for (const auto& array_entry : model.arrays) {
+    const auto& array = *array_entry.second;
+    if (array.final_data_type != ArrayDataType::kNone) {
+      CHECK(array.final_data_type == array.data_type)
+          << "Array \"" << array_entry.first
+          << "\" has mis-matching actual and final data types ("
+          << static_cast<int>(array.data_type) << ","
+          << static_cast<int>(array.final_data_type) << ").";
+    }
+  }
+}
+
+ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
+  switch (type) {
+    case FLOAT:
+      return ArrayDataType::kFloat;
+    case QUANTIZED_UINT8:
+      return ArrayDataType::kUint8;
+    case INT32:
+      return ArrayDataType::kInt32;
+    case INT64:
+      return ArrayDataType::kInt64;
+    default:
+      return ArrayDataType::kNone;
+  }
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d820d619d0de425407e88076082a3e0f8d4783a9
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -0,0 +1,295 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "google/protobuf/text_format.h"
+#include "tensorflow/core/platform/logging.h"
+#if TOCO_SUPPORT_PORTABLE_PROTOS
+#include "third_party/protobuf/src/google/protobuf/text_format.h"
+#endif  // TOCO_SUPPORT_PORTABLE_PROTOS
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/model_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_port.h"
+#include "tensorflow/contrib/lite/toco/types.pb.h"
+
+// TODO(aselle): Replace with using a container specific hash override instead.
+namespace std {
+template <>
+struct hash<toco::OperatorType> {
+  size_t operator()(const toco::OperatorType& op) const {
+    return std::hash<size_t>()(static_cast<size_t>(op));
+  }
+};
+}  // namespace std
+
+namespace toco {
+
+constexpr int kLogLevelModelChanged = 1;
+constexpr int kLogLevelModelUnchanged = 2;
+
+string LogName(const Operator& op);
+
+bool IsInputArray(const Model& model, const string& name);
+bool IsArrayConsumed(const Model& model, const string& name);
+int CountTrueOutputs(const Model& model, const Operator& op);
+
+int CountOpsWithInput(const Model& model, const string& array_name);
+bool DeleteArrayIfUnused(const string& array_name, Model* model);
+
+std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
+    const Model& model, const string& array_name);
+Operator* GetOpWithOutput(const Model& model, const string& array_name);
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOpWithOutput(
+    Model& model, const string& array_name);
+Operator* GetOpWithOutput(const Model& model, const string& array_name);
+
+std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithInput(
+    const Model& model, const string& array_name);
+Operator* GetOpWithInput(const Model& model, const string& array_name);
+Operator* GetFirstOpWithInput(const Model& model, const string& array_name);
+
+std::vector<std::unique_ptr<Operator>>::const_iterator FindOp(
+    const Model& model, const Operator* op);
+std::vector<std::unique_ptr<Operator>>::iterator FindOp(Model& model,
+                                                        const Operator* op);
+
+const char* OperatorTypeName(OperatorType type);
+string HelpfulOperatorTypeName(const Operator& op);
+
+void DumpGraphvizVideoFrame(const Model& model);
+void LogDump(int log_level, const string& message, const Model& model);
+void LogSummary(int log_level, const string& message, const Model& model);
+
+inline bool ParseFromStringOverload(const std::string& in,
+                                    TFLITE_PROTO_NS::Message* proto) {
+  return TFLITE_PROTO_NS::TextFormat::ParseFromString(in, proto);
+}
+
+template <typename Proto>
+bool ParseFromStringEitherTextOrBinary(const std::string& input_file_contents,
+                                       Proto* proto) {
+  if (proto->ParseFromString(input_file_contents)) {
+    return true;
+  }
+
+  if (ParseFromStringOverload(input_file_contents, proto)) {
+    return true;
+  }
+
+  return false;
+}
+
+// TODO(b/36075966): Clean up when dims superseded by array shape.
+void ExtendShape(Shape* shape, int new_shape_size);
+
+// TODO(b/36075966): Clean up when dims superseded by array shape.
+void UnextendShape(Shape* shape, int new_shape_size);
+
+// Checks (using CHECK) that all dimensions of 'shape' are at least 1.
+void CheckShapeDimensions(const Shape& shape);
+
+// Given two shapes with potentially different dimensionality and dimension
+// arrays d0 and d1. Without loss of generality, assume that shape0 may have
+// higher dimensionality (length(d0) >= length(d1)). Then shape0 and shape1
+// "agree up to broadcasting" if:
+// - When walking the d0 and d1 from back to front with indices i0, i1,
+//   d0[i0] == d1[i1] or d0[i0] == 1 or d1[i1] == 1, for each dimension until
+//   i1 == 0 (inclusive).
+bool ShapesAgreeUpToBroadcasting(const Shape& shape0, const Shape& shape1);
+
+// A stricter constraint than ShapesAgreeUpToBroadcasting().
+//
+// Given two shapes with potentially different dimensionality and dimension
+// arrays d0 and d1. Without loss of generality, assume that shape0 may have
+// higher dimensionality (length(d0) >= length(d1)). Then shape0 and shape1
+// "agree up to extending" if:
+// - When walking the d0 and d1 from back to front with indices i0, i1,
+//   d0[i0] == d1[i1] for each dimension until i1 == 0 (inclusive).
+// - For the remaining indices [0..i0), d0[i0] == 1.
+bool ShapesAgreeUpToExtending(const Shape& shape0, const Shape& shape1);
+
+bool IsArrayFullyConnectedWeights(const Model& model, const string& name);
+
+// If there is a wildcard dimension (-1), this may return a negative value.
+int RequiredBufferSizeForShape(const Shape& shape);
+
+bool IsConstantParameterArray(const Model& model, const string& name);
+
+void CheckNoMissingArray(const Model& model);
+void CheckInvariants(const Model& model);
+
+void CheckModelCounts(const Model& model);
+
+void FixOperatorOrdering(Model* model);
+void FixNoMissingArray(Model* model);
+void FixNoOrphanedArray(Model* model);
+
+void ResolveModelFlags(const ModelFlags& model_flags, Model* model);
+
+template <ArrayDataType A>
+void GetQuantizationParamsFromMinMax(const ModelFlags& model_flags,
+                                     const MinMax& minmax,
+                                     QuantizationParams* quantization_params) {
+  using Integer = DataType<A>;
+  const Integer qmin = std::numeric_limits<Integer>::min();
+  const Integer qmax = std::numeric_limits<Integer>::max();
+  const double qmin_double = qmin;
+  const double qmax_double = qmax;
+  const double rmin = minmax.min;
+  const double rmax = minmax.max;
+  // 0 should always be a representable value. Let's assume that the initial
+  // min,max range contains 0.
+  CHECK_LE(rmin, 0.);
+  CHECK_GE(rmax, 0.);
+  if (rmin == rmax) {
+    // Special case where the min,max range is a point. Should be {0}.
+    CHECK_EQ(rmin, 0.);
+    CHECK_EQ(rmax, 0.);
+    quantization_params->zero_point = 0;
+    quantization_params->scale = 0.;
+    return;
+  }
+
+  // General case.
+  //
+  // First determine the scale.
+  const double scale = (rmax - rmin) / (qmax_double - qmin_double);
+
+  // Zero-point computation.
+  // First the initial floating-point computation. The zero-point can be
+  // determined from solving an affine equation for any known pair
+  // (real value, corresponding quantized value).
+  // We know two such pairs: (rmin, qmin) and (rmax, qmax).
+  // The arithmetic error on the zero point computed from either pair
+  // will be roughly machine_epsilon * (sum of absolute values of terms)
+  // so we want to use the variant that adds the smaller terms.
+  const double zero_point_from_min = qmin_double - rmin / scale;
+  const double zero_point_from_max = qmax_double - rmax / scale;
+  const double zero_point_from_min_error =
+      std::abs(qmin_double) + std::abs(rmin / scale);
+  const double zero_point_from_max_error =
+      std::abs(qmax_double) + std::abs(rmax / scale);
+
+  const double zero_point_double =
+      zero_point_from_min_error < zero_point_from_max_error
+          ? zero_point_from_min
+          : zero_point_from_max;
+
+  // Now we need to nudge the zero point to be an integer
+  // (our zero points are integer, and this is motivated by the requirement
+  // to be able to represent the real value "0" exactly as a quantized value,
+  // which is required in multiple places, for example in Im2col with SAME
+  // padding).
+  Integer nudged_zero_point = 0;
+  if (zero_point_double < qmin_double) {
+    nudged_zero_point = qmin;
+  } else if (zero_point_double > qmax_double) {
+    nudged_zero_point = qmax;
+  } else {
+    nudged_zero_point = static_cast<Integer>(std::round(zero_point_double));
+  }
+  // The zero point should always be in the range of quantized value,
+  // [qmin, qmax].
+  CHECK_GE(nudged_zero_point, qmin);
+  CHECK_LE(nudged_zero_point, qmax);
+
+  // Finally, store the result nudged quantization params.
+  quantization_params->zero_point = nudged_zero_point;
+  quantization_params->scale = scale;
+}
+
+void CheckIsReadyForQuantization(const Model& model);
+void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
+                                 double default_ranges_max);
+
+inline int Offset(const Shape& shape, const std::vector<int>& indices) {
+  DCHECK_EQ(shape.dimensions_count(), indices.size());
+  const int dims_count = shape.dimensions_count();
+  int offset = 0;
+  for (int i = 0; i < dims_count; i++) {
+    const int index = indices[i];
+    DCHECK(index >= 0 && index < shape.dims(i));
+    offset *= shape.dims(i);
+    offset += index;
+  }
+  return offset;
+}
+
+inline std::vector<int> ReverseOffset(const Shape& shape, int index) {
+  DCHECK_GE(index, 0);
+  DCHECK_LT(index, RequiredBufferSizeForShape(shape));
+  const int dims_count = shape.dimensions_count();
+  std::vector<int> indices(dims_count);
+  int residual = index;
+  for (int i = dims_count - 1; i >= 0; i--) {
+    indices[i] = residual % shape.dims(i);
+    residual /= shape.dims(i);
+  }
+  return indices;
+}
+
+int ElementSize(ArrayDataType data_type);
+
+void DropMinMax(Model* model, const string& array_name);
+
+bool IsAllocatableTransientArray(const Model& model, const string& array_name);
+
+void CreateOrCheckRnnStateArray(const string& name, int size, Model* model);
+
+string AvailableArrayName(const Model& model, const string& name);
+
+// Formats a shape as a string: [ dims(0), dims(1), ..., dims(num_dims-1) ].
+string ShapeToString(const Shape& shape);
+
+void PrintArrayShape(Model* model, const string& name);
+
+void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
+                   std::vector<int>* out_dims);
+
+bool EstimateArithmeticOpsCount(const Model& model, int64* result);
+
+int AxesCount(AxesOrder axes_order);
+
+void ShuffleDims(const Shape& input_shape, AxesOrder input_axes_order,
+                 AxesOrder output_axes_order, Shape* output_shape);
+void ShuffleArray(const Shape& input_shape, AxesOrder input_axes_order,
+                  AxesOrder output_axes_order, const Shape& output_shape,
+                  const float* input_data, float* output_data);
+
+// Returns true if it may be OK for any graph transformation to ever discard
+// that array. The idea is that we can't ever discard arrays that are either
+// an input or an output of the whole graph, or that appear in RNN back-edges,
+// as that would undercut explicit flags that the user might pass.
+bool IsDiscardableArray(const Model& model, const string& array_name);
+
+void CheckFinalDataTypesSatisfied(const Model& model);
+
+ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
+
+}  // namespace toco
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOCO_TOOLING_UTIL_H_
diff --git a/tensorflow/contrib/lite/toco/tooling_util_test.cc b/tensorflow/contrib/lite/toco/tooling_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22955ce95661a9ec2bb7da16a371abd35f713f85
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/tooling_util_test.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <tuple>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+enum class Agreement { kBroadcast, kExtend, kBroadcastNotExtend, kNeither };
+
+// A pair of Shapes and whether they should agree up to broadcasting, extending
+// or neither.
+struct ShapePair {
+  Shape left;
+  Shape right;
+  Agreement agreement;
+};
+
+std::vector<ShapePair> CreateShapePairs() {
+  return std::vector<ShapePair>(
+      {// These agree up to broadcast.
+       {Shape({3}), Shape({3}), Agreement::kBroadcast},
+       {Shape({256, 256, 3}), Shape({256, 256, 3}), Agreement::kBroadcast},
+       {Shape({256, 256, 3}), Shape({3}), Agreement::kBroadcast},
+       {Shape({8, 1, 6, 1}), Shape({7, 1, 5}), Agreement::kBroadcast},
+
+       // These extend (and therefore broadcast).
+       {Shape({3}), Shape({3}), Agreement::kExtend},
+       {Shape({256, 256, 3}), Shape({256, 256, 3}), Agreement::kExtend},
+       {Shape({1, 1, 3}), Shape({1, 1, 3}), Agreement::kExtend},
+       {Shape({1, 1, 3}), Shape({3}), Agreement::kExtend},
+       {Shape({1, 1, 3}), Shape({1, 3}), Agreement::kExtend},
+
+       // These strictly broadcast and do not extend.
+       {Shape({256, 256, 3}), Shape({3}), Agreement::kBroadcastNotExtend},
+       {Shape({5, 4}), Shape({1}), Agreement::kBroadcastNotExtend},
+       {Shape({5, 4}), Shape({4}), Agreement::kBroadcastNotExtend},
+       {Shape({15, 3, 5}), Shape({15, 1, 5}), Agreement::kBroadcastNotExtend},
+       {Shape({15, 3, 5}), Shape({3, 5}), Agreement::kBroadcastNotExtend},
+       {Shape({15, 3, 5}), Shape({3, 1}), Agreement::kBroadcastNotExtend},
+
+       // These do not broadcast (and therefore also do not extend).
+       {Shape({3}), Shape({4}), Agreement::kNeither},
+       {Shape({2, 1}), Shape({8, 4, 3}), Agreement::kNeither}});
+}
+
+// ShapeTest is an empty parameterized test fixture since there is no state.
+class ShapeTest : public ::testing::TestWithParam<ShapePair> {};
+
+TEST_P(ShapeTest, Agrees) {
+  const ShapePair& param = GetParam();
+
+  switch (param.agreement) {
+    case Agreement::kBroadcast: {
+      EXPECT_TRUE(ShapesAgreeUpToBroadcasting(param.left, param.right));
+      break;
+    }
+    case Agreement::kExtend: {
+      EXPECT_TRUE(ShapesAgreeUpToExtending(param.left, param.right));
+      // Anything that extends should also broadcast.
+      EXPECT_TRUE(ShapesAgreeUpToBroadcasting(param.left, param.right));
+      break;
+    }
+    case Agreement::kBroadcastNotExtend: {
+      // Verify that it strictly broadcasts but does not extend.
+      EXPECT_TRUE(ShapesAgreeUpToBroadcasting(param.left, param.right));
+      EXPECT_FALSE(ShapesAgreeUpToExtending(param.left, param.right));
+      break;
+    }
+    case Agreement::kNeither: {
+      EXPECT_FALSE(ShapesAgreeUpToExtending(param.left, param.right));
+      EXPECT_FALSE(ShapesAgreeUpToBroadcasting(param.left, param.right));
+      break;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(AgreeBroadcast, ShapeTest,
+                        ::testing::ValuesIn(CreateShapePairs()));
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/types.proto b/tensorflow/contrib/lite/toco/types.proto
new file mode 100644
index 0000000000000000000000000000000000000000..318fd4b7b2c2df093562e73c3fe707675ee98876
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/types.proto
@@ -0,0 +1,37 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+syntax = "proto2";
+
+package toco;
+
+// IODataType describes the numeric data types of input and output arrays
+// of a model.
+enum IODataType {
+  IO_DATA_TYPE_UNKNOWN = 0;
+
+  // Float32, not quantized
+  FLOAT = 1;
+
+  // Uint8, quantized
+  QUANTIZED_UINT8 = 2;
+
+  // Int32, not quantized
+  INT32 = 3;
+
+  // Int64, not quantized
+  INT64 = 4;
+
+  // String, not quantized
+  STRING = 5;
+}
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..751682215bce37a8e4b8befe70b5288617053b54
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -0,0 +1,64 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+
+tf_cc_binary(
+    name = "generate_op_registrations",
+    srcs = ["gen_op_registration_main.cc"],
+    deps = [
+        "//tensorflow/contrib/lite/tools:gen_op_registration",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "gen_op_registration",
+    srcs = ["gen_op_registration.cc"],
+    hdrs = ["gen_op_registration.h"],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
+
+cc_test(
+    name = "gen_op_registration_test",
+    srcs = ["gen_op_registration_test.cc"],
+    data = [
+        "//tensorflow/contrib/lite:testdata/0_subgraphs.bin",
+        "//tensorflow/contrib/lite:testdata/2_subgraphs.bin",
+        "//tensorflow/contrib/lite:testdata/empty_model.bin",
+        "//tensorflow/contrib/lite:testdata/test_model.bin",
+        "//tensorflow/contrib/lite:testdata/test_model_broken.bin",
+    ],
+    deps = [
+        ":gen_op_registration",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "mutable_op_resolver",
+    srcs = ["mutable_op_resolver.cc"],
+    hdrs = ["mutable_op_resolver.h"],
+    deps = ["//tensorflow/contrib/lite:framework"],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6ae3ab57294a92162b15f326630ac202a9ba2a82
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark_model.cc
@@ -0,0 +1,96 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdarg>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
+#endif
+
+#define LOG(x) std::cerr
+
+#define CHECK(x)                  \
+  if (!(x)) {                     \
+    LOG(ERROR) << #x << "failed"; \
+    exit(1);                      \
+  }
+
+namespace tensorflow {
+namespace benchmark_tflite_model {
+
+std::unique_ptr<tflite::FlatBufferModel> model;
+std::unique_ptr<tflite::Interpreter> interpreter;
+
+void InitImpl(const std::string& graph, const std::vector<int>& sizes,
+              const std::string& input_layer_type, int num_threads) {
+  CHECK(graph.c_str());
+
+  model = tflite::FlatBufferModel::BuildFromFile(graph.c_str());
+  if (!model) {
+    LOG(FATAL) << "Failed to mmap model " << graph;
+  }
+  LOG(INFO) << "Loaded model " << graph;
+  model->error_reporter();
+  LOG(INFO) << "resolved reporter";
+
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+  tflite::MutableOpResolver resolver;
+  RegisterSelectedOps(&resolver);
+#else
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+#endif
+
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (!interpreter) {
+    LOG(FATAL) << "Failed to construct interpreter";
+  }
+
+  if (num_threads != -1) {
+    interpreter->SetNumThreads(num_threads);
+  }
+
+  int input = interpreter->inputs()[0];
+
+  if (input_layer_type != "string") {
+    interpreter->ResizeInputTensor(input, sizes);
+  }
+
+  if (interpreter->AllocateTensors() != kTfLiteOk) {
+    LOG(FATAL) << "Failed to allocate tensors!";
+  }
+}
+
+int Main(int argc, char** argv) {
+  InitImpl("", {}, "", 1);
+  return 0;
+}
+
+}  // namespace benchmark_tflite_model
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  return tensorflow::benchmark_tflite_model::Main(argc, argv);
+}
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration.cc b/tensorflow/contrib/lite/tools/gen_op_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d80ea59170b4edc67ca45a4410890f60cf5259e7
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/gen_op_registration.cc
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include "re2/re2.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/tools/gen_op_registration.h"
+
+namespace tflite {
+
+string NormalizeCustomOpName(const string& op) {
+  string method(op);
+  RE2::GlobalReplace(&method, "([a-z])([A-Z])", "\\1_\\2");
+  std::transform(method.begin(), method.end(), method.begin(), ::toupper);
+  return method;
+}
+
+void ReadOpsFromModel(const ::tflite::Model* model,
+                      std::vector<string>* builtin_ops,
+                      std::vector<string>* custom_ops) {
+  if (!model) return;
+  auto opcodes = model->operator_codes();
+  if (!opcodes) return;
+  for (const auto* opcode : *opcodes) {
+    if (opcode->builtin_code() != ::tflite::BuiltinOperator_CUSTOM) {
+      builtin_ops->push_back(
+          tflite::EnumNameBuiltinOperator(opcode->builtin_code()));
+    } else {
+      custom_ops->push_back(opcode->custom_code()->c_str());
+    }
+  }
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration.h b/tensorflow/contrib/lite/tools/gen_op_registration.h
new file mode 100644
index 0000000000000000000000000000000000000000..318859e23d7b404c130f003b0e249893f2ed92fe
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/gen_op_registration.h
@@ -0,0 +1,39 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
+
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/string.h"
+
+namespace tflite {
+
+// Convert the custom op name to registration name following the convention.
+// Example:
+//   "custom_op" -> "CUSTOM_OP"
+//   "CustomOp" -> "CUSTOM_OP"
+// Note "Register_" suffix will be added later in the tool.
+string NormalizeCustomOpName(const string& op);
+
+// Read ops from the TFLite model.
+// Enum name of builtin ops will be stored, such as "CONV_2D".
+// Custom op name will be stored as it is.
+void ReadOpsFromModel(const ::tflite::Model* model,
+                      std::vector<string>* builtin_ops,
+                      std::vector<string>* custom_ops);
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_GEN_OP_REGISTRATION_H_
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_main.cc b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17b514c9169817479e18eecf5799ea4371f3b051
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/gen_op_registration_main.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cassert>
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "absl/strings/strip.h"
+#include "tensorflow/contrib/lite/tools/gen_op_registration.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+const char kInputModelFlag[] = "input_model";
+const char kOutputRegistrationFlag[] = "output_registration";
+const char kTfLitePathFlag[] = "tflite_path";
+
+using tensorflow::Flag;
+using tensorflow::Flags;
+using tensorflow::string;
+
+void ParseFlagAndInit(int argc, char** argv, string* input_model,
+                      string* output_registration, string* tflite_path) {
+  std::vector<tensorflow::Flag> flag_list = {
+      Flag(kInputModelFlag, input_model, "path to the tflite model"),
+      Flag(kOutputRegistrationFlag, output_registration,
+           "filename for generated registration code"),
+      Flag(kTfLitePathFlag, tflite_path, "Path to tensorflow lite dir"),
+  };
+
+  Flags::Parse(&argc, argv, flag_list);
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+}
+
+namespace {
+
+void GenerateFileContent(const std::string& tflite_path,
+                         const std::string& filename,
+                         const std::vector<string>& builtin_ops,
+                         const std::vector<string>& custom_ops) {
+  std::ofstream fout(filename);
+
+  fout << "#include \"" << tflite_path << "/model.h\"\n";
+  fout << "#include \"" << tflite_path << "/tools/mutable_op_resolver.h\"\n";
+
+  fout << "namespace tflite {\n";
+  fout << "namespace ops {\n";
+  if (!builtin_ops.empty()) {
+    fout << "namespace builtin {\n";
+    fout << "// Forward-declarations for the builtin ops.\n";
+    for (const auto& op : builtin_ops) {
+      fout << "TfLiteRegistration* Register_" << op << "();\n";
+    }
+    fout << "}  // namespace builtin\n";
+  }
+
+  if (!custom_ops.empty()) {
+    fout << "namespace custom {\n";
+    fout << "// Forward-declarations for the custom ops.\n";
+    for (const auto& op : custom_ops) {
+      fout << "TfLiteRegistration* Register_"
+           << ::tflite::NormalizeCustomOpName(op) << "();\n";
+    }
+    fout << "}  // namespace custom\n";
+  }
+  fout << "}  // namespace ops\n";
+  fout << "}  // namespace tflite\n";
+
+  fout << "void RegisterSelectedOps(::tflite::MutableOpResolver* resolver) {\n";
+  for (const auto& op : builtin_ops) {
+    fout << "  resolver->AddBuiltin(::tflite::BuiltinOperator_" << op
+         << ", ::tflite::ops::builtin::Register_" << op << "());\n";
+  }
+  for (const auto& op : custom_ops) {
+    fout << "  resolver->AddCustom(\"" << op
+         << "\", ::tflite::ops::custom::Register_"
+         << ::tflite::NormalizeCustomOpName(op) << "());\n";
+  }
+  fout << "}\n";
+  fout.close();
+}
+}  // namespace
+
+int main(int argc, char** argv) {
+  string input_model;
+  string output_registration;
+  string tflite_path;
+  ParseFlagAndInit(argc, argv, &input_model, &output_registration,
+                   &tflite_path);
+
+  std::vector<string> builtin_ops;
+  std::vector<string> custom_ops;
+  std::ifstream fin(input_model);
+  std::stringstream content;
+  content << fin.rdbuf();
+  // Need to store content data first, otherwise, it won't work in bazel.
+  string content_str = content.str();
+  const ::tflite::Model* model = ::tflite::GetModel(content_str.data());
+  ::tflite::ReadOpsFromModel(model, &builtin_ops, &custom_ops);
+  GenerateFileContent(tflite_path, output_registration, builtin_ops,
+                      custom_ops);
+  return 0;
+}
diff --git a/tensorflow/contrib/lite/tools/gen_op_registration_test.cc b/tensorflow/contrib/lite/tools/gen_op_registration_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28a98d68ab23a558a682dd6debb6081f2a1640dc
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/gen_op_registration_test.cc
@@ -0,0 +1,87 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/gen_op_registration.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+using ::testing::ElementsAreArray;
+
+namespace tflite {
+
+class GenOpRegistrationTest : public ::testing::Test {
+ protected:
+  GenOpRegistrationTest() {}
+
+  void ReadOps(const string& model_path) {
+    auto model = FlatBufferModel::BuildFromFile(model_path.data());
+    if (model) {
+      ReadOpsFromModel(model->GetModel(), &builtin_ops_, &custom_ops_);
+    }
+  }
+
+  std::vector<string> builtin_ops_;
+  std::vector<string> custom_ops_;
+};
+
+TEST_F(GenOpRegistrationTest, TestNonExistantFiles) {
+  ReadOps("/tmp/tflite_model_1234");
+  EXPECT_EQ(builtin_ops_.size(), 0);
+  EXPECT_EQ(custom_ops_.size(), 0);
+}
+
+TEST_F(GenOpRegistrationTest, TestModels) {
+  ReadOps("tensorflow/contrib/lite/testdata/test_model.bin");
+  EXPECT_THAT(builtin_ops_, ElementsAreArray({"CONV_2D"}));
+  EXPECT_THAT(custom_ops_, ElementsAreArray({"testing_op"}));
+}
+
+TEST_F(GenOpRegistrationTest, TestEmptyModels) {
+  ReadOps("tensorflow/contrib/lite/testdata/empty_model.bin");
+  EXPECT_EQ(builtin_ops_.size(), 0);
+  EXPECT_EQ(custom_ops_.size(), 0);
+}
+
+TEST_F(GenOpRegistrationTest, TestZeroSubgraphs) {
+  ReadOps("tensorflow/contrib/lite/testdata/0_subgraphs.bin");
+  EXPECT_EQ(builtin_ops_.size(), 0);
+  EXPECT_EQ(custom_ops_.size(), 0);
+}
+
+TEST_F(GenOpRegistrationTest, TestBrokenMmap) {
+  ReadOps("tensorflow/contrib/lite/testdata/test_model_broken.bin");
+  EXPECT_EQ(builtin_ops_.size(), 0);
+  EXPECT_EQ(custom_ops_.size(), 0);
+}
+
+TEST_F(GenOpRegistrationTest, TestNormalizeCustomOpName) {
+  std::vector<std::pair<string, string>> testcase = {
+      {"CustomOp", "CUSTOM_OP"},
+      {"a", "A"},
+      {"custom_op", "CUSTOM_OP"},
+      {"customop", "CUSTOMOP"},
+  };
+
+  for (const auto& test : testcase) {
+    EXPECT_EQ(NormalizeCustomOpName(test.first), test.second);
+  }
+}
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.cc b/tensorflow/contrib/lite/tools/mutable_op_resolver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a921d7c5aa20ce3a9dc279d8f0c7c253905b078
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/mutable_op_resolver.cc
@@ -0,0 +1,43 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/mutable_op_resolver.h"
+
+namespace tflite {
+
+TfLiteRegistration* MutableOpResolver::FindOp(
+    tflite::BuiltinOperator op) const {
+  auto it = builtins_.find(op);
+  return it != builtins_.end() ? it->second : nullptr;
+}
+
+TfLiteRegistration* MutableOpResolver::FindOp(const char* op) const {
+  auto it = custom_ops_.find(op);
+  return it != custom_ops_.end() ? it->second : nullptr;
+}
+
+void MutableOpResolver::AddBuiltin(tflite::BuiltinOperator op,
+                                   TfLiteRegistration* registration) {
+  registration->builtin_code = op;
+  builtins_.insert(std::make_pair(op, registration));
+}
+
+void MutableOpResolver::AddCustom(const char* name,
+                                  TfLiteRegistration* registration) {
+  registration->builtin_code = BuiltinOperator_CUSTOM;
+  custom_ops_.insert(std::make_pair(std::string(name), registration));
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/mutable_op_resolver.h b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..906553da570720a0c4b90bbd2eebb6d8bdea6bb8
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/mutable_op_resolver.h
@@ -0,0 +1,55 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
+
+#include <map>
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/model.h"
+
+// Needed to resolve unordered_set hash on older compilers.
+namespace std {
+template <>
+struct hash<tflite::BuiltinOperator> {
+  size_t operator()(const tflite::BuiltinOperator& op) const {
+    return std::hash<int>()(op);
+  }
+};
+}  // namespace std
+
+namespace tflite {
+
+// An OpResolver that is mutable, also used as the op in gen_op_registration.
+// A typical usage:
+//   MutableOpResolver resolver;
+//   resolver.AddBuiltin(BuiltinOperator_ADD, Register_ADD());
+//   resolver.AddCustom("CustomOp", Register_CUSTOM_OP());
+//   InterpreterBuilder(model, resolver)(&interpreter);
+class MutableOpResolver : public OpResolver {
+ public:
+  MutableOpResolver() {}
+  TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override;
+  TfLiteRegistration* FindOp(const char* op) const override;
+  void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration);
+  void AddCustom(const char* name, TfLiteRegistration* registration);
+
+ private:
+  std::map<int, TfLiteRegistration*> builtins_;
+  std::map<std::string, TfLiteRegistration*> custom_ops_;
+};
+
+}  // namespace tflite
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_TOOLS_MUTABLE_OP_RESOLVER_H_
diff --git a/tensorflow/contrib/lite/version.h b/tensorflow/contrib/lite/version.h
new file mode 100644
index 0000000000000000000000000000000000000000..a751afabe7460f0c9e88385faf1497b2c0a25d6b
--- /dev/null
+++ b/tensorflow/contrib/lite/version.h
@@ -0,0 +1,23 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
+
+// The version number of the Schema. Ideally all changes will be backward
+// compatible. If that ever changes, we must ensure that version is the first
+// entry in the new tflite root so that we can see that version is not 1.
+#define TFLITE_SCHEMA_VERSION (3)
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_VERSION_H_
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index b7b5418fe91e496f021b44fc32a33d2a549782e5..8ca03f4193f260ce32f942ccaf76a8260b282156 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -7,7 +7,7 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//tensorflow:internal"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 # TODO(yleon): Refactor after one we switching to the V2 kernels.
 py_library(
@@ -26,13 +26,14 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "lookup_ops_test",
     size = "small",
     srcs = ["lookup_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":lookup_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -43,9 +44,8 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
+    grpc_enabled = True,
 )
 
 filegroup(
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 515290e2176169956f2bdcb881becc1170ac26e4..56942115213a762e532971a81da768b53b8537d8 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -82,6 +82,7 @@ py_library(
 
 py_test(
     name = "metric_loss_ops_test",
+    size = "large",
     srcs = [
         "python/metric_learning/metric_loss_ops_test.py",
     ],
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 3b4d0ff799c05ce34cc55385ccc637467e443e40..ee84b5b4c8a9e41fe07b4e9dfdc93e31f807d35d 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -11,6 +11,8 @@
 # the first for the host (the machine you're compiling on) and the second for
 # the target (the machine you want the program to run on).
 
+SHELL := /bin/bash
+
 # Host compilation settings
 
 # Find where we're running from, so we can store generated files here.
@@ -63,6 +65,8 @@ else
 	endif
 endif
 
+HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32; else echo $(shell uname -m); fi)
+
 # Where compiled objects are stored.
 HOST_OBJDIR := $(MAKEFILE_DIR)/gen/host_obj/
 HOST_BINDIR := $(MAKEFILE_DIR)/gen/host_bin/
@@ -235,43 +239,94 @@ ifeq ($(TARGET),ANDROID)
 # NDK_ROOT=/path/to/your/ndk
 # You need to have an Android version of the protobuf libraries compiled to link
 # in. The compile_android_protobuf.sh script may help.
-# TODO(satok): Support all CPU architectures (Currently only armv7 is supported)
 
-	OS_PATH :=
+	ANDROID_HOST_OS_ARCH :=
 	ifeq ($(HOST_OS),LINUX)
-		OS_PATH=linux
+		ANDROID_HOST_OS_ARCH=linux
 	endif
 	ifeq ($(HOST_OS),OSX)
-		OS_PATH=darwin
+		ANDROID_HOST_OS_ARCH=darwin
 	endif
 	ifeq ($(HOST_OS),WINDOWS)
     $(error "windows is not supported.")
 	endif
 
+	ifeq ($(HOST_ARCH),x86_32)
+		ANDROID_HOST_OS_ARCH := $(ANDROID_HOST_OS_ARCH)-x86
+	else
+		ANDROID_HOST_OS_ARCH := $(ANDROID_HOST_OS_ARCH)-$(HOST_ARCH)
+	endif
+    
+	ifndef ANDROID_ARCH
+		ANDROID_ARCH := armeabi-v7a
+	endif
+
+	ifeq ($(ANDROID_ARCH),arm64-v8a)
+		TOOLCHAIN := aarch64-linux-android-4.9
+		SYSROOT_ARCH := arm64
+		BIN_PREFIX := aarch64-linux-android
+		MARCH_OPTION :=
+	endif
+	ifeq ($(ANDROID_ARCH),armeabi)
+		TOOLCHAIN := arm-linux-androideabi-4.9
+		SYSROOT_ARCH := arm
+		BIN_PREFIX := arm-linux-androideabi
+		MARCH_OPTION :=
+	endif
+	ifeq ($(ANDROID_ARCH),armeabi-v7a)
+		TOOLCHAIN := arm-linux-androideabi-4.9
+		SYSROOT_ARCH := arm
+		BIN_PREFIX := arm-linux-androideabi
+		MARCH_OPTION := -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+	endif
+	ifeq ($(ANDROID_ARCH),mips)
+		TOOLCHAIN := mipsel-linux-android-4.9
+		SYSROOT_ARCH := mips
+		BIN_PREFIX := mipsel-linux-android
+		MARCH_OPTION :=
+	endif
+	ifeq ($(ANDROID_ARCH),mips64)
+		TOOLCHAIN := mips64el-linux-android-4.9
+		SYSROOT_ARCH := mips64
+		BIN_PREFIX := mips64el-linux-android
+		MARCH_OPTION :=
+	endif
+	ifeq ($(ANDROID_ARCH),x86)
+		TOOLCHAIN := x86-4.9
+		SYSROOT_ARCH := x86
+		BIN_PREFIX := i686-linux-android
+		MARCH_OPTION :=
+	endif
+	ifeq ($(ANDROID_ARCH),x86_64)
+		TOOLCHAIN := x86_64-4.9
+		SYSROOT_ARCH := x86_64
+		BIN_PREFIX := x86_64-linux-android
+		MARCH_OPTION :=
+	endif
+    
 	ifndef NDK_ROOT
     $(error "NDK_ROOT is not defined.")
 	endif
-	CXX := $(CC_PREFIX) $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/$(OS_PATH)-x86_64/bin/arm-linux-androideabi-g++
-	CC := $(CC_PREFIX) $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/$(OS_PATH)-x86_64/bin/arm-linux-androideabi-gcc
+	CXX := $(CC_PREFIX) $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-g++
+	CC := $(CC_PREFIX) $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-gcc
 	CXXFLAGS +=\
---sysroot $(NDK_ROOT)/platforms/android-21/arch-arm \
+--sysroot $(NDK_ROOT)/platforms/android-21/arch-$(SYSROOT_ARCH) \
 -Wno-narrowing \
 -fomit-frame-pointer \
--march=armv7-a \
--mfloat-abi=softfp \
--mfpu=neon \
--fPIE
+$(MARCH_OPTION) \
+-fPIE \
+-fPIC
 	INCLUDES = \
 -I$(NDK_ROOT)/sources/android/support/include \
 -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/include \
--I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi/include \
+-I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/$(ANDROID_ARCH)/include \
 -I. \
 -I$(MAKEFILE_DIR)/downloads/ \
 -I$(MAKEFILE_DIR)/downloads/eigen \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
--I$(MAKEFILE_DIR)/gen/protobuf/include \
+-I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 
@@ -282,19 +337,20 @@ $(TARGET_NSYNC_LIB) \
 -llog \
 -lz \
 -lm \
--ldl
+-ldl \
+-latomic
 
-	LD := $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/$(OS_PATH)-x86_64/arm-linux-androideabi/bin/ld
+	LD := $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/$(BIN_PREFIX)/bin/ld
 
 	LDFLAGS := \
--march=armv7-a \
--L$(MAKEFILE_DIR)/gen/protobuf/lib \
--L$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/armeabi-v7a \
+$(MARCH_OPTION) \
+-L$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/lib \
+-L$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/$(ANDROID_ARCH) \
 -fPIE \
 -pie \
 -v
 
-	AR := $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/$(OS_PATH)-x86_64/bin/arm-linux-androideabi-ar
+	AR := $(NDK_ROOT)/toolchains/$(TOOLCHAIN)/prebuilt/$(ANDROID_HOST_OS_ARCH)/bin/$(BIN_PREFIX)-ar
 	ARFLAGS := r
 	LIBFLAGS += -Wl,--allow-multiple-definition -Wl,--whole-archive
 
@@ -318,6 +374,11 @@ $(TARGET_NSYNC_LIB) \
 	ifdef ENABLE_EXPERIMENTAL_HEXNN_OPS
 		CXXFLAGS += -DENABLE_EXPERIMENTAL_HEXNN_OPS
 	endif
+	
+	OBJDIR := $(OBJDIR)android_$(ANDROID_ARCH)/
+	LIBDIR := $(LIBDIR)android_$(ANDROID_ARCH)/
+	BINDIR := $(BINDIR)android_$(ANDROID_ARCH)/
+	DEPDIR := $(DEPDIR)android_$(ANDROID_ARCH)/
 
 endif  # ANDROID
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
@@ -660,12 +721,12 @@ clean:
 # Gets rid of all generated files except protobuf libs generated
 # before calling make.  This allows users not to recompile proto libs everytime.
 clean_except_protobuf_libs:
-	find $(MAKEFILE_DIR)/gen -mindepth 1 -maxdepth 1 ! -name "protobuf" ! -name "protobuf-host" -exec rm -r "{}" \;
+	find $(MAKEFILE_DIR)/gen -mindepth 1 -maxdepth 1 ! -name "protobuf*" -exec rm -r "{}" \;
 	rm -rf tensorflow/core/util/version_info.cc
 
 # Gets rid of target files only, leaving the host alone. Also leaves the lib
 # directory untouched deliberately, so we can persist multiple architectures
-# across builds for iOS.
+# across builds for iOS and Android.
 cleantarget:
 	rm -rf $(OBJDIR)
 	rm -rf $(BINDIR)
diff --git a/tensorflow/contrib/makefile/README.md b/tensorflow/contrib/makefile/README.md
index 715eb5157762a3a08079d0845682f55dc05d7b76..9345303ff11462a447ed6299b0ac3cba558ea68b 100644
--- a/tensorflow/contrib/makefile/README.md
+++ b/tensorflow/contrib/makefile/README.md
@@ -16,17 +16,17 @@ This static library will not contain:
 
  - Python or other language bindings
  - GPU support
- 
+
 You can target:
 - iOS
 - OS X (macOS)
 - Android
 - Raspberry-PI
- 
+
 You will compile tensorflow and protobuf libraries that you can link into other
 applications.  You will also compile the [benchmark](../../tools/benchmark/)
 application that will let you check your application.
- 
+
 ## Before you start (all platforms)
 
 First, clone this TensorFlow repository.
@@ -58,9 +58,9 @@ You should then be able to run the `build_all_linux.sh` script to compile:
 tensorflow/contrib/makefile/build_all_linux.sh
 ```
 
-This should compile a static library in 
-`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a`, 
-and create an example executable at `tensorflow/contrib/makefile/gen/bin/benchmark`. 
+This should compile a static library in
+`tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a`,
+and create an example executable at `tensorflow/contrib/makefile/gen/bin/benchmark`.
 
 Get the graph file, if you have not already:
 
@@ -174,10 +174,26 @@ tensorflow/contrib/makefile/build_all_ios.sh
 
 This process will take around twenty minutes on a modern MacBook Pro.
 
-When it completes, you will have a library for a single architecture and the
-benchmark program. Although successfully compiling the benchmark program is a
+When it completes, you will have a unified library for all architectures
+(i386sim, x86_64sim, armv7, armv7s and arm64)  and the benchmark program.
+Although successfully compiling the benchmark program is a
 sign of success, the program is not a complete iOS app.
 
+If you would only like to build only one architecture to save time:
+(iOS 11+ only supports 64bit so you can get away with arm64)
+
+```bash
+tensorflow/contrib/makefile/build_all_ios.sh -a arm64
+```
+
+After the first build if you would like to just build the tensorflow
+library you can pass the -T flag to avoid a clean & rebuild. This should
+take you just a few seconds to generate the library if you modified one file.
+
+```bash
+tensorflow/contrib/makefile/build_all_ios.sh -a arm64 -T
+```
+
 To see TensorFlow running on iOS, the example Xcode project in
 [tensorflow/examples/ios](../../examples/ios/) shows how to use the static
 library in a simple app.
@@ -185,7 +201,7 @@ library in a simple app.
 ### Building by hand
 
 This section covers each step of building.  For all the code in one place, see
-[build_all_ios.sh](build_all_ios.sh). 
+[build_all_ios.sh](build_all_ios.sh).
 
 If you have not already, you will need to download dependencies:
 
@@ -193,19 +209,18 @@ If you have not already, you will need to download dependencies:
 tensorflow/contrib/makefile/download_dependencies.sh
 ```
 
-Next, you will need to compile protobufs for iOS:
+Next, you will need to compile protobufs for iOS (optionally takes the -a $ARCH flag):
 
 ```bash
-tensorflow/contrib/makefile/compile_ios_protobuf.sh 
+tensorflow/contrib/makefile/compile_ios_protobuf.sh
 ```
 
-Then, you will need to compile the nsync library for iOS:
+Then, you will need to compile the nsync library for iOS (optionally takes -a $ARCH flag):
 
 ```bash
 export HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
 export TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
 ```
-
 Then, you can run the makefile specifying iOS as the target, along with the
 architecture you want to build for:
 
@@ -217,11 +232,7 @@ make -f tensorflow/contrib/makefile/Makefile \
 
 This creates a library in
 `tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a` that you can link any
-xcode project against. 
-
-At this point, you will have a library for a single architecture and the
-benchmark program. Although successfully compiling the benchmark program is a
-sign of success, the program is not a complete iOS app. 
+xcode project against.
 
 To see TensorFlow running on iOS, the example Xcode project in
 [tensorflow/examples/ios](../../examples/ios/) shows how to use the static
@@ -237,19 +248,27 @@ time follow it with:
 compile_ios_tensorflow.sh
 ```
 
+`compile_ios_tensorflow.sh` takes the -a flag to build only for one architecture.
+In case you run into issues with unresolved symbols with nsync you can also pass
+-h ${HOST_NSYNC_LIB} and -n {TARGET_NSYNC_LIB} so it would look like:
+
+```bash
+tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h tensorflow/contrib/makefile/downloads/nsync/builds/default.macos.c++11/nsync.a -n tensorflow/contrib/makefile/downloads/nsync/builds/lipo.ios.c++11/nsync.a -a arm64
+```
+
 In XCode, you will need to use -force_load in the linker flags
 section of the build settings to pull in the global constructors that are used
-to register ops and kernels. 
+to register ops and kernels.
 
 #### Optimization
- 
+
 The `compile_ios_tensorflow.sh` script can take optional command-line arguments.
 The first argument will be passed as a C++ optimization flag and defaults to
 debug mode. If you are concerned about performance or are working on a release
 build, you would likely want a higher optimization setting, like so:
- 
+
 ```bash
-compile_ios_tensorflow.sh "-Os"
+compile_ios_tensorflow.sh -f "-Os"
 ```
 
 For other variations of valid optimization flags, see [clang optimization levels](http://stackoverflow.com/questions/15548023/clang-optimization-levels).
@@ -311,7 +330,7 @@ what you need for your desired system.
 ## Dependency Management
 
 The Makefile loads in a list of dependencies stored in text files. These files
-are generated from the main Bazel build by running 
+are generated from the main Bazel build by running
 `tensorflow/contrib/makefile/gen_file_lists.sh`. You'll need to re-run this i
 you make changes to the files that are included in the build.
 
@@ -342,10 +361,10 @@ codebase can sometimes break the makefile build process. If you find that tests
 relying on this makefile are failing with a change you're involved in, here are
 some trouble-shooting steps:
 
- - Try to reproduce the issue on your platform. If you're on Linux, running 
+ - Try to reproduce the issue on your platform. If you're on Linux, running
  `make -f tensorflow/contrib/makefile/Makefile` should be enough to recreate
   most issues. For other platforms, see the sections earlier in this document.
-  
+
  - The most common cause of breakages are files that have been added to the
   Bazel build scripts, but that the makefile isn't aware of. Typical symptoms
   of this include linker errors mentioning missing symbols or protobuf headers
@@ -358,11 +377,11 @@ some trouble-shooting steps:
   `tensorflow/core/BUILD`, so if you change the wildcards there to include new
   files you'll need to also update `CORE_CC_ALL_SRCS` and `CORE_CC_EXCLUDE_SRCS`
   in the makefile.
-  
+
  - Some of the supported platforms use clang instead of gcc as their compiler,
   so if you're hitting compile errors you may need to tweak your code to be more
   friendly to different compilers by avoiding gcc extensions or idioms.
-  
+
 These are the most common reasons for makefile breakages, but it's also
 possible you may hit something unusual, like a platform incompatibility. For
 those, you'll need to see if you can reproduce the issue on that particular
diff --git a/tensorflow/contrib/makefile/build_all_android.sh b/tensorflow/contrib/makefile/build_all_android.sh
index 9944f71950ac59ba147bf33c344c3478cdd175be..81cb17a311fd94aa397eb7a766cd8c668268759a 100755
--- a/tensorflow/contrib/makefile/build_all_android.sh
+++ b/tensorflow/contrib/makefile/build_all_android.sh
@@ -18,12 +18,15 @@
 set -e
 
 usage() {
-  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-s:t:Tx:X]"
+  echo "Usage: NDK_ROOT=<path to ndk root> $(basename "$0") [-Es:t:Tx:a:X]"
   echo "-E enable experimental hexnn ops"
   echo "-s [sub_makefiles] sub makefiles separated by white space"
   echo "-t [build_target] build target for Android makefile [default=all]"
   echo "-T only build tensorflow"
   echo "-x [hexagon library path] copy and hexagon libraries in the specified path"
+  echo "-a [architecture] Architecture of target android [default=armeabi-v7a] \
+(supported architecture list: \
+arm64-v8a armeabi armeabi-v7a mips mips64 x86 x86_64)"
   exit 1
 }
 
@@ -32,13 +35,16 @@ if [[ -z "${NDK_ROOT}" ]]; then
     exit 1
 fi
 
-while getopts "Es:t:Tx:" opt_name; do
+ARCH=armeabi-v7a
+
+while getopts "Es:t:Tx:a:" opt_name; do
   case "$opt_name" in
     E) ENABLE_EXPERIMENTAL_HEXNN_OPS="true";;
     s) SUB_MAKEFILES="${OPTARG}";;
     t) BUILD_TARGET="${OPTARG}";;
     T) ONLY_MAKE_TENSORFLOW="true";;
     x) HEXAGON_LIB_PATH="${OPTARG}";;
+    a) ARCH="${OPTARG}";;
     *) usage;;
   esac
 done
@@ -53,25 +59,23 @@ JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
 
 HEXAGON_DOWNLOAD_PATH="tensorflow/contrib/makefile/downloads/hexagon"
 
+# Remove any old files first.
+make -f tensorflow/contrib/makefile/Makefile cleantarget
+
 if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
-  # Remove any old files first.
-  make -f tensorflow/contrib/makefile/Makefile clean
   rm -rf tensorflow/contrib/makefile/downloads
   # Pull down the required versions of the frameworks we need.
   tensorflow/contrib/makefile/download_dependencies.sh
   # Compile protobuf for the target Android device architectures.
   CC_PREFIX="${CC_PREFIX}" NDK_ROOT="${NDK_ROOT}" \
-tensorflow/contrib/makefile/compile_android_protobuf.sh -c
-else
-  # Only clean files generated by make
-  make -f tensorflow/contrib/makefile/Makefile clean_except_protobuf_libs
+tensorflow/contrib/makefile/compile_android_protobuf.sh -c -a ${ARCH}
 fi
 
 # Compile nsync for the host and the target Android device architecture.
 # Don't use  export var=`something` syntax; it swallows the exit status.
 HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
 TARGET_NSYNC_LIB=`CC_PREFIX="${CC_PREFIX}" NDK_ROOT="${NDK_ROOT}" \
-      tensorflow/contrib/makefile/compile_nsync.sh -t android -a armeabi-v7a`
+      tensorflow/contrib/makefile/compile_nsync.sh -t android -a ${ARCH}`
 export HOST_NSYNC_LIB TARGET_NSYNC_LIB
 
 if [[ ! -z "${HEXAGON_LIB_PATH}" ]]; then
@@ -98,7 +102,8 @@ fi
 
 if [[ -z "${BUILD_TARGET}" ]]; then
     make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-         TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" CC_PREFIX="${CC_PREFIX}" \
+         TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" ANDROID_ARCH="${ARCH}" \
+         CC_PREFIX="${CC_PREFIX}" \
          HOST_NSYNC_LIB="$HOST_NSYNC_LIB" TARGET_NSYNC_LIB="$TARGET_NSYNC_LIB" \
 HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \
 SUB_MAKEFILES="${SUB_MAKEFILES}" ${EXTRA_MAKE_ARGS[@]}
@@ -106,7 +111,8 @@ else
     # BUILD_TARGET explicitly uncommented to allow multiple targets to be
     # passed to make in a single build_all_android.sh invocation.
     make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-         TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" CC_PREFIX="${CC_PREFIX}" \
+         TARGET=ANDROID NDK_ROOT="${NDK_ROOT}" ANDROID_ARCH="${ARCH}" \
+         CC_PREFIX="${CC_PREFIX}" \
          HOST_NSYNC_LIB="$HOST_NSYNC_LIB" TARGET_NSYNC_LIB="$TARGET_NSYNC_LIB" \
 HEXAGON_LIBS="${HEXAGON_LIBS}" HEXAGON_INCLUDE="${HEXAGON_INCLUDE}" \
 SUB_MAKEFILES="${SUB_MAKEFILES}" ${EXTRA_MAKE_ARGS[@]} ${BUILD_TARGET}
diff --git a/tensorflow/contrib/makefile/build_all_ios.sh b/tensorflow/contrib/makefile/build_all_ios.sh
index a49bbe4565bfe2101fdf9a78e6d43fae2ff7fb2c..988e12b48287300004cc23c31cb4a20e63f72a27 100755
--- a/tensorflow/contrib/makefile/build_all_ios.sh
+++ b/tensorflow/contrib/makefile/build_all_ios.sh
@@ -23,14 +23,29 @@ if [[ $(uname) != "Darwin" ]]; then
     exit 1
 fi
 
+usage() {
+  echo "Usage: $(basename "$0") [-a:T]"
+  echo "-a [build_arch] build only for specified arch x86_64 [default=all]"
+  echo "-T only build tensorflow (dont download other deps etc)"
+  exit 1
+}
+
+while getopts "a:T" opt_name; do
+  case "$opt_name" in
+    a) BUILD_ARCH="${OPTARG}";;
+    T) ONLY_MAKE_TENSORFLOW="true";;
+    *) usage;;
+  esac
+done
+shift $((OPTIND - 1))
+
+
 # Make sure we're in the correct directory, at the root of the source tree.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd ${SCRIPT_DIR}/../../../
 
-
-# Remove any old files first.
-make -f tensorflow/contrib/makefile/Makefile clean
-rm -rf tensorflow/contrib/makefile/downloads
+source "${SCRIPT_DIR}/build_helper.subr"
+JOB_COUNT="${JOB_COUNT:-$(get_job_count)}"
 
 # Setting a deployment target is required for building with bitcode,
 # otherwise linking will fail with:
@@ -41,20 +56,37 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
     export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
 fi
 
-# Pull down the required versions of the frameworks we need.
-tensorflow/contrib/makefile/download_dependencies.sh
+if [[ "${ONLY_MAKE_TENSORFLOW}" != "true" ]]; then
+    # Remove any old files first.
+    make -f tensorflow/contrib/makefile/Makefile clean
+    rm -rf tensorflow/contrib/makefile/downloads
 
-# Compile protobuf for the target iOS device architectures.
-tensorflow/contrib/makefile/compile_ios_protobuf.sh
+    # Pull down the required versions of the frameworks we need.
+    tensorflow/contrib/makefile/download_dependencies.sh
+
+    # Compile protobuf for the target iOS device architectures.
+    tensorflow/contrib/makefile/compile_ios_protobuf.sh
+fi
 
 # Compile nsync for the target iOS device architectures.
 # Don't use  export var=`something` syntax; it swallows the exit status.
 HOST_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh`
-TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
+if [[ -z "${BUILD_ARCH}" ]]; then
+    # No arch specified so build all architectures
+    TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios`
+else
+    # arch specified so build just that
+    TARGET_NSYNC_LIB=`tensorflow/contrib/makefile/compile_nsync.sh -t ios -a ${BUILD_ARCH}`
+fi
 export HOST_NSYNC_LIB TARGET_NSYNC_LIB
 
-# Build the iOS TensorFlow libraries.
-tensorflow/contrib/makefile/compile_ios_tensorflow.sh "-O3"
+if [[ -z "${BUILD_ARCH}" ]]; then
+    # build the ios tensorflow libraries.
+    tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
+else
+    # arch specified so build just that
+    tensorflow/contrib/makefile/compile_ios_tensorflow.sh -f "-O3" -a "${BUILD_ARCH}" -h $HOST_NSYNC_LIB -n $TARGET_NSYNC_LIB
+fi
 
 # Creates a static universal library in
 # tensorflow/contrib/makefile/gen/lib/libtensorflow-core.a
diff --git a/tensorflow/contrib/makefile/compile_android_protobuf.sh b/tensorflow/contrib/makefile/compile_android_protobuf.sh
index fadbe271b85e6812953c1a00345a5e7f92bf9dbe..4355e3e5974e7ec4626773feca808631f2dbf1a8 100755
--- a/tensorflow/contrib/makefile/compile_android_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_android_protobuf.sh
@@ -71,10 +71,10 @@ then
     exit 1
 fi
 
-GENDIR="$(pwd)/gen/protobuf"
+GENDIR="$(pwd)/gen/protobuf_android"
 HOST_GENDIR="$(pwd)/gen/protobuf-host"
 mkdir -p "${GENDIR}"
-mkdir -p "${HOST_GENDIR}"
+mkdir -p "${GENDIR}/${ARCHITECTURE}"
 
 if [[ ! -f "./downloads/protobuf/autogen.sh" ]]; then
     echo "You need to download dependencies before running this script." 1>&2
@@ -153,7 +153,7 @@ then
   exit 1
 fi
 
-./configure --prefix="${GENDIR}" \
+./configure --prefix="${GENDIR}/${ARCHITECTURE}" \
 --host="${bin_prefix}" \
 --with-sysroot="${SYSROOT}" \
 --disable-shared \
diff --git a/tensorflow/contrib/makefile/compile_ios_protobuf.sh b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
index 4056db18a76fc8a58240d9116b19cd8b68c1ee45..8fa20213633414d134d6c6a50e151cce2ac8a368 100755
--- a/tensorflow/contrib/makefile/compile_ios_protobuf.sh
+++ b/tensorflow/contrib/makefile/compile_ios_protobuf.sh
@@ -21,10 +21,28 @@ if [[ -n MACOSX_DEPLOYMENT_TARGET ]]; then
     export MACOSX_DEPLOYMENT_TARGET=$(sw_vers -productVersion)
 fi
 
-SCRIPT_DIR=$(dirname $0)
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
+  echo "default arch i386, x86_64, armv7, armv7s, arm64"
+  exit 1
+}
+
+BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
+while getopts "a:" opt_name; do
+  case "$opt_name" in
+    a) BUILD_TARGET="${OPTARG}";;
+    *) usage;;
+  esac
+done
+shift $((OPTIND - 1))
+
+IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
+
+SCRIPT_DIR=$(cd `dirname $0` && pwd)
 source "${SCRIPT_DIR}/build_helper.subr"
 
-cd tensorflow/contrib/makefile
+cd ${SCRIPT_DIR}
 
 HOST_GENDIR="$(pwd)/gen/protobuf-host"
 mkdir -p "${HOST_GENDIR}"
@@ -64,6 +82,10 @@ else
   echo "protoc found. Skip building host tools."
 fi
 
+# Remove old libs
+rm -f ${LIBDIR}/libprotobuf.a
+rm -f ${LIBDIR}/libprotobuf-lite.a
+
 ./autogen.sh
 if [ $? -ne 0 ]
 then
@@ -71,157 +93,192 @@ then
   exit 1
 fi
 
-make distclean
-./configure \
---host=i386-apple-${OSX_VERSION} \
---disable-shared \
---enable-cross-compile \
---with-protoc="${PROTOC_PATH}" \
---prefix=${LIBDIR}/iossim_386 \
---exec-prefix=${LIBDIR}/iossim_386 \
-"CFLAGS=${CFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch i386 \
--fembed-bitcode \
--isysroot ${IPHONESIMULATOR_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch i386 \
--fembed-bitcode \
--isysroot \
-${IPHONESIMULATOR_SYSROOT}" \
-LDFLAGS="-arch i386 \
--fembed-bitcode \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS} \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=x86_64-apple-${OSX_VERSION} \
---disable-shared \
---enable-cross-compile \
---with-protoc="${PROTOC_PATH}" \
---prefix=${LIBDIR}/iossim_x86_64 \
---exec-prefix=${LIBDIR}/iossim_x86_64 \
-"CFLAGS=${CFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch x86_64 \
--fembed-bitcode \
--isysroot ${IPHONESIMULATOR_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
--arch x86_64 \
--fembed-bitcode \
--isysroot \
-${IPHONESIMULATOR_SYSROOT}" \
-LDFLAGS="-arch x86_64 \
--fembed-bitcode \
--mios-simulator-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS} \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
--L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=armv7-apple-${OSX_VERSION} \
---with-protoc="${PROTOC_PATH}" \
---disable-shared \
---prefix=${LIBDIR}/ios_arm7 \
---exec-prefix=${LIBDIR}/ios_arm7 \
-"CFLAGS=${CFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-LDFLAGS="-arch armv7 \
--fembed-bitcode \
--miphoneos-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS}" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=armv7s-apple-${OSX_VERSION} \
---with-protoc="${PROTOC_PATH}" \
---disable-shared \
---prefix=${LIBDIR}/ios_arm7s \
---exec-prefix=${LIBDIR}/ios_arm7s \
-"CFLAGS=${CFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7s \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-"CXX=${CXX}" \
-"CXXFLAGS=${CXXFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch armv7s \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-LDFLAGS="-arch armv7s \
--fembed-bitcode \
--miphoneos-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS}" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-make distclean
-./configure \
---host=arm \
---with-protoc="${PROTOC_PATH}" \
---disable-shared \
---prefix=${LIBDIR}/ios_arm64 \
---exec-prefix=${LIBDIR}/ios_arm64 \
-"CFLAGS=${CFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch arm64 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-"CXXFLAGS=${CXXFLAGS} \
--miphoneos-version-min=${MIN_SDK_VERSION} \
--arch arm64 \
--fembed-bitcode \
--isysroot ${IPHONEOS_SYSROOT}" \
-LDFLAGS="-arch arm64 \
--fembed-bitcode \
--miphoneos-version-min=${MIN_SDK_VERSION} \
-${LDFLAGS}" \
-"LIBS=${LIBS}"
-make -j"${JOB_COUNT}"
-make install
-
-lipo \
-${LIBDIR}/iossim_386/lib/libprotobuf.a \
-${LIBDIR}/iossim_x86_64/lib/libprotobuf.a \
-${LIBDIR}/ios_arm7/lib/libprotobuf.a \
-${LIBDIR}/ios_arm7s/lib/libprotobuf.a \
-${LIBDIR}/ios_arm64/lib/libprotobuf.a \
--create \
--output ${LIBDIR}/libprotobuf.a
-
-lipo \
-${LIBDIR}/iossim_386/lib/libprotobuf-lite.a \
-${LIBDIR}/iossim_x86_64/lib/libprotobuf-lite.a \
-${LIBDIR}/ios_arm7/lib/libprotobuf-lite.a \
-${LIBDIR}/ios_arm7s/lib/libprotobuf-lite.a \
-${LIBDIR}/ios_arm64/lib/libprotobuf-lite.a \
--create \
--output ${LIBDIR}/libprotobuf-lite.a
+package_pb_library() {
+    pb_libs="${LIBDIR}/${1}/lib/libprotobuf.a"
+    if [ -f "${LIBDIR}/libprotobuf.a" ]; then
+        pb_libs="$pb_libs ${LIBDIR}/libprotobuf.a"
+    fi
+    lipo \
+    $pb_libs \
+    -create \
+    -output ${LIBDIR}/libprotobuf.a
+
+    pblite_libs="${LIBDIR}/${1}/lib/libprotobuf-lite.a"
+    if [ -f "${LIBDIR}/libprotobuf-lite.a" ]; then
+        pblite_libs="$pblite_libs ${LIBDIR}/libprotobuf-lite.a"
+    fi
+    lipo \
+    $pblite_libs \
+    -create \
+    -output ${LIBDIR}/libprotobuf-lite.a
+}
+
+build_target() {
+case "$1" in
+    i386)  make distclean
+        ./configure \
+        --host=i386-apple-${OSX_VERSION} \
+        --disable-shared \
+        --enable-cross-compile \
+        --with-protoc="${PROTOC_PATH}" \
+        --prefix=${LIBDIR}/iossim_386 \
+        --exec-prefix=${LIBDIR}/iossim_386 \
+        "CFLAGS=${CFLAGS} \
+        -mios-simulator-version-min=${MIN_SDK_VERSION} \
+        -arch i386 \
+        -fembed-bitcode \
+        -isysroot ${IPHONESIMULATOR_SYSROOT}" \
+        "CXX=${CXX}" \
+        "CXXFLAGS=${CXXFLAGS} \
+        -mios-simulator-version-min=${MIN_SDK_VERSION} \
+        -arch i386 \
+        -fembed-bitcode \
+        -isysroot \
+        ${IPHONESIMULATOR_SYSROOT}" \
+        LDFLAGS="-arch i386 \
+        -fembed-bitcode \
+        -mios-simulator-version-min=${MIN_SDK_VERSION} \
+        ${LDFLAGS} \
+        -L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
+        -L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
+        "LIBS=${LIBS}"
+        make -j"${JOB_COUNT}"
+        make install
+
+        package_pb_library "iossim_386"
+        ;;
+
+    x86_64) make distclean
+        ./configure \
+        --host=x86_64-apple-${OSX_VERSION} \
+        --disable-shared \
+        --enable-cross-compile \
+        --with-protoc="${PROTOC_PATH}" \
+        --prefix=${LIBDIR}/iossim_x86_64 \
+        --exec-prefix=${LIBDIR}/iossim_x86_64 \
+        "CFLAGS=${CFLAGS} \
+        -mios-simulator-version-min=${MIN_SDK_VERSION} \
+        -arch x86_64 \
+        -fembed-bitcode \
+        -isysroot ${IPHONESIMULATOR_SYSROOT}" \
+        "CXX=${CXX}" \
+        "CXXFLAGS=${CXXFLAGS} \
+        -mios-simulator-version-min=${MIN_SDK_VERSION} \
+        -arch x86_64 \
+        -fembed-bitcode \
+        -isysroot \
+        ${IPHONESIMULATOR_SYSROOT}" \
+        LDFLAGS="-arch x86_64 \
+        -fembed-bitcode \
+        -mios-simulator-version-min=${MIN_SDK_VERSION} \
+        ${LDFLAGS} \
+        -L${IPHONESIMULATOR_SYSROOT}/usr/lib/ \
+        -L${IPHONESIMULATOR_SYSROOT}/usr/lib/system" \
+        "LIBS=${LIBS}"
+        make -j"${JOB_COUNT}"
+        make install
+
+        package_pb_library "iossim_x86_64"
+        ;;
+
+    armv7) make distclean
+        ./configure \
+        --host=armv7-apple-${OSX_VERSION} \
+        --with-protoc="${PROTOC_PATH}" \
+        --disable-shared \
+        --prefix=${LIBDIR}/ios_arm7 \
+        --exec-prefix=${LIBDIR}/ios_arm7 \
+        "CFLAGS=${CFLAGS} \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        -arch armv7 \
+        -fembed-bitcode \
+        -isysroot ${IPHONEOS_SYSROOT}" \
+        "CXX=${CXX}" \
+        "CXXFLAGS=${CXXFLAGS} \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        -arch armv7 \
+        -fembed-bitcode \
+        -isysroot ${IPHONEOS_SYSROOT}" \
+        LDFLAGS="-arch armv7 \
+        -fembed-bitcode \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        ${LDFLAGS}" \
+        "LIBS=${LIBS}"
+        make -j"${JOB_COUNT}"
+        make install
+
+        package_pb_library "ios_arm7"
+        ;;
+
+    armv7s) make distclean
+        ./configure \
+        --host=armv7s-apple-${OSX_VERSION} \
+        --with-protoc="${PROTOC_PATH}" \
+        --disable-shared \
+        --prefix=${LIBDIR}/ios_arm7s \
+        --exec-prefix=${LIBDIR}/ios_arm7s \
+        "CFLAGS=${CFLAGS} \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        -arch armv7s \
+        -fembed-bitcode \
+        -isysroot ${IPHONEOS_SYSROOT}" \
+        "CXX=${CXX}" \
+        "CXXFLAGS=${CXXFLAGS} \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        -arch armv7s \
+        -fembed-bitcode \
+        -isysroot ${IPHONEOS_SYSROOT}" \
+        LDFLAGS="-arch armv7s \
+        -fembed-bitcode \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        ${LDFLAGS}" \
+        "LIBS=${LIBS}"
+        make -j"${JOB_COUNT}"
+        make install
+
+        package_pb_library "ios_arm7s"
+        ;;
+
+    arm64) make distclean
+        ./configure \
+        --host=arm \
+        --with-protoc="${PROTOC_PATH}" \
+        --disable-shared \
+        --prefix=${LIBDIR}/ios_arm64 \
+        --exec-prefix=${LIBDIR}/ios_arm64 \
+        "CFLAGS=${CFLAGS} \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        -arch arm64 \
+        -fembed-bitcode \
+        -isysroot ${IPHONEOS_SYSROOT}" \
+        "CXXFLAGS=${CXXFLAGS} \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        -arch arm64 \
+        -fembed-bitcode \
+        -isysroot ${IPHONEOS_SYSROOT}" \
+        LDFLAGS="-arch arm64 \
+        -fembed-bitcode \
+        -miphoneos-version-min=${MIN_SDK_VERSION} \
+        ${LDFLAGS}" \
+        "LIBS=${LIBS}"
+        make -j"${JOB_COUNT}"
+        make install
+
+        package_pb_library "ios_arm64"
+        ;;
+    *)
+        echo "Unknown ARCH"
+        exit 1
+        ;;
+esac
+}
+
+for build_element in "${build_targets[@]}"
+do
+    echo "$build_element"
+    build_target "$build_element"
+done
+
+file ${LIBDIR}/libprotobuf.a
+file ${LIBDIR}/libprotobuf-lite.a
+echo "Done building and packaging the libraries"
diff --git a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
index 5d1cc8b375b99d97603c5d7dff78a5ac4eef751b..ae82163e1178216fc22aad37cd07fd1734c2bedb 100755
--- a/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
+++ b/tensorflow/contrib/makefile/compile_ios_tensorflow.sh
@@ -43,55 +43,124 @@ then
     exit 1
 fi
 
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64,arm64)"
+  echo "default is [i386, x86_64, armv7, armv7s, arm64]"
+  exit 1
+}
+
+BUILD_TARGET="i386 x86_64 armv7 armv7s arm64"
+while getopts "a:f:h:n:" opt_name; do
+  case "$opt_name" in
+    a) BUILD_TARGET="${OPTARG}";;
+    f) BUILD_OPT="${OPTARG}";;
+    h) NSYNC_HOST="${OPTARG}";;
+    n) NSYNC_TARGET="${OPTARG}";;
+    *) usage;;
+  esac
+done
+shift $((OPTIND - 1))
+
+IFS=' ' read -r -a build_targets <<< "${BUILD_TARGET}"
+
+SCRIPT_DIR=$(cd `dirname $0` && pwd)
+source "${SCRIPT_DIR}/build_helper.subr"
+
+
 GENDIR=tensorflow/contrib/makefile/gen/
 LIBDIR=${GENDIR}lib
 LIB_PREFIX=libtensorflow-core
 
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
-  echo "armv7 compilation failed."
-  exit 1
-fi
+#remove any old artifacts
+rm -rf ${LIBDIR}/${LIB_PREFIX}.a
 
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
-  echo "arm7vs compilation failed."
-  exit 1
-fi
+package_tf_library() {
+    CAP_DIR=`echo $1 | tr 'a-z' 'A-Z'`
+    tf_libs="${LIBDIR}/ios_${CAP_DIR}/${LIB_PREFIX}-${1}.a"
+    if [ -f "${LIBDIR}/${LIB_PREFIX}.a" ]; then
+        tf_libs="$tf_libs ${LIBDIR}/${LIB_PREFIX}.a"
+    fi
+    lipo \
+    $tf_libs \
+    -create \
+    -output ${LIBDIR}/${LIB_PREFIX}.a
+}
 
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
-  echo "arm64 compilation failed."
-  exit 1
-fi
+build_tf_target() {
+case "$1" in
+    armv7)
+        make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+        TARGET=IOS IOS_ARCH=ARMV7 LIB_NAME=${LIB_PREFIX}-armv7.a \
+        OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+        TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+        if [ $? -ne 0 ]
+        then
+          echo "armv7 compilation failed."
+          exit 1
+        fi
+        package_tf_library "armv7"
+        ;;
+    armv7s)
+        make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+        TARGET=IOS IOS_ARCH=ARMV7S LIB_NAME=${LIB_PREFIX}-armv7s.a \
+        OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+        TARGET_NSYNC_LIB="${NSYNC_TARGET}"
 
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
-  echo "i386 compilation failed."
-  exit 1
-fi
+        if [ $? -ne 0 ]
+        then
+          echo "arm7vs compilation failed."
+          exit 1
+        fi
+        package_tf_library "armv7s"
+        ;;
+    arm64)
+        make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+        TARGET=IOS IOS_ARCH=ARM64 LIB_NAME=${LIB_PREFIX}-arm64.a \
+        OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+        TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+        if [ $? -ne 0 ]
+        then
+          echo "arm64 compilation failed."
+          exit 1
+        fi
+        package_tf_library "arm64"
+        ;;
+    i386)
+        make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+        TARGET=IOS IOS_ARCH=I386 LIB_NAME=${LIB_PREFIX}-i386.a \
+        OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+        TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+        if [ $? -ne 0 ]
+        then
+          echo "i386 compilation failed."
+          exit 1
+        fi
+        package_tf_library "i386"
+        ;;
+    x86_64)
+        make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
+        TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a \
+        OPTFLAGS="${BUILD_OPT}" HOST_NSYNC_LIB="${NSYNC_HOST}" \
+        TARGET_NSYNC_LIB="${NSYNC_TARGET}"
+        if [ $? -ne 0 ]
+        then
+          echo "x86_64 compilation failed."
+          exit 1
+        fi
+        package_tf_library "x86_64"
+        ;;
+    *)
+        echo "Unknown ARCH"
+        exit 1
+esac
+}
 
-make -j"${JOB_COUNT}" -f tensorflow/contrib/makefile/Makefile \
-TARGET=IOS IOS_ARCH=X86_64 LIB_NAME=${LIB_PREFIX}-x86_64.a OPTFLAGS="$1"
-if [ $? -ne 0 ]
-then
-  echo "x86_64 compilation failed."
-  exit 1
-fi
+for build_tf_element in "${build_targets[@]}"
+do
+    echo "$build_tf_element"
+    build_tf_target "$build_tf_element"
+done
 
-lipo \
-${LIBDIR}/ios_ARMV7/${LIB_PREFIX}-armv7.a \
-${LIBDIR}/ios_ARMV7S/${LIB_PREFIX}-armv7s.a \
-${LIBDIR}/ios_ARM64/${LIB_PREFIX}-arm64.a \
-${LIBDIR}/ios_I386/${LIB_PREFIX}-i386.a \
-${LIBDIR}/ios_X86_64/${LIB_PREFIX}-x86_64.a \
--create \
--output ${LIBDIR}/${LIB_PREFIX}.a
+echo "Done building and packaging TF"
+file ${LIBDIR}/${LIB_PREFIX}.a
diff --git a/tensorflow/contrib/makefile/compile_nsync.sh b/tensorflow/contrib/makefile/compile_nsync.sh
index ecbd9bb82557f082d73c1a6088c81b7f819aeb5a..7927997678f077a716d81749561068f259d9744f 100755
--- a/tensorflow/contrib/makefile/compile_nsync.sh
+++ b/tensorflow/contrib/makefile/compile_nsync.sh
@@ -28,7 +28,7 @@ usage="usage: $prog [-t linux|ios|android|macos|native]
         [-a architecture] [-v android_api_version]
 
 A script to build nsync for tensorflow.
-This script can be run on Linux or MacOS host platforms, and can target 
+This script can be run on Linux or MacOS host platforms, and can target
 Linux, MacOS, iOS, or Android.
 
 Options:
@@ -265,7 +265,7 @@ for arch in $archs; do
                                           -I$(NDK_ROOT)/sources/cxx-stl/gnu-libstdc++/4.9/libs/'"$arch"'/include \
                                           -I../../platform/c++11 -I../../platform/gcc \
                                           -I../../platform/posix -pthread
-                        PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE
+                        PLATFORM_CFLAGS=-std=c++11 -Wno-narrowing '"$march_option"' -fPIE -fPIC
                         PLATFORM_LDFLAGS=-pthread
                         MKDEP=${CC} -M -std=c++11
                         PLATFORM_C=../../platform/c++11/src/nsync_semaphore_mutex.cc \
@@ -301,6 +301,9 @@ done
 
 case "$target_platform" in
 ios)    nsync_platform_dir="$nsync_builds_dir/lipo.$target_platform.c++11"
+        if [ -d "$nsync_platform_dir" ]; then
+            rm -rf "$nsync_platform_dir"
+        fi
         mkdir "$nsync_platform_dir"
         eval lipo $platform_libs -create -output '$nsync_platform_dir/nsync.a'
         echo "$nsync_platform_dir/nsync.a"
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 12e3f589306d54b10b38a48d8aed356de4ddc91b..b61044130897cf0dddc37e460b4e1618c3a7e2e9 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -19,13 +19,21 @@ set -e
 DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
 BZL_FILE_PATH=tensorflow/workspace.bzl
 
-EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+# Ensure it is being run from repo root
+if [ ! -f $BZL_FILE_PATH ]; then
+  echo "Could not find ${BZL_FILE_PATH}":
+  echo "Likely you are not running this from the root directory of the repository.";
+  exit 1;
+fi
+
+EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
+FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
+ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 
 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 #                   so work around it by patching the source.
@@ -73,6 +81,7 @@ download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
 download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
+download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl"
 
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
diff --git a/tensorflow/contrib/makefile/rename_protobuf.sh b/tensorflow/contrib/makefile/rename_protobuf.sh
index b3bff2d5032fd817f34546b1a0663f255d8d3f77..8d52c1a1694b79f24e6a85a7757df1c35c9a99b5 100755
--- a/tensorflow/contrib/makefile/rename_protobuf.sh
+++ b/tensorflow/contrib/makefile/rename_protobuf.sh
@@ -38,7 +38,7 @@
 #
 # Note that this script modifies the source code in-place, so once it's been run
 # it's no longer suitable for further manual modifications, since the difference
-# with the top of tree will already be large. 
+# with the top of tree will already be large.
 
 mv tensorflow/contrib/makefile/downloads/protobuf/src/google/protobuf \
  tensorflow/contrib/makefile/downloads/protobuf//src/google/protobuf3
@@ -71,7 +71,7 @@ sed -i '' 's%::google::protobuf;%google::protobuf3;%' \
 
 # Fix up a couple of special build scripts that look for particular files.
 sed -i '' 's%src/google/protobuf/message.cc%src/google/protobuf3/message.cc%' \
- tensorflow/contrib/makefile/downloads/protobuf/configure.ac 
+ tensorflow/contrib/makefile/downloads/protobuf/configure.ac
 sed -i '' 's%src/google/protobuf/stubs/common.h%src/google/protobuf3/stubs/common.h%' \
  tensorflow/contrib/makefile/downloads/protobuf/autogen.sh
 
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 8b77c99cb574123c2af5d8f9f17cd403613cfffd..5f275663986f9d480659880ab601eeb5c41037be 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -8,6 +8,7 @@ tensorflow/core/kernels/xent_op.cc
 tensorflow/core/kernels/where_op.cc
 tensorflow/core/kernels/variable_ops.cc
 tensorflow/core/kernels/unpack_op.cc
+tensorflow/core/kernels/unique_op.cc
 tensorflow/core/kernels/transpose_op.cc
 tensorflow/core/kernels/transpose_functor_cpu.cc
 tensorflow/core/kernels/training_op_helpers.cc
@@ -41,6 +42,9 @@ tensorflow/core/kernels/spectrogram_op.cc
 tensorflow/core/kernels/spectrogram.cc
 tensorflow/core/kernels/sparse_to_dense_op.cc
 tensorflow/core/kernels/sparse_matmul_op.cc
+tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+tensorflow/core/kernels/sparse_reshape_op.c
+tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/softsign_op.cc
 tensorflow/core/kernels/softplus_op.cc
 tensorflow/core/kernels/softmax_op.cc
@@ -64,6 +68,8 @@ tensorflow/core/kernels/scatter_nd_op_cpu_impl_2.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_3.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_4.cc
 tensorflow/core/kernels/scatter_nd_op_cpu_impl_5.cc
+tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
+tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
 tensorflow/core/kernels/scatter_nd_op.cc
 tensorflow/core/kernels/save_restore_tensor.cc
 tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -74,6 +80,7 @@ tensorflow/core/kernels/reverse_op.cc
 tensorflow/core/kernels/restore_op.cc
 tensorflow/core/kernels/resize_nearest_neighbor_op.cc
 tensorflow/core/kernels/resize_bilinear_op.cc
+tensorflow/core/kernels/reshape_util.cc
 tensorflow/core/kernels/reshape_op.cc
 tensorflow/core/kernels/relu_op.cc
 tensorflow/core/kernels/reduction_ops_sum.cc
@@ -109,6 +116,10 @@ tensorflow/core/kernels/maxpooling_op.cc
 tensorflow/core/kernels/matmul_op.cc
 tensorflow/core/kernels/lrn_op.cc
 tensorflow/core/kernels/logging_ops.cc
+tensorflow/core/kernels/initializable_lookup_table.c
+tensorflow/core/kernels/lookup_table_init_op.cc
+tensorflow/core/kernels/lookup_table_op.cc
+tensorflow/core/kernels/lookup_util.cc
 tensorflow/core/kernels/inplace_ops.cc
 tensorflow/core/kernels/in_topk_op.cc
 tensorflow/core/kernels/immutable_constant_op.cc
@@ -116,10 +127,20 @@ tensorflow/core/kernels/identity_op.cc
 tensorflow/core/kernels/identity_n_op.cc
 tensorflow/core/kernels/gather_op.cc
 tensorflow/core/kernels/gather_functor.cc
+tensorflow/core/kernels/gather_nd_op.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_0.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_1.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_2.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_3.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_4.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_5.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
+tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
 tensorflow/core/kernels/fused_batch_norm_op.cc
 tensorflow/core/kernels/function_ops.cc
 tensorflow/core/kernels/fill_functor.cc
 tensorflow/core/kernels/fifo_queue.cc
+tensorflow/core/kernels/fifo_queue_op.cc
 tensorflow/core/kernels/fake_quant_ops.cc
 tensorflow/core/kernels/example_parsing_ops.cc
 tensorflow/core/kernels/encode_wav_op.cc
@@ -127,6 +148,7 @@ tensorflow/core/kernels/dynamic_stitch_op.cc
 tensorflow/core/kernels/dynamic_partition_op.cc
 tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
+tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
@@ -156,6 +178,7 @@ tensorflow/core/kernels/cwise_op_logical_or.cc
 tensorflow/core/kernels/cwise_op_log.cc
 tensorflow/core/kernels/cwise_op_less.cc
 tensorflow/core/kernels/cwise_op_less_equal.cc
+tensorflow/core/kernels/cwise_op_isnan.cc
 tensorflow/core/kernels/cwise_op_isfinite.cc
 tensorflow/core/kernels/cwise_op_invert.cc
 tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -166,6 +189,8 @@ tensorflow/core/kernels/cwise_op_floor.cc
 tensorflow/core/kernels/cwise_op_exp.cc
 tensorflow/core/kernels/cwise_op_equal_to_2.cc
 tensorflow/core/kernels/cwise_op_equal_to_1.cc
+tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
 tensorflow/core/kernels/cwise_op_div.cc
 tensorflow/core/kernels/cwise_op_bitwise_xor.cc
 tensorflow/core/kernels/cwise_op_bitwise_or.cc
@@ -265,3 +290,4 @@ tensorflow/core/kernels/spacetobatch_op.cc
 tensorflow/core/kernels/batchtospace_op.cc
 tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
+tensorflow/core/kernels/batch_util.cc
diff --git a/tensorflow/contrib/metrics/README.md b/tensorflow/contrib/metrics/README.md
index 247ebac5bb6eabbd87ca9d5dc1a18fa9dbe95aca..e0f2d74fa3270e68acadda026a28e9e5c71e0671 100644
--- a/tensorflow/contrib/metrics/README.md
+++ b/tensorflow/contrib/metrics/README.md
@@ -4,7 +4,7 @@
 
 Metrics are used in evaluation to assess the quality of a model. Most are
 "streaming" ops, meaning they create variables to accumulate a running total,
-and return an update tensor to update these variables, and a value tensor to 
+and return an update tensor to update these variables, and a value tensor to
 read the accumulated value. Example:
 
 value, update_op = metrics.streaming_mean_squared_error(
diff --git a/tensorflow/contrib/metrics/__init__.py b/tensorflow/contrib/metrics/__init__.py
index bb566f69029b4cd3b530c31bda22d78a19d9bf02..27dad5379a2e56b91960a1f2274610e4f2568dbc 100644
--- a/tensorflow/contrib/metrics/__init__.py
+++ b/tensorflow/contrib/metrics/__init__.py
@@ -27,6 +27,7 @@ See the @{$python/contrib.metrics} guide.
 @@streaming_false_negative_rate
 @@streaming_false_negative_rate_at_thresholds
 @@streaming_auc
+@@streaming_dynamic_auc
 @@streaming_curve_points
 @@streaming_recall_at_k
 @@streaming_mean_absolute_error
@@ -66,6 +67,8 @@ See the @{$python/contrib.metrics} guide.
 @@set_size
 @@set_union
 @@count
+@@precision_recall_at_equal_thresholds
+@@recall_at_precision
 
 """
 from __future__ import absolute_import
@@ -80,12 +83,15 @@ from tensorflow.contrib.metrics.python.ops.histogram_ops import auc_using_histog
 from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metric_map
 from tensorflow.contrib.metrics.python.ops.metric_ops import aggregate_metrics
 from tensorflow.contrib.metrics.python.ops.metric_ops import count
+from tensorflow.contrib.metrics.python.ops.metric_ops import precision_recall_at_equal_thresholds
+from tensorflow.contrib.metrics.python.ops.metric_ops import recall_at_precision
 from tensorflow.contrib.metrics.python.ops.metric_ops import sparse_recall_at_top_k
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_accuracy
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_auc
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_concat
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_covariance
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_curve_points
+from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_dynamic_auc
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negative_rate
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negative_rate_at_thresholds
 from tensorflow.contrib.metrics.python.ops.metric_ops import streaming_false_negatives
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index dbfc0934eacc6170a8521c1af54865ed0920c7c6..2f2798563481cc0c53360944f967e6b31991057d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -38,6 +38,9 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util.deprecation import deprecated
 
+# Epsilon constant used to represent extremely small quantity.
+_EPSILON = 1e-7
+
 
 def _safe_div(numerator, denominator, name):
   """Divides two values, returning 0 if the denominator is <= 0.
@@ -57,89 +60,6 @@ def _safe_div(numerator, denominator, name):
       name=name)
 
 
-def _create_local(name,
-                  shape,
-                  collections=None,
-                  validate_shape=True,
-                  dtype=dtypes.float32):
-  """Creates a new local variable.
-
-  Args:
-    name: The name of the new or existing variable.
-    shape: Shape of the new or existing variable.
-    collections: A list of collection names to which the Variable will be added.
-    validate_shape: Whether to validate the shape of the variable.
-    dtype: Data type of the variables.
-
-  Returns:
-    The created variable.
-  """
-  # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES
-  collections = list(collections or [])
-  collections += [ops.GraphKeys.LOCAL_VARIABLES]
-  return variable_scope.variable(
-      initial_value=array_ops.zeros(shape, dtype=dtype),
-      name=name,
-      trainable=False,
-      collections=collections,
-      validate_shape=validate_shape)
-
-
-# TODO(ptucker): Move this somewhere common, to share with ops/losses/losses.py.
-def _assert_weights_rank(weights, values):
-  """`weights` rank must be either `0`, or the same as 'values'."""
-  return check_ops.assert_rank_in(weights, (0, array_ops.rank(values)))
-
-
-def _count_condition(values,
-                     weights=None,
-                     metrics_collections=None,
-                     updates_collections=None):
-  """Sums the weights of cases where the given values are True.
-
-  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
-
-  Args:
-    values: A `bool` `Tensor` of arbitrary size.
-    weights: Optional `Tensor` whose rank is either 0, or the same rank as
-      `values`, and must be broadcastable to `values` (i.e., all dimensions
-      must be either `1`, or the same as the corresponding `values`
-      dimension).
-    metrics_collections: An optional list of collections that the metric
-      value variable should be added to.
-    updates_collections: An optional list of collections that the metric update
-      ops should be added to.
-
-  Returns:
-    value_tensor: A `Tensor` representing the current value of the metric.
-    update_op: An operation that accumulates the error from a batch of data.
-
-  Raises:
-    ValueError: If `weights` is not `None` and its shape doesn't match `values`,
-      or if either `metrics_collections` or `updates_collections` are not a list
-      or tuple.
-  """
-  check_ops.assert_type(values, dtypes.bool)
-  count_ = _create_local('count', shape=[])
-
-  values = math_ops.to_float(values)
-  if weights is not None:
-    weights = math_ops.to_float(weights)
-    with ops.control_dependencies((_assert_weights_rank(weights, values),)):
-      values = math_ops.multiply(values, weights)
-
-  value_tensor = array_ops.identity(count_)
-  update_op = state_ops.assign_add(count_, math_ops.reduce_sum(values))
-
-  if metrics_collections:
-    ops.add_to_collections(metrics_collections, value_tensor)
-
-  if updates_collections:
-    ops.add_to_collections(updates_collections, update_op)
-
-  return value_tensor, update_op
-
-
 def streaming_true_positives(predictions,
                              labels,
                              weights=None,
@@ -219,17 +139,13 @@ def streaming_true_negatives(predictions,
       either `metrics_collections` or `updates_collections` are not a list or
       tuple.
   """
-  with variable_scope.variable_scope(name, 'true_negatives',
-                                     (predictions, labels, weights)):
-
-    predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
-        predictions=math_ops.cast(predictions, dtype=dtypes.bool),
-        labels=math_ops.cast(labels, dtype=dtypes.bool),
-        weights=weights)
-    is_true_negative = math_ops.logical_and(
-        math_ops.equal(labels, False), math_ops.equal(predictions, False))
-    return _count_condition(is_true_negative, weights, metrics_collections,
-                            updates_collections)
+  return metrics.true_negatives(
+      predictions=predictions,
+      labels=labels,
+      weights=weights,
+      metrics_collections=metrics_collections,
+      updates_collections=updates_collections,
+      name=name)
 
 
 def streaming_false_positives(predictions,
@@ -319,34 +235,6 @@ def streaming_false_negatives(predictions,
       name=name)
 
 
-# TODO(ptucker): Move this somewhere common, to share with ops/losses/losses.py.
-def _broadcast_weights(weights, values):
-  """Broadcast `weights` to the same shape as `values`.
-
-  This returns a version of `weights` following the same broadcast rules as
-  `mul(weights, values)`. When computing a weighted average, use this function
-  to broadcast `weights` before summing them; e.g.,
-  `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`.
-
-  Args:
-    weights: `Tensor` whose rank is either 0, or the same rank as `values`, and
-      must be broadcastable to `values` (i.e., all dimensions must be either
-      `1`, or the same as the corresponding `values` dimension).
-    values: `Tensor` of any shape.
-
-  Returns:
-    `weights` broadcast to `values` shape.
-  """
-  with ops.name_scope(None, 'broadcast_weights', (values, weights)) as scope:
-    weights_shape = weights.get_shape()
-    values_shape = values.get_shape()
-    if (weights_shape.is_fully_defined() and values_shape.is_fully_defined() and
-        weights_shape.is_compatible_with(values_shape)):
-      return weights
-    with ops.control_dependencies((_assert_weights_rank(weights, values),)):
-      return math_ops.multiply(weights, array_ops.ones_like(values), name=scope)
-
-
 def streaming_mean(values,
                    weights=None,
                    metrics_collections=None,
@@ -448,8 +336,10 @@ def streaming_mean_tensor(values,
       updates_collections=updates_collections,
       name=name)
 
-@deprecated(None, "Please switch to tf.metrics.accuracy. Note that the order "
-    "of the inputs of labels and predictions have been switched.")
+
+@deprecated(
+    None, 'Please switch to tf.metrics.accuracy. Note that the order of the '
+    'labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -617,53 +507,6 @@ def streaming_recall(predictions,
       name=name)
 
 
-def _true_negatives(labels,
-                    predictions,
-                    weights=None,
-                    metrics_collections=None,
-                    updates_collections=None,
-                    name=None):
-  """Sum the weights of true negatives.
-
-  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
-
-  Args:
-    labels: The ground truth values, a `Tensor` whose dimensions must match
-      `predictions`. Will be cast to `bool`.
-    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
-      be cast to `bool`.
-    weights: Optional `Tensor` whose rank is either 0, or the same rank as
-      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
-      be either `1`, or the same as the corresponding `labels` dimension).
-    metrics_collections: An optional list of collections that the metric
-      value variable should be added to.
-    updates_collections: An optional list of collections that the metric update
-      ops should be added to.
-    name: An optional variable_scope name.
-
-  Returns:
-    value_tensor: A `Tensor` representing the current value of the metric.
-    update_op: An operation that accumulates the error from a batch of data.
-
-  Raises:
-    ValueError: If `predictions` and `labels` have mismatched shapes, or if
-      `weights` is not `None` and its shape doesn't match `predictions`, or if
-      either `metrics_collections` or `updates_collections` are not a list or
-      tuple.
-  """
-  with variable_scope.variable_scope(name, 'true_negatives',
-                                     (predictions, labels, weights)):
-
-    predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
-        predictions=math_ops.cast(predictions, dtype=dtypes.bool),
-        labels=math_ops.cast(labels, dtype=dtypes.bool),
-        weights=weights)
-    is_true_negative = math_ops.logical_and(
-        math_ops.equal(labels, False), math_ops.equal(predictions, False))
-    return _count_condition(is_true_negative, weights, metrics_collections,
-                            updates_collections)
-
-
 def streaming_false_positive_rate(predictions,
                                   labels,
                                   weights=None,
@@ -721,16 +564,16 @@ def streaming_false_positive_rate(predictions,
         weights=weights)
 
     false_p, false_positives_update_op = metrics.false_positives(
-        labels,
-        predictions,
-        weights,
+        labels=labels,
+        predictions=predictions,
+        weights=weights,
         metrics_collections=None,
         updates_collections=None,
         name=None)
-    true_n, true_negatives_update_op = _true_negatives(
-        labels,
-        predictions,
-        weights,
+    true_n, true_negatives_update_op = metrics.true_negatives(
+        labels=labels,
+        predictions=predictions,
+        weights=weights,
         metrics_collections=None,
         updates_collections=None,
         name=None)
@@ -943,7 +786,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
   update_ops = {}
 
   if 'tp' in includes:
-    true_positives = _create_local('true_positives', shape=[num_thresholds])
+    true_positives = metrics_impl.metric_variable(
+        [num_thresholds], dtypes.float32, name='true_positives')
     is_true_positive = math_ops.to_float(
         math_ops.logical_and(label_is_pos, pred_is_pos))
     if weights_tiled is not None:
@@ -954,7 +798,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
     values['tp'] = true_positives
 
   if 'fn' in includes:
-    false_negatives = _create_local('false_negatives', shape=[num_thresholds])
+    false_negatives = metrics_impl.metric_variable(
+        [num_thresholds], dtypes.float32, name='false_negatives')
     is_false_negative = math_ops.to_float(
         math_ops.logical_and(label_is_pos, pred_is_neg))
     if weights_tiled is not None:
@@ -965,7 +810,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
     values['fn'] = false_negatives
 
   if 'tn' in includes:
-    true_negatives = _create_local('true_negatives', shape=[num_thresholds])
+    true_negatives = metrics_impl.metric_variable(
+        [num_thresholds], dtypes.float32, name='true_negatives')
     is_true_negative = math_ops.to_float(
         math_ops.logical_and(label_is_neg, pred_is_neg))
     if weights_tiled is not None:
@@ -976,7 +822,8 @@ def _streaming_confusion_matrix_at_thresholds(predictions,
     values['tn'] = true_negatives
 
   if 'fp' in includes:
-    false_positives = _create_local('false_positives', shape=[num_thresholds])
+    false_positives = metrics_impl.metric_variable(
+        [num_thresholds], dtypes.float32, name='false_positives')
     is_false_positive = math_ops.to_float(
         math_ops.logical_and(label_is_neg, pred_is_pos))
     if weights_tiled is not None:
@@ -1080,13 +927,13 @@ def streaming_curve_points(labels=None,
       tuple.
 
   TODO(chizeng): Consider rewriting this method to make use of logic within the
-  streaming_precision_recall_at_equal_thresholds method (to improve run time).
+  precision_recall_at_equal_thresholds method (to improve run time).
   """
   with variable_scope.variable_scope(name, 'curve_points',
                                      (labels, predictions, weights)):
     if curve != 'ROC' and curve != 'PR':
       raise ValueError('curve must be either ROC or PR, %s unknown' % (curve))
-    kepsilon = 1e-7  # to account for floating point imprecisions
+    kepsilon = _EPSILON  # to account for floating point imprecisions
     thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
                   for i in range(num_thresholds - 2)]
     thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
@@ -1123,8 +970,10 @@ def streaming_curve_points(labels=None,
 
     return points, update_op
 
-@deprecated(None, "Please switch to tf.metrics.auc. Note that the order of "
-    "the inputs of labels and predictions have been switched.")
+
+@deprecated(
+    None, 'Please switch to tf.metrics.auc. Note that the order of the '
+    'labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1199,12 +1048,160 @@ def streaming_auc(predictions,
       name=name)
 
 
-def streaming_precision_recall_at_equal_thresholds(predictions,
-                                                   labels,
-                                                   num_thresholds=None,
-                                                   weights=None,
-                                                   name=None,
-                                                   use_locking=None):
+def _compute_dynamic_auc(labels, predictions, curve='ROC'):
+  """Computes the apporixmate AUC by a Riemann sum with data-derived thresholds.
+
+  Computes the area under the ROC or PR curve using each prediction as a
+  threshold. This could be slow for large batches, but has the advantage of not
+  having its results degrade depending on the distribution of predictions.
+
+  Args:
+    labels: A `Tensor` of ground truth labels with the same shape as
+      `predictions` with values of 0 or 1 and type `int64`.
+    predictions: A 1-D `Tensor` of predictions whose values are `float64`.
+    curve: The name of the curve to be computed, 'ROC' for the Receiving
+      Operating Characteristic or 'PR' for the Precision-Recall curve.
+
+  Returns:
+    A scalar `Tensor` containing the area-under-curve value for the input.
+  """
+  # Count the total number of positive and negative labels in the input.
+  size = array_ops.size(predictions)
+  total_positive = math_ops.cast(math_ops.reduce_sum(labels), dtypes.int32)
+
+  def continue_computing_dynamic_auc():
+    """Continues dynamic auc computation, entered if labels are not all equal.
+
+    Returns:
+      A scalar `Tensor` containing the area-under-curve value.
+    """
+    # Sort the predictions descending, and the corresponding labels as well.
+    ordered_predictions, indices = nn.top_k(predictions, k=size)
+    ordered_labels = array_ops.gather(labels, indices)
+
+    # Get the counts of the unique ordered predictions.
+    _, _, counts = array_ops.unique_with_counts(ordered_predictions)
+
+    # Compute the indices of the split points between different predictions.
+    splits = math_ops.cast(
+        array_ops.pad(math_ops.cumsum(counts), paddings=[[1, 0]]), dtypes.int32)
+
+    # Count the positives to the left of the split indices.
+    positives = math_ops.cast(
+        array_ops.pad(math_ops.cumsum(ordered_labels), paddings=[[1, 0]]),
+        dtypes.int32)
+    true_positives = array_ops.gather(positives, splits)
+    if curve == 'ROC':
+      # Count the negatives to the left of every split point and the total
+      # number of negatives for computing the FPR.
+      false_positives = math_ops.subtract(splits, true_positives)
+      total_negative = size - total_positive
+      x_axis_values = math_ops.truediv(false_positives, total_negative)
+      y_axis_values = math_ops.truediv(true_positives, total_positive)
+    elif curve == 'PR':
+      x_axis_values = math_ops.truediv(true_positives, total_positive)
+      # For conformance, set precision to 1 when the number of positive
+      # classifications is 0.
+      y_axis_values = array_ops.where(
+          math_ops.greater(splits, 0),
+          math_ops.truediv(true_positives, splits),
+          array_ops.ones_like(true_positives, dtype=dtypes.float64))
+
+    # Calculate trapezoid areas.
+    heights = math_ops.add(y_axis_values[1:], y_axis_values[:-1]) / 2.0
+    widths = math_ops.abs(
+        math_ops.subtract(x_axis_values[1:], x_axis_values[:-1]))
+    return math_ops.reduce_sum(math_ops.multiply(heights, widths))
+
+  # If all the labels are the same, AUC isn't well-defined (but raising an
+  # exception seems excessive) so we return 0, otherwise we finish computing.
+  return control_flow_ops.cond(
+      math_ops.logical_or(
+          math_ops.equal(total_positive, 0),
+          math_ops.equal(total_positive, size)
+      ),
+      true_fn=lambda: array_ops.constant(0, dtypes.float64),
+      false_fn=continue_computing_dynamic_auc)
+
+
+def streaming_dynamic_auc(labels,
+                          predictions,
+                          curve='ROC',
+                          metrics_collections=(),
+                          updates_collections=(),
+                          name=None):
+  """Computes the apporixmate AUC by a Riemann sum with data-derived thresholds.
+
+  USAGE NOTE: this approach requires storing all of the predictions and labels
+  for a single evaluation in memory, so it may not be usable when the evaluation
+  batch size and/or the number of evaluation steps is very large.
+
+  Computes the area under the ROC or PR curve using each prediction as a
+  threshold. This has the advantage of being resilient to the distribution of
+  predictions by aggregating across batches, accumulating labels and predictions
+  and performing the final calculation using all of the concatenated values.
+
+  Args:
+    labels: A `Tensor` of ground truth labels with the same shape as `labels`
+      and with values of 0 or 1 whose values are castable to `int64`.
+    predictions: A `Tensor` of predictions whose values are castable to
+      `float64`. Will be flattened into a 1-D `Tensor`.
+    curve: The name of the curve for which to compute AUC, 'ROC' for the
+      Receiving Operating Characteristic or 'PR' for the Precision-Recall curve.
+    metrics_collections: An optional iterable of collections that `auc` should
+      be added to.
+    updates_collections: An optional iterable of collections that `update_op`
+      should be added to.
+    name: An optional name for the variable_scope that contains the metric
+      variables.
+
+  Returns:
+    auc: A scalar `Tensor` containing the current area-under-curve value.
+    update_op: An operation that concatenates the input labels and predictions
+      to the accumulated values.
+
+  Raises:
+    ValueError: If `labels` and `predictions` have mismatched shapes or if
+      `curve` isn't a recognized curve type.
+  """
+
+  if curve not in ['PR', 'ROC']:
+    raise ValueError('curve must be either ROC or PR, %s unknown' % curve)
+
+  with variable_scope.variable_scope(name, default_name='dynamic_auc'):
+    labels.get_shape().assert_is_compatible_with(predictions.get_shape())
+    predictions = array_ops.reshape(
+        math_ops.cast(predictions, dtypes.float64), [-1])
+    labels = array_ops.reshape(math_ops.cast(labels, dtypes.int64), [-1])
+    with ops.control_dependencies([
+        check_ops.assert_greater_equal(
+            labels,
+            array_ops.zeros_like(labels, dtypes.int64),
+            message='labels must be 0 or 1, at least one is <0'),
+        check_ops.assert_less_equal(
+            labels,
+            array_ops.ones_like(labels, dtypes.int64),
+            message='labels must be 0 or 1, at least one is >1')
+    ]):
+      preds_accum, update_preds = streaming_concat(predictions,
+                                                   name='concat_preds')
+      labels_accum, update_labels = streaming_concat(labels,
+                                                     name='concat_labels')
+      update_op = control_flow_ops.group(update_labels, update_preds)
+      auc = _compute_dynamic_auc(labels_accum, preds_accum, curve=curve)
+      if updates_collections:
+        ops.add_to_collections(updates_collections, update_op)
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, auc)
+      return auc, update_op
+
+
+def precision_recall_at_equal_thresholds(labels,
+                                         predictions,
+                                         weights=None,
+                                         num_thresholds=None,
+                                         use_locking=None,
+                                         name=None):
   """A helper method for creating metrics related to precision-recall curves.
 
   These values are true positives, false negatives, true negatives, false
@@ -1225,20 +1222,20 @@ def streaming_precision_recall_at_equal_thresholds(predictions,
   reweight certain values, or more commonly used for masking values.
 
   Args:
+    labels: A bool `Tensor` whose shape matches `predictions`.
     predictions: A floating point `Tensor` of arbitrary shape and whose values
       are in the range `[0, 1]`.
-    labels: A bool `Tensor` whose shape matches `predictions`.
+    weights: Optional; If provided, a `Tensor` that has the same dtype as,
+      and broadcastable to, `predictions`. This tensor is multplied by counts.
     num_thresholds: Optional; Number of thresholds, evenly distributed in
       `[0, 1]`. Should be `>= 2`. Defaults to 201. Note that the number of bins
       is 1 less than `num_thresholds`. Using an even `num_thresholds` value
       instead of an odd one may yield unfriendly edges for bins.
-    weights: Optional; If provided, a `Tensor` that has the same dtype as,
-      and broadcastable to, `predictions`. This tensor is multplied by counts.
-    name: Optional; variable_scope name. If not provided, the string
-      'precision_recall_at_equal_threshold' is used.
     use_locking: Optional; If True, the op will be protected by a lock.
       Otherwise, the behavior is undefined, but may exhibit less contention.
       Defaults to True.
+    name: Optional; variable_scope name. If not provided, the string
+      'precision_recall_at_equal_threshold' is used.
 
   Returns:
     result: A named tuple (See PrecisionRecallData within the implementation of
@@ -1337,10 +1334,10 @@ def streaming_precision_recall_at_equal_thresholds(predictions,
         math_ops.floor(predictions * (num_thresholds - 1)), dtypes.int32)
 
     with ops.name_scope('variables'):
-      tp_buckets_v = _create_local(
-          'tp_buckets', shape=[num_thresholds], dtype=dtype)
-      fp_buckets_v = _create_local(
-          'fp_buckets', shape=[num_thresholds], dtype=dtype)
+      tp_buckets_v = metrics_impl.metric_variable(
+          [num_thresholds], dtype, name='tp_buckets')
+      fp_buckets_v = metrics_impl.metric_variable(
+          [num_thresholds], dtype, name='fp_buckets')
 
     with ops.name_scope('update_op'):
       update_tp = state_ops.scatter_add(
@@ -1509,9 +1506,10 @@ def streaming_sensitivity_at_specificity(predictions,
       updates_collections=updates_collections,
       name=name)
 
+
 @deprecated(
-    None, "Please switch to tf.metrics.precision_at_thresholds. Note that the "
-    "order of of the inputs of labels and predictions have been switched.")
+    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_precision_at_thresholds(predictions,
                                       labels,
                                       thresholds,
@@ -1570,9 +1568,10 @@ def streaming_precision_at_thresholds(predictions,
       updates_collections=updates_collections,
       name=name)
 
+
 @deprecated(
-    None, "Please switch to tf.metrics.recall_at_thresholds. Note that the "
-    "order of of the inputs of labels and predictions have been switched.")
+    None, 'Please switch to tf.metrics.recall_at_thresholds. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_recall_at_thresholds(predictions,
                                    labels,
                                    thresholds,
@@ -1684,7 +1683,7 @@ def streaming_false_positive_rate_at_thresholds(predictions,
         predictions, labels, thresholds, weights, includes=('fp', 'tn'))
 
     # Avoid division by zero.
-    epsilon = 1e-7
+    epsilon = _EPSILON
 
     def compute_fpr(fp, tn, name):
       return math_ops.div(fp, epsilon + fp + tn, name='fpr_' + name)
@@ -1755,7 +1754,7 @@ def streaming_false_negative_rate_at_thresholds(predictions,
         predictions, labels, thresholds, weights, includes=('fn', 'tp'))
 
     # Avoid division by zero.
-    epsilon = 1e-7
+    epsilon = _EPSILON
 
     def compute_fnr(fn, tp, name):
       return math_ops.div(fn, epsilon + fn + tp, name='fnr_' + name)
@@ -1782,8 +1781,8 @@ def _at_k_name(name, k=None, class_id=None):
   return name
 
 
-@deprecated("2016-11-08", "Please use `streaming_sparse_recall_at_k`, "
-            "and reshape labels from [batch_size] to [batch_size, 1].")
+@deprecated('2016-11-08', 'Please use `streaming_sparse_recall_at_k`, '
+            'and reshape labels from [batch_size] to [batch_size, 1].')
 def streaming_recall_at_k(predictions,
                           labels,
                           k,
@@ -2003,7 +2002,7 @@ def streaming_sparse_precision_at_k(predictions,
       `predictions`, or if either `metrics_collections` or `updates_collections`
       are not a list or tuple.
   """
-  return metrics.sparse_precision_at_k(
+  return metrics.precision_at_k(
       k=k,
       class_id=class_id,
       predictions=predictions,
@@ -2173,7 +2172,7 @@ def sparse_recall_at_top_k(labels,
   default_name = _at_k_name('recall', class_id=class_id)
   with ops.name_scope(name, default_name,
                       (top_k_predictions, labels, weights)) as name_scope:
-    return metrics_impl._sparse_recall_at_top_k(  # pylint: disable=protected-access
+    return metrics_impl.recall_at_top_k(
         labels=labels,
         predictions_idx=top_k_predictions,
         class_id=class_id,
@@ -2183,6 +2182,109 @@ def sparse_recall_at_top_k(labels,
         name=name_scope)
 
 
+def _compute_recall_at_precision(tp, fp, fn, precision, name):
+  """Helper function to compute recall at a given `precision`.
+
+  Args:
+    tp: The number of true positives.
+    fp: The number of false positives.
+    fn: The number of false negatives.
+    precision: The precision for which the recall will be calculated.
+    name: An optional variable_scope name.
+
+  Returns:
+    The recall at a the given `precision`.
+  """
+  precisions = math_ops.div(tp, tp + fp + _EPSILON)
+  tf_index = math_ops.argmin(
+      math_ops.abs(precisions - precision), 0, output_type=dtypes.int32)
+
+  # Now, we have the implicit threshold, so compute the recall:
+  return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + _EPSILON,
+                      name)
+
+
+def recall_at_precision(labels,
+                        predictions,
+                        precision,
+                        weights=None,
+                        num_thresholds=200,
+                        metrics_collections=None,
+                        updates_collections=None,
+                        name=None):
+  """Computes `recall` at `precision`.
+
+  The `recall_at_precision` function creates four local variables,
+  `tp` (true positives), `fp` (false positives) and `fn` (false negatives)
+  that are used to compute the `recall` at the given `precision` value. The
+  threshold for the given `precision` value is computed and used to evaluate the
+  corresponding `recall`.
+
+  For estimation of the metric over a stream of data, the function creates an
+  `update_op` operation that updates these variables and returns the
+  `recall`. `update_op` increments the `tp`, `fp` and `fn` counts with the
+  weight of each case found in the `predictions` and `labels`.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: A floating point `Tensor` of arbitrary shape and whose values
+      are in the range `[0, 1]`.
+    precision: A scalar value in range `[0, 1]`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
+    num_thresholds: The number of thresholds to use for matching the given
+      `precision`.
+    metrics_collections: An optional list of collections that `recall`
+      should be added to.
+    updates_collections: An optional list of collections that `update_op` should
+      be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    recall: A scalar `Tensor` representing the recall at the given
+      `precision` value.
+    update_op: An operation that increments the `tp`, `fp` and `fn`
+      variables appropriately and whose value matches `recall`.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      `precision` is not between 0 and 1, or if either `metrics_collections`
+      or `updates_collections` are not a list or tuple.
+
+  """
+  if not 0 <= precision <= 1:
+    raise ValueError('`precision` must be in the range [0, 1].')
+
+  with variable_scope.variable_scope(name, 'recall_at_precision',
+                                     (predictions, labels, weights)):
+    thresholds = [
+        i * 1.0 / (num_thresholds - 1) for i in range(1, num_thresholds - 1)
+    ]
+    thresholds = [0.0 - _EPSILON] + thresholds + [1.0 + _EPSILON]
+
+    values, update_ops = _streaming_confusion_matrix_at_thresholds(
+        predictions, labels, thresholds, weights)
+
+    recall = _compute_recall_at_precision(values['tp'], values['fp'],
+                                          values['fn'], precision, 'value')
+    update_op = _compute_recall_at_precision(update_ops['tp'], update_ops['fp'],
+                                             update_ops['fn'], precision,
+                                             'update_op')
+
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, recall)
+
+    if updates_collections:
+      ops.add_to_collections(updates_collections, update_op)
+
+    return recall, update_op
+
+
 def streaming_sparse_average_precision_at_k(predictions,
                                             labels,
                                             k,
@@ -2241,7 +2343,7 @@ def streaming_sparse_average_precision_at_k(predictions,
     update: `Operation` that increments variables appropriately, and whose
       value matches `metric`.
   """
-  return metrics.sparse_average_precision_at_k(
+  return metrics.average_precision_at_k(
       k=k,
       predictions=predictions,
       labels=labels,
@@ -2313,7 +2415,8 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       updates_collections=updates_collections,
       name=name)
 
-@deprecated(None, "Please switch to tf.metrics.mean.")
+
+@deprecated(None, 'Please switch to tf.metrics.mean.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2607,10 +2710,13 @@ def streaming_covariance(predictions,
     predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
         predictions, labels, weights)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
-    count_ = _create_local('count', [])
-    mean_prediction = _create_local('mean_prediction', [])
-    mean_label = _create_local('mean_label', [])
-    comoment = _create_local('comoment', [])  # C_A in update equation
+    count_ = metrics_impl.metric_variable([], dtypes.float32, name='count')
+    mean_prediction = metrics_impl.metric_variable(
+        [], dtypes.float32, name='mean_prediction')
+    mean_label = metrics_impl.metric_variable(
+        [], dtypes.float32, name='mean_label')
+    comoment = metrics_impl.metric_variable(  # C_A in update equation
+        [], dtypes.float32, name='comoment')
 
     if weights is None:
       batch_count = math_ops.to_float(array_ops.size(labels))  # n_B in eqn
@@ -3030,9 +3136,9 @@ def streaming_concat(values,
     # applied to contiguous slices
     init_size = 0 if max_size is None else max_size
     init_shape = [init_size] + fixed_shape
-    array = _create_local(
-        'array', shape=init_shape, validate_shape=False, dtype=values.dtype)
-    size = _create_local('size', shape=[], dtype=dtypes.int32)
+    array = metrics_impl.metric_variable(
+        init_shape, values.dtype, validate_shape=False, name='array')
+    size = metrics_impl.metric_variable([], dtypes.int32, name='size')
 
     perm = [0 if n == axis else n + 1 if n < axis else n for n in range(ndim)]
     valid_array = array[:size]
@@ -3166,7 +3272,7 @@ def count(values,
   """
 
   with variable_scope.variable_scope(name, 'count', (values, weights)):
-    count_ = _create_local('count', shape=[])
+    count_ = metrics_impl.metric_variable([], dtypes.float32, name='count')
 
     if weights is None:
       num_values = math_ops.to_float(array_ops.size(values))
@@ -3195,10 +3301,13 @@ __all__ = [
     'aggregate_metric_map',
     'aggregate_metrics',
     'count',
+    'precision_recall_at_equal_thresholds',
+    'recall_at_precision',
     'sparse_recall_at_top_k',
     'streaming_accuracy',
     'streaming_auc',
     'streaming_curve_points',
+    'streaming_dynamic_auc',
     'streaming_false_negative_rate',
     'streaming_false_negative_rate_at_thresholds',
     'streaming_false_negatives',
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 6a8284786f592b2fe840e3c68099fecc93dc91c6..f05ae394e6b46809f9f3f963733076f1a3933059 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -149,9 +149,12 @@ def _assert_nan(test_case, actual):
   test_case.assertTrue(math.isnan(actual), 'Expected NAN, got %s.' % actual)
 
 
-def _assert_local_variables(test_case, expected):
+def _assert_metric_variables(test_case, expected):
   test_case.assertEquals(
       set(expected), set(v.name for v in variables.local_variables()))
+  test_case.assertEquals(
+      set(expected),
+      set(v.name for v in ops.get_collection(ops.GraphKeys.METRIC_VARIABLES)))
 
 
 class StreamingMeanTest(test.TestCase):
@@ -161,7 +164,7 @@ class StreamingMeanTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_mean(array_ops.ones([4, 3]))
-    _assert_local_variables(self, ('mean/count:0', 'mean/total:0'))
+    _assert_metric_variables(self, ('mean/count:0', 'mean/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -319,8 +322,8 @@ class StreamingMeanTensorTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_mean_tensor(array_ops.ones([4, 3]))
-    _assert_local_variables(self, ('mean/total_tensor:0',
-                                   'mean/count_tensor:0'))
+    _assert_metric_variables(self,
+                             ('mean/total_tensor:0', 'mean/count_tensor:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -485,8 +488,8 @@ class StreamingAccuracyTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         name='my_accuracy')
-    _assert_local_variables(self, ('my_accuracy/count:0',
-                                   'my_accuracy/total:0'))
+    _assert_metric_variables(self,
+                             ('my_accuracy/count:0', 'my_accuracy/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -660,7 +663,7 @@ class StreamingTruePositivesTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_true_positives((0, 1, 0), (0, 1, 1))
-    _assert_local_variables(self, ('true_positives/count:0',))
+    _assert_metric_variables(self, ('true_positives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
@@ -716,7 +719,7 @@ class StreamingFalseNegativesTest(test.TestCase):
   def testVars(self):
     metrics.streaming_false_negatives((0, 1, 0),
                                       (0, 1, 1))
-    _assert_local_variables(self, ('false_negatives/count:0',))
+    _assert_metric_variables(self, ('false_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
@@ -772,7 +775,7 @@ class StreamingFalsePositivesTest(test.TestCase):
   def testVars(self):
     metrics.streaming_false_positives((0, 1, 0),
                                       (0, 1, 1))
-    _assert_local_variables(self, ('false_positives/count:0',))
+    _assert_metric_variables(self, ('false_positives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
@@ -832,7 +835,7 @@ class StreamingTrueNegativesTest(test.TestCase):
   def testVars(self):
     metrics.streaming_true_negatives((0, 1, 0),
                                      (0, 1, 1))
-    _assert_local_variables(self, ('true_negatives/count:0',))
+    _assert_metric_variables(self, ('true_negatives/count:0',))
 
   def testUnweighted(self):
     for expand_predictions in [True, False]:
@@ -888,7 +891,7 @@ class StreamingTruePositivesAtThresholdsTest(test.TestCase):
   def testVars(self):
     metrics.streaming_true_positives_at_thresholds(
         (0.0, 1.0, 0.0), (0, 1, 1), thresholds=(0.15, 0.5, 0.85))
-    _assert_local_variables(self, ('true_positives:0',))
+    _assert_metric_variables(self, ('true_positives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -935,7 +938,7 @@ class StreamingFalseNegativesAtThresholdsTest(test.TestCase):
             0.15,
             0.5,
             0.85,))
-    _assert_local_variables(self, ('false_negatives:0',))
+    _assert_metric_variables(self, ('false_negatives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -982,7 +985,7 @@ class StreamingFalsePositivesAtThresholdsTest(test.TestCase):
   def testVars(self):
     metrics.streaming_false_positives_at_thresholds(
         (0.0, 1.0, 0.0), (0, 1, 1), thresholds=(0.15, 0.5, 0.85))
-    _assert_local_variables(self, ('false_positives:0',))
+    _assert_metric_variables(self, ('false_positives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -1031,7 +1034,7 @@ class StreamingTrueNegativesAtThresholdsTest(test.TestCase):
   def testVars(self):
     metrics.streaming_true_negatives_at_thresholds(
         (0.0, 1.0, 0.0), (0, 1, 1), thresholds=(0.15, 0.5, 0.85))
-    _assert_local_variables(self, ('true_negatives:0',))
+    _assert_metric_variables(self, ('true_negatives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -1078,8 +1081,8 @@ class StreamingPrecisionTest(test.TestCase):
   def testVars(self):
     metrics.streaming_precision(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('precision/false_positives/count:0',
-                                   'precision/true_positives/count:0'))
+    _assert_metric_variables(self, ('precision/false_positives/count:0',
+                                    'precision/true_positives/count:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1242,8 +1245,9 @@ class StreamingRecallTest(test.TestCase):
   def testVars(self):
     metrics.streaming_recall(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('recall/false_negatives/count:0',
-                                   'recall/true_positives/count:0'))
+    _assert_metric_variables(
+        self,
+        ('recall/false_negatives/count:0', 'recall/true_positives/count:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1364,9 +1368,9 @@ class StreamingFPRTest(test.TestCase):
   def testVars(self):
     metrics.streaming_false_positive_rate(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, (
-        'false_positive_rate/false_positives/count:0',
-        'false_positive_rate/true_negatives/count:0'))
+    _assert_metric_variables(self,
+                             ('false_positive_rate/false_positives/count:0',
+                              'false_positive_rate/true_negatives/count:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1492,9 +1496,9 @@ class StreamingFNRTest(test.TestCase):
   def testVars(self):
     metrics.streaming_false_negative_rate(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, (
-        'false_negative_rate/false_negatives/count:0',
-        'false_negative_rate/true_positives/count:0'))
+    _assert_metric_variables(self,
+                             ('false_negative_rate/false_negatives/count:0',
+                              'false_negative_rate/true_positives/count:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1620,7 +1624,7 @@ class StreamingCurvePointsTest(test.TestCase):
   def testVars(self):
     metric_ops.streaming_curve_points(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(
+    _assert_metric_variables(
         self,
         ('curve_points/true_positives:0', 'curve_points/false_negatives:0',
          'curve_points/false_positives:0', 'curve_points/true_negatives:0'))
@@ -1704,6 +1708,34 @@ class StreamingCurvePointsTest(test.TestCase):
                    [[1.0, 4.0 / 6.0], [0.75, 1.0], [0.0, 1.0]])
 
 
+def _np_auc(predictions, labels, weights=None):
+  """Computes the AUC explicitly using Numpy.
+
+  Args:
+    predictions: an ndarray with shape [N].
+    labels: an ndarray with shape [N].
+    weights: an ndarray with shape [N].
+
+  Returns:
+    the area under the ROC curve.
+  """
+  if weights is None:
+    weights = np.ones(np.size(predictions))
+  is_positive = labels > 0
+  num_positives = np.sum(weights[is_positive])
+  num_negatives = np.sum(weights[~is_positive])
+
+  # Sort descending:
+  inds = np.argsort(-predictions)
+
+  sorted_labels = labels[inds]
+  sorted_weights = weights[inds]
+  is_positive = sorted_labels > 0
+
+  tp = np.cumsum(sorted_weights * is_positive) / num_positives
+  return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
+
+
 class StreamingAUCTest(test.TestCase):
 
   def setUp(self):
@@ -1713,9 +1745,9 @@ class StreamingAUCTest(test.TestCase):
   def testVars(self):
     metrics.streaming_auc(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self,
-                            ('auc/true_positives:0', 'auc/false_negatives:0',
-                             'auc/false_positives:0', 'auc/true_negatives:0'))
+    _assert_metric_variables(self,
+                             ('auc/true_positives:0', 'auc/false_negatives:0',
+                              'auc/false_positives:0', 'auc/true_negatives:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1892,33 +1924,6 @@ class StreamingAUCTest(test.TestCase):
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
-  def np_auc(self, predictions, labels, weights):
-    """Computes the AUC explicitly using Numpy.
-
-    Args:
-      predictions: an ndarray with shape [N].
-      labels: an ndarray with shape [N].
-      weights: an ndarray with shape [N].
-
-    Returns:
-      the area under the ROC curve.
-    """
-    if weights is None:
-      weights = np.ones(np.size(predictions))
-    is_positive = labels > 0
-    num_positives = np.sum(weights[is_positive])
-    num_negatives = np.sum(weights[~is_positive])
-
-    # Sort descending:
-    inds = np.argsort(-predictions)
-
-    sorted_labels = labels[inds]
-    sorted_weights = weights[inds]
-    is_positive = sorted_labels > 0
-
-    tp = np.cumsum(sorted_weights * is_positive) / num_positives
-    return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
-
   def testWithMultipleUpdates(self):
     num_samples = 1000
     batch_size = 10
@@ -1941,7 +1946,7 @@ class StreamingAUCTest(test.TestCase):
 
     for weights in (None, np.ones(num_samples), np.random.exponential(
         scale=1.0, size=num_samples)):
-      expected_auc = self.np_auc(predictions, labels, weights)
+      expected_auc = _np_auc(predictions, labels, weights)
 
       with self.test_session() as sess:
         enqueue_ops = [[] for i in range(num_batches)]
@@ -1970,6 +1975,211 @@ class StreamingAUCTest(test.TestCase):
         self.assertAlmostEqual(expected_auc, auc.eval(), 2)
 
 
+class StreamingDynamicAUCTest(test.TestCase):
+
+  def setUp(self):
+    super(StreamingDynamicAUCTest, self).setUp()
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testUnknownCurve(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'curve must be either ROC or PR, TEST_CURVE unknown'):
+      metrics.streaming_dynamic_auc(labels=array_ops.ones((10, 1)),
+                                    predictions=array_ops.ones((10, 1)),
+                                    curve='TEST_CURVE')
+
+  def testVars(self):
+    metrics.streaming_dynamic_auc(
+        labels=array_ops.ones((10, 1)), predictions=array_ops.ones((10, 1)))
+    _assert_metric_variables(self, ['dynamic_auc/concat_labels/array:0',
+                                    'dynamic_auc/concat_labels/size:0',
+                                    'dynamic_auc/concat_preds/array:0',
+                                    'dynamic_auc/concat_preds/size:0'])
+
+  def testMetricsCollection(self):
+    my_collection_name = '__metrics__'
+    auc, _ = metrics.streaming_dynamic_auc(
+        labels=array_ops.ones((10, 1)),
+        predictions=array_ops.ones((10, 1)),
+        metrics_collections=[my_collection_name])
+    self.assertEqual(ops.get_collection(my_collection_name), [auc])
+
+  def testUpdatesCollection(self):
+    my_collection_name = '__updates__'
+    _, update_op = metrics.streaming_dynamic_auc(
+        labels=array_ops.ones((10, 1)),
+        predictions=array_ops.ones((10, 1)),
+        updates_collections=[my_collection_name])
+    self.assertEqual(ops.get_collection(my_collection_name), [update_op])
+
+  def testValueTensorIsIdempotent(self):
+    predictions = random_ops.random_uniform(
+        (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
+    labels = random_ops.random_uniform(
+        (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
+    auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      # Run several updates.
+      for _ in xrange(10):
+        sess.run(update_op)
+      # Then verify idempotency.
+      initial_auc = auc.eval()
+      for _ in xrange(10):
+        self.assertAlmostEqual(initial_auc, auc.eval(), 5)
+
+  def testAllLabelsOnes(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant([1., 1., 1.])
+      labels = constant_op.constant([1, 1, 1])
+      auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertEqual(0, auc.eval())
+
+  def testAllLabelsZeros(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant([1., 1., 1.])
+      labels = constant_op.constant([0, 0, 0])
+      auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertEqual(0, auc.eval())
+
+  def testNonZeroOnePredictions(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant([2.5, -2.5, 2.5, -2.5],
+                                         dtype=dtypes_lib.float32)
+      labels = constant_op.constant([1, 0, 1, 0])
+      auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertAlmostEqual(auc.eval(), 1.0)
+
+  def testAllCorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    with self.test_session() as sess:
+      predictions = constant_op.constant(inputs)
+      labels = constant_op.constant(inputs)
+      auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertEqual(1, auc.eval())
+
+  def testSomeCorrect(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant([1, 0, 1, 0])
+      labels = constant_op.constant([0, 1, 1, 0])
+      auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertAlmostEqual(0.5, auc.eval())
+
+  def testAllIncorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    with self.test_session() as sess:
+      predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+      labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
+      auc, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertAlmostEqual(0, auc.eval())
+
+  def testExceptionOnIncompatibleShapes(self):
+    with self.test_session() as sess:
+      predictions = array_ops.ones([5])
+      labels = array_ops.zeros([6])
+      with self.assertRaisesRegexp(ValueError, 'Shapes .* are incompatible'):
+        _, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+        sess.run(variables.local_variables_initializer())
+        sess.run(update_op)
+
+  def testExceptionOnGreaterThanOneLabel(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant([1, 0.5, 0], dtypes_lib.float32)
+      labels = constant_op.constant([2, 1, 0])
+      _, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          '.*labels must be 0 or 1, at least one is >1.*'):
+        sess.run(update_op)
+
+  def testExceptionOnNegativeLabel(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant([1, 0.5, 0], dtypes_lib.float32)
+      labels = constant_op.constant([1, 0, -1])
+      _, update_op = metrics.streaming_dynamic_auc(labels, predictions)
+      sess.run(variables.local_variables_initializer())
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          '.*labels must be 0 or 1, at least one is <0.*'):
+        sess.run(update_op)
+
+  def testWithMultipleUpdates(self):
+    batch_size = 10
+    num_batches = 100
+    labels = np.array([])
+    predictions = np.array([])
+    tf_labels = variables.Variable(array_ops.ones(batch_size, dtypes_lib.int32),
+                                   collections=[ops.GraphKeys.LOCAL_VARIABLES],
+                                   dtype=dtypes_lib.int32)
+    tf_predictions = variables.Variable(
+        array_ops.ones(batch_size),
+        collections=[ops.GraphKeys.LOCAL_VARIABLES],
+        dtype=dtypes_lib.float32)
+    auc, update_op = metrics.streaming_dynamic_auc(tf_labels, tf_predictions)
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      for _ in xrange(num_batches):
+        new_labels = np.random.randint(0, 2, size=batch_size)
+        noise = np.random.normal(0.0, scale=0.2, size=batch_size)
+        new_predictions = 0.4 + 0.2 * new_labels + noise
+        labels = np.concatenate([labels, new_labels])
+        predictions = np.concatenate([predictions, new_predictions])
+        sess.run(tf_labels.assign(new_labels))
+        sess.run(tf_predictions.assign(new_predictions))
+        sess.run(update_op)
+        expected_auc = _np_auc(predictions, labels)
+        self.assertAlmostEqual(expected_auc, auc.eval())
+
+  def testAUCPRReverseIncreasingPredictions(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8], dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 1])
+      auc, update_op = metrics.streaming_dynamic_auc(
+          labels, predictions, curve='PR')
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-5)
+
+  def testAUCPRJumbledPredictions(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81], dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1])
+      auc, update_op = metrics.streaming_dynamic_auc(
+          labels, predictions, curve='PR')
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-6)
+
+  def testAUCPRPredictionsLessThanHalf(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
+      auc, update_op = metrics.streaming_dynamic_auc(
+          labels, predictions, curve='PR')
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+      self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-5)
+
+
 class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -2008,11 +2218,11 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       if weights:
         weights_tensor = constant_op.constant(weights, dtype=dtypes_lib.float32)
       gotten_result, update_op = (
-          metric_ops.streaming_precision_recall_at_equal_thresholds(
-              predictions=predictions_tensor,
+          metric_ops.precision_recall_at_equal_thresholds(
               labels=labels_tensor,
-              num_thresholds=3,
-              weights=weights_tensor))
+              predictions=predictions_tensor,
+              weights=weights_tensor,
+              num_thresholds=3))
 
       sess.run(variables.local_variables_initializer())
       sess.run(update_op)
@@ -2020,22 +2230,19 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       self._testResultsEqual(expected_result, gotten_result)
 
   def testVars(self):
-    metric_ops.streaming_precision_recall_at_equal_thresholds(
-        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32),
-        labels=constant_op.constant([True], dtype=dtypes_lib.bool))
-    _assert_local_variables(
-        self,
-        (
-            'precision_recall_at_equal_thresholds/variables/tp_buckets:0',
-            'precision_recall_at_equal_thresholds/variables/fp_buckets:0'
-        ))
+    metric_ops.precision_recall_at_equal_thresholds(
+        labels=constant_op.constant([True], dtype=dtypes_lib.bool),
+        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32))
+    _assert_metric_variables(
+        self, ('precision_recall_at_equal_thresholds/variables/tp_buckets:0',
+               'precision_recall_at_equal_thresholds/variables/fp_buckets:0'))
 
   def testVarsWithName(self):
-    metric_ops.streaming_precision_recall_at_equal_thresholds(
-        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32),
+    metric_ops.precision_recall_at_equal_thresholds(
         labels=constant_op.constant([True], dtype=dtypes_lib.bool),
+        predictions=constant_op.constant([0.42], dtype=dtypes_lib.float32),
         name='foo')
-    _assert_local_variables(
+    _assert_metric_variables(
         self, ('foo/variables/tp_buckets:0', 'foo/variables/fp_buckets:0'))
 
   def testValuesAreIdempotent(self):
@@ -2044,9 +2251,8 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
     labels = constant_op.constant(
         np.random.uniform(size=(10, 3)) > 0.5, dtype=dtypes_lib.bool)
 
-    result, update_op = (
-        metric_ops.streaming_precision_recall_at_equal_thresholds(
-            predictions=predictions, labels=labels))
+    result, update_op = metric_ops.precision_recall_at_equal_thresholds(
+        labels=labels, predictions=predictions)
 
     with self.test_session() as sess:
       # Run several updates.
@@ -2145,11 +2351,11 @@ class StreamingSpecificityAtSensitivityTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         sensitivity=0.7)
-    _assert_local_variables(self,
-                            ('specificity_at_sensitivity/true_positives:0',
-                             'specificity_at_sensitivity/false_negatives:0',
-                             'specificity_at_sensitivity/false_positives:0',
-                             'specificity_at_sensitivity/true_negatives:0'))
+    _assert_metric_variables(self,
+                             ('specificity_at_sensitivity/true_positives:0',
+                              'specificity_at_sensitivity/false_negatives:0',
+                              'specificity_at_sensitivity/false_positives:0',
+                              'specificity_at_sensitivity/true_negatives:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2281,11 +2487,11 @@ class StreamingSensitivityAtSpecificityTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         specificity=0.7)
-    _assert_local_variables(self,
-                            ('sensitivity_at_specificity/true_positives:0',
-                             'sensitivity_at_specificity/false_negatives:0',
-                             'sensitivity_at_specificity/false_positives:0',
-                             'sensitivity_at_specificity/true_negatives:0'))
+    _assert_metric_variables(self,
+                             ('sensitivity_at_specificity/true_positives:0',
+                              'sensitivity_at_specificity/false_negatives:0',
+                              'sensitivity_at_specificity/false_positives:0',
+                              'sensitivity_at_specificity/true_negatives:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2398,9 +2604,10 @@ class StreamingPrecisionRecallThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0])
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'precision_at_thresholds/true_positives:0',
-        'precision_at_thresholds/false_positives:0',))
+        'precision_at_thresholds/false_positives:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2693,9 +2900,10 @@ class StreamingFPRThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0])
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'false_positive_rate_at_thresholds/false_positives:0',
-        'false_positive_rate_at_thresholds/true_negatives:0',))
+        'false_positive_rate_at_thresholds/true_negatives:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2914,6 +3122,123 @@ class StreamingFPRThresholdsTest(test.TestCase):
       self.assertAlmostEqual(expected_fpr, fpr.eval(), 2)
 
 
+class RecallAtPrecisionTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.recall_at_precision(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        precision=0.7)
+    _assert_metric_variables(self, ('recall_at_precision/true_positives:0',
+                                    'recall_at_precision/false_negatives:0',
+                                    'recall_at_precision/false_positives:0',
+                                    'recall_at_precision/true_negatives:0'))
+
+  def testMetricsCollection(self):
+    my_collection_name = '__metrics__'
+    mean, _ = metrics.recall_at_precision(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        precision=0.7,
+        metrics_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [mean])
+
+  def testUpdatesCollection(self):
+    my_collection_name = '__updates__'
+    _, update_op = metrics.recall_at_precision(
+        predictions=array_ops.ones((10, 1)),
+        labels=array_ops.ones((10, 1)),
+        precision=0.7,
+        updates_collections=[my_collection_name])
+    self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
+
+  def testValueTensorIsIdempotent(self):
+    predictions = random_ops.random_uniform(
+        (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
+    labels = random_ops.random_uniform(
+        (10, 3), maxval=2, dtype=dtypes_lib.int64, seed=2)
+    recall, update_op = metrics.recall_at_precision(
+        labels, predictions, precision=0.7)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+
+      # Run several updates.
+      for _ in range(10):
+        sess.run(update_op)
+
+      # Then verify idempotency.
+      initial_recall = recall.eval()
+      for _ in range(10):
+        self.assertAlmostEqual(initial_recall, recall.eval(), 5)
+
+  def testAllCorrect(self):
+    inputs = np.random.randint(0, 2, size=(100, 1))
+
+    predictions = constant_op.constant(inputs, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(inputs)
+    recall, update_op = metrics.recall_at_precision(
+        labels, predictions, precision=1.0)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertEqual(1, sess.run(update_op))
+      self.assertEqual(1, recall.eval())
+
+  def testSomeCorrectHighPrecision(self):
+    predictions_values = [1, .9, .8, .7, .6, .5, .4, .3]
+    labels_values = [1, 1, 1, 1, 0, 0, 0, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    recall, update_op = metrics.recall_at_precision(
+        labels, predictions, precision=0.8)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.assertAlmostEqual(0.8, recall.eval())
+
+  def testSomeCorrectLowPrecision(self):
+    predictions_values = [1, .9, .8, .7, .6, .5, .4, .3, .2, .1]
+    labels_values = [1, 1, 0, 0, 0, 0, 0, 0, 0, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    recall, update_op = metrics.recall_at_precision(
+        labels, predictions, precision=0.4)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      target_recall = 2.0 / 3.0
+      self.assertAlmostEqual(target_recall, sess.run(update_op))
+      self.assertAlmostEqual(target_recall, recall.eval())
+
+  def testWeighted(self):
+    predictions_values = [1, .9, .8, .7, .6]
+    labels_values = [1, 1, 0, 0, 1]
+    weights_values = [1, 1, 3, 4, 1]
+
+    predictions = constant_op.constant(
+        predictions_values, dtype=dtypes_lib.float32)
+    labels = constant_op.constant(labels_values)
+    weights = constant_op.constant(weights_values)
+    recall, update_op = metrics.recall_at_precision(
+        labels, predictions, weights=weights, precision=0.4)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      target_recall = 2.0 / 3.0
+      self.assertAlmostEqual(target_recall, sess.run(update_op))
+      self.assertAlmostEqual(target_recall, recall.eval())
+
+
 class StreamingFNRThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -2925,9 +3250,10 @@ class StreamingFNRThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0])
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'false_negative_rate_at_thresholds/false_negatives:0',
-        'false_negative_rate_at_thresholds/true_positives:0',))
+        'false_negative_rate_at_thresholds/true_positives:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -3169,8 +3495,8 @@ class StreamingRecallAtKTest(test.TestCase):
         labels=array_ops.ones(
             (self._batch_size,), dtype=dtypes_lib.int32),
         k=1)
-    _assert_local_variables(self, ('recall_at_1/count:0',
-                                   'recall_at_1/total:0'))
+    _assert_metric_variables(self,
+                             ('recall_at_1/count:0', 'recall_at_1/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -4661,8 +4987,8 @@ class StreamingMeanAbsoluteErrorTest(test.TestCase):
   def testVars(self):
     metrics.streaming_mean_absolute_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('mean_absolute_error/count:0',
-                                   'mean_absolute_error/total:0'))
+    _assert_metric_variables(
+        self, ('mean_absolute_error/count:0', 'mean_absolute_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -4724,8 +5050,8 @@ class StreamingMeanRelativeErrorTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         normalizer=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('mean_relative_error/count:0',
-                                   'mean_relative_error/total:0'))
+    _assert_metric_variables(
+        self, ('mean_relative_error/count:0', 'mean_relative_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -4807,8 +5133,8 @@ class StreamingMeanSquaredErrorTest(test.TestCase):
   def testVars(self):
     metrics.streaming_mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('mean_squared_error/count:0',
-                                   'mean_squared_error/total:0'))
+    _assert_metric_variables(
+        self, ('mean_squared_error/count:0', 'mean_squared_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -4987,8 +5313,9 @@ class StreamingRootMeanSquaredErrorTest(test.TestCase):
   def testVars(self):
     metrics.streaming_root_mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('root_mean_squared_error/count:0',
-                                   'root_mean_squared_error/total:0'))
+    _assert_metric_variables(
+        self,
+        ('root_mean_squared_error/count:0', 'root_mean_squared_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -5080,11 +5407,12 @@ class StreamingCovarianceTest(test.TestCase):
         predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
             [10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'covariance/comoment:0',
         'covariance/count:0',
         'covariance/mean_label:0',
-        'covariance/mean_prediction:0',))
+        'covariance/mean_prediction:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -5249,7 +5577,7 @@ class StreamingPearsonRTest(test.TestCase):
         predictions=math_ops.to_float(math_ops.range(10)) + array_ops.ones(
             [10, 10]),
         labels=math_ops.to_float(math_ops.range(10)) + array_ops.ones([10, 10]))
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'pearson_r/covariance/comoment:0',
         'pearson_r/covariance/count:0',
         'pearson_r/covariance/mean_label:0',
@@ -5261,7 +5589,8 @@ class StreamingPearsonRTest(test.TestCase):
         'pearson_r/variance_predictions/comoment:0',
         'pearson_r/variance_predictions/count:0',
         'pearson_r/variance_predictions/mean_label:0',
-        'pearson_r/variance_predictions/mean_prediction:0',))
+        'pearson_r/variance_predictions/mean_prediction:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -5474,9 +5803,10 @@ class StreamingMeanCosineDistanceTest(test.TestCase):
         predictions=array_ops.ones((10, 3)),
         labels=array_ops.ones((10, 3)),
         dim=1)
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'mean_cosine_distance/count:0',
-        'mean_cosine_distance/total:0',))
+        'mean_cosine_distance/total:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -5615,9 +5945,10 @@ class PcntBelowThreshTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_percentage_less(values=array_ops.ones((10,)), threshold=2)
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'percentage_below_threshold/count:0',
-        'percentage_below_threshold/total:0',))
+        'percentage_below_threshold/total:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -5690,7 +6021,7 @@ class StreamingMeanIOUTest(test.TestCase):
         predictions=array_ops.ones([10, 1]),
         labels=array_ops.ones([10, 1]),
         num_classes=2)
-    _assert_local_variables(self, ('mean_iou/total_confusion_matrix:0',))
+    _assert_metric_variables(self, ('mean_iou/total_confusion_matrix:0',))
 
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
@@ -5998,9 +6329,10 @@ class StreamingConcatTest(test.TestCase):
 
   def testVars(self):
     metrics.streaming_concat(values=array_ops.ones((10,)))
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'streaming_concat/array:0',
-        'streaming_concat/size:0',))
+        'streaming_concat/size:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -6177,7 +6509,7 @@ class CountTest(test.TestCase):
 
   def testVars(self):
     metrics.count(array_ops.ones([4, 3]))
-    _assert_local_variables(self, ['count/count:0'])
+    _assert_metric_variables(self, ['count/count:0'])
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ca3f13479ed32e9ab3d43dfe9a392ef8466ce5f2
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -0,0 +1,139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "core_layers",
+    srcs = ["python/layers/core_layers.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:layers",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "layers",
+    srcs = ["python/layers/layers.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":core_layers",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/layers:layers_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "layers_test",
+    size = "small",
+    srcs = ["python/layers/layers_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":layers",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "learning",
+    srcs = ["python/learning.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/slim",
+    ],
+)
+
+py_library(
+    name = "rnn_cells",
+    srcs = ["python/layers/rnn_cells.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":core_layers",
+    ],
+)
+
+py_library(
+    name = "pruning",
+    srcs = ["python/pruning.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":core_layers",
+        "//tensorflow/contrib/training:training_py",
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "pruning_test",
+    size = "small",
+    srcs = ["python/pruning_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pruning",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "rnn_cells_test",
+    size = "small",
+    srcs = ["python/layers/rnn_cells_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pruning",
+        ":rnn_cells",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+)
+
+# Top-level library
+py_library(
+    name = "model_pruning",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":init_py",
+        ":layers",
+        ":learning",
+        ":pruning",
+        ":rnn_cells",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..764e126e0d64d5e6c6caf0a9f0d43a87995447eb
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -0,0 +1,130 @@
+# Model pruning: Training tensorflow models to have masked connections
+
+This document describes the API that facilitates magnitude-based pruning of
+neural network's weight tensors. The API helps inject necessary tensorflow op
+into the training graph so the model can be pruned while it is being trained.
+
+### Model creation
+
+The first step involves adding mask and threshold variables to the layers that
+need to undergo pruning. The variable mask is the same shape as the layer's
+weight tensor and determines which of the weights participate in the forward
+execution of the graph. This can be achieved by wrapping the weight tensor of
+the layer with the `apply_mask` function provided in
+[pruning.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/pruning.py).
+For example:
+
+```python
+conv = tf.nn.conv2d(images, pruning.apply_mask(weights), stride, padding)
+```
+
+This creates a convolutional layer with additional variables mask and threshold
+as shown below: ![Convolutional layer with mask and
+threshold](https://storage.googleapis.com/download.tensorflow.org/example_images/mask.png "Convolutional layer with mask and threshold")
+
+Alternatively, the API also provides variant of tensorflow layers with these
+auxiliary variables built-in (see
+[layers](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/layers))
+. Layers currently supported:
+
+*   [layers.masked_conv2d](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/layers/layers.py?l=83)
+
+*   [layers.masked_fully_connected](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/layers/layers.py?l=241)
+
+*   [rnn_cells.MaskedLSTMCell](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py?l=154)
+
+### Adding pruning ops to the training graph
+
+The pruning library allows for specification of the following hyper parameters:
+
+|Hyperparameter               | Type    | Default       | Description |
+|:----------------------------|:-------:|:-------------:|:--------------|
+| name | string | model_pruning | Name of the pruning specification. Used for adding summaries and ops under a common tensorflow name_scope |
+| begin_pruning_step | integer | 0 | The global step at which to begin pruning |
+| end_pruning_step   | integer | -1 | The global step at which to terminate pruning. Defaults to -1 implying that pruning continues till  the training stops |
+| do_not_prune | list of strings | [""] | list of layers strings that are not pruned |
+| threshold_decay | float | 0.9 | The decay factor to use for exponential decay of the thresholds |
+| pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
+| nbins | integer | 255 | Number of bins to use for histogram computation |
+| initial_sparsity | float | 0.0 | Initial sparsity value |
+| target_sparsity | float | 0.5 | Target sparsity value |
+| sparsity_function_begin_step | integer | 0 | The global step at this which the gradual sparsity function begins to take effect |
+| sparsity_function_end_step | integer | 100 | The global step used as the end point for the gradual sparsity function |
+| sparsity_function_exponent | float | 3.0 | exponent = 1 is linearly varying sparsity between initial and final. exponent > 1 varies more slowly towards the end than the beginning |
+
+The sparsity $$s_t$$ at global step $$t$$ is given by:
+
+$$ s_{t}=s_{f}+\left(s_{i}-s_{f}\right)\left(1-\frac{t-t_{0}}{n\Delta t}\right)^{3} $$
+
+The interval between sparsity_function_begin_step and sparsity_function_end_step
+is divided into $$n$$ intervals of size equal to the pruning_frequency ($$\Delta
+t$$). $$s_f$$ is the target_sparsity, $$s_i$$ is the initial_sparsity, $$t_0$$
+is the sparsity_function_begin_step. In this equation, the
+sparsity_function_exponent is set to 3.
+### Adding pruning ops to the training graph
+
+The final step involves adding ops to the training graph that monitors the
+distribution of the layer's weight magnitudes and determines the layer threshold
+such masking all the weights below this threshold achieves the sparsity level
+desired for the current training step. This can be achieved as follows:
+
+```python
+tf.app.flags.DEFINE_string(
+    'pruning_hparams', '',
+    """Comma separated list of pruning-related hyperparameters""")
+
+with tf.graph.as_default():
+
+  # Create global step variable
+  global_step = tf.train.get_global_step()
+
+  # Parse pruning hyperparameters
+  pruning_hparams = pruning.get_pruning_hparams().parse(FLAGS.pruning_hparams)
+
+  # Create a pruning object using the pruning specification
+  p = pruning.Pruning(pruning_hparams, global_step=global_step)
+
+  # Add conditional mask update op. Executing this op will update all
+  # the masks in the graph if the current global step is in the range
+  # [begin_pruning_step, end_pruning_step] as specified by the pruning spec
+  mask_update_op = p.conditional_mask_update_op()
+
+  # Add summaries to keep track of the sparsity in different layers during training
+  p.add_pruning_summaries()
+
+  with tf.train.MonitoredTrainingSession(...) as mon_sess:
+    # Run the usual training op in the tf session
+    mon_sess.run(train_op)
+
+    # Update the masks by running the mask_update_op
+    mon_sess.run(mask_update_op)
+
+```
+
+## Example: Pruning and training deep CNNs on the cifar10 dataset
+
+Please see https://www.tensorflow.org/tutorials/deep_cnn for details on neural
+network architecture, setting up inputs etc. The additional changes needed to
+incorporate pruning are captured in the following:
+
+*   [cifar10_pruning.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py)
+    creates a deep CNN with the same architecture, but adds mask and threshold
+    variables for each of the weight tensors in the convolutional and
+    locally-connected layers.
+
+*   [cifar10_train.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_train.py)
+    add pruning ops to the training graph as described above.
+
+To train the pruned version of cifar10:
+
+```bash
+$ examples_dir=contrib/model_pruning/examples
+$ bazel build -c opt $examples_dir/cifar10:cifar10_{train,eval}
+$ bazel-bin/$examples_dir/cifar10/cifar10_train --pruning_hparams=name=cifar10_pruning,begin_pruning_step=10000,end_pruning_step=100000,target_sparsity=0.9,sparsity_function_begin_step=10000,sparsity_function_end_step=100000
+```
+
+Eval:
+
+```shell
+$ bazel-bin/$examples_dir/cifar10/cifar10_eval --run_once
+```
diff --git a/tensorflow/contrib/model_pruning/__init__.py b/tensorflow/contrib/model_pruning/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d32bedbcd6b63bc8e473a9e9d1c8e0753877e6f8
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/__init__.py
@@ -0,0 +1,47 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model pruning implementation in tensorflow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.model_pruning.python.layers.layers import masked_conv2d
+from tensorflow.contrib.model_pruning.python.layers.layers import masked_convolution
+from tensorflow.contrib.model_pruning.python.layers.layers import masked_fully_connected
+from tensorflow.contrib.model_pruning.python.layers.rnn_cells import MaskedBasicLSTMCell
+from tensorflow.contrib.model_pruning.python.layers.rnn_cells import MaskedLSTMCell
+from tensorflow.contrib.model_pruning.python.learning import train
+from tensorflow.contrib.model_pruning.python.pruning import apply_mask
+from tensorflow.contrib.model_pruning.python.pruning import get_masked_weights
+from tensorflow.contrib.model_pruning.python.pruning import get_masks
+from tensorflow.contrib.model_pruning.python.pruning import get_pruning_hparams
+from tensorflow.contrib.model_pruning.python.pruning import get_thresholds
+from tensorflow.contrib.model_pruning.python.pruning import get_weight_sparsity
+from tensorflow.contrib.model_pruning.python.pruning import get_weights
+from tensorflow.contrib.model_pruning.python.pruning import Pruning
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'masked_convolution', 'masked_conv2d', 'masked_fully_connected',
+    'MaskedBasicLSTMCell', 'MaskedLSTMCell', 'train', 'apply_mask',
+    'get_masked_weights', 'get_masks', 'get_pruning_hparams', 'get_thresholds',
+    'get_weights', 'get_weight_sparsity', 'Pruning'
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e7848adcc5ac126a2b85ef6dcb0ffa355b8b0628
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Description:
+# Example TensorFlow models for CIFAR-10
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "cifar10_input",
+    srcs = ["cifar10_input.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "cifar10_pruning",
+    srcs = ["cifar10_pruning.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cifar10_input",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/model_pruning:pruning",
+    ],
+)
+
+py_binary(
+    name = "cifar10_eval",
+    srcs = [
+        "cifar10_eval.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cifar10_pruning",
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_binary(
+    name = "cifar10_train",
+    srcs = [
+        "cifar10_train.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cifar10_pruning",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/model_pruning:pruning",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_eval.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..d72b2a1dca5de26b59c81c082ff7a42e9a4f4357
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_eval.py
@@ -0,0 +1,178 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Evaluation for CIFAR-10.
+
+Accuracy:
+cifar10_train.py achieves 83.0% accuracy after 100K steps (256 epochs
+of data) as judged by cifar10_eval.py.
+
+Speed:
+On a single Tesla K40, cifar10_train.py processes a single batch of 128 images
+in 0.25-0.35 sec (i.e. 350 - 600 images /sec). The model reaches ~86%
+accuracy after 100K steps in 8 hours of training time.
+
+Usage:
+Please see the tutorial and website for how to download the CIFAR-10
+data set, compile the program and train the model.
+
+http://tensorflow.org/tutorials/deep_cnn/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import datetime
+import math
+import sys
+import time
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.contrib.model_pruning.examples.cifar10 import cifar10_pruning as cifar10
+
+FLAGS = None
+
+
+def eval_once(saver, summary_writer, top_k_op, summary_op):
+  """Run Eval once.
+
+  Args:
+    saver: Saver.
+    summary_writer: Summary writer.
+    top_k_op: Top K op.
+    summary_op: Summary op.
+  """
+  with tf.Session() as sess:
+    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
+    if ckpt and ckpt.model_checkpoint_path:
+      # Restores from checkpoint
+      saver.restore(sess, ckpt.model_checkpoint_path)
+      # Assuming model_checkpoint_path looks something like:
+      #   /my-favorite-path/cifar10_train/model.ckpt-0,
+      # extract global_step from it.
+      global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
+    else:
+      print('No checkpoint file found')
+      return
+
+    # Start the queue runners.
+    coord = tf.train.Coordinator()
+    try:
+      threads = []
+      for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
+        threads.extend(qr.create_threads(sess, coord=coord, daemon=True,
+                                         start=True))
+
+      num_iter = int(math.ceil(FLAGS.num_examples / 128))
+      true_count = 0  # Counts the number of correct predictions.
+      total_sample_count = num_iter * 128
+      step = 0
+      while step < num_iter and not coord.should_stop():
+        predictions = sess.run([top_k_op])
+        true_count += np.sum(predictions)
+        step += 1
+
+      # Compute precision @ 1.
+      precision = true_count / total_sample_count
+      print('%s: precision @ 1 = %.3f' % (datetime.datetime.now(), precision))
+
+      summary = tf.Summary()
+      summary.ParseFromString(sess.run(summary_op))
+      summary.value.add(tag='Precision @ 1', simple_value=precision)
+      summary_writer.add_summary(summary, global_step)
+    except Exception as e:  # pylint: disable=broad-except
+      coord.request_stop(e)
+
+    coord.request_stop()
+    coord.join(threads, stop_grace_period_secs=10)
+
+
+def evaluate():
+  """Eval CIFAR-10 for a number of steps."""
+  with tf.Graph().as_default() as g:
+    # Get images and labels for CIFAR-10.
+    eval_data = FLAGS.eval_data == 'test'
+    images, labels = cifar10.inputs(eval_data=eval_data)
+
+    # Build a Graph that computes the logits predictions from the
+    # inference model.
+    logits = cifar10.inference(images)
+
+    # Calculate predictions.
+    top_k_op = tf.nn.in_top_k(logits, labels, 1)
+
+    # Restore the moving average version of the learned variables for eval.
+    variable_averages = tf.train.ExponentialMovingAverage(
+        cifar10.MOVING_AVERAGE_DECAY)
+    variables_to_restore = variable_averages.variables_to_restore()
+    saver = tf.train.Saver(variables_to_restore)
+
+    # Build the summary operation based on the TF collection of Summaries.
+    summary_op = tf.summary.merge_all()
+
+    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)
+
+    while True:
+      eval_once(saver, summary_writer, top_k_op, summary_op)
+      if FLAGS.run_once:
+        break
+      time.sleep(FLAGS.eval_interval_secs)
+
+
+def main(argv=None):  # pylint: disable=unused-argument
+  cifar10.maybe_download_and_extract()
+  if tf.gfile.Exists(FLAGS.eval_dir):
+    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
+  tf.gfile.MakeDirs(FLAGS.eval_dir)
+  evaluate()
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--eval_dir',
+      type=str,
+      default='/tmp/cifar10_eval',
+      help='Directory where to write event logs.')
+  parser.add_argument(
+      '--eval_data',
+      type=str,
+      default='test',
+      help="""Either 'test' or 'train_eval'.""")
+  parser.add_argument(
+      '--checkpoint_dir',
+      type=str,
+      default='/tmp/cifar10_train',
+      help="""Directory where to read model checkpoints.""")
+  parser.add_argument(
+      '--eval_interval_secs',
+      type=int,
+      default=60 * 5,
+      help='How often to run the eval.')
+  parser.add_argument(
+      '--num_examples',
+      type=int,
+      default=10000,
+      help='Number of examples to run.')
+  parser.add_argument(
+      '--run_once',
+      type=bool,
+      default=False,
+      help='Whether to run eval only once.')
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
new file mode 100644
index 0000000000000000000000000000000000000000..d07fece4bc668612d517e8dcaab1a35451a0238e
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_input.py
@@ -0,0 +1,256 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Routine for decoding the CIFAR-10 binary file format."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+# Process images of this size. Note that this differs from the original CIFAR
+# image size of 32 x 32. If one alters this number, then the entire model
+# architecture will change and any model would need to be retrained.
+IMAGE_SIZE = 24
+
+# Global constants describing the CIFAR-10 data set.
+NUM_CLASSES = 10
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000
+NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = 10000
+
+
+def read_cifar10(filename_queue):
+  """Reads and parses examples from CIFAR10 data files.
+
+  Recommendation: if you want N-way read parallelism, call this function
+  N times.  This will give you N independent Readers reading different
+  files & positions within those files, which will give better mixing of
+  examples.
+
+  Args:
+    filename_queue: A queue of strings with the filenames to read from.
+
+  Returns:
+    An object representing a single example, with the following fields:
+      height: number of rows in the result (32)
+      width: number of columns in the result (32)
+      depth: number of color channels in the result (3)
+      key: a scalar string Tensor describing the filename & record number
+        for this example.
+      label: an int32 Tensor with the label in the range 0..9.
+      uint8image: a [height, width, depth] uint8 Tensor with the image data
+  """
+
+  class CIFAR10Record(object):
+    pass
+  result = CIFAR10Record()
+
+  # Dimensions of the images in the CIFAR-10 dataset.
+  # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
+  # input format.
+  label_bytes = 1  # 2 for CIFAR-100
+  result.height = 32
+  result.width = 32
+  result.depth = 3
+  image_bytes = result.height * result.width * result.depth
+  # Every record consists of a label followed by the image, with a
+  # fixed number of bytes for each.
+  record_bytes = label_bytes + image_bytes
+
+  # Read a record, getting filenames from the filename_queue.  No
+  # header or footer in the CIFAR-10 format, so we leave header_bytes
+  # and footer_bytes at their default of 0.
+  reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
+  result.key, value = reader.read(filename_queue)
+
+  # Convert from a string to a vector of uint8 that is record_bytes long.
+  record_bytes = tf.decode_raw(value, tf.uint8)
+
+  # The first bytes represent the label, which we convert from uint8->int32.
+  result.label = tf.cast(
+      tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32)
+
+  # The remaining bytes after the label represent the image, which we reshape
+  # from [depth * height * width] to [depth, height, width].
+  depth_major = tf.reshape(
+      tf.strided_slice(record_bytes, [label_bytes],
+                       [label_bytes + image_bytes]),
+      [result.depth, result.height, result.width])
+  # Convert from [depth, height, width] to [height, width, depth].
+  result.uint8image = tf.transpose(depth_major, [1, 2, 0])
+
+  return result
+
+
+def _generate_image_and_label_batch(image, label, min_queue_examples,
+                                    batch_size, shuffle):
+  """Construct a queued batch of images and labels.
+
+  Args:
+    image: 3-D Tensor of [height, width, 3] of type.float32.
+    label: 1-D Tensor of type.int32
+    min_queue_examples: int32, minimum number of samples to retain
+      in the queue that provides of batches of examples.
+    batch_size: Number of images per batch.
+    shuffle: boolean indicating whether to use a shuffling queue.
+
+  Returns:
+    images: Images. 4D tensor of [batch_size, height, width, 3] size.
+    labels: Labels. 1D tensor of [batch_size] size.
+  """
+  # Create a queue that shuffles the examples, and then
+  # read 'batch_size' images + labels from the example queue.
+  num_preprocess_threads = 16
+  if shuffle:
+    images, label_batch = tf.train.shuffle_batch(
+        [image, label],
+        batch_size=batch_size,
+        num_threads=num_preprocess_threads,
+        capacity=min_queue_examples + 3 * batch_size,
+        min_after_dequeue=min_queue_examples)
+  else:
+    images, label_batch = tf.train.batch(
+        [image, label],
+        batch_size=batch_size,
+        num_threads=num_preprocess_threads,
+        capacity=min_queue_examples + 3 * batch_size)
+
+  # Display the training images in the visualizer.
+  tf.summary.image('images', images)
+
+  return images, tf.reshape(label_batch, [batch_size])
+
+
+def distorted_inputs(data_dir, batch_size):
+  """Construct distorted input for CIFAR training using the Reader ops.
+
+  Args:
+    data_dir: Path to the CIFAR-10 data directory.
+    batch_size: Number of images per batch.
+
+  Returns:
+    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+    labels: Labels. 1D tensor of [batch_size] size.
+  """
+  filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
+               for i in xrange(1, 6)]
+  for f in filenames:
+    if not tf.gfile.Exists(f):
+      raise ValueError('Failed to find file: ' + f)
+
+  # Create a queue that produces the filenames to read.
+  filename_queue = tf.train.string_input_producer(filenames)
+
+  # Read examples from files in the filename queue.
+  read_input = read_cifar10(filename_queue)
+  reshaped_image = tf.cast(read_input.uint8image, tf.float32)
+
+  height = IMAGE_SIZE
+  width = IMAGE_SIZE
+
+  # Image processing for training the network. Note the many random
+  # distortions applied to the image.
+
+  # Randomly crop a [height, width] section of the image.
+  distorted_image = tf.random_crop(reshaped_image, [height, width, 3])
+
+  # Randomly flip the image horizontally.
+  distorted_image = tf.image.random_flip_left_right(distorted_image)
+
+  # Because these operations are not commutative, consider randomizing
+  # the order their operation.
+  distorted_image = tf.image.random_brightness(distorted_image,
+                                               max_delta=63)
+  distorted_image = tf.image.random_contrast(distorted_image,
+                                             lower=0.2, upper=1.8)
+
+  # Subtract off the mean and divide by the variance of the pixels.
+  float_image = tf.image.per_image_standardization(distorted_image)
+
+  # Set the shapes of tensors.
+  float_image.set_shape([height, width, 3])
+  read_input.label.set_shape([1])
+
+  # Ensure that the random shuffling has good mixing properties.
+  min_fraction_of_examples_in_queue = 0.4
+  min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
+                           min_fraction_of_examples_in_queue)
+  print ('Filling queue with %d CIFAR images before starting to train. '
+         'This will take a few minutes.' % min_queue_examples)
+
+  # Generate a batch of images and labels by building up a queue of examples.
+  return _generate_image_and_label_batch(float_image, read_input.label,
+                                         min_queue_examples, batch_size,
+                                         shuffle=True)
+
+
+def inputs(eval_data, data_dir, batch_size):
+  """Construct input for CIFAR evaluation using the Reader ops.
+
+  Args:
+    eval_data: bool, indicating if one should use the train or eval data set.
+    data_dir: Path to the CIFAR-10 data directory.
+    batch_size: Number of images per batch.
+
+  Returns:
+    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+    labels: Labels. 1D tensor of [batch_size] size.
+  """
+  if not eval_data:
+    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i)
+                 for i in xrange(1, 6)]
+    num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+  else:
+    filenames = [os.path.join(data_dir, 'test_batch.bin')]
+    num_examples_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
+
+  for f in filenames:
+    if not tf.gfile.Exists(f):
+      raise ValueError('Failed to find file: ' + f)
+
+  # Create a queue that produces the filenames to read.
+  filename_queue = tf.train.string_input_producer(filenames)
+
+  # Read examples from files in the filename queue.
+  read_input = read_cifar10(filename_queue)
+  reshaped_image = tf.cast(read_input.uint8image, tf.float32)
+
+  height = IMAGE_SIZE
+  width = IMAGE_SIZE
+
+  # Image processing for evaluation.
+  # Crop the central [height, width] of the image.
+  resized_image = tf.image.resize_image_with_crop_or_pad(reshaped_image,
+                                                         width, height)
+
+  # Subtract off the mean and divide by the variance of the pixels.
+  float_image = tf.image.per_image_standardization(resized_image)
+
+  # Set the shapes of tensors.
+  float_image.set_shape([height, width, 3])
+  read_input.label.set_shape([1])
+
+  # Ensure that the random shuffling has good mixing properties.
+  min_fraction_of_examples_in_queue = 0.4
+  min_queue_examples = int(num_examples_per_epoch *
+                           min_fraction_of_examples_in_queue)
+
+  # Generate a batch of images and labels by building up a queue of examples.
+  return _generate_image_and_label_batch(float_image, read_input.label,
+                                         min_queue_examples, batch_size,
+                                         shuffle=False)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d1de869f6ef91791a235cfe545b3b3a9b734e72
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_pruning.py
@@ -0,0 +1,395 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Builds the CIFAR-10 network with additional variables to support pruning.
+
+Summary of available functions:
+
+ # Compute input images and labels for training. If you would like to run
+ # evaluations, use inputs() instead.
+ inputs, labels = distorted_inputs()
+
+ # Compute inference on the model inputs to make a prediction.
+ predictions = inference(inputs)
+
+ # Compute the total loss of the prediction with respect to the labels.
+ loss = loss(predictions, labels)
+
+ # Create a graph to run one step of training with respect to the loss.
+ train_op = train(loss, global_step)
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import re
+import sys
+import tarfile
+
+from six.moves import urllib
+import tensorflow as tf
+
+from tensorflow.contrib.model_pruning.examples.cifar10 import cifar10_input
+from tensorflow.contrib.model_pruning.python import pruning
+
+# Global constants describing the CIFAR-10 data set.
+IMAGE_SIZE = cifar10_input.IMAGE_SIZE
+NUM_CLASSES = cifar10_input.NUM_CLASSES
+NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
+NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL
+BATCH_SIZE = 128
+DATA_DIR = '/tmp/cifar10_data'
+
+# Constants describing the training process.
+MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
+NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
+LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
+INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.
+
+# If a model is trained with multiple GPUs, prefix all Op names with tower_name
+# to differentiate the operations. Note that this prefix is removed from the
+# names of the summaries when visualizing a model.
+TOWER_NAME = 'tower'
+
+DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
+
+
+def _activation_summary(x):
+  """Helper to create summaries for activations.
+
+  Creates a summary that provides a histogram of activations.
+  Creates a summary that measures the sparsity of activations.
+
+  Args:
+    x: Tensor
+  Returns:
+    nothing
+  """
+  # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
+  # session. This helps the clarity of presentation on tensorboard.
+  tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
+  tf.summary.histogram(tensor_name + '/activations', x)
+  tf.summary.scalar(tensor_name + '/sparsity',
+                                       tf.nn.zero_fraction(x))
+
+
+def _variable_on_cpu(name, shape, initializer):
+  """Helper to create a Variable stored on CPU memory.
+
+  Args:
+    name: name of the variable
+    shape: list of ints
+    initializer: initializer for Variable
+
+  Returns:
+    Variable Tensor
+  """
+  with tf.device('/cpu:0'):
+    dtype = tf.float32
+    var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
+  return var
+
+
+def _variable_with_weight_decay(name, shape, stddev, wd):
+  """Helper to create an initialized Variable with weight decay.
+
+  Note that the Variable is initialized with a truncated normal distribution.
+  A weight decay is added only if one is specified.
+
+  Args:
+    name: name of the variable
+    shape: list of ints
+    stddev: standard deviation of a truncated Gaussian
+    wd: add L2Loss weight decay multiplied by this float. If None, weight
+        decay is not added for this Variable.
+
+  Returns:
+    Variable Tensor
+  """
+  dtype = tf.float32
+  var = _variable_on_cpu(
+      name,
+      shape,
+      tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
+  if wd is not None:
+    weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
+    tf.add_to_collection('losses', weight_decay)
+  return var
+
+
+def distorted_inputs():
+  """Construct distorted input for CIFAR training using the Reader ops.
+
+  Returns:
+    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+    labels: Labels. 1D tensor of [batch_size] size.
+
+  Raises:
+    ValueError: If no data_dir
+  """
+  if not DATA_DIR:
+    raise ValueError('Please supply a data_dir')
+  data_dir = os.path.join(DATA_DIR, 'cifar-10-batches-bin')
+  images, labels = cifar10_input.distorted_inputs(
+      data_dir=data_dir, batch_size=BATCH_SIZE)
+  return images, labels
+
+
+def inputs(eval_data):
+  """Construct input for CIFAR evaluation using the Reader ops.
+
+  Args:
+    eval_data: bool, indicating if one should use the train or eval data set.
+
+  Returns:
+    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
+    labels: Labels. 1D tensor of [batch_size] size.
+
+  Raises:
+    ValueError: If no data_dir
+  """
+  if not DATA_DIR:
+    raise ValueError('Please supply a data_dir')
+  data_dir = os.path.join(DATA_DIR, 'cifar-10-batches-bin')
+  images, labels = cifar10_input.inputs(
+      eval_data=eval_data, data_dir=data_dir, batch_size=BATCH_SIZE)
+  return images, labels
+
+
+def inference(images):
+  """Build the CIFAR-10 model.
+
+  Args:
+    images: Images returned from distorted_inputs() or inputs().
+
+  Returns:
+    Logits.
+  """
+  # We instantiate all variables using tf.get_variable() instead of
+  # tf.Variable() in order to share variables across multiple GPU training runs.
+  # If we only ran this model on a single GPU, we could simplify this function
+  # by replacing all instances of tf.get_variable() with tf.Variable().
+  #
+  # While instantiating conv and local layers, we add mask and threshold
+  # variables to the layer by calling the pruning.apply_mask() function.
+  # Note that the masks are applied only to the weight tensors
+  # conv1
+  with tf.variable_scope('conv1') as scope:
+    kernel = _variable_with_weight_decay('weights',
+                                         shape=[5, 5, 3, 64],
+                                         stddev=5e-2,
+                                         wd=0.0)
+
+    conv = tf.nn.conv2d(
+        images, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
+    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
+    pre_activation = tf.nn.bias_add(conv, biases)
+    conv1 = tf.nn.relu(pre_activation, name=scope.name)
+    _activation_summary(conv1)
+
+  # pool1
+  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
+                         padding='SAME', name='pool1')
+  # norm1
+  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
+                    name='norm1')
+
+  # conv2
+  with tf.variable_scope('conv2') as scope:
+    kernel = _variable_with_weight_decay('weights',
+                                         shape=[5, 5, 64, 64],
+                                         stddev=5e-2,
+                                         wd=0.0)
+    conv = tf.nn.conv2d(
+        norm1, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
+    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
+    pre_activation = tf.nn.bias_add(conv, biases)
+    conv2 = tf.nn.relu(pre_activation, name=scope.name)
+    _activation_summary(conv2)
+
+  # norm2
+  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
+                    name='norm2')
+  # pool2
+  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
+                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')
+
+  # local3
+  with tf.variable_scope('local3') as scope:
+    # Move everything into depth so we can perform a single matrix multiply.
+    reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
+    dim = reshape.get_shape()[1].value
+    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
+                                          stddev=0.04, wd=0.004)
+    biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
+    local3 = tf.nn.relu(
+        tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
+        name=scope.name)
+    _activation_summary(local3)
+
+  # local4
+  with tf.variable_scope('local4') as scope:
+    weights = _variable_with_weight_decay('weights', shape=[384, 192],
+                                          stddev=0.04, wd=0.004)
+    biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
+    local4 = tf.nn.relu(
+        tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
+        name=scope.name)
+    _activation_summary(local4)
+
+  # linear layer(WX + b),
+  # We don't apply softmax here because
+  # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
+  # and performs the softmax internally for efficiency.
+  with tf.variable_scope('softmax_linear') as scope:
+    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
+                                          stddev=1/192.0, wd=0.0)
+    biases = _variable_on_cpu('biases', [NUM_CLASSES],
+                              tf.constant_initializer(0.0))
+    softmax_linear = tf.add(
+        tf.matmul(local4, pruning.apply_mask(weights, scope)),
+        biases,
+        name=scope.name)
+    _activation_summary(softmax_linear)
+
+  return softmax_linear
+
+
+def loss(logits, labels):
+  """Add L2Loss to all the trainable variables.
+
+  Add summary for "Loss" and "Loss/avg".
+  Args:
+    logits: Logits from inference().
+    labels: Labels from distorted_inputs or inputs(). 1-D tensor
+            of shape [batch_size]
+
+  Returns:
+    Loss tensor of type float.
+  """
+  # Calculate the average cross entropy loss across the batch.
+  labels = tf.cast(labels, tf.int64)
+  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
+      labels=labels, logits=logits, name='cross_entropy_per_example')
+  cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
+  tf.add_to_collection('losses', cross_entropy_mean)
+
+  # The total loss is defined as the cross entropy loss plus all of the weight
+  # decay terms (L2 loss).
+  return tf.add_n(tf.get_collection('losses'), name='total_loss')
+
+
+def _add_loss_summaries(total_loss):
+  """Add summaries for losses in CIFAR-10 model.
+
+  Generates moving average for all losses and associated summaries for
+  visualizing the performance of the network.
+
+  Args:
+    total_loss: Total loss from loss().
+  Returns:
+    loss_averages_op: op for generating moving averages of losses.
+  """
+  # Compute the moving average of all individual losses and the total loss.
+  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
+  losses = tf.get_collection('losses')
+  loss_averages_op = loss_averages.apply(losses + [total_loss])
+
+  # Attach a scalar summary to all individual losses and the total loss; do the
+  # same for the averaged version of the losses.
+  for l in losses + [total_loss]:
+    # Name each loss as '(raw)' and name the moving average version of the loss
+    # as the original loss name.
+    tf.summary.scalar(l.op.name + ' (raw)', l)
+    tf.summary.scalar(l.op.name, loss_averages.average(l))
+
+  return loss_averages_op
+
+
+def train(total_loss, global_step):
+  """Train CIFAR-10 model.
+
+  Create an optimizer and apply to all trainable variables. Add moving
+  average for all trainable variables.
+
+  Args:
+    total_loss: Total loss from loss().
+    global_step: Integer Variable counting the number of training steps
+      processed.
+  Returns:
+    train_op: op for training.
+  """
+  # Variables that affect learning rate.
+  num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / BATCH_SIZE
+  decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
+
+  # Decay the learning rate exponentially based on the number of steps.
+  lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
+                                  global_step,
+                                  decay_steps,
+                                  LEARNING_RATE_DECAY_FACTOR,
+                                  staircase=True)
+  tf.summary.scalar('learning_rate', lr)
+
+  # Generate moving averages of all losses and associated summaries.
+  loss_averages_op = _add_loss_summaries(total_loss)
+
+  # Compute gradients.
+  with tf.control_dependencies([loss_averages_op]):
+    opt = tf.train.GradientDescentOptimizer(lr)
+    grads = opt.compute_gradients(total_loss)
+
+  # Apply gradients.
+  apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
+
+  # Add histograms for trainable variables.
+  for var in tf.trainable_variables():
+    tf.summary.histogram(var.op.name, var)
+
+  # Add histograms for gradients.
+  for grad, var in grads:
+    if grad is not None:
+      tf.summary.histogram(var.op.name + '/gradients', grad)
+
+  # Track the moving averages of all trainable variables.
+  variable_averages = tf.train.ExponentialMovingAverage(
+      MOVING_AVERAGE_DECAY, global_step)
+  variables_averages_op = variable_averages.apply(tf.trainable_variables())
+
+  with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
+    train_op = tf.no_op(name='train')
+
+  return train_op
+
+
+def maybe_download_and_extract():
+  """Download and extract the tarball from Alex's website."""
+  dest_directory = DATA_DIR
+  if not os.path.exists(dest_directory):
+    os.makedirs(dest_directory)
+  filename = DATA_URL.split('/')[-1]
+  filepath = os.path.join(dest_directory, filename)
+  if not os.path.exists(filepath):
+    def _progress(count, block_size, total_size):
+      sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename,
+          float(count * block_size) / float(total_size) * 100.0))
+      sys.stdout.flush()
+    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
+    print()
+    statinfo = os.stat(filepath)
+    print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
+
+  tarfile.open(filepath, 'r:gz').extractall(dest_directory)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_train.py b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1064a3b6abe90f463184e977efb4de173e175cd
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/cifar10_train.py
@@ -0,0 +1,159 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A binary to train pruned CIFAR-10 using a single GPU.
+
+Accuracy:
+cifar10_train.py achieves ~86% accuracy after 100K steps (256 epochs of
+data) as judged by cifar10_eval.py when target sparsity in
+cifar10_pruning_spec.pbtxt is set to zero
+
+Results:
+Sparsity | Accuracy after 150K steps
+-------- | -------------------------
+0%       | 86%
+50%      | 86%
+75%      | TODO(suyoggupta)
+90%      | TODO(suyoggupta)
+95%      | 77%
+
+Usage:
+Please see the tutorial and website for how to download the CIFAR-10
+data set, compile the program and train the model.
+
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import datetime
+import sys
+import time
+
+
+import tensorflow as tf
+
+from tensorflow.contrib.model_pruning.examples.cifar10 import cifar10_pruning as cifar10
+from tensorflow.contrib.model_pruning.python import pruning
+
+FLAGS = None
+
+
+def train():
+  """Train CIFAR-10 for a number of steps."""
+  with tf.Graph().as_default():
+    global_step = tf.contrib.framework.get_or_create_global_step()
+
+    # Get images and labels for CIFAR-10.
+    images, labels = cifar10.distorted_inputs()
+
+    # Build a Graph that computes the logits predictions from the
+    # inference model.
+    logits = cifar10.inference(images)
+
+    # Calculate loss.
+    loss = cifar10.loss(logits, labels)
+
+    # Build a Graph that trains the model with one batch of examples and
+    # updates the model parameters.
+    train_op = cifar10.train(loss, global_step)
+
+    # Parse pruning hyperparameters
+    pruning_hparams = pruning.get_pruning_hparams().parse(FLAGS.pruning_hparams)
+
+    # Create a pruning object using the pruning hyperparameters
+    pruning_obj = pruning.Pruning(pruning_hparams, global_step=global_step)
+
+    # Use the pruning_obj to add ops to the training graph to update the masks
+    # The conditional_mask_update_op will update the masks only when the
+    # training step is in [begin_pruning_step, end_pruning_step] specified in
+    # the pruning spec proto
+    mask_update_op = pruning_obj.conditional_mask_update_op()
+
+    # Use the pruning_obj to add summaries to the graph to track the sparsity
+    # of each of the layers
+    pruning_obj.add_pruning_summaries()
+
+    class _LoggerHook(tf.train.SessionRunHook):
+      """Logs loss and runtime."""
+
+      def begin(self):
+        self._step = -1
+
+      def before_run(self, run_context):
+        self._step += 1
+        self._start_time = time.time()
+        return tf.train.SessionRunArgs(loss)  # Asks for loss value.
+
+      def after_run(self, run_context, run_values):
+        duration = time.time() - self._start_time
+        loss_value = run_values.results
+        if self._step % 10 == 0:
+          num_examples_per_step = 128
+          examples_per_sec = num_examples_per_step / duration
+          sec_per_batch = float(duration)
+
+          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
+                        'sec/batch)')
+          print(format_str % (datetime.datetime.now(), self._step, loss_value,
+                              examples_per_sec, sec_per_batch))
+
+    with tf.train.MonitoredTrainingSession(
+        checkpoint_dir=FLAGS.train_dir,
+        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
+               tf.train.NanTensorHook(loss),
+               _LoggerHook()],
+        config=tf.ConfigProto(
+            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
+      while not mon_sess.should_stop():
+        mon_sess.run(train_op)
+        # Update the masks
+        mon_sess.run(mask_update_op)
+
+
+def main(argv=None):  # pylint: disable=unused-argument
+  cifar10.maybe_download_and_extract()
+  if tf.gfile.Exists(FLAGS.train_dir):
+    tf.gfile.DeleteRecursively(FLAGS.train_dir)
+  tf.gfile.MakeDirs(FLAGS.train_dir)
+  train()
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--train_dir',
+      type=str,
+      default='/tmp/cifar10_train',
+      help='Directory where to write event logs and checkpoint.')
+  parser.add_argument(
+      '--pruning_hparams',
+      type=str,
+      default='',
+      help="""Comma separated list of pruning-related hyperparameters""")
+  parser.add_argument(
+      '--max_steps',
+      type=int,
+      default=1000000,
+      help='Number of batches to run.')
+  parser.add_argument(
+      '--log_device_placement',
+      type=bool,
+      default=False,
+      help='Whether to log device placement.')
+
+  FLAGS, unparsed = parser.parse_known_args()
+  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..95dfd8f4213a8729f5954eb0626f28ecc9265bbb
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
@@ -0,0 +1,477 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the core layer classes for model pruning and its functional aliases.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base
+from tensorflow.python.layers import utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import standard_ops
+
+MASK_COLLECTION = 'masks'
+THRESHOLD_COLLECTION = 'thresholds'
+MASKED_WEIGHT_COLLECTION = 'masked_weights'
+WEIGHT_COLLECTION = 'kernel'
+# The 'weights' part of the name is needed for the quantization library
+# to recognize that the kernel should be quantized.
+MASKED_WEIGHT_NAME = 'weights/masked_weight'
+
+
+class _MaskedConv(base.Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. The weight tensor of this layer is masked.
+  If `use_bias` is True (and a `bias_initializer` is provided),
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format='channels_last',
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer=None,
+               bias_initializer=init_ops.zeros_initializer(),
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_MaskedConv, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = utils.normalize_tuple(kernel_size, rank, 'kernel_size')
+    self.strides = utils.normalize_tuple(strides, rank, 'strides')
+    self.padding = utils.normalize_padding(padding)
+    self.data_format = utils.normalize_data_format(data_format)
+    self.dilation_rate = utils.normalize_tuple(dilation_rate, rank,
+                                               'dilation_rate')
+    self.activation = activation
+    self.use_bias = use_bias
+    self.kernel_initializer = kernel_initializer
+    self.bias_initializer = bias_initializer
+    self.kernel_regularizer = kernel_regularizer
+    self.bias_regularizer = bias_regularizer
+    self.input_spec = base.InputSpec(ndim=self.rank + 2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    channel_axis = 1 if self.data_format == 'channels_first' else -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis].value
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+    self.mask = self.add_variable(
+        name='mask',
+        shape=kernel_shape,
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+
+    self.kernel = self.add_variable(
+        name='kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        trainable=True,
+        dtype=self.dtype)
+
+    self.threshold = self.add_variable(
+        name='threshold',
+        shape=[],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+
+    # Add masked_weights in the weights namescope so as to make it easier
+    # for the quantization library to add quant ops.
+    self.masked_kernel = math_ops.multiply(self.mask, self.kernel,
+                                           MASKED_WEIGHT_NAME)
+
+    ops.add_to_collection(MASK_COLLECTION, self.mask)
+    ops.add_to_collection(MASKED_WEIGHT_COLLECTION, self.masked_kernel)
+    ops.add_to_collection(THRESHOLD_COLLECTION, self.threshold)
+    ops.add_to_collection(WEIGHT_COLLECTION, self.kernel)
+
+    if self.use_bias:
+      self.bias = self.add_variable(
+          name='bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          trainable=True,
+          dtype=self.dtype)
+    else:
+      self.bias = None
+    self.input_spec = base.InputSpec(
+        ndim=self.rank + 2, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs):
+    outputs = nn.convolution(
+        input=inputs,
+        filter=self.masked_kernel,
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=utils.convert_data_format(self.data_format, self.rank + 2))
+
+    if self.bias is not None:
+      if self.data_format == 'channels_first':
+        if self.rank == 1:
+          # nn.bias_add does not accept a 1D input tensor.
+          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
+          outputs += bias
+        if self.rank == 2:
+          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
+        if self.rank == 3:
+          # As of Mar 2017, direct addition is significantly slower than
+          # bias_add when computing gradients. To use bias_add, we collapse Z
+          # and Y into a single dimension to obtain a 4D input tensor.
+          outputs_shape = outputs.shape.as_list()
+          outputs_4d = array_ops.reshape(outputs, [
+              outputs_shape[0], outputs_shape[1],
+              outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+          ])
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
+          outputs = array_ops.reshape(outputs_4d, outputs_shape)
+      else:
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+
+class MaskedConv2D(_MaskedConv):
+  """2D convolution layer (e.g. spatial convolution over images).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+      height and width of the 2D convolution window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the convolution along the height and width.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format='channels_last',
+               dilation_rate=(1, 1),
+               activation=None,
+               use_bias=True,
+               kernel_initializer=None,
+               bias_initializer=init_ops.zeros_initializer(),
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(MaskedConv2D, self).__init__(
+        rank=2,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+
+
+class MaskedFullyConnected(base.Layer):
+  """Fully-connected layer class with masked weights.
+
+  This layer implements the operation:
+  `outputs = activation(inputs.kernel + bias)`
+  Where `activation` is the activation function passed as the `activation`
+  argument (if not `None`), `kernel` is a weights matrix created by the layer,
+  and `bias` is a bias vector created by the layer
+  (only if `use_bias` is `True`).
+
+  Note: if the input to the layer has a rank greater than 2, then it is
+  flattened prior to the initial matrix multiply by `kernel`.
+
+  Arguments:
+    units: Integer or Long, dimensionality of the output space.
+    activation: Activation function (callable). Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: Initializer function for the weight matrix.
+    bias_initializer: Initializer function for the bias.
+    kernel_regularizer: Regularizer function for the weight matrix.
+    bias_regularizer: Regularizer function for the bias.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require reuse=True in such cases.
+    reuse: Boolean, whether to reuse the weights of a previous layer
+      by the same name.
+
+  Properties:
+    units: Python integer, dimensionality of the output space.
+    activation: Activation function (callable).
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: Initializer instance (or name) for the weight matrix.
+    bias_initializer: Initializer instance (or name) for the bias.
+    kernel_regularizer: Regularizer instance for the weight matrix (callable)
+    bias_regularizer: Regularizer instance for the bias (callable).
+    activity_regularizer: Regularizer instance for the output (callable)
+    kernel: Weight matrix (TensorFlow variable or tensor).
+    bias: Bias vector, if applicable (TensorFlow variable or tensor).
+  """
+
+  def __init__(self,
+               units,
+               activation=None,
+               use_bias=True,
+               kernel_initializer=None,
+               bias_initializer=init_ops.zeros_initializer(),
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(MaskedFullyConnected, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=activity_regularizer,
+        **kwargs)
+    self.units = units
+    self.activation = activation
+    self.use_bias = use_bias
+    self.kernel_initializer = kernel_initializer
+    self.bias_initializer = bias_initializer
+    self.kernel_regularizer = kernel_regularizer
+    self.bias_regularizer = bias_regularizer
+    self.input_spec = base.InputSpec(min_ndim=2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if input_shape[-1].value is None:
+      raise ValueError('The last dimension of the inputs to `Dense` '
+                       'should be defined. Found `None`.')
+    self.input_spec = base.InputSpec(
+        min_ndim=2, axes={-1: input_shape[-1].value})
+
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=[input_shape[-1].value, self.units],
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        dtype=self.dtype,
+        trainable=True)
+
+    self.mask = self.add_variable(
+        name='mask',
+        shape=[input_shape[-1].value, self.units],
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+
+    self.threshold = self.add_variable(
+        name='threshold',
+        shape=[],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+
+    # Add masked_weights in the weights namescope so as to make it easier
+    # for the quantization library to add quant ops.
+    self.masked_kernel = math_ops.multiply(self.mask, self.kernel,
+                                           MASKED_WEIGHT_NAME)
+
+    ops.add_to_collection(MASK_COLLECTION, self.mask)
+    ops.add_to_collection(MASKED_WEIGHT_COLLECTION, self.masked_kernel)
+    ops.add_to_collection(THRESHOLD_COLLECTION, self.threshold)
+    ops.add_to_collection(WEIGHT_COLLECTION, self.kernel)
+
+    if self.use_bias:
+      self.bias = self.add_variable(
+          'bias',
+          shape=[
+              self.units,
+          ],
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          dtype=self.dtype,
+          trainable=True)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    shape = inputs.get_shape().as_list()
+    output_shape = shape[:-1] + [self.units]
+    if len(output_shape) > 2:
+      # Broadcasting is required for the inputs.
+      outputs = standard_ops.tensordot(inputs, self.masked_kernel,
+                                       [[len(shape) - 1], [0]])
+      # Reshape the output back to the original ndim of the input.
+      outputs.set_shape(output_shape)
+    else:
+      outputs = standard_ops.matmul(inputs, self.masked_kernel)
+    if self.use_bias:
+      outputs = nn.bias_add(outputs, self.bias)
+    if self.activation is not None:
+      return self.activation(outputs)  # pylint: disable=not-callable
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input_shape must be defined, but saw: %s'
+          % input_shape)
+    return input_shape[:-1].concatenate(self.units)
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers.py b/tensorflow/contrib/model_pruning/python/layers/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfebb9a6794056dd43b0699ccbcc5797f2f172f7
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/layers/layers.py
@@ -0,0 +1,364 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tensorflow layers with added variables for parameter masking.
+
+Branched from tensorflow/contrib/layers/python/layers/layers.py
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.contrib.framework.python.ops import add_arg_scope
+from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.contrib.layers.python.layers import initializers
+from tensorflow.contrib.layers.python.layers import utils
+from tensorflow.contrib.model_pruning.python.layers import core_layers as core
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as tf_variables
+
+
+def _model_variable_getter(getter,
+                           name,
+                           shape=None,
+                           dtype=None,
+                           initializer=None,
+                           regularizer=None,
+                           trainable=True,
+                           collections=None,
+                           caching_device=None,
+                           partitioner=None,
+                           rename=None,
+                           use_resource=None,
+                           **_):
+  """Getter that uses model_variable for compatibility with core layers."""
+  short_name = name.split('/')[-1]
+  if rename and short_name in rename:
+    name_components = name.split('/')
+    name_components[-1] = rename[short_name]
+    name = '/'.join(name_components)
+  return variables.model_variable(
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      collections=collections,
+      trainable=trainable,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      custom_getter=getter,
+      use_resource=use_resource)
+
+
+def _build_variable_getter(rename=None):
+  """Build a model variable getter that respects scope getter and renames."""
+
+  # VariableScope will nest the getters
+  def layer_variable_getter(getter, *args, **kwargs):
+    kwargs['rename'] = rename
+    return _model_variable_getter(getter, *args, **kwargs)
+
+  return layer_variable_getter
+
+
+def _add_variable_to_collections(variable, collections_set, collections_name):
+  """Adds variable (or all its parts) to all collections with that name."""
+  collections = utils.get_variable_collections(collections_set,
+                                               collections_name) or []
+  variables_list = [variable]
+  if isinstance(variable, tf_variables.PartitionedVariable):
+    variables_list = [v for v in variable]
+  for collection in collections:
+    for var in variables_list:
+      if var not in ops.get_collection(collection):
+        ops.add_to_collection(collection, var)
+
+
+@add_arg_scope
+def masked_convolution(inputs,
+                       num_outputs,
+                       kernel_size,
+                       stride=1,
+                       padding='SAME',
+                       data_format=None,
+                       rate=1,
+                       activation_fn=nn.relu,
+                       normalizer_fn=None,
+                       normalizer_params=None,
+                       weights_initializer=initializers.xavier_initializer(),
+                       weights_regularizer=None,
+                       biases_initializer=init_ops.zeros_initializer(),
+                       biases_regularizer=None,
+                       reuse=None,
+                       variables_collections=None,
+                       outputs_collections=None,
+                       trainable=True,
+                       scope=None):
+  """Adds an 2D convolution followed by an optional batch_norm layer.
+  The layer creates a mask variable on top of the weight variable. The input to
+  the convolution operation is the elementwise multiplication of the mask
+  variable and the weigh
+
+  It is required that 1 <= N <= 3.
+
+  `convolution` creates a variable called `weights`, representing the
+  convolutional kernel, that is convolved (actually cross-correlated) with the
+  `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is
+  provided (such as `batch_norm`), it is then applied. Otherwise, if
+  `normalizer_fn` is None and a `biases_initializer` is provided then a `biases`
+  variable would be created and added the activations. Finally, if
+  `activation_fn` is not `None`, it is applied to the activations as well.
+
+  Performs atrous convolution with input stride/dilation rate equal to `rate`
+  if a value > 1 for any dimension of `rate` is specified.  In this case
+  `stride` values != 1 are not supported.
+
+  Args:
+    inputs: A Tensor of rank N+2 of shape
+      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
+      not start with "NC" (default), or
+      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
+      with "NC".
+    num_outputs: Integer, the number of output filters.
+    kernel_size: A sequence of N positive integers specifying the spatial
+      dimensions of of the filters.  Can be a single integer to specify the same
+      value for all spatial dimensions.
+    stride: A sequence of N positive integers specifying the stride at which to
+      compute output.  Can be a single integer to specify the same value for all
+      spatial dimensions.  Specifying any `stride` value != 1 is incompatible
+      with specifying any `rate` value != 1.
+    padding: One of `"VALID"` or `"SAME"`.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
+      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    rate: A sequence of N positive integers specifying the dilation rate to use
+      for atrous convolution.  Can be a single integer to specify the same
+      value for all spatial dimensions.  Specifying any `rate` value != 1 is
+      incompatible with specifying any `stride` value != 1.
+    activation_fn: Activation function. The default value is a ReLU function.
+      Explicitly set it to None to skip it and maintain a linear activation.
+    normalizer_fn: Normalization function to use instead of `biases`. If
+      `normalizer_fn` is provided then `biases_initializer` and
+      `biases_regularizer` are ignored and `biases` are not created nor added.
+      default set to None for no normalizer function
+    normalizer_params: Normalization function parameters.
+    weights_initializer: An initializer for the weights.
+    weights_regularizer: Optional regularizer for the weights.
+    biases_initializer: An initializer for the biases. If None skip biases.
+    biases_regularizer: Optional regularizer for the biases.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional list of collections for all the variables or
+      a dictionary containing a different list of collection per variable.
+    outputs_collections: Collection to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    scope: Optional scope for `variable_scope`.
+
+  Returns:
+    A tensor representing the output of the operation.
+
+  Raises:
+    ValueError: If `data_format` is invalid.
+    ValueError: Both 'rate' and `stride` are not uniformly 1.
+  """
+  if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
+    raise ValueError('Invalid data_format: %r' % (data_format,))
+
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
+
+  with variable_scope.variable_scope(
+      scope, 'Conv', [inputs], reuse=reuse,
+      custom_getter=layer_variable_getter) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    input_rank = inputs.get_shape().ndims
+
+    if input_rank == 3:
+      raise ValueError('Sparse Convolution not supported for input with rank',
+                       input_rank)
+    elif input_rank == 4:
+      layer_class = core.MaskedConv2D
+    elif input_rank == 5:
+      raise ValueError('Sparse Convolution not supported for input with rank',
+                       input_rank)
+    else:
+      raise ValueError('Sparse Convolution not supported for input with rank',
+                       input_rank)
+
+    if data_format is None or data_format == 'NHWC':
+      df = 'channels_last'
+    elif data_format == 'NCHW':
+      df = 'channels_first'
+    else:
+      raise ValueError('Unsupported data fromat', data_format)
+
+    layer = layer_class(
+        filters=num_outputs,
+        kernel_size=kernel_size,
+        strides=stride,
+        padding=padding,
+        data_format=df,
+        dilation_rate=rate,
+        activation=None,
+        use_bias=not normalizer_fn and biases_initializer,
+        kernel_initializer=weights_initializer,
+        bias_initializer=biases_initializer,
+        kernel_regularizer=weights_regularizer,
+        bias_regularizer=biases_regularizer,
+        activity_regularizer=None,
+        trainable=trainable,
+        name=sc.name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=sc,
+        _reuse=reuse)
+    outputs = layer.apply(inputs)
+
+    # Add variables to collections.
+    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
+    if layer.use_bias:
+      _add_variable_to_collections(layer.bias, variables_collections, 'biases')
+
+    if normalizer_fn is not None:
+      normalizer_params = normalizer_params or {}
+      outputs = normalizer_fn(outputs, **normalizer_params)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)
+
+
+masked_conv2d = masked_convolution
+
+
+@add_arg_scope
+def masked_fully_connected(
+    inputs,
+    num_outputs,
+    activation_fn=nn.relu,
+    normalizer_fn=None,
+    normalizer_params=None,
+    weights_initializer=initializers.xavier_initializer(),
+    weights_regularizer=None,
+    biases_initializer=init_ops.zeros_initializer(),
+    biases_regularizer=None,
+    reuse=None,
+    variables_collections=None,
+    outputs_collections=None,
+    trainable=True,
+    scope=None):
+  """Adds a sparse fully connected layer. The weight matrix is masked.
+
+  `fully_connected` creates a variable called `weights`, representing a fully
+  connected weight matrix, which is multiplied by the `inputs` to produce a
+  `Tensor` of hidden units. If a `normalizer_fn` is provided (such as
+  `batch_norm`), it is then applied. Otherwise, if `normalizer_fn` is
+  None and a `biases_initializer` is provided then a `biases` variable would be
+  created and added the hidden units. Finally, if `activation_fn` is not `None`,
+  it is applied to the hidden units as well.
+
+  Note: that if `inputs` have a rank greater than 2, then `inputs` is flattened
+  prior to the initial matrix multiply by `weights`.
+
+  Args:
+    inputs: A tensor of at least rank 2 and static value for the last dimension;
+      i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
+    num_outputs: Integer or long, the number of output units in the layer.
+    activation_fn: Activation function. The default value is a ReLU function.
+      Explicitly set it to None to skip it and maintain a linear activation.
+    normalizer_fn: Normalization function to use instead of `biases`. If
+      `normalizer_fn` is provided then `biases_initializer` and
+      `biases_regularizer` are ignored and `biases` are not created nor added.
+      default set to None for no normalizer function
+    normalizer_params: Normalization function parameters.
+    weights_initializer: An initializer for the weights.
+    weights_regularizer: Optional regularizer for the weights.
+    biases_initializer: An initializer for the biases. If None skip biases.
+    biases_regularizer: Optional regularizer for the biases.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional list of collections for all the variables or
+      a dictionary containing a different list of collections per variable.
+    outputs_collections: Collection to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    scope: Optional scope for variable_scope.
+
+  Returns:
+     The tensor variable representing the result of the series of operations.
+
+  Raises:
+    ValueError: If x has rank less than 2 or if its last dimension is not set.
+  """
+  if not isinstance(num_outputs, six.integer_types):
+    raise ValueError('num_outputs should be int or long, got %s.' %
+                     (num_outputs,))
+
+  layer_variable_getter = _build_variable_getter({
+      'bias': 'biases',
+      'kernel': 'weights'
+  })
+
+  with variable_scope.variable_scope(
+      scope,
+      'fully_connected', [inputs],
+      reuse=reuse,
+      custom_getter=layer_variable_getter) as sc:
+    inputs = ops.convert_to_tensor(inputs)
+    layer = core.MaskedFullyConnected(
+        units=num_outputs,
+        activation=None,
+        use_bias=not normalizer_fn and biases_initializer,
+        kernel_initializer=weights_initializer,
+        bias_initializer=biases_initializer,
+        kernel_regularizer=weights_regularizer,
+        bias_regularizer=biases_regularizer,
+        activity_regularizer=None,
+        trainable=trainable,
+        name=sc.name,
+        dtype=inputs.dtype.base_dtype,
+        _scope=sc,
+        _reuse=reuse)
+    outputs = layer.apply(inputs)
+
+    # Add variables to collections.
+    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
+    if layer.bias is not None:
+      _add_variable_to_collections(layer.bias, variables_collections, 'biases')
+
+    # Apply normalizer function / layer.
+    if normalizer_fn is not None:
+      if not normalizer_params:
+        normalizer_params = {}
+      outputs = normalizer_fn(outputs, **normalizer_params)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+
+    return utils.collect_named_outputs(outputs_collections,
+                                       sc.original_name_scope, outputs)
diff --git a/tensorflow/contrib/model_pruning/python/layers/layers_test.py b/tensorflow/contrib/model_pruning/python/layers/layers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..97a2c978509e79f837a20595811a903a02b6a5eb
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/layers/layers_test.py
@@ -0,0 +1,139 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for imagingvision.intelligence.tensorflow.model_pruning.layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.model_pruning.python.layers import core_layers
+from tensorflow.contrib.model_pruning.python.layers import layers
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class MaskedConvolutionLayerTest(test.TestCase):
+
+  def setUp(self):
+    super(MaskedConvolutionLayerTest, self).setUp()
+    self.height, self.width = 7, 9
+
+  def testInvalidRank3(self):
+    input_tensor = array_ops.ones((self.height, self.width, 3))
+    with self.assertRaisesRegexp(ValueError, 'rank'):
+      layers.masked_conv2d(input_tensor, 32, 3)
+
+  def testInvalidRank5(self):
+    input_tensor = array_ops.ones((8, 8, self.height, self.width, 3))
+    with self.assertRaisesRegexp(ValueError, 'rank'):
+      layers.masked_conv2d(input_tensor, 32, 3)
+
+  def testSingleConvMaskAdded(self):
+    kernel_size = 3
+    input_depth, output_depth = 8, 32
+    input_tensor = array_ops.ones((8, self.height, self.width, input_depth))
+    layers.masked_conv2d(input_tensor, output_depth, kernel_size)
+
+    masks = ops.get_collection(core_layers.MASK_COLLECTION)
+    self.assertEqual(len(masks), 1)
+    self.assertListEqual(masks[0].get_shape().as_list(),
+                         [kernel_size, kernel_size, input_depth, output_depth])
+
+    masked_weight = ops.get_collection(core_layers.MASKED_WEIGHT_COLLECTION)
+    self.assertEqual(len(masked_weight), 1)
+    self.assertListEqual(masked_weight[0].get_shape().as_list(),
+                         [kernel_size, kernel_size, input_depth, output_depth])
+
+  def testMultipleConvMaskAdded(self):
+    number_of_layers = 5
+
+    kernel_size = 3
+    base_depth = 4
+    depth_step = 7
+
+    input_tensor = array_ops.ones((8, self.height, self.width, base_depth))
+
+    top_layer = input_tensor
+
+    for ix in range(number_of_layers):
+      top_layer = layers.masked_conv2d(top_layer, base_depth +
+                                       (ix + 1) * depth_step, kernel_size)
+
+    masks = ops.get_collection(core_layers.MASK_COLLECTION)
+    self.assertEqual(len(masks), number_of_layers)
+    for ix in range(number_of_layers):
+      self.assertListEqual(masks[ix].get_shape().as_list(), [
+          kernel_size, kernel_size, base_depth + ix * depth_step,
+          base_depth + (ix + 1) * depth_step
+      ])
+
+    masked_weight = ops.get_collection(core_layers.MASKED_WEIGHT_COLLECTION)
+    self.assertEqual(len(masked_weight), number_of_layers)
+    for ix in range(number_of_layers):
+      self.assertListEqual(masked_weight[ix].get_shape().as_list(), [
+          kernel_size, kernel_size, base_depth + ix * depth_step,
+          base_depth + (ix + 1) * depth_step
+      ])
+
+
+class MaskedFullyConnectedLayerTest(test.TestCase):
+
+  def testSingleFCMaskAdded(self):
+    input_depth, output_depth = 8, 32
+    input_tensor = array_ops.ones((5, input_depth))
+    layers.masked_fully_connected(input_tensor, output_depth)
+
+    masks = ops.get_collection(core_layers.MASK_COLLECTION)
+    self.assertEqual(len(masks), 1)
+    self.assertListEqual(masks[0].get_shape().as_list(),
+                         [input_depth, output_depth])
+
+    masked_weight = ops.get_collection(core_layers.MASKED_WEIGHT_COLLECTION)
+    self.assertEqual(len(masked_weight), 1)
+    self.assertListEqual(masked_weight[0].get_shape().as_list(),
+                         [input_depth, output_depth])
+
+  def testMultipleConvMaskAdded(self):
+    number_of_layers = 5
+
+    base_depth = 4
+    depth_step = 7
+
+    input_tensor = array_ops.ones((8, base_depth))
+
+    top_layer = input_tensor
+
+    for ix in range(number_of_layers):
+      top_layer = layers.masked_fully_connected(top_layer, base_depth +
+                                                (ix + 1) * depth_step)
+
+    masks = ops.get_collection(core_layers.MASK_COLLECTION)
+    self.assertEqual(len(masks), number_of_layers)
+    for ix in range(number_of_layers):
+      self.assertListEqual(masks[ix].get_shape().as_list(), [
+          base_depth + ix * depth_step, base_depth + (ix + 1) * depth_step
+      ])
+
+    masked_weight = ops.get_collection(core_layers.MASKED_WEIGHT_COLLECTION)
+    self.assertEqual(len(masked_weight), number_of_layers)
+    for ix in range(number_of_layers):
+      self.assertListEqual(masked_weight[ix].get_shape().as_list(), [
+          base_depth + ix * depth_step, base_depth + (ix + 1) * depth_step
+      ])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5b050d25d00b298a20f7ce6abdda7c1d00db899
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/layers/rnn_cells.py
@@ -0,0 +1,348 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module implementing RNN Cells with pruning.
+
+This module implements BasicLSTMCell and LSTMCell with pruning.
+Code adapted from third_party/tensorflow/python/ops/rnn_cell_impl.py
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.model_pruning.python.layers import core_layers
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import rnn_cell as tf_rnn
+
+
+class MaskedBasicLSTMCell(tf_rnn.BasicLSTMCell):
+  """Basic LSTM recurrent network cell with pruning.
+
+  Overrides the call method of tensorflow BasicLSTMCell and injects the weight
+  masks
+
+  The implementation is based on: http://arxiv.org/abs/1409.2329.
+
+  We add forget_bias (default: 1) to the biases of the forget gate in order to
+  reduce the scale of forgetting in the beginning of the training.
+
+  It does not allow cell clipping, a projection layer, and does not
+  use peep-hole connections: it is the basic baseline.
+
+  For advanced models, please use the full @{tf.nn.rnn_cell.LSTMCell}
+  that follows.
+  """
+
+  def __init__(self,
+               num_units,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=None,
+               reuse=None,
+               name=None):
+    """Initialize the basic LSTM cell with pruning.
+
+    Args:
+      num_units: int, The number of units in the LSTM cell.
+      forget_bias: float, The bias added to forget gates (see above).
+        Must set to `0.0` manually when restoring from CudnnLSTM-trained
+        checkpoints.
+      state_is_tuple: If True, accepted and returned states are 2-tuples of
+        the `c_state` and `m_state`.  If False, they are concatenated
+        along the column axis.  The latter behavior will soon be deprecated.
+      activation: Activation function of the inner states.  Default: `tanh`.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+      name: String, the name of the layer. Layers with the same name will
+        share weights, but to avoid mistakes we require reuse=True in such
+        cases.
+
+      When restoring from CudnnLSTM-trained checkpoints, must use
+      CudnnCompatibleLSTMCell instead.
+    """
+    super(MaskedBasicLSTMCell, self).__init__(
+        num_units,
+        forget_bias=forget_bias,
+        state_is_tuple=state_is_tuple,
+        activation=activation,
+        reuse=reuse,
+        name=name)
+
+  def build(self, inputs_shape):
+    # Call the build method of the parent class.
+    super(MaskedBasicLSTMCell, self).build(inputs_shape)
+
+    self.built = False
+
+    input_depth = inputs_shape[1].value
+    h_depth = self._num_units
+    self._mask = self.add_variable(
+        name="mask",
+        shape=[input_depth + h_depth, 4 * h_depth],
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+    self._threshold = self.add_variable(
+        name="threshold",
+        shape=[],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+    # Add masked_weights in the weights namescope so as to make it easier
+    # for the quantization library to add quant ops.
+    self._masked_kernel = math_ops.multiply(self._mask, self._kernel,
+                                            core_layers.MASKED_WEIGHT_NAME)
+    if self._mask not in ops.get_collection_ref(core_layers.MASK_COLLECTION):
+      ops.add_to_collection(core_layers.MASK_COLLECTION, self._mask)
+      ops.add_to_collection(core_layers.MASKED_WEIGHT_COLLECTION,
+                            self._masked_kernel)
+      ops.add_to_collection(core_layers.THRESHOLD_COLLECTION, self._threshold)
+      ops.add_to_collection(core_layers.WEIGHT_COLLECTION, self._kernel)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Long short-term memory cell (LSTM) with masks for pruning.
+
+    Args:
+      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
+      state: An `LSTMStateTuple` of state tensors, each shaped
+        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
+        `True`.  Otherwise, a `Tensor` shaped
+        `[batch_size, 2 * self.state_size]`.
+
+    Returns:
+      A pair containing the new hidden state, and the new state (either a
+        `LSTMStateTuple` or a concatenated state, depending on
+        `state_is_tuple`).
+    """
+    sigmoid = math_ops.sigmoid
+    one = constant_op.constant(1, dtype=dtypes.int32)
+    # Parameters of gates are concatenated into one multiply for efficiency.
+    if self._state_is_tuple:
+      c, h = state
+    else:
+      c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one)
+
+    gate_inputs = math_ops.matmul(
+        array_ops.concat([inputs, h], 1), self._masked_kernel)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
+
+    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+    i, j, f, o = array_ops.split(
+        value=gate_inputs, num_or_size_splits=4, axis=one)
+
+    forget_bias_tensor = constant_op.constant(self._forget_bias, dtype=f.dtype)
+    # Note that using `add` and `multiply` instead of `+` and `*` gives a
+    # performance improvement. So using those at the cost of readability.
+    add = math_ops.add
+    multiply = math_ops.multiply
+    new_c = add(
+        multiply(c, sigmoid(add(f, forget_bias_tensor))),
+        multiply(sigmoid(i), self._activation(j)))
+    new_h = multiply(self._activation(new_c), sigmoid(o))
+
+    if self._state_is_tuple:
+      new_state = tf_rnn.LSTMStateTuple(new_c, new_h)
+    else:
+      new_state = array_ops.concat([new_c, new_h], 1)
+    return new_h, new_state
+
+
+class MaskedLSTMCell(tf_rnn.LSTMCell):
+  """LSTMCell with pruning.
+
+  Overrides the call method of tensorflow LSTMCell and injects the weight masks.
+  Masks are applied to only the weight matrix of the LSTM and not the
+  projection matrix.
+  """
+
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               num_unit_shards=None,
+               num_proj_shards=None,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=None,
+               reuse=None):
+    """Initialize the parameters for an LSTM cell with masks for pruning.
+
+    Args:
+      num_units: int, The number of units in the LSTM cell
+      use_peepholes: bool, set True to enable diagonal/peephole connections.
+      cell_clip: (optional) A float value, if provided the cell state is clipped
+        by this value prior to the cell output activation.
+      initializer: (optional) The initializer to use for the weight and
+        projection matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
+        provided, then the projected values are clipped elementwise to within
+        `[-proj_clip, proj_clip]`.
+      num_unit_shards: Deprecated, will be removed by Jan. 2017.
+        Use a variable_scope partitioner instead.
+      num_proj_shards: Deprecated, will be removed by Jan. 2017.
+        Use a variable_scope partitioner instead.
+      forget_bias: Biases of the forget gate are initialized by default to 1
+        in order to reduce the scale of forgetting at the beginning of
+        the training. Must set it manually to `0.0` when restoring from
+        CudnnLSTM trained checkpoints.
+      state_is_tuple: If True, accepted and returned states are 2-tuples of
+        the `c_state` and `m_state`.  If False, they are concatenated
+        along the column axis.  This latter behavior will soon be deprecated.
+      activation: Activation function of the inner states.  Default: `tanh`.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+
+      When restoring from CudnnLSTM-trained checkpoints, must use
+      CudnnCompatibleLSTMCell instead.
+    """
+    super(MaskedLSTMCell, self).__init__(
+        num_units,
+        use_peepholes=use_peepholes,
+        cell_clip=cell_clip,
+        initializer=initializer,
+        num_proj=num_proj,
+        proj_clip=proj_clip,
+        num_unit_shards=num_unit_shards,
+        num_proj_shards=num_proj_shards,
+        forget_bias=forget_bias,
+        state_is_tuple=state_is_tuple,
+        activation=activation,
+        reuse=reuse)
+
+  def build(self, inputs_shape):
+    # Call the build method of the parent class.
+    super(MaskedLSTMCell, self).build(inputs_shape)
+
+    self.built = False
+
+    input_depth = inputs_shape[1].value
+    h_depth = self._num_units
+    self._mask = self.add_variable(
+        name="mask",
+        shape=[input_depth + h_depth, 4 * h_depth],
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+    self._threshold = self.add_variable(
+        name="threshold",
+        shape=[],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=self.dtype)
+    # Add masked_weights in the weights namescope so as to make it easier
+    # for the quantization library to add quant ops.
+    self._masked_kernel = math_ops.multiply(self._mask, self._kernel,
+                                            core_layers.MASKED_WEIGHT_NAME)
+    if self._mask not in ops.get_collection_ref(core_layers.MASK_COLLECTION):
+      ops.add_to_collection(core_layers.MASK_COLLECTION, self._mask)
+      ops.add_to_collection(core_layers.MASKED_WEIGHT_COLLECTION,
+                            self._masked_kernel)
+      ops.add_to_collection(core_layers.THRESHOLD_COLLECTION, self._threshold)
+      ops.add_to_collection(core_layers.WEIGHT_COLLECTION, self._kernel)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Run one step of LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, `[batch, num_units].
+      state: if `state_is_tuple` is False, this must be a state Tensor,
+        `2-D, [batch, state_size]`.  If `state_is_tuple` is True, this must be a
+        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
+        `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch, output_dim]`, Tensor representing the output of the
+        LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - Tensor(s) representing the new state of LSTM after reading `inputs` when
+        the previous state was `state`.  Same type and shape(s) as `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    num_proj = self._num_units if self._num_proj is None else self._num_proj
+    sigmoid = math_ops.sigmoid
+
+    if self._state_is_tuple:
+      (c_prev, m_prev) = state
+    else:
+      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
+      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
+
+    input_size = inputs.get_shape().with_rank(2)[1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+    lstm_matrix = math_ops.matmul(
+        array_ops.concat([inputs, m_prev], 1), self._masked_kernel)
+    lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)
+
+    i, j, f, o = array_ops.split(
+        value=lstm_matrix, num_or_size_splits=4, axis=1)
+    # Diagonal connections
+    if self._use_peepholes:
+      c = (
+          sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * c_prev +
+          sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
+    else:
+      c = (
+          sigmoid(f + self._forget_bias) * c_prev +
+          sigmoid(i) * self._activation(j))
+
+    if self._cell_clip is not None:
+      # pylint: disable=invalid-unary-operand-type
+      c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+      # pylint: enable=invalid-unary-operand-type
+    if self._use_peepholes:
+      m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
+    else:
+      m = sigmoid(o) * self._activation(c)
+
+    if self._num_proj is not None:
+      m = math_ops.matmul(m, self._proj_kernel)
+
+      if self._proj_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+        # pylint: enable=invalid-unary-operand-type
+
+    new_state = (
+        tf_rnn.LSTMStateTuple(c, m)
+        if self._state_is_tuple else array_ops.concat([c, m], 1))
+    return m, new_state
diff --git a/tensorflow/contrib/model_pruning/python/layers/rnn_cells_test.py b/tensorflow/contrib/model_pruning/python/layers/rnn_cells_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e85ae7b22a545045ec42ba86e9aed9cd7e6103f7
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/layers/rnn_cells_test.py
@@ -0,0 +1,85 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for creating different number of masks in rnn_cells."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.model_pruning.python import pruning
+from tensorflow.contrib.model_pruning.python.layers import rnn_cells
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn_cell as tf_rnn_cells
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class RnnCellsTest(test.TestCase):
+
+  def setUp(self):
+    super(RnnCellsTest, self).setUp()
+    self.batch_size = 8
+    self.dim = 10
+
+  def testMaskedBasicLSTMCell(self):
+    expected_num_masks = 1
+    expected_num_rows = 2 * self.dim
+    expected_num_cols = 4 * self.dim
+    with self.test_session():
+      inputs = variables.Variable(
+          random_ops.random_normal([self.batch_size, self.dim]))
+      c = variables.Variable(
+          random_ops.random_normal([self.batch_size, self.dim]))
+      h = variables.Variable(
+          random_ops.random_normal([self.batch_size, self.dim]))
+      state = tf_rnn_cells.LSTMStateTuple(c, h)
+      lstm_cell = rnn_cells.MaskedBasicLSTMCell(self.dim)
+      lstm_cell(inputs, state)
+      self.assertEqual(len(pruning.get_masks()), expected_num_masks)
+      self.assertEqual(len(pruning.get_masked_weights()), expected_num_masks)
+      self.assertEqual(len(pruning.get_thresholds()), expected_num_masks)
+      self.assertEqual(len(pruning.get_weights()), expected_num_masks)
+
+      for mask in pruning.get_masks():
+        self.assertEqual(mask.shape, (expected_num_rows, expected_num_cols))
+      for weight in pruning.get_weights():
+        self.assertEqual(weight.shape, (expected_num_rows, expected_num_cols))
+
+  def testMaskedLSTMCell(self):
+    expected_num_masks = 1
+    expected_num_rows = 2 * self.dim
+    expected_num_cols = 4 * self.dim
+    with self.test_session():
+      inputs = variables.Variable(
+          random_ops.random_normal([self.batch_size, self.dim]))
+      c = variables.Variable(
+          random_ops.random_normal([self.batch_size, self.dim]))
+      h = variables.Variable(
+          random_ops.random_normal([self.batch_size, self.dim]))
+      state = tf_rnn_cells.LSTMStateTuple(c, h)
+      lstm_cell = rnn_cells.MaskedLSTMCell(self.dim)
+      lstm_cell(inputs, state)
+      self.assertEqual(len(pruning.get_masks()), expected_num_masks)
+      self.assertEqual(len(pruning.get_masked_weights()), expected_num_masks)
+      self.assertEqual(len(pruning.get_thresholds()), expected_num_masks)
+      self.assertEqual(len(pruning.get_weights()), expected_num_masks)
+
+      for mask in pruning.get_masks():
+        self.assertEqual(mask.shape, (expected_num_rows, expected_num_cols))
+      for weight in pruning.get_weights():
+        self.assertEqual(weight.shape, (expected_num_rows, expected_num_cols))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/model_pruning/python/learning.py b/tensorflow/contrib/model_pruning/python/learning.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b79c23cefe961b1c4056d41b5fcc0a0521efec6
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/learning.py
@@ -0,0 +1,188 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Wrapper around tf-slim's training code contrib/slim/python/slim/learning.py
+to support training of pruned models
+
+*******************************************************************
+* A simple working training script with support for model pruning *
+*******************************************************************
+
+  # Load data and create the model:
+  images, labels = LoadData(...)
+  predictions = MyModel(images)
+
+  # Define the loss:
+  slim.losses.log_loss(predictions, labels)
+  total_loss = slim.losses.get_total_loss()
+
+  # Define the optimizer:
+  optimizer = tf.train.MomentumOptimizer(FLAGS.learning_rate, FLAGS.momentum)
+
+  # Create the train_op
+  train_op = slim.learning.create_train_op(total_loss, optimizer)
+
+  # Set up sparsity
+  sparsity = pruning.setup_gradual_sparsity(self.global_step)
+
+  # Create mask update op
+  mask_update_op = pruning.add_mask_update_ip(sparsity)
+
+  # Run training.
+  learning.train(train_op,
+                 my_log_dir,
+                 mask_update_op)
+  see contrib/slim/python/slim/learning.py for additional examples
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib import slim as _slim
+
+_USE_DEFAULT = 0
+train_step = _slim.learning.train_step
+
+
+def train(train_op,
+          logdir,
+          mask_update_op,
+          train_step_fn=train_step,
+          train_step_kwargs=_USE_DEFAULT,
+          log_every_n_steps=1,
+          graph=None,
+          master='',
+          is_chief=True,
+          global_step=None,
+          number_of_steps=None,
+          init_op=_USE_DEFAULT,
+          init_feed_dict=None,
+          local_init_op=_USE_DEFAULT,
+          init_fn=None,
+          ready_op=_USE_DEFAULT,
+          summary_op=_USE_DEFAULT,
+          save_summaries_secs=600,
+          summary_writer=_USE_DEFAULT,
+          startup_delay_steps=0,
+          saver=None,
+          save_interval_secs=600,
+          sync_optimizer=None,
+          session_config=None,
+          trace_every_n_steps=None):
+  """Wrapper around tf-slim's train function.
+
+  Runs a training loop using a TensorFlow supervisor.
+  When the sync_optimizer is supplied, gradient updates are applied
+  synchronously. Otherwise, gradient updates are applied asynchronous.
+
+  Args:
+    train_op: A `Tensor` that, when executed, will apply the gradients and
+      return the loss value.
+    logdir: The directory where training logs are written to. If None, model
+      checkpoints and summaries will not be written.
+    mask_update_op: Operation that upon execution updates the weight masks and
+      thresholds.
+    train_step_fn: The function to call in order to execute a single gradient
+      step. The function must have take exactly four arguments: the current
+      session, the `train_op` `Tensor`, a global step `Tensor` and a dictionary.
+    train_step_kwargs: A dictionary which is passed to the `train_step_fn`. By
+      default, two `Boolean`, scalar ops called "should_stop" and "should_log"
+      are provided.
+    log_every_n_steps: The frequency, in terms of global steps, that the loss
+      and global step and logged.
+    graph: The graph to pass to the supervisor. If no graph is supplied the
+      default graph is used.
+    master: The address of the tensorflow master.
+    is_chief: Specifies whether or not the training is being run by the primary
+      replica during replica training.
+    global_step: The `Tensor` representing the global step. If left as `None`,
+      then slim.variables.get_or_create_global_step() is used.
+    number_of_steps: The max number of gradient steps to take during training,
+      as measured by 'global_step': training will stop if global_step is
+      greater than 'number_of_steps'. If the value is left as None, training
+      proceeds indefinitely.
+    init_op: The initialization operation. If left to its default value, then
+      the session is initialized by calling `tf.global_variables_initializer()`.
+    init_feed_dict: A feed dictionary to use when executing the `init_op`.
+    local_init_op: The local initialization operation. If left to its default
+      value, then the session is initialized by calling
+      `tf.local_variables_initializer()` and `tf.tables_initializer()`.
+    init_fn: An optional callable to be executed after `init_op` is called. The
+      callable must accept one argument, the session being initialized.
+    ready_op: Operation to check if the model is ready to use. If left to its
+      default value, then the session checks for readiness by calling
+      `tf.report_uninitialized_variables()`.
+    summary_op: The summary operation.
+    save_summaries_secs: How often, in seconds, to save summaries.
+    summary_writer: `SummaryWriter` to use.  Can be `None`
+      to indicate that no summaries should be written. If unset, we
+      create a SummaryWriter.
+    startup_delay_steps: The number of steps to wait for before beginning. Note
+      that this must be 0 if a sync_optimizer is supplied.
+    saver: Saver to save checkpoints. If None, a default one will be created
+      and used.
+    save_interval_secs: How often, in seconds, to save the model to `logdir`.
+    sync_optimizer: an instance of tf.train.SyncReplicasOptimizer, or a list of
+      them. If the argument is supplied, gradient updates will be synchronous.
+      If left as `None`, gradient updates will be asynchronous.
+    session_config: An instance of `tf.ConfigProto` that will be used to
+      configure the `Session`. If left as `None`, the default will be used.
+    trace_every_n_steps: produce and save a `Timeline` in Chrome trace format
+      and add it to the summaries every `trace_every_n_steps`. If None, no trace
+      information will be produced or saved.
+
+  Returns:
+    the value of the loss function after training.
+
+  Raises:
+    ValueError: if `train_op` is empty or if `startup_delay_steps` is
+      non-zero when `sync_optimizer` is supplied, if `number_of_steps` is
+      negative, or if `trace_every_n_steps` is not `None` and no `logdir` is
+      provided.
+  """
+
+  def train_step_with_pruning_fn(sess, train_op, global_step,
+                                 train_step_kwargs):
+    total_loss, should_stop = train_step_fn(sess, train_op, global_step,
+                                            train_step_kwargs)
+    sess.run(mask_update_op)
+    return total_loss, should_stop
+
+  total_loss, _ = _slim.learning.train(
+      train_op,
+      logdir,
+      train_step_fn=train_step_with_pruning_fn,
+      train_step_kwargs=train_step_kwargs,
+      log_every_n_steps=log_every_n_steps,
+      graph=graph,
+      master=master,
+      is_chief=is_chief,
+      global_step=global_step,
+      number_of_steps=number_of_steps,
+      init_op=init_op,
+      init_feed_dict=init_feed_dict,
+      local_init_op=local_init_op,
+      init_fn=init_fn,
+      ready_op=ready_op,
+      summary_op=summary_op,
+      save_summaries_secs=save_summaries_secs,
+      summary_writer=summary_writer,
+      startup_delay_steps=startup_delay_steps,
+      saver=saver,
+      save_interval_secs=save_interval_secs,
+      sync_optimizer=sync_optimizer,
+      session_config=session_config,
+      trace_every_n_steps=trace_every_n_steps)
+
+  return total_loss
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
new file mode 100644
index 0000000000000000000000000000000000000000..39eb79daf07766e99aea8149f4424ce12c4fcf27
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -0,0 +1,580 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helper functions to add support for magnitude-based model pruning.
+
+  # Adds variables and ops to the graph to enable
+  # elementwise masking of weights
+  apply_mask(weights)
+
+  # Returns a list containing the sparsity of each of the weight tensors
+  get_weight_sparsity()
+
+  # Returns a list of all the masked weight tensorflow variables
+  get_masked_weights()
+
+  # Returns a list of all the mask tensorflow variables
+  get_masks()
+
+  # Returns a list of all the thresholds
+  get_thresholds()
+
+  # Returns a list of all the weight tensors that have been masked
+  get_weights()
+
+  The Pruning class uses a proto (defined in pruning.proto) to set up the
+  parameters for a pruning specification. Here's a typical usage:
+
+  # Initialize a pruning spec from a proto
+  pruning_spec = '/tmp/pruning.pb'
+  p = Pruning(pruning_spec)
+
+  # Add mask update ops to the graph
+  mask_update_op = p.conditional_mask_update_op()
+
+  # Add the summaries
+  p.add_pruning_summaries()
+
+  # Run the op
+  session.run(mask_update_op)
+
+  # An object of the pruning also accepts externally defined sparsity:
+  sparsity = tf.Variable(0.5, name = "ConstantSparsity")
+  pruning_spec = '/tmp/pruning.pb'
+  p = Pruning(pruning_spec, sparsity=sparsity)
+
+"""
+# pylint: disable=missing-docstring
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.model_pruning.python.layers import core_layers as core
+from tensorflow.contrib.training.python.training import hparam
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+_MASK_COLLECTION = core.MASK_COLLECTION
+_THRESHOLD_COLLECTION = core.THRESHOLD_COLLECTION
+_MASKED_WEIGHT_COLLECTION = core.MASKED_WEIGHT_COLLECTION
+_WEIGHT_COLLECTION = core.WEIGHT_COLLECTION
+_MASKED_WEIGHT_NAME = core.MASKED_WEIGHT_NAME
+
+
+def _weight_mask_variable(var, scope):
+  """Create a mask for the weights.
+
+  This function adds a variable 'mask' to the graph.
+
+  Args:
+    var: the weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    the mask variable of the same size and shape as var, initialized to all 1s.
+  """
+  with variable_scope.variable_scope(scope):
+    mask = variable_scope.get_variable(
+        'mask',
+        var.get_shape(),
+        initializer=init_ops.ones_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+  return mask
+
+
+def _weight_threshold_variable(var, scope):
+  """Create a scalar threshold for the weights.
+
+  This function adds a variable
+  'threshold' to the graph.
+
+  Args:
+    var: The weight variable that needs to be masked
+    scope: The variable scope of the variable var
+
+  Returns:
+    a scalar threshold variable initialized to 0.
+  """
+  with variable_scope.variable_scope(scope):
+    threshold = variable_scope.get_variable(
+        'threshold', [],
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        dtype=var.dtype)
+    return threshold
+
+
+def _histogram(values, value_range, nbins=100, dtype=np.int32, name=None):
+  """Return histogram of values.
+
+  Given the tensor `values`, this operation returns a rank 1 histogram counting
+  the number of entries in `values` that fell into every bin.  The bins are
+  equal width and determined by the arguments `value_range` and `nbins`.
+
+  Args:
+    values:  Numeric `Tensor`.
+    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
+      values <= value_range[0] will be mapped to hist[0],
+      values >= value_range[1] will be mapped to hist[-1].
+    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
+    dtype:  dtype for returned histogram.
+    name:  A name for this operation (defaults to 'histogram').
+
+  Returns:
+    A 1-D `Tensor` holding histogram of values.
+
+  """
+  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
+    values = ops.convert_to_tensor(values, name='values')
+    values = gen_array_ops.reshape(values, [-1])
+    value_range = ops.convert_to_tensor(value_range, name='value_range')
+    nbins = ops.convert_to_tensor(nbins, dtype=np.int32, name='nbins')
+    nbins_float = math_ops.cast(nbins, values.dtype)
+
+    # Map tensor values that fall within value_range to [0, 1].
+    scaled_values = math_ops.truediv(
+        values - value_range[0],
+        value_range[1] - value_range[0],
+        name='scaled_values')
+
+    # map tensor values within the open interval value_range to {0,.., nbins-1},
+    # values outside the open interval will be zero or less, or nbins or more.
+    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
+
+    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
+    indices = math_ops.cast(
+        clip_ops.clip_by_value(indices, 0, nbins_float - 1), np.int32)
+
+    return math_ops.unsorted_segment_sum(
+        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
+
+
+def _determine_partitioned_axis(partitioned_variable):
+  partitioned_axis = 0
+  concatenated_variable_shape = partitioned_variable.get_shape()
+  for partition in partitioned_variable:
+    partition_shape = partition.get_shape()
+    maybe_partitioned_axis = np.less(partition_shape,
+                                     concatenated_variable_shape)
+    # Sanity check: make sure number of partitioned axis == 1
+    if np.count_nonzero(maybe_partitioned_axis) != 1:
+      raise ValueError('Number of partitioned axes %s not equal to 1' %
+                       np.count_nonzero(maybe_partitioned_axis))
+    partitioned_axis = np.where(maybe_partitioned_axis)[0][0]
+  return partitioned_axis
+
+
+def _variable_assign(var, new_value):
+  return state_ops.assign(var, new_value, name=var.op.name + '_assign')
+
+
+def _partitioned_variable_assign(partitioned_var, new_value):
+  """Assign op for partitioned variables.
+
+  Args:
+    partitioned_var: A partitioned tensotflow variable
+    new_value: Value to be assigned to the variable var
+
+  Returns:
+    A tensorflow op that groups the assign ops for each of the variable slices
+  """
+  # Determine which axis was used to partition the variable. Currently
+  # tensorflow allows partitioning variable only along 1 axis.
+  axis = 0 if len(partitioned_var) == 1 else _determine_partitioned_axis(
+      partitioned_var)
+
+  partition_sizes = np.array(
+      [partition.get_shape()[axis] for partition in partitioned_var])
+  new_partitioned_values = array_ops.split(
+      new_value,
+      ops.convert_to_tensor(partition_sizes, dtype=np.int32),
+      axis=axis)
+  op_list = []
+  for partition in partitioned_var:
+    op_list.append(
+        _variable_assign(partition, new_partitioned_values[len(op_list)]))
+  return control_flow_ops.group(
+      *op_list, name=partitioned_var.name + '_group_assign')
+
+
+def apply_mask(x, scope=''):
+  """Apply mask to a given weight tensor.
+
+  Args:
+    x: Input weight tensor
+    scope: The current variable scope. Defaults to ""
+  Returns:
+    Tensor representing masked_weights
+  """
+
+  mask = _weight_mask_variable(x, scope)
+  threshold = _weight_threshold_variable(x, scope)
+  # Add masked_weights in the weights namescope so as to make it easier
+  # for the quantization library to add quant ops.
+  masked_weights = math_ops.multiply(mask, x, _MASKED_WEIGHT_NAME)
+
+  # Make sure the mask for a given variable are not added multiple times to the
+  # collection. This is particularly important when applying mask to RNN's
+  # weight variables
+  if mask not in ops.get_collection_ref(_MASK_COLLECTION):
+    ops.add_to_collection(_THRESHOLD_COLLECTION, threshold)
+    ops.add_to_collection(_MASK_COLLECTION, mask)
+    ops.add_to_collection(_MASKED_WEIGHT_COLLECTION, masked_weights)
+    ops.add_to_collection(_WEIGHT_COLLECTION, x)
+  return masked_weights
+
+
+def get_masked_weights():
+  return ops.get_collection(_MASKED_WEIGHT_COLLECTION)
+
+
+def get_masks():
+  return ops.get_collection(_MASK_COLLECTION)
+
+
+def get_thresholds():
+  return ops.get_collection(_THRESHOLD_COLLECTION)
+
+
+def get_weights():
+  return ops.get_collection(_WEIGHT_COLLECTION)
+
+
+def get_weight_sparsity():
+  """Get sparsity of the weights.
+
+  Args:
+    None
+
+  Returns:
+    A list containing the sparsity of each of the weight tensors
+  """
+  masks = get_masks()
+  return [nn_impl.zero_fraction(mask) for mask in masks]
+
+
+def get_pruning_hparams():
+  """Get a tf.HParams object with the default values for the hyperparameters.
+
+    name: string
+      name of the pruning specification. Used for adding summaries and ops under
+      a common tensorflow name_scope
+    begin_pruning_step: integer
+      the global step at which to begin pruning
+    end_pruning_step: integer
+      the global step at which to terminate pruning. Defaults to -1 implying
+      that pruning continues till the training stops
+    do_not_prune: list of strings
+      list of layers that are not pruned
+    threshold_decay: float
+      the decay factor to use for exponential decay of the thresholds
+    pruning_frequency: integer
+      How often should the masks be updated? (in # of global_steps)
+    nbins: integer
+      number of bins to use for histogram computation
+    initial_sparsity: float
+      initial sparsity value
+    target_sparsity: float
+      target sparsity value
+    sparsity_function_begin_step: integer
+      the global step at this which the gradual sparsity function begins to
+      take effect
+    sparsity_function_end_step: integer
+      the global step used as the end point for the gradual sparsity function
+    sparsity_function_exponent: float
+      exponent = 1 is linearly varying sparsity between initial and final.
+      exponent > 1 varies more slowly towards the end than the beginning
+
+    We use the following sparsity function:
+
+    num_steps = (sparsity_function_end_step -
+                 sparsity_function_begin_step)/pruning_frequency
+    sparsity(step) = (initial_sparsity - target_sparsity)*
+                     [1-step/(num_steps -1)]**exponent + target_sparsity
+
+  Args:
+    None
+
+  Returns:
+    tf.HParams object initialized to default values
+
+  """
+  return hparam.HParams(
+      name='model_pruning',
+      begin_pruning_step=0,
+      end_pruning_step=-1,
+      do_not_prune=[''],
+      threshold_decay=0.9,
+      pruning_frequency=10,
+      nbins=255,
+      initial_sparsity=0,
+      target_sparsity=0.5,
+      sparsity_function_begin_step=0,
+      sparsity_function_end_step=100,
+      sparsity_function_exponent=3)
+
+
+class Pruning(object):
+
+  def __init__(self, spec=None, global_step=None, sparsity=None):
+    """Set up the specification for model pruning.
+
+    If a spec is provided, the sparsity is set up based on the sparsity_function
+    in the spec. The effect of sparsity_function is overridden if the sparsity
+    variable is passed to the constructor. This enables setting up arbitrary
+    sparsity profiles externally and passing it to this pruning functions.
+
+    Args:
+      spec: Pruning spec as defined in pruning.proto
+      global_step: A tensorflow variable that is used while setting up the
+        sparsity function
+      sparsity: A tensorflow scalar variable storing the sparsity
+    """
+    # Pruning specification
+    self._spec = spec if spec else get_pruning_hparams()
+
+    # A tensorflow variable that tracks the sparsity function.
+    # If not provided as input, the graph must already contain the global_step
+    # variable before calling this constructor.
+    self._global_step = self._setup_global_step(global_step)
+
+    # Stores the tensorflow sparsity variable.
+    # Built using self._setup_sparsity() or provided externally
+    self._sparsity = sparsity if sparsity else self._setup_sparsity()
+
+    # List of tensorflow assignments ops for new masks and thresholds
+    self._assign_ops = []
+
+    # Tensorflow variable keeping track of the last global step when the masks
+    # were updated
+    self._last_update_step = self._setup_last_update_step()
+
+  def _setup_global_step(self, global_step):
+    graph_global_step = global_step
+    if graph_global_step is None:
+      graph_global_step = training_util.get_global_step()
+
+    return math_ops.cast(graph_global_step, np.int32)
+
+  def _setup_sparsity(self):
+    begin_step = self._spec.sparsity_function_begin_step
+    end_step = self._spec.sparsity_function_end_step
+    initial_sparsity = self._spec.initial_sparsity
+    target_sparsity = self._spec.target_sparsity
+    exponent = self._spec.sparsity_function_exponent
+
+    if begin_step >= end_step:
+      raise ValueError(
+          'Pruning must begin before it can end. begin_step=%d, end_step=%d' %
+          (begin_step, end_step))
+
+    with ops.name_scope(self._spec.name):
+      p = math_ops.minimum(1.0,
+                           math_ops.maximum(
+                               0.0,
+                               math_ops.div(
+                                   math_ops.cast(self._global_step - begin_step,
+                                                 np.float32),
+                                   end_step - begin_step)))
+      sparsity = math_ops.add(
+          math_ops.multiply(initial_sparsity - target_sparsity,
+                            math_ops.pow(1 - p, exponent)),
+          target_sparsity,
+          name='sparsity')
+
+    return sparsity
+
+  def _setup_last_update_step(self):
+    with variable_scope.variable_scope(self._spec.name) as scope:
+      try:
+        last_update_step = variable_scope.get_variable(
+            'last_mask_update_step', [],
+            initializer=init_ops.zeros_initializer(),
+            trainable=False,
+            dtype=np.int32)
+      except ValueError:
+        scope.reuse_variables()
+        last_update_step = variable_scope.get_variable(
+            'last_mask_update_step', dtype=np.int32)
+    return last_update_step
+
+  def _exists_in_do_not_prune_list(self, tensor_name):
+    do_not_prune_list = self._spec.do_not_prune
+    if not do_not_prune_list[0]:
+      return False
+    for layer_name in do_not_prune_list:
+      if tensor_name.find(layer_name) != -1:
+        return True
+
+    return False
+
+  def _update_mask(self, weights, threshold):
+    """Updates the mask for a given weight tensor.
+
+    This functions first computes the cdf of the weight tensor, and estimates
+    the threshold value such that 'desired_sparsity' fraction of weights
+    have magnitude less than the threshold.
+
+    Args:
+      weights: The weight tensor that needs to be masked.
+      threshold: The current threshold value. The function will compute a new
+        threshold and return the exponential moving average using the current
+        value of threshold
+
+    Returns:
+      new_threshold: The new value of the threshold based on weights, and
+        desired_sparsity
+      new_mask: A n-D numpy array containing 0 or 1 to indicate which of the
+        values in weights falls below the threshold
+
+    Raises:
+      ValueError: if sparsity is not defined
+    """
+    if self._sparsity is None:
+      raise ValueError('Sparsity variable undefined')
+
+    with ops.name_scope(weights.op.name + '_pruning_ops'):
+      abs_weights = math_ops.abs(weights)
+      max_value = math_ops.reduce_max(abs_weights)
+      histogram = _histogram(
+          abs_weights, [0.0, max_value],
+          nbins=self._spec.nbins,
+          dtype=np.float32)
+
+      cdf = math_ops.cumsum(histogram)
+      norm_cdf = math_ops.div(cdf, math_ops.reduce_sum(histogram))
+      current_threshold = math_ops.multiply(
+          math_ops.div(
+              math_ops.reduce_sum(
+                  math_ops.cast(
+                      math_ops.less(norm_cdf, self._sparsity), np.float32)),
+              float(self._spec.nbins)), max_value)
+
+      smoothed_threshold = math_ops.add_n([
+          math_ops.multiply(current_threshold, 1 - self._spec.threshold_decay),
+          math_ops.multiply(threshold, self._spec.threshold_decay)
+      ])
+      new_mask = math_ops.cast(
+          math_ops.greater(abs_weights, smoothed_threshold), np.float32)
+    return smoothed_threshold, new_mask
+
+  def _get_mask_assign_ops(self):
+    # Make sure the assignment ops have not already been added to the list
+    if self._assign_ops:
+      raise ValueError(
+          'Assign op list not empty. _get_mask_assign_ops() called twice?')
+
+    masks = get_masks()
+    weights = get_weights()
+    thresholds = get_thresholds()
+
+    if len(masks) != len(thresholds):
+      raise ValueError(
+          'Number of masks %s and number of thresholds %s mismatch' %
+          (len(masks), len(thresholds)))
+
+    for index, mask in enumerate(masks):
+      threshold = thresholds[index]
+      weight = weights[index]
+      is_partitioned = isinstance(weight, variables.PartitionedVariable)
+      if is_partitioned:
+        weight = weight.as_tensor()
+
+      if self._spec.do_not_prune:
+        if self._exists_in_do_not_prune_list(mask.name):
+          continue
+
+      new_threshold, new_mask = self._update_mask(weight, threshold)
+      self._assign_ops.append(_variable_assign(threshold, new_threshold))
+
+      self._assign_ops.append(
+          _partitioned_variable_assign(mask, new_mask)
+          if is_partitioned else _variable_assign(mask, new_mask))
+
+  def mask_update_op(self):
+    with ops.name_scope(self._spec.name):
+      if not self._assign_ops:
+        self._get_mask_assign_ops()
+      with ops.control_dependencies([
+          state_ops.assign(
+              self._last_update_step,
+              self._global_step,
+              name='last_mask_update_step_assign')
+      ]):
+        with ops.control_dependencies(self._assign_ops):
+          logging.info('Updating masks.')
+          return control_flow_ops.no_op('mask_update')
+
+  def conditional_mask_update_op(self):
+
+    def maybe_update_masks():
+      with ops.name_scope(self._spec.name):
+        is_step_within_pruning_range = math_ops.logical_and(
+            math_ops.greater_equal(self._global_step,
+                                   self._spec.begin_pruning_step),
+            # If end_pruning_step is negative, keep pruning forever!
+            math_ops.logical_or(
+                math_ops.less_equal(self._global_step,
+                                    self._spec.end_pruning_step),
+                math_ops.less(self._spec.end_pruning_step, 0)))
+        is_pruning_step = math_ops.less_equal(
+            math_ops.add(self._last_update_step, self._spec.pruning_frequency),
+            self._global_step)
+        return math_ops.logical_and(is_step_within_pruning_range,
+                                    is_pruning_step)
+
+    def mask_update_op():
+      return self.mask_update_op()
+
+    def no_update_op():
+      return control_flow_ops.no_op()
+
+    return control_flow_ops.cond(maybe_update_masks(), mask_update_op,
+                                 no_update_op)
+
+  def add_pruning_summaries(self):
+    """Adds summaries for this pruning spec.
+
+    Args: none
+
+    Returns: none
+    """
+    with ops.name_scope(self._spec.name + '_summaries'):
+      summary.scalar('sparsity', self._sparsity)
+      summary.scalar('last_mask_update_step', self._last_update_step)
+      masks = get_masks()
+      thresholds = get_thresholds()
+      for index, mask in enumerate(masks):
+        if not self._exists_in_do_not_prune_list(mask.name):
+          summary.scalar(mask.name + '/sparsity', nn_impl.zero_fraction(mask))
+          summary.scalar(thresholds[index].op.name + '/threshold',
+                         thresholds[index])
+
+  def print_hparams(self):
+    logging.info(self._spec.to_json())
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..34b4584f494d1efecbe88d7e417e2f19de7a32e4
--- /dev/null
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -0,0 +1,162 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the key functions in pruning library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.model_pruning.python import pruning
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import training_util
+
+
+class PruningHParamsTest(test.TestCase):
+  PARAM_LIST = [
+      "name=test", "threshold_decay=0.9", "pruning_frequency=10",
+      "do_not_prune=[conv1,conv2]", "sparsity_function_end_step=100",
+      "target_sparsity=0.9"
+  ]
+  TEST_HPARAMS = ",".join(PARAM_LIST)
+
+  def setUp(self):
+    super(PruningHParamsTest, self).setUp()
+    # Add global step variable to the graph
+    self.global_step = training_util.get_or_create_global_step()
+    # Add sparsity
+    self.sparsity = variables.Variable(0.5, name="sparsity")
+    # Parse hparams
+    self.pruning_hparams = pruning.get_pruning_hparams().parse(
+        self.TEST_HPARAMS)
+
+  def testInit(self):
+    p = pruning.Pruning(self.pruning_hparams)
+    self.assertEqual(p._spec.name, "test")
+    self.assertAlmostEqual(p._spec.threshold_decay, 0.9)
+    self.assertEqual(p._spec.pruning_frequency, 10)
+    self.assertAllEqual(p._spec.do_not_prune, ["conv1", "conv2"])
+    self.assertEqual(p._spec.sparsity_function_end_step, 100)
+    self.assertAlmostEqual(p._spec.target_sparsity, 0.9)
+
+  def testInitWithExternalSparsity(self):
+    with self.test_session():
+      p = pruning.Pruning(spec=self.pruning_hparams, sparsity=self.sparsity)
+      variables.global_variables_initializer().run()
+      sparsity = p._sparsity.eval()
+      self.assertAlmostEqual(sparsity, 0.5)
+
+  def testInitWithVariableReuse(self):
+    with self.test_session():
+      p = pruning.Pruning(spec=self.pruning_hparams, sparsity=self.sparsity)
+      p_copy = pruning.Pruning(
+          spec=self.pruning_hparams, sparsity=self.sparsity)
+      variables.global_variables_initializer().run()
+      sparsity = p._sparsity.eval()
+      self.assertAlmostEqual(sparsity, 0.5)
+      self.assertEqual(p._sparsity.eval(), p_copy._sparsity.eval())
+
+
+class PruningTest(test.TestCase):
+
+  def setUp(self):
+    super(PruningTest, self).setUp()
+    self.global_step = training_util.get_or_create_global_step()
+
+  def testCreateMask2D(self):
+    width = 10
+    height = 20
+    with self.test_session():
+      weights = variables.Variable(
+          random_ops.random_normal([width, height], stddev=1), name="weights")
+      masked_weights = pruning.apply_mask(weights,
+                                          variable_scope.get_variable_scope())
+      variables.global_variables_initializer().run()
+      weights_val = weights.eval()
+      masked_weights_val = masked_weights.eval()
+      self.assertAllEqual(weights_val, masked_weights_val)
+
+  def testUpdateSingleMask(self):
+    with self.test_session() as session:
+      weights = variables.Variable(
+          math_ops.linspace(1.0, 100.0, 100), name="weights")
+      masked_weights = pruning.apply_mask(weights)
+      sparsity = variables.Variable(0.5, name="sparsity")
+      p = pruning.Pruning(sparsity=sparsity)
+      p._spec.threshold_decay = 0.0
+      mask_update_op = p.mask_update_op()
+      variables.global_variables_initializer().run()
+      masked_weights_val = masked_weights.eval()
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
+      session.run(mask_update_op)
+      masked_weights_val = masked_weights.eval()
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+
+  def testPartitionedVariableMasking(self):
+    partitioner = partitioned_variables.variable_axis_size_partitioner(40)
+    with self.test_session() as session:
+      with variable_scope.variable_scope("", partitioner=partitioner):
+        sparsity = variables.Variable(0.5, name="Sparsity")
+        weights = variable_scope.get_variable(
+            "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
+        masked_weights = pruning.apply_mask(
+            weights, scope=variable_scope.get_variable_scope())
+      p = pruning.Pruning(sparsity=sparsity)
+      p._spec.threshold_decay = 0.0
+      mask_update_op = p.mask_update_op()
+      variables.global_variables_initializer().run()
+      masked_weights_val = masked_weights.eval()
+      session.run(mask_update_op)
+      masked_weights_val = masked_weights.eval()
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
+
+  def testConditionalMaskUpdate(self):
+    param_list = [
+        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
+    ]
+    test_spec = ",".join(param_list)
+    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
+    weights = variables.Variable(
+        math_ops.linspace(1.0, 100.0, 100), name="weights")
+    masked_weights = pruning.apply_mask(weights)
+    sparsity = variables.Variable(0.00, name="sparsity")
+    # Set up pruning
+    p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
+    p._spec.threshold_decay = 0.0
+    mask_update_op = p.conditional_mask_update_op()
+    sparsity_val = math_ops.linspace(0.0, 0.9, 10)
+    increment_global_step = state_ops.assign_add(self.global_step, 1)
+    non_zero_count = []
+    with self.test_session() as session:
+      variables.global_variables_initializer().run()
+      for i in range(10):
+        session.run(state_ops.assign(sparsity, sparsity_val[i]))
+        session.run(mask_update_op)
+        session.run(increment_global_step)
+        non_zero_count.append(np.count_nonzero(masked_weights.eval()))
+    # Weights pruned at steps 0,2,4,and,6
+    expected_non_zero_count = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40]
+    self.assertAllEqual(expected_non_zero_count, non_zero_count)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/mpi/BUILD b/tensorflow/contrib/mpi/BUILD
index 20ceef5004afdafacf5fd29b990f6644ae6d4ed2..d9d55faf50b7f5043bfd0ed3b3d9ca5c404c7627 100644
--- a/tensorflow/contrib/mpi/BUILD
+++ b/tensorflow/contrib/mpi/BUILD
@@ -72,6 +72,7 @@ cc_library(
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
         "//tensorflow/core/distributed_runtime:session_mgr",
+        "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//third_party/mpi",
     ],
diff --git a/tensorflow/contrib/mpi/README.md b/tensorflow/contrib/mpi/README.md
index b0d03d05a26312273ae65415547d498ca866638c..75cb8230483a7648e771904c7087e2848929d2b4 100644
--- a/tensorflow/contrib/mpi/README.md
+++ b/tensorflow/contrib/mpi/README.md
@@ -23,7 +23,7 @@ The following environment variables can be set to modify the behavior at runtime
 
 **MPI_DISABLED=[0,1]**
 
-This environment variable allows you to disable the MPI path before launch (e.g. for performance or correctness testing). 
+This environment variable allows you to disable the MPI path before launch (e.g. for performance or correctness testing).
 
 **MPI_OPTIMAL_PATH=[0,1]**
 
@@ -34,10 +34,10 @@ This path is disabled by default as it requires that the MPI library can directl
 
 ## Known problems
 
-For certain complex neural nets the implementation sometimes crashes inside the MPI libraries. This seems to be related to memory allocations/routines that register the memory for the Infiniband transfers. (The crashes do not happen when all MPI processes are within the same physical machine). 
+For certain complex neural nets the implementation sometimes crashes inside the MPI libraries. This seems to be related to memory allocations/routines that register the memory for the Infiniband transfers. (The crashes do not happen when all MPI processes are within the same physical machine).
 
 **MVAPICH**
-- The problem manifests itself with a segmentation fault inside a memory copy routine and during startup you will get the following warning: "WARNING: Error in initializing MVAPICH2 ptmalloc library. Continuing without InfiniBand registration cache support." 
+- The problem manifests itself with a segmentation fault inside a memory copy routine and during startup you will get the following warning: "WARNING: Error in initializing MVAPICH2 ptmalloc library. Continuing without InfiniBand registration cache support."
 
 **OpenMPI**
 - With OpenMPI corrupt data will be received resulting in an assertion or the MPI library will print an error and exit. The error is "Attempt to free memory that is still in use by an ongoing MPI communication.  MPI job will now abort."
@@ -58,11 +58,11 @@ Once a request has arrived from a remote process the request is forwarded to the
 * Receive tensor request
 The MPI thread will check if there are any incoming tensor request messages on the communication lines using MPI_Iprobe. Once a request has been received it will be passed on to the standard TensorFlow code and eventually will be placed on the sendQueue.
 
-* Receive tensor 
+* Receive tensor
 At some point after a request has been sent the remote process will transmit the tensor. This tensor will be received and we look-up the callback that is associated with this tensor in our request table and execute the callback on the received data.
 
 
-In the implementation all send operations are non-blocking, all probe operations are non-blocking and all receive-operations are blocking. The receive-operations are only executed after the probe has determined that there is something to receive. 
+In the implementation all send operations are non-blocking, all probe operations are non-blocking and all receive-operations are blocking. The receive-operations are only executed after the probe has determined that there is something to receive.
 The MPI processes identify each other using an MPI process ID. The TensorFlow gRPC processes identify each other using a name. During launch we create a mapping between the TensorFlow process name and the MPI process ID to allow the processes to communicate with the correct destinations when using MPI operations.
 
 
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index ed9fb64b954cc3dfec06936b479226a7def90008..df9dbb457ace32ab804f7fc736a23f5b08bd077a 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -48,8 +48,8 @@ tf_cuda_cc_test(
     # Disabled on jenkins until errors finding nvmlShutdown are found.
     tags = [
         "manual",
+        "multi_gpu",
         "no_oss",
-        "noguitar",  # note: is run manually there
         "notap",
     ],
     deps = if_cuda(
@@ -138,8 +138,8 @@ cuda_py_test(
     # Disabled on jenkins until errors finding nvmlShutdown are found.
     tags = [
         "manual",
+        "multi_gpu",
         "no_oss",
-        "noguitar",  # note: is run manually there
         "notap",
     ],
 )
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 1eb1481675e08ffc6c952fe4811785ac94f6b0b4..913935b38246f1c5c0f7da4c1ea1f986bc00891b 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -258,9 +258,37 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
     devices[i] = collective->participants[i]->gpu_device_id;
   }
 
+  int device_count = num_devices;
+#if NCCL_MAJOR >= 2
+  // NCCL2 prevents InitAll for more communicators than devices (but doesn't
+  // check that device ids are unique). Work around it by initializing each
+  // rank individually.
+  cudaGetDeviceCount(&device_count);
+#endif
   std::vector<ncclComm_t> nccl_comms(num_devices);
-  auto result = ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
-  CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+  if (num_devices <= device_count) {
+    auto result =
+        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
+    CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+  } else {
+    int savedDevice = 0;
+    CHECK_EQ(cudaGetDevice(&savedDevice), cudaSuccess);
+    ncclUniqueId commId;
+    ncclGetUniqueId(&commId);
+#if NCCL_MAJOR >= 2
+    CHECK_EQ(ncclGroupStart(), ncclSuccess);
+#endif
+    for (int rank = 0; rank < num_devices; ++rank) {
+      cudaSetDevice(devices[rank]);
+      auto result =
+          ncclCommInitRank(nccl_comms.data() + rank, num_devices, commId, rank);
+      CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+    }
+#if NCCL_MAJOR >= 2
+    CHECK_EQ(ncclGroupEnd(), ncclSuccess);
+#endif
+    cudaSetDevice(savedDevice);
+  }
   for (int rank = 0; rank < num_devices; ++rank) {
     members[rank].nccl_comm = nccl_comms[rank];
   }
@@ -370,7 +398,7 @@ void NcclManager::AddParticipant(int num_devices, const string& key,
 }
 
 void NcclManager::RunCollective(const string& key, Collective* collective) {
-  static mutex collective_mu;
+  static mutex collective_mu(LINKER_INITIALIZED);
 
   auto* communicator = GetCommunicator(collective);
   collective->communicator = communicator;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 505c4b0d71028c64b5075cff7ea010597b4263b3..abafe4b40756ee9742b2fd98dfb4fdb3ac4c8218 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -30,6 +30,8 @@ namespace tensorflow {
 static std::vector<BaseGPUDevice*> GetGPUDevices() {
   std::vector<Device*> devices;
   SessionOptions session_options;
+  session_options.config.mutable_gpu_options()
+      ->set_per_process_gpu_memory_fraction(0.1);
   session_options.env = Env::Default();
   Status s = DeviceFactory::GetFactory(DEVICE_GPU)
                  ->AddDevices(session_options, "", &devices);
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index 0b13e3595e36b609468f459d9179f8e9f5c1e055..bad0abd44cc507c6ebbe4481f80b8cafd8480322 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -72,14 +72,15 @@ class NcclTestCase(test.TestCase):
           two.
       device_sets: Tuple of virtual devices to run test on.
     """
-    if not test.is_gpu_available():
-      return  # Test requires access to a GPU
-
     for dtype in [np.float32, np.int32, np.int64, np.float64]:
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
 
+        # Check GPU availability *after* creating test session, see b/68975239.
+        if not test.is_gpu_available():
+          return  # Test requires access to a GPU
+
         for devices in device_sets:
           shape = (3, 4)
           random = (np.random.random_sample(shape) - .5) * 1024
diff --git a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
index 62ee6630ac613c80a56d4e854cf7af4ae19f6faa..2b412fac9a621f01bd21c6b4391da3c462dd78b3 100644
--- a/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
+++ b/tensorflow/contrib/nearest_neighbor/kernels/hyperplane_lsh_probes.cc
@@ -45,16 +45,16 @@ class HyperplaneLSHProbesOp : public OpKernel {
     const Tensor& products_tensor = context->input(0);
     OP_REQUIRES(context, products_tensor.dims() == 2,
                 InvalidArgument("Need a two-dimensional products tensor, got ",
-                                products_tensor.dims(), " dimensions."))
+                                products_tensor.dims(), " dimensions."));
 
     const Tensor& num_tables_tensor = context->input(1);
     OP_REQUIRES(context, num_tables_tensor.dims() == 0,
                 InvalidArgument("Need a scalar num_tables tensor, got ",
-                                num_tables_tensor.dims(), " dimensions."))
+                                num_tables_tensor.dims(), " dimensions."));
     int num_tables = num_tables_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_tables >= 1,
                 InvalidArgument("num_tables must be at least 1 but got ",
-                                num_tables, "."))
+                                num_tables, "."));
     OP_REQUIRES(context, num_tables <= 1000,
                 InvalidArgument("Need num_tables <= 1000, got ", num_tables,
                                 ". This is mostly to protect against incorrect "
@@ -66,12 +66,13 @@ class HyperplaneLSHProbesOp : public OpKernel {
                 InvalidArgument("Need a scalar num_hyperplanes_per_table "
                                 "tensor, got ",
                                 num_hyperplanes_per_table_tensor.dims(),
-                                " dimensions."))
+                                " dimensions."));
     int num_hyperplanes_per_table =
         num_hyperplanes_per_table_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_hyperplanes_per_table >= 1,
                 InvalidArgument("num_hyperplanes_per_table must be at least 1 "
-                                "but got ", num_hyperplanes_per_table, "."))
+                                "but got ",
+                                num_hyperplanes_per_table, "."));
     OP_REQUIRES(context, num_hyperplanes_per_table <= 30,
                 InvalidArgument("Need num_hyperplanes_per_table <= 30, got ",
                                 num_hyperplanes_per_table, ". "
@@ -81,10 +82,10 @@ class HyperplaneLSHProbesOp : public OpKernel {
     const Tensor& num_probes_tensor = context->input(3);
     OP_REQUIRES(context, num_probes_tensor.dims() == 0,
                 InvalidArgument("Need a scalar num_probes tensor, got ",
-                                num_probes_tensor.dims(), " dimensions."))
+                                num_probes_tensor.dims(), " dimensions."));
     int num_probes = num_probes_tensor.scalar<int32>()();
     OP_REQUIRES(context, num_probes >= 1,
-                InvalidArgument("num_probes must be at least 1."))
+                InvalidArgument("num_probes must be at least 1."));
 
     int expected_num_hyperplanes = num_tables * num_hyperplanes_per_table;
     OP_REQUIRES(
diff --git a/tensorflow/contrib/nn/__init__.py b/tensorflow/contrib/nn/__init__.py
index 3bf795d19aad73ec37c0485fe1900a7d8ac43137..96d60e149809aff6fcb7eff77edc23737db177e8 100644
--- a/tensorflow/contrib/nn/__init__.py
+++ b/tensorflow/contrib/nn/__init__.py
@@ -15,11 +15,13 @@
 """Module for variants of ops in tf.nn.
 
 @@alpha_dropout
+@@conv1d_transpose
 @@deprecated_flipped_softmax_cross_entropy_with_logits
 @@deprecated_flipped_sparse_softmax_cross_entropy_with_logits
 @@deprecated_flipped_sigmoid_cross_entropy_with_logits
 @@nth_element
 @@rank_sampled_softmax_loss
+@@sampled_sparse_softmax_loss
 @@scaled_softplus
 """
 
@@ -32,6 +34,7 @@ from tensorflow.contrib.nn.python.ops.alpha_dropout import *
 from tensorflow.contrib.nn.python.ops.cross_entropy import *
 from tensorflow.contrib.nn.python.ops.sampling_ops import *
 from tensorflow.contrib.nn.python.ops.scaled_softplus import *
+from tensorflow.python.ops.nn_ops import conv1d_transpose
 from tensorflow.python.ops.nn_ops import nth_element
 # pylint: enable=unused-import,wildcard-import
 
diff --git a/tensorflow/contrib/nn/python/ops/cross_entropy.py b/tensorflow/contrib/nn/python/ops/cross_entropy.py
index 61c1d1c6d9cbd04faa8736ee0daba9073a0887bc..5045f2c957feb77cc91b9c10c9e96a6f336be00a 100644
--- a/tensorflow/contrib/nn/python/ops/cross_entropy.py
+++ b/tensorflow/contrib/nn/python/ops/cross_entropy.py
@@ -116,7 +116,7 @@ def deprecated_flipped_sparse_softmax_cross_entropy_with_logits(logits,
 
   Raises:
     ValueError: If logits are scalars (need to have rank >= 1) or if the rank
-      of the labels is not equal to the rank of the labels minus one.
+      of the labels is not equal to the rank of the logits minus one.
   """
   return nn.sparse_softmax_cross_entropy_with_logits(
       labels=labels, logits=logits, name=name)
diff --git a/tensorflow/contrib/nn/python/ops/sampling_ops.py b/tensorflow/contrib/nn/python/ops/sampling_ops.py
index 2ae529e0155f5ad9b40391c2f728c5c594e72dc9..63fc487dca69a4777821595a0366d0ae0b393ce2 100644
--- a/tensorflow/contrib/nn/python/ops/sampling_ops.py
+++ b/tensorflow/contrib/nn/python/ops/sampling_ops.py
@@ -24,6 +24,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
 
 
 def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
@@ -34,7 +36,7 @@ def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
 
       log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))
 
-  where w_i, b_i are the weight and bias of the i-th class, repsectively,
+  where w_i, b_i are the weight and bias of the i-th class, respectively,
   and j ranges over the rows of `inputs`. For efficiency, we rearrange the
   computation to
 
@@ -240,3 +242,101 @@ def rank_sampled_softmax_loss(weights,
         remove_accidental_hits=remove_accidental_hits,
         partition_strategy=partition_strategy,
         name=name)
+
+
+def sampled_sparse_softmax_loss(weights,
+                                biases,
+                                labels,
+                                inputs,
+                                num_sampled,
+                                num_classes,
+                                sampled_values=None,
+                                remove_accidental_hits=True,
+                                partition_strategy="mod",
+                                name="sampled_sparse_softmax_loss"):
+  """Computes and returns the sampled sparse softmax training loss.
+
+  This is a faster way to train a softmax classifier over a huge number of
+  classes.
+
+  This operation is for training only.  It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  softmax loss for evaluation or inference. In this case, you must set
+  `partition_strategy="div"` for the two losses to be consistent, as in the
+  following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.sampled_sparse_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...,
+        partition_strategy="div")
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=tf.squeeze(labels),
+        logits=logits)
+  ```
+
+  See our [Candidate Sampling Algorithms Reference]
+  (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+        objects whose concatenation along dimension 0 has shape
+        [num_classes, dim].  The (possibly-sharded) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, 1]`.
+        The index of the single target class for each row of logits.  Note that
+        this format differs from the `labels` argument of
+        `nn.sparse_softmax_cross_entropy_with_logits`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
+        activations of the input network.
+    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_classes: An `int`. The number of possible classes.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+        (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
+        where a sampled class equals one of the target classes.  Default is
+        True.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
+        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  """
+  logits, _ = nn_impl._compute_sampled_logits(
+      weights=weights,
+      biases=biases,
+      labels=labels,
+      inputs=inputs,
+      num_sampled=num_sampled,
+      num_classes=num_classes,
+      num_true=1,
+      sampled_values=sampled_values,
+      subtract_log_q=True,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy=partition_strategy,
+      name=name)
+
+  # There is only one true label. _compute_sampled_logits puts the true logit
+  # at index 0.
+  labels = array_ops.zeros([array_ops.shape(logits)[0], 1], dtype=dtypes.int64)
+
+  sampled_losses = nn_ops.sparse_softmax_cross_entropy_with_logits(
+      labels=array_ops.squeeze(labels), logits=logits)
+  # sampled_losses is a [batch_size] tensor.
+  return sampled_losses
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index 096d2270e4c2d046a8dc8982bf03a648a195c667..9c961f2b9c828f7406516860b7e3fd3dc343d993 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,11 +14,16 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/addsign.py",
         "python/training/drop_stale_gradient_optimizer.py",
+        "python/training/elastic_average_optimizer.py",
         "python/training/external_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/moving_average_optimizer.py",
+        "python/training/multitask_optimizer_wrapper.py",
         "python/training/nadam_optimizer.py",
+        "python/training/powersign.py",
+        "python/training/sign_decay.py",
         "python/training/variable_clipping_optimizer.py",
     ],
     srcs_version = "PY2AND3",
@@ -76,22 +81,39 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "variable_clipping_optimizer_test",
     srcs = ["python/training/variable_clipping_optimizer_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":opt_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:session",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+    grpc_enabled = True,
     tags = [
         "manual",  # Flaky: b/29892493
         "notap",  # data race due to b/62910646
     ],
+)
+
+py_test(
+    name = "multitask_optimizer_wrapper_test",
+    srcs = ["python/training/multitask_optimizer_wrapper_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
+        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:session",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -147,11 +169,78 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Flaky due to port collisions
     ],
 )
 
+tf_py_test(
+    name = "elastic_average_optimizer_test",
+    srcs = ["python/training/elastic_average_optimizer_test.py"],
+    additional_deps = [
+        ":opt_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "sign_decay_test",
+    srcs = ["python/training/sign_decay_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_test(
+    name = "addsign_test",
+    srcs = ["python/training/addsign_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "powersign_test",
+    srcs = ["python/training/powersign_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index e194fa2d4d207bfd94aca67e4e00e9ce1902370c..2025e8b4fca4ff865497671fcb8ec52933640c3a 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+    # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,23 +19,36 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.drop_stale_gradient_optimizer import *
 from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
-from tensorflow.contrib.opt.python.training.nadam_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
+from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
 from tensorflow.contrib.opt.python.training.nadam_optimizer import *
+from tensorflow.contrib.opt.python.training.powersign import *
 from tensorflow.contrib.opt.python.training.variable_clipping_optimizer import *
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 
 _allowed_symbols = [
+    'PowerSignOptimizer',
+    'AddSignOptimizer'
     'DelayCompensatedGradientDescentOptimizer',
-    'DropStaleGradientOptimizer', 'ExternalOptimizerInterface',
-    'LazyAdamOptimizer', 'NadamOptimizer', 'MovingAverageOptimizer',
-    'ScipyOptimizerInterface', 'VariableClippingOptimizer'
+    'DropStaleGradientOptimizer',
+    'ExternalOptimizerInterface',
+    'LazyAdamOptimizer',
+    'NadamOptimizer',
+    'MovingAverageOptimizer',
+    'ScipyOptimizerInterface',
+    'VariableClippingOptimizer',
+    'MultitaskOptimizerWrapper',
+    'clip_gradients_by_global_norm',
+    'ElasticAverageOptimizer', 
+    'ElasticAverageCustomGetter'
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/opt/python/training/addsign.py b/tensorflow/contrib/opt/python/training/addsign.py
new file mode 100644
index 0000000000000000000000000000000000000000..729e59cb0aab97e6cd657571647fc45a44ae0ab1
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/addsign.py
@@ -0,0 +1,169 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of AddSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class AddSignOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the AddSign update.
+
+  See  Neural Optimizer Search with Reinforcement Learning
+  [Bello et al., ICML2017].
+  """
+
+  def __init__(self,
+               learning_rate=0.1,
+               alpha=1.0,
+               beta=0.9,
+               sign_decay_fn=None,
+               use_locking=False,
+               name='AddSignOptimizer'):
+    """Constructs a new AddSignOptimizer object.
+
+    Initialization:
+
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    t <- 0 (Initialize timestep)
+    ```
+
+    Update:
+
+    ```
+    t <- t + 1
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    sign_decay <- sign_decay(t)
+    update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+    variable <- variable - lr_t * update
+    ```
+
+    Example for AddSign-ld (AddSign with linear sign decay)
+    ```
+    decay_steps = 1000
+    linear_decay_fn = sign_decays.get_linear_decay_fn(decay_steps)
+    opt = AddSignOptimizer(learning_rate=0.1, sign_decay_fn=linear_decay_fn)
+    ```
+
+    Args:
+      learning_rate: learning_rate used when taking a step.
+      alpha: alpha used in optimizer.
+      beta: decay used for computing the moving average m.
+      sign_decay_fn: decay function applied to the sign(g*m) quantity.
+          Takes global_step as an argument and returns the quantity to multiply
+          the sign(g*m) by.
+        compute (1.0 + alpha * decay * sign(g) * sign(m)) * m.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "AddSignOptimizer".
+    """
+    super(AddSignOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._alpha = alpha
+    self._beta = beta
+
+    self._sign_decay_fn = sign_decay_fn
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._alpha_t = None
+    self._beta_t = None
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    if self._sign_decay_fn is not None:
+      self._sign_decay_t = ops.convert_to_tensor(
+          self._sign_decay_fn(global_step), name='sign_decay')
+    return super(AddSignOptimizer, self).apply_gradients(
+        grads_and_vars, global_step=global_step, name=name)
+
+  def _create_slots(self, var_list):
+    # Create slots for the first moment.
+    for v in var_list:
+      self._zeros_slot(v, 'm', self._name)
+
+  def _prepare(self):
+    self._lr_t = ops.convert_to_tensor(self._lr, name='learning_rate')
+    self._beta_t = ops.convert_to_tensor(self._beta, name='beta')
+    self._alpha_t = ops.convert_to_tensor(self._alpha, name='alpha')
+    if self._sign_decay_fn is None:
+      self._sign_decay_t = ops.convert_to_tensor(1.0, name='sign_decay')
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.apply_add_sign(
+        var,
+        m,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._alpha_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.resource_apply_add_sign(
+        var.handle,
+        m.handle,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._alpha_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var):
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
+    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
+
+    m = self.get_slot(var, 'm')
+    m_t = state_ops.assign(
+        m, (m * beta_t) + (grad * (1 - beta_t)), use_locking=self._use_locking)
+
+    sign_g = ops.IndexedSlices(
+        math_ops.sign(grad.values), grad.indices, dense_shape=grad.dense_shape)
+    sign_gm = ops.IndexedSlices(
+        array_ops.gather(math_ops.sign(m_t), sign_g.indices) * sign_g.values,
+        sign_g.indices,
+        dense_shape=sign_g.dense_shape)
+
+    sign_decayed = math_ops.cast(
+        self._sign_decay_t, var.dtype.base_dtype)
+    multiplier_values = alpha_t + sign_decayed * sign_gm.values
+    multiplier = ops.IndexedSlices(
+        multiplier_values, sign_gm.indices, dense_shape=sign_gm.dense_shape)
+
+    final_update = ops.IndexedSlices(
+        lr_t * multiplier.values * grad.values,
+        multiplier.indices,
+        dense_shape=multiplier.dense_shape)
+
+    var_update = state_ops.scatter_sub(
+        var,
+        final_update.indices,
+        final_update.values,
+        use_locking=self._use_locking)
+
+    return control_flow_ops.group(* [var_update, m_t])
diff --git a/tensorflow/contrib/opt/python/training/addsign_test.py b/tensorflow/contrib/opt/python/training/addsign_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd19ee3e7ac514448c6d79272abb86a154f55e9a
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/addsign_test.py
@@ -0,0 +1,262 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AddSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import addsign
+from tensorflow.contrib.opt.python.training import sign_decay
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def py_linear_decay_fn(decay_steps):
+  def linear_decay(step):
+    step = min(step, decay_steps)
+    return float(decay_steps - step) / decay_steps
+  return linear_decay
+
+
+def addsign_update_numpy(params,
+                         g_t,
+                         m,
+                         lr,
+                         alpha=1.0,
+                         beta=0.9,
+                         py_sign_decay_fn=None,
+                         t=None):
+  m_t = beta * m + (1 - beta) * g_t
+  if py_sign_decay_fn is None:
+    sign_decayed = 1.0
+  else:
+    sign_decayed = py_sign_decay_fn(t-1)
+  multiplier = alpha + sign_decayed * np.sign(g_t) * np.sign(m_t)
+  params_t = params - lr * multiplier * g_t
+  return params_t, m_t
+
+
+class AddSignTest(test.TestCase):
+
+  def _testDense(self,
+                 use_resource=False,
+                 learning_rate=0.1,
+                 sign_decay_fn=None,
+                 py_sign_decay_fn=None,
+                 alpha=1.0,
+                 beta=0.9):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = addsign.AddSignOptimizer(
+            learning_rate=learning_rate,
+            alpha=alpha,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+        if context.in_graph_mode():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 7 steps of AddSign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 8):
+          if t < 5:
+            if context.in_graph_mode():
+              self.evaluate(update)
+            elif t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+          else:
+            if context.in_graph_mode():
+              self.evaluate(neg_update)
+            elif t > 1:
+              opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                  global_step=global_step)
+
+          var0_np, m0 = addsign_update_numpy(
+              var0_np,
+              grads0_np if t < 5 else -grads0_np,
+              m0,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = addsign_update_numpy(
+              var1_np,
+              grads1_np if t < 5 else -grads1_np,
+              m1,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testDense(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testDense(use_resource=False)
+    self._testDense(use_resource=False, learning_rate=0.01, alpha=0.1, beta=0.8)
+    self._testDense(use_resource=False,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+    self._testDense(use_resource=True)
+    self._testDense(use_resource=True, learning_rate=0.01, alpha=0.1, beta=0.8)
+    self._testDense(use_resource=True,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+  def _testSparse(self,
+                  use_resource=False,
+                  learning_rate=0.1,
+                  sign_decay_fn=None,
+                  py_sign_decay_fn=None,
+                  alpha=1.0,
+                  beta=0.9):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = addsign.AddSignOptimizer(
+            learning_rate=learning_rate,
+            alpha=alpha,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 7 steps of AddSign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 4):
+          if t < 5:
+            update.run()
+          else:
+            neg_update.run()
+
+          var0_np, m0 = addsign_update_numpy(
+              var0_np,
+              grads0_np,
+              m0,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = addsign_update_numpy(
+              var1_np,
+              grads1_np,
+              m1,
+              learning_rate,
+              alpha=alpha,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testSparse(use_resource=False)
+    self._testSparse(use_resource=False,
+                     learning_rate=0.01,
+                     alpha=0.1,
+                     beta=0.8)
+    self._testSparse(use_resource=False,
+                     sign_decay_fn=sign_decay_fn,
+                     py_sign_decay_fn=py_sign_decay_fn)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9941f22b1f04c0d4ef5176553288f6ce93f694f4
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -0,0 +1,345 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper optimizer for Elastic Average SGD """
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import constant_op
+
+LOCAL_VARIABLE_NAME = 'local_center_variable'
+GLOBAL_VARIABLE_NAME = 'global_center_variable'
+
+
+class ElasticAverageCustomGetter(object):
+  """Custom_getter class is used to do:
+  1. Change trainable variables to local collection and place them at worker
+    device
+  2. Generate global variables(global center variables)
+  3. Generate local variables(local center variables) which record the global
+    variables and place them at worker device
+    Notice that the class should be used with tf.replica_device_setter,
+    so that the global center variables and global step variable can be placed
+    at ps device. Besides, use 'tf.get_variable' instead of 'tf.Variable' to
+    use this custom getter.
+
+  For example,
+  ea_custom_getter = ElasticAverageCustomGetter(worker_device)
+  with tf.device(
+    tf.train.replica_device_setter(
+      worker_device=worker_device,
+      ps_device="/job:ps/cpu:0",
+      cluster=cluster)),
+    tf.variable_scope('',custom_getter=ea_custom_getter):
+    hid_w = tf.get_variable(
+      initializer=tf.truncated_normal(
+          [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
+          stddev=1.0 / IMAGE_PIXELS),
+      name="hid_w")
+    hid_b = tf.get_variable(initializer=tf.zeros([FLAGS.hidden_units]),
+                            name="hid_b")
+  """
+
+  def __init__(self, worker_device):
+    """Create a new `ElasticAverageCustomGetter`.
+
+    Args:
+      worker_device: String.  Name of the `worker` job.
+    """
+    self._worker_device = worker_device
+    self._local_map = {}
+    self._global_map = {}
+
+  def __call__(self, getter, name, trainable, collections, *args, **kwargs):
+    if trainable:
+      with ops.device(self._worker_device):
+        local_var = getter(name, trainable=True,
+                           collections=[ops.GraphKeys.LOCAL_VARIABLES], 
+                           *args, **kwargs)
+        
+      global_center_variable = variable_scope.variable(
+        name='%s/%s' %
+             (GLOBAL_VARIABLE_NAME,
+              name),
+        initial_value=local_var.initialized_value(),
+        trainable=False,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+
+      with ops.device(self._worker_device):
+        local_center_variable = variable_scope.variable(
+          name='%s/%s' % (LOCAL_VARIABLE_NAME, name),
+          initial_value=local_var.initialized_value(),
+          trainable=False,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES])
+        
+      self._local_map[local_var] = local_center_variable
+      self._global_map[local_var] = global_center_variable
+      return local_var
+    else:
+      return getter(name, trainable, collections, *args, **kwargs)
+
+
+class ElasticAverageOptimizer(optimizer.Optimizer):
+  """Wrapper optimizer that implements the Elastic Average SGD algorithm.
+  This is an async optimizer. During the training, Each worker will update
+  the local variables and maintains its own local_step, which starts from 0
+  and is incremented by 1 after each update of local variables. Whenever
+  the communication period divides the local step, the worker requests
+  the current global center variables and then computed the elastic difference
+  between global center variables and local variables. The elastic difference
+  then be used to update both local variables and global variables.
+  """
+
+  # Default value as paper described
+  BETA = 0.9
+
+  def __init__(
+      self,
+      opt,
+      num_worker,
+      ea_custom_getter,
+      communication_period=10,
+      moving_rate=None,
+      rho=None,
+      use_locking=True,
+      name="ElasticAverageOptimizer"):
+    """Construct a new gradient descent optimizer.
+
+    Args:
+      opt: The actual optimizer that will be used to update local variables.
+        Must be one of the Optimizer classes.
+      num_worker: The number of workers
+      ea_custom_getter: The ElasticAverageCustomGetter
+      communication_period: An int point value to controls the frequency
+        of the communication between every worker and the ps.
+      moving_rate: A floating point value to control the elastic difference.
+      rho: the amount of exploration we allow ine the model. The default
+        value is moving_rate/learning_rate
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "ElasticAverageOptimizer".
+    """
+    super(ElasticAverageOptimizer, self).__init__(use_locking, name)
+    self._opt = opt
+    self._num_worker = num_worker
+    self._period = communication_period
+    self._local_map = ea_custom_getter._local_map
+    self._global_map = ea_custom_getter._global_map
+
+    if moving_rate is None:
+      self._moving_rate = BETA / communication_period / num_worker
+    else:
+      self._moving_rate = moving_rate
+    if rho is None:
+      self._rho = self._moving_rate / self._opt._learning_rate
+    else:
+      self._rho = rho
+
+    self._local_step = variable_scope.get_variable(
+      initializer=0,
+      trainable=False,
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      name="local_step")
+    self._opt._prepare()
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=optimizer.Optimizer.GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+    
+    Add rho*elastic_difference to loss to control the exploration
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKey.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+    """
+    if not var_list:
+      var_list = variables.trainable_variables()
+      
+    elastic_difference = [math_ops.subtract(v, lv) for v, lv in zip(
+      variables.trainable_variables(),
+      [self._local_map[var] for var in var_list])]
+
+    distance_loss = self._rho * math_ops.add_n(
+                      [gen_nn_ops.l2_loss(ed) for ed in elastic_difference])
+
+    total_loss = loss + distance_loss
+    return self._opt.compute_gradients(total_loss, var_list,
+                                       gate_gradients, aggregation_method,
+                                       colocate_gradients_with_ops, grad_loss)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to global variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    apply_updates = self._opt.apply_gradients(grads_and_vars)
+    with ops.control_dependencies([apply_updates]):
+      local_update = state_ops.assign_add(
+        self._local_step, 1, name='local_step_update').op
+
+    # update global variables.
+    def _Update_global_variables():
+      local_vars = [v for g, v in grads_and_vars if g is not None]
+      global_center_vars = [self._global_map[var] for var in local_vars]
+      local_center_vars = [self._local_map[var] for var in local_vars]
+      local_center_vars_update = []
+      for lvar, var in zip(local_center_vars, global_center_vars):
+        local_center_vars_update.append(lvar.assign(var))
+      update_ops = []
+      differences = []
+      with ops.control_dependencies(local_center_vars_update):
+        for v, lv in zip(local_vars, local_center_vars):
+          with ops.device(v.device):
+            differences.append(math_ops.subtract(v, lv))
+        for lvar, diff in zip(local_vars, differences):
+          with ops.device(lvar.device):
+            update_ops.append(state_ops.assign_sub(lvar, math_ops.multiply(
+              self._moving_rate, diff)))
+        for var, diff in zip(global_center_vars, differences):
+          with ops.device(var.device):
+            update_ops.append(state_ops.assign_add(var, math_ops.multiply(
+              self._moving_rate, diff)))
+        if global_step:
+          with ops.colocate_with(global_step):
+            update_ops.append(state_ops.assign_add(global_step, 1))
+      variable_update = control_flow_ops.group(*(update_ops))
+      return variable_update
+
+    with ops.control_dependencies([local_update]):
+      condition = math_ops.equal(math_ops.mod(
+        self._local_step, self._period), 0)
+      conditional_update = control_flow_ops.cond(
+        condition, _Update_global_variables, control_flow_ops.no_op)
+    return conditional_update
+
+  def get_init_op(self, task_index):
+    """Returns the op to let all the local variables and local center
+    variables equal to the global center variables before the training begins"""
+
+    def _Add_sync_queues_and_barrier(enqueue_after_list):
+      """Adds ops to enqueu on all worker queues"""
+      sync_queues = [
+        data_flow_ops.FIFOQueue(self._num_worker, [dtypes.bool], shapes=[[]],
+                                shared_name='%s%s' % (
+                                  'variable_init_sync_queue', i)) for i in
+        range(self._num_worker)]
+      queue_ops = []
+      # For each other worker, add an entry in a queue
+      token = constant_op.constant(False)
+      with ops.control_dependencies(enqueue_after_list):
+        for i, q in enumerate(sync_queues):
+          if i == task_index:
+            queue_ops.append(control_flow_ops.no_op())
+          else:
+            queue_ops.append(q.enqueue(token))
+      queue_ops.append(
+        sync_queues[task_index].dequeue_many(len(sync_queues) - 1))
+      return control_flow_ops.group(*queue_ops)
+
+    init_ops = []
+    local_vars = variables.trainable_variables()
+    global_center_vars = [self._global_map[var] for var in local_vars]
+    local_center_vars = [self._local_map[var] for var in local_vars]
+    if not (local_vars and global_center_vars and local_center_vars):
+      raise ValueError(
+        'The lists of local_variables, global_center_variables, '
+        'local_center_variables should not be empty  ')
+    for lvar, gc_var, lc_var in zip(
+        local_vars, global_center_vars, local_center_vars):
+      init_ops.append(state_ops.assign(lvar, gc_var))
+      init_ops.append(state_ops.assign(lc_var, gc_var))
+
+    init_op = control_flow_ops.group(*(init_ops))
+    sync_queue_op = _Add_sync_queues_and_barrier([init_op])
+    return sync_queue_op
+
+  def make_session_run_hook(self, is_chief, task_index):
+    """Creates a hook to handle ElasticAverageOptimizerHook ops such as initialization."""
+    return _ElasticAverageOptimizerHook(self, is_chief, task_index)
+
+
+class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
+  def __init__(self, ea_optimizer, is_chief, task_index):
+    """Creates hook to handle ElasticAverageOptimizer initialization ops.
+
+    Args:
+      ea_optimizer: `ElasticAverageOptimizer` which this hook will initialize.
+      is_chief: `Bool`, whether is this a chief replica or not.
+    """
+    self._ea_optimizer = ea_optimizer
+    self._is_chief = is_chief
+    self._task_index = task_index
+
+  def begin(self):
+    self._local_init_op = variables.local_variables_initializer()
+    self._global_init_op = None
+    if self._is_chief:
+      self._global_init_op = variables.global_variables_initializer()
+    self._variable_init_op = self._ea_optimizer.get_init_op(self._task_index)
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e55fceee52fa8fde8aeafba5dfa98057cfc0ab
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer_test.py
@@ -0,0 +1,225 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ElasticAverageOptimizer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import portpicker
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import server_lib
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_setter
+
+from tensorflow.contrib.opt.python.training.elastic_average_optimizer import \
+  ElasticAverageOptimizer, ElasticAverageCustomGetter, GLOBAL_VARIABLE_NAME
+
+
+def create_local_cluster(num_workers, num_ps, protocol="grpc"):
+  """Create local GRPC servers and return them."""
+  worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)]
+  ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)]
+  cluster_dict = {
+    "worker": ["localhost:%s" % port for port in worker_ports],
+    "ps": ["localhost:%s" % port for port in ps_ports]
+  }
+  cs = server_lib.ClusterSpec(cluster_dict)
+
+  workers = [
+    server_lib.Server(
+      cs, job_name="worker", protocol=protocol, task_index=ix, start=True)
+    for ix in range(num_workers)
+  ]
+  ps_servers = [
+    server_lib.Server(
+      cs, job_name="ps", protocol=protocol, task_index=ix, start=True)
+    for ix in range(num_ps)
+  ]
+
+  return cluster_dict, workers, ps_servers
+
+
+# Creates the workers and return their sessions, graphs, train_ops.
+# Cheif worker will update at last
+def _get_workers(num_workers, period, workers, moving_rate):
+  sessions = []
+  graphs = []
+  train_ops = []
+  for worker_id in range(num_workers):
+    graph = ops.Graph()
+    is_chief = (worker_id == 0)
+    with graph.as_default():
+      worker_device = "/job:worker/task:%d/cpu:0" % (worker_id)
+      ea_coustom = ElasticAverageCustomGetter(
+        worker_device=worker_device)
+      with variable_scope.variable_scope('',
+                                         custom_getter=ea_coustom), ops.device(
+        device_setter.replica_device_setter(worker_device=worker_device,
+                                            ps_device="/job:ps/task:0/cpu:0",
+                                            ps_tasks=1)):
+        global_step = variables.Variable(0, name='global_step',
+                                         trainable=False)
+        var_0 = variable_scope.get_variable(initializer=0.0, name="v0")
+        var_1 = variable_scope.get_variable(initializer=1.0, name="v1")
+
+      with ops.device("/job:worker/task:" + str(worker_id)):
+        grads_0 = constant_op.constant(-1.0)
+        grads_1 = constant_op.constant(-1.0)
+
+        sgd_opt = gradient_descent.GradientDescentOptimizer(1.0)
+        opt = ElasticAverageOptimizer(
+          opt=sgd_opt,
+          num_worker=num_workers,
+          moving_rate=moving_rate,
+          communication_period=period,
+          ea_custom_getter=ea_coustom
+        )
+        train_op = [
+          opt.apply_gradients(
+            ([grads_0, var_0],
+             [grads_1, var_1]), global_step)
+        ]
+        easgd_hook = opt.make_session_run_hook(is_chief, worker_id)
+      # Creates MonitoredSession
+      sess = training.MonitoredTrainingSession(workers[worker_id].target,
+                                               hooks=[easgd_hook])
+
+    sessions.append(sess)
+    graphs.append(graph)
+    train_ops.append(train_op)
+
+  return sessions, graphs, train_ops
+
+
+class ElasticAverageOptimizerTest(test.TestCase):
+  def _run(self, train_op, sess):
+    sess.run(train_op)
+
+  def test1Workers2Period(self):
+    num_workers = 1
+    communication_period = 2
+    num_ps = 1
+    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+                                               num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(num_workers,
+                                               communication_period,
+                                               workers, 1.0)
+
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+    global_step = training_util.get_global_step(graphs[0])
+    var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(0, sessions[0].run(global_step))
+
+    # iteration 2, global varibale update
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+    # iteration 3
+    sessions[0].run(train_ops[0])
+
+    self.assertAllEqual(1.0, sessions[0].run(var_0))
+    self.assertAllEqual(2.0, sessions[0].run(var_1))
+    self.assertAllEqual(2.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(3.0, sessions[0].run(var_1_g))
+    self.assertAllEqual(1, sessions[0].run(global_step))
+
+  def test2Worker1Period(self):
+    num_workers = 2
+    communication_period = 1
+    num_ps = 2
+    cluster, workers, _ = create_local_cluster(num_workers=num_workers,
+                                               num_ps=num_ps)
+
+    sessions, graphs, train_ops = _get_workers(num_workers,
+                                               communication_period,
+                                               workers, 0.5)
+
+    var_0 = graphs[0].get_tensor_by_name('v0:0')
+    var_1 = graphs[0].get_tensor_by_name('v1:0')
+
+    var_0_1 = graphs[1].get_tensor_by_name('v0:0')
+    var_1_1 = graphs[1].get_tensor_by_name('v1:0')
+
+    var_0_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v0:0")
+    var_1_g = graphs[0].get_tensor_by_name(GLOBAL_VARIABLE_NAME + "/v1:0")
+    # Verify the initialized value.
+    self.assertAllEqual(0.0, sessions[0].run(var_0))
+    self.assertAllEqual(1.0, sessions[0].run(var_1))
+    self.assertAllEqual(0.0, sessions[1].run(var_0_1))
+    self.assertAllEqual(1.0, sessions[1].run(var_1_1))
+    self.assertAllEqual(0.0, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.0, sessions[0].run(var_1_g))
+
+    sessions[0].run(train_ops[0])
+    sessions[1].run(train_ops[1])
+
+    self.assertAllEqual(0.5, sessions[0].run(var_0))
+    self.assertAllEqual(1.5, sessions[0].run(var_1))
+    self.assertAllEqual(0.75, sessions[0].run(var_0_g))
+    self.assertAllEqual(1.75, sessions[0].run(var_1_g))
+    self.assertAllEqual(0.75, sessions[1].run(var_0_1))
+    self.assertAllEqual(1.75, sessions[1].run(var_1_1))
+
+  def testPS2TasksWithClusterSpecClass(self):
+    cluster_spec = server_lib.ClusterSpec({
+      "ps": ["ps0:2222", "ps1:2222"],
+      "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    ea_coustom = ElasticAverageCustomGetter(
+      worker_device="/job:worker/task:0")
+    from tensorflow.python.training import device_setter
+    with ops.device(
+        device_setter.replica_device_setter(cluster=cluster_spec,
+                                            worker_device="/job:worker/task:0",
+                                            ps_device="/job:ps")), \
+         variable_scope.variable_scope('', custom_getter=ea_coustom):
+      v = variable_scope.get_variable(initializer=[1, 2], name="v")
+      w = variable_scope.get_variable(initializer=[2, 1], name='w')
+      v_g, w_g = ea_coustom._global_map[v],ea_coustom._global_map[w]
+      self.assertDeviceEqual("/job:worker/task:0", v.device)
+      self.assertDeviceEqual("job:ps/task:0", v_g.device)
+      self.assertDeviceEqual("/job:worker/task:0", w.device)
+      self.assertDeviceEqual("job:ps/task:1", w_g.device)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index c48494585eb66c40e69a87439265b9cd08d51712..d68ad23d65500cc2348459cdc53030c2ea08373a 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -86,6 +86,9 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     self._variable_map = None
     self._sequential_update = sequential_update
 
+  def compute_gradients(self, *args, **kwargs):
+    return self._optimizer.compute_gradients(*args, **kwargs)
+
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     train_op = self._optimizer.apply_gradients(
         grads_and_vars, global_step=global_step, name=name)
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
index a4ffbfe1c6bf8a63b10593e6c783047c99cad523..60929add198f2e69b5acc2eb5516dafc82b1f3ba 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer_test.py
@@ -116,6 +116,37 @@ class MovingAverageOptimizerTest(test.TestCase):
       with self.assertRaises(RuntimeError):
         _ = opt.swapping_saver([var])
 
+  def testCorrectOverride(self):
+
+    class WrapperOptimizer(gradient_descent.GradientDescentOptimizer):
+
+      def compute_gradients(self, *args, **kwargs):
+        self.compute_gradients_called = True
+        return super(WrapperOptimizer, self).compute_gradients(
+            *args, **kwargs)
+
+      def apply_gradients(self, *args, **kwargs):
+        self.apply_gradients_called = True
+        return super(WrapperOptimizer, self).apply_gradients(*args, **kwargs)
+
+    with self.test_session() as sess:
+      var = variables.Variable([1.2], name='var', dtype=dtypes.float32)
+      loss = var ** 2
+      wrapper_opt = WrapperOptimizer(learning_rate=2.0)
+      opt = moving_average_optimizer.MovingAverageOptimizer(wrapper_opt)
+      train_op = opt.minimize(loss)
+
+      # Check that both methods are called on the underlying optimizer.
+      self.assertTrue(wrapper_opt.compute_gradients_called)
+      self.assertTrue(wrapper_opt.apply_gradients_called)
+
+      # Run train_op once, and verify that we've updated the variable.
+      variables.global_variables_initializer().run()
+      sess.run(train_op)
+      var_value = sess.run(var)
+      # Started at 1.2, gradient is 2*1.2=2.4, lr=2, so should now be -3.6.
+      self.assertNear(-3.6, var_value, 1e-6)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6c77a86feedde3285d75092511c8eb1e63b2a5
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper.py
@@ -0,0 +1,140 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An optimizer wrapper for stateful optimizers with multitask loss."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import types
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import optimizer
+
+__all__ = ['MultitaskOptimizerWrapper', 'clip_gradients_by_global_norm']
+
+
+def _is_all_zeros(grad):
+  all_zeros = math_ops.equal(math_ops.count_nonzero(grad), 0)
+  return all_zeros
+
+
+def _get_wrapper(fn, opt):
+
+  def wrapper(self, grad, *args, **kwargs):  # pylint: disable=unused-argument
+    all_zeros = _is_all_zeros(grad)
+    return control_flow_ops.cond(all_zeros, control_flow_ops.no_op,
+                                 lambda: fn(grad, *args, **kwargs))
+
+  wrapper = types.MethodType(wrapper, opt)
+  return wrapper
+
+
+class MultitaskOptimizerWrapper(object):
+  """Optimizer wrapper making all-zero gradients harmless.
+
+  This might be useful when a multi-task loss is used,
+  and some components of the loss might be
+  not present (e.g. masked out) in some training batches.
+  Technically their gradient would be zero,
+  which would normally affect the optimizer state
+  (e.g. push running average to zero).
+  However this is not the desired behaviour,
+  since the missing loss component
+  should be treated as unknown rather than zero.
+
+  This wrapper filters out all-zero gradient tensors,
+  therefore preserving the optimizer state.
+
+  If gradient clipping by global norm is used,
+  the provided function clip_gradients_by_global_norm
+  should be used (and specified explicitly by the user).
+  Otherwise the global norm would be underestimated
+  because of all-zero tensors that should be ignored.
+
+  The gradient calculation and application
+  are delegated to an underlying optimizer.
+  The gradient application is altered only for all-zero tensors.
+
+  Example:
+  ```python
+  momentum_optimizer = tf.train.MomentumOptimizer(
+    learning_rate, momentum=0.9)
+  multitask_momentum_optimizer = tf.contrib.opt.MultitaskOptimizerWrapper(
+    momentum_optimizer)
+  gradvars = multitask_momentum_optimizer.compute_gradients(
+    loss)
+  gradvars_clipped, _ = tf.contrib.opt.clip_gradients_by_global_norm(
+    gradvars, 15.0)
+  train_op = multitask_momentum_optimizer.apply_gradients(
+    gradvars_clipped, global_step=batch)
+  ```
+  """
+
+  def __init__(self, opt):
+    """Constructor.
+
+    Args:
+      opt: an instance of a class that implements tf.train.Optimizer.
+    """
+    if not isinstance(opt, optimizer.Optimizer):
+      raise TypeError(
+          'Supplied optimizer must be an instance of tf.train.Optimizer')
+    self._opt = opt
+    overridden_methods = ('_apply_dense', '_resource_apply_dense',
+                          '_apply_sparse', '_resource_apply_sparse')
+    for name in overridden_methods:
+      fn = getattr(self._opt, name)
+      wrapper = _get_wrapper(fn, self._opt)
+      setattr(self._opt, name, wrapper)
+
+  def __getattr__(self, name):
+    return getattr(self._opt, name)
+
+
+def clip_gradients_by_global_norm(gradients_variables, clip_norm=20.):
+  """Clips gradients of a multitask loss by their global norm.
+
+  Ignores all-zero tensors when computing the global norm.
+
+  Args:
+    gradients_variables: a list of pairs (gradient, variable).
+    clip_norm: a float Tensor, the global norm to clip on. Default is 20.0.
+
+  Returns:
+    list: A list of pairs of the same type as gradients_variables,.
+    fixed_global_norm: A 0-D (scalar) Tensor representing the global norm.
+  """
+  gradients, variables = six.moves.zip(*gradients_variables)
+
+  def _replace_nonexisting_grad(grad):
+    if grad is None:
+      return grad
+    all_zeros = _is_all_zeros(grad)
+    return control_flow_ops.cond(
+        all_zeros,
+        lambda: array_ops.zeros([], dtype=dtypes.as_dtype(grad.dtype)),
+        lambda: grad)
+
+  nonzero_gradients = [_replace_nonexisting_grad(g) for g in gradients]
+  fixed_global_norm = clip_ops.global_norm(nonzero_gradients)
+  gradients, _ = clip_ops.clip_by_global_norm(
+      gradients, clip_norm, use_norm=fixed_global_norm)
+  return list(six.moves.zip(gradients, variables)), fixed_global_norm
diff --git a/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..618d8eb18d2e9b738d2c2f5b8e563aeffdf82988
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/multitask_optimizer_wrapper_test.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MultitaskOptimizerWrapper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.contrib.opt.python.training import multitask_optimizer_wrapper
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
+
+
+class MultitaskOptimizerWrapperTest(test.TestCase):
+  """Tests for the multitask optimizer wrapper.
+  """
+
+  def testWrapper(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtypes.float32)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtypes.float32)
+      grads_allzero = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
+      mom_opt_impl = momentum.MomentumOptimizer(learning_rate=2.0, momentum=0.9)
+      mom_opt = multitask_optimizer_wrapper.MultitaskOptimizerWrapper(
+          mom_opt_impl)
+      mom_update = mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      mom_update_partial = mom_opt.apply_gradients(
+          zip([grads_allzero, grads1], [var0, var1]))
+      mom_update_no_action = mom_opt.apply_gradients(
+          zip([grads_allzero, grads_allzero], [var0, var1]))
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      self.assertEqual(["momentum"], mom_opt.get_slot_names())
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEquals(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+      # Step 1: normal momentum update.
+      self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([0.1, 0.1]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([0.01, 0.01]), self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+
+      # Step 2: momentum update that changes only slot1 but not slot0.
+      self.evaluate(mom_update_partial)
+      # Check that only the relevant momentum accumulator has been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([0.1, 0.1]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+          self.evaluate(slot1))
+
+      # Step 3: momentum update that does not change anything.
+      self.evaluate(mom_update_no_action)
+      # Check that the momentum accumulators have *NOT* been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([0.1, 0.1]), self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+          self.evaluate(slot1))
+
+  def testGradientClipping(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0], dtype=dtypes.float32)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+      var2 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+      var3 = variables.Variable([3.0, 4.0], dtype=dtypes.float32)
+      grads0 = constant_op.constant([10.0, 15.0], dtype=dtypes.float32)
+      grads1 = constant_op.constant([0.0, 5.0], dtype=dtypes.float32)
+      grads2 = constant_op.constant([0.0, 0.0], dtype=dtypes.float32)
+      grads3 = None
+      varlist = [var0, var1, var2, var3]
+      gradients = [grads0, grads1, grads2, grads3]
+      clipped_gradvars, global_norm = (
+          multitask_optimizer_wrapper.clip_gradients_by_global_norm(
+              six.moves.zip(gradients, varlist), clip_norm=1.0))
+      clipped_grads = list(six.moves.zip(*clipped_gradvars))[0]
+      reference_global_norm = np.sqrt(np.sum(np.square([10.0, 15.0, 0.0, 5.0])))
+      self.assertAllCloseAccordingToType(
+          self.evaluate(global_norm), reference_global_norm)
+      self.assertAllCloseAccordingToType(
+          self.evaluate(clipped_grads[2]), np.array([0., 0.]))
+      self.assertEqual(clipped_grads[3], None)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/powersign.py b/tensorflow/contrib/opt/python/training/powersign.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f7521581fd685c7a65119e2bd2b4af64aafcd69
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/powersign.py
@@ -0,0 +1,173 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of PowerSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+
+
+class PowerSignOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the PowerSign update.
+
+  See  Neural Optimizer Search with Reinforcement Learning
+  [Bello et al., ICML2017].
+  """
+
+  def __init__(self,
+               learning_rate=0.1,
+               base=math.e,
+               beta=0.9,
+               sign_decay_fn=None,
+               use_locking=False,
+               name='PowerSignOptimizer'):
+    """Constructs a new PowerSignOptimizer object.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    t <- 0 (Initialize timestep)
+    ```
+
+    Update:
+
+    ```
+    t <- t + 1
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    sign_decay <- sign_decay(t)
+    update <- base ** (sign_decay * sign(g) * sign(m)) * g
+    variable <- variable - lr_t * update
+    ```
+
+    Example usage for PowerSign-cd (PowerSign with cosine sign decay)
+    ```
+    decay_steps = 1000
+    linear_decay_fn = sign_decays.get_linear_decay_fn(decay_steps)
+    opt = PowerSignOptimizer(learning_rate=0.1, sign_decay_fn=linear_decay_fn)
+    ```
+
+    Args:
+      learning_rate: learning_rate used when taking a step.
+      base: base used in optimizer.
+      beta: decay used for computing the moving average m.
+      sign_decay_fn: decay function applied to the sign(g*m) quantity.
+          Takes global_step as an argument and returns the quantity to multiply
+          the sign(g*m) by.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created iwhen applying gradients.
+        Defaults to "PowerSignOptimizer".
+    """
+    super(PowerSignOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._beta = beta
+    self._logbase = math.log(base)
+
+    self._sign_decay_fn = sign_decay_fn
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._beta_t = None
+    self._logbase_t = None
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    if self._sign_decay_fn is not None:
+      self._sign_decay_t = ops.convert_to_tensor(
+          self._sign_decay_fn(global_step), name='sign_decay')
+    return super(PowerSignOptimizer, self).apply_gradients(
+        grads_and_vars, global_step=global_step, name=name)
+
+  def _create_slots(self, var_list):
+    # Create slots for the first moment.
+    for v in var_list:
+      self._zeros_slot(v, 'm', self._name)
+
+  def _prepare(self):
+    self._lr_t = ops.convert_to_tensor(self._lr, name='learning_rate')
+    self._beta_t = ops.convert_to_tensor(self._beta, name='beta')
+    self._logbase_t = ops.convert_to_tensor(self._logbase, name='logbase')
+    if self._sign_decay_fn is None:
+      self._sign_decay_t = ops.convert_to_tensor(1.0, name='sign_decay')
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.apply_power_sign(
+        var,
+        m,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._logbase_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    return training_ops.resource_apply_power_sign(
+        var.handle,
+        m.handle,
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._logbase_t, var.dtype.base_dtype),
+        math_ops.cast(self._sign_decay_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var):
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
+    logbase_t = math_ops.cast(self._logbase_t, var.dtype.base_dtype)
+    e_t = math_ops.cast(math.e, var.dtype.base_dtype)
+
+    m = self.get_slot(var, 'm')
+    m_t = state_ops.assign(
+        m, (m * beta_t) + (grad * (1 - beta_t)), use_locking=self._use_locking)
+
+    sign_g = ops.IndexedSlices(
+        math_ops.sign(grad.values), grad.indices, dense_shape=grad.dense_shape)
+    sign_gm = ops.IndexedSlices(
+        array_ops.gather(math_ops.sign(m_t), sign_g.indices) * sign_g.values,
+        sign_g.indices,
+        dense_shape=sign_g.dense_shape)
+
+    sign_decayed = math_ops.cast(
+        self._sign_decay_t, var.dtype.base_dtype)
+    multiplier_values = math_ops.pow(
+        e_t, logbase_t * sign_decayed * sign_gm.values)
+    multiplier = ops.IndexedSlices(
+        multiplier_values, sign_gm.indices, dense_shape=sign_gm.dense_shape)
+
+    final_update = ops.IndexedSlices(
+        lr_t * multiplier.values * grad.values,
+        multiplier.indices,
+        dense_shape=multiplier.dense_shape)
+
+    var_update = state_ops.scatter_sub(
+        var,
+        final_update.indices,
+        final_update.values,
+        use_locking=self._use_locking)
+
+    return control_flow_ops.group(* [var_update, m_t])
diff --git a/tensorflow/contrib/opt/python/training/powersign_test.py b/tensorflow/contrib/opt/python/training/powersign_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7b1a72d47d8ef54980905323bcaf358c988a82
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/powersign_test.py
@@ -0,0 +1,268 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for PowerSign."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import powersign
+from tensorflow.contrib.opt.python.training import sign_decay
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def py_linear_decay_fn(decay_steps):
+  def linear_decay(step):
+    step = min(step, decay_steps)
+    return float(decay_steps - step) / decay_steps
+  return linear_decay
+
+
+def powersign_update_numpy(params,
+                           g_t,
+                           m,
+                           lr,
+                           base=math.e,
+                           beta=0.9,
+                           py_sign_decay_fn=None,
+                           t=None):
+  m_t = beta * m + (1 - beta) * g_t
+  if py_sign_decay_fn is None:
+    sign_decayed = 1.0
+  else:
+    sign_decayed = py_sign_decay_fn(t-1)
+  multiplier = base ** (sign_decayed * np.sign(g_t) * np.sign(m_t))
+  params_t = params - lr * multiplier * g_t
+  return params_t, m_t
+
+
+class PowerSignTest(test.TestCase):
+
+  def _testDense(self,
+                 use_resource=False,
+                 learning_rate=0.1,
+                 sign_decay_fn=None,
+                 py_sign_decay_fn=None,
+                 base=math.e,
+                 beta=0.9):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = powersign.PowerSignOptimizer(
+            learning_rate=learning_rate,
+            base=base,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+
+        if context.in_graph_mode():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 7 steps of powersign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 8):
+          if t < 5:
+            if context.in_graph_mode():
+              self.evaluate(update)
+            elif t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+          else:
+            if context.in_graph_mode():
+              self.evaluate(neg_update)
+            elif t > 1:
+              opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                  global_step=global_step)
+
+          var0_np, m0 = powersign_update_numpy(
+              var0_np,
+              grads0_np if t < 5 else -grads0_np,
+              m0,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = powersign_update_numpy(
+              var1_np,
+              grads1_np if t < 5 else -grads1_np,
+              m1,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testDense(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testDense(use_resource=False)
+    self._testDense(use_resource=False,
+                    learning_rate=0.1,
+                    base=10.0,
+                    beta=0.8)
+    self._testDense(use_resource=False,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+    self._testDense(use_resource=True)
+    self._testDense(use_resource=True, learning_rate=0.1, base=10.0, beta=0.8)
+    self._testDense(use_resource=True,
+                    sign_decay_fn=sign_decay_fn,
+                    py_sign_decay_fn=py_sign_decay_fn)
+
+  def _testSparse(self,
+                  use_resource=False,
+                  learning_rate=0.1,
+                  sign_decay_fn=None,
+                  py_sign_decay_fn=None,
+                  base=math.e,
+                  beta=0.9):
+    with self.test_session(use_gpu=True):
+      for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+        # Initialize variables for numpy implementation.
+        m0, m1 = 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+          global_step = resource_variable_ops.ResourceVariable(
+              0, trainable=False)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+          global_step = variables.Variable(
+              0, trainable=False)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = powersign.PowerSignOptimizer(
+            learning_rate=learning_rate,
+            base=base,
+            beta=beta,
+            sign_decay_fn=sign_decay_fn,
+        )
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        neg_update = opt.apply_gradients(zip([-grads0, -grads1], [var0, var1]),
+                                         global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of powersign
+        # first 4 steps with positive gradient
+        # last 3 steps with negative gradient (sign(gm) should be -1)
+        for t in range(1, 8):
+          if t < 5:
+            update.run()
+          else:
+            neg_update.run()
+
+          var0_np, m0 = powersign_update_numpy(
+              var0_np,
+              grads0_np if t < 5 else -grads0_np,
+              m0,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+          var1_np, m1 = powersign_update_numpy(
+              var1_np,
+              grads1_np if t < 5 else -grads1_np,
+              m1,
+              learning_rate,
+              base=base,
+              beta=beta,
+              py_sign_decay_fn=py_sign_decay_fn,
+              t=t,
+          )
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    decay_steps = 10
+    sign_decay_fn = sign_decay.get_linear_decay_fn(decay_steps)
+    py_sign_decay_fn = py_linear_decay_fn(decay_steps)
+    self._testSparse(use_resource=False)
+    self._testSparse(use_resource=False,
+                     learning_rate=0.01,
+                     base=2.0,
+                     beta=0.8)
+    self._testSparse(use_resource=False,
+                     sign_decay_fn=sign_decay_fn,
+                     py_sign_decay_fn=py_sign_decay_fn)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/sign_decay.py b/tensorflow/contrib/opt/python/training/sign_decay.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8870c072110da145c0bb78e20c3584083438ea0
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/sign_decay.py
@@ -0,0 +1,158 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the sign decay functions used in PowerSign and AddSign.
+
+See [Bello et al., ICML 2017] Neural Optimizer Search with Reinforcement
+Learning for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def get_linear_decay_fn(decay_steps):
+  """Returns a function that computes a linear decay.
+
+  This decay computes linear annealing:
+    max(0, (decay_steps - global_step) / decay_steps)
+
+  Example usage:
+  ```
+  decay_steps = 1000
+  linear_decay_fn = get_linear_decay_fn(decay_steps)
+  decayed = linear_decay_fn(global_step)
+  x *= decayed
+  ```
+  Args:
+    decay_steps: number of steps to decay over.
+  Returns:
+    linear_decay_fn: a function that computes the linear decay.
+  """
+  # pylint:disable=missing-docstring
+  def linear_decay_fn(global_step):
+    if global_step is None:
+      raise ValueError("global_step is required for linear_decay.")
+    global_step = math_ops.minimum(global_step, decay_steps)
+    remaining_steps = math_ops.to_int32(decay_steps) - math_ops.to_int32(
+        global_step)
+    decayed = math_ops.to_float(remaining_steps) / math_ops.to_float(
+        decay_steps)
+    return math_ops.maximum(0.0, decayed)
+  # pylint:enable=missing-docstring
+  return linear_decay_fn
+
+
+def get_cosine_decay_fn(decay_steps, num_periods=0.5, zero_after=None):
+  """Returns a function that computes a cosine decay.
+
+  This decay computes cosine annealing:
+    0.5 * (1.0 + cos(2.0 * pi * num_periods * global_step / decay_steps))
+
+  This decay can be used to decay the sign quantity in the AddSign and PowerSign
+  optimizers discovered in
+  [Bello et al., ICML 2017] Neural Optimizer Search with RL.
+
+  Example usage:
+  ```
+  decay_steps = 1000
+  num_periods = 2
+  cosine_decay_fn = get_cosine_decay_fn(decay_steps, num_periods=num_periods)
+  decayed = cosine_decay_fn(global_step)
+  x *= decayed
+  ```
+  Args:
+    decay_steps: number of steps to decay over.
+    num_periods: number of periods for cosine signal. 0.5 by default,
+      which maps the last decay step to 0.
+    zero_after: if not None, number after which the decay function
+      will just return 0.
+  Returns:
+    cosine_decay_fn: a function that computes the cosine decay.
+  """
+  # pylint:disable=missing-docstring
+  def cosine_decay_fn(global_step):
+    if global_step is None:
+      raise ValueError("global_step is required for cosine_decay.")
+    global_step = math_ops.minimum(global_step, decay_steps)
+    completed_fraction = math_ops.to_float(global_step) / math_ops.to_float(
+        decay_steps)
+    fraction = 2.0 * num_periods * completed_fraction
+    decayed = 0.5 * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+    if zero_after is not None:
+      decayed = array_ops.where(
+          math_ops.greater_equal(fraction, 2 * zero_after), 0.0, decayed)
+    return decayed
+  # pylint:enable=missing-docstring
+  return cosine_decay_fn
+
+
+def get_restart_decay_fn(decay_steps, num_periods=1, zero_after=None):
+  """Returns a function that computes a restart decay.
+
+  This decay computes
+    0.5 * (1.0 + cos(pi * (num_periods * global_step) % num_training_steps))
+
+  This is a simplified version of the restart decay introduced in
+  "SGDR: Stochastic Gradient Descent with Warm Restarts"
+  by Ilya Loshchilov & Frank Hutter, Proceedings of
+  ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
+
+  This decay can be used to decay the sign quantity in the AddSign and PowerSign
+  optimizers discovered in
+  [Bello et al., ICML 2017] Neural Optimizer Search with RL.
+
+  Example usage:
+  ```
+  decay_steps = 1000
+  num_periods = 2.0
+  restart_decay_fn = get_restart_decay_fn(decay_steps,
+                                          num_periods=num_periods)
+  decayed = restart_decay_fn(global_step)
+  x *= decayed
+  ```
+  Args:
+    decay_steps: number of steps to decay over.
+    num_periods: number of periods for cosine signal. 1 by default,
+      which maps the last decay step to 0.
+    zero_after: if not None, number after which the decay function
+      will return 0.
+  Returns:
+    restart_decay_fn: a function that computes the restart decay.
+  """
+  # pylint:disable=missing-docstring
+  def restart_decay_fn(global_step):
+    if global_step is None:
+      raise ValueError("global_step is required for cosine_decay.")
+    global_step = math_ops.minimum(global_step, decay_steps)
+    num = math_ops.mod(num_periods * math_ops.to_float(global_step),
+                       decay_steps)
+    fraction = num / math_ops.to_float(decay_steps)
+    decayed = 0.5 * (
+        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+    if zero_after is not None:
+      tmp = math_ops.to_float(
+          num_periods * global_step) / math_ops.to_float(decay_steps)
+      decayed = array_ops.where(
+          math_ops.greater_equal(tmp, zero_after), 0.0, decayed)
+    return decayed
+  # pylint:enable=missing-docstring
+  return restart_decay_fn
diff --git a/tensorflow/contrib/opt/python/training/sign_decay_test.py b/tensorflow/contrib/opt/python/training/sign_decay_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c31cb924eacfc8feea6bbd1f5c9ae903442b04b1
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/sign_decay_test.py
@@ -0,0 +1,110 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for sign_decay."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.contrib.opt.python.training import sign_decay
+from tensorflow.python.platform import test
+
+
+def py_linear_decay_fn(decay_steps):
+
+  def linear_decay(step):
+    step = min(step, decay_steps)
+    return float(decay_steps - step) / decay_steps
+
+  return linear_decay
+
+
+def py_cosine_decay_fn(decay_steps, num_periods=0.5, zero_after=None):
+
+  def cosine_decay(step):
+    step = min(step, decay_steps)
+    fraction = 2.0 * num_periods * step / float(decay_steps)
+    if zero_after is not None and fraction >= 2 * zero_after:
+      return 0.0
+    return 0.5 * (1.0 + math.cos(math.pi * fraction))
+
+  return cosine_decay
+
+
+def py_restart_decay_fn(decay_steps, num_periods=1, zero_after=None):
+
+  def restart_decay(step):
+    step = min(step, decay_steps)
+    tmp = num_periods * step / float(decay_steps)
+    fraction = (
+        num_periods * step % decay_steps) / float(decay_steps)
+    if zero_after is not None and tmp >= zero_after:
+      return 0
+    return 0.5 * (1.0 + math.cos(math.pi * fraction))
+
+  return restart_decay
+
+
+class SignDecaysTest(test.TestCase):
+
+  def testLinearDecay(self):
+    num_training_steps = 1000
+    linear_decay_fn = sign_decay.get_linear_decay_fn(num_training_steps)
+
+    for step in range(0, 1000, 100):
+      with self.test_session():
+        tf_decayed = linear_decay_fn(step).eval()
+        py_decayed = py_linear_decay_fn(num_training_steps)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+  def testCosineDecay(self):
+    num_training_steps = 1000
+    cosine_decay_fn = sign_decay.get_cosine_decay_fn(num_training_steps)
+    cosine_decay_2_fn = sign_decay.get_cosine_decay_fn(
+        num_training_steps, num_periods=5, zero_after=2)
+
+    for step in range(0, 1000, 100):
+      with self.test_session():
+        tf_decayed = cosine_decay_fn(step).eval()
+        py_decayed = py_cosine_decay_fn(num_training_steps)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+        tf_decayed = cosine_decay_2_fn(step).eval()
+        py_decayed = py_cosine_decay_fn(
+            num_training_steps, num_periods=5, zero_after=2)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+  def testRestartDecay(self):
+    num_training_steps = 1000
+    restart_decay_fn = sign_decay.get_restart_decay_fn(num_training_steps)
+    restart_decay_2_fn = sign_decay.get_restart_decay_fn(
+        num_training_steps, num_periods=5, zero_after=2)
+
+    for step in range(0, 1000, 100):
+      with self.test_session():
+        tf_decayed = restart_decay_fn(step).eval()
+        py_decayed = py_restart_decay_fn(num_training_steps)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+        tf_decayed = restart_decay_2_fn(step).eval()
+        py_decayed = py_restart_decay_fn(
+            num_training_steps, num_periods=5, zero_after=2)(step)
+        self.assertAlmostEqual(tf_decayed, py_decayed, places=4)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..71582f9c9a01eb221666e2c71c4a2edb18e7cb98
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -0,0 +1,113 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_libs",
+    "tf_custom_op_library",
+    "tf_custom_op_py_library",
+    "tf_gen_op_wrapper_py",
+)
+
+cc_library(
+    name = "all_ops",
+    srcs = [":custom_op_sources"],
+    hdrs = [":custom_op_headers"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//third_party/eigen3",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+    alwayslink = 1,
+)
+
+tf_custom_op_library(
+    name = "python/ops/_periodic_resample_op.so",
+    srcs = [
+        ":custom_op_headers",
+        ":custom_op_sources",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = ["array_ops"],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_periodic_resample_op_py",
+    out = "python/ops/gen_periodic_resample_op.py",
+    deps = [":array_ops_op_lib"],
+)
+
+tf_custom_op_py_library(
+    name = "periodic_resample_op_py",
+    srcs = ["python/ops/periodic_resample_op.py"],
+    dso = ["python/ops/_periodic_resample_op.so"],
+    kernels = [
+        ":array_ops_op_lib",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gen_periodic_resample_op_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_for_generated_wrappers",
+    ],
+)
+
+py_library(
+    name = "init_py",
+    srcs = [
+        "__init__.py",
+        "python/__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":periodic_resample_op_py",
+    ],
+)
+
+# py_library(
+#     name = "periodic_resample_op_py",
+#     srcs = ["python/ops/periodic_resample_op.py"],
+#     data = ["python/ops/_periodic_resample_op.so"],
+#     srcs_version = "PY2AND3",
+# )
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "custom_op_sources",
+    srcs = glob(
+        [
+            "ops/*.cc",
+            "kernels/*.cc",
+        ],
+        exclude = [
+            "ops/*_test.cc",
+            "kernels/*_test.cc",
+        ],
+    ),
+)
+
+filegroup(
+    name = "custom_op_headers",
+    srcs = glob(
+        [
+            "kernels/*.h",
+            "ops/*.h",
+        ],
+    ),
+)
diff --git a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor.py b/tensorflow/contrib/periodic_resample/__init__.py
similarity index 60%
rename from tensorflow/contrib/bayesflow/python/ops/stochastic_tensor.py
rename to tensorflow/contrib/periodic_resample/__init__.py
index 4d39a7918b36240f970aa192b907c3d127441657..fde9091b88f96da8f880ea341c8fd809b619c807 100644
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor.py
+++ b/tensorflow/contrib/periodic_resample/__init__.py
@@ -1,3 +1,4 @@
+# =============================================================================
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,38 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Support for creating Stochastic Tensors.
-
-See the @{$python/contrib.bayesflow.stochastic_tensor} guide.
-
-@@BaseStochasticTensor
-@@StochasticTensor
-@@MeanValue
-@@SampleValue
-@@value_type
-@@get_current_value_type
-"""
+# =============================================================================
 
+"""Custom op used by periodic_resample."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.stochastic_tensor_impl import *
-# pylint: enable=wildcard-import
+from tensorflow.contrib.periodic_resample.python.ops.periodic_resample_op import periodic_resample
 from tensorflow.python.util.all_util import remove_undocumented
 
-
-_allowed_symbols = [
-    "BaseStochasticTensor",
-    "StochasticTensor",
-    "ObservedStochasticTensor",
-    "MeanValue",
-    "SampleValue",
-    "value_type",
-    "get_current_value_type",
-]
+_allowed_symbols = ["periodic_resample"]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cee405cef25f54fd064f8002265c42016c4fa50
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -0,0 +1,26 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h"
+
+namespace tensorflow {
+
+REGISTER_KERNEL_BUILDER(Name("PeriodicResample")
+                            .Device(DEVICE_CPU),
+                        PeriodicResampleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bef21f7a5c8a27011f95eb7fae8451ca944d3cde
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -0,0 +1,230 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+#define TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
+
+#include <cmath>
+#include <type_traits>
+#include <vector>
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace {
+
+template <class IndexVecT, class IndexT>
+IndexT compute_input_index(
+    IndexVecT* target_dimensions, const IndexT& output_index,
+    const IndexVecT& original_dimensions, const int& adjustable_dimension,
+    const std::vector<tensorflow::int64>& dimension_ceiling,
+    const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result,
+    std::vector<IndexT>* output_indices, const int& rank) {
+  *result = 0;
+  output_indices->clear();
+
+  // un-rasterize the output index
+  auto last_reduced_i = output_index;
+  for (auto r = rank - 1; r >= 0; --r) {
+    (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r];
+    last_reduced_i =
+        (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r];
+  }
+
+  // rasterize the input index
+  IndexT last_index_factor = 1;
+  for (auto r = rank - 1; r >= 0; --r) {
+    IndexT index = 0;
+    if (r != adjustable_dimension)
+      index = (*output_indices)[r] / dimension_ceiling[r];
+    else {
+      for (int qi = 0; qi < rank; ++qi) {
+        if (qi == adjustable_dimension) continue;
+        index += cumulative_dimensions[qi] *
+                 ((*output_indices)[qi] % dimension_ceiling[qi]);
+      }
+      index *= (*target_dimensions)[adjustable_dimension];
+      index += (*output_indices)[r];
+    }
+    *result += last_index_factor * index;
+    last_index_factor *= original_dimensions[r];
+  }
+
+  return *result;
+}
+
+template <class InputDataT,
+          class IndexVecT>  // both types are needed here b/c IndexVecT and
+                            // InputDataT are not related
+                            void
+                            fill_periodic_tensor(
+                                tensorflow::OpKernelContext* context,
+                                const IndexVecT& desired_shape,
+                                const tensorflow::Tensor& input_tensor) {
+  // input is a strided array (last index is fastest, C-ordered)
+  auto input = input_tensor.flat<InputDataT>();
+  const int rank = input_tensor.dims();
+  // original and target dimensions
+  std::vector<tensorflow::int64> original_dimensions(rank),
+      target_dimensions(rank);
+  tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1);
+  // factors by which original_dimensions increases/decreases w.r.t.
+  // target_dimensions
+  std::vector<tensorflow::int64> dimension_ceiling(rank),
+      cumulative_dimensions(rank);
+  // index of adjustable dimension
+  int adjustable_dimension;
+  tensorflow::TensorShape output_shape;
+
+  // requires that the rank of the input tensor and length of the desired shape
+  // are equal
+  OP_REQUIRES(context, rank == desired_shape.size(),
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects the rank of the input tensor, ",
+                  rank, ", to be the same as the length of the desired shape, ",
+                  desired_shape.size(), "."));
+
+  bool found = false;
+  for (int i = 0; i < rank; ++i) {
+    // if (desired_shape(i) < 1) {
+    if (desired_shape[i] < 1) {
+      // only one index can be adjustable
+      OP_REQUIRES(context, !found,
+                  tensorflow::errors::InvalidArgument(
+                      "periodic_resample expects only "
+                      "one index to be marked as adjustable."));
+      adjustable_dimension = i;
+      found = true;
+    } else {
+      // target_dimensions[i] = desired_shape(i);
+      target_dimensions[i] = desired_shape[i];
+      new_sliced_size *= target_dimensions[i];
+    }
+  }
+  // at least one index needs to be adjustable
+  OP_REQUIRES(context, found,
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects at least "
+                  "one index to be marked as adjustable."));
+
+  int count = 0;
+  for (const auto dim_info : input_tensor.shape()) {
+    original_dimensions[count] = dim_info.size;
+    ++count;
+  }
+
+  target_dimensions[adjustable_dimension] = total_size / new_sliced_size;
+
+  count = 0;
+  for (int i = 0; i < input_tensor.shape().dims(); ++i) {
+    dimension_ceiling[count] = tensorflow::int64(std::ceil(
+        float(target_dimensions[count]) / float(original_dimensions[count])));
+    if (count == 0)
+      cumulative_dimensions[count] = 1;
+    else
+      cumulative_dimensions[count] =
+          cumulative_dimensions[count - 1] * dimension_ceiling[count - 1];
+    ++count;
+  }
+
+  // ensure that the new dimension is greater than zero
+  OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0,
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample found that the "
+                  "adjustable dimension, ",
+                  adjustable_dimension, ", isn't greater than zero, ",
+                  target_dimensions[adjustable_dimension], "."));
+  for (int i = 0; i < rank; ++i) {
+    output_shape.AddDim(target_dimensions[i]);
+  }
+  const auto new_size =
+      new_sliced_size * target_dimensions[adjustable_dimension];
+
+  // Create an output tensor and attach it to the current context
+  tensorflow::Tensor* output_tensor = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, output_shape, &output_tensor));
+  auto output = output_tensor->flat<InputDataT>();
+
+  // memory is allocated for these variables outside the inner loop for
+  // efficiency (although, I could create a separate class scope for
+  // this purpose instead)
+  tensorflow::int64 result = 0;
+  std::vector<tensorflow::int64> output_indices(target_dimensions.size());
+
+  // Fill output tensor with periodically resampled input tensor values
+  for (tensorflow::int64 output_index = 0; output_index < new_size;
+       ++output_index) {
+    output(output_index) = input(compute_input_index(
+        &target_dimensions, output_index, original_dimensions,
+        adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result,
+        &output_indices, rank));
+  }
+}
+
+void create_output_tensor(
+    tensorflow::OpKernelContext* context,
+    const tensorflow::Tensor& input_tensor,
+    const tensorflow::DataType& input_tensor_type,
+    const tensorflow::PartialTensorShape& desired_shape_tensor) {
+  auto desired_shape = desired_shape_tensor.dim_sizes();
+
+  // obligatory type switch
+  switch (input_tensor_type) {
+    case tensorflow::DataTypeToEnum<float>::value:
+      fill_periodic_tensor<float>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<double>::value:
+      fill_periodic_tensor<double>(context, desired_shape, input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int32>::value:
+      fill_periodic_tensor<tensorflow::int32>(context, desired_shape,
+                                              input_tensor);
+      break;
+    case tensorflow::DataTypeToEnum<tensorflow::int64>::value:
+      fill_periodic_tensor<tensorflow::int64>(context, desired_shape,
+                                              input_tensor);
+      break;
+    default:;
+  }
+}
+
+}  // namespace
+
+class PeriodicResampleOp : public tensorflow::OpKernel {
+ public:
+  explicit PeriodicResampleOp(tensorflow::OpKernelConstruction* context)
+      : tensorflow::OpKernel(context) {
+    // Get the desired shape
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &desired_shape));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // Grab the input tensor
+    const tensorflow::Tensor& input_tensor = context->input(0);
+    const tensorflow::DataType input_tensor_type = context->input_dtype(0);
+
+    create_output_tensor(context, input_tensor, input_tensor_type,
+                         desired_shape);
+  }
+
+ private:
+  tensorflow::PartialTensorShape desired_shape;
+};
+
+#endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c90fc06c7fb9d79e8fd7a937e786a34947d8c1cb
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -0,0 +1,90 @@
+// =============================================================================
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("PeriodicResample")
+    .Attr("T: numbertype")
+    .Input("values: T")
+    .Attr("shape: shape")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::ExplicitShape)
+    .Doc(R"doc(
+Periodically resample elements of a tensor to conform to `shape`.
+
+This function implements a slightly more generic version of the subpixel
+convolutions found in this [paper](https://arxiv.org/abs/1609.05158).
+
+The formula for computing the elements in the `output` tensor is as follows:
+  `T` = `values` tensor of rank `R`
+  `S` = desired `shape` of output tensor (vector of length `R`)
+  `P` = `output` tensor of rank `R`
+  \((T_1,\ldots,T_R)\) = shape(`T`)
+  \([S_1,\ldots,S_q,\ldots,S_R]\) = elements of vector `S`
+
+  A single element in `S` is left unspecified (denoted \(S_q=-1\)).
+  Let \(f_i\) denote the (possibly non-integer) factor that relates the original
+  dimension to the desired dimensions, \(S_i=f_i T_i\), for \(i\neq q\) where
+  \(f_i>0\).
+  Define the following:
+    \(g_i=\lceil f_i\rceil\)
+    \(t=\prod_i T_i\)
+    \(s=\prod_{i\neq q} S_i\)
+  \(S_q\) can then be defined as by \(S_q=\lfloor t/s\rfloor\).
+  The elements of the resulting tensor are defined as
+  \(P_{s_1,\ldots,s_R}=T_{h_1,\ldots,h_q,\ldots,h_R}\).
+  The \(h_i\) (\(i\neq q\)) are defined by \(h_i=\lfloor s_i/g_i\rfloor\).
+  \(h_q=S_q\sum_{j\neq q}^{q-1}G_j \mathrm{mod}(s_j,g_j) + s_q\), where
+  \(G_j=\prod_{i}^{j-1}g_i\) (\(G_0=1\)).
+
+One drawback of this method is that whenever the output dimensions are slightly
+less than integer multiples of the input dimensions, many of the tensor elements
+are repeated in an inefficient way. This is resolved by specifying that all
+desired dimensions are integer multiples of the input tensor.
+
+For example:
+
+```prettyprint
+`input` is [[ 0  1  2  3]
+            [ 4  5  6  7]
+            [ 8  9 10 11]]
+
+tf.periodic_resample(input, [6, None]) ==> [[ 0  1]
+                                            [ 2  3]
+                                            [ 4  5]
+                                            [ 6  7]
+                                            [ 8  9]
+                                            [10 11]]
+```
+
+values: The tensor of rank `R` to periodic_resample
+shape: A 1-D tensor representing the desired shape of the output tensor.
+  Exactly one element of this tensor must have the value `None` which represents
+  that this dimension of `values` can be adjusted downward in order to
+  accommodate increases in other dimensions. The specified sizes of the
+  non-adjustable dimensions must by at least as large as in the `values` tensor.
+output: Periodically resampled tensor that has dimensions specified as in
+  `shape` except that the dimension specified as `None` will be minimally
+  decreased as necessary.
+
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/python/__init__.py b/tensorflow/contrib/periodic_resample/python/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b6ead0f594ad23e73901254857313635fbd1c5
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/__init__.py
@@ -0,0 +1,20 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Public API of periodic_resample."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d727870f652f3606218928983ea18e990d0afe6
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -0,0 +1,101 @@
+# =============================================================================
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy
+import tensorflow
+from tensorflow.contrib.periodic_resample import periodic_resample
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+class PeriodicResampleTest(test_util.TensorFlowTestCase):
+
+  def testPeriodicResampleBasic2D(self):
+
+    input_tensor = numpy.arange(12).reshape((3, 4))
+    desired_shape = numpy.array([6, None])
+    output_tensor = input_tensor.reshape((6, 2))
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleTruncatedBasic2D(self):
+
+    input_tensor = numpy.arange(12).reshape((3, 4))
+    desired_shape = numpy.array([5, None])
+    output_tensor = input_tensor.reshape((6, 2))[:-1]
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleBasic3D(self):
+
+    input_tensor = numpy.arange(2*2*4).reshape((2, 2, 4))
+    desired_shape = numpy.array([4, 4, None])
+    output_tensor = numpy.array([[[0], [2], [4], [6]],
+                                 [[1], [3], [5], [7]],
+                                 [[8], [10], [12], [14]],
+                                 [[9], [11], [13], [15]]])
+
+    # NOTE: output_tensor != input_tensor.reshape((4, 4, -1))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      # input_tensor[0, 0, 0] == result[0, 0, 0]
+      # input_tensor[0, 0, 1] == result[1, 0, 0]
+      # input_tensor[0, 0, 2] == result[0, 1, 0]
+      # input_tensor[0, 0, 3] == result[1, 1, 0]
+      self.assertAllEqual(result, output_tensor)
+
+  def testPeriodicResampleBasic4D(self):
+
+    input_tensor = numpy.arange(2*2*2*8).reshape((2, 2, 2, 8))
+    desired_shape = numpy.array([4, 4, 4, None])
+    output_tensor = numpy.array([[[[0], [4], [8], [12]],
+                                  [[2], [6], [10], [14]],
+                                  [[16], [20], [24], [28]],
+                                  [[18], [22], [26], [30]]],
+                                 [[[1], [5], [9], [13]],
+                                  [[3], [7], [11], [15]],
+                                  [[17], [21], [25], [29]],
+                                  [[19], [23], [27], [31]]],
+                                 [[[32], [36], [40], [44]],
+                                  [[34], [38], [42], [46]],
+                                  [[48], [52], [56], [60]],
+                                  [[50], [54], [58], [62]]],
+                                 [[[33], [37], [41], [45]],
+                                  [[35], [39], [43], [47]],
+                                  [[49], [53], [57], [61]],
+                                  [[51], [55], [59], [63]]]])
+
+    # NOTE: output_tensor != input_tensor.reshape((4, 4, 4, -1))
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      result = periodic_resample(input_tensor, desired_shape).eval()
+      self.assertAllEqual(result, output_tensor)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
similarity index 57%
rename from tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py
rename to tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
index 223bc9d042c69be05b0e578835a31ed6e83c0c97..6a09f70f442131da7da2a4e98a238f21c3ccb6ec 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/sigmoid_centered_impl.py
+++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
@@ -1,3 +1,4 @@
+# =============================================================================
 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,29 +12,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""SigmoidCentered bijector."""
+# =============================================================================
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distributions.python.ops.bijectors import softmax_centered
 
+from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op
 
-__all__ = [
-    "SigmoidCentered",
-]
+from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample
 
+from tensorflow.contrib.util import loader
+from tensorflow.python.platform import resource_loader
 
-class SigmoidCentered(softmax_centered.SoftmaxCentered):
-  """Bijector which computes Y = g(X) = exp([X 0]) / (1 + exp(-X)).
-
-  Equivalent to: `bijector.SoftmaxCentered(event_ndims=0)`.
-
-  See `bijector.SoftmaxCentered` for more details.
-  """
-
-  def __init__(self, validate_args=False, name="sigmoid_centered"):
-    super(SigmoidCentered, self).__init__(
-        event_ndims=0, validate_args=validate_args, name=name)
+_periodic_resample_op = loader.load_op_library(
+    resource_loader.get_path_to_datafile('_periodic_resample_op.so'))
diff --git a/tensorflow/contrib/pi_examples/README.md b/tensorflow/contrib/pi_examples/README.md
index f550228083712da4ddc725cd233c1eb7bbffeb25..177357bca64b51fe82360095d677cdddc11ec948 100644
--- a/tensorflow/contrib/pi_examples/README.md
+++ b/tensorflow/contrib/pi_examples/README.md
@@ -13,7 +13,7 @@ sudo apt-get install -y libjpeg-dev
 ```
 
  - To download the example model you'll need, run these commands:
- 
+
 ```bash
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_dec_2015_stripped.zip \
 -o /tmp/inception_dec_2015_stripped.zip
diff --git a/tensorflow/contrib/pi_examples/camera/Makefile b/tensorflow/contrib/pi_examples/camera/Makefile
index 578f1336f3282f647b18d1622b85905d53b3ebfa..b354c03b6e563c98347ad901bf07430d1fd17b49 100644
--- a/tensorflow/contrib/pi_examples/camera/Makefile
+++ b/tensorflow/contrib/pi_examples/camera/Makefile
@@ -76,7 +76,7 @@ $(EXECUTABLE_NAME): $(EXECUTABLE_OBJS) $(TFLIBS)
 	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
 
 # Matches on C++ source files.
-$(OBJDIR)%.o: %.cc 
+$(OBJDIR)%.o: %.cc
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
 
diff --git a/tensorflow/contrib/pi_examples/label_image/Makefile b/tensorflow/contrib/pi_examples/label_image/Makefile
index 19652e581d2403cf8e4dbd7b9e10b7c386959069..9d054a3133a44e8a612ecad1e95adffa09e4a352 100644
--- a/tensorflow/contrib/pi_examples/label_image/Makefile
+++ b/tensorflow/contrib/pi_examples/label_image/Makefile
@@ -75,7 +75,7 @@ $(EXECUTABLE_NAME): $(EXECUTABLE_OBJS) $(TFLIBS)
 	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
 
 # Matches on C++ source files.
-$(OBJDIR)%.o: %.cc 
+$(OBJDIR)%.o: %.cc
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
 
diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc
index 7817cd0c6459aad88836503857301ec6334d486b..0b18045789f3a87ceb228033407d6b696bdb33f6 100644
--- a/tensorflow/contrib/pi_examples/label_image/label_image.cc
+++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc
@@ -89,7 +89,7 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   FILE * infile;
   JSAMPARRAY buffer;
   int row_stride;
-  
+
   if ((infile = fopen(file_name.c_str(), "rb")) == NULL) {
     LOG(ERROR) << "Can't open " << file_name;
     return tensorflow::errors::NotFound("JPEG file ", file_name,
@@ -105,7 +105,7 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
     fclose(infile);
     return tensorflow::errors::Unknown("JPEG decoding failed");
   }
-  
+
   jpeg_create_decompress(&cinfo);
   jpeg_stdio_src(&cinfo, infile);
   jpeg_read_header(&cinfo, TRUE);
@@ -119,14 +119,14 @@ Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
   buffer = (*cinfo.mem->alloc_sarray)
     ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
   while (cinfo.output_scanline < cinfo.output_height) {
-    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]); 
+    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]);
     jpeg_read_scanlines(&cinfo, buffer, 1);
     memcpy(row_address, buffer[0], row_stride);
   }
 
   jpeg_finish_decompress(&cinfo);
   jpeg_destroy_decompress(&cinfo);
-  fclose(infile);  
+  fclose(infile);
   return Status::OK();
 }
 
@@ -167,7 +167,7 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
     const int top_y_index = static_cast<int>(floorf(in_y));
     const int bottom_y_index =
       std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
-    const float y_lerp = in_y - top_y_index; 
+    const float y_lerp = in_y - top_y_index;
     tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
     tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
     float *out_row = out + (y * wanted_width * wanted_channels);
@@ -186,7 +186,7 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
 	in_bottom_row + (right_x_index * wanted_channels);
       const float x_lerp = in_x - left_x_index;
       float *out_pixel = out_row + (x * wanted_channels);
-      for (int c = 0; c < wanted_channels; ++c) {	
+      for (int c = 0; c < wanted_channels; ++c) {
 	const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
 	const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
 	const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std);
@@ -198,7 +198,7 @@ Status ReadTensorFromImageFile(string file_name, const int wanted_height,
       }
     }
   }
-  
+
   out_tensors->push_back(image_tensor);
   return Status::OK();
 }
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index 1bf40ab6b26c6ad1f9658a4b0ad93527fe609698..82cd7b4c8aeb64cf461d9244c5aaf32a91691a5a 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -165,5 +165,5 @@ py_test(
 filegroup(
     name = "test_export_dir",
     srcs = glob(["test_export_dir/**/*"]),
-    tags = ["nopip"],
+    tags = ["no_pip"],
 )
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 935af80e7a0cb94b9ccdc52b48a73cecc5beb299..389e26cca3eb04fe43abbee62a1efde7ae0d204d 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -133,7 +133,6 @@ py_library(
     deps = [
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
@@ -143,6 +142,23 @@ py_library(
     ],
 )
 
+py_test(
+    name = "quant_ops_test",
+    size = "small",
+    srcs = ["python/quant_ops_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":quant_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python:variables",
+    ],
+)
+
 py_library(
     name = "quantize",
     srcs = ["python/quantize.py"],
@@ -168,9 +184,11 @@ py_test(
         ":quantize",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..40541729da5fd9d0ae75579e11f20999337de124
--- /dev/null
+++ b/tensorflow/contrib/quantize/README.md
@@ -0,0 +1,73 @@
+tf.contrib.quantize provides tools for transforming graphs to include ops to
+model quantization of weights, biases and activations during both training and
+inference. This is done using the
+[fake quantization op]
+(https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization),
+which is described below:
+
+Recent literature has shown that fixed point networks provide comparable
+performance to floating point networks [1]. This is achieved by modeling the
+quantization operation during training in both the forward and backward passes.
+The fake quantization operator achieves this by modeling the quantizer as a pass
+through estimator [2]. Note that during back propagation, the parameters are
+updated at high precision as this is needed to ensure sufficient precision in
+accumulating tiny adjustments to the parameters. However, for the forward pass,
+the parameters and activations are quantized to the desired lower precision.
+
+![drawing](g3doc/drawings/Fake_Quantization.jpg)
+
+###Forward pass
+
+
+
+
+\begin{equation*}
+f_Q(x) = \Delta\text{ }round\left(\frac{sat\left(x\right)-x_{min}}{\Delta}\right)
+\end{equation*}
+
+
+where
+
+$$
+\begin{equation*}
+sat(x) =
+\left\{
+	\begin{array}{ll}
+		x_{min}  & \mbox{if } x \le x_{min} \\
+		x & \mbox{if } x_{min} \leq x \leq x_{max} \\
+    x_{max} & \mbox{if } x_{max} \le x
+	\end{array}
+\right.
+\end{equation*}
+$$
+
+
+where $$\Delta$$ is the Quantizer Step size, given by
+$$\Delta =\frac{x_{max} - x_{min} }{255} $$ and $$x_{min} $$ and $$x_{max}$$ are
+the minimum and maximum values of the variable under consideration. Note that
+the rounding performed is deterministic and corresponds to asymmetric rounding,
+which is supported in almost all hardware platforms.
+
+###Backward pass
+For the backward pass, we model the quantizer as a piecewise linear block, with
+derivatives that are non-zero only in the linear region.
+
+
+
+\begin{equation*}
+\frac{df_Q(x)}{dx}=1, x_{min} \leq x \leq x_{max},\text{ 0  elsewhere }
+\end{equation*}
+
+Therefore, the backward pass through the quantizer reduces to passing through
+the gradients as long as the inputs to the quantizer are in the linear region.
+Otherwise, the gradients are set to zero.
+
+Note that the quantizer is fully specified by the min and max values of the
+variables being quantized.
+
+
+[1] P.Gysel, "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
+NEURAL NETWORKS", https://arxiv.org/pdf/1604.03168.pdf
+
+[2] Y.Bengio, "Estimating or Propagating Gradients Through Stochastic Neurons
+for Conditional Computation", https://arxiv.org/abs/1308.3432
diff --git a/tensorflow/contrib/quantize/g3doc/drawings/Fake_Quantization.jpg b/tensorflow/contrib/quantize/g3doc/drawings/Fake_Quantization.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fdc7ae40cec757cc0a93d50eca6c8698a4697d07
Binary files /dev/null and b/tensorflow/contrib/quantize/g3doc/drawings/Fake_Quantization.jpg differ
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 0a38ef9fcd6f1699b0feee6d439ba69413e0899b..f80d427ff0a6573ecd6562c443182797b5d22527 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -22,15 +22,12 @@ from tensorflow.contrib.framework.python.ops import add_arg_scope
 from tensorflow.contrib.framework.python.ops import model_variable
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import moving_averages
 
-EPSILON = 1e-5
-
 
 @add_arg_scope
 def FixedQuantize(inputs, init_min=-6.0, init_max=6.0, scope=None):
@@ -133,12 +130,10 @@ def LastValueQuantize(inputs,
         batch_min = inputs
     else:
       batch_min = math_ops.reduce_min(inputs, name='BatchMin')
-    batch_min -= EPSILON
-    # B-eng requires that 0.0 if always in the [min; max] range.
+    # TFLite requires that 0.0 if always in the [min; max] range.
     batch_min = math_ops.minimum(batch_min, 0.0)
-    assign_min_op = state_ops.assign(
-        min_var, batch_min, name='AssignMinLast').op
-    ops.add_to_collection(updates_collection, assign_min_op)
+    assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
+    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -148,17 +143,15 @@ def LastValueQuantize(inputs,
         batch_max = inputs
     else:
       batch_max = math_ops.reduce_max(inputs, name='BatchMax')
-    batch_max += EPSILON
-    # B-eng requires that 0.0 if always in the [min; max] range.
+    # TFLite requires that 0.0 if always in the [min; max] range.
     batch_max = math_ops.maximum(batch_max, 0.0)
-    assign_max_op = state_ops.assign(
-        max_var, batch_max, name='AssignMaxLast').op
-    ops.add_to_collection(updates_collection, assign_max_op)
+    assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
+    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
-        batch_min,
-        batch_max,
+        assign_min,
+        assign_max,
         per_channel=per_channel,
         num_bits=num_bits,
         narrow_range=narrow_range)
@@ -251,9 +244,9 @@ def MovingAvgQuantize(inputs,
       batch_min = math_ops.reduce_min(inputs, name='BatchMin')
     # B-eng requires that 0.0 if always in the [min; max] range.
     batch_min = math_ops.minimum(batch_min, 0.0)
-    assign_min_op = moving_averages.assign_moving_average(
-        min_var, batch_min, ema_decay, name='AssignMinEma').op
-    ops.add_to_collection(updates_collection, assign_min_op)
+    assign_min = moving_averages.assign_moving_average(
+        min_var, batch_min, ema_decay, name='AssignMinEma')
+    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -265,14 +258,14 @@ def MovingAvgQuantize(inputs,
       batch_max = math_ops.reduce_max(inputs, name='BatchMax')
     # B-eng requires that 0.0 if always in the [min; max] range.
     batch_max = math_ops.maximum(batch_max, 0.0)
-    assign_max_op = moving_averages.assign_moving_average(
-        max_var, batch_max, ema_decay, name='AssignMaxEma').op
-    ops.add_to_collection(updates_collection, assign_max_op)
+    assign_max = moving_averages.assign_moving_average(
+        max_var, batch_max, ema_decay, name='AssignMaxEma')
+    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
-        min_var,
-        max_var,
+        assign_min,
+        assign_max,
         per_channel=per_channel,
         num_bits=num_bits,
         narrow_range=narrow_range)
@@ -301,20 +294,10 @@ def _FakeQuantWithMinMaxVars(inputs, min_var, max_var, per_channel, num_bits,
   if per_channel:
     assert len(min_var.get_shape()) == 1
     assert len(max_var.get_shape()) == 1
-    with ops.control_dependencies([check_ops.assert_less(min_var, max_var)]):
-      return array_ops.fake_quant_with_min_max_vars_per_channel(
-          inputs,
-          min_var,
-          max_var,
-          num_bits=num_bits,
-          narrow_range=narrow_range)
+    return array_ops.fake_quant_with_min_max_vars_per_channel(
+        inputs, min_var, max_var, num_bits=num_bits, narrow_range=narrow_range)
   else:
     assert min_var.get_shape() == []  # pylint: disable=g-explicit-bool-comparison
     assert max_var.get_shape() == []  # pylint: disable=g-explicit-bool-comparison
-    with ops.control_dependencies([check_ops.assert_less(min_var, max_var)]):
-      return array_ops.fake_quant_with_min_max_vars(
-          inputs,
-          min_var,
-          max_var,
-          num_bits=num_bits,
-          narrow_range=narrow_range)
+    return array_ops.fake_quant_with_min_max_vars(
+        inputs, min_var, max_var, num_bits=num_bits, narrow_range=narrow_range)
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..38846796028512a722752cd83b8bda3b5b0bb77f
--- /dev/null
+++ b/tensorflow/contrib/quantize/python/quant_ops_test.py
@@ -0,0 +1,87 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for third_party.tensorflow.contrib.quantize.python.quant_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.quantize.python import quant_ops
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+_MIN_MAX_VARS = 'min_max_vars'
+
+
+class QuantOpsTest(googletest.TestCase):
+
+  def testLastValueQuantizeTrainingAssign(self):
+    g = ops.Graph()
+    with session.Session(graph=g) as sess:
+      x = array_ops.placeholder(dtypes.float32, shape=[2])
+      y = quant_ops.LastValueQuantize(
+          x,
+          init_min=0.0,
+          init_max=0.0,
+          is_training=True,
+          vars_collection=_MIN_MAX_VARS)
+
+      # Run the step.
+      sess.run(variables.global_variables_initializer())
+      sess.run(y, feed_dict={x: [-1.0, 1.0]})
+      # Now check that the min_max_vars were, in fact, updated.
+      min_value, max_value = self._GetMinMaxValues(sess)
+      self.assertEqual(min_value, -1.0)
+      self.assertEqual(max_value, 1.0)
+
+  def testMovingAvgQuantizeTrainingAssign(self):
+    g = ops.Graph()
+    with session.Session(graph=g) as sess:
+      x = array_ops.placeholder(dtypes.float32, shape=[2])
+      y = quant_ops.MovingAvgQuantize(
+          x,
+          init_min=0.0,
+          init_max=0.0,
+          is_training=True,
+          vars_collection=_MIN_MAX_VARS)
+
+      # Run the step.
+      sess.run(variables.global_variables_initializer())
+      # Do two runs to avoid zero debias.
+      sess.run(y, feed_dict={x: [-1.0, 1.0]})
+      sess.run(y, feed_dict={x: [0.0, 0.0]})
+      # Now check that the min_max_vars were, in fact, updated.
+      min_value, max_value = self._GetMinMaxValues(sess)
+      self.assertGreater(min_value, -1.0)
+      self.assertLess(min_value, 0.0)
+      self.assertGreater(max_value, 0.0)
+      self.assertLess(max_value, 1.0)
+
+  def _GetMinMaxValues(self, sess):
+    min_max_vars = ops.get_collection(_MIN_MAX_VARS)
+    self.assertEqual(len(min_max_vars), 2)
+    min_idx = 0 if 'min' in min_max_vars[0].name else 1
+    max_idx = (min_idx + 1) % 2
+    min_var, max_var = min_max_vars[min_idx], min_max_vars[max_idx]
+    min_max_values = sess.run([min_var, max_var])
+    return min_max_values[0], min_max_values[1]
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 3645d034cdb2b82af25c6c8674bf781976ffbf0f..50a2b4c91c9e7a2681f6041646a023a4225fb0c5 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import training_util
 
-# Operation types used to select oerations of interest.
+# Operation types used to select operations of interest.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
 
 # Custom key for storing and retrieving update ops used by quantizing nodes.
@@ -83,12 +83,17 @@ def Quantize(graph,
 
   for op in (op for op in graph_ops if _IsInterestingOpWithWeights(op)):
     if op.name.endswith('/depthwise'):
-      # Separable convolution may consist of 2 convolution nodes.  If so,
-      # skip .../depthwise and only quantize the top one.
+      # Separable convolution may consist of 2 convolution nodes. If so, skip
+      # .../depthwise and only quantize the top one.
       separable_conv = context.GetOperationByNameDontThrow(
           op.name[:-len('/depthwise')])
       if separable_conv and separable_conv.type == 'Conv2D':
         continue
+    # Quantize add ops that come after Conv2D or DepthwiseConv2dNative.
+    if op.type in ['Conv2D', 'DepthwiseConv2dNative']:
+      add_context_re = re.search(r'^(.*)/[^/]+/', op.name)
+      if add_context_re is not None:
+        context.add_contexts.add(add_context_re.group(1))
     if not op.name.endswith('_Fold'):
       folded_op = context.GetOperationByNameDontThrow(op.name + '_Fold')
       # Do nothing if found, it will be quantized when it is iterated over.
@@ -97,6 +102,8 @@ def Quantize(graph,
     else:
       context.QuantizeOpWithWeights(op, folded=True)
 
+  context.QuantizeAddContexts()
+
   # Once all quantization ops have been inserted in the graph, collect update
   # ops for their variables and modify the TF Slim update barrier (see
   # https://www.tensorflow.org/code/tensorflow/contrib/slim/python/slim/learning.py)
@@ -153,6 +160,25 @@ class _QuantizeContext(object):
     self.is_training = is_training
     self.quantize_folded_weights_use_ema = quantize_folded_weights_use_ema
     self.input_to_ops_map = input_to_ops.InputToOps(graph)
+    self.add_contexts = set()
+
+  def QuantizeAddContexts(self):
+    """Quantizes all add ops in self.add_contexts."""
+    # Loop through sorted self.add_contexts so that op creation is
+    # deterministic. This is needed when using multiple worker replicas so that
+    # the ops can be initialized consistently.
+    for add_context in sorted(self.add_contexts):
+      add_op = self.GetOperationByNamesDontThrow([
+          add_context + '/Add', add_context + '/add'])
+      if add_op is not None:
+        self._InsertQuantOp(
+            add_context,
+            add_op,
+            self.input_to_ops_map.ConsumerOperations(add_op),
+            name='add_quant',
+            moving_avg=True,
+            bits=self.activation_bits,
+            narrow_range=False)
 
   def QuantizeOpWithWeights(self, op, folded):
     """Quantizes around the specific operation with or without batch norm.
@@ -219,7 +245,6 @@ class _QuantizeContext(object):
 
     # When a bypass connection was found, also quantize Add op input.
     if add_op:
-
       def _QuantizeAddInput(add_input):
         if folded:
           return add_input.op.name.endswith('/add_fold')
@@ -267,7 +292,8 @@ class _QuantizeContext(object):
         raise ValueError('Failed to quantize op: %s, %s' % (op.name, op.type))
       return consumers[0], None, None
     if add_context:
-      add_op = self.GetOperationByNameDontThrow(add_context + '/Add')
+      add_op = self.GetOperationByNamesDontThrow([
+          add_context + '/Add', add_context + '/add'])
       return activation_op, add_op, add_context
     else:
       raise ValueError('Failed to quantize op: %s, %s' % (op.name, op.type))
@@ -280,13 +306,29 @@ class _QuantizeContext(object):
 
     Returns:
       The Operation with the given name. None if the name does not correspond to
-      any operation in the graph
+      any operation in the graph.
     """
     try:
       return self.graph.get_operation_by_name(name)
     except KeyError:
       return None
 
+  def GetOperationByNamesDontThrow(self, names):
+    """Returns an Operation with one of the given names.
+
+    Args:
+      names: Names of Operation to return.
+
+    Returns:
+      The Operation with one of the given names. None if none of the names
+      corresponds to any operation in the graph.
+    """
+    for name in names:
+      op = self.GetOperationByNameDontThrow(name)
+      if op is not None:
+        return op
+    return None
+
   def _InsertQuantOp(
       self,
       context,
@@ -348,7 +390,7 @@ class _QuantizeContext(object):
 
     if delay_requested and self.quant_delay and self.quant_delay > 0:
       activate_quant = math_ops.greater_equal(
-          training_util.get_global_step(),
+          training_util.get_or_create_global_step(),
           self.quant_delay,
           name=scope + '/activate_quant')
       quant = control_flow_ops.cond(
diff --git a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
index 3e62f95bd63db3134ba0b96c46b4a92aa73ebef9..57dab03f162629f84adf1d15521b05f4014c4a80 100644
--- a/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_parameterized_test.py
@@ -97,8 +97,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/Minimum', scope + '/weights_quant/Maximum',
-        scope + '/weights/read'
+        scope + '/weights_quant/AssignMinLast',
+        scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + '/Conv2D'
@@ -109,8 +109,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
       expected_inputs = [
-          scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
-          scope + '/BiasAdd'
+          scope + '/conv_quant/AssignMinEma',
+          scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
       ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -122,7 +122,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     self.assertEqual(act_quant.type, quantization_node_name)
 
     expected_inputs = [
-        'test/act_quant/min/read', 'test/act_quant/max/read',
+        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
         'test/' + activation_op_name
     ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -172,8 +172,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/Minimum', scope + '/weights_quant/Maximum',
-        scope + '/weights/read'
+        scope + '/weights_quant/AssignMinLast',
+        scope + '/weights_quant/AssignMaxLast', scope + '/weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
     output_op_name = scope + '/MatMul'
@@ -184,8 +184,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
       expected_inputs = [
-          scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
-          scope + '/BiasAdd'
+          scope + '/conv_quant/AssignMinEma',
+          scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
       ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -196,7 +196,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                             quantization_node_name)
     self.assertEqual(act_quant.type, quantization_node_name)
     expected_inputs = [
-        'test/act_quant/min/read', 'test/act_quant/max/read',
+        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
         'test/' + activation_op_name
     ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -247,7 +247,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/Minimum', scope + '/weights_quant/Maximum',
+        scope + '/weights_quant/AssignMinLast',
+        scope + '/weights_quant/AssignMaxLast',
         scope + '/depthwise_weights/read'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -259,8 +260,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
       expected_inputs = [
-          scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
-          scope + '/BiasAdd'
+          scope + '/conv_quant/AssignMinEma',
+          scope + '/conv_quant/AssignMaxEma', scope + '/BiasAdd'
       ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -271,7 +272,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                             quantization_node_name)
     self.assertEqual(act_quant.type, quantization_node_name)
     expected_inputs = [
-        'test/act_quant/min/read', 'test/act_quant/max/read',
+        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
         'test/' + activation_op_name
     ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -401,8 +402,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('min/read' if use_ema else 'Minimum'),
-        scope + '/weights_quant/' + ('max/read' if use_ema else 'Maximum'),
+        scope + '/weights_quant/' + ('AssignMinEma'
+                                     if use_ema else 'AssignMinLast'),
+        scope + '/weights_quant/' + ('AssignMaxEma'
+                                     if use_ema else 'AssignMaxLast'),
         scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -415,8 +418,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
       expected_inputs = [
-          scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
-          scope + '/add_fold'
+          scope + '/conv_quant/AssignMinEma',
+          scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
       ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -427,7 +430,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                             quantization_node_name)
     self.assertEqual(act_quant.type, quantization_node_name)
     expected_inputs = [
-        'test/act_quant/min/read', 'test/act_quant/max/read',
+        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
         'test/' + activation_op_name
     ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -518,8 +521,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('min/read' if use_ema else 'Minimum'),
-        scope + '/weights_quant/' + ('max/read' if use_ema else 'Maximum'),
+        scope + '/weights_quant/' + ('AssignMinEma'
+                                     if use_ema else 'AssignMinLast'),
+        scope + '/weights_quant/' + ('AssignMaxEma'
+                                     if use_ema else 'AssignMaxLast'),
         scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -532,8 +537,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
       expected_inputs = [
-          scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
-          scope + '/add_fold'
+          scope + '/conv_quant/AssignMinEma',
+          scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
       ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -544,7 +549,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                             quantization_node_name)
     self.assertEqual(act_quant.type, quantization_node_name)
     expected_inputs = [
-        'test/act_quant/min/read', 'test/act_quant/max/read',
+        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
         'test/' + activation_op_name
     ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
@@ -639,8 +644,10 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                 quantization_node_name)
     self.assertEqual(weights_quant.type, quantization_node_name)
     expected_inputs = [
-        scope + '/weights_quant/' + ('min/read' if use_ema else 'Minimum'),
-        scope + '/weights_quant/' + ('max/read' if use_ema else 'Maximum'),
+        scope + '/weights_quant/' + ('AssignMinEma'
+                                     if use_ema else 'AssignMinLast'),
+        scope + '/weights_quant/' + ('AssignMaxEma'
+                                     if use_ema else 'AssignMaxLast'),
         scope + '/mul_fold'
     ]
     self._AssertInputOpsAre(weights_quant, expected_inputs)
@@ -653,8 +660,8 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                                quantization_node_name)
       self.assertEqual(conv_quant.type, quantization_node_name)
       expected_inputs = [
-          scope + '/conv_quant/min/read', scope + '/conv_quant/max/read',
-          scope + '/add_fold'
+          scope + '/conv_quant/AssignMinEma',
+          scope + '/conv_quant/AssignMaxEma', scope + '/add_fold'
       ]
       self._AssertInputOpsAre(conv_quant, expected_inputs)
       output_op_name = (scope + '/conv_quant/delayed_quant/Switch_1'
@@ -665,7 +672,7 @@ class QuantizeTest(test_util.TensorFlowTestCase):
                                             quantization_node_name)
     self.assertEqual(act_quant.type, quantization_node_name)
     expected_inputs = [
-        'test/act_quant/min/read', 'test/act_quant/max/read',
+        'test/act_quant/AssignMinEma', 'test/act_quant/AssignMaxEma',
         'test/' + activation_op_name
     ]
     self._AssertInputOpsAre(act_quant, expected_inputs)
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 4a82eac1978cf834732e339e4e76a4507b9a090c..1e4dd7cf67dbfbd16386fd740c7dcc83e05ad82a 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -23,11 +23,14 @@ from tensorflow.contrib.quantize.python import quantize
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import googletest
 
 conv2d = layers.conv2d
+separable_conv2d = layers.separable_conv2d
 
 
 class QuantizeTest(test_util.TensorFlowTestCase):
@@ -52,6 +55,53 @@ class QuantizeTest(test_util.TensorFlowTestCase):
     self.assertEqual(
         str(err.exception), 'Some inputs not quantized for ops: [Relu6]')
 
+  def testInsertQuantOpForAddAfterConv2d(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      input2 = array_ops.zeros((batch_size, height / 2, width / 2, 32))
+      conv = conv2d(input1, 32, [5, 5], stride=2, padding='SAME',
+                    weights_initializer=self._WeightInit(0.09),
+                    activation_fn=None, scope='test/test')
+      node = math_ops.add(conv, input2, name='test/add')
+      node = array_ops.identity(node, name='test/identity')
+      update_barrier = control_flow_ops.no_op(name='update_barrier')
+      with ops.control_dependencies([update_barrier]):
+        array_ops.identity(node, name='control_dependency')
+
+    quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
+                      activation_bits=8)
+
+    quantization_node_name = 'FakeQuantWithMinMaxVars'
+    add_quant = graph.get_operation_by_name('test/add_quant/' +
+                                            quantization_node_name)
+    self.assertEqual(add_quant.type, quantization_node_name)
+
+  def testInsertQuantOpForAddAfterSeparableConv2d(self):
+    graph = ops.Graph()
+    with graph.as_default():
+      batch_size, height, width, depth = 5, 128, 128, 3
+      input1 = array_ops.zeros((batch_size, height, width, depth))
+      input2 = array_ops.zeros((batch_size, height / 2, width / 2, depth))
+      conv = separable_conv2d(input1, None, [5, 5], stride=2,
+                              depth_multiplier=1.0, padding='SAME',
+                              weights_initializer=self._WeightInit(0.09),
+                              activation_fn=None, scope='test/test')
+      node = math_ops.add(conv, input2, name='test/add')
+      node = array_ops.identity(node, name='test/identity')
+      update_barrier = control_flow_ops.no_op(name='update_barrier')
+      with ops.control_dependencies([update_barrier]):
+        array_ops.identity(node, name='control_dependency')
+
+    quantize.Quantize(graph=graph, weight_bits=8, weight_narrow_range=True,
+                      activation_bits=8)
+
+    quantization_node_name = 'FakeQuantWithMinMaxVars'
+    add_quant = graph.get_operation_by_name('test/add_quant/' +
+                                            quantization_node_name)
+    self.assertEqual(add_quant.type, quantization_node_name)
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/resampler/kernels/resampler_ops.cc b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
index 7d9ef14cefc578e9401d95db9a625428cc0e2605..e02c1b6a2bd9daf9e1f81059f7c1f92106cebc8f 100644
--- a/tensorflow/contrib/resampler/kernels/resampler_ops.cc
+++ b/tensorflow/contrib/resampler/kernels/resampler_ops.cc
@@ -406,10 +406,10 @@ class ResamplerGradOp : public ::tensorflow::OpKernel {
                                    data_channels);
     OP_REQUIRES(ctx, grad_output_shape == resampler_output_shape,
                 ::tensorflow::errors::InvalidArgument(
-                   "grad_output shape is not consistent with data and warp "
-                   "shapes; it should be ",
-                   resampler_output_shape.DebugString(), " but is ",
-                   grad_output_shape.DebugString()))
+                    "grad_output shape is not consistent with data and warp "
+                    "shapes; it should be ",
+                    resampler_output_shape.DebugString(), " but is ",
+                    grad_output_shape.DebugString()));
     const int num_sampling_points = warp.NumElements() / batch_size / 2;
     ::tensorflow::Tensor* grad_data = nullptr;
     ::tensorflow::Tensor* grad_warp = nullptr;
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index b70a5bbcd107b4c21e09c6d01a2e461fa9edd250..7e5e35d0b55c97946c022e55180765d982eaa87a 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -188,6 +188,8 @@ tf_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
@@ -227,9 +229,7 @@ tf_custom_op_library(
         "kernels/lstm_ops_gpu.cu.cc",
         "kernels/lstm_ops.h",
     ],
-    deps = [
-        "//tensorflow/core/kernels:eigen_helpers",
-    ],
+    deps = ["//tensorflow/core/kernels:eigen_helpers"],
 )
 
 tf_gen_op_wrapper_py(
@@ -251,9 +251,7 @@ tf_custom_op_library(
         "kernels/gru_ops_gpu.cu.cc",
         "kernels/gru_ops.h",
     ],
-    deps = [
-        "//tensorflow/core/kernels:eigen_helpers",
-    ],
+    deps = ["//tensorflow/core/kernels:eigen_helpers"],
 )
 
 tf_gen_op_wrapper_py(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index f7dd2cec4f9a898afdc350368a1ebf16be911363..e47755e2fefd7f1ab9ffb26d62febd8e4ad59b2b 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -22,10 +22,9 @@ import functools
 
 import numpy as np
 
-# TODO(ebrevdo): Remove once _linear is fully deprecated.
-# pylint: disable=protected-access
-
 from tensorflow.contrib import rnn as contrib_rnn
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell
+from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -40,10 +39,12 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
+from tensorflow.python.framework import test_util
+from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 
 
 # pylint: enable=protected-access
-Linear = rnn_cell_impl._Linear  # pylint: disable=invalid-name
+Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 
 
 class RNNCellTest(test.TestCase):
@@ -127,8 +128,8 @@ class RNNCellTest(test.TestCase):
         self.assertAllClose(res[0], [[0.175991, 0.175991]])
       with variable_scope.variable_scope(
           "other", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros(
-            [1, 3])  # Test GRUCell with input_size != num_units.
+        # Test GRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
         m = array_ops.zeros([1, 2])
         g, _ = rnn_cell_impl.GRUCell(2)(x, m)
         sess.run([variables_lib.global_variables_initializer()])
@@ -374,6 +375,46 @@ class RNNCellTest(test.TestCase):
       self.assertEquals(variables[2].op.name,
                         "root/lstm_cell/projection/kernel")
 
+  def testLSTMCellLayerNorm(self):
+    with self.test_session() as sess:
+      num_units = 2
+      num_proj = 3
+      batch_size = 1
+      input_size = 4
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        c = array_ops.zeros([batch_size, num_units])
+        h = array_ops.zeros([batch_size, num_proj])
+        state = rnn_cell_impl.LSTMStateTuple(c, h)
+        cell = contrib_rnn_cell.LayerNormLSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            layer_norm=True,
+            norm_gain=1.0,
+            norm_shift=0.0)
+        g, out_m = cell(x, state)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.ones((batch_size, input_size)),
+                c.name: 0.1 * np.ones((batch_size, num_units)),
+                h.name: 0.1 * np.ones((batch_size, num_proj))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1][0].shape, (batch_size, num_units))
+        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
+
   def testOutputProjectionWrapper(self):
     with self.test_session() as sess:
       with variable_scope.variable_scope(
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index ebd4564f1204cd69527633e16e67cda3f3a8407e..46823fa3643c5b4a3d857fa38d1a70792d97ca40 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -995,26 +996,19 @@ class RNNCellTest(test.TestCase):
         output, state = cell(x, hidden)
 
         sess.run([variables.global_variables_initializer()])
-        res = sess.run([output, state], {
-            hidden[0].name:
-                np.array([[[[[1.],[1.]], 
-                            [[1.],[1.]]],
-                           [[[1.],[1.]],
-                            [[1.],[1.]]]], 
-                          [[[[2.],[2.]],
-                            [[2.],[2.]]],
-                           [[[2.],[2.]],
-                            [[2.],[2.]]]]]),
-            x.name:
-                np.array([[[[[1.],[1.]],
-                            [[1.],[1.]]],
-                           [[[1.],[1.]],
-                            [[1.],[1.]]]],
-                          [[[[2.],[2.]],
-                            [[2.],[2.]]],
-                           [[[2.],[2.]],
-                           [[2.],[2.]]]]])
-        })
+        res = sess.run(
+            [output, state], {
+                hidden[0].name:
+                    np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
+                        1.
+                    ], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]],
+                                 [[[2.], [2.]], [[2.], [2.]]]]]),
+                x.name:
+                    np.array([[[[[1.], [1.]], [[1.], [1.]]], [[[1.], [1.]], [[
+                        1.
+                    ], [1.]]]], [[[[2.], [2.]], [[2.], [2.]]], [[[2.], [2.]],
+                                                                [[2.], [2.]]]]])
+            })
         # This is a smoke test, making sure expected values are unchanged.
         self.assertEqual(len(res), 2)
         self.assertAllClose(res[0], res[1].h)
@@ -1275,6 +1269,47 @@ class LayerNormBasicLSTMCellTest(test.TestCase):
         self.assertAllClose(res[2].c, expected_c1, 1e-5)
         self.assertAllClose(res[2].h, expected_h1, 1e-5)
 
+  def testBasicLSTMCellWithStateTupleLayerNorm(self):
+    """The results of LSTMCell and LayerNormBasicLSTMCell should be the same."""
+    with self.test_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        c0 = array_ops.zeros([1, 2])
+        h0 = array_ops.zeros([1, 2])
+        state0 = rnn_cell_impl.LSTMStateTuple(c0, h0)
+        c1 = array_ops.zeros([1, 2])
+        h1 = array_ops.zeros([1, 2])
+        state1 = rnn_cell_impl.LSTMStateTuple(c1, h1)
+        cell = rnn_cell_impl.MultiRNNCell([
+            contrib_rnn_cell.LayerNormLSTMCell(
+                2, layer_norm=True, norm_gain=1.0, norm_shift=0.0)
+            for _ in range(2)
+        ])
+        h, (s0, s1) = cell(x, (state0, state1))
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run(
+            [h, s0, s1], {
+                x.name: np.array([[1., 1.]]),
+                c0.name: 0.1 * np.asarray([[0, 1]]),
+                h0.name: 0.1 * np.asarray([[2, 3]]),
+                c1.name: 0.1 * np.asarray([[4, 5]]),
+                h1.name: 0.1 * np.asarray([[6, 7]]),
+            })
+
+        expected_h = np.array([[-0.38079708, 0.38079708]])
+        expected_h0 = np.array([[-0.38079708, 0.38079708]])
+        expected_c0 = np.array([[-1.0, 1.0]])
+        expected_h1 = np.array([[-0.38079708, 0.38079708]])
+        expected_c1 = np.array([[-1.0, 1.0]])
+
+        self.assertEqual(len(res), 3)
+        self.assertAllClose(res[0], expected_h, 1e-5)
+        self.assertAllClose(res[1].c, expected_c0, 1e-5)
+        self.assertAllClose(res[1].h, expected_h0, 1e-5)
+        self.assertAllClose(res[2].c, expected_c1, 1e-5)
+        self.assertAllClose(res[2].h, expected_h1, 1e-5)
+
   def testBasicLSTMCellWithDropout(self):
 
     def _is_close(x, y, digits=4):
diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
index f877e4dacbf23df51e0f9231de60443bdce7b42c..8109ebc718353300f94536c5d7ae3332da584a1d 100644
--- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell.py
@@ -24,17 +24,169 @@ from __future__ import print_function
 
 import math
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
 
-RNNCell = rnn_cell_impl.RNNCell  # pylint: disable=invalid-name
-_Linear = rnn_cell_impl._Linear  # pylint: disable=invalid-name, protected-access
-_like_rnncell = rnn_cell_impl._like_rnncell  # pylint: disable=invalid-name, protected-access
+
+# pylint: disable=protected-access,invalid-name
+RNNCell = rnn_cell_impl.RNNCell
+_like_rnncell = rnn_cell_impl._like_rnncell
+_WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME
+_BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME
+# pylint: enable=protected-access,invalid-name
+
+
+class _Linear(object):
+  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
+
+  Args:
+    args: a 2D Tensor or a list of 2D, batch, n, Tensors.
+    output_size: int, second dimension of weight variable.
+    dtype: data type for variables.
+    build_bias: boolean, whether to build a bias variable.
+    bias_initializer: starting value to initialize the bias
+      (default is all zeros).
+    kernel_initializer: starting value to initialize the weight.
+
+  Raises:
+    ValueError: if inputs_shape is wrong.
+  """
+
+  def __init__(self,
+               args,
+               output_size,
+               build_bias,
+               bias_initializer=None,
+               kernel_initializer=None):
+    self._build_bias = build_bias
+
+    if args is None or (nest.is_sequence(args) and not args):
+      raise ValueError("`args` must be specified")
+    if not nest.is_sequence(args):
+      args = [args]
+      self._is_sequence = False
+    else:
+      self._is_sequence = True
+
+    # Calculate the total size of arguments on dimension 1.
+    total_arg_size = 0
+    shapes = [a.get_shape() for a in args]
+    for shape in shapes:
+      if shape.ndims != 2:
+        raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+      if shape[1].value is None:
+        raise ValueError("linear expects shape[1] to be provided for shape %s, "
+                         "but saw %s" % (shape, shape[1]))
+      else:
+        total_arg_size += shape[1].value
+
+    dtype = [a.dtype for a in args][0]
+
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope) as outer_scope:
+      self._weights = vs.get_variable(
+          _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
+          dtype=dtype,
+          initializer=kernel_initializer)
+      if build_bias:
+        with vs.variable_scope(outer_scope) as inner_scope:
+          inner_scope.set_partitioner(None)
+          if bias_initializer is None:
+            bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
+          self._biases = vs.get_variable(
+              _BIAS_VARIABLE_NAME, [output_size],
+              dtype=dtype,
+              initializer=bias_initializer)
+
+  def __call__(self, args):
+    if not self._is_sequence:
+      args = [args]
+
+    if len(args) == 1:
+      res = math_ops.matmul(args[0], self._weights)
+    else:
+      # Explicitly creating a one for a minor performance improvement.
+      one = constant_op.constant(1, dtype=dtypes.int32)
+      res = math_ops.matmul(array_ops.concat(args, one), self._weights)
+    if self._build_bias:
+      res = nn_ops.bias_add(res, self._biases)
+    return res
+
+
+# TODO(xpan): Remove this function in a follow up.
+def _linear(args,
+            output_size,
+            bias,
+            bias_initializer=None,
+            kernel_initializer=None):
+  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
+
+  Args:
+    args: a 2D Tensor or a list of 2D, batch, n, Tensors.
+    output_size: int, second dimension of W[i].
+    bias: boolean, whether to add a bias term or not.
+    bias_initializer: starting value to initialize the bias
+      (default is all zeros).
+    kernel_initializer: starting value to initialize the weight.
+
+  Returns:
+    A 2D Tensor with shape `[batch, output_size]` equal to
+    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
+
+  Raises:
+    ValueError: if some of the arguments has unspecified or wrong shape.
+  """
+  if args is None or (nest.is_sequence(args) and not args):
+    raise ValueError("`args` must be specified")
+  if not nest.is_sequence(args):
+    args = [args]
+
+  # Calculate the total size of arguments on dimension 1.
+  total_arg_size = 0
+  shapes = [a.get_shape() for a in args]
+  for shape in shapes:
+    if shape.ndims != 2:
+      raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+    if shape[1].value is None:
+      raise ValueError("linear expects shape[1] to be provided for shape %s, "
+                       "but saw %s" % (shape, shape[1]))
+    else:
+      total_arg_size += shape[1].value
+
+  dtype = [a.dtype for a in args][0]
+
+  # Now the computation.
+  scope = vs.get_variable_scope()
+  with vs.variable_scope(scope) as outer_scope:
+    weights = vs.get_variable(
+        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
+        dtype=dtype,
+        initializer=kernel_initializer)
+    if len(args) == 1:
+      res = math_ops.matmul(args[0], weights)
+    else:
+      res = math_ops.matmul(array_ops.concat(args, 1), weights)
+    if not bias:
+      return res
+    with vs.variable_scope(outer_scope) as inner_scope:
+      inner_scope.set_partitioner(None)
+      if bias_initializer is None:
+        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
+      biases = vs.get_variable(
+          _BIAS_VARIABLE_NAME, [output_size],
+          dtype=dtype,
+          initializer=bias_initializer)
+    return nn_ops.bias_add(res, biases)
 
 
 class EmbeddingWrapper(RNNCell):
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 6702a89d22283be691deb11339d3374bf8c4fd93..91cb04daedf07ed60ff0a2c722c108ffb783a41b 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -23,6 +23,7 @@ import math
 
 from tensorflow.contrib.compiler import jit
 from tensorflow.contrib.layers.python.layers import layers
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
@@ -35,6 +36,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
@@ -75,6 +77,18 @@ def _get_sharded_variable(name, shape, dtype, num_shards):
   return shards
 
 
+def _norm(g, b, inp, scope):
+  shape = inp.get_shape()[-1:]
+  gamma_init = init_ops.constant_initializer(g)
+  beta_init = init_ops.constant_initializer(b)
+  with vs.variable_scope(scope):
+    # Initialize beta and gamma for use by layer_norm.
+    vs.get_variable("gamma", shape=shape, initializer=gamma_init)
+    vs.get_variable("beta", shape=shape, initializer=beta_init)
+  normalized = layers.layer_norm(inp, reuse=True, scope=scope)
+  return normalized
+
+
 class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
@@ -101,13 +115,32 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
 
   The class uses optional peep-hole connections, and an optional projection
   layer.
+  Layer normalization implementation is based on:
+
+    https://arxiv.org/abs/1607.06450.
+
+  "Layer Normalization"
+  Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+
+  and is applied before the internal nonlinearities.
+
   """
 
-  def __init__(self, num_units, use_peepholes=False,
-               initializer=None, num_proj=None, proj_clip=None,
-               num_unit_shards=1, num_proj_shards=1,
-               forget_bias=1.0, state_is_tuple=True,
-               activation=math_ops.tanh, reuse=None):
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               num_unit_shards=1,
+               num_proj_shards=1,
+               forget_bias=1.0,
+               state_is_tuple=True,
+               activation=math_ops.tanh,
+               reuse=None,
+               layer_norm=False,
+               norm_gain=1.0,
+               norm_shift=0.0):
     """Initialize the parameters for an LSTM cell.
 
     Args:
@@ -134,6 +167,11 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
+      layer_norm: If `True`, layer normalization will be applied.
+      norm_gain: float, The layer normalization gain initial value. If
+        `layer_norm` has been set to `False`, this argument will be ignored.
+      norm_shift: float, The layer normalization shift initial value. If
+        `layer_norm` has been set to `False`, this argument will be ignored.
     """
     super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse)
     if not state_is_tuple:
@@ -151,6 +189,9 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
     self._state_is_tuple = state_is_tuple
     self._activation = activation
     self._reuse = reuse
+    self._layer_norm = layer_norm
+    self._norm_gain = norm_gain
+    self._norm_shift = norm_shift
 
     if num_proj:
       self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj)
@@ -219,9 +260,20 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
 
     # j = new_input, f = forget_gate, o = output_gate
     cell_inputs = array_ops.concat([inputs, m_prev], 1)
-    lstm_matrix = nn_ops.bias_add(math_ops.matmul(cell_inputs, concat_w), b)
+    lstm_matrix = math_ops.matmul(cell_inputs, concat_w)
+
+    # If layer nomalization is applied, do not add bias
+    if not self._layer_norm:
+      lstm_matrix = nn_ops.bias_add(lstm_matrix, b)
+
     j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=3, axis=1)
 
+    # Apply layer normalization
+    if self._layer_norm:
+      j = _norm(self._norm_gain, self._norm_shift, j, "transform")
+      f = _norm(self._norm_gain, self._norm_shift, f, "forget")
+      o = _norm(self._norm_gain, self._norm_shift, o, "output")
+
     # Diagonal connections
     if self._use_peepholes:
       w_f_diag = vs.get_variable(
@@ -235,6 +287,10 @@ class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell):
       f_act = sigmoid(f + self._forget_bias)
     c = (f_act * c_prev + (1 - f_act) * self._activation(j))
 
+    # Apply layer normalization
+    if self._layer_norm:
+      c = _norm(self._norm_gain, self._norm_shift, c, "state")
+
     if self._use_peepholes:
       m = sigmoid(o + w_o_diag * c) * self._activation(c)
     else:
@@ -1017,7 +1073,7 @@ class BidirectionalGridLSTMCell(GridLSTMCell):
 
 
 # pylint: disable=protected-access
-_Linear = rnn_cell_impl._Linear  # pylint: disable=invalid-name
+_Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
 # pylint: enable=protected-access
 
 
@@ -1300,8 +1356,8 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     self._keep_prob = dropout_keep_prob
     self._seed = dropout_prob_seed
     self._layer_norm = layer_norm
-    self._g = norm_gain
-    self._b = norm_shift
+    self._norm_gain = norm_gain
+    self._norm_shift = norm_shift
     self._reuse = reuse
 
   @property
@@ -1312,24 +1368,25 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._num_units
 
-  def _norm(self, inp, scope):
+  def _norm(self, inp, scope, dtype=dtypes.float32):
     shape = inp.get_shape()[-1:]
-    gamma_init = init_ops.constant_initializer(self._g)
-    beta_init = init_ops.constant_initializer(self._b)
+    gamma_init = init_ops.constant_initializer(self._norm_gain)
+    beta_init = init_ops.constant_initializer(self._norm_shift)
     with vs.variable_scope(scope):
       # Initialize beta and gamma for use by layer_norm.
-      vs.get_variable("gamma", shape=shape, initializer=gamma_init)
-      vs.get_variable("beta", shape=shape, initializer=beta_init)
+      vs.get_variable("gamma", shape=shape, initializer=gamma_init, dtype=dtype)
+      vs.get_variable("beta", shape=shape, initializer=beta_init, dtype=dtype)
     normalized = layers.layer_norm(inp, reuse=True, scope=scope)
     return normalized
 
   def _linear(self, args):
     out_size = 4 * self._num_units
     proj_size = args.get_shape()[-1]
-    weights = vs.get_variable("kernel", [proj_size, out_size])
+    dtype = args.dtype
+    weights = vs.get_variable("kernel", [proj_size, out_size], dtype=dtype)
     out = math_ops.matmul(args, weights)
     if not self._layer_norm:
-      bias = vs.get_variable("bias", [out_size])
+      bias = vs.get_variable("bias", [out_size], dtype=dtype)
       out = nn_ops.bias_add(out, bias)
     return out
 
@@ -1338,13 +1395,14 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     c, h = state
     args = array_ops.concat([inputs, h], 1)
     concat = self._linear(args)
+    dtype = args.dtype
 
     i, j, f, o = array_ops.split(value=concat, num_or_size_splits=4, axis=1)
     if self._layer_norm:
-      i = self._norm(i, "input")
-      j = self._norm(j, "transform")
-      f = self._norm(f, "forget")
-      o = self._norm(o, "output")
+      i = self._norm(i, "input", dtype=dtype)
+      j = self._norm(j, "transform", dtype=dtype)
+      f = self._norm(f, "forget", dtype=dtype)
+      o = self._norm(o, "output", dtype=dtype)
 
     g = self._activation(j)
     if (not isinstance(self._keep_prob, float)) or self._keep_prob < 1:
@@ -1353,7 +1411,7 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     new_c = (c * math_ops.sigmoid(f + self._forget_bias)
              + math_ops.sigmoid(i) * g)
     if self._layer_norm:
-      new_c = self._norm(new_c, "state")
+      new_c = self._norm(new_c, "state", dtype=dtype)
     new_h = self._activation(new_c) * math_ops.sigmoid(o)
 
     new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_h)
@@ -1997,8 +2055,8 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     if self._skip_connection:
       self._total_output_channels += self._input_shape[-1]
 
-    state_size = tensor_shape.TensorShape(self._input_shape[:-1] 
-                                          + [self._output_channels])
+    state_size = tensor_shape.TensorShape(
+        self._input_shape[:-1] + [self._output_channels])
     self._state_size = rnn_cell_impl.LSTMStateTuple(state_size, state_size)
     self._output_size = tensor_shape.TensorShape(self._input_shape[:-1]
                                                  + [self._total_output_channels])
@@ -2058,14 +2116,11 @@ class Conv3DLSTMCell(ConvLSTMCell):
     """Construct Conv3DLSTM. See `ConvLSTMCell` for more details."""
     super(Conv3DLSTMCell, self).__init__(conv_ndims=3, **kwargs)
 
-def _conv(args, 
-          filter_size,
-          num_features,
-          bias,
-          bias_start=0.0):
+
+def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   """convolution:
   Args:
-    args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D, 
+    args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
     batch x n, Tensors.
     filter_size: int tuple of filter height and width.
     num_features: int, number of features.
@@ -2159,7 +2214,7 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
         has the given variables, an error is raised.
 
     Raises:
-      ValueError: If `num_units` or `num_proj` is not divisible by 
+      ValueError: If `num_units` or `num_proj` is not divisible by
         `number_of_groups`.
     """
     super(GLSTMCell, self).__init__(_reuse=reuse)
@@ -2305,3 +2360,273 @@ class GLSTMCell(rnn_cell_impl.RNNCell):
 
     new_state = rnn_cell_impl.LSTMStateTuple(c, m)
     return m, new_state
+
+
+class LayerNormLSTMCell(rnn_cell_impl.RNNCell):
+  """Long short-term memory unit (LSTM) recurrent network cell.
+
+  The default non-peephole implementation is based on:
+
+    http://www.bioinf.jku.at/publications/older/2604.pdf
+
+  S. Hochreiter and J. Schmidhuber.
+  "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997.
+
+  The peephole implementation is based on:
+
+    https://research.google.com/pubs/archive/43905.pdf
+
+  Hasim Sak, Andrew Senior, and Francoise Beaufays.
+  "Long short-term memory recurrent neural network architectures for
+   large scale acoustic modeling." INTERSPEECH, 2014.
+
+  The class uses optional peep-hole connections, optional cell clipping, and
+  an optional projection layer.
+
+  Layer normalization implementation is based on:
+
+    https://arxiv.org/abs/1607.06450.
+
+  "Layer Normalization"
+  Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton
+
+  and is applied before the internal nonlinearities.
+
+  """
+
+  def __init__(self,
+               num_units,
+               use_peepholes=False,
+               cell_clip=None,
+               initializer=None,
+               num_proj=None,
+               proj_clip=None,
+               forget_bias=1.0,
+               activation=None,
+               layer_norm=False,
+               norm_gain=1.0,
+               norm_shift=0.0,
+               reuse=None):
+    """Initialize the parameters for an LSTM cell.
+
+    Args:
+      num_units: int, The number of units in the LSTM cell
+      use_peepholes: bool, set True to enable diagonal/peephole connections.
+      cell_clip: (optional) A float value, if provided the cell state is clipped
+        by this value prior to the cell output activation.
+      initializer: (optional) The initializer to use for the weight and
+        projection matrices.
+      num_proj: (optional) int, The output dimensionality for the projection
+        matrices.  If None, no projection is performed.
+      proj_clip: (optional) A float value.  If `num_proj > 0` and `proj_clip` is
+        provided, then the projected values are clipped elementwise to within
+        `[-proj_clip, proj_clip]`.
+      forget_bias: Biases of the forget gate are initialized by default to 1
+        in order to reduce the scale of forgetting at the beginning of
+        the training. Must set it manually to `0.0` when restoring from
+        CudnnLSTM trained checkpoints.
+      activation: Activation function of the inner states.  Default: `tanh`.
+      layer_norm: If `True`, layer normalization will be applied.
+      norm_gain: float, The layer normalization gain initial value. If
+        `layer_norm` has been set to `False`, this argument will be ignored.
+      norm_shift: float, The layer normalization shift initial value. If
+        `layer_norm` has been set to `False`, this argument will be ignored.
+      reuse: (optional) Python boolean describing whether to reuse variables
+        in an existing scope.  If not `True`, and the existing scope already has
+        the given variables, an error is raised.
+
+      When restoring from CudnnLSTM-trained checkpoints, must use
+      CudnnCompatibleLSTMCell instead.
+    """
+    super(LayerNormLSTMCell, self).__init__(_reuse=reuse)
+
+    self._num_units = num_units
+    self._use_peepholes = use_peepholes
+    self._cell_clip = cell_clip
+    self._initializer = initializer
+    self._num_proj = num_proj
+    self._proj_clip = proj_clip
+    self._forget_bias = forget_bias
+    self._activation = activation or math_ops.tanh
+    self._layer_norm = layer_norm
+    self._norm_gain = norm_gain
+    self._norm_shift = norm_shift
+
+    if num_proj:
+      self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj))
+      self._output_size = num_proj
+    else:
+      self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units))
+      self._output_size = num_units
+
+  @property
+  def state_size(self):
+    return self._state_size
+
+  @property
+  def output_size(self):
+    return self._output_size
+
+  def _linear(self,
+              args,
+              output_size,
+              bias,
+              bias_initializer=None,
+              kernel_initializer=None,
+              layer_norm=False):
+    """Linear map: sum_i(args[i] * W[i]), where W[i] is a Variable.
+
+    Args:
+      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
+      output_size: int, second dimension of W[i].
+      bias: boolean, whether to add a bias term or not.
+      bias_initializer: starting value to initialize the bias
+        (default is all zeros).
+      kernel_initializer: starting value to initialize the weight.
+      layer_norm: boolean, whether to apply layer normalization.
+
+
+    Returns:
+      A 2D Tensor with shape [batch x output_size] taking value
+      sum_i(args[i] * W[i]), where each W[i] is a newly created Variable.
+
+    Raises:
+      ValueError: if some of the arguments has unspecified or wrong shape.
+    """
+    if args is None or (nest.is_sequence(args) and not args):
+      raise ValueError("`args` must be specified")
+    if not nest.is_sequence(args):
+      args = [args]
+
+    # Calculate the total size of arguments on dimension 1.
+    total_arg_size = 0
+    shapes = [a.get_shape() for a in args]
+    for shape in shapes:
+      if shape.ndims != 2:
+        raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+      if shape[1].value is None:
+        raise ValueError("linear expects shape[1] to be provided for shape %s, "
+                         "but saw %s" % (shape, shape[1]))
+      else:
+        total_arg_size += shape[1].value
+
+    dtype = [a.dtype for a in args][0]
+
+    # Now the computation.
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope) as outer_scope:
+      weights = vs.get_variable(
+          "kernel", [total_arg_size, output_size],
+          dtype=dtype,
+          initializer=kernel_initializer)
+      if len(args) == 1:
+        res = math_ops.matmul(args[0], weights)
+      else:
+        res = math_ops.matmul(array_ops.concat(args, 1), weights)
+      if not bias:
+        return res
+      with vs.variable_scope(outer_scope) as inner_scope:
+        inner_scope.set_partitioner(None)
+        if bias_initializer is None:
+          bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
+        biases = vs.get_variable(
+            "bias", [output_size], dtype=dtype, initializer=bias_initializer)
+
+    if not layer_norm:
+      res = nn_ops.bias_add(res, biases)
+
+    return res
+
+  def call(self, inputs, state):
+    """Run one step of LSTM.
+
+    Args:
+      inputs: input Tensor, 2D, batch x num_units.
+      state: this must be a tuple of state Tensors,
+       both `2-D`, with column sizes `c_state` and
+        `m_state`.
+
+    Returns:
+      A tuple containing:
+
+      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
+        LSTM after reading `inputs` when previous state was `state`.
+        Here output_dim is:
+           num_proj if num_proj was set,
+           num_units otherwise.
+      - Tensor(s) representing the new state of LSTM after reading `inputs` when
+        the previous state was `state`.  Same type and shape(s) as `state`.
+
+    Raises:
+      ValueError: If input size cannot be inferred from inputs via
+        static shape inference.
+    """
+    sigmoid = math_ops.sigmoid
+
+    (c_prev, m_prev) = state
+
+    dtype = inputs.dtype
+    input_size = inputs.get_shape().with_rank(2)[1]
+    if input_size.value is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+    scope = vs.get_variable_scope()
+    with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+
+      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+      lstm_matrix = self._linear(
+          [inputs, m_prev],
+          4 * self._num_units,
+          bias=True,
+          bias_initializer=None,
+          layer_norm=self._layer_norm)
+      i, j, f, o = array_ops.split(
+          value=lstm_matrix, num_or_size_splits=4, axis=1)
+
+      if self._layer_norm:
+        i = _norm(self._norm_gain, self._norm_shift, i, "input")
+        j = _norm(self._norm_gain, self._norm_shift, j, "transform")
+        f = _norm(self._norm_gain, self._norm_shift, f, "forget")
+        o = _norm(self._norm_gain, self._norm_shift, o, "output")
+
+      # Diagonal connections
+      if self._use_peepholes:
+        with vs.variable_scope(unit_scope):
+          w_f_diag = vs.get_variable(
+              "w_f_diag", shape=[self._num_units], dtype=dtype)
+          w_i_diag = vs.get_variable(
+              "w_i_diag", shape=[self._num_units], dtype=dtype)
+          w_o_diag = vs.get_variable(
+              "w_o_diag", shape=[self._num_units], dtype=dtype)
+
+      if self._use_peepholes:
+        c = (
+            sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
+            sigmoid(i + w_i_diag * c_prev) * self._activation(j))
+      else:
+        c = (
+            sigmoid(f + self._forget_bias) * c_prev +
+            sigmoid(i) * self._activation(j))
+
+      if self._layer_norm:
+        c = _norm(self._norm_gain, self._norm_shift, c, "state")
+
+      if self._cell_clip is not None:
+        # pylint: disable=invalid-unary-operand-type
+        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+        # pylint: enable=invalid-unary-operand-type
+      if self._use_peepholes:
+        m = sigmoid(o + w_o_diag * c) * self._activation(c)
+      else:
+        m = sigmoid(o) * self._activation(c)
+
+      if self._num_proj is not None:
+        with vs.variable_scope("projection"):
+          m = self._linear(m, self._num_proj, bias=False)
+
+        if self._proj_clip is not None:
+          # pylint: disable=invalid-unary-operand-type
+          m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+          # pylint: enable=invalid-unary-operand-type
+
+    new_state = (rnn_cell_impl.LSTMStateTuple(c, m))
+    return m, new_state
diff --git a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
index a8331cbc8f04f74294675d7ceb57412e1f0b6170..d10ec9cf0cad56930ed1e101bf60cea6cad9d7a4 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/reader_test.py
@@ -86,6 +86,13 @@ class ReaderTest(test.TestCase):
       self._init_and_validate_variable(sess, "v", 44)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
 
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags for serving on TPU.
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
@@ -97,7 +104,8 @@ class ReaderTest(test.TestCase):
     builder.save()
 
     actual_tags = reader.get_saved_model_tag_sets(saved_model_dir)
-    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["foo", "bar"]]
+    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["serve", "tpu"],
+                     ["foo", "bar"]]
     self.assertEqual(expected_tags, actual_tags)
 
 
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 91493302b1abb3dd0fbfe824a798e68f83cc9fc7..01a5540121ae9ebf22de0493daadff6c7710d29a 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope as vs
@@ -589,6 +590,24 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testBahdanauMonotonicNormalized')
 
+  def testBahdanauMonotonicHard(self):
+    # Run attention mechanism with mode='hard', make sure probabilities are hard
+    b, t, u, d = 10, 20, 30, 40
+    with self.test_session(use_gpu=True) as sess:
+      a = wrapper.BahdanauMonotonicAttention(
+          d,
+          random_ops.random_normal((b, t, u)),
+          mode='hard')
+      # Just feed previous attention as [1, 0, 0, ...]
+      attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
+      sess.run(variables.global_variables_initializer())
+      attn_out = attn.eval()
+      # All values should be 0 or 1
+      self.assertTrue(np.all(np.logical_or(attn_out == 0, attn_out == 1)))
+      # Sum of distributions should be 0 or 1 (0 when all p_choose_i are 0)
+      self.assertTrue(np.all(np.logical_or(attn_out.sum(axis=1) == 1,
+                                           attn_out.sum(axis=1) == 0)))
+
   def testLuongMonotonicNotNormalized(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongMonotonicAttention, sigmoid_noise=1.0,
@@ -695,6 +714,24 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testLuongMonotonicHard(self):
+    # Run attention mechanism with mode='hard', make sure probabilities are hard
+    b, t, u, d = 10, 20, 30, 40
+    with self.test_session(use_gpu=True) as sess:
+      a = wrapper.LuongMonotonicAttention(
+          d,
+          random_ops.random_normal((b, t, u)),
+          mode='hard')
+      # Just feed previous attention as [1, 0, 0, ...]
+      attn = a(random_ops.random_normal((b, d)), array_ops.one_hot([0]*b, t))
+      sess.run(variables.global_variables_initializer())
+      attn_out = attn.eval()
+      # All values should be 0 or 1
+      self.assertTrue(np.all(np.logical_or(attn_out == 0, attn_out == 1)))
+      # Sum of distributions should be 0 or 1 (0 when all p_choose_i are 0)
+      self.assertTrue(np.all(np.logical_or(attn_out.sum(axis=1) == 1,
+                                           attn_out.sum(axis=1) == 0)))
+
   def testMultiAttentionNoAttentionLayer(self):
     create_attention_mechanisms = (
         wrapper.BahdanauAttention, wrapper.LuongAttention)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 839df079ee743c67b3eb6180bbf419f07ecb5435..e87ef413880e37e553c604ec8cfbaef307569682 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -149,7 +149,7 @@ class _BaseAttentionMechanism(AttentionMechanism):
                memory_sequence_length=None,
                memory_layer=None,
                check_inner_dims_defined=True,
-               score_mask_value=float("-inf"),
+               score_mask_value=None,
                name=None):
     """Construct base AttentionMechanism class.
 
@@ -187,9 +187,13 @@ class _BaseAttentionMechanism(AttentionMechanism):
           "memory_layer is not a Layer: %s" % type(memory_layer).__name__)
     self._query_layer = query_layer
     self._memory_layer = memory_layer
+    self.dtype = memory_layer.dtype
     if not callable(probability_fn):
       raise TypeError("probability_fn must be callable, saw type: %s" %
                       type(probability_fn).__name__)
+    if score_mask_value is None:
+      score_mask_value = dtypes.as_dtype(
+          self._memory_layer.dtype).as_numpy_dtype(-np.inf)
     self._probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
         probability_fn(
             _maybe_mask_score(score, memory_sequence_length, score_mask_value),
@@ -334,7 +338,8 @@ class LuongAttention(_BaseAttentionMechanism):
                memory_sequence_length=None,
                scale=False,
                probability_fn=None,
-               score_mask_value=float("-inf"),
+               score_mask_value=None,
+               dtype=None,
                name="LuongAttention"):
     """Construct the AttentionMechanism mechanism.
 
@@ -353,17 +358,20 @@ class LuongAttention(_BaseAttentionMechanism):
       score_mask_value: (optional) The mask value for score before passing into
         `probability_fn`. The default is -inf. Only used if
         `memory_sequence_length` is not None.
+      dtype: The data type for the memory layer of the attention mechanism.
       name: Name to use when creating ops.
     """
     # For LuongAttention, we only transform the memory layer; thus
     # num_units **must** match expected the query depth.
     if probability_fn is None:
       probability_fn = nn_ops.softmax
+    if dtype is None:
+      dtype = dtypes.float32
     wrapped_probability_fn = lambda score, _: probability_fn(score)
     super(LuongAttention, self).__init__(
         query_layer=None,
         memory_layer=layers_core.Dense(
-            num_units, name="memory_layer", use_bias=False),
+            num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
@@ -475,7 +483,8 @@ class BahdanauAttention(_BaseAttentionMechanism):
                memory_sequence_length=None,
                normalize=False,
                probability_fn=None,
-               score_mask_value=float("-inf"),
+               score_mask_value=None,
+               dtype=None,
                name="BahdanauAttention"):
     """Construct the Attention mechanism.
 
@@ -494,16 +503,20 @@ class BahdanauAttention(_BaseAttentionMechanism):
       score_mask_value: (optional): The mask value for score before passing into
         `probability_fn`. The default is -inf. Only used if
         `memory_sequence_length` is not None.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
       name: Name to use when creating ops.
     """
     if probability_fn is None:
       probability_fn = nn_ops.softmax
+    if dtype is None:
+      dtype = dtypes.float32
     wrapped_probability_fn = lambda score, _: probability_fn(score)
     super(BahdanauAttention, self).__init__(
         query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False),
+            num_units, name="query_layer", use_bias=False, dtype=dtype),
         memory_layer=layers_core.Dense(
-            num_units, name="memory_layer", use_bias=False),
+            num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
@@ -679,7 +692,11 @@ def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode,
                                      seed=seed)
     score += sigmoid_noise*noise
   # Compute "choosing" probabilities from the attention scores
-  p_choose_i = math_ops.sigmoid(score)
+  if mode == "hard":
+    # When mode is hard, use a hard sigmoid
+    p_choose_i = math_ops.cast(score > 0, score.dtype)
+  else:
+    p_choose_i = math_ops.sigmoid(score)
   # Convert from choosing probabilities to attention distribution
   return monotonic_attention(p_choose_i, previous_alignments, mode)
 
@@ -734,11 +751,12 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
                memory,
                memory_sequence_length=None,
                normalize=False,
-               score_mask_value=float("-inf"),
+               score_mask_value=None,
                sigmoid_noise=0.,
                sigmoid_noise_seed=None,
                score_bias_init=0.,
                mode="parallel",
+               dtype=None,
                name="BahdanauMonotonicAttention"):
     """Construct the Attention mechanism.
 
@@ -762,17 +780,21 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
       mode: How to compute the attention distribution.  Must be one of
         'recursive', 'parallel', or 'hard'.  See the docstring for
         `tf.contrib.seq2seq.monotonic_attention` for more information.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
       name: Name to use when creating ops.
     """
     # Set up the monotonic probability fn with supplied parameters
+    if dtype is None:
+      dtype = dtypes.float32
     wrapped_probability_fn = functools.partial(
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
     super(BahdanauMonotonicAttention, self).__init__(
         query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False),
+            num_units, name="query_layer", use_bias=False, dtype=dtype),
         memory_layer=layers_core.Dense(
-            num_units, name="memory_layer", use_bias=False),
+            num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
@@ -830,11 +852,12 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
                memory,
                memory_sequence_length=None,
                scale=False,
-               score_mask_value=float("-inf"),
+               score_mask_value=None,
                sigmoid_noise=0.,
                sigmoid_noise_seed=None,
                score_bias_init=0.,
                mode="parallel",
+               dtype=None,
                name="LuongMonotonicAttention"):
     """Construct the Attention mechanism.
 
@@ -858,17 +881,21 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
       mode: How to compute the attention distribution.  Must be one of
         'recursive', 'parallel', or 'hard'.  See the docstring for
         `tf.contrib.seq2seq.monotonic_attention` for more information.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
       name: Name to use when creating ops.
     """
     # Set up the monotonic probability fn with supplied parameters
+    if dtype is None:
+      dtype = dtypes.float32
     wrapped_probability_fn = functools.partial(
         _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
         seed=sigmoid_noise_seed)
     super(LuongMonotonicAttention, self).__init__(
         query_layer=layers_core.Dense(
-            num_units, name="query_layer", use_bias=False),
+            num_units, name="query_layer", use_bias=False, dtype=dtype),
         memory_layer=layers_core.Dense(
-            num_units, name="memory_layer", use_bias=False),
+            num_units, name="memory_layer", use_bias=False, dtype=dtype),
         memory=memory,
         probability_fn=wrapped_probability_fn,
         memory_sequence_length=memory_sequence_length,
@@ -1119,8 +1146,11 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
             % (len(attention_layer_sizes), len(attention_mechanisms)))
       self._attention_layers = tuple(
           layers_core.Dense(
-              attention_layer_size, name="attention_layer", use_bias=False)
-          for attention_layer_size in attention_layer_sizes)
+              attention_layer_size,
+              name="attention_layer",
+              use_bias=False,
+              dtype=attention_mechanisms[i].dtype)
+          for i, attention_layer_size in enumerate(attention_layer_sizes))
       self._attention_layer_size = sum(attention_layer_sizes)
     else:
       self._attention_layers = None
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index b55d90cbabcc0bb63aaff86ba74c9fa2c6c917cf..dec03ce43f236ba0ebe9a22015e8f67d41bb2164 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -223,8 +223,7 @@ class TrainingHelper(Helper):
 
   def sample(self, time, outputs, name=None, **unused_kwargs):
     with ops.name_scope(name, "TrainingHelperSample", [time, outputs]):
-      sample_ids = math_ops.cast(
-          math_ops.argmax(outputs, axis=-1), dtypes.int32)
+      sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
       return sample_ids
 
   def next_inputs(self, time, outputs, state, name=None, **unused_kwargs):
@@ -540,8 +539,7 @@ class GreedyEmbeddingHelper(Helper):
     if not isinstance(outputs, ops.Tensor):
       raise TypeError("Expected outputs to be a single Tensor, got: %s" %
                       type(outputs))
-    sample_ids = math_ops.cast(
-        math_ops.argmax(outputs, axis=-1), dtypes.int32)
+    sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
     return sample_ids
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index b67090dd509f321c8d28436fa135fb871aee976d..a83fc20596c8ad7e1cf94ede8b10d82e25f47b17 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -12,7 +12,6 @@ py_library(
     srcs = ["__init__.py"] + glob(["python/ops/*.py"]),
     srcs_version = "PY2AND3",
     deps = [
-        ":test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/contrib/signal/__init__.py b/tensorflow/contrib/signal/__init__.py
index 0f2592b0b05722145f1b323ada52fa53e6cdc4ba..6a2080bcec15a7ef29c54cc6394982b2e3709181 100644
--- a/tensorflow/contrib/signal/__init__.py
+++ b/tensorflow/contrib/signal/__init__.py
@@ -20,6 +20,7 @@ See the @{$python/contrib.signal} guide.
 @@hamming_window
 @@hann_window
 @@inverse_stft
+@@inverse_stft_window_fn
 @@mfccs_from_log_mel_spectrograms
 @@linear_to_mel_weight_matrix
 @@overlap_and_add
@@ -44,6 +45,7 @@ from tensorflow.contrib.signal.python.ops.shape_ops import frame
 # Keep an alias to `frames` for backwards compatibility.
 from tensorflow.contrib.signal.python.ops.shape_ops import frame as frames
 from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft
+from tensorflow.contrib.signal.python.ops.spectral_ops import inverse_stft_window_fn
 from tensorflow.contrib.signal.python.ops.spectral_ops import stft
 from tensorflow.contrib.signal.python.ops.window_ops import hamming_window
 from tensorflow.contrib.signal.python.ops.window_ops import hann_window
diff --git a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
index 72d317dc418d313c1c59ac12019a0eee48261fe4..03d6da7765ba5249a9fb22f56a469cf07c310479 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/spectral_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.signal.python.ops import spectral_ops
+from tensorflow.contrib.signal.python.ops import window_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
@@ -114,31 +115,6 @@ class SpectralOpsTest(test.TestCase):
       self.assertAllClose(
           expected_inverse_stft, actual_inverse_stft, 1e-4, 1e-4)
 
-  def _compare_round_trip(self, signal, frame_length, frame_step, fft_length):
-    with spectral_ops_test_util.fft_kernel_label_map(), (
-        self.test_session(use_gpu=True)) as sess:
-      stft = spectral_ops.stft(signal, frame_length, frame_step, fft_length,
-                               pad_end=False)
-      inverse_stft = spectral_ops.inverse_stft(stft, frame_length, frame_step,
-                                               fft_length)
-      signal, inverse_stft = sess.run([signal, inverse_stft])
-
-      # Since the shapes can differ due to padding, pad both signals to the max
-      # of their lengths.
-      max_length = max(signal.shape[0], inverse_stft.shape[0])
-      signal = np.pad(signal, (0, max_length - signal.shape[0]), "constant")
-      inverse_stft = np.pad(inverse_stft,
-                            (0, max_length - inverse_stft.shape[0]), "constant")
-
-      # Ignore the frame_length samples at either edge.
-      start = frame_length
-      end = signal.shape[0] - frame_length
-      ratio = signal[start:end] / inverse_stft[start:end]
-
-      # Check that the inverse and original signal are equal up to a constant
-      # factor.
-      self.assertLess(np.var(ratio), 2e-5)
-
   def test_shapes(self):
     with spectral_ops_test_util.fft_kernel_label_map(), (
         self.test_session(use_gpu=True)):
@@ -191,23 +167,105 @@ class SpectralOpsTest(test.TestCase):
       self._compare(signal, frame_length, frame_step, fft_length)
 
   def test_stft_round_trip(self):
-    # Tuples of (signal_length, frame_length, frame_step, fft_length).
+    # Tuples of (signal_length, frame_length, frame_step, fft_length,
+    # threshold, corrected_threshold).
     test_configs = [
         # 87.5% overlap.
-        (4096, 256, 32, 256),
+        (4096, 256, 32, 256, 1e-5, 1e-6),
         # 75% overlap.
-        (4096, 256, 64, 256),
+        (4096, 256, 64, 256, 1e-5, 1e-6),
         # Odd frame hop.
-        (4096, 128, 25, 128),
+        (4096, 128, 25, 128, 1e-3, 1e-6),
         # Odd frame length.
-        (4096, 127, 32, 128),
+        (4096, 127, 32, 128, 1e-3, 1e-6),
+        # 50% overlap.
+        (4096, 128, 64, 128, 0.40, 1e-6),
     ]
 
-    for signal_length, frame_length, frame_step, fft_length in test_configs:
-      # Generate a 440Hz signal at 8kHz sample rate.
-      signal = math_ops.sin(2 * np.pi * 440 / 8000 *
-                            math_ops.to_float(math_ops.range(signal_length)))
-      self._compare_round_trip(signal, frame_length, frame_step, fft_length)
+    for (signal_length, frame_length, frame_step, fft_length, threshold,
+         corrected_threshold) in test_configs:
+      # Generate a random white Gaussian signal.
+      signal = random_ops.random_normal([signal_length])
+
+      with spectral_ops_test_util.fft_kernel_label_map(), (
+          self.test_session(use_gpu=True)) as sess:
+        stft = spectral_ops.stft(signal, frame_length, frame_step, fft_length,
+                                 pad_end=False)
+        inverse_stft = spectral_ops.inverse_stft(stft, frame_length, frame_step,
+                                                 fft_length)
+        inverse_stft_corrected = spectral_ops.inverse_stft(
+            stft, frame_length, frame_step, fft_length,
+            window_fn=spectral_ops.inverse_stft_window_fn(frame_step))
+        signal, inverse_stft, inverse_stft_corrected = sess.run(
+            [signal, inverse_stft, inverse_stft_corrected])
+
+        # Truncate signal to the size of inverse stft.
+        signal = signal[:inverse_stft.shape[0]]
+
+        # Ignore the frame_length samples at either edge.
+        signal = signal[frame_length:-frame_length]
+        inverse_stft = inverse_stft[frame_length:-frame_length]
+        inverse_stft_corrected = inverse_stft_corrected[
+            frame_length:-frame_length]
+
+        # Check that the inverse and original signal are close up to a scale
+        # factor.
+        inverse_stft_scaled = inverse_stft / np.mean(np.abs(inverse_stft))
+        signal_scaled = signal / np.mean(np.abs(signal))
+        self.assertLess(np.std(inverse_stft_scaled - signal_scaled), threshold)
+
+        # Check that the inverse with correction and original signal are close.
+        self.assertLess(np.std(inverse_stft_corrected - signal),
+                        corrected_threshold)
+
+  def test_inverse_stft_window_fn(self):
+    """Test that inverse_stft_window_fn has unit gain at each window phase."""
+    # Tuples of (frame_length, frame_step).
+    test_configs = [
+        (256, 32),
+        (256, 64),
+        (128, 25),
+        (127, 32),
+        (128, 64),
+    ]
+
+    for (frame_length, frame_step) in test_configs:
+      hann_window = window_ops.hann_window(frame_length, dtype=dtypes.float32)
+      inverse_window_fn = spectral_ops.inverse_stft_window_fn(frame_step)
+      inverse_window = inverse_window_fn(frame_length, dtype=dtypes.float32)
+
+      with self.test_session(use_gpu=True) as sess:
+        hann_window, inverse_window = sess.run([hann_window, inverse_window])
+
+      # Expect unit gain at each phase of the window.
+      product_window = hann_window * inverse_window
+      for i in range(frame_step):
+        self.assertAllClose(1.0, np.sum(product_window[i::frame_step]))
+
+  def test_inverse_stft_window_fn_special_case(self):
+    """Test inverse_stft_window_fn in special overlap = 3/4 case."""
+    # Cases in which frame_length is an integer multiple of 4 * frame_step are
+    # special because they allow exact reproduction of the waveform with a
+    # squared Hann window (Hann window in both forward and reverse transforms).
+    # In the case where frame_length = 4 * frame_step, that combination
+    # produces a constant gain of 1.5, and so the corrected window will be the
+    # Hann window / 1.5.
+
+    # Tuples of (frame_length, frame_step).
+    test_configs = [
+        (256, 64),
+        (128, 32),
+    ]
+
+    for (frame_length, frame_step) in test_configs:
+      hann_window = window_ops.hann_window(frame_length, dtype=dtypes.float32)
+      inverse_window_fn = spectral_ops.inverse_stft_window_fn(frame_step)
+      inverse_window = inverse_window_fn(frame_length, dtype=dtypes.float32)
+
+      with self.test_session(use_gpu=True) as sess:
+        hann_window, inverse_window = sess.run([hann_window, inverse_window])
+
+      self.assertAllClose(hann_window, inverse_window * 1.5)
 
   @staticmethod
   def _compute_stft_gradient(signal, frame_length=32, frame_step=16,
diff --git a/tensorflow/contrib/signal/python/ops/spectral_ops.py b/tensorflow/contrib/signal/python/ops/spectral_ops.py
index 5ed109b7ddad126d16cf45c631434ba0a674896b..bca2e01d7bbefb18fd69a0eba27e3afb8f636724 100644
--- a/tensorflow/contrib/signal/python/ops/spectral_ops.py
+++ b/tensorflow/contrib/signal/python/ops/spectral_ops.py
@@ -91,6 +91,67 @@ def stft(signals, frame_length, frame_step, fft_length=None,
     return spectral_ops.rfft(framed_signals, [fft_length])
 
 
+def inverse_stft_window_fn(frame_step,
+                           forward_window_fn=functools.partial(
+                               window_ops.hann_window, periodic=True),
+                           name=None):
+  """Generates a window function that can be used in `inverse_stft`.
+
+  Constructs a window that is equal to the forward window with a further
+  pointwise amplitude correction.  `inverse_stft_window_fn` is equivalent to
+  `forward_window_fn` in the case where it would produce an exact inverse.
+
+  See examples in `inverse_stft` documentation for usage.
+
+  Args:
+    frame_step: An integer scalar `Tensor`. The number of samples to step.
+    forward_window_fn: window_fn used in the forward transform, `stft`.
+    name: An optional name for the operation.
+
+  Returns:
+    A callable that takes a window length and a `dtype` keyword argument and
+      returns a `[window_length]` `Tensor` of samples in the provided datatype.
+      The returned window is suitable for reconstructing original waveform in
+      inverse_stft.
+  """
+  with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
+    frame_step = ops.convert_to_tensor(frame_step, name='frame_step')
+    frame_step.shape.assert_has_rank(0)
+
+  def inverse_stft_window_fn_inner(frame_length, dtype):
+    """Computes a window that can be used in `inverse_stft`.
+
+    Args:
+      frame_length: An integer scalar `Tensor`. The window length in samples.
+      dtype: Data type of waveform passed to `stft`.
+
+    Returns:
+      A window suitable for reconstructing original waveform in `inverse_stft`.
+
+    Raises:
+      ValueError: If `frame_length` is not scalar, `forward_window_fn` is not a
+      callable that takes a window length and a `dtype` keyword argument and
+      returns a `[window_length]` `Tensor` of samples in the provided datatype
+      `frame_step` is not scalar, or `frame_step` is not scalar.
+    """
+    with ops.name_scope(name, 'inverse_stft_window_fn', [forward_window_fn]):
+      frame_length = ops.convert_to_tensor(frame_length, name='frame_length')
+      frame_length.shape.assert_has_rank(0)
+
+      # Use equation 7 from Griffin + Lim.
+      forward_window = forward_window_fn(frame_length, dtype=dtype)
+      denom = math_ops.square(forward_window)
+      overlaps = -(-frame_length // frame_step)  # Ceiling division.
+      denom = array_ops.pad(denom, [(0, overlaps * frame_step - frame_length)])
+      denom = array_ops.reshape(denom, [overlaps, frame_step])
+      denom = math_ops.reduce_sum(denom, 0, keep_dims=True)
+      denom = array_ops.tile(denom, [overlaps, 1])
+      denom = array_ops.reshape(denom, [overlaps * frame_step])
+
+      return forward_window / denom[:frame_length]
+  return inverse_stft_window_fn_inner
+
+
 def inverse_stft(stfts,
                  frame_length,
                  frame_step,
@@ -100,6 +161,38 @@ def inverse_stft(stfts,
                  name=None):
   """Computes the inverse [Short-time Fourier Transform][stft] of `stfts`.
 
+  To reconstruct an original waveform, a complimentary window function should
+  be used in inverse_stft. Such a window function can be constructed with
+  tf.contrib.signal.inverse_stft_window_fn.
+
+  Example:
+
+  ```python
+  frame_length = 400
+  frame_step = 160
+  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
+  stft = tf.contrib.signal.stft(waveform, frame_length, frame_step)
+  inverse_stft = tf.contrib.signal.inverse_stft(
+      stft, frame_length, frame_step,
+      window_fn=tf.contrib.signal.inverse_stft_window_fn(frame_step))
+  ```
+
+  if a custom window_fn is used in stft, it must be passed to
+  inverse_stft_window_fn:
+
+  ```python
+  frame_length = 400
+  frame_step = 160
+  window_fn = functools.partial(window_ops.hamming_window, periodic=True),
+  waveform = tf.placeholder(dtype=tf.float32, shape=[1000])
+  stft = tf.contrib.signal.stft(
+      waveform, frame_length, frame_step, window_fn=window_fn)
+  inverse_stft = tf.contrib.signal.inverse_stft(
+      stft, frame_length, frame_step,
+      window_fn=tf.contrib.signal.inverse_stft_window_fn(
+         frame_step, forward_window_fn=window_fn))
+  ```
+
   Implemented with GPU-compatible ops and supports gradients.
 
   Args:
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index 23c23af2f4815c3b1d75eb955b9026dfb9b00194..c2f106c2b28029f05648716bb08cd2531729fb36 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -39,6 +39,8 @@ py_test(
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/debug:debug_data",
+        "//tensorflow/python/debug:hooks",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md
index 0bfd0801d55b25f78cee60e87ee6c43f11a4995c..dc92ae0c859394f44ba83d814adbef7d324a9ada 100644
--- a/tensorflow/contrib/slim/README.md
+++ b/tensorflow/contrib/slim/README.md
@@ -237,7 +237,7 @@ One way to reduce this code duplication would be via a `for` loop:
 ```python
 net = ...
 for i in range(3):
-  net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1))
+  net = slim.conv2d(net, 256, [3, 3], scope='conv3_%d' % (i+1))
 net = slim.max_pool2d(net, [2, 2], scope='pool2')
 ```
 
@@ -441,7 +441,8 @@ module. Consider the simple case where we want to train the VGG network:
 
 ```python
 import tensorflow as tf
-vgg = tf.contrib.slim.nets.vgg
+import tensorflow.contrib.slim.nets as nets
+vgg = nets.vgg
 
 # Load the images and labels.
 images, labels = ...
@@ -559,9 +560,10 @@ examine the following sample of training the VGG network:
 
 ```python
 import tensorflow as tf
+import tensorflow.contrib.slim.nets as nets
 
 slim = tf.contrib.slim
-vgg = tf.contrib.slim.nets.vgg
+vgg = nets.vgg
 
 ...
 
@@ -809,9 +811,10 @@ Putting it all together:
 
 ```python
 import tensorflow as tf
+import tensorflow.contrib.slim.nets as nets
 
 slim = tf.contrib.slim
-vgg = tf.contrib.slim.nets.vgg
+vgg = nets.vgg
 
 
 # Load the data
diff --git a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
index 82c6b5a619662ba5cbaba1b3a238045a8d9a2cd2..41426a6508f2e572fd6d25a41ab07164ac0143a8 100644
--- a/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
+++ b/tensorflow/contrib/slim/python/slim/data/dataset_data_provider.py
@@ -62,7 +62,9 @@ class DatasetDataProvider(data_provider.DataProvider):
                seed=None,
                scope=None):
     """Creates a DatasetDataProvider.
-
+    Note: if `num_epochs` is not `None`,  local counter `epochs` will be created
+    by relevant function. Use `local_variables_initializer()` to initialize
+    local variables.
     Args:
       dataset: An instance of the Dataset class.
       num_readers: The number of parallel readers to use.
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 2d4b08df61a22b270ab5ed31a5a2b33b108de29b..3caf4e02da3aa2d7e586c4e76807a11f84585ea6 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -34,7 +34,7 @@ the metrics and finally call the `evaluation` method:
       "mse": slim.metrics.mean_squared_error(predictions, labels),
   })
 
-  inital_op = tf.group(
+  initial_op = tf.group(
       tf.global_variables_initializer(),
       tf.local_variables_initializer())
 
@@ -42,7 +42,7 @@ the metrics and finally call the `evaluation` method:
     metric_values = slim.evaluation(
         sess,
         num_evals=1,
-        inital_op=initial_op,
+        initial_op=initial_op,
         eval_op=names_to_updates.values(),
         final_op=name_to_values.values())
 
@@ -153,7 +153,8 @@ def evaluate_once(master,
                   summary_op=_USE_DEFAULT,
                   summary_op_feed_dict=None,
                   variables_to_restore=None,
-                  session_config=None):
+                  session_config=None,
+                  hooks=None):
   """Evaluates the model at the given checkpoint path.
 
   Args:
@@ -177,6 +178,8 @@ def evaluate_once(master,
       slim.variables.GetVariablesToRestore() is used.
     session_config: An instance of `tf.ConfigProto` that will be used to
       configure the `Session`. If left as `None`, the default will be used.
+    hooks: A list of additional `SessionRunHook` objects to pass during the
+      evaluation.
 
   Returns:
     The value of `final_op` or `None` if `final_op` is `None`.
@@ -184,11 +187,13 @@ def evaluate_once(master,
   if summary_op == _USE_DEFAULT:
     summary_op = summary.merge_all()
 
-  hooks = [evaluation.StopAfterNEvalsHook(num_evals),]
+  all_hooks = [evaluation.StopAfterNEvalsHook(num_evals),]
 
   if summary_op is not None:
-    hooks.append(evaluation.SummaryAtEndHook(
+    all_hooks.append(evaluation.SummaryAtEndHook(
         log_dir=logdir, summary_op=summary_op, feed_dict=summary_op_feed_dict))
+  if hooks is not None:
+    all_hooks.extend(hooks)
 
   saver = None
   if variables_to_restore is not None:
@@ -203,7 +208,7 @@ def evaluate_once(master,
       feed_dict=eval_op_feed_dict,
       final_ops=final_op,
       final_ops_feed_dict=final_op_feed_dict,
-      hooks=hooks,
+      hooks=all_hooks,
       config=session_config)
 
 
@@ -256,7 +261,7 @@ def evaluation_loop(master,
       configure the `Session`. If left as `None`, the default will be used.
     timeout: The maximum amount of time to wait between checkpoints. If left as
       `None`, then the process will wait indefinitely.
-    hooks: A list of additional SessionRunHook objects to pass during
+    hooks: A list of additional `SessionRunHook` objects to pass during
       repeated evaluations.
 
   Returns:
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index d9e0f54b724d3b44db158c6d57e7220d28cf7b8a..870f504d10362ed5226951adefc3ba9a934900c1 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import glob
 import os
+import shutil
 import time
 
 import numpy as np
@@ -29,6 +30,8 @@ from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.slim.python.slim import evaluation
 from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.debug.lib import debug_data
+from tensorflow.python.debug.wrappers import hooks
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -230,11 +233,7 @@ class SingleEvaluationTest(test.TestCase):
     with self.assertRaises(errors.NotFoundError):
       evaluation.evaluate_once('', checkpoint_path, log_dir)
 
-  def testRestoredModelPerformance(self):
-    checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
-    log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')
-
-    # First, save out the current model to a checkpoint:
+  def _prepareCheckpoint(self, checkpoint_path):
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
     saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
@@ -242,6 +241,13 @@ class SingleEvaluationTest(test.TestCase):
       sess.run(init_op)
       saver.save(sess, checkpoint_path)
 
+  def testRestoredModelPerformance(self):
+    checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
+    log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')
+
+    # First, save out the current model to a checkpoint:
+    self._prepareCheckpoint(checkpoint_path)
+
     # Next, determine the metric to evaluate:
     value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
                                                         self._labels)
@@ -251,6 +257,36 @@ class SingleEvaluationTest(test.TestCase):
         '', checkpoint_path, log_dir, eval_op=update_op, final_op=value_op)
     self.assertAlmostEqual(accuracy_value, self._expected_accuracy)
 
+  def testAdditionalHooks(self):
+    checkpoint_path = os.path.join(self.get_temp_dir(), 'model.ckpt')
+    log_dir = os.path.join(self.get_temp_dir(), 'log_dir1/')
+
+    # First, save out the current model to a checkpoint:
+    self._prepareCheckpoint(checkpoint_path)
+
+    # Next, determine the metric to evaluate:
+    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
+                                                        self._labels)
+
+    dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir')
+    dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False)
+    try:
+      # Run the evaluation and verify the results:
+      accuracy_value = evaluation.evaluate_once(
+          '', checkpoint_path, log_dir, eval_op=update_op, final_op=value_op,
+          hooks=[dumping_hook])
+      self.assertAlmostEqual(accuracy_value, self._expected_accuracy)
+
+      dump = debug_data.DebugDumpDir(
+          glob.glob(os.path.join(dumping_root, 'run_*'))[0])
+      # Here we simply assert that the dumped data has been loaded and is
+      # non-empty. We do not care about the detailed model-internal tensors or
+      # their values.
+      self.assertTrue(dump.dumped_tensor_data)
+    finally:
+      if os.path.isdir(dumping_root):
+        shutil.rmtree(dumping_root)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
index b4fd2580c2b8eaef79c1dd5f2f6b4a18cd0904c7..576444214d5edb772addef64d5def84e3915c29b 100644
--- a/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
+++ b/tensorflow/contrib/slim/python/slim/nets/resnet_v1_test.py
@@ -386,7 +386,7 @@ class ResnetCompleteNetworkTest(test.TestCase):
                 inputs, None, is_training=False, global_pool=False)
             sess.run(variables.global_variables_initializer())
             self.assertAllClose(
-                output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
+                output.eval(), expected.eval(), atol=2e-4, rtol=1e-4)
 
   def testUnknownBatchSize(self):
     batch = 2
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index 6102fac7bde81cbe8e72635924b9a1c09a533c32..4b688690aef513dd683817b0b5c2ba4cb50f73d9 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -45,6 +45,7 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
 )
diff --git a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
index cd4d46aa07bfa92b8243f2f168fd1e4682ad70e2..bea6341cfdcf7d56f255bec275b7861228e44e12 100644
--- a/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
+++ b/tensorflow/contrib/stateless/python/kernel_tests/stateless_random_ops_test.py
@@ -69,16 +69,17 @@ class StatelessOpsTest(test.TestCase):
   def testDeterminism(self):
     # Stateless values should be equal iff the seeds are equal (roughly)
     with self.test_session(use_gpu=True):
-      seed_t = array_ops.placeholder(dtypes.int64, shape=[2])
-      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
-      for stateless_op, _ in CASES:
-        for shape in (), (3,), (2, 5):
-          pure = stateless_op(shape, seed=seed_t)
-          values = [(seed, pure.eval(feed_dict={seed_t: seed}))
-                    for seed in seeds]
-          for s0, v0 in values:
-            for s1, v1 in values:
-              self.assertEqual(s0 == s1, np.all(v0 == v1))
+      for seed_type in [dtypes.int32, dtypes.int64]:
+        seed_t = array_ops.placeholder(seed_type, shape=[2])
+        seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+        for stateless_op, _ in CASES:
+          for shape in (), (3,), (2, 5):
+            pure = stateless_op(shape, seed=seed_t)
+            values = [(seed, pure.eval(feed_dict={seed_t: seed}))
+                      for seed in seeds]
+            for s0, v0 in values:
+              for s1, v1 in values:
+                self.assertEqual(s0 == s1, np.all(v0 == v1))
 
   def testShapeType(self):
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index da23f1c3806be73d43e44bf4b4079d81b2d61c8f..5ee5f1ae763db0ede9df464a08a9f1c7341b7cab 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -13,10 +13,7 @@ load(
 tf_gen_op_wrapper_py(
     name = "gen_summary_ops",
     out = "gen_summary_ops.py",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:summary_ops_op_lib",
-    ],
+    deps = ["//tensorflow/core:summary_ops_op_lib"],
 )
 
 py_test(
@@ -26,12 +23,35 @@ py_test(
     deps = [
         ":summary_ops",
         ":summary_test_util",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "summary_ops_graph_test",
+    srcs = ["summary_ops_graph_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":summary_ops",
+        ":summary_test_util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "@six_archive//:six",
     ],
 )
 
@@ -42,16 +62,20 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":gen_summary_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers_base",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:summary_op_util",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index ca82ea094c41c15f376e6f6f448b770c5cf291d7..7d3b8b7437a9ff5aaa0834db79bca8883cd679c8 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""TensorFlow Summary API v2.
 
-"""Contrib summary package.
-
-The operations in this package are safe to use with eager execution turned or on
-off.
-
+The operations in this package are safe to use with eager execution turned on or
+off. It has a more flexible API that allows summaries to be written directly
+from ops to places other than event log files, rather than propagating protos
+from @{tf.summary.merge_all} to @{tf.summary.FileWriter}.
 """
 
 from __future__ import absolute_import
@@ -28,13 +28,20 @@ from __future__ import print_function
 from tensorflow.contrib.summary.summary_ops import all_summary_ops
 from tensorflow.contrib.summary.summary_ops import always_record_summaries
 from tensorflow.contrib.summary.summary_ops import audio
+from tensorflow.contrib.summary.summary_ops import create_db_writer
+from tensorflow.contrib.summary.summary_ops import create_file_writer
 from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
 from tensorflow.contrib.summary.summary_ops import eval_dir
+from tensorflow.contrib.summary.summary_ops import flush
 from tensorflow.contrib.summary.summary_ops import generic
+from tensorflow.contrib.summary.summary_ops import graph
 from tensorflow.contrib.summary.summary_ops import histogram
 from tensorflow.contrib.summary.summary_ops import image
+from tensorflow.contrib.summary.summary_ops import import_event
+from tensorflow.contrib.summary.summary_ops import initialize
 from tensorflow.contrib.summary.summary_ops import never_record_summaries
 from tensorflow.contrib.summary.summary_ops import record_summaries_every_n_global_steps
 from tensorflow.contrib.summary.summary_ops import scalar
 from tensorflow.contrib.summary.summary_ops import should_record_summaries
 from tensorflow.contrib.summary.summary_ops import summary_writer_initializer_op
+from tensorflow.contrib.summary.summary_ops import SummaryWriter
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/contrib/summary/summary_ops.py
index 1d1c88944aba7b84f8b56d466c0532c938f90006..4556162bfe127125d4deaa8baa29b911c39e7bf6 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/contrib/summary/summary_ops.py
@@ -19,9 +19,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import getpass
 import os
+import re
+import time
+
+import six
 
 from tensorflow.contrib.summary import gen_summary_ops
+from tensorflow.core.framework import graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,16 +38,21 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
 from tensorflow.python.util import tf_contextlib
 
+
 # Name for a collection which is expected to have at most a single boolean
 # Tensor. If this tensor is True the summary ops will record summaries.
 _SHOULD_RECORD_SUMMARIES_NAME = "ShouldRecordSummaries"
 
-_SUMMARY_COLLECTION_NAME = "_SUMMARY_V2"
 _SUMMARY_WRITER_INIT_COLLECTION_NAME = "_SUMMARY_WRITER_V2"
 
+_EXPERIMENT_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,256}$")
+_RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
+_USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
+
 
 def should_record_summaries():
   """Returns boolean Tensor which is true if summaries should be recorded."""
@@ -57,12 +68,14 @@ def should_record_summaries():
 
 # TODO(apassos) consider how to handle local step here.
 @tf_contextlib.contextmanager
-def record_summaries_every_n_global_steps(n):
+def record_summaries_every_n_global_steps(n, global_step=None):
   """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
+  if global_step is None:
+    global_step = training_util.get_global_step()
   collection_ref = ops.get_collection_ref(_SHOULD_RECORD_SUMMARIES_NAME)
   old = collection_ref[:]
   with ops.device("cpu:0"):
-    collection_ref[:] = [math_ops.equal(training_util.get_global_step() % n, 0)]
+    collection_ref[:] = [math_ops.equal(global_step % n, 0)]
   yield
   collection_ref[:] = old
 
@@ -88,25 +101,32 @@ def never_record_summaries():
 
 
 class SummaryWriter(object):
-  """Encapsulates a summary writer."""
+  """Encapsulates a stateful summary writer resource.
+
+  See also:
+  - @{tf.contrib.summary.create_file_writer}
+  - @{tf.contrib.summary.create_db_writer}
+  """
 
-  def __init__(self, resource):
+  def  __init__(self, resource):
     self._resource = resource
     if context.in_eager_mode():
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="cpu:0")
 
   def set_as_default(self):
+    """Enables this summary writer for the current thread."""
     context.context().summary_writer_resource = self._resource
 
   @tf_contextlib.contextmanager
   def as_default(self):
+    """Enables summary writing within a `with` block."""
     if self._resource is None:
-      yield
+      yield self
     else:
       old = context.context().summary_writer_resource
       context.context().summary_writer_resource = self._resource
-      yield
+      yield self
       # Flushes the summary writer in eager mode or in graph functions, but not
       # in legacy graph mode (you're on your own there).
       with ops.device("cpu:0"):
@@ -114,11 +134,48 @@ class SummaryWriter(object):
       context.context().summary_writer_resource = old
 
 
-def create_summary_file_writer(logdir,
-                               max_queue=None,
-                               flush_secs=None,
-                               filename_suffix=None,
-                               name=None):
+def initialize(
+    graph=None,  # pylint: disable=redefined-outer-name
+    session=None):
+  """Initializes summary writing for graph execution mode.
+
+  This helper method provides a higher-level alternative to using
+  @{tf.contrib.summary.summary_writer_initializer_op} and
+  @{tf.contrib.summary.graph}.
+
+  Most users will also want to call @{tf.train.create_global_step}
+  which can happen before or after this function is called.
+
+  Args:
+    graph: A @{tf.Graph} or @{tf.GraphDef} to output to the writer.
+      This function will not write the default graph by default. When
+      writing to an event log file, the associated step will be zero.
+    session: So this method can call @{tf.Session.run}. This defaults
+      to @{tf.get_default_session}.
+
+  Raises:
+    RuntimeError: If in eager mode, or if the current thread has no
+      default @{tf.contrib.summary.SummaryWriter}.
+    ValueError: If session wasn't passed and no default session.
+  """
+  if context.context().summary_writer_resource is None:
+    raise RuntimeError("No default tf.contrib.summary.SummaryWriter found")
+  if session is None:
+    session = ops.get_default_session()
+    if session is None:
+      raise ValueError("session must be passed if no default session exists")
+  session.run(summary_writer_initializer_op())
+  if graph is not None:
+    data = _serialize_graph(graph)
+    x = array_ops.placeholder(dtypes.string)
+    session.run(_graph(x, 0), feed_dict={x: data})
+
+
+def create_file_writer(logdir,
+                       max_queue=None,
+                       flush_millis=None,
+                       filename_suffix=None,
+                       name=None):
   """Creates a summary file writer in the current context.
 
   Args:
@@ -128,9 +185,10 @@ def create_summary_file_writer(logdir,
      useful to use as a context manager.
     max_queue: the largest number of summaries to keep in a queue; will
      flush once the queue gets bigger than this.
-    flush_secs: the largest interval (in seconds) between flushes.
+    flush_millis: the largest interval between flushes.
     filename_suffix: optional suffix for the event file name.
-    name: name for the summary writer.
+    name: Shared name for this SummaryWriter resource stored to default
+      Graph.
 
   Returns:
     Either a summary writer or an empty object which can be used as a
@@ -141,18 +199,85 @@ def create_summary_file_writer(logdir,
   with ops.device("cpu:0"):
     if max_queue is None:
       max_queue = constant_op.constant(10)
-    if flush_secs is None:
-      flush_secs = constant_op.constant(120)
+    if flush_millis is None:
+      flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
       filename_suffix = constant_op.constant("")
-    resource = gen_summary_ops.summary_writer(shared_name=name)
-    # TODO(apassos) ensure the initialization op runs when in graph mode;
-    # consider calling session.run here.
-    ops.add_to_collection(
-        _SUMMARY_WRITER_INIT_COLLECTION_NAME,
-        gen_summary_ops.create_summary_file_writer(resource, logdir, max_queue,
-                                                   flush_secs, filename_suffix))
-    return SummaryWriter(resource)
+    return _make_summary_writer(
+        name,
+        gen_summary_ops.create_summary_file_writer,
+        logdir=logdir,
+        max_queue=max_queue,
+        flush_millis=flush_millis,
+        filename_suffix=filename_suffix)
+
+
+def create_db_writer(db_uri,
+                     experiment_name=None,
+                     run_name=None,
+                     user_name=None,
+                     name=None):
+  """Creates a summary database writer in the current context.
+
+  This can be used to write tensors from the execution graph directly
+  to a database. Only SQLite is supported right now. This function
+  will create the schema if it doesn't exist. Entries in the Users,
+  Experiments, and Runs tables will be created automatically if they
+  don't already exist.
+
+  Args:
+    db_uri: For example "file:/tmp/foo.sqlite".
+    experiment_name: Defaults to YYYY-MM-DD in local time if None.
+      Empty string means the Run will not be associated with an
+      Experiment. Can't contain ASCII control characters or <>. Case
+      sensitive.
+    run_name: Defaults to HH:MM:SS in local time if None. Empty string
+      means a Tag will not be associated with any Run. Can't contain
+      ASCII control characters or <>. Case sensitive.
+    user_name: Defaults to system username if None. Empty means the
+      Experiment will not be associated with a User. Must be valid as
+      both a DNS label and Linux username.
+    name: Shared name for this SummaryWriter resource stored to default
+      @{tf.Graph}.
+
+  Returns:
+    A @{tf.contrib.summary.SummaryWriter} instance.
+  """
+  with ops.device("cpu:0"):
+    if experiment_name is None:
+      experiment_name = time.strftime("%Y-%m-%d", time.localtime(time.time()))
+    if run_name is None:
+      run_name = time.strftime("%H:%M:%S", time.localtime(time.time()))
+    if user_name is None:
+      user_name = getpass.getuser()
+    experiment_name = _cleanse_string(
+        "experiment_name", _EXPERIMENT_NAME_PATTERNS, experiment_name)
+    run_name = _cleanse_string("run_name", _RUN_NAME_PATTERNS, run_name)
+    user_name = _cleanse_string("user_name", _USER_NAME_PATTERNS, user_name)
+    return _make_summary_writer(
+        name,
+        gen_summary_ops.create_summary_db_writer,
+        db_uri=db_uri,
+        experiment_name=experiment_name,
+        run_name=run_name,
+        user_name=user_name)
+
+
+def _make_summary_writer(name, factory, **kwargs):
+  resource = gen_summary_ops.summary_writer(shared_name=name)
+  # TODO(apassos): Consider doing this instead.
+  # node = factory(resource, **kwargs)
+  # if not context.in_eager_mode():
+  #   ops.get_default_session().run(node)
+  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME,
+                        factory(resource, **kwargs))
+  return SummaryWriter(resource)
+
+
+def _cleanse_string(name, pattern, value):
+  if isinstance(value, six.string_types) and pattern.search(value) is None:
+    raise ValueError("%s (%s) must match %s" % (name, value, pattern.pattern))
+  return ops.convert_to_tensor(value, dtypes.string)
 
 
 def _nothing():
@@ -161,15 +286,31 @@ def _nothing():
 
 
 def all_summary_ops():
-  """Graph-mode only. Returns all summary ops."""
+  """Graph-mode only. Returns all summary ops.
+
+  Please note this excludes @{tf.contrib.summary.graph} ops.
+
+  Returns:
+    The summary ops.
+
+  Raises:
+    RuntimeError: If in Eager mode.
+  """
   if context.in_eager_mode():
     raise RuntimeError(
         "tf.contrib.summary.all_summary_ops is only supported in graph mode.")
-  return ops.get_collection(_SUMMARY_COLLECTION_NAME)
+  return ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
 
 
 def summary_writer_initializer_op():
-  """Graph-mode only. Returns the list of ops to create all summary writers."""
+  """Graph-mode only. Returns the list of ops to create all summary writers.
+
+  Returns:
+    The initializer ops.
+
+  Raises:
+    RuntimeError: If in Eager mode.
+  """
   if context.in_eager_mode():
     raise RuntimeError(
         "tf.contrib.summary.summary_writer_initializer_op is only "
@@ -200,72 +341,105 @@ def summary_writer_function(name, tensor, function, family=None):
   with ops.device("cpu:0"):
     op = utils.smart_cond(
         should_record_summaries(), record, _nothing, name="")
-    ops.add_to_collection(_SUMMARY_COLLECTION_NAME, op)
+    ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
   return op
 
 
-def generic(name, tensor, metadata, family=None):
+def generic(name, tensor, metadata=None, family=None, step=None):
   """Writes a tensor summary if possible."""
 
   def function(tag, scope):
+    if metadata is None:
+      serialized_metadata = constant_op.constant("")
+    elif hasattr(metadata, "SerializeToString"):
+      serialized_metadata = constant_op.constant(metadata.SerializeToString())
+    else:
+      serialized_metadata = metadata
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_summary(
         context.context().summary_writer_resource,
-        training_util.get_global_step(), array_ops.identity(tensor),
-        tag, metadata, name=scope)
+        _choose_step(step),
+        array_ops.identity(tensor),
+        tag,
+        serialized_metadata,
+        name=scope)
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def scalar(name, tensor, family=None):
-  """Writes a scalar summary if possible."""
+def scalar(name, tensor, family=None, step=None):
+  """Writes a scalar summary if possible.
+
+  Unlike @{tf.contrib.summary.generic} this op may change the dtype
+  depending on the writer, for both practical and efficiency concerns.
+
+  Args:
+    name: An arbitrary name for this summary.
+    tensor: A @{tf.Tensor} Must be one of the following types:
+      `float32`, `float64`, `int32`, `int64`, `uint8`, `int16`,
+      `int8`, `uint16`, `half`, `uint32`, `uint64`.
+    family: Optional, the summary's family.
+    step: The `int64` monotonic step variable, which defaults
+      to @{tf.train.get_global_step}.
+
+  Returns:
+    The created @{tf.Operation} or a @{tf.no_op} if summary writing has
+    not been enabled for this context.
+  """
 
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_scalar_summary(
         context.context().summary_writer_resource,
-        training_util.get_global_step(), tag, array_ops.identity(tensor),
+        _choose_step(step),
+        tag,
+        array_ops.identity(tensor),
         name=scope)
 
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def histogram(name, tensor, family=None):
+def histogram(name, tensor, family=None, step=None):
   """Writes a histogram summary if possible."""
 
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_histogram_summary(
         context.context().summary_writer_resource,
-        training_util.get_global_step(), tag, array_ops.identity(tensor),
+        _choose_step(step),
+        tag,
+        array_ops.identity(tensor),
         name=scope)
 
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def image(name, tensor, bad_color=None, max_images=3, family=None):
+def image(name, tensor, bad_color=None, max_images=3, family=None, step=None):
   """Writes an image summary if possible."""
 
   def function(tag, scope):
-    if bad_color is None:
-      bad_color_ = constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
+    bad_color_ = (constant_op.constant([255, 0, 0, 255], dtype=dtypes.uint8)
+                  if bad_color is None else bad_color)
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_image_summary(
         context.context().summary_writer_resource,
-        training_util.get_global_step(), tag, array_ops.identity(tensor),
+        _choose_step(step),
+        tag,
+        array_ops.identity(tensor),
         bad_color_,
-        max_images, name=scope)
+        max_images,
+        name=scope)
 
   return summary_writer_function(name, tensor, function, family=family)
 
 
-def audio(name, tensor, sample_rate, max_outputs, family=None):
+def audio(name, tensor, sample_rate, max_outputs, family=None, step=None):
   """Writes an audio summary if possible."""
 
   def function(tag, scope):
     # Note the identity to move the tensor to the CPU.
     return gen_summary_ops.write_audio_summary(
         context.context().summary_writer_resource,
-        training_util.get_global_step(),
+        _choose_step(step),
         tag,
         array_ops.identity(tensor),
         sample_rate=sample_rate,
@@ -275,6 +449,118 @@ def audio(name, tensor, sample_rate, max_outputs, family=None):
   return summary_writer_function(name, tensor, function, family=family)
 
 
+def graph(param, step=None, name=None):
+  """Writes a TensorFlow graph to the summary interface.
+
+  The graph summary is, strictly speaking, not a summary. Conditions
+  like @{tf.contrib.summary.never_record_summaries} do not apply. Only
+  a single graph can be associated with a particular run. If multiple
+  graphs are written, then only the last one will be considered by
+  TensorBoard.
+
+  When not using eager execution mode, the user should consider passing
+  the `graph` parameter to @{tf.contrib.summary.initialize} instead of
+  calling this function. Otherwise special care needs to be taken when
+  using the graph to record the graph.
+
+  Args:
+    param: A @{tf.Tensor} containing a serialized graph proto. When
+      eager execution is enabled, this function will automatically
+      coerce @{tf.Graph}, @{tf.GraphDef}, and string types.
+    step: The global step variable. This doesn't have useful semantics
+      for graph summaries, but is used anyway, due to the structure of
+      event log files. This defaults to the global step.
+    name: A name for the operation (optional).
+
+  Returns:
+    The created @{tf.Operation} or a @{tf.no_op} if summary writing has
+    not been enabled for this context.
+
+  Raises:
+    TypeError: If `param` isn't already a @{tf.Tensor} in graph mode.
+  """
+  if not context.in_eager_mode() and not isinstance(param, ops.Tensor):
+    raise TypeError("graph() needs a tf.Tensor (e.g. tf.placeholder) in graph "
+                    "mode, but was: %s" % type(param))
+  writer = context.context().summary_writer_resource
+  if writer is None:
+    return control_flow_ops.no_op()
+  with ops.device("cpu:0"):
+    if isinstance(param, (ops.Graph, graph_pb2.GraphDef)):
+      tensor = ops.convert_to_tensor(_serialize_graph(param), dtypes.string)
+    else:
+      tensor = array_ops.identity(param)
+    return gen_summary_ops.write_graph_summary(
+        writer, _choose_step(step), tensor, name=name)
+
+
+_graph = graph  # for functions with a graph parameter
+
+
+def import_event(tensor, name=None):
+  """Writes a @{tf.Event} binary proto.
+
+  When using create_db_writer(), this can be used alongside
+  @{tf.TFRecordReader} to load event logs into the database. Please
+  note that this is lower level than the other summary functions and
+  will ignore any conditions set by methods like
+  @{tf.contrib.summary.should_record_summaries}.
+
+  Args:
+    tensor: A @{tf.Tensor} of type `string` containing a serialized
+      @{tf.Event} proto.
+    name: A name for the operation (optional).
+
+  Returns:
+    The created @{tf.Operation}.
+  """
+  return gen_summary_ops.import_event(
+      context.context().summary_writer_resource, tensor, name=name)
+
+
+def flush(writer=None, name=None):
+  """Forces summary writer to send any buffered data to storage.
+
+  This operation blocks until that finishes.
+
+  Args:
+    writer: The @{tf.contrib.summary.SummaryWriter} resource to flush.
+      The thread default will be used if this parameter is None.
+      Otherwise a @{tf.no_op} is returned.
+    name: A name for the operation (optional).
+
+  Returns:
+    The created @{tf.Operation}.
+  """
+  if writer is None:
+    writer = context.context().summary_writer_resource
+    if writer is None:
+      return control_flow_ops.no_op()
+  return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
 def eval_dir(model_dir, name=None):
   """Construct a logdir for an eval summary writer."""
   return os.path.join(model_dir, "eval" if not name else "eval_" + name)
+
+
+def create_summary_file_writer(*args, **kwargs):
+  """Please use @{tf.contrib.summary.create_file_writer}."""
+  logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
+                  "to create_file_writer")
+  return create_file_writer(*args, **kwargs)
+
+
+def _serialize_graph(arbitrary_graph):
+  if isinstance(arbitrary_graph, ops.Graph):
+    return arbitrary_graph.as_graph_def(add_shapes=True).SerializeToString()
+  else:
+    return arbitrary_graph.SerializeToString()
+
+
+def _choose_step(step):
+  if step is None:
+    return training_util.get_global_step()
+  if not isinstance(step, ops.Tensor):
+    return ops.convert_to_tensor(step, dtypes.int64)
+  return step
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8da7901884343073feeedcca88c1138396f3689
--- /dev/null
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -0,0 +1,101 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+
+import six
+
+from tensorflow.contrib.summary import summary_ops
+from tensorflow.contrib.summary import summary_test_util
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import training_util
+
+get_all = summary_test_util.get_all
+
+
+class DbTest(summary_test_util.SummaryDbTest):
+
+  def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
+    with self.assertRaises(TypeError):
+      summary_ops.graph(ops.Graph())
+    with self.assertRaises(TypeError):
+      summary_ops.graph('')
+
+  def testGraphSummary(self):
+    training_util.get_or_create_global_step()
+    name = 'hi'
+    graph = graph_pb2.GraphDef(node=(node_def_pb2.NodeDef(name=name),))
+    with self.test_session():
+      with self.create_db_writer().as_default():
+        summary_ops.initialize(graph=graph)
+    six.assertCountEqual(self, [name],
+                         get_all(self.db, 'SELECT node_name FROM Nodes'))
+
+  def testSummaryGraphModeCond(self):
+    with ops.Graph().as_default(), self.test_session():
+      training_util.get_or_create_global_step()
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(
+          logdir, max_queue=0,
+          name='t2').as_default(), summary_ops.always_record_summaries():
+        summary_ops.initialize()
+        training_util.get_or_create_global_step().initializer.run()
+        def f():
+          summary_ops.scalar('scalar', 2.0)
+          return constant_op.constant(True)
+        pred = array_ops.placeholder(dtypes.bool)
+        x = control_flow_ops.cond(pred, f,
+                                  lambda: constant_op.constant(False))
+        x.eval(feed_dict={pred: True})
+
+      events = summary_test_util.events_from_logdir(logdir)
+      self.assertEqual(len(events), 2)
+      self.assertEqual(events[1].summary.value[0].tag, 'cond/scalar')
+
+  def testSummaryGraphModeWhile(self):
+    with ops.Graph().as_default(), self.test_session():
+      training_util.get_or_create_global_step()
+      logdir = tempfile.mkdtemp()
+      with summary_ops.create_file_writer(
+          logdir, max_queue=0,
+          name='t2').as_default(), summary_ops.always_record_summaries():
+        summary_ops.initialize()
+        training_util.get_or_create_global_step().initializer.run()
+        def body(unused_pred):
+          summary_ops.scalar('scalar', 2.0)
+          return constant_op.constant(False)
+        def cond(pred):
+          return pred
+        pred = array_ops.placeholder(dtypes.bool)
+        x = control_flow_ops.while_loop(cond, body, [pred])
+        x.eval(feed_dict={pred: True})
+
+      events = summary_test_util.events_from_logdir(logdir)
+      self.assertEqual(len(events), 2)
+      self.assertEqual(events[1].summary.value[0].tag, 'while/scalar')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index de7ae6ec277a97235617882a7cc7e469eaebe26c..4ef03434b76ee04ce1bb0bd09c27a46db115bab3 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -12,22 +12,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import tempfile
 
+import six
+
 from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
+get_all = summary_test_util.get_all
+get_one = summary_test_util.get_one
+
 
 class TargetTest(test_util.TensorFlowTestCase):
 
@@ -35,7 +44,7 @@ class TargetTest(test_util.TensorFlowTestCase):
     logdir = '/tmp/apath/that/doesnt/exist'
     self.assertFalse(gfile.Exists(logdir))
     with self.assertRaises(errors.NotFoundError):
-      summary_ops.create_summary_file_writer(logdir, max_queue=0, name='t0')
+      summary_ops.create_file_writer(logdir, max_queue=0, name='t0')
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -45,7 +54,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testSummaryOps(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t0').as_default(), summary_ops.always_record_summaries():
       summary_ops.generic('tensor', 1, '')
@@ -60,7 +69,7 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testDefunSummarys(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t1').as_default(), summary_ops.always_record_summaries():
 
@@ -69,23 +78,148 @@ class TargetTest(test_util.TensorFlowTestCase):
         summary_ops.scalar('scalar', 2.0)
 
       write()
-      events = summary_test_util.events_from_file(logdir)
+      events = summary_test_util.events_from_logdir(logdir)
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].simple_value, 2.0)
 
   def testSummaryName(self):
     training_util.get_or_create_global_step()
     logdir = tempfile.mkdtemp()
-    with summary_ops.create_summary_file_writer(
+    with summary_ops.create_file_writer(
         logdir, max_queue=0,
         name='t2').as_default(), summary_ops.always_record_summaries():
 
       summary_ops.scalar('scalar', 2.0)
 
-      events = summary_test_util.events_from_file(logdir)
+      events = summary_test_util.events_from_logdir(logdir)
+      self.assertEqual(len(events), 2)
+      self.assertEqual(events[1].summary.value[0].tag, 'scalar')
+
+  def testSummaryGlobalStep(self):
+    step = training_util.get_or_create_global_step()
+    logdir = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logdir, max_queue=0,
+        name='t2').as_default(), summary_ops.always_record_summaries():
+
+      summary_ops.scalar('scalar', 2.0, step=step)
+
+      events = summary_test_util.events_from_logdir(logdir)
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
+  def testMaxQueue(self):
+    logs = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logs, max_queue=2, flush_millis=999999,
+        name='lol').as_default(), summary_ops.always_record_summaries():
+      get_total = lambda: len(summary_test_util.events_from_logdir(logs))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=1)
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=2)
+      self.assertEqual(3, get_total())
+
+  def testFlush(self):
+    logs = tempfile.mkdtemp()
+    with summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999,
+        name='lol').as_default(), summary_ops.always_record_summaries():
+      get_total = lambda: len(summary_test_util.events_from_logdir(logs))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.scalar('scalar', 2.0, step=2)
+      self.assertEqual(1, get_total())
+      summary_ops.flush()
+      self.assertEqual(3, get_total())
+
+
+class DbTest(summary_test_util.SummaryDbTest):
+
+  def testIntegerSummaries(self):
+    step = training_util.create_global_step()
+    writer = self.create_db_writer()
+
+    def adder(x, y):
+      state_ops.assign_add(step, 1)
+      summary_ops.generic('x', x)
+      summary_ops.generic('y', y)
+      sum_ = x + y
+      summary_ops.generic('sum', sum_)
+      return sum_
+
+    with summary_ops.always_record_summaries():
+      with writer.as_default():
+        self.assertEqual(5, adder(int64(2), int64(3)).numpy())
+
+    six.assertCountEqual(self, [1, 1, 1],
+                         get_all(self.db, 'SELECT step FROM Tensors'))
+    six.assertCountEqual(self, ['x', 'y', 'sum'],
+                         get_all(self.db, 'SELECT tag_name FROM Tags'))
+    x_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "x"')
+    y_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "y"')
+    sum_id = get_one(self.db, 'SELECT tag_id FROM Tags WHERE tag_name = "sum"')
+
+    with summary_ops.always_record_summaries():
+      with writer.as_default():
+        self.assertEqual(9, adder(int64(4), int64(5)).numpy())
+
+    six.assertCountEqual(self, [1, 1, 1, 2, 2, 2],
+                         get_all(self.db, 'SELECT step FROM Tensors'))
+    six.assertCountEqual(self, [x_id, y_id, sum_id],
+                         get_all(self.db, 'SELECT tag_id FROM Tags'))
+    self.assertEqual(2, get_tensor(self.db, x_id, 1))
+    self.assertEqual(3, get_tensor(self.db, y_id, 1))
+    self.assertEqual(5, get_tensor(self.db, sum_id, 1))
+    self.assertEqual(4, get_tensor(self.db, x_id, 2))
+    self.assertEqual(5, get_tensor(self.db, y_id, 2))
+    self.assertEqual(9, get_tensor(self.db, sum_id, 2))
+    six.assertCountEqual(
+        self, ['experiment'],
+        get_all(self.db, 'SELECT experiment_name FROM Experiments'))
+    six.assertCountEqual(self, ['run'],
+                         get_all(self.db, 'SELECT run_name FROM Runs'))
+    six.assertCountEqual(self, ['user'],
+                         get_all(self.db, 'SELECT user_name FROM Users'))
+
+  def testBadExperimentName(self):
+    with self.assertRaises(ValueError):
+      self.create_db_writer(experiment_name='\0')
+
+  def testBadRunName(self):
+    with self.assertRaises(ValueError):
+      self.create_db_writer(run_name='\0')
+
+  def testBadUserName(self):
+    with self.assertRaises(ValueError):
+      self.create_db_writer(user_name='-hi')
+    with self.assertRaises(ValueError):
+      self.create_db_writer(user_name='hi-')
+    with self.assertRaises(ValueError):
+      self.create_db_writer(user_name='@')
+
+  def testGraphSummary(self):
+    training_util.get_or_create_global_step()
+    name = 'hi'
+    graph = graph_pb2.GraphDef(node=(node_def_pb2.NodeDef(name=name),))
+    with summary_ops.always_record_summaries():
+      with self.create_db_writer().as_default():
+        summary_ops.graph(graph)
+    six.assertCountEqual(self, [name],
+                         get_all(self.db, 'SELECT node_name FROM Nodes'))
+
+
+def get_tensor(db, tag_id, step):
+  return get_one(
+      db, 'SELECT tensor FROM Tensors WHERE tag_id = ? AND step = ?', tag_id,
+      step)
+
+
+def int64(x):
+  return array_ops.constant(x, dtypes.int64)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 37b546d3ab3220f934ea3bf7ef8f5fe6ab29f683..bda57e6a0ca8e1ddb979a80de276911c7738f0aa 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -19,23 +19,81 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
+import sqlite3
 
+from tensorflow.contrib.summary import summary_ops
 from tensorflow.core.util import event_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.platform import gfile
 
 
-def events_from_file(logdir):
-  """Returns all events in the single eventfile in logdir."""
-  assert gfile.Exists(logdir)
-  files = gfile.ListDirectory(logdir)
-  assert len(files) == 1, "Found more than one file in logdir: %s" % files
-  records = list(
-      tf_record.tf_record_iterator(os.path.join(logdir, files[0])))
+class SummaryDbTest(test_util.TensorFlowTestCase):
+  """Helper for summary database testing."""
+
+  def setUp(self):
+    super(SummaryDbTest, self).setUp()
+    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
+    if os.path.exists(self.db_path):
+      os.unlink(self.db_path)
+    self.db = sqlite3.connect(self.db_path)
+    self.create_db_writer = functools.partial(
+        summary_ops.create_db_writer,
+        db_uri=self.db_path,
+        experiment_name='experiment',
+        run_name='run',
+        user_name='user')
+
+  def tearDown(self):
+    self.db.close()
+    super(SummaryDbTest, self).tearDown()
+
+
+def events_from_file(filepath):
+  """Returns all events in a single event file.
+
+  Args:
+    filepath: Path to the event file.
+
+  Returns:
+    A list of all tf.Event protos in the event file.
+  """
+  records = list(tf_record.tf_record_iterator(filepath))
   result = []
   for r in records:
     event = event_pb2.Event()
     event.ParseFromString(r)
     result.append(event)
   return result
+
+
+def events_from_logdir(logdir):
+  """Returns all events in the single eventfile in logdir.
+
+  Args:
+    logdir: The directory in which the single event file is sought.
+
+  Returns:
+    A list of all tf.Event protos from the single event file.
+
+  Raises:
+    AssertionError: If logdir does not contain exactly one file.
+  """
+  assert gfile.Exists(logdir)
+  files = gfile.ListDirectory(logdir)
+  assert len(files) == 1, 'Found not exactly one file in logdir: %s' % files
+  return events_from_file(os.path.join(logdir, files[0]))
+
+
+def get_one(db, q, *p):
+  return db.execute(q, p).fetchone()[0]
+
+
+def get_all(db, q, *p):
+  return unroll(db.execute(q, p).fetchall())
+
+
+def unroll(list_of_tuples):
+  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index bff7d022740ed8fe0c763865fe20d7cb0efd60d5..f54daa71255f2a49edf30f73e16dfc211dc92e39 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -200,11 +200,8 @@ py_library(
 # Model Ops.
 cc_library(
     name = "model_ops_lib",
-    srcs = [
-        "kernels/model_ops.cc",
-    ],
+    srcs = ["kernels/model_ops.cc"],
     deps = [
-        "//third_party/eigen3",
         "//tensorflow/contrib/tensor_forest:tree_utils",
         "//tensorflow/contrib/tensor_forest/kernels/v4:decision-tree-resource",
         "//tensorflow/contrib/tensor_forest/kernels/v4:input_data",
@@ -270,8 +267,11 @@ tf_custom_op_py_library(
     deps = [
         ":gen_model_ops_py",
         ":stats_ops_py",
-        "//tensorflow/core:protos_all_py",
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
     ],
 )
 
@@ -286,12 +286,10 @@ tf_cc_test(
         ":forest_proto_impl",
         ":model_ops_lib",
         "//tensorflow/contrib/tensor_forest/kernels/v4:decision-tree-resource_impl",
-        "//tensorflow/core",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//third_party/eigen3",
     ],
 )
 
@@ -364,8 +362,12 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":gen_stats_ops_py",
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
     ],
 )
 
@@ -382,6 +384,7 @@ tf_cc_test(
         "//tensorflow/contrib/tensor_forest/kernels/v4:decision-tree-resource_impl",
         "//tensorflow/core",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -495,9 +498,13 @@ py_library(
         "//tensorflow/contrib/decision_trees/proto:generic_tree_model_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/tensor_forest/proto:tensor_forest_params_proto_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "@six_archive//:six",
     ],
@@ -524,13 +531,17 @@ py_library(
     deps = [
         ":client_lib",
         "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/learn",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
     ],
diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD
index 13b9749756d60e2a8ecc5e4cbfd3d3a60c496552..a2a3b485f6aa0ae827bbaa7812823730bd8db3b8 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/BUILD
+++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD
@@ -105,8 +105,8 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":training_ops",
+        "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
@@ -180,7 +180,6 @@ py_test(
     deps = [
         ":ops_lib",
         ":training_ops",
-        "//tensorflow:tensorflow_py",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
diff --git a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
index 09b83e2af1f2038665ac6abc1fedd99426066d02..66aa293dc1cb93b82f06d838ad7b0f9c09761585 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
+++ b/tensorflow/contrib/tensor_forest/hybrid/core/ops/stochastic_hard_routing_function_op.cc
@@ -70,7 +70,7 @@ REGISTER_OP("StochasticHardRoutingFunction")
       return Status::OK();
     })
     .Doc(R"doc(
-  Samples a path for each instance in `input_data` and returns the 
+  Samples a path for each instance in `input_data` and returns the
   probability of the path and the path taken.
 
   tree_depth: The depth of the decision tree.
diff --git a/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py b/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py
index cccf444db809df5032877f026f8f89363ca085bc..a56beeeb2c13cd17082531877670475a16396ca6 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py
+++ b/tensorflow/contrib/tensor_forest/hybrid/python/models/decisions_to_data_then_nn_test.py
@@ -80,7 +80,7 @@ class DecisionsToDataThenNNTest(test_util.TensorFlowTestCase):
         isinstance(self.params.num_trees, tensor_forest.ForestHParams))
 
     with variable_scope.variable_scope(
-        "DecisionsToDataThenNNTest_testContructionPollution"):
+        "DecisionsToDataThenNNTest_testConstructionPollution"):
       graph_builder = decisions_to_data_then_nn.DecisionsToDataThenNN(
           self.params)
 
@@ -95,7 +95,7 @@ class DecisionsToDataThenNNTest(test_util.TensorFlowTestCase):
          for _ in range(100)])
 
     with variable_scope.variable_scope(
-        "DecisionsToDataThenNNTest_testInferenceContruction"):
+        "DecisionsToDataThenNNTest_testInferenceConstruction"):
       graph_builder = decisions_to_data_then_nn.DecisionsToDataThenNN(
           self.params)
       graph = graph_builder.inference_graph(data, None)
@@ -111,7 +111,7 @@ class DecisionsToDataThenNNTest(test_util.TensorFlowTestCase):
     labels = [1 for _ in range(100)]
 
     with variable_scope.variable_scope(
-        "DecisionsToDataThenNNTest_testTrainingContruction"):
+        "DecisionsToDataThenNNTest_testTrainingConstruction"):
       graph_builder = decisions_to_data_then_nn.DecisionsToDataThenNN(
           self.params)
       graph = graph_builder.training_graph(data, labels, None)
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index d8bbf87d2cecaec9b612e45e82295cebd3ac4c7f..9d3d60c24d72e28cf449cd196e34e53d5450d85f 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -22,10 +22,8 @@ tf_cc_test(
     srcs = ["schema_test.cc"],
     deps = [
         ":schema",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core/lib/db:sqlite",
     ],
 )
 
@@ -45,10 +43,12 @@ cc_library(
 
 tf_cc_test(
     name = "summary_db_writer_test",
+    size = "small",
     srcs = ["summary_db_writer_test.cc"],
     deps = [
         ":summary_db_writer",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/lib/db:sqlite",
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/contrib/tensorboard/db/schema.cc
index 98fff9e0ae45279f5734ed2eaac8bf46e8ae4b22..fd024d692c3feddea2e5cbd29380686e8a0e9839 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/contrib/tensorboard/db/schema.cc
@@ -21,6 +21,48 @@ class SqliteSchema {
  public:
   explicit SqliteSchema(std::shared_ptr<Sqlite> db) : db_(std::move(db)) {}
 
+  /// \brief Creates Ids table.
+  ///
+  /// This table must be used to randomly allocate Permanent IDs for
+  /// all top-level tables, in order to maintain an invariant where
+  /// foo_id != bar_id for all IDs of any two tables.
+  ///
+  /// A row should only be deleted from this table if it can be
+  /// guaranteed that it exists absolutely nowhere else in the entire
+  /// system.
+  ///
+  /// Fields:
+  ///   id: An ID that was allocated globally. This must be in the
+  ///     range [1,2**47). 0 is assigned the same meaning as NULL and
+  ///     shouldn't be stored; 2**63-1 is reserved for statically
+  ///     allocating space in a page to UPDATE later; and all other
+  ///     int64 values are reserved for future use.
+  Status CreateIdsTable() {
+    return Run(R"sql(
+      CREATE TABLE IF NOT EXISTS Ids (
+        id INTEGER PRIMARY KEY
+      )
+    )sql");
+  }
+
+  /// \brief Creates Descriptions table.
+  ///
+  /// This table allows TensorBoard to associate Markdown text with any
+  /// object in the database that has a Permanent ID.
+  ///
+  /// Fields:
+  ///   id: The Permanent ID of the associated object. This is also the
+  ///     SQLite rowid.
+  ///   description: Arbitrary Markdown text.
+  Status CreateDescriptionsTable() {
+    return Run(R"sql(
+      CREATE TABLE IF NOT EXISTS Descriptions (
+        id INTEGER PRIMARY KEY,
+        description TEXT
+      )
+    )sql");
+  }
+
   /// \brief Creates Tensors table.
   ///
   /// Fields:
@@ -83,15 +125,15 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   tag_id: Permanent >0 unique ID.
+  ///   tag_id: The Permanent ID of the Tag.
   ///   run_id: Optional ID of associated Run.
   ///   tag_name: The tag field in summary.proto, unique across Run.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
   ///     always the wall time of when the row was inserted into the
   ///     DB. It may be used as a hint for an archival job.
-  ///   metadata: Optional BLOB of SummaryMetadata proto.
   ///   display_name: Optional for GUI and defaults to tag_name.
-  ///   summary_description: Optional markdown information.
+  ///   plugin_name: Arbitrary TensorBoard plugin name for dispatch.
+  ///   plugin_data: Arbitrary data that plugin wants.
   Status CreateTagsTable() {
     return Run(R"sql(
       CREATE TABLE IF NOT EXISTS Tags (
@@ -100,28 +142,31 @@ class SqliteSchema {
         tag_id INTEGER NOT NULL,
         tag_name TEXT,
         inserted_time DOUBLE,
-        metadata BLOB,
         display_name TEXT,
-        description TEXT
+        plugin_name TEXT,
+        plugin_data BLOB
       )
     )sql");
   }
 
   /// \brief Creates Runs table.
   ///
-  /// This table stores information about runs. Each row usually
+  /// This table stores information about Runs. Each row usually
   /// represents a single attempt at training or testing a TensorFlow
   /// model, with a given set of hyper-parameters, whose summaries are
   /// written out to a single event logs directory with a monotonic step
   /// counter.
   ///
-  /// When a run is deleted from this table, TensorBoard should treat all
-  /// information associated with it as deleted, even if those rows in
-  /// different tables still exist.
-  ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   run_id: Permanent >0 unique ID.
+  ///   run_id: The Permanent ID of the Run. This has a 1:1 mapping
+  ///     with a SummaryWriter instance. If two writers spawn for a
+  ///     given (user_name, run_name, run_name) then each should
+  ///     allocate its own run_id and whichever writer puts it in the
+  ///     database last wins. The Tags / Tensors associated with the
+  ///     previous invocations will then enter limbo, where they may be
+  ///     accessible for certain operations, but should be garbage
+  ///     collected eventually.
   ///   experiment_id: Optional ID of associated Experiment.
   ///   run_name: User-supplied string, unique across Experiment.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
@@ -134,9 +179,11 @@ class SqliteSchema {
   ///     started, from the perspective of whichever machine talks to
   ///     the database. This field will be mutated if the run is
   ///     restarted.
-  ///   description: Optional markdown information.
-  ///   graph: Snappy tf.GraphDef proto with node field cleared. That
-  ///     field can be recreated using GraphNodes and NodeDefs.
+  ///   finished_time: Float UNIX timestamp with µs precision of when
+  ///     SummaryWriter resource that created this run was destroyed.
+  ///     Once this value becomes non-NULL a Run and its Tags and
+  ///     Tensors should be regarded as immutable.
+  ///   graph_id: ID of associated Graphs row.
   Status CreateRunsTable() {
     return Run(R"sql(
       CREATE TABLE IF NOT EXISTS Runs (
@@ -146,8 +193,8 @@ class SqliteSchema {
         run_name TEXT,
         inserted_time REAL,
         started_time REAL,
-        description TEXT,
-        graph BLOB
+        finished_time REAL,
+        graph_id INTEGER
       )
     )sql");
   }
@@ -160,15 +207,15 @@ class SqliteSchema {
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
   ///   user_id: Optional ID of associated User.
-  ///   experiment_id: Permanent >0 unique ID.
+  ///   experiment_id: The Permanent ID of the Experiment.
   ///   experiment_name: User-supplied string, unique across User.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
   ///     always the time the row was inserted into the database. It
   ///     does not change.
   ///   started_time: Float UNIX timestamp with µs precision. This is
   ///     the MIN(experiment.started_time, run.started_time) of each
-  ///     Run added to the database.
-  ///   description: Optional markdown information.
+  ///     Run added to the database, including Runs which have since
+  ///     been overwritten.
   Status CreateExperimentsTable() {
     return Run(R"sql(
       CREATE TABLE IF NOT EXISTS Experiments (
@@ -177,8 +224,7 @@ class SqliteSchema {
         experiment_id INTEGER NOT NULL,
         experiment_name TEXT,
         inserted_time REAL,
-        started_time REAL,
-        description TEXT
+        started_time REAL
       )
     )sql");
   }
@@ -187,7 +233,7 @@ class SqliteSchema {
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   user_id: Permanent >0 unique ID.
+  ///   user_id: The Permanent ID of the User.
   ///   user_name: Unique user name.
   ///   email: Optional unique email address.
   ///   inserted_time: Float UNIX timestamp with µs precision. This is
@@ -205,46 +251,78 @@ class SqliteSchema {
     )sql");
   }
 
-  /// \brief Creates NodeDefs table.
-  ///
-  /// This table stores NodeDef protos which define the GraphDef for a
-  /// Run. This functions like a hash table so rows can be shared by
-  /// multiple Runs in an Experiment.
+  /// \brief Creates Graphs table.
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   experiment_id: Optional int64 for grouping rows.
-  ///   node_def_id: Permanent >0 unique ID.
-  ///   fingerprint: Optional farmhash::Fingerprint64() of uncompressed
-  ///     node_def bytes, coerced to int64.
-  ///   node_def: BLOB containing a Snappy tf.NodeDef proto.
-  Status CreateNodeDefsTable() {
+  ///   graph_id: The Permanent ID of the Graph.
+  ///   inserted_time: Float UNIX timestamp with µs precision. This is
+  ///     always the wall time of when the row was inserted into the
+  ///     DB. It may be used as a hint for an archival job.
+  ///   node_def: Contains Snappy tf.GraphDef proto. All fields will be
+  ///     cleared except those not expressed in SQL.
+  Status CreateGraphsTable() {
     return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS NodeDefs (
+      CREATE TABLE IF NOT EXISTS Graphs (
         rowid INTEGER PRIMARY KEY,
-        experiment_id INTEGER,
-        node_def_id INTEGER NOT NULL,
-        fingerprint INTEGER,
-        node_def TEXT
+        graph_id INTEGER NOT NULL,
+        inserted_time REAL,
+        graph_def BLOB
       )
     )sql");
   }
 
-  /// \brief Creates RunNodeDefs table.
+  /// \brief Creates Nodes table.
   ///
-  /// Table mapping Runs to NodeDefs. This is used to recreate the node
-  /// field of the GraphDef proto.
+  /// Fields:
+  ///   rowid: Ephemeral b-tree ID dictating locality.
+  ///   graph_id: The Permanent ID of the associated Graph.
+  ///   node_id: ID for this node. This is more like a 0-index within
+  ///     the Graph. Please note indexes are allowed to be removed.
+  ///   node_name: Unique name for this Node within Graph. This is
+  ///     copied from the proto so it can be indexed. This is allowed
+  ///     to be NULL to save space on the index, in which case the
+  ///     node_def.name proto field must not be cleared.
+  ///   op: Copied from tf.NodeDef proto.
+  ///   device: Copied from tf.NodeDef proto.
+  ///   node_def: Contains Snappy tf.NodeDef proto. All fields will be
+  ///     cleared except those not expressed in SQL.
+  Status CreateNodesTable() {
+    return Run(R"sql(
+      CREATE TABLE IF NOT EXISTS Nodes (
+        rowid INTEGER PRIMARY KEY,
+        graph_id INTEGER NOT NULL,
+        node_id INTEGER NOT NULL,
+        node_name TEXT,
+        op TEXT,
+        device TEXT,
+        node_def BLOB
+      )
+    )sql");
+  }
+
+  /// \brief Creates NodeInputs table.
   ///
   /// Fields:
   ///   rowid: Ephemeral b-tree ID dictating locality.
-  ///   run_id: Mandatory ID of associated Run.
-  ///   node_def_id: Mandatory ID of associated NodeDef.
-  Status CreateRunNodeDefsTable() {
+  ///   graph_id: The Permanent ID of the associated Graph.
+  ///   node_id: Index of Node in question. This can be considered the
+  ///     'to' vertex.
+  ///   idx: Used for ordering inputs on a given Node.
+  ///   input_node_id: Nodes.node_id of the corresponding input node.
+  ///     This can be considered the 'from' vertex.
+  ///   is_control: If non-zero, indicates this input is a controlled
+  ///     dependency, which means this isn't an edge through which
+  ///     tensors flow. NULL means 0.
+  Status CreateNodeInputsTable() {
     return Run(R"sql(
-      CREATE TABLE IF NOT EXISTS RunNodeDefs (
+      CREATE TABLE IF NOT EXISTS NodeInputs (
         rowid INTEGER PRIMARY KEY,
-        run_id INTEGER NOT NULL,
-        node_def_id INTEGER NOT NULL
+        graph_id INTEGER NOT NULL,
+        node_id INTEGER NOT NULL,
+        idx INTEGER NOT NULL,
+        input_node_id INTEGER NOT NULL,
+        is_control INTEGER
       )
     )sql");
   }
@@ -297,11 +375,27 @@ class SqliteSchema {
     )sql");
   }
 
-  /// \brief Uniquely indexes node_def_id on NodeDefs table.
-  Status CreateNodeDefIdIndex() {
+  /// \brief Uniquely indexes graph_id on Graphs table.
+  Status CreateGraphIdIndex() {
     return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS NodeDefIdIndex
-      ON NodeDefs (node_def_id)
+      CREATE UNIQUE INDEX IF NOT EXISTS GraphIdIndex
+      ON Graphs (graph_id)
+    )sql");
+  }
+
+  /// \brief Uniquely indexes (graph_id, node_id) on Nodes table.
+  Status CreateNodeIdIndex() {
+    return Run(R"sql(
+      CREATE UNIQUE INDEX IF NOT EXISTS NodeIdIndex
+      ON Nodes (graph_id, node_id)
+    )sql");
+  }
+
+  /// \brief Uniquely indexes (graph_id, node_id, idx) on NodeInputs table.
+  Status CreateNodeInputsIndex() {
+    return Run(R"sql(
+      CREATE UNIQUE INDEX IF NOT EXISTS NodeInputsIndex
+      ON NodeInputs (graph_id, node_id, idx)
     )sql");
   }
 
@@ -350,20 +444,12 @@ class SqliteSchema {
     )sql");
   }
 
-  /// \brief Indexes (experiment_id, fingerprint) on NodeDefs table.
-  Status CreateNodeDefFingerprintIndex() {
-    return Run(R"sql(
-      CREATE INDEX IF NOT EXISTS NodeDefFingerprintIndex
-      ON NodeDefs (experiment_id, fingerprint)
-      WHERE fingerprint IS NOT NULL
-    )sql");
-  }
-
-  /// \brief Uniquely indexes (run_id, node_def_id) on RunNodeDefs table.
-  Status CreateRunNodeDefIndex() {
+  /// \brief Uniquely indexes (graph_id, node_name) on Nodes table.
+  Status CreateNodeNameIndex() {
     return Run(R"sql(
-      CREATE UNIQUE INDEX IF NOT EXISTS RunNodeDefIndex
-      ON RunNodeDefs (run_id, node_def_id)
+      CREATE UNIQUE INDEX IF NOT EXISTS NodeNameIndex
+      ON Nodes (graph_id, node_name)
+      WHERE node_name IS NOT NULL
     )sql");
   }
 
@@ -381,28 +467,32 @@ class SqliteSchema {
 
 Status SetupTensorboardSqliteDb(std::shared_ptr<Sqlite> db) {
   SqliteSchema s(std::move(db));
+  TF_RETURN_IF_ERROR(s.CreateIdsTable());
+  TF_RETURN_IF_ERROR(s.CreateDescriptionsTable());
   TF_RETURN_IF_ERROR(s.CreateTensorsTable());
   TF_RETURN_IF_ERROR(s.CreateTensorChunksTable());
   TF_RETURN_IF_ERROR(s.CreateTagsTable());
   TF_RETURN_IF_ERROR(s.CreateRunsTable());
   TF_RETURN_IF_ERROR(s.CreateExperimentsTable());
   TF_RETURN_IF_ERROR(s.CreateUsersTable());
-  TF_RETURN_IF_ERROR(s.CreateNodeDefsTable());
-  TF_RETURN_IF_ERROR(s.CreateRunNodeDefsTable());
+  TF_RETURN_IF_ERROR(s.CreateGraphsTable());
+  TF_RETURN_IF_ERROR(s.CreateNodeInputsTable());
+  TF_RETURN_IF_ERROR(s.CreateNodesTable());
   TF_RETURN_IF_ERROR(s.CreateTensorIndex());
   TF_RETURN_IF_ERROR(s.CreateTensorChunkIndex());
   TF_RETURN_IF_ERROR(s.CreateTagIdIndex());
   TF_RETURN_IF_ERROR(s.CreateRunIdIndex());
   TF_RETURN_IF_ERROR(s.CreateExperimentIdIndex());
   TF_RETURN_IF_ERROR(s.CreateUserIdIndex());
-  TF_RETURN_IF_ERROR(s.CreateNodeDefIdIndex());
+  TF_RETURN_IF_ERROR(s.CreateGraphIdIndex());
+  TF_RETURN_IF_ERROR(s.CreateNodeIdIndex());
+  TF_RETURN_IF_ERROR(s.CreateNodeInputsIndex());
   TF_RETURN_IF_ERROR(s.CreateTagNameIndex());
   TF_RETURN_IF_ERROR(s.CreateRunNameIndex());
   TF_RETURN_IF_ERROR(s.CreateExperimentNameIndex());
   TF_RETURN_IF_ERROR(s.CreateUserNameIndex());
   TF_RETURN_IF_ERROR(s.CreateUserEmailIndex());
-  TF_RETURN_IF_ERROR(s.CreateNodeDefFingerprintIndex());
-  TF_RETURN_IF_ERROR(s.CreateRunNodeDefIndex());
+  TF_RETURN_IF_ERROR(s.CreateNodeNameIndex());
   return Status::OK();
 }
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
index df64e36305529a67f9573e9d26cc0dfc506d324f..04b9c8e457bd52ff476ed8b13ff9608bdc8a933e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
@@ -15,247 +15,618 @@ limitations under the License.
 #include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
 
 #include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/snappy.h"
+#include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
 namespace {
 
-int64 MakeRandomId() {
-  int64 id = static_cast<int64>(random::New64() & ((1ULL << 63) - 1));
-  if (id == 0) {
-    ++id;
+// https://www.sqlite.org/fileformat.html#record_format
+const uint64 kIdTiers[] = {
+    0x7fffffULL,        // 23-bit (3 bytes on disk)
+    0x7fffffffULL,      // 31-bit (4 bytes on disk)
+    0x7fffffffffffULL,  // 47-bit (5 bytes on disk)
+                        // Remaining bits reserved for future use.
+};
+const int kMaxIdTier = sizeof(kIdTiers) / sizeof(uint64);
+const int kIdCollisionDelayMicros = 10;
+const int kMaxIdCollisions = 21;  // sum(2**i*10µs for i in range(21))~=21s
+const int64 kAbsent = 0LL;
+const int64 kReserved = 0x7fffffffffffffffLL;
+
+double GetWallTime(Env* env) {
+  // TODO(@jart): Follow precise definitions for time laid out in schema.
+  // TODO(@jart): Use monotonic clock from gRPC codebase.
+  return static_cast<double>(env->NowMicros()) / 1.0e6;
+}
+
+Status Serialize(const protobuf::MessageLite& proto, string* output) {
+  output->clear();
+  if (!proto.SerializeToString(output)) {
+    return errors::DataLoss("SerializeToString failed");
   }
-  return id;
+  return Status::OK();
 }
 
-class SummaryDbWriter : public SummaryWriterInterface {
- public:
-  SummaryDbWriter(Env* env, std::shared_ptr<Sqlite> db)
-      : SummaryWriterInterface(), env_(env), db_(std::move(db)), run_id_(-1) {}
-  ~SummaryDbWriter() override {}
+Status Compress(const string& data, string* output) {
+  output->clear();
+  if (!port::Snappy_Compress(data.data(), data.size(), output)) {
+    return errors::FailedPrecondition("TensorBase needs Snappy");
+  }
+  return Status::OK();
+}
 
-  Status Initialize(const string& experiment_name, const string& run_name,
-                    const string& user_name) {
-    mutex_lock ml(mu_);
-    insert_tensor_ = db_->Prepare(R"sql(
-      INSERT OR REPLACE INTO Tensors (tag_id, step, computed_time, tensor)
-      VALUES (?, ?, ?, ?)
-    )sql");
-    update_metadata_ = db_->Prepare(R"sql(
-      UPDATE Tags SET metadata = ? WHERE tag_id = ?
-    )sql");
-    experiment_name_ = experiment_name;
-    run_name_ = run_name;
-    user_name_ = user_name;
-    return Status::OK();
+Status BindProto(SqliteStatement* stmt, int parameter,
+                 const protobuf::MessageLite& proto) {
+  string serialized;
+  TF_RETURN_IF_ERROR(Serialize(proto, &serialized));
+  string compressed;
+  TF_RETURN_IF_ERROR(Compress(serialized, &compressed));
+  stmt->BindBlob(parameter, compressed);
+  return Status::OK();
+}
+
+Status BindTensor(SqliteStatement* stmt, int parameter, const Tensor& t) {
+  // TODO(@jart): Make portable between little and big endian systems.
+  // TODO(@jart): Use TensorChunks with minimal copying for big tensors.
+  // TODO(@jart): Add field to indicate encoding.
+  // TODO(@jart): Allow crunch tool to re-compress with zlib instead.
+  TensorProto p;
+  t.AsProtoTensorContent(&p);
+  return BindProto(stmt, parameter, p);
+}
+
+// Tries to fudge shape and dtype to something with smaller storage.
+Status CoerceScalar(const Tensor& t, Tensor* out) {
+  switch (t.dtype()) {
+    case DT_DOUBLE:
+      *out = t;
+      break;
+    case DT_INT64:
+      *out = t;
+      break;
+    case DT_FLOAT:
+      *out = {DT_DOUBLE, {}};
+      out->scalar<double>()() = t.scalar<float>()();
+      break;
+    case DT_HALF:
+      *out = {DT_DOUBLE, {}};
+      out->scalar<double>()() = static_cast<double>(t.scalar<Eigen::half>()());
+      break;
+    case DT_INT32:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<int32>()();
+      break;
+    case DT_INT16:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<int16>()();
+      break;
+    case DT_INT8:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<int8>()();
+      break;
+    case DT_UINT32:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<uint32>()();
+      break;
+    case DT_UINT16:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<uint16>()();
+      break;
+    case DT_UINT8:
+      *out = {DT_INT64, {}};
+      out->scalar<int64>()() = t.scalar<uint8>()();
+      break;
+    default:
+      return errors::Unimplemented("Scalar summary for dtype ",
+                                   DataTypeString(t.dtype()),
+                                   " is not supported.");
   }
+  return Status::OK();
+}
 
-  // TODO(@jart): Use transactions that COMMIT on Flush()
-  // TODO(@jart): Retry Commit() on SQLITE_BUSY with exponential back-off.
-  Status Flush() override { return Status::OK(); }
+/// \brief Generates unique IDs randomly in the [1,2**63-2] range.
+///
+/// This class starts off generating IDs in the [1,2**23-1] range,
+/// because it's human friendly and occupies 4 bytes max on disk with
+/// SQLite's zigzag varint encoding. Then, each time a collision
+/// happens, the random space is increased by 8 bits.
+///
+/// This class uses exponential back-off so writes will slow down as
+/// the ID space becomes exhausted.
+class IdAllocator {
+ public:
+  IdAllocator(Env* env, Sqlite* db)
+      : env_{env}, inserter_{db->Prepare("INSERT INTO Ids (id) VALUES (?)")} {}
 
-  Status WriteTensor(int64 global_step, Tensor t, const string& tag,
-                     const string& serialized_metadata) override {
-    mutex_lock ml(mu_);
-    TF_RETURN_IF_ERROR(InitializeParents());
-    // TODO(@jart): Memoize tag_id.
-    int64 tag_id;
-    TF_RETURN_IF_ERROR(GetTagId(run_id_, tag, &tag_id));
-    if (!serialized_metadata.empty()) {
-      // TODO(@jart): Only update metadata for first tensor.
-      update_metadata_.BindBlobUnsafe(1, serialized_metadata);
-      update_metadata_.BindInt(2, tag_id);
-      TF_RETURN_IF_ERROR(update_metadata_.StepAndReset());
-    }
-    // TODO(@jart): Lease blocks of rowids and *_ids to minimize fragmentation.
-    // TODO(@jart): Check for random ID collisions without needing txn retry.
-    insert_tensor_.BindInt(1, tag_id);
-    insert_tensor_.BindInt(2, global_step);
-    insert_tensor_.BindDouble(3, GetWallTime());
-    switch (t.dtype()) {
-      case DT_INT64:
-        insert_tensor_.BindInt(4, t.scalar<int64>()());
-        break;
-      case DT_DOUBLE:
-        insert_tensor_.BindDouble(4, t.scalar<double>()());
-        break;
-      default:
-        TF_RETURN_IF_ERROR(BindTensor(t));
+  Status CreateNewId(int64* id) {
+    Status s;
+    for (int i = 0; i < kMaxIdCollisions; ++i) {
+      int64 tid = MakeRandomId();
+      inserter_.BindInt(1, tid);
+      s = inserter_.StepAndReset();
+      if (s.ok()) {
+        *id = tid;
         break;
+      }
+      // SQLITE_CONSTRAINT maps to INVALID_ARGUMENT in sqlite.cc
+      if (s.code() != error::INVALID_ARGUMENT) break;
+      if (tier_ < kMaxIdTier) {
+        LOG(INFO) << "IdAllocator collision at tier " << tier_ << " (of "
+                  << kMaxIdTier << ") so auto-adjusting to a higher tier";
+        ++tier_;
+      } else {
+        LOG(WARNING) << "IdAllocator (attempt #" << i << ") "
+                     << "resulted in a collision at the highest tier; this "
+                        "is problematic if it happens often; you can try "
+                        "pruning the Ids table; you can also file a bug "
+                        "asking for the ID space to be increased; otherwise "
+                        "writes will gradually slow down over time until they "
+                        "become impossible";
+      }
+      env_->SleepForMicroseconds((1 << i) * kIdCollisionDelayMicros);
     }
-    TF_RETURN_IF_ERROR(insert_tensor_.StepAndReset());
-    return Status::OK();
+    return s;
   }
 
-  Status WriteEvent(std::unique_ptr<Event> e) override {
-    // TODO(@jart): This will be used to load event logs.
-    return errors::Unimplemented("WriteEvent");
+ private:
+  int64 MakeRandomId() {
+    int64 id = static_cast<int64>(random::New64() & kIdTiers[tier_]);
+    if (id == kAbsent) ++id;
+    if (id == kReserved) --id;
+    return id;
   }
 
-  Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
-    // TODO(@jart): Unlike WriteTensor, this method would be granted leniency
-    //              to change the dtype if it saves storage space. For example,
-    //              DT_UINT32 would be stored in the database as an INTEGER
-    //              rather than a serialized BLOB. But when reading it back,
-    //              the dtype would become DT_INT64.
-    return errors::Unimplemented("WriteScalar");
-  }
+  Env* env_;
+  SqliteStatement inserter_;
+  int tier_ = 0;
+};
 
-  Status WriteHistogram(int64 global_step, Tensor t,
-                        const string& tag) override {
-    return errors::Unimplemented(
-        "SummaryDbWriter::WriteHistogram not supported. Please use ",
-        "tensorboard.summary.histogram() instead.");
+class GraphSaver {
+ public:
+  static Status Save(Env* env, Sqlite* db, IdAllocator* id_allocator,
+                     GraphDef* graph, int64* graph_id) {
+    TF_RETURN_IF_ERROR(id_allocator->CreateNewId(graph_id));
+    GraphSaver saver{env, db, graph, *graph_id};
+    saver.MapNameToNodeId();
+    TF_RETURN_IF_ERROR(saver.SaveNodeInputs());
+    TF_RETURN_IF_ERROR(saver.SaveNodes());
+    TF_RETURN_IF_ERROR(saver.SaveGraph());
+    return Status::OK();
   }
 
-  Status WriteImage(int64 global_step, Tensor tensor, const string& tag,
-                    int max_images, Tensor bad_color) override {
-    return errors::Unimplemented(
-        "SummaryDbWriter::WriteImage not supported. Please use ",
-        "tensorboard.summary.image() instead.");
+ private:
+  GraphSaver(Env* env, Sqlite* db, GraphDef* graph, int64 graph_id)
+      : env_(env), db_(db), graph_(graph), graph_id_(graph_id) {}
+
+  void MapNameToNodeId() {
+    size_t toto = static_cast<size_t>(graph_->node_size());
+    name_copies_.reserve(toto);
+    name_to_node_id_.reserve(toto);
+    for (int node_id = 0; node_id < graph_->node_size(); ++node_id) {
+      // Copy name into memory region, since we call clear_name() later.
+      // Then wrap in StringPiece so we can compare slices without copy.
+      name_copies_.emplace_back(graph_->node(node_id).name());
+      name_to_node_id_.emplace(name_copies_.back(), node_id);
+    }
   }
 
-  Status WriteAudio(int64 global_step, Tensor tensor, const string& tag,
-                    int max_outputs, float sample_rate) override {
-    return errors::Unimplemented(
-        "SummaryDbWriter::WriteAudio not supported. Please use ",
-        "tensorboard.summary.audio() instead.");
+  Status SaveNodeInputs() {
+    auto insert = db_->Prepare(R"sql(
+      INSERT INTO NodeInputs (graph_id, node_id, idx, input_node_id, is_control)
+      VALUES (?, ?, ?, ?, ?)
+    )sql");
+    for (int node_id = 0; node_id < graph_->node_size(); ++node_id) {
+      const NodeDef& node = graph_->node(node_id);
+      for (int idx = 0; idx < node.input_size(); ++idx) {
+        StringPiece name = node.input(idx);
+        insert.BindInt(1, graph_id_);
+        insert.BindInt(2, node_id);
+        insert.BindInt(3, idx);
+        if (!name.empty() && name[0] == '^') {
+          name.remove_prefix(1);
+          insert.BindInt(5, 1);
+        }
+        auto e = name_to_node_id_.find(name);
+        if (e == name_to_node_id_.end()) {
+          return errors::DataLoss("Could not find node: ", name);
+        }
+        insert.BindInt(4, e->second);
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(insert.StepAndReset(), node.name(),
+                                        " -> ", name);
+      }
+    }
+    return Status::OK();
   }
 
-  string DebugString() override { return "SummaryDbWriter"; }
+  Status SaveNodes() {
+    auto insert = db_->Prepare(R"sql(
+      INSERT INTO Nodes (graph_id, node_id, node_name, op, device, node_def)
+      VALUES (?, ?, ?, ?, ?, ?)
+    )sql");
+    for (int node_id = 0; node_id < graph_->node_size(); ++node_id) {
+      NodeDef* node = graph_->mutable_node(node_id);
+      insert.BindInt(1, graph_id_);
+      insert.BindInt(2, node_id);
+      insert.BindText(3, node->name());
+      node->clear_name();
+      if (!node->op().empty()) {
+        insert.BindText(4, node->op());
+        node->clear_op();
+      }
+      if (!node->device().empty()) {
+        insert.BindText(5, node->device());
+        node->clear_device();
+      }
+      node->clear_input();
+      TF_RETURN_IF_ERROR(BindProto(&insert, 6, *node));
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(insert.StepAndReset(), node->name());
+    }
+    return Status::OK();
+  }
 
- private:
-  double GetWallTime() {
-    // TODO(@jart): Follow precise definitions for time laid out in schema.
-    // TODO(@jart): Use monotonic clock from gRPC codebase.
-    return static_cast<double>(env_->NowMicros()) / 1.0e6;
+  Status SaveGraph() {
+    auto insert = db_->Prepare(R"sql(
+      INSERT INTO Graphs (graph_id, inserted_time, graph_def)
+      VALUES (?, ?, ?)
+    )sql");
+    insert.BindInt(1, graph_id_);
+    insert.BindDouble(2, GetWallTime(env_));
+    graph_->clear_node();
+    TF_RETURN_IF_ERROR(BindProto(&insert, 3, *graph_));
+    return insert.StepAndReset();
   }
 
-  Status BindTensor(const Tensor& t) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Make portable between little and big endian systems.
-    // TODO(@jart): Use TensorChunks with minimal copying for big tensors.
-    TensorProto p;
-    t.AsProtoTensorContent(&p);
-    string encoded;
-    if (!p.SerializeToString(&encoded)) {
-      return errors::DataLoss("SerializeToString failed");
-    }
-    // TODO(@jart): Put byte at beginning of blob to indicate encoding.
-    // TODO(@jart): Allow crunch tool to re-compress with zlib instead.
-    string compressed;
-    if (!port::Snappy_Compress(encoded.data(), encoded.size(), &compressed)) {
-      return errors::FailedPrecondition("TensorBase needs Snappy");
+  Env* env_;
+  Sqlite* db_;
+  GraphDef* graph_;
+  int64 graph_id_;
+  std::vector<string> name_copies_;
+  std::unordered_map<StringPiece, int64, StringPieceHasher> name_to_node_id_;
+};
+
+class RunWriter {
+ public:
+  RunWriter(Env* env, std::shared_ptr<Sqlite> db, const string& experiment_name,
+            const string& run_name, const string& user_name)
+      : env_{env},
+        db_{std::move(db)},
+        id_allocator_{env_, db_.get()},
+        experiment_name_{experiment_name},
+        run_name_{run_name},
+        user_name_{user_name},
+        insert_tensor_{db_->Prepare(R"sql(
+          INSERT OR REPLACE INTO Tensors (tag_id, step, computed_time, tensor)
+          VALUES (?, ?, ?, ?)
+        )sql")} {}
+
+  ~RunWriter() {
+    if (run_id_ == kAbsent) return;
+    auto update = db_->Prepare(R"sql(
+      UPDATE Runs SET finished_time = ? WHERE run_id = ?
+    )sql");
+    update.BindDouble(1, GetWallTime(env_));
+    update.BindInt(2, run_id_);
+    Status s = update.StepAndReset();
+    if (!s.ok()) {
+      LOG(ERROR) << "Failed to set Runs[" << run_id_
+                 << "].finish_time: " << s.ToString();
     }
-    insert_tensor_.BindBlobUnsafe(4, compressed);
-    return Status::OK();
   }
 
-  Status InitializeParents() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (run_id_ >= 0) {
-      return Status::OK();
+  Status InsertTensor(int64 tag_id, int64 step, double computed_time,
+                      Tensor t) {
+    insert_tensor_.BindInt(1, tag_id);
+    insert_tensor_.BindInt(2, step);
+    insert_tensor_.BindDouble(3, computed_time);
+    if (t.shape().dims() == 0 && t.dtype() == DT_INT64) {
+      insert_tensor_.BindInt(4, t.scalar<int64>()());
+    } else if (t.shape().dims() == 0 && t.dtype() == DT_DOUBLE) {
+      insert_tensor_.BindDouble(4, t.scalar<double>()());
+    } else {
+      TF_RETURN_IF_ERROR(BindTensor(&insert_tensor_, 4, t));
     }
-    int64 user_id;
-    TF_RETURN_IF_ERROR(GetUserId(user_name_, &user_id));
-    int64 experiment_id;
+    return insert_tensor_.StepAndReset();
+  }
+
+  Status InsertGraph(std::unique_ptr<GraphDef> g, double computed_time) {
+    TF_RETURN_IF_ERROR(InitializeRun(computed_time));
+    int64 graph_id;
     TF_RETURN_IF_ERROR(
-        GetExperimentId(user_id, experiment_name_, &experiment_id));
-    TF_RETURN_IF_ERROR(GetRunId(experiment_id, run_name_, &run_id_));
+        GraphSaver::Save(env_, db_.get(), &id_allocator_, g.get(), &graph_id));
+    if (run_id_ != kAbsent) {
+      auto set = db_->Prepare("UPDATE Runs SET graph_id = ? WHERE run_id = ?");
+      set.BindInt(1, graph_id);
+      set.BindInt(2, run_id_);
+      TF_RETURN_IF_ERROR(set.StepAndReset());
+    }
     return Status::OK();
   }
 
-  Status GetUserId(const string& user_name, int64* user_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (user_name.empty()) {
-      *user_id = 0LL;
+  Status GetTagId(double computed_time, const string& tag_name,
+                  const SummaryMetadata& metadata, int64* tag_id) {
+    TF_RETURN_IF_ERROR(InitializeRun(computed_time));
+    auto e = tag_ids_.find(tag_name);
+    if (e != tag_ids_.end()) {
+      *tag_id = e->second;
       return Status::OK();
     }
-    SqliteStatement get_user_id = db_->Prepare(R"sql(
+    TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(tag_id));
+    tag_ids_[tag_name] = *tag_id;
+    if (!metadata.summary_description().empty()) {
+      SqliteStatement insert_description = db_->Prepare(R"sql(
+        INSERT INTO Descriptions (id, description) VALUES (?, ?)
+      )sql");
+      insert_description.BindInt(1, *tag_id);
+      insert_description.BindText(2, metadata.summary_description());
+      TF_RETURN_IF_ERROR(insert_description.StepAndReset());
+    }
+    SqliteStatement insert = db_->Prepare(R"sql(
+      INSERT INTO Tags (
+        run_id,
+        tag_id,
+        tag_name,
+        inserted_time,
+        display_name,
+        plugin_name,
+        plugin_data
+      ) VALUES (?, ?, ?, ?, ?, ?, ?)
+    )sql");
+    if (run_id_ != kAbsent) insert.BindInt(1, run_id_);
+    insert.BindInt(2, *tag_id);
+    insert.BindText(3, tag_name);
+    insert.BindDouble(4, GetWallTime(env_));
+    if (!metadata.display_name().empty()) {
+      insert.BindText(5, metadata.display_name());
+    }
+    if (!metadata.plugin_data().plugin_name().empty()) {
+      insert.BindText(6, metadata.plugin_data().plugin_name());
+    }
+    if (!metadata.plugin_data().content().empty()) {
+      insert.BindBlob(7, metadata.plugin_data().content());
+    }
+    return insert.StepAndReset();
+  }
+
+ private:
+  Status InitializeUser() {
+    if (user_id_ != kAbsent || user_name_.empty()) return Status::OK();
+    SqliteStatement get = db_->Prepare(R"sql(
       SELECT user_id FROM Users WHERE user_name = ?
     )sql");
-    get_user_id.BindText(1, user_name);
+    get.BindText(1, user_name_);
     bool is_done;
-    TF_RETURN_IF_ERROR(get_user_id.Step(&is_done));
+    TF_RETURN_IF_ERROR(get.Step(&is_done));
     if (!is_done) {
-      *user_id = get_user_id.ColumnInt(0);
-    } else {
-      *user_id = MakeRandomId();
-      SqliteStatement insert_user = db_->Prepare(R"sql(
-        INSERT INTO Users (user_id, user_name, inserted_time) VALUES (?, ?, ?)
+      user_id_ = get.ColumnInt(0);
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(&user_id_));
+    SqliteStatement insert = db_->Prepare(R"sql(
+      INSERT INTO Users (user_id, user_name, inserted_time) VALUES (?, ?, ?)
+    )sql");
+    insert.BindInt(1, user_id_);
+    insert.BindText(2, user_name_);
+    insert.BindDouble(3, GetWallTime(env_));
+    TF_RETURN_IF_ERROR(insert.StepAndReset());
+    return Status::OK();
+  }
+
+  Status InitializeExperiment(double computed_time) {
+    if (experiment_name_.empty()) return Status::OK();
+    if (experiment_id_ == kAbsent) {
+      TF_RETURN_IF_ERROR(InitializeUser());
+      SqliteStatement get = db_->Prepare(R"sql(
+        SELECT
+          experiment_id,
+          started_time
+        FROM
+          Experiments
+        WHERE
+          user_id IS ?
+          AND experiment_name = ?
       )sql");
-      insert_user.BindInt(1, *user_id);
-      insert_user.BindText(2, user_name);
-      insert_user.BindDouble(3, GetWallTime());
-      TF_RETURN_IF_ERROR(insert_user.StepAndReset());
+      if (user_id_ != kAbsent) get.BindInt(1, user_id_);
+      get.BindText(2, experiment_name_);
+      bool is_done;
+      TF_RETURN_IF_ERROR(get.Step(&is_done));
+      if (!is_done) {
+        experiment_id_ = get.ColumnInt(0);
+        experiment_started_time_ = get.ColumnInt(1);
+      } else {
+        TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(&experiment_id_));
+        experiment_started_time_ = computed_time;
+        SqliteStatement insert = db_->Prepare(R"sql(
+          INSERT INTO Experiments (
+            user_id,
+            experiment_id,
+            experiment_name,
+            inserted_time,
+            started_time
+          ) VALUES (?, ?, ?, ?, ?)
+        )sql");
+        if (user_id_ != kAbsent) insert.BindInt(1, user_id_);
+        insert.BindInt(2, experiment_id_);
+        insert.BindText(3, experiment_name_);
+        insert.BindDouble(4, GetWallTime(env_));
+        insert.BindDouble(5, computed_time);
+        TF_RETURN_IF_ERROR(insert.StepAndReset());
+      }
+    }
+    if (computed_time < experiment_started_time_) {
+      experiment_started_time_ = computed_time;
+      SqliteStatement update = db_->Prepare(R"sql(
+        UPDATE Experiments SET started_time = ? WHERE experiment_id = ?
+      )sql");
+      update.BindDouble(1, computed_time);
+      update.BindInt(2, experiment_id_);
+      TF_RETURN_IF_ERROR(update.StepAndReset());
     }
     return Status::OK();
   }
 
-  Status GetExperimentId(int64 user_id, const string& experiment_name,
-                         int64* experiment_id) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Compute started_time.
-    return GetId("Experiments", "user_id", user_id, "experiment_name",
-                 experiment_name, "experiment_id", experiment_id);
+  Status InitializeRun(double computed_time) {
+    if (run_name_.empty()) return Status::OK();
+    TF_RETURN_IF_ERROR(InitializeExperiment(computed_time));
+    if (run_id_ == kAbsent) {
+      TF_RETURN_IF_ERROR(id_allocator_.CreateNewId(&run_id_));
+      run_started_time_ = computed_time;
+      SqliteStatement insert = db_->Prepare(R"sql(
+        INSERT OR REPLACE INTO Runs (
+          experiment_id,
+          run_id,
+          run_name,
+          inserted_time,
+          started_time
+        ) VALUES (?, ?, ?, ?, ?)
+      )sql");
+      if (experiment_id_ != kAbsent) insert.BindInt(1, experiment_id_);
+      insert.BindInt(2, run_id_);
+      insert.BindText(3, run_name_);
+      insert.BindDouble(4, GetWallTime(env_));
+      insert.BindDouble(5, computed_time);
+      TF_RETURN_IF_ERROR(insert.StepAndReset());
+    }
+    if (computed_time < run_started_time_) {
+      run_started_time_ = computed_time;
+      SqliteStatement update = db_->Prepare(R"sql(
+        UPDATE Runs SET started_time = ? WHERE run_id = ?
+      )sql");
+      update.BindDouble(1, computed_time);
+      update.BindInt(2, run_id_);
+      TF_RETURN_IF_ERROR(update.StepAndReset());
+    }
+    return Status::OK();
   }
 
-  Status GetRunId(int64 experiment_id, const string& run_name, int64* run_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    // TODO(@jart): Compute started_time.
-    return GetId("Runs", "experiment_id", experiment_id, "run_name", run_name,
-                 "run_id", run_id);
+  Env* env_;
+  std::shared_ptr<Sqlite> db_;
+  IdAllocator id_allocator_;
+  const string experiment_name_;
+  const string run_name_;
+  const string user_name_;
+  int64 experiment_id_ = kAbsent;
+  int64 run_id_ = kAbsent;
+  int64 user_id_ = kAbsent;
+  std::unordered_map<string, int64> tag_ids_;
+  double experiment_started_time_ = 0.0;
+  double run_started_time_ = 0.0;
+  SqliteStatement insert_tensor_;
+};
+
+class SummaryDbWriter : public SummaryWriterInterface {
+ public:
+  SummaryDbWriter(Env* env, std::shared_ptr<Sqlite> db,
+                  const string& experiment_name, const string& run_name,
+                  const string& user_name)
+      : SummaryWriterInterface(),
+        env_{env},
+        run_writer_{env, std::move(db), experiment_name, run_name, user_name} {}
+  ~SummaryDbWriter() override {}
+
+  Status Flush() override { return Status::OK(); }
+
+  Status WriteTensor(int64 global_step, Tensor t, const string& tag,
+                     const string& serialized_metadata) override {
+    mutex_lock ml(mu_);
+    SummaryMetadata metadata;
+    if (!serialized_metadata.empty()) {
+      metadata.ParseFromString(serialized_metadata);
+    }
+    double now = GetWallTime(env_);
+    int64 tag_id;
+    TF_RETURN_IF_ERROR(run_writer_.GetTagId(now, tag, metadata, &tag_id));
+    return run_writer_.InsertTensor(tag_id, global_step, now, t);
   }
 
-  Status GetTagId(int64 run_id, const string& tag_name, int64* tag_id)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    return GetId("Tags", "run_id", run_id, "tag_name", tag_name, "tag_id",
-                 tag_id);
+  Status WriteScalar(int64 global_step, Tensor t, const string& tag) override {
+    Tensor t2;
+    TF_RETURN_IF_ERROR(CoerceScalar(t, &t2));
+    // TODO(jart): Generate scalars plugin metadata on this value.
+    return WriteTensor(global_step, std::move(t2), tag, "");
   }
 
-  Status GetId(const char* table, const char* parent_id_field, int64 parent_id,
-               const char* name_field, const string& name, const char* id_field,
-               int64* id) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (name.empty()) {
-      *id = 0LL;
-      return Status::OK();
-    }
-    SqliteStatement select = db_->Prepare(
-        strings::Printf("SELECT %s FROM %s WHERE %s = ? AND %s = ?", id_field,
-                        table, parent_id_field, name_field));
-    if (parent_id > 0) {
-      select.BindInt(1, parent_id);
+  Status WriteGraph(int64 global_step, std::unique_ptr<GraphDef> g) override {
+    mutex_lock ml(mu_);
+    return run_writer_.InsertGraph(std::move(g), GetWallTime(env_));
+  }
+
+  Status WriteEvent(std::unique_ptr<Event> e) override {
+    switch (e->what_case()) {
+      case Event::WhatCase::kSummary: {
+        mutex_lock ml(mu_);
+        Status s;
+        for (const auto& value : e->summary().value()) {
+          s.Update(WriteSummary(e.get(), value));
+        }
+        return s;
+      }
+      case Event::WhatCase::kGraphDef: {
+        mutex_lock ml(mu_);
+        std::unique_ptr<GraphDef> graph{new GraphDef};
+        if (!ParseProtoUnlimited(graph.get(), e->graph_def())) {
+          return errors::DataLoss("parse event.graph_def failed");
+        }
+        return run_writer_.InsertGraph(std::move(graph), e->wall_time());
+      }
+      default:
+        // TODO(@jart): Handle other stuff.
+        return Status::OK();
     }
-    select.BindText(2, name);
-    bool is_done;
-    TF_RETURN_IF_ERROR(select.Step(&is_done));
-    if (!is_done) {
-      *id = select.ColumnInt(0);
-    } else {
-      *id = MakeRandomId();
-      SqliteStatement insert = db_->Prepare(strings::Printf(
-          "INSERT INTO %s (%s, %s, %s, inserted_time) VALUES (?, ?, ?, ?)",
-          table, parent_id_field, id_field, name_field));
-      if (parent_id > 0) {
-        insert.BindInt(1, parent_id);
+  }
+
+  Status WriteHistogram(int64 global_step, Tensor t,
+                        const string& tag) override {
+    return errors::Unimplemented(
+        "SummaryDbWriter::WriteHistogram not supported. Please use ",
+        "tensorboard.summary.histogram() instead.");
+  }
+
+  Status WriteImage(int64 global_step, Tensor tensor, const string& tag,
+                    int max_images, Tensor bad_color) override {
+    return errors::Unimplemented(
+        "SummaryDbWriter::WriteImage not supported. Please use ",
+        "tensorboard.summary.image() instead.");
+  }
+
+  Status WriteAudio(int64 global_step, Tensor tensor, const string& tag,
+                    int max_outputs, float sample_rate) override {
+    return errors::Unimplemented(
+        "SummaryDbWriter::WriteAudio not supported. Please use ",
+        "tensorboard.summary.audio() instead.");
+  }
+
+  string DebugString() override { return "SummaryDbWriter"; }
+
+ private:
+  Status WriteSummary(const Event* e, const Summary::Value& summary)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    switch (summary.value_case()) {
+      case Summary::Value::ValueCase::kSimpleValue: {
+        int64 tag_id;
+        TF_RETURN_IF_ERROR(run_writer_.GetTagId(e->wall_time(), summary.tag(),
+                                                summary.metadata(), &tag_id));
+        Tensor t{DT_DOUBLE, {}};
+        t.scalar<double>()() = summary.simple_value();
+        return run_writer_.InsertTensor(tag_id, e->step(), e->wall_time(), t);
       }
-      insert.BindInt(2, *id);
-      insert.BindText(3, name);
-      insert.BindDouble(4, GetWallTime());
-      TF_RETURN_IF_ERROR(insert.StepAndReset());
+      default:
+        // TODO(@jart): Handle the rest.
+        return Status::OK();
     }
-    return Status::OK();
   }
 
   mutex mu_;
   Env* env_;
-  std::shared_ptr<Sqlite> db_ GUARDED_BY(mu_);
-  SqliteStatement insert_tensor_ GUARDED_BY(mu_);
-  SqliteStatement update_metadata_ GUARDED_BY(mu_);
-  string user_name_ GUARDED_BY(mu_);
-  string experiment_name_ GUARDED_BY(mu_);
-  string run_name_ GUARDED_BY(mu_);
-  int64 run_id_ GUARDED_BY(mu_);
+  RunWriter run_writer_ GUARDED_BY(mu_);
 };
 
 }  // namespace
@@ -265,14 +636,8 @@ Status CreateSummaryDbWriter(std::shared_ptr<Sqlite> db,
                              const string& run_name, const string& user_name,
                              Env* env, SummaryWriterInterface** result) {
   TF_RETURN_IF_ERROR(SetupTensorboardSqliteDb(db));
-  SummaryDbWriter* w = new SummaryDbWriter(env, std::move(db));
-  const Status s = w->Initialize(experiment_name, run_name, user_name);
-  if (!s.ok()) {
-    w->Unref();
-    *result = nullptr;
-    return s;
-  }
-  *result = w;
+  *result = new SummaryDbWriter(env, std::move(db), experiment_name, run_name,
+                                user_name);
   return Status::OK();
 }
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
index d32904f97c4172ded51a00dc076630b598494716..5ea844b6685d15ac4c0549816770060c6f25ce38 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
@@ -14,14 +14,21 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
 
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/db/sqlite.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
 namespace {
 
+const float kTolerance = 1e-5;
+
 Tensor MakeScalarInt64(int64 x) {
   Tensor t(DT_INT64, TensorShape({}));
   t.scalar<int64>()() = x;
@@ -41,7 +48,7 @@ class FakeClockEnv : public EnvWrapper {
 
 class SummaryDbWriterTest : public ::testing::Test {
  protected:
-  void SetUp() override { db_ = Sqlite::Open("file::memory:").ValueOrDie(); }
+  void SetUp() override { db_ = Sqlite::Open(":memory:").ValueOrDie(); }
 
   void TearDown() override {
     if (writer_ != nullptr) {
@@ -94,6 +101,7 @@ TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
   TF_ASSERT_OK(writer_->Flush());
   writer_->Unref();
   writer_ = nullptr;
+  EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Ids"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
   EXPECT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Runs"));
@@ -102,13 +110,24 @@ TEST_F(SummaryDbWriterTest, NothingWritten_NoRowsCreated) {
 }
 
 TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
+  SummaryMetadata metadata;
+  metadata.set_display_name("display_name");
+  metadata.set_summary_description("description");
+  metadata.mutable_plugin_data()->set_plugin_name("plugin_name");
+  metadata.mutable_plugin_data()->set_content("plugin_data");
+  SummaryMetadata metadata_nope;
+  metadata_nope.set_display_name("nope");
+  metadata_nope.set_summary_description("nope");
+  metadata_nope.mutable_plugin_data()->set_plugin_name("nope");
+  metadata_nope.mutable_plugin_data()->set_content("nope");
   TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
                                      &writer_));
   env_.AdvanceByMillis(23);
   TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
-                                    "this-is-metaaa"));
+                                    metadata.SerializeAsString()));
   env_.AdvanceByMillis(23);
-  TF_ASSERT_OK(writer_->WriteTensor(2, MakeScalarInt64(314LL), "taggy", ""));
+  TF_ASSERT_OK(writer_->WriteTensor(2, MakeScalarInt64(314LL), "taggy",
+                                    metadata_nope.SerializeAsString()));
   TF_ASSERT_OK(writer_->Flush());
 
   ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Users"));
@@ -141,22 +160,209 @@ TEST_F(SummaryDbWriterTest, TensorsWritten_RowsGetInitialized) {
   EXPECT_EQ(run_id, QueryInt("SELECT run_id FROM Tags"));
   EXPECT_EQ("taggy", QueryString("SELECT tag_name FROM Tags"));
   EXPECT_EQ(0.023, QueryDouble("SELECT inserted_time FROM Tags"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
+
+  EXPECT_EQ("display_name", QueryString("SELECT display_name FROM Tags"));
+  EXPECT_EQ("plugin_name", QueryString("SELECT plugin_name FROM Tags"));
+  EXPECT_EQ("plugin_data", QueryString("SELECT plugin_data FROM Tags"));
+  EXPECT_EQ("description", QueryString("SELECT description FROM Descriptions"));
 
   EXPECT_EQ(tag_id, QueryInt("SELECT tag_id FROM Tensors WHERE step = 1"));
   EXPECT_EQ(0.023,
             QueryDouble("SELECT computed_time FROM Tensors WHERE step = 1"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
   EXPECT_FALSE(
       QueryString("SELECT tensor FROM Tensors WHERE step = 1").empty());
 
   EXPECT_EQ(tag_id, QueryInt("SELECT tag_id FROM Tensors WHERE step = 2"));
   EXPECT_EQ(0.046,
             QueryDouble("SELECT computed_time FROM Tensors WHERE step = 2"));
-  EXPECT_EQ("this-is-metaaa", QueryString("SELECT metadata FROM Tags"));
   EXPECT_FALSE(
       QueryString("SELECT tensor FROM Tensors WHERE step = 2").empty());
 }
 
+TEST_F(SummaryDbWriterTest, EmptyParentNames_NoParentsCreated) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy", ""));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Users"));
+  ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Experiments"));
+  ASSERT_EQ(0LL, QueryInt("SELECT COUNT(*) FROM Runs"));
+  ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tags"));
+  ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, WriteEvent_Scalar) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  std::unique_ptr<Event> e{new Event};
+  e->set_step(7);
+  e->set_wall_time(123.456);
+  Summary::Value* s = e->mutable_summary()->add_value();
+  s->set_tag("π");
+  s->set_simple_value(3.14f);
+  s = e->mutable_summary()->add_value();
+  s->set_tag("φ");
+  s->set_simple_value(1.61f);
+  TF_ASSERT_OK(writer_->WriteEvent(std::move(e)));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tags"));
+  ASSERT_EQ(2LL, QueryInt("SELECT COUNT(*) FROM Tensors"));
+  int64 tag1_id = QueryInt("SELECT tag_id FROM Tags WHERE tag_name = 'π'");
+  int64 tag2_id = QueryInt("SELECT tag_id FROM Tags WHERE tag_name = 'φ'");
+  EXPECT_GT(tag1_id, 0LL);
+  EXPECT_GT(tag2_id, 0LL);
+  EXPECT_EQ(123.456, QueryDouble(strings::StrCat(
+                         "SELECT computed_time FROM Tensors WHERE tag_id = ",
+                         tag1_id, " AND step = 7")));
+  EXPECT_EQ(123.456, QueryDouble(strings::StrCat(
+                         "SELECT computed_time FROM Tensors WHERE tag_id = ",
+                         tag2_id, " AND step = 7")));
+  EXPECT_NEAR(3.14,
+              QueryDouble(strings::StrCat(
+                  "SELECT tensor FROM Tensors WHERE tag_id = ", tag1_id,
+                  " AND step = 7")),
+              kTolerance);  // Summary::simple_value is float
+  EXPECT_NEAR(1.61,
+              QueryDouble(strings::StrCat(
+                  "SELECT tensor FROM Tensors WHERE tag_id = ", tag2_id,
+                  " AND step = 7")),
+              kTolerance);
+}
+
+TEST_F(SummaryDbWriterTest, WriteGraph) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "R", "", &env_, &writer_));
+  env_.AdvanceByMillis(23);
+  GraphDef graph;
+  NodeDef* node = graph.add_node();
+  node->set_name("x");
+  node->set_op("Placeholder");
+  node = graph.add_node();
+  node->set_name("y");
+  node->set_op("Placeholder");
+  node = graph.add_node();
+  node->set_name("z");
+  node->set_op("Love");
+  node = graph.add_node();
+  node->set_name("+");
+  node->set_op("Add");
+  node->add_input("x");
+  node->add_input("y");
+  node->add_input("^z");
+  node->set_device("tpu/lol");
+  std::unique_ptr<Event> e{new Event};
+  graph.SerializeToString(e->mutable_graph_def());
+  TF_ASSERT_OK(writer_->WriteEvent(std::move(e)));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Runs"));
+  ASSERT_EQ(1LL, QueryInt("SELECT COUNT(*) FROM Graphs"));
+  ASSERT_EQ(4LL, QueryInt("SELECT COUNT(*) FROM Nodes"));
+  ASSERT_EQ(3LL, QueryInt("SELECT COUNT(*) FROM NodeInputs"));
+
+  int64 graph_id = QueryInt("SELECT graph_id FROM Graphs");
+  EXPECT_GT(graph_id, 0LL);
+  EXPECT_EQ(graph_id, QueryInt("SELECT graph_id FROM Runs"));
+  EXPECT_EQ(0.023, QueryDouble("SELECT inserted_time FROM Graphs"));
+  EXPECT_FALSE(QueryString("SELECT graph_def FROM Graphs").empty());
+
+  EXPECT_EQ("x", QueryString("SELECT node_name FROM Nodes WHERE node_id = 0"));
+  EXPECT_EQ("y", QueryString("SELECT node_name FROM Nodes WHERE node_id = 1"));
+  EXPECT_EQ("z", QueryString("SELECT node_name FROM Nodes WHERE node_id = 2"));
+  EXPECT_EQ("+", QueryString("SELECT node_name FROM Nodes WHERE node_id = 3"));
+
+  EXPECT_EQ("Placeholder",
+            QueryString("SELECT op FROM Nodes WHERE node_id = 0"));
+  EXPECT_EQ("Placeholder",
+            QueryString("SELECT op FROM Nodes WHERE node_id = 1"));
+  EXPECT_EQ("Love", QueryString("SELECT op FROM Nodes WHERE node_id = 2"));
+  EXPECT_EQ("Add", QueryString("SELECT op FROM Nodes WHERE node_id = 3"));
+
+  EXPECT_EQ("", QueryString("SELECT device FROM Nodes WHERE node_id = 0"));
+  EXPECT_EQ("", QueryString("SELECT device FROM Nodes WHERE node_id = 1"));
+  EXPECT_EQ("", QueryString("SELECT device FROM Nodes WHERE node_id = 2"));
+  EXPECT_EQ("tpu/lol",
+            QueryString("SELECT device FROM Nodes WHERE node_id = 3"));
+
+  EXPECT_EQ(graph_id,
+            QueryInt("SELECT graph_id FROM NodeInputs WHERE idx = 0"));
+  EXPECT_EQ(graph_id,
+            QueryInt("SELECT graph_id FROM NodeInputs WHERE idx = 1"));
+  EXPECT_EQ(graph_id,
+            QueryInt("SELECT graph_id FROM NodeInputs WHERE idx = 2"));
+
+  EXPECT_EQ(3LL, QueryInt("SELECT node_id FROM NodeInputs WHERE idx = 0"));
+  EXPECT_EQ(3LL, QueryInt("SELECT node_id FROM NodeInputs WHERE idx = 1"));
+  EXPECT_EQ(3LL, QueryInt("SELECT node_id FROM NodeInputs WHERE idx = 2"));
+
+  EXPECT_EQ(0LL,
+            QueryInt("SELECT input_node_id FROM NodeInputs WHERE idx = 0"));
+  EXPECT_EQ(1LL,
+            QueryInt("SELECT input_node_id FROM NodeInputs WHERE idx = 1"));
+  EXPECT_EQ(2LL,
+            QueryInt("SELECT input_node_id FROM NodeInputs WHERE idx = 2"));
+
+  EXPECT_EQ(0LL, QueryInt("SELECT is_control FROM NodeInputs WHERE idx = 0"));
+  EXPECT_EQ(0LL, QueryInt("SELECT is_control FROM NodeInputs WHERE idx = 1"));
+  EXPECT_EQ(1LL, QueryInt("SELECT is_control FROM NodeInputs WHERE idx = 2"));
+}
+
+TEST_F(SummaryDbWriterTest, WriteScalarInt32_CoercesToInt64) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  Tensor t(DT_INT32, {});
+  t.scalar<int32>()() = -17;
+  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(-17LL, QueryInt("SELECT tensor FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, WriteScalarInt8_CoercesToInt64) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  Tensor t(DT_INT8, {});
+  t.scalar<int8>()() = static_cast<int8>(-17);
+  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(-17LL, QueryInt("SELECT tensor FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, WriteScalarUint8_CoercesToInt64) {
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "", "", "", &env_, &writer_));
+  Tensor t(DT_UINT8, {});
+  t.scalar<uint8>()() = static_cast<uint8>(254);
+  TF_ASSERT_OK(writer_->WriteScalar(1, t, "t"));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(254LL, QueryInt("SELECT tensor FROM Tensors"));
+}
+
+TEST_F(SummaryDbWriterTest, UsesIdsTable) {
+  SummaryMetadata metadata;
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
+                                     &writer_));
+  env_.AdvanceByMillis(23);
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
+                                    metadata.SerializeAsString()));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(4LL, QueryInt("SELECT COUNT(*) FROM Ids"));
+  EXPECT_EQ(4LL, QueryInt(strings::StrCat(
+                     "SELECT COUNT(*) FROM Ids WHERE id IN (",
+                     QueryInt("SELECT user_id FROM Users"), ", ",
+                     QueryInt("SELECT experiment_id FROM Experiments"), ", ",
+                     QueryInt("SELECT run_id FROM Runs"), ", ",
+                     QueryInt("SELECT tag_id FROM Tags"), ")")));
+}
+
+TEST_F(SummaryDbWriterTest, SetsRunFinishedTime) {
+  SummaryMetadata metadata;
+  TF_ASSERT_OK(CreateSummaryDbWriter(db_, "mad-science", "train", "jart", &env_,
+                                     &writer_));
+  env_.AdvanceByMillis(23);
+  TF_ASSERT_OK(writer_->WriteTensor(1, MakeScalarInt64(123LL), "taggy",
+                                    metadata.SerializeAsString()));
+  TF_ASSERT_OK(writer_->Flush());
+  ASSERT_EQ(0.023, QueryDouble("SELECT started_time FROM Runs"));
+  ASSERT_EQ(0.0, QueryDouble("SELECT finished_time FROM Runs"));
+  env_.AdvanceByMillis(23);
+  writer_->Unref();
+  writer_ = nullptr;
+  ASSERT_EQ(0.023, QueryDouble("SELECT started_time FROM Runs"));
+  ASSERT_EQ(0.046, QueryDouble("SELECT finished_time FROM Runs"));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD
index b4ecb61a42d71e1901f78095830db63bbc2e0e98..6ba069778ccf5bfba94921ac47db9233c63c0cfe 100644
--- a/tensorflow/contrib/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/BUILD
@@ -14,11 +14,8 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/timeseries/python/timeseries:estimators",
-        "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
-        "//tensorflow/contrib/timeseries/python/timeseries:input_pipeline",
         "//tensorflow/contrib/timeseries/python/timeseries:py_init",
-        "//tensorflow/contrib/timeseries/python/timeseries:saved_model_utils",
+        "//tensorflow/python:util",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 755b0657e9fb29c167911407cee340ac7e3e9b7a..bb86ecb2209f9bed3ad6c37f4b23bc7b361e1bd6 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -103,6 +103,7 @@ py_test(
     deps = [
         ":lstm",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/estimator:estimator_py",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/examples/lstm.py b/tensorflow/contrib/timeseries/examples/lstm.py
index 3ba823f638da8f750981bc910d960706ff652fb7..c7193cef6915f9d0caf5b52fc084129cbc736994 100644
--- a/tensorflow/contrib/timeseries/examples/lstm.py
+++ b/tensorflow/contrib/timeseries/examples/lstm.py
@@ -165,12 +165,13 @@ class _LSTMModel(ts_model.SequentialTimeSeriesModel):
         "Exogenous inputs are not implemented for this example.")
 
 
-def train_and_predict(csv_file_name=_DATA_FILE, training_steps=200):
+def train_and_predict(
+    csv_file_name=_DATA_FILE, training_steps=200, estimator_config=None):
   """Train and predict using a custom time series model."""
   # Construct an Estimator from our LSTM model.
   estimator = ts_estimators.TimeSeriesRegressor(
       model=_LSTMModel(num_features=5, num_units=128),
-      optimizer=tf.train.AdamOptimizer(0.001))
+      optimizer=tf.train.AdamOptimizer(0.001), config=estimator_config)
   reader = tf.contrib.timeseries.CSVReader(
       csv_file_name,
       column_names=((tf.contrib.timeseries.TrainEvalFeatures.TIMES,)
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index 56daa1e10d9d1e7e96d71f33afc72671512dbaf8..3cace567266d497b12d836f44a335bbe5d916949 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -20,14 +20,23 @@ from __future__ import print_function
 
 from tensorflow.contrib.timeseries.examples import lstm
 
+from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.platform import test
 
 
+class _SeedRunConfig(estimator_lib.RunConfig):
+
+  @property
+  def tf_random_seed(self):
+    return 3
+
+
 class LSTMExampleTest(test.TestCase):
 
   def test_periodicity_learned(self):
     (observed_times, observed_values,
-     all_times, predicted_values) = lstm.train_and_predict(training_steps=100)
+     all_times, predicted_values) = lstm.train_and_predict(
+         training_steps=100, estimator_config=_SeedRunConfig())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 5896fc2a206bc747688b5b012e0f87465592dd8a..f0330bfbbd6e8067e5d085376acdf2e6bcaccb6a 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -19,7 +19,7 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.framework.python.ops import variables
+from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import optimizers
 
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
@@ -79,7 +79,7 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
 
     train_op = optimizers.optimize_loss(
         model_outputs.loss,
-        global_step=variables.get_global_step(),
+        global_step=training_util.get_global_step(),
         optimizer=self.optimizer,
         # Learning rate is set in the Optimizer object
         learning_rate=None)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index d0deedc50f8b7953394ab2354fae9133b523d97b..c86d06e9236962cbabbc56afa1cfe213e0c78bc0 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -92,10 +92,12 @@ tf_py_test(
     additional_deps = [
         ":kalman_filter",
         "//third_party/py/numpy",
+        "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
 )
@@ -210,6 +212,7 @@ tf_py_test(
     name = "varma_test",
     srcs = ["varma_test.py"],
     additional_deps = [
+        ":state_space_model",
         ":varma",
         "//tensorflow/contrib/timeseries/python/timeseries:feature_keys",
         "//tensorflow/python:client",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
index b174bb6af323da62afda2a74a397f25e977a48d0..872474aee1149d36671f660f33f63a204ef8ca43 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/g3doc/periodic_multires_derivation.md
@@ -66,7 +66,7 @@ def make_eigval_mat_fn(to_power=1):
         if i == j:
             number = j // 2 + 1
             powersign = ((j + 1) % 2) * 2 - 1
-            return root_of_unity(matsize + 1, number=number, 
+            return root_of_unity(matsize + 1, number=number,
                                  to_power=powersign*to_power)
         else:
             return 0
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index c89596734c738467c58e845328e396c3f2eb999a..a34c7f91f275ea544a3114e85d53f4258f683ebc 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -16,6 +16,7 @@ package(
         "//cloud/vmm/testing/tests/tpu:__subpackages__",
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//third_party/cloud_tpu:__subpackages__",
     ],
 )
 
@@ -30,18 +31,6 @@ cc_library(
     ],
 )
 
-py_library(
-    name = "tpu_test_util",
-    srcs = [
-        "python/tpu/test_util.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":tpu_lib",
-        ":tpu_py",
-    ],
-)
-
 py_library(
     name = "tpu_estimator",
     srcs = [
@@ -53,17 +42,23 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:run_config",
         "//tensorflow/python/estimator:util",
+        "@six_archive//:six",
     ],
 )
 
@@ -108,6 +103,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -124,21 +120,15 @@ tf_custom_op_py_library(
         ":tpu_ops",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
+        "//tensorflow/python:util",
     ],
 )
 
 py_library(
     name = "tpu",
-    srcs = [
-        "python/tpu/__init__.py",
-    ],
+    srcs = ["python/tpu/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":tpu_estimator",
@@ -150,6 +140,8 @@ py_library(
     name = "tpu_lib",
     srcs = [
         "python/tpu/__init__.py",
+        "python/tpu/device_assignment.py",
+        "python/tpu/topology.py",
         "python/tpu/tpu.py",
         "python/tpu/tpu_feed.py",
         "python/tpu/tpu_function.py",
@@ -161,6 +153,7 @@ py_library(
     deps = [
         ":profiler",
         ":tpu_py",
+        "//tensorflow/contrib/tpu/proto:topology_proto_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -211,7 +204,9 @@ tf_py_test(
 filegroup(
     name = "all_files",
     srcs = glob(
-        ["**/*"],
+        include = [
+            "**/*",
+        ],
         exclude = [
             "**/METADATA",
             "**/OWNERS",
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index 6a5fe06ff078df52e13016572e80bfcae4a4d178..ea6e874f2d952b03e8cdabeee00ccfe1b076a0d0 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -23,8 +23,8 @@
 
 @@initialize_system
 @@shutdown_system
+@@device_assignment
 @@core
-@@outside_all_rewrites
 @@replicate
 @@shard
 @@batch_parallel
@@ -34,6 +34,9 @@
 
 @@InfeedQueue
 
+@@DeviceAssignment
+@@Topology
+
 @@while_loop
 @@repeat
 
@@ -50,6 +53,8 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.device_assignment import *
+from tensorflow.contrib.tpu.python.tpu.topology import *
 from tensorflow.contrib.tpu.python.tpu.tpu import *
 from tensorflow.contrib.tpu.python.tpu.tpu_config import *
 from tensorflow.contrib.tpu.python.tpu.tpu_estimator import *
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
index cbbd19800eb2e336fc343671fb82bb3ed631c129..d389050e67f9a9e48b91583e5088058ec4e2832f 100644
--- a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
+++ b/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 REGISTER_OP("CrossReplicaSum")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 An Op to sum inputs across replicated TPU instances. Each
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
index ed5756cc540a202148a02747bc62001ee363be9d..5900c61a38726551391c212f92b9b9eacd4a465b 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/contrib/tpu/ops/outfeed_ops.cc
@@ -39,7 +39,7 @@ REGISTER_OP("OutfeedEnqueueTuple")
     .Doc(R"doc(
 An op which emits multiple Tensor values from an XLA computation.
 
-inputs: A list of tensors that will be inserted into the outfeed queue as an 
+inputs: A list of tensors that will be inserted into the outfeed queue as an
 XLA tuple.
 )doc");
 
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index b40dac471708793d5a033279e2d2f4b4a0dac480..cba71c6b98e1079de6c6c4c32fa2ffc44a9ce71e 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -24,7 +24,9 @@ using shape_inference::ShapeHandle;
 
 REGISTER_OP("TPUReplicateMetadata")
     .Attr("num_replicas: int >= 0")
-    .Attr("global_tpu_id: list(int) = []")
+    .Attr("topology: string = \"\"")
+    .Attr("device_assignment: list(int) = []")
+    .Attr("computation_shape: list(int) = []")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -64,14 +66,18 @@ REGISTER_OP("TPUReplicatedOutput")
 REGISTER_OP("TPUReplicate")
     .Attr("computation: func")
     .Attr("num_replicas: int >= 1")
-    .Attr("global_tpu_id: list(int) = []")
+    .Attr("topology: string = \"\"")
+    .Attr("device_assignment: list(int) = []")
+    .Attr("computation_shape: list(int) = []")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Tbroadcast_inputs: list(type) >= 0")
     .Attr("NumVariables: int >= 0")
+    .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
+    .Input("guaranteed_constants: Tguaranteed_constants")
     .Output("outputs: output_types")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
@@ -79,14 +85,25 @@ Runs replicated computations on a distributed TPU system.
 
 computation: a function containing the computation to run.
 num_replicas: the number of replicas of the computation to run.
-global_tpu_id: map from device to global tpu id.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+computation_shape: a [mesh_dimension] array describing the shape of each
+  computation replica in numbers of cores in the TPU mesh.
+device_assignment: a flattened array with shape
+  [replica] + computation_shape + [mesh_dimension] that maps the coordinates of
+  logical cores in each replica of a computation to physical coordinates in
+  the TPU topology.
 Tinputs: the types of the arguments to 'computation'.
 inputs: the inputs to 'computation', flattened, in replica-major order.
 Tbroadcast_inputs: the types of the additional arguments to broadcast to all
   replicas.
+Tguaranteed_constants: the types of the arguments to 'guaranteed_constants'.
 broadcast_inputs: additional arguments to broadcast to all replicas. The
   broadcast inputs are appended to the per-replica inputs when calling
   computation.
+guaranteed_constants: arguments which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
 output_types: the types of the outputs of 'computation'.
 outputs: the outputs of 'computation'.
 )doc");
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
index 8a87a91056efeba5d094503cfa68df104e310f30..8c4fe5538d832f390845fe2d31aa6a08342b280b 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
@@ -107,7 +107,7 @@ in a host.
 
 REGISTER_OP("_WaitForDistributedTPU")
     .Input("inputs: N * int32")
-    .Output("global_tpu_array: int32")
+    .Output("topology: string")
     .Attr("host_specs: list(string)")
     .Attr("startup_timeout_sec: int = 20")
     .Attr("N: int")
@@ -118,7 +118,7 @@ REGISTER_OP("_WaitForDistributedTPU")
       for (int i = 0; i < c->num_inputs(); ++i) {
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &input));
       }
-      c->set_output(0, c->UnknownShapeOfRank(2));
+      c->set_output(0, c->Scalar());
       return ::tensorflow::Status::OK();
     })
     .Doc(R"doc(
@@ -129,30 +129,26 @@ _InitializeHostForDistributedTPU Ops.
 
 inputs: For each initialized host, a vector giving the global TPU id
 of each TPU on the host.
-global_tpu_array: A two-dimensional array. For each host (the outer
-dimension) the array lists the global ids of the TPUs on that host.
-host_specs: For each initialized host, the partial device specification
-indicating job, replica, and task. Combining this spec with
-'/device:TPU:k' gives the full device name of the k'th TPU on the
-host.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
 startup_timeout_sec: The number of seconds to wait for the TPU system
 to stabilize.
 )doc");
 
 REGISTER_OP("_SetGlobalTPUArray")
-    .Input("global_tpu_array: int32")
+    .Input("topology: string")
     .SetIsStateful()
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &input));
       return ::tensorflow::Status::OK();
     })
     .Doc(R"doc(
 An op that informs a host of the global ids of all the of TPUs in the
 system.
 
-global_tpu_array: A two-dimensional array. For each host (the outer
-dimension) the array lists the global ids of the TPUs on that host.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
 )doc");
 
 REGISTER_OP("_ShutdownDistributedTPU")
@@ -198,7 +194,7 @@ chips on the host.
 )doc");
 
 REGISTER_OP("ConfigureDistributedTPU")
-    .Output("global_tpu_array: int32")
+    .Output("topology: string")
     .Attr("embedding_config: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape)
@@ -206,9 +202,8 @@ REGISTER_OP("ConfigureDistributedTPU")
 An op that sets up the centralized structures for a distributed TPU
 system.
 
-global_tpu_array: A two-dimensional array. For each host (the outer
-dimension) the array lists the global ids of the TPUs on that host.
-embedding_config: Internal use.
+topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
 )doc");
 
 REGISTER_OP("ShutdownDistributedTPU")
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index f6309e2e72f75a4ba5b323b4d7348c49555d522e..0e1fca3d3c8b6f3a19b3e989dbee1863475796c5 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -95,3 +95,10 @@ tf_proto_library_cc(
     cc_api_version = 2,
     visibility = ["//visibility:public"],
 )
+
+tf_proto_library_cc(
+    name = "tf_op_stats_proto",
+    srcs = ["tf_op_stats.proto"],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 5b51a72ece848f0efcd5ace57fe0201a86e311a3..bff23a447f841339d9bf5bd3bf125d705bf1fee7 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -50,6 +50,7 @@ ProfileResponse Profile(const string& service_addr, int duration_ms) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
+  request.add_tools("input_pipeline");
   std::cout << "Limiting the number of trace events to " << kMaxEvents
             << std::endl;
   ::grpc::ClientContext context;
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index 7541a5291d123256e7f1d83cb6f6ef72a78ad99d..120a38b6c2353deaf0b86d330cda999ba6be7dbf 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -42,11 +42,11 @@ using ::tensorflow::io::JoinPath;
 using ::tensorflow::protobuf::util::JsonOptions;
 using ::tensorflow::protobuf::util::MessageToJsonString;
 
-constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
 constexpr char kJsonOpProfileFileName[] = "op_profile.json";
-constexpr char kProtoTraceFileName[] = "trace";
 constexpr char kJsonTraceFileName[] = "trace.json.gz";
-constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
+constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+constexpr char kProtoTraceFileName[] = "trace";
 
 Status WriteGzippedDataToFile(const string& filename, const string& data) {
   std::unique_ptr<WritableFile> file;
@@ -97,6 +97,15 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
   return Status::OK();
 }
 
+Status DumpToolDataToLogDirectory(StringPiece run_dir,
+                                  const tensorflow::ProfileToolData& tool,
+                                  std::ostream* os) {
+  string path = JoinPath(run_dir, tool.name());
+  TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, tool.data()));
+  *os << "Dumped tool data for " << tool.name() << " to " << path << std::endl;
+  return Status::OK();
+}
+
 Status DumpGraphEvents(const string& logdir, const string& run,
                        const ProfileResponse& response, std::ostream* os) {
   int num_graphs = response.computation_graph_size();
@@ -154,7 +163,12 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
     TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
                                                    response.op_profile(), os));
   }
-
+  if (!response.tool_data().empty()) {
+    for (const auto& tool_data : response.tool_data()) {
+      TF_RETURN_IF_ERROR(
+          DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
+    }
+  }
   TF_RETURN_IF_ERROR(DumpGraphEvents(logdir, run, response, os));
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 3bdd475fade39baeea67333a55fdd548fb235672..7970c20a2693cbbe91a136080240f676d29f2053 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -24,22 +24,18 @@ import sys
 
 import tensorflow as tf
 
-
 tf.flags.DEFINE_string('service_addr', '',
                        'Address of TPU profiler service e.g. localhost:8466')
-
-
 tf.flags.DEFINE_string('logdir', '',
                        'Path of TensorBoard log directory e.g. /tmp/tb_log')
-
-
 tf.flags.DEFINE_integer('duration_ms', 2000, 'Duration of tracing in ms.')
 
-
 FLAGS = tf.flags.FLAGS
+EXECUTABLE = 'data/capture_tpu_profile'
 
 
-EXECUTABLE = 'data/capture_tpu_profile'
+def run_main():
+  tf.app.run(main)
 
 
 def main(unused_argv=None):
@@ -54,4 +50,4 @@ def main(unused_argv=None):
 
 
 if __name__ == '__main__':
-  tf.app.run(main)
+  run_main()
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index e77cae4695daa54f690f11982ece44ea6a2a3fc4..ee6950699e740139b75f3f061ca0ca455fe2a1af 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -23,7 +23,7 @@ from setuptools import setup
 _VERSION = '1.3.0-a1'
 
 CONSOLE_SCRIPTS = [
-    'capture_tpu_profile=cloud_tpu_profiler.main:main',
+    'capture_tpu_profile=cloud_tpu_profiler.main:run_main',
 ]
 
 REQUIRED_PACKAGES = [
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
new file mode 100644
index 0000000000000000000000000000000000000000..5440bbbfdd75207bd209c19d5cc42dc69504d39b
--- /dev/null
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -0,0 +1,169 @@
+// This proto describes the format of tensorflow operation level stats for
+// profiling (in tensorboard) purpose.
+
+syntax = "proto2";
+
+package tensorflow.tpu;
+
+// Result proto for OpMetrics.
+message OpMetricsResult {
+  // True if this OP is executed on the device; False if it is executed on the
+  // host.
+  optional bool on_device = 1;
+  reserved 2;  // was uint32 id.
+  // Name of this OP.
+  optional string name = 3;
+  // Rank of this OP.
+  optional uint64 rank = 4;
+  // The starting time in cycles of the last instance of this OP executed.
+  optional double last_starttime_in_cycles = 5;
+  // The ending time in cycles of the last instance of this OP executed.
+  optional double last_endtime_in_cycles = 6;
+  // If this OP (say A), is an immediate child of another OP (say B), this field
+  // stores the sum of duration in microseconds of A inside B. If A appears more
+  // than once in B, the duration of all A's appearances will be added together.
+  // This sum will be reset after the self-time of B is calculated so that it
+  // can be reused for a new parent OP.
+  optional double sum_of_duration_in_us_as_children = 7;
+  // Number of instances that this OP occurred.
+  optional uint64 occurrences = 8;
+  // Total time in microseconds spent in this OP (accumulated
+  // over all of its occurrences).
+  optional double total_time_in_us = 9;
+  // Total self time in microseconds spent in this OP
+  // (accumulated over all of its occurrences).
+  optional double total_self_time_in_us = 10;
+  // The total self time as a fraction of sum of all OP's
+  // total self time on the host.
+  optional double host_total_self_time_as_fraction_of_all_op_time = 11;
+  // Cumulative total self time in fraction on the host.
+  optional double host_cumulative_total_self_time_as_fraction_of_all_op_time =
+      12;
+  // The total self time as a fraction of sum of all OP's
+  // total self time on the device.
+  optional double device_total_self_time_as_fraction_of_all_op_time = 13;
+  // Cumulative total self time in fraction on the device.
+  optional double device_cumulative_total_self_time_as_fraction_of_all_op_time =
+      14;
+  // Total number of FLOPs incurred by this OP.
+  optional double total_flops = 15;
+  // Total number of bytes accessed by this OP.
+  optional double total_bytes_accessed = 16;
+  // Total time in microseconds that special hw unit 1 is occupied by this OP.
+  optional double unit1_occupancy_in_us = 17;
+  // Total time in microseconds that special hw unit 2 is occupied by this OP.
+  optional double unit2_occupancy_in_us = 18;
+  // Total memory stall time in microseconds.
+  optional double total_memory_stall_in_us = 19;
+}
+
+// Result proto for OpMetricsDb.
+message OpMetricsDbResult {
+  // A bunch of OpMetricsResults.
+  repeated OpMetricsResult metrics_db = 1;
+}
+
+// Result proto for StepInfo.
+message StepInfoResult {
+  // The (micro) step number.
+  optional uint32 step_num = 1;
+  // The step duration in picoseconds.
+  optional uint64 duration_ps = 2;
+  // The infeed duration in picoseconds.
+  // Can turn into a map if we want a variable number of ops.
+  optional uint64 infeed_duration_ps = 3;
+}
+
+// Result proto for a sequence of steps.
+message StepSequenceResult {
+  // A sequence of StepInfoResults.
+  repeated StepInfoResult step_sequence = 1;
+}
+
+// Result proto for a StepDatabase.
+message StepDatabaseResult {
+  // A map from core_id to StepSequenceResult.
+  map<uint32, StepSequenceResult> step_sequence_per_core = 1;
+}
+
+// Result proto for looping-related metrics.
+message LoopingResult {
+  // The total iteration time in nanoseconds.
+  optional double iteration_time_ns = 1;
+  // The total number of iterations.
+  optional int32 num_iterations = 2;
+  // The total computation time in nanoseconds.
+  optional double computation_time_ns = 3;
+  // The total number of computations.
+  optional int32 num_computations = 4;
+}
+
+// Result proto for HloExtraInfo.
+message HloExtraInfoResult {
+  // Category of the HLO op given by the compiler.
+  optional string category = 1;
+  // The long name of the HLO that includes the dimensions.
+  optional string long_name = 2;
+  // The per-TPU-core batch size inferred from this HLO.
+  optional int64 per_core_batch_size = 3;
+}
+
+// Result proto for HloExtraInfoMap.
+message HloExtraInfoMapResult {
+  // A map from HLO name to HloExtraInfo.
+  map<string, HloExtraInfoResult> hlo_extrainfo_map = 1;
+}
+
+// Result proto for host-independent job information.
+message HostIndependentJobInfoResult {
+  // The change-list number of this build.
+  optional int64 change_list = 1;
+  // The time of this build.
+  optional int64 build_time = 2;
+  // The target of this build.
+  optional string build_target = 3;
+}
+
+// Result proto for host-dependent job information.
+message HostDependentJobInfoResult {
+  // This ID of the host where the job was run on.
+  optional string host_id = 1;
+  // The command line used to run the job.
+  optional string command_line = 2;
+  // The start time of the job on this host.
+  optional int64 start_time = 3;
+}
+
+// Result proto for RunEnvironment (the run environment of a profiling session).
+message RunEnvironmentResult {
+  // Number of hosts used.
+  optional int32 host_count = 1;
+  // The type of TPU used.
+  optional string tpu_type = 2;
+  // The number of TPU cores used.
+  optional int32 tpu_core_count = 3;
+  // The per-TPU-core batch size.
+  optional int32 per_core_batch_size = 4;
+  // Host-independent job information.
+  optional HostIndependentJobInfoResult host_independent_job_info = 5;
+  // Host-dependent job information.
+  repeated HostDependentJobInfoResult host_dependent_job_info = 6;
+}
+
+// Result proto for TfStatsHelper.
+message TfOpStats {
+  // The result for the TF-metric database.
+  optional OpMetricsDbResult tf_metrics_db = 1;
+  // The result for the HLO-metric database.
+  optional OpMetricsDbResult hlo_metrics_db = 2;
+  // The result for the step database.
+  optional StepDatabaseResult step_db = 3;
+  // The result for the looping-related metrics.
+  optional LoopingResult looping = 4;
+  // The result for the HloExtraInfoMap.
+  optional HloExtraInfoMapResult hlo_extrainfo_map = 5;
+  // Overall matrix unit utilization in percentage.
+  optional double matrix_unit_utilization_percent = 6;
+  // The run environment of this profiling session.
+  optional RunEnvironmentResult run_environment = 7;
+}
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index 88e86eca3b63da4bf1d2f9340707dc4a50d28b16..9c3fd45fd1ec9736b638b45907e585165d4d9057 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -22,9 +22,21 @@ message ProfileRequest {
   // events.
   uint64 max_events = 2;
 
+  // required profiling tools name such as "input_pipeline_analyzer" etc
+  repeated string tools = 3;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
+  // next-field: 4
+}
+
+message ProfileToolData {
+  // The tool's name which this data is associated. (e.g. "input_pipeline".)
+  string name = 1;
+
+  // The data payload (likely json) for the specific tool.
+  bytes data = 2;
 }
 
 message ProfileResponse {
@@ -45,5 +57,8 @@ message ProfileResponse {
   // If the trace covers multiple programs, the longest-running one is analyzed.
   // See op_profile.proto for the detailed semantics of the returned profile.
   tpu.op_profile.Profile op_profile = 4;
-  // next-field: 6
+
+  // Data payload for each required tools.
+  repeated ProfileToolData tool_data = 6;
+  // next-field: 7
 }
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..79a79efb6b62d3e98127558e951ceefd276b580c
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_proto_library(
+    name = "topology_proto",
+    srcs = [
+        "topology.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/contrib/tpu/proto/topology.proto b/tensorflow/contrib/tpu/proto/topology.proto
new file mode 100644
index 0000000000000000000000000000000000000000..17064ee5a2ee241824573d51c8c433c3c6c390b7
--- /dev/null
+++ b/tensorflow/contrib/tpu/proto/topology.proto
@@ -0,0 +1,27 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.tpu;
+
+// Describes the geometry of a TPU mesh.
+message TopologyProto {
+  // The dimensions of the TPU topology, in cores. Typically, this is a 3D
+  // topology [x, y, core], where the major dimensions correspond to TPU chips,
+  // and the minor dimension describes the number of cores on a multicore chip.
+  repeated int32 mesh_shape = 1;
+
+  // Number of TensorFlow tasks in the cluster.
+  int32 num_tasks = 2;
+
+  // Number of TPU devices per task.
+  int32 num_tpu_devices_per_task = 3;
+
+  // A flattened rank 3 int32 array with shape
+  // [num_tasks, num_tpu_devices_per_task, len(mesh_shape)].
+  // `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+  // of TPU devices per task, and the minor dimension corresponds to a position
+  // in the TPU mesh topology. Each entry [task, device, axis] gives the
+  // `axis`-th coordinate in the topology of a task/device pair.
+  repeated int32 device_coordinates = 4;
+}
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee202610a8a8a1406363b3010771e7806d5d84bf
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -0,0 +1,299 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.tpu.python.tpu.topology import Topology
+
+
+def _tpu_device_name(job, task, device):
+  """Returns the device name for the TPU `device` on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:TPU:%d" % (task, device)
+  else:
+    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+
+
+def _tpu_host_device_name(job, task):
+  """Returns the device name for the CPU device on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:CPU:0" % task
+  else:
+    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+
+
+class DeviceAssignment(object):
+  """Mapping from logical cores in a computation to the physical TPU topology.
+
+  Prefer to use the `device_assignment()` helper to construct a
+  `DeviceAssignment`; it is easier if less flexible than constructing a
+  `DeviceAssignment` directly.
+  """
+
+  def __init__(self, topology, core_assignment):
+    """Constructs a `DeviceAssignment` object.
+
+    Args:
+      topology: A `Topology` object that describes the physical TPU topology.
+      core_assignment: A logical to physical core mapping, represented as a
+        rank 3 numpy array. See the description of the `core_assignment`
+        property for more details.
+
+    Raises:
+      ValueError: If `topology` is not `Topology` object.
+      ValueError: If `core_assignment` is not a rank 3 numpy array.
+    """
+    if not isinstance(topology, Topology):
+      raise ValueError("topology must be a Topology object, got {}".format(
+          type(topology)))
+    core_assignment = np.asarray(core_assignment, dtype=np.int32)
+
+    self._topology = topology
+    self._topology_tasks, self._topology_devices = (
+        self._invert_topology(topology))
+
+    topology_rank = self._topology_tasks.ndim
+    if core_assignment.ndim != topology_rank + 2:
+      raise ValueError("core_assignment must be a rank {} numpy array".format(
+          topology_rank + 2))
+
+    self._num_replicas = core_assignment.shape[0]
+    self._computation_shape = np.array(
+        core_assignment.shape[1:-1], dtype=np.int32)
+
+    if core_assignment.shape[-1] != topology_rank:
+      raise ValueError(
+          "minor dimension of core_assignment must have size equal to topology "
+          "rank ({}), got shape {}".format(topology_rank,
+                                           core_assignment.shape))
+
+    self._core_assignment = core_assignment
+
+  def _invert_topology(self, topology):
+    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
+    mesh_shape = topology.mesh_shape
+    tasks = np.full(list(mesh_shape), -1, dtype=np.int32)
+    devices = np.full(list(mesh_shape), -1, dtype=np.int32)
+    for task in xrange(topology.device_coordinates.shape[0]):
+      for device in xrange(topology.device_coordinates.shape[1]):
+        x, y, z = topology.device_coordinates[task, device, :]
+        tasks[x, y, z] = task
+        devices[x, y, z] = device
+    return tasks, devices
+
+  @property
+  def topology(self):
+    """A `Topology` that describes the TPU topology."""
+    return self._topology
+
+  @property
+  def computation_shape(self):
+    """The computation shape.
+
+    Returns:
+      A rank-1 int32 numpy array with size equal to the TPU topology rank.
+      Describes the logical shape in numbers of core of each replica of the
+      computation in the TPU topology.
+
+    Returns:
+      The computation shape.
+    """
+    return self._computation_shape
+
+  @property
+  def num_replicas(self):
+    """The number of replicas of the computation."""
+    return self._num_replicas
+
+  @property
+  def core_assignment(self):
+    """The logical to physical core mapping.
+
+    Returns:
+      A numpy array of rank `topology_rank + 2`, with shape
+      `[num_replicas] + computation_shape + [topology_rank]`. Maps
+      (replica, logical core coordinates) pairs to physical topology
+      coordinates.
+    """
+    return self._core_assignment
+
+  def _coordinates(self, replica, logical_core):
+    """Returns the physical topology coordinates of a logical core."""
+    if logical_core is None:
+      logical_core = np.array([0, 0, 0], np.int32)
+
+    if any(logical_core < 0) or any(logical_core >= self.computation_shape):
+      raise ValueError("Invalid core {}; computation shape is {}".format(
+          logical_core, self.computation_shape))
+
+    logical_offset = tuple([replica] + logical_core.tolist() + [slice(3)])
+    return tuple(self.core_assignment[logical_offset])
+
+  def tpu_ordinal(self, replica=0, logical_core=None):
+    """Returns the ordinal of the TPU device assigned to a logical core."""
+    coordinates = self._coordinates(replica, logical_core)
+    return self._topology_devices[coordinates]
+
+  def host_device(self, replica=0, logical_core=None, job=None):
+    """Returns the CPU device attached to a logical core."""
+    coordinates = self._coordinates(replica, logical_core)
+    return _tpu_host_device_name(job, self._topology_tasks[coordinates])
+
+  def tpu_device(self, replica=0, logical_core=None, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    coordinates = self._coordinates(replica, logical_core)
+    return _tpu_device_name(job, self._topology_tasks[coordinates],
+                            self._topology_devices[coordinates])
+
+
+def device_assignment(topology,
+                      computation_shape=None,
+                      computation_stride=None,
+                      num_replicas=1):
+  """Computes a device_assignment of a computation across a TPU topology.
+
+  Returns a `DeviceAssignment` that describes the cores in the topology assigned
+  to each core of each replica.
+
+  `computation_shape` and `computation_stride` values should be powers of 2 for
+  optimal packing.
+
+  Args:
+    topology: A `Topology` object that describes the TPU cluster topology.
+      To obtain a TPU topology, evaluate the `Tensor` returned by
+      `initialize_system` using `Session.run`. Either a serialized
+      `TopologyProto` or a `Topology` object may be passed. Note: you must
+      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
+    computation_shape: A rank 1 int32 numpy array of size 3, describing the
+      shape of the computation's block of cores. If None, the
+      `computation_shape` is `[1, 1, 1]`.
+    computation_stride: A rank 1 int32 numpy array of size 3, describing the
+      inter-core spacing of the `computation_shape` cores in the TPU topology.
+      If None, the `computation_stride` is `[1, 1, 1]`.
+    num_replicas: The number of computation replicas to run. The replicas will
+      be packed into the free spaces of the topology.
+
+  Returns:
+    A DeviceAssignment object, which describes the mapping between the logical
+    cores in each computation replica and the physical cores in the TPU
+    topology.
+
+  Raises:
+    ValueError: If `topology` is not a valid `Topology` object.
+    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
+      numpy arrays with shape [3] where all values are positive.
+    ValueError: If computation's replicas cannot fit into the TPU topology.
+  """
+  # Deserialize the Topology proto, if it is a string.
+  if isinstance(topology, bytes):
+    topology = Topology(serialized=topology)
+
+  if not isinstance(topology, Topology):
+    raise ValueError("`topology` is not a Topology object; got {}".format(
+        type(topology)))
+
+  topology_rank = len(topology.mesh_shape)
+  mesh_shape = topology.mesh_shape
+  if computation_shape is None:
+    computation_shape = np.array([1, 1, 1], dtype=np.int32)
+  else:
+    computation_shape = np.asarray(computation_shape, dtype=np.int32)
+
+  if computation_stride is None:
+    computation_stride = np.array([1, 1, 1], dtype=np.int32)
+  else:
+    computation_stride = np.asarray(computation_stride, dtype=np.int32)
+
+  if computation_shape.shape != (3,):
+    raise ValueError("computation_shape must have shape [3]; got {}".format(
+        computation_shape.shape))
+  if computation_stride.shape != (3,):
+    raise ValueError("computation_stride must have shape [3]; got {}".format(
+        computation_stride.shape))
+
+  if any(computation_shape < 1):
+    raise ValueError(
+        "computation_shape must be positive; got computation_shape={}".format(
+            computation_shape))
+  if any(computation_stride < 1):
+    raise ValueError(
+        "computation_stride must be positive; got computation_stride={}".format(
+            computation_stride))
+
+  # Computes the physical size of one computation instance.
+  computation_footprint = computation_shape * computation_stride
+  if any(computation_footprint > mesh_shape):
+    raise ValueError(
+        "computation footprint {} does not fit in TPU topology shape {}".format(
+            computation_footprint, mesh_shape))
+
+  # Computes how many copies of the computation footprint fit in the mesh.
+  block_counts = mesh_shape // computation_footprint
+
+  replica_counts = block_counts * computation_stride
+  max_replicas = np.prod(replica_counts)
+  if num_replicas > max_replicas:
+    raise ValueError(
+        "requested {} replicas but only {} replicas with shape {} and "
+        "computation_stride {} fit in a TPU mesh of shape {}".format(
+            num_replicas, max_replicas, computation_shape, computation_stride,
+            mesh_shape))
+
+  # Choose a compact layout for the cores. Choose the smaller dimension in the
+  # topology to be close to the square root of the number of replicas.
+  num_chips = int(math.ceil(num_replicas / replica_counts[2]))
+  target_size = int(math.ceil(math.sqrt(num_chips)))
+
+  # Prefer an even size, if possible. Odd numbered rows head back towards the
+  # first column, so it's best if the last row has an odd index.
+  if target_size % 2 != 0:
+    target_size -= 1
+  y_size = min(replica_counts[1], target_size)
+  if y_size * replica_counts[0] < num_chips:
+    y_size = replica_counts[1]
+
+  # Assigns an offset to each replica such that no two replicas overlap.
+  replica_offsets = np.full([num_replicas, 3], -1, dtype=np.int32)
+  for replica in xrange(num_replicas):
+    # Chooses a replica number in X/Y/Z axes.
+    z = replica % replica_counts[2]
+    t = replica // replica_counts[2]
+    y = t % y_size
+    x = t // y_size
+    replica_pos = np.array([x, y, z], dtype=np.int32)
+
+    # Determines where that replica starts in each axis.
+    outer = replica_pos // computation_stride
+    inner = replica_pos % computation_stride
+    replica_offsets[replica, :] = outer * computation_footprint + inner
+
+  # Computes a complete logical core -> physical core mapping for each replica.
+  indices = [
+      np.arange(0, computation_shape[i] * computation_stride[i],
+                computation_stride[i]) for i in xrange(topology_rank)
+  ]
+  indices = np.concatenate(
+      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
+      axis=-1)
+  assignment = (
+      indices + replica_offsets[:, np.newaxis, np.newaxis, np.newaxis, :])
+  return DeviceAssignment(topology, core_assignment=assignment)
diff --git a/tensorflow/contrib/tpu/python/tpu/test_util.py b/tensorflow/contrib/tpu/python/tpu/test_util.py
deleted file mode 100644
index f30c27f1298e2389fe0daefdd4eece5a03a6976c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tpu/python/tpu/test_util.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ===================================================================
-"""Utilities to ease testing on TPU devices."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-
-from tensorflow.python.client import session
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import variables
-
-
-def has_tpu():
-  """Check if a TPU device is available.
-
-  Device enumeration via `device_lib` currently fails for TPU systems.
-  (http://b/68333779).  To work around this, we determine the existence of a
-  TPU by a successful call to `initialize_system`.
-
-  Returns:
-    boolean, True if a TPU device is available, otherwise False.
-  """
-  def _check():
-    with session.Session() as sess:
-      sess.run(tpu.initialize_system())
-      sess.run(tpu.shutdown_system())
-
-  try:
-    _check()
-    return True
-  except errors.OpError as _:
-    return False
-
-
-def _available_devices():
-  devices = ["cpu"]
-  if not test_util.gpu_device_name():
-    devices.append("gpu")
-
-  if has_tpu():
-    devices.append("tpu")
-
-  return tuple(devices)
-
-
-class TPUTestCase(test_util.TensorFlowTestCase):
-  """Adds helpers for testing on TPU devices to `TensorFlowTestCase`.
-
-  Example usage:
-
-  ```
-  def model_fn(features):
-  return tf.reduce_sum(features * 2)
-
-  class ModelTests(test_util.TPUTestCase):
-    def test_sum(self):
-      v = np.random.randn(10, 10).astype("float32")
-      self.assert_device_output(model_fn, [v], (v*2).sum(),
-                                devices=("cpu", "tpu"))
-  ```
-  """
-
-  def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
-    super(TPUTestCase, self).__init__(methodName)
-    self._available_devices = _available_devices()
-
-  def run_on_device(self, model_fn, model_inputs, device):
-    """Runs `model_fn` on the given device.
-
-    Raises an exception if no such device is available.  `model_fn` should
-    return one or more tensors as a list or tuple.
-
-    Args:
-      model_fn: Function returning one or more tensors.
-      model_inputs: An iterable of Numpy arrays or scalars.
-                    These will be passed as arguments to `model_fn`.
-      device: Device to run on.  One of ("tpu", "gpu", "cpu").
-
-    Returns:
-      Output from the model function.
-    """
-    def _make_placeholders():
-      return dict(
-          [(gen_array_ops.placeholder_with_default(v, v.shape), v)
-           for v in model_inputs])
-
-    if device == "tpu":
-      with self.test_session(graph=ops.Graph()) as sess:
-        placeholders = _make_placeholders()
-        tpu_computation = tpu.rewrite(model_fn, placeholders.keys())
-        sess.run(tpu.initialize_system())
-        sess.run(variables.global_variables_initializer())
-        result = sess.run(tpu_computation, placeholders)
-        sess.run(tpu.shutdown_system())
-        # TODO(b/36891278): supports non-flat returns lists in tpu.rewrite().
-        if len(result) == 1:
-          return result[0]
-        return result
-    elif device == "gpu":
-      with self.test_session(graph=ops.Graph(), use_gpu=True) as sess:
-        placeholders = _make_placeholders()
-        sess.run(variables.global_variables_initializer())
-        return sess.run(model_fn(placeholders.keys()), placeholders)
-    elif device == "cpu":
-      # TODO(power) -- will this interact poorly with cached GPU sessions?
-      with self.test_session(graph=ops.Graph(), use_gpu=False) as sess:
-        placeholders = _make_placeholders()
-        sess.run(variables.global_variables_initializer())
-        return sess.run(model_fn(placeholders.keys()), placeholders)
-
-  def _compare_values(self, actual_outputs, expected_outputs):
-    if isinstance(expected_outputs, (list, tuple)):
-      for a, b in zip(actual_outputs, expected_outputs):
-        self.assertAllCloseAccordingToType(a, b)
-    else:
-      self.assertAllCloseAccordingToType(actual_outputs, expected_outputs)
-
-  def assert_device_output(self, model_fn, model_inputs, expected_outputs,
-                           devices=("cpu", "gpu", "tpu")):
-    """Run `model_fn` on the given devices.
-
-    Results are compared via `assertAllCloseAccordingToType`.
-
-    Args:
-      model_fn: Function returning one or more tensors
-      model_inputs: Numpy arrays or scalars passed as arguments to model_fn
-      expected_outputs: Numpy arrays or scalars to compare against.
-      devices: Set of devices to run on.  If a device is not available, tests
-               will be skipped for that device.
-    """
-    devices = set(devices).intersection(self._available_devices)
-
-    for device in devices:
-      device_out = self.run_on_device(model_fn, model_inputs, device=device)
-      self._compare_values(device_out, expected_outputs)
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..cda9a63f204ed686b527c95dd5b4fd7786ac60cf
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -0,0 +1,137 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Defines the `Topology` class, that describes a TPU fabric topology."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tpu.proto import topology_pb2
+
+
+class Topology(object):
+  """Describes a set of TPU devices.
+
+  Represents both the shape of the physical mesh, and the mapping between
+  TensorFlow TPU devices to physical mesh coordinates.
+  """
+
+  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
+    """Builds a Topology object.
+
+    If `serialized` is not `None`, the topology is parsed from `serialized` and
+    the other arguments are ignored. Otherwise, the topology is computed from
+    `mesh_shape` and `device_coordinates`.
+
+    Args:
+      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
+        serialized proto is parsed to discover the topology.
+      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
+        the shape of the TPU topology, in number of cores. Ignored if
+        `serialized` is not `None`.
+      device_coordinates: A rank 3 numpy array that describes the mapping from
+        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
+        if `serialized is not `None`.
+
+    Raises:
+      ValueError: If `serialized` does not describe a well-formed topology.
+      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
+        of 3 positive integers.
+      ValueError: If `serialized` is `None` and `device_coordinates` is not a
+        rank 3 numpy int32 array that describes a valid coordinate mapping.
+    """
+
+    if serialized:
+      self._serialized = serialized
+      self._parse_topology(serialized)
+    else:
+      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
+      self._device_coordinates = np.asarray(device_coordinates, np.int32)
+      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
+                         "entries; got {}".format(self._mesh_shape))
+
+      if (len(self._device_coordinates.shape) != 3 or
+          self._device_coordinates.shape[2] != len(self._mesh_shape)):
+        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
+                         "with minor dimension equal to the mesh shape rank")
+
+  def _parse_topology(self, serialized):
+    """Parses a serialized `TopologyProto` into `self`."""
+    proto = topology_pb2.TopologyProto()
+    proto.ParseFromString(serialized)
+
+    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
+    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
+                       "entries; got {}".format(self._mesh_shape))
+
+    if proto.num_tasks < 0:
+      raise ValueError("`num_tasks` must be >= 0; got {}".format(
+          proto.num_tasks))
+    if proto.num_tpu_devices_per_task < 0:
+      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
+          proto.num_tpu_devices_per_task))
+
+    expected_coordinates_size = (
+        proto.num_tasks * proto.num_tpu_devices_per_task * len(
+            proto.mesh_shape))
+    if len(proto.device_coordinates) != expected_coordinates_size:
+      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
+                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
+                       "got shape {}".format(proto.num_tasks,
+                                             proto.num_tpu_devices_per_task,
+                                             proto.mesh_shape,
+                                             len(proto.device_coordinates)))
+
+    coords = np.array(proto.device_coordinates, dtype=np.int32)
+    if any(coords < 0):
+      raise ValueError("`device_coordinates` must be >= 0")
+    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
+                             len(proto.mesh_shape)))
+    self._device_coordinates = coords
+
+  @property
+  def mesh_shape(self):
+    """A rank 1 int32 array describing the shape of the TPU topology."""
+    return self._mesh_shape
+
+  @property
+  def device_coordinates(self):
+    """Describes the mapping from TPU devices to topology coordinates.
+
+    Returns:
+      A rank 3 int32 array with shape `[tasks, devices, axis]`.
+      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+      of TPU devices per task, and `axis` is the number of axes in the TPU
+      cluster topology. Each entry gives the `axis`-th coordinate in the
+      topology of a task/device pair. TPU topologies are 3-dimensional, with
+      dimensions `(x, y, core number)`.
+    """
+    return self._device_coordinates
+
+  def serialized(self):
+    """Returns the serialized form of the topology."""
+    if self._serialized is None:
+      proto = topology_pb2.TopologyProto()
+      proto.mesh_shape[:] = list(self._mesh_shape)
+      proto.num_tasks = self._device_coordinates.shape[0]
+      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
+      proto.device_coordinates = list(self._device_coordinates.flatten())
+      self._serialized = proto.SerializeToString()
+
+    return self._serialized
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 338a4304f3272f3486c88e6e2aeb90fec15e4f58..7fb8a33698fdd2b37f42464e934331de65904bfe 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -19,7 +19,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
@@ -30,13 +29,43 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+
+
+# Operations that indicate some error in the users graph, e.g. a placeholder
+# that's introduced outside of the infeed.
+_BLACKLISTED_OPS = set([
+    "Placeholder",
+])
+
+# These operations will currently fail to compile, but we should be able to
+# support them eventually via CPU offload or extending our operation set.
+_NOT_IMPLEMENTED_OPS = set([
+    "AudioSummary",
+    "AudioSummaryV2",
+    "HistogramSummary",
+    "ImageSummary",
+    "MergeSummary",
+    "Print",
+    "ScalarSummary",
+    "TensorSummary",
+    "TensorSummaryV2",
+    ])
+
+
+def _tpu_system_device_name(job):
+  """Returns the device name for the TPU_SYSTEM device of `job`."""
+  if job is None:
+    return "/device:TPU_SYSTEM:0"
+  else:
+    return "/job:%s/device:TPU_SYSTEM:0" % job
 
 
 def initialize_system(embedding_config=None, job=None):
   """Initializes a distributed TPU system for use with TensorFlow.
 
   Args:
-    embedding_config: If not None, an EmbeddingLayerConfiguration proto
+    embedding_config: If not None, an `EmbeddingLayerConfiguration` proto
       describing the desired configuration of the hardware embedding lookup
       tables. If embedding_config is None, no hardware embeddings can be used.
     job: The job (the XXX in TensorFlow device specification /job:XXX)
@@ -44,27 +73,18 @@ def initialize_system(embedding_config=None, job=None):
       it is assumed there is only one job in the TensorFlow flock, and an
       error will be returned if this assumption does not hold.
   Returns:
-    Op which, when executed, will initialize the system.
+    A serialized `TopologyProto` that describes the TPU system. Note:
+      the topology must be evaluated using `Session.run` before it can be used.
   """
-  if job is None:
-    device_name = "/device:TPU_SYSTEM:0"
-  else:
-    device_name = "/job:%s/device:TPU_SYSTEM:0" % job
   config_string = ("" if embedding_config is None else
                    embedding_config.SerializeToString())
-  with ops.device(device_name):
-    init_distributed_tpu = tpu_ops.configure_distributed_tpu(
-        embedding_config=config_string)
-  return init_distributed_tpu
+  with ops.device(_tpu_system_device_name(job)):
+    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
 
 
 def shutdown_system(job=None):
   """Shuts down a running a distributed TPU system."""
-  if job is None:
-    device_name = "/device:TPU_SYSTEM:0"
-  else:
-    device_name = "/job:%s/device:TPU_SYSTEM:0" % job
-  with ops.device(device_name):
+  with ops.device(_tpu_system_device_name(job)):
     shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
   return shutdown_distributed_tpu
 
@@ -76,43 +96,24 @@ def core(num):
     num: the virtual core number within each replica to which operators should
     be assigned.
   Returns:
-    A device name, suitable for passing to tf.device().
+    A device name, suitable for passing to `tf.device()`.
   """
   return "device:TPU_REPLICATED_CORE:{}".format(num)
 
 
-# Experimental API to 'break out' of a tpu.rewrite() (or shard(), etc.) context.
-# In
-#
-# XXX
-# with tpu.rewrite(...):
-#   YYY
-#   with tpu.outside_all_rewrites():
-#     ZZZ
-#
-# the Ops in ZZZ are added outside the scope of the rewrite().
-# TODO(phawkins): currently outside_all_rewrites() pops out of all nested
-# control flow scopes, for example loops. It would make more sense if it only
-# popped out of a single scope.
-@contextlib.contextmanager
-def outside_all_rewrites():
-  """Experimental API to 'break out' of a tpu.rewrite() (or shard(), etc.)."""
-  with ops.control_dependencies(None):
-    yield
-
-
 class TPUReplicateContext(control_flow_ops.ControlFlowContext):
-  """A ControlFlowContext for nodes inside a TPU computation.
+  """A `ControlFlowContext` for nodes inside a TPU computation.
 
-  The primary role of TPUReplicateContext is to mark operators inside a
+  The primary role of `TPUReplicateContext` is to mark operators inside a
   tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
   is a unique name.
 
-  We use a ControlFlowContext to perform the annotation since it
+  We use a `ControlFlowContext` to perform the annotation since it
   integrates with Tensorflow constructs like ResourceVariables. For example,
-  if a ResourceVariable is constructed inside a tpu.replicate() block, the
-  ResourceVariable implementation can use "with ops.control_dependencies(None)"
-  to build the variable's definition outside the replicated computation.
+  if a `ResourceVariable` is constructed inside a tpu.replicate() block, the
+  `ResourceVariable` implementation can use
+  `with ops.control_dependencies(None)` to build the variable's definition
+  outside the replicated computation.
   """
 
   def __init__(self, name):
@@ -124,6 +125,14 @@ class TPUReplicateContext(control_flow_ops.ControlFlowContext):
 
   def _AddOpInternal(self, op):
     # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_OPS:
+      raise ValueError("Operation of type %s (%s) is not supported on the TPU" %
+                       (op.type, op.name))
+
+    if op.type in _NOT_IMPLEMENTED_OPS:
+      logging.warning(
+          "Operation %s (%s) is not currently supported", op.type, op.name)
+
     if any(x.dtype._is_ref_dtype for x in op.inputs):
       raise NotImplementedError(
           "Non-resource Variables are not supported inside TPU computations "
@@ -158,37 +167,47 @@ class TPUReplicateContext(control_flow_ops.ControlFlowContext):
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
-              global_tpu_id=None,
+              device_assignment=None,
               name=None):
   """Builds a graph operator that runs a replicated TPU computation.
 
   Args:
-    computation: a Python function that builds the computation to replicate.
-    inputs: a list of lists of input tensors or None (equivalent to
-      [[]]), indexed by [replica_num][input_num]. All replicas must
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
       have the same number of inputs.
-    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to computation.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: The name of the operator.
   Returns:
-    A list of lists of output tensors, indexed by [replica_num][output_num].
+    A list of lists of output tensors, indexed by `[replica_num][output_num]`.
   Raises:
-    ValueError: if all replicas do not have equal numbers of input tensors.
-    ValueError: if the number of inputs per replica does not match
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
       the number of formal parameters to `computation`.
   """
   if name is None:
     name = "TPUReplicate"
   inputs = [[]] if inputs is None else inputs
 
-  if global_tpu_id is not None:
-    # Turn the Numpy array into a flattened list.
-    global_tpu_id = global_tpu_id.flatten().tolist()
+  metadata_kwargs = {}
+  if device_assignment is not None:
+    # Turn the Numpy array into a flattened list so we can pass it as an
+    # operator attribute.
+    metadata_kwargs = {
+        "topology":
+            device_assignment.topology.serialized(),
+        "device_assignment":
+            device_assignment.core_assignment.flatten().tolist(),
+        "computation_shape":
+            device_assignment.computation_shape.tolist()
+    }
 
   if ((not isinstance(inputs, list)) or
       any(not isinstance(inp, (list, tuple)) for inp in inputs)):
@@ -251,7 +270,7 @@ def replicate(computation,
       context.Enter()
 
       metadata = tpu_ops.tpu_replicate_metadata(
-          num_replicas=num_replicas, global_tpu_id=global_tpu_id)
+          num_replicas=num_replicas, **metadata_kwargs)
 
       with tpu_function.tpu_shard_context(
           num_replicas), ops.control_dependencies([metadata]):
@@ -319,8 +338,11 @@ def replicate(computation,
       # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
       # be rewritten away, leading to a runtime error.
       # TODO(phawkins): extend the rewrite to elide these nodes instead.
-      with ops.device(core(0)):
-        output_tensors = [array_ops.identity(x) for x in output_tensors]
+      new_output_tensors = []
+      for t in output_tensors:
+        with ops.device(t.device if t.device else core(0)):
+          new_output_tensors.append(array_ops.identity(t))
+      output_tensors = new_output_tensors
     finally:
       context.Exit()
 
@@ -355,7 +377,7 @@ def shard(computation,
           outputs_from_all_shards=True,
           output_shard_axes=None,
           infeed_queue=None,
-          global_tpu_id=None,
+          device_assignment=None,
           name=None):
   """Shards `computation` for parallel execution.
 
@@ -383,39 +405,40 @@ def shard(computation,
   Inputs and outputs of the computation must be at least rank-1 Tensors.
 
   Args:
-    computation: a Python function that builds a computation to apply to each
+    computation: A Python function that builds a computation to apply to each
       shard of the input.
-    inputs: a list of input tensors or None (equivalent to an empty
+    inputs: A list of input tensors or None (equivalent to an empty
       list). Each input tensor has a corresponding shard axes, given
       by `input_shard_axes`, which must have size divisible by
       `num_shards`.
-    num_shards: the number of shards.
-    input_shard_axes: a list of dimensions along which to shard `inputs`, or
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
       `None`. `None` means "shard all inputs along dimension 0". If not `None`,
       there must be one dimension per input.
-    outputs_from_all_shards: boolean or list of boolean. For each output, if
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
       `True`, outputs from all shards are concatenated along the corresponding
       `output_shard_axes` entry. Otherwise, each output is taken
       from an arbitrary shard. If the argument is a boolean, the argument's
       value is used for each output.
-    output_shard_axes: a list of dimensions along which to concatenate the
+    output_shard_axes: A list of dimensions along which to concatenate the
       outputs of `computation`, or `None`. `None` means "concatenate all outputs
       along dimension 0". If not `None`, there must be one dimension per output.
       Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: if not None, the InfeedQueue to use to augment the inputs of
-      `computation`.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: The name of the operator.
   Returns:
     A list of output tensors.
   Raises:
-    ValueError: if num_shards <= 0
-    ValueError: if len(input_shard_axes) != len(inputs)
-    ValueError: if len(output_shard_axes) != len(outputs from `computation`)
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
   """
 
   if num_shards <= 0:
@@ -446,7 +469,7 @@ def shard(computation,
       computation,
       transposed_inputs,
       infeed_queue=infeed_queue,
-      global_tpu_id=global_tpu_id,
+      device_assignment=device_assignment,
       name=name)
 
   # There must be at least one shard since num_shards > 0.
@@ -500,7 +523,7 @@ def batch_parallel(computation,
                    inputs=None,
                    num_shards=1,
                    infeed_queue=None,
-                   global_tpu_id=None,
+                   device_assignment=None,
                    name=None):
   """Shards `computation` along the batch dimension for parallel execution.
 
@@ -524,55 +547,55 @@ def batch_parallel(computation,
   Inputs and outputs of the computation must be at least rank-1 Tensors.
 
   Args:
-    computation: a Python function that builds a computation to apply to each
+    computation: A Python function that builds a computation to apply to each
       shard of the input.
-    inputs: a list of input tensors or None (equivalent to an empty
+    inputs: A list of input tensors or None (equivalent to an empty
       list). The 0-th dimension of each Tensor must have size
       divisible by `num_shards`.
-    num_shards: the number of shards.
-    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+    num_shards: The number of shards.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: The name of the operator.
   Returns:
     A list of output tensors.
   Raises:
-    ValueError: if num_shards <= 0
+    ValueError: If `num_shards <= 0`
   """
   return shard(
       computation,
       inputs,
       num_shards=num_shards,
       infeed_queue=infeed_queue,
-      global_tpu_id=global_tpu_id,
+      device_assignment=device_assignment,
       name=name)
 
 
 def rewrite(computation,
             inputs=None,
             infeed_queue=None,
-            global_tpu_id=None,
+            device_assignment=None,
             name=None):
   """Rewrites `computation` for execution on a TPU system.
 
   Args:
-    computation: a Python function that builds a computation to apply
+    computation: A Python function that builds a computation to apply
       to the input. If the function takes n inputs, 'inputs' should be
       a list of n tensors. If the function returns m outputs, rewrite
       will return a list of m tensors.
-    inputs: a list of input tensors or None (equivalent to an empty list).
-    infeed_queue: if not None, the InfeedQueue from which to append a tuple
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
-    global_tpu_id: if not None, a Numpy 2D array indicating the global
-      id of each TPU device in the system. The outer dimension of the
-      array is host task id, and the inner dimension is device ordinal,
-      so e.g., global_tpu_id[x][y] indicates the global id of device
-      /task:x/device:TPU_NODE:y.
-    name: name of the operator.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: The name of the operator.
   Returns:
     A list of output tensors.
   """
@@ -585,6 +608,6 @@ def rewrite(computation,
       computation,
       None if inputs is None else [inputs],
       infeed_queue=infeed_queue,
-      global_tpu_id=global_tpu_id,
+      device_assignment=device_assignment,
       name=name)[0]
   # pylint: enable=indexing-exception
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 3965c087a18dc18298703fad9b1dda9c85c56271..916b9b3082fc197694933bdd6042706891be115c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -45,10 +45,7 @@ class TPUConfig(
       is invoked once on each host. To be precise, with a global batch size
       `train_batch_size` in `TPUEstimator` constructor, the batch size for each
       shard is `train_batch_size` // #hosts. With Per-Core input pipeline
-      deployment, the shard batch size is `train_batch_size` // #cores.  Note
-      that this only works for single-host TPU training now (tracked in
-      b/67051042). For multi-host, please use Per-Core, i.e., `False` for
-      `per_host_input_for_training`.
+      deployment, the shard batch size is `train_batch_size` // #cores.
     tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
       within TPUEstimator, however when using ClusterSpec propagation in more
       esoteric cluster configurations, you may need to specify the job name as a
@@ -109,3 +106,12 @@ class RunConfig(run_config_lib.RunConfig):
   @property
   def tpu_config(self):
     return self._tpu_config
+
+  def replace(self, **kwargs):
+    if 'tpu_config' not in kwargs:
+      return super(RunConfig, self).replace(**kwargs)
+
+    tpu_config = kwargs.pop('tpu_config')
+    new_instance = super(RunConfig, self).replace(**kwargs)
+    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
+    return new_instance
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 5a3b8314291951b5dfce091dccb0dc9e5f7af3b5..84a4208be35d7056fa8a9a38e9df40f424128412 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -23,6 +23,8 @@ import collections
 from contextlib import contextmanager
 import copy
 import threading
+import time
+
 import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 
@@ -232,8 +234,10 @@ class _TPUContext(object):
                          mode == model_fn_lib.ModeKeys.TRAIN
                          else self._eval_batch_size)
     # On TPU
-    return (global_batch_size // self.num_cores
-            if self.is_input_sharded_per_core() else global_batch_size)
+    if self.is_input_sharded_per_core():
+      return global_batch_size // self.num_cores
+    else:
+      return global_batch_size // self.num_hosts
 
   @property
   def batch_size_for_model_fn(self):
@@ -488,11 +492,29 @@ class _InfeedThreadController(_InfeedOutfeedThreadBaseController):
           count += 1
 
     except Exception:  # pylint: disable=broad-except
+      # Close the session to avoid the main thread from hanging. If input
+      # pipeline triggers any error, the infeed thread dies but the main thread
+      # for TPU computation waits for the infeed enqueue forever. Close the
+      # Session to cancel the main thread Session.run execution.
+      #
+      # However, sleep for 2 minutes before explicit closing to give some time
+      # for the TPU compilation error, if any, propagating, from TPU to CPU
+      # host. Compilation errors should be reported by the main thread so that
+      # the program can be interrupted and users can take action.  Due to a race
+      # condition, the infeed thread might see an error first.  Closing the
+      # session here immediately would result in a session cancellation
+      # exception in the main thread, instead of the expected compile error.
+      # User code that depends on having the proper exception type will
+      # therefore be confused.
       logging.error(
           'Failed running infeed, closing session.\n'
-          'You may see an exception from your main session after this.',
+          'You may see an exception from your main session after this. '
+          'Sleep for 2 minutes before close Session from infeed thread to '
+          'allow the main thread returning an error first, if any.',
           exc_info=1
       )
+      time.sleep(120)
+      logging.error('Closing the failed session.')
       session.close()
 
   def join(self):
@@ -535,13 +557,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
           session, self._dequeue_ops)
 
   def before_run(self, run_context):
-    logging.info('Enqueue next batch of data to infeed.')
-
     iterations = run_context.session.run(self._iterations_per_loop_var)
+
+    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+
     self._infeed_thd_controller.send_next_batch_signal(iterations)
     if self._dequeue_ops is not None:
       # TODO(xiejw): Refactor the outfeed dequeue into tf.while_loop.
-      logging.info('Dequeue next batch of data from outfeed.')
+      logging.info(
+          'Dequeue next (%d) batch(es) of data from outfeed.', iterations)
       self._outfeed_thd_controller.send_next_batch_signal(iterations)
 
   def end(self, session):
@@ -680,6 +704,40 @@ def generate_per_core_enqueue_ops_fn_for_host(
   return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
 
 
+def generate_per_host_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  infeed_queue_holder = {'instance': None}
+
+  def enqueue_ops_fn():
+    with ops.device(device):
+      num_cores_per_host = ctx.num_of_cores_per_host
+      inputs = input_fn()
+      if isinstance(inputs, tuple):
+        features, labels = inputs
+      else:
+        features, labels = inputs, None
+      inputs_structure_recorder.validate_and_record_structure(
+          features, labels)
+      unsharded_tensor_list = (
+          inputs_structure_recorder.flatten_features_and_labels(
+              features, labels))
+
+      infeed_queue = tpu_feed.InfeedQueue(
+          tuple_types=[t.dtype for t in unsharded_tensor_list],
+          tuple_shapes=[t.shape for t in unsharded_tensor_list],
+          shard_dimensions=batch_axis)
+      infeed_queue_holder['instance'] = infeed_queue
+      infeed_queue.set_number_of_shards(num_cores_per_host)
+
+      per_host_enqueue_ops = (
+          infeed_queue.split_inputs_and_generate_enqueue_ops(
+              unsharded_tensor_list,
+              placement_function=lambda x: device))
+      return per_host_enqueue_ops
+  return enqueue_ops_fn, (lambda: infeed_queue_holder['instance'])
+
+
 class _InputPipeline(object):
   """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
 
@@ -842,6 +900,8 @@ class _InputPipeline(object):
     # structure is recorded.
     enqueue_ops = self._invoke_input_fn_and_record_structure()
 
+    self._validate_input_pipeline()
+
     def dequeue_fn():
       """dequeue_fn is used by TPU to retrieve the tensors."""
       values = self._infeed_queue.generate_dequeue_op()
@@ -852,15 +912,15 @@ class _InputPipeline(object):
     return (enqueue_ops, dequeue_fn)
 
   def _invoke_input_fn_and_record_structure(self):
+    """Deploys the input pipeline and record input structure."""
+    enqueue_ops = []
+    infeed_queues = []
+    num_hosts = self._ctx.num_hosts
+    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
     if self._sharded_per_core:
       # Per-Core input pipeline deployment.
-      tpu_host_placement_fn = self._ctx.tpu_host_placement_function
-      enqueue_ops = []
-      infeed_queues = []
-
       # Invoke input pipeline for each core and placed on the corresponding
       # host.
-      num_hosts = self._ctx.num_hosts
       for host_id in range(num_hosts):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
@@ -877,48 +937,43 @@ class _InputPipeline(object):
             # Infeed_queue_getter must be called after enqueue_ops_fn is called.
             infeed_queues.append(infeed_queue_getter())
 
-      # infeed_queue is used to generate dequeue ops. The only thing it uses for
-      # dequeue is dtypes and types. So, any one can be used. Here, grab the
-      # first one.
-      self._infeed_queue = infeed_queues[0]
-      return enqueue_ops
-
     else:
-      # TODO(b/67051042): Extend this to multi-host support.
-      host_id = 0
-      host_device = self._ctx.tpu_host_placement_function(host_id=host_id)
-      def enqueue_fn():
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            inputs = self._input_fn()
-            if isinstance(inputs, tuple):
-              features, labels = inputs
-            else:
-              features, labels = inputs, None
-            self._inputs_structure_recorder.validate_and_record_structure(
-                features, labels)
-            unsharded_tensor_list = (
-                self._inputs_structure_recorder.flatten_features_and_labels(
-                    features, labels))
-
-            self._infeed_queue = tpu_feed.InfeedQueue(
-                tuple_types=[t.dtype for t in unsharded_tensor_list],
-                tuple_shapes=[t.shape for t in unsharded_tensor_list],
-                shard_dimensions=self._batch_axis)
-            self._infeed_queue.set_number_of_shards(self._ctx.num_cores)
-
-            def placement_fn(core_id):
-              return self._ctx.tpu_host_placement_function(core_id=core_id)
-            return (
-                self._infeed_queue.split_inputs_and_generate_enqueue_ops(
-                    unsharded_tensor_list,
-                    placement_function=placement_fn))
+            enqueue_ops_fn, infeed_queue_getter = (
+                generate_per_host_enqueue_ops_fn_for_host(
+                    self._ctx, self._input_fn, self._inputs_structure_recorder,
+                    self._batch_axis, host_device))
 
+            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+              enqueue_ops.append(_wrap_computation_in_while_loop(
+                  device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            infeed_queues.append(infeed_queue_getter())
+    # infeed_queue is used to generate dequeue ops. The only thing it uses for
+    # dequeue is dtypes and types. So, any one can be used. Here, grab the
+    # first one.
+    self._infeed_queue = infeed_queues[0]
+    return enqueue_ops
+
+  def _validate_input_pipeline(self):
+    # Perform some sanity checks to log user friendly information. We should
+    # error out to give users better error message. But, if
+    # _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
+    # user code, so, log a warning.
+    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
+      err_msg = ('Input pipeline contains one or more QueueRunners. '
+                 'It could be slow and not scalable. Please consider '
+                 'converting your input pipeline to use `tf.data` instead (see '
+                 'https://www.tensorflow.org/programmers_guide/datasets for '
+                 'instructions.')
       if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-        return _wrap_computation_in_while_loop(device=host_device,
-                                               op_fn=enqueue_fn)
+        raise RuntimeError(err_msg)
       else:
-        return enqueue_fn()
+        logging.warn(err_msg)
 
 
 class _ModelFnWrapper(object):
@@ -1396,12 +1451,6 @@ class TPUEstimator(estimator_lib.Estimator):
               'eval batch size {} must be divisible by number of shards {}'
               .format(eval_batch_size, config.tpu_config.num_shards))
 
-      if (config.tpu_config.num_shards > 8 and
-          config.tpu_config.per_host_input_for_training):
-        # TODO(b/67051042): Support per_host input pipelines when num_shards > 8
-        raise NotImplementedError(
-            'Per-host input pipelines only available for num_shards <= 8')
-
     # Verifies the model_fn signature according to Estimator framework.
     estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
     # We cannot store config and params in this constructor as parent
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index 1c8ea63f00ba4b2298abd8053a7fe8702b6fc0bc..42ac6eb680437ec82287468bcba2b770ac0e5749 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -513,7 +513,7 @@ class InfeedQueue(object):
   # for automatic placement of input pipelines.
   def split_inputs_and_generate_enqueue_ops(self,
                                             inputs,
-                                            global_tpu_id=None,
+                                            device_assignment=None,
                                             placement_function=None,
                                             tpu_ordinal_function=None):
     """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
@@ -536,14 +536,12 @@ class InfeedQueue(object):
     Args:
       inputs: a list of Tensors which indicates the types and shapes of the
         queue tuple.
-     global_tpu_id: if not None, a Numpy 2D array indicating the global
-        id of each TPU device in the system. The outer dimension of the
-        array is host task id, and the inner dimension is device ordinal,
-        so e.g., global_tpu_id[x][y] indicates the global id of device
-        /task:x/device:TPU_NODE:y. If global_tpu_id is not None, but
-        placement_function and ordinal_function are None, then global_tpu_id
-        will be used to place infeed on the TPUs with the first k global ids,
-        where k is the number of shards in the queue.
+     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
+        device_assignment is not `None`, but `placement_function` and
+        `ordinal_function` are None, then `device_assignment` will be used to
+        place infeeds on the first k TPU shards, where k is the number of shards
+        in the queue. If all three are `None`, then default placement and
+        ordinal functions are used.
       placement_function: if not None, a function that takes the shard
         index as input and returns a device string indicating which
         device the shard's infeed should be placed on. If placement_function
@@ -567,22 +565,18 @@ class InfeedQueue(object):
         types of the elements of inputs are not compatible with the frozen
         configuration.
     """
-    if global_tpu_id is None:
+    if device_assignment is None:
       if placement_function is None:
         placement_function = self._default_placement_function
       if tpu_ordinal_function is None:
         tpu_ordinal_function = self._default_ordinal_function
     else:
-      global_id_map = {}
-      for host, devices in enumerate(global_tpu_id):
-        for ordinal, global_id in enumerate(devices):
-          global_id_map[global_id] = (host, ordinal)
 
       def _placement_function_from_map(index):
-        return "/task:%d/device:CPU:0" % global_id_map[index][0]
+        return device_assignment.host_device(replica=index)
 
       def _ordinal_function_from_map(index):
-        return global_id_map[index][1]
+        return device_assignment.tpu_ordinal(replica=index)
 
       if placement_function is None:
         placement_function = _placement_function_from_map
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 391899b34f90be25e10450ebf4e285ed2d39446f..80de0f6eb7e36a1c86f7d44e4053a9757b09f0ae 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import json
+import numbers
 import re
 
 import six
@@ -76,7 +77,7 @@ def _process_scalar_value(name, parse_fn, var_type, m_dict, values,
       function.
 
   Raises:
-    ValueError: If the name has already been sued.
+    ValueError: If the name has already been used.
   """
   try:
     parsed_value = parse_fn(m_dict['val'])
@@ -138,6 +139,54 @@ def _process_list_value(name, parse_fn, var_type, m_dict, values,
     _parse_fail(name, var_type, m_dict['vals'], values)
 
 
+def _cast_to_type_if_compatible(name, param_type, value):
+  """Cast hparam to the provided type, if compatible.
+
+  Args:
+    name: Name of the hparam to be cast.
+    param_type: The type of the hparam.
+    value: The value to be cast, if compatible.
+
+  Returns:
+    The result of casting `value` to `param_type`.
+
+  Raises:
+    ValueError: If the type of `value` is not compatible with param_type.
+      * If `param_type` is a string type, but `value` is not.
+      * If `param_type` is a boolean, but `value` is not, or vice versa.
+      * If `param_type` is an integer type, but `value` is not.
+      * If `param_type` is a float type, but `value` is not a numeric type.
+  """
+  fail_msg = (
+      "Could not cast hparam '%s' of type '%s' from value %r" %
+      (name, param_type, value))
+
+  # Some callers use None, for which we can't do any casting/checking. :(
+  if issubclass(param_type, type(None)):
+    return value
+
+  # Avoid converting a non-string type to a string.
+  if (issubclass(param_type, (six.string_types, six.binary_type)) and
+      not isinstance(value, (six.string_types, six.binary_type))):
+    raise ValueError(fail_msg)
+
+  # Avoid converting a number or string type to a boolean or vice versa.
+  if issubclass(param_type, bool) != isinstance(value, bool):
+    raise ValueError(fail_msg)
+
+  # Avoid converting float to an integer (the reverse is fine).
+  if (issubclass(param_type, numbers.Integral) and
+      not isinstance(value, numbers.Integral)):
+    raise ValueError(fail_msg)
+
+  # Avoid converting a non-numeric type to a numeric type.
+  if (issubclass(param_type, numbers.Number) and
+      not isinstance(value, numbers.Number)):
+    raise ValueError(fail_msg)
+
+  return param_type(value)
+
+
 def parse_values(values, type_map):
   """Parses hyperparameter values from a string into a python map.
 
@@ -438,17 +487,18 @@ class HParams(object):
     Raises:
       ValueError: If there is a type mismatch.
     """
-    _, is_list = self._hparam_types[name]
+    param_type, is_list = self._hparam_types[name]
     if isinstance(value, list):
       if not is_list:
         raise ValueError(
             'Must not pass a list for single-valued parameter: %s' % name)
-      setattr(self, name, value)
+      setattr(self, name, [
+          _cast_to_type_if_compatible(name, param_type, v) for v in value])
     else:
       if is_list:
         raise ValueError(
             'Must pass a list for multi-valued parameter: %s.' % name)
-      setattr(self, name, value)
+      setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
 
   def parse(self, values):
     """Override hyperparameter values, parsing new values from a string.
@@ -500,13 +550,26 @@ class HParams(object):
   def get_model_structure(self):
     return self._model_structure
 
-  def to_json(self):
+  def to_json(self, indent=None, separators=None, sort_keys=False):
     """Serializes the hyperparameters into JSON.
 
+    Args:
+      indent: If a non-negative integer, JSON array elements and object members
+        will be pretty-printed with that indent level. An indent level of 0, or
+        negative, will only insert newlines. `None` (the default) selects the
+        most compact representation.
+      separators: Optional `(item_separator, key_separator)` tuple. Default is
+        `(', ', ': ')`.
+      sort_keys: If `True`, the output dictionaries will be sorted by key.
+
     Returns:
       A JSON string.
     """
-    return json.dumps(self.values())
+    return json.dumps(
+        self.values(),
+        indent=indent,
+        separators=separators,
+        sort_keys=sort_keys)
 
   def parse_json(self, values_json):
     """Override hyperparameter values, parsing new values from a json object.
@@ -532,6 +595,33 @@ class HParams(object):
     """
     return {n: getattr(self, n) for n in self._hparam_types.keys()}
 
+  def get(self, key, default=None):
+    """Returns the value of `key` if it exists, else `default`."""
+    if key in self._hparam_types:
+      # Ensure that default is compatible with the parameter type.
+      if default is not None:
+        param_type, is_param_list = self._hparam_types[key]
+        type_str = 'list<%s>' % param_type if is_param_list else str(param_type)
+        fail_msg = ("Hparam '%s' of type '%s' is incompatible with "
+                    'default=%s' % (key, type_str, default))
+
+        is_default_list = isinstance(default, list)
+        if is_param_list != is_default_list:
+          raise ValueError(fail_msg)
+
+        try:
+          if is_default_list:
+            for value in default:
+              _cast_to_type_if_compatible(key, param_type, value)
+          else:
+            _cast_to_type_if_compatible(key, param_type, default)
+        except ValueError as e:
+          raise ValueError('%s. %s' % (fail_msg, e))
+
+      return getattr(self, key)
+
+    return default
+
   def __contains__(self, key):
     return key in self._hparam_types
 
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index f54514cefd39cab93e5c3a34786a6bb751b97704..28e4b4d01eda9bef07ff7929f74894e09a3e987c 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -292,6 +292,16 @@ class HParamsTest(test.TestCase):
     self.assertEqual('relu4', hparams2.c_c)
     self.assertEqual(False, hparams2.d)
 
+    hparams3 = hparam.HParams(aaa=123)
+    self.assertEqual('{"aaa": 123}', hparams3.to_json())
+    self.assertEqual('{\n  "aaa": 123\n}', hparams3.to_json(indent=2))
+    self.assertEqual('{"aaa"=123}', hparams3.to_json(separators=(';', '=')))
+
+    hparams4 = hparam.HParams(aaa=123, b='hello', c_c=False)
+    self.assertEqual(
+        '{"aaa": 123, "b": "hello", "c_c": false}',
+        hparams4.to_json(sort_keys=True))
+
   def testSetHParam(self):
     hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d=True)
     self.assertDictEqual({
@@ -318,13 +328,42 @@ class HParamsTest(test.TestCase):
     self.assertEqual(3.0, hparams.b)
     self.assertEqual('relu4', hparams.c_c)
 
-  def testSetHParamTypeMismatch(self):
+  def testSetHParamListNonListMismatch(self):
     hparams = hparam.HParams(a=1, b=[2.0, 3.0])
     with self.assertRaisesRegexp(ValueError, r'Must not pass a list'):
       hparams.set_hparam('a', [1.0])
     with self.assertRaisesRegexp(ValueError, r'Must pass a list'):
       hparams.set_hparam('b', 1.0)
 
+  def testSetHParamTypeMismatch(self):
+    hparams = hparam.HParams(
+        int_=1, str_='str', bool_=True, float_=1.1, list_int=[1, 2], none=None)
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('str_', 2.2)
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('int_', False)
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('bool_', 1)
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('int_', 2.2)
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('list_int', [2, 3.3])
+
+    with self.assertRaises(ValueError):
+      hparams.set_hparam('int_', '2')
+
+    # Casting int to float is OK
+    hparams.set_hparam('float_', 1)
+
+    # Getting stuck with NoneType :(
+    hparams.set_hparam('none', '1')
+    self.assertEqual('1', hparams.none)
+
   def testNonProtoFails(self):
     with self.assertRaisesRegexp(AssertionError, ''):
       hparam.HParams(hparam_def=1)
@@ -335,6 +374,49 @@ class HParamsTest(test.TestCase):
     with self.assertRaisesRegexp(AssertionError, ''):
       hparam.HParams(hparam_def=[1, 2, 3])
 
+  def testGet(self):
+    hparams = hparam.HParams(aaa=1, b=2.0, c_c='relu6', d=True, e=[5.0, 6.0])
+
+    # Existing parameters with default=None.
+    self.assertEqual(1, hparams.get('aaa'))
+    self.assertEqual(2.0, hparams.get('b'))
+    self.assertEqual('relu6', hparams.get('c_c'))
+    self.assertEqual(True, hparams.get('d'))
+    self.assertEqual([5.0, 6.0], hparams.get('e', None))
+
+    # Existing parameters with compatible defaults.
+    self.assertEqual(1, hparams.get('aaa', 2))
+    self.assertEqual(2.0, hparams.get('b', 3.0))
+    self.assertEqual(2.0, hparams.get('b', 3))
+    self.assertEqual('relu6', hparams.get('c_c', 'default'))
+    self.assertEqual(True, hparams.get('d', True))
+    self.assertEqual([5.0, 6.0], hparams.get('e', [1.0, 2.0, 3.0]))
+    self.assertEqual([5.0, 6.0], hparams.get('e', [1, 2, 3]))
+
+    # Existing parameters with incompatible defaults.
+    with self.assertRaises(ValueError):
+      hparams.get('aaa', 2.0)
+
+    with self.assertRaises(ValueError):
+      hparams.get('b', False)
+
+    with self.assertRaises(ValueError):
+      hparams.get('c_c', [1, 2, 3])
+
+    with self.assertRaises(ValueError):
+      hparams.get('d', 'relu')
+
+    with self.assertRaises(ValueError):
+      hparams.get('e', 123.0)
+
+    with self.assertRaises(ValueError):
+      hparams.get('e', ['a', 'b', 'c'])
+
+    # Nonexistent parameters.
+    self.assertEqual(None, hparams.get('unknown'))
+    self.assertEqual(123, hparams.get('unknown', 123))
+    self.assertEqual([1, 2, 3], hparams.get('unknown', [1, 2, 3]))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
index 0ef5f111b2a467fcca76b5d80c24c525345a9ae4..ed0f398e30a7f3c0b1b9378f8fc5d5bfbea1536a 100644
--- a/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
+++ b/tensorflow/contrib/training/python/training/sgdr_learning_rate_decay.py
@@ -28,7 +28,7 @@ from tensorflow.python.ops import math_ops, control_flow_ops
 def sgdr_decay(learning_rate, global_step, initial_period_steps,
                t_mul=2.0, m_mul=1.0, name=None):
   """Implements Stochastic Gradient Descent with Warm Restarts (SGDR).
-  
+
   As described in "SGDR: Stochastic Gradient Descent
   with Warm Restarts" by Ilya Loshchilov & Frank Hutter, Proceedings of
   ICLR'2017, available at https://arxiv.org/pdf/1608.03983.pdf
@@ -48,7 +48,7 @@ def sgdr_decay(learning_rate, global_step, initial_period_steps,
   where `t_0` = `initial_period_steps` is the user-defined number of batch
   iterations (not epochs as in the paper) to be performed before the first
   restart is launched.
-  
+
   Then, we perform the first restart (i=1) by setting the learning rate to
   `learning_rate*(m_mul^i)`, where `m_mul in [0,1]` (set to 1 by default).
   The i-th restart runs for `t_i=t_0*(t_mul^i)` steps, i.e., every new
@@ -73,7 +73,7 @@ def sgdr_decay(learning_rate, global_step, initial_period_steps,
       Training dataset size: 10000
       If the user wants the first decay period to span across 5 epochs, then
       `initial_period_steps` = 5 * 10000/100 = 500
-  
+
       Train for 10000 batch iterations with the initial learning rate set to
       0.1, then restart to run 2 times longer, i.e, for 20000 batch iterations
       and with the initial learning rate 0.05, then restart again and again,
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index 6a4d79796d6cafdf42b332df153932fc1e65aa21..f72e0a3f831f9e9c61a2e9d77828ffb12d8428b1 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -244,7 +244,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -255,6 +254,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import optimizer as tf_optimizer
+from tensorflow.python.training import training_util
 
 # TODO(nsilberman): move add_gradients_summaries, clip_gradient_norms and
 # multiply_gradients into contrib/summaries and contrib/optimizers.py
@@ -409,7 +409,7 @@ def create_train_op(total_loss,
       loss value.
   """
   if global_step is _USE_GLOBAL_STEP:
-    global_step = variables.get_or_create_global_step()
+    global_step = training_util.get_or_create_global_step()
 
   # Update ops use GraphKeys.UPDATE_OPS collection if update_ops is None.
   global_update_ops = set(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
@@ -483,7 +483,8 @@ def train(train_op,
           chief_only_hooks=None,
           save_checkpoint_secs=600,
           save_summaries_steps=100,
-          config=None):
+          config=None,
+          max_wait_secs=7200):
   """Runs the training loop.
 
   Args:
@@ -506,6 +507,10 @@ def train(train_op,
       `save_summaries_steps` is set to `None`, then the default summary saver
       isn't used.
     config: An instance of `tf.ConfigProto`.
+    max_wait_secs: Maximum time workers should wait for the session to
+      become available. This should be kept relatively short to help detect
+      incorrect code, but sometimes may need to be increased if the chief takes
+      a while to start up.
 
   Returns:
     the value of the loss function after training.
@@ -532,7 +537,8 @@ def train(train_op,
       chief_only_hooks=chief_only_hooks,
       save_checkpoint_secs=save_checkpoint_secs,
       save_summaries_steps=save_summaries_steps,
-      config=config) as session:
+      config=config,
+      max_wait_secs=max_wait_secs) as session:
     loss = None
     while not session.should_stop():
       loss = session.run(train_op)
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 746ff38b37fd6ba012f1791bfa35209e84305f5c..38a84ffb10e594568a18dbd06debf32545cb2229 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -7,6 +7,8 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+
 exports_files(["LICENSE"])
 
 filegroup(
@@ -97,7 +99,7 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
+tf_cuda_library(
     name = "rdma_rendezvous_mgr",
     srcs = ["rdma_rendezvous_mgr.cc"],
     hdrs = ["rdma_rendezvous_mgr.h"],
@@ -130,7 +132,7 @@ cc_library(
     ],
 )
 
-cc_library(
+tf_cuda_library(
     name = "rdma",
     srcs = ["rdma.cc"],
     hdrs = ["rdma.h"],
diff --git a/tensorflow/contrib/verbs/README.md b/tensorflow/contrib/verbs/README.md
index da5f2b0223bc6698e750ebbc3307d70ee1535478..7c1c8ea45912be8c471efbe42f43e083639e91fc 100644
--- a/tensorflow/contrib/verbs/README.md
+++ b/tensorflow/contrib/verbs/README.md
@@ -1,4 +1,4 @@
-## How to compile and use RDMA-enabled TensorFlow
+## How to compile, use and configure RDMA-enabled TensorFlow
 1. Follow the regular TF compilation instructions. During configure step, if you want ibverbs based RDMA support, answer yes to this question:
 
     ```Do you wish to build TensorFlow with VERBS-RDMA support [y/N]```
@@ -7,6 +7,18 @@
 
     ```server = tf.train.Server(cluster, job_name="local", task_index=0, protocol='grpc+verbs') # default protocol is 'grpc'```
 
+3. RDMA configuration is done by setting the following environment variables:
+   * **RDMA_DEVICE**: The RDMA device name to be used. If not defined by user, a default device with an active port will be set if exists.
+   * **RDMA_DEVICE_PORT**: The port within the selected device. Not relevant if RDMA_DEVICE is not defined. If not defined by user, a default active port will be set if exists.
+   * **RDMA_GID_INDEX**: The GID index of the port. If not defined by user, a default suitable GID index will be set (RoCEV2 is favourable as default).
+   * **RDMA_QP_PKEY_INDEX**: The Pkey for the QP. If not defined by user, the default value is 0.
+   * **RDMA_QP_QUEUE_DEPTH**: TX/RX queue size for the QP. If not defined by user, the default value is 1024.
+   * **RDMA_QP_TIMEOUT**: The retransmission timeout for QPs. If not defined by user, the default value is 14.
+   * **RDMA_QP_RETRY_COUNT**: Number of retransmission for QPs. If not defined by user, the default value is 7.
+   * **RDMA_QP_SL**: Service level configuration for QOS and ECN, valid values are 0-7. If not defined by user, the default value is 0.
+   * **RDMA_QP_MTU**: MTU configuration for the QPs. If not defined by user, the default value is active MTU from query_port.
+   * **RDMA_TRAFFIC_CLASS**: Traffic class configuration for QP, in case of DSCP trust level QoS configuration. If not defined by user, the default value is 0. For more info see [HowTo Configure Trust state on Mellanox Adapters](https://community.mellanox.com/docs/DOC-2866).
+
 ## Overview
 The design is based on TensorFlow r1.0. An RDMA path is added between servers for tensor transfer (weights, gradients, etc). The existing GRPC path remains and is responsible for "administrative" tasks, such as setting up the RDMA path, exchanging computation graphs, etc.
 
@@ -26,7 +38,7 @@ The following improvements can be made in the future. First, conversion to Tenso
 * **RDMA channel:** Responsible for RDMA connection to a particular node. It manages multiple buffers. A channel has a callback table which stores all the callbacks for the requested tensors.
 * **RDMA buffer:** Responsible for sending or receiving data. It has a fixed size memory to store the data. It has a queue to store the pending jobs. There are three types of buffers, message buffer, ACK buffer and tensor buffer. A channel has two message buffers, two ack buffers and many tensor buffers.
 * **RDMA manager:** Manages the adapter and channels, including channel creation, channel setup via GRPC service, channel lookup, etc.
-* **RDMA rendezvous manager:** manages multiple rdma rendezvous. 
+* **RDMA rendezvous manager:** manages multiple rdma rendezvous.
 * **RDMA rendezvous:** a derived class of BaseRemoteRendezvous. This class is the back end for "send" and "recv" ops. When the sendrecv_op wants to send or receive a tensor, it calls the rendezvous' "send" and "recv" functions respectively. Rendezvous are identified by "step_id", a random number, so that tensors for different iterations don't get mixed up.
 
 ### The SEND operation
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
index cff765d1e832e5a593462283444d7c4ed7831636..991f9a9d8bdf883b1b68bfa1fb6af7bf51b7e66a 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
@@ -43,22 +43,21 @@ VerbsService::Stub::Stub(
     const std::shared_ptr< ::grpc::ChannelInterface>& channel)
     : channel_(channel),
       rpcmethod_GetRemoteAddress_(grpcVerbsService_method_names[0],
-                                  ::grpc::RpcMethod::NORMAL_RPC,
+                                  ::grpc::internal::RpcMethod::NORMAL_RPC,
                                   channel) {}
 
 ::grpc::Status VerbsService::Stub::GetRemoteAddress(
     ::grpc::ClientContext* context, const GetRemoteAddressRequest& request,
     GetRemoteAddressResponse* response) {
-  return ::grpc::BlockingUnaryCall(
+  return ::grpc::internal::BlockingUnaryCall(
       channel_.get(), rpcmethod_GetRemoteAddress_, context, request, response);
 }
 
 VerbsService::AsyncService::AsyncService() {
   for (int i = 0; i < 1; ++i) {
-    AddMethod(new ::grpc::RpcServiceMethod(
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcVerbsService_method_names[i],
-        ::grpc::RpcMethod::NORMAL_RPC,
-        nullptr));
+        ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
     ::grpc::Service::MarkMethodAsync(i);
   }
 }
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index 6e2bf86dac2aa84ff453aaefbfc57cd3ee8bc1fd..86431ca030c38c56155801202714ee4a49b764df 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -28,15 +28,6 @@ limitations under the License.
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
 
 namespace grpc {
-
-// ensure internal namespace exists
-namespace internal {
-// bring in contents of external namespace
-using namespace ::grpc;
-}  // namespace internal
-// bring in contents of internal namespace
-using namespace internal;
-
 class CompletionQueue;
 class Channel;
 class RpcService;
@@ -70,7 +61,7 @@ class VerbsService GRPC_FINAL {
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
-    const ::grpc::RpcMethod rpcmethod_GetRemoteAddress_;
+    const ::grpc::internal::RpcMethod rpcmethod_GetRemoteAddress_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index 26e18b28aabd0db6c3c7091fca96aa30f39c73a2..ae9a384565a6ad0e63a6cf3acf07c591c65f0637 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -16,12 +16,16 @@ limitations under the License.
 #ifdef TENSORFLOW_USE_VERBS
 
 #include "tensorflow/contrib/verbs/rdma.h"
+#include <fcntl.h>
 #include <cstdlib>
+#include <fcntl.h>
 #include "tensorflow/contrib/verbs/verbs_util.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
+#endif
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -30,9 +34,12 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
 
+#define RoCE_V2 "RoCE v2"
+
 namespace {
 // hash name to 32-bit integer
 uint32_t NameHash(const string& name) {
@@ -66,16 +73,336 @@ string MessageTypeToString(RdmaMessageType rmt) {
 }
 }  // namespace
 
-ibv_context* open_default_device() {
+// Function to get environment variable
+// Args:
+//    var_name - the name of the environmental variable
+// Returns:
+//    string with it's value or empty string if not set
+string get_env_var(char const* var_name) {
+  char const* var_temp = getenv(var_name);
+
+  return (var_temp == NULL) ? string() : string(var_temp);
+}
+
+// Function to open device
+// Args:
+//   ibv_dev device to open
+// Returns:
+//   context of the opened device
+ibv_context* open_device(ibv_device* ibv_dev) {
+  ibv_context* context = ibv_open_device(ibv_dev);
+
+  CHECK(context) << "Open context failed for " << ibv_get_device_name(ibv_dev);
+  return context;
+}
+
+// Function to count the number of active ports for device
+// Args:
+//   device - to check active ports
+// Returns:
+//   number of active ports of the given device
+int get_dev_active_port_count(ibv_device* device) {
+  ibv_device_attr device_att;
+  ibv_port_attr port_attr;
+  ibv_context* context = NULL;
+  int rc, port_index, active_ports = 0;
+
+  context = ibv_open_device(device);
+  CHECK(context) << "Open context failed for " << ibv_get_device_name(device);
+  rc = ibv_query_device(context, &device_att);
+  CHECK(!rc) << "Failed to query the device";
+
+  for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
+    rc = ibv_query_port(context, port_index, &port_attr);
+    CHECK(!rc) << "Failed to query the port" << port_index;
+    if (port_attr.state == IBV_PORT_ACTIVE) {
+      active_ports++;
+    }
+  }
+  ibv_close_device(context);
+  return active_ports;
+}
+
+// Function to set device. If RDMA_DEVICE not set, search for device with active
+// port.
+// Fails if more than one device with active port was found.
+// Returns:
+//   device to use
+ibv_device* set_device() {
   ibv_device** dev_list;
-  ibv_device* ib_dev;
-  dev_list = ibv_get_device_list(NULL);
+  int dev_num, device_index, device_to_open = 0;
+  int num_devs_with_active_port = 0;
+  string env_p_rdma_device, str_port_num;
+
+  dev_list = ibv_get_device_list(&dev_num);
   CHECK(dev_list) << "No InfiniBand device found";
-  ib_dev = dev_list[0];
-  CHECK(ib_dev) << "No InfiniBand device found";
-  ibv_context* context = ibv_open_device(ib_dev);
-  CHECK(context) << "Open context failed for " << ibv_get_device_name(ib_dev);
-  return context;
+
+  env_p_rdma_device = get_env_var("RDMA_DEVICE");
+  if (!env_p_rdma_device.empty()) {
+    for (device_index = 0; device_index < dev_num; device_index++) {
+      if (!env_p_rdma_device.compare(
+              ibv_get_device_name(dev_list[device_index]))) {
+        CHECK(get_dev_active_port_count(dev_list[device_index]) != 0)
+            << "Device " << ibv_get_device_name(dev_list[device_index])
+            << " has no active ports";
+        return dev_list[device_index];
+      }
+    }
+    // check validity of input device
+    CHECK(false) << "The device " << env_p_rdma_device << " wasn't found";
+  } else {
+    // set default device
+    str_port_num = get_env_var("RDMA_DEVICE_PORT");
+    CHECK(str_port_num.empty())
+        << "RDMA_DEVICE should be provided if RDMA_DEVICE_PORT is set by user";
+    for (device_index = 0; device_index < dev_num; device_index++) {
+      // get port_num
+      if (get_dev_active_port_count(dev_list[device_index]) > 0) {
+        num_devs_with_active_port++;
+        CHECK(num_devs_with_active_port <= 1) << ". More than one device with "
+                                                 "active port in the system. "
+                                                 "Please enter RDMA_DEVICE";
+        // found device with at least 1 active port
+        device_to_open = device_index;
+      }
+    }
+    CHECK(num_devs_with_active_port > 0)
+        << "There is no active port in the system";
+    return dev_list[device_to_open];
+  }
+  CHECK(false) << "No device was set!";
+  return NULL;  // never happens
+}
+
+// Function to set port for device.
+// If RDMA_DEVICE_PORT not set, first active port of the device will be set.
+// Args:
+//   context of the device
+// Returns:
+//   port to use
+uint8_t set_port(ibv_context* context) {
+  uint8_t port_num = 0;  // 0 is illegal port number
+  string str_port_num;
+  ibv_device_attr device_att;
+  ibv_port_attr port_attr;
+  int rc, port_index;
+
+  rc = ibv_query_device(context, &device_att);
+  CHECK(!rc) << "Failed to query the device\n";
+
+  str_port_num = get_env_var("RDMA_DEVICE_PORT");
+  // user defined port
+  if (!str_port_num.empty()) {
+    port_num = stoi(str_port_num);
+    CHECK(port_num > 0) << "RDMA_DEVICE_PORT should be positive";
+    CHECK(port_num <= device_att.phys_port_cnt) << "RDMA_DEVICE_PORT should be "
+                                                   "less or equal to amount of "
+                                                   "available ports";
+    rc = ibv_query_port(context, port_num, &port_attr);
+    CHECK(!rc) << "Failed to query the port" << port_num;
+    // check if port id active
+    CHECK(port_attr.state == IBV_PORT_ACTIVE)
+        << "Selected RDMA_DEVICE_PORT is not active";
+  } else {  // set default port
+    for (port_index = 1; port_index <= device_att.phys_port_cnt; port_index++) {
+      rc = ibv_query_port(context, port_index, &port_attr);
+      CHECK(!rc) << "Failed to query the port" << port_index;
+      if (port_attr.state == IBV_PORT_ACTIVE) {
+        port_num = port_index;
+        break;
+      }
+    }
+    CHECK_GT(port_num, 0) << "No active ports";
+  }
+  return port_num;
+}
+
+// Function read from sysfs file
+// Args:
+//   dir - directory
+//   file - file
+//   buff - buffer for the result
+//   size - buffer size
+// Returns:
+//   number of bytes were read or -1 if failed
+int read_sysfs_file(const char* dir, const char* file, char* buf, size_t size) {
+  char* path;
+  int fd;
+  int len;
+
+  if (asprintf(&path, "%s/%s", dir, file) < 0) return -1;
+
+  fd = open(path, O_RDONLY);
+  if (fd < 0) {
+    free(path);
+    return -1;
+  }
+
+  len = read(fd, buf, size);
+
+  close(fd);
+  free(path);
+
+  if (len > 0 && buf[len - 1] == '\n') buf[--len] = '\0';
+
+  return len;
+}
+
+// Function to check if GID index support RoCE V2
+// Args:
+//   context - device context
+//   port_num - port number
+//   index -  GID index
+// Returns:
+//   if GID supports RoCE V2 - true, otherwise - false.
+bool is_gid_type_roce_v2(ibv_context* context, uint8_t port_num,
+                         uint8_t index) {
+  char name[32];
+  char buff[41];
+
+  snprintf(name, sizeof(name), "ports/%d/gid_attrs/types/%d", port_num, index);
+  if (read_sysfs_file(context->device->ibdev_path, name, buff, sizeof(buff)) <=
+      0) {
+    return false;
+  }
+  return !strcmp(buff, RoCE_V2);
+}
+
+// Function to set GID index.
+// If the port link is IB, no GID index should be selected.
+// If Ethernet but RDMA_GID_INDEX not set gid index that supports
+//   RoCE V2 will be chosen(fails if more than one IP is configured)
+// Args:
+//   context - device context
+//   port_num - port number
+// Returns:
+//   GID index to use
+uint8_t set_gid(uint8_t port_num, ibv_context* context) {
+  ibv_port_attr port_attr;
+  string gid_str;
+  int rc, i, gids_num = 0, v2_ip_num = 0;
+  union ibv_gid gid;
+  uint8_t gid_index = 0;
+
+  rc = ibv_query_port(context, port_num, &port_attr);
+  CHECK(!rc) << "Failed to query the port" << port_num;
+
+  for (i = 0; i < port_attr.gid_tbl_len; i++) {
+    rc = ibv_query_gid(context, port_num, i, &gid);
+    CHECK(!rc) << "Failed to query gid to port " << (int)port_num << " index "
+               << i;
+    if (gid.global.interface_id) {
+      gids_num++;
+      if (gid.global.subnet_prefix == 0 &&
+          is_gid_type_roce_v2(context, port_num, i)) {
+        if (v2_ip_num == 0) {
+          // can be overwritten by RDMA_GID_INDEX later
+          gid_index = i;
+        }
+        v2_ip_num++;
+      }
+    }
+  }
+  switch (port_attr.link_layer) {
+    case (IBV_LINK_LAYER_ETHERNET):
+      gid_str = get_env_var("RDMA_GID_INDEX");
+      if (!gid_str.empty()) {
+        gid_index = stoi(gid_str);
+        CHECK(gid_index < gids_num)
+            << "RDMA_GID_INDEX should be less than GIDs amount" << gids_num;
+      } else {
+        CHECK(v2_ip_num <= 1)
+            << "More than one IP is available, please specify GID_INDEX";
+      }
+      break;
+    case (IBV_LINK_LAYER_INFINIBAND):  // no need in GID index
+      break;
+    default:
+      LOG(INFO) << "Unknown port link layer. Currently supporting Ethernet and "
+                   "InfiniBand only. ";
+  }
+  if (!is_gid_type_roce_v2(context, port_num, gid_index)) {
+    LOG(INFO) << "RoCE v2 is not configured for GID_INDEX " << (int)gid_index;
+  }
+  return gid_index;
+}
+
+// set the default or environment value to the configuration parameter.
+// Args:
+//   default_val- the default value for this parameter
+//   env_param- the environment parameter's name
+// Returns:
+//   32-bit value
+uint32_t set_param(uint32_t default_val, const char* env_param) {
+  uint32_t val = default_val;
+  string val_s;
+
+  val_s = get_env_var(env_param);
+
+  if (!val_s.empty()) {
+    val = stoi(val_s);
+  }
+  return val;
+}
+
+enum ibv_mtu set_mtu(uint8_t port_num, ibv_context* context) {
+  ibv_port_attr port_attr;
+  enum ibv_mtu mtu;
+  string mtu_s;
+  int rc, mtu_i;
+
+  rc = ibv_query_port(context, port_num, &port_attr);
+  CHECK(!rc) << "Failed to query the port" << port_num;
+
+  mtu_s = get_env_var("RDMA_MTU");
+
+  if (!mtu_s.empty()) {
+    mtu_i = stoi(mtu_s);
+    switch (mtu_i) {
+      case 256:
+        mtu = IBV_MTU_256;
+        break;
+      case 512:
+        mtu = IBV_MTU_512;
+        break;
+      case 1024:
+        mtu = IBV_MTU_1024;
+        break;
+      case 2048:
+        mtu = IBV_MTU_2048;
+        break;
+      case 4096:
+        mtu = IBV_MTU_4096;
+        break;
+      default:
+        CHECK(0) << "Error: MTU input value must be one of the following: 256, "
+                    "512, 1024, 2048, 4096. MTU "
+                 << mtu << " is invalid\n";
+        break;
+    }
+    CHECK(mtu < port_attr.active_mtu)
+        << "MTU configuration for the QPs is larger than active MTU";
+  } else {
+    mtu = port_attr.active_mtu;
+  }
+  return mtu;
+}
+
+RdmaParams params_init(ibv_context* context) {
+  RdmaParams params;
+
+  params.port_num = set_port(context);
+  params.sgid_index = set_gid(params.port_num, context);
+  params.pkey_index = (uint8_t)set_param(PKEY_DEFAULT, "RDMA_PKEY");
+  params.queue_depth = set_param(QUEUE_DEPTH_DEFAULT, "RDMA_QUEUE_DEPTH");
+  params.timeout = (uint8_t)set_param(TIMEOUT_DEFAULT, "RDMA_TIMEOUT");
+  params.retry_cnt = (uint8_t)set_param(RETRY_CNT_DEFAULT, "RDMA_RETRY_CNT");
+  params.sl = (uint8_t)set_param(SL_DEFAULT, "RDMA_SL");
+  CHECK(params.sl <= 7) << "SL value is " << (int)params.sl
+                        << ". Valid values are 0-7.";
+  params.mtu = set_mtu(params.port_num, context);
+  params.traffic_class = set_param(TRAFFIC_CLASS, "RDMA_TRAFFIC_CLASS");
+  return params;
 }
 
 ibv_pd* alloc_protection_domain(ibv_context* context) {
@@ -85,7 +412,8 @@ ibv_pd* alloc_protection_domain(ibv_context* context) {
 }
 
 RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
-    : context_(open_default_device()),
+    : context_(open_device(set_device())),
+      params_(params_init(context_)),
       pd_(alloc_protection_domain(context_)),
       worker_env_(worker_env) {
   event_channel_ = ibv_create_comp_channel(context_);
@@ -94,9 +422,6 @@ RdmaAdapter::RdmaAdapter(const WorkerEnv* worker_env)
                       0);
   CHECK(cq_) << "Failed to create completion queue";
   CHECK(!ibv_req_notify_cq(cq_, 0)) << "Failed to request CQ notification";
-  polling_thread_.reset(Env::Default()->StartThread(
-      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
-  VLOG(2) << "Start RdmaAdapter: " << name();
 }
 
 RdmaAdapter::~RdmaAdapter() {
@@ -108,6 +433,12 @@ RdmaAdapter::~RdmaAdapter() {
   CHECK(!ibv_close_device(context_)) << "Failed to release context";
 }
 
+void RdmaAdapter::StartPolling() {
+  polling_thread_.reset(Env::Default()->StartThread(
+      ThreadOptions(), "RdmaAdapterCQThread", [this] { Process_CQ(); }));
+  VLOG(2) << "Start RdmaAdapter: " << name();
+}
+
 string RdmaAdapter::name() const { return string(context_->device->name); }
 
 // Function to process incoming messages
@@ -128,9 +459,9 @@ void RdmaAdapter::Process_CQ() {
     CHECK_GE(ne, 0);
     for (int i = 0; i < ne; ++i) {
       CHECK(wc_[i].status == IBV_WC_SUCCESS)
-          << "Failed status \n"
-          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
-          << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;
+          << "Failed status \n" << ibv_wc_status_str(wc_[i].status) << " "
+          << wc_[i].status << " " << static_cast<int>(wc_[i].wr_id) << " "
+          << wc_[i].vendor_err;
       if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
         RdmaChannel* rc = reinterpret_cast<RdmaChannel*>(wc_[i].wr_id);
         // put back a recv wr.
@@ -233,17 +564,52 @@ void RdmaAdapter::Process_CQ() {
   }
 }
 
+int RdmaChannel::PingPostRecv() {
+  struct ibv_recv_wr wr, *bad_wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.sg_list = &ping_sge_list_;
+  wr.num_sge = 1;
+  wr.wr_id = kPingRecvWrid;
+
+  return ibv_post_recv(qp_, &wr, &bad_wr);
+}
+
+int RdmaChannel::PingPostSend() {
+  struct ibv_send_wr wr, *bad_wr;
+  memset(&wr, 0, sizeof(wr));
+  wr.wr_id = (uint64_t) this;
+  wr.sg_list = &ping_sge_list_;
+  wr.num_sge = 1;
+  wr.opcode = IBV_WR_SEND;
+  wr.send_flags = IBV_SEND_SIGNALED;
+
+  return ibv_post_send(qp_, &wr, &bad_wr);
+}
+
 RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
                          const string remote_name)
     : adapter_(adapter), local_name_(local_name), remote_name_(remote_name) {
+
+  struct ibv_sge list;
+
+  mr_ = ibv_reg_mr(adapter_->pd_, ping_buff_, kPingBuffSize,
+                   IBV_ACCESS_LOCAL_WRITE);
+  CHECK(mr_) << "Failed to register memory region";
+
+  memset(&list, 0, sizeof(list));
+  list.addr = (uintptr_t)ping_buff_;
+  list.length = kPingBuffSize;
+  list.lkey = mr_->lkey;
+
+  ping_sge_list_ = list;
   // Create queue pair
   {
     struct ibv_qp_init_attr attr;
     memset(&attr, 0, sizeof(ibv_qp_init_attr));
     attr.send_cq = adapter_->cq_;
     attr.recv_cq = adapter_->cq_;
-    attr.cap.max_send_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
-    attr.cap.max_recv_wr = RdmaAdapter::MAX_CONCURRENT_WRITES;
+    attr.cap.max_send_wr = adapter_->params_.queue_depth;
+    attr.cap.max_recv_wr = adapter_->params_.queue_depth;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
     attr.qp_type = IBV_QPT_RC;
@@ -257,8 +623,8 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
     struct ibv_qp_attr attr;
     memset(&attr, 0, sizeof(ibv_qp_attr));
     attr.qp_state = IBV_QPS_INIT;
-    attr.pkey_index = 0;
-    attr.port_num = 1;
+    attr.pkey_index = adapter_->params_.pkey_index;
+    attr.port_num = adapter_->params_.port_num;
     attr.qp_access_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
 
     int mask =
@@ -269,13 +635,15 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
   // Local address
   {
     struct ibv_port_attr attr;
-    CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &attr))
+    CHECK(
+        !ibv_query_port(adapter_->context_, adapter_->params_.port_num, &attr))
         << "Query port";
     self_.lid = attr.lid;
     self_.qpn = qp_->qp_num;
     self_.psn = static_cast<uint32_t>(random::New64()) & 0xffffff;
     union ibv_gid gid;
-    CHECK(!ibv_query_gid(adapter_->context_, (uint8_t)1, 0, &gid))
+    CHECK(!ibv_query_gid(adapter_->context_, adapter_->params_.port_num,
+                         adapter_->params_.sgid_index, &gid))
         << "Query gid";
     self_.snp = gid.global.subnet_prefix;
     self_.iid = gid.global.interface_id;
@@ -284,7 +652,7 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
   // create message and ack buffers, then initialize the tables.
   {
     const string buffer_names[] = {"tx_message_buffer", "rx_message_buffer",
-                                   "tx_ack_buffer", "rx_ack_buffer"};
+                                   "tx_ack_buffer",     "rx_ack_buffer"};
     tx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[0]);
     rx_message_buffer_ = new RdmaMessageBuffer(this, buffer_names[1]);
     tx_ack_buffer_ = new RdmaAckBuffer(this, buffer_names[2]);
@@ -306,15 +674,13 @@ RdmaChannel::RdmaChannel(const RdmaAdapter* adapter, const string local_name,
       buffer_index_name_table_.insert({index, buffer_names[i]});
       buffer_name_index_table_.insert({buffer_names[i], index});
     }
-
-    // Initiate recv
-    for (int i = 0; i < 100; i++) {
-      Recv();
-    }
   }
+  CHECK(PingPostRecv() == 0) << "Couldn't post receive from " << remote_name_
+                             << " with error " << std::strerror(errno);
 }
 
 RdmaChannel::~RdmaChannel() {
+  ibv_dereg_mr(mr_);
   CHECK(!ibv_destroy_qp(qp_)) << "Failed to destroy QP";
   delete tx_message_buffer_;
   delete rx_message_buffer_;
@@ -345,7 +711,7 @@ void RdmaChannel::SetRemoteAddress(const RdmaAddress& ra, bool override) {
 void RdmaChannel::Recv() {
   struct ibv_recv_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t)this;
+  wr.wr_id = (uint64_t) this;
   struct ibv_recv_wr* bad_wr;
   CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)) << "Failed to post recv";
 }
@@ -479,11 +845,9 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     struct ibv_qp_attr attr;
     memset(&attr, 0, sizeof(ibv_qp_attr));
     attr.qp_state = IBV_QPS_RTR;
-    struct ibv_port_attr port_attr;
-    CHECK(!ibv_query_port(adapter_->context_, (uint8_t)1, &port_attr))
-        << "Query port failed";
+
     // This assumes both QP's ports are configured with the same MTU
-    attr.path_mtu = port_attr.active_mtu;
+    attr.path_mtu = adapter_->params_.mtu;
     attr.dest_qp_num = remoteAddr.qpn;
     attr.rq_psn = remoteAddr.psn;
     attr.max_dest_rd_atomic = 1;
@@ -494,30 +858,32 @@ void RdmaChannel::Connect(const RdmaAddress& remoteAddr) {
     attr.ah_attr.grh.flow_label = 0;
     attr.ah_attr.grh.hop_limit = 255;
     attr.ah_attr.dlid = remoteAddr.lid;
-    attr.ah_attr.sl = 0;
+    attr.ah_attr.sl = adapter_->params_.sl;
     attr.ah_attr.src_path_bits = 0;
-    attr.ah_attr.port_num = 1;
+    attr.ah_attr.port_num = adapter_->params_.port_num;
+    attr.ah_attr.grh.sgid_index = adapter_->params_.sgid_index;
+    attr.ah_attr.grh.traffic_class = adapter_->params_.traffic_class;
 
     int r;
-    CHECK(!(r = ibv_modify_qp(qp_, &attr,
-                              IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU |
-                                  IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
-                                  IBV_QP_MAX_DEST_RD_ATOMIC |
-                                  IBV_QP_MIN_RNR_TIMER)))
+    CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_AV |
+                                              IBV_QP_PATH_MTU |
+                                              IBV_QP_DEST_QPN | IBV_QP_RQ_PSN |
+                                              IBV_QP_MAX_DEST_RD_ATOMIC |
+                                              IBV_QP_MIN_RNR_TIMER)))
         << "QP to Ready to Receive " << r;
 
     memset(&attr, 0, sizeof(ibv_qp_attr));
     attr.qp_state = IBV_QPS_RTS;
     attr.sq_psn = self_.psn;
-    attr.timeout = 14;
-    attr.retry_cnt = 7;
+    attr.timeout = adapter_->params_.timeout;
+    attr.retry_cnt = adapter_->params_.retry_cnt;
     attr.rnr_retry = 7; /* infinite */
     attr.max_rd_atomic = 1;
 
-    CHECK(!(r = ibv_modify_qp(qp_, &attr,
-                              IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
-                                  IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
-                                  IBV_QP_MAX_QP_RD_ATOMIC)))
+    CHECK(!(r = ibv_modify_qp(qp_, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT |
+                                              IBV_QP_RETRY_CNT |
+                                              IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN |
+                                              IBV_QP_MAX_QP_RD_ATOMIC)))
         << "QP to Ready to Send " << r;
 
     connected_ = true;
@@ -604,7 +970,7 @@ void RdmaBuffer::Write(uint32_t imm_data, size_t buffer_size) {
 
   struct ibv_send_wr wr;
   memset(&wr, 0, sizeof(wr));
-  wr.wr_id = (uint64_t)this;
+  wr.wr_id = (uint64_t) this;
   wr.sg_list = &list;
   wr.num_sge = 1;
   wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
@@ -699,9 +1065,10 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
     TensorProto proto;
     if (src_dev->tensorflow_gpu_device_info() &&
         (!send_args.alloc_attrs.on_host())) {
-      CHECK(send_args.device_context)
-          << "send dev name: " << src_dev->name()
-          << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+#if GOOGLE_CUDA
+      CHECK(send_args.device_context) << "send dev name: " << src_dev->name()
+                                      << " gpu_info: "
+                                      << src_dev->tensorflow_gpu_device_info();
 
       if (can_memcpy) {
         AllocatorAttributes host_alloc_attrs;
@@ -727,8 +1094,8 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
         // aync instead
         GPUUtil::SetProtoFromGPU(
             in, src_dev, send_args.device_context, &proto, is_dead,
-            [this, proto, buffer_size, key, in, step_id, key_with_step_id,
-             is_dead, send_args, recv_args](const Status& s) mutable {
+	    [this, proto, buffer_size, key, in, step_id, key_with_step_id,
+            is_dead, send_args, recv_args](const Status& s) mutable {
               CHECK(s.ok()) << "copy proto from gpu sync";
               auto tensor_bytes = proto.ByteSize();
               buffer_size += tensor_bytes;
@@ -737,6 +1104,7 @@ Rendezvous::DoneCallback RdmaTensorBuffer::getRecvTensorCallback(
                                  &proto, NULL, send_args, recv_args);
             });
       }
+#endif  // GOOGLE_CUDA
     } else {
       // tensor is in CPU memory.
       StringPiece copy_buf;
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index e1e07db776467c5b604f610bbc907d363edae139..fea2327d77ffff67c4b3c45835a81f790bbd1574 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -36,7 +36,24 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
-
+#define PKEY_DEFAULT 0
+#define QUEUE_DEPTH_DEFAULT 1024
+#define TIMEOUT_DEFAULT 14
+#define RETRY_CNT_DEFAULT 7
+#define SL_DEFAULT 0
+#define TRAFFIC_CLASS 0
+
+struct RdmaParams {
+  uint8_t port_num;
+  uint8_t sgid_index;
+  uint8_t pkey_index;
+  uint32_t queue_depth;
+  uint8_t timeout;
+  uint8_t retry_cnt;
+  uint8_t sl;
+  enum ibv_mtu mtu;
+  uint8_t traffic_class;
+};
 // structure to save the address of remote channels.
 struct RdmaAddress {
   uint32_t lid;
@@ -50,9 +67,20 @@ struct RemoteMR {
   uint64_t remote_addr;
   uint32_t rkey;
 };
-enum BufferStatus { none, idle, busy };
-enum Location { local, remote };
-enum BufferType { ACK, MESSAGE, TENSOR };
+enum BufferStatus {
+  none,
+  idle,
+  busy
+};
+enum Location {
+  local,
+  remote
+};
+enum BufferType {
+  ACK,
+  MESSAGE,
+  TENSOR
+};
 enum RdmaMessageType {
   RDMA_MESSAGE_ACK,
   RDMA_MESSAGE_BUFFER_IDLE,
@@ -79,11 +107,14 @@ class RdmaAdapter {
   ~RdmaAdapter();
   // Adapter name, e.g. mlx5_0.
   string name() const;
+  void StartPolling();
   void Process_CQ();
 
  protected:
   static const int MAX_CONCURRENT_WRITES = 1000;
   ibv_context* context_;
+  // RDMA configuration parameters
+  RdmaParams params_;
   // ibverbs protection domain
   ibv_pd* pd_;
   // Completion event channel, to wait for work completions
@@ -131,6 +162,15 @@ class RdmaChannel {
   void RemoveRecvCallback(const string& key);
   void RunRecvCallback(const string& key);
   static const int kNumMessageBuffers = 4;
+  static const int kPingRecvWrid = 0;
+
+ private:
+  static const int kPingBuffSize = 1024;
+  char ping_buff_[kPingBuffSize];
+  struct ibv_mr* mr_;
+  struct ibv_sge ping_sge_list_;
+  int PingPostRecv();
+  int PingPostSend();
 
  protected:
   const RdmaAdapter* adapter_;
@@ -183,7 +223,7 @@ class RdmaBuffer {
   }
   void FreeBuffer();
   void EnqueueItem(string Item);
-  virtual void SendNextItem(){};
+  virtual void SendNextItem() {};
   void CreateCPUBuffer(size_t size, bool lock = true);
   void SetRemoteMR(RemoteMR rmi, bool override);
   uint32_t LookupBufferIndex(const string& buffer_name) {
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 09b878843f52c910f78f3769522d1fa80319c7d7..9cb307bcfa06cfdf5ecb9b4faa1d3710e5701080 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -115,6 +115,57 @@ void RdmaMgr::SetupChannels() {
   }
 }
 
+// Check connectivity by pinging every channel
+bool RdmaMgr::ConnectivityCheck() {
+  int i, rcnt = 0, scnt = 0;
+
+  for (const auto& p : channel_table_) {
+    string worker_name = p.first;
+    RdmaChannel* rc = p.second;
+
+    VLOG(2) << "Ping to " << worker_name;
+    CHECK(rc->PingPostSend() == 0) << "Couldn't post send  to " << worker_name
+                                   << " with error: " << std::strerror(errno);
+    for (i = 0; i < rc->adapter_->params_.queue_depth - 1; i++) {
+      rc->Recv();
+    }
+  }
+
+  while (rcnt < num_remote_workers_ || scnt < num_remote_workers_) {
+    int ne;
+    do {
+      ne = ibv_poll_cq(rdma_adapter_->cq_, 2 * num_remote_workers_,
+                       rdma_adapter_->wc_);
+      CHECK(ne >= 0) << "poll CQ failed " << ne << "with error"
+                     << std::strerror(errno);
+    } while (ne < 1);
+
+    for (i = 0; i < ne; ++i) {
+      ibv_wc_status s = rdma_adapter_->wc_[i].status;
+      // recv complete
+      if ((int)rdma_adapter_->wc_[i].wr_id == RdmaChannel::kPingRecvWrid) {
+        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
+                                                  rdma_adapter_->wc_[i].status)
+                                   << "(" << rdma_adapter_->wc_[i].status
+                                   << ") for PING_RECV_WRID";
+        ++rcnt;
+        // send complete
+      } else {
+        RdmaChannel* rc =
+            reinterpret_cast<RdmaChannel*>(rdma_adapter_->wc_[i].wr_id);
+        CHECK(s == IBV_WC_SUCCESS) << ": " << ibv_wc_status_str(
+                                                  rdma_adapter_->wc_[i].status)
+                                   << "(" << rdma_adapter_->wc_[i].status
+                                   << ") to " << rc->remote_name_;
+        ++scnt;
+      }
+    }  // for
+  }    // while
+  CHECK(rcnt == scnt) << "Connectivity check failed!";
+  rdma_adapter_->StartPolling();
+  return (num_remote_workers_ == rcnt) && (num_remote_workers_ == scnt);
+}
+
 RdmaMgr::~RdmaMgr() {
   for (const auto& p : channel_table_) delete p.second;
   channel_table_.clear();
diff --git a/tensorflow/contrib/verbs/rdma_mgr.h b/tensorflow/contrib/verbs/rdma_mgr.h
index b156f64096c113bb0ac3780b0f64fd1e6bd7cb89..e711e604788b12ff0c1a0977a90db21f9f8fa50e 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.h
+++ b/tensorflow/contrib/verbs/rdma_mgr.h
@@ -28,12 +28,16 @@ limitations under the License.
 namespace tensorflow {
 
 class RdmaMgr {
+  friend class RdmaChannel;
+  friend class RdmaAdapter;
+
  public:
   explicit RdmaMgr(const WorkerEnv* const worker_env,
                    GrpcChannelCache* const channel_cache);
   ~RdmaMgr();
   RdmaChannel* FindChannel(const string& key);
   void SetupChannels();
+  bool ConnectivityCheck();
   const string& local_worker() { return local_worker_; }
 
  private:
@@ -44,7 +48,6 @@ class RdmaMgr {
   RdmaAdapter* rdma_adapter_;
   typedef std::unordered_map<string, RdmaChannel*> ChannelTable;
   ChannelTable channel_table_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(RdmaMgr);
 };
 
diff --git a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
index ce82ca288307a73295368501ad68f88b60c9623c..74f6681af3c29f370d6cdb37d64e10a30cbb7b84 100644
--- a/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_rendezvous_mgr.cc
@@ -21,8 +21,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
 #include "tensorflow/core/common_runtime/gpu/process_state.h"
+#endif  // GOOGLE_CUDA
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -58,20 +60,13 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   // parse src_name and dst_name
   string src_name, dst_name, unused;
   if (!DeviceNameUtils::SplitDeviceName(parsed.src_device, &src_name,
+                                        &unused) ||
+      !DeviceNameUtils::SplitDeviceName(parsed.dst_device, &dst_name,
                                         &unused)) {
-    s = errors::Internal("Could not parse src name.");
+    s = errors::Internal("Could not parse src or dst name.");
   }
-  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
-  if (!s.ok()) {
-    done(s, Args(), recv_args, Tensor{}, false);
-    return;
-  }
-  if (!DeviceNameUtils::SplitDeviceName(parsed.dst_device, &dst_name,
-                                        &unused)) {
-    s = errors::Internal("Could not parse dst name.");
-  }
-  CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
   if (!s.ok()) {
+    LOG(ERROR) << "s is not ok, error code " << s.error_message();
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -82,18 +77,13 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
   // insert callback
   rc->InsertRecvCallback(key_with_step_id, [this, key, key_with_step_id, rc,
                                             recv_args, parsed, done]() {
-    Status s;
-    Device* src_dev;
-    s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
-    CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
-    if (!s.ok()) {
-      done(s, Args(), recv_args, Tensor(), true);
-      return;
-    }
-    Device* dst_dev;
-    s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
-    CHECK(s.ok()) << "s is not ok, error code " << s.error_message();
-    if (!s.ok()) {
+    Status src_s, dst_s, s;
+    Device* src_dev, *dst_dev;
+    src_s = env_->device_mgr->LookupDevice("CPU:0", &src_dev);
+    dst_s = env_->device_mgr->LookupDevice(parsed.dst_device, &dst_dev);
+    if (!src_s.ok() || !dst_s.ok()) {
+      s = src_s.ok() ? dst_s : src_s;
+      LOG(ERROR) << "s is not ok, error code " << s.error_message();
       done(s, Args(), recv_args, Tensor(), true);
       return;
     }
@@ -110,9 +100,10 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
       if (can_memcpy) {
         if (dst_dev->tensorflow_gpu_device_info() &&
             (!recv_args.alloc_attrs.on_host())) {
+#if GOOGLE_CUDA
           CHECK(recv_args.device_context)
-            << "send dev name: " << src_dev->name()
-            << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
+              << "send dev name: " << src_dev->name()
+              << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
           Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
           Tensor copy(alloc, rm.data_type_, rm.tensor_shape_);
           memcpy(DMAHelper::base(&copy), input, rm.tensor_bytes_);
@@ -122,14 +113,15 @@ void RdmaRemoteRendezvous::RecvFromRemoteAsync(
 
           GPUUtil::CopyCPUTensorToGPU(
               &copy, recv_args.device_context, dst_dev, &gpu_copy,
-              [this, gpu_copy, key, key_with_step_id, recv_args, done, rm,
-               rc](const Status& s) {
+              [this, gpu_copy, key, key_with_step_id, recv_args, done, rm, rc](
+                  const Status& s) {
                 CHECK(s.ok()) << "copy tensor to gpu sync";
                 Tensor val;
                 val = std::move(gpu_copy);
                 RecvPostCopyOps(key, key_with_step_id, recv_args, done, rm, rc,
                                 val, s);
               });
+#endif  // GOOGLE_CUDA
           return;
         } else {
           AllocatorAttributes host_alloc_attrs;
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 6d1c79c0fb2f75a9cae835d78fbbe0b40774482b..a606ef75a42069b3c32eb13a69e981a5c4c8f83c 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -49,8 +49,8 @@ VerbsServer::~VerbsServer() {
 Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
                                         GrpcChannelCache** channel_cache) {
   string name_prefix =
-      strings::StrCat("/job:", server_def.job_name(), "/replica:0",
-                      "/task:", server_def.task_index());
+      strings::StrCat("/job:", server_def.job_name(), "/replica:0", "/task:",
+                      server_def.task_index());
 
   GrpcChannelSpec channel_spec;
   TF_RETURN_IF_ERROR(ParseChannelSpec(server_def, &channel_spec));
@@ -103,6 +103,7 @@ Status VerbsServer::Start() {
           ThreadOptions(), "TF_verbs_service",
           [this] { verbs_service_->HandleRPCsLoop(); }));
       rdma_mgr_->SetupChannels();
+      CHECK(rdma_mgr_->ConnectivityCheck()) << "Connectivity check failed!";
       verbs_state_ = CONNECTED;
     }
   }
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1c58aa3315bb88eeb69035c11f56ddfd3d651eee..a280444121e21fec4069c304d1fa1b2b746c5be9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -127,9 +127,9 @@ load(
     "tf_additional_verbs_lib_defines",
     "tf_additional_mpi_lib_defines",
     "tf_additional_gdr_lib_defines",
-    "tf_additional_gpu_tracer_srcs",
-    "tf_additional_gpu_tracer_deps",
-    "tf_additional_gpu_tracer_cuda_deps",
+    "tf_additional_device_tracer_srcs",
+    "tf_additional_device_tracer_deps",
+    "tf_additional_device_tracer_cuda_deps",
     "tf_pyclif_proto_library",
     "tf_jspb_proto_library",
     "tf_nano_proto_library",
@@ -249,6 +249,14 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
+# Minimal lib to detect plafrom
+cc_library(
+    name = "lib_platform",
+    hdrs = [
+        "platform/platform.h",
+    ],
+)
+
 # Minimal lib so that tools used for mobile compilation
 # don't have to depend on lib/platformlib.
 cc_library(
@@ -312,6 +320,7 @@ cc_library(
         "lib/io/table_options.h",
         "lib/math/math_util.h",
         "lib/monitoring/counter.h",
+        "lib/monitoring/gauge.h",
         "lib/monitoring/sampler.h",
         "lib/random/distribution_sampler.h",
         "lib/random/philox_random.h",
@@ -446,6 +455,7 @@ tf_cuda_library(
         "util/mirror_pad_mode.h",
         "util/padding.h",
         "util/port.h",
+        "util/ptr_util.h",
         "util/reffed_status_callback.h",
         "util/saved_tensor_slice_util.h",
         "util/sparse/group_iterator.h",
@@ -484,6 +494,11 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "ptr_util",
+    hdrs = ["util/ptr_util.h"],
+)
+
 cc_library(
     name = "reader_base",
     srcs = ["framework/reader_base.cc"],
@@ -737,6 +752,7 @@ tf_cuda_library(
     name = "core_cpu",
     hdrs = [
         "common_runtime/device.h",
+        "common_runtime/device_factory.h",
         "common_runtime/optimization_registry.h",
         "common_runtime/shape_refiner.h",
         "graph/algorithm.h",
@@ -998,7 +1014,7 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + if_not_android_mips_and_mips64(["-Os"]),
+    copts = tf_copts(android_optimization_level_override = None),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1086,8 +1102,7 @@ cc_library(
 cc_library(
     name = "android_tensorflow_lib_selective_registration",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + [
-        "-Os",
+    copts = tf_copts(android_optimization_level_override = None) + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
     tags = [
@@ -1108,8 +1123,7 @@ cc_library(
 cc_library(
     name = "android_tensorflow_lib_selective_registration_nortti",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts() + tf_opts_nortti_if_android() + [
-        "-Os",
+    copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
     tags = [
@@ -1188,7 +1202,7 @@ cc_library(
         "framework/tensor_testutil.h",
         "util/reporter.h",
     ],
-    copts = tf_copts() + ["-Os"],
+    copts = tf_copts(android_optimization_level_override = None),
     tags = [
         "manual",
         "notap",
@@ -1385,6 +1399,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
     "lib/monitoring/collection_registry.h",
     "lib/monitoring/metric_def.h",
     "lib/monitoring/mobile_counter.h",
+    "lib/monitoring/mobile_gauge.h",
     "lib/monitoring/mobile_sampler.h",
     "lib/png/png_io.h",
     "lib/random/random.h",
@@ -1406,16 +1421,19 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [
     "platform/tracing.h",
 ]
 
+# Replicated for lib_internal and lib_internal_impl.
+LIB_INTERNAL_DEFINES = (tf_additional_lib_defines() + [
+                            "TF_USE_SNAPPY",
+                        ] + tf_additional_verbs_lib_defines() +
+                        tf_additional_mpi_lib_defines() +
+                        tf_additional_gdr_lib_defines())
+
 cc_library(
     name = "lib_internal",
     srcs = LIB_INTERNAL_PRIVATE_HEADERS,
     hdrs = LIB_INTERNAL_PUBLIC_HEADERS,
     copts = tf_copts(),
-    defines = tf_additional_lib_defines() + [
-                  "TF_USE_SNAPPY",
-              ] + tf_additional_verbs_lib_defines() +
-              tf_additional_mpi_lib_defines() +
-              tf_additional_gdr_lib_defines(),
+    defines = LIB_INTERNAL_DEFINES,
     linkopts = select({
         "//tensorflow:freebsd": [],
         "//tensorflow:windows": [],
@@ -1448,7 +1466,7 @@ cc_library(
             "lib/jpeg/**/*",
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
-            "platform/**/gpu_tracer.cc",
+            "platform/**/device_tracer.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ],
@@ -1459,7 +1477,7 @@ cc_library(
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
-            "platform/**/gpu_tracer.cc",
+            "platform/**/device_tracer.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ] +
@@ -1469,6 +1487,7 @@ cc_library(
     ),
     hdrs = LIB_INTERNAL_PUBLIC_HEADERS,
     copts = tf_copts(),
+    defines = LIB_INTERNAL_DEFINES,
     deps = tf_additional_lib_deps() + [
         ":lib_hash_crc32c_accelerate_internal",
         ":lib_proto_parsing",
@@ -1834,11 +1853,13 @@ cc_library(
     deps = ["//tensorflow/core/platform/default/build_config:protos_cc"],
 )
 
-CORE_CPU_BASE_HDRS = [
-    "common_runtime/device.h",
-    "common_runtime/graph_runner.h",
-    "common_runtime/shape_refiner.h",
-    "framework/versions.h",
+# Library containing all of the graph construction code that is
+# independent of the runtime.
+#
+# TODO(mrry): Refactor graph_constructor.cc so that it does not depend on code
+# in "common_runtime/", and then the entire "graph/" directory can be included
+# in this library.
+GRAPH_HDRS = [
     "graph/algorithm.h",
     "graph/colors.h",
     "graph/control_flow.h",
@@ -1846,7 +1867,7 @@ CORE_CPU_BASE_HDRS = [
     "graph/default_device.h",
     "graph/edgeset.h",
     "graph/graph.h",
-    "graph/graph_constructor.h",
+    "graph/graph_constructor.h",  # NOTE(mrry): Don't include the .cc since it depends on common_runtime.
     "graph/graph_def_builder.h",
     "graph/graph_partition.h",
     "graph/mkl_layout_pass.h",
@@ -1862,16 +1883,12 @@ CORE_CPU_BASE_HDRS = [
 ]
 
 tf_cuda_library(
-    name = "core_cpu_base",
+    name = "graph",
     srcs = [
-        "common_runtime/shape_refiner.cc",
-        "common_runtime/shape_refiner.h",
-        "framework/versions.h",
         "graph/algorithm.cc",
         "graph/colors.cc",
         "graph/control_flow.cc",
         "graph/costmodel.cc",
-        "graph/graph_constructor.cc",
         "graph/graph_def_builder.cc",
         "graph/graph_partition.cc",
         "graph/node_builder.cc",
@@ -1879,6 +1896,33 @@ tf_cuda_library(
         "graph/subgraph.cc",
         "graph/tensor_id.cc",
         "graph/validate.cc",
+    ],
+    hdrs = GRAPH_HDRS,
+    deps = [
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":proto_text",
+        ":protos_all_cc",
+        "//third_party/eigen3",
+    ],
+)
+
+CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
+    "common_runtime/device.h",
+    "common_runtime/graph_runner.h",
+    "common_runtime/shape_refiner.h",
+    "framework/versions.h",
+]
+
+tf_cuda_library(
+    name = "core_cpu_base",
+    srcs = [
+        "common_runtime/shape_refiner.cc",
+        "common_runtime/shape_refiner.h",
+        "framework/versions.h",
+        "graph/graph_constructor.cc",  # Depends on common_runtime.
         "public/session.h",
         "public/session_options.h",
         "public/version.h",
@@ -1886,6 +1930,7 @@ tf_cuda_library(
     hdrs = CORE_CPU_BASE_HDRS,
     copts = tf_copts(),
     deps = [
+        ":graph",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -1989,6 +2034,7 @@ tf_cuda_library(
     hdrs = CORE_CPU_LIB_HEADERS,
     copts = tf_copts(),
     deps = [
+        ":graph",
         ":framework",
         ":framework_internal",
         ":lib",
@@ -2030,6 +2076,7 @@ tf_cuda_library(
         ":function_ops_op_lib",
         ":functional_grad",
         ":functional_ops_op_lib",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":proto_text",
@@ -2071,13 +2118,11 @@ tf_cuda_library(
         "util/env_var.h",
     ],
     copts = tf_copts(),
-    cuda_deps = [
-        ":gpu_tracer",
-    ],
-    linkstatic = 1,
     deps = [
         ":core_cpu_internal",
+        ":device_tracer",
         ":framework",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":proto_text",
@@ -2108,18 +2153,18 @@ cc_library(
 )
 
 tf_cuda_library(
-    name = "gpu_tracer",
-    srcs = tf_additional_gpu_tracer_srcs(),
+    name = "device_tracer",
+    srcs = tf_additional_device_tracer_srcs(),
     hdrs = [
-        "platform/gpu_tracer.h",
+        "platform/device_tracer.h",
     ],
     copts = tf_copts(),
-    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_gpu_tracer_cuda_deps(),
+    cuda_deps = tf_additional_cupti_wrapper_deps() + tf_additional_device_tracer_cuda_deps(),
     deps = [
         ":core_cpu_internal",
         ":lib",
         ":protos_all_cc",
-    ] + tf_additional_gpu_tracer_deps(),
+    ] + tf_additional_device_tracer_deps(),
 )
 
 GPU_RUNTIME_HEADERS = [
@@ -2133,6 +2178,7 @@ GPU_RUNTIME_HEADERS = [
     "common_runtime/gpu/gpu_util.h",
     "common_runtime/gpu/pool_allocator.h",
     "common_runtime/gpu/process_state.h",
+    "common_runtime/gpu_device_context.h",
 ]
 
 tf_cuda_library(
@@ -2149,7 +2195,6 @@ tf_cuda_library(
         "common_runtime/gpu/gpu_util_platform_specific.cc",
         "common_runtime/gpu/pool_allocator.cc",
         "common_runtime/gpu/process_state.cc",
-        "common_runtime/gpu_device_context.h",
     ],
     hdrs = GPU_RUNTIME_HEADERS,
     copts = tf_copts(),
@@ -2160,6 +2205,7 @@ tf_cuda_library(
         ":framework_internal",
         ":gpu_init_impl",
         ":gpu_lib",
+        ":graph",
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
@@ -2357,6 +2403,7 @@ tf_cc_tests(
         "lib/math/math_util_test.cc",
         "lib/monitoring/collection_registry_test.cc",
         "lib/monitoring/counter_test.cc",
+        "lib/monitoring/gauge_test.cc",
         "lib/monitoring/metric_def_test.cc",
         "lib/monitoring/sampler_test.cc",
         "lib/random/distribution_sampler_test.cc",
@@ -2698,6 +2745,7 @@ tf_cc_test_mkl(
     srcs = [
         "graph/mkl_layout_pass_test.cc",
         "graph/mkl_tfconversion_pass_test.cc",
+        "util/mkl_util_test.cc",
     ],
     linkstatic = 1,
     deps = [
@@ -2735,6 +2783,18 @@ tf_cc_test_mkl(
     ]),
 )
 
+tf_cc_tests_gpu(
+    name = "gpu_device_on_non_gpu_machine_test",
+    size = "small",
+    srcs = ["common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":gpu_headers_lib",
+        ":gpu_runtime",
+        ":test",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "gpu_related_tests",
     size = "small",
@@ -3356,12 +3416,12 @@ tf_cc_test(
 
 filegroup(
     name = "base_api_def",
-    data = glob(["api_def/base_api/*"]),
+    srcs = glob(["api_def/base_api/*"]),
 )
 
 filegroup(
     name = "python_api_def",
-    data = glob(["api_def/python_api/*"]),
+    srcs = glob(["api_def/python_api/*"]),
 )
 
 tf_cc_test(
@@ -3371,10 +3431,6 @@ tf_cc_test(
         ":base_api_def",
         "//tensorflow/cc:ops/op_gen_overrides.pbtxt",
     ],
-    tags = [
-        "manual",
-        "notap",
-    ],
     deps = [
         ":framework",
         ":framework_internal",
@@ -3390,9 +3446,9 @@ tf_cc_test(
 )
 
 tf_cc_test_gpu(
-    name = "gpu_tracer_test",
+    name = "device_tracer_test",
     size = "small",
-    srcs = ["platform/gpu_tracer_test.cc"],
+    srcs = ["platform/device_tracer_test.cc"],
     args = ["--heap_check=local"],
     linkstatic = tf_kernel_tests_linkstatic(),
     tags = tf_cuda_tests_tags() + ["nomac"],
@@ -3400,12 +3456,12 @@ tf_cc_test_gpu(
         ":all_kernels",
         ":core_cpu",
         ":core_cpu_internal",
+        ":device_tracer",
         ":direct_session",
         ":direct_session_internal",
         ":framework",
         ":framework_internal",
         ":gpu_runtime",
-        ":gpu_tracer",
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index ceeb172fa0a9abf2ab7adcfc801b4bcb5fa04381..2cdc14843f61a2585b61e214527e0a0b5bdea446 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -46,92 +46,227 @@ constexpr char kDefaultApiDefDir[] =
     "tensorflow/core/api_def/base_api";
 constexpr char kOverridesFilePath[] =
     "tensorflow/cc/ops/op_gen_overrides.pbtxt";
-constexpr char kApiDefFileFormat[] = "api_def_%c.pbtxt";
-constexpr char kAlphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+constexpr char kApiDefFileFormat[] = "api_def_%s.pbtxt";
+constexpr char kApiDefFilePattern[] = "api_def_*.pbtxt";
 
-// Get map from first character to ApiDefs for ops
-// that start with that character.
-std::unordered_map<char, ApiDefs> GenerateApiDef(
-    const OpList& ops, const OpGenOverrides& overrides) {
+void FillBaseApiDef(ApiDef* api_def, const OpDef& op) {
+  api_def->set_graph_op_name(op.name());
+  // Add arg docs
+  for (auto& input_arg : op.input_arg()) {
+    if (!input_arg.description().empty()) {
+      auto* api_def_in_arg = api_def->add_in_arg();
+      api_def_in_arg->set_name(input_arg.name());
+      api_def_in_arg->set_description(input_arg.description());
+    }
+  }
+  for (auto& output_arg : op.output_arg()) {
+    if (!output_arg.description().empty()) {
+      auto* api_def_out_arg = api_def->add_out_arg();
+      api_def_out_arg->set_name(output_arg.name());
+      api_def_out_arg->set_description(output_arg.description());
+    }
+  }
+  // Add attr docs
+  for (auto& attr : op.attr()) {
+    if (!attr.description().empty()) {
+      auto* api_def_attr = api_def->add_attr();
+      api_def_attr->set_name(attr.name());
+      api_def_attr->set_description(attr.description());
+    }
+  }
+  // Add docs
+  api_def->set_summary(op.summary());
+  api_def->set_description(op.description());
+}
+
+// Checks if arg1 should be before arg2 according to ordering in args.
+bool CheckArgBefore(const ApiDef::Arg* arg1, const ApiDef::Arg* arg2,
+                    const protobuf::RepeatedPtrField<OpDef::ArgDef>& args) {
+  for (auto& arg : args) {
+    if (arg.name() == arg2->name()) {
+      return false;
+    } else if (arg.name() == arg1->name()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Checks if attr1 should be before attr2 according to ordering in op_def.
+bool CheckAttrBefore(const ApiDef::Attr* attr1, const ApiDef::Attr* attr2,
+                     const OpDef& op_def) {
+  for (auto& attr : op_def.attr()) {
+    if (attr.name() == attr2->name()) {
+      return false;
+    } else if (attr.name() == attr1->name()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Applies renames to args.
+void ApplyArgOverrides(
+    protobuf::RepeatedPtrField<ApiDef::Arg>* args,
+    const protobuf::RepeatedPtrField<OpGenOverride::Rename>& renames,
+    const protobuf::RepeatedPtrField<OpDef::ArgDef>& op_args,
+    const string& op_name) {
+  for (auto& rename : renames) {
+    // First check if rename is valid.
+    bool valid = false;
+    for (const auto& op_arg : op_args) {
+      if (op_arg.name() == rename.from()) {
+        valid = true;
+      }
+    }
+    QCHECK(valid) << rename.from() << " is not a valid argument for "
+                  << op_name;
+    bool found_arg = false;
+    // If Arg is already in ApiDef, just update it.
+    for (int i = 0; i < args->size(); ++i) {
+      auto* arg = args->Mutable(i);
+      if (arg->name() == rename.from()) {
+        arg->set_rename_to(rename.to());
+        found_arg = true;
+        break;
+      }
+    }
+    if (!found_arg) {  // not in ApiDef, add a new arg.
+      auto* new_arg = args->Add();
+      new_arg->set_name(rename.from());
+      new_arg->set_rename_to(rename.to());
+    }
+  }
+  // We don't really need a specific order here right now.
+  // However, it is clearer if order follows OpDef.
+  std::sort(args->pointer_begin(), args->pointer_end(),
+            [&](ApiDef::Arg* arg1, ApiDef::Arg* arg2) {
+              return CheckArgBefore(arg1, arg2, op_args);
+            });
+}
+
+// Returns existing attribute with the given name if such
+// attribute exists. Otherwise, adds a new attribute and returns it.
+ApiDef::Attr* FindOrAddAttr(ApiDef* api_def, const string attr_name) {
+  // If Attr is already in ApiDef, just update it.
+  for (int i = 0; i < api_def->attr_size(); ++i) {
+    auto* attr = api_def->mutable_attr(i);
+    if (attr->name() == attr_name) {
+      return attr;
+    }
+  }
+  // Add a new Attr.
+  auto* new_attr = api_def->add_attr();
+  new_attr->set_name(attr_name);
+  return new_attr;
+}
+
+// Applies renames and default values to attributes.
+void ApplyAttrOverrides(ApiDef* api_def, const OpGenOverride& op_override,
+                        const OpDef& op_def) {
+  for (auto& attr_rename : op_override.attr_rename()) {
+    auto* attr = FindOrAddAttr(api_def, attr_rename.from());
+    attr->set_rename_to(attr_rename.to());
+  }
+
+  for (auto& attr_default : op_override.attr_default()) {
+    auto* attr = FindOrAddAttr(api_def, attr_default.name());
+    *(attr->mutable_default_value()) = attr_default.value();
+  }
+  // We don't really need a specific order here right now.
+  // However, it is clearer if order follows OpDef.
+  std::sort(api_def->mutable_attr()->pointer_begin(),
+            api_def->mutable_attr()->pointer_end(),
+            [&](ApiDef::Attr* attr1, ApiDef::Attr* attr2) {
+              return CheckAttrBefore(attr1, attr2, op_def);
+            });
+}
+
+void ApplyOverridesToApiDef(ApiDef* api_def, const OpDef& op,
+                            const OpGenOverride& op_override) {
+  // Fill ApiDef with data based on op and op_override.
+  // Set visibility
+  if (op_override.skip()) {
+    api_def->set_visibility(ApiDef_Visibility_SKIP);
+  } else if (op_override.hide()) {
+    api_def->set_visibility(ApiDef_Visibility_HIDDEN);
+  }
+  // Add endpoints
+  if (!op_override.rename_to().empty()) {
+    api_def->add_endpoint()->set_name(op_override.rename_to());
+  } else if (!op_override.alias().empty()) {
+    api_def->add_endpoint()->set_name(op.name());
+  }
+
+  for (auto& alias : op_override.alias()) {
+    auto* endpoint = api_def->add_endpoint();
+    endpoint->set_name(alias);
+  }
+
+  ApplyArgOverrides(api_def->mutable_in_arg(), op_override.input_rename(),
+                    op.input_arg(), api_def->graph_op_name());
+  ApplyArgOverrides(api_def->mutable_out_arg(), op_override.output_rename(),
+                    op.output_arg(), api_def->graph_op_name());
+  ApplyAttrOverrides(api_def, op_override, op);
+}
+
+// Get map from ApiDef file path to corresponding ApiDefs proto.
+std::unordered_map<string, ApiDefs> GenerateApiDef(
+    const string& api_def_dir, const OpList& ops,
+    const OpGenOverrides& overrides) {
   std::unordered_map<string, OpGenOverride> name_to_override;
   for (const auto& op_override : overrides.op()) {
     name_to_override[op_override.name()] = op_override;
   }
 
-  std::unordered_map<char, ApiDefs> api_defs_map;
+  std::unordered_map<string, ApiDefs> api_defs_map;
 
+  // These ops are included in OpList only if TF_NEED_GCP
+  // is set to true. So, we skip them for now so that this test passes
+  // whether TF_NEED_GCP is set or not.
+  const std::unordered_set<string> ops_to_exclude = {
+      "BigQueryReader", "GenerateBigQueryReaderPartitions"};
   for (const auto& op : ops.op()) {
     CHECK(!op.name().empty())
         << "Encountered empty op name: %s" << op.DebugString();
-    const char file_id = toupper(op.name()[0]);
-    CHECK(isalpha(file_id)) << "Unexpected op name: " << op.name();
-    ApiDef* api_def = api_defs_map[file_id].add_op();
-    api_def->set_graph_op_name(op.name());
+    if (ops_to_exclude.find(op.name()) != ops_to_exclude.end()) {
+      LOG(INFO) << "Skipping " << op.name();
+      continue;
+    }
+    string file_path = io::JoinPath(api_def_dir, kApiDefFileFormat);
+    file_path = strings::Printf(file_path.c_str(), op.name().c_str());
+    ApiDef* api_def = api_defs_map[file_path].add_op();
+    FillBaseApiDef(api_def, op);
 
     if (name_to_override.find(op.name()) != name_to_override.end()) {
-      const auto& op_override = name_to_override[op.name()];
-      // Set visibility
-      if (op_override.skip()) {
-        api_def->set_visibility(ApiDef_Visibility_SKIP);
-      } else if (op_override.hide()) {
-        api_def->set_visibility(ApiDef_Visibility_HIDDEN);
-      }
-      // Add endpoints
-      if (!op_override.rename_to().empty()) {
-        auto* endpoint = api_def->add_endpoint();
-        endpoint->set_name(op_override.rename_to());
-      } else {
-        auto* endpoint = api_def->add_endpoint();
-        endpoint->set_name(op.name());
-      }
-      for (auto& alias : op_override.alias()) {
-        auto* endpoint = api_def->add_endpoint();
-        endpoint->set_name(alias);
-      }
-      // Add attributes
-      for (auto& attr : op.attr()) {
-        auto* api_def_attr = api_def->add_attr();
-        api_def_attr->set_name(attr.name());
-        for (auto& attr_override : op_override.attr_default()) {
-          if (attr.name() == attr_override.name()) {
-            *(api_def_attr->mutable_default_value()) = attr_override.value();
-          }
-        }
-        for (auto& attr_rename : op_override.attr_rename()) {
-          if (attr.name() == attr_rename.from()) {
-            api_def_attr->set_rename_to(attr_rename.to());
-          }
-        }
-      }
-    } else {
-      auto* endpoint = api_def->add_endpoint();
-      endpoint->set_name(op.name());
+      ApplyOverridesToApiDef(api_def, op, name_to_override[op.name()]);
     }
-    // Add docs
-    api_def->set_summary(op.summary());
-    api_def->set_description(op.description());
   }
   return api_defs_map;
 }
 
-// Reads golden api defs file with the given suffix.
-string GetGoldenApiDefsStr(Env* env, const string& api_files_dir, char suffix) {
-  string file_path = strings::Printf(
-      io::JoinPath(api_files_dir, kApiDefFileFormat).c_str(), suffix);
-  if (env->FileExists(file_path).ok()) {
+// Reads golden ApiDef files and returns a map from file name to ApiDef file
+// contents.
+std::unordered_map<string, string> GetGoldenApiDefs(
+    Env* env, const string& api_files_dir) {
+  std::vector<string> matching_paths;
+  TF_CHECK_OK(env->GetMatchingPaths(
+      io::JoinPath(api_files_dir, kApiDefFilePattern), &matching_paths));
+
+  std::unordered_map<string, string> file_path_to_api_def;
+  for (auto& file_path : matching_paths) {
     string file_contents;
-    TF_EXPECT_OK(ReadFileToString(env, file_path, &file_contents));
-    return file_contents;
+    TF_CHECK_OK(ReadFileToString(env, file_path, &file_contents));
+    file_path_to_api_def[file_path] = file_contents;
   }
-  return "";
+  return file_path_to_api_def;
 }
 
 void RunApiTest(bool update_api_def, const string& api_files_dir) {
   // Read C++ overrides file
-  string overrides_file_contents;
+  OpGenOverrides overrides;
   Env* env = Env::Default();
-  TF_EXPECT_OK(
-      ReadFileToString(env, kOverridesFilePath, &overrides_file_contents));
+  TF_EXPECT_OK(ReadTextProto(env, kOverridesFilePath, &overrides));
 
   // Read all ops
   OpList ops;
@@ -139,29 +274,25 @@ void RunApiTest(bool update_api_def, const string& api_files_dir) {
   const std::vector<string> multi_line_fields = {"description"};
 
   // Get expected ApiDefs
-  OpGenOverrides overrides;
-  auto new_api_defs_map = GenerateApiDef(ops, overrides);
+  const auto new_api_defs_map = GenerateApiDef(api_files_dir, ops, overrides);
 
   bool updated_at_least_one_file = false;
+  const auto golden_api_defs_map = GetGoldenApiDefs(env, api_files_dir);
 
-  for (char c : kAlphabet) {
-    string golden_api_defs_str = GetGoldenApiDefsStr(env, api_files_dir, c);
-    string new_api_defs_str = new_api_defs_map[c].DebugString();
+  for (auto new_api_entry : new_api_defs_map) {
+    const auto& file_path = new_api_entry.first;
+    std::string golden_api_defs_str = "";
+    if (golden_api_defs_map.find(file_path) != golden_api_defs_map.end()) {
+      golden_api_defs_str = golden_api_defs_map.at(file_path);
+    }
+    string new_api_defs_str = new_api_entry.second.DebugString();
     new_api_defs_str = PBTxtToMultiline(new_api_defs_str, multi_line_fields);
     if (golden_api_defs_str == new_api_defs_str) {
       continue;
     }
     if (update_api_def) {
-      string output_file_path =
-          io::JoinPath(api_files_dir, strings::Printf(kApiDefFileFormat, c));
-      if (new_api_defs_str.empty()) {
-        std::cout << "Deleting " << output_file_path << "..." << std::endl;
-        TF_EXPECT_OK(env->DeleteFile(output_file_path));
-      } else {
-        std::cout << "Updating " << output_file_path << "..." << std::endl;
-        TF_EXPECT_OK(
-            WriteStringToFile(env, output_file_path, new_api_defs_str));
-      }
+      std::cout << "Updating " << file_path << "..." << std::endl;
+      TF_EXPECT_OK(WriteStringToFile(env, file_path, new_api_defs_str));
       updated_at_least_one_file = true;
     } else {
       EXPECT_EQ(golden_api_defs_str, new_api_defs_str)
@@ -170,6 +301,21 @@ void RunApiTest(bool update_api_def, const string& api_files_dir) {
     }
   }
 
+  for (const auto& golden_api_entry : golden_api_defs_map) {
+    const auto& file_path = golden_api_entry.first;
+    if (new_api_defs_map.find(file_path) == new_api_defs_map.end()) {
+      if (update_api_def) {
+        std::cout << "Deleting " << file_path << "..." << std::endl;
+        TF_EXPECT_OK(env->DeleteFile(file_path));
+        updated_at_least_one_file = true;
+      } else {
+        EXPECT_EQ("", golden_api_entry.second)
+            << "To update golden API files, run "
+            << "tensorflow/core/api_def/update_api_def.sh.";
+      }
+    }
+  }
+
   if (update_api_def && !updated_at_least_one_file) {
     std::cout << "Api def files are already up to date." << std::endl;
   }
diff --git a/tensorflow/core/api_def/base_api/api_def_A.pbtxt b/tensorflow/core/api_def/base_api/api_def_A.pbtxt
deleted file mode 100644
index 8193d1bc624535c7894430284686e8664fb71a2d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_A.pbtxt
+++ /dev/null
@@ -1,670 +0,0 @@
-op {
-  graph_op_name: "Abort"
-  endpoint {
-    name: "Abort"
-  }
-  summary: "Raise a exception to abort the process when called."
-  description: <<END
-If exit_without_error is true, the process will exit normally,
-otherwise it will exit with a SIGABORT signal.
-
-Returns nothing but an exception.
-END
-}
-op {
-  graph_op_name: "Abs"
-  endpoint {
-    name: "Abs"
-  }
-  summary: "Computes the absolute value of a tensor."
-  description: <<END
-Given a tensor `x`, this operation returns a tensor containing the absolute
-value of each element in `x`. For example, if x is an input element and y is
-an output element, this operation computes \\(y = |x|\\).
-END
-}
-op {
-  graph_op_name: "AccumulatorApplyGradient"
-  endpoint {
-    name: "AccumulatorApplyGradient"
-  }
-  summary: "Applies a gradient to a given accumulator."
-  description: <<END
-Does not add if local_step is lesser than the accumulator's global_step.
-END
-}
-op {
-  graph_op_name: "AccumulatorNumAccumulated"
-  endpoint {
-    name: "AccumulatorNumAccumulated"
-  }
-  summary: "Returns the number of gradients aggregated in the given accumulators."
-}
-op {
-  graph_op_name: "AccumulatorSetGlobalStep"
-  endpoint {
-    name: "AccumulatorSetGlobalStep"
-  }
-  summary: "Updates the accumulator with a new value for global_step."
-  description: <<END
-Logs warning if the accumulator's value is already higher than
-new_global_step.
-END
-}
-op {
-  graph_op_name: "AccumulatorTakeGradient"
-  endpoint {
-    name: "AccumulatorTakeGradient"
-  }
-  summary: "Extracts the average gradient in the given ConditionalAccumulator."
-  description: <<END
-The op blocks until sufficient (i.e., more than num_required)
-gradients have been accumulated.  If the accumulator has already
-aggregated more than num_required gradients, it returns the average of
-the accumulated gradients.  Also automatically increments the recorded
-global_step in the accumulator by 1, and resets the aggregate to 0.
-END
-}
-op {
-  graph_op_name: "Acos"
-  endpoint {
-    name: "Acos"
-  }
-  summary: "Computes acos of x element-wise."
-}
-op {
-  graph_op_name: "Acosh"
-  endpoint {
-    name: "Acosh"
-  }
-  summary: "Computes inverse hyperbolic cosine of x element-wise."
-}
-op {
-  graph_op_name: "Add"
-  endpoint {
-    name: "Add"
-  }
-  summary: "Returns x + y element-wise."
-  description: <<END
-*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "AddManySparseToTensorsMap"
-  endpoint {
-    name: "AddManySparseToTensorsMap"
-  }
-  summary: "Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles."
-  description: <<END
-A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-`sparse_values`, and `sparse_shape`, where
-
-```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-
-An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-having a first `sparse_indices` column taking values between `[0, N)`, where
-the minibatch size `N == sparse_shape[0]`.
-
-The input `SparseTensor` must have rank `R` greater than 1, and the first
-dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-must be sorted in increasing order of this first dimension.  The stored
-`SparseTensor` objects pointed to by each row of the output `sparse_handles`
-will have rank `R-1`.
-
-The `SparseTensor` values can then be read out as part of a minibatch by passing
-the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-the correct `SparseTensorsMap` is accessed, ensure that the same
-`container` and `shared_name` are passed to that Op.  If no `shared_name`
-is provided here, instead use the *name* of the Operation created by calling
-`AddManySparseToTensorsMap` as the `shared_name` passed to
-`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-END
-}
-op {
-  graph_op_name: "AddN"
-  endpoint {
-    name: "AddN"
-  }
-  summary: "Add all input tensors element wise."
-}
-op {
-  graph_op_name: "AddSparseToTensorsMap"
-  endpoint {
-    name: "AddSparseToTensorsMap"
-  }
-  summary: "Add a `SparseTensor` to a `SparseTensorsMap` return its handle."
-  description: <<END
-A `SparseTensor` is represented by three tensors: `sparse_indices`,
-`sparse_values`, and `sparse_shape`.
-
-This operator takes the given `SparseTensor` and adds it to a container
-object (a `SparseTensorsMap`).  A unique key within this container is generated
-in the form of an `int64`, and this is the value that is returned.
-
-The `SparseTensor` can then be read out as part of a minibatch by passing
-the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-the correct `SparseTensorsMap` is accessed, ensure that the same
-`container` and `shared_name` are passed to that Op.  If no `shared_name`
-is provided here, instead use the *name* of the Operation created by calling
-`AddSparseToTensorsMap` as the `shared_name` passed to
-`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-END
-}
-op {
-  graph_op_name: "AdjustContrast"
-  endpoint {
-    name: "AdjustContrast"
-  }
-  summary: "Deprecated. Disallowed in GraphDef version >= 2."
-}
-op {
-  graph_op_name: "AdjustContrastv2"
-  endpoint {
-    name: "AdjustContrastv2"
-  }
-  summary: "Adjust the contrast of one or more images."
-  description: <<END
-`images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-interpreted as `[height, width, channels]`.  The other dimensions only
-represent a collection of images, such as `[batch, height, width, channels].`
-
-Contrast is adjusted independently for each channel of each image.
-
-For each channel, the Op first computes the mean of the image pixels in the
-channel and then adjusts each component of each pixel to
-`(x - mean) * contrast_factor + mean`.
-END
-}
-op {
-  graph_op_name: "AdjustHue"
-  endpoint {
-    name: "AdjustHue"
-  }
-  summary: "Adjust the hue of one or more images."
-  description: <<END
-`images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
-
-The input image is considered in the RGB colorspace. Conceptually, the RGB
-colors are first mapped into HSV. A delta is then applied all the hue values,
-and then remapped back to RGB colorspace.
-END
-}
-op {
-  graph_op_name: "AdjustSaturation"
-  endpoint {
-    name: "AdjustSaturation"
-  }
-  summary: "Adjust the saturation of one or more images."
-  description: <<END
-`images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
-
-The input image is considered in the RGB colorspace. Conceptually, the RGB
-colors are first mapped into HSV. A scale is then applied all the saturation
-values, and then remapped back to RGB colorspace.
-END
-}
-op {
-  graph_op_name: "All"
-  endpoint {
-    name: "All"
-  }
-  summary: "Computes the \"logical and\" of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "AllCandidateSampler"
-  endpoint {
-    name: "AllCandidateSampler"
-  }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: <<END
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-END
-}
-op {
-  graph_op_name: "Angle"
-  endpoint {
-    name: "Angle"
-  }
-  summary: "Returns the argument of a complex number."
-  description: <<END
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-type `float` that is the argument of each element in `input`. All elements in
-`input` must be complex numbers of the form \\(a + bj\\), where *a*
-is the real part and *b* is the imaginary part.
-
-The argument returned by this operation is of the form \\(atan2(b, a)\\).
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.angle(input) ==> [2.0132, 1.056]
-```
-
-@compatibility(numpy)
-Equivalent to np.angle.
-@end_compatibility
-END
-}
-op {
-  graph_op_name: "Any"
-  endpoint {
-    name: "Any"
-  }
-  summary: "Computes the \"logical or\" of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "ApplyAdadelta"
-  endpoint {
-    name: "ApplyAdadelta"
-  }
-  summary: "Update \'*var\' according to the adadelta scheme."
-  description: <<END
-accum = rho() * accum + (1 - rho()) * grad.square();
-update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-update_accum = rho() * update_accum + (1 - rho()) * update.square();
-var -= update;
-END
-}
-op {
-  graph_op_name: "ApplyAdagrad"
-  endpoint {
-    name: "ApplyAdagrad"
-  }
-  summary: "Update \'*var\' according to the adagrad scheme."
-  description: <<END
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-END
-}
-op {
-  graph_op_name: "ApplyAdagradDA"
-  endpoint {
-    name: "ApplyAdagradDA"
-  }
-  summary: "Update \'*var\' according to the proximal adagrad scheme."
-}
-op {
-  graph_op_name: "ApplyAdam"
-  endpoint {
-    name: "ApplyAdam"
-  }
-  summary: "Update \'*var\' according to the Adam algorithm."
-  description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
-END
-}
-op {
-  graph_op_name: "ApplyCenteredRMSProp"
-  endpoint {
-    name: "ApplyCenteredRMSProp"
-  }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: <<END
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-mg <- rho * mg_{t-1} + (1-rho) * grad
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "ApplyFtrl"
-  endpoint {
-    name: "ApplyFtrl"
-  }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "ApplyFtrlV2"
-  endpoint {
-    name: "ApplyFtrlV2"
-  }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "ApplyGradientDescent"
-  endpoint {
-    name: "ApplyGradientDescent"
-  }
-  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
-}
-op {
-  graph_op_name: "ApplyMomentum"
-  endpoint {
-    name: "ApplyMomentum"
-  }
-  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
-  description: <<END
-want to use Nesterov momentum.
-
-accum = accum * momentum + grad
-var -= lr * accum
-END
-}
-op {
-  graph_op_name: "ApplyProximalAdagrad"
-  endpoint {
-    name: "ApplyProximalAdagrad"
-  }
-  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
-  description: <<END
-accum += grad * grad
-prox_v = var - lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-END
-}
-op {
-  graph_op_name: "ApplyProximalGradientDescent"
-  endpoint {
-    name: "ApplyProximalGradientDescent"
-  }
-  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: <<END
-prox_v = var - alpha * delta
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-END
-}
-op {
-  graph_op_name: "ApplyRMSProp"
-  endpoint {
-    name: "ApplyRMSProp"
-  }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: <<END
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "ApproximateEqual"
-  endpoint {
-    name: "ApproximateEqual"
-  }
-  summary: "Returns the truth value of abs(x-y) < tolerance element-wise."
-}
-op {
-  graph_op_name: "ArgMax"
-  endpoint {
-    name: "ArgMax"
-  }
-  summary: "Returns the index with the largest value across dimensions of a tensor."
-  description: <<END
-Note that in case of ties the identity of the return value is not guaranteed.
-END
-}
-op {
-  graph_op_name: "ArgMin"
-  endpoint {
-    name: "ArgMin"
-  }
-  summary: "Returns the index with the smallest value across dimensions of a tensor."
-  description: <<END
-Note that in case of ties the identity of the return value is not guaranteed.
-END
-}
-op {
-  graph_op_name: "AsString"
-  endpoint {
-    name: "AsString"
-  }
-  summary: "Converts each entry in the given tensor to strings.  Supports many numeric"
-  description: <<END
-types and boolean.
-END
-}
-op {
-  graph_op_name: "Asin"
-  endpoint {
-    name: "Asin"
-  }
-  summary: "Computes asin of x element-wise."
-}
-op {
-  graph_op_name: "Asinh"
-  endpoint {
-    name: "Asinh"
-  }
-  summary: "Computes inverse hyperbolic sine of x element-wise."
-}
-op {
-  graph_op_name: "Assert"
-  endpoint {
-    name: "Assert"
-  }
-  summary: "Asserts that the given condition is true."
-  description: <<END
-If `condition` evaluates to false, print the list of tensors in `data`.
-`summarize` determines how many entries of the tensors to print.
-END
-}
-op {
-  graph_op_name: "Assign"
-  endpoint {
-    name: "Assign"
-  }
-  summary: "Update \'ref\' by assigning \'value\' to it."
-  description: <<END
-This operation outputs "ref" after the assignment is done.
-This makes it easier to chain operations that need to use the reset value.
-END
-}
-op {
-  graph_op_name: "AssignAdd"
-  endpoint {
-    name: "AssignAdd"
-  }
-  summary: "Update \'ref\' by adding \'value\' to it."
-  description: <<END
-This operation outputs "ref" after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-END
-}
-op {
-  graph_op_name: "AssignSub"
-  endpoint {
-    name: "AssignSub"
-  }
-  summary: "Update \'ref\' by subtracting \'value\' from it."
-  description: <<END
-This operation outputs "ref" after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-END
-}
-op {
-  graph_op_name: "Atan"
-  endpoint {
-    name: "Atan"
-  }
-  summary: "Computes atan of x element-wise."
-}
-op {
-  graph_op_name: "Atan2"
-  endpoint {
-    name: "Atan2"
-  }
-  summary: "Computes arctangent of `y/x` element-wise, respecting signs of the arguments."
-  description: <<END
-This is the angle \( \theta \in [-\pi, \pi] \) such that
-\[ x = r \cos(\theta) \]
-and
-\[ y = r \sin(\theta) \]
-where \(r = \sqrt(x^2 + y^2) \).
-END
-}
-op {
-  graph_op_name: "Atanh"
-  endpoint {
-    name: "Atanh"
-  }
-  summary: "Computes inverse hyperbolic tangent of x element-wise."
-}
-op {
-  graph_op_name: "AudioSpectrogram"
-  endpoint {
-    name: "AudioSpectrogram"
-  }
-  summary: "Produces a visualization of audio data over time."
-  description: <<END
-Spectrograms are a standard way of representing audio information as a series of
-slices of frequency information, one slice for each window of time. By joining
-these together into a sequence, they form a distinctive fingerprint of the sound
-over time.
-
-This op expects to receive audio data as an input, stored as floats in the range
--1 to 1, together with a window width in samples, and a stride specifying how
-far to move the window between slices. From this it generates a three
-dimensional output. The lowest dimension has an amplitude value for each
-frequency during that time slice. The next dimension is time, with successive
-frequency slices. The final dimension is for the channels in the input, so a
-stereo audio input would have two here for example.
-
-This means the layout when converted and saved as an image is rotated 90 degrees
-clockwise from a typical spectrogram. Time is descending down the Y axis, and
-the frequency decreases from left to right.
-
-Each value in the result represents the square root of the sum of the real and
-imaginary parts of an FFT on the current window of samples. In this way, the
-lowest dimension represents the power of each frequency in the current window,
-and adjacent windows are concatenated in the next dimension.
-
-To get a more intuitive and visual look at what this operation does, you can run
-tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-resulting spectrogram as a PNG image.
-END
-}
-op {
-  graph_op_name: "AudioSummary"
-  endpoint {
-    name: "AudioSummary"
-  }
-  summary: "Outputs a `Summary` protocol buffer with audio."
-  description: <<END
-The summary has up to `max_outputs` summary values containing audio. The
-audio is built from `tensor` which must be 3-D with shape `[batch_size,
-frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-*  If `max_outputs` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-END
-}
-op {
-  graph_op_name: "AudioSummaryV2"
-  endpoint {
-    name: "AudioSummaryV2"
-  }
-  summary: "Outputs a `Summary` protocol buffer with audio."
-  description: <<END
-The summary has up to `max_outputs` summary values containing audio. The
-audio is built from `tensor` which must be 3-D with shape `[batch_size,
-frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-*  If `max_outputs` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-END
-}
-op {
-  graph_op_name: "AvgPool"
-  endpoint {
-    name: "AvgPool"
-  }
-  summary: "Performs average pooling on the input."
-  description: <<END
-Each entry in `output` is the mean of the corresponding size `ksize`
-window in `value`.
-END
-}
-op {
-  graph_op_name: "AvgPool3D"
-  endpoint {
-    name: "AvgPool3D"
-  }
-  summary: "Performs 3D average pooling on the input."
-}
-op {
-  graph_op_name: "AvgPool3DGrad"
-  endpoint {
-    name: "AvgPool3DGrad"
-  }
-  summary: "Computes gradients of average pooling function."
-}
-op {
-  graph_op_name: "AvgPoolGrad"
-  endpoint {
-    name: "AvgPoolGrad"
-  }
-  summary: "Computes gradients of the average pooling function."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Abort.pbtxt b/tensorflow/core/api_def/base_api/api_def_Abort.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6dd923c512af8d38ec04ec1116cc5da1e97d7e92
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Abort.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "Abort"
+  attr {
+    name: "error_msg"
+    description: <<END
+A string which is the message associated with the exception.
+END
+  }
+  summary: "Raise a exception to abort the process when called."
+  description: <<END
+If exit_without_error is true, the process will exit normally,
+otherwise it will exit with a SIGABORT signal.
+
+Returns nothing but an exception.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Abs.pbtxt b/tensorflow/core/api_def/base_api/api_def_Abs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..412891f4f4f97cb66e94db58bf089d105f4351f8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Abs.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "Abs"
+  summary: "Computes the absolute value of a tensor."
+  description: <<END
+Given a tensor `x`, this operation returns a tensor containing the absolute
+value of each element in `x`. For example, if x is an input element and y is
+an output element, this operation computes \\(y = |x|\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AccumulateNV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_AccumulateNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f20911d2d685ff129f02365a9d54e24ed724e0b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AccumulateNV2.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "AccumulateNV2"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of `Tensor` objects, each with same shape and type.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+Shape of elements of `inputs`.
+END
+  }
+  summary: "Returns the element-wise sum of a list of tensors."
+  description: <<END
+`tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+wait for all of its inputs to be ready before beginning to sum. This can
+save memory if inputs are ready at different times, since minimum temporary
+storage is proportional to the output size rather than the inputs size.
+
+Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+
+Returns a `Tensor` of same shape and type as the elements of `inputs`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_AccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25928a32ca03e45a211ca99aa97cf413036e27e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AccumulatorApplyGradient.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "AccumulatorApplyGradient"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a accumulator.
+END
+  }
+  in_arg {
+    name: "local_step"
+    description: <<END
+The local_step value at which the gradient was computed.
+END
+  }
+  in_arg {
+    name: "gradient"
+    description: <<END
+A tensor of the gradient to be accumulated.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The data type of accumulated gradients. Needs to correspond to the type
+of the accumulator.
+END
+  }
+  summary: "Applies a gradient to a given accumulator."
+  description: <<END
+Does not add if local_step is lesser than the accumulator's global_step.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AccumulatorNumAccumulated.pbtxt b/tensorflow/core/api_def/base_api/api_def_AccumulatorNumAccumulated.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..270265a8040b3f70637f2ee3d50066c2ad0eca6b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AccumulatorNumAccumulated.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "AccumulatorNumAccumulated"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to an accumulator.
+END
+  }
+  out_arg {
+    name: "num_accumulated"
+    description: <<END
+The number of gradients aggregated in the given accumulator.
+END
+  }
+  summary: "Returns the number of gradients aggregated in the given accumulators."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AccumulatorSetGlobalStep.pbtxt b/tensorflow/core/api_def/base_api/api_def_AccumulatorSetGlobalStep.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b08a0afbc2d21676109b5c9ffafb6e0fc5701bda
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AccumulatorSetGlobalStep.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "AccumulatorSetGlobalStep"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to an accumulator.
+END
+  }
+  in_arg {
+    name: "new_global_step"
+    description: <<END
+The new global_step value to set.
+END
+  }
+  summary: "Updates the accumulator with a new value for global_step."
+  description: <<END
+Logs warning if the accumulator's value is already higher than
+new_global_step.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_AccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e53de7c6fd990081c2ed14f0dc2c1117aae0527
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AccumulatorTakeGradient.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "AccumulatorTakeGradient"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to an accumulator.
+END
+  }
+  in_arg {
+    name: "num_required"
+    description: <<END
+Number of gradients required before we return an aggregate.
+END
+  }
+  out_arg {
+    name: "average"
+    description: <<END
+The average of the accumulated gradients.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The data type of accumulated gradients. Needs to correspond to the type
+of the accumulator.
+END
+  }
+  summary: "Extracts the average gradient in the given ConditionalAccumulator."
+  description: <<END
+The op blocks until sufficient (i.e., more than num_required)
+gradients have been accumulated.  If the accumulator has already
+aggregated more than num_required gradients, it returns the average of
+the accumulated gradients.  Also automatically increments the recorded
+global_step in the accumulator by 1, and resets the aggregate to 0.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2184b644b230186a0b379d129052c4d75b6fb4a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Acos"
+  summary: "Computes acos of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Acosh.pbtxt b/tensorflow/core/api_def/base_api/api_def_Acosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da77e814988d88a91e6f0fee717c88150a15ac97
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Acosh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Acosh"
+  summary: "Computes inverse hyperbolic cosine of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Add.pbtxt b/tensorflow/core/api_def/base_api/api_def_Add.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a408af380a36bfd335d2f00f64a4388c36aca4e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Add.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Add"
+  summary: "Returns x + y element-wise."
+  description: <<END
+*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AddManySparseToTensorsMap.pbtxt b/tensorflow/core/api_def/base_api/api_def_AddManySparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e5726a2d3e629ae8bc5b49e806935ee135e4fd9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AddManySparseToTensorsMap.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "AddManySparseToTensorsMap"
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+2-D.  The `indices` of the minibatch `SparseTensor`.
+`sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+END
+  }
+  in_arg {
+    name: "sparse_values"
+    description: <<END
+1-D.  The `values` of the minibatch `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_shape"
+    description: <<END
+1-D.  The `shape` of the minibatch `SparseTensor`.
+The minibatch size `N == sparse_shape[0]`.
+END
+  }
+  out_arg {
+    name: "sparse_handles"
+    description: <<END
+1-D.  The handles of the `SparseTensor` now stored in the
+`SparseTensorsMap`.  Shape: `[N]`.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+The container name for the `SparseTensorsMap` created by this op.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+The shared name for the `SparseTensorsMap` created by this op.
+If blank, the new Operation's unique name is used.
+END
+  }
+  summary: "Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles."
+  description: <<END
+A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+`sparse_values`, and `sparse_shape`, where
+
+```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+
+An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+having a first `sparse_indices` column taking values between `[0, N)`, where
+the minibatch size `N == sparse_shape[0]`.
+
+The input `SparseTensor` must have rank `R` greater than 1, and the first
+dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+must be sorted in increasing order of this first dimension.  The stored
+`SparseTensor` objects pointed to by each row of the output `sparse_handles`
+will have rank `R-1`.
+
+The `SparseTensor` values can then be read out as part of a minibatch by passing
+the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+the correct `SparseTensorsMap` is accessed, ensure that the same
+`container` and `shared_name` are passed to that Op.  If no `shared_name`
+is provided here, instead use the *name* of the Operation created by calling
+`AddManySparseToTensorsMap` as the `shared_name` passed to
+`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AddN.pbtxt b/tensorflow/core/api_def/base_api/api_def_AddN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64677763a498c89368a510186cede3a553a31df4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AddN.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "AddN"
+  in_arg {
+    name: "inputs"
+    description: <<END
+Must all be the same size and shape.
+END
+  }
+  summary: "Add all input tensors element wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AddSparseToTensorsMap.pbtxt b/tensorflow/core/api_def/base_api/api_def_AddSparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0438eac6549d974773e156ad00513671431e92a7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AddSparseToTensorsMap.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "AddSparseToTensorsMap"
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+2-D.  The `indices` of the `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_values"
+    description: <<END
+1-D.  The `values` of the `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_shape"
+    description: <<END
+1-D.  The `shape` of the `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "sparse_handle"
+    description: <<END
+0-D.  The handle of the `SparseTensor` now stored in the
+`SparseTensorsMap`.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+The container name for the `SparseTensorsMap` created by this op.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+The shared name for the `SparseTensorsMap` created by this op.
+If blank, the new Operation's unique name is used.
+END
+  }
+  summary: "Add a `SparseTensor` to a `SparseTensorsMap` return its handle."
+  description: <<END
+A `SparseTensor` is represented by three tensors: `sparse_indices`,
+`sparse_values`, and `sparse_shape`.
+
+This operator takes the given `SparseTensor` and adds it to a container
+object (a `SparseTensorsMap`).  A unique key within this container is generated
+in the form of an `int64`, and this is the value that is returned.
+
+The `SparseTensor` can then be read out as part of a minibatch by passing
+the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+the correct `SparseTensorsMap` is accessed, ensure that the same
+`container` and `shared_name` are passed to that Op.  If no `shared_name`
+is provided here, instead use the *name* of the Operation created by calling
+`AddSparseToTensorsMap` as the `shared_name` passed to
+`TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AddV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_AddV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e4db21151ec50b66b1befe34ac29c714e5d1ff2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AddV2.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "AddV2"
+  summary: "Returns x + y element-wise."
+  description: <<END
+*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AdjustContrast.pbtxt b/tensorflow/core/api_def/base_api/api_def_AdjustContrast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45988d7e36196e401093d584082d73e94dc3df3b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AdjustContrast.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustContrast"
+  summary: "Deprecated. Disallowed in GraphDef version >= 2."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AdjustContrastv2.pbtxt b/tensorflow/core/api_def/base_api/api_def_AdjustContrastv2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..429a5e4434e011d1ba43847b9abf8877b4d41e7a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AdjustContrastv2.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "AdjustContrastv2"
+  endpoint {
+    name: "AdjustContrast"
+  }
+  in_arg {
+    name: "images"
+    description: <<END
+Images to adjust.  At least 3-D.
+END
+  }
+  in_arg {
+    name: "contrast_factor"
+    description: <<END
+A float multiplier for adjusting contrast.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The contrast-adjusted image or images.
+END
+  }
+  summary: "Adjust the contrast of one or more images."
+  description: <<END
+`images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+interpreted as `[height, width, channels]`.  The other dimensions only
+represent a collection of images, such as `[batch, height, width, channels].`
+
+Contrast is adjusted independently for each channel of each image.
+
+For each channel, the Op first computes the mean of the image pixels in the
+channel and then adjusts each component of each pixel to
+`(x - mean) * contrast_factor + mean`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt b/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bfaf676860134f9af407a7a94b34d257eb4cf96b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AdjustHue.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "AdjustHue"
+  in_arg {
+    name: "images"
+    description: <<END
+Images to adjust.  At least 3-D.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+A float delta to add to the hue.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The hue-adjusted image or images.
+END
+  }
+  summary: "Adjust the hue of one or more images."
+  description: <<END
+`images` is a tensor of at least 3 dimensions.  The last dimension is
+interpretted as channels, and must be three.
+
+The input image is considered in the RGB colorspace. Conceptually, the RGB
+colors are first mapped into HSV. A delta is then applied all the hue values,
+and then remapped back to RGB colorspace.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AdjustSaturation.pbtxt b/tensorflow/core/api_def/base_api/api_def_AdjustSaturation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97be0fda11f143a92d6d4f6cb909f00c1cee7fb0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AdjustSaturation.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "AdjustSaturation"
+  in_arg {
+    name: "images"
+    description: <<END
+Images to adjust.  At least 3-D.
+END
+  }
+  in_arg {
+    name: "scale"
+    description: <<END
+A float scale to add to the saturation.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The hue-adjusted image or images.
+END
+  }
+  summary: "Adjust the saturation of one or more images."
+  description: <<END
+`images` is a tensor of at least 3 dimensions.  The last dimension is
+interpretted as channels, and must be three.
+
+The input image is considered in the RGB colorspace. Conceptually, the RGB
+colors are first mapped into HSV. A scale is then applied all the saturation
+values, and then remapped back to RGB colorspace.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_All.pbtxt b/tensorflow/core/api_def/base_api/api_def_All.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..623389988a3257cd8fbe6d96d862343f976f6b55
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_All.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "All"
+  endpoint {
+    name: "All"
+  }
+  endpoint {
+    name: "ReduceAll"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the \"logical and\" of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AllCandidateSampler.pbtxt b/tensorflow/core/api_def/base_api/api_def_AllCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38b8e2bfbababa717f534300986f54951e357fde
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AllCandidateSampler.pbtxt
@@ -0,0 +1,80 @@
+op {
+  graph_op_name: "AllCandidateSampler"
+  in_arg {
+    name: "true_classes"
+    description: <<END
+A batch_size * num_true matrix, in which each row contains the
+IDs of the num_true target_classes in the corresponding original label.
+END
+  }
+  out_arg {
+    name: "sampled_candidates"
+    description: <<END
+A vector of length num_sampled, in which each element is
+the ID of a sampled candidate.
+END
+  }
+  out_arg {
+    name: "true_expected_count"
+    description: <<END
+A batch_size * num_true matrix, representing
+the number of times each candidate is expected to occur in a batch
+of sampled candidates. If unique=true, then this is a probability.
+END
+  }
+  out_arg {
+    name: "sampled_expected_count"
+    description: <<END
+A vector of length num_sampled, for each sampled
+candidate representing the number of times the candidate is expected
+to occur in a batch of sampled candidates.  If unique=true, then this is a
+probability.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "num_sampled"
+    description: <<END
+Number of candidates to produce.
+END
+  }
+  attr {
+    name: "unique"
+    description: <<END
+If unique is true, we sample with rejection, so that all sampled
+candidates in a batch are unique. This requires some approximation to
+estimate the post-rejection sampling probabilities.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Generates labels for candidate sampling with a learned unigram distribution."
+  description: <<END
+See explanations of candidate sampling and the data formats at
+go/candidate-sampling.
+
+For each batch, this op picks a single set of sampled candidate labels.
+
+The advantages of sampling candidates per-batch are simplicity and the
+possibility of efficient dense matrix multiplication. The disadvantage is that
+the sampled candidates must be chosen independently of the context and of the
+true labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Angle.pbtxt b/tensorflow/core/api_def/base_api/api_def_Angle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a26e5e244719a6f40edfa93a0c863b47f68cdaf5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Angle.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "Angle"
+  summary: "Returns the argument of a complex number."
+  description: <<END
+Given a tensor `input` of complex numbers, this operation returns a tensor of
+type `float` that is the argument of each element in `input`. All elements in
+`input` must be complex numbers of the form \\(a + bj\\), where *a*
+is the real part and *b* is the imaginary part.
+
+The argument returned by this operation is of the form \\(atan2(b, a)\\).
+
+For example:
+
+```
+# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+tf.angle(input) ==> [2.0132, 1.056]
+```
+
+@compatibility(numpy)
+Equivalent to np.angle.
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Any.pbtxt b/tensorflow/core/api_def/base_api/api_def_Any.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09fd4e0b6036447dfe355ff56da29e276de62f2b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Any.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Any"
+  endpoint {
+    name: "Any"
+  }
+  endpoint {
+    name: "ReduceAny"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the \"logical or\" of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdadelta.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3aa32ba9fa1a589d84a735ce0b958a4d4272041
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdadelta.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ApplyAdadelta"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum_update"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Constant factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var, accum and update_accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' according to the adadelta scheme."
+  description: <<END
+accum = rho() * accum + (1 - rho()) * grad.square();
+update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+update_accum = rho() * update_accum + (1 - rho()) * update.square();
+var -= update;
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..057786b6aa4ea741d09d7b5534692ecbd52dce00
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdagrad.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "ApplyAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the adagrad scheme."
+  description: <<END
+accum += grad * grad
+var -= lr * grad * (1 / sqrt(accum))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdagradDA.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1453bb558dfcf2db2c3a881b71d10a38b1999322
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdagradDA.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ApplyAdagradDA"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_squared_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "global_step"
+    description: <<END
+Training step number. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' according to the proximal adagrad scheme."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2858a1bfbb5a30e5e72bcfad694f696e08d2346
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAdam.pbtxt
@@ -0,0 +1,90 @@
+op {
+  graph_op_name: "ApplyAdam"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, uses the nesterov update.
+END
+  }
+  summary: "Update \'*var\' according to the Adam algorithm."
+  description: <<END
+lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyAddSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd4609525262d1e03af7d945cdacac7ea32f0546
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyAddSign.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ApplyAddSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c88d18d3b2050e428a59aa5f49d12bcc266cd80a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,86 @@
+op {
+  graph_op_name: "ApplyCenteredRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, mg, ms, and mom tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the centered RMSProp algorithm."
+  description: <<END
+The centered RMSProp algorithm uses an estimate of the centered second moment
+(i.e., the variance) for normalization, as opposed to regular RMSProp, which
+uses the (uncentered) second moment. This often helps with training, but is
+slightly more expensive in terms of computation and memory.
+
+Note that in dense implementation of this algorithm, mg, ms, and mom will
+update even if the grad is zero, but in this sparse implementation, mg, ms,
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+mean_grad = decay * mean_grad + (1-decay) * gradient
+
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+
+mg <- rho * mg_{t-1} + (1-rho) * grad
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77da9e4d510b0d5d57c7e46fae044a55f87ef828
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt
@@ -0,0 +1,73 @@
+op {
+  graph_op_name: "ApplyFtrl"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+accum_new = accum + grad * grad
+linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..974f3adc196129f9fe83d098c22dc3cd237263d6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt
@@ -0,0 +1,75 @@
+op {
+  graph_op_name: "ApplyFtrlV2"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 shrinkage regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyGradientDescent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f38ebd1b8c89a1a65368d3da38cead73225ada5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyGradientDescent.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "ApplyGradientDescent"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+The change.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..55326fd35cf9c96c23da422896f8fede1a276e5b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyMomentum.pbtxt
@@ -0,0 +1,62 @@
+op {
+  graph_op_name: "ApplyMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
+END
+  }
+  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
+  description: <<END
+want to use Nesterov momentum.
+
+accum = accum * momentum + grad
+var -= lr * accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyPowerSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfa5619b87944a80c2915c196e4ae10a4cccb25f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyPowerSign.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ApplyPowerSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "logbase"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a683ba12a45716dcacef4ae2deef5952ec2293da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyProximalAdagrad.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "ApplyProximalAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
+  description: <<END
+accum += grad * grad
+prox_v = var - lr * grad * (1 / sqrt(accum))
+var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7914c60b7137320b5ee81e7afb30e8a884d71fe6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "ApplyProximalGradientDescent"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+The change.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
+  description: <<END
+prox_v = var - alpha * delta
+var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ecf89c0f4a86c09b87338843f8836e5c33aa50c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApplyRMSProp.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "ApplyRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, ms, and mom tensors is protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the RMSProp algorithm."
+  description: <<END
+Note that in dense implementation of this algorithm, ms and mom will
+update even if the grad is zero, but in this sparse implementation, ms
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ApproximateEqual.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApproximateEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8842fa9bbcec584895efc7216983a121c9b71246
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ApproximateEqual.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApproximateEqual"
+  summary: "Returns the truth value of abs(x-y) < tolerance element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ArgMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_ArgMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cc81d1c8b7194cabaf7f7c3329b4984d4de0276
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ArgMax.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "ArgMax"
+  in_arg {
+    name: "dimension"
+    description: <<END
+int32 or int64, must be in the range `[-rank(input), rank(input))`.
+Describes which dimension of the input Tensor to reduce across. For vectors,
+use dimension = 0.
+END
+  }
+  summary: "Returns the index with the largest value across dimensions of a tensor."
+  description: <<END
+Note that in case of ties the identity of the return value is not guaranteed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ArgMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_ArgMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb7410c5fd6e8a13f39cb633e44a0f89a3ba5b87
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ArgMin.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "ArgMin"
+  in_arg {
+    name: "dimension"
+    description: <<END
+int32 or int64, must be in the range `[-rank(input), rank(input))`.
+Describes which dimension of the input Tensor to reduce across. For vectors,
+use dimension = 0.
+END
+  }
+  summary: "Returns the index with the smallest value across dimensions of a tensor."
+  description: <<END
+Note that in case of ties the identity of the return value is not guaranteed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/base_api/api_def_AsString.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f2bca8eda86056d77a0274a76b44555bfc33220
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AsString.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "AsString"
+  attr {
+    name: "precision"
+    description: <<END
+The post-decimal precision to use for floating point numbers.
+Only used if precision > -1.
+END
+  }
+  attr {
+    name: "scientific"
+    description: <<END
+Use scientific notation for floating point numbers.
+END
+  }
+  attr {
+    name: "shortest"
+    description: <<END
+Use shortest representation (either scientific or standard) for
+floating point numbers.
+END
+  }
+  attr {
+    name: "width"
+    description: <<END
+Pad pre-decimal numbers to this width.
+Applies to both floating point and integer numbers.
+Only used if width > -1.
+END
+  }
+  attr {
+    name: "fill"
+    description: <<END
+The value to pad if width > -1.  If empty, pads with spaces.
+Another typical value is '0'.  String cannot be longer than 1 character.
+END
+  }
+  summary: "Converts each entry in the given tensor to strings.  Supports many numeric"
+  description: <<END
+types and boolean.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Asin.pbtxt b/tensorflow/core/api_def/base_api/api_def_Asin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19e1b144214e65b928309bd8e1075a4f35528d2b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Asin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Asin"
+  summary: "Computes asin of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Asinh.pbtxt b/tensorflow/core/api_def/base_api/api_def_Asinh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20f4dab8613b1bb4418ae2754f39e327bb484a11
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Asinh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Asinh"
+  summary: "Computes inverse hyperbolic sine of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Assert.pbtxt b/tensorflow/core/api_def/base_api/api_def_Assert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90e5df814936f4851bd515d1faa34eb206197181
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Assert.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "Assert"
+  in_arg {
+    name: "condition"
+    description: <<END
+The condition to evaluate.
+END
+  }
+  in_arg {
+    name: "data"
+    description: <<END
+The tensors to print out when condition is false.
+END
+  }
+  attr {
+    name: "summarize"
+    description: <<END
+Print this many entries of each tensor.
+END
+  }
+  summary: "Asserts that the given condition is true."
+  description: <<END
+If `condition` evaluates to false, print the list of tensors in `data`.
+`summarize` determines how many entries of the tensors to print.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Assign.pbtxt b/tensorflow/core/api_def/base_api/api_def_Assign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae9b49f49c1ec0e74305653cbc45ac8344b5bac
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Assign.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Assign"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node. May be uninitialized.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The value to be assigned to the variable.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as "ref".  Returned as a convenience for operations that want
+to use the new value after the variable has been reset.
+END
+  }
+  attr {
+    name: "validate_shape"
+    description: <<END
+If true, the operation will validate that the shape
+of 'value' matches the shape of the Tensor being assigned to.  If false,
+'ref' will take on the shape of 'value'.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the assignment will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'ref\' by assigning \'value\' to it."
+  description: <<END
+This operation outputs "ref" after the assignment is done.
+This makes it easier to chain operations that need to use the reset value.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d09ec5e196359202ccdb73ca4db504a8a3901b7f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AssignAdd.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "AssignAdd"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The value to be added to the variable.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as "ref".  Returned as a convenience for operations that want
+to use the new value after the variable has been updated.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the addition will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'ref\' by adding \'value\' to it."
+  description: <<END
+This operation outputs "ref" after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d21d7bab699ff481c65ed44eb9bf66ec14ea387
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AssignAddVariableOp.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "AssignAddVariableOp"
+  in_arg {
+    name: "resource"
+    description: <<END
+handle to the resource in which to store the variable.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+the value by which the variable will be incremented.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+the dtype of the value.
+END
+  }
+  summary: "Adds a value to the current value of a variable."
+  description: <<END
+Any ReadVariableOp which depends directly or indirectly on this assign is
+guaranteed to see the incremented value or a subsequent newer one.
+
+Outputs the incremented value, which can be used to totally order the
+increments to this variable.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..191a5c34fea6ec1a1b85c3135372ba4b91f47909
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AssignSub.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "AssignSub"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The value to be subtracted to the variable.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as "ref".  Returned as a convenience for operations that want
+to use the new value after the variable has been updated.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'ref\' by subtracting \'value\' from it."
+  description: <<END
+This operation outputs "ref" after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..102201c4cb07b080c46a28a91af8a4176034f6e6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AssignSubVariableOp.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "AssignSubVariableOp"
+  in_arg {
+    name: "resource"
+    description: <<END
+handle to the resource in which to store the variable.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+the value by which the variable will be incremented.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+the dtype of the value.
+END
+  }
+  summary: "Subtracts a value from the current value of a variable."
+  description: <<END
+Any ReadVariableOp which depends directly or indirectly on this assign is
+guaranteed to see the incremented value or a subsequent newer one.
+
+Outputs the incremented value, which can be used to totally order the
+increments to this variable.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AssignVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_AssignVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6fe81d573120678b350b07c35b1da4960fcb36e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AssignVariableOp.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "AssignVariableOp"
+  in_arg {
+    name: "resource"
+    description: <<END
+handle to the resource in which to store the variable.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+the value to set the new tensor to use.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+the dtype of the value.
+END
+  }
+  summary: "Assigns a new value to a variable."
+  description: <<END
+Any ReadVariableOp with a control dependency on this op is guaranteed to return
+this value or a subsequent newer value of the variable.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Atan.pbtxt b/tensorflow/core/api_def/base_api/api_def_Atan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..557cf183e4f321afea6246c0348342281ecfdea9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Atan.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Atan"
+  summary: "Computes atan of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Atan2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Atan2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2c8ef593932374b940898cb4c72f37e3a7ff14e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Atan2.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "Atan2"
+  summary: "Computes arctangent of `y/x` element-wise, respecting signs of the arguments."
+  description: <<END
+This is the angle \( \theta \in [-\pi, \pi] \) such that
+\[ x = r \cos(\theta) \]
+and
+\[ y = r \sin(\theta) \]
+where \(r = \sqrt(x^2 + y^2) \).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Atanh.pbtxt b/tensorflow/core/api_def/base_api/api_def_Atanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ef1180f3d61f85adb853e17d98eb89ea21645f6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Atanh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Atanh"
+  summary: "Computes inverse hyperbolic tangent of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt b/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6631f4e04cb4dc9a911d563a75433c52f4077dc3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AudioSpectrogram.pbtxt
@@ -0,0 +1,63 @@
+op {
+  graph_op_name: "AudioSpectrogram"
+  in_arg {
+    name: "input"
+    description: <<END
+Float representation of audio data.
+END
+  }
+  out_arg {
+    name: "spectrogram"
+    description: <<END
+3D representation of the audio frequencies as an image.
+END
+  }
+  attr {
+    name: "window_size"
+    description: <<END
+How wide the input window is in samples. For the highest efficiency
+this should be a power of two, but other values are accepted.
+END
+  }
+  attr {
+    name: "stride"
+    description: <<END
+How widely apart the center of adjacent sample windows should be.
+END
+  }
+  attr {
+    name: "magnitude_squared"
+    description: <<END
+Whether to return the squared magnitude or just the
+magnitude. Using squared magnitude can avoid extra calculations.
+END
+  }
+  summary: "Produces a visualization of audio data over time."
+  description: <<END
+Spectrograms are a standard way of representing audio information as a series of
+slices of frequency information, one slice for each window of time. By joining
+these together into a sequence, they form a distinctive fingerprint of the sound
+over time.
+
+This op expects to receive audio data as an input, stored as floats in the range
+-1 to 1, together with a window width in samples, and a stride specifying how
+far to move the window between slices. From this it generates a three
+dimensional output. The lowest dimension has an amplitude value for each
+frequency during that time slice. The next dimension is time, with successive
+frequency slices. The final dimension is for the channels in the input, so a
+stereo audio input would have two here for example.
+
+This means the layout when converted and saved as an image is rotated 90 degrees
+clockwise from a typical spectrogram. Time is descending down the Y axis, and
+the frequency decreases from left to right.
+
+Each value in the result represents the square root of the sum of the real and
+imaginary parts of an FFT on the current window of samples. In this way, the
+lowest dimension represents the power of each frequency in the current window,
+and adjacent windows are concatenated in the next dimension.
+
+To get a more intuitive and visual look at what this operation does, you can run
+tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+resulting spectrogram as a PNG image.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AudioSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_AudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3bc70d7ce8afa94b48ed1313936371689caf397a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AudioSummary.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "AudioSummary"
+  in_arg {
+    name: "tag"
+    description: <<END
+Scalar. Used to build the `tag` attribute of the summary values.
+END
+  }
+  in_arg {
+    name: "tensor"
+    description: <<END
+2-D of shape `[batch_size, frames]`.
+END
+  }
+  out_arg {
+    name: "summary"
+    description: <<END
+Scalar. Serialized `Summary` protocol buffer.
+END
+  }
+  attr {
+    name: "sample_rate"
+    description: <<END
+The sample rate of the signal in hertz.
+END
+  }
+  attr {
+    name: "max_outputs"
+    description: <<END
+Max number of batch elements to generate audio for.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with audio."
+  description: <<END
+The summary has up to `max_outputs` summary values containing audio. The
+audio is built from `tensor` which must be 3-D with shape `[batch_size,
+frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+
+The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+build the `tag` of the summary values:
+
+*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+*  If `max_outputs` is greater than 1, the summary value tags are
+   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AudioSummaryV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_AudioSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d406f22d3511f9d80a56b7296dfe5e9727aef2cb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AudioSummaryV2.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "AudioSummaryV2"
+  endpoint {
+    name: "AudioSummary"
+  }
+  in_arg {
+    name: "tag"
+    description: <<END
+Scalar. Used to build the `tag` attribute of the summary values.
+END
+  }
+  in_arg {
+    name: "tensor"
+    description: <<END
+2-D of shape `[batch_size, frames]`.
+END
+  }
+  in_arg {
+    name: "sample_rate"
+    description: <<END
+The sample rate of the signal in hertz.
+END
+  }
+  out_arg {
+    name: "summary"
+    description: <<END
+Scalar. Serialized `Summary` protocol buffer.
+END
+  }
+  attr {
+    name: "max_outputs"
+    description: <<END
+Max number of batch elements to generate audio for.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with audio."
+  description: <<END
+The summary has up to `max_outputs` summary values containing audio. The
+audio is built from `tensor` which must be 3-D with shape `[batch_size,
+frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+
+The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+build the `tag` of the summary values:
+
+*  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+*  If `max_outputs` is greater than 1, the summary value tags are
+   generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AvgPool.pbtxt b/tensorflow/core/api_def/base_api/api_def_AvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d94662f6a22175cafe76de19d9502fad95d9644
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AvgPool.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "AvgPool"
+  in_arg {
+    name: "value"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The average pooled output tensor.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the sliding window for each dimension of `value`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of `value`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Performs average pooling on the input."
+  description: <<END
+Each entry in `output` is the mean of the corresponding size `ksize`
+window in `value`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8171566a212b73ac15dbc116d2ea44eb6cb4fb92
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AvgPool3D.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "AvgPool3D"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The average pooled output tensor.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+1-D tensor of length 5. The size of the window for each dimension of
+the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  summary: "Performs 3D average pooling on the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AvgPool3DGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_AvgPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f96be48739a1a207a2ff0bd05aadd88278b2591
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AvgPool3DGrad.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "AvgPool3DGrad"
+  in_arg {
+    name: "orig_input_shape"
+    description: <<END
+The original input dimensions.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+Output backprop of shape `[batch, depth, rows, cols, channels]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The backprop for input.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+1-D tensor of length 5. The size of the window for each dimension of
+the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  summary: "Computes gradients of average pooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_AvgPoolGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_AvgPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84e77f3ced7fdcb0bc6c0537cfed2210a460e443
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AvgPoolGrad.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "AvgPoolGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "orig_input_shape"
+    description: <<END
+1-D.  Shape of the original input to `avg_pool`.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+the output of `avg_pool`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D.  Gradients w.r.t. the input of `avg_pool`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the sliding window for each dimension of the input.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Computes gradients of the average pooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_B.pbtxt b/tensorflow/core/api_def/base_api/api_def_B.pbtxt
deleted file mode 100644
index 716d397f9a3d4bca268d625116890430d0c294f8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_B.pbtxt
+++ /dev/null
@@ -1,448 +0,0 @@
-op {
-  graph_op_name: "Barrier"
-  endpoint {
-    name: "Barrier"
-  }
-  summary: "Defines a barrier that persists across different graph executions."
-  description: <<END
-A barrier represents a key-value map, where each key is a string, and
-each value is a tuple of tensors.
-
-At runtime, the barrier contains 'complete' and 'incomplete'
-elements. A complete element has defined tensors for all components of
-its value tuple, and may be accessed using BarrierTakeMany. An
-incomplete element has some undefined components in its value tuple,
-and may be updated using BarrierInsertMany.
-END
-}
-op {
-  graph_op_name: "BarrierClose"
-  endpoint {
-    name: "BarrierClose"
-  }
-  summary: "Closes the given barrier."
-  description: <<END
-This operation signals that no more new elements will be inserted in the
-given barrier. Subsequent InsertMany that try to introduce a new key will fail.
-Subsequent InsertMany operations that just add missing components to already
-existing elements will continue to succeed. Subsequent TakeMany operations will
-continue to succeed if sufficient completed elements remain in the barrier.
-Subsequent TakeMany operations that would block will fail immediately.
-END
-}
-op {
-  graph_op_name: "BarrierIncompleteSize"
-  endpoint {
-    name: "BarrierIncompleteSize"
-  }
-  summary: "Computes the number of incomplete elements in the given barrier."
-}
-op {
-  graph_op_name: "BarrierInsertMany"
-  endpoint {
-    name: "BarrierInsertMany"
-  }
-  summary: "For each key, assigns the respective value to the specified component."
-  description: <<END
-If a key is not found in the barrier, this operation will create a new
-incomplete element. If a key is found in the barrier, and the element
-already has a value at component_index, this operation will fail with
-INVALID_ARGUMENT, and leave the barrier in an undefined state.
-END
-}
-op {
-  graph_op_name: "BarrierReadySize"
-  endpoint {
-    name: "BarrierReadySize"
-  }
-  summary: "Computes the number of complete elements in the given barrier."
-}
-op {
-  graph_op_name: "BarrierTakeMany"
-  endpoint {
-    name: "BarrierTakeMany"
-  }
-  summary: "Takes the given number of completed elements from a barrier."
-  description: <<END
-This operation concatenates completed-element component tensors along
-the 0th dimension to make a single component tensor.
-
-Elements come out of the barrier when they are complete, and in the order
-in which they were placed into the barrier.  The indices output provides
-information about the batch in which each element was originally inserted
-into the barrier.
-END
-}
-op {
-  graph_op_name: "BatchCholesky"
-  endpoint {
-    name: "BatchCholesky"
-  }
-}
-op {
-  graph_op_name: "BatchCholeskyGrad"
-  endpoint {
-    name: "BatchCholeskyGrad"
-  }
-}
-op {
-  graph_op_name: "BatchDataset"
-  endpoint {
-    name: "BatchDataset"
-  }
-  summary: "Creates a dataset that batches `batch_size` elements from `input_dataset`."
-}
-op {
-  graph_op_name: "BatchFFT"
-  endpoint {
-    name: "BatchFFT"
-  }
-}
-op {
-  graph_op_name: "BatchFFT2D"
-  endpoint {
-    name: "BatchFFT2D"
-  }
-}
-op {
-  graph_op_name: "BatchFFT3D"
-  endpoint {
-    name: "BatchFFT3D"
-  }
-}
-op {
-  graph_op_name: "BatchIFFT"
-  endpoint {
-    name: "BatchIFFT"
-  }
-}
-op {
-  graph_op_name: "BatchIFFT2D"
-  endpoint {
-    name: "BatchIFFT2D"
-  }
-}
-op {
-  graph_op_name: "BatchIFFT3D"
-  endpoint {
-    name: "BatchIFFT3D"
-  }
-}
-op {
-  graph_op_name: "BatchMatMul"
-  endpoint {
-    name: "BatchMatMul"
-  }
-  summary: "Multiplies slices of two tensors in batches."
-  description: <<END
-Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-viewed as an element of a batch), and arranges the individual results
-in a single output tensor of the same batch size. Each of the
-individual slices can optionally be adjointed (to adjoint a matrix
-means to transpose and conjugate it) before multiplication by setting
-the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-
-The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-and `[..., r_y, c_y]`.
-
-The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-
-    r_o = c_x if adj_x else r_x
-    c_o = r_y if adj_y else c_y
-
-It is computed as:
-
-    output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-END
-}
-op {
-  graph_op_name: "BatchMatrixBandPart"
-  endpoint {
-    name: "BatchMatrixBandPart"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixDeterminant"
-  endpoint {
-    name: "BatchMatrixDeterminant"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixDiag"
-  endpoint {
-    name: "BatchMatrixDiag"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixDiagPart"
-  endpoint {
-    name: "BatchMatrixDiagPart"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixInverse"
-  endpoint {
-    name: "BatchMatrixInverse"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixSetDiag"
-  endpoint {
-    name: "BatchMatrixSetDiag"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixSolve"
-  endpoint {
-    name: "BatchMatrixSolve"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixSolveLs"
-  endpoint {
-    name: "BatchMatrixSolveLs"
-  }
-}
-op {
-  graph_op_name: "BatchMatrixTriangularSolve"
-  endpoint {
-    name: "BatchMatrixTriangularSolve"
-  }
-}
-op {
-  graph_op_name: "BatchNormWithGlobalNormalization"
-  endpoint {
-    name: "BatchNormWithGlobalNormalization"
-  }
-  summary: "Batch normalization."
-  description: <<END
-This op is deprecated. Prefer `tf.nn.batch_normalization`.
-END
-}
-op {
-  graph_op_name: "BatchNormWithGlobalNormalizationGrad"
-  endpoint {
-    name: "BatchNormWithGlobalNormalizationGrad"
-  }
-  summary: "Gradients for batch normalization."
-  description: <<END
-This op is deprecated. See `tf.nn.batch_normalization`.
-END
-}
-op {
-  graph_op_name: "BatchSelfAdjointEig"
-  endpoint {
-    name: "BatchSelfAdjointEig"
-  }
-}
-op {
-  graph_op_name: "BatchSelfAdjointEigV2"
-  endpoint {
-    name: "BatchSelfAdjointEigV2"
-  }
-}
-op {
-  graph_op_name: "BatchSvd"
-  endpoint {
-    name: "BatchSvd"
-  }
-}
-op {
-  graph_op_name: "BatchToSpace"
-  endpoint {
-    name: "BatchToSpace"
-  }
-  summary: "BatchToSpace for 4-D tensors of type T."
-  description: <<END
-This is a legacy version of the more general BatchToSpaceND.
-
-Rearranges (permutes) data from batch into blocks of spatial data, followed by
-cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-this op outputs a copy of the input tensor where values from the `batch`
-dimension are moved in spatial blocks to the `height` and `width` dimensions,
-followed by cropping along the `height` and `width` dimensions.
-END
-}
-op {
-  graph_op_name: "BatchToSpaceND"
-  endpoint {
-    name: "BatchToSpaceND"
-  }
-  summary: "BatchToSpace for N-D tensors of type T."
-  description: <<END
-This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-`block_shape + [batch]`, interleaves these blocks back into the grid defined by
-the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-the input.  The spatial dimensions of this intermediate result are then
-optionally cropped according to `crops` to produce the output.  This is the
-reverse of SpaceToBatch.  See below for a precise description.
-END
-}
-op {
-  graph_op_name: "Betainc"
-  endpoint {
-    name: "Betainc"
-  }
-  summary: "Compute the regularized incomplete beta integral \\\\(I_x(a, b)\\\\)."
-  description: <<END
-The regularized incomplete beta integral is defined as:
-
-
-\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-
-where
-
-
-\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-
-
-is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-beta function.
-END
-}
-op {
-  graph_op_name: "BiasAdd"
-  endpoint {
-    name: "BiasAdd"
-  }
-  summary: "Adds `bias` to `value`."
-  description: <<END
-This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-Broadcasting is supported, so `value` may have any number of dimensions.
-END
-}
-op {
-  graph_op_name: "BiasAddGrad"
-  endpoint {
-    name: "BiasAddGrad"
-  }
-  summary: "The backward operation for \"BiasAdd\" on the \"bias\" tensor."
-  description: <<END
-It accumulates all the values from out_backprop into the feature dimension.
-For NHWC data format, the feature dimension is the last. For NCHW data format,
-the feature dimension is the third-to-last.
-END
-}
-op {
-  graph_op_name: "BiasAddV1"
-  endpoint {
-    name: "BiasAddV1"
-  }
-  summary: "Adds `bias` to `value`."
-  description: <<END
-This is a deprecated version of BiasAdd and will be soon removed.
-
-This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-Broadcasting is supported, so `value` may have any number of dimensions.
-END
-}
-op {
-  graph_op_name: "Bincount"
-  endpoint {
-    name: "Bincount"
-  }
-  summary: "Counts the number of occurrences of each value in an integer array."
-  description: <<END
-Outputs a vector with length `size` and the same dtype as `weights`. If
-`weights` are empty, then index `i` stores the number of times the value `i` is
-counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-the value in `weights` at each index where the corresponding value in `arr` is
-`i`.
-
-Values in `arr` outside of the range [0, size) are ignored.
-END
-}
-op {
-  graph_op_name: "Bitcast"
-  endpoint {
-    name: "Bitcast"
-  }
-  summary: "Bitcasts a tensor from one type to another without copying data."
-  description: <<END
-Given a tensor `input`, this operation returns a tensor that has the same buffer
-data as `input` with datatype `type`.
-
-If the input datatype `T` is larger than the output datatype `type` then the
-shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
-
-If `T` is smaller than `type`, the operator requires that the rightmost
-dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
-[..., sizeof(`type`)/sizeof(`T`)] to [...].
-
-*NOTE*: Bitcast is implemented as a low-level cast, so machines with different
-endian orderings will give different results.
-END
-}
-op {
-  graph_op_name: "BitwiseAnd"
-  endpoint {
-    name: "BitwiseAnd"
-  }
-  summary: "Elementwise computes the bitwise AND of `x` and `y`."
-  description: <<END
-The result will have those bits set, that are set in both `x` and `y`. The
-computation is performed on the underlying representations of `x` and `y`.
-END
-}
-op {
-  graph_op_name: "BitwiseOr"
-  endpoint {
-    name: "BitwiseOr"
-  }
-  summary: "Elementwise computes the bitwise OR of `x` and `y`."
-  description: <<END
-The result will have those bits set, that are set in `x`, `y` or both. The
-computation is performed on the underlying representations of `x` and `y`.
-END
-}
-op {
-  graph_op_name: "BitwiseXor"
-  endpoint {
-    name: "BitwiseXor"
-  }
-  summary: "Elementwise computes the bitwise XOR of `x` and `y`."
-  description: <<END
-The result will have those bits set, that are different in `x` and `y`. The
-computation is performed on the underlying representations of `x` and `y`.
-END
-}
-op {
-  graph_op_name: "BroadcastArgs"
-  endpoint {
-    name: "BroadcastArgs"
-  }
-  summary: "Return the shape of s0 op s1 with broadcast."
-  description: <<END
-Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-END
-}
-op {
-  graph_op_name: "BroadcastGradientArgs"
-  endpoint {
-    name: "BroadcastGradientArgs"
-  }
-  summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
-  description: <<END
-This is typically used by gradient computations for a broadcasting operation.
-END
-}
-op {
-  graph_op_name: "Bucketize"
-  endpoint {
-    name: "Bucketize"
-  }
-  summary: "Bucketizes \'input\' based on \'boundaries\'."
-  description: <<END
-For example, if the inputs are
-    boundaries = [0, 10, 100]
-    input = [[-5, 10000]
-             [150,   10]
-             [5,    100]]
-
-then the output will be
-    output = [[0, 3]
-              [3, 2]
-              [1, 3]]
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Barrier.pbtxt b/tensorflow/core/api_def/base_api/api_def_Barrier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3422ebf2f6d607addd90b4bb9705fa44656ae79c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Barrier.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "Barrier"
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the barrier.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. Each shape must be 1 in the
+first dimension. The length of this attr must be the same as the length of
+component_types.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The capacity of the barrier.  The default capacity is MAX_INT32,
+which is the largest capacity of the underlying queue.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this barrier is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this barrier will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "Defines a barrier that persists across different graph executions."
+  description: <<END
+A barrier represents a key-value map, where each key is a string, and
+each value is a tuple of tensors.
+
+At runtime, the barrier contains 'complete' and 'incomplete'
+elements. A complete element has defined tensors for all components of
+its value tuple, and may be accessed using BarrierTakeMany. An
+incomplete element has some undefined components in its value tuple,
+and may be updated using BarrierInsertMany.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BarrierClose.pbtxt b/tensorflow/core/api_def/base_api/api_def_BarrierClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a81235ce8a71d9c7613901ec8b71c7e380b74894
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BarrierClose.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "BarrierClose"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a barrier.
+END
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    description: <<END
+If true, all pending enqueue requests that are
+blocked on the barrier's queue will be canceled. InsertMany will fail, even
+if no new key is introduced.
+END
+  }
+  summary: "Closes the given barrier."
+  description: <<END
+This operation signals that no more new elements will be inserted in the
+given barrier. Subsequent InsertMany that try to introduce a new key will fail.
+Subsequent InsertMany operations that just add missing components to already
+existing elements will continue to succeed. Subsequent TakeMany operations will
+continue to succeed if sufficient completed elements remain in the barrier.
+Subsequent TakeMany operations that would block will fail immediately.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BarrierIncompleteSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BarrierIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61f41da77ffec7489987f1d4de1169800bb5117d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BarrierIncompleteSize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "BarrierIncompleteSize"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a barrier.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+The number of incomplete elements (i.e. those with some of their value
+components not set) in the barrier.
+END
+  }
+  summary: "Computes the number of incomplete elements in the given barrier."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BarrierInsertMany.pbtxt b/tensorflow/core/api_def/base_api/api_def_BarrierInsertMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..645e1eee083ab15f7ee73a2576729d3fa2fcadfe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BarrierInsertMany.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "BarrierInsertMany"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a barrier.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+A one-dimensional tensor of keys, with length n.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+An any-dimensional tensor of values, which are associated with the
+respective keys. The 0th dimension must have length n.
+END
+  }
+  attr {
+    name: "component_index"
+    description: <<END
+The component of the barrier elements that is being assigned.
+END
+  }
+  summary: "For each key, assigns the respective value to the specified component."
+  description: <<END
+If a key is not found in the barrier, this operation will create a new
+incomplete element. If a key is found in the barrier, and the element
+already has a value at component_index, this operation will fail with
+INVALID_ARGUMENT, and leave the barrier in an undefined state.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BarrierReadySize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BarrierReadySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38e92d348302ec6c7cfb92eeb4713b7136667f2a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BarrierReadySize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "BarrierReadySize"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a barrier.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+The number of complete elements (i.e. those with all of their value
+components set) in the barrier.
+END
+  }
+  summary: "Computes the number of complete elements in the given barrier."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BarrierTakeMany.pbtxt b/tensorflow/core/api_def/base_api/api_def_BarrierTakeMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..584ce7536b8af67c09949a72ef97f4cb93b6d143
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BarrierTakeMany.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "BarrierTakeMany"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a barrier.
+END
+  }
+  in_arg {
+    name: "num_elements"
+    description: <<END
+A single-element tensor containing the number of elements to
+take.
+END
+  }
+  out_arg {
+    name: "indices"
+    description: <<END
+A one-dimensional tensor of indices, with length num_elems.
+These indices refer to the batch in which the values were placed into the
+barrier (starting with MIN_LONG and increasing with each BarrierInsertMany).
+END
+  }
+  out_arg {
+    name: "keys"
+    description: <<END
+A one-dimensional tensor of keys, with length num_elements.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+One any-dimensional tensor per component in a barrier element. All
+values have length num_elements in the 0th dimension.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "allow_small_batch"
+    description: <<END
+Allow to return less than num_elements items if barrier is
+already closed.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is empty, this operation will block for up to
+timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Takes the given number of completed elements from a barrier."
+  description: <<END
+This operation concatenates completed-element component tensors along
+the 0th dimension to make a single component tensor.
+
+Elements come out of the barrier when they are complete, and in the order
+in which they were placed into the barrier.  The indices output provides
+information about the batch in which each element was originally inserted
+into the barrier.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchCholesky.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchCholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..758ed3c6d398158952aa751efb60ce4976b97c0f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchCholesky.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchCholesky"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchCholeskyGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchCholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9099433f0bedfe551ae6587c67f21659713b4211
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchCholeskyGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchCholeskyGrad"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..639d962874d083472e6df13550e107026fd2d0a1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "BatchDataset"
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch.
+END
+  }
+  summary: "Creates a dataset that batches `batch_size` elements from `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ef542cc8b1367715361eec98fe727d9fa9f1a0f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchFFT.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchFFT"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ce0612aaf2063ae07f9070160b7ec4a1518e20e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchFFT2D.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchFFT2D"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5834e0337fa3faa7a1de89547cae8570bcfabe73
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchFFT3D.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchFFT3D"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchIFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchIFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..931365f0a485ddb34a03c1c8497f7fc308daf920
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchIFFT.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchIFFT"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchIFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchIFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af0bf6246164281026900525f04aa5c60bbad8b8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchIFFT2D.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchIFFT2D"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchIFFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchIFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f051e1f5e0a7101e7c3ea518a58f0cdb93030436
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchIFFT3D.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchIFFT3D"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7999598aff2155b2b4620ce79120d60fad7aa63b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatMul.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "BatchMatMul"
+  in_arg {
+    name: "x"
+    description: <<END
+2-D or higher with shape `[..., r_x, c_x]`.
+END
+  }
+  in_arg {
+    name: "y"
+    description: <<END
+2-D or higher with shape `[..., r_y, c_y]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+3-D or higher with shape `[..., r_o, c_o]`
+END
+  }
+  attr {
+    name: "adj_x"
+    description: <<END
+If `True`, adjoint the slices of `x`. Defaults to `False`.
+END
+  }
+  attr {
+    name: "adj_y"
+    description: <<END
+If `True`, adjoint the slices of `y`. Defaults to `False`.
+END
+  }
+  summary: "Multiplies slices of two tensors in batches."
+  description: <<END
+Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+viewed as an element of a batch), and arranges the individual results
+in a single output tensor of the same batch size. Each of the
+individual slices can optionally be adjointed (to adjoint a matrix
+means to transpose and conjugate it) before multiplication by setting
+the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
+
+The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+and `[..., r_y, c_y]`.
+
+The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+
+    r_o = c_x if adj_x else r_x
+    c_o = r_y if adj_y else c_y
+
+It is computed as:
+
+    output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixBandPart.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..592a95a14e80ec247b0842d8793ac54d164d7fe9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixBandPart.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixBandPart"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixDeterminant.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9f1c5a897ca005f2edd658b24394048294a1fd7d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixDeterminant.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixDeterminant"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixDiag.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7ed5cca2b3f50319b4b70d10ed960fcbcbbb79b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixDiag.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixDiag"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixDiagPart.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e96bb9c57f772dbf26f18d0f85f2e1a2f2b9e74d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixDiagPart.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixDiagPart"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixInverse.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41d4305f5f4a2e638f042ca56e18d99b76a24c19
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixInverse.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixInverse"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixSetDiag.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b11edf2ba10658619e466c9bf2ba9a4f677c277e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixSetDiag.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixSetDiag"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6012ea4a224eee8af98306ddf80bb99b82a1034c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixSolve.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixSolve"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixSolveLs.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fd6e055c4ec41d31556891fdef38b583adefb12
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixSolveLs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixSolveLs"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchMatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchMatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22fcb4a02fe86007be02a5ef79093ef3e60d599f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchMatrixTriangularSolve.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchMatrixTriangularSolve"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2943f5f009f9268d13b9542ec4e4ba9cd5275a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalization"
+  in_arg {
+    name: "t"
+    description: <<END
+A 4D input Tensor.
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+A 1D mean Tensor with size matching the last dimension of t.
+This is the first output from tf.nn.moments,
+or a saved moving average thereof.
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+A 1D variance Tensor with size matching the last dimension of t.
+This is the second output from tf.nn.moments,
+or a saved moving average thereof.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+A 1D beta Tensor with size matching the last dimension of t.
+An offset to be added to the normalized tensor.
+END
+  }
+  in_arg {
+    name: "gamma"
+    description: <<END
+A 1D gamma Tensor with size matching the last dimension of t.
+If "scale_after_normalization" is true, this tensor will be multiplied
+with the normalized tensor.
+END
+  }
+  attr {
+    name: "variance_epsilon"
+    description: <<END
+A small float number to avoid dividing by 0.
+END
+  }
+  attr {
+    name: "scale_after_normalization"
+    description: <<END
+A bool indicating whether the resulted tensor
+needs to be multiplied with gamma.
+END
+  }
+  summary: "Batch normalization."
+  description: <<END
+This op is deprecated. Prefer `tf.nn.batch_normalization`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a702e303f3d71c55a4eb1ab708bb41969c643980
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
@@ -0,0 +1,86 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalizationGrad"
+  in_arg {
+    name: "t"
+    description: <<END
+A 4D input Tensor.
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+A 1D mean Tensor with size matching the last dimension of t.
+This is the first output from tf.nn.moments,
+or a saved moving average thereof.
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+A 1D variance Tensor with size matching the last dimension of t.
+This is the second output from tf.nn.moments,
+or a saved moving average thereof.
+END
+  }
+  in_arg {
+    name: "gamma"
+    description: <<END
+A 1D gamma Tensor with size matching the last dimension of t.
+If "scale_after_normalization" is true, this Tensor will be multiplied
+with the normalized Tensor.
+END
+  }
+  in_arg {
+    name: "backprop"
+    description: <<END
+4D backprop Tensor.
+END
+  }
+  out_arg {
+    name: "dx"
+    description: <<END
+4D backprop tensor for input.
+END
+  }
+  out_arg {
+    name: "dm"
+    description: <<END
+1D backprop tensor for mean.
+END
+  }
+  out_arg {
+    name: "dv"
+    description: <<END
+1D backprop tensor for variance.
+END
+  }
+  out_arg {
+    name: "db"
+    description: <<END
+1D backprop tensor for beta.
+END
+  }
+  out_arg {
+    name: "dg"
+    description: <<END
+1D backprop tensor for gamma.
+END
+  }
+  attr {
+    name: "variance_epsilon"
+    description: <<END
+A small float number to avoid dividing by 0.
+END
+  }
+  attr {
+    name: "scale_after_normalization"
+    description: <<END
+A bool indicating whether the resulted tensor
+needs to be multiplied with gamma.
+END
+  }
+  summary: "Gradients for batch normalization."
+  description: <<END
+This op is deprecated. See `tf.nn.batch_normalization`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchSelfAdjointEig.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchSelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8fd3ee3b6b456e5a1278edb81db80b60dc814154
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchSelfAdjointEig.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchSelfAdjointEig"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchSelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchSelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b025ab0486c29aff43c591ce9d99d43849b1c34
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchSelfAdjointEigV2.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchSelfAdjointEigV2"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchSvd.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchSvd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e5a51b58fd1134141caa180a0a34151467877e8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchSvd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchSvd"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchToSpace.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee9a5a01a4b725478e92e65b243624e2b7c7f960
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchToSpace.pbtxt
@@ -0,0 +1,104 @@
+op {
+  graph_op_name: "BatchToSpace"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D tensor with shape
+`[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+  depth]`. Note that the batch size of the input tensor must be divisible by
+`block_size * block_size`.
+END
+  }
+  in_arg {
+    name: "crops"
+    description: <<END
+2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+how many elements to crop from the intermediate result across the spatial
+dimensions as follows:
+
+    crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape `[batch, height, width, depth]`, where:
+
+      height = height_pad - crop_top - crop_bottom
+      width = width_pad - crop_left - crop_right
+
+The attr `block_size` must be greater than one. It indicates the block size.
+
+Some examples:
+
+(1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+
+```
+[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+```
+
+The output tensor has shape `[1, 2, 2, 1]` and value:
+
+```
+x = [[[[1], [2]], [[3], [4]]]]
+```
+
+(2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+
+```
+[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+```
+
+The output tensor has shape `[1, 2, 2, 3]` and value:
+
+```
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+(3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+
+```
+x = [[[[1], [3]], [[9], [11]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+
+The output tensor has shape `[1, 4, 4, 1]` and value:
+
+```
+x = [[[1],   [2],  [3],  [4]],
+     [[5],   [6],  [7],  [8]],
+     [[9],  [10], [11],  [12]],
+     [[13], [14], [15],  [16]]]
+```
+
+(4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+
+```
+x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+```
+
+The output tensor has shape `[2, 2, 4, 1]` and value:
+
+```
+x = [[[[1], [3]], [[5], [7]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+END
+  }
+  summary: "BatchToSpace for 4-D tensors of type T."
+  description: <<END
+This is a legacy version of the more general BatchToSpaceND.
+
+Rearranges (permutes) data from batch into blocks of spatial data, followed by
+cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+this op outputs a copy of the input tensor where values from the `batch`
+dimension are moved in spatial blocks to the `height` and `width` dimensions,
+followed by cropping along the `height` and `width` dimensions.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchToSpaceND.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchToSpaceND.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e25f9995e7c008ecc000e10213ece67d6f25172
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchToSpaceND.pbtxt
@@ -0,0 +1,139 @@
+op {
+  graph_op_name: "BatchToSpaceND"
+  in_arg {
+    name: "input"
+    description: <<END
+N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+where spatial_shape has M dimensions.
+END
+  }
+  in_arg {
+    name: "block_shape"
+    description: <<END
+1-D with shape `[M]`, all values must be >= 1.
+END
+  }
+  in_arg {
+    name: "crops"
+    description: <<END
+2-D with shape `[M, 2]`, all values must be >= 0.
+  `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+  dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+  required that
+  `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+
+This operation is equivalent to the following steps:
+
+1. Reshape `input` to `reshaped` of shape:
+     [block_shape[0], ..., block_shape[M-1],
+      batch / prod(block_shape),
+      input_shape[1], ..., input_shape[N-1]]
+
+2. Permute dimensions of `reshaped` to produce `permuted` of shape
+     [batch / prod(block_shape),
+
+      input_shape[1], block_shape[0],
+      ...,
+      input_shape[M], block_shape[M-1],
+
+      input_shape[M+1], ..., input_shape[N-1]]
+
+3. Reshape `permuted` to produce `reshaped_permuted` of shape
+     [batch / prod(block_shape),
+
+      input_shape[1] * block_shape[0],
+      ...,
+      input_shape[M] * block_shape[M-1],
+
+      input_shape[M+1],
+      ...,
+      input_shape[N-1]]
+
+4. Crop the start and end of dimensions `[1, ..., M]` of
+   `reshaped_permuted` according to `crops` to produce the output of shape:
+     [batch / prod(block_shape),
+
+      input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+      ...,
+      input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+
+      input_shape[M+1], ..., input_shape[N-1]]
+
+Some examples:
+
+(1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [0, 0]]`:
+
+```
+[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+```
+
+The output tensor has shape `[1, 2, 2, 1]` and value:
+
+```
+x = [[[[1], [2]], [[3], [4]]]]
+```
+
+(2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [0, 0]]`:
+
+```
+[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+```
+
+The output tensor has shape `[1, 2, 2, 3]` and value:
+
+```
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+(3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [0, 0]]`:
+
+```
+x = [[[[1], [3]], [[9], [11]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+
+The output tensor has shape `[1, 4, 4, 1]` and value:
+
+```
+x = [[[1],   [2],  [3],  [4]],
+     [[5],   [6],  [7],  [8]],
+     [[9],  [10], [11],  [12]],
+     [[13], [14], [15],  [16]]]
+```
+
+(4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+    `crops = [[0, 0], [2, 0]]`:
+
+```
+x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+     [[[0], [2], [4]]], [[[0], [10], [12]]],
+     [[[0], [5], [7]]], [[[0], [13], [15]]],
+     [[[0], [6], [8]]], [[[0], [14], [16]]]]
+```
+
+The output tensor has shape `[2, 2, 4, 1]` and value:
+
+```
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]]],
+     [[[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+END
+  }
+  summary: "BatchToSpace for N-D tensors of type T."
+  description: <<END
+This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+`block_shape + [batch]`, interleaves these blocks back into the grid defined by
+the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+the input.  The spatial dimensions of this intermediate result are then
+optionally cropped according to `crops` to produce the output.  This is the
+reverse of SpaceToBatch.  See below for a precise description.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Betainc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Betainc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d7df75122c7893175d1efe39fd439dbe642e2c3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Betainc.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "Betainc"
+  summary: "Compute the regularized incomplete beta integral \\\\(I_x(a, b)\\\\)."
+  description: <<END
+The regularized incomplete beta integral is defined as:
+
+
+\\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+
+where
+
+
+\\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+
+
+is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+beta function.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BiasAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_BiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58266e74a2e335472d4fb4fe36d0d9e0c16649a4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BiasAdd.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "BiasAdd"
+  in_arg {
+    name: "value"
+    description: <<END
+Any number of dimensions.
+END
+  }
+  in_arg {
+    name: "bias"
+    description: <<END
+1-D with size the last dimension of `value`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Broadcasted sum of `value` and `bias`.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the bias tensor will be added to the last dimension
+of the value tensor.
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+The tensor will be added to "in_channels", the third-to-the-last
+    dimension.
+END
+  }
+  summary: "Adds `bias` to `value`."
+  description: <<END
+This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+Broadcasting is supported, so `value` may have any number of dimensions.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BiasAddGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_BiasAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f2adf1a35737271ea4fcd28707bf3cf7bf249a8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BiasAddGrad.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "BiasAddGrad"
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+Any number of dimensions.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D with size the feature dimension of `out_backprop`.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the bias tensor will be added to the last dimension
+of the value tensor.
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+The tensor will be added to "in_channels", the third-to-the-last
+    dimension.
+END
+  }
+  summary: "The backward operation for \"BiasAdd\" on the \"bias\" tensor."
+  description: <<END
+It accumulates all the values from out_backprop into the feature dimension.
+For NHWC data format, the feature dimension is the last. For NCHW data format,
+the feature dimension is the third-to-last.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BiasAddV1.pbtxt b/tensorflow/core/api_def/base_api/api_def_BiasAddV1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9799682bf27b84baa525698c90abc5b69341e184
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BiasAddV1.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "BiasAddV1"
+  visibility: SKIP
+  in_arg {
+    name: "value"
+    description: <<END
+Any number of dimensions.
+END
+  }
+  in_arg {
+    name: "bias"
+    description: <<END
+1-D with size the last dimension of `value`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Broadcasted sum of `value` and `bias`.
+END
+  }
+  summary: "Adds `bias` to `value`."
+  description: <<END
+This is a deprecated version of BiasAdd and will be soon removed.
+
+This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+Broadcasting is supported, so `value` may have any number of dimensions.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Bincount.pbtxt b/tensorflow/core/api_def/base_api/api_def_Bincount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1016f2ff6716994fc7a047f2641dce48e48c7367
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Bincount.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "Bincount"
+  in_arg {
+    name: "arr"
+    description: <<END
+int32 `Tensor`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+non-negative int32 scalar `Tensor`.
+END
+  }
+  in_arg {
+    name: "weights"
+    description: <<END
+is an int32, int64, float32, or float64 `Tensor` with the same
+shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+equal to 1.
+END
+  }
+  out_arg {
+    name: "bins"
+    description: <<END
+1D `Tensor` with length equal to `size`. The counts or summed weights for
+each value in the range [0, size).
+END
+  }
+  summary: "Counts the number of occurrences of each value in an integer array."
+  description: <<END
+Outputs a vector with length `size` and the same dtype as `weights`. If
+`weights` are empty, then index `i` stores the number of times the value `i` is
+counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+the value in `weights` at each index where the corresponding value in `arr` is
+`i`.
+
+Values in `arr` outside of the range [0, size) are ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt b/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4d4f9ea08e0da2f22af5d07ae34c16c84d05eee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Bitcast.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "Bitcast"
+  summary: "Bitcasts a tensor from one type to another without copying data."
+  description: <<END
+Given a tensor `input`, this operation returns a tensor that has the same buffer
+data as `input` with datatype `type`.
+
+If the input datatype `T` is larger than the output datatype `type` then the
+shape changes from [...] to [..., sizeof(`T`)/sizeof(`type`)].
+
+If `T` is smaller than `type`, the operator requires that the rightmost
+dimension be equal to sizeof(`type`)/sizeof(`T`). The shape then goes from
+[..., sizeof(`type`)/sizeof(`T`)] to [...].
+
+*NOTE*: Bitcast is implemented as a low-level cast, so machines with different
+endian orderings will give different results.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BitwiseAnd.pbtxt b/tensorflow/core/api_def/base_api/api_def_BitwiseAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44d34ce9ec2aa52357254ebef80b52a88a4e0eb7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BitwiseAnd.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "BitwiseAnd"
+  summary: "Elementwise computes the bitwise AND of `x` and `y`."
+  description: <<END
+The result will have those bits set, that are set in both `x` and `y`. The
+computation is performed on the underlying representations of `x` and `y`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BitwiseOr.pbtxt b/tensorflow/core/api_def/base_api/api_def_BitwiseOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9c8feb40d54877ae44dd445e293ece44a3e532d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BitwiseOr.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "BitwiseOr"
+  summary: "Elementwise computes the bitwise OR of `x` and `y`."
+  description: <<END
+The result will have those bits set, that are set in `x`, `y` or both. The
+computation is performed on the underlying representations of `x` and `y`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BitwiseXor.pbtxt b/tensorflow/core/api_def/base_api/api_def_BitwiseXor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22be3d134ab31fd91876641fb0909ddb982cccc3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BitwiseXor.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "BitwiseXor"
+  summary: "Elementwise computes the bitwise XOR of `x` and `y`."
+  description: <<END
+The result will have those bits set, that are different in `x` and `y`. The
+computation is performed on the underlying representations of `x` and `y`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastArgs.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c8564e2184cdf68c50e47078a7f9ecf7b7eb398
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastArgs.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "BroadcastArgs"
+  endpoint {
+    name: "BroadcastDynamicShape"
+  }
+  summary: "Return the shape of s0 op s1 with broadcast."
+  description: <<END
+Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BroadcastGradientArgs.pbtxt b/tensorflow/core/api_def/base_api/api_def_BroadcastGradientArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6e4516a2690f146ce417c36dec903beaff8d7a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BroadcastGradientArgs.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "BroadcastGradientArgs"
+  visibility: HIDDEN
+  summary: "Return the reduction indices for computing gradients of s0 op s1 with broadcast."
+  description: <<END
+This is typically used by gradient computations for a broadcasting operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Bucketize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Bucketize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b464af95302704339f0742ff061df103d8578990
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Bucketize.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "Bucketize"
+  in_arg {
+    name: "input"
+    description: <<END
+Any shape of Tensor contains with int or float type.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Same shape with 'input', each value of input replaced with bucket index.
+
+@compatibility(numpy)
+Equivalent to np.digitize.
+@end_compatibility
+END
+  }
+  attr {
+    name: "boundaries"
+    description: <<END
+A sorted list of floats gives the boundary of the buckets.
+END
+  }
+  summary: "Bucketizes \'input\' based on \'boundaries\'."
+  description: <<END
+For example, if the inputs are
+    boundaries = [0, 10, 100]
+    input = [[-5, 10000]
+             [150,   10]
+             [5,    100]]
+
+then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73df11b2f75f82fad174fb7e77eccbef35c2c7d1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BytesProducedStatsDataset"
+  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_C.pbtxt b/tensorflow/core/api_def/base_api/api_def_C.pbtxt
deleted file mode 100644
index 48b04b79710b70d541812400586208a26dc41124..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_C.pbtxt
+++ /dev/null
@@ -1,513 +0,0 @@
-op {
-  graph_op_name: "CTCBeamSearchDecoder"
-  endpoint {
-    name: "CTCBeamSearchDecoder"
-  }
-  summary: "Performs beam search decoding on the logits given in input."
-  description: <<END
-A note about the attribute merge_repeated: For the beam search decoder,
-this means that if consecutive entries in a beam are the same, only
-the first of these is emitted.  That is, when the top path is "A B B B B",
-"A B" is returned if merge_repeated = True but "A B B B B" is
-returned if merge_repeated = False.
-END
-}
-op {
-  graph_op_name: "CTCGreedyDecoder"
-  endpoint {
-    name: "CTCGreedyDecoder"
-  }
-  summary: "Performs greedy decoding on the logits given in inputs."
-  description: <<END
-A note about the attribute merge_repeated: if enabled, when
-consecutive logits' maximum indices are the same, only the first of
-these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-becomes "A B B" if merge_repeated = True and "A B B B B" if
-merge_repeated = False.
-
-Regardless of the value of merge_repeated, if the maximum index of a given
-time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-element is emitted.
-END
-}
-op {
-  graph_op_name: "CTCLoss"
-  endpoint {
-    name: "CTCLoss"
-  }
-  summary: "Calculates the CTC Loss (log probability) for each batch entry.  Also calculates"
-  description: <<END
-the gradient.  This class performs the softmax operation for you, so inputs
-should be e.g. linear projections of outputs by an LSTM.
-END
-}
-op {
-  graph_op_name: "CacheDataset"
-  endpoint {
-    name: "CacheDataset"
-  }
-  summary: "Creates a dataset that caches elements from `input_dataset`."
-  description: <<END
-A CacheDataset will iterate over the input_dataset, and store tensors. If the
-cache already exists, the cache will be used. If the cache is inappropriate
-(e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-will the returned when used.
-END
-}
-op {
-  graph_op_name: "Cast"
-  endpoint {
-    name: "Cast"
-  }
-  summary: "Cast x of type SrcT to y of DstT."
-}
-op {
-  graph_op_name: "Ceil"
-  endpoint {
-    name: "Ceil"
-  }
-  summary: "Returns element-wise smallest integer in not less than x."
-}
-op {
-  graph_op_name: "CheckNumerics"
-  endpoint {
-    name: "CheckNumerics"
-  }
-  summary: "Checks a tensor for NaN and Inf values."
-  description: <<END
-When run, reports an `InvalidArgument` error if `tensor` has any values
-that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
-END
-}
-op {
-  graph_op_name: "Cholesky"
-  endpoint {
-    name: "Cholesky"
-  }
-  summary: "Computes the Cholesky decomposition of one or more square matrices."
-  description: <<END
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices.
-
-The input has to be symmetric and positive definite. Only the lower-triangular
-part of the input will be used for this operation. The upper-triangular part
-will not be read.
-
-The output is a tensor of the same shape as the input
-containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
-
-**Note**: The gradient computation on GPU is faster for large matrices but
-not for large batch dimensions when the submatrices are small. In this
-case it might be faster to use the CPU.
-END
-}
-op {
-  graph_op_name: "CholeskyGrad"
-  endpoint {
-    name: "CholeskyGrad"
-  }
-  summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm."
-  description: <<END
-For an explanation see "Differentiation of the Cholesky algorithm" by
-Iain Murray http://arxiv.org/abs/1602.07527.
-END
-}
-op {
-  graph_op_name: "CompareAndBitpack"
-  endpoint {
-    name: "CompareAndBitpack"
-  }
-  summary: "Compare values of `input` to `threshold` and pack resulting bits into a `uint8`."
-  description: <<END
-Each comparison returns a boolean `true` (if `input_value > threshold`)
-or and `false` otherwise.
-
-This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-algorithms that use hashing approximations of cosine and `L2` distances;
-codes can be generated from an input via:
-
-```python
-codebook_size = 50
-codebook_bits = codebook_size * 32
-codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-                           dtype=x.dtype,
-                           initializer=tf.orthogonal_initializer())
-codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-# now codes has shape x.shape[:-1] + [codebook_size]
-```
-
-**NOTE**: Currently, the innermost dimension of the tensor must be divisible
-by 8.
-
-Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
-END
-}
-op {
-  graph_op_name: "Complex"
-  endpoint {
-    name: "Complex"
-  }
-  summary: "Converts two real numbers to a complex number."
-  description: <<END
-Given a tensor `real` representing the real part of a complex number, and a
-tensor `imag` representing the imaginary part of a complex number, this
-operation returns complex numbers elementwise of the form \\(a + bj\\), where
-*a* represents the `real` part and *b* represents the `imag` part.
-
-The input tensors `real` and `imag` must have the same shape.
-
-For example:
-
-```
-# tensor 'real' is [2.25, 3.25]
-# tensor `imag` is [4.75, 5.75]
-tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-```
-END
-}
-op {
-  graph_op_name: "ComplexAbs"
-  endpoint {
-    name: "ComplexAbs"
-  }
-  summary: "Computes the complex absolute value of a tensor."
-  description: <<END
-Given a tensor `x` of complex numbers, this operation returns a tensor of type
-`float` or `double` that is the absolute value of each element in `x`. All
-elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-value is computed as \\( \sqrt{a^2 + b^2}\\).
-END
-}
-op {
-  graph_op_name: "ComputeAccidentalHits"
-  endpoint {
-    name: "ComputeAccidentalHits"
-  }
-  summary: "Computes the ids of the positions in sampled_candidates that match true_labels."
-  description: <<END
-When doing log-odds NCE, the result of this op should be passed through a
-SparseToDense op, then added to the logits of the sampled candidates. This has
-the effect of 'removing' the sampled labels that match the true labels by
-making the classifier sure that they are sampled labels.
-END
-}
-op {
-  graph_op_name: "Concat"
-  endpoint {
-    name: "Concat"
-  }
-  summary: "Concatenates tensors along one dimension."
-}
-op {
-  graph_op_name: "ConcatOffset"
-  endpoint {
-    name: "ConcatOffset"
-  }
-  summary: "Computes offsets of concat inputs within its output."
-  description: <<END
-For example:
-
-```
-# 'x' is [2, 2, 7]
-# 'y' is [2, 3, 7]
-# 'z' is [2, 5, 7]
-concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-```
-
-This is typically used by gradient computations for a concat operation.
-END
-}
-op {
-  graph_op_name: "ConcatV2"
-  endpoint {
-    name: "ConcatV2"
-  }
-  summary: "Concatenates tensors along one dimension."
-}
-op {
-  graph_op_name: "ConcatenateDataset"
-  endpoint {
-    name: "ConcatenateDataset"
-  }
-  summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
-}
-op {
-  graph_op_name: "ConditionalAccumulator"
-  endpoint {
-    name: "ConditionalAccumulator"
-  }
-  summary: "A conditional accumulator for aggregating gradients."
-  description: <<END
-The accumulator accepts gradients marked with local_step greater or
-equal to the most recent global_step known to the accumulator. The
-average can be extracted from the accumulator, provided sufficient
-gradients have been accumulated. Extracting the average automatically
-resets the aggregate to 0, and increments the global_step recorded by
-the accumulator.
-END
-}
-op {
-  graph_op_name: "Conj"
-  endpoint {
-    name: "Conj"
-  }
-  summary: "Returns the complex conjugate of a complex number."
-  description: <<END
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-complex numbers that are the complex conjugate of each element in `input`. The
-complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-real part and *b* is the imaginary part.
-
-The complex conjugate returned by this operation is of the form \\(a - bj\\).
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-```
-END
-}
-op {
-  graph_op_name: "Const"
-  endpoint {
-    name: "Const"
-  }
-  summary: "Returns a constant tensor."
-}
-op {
-  graph_op_name: "ControlTrigger"
-  endpoint {
-    name: "ControlTrigger"
-  }
-  summary: "Does nothing. Serves as a control trigger for scheduling."
-  description: <<END
-Only useful as a placeholder for control edges.
-END
-}
-op {
-  graph_op_name: "Conv2D"
-  endpoint {
-    name: "Conv2D"
-  }
-  summary: "Computes a 2-D convolution given 4-D `input` and `filter` tensors."
-  description: <<END
-Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-and a filter / kernel tensor of shape
-`[filter_height, filter_width, in_channels, out_channels]`, this op
-performs the following:
-
-1. Flattens the filter to a 2-D matrix with shape
-   `[filter_height * filter_width * in_channels, output_channels]`.
-2. Extracts image patches from the input tensor to form a *virtual*
-   tensor of shape `[batch, out_height, out_width,
-   filter_height * filter_width * in_channels]`.
-3. For each patch, right-multiplies the filter matrix and the image patch
-   vector.
-
-In detail, with the default NHWC format,
-
-    output[b, i, j, k] =
-        sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-                        filter[di, dj, q, k]
-
-Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-END
-}
-op {
-  graph_op_name: "Conv2DBackpropFilter"
-  endpoint {
-    name: "Conv2DBackpropFilter"
-  }
-  summary: "Computes the gradients of convolution with respect to the filter."
-}
-op {
-  graph_op_name: "Conv2DBackpropInput"
-  endpoint {
-    name: "Conv2DBackpropInput"
-  }
-  summary: "Computes the gradients of convolution with respect to the input."
-}
-op {
-  graph_op_name: "Conv3D"
-  endpoint {
-    name: "Conv3D"
-  }
-  summary: "Computes a 3-D convolution given 5-D `input` and `filter` tensors."
-  description: <<END
-In signal processing, cross-correlation is a measure of similarity of
-two waveforms as a function of a time-lag applied to one of them. This
-is also known as a sliding dot product or sliding inner-product.
-
-Our Conv3D implements a form of cross-correlation.
-END
-}
-op {
-  graph_op_name: "Conv3DBackpropFilter"
-  endpoint {
-    name: "Conv3DBackpropFilter"
-  }
-  summary: "Computes the gradients of 3-D convolution with respect to the filter."
-}
-op {
-  graph_op_name: "Conv3DBackpropFilterV2"
-  endpoint {
-    name: "Conv3DBackpropFilterV2"
-  }
-  summary: "Computes the gradients of 3-D convolution with respect to the filter."
-}
-op {
-  graph_op_name: "Conv3DBackpropInput"
-  endpoint {
-    name: "Conv3DBackpropInput"
-  }
-  summary: "Computes the gradients of 3-D convolution with respect to the input."
-}
-op {
-  graph_op_name: "Conv3DBackpropInputV2"
-  endpoint {
-    name: "Conv3DBackpropInputV2"
-  }
-  summary: "Computes the gradients of 3-D convolution with respect to the input."
-}
-op {
-  graph_op_name: "Cos"
-  endpoint {
-    name: "Cos"
-  }
-  summary: "Computes cos of x element-wise."
-}
-op {
-  graph_op_name: "Cosh"
-  endpoint {
-    name: "Cosh"
-  }
-  summary: "Computes hyperbolic cosine of x element-wise."
-}
-op {
-  graph_op_name: "CountUpTo"
-  endpoint {
-    name: "CountUpTo"
-  }
-  summary: "Increments \'ref\' until it reaches \'limit\'."
-}
-op {
-  graph_op_name: "CropAndResize"
-  endpoint {
-    name: "CropAndResize"
-  }
-  summary: "Extracts crops from the input image tensor and bilinearly resizes them (possibly"
-  description: <<END
-with aspect ratio change) to a common output size specified by `crop_size`. This
-is more general than the `crop_to_bounding_box` op which extracts a fixed size
-slice from the input image and does not allow resizing or aspect ratio change.
-
-Returns a tensor with `crops` from the input `image` at positions defined at the
-bounding box locations in `boxes`. The cropped boxes are all resized (with
-bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`.
-END
-}
-op {
-  graph_op_name: "CropAndResizeGradBoxes"
-  endpoint {
-    name: "CropAndResizeGradBoxes"
-  }
-  summary: "Computes the gradient of the crop_and_resize op wrt the input boxes tensor."
-}
-op {
-  graph_op_name: "CropAndResizeGradImage"
-  endpoint {
-    name: "CropAndResizeGradImage"
-  }
-  summary: "Computes the gradient of the crop_and_resize op wrt the input image tensor."
-}
-op {
-  graph_op_name: "Cross"
-  endpoint {
-    name: "Cross"
-  }
-  summary: "Compute the pairwise cross product."
-  description: <<END
-`a` and `b` must be the same shape; they can either be simple 3-element vectors,
-or any shape where the innermost dimension is 3. In the latter case, each pair
-of corresponding 3-element vectors is cross-multiplied independently.
-END
-}
-op {
-  graph_op_name: "Cumprod"
-  endpoint {
-    name: "Cumprod"
-  }
-  summary: "Compute the cumulative product of the tensor `x` along `axis`."
-  description: <<END
-By default, this op performs an inclusive cumprod, which means that the first
-element of the input is identical to the first element of the output:
-
-```python
-tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-```
-
-By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-performed instead:
-
-```python
-tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-```
-
-By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-opposite direction:
-
-```python
-tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-```
-
-This is more efficient than using separate `tf.reverse` ops.
-
-The `reverse` and `exclusive` kwargs can also be combined:
-
-```python
-tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-```
-END
-}
-op {
-  graph_op_name: "Cumsum"
-  endpoint {
-    name: "Cumsum"
-  }
-  summary: "Compute the cumulative sum of the tensor `x` along `axis`."
-  description: <<END
-By default, this op performs an inclusive cumsum, which means that the first
-element of the input is identical to the first element of the output:
-
-```python
-tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-```
-
-By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-performed instead:
-
-```python
-tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-```
-
-By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-opposite direction:
-
-```python
-tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-```
-
-This is more efficient than using separate `tf.reverse` ops.
-
-The `reverse` and `exclusive` kwargs can also be combined:
-
-```python
-tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-```
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_CTCBeamSearchDecoder.pbtxt b/tensorflow/core/api_def/base_api/api_def_CTCBeamSearchDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36eb2fb7b44642b0bbf77afcc5aa3ec1f6692e03
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CTCBeamSearchDecoder.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "CTCBeamSearchDecoder"
+  in_arg {
+    name: "inputs"
+    description: <<END
+3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+END
+  }
+  in_arg {
+    name: "sequence_length"
+    description: <<END
+A vector containing sequence lengths, size `(batch)`.
+END
+  }
+  out_arg {
+    name: "decoded_indices"
+    description: <<END
+A list (length: top_paths) of indices matrices.  Matrix j,
+size `(total_decoded_outputs[j] x 2)`, has indices of a
+`SparseTensor<int64, 2>`.  The rows store: [batch, time].
+END
+  }
+  out_arg {
+    name: "decoded_values"
+    description: <<END
+A list (length: top_paths) of values vectors.  Vector j,
+size `(length total_decoded_outputs[j])`, has the values of a
+`SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.
+END
+  }
+  out_arg {
+    name: "decoded_shape"
+    description: <<END
+A list (length: top_paths) of shape vector.  Vector j,
+size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+Its values are: `[batch_size, max_decoded_length[j]]`.
+END
+  }
+  out_arg {
+    name: "log_probability"
+    description: <<END
+A matrix, shaped: `(batch_size x top_paths)`.  The
+sequence log-probabilities.
+END
+  }
+  attr {
+    name: "beam_width"
+    description: <<END
+A scalar >= 0 (beam search beam width).
+END
+  }
+  attr {
+    name: "top_paths"
+    description: <<END
+A scalar >= 0, <= beam_width (controls output size).
+END
+  }
+  attr {
+    name: "merge_repeated"
+    description: <<END
+If true, merge repeated classes in output.
+END
+  }
+  summary: "Performs beam search decoding on the logits given in input."
+  description: <<END
+A note about the attribute merge_repeated: For the beam search decoder,
+this means that if consecutive entries in a beam are the same, only
+the first of these is emitted.  That is, when the top path is "A B B B B",
+"A B" is returned if merge_repeated = True but "A B B B B" is
+returned if merge_repeated = False.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CTCGreedyDecoder.pbtxt b/tensorflow/core/api_def/base_api/api_def_CTCGreedyDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..814f5350a2ec21731a08a607e7cc12090377285d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CTCGreedyDecoder.pbtxt
@@ -0,0 +1,61 @@
+op {
+  graph_op_name: "CTCGreedyDecoder"
+  in_arg {
+    name: "inputs"
+    description: <<END
+3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+END
+  }
+  in_arg {
+    name: "sequence_length"
+    description: <<END
+A vector containing sequence lengths, size `(batch_size)`.
+END
+  }
+  out_arg {
+    name: "decoded_indices"
+    description: <<END
+Indices matrix, size `(total_decoded_outputs x 2)`,
+of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].
+END
+  }
+  out_arg {
+    name: "decoded_values"
+    description: <<END
+Values vector, size: `(total_decoded_outputs)`,
+of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.
+END
+  }
+  out_arg {
+    name: "decoded_shape"
+    description: <<END
+Shape vector, size `(2)`, of the decoded SparseTensor.
+Values are: `[batch_size, max_decoded_length]`.
+END
+  }
+  out_arg {
+    name: "log_probability"
+    description: <<END
+Matrix, size `(batch_size x 1)`, containing sequence
+log-probabilities.
+END
+  }
+  attr {
+    name: "merge_repeated"
+    description: <<END
+If True, merge repeated classes in output.
+END
+  }
+  summary: "Performs greedy decoding on the logits given in inputs."
+  description: <<END
+A note about the attribute merge_repeated: if enabled, when
+consecutive logits' maximum indices are the same, only the first of
+these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
+becomes "A B B" if merge_repeated = True and "A B B B B" if
+merge_repeated = False.
+
+Regardless of the value of merge_repeated, if the maximum index of a given
+time and batch corresponds to the blank, index `(num_classes - 1)`, no new
+element is emitted.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CTCLoss.pbtxt b/tensorflow/core/api_def/base_api/api_def_CTCLoss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a85597ae6ec2cc7bce550b54aa10fc6ec04d5dcb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CTCLoss.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "CTCLoss"
+  in_arg {
+    name: "inputs"
+    description: <<END
+3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+END
+  }
+  in_arg {
+    name: "labels_indices"
+    description: <<END
+The indices of a `SparseTensor<int32, 2>`.
+`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+`(batch b, time t)`.
+END
+  }
+  in_arg {
+    name: "labels_values"
+    description: <<END
+The values (labels) associated with the given batch and time.
+END
+  }
+  in_arg {
+    name: "sequence_length"
+    description: <<END
+A vector containing sequence lengths (batch).
+END
+  }
+  out_arg {
+    name: "loss"
+    description: <<END
+A vector (batch) containing log-probabilities.
+END
+  }
+  out_arg {
+    name: "gradient"
+    description: <<END
+The gradient of `loss`.  3-D, shape:
+`(max_time x batch_size x num_classes)`.
+END
+  }
+  attr {
+    name: "preprocess_collapse_repeated"
+    description: <<END
+Scalar, if true then repeated labels are
+collapsed prior to the CTC calculation.
+END
+  }
+  attr {
+    name: "ctc_merge_repeated"
+    description: <<END
+Scalar.  If set to false, *during* CTC calculation
+repeated non-blank labels will not be merged and are interpreted as
+individual labels.  This is a simplified version of CTC.
+END
+  }
+  attr {
+    name: "ignore_longer_outputs_than_inputs"
+    description: <<END
+Scalar. If set to true, during CTC
+calculation, items that have longer output sequences than input sequences
+are skipped: they don't contribute to the loss term and have zero-gradient.
+END
+  }
+  summary: "Calculates the CTC Loss (log probability) for each batch entry.  Also calculates"
+  description: <<END
+the gradient.  This class performs the softmax operation for you, so inputs
+should be e.g. linear projections of outputs by an LSTM.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6889b8ea148b57da847964c062bd52b1027b8d22
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "CacheDataset"
+  in_arg {
+    name: "filename"
+    description: <<END
+A path on the filesystem where we should cache the dataset. Note: this
+will be a directory.
+END
+  }
+  summary: "Creates a dataset that caches elements from `input_dataset`."
+  description: <<END
+A CacheDataset will iterate over the input_dataset, and store tensors. If the
+cache already exists, the cache will be used. If the cache is inappropriate
+(e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+will the returned when used.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cast.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a0ba505cbaf6bf37be8a1dcb17928117ac89891
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cast.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Cast"
+  summary: "Cast x of type SrcT to y of DstT."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt b/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad1ada8d717a51ee3a058da5d32ed7bf50375b13
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Ceil.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Ceil"
+  summary: "Returns element-wise smallest integer in not less than x."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CheckNumerics.pbtxt b/tensorflow/core/api_def/base_api/api_def_CheckNumerics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cadf3667e2bc487d83911734281346eefdf88252
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CheckNumerics.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "CheckNumerics"
+  attr {
+    name: "message"
+    description: <<END
+Prefix of the error message.
+END
+  }
+  summary: "Checks a tensor for NaN and Inf values."
+  description: <<END
+When run, reports an `InvalidArgument` error if `tensor` has any values
+that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cholesky.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..713abee6309e73bda6b85b1742885c9768a66134
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cholesky.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "Cholesky"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  summary: "Computes the Cholesky decomposition of one or more square matrices."
+  description: <<END
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices.
+
+The input has to be symmetric and positive definite. Only the lower-triangular
+part of the input will be used for this operation. The upper-triangular part
+will not be read.
+
+The output is a tensor of the same shape as the input
+containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+
+**Note**: The gradient computation on GPU is faster for large matrices but
+not for large batch dimensions when the submatrices are small. In this
+case it might be faster to use the CPU.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CholeskyGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_CholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..faf5e274b340a40d58cac3315b2b3b9a0b29a0b9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CholeskyGrad.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "CholeskyGrad"
+  in_arg {
+    name: "l"
+    description: <<END
+Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+Algorithm depends only on lower triangular part of the innermost matrices of
+this tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+df/dl where f is some scalar function. Shape is `[..., M, M]`.
+Algorithm depends only on lower triangular part of the innermost matrices of
+this tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Symmetrized version of df/dA . Shape is `[..., M, M]`
+END
+  }
+  summary: "Computes the reverse mode backpropagated gradient of the Cholesky algorithm."
+  description: <<END
+For an explanation see "Differentiation of the Cholesky algorithm" by
+Iain Murray http://arxiv.org/abs/1602.07527.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CompareAndBitpack.pbtxt b/tensorflow/core/api_def/base_api/api_def_CompareAndBitpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..57ba4f8f4c49d4cddf3681fd1cc79a0faad74259
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CompareAndBitpack.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "CompareAndBitpack"
+  in_arg {
+    name: "input"
+    description: <<END
+Values to compare against `threshold` and bitpack.
+END
+  }
+  in_arg {
+    name: "threshold"
+    description: <<END
+Threshold to compare against.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The bitpacked comparisons.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of the input and threshold.
+END
+  }
+  summary: "Compare values of `input` to `threshold` and pack resulting bits into a `uint8`."
+  description: <<END
+Each comparison returns a boolean `true` (if `input_value > threshold`)
+or and `false` otherwise.
+
+This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+algorithms that use hashing approximations of cosine and `L2` distances;
+codes can be generated from an input via:
+
+```python
+codebook_size = 50
+codebook_bits = codebook_size * 32
+codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+                           dtype=x.dtype,
+                           initializer=tf.orthogonal_initializer())
+codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+# now codes has shape x.shape[:-1] + [codebook_size]
+```
+
+**NOTE**: Currently, the innermost dimension of the tensor must be divisible
+by 8.
+
+Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Complex.pbtxt b/tensorflow/core/api_def/base_api/api_def_Complex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e421d8ce0bd64b4b3a6db09393e7d5c4955e373a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Complex.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "Complex"
+  summary: "Converts two real numbers to a complex number."
+  description: <<END
+Given a tensor `real` representing the real part of a complex number, and a
+tensor `imag` representing the imaginary part of a complex number, this
+operation returns complex numbers elementwise of the form \\(a + bj\\), where
+*a* represents the `real` part and *b* represents the `imag` part.
+
+The input tensors `real` and `imag` must have the same shape.
+
+For example:
+
+```
+# tensor 'real' is [2.25, 3.25]
+# tensor `imag` is [4.75, 5.75]
+tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt b/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19088f5dfcde375236bf46d364a9e0a41211eab3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ComplexAbs.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "ComplexAbs"
+  summary: "Computes the complex absolute value of a tensor."
+  description: <<END
+Given a tensor `x` of complex numbers, this operation returns a tensor of type
+`float` or `double` that is the absolute value of each element in `x`. All
+elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+value is computed as \\( \sqrt{a^2 + b^2}\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ComputeAccidentalHits.pbtxt b/tensorflow/core/api_def/base_api/api_def_ComputeAccidentalHits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cf1e8054205dc75d2b9ace36b26dee33ea4651f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ComputeAccidentalHits.pbtxt
@@ -0,0 +1,62 @@
+op {
+  graph_op_name: "ComputeAccidentalHits"
+  in_arg {
+    name: "true_classes"
+    description: <<END
+The true_classes output of UnpackSparseLabels.
+END
+  }
+  in_arg {
+    name: "sampled_candidates"
+    description: <<END
+The sampled_candidates output of CandidateSampler.
+END
+  }
+  out_arg {
+    name: "indices"
+    description: <<END
+A vector of indices corresponding to rows of true_candidates.
+END
+  }
+  out_arg {
+    name: "ids"
+    description: <<END
+A vector of IDs of positions in sampled_candidates that match a true_label
+for the row with the corresponding index in indices.
+END
+  }
+  out_arg {
+    name: "weights"
+    description: <<END
+A vector of the same length as indices and ids, in which each element
+is -FLOAT_MAX.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Computes the ids of the positions in sampled_candidates that match true_labels."
+  description: <<END
+When doing log-odds NCE, the result of this op should be passed through a
+SparseToDense op, then added to the logits of the sampled candidates. This has
+the effect of 'removing' the sampled labels that match the true labels by
+making the classifier sure that they are sampled labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Concat.pbtxt b/tensorflow/core/api_def/base_api/api_def_Concat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bad600e5bfb579fd960c8ac3a61e7754dc78b48
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Concat.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "Concat"
+  visibility: SKIP
+  in_arg {
+    name: "concat_dim"
+    description: <<END
+0-D.  The dimension along which to concatenate.  Must be in the
+range [0, rank(values)).
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+The `N` Tensors to concatenate. Their ranks and types must match,
+and their sizes must match in all dimensions except `concat_dim`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A `Tensor` with the concatenation of values stacked along the
+`concat_dim` dimension.  This tensor's shape matches that of `values` except
+in `concat_dim` where it has the sum of the sizes.
+END
+  }
+  summary: "Concatenates tensors along one dimension."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConcatOffset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConcatOffset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84b11715ceedfe5d7959793813a07cca806ffcad
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConcatOffset.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ConcatOffset"
+  visibility: SKIP
+  in_arg {
+    name: "concat_dim"
+    description: <<END
+The dimension along which to concatenate.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The `N` int32 vectors representing shape of tensors being concatenated.
+END
+  }
+  out_arg {
+    name: "offset"
+    description: <<END
+The `N` int32 vectors representing the starting offset
+of input tensors within the concatenated output.
+END
+  }
+  summary: "Computes offsets of concat inputs within its output."
+  description: <<END
+For example:
+
+```
+# 'x' is [2, 2, 7]
+# 'y' is [2, 3, 7]
+# 'z' is [2, 5, 7]
+concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+```
+
+This is typically used by gradient computations for a concat operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConcatV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1a7a81c73882b23b47d6c93fe147a39e89c49da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConcatV2.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "ConcatV2"
+  endpoint {
+    name: "Concat"
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+List of `N` Tensors to concatenate. Their ranks and types must match,
+and their sizes must match in all dimensions except `concat_dim`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+0-D.  The dimension along which to concatenate.  Must be in the
+range [-rank(values), rank(values)).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A `Tensor` with the concatenation of values stacked along the
+`concat_dim` dimension.  This tensor's shape matches that of `values` except
+in `concat_dim` where it has the sum of the sizes.
+END
+  }
+  summary: "Concatenates tensors along one dimension."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67281f9547ac6bb9df5b19e9f31da891454993bd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConcatenateDataset"
+  summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConditionalAccumulator.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64672e0e58737545d021f45371f8d4d443d3a860
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConditionalAccumulator.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "ConditionalAccumulator"
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the accumulator.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the value being accumulated.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the values, can be [], in which case shape is unknown.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this accumulator is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this accumulator will be shared under the
+given name across multiple sessions.
+END
+  }
+  summary: "A conditional accumulator for aggregating gradients."
+  description: <<END
+The accumulator accepts gradients marked with local_step greater or
+equal to the most recent global_step known to the accumulator. The
+average can be extracted from the accumulator, provided sufficient
+gradients have been accumulated. Extracting the average automatically
+resets the aggregate to 0, and increments the global_step recorded by
+the accumulator.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conj.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conj.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e161dc5b154616abfbce218163fa1442bc29f9ff
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conj.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "Conj"
+  summary: "Returns the complex conjugate of a complex number."
+  description: <<END
+Given a tensor `input` of complex numbers, this operation returns a tensor of
+complex numbers that are the complex conjugate of each element in `input`. The
+complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+real part and *b* is the imaginary part.
+
+The complex conjugate returned by this operation is of the form \\(a - bj\\).
+
+For example:
+
+```
+# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConjugateTranspose.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConjugateTranspose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..508c7a8bff5c1fe7e7641163b370a728a0ba6878
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConjugateTranspose.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "ConjugateTranspose"
+  summary: "Shuffle dimensions of x according to a permutation and conjugate the result."
+  description: <<END
+The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+  `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Const.pbtxt b/tensorflow/core/api_def/base_api/api_def_Const.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d9e909f89a4de4665aa6a7dcd749b422ddc4498
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Const.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "Const"
+  attr {
+    name: "value"
+    description: <<END
+Attr `value` is the tensor to return.
+END
+  }
+  summary: "Returns a constant tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ControlTrigger.pbtxt b/tensorflow/core/api_def/base_api/api_def_ControlTrigger.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9902e3a784b1982f5db73271b20b16a4eaa63ce3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ControlTrigger.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "ControlTrigger"
+  summary: "Does nothing. Serves as a control trigger for scheduling."
+  description: <<END
+Only useful as a placeholder for control edges.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..070d6adb978e4a62e7209f299dba08515aa21e83
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
@@ -0,0 +1,82 @@
+op {
+  graph_op_name: "Conv2D"
+  in_arg {
+    name: "input"
+    description: <<END
+A 4-D tensor. The dimension order is interpreted according to the value
+of `data_format`, see below for details.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+A 4-D tensor of shape
+`[filter_height, filter_width, in_channels, out_channels]`
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A 4-D tensor. The dimension order is determined by the value of
+`data_format`, see below for details.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 4.  The stride of the sliding window for each
+dimension of `input`. The dimension order is determined by the value of
+`data_format`, see below for details.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, height, width, channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
+END
+  }
+  summary: "Computes a 2-D convolution given 4-D `input` and `filter` tensors."
+  description: <<END
+Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+and a filter / kernel tensor of shape
+`[filter_height, filter_width, in_channels, out_channels]`, this op
+performs the following:
+
+1. Flattens the filter to a 2-D matrix with shape
+   `[filter_height * filter_width * in_channels, output_channels]`.
+2. Extracts image patches from the input tensor to form a *virtual*
+   tensor of shape `[batch, out_height, out_width,
+   filter_height * filter_width * in_channels]`.
+3. For each patch, right-multiplies the filter matrix and the image patch
+   vector.
+
+In detail, with the default NHWC format,
+
+    output[b, i, j, k] =
+        sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+                        filter[di, dj, q, k]
+
+Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff2d9d71db646a27a88763f79bb6beb6b5ede44b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "Conv2DBackpropFilter"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, in_channels]`.
+END
+  }
+  in_arg {
+    name: "filter_sizes"
+    description: <<END
+An integer vector representing the tensor shape of `filter`,
+where `filter` is a 4-D
+`[filter_height, filter_width, in_channels, out_channels]` tensor.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape `[batch, out_height, out_width, out_channels]`.
+Gradients w.r.t. the output of the convolution.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape
+`[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+the `filter` input of the convolution.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+of the convolution. Must be in the same order as the dimension specified with
+format.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
+END
+  }
+  summary: "Computes the gradients of convolution with respect to the filter."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2de38b4263a380b5d0aec45270b9b67347c7021d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "Conv2DBackpropInput"
+  in_arg {
+    name: "input_sizes"
+    description: <<END
+An integer vector representing the shape of `input`,
+where `input` is a 4-D `[batch, height, width, channels]` tensor.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+4-D with shape
+`[filter_height, filter_width, in_channels, out_channels]`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape `[batch, out_height, out_width, out_channels]`.
+Gradients w.r.t. the output of the convolution.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+w.r.t. the input of the convolution.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+of the convolution. Must be in the same order as the dimension specified with
+format.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
+END
+  }
+  summary: "Computes the gradients of convolution with respect to the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d26564097e976013fbb7f026c6a403cf6bd808e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3D.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "Conv3D"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+Shape `[filter_depth, filter_height, filter_width, in_channels,
+out_channels]`. `in_channels` must match between `input` and `filter`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
+END
+  }
+  summary: "Computes a 3-D convolution given 5-D `input` and `filter` tensors."
+  description: <<END
+In signal processing, cross-correlation is a measure of similarity of
+two waveforms as a function of a time-lag applied to one of them. This
+is also known as a sliding dot product or sliding inner-product.
+
+Our Conv3D implements a form of cross-correlation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3da4a878651a1e88e6c149c2e620089be489d1b3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilter.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "Conv3DBackpropFilter"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape `[batch, depth, rows, cols, in_channels]`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+Shape `[depth, rows, cols, in_channels, out_channels]`.
+`in_channels` must match between `input` and `filter`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+out_channels]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes the gradients of 3-D convolution with respect to the filter."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..937c9c8eadaaeceaadc180ad44f35a12ba9a2dfb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "Conv3DBackpropFilterV2"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape `[batch, depth, rows, cols, in_channels]`.
+END
+  }
+  in_arg {
+    name: "filter_sizes"
+    description: <<END
+An integer vector representing the tensor shape of `filter`,
+where `filter` is a 5-D
+`[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+tensor.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+out_channels]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
+END
+  }
+  summary: "Computes the gradients of 3-D convolution with respect to the filter."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c40a9a91a50c30d17d6ec22c3229bd88c5fe8535
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInput.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "Conv3DBackpropInput"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape `[batch, depth, rows, cols, in_channels]`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+Shape `[depth, rows, cols, in_channels, out_channels]`.
+`in_channels` must match between `input` and `filter`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+out_channels]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes the gradients of 3-D convolution with respect to the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..414e418dc5a91e55f22dc5eec93d16fabad3d8fb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Conv3DBackpropInputV2.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "Conv3DBackpropInputV2"
+  in_arg {
+    name: "input_sizes"
+    description: <<END
+An integer vector representing the tensor shape of `input`,
+where `input` is a 5-D
+`[batch, depth, rows, cols, in_channels]` tensor.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+Shape `[depth, rows, cols, in_channels, out_channels]`.
+`in_channels` must match between `input` and `filter`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+out_channels]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 5.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
+END
+  }
+  summary: "Computes the gradients of 3-D convolution with respect to the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cos.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cos.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43fb75836f2b68388275e166f52c9e8a65bcbc17
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cos.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Cos"
+  summary: "Computes cos of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cosh.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aaeb4ccbd50468ef2e7bac6a5a10098c5539a7a6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cosh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Cosh"
+  summary: "Computes hyperbolic cosine of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CountUpTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_CountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7b5e2901ab6af6fe71dfdf4f43d0dbffe1b4e1c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CountUpTo.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "CountUpTo"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a scalar `Variable` node.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A copy of the input before increment. If nothing else modifies the
+input, the values produced will all be distinct.
+END
+  }
+  attr {
+    name: "limit"
+    description: <<END
+If incrementing ref would bring it above limit, instead generates an
+'OutOfRange' error.
+END
+  }
+  summary: "Increments \'ref\' until it reaches \'limit\'."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..629f575d0a25ceb97dedb4e94f84e3204450da6a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CropAndResize.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "CropAndResize"
+  in_arg {
+    name: "image"
+    description: <<END
+A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+Both `image_height` and `image_width` need to be positive.
+END
+  }
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+specifies the coordinates of a box in the `box_ind[i]` image and is specified
+in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+`y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+`[0, 1]` interval of normalized image height is mapped to
+`[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+which case the sampled crop is an up-down flipped version of the original
+image. The width dimension is treated similarly. Normalized coordinates
+outside the `[0, 1]` range are allowed, in which case we use
+`extrapolation_value` to extrapolate the input image values.
+END
+  }
+  in_arg {
+    name: "box_ind"
+    description: <<END
+A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+END
+  }
+  in_arg {
+    name: "crop_size"
+    description: <<END
+A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+cropped image patches are resized to this size. The aspect ratio of the image
+content is not preserved. Both `crop_height` and `crop_width` need to be
+positive.
+END
+  }
+  out_arg {
+    name: "crops"
+    description: <<END
+A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+END
+  }
+  attr {
+    name: "method"
+    description: <<END
+A string specifying the interpolation method. Only 'bilinear' is
+supported for now.
+END
+  }
+  attr {
+    name: "extrapolation_value"
+    description: <<END
+Value used for extrapolation, when applicable.
+END
+  }
+  summary: "Extracts crops from the input image tensor and bilinearly resizes them (possibly"
+  description: <<END
+with aspect ratio change) to a common output size specified by `crop_size`. This
+is more general than the `crop_to_bounding_box` op which extracts a fixed size
+slice from the input image and does not allow resizing or aspect ratio change.
+
+Returns a tensor with `crops` from the input `image` at positions defined at the
+bounding box locations in `boxes`. The cropped boxes are all resized (with
+bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
+resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
+method will give identical results to using `tf.image.resize_bilinear()`
+with `align_corners=True`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CropAndResizeGradBoxes.pbtxt b/tensorflow/core/api_def/base_api/api_def_CropAndResizeGradBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c03b233efc9f3c428d0e1ac4b9399f176b351b08
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CropAndResizeGradBoxes.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "CropAndResizeGradBoxes"
+  in_arg {
+    name: "grads"
+    description: <<END
+A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+END
+  }
+  in_arg {
+    name: "image"
+    description: <<END
+A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+Both `image_height` and `image_width` need to be positive.
+END
+  }
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+specifies the coordinates of a box in the `box_ind[i]` image and is specified
+in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+`y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+`[0, 1]` interval of normalized image height is mapped to
+`[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+which case the sampled crop is an up-down flipped version of the original
+image. The width dimension is treated similarly. Normalized coordinates
+outside the `[0, 1]` range are allowed, in which case we use
+`extrapolation_value` to extrapolate the input image values.
+END
+  }
+  in_arg {
+    name: "box_ind"
+    description: <<END
+A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A 2-D tensor of shape `[num_boxes, 4]`.
+END
+  }
+  attr {
+    name: "method"
+    description: <<END
+A string specifying the interpolation method. Only 'bilinear' is
+supported for now.
+END
+  }
+  summary: "Computes the gradient of the crop_and_resize op wrt the input boxes tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CropAndResizeGradImage.pbtxt b/tensorflow/core/api_def/base_api/api_def_CropAndResizeGradImage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51fb810007356a33f14b9131dedd08916a6d90fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CropAndResizeGradImage.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "CropAndResizeGradImage"
+  in_arg {
+    name: "grads"
+    description: <<END
+A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+END
+  }
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+specifies the coordinates of a box in the `box_ind[i]` image and is specified
+in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+`y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+`[0, 1]` interval of normalized image height is mapped to
+`[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+which case the sampled crop is an up-down flipped version of the original
+image. The width dimension is treated similarly. Normalized coordinates
+outside the `[0, 1]` range are allowed, in which case we use
+`extrapolation_value` to extrapolate the input image values.
+END
+  }
+  in_arg {
+    name: "box_ind"
+    description: <<END
+A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+END
+  }
+  in_arg {
+    name: "image_size"
+    description: <<END
+A 1-D tensor with value `[batch, image_height, image_width, depth]`
+containing the original image size. Both `image_height` and `image_width` need
+to be positive.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+END
+  }
+  attr {
+    name: "method"
+    description: <<END
+A string specifying the interpolation method. Only 'bilinear' is
+supported for now.
+END
+  }
+  summary: "Computes the gradient of the crop_and_resize op wrt the input image tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cross.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26c12e459bb81125370eb320de208e4fac66fe73
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cross.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "Cross"
+  in_arg {
+    name: "a"
+    description: <<END
+A tensor containing 3-element vectors.
+END
+  }
+  in_arg {
+    name: "b"
+    description: <<END
+Another tensor, of same type and shape as `a`.
+END
+  }
+  out_arg {
+    name: "product"
+    description: <<END
+Pairwise cross product of the vectors in `a` and `b`.
+END
+  }
+  summary: "Compute the pairwise cross product."
+  description: <<END
+`a` and `b` must be the same shape; they can either be simple 3-element vectors,
+or any shape where the innermost dimension is 3. In the latter case, each pair
+of corresponding 3-element vectors is cross-multiplied independently.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cumprod.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cumprod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96e599365aef7a4226c9ec20a6d819fffa91c4e3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cumprod.pbtxt
@@ -0,0 +1,61 @@
+op {
+  graph_op_name: "Cumprod"
+  in_arg {
+    name: "x"
+    description: <<END
+A `Tensor`. Must be one of the following types: `float32`, `float64`,
+`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+`complex128`, `qint8`, `quint8`, `qint32`, `half`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+A `Tensor` of type `int32` (default: 0). Must be in the range
+`[-rank(x), rank(x))`.
+END
+  }
+  attr {
+    name: "exclusive"
+    description: <<END
+If `True`, perform exclusive cumprod.
+END
+  }
+  attr {
+    name: "reverse"
+    description: <<END
+A `bool` (default: False).
+END
+  }
+  summary: "Compute the cumulative product of the tensor `x` along `axis`."
+  description: <<END
+By default, this op performs an inclusive cumprod, which means that the first
+element of the input is identical to the first element of the output:
+
+```python
+tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+```
+
+By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+performed instead:
+
+```python
+tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+```
+
+By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+opposite direction:
+
+```python
+tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+```
+
+This is more efficient than using separate `tf.reverse` ops.
+
+The `reverse` and `exclusive` kwargs can also be combined:
+
+```python
+tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Cumsum.pbtxt b/tensorflow/core/api_def/base_api/api_def_Cumsum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6267f0dfa22b905b7dbc4b6b7b76fa410ee61a68
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Cumsum.pbtxt
@@ -0,0 +1,61 @@
+op {
+  graph_op_name: "Cumsum"
+  in_arg {
+    name: "x"
+    description: <<END
+A `Tensor`. Must be one of the following types: `float32`, `float64`,
+`int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+`complex128`, `qint8`, `quint8`, `qint32`, `half`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+A `Tensor` of type `int32` (default: 0). Must be in the range
+`[-rank(x), rank(x))`.
+END
+  }
+  attr {
+    name: "exclusive"
+    description: <<END
+If `True`, perform exclusive cumsum.
+END
+  }
+  attr {
+    name: "reverse"
+    description: <<END
+A `bool` (default: False).
+END
+  }
+  summary: "Compute the cumulative sum of the tensor `x` along `axis`."
+  description: <<END
+By default, this op performs an inclusive cumsum, which means that the first
+element of the input is identical to the first element of the output:
+
+```python
+tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+```
+
+By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+performed instead:
+
+```python
+tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+```
+
+By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+opposite direction:
+
+```python
+tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+```
+
+This is more efficient than using separate `tf.reverse` ops.
+
+The `reverse` and `exclusive` kwargs can also be combined:
+
+```python
+tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_D.pbtxt b/tensorflow/core/api_def/base_api/api_def_D.pbtxt
deleted file mode 100644
index ff8a7223c7223f5c4b72ffc7154b7fc77d8eeb06..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_D.pbtxt
+++ /dev/null
@@ -1,790 +0,0 @@
-op {
-  graph_op_name: "DebugGradientIdentity"
-  endpoint {
-    name: "DebugGradientIdentity"
-  }
-  summary: "Identity op for gradient debugging."
-  description: <<END
-This op is hidden from public in Python. It is used by TensorFlow Debugger to
-register gradient tensors for gradient debugging.
-END
-}
-op {
-  graph_op_name: "DecodeAndCropJpeg"
-  endpoint {
-    name: "DecodeAndCropJpeg"
-  }
-  summary: "Decode and Crop a JPEG-encoded image to a uint8 tensor."
-  description: <<END
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the JPEG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-
-If needed, the JPEG-encoded image is transformed to match the requested number
-of color channels.
-
-The attr `ratio` allows downscaling the image by an integer factor during
-decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-downscaling the image later.
-
-
-It is equivalent to a combination of decode and crop, but much faster by only
-decoding partial jpeg image.
-END
-}
-op {
-  graph_op_name: "DecodeBase64"
-  endpoint {
-    name: "DecodeBase64"
-  }
-  summary: "Decode web-safe base64-encoded strings."
-  description: <<END
-Input may or may not have padding at the end. See EncodeBase64 for padding.
-Web-safe means that input must use - and _ instead of + and /.
-END
-}
-op {
-  graph_op_name: "DecodeBmp"
-  endpoint {
-    name: "DecodeBmp"
-  }
-  summary: "Decode the first frame of a BMP-encoded image to a uint8 tensor."
-  description: <<END
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the BMP-encoded image.
-*   3: output an RGB image.
-*   4: output an RGBA image.
-END
-}
-op {
-  graph_op_name: "DecodeCSV"
-  endpoint {
-    name: "DecodeCSV"
-  }
-  summary: "Convert CSV records to tensors. Each column maps to one tensor."
-  description: <<END
-RFC 4180 format is expected for the CSV records.
-(https://tools.ietf.org/html/rfc4180)
-Note that we allow leading and trailing spaces with int or float field.
-END
-}
-op {
-  graph_op_name: "DecodeGif"
-  endpoint {
-    name: "DecodeGif"
-  }
-  summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
-  description: <<END
-GIF with frame or transparency compression are not supported
-convert animated GIF from compressed to uncompressed by:
-
-    convert $src.gif -coalesce $dst.gif
-
-This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-`tf.image.decode_image`.
-END
-}
-op {
-  graph_op_name: "DecodeJSONExample"
-  endpoint {
-    name: "DecodeJSONExample"
-  }
-  summary: "Convert JSON-encoded Example records to binary protocol buffer strings."
-  description: <<END
-This op translates a tensor containing Example records, encoded using
-the [standard JSON
-mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-into a tensor containing the same records encoded as binary protocol
-buffers. The resulting tensor can then be fed to any of the other
-Example-parsing ops.
-END
-}
-op {
-  graph_op_name: "DecodeJpeg"
-  endpoint {
-    name: "DecodeJpeg"
-  }
-  summary: "Decode a JPEG-encoded image to a uint8 tensor."
-  description: <<END
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the JPEG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-
-If needed, the JPEG-encoded image is transformed to match the requested number
-of color channels.
-
-The attr `ratio` allows downscaling the image by an integer factor during
-decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-downscaling the image later.
-
-
-This op also supports decoding PNGs and non-animated GIFs since the interface is
-the same, though it is cleaner to use `tf.image.decode_image`.
-END
-}
-op {
-  graph_op_name: "DecodePng"
-  endpoint {
-    name: "DecodePng"
-  }
-  summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
-  description: <<END
-The attr `channels` indicates the desired number of color channels for the
-decoded image.
-
-Accepted values are:
-
-*   0: Use the number of channels in the PNG-encoded image.
-*   1: output a grayscale image.
-*   3: output an RGB image.
-*   4: output an RGBA image.
-
-If needed, the PNG-encoded image is transformed to match the requested number
-of color channels.
-
-This op also supports decoding JPEGs and non-animated GIFs since the interface
-is the same, though it is cleaner to use `tf.image.decode_image`.
-END
-}
-op {
-  graph_op_name: "DecodeRaw"
-  endpoint {
-    name: "DecodeRaw"
-  }
-  summary: "Reinterpret the bytes of a string as a vector of numbers."
-}
-op {
-  graph_op_name: "DecodeWav"
-  endpoint {
-    name: "DecodeWav"
-  }
-  summary: "Decode a 16-bit PCM WAV file to a float tensor."
-  description: <<END
-The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
-
-When desired_channels is set, if the input contains fewer channels than this
-then the last channel will be duplicated to give the requested number, else if
-the input has more channels than requested then the additional channels will be
-ignored.
-
-If desired_samples is set, then the audio will be cropped or padded with zeroes
-to the requested length.
-
-The first output contains a Tensor with the content of the audio samples. The
-lowest dimension will be the number of channels, and the second will be the
-number of samples. For example, a ten-sample-long stereo WAV file should give an
-output shape of [10, 2].
-END
-}
-op {
-  graph_op_name: "DeleteSessionTensor"
-  endpoint {
-    name: "DeleteSessionTensor"
-  }
-  summary: "Delete the tensor specified by its handle in the session."
-}
-op {
-  graph_op_name: "DenseToDenseSetOperation"
-  endpoint {
-    name: "DenseToDenseSetOperation"
-  }
-  summary: "Applies set operation along last dimension of 2 `Tensor` inputs."
-  description: <<END
-See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-
-Output `result` is a `SparseTensor` represented by `result_indices`,
-`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-dimension contains the result of `set_operation` applied to the corresponding
-`[0...n-1]` dimension of `set`.
-END
-}
-op {
-  graph_op_name: "DenseToSparseBatchDataset"
-  endpoint {
-    name: "DenseToSparseBatchDataset"
-  }
-  summary: "Creates a dataset that yields a SparseTensor for each element of the input."
-}
-op {
-  graph_op_name: "DenseToSparseSetOperation"
-  endpoint {
-    name: "DenseToSparseSetOperation"
-  }
-  summary: "Applies set operation along last dimension of `Tensor` and `SparseTensor`."
-  description: <<END
-See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-
-Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-ignored.
-
-If `validate_indices` is `True`, this op validates the order and range of `set2`
-indices.
-
-Output `result` is a `SparseTensor` represented by `result_indices`,
-`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-dimension contains the result of `set_operation` applied to the corresponding
-`[0...n-1]` dimension of `set`.
-END
-}
-op {
-  graph_op_name: "DepthToSpace"
-  endpoint {
-    name: "DepthToSpace"
-  }
-  summary: "DepthToSpace for tensors of type T."
-  description: <<END
-Rearranges data from depth into blocks of spatial data.
-This is the reverse transformation of SpaceToDepth. More specifically,
-this op outputs a copy of the input tensor where values from the `depth`
-dimension are moved in spatial blocks to the `height` and `width` dimensions.
-The attr `block_size` indicates the input block size and how the data is moved.
-
-  * Chunks of data of size `block_size * block_size` from depth are rearranged
-    into non-overlapping blocks of size `block_size x block_size`
-  * The width the output tensor is `input_depth * block_size`, whereas the
-    height is `input_height * block_size`.
-  * The Y, X coordinates within each block of the output image are determined
-    by the high order component of the input channel index.
-  * The depth of the input tensor must be divisible by
-    `block_size * block_size`.
-
-The `data_format` attr specifies the layout of the input and output tensors
-with the following options:
-  "NHWC": `[ batch, height, width, channels ]`
-  "NCHW": `[ batch, channels, height, width ]`
-  "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-
-It is useful to consider the operation as transforming a 6-D Tensor.
-e.g. for data_format = NHWC,
-     Each element in the input tensor can be specified via 6 coordinates,
-     ordered by decreasing memory layout significance as:
-     n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-                        within the input image, bX, bY means coordinates
-                        within the output block, oC means output channels).
-     The output would be the input transposed to the following layout:
-     n,iY,bY,iX,bX,oC
-
-This operation is useful for resizing the activations between convolutions
-(but keeping all data), e.g. instead of pooling. It is also useful for training
-purely convolutional models.
-
-For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-block_size = 2:
-
-```
-x = [[[[1, 2, 3, 4]]]]
-
-```
-
-This operation will output a tensor of shape `[1, 2, 2, 1]`:
-
-```
-   [[[[1], [2]],
-     [[3], [4]]]]
-```
-
-Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-the corresponding output will have 2x2 elements and will have a depth of
-1 channel (1 = `4 / (block_size * block_size)`).
-The output element shape is `[2, 2, 1]`.
-
-For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
-
-```
-x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-```
-
-This operation, for block size of 2, will return the following tensor of shape
-`[1, 2, 2, 3]`
-
-```
-   [[[[1, 2, 3], [4, 5, 6]],
-     [[7, 8, 9], [10, 11, 12]]]]
-
-```
-
-Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
-
-```
-x =  [[[[1, 2, 3, 4],
-       [5, 6, 7, 8]],
-      [[9, 10, 11, 12],
-       [13, 14, 15, 16]]]]
-```
-
-the operator will return the following tensor of shape `[1 4 4 1]`:
-
-```
-x = [[[ [1],   [2],  [5],  [6]],
-      [ [3],   [4],  [7],  [8]],
-      [ [9],  [10], [13],  [14]],
-      [ [11], [12], [15],  [16]]]]
-
-```
-END
-}
-op {
-  graph_op_name: "DepthwiseConv2dNative"
-  endpoint {
-    name: "DepthwiseConv2dNative"
-  }
-  summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
-  description: <<END
-Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-and a filter / kernel tensor of shape
-`[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-a different filter to each input channel (expanding from 1 channel to
-`channel_multiplier` channels for each), then concatenates the results
-together. Thus, the output has `in_channels * channel_multiplier` channels.
-
-```
-for k in 0..in_channels-1
-  for q in 0..channel_multiplier-1
-    output[b, i, j, k * channel_multiplier + q] =
-      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-                        filter[di, dj, k, q]
-```
-
-Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-END
-}
-op {
-  graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
-  endpoint {
-    name: "DepthwiseConv2dNativeBackpropFilter"
-  }
-  summary: "Computes the gradients of depthwise convolution with respect to the filter."
-}
-op {
-  graph_op_name: "DepthwiseConv2dNativeBackpropInput"
-  endpoint {
-    name: "DepthwiseConv2dNativeBackpropInput"
-  }
-  summary: "Computes the gradients of depthwise convolution with respect to the input."
-}
-op {
-  graph_op_name: "Dequantize"
-  endpoint {
-    name: "Dequantize"
-  }
-  summary: "Dequantize the \'input\' tensor into a float Tensor."
-  description: <<END
-[min_range, max_range] are scalar floats that specify the range for
-the 'input' data. The 'mode' attribute controls exactly which calculations are
-used to convert the float values to their quantized equivalents.
-
-In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-
-```
-if T == qint8, in[i] += (range(T) + 1)/ 2.0
-out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-```
-here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-
-*MIN_COMBINED Mode Example*
-
-If the input comes from a QuantizedRelu6, the output type is
-quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-Dequantize on quint8 will take each value, cast to float, and multiply
-by 6 / 255.
-Note that if quantizedtype is qint8, the operation will additionally add
-each value by 128 prior to casting.
-
-If the mode is 'MIN_FIRST', then this approach is used:
-
-```c++
-number_of_steps = 1 << (# of bits in T)
-range_adjust = number_of_steps / (number_of_steps - 1)
-range = (range_max - range_min) * range_adjust
-range_scale = range / number_of_steps
-const double offset_input = static_cast<double>(input) - lowest_quantized;
-result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-```
-
-*SCALED mode Example*
-
-`SCALED` mode matches the quantization approach used in
-`QuantizeAndDequantize{V2|V3}`.
-
-If the mode is `SCALED`, we do not use the full range of the output type,
-choosing to elide the lowest possible value for symmetry (e.g., output range is
--127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-0.
-
-We first find the range of values in our tensor. The
-range we use is always centered on 0, so we find m such that
-```c++
-  m = max(abs(input_min), abs(input_max))
-```
-
-Our input tensor range is then `[-m, m]`.
-
-Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-If T is signed, this is
-```
-  num_bits = sizeof(T) * 8
-  [min_fixed, max_fixed] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-```
-
-Otherwise, if T is unsigned, the fixed-point range is
-```
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-```
-
-From this we compute our scaling factor, s:
-```c++
-  s = (2 * m) / (max_fixed - min_fixed)
-```
-
-Now we can dequantize the elements of our tensor:
-```c++
-result = input * s
-```
-END
-}
-op {
-  graph_op_name: "DeserializeManySparse"
-  endpoint {
-    name: "DeserializeManySparse"
-  }
-  summary: "Deserialize and concatenate `SparseTensors` from a serialized minibatch."
-  description: <<END
-The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-`N` is the minibatch size and the rows correspond to packed outputs of
-`SerializeSparse`.  The ranks of the original `SparseTensor` objects
-must all match.  When the final `SparseTensor` is created, it has rank one
-higher than the ranks of the incoming `SparseTensor` objects
-(they have been concatenated along a new row dimension).
-
-The output `SparseTensor` object's shape values for all dimensions but the
-first are the max across the input `SparseTensor` objects' shape values
-for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-size.
-
-The input `SparseTensor` objects' indices are assumed ordered in
-standard lexicographic order.  If this is not the case, after this
-step run `SparseReorder` to restore index ordering.
-
-For example, if the serialized input is a `[2 x 3]` matrix representing two
-original `SparseTensor` objects:
-
-    index = [ 0]
-            [10]
-            [20]
-    values = [1, 2, 3]
-    shape = [50]
-
-and
-
-    index = [ 2]
-            [10]
-    values = [4, 5]
-    shape = [30]
-
-then the final deserialized `SparseTensor` will be:
-
-    index = [0  0]
-            [0 10]
-            [0 20]
-            [1  2]
-            [1 10]
-    values = [1, 2, 3, 4, 5]
-    shape = [2 50]
-END
-}
-op {
-  graph_op_name: "DestroyTemporaryVariable"
-  endpoint {
-    name: "DestroyTemporaryVariable"
-  }
-  summary: "Destroys the temporary variable and returns its final value."
-  description: <<END
-Sets output to the value of the Tensor pointed to by 'ref', then destroys
-the temporary variable called 'var_name'.
-All other uses of 'ref' *must* have executed before this op.
-This is typically achieved by chaining the ref through each assign op, or by
-using control dependencies.
-
-Outputs the final value of the tensor pointed to by 'ref'.
-END
-}
-op {
-  graph_op_name: "Diag"
-  endpoint {
-    name: "Diag"
-  }
-  summary: "Returns a diagonal tensor with a given diagonal values."
-  description: <<END
-Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-everything else padded with zeros. The diagonal is computed as follows:
-
-Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-
-`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-
-For example:
-
-```
-# 'diagonal' is [1, 2, 3, 4]
-tf.diag(diagonal) ==> [[1, 0, 0, 0]
-                       [0, 2, 0, 0]
-                       [0, 0, 3, 0]
-                       [0, 0, 0, 4]]
-```
-END
-}
-op {
-  graph_op_name: "DiagPart"
-  endpoint {
-    name: "DiagPart"
-  }
-  summary: "Returns the diagonal part of the tensor."
-  description: <<END
-This operation returns a tensor with the `diagonal` part
-of the `input`. The `diagonal` part is computed as follows:
-
-Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-
-`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-
-For example:
-
-```
-# 'input' is [[1, 0, 0, 0]
-              [0, 2, 0, 0]
-              [0, 0, 3, 0]
-              [0, 0, 0, 4]]
-
-tf.diag_part(input) ==> [1, 2, 3, 4]
-```
-END
-}
-op {
-  graph_op_name: "Digamma"
-  endpoint {
-    name: "Digamma"
-  }
-  summary: "Computes Psi, the derivative of Lgamma (the log of the absolute value of"
-  description: <<END
-`Gamma(x)`), element-wise.
-END
-}
-op {
-  graph_op_name: "Dilation2D"
-  endpoint {
-    name: "Dilation2D"
-  }
-  summary: "Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors."
-  description: <<END
-The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-`filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-input channel is processed independently of the others with its own structuring
-function. The `output` tensor has shape
-`[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-tensor depend on the `padding` algorithm. We currently only support the default
-"NHWC" `data_format`.
-
-In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-(for consistency with `conv2d`, we use unmirrored filters):
-
-    output[b, y, x, c] =
-       max_{dy, dx} input[b,
-                          strides[1] * y + rates[1] * dy,
-                          strides[2] * x + rates[2] * dx,
-                          c] +
-                    filter[dy, dx, c]
-
-Max-pooling is a special case when the filter has size equal to the pooling
-kernel size and contains all zeros.
-
-Note on duality: The dilation of `input` by the `filter` is equal to the
-negation of the erosion of `-input` by the reflected `filter`.
-END
-}
-op {
-  graph_op_name: "Dilation2DBackpropFilter"
-  endpoint {
-    name: "Dilation2DBackpropFilter"
-  }
-  summary: "Computes the gradient of morphological 2-D dilation with respect to the filter."
-}
-op {
-  graph_op_name: "Dilation2DBackpropInput"
-  endpoint {
-    name: "Dilation2DBackpropInput"
-  }
-  summary: "Computes the gradient of morphological 2-D dilation with respect to the input."
-}
-op {
-  graph_op_name: "Div"
-  endpoint {
-    name: "Div"
-  }
-  summary: "Returns x / y element-wise."
-  description: <<END
-*NOTE*: `Div` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "DrawBoundingBoxes"
-  endpoint {
-    name: "DrawBoundingBoxes"
-  }
-  summary: "Draw bounding boxes on a batch of images."
-  description: <<END
-Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-boxes specified by the locations in `boxes`. The coordinates of the each
-bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-height of the underlying image.
-
-For example, if an image is 100 x 200 pixels (height x width) and the bounding
-box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
-
-Parts of the bounding box may fall outside the image.
-END
-}
-op {
-  graph_op_name: "DynamicPartition"
-  endpoint {
-    name: "DynamicPartition"
-  }
-  summary: "Partitions `data` into `num_partitions` tensors using indices from `partitions`."
-  description: <<END
-For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-are placed in `outputs[i]` in lexicographic order of `js`, and the first
-dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-In detail,
-
-```python
-    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-
-    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-```
-
-`data.shape` must start with `partitions.shape`.
-
-For example:
-
-```python
-    # Scalar partitions.
-    partitions = 1
-    num_partitions = 2
-    data = [10, 20]
-    outputs[0] = []  # Empty with shape [0, 2]
-    outputs[1] = [[10, 20]]
-
-    # Vector partitions.
-    partitions = [0, 0, 1, 1, 0]
-    num_partitions = 2
-    data = [10, 20, 30, 40, 50]
-    outputs[0] = [10, 20, 50]
-    outputs[1] = [30, 40]
-```
-
-See `dynamic_stitch` for an example on how to merge partitions back.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "DynamicStitch"
-  endpoint {
-    name: "DynamicStitch"
-  }
-  summary: "Interleave the values from the `data` tensors into a single tensor."
-  description: <<END
-Builds a merged tensor such that
-
-```python
-    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-```
-
-For example, if each `indices[m]` is scalar or vector, we have
-
-```python
-    # Scalar indices:
-    merged[indices[m], ...] = data[m][...]
-
-    # Vector indices:
-    merged[indices[m][i], ...] = data[m][i, ...]
-```
-
-Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-`constant`, the output shape is
-
-    merged.shape = [max(indices)] + constant
-
-Values are merged in order, so if an index appears in both `indices[m][i]` and
-`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-merged result. If you do not need this guarantee, ParallelDynamicStitch might
-perform better on some devices.
-
-For example:
-
-```python
-    indices[0] = 6
-    indices[1] = [4, 1]
-    indices[2] = [[5, 2], [0, 3]]
-    data[0] = [61, 62]
-    data[1] = [[41, 42], [11, 12]]
-    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-              [51, 52], [61, 62]]
-```
-
-This method can be used to merge partitions created by `dynamic_partition`
-as illustrated on the following example:
-
-```python
-    # Apply function (increments x_i) on elements for which a certain condition
-    # apply (x_i != -1 in this example).
-    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-    condition_mask=tf.not_equal(x,tf.constant(-1.))
-    partitioned_data = tf.dynamic_partition(
-        x, tf.cast(condition_mask, tf.int32) , 2)
-    partitioned_data[1] = partitioned_data[1] + 1.0
-    condition_indices = tf.dynamic_partition(
-        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-    x = tf.dynamic_stitch(condition_indices, partitioned_data)
-    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-    # unchanged.
-```
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-</div>
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..62098acd38239f0ee29198796415cd33a627a5a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DataFormatDimMap.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "DataFormatDimMap"
+  in_arg {
+    name: "x"
+    description: <<END
+Scalar. Dimension index in source data format. Must be in the range [-4, 4).
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+Scalar. Dimension index in destination data format.
+END
+  }
+  attr {
+    name: "src_format"
+    description: <<END
+source data format.
+END
+  }
+  attr {
+    name: "dst_format"
+    description: <<END
+destination data format.
+END
+  }
+  summary: "Returns the dimension index in the destination data format given the one in"
+  description: <<END
+the source data format.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2fa61aaed8a2afd989f69a1084fd899ec0ddf12
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DataFormatVecPermute.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "DataFormatVecPermute"
+  in_arg {
+    name: "x"
+    description: <<END
+Vector in source data format. Must be of size 4.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+Vector in destination data format. Must be of size 4.
+END
+  }
+  attr {
+    name: "src_format"
+    description: <<END
+source data format.
+END
+  }
+  attr {
+    name: "dst_format"
+    description: <<END
+destination data format.
+END
+  }
+  summary: "Returns the permuted vector in the destination data format given the one in"
+  description: <<END
+the source data format.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b9dffd883250fd5631444252e7b236116e2e822
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "DatasetToSingleElement"
+  in_arg {
+    name: "dataset"
+    description: <<END
+A handle to a dataset that contains a single element.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+The components of the single element of `input`.
+END
+  }
+  summary: "Outputs the single element from the given dataset."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt b/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38fd6877e9d26e7ab86a4e7f95352a4a39efb7c2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DebugGradientIdentity.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "DebugGradientIdentity"
+  summary: "Identity op for gradient debugging."
+  description: <<END
+This op is hidden from public in Python. It is used by TensorFlow Debugger to
+register gradient tensors for gradient debugging.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeAndCropJpeg.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeAndCropJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28318274f3897e152fc7320778fd3e8dbbf15ebc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeAndCropJpeg.pbtxt
@@ -0,0 +1,86 @@
+op {
+  graph_op_name: "DecodeAndCropJpeg"
+  in_arg {
+    name: "contents"
+    description: <<END
+0-D.  The JPEG-encoded image.
+END
+  }
+  in_arg {
+    name: "crop_window"
+    description: <<END
+1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+END
+  }
+  out_arg {
+    name: "image"
+    description: <<END
+3-D with shape `[height, width, channels]`..
+END
+  }
+  attr {
+    name: "channels"
+    description: <<END
+Number of color channels for the decoded image.
+END
+  }
+  attr {
+    name: "ratio"
+    description: <<END
+Downscaling ratio.
+END
+  }
+  attr {
+    name: "fancy_upscaling"
+    description: <<END
+If true use a slower but nicer upscaling of the
+chroma planes (yuv420/422 only).
+END
+  }
+  attr {
+    name: "try_recover_truncated"
+    description: <<END
+If true try to recover an image from truncated input.
+END
+  }
+  attr {
+    name: "acceptable_fraction"
+    description: <<END
+The minimum required fraction of lines before a truncated
+input is accepted.
+END
+  }
+  attr {
+    name: "dct_method"
+    description: <<END
+string specifying a hint about the algorithm used for
+decompression.  Defaults to "" which maps to a system-specific
+default.  Currently valid values are ["INTEGER_FAST",
+"INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+jpeg library changes to a version that does not have that specific
+option.)
+END
+  }
+  summary: "Decode and Crop a JPEG-encoded image to a uint8 tensor."
+  description: <<END
+The attr `channels` indicates the desired number of color channels for the
+decoded image.
+
+Accepted values are:
+
+*   0: Use the number of channels in the JPEG-encoded image.
+*   1: output a grayscale image.
+*   3: output an RGB image.
+
+If needed, the JPEG-encoded image is transformed to match the requested number
+of color channels.
+
+The attr `ratio` allows downscaling the image by an integer factor during
+decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+downscaling the image later.
+
+
+It is equivalent to a combination of decode and crop, but much faster by only
+decoding partial jpeg image.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeBase64.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeBase64.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6bae3a62d7dfdbc71ececdd6fd3bba0f059a7498
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeBase64.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "DecodeBase64"
+  in_arg {
+    name: "input"
+    description: <<END
+Base64 strings to decode.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Decoded strings.
+END
+  }
+  summary: "Decode web-safe base64-encoded strings."
+  description: <<END
+Input may or may not have padding at the end. See EncodeBase64 for padding.
+Web-safe means that input must use - and _ instead of + and /.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeBmp.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeBmp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c6918e6a0fc3e8fd7fe40ad6d1f19302b7fe650
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeBmp.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "DecodeBmp"
+  in_arg {
+    name: "contents"
+    description: <<END
+0-D.  The BMP-encoded image.
+END
+  }
+  out_arg {
+    name: "image"
+    description: <<END
+3-D with shape `[height, width, channels]`. RGB order
+END
+  }
+  summary: "Decode the first frame of a BMP-encoded image to a uint8 tensor."
+  description: <<END
+The attr `channels` indicates the desired number of color channels for the
+decoded image.
+
+Accepted values are:
+
+*   0: Use the number of channels in the BMP-encoded image.
+*   3: output an RGB image.
+*   4: output an RGBA image.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeCSV.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeCSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e39213cbc77a21788d72cf60349e2a3798a93596
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeCSV.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "DecodeCSV"
+  in_arg {
+    name: "records"
+    description: <<END
+Each string is a record/row in the csv and all records should have
+the same format.
+END
+  }
+  in_arg {
+    name: "record_defaults"
+    description: <<END
+One tensor per column of the input record, with either a
+scalar default value for that column or empty if the column is required.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Each tensor will have the same shape as records.
+END
+  }
+  attr {
+    name: "field_delim"
+    description: <<END
+char delimiter to separate fields in a record.
+END
+  }
+  attr {
+    name: "use_quote_delim"
+    description: <<END
+If false, treats double quotation marks as regular
+characters inside of the string fields (ignoring RFC 4180, Section 2,
+Bullet 5).
+END
+  }
+  attr {
+    name: "na_value"
+    description: <<END
+Additional string to recognize as NA/NaN.
+END
+  }
+  summary: "Convert CSV records to tensors. Each column maps to one tensor."
+  description: <<END
+RFC 4180 format is expected for the CSV records.
+(https://tools.ietf.org/html/rfc4180)
+Note that we allow leading and trailing spaces with int or float field.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a90b7341555cb7a306ec6f58d727da0707503f52
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeGif.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "DecodeGif"
+  in_arg {
+    name: "contents"
+    description: <<END
+0-D.  The GIF-encoded image.
+END
+  }
+  out_arg {
+    name: "image"
+    description: <<END
+4-D with shape `[num_frames, height, width, 3]`. RGB order
+END
+  }
+  summary: "Decode the first frame of a GIF-encoded image to a uint8 tensor."
+  description: <<END
+GIF with frame or transparency compression are not supported
+convert animated GIF from compressed to uncompressed by:
+
+    convert $src.gif -coalesce $dst.gif
+
+This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+`tf.image.decode_image`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeJSONExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeJSONExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdf1c5f37ddcaa0e5aecfab2cb681298072961eb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeJSONExample.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "DecodeJSONExample"
+  in_arg {
+    name: "json_examples"
+    description: <<END
+Each string is a JSON object serialized according to the JSON
+mapping of the Example proto.
+END
+  }
+  out_arg {
+    name: "binary_examples"
+    description: <<END
+Each string is a binary Example protocol buffer corresponding
+to the respective element of `json_examples`.
+END
+  }
+  summary: "Convert JSON-encoded Example records to binary protocol buffer strings."
+  description: <<END
+This op translates a tensor containing Example records, encoded using
+the [standard JSON
+mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+into a tensor containing the same records encoded as binary protocol
+buffers. The resulting tensor can then be fed to any of the other
+Example-parsing ops.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9521370d35050c37282adda774ac58f5e14effa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeJpeg.pbtxt
@@ -0,0 +1,80 @@
+op {
+  graph_op_name: "DecodeJpeg"
+  in_arg {
+    name: "contents"
+    description: <<END
+0-D.  The JPEG-encoded image.
+END
+  }
+  out_arg {
+    name: "image"
+    description: <<END
+3-D with shape `[height, width, channels]`..
+END
+  }
+  attr {
+    name: "channels"
+    description: <<END
+Number of color channels for the decoded image.
+END
+  }
+  attr {
+    name: "ratio"
+    description: <<END
+Downscaling ratio.
+END
+  }
+  attr {
+    name: "fancy_upscaling"
+    description: <<END
+If true use a slower but nicer upscaling of the
+chroma planes (yuv420/422 only).
+END
+  }
+  attr {
+    name: "try_recover_truncated"
+    description: <<END
+If true try to recover an image from truncated input.
+END
+  }
+  attr {
+    name: "acceptable_fraction"
+    description: <<END
+The minimum required fraction of lines before a truncated
+input is accepted.
+END
+  }
+  attr {
+    name: "dct_method"
+    description: <<END
+string specifying a hint about the algorithm used for
+decompression.  Defaults to "" which maps to a system-specific
+default.  Currently valid values are ["INTEGER_FAST",
+"INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+jpeg library changes to a version that does not have that specific
+option.)
+END
+  }
+  summary: "Decode a JPEG-encoded image to a uint8 tensor."
+  description: <<END
+The attr `channels` indicates the desired number of color channels for the
+decoded image.
+
+Accepted values are:
+
+*   0: Use the number of channels in the JPEG-encoded image.
+*   1: output a grayscale image.
+*   3: output an RGB image.
+
+If needed, the JPEG-encoded image is transformed to match the requested number
+of color channels.
+
+The attr `ratio` allows downscaling the image by an integer factor during
+decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+downscaling the image later.
+
+
+This op also supports decoding PNGs and non-animated GIFs since the interface is
+the same, though it is cleaner to use `tf.image.decode_image`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..63404db800968db5ba34a8a0f9509573b9d9824a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodePng.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "DecodePng"
+  in_arg {
+    name: "contents"
+    description: <<END
+0-D.  The PNG-encoded image.
+END
+  }
+  out_arg {
+    name: "image"
+    description: <<END
+3-D with shape `[height, width, channels]`.
+END
+  }
+  attr {
+    name: "channels"
+    description: <<END
+Number of color channels for the decoded image.
+END
+  }
+  summary: "Decode a PNG-encoded image to a uint8 or uint16 tensor."
+  description: <<END
+The attr `channels` indicates the desired number of color channels for the
+decoded image.
+
+Accepted values are:
+
+*   0: Use the number of channels in the PNG-encoded image.
+*   1: output a grayscale image.
+*   3: output an RGB image.
+*   4: output an RGBA image.
+
+If needed, the PNG-encoded image is transformed to match the requested number
+of color channels.
+
+This op also supports decoding JPEGs and non-animated GIFs since the interface
+is the same, though it is cleaner to use `tf.image.decode_image`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeRaw.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeRaw.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27ca061013ef3c94f17828fb17d1279c0a97ac35
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeRaw.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "DecodeRaw"
+  in_arg {
+    name: "bytes"
+    description: <<END
+All the elements must have the same length.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor with one more dimension than the input `bytes`.  The
+added dimension will have size equal to the length of the elements
+of `bytes` divided by the number of bytes to represent `out_type`.
+END
+  }
+  attr {
+    name: "little_endian"
+    description: <<END
+Whether the input `bytes` are in little-endian order.
+Ignored for `out_type` values that are stored in a single byte like
+`uint8`.
+END
+  }
+  summary: "Reinterpret the bytes of a string as a vector of numbers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeWav.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9f055e73d3e57f01a57d9f27a995c3acffc8abba
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeWav.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "DecodeWav"
+  in_arg {
+    name: "contents"
+    description: <<END
+The WAV-encoded audio, usually from a file.
+END
+  }
+  out_arg {
+    name: "audio"
+    description: <<END
+2-D with shape `[length, channels]`.
+END
+  }
+  out_arg {
+    name: "sample_rate"
+    description: <<END
+Scalar holding the sample rate found in the WAV header.
+END
+  }
+  attr {
+    name: "desired_channels"
+    description: <<END
+Number of sample channels wanted.
+END
+  }
+  attr {
+    name: "desired_samples"
+    description: <<END
+Length of audio requested.
+END
+  }
+  summary: "Decode a 16-bit PCM WAV file to a float tensor."
+  description: <<END
+The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+
+When desired_channels is set, if the input contains fewer channels than this
+then the last channel will be duplicated to give the requested number, else if
+the input has more channels than requested then the additional channels will be
+ignored.
+
+If desired_samples is set, then the audio will be cropped or padded with zeroes
+to the requested length.
+
+The first output contains a Tensor with the content of the audio samples. The
+lowest dimension will be the number of channels, and the second will be the
+number of samples. For example, a ten-sample-long stereo WAV file should give an
+output shape of [10, 2].
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeleteSessionTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeleteSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..16aaa7a802f7b790d8c6fc990869edb2d930c2a9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeleteSessionTensor.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "DeleteSessionTensor"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle for a tensor stored in the session state.
+END
+  }
+  summary: "Delete the tensor specified by its handle in the session."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToDenseSetOperation.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseToDenseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8a469de95ff8940b4f3cae30d3c350ac626dcbb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DenseToDenseSetOperation.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "DenseToDenseSetOperation"
+  in_arg {
+    name: "set1"
+    description: <<END
+`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+Dimension `n` contains values in a set, duplicates are allowed but ignored.
+END
+  }
+  in_arg {
+    name: "set2"
+    description: <<END
+`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+Dimension `n` contains values in a set, duplicates are allowed but ignored.
+END
+  }
+  out_arg {
+    name: "result_indices"
+    description: <<END
+2D indices of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "result_values"
+    description: <<END
+1D values of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "result_shape"
+    description: <<END
+1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+is the max result set size across all `0...n-1` dimensions.
+END
+  }
+  summary: "Applies set operation along last dimension of 2 `Tensor` inputs."
+  description: <<END
+See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+
+Output `result` is a `SparseTensor` represented by `result_indices`,
+`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+dimension contains the result of `set_operation` applied to the corresponding
+`[0...n-1]` dimension of `set`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f2f5594c7c16b20ef934539b96bc78d324c1542d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "DenseToSparseBatchDataset"
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A handle to an input dataset. Must have a single component.
+END
+  }
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch.
+END
+  }
+  in_arg {
+    name: "row_shape"
+    description: <<END
+A vector representing the dense shape of each row in the produced
+SparseTensor. The shape may be partially specified, using `-1` to indicate
+that a particular dimension should use the maximum size of all batch elements.
+END
+  }
+  summary: "Creates a dataset that yields a SparseTensor for each element of the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4b086637346429c541a85567bb9220877a57d24
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DenseToSparseSetOperation.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "DenseToSparseSetOperation"
+  in_arg {
+    name: "set1"
+    description: <<END
+`Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+Dimension `n` contains values in a set, duplicates are allowed but ignored.
+END
+  }
+  in_arg {
+    name: "set2_indices"
+    description: <<END
+2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+order.
+END
+  }
+  in_arg {
+    name: "set2_values"
+    description: <<END
+1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+order.
+END
+  }
+  in_arg {
+    name: "set2_shape"
+    description: <<END
+1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+max set size across `n-1` dimensions.
+END
+  }
+  out_arg {
+    name: "result_indices"
+    description: <<END
+2D indices of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "result_values"
+    description: <<END
+1D values of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "result_shape"
+    description: <<END
+1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+is the max result set size across all `0...n-1` dimensions.
+END
+  }
+  summary: "Applies set operation along last dimension of `Tensor` and `SparseTensor`."
+  description: <<END
+See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+
+Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+ignored.
+
+If `validate_indices` is `True`, this op validates the order and range of `set2`
+indices.
+
+Output `result` is a `SparseTensor` represented by `result_indices`,
+`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+dimension contains the result of `set_operation` applied to the corresponding
+`[0...n-1]` dimension of `set`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d20b47a3ed50f9a8bb65f0cd6c332d03172e6bd0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DepthToSpace.pbtxt
@@ -0,0 +1,101 @@
+op {
+  graph_op_name: "DepthToSpace"
+  attr {
+    name: "block_size"
+    description: <<END
+The size of the spatial block, same as in Space2Depth.
+END
+  }
+  summary: "DepthToSpace for tensors of type T."
+  description: <<END
+Rearranges data from depth into blocks of spatial data.
+This is the reverse transformation of SpaceToDepth. More specifically,
+this op outputs a copy of the input tensor where values from the `depth`
+dimension are moved in spatial blocks to the `height` and `width` dimensions.
+The attr `block_size` indicates the input block size and how the data is moved.
+
+  * Chunks of data of size `block_size * block_size` from depth are rearranged
+    into non-overlapping blocks of size `block_size x block_size`
+  * The width the output tensor is `input_depth * block_size`, whereas the
+    height is `input_height * block_size`.
+  * The Y, X coordinates within each block of the output image are determined
+    by the high order component of the input channel index.
+  * The depth of the input tensor must be divisible by
+    `block_size * block_size`.
+
+The `data_format` attr specifies the layout of the input and output tensors
+with the following options:
+  "NHWC": `[ batch, height, width, channels ]`
+  "NCHW": `[ batch, channels, height, width ]`
+  "NCHW_VECT_C":
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
+
+It is useful to consider the operation as transforming a 6-D Tensor.
+e.g. for data_format = NHWC,
+     Each element in the input tensor can be specified via 6 coordinates,
+     ordered by decreasing memory layout significance as:
+     n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+                        within the input image, bX, bY means coordinates
+                        within the output block, oC means output channels).
+     The output would be the input transposed to the following layout:
+     n,iY,bY,iX,bX,oC
+
+This operation is useful for resizing the activations between convolutions
+(but keeping all data), e.g. instead of pooling. It is also useful for training
+purely convolutional models.
+
+For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+block_size = 2:
+
+```
+x = [[[[1, 2, 3, 4]]]]
+
+```
+
+This operation will output a tensor of shape `[1, 2, 2, 1]`:
+
+```
+   [[[[1], [2]],
+     [[3], [4]]]]
+```
+
+Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+the corresponding output will have 2x2 elements and will have a depth of
+1 channel (1 = `4 / (block_size * block_size)`).
+The output element shape is `[2, 2, 1]`.
+
+For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+
+```
+x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+```
+
+This operation, for block size of 2, will return the following tensor of shape
+`[1, 2, 2, 3]`
+
+```
+   [[[[1, 2, 3], [4, 5, 6]],
+     [[7, 8, 9], [10, 11, 12]]]]
+
+```
+
+Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+
+```
+x =  [[[[1, 2, 3, 4],
+       [5, 6, 7, 8]],
+      [[9, 10, 11, 12],
+       [13, 14, 15, 16]]]]
+```
+
+the operator will return the following tensor of shape `[1 4 4 1]`:
+
+```
+x = [[[ [1],   [2],  [5],  [6]],
+      [ [3],   [4],  [7],  [8]],
+      [ [9],  [10], [13],  [14]],
+      [ [11], [12], [15],  [16]]]]
+
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c313f7be6b38317ab7721a0d494fec42bdb52f4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "DepthwiseConv2dNative"
+  attr {
+    name: "strides"
+    description: <<END
+1-D of length 4.  The stride of the sliding window for each dimension
+of `input`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, height, width, channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
+END
+  }
+  summary: "Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors."
+  description: <<END
+Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+and a filter / kernel tensor of shape
+`[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+`in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+a different filter to each input channel (expanding from 1 channel to
+`channel_multiplier` channels for each), then concatenates the results
+together. Thus, the output has `in_channels * channel_multiplier` channels.
+
+```
+for k in 0..in_channels-1
+  for q in 0..channel_multiplier-1
+    output[b, i, j, k * channel_multiplier + q] =
+      sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+                        filter[di, dj, k, q]
+```
+
+Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e66aa3b70707c2216ff5195b9d2dda407c50ec74
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape based on `data_format`.  For example, if
+`data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+in_width, in_channels]` tensor.
+END
+  }
+  in_arg {
+    name: "filter_sizes"
+    description: <<END
+An integer vector representing the tensor shape of `filter`,
+where `filter` is a 4-D
+`[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape  based on `data_format`.
+For example, if `data_format` is 'NHWC' then
+out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+Gradients w.r.t. the output of the convolution.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape
+`[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+the `filter` input of the convolution.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+of the convolution.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, height, width, channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
+END
+  }
+  summary: "Computes the gradients of depthwise convolution with respect to the filter."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f501ad21b35b6ad8d3ee16650919b1ff897cdccb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropInput"
+  in_arg {
+    name: "input_sizes"
+    description: <<END
+An integer vector representing the shape of `input`, based
+on `data_format`.  For example, if `data_format` is 'NHWC' then
+ `input` is a 4-D `[batch, height, width, channels]` tensor.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+4-D with shape
+`[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape  based on `data_format`.
+For example, if `data_format` is 'NHWC' then
+out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+Gradients w.r.t. the output of the convolution.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape according to `data_format`.  For example, if
+`data_format` is 'NHWC', output shape is `[batch, in_height,
+in_width, in_channels]`.  Gradient w.r.t. the input of the
+convolution.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+of the convolution.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, height, width, channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, channels, height, width].
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each filter
+element on that dimension. The dimension order is determined by the value of
+`data_format`, see above for details. Dilations in the batch and depth
+dimensions must be 1.
+END
+  }
+  summary: "Computes the gradients of depthwise convolution with respect to the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40c00ef58f8d9e6262023d6a3299fa5f6fbd8f2a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Dequantize.pbtxt
@@ -0,0 +1,91 @@
+op {
+  graph_op_name: "Dequantize"
+  in_arg {
+    name: "min_range"
+    description: <<END
+The minimum scalar value possibly produced for the input.
+END
+  }
+  in_arg {
+    name: "max_range"
+    description: <<END
+The maximum scalar value possibly produced for the input.
+END
+  }
+  summary: "Dequantize the \'input\' tensor into a float Tensor."
+  description: <<END
+[min_range, max_range] are scalar floats that specify the range for
+the 'input' data. The 'mode' attribute controls exactly which calculations are
+used to convert the float values to their quantized equivalents.
+
+In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+
+```
+if T == qint8, in[i] += (range(T) + 1)/ 2.0
+out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+```
+here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+
+*MIN_COMBINED Mode Example*
+
+If the input comes from a QuantizedRelu6, the output type is
+quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+Dequantize on quint8 will take each value, cast to float, and multiply
+by 6 / 255.
+Note that if quantizedtype is qint8, the operation will additionally add
+each value by 128 prior to casting.
+
+If the mode is 'MIN_FIRST', then this approach is used:
+
+```c++
+num_discrete_values = 1 << (# of bits in T)
+range_adjust = num_discrete_values / (num_discrete_values - 1)
+range = (range_max - range_min) * range_adjust
+range_scale = range / num_discrete_values
+const double offset_input = static_cast<double>(input) - lowest_quantized;
+result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+```
+
+*SCALED mode Example*
+
+`SCALED` mode matches the quantization approach used in
+`QuantizeAndDequantize{V2|V3}`.
+
+If the mode is `SCALED`, we do not use the full range of the output type,
+choosing to elide the lowest possible value for symmetry (e.g., output range is
+-127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+0.
+
+We first find the range of values in our tensor. The
+range we use is always centered on 0, so we find m such that
+```c++
+  m = max(abs(input_min), abs(input_max))
+```
+
+Our input tensor range is then `[-m, m]`.
+
+Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+If T is signed, this is
+```
+  num_bits = sizeof(T) * 8
+  [min_fixed, max_fixed] =
+      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+```
+
+Otherwise, if T is unsigned, the fixed-point range is
+```
+  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+```
+
+From this we compute our scaling factor, s:
+```c++
+  s = (2 * m) / (max_fixed - min_fixed)
+```
+
+Now we can dequantize the elements of our tensor:
+```c++
+result = input * s
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeserializeIterator.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeserializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..653f6789db1119d464931f2b2eb737a125d29a4f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeserializeIterator.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "DeserializeIterator"
+  in_arg {
+    name: "resource_handle"
+    description: <<END
+A handle to an iterator resource.
+END
+  }
+  in_arg {
+    name: "serialized"
+    description: <<END
+A variant tensor storing the state of the iterator contained in the
+resource.
+END
+  }
+  summary: "Converts the given variant tensor to an iterator and stores it in the given resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeserializeManySparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeserializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1fb5eae02a35d42eb7f68a6597293beb0ebcaf7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeserializeManySparse.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "DeserializeManySparse"
+  in_arg {
+    name: "serialized_sparse"
+    description: <<END
+2-D, The `N` serialized `SparseTensor` objects.
+Must have 3 columns.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The `dtype` of the serialized `SparseTensor` objects.
+END
+  }
+  summary: "Deserialize and concatenate `SparseTensors` from a serialized minibatch."
+  description: <<END
+The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+`N` is the minibatch size and the rows correspond to packed outputs of
+`SerializeSparse`.  The ranks of the original `SparseTensor` objects
+must all match.  When the final `SparseTensor` is created, it has rank one
+higher than the ranks of the incoming `SparseTensor` objects
+(they have been concatenated along a new row dimension).
+
+The output `SparseTensor` object's shape values for all dimensions but the
+first are the max across the input `SparseTensor` objects' shape values
+for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+size.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the serialized input is a `[2 x 3]` matrix representing two
+original `SparseTensor` objects:
+
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
+
+and
+
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+
+then the final deserialized `SparseTensor` will be:
+
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dfaa531cbcc8adf46e5c6c57164fa7f674cda18d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeserializeSparse.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "DeserializeSparse"
+  in_arg {
+    name: "serialized_sparse"
+    description: <<END
+The serialized `SparseTensor` objects. The last dimension
+must have 3 columns.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The `dtype` of the serialized `SparseTensor` objects.
+END
+  }
+  summary: "Deserialize `SparseTensor` objects."
+  description: <<END
+The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+the last dimension stores serialized `SparseTensor` objects and the other N
+dimensions (N >= 0) correspond to a batch. The ranks of the original
+`SparseTensor` objects must all match. When the final `SparseTensor` is
+created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+the sparse tensors have been concatenated along new dimensions, one for each
+batch.
+
+The output `SparseTensor` object's shape values for the original dimensions
+are the max across the input `SparseTensor` objects' shape values for the
+corresponding dimensions. The new dimensions match the size of the batch.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the serialized input is a `[2 x 3]` matrix representing two
+original `SparseTensor` objects:
+
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
+
+and
+
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+
+then the final deserialized `SparseTensor` will be:
+
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DestroyResourceOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_DestroyResourceOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..910d25ec825dcbea8d73364ee6b599b6c59c1876
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DestroyResourceOp.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "DestroyResourceOp"
+  in_arg {
+    name: "resource"
+    description: <<END
+handle to the resource to delete.
+END
+  }
+  attr {
+    name: "ignore_lookup_error"
+    description: <<END
+whether to ignore the error when the resource
+doesn't exist.
+END
+  }
+  summary: "Deletes the resource specified by the handle."
+  description: <<END
+All subsequent operations using the resource will result in a NotFound
+error status.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DestroyTemporaryVariable.pbtxt b/tensorflow/core/api_def/base_api/api_def_DestroyTemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ae9a30cb43613ef351b57c56d0b215fa7dd1e6f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DestroyTemporaryVariable.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "DestroyTemporaryVariable"
+  in_arg {
+    name: "ref"
+    description: <<END
+A reference to the temporary variable tensor.
+END
+  }
+  attr {
+    name: "var_name"
+    description: <<END
+Name of the temporary variable, usually the name of the matching
+'TemporaryVariable' op.
+END
+  }
+  summary: "Destroys the temporary variable and returns its final value."
+  description: <<END
+Sets output to the value of the Tensor pointed to by 'ref', then destroys
+the temporary variable called 'var_name'.
+All other uses of 'ref' *must* have executed before this op.
+This is typically achieved by chaining the ref through each assign op, or by
+using control dependencies.
+
+Outputs the final value of the tensor pointed to by 'ref'.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Diag.pbtxt b/tensorflow/core/api_def/base_api/api_def_Diag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e69d9077f96502c7fb7e0f012e0b1e5417dab438
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Diag.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "Diag"
+  in_arg {
+    name: "diagonal"
+    description: <<END
+Rank k tensor where k is at most 1.
+END
+  }
+  summary: "Returns a diagonal tensor with a given diagonal values."
+  description: <<END
+Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+everything else padded with zeros. The diagonal is computed as follows:
+
+Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+
+`output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+
+For example:
+
+```
+# 'diagonal' is [1, 2, 3, 4]
+tf.diag(diagonal) ==> [[1, 0, 0, 0]
+                       [0, 2, 0, 0]
+                       [0, 0, 3, 0]
+                       [0, 0, 0, 4]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DiagPart.pbtxt b/tensorflow/core/api_def/base_api/api_def_DiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1af7df95b7ce4071a5634bc9fa089e6e8c703664
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DiagPart.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "DiagPart"
+  in_arg {
+    name: "input"
+    description: <<END
+Rank k tensor where k is even and not zero.
+END
+  }
+  out_arg {
+    name: "diagonal"
+    description: <<END
+The extracted diagonal.
+END
+  }
+  summary: "Returns the diagonal part of the tensor."
+  description: <<END
+This operation returns a tensor with the `diagonal` part
+of the `input`. The `diagonal` part is computed as follows:
+
+Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
+tensor of rank `k` with dimensions `[D1,..., Dk]` where:
+
+`diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
+
+For example:
+
+```
+# 'input' is [[1, 0, 0, 0]
+              [0, 2, 0, 0]
+              [0, 0, 3, 0]
+              [0, 0, 0, 4]]
+
+tf.diag_part(input) ==> [1, 2, 3, 4]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Digamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Digamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a8280701ba3c6cd5db21e966eff27f2e7639a6e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Digamma.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Digamma"
+  summary: "Computes Psi, the derivative of Lgamma (the log of the absolute value of"
+  description: <<END
+`Gamma(x)`), element-wise.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Dilation2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b38f5aa4f9cb9257ac15238115ff1b7165fbc94b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Dilation2D.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "Dilation2D"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, depth]`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+3-D with shape `[filter_height, filter_width, depth]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape `[batch, out_height, out_width, depth]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+tensor. Must be: `[1, stride_height, stride_width, 1]`.
+END
+  }
+  attr {
+    name: "rates"
+    description: <<END
+The input stride for atrous morphological dilation. Must be:
+`[1, rate_height, rate_width, 1]`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors."
+  description: <<END
+The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+`filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+input channel is processed independently of the others with its own structuring
+function. The `output` tensor has shape
+`[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+tensor depend on the `padding` algorithm. We currently only support the default
+"NHWC" `data_format`.
+
+In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+(for consistency with `conv2d`, we use unmirrored filters):
+
+    output[b, y, x, c] =
+       max_{dy, dx} input[b,
+                          strides[1] * y + rates[1] * dy,
+                          strides[2] * x + rates[2] * dx,
+                          c] +
+                    filter[dy, dx, c]
+
+Max-pooling is a special case when the filter has size equal to the pooling
+kernel size and contains all zeros.
+
+Note on duality: The dilation of `input` by the `filter` is equal to the
+negation of the erosion of `-input` by the reflected `filter`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Dilation2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Dilation2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58f3b48edc8a82ba24ac80c81255afdd30989e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Dilation2DBackpropFilter.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "Dilation2DBackpropFilter"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, depth]`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+3-D with shape `[filter_height, filter_width, depth]`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape `[batch, out_height, out_width, depth]`.
+END
+  }
+  out_arg {
+    name: "filter_backprop"
+    description: <<END
+3-D with shape `[filter_height, filter_width, depth]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D of length 4. The stride of the sliding window for each dimension of
+the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+END
+  }
+  attr {
+    name: "rates"
+    description: <<END
+1-D of length 4. The input stride for atrous morphological dilation.
+Must be: `[1, rate_height, rate_width, 1]`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes the gradient of morphological 2-D dilation with respect to the filter."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Dilation2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Dilation2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f966c1aae8b0f78db95485cbdc93eb8e7e4ecaa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Dilation2DBackpropInput.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "Dilation2DBackpropInput"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, depth]`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+3-D with shape `[filter_height, filter_width, depth]`.
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape `[batch, out_height, out_width, depth]`.
+END
+  }
+  out_arg {
+    name: "in_backprop"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, depth]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D of length 4. The stride of the sliding window for each dimension of
+the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+END
+  }
+  attr {
+    name: "rates"
+    description: <<END
+1-D of length 4. The input stride for atrous morphological dilation.
+Must be: `[1, rate_height, rate_width, 1]`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes the gradient of morphological 2-D dilation with respect to the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Div.pbtxt b/tensorflow/core/api_def/base_api/api_def_Div.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12b6fb5b4cec00da6843cdeb1ebd7e3d53d97ba4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Div.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Div"
+  summary: "Returns x / y element-wise."
+  description: <<END
+*NOTE*: `Div` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DrawBoundingBoxes.pbtxt b/tensorflow/core/api_def/base_api/api_def_DrawBoundingBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c3ae09f5d6e448a34032dd3dec2280290584d13
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DrawBoundingBoxes.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "DrawBoundingBoxes"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D with shape `[batch, height, width, depth]`. A batch of images.
+END
+  }
+  in_arg {
+    name: "boxes"
+    description: <<END
+3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+boxes.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with the same shape as `images`. The batch of input images with
+bounding boxes drawn on the images.
+END
+  }
+  summary: "Draw bounding boxes on a batch of images."
+  description: <<END
+Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+boxes specified by the locations in `boxes`. The coordinates of the each
+bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+height of the underlying image.
+
+For example, if an image is 100 x 200 pixels (height x width) and the bounding
+box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
+
+Parts of the bounding box may fall outside the image.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DynamicPartition.pbtxt b/tensorflow/core/api_def/base_api/api_def_DynamicPartition.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5c44b5e073761262c153246d32fccaffdb34149
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DynamicPartition.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "DynamicPartition"
+  in_arg {
+    name: "partitions"
+    description: <<END
+Any shape.  Indices in the range `[0, num_partitions)`.
+END
+  }
+  attr {
+    name: "num_partitions"
+    description: <<END
+The number of partitions to output.
+END
+  }
+  summary: "Partitions `data` into `num_partitions` tensors using indices from `partitions`."
+  description: <<END
+For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+are placed in `outputs[i]` in lexicographic order of `js`, and the first
+dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+In detail,
+
+```python
+    outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+
+    outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+```
+
+`data.shape` must start with `partitions.shape`.
+
+For example:
+
+```python
+    # Scalar partitions.
+    partitions = 1
+    num_partitions = 2
+    data = [10, 20]
+    outputs[0] = []  # Empty with shape [0, 2]
+    outputs[1] = [[10, 20]]
+
+    # Vector partitions.
+    partitions = [0, 0, 1, 1, 0]
+    num_partitions = 2
+    data = [10, 20, 30, 40, 50]
+    outputs[0] = [10, 20, 50]
+    outputs[1] = [30, 40]
+```
+
+See `dynamic_stitch` for an example on how to merge partitions back.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DynamicStitch.pbtxt b/tensorflow/core/api_def/base_api/api_def_DynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34bd77bc0e4c50496894a3786a3f790d00292e3e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DynamicStitch.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "DynamicStitch"
+  summary: "Interleave the values from the `data` tensors into a single tensor."
+  description: <<END
+Builds a merged tensor such that
+
+```python
+    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+```
+
+For example, if each `indices[m]` is scalar or vector, we have
+
+```python
+    # Scalar indices:
+    merged[indices[m], ...] = data[m][...]
+
+    # Vector indices:
+    merged[indices[m][i], ...] = data[m][i, ...]
+```
+
+Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+`constant`, the output shape is
+
+    merged.shape = [max(indices)] + constant
+
+Values are merged in order, so if an index appears in both `indices[m][i]` and
+`indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+merged result. If you do not need this guarantee, ParallelDynamicStitch might
+perform better on some devices.
+
+For example:
+
+```python
+    indices[0] = 6
+    indices[1] = [4, 1]
+    indices[2] = [[5, 2], [0, 3]]
+    data[0] = [61, 62]
+    data[1] = [[41, 42], [11, 12]]
+    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+              [51, 52], [61, 62]]
+```
+
+This method can be used to merge partitions created by `dynamic_partition`
+as illustrated on the following example:
+
+```python
+    # Apply function (increments x_i) on elements for which a certain condition
+    # apply (x_i != -1 in this example).
+    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+    condition_mask=tf.not_equal(x,tf.constant(-1.))
+    partitioned_data = tf.dynamic_partition(
+        x, tf.cast(condition_mask, tf.int32) , 2)
+    partitioned_data[1] = partitioned_data[1] + 1.0
+    condition_indices = tf.dynamic_partition(
+        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+    x = tf.dynamic_stitch(condition_indices, partitioned_data)
+    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+    # unchanged.
+```
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_E.pbtxt b/tensorflow/core/api_def/base_api/api_def_E.pbtxt
deleted file mode 100644
index b49146f7c4e4e1d6470241123dbe8a1fbdb741c2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_E.pbtxt
+++ /dev/null
@@ -1,261 +0,0 @@
-op {
-  graph_op_name: "EditDistance"
-  endpoint {
-    name: "EditDistance"
-  }
-  summary: "Computes the (possibly normalized) Levenshtein Edit Distance."
-  description: <<END
-The inputs are variable-length sequences provided by SparseTensors
-  (hypothesis_indices, hypothesis_values, hypothesis_shape)
-and
-  (truth_indices, truth_values, truth_shape).
-
-The inputs are:
-END
-}
-op {
-  graph_op_name: "Elu"
-  endpoint {
-    name: "Elu"
-  }
-  summary: "Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise."
-  description: <<END
-See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-](http://arxiv.org/abs/1511.07289)
-END
-}
-op {
-  graph_op_name: "EluGrad"
-  endpoint {
-    name: "EluGrad"
-  }
-  summary: "Computes gradients for the exponential linear (Elu) operation."
-}
-op {
-  graph_op_name: "EncodeBase64"
-  endpoint {
-    name: "EncodeBase64"
-  }
-  summary: "Encode strings into web-safe base64 format."
-  description: <<END
-Refer to the following article for more information on base64 format:
-en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-end so that the encoded has length multiple of 4. See Padding section of the
-link above.
-
-Web-safe means that the encoder uses - and _ instead of + and /.
-END
-}
-op {
-  graph_op_name: "EncodeJpeg"
-  endpoint {
-    name: "EncodeJpeg"
-  }
-  summary: "JPEG-encode an image."
-  description: <<END
-`image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-
-The attr `format` can be used to override the color format of the encoded
-output.  Values can be:
-
-*   `''`: Use a default format based on the number of channels in the image.
-*   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-    of `image` must be 1.
-*   `rgb`: Output an RGB JPEG image. The `channels` dimension
-    of `image` must be 3.
-
-If `format` is not specified or is the empty string, a default format is picked
-in function of the number of channels in `image`:
-
-*   1: Output a grayscale image.
-*   3: Output an RGB image.
-END
-}
-op {
-  graph_op_name: "EncodePng"
-  endpoint {
-    name: "EncodePng"
-  }
-  summary: "PNG-encode an image."
-  description: <<END
-`image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-where `channels` is:
-
-*   1: for grayscale.
-*   2: for grayscale + alpha.
-*   3: for RGB.
-*   4: for RGBA.
-
-The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-default or a value from 0 to 9.  9 is the highest compression level, generating
-the smallest output, but is slower.
-END
-}
-op {
-  graph_op_name: "EncodeWav"
-  endpoint {
-    name: "EncodeWav"
-  }
-  summary: "Encode audio data using the WAV file format."
-  description: <<END
-This operation will generate a string suitable to be saved out to create a .wav
-audio file. It will be encoded in the 16-bit PCM format. It takes in float
-values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-that range.
-
-`audio` is a 2-D float Tensor of shape `[length, channels]`.
-`sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
-END
-}
-op {
-  graph_op_name: "Enter"
-  endpoint {
-    name: "Enter"
-  }
-  summary: "Creates or finds a child frame, and makes `data` available to the child frame."
-  description: <<END
-This op is used together with `Exit` to create loops in the graph.
-The unique `frame_name` is used by the `Executor` to identify frames. If
-`is_constant` is true, `output` is a constant in the child frame; otherwise
-it may be changed in the child frame. At most `parallel_iterations` iterations
-are run in parallel in the child frame.
-END
-}
-op {
-  graph_op_name: "Equal"
-  endpoint {
-    name: "Equal"
-  }
-  summary: "Returns the truth value of (x == y) element-wise."
-  description: <<END
-*NOTE*: `Equal` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Erf"
-  endpoint {
-    name: "Erf"
-  }
-  summary: "Computes the Gauss error function of `x` element-wise."
-}
-op {
-  graph_op_name: "Erfc"
-  endpoint {
-    name: "Erfc"
-  }
-  summary: "Computes the complementary error function of `x` element-wise."
-}
-op {
-  graph_op_name: "Exit"
-  endpoint {
-    name: "Exit"
-  }
-  summary: "Exits the current frame to its parent frame."
-  description: <<END
-Exit makes its input `data` available to the parent frame.
-END
-}
-op {
-  graph_op_name: "Exp"
-  endpoint {
-    name: "Exp"
-  }
-  summary: "Computes exponential of x element-wise.  \\\\(y = e^x\\\\)."
-}
-op {
-  graph_op_name: "ExpandDims"
-  endpoint {
-    name: "ExpandDims"
-  }
-  summary: "Inserts a dimension of 1 into a tensor\'s shape."
-  description: <<END
-Given a tensor `input`, this operation inserts a dimension of 1 at the
-dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
-zero; if you specify a negative number for `dim` it is counted backward from
-the end.
-
-This operation is useful if you want to add a batch dimension to a single
-element. For example, if you have a single image of shape `[height, width,
-channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
-which will make the shape `[1, height, width, channels]`.
-
-Other examples:
-
-```
-# 't' is a tensor of shape [2]
-shape(expand_dims(t, 0)) ==> [1, 2]
-shape(expand_dims(t, 1)) ==> [2, 1]
-shape(expand_dims(t, -1)) ==> [2, 1]
-
-# 't2' is a tensor of shape [2, 3, 5]
-shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
-shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
-shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
-```
-
-This operation requires that:
-
-`-1-input.dims() <= dim <= input.dims()`
-
-This operation is related to `squeeze()`, which removes dimensions of
-size 1.
-END
-}
-op {
-  graph_op_name: "Expm1"
-  endpoint {
-    name: "Expm1"
-  }
-  summary: "Computes exponential of x - 1 element-wise."
-  description: <<END
-I.e., \\(y = (\exp x) - 1\\).
-END
-}
-op {
-  graph_op_name: "ExtractGlimpse"
-  endpoint {
-    name: "ExtractGlimpse"
-  }
-  summary: "Extracts a glimpse from the input tensor."
-  description: <<END
-Returns a set of windows called glimpses extracted at location
-`offsets` from the input tensor. If the windows only partially
-overlaps the inputs, the non overlapping areas will be filled with
-random noise.
-
-The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-glimpse_width, channels]`. The channels and batch dimensions are the
-same as that of the input tensor. The height and width of the output
-windows are specified in the `size` parameter.
-
-The argument `normalized` and `centered` controls how the windows are built:
-
-* If the coordinates are normalized but not centered, 0.0 and 1.0
-  correspond to the minimum and maximum of each height and width
-  dimension.
-* If the coordinates are both normalized and centered, they range from
-  -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-  left corner, the lower right corner is located at (1.0, 1.0) and the
-  center is at (0, 0).
-* If the coordinates are not normalized they are interpreted as
-  numbers of pixels.
-END
-}
-op {
-  graph_op_name: "ExtractImagePatches"
-  endpoint {
-    name: "ExtractImagePatches"
-  }
-  summary: "Extract `patches` from `images` and put them in the \"depth\" output dimension."
-}
-op {
-  graph_op_name: "ExtractJpegShape"
-  endpoint {
-    name: "ExtractJpegShape"
-  }
-  summary: "Extract the shape information of a JPEG-encoded image."
-  description: <<END
-This op only parses the image header, so it is much faster than DecodeJpeg.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9231368e1654d6bb710a128e076e93005f31116d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  summary: "Eagerly executes a python function to compute func(input)->output. The"
+  description: <<END
+semantics of the input, output, and attributes are the same as those for
+PyFunc.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EditDistance.pbtxt b/tensorflow/core/api_def/base_api/api_def_EditDistance.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678c451a8aa69fcc8446b6f7a790e6e32dd388ce
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EditDistance.pbtxt
@@ -0,0 +1,96 @@
+op {
+  graph_op_name: "EditDistance"
+  in_arg {
+    name: "hypothesis_indices"
+    description: <<END
+The indices of the hypothesis list SparseTensor.
+This is an N x R int64 matrix.
+END
+  }
+  in_arg {
+    name: "hypothesis_values"
+    description: <<END
+The values of the hypothesis list SparseTensor.
+This is an N-length vector.
+END
+  }
+  in_arg {
+    name: "hypothesis_shape"
+    description: <<END
+The shape of the hypothesis list SparseTensor.
+This is an R-length vector.
+END
+  }
+  in_arg {
+    name: "truth_indices"
+    description: <<END
+The indices of the truth list SparseTensor.
+This is an M x R int64 matrix.
+END
+  }
+  in_arg {
+    name: "truth_values"
+    description: <<END
+The values of the truth list SparseTensor.
+This is an M-length vector.
+END
+  }
+  in_arg {
+    name: "truth_shape"
+    description: <<END
+truth indices, vector.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A dense float tensor with rank R - 1.
+
+For the example input:
+
+    // hypothesis represents a 2x1 matrix with variable-length values:
+    //   (0,0) = ["a"]
+    //   (1,0) = ["b"]
+    hypothesis_indices = [[0, 0, 0],
+                          [1, 0, 0]]
+    hypothesis_values = ["a", "b"]
+    hypothesis_shape = [2, 1, 1]
+
+    // truth represents a 2x2 matrix with variable-length values:
+    //   (0,0) = []
+    //   (0,1) = ["a"]
+    //   (1,0) = ["b", "c"]
+    //   (1,1) = ["a"]
+    truth_indices = [[0, 1, 0],
+                     [1, 0, 0],
+                     [1, 0, 1],
+                     [1, 1, 0]]
+    truth_values = ["a", "b", "c", "a"]
+    truth_shape = [2, 2, 2]
+    normalize = true
+
+The output will be:
+
+    // output is a 2x2 matrix with edit distances normalized by truth lengths.
+    output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+              [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+END
+  }
+  attr {
+    name: "normalize"
+    description: <<END
+boolean (if true, edit distances are normalized by length of truth).
+
+The output is:
+END
+  }
+  summary: "Computes the (possibly normalized) Levenshtein Edit Distance."
+  description: <<END
+The inputs are variable-length sequences provided by SparseTensors
+  (hypothesis_indices, hypothesis_values, hypothesis_shape)
+and
+  (truth_indices, truth_values, truth_shape).
+
+The inputs are:
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Elu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Elu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf3d4b73d3388e030ddef2d05d875ed2dea0e8bd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Elu.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Elu"
+  summary: "Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise."
+  description: <<END
+See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+](http://arxiv.org/abs/1511.07289)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EluGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_EluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41aa5a2ac78d9955419c940a999e76fb20e4795e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EluGrad.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "EluGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding Elu operation.
+END
+  }
+  in_arg {
+    name: "outputs"
+    description: <<END
+The outputs of the corresponding Elu operation.
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+The gradients: `gradients * (outputs + 1)` if outputs < 0,
+`gradients` otherwise.
+END
+  }
+  summary: "Computes gradients for the exponential linear (Elu) operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeBase64.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeBase64.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f25fe05cfd33fe68f64d56b4c3ce0d9313dd8aa1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeBase64.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "EncodeBase64"
+  in_arg {
+    name: "input"
+    description: <<END
+Strings to be encoded.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Input strings encoded in base64.
+END
+  }
+  attr {
+    name: "pad"
+    description: <<END
+Bool whether padding is applied at the ends.
+END
+  }
+  summary: "Encode strings into web-safe base64 format."
+  description: <<END
+Refer to the following article for more information on base64 format:
+en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+end so that the encoded has length multiple of 4. See Padding section of the
+link above.
+
+Web-safe means that the encoder uses - and _ instead of + and /.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeJpeg.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05a46ed29119cc735c3135ac044857052b5f19ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeJpeg.pbtxt
@@ -0,0 +1,89 @@
+op {
+  graph_op_name: "EncodeJpeg"
+  in_arg {
+    name: "image"
+    description: <<END
+3-D with shape `[height, width, channels]`.
+END
+  }
+  out_arg {
+    name: "contents"
+    description: <<END
+0-D. JPEG-encoded image.
+END
+  }
+  attr {
+    name: "format"
+    description: <<END
+Per pixel image format.
+END
+  }
+  attr {
+    name: "quality"
+    description: <<END
+Quality of the compression from 0 to 100 (higher is better and slower).
+END
+  }
+  attr {
+    name: "progressive"
+    description: <<END
+If True, create a JPEG that loads progressively (coarse to fine).
+END
+  }
+  attr {
+    name: "optimize_size"
+    description: <<END
+If True, spend CPU/RAM to reduce size with no quality change.
+END
+  }
+  attr {
+    name: "chroma_downsampling"
+    description: <<END
+See http://en.wikipedia.org/wiki/Chroma_subsampling.
+END
+  }
+  attr {
+    name: "density_unit"
+    description: <<END
+Unit used to specify `x_density` and `y_density`:
+pixels per inch (`'in'`) or centimeter (`'cm'`).
+END
+  }
+  attr {
+    name: "x_density"
+    description: <<END
+Horizontal pixels per density unit.
+END
+  }
+  attr {
+    name: "y_density"
+    description: <<END
+Vertical pixels per density unit.
+END
+  }
+  attr {
+    name: "xmp_metadata"
+    description: <<END
+If not empty, embed this XMP metadata in the image header.
+END
+  }
+  summary: "JPEG-encode an image."
+  description: <<END
+`image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+
+The attr `format` can be used to override the color format of the encoded
+output.  Values can be:
+
+*   `''`: Use a default format based on the number of channels in the image.
+*   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+    of `image` must be 1.
+*   `rgb`: Output an RGB JPEG image. The `channels` dimension
+    of `image` must be 3.
+
+If `format` is not specified or is the empty string, a default format is picked
+in function of the number of channels in `image`:
+
+*   1: Output a grayscale image.
+*   3: Output an RGB image.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodePng.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d3b37334e24f1fb7c52f56ef2af1392bb6603
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodePng.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "EncodePng"
+  in_arg {
+    name: "image"
+    description: <<END
+3-D with shape `[height, width, channels]`.
+END
+  }
+  out_arg {
+    name: "contents"
+    description: <<END
+0-D. PNG-encoded image.
+END
+  }
+  attr {
+    name: "compression"
+    description: <<END
+Compression level.
+END
+  }
+  summary: "PNG-encode an image."
+  description: <<END
+`image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+where `channels` is:
+
+*   1: for grayscale.
+*   2: for grayscale + alpha.
+*   3: for RGB.
+*   4: for RGBA.
+
+The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+default or a value from 0 to 9.  9 is the highest compression level, generating
+the smallest output, but is slower.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeWav.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54a8b1fa550796550cf2eaf90b23626e3757042a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeWav.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "EncodeWav"
+  in_arg {
+    name: "audio"
+    description: <<END
+2-D with shape `[length, channels]`.
+END
+  }
+  in_arg {
+    name: "sample_rate"
+    description: <<END
+Scalar containing the sample frequency.
+END
+  }
+  out_arg {
+    name: "contents"
+    description: <<END
+0-D. WAV-encoded file contents.
+END
+  }
+  summary: "Encode audio data using the WAV file format."
+  description: <<END
+This operation will generate a string suitable to be saved out to create a .wav
+audio file. It will be encoded in the 16-bit PCM format. It takes in float
+values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+that range.
+
+`audio` is a 2-D float Tensor of shape `[length, channels]`.
+`sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Enter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Enter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dfff8e6ddbfcb093c9ae1c8de6615229e8f955fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Enter.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Enter"
+  visibility: HIDDEN
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be made available to the child frame.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `data`.
+END
+  }
+  attr {
+    name: "frame_name"
+    description: <<END
+The name of the child frame.
+END
+  }
+  attr {
+    name: "is_constant"
+    description: <<END
+If true, the output is constant within the child frame.
+END
+  }
+  attr {
+    name: "parallel_iterations"
+    description: <<END
+The number of iterations allowed to run in parallel.
+END
+  }
+  summary: "Creates or finds a child frame, and makes `data` available to the child frame."
+  description: <<END
+This op is used together with `Exit` to create loops in the graph.
+The unique `frame_name` is used by the `Executor` to identify frames. If
+`is_constant` is true, `output` is a constant in the child frame; otherwise
+it may be changed in the child frame. At most `parallel_iterations` iterations
+are run in parallel in the child frame.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Equal.pbtxt b/tensorflow/core/api_def/base_api/api_def_Equal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ca8ef945554cd38da4e073657e32ff45c7efa07
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Equal.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Equal"
+  summary: "Returns the truth value of (x == y) element-wise."
+  description: <<END
+*NOTE*: `Equal` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/base_api/api_def_Erf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..408df8a633289e10eef39bb135e33f4847ac2efe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Erf.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Erf"
+  summary: "Computes the Gauss error function of `x` element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Erfc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Erfc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad70def47f87dbd7f348cd80e553c6549f5f10a6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Erfc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Erfc"
+  summary: "Computes the complementary error function of `x` element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Exit.pbtxt b/tensorflow/core/api_def/base_api/api_def_Exit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec97b7ac04ff39d2afc1f82e47a72ff4b82c81a4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Exit.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "Exit"
+  visibility: HIDDEN
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be made available to the parent frame.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `data`.
+END
+  }
+  summary: "Exits the current frame to its parent frame."
+  description: <<END
+Exit makes its input `data` available to the parent frame.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/base_api/api_def_Exp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd1e3d5dfceae2a133066b7e72665aed5ad30ddc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Exp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Exp"
+  summary: "Computes exponential of x element-wise.  \\\\(y = e^x\\\\)."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExpandDims.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExpandDims.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b9a03f0ea619dfa5e8144c5949134f25c841584
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExpandDims.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "ExpandDims"
+  in_arg {
+    name: "dim"
+    rename_to: "axis"
+    description: <<END
+0-D (scalar). Specifies the dimension index at which to
+expand the shape of `input`. Must be in the range
+`[-rank(input) - 1, rank(input)]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Contains the same data as `input`, but its shape has an additional
+dimension of size 1 added.
+END
+  }
+  summary: "Inserts a dimension of 1 into a tensor\'s shape."
+  description: <<END
+Given a tensor `input`, this operation inserts a dimension of 1 at the
+dimension index `dim` of `input`'s shape. The dimension index `dim` starts at
+zero; if you specify a negative number for `dim` it is counted backward from
+the end.
+
+This operation is useful if you want to add a batch dimension to a single
+element. For example, if you have a single image of shape `[height, width,
+channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+which will make the shape `[1, height, width, channels]`.
+
+Other examples:
+
+```
+# 't' is a tensor of shape [2]
+shape(expand_dims(t, 0)) ==> [1, 2]
+shape(expand_dims(t, 1)) ==> [2, 1]
+shape(expand_dims(t, -1)) ==> [2, 1]
+
+# 't2' is a tensor of shape [2, 3, 5]
+shape(expand_dims(t2, 0)) ==> [1, 2, 3, 5]
+shape(expand_dims(t2, 2)) ==> [2, 3, 1, 5]
+shape(expand_dims(t2, 3)) ==> [2, 3, 5, 1]
+```
+
+This operation requires that:
+
+`-1-input.dims() <= dim <= input.dims()`
+
+This operation is related to `squeeze()`, which removes dimensions of
+size 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Expm1.pbtxt b/tensorflow/core/api_def/base_api/api_def_Expm1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a048f2aa8b1c3193ad1d85e118ba637476ed8b80
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Expm1.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Expm1"
+  summary: "Computes exponential of x - 1 element-wise."
+  description: <<END
+I.e., \\(y = (\exp x) - 1\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c10a1bb778e1d8b45b59113d255d69c55a224643
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
@@ -0,0 +1,77 @@
+op {
+  graph_op_name: "ExtractGlimpse"
+  in_arg {
+    name: "input"
+    description: <<END
+A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+A 1-D tensor of 2 elements containing the size of the glimpses
+to extract.  The glimpse height must be specified first, following
+by the glimpse width.
+END
+  }
+  in_arg {
+    name: "offsets"
+    description: <<END
+A 2-D integer tensor of shape `[batch_size, 2]` containing
+the y, x locations of the center of each window.
+END
+  }
+  out_arg {
+    name: "glimpse"
+    description: <<END
+A tensor representing the glimpses `[batch_size,
+glimpse_height, glimpse_width, channels]`.
+END
+  }
+  attr {
+    name: "centered"
+    description: <<END
+indicates if the offset coordinates are centered relative to
+the image, in which case the (0, 0) offset is relative to the center
+of the input images. If false, the (0,0) offset corresponds to the
+upper left corner of the input images.
+END
+  }
+  attr {
+    name: "normalized"
+    description: <<END
+indicates if the offset coordinates are normalized.
+END
+  }
+  attr {
+    name: "uniform_noise"
+    description: <<END
+indicates if the noise should be generated using a
+uniform distribution or a Gaussian distribution.
+END
+  }
+  summary: "Extracts a glimpse from the input tensor."
+  description: <<END
+Returns a set of windows called glimpses extracted at location
+`offsets` from the input tensor. If the windows only partially
+overlaps the inputs, the non overlapping areas will be filled with
+random noise.
+
+The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+glimpse_width, channels]`. The channels and batch dimensions are the
+same as that of the input tensor. The height and width of the output
+windows are specified in the `size` parameter.
+
+The argument `normalized` and `centered` controls how the windows are built:
+
+* If the coordinates are normalized but not centered, 0.0 and 1.0
+  correspond to the minimum and maximum of each height and width
+  dimension.
+* If the coordinates are both normalized and centered, they range from
+  -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+  left corner, the lower right corner is located at (1.0, 1.0) and the
+  center is at (0, 0).
+* If the coordinates are not normalized they are interpreted as
+  numbers of pixels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractImagePatches.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractImagePatches.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712a3b0a0f92145eeab7a5e557072b4220184124
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExtractImagePatches.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "ExtractImagePatches"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+END
+  }
+  out_arg {
+    name: "patches"
+    description: <<END
+4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+ksize_cols * depth]` containing image patches with size
+`ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+`out_rows` and `out_cols` are the dimensions of the output patches.
+END
+  }
+  attr {
+    name: "ksizes"
+    description: <<END
+The size of the sliding window for each dimension of `images`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D of length 4. How far the centers of two consecutive patches are in
+the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+END
+  }
+  attr {
+    name: "rates"
+    description: <<END
+1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+input stride, specifying how far two consecutive patch samples are in the
+input. Equivalent to extracting patches with
+`patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+subsampling them spatially by a factor of `rates`. This is equivalent to
+`rate` in dilated (a.k.a. Atrous) convolutions.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+
+We specify the size-related attributes as:
+
+```python
+      ksizes = [1, ksize_rows, ksize_cols, 1]
+      strides = [1, strides_rows, strides_cols, 1]
+      rates = [1, rates_rows, rates_cols, 1]
+```
+END
+  }
+  summary: "Extract `patches` from `images` and put them in the \"depth\" output dimension."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractJpegShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractJpegShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c604adf449f56b5e91c8ee321a2f959073b0503a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExtractJpegShape.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "ExtractJpegShape"
+  in_arg {
+    name: "contents"
+    description: <<END
+0-D. The JPEG-encoded image.
+END
+  }
+  out_arg {
+    name: "image_shape"
+    description: <<END
+1-D. The image shape with format [height, width, channels].
+END
+  }
+  attr {
+    name: "output_type"
+    description: <<END
+(Optional) The output type of the operation (int32 or int64).
+Defaults to int32.
+END
+  }
+  summary: "Extract the shape information of a JPEG-encoded image."
+  description: <<END
+This op only parses the image header, so it is much faster than DecodeJpeg.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_F.pbtxt b/tensorflow/core/api_def/base_api/api_def_F.pbtxt
deleted file mode 100644
index 8c073d3369c21afedcb35d4ce414e6498156af52..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_F.pbtxt
+++ /dev/null
@@ -1,411 +0,0 @@
-op {
-  graph_op_name: "FFT"
-  endpoint {
-    name: "FFT"
-  }
-  summary: "Fast Fourier transform."
-  description: <<END
-Computes the 1-dimensional discrete Fourier transform over the inner-most
-dimension of `input`.
-END
-}
-op {
-  graph_op_name: "FFT2D"
-  endpoint {
-    name: "FFT2D"
-  }
-  summary: "2D fast Fourier transform."
-  description: <<END
-Computes the 2-dimensional discrete Fourier transform over the inner-most
-2 dimensions of `input`.
-END
-}
-op {
-  graph_op_name: "FFT3D"
-  endpoint {
-    name: "FFT3D"
-  }
-  summary: "3D fast Fourier transform."
-  description: <<END
-Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-dimensions of `input`.
-END
-}
-op {
-  graph_op_name: "FIFOQueue"
-  endpoint {
-    name: "FIFOQueue"
-  }
-  summary: "A queue that produces elements in first-in first-out order."
-}
-op {
-  graph_op_name: "FIFOQueueV2"
-  endpoint {
-    name: "FIFOQueueV2"
-  }
-  summary: "A queue that produces elements in first-in first-out order."
-}
-op {
-  graph_op_name: "Fact"
-  endpoint {
-    name: "Fact"
-  }
-  summary: "Output a fact about factorials."
-}
-op {
-  graph_op_name: "FakeQuantWithMinMaxArgs"
-  endpoint {
-    name: "FakeQuantWithMinMaxArgs"
-  }
-  summary: "Fake-quantize the \'inputs\' tensor, type float to \'outputs\' tensor of same type."
-  description: <<END
-Attributes `[min; max]` define the clamping range for the `inputs` data.
-`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
-
-Quantization is called fake since the output is still in floating point.
-END
-}
-op {
-  graph_op_name: "FakeQuantWithMinMaxArgsGradient"
-  endpoint {
-    name: "FakeQuantWithMinMaxArgsGradient"
-  }
-  summary: "Compute gradients for a FakeQuantWithMinMaxArgs operation."
-}
-op {
-  graph_op_name: "FakeQuantWithMinMaxVars"
-  endpoint {
-    name: "FakeQuantWithMinMaxVars"
-  }
-  summary: "Fake-quantize the \'inputs\' tensor of type float via global float scalars `min`"
-  description: <<END
-and `max` to 'outputs' tensor of same shape as `inputs`.
-
-`[min; max]` define the clamping range for the `inputs` data.
-`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
-
-This operation has a gradient and thus allows for training `min` and `max`
-values.
-END
-}
-op {
-  graph_op_name: "FakeQuantWithMinMaxVarsGradient"
-  endpoint {
-    name: "FakeQuantWithMinMaxVarsGradient"
-  }
-  summary: "Compute gradients for a FakeQuantWithMinMaxVars operation."
-}
-op {
-  graph_op_name: "FakeQuantWithMinMaxVarsPerChannel"
-  endpoint {
-    name: "FakeQuantWithMinMaxVarsPerChannel"
-  }
-  summary: "Fake-quantize the \'inputs\' tensor of type float and one of the shapes: `[d]`,"
-  description: <<END
-`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
-to 'outputs' tensor of same shape as `inputs`.
-
-`[min; max]` define the clamping range for the `inputs` data.
-`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-then de-quantized and output as floats in `[min; max]` interval.
-`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
-
-This operation has a gradient and thus allows for training `min` and `max`
-values.
-END
-}
-op {
-  graph_op_name: "FakeQuantWithMinMaxVarsPerChannelGradient"
-  endpoint {
-    name: "FakeQuantWithMinMaxVarsPerChannelGradient"
-  }
-  summary: "Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation."
-}
-op {
-  graph_op_name: "FakeQueue"
-  endpoint {
-    name: "FakeQueue"
-  }
-  summary: "Deprecated. Do not use."
-}
-op {
-  graph_op_name: "Fill"
-  endpoint {
-    name: "Fill"
-  }
-  summary: "Creates a tensor filled with a scalar value."
-  description: <<END
-This operation creates a tensor of shape `dims` and fills it with `value`.
-
-For example:
-
-```
-# Output tensor has shape [2, 3].
-fill([2, 3], 9) ==> [[9, 9, 9]
-                     [9, 9, 9]]
-```
-END
-}
-op {
-  graph_op_name: "FilterDataset"
-  endpoint {
-    name: "FilterDataset"
-  }
-  summary: "Creates a dataset containing elements of `input_dataset` matching `predicate`."
-  description: <<END
-The `predicate` function must return a scalar boolean and accept the
-following arguments:
-
-* One tensor for each component of an element of `input_dataset`.
-* One tensor for each value in `other_arguments`.
-END
-}
-op {
-  graph_op_name: "FixedLengthRecordDataset"
-  endpoint {
-    name: "FixedLengthRecordDataset"
-  }
-  summary: "Creates a dataset that emits the records from one or more binary files."
-}
-op {
-  graph_op_name: "FixedLengthRecordReader"
-  endpoint {
-    name: "FixedLengthRecordReader"
-  }
-  summary: "A Reader that outputs fixed-length records from a file."
-}
-op {
-  graph_op_name: "FixedLengthRecordReaderV2"
-  endpoint {
-    name: "FixedLengthRecordReaderV2"
-  }
-  summary: "A Reader that outputs fixed-length records from a file."
-}
-op {
-  graph_op_name: "FixedUnigramCandidateSampler"
-  endpoint {
-    name: "FixedUnigramCandidateSampler"
-  }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: <<END
-A unigram sampler could use a fixed unigram distribution read from a
-file or passed in as an in-memory array instead of building up the distribution
-from data on the fly. There is also an option to skew the distribution by
-applying a distortion power to the weights.
-
-The vocabulary file should be in CSV-like format, with the last field
-being the weight associated with the word.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-END
-}
-op {
-  graph_op_name: "FlatMapDataset"
-  endpoint {
-    name: "FlatMapDataset"
-  }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: <<END
-Unlike MapDataset, the `f` in FlatMapDataset is expected to return a
-Dataset variant, and FlatMapDataset will flatten successive results
-into a single Dataset.
-END
-}
-op {
-  graph_op_name: "Floor"
-  endpoint {
-    name: "Floor"
-  }
-  summary: "Returns element-wise largest integer not greater than x."
-}
-op {
-  graph_op_name: "FloorDiv"
-  endpoint {
-    name: "FloorDiv"
-  }
-  summary: "Returns x // y element-wise."
-  description: <<END
-*NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "FloorMod"
-  endpoint {
-    name: "FloorMod"
-  }
-  summary: "Returns element-wise remainder of division. When `x < 0` xor `y < 0` is"
-  description: <<END
-true, this follows Python semantics in that the result here is consistent
-with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
-
-*NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "FractionalAvgPool"
-  endpoint {
-    name: "FractionalAvgPool"
-  }
-  summary: "Performs fractional average pooling on the input."
-  description: <<END
-Fractional average pooling is similar to Fractional max pooling in the pooling
-region generation step. The only difference is that after pooling regions are
-generated, a mean operation is performed instead of a max operation in each
-pooling region.
-END
-}
-op {
-  graph_op_name: "FractionalAvgPoolGrad"
-  endpoint {
-    name: "FractionalAvgPoolGrad"
-  }
-  summary: "Computes gradient of the FractionalAvgPool function."
-  description: <<END
-Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-out_backprop to those indices that form the same pooling cell. Therefore, we
-just need to know the shape of original input tensor, instead of the whole
-tensor.
-END
-}
-op {
-  graph_op_name: "FractionalMaxPool"
-  endpoint {
-    name: "FractionalMaxPool"
-  }
-  summary: "Performs fractional max pooling on the input."
-  description: <<END
-Fractional max pooling is slightly different than regular max pooling.  In
-regular max pooling, you downsize an input set by taking the maximum value of
-smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-a factor of N, where N is an integer.  Fractional max pooling, as you might
-expect from the word "fractional", means that the overall reduction ratio N
-does not have to be an integer.
-
-The sizes of the pooling regions are generated randomly but are fairly uniform.
-For example, let's look at the height dimension, and the constraints on the
-list of rows that will be pool boundaries.
-
-First we define the following:
-
-1.  input_row_length : the number of rows from the input set
-2.  output_row_length : which will be smaller than the input
-3.  alpha = input_row_length / output_row_length : our reduction ratio
-4.  K = floor(alpha)
-5.  row_pooling_sequence : this is the result list of pool boundary rows
-
-Then, row_pooling_sequence should satisfy:
-
-1.  a[0] = 0 : the first value of the sequence is 0
-2.  a[end] = input_row_length : the last value of the sequence is the size
-3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-4.  length(row_pooling_sequence) = output_row_length+1
-
-For more details on fractional max pooling, see this paper:
-[Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
-END
-}
-op {
-  graph_op_name: "FractionalMaxPoolGrad"
-  endpoint {
-    name: "FractionalMaxPoolGrad"
-  }
-  summary: "Computes gradient of the FractionalMaxPool function."
-}
-op {
-  graph_op_name: "FusedBatchNorm"
-  endpoint {
-    name: "FusedBatchNorm"
-  }
-  summary: "Batch normalization."
-  description: <<END
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-END
-}
-op {
-  graph_op_name: "FusedBatchNormGrad"
-  endpoint {
-    name: "FusedBatchNormGrad"
-  }
-  summary: "Gradient for batch normalization."
-  description: <<END
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-END
-}
-op {
-  graph_op_name: "FusedBatchNormGradV2"
-  endpoint {
-    name: "FusedBatchNormGradV2"
-  }
-  summary: "Gradient for batch normalization."
-  description: <<END
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-END
-}
-op {
-  graph_op_name: "FusedBatchNormV2"
-  endpoint {
-    name: "FusedBatchNormV2"
-  }
-  summary: "Batch normalization."
-  description: <<END
-Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-The size of 1D Tensors matches the dimension C of the 4D Tensors.
-END
-}
-op {
-  graph_op_name: "FusedPadConv2D"
-  endpoint {
-    name: "FusedPadConv2D"
-  }
-  summary: "Performs a padding as a preprocess during a convolution."
-  description: <<END
-Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-implementation where the spatial padding transformation stage is fused with the
-im2col lookup, but in this case without the bilinear filtering required for
-resizing. Fusing the padding prevents the need to write out the intermediate
-results as whole tensors, reducing memory pressure, and we can get some latency
-gains by merging the transformation calculations.
-The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-order is used instead.
-Internally this op uses a single per-graph scratch buffer, which means that it
-will block if multiple versions are being run in parallel. This is because this
-operator is primarily an optimization to minimize memory usage.
-END
-}
-op {
-  graph_op_name: "FusedResizeAndPadConv2D"
-  endpoint {
-    name: "FusedResizeAndPadConv2D"
-  }
-  summary: "Performs a resize and padding as a preprocess during a convolution."
-  description: <<END
-It's often possible to do spatial transformations more efficiently as part of
-the packing stage of a convolution, so this op allows for an optimized
-implementation where these stages are fused together. This prevents the need to
-write out the intermediate results as whole tensors, reducing memory pressure,
-and we can get some latency gains by merging the transformation calculations.
-The data_format attribute for Conv2D isn't supported by this op, and defaults to
-'NHWC' order.
-Internally this op uses a single per-graph scratch buffer, which means that it
-will block if multiple versions are being run in parallel. This is because this
-operator is primarily an optimization to minimize memory usage.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e48d6c169b6641ece5f11d5add478ce25611ee8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "FFT"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same shape as `input`. The inner-most
+  dimension of `input` is replaced with its 1D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.fft
+@end_compatibility
+END
+  }
+  summary: "Fast Fourier transform."
+  description: <<END
+Computes the 1-dimensional discrete Fourier transform over the inner-most
+dimension of `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..555f8e60673d71e43dbb5d4dc17ae345606a2089
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "FFT2D"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same shape as `input`. The inner-most 2
+  dimensions of `input` are replaced with their 2D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.fft2
+@end_compatibility
+END
+  }
+  summary: "2D fast Fourier transform."
+  description: <<END
+Computes the 2-dimensional discrete Fourier transform over the inner-most
+2 dimensions of `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..abd2e67bcebb70c5a0957996284bab53f106f5b5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FFT3D.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "FFT3D"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same shape as `input`. The inner-most 3
+  dimensions of `input` are replaced with their 3D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.fftn with 3 dimensions.
+@end_compatibility
+END
+  }
+  summary: "3D fast Fourier transform."
+  description: <<END
+Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+dimensions of `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FIFOQueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_FIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..751f73d66e0247cf0b45058a8ac4567c86fbfb26
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FIFOQueue.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "FIFOQueue"
+  visibility: SKIP
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types. If the length of
+this attr is 0, the shapes of queue elements are not constrained, and
+only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that produces elements in first-in first-out order."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FIFOQueueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7b84ff2a7db36316b831b7e95691952ba7cffd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FIFOQueueV2.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "FIFOQueueV2"
+  endpoint {
+    name: "FIFOQueue"
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types. If the length of
+this attr is 0, the shapes of queue elements are not constrained, and
+only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that produces elements in first-in first-out order."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Fact.pbtxt b/tensorflow/core/api_def/base_api/api_def_Fact.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9aad4aac327bb40a18e6c6761f407acf4506e7e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Fact.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Fact"
+  summary: "Output a fact about factorials."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..561c86ddf68a4fb093d263d076fb6ccc8d408733
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxArgs"
+  summary: "Fake-quantize the \'inputs\' tensor, type float to \'outputs\' tensor of same type."
+  description: <<END
+Attributes `[min; max]` define the clamping range for the `inputs` data.
+`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+then de-quantized and output as floats in `[min; max]` interval.
+`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+
+Quantization is called fake since the output is still in floating point.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5241acc559ead80e239c352f9a51017bca8340df
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxArgsGradient"
+  in_arg {
+    name: "gradients"
+    description: <<END
+Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+END
+  }
+  in_arg {
+    name: "inputs"
+    description: <<END
+Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+`gradients * (inputs >= min && inputs <= max)`.
+END
+  }
+  summary: "Compute gradients for a FakeQuantWithMinMaxArgs operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2713c01b27f6bc45eb6117047243f06873d4dd87
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVars.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVars"
+  summary: "Fake-quantize the \'inputs\' tensor of type float via global float scalars `min`"
+  description: <<END
+and `max` to 'outputs' tensor of same shape as `inputs`.
+
+`[min; max]` define the clamping range for the `inputs` data.
+`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+then de-quantized and output as floats in `[min; max]` interval.
+`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+
+This operation has a gradient and thus allows for training `min` and `max`
+values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d07d3b333b71cb270ddaccdded048d2848670732
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsGradient"
+  in_arg {
+    name: "gradients"
+    description: <<END
+Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+END
+  }
+  in_arg {
+    name: "inputs"
+    description: <<END
+Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+min, max: Quantization interval, scalar floats.
+END
+  }
+  out_arg {
+    name: "backprops_wrt_input"
+    description: <<END
+Backpropagated gradients w.r.t. inputs:
+`gradients * (inputs >= min && inputs <= max)`.
+END
+  }
+  out_arg {
+    name: "backprop_wrt_min"
+    description: <<END
+Backpropagated gradients w.r.t. min parameter:
+`sum(gradients * (inputs < min))`.
+END
+  }
+  out_arg {
+    name: "backprop_wrt_max"
+    description: <<END
+Backpropagated gradients w.r.t. max parameter:
+`sum(gradients * (inputs > max))`.
+END
+  }
+  attr {
+    name: "num_bits"
+    description: <<END
+The bitwidth of the quantization; between 2 and 8, inclusive.
+END
+  }
+  attr {
+    name: "narrow_range"
+    description: <<END
+Whether to quantize into 2^num_bits - 1 distinct values.
+END
+  }
+  summary: "Compute gradients for a FakeQuantWithMinMaxVars operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e293d4d084bc90f24ee0cc1111f750ddfa46465b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsPerChannel"
+  summary: "Fake-quantize the \'inputs\' tensor of type float and one of the shapes: `[d]`,"
+  description: <<END
+`[b, d]` `[b, h, w, d]` via per-channel floats `min` and `max` of shape `[d]`
+to 'outputs' tensor of same shape as `inputs`.
+
+`[min; max]` define the clamping range for the `inputs` data.
+`inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+then de-quantized and output as floats in `[min; max]` interval.
+`num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
+
+This operation has a gradient and thus allows for training `min` and `max`
+values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a4ab368b5a8c4d8ac756513da14796ff3a41551
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  in_arg {
+    name: "gradients"
+    description: <<END
+Backpropagated gradients above the FakeQuantWithMinMaxVars operation,
+shape one of: `[d]`, `[b, d]`,  `[b, h, w, d]`.
+END
+  }
+  in_arg {
+    name: "inputs"
+    description: <<END
+Values passed as inputs to the FakeQuantWithMinMaxVars operation, shape
+  same as `gradients`.
+min, max: Quantization interval, floats of shape `[d]`.
+END
+  }
+  out_arg {
+    name: "backprops_wrt_input"
+    description: <<END
+Backpropagated gradients w.r.t. inputs, shape same as
+`inputs`:
+  `gradients * (inputs >= min && inputs <= max)`.
+END
+  }
+  out_arg {
+    name: "backprop_wrt_min"
+    description: <<END
+Backpropagated gradients w.r.t. min parameter, shape `[d]`:
+`sum_per_d(gradients * (inputs < min))`.
+END
+  }
+  out_arg {
+    name: "backprop_wrt_max"
+    description: <<END
+Backpropagated gradients w.r.t. max parameter, shape `[d]`:
+`sum_per_d(gradients * (inputs > max))`.
+END
+  }
+  attr {
+    name: "num_bits"
+    description: <<END
+The bitwidth of the quantization; between 2 and 8, inclusive.
+END
+  }
+  attr {
+    name: "narrow_range"
+    description: <<END
+Whether to quantize into 2^num_bits - 1 distinct values.
+END
+  }
+  summary: "Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeQueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..224862246ecb8fe664e52bb569ca4d5f49e3f7d0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeQueue.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "FakeQueue"
+  visibility: SKIP
+  summary: "Deprecated. Do not use."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt b/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58262a385c356816df1d119324731dbf7176376d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Fill.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "Fill"
+  in_arg {
+    name: "dims"
+    description: <<END
+1-D. Represents the shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+0-D (scalar). Value to fill the returned tensor.
+
+@compatibility(numpy)
+Equivalent to np.full
+@end_compatibility
+END
+  }
+  summary: "Creates a tensor filled with a scalar value."
+  description: <<END
+This operation creates a tensor of shape `dims` and fills it with `value`.
+
+For example:
+
+```
+# Output tensor has shape [2, 3].
+fill([2, 3], 9) ==> [[9, 9, 9]
+                     [9, 9, 9]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd60c0f3785a22f456c63285bf59381e6a2a5d66
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "FilterDataset"
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `predicate`.
+END
+  }
+  attr {
+    name: "predicate"
+    description: <<END
+A function returning a scalar boolean.
+END
+  }
+  summary: "Creates a dataset containing elements of `input_dataset` matching `predicate`."
+  description: <<END
+The `predicate` function must return a scalar boolean and accept the
+following arguments:
+
+* One tensor for each component of an element of `input_dataset`.
+* One tensor for each value in `other_arguments`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..651b84d0d660a0bfc0ef45dd841dfc51ee1e3340
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "FixedLengthRecordDataset"
+  in_arg {
+    name: "filenames"
+    description: <<END
+A scalar or a vector containing the name(s) of the file(s) to be
+read.
+END
+  }
+  in_arg {
+    name: "header_bytes"
+    description: <<END
+A scalar representing the number of bytes to skip at the
+beginning of a file.
+END
+  }
+  in_arg {
+    name: "record_bytes"
+    description: <<END
+A scalar representing the number of bytes in each record.
+END
+  }
+  in_arg {
+    name: "footer_bytes"
+    description: <<END
+A scalar representing the number of bytes to skip at the end
+of a file.
+END
+  }
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+A scalar representing the number of bytes to buffer. Must be > 0.
+END
+  }
+  summary: "Creates a dataset that emits the records from one or more binary files."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordReader.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d7f3cbb4387101b9783296ec413cfb666ed3f21
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordReader.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "FixedLengthRecordReader"
+  visibility: SKIP
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "header_bytes"
+    description: <<END
+Number of bytes in the header, defaults to 0.
+END
+  }
+  attr {
+    name: "record_bytes"
+    description: <<END
+Number of bytes in the record.
+END
+  }
+  attr {
+    name: "footer_bytes"
+    description: <<END
+Number of bytes in the footer, defaults to 0.
+END
+  }
+  attr {
+    name: "hop_bytes"
+    description: <<END
+Number of bytes to hop before each read. Default of 0 means using
+record_bytes.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs fixed-length records from a file."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordReaderV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a9067a592f8fe1da61b5ba835c38dd049e22cda
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordReaderV2.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "FixedLengthRecordReaderV2"
+  endpoint {
+    name: "FixedLengthRecordReader"
+  }
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "header_bytes"
+    description: <<END
+Number of bytes in the header, defaults to 0.
+END
+  }
+  attr {
+    name: "record_bytes"
+    description: <<END
+Number of bytes in the record.
+END
+  }
+  attr {
+    name: "footer_bytes"
+    description: <<END
+Number of bytes in the footer, defaults to 0.
+END
+  }
+  attr {
+    name: "hop_bytes"
+    description: <<END
+Number of bytes to hop before each read. Default of 0 means using
+record_bytes.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  attr {
+    name: "encoding"
+    description: <<END
+The type of encoding for the file. Currently ZLIB and GZIP
+are supported. Defaults to none.
+END
+  }
+  summary: "A Reader that outputs fixed-length records from a file."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c40b161225ea916433a36ff91c4668ed5bc6b68
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FixedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,144 @@
+op {
+  graph_op_name: "FixedUnigramCandidateSampler"
+  in_arg {
+    name: "true_classes"
+    description: <<END
+A batch_size * num_true matrix, in which each row contains the
+IDs of the num_true target_classes in the corresponding original label.
+END
+  }
+  out_arg {
+    name: "sampled_candidates"
+    description: <<END
+A vector of length num_sampled, in which each element is
+the ID of a sampled candidate.
+END
+  }
+  out_arg {
+    name: "true_expected_count"
+    description: <<END
+A batch_size * num_true matrix, representing
+the number of times each candidate is expected to occur in a batch
+of sampled candidates. If unique=true, then this is a probability.
+END
+  }
+  out_arg {
+    name: "sampled_expected_count"
+    description: <<END
+A vector of length num_sampled, for each sampled
+candidate representing the number of times the candidate is expected
+to occur in a batch of sampled candidates.  If unique=true, then this is a
+probability.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "num_sampled"
+    description: <<END
+Number of candidates to randomly sample.
+END
+  }
+  attr {
+    name: "unique"
+    description: <<END
+If unique is true, we sample with rejection, so that all sampled
+candidates in a batch are unique. This requires some approximation to
+estimate the post-rejection sampling probabilities.
+END
+  }
+  attr {
+    name: "range_max"
+    description: <<END
+The sampler will sample integers from the interval [0, range_max).
+END
+  }
+  attr {
+    name: "vocab_file"
+    description: <<END
+Each valid line in this file (which should have a CSV-like format)
+corresponds to a valid word ID. IDs are in sequential order, starting from
+num_reserved_ids. The last entry in each line is expected to be a value
+corresponding to the count or relative probability. Exactly one of vocab_file
+and unigrams needs to be passed to this op.
+END
+  }
+  attr {
+    name: "distortion"
+    description: <<END
+The distortion is used to skew the unigram probability distribution.
+Each weight is first raised to the distortion's power before adding to the
+internal unigram distribution. As a result, distortion = 1.0 gives regular
+unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
+a uniform distribution.
+END
+  }
+  attr {
+    name: "num_reserved_ids"
+    description: <<END
+Optionally some reserved IDs can be added in the range [0,
+..., num_reserved_ids) by the users. One use case is that a special unknown
+word token is used as ID 0. These IDs will have a sampling probability of 0.
+END
+  }
+  attr {
+    name: "num_shards"
+    description: <<END
+A sampler can be used to sample from a subset of the original range
+in order to speed up the whole computation through parallelism. This parameter
+(together with 'shard') indicates the number of partitions that are being
+used in the overall computation.
+END
+  }
+  attr {
+    name: "shard"
+    description: <<END
+A sampler can be used to sample from a subset of the original range
+in order to speed up the whole computation through parallelism. This parameter
+(together with 'num_shards') indicates the particular partition number of a
+sampler op, when partitioning is being used.
+END
+  }
+  attr {
+    name: "unigrams"
+    description: <<END
+A list of unigram counts or probabilities, one per ID in sequential
+order. Exactly one of vocab_file and unigrams should be passed to this op.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Generates labels for candidate sampling with a learned unigram distribution."
+  description: <<END
+A unigram sampler could use a fixed unigram distribution read from a
+file or passed in as an in-memory array instead of building up the distribution
+from data on the fly. There is also an option to skew the distribution by
+applying a distortion power to the weights.
+
+The vocabulary file should be in CSV-like format, with the last field
+being the weight associated with the word.
+
+For each batch, this op picks a single set of sampled candidate labels.
+
+The advantages of sampling candidates per-batch are simplicity and the
+possibility of efficient dense matrix multiplication. The disadvantage is that
+the sampled candidates must be chosen independently of the context and of the
+true labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1936119c50f5323e69465a79cda784afc68c3aca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "FlatMapDataset"
+  attr {
+    name: "f"
+    description: <<END
+A function mapping elements of `input_dataset`, concatenated with
+`other_arguments`, to a Dataset variant that contains elements matching
+`output_types` and `output_shapes`.
+END
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: <<END
+Unlike MapDataset, the `f` in FlatMapDataset is expected to return a
+Dataset variant, and FlatMapDataset will flatten successive results
+into a single Dataset.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Floor.pbtxt b/tensorflow/core/api_def/base_api/api_def_Floor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ecb697cc7a03ae3334ad934547142306e2c53ea9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Floor.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Floor"
+  summary: "Returns element-wise largest integer not greater than x."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_FloorDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..913d4a1a52119825e55362a5c624e553564349aa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FloorDiv.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "FloorDiv"
+  summary: "Returns x // y element-wise."
+  description: <<END
+*NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/base_api/api_def_FloorMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3c0be91ae8eac6b8177d0f38937c3255d003043
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FloorMod.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "FloorMod"
+  summary: "Returns element-wise remainder of division. When `x < 0` xor `y < 0` is"
+  description: <<END
+true, this follows Python semantics in that the result here is consistent
+with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+
+*NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/base_api/api_def_FractionalAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03495b7ea51cc22022837e7a8cb1391e58357d66
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FractionalAvgPool.pbtxt
@@ -0,0 +1,90 @@
+op {
+  graph_op_name: "FractionalAvgPool"
+  in_arg {
+    name: "value"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+output tensor after fractional avg pooling.
+END
+  }
+  out_arg {
+    name: "row_pooling_sequence"
+    description: <<END
+row pooling sequence, needed to calculate gradient.
+END
+  }
+  out_arg {
+    name: "col_pooling_sequence"
+    description: <<END
+column pooling sequence, needed to calculate gradient.
+END
+  }
+  attr {
+    name: "pooling_ratio"
+    description: <<END
+Pooling ratio for each dimension of `value`, currently only
+supports row and col dimension and should be >= 1.0. For example, a valid
+pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+must be 1.0 because we don't allow pooling on batch and channels
+dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+respectively.
+END
+  }
+  attr {
+    name: "pseudo_random"
+    description: <<END
+When set to True, generates the pooling sequence in a
+pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+difference between pseudorandom and random.
+END
+  }
+  attr {
+    name: "overlapping"
+    description: <<END
+When set to True, it means when pooling, the values at the boundary
+of adjacent pooling cells are used by both cells. For example:
+
+`index  0  1  2  3  4`
+
+`value  20 5  16 3  7`
+
+If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+The result would be [41/3, 26/3] for fractional avg pooling.
+END
+  }
+  attr {
+    name: "deterministic"
+    description: <<END
+When set to True, a fixed pooling region will be used when
+iterating over a FractionalAvgPool node in the computation graph. Mainly used
+in unit test to make FractionalAvgPool deterministic.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Performs fractional average pooling on the input."
+  description: <<END
+Fractional average pooling is similar to Fractional max pooling in the pooling
+region generation step. The only difference is that after pooling regions are
+generated, a mean operation is performed instead of a max operation in each
+pooling region.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FractionalAvgPoolGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_FractionalAvgPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0cda03295945ee86f0f4e9d77159bdd7f9b2ab9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FractionalAvgPoolGrad.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "FractionalAvgPoolGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "orig_input_tensor_shape"
+    description: <<END
+Original input tensor shape for `fractional_avg_pool`
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.  Gradients
+w.r.t. the output of `fractional_avg_pool`.
+END
+  }
+  in_arg {
+    name: "row_pooling_sequence"
+    description: <<END
+row pooling sequence, form pooling region with
+col_pooling_sequence.
+END
+  }
+  in_arg {
+    name: "col_pooling_sequence"
+    description: <<END
+column pooling sequence, form pooling region with
+row_pooling sequence.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+END
+  }
+  attr {
+    name: "overlapping"
+    description: <<END
+When set to True, it means when pooling, the values at the boundary
+of adjacent pooling cells are used by both cells. For example:
+
+`index  0  1  2  3  4`
+
+`value  20 5  16 3  7`
+
+If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+The result would be [41/3, 26/3] for fractional avg pooling.
+END
+  }
+  summary: "Computes gradient of the FractionalAvgPool function."
+  description: <<END
+Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+out_backprop to those indices that form the same pooling cell. Therefore, we
+just need to know the shape of original input tensor, instead of the whole
+tensor.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/base_api/api_def_FractionalMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..efc7719329dabee1e50f13e325c791988f11a562
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FractionalMaxPool.pbtxt
@@ -0,0 +1,114 @@
+op {
+  graph_op_name: "FractionalMaxPool"
+  in_arg {
+    name: "value"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+output tensor after fractional max pooling.
+END
+  }
+  out_arg {
+    name: "row_pooling_sequence"
+    description: <<END
+row pooling sequence, needed to calculate gradient.
+END
+  }
+  out_arg {
+    name: "col_pooling_sequence"
+    description: <<END
+column pooling sequence, needed to calculate gradient.
+END
+  }
+  attr {
+    name: "pooling_ratio"
+    description: <<END
+Pooling ratio for each dimension of `value`, currently only
+supports row and col dimension and should be >= 1.0. For example, a valid
+pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+must be 1.0 because we don't allow pooling on batch and channels
+dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+respectively.
+END
+  }
+  attr {
+    name: "pseudo_random"
+    description: <<END
+When set to True, generates the pooling sequence in a
+pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+difference between pseudorandom and random.
+END
+  }
+  attr {
+    name: "overlapping"
+    description: <<END
+When set to True, it means when pooling, the values at the boundary
+of adjacent pooling cells are used by both cells. For example:
+
+`index  0  1  2  3  4`
+
+`value  20 5  16 3  7`
+
+If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+The result would be [20, 16] for fractional max pooling.
+END
+  }
+  attr {
+    name: "deterministic"
+    description: <<END
+When set to True, a fixed pooling region will be used when
+iterating over a FractionalMaxPool node in the computation graph. Mainly used
+in unit test to make FractionalMaxPool deterministic.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Performs fractional max pooling on the input."
+  description: <<END
+Fractional max pooling is slightly different than regular max pooling.  In
+regular max pooling, you downsize an input set by taking the maximum value of
+smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+a factor of N, where N is an integer.  Fractional max pooling, as you might
+expect from the word "fractional", means that the overall reduction ratio N
+does not have to be an integer.
+
+The sizes of the pooling regions are generated randomly but are fairly uniform.
+For example, let's look at the height dimension, and the constraints on the
+list of rows that will be pool boundaries.
+
+First we define the following:
+
+1.  input_row_length : the number of rows from the input set
+2.  output_row_length : which will be smaller than the input
+3.  alpha = input_row_length / output_row_length : our reduction ratio
+4.  K = floor(alpha)
+5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+Then, row_pooling_sequence should satisfy:
+
+1.  a[0] = 0 : the first value of the sequence is 0
+2.  a[end] = input_row_length : the last value of the sequence is the size
+3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+4.  length(row_pooling_sequence) = output_row_length+1
+
+For more details on fractional max pooling, see this paper:
+[Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FractionalMaxPoolGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_FractionalMaxPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7faa5b24a04e4cd31d4d2a8c84427307e6a6015
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FractionalMaxPoolGrad.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "FractionalMaxPoolGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "orig_input"
+    description: <<END
+Original input for `fractional_max_pool`
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+Original output for `fractional_max_pool`
+END
+  }
+  in_arg {
+    name: "out_backprop"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.  Gradients
+w.r.t. the output of `fractional_max_pool`.
+END
+  }
+  in_arg {
+    name: "row_pooling_sequence"
+    description: <<END
+row pooling sequence, form pooling region with
+col_pooling_sequence.
+END
+  }
+  in_arg {
+    name: "col_pooling_sequence"
+    description: <<END
+column pooling sequence, form pooling region with
+row_pooling sequence.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+END
+  }
+  attr {
+    name: "overlapping"
+    description: <<END
+When set to True, it means when pooling, the values at the boundary
+of adjacent pooling cells are used by both cells. For example:
+
+`index  0  1  2  3  4`
+
+`value  20 5  16 3  7`
+
+If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+The result would be [20, 16] for fractional max pooling.
+END
+  }
+  summary: "Computes gradient of the FractionalMaxPool function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedBatchNorm.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedBatchNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f065d96fc20eac081a309e06a2367ce016be84d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FusedBatchNorm.pbtxt
@@ -0,0 +1,99 @@
+op {
+  graph_op_name: "FusedBatchNorm"
+  in_arg {
+    name: "x"
+    description: <<END
+A 4D Tensor for input data.
+END
+  }
+  in_arg {
+    name: "scale"
+    description: <<END
+A 1D Tensor for scaling factor, to scale the normalized x.
+END
+  }
+  in_arg {
+    name: "offset"
+    description: <<END
+A 1D Tensor for offset, to shift to the normalized x.
+END
+  }
+  in_arg {
+    name: "mean"
+    description: <<END
+A 1D Tensor for population mean. Used for inference only;
+must be empty for training.
+END
+  }
+  in_arg {
+    name: "variance"
+    description: <<END
+A 1D Tensor for population variance. Used for inference only;
+must be empty for training.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A 4D Tensor for output data.
+END
+  }
+  out_arg {
+    name: "batch_mean"
+    description: <<END
+A 1D Tensor for the computed batch mean, to be used by TensorFlow
+to compute the running mean.
+END
+  }
+  out_arg {
+    name: "batch_variance"
+    description: <<END
+A 1D Tensor for the computed batch variance, to be used by
+TensorFlow to compute the running variance.
+END
+  }
+  out_arg {
+    name: "reserve_space_1"
+    description: <<END
+A 1D Tensor for the computed batch mean, to be reused
+in the gradient computation.
+END
+  }
+  out_arg {
+    name: "reserve_space_2"
+    description: <<END
+A 1D Tensor for the computed batch variance (inverted variance
+in the cuDNN case), to be reused in the gradient computation.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The data type for the elements of input and output Tensors.
+END
+  }
+  attr {
+    name: "epsilon"
+    description: <<END
+A small float number added to the variance of x.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format for x and y. Either "NHWC" (default) or "NCHW".
+END
+  }
+  attr {
+    name: "is_training"
+    description: <<END
+A bool value to indicate the operation is for training (default)
+or inference.
+END
+  }
+  summary: "Batch normalization."
+  description: <<END
+Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+The size of 1D Tensors matches the dimension C of the 4D Tensors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d436e36906cdbde2967cc49d1b7f9e86c7fe77d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGrad.pbtxt
@@ -0,0 +1,102 @@
+op {
+  graph_op_name: "FusedBatchNormGrad"
+  in_arg {
+    name: "y_backprop"
+    description: <<END
+A 4D Tensor for the gradient with respect to y.
+END
+  }
+  in_arg {
+    name: "x"
+    description: <<END
+A 4D Tensor for input data.
+END
+  }
+  in_arg {
+    name: "scale"
+    description: <<END
+A 1D Tensor for scaling factor, to scale the normalized x.
+END
+  }
+  in_arg {
+    name: "reserve_space_1"
+    description: <<END
+When is_training is True, a 1D Tensor for the computed batch
+mean to be reused in gradient computation. When is_training is
+False, a 1D Tensor for the population mean to be reused in both
+1st and 2nd order gradient computation.
+END
+  }
+  in_arg {
+    name: "reserve_space_2"
+    description: <<END
+When is_training is True, a 1D Tensor for the computed batch
+variance (inverted variance in the cuDNN case) to be reused in
+gradient computation. When is_training is False, a 1D Tensor
+for the population variance to be reused in both 1st and 2nd
+order gradient computation.
+END
+  }
+  out_arg {
+    name: "x_backprop"
+    description: <<END
+A 4D Tensor for the gradient with respect to x.
+END
+  }
+  out_arg {
+    name: "scale_backprop"
+    description: <<END
+A 1D Tensor for the gradient with respect to scale.
+END
+  }
+  out_arg {
+    name: "offset_backprop"
+    description: <<END
+A 1D Tensor for the gradient with respect to offset.
+END
+  }
+  out_arg {
+    name: "reserve_space_3"
+    description: <<END
+Unused placeholder to match the mean input in FusedBatchNorm.
+END
+  }
+  out_arg {
+    name: "reserve_space_4"
+    description: <<END
+Unused placeholder to match the variance input
+in FusedBatchNorm.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The data type for the elements of input and output Tensors.
+END
+  }
+  attr {
+    name: "epsilon"
+    description: <<END
+A small float number added to the variance of x.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format for y_backprop, x, x_backprop.
+Either "NHWC" (default) or "NCHW".
+END
+  }
+  attr {
+    name: "is_training"
+    description: <<END
+A bool value to indicate the operation is for training (default)
+or inference.
+END
+  }
+  summary: "Gradient for batch normalization."
+  description: <<END
+Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+The size of 1D Tensors matches the dimension C of the 4D Tensors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGradV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8f04093a41b7a1418e67f344b0044cdf3ebcde6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormGradV2.pbtxt
@@ -0,0 +1,108 @@
+op {
+  graph_op_name: "FusedBatchNormGradV2"
+  in_arg {
+    name: "y_backprop"
+    description: <<END
+A 4D Tensor for the gradient with respect to y.
+END
+  }
+  in_arg {
+    name: "x"
+    description: <<END
+A 4D Tensor for input data.
+END
+  }
+  in_arg {
+    name: "scale"
+    description: <<END
+A 1D Tensor for scaling factor, to scale the normalized x.
+END
+  }
+  in_arg {
+    name: "reserve_space_1"
+    description: <<END
+When is_training is True, a 1D Tensor for the computed batch
+mean to be reused in gradient computation. When is_training is
+False, a 1D Tensor for the population mean to be reused in both
+1st and 2nd order gradient computation.
+END
+  }
+  in_arg {
+    name: "reserve_space_2"
+    description: <<END
+When is_training is True, a 1D Tensor for the computed batch
+variance (inverted variance in the cuDNN case) to be reused in
+gradient computation. When is_training is False, a 1D Tensor
+for the population variance to be reused in both 1st and 2nd
+order gradient computation.
+END
+  }
+  out_arg {
+    name: "x_backprop"
+    description: <<END
+A 4D Tensor for the gradient with respect to x.
+END
+  }
+  out_arg {
+    name: "scale_backprop"
+    description: <<END
+A 1D Tensor for the gradient with respect to scale.
+END
+  }
+  out_arg {
+    name: "offset_backprop"
+    description: <<END
+A 1D Tensor for the gradient with respect to offset.
+END
+  }
+  out_arg {
+    name: "reserve_space_3"
+    description: <<END
+Unused placeholder to match the mean input in FusedBatchNorm.
+END
+  }
+  out_arg {
+    name: "reserve_space_4"
+    description: <<END
+Unused placeholder to match the variance input
+in FusedBatchNorm.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The data type for the elements of input and output Tensors.
+END
+  }
+  attr {
+    name: "U"
+    description: <<END
+The data type for the scale, offset, mean, and variance.
+END
+  }
+  attr {
+    name: "epsilon"
+    description: <<END
+A small float number added to the variance of x.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format for y_backprop, x, x_backprop.
+Either "NHWC" (default) or "NCHW".
+END
+  }
+  attr {
+    name: "is_training"
+    description: <<END
+A bool value to indicate the operation is for training (default)
+or inference.
+END
+  }
+  summary: "Gradient for batch normalization."
+  description: <<END
+Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+The size of 1D Tensors matches the dimension C of the 4D Tensors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedBatchNormV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df14adf49d015875dc05db25a7abb2e69a82b068
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FusedBatchNormV2.pbtxt
@@ -0,0 +1,105 @@
+op {
+  graph_op_name: "FusedBatchNormV2"
+  in_arg {
+    name: "x"
+    description: <<END
+A 4D Tensor for input data.
+END
+  }
+  in_arg {
+    name: "scale"
+    description: <<END
+A 1D Tensor for scaling factor, to scale the normalized x.
+END
+  }
+  in_arg {
+    name: "offset"
+    description: <<END
+A 1D Tensor for offset, to shift to the normalized x.
+END
+  }
+  in_arg {
+    name: "mean"
+    description: <<END
+A 1D Tensor for population mean. Used for inference only;
+must be empty for training.
+END
+  }
+  in_arg {
+    name: "variance"
+    description: <<END
+A 1D Tensor for population variance. Used for inference only;
+must be empty for training.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A 4D Tensor for output data.
+END
+  }
+  out_arg {
+    name: "batch_mean"
+    description: <<END
+A 1D Tensor for the computed batch mean, to be used by TensorFlow
+to compute the running mean.
+END
+  }
+  out_arg {
+    name: "batch_variance"
+    description: <<END
+A 1D Tensor for the computed batch variance, to be used by
+TensorFlow to compute the running variance.
+END
+  }
+  out_arg {
+    name: "reserve_space_1"
+    description: <<END
+A 1D Tensor for the computed batch mean, to be reused
+in the gradient computation.
+END
+  }
+  out_arg {
+    name: "reserve_space_2"
+    description: <<END
+A 1D Tensor for the computed batch variance (inverted variance
+in the cuDNN case), to be reused in the gradient computation.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The data type for the elements of input and output Tensors.
+END
+  }
+  attr {
+    name: "U"
+    description: <<END
+The data type for the scale, offset, mean, and variance.
+END
+  }
+  attr {
+    name: "epsilon"
+    description: <<END
+A small float number added to the variance of x.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format for x and y. Either "NHWC" (default) or "NCHW".
+END
+  }
+  attr {
+    name: "is_training"
+    description: <<END
+A bool value to indicate the operation is for training (default)
+or inference.
+END
+  }
+  summary: "Batch normalization."
+  description: <<END
+Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+The size of 1D Tensors matches the dimension C of the 4D Tensors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedPadConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c2c3eb0c5396dc2c6bce2527a250a995ee13a91
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FusedPadConv2D.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "FusedPadConv2D"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, in_channels]`.
+END
+  }
+  in_arg {
+    name: "paddings"
+    description: <<END
+A two-column matrix specifying the padding sizes. The number of
+rows must be the same as the rank of `input`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+4-D with shape
+`[filter_height, filter_width, in_channels, out_channels]`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D of length 4.  The stride of the sliding window for each dimension
+of `input`. Must be in the same order as the dimension specified with format.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Performs a padding as a preprocess during a convolution."
+  description: <<END
+Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+implementation where the spatial padding transformation stage is fused with the
+im2col lookup, but in this case without the bilinear filtering required for
+resizing. Fusing the padding prevents the need to write out the intermediate
+results as whole tensors, reducing memory pressure, and we can get some latency
+gains by merging the transformation calculations.
+The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+order is used instead.
+Internally this op uses a single per-graph scratch buffer, which means that it
+will block if multiple versions are being run in parallel. This is because this
+operator is primarily an optimization to minimize memory usage.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a72f2bfe5fc90ed7055a0d5354af81f8eee6a7d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "FusedResizeAndPadConv2D"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, in_height, in_width, in_channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+new size for the images.
+END
+  }
+  in_arg {
+    name: "paddings"
+    description: <<END
+A two-column matrix specifying the padding sizes. The number of
+rows must be the same as the rank of `input`.
+END
+  }
+  in_arg {
+    name: "filter"
+    description: <<END
+4-D with shape
+`[filter_height, filter_width, in_channels, out_channels]`.
+END
+  }
+  attr {
+    name: "resize_align_corners"
+    description: <<END
+If true, rescale input by (new_height - 1) / (height - 1),
+which exactly aligns the 4 corners of images and resized images. If false, rescale
+by new_height / height. Treat similarly the width dimension.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D of length 4.  The stride of the sliding window for each dimension
+of `input`. Must be in the same order as the dimension specified with format.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Performs a resize and padding as a preprocess during a convolution."
+  description: <<END
+It's often possible to do spatial transformations more efficiently as part of
+the packing stage of a convolution, so this op allows for an optimized
+implementation where these stages are fused together. This prevents the need to
+write out the intermediate results as whole tensors, reducing memory pressure,
+and we can get some latency gains by merging the transformation calculations.
+The data_format attribute for Conv2D isn't supported by this op, and defaults to
+'NHWC' order.
+Internally this op uses a single per-graph scratch buffer, which means that it
+will block if multiple versions are being run in parallel. This is because this
+operator is primarily an optimization to minimize memory usage.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_G.pbtxt b/tensorflow/core/api_def/base_api/api_def_G.pbtxt
deleted file mode 100644
index 343d505718149cf09c12a777a8b025ee176e4d2f..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_G.pbtxt
+++ /dev/null
@@ -1,257 +0,0 @@
-op {
-  graph_op_name: "Gather"
-  endpoint {
-    name: "Gather"
-  }
-  summary: "Gather slices from `params` according to `indices`."
-  description: <<END
-`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-
-```python
-    # Scalar indices
-    output[:, ..., :] = params[indices, :, ... :]
-
-    # Vector indices
-    output[i, :, ..., :] = params[indices[i], :, ... :]
-
-    # Higher rank indices
-    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-```
-
-If `indices` is a permutation and `len(indices) == params.shape[0]` then
-this operation will permute `params` accordingly.
-
-`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-`indices` are always validated to be within range. If assigned to GPU,
-out-of-bound indices result in safe but unspecified behavior, which may include
-raising an error.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "GatherNd"
-  endpoint {
-    name: "GatherNd"
-  }
-  summary: "Gather slices from `params` into a Tensor with shape specified by `indices`."
-  description: <<END
-`indices` is an K-dimensional integer tensor, best thought of as a
-(K-1)-dimensional tensor of indices into `params`, where each element defines a
-slice of `params`:
-
-    output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
-
-Whereas in @{tf.gather} `indices` defines slices into the first
-dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-first `N` dimensions of `params`, where `N = indices.shape[-1]`.
-
-The last dimension of `indices` can be at most the rank of
-`params`:
-
-    indices.shape[-1] <= params.rank
-
-The last dimension of `indices` corresponds to elements
-(if `indices.shape[-1] == params.rank`) or slices
-(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-of `params`.  The output tensor has shape
-
-    indices.shape[:-1] + params.shape[indices.shape[-1]:]
-
-Some examples below.
-
-Simple indexing into a matrix:
-
-```python
-    indices = [[0, 0], [1, 1]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = ['a', 'd']
-```
-
-Slice indexing into a matrix:
-
-```python
-    indices = [[1], [0]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = [['c', 'd'], ['a', 'b']]
-```
-
-Indexing into a 3-tensor:
-
-```python
-    indices = [[1]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [[['a1', 'b1'], ['c1', 'd1']]]
-
-
-    indices = [[0, 1], [1, 0]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [['c0', 'd0'], ['a1', 'b1']]
-
-
-    indices = [[0, 0, 1], [1, 0, 1]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = ['b0', 'b1']
-```
-
-Batched indexing into a matrix:
-
-```python
-    indices = [[[0, 0]], [[0, 1]]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = [['a'], ['b']]
-```
-
-Batched slice indexing into a matrix:
-
-```python
-    indices = [[[1]], [[0]]]
-    params = [['a', 'b'], ['c', 'd']]
-    output = [[['c', 'd']], [['a', 'b']]]
-```
-
-Batched indexing into a 3-tensor:
-
-```python
-    indices = [[[1]], [[0]]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [[[['a1', 'b1'], ['c1', 'd1']]],
-              [[['a0', 'b0'], ['c0', 'd0']]]]
-
-    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [[['c0', 'd0'], ['a1', 'b1']],
-              [['a0', 'b0'], ['c1', 'd1']]]
-
-
-    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-    params = [[['a0', 'b0'], ['c0', 'd0']],
-              [['a1', 'b1'], ['c1', 'd1']]]
-    output = [['b0', 'b1'], ['d0', 'c1']]
-```
-END
-}
-op {
-  graph_op_name: "GatherV2"
-  endpoint {
-    name: "GatherV2"
-  }
-  summary: "Gather slices from `params` axis `axis` according to `indices`."
-  description: <<END
-`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-params.shape[axis + 1:]` where:
-
-```python
-    # Scalar indices (output is rank(params) - 1).
-    output[a_0, ..., a_n, b_0, ..., b_n] =
-      params[a_0, ..., a_n, indices, b_0, ..., b_n]
-
-    # Vector indices (output is rank(params)).
-    output[a_0, ..., a_n, i, b_0, ..., b_n] =
-      params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-
-    # Higher rank indices (output is rank(params) + rank(indices) - 1).
-    output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-      params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-```
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "GenerateVocabRemapping"
-  endpoint {
-    name: "GenerateVocabRemapping"
-  }
-  summary: "Given a path to new and old vocabulary files, returns a remapping Tensor of"
-  description: <<END
-length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-`new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-in the new vocabulary is not in the old vocabulary.  `num_vocab_offset` enables
-use in the partitioned variable case, and should generally be set through
-examining partitioning info.  The format of the files should be a text file,
-with each line containing a single entity within the vocabulary.
-
-For example, with `new_vocab_file` a text file containing each of the following
-elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-`num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-`[0, -1, 2]`.
-
-The op also returns a count of how many entries in the new vocabulary
-were present in the old vocabulary, which is used to calculate the number of
-values to initialize in a weight matrix remapping
-
-This functionality can be used to remap both row vocabularies (typically,
-features) and column vocabularies (typically, classes) from TensorFlow
-checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-corresponding to div-partitioned variables.  Moreover, the underlying remapping
-uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-use the corresponding index_table_from_file() as the FeatureColumn framework
-does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-END
-}
-op {
-  graph_op_name: "GetSessionHandle"
-  endpoint {
-    name: "GetSessionHandle"
-  }
-  summary: "Store the input tensor in the state of the current session."
-}
-op {
-  graph_op_name: "GetSessionHandleV2"
-  endpoint {
-    name: "GetSessionHandleV2"
-  }
-  summary: "Store the input tensor in the state of the current session."
-}
-op {
-  graph_op_name: "GetSessionTensor"
-  endpoint {
-    name: "GetSessionTensor"
-  }
-  summary: "Get the value of the tensor specified by its handle."
-}
-op {
-  graph_op_name: "Greater"
-  endpoint {
-    name: "Greater"
-  }
-  summary: "Returns the truth value of (x > y) element-wise."
-  description: <<END
-*NOTE*: `Greater` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "GreaterEqual"
-  endpoint {
-    name: "GreaterEqual"
-  }
-  summary: "Returns the truth value of (x >= y) element-wise."
-  description: <<END
-*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "GroupByWindowDataset"
-  endpoint {
-    name: "GroupByWindowDataset"
-  }
-  summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
-  description: <<END
-// TODO(mrry): Support non-int64 keys.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Gather.pbtxt b/tensorflow/core/api_def/base_api/api_def_Gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6dcf2252ce6492e346ea2b3213a0f5075fcdfb25
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Gather.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "Gather"
+  summary: "Gather slices from `params` according to `indices`."
+  description: <<END
+`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+
+```python
+    # Scalar indices
+    output[:, ..., :] = params[indices, :, ... :]
+
+    # Vector indices
+    output[i, :, ..., :] = params[indices[i], :, ... :]
+
+    # Higher rank indices
+    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+```
+
+If `indices` is a permutation and `len(indices) == params.shape[0]` then
+this operation will permute `params` accordingly.
+
+`validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+`indices` are always validated to be within range. If assigned to GPU,
+out-of-bound indices result in safe but unspecified behavior, which may include
+raising an error.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7f8b6c21ba9fd85ee20c259425b04a8d4aade75
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GatherNd.pbtxt
@@ -0,0 +1,123 @@
+op {
+  graph_op_name: "GatherNd"
+  in_arg {
+    name: "params"
+    description: <<END
+The tensor from which to gather values.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Values from `params` gathered from indices given by `indices`, with
+shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
+END
+  }
+  summary: "Gather slices from `params` into a Tensor with shape specified by `indices`."
+  description: <<END
+`indices` is an K-dimensional integer tensor, best thought of as a
+(K-1)-dimensional tensor of indices into `params`, where each element defines a
+slice of `params`:
+
+    output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+
+Whereas in @{tf.gather} `indices` defines slices into the first
+dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+
+The last dimension of `indices` can be at most the rank of
+`params`:
+
+    indices.shape[-1] <= params.rank
+
+The last dimension of `indices` corresponds to elements
+(if `indices.shape[-1] == params.rank`) or slices
+(if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+of `params`.  The output tensor has shape
+
+    indices.shape[:-1] + params.shape[indices.shape[-1]:]
+
+Some examples below.
+
+Simple indexing into a matrix:
+
+```python
+    indices = [[0, 0], [1, 1]]
+    params = [['a', 'b'], ['c', 'd']]
+    output = ['a', 'd']
+```
+
+Slice indexing into a matrix:
+
+```python
+    indices = [[1], [0]]
+    params = [['a', 'b'], ['c', 'd']]
+    output = [['c', 'd'], ['a', 'b']]
+```
+
+Indexing into a 3-tensor:
+
+```python
+    indices = [[1]]
+    params = [[['a0', 'b0'], ['c0', 'd0']],
+              [['a1', 'b1'], ['c1', 'd1']]]
+    output = [[['a1', 'b1'], ['c1', 'd1']]]
+
+
+    indices = [[0, 1], [1, 0]]
+    params = [[['a0', 'b0'], ['c0', 'd0']],
+              [['a1', 'b1'], ['c1', 'd1']]]
+    output = [['c0', 'd0'], ['a1', 'b1']]
+
+
+    indices = [[0, 0, 1], [1, 0, 1]]
+    params = [[['a0', 'b0'], ['c0', 'd0']],
+              [['a1', 'b1'], ['c1', 'd1']]]
+    output = ['b0', 'b1']
+```
+
+Batched indexing into a matrix:
+
+```python
+    indices = [[[0, 0]], [[0, 1]]]
+    params = [['a', 'b'], ['c', 'd']]
+    output = [['a'], ['b']]
+```
+
+Batched slice indexing into a matrix:
+
+```python
+    indices = [[[1]], [[0]]]
+    params = [['a', 'b'], ['c', 'd']]
+    output = [[['c', 'd']], [['a', 'b']]]
+```
+
+Batched indexing into a 3-tensor:
+
+```python
+    indices = [[[1]], [[0]]]
+    params = [[['a0', 'b0'], ['c0', 'd0']],
+              [['a1', 'b1'], ['c1', 'd1']]]
+    output = [[[['a1', 'b1'], ['c1', 'd1']]],
+              [[['a0', 'b0'], ['c0', 'd0']]]]
+
+    indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
+    params = [[['a0', 'b0'], ['c0', 'd0']],
+              [['a1', 'b1'], ['c1', 'd1']]]
+    output = [[['c0', 'd0'], ['a1', 'b1']],
+              [['a0', 'b0'], ['c1', 'd1']]]
+
+
+    indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
+    params = [[['a0', 'b0'], ['c0', 'd0']],
+              [['a1', 'b1'], ['c1', 'd1']]]
+    output = [['b0', 'b1'], ['d0', 'c1']]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c020176a3b41b257b54601aecab0d47d36849c81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "GatherV2"
+  in_arg {
+    name: "params"
+    description: <<END
+The tensor from which to gather values. Must be at least rank
+`axis + 1`.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor. Must be in range `[0, params.shape[axis])`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+The axis in `params` to gather `indices` from. Defaults to the first
+dimension. Supports negative indexes.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Values from `params` gathered from indices given by `indices`, with
+shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+END
+  }
+  summary: "Gather slices from `params` axis `axis` according to `indices`."
+  description: <<END
+`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+params.shape[axis + 1:]` where:
+
+```python
+    # Scalar indices (output is rank(params) - 1).
+    output[a_0, ..., a_n, b_0, ..., b_n] =
+      params[a_0, ..., a_n, indices, b_0, ..., b_n]
+
+    # Vector indices (output is rank(params)).
+    output[a_0, ..., a_n, i, b_0, ..., b_n] =
+      params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+
+    # Higher rank indices (output is rank(params) + rank(indices) - 1).
+    output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+      params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+```
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt b/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..662e4c54b6c29124dd39ae6e14f1af20c48a0b41
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GenerateVocabRemapping.pbtxt
@@ -0,0 +1,79 @@
+op {
+  graph_op_name: "GenerateVocabRemapping"
+  in_arg {
+    name: "new_vocab_file"
+    description: <<END
+Path to the new vocab file.
+END
+  }
+  in_arg {
+    name: "old_vocab_file"
+    description: <<END
+Path to the old vocab file.
+END
+  }
+  out_arg {
+    name: "remapping"
+    description: <<END
+A Tensor of length num_new_vocab where the element at index i
+is equal to the old ID that maps to the new ID i.  This element is -1 for any
+new ID that is not found in the old vocabulary.
+END
+  }
+  out_arg {
+    name: "num_present"
+    description: <<END
+Number of new vocab entries found in old vocab.
+END
+  }
+  attr {
+    name: "new_vocab_offset"
+    description: <<END
+How many entries into the new vocab file to start reading.
+END
+  }
+  attr {
+    name: "num_new_vocab"
+    description: <<END
+Number of entries in the new vocab file to remap.
+END
+  }
+  attr {
+    name: "old_vocab_size"
+    description: <<END
+Number of entries in the old vocab file to consider.  If -1,
+use the entire old vocabulary.
+END
+  }
+  summary: "Given a path to new and old vocabulary files, returns a remapping Tensor of"
+  description: <<END
+length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+`new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+default value of -1.
+
+`num_vocab_offset` enables
+use in the partitioned variable case, and should generally be set through
+examining partitioning info.  The format of the files should be a text file,
+with each line containing a single entity within the vocabulary.
+
+For example, with `new_vocab_file` a text file containing each of the following
+elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+`num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+`[0, -1, 2]`.
+
+The op also returns a count of how many entries in the new vocabulary
+were present in the old vocabulary, which is used to calculate the number of
+values to initialize in a weight matrix remapping
+
+This functionality can be used to remap both row vocabularies (typically,
+features) and column vocabularies (typically, classes) from TensorFlow
+checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+corresponding to div-partitioned variables.  Moreover, the underlying remapping
+uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+use the corresponding index_table_from_file() as the FeatureColumn framework
+does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GetSessionHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetSessionHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..243712c85315ed13c439a9ebe7e3f28da560f7fd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GetSessionHandle.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "GetSessionHandle"
+  in_arg {
+    name: "value"
+    description: <<END
+The tensor to be stored.
+END
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle for the tensor stored in the session state, represented
+as a string.
+END
+  }
+  summary: "Store the input tensor in the state of the current session."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GetSessionHandleV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetSessionHandleV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..63cdc053c749d0f8c4693499ddfd223509ca35c1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GetSessionHandleV2.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "GetSessionHandleV2"
+  in_arg {
+    name: "value"
+    description: <<END
+The tensor to be stored.
+END
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle for the tensor stored in the session state, represented
+as a ResourceHandle object.
+END
+  }
+  summary: "Store the input tensor in the state of the current session."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GetSessionTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_GetSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..89bd3efe22848a78b2b694a18e7807f529f224d3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GetSessionTensor.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "GetSessionTensor"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle for a tensor stored in the session state.
+END
+  }
+  out_arg {
+    name: "value"
+    description: <<END
+The tensor for the given handle.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output value.
+END
+  }
+  summary: "Get the value of the tensor specified by its handle."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Greater.pbtxt b/tensorflow/core/api_def/base_api/api_def_Greater.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a4e2f2eddc2acdc67d16fae517332b0a0b5b852
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Greater.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Greater"
+  summary: "Returns the truth value of (x > y) element-wise."
+  description: <<END
+*NOTE*: `Greater` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GreaterEqual.pbtxt b/tensorflow/core/api_def/base_api/api_def_GreaterEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc947f04886e49ad8656976b66a7c6796ae6b515
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GreaterEqual.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "GreaterEqual"
+  summary: "Returns the truth value of (x >= y) element-wise."
+  description: <<END
+*NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea6bcd469577d02e39afbeb2ba0c8b467e312ba9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "GroupByWindowDataset"
+  attr {
+    name: "key_func"
+    description: <<END
+A function mapping an element of `input_dataset`, concatenated
+with `key_func_other_arguments` to a scalar value of type DT_INT64.
+END
+  }
+  summary: "Creates a dataset that computes a windowed group-by on `input_dataset`."
+  description: <<END
+// TODO(mrry): Support non-int64 keys.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt b/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2a2e1aaef84f8c978f8c9312cc52b9bdcd35ca8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_GuaranteeConst.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "GuaranteeConst"
+  summary: "Gives a guarantee to the TF runtime that the input tensor is a constant."
+  description: <<END
+The runtime is then free to make optimizations based on this.
+
+Only accepts value typed tensors as inputs and rejects resource variable handles
+as input.
+
+Returns the input tensor without modification.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_H.pbtxt b/tensorflow/core/api_def/base_api/api_def_H.pbtxt
deleted file mode 100644
index 71282e7defc91ef879433c4a5a25ebf6f256e629..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_H.pbtxt
+++ /dev/null
@@ -1,52 +0,0 @@
-op {
-  graph_op_name: "HSVToRGB"
-  endpoint {
-    name: "HSVToRGB"
-  }
-  summary: "Convert one or more images from HSV to RGB."
-  description: <<END
-Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-value of the pixels. The output is only well defined if the value in `images`
-are in `[0,1]`.
-
-See `rgb_to_hsv` for a description of the HSV encoding.
-END
-}
-op {
-  graph_op_name: "HashTable"
-  endpoint {
-    name: "HashTable"
-  }
-  summary: "Creates a non-initialized hash table."
-  description: <<END
-This op creates a hash table, specifying the type of its keys and values.
-Before using the table you will have to initialize it.  After initialization the
-table will be immutable.
-END
-}
-op {
-  graph_op_name: "HashTableV2"
-  endpoint {
-    name: "HashTableV2"
-  }
-  summary: "Creates a non-initialized hash table."
-  description: <<END
-This op creates a hash table, specifying the type of its keys and values.
-Before using the table you will have to initialize it.  After initialization the
-table will be immutable.
-END
-}
-op {
-  graph_op_name: "HistogramSummary"
-  endpoint {
-    name: "HistogramSummary"
-  }
-  summary: "Outputs a `Summary` protocol buffer with a histogram."
-  description: <<END
-The generated
-[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-has one summary value containing a histogram for `values`.
-
-This op reports an `InvalidArgument` error if any value is not finite.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_HSVToRGB.pbtxt b/tensorflow/core/api_def/base_api/api_def_HSVToRGB.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b23ef3c414612e3c92c1fe3195ea32da433bf39
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HSVToRGB.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "HSVToRGB"
+  in_arg {
+    name: "images"
+    description: <<END
+1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+`images` converted to RGB.
+END
+  }
+  summary: "Convert one or more images from HSV to RGB."
+  description: <<END
+Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+value of the pixels. The output is only well defined if the value in `images`
+are in `[0,1]`.
+
+See `rgb_to_hsv` for a description of the HSV encoding.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_HashTable.pbtxt b/tensorflow/core/api_def/base_api/api_def_HashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb20232a8990050e91ab5656c28a15ccbd1cccd8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HashTable.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "HashTable"
+  visibility: SKIP
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "use_node_name_sharing"
+    description: <<END
+If true and shared_name is empty, the table is shared
+using the node name.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  summary: "Creates a non-initialized hash table."
+  description: <<END
+This op creates a hash table, specifying the type of its keys and values.
+Before using the table you will have to initialize it.  After initialization the
+table will be immutable.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_HashTableV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_HashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eddd4e256c93ddfd91f83c4149088fe586f2f377
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HashTableV2.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "HashTableV2"
+  endpoint {
+    name: "HashTable"
+  }
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "use_node_name_sharing"
+    description: <<END
+If true and shared_name is empty, the table is shared
+using the node name.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  summary: "Creates a non-initialized hash table."
+  description: <<END
+This op creates a hash table, specifying the type of its keys and values.
+Before using the table you will have to initialize it.  After initialization the
+table will be immutable.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_HistogramFixedWidth.pbtxt b/tensorflow/core/api_def/base_api/api_def_HistogramFixedWidth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b7fcd67f1324888f82015441add7a655b90d5e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HistogramFixedWidth.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "HistogramFixedWidth"
+  in_arg {
+    name: "values"
+    description: <<END
+Numeric `Tensor`.
+END
+  }
+  in_arg {
+    name: "value_range"
+    description: <<END
+Shape [2] `Tensor` of same `dtype` as `values`.
+values <= value_range[0] will be mapped to hist[0],
+values >= value_range[1] will be mapped to hist[-1].
+END
+  }
+  in_arg {
+    name: "nbins"
+    description: <<END
+Scalar `int32 Tensor`.  Number of histogram bins.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+A 1-D `Tensor` holding histogram of values.
+END
+  }
+  summary: "Return histogram of values."
+  description: <<END
+Given the tensor `values`, this operation returns a rank 1 histogram counting
+the number of entries in `values` that fall into every bin.  The bins are
+equal width and determined by the arguments `value_range` and `nbins`.
+
+```python
+# Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+nbins = 5
+value_range = [0.0, 5.0]
+new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+
+with tf.get_default_session() as sess:
+  hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+  variables.global_variables_initializer().run()
+  sess.run(hist) => [2, 1, 1, 0, 2]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_HistogramSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_HistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..faf1ed5abdd151222d07bc4b15f287ee9620cdb1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_HistogramSummary.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "HistogramSummary"
+  in_arg {
+    name: "tag"
+    description: <<END
+Scalar.  Tag to use for the `Summary.Value`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Any shape. Values to use to build the histogram.
+END
+  }
+  out_arg {
+    name: "summary"
+    description: <<END
+Scalar. Serialized `Summary` protocol buffer.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with a histogram."
+  description: <<END
+The generated
+[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+has one summary value containing a histogram for `values`.
+
+This op reports an `InvalidArgument` error if any value is not finite.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_I.pbtxt b/tensorflow/core/api_def/base_api/api_def_I.pbtxt
deleted file mode 100644
index caaf93bf883c9e81d0199562b87533c29e8e8488..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_I.pbtxt
+++ /dev/null
@@ -1,518 +0,0 @@
-op {
-  graph_op_name: "IFFT"
-  endpoint {
-    name: "IFFT"
-  }
-  summary: "Inverse fast Fourier transform."
-  description: <<END
-Computes the inverse 1-dimensional discrete Fourier transform over the
-inner-most dimension of `input`.
-END
-}
-op {
-  graph_op_name: "IFFT2D"
-  endpoint {
-    name: "IFFT2D"
-  }
-  summary: "Inverse 2D fast Fourier transform."
-  description: <<END
-Computes the inverse 2-dimensional discrete Fourier transform over the
-inner-most 2 dimensions of `input`.
-END
-}
-op {
-  graph_op_name: "IFFT3D"
-  endpoint {
-    name: "IFFT3D"
-  }
-  summary: "Inverse 3D fast Fourier transform."
-  description: <<END
-Computes the inverse 3-dimensional discrete Fourier transform over the
-inner-most 3 dimensions of `input`.
-END
-}
-op {
-  graph_op_name: "IRFFT"
-  endpoint {
-    name: "IRFFT"
-  }
-  summary: "Inverse real-valued fast Fourier transform."
-  description: <<END
-Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-signal over the inner-most dimension of `input`.
-
-The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-`fft_length` is not provided, it is computed from the size of the inner-most
-dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-compute `input` is odd, it should be provided since it cannot be inferred
-properly.
-
-Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-than the corresponding dimension of `input`, the dimension is cropped. If it is
-larger, the dimension is padded with zeros.
-END
-}
-op {
-  graph_op_name: "IRFFT2D"
-  endpoint {
-    name: "IRFFT2D"
-  }
-  summary: "Inverse 2D real-valued fast Fourier transform."
-  description: <<END
-Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-signal over the inner-most 2 dimensions of `input`.
-
-The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-to compute `input` is odd, it should be provided since it cannot be inferred
-properly.
-
-Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-`fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-END
-}
-op {
-  graph_op_name: "IRFFT3D"
-  endpoint {
-    name: "IRFFT3D"
-  }
-  summary: "Inverse 3D real-valued fast Fourier transform."
-  description: <<END
-Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-signal over the inner-most 3 dimensions of `input`.
-
-The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-to compute `input` is odd, it should be provided since it cannot be inferred
-properly.
-
-Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-`fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-END
-}
-op {
-  graph_op_name: "Identity"
-  endpoint {
-    name: "Identity"
-  }
-  summary: "Return a tensor with the same shape and contents as the input tensor or value."
-}
-op {
-  graph_op_name: "IdentityN"
-  endpoint {
-    name: "IdentityN"
-  }
-  summary: "Returns a list of tensors with the same shapes and contents as the input"
-  description: <<END
-tensors.
-
-This op can be used to override the gradient for complicated functions. For
-example, suppose y = f(x) and we wish to apply a custom function g for backprop
-such that dx = g(dy). In Python,
-
-```python
-with tf.get_default_graph().gradient_override_map(
-    {'IdentityN': 'OverrideGradientWithG'}):
-  y, _ = identity_n([f(x), x])
-
-@tf.RegisterGradient('OverrideGradientWithG')
-def ApplyG(op, dy, _):
-  return [None, g(dy)]  # Do not backprop to f(x).
-```
-END
-}
-op {
-  graph_op_name: "IdentityReader"
-  endpoint {
-    name: "IdentityReader"
-  }
-  summary: "A Reader that outputs the queued work as both the key and value."
-  description: <<END
-To use, enqueue strings in a Queue.  ReaderRead will take the front
-work string and output (work, work).
-END
-}
-op {
-  graph_op_name: "IdentityReaderV2"
-  endpoint {
-    name: "IdentityReaderV2"
-  }
-  summary: "A Reader that outputs the queued work as both the key and value."
-  description: <<END
-To use, enqueue strings in a Queue.  ReaderRead will take the front
-work string and output (work, work).
-END
-}
-op {
-  graph_op_name: "Igamma"
-  endpoint {
-    name: "Igamma"
-  }
-  summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`."
-  description: <<END
-The lower regularized incomplete Gamma function is defined as:
-
-
-\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-
-where
-
-\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
-
-is the lower incomplete Gamma function.
-
-Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-Gamma function.
-END
-}
-op {
-  graph_op_name: "Igammac"
-  endpoint {
-    name: "Igammac"
-  }
-  summary: "Compute the upper regularized incomplete Gamma function `Q(a, x)`."
-  description: <<END
-The upper regularized incomplete Gamma function is defined as:
-
-\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-
-where
-
-\\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
-
-is the upper incomplete Gama function.
-
-Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-Gamma function.
-END
-}
-op {
-  graph_op_name: "IgnoreErrorsDataset"
-  endpoint {
-    name: "IgnoreErrorsDataset"
-  }
-  summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
-}
-op {
-  graph_op_name: "Imag"
-  endpoint {
-    name: "Imag"
-  }
-  summary: "Returns the imaginary part of a complex number."
-  description: <<END
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-type `float` that is the imaginary part of each element in `input`. All
-elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-is the real part and *b* is the imaginary part returned by this operation.
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.imag(input) ==> [4.75, 5.75]
-```
-END
-}
-op {
-  graph_op_name: "ImageSummary"
-  endpoint {
-    name: "ImageSummary"
-  }
-  summary: "Outputs a `Summary` protocol buffer with images."
-  description: <<END
-The summary has up to `max_images` summary values containing images. The
-images are built from `tensor` which must be 4-D with shape `[batch_size,
-height, width, channels]` and where `channels` can be:
-
-*  1: `tensor` is interpreted as Grayscale.
-*  3: `tensor` is interpreted as RGB.
-*  4: `tensor` is interpreted as RGBA.
-
-The images have the same number of channels as the input tensor. For float
-input, the values are normalized one image at a time to fit in the range
-`[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-normalization algorithms:
-
-*  If the input values are all positive, they are rescaled so the largest one
-   is 255.
-
-*  If any input value is negative, the values are shifted so input value 0.0
-   is at 127.  They are then rescaled so that either the smallest value is 0,
-   or the largest one is 255.
-
-The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-build the `tag` of the summary values:
-
-*  If `max_images` is 1, the summary value tag is '*tag*/image'.
-*  If `max_images` is greater than 1, the summary value tags are
-   generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-
-The `bad_color` argument is the color to use in the generated images for
-non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-Each element must be in the range `[0, 255]` (It represents the value of a
-pixel in the output image).  Non-finite values in the input tensor are
-replaced by this tensor in the output image.  The default value is the color
-red.
-END
-}
-op {
-  graph_op_name: "ImmutableConst"
-  endpoint {
-    name: "ImmutableConst"
-  }
-  summary: "Returns immutable tensor from memory region."
-  description: <<END
-The current implementation memmaps the tensor from a file.
-END
-}
-op {
-  graph_op_name: "InTopK"
-  endpoint {
-    name: "InTopK"
-  }
-  summary: "Says whether the targets are in the top `K` predictions."
-  description: <<END
-This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-prediction for the target class is among the top `k` predictions among
-all predictions for example `i`. Note that the behavior of `InTopK` differs
-from the `TopK` op in its handling of ties; if multiple classes have the
-same prediction value and straddle the top-`k` boundary, all of those
-classes are considered to be in the top `k`.
-
-More formally, let
-
-  \\(predictions_i\\) be the predictions for all classes for example `i`,
-  \\(targets_i\\) be the target class for example `i`,
-  \\(out_i\\) be the output for example `i`,
-
-$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-END
-}
-op {
-  graph_op_name: "InTopKV2"
-  endpoint {
-    name: "InTopKV2"
-  }
-  summary: "Says whether the targets are in the top `K` predictions."
-  description: <<END
-This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-prediction for the target class is among the top `k` predictions among
-all predictions for example `i`. Note that the behavior of `InTopK` differs
-from the `TopK` op in its handling of ties; if multiple classes have the
-same prediction value and straddle the top-`k` boundary, all of those
-classes are considered to be in the top `k`.
-
-More formally, let
-
-  \\(predictions_i\\) be the predictions for all classes for example `i`,
-  \\(targets_i\\) be the target class for example `i`,
-  \\(out_i\\) be the output for example `i`,
-
-$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-END
-}
-op {
-  graph_op_name: "InitializeTable"
-  endpoint {
-    name: "InitializeTable"
-  }
-  summary: "Table initializer that takes two tensors for keys and values respectively."
-}
-op {
-  graph_op_name: "InitializeTableFromTextFile"
-  endpoint {
-    name: "InitializeTableFromTextFile"
-  }
-  summary: "Initializes a table from a text file."
-  description: <<END
-It inserts one key-value pair into the table for each line of the file.
-The key and value is extracted from the whole line content, elements from the
-split line based on `delimiter` or the line number (starting from zero).
-Where to extract the key and value from a line is specified by `key_index` and
-`value_index`.
-
-- A value of -1 means use the line number(starting from zero), expects `int64`.
-- A value of -2 means use the whole line content, expects `string`.
-- A value >= 0 means use the index (starting at zero) of the split line based
-  on `delimiter`.
-END
-}
-op {
-  graph_op_name: "InitializeTableFromTextFileV2"
-  endpoint {
-    name: "InitializeTableFromTextFileV2"
-  }
-  summary: "Initializes a table from a text file."
-  description: <<END
-It inserts one key-value pair into the table for each line of the file.
-The key and value is extracted from the whole line content, elements from the
-split line based on `delimiter` or the line number (starting from zero).
-Where to extract the key and value from a line is specified by `key_index` and
-`value_index`.
-
-- A value of -1 means use the line number(starting from zero), expects `int64`.
-- A value of -2 means use the whole line content, expects `string`.
-- A value >= 0 means use the index (starting at zero) of the split line based
-  on `delimiter`.
-END
-}
-op {
-  graph_op_name: "InitializeTableV2"
-  endpoint {
-    name: "InitializeTableV2"
-  }
-  summary: "Table initializer that takes two tensors for keys and values respectively."
-}
-op {
-  graph_op_name: "InterleaveDataset"
-  endpoint {
-    name: "InterleaveDataset"
-  }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: <<END
-Unlike MapDataset, the `f` in InterleaveDataset is expected to return
-a Dataset variant, and InterleaveDataset will flatten successive
-results into a single Dataset. Unlike FlatMapDataset,
-InterleaveDataset will interleave sequences of up to `block_length`
-consecutive elements from `cycle_length` input elements.
-END
-}
-op {
-  graph_op_name: "Inv"
-  endpoint {
-    name: "Inv"
-  }
-  summary: "Computes the reciprocal of x element-wise."
-  description: <<END
-I.e., \\(y = 1 / x\\).
-END
-}
-op {
-  graph_op_name: "InvGrad"
-  endpoint {
-    name: "InvGrad"
-  }
-  summary: "Computes the gradient for the inverse of `x` wrt its input."
-  description: <<END
-Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-is the corresponding input gradient.
-END
-}
-op {
-  graph_op_name: "Invert"
-  endpoint {
-    name: "Invert"
-  }
-  summary: "Flips all bits elementwise."
-  description: <<END
-The result will have exactly those bits set, that are not set in `x`. The
-computation is performed on the underlying representation of x.
-END
-}
-op {
-  graph_op_name: "InvertPermutation"
-  endpoint {
-    name: "InvertPermutation"
-  }
-  summary: "Computes the inverse permutation of a tensor."
-  description: <<END
-This operation computes the inverse of an index permutation. It takes a 1-D
-integer tensor `x`, which represents the indices of a zero-based array, and
-swaps each value with its index position. In other words, for an output tensor
-`y` and an input tensor `x`, this operation computes the following:
-
-`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-
-The values must include 0. There can be no duplicate values or negative values.
-
-For example:
-
-```
-# tensor `x` is [3, 4, 0, 2, 1]
-invert_permutation(x) ==> [2, 4, 3, 0, 1]
-```
-END
-}
-op {
-  graph_op_name: "IsFinite"
-  endpoint {
-    name: "IsFinite"
-  }
-  summary: "Returns which elements of x are finite."
-  description: <<END
-@compatibility(numpy)
-Equivalent to np.isfinite
-@end_compatibility
-END
-}
-op {
-  graph_op_name: "IsInf"
-  endpoint {
-    name: "IsInf"
-  }
-  summary: "Returns which elements of x are Inf."
-  description: <<END
-@compatibility(numpy)
-Equivalent to np.isinf
-@end_compatibility
-END
-}
-op {
-  graph_op_name: "IsNan"
-  endpoint {
-    name: "IsNan"
-  }
-  summary: "Returns which elements of x are NaN."
-  description: <<END
-@compatibility(numpy)
-Equivalent to np.isnan
-@end_compatibility
-END
-}
-op {
-  graph_op_name: "IsVariableInitialized"
-  endpoint {
-    name: "IsVariableInitialized"
-  }
-  summary: "Checks whether a tensor has been initialized."
-  description: <<END
-Outputs boolean scalar indicating whether the tensor has been initialized.
-END
-}
-op {
-  graph_op_name: "Iterator"
-  endpoint {
-    name: "Iterator"
-  }
-  summary: "A container for an iterator resource."
-}
-op {
-  graph_op_name: "IteratorFromStringHandle"
-  endpoint {
-    name: "IteratorFromStringHandle"
-  }
-  summary: "Converts the given string representing a handle to an iterator to a resource."
-}
-op {
-  graph_op_name: "IteratorGetNext"
-  endpoint {
-    name: "IteratorGetNext"
-  }
-  summary: "Gets the next output from the given iterator."
-}
-op {
-  graph_op_name: "IteratorToStringHandle"
-  endpoint {
-    name: "IteratorToStringHandle"
-  }
-  summary: "Converts the given `resource_handle` representing an iterator to a string."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b793c99cf74408305b48dbbf1c9df7b03d09b2f3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "IFFT"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same shape as `input`. The inner-most
+  dimension of `input` is replaced with its inverse 1D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.ifft
+@end_compatibility
+END
+  }
+  summary: "Inverse fast Fourier transform."
+  description: <<END
+Computes the inverse 1-dimensional discrete Fourier transform over the
+inner-most dimension of `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f38f14308de70fb0ebc229064d010762055c458
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "IFFT2D"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same shape as `input`. The inner-most 2
+  dimensions of `input` are replaced with their inverse 2D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.ifft2
+@end_compatibility
+END
+  }
+  summary: "Inverse 2D fast Fourier transform."
+  description: <<END
+Computes the inverse 2-dimensional discrete Fourier transform over the
+inner-most 2 dimensions of `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52f1118775b16820f5e1bf6f0f9d934219b10f9d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT3D.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "IFFT3D"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same shape as `input`. The inner-most 3
+  dimensions of `input` are replaced with their inverse 3D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.ifftn with 3 dimensions.
+@end_compatibility
+END
+  }
+  summary: "Inverse 3D fast Fourier transform."
+  description: <<END
+Computes the inverse 3-dimensional discrete Fourier transform over the
+inner-most 3 dimensions of `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IRFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_IRFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e1caa9eade5480cd95ac7da0ce66cdcbcbef662
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IRFFT.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "IRFFT"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  in_arg {
+    name: "fft_length"
+    description: <<END
+An int32 tensor of shape [1]. The FFT length.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A float32 tensor of the same rank as `input`. The inner-most
+  dimension of `input` is replaced with the `fft_length` samples of its inverse
+  1D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.irfft
+@end_compatibility
+END
+  }
+  summary: "Inverse real-valued fast Fourier transform."
+  description: <<END
+Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+signal over the inner-most dimension of `input`.
+
+The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+`fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+`fft_length` is not provided, it is computed from the size of the inner-most
+dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+compute `input` is odd, it should be provided since it cannot be inferred
+properly.
+
+Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+than the corresponding dimension of `input`, the dimension is cropped. If it is
+larger, the dimension is padded with zeros.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IRFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IRFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b7390a38577c1a3e35a43c0814c0a5bdb8ecab9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IRFFT2D.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "IRFFT2D"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  in_arg {
+    name: "fft_length"
+    description: <<END
+An int32 tensor of shape [2]. The FFT length for each dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A float32 tensor of the same rank as `input`. The inner-most 2
+  dimensions of `input` are replaced with the `fft_length` samples of their
+  inverse 2D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.irfft2
+@end_compatibility
+END
+  }
+  summary: "Inverse 2D real-valued fast Fourier transform."
+  description: <<END
+Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+signal over the inner-most 2 dimensions of `input`.
+
+The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+to compute `input` is odd, it should be provided since it cannot be inferred
+properly.
+
+Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+`fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+corresponding dimension of `input`, the dimension is cropped. If it is larger,
+the dimension is padded with zeros.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IRFFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IRFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1cee2ceeff054a4bd72855e53fafbce941f9e4e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IRFFT3D.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "IRFFT3D"
+  in_arg {
+    name: "input"
+    description: <<END
+A complex64 tensor.
+END
+  }
+  in_arg {
+    name: "fft_length"
+    description: <<END
+An int32 tensor of shape [3]. The FFT length for each dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A float32 tensor of the same rank as `input`. The inner-most 3
+  dimensions of `input` are replaced with the `fft_length` samples of their
+  inverse 3D real Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.irfftn with 3 dimensions.
+@end_compatibility
+END
+  }
+  summary: "Inverse 3D real-valued fast Fourier transform."
+  description: <<END
+Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+signal over the inner-most 3 dimensions of `input`.
+
+The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+to compute `input` is odd, it should be provided since it cannot be inferred
+properly.
+
+Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+`fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+corresponding dimension of `input`, the dimension is cropped. If it is larger,
+the dimension is padded with zeros.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Identity.pbtxt b/tensorflow/core/api_def/base_api/api_def_Identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2eb82e890f97cef8c82586bab6643eb0737c295
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Identity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Identity"
+  summary: "Return a tensor with the same shape and contents as the input tensor or value."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IdentityN.pbtxt b/tensorflow/core/api_def/base_api/api_def_IdentityN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45c213bce1eebdef26e194d82155e1a8d7624c22
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IdentityN.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "IdentityN"
+  summary: "Returns a list of tensors with the same shapes and contents as the input"
+  description: <<END
+tensors.
+
+This op can be used to override the gradient for complicated functions. For
+example, suppose y = f(x) and we wish to apply a custom function g for backprop
+such that dx = g(dy). In Python,
+
+```python
+with tf.get_default_graph().gradient_override_map(
+    {'IdentityN': 'OverrideGradientWithG'}):
+  y, _ = identity_n([f(x), x])
+
+@tf.RegisterGradient('OverrideGradientWithG')
+def ApplyG(op, dy, _):
+  return [None, g(dy)]  # Do not backprop to f(x).
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IdentityReader.pbtxt b/tensorflow/core/api_def/base_api/api_def_IdentityReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9747d5c18cdbd98d3c112c048889533040b12553
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IdentityReader.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "IdentityReader"
+  visibility: SKIP
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the queued work as both the key and value."
+  description: <<END
+To use, enqueue strings in a Queue.  ReaderRead will take the front
+work string and output (work, work).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IdentityReaderV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_IdentityReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71ef0115994aaa041acbc56efe3bee214f56d65c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IdentityReaderV2.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "IdentityReaderV2"
+  endpoint {
+    name: "IdentityReader"
+  }
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the queued work as both the key and value."
+  description: <<END
+To use, enqueue strings in a Queue.  ReaderRead will take the front
+work string and output (work, work).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7bc5ddae237deb226606dc96141845e3efcc859
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Igamma.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "Igamma"
+  summary: "Compute the lower regularized incomplete Gamma function `Q(a, x)`."
+  description: <<END
+The lower regularized incomplete Gamma function is defined as:
+
+
+\\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+
+where
+
+\\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+
+is the lower incomplete Gamma function.
+
+Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+Gamma function.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Igammac.pbtxt b/tensorflow/core/api_def/base_api/api_def_Igammac.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12f8416774f4849cb4636df198820e47e08257e4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Igammac.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "Igammac"
+  summary: "Compute the upper regularized incomplete Gamma function `Q(a, x)`."
+  description: <<END
+The upper regularized incomplete Gamma function is defined as:
+
+\\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+
+where
+
+\\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+
+is the upper incomplete Gama function.
+
+Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+Gamma function.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e492d90287f0f1da04ca5a1eba72ed2a6c18e47a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IgnoreErrorsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IgnoreErrorsDataset"
+  summary: "Creates a dataset that contains the elements of `input_dataset` ignoring errors."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Imag.pbtxt b/tensorflow/core/api_def/base_api/api_def_Imag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c3bb674311293a92a38f88f0340a667bec3633b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Imag.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "Imag"
+  summary: "Returns the imaginary part of a complex number."
+  description: <<END
+Given a tensor `input` of complex numbers, this operation returns a tensor of
+type `float` that is the imaginary part of each element in `input`. All
+elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+is the real part and *b* is the imaginary part returned by this operation.
+
+For example:
+
+```
+# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+tf.imag(input) ==> [4.75, 5.75]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b00f5b19d97500f01916d7e78bdaf9c64b8b66e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ImageSummary.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "ImageSummary"
+  in_arg {
+    name: "tag"
+    description: <<END
+Scalar. Used to build the `tag` attribute of the summary values.
+END
+  }
+  in_arg {
+    name: "tensor"
+    description: <<END
+4-D of shape `[batch_size, height, width, channels]` where
+`channels` is 1, 3, or 4.
+END
+  }
+  out_arg {
+    name: "summary"
+    description: <<END
+Scalar. Serialized `Summary` protocol buffer.
+END
+  }
+  attr {
+    name: "max_images"
+    description: <<END
+Max number of batch elements to generate images for.
+END
+  }
+  attr {
+    name: "bad_color"
+    description: <<END
+Color to use for pixels with non-finite values.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with images."
+  description: <<END
+The summary has up to `max_images` summary values containing images. The
+images are built from `tensor` which must be 4-D with shape `[batch_size,
+height, width, channels]` and where `channels` can be:
+
+*  1: `tensor` is interpreted as Grayscale.
+*  3: `tensor` is interpreted as RGB.
+*  4: `tensor` is interpreted as RGBA.
+
+The images have the same number of channels as the input tensor. For float
+input, the values are normalized one image at a time to fit in the range
+`[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+normalization algorithms:
+
+*  If the input values are all positive, they are rescaled so the largest one
+   is 255.
+
+*  If any input value is negative, the values are shifted so input value 0.0
+   is at 127.  They are then rescaled so that either the smallest value is 0,
+   or the largest one is 255.
+
+The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+build the `tag` of the summary values:
+
+*  If `max_images` is 1, the summary value tag is '*tag*/image'.
+*  If `max_images` is greater than 1, the summary value tags are
+   generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+
+The `bad_color` argument is the color to use in the generated images for
+non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+Each element must be in the range `[0, 255]` (It represents the value of a
+pixel in the output image).  Non-finite values in the input tensor are
+replaced by this tensor in the output image.  The default value is the color
+red.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ImmutableConst.pbtxt b/tensorflow/core/api_def/base_api/api_def_ImmutableConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..658629df3853d683eac324e9c092e3da5009abd5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ImmutableConst.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "ImmutableConst"
+  attr {
+    name: "dtype"
+    description: <<END
+Type of the returned tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+Shape of the returned tensor.
+END
+  }
+  attr {
+    name: "memory_region_name"
+    description: <<END
+Name of readonly memory region used by the tensor, see
+NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+END
+  }
+  summary: "Returns immutable tensor from memory region."
+  description: <<END
+The current implementation memmaps the tensor from a file.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InTopK.pbtxt b/tensorflow/core/api_def/base_api/api_def_InTopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e11d6e59c73d53a75859d1986dd725d180589511
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InTopK.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "InTopK"
+  in_arg {
+    name: "predictions"
+    description: <<END
+A `batch_size` x `classes` tensor.
+END
+  }
+  in_arg {
+    name: "targets"
+    description: <<END
+A `batch_size` vector of class ids.
+END
+  }
+  out_arg {
+    name: "precision"
+    description: <<END
+Computed Precision at `k` as a `bool Tensor`.
+END
+  }
+  attr {
+    name: "k"
+    description: <<END
+Number of top elements to look at for computing precision.
+END
+  }
+  summary: "Says whether the targets are in the top `K` predictions."
+  description: <<END
+This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+prediction for the target class is among the top `k` predictions among
+all predictions for example `i`. Note that the behavior of `InTopK` differs
+from the `TopK` op in its handling of ties; if multiple classes have the
+same prediction value and straddle the top-`k` boundary, all of those
+classes are considered to be in the top `k`.
+
+More formally, let
+
+  \\(predictions_i\\) be the predictions for all classes for example `i`,
+  \\(targets_i\\) be the target class for example `i`,
+  \\(out_i\\) be the output for example `i`,
+
+$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InTopKV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_InTopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f418ce0ecc2316ca0026bca49a7532855fdcddb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InTopKV2.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "InTopKV2"
+  in_arg {
+    name: "predictions"
+    description: <<END
+A `batch_size` x `classes` tensor.
+END
+  }
+  in_arg {
+    name: "targets"
+    description: <<END
+A `batch_size` vector of class ids.
+END
+  }
+  in_arg {
+    name: "k"
+    description: <<END
+Number of top elements to look at for computing precision.
+END
+  }
+  out_arg {
+    name: "precision"
+    description: <<END
+Computed precision at `k` as a `bool Tensor`.
+END
+  }
+  summary: "Says whether the targets are in the top `K` predictions."
+  description: <<END
+This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+prediction for the target class is among the top `k` predictions among
+all predictions for example `i`. Note that the behavior of `InTopK` differs
+from the `TopK` op in its handling of ties; if multiple classes have the
+same prediction value and straddle the top-`k` boundary, all of those
+classes are considered to be in the top `k`.
+
+More formally, let
+
+  \\(predictions_i\\) be the predictions for all classes for example `i`,
+  \\(targets_i\\) be the target class for example `i`,
+  \\(out_i\\) be the output for example `i`,
+
+$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InitializeTable.pbtxt b/tensorflow/core/api_def/base_api/api_def_InitializeTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f9a01a616a085b947543f4d0ae17ba6eafb9325
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InitializeTable.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "InitializeTable"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table which will be initialized.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Keys of type Tkey.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Values of type Tval.
+END
+  }
+  summary: "Table initializer that takes two tensors for keys and values respectively."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InitializeTableFromTextFile.pbtxt b/tensorflow/core/api_def/base_api/api_def_InitializeTableFromTextFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c1b2888cd48796395c041ed4e18029f7ac25cca8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InitializeTableFromTextFile.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "InitializeTableFromTextFile"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table which will be initialized.
+END
+  }
+  in_arg {
+    name: "filename"
+    description: <<END
+Filename of a vocabulary text file.
+END
+  }
+  attr {
+    name: "key_index"
+    description: <<END
+Column index in a line to get the table `key` values from.
+END
+  }
+  attr {
+    name: "value_index"
+    description: <<END
+Column index that represents information of a line to get the table
+`value` values from.
+END
+  }
+  attr {
+    name: "vocab_size"
+    description: <<END
+Number of elements of the file, use -1 if unknown.
+END
+  }
+  attr {
+    name: "delimiter"
+    description: <<END
+Delimiter to separate fields in a line.
+END
+  }
+  summary: "Initializes a table from a text file."
+  description: <<END
+It inserts one key-value pair into the table for each line of the file.
+The key and value is extracted from the whole line content, elements from the
+split line based on `delimiter` or the line number (starting from zero).
+Where to extract the key and value from a line is specified by `key_index` and
+`value_index`.
+
+- A value of -1 means use the line number(starting from zero), expects `int64`.
+- A value of -2 means use the whole line content, expects `string`.
+- A value >= 0 means use the index (starting at zero) of the split line based
+  on `delimiter`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InitializeTableFromTextFileV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_InitializeTableFromTextFileV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2735af4f25a0d0179ab4ca4c2487a4ec5b403f0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InitializeTableFromTextFileV2.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "InitializeTableFromTextFileV2"
+  endpoint {
+    name: "InitializeTableFromTextFile"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table which will be initialized.
+END
+  }
+  in_arg {
+    name: "filename"
+    description: <<END
+Filename of a vocabulary text file.
+END
+  }
+  attr {
+    name: "key_index"
+    description: <<END
+Column index in a line to get the table `key` values from.
+END
+  }
+  attr {
+    name: "value_index"
+    description: <<END
+Column index that represents information of a line to get the table
+`value` values from.
+END
+  }
+  attr {
+    name: "vocab_size"
+    description: <<END
+Number of elements of the file, use -1 if unknown.
+END
+  }
+  attr {
+    name: "delimiter"
+    description: <<END
+Delimiter to separate fields in a line.
+END
+  }
+  summary: "Initializes a table from a text file."
+  description: <<END
+It inserts one key-value pair into the table for each line of the file.
+The key and value is extracted from the whole line content, elements from the
+split line based on `delimiter` or the line number (starting from zero).
+Where to extract the key and value from a line is specified by `key_index` and
+`value_index`.
+
+- A value of -1 means use the line number(starting from zero), expects `int64`.
+- A value of -2 means use the whole line content, expects `string`.
+- A value >= 0 means use the index (starting at zero) of the split line based
+  on `delimiter`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InitializeTableV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_InitializeTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a32a816da8b14c7c86bdafb7fb594f77af519596
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InitializeTableV2.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "InitializeTableV2"
+  endpoint {
+    name: "InitializeTable"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table which will be initialized.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Keys of type Tkey.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Values of type Tval.
+END
+  }
+  summary: "Table initializer that takes two tensors for keys and values respectively."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bec2828e2462227b962bc045d796484a10365452
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "InterleaveDataset"
+  attr {
+    name: "f"
+    description: <<END
+A function mapping elements of `input_dataset`, concatenated with
+`other_arguments`, to a Dataset variant that contains elements matching
+`output_types` and `output_shapes`.
+END
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: <<END
+Unlike MapDataset, the `f` in InterleaveDataset is expected to return
+a Dataset variant, and InterleaveDataset will flatten successive
+results into a single Dataset. Unlike FlatMapDataset,
+InterleaveDataset will interleave sequences of up to `block_length`
+consecutive elements from `cycle_length` input elements.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Inv.pbtxt b/tensorflow/core/api_def/base_api/api_def_Inv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc63276e343cc92ac345d673633141d9f8ce7c31
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Inv.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Inv"
+  summary: "Computes the reciprocal of x element-wise."
+  description: <<END
+I.e., \\(y = 1 / x\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InvGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_InvGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de2f510eb9a6a1e2a5f4d43716b2fe4c9c823320
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InvGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "InvGrad"
+  visibility: HIDDEN
+  summary: "Computes the gradient for the inverse of `x` wrt its input."
+  description: <<END
+Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+is the corresponding input gradient.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Invert.pbtxt b/tensorflow/core/api_def/base_api/api_def_Invert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4847a500a883ad817407c900babee18e79c37554
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Invert.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Invert"
+  summary: "Flips all bits elementwise."
+  description: <<END
+The result will have exactly those bits set, that are not set in `x`. The
+computation is performed on the underlying representation of x.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InvertPermutation.pbtxt b/tensorflow/core/api_def/base_api/api_def_InvertPermutation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66062d818ebeeeb9fb9723707c52249364f7ccba
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InvertPermutation.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "InvertPermutation"
+  in_arg {
+    name: "x"
+    description: <<END
+1-D.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+1-D.
+END
+  }
+  summary: "Computes the inverse permutation of a tensor."
+  description: <<END
+This operation computes the inverse of an index permutation. It takes a 1-D
+integer tensor `x`, which represents the indices of a zero-based array, and
+swaps each value with its index position. In other words, for an output tensor
+`y` and an input tensor `x`, this operation computes the following:
+
+`y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
+
+The values must include 0. There can be no duplicate values or negative values.
+
+For example:
+
+```
+# tensor `x` is [3, 4, 0, 2, 1]
+invert_permutation(x) ==> [2, 4, 3, 0, 1]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsFinite.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsFinite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bccc0e32c15a677d22a8868105ad2d3cb90ebf91
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsFinite.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "IsFinite"
+  summary: "Returns which elements of x are finite."
+  description: <<END
+@compatibility(numpy)
+Equivalent to np.isfinite
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsInf.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsInf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c390f32d3490f65156dc5b2b229dfc2418cfd4f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsInf.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "IsInf"
+  summary: "Returns which elements of x are Inf."
+  description: <<END
+@compatibility(numpy)
+Equivalent to np.isinf
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsNan.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1487fad927216fbb08a5d3779e7e8e7a5aa9dff3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsNan.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "IsNan"
+  summary: "Returns which elements of x are NaN."
+  description: <<END
+@compatibility(numpy)
+Equivalent to np.isnan
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsVariableInitialized.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsVariableInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d631da711d2ed698203717ddebb48c7b219f5f4c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsVariableInitialized.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "IsVariableInitialized"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node. May be uninitialized.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the variable tensor.
+END
+  }
+  summary: "Checks whether a tensor has been initialized."
+  description: <<END
+Outputs boolean scalar indicating whether the tensor has been initialized.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Iterator.pbtxt b/tensorflow/core/api_def/base_api/api_def_Iterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..660267c221510f489e85733ea32fbac4903d8fdb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Iterator.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "Iterator"
+  out_arg {
+    name: "handle"
+    description: <<END
+A handle to the iterator that can be passed to a "MakeIterator"
+or "IteratorGetNext" op.
+END
+  }
+  summary: "A container for an iterator resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorFromStringHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorFromStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd7e382edb376599c8b3bf9af21c441771e7ef9e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorFromStringHandle.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "IteratorFromStringHandle"
+  in_arg {
+    name: "string_handle"
+    description: <<END
+A string representation of the given handle.
+END
+  }
+  out_arg {
+    name: "resource_handle"
+    description: <<END
+A handle to an iterator resource.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+If specified, defines the type of each tuple component in an
+element produced by the resulting iterator.
+END
+  }
+  attr {
+    name: "output_shapes"
+    description: <<END
+If specified, defines the shape of each tuple component in an
+element produced by the resulting iterator.
+END
+  }
+  summary: "Converts the given string representing a handle to an iterator to a resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorGetNext.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorGetNext.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea5669693e09c576d6cf9039846903a317c3b128
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorGetNext.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorGetNext"
+  summary: "Gets the next output from the given iterator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6f2212cd4fa5fe81ecc97c33ebe17d18ac7c616
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorSetStatsAggregator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorSetStatsAggregator"
+  summary: "Associates the given iterator with the given statistics aggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IteratorToStringHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_IteratorToStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf446b41273de53d674f70dcccc8db844f33cd04
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IteratorToStringHandle.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "IteratorToStringHandle"
+  in_arg {
+    name: "resource_handle"
+    description: <<END
+A handle to an iterator resource.
+END
+  }
+  out_arg {
+    name: "string_handle"
+    description: <<END
+A string representation of the given handle.
+END
+  }
+  summary: "Converts the given `resource_handle` representing an iterator to a string."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_L.pbtxt b/tensorflow/core/api_def/base_api/api_def_L.pbtxt
deleted file mode 100644
index 09e55eacc7e2763511389434e2a4a308b76ea58e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_L.pbtxt
+++ /dev/null
@@ -1,392 +0,0 @@
-op {
-  graph_op_name: "L2Loss"
-  endpoint {
-    name: "L2Loss"
-  }
-  summary: "L2 Loss."
-  description: <<END
-Computes half the L2 norm of a tensor without the `sqrt`:
-
-    output = sum(t ** 2) / 2
-END
-}
-op {
-  graph_op_name: "LMDBReader"
-  endpoint {
-    name: "LMDBReader"
-  }
-  summary: "A Reader that outputs the records from a LMDB file."
-}
-op {
-  graph_op_name: "LRN"
-  endpoint {
-    name: "LRN"
-  }
-  summary: "Local Response Normalization."
-  description: <<END
-The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-dimension), and each vector is normalized independently.  Within a given vector,
-each component is divided by the weighted, squared sum of inputs within
-`depth_radius`.  In detail,
-
-    sqr_sum[a, b, c, d] =
-        sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-    output = input / (bias + alpha * sqr_sum) ** beta
-
-For details, see [Krizhevsky et al., ImageNet classification with deep
-convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
-END
-}
-op {
-  graph_op_name: "LRNGrad"
-  endpoint {
-    name: "LRNGrad"
-  }
-  summary: "Gradients for Local Response Normalization."
-}
-op {
-  graph_op_name: "LearnedUnigramCandidateSampler"
-  endpoint {
-    name: "LearnedUnigramCandidateSampler"
-  }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: <<END
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-END
-}
-op {
-  graph_op_name: "Less"
-  endpoint {
-    name: "Less"
-  }
-  summary: "Returns the truth value of (x < y) element-wise."
-  description: <<END
-*NOTE*: `Less` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "LessEqual"
-  endpoint {
-    name: "LessEqual"
-  }
-  summary: "Returns the truth value of (x <= y) element-wise."
-  description: <<END
-*NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Lgamma"
-  endpoint {
-    name: "Lgamma"
-  }
-  summary: "Computes the log of the absolute value of `Gamma(x)` element-wise."
-}
-op {
-  graph_op_name: "LinSpace"
-  endpoint {
-    name: "LinSpace"
-  }
-  summary: "Generates values in an interval."
-  description: <<END
-A sequence of `num` evenly-spaced values are generated beginning at `start`.
-If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-so that the last one is exactly `stop`.
-
-For example:
-
-```
-tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-```
-END
-}
-op {
-  graph_op_name: "ListDiff"
-  endpoint {
-    name: "ListDiff"
-  }
-  summary: "Computes the difference between two lists of numbers or strings."
-  description: <<END
-Given a list `x` and a list `y`, this operation returns a list `out` that
-represents all values that are in `x` but not in `y`. The returned list `out`
-is sorted in the same order that the numbers appear in `x` (duplicates are
-preserved). This operation also returns a list `idx` that represents the
-position of each `out` element in `x`. In other words:
-
-`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-
-For example, given this input:
-
-```
-x = [1, 2, 3, 4, 5, 6]
-y = [1, 3, 5]
-```
-
-This operation would return:
-
-```
-out ==> [2, 4, 6]
-idx ==> [1, 3, 5]
-```
-END
-}
-op {
-  graph_op_name: "LoadAndRemapMatrix"
-  endpoint {
-    name: "LoadAndRemapMatrix"
-  }
-  summary: "Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint"
-  description: <<END
-at `ckpt_path` and potentially reorders its rows and columns using the
-specified remappings.
-
-Most users should use one of the wrapper initializers (such as
-`tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-function directly.
-
-The remappings are 1-D tensors with the following properties:
-
-* `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-  matrix will be initialized from the row corresponding to index
-  `row_remapping[i]` in the old `Tensor` from the checkpoint.
-* `col_remapping` must have either 0 entries (indicating that no column
-  reordering is needed) or `num_cols` entries. If specified, column `j` of the
-  output matrix will be initialized from the column corresponding to index
-  `col_remapping[j]` in the old `Tensor` from the checkpoint.
-* A value of -1 in either of the remappings signifies a "missing" entry. In that
-  case, values from the `initializing_values` tensor will be used to fill that
-  missing row or column. If `row_remapping` has `r` missing entries and
-  `col_remapping` has `c` missing entries, then the following condition must be
-  true:
-
-`(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
-
-The remapping tensors can be generated using the GenerateVocabRemapping op.
-
-As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-the value from row i, column j of the old tensor in the checkpoint, the output
-matrix will look like the following:
-
-[[w(1, 0),  w(1, 2),  0.5],
- [w(0, 0),  w(0, 2), -0.5],
- [0.25,    -0.25,      42]]
-END
-}
-op {
-  graph_op_name: "Log"
-  endpoint {
-    name: "Log"
-  }
-  summary: "Computes natural logarithm of x element-wise."
-  description: <<END
-I.e., \\(y = \log_e x\\).
-END
-}
-op {
-  graph_op_name: "Log1p"
-  endpoint {
-    name: "Log1p"
-  }
-  summary: "Computes natural logarithm of (1 + x) element-wise."
-  description: <<END
-I.e., \\(y = \log_e (1 + x)\\).
-END
-}
-op {
-  graph_op_name: "LogMatrixDeterminant"
-  endpoint {
-    name: "LogMatrixDeterminant"
-  }
-  summary: "Computes the sign and the log of the absolute value of the determinant of"
-  description: <<END
-one or more square matrices.
-
-The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-form square matrices. The outputs are two tensors containing the signs and
-absolute values of the log determinants for all N input submatrices
-`[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-is the LU decomposition of the input and P is the corresponding
-permutation matrix.
-END
-}
-op {
-  graph_op_name: "LogSoftmax"
-  endpoint {
-    name: "LogSoftmax"
-  }
-  summary: "Computes log softmax activations."
-  description: <<END
-For each batch `i` and class `j` we have
-
-    logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
-END
-}
-op {
-  graph_op_name: "LogUniformCandidateSampler"
-  endpoint {
-    name: "LogUniformCandidateSampler"
-  }
-  summary: "Generates labels for candidate sampling with a log-uniform distribution."
-  description: <<END
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-END
-}
-op {
-  graph_op_name: "LogicalAnd"
-  endpoint {
-    name: "LogicalAnd"
-  }
-  summary: "Returns the truth value of x AND y element-wise."
-  description: <<END
-*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "LogicalNot"
-  endpoint {
-    name: "LogicalNot"
-  }
-  summary: "Returns the truth value of NOT x element-wise."
-}
-op {
-  graph_op_name: "LogicalOr"
-  endpoint {
-    name: "LogicalOr"
-  }
-  summary: "Returns the truth value of x OR y element-wise."
-  description: <<END
-*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "LookupTableExport"
-  endpoint {
-    name: "LookupTableExport"
-  }
-  summary: "Outputs all keys and values in the table."
-}
-op {
-  graph_op_name: "LookupTableExportV2"
-  endpoint {
-    name: "LookupTableExportV2"
-  }
-  summary: "Outputs all keys and values in the table."
-}
-op {
-  graph_op_name: "LookupTableFind"
-  endpoint {
-    name: "LookupTableFind"
-  }
-  summary: "Looks up keys in a table, outputs the corresponding values."
-  description: <<END
-The tensor `keys` must of the same type as the keys of the table.
-The output `values` is of the type of the table values.
-
-The scalar `default_value` is the value output for keys not present in the
-table. It must also be of the same type as the table values.
-END
-}
-op {
-  graph_op_name: "LookupTableFindV2"
-  endpoint {
-    name: "LookupTableFindV2"
-  }
-  summary: "Looks up keys in a table, outputs the corresponding values."
-  description: <<END
-The tensor `keys` must of the same type as the keys of the table.
-The output `values` is of the type of the table values.
-
-The scalar `default_value` is the value output for keys not present in the
-table. It must also be of the same type as the table values.
-END
-}
-op {
-  graph_op_name: "LookupTableImport"
-  endpoint {
-    name: "LookupTableImport"
-  }
-  summary: "Replaces the contents of the table with the specified keys and values."
-  description: <<END
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-END
-}
-op {
-  graph_op_name: "LookupTableImportV2"
-  endpoint {
-    name: "LookupTableImportV2"
-  }
-  summary: "Replaces the contents of the table with the specified keys and values."
-  description: <<END
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-END
-}
-op {
-  graph_op_name: "LookupTableInsert"
-  endpoint {
-    name: "LookupTableInsert"
-  }
-  summary: "Updates the table to associates keys with values."
-  description: <<END
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-END
-}
-op {
-  graph_op_name: "LookupTableInsertV2"
-  endpoint {
-    name: "LookupTableInsertV2"
-  }
-  summary: "Updates the table to associates keys with values."
-  description: <<END
-The tensor `keys` must be of the same type as the keys of the table.
-The tensor `values` must be of the type of the table values.
-END
-}
-op {
-  graph_op_name: "LookupTableSize"
-  endpoint {
-    name: "LookupTableSize"
-  }
-  summary: "Computes the number of elements in the given table."
-}
-op {
-  graph_op_name: "LookupTableSizeV2"
-  endpoint {
-    name: "LookupTableSizeV2"
-  }
-  summary: "Computes the number of elements in the given table."
-}
-op {
-  graph_op_name: "LoopCond"
-  endpoint {
-    name: "LoopCond"
-  }
-  summary: "Forwards the input to the output."
-  description: <<END
-This operator represents the loop termination condition used by the
-"pivot" switches of a loop.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_L2Loss.pbtxt b/tensorflow/core/api_def/base_api/api_def_L2Loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaf4b4ec355e08207f56435e406a5a214e09e0c4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_L2Loss.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "L2Loss"
+  in_arg {
+    name: "t"
+    description: <<END
+Typically 2-D, but may have any dimensions.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+0-D.
+END
+  }
+  summary: "L2 Loss."
+  description: <<END
+Computes half the L2 norm of a tensor without the `sqrt`:
+
+    output = sum(t ** 2) / 2
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LMDBReader.pbtxt b/tensorflow/core/api_def/base_api/api_def_LMDBReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28d19e8658e1e2853a048da4ffc4032b3657b21e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LMDBReader.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "LMDBReader"
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the records from a LMDB file."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LRN.pbtxt b/tensorflow/core/api_def/base_api/api_def_LRN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97108821864f39a5a510bbe06a6cd7f474f0c3d5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LRN.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "LRN"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D.
+END
+  }
+  attr {
+    name: "depth_radius"
+    description: <<END
+0-D.  Half-width of the 1-D normalization window.
+END
+  }
+  attr {
+    name: "bias"
+    description: <<END
+An offset (usually positive to avoid dividing by 0).
+END
+  }
+  attr {
+    name: "alpha"
+    description: <<END
+A scale factor, usually positive.
+END
+  }
+  attr {
+    name: "beta"
+    description: <<END
+An exponent.
+END
+  }
+  summary: "Local Response Normalization."
+  description: <<END
+The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+dimension), and each vector is normalized independently.  Within a given vector,
+each component is divided by the weighted, squared sum of inputs within
+`depth_radius`.  In detail,
+
+    sqr_sum[a, b, c, d] =
+        sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+    output = input / (bias + alpha * sqr_sum) ** beta
+
+For details, see [Krizhevsky et al., ImageNet classification with deep
+convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LRNGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_LRNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b2b289ba68843a6cd725109b138822f8e72bb69
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LRNGrad.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "LRNGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_grads"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "input_image"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "output_image"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The gradients for LRN.
+END
+  }
+  attr {
+    name: "depth_radius"
+    description: <<END
+A depth radius.
+END
+  }
+  attr {
+    name: "bias"
+    description: <<END
+An offset (usually > 0 to avoid dividing by 0).
+END
+  }
+  attr {
+    name: "alpha"
+    description: <<END
+A scale factor, usually positive.
+END
+  }
+  attr {
+    name: "beta"
+    description: <<END
+An exponent.
+END
+  }
+  summary: "Gradients for Local Response Normalization."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78d946b0b47044855ff145e9492fdb3721ff0044
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LatencyStatsDataset"
+  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LearnedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/base_api/api_def_LearnedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7097884fde31a4b0a7b3d4d51309556c5285f2d9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LearnedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,86 @@
+op {
+  graph_op_name: "LearnedUnigramCandidateSampler"
+  in_arg {
+    name: "true_classes"
+    description: <<END
+A batch_size * num_true matrix, in which each row contains the
+IDs of the num_true target_classes in the corresponding original label.
+END
+  }
+  out_arg {
+    name: "sampled_candidates"
+    description: <<END
+A vector of length num_sampled, in which each element is
+the ID of a sampled candidate.
+END
+  }
+  out_arg {
+    name: "true_expected_count"
+    description: <<END
+A batch_size * num_true matrix, representing
+the number of times each candidate is expected to occur in a batch
+of sampled candidates. If unique=true, then this is a probability.
+END
+  }
+  out_arg {
+    name: "sampled_expected_count"
+    description: <<END
+A vector of length num_sampled, for each sampled
+candidate representing the number of times the candidate is expected
+to occur in a batch of sampled candidates.  If unique=true, then this is a
+probability.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "num_sampled"
+    description: <<END
+Number of candidates to randomly sample.
+END
+  }
+  attr {
+    name: "unique"
+    description: <<END
+If unique is true, we sample with rejection, so that all sampled
+candidates in a batch are unique. This requires some approximation to
+estimate the post-rejection sampling probabilities.
+END
+  }
+  attr {
+    name: "range_max"
+    description: <<END
+The sampler will sample integers from the interval [0, range_max).
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Generates labels for candidate sampling with a learned unigram distribution."
+  description: <<END
+See explanations of candidate sampling and the data formats at
+go/candidate-sampling.
+
+For each batch, this op picks a single set of sampled candidate labels.
+
+The advantages of sampling candidates per-batch are simplicity and the
+possibility of efficient dense matrix multiplication. The disadvantage is that
+the sampled candidates must be chosen independently of the context and of the
+true labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LeftShift.pbtxt b/tensorflow/core/api_def/base_api/api_def_LeftShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..622a90d0c7f5838bd8472202a2dae2231673ea33
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LeftShift.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "LeftShift"
+  summary: "Elementwise computes the bitwise left-shift of `x` and `y`."
+  description: <<END
+If `y` is negative, or greater than or equal to the width of `x` in bits the
+result is implementation defined.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Less.pbtxt b/tensorflow/core/api_def/base_api/api_def_Less.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..104d583f42a50998ea557bba6e18d6c7c6b8255d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Less.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Less"
+  summary: "Returns the truth value of (x < y) element-wise."
+  description: <<END
+*NOTE*: `Less` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LessEqual.pbtxt b/tensorflow/core/api_def/base_api/api_def_LessEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..637fe2f47ef41f5fb04895f2ac9e00c757f7b074
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LessEqual.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "LessEqual"
+  summary: "Returns the truth value of (x <= y) element-wise."
+  description: <<END
+*NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Lgamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Lgamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa93f30f387c61ceebde126a0aceb700ad9007bb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Lgamma.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Lgamma"
+  summary: "Computes the log of the absolute value of `Gamma(x)` element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LinSpace.pbtxt b/tensorflow/core/api_def/base_api/api_def_LinSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94a4ef574d9d4e61e6c7336bc2468089a852ad04
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LinSpace.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "LinSpace"
+  in_arg {
+    name: "start"
+    description: <<END
+First entry in the range.
+END
+  }
+  in_arg {
+    name: "stop"
+    description: <<END
+Last entry in the range.
+END
+  }
+  in_arg {
+    name: "num"
+    description: <<END
+Number of values to generate.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D. The generated values.
+END
+  }
+  summary: "Generates values in an interval."
+  description: <<END
+A sequence of `num` evenly-spaced values are generated beginning at `start`.
+If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+so that the last one is exactly `stop`.
+
+For example:
+
+```
+tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ListDiff.pbtxt b/tensorflow/core/api_def/base_api/api_def_ListDiff.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60a91dfaa65eb7e102a965978d55d508430dcb16
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ListDiff.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "ListDiff"
+  endpoint {
+    name: "SetDiff1D"
+  }
+  in_arg {
+    name: "x"
+    description: <<END
+1-D. Values to keep.
+END
+  }
+  in_arg {
+    name: "y"
+    description: <<END
+1-D. Values to remove.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+1-D. Values present in `x` but not in `y`.
+END
+  }
+  out_arg {
+    name: "idx"
+    description: <<END
+1-D. Positions of `x` values preserved in `out`.
+END
+  }
+  summary: "Computes the difference between two lists of numbers or strings."
+  description: <<END
+Given a list `x` and a list `y`, this operation returns a list `out` that
+represents all values that are in `x` but not in `y`. The returned list `out`
+is sorted in the same order that the numbers appear in `x` (duplicates are
+preserved). This operation also returns a list `idx` that represents the
+position of each `out` element in `x`. In other words:
+
+`out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
+
+For example, given this input:
+
+```
+x = [1, 2, 3, 4, 5, 6]
+y = [1, 3, 5]
+```
+
+This operation would return:
+
+```
+out ==> [2, 4, 6]
+idx ==> [1, 3, 5]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadAndRemapMatrix.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadAndRemapMatrix.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1e7007f078b4c88704a69863315bd52a1bfa813
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadAndRemapMatrix.pbtxt
@@ -0,0 +1,105 @@
+op {
+  graph_op_name: "LoadAndRemapMatrix"
+  in_arg {
+    name: "ckpt_path"
+    description: <<END
+Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+which the old matrix `Tensor` will be loaded.
+END
+  }
+  in_arg {
+    name: "old_tensor_name"
+    description: <<END
+Name of the 2-D `Tensor` to load from checkpoint.
+END
+  }
+  in_arg {
+    name: "row_remapping"
+    description: <<END
+An int `Tensor` of row remappings (generally created by
+`generate_vocab_remapping`).  Even if no row remapping is needed, this must
+still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+END
+  }
+  in_arg {
+    name: "col_remapping"
+    description: <<END
+An int `Tensor` of column remappings (generally created by
+`generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+is to be done (e.g. column ordering is the same).
+END
+  }
+  in_arg {
+    name: "initializing_values"
+    description: <<END
+A float `Tensor` containing  values to fill in for cells
+in the output matrix that are not loaded from the checkpoint. Length must be
+exactly the same as the number of missing / new cells.
+END
+  }
+  out_arg {
+    name: "output_matrix"
+    description: <<END
+Output matrix containing existing values loaded from the
+checkpoint, and with any missing values filled in from initializing_values.
+END
+  }
+  attr {
+    name: "num_rows"
+    description: <<END
+Number of rows (length of the 1st dimension) in the output matrix.
+END
+  }
+  attr {
+    name: "num_cols"
+    description: <<END
+Number of columns (length of the 2nd dimension) in the output matrix.
+END
+  }
+  attr {
+    name: "max_rows_in_memory"
+    description: <<END
+The maximum number of rows to load from the checkpoint at
+once. If less than or equal to 0, the entire matrix will be loaded into
+memory. Setting this arg trades increased disk reads for lower memory usage.
+END
+  }
+  summary: "Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint"
+  description: <<END
+at `ckpt_path` and potentially reorders its rows and columns using the
+specified remappings.
+
+Most users should use one of the wrapper initializers (such as
+`tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+function directly.
+
+The remappings are 1-D tensors with the following properties:
+
+* `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+  matrix will be initialized from the row corresponding to index
+  `row_remapping[i]` in the old `Tensor` from the checkpoint.
+* `col_remapping` must have either 0 entries (indicating that no column
+  reordering is needed) or `num_cols` entries. If specified, column `j` of the
+  output matrix will be initialized from the column corresponding to index
+  `col_remapping[j]` in the old `Tensor` from the checkpoint.
+* A value of -1 in either of the remappings signifies a "missing" entry. In that
+  case, values from the `initializing_values` tensor will be used to fill that
+  missing row or column. If `row_remapping` has `r` missing entries and
+  `col_remapping` has `c` missing entries, then the following condition must be
+  true:
+
+`(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+
+The remapping tensors can be generated using the GenerateVocabRemapping op.
+
+As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+the value from row i, column j of the old tensor in the checkpoint, the output
+matrix will look like the following:
+
+[[w(1, 0),  w(1, 2),  0.5],
+ [w(0, 0),  w(0, 2), -0.5],
+ [0.25,    -0.25,      42]]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Log.pbtxt b/tensorflow/core/api_def/base_api/api_def_Log.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..056f1bc2e275fdd95a9aebc02577a6b74c490a84
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Log.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Log"
+  summary: "Computes natural logarithm of x element-wise."
+  description: <<END
+I.e., \\(y = \log_e x\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/base_api/api_def_Log1p.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc9eb2682e76ecc2f34da213f2a5694502ec48b3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Log1p.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Log1p"
+  summary: "Computes natural logarithm of (1 + x) element-wise."
+  description: <<END
+I.e., \\(y = \log_e (1 + x)\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LogMatrixDeterminant.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8245f7d300c3e703a99a823c0c2309e6224de28d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LogMatrixDeterminant.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "LogMatrixDeterminant"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[N, M, M]`.
+END
+  }
+  out_arg {
+    name: "sign"
+    description: <<END
+The signs of the log determinants of the inputs. Shape is `[N]`.
+END
+  }
+  out_arg {
+    name: "log_abs_determinant"
+    description: <<END
+The logs of the absolute values of the determinants
+of the N input matrices.  Shape is `[N]`.
+END
+  }
+  summary: "Computes the sign and the log of the absolute value of the determinant of"
+  description: <<END
+one or more square matrices.
+
+The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+form square matrices. The outputs are two tensors containing the signs and
+absolute values of the log determinants for all N input submatrices
+`[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+is the LU decomposition of the input and P is the corresponding
+permutation matrix.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LogSoftmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba02abdd0aeba28dfb93f13e90d94532f03209be
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LogSoftmax.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "LogSoftmax"
+  in_arg {
+    name: "logits"
+    description: <<END
+2-D with shape `[batch_size, num_classes]`.
+END
+  }
+  out_arg {
+    name: "logsoftmax"
+    description: <<END
+Same shape as `logits`.
+END
+  }
+  summary: "Computes log softmax activations."
+  description: <<END
+For each batch `i` and class `j` we have
+
+    logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LogUniformCandidateSampler.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogUniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c6807bcb288edf75d460e9522efd1176d3512ba
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LogUniformCandidateSampler.pbtxt
@@ -0,0 +1,86 @@
+op {
+  graph_op_name: "LogUniformCandidateSampler"
+  in_arg {
+    name: "true_classes"
+    description: <<END
+A batch_size * num_true matrix, in which each row contains the
+IDs of the num_true target_classes in the corresponding original label.
+END
+  }
+  out_arg {
+    name: "sampled_candidates"
+    description: <<END
+A vector of length num_sampled, in which each element is
+the ID of a sampled candidate.
+END
+  }
+  out_arg {
+    name: "true_expected_count"
+    description: <<END
+A batch_size * num_true matrix, representing
+the number of times each candidate is expected to occur in a batch
+of sampled candidates. If unique=true, then this is a probability.
+END
+  }
+  out_arg {
+    name: "sampled_expected_count"
+    description: <<END
+A vector of length num_sampled, for each sampled
+candidate representing the number of times the candidate is expected
+to occur in a batch of sampled candidates.  If unique=true, then this is a
+probability.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "num_sampled"
+    description: <<END
+Number of candidates to randomly sample.
+END
+  }
+  attr {
+    name: "unique"
+    description: <<END
+If unique is true, we sample with rejection, so that all sampled
+candidates in a batch are unique. This requires some approximation to
+estimate the post-rejection sampling probabilities.
+END
+  }
+  attr {
+    name: "range_max"
+    description: <<END
+The sampler will sample integers from the interval [0, range_max).
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Generates labels for candidate sampling with a log-uniform distribution."
+  description: <<END
+See explanations of candidate sampling and the data formats at
+go/candidate-sampling.
+
+For each batch, this op picks a single set of sampled candidate labels.
+
+The advantages of sampling candidates per-batch are simplicity and the
+possibility of efficient dense matrix multiplication. The disadvantage is that
+the sampled candidates must be chosen independently of the context and of the
+true labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LogicalAnd.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogicalAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ec78d02b097918fc02f705990f0648f91946d2d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LogicalAnd.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "LogicalAnd"
+  summary: "Returns the truth value of x AND y element-wise."
+  description: <<END
+*NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af29e920c9bc5626849b4aa9dc5bb3798f738556
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LogicalNot"
+  summary: "Returns the truth value of NOT x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LogicalOr.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogicalOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4f31cd5213d5b9c577073c1f10e2c0c0a1d03a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LogicalOr.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "LogicalOr"
+  summary: "Returns the truth value of x OR y element-wise."
+  description: <<END
+*NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableExport.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableExport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dfc721ddee64afaf3f0e508f28ef82cfdc340efa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableExport.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LookupTableExport"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  out_arg {
+    name: "keys"
+    description: <<END
+Vector of all keys present in the table.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+Tensor of all values in the table. Indexed in parallel with `keys`.
+END
+  }
+  summary: "Outputs all keys and values in the table."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableExportV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableExportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bc944c9180442a6385534c9e0e89ae148dff5de
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableExportV2.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "LookupTableExportV2"
+  endpoint {
+    name: "LookupTableExport"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  out_arg {
+    name: "keys"
+    description: <<END
+Vector of all keys present in the table.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+Tensor of all values in the table. Indexed in parallel with `keys`.
+END
+  }
+  summary: "Outputs all keys and values in the table."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableFind.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableFind.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce1109e7ebb5e55138d5ae0aa22582afe57041fa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableFind.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "LookupTableFind"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys to look up.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+Same shape as `keys`.  Values found in the table, or `default_values`
+for missing keys.
+END
+  }
+  summary: "Looks up keys in a table, outputs the corresponding values."
+  description: <<END
+The tensor `keys` must of the same type as the keys of the table.
+The output `values` is of the type of the table values.
+
+The scalar `default_value` is the value output for keys not present in the
+table. It must also be of the same type as the table values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableFindV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableFindV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30f69220e8dcad9a89039d157470b0603f2bb73c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableFindV2.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "LookupTableFindV2"
+  endpoint {
+    name: "LookupTableFind"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys to look up.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+Same shape as `keys`.  Values found in the table, or `default_values`
+for missing keys.
+END
+  }
+  summary: "Looks up keys in a table, outputs the corresponding values."
+  description: <<END
+The tensor `keys` must of the same type as the keys of the table.
+The output `values` is of the type of the table values.
+
+The scalar `default_value` is the value output for keys not present in the
+table. It must also be of the same type as the table values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableImport.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableImport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6861c4e97d95e9fdde8ddc7bc1ee87a1ff2837e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableImport.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "LookupTableImport"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys to look up.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Values to associate with keys.
+END
+  }
+  summary: "Replaces the contents of the table with the specified keys and values."
+  description: <<END
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableImportV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableImportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f39fbc499670076c220dbd3a87c329d0fb28ccd4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableImportV2.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LookupTableImportV2"
+  endpoint {
+    name: "LookupTableImport"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys to look up.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Values to associate with keys.
+END
+  }
+  summary: "Replaces the contents of the table with the specified keys and values."
+  description: <<END
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableInsert.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableInsert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f07ac2f3db9bdf980eec55726d25bf69df52415e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableInsert.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "LookupTableInsert"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys to look up.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Values to associate with keys.
+END
+  }
+  summary: "Updates the table to associates keys with values."
+  description: <<END
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableInsertV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableInsertV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b93e68a5b0deb34955aa23e2f9326c6b8fed8175
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableInsertV2.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LookupTableInsertV2"
+  endpoint {
+    name: "LookupTableInsert"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  in_arg {
+    name: "keys"
+    description: <<END
+Any shape.  Keys to look up.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Values to associate with keys.
+END
+  }
+  summary: "Updates the table to associates keys with values."
+  description: <<END
+The tensor `keys` must be of the same type as the keys of the table.
+The tensor `values` must be of the type of the table values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d561c45d6279bd45c40009c93831af32cb7b2366
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableSize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "LookupTableSize"
+  visibility: SKIP
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+Scalar that contains number of elements in the table.
+END
+  }
+  summary: "Computes the number of elements in the given table."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LookupTableSizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_LookupTableSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf5ab2566397d3b2833577add0828313317f73f6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LookupTableSizeV2.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "LookupTableSizeV2"
+  endpoint {
+    name: "LookupTableSize"
+  }
+  in_arg {
+    name: "table_handle"
+    description: <<END
+Handle to the table.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+Scalar that contains number of elements in the table.
+END
+  }
+  summary: "Computes the number of elements in the given table."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoopCond.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoopCond.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b2dbdf4b45df8e87d50b253195a0a8d44b4e5da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoopCond.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "LoopCond"
+  in_arg {
+    name: "input"
+    description: <<END
+A boolean scalar, representing the branch predicate of the Switch op.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `input`.
+END
+  }
+  summary: "Forwards the input to the output."
+  description: <<END
+This operator represents the loop termination condition used by the
+"pivot" switches of a loop.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_M.pbtxt b/tensorflow/core/api_def/base_api/api_def_M.pbtxt
deleted file mode 100644
index 7295928bad88fe1a341c624b6e47a0d6db9c73e2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_M.pbtxt
+++ /dev/null
@@ -1,749 +0,0 @@
-op {
-  graph_op_name: "MakeIterator"
-  endpoint {
-    name: "MakeIterator"
-  }
-  summary: "Makes a new iterator from the given `dataset` and stores it in `iterator`."
-  description: <<END
-This operation may be executed multiple times. Each execution will reset the
-iterator in `iterator` to the first element of `dataset`.
-END
-}
-op {
-  graph_op_name: "MapClear"
-  endpoint {
-    name: "MapClear"
-  }
-  summary: "Op removes all elements in the underlying container."
-}
-op {
-  graph_op_name: "MapDataset"
-  endpoint {
-    name: "MapDataset"
-  }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-}
-op {
-  graph_op_name: "MapIncompleteSize"
-  endpoint {
-    name: "MapIncompleteSize"
-  }
-  summary: "Op returns the number of incomplete elements in the underlying container."
-}
-op {
-  graph_op_name: "MapPeek"
-  endpoint {
-    name: "MapPeek"
-  }
-  summary: "Op peeks at the values at the specified key.  If the"
-  description: <<END
-underlying container does not contain this key
-this op will block until it does.
-END
-}
-op {
-  graph_op_name: "MapSize"
-  endpoint {
-    name: "MapSize"
-  }
-  summary: "Op returns the number of elements in the underlying container."
-}
-op {
-  graph_op_name: "MapStage"
-  endpoint {
-    name: "MapStage"
-  }
-  summary: "Stage (key, values) in the underlying container which behaves like a hashtable."
-}
-op {
-  graph_op_name: "MapUnstage"
-  endpoint {
-    name: "MapUnstage"
-  }
-  summary: "Op removes and returns the values associated with the key"
-  description: <<END
-from the underlying container.   If the underlying container
-does not contain this key, the op will block until it does.
-END
-}
-op {
-  graph_op_name: "MapUnstageNoKey"
-  endpoint {
-    name: "MapUnstageNoKey"
-  }
-  summary: "Op removes and returns a random (key, value)"
-  description: <<END
-from the underlying container.   If the underlying container
-does not contain elements, the op will block until it does.
-END
-}
-op {
-  graph_op_name: "MatMul"
-  endpoint {
-    name: "MatMul"
-  }
-  summary: "Multiply the matrix \"a\" by the matrix \"b\"."
-  description: <<END
-The inputs must be two-dimensional matrices and the inner dimension of
-"a" (after being transposed if transpose_a is true) must match the
-outer dimension of "b" (after being transposed if transposed_b is
-true).
-
-*Note*: The default kernel implementation for MatMul on GPUs uses
-cublas.
-END
-}
-op {
-  graph_op_name: "MatchingFiles"
-  endpoint {
-    name: "MatchingFiles"
-  }
-  summary: "Returns the set of files matching one or more glob patterns."
-  description: <<END
-Note that this routine only supports wildcard characters in the
-basename portion of the pattern, not in the directory portion.
-END
-}
-op {
-  graph_op_name: "MatrixBandPart"
-  endpoint {
-    name: "MatrixBandPart"
-  }
-  summary: "Copy a tensor setting everything outside a central band in each innermost matrix"
-  description: <<END
-to zero.
-
-The `band` part is computed as follows:
-Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-tensor with the same shape where
-
-`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-
-The indicator function
-
-`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-                 (num_upper < 0 || (n-m) <= num_upper)`.
-
-For example:
-
-```
-# if 'input' is [[ 0,  1,  2, 3]
-                 [-1,  0,  1, 2]
-                 [-2, -1,  0, 1]
-                 [-3, -2, -1, 0]],
-
-tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-                                       [-1,  0,  1, 2]
-                                       [ 0, -1,  0, 1]
-                                       [ 0,  0, -1, 0]],
-
-tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-                                      [-1,  0,  1, 0]
-                                      [-2, -1,  0, 1]
-                                      [ 0, -2, -1, 0]]
-```
-
-Useful special cases:
-
-```
- tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
- tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
- tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-```
-END
-}
-op {
-  graph_op_name: "MatrixDeterminant"
-  endpoint {
-    name: "MatrixDeterminant"
-  }
-  summary: "Computes the determinant of one or more square matrices."
-  description: <<END
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. The output is a tensor containing the determinants
-for all input submatrices `[..., :, :]`.
-END
-}
-op {
-  graph_op_name: "MatrixDiag"
-  endpoint {
-    name: "MatrixDiag"
-  }
-  summary: "Returns a batched diagonal tensor with a given batched diagonal values."
-  description: <<END
-Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-everything else padded with zeros. The diagonal is computed as follows:
-
-Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-
-`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-
-For example:
-
-```
-# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-
-and diagonal.shape = (2, 4)
-
-tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-                                     [0, 2, 0, 0]
-                                     [0, 0, 3, 0]
-                                     [0, 0, 0, 4]],
-                                    [[5, 0, 0, 0]
-                                     [0, 6, 0, 0]
-                                     [0, 0, 7, 0]
-                                     [0, 0, 0, 8]]]
-
-which has shape (2, 4, 4)
-```
-END
-}
-op {
-  graph_op_name: "MatrixDiagPart"
-  endpoint {
-    name: "MatrixDiagPart"
-  }
-  summary: "Returns the batched diagonal part of a batched tensor."
-  description: <<END
-This operation returns a tensor with the `diagonal` part
-of the batched `input`. The `diagonal` part is computed as follows:
-
-Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
-
-`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
-
-The input must be at least a matrix.
-
-For example:
-
-```
-# 'input' is [[[1, 0, 0, 0]
-               [0, 2, 0, 0]
-               [0, 0, 3, 0]
-               [0, 0, 0, 4]],
-              [[5, 0, 0, 0]
-               [0, 6, 0, 0]
-               [0, 0, 7, 0]
-               [0, 0, 0, 8]]]
-
-and input.shape = (2, 4, 4)
-
-tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
-
-which has shape (2, 4)
-```
-END
-}
-op {
-  graph_op_name: "MatrixInverse"
-  endpoint {
-    name: "MatrixInverse"
-  }
-  summary: "Computes the inverse of one or more square invertible matrices or their"
-  description: <<END
-adjoints (conjugate transposes).
-
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. The output is a tensor of the same shape as the input
-containing the inverse for all input submatrices `[..., :, :]`.
-
-The op uses LU decomposition with partial pivoting to compute the inverses.
-
-If a matrix is not invertible there is no guarantee what the op does. It
-may detect the condition and raise an exception or it may simply return a
-garbage result.
-END
-}
-op {
-  graph_op_name: "MatrixSetDiag"
-  endpoint {
-    name: "MatrixSetDiag"
-  }
-  summary: "Returns a batched matrix tensor with new batched diagonal values."
-  description: <<END
-Given `input` and `diagonal`, this operation returns a tensor with the
-same shape and values as `input`, except for the main diagonal of the
-innermost matrices.  These will be overwritten by the values in `diagonal`.
-
-The output is computed as follows:
-
-Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-`k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-
-  * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-  * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
-END
-}
-op {
-  graph_op_name: "MatrixSolve"
-  endpoint {
-    name: "MatrixSolve"
-  }
-  summary: "Solves systems of linear equations."
-  description: <<END
-`Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-If `adjoint` is `True` then each output matrix satisfies
-`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
-END
-}
-op {
-  graph_op_name: "MatrixSolveLs"
-  endpoint {
-    name: "MatrixSolveLs"
-  }
-  summary: "Solves one or more linear least-squares problems."
-  description: <<END
-`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-type as `matrix` and shape `[..., M, K]`.
-The output is a tensor shape `[..., N, K]` where each output matrix solves
-each of the equations
-`matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-in the least squares sense.
-
-We use the following notation for (complex) matrix and right-hand sides
-in the batch:
-
-`matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-`rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-`output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-`l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
-
-If `fast` is `True`, then the solution is computed by solving the normal
-equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-\\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-\lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-\\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-minimum-norm solution to the under-determined linear system, i.e.
-\\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-when \\(A\\) is numerically full rank and has a condition number
-\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-sufficiently large.
-
-If `fast` is `False` an algorithm based on the numerically robust complete
-orthogonal decomposition is used. This computes the minimum-norm
-least-squares solution, even when \\(A\\) is rank deficient. This path is
-typically 6-7 times slower than the fast path. If `fast` is `False` then
-`l2_regularizer` is ignored.
-END
-}
-op {
-  graph_op_name: "MatrixTriangularSolve"
-  endpoint {
-    name: "MatrixTriangularSolve"
-  }
-  summary: "Solves systems of linear equations with upper or lower triangular matrices by"
-  description: <<END
-backsubstitution.
-
-`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-square matrices. If `lower` is `True` then the strictly upper triangular part
-of each inner-most matrix is assumed to be zero and not accessed.
-If `lower` is False then the strictly lower triangular part of each inner-most
-matrix is assumed to be zero and not accessed.
-`rhs` is a tensor of shape `[..., M, K]`.
-
-The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-`True` then the innermost matrices in `output` satisfy matrix equations
-`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-If `adjoint` is `False` then the strictly then the  innermost matrices in
-`output` satisfy matrix equations
-`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
-END
-}
-op {
-  graph_op_name: "Max"
-  endpoint {
-    name: "Max"
-  }
-  summary: "Computes the maximum of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "MaxPool"
-  endpoint {
-    name: "MaxPool"
-  }
-  summary: "Performs max pooling on the input."
-}
-op {
-  graph_op_name: "MaxPool3D"
-  endpoint {
-    name: "MaxPool3D"
-  }
-  summary: "Performs 3D max pooling on the input."
-}
-op {
-  graph_op_name: "MaxPool3DGrad"
-  endpoint {
-    name: "MaxPool3DGrad"
-  }
-  summary: "Computes gradients of max pooling function."
-}
-op {
-  graph_op_name: "MaxPool3DGradGrad"
-  endpoint {
-    name: "MaxPool3DGradGrad"
-  }
-  summary: "Computes second-order gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolGrad"
-  endpoint {
-    name: "MaxPoolGrad"
-  }
-  summary: "Computes gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolGradGrad"
-  endpoint {
-    name: "MaxPoolGradGrad"
-  }
-  summary: "Computes second-order gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolGradGradV2"
-  endpoint {
-    name: "MaxPoolGradGradV2"
-  }
-  summary: "Computes second-order gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolGradGradWithArgmax"
-  endpoint {
-    name: "MaxPoolGradGradWithArgmax"
-  }
-  summary: "Computes second-order gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolGradV2"
-  endpoint {
-    name: "MaxPoolGradV2"
-  }
-  summary: "Computes gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolGradWithArgmax"
-  endpoint {
-    name: "MaxPoolGradWithArgmax"
-  }
-  summary: "Computes gradients of the maxpooling function."
-}
-op {
-  graph_op_name: "MaxPoolV2"
-  endpoint {
-    name: "MaxPoolV2"
-  }
-  summary: "Performs max pooling on the input."
-}
-op {
-  graph_op_name: "MaxPoolWithArgmax"
-  endpoint {
-    name: "MaxPoolWithArgmax"
-  }
-  summary: "Performs max pooling on the input and outputs both max values and indices."
-  description: <<END
-The indices in `argmax` are flattened, so that a maximum value at position
-`[b, y, x, c]` becomes flattened index
-`((b * height + y) * width + x) * channels + c`.
-
-The indices returned are always in `[0, height) x [0, width)` before flattening,
-even if padding is involved and the mathematically correct answer is outside
-(either negative or too large).  This is a bug, but fixing it is difficult to do
-in a safe backwards compatible way, especially due to flattening.
-END
-}
-op {
-  graph_op_name: "Maximum"
-  endpoint {
-    name: "Maximum"
-  }
-  summary: "Returns the max of x and y (i.e. x > y ? x : y) element-wise."
-  description: <<END
-*NOTE*: `Maximum` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Mean"
-  endpoint {
-    name: "Mean"
-  }
-  summary: "Computes the mean of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "Merge"
-  endpoint {
-    name: "Merge"
-  }
-  summary: "Forwards the value of an available tensor from `inputs` to `output`."
-  description: <<END
-`Merge` waits for at least one of the tensors in `inputs` to become available.
-It is usually combined with `Switch` to implement branching.
-
-`Merge` forwards the first tensor to become available to `output`, and sets
-`value_index` to its index in `inputs`.
-END
-}
-op {
-  graph_op_name: "MergeSummary"
-  endpoint {
-    name: "MergeSummary"
-  }
-  summary: "Merges summaries."
-  description: <<END
-This op creates a
-[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-protocol buffer that contains the union of all the values in the input
-summaries.
-
-When the Op is run, it reports an `InvalidArgument` error if multiple values
-in the summaries to merge use the same tag.
-END
-}
-op {
-  graph_op_name: "MergeV2Checkpoints"
-  endpoint {
-    name: "MergeV2Checkpoints"
-  }
-  summary: "V2 format specific: merges the metadata files of sharded checkpoints.  The"
-  description: <<END
-result is one logical checkpoint, with one physical metadata file and renamed
-data files.
-
-Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
-
-If delete_old_dirs is true, attempts to delete recursively the dirname of each
-path in the input checkpoint_prefixes.  This is useful when those paths are non
-user-facing temporary locations.
-END
-}
-op {
-  graph_op_name: "Mfcc"
-  endpoint {
-    name: "Mfcc"
-  }
-  summary: "Transforms a spectrogram into a form that\'s useful for speech recognition."
-  description: <<END
-Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-been effective as an input feature for machine learning. They are created by
-taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-higher frequencies that are less significant to the human ear. They have a long
-history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-is a good resource to learn more.
-END
-}
-op {
-  graph_op_name: "Min"
-  endpoint {
-    name: "Min"
-  }
-  summary: "Computes the minimum of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "Minimum"
-  endpoint {
-    name: "Minimum"
-  }
-  summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise."
-  description: <<END
-*NOTE*: `Minimum` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "MirrorPad"
-  endpoint {
-    name: "MirrorPad"
-  }
-  summary: "Pads a tensor with mirrored values."
-  description: <<END
-This operation pads a `input` with mirrored values according to the `paddings`
-you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
-the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-how many values to add before the contents of `input` in that dimension, and
-`paddings[D, 1]` indicates how many values to add after the contents of `input`
-in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
-than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
-(if false, respectively).
-
-The padded size of each dimension D of the output is:
-
-`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 2, 3], [4, 5, 6]].
-# 'paddings' is [[1, 1]], [2, 2]].
-# 'mode' is SYMMETRIC.
-# rank of 't' is 2.
-pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
-                      [2, 1, 1, 2, 3, 3, 2]
-                      [5, 4, 4, 5, 6, 6, 5]
-                      [5, 4, 4, 5, 6, 6, 5]]
-```
-END
-}
-op {
-  graph_op_name: "MirrorPadGrad"
-  endpoint {
-    name: "MirrorPadGrad"
-  }
-  summary: "Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor."
-  description: <<END
-This operation folds the padded areas of `input` by `MirrorPad` according to the
-`paddings` you specify. `paddings` must be the same as `paddings` argument
-given to the corresponding `MirrorPad` op.
-
-The folded size of each dimension D of the output is:
-
-`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
-# 'paddings' is [[0, 1]], [0, 1]].
-# 'mode' is SYMMETRIC.
-# rank of 't' is 2.
-pad(t, paddings) ==> [[ 1,  5]
-                      [11, 28]]
-```
-END
-}
-op {
-  graph_op_name: "Mod"
-  endpoint {
-    name: "Mod"
-  }
-  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
-  description: <<END
-the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-y + truncate_mod(x, y) = x`.
-
-*NOTE*: `Mod` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Mul"
-  endpoint {
-    name: "Mul"
-  }
-  summary: "Returns x * y element-wise."
-  description: <<END
-*NOTE*: `Mul` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Multinomial"
-  endpoint {
-    name: "Multinomial"
-  }
-  summary: "Draws samples from a multinomial distribution."
-}
-op {
-  graph_op_name: "MutableDenseHashTable"
-  endpoint {
-    name: "MutableDenseHashTable"
-  }
-  summary: "Creates an empty hash table that uses tensors as the backing store."
-  description: <<END
-It uses "open addressing" with quadratic reprobing to resolve
-collisions.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-END
-}
-op {
-  graph_op_name: "MutableDenseHashTableV2"
-  endpoint {
-    name: "MutableDenseHashTableV2"
-  }
-  summary: "Creates an empty hash table that uses tensors as the backing store."
-  description: <<END
-It uses "open addressing" with quadratic reprobing to resolve
-collisions.
-
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-END
-}
-op {
-  graph_op_name: "MutableHashTable"
-  endpoint {
-    name: "MutableHashTable"
-  }
-  summary: "Creates an empty hash table."
-  description: <<END
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-END
-}
-op {
-  graph_op_name: "MutableHashTableOfTensors"
-  endpoint {
-    name: "MutableHashTableOfTensors"
-  }
-  summary: "Creates an empty hash table."
-  description: <<END
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a vector. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-END
-}
-op {
-  graph_op_name: "MutableHashTableOfTensorsV2"
-  endpoint {
-    name: "MutableHashTableOfTensorsV2"
-  }
-  summary: "Creates an empty hash table."
-  description: <<END
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a vector. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-END
-}
-op {
-  graph_op_name: "MutableHashTableV2"
-  endpoint {
-    name: "MutableHashTableV2"
-  }
-  summary: "Creates an empty hash table."
-  description: <<END
-This op creates a mutable hash table, specifying the type of its keys and
-values. Each value must be a scalar. Data can be inserted into the table using
-the insert operations. It does not support the initialization operation.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_MakeIterator.pbtxt b/tensorflow/core/api_def/base_api/api_def_MakeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..921ea86a4bc983e4da8a7bd130571b0d64680a61
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MakeIterator.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "MakeIterator"
+  summary: "Makes a new iterator from the given `dataset` and stores it in `iterator`."
+  description: <<END
+This operation may be executed multiple times. Each execution will reset the
+iterator in `iterator` to the first element of `dataset`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf544703de559900317f71eaf5936221ac62080e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "MapAndBatchDataset"
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch. It determines the number of concurrent invocations of `f` that process
+elements from `input_dataset` in parallel.
+END
+  }
+  in_arg {
+    name: "num_parallel_batches"
+    description: <<END
+A scalar representing the number of batches to create in
+parallel. Processing multiple batches in parallel benefits workloads prone to
+stragglers.
+END
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset` and then"
+  description: <<END
+batches `batch_size` of them.
+
+Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
+to `batch_size * num_parallel_batches` copies of `f` in parallel.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapClear.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c3c2d48b0c5996b40cc9e66bb1a7a1d50db296b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapClear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapClear"
+  summary: "Op removes all elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76d63ec2478e07d5af09754dc63994841119fa56
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapDataset"
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapIncompleteSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd63305ac205208c496d9dd0d7b6283d1104bbc0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapIncompleteSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapIncompleteSize"
+  summary: "Op returns the number of incomplete elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapPeek.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..80eb6d5943cbcfc15dad585e595cc33800420ecc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapPeek.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "MapPeek"
+  summary: "Op peeks at the values at the specified key.  If the"
+  description: <<END
+underlying container does not contain this key
+this op will block until it does.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9412019f595d350eac4ac0402f6ad1a33a408acf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapSize"
+  summary: "Op returns the number of elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapStage.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..555fe538ef3fc1b78ec6cc1e1e524371e1591bc6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapStage.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "MapStage"
+  in_arg {
+    name: "key"
+    description: <<END
+int64
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+a list of tensors
+dtypes A list of data types that inserted values should adhere to.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+Maximum number of elements in the Staging Area. If > 0, inserts
+on the container will block when the capacity is reached.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container. Otherwise,
+a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+It is necessary to match this name to the matching Unstage Op.
+END
+  }
+  summary: "Stage (key, values) in the underlying container which behaves like a hashtable."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapUnstage.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..29a10cf9282a73a2b59f36897316739e435bf09f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapUnstage.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "MapUnstage"
+  summary: "Op removes and returns the values associated with the key"
+  description: <<END
+from the underlying container.   If the underlying container
+does not contain this key, the op will block until it does.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapUnstageNoKey.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9da7e65d771871c6dae59a976f8949058d60ed4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MapUnstageNoKey.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "MapUnstageNoKey"
+  summary: "Op removes and returns a random (key, value)"
+  description: <<END
+from the underlying container.   If the underlying container
+does not contain elements, the op will block until it does.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc55e81ae1f45276cf55ed68a9cdce7c83327f9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatMul.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "MatMul"
+  attr {
+    name: "transpose_a"
+    description: <<END
+If true, "a" is transposed before multiplication.
+END
+  }
+  attr {
+    name: "transpose_b"
+    description: <<END
+If true, "b" is transposed before multiplication.
+END
+  }
+  summary: "Multiply the matrix \"a\" by the matrix \"b\"."
+  description: <<END
+The inputs must be two-dimensional matrices and the inner dimension of
+"a" (after being transposed if transpose_a is true) must match the
+outer dimension of "b" (after being transposed if transposed_b is
+true).
+
+*Note*: The default kernel implementation for MatMul on GPUs uses
+cublas.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8da76684e5d360dd642167100b04543e93beed0a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatchingFiles.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "MatchingFiles"
+  in_arg {
+    name: "pattern"
+    description: <<END
+Shell wildcard pattern(s). Scalar or vector of type string.
+END
+  }
+  out_arg {
+    name: "filenames"
+    description: <<END
+A vector of matching filenames.
+END
+  }
+  summary: "Returns the set of files matching one or more glob patterns."
+  description: <<END
+Note that this routine only supports wildcard characters in the
+basename portion of the pattern, not in the directory portion.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixBandPart.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaf3d28437beb8a3767d13fca57942b46ca62699
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixBandPart.pbtxt
@@ -0,0 +1,71 @@
+op {
+  graph_op_name: "MatrixBandPart"
+  in_arg {
+    name: "input"
+    description: <<END
+Rank `k` tensor.
+END
+  }
+  in_arg {
+    name: "num_lower"
+    description: <<END
+0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+lower triangle.
+END
+  }
+  in_arg {
+    name: "num_upper"
+    description: <<END
+0-D tensor. Number of superdiagonals to keep. If negative, keep
+entire upper triangle.
+END
+  }
+  out_arg {
+    name: "band"
+    description: <<END
+Rank `k` tensor of the same shape as input. The extracted banded tensor.
+END
+  }
+  summary: "Copy a tensor setting everything outside a central band in each innermost matrix"
+  description: <<END
+to zero.
+
+The `band` part is computed as follows:
+Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+tensor with the same shape where
+
+`band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+
+The indicator function
+
+`in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+                 (num_upper < 0 || (n-m) <= num_upper)`.
+
+For example:
+
+```
+# if 'input' is [[ 0,  1,  2, 3]
+                 [-1,  0,  1, 2]
+                 [-2, -1,  0, 1]
+                 [-3, -2, -1, 0]],
+
+tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+                                       [-1,  0,  1, 2]
+                                       [ 0, -1,  0, 1]
+                                       [ 0,  0, -1, 0]],
+
+tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+                                      [-1,  0,  1, 0]
+                                      [-2, -1,  0, 1]
+                                      [ 0, -2, -1, 0]]
+```
+
+Useful special cases:
+
+```
+ tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+ tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+ tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixDeterminant.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0acfee2a3097f11e3820c4ab82d7c0520bc5ac25
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixDeterminant.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "MatrixDeterminant"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[...]`.
+END
+  }
+  summary: "Computes the determinant of one or more square matrices."
+  description: <<END
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor containing the determinants
+for all input submatrices `[..., :, :]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixDiag.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59f8902d54c25ed2fcca11b3d1421fe637d9184c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixDiag.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "MatrixDiag"
+  in_arg {
+    name: "diagonal"
+    description: <<END
+Rank `k`, where `k >= 1`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+END
+  }
+  summary: "Returns a batched diagonal tensor with a given batched diagonal values."
+  description: <<END
+Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+everything else padded with zeros. The diagonal is computed as follows:
+
+Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+
+`output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+
+For example:
+
+```
+# 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+
+and diagonal.shape = (2, 4)
+
+tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+                                     [0, 2, 0, 0]
+                                     [0, 0, 3, 0]
+                                     [0, 0, 0, 4]],
+                                    [[5, 0, 0, 0]
+                                     [0, 6, 0, 0]
+                                     [0, 0, 7, 0]
+                                     [0, 0, 0, 8]]]
+
+which has shape (2, 4, 4)
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixDiagPart.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c2dbc7f262693e7af2c8ab703e0bc034cc7f6ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixDiagPart.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "MatrixDiagPart"
+  in_arg {
+    name: "input"
+    description: <<END
+Rank `k` tensor where `k >= 2`.
+END
+  }
+  out_arg {
+    name: "diagonal"
+    description: <<END
+The extracted diagonal(s) having shape
+`diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
+END
+  }
+  summary: "Returns the batched diagonal part of a batched tensor."
+  description: <<END
+This operation returns a tensor with the `diagonal` part
+of the batched `input`. The `diagonal` part is computed as follows:
+
+Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
+
+`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
+
+The input must be at least a matrix.
+
+For example:
+
+```
+# 'input' is [[[1, 0, 0, 0]
+               [0, 2, 0, 0]
+               [0, 0, 3, 0]
+               [0, 0, 0, 4]],
+              [[5, 0, 0, 0]
+               [0, 6, 0, 0]
+               [0, 0, 7, 0]
+               [0, 0, 0, 8]]]
+
+and input.shape = (2, 4, 4)
+
+tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
+
+which has shape (2, 4)
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d680f653121677e97d88655979521c67d566882
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixExponential.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "MatrixExponential"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, M]`.
+
+@compatibility(scipy)
+Equivalent to scipy.linalg.expm
+@end_compatibility
+END
+  }
+  summary: "Computes the matrix exponential of one or more square matrices:"
+  description: <<END
+exp(A) = \sum_{n=0}^\infty A^n/n!
+
+The exponential is computed using a combination of the scaling and squaring
+method and the Pade approximation. Details can be founds in:
+Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor of the same shape as the input
+containing the exponential for all input submatrices `[..., :, :]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixInverse.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25eca0c766bf33305665b12dba2c9f1ba2cac1a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixInverse.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "MatrixInverse"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, M]`.
+
+@compatibility(numpy)
+Equivalent to np.linalg.inv
+@end_compatibility
+END
+  }
+  summary: "Computes the inverse of one or more square invertible matrices or their"
+  description: <<END
+adjoints (conjugate transposes).
+
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor of the same shape as the input
+containing the inverse for all input submatrices `[..., :, :]`.
+
+The op uses LU decomposition with partial pivoting to compute the inverses.
+
+If a matrix is not invertible there is no guarantee what the op does. It
+may detect the condition and raise an exception or it may simply return a
+garbage result.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixSetDiag.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5190902d7e1964344a0966c10ed757795b818818
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixSetDiag.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "MatrixSetDiag"
+  in_arg {
+    name: "input"
+    description: <<END
+Rank `k+1`, where `k >= 1`.
+END
+  }
+  in_arg {
+    name: "diagonal"
+    description: <<END
+Rank `k`, where `k >= 1`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Rank `k+1`, with `output.shape = input.shape`.
+END
+  }
+  summary: "Returns a batched matrix tensor with new batched diagonal values."
+  description: <<END
+Given `input` and `diagonal`, this operation returns a tensor with the
+same shape and values as `input`, except for the main diagonal of the
+innermost matrices.  These will be overwritten by the values in `diagonal`.
+
+The output is computed as follows:
+
+Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+`k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+
+  * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+  * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3b1216d404e3dce12996fdc9a91e7336476dd82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixSolve.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "MatrixSolve"
+  in_arg {
+    name: "matrix"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  attr {
+    name: "adjoint"
+    description: <<END
+Boolean indicating whether to solve with `matrix` or its (block-wise)
+adjoint.
+END
+  }
+  summary: "Solves systems of linear equations."
+  description: <<END
+`Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+If `adjoint` is `True` then each output matrix satisfies
+`adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51d91399f8a53325b03e67e643c1375c2bd7cf22
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixSolveLs.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "MatrixSolveLs"
+  in_arg {
+    name: "matrix"
+    description: <<END
+Shape is `[..., M, N]`.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  in_arg {
+    name: "l2_regularizer"
+    description: <<END
+Scalar tensor.
+
+@compatibility(numpy)
+Equivalent to np.linalg.lstsq
+@end_compatibility
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., N, K]`.
+END
+  }
+  summary: "Solves one or more linear least-squares problems."
+  description: <<END
+`matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+type as `matrix` and shape `[..., M, K]`.
+The output is a tensor shape `[..., N, K]` where each output matrix solves
+each of the equations
+`matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+in the least squares sense.
+
+We use the following notation for (complex) matrix and right-hand sides
+in the batch:
+
+`matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+`rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+`output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+`l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+
+If `fast` is `True`, then the solution is computed by solving the normal
+equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+\\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+\lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+\\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+minimum-norm solution to the under-determined linear system, i.e.
+\\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+when \\(A\\) is numerically full rank and has a condition number
+\\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+sufficiently large.
+
+If `fast` is `False` an algorithm based on the numerically robust complete
+orthogonal decomposition is used. This computes the minimum-norm
+least-squares solution, even when \\(A\\) is rank deficient. This path is
+typically 6-7 times slower than the fast path. If `fast` is `False` then
+`l2_regularizer` is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2bfcdc66e48183edf054a3acbcaa462c80901fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "MatrixTriangularSolve"
+  in_arg {
+    name: "matrix"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  attr {
+    name: "lower"
+    description: <<END
+Boolean indicating whether the innermost matrices in `matrix` are
+lower or upper triangular.
+END
+  }
+  attr {
+    name: "adjoint"
+    description: <<END
+Boolean indicating whether to solve with `matrix` or its (block-wise)
+         adjoint.
+
+@compatibility(numpy)
+Equivalent to np.linalg.triangular_solve
+@end_compatibility
+END
+  }
+  summary: "Solves systems of linear equations with upper or lower triangular matrices by"
+  description: <<END
+backsubstitution.
+
+`matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+square matrices. If `lower` is `True` then the strictly upper triangular part
+of each inner-most matrix is assumed to be zero and not accessed.
+If `lower` is False then the strictly lower triangular part of each inner-most
+matrix is assumed to be zero and not accessed.
+`rhs` is a tensor of shape `[..., M, K]`.
+
+The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+`True` then the innermost matrices in `output` satisfy matrix equations
+`matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+If `adjoint` is `False` then the strictly then the  innermost matrices in
+`output` satisfy matrix equations
+`adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Max.pbtxt b/tensorflow/core/api_def/base_api/api_def_Max.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a807d9f3755bba32c5c65b06d562ae1e152568e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Max.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Max"
+  endpoint {
+    name: "Max"
+  }
+  endpoint {
+    name: "ReduceMax"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the maximum of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPool.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..885bc1c2792bb5c825eb7ac6f029eb822f92fee7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPool.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "MaxPool"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D input to pool over.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The max pooled output tensor.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Performs max pooling on the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPool3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f07ee5fc1bdad548142b8a771d52e15be7dc0bc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPool3D.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "MaxPool3D"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The max pooled output tensor.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+1-D tensor of length 5. The size of the window for each dimension of
+the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  summary: "Performs 3D max pooling on the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPool3DGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78c3c5f4bdb565c10eb8dbc794086300f4cb083d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPool3DGrad.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "MaxPool3DGrad"
+  in_arg {
+    name: "orig_input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+The original output tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+Output backprop of shape `[batch, depth, rows, cols, channels]`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+1-D tensor of length 5. The size of the window for each dimension of
+the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  summary: "Computes gradients of max pooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPool3DGradGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPool3DGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7593e9a7fe815be74f8a3ff0877446991759e645
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPool3DGradGrad.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "MaxPool3DGradGrad"
+  in_arg {
+    name: "orig_input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+The original output tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+Output backprop of shape `[batch, depth, rows, cols, channels]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients of gradients w.r.t. the input to `max_pool`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+1-D tensor of length 5. The size of the window for each dimension of
+the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+1-D tensor of length 5. The stride of the sliding window for each
+dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+The data format of the input and output data. With the
+default format "NDHWC", the data is stored in the order of:
+    [batch, in_depth, in_height, in_width, in_channels].
+Alternatively, the format could be "NCDHW", the data storage order is:
+    [batch, in_channels, in_depth, in_height, in_width].
+END
+  }
+  summary: "Computes second-order gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be3e1972a03074211426f249070cdc86c1f3a467
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGrad.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "MaxPoolGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "orig_input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+The original output tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D.  Gradients w.r.t. the output of `max_pool`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients w.r.t. the input to `max_pool`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Computes gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83f319001f16f7d00df75ceb0257424450393714
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGrad.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "MaxPoolGradGrad"
+  in_arg {
+    name: "orig_input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+The original output tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients of gradients w.r.t. the input to `max_pool`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Computes second-order gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a55e02ac4022417b6f5d68a0e91762e5216ee49e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradV2.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "MaxPoolGradGradV2"
+  in_arg {
+    name: "orig_input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+The original output tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+END
+  }
+  in_arg {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  in_arg {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients of gradients w.r.t. the input to `max_pool`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Computes second-order gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..63c5604d60e682efc378e91862e5c18f0082bc23
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "MaxPoolGradGradWithArgmax"
+  in_arg {
+    name: "input"
+    description: <<END
+The original input.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+input of `max_pool`.
+END
+  }
+  in_arg {
+    name: "argmax"
+    description: <<END
+The indices of the maximum values chosen for each output of `max_pool`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients of gradients w.r.t. the input of `max_pool`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes second-order gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e72877bb328007de3a9f49f5739bca1873fda06b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradV2.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "MaxPoolGradV2"
+  in_arg {
+    name: "orig_input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "orig_output"
+    description: <<END
+The original output tensor.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D.  Gradients w.r.t. the output of `max_pool`.
+END
+  }
+  in_arg {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  in_arg {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients w.r.t. the input to `max_pool`.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Computes gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae503e79d3abddbc972c66a662ed2e39bb4a024
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolGradWithArgmax.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "MaxPoolGradWithArgmax"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+output of `max_pool`.
+END
+  }
+  in_arg {
+    name: "argmax"
+    description: <<END
+The indices of the maximum values chosen for each output of `max_pool`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Gradients w.r.t. the input of `max_pool`.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Computes gradients of the maxpooling function."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51b1edff6fa659d54ba422c24a90775802c2af3c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolV2.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "MaxPoolV2"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D input to pool over.
+END
+  }
+  in_arg {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  in_arg {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The max pooled output tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "data_format"
+    description: <<END
+Specify the data format of the input and output data. With the
+default format "NHWC", the data is stored in the order of:
+    [batch, in_height, in_width, in_channels].
+Alternatively, the format could be "NCHW", the data storage order of:
+    [batch, in_channels, in_height, in_width].
+END
+  }
+  summary: "Performs max pooling on the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e717e57b50af3ee897c4ea7c309aebd72096f8a4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "MaxPoolWithArgmax"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The max pooled output tensor.
+END
+  }
+  out_arg {
+    name: "argmax"
+    description: <<END
+4-D.  The flattened indices of the max values chosen for each output.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the
+input tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Performs max pooling on the input and outputs both max values and indices."
+  description: <<END
+The indices in `argmax` are flattened, so that a maximum value at position
+`[b, y, x, c]` becomes flattened index
+`((b * height + y) * width + x) * channels + c`.
+
+The indices returned are always in `[0, height) x [0, width)` before flattening,
+even if padding is involved and the mathematically correct answer is outside
+(either negative or too large).  This is a bug, but fixing it is difficult to do
+in a safe backwards compatible way, especially due to flattening.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Maximum.pbtxt b/tensorflow/core/api_def/base_api/api_def_Maximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e52ca3f45d4d17db7d9353691938471f65bfa199
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Maximum.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Maximum"
+  summary: "Returns the max of x and y (i.e. x > y ? x : y) element-wise."
+  description: <<END
+*NOTE*: `Maximum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Mean.pbtxt b/tensorflow/core/api_def/base_api/api_def_Mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71301621358fdce9d27a059fd6890608010a4f09
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Mean.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Mean"
+  endpoint {
+    name: "Mean"
+  }
+  endpoint {
+    name: "ReduceMean"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the mean of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Merge.pbtxt b/tensorflow/core/api_def/base_api/api_def_Merge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..130c384158b96788aab47c39721cd608bcd6d1c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Merge.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "Merge"
+  in_arg {
+    name: "inputs"
+    description: <<END
+The input tensors, exactly one of which will become available.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Will be set to the available input tensor.
+END
+  }
+  out_arg {
+    name: "value_index"
+    description: <<END
+The index of the chosen input tensor in `inputs`.
+END
+  }
+  summary: "Forwards the value of an available tensor from `inputs` to `output`."
+  description: <<END
+`Merge` waits for at least one of the tensors in `inputs` to become available.
+It is usually combined with `Switch` to implement branching.
+
+`Merge` forwards the first tensor to become available to `output`, and sets
+`value_index` to its index in `inputs`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MergeSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_MergeSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82596901848dc1678f9a30b203b19a1cc4b9f1fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MergeSummary.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "MergeSummary"
+  in_arg {
+    name: "inputs"
+    description: <<END
+Can be of any shape.  Each must contain serialized `Summary` protocol
+buffers.
+END
+  }
+  out_arg {
+    name: "summary"
+    description: <<END
+Scalar. Serialized `Summary` protocol buffer.
+END
+  }
+  summary: "Merges summaries."
+  description: <<END
+This op creates a
+[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+protocol buffer that contains the union of all the values in the input
+summaries.
+
+When the Op is run, it reports an `InvalidArgument` error if multiple values
+in the summaries to merge use the same tag.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MergeV2Checkpoints.pbtxt b/tensorflow/core/api_def/base_api/api_def_MergeV2Checkpoints.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88cc164eb17beec58eb3c55b7cbfe6d534439a9c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MergeV2Checkpoints.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "MergeV2Checkpoints"
+  in_arg {
+    name: "checkpoint_prefixes"
+    description: <<END
+prefixes of V2 checkpoints to merge.
+END
+  }
+  in_arg {
+    name: "destination_prefix"
+    description: <<END
+scalar.  The desired final prefix.  Allowed to be the same
+as one of the checkpoint_prefixes.
+END
+  }
+  attr {
+    name: "delete_old_dirs"
+    description: <<END
+see above.
+END
+  }
+  summary: "V2 format specific: merges the metadata files of sharded checkpoints.  The"
+  description: <<END
+result is one logical checkpoint, with one physical metadata file and renamed
+data files.
+
+Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+
+If delete_old_dirs is true, attempts to delete recursively the dirname of each
+path in the input checkpoint_prefixes.  This is useful when those paths are non
+user-facing temporary locations.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Mfcc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Mfcc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..217a0367a522f337205159463db67d11759d426e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Mfcc.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "Mfcc"
+  in_arg {
+    name: "spectrogram"
+    description: <<END
+Typically produced by the Spectrogram op, with magnitude_squared
+set to true.
+END
+  }
+  in_arg {
+    name: "sample_rate"
+    description: <<END
+How many samples per second the source audio used.
+END
+  }
+  attr {
+    name: "upper_frequency_limit"
+    description: <<END
+The highest frequency to use when calculating the
+ceptstrum.
+END
+  }
+  attr {
+    name: "lower_frequency_limit"
+    description: <<END
+The lowest frequency to use when calculating the
+ceptstrum.
+END
+  }
+  attr {
+    name: "filterbank_channel_count"
+    description: <<END
+Resolution of the Mel bank used internally.
+END
+  }
+  attr {
+    name: "dct_coefficient_count"
+    description: <<END
+How many output channels to produce per time slice.
+END
+  }
+  summary: "Transforms a spectrogram into a form that\'s useful for speech recognition."
+  description: <<END
+Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+been effective as an input feature for machine learning. They are created by
+taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+higher frequencies that are less significant to the human ear. They have a long
+history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+is a good resource to learn more.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Min.pbtxt b/tensorflow/core/api_def/base_api/api_def_Min.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ddc865ab5233dec68101d8e693c00dfa2a2c7f8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Min.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Min"
+  endpoint {
+    name: "Min"
+  }
+  endpoint {
+    name: "ReduceMin"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the minimum of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Minimum.pbtxt b/tensorflow/core/api_def/base_api/api_def_Minimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d0997f1a5c10226ddd838d92d000845a88453235
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Minimum.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Minimum"
+  summary: "Returns the min of x and y (i.e. x < y ? x : y) element-wise."
+  description: <<END
+*NOTE*: `Minimum` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MirrorPad.pbtxt b/tensorflow/core/api_def/base_api/api_def_MirrorPad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f738f72ce94f6586a7e6b6d57b096883807c8c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MirrorPad.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "MirrorPad"
+  in_arg {
+    name: "input"
+    description: <<END
+The input tensor to be padded.
+END
+  }
+  in_arg {
+    name: "paddings"
+    description: <<END
+A two-column matrix specifying the padding sizes. The number of
+rows must be the same as the rank of `input`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The padded tensor.
+END
+  }
+  attr {
+    name: "mode"
+    description: <<END
+Either `REFLECT` or `SYMMETRIC`. In reflect mode the padded regions
+do not include the borders, while in symmetric mode the padded regions
+do include the borders. For example, if `input` is `[1, 2, 3]` and `paddings`
+is `[0, 2]`, then the output is `[1, 2, 3, 2, 1]` in reflect mode, and
+it is `[1, 2, 3, 3, 2]` in symmetric mode.
+END
+  }
+  summary: "Pads a tensor with mirrored values."
+  description: <<END
+This operation pads a `input` with mirrored values according to the `paddings`
+you specify. `paddings` is an integer tensor with shape `[n, 2]`, where n is
+the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+how many values to add before the contents of `input` in that dimension, and
+`paddings[D, 1]` indicates how many values to add after the contents of `input`
+in that dimension. Both `paddings[D, 0]` and `paddings[D, 1]` must be no greater
+than `input.dim_size(D)` (or `input.dim_size(D) - 1`) if `copy_border` is true
+(if false, respectively).
+
+The padded size of each dimension D of the output is:
+
+`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+
+For example:
+
+```
+# 't' is [[1, 2, 3], [4, 5, 6]].
+# 'paddings' is [[1, 1]], [2, 2]].
+# 'mode' is SYMMETRIC.
+# rank of 't' is 2.
+pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2]
+                      [2, 1, 1, 2, 3, 3, 2]
+                      [5, 4, 4, 5, 6, 6, 5]
+                      [5, 4, 4, 5, 6, 6, 5]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MirrorPadGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_MirrorPadGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20db99a9d1af034c6d0cec8fcfd0bdcc1c65939d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MirrorPadGrad.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "MirrorPadGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The input tensor to be folded.
+END
+  }
+  in_arg {
+    name: "paddings"
+    description: <<END
+A two-column matrix specifying the padding sizes. The number of
+rows must be the same as the rank of `input`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The folded tensor.
+END
+  }
+  attr {
+    name: "mode"
+    description: <<END
+The mode used in the `MirrorPad` op.
+END
+  }
+  summary: "Gradient op for `MirrorPad` op. This op folds a mirror-padded tensor."
+  description: <<END
+This operation folds the padded areas of `input` by `MirrorPad` according to the
+`paddings` you specify. `paddings` must be the same as `paddings` argument
+given to the corresponding `MirrorPad` op.
+
+The folded size of each dimension D of the output is:
+
+`input.dim_size(D) - paddings(D, 0) - paddings(D, 1)`
+
+For example:
+
+```
+# 't' is [[1, 2, 3], [4, 5, 6], [7, 8, 9]].
+# 'paddings' is [[0, 1]], [0, 1]].
+# 'mode' is SYMMETRIC.
+# rank of 't' is 2.
+pad(t, paddings) ==> [[ 1,  5]
+                      [11, 28]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Mod.pbtxt b/tensorflow/core/api_def/base_api/api_def_Mod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a49ccff687024450d681442880455b74fce30a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Mod.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "Mod"
+  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
+  description: <<END
+the result here is consistent with a truncating divide. E.g.
+`tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+
+*NOTE*: `Mod` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Mul.pbtxt b/tensorflow/core/api_def/base_api/api_def_Mul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13fad871f38d6159da9ff0978a1e49bc78ffe71f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Mul.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "Mul"
+  endpoint {
+    name: "Multiply"
+  }
+  endpoint {
+    name: "Mul"
+  }
+  summary: "Returns x * y element-wise."
+  description: <<END
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Multinomial.pbtxt b/tensorflow/core/api_def/base_api/api_def_Multinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..974e81e0fda5fd3b77d025a8960e9a61e6e58bcd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Multinomial.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "Multinomial"
+  in_arg {
+    name: "logits"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+represents the unnormalized log probabilities for all classes.
+END
+  }
+  in_arg {
+    name: "num_samples"
+    description: <<END
+0-D.  Number of independent samples to draw for each row slice.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+contains the drawn class labels with range `[0, num_classes)`.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 is set to be non-zero, the internal random number
+generator is seeded by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  summary: "Draws samples from a multinomial distribution."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutableDenseHashTable.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutableDenseHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaaed081cc8f92889bb8f76b3a190ebc3613968b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutableDenseHashTable.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "MutableDenseHashTable"
+  visibility: SKIP
+  in_arg {
+    name: "empty_key"
+    description: <<END
+The key used to represent empty key buckets internally. Must not
+be used in insert or lookup operations.
+END
+  }
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  attr {
+    name: "value_shape"
+    description: <<END
+The shape of each value.
+END
+  }
+  attr {
+    name: "initial_num_buckets"
+    description: <<END
+The initial number of hash table buckets. Must be a power
+to 2.
+END
+  }
+  attr {
+    name: "max_load_factor"
+    description: <<END
+The maximum ratio between number of entries and number of
+buckets before growing the table. Must be between 0 and 1.
+END
+  }
+  summary: "Creates an empty hash table that uses tensors as the backing store."
+  description: <<END
+It uses "open addressing" with quadratic reprobing to resolve
+collisions.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutableDenseHashTableV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutableDenseHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..55fce8317536ccde027a1fbad059837ccd462ffa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutableDenseHashTableV2.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "MutableDenseHashTableV2"
+  endpoint {
+    name: "MutableDenseHashTable"
+  }
+  in_arg {
+    name: "empty_key"
+    description: <<END
+The key used to represent empty key buckets internally. Must not
+be used in insert or lookup operations.
+END
+  }
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  attr {
+    name: "value_shape"
+    description: <<END
+The shape of each value.
+END
+  }
+  attr {
+    name: "initial_num_buckets"
+    description: <<END
+The initial number of hash table buckets. Must be a power
+to 2.
+END
+  }
+  attr {
+    name: "max_load_factor"
+    description: <<END
+The maximum ratio between number of entries and number of
+buckets before growing the table. Must be between 0 and 1.
+END
+  }
+  summary: "Creates an empty hash table that uses tensors as the backing store."
+  description: <<END
+It uses "open addressing" with quadratic reprobing to resolve
+collisions.
+
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutableHashTable.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutableHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4bcdcdaf8ae916bdb1662308c36d5fa4b2a7d2b3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutableHashTable.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "MutableHashTable"
+  visibility: SKIP
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "use_node_name_sharing"
+    description: <<END
+If true and shared_name is empty, the table is shared
+using the node name.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  summary: "Creates an empty hash table."
+  description: <<END
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutableHashTableOfTensors.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutableHashTableOfTensors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bb37a3c4045446c13aae5b2cf229e20259aee15
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutableHashTableOfTensors.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "MutableHashTableOfTensors"
+  visibility: SKIP
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  summary: "Creates an empty hash table."
+  description: <<END
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a vector. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutableHashTableOfTensorsV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutableHashTableOfTensorsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1007cc96c0bb54db6301242cdd06b97e5523c22b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutableHashTableOfTensorsV2.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "MutableHashTableOfTensorsV2"
+  endpoint {
+    name: "MutableHashTableOfTensors"
+  }
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  summary: "Creates an empty hash table."
+  description: <<END
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a vector. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MutableHashTableV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_MutableHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b37b5b07f8c24bac26a8ce770f752070d3fe2e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MutableHashTableV2.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "MutableHashTableV2"
+  endpoint {
+    name: "MutableHashTable"
+  }
+  out_arg {
+    name: "table_handle"
+    description: <<END
+Handle to a table.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this table is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this table is shared under the given name across
+multiple sessions.
+END
+  }
+  attr {
+    name: "use_node_name_sharing"
+    description: <<END
+If true and shared_name is empty, the table is shared
+using the node name.
+END
+  }
+  attr {
+    name: "key_dtype"
+    description: <<END
+Type of the table keys.
+END
+  }
+  attr {
+    name: "value_dtype"
+    description: <<END
+Type of the table values.
+END
+  }
+  summary: "Creates an empty hash table."
+  description: <<END
+This op creates a mutable hash table, specifying the type of its keys and
+values. Each value must be a scalar. Data can be inserted into the table using
+the insert operations. It does not support the initialization operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_N.pbtxt b/tensorflow/core/api_def/base_api/api_def_N.pbtxt
deleted file mode 100644
index 0298a42cab9109bb96af4e8bdea25c6211574523..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_N.pbtxt
+++ /dev/null
@@ -1,94 +0,0 @@
-op {
-  graph_op_name: "Neg"
-  endpoint {
-    name: "Neg"
-  }
-  summary: "Computes numerical negative value element-wise."
-  description: <<END
-I.e., \\(y = -x\\).
-END
-}
-op {
-  graph_op_name: "NegTrain"
-  endpoint {
-    name: "NegTrain"
-  }
-  summary: "Training via negative sampling."
-}
-op {
-  graph_op_name: "NextIteration"
-  endpoint {
-    name: "NextIteration"
-  }
-  summary: "Makes its input available to the next iteration."
-}
-op {
-  graph_op_name: "NoOp"
-  endpoint {
-    name: "NoOp"
-  }
-  summary: "Does nothing. Only useful as a placeholder for control edges."
-}
-op {
-  graph_op_name: "NonMaxSuppression"
-  endpoint {
-    name: "NonMaxSuppression"
-  }
-  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
-  description: <<END
-pruning away boxes that have high intersection-over-union (IOU) overlap
-with previously selected boxes.  Bounding boxes are supplied as
-[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-diagonal pair of box corners and the coordinates can be provided as normalized
-(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-is agnostic to where the origin is in the coordinate system.  Note that this
-algorithm is invariant to orthogonal transformations and translations
-of the coordinate system; thus translating or reflections of the coordinate
-system result in the same boxes being selected by the algorithm.
-The output of this operation is a set of integers indexing into the input
-collection of bounding boxes representing the selected boxes.  The bounding
-box coordinates corresponding to the selected indices can then be obtained
-using the `tf.gather operation`.  For example:
-  selected_indices = tf.image.non_max_suppression(
-      boxes, scores, max_output_size, iou_threshold)
-  selected_boxes = tf.gather(boxes, selected_indices)
-END
-}
-op {
-  graph_op_name: "NonMaxSuppressionV2"
-  endpoint {
-    name: "NonMaxSuppressionV2"
-  }
-  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
-  description: <<END
-pruning away boxes that have high intersection-over-union (IOU) overlap
-with previously selected boxes.  Bounding boxes are supplied as
-[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-diagonal pair of box corners and the coordinates can be provided as normalized
-(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-is agnostic to where the origin is in the coordinate system.  Note that this
-algorithm is invariant to orthogonal transformations and translations
-of the coordinate system; thus translating or reflections of the coordinate
-system result in the same boxes being selected by the algorithm.
-
-The output of this operation is a set of integers indexing into the input
-collection of bounding boxes representing the selected boxes.  The bounding
-box coordinates corresponding to the selected indices can then be obtained
-using the `tf.gather operation`.  For example:
-
-  selected_indices = tf.image.non_max_suppression_v2(
-      boxes, scores, max_output_size, iou_threshold)
-  selected_boxes = tf.gather(boxes, selected_indices)
-END
-}
-op {
-  graph_op_name: "NotEqual"
-  endpoint {
-    name: "NotEqual"
-  }
-  summary: "Returns the truth value of (x != y) element-wise."
-  description: <<END
-*NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/base_api/api_def_Neg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dafa218e5a92a321b1df8e1f77ea7b160e90b7fa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Neg.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "Neg"
+  endpoint {
+    name: "Negate"
+  }
+  endpoint {
+    name: "Neg"
+  }
+  summary: "Computes numerical negative value element-wise."
+  description: <<END
+I.e., \\(y = -x\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NegTrain.pbtxt b/tensorflow/core/api_def/base_api/api_def_NegTrain.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c8efac053b7e23db7eaee64432c0f4a6b2cfb10
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NegTrain.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "NegTrain"
+  in_arg {
+    name: "w_in"
+    description: <<END
+input word embedding.
+END
+  }
+  in_arg {
+    name: "w_out"
+    description: <<END
+output word embedding.
+END
+  }
+  in_arg {
+    name: "examples"
+    description: <<END
+A vector of word ids.
+END
+  }
+  in_arg {
+    name: "labels"
+    description: <<END
+A vector of word ids.
+END
+  }
+  attr {
+    name: "vocab_count"
+    description: <<END
+Count of words in the vocabulary.
+END
+  }
+  attr {
+    name: "num_negative_samples"
+    description: <<END
+Number of negative samples per example.
+END
+  }
+  summary: "Training via negative sampling."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NextIteration.pbtxt b/tensorflow/core/api_def/base_api/api_def_NextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13178619efe2411adc184b861cfe258643dcdf56
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NextIteration.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "NextIteration"
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be made available to the next iteration.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `data`.
+END
+  }
+  summary: "Makes its input available to the next iteration."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NoOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_NoOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d860149adc3b6975e8dbf3301210ffbc68d1a34e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NoOp.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NoOp"
+  summary: "Does nothing. Only useful as a placeholder for control edges."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NonMaxSuppression.pbtxt b/tensorflow/core/api_def/base_api/api_def_NonMaxSuppression.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8352b1b8c3b33c2e1875fe08ffdc6d4c03513c8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NonMaxSuppression.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "NonMaxSuppression"
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 2-D float tensor of shape `[num_boxes, 4]`.
+END
+  }
+  in_arg {
+    name: "scores"
+    description: <<END
+A 1-D float tensor of shape `[num_boxes]` representing a single
+score corresponding to each box (each row of boxes).
+END
+  }
+  in_arg {
+    name: "max_output_size"
+    description: <<END
+A scalar integer tensor representing the maximum number of
+boxes to be selected by non max suppression.
+END
+  }
+  out_arg {
+    name: "selected_indices"
+    description: <<END
+A 1-D integer tensor of shape `[M]` representing the selected
+indices from the boxes tensor, where `M <= max_output_size`.
+END
+  }
+  attr {
+    name: "iou_threshold"
+    description: <<END
+A float representing the threshold for deciding whether boxes
+overlap too much with respect to IOU.
+END
+  }
+  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
+  description: <<END
+pruning away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system.  Note that this
+algorithm is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+The output of this operation is a set of integers indexing into the input
+collection of bounding boxes representing the selected boxes.  The bounding
+box coordinates corresponding to the selected indices can then be obtained
+using the `tf.gather operation`.  For example:
+  selected_indices = tf.image.non_max_suppression(
+      boxes, scores, max_output_size, iou_threshold)
+  selected_boxes = tf.gather(boxes, selected_indices)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NonMaxSuppressionV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_NonMaxSuppressionV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42146d106c38708eb7fe72463b5dadeb1951ee80
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NonMaxSuppressionV2.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "NonMaxSuppressionV2"
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 2-D float tensor of shape `[num_boxes, 4]`.
+END
+  }
+  in_arg {
+    name: "scores"
+    description: <<END
+A 1-D float tensor of shape `[num_boxes]` representing a single
+score corresponding to each box (each row of boxes).
+END
+  }
+  in_arg {
+    name: "max_output_size"
+    description: <<END
+A scalar integer tensor representing the maximum number of
+boxes to be selected by non max suppression.
+END
+  }
+  in_arg {
+    name: "iou_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding whether
+boxes overlap too much with respect to IOU.
+END
+  }
+  out_arg {
+    name: "selected_indices"
+    description: <<END
+A 1-D integer tensor of shape `[M]` representing the selected
+indices from the boxes tensor, where `M <= max_output_size`.
+END
+  }
+  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
+  description: <<END
+pruning away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system.  Note that this
+algorithm is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+
+The output of this operation is a set of integers indexing into the input
+collection of bounding boxes representing the selected boxes.  The bounding
+box coordinates corresponding to the selected indices can then be obtained
+using the `tf.gather operation`.  For example:
+
+  selected_indices = tf.image.non_max_suppression_v2(
+      boxes, scores, max_output_size, iou_threshold)
+  selected_boxes = tf.gather(boxes, selected_indices)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NotEqual.pbtxt b/tensorflow/core/api_def/base_api/api_def_NotEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c4b31853491a71d68b6f6a91fe1327728cd885d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NotEqual.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "NotEqual"
+  summary: "Returns the truth value of (x != y) element-wise."
+  description: <<END
+*NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f5d8496190c87ef7d037f6e6ab5a6c44af100f4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NthElement.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "NthElement"
+  in_arg {
+    name: "input"
+    description: <<END
+1-D or higher with last dimension at least `n+1`.
+END
+  }
+  in_arg {
+    name: "n"
+    description: <<END
+0-D. Position of sorted vector to select along the last dimension (along
+each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+The `n`-th order statistic along each last dimensional slice.
+END
+  }
+  attr {
+    name: "reverse"
+    description: <<END
+When set to True, find the nth-largest value in the vector and vice
+versa.
+END
+  }
+  summary: "Finds values of the `n`-th order statistic for the last dimension."
+  description: <<END
+If the input is a vector (rank-1), finds the entries which is the nth-smallest
+value in the vector and outputs their values as scalar tensor.
+
+For matrices (resp. higher rank input), computes the entries which is the
+nth-smallest value in each row (resp. vector along the last dimension). Thus,
+
+    values.shape = input.shape[:-1]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_O.pbtxt b/tensorflow/core/api_def/base_api/api_def_O.pbtxt
deleted file mode 100644
index 3c62335da9a508b36b857c15990bc4c4a5db05fc..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_O.pbtxt
+++ /dev/null
@@ -1,195 +0,0 @@
-op {
-  graph_op_name: "OneHot"
-  endpoint {
-    name: "OneHot"
-  }
-  summary: "Returns a one-hot tensor."
-  description: <<END
-The locations represented by indices in `indices` take value `on_value`,
-while all other locations take value `off_value`.
-
-If the input `indices` is rank `N`, the output will have rank `N+1`,
-The new axis is created at dimension `axis` (default: the new axis is
-appended at the end).
-
-If `indices` is a scalar the output shape will be a vector of length `depth`.
-
-If `indices` is a vector of length `features`, the output shape will be:
-```
-  features x depth if axis == -1
-  depth x features if axis == 0
-```
-
-If `indices` is a matrix (batch) with shape `[batch, features]`,
-the output shape will be:
-```
-  batch x features x depth if axis == -1
-  batch x depth x features if axis == 1
-  depth x batch x features if axis == 0
-```
-
-
-Examples
-=========
-
-Suppose that
-
-```
-  indices = [0, 2, -1, 1]
-  depth = 3
-  on_value = 5.0
-  off_value = 0.0
-  axis = -1
-```
-
-Then output is `[4 x 3]`:
-
-    ```output =
-      [5.0 0.0 0.0]  // one_hot(0)
-      [0.0 0.0 5.0]  // one_hot(2)
-      [0.0 0.0 0.0]  // one_hot(-1)
-      [0.0 5.0 0.0]  // one_hot(1)
-    ```
-
-Suppose that
-
-```
-  indices = [0, 2, -1, 1]
-  depth = 3
-  on_value = 0.0
-  off_value = 3.0
-  axis = 0
-```
-
-Then output is `[3 x 4]`:
-
-    ```output =
-      [0.0 3.0 3.0 3.0]
-      [3.0 3.0 3.0 0.0]
-      [3.0 3.0 3.0 3.0]
-      [3.0 0.0 3.0 3.0]
-    //  ^                one_hot(0)
-    //      ^            one_hot(2)
-    //          ^        one_hot(-1)
-    //              ^    one_hot(1)
-    ```
-Suppose that
-
-```
-  indices = [[0, 2], [1, -1]]
-  depth = 3
-  on_value = 1.0
-  off_value = 0.0
-  axis = -1
-```
-
-Then output is `[2 x 2 x 3]`:
-
-    ```output =
-      [
-        [1.0, 0.0, 0.0]  // one_hot(0)
-        [0.0, 0.0, 1.0]  // one_hot(2)
-      ][
-        [0.0, 1.0, 0.0]  // one_hot(1)
-        [0.0, 0.0, 0.0]  // one_hot(-1)
-      ]```
-END
-}
-op {
-  graph_op_name: "OneShotIterator"
-  endpoint {
-    name: "OneShotIterator"
-  }
-  summary: "Makes a \"one-shot\" iterator that can be iterated only once."
-  description: <<END
-A one-shot iterator bundles the logic for defining the dataset and
-the state of the iterator in a single op, which allows simple input
-pipelines to be defined without an additional initialization
-("MakeIterator") step.
-
-One-shot iterators have the following limitations:
-
-* They do not support parameterization: all logic for creating the underlying
-  dataset must be bundled in the `dataset_factory` function.
-* They are not resettable. Once a one-shot iterator reaches the end of its
-  underlying dataset, subsequent "IteratorGetNext" operations on that
-  iterator will always produce an `OutOfRange` error.
-
-For greater flexibility, use "Iterator" and "MakeIterator" to define
-an iterator using an arbitrary subgraph, which may capture tensors
-(including fed values) as parameters, and which may be reset multiple
-times by rerunning "MakeIterator".
-END
-}
-op {
-  graph_op_name: "OnesLike"
-  endpoint {
-    name: "OnesLike"
-  }
-  summary: "Returns a tensor of ones with the same shape and type as x."
-}
-op {
-  graph_op_name: "OrderedMapClear"
-  endpoint {
-    name: "OrderedMapClear"
-  }
-  summary: "Op removes all elements in the underlying container."
-}
-op {
-  graph_op_name: "OrderedMapIncompleteSize"
-  endpoint {
-    name: "OrderedMapIncompleteSize"
-  }
-  summary: "Op returns the number of incomplete elements in the underlying container."
-}
-op {
-  graph_op_name: "OrderedMapPeek"
-  endpoint {
-    name: "OrderedMapPeek"
-  }
-  summary: "Op peeks at the values at the specified key.  If the"
-  description: <<END
-underlying container does not contain this key
-this op will block until it does.   This Op is optimized for
-performance.
-END
-}
-op {
-  graph_op_name: "OrderedMapSize"
-  endpoint {
-    name: "OrderedMapSize"
-  }
-  summary: "Op returns the number of elements in the underlying container."
-}
-op {
-  graph_op_name: "OrderedMapStage"
-  endpoint {
-    name: "OrderedMapStage"
-  }
-  summary: "Stage (key, values) in the underlying container which behaves like a ordered"
-  description: <<END
-associative container.   Elements are ordered by key.
-END
-}
-op {
-  graph_op_name: "OrderedMapUnstage"
-  endpoint {
-    name: "OrderedMapUnstage"
-  }
-  summary: "Op removes and returns the values associated with the key"
-  description: <<END
-from the underlying container.   If the underlying container
-does not contain this key, the op will block until it does.
-END
-}
-op {
-  graph_op_name: "OrderedMapUnstageNoKey"
-  endpoint {
-    name: "OrderedMapUnstageNoKey"
-  }
-  summary: "Op removes and returns the (key, value) element with the smallest"
-  description: <<END
-key from the underlying container.   If the underlying container
-does not contain elements, the op will block until it does.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt b/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..807b8ae31015e4bcb73e54e98d879460f0d92f62
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
@@ -0,0 +1,130 @@
+op {
+  graph_op_name: "OneHot"
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices.
+END
+  }
+  in_arg {
+    name: "depth"
+    description: <<END
+A scalar defining the depth of the one hot dimension.
+END
+  }
+  in_arg {
+    name: "on_value"
+    description: <<END
+A scalar defining the value to fill in output when `indices[j] = i`.
+END
+  }
+  in_arg {
+    name: "off_value"
+    description: <<END
+A scalar defining the value to fill in output when `indices[j] != i`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The one-hot tensor.
+END
+  }
+  attr {
+    name: "axis"
+    description: <<END
+The axis to fill (default: -1, a new inner-most axis).
+END
+  }
+  summary: "Returns a one-hot tensor."
+  description: <<END
+The locations represented by indices in `indices` take value `on_value`,
+while all other locations take value `off_value`.
+
+If the input `indices` is rank `N`, the output will have rank `N+1`,
+The new axis is created at dimension `axis` (default: the new axis is
+appended at the end).
+
+If `indices` is a scalar the output shape will be a vector of length `depth`.
+
+If `indices` is a vector of length `features`, the output shape will be:
+```
+  features x depth if axis == -1
+  depth x features if axis == 0
+```
+
+If `indices` is a matrix (batch) with shape `[batch, features]`,
+the output shape will be:
+```
+  batch x features x depth if axis == -1
+  batch x depth x features if axis == 1
+  depth x batch x features if axis == 0
+```
+
+
+Examples
+=========
+
+Suppose that
+
+```
+  indices = [0, 2, -1, 1]
+  depth = 3
+  on_value = 5.0
+  off_value = 0.0
+  axis = -1
+```
+
+Then output is `[4 x 3]`:
+
+    ```output =
+      [5.0 0.0 0.0]  // one_hot(0)
+      [0.0 0.0 5.0]  // one_hot(2)
+      [0.0 0.0 0.0]  // one_hot(-1)
+      [0.0 5.0 0.0]  // one_hot(1)
+    ```
+
+Suppose that
+
+```
+  indices = [0, 2, -1, 1]
+  depth = 3
+  on_value = 0.0
+  off_value = 3.0
+  axis = 0
+```
+
+Then output is `[3 x 4]`:
+
+    ```output =
+      [0.0 3.0 3.0 3.0]
+      [3.0 3.0 3.0 0.0]
+      [3.0 3.0 3.0 3.0]
+      [3.0 0.0 3.0 3.0]
+    //  ^                one_hot(0)
+    //      ^            one_hot(2)
+    //          ^        one_hot(-1)
+    //              ^    one_hot(1)
+    ```
+Suppose that
+
+```
+  indices = [[0, 2], [1, -1]]
+  depth = 3
+  on_value = 1.0
+  off_value = 0.0
+  axis = -1
+```
+
+Then output is `[2 x 2 x 3]`:
+
+    ```output =
+      [
+        [1.0, 0.0, 0.0]  // one_hot(0)
+        [0.0, 0.0, 1.0]  // one_hot(2)
+      ][
+        [0.0, 1.0, 0.0]  // one_hot(1)
+        [0.0, 0.0, 0.0]  // one_hot(-1)
+      ]```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OneShotIterator.pbtxt b/tensorflow/core/api_def/base_api/api_def_OneShotIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9040f2d9823ff7c5b7c930ff4d729bef0cabe4e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OneShotIterator.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "OneShotIterator"
+  out_arg {
+    name: "handle"
+    description: <<END
+A handle to the iterator that can be passed to an "IteratorGetNext"
+op.
+END
+  }
+  attr {
+    name: "dataset_factory"
+    description: <<END
+A function of type `() -> DT_VARIANT`, where the returned
+DT_VARIANT is a dataset.
+END
+  }
+  summary: "Makes a \"one-shot\" iterator that can be iterated only once."
+  description: <<END
+A one-shot iterator bundles the logic for defining the dataset and
+the state of the iterator in a single op, which allows simple input
+pipelines to be defined without an additional initialization
+("MakeIterator") step.
+
+One-shot iterators have the following limitations:
+
+* They do not support parameterization: all logic for creating the underlying
+  dataset must be bundled in the `dataset_factory` function.
+* They are not resettable. Once a one-shot iterator reaches the end of its
+  underlying dataset, subsequent "IteratorGetNext" operations on that
+  iterator will always produce an `OutOfRange` error.
+
+For greater flexibility, use "Iterator" and "MakeIterator" to define
+an iterator using an arbitrary subgraph, which may capture tensors
+(including fed values) as parameters, and which may be reset multiple
+times by rerunning "MakeIterator".
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OnesLike.pbtxt b/tensorflow/core/api_def/base_api/api_def_OnesLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c640ab84e14ef41c6bebbedc1f8d3d41a7bc001
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OnesLike.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "OnesLike"
+  in_arg {
+    name: "x"
+    description: <<END
+a tensor of type T.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+a tensor of the same shape and type as x but filled with ones.
+END
+  }
+  summary: "Returns a tensor of ones with the same shape and type as x."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapClear.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8af5a8237447dd7a6a144691aae0fa0fa5008530
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapClear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapClear"
+  summary: "Op removes all elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapIncompleteSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1cb89477aba7c699993a62d9abde5c759318d64b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapIncompleteSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapIncompleteSize"
+  summary: "Op returns the number of incomplete elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapPeek.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bafdd425e2f2de0829518fdda19fbd7f366aa90f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapPeek.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "OrderedMapPeek"
+  summary: "Op peeks at the values at the specified key.  If the"
+  description: <<END
+underlying container does not contain this key
+this op will block until it does.   This Op is optimized for
+performance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5bad3012cc93378783100b3419fdbcf4da6681e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "OrderedMapSize"
+  summary: "Op returns the number of elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapStage.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dad0b27601fe0bf7a0ebe3acda347de936b5d7a4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapStage.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "OrderedMapStage"
+  in_arg {
+    name: "key"
+    description: <<END
+int64
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+a list of tensors
+dtypes A list of data types that inserted values should adhere to.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+Maximum number of elements in the Staging Area. If > 0, inserts
+on the container will block when the capacity is reached.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container. Otherwise,
+a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+It is necessary to match this name to the matching Unstage Op.
+END
+  }
+  summary: "Stage (key, values) in the underlying container which behaves like a ordered"
+  description: <<END
+associative container.   Elements are ordered by key.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapUnstage.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..731f1ac6cc76fe62e068606b0bcb76c8ae6eb89e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapUnstage.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "OrderedMapUnstage"
+  summary: "Op removes and returns the values associated with the key"
+  description: <<END
+from the underlying container.   If the underlying container
+does not contain this key, the op will block until it does.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OrderedMapUnstageNoKey.pbtxt b/tensorflow/core/api_def/base_api/api_def_OrderedMapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca517a1331fd608142e366a9c0c166591da52ec3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OrderedMapUnstageNoKey.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "OrderedMapUnstageNoKey"
+  summary: "Op removes and returns the (key, value) element with the smallest"
+  description: <<END
+key from the underlying container.   If the underlying container
+does not contain elements, the op will block until it does.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_P.pbtxt b/tensorflow/core/api_def/base_api/api_def_P.pbtxt
deleted file mode 100644
index a3abb079e95e07c5c13e19b36ba6582ade14f84c..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_P.pbtxt
+++ /dev/null
@@ -1,431 +0,0 @@
-op {
-  graph_op_name: "Pack"
-  endpoint {
-    name: "Pack"
-  }
-  summary: "Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor."
-  description: <<END
-Packs the `N` tensors in `values` into a tensor with rank one higher than each
-tensor in `values`, by packing them along the `axis` dimension.
-Given a list of tensors of shape `(A, B, C)`;
-
-if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-Etc.
-
-For example:
-
-```
-# 'x' is [1, 4]
-# 'y' is [2, 5]
-# 'z' is [3, 6]
-pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-```
-
-This is the opposite of `unpack`.
-END
-}
-op {
-  graph_op_name: "Pad"
-  endpoint {
-    name: "Pad"
-  }
-  summary: "Pads a tensor with zeros."
-  description: <<END
-This operation pads a `input` with zeros according to the `paddings` you
-specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-how many zeros to add before the contents of `input` in that dimension, and
-`paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-in that dimension.
-
-The padded size of each dimension D of the output is:
-
-`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 1], [2, 2]]
-# 'paddings' is [[1, 1], [2, 2]]
-# rank of 't' is 2
-pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-                      [0, 0, 1, 1, 0, 0]
-                      [0, 0, 2, 2, 0, 0]
-                      [0, 0, 0, 0, 0, 0]]
-```
-END
-}
-op {
-  graph_op_name: "PadV2"
-  endpoint {
-    name: "PadV2"
-  }
-  summary: "Pads a tensor."
-  description: <<END
-This operation pads `input` according to the `paddings` and `constant_values`
-you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
-the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-how many padding values to add before the contents of `input` in that dimension,
-and `paddings[D, 1]` indicates how many padding values to add after the contents
-of `input` in that dimension. `constant_values` is a scalar tensor of the same
-type as `input` that indicates the value to use for padding `input`.
-
-The padded size of each dimension D of the output is:
-
-`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-
-For example:
-
-```
-# 't' is [[1, 1], [2, 2]]
-# 'paddings' is [[1, 1], [2, 2]]
-# 'constant_values' is 0
-# rank of 't' is 2
-pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-                      [0, 0, 1, 1, 0, 0]
-                      [0, 0, 2, 2, 0, 0]
-                      [0, 0, 0, 0, 0, 0]]
-```
-END
-}
-op {
-  graph_op_name: "PaddedBatchDataset"
-  endpoint {
-    name: "PaddedBatchDataset"
-  }
-  summary: "Creates a dataset that batches and pads `batch_size` elements from the input."
-}
-op {
-  graph_op_name: "PaddingFIFOQueue"
-  endpoint {
-    name: "PaddingFIFOQueue"
-  }
-  summary: "A queue that produces elements in first-in first-out order."
-  description: <<END
-Variable-size shapes are allowed by setting the corresponding shape dimensions
-to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-size of any given element in the minibatch.  See below for details.
-END
-}
-op {
-  graph_op_name: "PaddingFIFOQueueV2"
-  endpoint {
-    name: "PaddingFIFOQueueV2"
-  }
-  summary: "A queue that produces elements in first-in first-out order."
-  description: <<END
-Variable-size shapes are allowed by setting the corresponding shape dimensions
-to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-size of any given element in the minibatch.  See below for details.
-END
-}
-op {
-  graph_op_name: "ParallelConcat"
-  endpoint {
-    name: "ParallelConcat"
-  }
-  summary: "Concatenates a list of `N` tensors along the first dimension."
-  description: <<END
-The input tensors are all required to have size 1 in the first dimension.
-
-For example:
-
-```
-# 'x' is [[1, 4]]
-# 'y' is [[2, 5]]
-# 'z' is [[3, 6]]
-parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-```
-
-The difference between concat and parallel_concat is that concat requires all
-of the inputs be computed before the operation will begin but doesn't require
-that the input shapes be known during graph construction.  Parallel concat
-will copy pieces of the input into the output as they become available, in
-some situations this can provide a performance benefit.
-END
-}
-op {
-  graph_op_name: "ParallelDynamicStitch"
-  endpoint {
-    name: "ParallelDynamicStitch"
-  }
-  summary: "Interleave the values from the `data` tensors into a single tensor."
-  description: <<END
-Builds a merged tensor such that
-
-```python
-    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-```
-
-For example, if each `indices[m]` is scalar or vector, we have
-
-```python
-    # Scalar indices:
-    merged[indices[m], ...] = data[m][...]
-
-    # Vector indices:
-    merged[indices[m][i], ...] = data[m][i, ...]
-```
-
-Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-`constant`, the output shape is
-
-    merged.shape = [max(indices)] + constant
-
-Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-and `indices[n][j]`, the result may be invalid. This differs from the normal
-DynamicStitch operator that defines the behavior in that case.
-
-For example:
-
-```python
-    indices[0] = 6
-    indices[1] = [4, 1]
-    indices[2] = [[5, 2], [0, 3]]
-    data[0] = [61, 62]
-    data[1] = [[41, 42], [11, 12]]
-    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-              [51, 52], [61, 62]]
-```
-
-This method can be used to merge partitions created by `dynamic_partition`
-as illustrated on the following example:
-
-```python
-    # Apply function (increments x_i) on elements for which a certain condition
-    # apply (x_i != -1 in this example).
-    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-    condition_mask=tf.not_equal(x,tf.constant(-1.))
-    partitioned_data = tf.dynamic_partition(
-        x, tf.cast(condition_mask, tf.int32) , 2)
-    partitioned_data[1] = partitioned_data[1] + 1.0
-    condition_indices = tf.dynamic_partition(
-        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-    x = tf.dynamic_stitch(condition_indices, partitioned_data)
-    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-    # unchanged.
-```
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "ParallelMapDataset"
-  endpoint {
-    name: "ParallelMapDataset"
-  }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: <<END
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `num_parallel_calls` copies of `f` in parallel.
-END
-}
-op {
-  graph_op_name: "ParameterizedTruncatedNormal"
-  endpoint {
-    name: "ParameterizedTruncatedNormal"
-  }
-  summary: "Outputs random values from a normal distribution. The parameters may each be a"
-  description: <<END
-scalar which applies to the entire output, or a vector of length shape[0] which
-stores the parameters for each batch.
-END
-}
-op {
-  graph_op_name: "ParseExample"
-  endpoint {
-    name: "ParseExample"
-  }
-  summary: "Transforms a vector of brain.Example protos (as strings) into typed tensors."
-}
-op {
-  graph_op_name: "ParseSingleSequenceExample"
-  endpoint {
-    name: "ParseSingleSequenceExample"
-  }
-  summary: "Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors."
-}
-op {
-  graph_op_name: "ParseTensor"
-  endpoint {
-    name: "ParseTensor"
-  }
-  summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor."
-}
-op {
-  graph_op_name: "Placeholder"
-  endpoint {
-    name: "Placeholder"
-  }
-  summary: "A placeholder op for a value that will be fed into the computation."
-  description: <<END
-N.B. This operation will fail with an error if it is executed. It is
-intended as a way to represent a value that will always be fed, and to
-provide attrs that enable the fed value to be checked at runtime.
-END
-}
-op {
-  graph_op_name: "PlaceholderV2"
-  endpoint {
-    name: "PlaceholderV2"
-  }
-  summary: "A placeholder op for a value that will be fed into the computation."
-  description: <<END
-N.B. This operation will fail with an error if it is executed. It is
-intended as a way to represent a value that will always be fed, and to
-provide attrs that enable the fed value to be checked at runtime.
-END
-}
-op {
-  graph_op_name: "PlaceholderWithDefault"
-  endpoint {
-    name: "PlaceholderWithDefault"
-  }
-  summary: "A placeholder op that passes through `input` when its output is not fed."
-}
-op {
-  graph_op_name: "Polygamma"
-  endpoint {
-    name: "Polygamma"
-  }
-  summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)."
-  description: <<END
-The polygamma function is defined as:
-
-
-\\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
-
-where \\(\psi(x)\\) is the digamma function.
-END
-}
-op {
-  graph_op_name: "PopulationCount"
-  endpoint {
-    name: "PopulationCount"
-  }
-  summary: "Computes element-wise population count (a.k.a. popcount, bitsum, bitcount)."
-  description: <<END
-For each entry in `x`, calculates the number of `1` (on) bits in the binary
-representation of that entry.
-
-**NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-`int32` or `int64` and perform the bitcount on the result, than to feed in
-8- or 16-bit inputs and then aggregate the resulting counts.
-END
-}
-op {
-  graph_op_name: "Pow"
-  endpoint {
-    name: "Pow"
-  }
-  summary: "Computes the power of one value to another."
-  description: <<END
-Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-corresponding elements in `x` and `y`. For example:
-
-```
-# tensor 'x' is [[2, 2]], [3, 3]]
-# tensor 'y' is [[8, 16], [2, 3]]
-tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-```
-END
-}
-op {
-  graph_op_name: "PrefetchDataset"
-  endpoint {
-    name: "PrefetchDataset"
-  }
-  summary: "Creates a dataset that asynchronously prefetches elements from `input_dataset`."
-}
-op {
-  graph_op_name: "PreventGradient"
-  endpoint {
-    name: "PreventGradient"
-  }
-  summary: "An identity op that triggers an error if a gradient is requested."
-  description: <<END
-When executed in a graph, this op outputs its input tensor as-is.
-
-When building ops to compute gradients, the TensorFlow gradient system
-will return an error when trying to lookup the gradient of this op,
-because no gradient must ever be registered for this function.  This
-op exists to prevent subtle bugs from silently returning unimplemented
-gradients in some corner cases.
-END
-}
-op {
-  graph_op_name: "Print"
-  endpoint {
-    name: "Print"
-  }
-  summary: "Prints a list of tensors."
-  description: <<END
-Passes `input` through to `output` and prints `data` when evaluating.
-END
-}
-op {
-  graph_op_name: "PriorityQueue"
-  endpoint {
-    name: "PriorityQueue"
-  }
-  summary: "A queue that produces elements sorted by the first component value."
-  description: <<END
-Note that the PriorityQueue requires the first component of any element
-to be a scalar int64, in addition to the other elements declared by
-component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-entry in their input (resp. output) lists.
-END
-}
-op {
-  graph_op_name: "PriorityQueueV2"
-  endpoint {
-    name: "PriorityQueueV2"
-  }
-  summary: "A queue that produces elements sorted by the first component value."
-  description: <<END
-Note that the PriorityQueue requires the first component of any element
-to be a scalar int64, in addition to the other elements declared by
-component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-entry in their input (resp. output) lists.
-END
-}
-op {
-  graph_op_name: "Prod"
-  endpoint {
-    name: "Prod"
-  }
-  summary: "Computes the product of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "PyFunc"
-  endpoint {
-    name: "PyFunc"
-  }
-  summary: "Invokes a python function to compute func(input)->output."
-  description: <<END
-This operation is considered stateful. For a stateless version, see
-PyFuncStateless.
-END
-}
-op {
-  graph_op_name: "PyFuncStateless"
-  endpoint {
-    name: "PyFuncStateless"
-  }
-  summary: "A stateless version of PyFunc."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Pack.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..106ca3cd86fafa3affb2e1e99de68ad95224a862
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Pack.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "Pack"
+  endpoint {
+    name: "Stack"
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Must be of same shape and type.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The packed tensor.
+END
+  }
+  attr {
+    name: "axis"
+    description: <<END
+Dimension along which to pack.  Negative values wrap around, so the
+valid range is `[-(R+1), R+1)`.
+END
+  }
+  summary: "Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor."
+  description: <<END
+Packs the `N` tensors in `values` into a tensor with rank one higher than each
+tensor in `values`, by packing them along the `axis` dimension.
+Given a list of tensors of shape `(A, B, C)`;
+
+if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
+if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
+Etc.
+
+For example:
+
+```
+# 'x' is [1, 4]
+# 'y' is [2, 5]
+# 'z' is [3, 6]
+pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+```
+
+This is the opposite of `unpack`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e45e2375eb9eb732712ab3bde5b33ac6de884e09
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Pad.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "Pad"
+  summary: "Pads a tensor with zeros."
+  description: <<END
+This operation pads a `input` with zeros according to the `paddings` you
+specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+how many zeros to add before the contents of `input` in that dimension, and
+`paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+in that dimension.
+
+The padded size of each dimension D of the output is:
+
+`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+
+For example:
+
+```
+# 't' is [[1, 1], [2, 2]]
+# 'paddings' is [[1, 1], [2, 2]]
+# rank of 't' is 2
+pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+                      [0, 0, 1, 1, 0, 0]
+                      [0, 0, 2, 2, 0, 0]
+                      [0, 0, 0, 0, 0, 0]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PadV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_PadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e2765764e20a8a4ccf8f05900c46b96aaaddc97
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PadV2.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "PadV2"
+  summary: "Pads a tensor."
+  description: <<END
+This operation pads `input` according to the `paddings` and `constant_values`
+you specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is
+the rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+how many padding values to add before the contents of `input` in that dimension,
+and `paddings[D, 1]` indicates how many padding values to add after the contents
+of `input` in that dimension. `constant_values` is a scalar tensor of the same
+type as `input` that indicates the value to use for padding `input`.
+
+The padded size of each dimension D of the output is:
+
+`paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+
+For example:
+
+```
+# 't' is [[1, 1], [2, 2]]
+# 'paddings' is [[1, 1], [2, 2]]
+# 'constant_values' is 0
+# rank of 't' is 2
+pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+                      [0, 0, 1, 1, 0, 0]
+                      [0, 0, 2, 2, 0, 0]
+                      [0, 0, 0, 0, 0, 0]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d243dfe8b67bc14e9c5e22d5e68e3faf5d4684a8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "PaddedBatchDataset"
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch.
+END
+  }
+  in_arg {
+    name: "padded_shapes"
+    description: <<END
+A list of int64 tensors representing the desired padded shapes
+of the corresponding output components. These shapes may be partially
+specified, using `-1` to indicate that a particular dimension should be
+padded to the maximum size of all batch elements.
+END
+  }
+  in_arg {
+    name: "padding_values"
+    description: <<END
+A list of scalars containing the padding value to use for
+each of the outputs.
+END
+  }
+  summary: "Creates a dataset that batches and pads `batch_size` elements from the input."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddingFIFOQueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddingFIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b6671a2f126f470a70ceda86240645472351a8e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PaddingFIFOQueue.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "PaddingFIFOQueue"
+  visibility: SKIP
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types.
+Shapes of fixed rank but variable size are allowed by setting
+any shape dimension to -1.  In this case, the inputs' shape may vary along
+the given dimension, and DequeueMany will pad the given dimension with
+zeros up to the maximum shape of all elements in the given batch.
+If the length of this attr is 0, different queue elements may have
+different ranks and shapes, but only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that produces elements in first-in first-out order."
+  description: <<END
+Variable-size shapes are allowed by setting the corresponding shape dimensions
+to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+size of any given element in the minibatch.  See below for details.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddingFIFOQueueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddingFIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b65be6f4f5c0c37e855c61ed44a4b348bd95b98d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PaddingFIFOQueueV2.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "PaddingFIFOQueueV2"
+  endpoint {
+    name: "PaddingFIFOQueue"
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types.
+Shapes of fixed rank but variable size are allowed by setting
+any shape dimension to -1.  In this case, the inputs' shape may vary along
+the given dimension, and DequeueMany will pad the given dimension with
+zeros up to the maximum shape of all elements in the given batch.
+If the length of this attr is 0, different queue elements may have
+different ranks and shapes, but only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that produces elements in first-in first-out order."
+  description: <<END
+Variable-size shapes are allowed by setting the corresponding shape dimensions
+to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+size of any given element in the minibatch.  See below for details.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cf2449c9f810b65bc1b418a92b01d681a7e34d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelConcat.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "ParallelConcat"
+  in_arg {
+    name: "values"
+    description: <<END
+Tensors to be concatenated. All must have size 1 in the first dimension
+and same shape.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The concatenated tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+the final shape of the result; should be equal to the shapes of any input
+but with the number of input values in the first dimension.
+END
+  }
+  summary: "Concatenates a list of `N` tensors along the first dimension."
+  description: <<END
+The input tensors are all required to have size 1 in the first dimension.
+
+For example:
+
+```
+# 'x' is [[1, 4]]
+# 'y' is [[2, 5]]
+# 'z' is [[3, 6]]
+parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+```
+
+The difference between concat and parallel_concat is that concat requires all
+of the inputs be computed before the operation will begin but doesn't require
+that the input shapes be known during graph construction.  Parallel concat
+will copy pieces of the input into the output as they become available, in
+some situations this can provide a performance benefit.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelDynamicStitch.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelDynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9404a4dee098b27ab25d1ed7d60e954e4e120710
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelDynamicStitch.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "ParallelDynamicStitch"
+  summary: "Interleave the values from the `data` tensors into a single tensor."
+  description: <<END
+Builds a merged tensor such that
+
+```python
+    merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+```
+
+For example, if each `indices[m]` is scalar or vector, we have
+
+```python
+    # Scalar indices:
+    merged[indices[m], ...] = data[m][...]
+
+    # Vector indices:
+    merged[indices[m][i], ...] = data[m][i, ...]
+```
+
+Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+`constant`, the output shape is
+
+    merged.shape = [max(indices)] + constant
+
+Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+and `indices[n][j]`, the result may be invalid. This differs from the normal
+DynamicStitch operator that defines the behavior in that case.
+
+For example:
+
+```python
+    indices[0] = 6
+    indices[1] = [4, 1]
+    indices[2] = [[5, 2], [0, 3]]
+    data[0] = [61, 62]
+    data[1] = [[41, 42], [11, 12]]
+    data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+    merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+              [51, 52], [61, 62]]
+```
+
+This method can be used to merge partitions created by `dynamic_partition`
+as illustrated on the following example:
+
+```python
+    # Apply function (increments x_i) on elements for which a certain condition
+    # apply (x_i != -1 in this example).
+    x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+    condition_mask=tf.not_equal(x,tf.constant(-1.))
+    partitioned_data = tf.dynamic_partition(
+        x, tf.cast(condition_mask, tf.int32) , 2)
+    partitioned_data[1] = partitioned_data[1] + 1.0
+    condition_indices = tf.dynamic_partition(
+        tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+    x = tf.dynamic_stitch(condition_indices, partitioned_data)
+    # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+    # unchanged.
+```
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6889b54a032bb20896dc7b03af5621f45d365d9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "ParallelInterleaveDataset"
+  attr {
+    name: "f"
+    description: <<END
+A function mapping elements of `input_dataset`, concatenated with
+`other_arguments`, to a Dataset variant that contains elements matching
+`output_types` and `output_shapes`.
+END
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: <<END
+The resulting dataset is similar to the `InterleaveDataset`, with the exception
+that if retrieving the next value from a dataset would cause the requester to
+block, it will skip that input dataset. This dataset is especially useful
+when loading data from a variable-latency datastores (e.g. HDFS, GCS), as it
+allows the training step to proceed so long as some data is available.
+
+!! WARNING !! This dataset is not deterministic!
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..313494dd738b02d09807ec78fc8e0802e719e116
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "ParallelMapDataset"
+  in_arg {
+    name: "num_parallel_calls"
+    description: <<END
+The number of concurrent invocations of `f` that process
+elements from `input_dataset` in parallel.
+END
+  }
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  description: <<END
+Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
+to `num_parallel_calls` copies of `f` in parallel.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParameterizedTruncatedNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParameterizedTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a01c39a96ad83f98b039682e04df55221c1e0ecb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParameterizedTruncatedNormal.pbtxt
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "ParameterizedTruncatedNormal"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor. Batches are indexed by the 0th dimension.
+END
+  }
+  in_arg {
+    name: "means"
+    description: <<END
+The mean parameter of each batch.
+END
+  }
+  in_arg {
+    name: "stdevs"
+    description: <<END
+The standard deviation parameter of each batch. Must be greater than 0.
+END
+  }
+  in_arg {
+    name: "minvals"
+    description: <<END
+The minimum cutoff. May be -infinity.
+END
+  }
+  in_arg {
+    name: "maxvals"
+    description: <<END
+The maximum cutoff. May be +infinity, and must be more than the minval
+for each batch.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A matrix of shape num_batches x samples_per_batch, filled with random
+truncated normal values using the parameters for each row.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution. The parameters may each be a"
+  description: <<END
+scalar which applies to the entire output, or a vector of length shape[0] which
+stores the parameters for each batch.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f404206ecf91083cb4e233bf18d30b5bd44e9c6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParseExample.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "ParseExample"
+  in_arg {
+    name: "serialized"
+    description: <<END
+A vector containing a batch of binary serialized Example protos.
+END
+  }
+  in_arg {
+    name: "names"
+    description: <<END
+A vector containing the names of the serialized protos.
+May contain, for example, table key (descriptive) names for the
+corresponding serialized protos.  These are purely useful for debugging
+purposes, and the presence of values here has no effect on the output.
+May also be an empty vector if no names are available.
+If non-empty, this vector must be the same length as "serialized".
+END
+  }
+  in_arg {
+    name: "sparse_keys"
+    description: <<END
+A list of Nsparse string Tensors (scalars).
+The keys expected in the Examples' features associated with sparse values.
+END
+  }
+  in_arg {
+    name: "dense_keys"
+    description: <<END
+A list of Ndense string Tensors (scalars).
+The keys expected in the Examples' features associated with dense values.
+END
+  }
+  in_arg {
+    name: "dense_defaults"
+    description: <<END
+A list of Ndense Tensors (some may be empty).
+dense_defaults[j] provides default values
+when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+The input type is inferred from dense_defaults[j], even when it's empty.
+If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+then the shape of dense_defaults[j] must match that of dense_shapes[j].
+If dense_shapes[j] has an undefined major dimension (variable strides dense
+feature), dense_defaults[j] must contain a single element:
+the padding element.
+END
+  }
+  attr {
+    name: "sparse_types"
+    description: <<END
+A list of Nsparse types; the data types of data in each Feature
+given in sparse_keys.
+Currently the ParseExample supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+  }
+  attr {
+    name: "dense_shapes"
+    description: <<END
+A list of Ndense shapes; the shapes of data in each Feature
+given in dense_keys.
+The number of elements in the Feature corresponding to dense_key[j]
+must always equal dense_shapes[j].NumEntries().
+If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+The dense outputs are just the inputs row-stacked by batch.
+This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+the shape of the output Tensor dense_values[j] will be
+(|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+of elements of length D1 * .... * DN, across all minibatch entries
+in the input.  Any minibatch entry with less than M blocks of elements of
+length D1 * ... * DN will be padded with the corresponding default_value
+scalar element along the second dimension.
+END
+  }
+  summary: "Transforms a vector of brain.Example protos (as strings) into typed tensors."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseSingleSequenceExample.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseSingleSequenceExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a087c11d469ed406b3036076e9b5b6c51f5d2d75
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParseSingleSequenceExample.pbtxt
@@ -0,0 +1,112 @@
+op {
+  graph_op_name: "ParseSingleSequenceExample"
+  in_arg {
+    name: "serialized"
+    description: <<END
+A scalar containing a binary serialized SequenceExample proto.
+END
+  }
+  in_arg {
+    name: "feature_list_dense_missing_assumed_empty"
+    description: <<END
+A vector listing the
+FeatureList keys which may be missing from the SequenceExample.  If the
+associated FeatureList is missing, it is treated as empty.  By default,
+any FeatureList not listed in this vector must exist in the SequenceExample.
+END
+  }
+  in_arg {
+    name: "context_sparse_keys"
+    description: <<END
+A list of Ncontext_sparse string Tensors (scalars).
+The keys expected in the Examples' features associated with context_sparse
+values.
+END
+  }
+  in_arg {
+    name: "context_dense_keys"
+    description: <<END
+A list of Ncontext_dense string Tensors (scalars).
+The keys expected in the SequenceExamples' context features associated with
+dense values.
+END
+  }
+  in_arg {
+    name: "feature_list_sparse_keys"
+    description: <<END
+A list of Nfeature_list_sparse string Tensors
+(scalars).  The keys expected in the FeatureLists associated with sparse
+values.
+END
+  }
+  in_arg {
+    name: "feature_list_dense_keys"
+    description: <<END
+A list of Nfeature_list_dense string Tensors (scalars).
+The keys expected in the SequenceExamples' feature_lists associated
+with lists of dense values.
+END
+  }
+  in_arg {
+    name: "context_dense_defaults"
+    description: <<END
+A list of Ncontext_dense Tensors (some may be empty).
+context_dense_defaults[j] provides default values
+when the SequenceExample's context map lacks context_dense_key[j].
+If an empty Tensor is provided for context_dense_defaults[j],
+then the Feature context_dense_keys[j] is required.
+The input type is inferred from context_dense_defaults[j], even when it's
+empty.  If context_dense_defaults[j] is not empty, its shape must match
+context_dense_shapes[j].
+END
+  }
+  in_arg {
+    name: "debug_name"
+    description: <<END
+A scalar containing the name of the serialized proto.
+May contain, for example, table key (descriptive) name for the
+corresponding serialized proto.  This is purely useful for debugging
+purposes, and the presence of values here has no effect on the output.
+May also be an empty scalar if no name is available.
+END
+  }
+  attr {
+    name: "context_sparse_types"
+    description: <<END
+A list of Ncontext_sparse types; the data types of data in
+each context Feature given in context_sparse_keys.
+Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+  }
+  attr {
+    name: "context_dense_shapes"
+    description: <<END
+A list of Ncontext_dense shapes; the shapes of data in
+each context Feature given in context_dense_keys.
+The number of elements in the Feature corresponding to context_dense_key[j]
+must always equal context_dense_shapes[j].NumEntries().
+The shape of context_dense_values[j] will match context_dense_shapes[j].
+END
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    description: <<END
+A list of Nfeature_list_sparse types; the data types
+of data in each FeatureList given in feature_list_sparse_keys.
+Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+DT_INT64 (Int64List), and DT_STRING (BytesList).
+END
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    description: <<END
+A list of Nfeature_list_dense shapes; the shapes of
+data in each FeatureList given in feature_list_dense_keys.
+The shape of each Feature in the FeatureList corresponding to
+feature_list_dense_key[j] must always equal
+feature_list_dense_shapes[j].NumEntries().
+END
+  }
+  summary: "Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParseTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d05efdf09523b1985d980da2623fa210091df0dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ParseTensor.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "ParseTensor"
+  in_arg {
+    name: "serialized"
+    description: <<END
+A scalar string containing a serialized TensorProto proto.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor of type `out_type`.
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The type of the serialized tensor.  The provided type must match the
+type of the serialized tensor and no implicit conversion will take place.
+END
+  }
+  summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Placeholder.pbtxt b/tensorflow/core/api_def/base_api/api_def_Placeholder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb27bc6142480e3340f2e6ff180009161faab0ae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Placeholder.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "Placeholder"
+  out_arg {
+    name: "output"
+    description: <<END
+A placeholder tensor that must be replaced using the feed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+(Optional) The shape of the tensor. If the shape has 0 dimensions, the
+shape is unconstrained.
+END
+  }
+  summary: "A placeholder op for a value that will be fed into the computation."
+  description: <<END
+N.B. This operation will fail with an error if it is executed. It is
+intended as a way to represent a value that will always be fed, and to
+provide attrs that enable the fed value to be checked at runtime.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PlaceholderV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_PlaceholderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c67f6e12e0b0664cc6ed78b2c2d79c4c5e2544c8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PlaceholderV2.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "PlaceholderV2"
+  out_arg {
+    name: "output"
+    description: <<END
+A placeholder tensor that must be replaced using the feed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor. The shape can be any partially-specified
+shape.  To be unconstrained, pass in a shape with unknown rank.
+END
+  }
+  summary: "A placeholder op for a value that will be fed into the computation."
+  description: <<END
+N.B. This operation will fail with an error if it is executed. It is
+intended as a way to represent a value that will always be fed, and to
+provide attrs that enable the fed value to be checked at runtime.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PlaceholderWithDefault.pbtxt b/tensorflow/core/api_def/base_api/api_def_PlaceholderWithDefault.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c20383faf54499a86d64add7b1dca99484637700
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PlaceholderWithDefault.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "PlaceholderWithDefault"
+  in_arg {
+    name: "input"
+    description: <<END
+The default value to produce when `output` is not fed.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A placeholder tensor that defaults to `input` if it is not fed.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The (possibly partial) shape of the tensor.
+END
+  }
+  summary: "A placeholder op that passes through `input` when its output is not fed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10bf370f5493cd7e0848adfefb20c861cab076cf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Polygamma.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "Polygamma"
+  summary: "Compute the polygamma function \\\\(\\psi^{(n)}(x)\\\\)."
+  description: <<END
+The polygamma function is defined as:
+
+
+\\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+
+where \\(\psi(x)\\) is the digamma function.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PopulationCount.pbtxt b/tensorflow/core/api_def/base_api/api_def_PopulationCount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97b106cd3586567902bc5b71d7b9cf9d143a12f8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PopulationCount.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "PopulationCount"
+  summary: "Computes element-wise population count (a.k.a. popcount, bitsum, bitcount)."
+  description: <<END
+For each entry in `x`, calculates the number of `1` (on) bits in the binary
+representation of that entry.
+
+**NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+`int32` or `int64` and perform the bitcount on the result, than to feed in
+8- or 16-bit inputs and then aggregate the resulting counts.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Pow.pbtxt b/tensorflow/core/api_def/base_api/api_def_Pow.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ace5f3100ae0b448b80548908ab304878322f75
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Pow.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "Pow"
+  summary: "Computes the power of one value to another."
+  description: <<END
+Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+corresponding elements in `x` and `y`. For example:
+
+```
+# tensor 'x' is [[2, 2]], [3, 3]]
+# tensor 'y' is [[8, 16], [2, 3]]
+tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e158eedc6f0ef11de3c8979d65dd69d8bece1eb4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "PrefetchDataset"
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+The maximum number of elements to buffer in an iterator over
+this dataset.
+END
+  }
+  summary: "Creates a dataset that asynchronously prefetches elements from `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PreventGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_PreventGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6332192fb74fb827aaca475a8bb156e95fd8991f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PreventGradient.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "PreventGradient"
+  in_arg {
+    name: "input"
+    description: <<END
+any tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+the same input tensor.
+END
+  }
+  attr {
+    name: "message"
+    description: <<END
+Will be printed in the error when anyone tries to differentiate
+this operation.
+END
+  }
+  summary: "An identity op that triggers an error if a gradient is requested."
+  description: <<END
+When executed in a graph, this op outputs its input tensor as-is.
+
+When building ops to compute gradients, the TensorFlow gradient system
+will return an error when trying to lookup the gradient of this op,
+because no gradient must ever be registered for this function.  This
+op exists to prevent subtle bugs from silently returning unimplemented
+gradients in some corner cases.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Print.pbtxt b/tensorflow/core/api_def/base_api/api_def_Print.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..effbde1623efc78351c043eceabc261a8c325f88
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Print.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "Print"
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor passed to `output`
+END
+  }
+  in_arg {
+    name: "data"
+    description: <<END
+A list of tensors to print out when op is evaluated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+= The unmodified `input` tensor
+END
+  }
+  attr {
+    name: "message"
+    description: <<END
+A string, prefix of the error message.
+END
+  }
+  attr {
+    name: "first_n"
+    description: <<END
+Only log `first_n` number of times. -1 disables logging.
+END
+  }
+  attr {
+    name: "summarize"
+    description: <<END
+Only print this many entries of each tensor.
+END
+  }
+  summary: "Prints a list of tensors."
+  description: <<END
+Passes `input` through to `output` and prints `data` when evaluating.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PriorityQueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_PriorityQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cbcef11f8e851a84267fd014eb587725c8938f7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PriorityQueue.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "PriorityQueue"
+  visibility: SKIP
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types. If the length of
+this attr is 0, the shapes of queue elements are not constrained, and
+only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that produces elements sorted by the first component value."
+  description: <<END
+Note that the PriorityQueue requires the first component of any element
+to be a scalar int64, in addition to the other elements declared by
+component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+entry in their input (resp. output) lists.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PriorityQueueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_PriorityQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0c1499e3936a4c0ead61f8a08bb43e0f558d7c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PriorityQueueV2.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "PriorityQueueV2"
+  endpoint {
+    name: "PriorityQueue"
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types. If the length of
+this attr is 0, the shapes of queue elements are not constrained, and
+only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that produces elements sorted by the first component value."
+  description: <<END
+Note that the PriorityQueue requires the first component of any element
+to be a scalar int64, in addition to the other elements declared by
+component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+entry in their input (resp. output) lists.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Prod.pbtxt b/tensorflow/core/api_def/base_api/api_def_Prod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02b6e425f8ae50bdde4637210484a4736088d157
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Prod.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Prod"
+  endpoint {
+    name: "Prod"
+  }
+  endpoint {
+    name: "ReduceProd"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the product of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PyFunc.pbtxt b/tensorflow/core/api_def/base_api/api_def_PyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4b8bcf5e1210e0302102fb5baacfc769ff07d550
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PyFunc.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "PyFunc"
+  visibility: SKIP
+  in_arg {
+    name: "input"
+    description: <<END
+List of Tensors that will provide input to the Op.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The outputs from the Op.
+END
+  }
+  attr {
+    name: "token"
+    description: <<END
+A token representing a registered python function in this address space.
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+Data types of the inputs to the op.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+Data types of the outputs from the op.
+The length of the list specifies the number of outputs.
+END
+  }
+  summary: "Invokes a python function to compute func(input)->output."
+  description: <<END
+This operation is considered stateful. For a stateless version, see
+PyFuncStateless.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PyFuncStateless.pbtxt b/tensorflow/core/api_def/base_api/api_def_PyFuncStateless.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12962928621636e0c6b9bac9f38292d7f34ac519
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PyFuncStateless.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "PyFuncStateless"
+  visibility: SKIP
+  summary: "A stateless version of PyFunc."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Q.pbtxt b/tensorflow/core/api_def/base_api/api_def_Q.pbtxt
deleted file mode 100644
index 4af60a184110de3da2a0a1eb79b3fc912f9d940d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_Q.pbtxt
+++ /dev/null
@@ -1,609 +0,0 @@
-op {
-  graph_op_name: "Qr"
-  endpoint {
-    name: "Qr"
-  }
-  summary: "Computes the QR decompositions of one or more matrices."
-  description: <<END
-Computes the QR decomposition of each inner matrix in `tensor` such that
-`tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-
-```python
-# a is a tensor.
-# q is a tensor of orthonormal matrices.
-# r is a tensor of upper triangular matrices.
-q, r = qr(a)
-q_full, r_full = qr(a, full_matrices=True)
-```
-END
-}
-op {
-  graph_op_name: "QuantizeAndDequantize"
-  endpoint {
-    name: "QuantizeAndDequantize"
-  }
-  summary: "Use QuantizeAndDequantizeV2 instead."
-}
-op {
-  graph_op_name: "QuantizeAndDequantizeV2"
-  endpoint {
-    name: "QuantizeAndDequantizeV2"
-  }
-  summary: "Quantizes then dequantizes a tensor."
-  description: <<END
-This op simulates the precision loss from the quantized forward pass by:
-1. Quantizing the tensor to fixed point numbers, which should match the target
-   quantization method when it is used in inference.
-2. Dequantizing it back to floating point numbers for the following ops, most
-   likely matmul.
-
-There are different ways to quantize. This version does not use the full range
-of the output type, choosing to elide the lowest possible value for symmetry
-(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-quantization), so that 0.0 maps to 0.
-
-To perform this op, we first find the range of values in our tensor. The range
-we use is always centered on 0, so we find m such that
-
-1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
-
-Our input tensor range is then [-m, m].
-
-Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-If signed_input is true, this is
-
-  [min_fixed, max_fixed ] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
-
-Otherwise, if signed_input is false, the fixed-point range is
-
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
-
-From this we compute our scaling factor, s:
-
-  s = (max_fixed - min_fixed) / (2 * m).
-
-Now we can quantize and dequantize the elements of our tensor.  An element e
-is transformed into e':
-
-  e' = (e * s).round_to_nearest() / s.
-
-Note that we have a different number of buckets in the signed vs. unsigned
-cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
-vs. 255 in the unsigned case.
-
-For example, suppose num_bits = 8 and m = 1.  Then
-
-  [min_fixed, max_fixed] = [-127, 127], and
-  s = (127 + 127) / 2 = 127.
-
-Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
-END
-}
-op {
-  graph_op_name: "QuantizeAndDequantizeV3"
-  endpoint {
-    name: "QuantizeAndDequantizeV3"
-  }
-  summary: "Quantizes then dequantizes a tensor."
-  description: <<END
-This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-tensor, so its value can change during training.
-END
-}
-op {
-  graph_op_name: "QuantizeDownAndShrinkRange"
-  endpoint {
-    name: "QuantizeDownAndShrinkRange"
-  }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
-  description: <<END
-actual distribution of the values to maximize the usage of the lower bit depth
-and adjusting the output min and max ranges accordingly.
-
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-
-This operator tries to squeeze as much precision as possible into an output with
-a lower bit depth by calculating the actual min and max values found in the
-data. For example, maybe that quint16 input has no values lower than 16,384 and
-none higher than 49,152. That means only half the range is actually needed, all
-the float interpretations are between -0.5f and 0.5f, so if we want to compress
-the data into a quint8 output, we can use that range rather than the theoretical
--1.0f to 1.0f that is suggested by the input min and max.
-
-In practice, this is most useful for taking output from operations like
-QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-may have large potential output ranges, but in practice have a distribution of
-input values that only uses a small fraction of the possible range. By feeding
-that output into this operator, we can reduce it from 32 bits down to 8 with
-minimal loss of accuracy.
-END
-}
-op {
-  graph_op_name: "QuantizeV2"
-  endpoint {
-    name: "QuantizeV2"
-  }
-  summary: "Quantize the \'input\' tensor of type float to \'output\' tensor of type \'T\'."
-  description: <<END
-[min_range, max_range] are scalar floats that specify the range for
-the 'input' data. The 'mode' attribute controls exactly which calculations are
-used to convert the float values to their quantized equivalents.
-
-In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-
-```
-out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-if T == qint8, out[i] -= (range(T) + 1) / 2.0
-```
-here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-
-*MIN_COMBINED Mode Example*
-
-Assume the input is type float and has a possible range of [0.0, 6.0] and the
-output type is quint8 ([0, 255]). The min_range and max_range values should be
-specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-value of the input by 255/6 and cast to quint8.
-
-If the output type was qint8 ([-128, 127]), the operation will additionally
-subtract each value by 128 prior to casting, so that the range of values aligns
-with the range of qint8.
-
-If the mode is 'MIN_FIRST', then this approach is used:
-
-```
-number_of_steps = 1 << (# of bits in T)
-range_adjust = number_of_steps / (number_of_steps - 1)
-range = (range_max - range_min) * range_adjust
-range_scale = number_of_steps / range
-quantized = round(input * range_scale) - round(range_min * range_scale) +
-  numeric_limits<T>::min()
-quantized = max(quantized, numeric_limits<T>::min())
-quantized = min(quantized, numeric_limits<T>::max())
-```
-
-The biggest difference between this and MIN_COMBINED is that the minimum range
-is rounded first, before it's subtracted from the rounded value. With
-MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-and dequantizing will introduce a larger and larger error.
-
-*SCALED mode Example*
-
-`SCALED` mode matches the quantization approach used in
-`QuantizeAndDequantize{V2|V3}`.
-
-If the mode is `SCALED`, we do not use the full range of the output type,
-choosing to elide the lowest possible value for symmetry (e.g., output range is
--127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-0.
-
-We first find the range of values in our tensor. The
-range we use is always centered on 0, so we find m such that
-```c++
-  m = max(abs(input_min), abs(input_max))
-```
-
-Our input tensor range is then `[-m, m]`.
-
-Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-If T is signed, this is
-```
-  num_bits = sizeof(T) * 8
-  [min_fixed, max_fixed] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-```
-
-Otherwise, if T is unsigned, the fixed-point range is
-```
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-```
-
-From this we compute our scaling factor, s:
-```c++
-  s = (max_fixed - min_fixed) / (2 * m)
-```
-
-Now we can quantize the elements of our tensor:
-```c++
-result = (input * s).round_to_nearest()
-```
-
-One thing to watch out for is that the operator may choose to adjust the
-requested minimum and maximum values slightly during the quantization process,
-so you should always use the output ports as the range for further calculations.
-For example, if the requested minimum and maximum values are close to equal,
-they will be separated by a small epsilon value to prevent ill-formed quantized
-buffers from being created. Otherwise, you can end up with buffers where all the
-quantized values map to the same float value, which causes problems for
-operations that have to perform further calculations on them.
-END
-}
-op {
-  graph_op_name: "QuantizedAdd"
-  endpoint {
-    name: "QuantizedAdd"
-  }
-  summary: "Returns x + y element-wise, working on quantized buffers."
-}
-op {
-  graph_op_name: "QuantizedAvgPool"
-  endpoint {
-    name: "QuantizedAvgPool"
-  }
-  summary: "Produces the average pool of the input tensor for quantized types."
-}
-op {
-  graph_op_name: "QuantizedBatchNormWithGlobalNormalization"
-  endpoint {
-    name: "QuantizedBatchNormWithGlobalNormalization"
-  }
-  summary: "Quantized Batch normalization."
-  description: <<END
-This op is deprecated and will be removed in the future. Prefer
-`tf.nn.batch_normalization`.
-END
-}
-op {
-  graph_op_name: "QuantizedBiasAdd"
-  endpoint {
-    name: "QuantizedBiasAdd"
-  }
-  summary: "Adds Tensor \'bias\' to Tensor \'input\' for Quantized types."
-  description: <<END
-Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
-END
-}
-op {
-  graph_op_name: "QuantizedConcat"
-  endpoint {
-    name: "QuantizedConcat"
-  }
-  summary: "Concatenates quantized tensors along one dimension."
-}
-op {
-  graph_op_name: "QuantizedConv2D"
-  endpoint {
-    name: "QuantizedConv2D"
-  }
-  summary: "Computes a 2D convolution given quantized 4D input and filter tensors."
-  description: <<END
-The inputs are quantized tensors where the lowest value represents the real
-number of the associated minimum, and the highest represents the maximum.
-This means that you can only interpret the quantized output in the same way, by
-taking the returned minimum and maximum values into account.
-END
-}
-op {
-  graph_op_name: "QuantizedInstanceNorm"
-  endpoint {
-    name: "QuantizedInstanceNorm"
-  }
-  summary: "Quantized Instance normalization."
-}
-op {
-  graph_op_name: "QuantizedMatMul"
-  endpoint {
-    name: "QuantizedMatMul"
-  }
-  summary: "Perform a quantized matrix multiplication of  `a` by the matrix `b`."
-  description: <<END
-The inputs must be two-dimensional matrices and the inner dimension of
-`a` (after being transposed if `transpose_a` is non-zero) must match the
-outer dimension of `b` (after being transposed if `transposed_b` is
-non-zero).
-END
-}
-op {
-  graph_op_name: "QuantizedMaxPool"
-  endpoint {
-    name: "QuantizedMaxPool"
-  }
-  summary: "Produces the max pool of the input tensor for quantized types."
-}
-op {
-  graph_op_name: "QuantizedMul"
-  endpoint {
-    name: "QuantizedMul"
-  }
-  summary: "Returns x * y element-wise, working on quantized buffers."
-}
-op {
-  graph_op_name: "QuantizedRelu"
-  endpoint {
-    name: "QuantizedRelu"
-  }
-  summary: "Computes Quantized Rectified Linear: `max(features, 0)`"
-}
-op {
-  graph_op_name: "QuantizedRelu6"
-  endpoint {
-    name: "QuantizedRelu6"
-  }
-  summary: "Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`"
-}
-op {
-  graph_op_name: "QuantizedReluX"
-  endpoint {
-    name: "QuantizedReluX"
-  }
-  summary: "Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`"
-}
-op {
-  graph_op_name: "QuantizedReshape"
-  endpoint {
-    name: "QuantizedReshape"
-  }
-  summary: "Reshapes a quantized tensor as per the Reshape op."
-  description: <<END
-```
-END
-}
-op {
-  graph_op_name: "QuantizedResizeBilinear"
-  endpoint {
-    name: "QuantizedResizeBilinear"
-  }
-  summary: "Resize quantized `images` to `size` using quantized bilinear interpolation."
-  description: <<END
-Input images and output images must be quantized types.
-END
-}
-op {
-  graph_op_name: "QueueClose"
-  endpoint {
-    name: "QueueClose"
-  }
-  summary: "Closes the given queue."
-  description: <<END
-This operation signals that no more elements will be enqueued in the
-given queue. Subsequent Enqueue(Many) operations will fail.
-Subsequent Dequeue(Many) operations will continue to succeed if
-sufficient elements remain in the queue. Subsequent Dequeue(Many)
-operations that would block will fail immediately.
-END
-}
-op {
-  graph_op_name: "QueueCloseV2"
-  endpoint {
-    name: "QueueCloseV2"
-  }
-  summary: "Closes the given queue."
-  description: <<END
-This operation signals that no more elements will be enqueued in the
-given queue. Subsequent Enqueue(Many) operations will fail.
-Subsequent Dequeue(Many) operations will continue to succeed if
-sufficient elements remain in the queue. Subsequent Dequeue(Many)
-operations that would block will fail immediately.
-END
-}
-op {
-  graph_op_name: "QueueDequeue"
-  endpoint {
-    name: "QueueDequeue"
-  }
-  summary: "Dequeues a tuple of one or more tensors from the given queue."
-  description: <<END
-This operation has k outputs, where k is the number of components
-in the tuples stored in the given queue, and output i is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until an element
-has been dequeued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueDequeueMany"
-  endpoint {
-    name: "QueueDequeueMany"
-  }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: <<END
-If the queue is closed and there are fewer than `n` elements, then an
-OutOfRange error is returned.
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size `n` in the 0th dimension.
-
-This operation has `k` outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until `n` elements
-have been dequeued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueDequeueManyV2"
-  endpoint {
-    name: "QueueDequeueManyV2"
-  }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: <<END
-If the queue is closed and there are fewer than `n` elements, then an
-OutOfRange error is returned.
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size `n` in the 0th dimension.
-
-This operation has `k` outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until `n` elements
-have been dequeued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueDequeueUpTo"
-  endpoint {
-    name: "QueueDequeueUpTo"
-  }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: <<END
-This operation is not supported by all queues.  If a queue does not support
-DequeueUpTo, then an Unimplemented error is returned.
-
-If the queue is closed and there are more than 0 but less than `n`
-elements remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If
-the queue is closed and there are 0 elements left in the queue, then
-an OutOfRange error is returned just like in QueueDequeueMany.
-Otherwise the behavior is identical to QueueDequeueMany:
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size `n` in the 0th dimension.
-
-This operation has k outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-END
-}
-op {
-  graph_op_name: "QueueDequeueUpToV2"
-  endpoint {
-    name: "QueueDequeueUpToV2"
-  }
-  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
-  description: <<END
-This operation is not supported by all queues.  If a queue does not support
-DequeueUpTo, then an Unimplemented error is returned.
-
-If the queue is closed and there are more than 0 but less than `n`
-elements remaining, then instead of returning an OutOfRange error like
-QueueDequeueMany, less than `n` elements are returned immediately.  If
-the queue is closed and there are 0 elements left in the queue, then
-an OutOfRange error is returned just like in QueueDequeueMany.
-Otherwise the behavior is identical to QueueDequeueMany:
-
-This operation concatenates queue-element component tensors along the
-0th dimension to make a single component tensor.  All of the components
-in the dequeued tuple will have size n in the 0th dimension.
-
-This operation has `k` outputs, where `k` is the number of components in
-the tuples stored in the given queue, and output `i` is the ith
-component of the dequeued tuple.
-END
-}
-op {
-  graph_op_name: "QueueDequeueV2"
-  endpoint {
-    name: "QueueDequeueV2"
-  }
-  summary: "Dequeues a tuple of one or more tensors from the given queue."
-  description: <<END
-This operation has k outputs, where k is the number of components
-in the tuples stored in the given queue, and output i is the ith
-component of the dequeued tuple.
-
-N.B. If the queue is empty, this operation will block until an element
-has been dequeued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueEnqueue"
-  endpoint {
-    name: "QueueEnqueue"
-  }
-  summary: "Enqueues a tuple of one or more tensors in the given queue."
-  description: <<END
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-element has been enqueued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueEnqueueMany"
-  endpoint {
-    name: "QueueEnqueueMany"
-  }
-  summary: "Enqueues zero or more tuples of one or more tensors in the given queue."
-  description: <<END
-This operation slices each component tensor along the 0th dimension to
-make multiple queue elements. All of the tuple components must have the
-same size in the 0th dimension.
-
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-elements have been enqueued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueEnqueueManyV2"
-  endpoint {
-    name: "QueueEnqueueManyV2"
-  }
-  summary: "Enqueues zero or more tuples of one or more tensors in the given queue."
-  description: <<END
-This operation slices each component tensor along the 0th dimension to
-make multiple queue elements. All of the tuple components must have the
-same size in the 0th dimension.
-
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-elements have been enqueued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueEnqueueV2"
-  endpoint {
-    name: "QueueEnqueueV2"
-  }
-  summary: "Enqueues a tuple of one or more tensors in the given queue."
-  description: <<END
-The components input has k elements, which correspond to the components of
-tuples stored in the given queue.
-
-N.B. If the queue is full, this operation will block until the given
-element has been enqueued (or 'timeout_ms' elapses, if specified).
-END
-}
-op {
-  graph_op_name: "QueueIsClosed"
-  endpoint {
-    name: "QueueIsClosed"
-  }
-  summary: "Returns true if queue is closed."
-  description: <<END
-This operation returns true if the queue is closed and false if the queue
-is open.
-END
-}
-op {
-  graph_op_name: "QueueIsClosedV2"
-  endpoint {
-    name: "QueueIsClosedV2"
-  }
-  summary: "Returns true if queue is closed."
-  description: <<END
-This operation returns true if the queue is closed and false if the queue
-is open.
-END
-}
-op {
-  graph_op_name: "QueueSize"
-  endpoint {
-    name: "QueueSize"
-  }
-  summary: "Computes the number of elements in the given queue."
-}
-op {
-  graph_op_name: "QueueSizeV2"
-  endpoint {
-    name: "QueueSizeV2"
-  }
-  summary: "Computes the number of elements in the given queue."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_Qr.pbtxt b/tensorflow/core/api_def/base_api/api_def_Qr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac8f7597aae9e9ba04cfbfa65ad9417f816fa153
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Qr.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "Qr"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+END
+  }
+  out_arg {
+    name: "q"
+    description: <<END
+Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+`[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "r"
+    description: <<END
+Triangular factor. If `full_matrices` is `False` then shape is
+`[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+END
+  }
+  attr {
+    name: "full_matrices"
+    description: <<END
+If true, compute full-sized `q` and `r`. If false
+(the default), compute only the leading `P` columns of `q`.
+END
+  }
+  summary: "Computes the QR decompositions of one or more matrices."
+  description: <<END
+Computes the QR decomposition of each inner matrix in `tensor` such that
+`tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+
+```python
+# a is a tensor.
+# q is a tensor of orthonormal matrices.
+# r is a tensor of upper triangular matrices.
+q, r = qr(a)
+q_full, r_full = qr(a, full_matrices=True)
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d84144d336f2adf5e05b7519325dd8ae6c9ec85
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantize"
+  summary: "Use QuantizeAndDequantizeV2 instead."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fc9c9034a1f56ea319f4d3e6df0ded47fda4200
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -0,0 +1,93 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV2"
+  in_arg {
+    name: "input"
+    description: <<END
+Tensor to quantize and then dequantize.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+If range_given, this is the min of the range, otherwise this input
+will be ignored.
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+If range_given, this is the max of the range, otherwise this input
+will be ignored.
+END
+  }
+  attr {
+    name: "signed_input"
+    description: <<END
+If the quantization is signed or unsigned.
+END
+  }
+  attr {
+    name: "num_bits"
+    description: <<END
+The bitwidth of the quantization.
+END
+  }
+  attr {
+    name: "range_given"
+    description: <<END
+If the range is given or should be computed from the tensor.
+END
+  }
+  summary: "Quantizes then dequantizes a tensor."
+  description: <<END
+This op simulates the precision loss from the quantized forward pass by:
+1. Quantizing the tensor to fixed point numbers, which should match the target
+   quantization method when it is used in inference.
+2. Dequantizing it back to floating point numbers for the following ops, most
+   likely matmul.
+
+There are different ways to quantize. This version does not use the full range
+of the output type, choosing to elide the lowest possible value for symmetry
+(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
+quantization), so that 0.0 maps to 0.
+
+To perform this op, we first find the range of values in our tensor. The range
+we use is always centered on 0, so we find m such that
+
+1. m = max(abs(input_min), abs(input_max)) if range_given is true,
+2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
+
+Our input tensor range is then [-m, m].
+
+Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
+If signed_input is true, this is
+
+  [min_fixed, max_fixed ] =
+      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+
+Otherwise, if signed_input is false, the fixed-point range is
+
+  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+
+From this we compute our scaling factor, s:
+
+  s = (max_fixed - min_fixed) / (2 * m).
+
+Now we can quantize and dequantize the elements of our tensor.  An element e
+is transformed into e':
+
+  e' = (e * s).round_to_nearest() / s.
+
+Note that we have a different number of buckets in the signed vs. unsigned
+cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
+vs. 255 in the unsigned case.
+
+For example, suppose num_bits = 8 and m = 1.  Then
+
+  [min_fixed, max_fixed] = [-127, 127], and
+  s = (127 + 127) / 2 = 127.
+
+Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
+{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..57128a842a7c016e649ef3da5afc6d282ec6550a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV3.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV3"
+  summary: "Quantizes then dequantizes a tensor."
+  description: <<END
+This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+tensor, so its value can change during training.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeDownAndShrinkRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeDownAndShrinkRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af7729e2384d3c1872edb7c199f7c60d6bde27b7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeDownAndShrinkRange.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "QuantizeDownAndShrinkRange"
+  in_arg {
+    name: "input_min"
+    description: <<END
+The float value that the minimum quantized input value represents.
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The float value that the maximum quantized input value represents.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The float value that the minimum quantized output value represents.
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The float value that the maximum quantized output value represents.
+END
+  }
+  attr {
+    name: "Tinput"
+    description: <<END
+The type of the input.
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The type of the output. Should be a lower bit depth than Tinput.
+END
+  }
+  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
+  description: <<END
+actual distribution of the values to maximize the usage of the lower bit depth
+and adjusting the output min and max ranges accordingly.
+
+[input_min, input_max] are scalar floats that specify the range for the float
+interpretation of the 'input' data. For example, if input_min is -1.0f and
+input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+
+This operator tries to squeeze as much precision as possible into an output with
+a lower bit depth by calculating the actual min and max values found in the
+data. For example, maybe that quint16 input has no values lower than 16,384 and
+none higher than 49,152. That means only half the range is actually needed, all
+the float interpretations are between -0.5f and 0.5f, so if we want to compress
+the data into a quint8 output, we can use that range rather than the theoretical
+-1.0f to 1.0f that is suggested by the input min and max.
+
+In practice, this is most useful for taking output from operations like
+QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+may have large potential output ranges, but in practice have a distribution of
+input values that only uses a small fraction of the possible range. By feeding
+that output into this operator, we can reduce it from 32 bits down to 8 with
+minimal loss of accuracy.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9e75caf02b3b557e632a8137cf4780178b8ad8a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeV2.pbtxt
@@ -0,0 +1,128 @@
+op {
+  graph_op_name: "QuantizeV2"
+  in_arg {
+    name: "min_range"
+    description: <<END
+The minimum scalar value possibly produced for the input.
+END
+  }
+  in_arg {
+    name: "max_range"
+    description: <<END
+The maximum scalar value possibly produced for the input.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The quantized data produced from the float input.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The actual minimum scalar value used for the output.
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The actual maximum scalar value used for the output.
+END
+  }
+  summary: "Quantize the \'input\' tensor of type float to \'output\' tensor of type \'T\'."
+  description: <<END
+[min_range, max_range] are scalar floats that specify the range for
+the 'input' data. The 'mode' attribute controls exactly which calculations are
+used to convert the float values to their quantized equivalents.  The
+'round_mode' attribute controls which rounding tie-breaking algorithm is used
+when rounding float values to their quantized equivalents.
+
+In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+
+```
+out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+if T == qint8, out[i] -= (range(T) + 1) / 2.0
+```
+here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+
+*MIN_COMBINED Mode Example*
+
+Assume the input is type float and has a possible range of [0.0, 6.0] and the
+output type is quint8 ([0, 255]). The min_range and max_range values should be
+specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+value of the input by 255/6 and cast to quint8.
+
+If the output type was qint8 ([-128, 127]), the operation will additionally
+subtract each value by 128 prior to casting, so that the range of values aligns
+with the range of qint8.
+
+If the mode is 'MIN_FIRST', then this approach is used:
+
+```
+num_discrete_values = 1 << (# of bits in T)
+range_adjust = num_discrete_values / (num_discrete_values - 1)
+range = (range_max - range_min) * range_adjust
+range_scale = num_discrete_values / range
+quantized = round(input * range_scale) - round(range_min * range_scale) +
+  numeric_limits<T>::min()
+quantized = max(quantized, numeric_limits<T>::min())
+quantized = min(quantized, numeric_limits<T>::max())
+```
+
+The biggest difference between this and MIN_COMBINED is that the minimum range
+is rounded first, before it's subtracted from the rounded value. With
+MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+and dequantizing will introduce a larger and larger error.
+
+*SCALED mode Example*
+
+`SCALED` mode matches the quantization approach used in
+`QuantizeAndDequantize{V2|V3}`.
+
+If the mode is `SCALED`, we do not use the full range of the output type,
+choosing to elide the lowest possible value for symmetry (e.g., output range is
+-127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+0.
+
+We first find the range of values in our tensor. The
+range we use is always centered on 0, so we find m such that
+```c++
+  m = max(abs(input_min), abs(input_max))
+```
+
+Our input tensor range is then `[-m, m]`.
+
+Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+If T is signed, this is
+```
+  num_bits = sizeof(T) * 8
+  [min_fixed, max_fixed] =
+      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+```
+
+Otherwise, if T is unsigned, the fixed-point range is
+```
+  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+```
+
+From this we compute our scaling factor, s:
+```c++
+  s = (max_fixed - min_fixed) / (2 * m)
+```
+
+Now we can quantize the elements of our tensor:
+```c++
+result = round(input * s)
+```
+
+One thing to watch out for is that the operator may choose to adjust the
+requested minimum and maximum values slightly during the quantization process,
+so you should always use the output ports as the range for further calculations.
+For example, if the requested minimum and maximum values are close to equal,
+they will be separated by a small epsilon value to prevent ill-formed quantized
+buffers from being created. Otherwise, you can end up with buffers where all the
+quantized values map to the same float value, which causes problems for
+operations that have to perform further calculations on them.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..193bee4db915e5b12f5a316e9249e3a953019b1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedAdd.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "QuantizedAdd"
+  in_arg {
+    name: "min_x"
+    description: <<END
+The float value that the lowest quantized `x` value represents.
+END
+  }
+  in_arg {
+    name: "max_x"
+    description: <<END
+The float value that the highest quantized `x` value represents.
+END
+  }
+  in_arg {
+    name: "min_y"
+    description: <<END
+The float value that the lowest quantized `y` value represents.
+END
+  }
+  in_arg {
+    name: "max_y"
+    description: <<END
+The float value that the highest quantized `y` value represents.
+END
+  }
+  out_arg {
+    name: "min_z"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_z"
+    description: <<END
+The float value that the highest quantized output value represents.
+
+*NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+  }
+  summary: "Returns x + y element-wise, working on quantized buffers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..912ab540264198963783957300e0300368cd5b5d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedAvgPool.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "QuantizedAvgPool"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "min_input"
+    description: <<END
+The float value that the lowest quantized input value represents.
+END
+  }
+  in_arg {
+    name: "max_input"
+    description: <<END
+The float value that the highest quantized input value represents.
+END
+  }
+  out_arg {
+    name: "min_output"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_output"
+    description: <<END
+The float value that the highest quantized output value represents.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+The length must be 4 to match the number of dimensions of the input.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+tensor.  The length must be 4 to match the number of dimensions of the input.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Produces the average pool of the input tensor for quantized types."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27990db1d07f901e9503d868d7f2b8cabdd5282c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,118 @@
+op {
+  graph_op_name: "QuantizedBatchNormWithGlobalNormalization"
+  in_arg {
+    name: "t"
+    description: <<END
+A 4D input Tensor.
+END
+  }
+  in_arg {
+    name: "t_min"
+    description: <<END
+The value represented by the lowest quantized input.
+END
+  }
+  in_arg {
+    name: "t_max"
+    description: <<END
+The value represented by the highest quantized input.
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+A 1D mean Tensor with size matching the last dimension of t.
+This is the first output from tf.nn.moments,
+or a saved moving average thereof.
+END
+  }
+  in_arg {
+    name: "m_min"
+    description: <<END
+The value represented by the lowest quantized mean.
+END
+  }
+  in_arg {
+    name: "m_max"
+    description: <<END
+The value represented by the highest quantized mean.
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+A 1D variance Tensor with size matching the last dimension of t.
+This is the second output from tf.nn.moments,
+or a saved moving average thereof.
+END
+  }
+  in_arg {
+    name: "v_min"
+    description: <<END
+The value represented by the lowest quantized variance.
+END
+  }
+  in_arg {
+    name: "v_max"
+    description: <<END
+The value represented by the highest quantized variance.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+A 1D beta Tensor with size matching the last dimension of t.
+An offset to be added to the normalized tensor.
+END
+  }
+  in_arg {
+    name: "beta_min"
+    description: <<END
+The value represented by the lowest quantized offset.
+END
+  }
+  in_arg {
+    name: "beta_max"
+    description: <<END
+The value represented by the highest quantized offset.
+END
+  }
+  in_arg {
+    name: "gamma"
+    description: <<END
+A 1D gamma Tensor with size matching the last dimension of t.
+If "scale_after_normalization" is true, this tensor will be multiplied
+with the normalized tensor.
+END
+  }
+  in_arg {
+    name: "gamma_min"
+    description: <<END
+The value represented by the lowest quantized gamma.
+END
+  }
+  in_arg {
+    name: "gamma_max"
+    description: <<END
+The value represented by the highest quantized gamma.
+END
+  }
+  attr {
+    name: "variance_epsilon"
+    description: <<END
+A small float number to avoid dividing by 0.
+END
+  }
+  attr {
+    name: "scale_after_normalization"
+    description: <<END
+A bool indicating whether the resulted tensor
+needs to be multiplied with gamma.
+END
+  }
+  summary: "Quantized Batch normalization."
+  description: <<END
+This op is deprecated and will be removed in the future. Prefer
+`tf.nn.batch_normalization`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedBiasAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedBiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d714e3aa28ed47b58d44df1f77b9ad9821838d3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedBiasAdd.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "QuantizedBiasAdd"
+  in_arg {
+    name: "bias"
+    description: <<END
+A 1D bias Tensor with size matching the last dimension of 'input'.
+END
+  }
+  in_arg {
+    name: "min_input"
+    description: <<END
+The float value that the lowest quantized input value represents.
+END
+  }
+  in_arg {
+    name: "max_input"
+    description: <<END
+The float value that the highest quantized input value represents.
+END
+  }
+  in_arg {
+    name: "min_bias"
+    description: <<END
+The float value that the lowest quantized bias value represents.
+END
+  }
+  in_arg {
+    name: "max_bias"
+    description: <<END
+The float value that the highest quantized bias value represents.
+END
+  }
+  out_arg {
+    name: "min_out"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_out"
+    description: <<END
+The float value that the highest quantized output value represents.
+END
+  }
+  summary: "Adds Tensor \'bias\' to Tensor \'input\' for Quantized types."
+  description: <<END
+Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e39654fe9010836f5e8c1e8be80c677b33252fae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConcat.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "QuantizedConcat"
+  in_arg {
+    name: "concat_dim"
+    description: <<END
+0-D.  The dimension along which to concatenate.  Must be in the
+range [0, rank(values)).
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+The `N` Tensors to concatenate. Their ranks and types must match,
+and their sizes must match in all dimensions except `concat_dim`.
+END
+  }
+  in_arg {
+    name: "input_mins"
+    description: <<END
+The minimum scalar values for each of the input tensors.
+END
+  }
+  in_arg {
+    name: "input_maxes"
+    description: <<END
+The maximum scalar values for each of the input tensors.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A `Tensor` with the concatenation of values stacked along the
+`concat_dim` dimension.  This tensor's shape matches that of `values` except
+in `concat_dim` where it has the sum of the sizes.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The float value that the minimum quantized output value represents.
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The float value that the maximum quantized output value represents.
+END
+  }
+  summary: "Concatenates quantized tensors along one dimension."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d18bafdce9b3aaccfae6eff0c489e133b492f26d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2D.pbtxt
@@ -0,0 +1,75 @@
+op {
+  graph_op_name: "QuantizedConv2D"
+  in_arg {
+    name: "filter"
+    description: <<END
+filter's input_depth dimension must match input's depth dimensions.
+END
+  }
+  in_arg {
+    name: "min_input"
+    description: <<END
+The float value that the lowest quantized input value represents.
+END
+  }
+  in_arg {
+    name: "max_input"
+    description: <<END
+The float value that the highest quantized input value represents.
+END
+  }
+  in_arg {
+    name: "min_filter"
+    description: <<END
+The float value that the lowest quantized filter value represents.
+END
+  }
+  in_arg {
+    name: "max_filter"
+    description: <<END
+The float value that the highest quantized filter value represents.
+END
+  }
+  out_arg {
+    name: "min_output"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_output"
+    description: <<END
+The float value that the highest quantized output value represents.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+tensor.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "dilations"
+    description: <<END
+1-D tensor of length 4.  The dilation factor for each dimension of
+`input`. If set to k > 1, there will be k-1 skipped cells between each
+filter element on that dimension. The dimension order is determined by the
+value of `data_format`, see above for details. Dilations in the batch and
+depth dimensions must be 1.
+END
+  }
+  summary: "Computes a 2D convolution given quantized 4D input and filter tensors."
+  description: <<END
+The inputs are quantized tensors where the lowest value represents the real
+number of the associated minimum, and the highest represents the maximum.
+This means that you can only interpret the quantized output in the same way, by
+taking the returned minimum and maximum values into account.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedInstanceNorm.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedInstanceNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c30870fde48782101849fcfdd02169f201bfe60
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedInstanceNorm.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "QuantizedInstanceNorm"
+  in_arg {
+    name: "x"
+    description: <<END
+A 4D input Tensor.
+END
+  }
+  in_arg {
+    name: "x_min"
+    description: <<END
+The value represented by the lowest quantized input.
+END
+  }
+  in_arg {
+    name: "x_max"
+    description: <<END
+The value represented by the highest quantized input.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A 4D Tensor.
+END
+  }
+  out_arg {
+    name: "y_min"
+    description: <<END
+The value represented by the lowest quantized output.
+END
+  }
+  out_arg {
+    name: "y_max"
+    description: <<END
+The value represented by the highest quantized output.
+END
+  }
+  attr {
+    name: "output_range_given"
+    description: <<END
+If True, `given_y_min` and `given_y_min`
+and `given_y_max` are used as the output range. Otherwise,
+the implementation computes the output range.
+END
+  }
+  attr {
+    name: "given_y_min"
+    description: <<END
+Output in `y_min` if `output_range_given` is True.
+END
+  }
+  attr {
+    name: "given_y_max"
+    description: <<END
+Output in `y_max` if `output_range_given` is True.
+END
+  }
+  attr {
+    name: "variance_epsilon"
+    description: <<END
+A small float number to avoid dividing by 0.
+END
+  }
+  attr {
+    name: "min_separation"
+    description: <<END
+Minimum value of `y_max - y_min`
+END
+  }
+  summary: "Quantized Instance normalization."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedMatMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d31820890084f39e746b1f7ee92c3dbc29d8f520
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedMatMul.pbtxt
@@ -0,0 +1,77 @@
+op {
+  graph_op_name: "QuantizedMatMul"
+  in_arg {
+    name: "a"
+    description: <<END
+Must be a two-dimensional tensor.
+END
+  }
+  in_arg {
+    name: "b"
+    description: <<END
+Must be a two-dimensional tensor.
+END
+  }
+  in_arg {
+    name: "min_a"
+    description: <<END
+The float value that the lowest quantized `a` value represents.
+END
+  }
+  in_arg {
+    name: "max_a"
+    description: <<END
+The float value that the highest quantized `a` value represents.
+END
+  }
+  in_arg {
+    name: "min_b"
+    description: <<END
+The float value that the lowest quantized `b` value represents.
+END
+  }
+  in_arg {
+    name: "max_b"
+    description: <<END
+The float value that the highest quantized `b` value represents.
+END
+  }
+  out_arg {
+    name: "min_out"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_out"
+    description: <<END
+The float value that the highest quantized output value represents.
+END
+  }
+  attr {
+    name: "transpose_a"
+    description: <<END
+If true, `a` is transposed before multiplication.
+END
+  }
+  attr {
+    name: "transpose_b"
+    description: <<END
+If true, `b` is transposed before multiplication.
+END
+  }
+  attr {
+    name: "Tactivation"
+    description: <<END
+The type of output produced by activation function
+following this operation.
+END
+  }
+  summary: "Perform a quantized matrix multiplication of  `a` by the matrix `b`."
+  description: <<END
+The inputs must be two-dimensional matrices and the inner dimension of
+`a` (after being transposed if `transpose_a` is non-zero) must match the
+outer dimension of `b` (after being transposed if `transposed_b` is
+non-zero).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..208950754b169ed77b29a0d270188706ef4b607c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedMaxPool.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "QuantizedMaxPool"
+  in_arg {
+    name: "input"
+    description: <<END
+The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+END
+  }
+  in_arg {
+    name: "min_input"
+    description: <<END
+The float value that the lowest quantized input value represents.
+END
+  }
+  in_arg {
+    name: "max_input"
+    description: <<END
+The float value that the highest quantized input value represents.
+END
+  }
+  out_arg {
+    name: "min_output"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_output"
+    description: <<END
+The float value that the highest quantized output value represents.
+END
+  }
+  attr {
+    name: "ksize"
+    description: <<END
+The size of the window for each dimension of the input tensor.
+The length must be 4 to match the number of dimensions of the input.
+END
+  }
+  attr {
+    name: "strides"
+    description: <<END
+The stride of the sliding window for each dimension of the input
+tensor. The length must be 4 to match the number of dimensions of the input.
+END
+  }
+  attr {
+    name: "padding"
+    description: <<END
+The type of padding algorithm to use.
+END
+  }
+  summary: "Produces the max pool of the input tensor for quantized types."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6061204f359f5c374b9587a32de436a13eebb2d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedMul.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "QuantizedMul"
+  in_arg {
+    name: "min_x"
+    description: <<END
+The float value that the lowest quantized `x` value represents.
+END
+  }
+  in_arg {
+    name: "max_x"
+    description: <<END
+The float value that the highest quantized `x` value represents.
+END
+  }
+  in_arg {
+    name: "min_y"
+    description: <<END
+The float value that the lowest quantized `y` value represents.
+END
+  }
+  in_arg {
+    name: "max_y"
+    description: <<END
+The float value that the highest quantized `y` value represents.
+END
+  }
+  out_arg {
+    name: "min_z"
+    description: <<END
+The float value that the lowest quantized output value represents.
+END
+  }
+  out_arg {
+    name: "max_z"
+    description: <<END
+The float value that the highest quantized output value represents.
+
+*NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+  }
+  summary: "Returns x * y element-wise, working on quantized buffers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..519fbf1806dcdecc5d31d1fab77f9ff35922ac2d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedRelu.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "QuantizedRelu"
+  in_arg {
+    name: "min_features"
+    description: <<END
+The float value that the lowest quantized value represents.
+END
+  }
+  in_arg {
+    name: "max_features"
+    description: <<END
+The float value that the highest quantized value represents.
+END
+  }
+  out_arg {
+    name: "activations"
+    description: <<END
+Has the same output shape as "features".
+END
+  }
+  out_arg {
+    name: "min_activations"
+    description: <<END
+The float value that the lowest quantized value represents.
+END
+  }
+  out_arg {
+    name: "max_activations"
+    description: <<END
+The float value that the highest quantized value represents.
+END
+  }
+  summary: "Computes Quantized Rectified Linear: `max(features, 0)`"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedRelu6.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedRelu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..62fd01b4aaba69bb46f1d6052fb76dede326e400
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedRelu6.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "QuantizedRelu6"
+  in_arg {
+    name: "min_features"
+    description: <<END
+The float value that the lowest quantized value represents.
+END
+  }
+  in_arg {
+    name: "max_features"
+    description: <<END
+The float value that the highest quantized value represents.
+END
+  }
+  out_arg {
+    name: "activations"
+    description: <<END
+Has the same output shape as "features".
+END
+  }
+  out_arg {
+    name: "min_activations"
+    description: <<END
+The float value that the lowest quantized value represents.
+END
+  }
+  out_arg {
+    name: "max_activations"
+    description: <<END
+The float value that the highest quantized value represents.
+END
+  }
+  summary: "Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedReluX.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5763a19677fd4b3580e7a5f613692ad0290d701d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedReluX.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "QuantizedReluX"
+  in_arg {
+    name: "min_features"
+    description: <<END
+The float value that the lowest quantized value represents.
+END
+  }
+  in_arg {
+    name: "max_features"
+    description: <<END
+The float value that the highest quantized value represents.
+END
+  }
+  out_arg {
+    name: "activations"
+    description: <<END
+Has the same output shape as "features".
+END
+  }
+  out_arg {
+    name: "min_activations"
+    description: <<END
+The float value that the lowest quantized value represents.
+END
+  }
+  out_arg {
+    name: "max_activations"
+    description: <<END
+The float value that the highest quantized value represents.
+END
+  }
+  summary: "Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedReshape.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b20333f8c71ef7aced5731811059200c2c9a6243
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedReshape.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "QuantizedReshape"
+  in_arg {
+    name: "shape"
+    description: <<END
+Defines the shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input.
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+This value is copied from input_min.
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+This value is copied from input_max.
+END
+  }
+  summary: "Reshapes a quantized tensor as per the Reshape op."
+  description: <<END
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b3ba72e530043746a33a9467594ed8fb49dd2e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "QuantizedResizeBilinear"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+new size for the images.
+END
+  }
+  out_arg {
+    name: "resized_images"
+    description: <<END
+4-D with shape
+`[batch, new_height, new_width, channels]`.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale input by (new_height - 1) / (height - 1), which
+exactly aligns the 4 corners of images and resized images. If false, rescale
+by new_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Resize quantized `images` to `size` using quantized bilinear interpolation."
+  description: <<END
+Input images and output images must be quantized types.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueClose.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..950425a853e1502980e5ada1ce0598530afed1c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueClose.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "QueueClose"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    description: <<END
+If true, all pending enqueue requests that are
+blocked on the given queue will be canceled.
+END
+  }
+  summary: "Closes the given queue."
+  description: <<END
+This operation signals that no more elements will be enqueued in the
+given queue. Subsequent Enqueue(Many) operations will fail.
+Subsequent Dequeue(Many) operations will continue to succeed if
+sufficient elements remain in the queue. Subsequent Dequeue(Many)
+operations that would block will fail immediately.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueCloseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5603269a02acdf297202a79ea6d1231db9f42e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueCloseV2.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "QueueCloseV2"
+  endpoint {
+    name: "QueueClose"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  attr {
+    name: "cancel_pending_enqueues"
+    description: <<END
+If true, all pending enqueue requests that are
+blocked on the given queue will be canceled.
+END
+  }
+  summary: "Closes the given queue."
+  description: <<END
+This operation signals that no more elements will be enqueued in the
+given queue. Subsequent Enqueue(Many) operations will fail.
+Subsequent Dequeue(Many) operations will continue to succeed if
+sufficient elements remain in the queue. Subsequent Dequeue(Many)
+operations that would block will fail immediately.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3290e10f0b32d7ea303056b3fb6ba64460590d15
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueDequeue.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "QueueDequeue"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+One or more tensors that were dequeued as a tuple.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a tuple.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is empty, this operation will block for up to
+timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Dequeues a tuple of one or more tensors from the given queue."
+  description: <<END
+This operation has k outputs, where k is the number of components
+in the tuples stored in the given queue, and output i is the ith
+component of the dequeued tuple.
+
+N.B. If the queue is empty, this operation will block until an element
+has been dequeued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueDequeueMany.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueDequeueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2247b37bb2f40f0216f9c4b17c966a4f4c83596d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueDequeueMany.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "QueueDequeueMany"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "n"
+    description: <<END
+The number of tuples to dequeue.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+One or more tensors that were dequeued as a tuple.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a tuple.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue has fewer than n elements, this operation
+will block for up to timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: <<END
+If the queue is closed and there are fewer than `n` elements, then an
+OutOfRange error is returned.
+
+This operation concatenates queue-element component tensors along the
+0th dimension to make a single component tensor.  All of the components
+in the dequeued tuple will have size `n` in the 0th dimension.
+
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
+component of the dequeued tuple.
+
+N.B. If the queue is empty, this operation will block until `n` elements
+have been dequeued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueDequeueManyV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueDequeueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34a65c2944d0010f49ecf1243229d09ec7e7e1e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueDequeueManyV2.pbtxt
@@ -0,0 +1,54 @@
+op {
+  graph_op_name: "QueueDequeueManyV2"
+  endpoint {
+    name: "QueueDequeueMany"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "n"
+    description: <<END
+The number of tuples to dequeue.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+One or more tensors that were dequeued as a tuple.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a tuple.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue has fewer than n elements, this operation
+will block for up to timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: <<END
+If the queue is closed and there are fewer than `n` elements, then an
+OutOfRange error is returned.
+
+This operation concatenates queue-element component tensors along the
+0th dimension to make a single component tensor.  All of the components
+in the dequeued tuple will have size `n` in the 0th dimension.
+
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
+component of the dequeued tuple.
+
+N.B. If the queue is empty, this operation will block until `n` elements
+have been dequeued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueDequeueUpTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueDequeueUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0c7c204aaa057038796017da433147d05f1bb89
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueDequeueUpTo.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "QueueDequeueUpTo"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "n"
+    description: <<END
+The number of tuples to dequeue.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+One or more tensors that were dequeued as a tuple.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a tuple.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue has fewer than n elements, this operation
+will block for up to timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: <<END
+This operation is not supported by all queues.  If a queue does not support
+DequeueUpTo, then an Unimplemented error is returned.
+
+If the queue is closed and there are more than 0 but less than `n`
+elements remaining, then instead of returning an OutOfRange error like
+QueueDequeueMany, less than `n` elements are returned immediately.  If
+the queue is closed and there are 0 elements left in the queue, then
+an OutOfRange error is returned just like in QueueDequeueMany.
+Otherwise the behavior is identical to QueueDequeueMany:
+
+This operation concatenates queue-element component tensors along the
+0th dimension to make a single component tensor.  All of the components
+in the dequeued tuple will have size `n` in the 0th dimension.
+
+This operation has k outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
+component of the dequeued tuple.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueDequeueUpToV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueDequeueUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..003e5f2c75c593e5be256237d89be23341459642
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueDequeueUpToV2.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "QueueDequeueUpToV2"
+  endpoint {
+    name: "QueueDequeueUpTo"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "n"
+    description: <<END
+The number of tuples to dequeue.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+One or more tensors that were dequeued as a tuple.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a tuple.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue has fewer than n elements, this operation
+will block for up to timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Dequeues `n` tuples of one or more tensors from the given queue."
+  description: <<END
+This operation is not supported by all queues.  If a queue does not support
+DequeueUpTo, then an Unimplemented error is returned.
+
+If the queue is closed and there are more than 0 but less than `n`
+elements remaining, then instead of returning an OutOfRange error like
+QueueDequeueMany, less than `n` elements are returned immediately.  If
+the queue is closed and there are 0 elements left in the queue, then
+an OutOfRange error is returned just like in QueueDequeueMany.
+Otherwise the behavior is identical to QueueDequeueMany:
+
+This operation concatenates queue-element component tensors along the
+0th dimension to make a single component tensor.  All of the components
+in the dequeued tuple will have size n in the 0th dimension.
+
+This operation has `k` outputs, where `k` is the number of components in
+the tuples stored in the given queue, and output `i` is the ith
+component of the dequeued tuple.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueDequeueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueDequeueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fda760cfe569e9775cf1ceab3cd4cf4384b0c66b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueDequeueV2.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "QueueDequeueV2"
+  endpoint {
+    name: "QueueDequeue"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  out_arg {
+    name: "components"
+    description: <<END
+One or more tensors that were dequeued as a tuple.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a tuple.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is empty, this operation will block for up to
+timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Dequeues a tuple of one or more tensors from the given queue."
+  description: <<END
+This operation has k outputs, where k is the number of components
+in the tuples stored in the given queue, and output i is the ith
+component of the dequeued tuple.
+
+N.B. If the queue is empty, this operation will block until an element
+has been dequeued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76477b51da2868e03715f6fbae515b7003a10455
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueEnqueue.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "QueueEnqueue"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "components"
+    description: <<END
+One or more tensors from which the enqueued tensors should be taken.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is full, this operation will block for up to
+timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Enqueues a tuple of one or more tensors in the given queue."
+  description: <<END
+The components input has k elements, which correspond to the components of
+tuples stored in the given queue.
+
+N.B. If the queue is full, this operation will block until the given
+element has been enqueued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueEnqueueMany.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueEnqueueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbd282d9b9d16dfad48400b89dbe48d39ddcdaac
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueEnqueueMany.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "QueueEnqueueMany"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "components"
+    description: <<END
+One or more tensors from which the enqueued tensors should
+be taken.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is too full, this operation will block for up
+to timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Enqueues zero or more tuples of one or more tensors in the given queue."
+  description: <<END
+This operation slices each component tensor along the 0th dimension to
+make multiple queue elements. All of the tuple components must have the
+same size in the 0th dimension.
+
+The components input has k elements, which correspond to the components of
+tuples stored in the given queue.
+
+N.B. If the queue is full, this operation will block until the given
+elements have been enqueued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueEnqueueManyV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueEnqueueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c721caa253487f69c8f846810954e524db259f8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueEnqueueManyV2.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "QueueEnqueueManyV2"
+  endpoint {
+    name: "QueueEnqueueMany"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "components"
+    description: <<END
+One or more tensors from which the enqueued tensors should
+be taken.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is too full, this operation will block for up
+to timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Enqueues zero or more tuples of one or more tensors in the given queue."
+  description: <<END
+This operation slices each component tensor along the 0th dimension to
+make multiple queue elements. All of the tuple components must have the
+same size in the 0th dimension.
+
+The components input has k elements, which correspond to the components of
+tuples stored in the given queue.
+
+N.B. If the queue is full, this operation will block until the given
+elements have been enqueued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueEnqueueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueEnqueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..367d197cb09fe0a213ae8f8d8c70a6a699a12aa2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueEnqueueV2.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "QueueEnqueueV2"
+  endpoint {
+    name: "QueueEnqueue"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  in_arg {
+    name: "components"
+    description: <<END
+One or more tensors from which the enqueued tensors should be taken.
+END
+  }
+  attr {
+    name: "timeout_ms"
+    description: <<END
+If the queue is full, this operation will block for up to
+timeout_ms milliseconds.
+Note: This option is not supported yet.
+END
+  }
+  summary: "Enqueues a tuple of one or more tensors in the given queue."
+  description: <<END
+The components input has k elements, which correspond to the components of
+tuples stored in the given queue.
+
+N.B. If the queue is full, this operation will block until the given
+element has been enqueued (or 'timeout_ms' elapses, if specified).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueIsClosed.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueIsClosed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9412b2e6d691cccc9beab727b147a4de73fa8a24
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueIsClosed.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "QueueIsClosed"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  summary: "Returns true if queue is closed."
+  description: <<END
+This operation returns true if the queue is closed and false if the queue
+is open.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueIsClosedV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueIsClosedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45aa4d10fbb24d877e5ed1e17628df164ae5ac2a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueIsClosedV2.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "QueueIsClosedV2"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  summary: "Returns true if queue is closed."
+  description: <<END
+This operation returns true if the queue is closed and false if the queue
+is open.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74fd38c0ee1bb6323ad830b8aa39a8c7eb0c67ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueSize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "QueueSize"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+The number of elements in the given queue.
+END
+  }
+  summary: "Computes the number of elements in the given queue."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QueueSizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QueueSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0cfa40f652681bd30fc1493bdcc8fc55cca31de
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QueueSizeV2.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "QueueSizeV2"
+  endpoint {
+    name: "QueueSize"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a queue.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+The number of elements in the given queue.
+END
+  }
+  summary: "Computes the number of elements in the given queue."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_R.pbtxt b/tensorflow/core/api_def/base_api/api_def_R.pbtxt
deleted file mode 100644
index 4c398c97713fab491204a7793046569142eb5f6e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_R.pbtxt
+++ /dev/null
@@ -1,1392 +0,0 @@
-op {
-  graph_op_name: "RFFT"
-  endpoint {
-    name: "RFFT"
-  }
-  summary: "Real-valued fast Fourier transform."
-  description: <<END
-Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-over the inner-most dimension of `input`.
-
-Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-followed by the `fft_length / 2` positive-frequency terms.
-
-Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-END
-}
-op {
-  graph_op_name: "RFFT2D"
-  endpoint {
-    name: "RFFT2D"
-  }
-  summary: "2D real-valued fast Fourier transform."
-  description: <<END
-Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-over the inner-most 2 dimensions of `input`.
-
-Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-of `output`: the zero-frequency term, followed by the `fft_length / 2`
-positive-frequency terms.
-
-Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-END
-}
-op {
-  graph_op_name: "RFFT3D"
-  endpoint {
-    name: "RFFT3D"
-  }
-  summary: "3D real-valued fast Fourier transform."
-  description: <<END
-Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-over the inner-most 3 dimensions of `input`.
-
-Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-of `output`: the zero-frequency term, followed by the `fft_length / 2`
-positive-frequency terms.
-
-Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-corresponding dimension of `input`, the dimension is cropped. If it is larger,
-the dimension is padded with zeros.
-END
-}
-op {
-  graph_op_name: "RGBToHSV"
-  endpoint {
-    name: "RGBToHSV"
-  }
-  summary: "Converts one or more images from RGB to HSV."
-  description: <<END
-Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-value of the pixels. The output is only well defined if the value in `images`
-are in `[0,1]`.
-
-`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
-END
-}
-op {
-  graph_op_name: "RandomCrop"
-  endpoint {
-    name: "RandomCrop"
-  }
-  summary: "Randomly crop `image`."
-  description: <<END
-`size` is a 1-D int64 tensor with 2 elements representing the crop height and
-width.  The values must be non negative.
-
-This Op picks a random location in `image` and crops a `height` by `width`
-rectangle from that location.  The random location is picked so the cropped
-area will fit inside the original image.
-END
-}
-op {
-  graph_op_name: "RandomGamma"
-  endpoint {
-    name: "RandomGamma"
-  }
-  summary: "Outputs random values from the Gamma distribution(s) described by alpha."
-  description: <<END
-This op uses the algorithm by Marsaglia et al. to acquire samples via
-transformation-rejection from pairs of uniform and normal random variables.
-See http://dl.acm.org/citation.cfm?id=358414
-END
-}
-op {
-  graph_op_name: "RandomPoisson"
-  endpoint {
-    name: "RandomPoisson"
-  }
-  summary: "Outputs random values from the Poisson distribution(s) described by rate."
-  description: <<END
-This op uses two algorithms, depending on rate. If rate >= 10, then
-the algorithm by Hormann is used to acquire samples via
-transformation-rejection.
-See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-
-Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-random variables.
-See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-Programming, Volume 2. Addison Wesley
-END
-}
-op {
-  graph_op_name: "RandomPoissonV2"
-  endpoint {
-    name: "RandomPoissonV2"
-  }
-  summary: "Outputs random values from the Poisson distribution(s) described by rate."
-  description: <<END
-This op uses two algorithms, depending on rate. If rate >= 10, then
-the algorithm by Hormann is used to acquire samples via
-transformation-rejection.
-See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-
-Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-random variables.
-See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-Programming, Volume 2. Addison Wesley
-END
-}
-op {
-  graph_op_name: "RandomShuffle"
-  endpoint {
-    name: "RandomShuffle"
-  }
-  summary: "Randomly shuffles a tensor along its first dimension."
-  description: <<END
-  The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-  to one and only one `output[i]`. For example, a mapping that might occur for a
-  3x2 tensor is:
-
-```
-[[1, 2],       [[5, 6],
- [3, 4],  ==>   [1, 2],
- [5, 6]]        [3, 4]]
-```
-END
-}
-op {
-  graph_op_name: "RandomShuffleQueue"
-  endpoint {
-    name: "RandomShuffleQueue"
-  }
-  summary: "A queue that randomizes the order of elements."
-}
-op {
-  graph_op_name: "RandomShuffleQueueV2"
-  endpoint {
-    name: "RandomShuffleQueueV2"
-  }
-  summary: "A queue that randomizes the order of elements."
-}
-op {
-  graph_op_name: "RandomStandardNormal"
-  endpoint {
-    name: "RandomStandardNormal"
-  }
-  summary: "Outputs random values from a normal distribution."
-  description: <<END
-The generated values will have mean 0 and standard deviation 1.
-END
-}
-op {
-  graph_op_name: "RandomUniform"
-  endpoint {
-    name: "RandomUniform"
-  }
-  summary: "Outputs random values from a uniform distribution."
-  description: <<END
-The generated values follow a uniform distribution in the range `[0, 1)`. The
-lower bound 0 is included in the range, while the upper bound 1 is excluded.
-END
-}
-op {
-  graph_op_name: "RandomUniformInt"
-  endpoint {
-    name: "RandomUniformInt"
-  }
-  summary: "Outputs random integers from a uniform distribution."
-  description: <<END
-The generated values are uniform integers in the range `[minval, maxval)`.
-The lower bound `minval` is included in the range, while the upper bound
-`maxval` is excluded.
-
-The random integers are slightly biased unless `maxval - minval` is an exact
-power of two.  The bias is small for values of `maxval - minval` significantly
-smaller than the range of the output (either `2^32` or `2^64`).
-END
-}
-op {
-  graph_op_name: "Range"
-  endpoint {
-    name: "Range"
-  }
-  summary: "Creates a sequence of numbers."
-  description: <<END
-This operation creates a sequence of numbers that begins at `start` and
-extends by increments of `delta` up to but not including `limit`.
-
-For example:
-
-```
-# 'start' is 3
-# 'limit' is 18
-# 'delta' is 3
-tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-```
-END
-}
-op {
-  graph_op_name: "RangeDataset"
-  endpoint {
-    name: "RangeDataset"
-  }
-  summary: "Creates a dataset with a range of values. Corresponds to python\'s xrange."
-}
-op {
-  graph_op_name: "Rank"
-  endpoint {
-    name: "Rank"
-  }
-  summary: "Returns the rank of a tensor."
-  description: <<END
-This operation returns an integer representing the rank of `input`.
-
-For example:
-
-```
-# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-# shape of tensor 't' is [2, 2, 3]
-rank(t) ==> 3
-```
-
-**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-of a tensor is the number of indices required to uniquely select each element
-of the tensor. Rank is also known as "order", "degree", or "ndims."
-END
-}
-op {
-  graph_op_name: "ReadFile"
-  endpoint {
-    name: "ReadFile"
-  }
-  summary: "Reads and outputs the entire contents of the input filename."
-}
-op {
-  graph_op_name: "ReaderNumRecordsProduced"
-  endpoint {
-    name: "ReaderNumRecordsProduced"
-  }
-  summary: "Returns the number of records this Reader has produced."
-  description: <<END
-This is the same as the number of ReaderRead executions that have
-succeeded.
-END
-}
-op {
-  graph_op_name: "ReaderNumRecordsProducedV2"
-  endpoint {
-    name: "ReaderNumRecordsProducedV2"
-  }
-  summary: "Returns the number of records this Reader has produced."
-  description: <<END
-This is the same as the number of ReaderRead executions that have
-succeeded.
-END
-}
-op {
-  graph_op_name: "ReaderNumWorkUnitsCompleted"
-  endpoint {
-    name: "ReaderNumWorkUnitsCompleted"
-  }
-  summary: "Returns the number of work units this Reader has finished processing."
-}
-op {
-  graph_op_name: "ReaderNumWorkUnitsCompletedV2"
-  endpoint {
-    name: "ReaderNumWorkUnitsCompletedV2"
-  }
-  summary: "Returns the number of work units this Reader has finished processing."
-}
-op {
-  graph_op_name: "ReaderRead"
-  endpoint {
-    name: "ReaderRead"
-  }
-  summary: "Returns the next record (key, value pair) produced by a Reader."
-  description: <<END
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-END
-}
-op {
-  graph_op_name: "ReaderReadUpTo"
-  endpoint {
-    name: "ReaderReadUpTo"
-  }
-  summary: "Returns up to `num_records` (key, value) pairs produced by a Reader."
-  description: <<END
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-It may return less than `num_records` even before the last batch.
-END
-}
-op {
-  graph_op_name: "ReaderReadUpToV2"
-  endpoint {
-    name: "ReaderReadUpToV2"
-  }
-  summary: "Returns up to `num_records` (key, value) pairs produced by a Reader."
-  description: <<END
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-It may return less than `num_records` even before the last batch.
-END
-}
-op {
-  graph_op_name: "ReaderReadV2"
-  endpoint {
-    name: "ReaderReadV2"
-  }
-  summary: "Returns the next record (key, value pair) produced by a Reader."
-  description: <<END
-Will dequeue from the input queue if necessary (e.g. when the
-Reader needs to start reading from a new file since it has finished
-with the previous file).
-END
-}
-op {
-  graph_op_name: "ReaderReset"
-  endpoint {
-    name: "ReaderReset"
-  }
-  summary: "Restore a Reader to its initial clean state."
-}
-op {
-  graph_op_name: "ReaderResetV2"
-  endpoint {
-    name: "ReaderResetV2"
-  }
-  summary: "Restore a Reader to its initial clean state."
-}
-op {
-  graph_op_name: "ReaderRestoreState"
-  endpoint {
-    name: "ReaderRestoreState"
-  }
-  summary: "Restore a reader to a previously saved state."
-  description: <<END
-Not all Readers support being restored, so this can produce an
-Unimplemented error.
-END
-}
-op {
-  graph_op_name: "ReaderRestoreStateV2"
-  endpoint {
-    name: "ReaderRestoreStateV2"
-  }
-  summary: "Restore a reader to a previously saved state."
-  description: <<END
-Not all Readers support being restored, so this can produce an
-Unimplemented error.
-END
-}
-op {
-  graph_op_name: "ReaderSerializeState"
-  endpoint {
-    name: "ReaderSerializeState"
-  }
-  summary: "Produce a string tensor that encodes the state of a Reader."
-  description: <<END
-Not all Readers support being serialized, so this can produce an
-Unimplemented error.
-END
-}
-op {
-  graph_op_name: "ReaderSerializeStateV2"
-  endpoint {
-    name: "ReaderSerializeStateV2"
-  }
-  summary: "Produce a string tensor that encodes the state of a Reader."
-  description: <<END
-Not all Readers support being serialized, so this can produce an
-Unimplemented error.
-END
-}
-op {
-  graph_op_name: "Real"
-  endpoint {
-    name: "Real"
-  }
-  summary: "Returns the real part of a complex number."
-  description: <<END
-Given a tensor `input` of complex numbers, this operation returns a tensor of
-type `float` that is the real part of each element in `input`. All elements in
-`input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
- part returned by this operation and *b* is the imaginary part.
-
-For example:
-
-```
-# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-tf.real(input) ==> [-2.25, 3.25]
-```
-END
-}
-op {
-  graph_op_name: "RealDiv"
-  endpoint {
-    name: "RealDiv"
-  }
-  summary: "Returns x / y element-wise for real types."
-  description: <<END
-If `x` and `y` are reals, this will return the floating-point division.
-
-*NOTE*: `Div` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Reciprocal"
-  endpoint {
-    name: "Reciprocal"
-  }
-  summary: "Computes the reciprocal of x element-wise."
-  description: <<END
-I.e., \\(y = 1 / x\\).
-END
-}
-op {
-  graph_op_name: "ReciprocalGrad"
-  endpoint {
-    name: "ReciprocalGrad"
-  }
-  summary: "Computes the gradient for the inverse of `x` wrt its input."
-  description: <<END
-Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-is the corresponding input gradient.
-END
-}
-op {
-  graph_op_name: "RecordInput"
-  endpoint {
-    name: "RecordInput"
-  }
-  summary: "Emits randomized records."
-}
-op {
-  graph_op_name: "ReduceJoin"
-  endpoint {
-    name: "ReduceJoin"
-  }
-  summary: "Joins a string Tensor across the given dimensions."
-  description: <<END
-Computes the string join across dimensions in the given string Tensor of shape
-`[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-strings with the given separator (default: empty string).  Negative indices are
-counted backwards from the end, with `-1` being equivalent to `n - 1`.
-
-For example:
-
-```python
-# tensor `a` is [["a", "b"], ["c", "d"]]
-tf.reduce_join(a, 0) ==> ["ac", "bd"]
-tf.reduce_join(a, 1) ==> ["ab", "cd"]
-tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-tf.reduce_join(a, []) ==> ["abcd"]
-```
-END
-}
-op {
-  graph_op_name: "RefEnter"
-  endpoint {
-    name: "RefEnter"
-  }
-  summary: "Creates or finds a child frame, and makes `data` available to the child frame."
-  description: <<END
-The unique `frame_name` is used by the `Executor` to identify frames. If
-`is_constant` is true, `output` is a constant in the child frame; otherwise
-it may be changed in the child frame. At most `parallel_iterations` iterations
-are run in parallel in the child frame.
-END
-}
-op {
-  graph_op_name: "RefExit"
-  endpoint {
-    name: "RefExit"
-  }
-  summary: "Exits the current frame to its parent frame."
-  description: <<END
-Exit makes its input `data` available to the parent frame.
-END
-}
-op {
-  graph_op_name: "RefIdentity"
-  endpoint {
-    name: "RefIdentity"
-  }
-  summary: "Return the same ref tensor as the input ref tensor."
-}
-op {
-  graph_op_name: "RefMerge"
-  endpoint {
-    name: "RefMerge"
-  }
-  summary: "Forwards the value of an available tensor from `inputs` to `output`."
-  description: <<END
-`Merge` waits for at least one of the tensors in `inputs` to become available.
-It is usually combined with `Switch` to implement branching.
-
-`Merge` forwards the first tensor for become available to `output`, and sets
-`value_index` to its index in `inputs`.
-END
-}
-op {
-  graph_op_name: "RefNextIteration"
-  endpoint {
-    name: "RefNextIteration"
-  }
-  summary: "Makes its input available to the next iteration."
-}
-op {
-  graph_op_name: "RefSelect"
-  endpoint {
-    name: "RefSelect"
-  }
-  summary: "Forwards the `index`th element of `inputs` to `output`."
-}
-op {
-  graph_op_name: "RefSwitch"
-  endpoint {
-    name: "RefSwitch"
-  }
-  summary: "Forwards the ref tensor `data` to the output port determined by `pred`."
-  description: <<END
-If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-the data goes to `output_false`.
-
-See also `Switch` and `Merge`.
-END
-}
-op {
-  graph_op_name: "Relu"
-  endpoint {
-    name: "Relu"
-  }
-  summary: "Computes rectified linear: `max(features, 0)`."
-}
-op {
-  graph_op_name: "Relu6"
-  endpoint {
-    name: "Relu6"
-  }
-  summary: "Computes rectified linear 6: `min(max(features, 0), 6)`."
-}
-op {
-  graph_op_name: "Relu6Grad"
-  endpoint {
-    name: "Relu6Grad"
-  }
-  summary: "Computes rectified linear 6 gradients for a Relu6 operation."
-}
-op {
-  graph_op_name: "ReluGrad"
-  endpoint {
-    name: "ReluGrad"
-  }
-  summary: "Computes rectified linear gradients for a Relu operation."
-}
-op {
-  graph_op_name: "RemoteCall"
-  endpoint {
-    name: "RemoteCall"
-  }
-  summary: "Runs function `f` on a remote device indicated by `target`."
-}
-op {
-  graph_op_name: "RemoteFusedGraphExecute"
-  endpoint {
-    name: "RemoteFusedGraphExecute"
-  }
-  summary: "Execute a sub graph on a remote processor."
-  description: <<END
-The graph specifications(such as graph itself, input tensors and output names)
-are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-as serialized_remote_fused_graph_execute_info.
-The specifications will be passed to a dedicated registered
-remote fused graph executor.  The executor will send the graph specifications
-to a remote processor and execute that graph.  The execution results
-will be passed to consumer nodes as outputs of this node.
-END
-}
-op {
-  graph_op_name: "RepeatDataset"
-  endpoint {
-    name: "RepeatDataset"
-  }
-  summary: "Creates a dataset that emits the outputs of `input_dataset` `count` times."
-}
-op {
-  graph_op_name: "RequantizationRange"
-  endpoint {
-    name: "RequantizationRange"
-  }
-  summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a"
-  description: <<END
-range that covers the actual values present in that tensor.  This op is
-typically used to produce the requested_output_min and requested_output_max for
-Requantize.
-END
-}
-op {
-  graph_op_name: "Requantize"
-  endpoint {
-    name: "Requantize"
-  }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
-  description: <<END
-output range specified with 'requested_output_min' and 'requested_output_max'.
-
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-END
-}
-op {
-  graph_op_name: "Reshape"
-  endpoint {
-    name: "Reshape"
-  }
-  summary: "Reshapes a tensor."
-  description: <<END
-Given `tensor`, this operation returns a tensor that has the same values
-as `tensor` with shape `shape`.
-
-If one component of `shape` is the special value -1, the size of that dimension
-is computed so that the total size remains constant.  In particular, a `shape`
-of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-
-If `shape` is 1-D or higher, then the operation returns a tensor with shape
-`shape` filled with the values of `tensor`. In this case, the number of elements
-implied by `shape` must be the same as the number of elements in `tensor`.
-
-For example:
-
-```
-# tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-# tensor 't' has shape [9]
-reshape(t, [3, 3]) ==> [[1, 2, 3],
-                        [4, 5, 6],
-                        [7, 8, 9]]
-
-# tensor 't' is [[[1, 1], [2, 2]],
-#                [[3, 3], [4, 4]]]
-# tensor 't' has shape [2, 2, 2]
-reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-                        [3, 3, 4, 4]]
-
-# tensor 't' is [[[1, 1, 1],
-#                 [2, 2, 2]],
-#                [[3, 3, 3],
-#                 [4, 4, 4]],
-#                [[5, 5, 5],
-#                 [6, 6, 6]]]
-# tensor 't' has shape [3, 2, 3]
-# pass '[-1]' to flatten 't'
-reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
-
-# -1 can also be used to infer the shape
-
-# -1 is inferred to be 9:
-reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-# -1 is inferred to be 2:
-reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-# -1 is inferred to be 3:
-reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-                              [2, 2, 2],
-                              [3, 3, 3]],
-                             [[4, 4, 4],
-                              [5, 5, 5],
-                              [6, 6, 6]]]
-
-# tensor 't' is [7]
-# shape `[]` reshapes to a scalar
-reshape(t, []) ==> 7
-```
-END
-}
-op {
-  graph_op_name: "ResizeArea"
-  endpoint {
-    name: "ResizeArea"
-  }
-  summary: "Resize `images` to `size` using area interpolation."
-  description: <<END
-Input images can be of different types but output images are always float.
-
-Each output pixel is computed by first transforming the pixel's footprint into
-the input tensor and then averaging the pixels that intersect the footprint. An
-input pixel's contribution to the average is weighted by the fraction of its
-area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-END
-}
-op {
-  graph_op_name: "ResizeBicubic"
-  endpoint {
-    name: "ResizeBicubic"
-  }
-  summary: "Resize `images` to `size` using bicubic interpolation."
-  description: <<END
-Input images can be of different types but output images are always float.
-END
-}
-op {
-  graph_op_name: "ResizeBicubicGrad"
-  endpoint {
-    name: "ResizeBicubicGrad"
-  }
-  summary: "Computes the gradient of bicubic interpolation."
-}
-op {
-  graph_op_name: "ResizeBilinear"
-  endpoint {
-    name: "ResizeBilinear"
-  }
-  summary: "Resize `images` to `size` using bilinear interpolation."
-  description: <<END
-Input images can be of different types but output images are always float.
-END
-}
-op {
-  graph_op_name: "ResizeBilinearGrad"
-  endpoint {
-    name: "ResizeBilinearGrad"
-  }
-  summary: "Computes the gradient of bilinear interpolation."
-}
-op {
-  graph_op_name: "ResizeNearestNeighbor"
-  endpoint {
-    name: "ResizeNearestNeighbor"
-  }
-  summary: "Resize `images` to `size` using nearest neighbor interpolation."
-}
-op {
-  graph_op_name: "ResizeNearestNeighborGrad"
-  endpoint {
-    name: "ResizeNearestNeighborGrad"
-  }
-  summary: "Computes the gradient of nearest neighbor interpolation."
-}
-op {
-  graph_op_name: "ResourceApplyAdadelta"
-  endpoint {
-    name: "ResourceApplyAdadelta"
-  }
-  summary: "Update \'*var\' according to the adadelta scheme."
-  description: <<END
-accum = rho() * accum + (1 - rho()) * grad.square();
-update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-update_accum = rho() * update_accum + (1 - rho()) * update.square();
-var -= update;
-END
-}
-op {
-  graph_op_name: "ResourceApplyAdagrad"
-  endpoint {
-    name: "ResourceApplyAdagrad"
-  }
-  summary: "Update \'*var\' according to the adagrad scheme."
-  description: <<END
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-END
-}
-op {
-  graph_op_name: "ResourceApplyAdagradDA"
-  endpoint {
-    name: "ResourceApplyAdagradDA"
-  }
-  summary: "Update \'*var\' according to the proximal adagrad scheme."
-}
-op {
-  graph_op_name: "ResourceApplyAdam"
-  endpoint {
-    name: "ResourceApplyAdam"
-  }
-  summary: "Update \'*var\' according to the Adam algorithm."
-  description: <<END
-lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
-END
-}
-op {
-  graph_op_name: "ResourceApplyCenteredRMSProp"
-  endpoint {
-    name: "ResourceApplyCenteredRMSProp"
-  }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: <<END
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-mg <- rho * mg_{t-1} + (1-rho) * grad
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "ResourceApplyFtrl"
-  endpoint {
-    name: "ResourceApplyFtrl"
-  }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-accum_new = accum + grad * grad
-linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "ResourceApplyFtrlV2"
-  endpoint {
-    name: "ResourceApplyFtrlV2"
-  }
-  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "ResourceApplyGradientDescent"
-  endpoint {
-    name: "ResourceApplyGradientDescent"
-  }
-  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
-}
-op {
-  graph_op_name: "ResourceApplyMomentum"
-  endpoint {
-    name: "ResourceApplyMomentum"
-  }
-  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
-  description: <<END
-want to use Nesterov momentum.
-
-accum = accum * momentum + grad
-var -= lr * accum
-END
-}
-op {
-  graph_op_name: "ResourceApplyProximalAdagrad"
-  endpoint {
-    name: "ResourceApplyProximalAdagrad"
-  }
-  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
-  description: <<END
-accum += grad * grad
-prox_v = var - lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-END
-}
-op {
-  graph_op_name: "ResourceApplyProximalGradientDescent"
-  endpoint {
-    name: "ResourceApplyProximalGradientDescent"
-  }
-  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: <<END
-prox_v = var - alpha * delta
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-END
-}
-op {
-  graph_op_name: "ResourceApplyRMSProp"
-  endpoint {
-    name: "ResourceApplyRMSProp"
-  }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: <<END
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyAdadelta"
-  endpoint {
-    name: "ResourceSparseApplyAdadelta"
-  }
-  summary: "var: Should be from a Variable()."
-}
-op {
-  graph_op_name: "ResourceSparseApplyAdagrad"
-  endpoint {
-    name: "ResourceSparseApplyAdagrad"
-  }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme."
-  description: <<END
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyAdagradDA"
-  endpoint {
-    name: "ResourceSparseApplyAdagradDA"
-  }
-  summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme."
-}
-op {
-  graph_op_name: "ResourceSparseApplyCenteredRMSProp"
-  endpoint {
-    name: "ResourceSparseApplyCenteredRMSProp"
-  }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: <<END
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyFtrl"
-  endpoint {
-    name: "ResourceSparseApplyFtrl"
-  }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-That is for rows we have grad for, we update var, accum and linear as follows:
-accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyFtrlV2"
-  endpoint {
-    name: "ResourceSparseApplyFtrlV2"
-  }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-That is for rows we have grad for, we update var, accum and linear as follows:
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyMomentum"
-  endpoint {
-    name: "ResourceSparseApplyMomentum"
-  }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
-  description: <<END
-Set use_nesterov = True if you want to use Nesterov momentum.
-
-That is for rows we have grad for, we update var and accum as follows:
-
-accum = accum * momentum + grad
-var -= lr * accum
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyProximalAdagrad"
-  endpoint {
-    name: "ResourceSparseApplyProximalAdagrad"
-  }
-  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
-  description: <<END
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-prox_v = var
-prox_v -= lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyProximalGradientDescent"
-  endpoint {
-    name: "ResourceSparseApplyProximalGradientDescent"
-  }
-  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: <<END
-That is for rows we have grad for, we update var as follows:
-prox_v = var - alpha * grad
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-END
-}
-op {
-  graph_op_name: "ResourceSparseApplyRMSProp"
-  endpoint {
-    name: "ResourceSparseApplyRMSProp"
-  }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: <<END
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "ResourceStridedSliceAssign"
-  endpoint {
-    name: "ResourceStridedSliceAssign"
-  }
-  summary: "Assign `value` to the sliced l-value reference of `ref`."
-  description: <<END
-The values of `value` are assigned to the positions in the variable
-`ref` that are selected by the slice parameters. The slice parameters
-`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
-
-NOTE this op currently does not support broadcasting and so `value`'s
-shape must be exactly the shape produced by the slice of `ref`.
-END
-}
-op {
-  graph_op_name: "Restore"
-  endpoint {
-    name: "Restore"
-  }
-  summary: "Restores a tensor from checkpoint files."
-  description: <<END
-Reads a tensor stored in one or several files. If there are several files (for
-instance because a tensor was saved as slices), `file_pattern` may contain
-wildcard symbols (`*` and `?`) in the filename portion only, not in the
-directory portion.
-
-If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-in which file the requested tensor is likely to be found. This op will first
-open the file at index `preferred_shard` in the list of matching files and try
-to restore tensors from that file.  Only if some tensors or tensor slices are
-not found in that first file, then the Op opens all the files. Setting
-`preferred_shard` to match the value passed as the `shard` input
-of a matching `Save` Op may speed up Restore.  This attribute only affects
-performance, not correctness.  The default value -1 means files are processed in
-order.
-
-See also `RestoreSlice`.
-END
-}
-op {
-  graph_op_name: "RestoreIterator"
-  endpoint {
-    name: "RestoreIterator"
-  }
-  summary: "Restores the state of the `iterator` from the checkpoint saved at `path` using \"SaveIterator\"."
-}
-op {
-  graph_op_name: "RestoreSlice"
-  endpoint {
-    name: "RestoreSlice"
-  }
-  summary: "Restores a tensor from checkpoint files."
-  description: <<END
-This is like `Restore` except that restored tensor can be listed as filling
-only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-larger tensor and the slice that the restored tensor covers.
-
-The `shape_and_slice` input has the same format as the
-elements of the `shapes_and_slices` input of the `SaveSlices` op.
-END
-}
-op {
-  graph_op_name: "RestoreV2"
-  endpoint {
-    name: "RestoreV2"
-  }
-  summary: "Restores tensors from a V2 checkpoint."
-  description: <<END
-For backward compatibility with the V1 format, this Op currently allows
-restoring from a V1 checkpoint as well:
-  - This Op first attempts to find the V2 index file pointed to by "prefix", and
-    if found proceed to read it as a V2 checkpoint;
-  - Otherwise the V1 read path is invoked.
-Relying on this behavior is not recommended, as the ability to fall back to read
-V1 might be deprecated and eventually removed.
-
-By default, restores the named tensors in full.  If the caller wishes to restore
-specific slices of stored tensors, "shape_and_slices" should be non-empty
-strings and correspondingly well-formed.
-
-Callers must ensure all the named tensors are indeed stored in the checkpoint.
-END
-}
-op {
-  graph_op_name: "Reverse"
-  endpoint {
-    name: "Reverse"
-  }
-  summary: "Reverses specific dimensions of a tensor."
-  description: <<END
-Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-of `tensor`, this operation reverses each dimension i of `tensor` where
-`dims[i]` is `True`.
-
-`tensor` can have up to 8 dimensions. The number of dimensions
-of `tensor` must equal the number of elements in `dims`. In other words:
-
-`rank(tensor) = size(dims)`
-
-For example:
-
-```
-# tensor 't' is [[[[ 0,  1,  2,  3],
-#                  [ 4,  5,  6,  7],
-#                  [ 8,  9, 10, 11]],
-#                 [[12, 13, 14, 15],
-#                  [16, 17, 18, 19],
-#                  [20, 21, 22, 23]]]]
-# tensor 't' shape is [1, 2, 3, 4]
-
-# 'dims' is [False, False, False, True]
-reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-                        [ 7,  6,  5,  4],
-                        [ 11, 10, 9, 8]],
-                       [[15, 14, 13, 12],
-                        [19, 18, 17, 16],
-                        [23, 22, 21, 20]]]]
-
-# 'dims' is [False, True, False, False]
-reverse(t, dims) ==> [[[[12, 13, 14, 15],
-                        [16, 17, 18, 19],
-                        [20, 21, 22, 23]
-                       [[ 0,  1,  2,  3],
-                        [ 4,  5,  6,  7],
-                        [ 8,  9, 10, 11]]]]
-
-# 'dims' is [False, False, True, False]
-reverse(t, dims) ==> [[[[8, 9, 10, 11],
-                        [4, 5, 6, 7],
-                        [0, 1, 2, 3]]
-                       [[20, 21, 22, 23],
-                        [16, 17, 18, 19],
-                        [12, 13, 14, 15]]]]
-```
-END
-}
-op {
-  graph_op_name: "ReverseSequence"
-  endpoint {
-    name: "ReverseSequence"
-  }
-  summary: "Reverses variable length slices."
-  description: <<END
-This op first slices `input` along the dimension `batch_dim`, and for each
-slice `i`, reverses the first `seq_lengths[i]` elements along
-the dimension `seq_dim`.
-
-The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
-
-The output slice `i` along dimension `batch_dim` is then given by input
-slice `i`, with the first `seq_lengths[i]` slices along dimension
-`seq_dim` reversed.
-
-For example:
-
-```
-# Given this:
-batch_dim = 0
-seq_dim = 1
-input.dims = (4, 8, ...)
-seq_lengths = [7, 2, 3, 5]
-
-# then slices of input are reversed on seq_dim, but only up to seq_lengths:
-output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-
-# while entries past seq_lens are copied through:
-output[0, 7:, :, ...] = input[0, 7:, :, ...]
-output[1, 2:, :, ...] = input[1, 2:, :, ...]
-output[2, 3:, :, ...] = input[2, 3:, :, ...]
-output[3, 2:, :, ...] = input[3, 2:, :, ...]
-```
-
-In contrast, if:
-
-```
-# Given this:
-batch_dim = 2
-seq_dim = 0
-input.dims = (8, ?, 4, ...)
-seq_lengths = [7, 2, 3, 5]
-
-# then slices of input are reversed on seq_dim, but only up to seq_lengths:
-output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
-
-# while entries past seq_lens are copied through:
-output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-```
-END
-}
-op {
-  graph_op_name: "ReverseV2"
-  endpoint {
-    name: "ReverseV2"
-  }
-  summary: "Reverses specific dimensions of a tensor."
-  description: <<END
-NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-
-Given a `tensor`, and a `int32` tensor `axis` representing the set of
-dimensions of `tensor` to reverse. This operation reverses each dimension
-`i` for which there exists `j` s.t. `axis[j] == i`.
-
-`tensor` can have up to 8 dimensions. The number of dimensions specified
-in `axis` may be 0 or more entries. If an index is specified more than
-once, a InvalidArgument error is raised.
-
-For example:
-
-```
-# tensor 't' is [[[[ 0,  1,  2,  3],
-#                  [ 4,  5,  6,  7],
-#                  [ 8,  9, 10, 11]],
-#                 [[12, 13, 14, 15],
-#                  [16, 17, 18, 19],
-#                  [20, 21, 22, 23]]]]
-# tensor 't' shape is [1, 2, 3, 4]
-
-# 'dims' is [3] or 'dims' is -1
-reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-                        [ 7,  6,  5,  4],
-                        [ 11, 10, 9, 8]],
-                       [[15, 14, 13, 12],
-                        [19, 18, 17, 16],
-                        [23, 22, 21, 20]]]]
-
-# 'dims' is '[1]' (or 'dims' is '[-3]')
-reverse(t, dims) ==> [[[[12, 13, 14, 15],
-                        [16, 17, 18, 19],
-                        [20, 21, 22, 23]
-                       [[ 0,  1,  2,  3],
-                        [ 4,  5,  6,  7],
-                        [ 8,  9, 10, 11]]]]
-
-# 'dims' is '[2]' (or 'dims' is '[-2]')
-reverse(t, dims) ==> [[[[8, 9, 10, 11],
-                        [4, 5, 6, 7],
-                        [0, 1, 2, 3]]
-                       [[20, 21, 22, 23],
-                        [16, 17, 18, 19],
-                        [12, 13, 14, 15]]]]
-```
-END
-}
-op {
-  graph_op_name: "Rint"
-  endpoint {
-    name: "Rint"
-  }
-  summary: "Returns element-wise integer closest to x."
-  description: <<END
-If the result is midway between two representable values,
-the even representable is chosen.
-For example:
-
-```
-rint(-1.5) ==> -2.0
-rint(0.5000001) ==> 1.0
-rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-```
-END
-}
-op {
-  graph_op_name: "Round"
-  endpoint {
-    name: "Round"
-  }
-  summary: "Rounds the values of a tensor to the nearest integer, element-wise."
-  description: <<END
-Rounds half to even.  Also known as bankers rounding. If you want to round
-according to the current system rounding mode use std::cint.
-END
-}
-op {
-  graph_op_name: "Rsqrt"
-  endpoint {
-    name: "Rsqrt"
-  }
-  summary: "Computes reciprocal of square root of x element-wise."
-  description: <<END
-I.e., \\(y = 1 / \sqrt{x}\\).
-END
-}
-op {
-  graph_op_name: "RsqrtGrad"
-  endpoint {
-    name: "RsqrtGrad"
-  }
-  summary: "Computes the gradient for the rsqrt of `x` wrt its input."
-  description: <<END
-Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-is the corresponding input gradient.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_RFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_RFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bf680e2ca0aef7155809ffe2b28cdcb191f2fff
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RFFT.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "RFFT"
+  in_arg {
+    name: "input"
+    description: <<END
+A float32 tensor.
+END
+  }
+  in_arg {
+    name: "fft_length"
+    description: <<END
+An int32 tensor of shape [1]. The FFT length.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same rank as `input`. The inner-most
+  dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+  frequency components of its 1D Fourier transform.
+
+@compatibility(numpy)
+Equivalent to np.fft.rfft
+@end_compatibility
+END
+  }
+  summary: "Real-valued fast Fourier transform."
+  description: <<END
+Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+over the inner-most dimension of `input`.
+
+Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+`fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+followed by the `fft_length / 2` positive-frequency terms.
+
+Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+corresponding dimension of `input`, the dimension is cropped. If it is larger,
+the dimension is padded with zeros.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_RFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a901ee704c46556a8fd2a24d300e35f239310eea
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RFFT2D.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "RFFT2D"
+  in_arg {
+    name: "input"
+    description: <<END
+A float32 tensor.
+END
+  }
+  in_arg {
+    name: "fft_length"
+    description: <<END
+An int32 tensor of shape [2]. The FFT length for each dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same rank as `input`. The inner-most 2
+  dimensions of `input` are replaced with their 2D Fourier transform. The
+  inner-most dimension contains `fft_length / 2 + 1` unique frequency
+  components.
+
+@compatibility(numpy)
+Equivalent to np.fft.rfft2
+@end_compatibility
+END
+  }
+  summary: "2D real-valued fast Fourier transform."
+  description: <<END
+Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+over the inner-most 2 dimensions of `input`.
+
+Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+of `output`: the zero-frequency term, followed by the `fft_length / 2`
+positive-frequency terms.
+
+Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+corresponding dimension of `input`, the dimension is cropped. If it is larger,
+the dimension is padded with zeros.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RFFT3D.pbtxt b/tensorflow/core/api_def/base_api/api_def_RFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4a3ad667bab252b480e30061d8e365cd4e762e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RFFT3D.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "RFFT3D"
+  in_arg {
+    name: "input"
+    description: <<END
+A float32 tensor.
+END
+  }
+  in_arg {
+    name: "fft_length"
+    description: <<END
+An int32 tensor of shape [3]. The FFT length for each dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A complex64 tensor of the same rank as `input`. The inner-most 3
+  dimensions of `input` are replaced with the their 3D Fourier transform. The
+  inner-most dimension contains `fft_length / 2 + 1` unique frequency
+  components.
+
+@compatibility(numpy)
+Equivalent to np.fft.rfftn with 3 dimensions.
+@end_compatibility
+END
+  }
+  summary: "3D real-valued fast Fourier transform."
+  description: <<END
+Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+over the inner-most 3 dimensions of `input`.
+
+Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+`fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+of `output`: the zero-frequency term, followed by the `fft_length / 2`
+positive-frequency terms.
+
+Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+corresponding dimension of `input`, the dimension is cropped. If it is larger,
+the dimension is padded with zeros.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RGBToHSV.pbtxt b/tensorflow/core/api_def/base_api/api_def_RGBToHSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08629610ed4ba48e2a29e6294b9c7008359ff531
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RGBToHSV.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "RGBToHSV"
+  in_arg {
+    name: "images"
+    description: <<END
+1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+`images` converted to HSV.
+END
+  }
+  summary: "Converts one or more images from RGB to HSV."
+  description: <<END
+Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+value of the pixels. The output is only well defined if the value in `images`
+are in `[0,1]`.
+
+`output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+`output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomCrop.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomCrop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd549dda14f7d7b1935e66f1ddf957bffe98c791
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomCrop.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "RandomCrop"
+  in_arg {
+    name: "image"
+    description: <<END
+3-D of shape `[height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+1-D of length 2 containing: `crop_height`, `crop_width`..
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+3-D of shape `[crop_height, crop_width, channels].`
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Randomly crop `image`."
+  description: <<END
+`size` is a 1-D int64 tensor with 2 elements representing the crop height and
+width.  The values must be non negative.
+
+This Op picks a random location in `image` and crops a `height` by `width`
+rectangle from that location.  The random location is picked so the cropped
+area will fit inside the original image.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0466b40f85eb118c94404e2f0d7670392bc7afdf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "RandomDataset"
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either seed or
+seed2 is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  summary: "Creates a Dataset that returns pseudorandom numbers."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomGamma.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomGamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a10392b6d24e94c35751c1c0881d1f7a6ea7ba8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomGamma.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "RandomGamma"
+  in_arg {
+    name: "shape"
+    description: <<END
+1-D integer tensor. Shape of independent samples to draw from each
+distribution described by the shape parameters given in alpha.
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+A tensor in which each scalar is a "shape" parameter describing the
+associated gamma distribution.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor with shape `shape + shape(alpha)`. Each slice
+`[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+`alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  summary: "Outputs random values from the Gamma distribution(s) described by alpha."
+  description: <<END
+This op uses the algorithm by Marsaglia et al. to acquire samples via
+transformation-rejection from pairs of uniform and normal random variables.
+See http://dl.acm.org/citation.cfm?id=358414
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomPoisson.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomPoisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b75ecd2e1938fa13cd86e70370a4246887a3aef5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomPoisson.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomPoisson"
+  summary: "Use RandomPoissonV2 instead."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomPoissonV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomPoissonV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3aa8c30294963976db191641deb0781e82278164
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomPoissonV2.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "RandomPoissonV2"
+  in_arg {
+    name: "shape"
+    description: <<END
+1-D integer tensor. Shape of independent samples to draw from each
+distribution described by the shape parameters given in rate.
+END
+  }
+  in_arg {
+    name: "rate"
+    description: <<END
+A tensor in which each scalar is a "rate" parameter describing the
+associated poisson distribution.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor with shape `shape + shape(rate)`. Each slice
+`[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+`rate[i0, i1, ...iN]`.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  summary: "Outputs random values from the Poisson distribution(s) described by rate."
+  description: <<END
+This op uses two algorithms, depending on rate. If rate >= 10, then
+the algorithm by Hormann is used to acquire samples via
+transformation-rejection.
+See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+
+Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+random variables.
+See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+Programming, Volume 2. Addison Wesley
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomShuffle.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomShuffle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7490361712f87c759145b0f01c1c8c3a5b273ad6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomShuffle.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "RandomShuffle"
+  in_arg {
+    name: "value"
+    description: <<END
+The tensor to be shuffled.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of same shape and type as `value`, shuffled along its first
+dimension.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  summary: "Randomly shuffles a tensor along its first dimension."
+  description: <<END
+  The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+  to one and only one `output[i]`. For example, a mapping that might occur for a
+  3x2 tensor is:
+
+```
+[[1, 2],       [[5, 6],
+ [3, 4],  ==>   [1, 2],
+ [5, 6]]        [3, 4]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomShuffleQueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomShuffleQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..258ef00b5c9cad94517b8e8a113dcb047aaeebed
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomShuffleQueue.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "RandomShuffleQueue"
+  visibility: SKIP
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types. If the length of
+this attr is 0, the shapes of queue elements are not constrained, and
+only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "min_after_dequeue"
+    description: <<END
+Dequeue will block unless there would be this
+many elements after the dequeue or the queue is closed. This
+ensures a minimum level of mixing of elements.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 is set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that randomizes the order of elements."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomShuffleQueueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomShuffleQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb5a0fb8ede0653ee4db782aa3476fd1ba03a6e8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomShuffleQueueV2.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "RandomShuffleQueueV2"
+  endpoint {
+    name: "RandomShuffleQueue"
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the queue.
+END
+  }
+  attr {
+    name: "component_types"
+    description: <<END
+The type of each component in a value.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shape of each component in a value. The length of this attr must
+be either 0 or the same as the length of component_types. If the length of
+this attr is 0, the shapes of queue elements are not constrained, and
+only one element may be dequeued at a time.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+The upper bound on the number of elements in this queue.
+Negative numbers mean no limit.
+END
+  }
+  attr {
+    name: "min_after_dequeue"
+    description: <<END
+Dequeue will block unless there would be this
+many elements after the dequeue or the queue is closed. This
+ensures a minimum level of mixing of elements.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 is set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this queue will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A queue that randomizes the order of elements."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomStandardNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d534785b14202fa7f0dbb28077b10a3846c5cce9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomStandardNormal.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "RandomStandardNormal"
+  endpoint {
+    name: "RandomNormal"
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random normal values.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomUniform.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..148a5b1c9aa40180ae8cce7e26deb5acbca9fd30
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomUniform.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "RandomUniform"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with uniform random values.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a uniform distribution."
+  description: <<END
+The generated values follow a uniform distribution in the range `[0, 1)`. The
+lower bound 0 is included in the range, while the upper bound 1 is excluded.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomUniformInt.pbtxt b/tensorflow/core/api_def/base_api/api_def_RandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76a8f4b3e43b6874d43de24cd8feda1244ffa0f5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RandomUniformInt.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "RandomUniformInt"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "minval"
+    description: <<END
+0-D.  Inclusive lower bound on the generated integers.
+END
+  }
+  in_arg {
+    name: "maxval"
+    description: <<END
+0-D.  Exclusive upper bound on the generated integers.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with uniform random integers.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  summary: "Outputs random integers from a uniform distribution."
+  description: <<END
+The generated values are uniform integers in the range `[minval, maxval)`.
+The lower bound `minval` is included in the range, while the upper bound
+`maxval` is excluded.
+
+The random integers are slightly biased unless `maxval - minval` is an exact
+power of two.  The bias is small for values of `maxval - minval` significantly
+smaller than the range of the output (either `2^32` or `2^64`).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Range.pbtxt b/tensorflow/core/api_def/base_api/api_def_Range.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf1021ccfbf6e2b7460be89352a225ac72a96362
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Range.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "Range"
+  in_arg {
+    name: "start"
+    description: <<END
+0-D (scalar). First entry in the sequence.
+END
+  }
+  in_arg {
+    name: "limit"
+    description: <<END
+0-D (scalar). Upper limit of sequence, exclusive.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+0-D (scalar). Optional. Default is 1. Number that increments `start`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D.
+END
+  }
+  summary: "Creates a sequence of numbers."
+  description: <<END
+This operation creates a sequence of numbers that begins at `start` and
+extends by increments of `delta` up to but not including `limit`.
+
+For example:
+
+```
+# 'start' is 3
+# 'limit' is 18
+# 'delta' is 3
+tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9e14b8a052e416dd78f1abdc25c9b024a778107
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RangeDataset"
+  in_arg {
+    name: "start"
+    description: <<END
+corresponds to start in python's xrange().
+END
+  }
+  in_arg {
+    name: "stop"
+    description: <<END
+corresponds to stop in python's xrange().
+END
+  }
+  in_arg {
+    name: "step"
+    description: <<END
+corresponds to step in python's xrange().
+END
+  }
+  summary: "Creates a dataset with a range of values. Corresponds to python\'s xrange."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rank.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rank.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec1c61671dee4f673cd808a79ba8a47f44e3f7bd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rank.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "Rank"
+  summary: "Returns the rank of a tensor."
+  description: <<END
+This operation returns an integer representing the rank of `input`.
+
+For example:
+
+```
+# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+# shape of tensor 't' is [2, 2, 3]
+rank(t) ==> 3
+```
+
+**Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+of a tensor is the number of indices required to uniquely select each element
+of the tensor. Rank is also known as "order", "degree", or "ndims."
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReadFile.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReadFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6161453d47647456c3b46aa8ac9ece549d7ad6bc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReadFile.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReadFile"
+  summary: "Reads and outputs the entire contents of the input filename."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReadVariableOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReadVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaa41b462cfaf95f7fdc92bb72733830a1165d9b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReadVariableOp.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "ReadVariableOp"
+  in_arg {
+    name: "resource"
+    description: <<END
+handle to the resource in which to store the variable.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+the dtype of the value.
+END
+  }
+  summary: "Reads the value of a variable."
+  description: <<END
+The tensor returned by this operation is immutable.
+
+The value returned by this operation is guaranteed to be influenced by all the
+writes on which this operation depends directly or indirectly, and to not be
+influenced by any of the writes which depend directly or indirectly on this
+operation.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderNumRecordsProduced.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderNumRecordsProduced.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27c74890f2e696a235ba10bbc995037c17cb6cbc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderNumRecordsProduced.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "ReaderNumRecordsProduced"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Returns the number of records this Reader has produced."
+  description: <<END
+This is the same as the number of ReaderRead executions that have
+succeeded.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderNumRecordsProducedV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderNumRecordsProducedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..caf4f6b90364b88ef4a212a9c9c095361b58d6ea
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderNumRecordsProducedV2.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "ReaderNumRecordsProducedV2"
+  endpoint {
+    name: "ReaderNumRecordsProduced"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Returns the number of records this Reader has produced."
+  description: <<END
+This is the same as the number of ReaderRead executions that have
+succeeded.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba9143534d62522ddf57690d0b6691908be7dd6b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompleted"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Returns the number of work units this Reader has finished processing."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5289c84240d3746b8163b4fda2f4f79666dc0d11
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompletedV2"
+  endpoint {
+    name: "ReaderNumWorkUnitsCompleted"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Returns the number of work units this Reader has finished processing."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderRead.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..624b1c7fade5e9e311b81ff60740539cc7027181
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderRead.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "ReaderRead"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  in_arg {
+    name: "queue_handle"
+    description: <<END
+Handle to a Queue, with string work items.
+END
+  }
+  out_arg {
+    name: "key"
+    description: <<END
+A scalar.
+END
+  }
+  out_arg {
+    name: "value"
+    description: <<END
+A scalar.
+END
+  }
+  summary: "Returns the next record (key, value pair) produced by a Reader."
+  description: <<END
+Will dequeue from the input queue if necessary (e.g. when the
+Reader needs to start reading from a new file since it has finished
+with the previous file).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderReadUpTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderReadUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..53e6e44838c1fc91c66d26545ef58367db02bae3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderReadUpTo.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "ReaderReadUpTo"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a `Reader`.
+END
+  }
+  in_arg {
+    name: "queue_handle"
+    description: <<END
+Handle to a `Queue`, with string work items.
+END
+  }
+  in_arg {
+    name: "num_records"
+    description: <<END
+number of records to read from `Reader`.
+END
+  }
+  out_arg {
+    name: "keys"
+    description: <<END
+A 1-D tensor.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+A 1-D tensor.
+END
+  }
+  summary: "Returns up to `num_records` (key, value) pairs produced by a Reader."
+  description: <<END
+Will dequeue from the input queue if necessary (e.g. when the
+Reader needs to start reading from a new file since it has finished
+with the previous file).
+It may return less than `num_records` even before the last batch.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderReadUpToV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderReadUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c1d2206ffe1730a9a9d75e1409148bf830f522e2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderReadUpToV2.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ReaderReadUpToV2"
+  endpoint {
+    name: "ReaderReadUpTo"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a `Reader`.
+END
+  }
+  in_arg {
+    name: "queue_handle"
+    description: <<END
+Handle to a `Queue`, with string work items.
+END
+  }
+  in_arg {
+    name: "num_records"
+    description: <<END
+number of records to read from `Reader`.
+END
+  }
+  out_arg {
+    name: "keys"
+    description: <<END
+A 1-D tensor.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+A 1-D tensor.
+END
+  }
+  summary: "Returns up to `num_records` (key, value) pairs produced by a Reader."
+  description: <<END
+Will dequeue from the input queue if necessary (e.g. when the
+Reader needs to start reading from a new file since it has finished
+with the previous file).
+It may return less than `num_records` even before the last batch.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderReadV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a6c4efdf5b16fb8dd0a01bfc66dfd825c589924
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderReadV2.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ReaderReadV2"
+  endpoint {
+    name: "ReaderRead"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  in_arg {
+    name: "queue_handle"
+    description: <<END
+Handle to a Queue, with string work items.
+END
+  }
+  out_arg {
+    name: "key"
+    description: <<END
+A scalar.
+END
+  }
+  out_arg {
+    name: "value"
+    description: <<END
+A scalar.
+END
+  }
+  summary: "Returns the next record (key, value pair) produced by a Reader."
+  description: <<END
+Will dequeue from the input queue if necessary (e.g. when the
+Reader needs to start reading from a new file since it has finished
+with the previous file).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderReset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderReset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ddb494293e4fc567e3d742162c78607b9c0a09d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderReset.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "ReaderReset"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Restore a Reader to its initial clean state."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderResetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderResetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ac5b77d271eeee3f6bcdbb637b73e2f869d0f54
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderResetV2.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ReaderResetV2"
+  endpoint {
+    name: "ReaderReset"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Restore a Reader to its initial clean state."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderRestoreState.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderRestoreState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05084ba367205e8e4657bec432e8fbd103faca9f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderRestoreState.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "ReaderRestoreState"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  in_arg {
+    name: "state"
+    description: <<END
+Result of a ReaderSerializeState of a Reader with type
+matching reader_handle.
+END
+  }
+  summary: "Restore a reader to a previously saved state."
+  description: <<END
+Not all Readers support being restored, so this can produce an
+Unimplemented error.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderRestoreStateV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderRestoreStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35e053d0ea21a66893e3a6e55c33604a84da4761
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderRestoreStateV2.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "ReaderRestoreStateV2"
+  endpoint {
+    name: "ReaderRestoreState"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  in_arg {
+    name: "state"
+    description: <<END
+Result of a ReaderSerializeState of a Reader with type
+matching reader_handle.
+END
+  }
+  summary: "Restore a reader to a previously saved state."
+  description: <<END
+Not all Readers support being restored, so this can produce an
+Unimplemented error.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderSerializeState.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderSerializeState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..401c22abd0db6fe1e1e6d07f78915ab7ecd0e64d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderSerializeState.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "ReaderSerializeState"
+  visibility: SKIP
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Produce a string tensor that encodes the state of a Reader."
+  description: <<END
+Not all Readers support being serialized, so this can produce an
+Unimplemented error.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReaderSerializeStateV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReaderSerializeStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..855ba3c2eed02af1aed3aaf36410e5d9c23e4cb5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReaderSerializeStateV2.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "ReaderSerializeStateV2"
+  endpoint {
+    name: "ReaderSerializeState"
+  }
+  in_arg {
+    name: "reader_handle"
+    description: <<END
+Handle to a Reader.
+END
+  }
+  summary: "Produce a string tensor that encodes the state of a Reader."
+  description: <<END
+Not all Readers support being serialized, so this can produce an
+Unimplemented error.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Real.pbtxt b/tensorflow/core/api_def/base_api/api_def_Real.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..225d45fd70ea90247d11c44e81a9cba655de5717
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Real.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "Real"
+  summary: "Returns the real part of a complex number."
+  description: <<END
+Given a tensor `input` of complex numbers, this operation returns a tensor of
+type `float` that is the real part of each element in `input`. All elements in
+`input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+ part returned by this operation and *b* is the imaginary part.
+
+For example:
+
+```
+# tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+tf.real(input) ==> [-2.25, 3.25]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_RealDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da0e55b08f56475b21d4f7a9ab82677011406092
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RealDiv.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "RealDiv"
+  summary: "Returns x / y element-wise for real types."
+  description: <<END
+If `x` and `y` are reals, this will return the floating-point division.
+
+*NOTE*: `Div` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Reciprocal.pbtxt b/tensorflow/core/api_def/base_api/api_def_Reciprocal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c66b84e268c706bba28d34c69d42b599f179e367
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Reciprocal.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Reciprocal"
+  summary: "Computes the reciprocal of x element-wise."
+  description: <<END
+I.e., \\(y = 1 / x\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReciprocalGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReciprocalGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583e5ecee1fe8ff2e187fd909c09b2381f9bee74
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReciprocalGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "ReciprocalGrad"
+  visibility: HIDDEN
+  summary: "Computes the gradient for the inverse of `x` wrt its input."
+  description: <<END
+Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+is the corresponding input gradient.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7efc8cd8334e80be3b1cc8ba5b50c2259931b1b6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RecordInput.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "RecordInput"
+  out_arg {
+    name: "records"
+    description: <<END
+A tensor of shape [batch_size].
+END
+  }
+  attr {
+    name: "file_pattern"
+    description: <<END
+Glob pattern for the data files.
+END
+  }
+  attr {
+    name: "file_random_seed"
+    description: <<END
+Random seeds used to produce randomized records.
+END
+  }
+  attr {
+    name: "file_shuffle_shift_ratio"
+    description: <<END
+Shifts the list of files after the list is randomly
+shuffled.
+END
+  }
+  attr {
+    name: "file_buffer_size"
+    description: <<END
+The randomization shuffling buffer.
+END
+  }
+  attr {
+    name: "file_parallelism"
+    description: <<END
+How many sstables are opened and concurrently iterated over.
+END
+  }
+  attr {
+    name: "batch_size"
+    description: <<END
+The batch size.
+END
+  }
+  summary: "Emits randomized records."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReduceJoin.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReduceJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca7e0d3beefa219a468135ec53b2564f0d0287c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReduceJoin.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ReduceJoin"
+  in_arg {
+    name: "inputs"
+    description: <<END
+The input to be joined.  All reduced indices must have non-zero size.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    description: <<END
+The dimensions to reduce over.  Dimensions are reduced in the
+order specified.  Omitting `reduction_indices` is equivalent to passing
+`[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has shape equal to that of the input with reduced dimensions removed or
+set to `1` depending on `keep_dims`.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If `True`, retain reduced dimensions with length `1`.
+END
+  }
+  attr {
+    name: "separator"
+    description: <<END
+The separator to use when joining.
+END
+  }
+  summary: "Joins a string Tensor across the given dimensions."
+  description: <<END
+Computes the string join across dimensions in the given string Tensor of shape
+`[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+strings with the given separator (default: empty string).  Negative indices are
+counted backwards from the end, with `-1` being equivalent to `n - 1`.
+
+For example:
+
+```python
+# tensor `a` is [["a", "b"], ["c", "d"]]
+tf.reduce_join(a, 0) ==> ["ac", "bd"]
+tf.reduce_join(a, 1) ==> ["ab", "cd"]
+tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+tf.reduce_join(a, []) ==> ["abcd"]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefEnter.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefEnter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..092f285b270f7af74ae3a7e0b6797fc5d92d1895
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefEnter.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "RefEnter"
+  visibility: HIDDEN
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be made available to the child frame.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `data`.
+END
+  }
+  attr {
+    name: "frame_name"
+    description: <<END
+The name of the child frame.
+END
+  }
+  attr {
+    name: "is_constant"
+    description: <<END
+If true, the output is constant within the child frame.
+END
+  }
+  attr {
+    name: "parallel_iterations"
+    description: <<END
+The number of iterations allowed to run in parallel.
+END
+  }
+  summary: "Creates or finds a child frame, and makes `data` available to the child frame."
+  description: <<END
+The unique `frame_name` is used by the `Executor` to identify frames. If
+`is_constant` is true, `output` is a constant in the child frame; otherwise
+it may be changed in the child frame. At most `parallel_iterations` iterations
+are run in parallel in the child frame.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefExit.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefExit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d3083d6d98f16560e3dcd152e1d0c815c0b57f1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefExit.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "RefExit"
+  visibility: HIDDEN
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be made available to the parent frame.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `data`.
+END
+  }
+  summary: "Exits the current frame to its parent frame."
+  description: <<END
+Exit makes its input `data` available to the parent frame.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefIdentity.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b29606837edc3cae002bb762f7e48c2e306a733e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefIdentity.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "RefIdentity"
+  visibility: HIDDEN
+  summary: "Return the same ref tensor as the input ref tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefMerge.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefMerge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc7ad303c5812daa8119516b151cea4a37e21d77
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefMerge.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "RefMerge"
+  visibility: HIDDEN
+  in_arg {
+    name: "inputs"
+    description: <<END
+The input tensors, exactly one of which will become available.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Will be set to the available input tensor.
+END
+  }
+  out_arg {
+    name: "value_index"
+    description: <<END
+The index of the chosen input tensor in `inputs`.
+END
+  }
+  summary: "Forwards the value of an available tensor from `inputs` to `output`."
+  description: <<END
+`Merge` waits for at least one of the tensors in `inputs` to become available.
+It is usually combined with `Switch` to implement branching.
+
+`Merge` forwards the first tensor for become available to `output`, and sets
+`value_index` to its index in `inputs`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefNextIteration.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefNextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd126e99b85dd047ec84ec994ae00b4931b10bd3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefNextIteration.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "RefNextIteration"
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be made available to the next iteration.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as `data`.
+END
+  }
+  summary: "Makes its input available to the next iteration."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefSelect.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefSelect.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24a0c4684e76c9a61388ad566d97ef4bdf8c21be
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefSelect.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RefSelect"
+  in_arg {
+    name: "index"
+    description: <<END
+A scalar that determines the input that gets selected.
+END
+  }
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of ref tensors, one of which will be forwarded to `output`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The forwarded tensor.
+END
+  }
+  summary: "Forwards the `index`th element of `inputs` to `output`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RefSwitch.pbtxt b/tensorflow/core/api_def/base_api/api_def_RefSwitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11db13a17edb3b859241511a9fb81c88bbbbcac8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RefSwitch.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RefSwitch"
+  in_arg {
+    name: "data"
+    description: <<END
+The ref tensor to be forwarded to the appropriate output.
+END
+  }
+  in_arg {
+    name: "pred"
+    description: <<END
+A scalar that specifies which output port will receive data.
+END
+  }
+  out_arg {
+    name: "output_false"
+    description: <<END
+If `pred` is false, data will be forwarded to this output.
+END
+  }
+  out_arg {
+    name: "output_true"
+    description: <<END
+If `pred` is true, data will be forwarded to this output.
+END
+  }
+  summary: "Forwards the ref tensor `data` to the output port determined by `pred`."
+  description: <<END
+If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+the data goes to `output_false`.
+
+See also `Switch` and `Merge`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Relu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Relu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44f79b0e29a04cc53dded5052788e370644821fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Relu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Relu"
+  summary: "Computes rectified linear: `max(features, 0)`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Relu6.pbtxt b/tensorflow/core/api_def/base_api/api_def_Relu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13a737394c0171cfd81bbf23fb85ac9b1de4b6f7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Relu6.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Relu6"
+  summary: "Computes rectified linear 6: `min(max(features, 0), 6)`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Relu6Grad.pbtxt b/tensorflow/core/api_def/base_api/api_def_Relu6Grad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc81506f66a09385f843790a7c2665e25082ca54
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Relu6Grad.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "Relu6Grad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding Relu6 operation.
+END
+  }
+  in_arg {
+    name: "features"
+    description: <<END
+The features passed as input to the corresponding Relu6 operation, or
+its output; using either one produces the same result.
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+The gradients:
+`gradients * (features > 0) * (features < 6)`.
+END
+  }
+  summary: "Computes rectified linear 6 gradients for a Relu6 operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReluGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94affbc3b71a52e7910477f4a33119ac7ceed6c8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReluGrad.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "ReluGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding Relu operation.
+END
+  }
+  in_arg {
+    name: "features"
+    description: <<END
+The features passed as input to the corresponding Relu operation, OR
+the outputs of that operation (both work equivalently).
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+`gradients * (features > 0)`.
+END
+  }
+  summary: "Computes rectified linear gradients for a Relu operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RemoteCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_RemoteCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f75f32ebcc8820bf642f3066d1eb0d7fa61da3f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RemoteCall.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "RemoteCall"
+  in_arg {
+    name: "target"
+    description: <<END
+A fully specified device name where we want to run the function.
+END
+  }
+  in_arg {
+    name: "args"
+    description: <<END
+A list of arguments for the function.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A list of return values.
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+The type list for the arguments.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+The type list for the return values.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+The function to run remotely.
+END
+  }
+  summary: "Runs function `f` on a remote device indicated by `target`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RemoteFusedGraphExecute.pbtxt b/tensorflow/core/api_def/base_api/api_def_RemoteFusedGraphExecute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..190df5ecbbd01aa6a39f382e45d631e3224fa879
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RemoteFusedGraphExecute.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "RemoteFusedGraphExecute"
+  in_arg {
+    name: "inputs"
+    description: <<END
+Arbitrary number of tensors with arbitrary data types
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+Arbitrary number of tensors with arbitrary data types
+END
+  }
+  attr {
+    name: "serialized_remote_fused_graph_execute_info"
+    description: <<END
+Serialized protocol buffer
+of RemoteFusedGraphExecuteInfo which contains graph specifications.
+END
+  }
+  summary: "Execute a sub graph on a remote processor."
+  description: <<END
+The graph specifications(such as graph itself, input tensors and output names)
+are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+as serialized_remote_fused_graph_execute_info.
+The specifications will be passed to a dedicated registered
+remote fused graph executor.  The executor will send the graph specifications
+to a remote processor and execute that graph.  The execution results
+will be passed to consumer nodes as outputs of this node.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc6169cd32f1671000a9cb96209059d062c00db8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "RepeatDataset"
+  in_arg {
+    name: "count"
+    description: <<END
+A scalar representing the number of times that `input_dataset` should
+be repeated. A value of `-1` indicates that it should be repeated infinitely.
+END
+  }
+  summary: "Creates a dataset that emits the outputs of `input_dataset` `count` times."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07bbd4ac6031765a070c5e5b4ee0726512dbb6ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "RequantizationRange"
+  in_arg {
+    name: "input_min"
+    description: <<END
+The float value that the minimum quantized input value represents.
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The float value that the maximum quantized input value represents.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The computed min output.
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+the computed max output.
+END
+  }
+  attr {
+    name: "Tinput"
+    description: <<END
+The type of the input.
+END
+  }
+  summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a"
+  description: <<END
+range that covers the actual values present in that tensor.  This op is
+typically used to produce the requested_output_min and requested_output_max for
+Requantize.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b03f63b261e00c6b1dfdc0b1f11c69d71b536eb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "Requantize"
+  in_arg {
+    name: "input_min"
+    description: <<END
+The float value that the minimum quantized input value represents.
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The float value that the maximum quantized input value represents.
+END
+  }
+  in_arg {
+    name: "requested_output_min"
+    description: <<END
+The float value that the minimum quantized output value represents.
+END
+  }
+  in_arg {
+    name: "requested_output_max"
+    description: <<END
+The float value that the maximum quantized output value represents.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The requested_output_min value is copied into this output.
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The requested_output_max value is copied into this output.
+END
+  }
+  attr {
+    name: "Tinput"
+    description: <<END
+The type of the input.
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The type of the output. Should be a lower bit depth than Tinput.
+END
+  }
+  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
+  description: <<END
+output range specified with 'requested_output_min' and 'requested_output_max'.
+
+[input_min, input_max] are scalar floats that specify the range for the float
+interpretation of the 'input' data. For example, if input_min is -1.0f and
+input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Reshape.pbtxt b/tensorflow/core/api_def/base_api/api_def_Reshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa32b2537450271cc53f8bd90bd92a7c6570af10
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Reshape.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "Reshape"
+  in_arg {
+    name: "shape"
+    description: <<END
+Defines the shape of the output tensor.
+END
+  }
+  summary: "Reshapes a tensor."
+  description: <<END
+Given `tensor`, this operation returns a tensor that has the same values
+as `tensor` with shape `shape`.
+
+If one component of `shape` is the special value -1, the size of that dimension
+is computed so that the total size remains constant.  In particular, a `shape`
+of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
+
+If `shape` is 1-D or higher, then the operation returns a tensor with shape
+`shape` filled with the values of `tensor`. In this case, the number of elements
+implied by `shape` must be the same as the number of elements in `tensor`.
+
+For example:
+
+```
+# tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+# tensor 't' has shape [9]
+reshape(t, [3, 3]) ==> [[1, 2, 3],
+                        [4, 5, 6],
+                        [7, 8, 9]]
+
+# tensor 't' is [[[1, 1], [2, 2]],
+#                [[3, 3], [4, 4]]]
+# tensor 't' has shape [2, 2, 2]
+reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+                        [3, 3, 4, 4]]
+
+# tensor 't' is [[[1, 1, 1],
+#                 [2, 2, 2]],
+#                [[3, 3, 3],
+#                 [4, 4, 4]],
+#                [[5, 5, 5],
+#                 [6, 6, 6]]]
+# tensor 't' has shape [3, 2, 3]
+# pass '[-1]' to flatten 't'
+reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+
+# -1 can also be used to infer the shape
+
+# -1 is inferred to be 9:
+reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+# -1 is inferred to be 2:
+reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+                         [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+# -1 is inferred to be 3:
+reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+                              [2, 2, 2],
+                              [3, 3, 3]],
+                             [[4, 4, 4],
+                              [5, 5, 5],
+                              [6, 6, 6]]]
+
+# tensor 't' is [7]
+# shape `[]` reshapes to a scalar
+reshape(t, []) ==> 7
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6dc321a54496ea3f91b7efe0d28c8596cd18fc1c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeArea.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "ResizeArea"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+new size for the images.
+END
+  }
+  out_arg {
+    name: "resized_images"
+    description: <<END
+4-D with shape
+`[batch, new_height, new_width, channels]`.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale input by (new_height - 1) / (height - 1), which
+exactly aligns the 4 corners of images and resized images. If false, rescale
+by new_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Resize `images` to `size` using area interpolation."
+  description: <<END
+Input images can be of different types but output images are always float.
+
+Each output pixel is computed by first transforming the pixel's footprint into
+the input tensor and then averaging the pixels that intersect the footprint. An
+input pixel's contribution to the average is weighted by the fraction of its
+area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06e645e3ee937f81e3b546d24250f1c1d6ad2680
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubic.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "ResizeBicubic"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+new size for the images.
+END
+  }
+  out_arg {
+    name: "resized_images"
+    description: <<END
+4-D with shape
+`[batch, new_height, new_width, channels]`.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale input by (new_height - 1) / (height - 1), which
+exactly aligns the 4 corners of images and resized images. If false, rescale
+by new_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Resize `images` to `size` using bicubic interpolation."
+  description: <<END
+Input images can be of different types but output images are always float.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf5201d82e4b7b6fb463e17138e92f3033c6992e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBicubicGrad.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "ResizeBicubicGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "grads"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "original_image"
+    description: <<END
+4-D with shape `[batch, orig_height, orig_width, channels]`,
+The image tensor that was resized.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape `[batch, orig_height, orig_width, channels]`.
+Gradients with respect to the input image. Input image must have been
+float or double.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale grads by (orig_height - 1) / (height - 1), which
+exactly aligns the 4 corners of grads and original_image. If false, rescale by
+orig_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Computes the gradient of bicubic interpolation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0768e437fa00a9adeec00498e968986125602822
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinear.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "ResizeBilinear"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+new size for the images.
+END
+  }
+  out_arg {
+    name: "resized_images"
+    description: <<END
+4-D with shape
+`[batch, new_height, new_width, channels]`.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale input by (new_height - 1) / (height - 1), which
+exactly aligns the 4 corners of images and resized images. If false, rescale
+by new_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Resize `images` to `size` using bilinear interpolation."
+  description: <<END
+Input images can be of different types but output images are always float.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fba64203c236399e79a051206e936ec3ebb27b14
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeBilinearGrad.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "ResizeBilinearGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "grads"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "original_image"
+    description: <<END
+4-D with shape `[batch, orig_height, orig_width, channels]`,
+The image tensor that was resized.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape `[batch, orig_height, orig_width, channels]`.
+Gradients with respect to the input image. Input image must have been
+float or double.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale grads by (orig_height - 1) / (height - 1), which
+exactly aligns the 4 corners of grads and original_image. If false, rescale by
+orig_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Computes the gradient of bilinear interpolation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a74db4c9dc340b90817567751da110ef8989850f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "ResizeNearestNeighbor"
+  in_arg {
+    name: "images"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+= A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+new size for the images.
+END
+  }
+  out_arg {
+    name: "resized_images"
+    description: <<END
+4-D with shape
+`[batch, new_height, new_width, channels]`.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale input by (new_height - 1) / (height - 1), which
+exactly aligns the 4 corners of images and resized images. If false, rescale
+by new_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Resize `images` to `size` using nearest neighbor interpolation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ef1547eb4fab02392bc2b98a21ef01340b621f3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResizeNearestNeighborGrad.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "ResizeNearestNeighborGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "grads"
+    description: <<END
+4-D with shape `[batch, height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+= A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+original input size.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+with respect to the input image.
+END
+  }
+  attr {
+    name: "align_corners"
+    description: <<END
+If true, rescale grads by (orig_height - 1) / (height - 1), which
+exactly aligns the 4 corners of grads and original_image. If false, rescale by
+orig_height / height. Treat similarly the width dimension.
+END
+  }
+  summary: "Computes the gradient of nearest neighbor interpolation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdadelta.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f2708a8348846b87daad5bc9039e0a70755c83b4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdadelta.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ResourceApplyAdadelta"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum_update"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Constant factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var, accum and update_accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' according to the adadelta scheme."
+  description: <<END
+accum = rho() * accum + (1 - rho()) * grad.square();
+update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+update_accum = rho() * update_accum + (1 - rho()) * update.square();
+var -= update;
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5982d4d371af48ea757f4733bece2e853538d18a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdagrad.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "ResourceApplyAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the adagrad scheme."
+  description: <<END
+accum += grad * grad
+var -= lr * grad * (1 / sqrt(accum))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdagradDA.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..254e0c609ad061fc2273036edc413af2f9dcc4e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdagradDA.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ResourceApplyAdagradDA"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_squared_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "global_step"
+    description: <<END
+Training step number. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' according to the proximal adagrad scheme."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bea1fd67627cc69354f81da7cf36d20babb9f38d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdam.pbtxt
@@ -0,0 +1,84 @@
+op {
+  graph_op_name: "ResourceApplyAdam"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, uses the nesterov update.
+END
+  }
+  summary: "Update \'*var\' according to the Adam algorithm."
+  description: <<END
+lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAddSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94ba3a8d81abdc40e781c8c76c43123b38567c6b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAddSign.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ResourceApplyAddSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cc033cc897bbc2296b0bf77c5f4e18171a4646b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,80 @@
+op {
+  graph_op_name: "ResourceApplyCenteredRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, mg, ms, and mom tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the centered RMSProp algorithm."
+  description: <<END
+The centered RMSProp algorithm uses an estimate of the centered second moment
+(i.e., the variance) for normalization, as opposed to regular RMSProp, which
+uses the (uncentered) second moment. This often helps with training, but is
+slightly more expensive in terms of computation and memory.
+
+Note that in dense implementation of this algorithm, mg, ms, and mom will
+update even if the grad is zero, but in this sparse implementation, mg, ms,
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+mean_grad = decay * mean_grad + (1-decay) * gradient
+
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+
+mg <- rho * mg_{t-1} + (1-rho) * grad
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6a29b164e2c7cc5cc6230fe6cc4248563de9978
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyFtrl.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "ResourceApplyFtrl"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+accum_new = accum + grad * grad
+linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a71c835b7896374801fba29983481c8faa26baa6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyFtrlV2.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ResourceApplyFtrlV2"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 shrinkage regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyGradientDescent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01f235f224818104522bce46e4530c639d1ab49a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyGradientDescent.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "ResourceApplyGradientDescent"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+The change.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' by subtracting \'alpha\' * \'delta\' from it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1a84a4c34bb7df8e5a6d8d16d29322f26b13657
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyMomentum.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "ResourceApplyMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
+END
+  }
+  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
+  description: <<END
+want to use Nesterov momentum.
+
+accum = accum * momentum + grad
+var -= lr * accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyPowerSign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909861e668a7b6911523861624c37657df5a154f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyPowerSign.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "ResourceApplyPowerSign"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "logbase"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "sign_decay"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and m tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: <<END
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+variable <- variable - lr_t * update
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1eaa86ea14af40a8b0219b62822bb2249d871401
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyProximalAdagrad.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "ResourceApplyProximalAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' and \'*accum\' according to FOBOS with Adagrad learning rate."
+  description: <<END
+accum += grad * grad
+prox_v = var - lr * grad * (1 / sqrt(accum))
+var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c22e931a2b68ebc03d131d4a6ac84e26c8f2d463
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "ResourceApplyProximalGradientDescent"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "delta"
+    description: <<END
+The change.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update \'*var\' as FOBOS algorithm with fixed learning rate."
+  description: <<END
+prox_v = var - alpha * delta
+var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a24f23f9cfd1d709de606539dc175676aa5fa19
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyRMSProp.pbtxt
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "ResourceApplyRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, ms, and mom tensors is protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the RMSProp algorithm."
+  description: <<END
+Note that in dense implementation of this algorithm, ms and mom will
+update even if the grad is zero, but in this sparse implementation, ms
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceCountUpTo.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceCountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc70d79a1e34f30a8e8583b5d78ce1ec62da3f04
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceCountUpTo.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "ResourceCountUpTo"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a scalar `Variable` node.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A copy of the input before increment. If nothing else modifies the
+input, the values produced will all be distinct.
+END
+  }
+  attr {
+    name: "limit"
+    description: <<END
+If incrementing ref would bring it above limit, instead generates an
+'OutOfRange' error.
+END
+  }
+  summary: "Increments variable pointed to by \'resource\' until it reaches \'limit\'."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae5d38a5014753afc592653bc088af8a19514fc3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceGather.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "ResourceGather"
+  summary: "Gather slices from the variable pointed to by `resource` according to `indices`."
+  description: <<END
+`indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+
+```python
+    # Scalar indices
+    output[:, ..., :] = params[indices, :, ... :]
+
+    # Vector indices
+    output[i, :, ..., :] = params[indices[i], :, ... :]
+
+    # Higher rank indices
+    output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e0de08267288e32e34cef323761cf4566fce128
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterAdd.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "ResourceScatterAdd"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Adds sparse updates to the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] += updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] += updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions add.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b07ee9fda94851b7bc64a02dbf748b74eb63cdee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ResourceScatterNdUpdate"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse `updates` to individual values or slices within a given"
+  description: <<END
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to update 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_update(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 11, 3, 10, 9, 6, 7, 12]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..947535c6c8f5087392d254460fef2fc8941dba72
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterUpdate.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "ResourceScatterUpdate"
+  in_arg {
+    name: "resource"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  summary: "Assigns sparse updates to the variable referenced by `resource`."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] = updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bea6d614c58315dd1ceb6178c48fce36289e8bd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdadelta.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdadelta"
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum_update"
+    description: <<END
+: Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Constant factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "var: Should be from a Variable()."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6463947601a3cfd39008b27077e319071f7e912
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdagrad.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme."
+  description: <<END
+That is for rows we have grad for, we update var and accum as follows:
+accum += grad * grad
+var -= lr * grad * (1 / sqrt(accum))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96833d8f097bb92cf59cca222d7198b7d8df65cf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagradDA"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_squared_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "global_step"
+    description: <<END
+Training step number. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..433d040fe741b9c8b2b6dbe7b71cb44bb4c6c0d9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,84 @@
+op {
+  graph_op_name: "ResourceSparseApplyCenteredRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var, ms and mom.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, mg, ms, and mom tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the centered RMSProp algorithm."
+  description: <<END
+The centered RMSProp algorithm uses an estimate of the centered second moment
+(i.e., the variance) for normalization, as opposed to regular RMSProp, which
+uses the (uncentered) second moment. This often helps with training, but is
+slightly more expensive in terms of computation and memory.
+
+Note that in dense implementation of this algorithm, mg, ms, and mom will
+update even if the grad is zero, but in this sparse implementation, mg, ms,
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+mean_grad = decay * mean_grad + (1-decay) * gradient
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f75272a63b135ef5324a86fab4b89f2a9ae48a09
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrl.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrl"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+That is for rows we have grad for, we update var, accum and linear as follows:
+accum_new = accum + grad * grad
+linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45ea013ce89782972144a2560b81acbefebe53f5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
@@ -0,0 +1,76 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrlV2"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 shrinkage regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+That is for rows we have grad for, we update var, accum and linear as follows:
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..671465377a175d79e1608e4a68bef75bc129e853
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyMomentum.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "ResourceSparseApplyMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
+  description: <<END
+Set use_nesterov = True if you want to use Nesterov momentum.
+
+That is for rows we have grad for, we update var and accum as follows:
+
+accum = accum * momentum + grad
+var -= lr * accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3a588adaaaa3043a48987e2f06dadc8f78230e3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
+  description: <<END
+That is for rows we have grad for, we update var and accum as follows:
+accum += grad * grad
+prox_v = var
+prox_v -= lr * grad * (1 / sqrt(accum))
+var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a6333c0b56212d9ae36e82562283919f1d02e03
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalGradientDescent"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
+  description: <<END
+That is for rows we have grad for, we update var as follows:
+prox_v = var - alpha * grad
+var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6310711ea6fac8ffaf799a192ff859cf9457966
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyRMSProp.pbtxt
@@ -0,0 +1,72 @@
+op {
+  graph_op_name: "ResourceSparseApplyRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var, ms and mom.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, ms, and mom tensors is protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the RMSProp algorithm."
+  description: <<END
+Note that in dense implementation of this algorithm, ms and mom will
+update even if the grad is zero, but in this sparse implementation, ms
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceStridedSliceAssign.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceStridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec8acbb5bf4b7fc8242ce21606e73a234b0819a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceStridedSliceAssign.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "ResourceStridedSliceAssign"
+  summary: "Assign `value` to the sliced l-value reference of `ref`."
+  description: <<END
+The values of `value` are assigned to the positions in the variable
+`ref` that are selected by the slice parameters. The slice parameters
+`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+
+NOTE this op currently does not support broadcasting and so `value`'s
+shape must be exactly the shape produced by the slice of `ref`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Restore.pbtxt b/tensorflow/core/api_def/base_api/api_def_Restore.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..816b79cf5357c74d1e92c3bb5ebcd28b69b02e7e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Restore.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "Restore"
+  in_arg {
+    name: "file_pattern"
+    description: <<END
+Must have a single element. The pattern of the files from
+which we read the tensor.
+END
+  }
+  in_arg {
+    name: "tensor_name"
+    description: <<END
+Must have a single element. The name of the tensor to be
+restored.
+END
+  }
+  out_arg {
+    name: "tensor"
+    description: <<END
+The restored tensor.
+END
+  }
+  attr {
+    name: "dt"
+    description: <<END
+The type of the tensor to be restored.
+END
+  }
+  attr {
+    name: "preferred_shard"
+    description: <<END
+Index of file to open first if multiple files match
+`file_pattern`.
+END
+  }
+  summary: "Restores a tensor from checkpoint files."
+  description: <<END
+Reads a tensor stored in one or several files. If there are several files (for
+instance because a tensor was saved as slices), `file_pattern` may contain
+wildcard symbols (`*` and `?`) in the filename portion only, not in the
+directory portion.
+
+If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+in which file the requested tensor is likely to be found. This op will first
+open the file at index `preferred_shard` in the list of matching files and try
+to restore tensors from that file.  Only if some tensors or tensor slices are
+not found in that first file, then the Op opens all the files. Setting
+`preferred_shard` to match the value passed as the `shard` input
+of a matching `Save` Op may speed up Restore.  This attribute only affects
+performance, not correctness.  The default value -1 means files are processed in
+order.
+
+See also `RestoreSlice`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RestoreSlice.pbtxt b/tensorflow/core/api_def/base_api/api_def_RestoreSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e57b1ea42d3c790a87c3206113ca57c03e189343
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RestoreSlice.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "RestoreSlice"
+  in_arg {
+    name: "file_pattern"
+    description: <<END
+Must have a single element. The pattern of the files from
+which we read the tensor.
+END
+  }
+  in_arg {
+    name: "tensor_name"
+    description: <<END
+Must have a single element. The name of the tensor to be
+restored.
+END
+  }
+  in_arg {
+    name: "shape_and_slice"
+    description: <<END
+Scalar. The shapes and slice specifications to use when
+restoring a tensors.
+END
+  }
+  out_arg {
+    name: "tensor"
+    description: <<END
+The restored tensor.
+END
+  }
+  attr {
+    name: "dt"
+    description: <<END
+The type of the tensor to be restored.
+END
+  }
+  attr {
+    name: "preferred_shard"
+    description: <<END
+Index of file to open first if multiple files match
+`file_pattern`. See the documentation for `Restore`.
+END
+  }
+  summary: "Restores a tensor from checkpoint files."
+  description: <<END
+This is like `Restore` except that restored tensor can be listed as filling
+only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+larger tensor and the slice that the restored tensor covers.
+
+The `shape_and_slice` input has the same format as the
+elements of the `shapes_and_slices` input of the `SaveSlices` op.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RestoreV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_RestoreV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a64ef36d05727e6f4c67e1f7fb4cf68545823df
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RestoreV2.pbtxt
@@ -0,0 +1,52 @@
+op {
+  graph_op_name: "RestoreV2"
+  in_arg {
+    name: "prefix"
+    description: <<END
+Must have a single element.  The prefix of a V2 checkpoint.
+END
+  }
+  in_arg {
+    name: "tensor_names"
+    description: <<END
+shape {N}.  The names of the tensors to be restored.
+END
+  }
+  in_arg {
+    name: "shape_and_slices"
+    description: <<END
+shape {N}.  The slice specs of the tensors to be restored.
+Empty strings indicate that they are non-partitioned tensors.
+END
+  }
+  out_arg {
+    name: "tensors"
+    description: <<END
+shape {N}.  The restored tensors, whose shapes are read from the
+checkpoint directly.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+shape {N}.  The list of expected dtype for the tensors.  Must match
+those stored in the checkpoint.
+END
+  }
+  summary: "Restores tensors from a V2 checkpoint."
+  description: <<END
+For backward compatibility with the V1 format, this Op currently allows
+restoring from a V1 checkpoint as well:
+  - This Op first attempts to find the V2 index file pointed to by "prefix", and
+    if found proceed to read it as a V2 checkpoint;
+  - Otherwise the V1 read path is invoked.
+Relying on this behavior is not recommended, as the ability to fall back to read
+V1 might be deprecated and eventually removed.
+
+By default, restores the named tensors in full.  If the caller wishes to restore
+specific slices of stored tensors, "shape_and_slices" should be non-empty
+strings and correspondingly well-formed.
+
+Callers must ensure all the named tensors are indeed stored in the checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Reverse.pbtxt b/tensorflow/core/api_def/base_api/api_def_Reverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83d7ee77989ab51488c651b6b2d0958bbacb276e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Reverse.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "Reverse"
+  visibility: SKIP
+  in_arg {
+    name: "tensor"
+    description: <<END
+Up to 8-D.
+END
+  }
+  in_arg {
+    name: "dims"
+    description: <<END
+1-D. The dimensions to reverse.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same shape as `tensor`.
+END
+  }
+  summary: "Reverses specific dimensions of a tensor."
+  description: <<END
+Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+of `tensor`, this operation reverses each dimension i of `tensor` where
+`dims[i]` is `True`.
+
+`tensor` can have up to 8 dimensions. The number of dimensions
+of `tensor` must equal the number of elements in `dims`. In other words:
+
+`rank(tensor) = size(dims)`
+
+For example:
+
+```
+# tensor 't' is [[[[ 0,  1,  2,  3],
+#                  [ 4,  5,  6,  7],
+#                  [ 8,  9, 10, 11]],
+#                 [[12, 13, 14, 15],
+#                  [16, 17, 18, 19],
+#                  [20, 21, 22, 23]]]]
+# tensor 't' shape is [1, 2, 3, 4]
+
+# 'dims' is [False, False, False, True]
+reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+                        [ 7,  6,  5,  4],
+                        [ 11, 10, 9, 8]],
+                       [[15, 14, 13, 12],
+                        [19, 18, 17, 16],
+                        [23, 22, 21, 20]]]]
+
+# 'dims' is [False, True, False, False]
+reverse(t, dims) ==> [[[[12, 13, 14, 15],
+                        [16, 17, 18, 19],
+                        [20, 21, 22, 23]
+                       [[ 0,  1,  2,  3],
+                        [ 4,  5,  6,  7],
+                        [ 8,  9, 10, 11]]]]
+
+# 'dims' is [False, False, True, False]
+reverse(t, dims) ==> [[[[8, 9, 10, 11],
+                        [4, 5, 6, 7],
+                        [0, 1, 2, 3]]
+                       [[20, 21, 22, 23],
+                        [16, 17, 18, 19],
+                        [12, 13, 14, 15]]]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReverseSequence.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReverseSequence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ee4ead539f3922ce715f62b364757df1943baae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReverseSequence.pbtxt
@@ -0,0 +1,91 @@
+op {
+  graph_op_name: "ReverseSequence"
+  in_arg {
+    name: "input"
+    description: <<END
+The input to reverse.
+END
+  }
+  in_arg {
+    name: "seq_lengths"
+    description: <<END
+1-D with length `input.dims(batch_dim)` and
+`max(seq_lengths) <= input.dims(seq_dim)`
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The partially reversed input. It has the same shape as `input`.
+END
+  }
+  attr {
+    name: "seq_dim"
+    description: <<END
+The dimension which is partially reversed.
+END
+  }
+  attr {
+    name: "batch_dim"
+    description: <<END
+The dimension along which reversal is performed.
+END
+  }
+  summary: "Reverses variable length slices."
+  description: <<END
+This op first slices `input` along the dimension `batch_dim`, and for each
+slice `i`, reverses the first `seq_lengths[i]` elements along
+the dimension `seq_dim`.
+
+The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+
+The output slice `i` along dimension `batch_dim` is then given by input
+slice `i`, with the first `seq_lengths[i]` slices along dimension
+`seq_dim` reversed.
+
+For example:
+
+```
+# Given this:
+batch_dim = 0
+seq_dim = 1
+input.dims = (4, 8, ...)
+seq_lengths = [7, 2, 3, 5]
+
+# then slices of input are reversed on seq_dim, but only up to seq_lengths:
+output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
+
+# while entries past seq_lens are copied through:
+output[0, 7:, :, ...] = input[0, 7:, :, ...]
+output[1, 2:, :, ...] = input[1, 2:, :, ...]
+output[2, 3:, :, ...] = input[2, 3:, :, ...]
+output[3, 2:, :, ...] = input[3, 2:, :, ...]
+```
+
+In contrast, if:
+
+```
+# Given this:
+batch_dim = 2
+seq_dim = 0
+input.dims = (8, ?, 4, ...)
+seq_lengths = [7, 2, 3, 5]
+
+# then slices of input are reversed on seq_dim, but only up to seq_lengths:
+output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+
+# while entries past seq_lens are copied through:
+output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ReverseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ReverseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c9e4c29be89b13541e161d2a1a38c3d33b79edf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ReverseV2.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "ReverseV2"
+  endpoint {
+    name: "Reverse"
+  }
+  in_arg {
+    name: "tensor"
+    description: <<END
+Up to 8-D.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+1-D. The indices of the dimensions to reverse. Must be in the range
+`[-rank(tensor), rank(tensor))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same shape as `tensor`.
+END
+  }
+  summary: "Reverses specific dimensions of a tensor."
+  description: <<END
+NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+`tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
+
+Given a `tensor`, and a `int32` tensor `axis` representing the set of
+dimensions of `tensor` to reverse. This operation reverses each dimension
+`i` for which there exists `j` s.t. `axis[j] == i`.
+
+`tensor` can have up to 8 dimensions. The number of dimensions specified
+in `axis` may be 0 or more entries. If an index is specified more than
+once, a InvalidArgument error is raised.
+
+For example:
+
+```
+# tensor 't' is [[[[ 0,  1,  2,  3],
+#                  [ 4,  5,  6,  7],
+#                  [ 8,  9, 10, 11]],
+#                 [[12, 13, 14, 15],
+#                  [16, 17, 18, 19],
+#                  [20, 21, 22, 23]]]]
+# tensor 't' shape is [1, 2, 3, 4]
+
+# 'dims' is [3] or 'dims' is [-1]
+reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+                        [ 7,  6,  5,  4],
+                        [ 11, 10, 9, 8]],
+                       [[15, 14, 13, 12],
+                        [19, 18, 17, 16],
+                        [23, 22, 21, 20]]]]
+
+# 'dims' is '[1]' (or 'dims' is '[-3]')
+reverse(t, dims) ==> [[[[12, 13, 14, 15],
+                        [16, 17, 18, 19],
+                        [20, 21, 22, 23]
+                       [[ 0,  1,  2,  3],
+                        [ 4,  5,  6,  7],
+                        [ 8,  9, 10, 11]]]]
+
+# 'dims' is '[2]' (or 'dims' is '[-2]')
+reverse(t, dims) ==> [[[[8, 9, 10, 11],
+                        [4, 5, 6, 7],
+                        [0, 1, 2, 3]]
+                       [[20, 21, 22, 23],
+                        [16, 17, 18, 19],
+                        [12, 13, 14, 15]]]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RightShift.pbtxt b/tensorflow/core/api_def/base_api/api_def_RightShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7c56a00f1f81b5b9de94a759bc4e32f0d1e05c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RightShift.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "RightShift"
+  summary: "Elementwise computes the bitwise right-shift of `x` and `y`."
+  description: <<END
+Performs a logical shift for unsigned integer types, and an arithmetic shift
+for signed integer types.
+
+If `y` is negative, or greater than or equal to than the width of `x` in bits
+the result is implementation defined.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rint.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73699c9b6cf58ea02e64fe6cc45c42d8e8e76d73
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rint.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "Rint"
+  summary: "Returns element-wise integer closest to x."
+  description: <<END
+If the result is midway between two representable values,
+the even representable is chosen.
+For example:
+
+```
+rint(-1.5) ==> -2.0
+rint(0.5000001) ==> 1.0
+rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Round.pbtxt b/tensorflow/core/api_def/base_api/api_def_Round.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a7105eae74fd2e6297c1c3d0b6b097751bfdc1c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Round.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Round"
+  summary: "Rounds the values of a tensor to the nearest integer, element-wise."
+  description: <<END
+Rounds half to even.  Also known as bankers rounding. If you want to round
+according to the current system rounding mode use std::cint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rsqrt.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rsqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7f768c505b925e2cc07db5de82c0973e719c4ec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rsqrt.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Rsqrt"
+  summary: "Computes reciprocal of square root of x element-wise."
+  description: <<END
+I.e., \\(y = 1 / \sqrt{x}\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RsqrtGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_RsqrtGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..501936c5c8881147f7bfc705857af71806a4d1c9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RsqrtGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "RsqrtGrad"
+  visibility: HIDDEN
+  summary: "Computes the gradient for the rsqrt of `x` wrt its input."
+  description: <<END
+Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+is the corresponding input gradient.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_S.pbtxt b/tensorflow/core/api_def/base_api/api_def_S.pbtxt
deleted file mode 100644
index 9c53f9ac6207ed4a3214bc38a8a417364e74e776..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_S.pbtxt
+++ /dev/null
@@ -1,2678 +0,0 @@
-op {
-  graph_op_name: "SampleDistortedBoundingBox"
-  endpoint {
-    name: "SampleDistortedBoundingBox"
-  }
-  summary: "Generate a single randomly distorted bounding box for an image."
-  description: <<END
-Bounding box annotations are often supplied in addition to ground-truth labels
-in image recognition or object localization tasks. A common technique for
-training such a system is to randomly distort an image while preserving
-its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-localization of an object, i.e. bounding box, given an `image_size`,
-`bounding_boxes` and a series of constraints.
-
-The output of this Op is a single bounding box that may be used to crop the
-original image. The output is returned as 3 tensors: `begin`, `size` and
-`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-what the bounding box looks like.
-
-Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-height of the underlying image.
-
-For example,
-
-```python
-    # Generate a single distorted bounding box.
-    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-        tf.shape(image),
-        bounding_boxes=bounding_boxes)
-
-    # Draw the bounding box in an image summary.
-    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-                                                  bbox_for_draw)
-    tf.image_summary('images_with_box', image_with_box)
-
-    # Employ the bounding box to distort the image.
-    distorted_image = tf.slice(image, begin, size)
-```
-
-Note that if no bounding box information is available, setting
-`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-false and no bounding boxes are supplied, an error is raised.
-END
-}
-op {
-  graph_op_name: "SampleDistortedBoundingBoxV2"
-  endpoint {
-    name: "SampleDistortedBoundingBoxV2"
-  }
-  summary: "Generate a single randomly distorted bounding box for an image."
-  description: <<END
-Bounding box annotations are often supplied in addition to ground-truth labels
-in image recognition or object localization tasks. A common technique for
-training such a system is to randomly distort an image while preserving
-its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-localization of an object, i.e. bounding box, given an `image_size`,
-`bounding_boxes` and a series of constraints.
-
-The output of this Op is a single bounding box that may be used to crop the
-original image. The output is returned as 3 tensors: `begin`, `size` and
-`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-what the bounding box looks like.
-
-Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-height of the underlying image.
-
-For example,
-
-```python
-    # Generate a single distorted bounding box.
-    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-        tf.shape(image),
-        bounding_boxes=bounding_boxes)
-
-    # Draw the bounding box in an image summary.
-    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-                                                  bbox_for_draw)
-    tf.image_summary('images_with_box', image_with_box)
-
-    # Employ the bounding box to distort the image.
-    distorted_image = tf.slice(image, begin, size)
-```
-
-Note that if no bounding box information is available, setting
-`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-false and no bounding boxes are supplied, an error is raised.
-END
-}
-op {
-  graph_op_name: "Save"
-  endpoint {
-    name: "Save"
-  }
-  summary: "Saves the input tensors to disk."
-  description: <<END
-The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-is written to `filename` with name `tensor_names[i]`.
-
-See also `SaveSlices`.
-END
-}
-op {
-  graph_op_name: "SaveIterator"
-  endpoint {
-    name: "SaveIterator"
-  }
-  summary: "Saves the state of the `iterator` at `path`."
-  description: <<END
-This state can be restored using "RestoreIterator".
-END
-}
-op {
-  graph_op_name: "SaveSlices"
-  endpoint {
-    name: "SaveSlices"
-  }
-  summary: "Saves input tensors slices to disk."
-  description: <<END
-This is like `Save` except that tensors can be listed in the saved file as being
-a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-have as many elements as `tensor_names`.
-
-Elements of the `shapes_and_slices` input must either be:
-
-*  The empty string, in which case the corresponding tensor is
-   saved normally.
-*  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-   `dimI` are the dimensions of the larger tensor and `slice-spec`
-   specifies what part is covered by the tensor to save.
-
-`slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-where each `sliceI` is either:
-
-*  The string `-` meaning that the slice covers all indices of this dimension
-*  `start,length` where `start` and `length` are integers.  In that
-   case the slice covers `length` indices starting at `start`.
-
-See also `Save`.
-END
-}
-op {
-  graph_op_name: "SaveV2"
-  endpoint {
-    name: "SaveV2"
-  }
-  summary: "Saves tensors in V2 checkpoint format."
-  description: <<END
-By default, saves the named tensors in full.  If the caller wishes to save
-specific slices of full tensors, "shape_and_slices" should be non-empty strings
-and correspondingly well-formed.
-END
-}
-op {
-  graph_op_name: "ScalarSummary"
-  endpoint {
-    name: "ScalarSummary"
-  }
-  summary: "Outputs a `Summary` protocol buffer with scalar values."
-  description: <<END
-The input `tags` and `values` must have the same shape.  The generated summary
-has a summary value for each tag-value pair in `tags` and `values`.
-END
-}
-op {
-  graph_op_name: "ScatterAdd"
-  endpoint {
-    name: "ScatterAdd"
-  }
-  summary: "Adds sparse updates to a variable reference."
-  description: <<END
-This operation computes
-
-    # Scalar indices
-    ref[indices, ...] += updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] += updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions add.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "ScatterDiv"
-  endpoint {
-    name: "ScatterDiv"
-  }
-  summary: "Divides a variable reference by sparse updates."
-  description: <<END
-This operation computes
-
-```python
-    # Scalar indices
-    ref[indices, ...] /= updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] /= updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions divide.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-END
-}
-op {
-  graph_op_name: "ScatterMul"
-  endpoint {
-    name: "ScatterMul"
-  }
-  summary: "Multiplies sparse updates into a variable reference."
-  description: <<END
-This operation computes
-
-```python
-    # Scalar indices
-    ref[indices, ...] *= updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] *= updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their contributions multiply.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-END
-}
-op {
-  graph_op_name: "ScatterNd"
-  endpoint {
-    name: "ScatterNd"
-  }
-  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
-  description: <<END
-Creates a new tensor by applying sparse `updates` to individual
-values or slices within a zero tensor of the given `shape` according to
-indices.  This operator is the inverse of the @{tf.gather_nd} operator which
-extracts values or slices from a given tensor.
-
-**WARNING**: The order in which updates are applied is nondeterministic, so the
-output will be nondeterministic if `indices` contains duplicates.
-
-`indices` is an integer tensor containing indices into a new tensor of shape
-`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
-
-    indices.shape[-1] <= shape.rank
-
-The last dimension of `indices` corresponds to indices into elements
-(if `indices.shape[-1] = shape.rank`) or slices
-(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
-`shape`.  `updates` is a tensor with shape
-
-    indices.shape[:-1] + shape[indices.shape[-1]:]
-
-The simplest form of scatter is to insert individual elements in a tensor by
-index. For example, say we want to insert 4 scattered elements in a rank-1
-tensor with 8 elements.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
-</div>
-
-In Python, this scatter operation would look like this:
-
-```python
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    shape = tf.constant([8])
-    scatter = tf.scatter_nd(indices, updates, shape)
-    with tf.Session() as sess:
-      print(sess.run(scatter))
-```
-
-The resulting tensor would look like this:
-
-    [0, 11, 0, 10, 9, 0, 0, 12]
-
-We can also, insert entire slices of a higher rank tensor all at once. For
-example, if we wanted to insert two slices in the first dimension of a
-rank-3 tensor with two matrices of new values.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-</div>
-
-In Python, this scatter operation would look like this:
-
-```python
-    indices = tf.constant([[0], [2]])
-    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
-                            [7, 7, 7, 7], [8, 8, 8, 8]],
-                           [[5, 5, 5, 5], [6, 6, 6, 6],
-                            [7, 7, 7, 7], [8, 8, 8, 8]]])
-    shape = tf.constant([4, 4, 4])
-    scatter = tf.scatter_nd(indices, updates, shape)
-    with tf.Session() as sess:
-      print(sess.run(scatter))
-```
-
-The resulting tensor would look like this:
-
-    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
-     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
-END
-}
-op {
-  graph_op_name: "ScatterNdAdd"
-  endpoint {
-    name: "ScatterNdAdd"
-  }
-  summary: "Applies sparse addition between `updates` and individual values or slices"
-  description: <<END
-within a given variable according to `indices`.
-
-`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `ref`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `ref`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-```
-
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
-
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    add = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(add)
-
-The resulting update to ref would look like this:
-
-    [1, 13, 3, 14, 14, 6, 7, 20]
-
-See @{tf.scatter_nd} for more details about how to make updates to
-slices.
-END
-}
-op {
-  graph_op_name: "ScatterNdNonAliasingAdd"
-  endpoint {
-    name: "ScatterNdNonAliasingAdd"
-  }
-  summary: "Applies sparse addition to `input` using individual values or slices"
-  description: <<END
-from `updates` according to indices `indices`.  The updates are non-aliasing:
-`input` is only modified in-place if no other operations will use it.
-Otherwise, a copy of `input` is made.  This operation has a gradient with
-respect to both `input` and `updates`.
-
-`input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `input`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-(if `K < P`) along the `K`th dimension of `input`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
-```
-
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
-
-    input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-    with tf.Session() as sess:
-      print(sess.run(output))
-
-The resulting value `output` would look like this:
-
-    [1, 13, 3, 14, 14, 6, 7, 20]
-
-See @{tf.scatter_nd} for more details about how to make updates to slices.
-END
-}
-op {
-  graph_op_name: "ScatterNdSub"
-  endpoint {
-    name: "ScatterNdSub"
-  }
-  summary: "Applies sparse subtraction between `updates` and individual values or slices"
-  description: <<END
-within a given variable according to `indices`.
-
-`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `ref`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `ref`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-```
-
-For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-with 8 elements. In Python, that subtraction would look like this:
-
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    sub = tf.scatter_nd_sub(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(sub)
-
-The resulting update to ref would look like this:
-
-    [1, -9, 3, -6, -4, 6, 7, -4]
-
-See @{tf.scatter_nd} for more details about how to make updates to
-slices.
-END
-}
-op {
-  graph_op_name: "ScatterNdUpdate"
-  endpoint {
-    name: "ScatterNdUpdate"
-  }
-  summary: "Applies sparse `updates` to individual values or slices within a given"
-  description: <<END
-variable according to `indices`.
-
-`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-
-`indices` must be integer tensor, containing indices into `ref`.
-It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-
-The innermost dimension of `indices` (with length `K`) corresponds to
-indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-dimension of `ref`.
-
-`updates` is `Tensor` of rank `Q-1+P-K` with shape:
-
-```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-```
-
-For example, say we want to update 4 scattered elements to a rank-1 tensor to
-8 elements. In Python, that update would look like this:
-
-```python
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1] ,[7]])
-    updates = tf.constant([9, 10, 11, 12])
-    update = tf.scatter_nd_update(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(update)
-```
-
-The resulting update to ref would look like this:
-
-    [1, 11, 3, 10, 9, 6, 7, 12]
-
-See @{tf.scatter_nd} for more details about how to make updates to
-slices.
-END
-}
-op {
-  graph_op_name: "ScatterSub"
-  endpoint {
-    name: "ScatterSub"
-  }
-  summary: "Subtracts sparse updates to a variable reference."
-  description: <<END
-```python
-    # Scalar indices
-    ref[indices, ...] -= updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] -= updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-Duplicate entries are handled correctly: if multiple `indices` reference
-the same location, their (negated) contributions add.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "ScatterUpdate"
-  endpoint {
-    name: "ScatterUpdate"
-  }
-  summary: "Applies sparse updates to a variable reference."
-  description: <<END
-This operation computes
-
-```python
-    # Scalar indices
-    ref[indices, ...] = updates[...]
-
-    # Vector indices (for each i)
-    ref[indices[i], ...] = updates[i, ...]
-
-    # High rank indices (for each i, ..., j)
-    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
-```
-
-This operation outputs `ref` after the update is done.
-This makes it easier to chain operations that need to use the reset value.
-
-If values in `ref` is to be updated more than once, because there are
-duplicate entries in `indices`, the order at which the updates happen
-for each value is undefined.
-
-Requires `updates.shape = indices.shape + ref.shape[1:]`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "SdcaFprint"
-  endpoint {
-    name: "SdcaFprint"
-  }
-  summary: "Computes fingerprints of the input strings."
-}
-op {
-  graph_op_name: "SdcaOptimizer"
-  endpoint {
-    name: "SdcaOptimizer"
-  }
-  summary: "Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for"
-  description: <<END
-linear models with L1 + L2 regularization. As global optimization objective is
-strongly-convex, the optimizer optimizes the dual objective at each step. The
-optimizer applies each update one example at a time. Examples are sampled
-uniformly, and the optimizer is learning rate free and enjoys linear convergence
-rate.
-
-[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-Shai Shalev-Shwartz, Tong Zhang. 2012
-
-$$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-
-[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-Peter Richtarik, Martin Takac. 2015
-
-[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
-END
-}
-op {
-  graph_op_name: "SdcaShrinkL1"
-  endpoint {
-    name: "SdcaShrinkL1"
-  }
-  summary: "Applies L1 regularization shrink step on the parameters."
-}
-op {
-  graph_op_name: "SegmentMax"
-  endpoint {
-    name: "SegmentMax"
-  }
-  summary: "Computes the maximum along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-that `segment_ids[j] == i`.
-
-If the max is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "SegmentMean"
-  endpoint {
-    name: "SegmentMean"
-  }
-  summary: "Computes the mean along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-over `j` such that `segment_ids[j] == i` and `N` is the total number of
-values summed.
-
-If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "SegmentMin"
-  endpoint {
-    name: "SegmentMin"
-  }
-  summary: "Computes the minimum along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-that `segment_ids[j] == i`.
-
-If the min is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "SegmentProd"
-  endpoint {
-    name: "SegmentProd"
-  }
-  summary: "Computes the product along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \prod_j data_j\\) where the product is over `j` such
-that `segment_ids[j] == i`.
-
-If the product is empty for a given segment ID `i`, `output[i] = 1`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "SegmentSum"
-  endpoint {
-    name: "SegmentSum"
-  }
-  summary: "Computes the sum along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-\\(output_i = \sum_j data_j\\) where sum is over `j` such
-that `segment_ids[j] == i`.
-
-If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "Select"
-  endpoint {
-    name: "Select"
-  }
-  summary: "Selects elements from `t` or `e`, depending on `condition`."
-  description: <<END
-The `t`, and `e` tensors must all have the same shape, and the
-output will also have that shape.
-
-The `condition` tensor must be a scalar if `t` and `e` are scalars.
-If `t` and `e` are vectors or higher rank, then `condition` must be either a
-scalar, a vector with size matching the first dimension of `t`, or must have
-the same shape as `t`.
-
-The `condition` tensor acts as a mask that chooses, based on the value at each
-element, whether the corresponding element / row in the output should be
-taken from `t` (if true) or `e` (if false).
-
-If `condition` is a vector and `t` and `e` are higher rank matrices, then
-it chooses which row (outer dimension) to copy from `t` and `e`.
-If `condition` has the same shape as `t` and `e`, then it chooses which
-element to copy from `t` and `e`.
-
-For example:
-
-```python
-# 'condition' tensor is [[True,  False]
-#                        [False, True]]
-# 't' is [[1, 2],
-#         [3, 4]]
-# 'e' is [[5, 6],
-#         [7, 8]]
-select(condition, t, e)  # => [[1, 6], [7, 4]]
-
-
-# 'condition' tensor is [True, False]
-# 't' is [[1, 2],
-#         [3, 4]]
-# 'e' is [[5, 6],
-#         [7, 8]]
-select(condition, t, e) ==> [[1, 2],
-                             [7, 8]]
-
-```
-END
-}
-op {
-  graph_op_name: "SelfAdjointEig"
-  endpoint {
-    name: "SelfAdjointEig"
-  }
-  summary: "Computes the Eigen Decomposition of a batch of square self-adjoint matrices."
-  description: <<END
-The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-form square matrices, with the same constraints as the single matrix
-SelfAdjointEig.
-
-The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
-END
-}
-op {
-  graph_op_name: "SelfAdjointEigV2"
-  endpoint {
-    name: "SelfAdjointEigV2"
-  }
-  summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
-  description: <<END
-Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
-
-```python
-# a is a tensor.
-# e is a tensor of eigenvalues.
-# v is a tensor of eigenvectors.
-e, v = self_adjoint_eig(a)
-e = self_adjoint_eig(a, compute_v=False)
-```
-END
-}
-op {
-  graph_op_name: "Selu"
-  endpoint {
-    name: "Selu"
-  }
-  summary: "Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`"
-  description: <<END
-if < 0, `scale * features` otherwise.
-
-See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-END
-}
-op {
-  graph_op_name: "SeluGrad"
-  endpoint {
-    name: "SeluGrad"
-  }
-  summary: "Computes gradients for the scaled exponential linear (Selu) operation."
-}
-op {
-  graph_op_name: "SerializeManySparse"
-  endpoint {
-    name: "SerializeManySparse"
-  }
-  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`."
-  description: <<END
-The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-is treated as the minibatch dimension.  Elements of the `SparseTensor`
-must be sorted in increasing order of this first dimension.  The serialized
-`SparseTensor` objects going into each row of `serialized_sparse` will have
-rank `R-1`.
-
-The minibatch size `N` is extracted from `sparse_shape[0]`.
-END
-}
-op {
-  graph_op_name: "SerializeSparse"
-  endpoint {
-    name: "SerializeSparse"
-  }
-  summary: "Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object."
-}
-op {
-  graph_op_name: "SerializeTensor"
-  endpoint {
-    name: "SerializeTensor"
-  }
-  summary: "Transforms a Tensor into a serialized TensorProto proto."
-}
-op {
-  graph_op_name: "SetSize"
-  endpoint {
-    name: "SetSize"
-  }
-  summary: "Number of unique elements along last dimension of input `set`."
-  description: <<END
-Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-and `set_shape`. The last dimension contains values in a set, duplicates are
-allowed but ignored.
-
-If `validate_indices` is `True`, this op validates the order and range of `set`
-indices.
-END
-}
-op {
-  graph_op_name: "Shape"
-  endpoint {
-    name: "Shape"
-  }
-  summary: "Returns the shape of a tensor."
-  description: <<END
-This operation returns a 1-D integer tensor representing the shape of `input`.
-
-For example:
-
-```
-# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-shape(t) ==> [2, 2, 3]
-```
-END
-}
-op {
-  graph_op_name: "ShapeN"
-  endpoint {
-    name: "ShapeN"
-  }
-  summary: "Returns shape of tensors."
-  description: <<END
-This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-END
-}
-op {
-  graph_op_name: "ShardedFilename"
-  endpoint {
-    name: "ShardedFilename"
-  }
-  summary: "Generate a sharded filename. The filename is printf formatted as"
-  description: <<END
-   %s-%05d-of-%05d, basename, shard, num_shards.
-END
-}
-op {
-  graph_op_name: "ShardedFilespec"
-  endpoint {
-    name: "ShardedFilespec"
-  }
-  summary: "Generate a glob pattern matching all sharded file names."
-}
-op {
-  graph_op_name: "ShuffleDataset"
-  endpoint {
-    name: "ShuffleDataset"
-  }
-  summary: "Creates a dataset that shuffles elements from `input_dataset` pseudorandomly."
-}
-op {
-  graph_op_name: "Sigmoid"
-  endpoint {
-    name: "Sigmoid"
-  }
-  summary: "Computes sigmoid of `x` element-wise."
-  description: <<END
-Specifically, `y = 1 / (1 + exp(-x))`.
-END
-}
-op {
-  graph_op_name: "SigmoidGrad"
-  endpoint {
-    name: "SigmoidGrad"
-  }
-  summary: "Computes the gradient of the sigmoid of `x` wrt its input."
-  description: <<END
-Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-`dy` is the corresponding input gradient.
-END
-}
-op {
-  graph_op_name: "Sign"
-  endpoint {
-    name: "Sign"
-  }
-  summary: "Returns an element-wise indication of the sign of a number."
-  description: <<END
-`y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-
-For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-END
-}
-op {
-  graph_op_name: "Sin"
-  endpoint {
-    name: "Sin"
-  }
-  summary: "Computes sin of x element-wise."
-}
-op {
-  graph_op_name: "Sinh"
-  endpoint {
-    name: "Sinh"
-  }
-  summary: "Computes hyperbolic sine of x element-wise."
-}
-op {
-  graph_op_name: "Size"
-  endpoint {
-    name: "Size"
-  }
-  summary: "Returns the size of a tensor."
-  description: <<END
-This operation returns an integer representing the number of elements in
-`input`.
-
-For example:
-
-```
-# 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-size(t) ==> 12
-```
-END
-}
-op {
-  graph_op_name: "SkipDataset"
-  endpoint {
-    name: "SkipDataset"
-  }
-  summary: "Creates a dataset that skips `count` elements from the `input_dataset`."
-}
-op {
-  graph_op_name: "Skipgram"
-  endpoint {
-    name: "Skipgram"
-  }
-  summary: "Parses a text file and creates a batch of examples."
-}
-op {
-  graph_op_name: "Slice"
-  endpoint {
-    name: "Slice"
-  }
-  summary: "Return a slice from \'input\'."
-  description: <<END
-The output tensor is a tensor with dimensions described by 'size'
-whose values are extracted from 'input' starting at the offsets in
-'begin'.
-
-*Requirements*:
-  0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
-END
-}
-op {
-  graph_op_name: "SloppyInterleaveDataset"
-  endpoint {
-    name: "SloppyInterleaveDataset"
-  }
-  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
-  description: <<END
-The resulting dataset is similar to the `InterleaveDataset`, with the exception
-that if retrieving the next value from a dataset would cause the requester to
-block, it will skip that input dataset. This dataset is especially useful
-when loading data from a variable-latency datastores (e.g. HDFS, GCS), as it
-allows the training step to proceed so long as some data is available.
-
-!! WARNING !! This dataset is not deterministic!
-END
-}
-op {
-  graph_op_name: "Softmax"
-  endpoint {
-    name: "Softmax"
-  }
-  summary: "Computes softmax activations."
-  description: <<END
-For each batch `i` and class `j` we have
-
-    softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
-END
-}
-op {
-  graph_op_name: "SoftmaxCrossEntropyWithLogits"
-  endpoint {
-    name: "SoftmaxCrossEntropyWithLogits"
-  }
-  summary: "Computes softmax cross entropy cost and gradients to backpropagate."
-  description: <<END
-Inputs are the logits, not probabilities.
-END
-}
-op {
-  graph_op_name: "Softplus"
-  endpoint {
-    name: "Softplus"
-  }
-  summary: "Computes softplus: `log(exp(features) + 1)`."
-}
-op {
-  graph_op_name: "SoftplusGrad"
-  endpoint {
-    name: "SoftplusGrad"
-  }
-  summary: "Computes softplus gradients for a softplus operation."
-}
-op {
-  graph_op_name: "Softsign"
-  endpoint {
-    name: "Softsign"
-  }
-  summary: "Computes softsign: `features / (abs(features) + 1)`."
-}
-op {
-  graph_op_name: "SoftsignGrad"
-  endpoint {
-    name: "SoftsignGrad"
-  }
-  summary: "Computes softsign gradients for a softsign operation."
-}
-op {
-  graph_op_name: "SpaceToBatch"
-  endpoint {
-    name: "SpaceToBatch"
-  }
-  summary: "SpaceToBatch for 4-D tensors of type T."
-  description: <<END
-This is a legacy version of the more general SpaceToBatchND.
-
-Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
-More specifically, this op outputs a copy of the input tensor where values from
-the `height` and `width` dimensions are moved to the `batch` dimension. After
-the zero-padding, both `height` and `width` of the input must be divisible by the
-block size.
-END
-}
-op {
-  graph_op_name: "SpaceToBatchND"
-  endpoint {
-    name: "SpaceToBatchND"
-  }
-  summary: "SpaceToBatch for N-D tensors of type T."
-  description: <<END
-This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
-grid of blocks of shape `block_shape`, and interleaves these blocks with the
-"batch" dimension (0) such that in the output, the spatial dimensions
-`[1, ..., M]` correspond to the position within the grid, and the batch
-dimension combines both the position within a spatial block and the original
-batch position.  Prior to division into blocks, the spatial dimensions of the
-input are optionally zero padded according to `paddings`.  See below for a
-precise description.
-END
-}
-op {
-  graph_op_name: "SpaceToDepth"
-  endpoint {
-    name: "SpaceToDepth"
-  }
-  summary: "SpaceToDepth for tensors of type T."
-  description: <<END
-Rearranges blocks of spatial data, into depth. More specifically,
-this op outputs a copy of the input tensor where values from the `height`
-and `width` dimensions are moved to the `depth` dimension.
-The attr `block_size` indicates the input block size.
-
-  * Non-overlapping blocks of size `block_size x block size` are rearranged
-    into depth at each location.
-  * The depth of the output tensor is `block_size * block_size * input_depth`.
-  * The Y, X coordinates within each block of the input become the high order
-    component of the output channel index.
-  * The input tensor's height and width must be divisible by block_size.
-
-The `data_format` attr specifies the layout of the input and output tensors
-with the following options:
-  "NHWC": `[ batch, height, width, channels ]`
-  "NCHW": `[ batch, channels, height, width ]`
-  "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-
-It is useful to consider the operation as transforming a 6-D Tensor.
-e.g. for data_format = NHWC,
-     Each element in the input tensor can be specified via 6 coordinates,
-     ordered by decreasing memory layout significance as:
-     n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
-                        within the output image, bX, bY means coordinates
-                        within the input block, iC means input channels).
-     The output would be a transpose to the following layout:
-     n,oY,oX,bY,bX,iC
-
-This operation is useful for resizing the activations between convolutions
-(but keeping all data), e.g. instead of pooling. It is also useful for training
-purely convolutional models.
-
-For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
-block_size = 2:
-
-```
-x = [[[[1], [2]],
-      [[3], [4]]]]
-```
-
-This operation will output a tensor of shape `[1, 1, 1, 4]`:
-
-```
-[[[[1, 2, 3, 4]]]]
-```
-
-Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
-the corresponding output will have a single element (i.e. width and height are
-both 1) and will have a depth of 4 channels (1 * block_size * block_size).
-The output element shape is `[1, 1, 4]`.
-
-For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
-
-```
-x = [[[[1, 2, 3], [4, 5, 6]],
-      [[7, 8, 9], [10, 11, 12]]]]
-```
-
-This operation, for block_size of 2, will return the following tensor of shape
-`[1, 1, 1, 12]`
-
-```
-[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-```
-
-Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
-
-```
-x = [[[[1],   [2],  [5],  [6]],
-      [[3],   [4],  [7],  [8]],
-      [[9],  [10], [13],  [14]],
-      [[11], [12], [15],  [16]]]]
-```
-
-the operator will return the following tensor of shape `[1 2 2 4]`:
-
-```
-x = [[[[1, 2, 3, 4],
-       [5, 6, 7, 8]],
-      [[9, 10, 11, 12],
-       [13, 14, 15, 16]]]]
-```
-END
-}
-op {
-  graph_op_name: "SparseAccumulatorApplyGradient"
-  endpoint {
-    name: "SparseAccumulatorApplyGradient"
-  }
-  summary: "Applies a sparse gradient to a given accumulator."
-  description: <<END
-Does not add if local_step is smaller than the accumulator's
-global_step.
-END
-}
-op {
-  graph_op_name: "SparseAccumulatorTakeGradient"
-  endpoint {
-    name: "SparseAccumulatorTakeGradient"
-  }
-  summary: "Extracts the average sparse gradient in a SparseConditionalAccumulator."
-  description: <<END
-The op will blocks until sufficient (i.e., more than num_required)
-gradients have been accumulated. If the accumulator has already
-aggregated more than num_required gradients, it will return its
-average of the accumulated gradients.  Also automatically increments
-the recorded global_step in the accumulator by 1, and resets the
-aggregate to 0.
-END
-}
-op {
-  graph_op_name: "SparseAdd"
-  endpoint {
-    name: "SparseAdd"
-  }
-  summary: "Adds two `SparseTensor` objects to produce another `SparseTensor`."
-  description: <<END
-The input `SparseTensor` objects' indices are assumed ordered in standard
-lexicographic order.  If this is not the case, before this step run
-`SparseReorder` to restore index ordering.
-
-By default, if two values sum to zero at some index, the output `SparseTensor`
-would still include that particular location in its index, storing a zero in the
-corresponding value slot.  To override this, callers can specify `thresh`,
-indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-corresponding value and index would then not be included.  In particular,
-`thresh == 0` (default) means everything is kept and actual thresholding happens
-only for a positive value.
-
-In the following shapes, `nnz` is the count after taking `thresh` into account.
-END
-}
-op {
-  graph_op_name: "SparseAddGrad"
-  endpoint {
-    name: "SparseAddGrad"
-  }
-  summary: "The gradient operator for the SparseAdd op."
-  description: <<END
-The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-values of A and B.
-END
-}
-op {
-  graph_op_name: "SparseApplyAdadelta"
-  endpoint {
-    name: "SparseApplyAdadelta"
-  }
-  summary: "var: Should be from a Variable()."
-}
-op {
-  graph_op_name: "SparseApplyAdagrad"
-  endpoint {
-    name: "SparseApplyAdagrad"
-  }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme."
-  description: <<END
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-var -= lr * grad * (1 / sqrt(accum))
-END
-}
-op {
-  graph_op_name: "SparseApplyAdagradDA"
-  endpoint {
-    name: "SparseApplyAdagradDA"
-  }
-  summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme."
-}
-op {
-  graph_op_name: "SparseApplyCenteredRMSProp"
-  endpoint {
-    name: "SparseApplyCenteredRMSProp"
-  }
-  summary: "Update \'*var\' according to the centered RMSProp algorithm."
-  description: <<END
-The centered RMSProp algorithm uses an estimate of the centered second moment
-(i.e., the variance) for normalization, as opposed to regular RMSProp, which
-uses the (uncentered) second moment. This often helps with training, but is
-slightly more expensive in terms of computation and memory.
-
-Note that in dense implementation of this algorithm, mg, ms, and mom will
-update even if the grad is zero, but in this sparse implementation, mg, ms,
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-mean_grad = decay * mean_grad + (1-decay) * gradient
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "SparseApplyFtrl"
-  endpoint {
-    name: "SparseApplyFtrl"
-  }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-That is for rows we have grad for, we update var, accum and linear as follows:
-accum_new = accum + grad * grad
-linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "SparseApplyFtrlV2"
-  endpoint {
-    name: "SparseApplyFtrlV2"
-  }
-  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
-  description: <<END
-That is for rows we have grad for, we update var, accum and linear as follows:
-grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-linear += grad_with_shrinkage +
-    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-accum = accum_new
-END
-}
-op {
-  graph_op_name: "SparseApplyMomentum"
-  endpoint {
-    name: "SparseApplyMomentum"
-  }
-  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
-  description: <<END
-Set use_nesterov = True if you want to use Nesterov momentum.
-
-That is for rows we have grad for, we update var and accum as follows:
-
-accum = accum * momentum + grad
-var -= lr * accum
-END
-}
-op {
-  graph_op_name: "SparseApplyProximalAdagrad"
-  endpoint {
-    name: "SparseApplyProximalAdagrad"
-  }
-  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
-  description: <<END
-That is for rows we have grad for, we update var and accum as follows:
-accum += grad * grad
-prox_v = var
-prox_v -= lr * grad * (1 / sqrt(accum))
-var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
-END
-}
-op {
-  graph_op_name: "SparseApplyProximalGradientDescent"
-  endpoint {
-    name: "SparseApplyProximalGradientDescent"
-  }
-  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
-  description: <<END
-That is for rows we have grad for, we update var as follows:
-prox_v = var - alpha * grad
-var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-END
-}
-op {
-  graph_op_name: "SparseApplyRMSProp"
-  endpoint {
-    name: "SparseApplyRMSProp"
-  }
-  summary: "Update \'*var\' according to the RMSProp algorithm."
-  description: <<END
-Note that in dense implementation of this algorithm, ms and mom will
-update even if the grad is zero, but in this sparse implementation, ms
-and mom will not update in iterations during which the grad is zero.
-
-mean_square = decay * mean_square + (1-decay) * gradient ** 2
-Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-
-ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-var <- var - mom
-END
-}
-op {
-  graph_op_name: "SparseConcat"
-  endpoint {
-    name: "SparseConcat"
-  }
-  summary: "Concatenates a list of `SparseTensor` along the specified dimension."
-  description: <<END
-Concatenation is with respect to the dense versions of these sparse tensors.
-It is assumed that each input is a `SparseTensor` whose elements are ordered
-along increasing dimension number.
-
-All inputs' shapes must match, except for the concat dimension.  The
-`indices`, `values`, and `shapes` lists must have the same length.
-
-The output shape is identical to the inputs', except along the concat
-dimension, where it is the sum of the inputs' sizes along that dimension.
-
-The output elements will be resorted to preserve the sort order along
-increasing dimension number.
-
-This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-values across all inputs. This is due to the need for an internal sort in
-order to concatenate efficiently across an arbitrary dimension.
-
-For example, if `concat_dim = 1` and the inputs are
-
-    sp_inputs[0]: shape = [2, 3]
-    [0, 2]: "a"
-    [1, 0]: "b"
-    [1, 1]: "c"
-
-    sp_inputs[1]: shape = [2, 4]
-    [0, 1]: "d"
-    [0, 2]: "e"
-
-then the output will be
-
-    shape = [2, 7]
-    [0, 2]: "a"
-    [0, 4]: "d"
-    [0, 5]: "e"
-    [1, 0]: "b"
-    [1, 1]: "c"
-
-Graphically this is equivalent to doing
-
-    [    a] concat [  d e  ] = [    a   d e  ]
-    [b c  ]        [       ]   [b c          ]
-END
-}
-op {
-  graph_op_name: "SparseConditionalAccumulator"
-  endpoint {
-    name: "SparseConditionalAccumulator"
-  }
-  summary: "A conditional accumulator for aggregating sparse gradients."
-  description: <<END
-The accumulator accepts gradients marked with local_step greater or
-equal to the most recent global_step known to the accumulator. The
-average can be extracted from the accumulator, provided sufficient
-gradients have been accumulated. Extracting the average automatically
-resets the aggregate to 0, and increments the global_step recorded by
-the accumulator.
-END
-}
-op {
-  graph_op_name: "SparseCross"
-  endpoint {
-    name: "SparseCross"
-  }
-  summary: "Generates sparse cross from a list of sparse and dense tensors."
-  description: <<END
-The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-representing features of one feature column. It outputs a 2D `SparseTensor` with
-the batchwise crosses of these features.
-
-For example, if the inputs are
-
-    inputs[0]: SparseTensor with shape = [2, 2]
-    [0, 0]: "a"
-    [1, 0]: "b"
-    [1, 1]: "c"
-
-    inputs[1]: SparseTensor with shape = [2, 1]
-    [0, 0]: "d"
-    [1, 0]: "e"
-
-    inputs[2]: Tensor [["f"], ["g"]]
-
-then the output will be
-
-    shape = [2, 2]
-    [0, 0]: "a_X_d_X_f"
-    [1, 0]: "b_X_e_X_g"
-    [1, 1]: "c_X_e_X_g"
-
-if hashed_output=true then the output will be
-
-    shape = [2, 2]
-    [0, 0]: FingerprintCat64(
-                Fingerprint64("f"), FingerprintCat64(
-                    Fingerprint64("d"), Fingerprint64("a")))
-    [1, 0]: FingerprintCat64(
-                Fingerprint64("g"), FingerprintCat64(
-                    Fingerprint64("e"), Fingerprint64("b")))
-    [1, 1]: FingerprintCat64(
-                Fingerprint64("g"), FingerprintCat64(
-                    Fingerprint64("e"), Fingerprint64("c")))
-END
-}
-op {
-  graph_op_name: "SparseDenseCwiseAdd"
-  endpoint {
-    name: "SparseDenseCwiseAdd"
-  }
-  summary: "Adds up a SparseTensor and a dense Tensor, using these special rules:"
-  description: <<END
-(1) Broadcasts the dense side to have the same shape as the sparse side, if
-    eligible;
-(2) Then, only the dense values pointed to by the indices of the SparseTensor
-    participate in the cwise addition.
-
-By these rules, the result is a logical SparseTensor with exactly the same
-indices and shape, but possibly with different non-zero values.  The output of
-this Op is the resultant non-zero values.
-END
-}
-op {
-  graph_op_name: "SparseDenseCwiseDiv"
-  endpoint {
-    name: "SparseDenseCwiseDiv"
-  }
-  summary: "Component-wise divides a SparseTensor by a dense Tensor."
-  description: <<END
-*Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-the other direction.
-END
-}
-op {
-  graph_op_name: "SparseDenseCwiseMul"
-  endpoint {
-    name: "SparseDenseCwiseMul"
-  }
-  summary: "Component-wise multiplies a SparseTensor by a dense Tensor."
-  description: <<END
-The output locations corresponding to the implicitly zero elements in the sparse
-tensor will be zero (i.e., will not take up storage space), regardless of the
-contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
-
-*Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-the other direction.
-END
-}
-op {
-  graph_op_name: "SparseFillEmptyRows"
-  endpoint {
-    name: "SparseFillEmptyRows"
-  }
-  summary: "Fills empty rows in the input 2-D `SparseTensor` with a default value."
-  description: <<END
-The input `SparseTensor` is represented via the tuple of inputs
-(`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-same `dense_shape` but with indices `output_indices` and values
-`output_values`.
-
-This op inserts a single entry for every row that doesn't have any values.
-The index is created as `[row, 0, ..., 0]` and the inserted value
-is `default_value`.
-
-For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-
-    [0, 1]: a
-    [0, 3]: b
-    [2, 0]: c
-    [3, 1]: d
-
-Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-
-    [0, 1]: a
-    [0, 3]: b
-    [1, 0]: default_value
-    [2, 0]: c
-    [3, 1]: d
-    [4, 0]: default_value
-
-The output `SparseTensor` will be in row-major order and will have the
-same shape as the input.
-
-This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-
-    empty_row_indicator[i] = True iff row i was an empty row.
-
-And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-backpropagation,
-
-    reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
-END
-}
-op {
-  graph_op_name: "SparseFillEmptyRowsGrad"
-  endpoint {
-    name: "SparseFillEmptyRowsGrad"
-  }
-  summary: "The gradient of SparseFillEmptyRows."
-  description: <<END
-Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-shaped `[N_full]`, where `N_full >= N` and copies data into either
-`d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-`d_default_value` is a scalar.
-
-  d_values[j] = grad_values[reverse_index_map[j]]
-  d_default_value = sum_{k : 0 .. N_full - 1} (
-     grad_values[k] * 1{k not in reverse_index_map})
-END
-}
-op {
-  graph_op_name: "SparseMatMul"
-  endpoint {
-    name: "SparseMatMul"
-  }
-  summary: "Multiply matrix \"a\" by matrix \"b\"."
-  description: <<END
-The inputs must be two-dimensional matrices and the inner dimension of "a" must
-match the outer dimension of "b". This op is optimized for the case where at
-least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-matrix multiply on one platform was 30% zero values in the sparse matrix.
-
-The gradient computation of this operation will only take advantage of sparsity
-in the input gradient when that gradient comes from a Relu.
-END
-}
-op {
-  graph_op_name: "SparseReduceMax"
-  endpoint {
-    name: "SparseReduceMax"
-  }
-  summary: "Computes the max of elements across dimensions of a SparseTensor."
-  description: <<END
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-instead of a sparse one.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-END
-}
-op {
-  graph_op_name: "SparseReduceMaxSparse"
-  endpoint {
-    name: "SparseReduceMaxSparse"
-  }
-  summary: "Computes the max of elements across dimensions of a SparseTensor."
-  description: <<END
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-SparseTensor.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-END
-}
-op {
-  graph_op_name: "SparseReduceSum"
-  endpoint {
-    name: "SparseReduceSum"
-  }
-  summary: "Computes the sum of elements across dimensions of a SparseTensor."
-  description: <<END
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-instead of a sparse one.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-END
-}
-op {
-  graph_op_name: "SparseReduceSumSparse"
-  endpoint {
-    name: "SparseReduceSumSparse"
-  }
-  summary: "Computes the sum of elements across dimensions of a SparseTensor."
-  description: <<END
-This Op takes a SparseTensor and is the sparse counterpart to
-`tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-SparseTensor.
-
-Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-with length 1.
-
-If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-with a single element is returned.  Additionally, the axes can be negative,
-which are interpreted according to the indexing rules in Python.
-END
-}
-op {
-  graph_op_name: "SparseReorder"
-  endpoint {
-    name: "SparseReorder"
-  }
-  summary: "Reorders a SparseTensor into the canonical, row-major ordering."
-  description: <<END
-Note that by convention, all sparse ops preserve the canonical ordering along
-increasing dimension number. The only time ordering can be violated is during
-manual manipulation of the indices and values vectors to add entries.
-
-Reordering does not affect the shape of the SparseTensor.
-
-If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
-END
-}
-op {
-  graph_op_name: "SparseReshape"
-  endpoint {
-    name: "SparseReshape"
-  }
-  summary: "Reshapes a SparseTensor to represent values in a new dense shape."
-  description: <<END
-This operation has the same semantics as reshape on the represented dense
-tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-
-If one component of `new_shape` is the special value -1, the size of that
-dimension is computed so that the total dense size remains constant.  At
-most one component of `new_shape` can be -1.  The number of dense elements
-implied by `new_shape` must be the same as the number of dense elements
-originally implied by `input_shape`.
-
-Reshaping does not affect the order of values in the SparseTensor.
-
-If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-`input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-`output_shape` has length `R_out`.
-END
-}
-op {
-  graph_op_name: "SparseSegmentMean"
-  endpoint {
-    name: "SparseSegmentMean"
-  }
-  summary: "Computes the mean along sparse segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-dimension, selecting a subset of dimension 0, specified by `indices`.
-END
-}
-op {
-  graph_op_name: "SparseSegmentMeanGrad"
-  endpoint {
-    name: "SparseSegmentMeanGrad"
-  }
-  summary: "Computes gradients for SparseSegmentMean."
-  description: <<END
-Returns tensor "output" with same shape as grad, except for dimension 0 whose
-value is output_dim0.
-END
-}
-op {
-  graph_op_name: "SparseSegmentSqrtN"
-  endpoint {
-    name: "SparseSegmentSqrtN"
-  }
-  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
-  description: <<END
-N is the size of the segment being reduced.
-
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-END
-}
-op {
-  graph_op_name: "SparseSegmentSqrtNGrad"
-  endpoint {
-    name: "SparseSegmentSqrtNGrad"
-  }
-  summary: "Computes gradients for SparseSegmentSqrtN."
-  description: <<END
-Returns tensor "output" with same shape as grad, except for dimension 0 whose
-value is output_dim0.
-END
-}
-op {
-  graph_op_name: "SparseSegmentSum"
-  endpoint {
-    name: "SparseSegmentSum"
-  }
-  summary: "Computes the sum along sparse segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-dimension, selecting a subset of dimension 0, specified by `indices`.
-
-For example:
-
-```python
-c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-
-# Select two rows, one segment.
-tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-# => [[0 0 0 0]]
-
-# Select two rows, two segment.
-tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-# => [[ 1  2  3  4]
-#     [-1 -2 -3 -4]]
-
-# Select all rows, two segments.
-tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-# => [[0 0 0 0]
-#     [5 6 7 8]]
-
-# Which is equivalent to:
-tf.segment_sum(c, tf.constant([0, 0, 1]))
-```
-END
-}
-op {
-  graph_op_name: "SparseSlice"
-  endpoint {
-    name: "SparseSlice"
-  }
-  summary: "Slice a `SparseTensor` based on the `start` and `size`."
-  description: <<END
-For example, if the input is
-
-    input_tensor = shape = [2, 7]
-    [    a   d e  ]
-    [b c          ]
-
-Graphically the output tensors are:
-
-    sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-    [    a  ]
-    [b c    ]
-
-    sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-    [ d e  ]
-    [      ]
-END
-}
-op {
-  graph_op_name: "SparseSoftmax"
-  endpoint {
-    name: "SparseSoftmax"
-  }
-  summary: "Applies softmax to a batched N-D `SparseTensor`."
-  description: <<END
-The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-(where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-
-This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-zero elements do not participate*.  Specifically, the algorithm is equivalent
-to the following:
-
-  (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-      with shape `[B, C]`, along the size-C dimension;
-  (2) Masks out the original implicitly-zero locations;
-  (3) Renormalizes the remaining elements.
-
-Hence, the `SparseTensor` result has exactly the same non-zero indices and
-shape.
-END
-}
-op {
-  graph_op_name: "SparseSoftmaxCrossEntropyWithLogits"
-  endpoint {
-    name: "SparseSoftmaxCrossEntropyWithLogits"
-  }
-  summary: "Computes softmax cross entropy cost and gradients to backpropagate."
-  description: <<END
-Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-a matrix of label probabilities, but rather a single label per row
-of features.  This label is considered to have probability 1.0 for the
-given row.
-
-Inputs are the logits, not probabilities.
-END
-}
-op {
-  graph_op_name: "SparseSparseMaximum"
-  endpoint {
-    name: "SparseSparseMaximum"
-  }
-  summary: "Returns the element-wise max of two SparseTensors."
-  description: <<END
-Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-END
-}
-op {
-  graph_op_name: "SparseSparseMinimum"
-  endpoint {
-    name: "SparseSparseMinimum"
-  }
-  summary: "Returns the element-wise min of two SparseTensors."
-  description: <<END
-Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-END
-}
-op {
-  graph_op_name: "SparseSplit"
-  endpoint {
-    name: "SparseSplit"
-  }
-  summary: "Split a `SparseTensor` into `num_split` tensors along one dimension."
-  description: <<END
-If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-`[0 : shape[split_dim] % num_split]` gets one extra dimension.
-For example, if `split_dim = 1` and `num_split = 2` and the input is
-
-    input_tensor = shape = [2, 7]
-    [    a   d e  ]
-    [b c          ]
-
-Graphically the output tensors are:
-
-    output_tensor[0] = shape = [2, 4]
-    [    a  ]
-    [b c    ]
-
-    output_tensor[1] = shape = [2, 3]
-    [ d e  ]
-    [      ]
-END
-}
-op {
-  graph_op_name: "SparseTensorDenseAdd"
-  endpoint {
-    name: "SparseTensorDenseAdd"
-  }
-  summary: "Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`."
-  description: <<END
-This Op does not require `a_indices` be sorted in standard lexicographic order.
-END
-}
-op {
-  graph_op_name: "SparseTensorDenseMatMul"
-  endpoint {
-    name: "SparseTensorDenseMatMul"
-  }
-  summary: "Multiply SparseTensor (of rank 2) \"A\" by dense matrix \"B\"."
-  description: <<END
-No validity checking is performed on the indices of A.  However, the following
-input format is recommended for optimal behavior:
-
-if adjoint_a == false:
-  A should be sorted in lexicographically increasing order.  Use SparseReorder
-  if you're not sure.
-if adjoint_a == true:
-  A should be sorted in order of increasing dimension 1 (i.e., "column major"
-  order instead of "row major" order).
-END
-}
-op {
-  graph_op_name: "SparseTensorSliceDataset"
-  endpoint {
-    name: "SparseTensorSliceDataset"
-  }
-  summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
-}
-op {
-  graph_op_name: "SparseToDense"
-  endpoint {
-    name: "SparseToDense"
-  }
-  summary: "Converts a sparse representation into a dense tensor."
-  description: <<END
-Builds an array `dense` with shape `output_shape` such that
-
-```
-# If sparse_indices is scalar
-dense[i] = (i == sparse_indices ? sparse_values : default_value)
-
-# If sparse_indices is a vector, then for each i
-dense[sparse_indices[i]] = sparse_values[i]
-
-# If sparse_indices is an n by d matrix, then for each i in [0, n)
-dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-```
-
-All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-scalar, all sparse indices are set to this single value.
-
-Indices should be sorted in lexicographic order, and indices must not
-contain any repeats. If `validate_indices` is true, these properties
-are checked during execution.
-END
-}
-op {
-  graph_op_name: "SparseToSparseSetOperation"
-  endpoint {
-    name: "SparseToSparseSetOperation"
-  }
-  summary: "Applies set operation along last dimension of 2 `SparseTensor` inputs."
-  description: <<END
-See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-
-If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-order and range of `set1` and `set2` indices.
-
-Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-ignored.
-
-Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-ignored.
-
-If `validate_indices` is `True`, this op validates the order and range of `set1`
-and `set2` indices.
-
-Output `result` is a `SparseTensor` represented by `result_indices`,
-`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-dimension contains the result of `set_operation` applied to the corresponding
-`[0...n-1]` dimension of `set`.
-END
-}
-op {
-  graph_op_name: "Split"
-  endpoint {
-    name: "Split"
-  }
-  summary: "Splits a tensor into `num_split` tensors along one dimension."
-}
-op {
-  graph_op_name: "SplitV"
-  endpoint {
-    name: "SplitV"
-  }
-  summary: "Splits a tensor into `num_split` tensors along one dimension."
-}
-op {
-  graph_op_name: "SqlDataset"
-  endpoint {
-    name: "SqlDataset"
-  }
-  summary: "Creates a dataset that executes a SQL query and emits rows of the result set."
-}
-op {
-  graph_op_name: "Sqrt"
-  endpoint {
-    name: "Sqrt"
-  }
-  summary: "Computes square root of x element-wise."
-  description: <<END
-I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-END
-}
-op {
-  graph_op_name: "SqrtGrad"
-  endpoint {
-    name: "SqrtGrad"
-  }
-  summary: "Computes the gradient for the sqrt of `x` wrt its input."
-  description: <<END
-Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-is the corresponding input gradient.
-END
-}
-op {
-  graph_op_name: "Square"
-  endpoint {
-    name: "Square"
-  }
-  summary: "Computes square of x element-wise."
-  description: <<END
-I.e., \\(y = x * x = x^2\\).
-END
-}
-op {
-  graph_op_name: "SquaredDifference"
-  endpoint {
-    name: "SquaredDifference"
-  }
-  summary: "Returns (x - y)(x - y) element-wise."
-  description: <<END
-*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Squeeze"
-  endpoint {
-    name: "Squeeze"
-  }
-  summary: "Removes dimensions of size 1 from the shape of a tensor."
-  description: <<END
-Given a tensor `input`, this operation returns a tensor of the same type with
-all dimensions of size 1 removed. If you don't want to remove all size 1
-dimensions, you can remove specific size 1 dimensions by specifying
-`squeeze_dims`.
-
-For example:
-
-```
-# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-shape(squeeze(t)) ==> [2, 3]
-```
-
-Or, to remove specific size 1 dimensions:
-
-```
-# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-```
-END
-}
-op {
-  graph_op_name: "Stack"
-  endpoint {
-    name: "Stack"
-  }
-  summary: "Deprecated, use StackV2."
-}
-op {
-  graph_op_name: "StackClose"
-  endpoint {
-    name: "StackClose"
-  }
-  summary: "Deprecated, use StackCloseV2."
-}
-op {
-  graph_op_name: "StackCloseV2"
-  endpoint {
-    name: "StackCloseV2"
-  }
-  summary: "Delete the stack from its resource container."
-}
-op {
-  graph_op_name: "StackPop"
-  endpoint {
-    name: "StackPop"
-  }
-  summary: "Deprecated, use StackPopV2."
-}
-op {
-  graph_op_name: "StackPopV2"
-  endpoint {
-    name: "StackPopV2"
-  }
-  summary: "Pop the element at the top of the stack."
-}
-op {
-  graph_op_name: "StackPush"
-  endpoint {
-    name: "StackPush"
-  }
-  summary: "Deprecated, use StackPushV2."
-}
-op {
-  graph_op_name: "StackPushV2"
-  endpoint {
-    name: "StackPushV2"
-  }
-  summary: "Push an element onto the stack."
-}
-op {
-  graph_op_name: "StackV2"
-  endpoint {
-    name: "StackV2"
-  }
-  summary: "A stack that produces elements in first-in last-out order."
-}
-op {
-  graph_op_name: "Stage"
-  endpoint {
-    name: "Stage"
-  }
-  summary: "Stage values similar to a lightweight Enqueue."
-  description: <<END
-The basic functionality of this Op is similar to a queue with many
-fewer capabilities and options.  This Op is optimized for performance.
-END
-}
-op {
-  graph_op_name: "StageClear"
-  endpoint {
-    name: "StageClear"
-  }
-  summary: "Op removes all elements in the underlying container."
-}
-op {
-  graph_op_name: "StagePeek"
-  endpoint {
-    name: "StagePeek"
-  }
-  summary: "Op peeks at the values at the specified index.  If the"
-  description: <<END
-underlying container does not contain sufficient elements
-this op will block until it does.   This Op is optimized for
-performance.
-END
-}
-op {
-  graph_op_name: "StageSize"
-  endpoint {
-    name: "StageSize"
-  }
-  summary: "Op returns the number of elements in the underlying container."
-}
-op {
-  graph_op_name: "StatelessRandomNormal"
-  endpoint {
-    name: "StatelessRandomNormal"
-  }
-  summary: "Outputs deterministic pseudorandom values from a normal distribution."
-  description: <<END
-The generated values will have mean 0 and standard deviation 1.
-
-The outputs are a deterministic function of `shape` and `seed`.
-END
-}
-op {
-  graph_op_name: "StatelessRandomUniform"
-  endpoint {
-    name: "StatelessRandomUniform"
-  }
-  summary: "Outputs deterministic pseudorandom random values from a uniform distribution."
-  description: <<END
-The generated values follow a uniform distribution in the range `[0, 1)`. The
-lower bound 0 is included in the range, while the upper bound 1 is excluded.
-
-The outputs are a deterministic function of `shape` and `seed`.
-END
-}
-op {
-  graph_op_name: "StatelessTruncatedNormal"
-  endpoint {
-    name: "StatelessTruncatedNormal"
-  }
-  summary: "Outputs deterministic pseudorandom values from a truncated normal distribution."
-  description: <<END
-The generated values follow a normal distribution with mean 0 and standard
-deviation 1, except that values whose magnitude is more than 2 standard
-deviations from the mean are dropped and re-picked.
-
-The outputs are a deterministic function of `shape` and `seed`.
-END
-}
-op {
-  graph_op_name: "StopGradient"
-  endpoint {
-    name: "StopGradient"
-  }
-  summary: "Stops gradient computation."
-  description: <<END
-When executed in a graph, this op outputs its input tensor as-is.
-
-When building ops to compute gradients, this op prevents the contribution of
-its inputs to be taken into account.  Normally, the gradient generator adds ops
-to a graph to compute the derivatives of a specified 'loss' by recursively
-finding out inputs that contributed to its computation.  If you insert this op
-in the graph it inputs are masked from the gradient generator.  They are not
-taken into account for computing gradients.
-
-This is useful any time you want to compute a value with TensorFlow but need
-to pretend that the value was a constant. Some examples include:
-
-*  The *EM* algorithm where the *M-step* should not involve backpropagation
-   through the output of the *E-step*.
-*  Contrastive divergence training of Boltzmann machines where, when
-   differentiating the energy function, the training must not backpropagate
-   through the graph that generated the samples from the model.
-*  Adversarial training, where no backprop should happen through the adversarial
-   example generation process.
-END
-}
-op {
-  graph_op_name: "StridedSlice"
-  endpoint {
-    name: "StridedSlice"
-  }
-  summary: "Return a strided slice from `input`."
-  description: <<END
-Note, most python users will want to use the Python `Tensor.__getitem__`
-or `Variable.__getitem__` rather than this op directly.
-
-The goal of this op is to produce a new tensor with a subset of
-the elements from the `n` dimensional `input` tensor. The subset is chosen using
-a sequence of `m` sparse range specifications encoded into the arguments
-of this function. Note, in some cases
-`m` could be equal to `n`, but this need not be the case. Each
-range specification entry can be one of the following:
-
-- An ellipsis (...). Ellipses are used to imply zero or more
-  dimensions of full-dimension selection and are produced using
-  `ellipsis_mask`. For example, `foo[...]` is the identity slice.
-
-- A new axis. This is used to insert a new shape=1 dimension and is
-  produced using `new_axis_mask`. For example, `foo[:, ...]` where
-  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
-
-
-- A range `begin:end:stride`. This is used to specify how much to choose from
-  a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-  which represents the index of the first value to select while `end` represents
-  the index of the last value to select. The number of values selected in each
-  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-  `begin` and `end` can be negative where `-1` is the last element, `-2` is
-  the second to last. `begin_mask` controls whether to replace the explicitly
-  given `begin` with an implicit effective value of `0` if `stride > 0` and
-  `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-  required to create the largest open interval. For example, given a shape
-  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-  first dimension of a tensor while dropping the last two (in the original
-  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-
-- A single index. This is used to keep only elements that have a given
-  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-  shape `(6,)` tensor. This is encoded in `begin` and `end` and
-  `shrink_axis_mask`.
-
-Each conceptual range specification is encoded in the op's argument. This
-encoding is best understand by considering a non-trivial example. In
-particular,
-`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
-
-```
-begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-end = [2, 4, x, x, -3, x]
-strides = [1, 1, x, x, -1, 1]
-begin_mask = 1<<4 | 1 << 5 = 48
-end_mask = 1<<5 = 32
-ellipsis_mask = 1<<3 = 8
-new_axis_mask = 1<<2 4
-shrink_axis_mask = 1<<0
-```
-
-In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-the slice becomes (2, 1, 5, 5, 2, 5).
-Let us walk step by step through each argument specification.
-
-1.  The first argument in the example slice is turned into `begin = 1` and
-`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-also set the appropriate bit in `shrink_axis_mask`.
-
-2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-zero bits contributed.
-
-3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-dimension in the final shape. Dummy values are contributed to begin,
-end and stride, while the new_axis_mask bit is set.
-
-4. `...` grab the full ranges from as many dimensions as needed to
-fully specify a slice for every dimension of the input shape.
-
-5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-with a dimension that has shape `s` is converted to a positive index
-`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-is done internally so begin, end and strides receive x, -3, and -1.
-The appropriate begin_mask bit is set to indicate the start range is the
-full range (ignoring the x).
-
-6. `:` indicates that the entire contents of the corresponding dimension
-is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-`end_mask` are also set.
-
-*Requirements*:
-  `0 != strides[i] for i in [0, m)`
-  `ellipsis_mask must be a power of two (only one ellipsis)`
-END
-}
-op {
-  graph_op_name: "StridedSliceAssign"
-  endpoint {
-    name: "StridedSliceAssign"
-  }
-  summary: "Assign `value` to the sliced l-value reference of `ref`."
-  description: <<END
-The values of `value` are assigned to the positions in the variable
-`ref` that are selected by the slice parameters. The slice parameters
-`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
-
-NOTE this op currently does not support broadcasting and so `value`'s
-shape must be exactly the shape produced by the slice of `ref`.
-END
-}
-op {
-  graph_op_name: "StridedSliceGrad"
-  endpoint {
-    name: "StridedSliceGrad"
-  }
-  summary: "Returns the gradient of `StridedSlice`."
-  description: <<END
-Since `StridedSlice` cuts out pieces of its `input` which is size
-`shape`, its gradient will have the same shape (which is passed here
-as `shape`). The gradient will be zero in any element that the slice
-does not select.
-
-Arguments are the same as StridedSliceGrad with the exception that
-`dy` is the input gradient to be propagated and `shape` is the
-shape of `StridedSlice`'s `input`.
-END
-}
-op {
-  graph_op_name: "StringJoin"
-  endpoint {
-    name: "StringJoin"
-  }
-  summary: "Joins the strings in the given list of string tensors into one tensor;"
-  description: <<END
-with the given separator (default is an empty separator).
-END
-}
-op {
-  graph_op_name: "StringSplit"
-  endpoint {
-    name: "StringSplit"
-  }
-  summary: "Split elements of `input` based on `delimiter` into a `SparseTensor`."
-  description: <<END
-Let N be the size of source (typically N will be the batch size). Split each
-element of `input` based on `delimiter` and return a `SparseTensor`
-containing the splitted tokens. Empty tokens are ignored.
-
-`delimiter` can be empty, or a string of split characters. If `delimiter` is an
- empty string, each element of `input` is split into individual single-byte
- character strings, including splitting of UTF-8 multibyte sequences. Otherwise
- every character of `delimiter` is a potential split point.
-
-For example:
-  N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-  will be
-
-  indices = [0, 0;
-             0, 1;
-             1, 0;
-             1, 1;
-             1, 2]
-  shape = [2, 3]
-  values = ['hello', 'world', 'a', 'b', 'c']
-END
-}
-op {
-  graph_op_name: "StringToHashBucket"
-  endpoint {
-    name: "StringToHashBucket"
-  }
-  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
-  description: <<END
-The hash function is deterministic on the content of the string within the
-process.
-
-Note that the hash function may change from time to time.
-This functionality will be deprecated and it's recommended to use
-`tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-END
-}
-op {
-  graph_op_name: "StringToHashBucketFast"
-  endpoint {
-    name: "StringToHashBucketFast"
-  }
-  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
-  description: <<END
-The hash function is deterministic on the content of the string within the
-process and will never change. However, it is not suitable for cryptography.
-This function may be used when CPU time is scarce and inputs are trusted or
-unimportant. There is a risk of adversaries constructing inputs that all hash
-to the same bucket. To prevent this problem, use a strong hash function with
-`tf.string_to_hash_bucket_strong`.
-END
-}
-op {
-  graph_op_name: "StringToHashBucketStrong"
-  endpoint {
-    name: "StringToHashBucketStrong"
-  }
-  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
-  description: <<END
-The hash function is deterministic on the content of the string within the
-process. The hash function is a keyed hash function, where attribute `key`
-defines the key of the hash function. `key` is an array of 2 elements.
-
-A strong hash is important when inputs may be malicious, e.g. URLs with
-additional components. Adversaries could try to make their inputs hash to the
-same bucket for a denial-of-service attack or to skew the results. A strong
-hash prevents this by making it difficult, if not infeasible, to compute inputs
-that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-time than `tf.string_to_hash_bucket_fast`.
-END
-}
-op {
-  graph_op_name: "StringToNumber"
-  endpoint {
-    name: "StringToNumber"
-  }
-  summary: "Converts each string in the input Tensor to the specified numeric type."
-  description: <<END
-(Note that int32 overflow results in an error while float overflow
-results in a rounded value.)
-END
-}
-op {
-  graph_op_name: "Sub"
-  endpoint {
-    name: "Sub"
-  }
-  summary: "Returns x - y element-wise."
-  description: <<END
-*NOTE*: `Sub` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "Substr"
-  endpoint {
-    name: "Substr"
-  }
-  summary: "Return substrings from `Tensor` of strings."
-  description: <<END
-For each string in the input `Tensor`, creates a substring starting at index
-`pos` with a total length of `len`.
-
-If `len` defines a substring that would extend beyond the length of the input
-string, then as many characters as possible are used.
-
-If `pos` is negative or specifies a character index larger than any of the input
-strings, then an `InvalidArgumentError` is thrown.
-
-`pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-Op creation.
-
-*NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-
----
-
-Examples
-
-Using scalar `pos` and `len`:
-
-```python
-input = [b'Hello', b'World']
-position = 1
-length = 3
-
-output = [b'ell', b'orl']
-```
-
-Using `pos` and `len` with same shape as `input`:
-
-```python
-input = [[b'ten', b'eleven', b'twelve'],
-         [b'thirteen', b'fourteen', b'fifteen'],
-         [b'sixteen', b'seventeen', b'eighteen']]
-position = [[1, 2, 3],
-            [1, 2, 3],
-            [1, 2, 3]]
-length =   [[2, 3, 4],
-            [4, 3, 2],
-            [5, 5, 5]]
-
-output = [[b'en', b'eve', b'lve'],
-          [b'hirt', b'urt', b'te'],
-          [b'ixtee', b'vente', b'hteen']]
-```
-
-Broadcasting `pos` and `len` onto `input`:
-
-```
-input = [[b'ten', b'eleven', b'twelve'],
-         [b'thirteen', b'fourteen', b'fifteen'],
-         [b'sixteen', b'seventeen', b'eighteen'],
-         [b'nineteen', b'twenty', b'twentyone']]
-position = [1, 2, 3]
-length =   [1, 2, 3]
-
-output = [[b'e', b'ev', b'lve'],
-          [b'h', b'ur', b'tee'],
-          [b'i', b've', b'hte'],
-          [b'i', b'en', b'nty']]
-```
-
-Broadcasting `input` onto `pos` and `len`:
-
-```
-input = b'thirteen'
-position = [1, 5, 7]
-length =   [3, 2, 1]
-
-output = [b'hir', b'ee', b'n']
-```
-END
-}
-op {
-  graph_op_name: "Sum"
-  endpoint {
-    name: "Sum"
-  }
-  summary: "Computes the sum of elements across dimensions of a tensor."
-  description: <<END
-Reduces `input` along the dimensions given in `reduction_indices`. Unless
-`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
-retained with length 1.
-END
-}
-op {
-  graph_op_name: "Svd"
-  endpoint {
-    name: "Svd"
-  }
-  summary: "Computes the singular value decompositions of one or more matrices."
-  description: <<END
-Computes the SVD of each inner matrix in `input` such that
-`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
-
-```python
-# a is a tensor containing a batch of matrices.
-# s is a tensor of singular values for each matrix.
-# u is the tensor containing of left singular vectors for each matrix.
-# v is the tensor containing of right singular vectors for each matrix.
-s, u, v = svd(a)
-s, _, _ = svd(a, compute_uv=False)
-```
-END
-}
-op {
-  graph_op_name: "Switch"
-  endpoint {
-    name: "Switch"
-  }
-  summary: "Forwards `data` to the output port determined by `pred`."
-  description: <<END
-If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-the data goes to `output_false`.
-
-See also `RefSwitch` and `Merge`.
-END
-}
-op {
-  graph_op_name: "SymbolicGradient"
-  endpoint {
-    name: "SymbolicGradient"
-  }
-  summary: "Computes the gradient function for function f via backpropagation."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0716b2611403b54d894007fad801380f30e70acc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBox.pbtxt
@@ -0,0 +1,131 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBox"
+  in_arg {
+    name: "image_size"
+    description: <<END
+1-D, containing `[height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "bounding_boxes"
+    description: <<END
+3-D with shape `[batch, N, 4]` describing the N bounding boxes
+associated with the image.
+END
+  }
+  out_arg {
+    name: "begin"
+    description: <<END
+1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+`tf.slice`.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+1-D, containing `[target_height, target_width, -1]`. Provide as input to
+`tf.slice`.
+END
+  }
+  out_arg {
+    name: "bboxes"
+    description: <<END
+3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+Provide as input to `tf.image.draw_bounding_boxes`.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to non-zero, the random number
+generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "min_object_covered"
+    description: <<END
+The cropped area of the image must contain at least this
+fraction of any bounding box supplied. The value of this parameter should be
+non-negative. In the case of 0, the cropped area does not need to overlap
+any of the bounding boxes supplied.
+END
+  }
+  attr {
+    name: "aspect_ratio_range"
+    description: <<END
+The cropped area of the image must have an aspect ratio =
+width / height within this range.
+END
+  }
+  attr {
+    name: "area_range"
+    description: <<END
+The cropped area of the image must contain a fraction of the
+supplied image within in this range.
+END
+  }
+  attr {
+    name: "max_attempts"
+    description: <<END
+Number of attempts at generating a cropped region of the image
+of the specified constraints. After `max_attempts` failures, return the entire
+image.
+END
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
+    description: <<END
+Controls behavior if no bounding boxes supplied.
+If true, assume an implicit bounding box covering the whole input. If false,
+raise an error.
+END
+  }
+  summary: "Generate a single randomly distorted bounding box for an image."
+  description: <<END
+Bounding box annotations are often supplied in addition to ground-truth labels
+in image recognition or object localization tasks. A common technique for
+training such a system is to randomly distort an image while preserving
+its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+localization of an object, i.e. bounding box, given an `image_size`,
+`bounding_boxes` and a series of constraints.
+
+The output of this Op is a single bounding box that may be used to crop the
+original image. The output is returned as 3 tensors: `begin`, `size` and
+`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+what the bounding box looks like.
+
+Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+height of the underlying image.
+
+For example,
+
+```python
+    # Generate a single distorted bounding box.
+    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+        tf.shape(image),
+        bounding_boxes=bounding_boxes)
+
+    # Draw the bounding box in an image summary.
+    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                  bbox_for_draw)
+    tf.image_summary('images_with_box', image_with_box)
+
+    # Employ the bounding box to distort the image.
+    distorted_image = tf.slice(image, begin, size)
+```
+
+Note that if no bounding box information is available, setting
+`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+false and no bounding boxes are supplied, an error is raised.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9912609729fbadf7a3dd706903ecc4d915d72eb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
@@ -0,0 +1,131 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBoxV2"
+  in_arg {
+    name: "image_size"
+    description: <<END
+1-D, containing `[height, width, channels]`.
+END
+  }
+  in_arg {
+    name: "bounding_boxes"
+    description: <<END
+3-D with shape `[batch, N, 4]` describing the N bounding boxes
+associated with the image.
+END
+  }
+  in_arg {
+    name: "min_object_covered"
+    description: <<END
+The cropped area of the image must contain at least this
+fraction of any bounding box supplied. The value of this parameter should be
+non-negative. In the case of 0, the cropped area does not need to overlap
+any of the bounding boxes supplied.
+END
+  }
+  out_arg {
+    name: "begin"
+    description: <<END
+1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+`tf.slice`.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+1-D, containing `[target_height, target_width, -1]`. Provide as input to
+`tf.slice`.
+END
+  }
+  out_arg {
+    name: "bboxes"
+    description: <<END
+3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+Provide as input to `tf.image.draw_bounding_boxes`.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to non-zero, the random number
+generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "aspect_ratio_range"
+    description: <<END
+The cropped area of the image must have an aspect ratio =
+width / height within this range.
+END
+  }
+  attr {
+    name: "area_range"
+    description: <<END
+The cropped area of the image must contain a fraction of the
+supplied image within in this range.
+END
+  }
+  attr {
+    name: "max_attempts"
+    description: <<END
+Number of attempts at generating a cropped region of the image
+of the specified constraints. After `max_attempts` failures, return the entire
+image.
+END
+  }
+  attr {
+    name: "use_image_if_no_bounding_boxes"
+    description: <<END
+Controls behavior if no bounding boxes supplied.
+If true, assume an implicit bounding box covering the whole input. If false,
+raise an error.
+END
+  }
+  summary: "Generate a single randomly distorted bounding box for an image."
+  description: <<END
+Bounding box annotations are often supplied in addition to ground-truth labels
+in image recognition or object localization tasks. A common technique for
+training such a system is to randomly distort an image while preserving
+its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+localization of an object, i.e. bounding box, given an `image_size`,
+`bounding_boxes` and a series of constraints.
+
+The output of this Op is a single bounding box that may be used to crop the
+original image. The output is returned as 3 tensors: `begin`, `size` and
+`bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+what the bounding box looks like.
+
+Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+height of the underlying image.
+
+For example,
+
+```python
+    # Generate a single distorted bounding box.
+    begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+        tf.shape(image),
+        bounding_boxes=bounding_boxes)
+
+    # Draw the bounding box in an image summary.
+    image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                  bbox_for_draw)
+    tf.image_summary('images_with_box', image_with_box)
+
+    # Employ the bounding box to distort the image.
+    distorted_image = tf.slice(image, begin, size)
+```
+
+Note that if no bounding box information is available, setting
+`use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+false and no bounding boxes are supplied, an error is raised.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Save.pbtxt b/tensorflow/core/api_def/base_api/api_def_Save.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee75d6e4a6a9c2a1fa2ed17d2e9c163875c055db
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Save.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "Save"
+  in_arg {
+    name: "filename"
+    description: <<END
+Must have a single element. The name of the file to which we write
+the tensor.
+END
+  }
+  in_arg {
+    name: "tensor_names"
+    description: <<END
+Shape `[N]`. The names of the tensors to be saved.
+END
+  }
+  in_arg {
+    name: "data"
+    description: <<END
+`N` tensors to save.
+END
+  }
+  summary: "Saves the input tensors to disk."
+  description: <<END
+The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+is written to `filename` with name `tensor_names[i]`.
+
+See also `SaveSlices`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SaveSlices.pbtxt b/tensorflow/core/api_def/base_api/api_def_SaveSlices.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61df999b2d0bf572a5231a8371df533ef5327147
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SaveSlices.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "SaveSlices"
+  in_arg {
+    name: "filename"
+    description: <<END
+Must have a single element. The name of the file to which we write the
+tensor.
+END
+  }
+  in_arg {
+    name: "tensor_names"
+    description: <<END
+Shape `[N]`. The names of the tensors to be saved.
+END
+  }
+  in_arg {
+    name: "shapes_and_slices"
+    description: <<END
+Shape `[N]`.  The shapes and slice specifications to use when
+saving the tensors.
+END
+  }
+  in_arg {
+    name: "data"
+    description: <<END
+`N` tensors to save.
+END
+  }
+  summary: "Saves input tensors slices to disk."
+  description: <<END
+This is like `Save` except that tensors can be listed in the saved file as being
+a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+have as many elements as `tensor_names`.
+
+Elements of the `shapes_and_slices` input must either be:
+
+*  The empty string, in which case the corresponding tensor is
+   saved normally.
+*  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+   `dimI` are the dimensions of the larger tensor and `slice-spec`
+   specifies what part is covered by the tensor to save.
+
+`slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+where each `sliceI` is either:
+
+*  The string `-` meaning that the slice covers all indices of this dimension
+*  `start,length` where `start` and `length` are integers.  In that
+   case the slice covers `length` indices starting at `start`.
+
+See also `Save`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SaveV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SaveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee87514f25ffbcc98e797d1461f69764c266a6dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SaveV2.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "SaveV2"
+  in_arg {
+    name: "prefix"
+    description: <<END
+Must have a single element. The prefix of the V2 checkpoint to which we
+write the tensors.
+END
+  }
+  in_arg {
+    name: "tensor_names"
+    description: <<END
+shape {N}. The names of the tensors to be saved.
+END
+  }
+  in_arg {
+    name: "shape_and_slices"
+    description: <<END
+shape {N}.  The slice specs of the tensors to be saved.
+Empty strings indicate that they are non-partitioned tensors.
+END
+  }
+  in_arg {
+    name: "tensors"
+    description: <<END
+`N` tensors to save.
+END
+  }
+  summary: "Saves tensors in V2 checkpoint format."
+  description: <<END
+By default, saves the named tensors in full.  If the caller wishes to save
+specific slices of full tensors, "shape_and_slices" should be non-empty strings
+and correspondingly well-formed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScalarSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2cedb05b719904d960e767df9fae12b019f4230e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScalarSummary.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "ScalarSummary"
+  in_arg {
+    name: "tags"
+    description: <<END
+Tags for the summary.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+Same shape as `tags.  Values for the summary.
+END
+  }
+  out_arg {
+    name: "summary"
+    description: <<END
+Scalar.  Serialized `Summary` protocol buffer.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with scalar values."
+  description: <<END
+The input `tags` and `values` must have the same shape.  The generated summary
+has a summary value for each tag-value pair in `tags` and `values`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e83d4a9e967f959b19adc5fad38a7141f8936cc4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScanDataset"
+  summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4b5201f025b438a1e6bba41035004b82ab876de7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ScatterAdd"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to add to `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the addition will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Adds sparse updates to a variable reference."
+  description: <<END
+This operation computes
+
+    # Scalar indices
+    ref[indices, ...] += updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] += updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions add.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterAdd.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..771cf0b591367e18f007e91bf66bc1cfd02ab459
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterDiv.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "ScatterDiv"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of values that `ref` is divided by.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the operation will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Divides a variable reference by sparse updates."
+  description: <<END
+This operation computes
+
+```python
+    # Scalar indices
+    ref[indices, ...] /= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] /= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+```
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions divide.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a51f571b00d7fc68a24dbfc4a0104522f8c0f559
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterMul.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "ScatterMul"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to multiply to `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the operation will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Multiplies sparse updates into a variable reference."
+  description: <<END
+This operation computes
+
+```python
+    # Scalar indices
+    ref[indices, ...] *= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] *= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+```
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their contributions multiply.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23732546edaf120eb1a1a9b45219014ba55c6d81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -0,0 +1,102 @@
+op {
+  graph_op_name: "ScatterNd"
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+1-D. The shape of the resulting tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor with the given shape and updates applied according
+to the indices.
+END
+  }
+  summary: "Scatter `updates` into a new (initially zero) tensor according to `indices`."
+  description: <<END
+Creates a new tensor by applying sparse `updates` to individual
+values or slices within a zero tensor of the given `shape` according to
+indices.  This operator is the inverse of the @{tf.gather_nd} operator which
+extracts values or slices from a given tensor.
+
+**WARNING**: The order in which updates are applied is nondeterministic, so the
+output will be nondeterministic if `indices` contains duplicates.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of scatter is to insert individual elements in a tensor by
+index. For example, say we want to insert 4 scattered elements in a rank-1
+tensor with 8 elements.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+</div>
+
+In Python, this scatter operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    shape = tf.constant([8])
+    scatter = tf.scatter_nd(indices, updates, shape)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [0, 11, 0, 10, 9, 0, 0, 12]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
+</div>
+
+In Python, this scatter operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    shape = tf.constant([4, 4, 4])
+    scatter = tf.scatter_nd(indices, updates, shape)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0665ebf0e0ff6f8be34fb134e1b0d1adfa74eba
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "ScatterNdAdd"
+  in_arg {
+    name: "ref"
+    description: <<END
+A mutable Tensor. Should be from a Variable node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated values
+to add to ref.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+Same as ref. Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse addition between `updates` and individual values or slices"
+  description: <<END
+within a given variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+elements. In Python, that addition would look like this:
+
+    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    add = tf.scatter_nd_add(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(add)
+
+The resulting update to ref would look like this:
+
+    [1, 13, 3, 14, 14, 6, 7, 20]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5c64c2b900773d4ad9975f05f76453c1b8bf0df
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdNonAliasingAdd.pbtxt
@@ -0,0 +1,68 @@
+op {
+  graph_op_name: "ScatterNdNonAliasingAdd"
+  in_arg {
+    name: "input"
+    description: <<END
+A Tensor.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: `int32`, `int64`.
+A tensor of indices into `input`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated values
+to add to `input`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A `Tensor` with the same shape as `input`, containing values of `input`
+updated with `updates`.
+END
+  }
+  summary: "Applies sparse addition to `input` using individual values or slices"
+  description: <<END
+from `updates` according to indices `indices`.  The updates are non-aliasing:
+`input` is only modified in-place if no other operations will use it.
+Otherwise, a copy of `input` is made.  This operation has a gradient with
+respect to both `input` and `updates`.
+
+`input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `input`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+(if `K < P`) along the `K`th dimension of `input`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
+```
+
+For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+elements. In Python, that addition would look like this:
+
+    input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(output))
+
+The resulting value `output` would look like this:
+
+    [1, 13, 3, 14, 14, 6, 7, 20]
+
+See @{tf.scatter_nd} for more details about how to make updates to slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..333db017f56a47a2e3300c508da08caebe33a4f4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "ScatterNdSub"
+  in_arg {
+    name: "ref"
+    description: <<END
+A mutable Tensor. Should be from a Variable node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated values
+to subtract from ref.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+Same as ref. Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse subtraction between `updates` and individual values or slices"
+  description: <<END
+within a given variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+with 8 elements. In Python, that subtraction would look like this:
+
+    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    sub = tf.scatter_nd_sub(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(sub)
+
+The resulting update to ref would look like this:
+
+    [1, -9, 3, -6, -4, 6, 7, -4]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33d98262d54da6d50dbb0659cb73fd47cf9f13d2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdUpdate.pbtxt
@@ -0,0 +1,76 @@
+op {
+  graph_op_name: "ScatterNdUpdate"
+  in_arg {
+    name: "ref"
+    description: <<END
+A mutable Tensor. Should be from a Variable node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of updated
+values to add to ref.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+Same as ref. Returned as a convenience for operations that want to
+use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse `updates` to individual values or slices within a given"
+  description: <<END
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to update 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_update(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 11, 3, 10, 9, 6, 7, 12]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0d3a4a1337ee1e1a32114adc51c930e014bc268
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterSub.pbtxt
@@ -0,0 +1,60 @@
+op {
+  graph_op_name: "ScatterSub"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to subtract from `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Subtracts sparse updates to a variable reference."
+  description: <<END
+```python
+    # Scalar indices
+    ref[indices, ...] -= updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] -= updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+```
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+Duplicate entries are handled correctly: if multiple `indices` reference
+the same location, their (negated) contributions add.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterSub.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c44dbbd2332828242792d9cdd4a218e7457c7d2b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterUpdate.pbtxt
@@ -0,0 +1,63 @@
+op {
+  graph_op_name: "ScatterUpdate"
+  in_arg {
+    name: "ref"
+    description: <<END
+Should be from a `Variable` node.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A tensor of indices into the first dimension of `ref`.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A tensor of updated values to store in `ref`.
+END
+  }
+  out_arg {
+    name: "output_ref"
+    description: <<END
+= Same as `ref`.  Returned as a convenience for operations that want
+to use the updated values after the update is done.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the assignment will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse updates to a variable reference."
+  description: <<END
+This operation computes
+
+```python
+    # Scalar indices
+    ref[indices, ...] = updates[...]
+
+    # Vector indices (for each i)
+    ref[indices[i], ...] = updates[i, ...]
+
+    # High rank indices (for each i, ..., j)
+    ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+```
+
+This operation outputs `ref` after the update is done.
+This makes it easier to chain operations that need to use the reset value.
+
+If values in `ref` is to be updated more than once, because there are
+duplicate entries in `indices`, the order at which the updates happen
+for each value is undefined.
+
+Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterUpdate.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SdcaFprint.pbtxt b/tensorflow/core/api_def/base_api/api_def_SdcaFprint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..829840d04aaf089b75e1ce1940f06c991e93c615
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SdcaFprint.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "SdcaFprint"
+  in_arg {
+    name: "input"
+    description: <<END
+vector of strings to compute fingerprints on.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+a (N,2) shaped matrix where N is the number of elements in the input
+vector. Each row contains the low and high parts of the fingerprint.
+END
+  }
+  summary: "Computes fingerprints of the input strings."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt b/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0b58ac00e6709922ed517ad2c9efebbedf450a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SdcaOptimizer.pbtxt
@@ -0,0 +1,167 @@
+op {
+  graph_op_name: "SdcaOptimizer"
+  in_arg {
+    name: "sparse_example_indices"
+    description: <<END
+a list of vectors which contain example indices.
+END
+  }
+  in_arg {
+    name: "sparse_feature_indices"
+    description: <<END
+a list of vectors which contain feature indices.
+END
+  }
+  in_arg {
+    name: "sparse_feature_values"
+    description: <<END
+a list of vectors which contains feature value
+associated with each feature group.
+END
+  }
+  in_arg {
+    name: "dense_features"
+    description: <<END
+a list of matrices which contains the dense feature values.
+END
+  }
+  in_arg {
+    name: "example_weights"
+    description: <<END
+a vector which contains the weight associated with each
+example.
+END
+  }
+  in_arg {
+    name: "example_labels"
+    description: <<END
+a vector which contains the label/target associated with each
+example.
+END
+  }
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+a list of vectors where each value is the indices which has
+corresponding weights in sparse_weights. This field maybe omitted for the
+dense approach.
+END
+  }
+  in_arg {
+    name: "sparse_weights"
+    description: <<END
+a list of vectors where each value is the weight associated with
+a sparse feature group.
+END
+  }
+  in_arg {
+    name: "dense_weights"
+    description: <<END
+a list of vectors where the values are the weights associated
+with a dense feature group.
+END
+  }
+  in_arg {
+    name: "example_state_data"
+    description: <<END
+a list of vectors containing the example state data.
+END
+  }
+  out_arg {
+    name: "out_example_state_data"
+    description: <<END
+a list of vectors containing the updated example state
+data.
+END
+  }
+  out_arg {
+    name: "out_delta_sparse_weights"
+    description: <<END
+a list of vectors where each value is the delta
+weights associated with a sparse feature group.
+END
+  }
+  out_arg {
+    name: "out_delta_dense_weights"
+    description: <<END
+a list of vectors where the values are the delta
+weights associated with a dense feature group.
+END
+  }
+  attr {
+    name: "loss_type"
+    description: <<END
+Type of the primal loss. Currently SdcaSolver supports logistic,
+squared and hinge losses.
+END
+  }
+  attr {
+    name: "adaptative"
+    description: <<END
+Whether to use Adapative SDCA for the inner loop.
+END
+  }
+  attr {
+    name: "num_sparse_features"
+    description: <<END
+Number of sparse feature groups to train on.
+END
+  }
+  attr {
+    name: "num_sparse_features_with_values"
+    description: <<END
+Number of sparse feature groups with values
+associated with it, otherwise implicitly treats values as 1.0.
+END
+  }
+  attr {
+    name: "num_dense_features"
+    description: <<END
+Number of dense feature groups to train on.
+END
+  }
+  attr {
+    name: "l1"
+    description: <<END
+Symmetric l1 regularization strength.
+END
+  }
+  attr {
+    name: "l2"
+    description: <<END
+Symmetric l2 regularization strength.
+END
+  }
+  attr {
+    name: "num_loss_partitions"
+    description: <<END
+Number of partitions of the global loss function.
+END
+  }
+  attr {
+    name: "num_inner_iterations"
+    description: <<END
+Number of iterations per mini-batch.
+END
+  }
+  summary: "Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for"
+  description: <<END
+linear models with L1 + L2 regularization. As global optimization objective is
+strongly-convex, the optimizer optimizes the dual objective at each step. The
+optimizer applies each update one example at a time. Examples are sampled
+uniformly, and the optimizer is learning rate free and enjoys linear convergence
+rate.
+
+[Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+Shai Shalev-Shwartz, Tong Zhang. 2012
+
+$$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+
+[Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+Peter Richtarik, Martin Takac. 2015
+
+[Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SdcaShrinkL1.pbtxt b/tensorflow/core/api_def/base_api/api_def_SdcaShrinkL1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e723c169d91d345931cbed2c4a68a7e45faf8a4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SdcaShrinkL1.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "SdcaShrinkL1"
+  in_arg {
+    name: "weights"
+    description: <<END
+a list of vectors where each value is the weight associated with a
+feature group.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+Number of feature groups to apply shrinking step.
+END
+  }
+  attr {
+    name: "l1"
+    description: <<END
+Symmetric l1 regularization strength.
+END
+  }
+  attr {
+    name: "l2"
+    description: <<END
+Symmetric l2 regularization strength. Should be a positive float.
+END
+  }
+  summary: "Applies L1 regularization shrink step on the parameters."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db890cb2f51256fd9dabaa8aa590ccde37eec343
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SegmentMax"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.  Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the maximum along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Computes a tensor such that
+\\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+that `segment_ids[j] == i`.
+
+If the max is empty for a given segment ID `i`, `output[i] = 0`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4713c523102a66204bcd4b0480e194ec5d14a420
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "SegmentMean"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.  Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the mean along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Computes a tensor such that
+\\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+over `j` such that `segment_ids[j] == i` and `N` is the total number of
+values summed.
+
+If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6316bfd1a5779ca28b4437c0324844b98e819e1a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SegmentMin"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.  Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the minimum along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Computes a tensor such that
+\\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+that `segment_ids[j] == i`.
+
+If the min is empty for a given segment ID `i`, `output[i] = 0`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a16d03d467e9ac70e0752f29e042d50e878114b5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SegmentProd"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.  Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the product along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Computes a tensor such that
+\\(output_i = \prod_j data_j\\) where the product is over `j` such
+that `segment_ids[j] == i`.
+
+If the product is empty for a given segment ID `i`, `output[i] = 1`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0686e17f9bdeb09076157fd664ddf58766c22560
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SegmentSum"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.  Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the sum along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Computes a tensor such that
+\\(output_i = \sum_j data_j\\) where sum is over `j` such
+that `segment_ids[j] == i`.
+
+If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Select.pbtxt b/tensorflow/core/api_def/base_api/api_def_Select.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..456ea8c01e34ec56917f314ada14dac7ebafbf3c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Select.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "Select"
+  endpoint {
+    name: "Where3"
+  }
+  in_arg {
+    name: "t"
+    rename_to: "x"
+    description: <<END
+= A `Tensor` which may have the same shape as `condition`.
+If `condition` is rank 1, `t` may have higher rank,
+but its first dimension must match the size of `condition`.
+END
+  }
+  in_arg {
+    name: "e"
+    rename_to: "y"
+    description: <<END
+= A `Tensor` with the same type and shape as `t`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+= A `Tensor` with the same type and shape as `t` and `e`.
+END
+  }
+  summary: "Selects elements from `t` or `e`, depending on `condition`."
+  description: <<END
+The `t`, and `e` tensors must all have the same shape, and the
+output will also have that shape.
+
+The `condition` tensor must be a scalar if `t` and `e` are scalars.
+If `t` and `e` are vectors or higher rank, then `condition` must be either a
+scalar, a vector with size matching the first dimension of `t`, or must have
+the same shape as `t`.
+
+The `condition` tensor acts as a mask that chooses, based on the value at each
+element, whether the corresponding element / row in the output should be
+taken from `t` (if true) or `e` (if false).
+
+If `condition` is a vector and `t` and `e` are higher rank matrices, then
+it chooses which row (outer dimension) to copy from `t` and `e`.
+If `condition` has the same shape as `t` and `e`, then it chooses which
+element to copy from `t` and `e`.
+
+For example:
+
+```python
+# 'condition' tensor is [[True,  False]
+#                        [False, True]]
+# 't' is [[1, 2],
+#         [3, 4]]
+# 'e' is [[5, 6],
+#         [7, 8]]
+select(condition, t, e)  # => [[1, 6], [7, 4]]
+
+
+# 'condition' tensor is [True, False]
+# 't' is [[1, 2],
+#         [3, 4]]
+# 'e' is [[5, 6],
+#         [7, 8]]
+select(condition, t, e) ==> [[1, 2],
+                             [7, 8]]
+
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51d63eeb5695d6a428e990ba43e54102db58b58e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEig.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "SelfAdjointEig"
+  in_arg {
+    name: "input"
+    description: <<END
+Shape is `[..., M, M]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M+1, M]`.
+END
+  }
+  summary: "Computes the Eigen Decomposition of a batch of square self-adjoint matrices."
+  description: <<END
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices, with the same constraints as the single matrix
+SelfAdjointEig.
+
+The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a5e1252586ea8b3e03b2545e0d8646288ddc408
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SelfAdjointEigV2.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "SelfAdjointEigV2"
+  endpoint {
+    name: "SelfAdjointEig"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+`Tensor` input of shape `[N, N]`.
+END
+  }
+  out_arg {
+    name: "e"
+    description: <<END
+Eigenvalues. Shape is `[N]`.
+END
+  }
+  out_arg {
+    name: "v"
+    description: <<END
+Eigenvectors. Shape is `[N, N]`.
+END
+  }
+  attr {
+    name: "compute_v"
+    description: <<END
+If `True` then eigenvectors will be computed and returned in `v`.
+Otherwise, only the eigenvalues will be computed.
+END
+  }
+  summary: "Computes the eigen decomposition of one or more square self-adjoint matrices."
+  description: <<END
+Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+`input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+
+```python
+# a is a tensor.
+# e is a tensor of eigenvalues.
+# v is a tensor of eigenvectors.
+e, v = self_adjoint_eig(a)
+e = self_adjoint_eig(a, compute_v=False)
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbe76de415125663ff47d3f0fac99f27ad029086
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "Selu"
+  summary: "Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`"
+  description: <<END
+if < 0, `scale * features` otherwise.
+
+See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SeluGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SeluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5180b73d299eeaf7cbe09493c129f68dee295f3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SeluGrad.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "SeluGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding Selu operation.
+END
+  }
+  in_arg {
+    name: "outputs"
+    description: <<END
+The outputs of the corresponding Selu operation.
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+The gradients: `gradients * (outputs + scale * alpha)`
+if outputs < 0, `scale * gradients` otherwise.
+END
+  }
+  summary: "Computes gradients for the scaled exponential linear (Selu) operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeIterator.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e24b122006980e0f99e8a76cc317b1f45cec1d68
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeIterator.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "SerializeIterator"
+  in_arg {
+    name: "resource_handle"
+    description: <<END
+A handle to an iterator resource.
+END
+  }
+  out_arg {
+    name: "serialized"
+    description: <<END
+A variant tensor storing the state of the iterator contained in the
+resource.
+END
+  }
+  summary: "Converts the given `resource_handle` representing an iterator to a variant tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d46b4b20eeb58ef1cc261372d69acfe5a70668fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeManySparse.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "SerializeManySparse"
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+2-D.  The `indices` of the minibatch `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_values"
+    description: <<END
+1-D.  The `values` of the minibatch `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_shape"
+    description: <<END
+1-D.  The `shape` of the minibatch `SparseTensor`.
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The `dtype` to use for serialization; the supported types are `string`
+(default) and `variant`.
+END
+  }
+  summary: "Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object."
+  description: <<END
+The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+is treated as the minibatch dimension.  Elements of the `SparseTensor`
+must be sorted in increasing order of this first dimension.  The serialized
+`SparseTensor` objects going into each row of `serialized_sparse` will have
+rank `R-1`.
+
+The minibatch size `N` is extracted from `sparse_shape[0]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..491f69fda088edb8a051b81e65d581094823ca5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeSparse.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "SerializeSparse"
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+2-D.  The `indices` of the `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_values"
+    description: <<END
+1-D.  The `values` of the `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "sparse_shape"
+    description: <<END
+1-D.  The `shape` of the `SparseTensor`.
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The `dtype` to use for serialization; the supported types are `string`
+(default) and `variant`.
+END
+  }
+  summary: "Serialize a `SparseTensor` into a `[3]` `Tensor` object."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SerializeTensor.pbtxt b/tensorflow/core/api_def/base_api/api_def_SerializeTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48f7ba7aa14823d1eaa02bc100aed8d17298d9c8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SerializeTensor.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "SerializeTensor"
+  in_arg {
+    name: "tensor"
+    description: <<END
+A Tensor of type `T`.
+END
+  }
+  out_arg {
+    name: "serialized"
+    description: <<END
+A serialized TensorProto proto of the input tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of the input tensor.
+END
+  }
+  summary: "Transforms a Tensor into a serialized TensorProto proto."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SetSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..812537412e20c800042e268c04d55aa4c32912cd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SetSize.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "SetSize"
+  in_arg {
+    name: "set_indices"
+    description: <<END
+2D `Tensor`, indices of a `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "set_values"
+    description: <<END
+1D `Tensor`, values of a `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "set_shape"
+    description: <<END
+1D `Tensor`, shape of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+`n-1` dimensions as `set`. Each value is the number of unique elements in
+the corresponding `[0...n-1]` dimension of `set`.
+END
+  }
+  summary: "Number of unique elements along last dimension of input `set`."
+  description: <<END
+Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+and `set_shape`. The last dimension contains values in a set, duplicates are
+allowed but ignored.
+
+If `validate_indices` is `True`, this op validates the order and range of `set`
+indices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Shape.pbtxt b/tensorflow/core/api_def/base_api/api_def_Shape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4efb5384e09efd13045597a0187930ba6a8aa67a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Shape.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "Shape"
+  summary: "Returns the shape of a tensor."
+  description: <<END
+This operation returns a 1-D integer tensor representing the shape of `input`.
+
+For example:
+
+```
+# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+shape(t) ==> [2, 2, 3]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShapeN.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShapeN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa38320f9b42de2429679cb099baa9fdc5db6f02
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShapeN.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "ShapeN"
+  summary: "Returns shape of tensors."
+  description: <<END
+This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShardedFilename.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShardedFilename.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11d1352918630044b4adac1423700f9c9967d328
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShardedFilename.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "ShardedFilename"
+  summary: "Generate a sharded filename. The filename is printf formatted as"
+  description: <<END
+   %s-%05d-of-%05d, basename, shard, num_shards.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShardedFilespec.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShardedFilespec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ecf0a091e2070a444ec8c1bf788142a4e3db39c0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShardedFilespec.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShardedFilespec"
+  summary: "Generate a glob pattern matching all sharded file names."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b12d3af9d74411fb46fb50d7dba57b7e60bbe933
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ShuffleDataset"
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+The number of output elements to buffer in an iterator over
+this dataset. Compare with the `min_after_dequeue` attr when creating a
+`RandomShuffleQueue`.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+A scalar seed for the random number generator. If either seed or
+seed2 is set to be non-zero, the random number generator is seeded
+by the given seed.  Otherwise, a random seed is used.
+END
+  }
+  in_arg {
+    name: "seed2"
+    description: <<END
+A second scalar seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    description: <<END
+If true, each iterator over this dataset will be given
+a different pseudorandomly generated seed, based on a sequence seeded by the
+`seed` and `seed2` inputs. If false, each iterator will be given the same
+seed, and repeated iteration over this dataset will yield the exact same
+sequence of results.
+END
+  }
+  summary: "Creates a dataset that shuffles elements from `input_dataset` pseudorandomly."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sigmoid.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sigmoid.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..300ab0cde6980b6c4262f7809ac74f41a1080e3f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sigmoid.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Sigmoid"
+  summary: "Computes sigmoid of `x` element-wise."
+  description: <<END
+Specifically, `y = 1 / (1 + exp(-x))`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SigmoidGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SigmoidGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..911d6c5eeeb9b202f4192e4abb12c9ea5373c096
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SigmoidGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "SigmoidGrad"
+  visibility: HIDDEN
+  summary: "Computes the gradient of the sigmoid of `x` wrt its input."
+  description: <<END
+Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+`dy` is the corresponding input gradient.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4eb4be1a7580a7d51733512508488feb9d934342
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sign.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "Sign"
+  summary: "Returns an element-wise indication of the sign of a number."
+  description: <<END
+`y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+
+For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sin.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4edefb66d461de495c296cf417a528d65993d90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sin.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sin"
+  summary: "Computes sin of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sinh.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sinh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6784e8a59e87b5bfc9e760631cc1e95fff18983
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sinh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sinh"
+  summary: "Computes hyperbolic sine of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Size.pbtxt b/tensorflow/core/api_def/base_api/api_def_Size.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e6cb330859f5e33e8b91ba7b871545676e95234
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Size.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "Size"
+  summary: "Returns the size of a tensor."
+  description: <<END
+This operation returns an integer representing the number of elements in
+`input`.
+
+For example:
+
+```
+# 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+size(t) ==> 12
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44e5bac79b8cdfb703d8679b66d79ab9e9e7509a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "SkipDataset"
+  in_arg {
+    name: "count"
+    description: <<END
+A scalar representing the number of elements from the `input_dataset`
+that should be skipped.  If count is -1, skips everything.
+END
+  }
+  summary: "Creates a dataset that skips `count` elements from the `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Skipgram.pbtxt b/tensorflow/core/api_def/base_api/api_def_Skipgram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6829540177efc4239d96bbc467af9005e7ad178
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Skipgram.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "Skipgram"
+  out_arg {
+    name: "vocab_word"
+    description: <<END
+A vector of words in the corpus.
+END
+  }
+  out_arg {
+    name: "vocab_freq"
+    description: <<END
+Frequencies of words. Sorted in the non-ascending order.
+END
+  }
+  out_arg {
+    name: "words_per_epoch"
+    description: <<END
+Number of words per epoch in the data file.
+END
+  }
+  out_arg {
+    name: "current_epoch"
+    description: <<END
+The current epoch number.
+END
+  }
+  out_arg {
+    name: "total_words_processed"
+    description: <<END
+The total number of words processed so far.
+END
+  }
+  out_arg {
+    name: "examples"
+    description: <<END
+A vector of word ids.
+END
+  }
+  out_arg {
+    name: "labels"
+    description: <<END
+A vector of word ids.
+END
+  }
+  attr {
+    name: "filename"
+    description: <<END
+The corpus's text file name.
+END
+  }
+  attr {
+    name: "batch_size"
+    description: <<END
+The size of produced batch.
+END
+  }
+  attr {
+    name: "window_size"
+    description: <<END
+The number of words to predict to the left and right of the target.
+END
+  }
+  attr {
+    name: "min_count"
+    description: <<END
+The minimum number of word occurrences for it to be included in the
+vocabulary.
+END
+  }
+  attr {
+    name: "subsample"
+    description: <<END
+Threshold for word occurrence. Words that appear with higher
+frequency will be randomly down-sampled. Set to 0 to disable.
+END
+  }
+  summary: "Parses a text file and creates a batch of examples."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Slice.pbtxt b/tensorflow/core/api_def/base_api/api_def_Slice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd6ad26d1b3dd3ae4190e710ca4ee2ba62846242
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Slice.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "Slice"
+  in_arg {
+    name: "begin"
+    description: <<END
+begin[i] specifies the offset into the 'i'th dimension of
+'input' to slice from.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+size[i] specifies the number of elements of the 'i'th dimension
+of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+i are included in the slice (i.e. this is equivalent to setting
+size[i] = input.dim_size(i) - begin[i]).
+END
+  }
+  summary: "Return a slice from \'input\'."
+  description: <<END
+The output tensor is a tensor with dimensions described by 'size'
+whose values are extracted from 'input' starting at the offsets in
+'begin'.
+
+*Requirements*:
+  0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49b7f5798cd58d7c96c9b0a582a6d79df4dab5a6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Snapshot"
+  summary: "Returns a copy of the input tensor."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Softmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_Softmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43884824c9e6e65491e51c6953f2e35eb19bd634
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Softmax.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "Softmax"
+  in_arg {
+    name: "logits"
+    description: <<END
+2-D with shape `[batch_size, num_classes]`.
+END
+  }
+  out_arg {
+    name: "softmax"
+    description: <<END
+Same shape as `logits`.
+END
+  }
+  summary: "Computes softmax activations."
+  description: <<END
+For each batch `i` and class `j` we have
+
+    softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/base_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..973fbb8f6c4469429446562a4db3ae89fdc75b28
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "SoftmaxCrossEntropyWithLogits"
+  in_arg {
+    name: "features"
+    description: <<END
+batch_size x num_classes matrix
+END
+  }
+  in_arg {
+    name: "labels"
+    description: <<END
+batch_size x num_classes matrix
+The caller must ensure that each batch of labels represents a valid
+probability distribution.
+END
+  }
+  out_arg {
+    name: "loss"
+    description: <<END
+Per example loss (batch_size vector).
+END
+  }
+  out_arg {
+    name: "backprop"
+    description: <<END
+backpropagated gradients (batch_size x num_classes matrix).
+END
+  }
+  summary: "Computes softmax cross entropy cost and gradients to backpropagate."
+  description: <<END
+Inputs are the logits, not probabilities.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Softplus.pbtxt b/tensorflow/core/api_def/base_api/api_def_Softplus.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83f6aad87759620fa061d9cffd94b7ccd9a20b20
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Softplus.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Softplus"
+  summary: "Computes softplus: `log(exp(features) + 1)`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SoftplusGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SoftplusGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96e4d8cb5ad36ddcc2c3730f6efeb8066da13bd4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SoftplusGrad.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "SoftplusGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding softplus operation.
+END
+  }
+  in_arg {
+    name: "features"
+    description: <<END
+The features passed as input to the corresponding softplus operation.
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+The gradients: `gradients / (1 + exp(-features))`.
+END
+  }
+  summary: "Computes softplus gradients for a softplus operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Softsign.pbtxt b/tensorflow/core/api_def/base_api/api_def_Softsign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ae451ec44ff55fdf7e2165a7b6b64dab100c4ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Softsign.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Softsign"
+  summary: "Computes softsign: `features / (abs(features) + 1)`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SoftsignGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SoftsignGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23696f12a1adad25ad1df4c0dcc434e76e206078
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SoftsignGrad.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "SoftsignGrad"
+  visibility: HIDDEN
+  in_arg {
+    name: "gradients"
+    description: <<END
+The backpropagated gradients to the corresponding softsign operation.
+END
+  }
+  in_arg {
+    name: "features"
+    description: <<END
+The features passed as input to the corresponding softsign operation.
+END
+  }
+  out_arg {
+    name: "backprops"
+    description: <<END
+The gradients: `gradients / (1 + abs(features)) ** 2`.
+END
+  }
+  summary: "Computes softsign gradients for a softsign operation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SpaceToBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_SpaceToBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de6182807a74132f743cf40dea8c6c6633ac7f13
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SpaceToBatch.pbtxt
@@ -0,0 +1,109 @@
+op {
+  graph_op_name: "SpaceToBatch"
+  in_arg {
+    name: "input"
+    description: <<END
+4-D with shape `[batch, height, width, depth]`.
+END
+  }
+  in_arg {
+    name: "paddings"
+    description: <<END
+2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+  the padding of the input with zeros across the spatial dimensions as follows:
+
+      paddings = [[pad_top, pad_bottom], [pad_left, pad_right]]
+
+  The effective spatial dimensions of the zero-padded input tensor will be:
+
+      height_pad = pad_top + height + pad_bottom
+      width_pad = pad_left + width + pad_right
+
+The attr `block_size` must be greater than one. It indicates the block size.
+
+  * Non-overlapping blocks of size `block_size x block size` in the height and
+    width dimensions are rearranged into the batch dimension at each location.
+  * The batch of the output tensor is `batch * block_size * block_size`.
+  * Both height_pad and width_pad must be divisible by block_size.
+
+The shape of the output will be:
+
+    [batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+     depth]
+
+Some examples:
+
+(1) For the following input of shape `[1, 2, 2, 1]` and block_size of 2:
+
+```
+x = [[[[1], [2]], [[3], [4]]]]
+```
+
+The output tensor has shape `[4, 1, 1, 1]` and value:
+
+```
+[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+```
+
+(2) For the following input of shape `[1, 2, 2, 3]` and block_size of 2:
+
+```
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+The output tensor has shape `[4, 1, 1, 3]` and value:
+
+```
+[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+```
+
+(3) For the following input of shape `[1, 4, 4, 1]` and block_size of 2:
+
+```
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]],
+      [[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+The output tensor has shape `[4, 2, 2, 1]` and value:
+
+```
+x = [[[[1], [3]], [[9], [11]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+
+(4) For the following input of shape `[2, 2, 4, 1]` and block_size of 2:
+
+```
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]]],
+     [[[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+The output tensor has shape `[8, 1, 2, 1]` and value:
+
+```
+x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+     [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+```
+
+Among others, this operation is useful for reducing atrous convolution into
+regular convolution.
+END
+  }
+  summary: "SpaceToBatch for 4-D tensors of type T."
+  description: <<END
+This is a legacy version of the more general SpaceToBatchND.
+
+Zero-pads and then rearranges (permutes) blocks of spatial data into batch.
+More specifically, this op outputs a copy of the input tensor where values from
+the `height` and `width` dimensions are moved to the `batch` dimension. After
+the zero-padding, both `height` and `width` of the input must be divisible by the
+block size.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SpaceToBatchND.pbtxt b/tensorflow/core/api_def/base_api/api_def_SpaceToBatchND.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c5e33791984b2e5dba336b43d967b14120b4ae7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SpaceToBatchND.pbtxt
@@ -0,0 +1,140 @@
+op {
+  graph_op_name: "SpaceToBatchND"
+  in_arg {
+    name: "input"
+    description: <<END
+N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+where spatial_shape has `M` dimensions.
+END
+  }
+  in_arg {
+    name: "block_shape"
+    description: <<END
+1-D with shape `[M]`, all values must be >= 1.
+END
+  }
+  in_arg {
+    name: "paddings"
+    description: <<END
+2-D with shape `[M, 2]`, all values must be >= 0.
+  `paddings[i] = [pad_start, pad_end]` specifies the padding for input dimension
+  `i + 1`, which corresponds to spatial dimension `i`.  It is required that
+  `block_shape[i]` divides `input_shape[i + 1] + pad_start + pad_end`.
+
+This operation is equivalent to the following steps:
+
+1. Zero-pad the start and end of dimensions `[1, ..., M]` of the
+   input according to `paddings` to produce `padded` of shape `padded_shape`.
+
+2. Reshape `padded` to `reshaped_padded` of shape:
+
+     [batch] +
+     [padded_shape[1] / block_shape[0],
+       block_shape[0],
+      ...,
+      padded_shape[M] / block_shape[M-1],
+      block_shape[M-1]] +
+     remaining_shape
+
+3. Permute dimensions of `reshaped_padded` to produce
+   `permuted_reshaped_padded` of shape:
+
+     block_shape +
+     [batch] +
+     [padded_shape[1] / block_shape[0],
+      ...,
+      padded_shape[M] / block_shape[M-1]] +
+     remaining_shape
+
+4. Reshape `permuted_reshaped_padded` to flatten `block_shape` into the batch
+   dimension, producing an output tensor of shape:
+
+     [batch * prod(block_shape)] +
+     [padded_shape[1] / block_shape[0],
+      ...,
+      padded_shape[M] / block_shape[M-1]] +
+     remaining_shape
+
+Some examples:
+
+(1) For the following input of shape `[1, 2, 2, 1]`, `block_shape = [2, 2]`, and
+    `paddings = [[0, 0], [0, 0]]`:
+
+```
+x = [[[[1], [2]], [[3], [4]]]]
+```
+
+The output tensor has shape `[4, 1, 1, 1]` and value:
+
+```
+[[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+```
+
+(2) For the following input of shape `[1, 2, 2, 3]`, `block_shape = [2, 2]`, and
+    `paddings = [[0, 0], [0, 0]]`:
+
+```
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+The output tensor has shape `[4, 1, 1, 3]` and value:
+
+```
+[[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+```
+
+(3) For the following input of shape `[1, 4, 4, 1]`, `block_shape = [2, 2]`, and
+    `paddings = [[0, 0], [0, 0]]`:
+
+```
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]],
+      [[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+The output tensor has shape `[4, 2, 2, 1]` and value:
+
+```
+x = [[[[1], [3]], [[9], [11]]],
+     [[[2], [4]], [[10], [12]]],
+     [[[5], [7]], [[13], [15]]],
+     [[[6], [8]], [[14], [16]]]]
+```
+
+(4) For the following input of shape `[2, 2, 4, 1]`, block_shape = `[2, 2]`, and
+    paddings = `[[0, 0], [2, 0]]`:
+
+```
+x = [[[[1],   [2],  [3],  [4]],
+      [[5],   [6],  [7],  [8]]],
+     [[[9],  [10], [11],  [12]],
+      [[13], [14], [15],  [16]]]]
+```
+
+The output tensor has shape `[8, 1, 3, 1]` and value:
+
+```
+x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+     [[[0], [2], [4]]], [[[0], [10], [12]]],
+     [[[0], [5], [7]]], [[[0], [13], [15]]],
+     [[[0], [6], [8]]], [[[0], [14], [16]]]]
+```
+
+Among others, this operation is useful for reducing atrous convolution into
+regular convolution.
+END
+  }
+  summary: "SpaceToBatch for N-D tensors of type T."
+  description: <<END
+This operation divides "spatial" dimensions `[1, ..., M]` of the input into a
+grid of blocks of shape `block_shape`, and interleaves these blocks with the
+"batch" dimension (0) such that in the output, the spatial dimensions
+`[1, ..., M]` correspond to the position within the grid, and the batch
+dimension combines both the position within a spatial block and the original
+batch position.  Prior to division into blocks, the spatial dimensions of the
+input are optionally zero padded according to `paddings`.  See below for a
+precise description.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt b/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b808ff5f9cf9072bdb95e779589668160d909b8f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SpaceToDepth.pbtxt
@@ -0,0 +1,95 @@
+op {
+  graph_op_name: "SpaceToDepth"
+  attr {
+    name: "block_size"
+    description: <<END
+The size of the spatial block.
+END
+  }
+  summary: "SpaceToDepth for tensors of type T."
+  description: <<END
+Rearranges blocks of spatial data, into depth. More specifically,
+this op outputs a copy of the input tensor where values from the `height`
+and `width` dimensions are moved to the `depth` dimension.
+The attr `block_size` indicates the input block size.
+
+  * Non-overlapping blocks of size `block_size x block size` are rearranged
+    into depth at each location.
+  * The depth of the output tensor is `block_size * block_size * input_depth`.
+  * The Y, X coordinates within each block of the input become the high order
+    component of the output channel index.
+  * The input tensor's height and width must be divisible by block_size.
+
+The `data_format` attr specifies the layout of the input and output tensors
+with the following options:
+  "NHWC": `[ batch, height, width, channels ]`
+  "NCHW": `[ batch, channels, height, width ]`
+  "NCHW_VECT_C":
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
+
+It is useful to consider the operation as transforming a 6-D Tensor.
+e.g. for data_format = NHWC,
+     Each element in the input tensor can be specified via 6 coordinates,
+     ordered by decreasing memory layout significance as:
+     n,oY,bY,oX,bX,iC  (where n=batch index, oX, oY means X or Y coordinates
+                        within the output image, bX, bY means coordinates
+                        within the input block, iC means input channels).
+     The output would be a transpose to the following layout:
+     n,oY,oX,bY,bX,iC
+
+This operation is useful for resizing the activations between convolutions
+(but keeping all data), e.g. instead of pooling. It is also useful for training
+purely convolutional models.
+
+For example, given an input of shape `[1, 2, 2, 1]`, data_format = "NHWC" and
+block_size = 2:
+
+```
+x = [[[[1], [2]],
+      [[3], [4]]]]
+```
+
+This operation will output a tensor of shape `[1, 1, 1, 4]`:
+
+```
+[[[[1, 2, 3, 4]]]]
+```
+
+Here, the input has a batch of 1 and each batch element has shape `[2, 2, 1]`,
+the corresponding output will have a single element (i.e. width and height are
+both 1) and will have a depth of 4 channels (1 * block_size * block_size).
+The output element shape is `[1, 1, 4]`.
+
+For an input tensor with larger depth, here of shape `[1, 2, 2, 3]`, e.g.
+
+```
+x = [[[[1, 2, 3], [4, 5, 6]],
+      [[7, 8, 9], [10, 11, 12]]]]
+```
+
+This operation, for block_size of 2, will return the following tensor of shape
+`[1, 1, 1, 12]`
+
+```
+[[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+```
+
+Similarly, for the following input of shape `[1 4 4 1]`, and a block size of 2:
+
+```
+x = [[[[1],   [2],  [5],  [6]],
+      [[3],   [4],  [7],  [8]],
+      [[9],  [10], [13],  [14]],
+      [[11], [12], [15],  [16]]]]
+```
+
+the operator will return the following tensor of shape `[1 2 2 4]`:
+
+```
+x = [[[[1, 2, 3, 4],
+       [5, 6, 7, 8]],
+      [[9, 10, 11, 12],
+       [13, 14, 15, 16]]]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseAccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseAccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11c49805879a443375391a883acf9a0e88fa0692
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseAccumulatorApplyGradient.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "SparseAccumulatorApplyGradient"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a accumulator.
+END
+  }
+  in_arg {
+    name: "local_step"
+    description: <<END
+The local_step value at which the sparse gradient was computed.
+END
+  }
+  in_arg {
+    name: "gradient_indices"
+    description: <<END
+Indices of the sparse gradient to be accumulated. Must be a
+vector.
+END
+  }
+  in_arg {
+    name: "gradient_values"
+    description: <<END
+Values are the non-zero slices of the gradient, and must have
+the same first dimension as indices, i.e., the nnz represented by indices and
+values must be consistent.
+END
+  }
+  in_arg {
+    name: "gradient_shape"
+    description: <<END
+Shape of the sparse gradient to be accumulated.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The data type of accumulated gradients. Needs to correspond to the type
+of the accumulator.
+END
+  }
+  attr {
+    name: "has_known_shape"
+    description: <<END
+Boolean indicating whether gradient_shape is unknown, in which
+case the input is ignored during validation.
+END
+  }
+  summary: "Applies a sparse gradient to a given accumulator."
+  description: <<END
+Does not add if local_step is smaller than the accumulator's
+global_step.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseAccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseAccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..725bbaf5018ca960fd5b544eb5199975311e2246
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseAccumulatorTakeGradient.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "SparseAccumulatorTakeGradient"
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a SparseConditionalAccumulator.
+END
+  }
+  in_arg {
+    name: "num_required"
+    description: <<END
+Number of gradients required before we return an aggregate.
+END
+  }
+  out_arg {
+    name: "indices"
+    description: <<END
+Indices of the average of the accumulated sparse gradients.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+Values of the average of the accumulated sparse gradients.
+END
+  }
+  out_arg {
+    name: "shape"
+    description: <<END
+Shape of the average of the accumulated sparse gradients.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The data type of accumulated gradients. Needs to correspond to the type
+of the accumulator.
+END
+  }
+  summary: "Extracts the average sparse gradient in a SparseConditionalAccumulator."
+  description: <<END
+The op will blocks until sufficient (i.e., more than num_required)
+gradients have been accumulated. If the accumulator has already
+aggregated more than num_required gradients, it will return its
+average of the accumulated gradients.  Also automatically increments
+the recorded global_step in the accumulator by 1, and resets the
+aggregate to 0.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2409aa3b2fb5c9fb450d7479a78449f33517db1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseAdd.pbtxt
@@ -0,0 +1,62 @@
+op {
+  graph_op_name: "SparseAdd"
+  in_arg {
+    name: "a_indices"
+    description: <<END
+2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+END
+  }
+  in_arg {
+    name: "a_values"
+    description: <<END
+1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+END
+  }
+  in_arg {
+    name: "a_shape"
+    description: <<END
+1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+END
+  }
+  in_arg {
+    name: "b_indices"
+    description: <<END
+2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+END
+  }
+  in_arg {
+    name: "b_values"
+    description: <<END
+1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+END
+  }
+  in_arg {
+    name: "b_shape"
+    description: <<END
+1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+END
+  }
+  in_arg {
+    name: "thresh"
+    description: <<END
+0-D.  The magnitude threshold that determines if an output value/index
+pair takes space.
+END
+  }
+  summary: "Adds two `SparseTensor` objects to produce another `SparseTensor`."
+  description: <<END
+The input `SparseTensor` objects' indices are assumed ordered in standard
+lexicographic order.  If this is not the case, before this step run
+`SparseReorder` to restore index ordering.
+
+By default, if two values sum to zero at some index, the output `SparseTensor`
+would still include that particular location in its index, storing a zero in the
+corresponding value slot.  To override this, callers can specify `thresh`,
+indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+corresponding value and index would then not be included.  In particular,
+`thresh == 0` (default) means everything is kept and actual thresholding happens
+only for a positive value.
+
+In the following shapes, `nnz` is the count after taking `thresh` into account.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseAddGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5e0a7d9cbc45f0bd3f617614a2397f7db563c64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseAddGrad.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "SparseAddGrad"
+  in_arg {
+    name: "backprop_val_grad"
+    description: <<END
+1-D with shape `[nnz(sum)]`.  The gradient with respect to
+the non-empty values of the sum.
+END
+  }
+  in_arg {
+    name: "a_indices"
+    description: <<END
+2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+END
+  }
+  in_arg {
+    name: "b_indices"
+    description: <<END
+2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+END
+  }
+  in_arg {
+    name: "sum_indices"
+    description: <<END
+2-D.  The `indices` of the sum `SparseTensor`, size
+`[nnz(sum), ndims]`.
+END
+  }
+  out_arg {
+    name: "a_val_grad"
+    description: <<END
+1-D with shape `[nnz(A)]`. The gradient with respect to the
+non-empty values of A.
+END
+  }
+  out_arg {
+    name: "b_val_grad"
+    description: <<END
+1-D with shape `[nnz(B)]`. The gradient with respect to the
+non-empty values of B.
+END
+  }
+  summary: "The gradient operator for the SparseAdd op."
+  description: <<END
+The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+values of A and B.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15c1797d2dd6cde13b87463c74e42525bc3dd741
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyAdadelta.pbtxt
@@ -0,0 +1,59 @@
+op {
+  graph_op_name: "SparseApplyAdadelta"
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum_update"
+    description: <<END
+: Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Constant factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "var: Should be from a Variable()."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1698e2def0766f01a49671be7927374c033199e4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyAdagrad.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "SparseApplyAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the adagrad scheme."
+  description: <<END
+That is for rows we have grad for, we update var and accum as follows:
+accum += grad * grad
+var -= lr * grad * (1 / sqrt(accum))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6878eb70b3f21111cee24fac0d11170df051cf6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyAdagradDA.pbtxt
@@ -0,0 +1,71 @@
+op {
+  graph_op_name: "SparseApplyAdagradDA"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "gradient_squared_accumulator"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "global_step"
+    description: <<END
+Training step number. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Update entries in \'*var\' and \'*accum\' according to the proximal adagrad scheme."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c6a36bf456e84cc855ae64fbc5a27e1ac234736
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,90 @@
+op {
+  graph_op_name: "SparseApplyCenteredRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var, ms and mom.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, mg, ms, and mom tensors is
+protected by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the centered RMSProp algorithm."
+  description: <<END
+The centered RMSProp algorithm uses an estimate of the centered second moment
+(i.e., the variance) for normalization, as opposed to regular RMSProp, which
+uses the (uncentered) second moment. This often helps with training, but is
+slightly more expensive in terms of computation and memory.
+
+Note that in dense implementation of this algorithm, mg, ms, and mom will
+update even if the grad is zero, but in this sparse implementation, mg, ms,
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+mean_grad = decay * mean_grad + (1-decay) * gradient
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..524b5c5a47dd6570d7cb7b59775babcdd2b1d19d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrl.pbtxt
@@ -0,0 +1,80 @@
+op {
+  graph_op_name: "SparseApplyFtrl"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+That is for rows we have grad for, we update var, accum and linear as follows:
+accum_new = accum + grad * grad
+linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9247fb61b882ad3fabf7622d91f5c1574b00f656
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt
@@ -0,0 +1,82 @@
+op {
+  graph_op_name: "SparseApplyFtrlV2"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "linear"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 shrinkage regulariation. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr_power"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update relevant entries in \'*var\' according to the Ftrl-proximal scheme."
+  description: <<END
+That is for rows we have grad for, we update var, accum and linear as follows:
+grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+linear += grad_with_shrinkage +
+    (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+accum = accum_new
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d9ac9ea3fa46a2d19a7f4d8967a0acd17f00333
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyMomentum.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "SparseApplyMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var - lr * momentum * accum, so in the end, the var you get is actually
+var - lr * momentum * accum.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
+  description: <<END
+Set use_nesterov = True if you want to use Nesterov momentum.
+
+That is for rows we have grad for, we update var and accum as follows:
+
+accum = accum * momentum + grad
+var -= lr * accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..80541b91c7ed01183596de26881956aa90c14b17
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "SparseApplyProximalAdagrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, updating of the var and accum tensors will be protected by
+a lock; otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Sparse update entries in \'*var\' and \'*accum\' according to FOBOS algorithm."
+  description: <<END
+That is for rows we have grad for, we update var and accum as follows:
+accum += grad * grad
+prox_v = var
+prox_v -= lr * grad * (1 / sqrt(accum))
+var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5200e5516df10ca438828cb38fa1db8adba156b0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,58 @@
+op {
+  graph_op_name: "SparseApplyProximalGradientDescent"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "alpha"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l1"
+    description: <<END
+L1 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "l2"
+    description: <<END
+L2 regularization. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If True, the subtraction will be protected by a lock;
+otherwise the behavior is undefined, but may exhibit less contention.
+END
+  }
+  summary: "Sparse update \'*var\' as FOBOS algorithm with fixed learning rate."
+  description: <<END
+That is for rows we have grad for, we update var as follows:
+prox_v = var - alpha * grad
+var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4dbd608b893b334cba07ea0713a45fa4125f102
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyRMSProp.pbtxt
@@ -0,0 +1,78 @@
+op {
+  graph_op_name: "SparseApplyRMSProp"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "rho"
+    description: <<END
+Decay rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var, ms and mom.
+END
+  }
+  out_arg {
+    name: "out"
+    description: <<END
+Same as "var".
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, ms, and mom tensors is protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the RMSProp algorithm."
+  description: <<END
+Note that in dense implementation of this algorithm, ms and mom will
+update even if the grad is zero, but in this sparse implementation, ms
+and mom will not update in iterations during which the grad is zero.
+
+mean_square = decay * mean_square + (1-decay) * gradient ** 2
+Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+
+ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+var <- var - mom
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a72ae904755b97a5516c3467fc3519391e0df579
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseConcat.pbtxt
@@ -0,0 +1,90 @@
+op {
+  graph_op_name: "SparseConcat"
+  in_arg {
+    name: "indices"
+    description: <<END
+2-D.  Indices of each input `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+1-D.  Non-empty values of each `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "shapes"
+    description: <<END
+1-D.  Shapes of each `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "output_indices"
+    description: <<END
+2-D.  Indices of the concatenated `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+1-D.  Non-empty values of the concatenated `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "output_shape"
+    description: <<END
+1-D.  Shape of the concatenated `SparseTensor`.
+END
+  }
+  attr {
+    name: "concat_dim"
+    description: <<END
+Dimension to concatenate along. Must be in range [-rank, rank),
+where rank is the number of dimensions in each input `SparseTensor`.
+END
+  }
+  summary: "Concatenates a list of `SparseTensor` along the specified dimension."
+  description: <<END
+Concatenation is with respect to the dense versions of these sparse tensors.
+It is assumed that each input is a `SparseTensor` whose elements are ordered
+along increasing dimension number.
+
+All inputs' shapes must match, except for the concat dimension.  The
+`indices`, `values`, and `shapes` lists must have the same length.
+
+The output shape is identical to the inputs', except along the concat
+dimension, where it is the sum of the inputs' sizes along that dimension.
+
+The output elements will be resorted to preserve the sort order along
+increasing dimension number.
+
+This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+values across all inputs. This is due to the need for an internal sort in
+order to concatenate efficiently across an arbitrary dimension.
+
+For example, if `concat_dim = 1` and the inputs are
+
+    sp_inputs[0]: shape = [2, 3]
+    [0, 2]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+
+    sp_inputs[1]: shape = [2, 4]
+    [0, 1]: "d"
+    [0, 2]: "e"
+
+then the output will be
+
+    shape = [2, 7]
+    [0, 2]: "a"
+    [0, 4]: "d"
+    [0, 5]: "e"
+    [1, 0]: "b"
+    [1, 1]: "c"
+
+Graphically this is equivalent to doing
+
+    [    a] concat [  d e  ] = [    a   d e  ]
+    [b c  ]        [       ]   [b c          ]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseConditionalAccumulator.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c367416f2aa0045d837a489af68623e1fcea0a34
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseConditionalAccumulator.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "SparseConditionalAccumulator"
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the accumulator.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the value being accumulated.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the values.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this accumulator is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this accumulator will be shared under the given name
+across multiple sessions.
+END
+  }
+  summary: "A conditional accumulator for aggregating sparse gradients."
+  description: <<END
+The accumulator accepts gradients marked with local_step greater or
+equal to the most recent global_step known to the accumulator. The
+average can be extracted from the accumulator, provided sufficient
+gradients have been accumulated. Extracting the average automatically
+resets the aggregate to 0, and increments the global_step recorded by
+the accumulator.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseCross.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseCross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2aea6cfe4fddd274e7d1f5822d9e35badd042dfd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseCross.pbtxt
@@ -0,0 +1,106 @@
+op {
+  graph_op_name: "SparseCross"
+  in_arg {
+    name: "indices"
+    description: <<END
+2-D.  Indices of each input `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+1-D.   values of each `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "shapes"
+    description: <<END
+1-D.   Shapes of each `SparseTensor`.
+END
+  }
+  in_arg {
+    name: "dense_inputs"
+    description: <<END
+2-D.    Columns represented by dense `Tensor`.
+END
+  }
+  out_arg {
+    name: "output_indices"
+    description: <<END
+2-D.  Indices of the concatenated `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+1-D.  Non-empty values of the concatenated or hashed
+`SparseTensor`.
+END
+  }
+  out_arg {
+    name: "output_shape"
+    description: <<END
+1-D.  Shape of the concatenated `SparseTensor`.
+END
+  }
+  attr {
+    name: "hashed_output"
+    description: <<END
+If true, returns the hash of the cross instead of the string.
+This will allow us avoiding string manipulations.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+It is used if hashed_output is true.
+output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+END
+  }
+  attr {
+    name: "hash_key"
+    description: <<END
+Specify the hash_key that will be used by the `FingerprintCat64`
+function to combine the crosses fingerprints.
+END
+  }
+  summary: "Generates sparse cross from a list of sparse and dense tensors."
+  description: <<END
+The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+representing features of one feature column. It outputs a 2D `SparseTensor` with
+the batchwise crosses of these features.
+
+For example, if the inputs are
+
+    inputs[0]: SparseTensor with shape = [2, 2]
+    [0, 0]: "a"
+    [1, 0]: "b"
+    [1, 1]: "c"
+
+    inputs[1]: SparseTensor with shape = [2, 1]
+    [0, 0]: "d"
+    [1, 0]: "e"
+
+    inputs[2]: Tensor [["f"], ["g"]]
+
+then the output will be
+
+    shape = [2, 2]
+    [0, 0]: "a_X_d_X_f"
+    [1, 0]: "b_X_e_X_g"
+    [1, 1]: "c_X_e_X_g"
+
+if hashed_output=true then the output will be
+
+    shape = [2, 2]
+    [0, 0]: FingerprintCat64(
+                Fingerprint64("f"), FingerprintCat64(
+                    Fingerprint64("d"), Fingerprint64("a")))
+    [1, 0]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("b")))
+    [1, 1]: FingerprintCat64(
+                Fingerprint64("g"), FingerprintCat64(
+                    Fingerprint64("e"), Fingerprint64("c")))
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81d346adfbf77435490b708f5eaacdc8bd701b4e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseAdd.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "SparseDenseCwiseAdd"
+  in_arg {
+    name: "sp_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "sp_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `sp_indices`.
+END
+  }
+  in_arg {
+    name: "sp_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "dense"
+    description: <<END
+`R`-D.  The dense Tensor operand.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D.  The `N` values that are operated on.
+END
+  }
+  summary: "Adds up a SparseTensor and a dense Tensor, using these special rules:"
+  description: <<END
+(1) Broadcasts the dense side to have the same shape as the sparse side, if
+    eligible;
+(2) Then, only the dense values pointed to by the indices of the SparseTensor
+    participate in the cwise addition.
+
+By these rules, the result is a logical SparseTensor with exactly the same
+indices and shape, but possibly with different non-zero values.  The output of
+this Op is the resultant non-zero values.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40ea9c846afe216b07f64de01c30ee722b620def
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseDiv.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "SparseDenseCwiseDiv"
+  in_arg {
+    name: "sp_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "sp_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `sp_indices`.
+END
+  }
+  in_arg {
+    name: "sp_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "dense"
+    description: <<END
+`R`-D.  The dense Tensor operand.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D.  The `N` values that are operated on.
+END
+  }
+  summary: "Component-wise divides a SparseTensor by a dense Tensor."
+  description: <<END
+*Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+the other direction.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..262ab2dc76fa50750a117f717ea090616d4450ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseDenseCwiseMul.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "SparseDenseCwiseMul"
+  in_arg {
+    name: "sp_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "sp_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `sp_indices`.
+END
+  }
+  in_arg {
+    name: "sp_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "dense"
+    description: <<END
+`R`-D.  The dense Tensor operand.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D.  The `N` values that are operated on.
+END
+  }
+  summary: "Component-wise multiplies a SparseTensor by a dense Tensor."
+  description: <<END
+The output locations corresponding to the implicitly zero elements in the sparse
+tensor will be zero (i.e., will not take up storage space), regardless of the
+contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+
+*Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+the other direction.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseFillEmptyRows.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseFillEmptyRows.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9f25554b221c25cca7db7b8f9613c00b55682dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseFillEmptyRows.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "SparseFillEmptyRows"
+  in_arg {
+    name: "indices"
+    description: <<END
+2-D. the indices of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+1-D. the values of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "dense_shape"
+    description: <<END
+1-D. the shape of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "default_value"
+    description: <<END
+0-D. default value to insert into location `[row, 0, ..., 0]`
+  for rows missing from the input sparse tensor.
+output indices: 2-D. the indices of the filled sparse tensor.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+1-D. the values of the filled sparse tensor.
+END
+  }
+  out_arg {
+    name: "empty_row_indicator"
+    description: <<END
+1-D. whether the dense row was missing in the
+input sparse tensor.
+END
+  }
+  out_arg {
+    name: "reverse_index_map"
+    description: <<END
+1-D. a map from the input indices to the output indices.
+END
+  }
+  summary: "Fills empty rows in the input 2-D `SparseTensor` with a default value."
+  description: <<END
+The input `SparseTensor` is represented via the tuple of inputs
+(`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+same `dense_shape` but with indices `output_indices` and values
+`output_values`.
+
+This op inserts a single entry for every row that doesn't have any values.
+The index is created as `[row, 0, ..., 0]` and the inserted value
+is `default_value`.
+
+For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
+
+    [0, 1]: a
+    [0, 3]: b
+    [2, 0]: c
+    [3, 1]: d
+
+Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+
+    [0, 1]: a
+    [0, 3]: b
+    [1, 0]: default_value
+    [2, 0]: c
+    [3, 1]: d
+    [4, 0]: default_value
+
+The output `SparseTensor` will be in row-major order and will have the
+same shape as the input.
+
+This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+
+    empty_row_indicator[i] = True iff row i was an empty row.
+
+And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+backpropagation,
+
+    reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseFillEmptyRowsGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseFillEmptyRowsGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eef43e61f2c8aa3263a524fa00cf89550924d404
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseFillEmptyRowsGrad.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "SparseFillEmptyRowsGrad"
+  in_arg {
+    name: "reverse_index_map"
+    description: <<END
+1-D.  The reverse index map from SparseFillEmptyRows.
+END
+  }
+  in_arg {
+    name: "grad_values"
+    description: <<END
+1-D.  The gradients from backprop.
+END
+  }
+  out_arg {
+    name: "d_values"
+    description: <<END
+1-D.  The backprop into values.
+END
+  }
+  out_arg {
+    name: "d_default_value"
+    description: <<END
+0-D.  The backprop into default_value.
+END
+  }
+  summary: "The gradient of SparseFillEmptyRows."
+  description: <<END
+Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+shaped `[N_full]`, where `N_full >= N` and copies data into either
+`d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+`d_default_value` is a scalar.
+
+  d_values[j] = grad_values[reverse_index_map[j]]
+  d_default_value = sum_{k : 0 .. N_full - 1} (
+     grad_values[k] * 1{k not in reverse_index_map})
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseMatMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58f2ede62984073d5944226f4b58bc95818b3f32
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseMatMul.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "SparseMatMul"
+  summary: "Multiply matrix \"a\" by matrix \"b\"."
+  description: <<END
+The inputs must be two-dimensional matrices and the inner dimension of "a" must
+match the outer dimension of "b". This op is optimized for the case where at
+least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+matrix multiply on one platform was 30% zero values in the sparse matrix.
+
+The gradient computation of this operation will only take advantage of sparsity
+in the input gradient when that gradient comes from a Relu.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseReduceMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseReduceMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c2e7e0df176efbc9ca997d271f3c8c08c6c01f3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseReduceMax.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "SparseReduceMax"
+  in_arg {
+    name: "input_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `input_indices`.
+END
+  }
+  in_arg {
+    name: "input_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "reduction_axes"
+    description: <<END
+1-D.  Length-`K` vector containing the reduction axes.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+`R-K`-D.  The reduced Tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the max of elements across dimensions of a SparseTensor."
+  description: <<END
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+instead of a sparse one.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseReduceMaxSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseReduceMaxSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c75a2bb233a6f351ff00f156bd92c67b668b77a6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseReduceMaxSparse.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "SparseReduceMaxSparse"
+  in_arg {
+    name: "input_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `input_indices`.
+END
+  }
+  in_arg {
+    name: "input_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "reduction_axes"
+    description: <<END
+1-D.  Length-`K` vector containing the reduction axes.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the max of elements across dimensions of a SparseTensor."
+  description: <<END
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+SparseTensor.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseReduceSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseReduceSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf6f868d1481bde17c0bda8a9bdabe19cce6e632
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseReduceSum.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "SparseReduceSum"
+  in_arg {
+    name: "input_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `input_indices`.
+END
+  }
+  in_arg {
+    name: "input_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "reduction_axes"
+    description: <<END
+1-D.  Length-`K` vector containing the reduction axes.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+`R-K`-D.  The reduced Tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the sum of elements across dimensions of a SparseTensor."
+  description: <<END
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+instead of a sparse one.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseReduceSumSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseReduceSumSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cad169e5f9d7f7d6c00583ad34b5dd915eb8a111
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseReduceSumSparse.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "SparseReduceSumSparse"
+  in_arg {
+    name: "input_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `input_indices`.
+END
+  }
+  in_arg {
+    name: "input_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "reduction_axes"
+    description: <<END
+1-D.  Length-`K` vector containing the reduction axes.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the sum of elements across dimensions of a SparseTensor."
+  description: <<END
+This Op takes a SparseTensor and is the sparse counterpart to
+`tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+SparseTensor.
+
+Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+with length 1.
+
+If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+with a single element is returned.  Additionally, the axes can be negative,
+which are interpreted according to the indexing rules in Python.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseReorder.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseReorder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07ffc6dcf33c3a725c45c124cd79e94e11af91c6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseReorder.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "SparseReorder"
+  in_arg {
+    name: "input_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, possibly not in canonical ordering.
+END
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `input_indices`.
+END
+  }
+  in_arg {
+    name: "input_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  out_arg {
+    name: "output_indices"
+    description: <<END
+2-D.  `N x R` matrix with the same indices as input_indices, but
+in canonical row-major ordering.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `output_indices`.
+END
+  }
+  summary: "Reorders a SparseTensor into the canonical, row-major ordering."
+  description: <<END
+Note that by convention, all sparse ops preserve the canonical ordering along
+increasing dimension number. The only time ordering can be violated is during
+manual manipulation of the indices and values vectors to add entries.
+
+Reordering does not affect the shape of the SparseTensor.
+
+If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseReshape.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84fef9fbc4d037f91c0c74af55d00d014769c135
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseReshape.pbtxt
@@ -0,0 +1,55 @@
+op {
+  graph_op_name: "SparseReshape"
+  in_arg {
+    name: "input_indices"
+    description: <<END
+2-D.  `N x R_in` matrix with the indices of non-empty values in a
+SparseTensor.
+END
+  }
+  in_arg {
+    name: "input_shape"
+    description: <<END
+1-D.  `R_in` vector with the input SparseTensor's dense shape.
+END
+  }
+  in_arg {
+    name: "new_shape"
+    description: <<END
+1-D.  `R_out` vector with the requested new dense shape.
+END
+  }
+  out_arg {
+    name: "output_indices"
+    description: <<END
+2-D.  `N x R_out` matrix with the updated indices of non-empty
+values in the output SparseTensor.
+END
+  }
+  out_arg {
+    name: "output_shape"
+    description: <<END
+1-D.  `R_out` vector with the full dense shape of the output
+SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+filled in.
+END
+  }
+  summary: "Reshapes a SparseTensor to represent values in a new dense shape."
+  description: <<END
+This operation has the same semantics as reshape on the represented dense
+tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+
+If one component of `new_shape` is the special value -1, the size of that
+dimension is computed so that the total dense size remains constant.  At
+most one component of `new_shape` can be -1.  The number of dense elements
+implied by `new_shape` must be the same as the number of dense elements
+originally implied by `input_shape`.
+
+Reshaping does not affect the order of values in the SparseTensor.
+
+If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+`input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+`output_shape` has length `R_out`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18e66605951afcba96f5e1cca10e959850ca2bf1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "SparseSegmentMean"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the mean along sparse segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+dimension, selecting a subset of dimension 0, specified by `indices`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b58d6671b5f57d8b63efdd2057b3defd6b6b02d7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanGrad.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SparseSegmentMeanGrad"
+  in_arg {
+    name: "grad"
+    description: <<END
+gradient propagated to the SparseSegmentMean op.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+indices passed to the corresponding SparseSegmentMean op.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+segment_ids passed to the corresponding SparseSegmentMean op.
+END
+  }
+  in_arg {
+    name: "output_dim0"
+    description: <<END
+dimension 0 of "data" passed to SparseSegmentMean op.
+END
+  }
+  summary: "Computes gradients for SparseSegmentMean."
+  description: <<END
+Returns tensor "output" with same shape as grad, except for dimension 0 whose
+value is output_dim0.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6e105400307b178720a3b1e04955aaad61c9931
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "SparseSegmentMeanWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which has size
+`num_segments`.
+END
+  }
+  summary: "Computes the mean along sparse segments of a tensor."
+  description: <<END
+Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3fdeb66aed79f73b10096fafa8846e79a4180394
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "SparseSegmentSqrtN"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
+  description: <<END
+N is the size of the segment being reduced.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cb2e29ef4a4b29b767cf5d0902d5cd05afe7b18
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNGrad.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNGrad"
+  in_arg {
+    name: "grad"
+    description: <<END
+gradient propagated to the SparseSegmentSqrtN op.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+indices passed to the corresponding SparseSegmentSqrtN op.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+segment_ids passed to the corresponding SparseSegmentSqrtN op.
+END
+  }
+  in_arg {
+    name: "output_dim0"
+    description: <<END
+dimension 0 of "data" passed to SparseSegmentSqrtN op.
+END
+  }
+  summary: "Computes gradients for SparseSegmentSqrtN."
+  description: <<END
+Returns tensor "output" with same shape as grad, except for dimension 0 whose
+value is output_dim0.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ba98b81911cc85d942d91a0f689cb075fc987e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor divided by the sqrt of N."
+  description: <<END
+N is the size of the segment being reduced.
+
+Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdf44a89a386f7ab5fb702de96a83f307e531597
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "SparseSegmentSum"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `k`, the number of segments.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+dimension, selecting a subset of dimension 0, specified by `indices`.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+# Select two rows, one segment.
+tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+# => [[0 0 0 0]]
+
+# Select two rows, two segment.
+tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+# => [[ 1  2  3  4]
+#     [-1 -2 -3 -4]]
+
+# Select all rows, two segments.
+tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+# => [[0 0 0 0]
+#     [5 6 7 8]]
+
+# Which is equivalent to:
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3aeaba38e9447d175e33eae4cf6168679129bc8d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "SparseSegmentSumWithNumSegments"
+  in_arg {
+    name: "indices"
+    description: <<END
+A 1-D tensor. Has same rank as `segment_ids`.
+END
+  }
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor. Values should be sorted and can be repeated.
+END
+  }
+  in_arg {
+    name: "num_segments"
+    description: <<END
+Should equal the number of distinct segment IDs.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the sum along sparse segments of a tensor."
+  description: <<END
+Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+tf.sparse_segment_sum_with_num_segments(
+    c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+# => [[0 0 0 0]
+#     [0 0 0 0]
+#     [0 0 0 0]]
+
+tf.sparse_segment_sum_with_num_segments(c,
+                                        tf.constant([0, 1]),
+                                        tf.constant([0, 2],
+                                        num_segments=4))
+# => [[ 1  2  3  4]
+#     [ 0  0  0  0]
+#     [-1 -2 -3 -4]
+#     [ 0  0  0  0]]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSlice.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..637ba6ece4253ae65f4f1b6bb6314a3c84e5ef72
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSlice.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "SparseSlice"
+  in_arg {
+    name: "indices"
+    description: <<END
+2-D tensor represents the indices of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+1-D tensor represents the values of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+1-D. tensor represents the shape of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "start"
+    description: <<END
+1-D. tensor represents the start of the slice.
+END
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+1-D. tensor represents the size of the slice.
+output indices: A list of 1-D tensors represents the indices of the output
+sparse tensors.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+A list of 1-D tensors represents the values of the output sparse
+tensors.
+END
+  }
+  out_arg {
+    name: "output_shape"
+    description: <<END
+A list of 1-D tensors represents the shape of the output sparse
+tensors.
+END
+  }
+  summary: "Slice a `SparseTensor` based on the `start` and `size`."
+  description: <<END
+For example, if the input is
+
+    input_tensor = shape = [2, 7]
+    [    a   d e  ]
+    [b c          ]
+
+Graphically the output tensors are:
+
+    sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+    [    a  ]
+    [b c    ]
+
+    sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+    [ d e  ]
+    [      ]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSoftmax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c64c3c68a19412a0f0eaf005491d54193baac0ae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSoftmax.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "SparseSoftmax"
+  in_arg {
+    name: "sp_indices"
+    description: <<END
+2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+SparseTensor, in canonical ordering.
+END
+  }
+  in_arg {
+    name: "sp_values"
+    description: <<END
+1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+END
+  }
+  in_arg {
+    name: "sp_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+1-D.  The `NNZ` values for the result `SparseTensor`.
+END
+  }
+  summary: "Applies softmax to a batched N-D `SparseTensor`."
+  description: <<END
+The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+(where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+
+This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+zero elements do not participate*.  Specifically, the algorithm is equivalent
+to the following:
+
+  (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+      with shape `[B, C]`, along the size-C dimension;
+  (2) Masks out the original implicitly-zero locations;
+  (3) Renormalizes the remaining elements.
+
+Hence, the `SparseTensor` result has exactly the same non-zero indices and
+shape.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a867bbe04d5ca2361395b859e70ce212b387a5e2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "SparseSoftmaxCrossEntropyWithLogits"
+  in_arg {
+    name: "features"
+    description: <<END
+batch_size x num_classes matrix
+END
+  }
+  in_arg {
+    name: "labels"
+    description: <<END
+batch_size vector with values in [0, num_classes).
+This is the label for the given minibatch entry.
+END
+  }
+  out_arg {
+    name: "loss"
+    description: <<END
+Per example loss (batch_size vector).
+END
+  }
+  out_arg {
+    name: "backprop"
+    description: <<END
+backpropagated gradients (batch_size x num_classes matrix).
+END
+  }
+  summary: "Computes softmax cross entropy cost and gradients to backpropagate."
+  description: <<END
+Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+a matrix of label probabilities, but rather a single label per row
+of features.  This label is considered to have probability 1.0 for the
+given row.
+
+Inputs are the logits, not probabilities.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSparseMaximum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSparseMaximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34ccddd5d42b055b959c37d1c109565a902e1dcd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSparseMaximum.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "SparseSparseMaximum"
+  in_arg {
+    name: "a_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, in the canonical lexicographic ordering.
+END
+  }
+  in_arg {
+    name: "a_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `a_indices`.
+END
+  }
+  in_arg {
+    name: "a_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "b_indices"
+    description: <<END
+counterpart to `a_indices` for the other operand.
+END
+  }
+  in_arg {
+    name: "b_values"
+    description: <<END
+counterpart to `a_values` for the other operand; must be of the same dtype.
+END
+  }
+  in_arg {
+    name: "b_shape"
+    description: <<END
+counterpart to `a_shape` for the other operand; the two shapes must be equal.
+END
+  }
+  out_arg {
+    name: "output_indices"
+    description: <<END
+2-D.  The indices of the output SparseTensor.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+1-D.  The values of the output SparseTensor.
+END
+  }
+  summary: "Returns the element-wise max of two SparseTensors."
+  description: <<END
+Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSparseMinimum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSparseMinimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b25684bb0061f0d649df3c774ec5fae85a9c91f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSparseMinimum.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "SparseSparseMinimum"
+  in_arg {
+    name: "a_indices"
+    description: <<END
+2-D.  `N x R` matrix with the indices of non-empty values in a
+SparseTensor, in the canonical lexicographic ordering.
+END
+  }
+  in_arg {
+    name: "a_values"
+    description: <<END
+1-D.  `N` non-empty values corresponding to `a_indices`.
+END
+  }
+  in_arg {
+    name: "a_shape"
+    description: <<END
+1-D.  Shape of the input SparseTensor.
+END
+  }
+  in_arg {
+    name: "b_indices"
+    description: <<END
+counterpart to `a_indices` for the other operand.
+END
+  }
+  in_arg {
+    name: "b_values"
+    description: <<END
+counterpart to `a_values` for the other operand; must be of the same dtype.
+END
+  }
+  in_arg {
+    name: "b_shape"
+    description: <<END
+counterpart to `a_shape` for the other operand; the two shapes must be equal.
+END
+  }
+  out_arg {
+    name: "output_indices"
+    description: <<END
+2-D.  The indices of the output SparseTensor.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+1-D.  The values of the output SparseTensor.
+END
+  }
+  summary: "Returns the element-wise min of two SparseTensors."
+  description: <<END
+Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc90ad333b2ffacb0d3dec247fe7963679233c5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSplit.pbtxt
@@ -0,0 +1,70 @@
+op {
+  graph_op_name: "SparseSplit"
+  in_arg {
+    name: "split_dim"
+    description: <<END
+0-D.  The dimension along which to split.  Must be in the range
+`[0, rank(shape))`.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+2-D tensor represents the indices of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+1-D tensor represents the values of the sparse tensor.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+1-D. tensor represents the shape of the sparse tensor.
+output indices: A list of 1-D tensors represents the indices of the output
+sparse tensors.
+END
+  }
+  out_arg {
+    name: "output_values"
+    description: <<END
+A list of 1-D tensors represents the values of the output sparse
+tensors.
+END
+  }
+  out_arg {
+    name: "output_shape"
+    description: <<END
+A list of 1-D tensors represents the shape of the output sparse
+tensors.
+END
+  }
+  attr {
+    name: "num_split"
+    description: <<END
+The number of ways to split.
+END
+  }
+  summary: "Split a `SparseTensor` into `num_split` tensors along one dimension."
+  description: <<END
+If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+`[0 : shape[split_dim] % num_split]` gets one extra dimension.
+For example, if `split_dim = 1` and `num_split = 2` and the input is
+
+    input_tensor = shape = [2, 7]
+    [    a   d e  ]
+    [b c          ]
+
+Graphically the output tensors are:
+
+    output_tensor[0] = shape = [2, 4]
+    [    a  ]
+    [b c    ]
+
+    output_tensor[1] = shape = [2, 3]
+    [ d e  ]
+    [      ]
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseTensorDenseAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseTensorDenseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7225447188b9b356411ddc230af75eb0976c2945
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseTensorDenseAdd.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "SparseTensorDenseAdd"
+  in_arg {
+    name: "a_indices"
+    description: <<END
+2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+END
+  }
+  in_arg {
+    name: "a_values"
+    description: <<END
+1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+END
+  }
+  in_arg {
+    name: "a_shape"
+    description: <<END
+1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+END
+  }
+  in_arg {
+    name: "b"
+    description: <<END
+`ndims`-D Tensor.  With shape `a_shape`.
+END
+  }
+  summary: "Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`."
+  description: <<END
+This Op does not require `a_indices` be sorted in standard lexicographic order.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseTensorDenseMatMul.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseTensorDenseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a5dc08d212acc1bb8d48178ac163dd7b1ce2b26
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseTensorDenseMatMul.pbtxt
@@ -0,0 +1,53 @@
+op {
+  graph_op_name: "SparseTensorDenseMatMul"
+  in_arg {
+    name: "a_indices"
+    description: <<END
+2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+END
+  }
+  in_arg {
+    name: "a_values"
+    description: <<END
+1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+END
+  }
+  in_arg {
+    name: "a_shape"
+    description: <<END
+1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+END
+  }
+  in_arg {
+    name: "b"
+    description: <<END
+2-D.  A dense Matrix.
+END
+  }
+  attr {
+    name: "adjoint_a"
+    description: <<END
+Use the adjoint of A in the matrix multiply.  If A is complex, this
+is transpose(conj(A)).  Otherwise it's transpose(A).
+END
+  }
+  attr {
+    name: "adjoint_b"
+    description: <<END
+Use the adjoint of B in the matrix multiply.  If B is complex, this
+is transpose(conj(B)).  Otherwise it's transpose(B).
+END
+  }
+  summary: "Multiply SparseTensor (of rank 2) \"A\" by dense matrix \"B\"."
+  description: <<END
+No validity checking is performed on the indices of A.  However, the following
+input format is recommended for optimal behavior:
+
+if adjoint_a == false:
+  A should be sorted in lexicographically increasing order.  Use SparseReorder
+  if you're not sure.
+if adjoint_a == true:
+  A should be sorted in order of increasing dimension 1 (i.e., "column major"
+  order instead of "row major" order).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffb805834908103865e5fcb8d98fb080d60a44ab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseTensorSliceDataset"
+  summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseToDense.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseToDense.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fb0012d042789fab2e13e34353532cd2c36d453
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseToDense.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "SparseToDense"
+  in_arg {
+    name: "sparse_indices"
+    description: <<END
+0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+index where `sparse_values[i]` will be placed.
+END
+  }
+  in_arg {
+    name: "output_shape"
+    description: <<END
+1-D.  Shape of the dense output tensor.
+END
+  }
+  in_arg {
+    name: "sparse_values"
+    description: <<END
+1-D.  Values corresponding to each row of `sparse_indices`,
+or a scalar value to be used for all sparse indices.
+END
+  }
+  in_arg {
+    name: "default_value"
+    description: <<END
+Scalar value to set for indices not specified in
+`sparse_indices`.
+END
+  }
+  out_arg {
+    name: "dense"
+    description: <<END
+Dense output tensor of shape `output_shape`.
+END
+  }
+  attr {
+    name: "validate_indices"
+    description: <<END
+If true, indices are checked to make sure they are sorted in
+lexicographic order and that there are no repeats.
+END
+  }
+  summary: "Converts a sparse representation into a dense tensor."
+  description: <<END
+Builds an array `dense` with shape `output_shape` such that
+
+```
+# If sparse_indices is scalar
+dense[i] = (i == sparse_indices ? sparse_values : default_value)
+
+# If sparse_indices is a vector, then for each i
+dense[sparse_indices[i]] = sparse_values[i]
+
+# If sparse_indices is an n by d matrix, then for each i in [0, n)
+dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+```
+
+All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+scalar, all sparse indices are set to this single value.
+
+Indices should be sorted in lexicographic order, and indices must not
+contain any repeats. If `validate_indices` is true, these properties
+are checked during execution.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..766f756bb58e05a38451443295551dec045a12c2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SparseToSparseSetOperation.pbtxt
@@ -0,0 +1,93 @@
+op {
+  graph_op_name: "SparseToSparseSetOperation"
+  in_arg {
+    name: "set1_indices"
+    description: <<END
+2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+order.
+END
+  }
+  in_arg {
+    name: "set1_values"
+    description: <<END
+1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+order.
+END
+  }
+  in_arg {
+    name: "set1_shape"
+    description: <<END
+1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+max set size across `0...n-1` dimensions.
+END
+  }
+  in_arg {
+    name: "set2_indices"
+    description: <<END
+2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+order.
+END
+  }
+  in_arg {
+    name: "set2_values"
+    description: <<END
+1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+order.
+END
+  }
+  in_arg {
+    name: "set2_shape"
+    description: <<END
+1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+max set size across `0...n-1` dimensions.
+END
+  }
+  out_arg {
+    name: "result_indices"
+    description: <<END
+2D indices of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "result_values"
+    description: <<END
+1D values of a `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "result_shape"
+    description: <<END
+1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+is the max result set size across all `0...n-1` dimensions.
+END
+  }
+  summary: "Applies set operation along last dimension of 2 `SparseTensor` inputs."
+  description: <<END
+See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+
+If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+order and range of `set1` and `set2` indices.
+
+Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+ignored.
+
+Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+ignored.
+
+If `validate_indices` is `True`, this op validates the order and range of `set1`
+and `set2` indices.
+
+Output `result` is a `SparseTensor` represented by `result_indices`,
+`result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+dimension contains the result of `set_operation` applied to the corresponding
+`[0...n-1]` dimension of `set`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Split.pbtxt b/tensorflow/core/api_def/base_api/api_def_Split.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..802f440896baf5817ce07bf6a1ead21b656141ec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Split.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "Split"
+  in_arg {
+    name: "split_dim"
+    rename_to: "axis"
+    description: <<END
+0-D.  The dimension along which to split.  Must be in the range
+`[-rank(value), rank(value))`.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The tensor to split.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+They are identically shaped tensors, whose shape matches that of `value`
+except along `split_dim`, where their sizes are
+`values.shape[split_dim] / num_split`.
+END
+  }
+  attr {
+    name: "num_split"
+    description: <<END
+The number of ways to split.  Must evenly divide
+`value.shape[split_dim]`.
+END
+  }
+  summary: "Splits a tensor into `num_split` tensors along one dimension."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SplitV.pbtxt b/tensorflow/core/api_def/base_api/api_def_SplitV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c1660ffb60ff1127e512793acb0dc1019416faa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SplitV.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "SplitV"
+  in_arg {
+    name: "value"
+    description: <<END
+The tensor to split.
+END
+  }
+  in_arg {
+    name: "size_splits"
+    description: <<END
+list containing the sizes of each output tensor along the split
+dimension. Must sum to the dimension of value along split_dim.
+Can contain one -1 indicating that dimension is to be inferred.
+END
+  }
+  in_arg {
+    name: "split_dim"
+    rename_to: "axis"
+    description: <<END
+0-D.  The dimension along which to split.  Must be in the range
+`[-rank(value), rank(value))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Tensors whose shape matches that of `value`
+except along `split_dim`, where their sizes are
+`size_splits[i]`.
+END
+  }
+  summary: "Splits a tensor into `num_split` tensors along one dimension."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7570d5da5662b8eab90e7dd00f8cb225a963d373
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "SqlDataset"
+  in_arg {
+    name: "driver_name"
+    description: <<END
+The database type. Currently, the only supported type is 'sqlite'.
+END
+  }
+  in_arg {
+    name: "data_source_name"
+    description: <<END
+A connection string to connect to the database.
+END
+  }
+  in_arg {
+    name: "query"
+    description: <<END
+A SQL query to execute.
+END
+  }
+  summary: "Creates a dataset that executes a SQL query and emits rows of the result set."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..857841dc41fcc177bb7141692aa8531257826d26
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sqrt.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Sqrt"
+  summary: "Computes square root of x element-wise."
+  description: <<END
+I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SqrtGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_SqrtGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac5b737f93511447da13dd441b8428dcf3872bd4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SqrtGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "SqrtGrad"
+  visibility: HIDDEN
+  summary: "Computes the gradient for the sqrt of `x` wrt its input."
+  description: <<END
+Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+is the corresponding input gradient.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Square.pbtxt b/tensorflow/core/api_def/base_api/api_def_Square.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3e32a98d154c68756b9c50b053c49c15bfde536
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Square.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "Square"
+  summary: "Computes square of x element-wise."
+  description: <<END
+I.e., \\(y = x * x = x^2\\).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SquaredDifference.pbtxt b/tensorflow/core/api_def/base_api/api_def_SquaredDifference.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51277692d8c54a23354af33f556d4f61c0601f02
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SquaredDifference.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "SquaredDifference"
+  summary: "Returns (x - y)(x - y) element-wise."
+  description: <<END
+*NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Squeeze.pbtxt b/tensorflow/core/api_def/base_api/api_def_Squeeze.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f84c51536b087ff018c870409e941880c5be74c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Squeeze.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "Squeeze"
+  in_arg {
+    name: "input"
+    description: <<END
+The `input` to squeeze.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Contains the same data as `input`, but has one or more dimensions of
+size 1 removed.
+END
+  }
+  attr {
+    name: "squeeze_dims"
+    rename_to: "axis"
+    description: <<END
+If specified, only squeezes the dimensions listed. The dimension
+index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+be in the range `[-rank(input), rank(input))`.
+END
+  }
+  summary: "Removes dimensions of size 1 from the shape of a tensor."
+  description: <<END
+Given a tensor `input`, this operation returns a tensor of the same type with
+all dimensions of size 1 removed. If you don't want to remove all size 1
+dimensions, you can remove specific size 1 dimensions by specifying
+`squeeze_dims`.
+
+For example:
+
+```
+# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+shape(squeeze(t)) ==> [2, 3]
+```
+
+Or, to remove specific size 1 dimensions:
+
+```
+# 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Stack.pbtxt b/tensorflow/core/api_def/base_api/api_def_Stack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3fd668213008c872e86ef0f7132032b250359e03
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Stack.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "Stack"
+  visibility: SKIP
+  summary: "Deprecated, use StackV2."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackClose.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..050d69cbaa0f35ce8960a4ec1a6790be701183c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackClose.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "StackClose"
+  visibility: SKIP
+  summary: "Deprecated, use StackCloseV2."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackCloseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9b71fec3b83e4208401970fafdf127262930c52
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackCloseV2.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "StackCloseV2"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a stack.
+END
+  }
+  summary: "Delete the stack from its resource container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackPop.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackPop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..abf45f85cc026606461bebef34475216ca14f90c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackPop.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "StackPop"
+  visibility: SKIP
+  summary: "Deprecated, use StackPopV2."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackPopV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackPopV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e0498dcf3fb41f8c77c956467573237582bbbab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackPopV2.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "StackPopV2"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a stack.
+END
+  }
+  out_arg {
+    name: "elem"
+    description: <<END
+The tensor that is popped from the top of the stack.
+END
+  }
+  attr {
+    name: "elem_type"
+    description: <<END
+The type of the elem that is popped.
+END
+  }
+  summary: "Pop the element at the top of the stack."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackPush.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackPush.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..619f20f9aada53ac61cd3e5902691eae89f16522
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackPush.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "StackPush"
+  visibility: SKIP
+  summary: "Deprecated, use StackPushV2."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackPushV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackPushV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83d7dd1f35b4beaff86ae10e14cdfa969385aff6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackPushV2.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "StackPushV2"
+  visibility: SKIP
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a stack.
+END
+  }
+  in_arg {
+    name: "elem"
+    description: <<END
+The tensor to be pushed onto the stack.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The same tensor as the input 'elem'.
+END
+  }
+  attr {
+    name: "swap_memory"
+    description: <<END
+Swap `elem` to CPU. Default to false.
+END
+  }
+  summary: "Push an element onto the stack."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StackV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StackV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1699da1271693245aac3eca57e73838b4322ffaa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StackV2.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "StackV2"
+  visibility: SKIP
+  in_arg {
+    name: "max_size"
+    description: <<END
+The maximum size of the stack if non-negative. If negative, the stack
+size is unlimited.
+END
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the stack.
+END
+  }
+  attr {
+    name: "elem_type"
+    description: <<END
+The type of the elements on the stack.
+END
+  }
+  attr {
+    name: "stack_name"
+    description: <<END
+Overrides the name used for the temporary stack resource. Default
+value is the name of the 'Stack' op (which is guaranteed unique).
+END
+  }
+  summary: "A stack that produces elements in first-in last-out order."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Stage.pbtxt b/tensorflow/core/api_def/base_api/api_def_Stage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba9b4bc461d1a07d13eb9f844a08864a4befa2c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Stage.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Stage"
+  in_arg {
+    name: "values"
+    description: <<END
+a list of tensors
+dtypes A list of data types that inserted values should adhere to.
+END
+  }
+  attr {
+    name: "capacity"
+    description: <<END
+Maximum number of elements in the Staging Area. If > 0, inserts
+on the container will block when the capacity is reached.
+END
+  }
+  attr {
+    name: "memory_limit"
+    description: <<END
+The maximum number of bytes allowed for Tensors in the Staging Area.
+If > 0, inserts will block until sufficient space is available.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this queue is placed in the given container. Otherwise,
+a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+It is necessary to match this name to the matching Unstage Op.
+END
+  }
+  summary: "Stage values similar to a lightweight Enqueue."
+  description: <<END
+The basic functionality of this Op is similar to a queue with many
+fewer capabilities and options.  This Op is optimized for performance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StageClear.pbtxt b/tensorflow/core/api_def/base_api/api_def_StageClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22cbe41090c8060548ab44e2e44a9f6854d8e5b8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StageClear.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StageClear"
+  summary: "Op removes all elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StagePeek.pbtxt b/tensorflow/core/api_def/base_api/api_def_StagePeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7eba72af2a293bde4b692bbe00bca5b5b59b58de
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StagePeek.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "StagePeek"
+  summary: "Op peeks at the values at the specified index.  If the"
+  description: <<END
+underlying container does not contain sufficient elements
+this op will block until it does.   This Op is optimized for
+performance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StageSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_StageSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ae827d1b5dcd0f1f05c437317cd388cca3dde4b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StageSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StageSize"
+  summary: "Op returns the number of elements in the underlying container."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessRandomNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessRandomNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6ef8160e4818519ce503076c719697dd06a419c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessRandomNormal.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "StatelessRandomNormal"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs deterministic pseudorandom values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+
+The outputs are a deterministic function of `shape` and `seed`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniform.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ba88c3730793b9066fbcafc195529d0fccb6eb1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessRandomUniform.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "StatelessRandomUniform"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs deterministic pseudorandom random values from a uniform distribution."
+  description: <<END
+The generated values follow a uniform distribution in the range `[0, 1)`. The
+lower bound 0 is included in the range, while the upper bound 1 is excluded.
+
+The outputs are a deterministic function of `shape` and `seed`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatelessTruncatedNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatelessTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37228dba648d90dac7f1f4051dc5be1bee137781
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatelessTruncatedNormal.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "StatelessTruncatedNormal"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+2 seeds (shape [2]).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Random values with specified shape.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs deterministic pseudorandom values from a truncated normal distribution."
+  description: <<END
+The generated values follow a normal distribution with mean 0 and standard
+deviation 1, except that values whose magnitude is more than 2 standard
+deviations from the mean are dropped and re-picked.
+
+The outputs are a deterministic function of `shape` and `seed`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b30d64afe18a71fbbe73b397979796b8b844faa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatsAggregatorHandle"
+  summary: "Creates a statistics manager resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bcaf9fea1af5123848b2d6267b3ef0f7279a7230
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatsAggregatorSummary"
+  summary: "Produces a summary of any statistics recorded by the given statistics manager."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StopGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_StopGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af4b9f6113cb9b76734892ec39e08dad6ffbcc1c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StopGradient.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "StopGradient"
+  summary: "Stops gradient computation."
+  description: <<END
+When executed in a graph, this op outputs its input tensor as-is.
+
+When building ops to compute gradients, this op prevents the contribution of
+its inputs to be taken into account.  Normally, the gradient generator adds ops
+to a graph to compute the derivatives of a specified 'loss' by recursively
+finding out inputs that contributed to its computation.  If you insert this op
+in the graph it inputs are masked from the gradient generator.  They are not
+taken into account for computing gradients.
+
+This is useful any time you want to compute a value with TensorFlow but need
+to pretend that the value was a constant. Some examples include:
+
+*  The *EM* algorithm where the *M-step* should not involve backpropagation
+   through the output of the *E-step*.
+*  Contrastive divergence training of Boltzmann machines where, when
+   differentiating the energy function, the training must not backpropagate
+   through the graph that generated the samples from the model.
+*  Adversarial training, where no backprop should happen through the adversarial
+   example generation process.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt b/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d6fc048471d86392c09425371169054755c5af2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StridedSlice.pbtxt
@@ -0,0 +1,167 @@
+op {
+  graph_op_name: "StridedSlice"
+  in_arg {
+    name: "begin"
+    description: <<END
+`begin[k]` specifies the offset into the `k`th range specification.
+The exact dimension this corresponds to will be determined by context.
+Out-of-bounds values will be silently clamped. If the `k`th bit of
+`begin_mask` then `begin[k]` is ignored and the full range of the
+appropriate dimension is used instead. Negative values causes indexing
+to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
+END
+  }
+  in_arg {
+    name: "end"
+    description: <<END
+`end[i]` is like `begin` with the exception that `end_mask` is
+used to determine full ranges.
+END
+  }
+  in_arg {
+    name: "strides"
+    description: <<END
+`strides[i]` specifies the increment in the `i`th specification
+after extracting a given element. Negative indices will reverse
+the original order. Out or range values are
+clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
+END
+  }
+  attr {
+    name: "begin_mask"
+    description: <<END
+a bitmask where a bit i being 1 means to ignore the begin
+value and instead use the largest interval possible. At runtime
+begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
+`[-1, n-1]` if `stride[i] < 0`
+END
+  }
+  attr {
+    name: "end_mask"
+    description: <<END
+analogous to `begin_mask`
+END
+  }
+  attr {
+    name: "ellipsis_mask"
+    description: <<END
+a bitmask where bit `i` being 1 means the `i`th
+position is actually an ellipsis. One bit at most can be 1.
+If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
+is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
+implicitly creates as many range specifications as necessary to fully
+specify the sliced range for every dimension. For example for a 4-dimensional
+tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
+END
+  }
+  attr {
+    name: "new_axis_mask"
+    description: <<END
+a bitmask where bit `i` being 1 means the `i`th
+specification creates a new shape 1 dimension. For example
+`foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+END
+  }
+  attr {
+    name: "shrink_axis_mask"
+    description: <<END
+a bitmask where bit `i` implies that the `i`th
+specification should shrink the dimensionality. begin and end
+must imply a slice of size 1 in the dimension. For example in
+python one might do `foo[:, 3, :]` which would result in
+`shrink_axis_mask` being 2.
+END
+  }
+  summary: "Return a strided slice from `input`."
+  description: <<END
+Note, most python users will want to use the Python `Tensor.__getitem__`
+or `Variable.__getitem__` rather than this op directly.
+
+The goal of this op is to produce a new tensor with a subset of
+the elements from the `n` dimensional `input` tensor. The subset is chosen using
+a sequence of `m` sparse range specifications encoded into the arguments
+of this function. Note, in some cases
+`m` could be equal to `n`, but this need not be the case. Each
+range specification entry can be one of the following:
+
+- An ellipsis (...). Ellipses are used to imply zero or more
+  dimensions of full-dimension selection and are produced using
+  `ellipsis_mask`. For example, `foo[...]` is the identity slice.
+
+- A new axis. This is used to insert a new shape=1 dimension and is
+  produced using `new_axis_mask`. For example, `foo[:, ...]` where
+  `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
+
+
+- A range `begin:end:stride`. This is used to specify how much to choose from
+  a given dimension. `stride` can be any integer but 0.  `begin` is an integer
+  which represents the index of the first value to select while `end` represents
+  the index of the last value to select. The number of values selected in each
+  dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
+  `begin` and `end` can be negative where `-1` is the last element, `-2` is
+  the second to last. `begin_mask` controls whether to replace the explicitly
+  given `begin` with an implicit effective value of `0` if `stride > 0` and
+  `-1` if `stride < 0`. `end_mask` is analogous but produces the number
+  required to create the largest open interval. For example, given a shape
+  `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
+  not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
+  and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
+  first dimension of a tensor while dropping the last two (in the original
+  order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
+
+- A single index. This is used to keep only elements that have a given
+  index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
+  shape `(6,)` tensor. This is encoded in `begin` and `end` and
+  `shrink_axis_mask`.
+
+Each conceptual range specification is encoded in the op's argument. This
+encoding is best understand by considering a non-trivial example. In
+particular,
+`foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
+
+```
+begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
+end = [2, 4, x, x, -3, x]
+strides = [1, 1, x, x, -1, 1]
+begin_mask = 1<<4 | 1 << 5 = 48
+end_mask = 1<<5 = 32
+ellipsis_mask = 1<<3 = 8
+new_axis_mask = 1<<2 4
+shrink_axis_mask = 1<<0
+```
+
+In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
+the slice becomes (2, 1, 5, 5, 2, 5).
+Let us walk step by step through each argument specification.
+
+1.  The first argument in the example slice is turned into `begin = 1` and
+`end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
+also set the appropriate bit in `shrink_axis_mask`.
+
+2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
+zero bits contributed.
+
+3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
+dimension in the final shape. Dummy values are contributed to begin,
+end and stride, while the new_axis_mask bit is set.
+
+4. `...` grab the full ranges from as many dimensions as needed to
+fully specify a slice for every dimension of the input shape.
+
+5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
+with a dimension that has shape `s` is converted to a positive index
+`s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
+is done internally so begin, end and strides receive x, -3, and -1.
+The appropriate begin_mask bit is set to indicate the start range is the
+full range (ignoring the x).
+
+6. `:` indicates that the entire contents of the corresponding dimension
+is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
+receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
+`end_mask` are also set.
+
+*Requirements*:
+  `0 != strides[i] for i in [0, m)`
+  `ellipsis_mask must be a power of two (only one ellipsis)`
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt b/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0fc89576ad29939837da7c55e393a0baeca90e5e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "StridedSliceAssign"
+  summary: "Assign `value` to the sliced l-value reference of `ref`."
+  description: <<END
+The values of `value` are assigned to the positions in the variable
+`ref` that are selected by the slice parameters. The slice parameters
+`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+
+NOTE this op currently does not support broadcasting and so `value`'s
+shape must be exactly the shape produced by the slice of `ref`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StridedSliceGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_StridedSliceGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c5ea059e8a410048ae3b91504f2d1ac63a641dc5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StridedSliceGrad.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "StridedSliceGrad"
+  summary: "Returns the gradient of `StridedSlice`."
+  description: <<END
+Since `StridedSlice` cuts out pieces of its `input` which is size
+`shape`, its gradient will have the same shape (which is passed here
+as `shape`). The gradient will be zero in any element that the slice
+does not select.
+
+Arguments are the same as StridedSliceGrad with the exception that
+`dy` is the input gradient to be propagated and `shape` is the
+shape of `StridedSlice`'s `input`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringJoin.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..549ee434130cae8f05543be1163b7e0b908dd549
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringJoin.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "StringJoin"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of string tensors.  The tensors must all have the same shape,
+or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+of non-scalar inputs.
+END
+  }
+  attr {
+    name: "separator"
+    description: <<END
+string, an optional join separator.
+END
+  }
+  summary: "Joins the strings in the given list of string tensors into one tensor;"
+  description: <<END
+with the given separator (default is an empty separator).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4792f298eef619eb8a9e1176cb5b237d771df681
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringSplit.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "StringSplit"
+  in_arg {
+    name: "input"
+    description: <<END
+1-D. Strings to split.
+END
+  }
+  in_arg {
+    name: "delimiter"
+    description: <<END
+0-D. Delimiter characters (bytes), or empty string.
+END
+  }
+  out_arg {
+    name: "indices"
+    description: <<END
+A dense matrix of int64 representing the indices of the sparse tensor.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+A vector of strings corresponding to the splited values.
+END
+  }
+  out_arg {
+    name: "shape"
+    description: <<END
+a length-2 vector of int64 representing the shape of the sparse
+tensor, where the first value is N and the second value is the maximum number
+of tokens in a single input entry.
+END
+  }
+  attr {
+    name: "skip_empty"
+    description: <<END
+A `bool`. If `True`, skip the empty strings from the result.
+END
+  }
+  summary: "Split elements of `input` based on `delimiter` into a `SparseTensor`."
+  description: <<END
+Let N be the size of source (typically N will be the batch size). Split each
+element of `input` based on `delimiter` and return a `SparseTensor`
+containing the splitted tokens. Empty tokens are ignored.
+
+`delimiter` can be empty, or a string of split characters. If `delimiter` is an
+ empty string, each element of `input` is split into individual single-byte
+ character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+ every character of `delimiter` is a potential split point.
+
+For example:
+  N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+  will be
+
+  indices = [0, 0;
+             0, 1;
+             1, 0;
+             1, 1;
+             1, 2]
+  shape = [2, 3]
+  values = ['hello', 'world', 'a', 'b', 'c']
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringToHashBucket.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringToHashBucket.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af49dbd161de09bfe2af3472ca25ba867d8a7580
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringToHashBucket.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "StringToHashBucket"
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor of the same shape as the input `string_tensor`.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+The number of buckets.
+END
+  }
+  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
+  description: <<END
+The hash function is deterministic on the content of the string within the
+process.
+
+Note that the hash function may change from time to time.
+This functionality will be deprecated and it's recommended to use
+`tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringToHashBucketFast.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketFast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a68d54a53412fbd67866bd47e0583e935b7eb541
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketFast.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "StringToHashBucketFast"
+  in_arg {
+    name: "input"
+    description: <<END
+The strings to assign a hash bucket.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor of the same shape as the input `string_tensor`.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+The number of buckets.
+END
+  }
+  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
+  description: <<END
+The hash function is deterministic on the content of the string within the
+process and will never change. However, it is not suitable for cryptography.
+This function may be used when CPU time is scarce and inputs are trusted or
+unimportant. There is a risk of adversaries constructing inputs that all hash
+to the same bucket. To prevent this problem, use a strong hash function with
+`tf.string_to_hash_bucket_strong`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b63fbd1ff9d9ab6d0701045d30887cea29f4e1a0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringToHashBucketStrong.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "StringToHashBucketStrong"
+  in_arg {
+    name: "input"
+    description: <<END
+The strings to assign a hash bucket.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor of the same shape as the input `string_tensor`.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+The number of buckets.
+END
+  }
+  attr {
+    name: "key"
+    description: <<END
+The key for the keyed hash function passed as a list of two uint64
+elements.
+END
+  }
+  summary: "Converts each string in the input Tensor to its hash mod by a number of buckets."
+  description: <<END
+The hash function is deterministic on the content of the string within the
+process. The hash function is a keyed hash function, where attribute `key`
+defines the key of the hash function. `key` is an array of 2 elements.
+
+A strong hash is important when inputs may be malicious, e.g. URLs with
+additional components. Adversaries could try to make their inputs hash to the
+same bucket for a denial-of-service attack or to skew the results. A strong
+hash prevents this by making it difficult, if not infeasible, to compute inputs
+that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+time than `tf.string_to_hash_bucket_fast`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StringToNumber.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringToNumber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6e0b1dc13d19bd9f77d52fc89080cb0e25c5274
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringToNumber.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "StringToNumber"
+  out_arg {
+    name: "output"
+    description: <<END
+A Tensor of the same shape as the input `string_tensor`.
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The numeric type to interpret each string in `string_tensor` as.
+END
+  }
+  summary: "Converts each string in the input Tensor to the specified numeric type."
+  description: <<END
+(Note that int32 overflow results in an error while float overflow
+results in a rounded value.)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sub.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73b82d6ac8f675af1802a72d8b9442d51e2c40fd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sub.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "Sub"
+  endpoint {
+    name: "Subtract"
+  }
+  endpoint {
+    name: "Sub"
+  }
+  summary: "Returns x - y element-wise."
+  description: <<END
+*NOTE*: `Sub` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt b/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8fc1e5cba39feb2066f2ee9f445c520e713a4792
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Substr.pbtxt
@@ -0,0 +1,103 @@
+op {
+  graph_op_name: "Substr"
+  in_arg {
+    name: "input"
+    description: <<END
+Tensor of strings
+END
+  }
+  in_arg {
+    name: "pos"
+    description: <<END
+Scalar defining the position of first character in each substring
+END
+  }
+  in_arg {
+    name: "len"
+    description: <<END
+Scalar defining the number of characters to include in each substring
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Tensor of substrings
+END
+  }
+  summary: "Return substrings from `Tensor` of strings."
+  description: <<END
+For each string in the input `Tensor`, creates a substring starting at index
+`pos` with a total length of `len`.
+
+If `len` defines a substring that would extend beyond the length of the input
+string, then as many characters as possible are used.
+
+If `pos` is negative or specifies a character index larger than any of the input
+strings, then an `InvalidArgumentError` is thrown.
+
+`pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+Op creation.
+
+*NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+
+---
+
+Examples
+
+Using scalar `pos` and `len`:
+
+```python
+input = [b'Hello', b'World']
+position = 1
+length = 3
+
+output = [b'ell', b'orl']
+```
+
+Using `pos` and `len` with same shape as `input`:
+
+```python
+input = [[b'ten', b'eleven', b'twelve'],
+         [b'thirteen', b'fourteen', b'fifteen'],
+         [b'sixteen', b'seventeen', b'eighteen']]
+position = [[1, 2, 3],
+            [1, 2, 3],
+            [1, 2, 3]]
+length =   [[2, 3, 4],
+            [4, 3, 2],
+            [5, 5, 5]]
+
+output = [[b'en', b'eve', b'lve'],
+          [b'hirt', b'urt', b'te'],
+          [b'ixtee', b'vente', b'hteen']]
+```
+
+Broadcasting `pos` and `len` onto `input`:
+
+```
+input = [[b'ten', b'eleven', b'twelve'],
+         [b'thirteen', b'fourteen', b'fifteen'],
+         [b'sixteen', b'seventeen', b'eighteen'],
+         [b'nineteen', b'twenty', b'twentyone']]
+position = [1, 2, 3]
+length =   [1, 2, 3]
+
+output = [[b'e', b'ev', b'lve'],
+          [b'h', b'ur', b'tee'],
+          [b'i', b've', b'hte'],
+          [b'i', b'en', b'nty']]
+```
+
+Broadcasting `input` onto `pos` and `len`:
+
+```
+input = b'thirteen'
+position = [1, 5, 7]
+length =   [3, 2, 1]
+
+output = [b'hir', b'ee', b'n']
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Sum.pbtxt b/tensorflow/core/api_def/base_api/api_def_Sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..295d5b86c088fde919ffe643f4f7d45a094d1776
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Sum.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "Sum"
+  endpoint {
+    name: "Sum"
+  }
+  endpoint {
+    name: "ReduceSum"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the sum of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Svd.pbtxt b/tensorflow/core/api_def/base_api/api_def_Svd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ec746a117a6a684c3a0d136ec9ce89ed948cae4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Svd.pbtxt
@@ -0,0 +1,62 @@
+op {
+  graph_op_name: "Svd"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+END
+  }
+  out_arg {
+    name: "s"
+    description: <<END
+Singular values. Shape is `[..., P]`.
+END
+  }
+  out_arg {
+    name: "u"
+    description: <<END
+Left singular vectors. If `full_matrices` is `False` then shape is
+`[..., M, P]`; if `full_matrices` is `True` then shape is
+`[..., M, M]`. Undefined if `compute_uv` is `False`.
+END
+  }
+  out_arg {
+    name: "v"
+    description: <<END
+Left singular vectors. If `full_matrices` is `False` then shape is
+`[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+Undefined if `compute_uv` is false.
+END
+  }
+  attr {
+    name: "compute_uv"
+    description: <<END
+If true, left and right singular vectors will be
+computed and returned in `u` and `v`, respectively.
+If false, `u` and `v` are not set and should never referenced.
+END
+  }
+  attr {
+    name: "full_matrices"
+    description: <<END
+If true, compute full-sized `u` and `v`. If false
+(the default), compute only the leading `P` singular vectors.
+Ignored if `compute_uv` is `False`.
+END
+  }
+  summary: "Computes the singular value decompositions of one or more matrices."
+  description: <<END
+Computes the SVD of each inner matrix in `input` such that
+`input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+
+```python
+# a is a tensor containing a batch of matrices.
+# s is a tensor of singular values for each matrix.
+# u is the tensor containing of left singular vectors for each matrix.
+# v is the tensor containing of right singular vectors for each matrix.
+s, u, v = svd(a)
+s, _, _ = svd(a, compute_uv=False)
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Switch.pbtxt b/tensorflow/core/api_def/base_api/api_def_Switch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b9206df74290b17caeec4e63c21abe988930b6c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Switch.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "Switch"
+  in_arg {
+    name: "data"
+    description: <<END
+The tensor to be forwarded to the appropriate output.
+END
+  }
+  in_arg {
+    name: "pred"
+    description: <<END
+A scalar that specifies which output port will receive data.
+END
+  }
+  out_arg {
+    name: "output_false"
+    description: <<END
+If `pred` is false, data will be forwarded to this output.
+END
+  }
+  out_arg {
+    name: "output_true"
+    description: <<END
+If `pred` is true, data will be forwarded to this output.
+END
+  }
+  summary: "Forwards `data` to the output port determined by `pred`."
+  description: <<END
+If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+the data goes to `output_false`.
+
+See also `RefSwitch` and `Merge`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SymbolicGradient.pbtxt b/tensorflow/core/api_def/base_api/api_def_SymbolicGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5cb6dbc122aac2d2c77299b3f136ed8aae2efe9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SymbolicGradient.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "SymbolicGradient"
+  in_arg {
+    name: "input"
+    description: <<END
+a list of input tensors of size N + M;
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+a list of output tensors of size N;
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+the type list for the input list.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+the type list for the input list.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+The function we want to compute the gradient for.
+
+The function 'f' must be a numerical function which takes N inputs and
+produces M outputs. Its gradient function 'g', which is computed by
+this SymbolicGradient op is a function taking N + M inputs and
+produces N outputs.
+
+I.e. if we have
+   (y1, y2, ..., y_M) = f(x1, x2, ..., x_N),
+then, g is
+   (dL/dx1, dL/dx2, ..., dL/dx_N) = g(x1, x2, ..., x_N,
+                                     dL/dy1, dL/dy2, ..., dL/dy_M),
+
+where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
+loss function). dL/dx_i is the partial derivative of L with respect
+to x_i.
+
+(Needs some math expert to say the comment above better.)
+END
+  }
+  summary: "Computes the gradient function for function f via backpropagation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_T.pbtxt b/tensorflow/core/api_def/base_api/api_def_T.pbtxt
deleted file mode 100644
index 8d1cbbcc066aee477e7eb1a39b6d8cddf84e05a0..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_T.pbtxt
+++ /dev/null
@@ -1,619 +0,0 @@
-op {
-  graph_op_name: "TFRecordDataset"
-  endpoint {
-    name: "TFRecordDataset"
-  }
-  summary: "Creates a dataset that emits the records from one or more TFRecord files."
-}
-op {
-  graph_op_name: "TFRecordReader"
-  endpoint {
-    name: "TFRecordReader"
-  }
-  summary: "A Reader that outputs the records from a TensorFlow Records file."
-}
-op {
-  graph_op_name: "TFRecordReaderV2"
-  endpoint {
-    name: "TFRecordReaderV2"
-  }
-  summary: "A Reader that outputs the records from a TensorFlow Records file."
-}
-op {
-  graph_op_name: "TakeDataset"
-  endpoint {
-    name: "TakeDataset"
-  }
-  summary: "Creates a dataset that contains `count` elements from the `input_dataset`."
-}
-op {
-  graph_op_name: "TakeManySparseFromTensorsMap"
-  endpoint {
-    name: "TakeManySparseFromTensorsMap"
-  }
-  summary: "Read `SparseTensors` from a `SparseTensorsMap` and concatenate them."
-  description: <<END
-The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-`N` is the minibatch size and the rows correspond to the output handles of
-`AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-original `SparseTensor` objects that went into the given input ops must all
-match.  When the final `SparseTensor` is created, it has rank one
-higher than the ranks of the incoming `SparseTensor` objects
-(they have been concatenated along a new row dimension on the left).
-
-The output `SparseTensor` object's shape values for all dimensions but the
-first are the max across the input `SparseTensor` objects' shape values
-for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-size.
-
-The input `SparseTensor` objects' indices are assumed ordered in
-standard lexicographic order.  If this is not the case, after this
-step run `SparseReorder` to restore index ordering.
-
-For example, if the handles represent an input, which is a `[2, 3]` matrix
-representing two original `SparseTensor` objects:
-
-```
-    index = [ 0]
-            [10]
-            [20]
-    values = [1, 2, 3]
-    shape = [50]
-```
-
-and
-
-```
-    index = [ 2]
-            [10]
-    values = [4, 5]
-    shape = [30]
-```
-
-then the final `SparseTensor` will be:
-
-```
-    index = [0  0]
-            [0 10]
-            [0 20]
-            [1  2]
-            [1 10]
-    values = [1, 2, 3, 4, 5]
-    shape = [2 50]
-```
-END
-}
-op {
-  graph_op_name: "Tan"
-  endpoint {
-    name: "Tan"
-  }
-  summary: "Computes tan of x element-wise."
-}
-op {
-  graph_op_name: "Tanh"
-  endpoint {
-    name: "Tanh"
-  }
-  summary: "Computes hyperbolic tangent of `x` element-wise."
-}
-op {
-  graph_op_name: "TanhGrad"
-  endpoint {
-    name: "TanhGrad"
-  }
-  summary: "Computes the gradient for the tanh of `x` wrt its input."
-  description: <<END
-Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-is the corresponding input gradient.
-END
-}
-op {
-  graph_op_name: "TemporaryVariable"
-  endpoint {
-    name: "TemporaryVariable"
-  }
-  summary: "Returns a tensor that may be mutated, but only persists within a single step."
-  description: <<END
-This is an experimental op for internal use only and it is possible to use this
-op in unsafe ways.  DO NOT USE unless you fully understand the risks.
-
-It is the caller's responsibility to ensure that 'ref' is eventually passed to a
-matching 'DestroyTemporaryVariable' op after all other uses have completed.
-
-Outputs a ref to the tensor state so it may be read or modified.
-
-  E.g.
-      var = state_ops._temporary_variable([1, 2], types.float_)
-      var_name = var.op.name
-      var = state_ops.assign(var, [[4.0, 5.0]])
-      var = state_ops.assign_add(var, [[6.0, 7.0]])
-      final = state_ops._destroy_temporary_variable(var, var_name=var_name)
-END
-}
-op {
-  graph_op_name: "TensorArray"
-  endpoint {
-    name: "TensorArray"
-  }
-}
-op {
-  graph_op_name: "TensorArrayClose"
-  endpoint {
-    name: "TensorArrayClose"
-  }
-}
-op {
-  graph_op_name: "TensorArrayCloseV2"
-  endpoint {
-    name: "TensorArrayCloseV2"
-  }
-  summary: "Deprecated. Use TensorArrayCloseV3"
-}
-op {
-  graph_op_name: "TensorArrayCloseV3"
-  endpoint {
-    name: "TensorArrayCloseV3"
-  }
-  summary: "Delete the TensorArray from its resource container."
-  description: <<END
-This enables the user to close and release the resource in the middle
-of a step/run.
-END
-}
-op {
-  graph_op_name: "TensorArrayConcat"
-  endpoint {
-    name: "TensorArrayConcat"
-  }
-}
-op {
-  graph_op_name: "TensorArrayConcatV2"
-  endpoint {
-    name: "TensorArrayConcatV2"
-  }
-  summary: "Deprecated. Use TensorArrayConcatV3"
-}
-op {
-  graph_op_name: "TensorArrayConcatV3"
-  endpoint {
-    name: "TensorArrayConcatV3"
-  }
-  summary: "Concat the elements from the TensorArray into value `value`."
-  description: <<END
-Takes `T` elements of shapes
-
-  ```
-  (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-  ```
-
-and concatenates them into a Tensor of shape:
-
-  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
-
-All elements must have the same shape (excepting the first dimension).
-END
-}
-op {
-  graph_op_name: "TensorArrayGather"
-  endpoint {
-    name: "TensorArrayGather"
-  }
-}
-op {
-  graph_op_name: "TensorArrayGatherV2"
-  endpoint {
-    name: "TensorArrayGatherV2"
-  }
-  summary: "Deprecated. Use TensorArrayGatherV3"
-}
-op {
-  graph_op_name: "TensorArrayGatherV3"
-  endpoint {
-    name: "TensorArrayGatherV3"
-  }
-  summary: "Gather specific elements from the TensorArray into output `value`."
-  description: <<END
-All elements selected by `indices` must have the same shape.
-END
-}
-op {
-  graph_op_name: "TensorArrayGrad"
-  endpoint {
-    name: "TensorArrayGrad"
-  }
-}
-op {
-  graph_op_name: "TensorArrayGradV2"
-  endpoint {
-    name: "TensorArrayGradV2"
-  }
-  summary: "Deprecated. Use TensorArrayGradV3"
-}
-op {
-  graph_op_name: "TensorArrayGradV3"
-  endpoint {
-    name: "TensorArrayGradV3"
-  }
-  summary: "Creates a TensorArray for storing the gradients of values in the given handle."
-  description: <<END
-If the given TensorArray gradient already exists, returns a reference to it.
-
-Locks the size of the original TensorArray by disabling its dynamic size flag.
-
-**A note about the input flow_in:**
-
-The handle flow_in forces the execution of the gradient lookup to occur
-only after certain other operations have occurred.  For example, when
-the forward TensorArray is dynamically sized, writes to this TensorArray
-may resize the object.  The gradient TensorArray is statically sized based
-on the size of the forward TensorArray when this operation executes.
-Furthermore, the size of the forward TensorArray is frozen by this call.
-As a result, the flow is used to ensure that the call to generate the gradient
-TensorArray only happens after all writes are executed.
-
-In the case of dynamically sized TensorArrays, gradient computation should
-only be performed on read operations that have themselves been chained via
-flow to occur only after all writes have executed. That way the final size
-of the forward TensorArray is known when this operation is called.
-
-**A note about the source attribute:**
-
-TensorArray gradient calls use an accumulator TensorArray object.  If
-multiple gradients are calculated and run in the same session, the multiple
-gradient nodes may accidentally flow through the same accumulator TensorArray.
-This double counts and generally breaks the TensorArray gradient flow.
-
-The solution is to identify which gradient call this particular
-TensorArray gradient is being called in.  This is performed by identifying
-a unique string (e.g. "gradients", "gradients_1", ...) from the input
-gradient Tensor's name.  This string is used as a suffix when creating
-the TensorArray gradient object here (the attribute `source`).
-
-The attribute `source` is added as a suffix to the forward TensorArray's
-name when performing the creation / lookup, so that each separate gradient
-calculation gets its own TensorArray accumulator.
-END
-}
-op {
-  graph_op_name: "TensorArrayPack"
-  endpoint {
-    name: "TensorArrayPack"
-  }
-}
-op {
-  graph_op_name: "TensorArrayRead"
-  endpoint {
-    name: "TensorArrayRead"
-  }
-}
-op {
-  graph_op_name: "TensorArrayReadV2"
-  endpoint {
-    name: "TensorArrayReadV2"
-  }
-  summary: "Deprecated. Use TensorArrayReadV3"
-}
-op {
-  graph_op_name: "TensorArrayReadV3"
-  endpoint {
-    name: "TensorArrayReadV3"
-  }
-  summary: "Read an element from the TensorArray into output `value`."
-}
-op {
-  graph_op_name: "TensorArrayScatter"
-  endpoint {
-    name: "TensorArrayScatter"
-  }
-}
-op {
-  graph_op_name: "TensorArrayScatterV2"
-  endpoint {
-    name: "TensorArrayScatterV2"
-  }
-  summary: "Deprecated. Use TensorArrayScatterV3"
-}
-op {
-  graph_op_name: "TensorArrayScatterV3"
-  endpoint {
-    name: "TensorArrayScatterV3"
-  }
-  summary: "Scatter the data from the input value into specific TensorArray elements."
-  description: <<END
-`indices` must be a vector, its length must match the first dim of `value`.
-END
-}
-op {
-  graph_op_name: "TensorArraySize"
-  endpoint {
-    name: "TensorArraySize"
-  }
-}
-op {
-  graph_op_name: "TensorArraySizeV2"
-  endpoint {
-    name: "TensorArraySizeV2"
-  }
-  summary: "Deprecated. Use TensorArraySizeV3"
-}
-op {
-  graph_op_name: "TensorArraySizeV3"
-  endpoint {
-    name: "TensorArraySizeV3"
-  }
-  summary: "Get the current size of the TensorArray."
-}
-op {
-  graph_op_name: "TensorArraySplit"
-  endpoint {
-    name: "TensorArraySplit"
-  }
-}
-op {
-  graph_op_name: "TensorArraySplitV2"
-  endpoint {
-    name: "TensorArraySplitV2"
-  }
-  summary: "Deprecated. Use TensorArraySplitV3"
-}
-op {
-  graph_op_name: "TensorArraySplitV3"
-  endpoint {
-    name: "TensorArraySplitV3"
-  }
-  summary: "Split the data from the input value into TensorArray elements."
-  description: <<END
-Assuming that `lengths` takes on values
-
-  ```(n0, n1, ..., n(T-1))```
-
-and that `value` has shape
-
-  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
-
-this splits values into a TensorArray with T tensors.
-
-TensorArray index t will be the subtensor of values with starting position
-
-  ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
-
-and having size
-
-  ```nt x d0 x d1 x ...```
-END
-}
-op {
-  graph_op_name: "TensorArrayUnpack"
-  endpoint {
-    name: "TensorArrayUnpack"
-  }
-}
-op {
-  graph_op_name: "TensorArrayV2"
-  endpoint {
-    name: "TensorArrayV2"
-  }
-  summary: "Deprecated. Use TensorArrayV3"
-}
-op {
-  graph_op_name: "TensorArrayV3"
-  endpoint {
-    name: "TensorArrayV3"
-  }
-  summary: "An array of Tensors of given size."
-  description: <<END
-Write data via Write and read via Read or Pack.
-END
-}
-op {
-  graph_op_name: "TensorArrayWrite"
-  endpoint {
-    name: "TensorArrayWrite"
-  }
-}
-op {
-  graph_op_name: "TensorArrayWriteV2"
-  endpoint {
-    name: "TensorArrayWriteV2"
-  }
-  summary: "Deprecated. Use TensorArrayGradV3"
-}
-op {
-  graph_op_name: "TensorArrayWriteV3"
-  endpoint {
-    name: "TensorArrayWriteV3"
-  }
-  summary: "Push an element onto the tensor_array."
-}
-op {
-  graph_op_name: "TensorDataset"
-  endpoint {
-    name: "TensorDataset"
-  }
-  summary: "Creates a dataset that emits `components` as a tuple of tensors once."
-}
-op {
-  graph_op_name: "TensorSliceDataset"
-  endpoint {
-    name: "TensorSliceDataset"
-  }
-  summary: "Creates a dataset that emits each dim-0 slice of `components` once."
-}
-op {
-  graph_op_name: "TensorSummary"
-  endpoint {
-    name: "TensorSummary"
-  }
-  summary: "Outputs a `Summary` protocol buffer with a tensor."
-  description: <<END
-This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-a tag as well as a serialized SummaryMetadata proto string that contains
-plugin-specific data. We will keep this op to maintain backwards compatibility.
-END
-}
-op {
-  graph_op_name: "TensorSummaryV2"
-  endpoint {
-    name: "TensorSummaryV2"
-  }
-  summary: "Outputs a `Summary` protocol buffer with a tensor and per-plugin data."
-}
-op {
-  graph_op_name: "TextLineDataset"
-  endpoint {
-    name: "TextLineDataset"
-  }
-  summary: "Creates a dataset that emits the lines of one or more text files."
-}
-op {
-  graph_op_name: "TextLineReader"
-  endpoint {
-    name: "TextLineReader"
-  }
-  summary: "A Reader that outputs the lines of a file delimited by \'\\n\'."
-}
-op {
-  graph_op_name: "TextLineReaderV2"
-  endpoint {
-    name: "TextLineReaderV2"
-  }
-  summary: "A Reader that outputs the lines of a file delimited by \'\\n\'."
-}
-op {
-  graph_op_name: "ThreadUnsafeUnigramCandidateSampler"
-  endpoint {
-    name: "ThreadUnsafeUnigramCandidateSampler"
-  }
-  summary: "Generates labels for candidate sampling with a learned unigram distribution."
-  description: <<END
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-END
-}
-op {
-  graph_op_name: "Tile"
-  endpoint {
-    name: "Tile"
-  }
-  summary: "Constructs a tensor by tiling a given tensor."
-  description: <<END
-This operation creates a new tensor by replicating `input` `multiples` times.
-The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-and the values of `input` are replicated `multiples[i]` times along the 'i'th
-dimension. For example, tiling `[a b c d]` by `[2]` produces
-`[a b c d a b c d]`.
-END
-}
-op {
-  graph_op_name: "TileGrad"
-  endpoint {
-    name: "TileGrad"
-  }
-  summary: "Returns the gradient of `Tile`."
-  description: <<END
-Since `Tile` takes an input and repeats the input `multiples` times
-along each dimension, `TileGrad` takes in `multiples` and aggregates
-each repeated tile of `input` into `output`.
-END
-}
-op {
-  graph_op_name: "TopK"
-  endpoint {
-    name: "TopK"
-  }
-  summary: "Finds values and indices of the `k` largest elements for the last dimension."
-  description: <<END
-If the input is a vector (rank-1), finds the `k` largest entries in the vector
-and outputs their values and indices as vectors.  Thus `values[j]` is the
-`j`-th largest entry in `input`, and its index is `indices[j]`.
-
-For matrices (resp. higher rank input), computes the top `k` entries in each
-row (resp. vector along the last dimension).  Thus,
-
-    values.shape = indices.shape = input.shape[:-1] + [k]
-
-If two elements are equal, the lower-index element appears first.
-
-If `k` varies dynamically, use `TopKV2` below.
-END
-}
-op {
-  graph_op_name: "TopKV2"
-  endpoint {
-    name: "TopKV2"
-  }
-  summary: "Finds values and indices of the `k` largest elements for the last dimension."
-  description: <<END
-If the input is a vector (rank-1), finds the `k` largest entries in the vector
-and outputs their values and indices as vectors.  Thus `values[j]` is the
-`j`-th largest entry in `input`, and its index is `indices[j]`.
-
-For matrices (resp. higher rank input), computes the top `k` entries in each
-row (resp. vector along the last dimension).  Thus,
-
-    values.shape = indices.shape = input.shape[:-1] + [k]
-
-If two elements are equal, the lower-index element appears first.
-END
-}
-op {
-  graph_op_name: "Transpose"
-  endpoint {
-    name: "Transpose"
-  }
-  summary: "Shuffle dimensions of x according to a permutation."
-  description: <<END
-The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-END
-}
-op {
-  graph_op_name: "TruncateDiv"
-  endpoint {
-    name: "TruncateDiv"
-  }
-  summary: "Returns x / y element-wise for integer types."
-  description: <<END
-Truncation designates that negative numbers will round fractional quantities
-toward zero. I.e. -7 / 5 = 1. This matches C semantics but it is different
-than Python semantics. See `FloorDiv` for a division function that matches
-Python Semantics.
-
-*NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "TruncateMod"
-  endpoint {
-    name: "TruncateMod"
-  }
-  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
-  description: <<END
-the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-y + truncate_mod(x, y) = x`.
-
-*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-END
-}
-op {
-  graph_op_name: "TruncatedNormal"
-  endpoint {
-    name: "TruncatedNormal"
-  }
-  summary: "Outputs random values from a truncated normal distribution."
-  description: <<END
-The generated values follow a normal distribution with mean 0 and standard
-deviation 1, except that values whose magnitude is more than 2 standard
-deviations from the mean are dropped and re-picked.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..80f64cebb1bef262146afdadd5c37b0a30277db0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "TFRecordDataset"
+  in_arg {
+    name: "filenames"
+    description: <<END
+A scalar or vector containing the name(s) of the file(s) to be
+read.
+END
+  }
+  in_arg {
+    name: "compression_type"
+    description: <<END
+A scalar containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+A scalar representing the number of bytes to buffer. A value of
+0 means no buffering will be performed.
+END
+  }
+  summary: "Creates a dataset that emits the records from one or more TFRecord files."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TFRecordReader.pbtxt b/tensorflow/core/api_def/base_api/api_def_TFRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..100e3467530cd9a9234afa14c3c726b912f8998f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TFRecordReader.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "TFRecordReader"
+  visibility: SKIP
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the records from a TensorFlow Records file."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TFRecordReaderV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TFRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f12ebe54effeb0b86b25cbacddd3379c65c1a058
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TFRecordReaderV2.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "TFRecordReaderV2"
+  endpoint {
+    name: "TFRecordReader"
+  }
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the records from a TensorFlow Records file."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8808dc6b1f0d0ae3a0e83f376eab245beaad2de1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "TakeDataset"
+  in_arg {
+    name: "count"
+    description: <<END
+A scalar representing the number of elements from the `input_dataset`
+that should be taken. A value of `-1` indicates that all of `input_dataset`
+is taken.
+END
+  }
+  summary: "Creates a dataset that contains `count` elements from the `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TakeManySparseFromTensorsMap.pbtxt b/tensorflow/core/api_def/base_api/api_def_TakeManySparseFromTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2073d7245140a666f0fe2ea5d0aa4b31ed775ef4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TakeManySparseFromTensorsMap.pbtxt
@@ -0,0 +1,100 @@
+op {
+  graph_op_name: "TakeManySparseFromTensorsMap"
+  in_arg {
+    name: "sparse_handles"
+    description: <<END
+1-D, The `N` serialized `SparseTensor` objects.
+Shape: `[N]`.
+END
+  }
+  out_arg {
+    name: "sparse_indices"
+    description: <<END
+2-D.  The `indices` of the minibatch `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "sparse_values"
+    description: <<END
+1-D.  The `values` of the minibatch `SparseTensor`.
+END
+  }
+  out_arg {
+    name: "sparse_shape"
+    description: <<END
+1-D.  The `shape` of the minibatch `SparseTensor`.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The `dtype` of the `SparseTensor` objects stored in the
+`SparseTensorsMap`.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+The container name for the `SparseTensorsMap` read by this op.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+The shared name for the `SparseTensorsMap` read by this op.
+It should not be blank; rather the `shared_name` or unique Operation name
+of the Op that created the original `SparseTensorsMap` should be used.
+END
+  }
+  summary: "Read `SparseTensors` from a `SparseTensorsMap` and concatenate them."
+  description: <<END
+The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+`N` is the minibatch size and the rows correspond to the output handles of
+`AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+original `SparseTensor` objects that went into the given input ops must all
+match.  When the final `SparseTensor` is created, it has rank one
+higher than the ranks of the incoming `SparseTensor` objects
+(they have been concatenated along a new row dimension on the left).
+
+The output `SparseTensor` object's shape values for all dimensions but the
+first are the max across the input `SparseTensor` objects' shape values
+for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+size.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the handles represent an input, which is a `[2, 3]` matrix
+representing two original `SparseTensor` objects:
+
+```
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
+```
+
+and
+
+```
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+```
+
+then the final `SparseTensor` will be:
+
+```
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Tan.pbtxt b/tensorflow/core/api_def/base_api/api_def_Tan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20f3e4eab3d175de7467f033f6f4b73089a824b2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Tan.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Tan"
+  summary: "Computes tan of x element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/base_api/api_def_Tanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3658ee641a71d471c4b8a203017dd385346292d9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Tanh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Tanh"
+  summary: "Computes hyperbolic tangent of `x` element-wise."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TanhGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_TanhGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef71385a2db66d23a53a4853b2807f1b575a500c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TanhGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "TanhGrad"
+  visibility: HIDDEN
+  summary: "Computes the gradient for the tanh of `x` wrt its input."
+  description: <<END
+Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+is the corresponding input gradient.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TemporaryVariable.pbtxt b/tensorflow/core/api_def/base_api/api_def_TemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3a41f69aa24f89121c48b9e8c5c530ccb296a8e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TemporaryVariable.pbtxt
@@ -0,0 +1,45 @@
+op {
+  graph_op_name: "TemporaryVariable"
+  out_arg {
+    name: "ref"
+    description: <<END
+A reference to the variable tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the variable tensor.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the variable tensor.
+END
+  }
+  attr {
+    name: "var_name"
+    description: <<END
+Overrides the name used for the temporary variable resource. Default
+value is the name of the 'TemporaryVariable' op (which is guaranteed unique).
+END
+  }
+  summary: "Returns a tensor that may be mutated, but only persists within a single step."
+  description: <<END
+This is an experimental op for internal use only and it is possible to use this
+op in unsafe ways.  DO NOT USE unless you fully understand the risks.
+
+It is the caller's responsibility to ensure that 'ref' is eventually passed to a
+matching 'DestroyTemporaryVariable' op after all other uses have completed.
+
+Outputs a ref to the tensor state so it may be read or modified.
+
+  E.g.
+      var = state_ops._temporary_variable([1, 2], types.float_)
+      var_name = var.op.name
+      var = state_ops.assign(var, [[4.0, 5.0]])
+      var = state_ops.assign_add(var, [[6.0, 7.0]])
+      final = state_ops._destroy_temporary_variable(var, var_name=var_name)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArray.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArray.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7eaa468130a120f21c2a0f5578b5eb512bd72894
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArray.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArray"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayClose.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e866250d3ab3a506164824bcb339fe87b6b160a8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayClose.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayClose"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayCloseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec784c94fb7802d3ceb0ed0b7a6470b5636d5fac
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayCloseV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayCloseV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayCloseV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayCloseV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayCloseV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e469e4c07b9f3948e221c0efb0c8e33106ed6cc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayCloseV3.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorArrayCloseV3"
+  endpoint {
+    name: "TensorArrayClose"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+END
+  }
+  summary: "Delete the TensorArray from its resource container."
+  description: <<END
+This enables the user to close and release the resource in the middle
+of a step/run.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e72b58de1c0aec0adddc27c955f3b3b4e2b5cc2d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayConcat.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayConcat"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayConcatV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..289b1ba3870c4b3e3715ed0d1107fc06be173032
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayConcatV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayConcatV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayConcatV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayConcatV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayConcatV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..502323b277aa6c66c87f3cfff1ecbbbdea9f7a90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayConcatV3.pbtxt
@@ -0,0 +1,62 @@
+op {
+  graph_op_name: "TensorArrayConcatV3"
+  endpoint {
+    name: "TensorArrayConcat"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "value"
+    description: <<END
+All of the elements in the TensorArray, concatenated along the first
+axis.
+END
+  }
+  out_arg {
+    name: "lengths"
+    description: <<END
+A vector of the row sizes of the original T elements in the
+value output.  In the example above, this would be the values:
+`(n1, n2, ..., n(T-1))`.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the elem that is returned.
+END
+  }
+  attr {
+    name: "element_shape_except0"
+    description: <<END
+The expected shape of an element, if known,
+excluding the first dimension. Used to validate the shapes of
+TensorArray elements. If this shape is not fully specified, concatenating
+zero-size TensorArrays is an error.
+END
+  }
+  summary: "Concat the elements from the TensorArray into value `value`."
+  description: <<END
+Takes `T` elements of shapes
+
+  ```
+  (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+  ```
+
+and concatenates them into a Tensor of shape:
+
+  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+
+All elements must have the same shape (excepting the first dimension).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4d179874f6abbc76ab779c58bbc56f018981c94
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayGather"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df178020260437b5846241c08b8c93d5ab2fc097
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGatherV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayGatherV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayGatherV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGatherV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGatherV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44b4cd8143ff253c90904655e18d732003a72c02
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGatherV3.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "TensorArrayGatherV3"
+  endpoint {
+    name: "TensorArrayGather"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+The locations in the TensorArray from which to read tensor elements.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "value"
+    description: <<END
+All of the elements in the TensorArray, concatenated along a new
+axis (the new dimension 0).
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the elem that is returned.
+END
+  }
+  attr {
+    name: "element_shape"
+    description: <<END
+The expected shape of an element, if known. Used to
+validate the shapes of TensorArray elements. If this shape is not
+fully specified, gathering zero-size TensorArrays is an error.
+END
+  }
+  summary: "Gather specific elements from the TensorArray into output `value`."
+  description: <<END
+All elements selected by `indices` must have the same shape.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..517461edba243d709c663768a2223bcedb320afe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayGrad"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGradV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..846aa705dbb0e664cb123038575f571de736ac8e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGradV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayGradV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayGradV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGradV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGradV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60634a0c8e7ab5f77805b65644c9ad78279cc70a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGradV3.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "TensorArrayGradV3"
+  endpoint {
+    name: "TensorArrayGrad"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to the forward TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  attr {
+    name: "source"
+    description: <<END
+The gradient source string, used to decide which gradient TensorArray
+to return.
+END
+  }
+  summary: "Creates a TensorArray for storing the gradients of values in the given handle."
+  description: <<END
+If the given TensorArray gradient already exists, returns a reference to it.
+
+Locks the size of the original TensorArray by disabling its dynamic size flag.
+
+**A note about the input flow_in:**
+
+The handle flow_in forces the execution of the gradient lookup to occur
+only after certain other operations have occurred.  For example, when
+the forward TensorArray is dynamically sized, writes to this TensorArray
+may resize the object.  The gradient TensorArray is statically sized based
+on the size of the forward TensorArray when this operation executes.
+Furthermore, the size of the forward TensorArray is frozen by this call.
+As a result, the flow is used to ensure that the call to generate the gradient
+TensorArray only happens after all writes are executed.
+
+In the case of dynamically sized TensorArrays, gradient computation should
+only be performed on read operations that have themselves been chained via
+flow to occur only after all writes have executed. That way the final size
+of the forward TensorArray is known when this operation is called.
+
+**A note about the source attribute:**
+
+TensorArray gradient calls use an accumulator TensorArray object.  If
+multiple gradients are calculated and run in the same session, the multiple
+gradient nodes may accidentally flow through the same accumulator TensorArray.
+This double counts and generally breaks the TensorArray gradient flow.
+
+The solution is to identify which gradient call this particular
+TensorArray gradient is being called in.  This is performed by identifying
+a unique string (e.g. "gradients", "gradients_1", ...) from the input
+gradient Tensor's name.  This string is used as a suffix when creating
+the TensorArray gradient object here (the attribute `source`).
+
+The attribute `source` is added as a suffix to the forward TensorArray's
+name when performing the creation / lookup, so that each separate gradient
+calculation gets its own TensorArray accumulator.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayPack.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayPack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..030950b06fcbf2fd5e0c2eed99e154640e0adbec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayPack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayPack"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayRead.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b62f7fac7c292e1efcc32b8211635df97127f98
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayRead.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayRead"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayReadV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..934d7e432a6595682eeb231ff131e7fbdd4483e9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayReadV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayReadV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayReadV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayReadV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayReadV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f07182f2b0fd94c6f841ceae14e431a5ff44fbd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayReadV3.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "TensorArrayReadV3"
+  endpoint {
+    name: "TensorArrayRead"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "value"
+    description: <<END
+The tensor that is read from the TensorArray.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the elem that is returned.
+END
+  }
+  summary: "Read an element from the TensorArray into output `value`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayScatter.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayScatter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3e8d1625e96f50238ec34650cd2027804969e57
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayScatter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayScatter"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayScatterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa74b6af6ac9c3691b46b7c6316e7de6fa84f11d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayScatterV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayScatterV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayScatterV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayScatterV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayScatterV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..69539e82593bcdf4e5375021fb251a66f4171e14
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayScatterV3.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "TensorArrayScatterV3"
+  endpoint {
+    name: "TensorArrayScatter"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+The locations at which to write the tensor elements.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The concatenated tensor to write to the TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "flow_out"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  summary: "Scatter the data from the input value into specific TensorArray elements."
+  description: <<END
+`indices` must be a vector, its length must match the first dim of `value`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArraySize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArraySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb3a6fae1c4cc95b8ccb2a020658c9d23bd1f69b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArraySize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArraySize"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArraySizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArraySizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9c74832360f30d68f6d3f01fa21f51b7d71c675
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArraySizeV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArraySizeV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArraySizeV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArraySizeV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArraySizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76a7c8804f3848343a1ad432d411deecefedf678
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArraySizeV3.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "TensorArraySizeV3"
+  endpoint {
+    name: "TensorArraySize"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "size"
+    description: <<END
+The current size of the TensorArray.
+END
+  }
+  summary: "Get the current size of the TensorArray."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArraySplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArraySplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3eb8d6c7ffa8641822b5b93bebf06a47c1bf288b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArraySplit.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArraySplit"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArraySplitV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArraySplitV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15a0b18d04a4842c5e9aee7cb85a45f8cc30df7c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArraySplitV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArraySplitV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArraySplitV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArraySplitV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArraySplitV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2aeb4f660129e82f6a17b7f5665803881ce7452
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArraySplitV3.pbtxt
@@ -0,0 +1,57 @@
+op {
+  graph_op_name: "TensorArraySplitV3"
+  endpoint {
+    name: "TensorArraySplit"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The concatenated tensor to write to the TensorArray.
+END
+  }
+  in_arg {
+    name: "lengths"
+    description: <<END
+The vector of lengths, how to split the rows of value into the
+TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "flow_out"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  summary: "Split the data from the input value into TensorArray elements."
+  description: <<END
+Assuming that `lengths` takes on values
+
+  ```(n0, n1, ..., n(T-1))```
+
+and that `value` has shape
+
+  ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+
+this splits values into a TensorArray with T tensors.
+
+TensorArray index t will be the subtensor of values with starting position
+
+  ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+
+and having size
+
+  ```nt x d0 x d1 x ...```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayUnpack.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayUnpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9011de23ea12eee3de3f3ba83ff86907c6c967b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayUnpack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayUnpack"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4d58e7721071aa954c395e528a4eb761aa2524e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48ac6f5e7def2e19434660f96798aa1bc834c25e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayV3.pbtxt
@@ -0,0 +1,76 @@
+op {
+  graph_op_name: "TensorArrayV3"
+  endpoint {
+    name: "TensorArray"
+  }
+  in_arg {
+    name: "size"
+    description: <<END
+The size of the array.
+END
+  }
+  out_arg {
+    name: "handle"
+    description: <<END
+The handle to the TensorArray.
+END
+  }
+  out_arg {
+    name: "flow"
+    description: <<END
+A scalar used to control gradient flow.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the elements on the tensor_array.
+END
+  }
+  attr {
+    name: "element_shape"
+    description: <<END
+The expected shape of an element, if known. Used to
+validate the shapes of TensorArray elements. If this shape is not
+fully specified, gathering zero-size TensorArrays is an error.
+END
+  }
+  attr {
+    name: "dynamic_size"
+    description: <<END
+A boolean that determines whether writes to the TensorArray
+are allowed to grow the size.  By default, this is not allowed.
+END
+  }
+  attr {
+    name: "clear_after_read"
+    description: <<END
+If true (default), Tensors in the TensorArray are cleared
+after being read.  This disables multiple read semantics but allows early
+release of memory.
+END
+  }
+  attr {
+    name: "identical_element_shapes"
+    description: <<END
+If true (default is false), then all
+elements in the TensorArray will be expected to have have identical shapes.
+This allows certain behaviors, like dynamically checking for
+consistent shapes on write, and being able to fill in properly
+shaped zero tensors on stack -- even if the element_shape attribute
+is not fully defined.
+END
+  }
+  attr {
+    name: "tensor_array_name"
+    description: <<END
+Overrides the name used for the temporary tensor_array
+resource. Default value is the name of the 'TensorArray' op (which
+is guaranteed unique).
+END
+  }
+  summary: "An array of Tensors of given size."
+  description: <<END
+Write data via Write and read via Read or Pack.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayWrite.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayWrite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..92ab1764ec8fa0524c8ce102f765fc200f1b44dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayWrite.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayWrite"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayWriteV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayWriteV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7af8c3ab283c3b657eb06469933e06c4da99e62
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayWriteV2.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorArrayWriteV2"
+  visibility: SKIP
+  summary: "Deprecated. Use TensorArrayGradV3"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayWriteV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayWriteV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..312b4b472d9df17b6c8e7f4ce3aa890123d2086c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayWriteV3.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "TensorArrayWriteV3"
+  endpoint {
+    name: "TensorArrayWrite"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to a TensorArray.
+END
+  }
+  in_arg {
+    name: "index"
+    description: <<END
+The position to write to inside the TensorArray.
+END
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+The tensor to write to the TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  out_arg {
+    name: "flow_out"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  summary: "Push an element onto the tensor_array."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..050e174aacb12b415357437e7f989b09faf40621
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorDataset"
+  summary: "Creates a dataset that emits `components` as a tuple of tensors once."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a26a98fd7f3a6564309efd28dff8c2bc93d7a67f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorSliceDataset"
+  summary: "Creates a dataset that emits each dim-0 slice of `components` once."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7601e7e162f433d9fbd0a2c8ac2e226d89a0a3f6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSummary.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "TensorSummary"
+  in_arg {
+    name: "tensor"
+    description: <<END
+A tensor to serialize.
+END
+  }
+  attr {
+    name: "description"
+    description: <<END
+A json-encoded SummaryDescription proto.
+END
+  }
+  attr {
+    name: "labels"
+    description: <<END
+An unused list of strings.
+END
+  }
+  attr {
+    name: "display_name"
+    description: <<END
+An unused string.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with a tensor."
+  description: <<END
+This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+a tag as well as a serialized SummaryMetadata proto string that contains
+plugin-specific data. We will keep this op to maintain backwards compatibility.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSummaryV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e03c5dc059084700b7ad8bdc00f2429095b6250
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSummaryV2.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "TensorSummaryV2"
+  in_arg {
+    name: "tag"
+    description: <<END
+A string attached to this summary. Used for organization in TensorBoard.
+END
+  }
+  in_arg {
+    name: "tensor"
+    description: <<END
+A tensor to serialize.
+END
+  }
+  in_arg {
+    name: "serialized_summary_metadata"
+    description: <<END
+A serialized SummaryMetadata proto. Contains plugin
+data.
+END
+  }
+  summary: "Outputs a `Summary` protocol buffer with a tensor and per-plugin data."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b630509964ed56ecaf401b10a46c5e53cd46528
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "TextLineDataset"
+  in_arg {
+    name: "filenames"
+    description: <<END
+A scalar or a vector containing the name(s) of the file(s) to be
+read.
+END
+  }
+  in_arg {
+    name: "compression_type"
+    description: <<END
+A scalar containing either (i) the empty string (no
+compression), (ii) "ZLIB", or (iii) "GZIP".
+END
+  }
+  in_arg {
+    name: "buffer_size"
+    description: <<END
+A scalar containing the number of bytes to buffer.
+END
+  }
+  summary: "Creates a dataset that emits the lines of one or more text files."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TextLineReader.pbtxt b/tensorflow/core/api_def/base_api/api_def_TextLineReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74ed1da8ff505fc724dfbeaa6645269bf239e3e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TextLineReader.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "TextLineReader"
+  visibility: SKIP
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "skip_header_lines"
+    description: <<END
+Number of lines to skip from the beginning of every file.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the lines of a file delimited by \'\\n\'."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TextLineReaderV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TextLineReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0de7655b74468afda7ce38ca470d67eb4b266955
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TextLineReaderV2.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "TextLineReaderV2"
+  endpoint {
+    name: "TextLineReader"
+  }
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "skip_header_lines"
+    description: <<END
+Number of lines to skip from the beginning of every file.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the lines of a file delimited by \'\\n\'."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/base_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2619aae806d4b1e42e0bca25f98ef5c9e908d264
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "ThreadUnsafeUnigramCandidateSampler"
+  visibility: SKIP
+  in_arg {
+    name: "true_classes"
+    description: <<END
+A batch_size * num_true matrix, in which each row contains the
+IDs of the num_true target_classes in the corresponding original label.
+END
+  }
+  out_arg {
+    name: "sampled_candidates"
+    description: <<END
+A vector of length num_sampled, in which each element is
+the ID of a sampled candidate.
+END
+  }
+  out_arg {
+    name: "true_expected_count"
+    description: <<END
+A batch_size * num_true matrix, representing
+the number of times each candidate is expected to occur in a batch
+of sampled candidates. If unique=true, then this is a probability.
+END
+  }
+  out_arg {
+    name: "sampled_expected_count"
+    description: <<END
+A vector of length num_sampled, for each sampled
+candidate representing the number of times the candidate is expected
+to occur in a batch of sampled candidates.  If unique=true, then this is a
+probability.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "num_sampled"
+    description: <<END
+Number of candidates to randomly sample.
+END
+  }
+  attr {
+    name: "unique"
+    description: <<END
+If unique is true, we sample with rejection, so that all sampled
+candidates in a batch are unique. This requires some approximation to
+estimate the post-rejection sampling probabilities.
+END
+  }
+  attr {
+    name: "range_max"
+    description: <<END
+The sampler will sample integers from the interval [0, range_max).
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Generates labels for candidate sampling with a learned unigram distribution."
+  description: <<END
+See explanations of candidate sampling and the data formats at
+go/candidate-sampling.
+
+For each batch, this op picks a single set of sampled candidate labels.
+
+The advantages of sampling candidates per-batch are simplicity and the
+possibility of efficient dense matrix multiplication. The disadvantage is that
+the sampled candidates must be chosen independently of the context and of the
+true labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Tile.pbtxt b/tensorflow/core/api_def/base_api/api_def_Tile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97e1cae19c0e94fd40424538974b35e86b4a9ba5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Tile.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "Tile"
+  in_arg {
+    name: "input"
+    description: <<END
+1-D or higher.
+END
+  }
+  in_arg {
+    name: "multiples"
+    description: <<END
+1-D. Length must be the same as the number of dimensions in `input`
+END
+  }
+  summary: "Constructs a tensor by tiling a given tensor."
+  description: <<END
+This operation creates a new tensor by replicating `input` `multiples` times.
+The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+and the values of `input` are replicated `multiples[i]` times along the 'i'th
+dimension. For example, tiling `[a b c d]` by `[2]` produces
+`[a b c d a b c d]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TileGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_TileGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b211534259f766ecab4d2839c6dc6eaa5ea0d2b5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TileGrad.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "TileGrad"
+  summary: "Returns the gradient of `Tile`."
+  description: <<END
+Since `Tile` takes an input and repeats the input `multiples` times
+along each dimension, `TileGrad` takes in `multiples` and aggregates
+each repeated tile of `input` into `output`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TopK.pbtxt b/tensorflow/core/api_def/base_api/api_def_TopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4060d0afab54c7f887fc677fc26bb512fa3c7d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TopK.pbtxt
@@ -0,0 +1,50 @@
+op {
+  graph_op_name: "TopK"
+  in_arg {
+    name: "input"
+    description: <<END
+1-D or higher with last dimension at least `k`.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+The `k` largest elements along each last dimensional slice.
+END
+  }
+  out_arg {
+    name: "indices"
+    description: <<END
+The indices of `values` within the last dimension of `input`.
+END
+  }
+  attr {
+    name: "k"
+    description: <<END
+Number of top elements to look for along the last dimension (along each
+row for matrices).
+END
+  }
+  attr {
+    name: "sorted"
+    description: <<END
+If true the resulting `k` elements will be sorted by the values in
+descending order.
+END
+  }
+  summary: "Finds values and indices of the `k` largest elements for the last dimension."
+  description: <<END
+If the input is a vector (rank-1), finds the `k` largest entries in the vector
+and outputs their values and indices as vectors.  Thus `values[j]` is the
+`j`-th largest entry in `input`, and its index is `indices[j]`.
+
+For matrices (resp. higher rank input), computes the top `k` entries in each
+row (resp. vector along the last dimension).  Thus,
+
+    values.shape = indices.shape = input.shape[:-1] + [k]
+
+If two elements are equal, the lower-index element appears first.
+
+If `k` varies dynamically, use `TopKV2` below.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TopKV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd17df16a29060f3bbbd89cf4de96ee002ded87a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TopKV2.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "TopKV2"
+  endpoint {
+    name: "TopK"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+1-D or higher with last dimension at least `k`.
+END
+  }
+  in_arg {
+    name: "k"
+    description: <<END
+0-D.  Number of top elements to look for along the last dimension (along each
+row for matrices).
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+The `k` largest elements along each last dimensional slice.
+END
+  }
+  out_arg {
+    name: "indices"
+    description: <<END
+The indices of `values` within the last dimension of `input`.
+END
+  }
+  attr {
+    name: "sorted"
+    description: <<END
+If true the resulting `k` elements will be sorted by the values in
+descending order.
+END
+  }
+  summary: "Finds values and indices of the `k` largest elements for the last dimension."
+  description: <<END
+If the input is a vector (rank-1), finds the `k` largest entries in the vector
+and outputs their values and indices as vectors.  Thus `values[j]` is the
+`j`-th largest entry in `input`, and its index is `indices[j]`.
+
+For matrices (resp. higher rank input), computes the top `k` entries in each
+row (resp. vector along the last dimension).  Thus,
+
+    values.shape = indices.shape = input.shape[:-1] + [k]
+
+If two elements are equal, the lower-index element appears first.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Transpose.pbtxt b/tensorflow/core/api_def/base_api/api_def_Transpose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ec7fae659698d55a4429f8538c22b5df3258d81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Transpose.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Transpose"
+  summary: "Shuffle dimensions of x according to a permutation."
+  description: <<END
+The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+  `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/base_api/api_def_TruncateDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef1b9873139f57425634d1bc1d715885dff22540
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TruncateDiv.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TruncateDiv"
+  summary: "Returns x / y element-wise for integer types."
+  description: <<END
+Truncation designates that negative numbers will round fractional quantities
+toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+than Python semantics. See `FloorDiv` for a division function that matches
+Python Semantics.
+
+*NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/base_api/api_def_TruncateMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..804f70ab52fb07ef940ee405b5f597a846c83757
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TruncateMod.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "TruncateMod"
+  summary: "Returns element-wise remainder of division. This emulates C semantics in that"
+  description: <<END
+the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+y + truncate_mod(x, y) = x`.
+
+*NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TruncatedNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_TruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3da930d6f8ad252d55a4d1a96124fb7b7996fc89
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TruncatedNormal.pbtxt
@@ -0,0 +1,42 @@
+op {
+  graph_op_name: "TruncatedNormal"
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random truncated normal
+values.
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either `seed` or `seed2` are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+A second seed to avoid seed collision.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a truncated normal distribution."
+  description: <<END
+The generated values follow a normal distribution with mean 0 and standard
+deviation 1, except that values whose magnitude is more than 2 standard
+deviations from the mean are dropped and re-picked.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_U.pbtxt b/tensorflow/core/api_def/base_api/api_def_U.pbtxt
deleted file mode 100644
index 6699efc0e090fa120a26758463e90ae35765dc81..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_U.pbtxt
+++ /dev/null
@@ -1,150 +0,0 @@
-op {
-  graph_op_name: "UniformCandidateSampler"
-  endpoint {
-    name: "UniformCandidateSampler"
-  }
-  summary: "Generates labels for candidate sampling with a uniform distribution."
-  description: <<END
-See explanations of candidate sampling and the data formats at
-go/candidate-sampling.
-
-For each batch, this op picks a single set of sampled candidate labels.
-
-The advantages of sampling candidates per-batch are simplicity and the
-possibility of efficient dense matrix multiplication. The disadvantage is that
-the sampled candidates must be chosen independently of the context and of the
-true labels.
-END
-}
-op {
-  graph_op_name: "Unique"
-  endpoint {
-    name: "Unique"
-  }
-  summary: "Finds unique elements in a 1-D tensor."
-  description: <<END
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. In other words:
-
-`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-
-For example:
-
-```
-# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-y, idx = unique(x)
-y ==> [1, 2, 4, 7, 8]
-idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-```
-END
-}
-op {
-  graph_op_name: "UniqueWithCounts"
-  endpoint {
-    name: "UniqueWithCounts"
-  }
-  summary: "Finds unique elements in a 1-D tensor."
-  description: <<END
-This operation returns a tensor `y` containing all of the unique elements of `x`
-sorted in the same order that they occur in `x`. This operation also returns a
-tensor `idx` the same size as `x` that contains the index of each value of `x`
-in the unique output `y`. Finally, it returns a third tensor `count` that
-contains the count of each element of `y` in `x`. In other words:
-
-`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-
-For example:
-
-```
-# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-y, idx, count = unique_with_counts(x)
-y ==> [1, 2, 4, 7, 8]
-idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-count ==> [2, 1, 3, 1, 2]
-```
-END
-}
-op {
-  graph_op_name: "Unpack"
-  endpoint {
-    name: "Unpack"
-  }
-  summary: "Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors."
-  description: <<END
-Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-For example, given a tensor of shape `(A, B, C, D)`;
-
-If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-  and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-  dimension unpacked along is gone, unlike `split`).
-
-If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-  and each tensor in `output` will have shape `(A, C, D)`.
-Etc.
-
-This is the opposite of `pack`.
-END
-}
-op {
-  graph_op_name: "UnsortedSegmentMax"
-  endpoint {
-    name: "UnsortedSegmentMax"
-  }
-  summary: "Computes the Max along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-Instead of computing the sum over segments, it computes the maximum
-such that:
-
-\\(output_i = \max_j data_j\\) where max is over `j` such
-that `segment_ids[j] == i`.
-
-If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
- `output[i] = numeric_limits<T>::min()`.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "UnsortedSegmentSum"
-  endpoint {
-    name: "UnsortedSegmentSum"
-  }
-  summary: "Computes the sum along segments of a tensor."
-  description: <<END
-Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-segments.
-
-Computes a tensor such that
-`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-need not be sorted and need not cover all values in the full
-range of valid values.
-
-If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-
-`num_segments` should equal the number of distinct segment IDs.
-
-<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-</div>
-END
-}
-op {
-  graph_op_name: "Unstage"
-  endpoint {
-    name: "Unstage"
-  }
-  summary: "Op is similar to a lightweight Dequeue."
-  description: <<END
-The basic functionality is similar to dequeue with many fewer
-capabilities and options.  This Op is optimized for performance.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniformCandidateSampler.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4cf431a2e121cbac943331ab069168716f17685b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniformCandidateSampler.pbtxt
@@ -0,0 +1,86 @@
+op {
+  graph_op_name: "UniformCandidateSampler"
+  in_arg {
+    name: "true_classes"
+    description: <<END
+A batch_size * num_true matrix, in which each row contains the
+IDs of the num_true target_classes in the corresponding original label.
+END
+  }
+  out_arg {
+    name: "sampled_candidates"
+    description: <<END
+A vector of length num_sampled, in which each element is
+the ID of a sampled candidate.
+END
+  }
+  out_arg {
+    name: "true_expected_count"
+    description: <<END
+A batch_size * num_true matrix, representing
+the number of times each candidate is expected to occur in a batch
+of sampled candidates. If unique=true, then this is a probability.
+END
+  }
+  out_arg {
+    name: "sampled_expected_count"
+    description: <<END
+A vector of length num_sampled, for each sampled
+candidate representing the number of times the candidate is expected
+to occur in a batch of sampled candidates.  If unique=true, then this is a
+probability.
+END
+  }
+  attr {
+    name: "num_true"
+    description: <<END
+Number of true labels per context.
+END
+  }
+  attr {
+    name: "num_sampled"
+    description: <<END
+Number of candidates to randomly sample.
+END
+  }
+  attr {
+    name: "unique"
+    description: <<END
+If unique is true, we sample with rejection, so that all sampled
+candidates in a batch are unique. This requires some approximation to
+estimate the post-rejection sampling probabilities.
+END
+  }
+  attr {
+    name: "range_max"
+    description: <<END
+The sampler will sample integers from the interval [0, range_max).
+END
+  }
+  attr {
+    name: "seed"
+    description: <<END
+If either seed or seed2 are set to be non-zero, the random number
+generator is seeded by the given seed.  Otherwise, it is seeded by a
+random seed.
+END
+  }
+  attr {
+    name: "seed2"
+    description: <<END
+An second seed to avoid seed collision.
+END
+  }
+  summary: "Generates labels for candidate sampling with a uniform distribution."
+  description: <<END
+See explanations of candidate sampling and the data formats at
+go/candidate-sampling.
+
+For each batch, this op picks a single set of sampled candidate labels.
+
+The advantages of sampling candidates per-batch are simplicity and the
+possibility of efficient dense matrix multiplication. The disadvantage is that
+the sampled candidates must be chosen independently of the context and of the
+true labels.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Unique.pbtxt b/tensorflow/core/api_def/base_api/api_def_Unique.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a35b67e7b5a14abf28141b8a00e7b5dc61932bdf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Unique.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "Unique"
+  in_arg {
+    name: "x"
+    description: <<END
+1-D.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+1-D.
+END
+  }
+  out_arg {
+    name: "idx"
+    description: <<END
+1-D.
+END
+  }
+  summary: "Finds unique elements in a 1-D tensor."
+  description: <<END
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx = unique(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd7ec6e5518c5a7788bb4fff88a38b74295e9df4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueV2.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "UniqueV2"
+  in_arg {
+    name: "x"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "axis"
+    description: <<END
+A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+find the unique elements.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+END
+  }
+  out_arg {
+    name: "idx"
+    description: <<END
+A 1-D Tensor. Has the same type as x that contains the index of each
+value of x in the output y.
+END
+  }
+  summary: "Finds unique elements in a 1-D tensor."
+  description: <<END
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx = unique(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UniqueWithCounts.pbtxt b/tensorflow/core/api_def/base_api/api_def_UniqueWithCounts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02d670644f094410675642456a89ce6e2a77cb00
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UniqueWithCounts.pbtxt
@@ -0,0 +1,47 @@
+op {
+  graph_op_name: "UniqueWithCounts"
+  in_arg {
+    name: "x"
+    description: <<END
+1-D.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+1-D.
+END
+  }
+  out_arg {
+    name: "idx"
+    description: <<END
+1-D.
+END
+  }
+  out_arg {
+    name: "count"
+    description: <<END
+1-D.
+END
+  }
+  summary: "Finds unique elements in a 1-D tensor."
+  description: <<END
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. Finally, it returns a third tensor `count` that
+contains the count of each element of `y` in `x`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx, count = unique_with_counts(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+count ==> [2, 1, 3, 1, 2]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Unpack.pbtxt b/tensorflow/core/api_def/base_api/api_def_Unpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..716aa73956cdae3ce3bf5cacb133faecd1dc61f8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Unpack.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "Unpack"
+  endpoint {
+    name: "Unstack"
+  }
+  in_arg {
+    name: "value"
+    description: <<END
+1-D or higher, with `axis` dimension size equal to `num`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The list of tensors unpacked from `value`.
+END
+  }
+  attr {
+    name: "axis"
+    description: <<END
+Dimension along which to unpack.  Negative values wrap around, so the
+valid range is `[-R, R)`.
+END
+  }
+  summary: "Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors."
+  description: <<END
+Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+For example, given a tensor of shape `(A, B, C, D)`;
+
+If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+  and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+  dimension unpacked along is gone, unlike `split`).
+
+If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+  and each tensor in `output` will have shape `(A, C, D)`.
+Etc.
+
+This is the opposite of `pack`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8298d62f253160847ee34bcdea5a81c7370e5124
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "UnsortedSegmentMax"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A 1-D tensor whose rank is equal to the rank of `data`'s
+first dimension.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for dimension 0 which
+has size `num_segments`.
+END
+  }
+  summary: "Computes the Max along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+Instead of computing the sum over segments, it computes the maximum
+such that:
+
+\\(output_i = \max_j data_j\\) where max is over `j` such
+that `segment_ids[j] == i`.
+
+If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
+ `output[i] = numeric_limits<T>::min()`.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77a96d1e03d577ca0f6dfd69c51d2551d1ad4b2a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "UnsortedSegmentSum"
+  in_arg {
+    name: "segment_ids"
+    description: <<END
+A tensor whose shape is a prefix of `data.shape`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Has same shape as data, except for the first `segment_ids.rank`
+dimensions, which are replaced with a single dimension which has size
+`num_segments`.
+END
+  }
+  summary: "Computes the sum along segments of a tensor."
+  description: <<END
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+Computes a tensor such that
+`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+need not be sorted and need not cover all values in the full
+range of valid values.
+
+If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+If the given segment ID `i` is negative, the value is dropped and will not be
+added to the sum of the segment.
+
+`num_segments` should equal the number of distinct segment IDs.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+</div>
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Unstage.pbtxt b/tensorflow/core/api_def/base_api/api_def_Unstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e18658430b57ca868b0ec58db35542a3f77993a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Unstage.pbtxt
@@ -0,0 +1,8 @@
+op {
+  graph_op_name: "Unstage"
+  summary: "Op is similar to a lightweight Dequeue."
+  description: <<END
+The basic functionality is similar to dequeue with many fewer
+capabilities and options.  This Op is optimized for performance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_V.pbtxt b/tensorflow/core/api_def/base_api/api_def_V.pbtxt
deleted file mode 100644
index 31cc147900957b6e0afe9a7075fc701b25eba342..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_V.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-op {
-  graph_op_name: "Variable"
-  endpoint {
-    name: "Variable"
-  }
-  summary: "Use VariableV2 instead."
-}
-op {
-  graph_op_name: "VariableV2"
-  endpoint {
-    name: "VariableV2"
-  }
-  summary: "Holds state in the form of a tensor that persists across steps."
-  description: <<END
-Outputs a ref to the tensor state so it may be read or modified.
-TODO(zhifengc/mrry): Adds a pointer to a more detail document
-about sharing states in tensorflow.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_VarHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_VarHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a4caa06bdb2f9f92ed43f4d4658c7101e622885
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_VarHandleOp.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "VarHandleOp"
+  attr {
+    name: "container"
+    description: <<END
+the container this variable is placed in.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+the name by which this variable is referred to.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+the type of this variable. Must agree with the dtypes
+of all ops using this variable.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The (possibly partially specified) shape of this variable.
+END
+  }
+  summary: "Creates a handle to a Variable resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_VarIsInitializedOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_VarIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9c4cfd0b9549d0de3f457b035dd752b6b9d33e8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_VarIsInitializedOp.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "VarIsInitializedOp"
+  in_arg {
+    name: "resource"
+    description: <<END
+the input resource handle.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+a scalar boolean which is true if the variable has been
+initialized.
+END
+  }
+  summary: "Checks whether a resource handle-based variable has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Variable.pbtxt b/tensorflow/core/api_def/base_api/api_def_Variable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..112ab6549f47815bd0c1cc947c48f5d8b56f3b6f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Variable.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "Variable"
+  visibility: SKIP
+  summary: "Use VariableV2 instead."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_VariableShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_VariableShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..adc4bf08fa9a44488399aeea64d98441ff630822
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_VariableShape.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "VariableShape"
+  summary: "Returns the shape of the variable pointed to by `resource`."
+  description: <<END
+This operation returns a 1-D integer tensor representing the shape of `input`.
+
+For example:
+
+```
+# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+shape(t) ==> [2, 2, 3]
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_VariableV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_VariableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6341cc69f61dd7fd127bdfdcfde836fce4fcd443
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_VariableV2.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "VariableV2"
+  endpoint {
+    name: "Variable"
+  }
+  out_arg {
+    name: "ref"
+    description: <<END
+A reference to the variable tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the variable tensor.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the variable tensor.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this variable is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this variable is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "Holds state in the form of a tensor that persists across steps."
+  description: <<END
+Outputs a ref to the tensor state so it may be read or modified.
+TODO(zhifengc/mrry): Adds a pointer to a more detail document
+about sharing states in tensorflow.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_W.pbtxt b/tensorflow/core/api_def/base_api/api_def_Where.pbtxt
similarity index 51%
rename from tensorflow/core/api_def/base_api/api_def_W.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_Where.pbtxt
index 9120fe334edbdcf720f5ac421048479fc4d23d93..a6ea62c4ccc0701ecd29209cb706fa08698e8ab6 100644
--- a/tensorflow/core/api_def/base_api/api_def_W.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Where.pbtxt
@@ -1,9 +1,10 @@
 op {
   graph_op_name: "Where"
-  endpoint {
-    name: "Where"
+  in_arg {
+    name: "input"
+    rename_to: "condition"
   }
-  summary: "Returns locations of true values in a boolean tensor."
+  summary: "Returns locations of nonzero / true values in a tensor."
   description: <<END
 This operation returns the coordinates of true elements in `input`. The
 coordinates are returned in a 2-D tensor where the first dimension (rows)
@@ -30,6 +31,34 @@ where(input) ==> [[0, 0],
 #                     [False, True]]]
 # 'input' has 5 true values, so output has 5 coordinates.
 # 'input' has rank of 3, so coordinates have three indices.
+where(input) ==> [[0, 0, 0],
+                  [0, 1, 0],
+                  [1, 0, 1],
+                  [1, 1, 1],
+                  [2, 1, 1]]
+
+# `input` tensor is [[[1.5,  0.0]
+#                     [-0.5, 0.0]]
+#                    [[0.0,  0.25]
+#                     [0.0,  0.75]]
+#                    [[0.0,  0.0]
+#                     [0.0,  0.01]]]
+# 'input' has 5 nonzero values, so output has 5 coordinates.
+# 'input' has rank of 3, so coordinates have three indices.
+where(input) ==> [[0, 0, 0],
+                  [0, 1, 0],
+                  [1, 0, 1],
+                  [1, 1, 1],
+                  [2, 1, 1]]
+
+# `input` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+#                     [0.0 + 0.5j, 0.0  + 0.0j]]
+#                    [[0.0 + 0.0j, 0.25 + 1.5j]
+#                     [0.0 + 0.0j, 0.75 + 0.0j]]
+#                    [[0.0 + 0.0j, 0.0  + 0.0j]
+#                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+# 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+# 'input' has rank of 3, so coordinates have three indices.
 where(input) ==> [[0, 0, 0],
                   [0, 1, 0],
                   [1, 0, 1],
@@ -38,35 +67,3 @@ where(input) ==> [[0, 0, 0],
 ```
 END
 }
-op {
-  graph_op_name: "WholeFileReader"
-  endpoint {
-    name: "WholeFileReader"
-  }
-  summary: "A Reader that outputs the entire contents of a file as a value."
-  description: <<END
-To use, enqueue filenames in a Queue.  The output of ReaderRead will
-be a filename (key) and the contents of that file (value).
-END
-}
-op {
-  graph_op_name: "WholeFileReaderV2"
-  endpoint {
-    name: "WholeFileReaderV2"
-  }
-  summary: "A Reader that outputs the entire contents of a file as a value."
-  description: <<END
-To use, enqueue filenames in a Queue.  The output of ReaderRead will
-be a filename (key) and the contents of that file (value).
-END
-}
-op {
-  graph_op_name: "WriteFile"
-  endpoint {
-    name: "WriteFile"
-  }
-  summary: "Writes contents to the file at input filename. Creates file and recursively"
-  description: <<END
-creates directory if not existing.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_WholeFileReader.pbtxt b/tensorflow/core/api_def/base_api/api_def_WholeFileReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..32180e0737f790c3e12a62ed658c74c75f5a4cdc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WholeFileReader.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "WholeFileReader"
+  visibility: SKIP
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the entire contents of a file as a value."
+  description: <<END
+To use, enqueue filenames in a Queue.  The output of ReaderRead will
+be a filename (key) and the contents of that file (value).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WholeFileReaderV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_WholeFileReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9063f9588964fa21a3902981e4a7d53c2a9a0f9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WholeFileReaderV2.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "WholeFileReaderV2"
+  endpoint {
+    name: "WholeFileReader"
+  }
+  out_arg {
+    name: "reader_handle"
+    description: <<END
+The handle to reference the Reader.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+If non-empty, this reader is placed in the given container.
+Otherwise, a default container is used.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+If non-empty, this reader is named in the given bucket
+with this shared_name. Otherwise, the node name is used instead.
+END
+  }
+  summary: "A Reader that outputs the entire contents of a file as a value."
+  description: <<END
+To use, enqueue filenames in a Queue.  The output of ReaderRead will
+be a filename (key) and the contents of that file (value).
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WriteFile.pbtxt b/tensorflow/core/api_def/base_api/api_def_WriteFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28b09c9bf12f31b2e6dff36ec146b3abfb21359f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WriteFile.pbtxt
@@ -0,0 +1,19 @@
+op {
+  graph_op_name: "WriteFile"
+  in_arg {
+    name: "filename"
+    description: <<END
+scalar. The name of the file to which we write the contents.
+END
+  }
+  in_arg {
+    name: "contents"
+    description: <<END
+scalar. The content to be written to the output file.
+END
+  }
+  summary: "Writes contents to the file at input filename. Creates file and recursively"
+  description: <<END
+creates directory if not existing.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Z.pbtxt b/tensorflow/core/api_def/base_api/api_def_Z.pbtxt
deleted file mode 100644
index f83fef054c6ab1e9270b3bf1d5d7143052954152..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_Z.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-op {
-  graph_op_name: "ZerosLike"
-  endpoint {
-    name: "ZerosLike"
-  }
-  summary: "Returns a tensor of zeros with the same shape and type as x."
-}
-op {
-  graph_op_name: "Zeta"
-  endpoint {
-    name: "Zeta"
-  }
-  summary: "Compute the Hurwitz zeta function \\\\(\\zeta(x, q)\\\\)."
-  description: <<END
-The Hurwitz zeta function is defined as:
-
-
-\\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-END
-}
-op {
-  graph_op_name: "ZipDataset"
-  endpoint {
-    name: "ZipDataset"
-  }
-  summary: "Creates a dataset that zips together `input_datasets`."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ZerosLike.pbtxt b/tensorflow/core/api_def/base_api/api_def_ZerosLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37c2d5b53405902b2f834facc57f8eeeab87010a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ZerosLike.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "ZerosLike"
+  in_arg {
+    name: "x"
+    description: <<END
+a tensor of type T.
+END
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+a tensor of the same shape and type as x but filled with zeros.
+END
+  }
+  summary: "Returns a tensor of zeros with the same shape and type as x."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Zeta.pbtxt b/tensorflow/core/api_def/base_api/api_def_Zeta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c02860a16ab4edceb03cd0f00e621b6950574ef5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Zeta.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "Zeta"
+  summary: "Compute the Hurwitz zeta function \\\\(\\zeta(x, q)\\\\)."
+  description: <<END
+The Hurwitz zeta function is defined as:
+
+
+\\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7495693ccc50fede4a359d13aa710a1fd2fd9402
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ZipDataset"
+  summary: "Creates a dataset that zips together `input_datasets`."
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_A.pbtxt b/tensorflow/core/api_def/python_api/api_def_A.pbtxt
deleted file mode 100644
index df9b3ad0b69235eaf22c1b84b624e4037084547d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_A.pbtxt
+++ /dev/null
@@ -1,56 +0,0 @@
-op {
-  graph_op_name: "Abs"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AddManySparseToTensorsMap"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AddN"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AddSparseToTensorsMap"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AdjustContrastv2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "All"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AllCandidateSampler"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Any"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Assert"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AudioSummary"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AudioSummaryV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AvgPool"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AvgPool3DGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "AvgPoolGrad"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Abs.pbtxt b/tensorflow/core/api_def/python_api/api_def_Abs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f21fae28bdf579eba47f4b530032b21652a8bc0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Abs.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Abs"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AccumulateNV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_AccumulateNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a92ff5a4065f562b848d1508fde7efec2d31b34d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AccumulateNV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AccumulateNV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AddManySparseToTensorsMap.pbtxt b/tensorflow/core/api_def/python_api/api_def_AddManySparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ece23fd65d200b12695dd7411dec74b7d3947fb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AddManySparseToTensorsMap.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AddManySparseToTensorsMap"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AddN.pbtxt b/tensorflow/core/api_def/python_api/api_def_AddN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cc22ad4dc50af8f9d298b95806bc2e5feee5285
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AddN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AddN"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AddSparseToTensorsMap.pbtxt b/tensorflow/core/api_def/python_api/api_def_AddSparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4446bba28032ea74fce9ada48052fd021534eb1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AddSparseToTensorsMap.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AddSparseToTensorsMap"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AddV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_AddV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77c879c6b3684c18df119cc36519c48bc6af56b4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AddV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AddV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AdjustContrastv2.pbtxt b/tensorflow/core/api_def/python_api/api_def_AdjustContrastv2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..889d14740697e84846d4380a3fcaec400a03bccc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AdjustContrastv2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustContrastv2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_All.pbtxt b/tensorflow/core/api_def/python_api/api_def_All.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca780f037fb6d26d89ca5b5d7b110936567590ee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_All.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "All"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AllCandidateSampler.pbtxt b/tensorflow/core/api_def/python_api/api_def_AllCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..200ae0ae490ce4e88aa314a246377c5206f544d8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AllCandidateSampler.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AllCandidateSampler"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Any.pbtxt b/tensorflow/core/api_def/python_api/api_def_Any.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4afa8acecbeafb6734631f2f4d9ef4927cb2a930
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Any.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Any"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Assert.pbtxt b/tensorflow/core/api_def/python_api/api_def_Assert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12e27ee0bc430e8e33ecba88f6e5600626918963
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Assert.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Assert"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AudioSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_AudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94da1e06ea9a03b6ebb90e1ad03e9110b95e4aee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AudioSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AudioSummaryV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_AudioSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1715576d0924456170c27e8e46cb80d0d76471e5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AudioSummaryV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AudioSummaryV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_AvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c58d6c6039b56d7528c00e573e6ae2f4113b03b3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AvgPool.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AvgPool"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AvgPool3DGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_AvgPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e4049faf497977f6bb35b9df8fd2aec24484f34
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AvgPool3DGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AvgPool3DGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_B.pbtxt b/tensorflow/core/api_def/python_api/api_def_B.pbtxt
deleted file mode 100644
index 49c74ccad28f8e1ecc12b5ad0ce6a054670da36a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_B.pbtxt
+++ /dev/null
@@ -1,142 +0,0 @@
-op {
-  graph_op_name: "Barrier"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BarrierClose"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BarrierIncompleteSize"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BarrierInsertMany"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BarrierReadySize"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BarrierTakeMany"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchCholesky"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchCholeskyGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchFFT"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchFFT2D"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchFFT3D"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchIFFT"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchIFFT2D"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchIFFT3D"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchMatMul"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchMatrixDeterminant"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchMatrixInverse"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchMatrixSolve"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchMatrixSolveLs"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchMatrixTriangularSolve"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchNormWithGlobalNormalization"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchNormWithGlobalNormalizationGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchSelfAdjointEig"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchSelfAdjointEigV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchSvd"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BatchToSpace"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BiasAdd"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BiasAddV1"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BitwiseAnd"
-  endpoint {
-    name: "bitwise.bitwise_and"
-  }
-}
-op {
-  graph_op_name: "BitwiseOr"
-  endpoint {
-    name: "bitwise.bitwise_or"
-  }
-}
-op {
-  graph_op_name: "BitwiseXor"
-  endpoint {
-    name: "bitwise.bitwise_xor"
-  }
-}
-op {
-  graph_op_name: "BroadcastArgs"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "BroadcastGradientArgs"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Bucketize"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Barrier.pbtxt b/tensorflow/core/api_def/python_api/api_def_Barrier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6463fcf618eea63b43c04c4c6125bb6e4cc0712
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Barrier.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Barrier"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BarrierClose.pbtxt b/tensorflow/core/api_def/python_api/api_def_BarrierClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d903a2e29ea2e82fb1b6dbc3f95b9496656de4a1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BarrierClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BarrierClose"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BarrierIncompleteSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_BarrierIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9c5a8e7fea1160ed573809fbc861d18320238b1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BarrierIncompleteSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BarrierIncompleteSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BarrierInsertMany.pbtxt b/tensorflow/core/api_def/python_api/api_def_BarrierInsertMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c7b060d41f6529699eff4158a7ddedc40d42365
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BarrierInsertMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BarrierInsertMany"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BarrierReadySize.pbtxt b/tensorflow/core/api_def/python_api/api_def_BarrierReadySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07729e0704a586af0b3220d480c7ff1fe07f4937
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BarrierReadySize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BarrierReadySize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BarrierTakeMany.pbtxt b/tensorflow/core/api_def/python_api/api_def_BarrierTakeMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de6448e3fe9a56729753848cef8a1f42e0c8cc86
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BarrierTakeMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BarrierTakeMany"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchCholesky.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchCholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83241f8e8bdf37f5211b44338aadb69110dc58e6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchCholesky.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchCholesky"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchCholeskyGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchCholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60ddfd7a26955803cd07484de6dd10045b6f572d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchCholeskyGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchCholeskyGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f735280687df93c09f7347ccf5ff147171ade9b0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchFFT.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchFFT"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7520e86d4944daf9d97ca0d35e234bcea0da1af
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchFFT2D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchFFT2D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27bc32046b04809a5d9e22c42478dcf02239bad8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchFFT3D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchFFT3D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchIFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchIFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f3bb2ba5d83951b450fea4973e1e662d035b67e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchIFFT.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchIFFT"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchIFFT2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchIFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b94492459554abf799dc95328e9f5b29b0a149a5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchIFFT2D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchIFFT2D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchIFFT3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchIFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13cccda1d2f6afae54d1263050d34cd0e280231a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchIFFT3D.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchIFFT3D"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3db197c2670341a438254b4c766adf529376c98
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixDeterminant.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..202b0d149b1dc26fa2b5ba356d451367de549127
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixDeterminant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixDeterminant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixInverse.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3fa68bdd3e085e4929e5f82a6fd1af1802e83b7d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixInverse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixInverse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a458423e38def986f4ccd62db967341051b3174c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixSolve.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixSolve"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixSolveLs.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61b4ca3999b034a41d7ab88dd41ed8276252034a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixSolveLs.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixSolveLs"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchMatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchMatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28e674259577ef63859098165f448bdfd43c14ee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchMatrixTriangularSolve.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchMatrixTriangularSolve"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7a042bc617a3e4cd610045c0fc77e0b3bd7bd5b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalization"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e92f3a30f401b2a0569b043b81e70c762469f628
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalizationGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchSelfAdjointEig.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchSelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26fef1c4b43d32c8e91ab607191833c052a3805d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchSelfAdjointEig.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchSelfAdjointEig"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchSelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchSelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..660523a8c458c203dc3215e101fc0409076d88a7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchSelfAdjointEigV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchSelfAdjointEigV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchSvd.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchSvd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..927f5483a9707da09233b6909fcfcda0f66721ad
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchSvd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchSvd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchToSpace.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c106bb1367e74eabad03e489238098c6780c7d21
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BatchToSpace.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchToSpace"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BiasAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_BiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2397ac0ac7717075f817ef83e78a7d5700279ca
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BiasAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BiasAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BiasAddV1.pbtxt b/tensorflow/core/api_def/python_api/api_def_BiasAddV1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..93dcabecb8d0e709860f77924720763d5d33c807
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BiasAddV1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BiasAddV1"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BitwiseAnd.pbtxt b/tensorflow/core/api_def/python_api/api_def_BitwiseAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..288a3f5fc201a663efe2c651c465205739c13b94
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BitwiseAnd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseAnd"
+  endpoint {
+    name: "bitwise.bitwise_and"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BitwiseOr.pbtxt b/tensorflow/core/api_def/python_api/api_def_BitwiseOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..150dbf6bfd8ae53649654589a81b3c99603b9b02
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BitwiseOr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseOr"
+  endpoint {
+    name: "bitwise.bitwise_or"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BitwiseXor.pbtxt b/tensorflow/core/api_def/python_api/api_def_BitwiseXor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f7c6fb5fceaea12d97409fc9ebe65825ea3ea3d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BitwiseXor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseXor"
+  endpoint {
+    name: "bitwise.bitwise_xor"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BroadcastArgs.pbtxt b/tensorflow/core/api_def/python_api/api_def_BroadcastArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5933fdfea1a389fc78b50e9f78153188d9fe5d92
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BroadcastArgs.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BroadcastArgs"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Bucketize.pbtxt b/tensorflow/core/api_def/python_api/api_def_Bucketize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49fbe175aeb3a09f04f982c31f584e58c689c59c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Bucketize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Bucketize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_C.pbtxt b/tensorflow/core/api_def/python_api/api_def_C.pbtxt
deleted file mode 100644
index 42ed24b1336efb59d835c87980f032adde59344a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_C.pbtxt
+++ /dev/null
@@ -1,59 +0,0 @@
-op {
-  graph_op_name: "CTCBeamSearchDecoder"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "CTCGreedyDecoder"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "CTCLoss"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Cholesky"
-  endpoint {
-    name: "cholesky"
-  }
-  endpoint {
-    name: "linalg.cholesky"
-  }
-}
-op {
-  graph_op_name: "Complex"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ComplexAbs"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ComputeAccidentalHits"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Concat"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ConcatOffset"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ConcatV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Conj"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Const"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "CropAndResize"
-  endpoint {
-    name: "image.crop_and_resize"
-  }
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_CTCBeamSearchDecoder.pbtxt b/tensorflow/core/api_def/python_api/api_def_CTCBeamSearchDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4cc4ad05aa6497542965fd2dd095e3bf457b7b8a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CTCBeamSearchDecoder.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CTCBeamSearchDecoder"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CTCGreedyDecoder.pbtxt b/tensorflow/core/api_def/python_api/api_def_CTCGreedyDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4b540add1f541737086177784f06fa05429594aa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CTCGreedyDecoder.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CTCGreedyDecoder"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CTCLoss.pbtxt b/tensorflow/core/api_def/python_api/api_def_CTCLoss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0c6bcd3949a378e6045e17920b9fe10189b6164
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CTCLoss.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CTCLoss"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt b/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2676c92bfbebeab6eac3f4052c0394e5bda1a767
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Cholesky.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "Cholesky"
+  endpoint {
+    name: "cholesky"
+  }
+  endpoint {
+    name: "linalg.cholesky"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Complex.pbtxt b/tensorflow/core/api_def/python_api/api_def_Complex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9ec8059f79a8bcff547e87a0e59af0f2e355d40
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Complex.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Complex"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ComplexAbs.pbtxt b/tensorflow/core/api_def/python_api/api_def_ComplexAbs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77a8a44872ff442a2336e9d2ea194588dcb15ed0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ComplexAbs.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ComplexAbs"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ComputeAccidentalHits.pbtxt b/tensorflow/core/api_def/python_api/api_def_ComputeAccidentalHits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..744949639c8e5e3d90e453c96f5530bbb08e68e0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ComputeAccidentalHits.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ComputeAccidentalHits"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Concat.pbtxt b/tensorflow/core/api_def/python_api/api_def_Concat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..503e87cd6c60a436b02b230f3e73adca63fc44e7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Concat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Concat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConcatOffset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConcatOffset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1bcb77e00fe5845107441cdd486097694584dd4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ConcatOffset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConcatOffset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConcatV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d5b5321fdc7759e83d7e5bb9574aabb014c6650e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ConcatV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConcatV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conj.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conj.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c36b1f7fadc45ef9f65b7ce9127749ef093ac7f6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Conj.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conj"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ConjugateTranspose.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConjugateTranspose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a8de53e735039daa6836767b166d781e81841e3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ConjugateTranspose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ConjugateTranspose"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Const.pbtxt b/tensorflow/core/api_def/python_api/api_def_Const.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95d162ac41539a588fee13199f66009e6c276a10
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Const.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Const"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce65f8172ddfea2ae08750cf37bba8e3e012f5f5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResize"
+  endpoint {
+    name: "image.crop_and_resize"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_D.pbtxt b/tensorflow/core/api_def/python_api/api_def_D.pbtxt
deleted file mode 100644
index c73982aed0cd718f65645f248e6dd16115d948c5..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_D.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-op {
-  graph_op_name: "DebugGradientIdentity"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "DecodeAndCropJpeg"
-  endpoint {
-    name: "image.decode_and_crop_jpeg"
-  }
-}
-op {
-  graph_op_name: "DecodeBmp"
-  endpoint {
-    name: "image.decode_bmp"
-  }
-}
-op {
-  graph_op_name: "DecodeCSV"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "DecodeGif"
-  endpoint {
-    name: "image.decode_gif"
-  }
-}
-op {
-  graph_op_name: "DecodeJpeg"
-  endpoint {
-    name: "image.decode_jpeg"
-  }
-}
-op {
-  graph_op_name: "DecodePng"
-  endpoint {
-    name: "image.decode_png"
-  }
-}
-op {
-  graph_op_name: "DeleteSessionTensor"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "DepthwiseConv2dNative"
-  endpoint {
-    name: "nn.depthwise_conv2d_native"
-  }
-}
-op {
-  graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
-  endpoint {
-    name: "nn.depthwise_conv2d_native_backprop_filter"
-  }
-}
-op {
-  graph_op_name: "DepthwiseConv2dNativeBackpropInput"
-  endpoint {
-    name: "nn.depthwise_conv2d_native_backprop_input"
-  }
-}
-op {
-  graph_op_name: "DeserializeManySparse"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "DestroyTemporaryVariable"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "DrawBoundingBoxes"
-  endpoint {
-    name: "image.draw_bounding_boxes"
-  }
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DebugGradientIdentity.pbtxt b/tensorflow/core/api_def/python_api/api_def_DebugGradientIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d50c5c8687a2cb3f550a04654fc6f0d7ec86a89
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DebugGradientIdentity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DebugGradientIdentity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbe9c882538776abb35b7c654ede0fffbfaa078c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeAndCropJpeg"
+  endpoint {
+    name: "image.decode_and_crop_jpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..573d83f3739a86d00550c519cb19aef452813927
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeBmp"
+  endpoint {
+    name: "image.decode_bmp"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeCSV.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeCSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21ef77e381b1975a1865a4096853a08baf04793e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeCSV.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DecodeCSV"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eed64df79cf7837c1cc0580dd2cb0f06acf289cc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeGif"
+  endpoint {
+    name: "image.decode_gif"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..994bc4e1f4fd1707579ac2bda4fae5ed327430ab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeJpeg"
+  endpoint {
+    name: "image.decode_jpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..309eec5ac368297563af7e6e752921fd270186ef
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodePng"
+  endpoint {
+    name: "image.decode_png"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeleteSessionTensor.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeleteSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08bf4a80ec64bb3a3eb5312502d68576bee18ce1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeleteSessionTensor.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeleteSessionTensor"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bb17e548d1cd0ca77d6415b7fa165b1a6b7cae3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNative"
+  endpoint {
+    name: "nn.depthwise_conv2d_native"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f9df4b1a11459c252f2961fb1caacaad64021ae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
+  endpoint {
+    name: "nn.depthwise_conv2d_native_backprop_filter"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0bd72539e932f597e86f63ef52519652f0e8efd7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropInput"
+  endpoint {
+    name: "nn.depthwise_conv2d_native_backprop_input"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeserializeManySparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeserializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd43a05577905bc9918a56ac40a701f20ec953c8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeserializeManySparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeserializeManySparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeserializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d0679907809ebc9f7762b2fdb4b1184d21259e3c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeserializeSparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeserializeSparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DestroyTemporaryVariable.pbtxt b/tensorflow/core/api_def/python_api/api_def_DestroyTemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e51a25a2c022b5d6289d2a8b6b79097f4d6b77a5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DestroyTemporaryVariable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DestroyTemporaryVariable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DrawBoundingBoxes.pbtxt b/tensorflow/core/api_def/python_api/api_def_DrawBoundingBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54d644c013b123ef862ea6b9fe04d59f9fa59499
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DrawBoundingBoxes.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DrawBoundingBoxes"
+  endpoint {
+    name: "image.draw_bounding_boxes"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_E.pbtxt b/tensorflow/core/api_def/python_api/api_def_E.pbtxt
deleted file mode 100644
index 236c344167a825a3476bb2a51534eee19bc3d138..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_E.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-op {
-  graph_op_name: "EditDistance"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Elu"
-  endpoint {
-    name: "nn.elu"
-  }
-}
-op {
-  graph_op_name: "EluGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "EncodeJpeg"
-  endpoint {
-    name: "image.encode_jpeg"
-  }
-}
-op {
-  graph_op_name: "EncodePng"
-  endpoint {
-    name: "image.encode_png"
-  }
-}
-op {
-  graph_op_name: "Exit"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ExpandDims"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ExtractGlimpse"
-  endpoint {
-    name: "image.extract_glimpse"
-  }
-}
-op {
-  graph_op_name: "ExtractJpegShape"
-  endpoint {
-    name: "image.extract_jpeg_shape"
-  }
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee0f95dacbc09702039da97fccd98a2d8bb83b1b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EditDistance.pbtxt b/tensorflow/core/api_def/python_api/api_def_EditDistance.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c77accf370719c9781c950c897cb9e9186bcacc5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EditDistance.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EditDistance"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Elu.pbtxt b/tensorflow/core/api_def/python_api/api_def_Elu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15a9f6568f373cd6e8a44677c7f4ae557f85bc8f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Elu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Elu"
+  endpoint {
+    name: "nn.elu"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c31e9d0f32e6e13ba7d87d8a234e238c048a8b9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeJpeg"
+  endpoint {
+    name: "image.encode_jpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodePng.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42717ba7d56abc2aa75c208b4a9ad51086ad1381
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EncodePng.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodePng"
+  endpoint {
+    name: "image.encode_png"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ExpandDims.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExpandDims.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..29979dbf0a5110bce226dbe5ed2ae31f543ea708
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ExpandDims.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExpandDims"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ed8abdfcd7f3171d431adf07d47eb3bfc60d1e8f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractGlimpse"
+  endpoint {
+    name: "image.extract_glimpse"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6849a6d3fa5f37b0d4f92829c8b07754b922a319
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractJpegShape"
+  endpoint {
+    name: "image.extract_jpeg_shape"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_F.pbtxt b/tensorflow/core/api_def/python_api/api_def_F.pbtxt
deleted file mode 100644
index a29b6a372513b8e463563212291d655c2e501615..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_F.pbtxt
+++ /dev/null
@@ -1,73 +0,0 @@
-op {
-  graph_op_name: "FFT"
-  endpoint {
-    name: "fft"
-  }
-  endpoint {
-    name: "spectral.fft"
-  }
-}
-op {
-  graph_op_name: "FIFOQueue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FIFOQueueV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Fact"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FakeQueue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FixedLengthRecordReader"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FixedLengthRecordReaderV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FixedUnigramCandidateSampler"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FloorDiv"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FloorMod"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FractionalAvgPool"
-  endpoint {
-    name: "nn.fractional_avg_pool"
-  }
-}
-op {
-  graph_op_name: "FractionalAvgPoolGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FractionalMaxPool"
-  endpoint {
-    name: "nn.fractional_max_pool"
-  }
-}
-op {
-  graph_op_name: "FractionalMaxPoolGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FusedBatchNorm"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "FusedBatchNormV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3bcab994151c012719d423c4031ad6699cd5a717
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FFT.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "FFT"
+  endpoint {
+    name: "fft"
+  }
+  endpoint {
+    name: "spectral.fft"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FIFOQueue.pbtxt b/tensorflow/core/api_def/python_api/api_def_FIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b51063b2cfef6372822babb300f8412ebf1e94e5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FIFOQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FIFOQueue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FIFOQueueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..850fe5b89985cb7c2fe1560339f18899e43b1fa2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FIFOQueueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FIFOQueueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Fact.pbtxt b/tensorflow/core/api_def/python_api/api_def_Fact.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a8328bb8431abc41a154a9af505fa71ee92fe68
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Fact.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Fact"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeQueue.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c5cc7116b85fbf8f4d21ea5a9595391aeefcedd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FakeQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FakeQueue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordReader.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da211a3bfcf09d4e3ca13911c35cdb27170b65db
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordReader"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordReaderV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4606991e9905837cf8100f6dfd56119075888fa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordReaderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordReaderV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca70db18acd8b561bd3fbed72e5cfb873eceea2d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FixedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedUnigramCandidateSampler"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26598ab1fb918e251d4c4da7b14810ebf4c44779
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FloorDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef562e93a0dee0a3f24716719cb24232302626dc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FloorMod"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..16ed9b56f2b662b6cca44f5c955e579c2f9d7971
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalAvgPool"
+  endpoint {
+    name: "nn.fractional_avg_pool"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..695559520805abd02e0575f7f85937d00f0dc5fd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalMaxPool"
+  endpoint {
+    name: "nn.fractional_max_pool"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FusedBatchNorm.pbtxt b/tensorflow/core/api_def/python_api/api_def_FusedBatchNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ac0fe72522c109af75a6595858013c8f89701c6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FusedBatchNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNorm"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FusedBatchNormV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FusedBatchNormV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70a79c906ed5356fc0e0261c7ebab998bcd2e602
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FusedBatchNormV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNormV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_G.pbtxt b/tensorflow/core/api_def/python_api/api_def_G.pbtxt
deleted file mode 100644
index 8235d245feb5403600532cfd05456e256b3faf0d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_G.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-op {
-  graph_op_name: "GenerateVocabRemapping"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "GetSessionHandle"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "GetSessionHandleV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "GetSessionTensor"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_GenerateVocabRemapping.pbtxt b/tensorflow/core/api_def/python_api/api_def_GenerateVocabRemapping.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35f09938512eb0f48cb1fc4cbaf3d94f83477279
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GenerateVocabRemapping.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GenerateVocabRemapping"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_GetSessionHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_GetSessionHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18396a1277099cd6de1926958ccdb3f51b0329ec
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GetSessionHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GetSessionHandle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_GetSessionHandleV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_GetSessionHandleV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39cbcca12238663dad37fa2f220a806083732da6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GetSessionHandleV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GetSessionHandleV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_GetSessionTensor.pbtxt b/tensorflow/core/api_def/python_api/api_def_GetSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ef75ed34d9165d4af35ef17ad6f6b3c3a06a156
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_GetSessionTensor.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GetSessionTensor"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_H.pbtxt b/tensorflow/core/api_def/python_api/api_def_H.pbtxt
deleted file mode 100644
index 9f3fe2eb08a384e5e74018b8089d8fc6293deb03..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_H.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-op {
-  graph_op_name: "HSVToRGB"
-  endpoint {
-    name: "image.hsv_to_rgb"
-  }
-}
-op {
-  graph_op_name: "HashTable"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "HashTableV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "HistogramSummary"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_HSVToRGB.pbtxt b/tensorflow/core/api_def/python_api/api_def_HSVToRGB.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..55998189f47e1fc211975ed45575d23f0a1aca5b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_HSVToRGB.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HSVToRGB"
+  endpoint {
+    name: "image.hsv_to_rgb"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_HashTable.pbtxt b/tensorflow/core/api_def/python_api/api_def_HashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1c5b2a4dc758c7b1d38b93a7cc4e682a7518ceb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_HashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "HashTable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_HashTableV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_HashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d702d4d0de84c26d7d2ba229286c9c69ef253be1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_HashTableV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "HashTableV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_HistogramFixedWidth.pbtxt b/tensorflow/core/api_def/python_api/api_def_HistogramFixedWidth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cb4d9192e4e671c0a270afb0405b381490eee87
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_HistogramFixedWidth.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "HistogramFixedWidth"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_HistogramSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_HistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..644807d16f40001552b1c41fa0f9bbd35dfe3a1c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_HistogramSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "HistogramSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_I.pbtxt b/tensorflow/core/api_def/python_api/api_def_I.pbtxt
deleted file mode 100644
index db6a54dbd43030c433c5716cefe2fe410694031a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_I.pbtxt
+++ /dev/null
@@ -1,55 +0,0 @@
-op {
-  graph_op_name: "IFFT"
-  endpoint {
-    name: "ifft"
-  }
-  endpoint {
-    name: "spectral.ifft"
-  }
-}
-op {
-  graph_op_name: "IdentityReader"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "IdentityReaderV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ImageSummary"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InTopK"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InTopKV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InitializeTable"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InitializeTableFromTextFile"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InitializeTableFromTextFileV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InitializeTableV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "InvGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Invert"
-  endpoint {
-    name: "bitwise.invert"
-  }
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6bbc4ed7207faefc74031a11949bbdafc59c9236
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IFFT.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "IFFT"
+  endpoint {
+    name: "ifft"
+  }
+  endpoint {
+    name: "spectral.ifft"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IdentityReader.pbtxt b/tensorflow/core/api_def/python_api/api_def_IdentityReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a337f152090235b00f3b280843e350ad025a018
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IdentityReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IdentityReader"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_IdentityReaderV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_IdentityReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..efafd76c712e3d01c2eb8c49478f36af115e1392
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_IdentityReaderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IdentityReaderV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_ImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..521c8850232ede2ef5a57299cf7cb7da0c403220
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ImageSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ImageSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InTopK.pbtxt b/tensorflow/core/api_def/python_api/api_def_InTopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..357b9df14b8ed66116a8a217286c6d1b90f3d681
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InTopK.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InTopK"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InTopKV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_InTopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0a1c9e8314c5d53377d70b97019a6149a7fc4a1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InTopKV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InTopKV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InitializeTable.pbtxt b/tensorflow/core/api_def/python_api/api_def_InitializeTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..068030c755ea894e619c286e4b5c0184f3bb6f60
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InitializeTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InitializeTableFromTextFile.pbtxt b/tensorflow/core/api_def/python_api/api_def_InitializeTableFromTextFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd0e586976e391c831460400154dd54f8de11b62
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InitializeTableFromTextFile.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTableFromTextFile"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InitializeTableFromTextFileV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_InitializeTableFromTextFileV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..659642056d7ab36c89b125117aad6991a09047f2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InitializeTableFromTextFileV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTableFromTextFileV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InitializeTableV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_InitializeTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee73655258fe41b7de617b0f8e5c314690ceee49
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InitializeTableV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTableV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Invert.pbtxt b/tensorflow/core/api_def/python_api/api_def_Invert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a41d05a3c937bdc8fc24f28ef9d5c37535337d4e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Invert.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Invert"
+  endpoint {
+    name: "bitwise.invert"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_L.pbtxt b/tensorflow/core/api_def/python_api/api_def_L.pbtxt
deleted file mode 100644
index 083fbdae6f5706745ce763a23d4aaec25ca51b3c..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_L.pbtxt
+++ /dev/null
@@ -1,96 +0,0 @@
-op {
-  graph_op_name: "L2Loss"
-  endpoint {
-    name: "nn.l2_loss"
-  }
-}
-op {
-  graph_op_name: "LMDBReader"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LRN"
-  endpoint {
-    name: "nn.local_response_normalization"
-  }
-  endpoint {
-    name: "nn.lrn"
-  }
-}
-op {
-  graph_op_name: "LRNGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LearnedUnigramCandidateSampler"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LinSpace"
-  endpoint {
-    name: "lin_space"
-  }
-  endpoint {
-    name: "linspace"
-  }
-}
-op {
-  graph_op_name: "ListDiff"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LoadAndRemapMatrix"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LogMatrixDeterminant"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LogSoftmax"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LogUniformCandidateSampler"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableExport"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableExportV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableFind"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableFindV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableImport"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableImportV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableInsert"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableInsertV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableSize"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "LookupTableSizeV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_L2Loss.pbtxt b/tensorflow/core/api_def/python_api/api_def_L2Loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de994e7f0a7a87e647657a87c54130cc8abda33c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_L2Loss.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "L2Loss"
+  endpoint {
+    name: "nn.l2_loss"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LMDBReader.pbtxt b/tensorflow/core/api_def/python_api/api_def_LMDBReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..63e261f6de92e2c0fe45982cddc0acae7dc8e310
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LMDBReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LMDBReader"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LRN.pbtxt b/tensorflow/core/api_def/python_api/api_def_LRN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6567fe33e93dda12574a1da57266c352cec5a3b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LRN.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "LRN"
+  endpoint {
+    name: "nn.local_response_normalization"
+  }
+  endpoint {
+    name: "nn.lrn"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LearnedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/python_api/api_def_LearnedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b005fe81c8ae49a8d393817f2416bdf019df9383
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LearnedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LearnedUnigramCandidateSampler"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LeftShift.pbtxt b/tensorflow/core/api_def/python_api/api_def_LeftShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf703f3897e5bc2243f9b41dfce4ed7d4c229d7e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LeftShift.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeftShift"
+  endpoint {
+    name: "bitwise.left_shift"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt b/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1de2cb207d221593b41d82d43b759e49d411710
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "LinSpace"
+  endpoint {
+    name: "lin_space"
+  }
+  endpoint {
+    name: "linspace"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ListDiff.pbtxt b/tensorflow/core/api_def/python_api/api_def_ListDiff.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6718d5bec126c27c8fe3d9204c320d178894c030
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ListDiff.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ListDiff"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LoadAndRemapMatrix.pbtxt b/tensorflow/core/api_def/python_api/api_def_LoadAndRemapMatrix.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac0f612443a5cc69c409ef4b65901b5ab97d655f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LoadAndRemapMatrix.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LoadAndRemapMatrix"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LogMatrixDeterminant.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36d1eadab4ecb5aa71ffd6bb9526b6b6592f10ee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LogMatrixDeterminant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LogMatrixDeterminant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LogSoftmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fde770eecb818fa6819ad54a64442864f393b2c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LogSoftmax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LogSoftmax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LogUniformCandidateSampler.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogUniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..276f1f576b2545c836a2c94146292edc5374e991
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LogUniformCandidateSampler.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LogUniformCandidateSampler"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableExport.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableExport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..016ad8dc60d7d45ca2328b9cc586a5e2c6dcb9fe
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableExport.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableExport"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableExportV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableExportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37e9746cccf3d0a608d590bef23138978f356448
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableExportV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableExportV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableFind.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableFind.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..739196deb9d3812072262cca51f6a4694cc3c41c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableFind.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableFind"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableFindV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableFindV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da3be6db4269266a9e8fe9f3d588ecdcde8c431d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableFindV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableFindV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableImport.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableImport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52634b6fb0eb3b0d4fac09b6c7ff2540b86b464a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableImport.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableImport"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableImportV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableImportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75a4e00473d83815e30d46c8398d614eda4a9dfb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableImportV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableImportV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableInsert.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableInsert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72dcc5fe6b604d4d813a6cb8746fc4d2c3ceec3b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableInsert.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableInsert"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableInsertV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableInsertV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..14ca6f80a580606b9a6c7fe504c008ee773b7a08
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableInsertV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableInsertV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..203b51aee441118abdc649969dd840f91bb8da1e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_LookupTableSizeV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_LookupTableSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba26ba072410e5587708e41eae621f8a6b1ca51e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_LookupTableSizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableSizeV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_M.pbtxt b/tensorflow/core/api_def/python_api/api_def_M.pbtxt
deleted file mode 100644
index c8840e0c09009992a51b4ca08fb0fc3e97868ec6..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_M.pbtxt
+++ /dev/null
@@ -1,174 +0,0 @@
-op {
-  graph_op_name: "MatMul"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MatrixBandPart"
-  endpoint {
-    name: "linalg.band_part"
-  }
-  endpoint {
-    name: "matrix_band_part"
-  }
-}
-op {
-  graph_op_name: "MatrixDeterminant"
-  endpoint {
-    name: "linalg.det"
-  }
-  endpoint {
-    name: "matrix_determinant"
-  }
-}
-op {
-  graph_op_name: "MatrixDiag"
-  endpoint {
-    name: "linalg.diag"
-  }
-  endpoint {
-    name: "matrix_diag"
-  }
-}
-op {
-  graph_op_name: "MatrixDiagPart"
-  endpoint {
-    name: "linalg.diag_part"
-  }
-  endpoint {
-    name: "matrix_diag_part"
-  }
-}
-op {
-  graph_op_name: "MatrixInverse"
-  endpoint {
-    name: "linalg.inv"
-  }
-  endpoint {
-    name: "matrix_inverse"
-  }
-}
-op {
-  graph_op_name: "MatrixSetDiag"
-  endpoint {
-    name: "linalg.set_diag"
-  }
-  endpoint {
-    name: "matrix_set_diag"
-  }
-}
-op {
-  graph_op_name: "MatrixSolve"
-  endpoint {
-    name: "linalg.solve"
-  }
-  endpoint {
-    name: "matrix_solve"
-  }
-}
-op {
-  graph_op_name: "MatrixSolveLs"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MatrixTriangularSolve"
-  endpoint {
-    name: "linalg.triangular_solve"
-  }
-  endpoint {
-    name: "matrix_triangular_solve"
-  }
-}
-op {
-  graph_op_name: "Max"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPool"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPool3DGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPool3DGradGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPoolGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPoolGradGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPoolGradGradWithArgmax"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPoolGradWithArgmax"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPoolV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MaxPoolWithArgmax"
-  endpoint {
-    name: "nn.max_pool_with_argmax"
-  }
-}
-op {
-  graph_op_name: "Mean"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Merge"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MergeSummary"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Min"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MirrorPad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MirrorPadGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Mul"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MutableDenseHashTable"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MutableDenseHashTableV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MutableHashTable"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MutableHashTableOfTensors"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MutableHashTableOfTensorsV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "MutableHashTableV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce95f857beaae60404965e64afe04e086ee9ae32
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MatMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..89b1c1f5a92995c3ef0f86c021e309d8acb91e40
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixBandPart.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixBandPart"
+  endpoint {
+    name: "linalg.band_part"
+  }
+  endpoint {
+    name: "matrix_band_part"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d289f542f3cafad4e5a3a2f2c5e8dbb43b1ccaa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixDeterminant.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixDeterminant"
+  endpoint {
+    name: "linalg.det"
+  }
+  endpoint {
+    name: "matrix_determinant"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd9d34635e1409e8885e5c243b521f352bb2f852
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixDiag.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixDiag"
+  endpoint {
+    name: "linalg.diag"
+  }
+  endpoint {
+    name: "matrix_diag"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa5d1f10af4626d0b33581ac284f68d9310cac1f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixDiagPart.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixDiagPart"
+  endpoint {
+    name: "linalg.diag_part"
+  }
+  endpoint {
+    name: "matrix_diag_part"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixExponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d215b86c7256145fa4ada58c7d2b54d418f8ac7d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixExponential.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MatrixExponential"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0ddd73704f367ef069c2b970acbefd8d655e7c6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixInverse.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixInverse"
+  endpoint {
+    name: "linalg.inv"
+  }
+  endpoint {
+    name: "matrix_inverse"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01f4f0e89d3c027e4a8c1325f457c04488532f04
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixSetDiag.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixSetDiag"
+  endpoint {
+    name: "linalg.set_diag"
+  }
+  endpoint {
+    name: "matrix_set_diag"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cef763e4e9a1d11201bdcb9a573ddf5d64841e90
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixSolve.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixSolve"
+  endpoint {
+    name: "linalg.solve"
+  }
+  endpoint {
+    name: "matrix_solve"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixSolveLs.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f981161d496070f2f15a63e805dd10c87667bdf5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixSolveLs.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MatrixSolveLs"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0d576aa31bc72c45bf5f1433f4ec3392816e52b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MatrixTriangularSolve.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "MatrixTriangularSolve"
+  endpoint {
+    name: "linalg.triangular_solve"
+  }
+  endpoint {
+    name: "matrix_triangular_solve"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Max.pbtxt b/tensorflow/core/api_def/python_api/api_def_Max.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc369ea6182a829a233a75b2af8dd628346a886a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Max.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Max"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9712911c3442f14e211d27e3087d63f3fbc0f2e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPool.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPool"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPool3DGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..315c5dfa8279189cb0dd7ad6bedfb6905aeafb91
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPool3DGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPool3DGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPool3DGradGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPool3DGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81f06ce4fb6e54fd3ef3971c9bdef5a2a7be15eb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPool3DGradGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPool3DGradGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ec9357bd299150d7f728631e86ec2fba4c2fc97
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGradGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGradWithArgmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25ec5a41212c6dac2e0b82881e304e532bf7126a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGradGradWithArgmax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0999b80d7b60b4defecb076cd3db0eec56e16583
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d8abca5f1ad76df62e78f9d7228b586dce31bf6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolWithArgmax"
+  endpoint {
+    name: "nn.max_pool_with_argmax"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Mean.pbtxt b/tensorflow/core/api_def/python_api/api_def_Mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4aa5d8582a522536365bcb1db6654a4feb695d4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Mean.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Mean"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Merge.pbtxt b/tensorflow/core/api_def/python_api/api_def_Merge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..059c3d127a64a38ed92755ee0034871ad61bd615
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Merge.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Merge"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MergeSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_MergeSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72095c5f91f7c37a0967a01ce5a8b409227a8bad
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MergeSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MergeSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Min.pbtxt b/tensorflow/core/api_def/python_api/api_def_Min.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8172b4f6df77de3d233c669b695f672c98e9083a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Min.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Min"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MirrorPad.pbtxt b/tensorflow/core/api_def/python_api/api_def_MirrorPad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67aebb8e862533397190ac2f735314da960cd2e6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MirrorPad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MirrorPad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Mul.pbtxt b/tensorflow/core/api_def/python_api/api_def_Mul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd08acd75206f6ab405edabf70181a0c57be74d9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Mul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Mul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutableDenseHashTable.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutableDenseHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c3921654e55b0dadf66f8de5cba46fbdd35276a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutableDenseHashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableDenseHashTable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutableDenseHashTableV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutableDenseHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3aa6f690966ad10b93cdb5d449950f79c1484727
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutableDenseHashTableV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableDenseHashTableV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutableHashTable.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutableHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4566b96bd49da52415ca2e92fd00db6acf76e4d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutableHashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutableHashTableOfTensors.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutableHashTableOfTensors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aad491fd453abab80976acc393052ae2c954b105
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutableHashTableOfTensors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTableOfTensors"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutableHashTableOfTensorsV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutableHashTableOfTensorsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..133ae60428ee03a8e860d4653ae4c9bbe1e1bc10
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutableHashTableOfTensorsV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTableOfTensorsV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MutableHashTableV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_MutableHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f7be9df100db6823b6851e29ae7f030a9a95a1d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MutableHashTableV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTableV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_N.pbtxt b/tensorflow/core/api_def/python_api/api_def_N.pbtxt
deleted file mode 100644
index 60da4dcafe886bafa44301cdf3c375551530fdea..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_N.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-op {
-  graph_op_name: "Neg"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "NegTrain"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "NonMaxSuppression"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "NonMaxSuppressionV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0e2bb9b950d933f2e73272b403fba2c29110b3cb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Neg"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NegTrain.pbtxt b/tensorflow/core/api_def/python_api/api_def_NegTrain.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d536b4eaa52d624526e5e82aeed240436f83385
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NegTrain.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NegTrain"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NonMaxSuppression.pbtxt b/tensorflow/core/api_def/python_api/api_def_NonMaxSuppression.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdd122dc2b1e996458e9b802a4439c91fe1ffc66
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NonMaxSuppression.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppression"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NonMaxSuppressionV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_NonMaxSuppressionV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ddbf2ec74e22db9285b73dc38b6cbfc95835edae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NonMaxSuppressionV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppressionV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_O.pbtxt b/tensorflow/core/api_def/python_api/api_def_OneHot.pbtxt
similarity index 100%
rename from tensorflow/core/api_def/python_api/api_def_O.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_OneHot.pbtxt
diff --git a/tensorflow/core/api_def/python_api/api_def_P.pbtxt b/tensorflow/core/api_def/python_api/api_def_P.pbtxt
deleted file mode 100644
index 87ca53e0b9a49e50f1937ff077b0129343ad7c42..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_P.pbtxt
+++ /dev/null
@@ -1,68 +0,0 @@
-op {
-  graph_op_name: "Pack"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Pad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PadV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PaddingFIFOQueue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PaddingFIFOQueueV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ParallelConcat"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ParameterizedTruncatedNormal"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ParseExample"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ParseSingleSequenceExample"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Placeholder"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Pow"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Print"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PriorityQueue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PriorityQueueV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Prod"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PyFunc"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "PyFuncStateless"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Pack.pbtxt b/tensorflow/core/api_def/python_api/api_def_Pack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cf7929e49a9a9fa01c217bf4f9a74aec6b924411
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Pack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Pack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/python_api/api_def_Pad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9c04ee59b3b2c067ecc1634689eef0c3f0604a1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Pad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Pad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PadV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_PadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e580992fb25cef38f9068a4667d1ce8cd782c96c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PadV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PadV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PaddingFIFOQueue.pbtxt b/tensorflow/core/api_def/python_api/api_def_PaddingFIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..575392b8b21f6b05ac882cac7fdfda27402d46ae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PaddingFIFOQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddingFIFOQueue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PaddingFIFOQueueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_PaddingFIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b37b4162c8fa8c04c61285303a9b0101dcfb6cf0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PaddingFIFOQueueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddingFIFOQueueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8117b085be638df6751e18ed4e40b45cbf60ae27
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParallelConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParallelConcat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParameterizedTruncatedNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParameterizedTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75444351fab7b40c407ad89618ea63279cef0e35
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParameterizedTruncatedNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParameterizedTruncatedNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseExample.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c68a58d311ac2a61267aec0a61c59ec95b8b9dfd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParseExample.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParseExample"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseSingleSequenceExample.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseSingleSequenceExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b47f452dd058026f12647e200b9bb59cc9dbab7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ParseSingleSequenceExample.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParseSingleSequenceExample"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Placeholder.pbtxt b/tensorflow/core/api_def/python_api/api_def_Placeholder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c70d9cfe011c871721090a7fc4aca8b982ef5dc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Placeholder.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Placeholder"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Pow.pbtxt b/tensorflow/core/api_def/python_api/api_def_Pow.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bee695149ad0fcc65a51aa06e3ae281a94ff6c6b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Pow.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Pow"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Print.pbtxt b/tensorflow/core/api_def/python_api/api_def_Print.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7854d4c7276f14cd56f77a7bb3f2b20d7f3326af
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Print.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Print"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PriorityQueue.pbtxt b/tensorflow/core/api_def/python_api/api_def_PriorityQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96d0e9bedc4cbf7eb36213539ebc150798d050b6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PriorityQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PriorityQueue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PriorityQueueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_PriorityQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f640692ff822b9764a3b53ee7dcadcc3d220e991
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PriorityQueueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PriorityQueueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Prod.pbtxt b/tensorflow/core/api_def/python_api/api_def_Prod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9801fc0f054fcab3443b12b2ed8e663180642b7d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Prod.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Prod"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PyFunc.pbtxt b/tensorflow/core/api_def/python_api/api_def_PyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df9e876f2bbd6addf7c018ead4783ee8b506e685
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PyFunc"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_PyFuncStateless.pbtxt b/tensorflow/core/api_def/python_api/api_def_PyFuncStateless.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50c8d1a096c52733ccf352a8f552f6573f07bbb9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_PyFuncStateless.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PyFuncStateless"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Q.pbtxt b/tensorflow/core/api_def/python_api/api_def_Q.pbtxt
deleted file mode 100644
index 0dfb5bb703bba7cb7576f48b112d3014d69c2824..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_Q.pbtxt
+++ /dev/null
@@ -1,83 +0,0 @@
-op {
-  graph_op_name: "Qr"
-  endpoint {
-    name: "linalg.qr"
-  }
-  endpoint {
-    name: "qr"
-  }
-}
-op {
-  graph_op_name: "QuantizedAvgPool"
-  endpoint {
-    name: "nn.quantized_avg_pool"
-  }
-}
-op {
-  graph_op_name: "QuantizedMaxPool"
-  endpoint {
-    name: "nn.quantized_max_pool"
-  }
-}
-op {
-  graph_op_name: "QuantizedReluX"
-  endpoint {
-    name: "nn.quantized_relu_x"
-  }
-}
-op {
-  graph_op_name: "QueueClose"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueCloseV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueDequeue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueDequeueMany"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueDequeueManyV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueDequeueUpTo"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueDequeueUpToV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueDequeueV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueEnqueue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueEnqueueMany"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueEnqueueManyV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueEnqueueV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueSize"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "QueueSizeV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt b/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b19da0d8176d90ae32830359e6608f21d592e4de
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Qr.pbtxt
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "Qr"
+  endpoint {
+    name: "linalg.qr"
+  }
+  endpoint {
+    name: "qr"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dfa793a16e18ab30891bcb9a997d7bed02410e54
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedAvgPool"
+  endpoint {
+    name: "nn.quantized_avg_pool"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3a58590f5773a3d886ace95108ee63a659362de2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMaxPool"
+  endpoint {
+    name: "nn.quantized_max_pool"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..926ec98eeb468e7fa4846ae013a112cc865bb82c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedReluX"
+  endpoint {
+    name: "nn.quantized_relu_x"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueClose.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ab6f2f821ab22662a77ed995e51a12c1d30a183
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueClose"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueCloseV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a00c3d78ad3afe8a2abb39ac646504c2427ca60
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueCloseV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueCloseV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueDequeue.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad2e246e92da4247d8f9d40259f3eb791aa4a2f8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueDequeue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueDequeueMany.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueDequeueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff6a6e47a4aafa0e26eb45b7eed8123cfdef86a5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueDequeueMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueMany"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueDequeueManyV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueDequeueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30ed19a2106f1deb2af0b214c334ead9b48bc208
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueDequeueManyV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueManyV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueDequeueUpTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueDequeueUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b59952a2124b4303a50a5773d6fdd7d1460094
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueDequeueUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueUpTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueDequeueUpToV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueDequeueUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd0cd2500dfefefe043ffb3c107c362507e25fc3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueDequeueUpToV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueUpToV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueDequeueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueDequeueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dfa758f1e2debd1fc4604269bcb0fd1c8037c0a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueDequeueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueEnqueue.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a3698fd30586b1f3c2af9530cf1776760dbd28f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueEnqueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueEnqueueMany.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueEnqueueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6bab13c9d981018e6cb2a0f4032dbbee94c2aa5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueEnqueueMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueueMany"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueEnqueueManyV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueEnqueueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a70b2019a530fc82d82098a76bc511fe91ecf518
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueEnqueueManyV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueueManyV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueEnqueueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueEnqueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a06d0a38563f8f539475275a8c97e96b40938620
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueEnqueueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueSize.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25e881d381e9497cff4dbcf88517614f4c6e129f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueSize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_QueueSizeV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_QueueSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b33b4e804a58c9859a81bc53adff3d1dd8507cb3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_QueueSizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueSizeV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_R.pbtxt b/tensorflow/core/api_def/python_api/api_def_R.pbtxt
deleted file mode 100644
index 0c8a8a4d4235ebe86a45d66d32dc307329d3f5ed..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_R.pbtxt
+++ /dev/null
@@ -1,192 +0,0 @@
-op {
-  graph_op_name: "RGBToHSV"
-  endpoint {
-    name: "image.rgb_to_hsv"
-  }
-}
-op {
-  graph_op_name: "RandomCrop"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomGamma"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomPoisson"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomShuffle"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomShuffleQueue"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomShuffleQueueV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomStandardNormal"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomUniform"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RandomUniformInt"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Range"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderNumRecordsProduced"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderNumRecordsProducedV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderNumWorkUnitsCompleted"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderNumWorkUnitsCompletedV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderRead"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderReadUpTo"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderReadUpToV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderReadV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderReset"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderResetV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderRestoreState"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderRestoreStateV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderSerializeState"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReaderSerializeStateV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RealDiv"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReciprocalGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RefExit"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RefIdentity"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RefMerge"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Relu"
-  endpoint {
-    name: "nn.relu"
-  }
-}
-op {
-  graph_op_name: "Relu6"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Relu6Grad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ReluGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ResizeArea"
-  endpoint {
-    name: "image.resize_area"
-  }
-}
-op {
-  graph_op_name: "ResizeBicubic"
-  endpoint {
-    name: "image.resize_bicubic"
-  }
-}
-op {
-  graph_op_name: "ResizeBicubicGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ResizeBilinear"
-  endpoint {
-    name: "image.resize_bilinear"
-  }
-}
-op {
-  graph_op_name: "ResizeBilinearGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ResizeNearestNeighbor"
-  endpoint {
-    name: "image.resize_nearest_neighbor"
-  }
-}
-op {
-  graph_op_name: "ResizeNearestNeighborGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Restore"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RestoreSlice"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Reverse"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "RsqrtGrad"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RGBToHSV.pbtxt b/tensorflow/core/api_def/python_api/api_def_RGBToHSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5676391e19547bb3c80304b0fdc6459acbc500f1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RGBToHSV.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RGBToHSV"
+  endpoint {
+    name: "image.rgb_to_hsv"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomCrop.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomCrop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f2da1712a20ca2f0d06e623d194c558b1fc75894
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomCrop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomCrop"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomGamma.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomGamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23509d8d61743749c741abb40450e0ccf17e2e8f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomGamma.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomGamma"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomPoisson.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomPoisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7da239cb668200ec7003baa274114bf35e076b2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomPoisson.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomPoisson"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomShuffle.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomShuffle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e265c8b4ee0eb01b2d802523f0a65fa6729a332
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomShuffle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomShuffle"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomShuffleQueue.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomShuffleQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be93d99e8483d9ec0c23e9793549b7a1f712aebf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomShuffleQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomShuffleQueue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomShuffleQueueV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomShuffleQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..afef0176820366018d0107f4dd5a108f8df502d7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomShuffleQueueV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomShuffleQueueV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomStandardNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3faff40d8ab4196ebb8b163becd154e1e148122a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomStandardNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomStandardNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomUniform.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b36975ca763dfa57b15ce2790bbbafb0cbb98f9b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomUniform.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomUniform"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomUniformInt.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c1da2952324896f7dcaee173469ed0eb354fef1f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RandomUniformInt.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomUniformInt"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Range.pbtxt b/tensorflow/core/api_def/python_api/api_def_Range.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48b0e9dda4a3e7b70af47aad356f2d0f0b96f325
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Range.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Range"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderNumRecordsProduced.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderNumRecordsProduced.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0849191a0c929f6cdb196f6cd676908d22dff0ac
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderNumRecordsProduced.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumRecordsProduced"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderNumRecordsProducedV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderNumRecordsProducedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad4acb68bc4d8b8f0c3456718a9a6d27b3765c07
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderNumRecordsProducedV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumRecordsProducedV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..283d3ce1d496c7295a135e05e9de087585e84cc5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompleted"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94a1af49126ac98d819b63ab1067e18253a5fd65
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompletedV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderRead.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0e248dfe4cfe6d23bf000083e247cdb88e8abd1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderRead.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRead"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderReadUpTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderReadUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e2502b22e03978d797b9043526e4cf100393687
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderReadUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReadUpTo"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderReadUpToV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderReadUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43b375c69f6a887c7faece56b216f6b098607cf7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderReadUpToV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReadUpToV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderReadV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef3500df063ea8a32f78312a7ee03c2134b7d34b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderReadV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReadV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderReset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderReset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..679b1caec047c574c76234ea825b595f73a4eab1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderReset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderResetV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderResetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59453c479c5a96a4fc394c357a7b24fc85ccd27b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderResetV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderResetV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderRestoreState.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderRestoreState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3075388c62d4eabd05f8332770c236fb36d6c7ea
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderRestoreState.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRestoreState"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderRestoreStateV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderRestoreStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0edc9e2f24869d0e7700d9cc2a3bfc5133cbd1b4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderRestoreStateV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRestoreStateV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderSerializeState.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderSerializeState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b766ce93af3d96282c0c56ed2026cc52c23e740e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderSerializeState.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderSerializeState"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReaderSerializeStateV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReaderSerializeStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e1247eec60b331550083ed8bd8e2fa914d8153c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReaderSerializeStateV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderSerializeStateV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd87eef8240532c158b7604d8c5576e6d0b8b24b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RealDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Relu.pbtxt b/tensorflow/core/api_def/python_api/api_def_Relu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64c61f4ecf450957e518944b748a038c189d2750
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Relu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu"
+  endpoint {
+    name: "nn.relu"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Relu6.pbtxt b/tensorflow/core/api_def/python_api/api_def_Relu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a132abdf391a675bbb88656a18c09250e3e22d7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Relu6.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Relu6"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f1b4aee00d90221d659daa34a7eb3462f42fa0c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeArea"
+  endpoint {
+    name: "image.resize_area"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ec8e0ad6359307eab1b166801474817d8c5282b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBicubic"
+  endpoint {
+    name: "image.resize_bicubic"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb3b8d6f458fff6163932457ef6c73a8fbbd721e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBilinear"
+  endpoint {
+    name: "image.resize_bilinear"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25c5d5701feefd6f8270236f29e1c187fa3cf06a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeNearestNeighbor"
+  endpoint {
+    name: "image.resize_nearest_neighbor"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Restore.pbtxt b/tensorflow/core/api_def/python_api/api_def_Restore.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ec456467df8921634a44f82fccda6994ef72f5c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Restore.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Restore"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RestoreSlice.pbtxt b/tensorflow/core/api_def/python_api/api_def_RestoreSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f188a291e6d9066a851ecc1779dcc01c2f82a64c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RestoreSlice.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RestoreSlice"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Reverse.pbtxt b/tensorflow/core/api_def/python_api/api_def_Reverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2cad1a557b7eda2c3a78ab4de1052c65a749791
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Reverse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Reverse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8307a3c2dddd0891f21534d12e2beed19b70b552
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ReverseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReverseV2"
+  endpoint {
+    name: "reverse_v2"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_RightShift.pbtxt b/tensorflow/core/api_def/python_api/api_def_RightShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d82c3d7e7a6adaea6f0d41521d8bf9a3e20222d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_RightShift.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RightShift"
+  endpoint {
+    name: "bitwise.right_shift"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_S.pbtxt b/tensorflow/core/api_def/python_api/api_def_S.pbtxt
deleted file mode 100644
index 0c34730200c88a0c75cd4cb11f3cf8f177583417..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_S.pbtxt
+++ /dev/null
@@ -1,252 +0,0 @@
-op {
-  graph_op_name: "SampleDistortedBoundingBox"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SampleDistortedBoundingBoxV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Save"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SaveSlices"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ScalarSummary"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SdcaFprint"
-  endpoint {
-    name: "train.sdca_fprint"
-  }
-}
-op {
-  graph_op_name: "SdcaOptimizer"
-  endpoint {
-    name: "train.sdca_optimizer"
-  }
-}
-op {
-  graph_op_name: "SdcaShrinkL1"
-  endpoint {
-    name: "train.sdca_shrink_l1"
-  }
-}
-op {
-  graph_op_name: "Select"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SelfAdjointEig"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SelfAdjointEigV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Selu"
-  endpoint {
-    name: "nn.selu"
-  }
-}
-op {
-  graph_op_name: "SeluGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SerializeManySparse"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SerializeSparse"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ShardedFilename"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ShardedFilespec"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Sigmoid"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SigmoidGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Skipgram"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Slice"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Softmax"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SoftmaxCrossEntropyWithLogits"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Softplus"
-  endpoint {
-    name: "nn.softplus"
-  }
-}
-op {
-  graph_op_name: "SoftplusGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Softsign"
-  endpoint {
-    name: "nn.softsign"
-  }
-}
-op {
-  graph_op_name: "SoftsignGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SpaceToBatch"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseAdd"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseAddGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseConcat"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseCross"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseFillEmptyRows"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseFillEmptyRowsGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseMatMul"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseReorder"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseReshape"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseSoftmaxCrossEntropyWithLogits"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseSplit"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseTensorDenseAdd"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseTensorDenseMatMul"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SparseToDense"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Split"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SplitV"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SqrtGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Squeeze"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Stack"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackClose"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackCloseV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackPop"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackPopV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackPush"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackPushV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StackV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "StringSplit"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Sub"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Sum"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Svd"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Switch"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "SymbolicGradient"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/python_api/api_def_SampleDistortedBoundingBox.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20a155bd5fdf6334d414fc6933e1e80a55a06067
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SampleDistortedBoundingBox.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBox"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdd0797f6d42f1ae1a9cb3d268b17030b5ce54a8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBoxV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Save.pbtxt b/tensorflow/core/api_def/python_api/api_def_Save.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e43730f6ff97b376270d2bf73abeff76029813ce
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Save.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Save"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SaveSlices.pbtxt b/tensorflow/core/api_def/python_api/api_def_SaveSlices.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5861c84b275f478bb2d0312c0a86cd1184812f72
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SaveSlices.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SaveSlices"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScalarSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4009af08c12c576f26c334ce5516f19ac9d3ce7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScalarSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScalarSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SdcaFprint.pbtxt b/tensorflow/core/api_def/python_api/api_def_SdcaFprint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..60e249077f26da7dd507fa67a8c1810a9d71d195
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SdcaFprint.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaFprint"
+  endpoint {
+    name: "train.sdca_fprint"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SdcaOptimizer.pbtxt b/tensorflow/core/api_def/python_api/api_def_SdcaOptimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e8e95ee9d9785f640d9ed807eec1bbdafe7d380
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SdcaOptimizer.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaOptimizer"
+  endpoint {
+    name: "train.sdca_optimizer"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SdcaShrinkL1.pbtxt b/tensorflow/core/api_def/python_api/api_def_SdcaShrinkL1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..552a91fb7ef4cb9801377e01d3ce1390eac37af2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SdcaShrinkL1.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaShrinkL1"
+  endpoint {
+    name: "train.sdca_shrink_l1"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Select.pbtxt b/tensorflow/core/api_def/python_api/api_def_Select.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e0f3dc4a23ac4d2afef9c573af0149f836c37d5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Select.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Select"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SelfAdjointEig.pbtxt b/tensorflow/core/api_def/python_api/api_def_SelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..febe9f2f5ac54ec05c433edca50a3bcc0fff3570
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SelfAdjointEig.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SelfAdjointEig"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_SelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9f3274882a582feb0c4889fef944ecda0479b7b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SelfAdjointEigV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SelfAdjointEigV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/python_api/api_def_Selu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da9ad7ce3442fdc0031629332131323da2ed0fc7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Selu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Selu"
+  endpoint {
+    name: "nn.selu"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SerializeManySparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_SerializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b965a3ad4312fabcbc5a0038c583484f9253c217
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SerializeManySparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SerializeManySparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SerializeSparse.pbtxt b/tensorflow/core/api_def/python_api/api_def_SerializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe95f20302331ec2348cfc5d0ba83dc47a19ad41
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SerializeSparse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SerializeSparse"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShardedFilename.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShardedFilename.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67b2ef6bfc8b66cd8ef435ec1718fd522ae81950
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ShardedFilename.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShardedFilename"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShardedFilespec.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShardedFilespec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb25f103f17114c53c6cdae57d6747e193dff402
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ShardedFilespec.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ShardedFilespec"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sigmoid.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sigmoid.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a6ffb7198aba8d341a74ef030dc2fe570da4faa
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sigmoid.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sigmoid"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Skipgram.pbtxt b/tensorflow/core/api_def/python_api/api_def_Skipgram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bc76069f218f70dcff0e3ea24755e6f19812587
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Skipgram.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Skipgram"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Slice.pbtxt b/tensorflow/core/api_def/python_api/api_def_Slice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12e7dcc20340e3af92bc437483d88458446aab3e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Slice.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Slice"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ccee39765b659cc27e04a48cffc1caf97d5af
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Snapshot"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Softmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_Softmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8605c8ddd9540862287ef647a5aec701716f008
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Softmax.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Softmax"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/python_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e30b5a4821d167056cb15bbe1c5fd35f2b3879c5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SoftmaxCrossEntropyWithLogits"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Softplus.pbtxt b/tensorflow/core/api_def/python_api/api_def_Softplus.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2de56c27be2b5535fbb54cbab9c7004b9f0c2e27
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Softplus.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softplus"
+  endpoint {
+    name: "nn.softplus"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Softsign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Softsign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b47412d1356ab76b83da048e4880126229146692
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Softsign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softsign"
+  endpoint {
+    name: "nn.softsign"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SpaceToBatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_SpaceToBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a26f9a3ec47f625f3f5796bcc55f47caed4b8d9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SpaceToBatch.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SpaceToBatch"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6de974fd274231468e84a4eeb155631b250fb4a0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseAddGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87c306aaccb0123a60965dc92ebd370d206738ef
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseAddGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseAddGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3bae51fe23aa3a6092602f66027c95a6c98b2eb9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseConcat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseCross.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseCross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25506cbb31539bef61fafcecf9dc7d0ac74e7993
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseCross.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseCross"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseFillEmptyRows.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseFillEmptyRows.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..242e87af1ef519a4c033073789f9b002fcf4b1c2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseFillEmptyRows.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseFillEmptyRows"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseFillEmptyRowsGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseFillEmptyRowsGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1cb69c48040dc03a49167cc26e76efbb8fc11090
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseFillEmptyRowsGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseFillEmptyRowsGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseMatMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f0af41dbdb3fd103d82187bb242f2732a49a43bf
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseMatMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseMatMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseReorder.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseReorder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18be89eff499c899911dc93de4bf3ef7ebb73000
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseReorder.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseReorder"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseReshape.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..010de3e4ad6d701905054183bf165e215c6c3b66
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseReshape.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseReshape"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06e461aaa7c53045e78afdb4420280b75fdb2aa1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSoftmaxCrossEntropyWithLogits"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseSplit.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..285fb96d45ed9d63c4c8ec6e01e885a27892f08c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseSplit"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseTensorDenseAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseTensorDenseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b4b6b92327b4ac7d121436cdb36d570adc7aba1
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseTensorDenseAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseTensorDenseAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseTensorDenseMatMul.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseTensorDenseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07878ed2e8be1166b567a07c9c3831e40557ce84
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseTensorDenseMatMul.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseTensorDenseMatMul"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseToDense.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseToDense.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f5d6f1d96c592be9a2b87cfcb5fb59c5d813807
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SparseToDense.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseToDense"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Split.pbtxt b/tensorflow/core/api_def/python_api/api_def_Split.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..609fd3dc2aaa9e7c466c36b43c058012301a49c2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Split.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Split"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SplitV.pbtxt b/tensorflow/core/api_def/python_api/api_def_SplitV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ae6f36d1c3ba75583cf6c44a21f8c58fa8a67d6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SplitV.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SplitV"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Squeeze.pbtxt b/tensorflow/core/api_def/python_api/api_def_Squeeze.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9f5697ca94efbe071aeff18eb7229c12fc374936
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Squeeze.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Squeeze"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Stack.pbtxt b/tensorflow/core/api_def/python_api/api_def_Stack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ee4f6288b43019572d199554c7d2e245cf27009
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Stack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Stack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackClose.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..086acaa5341625465496d73a88c8840fbcd89a54
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackClose"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackCloseV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e450ce00474ea02a0624b0a8c97c37bdd14f65eb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackCloseV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackCloseV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackPop.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackPop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59352ead760318e976a58135919b1198bca317ce
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackPop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPop"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackPopV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackPopV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..102fdd00b7889c7dfc0432b31e4788d4dab44ecb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackPopV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPopV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackPush.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackPush.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a83c24909c2a04ce31e8baceeb81ac29b3721f08
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackPush.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPush"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackPushV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackPushV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fad442ada6c2f81e255374b22cc9a81f5616efe7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackPushV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPushV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StackV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StackV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31362f46f7426fdd009cc60a1a90978fec502823
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StackV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplit.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..891ff7157a4777def06f9340bb6bb550c80670ad
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StringSplit"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sub.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..747b44d4feecd7f81886ed338658914694786230
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68e34721814f947609388baa6aad1a50b2ec8d71
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Sum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Sum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Svd.pbtxt b/tensorflow/core/api_def/python_api/api_def_Svd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..098180f8d906c239e55606208994dfa2ad39950a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Svd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Svd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Switch.pbtxt b/tensorflow/core/api_def/python_api/api_def_Switch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2087c860b425a84c6261e27e0f54c15812228ffc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Switch.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Switch"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SymbolicGradient.pbtxt b/tensorflow/core/api_def/python_api/api_def_SymbolicGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f747f464b8956a22db7beb795df439df3066e49
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SymbolicGradient.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SymbolicGradient"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_T.pbtxt b/tensorflow/core/api_def/python_api/api_def_T.pbtxt
deleted file mode 100644
index 8011a11243f307c4046aba376b39e34c53cd479a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_T.pbtxt
+++ /dev/null
@@ -1,196 +0,0 @@
-op {
-  graph_op_name: "TFRecordReader"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TFRecordReaderV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TakeManySparseFromTensorsMap"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "Tanh"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TanhGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TemporaryVariable"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArray"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayClose"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayCloseV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayCloseV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayConcat"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayConcatV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayConcatV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayGather"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayGatherV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayGatherV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayGradV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayGradV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayPack"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayRead"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayReadV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayReadV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayScatter"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayScatterV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayScatterV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArraySize"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArraySizeV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArraySizeV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArraySplit"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArraySplitV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArraySplitV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayUnpack"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayWrite"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayWriteV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorArrayWriteV3"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorSummary"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TensorSummaryV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TextLineReader"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TextLineReaderV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "ThreadUnsafeUnigramCandidateSampler"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TileGrad"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TopK"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TopKV2"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TruncateDiv"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TruncateMod"
-  visibility: HIDDEN
-}
-op {
-  graph_op_name: "TruncatedNormal"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TFRecordReader.pbtxt b/tensorflow/core/api_def/python_api/api_def_TFRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ec8bee340f3a825b23369e0bf87e6402308f763
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TFRecordReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TFRecordReader"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TFRecordReaderV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TFRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1cda01a6eea0d135c42370cdfe2551416da4fce
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TFRecordReaderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TFRecordReaderV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TakeManySparseFromTensorsMap.pbtxt b/tensorflow/core/api_def/python_api/api_def_TakeManySparseFromTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..842419cc253c2699aeb79ff195d9f8dc9a5b31df
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TakeManySparseFromTensorsMap.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TakeManySparseFromTensorsMap"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c946e0a794a77fe6f40613824e6d614e9667ccf9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Tanh"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TemporaryVariable.pbtxt b/tensorflow/core/api_def/python_api/api_def_TemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9201b4fec5c3e0095ad14df63c6359436654b93
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TemporaryVariable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TemporaryVariable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArray.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArray.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7288b85d7ec3990d591e8db38aa018ed1e1dd25
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArray.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArray"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayClose.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73e208459c0c1b0cfa8fd28e73f2fcff4844a44e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayClose"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayCloseV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c6955f8c7930d0c6da6672e62e3606dfee8748a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayCloseV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayCloseV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayCloseV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayCloseV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d95854cefb984782d11b188a6b774aad1941536c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayCloseV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayCloseV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3695a787b98c9adae8f3a8b90946c6b1b68a2f9a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayConcatV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac103d3c48611c89ba6da87740e78e16adbec887
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayConcatV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcatV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayConcatV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayConcatV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54cdd3b949d112bde2494b5e299ae601cc44377f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayConcatV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcatV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGather.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82a98fe7f5b77815e2dcd5d1a94887421a9f2582
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGather"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGatherV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b7fc8541ddb5baa2120c46493038d895f5fe6920
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGatherV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGatherV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGatherV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGatherV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08dcedb8b7e19c7d244cad127fe6e75a8dd6e93b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGatherV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGatherV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04b614d22ca986cfcb42ac05095c95a024ae6af4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGradV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf204dde364e4779d37e5c7c3b3ddb07a8f57b8f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGradV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGradV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGradV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGradV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75aaf6126eb9bb06a902e730e781f1db6381f9a3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGradV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGradV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayPack.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayPack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f60367c1fc92ec39871ed77645bc6f8756d3fc35
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayPack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayPack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayRead.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6c4ccdcf528747a0e9c88f7541c7f87f212a856
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayRead.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayRead"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayReadV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be19fe86fa46d83a722a422b357730a3873299c3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayReadV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayReadV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayReadV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayReadV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d024f420bc97ca1db9f40de5b53b8988f5aa5e90
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayReadV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayReadV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayScatter.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayScatter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cdbb22af52d22e7c5cf83c0e732911003688304
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayScatter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatter"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayScatterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02e16b14073b0f5d55e46398a380d57e695b1dae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayScatterV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatterV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayScatterV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayScatterV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d262cc6653e6fea43c71656b081f820dd2c7fa0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayScatterV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatterV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArraySize.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArraySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..169e495c4f9d1616be654861fe98640507513401
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArraySize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArraySizeV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArraySizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d0dbd0d813a20d71f26ff9f62287253ca17fdc7f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArraySizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySizeV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArraySizeV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArraySizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4e656f5cc8c6e589d123ec145735a3c6dc20a5a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArraySizeV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySizeV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArraySplit.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArraySplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d9c53b2b455827f3b48c0087d81f5cf45a8c38e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArraySplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplit"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArraySplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArraySplitV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..502c78d83ecad1801c5a98fc47bf8d156e1198a8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArraySplitV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplitV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArraySplitV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArraySplitV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..faefa0fac25a6af5a3e6c399d5a996170cca1969
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArraySplitV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplitV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayUnpack.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayUnpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5c0a794eb66eabc83e38be07d568a9d6e0b1548
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayUnpack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayUnpack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ad5c5f288dfc17cb0f44cfbb32b51c31713414a
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da69f1513cd6f675fbfbac115340b0c0b7a7c795
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayWrite.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayWrite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58d50cb7f2c29947f8465371987079e8f6bc4646
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayWrite.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWrite"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayWriteV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayWriteV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f07bb3501784856a8e33e467d4d8f3e2fd819ebc
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayWriteV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWriteV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayWriteV3.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayWriteV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..becc140401de536005572029a76a161864b48d4f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayWriteV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWriteV3"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5148e5d0c88de0f168873ebb5237978e122d95b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorSummary"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorSummaryV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6245bdce2b1215c0ebdcaffca1b46106c7413c86
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorSummaryV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorSummaryV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TextLineReader.pbtxt b/tensorflow/core/api_def/python_api/api_def_TextLineReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fa1f6a441a72e5772824c8eca4b7cdc1cb31b4f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TextLineReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TextLineReader"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TextLineReaderV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TextLineReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc506b39d6ef50b67cb5f01c7629d443e1cde6d6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TextLineReaderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TextLineReaderV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/python_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7433d2f967b561514f2515fb270ec703c3f59711
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ThreadUnsafeUnigramCandidateSampler"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TileGrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_TileGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d2dce067bf8df0d3ffee97de949e8380f2f5c1c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TileGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TileGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TopK.pbtxt b/tensorflow/core/api_def/python_api/api_def_TopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85ebb650e0c12b035858437770ef657df29c061c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TopK.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TopK"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TopKV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..671b04819ce40b04113af2e4b23d1ca78fc49d78
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TopKV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TopKV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a547f771cfb3d4f3d9496ea24196e1a8a1f1879
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TruncateDiv"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0731e8810e25cad2cca02522aba55d032b1765b2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TruncateMod"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncatedNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6003b2fdca1508c0b26e5232b0c1e648f636db73
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TruncatedNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TruncatedNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_U.pbtxt b/tensorflow/core/api_def/python_api/api_def_UniformCandidateSampler.pbtxt
similarity index 56%
rename from tensorflow/core/api_def/python_api/api_def_U.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_UniformCandidateSampler.pbtxt
index d7c261c63c8dcc259b4d5e77f114dba746538f61..6a73062b0a07147179faf469d98ae6c2ea5555ba 100644
--- a/tensorflow/core/api_def/python_api/api_def_U.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_UniformCandidateSampler.pbtxt
@@ -2,7 +2,3 @@ op {
   graph_op_name: "UniformCandidateSampler"
   visibility: HIDDEN
 }
-op {
-  graph_op_name: "Unpack"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Unpack.pbtxt b/tensorflow/core/api_def/python_api/api_def_Unpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30d7b7f73471a9a48f595cf52c59c44653476c39
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Unpack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Unpack"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Variable.pbtxt b/tensorflow/core/api_def/python_api/api_def_Variable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7340d2a5c4fa8eab61f8a888e0c6937d63b537b0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Variable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Variable"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_V.pbtxt b/tensorflow/core/api_def/python_api/api_def_VariableV2.pbtxt
similarity index 50%
rename from tensorflow/core/api_def/python_api/api_def_V.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_VariableV2.pbtxt
index 18be21a8866580e0a156c5edd034685062baa550..7f63a5775573432028db5eb0ffe5a7df7941bbb6 100644
--- a/tensorflow/core/api_def/python_api/api_def_V.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_VariableV2.pbtxt
@@ -1,7 +1,3 @@
-op {
-  graph_op_name: "Variable"
-  visibility: HIDDEN
-}
 op {
   graph_op_name: "VariableV2"
   visibility: HIDDEN
diff --git a/tensorflow/core/api_def/python_api/api_def_WholeFileReader.pbtxt b/tensorflow/core/api_def/python_api/api_def_WholeFileReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1cc7a00281ee10f6d3a1a16e14d99e2952506f3
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_WholeFileReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WholeFileReader"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_W.pbtxt b/tensorflow/core/api_def/python_api/api_def_WholeFileReaderV2.pbtxt
similarity index 50%
rename from tensorflow/core/api_def/python_api/api_def_W.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_WholeFileReaderV2.pbtxt
index cd8861a98f6e58cd382c56ebc9669cfe34af8688..48e7b1e0eca37e116e657d0f5b8b9a1ba4b7f6e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_W.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_WholeFileReaderV2.pbtxt
@@ -1,7 +1,3 @@
-op {
-  graph_op_name: "WholeFileReader"
-  visibility: HIDDEN
-}
 op {
   graph_op_name: "WholeFileReaderV2"
   visibility: HIDDEN
diff --git a/tensorflow/core/api_def/python_api/api_def_Z.pbtxt b/tensorflow/core/api_def/python_api/api_def_ZerosLike.pbtxt
similarity index 100%
rename from tensorflow/core/api_def/python_api/api_def_Z.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_ZerosLike.pbtxt
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index 81cd44870e3031313ca8202ab67a333e1d6eca38..a1e3b21e4f2d6af1b7e3c68d82a77f96bd34e613 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -35,7 +35,7 @@ Tensor make_zeros(const DataType& dtype, const TensorShapeProto& shape) {
 // Replaces occurrences of the "AccumulateNV2" stub operator with a graph of
 // lower-level ops. The graph is equivalent (modulo certain corner cases)
 // to the semantics of the original accumulate_n() Python op in math_ops.py.
-// Implementing the op with a rewrite allows this new variant of accumulate_n 
+// Implementing the op with a rewrite allows this new variant of accumulate_n
 // to be differentiable.
 //
 // The binary code that generates AccumulateNV2 stub ops is located in a
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 38fe247521b129841d32c367b7b5416cc945553e..6399b8cf55b98f330a93ae28b516c59bee5c9d79 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -296,12 +296,13 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
         // it from the free bin structure prior to using.
         RemoveFreeChunkIterFromBin(&b->free_chunks, citer);
 
-        // If we can break the size of the chunk into two reasonably
-        // large pieces, do so.
-        //
-        // TODO(vrv): What should be the criteria when deciding when
-        // to split?
-        if (chunk->size >= rounded_bytes * 2) {
+        // If we can break the size of the chunk into two reasonably large
+        // pieces, do so.  In any case don't waste more than
+        // kMaxInternalFragmentation bytes on padding this alloc.
+        const int64 kMaxInternalFragmentation = 128 << 20;  // 128mb
+        if (chunk->size >= rounded_bytes * 2 ||
+            static_cast<int64>(chunk->size) - rounded_bytes >=
+                kMaxInternalFragmentation) {
           SplitChunk(h, rounded_bytes);
           chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
         }
diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc
index 65ffdba6b30c40db26bf16e58c4a024412f974d0..e35548729b993c68f6e58180e0c2dc18b4eea801 100644
--- a/tensorflow/core/common_runtime/copy_tensor.cc
+++ b/tensorflow/core/common_runtime/copy_tensor.cc
@@ -52,15 +52,7 @@ void CopyHostToDevice(const Tensor* input, Allocator* cpu_allocator,
                       Device* dst, Tensor* output,
                       DeviceContext* recv_dev_context, StatusCallback done) {
   if (input->dtype() == DT_VARIANT) {
-    if (input->shape().dims() != 0) {
-      // TODO(b/67311047): Expand support to non-singleton variants?
-      Status err = errors::Unimplemented(
-          "CopyTensor::ViaDMA: Only singleton Variants are "
-          "supported. Tensor has shape: ",
-          input->shape().DebugString());
-      done(err);
-    }
-    Tensor copy(cpu_allocator, DT_VARIANT, TensorShape({}));
+    Tensor copy(cpu_allocator, DT_VARIANT, input->shape());
     auto* status_cb = new ReffedStatusCallback(std::move(done));
     core::ScopedUnref status_cb_unref(status_cb);
 
@@ -93,14 +85,18 @@ void CopyHostToDevice(const Tensor* input, Allocator* cpu_allocator,
         },
         std::move(wrapped_done), std::placeholders::_1, std::placeholders::_2);
 
-    const Variant& v = input->scalar<Variant>()();
-    Variant* v_out = &(copy.scalar<Variant>()());
-    Status s_copy_init =
-        VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE, v, v_out,
-                          std::move(copier));
-    if (!s_copy_init.ok()) {
-      status_cb->UpdateStatus(s_copy_init);
-    } else {
+    const Variant* v = input->flat<Variant>().data();
+    Variant* v_out = copy.flat<Variant>().data();
+    Status s_copy_init;
+    for (int64 i = 0; i < input->NumElements(); ++i) {
+      s_copy_init = VariantDeviceCopy(
+          VariantDeviceCopyDirection::HOST_TO_DEVICE, v[i], &v_out[i], copier);
+      if (!s_copy_init.ok()) {
+        status_cb->UpdateStatus(s_copy_init);
+        break;
+      }
+    }
+    if (s_copy_init.ok()) {
       *output = std::move(copy);
     }
   } else {
@@ -114,15 +110,7 @@ void CopyDeviceToHost(const Tensor* input, Allocator* cpu_allocator,
                       Device* src, Tensor* output,
                       DeviceContext* send_dev_context, StatusCallback done) {
   if (input->dtype() == DT_VARIANT) {
-    if (input->shape().dims() != 0) {
-      // TODO(b/67311047): Expand support to non-singleton variants?
-      done(errors::Unimplemented(
-          "CopyTensor::ViaDMA: Only singleton Variants are "
-          "supported. Tensor has shape: ",
-          input->shape().DebugString()));
-      return;
-    }
-    Tensor copy(cpu_allocator, DT_VARIANT, TensorShape({}));
+    Tensor copy(cpu_allocator, DT_VARIANT, input->shape());
     auto* status_cb = new ReffedStatusCallback(std::move(done));
     core::ScopedUnref status_cb_unref(status_cb);
 
@@ -155,14 +143,18 @@ void CopyDeviceToHost(const Tensor* input, Allocator* cpu_allocator,
         },
         std::move(wrapped_done), std::placeholders::_1, std::placeholders::_2);
 
-    const Variant& v = input->scalar<Variant>()();
-    Variant* v_out = &(copy.scalar<Variant>()());
-    Status s_copy_init =
-        VariantDeviceCopy(VariantDeviceCopyDirection::DEVICE_TO_HOST, v, v_out,
-                          std::move(copier));
-    if (!s_copy_init.ok()) {
-      status_cb->UpdateStatus(s_copy_init);
-    } else {
+    const Variant* v = input->flat<Variant>().data();
+    Variant* v_out = copy.flat<Variant>().data();
+    Status s_copy_init;
+    for (int64 i = 0; i < input->NumElements(); ++i) {
+      s_copy_init = VariantDeviceCopy(
+          VariantDeviceCopyDirection::DEVICE_TO_HOST, v[i], &v_out[i], copier);
+      if (!s_copy_init.ok()) {
+        status_cb->UpdateStatus(s_copy_init);
+        break;
+      }
+    }
+    if (s_copy_init.ok()) {
       *output = std::move(copy);
     }
   } else {
@@ -180,15 +172,7 @@ void CopyDeviceToDevice(CopyTensor::CopyFunction copy_function,
                         const Tensor* input, Tensor* output,
                         StatusCallback done) {
   if (input->dtype() == DT_VARIANT) {
-    if (input->shape().dims() != 0) {
-      // TODO(b/67311047): Expand support to non-singleton variants?
-      done(errors::Unimplemented(
-          "CopyTensor::ViaDMA: Only singleton Variants are "
-          "supported. Tensor has shape: ",
-          input->shape().DebugString()));
-      return;
-    }
-    Tensor copy(cpu_allocator, DT_VARIANT, TensorShape({}));
+    Tensor copy(cpu_allocator, DT_VARIANT, input->shape());
     auto* status_cb = new ReffedStatusCallback(std::move(done));
     core::ScopedUnref status_cb_unref(status_cb);
 
@@ -223,14 +207,19 @@ void CopyDeviceToDevice(CopyTensor::CopyFunction copy_function,
         },
         std::move(wrapped_done), std::placeholders::_1, std::placeholders::_2);
 
-    const Variant& v = input->scalar<Variant>()();
-    Variant* v_out = &(copy.scalar<Variant>()());
-    Status s_copy_init =
-        VariantDeviceCopy(VariantDeviceCopyDirection::DEVICE_TO_DEVICE, v,
-                          v_out, std::move(copier));
-    if (!s_copy_init.ok()) {
-      status_cb->UpdateStatus(s_copy_init);
-    } else {
+    const Variant* v = input->flat<Variant>().data();
+    Variant* v_out = copy.flat<Variant>().data();
+    Status s_copy_init;
+    for (int64 i = 0; i < input->NumElements(); ++i) {
+      s_copy_init =
+          VariantDeviceCopy(VariantDeviceCopyDirection::DEVICE_TO_DEVICE, v[i],
+                            &v_out[i], copier);
+      if (!s_copy_init.ok()) {
+        status_cb->UpdateStatus(s_copy_init);
+        break;
+      }
+    }
+    if (s_copy_init.ok()) {
       *output = std::move(copy);
     }
   } else {
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 674111dbe69bcd6961e80f8da6496a332d45f84b..d5a452a796d67400d56ca08c675e0386348dea13 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -110,12 +110,9 @@ class Device : public DeviceBase {
   // prototyping of TensorFlow device implementations that need to modify
   // the GraphDef before execution.
   //
-  // 'library' provides access to the function library which is shared
-  // between all device partitions.
   // 'graph' supplies the partition of the graph assigned to this
   // device.
-  virtual Status MaybeRewriteGraph(const FunctionDefLibrary& /*library*/,
-                                   std::unique_ptr<Graph>* /*graph*/) {
+  virtual Status MaybeRewriteGraph(std::unique_ptr<Graph>* /*graph*/) {
     return Status::OK();
   }
 
@@ -134,7 +131,7 @@ class Device : public DeviceBase {
   OpSegment* op_segment() { return &op_seg_; }
 
   // Returns the resource manager associated w/ this device.
-  ResourceMgr* resource_manager() { return rmgr_; }
+  virtual ResourceMgr* resource_manager() { return rmgr_; }
 
   // Summarizes the status of this Device, for debugging.
   string DebugString() const { return ProtoDebugString(device_attributes_); }
diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index fa12c48fb90064ed2de68a6d018a17551ec3390a..b43c718817558f0e44eff5f5e5d5ec3a81d25ddd 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -32,7 +32,7 @@ namespace tensorflow {
 namespace {
 
 static mutex* get_device_factory_lock() {
-  static mutex device_factory_lock;
+  static mutex device_factory_lock(LINKER_INITIALIZED);
   return &device_factory_lock;
 }
 
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index d16681ac59d3bc34a54f63b8b55f372c661591b4..cd93f76324b937046f61b305a65fb53c2c133ab7 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -68,7 +68,7 @@ class DeviceMgr {
 
   StringPiece CopyToBackingStore(StringPiece s);
 
-  std::unordered_map<StringPiece, Device*, StringPiece::Hasher> device_map_;
+  std::unordered_map<StringPiece, Device*, StringPieceHasher> device_map_;
   core::Arena name_backing_store_;  // Storage for keys in device_map_
   std::unordered_map<string, int> device_type_counts_;
 
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 316fb0ac1611912797d2a16e6eb49e6eed8542b2..103b4b13c78576228d3aaa3ede5cfcb52b1dbd2a 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -54,15 +54,13 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/device_tracer.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/env_var.h"
 
-#if GOOGLE_CUDA
-#include "tensorflow/core/platform/gpu_tracer.h"
-#endif  // GOOGLE_CUDA
 
 namespace tensorflow {
 
@@ -523,9 +521,7 @@ Status DirectSession::Run(const RunOptions& run_options,
 
   args.rendezvous = run_state.rendez;
   args.cancellation_manager = &step_cancellation_manager;
-  args.runner = [this, pool](Executor::Args::Closure c) {
-    SchedClosure(pool, std::move(c));
-  };
+
   args.session_state = &session_state_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
@@ -548,21 +544,26 @@ Status DirectSession::Run(const RunOptions& run_options,
           ((measure_step_count + 1) % build_cost_model_every == 0);
     }
   }
-  if (do_trace || update_cost_model) {
+  if (do_trace || update_cost_model ||
+      run_options.report_tensor_allocations_upon_oom()) {
     run_state.collector.reset(
         new StepStatsCollector(run_metadata->mutable_step_stats()));
     args.stats_collector = run_state.collector.get();
   }
 
-#if GOOGLE_CUDA
-  std::unique_ptr<GPUTracer> tracer;
+  std::unique_ptr<DeviceTracer> tracer;
   if (run_options.trace_level() >= RunOptions::HARDWARE_TRACE) {
-    tracer = CreateGPUTracer();
-    // tracer will be NULL on non-GPU platforms.
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    if (tracer) tracer->Start().IgnoreError();
+    tracer = CreateDeviceTracer();
+    // tracer may be NULL on platforms without accelerators.
+    if (tracer) {
+      Status s = tracer->Start();
+      if (!s.ok()) {
+        run_state.executors_done.Notify();
+        delete barrier;
+        return s;
+      }
+    }
   }
-#endif  // GOOGLE_CUDA
 
   // Register this step with session's cancellation manager, so that
   // `Session::Close()` will cancel the step.
@@ -581,7 +582,24 @@ Status DirectSession::Run(const RunOptions& run_options,
     return errors::Cancelled("Run call was cancelled");
   }
 
+  Executor::Args::Runner default_runner = [this,
+                                           pool](Executor::Args::Closure c) {
+    SchedClosure(pool, std::move(c));
+  };
   for (const auto& item : executors_and_keys->items) {
+    // TODO(zhengxq): support partial run.
+    // TODO(zhengxq): support other session types.
+    // TODO(zhengxq): if the device picks its own threadpool, we need to assign
+    //     less threads to the main compute pool by default.
+    thread::ThreadPool* device_thread_pool =
+        item.device->tensorflow_device_thread_pool();
+    if (!device_thread_pool) {
+      args.runner = default_runner;
+    } else {
+      args.runner = [this, device_thread_pool](Executor::Args::Closure c) {
+        SchedClosure(device_thread_pool, std::move(c));
+      };
+    }
     item.executor->RunAsync(args, barrier->Get());
   }
 
@@ -597,13 +615,10 @@ Status DirectSession::Run(const RunOptions& run_options,
     run_state.status.Update(errors::Cancelled("Run call was cancelled"));
   }
 
-#if GOOGLE_CUDA
   if (tracer) {
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    tracer->Stop().IgnoreError();
-    tracer->Collect(args.stats_collector).IgnoreError();
+    TF_RETURN_IF_ERROR(tracer->Stop());
+    TF_RETURN_IF_ERROR(tracer->Collect(args.stats_collector));
   }
-#endif  // GOOGLE_CUDA
 
   {
     mutex_lock l(run_state.mu_);
@@ -1135,7 +1150,7 @@ Status DirectSession::GetOrCreateExecutors(
 
   if (run_state_args->is_partial_run) {
     ek->graph = std::move(run_state_args->graph);
-    std::unordered_set<StringPiece, StringPiece::Hasher> names;
+    std::unordered_set<StringPiece, StringPieceHasher> names;
     for (const string& input : inputs) {
       TensorId id(ParseTensorName(input));
       names.emplace(id.first);
@@ -1222,6 +1237,7 @@ Status DirectSession::GetOrCreateExecutors(
     // NewLocalExecutor takes ownership of partition_graph.
     item->graph = partition_graph.get();
     item->executor = nullptr;
+    item->device = device;
     Executor* executor;
     TF_RETURN_IF_ERROR(
         NewLocalExecutor(params, partition_graph.release(), &executor));
@@ -1418,11 +1434,7 @@ Status DirectSession::CreateGraphs(
     Device* d;
     s = device_mgr_->LookupDevice(partition_name, &d);
     if (!s.ok()) break;
-    // TODO(pbar) The library is currently shared and immutable. There
-    // may be possible use cases where a device may want to modify
-    // function definitions - in which case the library would need to be
-    // replicated per device.
-    s = d->MaybeRewriteGraph(client_graph->flib_def->ToProto(), graph);
+    s = d->MaybeRewriteGraph(graph);
     if (!s.ok()) {
       break;
     }
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 7fbabf6d818f3f8ace64235724f35740fee5cec0..ab768b97c48420e92beb360dc6aa97f42e59ca61 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -64,8 +64,7 @@ class DirectSession : public Session {
   ~DirectSession() override;
 
   typedef std::vector<std::pair<string, Tensor>> NamedTensorList;
-  typedef std::unordered_map<StringPiece, Node*, StringPiece::Hasher>
-      NameNodeMap;
+  typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameNodeMap;
 
   ::tensorflow::Status Create(const GraphDef& graph) override;
   ::tensorflow::Status Extend(const GraphDef& graph) override;
@@ -113,6 +112,7 @@ class DirectSession : public Session {
   // every partition.
   struct PerPartitionExecutorsAndLib {
     Graph* graph = nullptr;                  // not owned.
+    Device* device = nullptr;                // not owned.
     FunctionLibraryRuntime* flib = nullptr;  // not owned.
     std::unique_ptr<Executor> executor;
   };
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index ada29ff2878eb48ad0209571f14ecbc5f5a13e23..fe1cf1b12e0c62e560e5bcac0cf3c203ba091af8 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1188,7 +1188,7 @@ class ExecutorState {
   // QUESTION: Make it a checkpoint::TensorSliceReaderCacheWrapper
   // instead of a pointer?  (avoids having to delete).
   checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache_;
-  FunctionCallFrame* call_frame_;
+  CallFrameInterface* call_frame_;
   const ExecutorImpl* impl_;
   CancellationManager* cancellation_manager_;
   Executor::Args::Runner runner_;
@@ -1804,6 +1804,21 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
       LOG(WARNING) << this << " Compute status: " << s;
       DumpState();
     }
+    if (s.code() == error::RESOURCE_EXHAUSTED) {
+      if (stats_collector_) {
+        string err = stats_collector_->ReportAllocsOnResourceExhausted(
+            s.error_message());
+        s = Status(s.code(), strings::StrCat(s.error_message(), err));
+      } else {
+        s = Status(
+            s.code(),
+            strings::StrCat(
+                s.error_message(),
+                "\nHint: If you want to see a list of allocated tensors when "
+                "OOM happens, add report_tensor_allocations_upon_oom "
+                "to RunOptions for current allocation info.\n"));
+      }
+    }
     return s;
   }
 
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index e09dc4e34630fc0ab22615b7204bd0ec2d117d35..b5f4ebb00532670f06d1182088395f46c3481ed6 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -84,7 +84,7 @@ class Executor {
     int64 step_id = 0;
     Rendezvous* rendezvous = nullptr;
     StepStatsCollector* stats_collector = nullptr;
-    FunctionCallFrame* call_frame = nullptr;
+    CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
     TensorStore* tensor_store = nullptr;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 10356fc7890d1d0b8ce257bea28dbd6d9ddb6835..ee9988f0b7dd014b739ac8e70095551cf2aae5af 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -153,12 +153,20 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
   Status Instantiate(const string& function_name, AttrSlice attrs,
                      Handle* handle) override;
 
+  Status ReleaseHandle(Handle handle) override;
+
   const FunctionBody* GetFunctionBody(Handle handle) override;
 
   Status CreateKernel(const NodeDef& ndef, OpKernel** kernel) override;
 
   void Run(const Options& opts, Handle handle, gtl::ArraySlice<Tensor> args,
            std::vector<Tensor>* rets, DoneCallback done) override;
+  // NOTE(mrry): This overload is currently only implemented for local function
+  // execution.
+  // TODO(b/70346412): Implement support for remote function execution when
+  // passing a call frame.
+  void Run(const Options& opts, Handle handle, CallFrameInterface* frame,
+           DoneCallback done) override;
 
   bool IsStateful(const string& function) override;
 
@@ -190,18 +198,21 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
 
   mutable mutex mu_;
 
-  // func_graphs_ never shrinks or reorders its members.
-  std::vector<FunctionBody*> func_graphs_ GUARDED_BY(mu_);
+  int next_handle_ GUARDED_BY(mu_);
 
   // The instantiated and transformed function is encoded as a Graph
   // object, and an executor is created for the graph.
   struct Item : public core::RefCounted {
     const Graph* graph = nullptr;  // Owned by exec.
+    FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
 
-    ~Item() override { delete this->exec; }
+    ~Item() override {
+      delete this->func_graph;
+      delete this->exec;
+    }
   };
-  std::vector<Item*> items_;
+  std::unordered_map<Handle, Item*> items_ GUARDED_BY(mu_);
 
   ProcessFunctionLibraryRuntime* parent_ = nullptr;  // not owned.
 
@@ -236,6 +247,7 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
       device_name_(device_ == nullptr
                        ? ProcessFunctionLibraryRuntime::kDefaultFLRDevice
                        : device_->name()),
+      next_handle_(0),
       parent_(parent) {
   get_func_sig_ = [this](const string& op, const OpDef** sig) {
     return lib_def_->LookUpOpDef(op, sig);
@@ -246,9 +258,9 @@ FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
 }
 
 FunctionLibraryRuntimeImpl::~FunctionLibraryRuntimeImpl() {
-  for (FunctionBody* p : func_graphs_) delete p;
-  for (Item* item : items_)
-    if (item) item->Unref();
+  for (auto item : items_) {
+    if (item.second) item.second->Unref();
+  }
 }
 
 // An asynchronous op kernel which executes an instantiated function
@@ -309,9 +321,8 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) {
   }
 
   mutex_lock l(mu_);
-  CHECK_LE(0, local_handle);
-  CHECK_LT(local_handle, func_graphs_.size());
-  return func_graphs_[local_handle];
+  CHECK_EQ(1, items_.count(local_handle));
+  return items_[local_handle]->func_graph;
 }
 
 Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
@@ -337,7 +348,7 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
                                  kernel);
   }
 
-  // Try to instantiate this function for the func/attr. Maybe its
+  // Try to instantiate this function for the func/attr. Maybe it's
   // cached already.
   Handle handle;
   TF_RETURN_IF_ERROR(Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle));
@@ -411,7 +422,11 @@ bool FunctionLibraryRuntimeImpl::IsLocalTarget(const AttrSlice& attrs) {
   if (device_ == nullptr) return true;
   string target = ProcessFunctionLibraryRuntime::ObtainFunctionTarget(attrs);
   if (target.empty()) return true;
-  return target == device_->name();
+  Device* target_device;
+  if (!device_mgr_->LookupDevice(target, &target_device).ok()) {
+    return false;
+  }
+  return target_device == device_;
 }
 
 AttrValueMap FunctionLibraryRuntimeImpl::FixAttrs(const AttrSlice& attrs) {
@@ -474,14 +489,32 @@ Status FunctionLibraryRuntimeImpl::Instantiate(const string& function_name,
     if (*handle != kInvalidHandle) {
       delete fbody;
     } else {
-      *handle = parent_->AddHandle(key, device_name_, func_graphs_.size());
-      func_graphs_.push_back(fbody);
-      items_.resize(func_graphs_.size());
+      *handle = parent_->AddHandle(key, device_name_, next_handle_);
+      Item* item = new Item;
+      item->func_graph = fbody;
+      items_.insert({next_handle_, item});
+      next_handle_++;
     }
   }
   return Status::OK();
 }
 
+Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+    return parent_->ReleaseHandle(handle);
+  }
+
+  LocalHandle h = parent_->GetHandleOnDevice(device_name_, handle);
+  mutex_lock l(mu_);
+  CHECK_EQ(1, items_.count(h));
+  Item* item = items_[h];
+  if (item->Unref()) {
+    items_.erase(h);
+    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+  }
+  return Status::OK();
+}
+
 void DumpGraph(StringPiece label, const Graph* g) {
   // TODO(zhifengc): Change Graph to record #nodes.
   VLOG(1) << "Graph " << label << " #nodes " << g->num_nodes() << " #edges "
@@ -525,9 +558,16 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   Executor* exec;
   TF_RETURN_IF_ERROR(NewLocalExecutor(params, g.release(), &exec));
 
-  *item = new Item;
-  (*item)->graph = graph;
-  (*item)->exec = exec;
+  {
+    // Guard item since it is already inserted in items_.
+    mutex_lock l(mu_);
+    if ((*item)->exec) {
+      delete exec;
+    } else {
+      (*item)->graph = graph;
+      (*item)->exec = exec;
+    }
+  }
   return Status::OK();
 }
 
@@ -535,29 +575,18 @@ Status FunctionLibraryRuntimeImpl::GetOrCreateItem(Handle handle, Item** item) {
   LocalHandle local_handle = parent_->GetHandleOnDevice(device_name_, handle);
   {
     mutex_lock l(mu_);
-    if (local_handle >= items_.size()) {
+    if (items_.count(local_handle) == 0) {
       return errors::NotFound("Function handle ", handle,
                               " is not valid. Likely an internal error.");
     }
     *item = items_[local_handle];
-    if (*item != nullptr) {
-      (*item)->Ref();
+    if ((*item)->exec != nullptr) {
       return Status::OK();
     }
   }
   // NOTE: We need to call CreateItem out of mu_ because creating an
   // executor needs to call CreateKernel.
-  TF_RETURN_IF_ERROR(CreateItem(handle, item));
-
-  {
-    mutex_lock l(mu_);
-    if (items_[local_handle] == nullptr) {
-      // Install *item in items_.
-      items_[local_handle] = *item;
-      (*item)->Ref();
-    }
-  }
-  return Status::OK();
+  return CreateItem(handle, item);
 }
 
 void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
@@ -565,14 +594,13 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
                                            std::vector<Tensor>* rets,
                                            Executor::Args* exec_args,
                                            Item* item, DoneCallback done) {
-  FunctionCallFrame* frame = exec_args->call_frame;
+  DCHECK(exec_args->call_frame == nullptr);
   string target_device = parent_->GetDeviceName(handle);
   string source_device = opts.source_device;
   Rendezvous* rendezvous = opts.rendezvous;
   DeviceContext* device_context;
   Status s = parent_->GetDeviceContext(target_device, &device_context);
   if (!s.ok()) {
-    delete frame;
     delete exec_args;
     done(s);
     return;
@@ -580,6 +608,16 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
   int64 src_incarnation, target_incarnation;
   s = parent_->GetDeviceIncarnation(source_device, &src_incarnation);
   s.Update(parent_->GetDeviceIncarnation(target_device, &target_incarnation));
+  if (!s.ok()) {
+    delete exec_args;
+    done(s);
+    return;
+  }
+
+  const FunctionBody* fbody = GetFunctionBody(handle);
+  FunctionCallFrame* frame =
+      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
+  exec_args->call_frame = frame;
   if (!s.ok()) {
     delete frame;
     delete exec_args;
@@ -613,7 +651,6 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
             *exec_args, [item, frame, rets, done, source_device, target_device,
                          target_incarnation, rendezvous, device_context,
                          remote_args, exec_args](const Status& status) {
-              item->Unref();
               Status s = status;
               if (s.ok()) {
                 s = frame->ConsumeRetvals(rets);
@@ -657,17 +694,7 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     parent_->Run(run_opts, handle, args, rets, done);
     return;
   }
-  const FunctionBody* fbody = GetFunctionBody(handle);
-  FunctionCallFrame* frame =
-      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
 
-  Item* item = nullptr;
-  Status s = GetOrCreateItem(handle, &item);
-  if (!s.ok()) {
-    delete frame;
-    done(s);
-    return;
-  }
   DCHECK(run_opts.runner != nullptr);
 
   Executor::Args* exec_args = new Executor::Args;
@@ -675,16 +702,28 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
   exec_args->step_id = run_opts.step_id;
   exec_args->rendezvous = run_opts.rendezvous;
   exec_args->stats_collector = run_opts.stats_collector;
-  exec_args->call_frame = frame;
   exec_args->cancellation_manager = run_opts.cancellation_manager;
   exec_args->step_container = run_opts.step_container;
   exec_args->runner = *run_opts.runner;
 
+  Item* item = nullptr;
+  Status s = GetOrCreateItem(handle, &item);
+  if (!s.ok()) {
+    delete exec_args;
+    done(s);
+    return;
+  }
+
   if (run_opts.remote_execution) {
+    // NOTE(mrry): `RunRemote()` will set `exec_args->call_frame` for us.
     RunRemote(run_opts, handle, args, rets, exec_args, item, done);
     return;
   }
 
+  const FunctionBody* fbody = GetFunctionBody(handle);
+  FunctionCallFrame* frame =
+      new FunctionCallFrame(fbody->arg_types, fbody->ret_types);
+  exec_args->call_frame = frame;
   s = frame->SetArgs(args);
   if (!s.ok()) {
     delete frame;
@@ -692,12 +731,12 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     done(s);
     return;
   }
+
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
       [item, frame, rets, done, exec_args](const Status& status) {
-        item->Unref();
         Status s = status;
         if (s.ok()) {
           s = frame->ConsumeRetvals(rets);
@@ -708,6 +747,66 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
       });
 }
 
+void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
+                                     CallFrameInterface* frame,
+                                     DoneCallback done) {
+  if (opts.cancellation_manager && opts.cancellation_manager->IsCancelled()) {
+    done(errors::Cancelled(""));
+    return;
+  }
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle) ||
+      opts.remote_execution) {
+    done(errors::Unimplemented("Remote calling with CallFrameInterface"));
+    return;
+  }
+
+  Options run_opts = opts;
+  if (opts.create_rendezvous) {
+    Rendezvous* rendezvous = new IntraProcessRendezvous(device_mgr_);
+    run_opts.rendezvous = rendezvous;
+    run_opts.create_rendezvous = false;
+    done = std::bind(
+        [rendezvous](DoneCallback done,
+                     // Begin unbound arguments.
+                     const Status& status) {
+          rendezvous->Unref();
+          done(status);
+        },
+        std::move(done), std::placeholders::_1);
+  }
+
+  Item* item = nullptr;
+  Status s = GetOrCreateItem(handle, &item);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  DCHECK(run_opts.runner != nullptr);
+
+  Executor::Args* exec_args = new Executor::Args;
+  // Inherit the step_id from the caller.
+  exec_args->step_id = run_opts.step_id;
+  exec_args->rendezvous = run_opts.rendezvous;
+  exec_args->stats_collector = run_opts.stats_collector;
+  exec_args->cancellation_manager = run_opts.cancellation_manager;
+  exec_args->step_container = run_opts.step_container;
+  exec_args->runner = *run_opts.runner;
+  exec_args->call_frame = frame;
+
+  item->exec->RunAsync(
+      // Executor args
+      *exec_args,
+      // Done callback.
+      std::bind(
+          [item, frame, exec_args](DoneCallback done,
+                                   // Start unbound arguments.
+                                   const Status& status) {
+            delete exec_args;
+            done(status);
+          },
+          std::move(done), std::placeholders::_1));
+}
+
 bool FunctionLibraryRuntimeImpl::IsStateful(const string& func) {
   const OpDef* op_def;
   const Status s = lib_def_->LookUpOpDef(func, &op_def);
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index b77a8f50c40f9a595ecab1bb9129a6b9395aac38..52bfb9e0ed42ca7a634b1b0233b61775a3ce7387 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -207,7 +207,83 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
       return status;
     }
     FunctionLibraryRuntime::Options opts;
-    return Run(flr, handle, opts, args, std::move(rets));
+    status = Run(flr, handle, opts, args, rets);
+    if (!status.ok()) return status;
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        StringPiece(status2.error_message()).contains("remote execution."));
+
+    return status;
+  }
+
+  Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
+             FunctionLibraryRuntime::Options opts, CallFrameInterface* frame) {
+    std::atomic<int32> call_count(0);
+    std::function<void(std::function<void()>)> runner =
+        [&call_count](std::function<void()> fn) {
+          ++call_count;
+          test::function::FunctionTestSchedClosure(fn);
+        };
+
+    Notification done;
+    opts.runner = &runner;
+    std::vector<Tensor> out;
+    Status status;
+    flr->Run(opts, handle, frame, [&status, &done](const Status& s) {
+      status = s;
+      done.Notify();
+    });
+    done.WaitForNotification();
+    if (!status.ok()) {
+      return status;
+    }
+
+    EXPECT_GE(call_count, 1);  // Test runner is used.
+
+    return Status::OK();
+  }
+
+  Status InstantiateAndRunViaCallFrameInterface(FunctionLibraryRuntime* flr,
+                                                const string& name,
+                                                test::function::Attrs attrs,
+                                                const std::vector<Tensor>& args,
+                                                std::vector<Tensor*> rets) {
+    FunctionLibraryRuntime::Handle handle;
+    Status status = flr->Instantiate(name, attrs, &handle);
+    if (!status.ok()) {
+      return status;
+    }
+    const FunctionBody* fbody = flr->GetFunctionBody(handle);
+    FunctionCallFrame frame(fbody->arg_types, fbody->ret_types);
+    TF_RETURN_IF_ERROR(frame.SetArgs(args));
+
+    FunctionLibraryRuntime::Options opts;
+    status = Run(flr, handle, opts, &frame);
+    if (!status.ok()) return status;
+
+    std::vector<Tensor> retvals;
+    TF_RETURN_IF_ERROR(frame.GetRetvals(&retvals));
+    CHECK_EQ(rets.size(), retvals.size());
+    for (size_t i = 0; i < rets.size(); ++i) {
+      *rets[i] = retvals[i];
+    }
+
+    // Release the handle and try running again. It should not succeed.
+    status = flr->ReleaseHandle(handle);
+    if (!status.ok()) return status;
+
+    Status status2 = Run(flr, handle, opts, args, std::move(rets));
+    EXPECT_TRUE(errors::IsInvalidArgument(status2));
+    EXPECT_TRUE(
+        StringPiece(status2.error_message()).contains("remote execution."));
+
+    return status;
   }
 
   std::unique_ptr<Graph> GetFuncBody(FunctionLibraryRuntime* flr,
@@ -268,6 +344,9 @@ TEST_F(FunctionLibraryRuntimeTest, XTimesTwo) {
   TF_CHECK_OK(
       InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y}));
   test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  TF_CHECK_OK(InstantiateAndRunViaCallFrameInterface(
+      flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, {x}, {&y}));
+  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
 }
 
 TEST_F(FunctionLibraryRuntimeTest, XTimesN) {
@@ -498,7 +577,7 @@ TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto x4_x2_scale = ops::Const<float>(
-        s.WithOpName("x4/x2/scale/_12__cf__2")
+        s.WithOpName("x4/x2/scale/_12__cf__4")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
@@ -694,13 +773,13 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
     auto scale = ops::Const(
-        s.WithOpName("scale/_5__cf__6")
+        s.WithOpName("scale/_5__cf__8")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
     auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
     auto const0 = ops::Const(
-        s.WithOpName("Func/_1/sy/_6__cf__7")
+        s.WithOpName("Func/_1/sy/_6__cf__9")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         0, {0});
     auto func1_rx = ops::internal::BroadcastGradientArgs(
@@ -939,9 +1018,8 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
 TEST_F(FunctionLibraryRuntimeTest, CrossDevice) {
   Init({test::function::FindDevice()});
   FunctionLibraryRuntime::Handle handle;
-  TF_CHECK_OK(Instantiate(
-      flr0_, "FindDevice",
-      {{"_target", "/job:localhost/replica:0/task:0/cpu:1"}}, &handle));
+  TF_CHECK_OK(Instantiate(flr0_, "FindDevice", {{"_target", "/device:CPU:1"}},
+                          &handle));
 
   Tensor y;
   FunctionLibraryRuntime::Options opts;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 12d44cc6b7d0b724b5fe1c427b31e455eeca07fe..566497783352e7f7af5b941f3d3fcfdaffb18704 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -60,6 +60,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 
 namespace tensorflow {
@@ -305,6 +306,46 @@ Status BaseGPUDevice::Init(const SessionOptions& options) {
   gpu_device_info_->gpu_id = gpu_id_;
   set_tensorflow_gpu_device_info(gpu_device_info_);
 
+  // Whether and how the GPU device uses its own threadpool.
+  // This option is experimental. Once we confirm the best setting, we
+  // may change the default behavior and completely remove this flag.
+  // Default values might change in future releases.
+  // Possible values:
+  //   * global: GPU uses threads shared with CPU in the main compute
+  //          thread-pool. This is currently the default.
+  //   * gpu_private: GPU uses threads dedicated to this device.
+  //   * gpu_shared: All GPUs share a dedicated thread pool.
+  string gpu_thread_mode;
+  TF_RETURN_IF_ERROR(
+      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
+  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
+  if (gpu_thread_mode != "global") {
+    int64 gpu_thread_count = -1;
+    // Default to two threads. One for device compute and another for memory
+    // copies.
+    TF_RETURN_IF_ERROR(
+        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
+    if (gpu_thread_mode == "gpu_private") {
+      // TODO(zhengxq): since these threads only serve a single GPU device,
+      //   we should set the device context once for each thread, and avoid
+      //   setting them for each kernel.
+      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
+      thread_pool_.reset(new thread::ThreadPool(
+          options.env, strings::StrCat("gpu_private_", gpu_id_),
+          static_cast<int32>(gpu_thread_count)));
+      set_tensorflow_device_thread_pool(thread_pool_.get());
+    } else if (gpu_thread_mode == "gpu_shared") {
+      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
+          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
+      set_tensorflow_device_thread_pool(thread_pool);
+    } else {
+      string error_message =
+          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
+      LOG(WARNING) << error_message;
+      return errors::InvalidArgument(error_message);
+    }
+  }
+
   return Status::OK();
 }
 
@@ -539,16 +580,9 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
   }
 
   if (parsed.dtype() == DT_VARIANT) {
-    if (parsed.shape().dims() != 0) {
-      // TODO(b/67311047): Expand support to non-singleton variants?
-      return errors::Unimplemented(
-          "GPUDevice::MakeTensorFromProto: Only singleton Variants are "
-          "supported. Tensor has shape: ",
-          parsed.shape().DebugString());
-    }
-    const Variant& from = parsed.scalar<Variant>()();
-    Tensor copy(cpu_allocator(), DT_VARIANT, TensorShape({}));
-    Variant* copy_variant = &(copy.scalar<Variant>()());
+    const Variant* from = parsed.flat<Variant>().data();
+    Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
+    Variant* copy_variant = copy.flat<Variant>().data();
 
     std::list<Notification> notifications;
     Status copy_status;
@@ -566,12 +600,20 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                     n.Notify();
                                   });
     };
-    TF_RETURN_IF_ERROR(
-        VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE, from,
-                          copy_variant, std::move(copier)));
+    Status s;
+    for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
+      s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
+                            from[ix], &copy_variant[ix], copier);
+      if (!s.ok()) {
+        break;
+      }
+    }
     for (auto& n : notifications) {
       n.WaitForNotification();
     }
+    if (!s.ok()) {
+      return s;
+    }
     *tensor = std::move(copy);
     return copy_status;
   } else {
@@ -652,6 +694,36 @@ Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
   if (static_cast<size_t>(n) > valid_gpu_ids.size()) {
     n = valid_gpu_ids.size();
   }
+  if (!valid_gpu_ids.empty()) {
+    // Save the original device.
+    int original_device = 0;
+    cudaError_t err = cudaGetDevice(&original_device);
+    if (err != cudaSuccess) {
+      return errors::Internal("cudaGetDevice() failed. Status: ",
+                              cudaGetErrorString(err));
+    }
+    // Force to implicitly initialize CUDA runtime on each valid GPU before
+    // CreateGPUDevice().
+    for (int gpu_id : valid_gpu_ids) {
+      err = cudaSetDevice(gpu_id);
+      if (err != cudaSuccess) {
+        return errors::Internal("cudaSetDevice() on GPU:", gpu_id,
+                                " failed. Status: ", cudaGetErrorString(err));
+      }
+      err = cudaFree(nullptr);
+      if (err != cudaSuccess) {
+        return errors::Internal(
+            "CUDA runtime implicit initialization on GPU:", gpu_id,
+            " failed. Status: ", cudaGetErrorString(err));
+      }
+    }
+    // Reset to the original device.
+    err = cudaSetDevice(original_device);
+    if (err != cudaSuccess) {
+      return errors::Internal("cudaSetDevice() on GPU:", original_device,
+                              " failed. Status: ", cudaGetErrorString(err));
+    }
+  }
   for (int i = 0; i < n; i++) {
     BaseGPUDevice* gpu_device;
     TF_RETURN_IF_ERROR(CreateGPUDevice(
@@ -918,7 +990,7 @@ Status EnablePeerAccess(gpu::Platform* platform,
         if (!status.ok()) {
           LOG(WARNING)
               << "Unable to enable peer access between device ordinals "
-              << i_gpu_id << " and " << j_gpu_id;
+              << i_gpu_id << " and " << j_gpu_id << ", status: " << status;
         } else {
           ++enabled_peer_count;
         }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 442496437af5f4796f6d216f7c688d31f2f457d7..4585d5b04dd836135961baef4882f43d7e3a07f1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -116,6 +116,7 @@ class BaseGPUDevice : public LocalDevice {
   const bool sync_every_op_ = false;
   const int32 max_streams_;
   std::unique_ptr<EventMgr> em_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
 
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75be6d60b86af101fb9de7497490e72c523d632b
--- /dev/null
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/gpu/gpu_device.h"
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(GPUDeviceOnNonGPUMachineTest, CreateGPUDevicesOnNonGPUMachine) {
+  SessionOptions opts;
+  std::vector<tensorflow::Device*> devices;
+  TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
+      opts, "/job:localhost/replica:0/task:0", &devices));
+  EXPECT_TRUE(devices.empty());
+}
+
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+int main(int argc, char** argv) {
+#if GOOGLE_CUDA
+  // Sets CUDA_VISIBLE_DEVICES to empty string to simulate non-gpu environment.
+  setenv("CUDA_VISIBLE_DEVICES", "", 1);
+#endif  // GOOGLE_CUDA
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 4bd40c79784d078f36b9c9e03b123e95681447b6..3b309e915cdd2c6d5eead9ed0312f3873bcf7335 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -348,10 +348,6 @@ Status GraphExecutionState::OptimizeGraph(
         cpu_device = device;
       }
     }
-    if (cpu_device == nullptr) {
-      return errors::Internal(
-          "Unable to find CPU device needed for constant folding");
-    }
     grappler::VirtualCluster cluster(device_map);
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 53e80b1ee302761c04df1ec9d242d9edd2a1f510..63b74e8dbf1ac6482579e96fba32c952e0fe561e 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -81,7 +81,7 @@ class MklCPUAllocator : public Allocator {
       }
 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
       if (user_val > max_mem_bytes) {
-        LOG(WARNING) << "The user specifed a memory limit " << kMaxLimitStr
+        LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
                      << "=" << user_val
                      << " greater than available physical memory: "
                      << max_mem_bytes
diff --git a/tensorflow/core/common_runtime/pending_counts.h b/tensorflow/core/common_runtime/pending_counts.h
index 9e39b6b7b93a8e35ad3b47c1c637f7d906649823..5707f5259228c0e54d6d858652a8c50986c0c49b 100644
--- a/tensorflow/core/common_runtime/pending_counts.h
+++ b/tensorflow/core/common_runtime/pending_counts.h
@@ -44,7 +44,7 @@ namespace tensorflow {
 
 //    PendingCounts counts(layout);
 //    ...
-//    counts.decrement_panding(h[id], 1);
+//    counts.decrement_pending(h[id], 1);
 class PendingCounts {
  public:
   // The state machine for a node's execution.
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 73fdf60fd5f1669c5c4e0d0c64b37d983c7601fd..54f082e823d463301fc5f437781d01ce96741568 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -129,7 +129,7 @@ class ColocationGraph {
     // 'string' values stored in NodeDef attribute lists, as well as StringPiece
     // values that refer to 'string' values from NodeDef::name(), without
     // performing any string allocations.
-    std::unordered_map<StringPiece, const Node*, StringPiece::Hasher>
+    std::unordered_map<StringPiece, const Node*, StringPieceHasher>
         colocation_group_root;
 
     for (Node* node : graph_->nodes()) {
@@ -171,7 +171,7 @@ class ColocationGraph {
   }
 
   Status ColocateNodeToGroup(
-      std::unordered_map<StringPiece, const Node*, StringPiece::Hasher>*
+      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
           colocation_group_root,
       Node* node, StringPiece colocation_group) {
     const Node*& root_node = (*colocation_group_root)[colocation_group];
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index c4114ff8739f15f0993f9164e2046c94a3c586bc..53a14121d478edccbcacc12916de2ee2e12602b5 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -30,15 +30,18 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const FunctionLibraryDefinition* lib_def,
     const OptimizerOptions& optimizer_options,
     DistributedFunctionLibraryRuntime* parent)
-    : lib_def_(lib_def), parent_(parent) {
+    : device_mgr_(device_mgr),
+      lib_def_(lib_def),
+      next_handle_(0),
+      parent_(parent) {
   if (device_mgr == nullptr) {
-    flr_map_[kDefaultFLRDevice] =
+    flr_map_[nullptr] =
         NewFunctionLibraryRuntime(nullptr, env, nullptr, graph_def_version,
                                   lib_def, optimizer_options, this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
-    flr_map_[d->name()] =
+    flr_map_[d] =
         NewFunctionLibraryRuntime(device_mgr, env, d, graph_def_version,
                                   lib_def, optimizer_options, this);
   }
@@ -50,15 +53,18 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
     const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     DistributedFunctionLibraryRuntime* parent)
-    : lib_def_(lib_def), parent_(parent) {
+    : device_mgr_(device_mgr),
+      lib_def_(lib_def),
+      next_handle_(0),
+      parent_(parent) {
   if (device_mgr == nullptr) {
-    flr_map_[kDefaultFLRDevice] = NewFunctionLibraryRuntime(
+    flr_map_[nullptr] = NewFunctionLibraryRuntime(
         nullptr, env, nullptr, graph_def_version, lib_def, optimizer_options,
         std::move(custom_kernel_creator), this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
-    flr_map_[d->name()] = NewFunctionLibraryRuntime(
+    flr_map_[d] = NewFunctionLibraryRuntime(
         device_mgr, env, d, graph_def_version, lib_def, optimizer_options,
         custom_kernel_creator, this);
   }
@@ -163,17 +169,19 @@ Status ProcessFunctionLibraryRuntime::GetDeviceContext(
 
 FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR(
     const string& device_name) {
-  string clean_device_name;
+  Device* device = nullptr;
   if (device_name != kDefaultFLRDevice) {
-    clean_device_name = DeviceNameUtils::CanonicalizeDeviceName(device_name);
-  } else {
-    clean_device_name = device_name;
+    if (!device_mgr_->LookupDevice(device_name, &device).ok()) {
+      LOG(ERROR) << "Could not find device: " << device_name;
+      return nullptr;
+    }
   }
-  if (flr_map_.find(clean_device_name) == flr_map_.end()) {
+  const auto& iter = flr_map_.find(device);
+  if (iter == flr_map_.end()) {
     LOG(ERROR) << "Could not find device: " << device_name;
     return nullptr;
   }
-  return flr_map_[clean_device_name].get();
+  return iter->second.get();
 }
 
 FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
@@ -183,30 +191,38 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
   FunctionLibraryRuntime::Handle h =
       gtl::FindWithDefault(table_, function_key, kInvalidHandle);
   if (h != kInvalidHandle) {
-    return h;
+    if (function_data_.count(h) != 0) return h;
   }
-  h = function_data_.size();
-  function_data_.emplace_back(device_name, local_handle);
+  h = next_handle_;
+  function_data_.insert({h, FunctionData(device_name, local_handle)});
   table_[function_key] = h;
+  next_handle_++;
   return h;
 }
 
 FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
     const string& function_key) const {
   mutex_lock l(mu_);
-  return gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  FunctionLibraryRuntime::Handle h =
+      gtl::FindWithDefault(table_, function_key, kInvalidHandle);
+  if (h != kInvalidHandle) {
+    if (function_data_.count(h) == 0) return kInvalidHandle;
+  }
+  return h;
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
     const string& device_name, FunctionLibraryRuntime::Handle handle) {
-  return GetHandleOnDevice(device_name, handle) != -1;
+  return GetHandleOnDevice(device_name, handle) != kInvalidHandle;
 }
 
 FunctionLibraryRuntime::LocalHandle
 ProcessFunctionLibraryRuntime::GetHandleOnDevice(
     const string& device_name, FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
-  CHECK_LE(handle, function_data_.size());
+  if (function_data_.count(handle) == 0) {
+    return kInvalidLocalHandle;
+  }
   const FunctionData& function_data = function_data_[handle];
   if (function_data.target_device != device_name) {
     return kInvalidLocalHandle;
@@ -217,7 +233,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
 string ProcessFunctionLibraryRuntime::GetDeviceName(
     FunctionLibraryRuntime::Handle handle) {
   mutex_lock l(mu_);
-  CHECK_LE(handle, function_data_.size());
+  CHECK_EQ(1, function_data_.count(handle));
   const FunctionData& function_data = function_data_[handle];
   return function_data.target_device;
 }
@@ -243,6 +259,29 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::RemoveHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  mutex_lock l(mu_);
+  function_data_.erase(handle);
+  return Status::OK();
+}
+
+Status ProcessFunctionLibraryRuntime::ReleaseHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  FunctionLibraryRuntime* flr = nullptr;
+  string target_device;
+  {
+    mutex_lock l(mu_);
+    CHECK_EQ(1, function_data_.count(handle));
+    target_device = function_data_[handle].target_device;
+  }
+  flr = GetFLR(target_device);
+  if (flr != nullptr) {
+    return flr->ReleaseHandle(handle);
+  }
+  return errors::InvalidArgument("Handle not found: ", handle);
+}
+
 void ProcessFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
@@ -259,7 +298,10 @@ void ProcessFunctionLibraryRuntime::Run(
   FunctionLibraryRuntime::LocalHandle local_handle;
   {
     mutex_lock l(mu_);
-    CHECK_LE(handle, function_data_.size());
+    if (function_data_.count(handle) == 0) {
+      done(errors::NotFound("Handle: ", handle, " not found."));
+      return;
+    }
     target_device = function_data_[handle].target_device;
     local_handle = function_data_[handle].local_handle;
   }
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 85717739d0c61006995f1961b3285c53ee0ef57f..3aa7b87286f4875740738b573e8f454cc1331a20 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -123,6 +123,12 @@ class ProcessFunctionLibraryRuntime {
   Status Instantiate(const string& function_name, AttrSlice attrs,
                      FunctionLibraryRuntime::Handle* handle);
 
+  // Delegates to the local FLR that owns state corresponding to `handle` and
+  // tells it to release it. If the `handle` isnt' needed at all, the local FLR
+  // might call RemoveHandle on this to get rid of the state owned by the Proc
+  // FLR.
+  Status ReleaseHandle(FunctionLibraryRuntime::Handle handle);
+
   // Runs the function with given `handle`. Function could have been
   // instantiated on any device. More details in framework/function.h
   void Run(const FunctionLibraryRuntime::Options& opts,
@@ -140,6 +146,9 @@ class ProcessFunctionLibraryRuntime {
   // of the device where the function is registered.
   string GetDeviceName(FunctionLibraryRuntime::Handle handle);
 
+  // Removes handle from the state owned by this object.
+  Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
+
   friend class FunctionLibraryRuntimeImpl;
 
   mutable mutex mu_;
@@ -151,14 +160,18 @@ class ProcessFunctionLibraryRuntime {
     FunctionData(const string& target_device,
                  FunctionLibraryRuntime::LocalHandle local_handle)
         : target_device(target_device), local_handle(local_handle) {}
+    FunctionData() : FunctionData("", -1) {}
   };
 
+  const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
   // Holds all the function invocations here.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
-  std::vector<FunctionData> function_data_ GUARDED_BY(mu_);
-  std::unordered_map<string, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
+  std::unordered_map<FunctionLibraryRuntime::Handle, FunctionData>
+      function_data_ GUARDED_BY(mu_);
+  std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
+  int next_handle_ GUARDED_BY(mu_);
   DistributedFunctionLibraryRuntime* const parent_;
 };
 
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index cb416603be20a65027102b78e5e0be922c12c7d3..270e46dfe901a985629b452a2747fa654cb4135d 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -82,6 +82,22 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
 
     EXPECT_GE(call_count, 1);  // Test runner is used.
 
+    // Release the handle and then try running the function. It shouldn't
+    // succeed.
+    status = proc_flr_->ReleaseHandle(handle);
+    if (!status.ok()) {
+      return status;
+    }
+    Notification done2;
+    proc_flr_->Run(opts, handle, args, &out,
+                   [&status, &done2](const Status& s) {
+                     status = s;
+                     done2.Notify();
+                   });
+    done2.WaitForNotification();
+    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
+
     return Status::OK();
   }
 
@@ -92,12 +108,32 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
   IntraProcessRendezvous* rendezvous_;
 };
 
+TEST_F(ProcessFunctionLibraryRuntimeTest, GetFLRNull) {
+  FunctionDefLibrary proto;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def(
+      new FunctionLibraryDefinition(OpRegistry::Global(), proto));
+  OptimizerOptions opts;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr(
+      new ProcessFunctionLibraryRuntime(
+          nullptr /* device_mgr */, Env::Default(), TF_GRAPH_DEF_VERSION,
+          lib_def.get(), opts, nullptr /* cluster_flr */));
+  FunctionLibraryRuntime* flr =
+      proc_flr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+  EXPECT_NE(flr, nullptr);
+}
+
 TEST_F(ProcessFunctionLibraryRuntimeTest, Basic) {
   Init({});
   FunctionLibraryRuntime* flr =
       proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:0");
   EXPECT_NE(flr, nullptr);
   EXPECT_EQ(flr->device(), devices_[0]);
+  flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/device:CPU:0");
+  EXPECT_NE(flr, nullptr);
+  EXPECT_EQ(flr->device(), devices_[0]);
+  flr = proc_flr_->GetFLR("/device:CPU:0");
+  EXPECT_NE(flr, nullptr);
+  EXPECT_EQ(flr->device(), devices_[0]);
   flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:1");
   EXPECT_NE(flr, nullptr);
   EXPECT_EQ(flr->device(), devices_[1]);
@@ -213,13 +249,11 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultipleCallsDiffDeviceFindDevice) {
   opts.rendezvous = rendezvous_;
   opts.remote_execution = true;
   Tensor y;
-  TF_CHECK_OK(Run("FindDevice", opts,
-                  {{"_target", "/job:a/replica:0/task:0/cpu:0"}}, {}, {&y}));
+  TF_CHECK_OK(Run("FindDevice", opts, {{"_target", "/cpu:0"}}, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:0"},
                                 TensorShape({})));
-  TF_CHECK_OK(Run("FindDevice", opts,
-                  {{"_target", "/job:a/replica:0/task:0/cpu:1"}}, {}, {&y}));
+  TF_CHECK_OK(Run("FindDevice", opts, {{"_target", "/cpu:1"}}, {}, {&y}));
   test::ExpectTensorEqual<string>(
       y, test::AsTensor<string>({"/job:a/replica:0/task:0/device:CPU:1"},
                                 TensorShape({})));
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
index fa9713735edd05c36e1787be0e8c89e69c043fb2..56766a8df4526cb2d6fb20c5dcd461a65d2a994b 100644
--- a/tensorflow/core/common_runtime/renamed_device.cc
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -21,7 +21,8 @@ namespace tensorflow {
 /* static */
 Device* RenamedDevice::NewRenamedDevice(const string& new_base,
                                         Device* underlying,
-                                        bool owns_underlying) {
+                                        bool owns_underlying,
+                                        bool isolate_session_state) {
   DeviceNameUtils::ParsedName parsed_name;
   CHECK(DeviceNameUtils::ParseFullName(new_base, &parsed_name));
   DeviceNameUtils::ParsedName underlying_parsed_name =
@@ -35,15 +36,17 @@ Device* RenamedDevice::NewRenamedDevice(const string& new_base,
                                           parsed_name.id);
   DeviceAttributes attributes(underlying->attributes());
   attributes.set_name(name);
-  return new RenamedDevice(underlying, attributes, owns_underlying);
+  return new RenamedDevice(underlying, attributes, owns_underlying,
+                           isolate_session_state);
 }
 
 RenamedDevice::RenamedDevice(Device* underlying,
                              const DeviceAttributes& attributes,
-                             bool owns_underlying)
+                             bool owns_underlying, bool isolate_session_state)
     : Device(underlying->env(), attributes),
       underlying_(underlying),
-      owns_underlying_(owns_underlying) {}
+      owns_underlying_(owns_underlying),
+      isolate_session_state_(isolate_session_state) {}
 
 RenamedDevice::~RenamedDevice() {
   if (owns_underlying_) {
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index 22a70fbdfaea3d77440e777ac5261af8c3aeb551..c5c204d4faff8c5016cc0a48fec266b06409b668 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -29,7 +29,9 @@ namespace tensorflow {
 class RenamedDevice : public Device {
  public:
   static Device* NewRenamedDevice(const string& new_base, Device* underlying,
-                                  bool owns_underlying);
+                                  bool owns_underlying,
+                                  bool isolate_session_state);
+
   ~RenamedDevice() override;
 
   // Below are virtual methods defined on DeviceBase
@@ -104,9 +106,8 @@ class RenamedDevice : public Device {
 
   Status Sync() override { return underlying_->Sync(); }
 
-  Status MaybeRewriteGraph(const FunctionDefLibrary& library,
-                           std::unique_ptr<Graph>* graph) override {
-    return underlying_->MaybeRewriteGraph(library, graph);
+  Status MaybeRewriteGraph(std::unique_ptr<Graph>* graph) override {
+    return underlying_->MaybeRewriteGraph(graph);
   }
 
   Status FillContextMap(const Graph* graph,
@@ -114,11 +115,21 @@ class RenamedDevice : public Device {
     return underlying_->FillContextMap(graph, device_context_map);
   }
 
+  // Returns the resource manager associated w/ this device.
+  ResourceMgr* resource_manager() override {
+    if (isolate_session_state_) {
+      return Device::resource_manager();
+    } else {
+      return underlying_->resource_manager();
+    }
+  }
+
  private:
   RenamedDevice(Device* underlying, const DeviceAttributes& attributes,
-                bool owns_underlying);
+                bool owns_underlying, bool isolate_session_state);
   Device* const underlying_;
   const bool owns_underlying_;
+  const bool isolate_session_state_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/rendezvous_util.cc b/tensorflow/core/common_runtime/rendezvous_util.cc
index a1e31016c2bc93aeae76175320255e0d43602265..92dc03812e9941e07500a9dc26baa7c1227430dc 100644
--- a/tensorflow/core/common_runtime/rendezvous_util.cc
+++ b/tensorflow/core/common_runtime/rendezvous_util.cc
@@ -32,6 +32,10 @@ Status SendTensorsToRendezvous(
         "; alloc_attrs.size() = ", alloc_attrs.size());
   }
 
+  if (!rendezvous) {
+    return errors::InvalidArgument("Rendezvous is null.");
+  }
+
   Rendezvous::ParsedKey parsed;
   for (int i = 0; i < keys.size(); ++i) {
     Rendezvous::Args rendez_args;
diff --git a/tensorflow/core/common_runtime/session_factory.cc b/tensorflow/core/common_runtime/session_factory.cc
index dba7a9253e9cc8837a1a471dab621475b1405a49..0234d4c37250d8ed3c645759dd17f94093e57df0 100644
--- a/tensorflow/core/common_runtime/session_factory.cc
+++ b/tensorflow/core/common_runtime/session_factory.cc
@@ -29,7 +29,7 @@ namespace tensorflow {
 namespace {
 
 static mutex* get_session_factory_lock() {
-  static mutex session_factory_lock;
+  static mutex session_factory_lock(LINKER_INITIALIZED);
   return &session_factory_lock;
 }
 
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 1ed5eb3f228674054ecf9bb11505913f6549e460..3ae52f414faf5c47531d6e64fd8666906ce0159a 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -127,82 +127,84 @@ Status InferShapesForFunctionSubNode(const Node* node, ShapeRefiner* refiner,
 //
 // NOTE: Recursive user-defined functions are not supported.
 // Maybe we won't support recursive functions at all in TF, because of
-// other maintanabilty issues.
+// other maintainability issues.
 Status ShapeRefiner::InferShapesForFunction(
-    const tensorflow::FunctionLibraryDefinition& function_library,
-    const tensorflow::FunctionDef& function_def, bool keep_nested_shapes,
+    const tensorflow::FunctionDef* function_def, bool keep_nested_shapes,
     ExtendedInferenceContext* outer_context) {
-  InstantiationResult result;
-  TF_RETURN_IF_ERROR(InstantiateFunction(
-      function_def, outer_context->get_context()->attrs(),
-      [&function_library](const string& op, const OpDef** sig) {
-        return function_library.LookUpOpDef(op, sig);
-      },
-      &result));
-
-  Graph graph(&function_library);
-  {
+  const Graph* graph;
+  auto it = functions_.find(function_def);
+  if (it != functions_.end()) {
+    graph = it->second.get();
+  } else {
+    InstantiationResult result;
+    TF_RETURN_IF_ERROR(InstantiateFunction(
+        *function_def, outer_context->get_context()->attrs(),
+        [this](const string& op, const OpDef** sig) {
+          return this->function_library_->LookUpOpDef(op, sig);
+        },
+        &result));
+
+    Graph* new_graph = new Graph(function_library_);
     GraphConstructorOptions options;
     options.allow_internal_ops = true;
-    TF_RETURN_IF_ERROR(ConvertNodeDefsToGraph(options, result.nodes, &graph));
+    TF_RETURN_IF_ERROR(
+        ConvertNodeDefsToGraph(options, result.nodes, new_graph));
+    functions_[function_def].reset(new_graph);
+    graph = new_graph;
   }
 
-  ShapeRefiner refiner(graph.versions().producer(), &function_library);
-  refiner.set_disable_constant_propagation(disable_constant_propagation_);
-  refiner.set_function_library_for_shape_inference(&function_library);
-  if (keep_nested_shapes) refiner.set_keep_nested_shape_inferences();
-
+  std::unordered_set<const Node*> function_nodes;
+  Status inference_status = Status::OK();
   {
-    Status inference_status = Status::OK();
-    auto node_shape_inference_lambda = [&refiner, &outer_context,
+    auto node_shape_inference_lambda = [this, &outer_context, &function_nodes,
                                         &inference_status](const Node* node) {
       if (!inference_status.ok()) return;
       inference_status = InferShapesForFunctionSubNode(
-          node, &refiner, outer_context->get_context());
+          node, this, outer_context->get_context());
+      function_nodes.insert(node);
     };
 
     // Calls inference lambda for each node after visiting all predecessors.
     // Ensures that we are adding nodes to ShapeRefiner in the topological
     // order.
-    ReverseDFS(graph, {}, node_shape_inference_lambda);
-
-    TF_RETURN_IF_ERROR(inference_status);
+    ReverseDFS(*graph, {}, node_shape_inference_lambda);
   }
 
-  if (keep_nested_shapes) {
+  if (keep_nested_shapes && inference_status.ok()) {
     // Fill the nested inferences map.
     //
     // The materialized function graph has extra nodes for arguments and
     // return values, which are not explicitly listed in the FunctionDef,
     // we filter out these special nodes here to not expose the implementation
     // details and keep only inferences for the nodes listed in the FunctionDef.
-
-    auto stolen_contexts = refiner.StealInferenceContexts();
-
     std::unordered_map<string, const NodeDef*> user_defined_nodes;
-    for (const auto& node_def : function_def.node_def()) {
+    for (const auto& node_def : function_def->node_def()) {
       user_defined_nodes[node_def.name()] = &node_def;
     }
 
     std::unordered_map<string, std::unique_ptr<ExtendedInferenceContext>>
         nested_inferences;
-    for (auto& stolen_kv : stolen_contexts) {
-      auto& stolen_name = stolen_kv.first->name();
-      if (user_defined_nodes.find(stolen_name) != user_defined_nodes.end()) {
-        nested_inferences[stolen_name] = std::move(stolen_kv.second);
-
-        // By default InferenceContext refers to a NodeDef from Graph,
-        // we have to change it to a NodeDef with longer lifetime,
-        // because the Graph is a temporary in this function.
-        nested_inferences[stolen_name]->get_context()->node_def_ =
-            user_defined_nodes[stolen_name];
+    for (const Node* node : function_nodes) {
+      const string& node_name = node->name();
+      if (user_defined_nodes.find(node_name) != user_defined_nodes.end()) {
+        nested_inferences[node_name] = std::move(node_to_context_[node]);
+        node_to_context_.erase(node);
+        // By default InferenceContext refers to a NodeDef from Graph.
+        // Change it to the publicly accessible NodeDef of the function
+        // definition.
+        nested_inferences[node_name]->get_context()->node_def_ =
+            user_defined_nodes[node_name];
       }
     }
-
     outer_context->set_nested_inferences(std::move(nested_inferences));
+  } else {
+    // Delete the contexts created for the functions nodes to save memory.
+    for (const Node* node : function_nodes) {
+      node_to_context_.erase(node);
+    }
   }
 
-  return Status::OK();
+  return inference_status;
 }
 
 Status ShapeRefiner::AddNode(const Node* node) {
@@ -333,9 +335,14 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
     InferenceContext* c = iter->second->get_context();
     DCHECK_GE(dst_input, 0);
     ShapeHandle existing_input = node_context->input(dst_input);
-    if (!relax && node_context->MergeInput(dst_input, c->output(src_output))) {
-      *refined = true;
-    } else if (relax) {
+    if (!relax) {
+      if (node_context->MergeInput(dst_input, c->output(src_output))) {
+        if (!SameDefinedShape(node_context, node_context->input(dst_input),
+                              existing_input)) {
+          *refined = true;
+        }
+      }
+    } else {
       if (node_context->RelaxInput(dst_input, c->output(src_output))) {
         if (!SameDefinedShape(node_context, node_context->input(dst_input),
                               existing_input)) {
@@ -700,6 +707,8 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
     *result = target_context->Scalar();
   } else if (src_op == "Shape") {
     *result = src_context->input(0);
+  } else if (src_op == "ShapeN") {
+    *result = src_context->input(input_edge->src_output());
   } else if (src_op == "Pack") {
     std::vector<DimensionHandle> dims;
     // Pack is concatenating its input scalars to form the shape tensor vector.
@@ -780,9 +789,8 @@ Status ShapeRefiner::RunShapeFn(const Node* node,
 
       auto* func_def = function_library_->Find(op_reg_data->op_def.name());
       if (func_def) {
-        TF_RETURN_IF_ERROR(InferShapesForFunction(
-            *function_library_, *func_def, keep_nested_shape_inferences_, ec));
-        return Status::OK();
+        return InferShapesForFunction(func_def, keep_nested_shape_inferences_,
+                                      ec);
       }
     }
 
@@ -863,15 +871,22 @@ Status ShapeRefiner::RunShapeFn(const Node* node,
 
 bool ShapeRefiner::SameDefinedShape(InferenceContext* c, ShapeHandle s0,
                                     ShapeHandle s1) {
-  if (!c->RankKnown(s0)) {
-    return !c->RankKnown(s1);
-  } else if (!c->RankKnown(s1) || c->Rank(s0) != c->Rank(s1)) {
+  if (s0.SameHandle(s1)) {
+    return true;
+  }
+  if (c->Rank(s0) != c->Rank(s1)) {
+    return false;
+  }
+  if (!c->RankKnown(s0) && !c->RankKnown(s1)) {
     return false;
   }
-
   for (int i = 0; i < c->Rank(s0); ++i) {
-    if (c->Value(c->Dim(s0, i)) != c->Value(c->Dim(s1, i))) {
-      return false;
+    if (!c->Dim(s0, i).SameHandle(c->Dim(s1, i))) {
+      int64 val0 = c->Value(c->Dim(s0, i));
+      int64 val1 = c->Value(c->Dim(s1, i));
+      if (val0 < 0 || val1 < 0 || val0 != val1) {
+        return false;
+      }
     }
   }
 
diff --git a/tensorflow/core/common_runtime/shape_refiner.h b/tensorflow/core/common_runtime/shape_refiner.h
index 570b4db1635d52765d7ec509bf2b20d78502160b..da42c30ce949dbc3a953d20d0ff3333b6ba1b1d5 100644
--- a/tensorflow/core/common_runtime/shape_refiner.h
+++ b/tensorflow/core/common_runtime/shape_refiner.h
@@ -159,6 +159,7 @@ class ShapeRefiner {
   // With this enabled, shape inference can take more time since it descends
   // into all function calls. It doesn't do inference once for each function
   // definition, but once for each function call.
+  // The function library must outlive the shape refiner.
   void set_function_library_for_shape_inference(
       const tensorflow::FunctionLibraryDefinition* lib) {
     function_library_ = lib;
@@ -210,10 +211,9 @@ class ShapeRefiner {
   // - outer_context will contain output shapes inferred from input shapes
   // - outer_context will contain nested inferences collection, iff
   //   keep_nested_shapes is true
-  Status InferShapesForFunction(
-      const tensorflow::FunctionLibraryDefinition& function_library,
-      const tensorflow::FunctionDef& function_def, bool keep_nested_shapes,
-      ExtendedInferenceContext* outer_context);
+  Status InferShapesForFunction(const tensorflow::FunctionDef* function_def,
+                                bool keep_nested_shapes,
+                                ExtendedInferenceContext* outer_context);
 
   // Tries to infer tensor output based on the input shapes of the node. In some
   // cases, the shapes of the inputs are sufficient for inferring the contents
@@ -260,12 +260,6 @@ class ShapeRefiner {
   Status RunShapeFn(const Node* node, const OpRegistrationData* op_reg_data,
                     ExtendedInferenceContext* ec);
 
-  // Destructive operation, which steals ownership of inference contexts map.
-  std::unordered_map<const Node*, std::unique_ptr<ExtendedInferenceContext>>
-  StealInferenceContexts() {
-    return std::move(node_to_context_);
-  }
-
   int32 graph_def_version_;
   const OpRegistryInterface* const ops_registry_;
 
@@ -299,6 +293,11 @@ class ShapeRefiner {
   // defined functions. By default that info is discarded to save memory.
   bool keep_nested_shape_inferences_ = false;
 
+  // Cache the graph corresponding to each functin definition for which shapes
+  // are refined.
+  std::unordered_map<const FunctionDef*, std::unique_ptr<const Graph>>
+      functions_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeRefiner);
 };
 
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index 676fc7ccedf4fcdacddee71901e094d03201b439..e4eef1dbe28bc79d2838b90ba6595a04ad1e4e2e 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -1161,11 +1161,13 @@ TEST_F(ShapeRefinerTest, SameDefinedShape) {
   auto s_unknown_2 = ctx->MakeShape({-1, 2});
   auto s_unknown_2_b = ctx->MakeShape({-1, 2});
 
-  EXPECT_TRUE(SameDefinedShape(ctx, unknown, unknown_b));
+  EXPECT_TRUE(SameDefinedShape(ctx, unknown, unknown));
+  EXPECT_FALSE(SameDefinedShape(ctx, unknown, unknown_b));
   EXPECT_FALSE(SameDefinedShape(ctx, unknown, s_1_2));
   EXPECT_TRUE(SameDefinedShape(ctx, s_1_2, s_1_2_b));
   EXPECT_FALSE(SameDefinedShape(ctx, s_1_2, s_2_2));
-  EXPECT_TRUE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2_b));
+  EXPECT_TRUE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2));
+  EXPECT_FALSE(SameDefinedShape(ctx, s_unknown_2, s_unknown_2_b));
 }
 
 TEST_F(ShapeRefinerTest, IsUpdatedShapesOrTypes) {
@@ -1178,14 +1180,15 @@ TEST_F(ShapeRefinerTest, IsUpdatedShapesOrTypes) {
   TF_ASSERT_OK(m.AddNode(test));
   shape_inference::InferenceContext* ctx = m.GetContext(test);
 
+  shape_inference::ShapeHandle unknown = ctx->UnknownShape();
   std::vector<shape_inference::ShapeAndType> t0{
       {ctx->MakeShape({1, 2, 3}), DT_FLOAT},
-      {ctx->UnknownShape(), DT_INVALID},
+      {unknown, DT_INVALID},
       {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
 
   std::vector<shape_inference::ShapeAndType> t1{
       {ctx->MakeShape({1, 2, 3}), DT_FLOAT},
-      {ctx->UnknownShape(), DT_INVALID},
+      {unknown, DT_INVALID},
       {ctx->MakeShape({4, 3, 2, 1}), DT_INT32}};
 
   std::vector<shape_inference::ShapeAndType> t2{
@@ -1256,10 +1259,20 @@ TEST_F(ShapeRefinerTest, IncrementalUpdates) {
       0, std::vector<shape_inference::ShapeAndType>{{shp, DT_FLOAT}});
   refined = false;
   TF_ASSERT_OK(m.UpdateNode(dequeue, true /* relax */, &refined));
-  EXPECT_FALSE(refined);
+  EXPECT_TRUE(refined);
   ctx = m.GetContext(dequeue);
   EXPECT_EQ("[?,7]", ctx->DebugString(ctx->output(0)));
-  ASSERT_FALSE(SameHandle(ctx->Dim(ctx->output(0), 0), ctx->Dim(shp, 0)));
+  EXPECT_TRUE(SameHandle(ctx->Dim(ctx->output(0), 0), ctx->Dim(shp, 0)));
+
+  // Inject a shape of the same handle and expect refined to not change.
+  ctx = m.GetContext(queue);
+  shape_inference::ShapeHandle shp2 = shp;
+  ctx->set_output_handle_shapes_and_types(
+      0, std::vector<shape_inference::ShapeAndType>{{shp2, DT_FLOAT}});
+  refined = false;
+  TF_ASSERT_OK(m.UpdateNode(dequeue, /*relax=*/false, &refined));
+  EXPECT_FALSE(refined);
+  EXPECT_TRUE(SameHandle(ctx->Dim(shp, 0), ctx->Dim(shp2, 0)));
 }
 
 void TestSimpleFunctionInference(bool enable_function_inference,
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index e6403df97fd64d7320cceb8e688199740cf163c5..d7e01144c9ef3aa09ddd212947eafe48ccff555b 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -20,10 +20,21 @@ limitations under the License.
 #include "tensorflow/core/framework/tracking_allocator.h"
 #include "tensorflow/core/graph/costmodel.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
+const int kMaxAllocReportNodes = 100;
+const float kMaxAllocReportFraction = 0.99;
+
+struct AllocStats {
+  std::map<int64, std::vector<string>> nodes_by_size;
+  int64 total_bytes = 0;
+  int64 total_nodes = 0;
+};
+}  // namespace
 
 NodeExecStatsWrapper::NodeExecStatsWrapper()
     : NodeExecStatsWrapper(new NodeExecStats) {}
@@ -139,7 +150,7 @@ void StepStatsCollector::BuildCostModel(
     const DeviceStepStats* hardware_stats;
   };
 
-  std::unordered_map<StringPiece, DeviceStats, StringPiece::Hasher>
+  std::unordered_map<StringPiece, DeviceStats, StringPieceHasher>
       per_device_stats;
   std::unordered_map<int, const DeviceStepStats*> gpu_hardware_stats;
 
@@ -179,7 +190,7 @@ void StepStatsCollector::BuildCostModel(
     CostModel* cm = cost_model_manager->FindOrCreateCostModel(graph);
     cm->IncrementUpdateTimes();
 
-    std::unordered_map<StringPiece, Node*, StringPiece::Hasher> name_to_node;
+    std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node;
     for (Node* n : graph->nodes()) {
       name_to_node.emplace(n->name(), n);
     }
@@ -267,6 +278,85 @@ void StepStatsCollector::Save(const string& device,
   }
 }
 
+string StepStatsCollector::ReportAllocsOnResourceExhausted(const string& err) {
+  mutex_lock l(mu_);
+  if (err.find("OOM") == err.npos) {
+    return "";
+  }
+  // <device, allocator> -> AllocStats
+  std::map<std::pair<string, string>, AllocStats> allocs_map;
+  string report = "\n";
+  for (const auto& dev_stat : dev_stats_) {
+    const string& device = dev_stat.first;
+    // Only print the device that has OOM.
+    // TODO(xpan): Extract device from err first to speed it up.
+    if (err.find(device) == err.npos) {
+      continue;
+    }
+    // NodeExecStatsWrapper*
+    for (const auto& stats : dev_stat.second) {
+      // std::pair<AllocatorMemoryUsed*, TrackingAllocator*>
+      for (const auto& alloc : stats->allocations_) {
+        // Only print the allocator that has OOM.
+        // TODO(xpan): Extract device from err first to speed it up.
+        if (err.find(alloc.first->allocator_name()) == err.npos) {
+          continue;
+        }
+        auto dev_allocator =
+            std::make_pair(dev_stat.first, alloc.first->allocator_name());
+        AllocStats& dev_allocs_stats = allocs_map[dev_allocator];
+        TrackingAllocator* tracking_alloc = alloc.second;
+        gtl::InlinedVector<AllocRecord, 4> cur_records =
+            tracking_alloc->GetCurrentRecords();
+        int64 cur_bytes = 0;
+        for (const auto& r : cur_records) {
+          cur_bytes += r.alloc_bytes;
+        }
+        if (cur_bytes > 0) {
+          dev_allocs_stats.total_bytes += cur_bytes;
+          dev_allocs_stats.total_nodes++;
+          dev_allocs_stats.nodes_by_size[cur_bytes].push_back(
+              stats->stats()->node_name());
+        }
+      }
+    }
+  }
+
+  for (const auto& dev_allocs_it : allocs_map) {
+    const auto& dev = dev_allocs_it.first;
+    const AllocStats& dev_allocs_stats = dev_allocs_it.second;
+    int64 reported_bytes = 0;
+    int64 reported_nodes = 0;
+    bool done = false;
+    strings::StrAppend(&report, "\nCurrent usage from device: ", dev.first,
+                       ", allocator: ", dev.second, "\n");
+    // Print allocations stats of the <device, allocator> pair.
+    for (auto it = dev_allocs_stats.nodes_by_size.rbegin();
+         it != dev_allocs_stats.nodes_by_size.rend(); ++it) {
+      for (const string& node_name : it->second) {
+        reported_bytes += it->first;
+        strings::StrAppend(&report, "  ",
+                           strings::HumanReadableNumBytes(it->first), " from ",
+                           node_name, "\n");
+        if (++reported_nodes > kMaxAllocReportNodes ||
+            reported_bytes >=
+                dev_allocs_stats.total_bytes * kMaxAllocReportFraction) {
+          done = true;
+          break;
+        }
+      }
+      if (done) break;
+    }
+    int64 remain_nodes = dev_allocs_stats.total_nodes - reported_nodes;
+    int64 remain_bytes = dev_allocs_stats.total_bytes - reported_bytes;
+    if (remain_nodes > 0) {
+      strings::StrAppend(&report, "  Remaining ", remain_nodes, " nodes with ",
+                         strings::HumanReadableNumBytes(remain_bytes), "\n");
+    }
+  }
+  return report;
+}
+
 void StepStatsCollector::Finalize() {
   mutex_lock l(mu_);
   FinalizeInternal();
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index b1fd28a9826672fd0319d9f33cb66b511c8b3fa3..996dbb59bcc29b1a9b8ee47228e09c0818428a93 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -82,6 +82,13 @@ class StepStatsCollector {
   void Save(const string& device, NodeExecStats* nt);
   void Save(const string& device, NodeExecStatsWrapper* stats);
 
+  // Generates a string reporting the currently used memory based
+  // on ResourceExhausted OOM `err` message.
+  // `err` message needs to contain device name and allocator name, E.g.:
+  // "ResourceExhaustedError: OOM when allocating tensor ...
+  // on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
+  string ReportAllocsOnResourceExhausted(const string& err);
+
   // The following 2 Finalize methods populate the StepStats passed
   // from the constructor. Calling it more than once won't have any effect.
   // User shouldn't call Save() methods after Finalize.
diff --git a/tensorflow/core/common_runtime/sycl/sycl_device.h b/tensorflow/core/common_runtime/sycl/sycl_device.h
index 9caa076c7226e66005e37697b0119c5f84747fe6..cc272d156ef67a4f4f93f35603ffe301d154932a 100644
--- a/tensorflow/core/common_runtime/sycl/sycl_device.h
+++ b/tensorflow/core/common_runtime/sycl/sycl_device.h
@@ -46,8 +46,8 @@ class GSYCLInterface {
 
     if (!found_device) {
       // Currently Intel GPU is not supported
-      LOG(WARNING) << "No OpenCL GPU found that is supported by ComputeCpp, "
-                      "trying OpenCL CPU";
+      LOG(WARNING) << "No OpenCL GPU found that is supported by "
+                   << "ComputeCpp/triSYCL, trying OpenCL CPU";
     }
 
     for (const auto& device : device_list) {
@@ -58,10 +58,24 @@ class GSYCLInterface {
       }
     }
 
+    if (!found_device) {
+      LOG(WARNING) << "No OpenCL CPU found that is supported by "
+                   << "ComputeCpp/triSYCL, checking for host sycl device";
+    }
+
+    for (const auto& device : device_list) {
+      // triSYCL only supports the host device for now
+      if (device.is_host()) {
+        LOG(WARNING) << "Found SYCL host device";
+        AddDevice(device);
+        found_device = true;
+      }
+    }
+
     if (!found_device) {
       // Currently Intel GPU is not supported
-      LOG(FATAL)
-          << "No OpenCL GPU nor CPU found that is supported by ComputeCpp";
+      LOG(FATAL) << "No SYCL host and no OpenCL GPU nor CPU"
+                 << " supported by ComputeCPP/triSYCL was found";
     } else {
       LOG(INFO) << "Found following OpenCL devices:";
       for (int i = 0; i < device_list.size(); i++) {
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 525f96a3de3a9d7f9b4929d22d8db45bac4c5174..a32badef6dfdb8b62662da880c99842b1cafd13c 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -56,6 +56,7 @@ tf_proto_library(
     cc_grpc_version = 1,
     protodeps = [
         ":debugger_event_metadata_proto",
+        "//tensorflow/core/profiler:protos_all",
     ] + tf_additional_all_protos(),
     visibility = ["//tensorflow:__subpackages__"],
 )
@@ -89,9 +90,9 @@ tf_cuda_library(
     deps = [
         ":debug",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:device_tracer",
         "//tensorflow/core:direct_session_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_tracer",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
@@ -123,6 +124,7 @@ tf_cuda_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
@@ -144,6 +146,7 @@ tf_cuda_library(
         ":debugger_event_metadata_proto_cc",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
diff --git a/tensorflow/core/debug/debug_gateway_test.cc b/tensorflow/core/debug/debug_gateway_test.cc
index 3903040e4d936dd23f97a7c82c06d7524b9c98a2..57583349069a0b4deb137cb09564cdbb3909a4b0 100644
--- a/tensorflow/core/debug/debug_gateway_test.cc
+++ b/tensorflow/core/debug/debug_gateway_test.cc
@@ -40,6 +40,9 @@ std::unique_ptr<DirectSession> CreateSession() {
   options.config.mutable_graph_options()
       ->mutable_rewrite_options()
       ->set_constant_folding(RewriterConfig::OFF);
+  options.config.mutable_graph_options()
+      ->mutable_rewrite_options()
+      ->set_dependency_optimization(RewriterConfig::OFF);
 
   return std::unique_ptr<DirectSession>(
       dynamic_cast<DirectSession*>(NewSession(options)));
@@ -55,7 +58,7 @@ class SessionDebugMinusAXTest : public ::testing::Test {
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
-    const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:CPU:0";
 #endif
 
     Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
@@ -503,7 +506,7 @@ TEST_F(SessionDebugMinusAXTest,
 }
 #endif
 
-class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
+class SessionDebugOutputSlotWithoutOutgoingEdgeTest : public ::testing::Test {
  public:
   void Initialize() {
     Graph graph(OpRegistry::Global());
@@ -513,7 +516,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
-    const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:CPU:0";
 #endif
 
     Tensor a_tensor(DT_FLOAT, TensorShape({1, 1}));
@@ -540,7 +543,7 @@ class SessionDebugOutputSlotWithoutOngoingEdgeTest : public ::testing::Test {
   GraphDef def_;
 };
 
-TEST_F(SessionDebugOutputSlotWithoutOngoingEdgeTest,
+TEST_F(SessionDebugOutputSlotWithoutOutgoingEdgeTest,
        WatchSlotWithoutOutgoingEdge) {
   Initialize();
   auto session = CreateSession();
@@ -615,7 +618,7 @@ class SessionDebugVariableTest : public ::testing::Test {
 #elif defined(TENSORFLOW_USE_SYCL)
     const string kDeviceName = "/job:localhost/replica:0/task:0/device:SYCL:0";
 #else
-    const string kDeviceName = "/job:localhost/replica:0/task:0/cpu:0";
+    const string kDeviceName = "/job:localhost/replica:0/task:0/device:CPU:0";
 #endif
 
     // Define variable node.
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 85d04daa6592afbfd024d1aeec07ad43088db19b..f81445c20bd2ba56a6d7d3bb4ddefc71f5199784 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -736,7 +736,7 @@ Status DebugGrpcChannel::ReceiveServerRepliesAndClose() {
   }
 }
 
-mutex DebugGrpcIO::streams_mu;
+mutex DebugGrpcIO::streams_mu(LINKER_INITIALIZED);
 
 int64 DebugGrpcIO::channel_connection_timeout_micros = 900 * 1000 * 1000;
 // TODO(cais): Make this configurable?
diff --git a/tensorflow/core/debug/debug_service.proto b/tensorflow/core/debug/debug_service.proto
index 547c0576f08769f9e373a98231caf172a9312937..4bef74dfc5706b0033ff91b5e6cf09bb119d657d 100644
--- a/tensorflow/core/debug/debug_service.proto
+++ b/tensorflow/core/debug/debug_service.proto
@@ -18,6 +18,8 @@ syntax = "proto3";
 package tensorflow;
 
 import "tensorflow/core/framework/tensor.proto";
+import "tensorflow/core/profiler/tfprof_log.proto";
+import "tensorflow/core/protobuf/debug.proto";
 import "tensorflow/core/util/event.proto";
 
 // Reply message from EventListener to the client, i.e., to the source of the
@@ -46,6 +48,38 @@ message EventReply {
   // during debugging.
 }
 
+// Data on the traceback of a debugged call, e.g., a Session.run() call, or the
+// execution of an eager operation.
+message CallTraceback {
+  enum CallType {
+    UNSPECIFIED = 0;
+    GRAPH_EXECUTION = 1;
+    EAGER_EXECUTION = 2;
+  }
+
+  CallType call_type = 1;
+
+  // A key for the call. For example, for graph execution, this is a key
+  // consisting of the names of the fed and fetched tensors.
+  string call_key = 2;
+
+  // Traceback stack for the origin of the call event.
+  // For graph execution, this is the stack of the Session.run() call.
+  // For eager execution, this is the stack of the Python line that invokes
+  // the execution of the eager op.
+  tfprof.CodeDef origin_stack = 3;
+
+  // Keeps track of the mapping from integer IDs in `origin_stack` to actual
+  // string values (e.g., file paths, function names).
+  map<int64, string> origin_id_to_string = 4;
+
+  // Traceback for the graph (if any) involved in the call.
+  tfprof.OpLogProto graph_traceback = 5;
+
+  // Version of the graph in `graph_traceback` (if any).
+  int64 graph_version = 6;
+}
+
 // EventListener: Receives Event protos, e.g., from debugged TensorFlow
 // runtime(s).
 service EventListener {
@@ -57,4 +91,10 @@ service EventListener {
   //      ops that get executed immediately after the beginning of the graph
   //      execution.
   rpc SendEvents(stream Event) returns (stream EventReply);
+
+  // Send the tracebacks of a TensorFlow execution call.
+  rpc SendTracebacks(CallTraceback) returns (EventReply);
+
+  // Send a collection of source code files being debugged.
+  rpc SendSourceFiles(DebuggedSourceFiles) returns (EventReply);
 }
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 07e279cb64417b0252da97b6cf353f86ecfcd111..2db7ebd7952c9e1edf374267ee33f697eb846885 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -75,7 +75,6 @@ cc_library(
     hdrs = ["message_wrappers.h"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
@@ -129,7 +128,6 @@ tf_cc_test(
         "//tensorflow/core:testlib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
         "//tensorflow/core/distributed_runtime/rpc:grpc_testlib",
-        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
@@ -142,6 +140,7 @@ cc_library(
     hdrs = ["session_mgr.h"],
     deps = [
         ":graph_mgr",
+        ":worker_cache_wrapper",
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
@@ -169,19 +168,30 @@ cc_library(
 )
 
 cc_library(
-    name = "worker_interface",
+    name = "tensor_coding",
     srcs = ["tensor_coding.cc"],
     hdrs = [
         "tensor_coding.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:worker_proto_cc",
+    ],
+)
+
+cc_library(
+    name = "worker_interface",
+    hdrs = [
         "worker_interface.h",
     ],
     deps = [
         ":call_options",
         ":message_wrappers",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
     ],
 )
@@ -197,6 +207,7 @@ cc_library(
         ":partial_run_mgr",
         ":rendezvous_mgr_interface",
         ":session_mgr",
+        ":tensor_coding",
         ":worker_interface",
         ":worker_session",
         "//tensorflow/core:core_cpu_internal",
@@ -230,7 +241,7 @@ tf_cc_test(
     srcs = ["tensor_coding_test.cc"],
     linkstatic = 1,
     deps = [
-        ":worker_interface",
+        ":tensor_coding",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -253,6 +264,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "worker_cache_wrapper",
+    hdrs = ["worker_cache_wrapper.h"],
+    deps = [
+        ":worker_cache",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 cc_library(
     name = "remote_device",
     srcs = ["remote_device.cc"],
@@ -313,6 +334,7 @@ cc_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
@@ -350,6 +372,7 @@ cc_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:tensorflow_opensource",
     ],
 )
@@ -393,6 +416,7 @@ cc_library(
         ":worker_env",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 593fe0e363edc543a74572ed51128777e048a47d..d84b69d06b77b03dee6e1041e7189ec6f3fb8682 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -105,6 +105,7 @@ Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
         Rendezvous::CreateKey(target, 1 /* src_incarnation */, target,
                               out.name(), FrameAndIter(0, 0));
     recv_keys->push_back(key);
+    ++i;
   }
   return Status::OK();
 }
@@ -124,8 +125,11 @@ Status ClusterFunctionLibraryRuntime::Instantiate(
   WorkerInterface* wi = worker_session_->worker_cache->CreateWorker(target);
 
   if (wi == nullptr) {
-    return errors::InvalidArgument("Could not find worker with target: ",
-                                   target);
+    std::vector<string> workers;
+    worker_session_->worker_cache->ListWorkers(&workers);
+    return errors::InvalidArgument(
+        "Could not find worker with target: ", target,
+        " Available workers: ", str_util::Join(workers, ", "));
   }
 
   // Make RPC and obtain a graph handle.
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 04587dd8ca8638d031d840b0b53b5168bdab63c2..6dd8b9ec73778baea0ed2876ac5111e9fd331dcf 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -103,14 +103,54 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, ConstructFunctionGraph) {
   GraphDef actual;
   std::vector<string> send_keys, recv_keys;
   TF_CHECK_OK(ConstructFunctionGraphHelper(
-      test::function::XTimesTwo().signature(),
+      test::function::Swap().signature(),
       {{"T", DT_FLOAT}, {"_target", "/job:a/replica:0/task:0/cpu:0"}}, &actual,
       &send_keys, &recv_keys));
-
   GraphDef expected;
   protobuf::TextFormat::ParseFromString(R"(
 node {
-  name: "_recv_x_0"
+  name: "_recv_i0_0"
+  op: "_Recv"
+  device: "/job:a/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "i0"
+    }
+  }
+  attr {
+    key: "tensor_type"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "_recv_i1_1"
   op: "_Recv"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
@@ -140,7 +180,7 @@ node {
   attr {
     key: "tensor_name"
     value {
-      s: "x"
+      s: "i1"
     }
   }
   attr {
@@ -151,9 +191,10 @@ node {
   }
 }
 node {
-  name: "XTimesTwo"
-  op: "XTimesTwo"
-  input: "_recv_x_0"
+  name: "Swap"
+  op: "Swap"
+  input: "_recv_i0_0"
+  input: "_recv_i1_1"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
     key: "T"
@@ -163,15 +204,57 @@ node {
   }
   attr {
     key: "_target"
+    value {
+      s: "/job:a/replica:0/task:0/cpu:0"
+    }
+  }
+}
+node {
+  name: "_send_o0_0"
+  op: "_Send"
+  input: "Swap"
+  device: "/job:a/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: true
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:a/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
     value {
       s: "/job:a/replica:0/task:0/device:CPU:0"
     }
   }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 1
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "o0"
+    }
+  }
 }
 node {
-  name: "_send_y_0"
+  name: "_send_o1_1"
   op: "_Send"
-  input: "XTimesTwo"
+  input: "Swap:1"
   device: "/job:a/replica:0/task:0/device:CPU:0"
   attr {
     key: "T"
@@ -206,10 +289,11 @@ node {
   attr {
     key: "tensor_name"
     value {
-      s: "y"
+      s: "o1"
     }
   }
-})",
+}
+)",
                                         &expected);
   TF_EXPECT_GRAPH_EQ(expected, actual);
 }
@@ -234,16 +318,18 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, DISABLED_InstantiateAndRun) {
 TEST_F(ClusterFunctionLibraryRuntimeTest,
        DISABLED_InstantiateAndRunAttrSubstitution) {
   FunctionDefLibrary proto;
-  *(proto.add_function()) = test::function::XTimesTwo();
+  *(proto.add_function()) = test::function::Swap();
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
 
-  Tensor y;
-  auto x = test::AsTensor<float>({1, 2, 3, 4});
+  Tensor y1, y2;
+  auto x1 = test::AsTensor<float>({1, 2, 3, 4});
+  auto x2 = test::AsTensor<float>({4, 3, 2, 1});
   TF_EXPECT_OK(InstantiateAndRun(
-      "XTimesTwo", lib_def,
+      "Swap", lib_def,
       {{"T", DT_FLOAT}, {"_target", "/job:localhost/replica:0/task:1/cpu:0"}},
-      {x}, {&y}));
-  test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+      {x1, x2}, {&y1, &y2}));
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({4, 3, 2, 1}));
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2, 3, 4}));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 391ffda25c0944490fdac6749d137b97f45d9139..60d58af61dad56fbb09df041fb5ca1429fd451ad 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -208,8 +208,7 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
     }
 
     // Give the device an opportunity to rewrite its subgraph.
-    TF_RETURN_IF_ERROR(
-        unit->device->MaybeRewriteGraph(gdef.library(), &subgraph));
+    TF_RETURN_IF_ERROR(unit->device->MaybeRewriteGraph(&subgraph));
 
     // Top-level nodes in the graph uses the op segment to cache
     // kernels. Therefore, as long as the executor is alive, we need
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index c7ba7abeaffc654b24adfcc320ed45990cf5bc77..aaa4cfa7341c42bf9f7302e8ef30a28b68e6213c 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -159,7 +159,7 @@ Status LocalMaster::Reset(CallOptions* call_options,
 
 namespace {
 mutex* get_local_master_registry_lock() {
-  static mutex local_master_registry_lock;
+  static mutex local_master_registry_lock(LINKER_INITIALIZED);
   return &local_master_registry_lock;
 }
 
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index f7fce1d0ec5bf3cd06d89b67fc6665874f1b2dff..03b65d8cba9112e272f52518ca6050ce5f16eb5d 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -67,13 +67,14 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
                     GraphExecutionState* execution_state, bool is_partial,
-                    WorkerCacheInterface* worker_cache)
+                    WorkerCacheInterface* worker_cache, bool should_deregister)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
         debug_opts_(bopts.debug_options),
-        worker_cache_(worker_cache) {
+        worker_cache_(worker_cache),
+        should_deregister_(should_deregister) {
     VLOG(1) << "Created ReffedClientGraph for node with "
             << client_graph()->graph.num_node_ids();
 
@@ -85,7 +86,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     }
   }
 
-  ~ReffedClientGraph() override { DeregisterPartitions(); }
+  ~ReffedClientGraph() override {
+    if (should_deregister_) {
+      DeregisterPartitions();
+    }
+  }
 
   const ClientGraph* client_graph() { return client_graph_.get(); }
 
@@ -208,7 +213,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const bool is_partial_;
   const DebugOptions& debug_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
-  std::unordered_map<StringPiece, Node*, StringPiece::Hasher> name_to_node_;
+  std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
+  const bool should_deregister_;
 
   // Graph partitioned into per-location subgraphs.
   struct Part {
@@ -486,7 +492,7 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
           << execution_count;
   // Maps the names of fed tensors to their index in `req`.
-  std::unordered_map<StringPiece, size_t, StringPiece::Hasher> feeds(3);
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
 
   for (size_t i = 0; i < req.num_feeds(); ++i) {
     if (!feeds.insert({req.feed_name(i), i}).second) {
@@ -498,6 +504,9 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
 
   // Collect execution cost stats on a smoothly decreasing frequency.
   ExecutorOpts exec_opts;
+  if (pss->report_tensor_allocations_upon_oom) {
+    exec_opts.set_report_tensor_allocations_upon_oom(true);
+  }
   if (pss->collect_costs) {
     exec_opts.set_record_costs(true);
   }
@@ -1040,7 +1049,11 @@ Status MasterSession::Create(GraphDef* graph_def,
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForBaseGraph(
         graph_def, execution_options, &execution_state_));
   }
-  if (options.cluster_def != nullptr) {
+  // TODO(b/36574172): Remove these conditions when ClusterSpec
+  // propagation is supported in all servers.
+  if (options.cluster_def != nullptr ||
+      session_opts_.config.isolate_session_state()) {
+    should_delete_worker_sessions_ = true;
     return CreateWorkerSessions(options);
   }
   return Status::OK();
@@ -1048,10 +1061,9 @@ Status MasterSession::Create(GraphDef* graph_def,
 
 Status MasterSession::CreateWorkerSessions(
     const WorkerCacheFactoryOptions& options) {
-  CHECK(worker_cache_) << "CreateWorkerSessions should be called only with "
-                       << "dynamic cluster membership.";
   std::vector<string> worker_names;
-  worker_cache_->ListWorkers(&worker_names);
+  WorkerCacheInterface* worker_cache = get_worker_cache();
+  worker_cache->ListWorkers(&worker_names);
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
@@ -1069,10 +1081,10 @@ Status MasterSession::CreateWorkerSessions(
   std::vector<WorkerGroup> workers(worker_names.size());
 
   // Release the workers.
-  auto cleanup = gtl::MakeCleanup([this, &workers] {
+  auto cleanup = gtl::MakeCleanup([this, &workers, worker_cache] {
     for (auto&& worker_group : workers) {
       if (worker_group.worker != nullptr) {
-        worker_cache_->ReleaseWorker(*worker_group.name, worker_group.worker);
+        worker_cache->ReleaseWorker(*worker_group.name, worker_group.worker);
       }
     }
   });
@@ -1081,11 +1093,19 @@ Status MasterSession::CreateWorkerSessions(
   // Create all the workers & kick off the computations.
   for (size_t i = 0; i < worker_names.size(); ++i) {
     workers[i].name = &worker_names[i];
-    workers[i].worker = worker_cache_->CreateWorker(worker_names[i]);
+    workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
     workers[i].request.set_session_handle(handle_);
-    *workers[i].request.mutable_server_def()->mutable_cluster() =
-        *options.cluster_def;
-    workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+    if (options.cluster_def) {
+      *workers[i].request.mutable_server_def()->mutable_cluster() =
+          *options.cluster_def;
+      workers[i].request.mutable_server_def()->set_protocol(*options.protocol);
+      // Session state is always isolated when ClusterSpec propagation
+      // is in use.
+      workers[i].request.set_isolate_session_state(true);
+    } else {
+      workers[i].request.set_isolate_session_state(
+          session_opts_.config.isolate_session_state());
+    }
 
     DeviceNameUtils::ParsedName name;
     if (!DeviceNameUtils::ParseFullName(worker_names[i], &name)) {
@@ -1119,6 +1139,59 @@ Status MasterSession::CreateWorkerSessions(
   return status;
 }
 
+Status MasterSession::DeleteWorkerSessions() {
+  WorkerCacheInterface* worker_cache = get_worker_cache();
+  std::vector<string> worker_names;
+  worker_cache->ListWorkers(&worker_names);
+
+  struct WorkerGroup {
+    // The worker name. (Not owned.)
+    const string* name;
+
+    // The worker referenced by name. (Not owned.)
+    WorkerInterface* worker = nullptr;
+
+    // Request and responses used for a given worker.
+    DeleteWorkerSessionRequest request;
+    DeleteWorkerSessionResponse response;
+    Status status = Status::OK();
+  };
+  BlockingCounter done(worker_names.size());
+  std::vector<WorkerGroup> workers(worker_names.size());
+
+  // Release the workers.
+  auto cleanup = gtl::MakeCleanup([this, &workers, worker_cache] {
+    for (auto&& worker_group : workers) {
+      if (worker_group.worker != nullptr) {
+        worker_cache->ReleaseWorker(*worker_group.name, worker_group.worker);
+      }
+    }
+  });
+
+  Status status = Status::OK();
+  // Create all the workers & kick off the computations.
+  for (size_t i = 0; i < worker_names.size(); ++i) {
+    workers[i].name = &worker_names[i];
+    workers[i].worker = worker_cache->CreateWorker(worker_names[i]);
+    workers[i].request.set_session_handle(handle_);
+  }
+
+  for (size_t i = 0; i < worker_names.size(); ++i) {
+    auto cb = [i, &workers, &done](const Status& s) {
+      workers[i].status = s;
+      done.DecrementCount();
+    };
+    workers[i].worker->DeleteWorkerSessionAsync(&workers[i].request,
+                                                &workers[i].response, cb);
+  }
+
+  done.Wait();
+  for (size_t i = 0; i < workers.size(); ++i) {
+    status.Update(workers[i].status);
+  }
+  return status;
+}
+
 Status MasterSession::ListDevices(ListDevicesResponse* resp) const {
   if (worker_cache_) {
     // This is a ClusterSpec-propagated session, and thus env_->local_devices
@@ -1205,7 +1278,7 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
           stats_publisher_factory_, execution_state_.get(), is_partial,
-          worker_cache);
+          worker_cache, !should_delete_worker_sessions_);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
@@ -1368,6 +1441,8 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     const auto count = run_state->count;
     pss.collect_timeline =
         req.options().trace_level() == RunOptions::FULL_TRACE;
+    pss.report_tensor_allocations_upon_oom =
+        req.options().report_tensor_allocations_upon_oom();
 
     // Build the cost model every 'build_cost_model_every' steps after skipping
     // an
@@ -1528,7 +1603,8 @@ Status MasterSession::DoRunWithLocalExecution(
   TRACEPRINTF("stepid %llu", step_id);
 
   pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
-
+  pss.report_tensor_allocations_upon_oom =
+      req.options().report_tensor_allocations_upon_oom();
   // Build the cost model every 'build_cost_model_every' steps after skipping an
   // initial 'build_cost_model_after' steps.
   const int64 build_cost_model_after =
@@ -1598,6 +1674,12 @@ Status MasterSession::Close() {
     ClearRunsTable(&to_unref, &partial_run_graphs_);
   }
   for (ReffedClientGraph* rcg : to_unref) rcg->Unref();
+  if (should_delete_worker_sessions_) {
+    Status s = DeleteWorkerSessions();
+    if (!s.ok()) {
+      LOG(WARNING) << s;
+    }
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 51ea92da6807ff83ad2382f801b5297e81e631a0..4bd4e1367aa75730df829a2909005a221b9ab780 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -146,6 +146,7 @@ class MasterSession : public core::RefCounted {
     bool collect_timeline = false;
     bool collect_rpcs = false;
     bool collect_partition_graphs = false;
+    bool report_tensor_allocations_upon_oom = false;
     Microseconds start_micros = Microseconds(0);
     Microseconds end_micros = Microseconds(0);
     std::vector<StepStats> step_stats;  // per partition
@@ -200,6 +201,10 @@ class MasterSession : public core::RefCounted {
   // workers.
   Status CreateWorkerSessions(const WorkerCacheFactoryOptions& server_def);
 
+  // TODO(b/36574172): Always use Create/DeleteWorkerSession.
+  bool should_delete_worker_sessions_ = false;
+  Status DeleteWorkerSessions();
+
   Status StartStep(const BuildGraphOptions& opts, int64* count,
                    ReffedClientGraph** graph, bool is_partial);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 5190288e8835ee81566db3bfa52a115c6d48667f..80640c806deedccbe15bdca3216e0c0d195045e1 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -53,9 +53,11 @@ cc_library(
     srcs = ["grpc_util.cc"],
     hdrs = ["grpc_util.h"],
     deps = [
-        "//tensorflow/core:lib",
-        "@grpc//:grpc++_unsecure",
         "@grpc//:grpc_unsecure",
+        "@grpc//:grpc++_unsecure",
+        "//tensorflow/core:lib",
+        # Required to be able to overload TensorResponse parsing.
+        "//tensorflow/core/distributed_runtime:tensor_coding",
     ],
 )
 
@@ -70,18 +72,34 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_state",
+    srcs = [],
+    hdrs = ["grpc_state.h"],
+    deps = [
+        ":grpc_client_cq_tag",
+        ":grpc_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:call_options",
+        "//tensorflow/core/distributed_runtime:tensor_coding",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
 cc_library(
     name = "grpc_remote_worker",
     srcs = ["grpc_remote_worker.cc"],
     hdrs = ["grpc_remote_worker.h"],
     deps = [
         ":grpc_client_cq_tag",
+        ":grpc_state",
         ":grpc_util",
         ":grpc_worker_service_impl",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:worker_proto_cc",
+        "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache_logger",
         "//tensorflow/core/distributed_runtime:worker_interface",
         "@grpc//:grpc++_unsecure",
@@ -182,10 +200,9 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
-        ":grpc_namespace_compat",
         ":grpc_serialization_traits",
         "//tensorflow/core:worker_proto_cc",
-        "//tensorflow/core/distributed_runtime:worker_interface",
+        "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
     ],
 )
@@ -229,22 +246,12 @@ cc_library(
     srcs = ["grpc_master_service_impl.cc"],
     hdrs = ["grpc_master_service_impl.h"],
     deps = [
-        ":grpc_namespace_compat",
         ":grpc_serialization_traits",
         "//tensorflow/core:master_proto_cc",
         "@grpc//:grpc++_unsecure",
     ],
 )
 
-cc_library(
-    name = "grpc_namespace_compat",
-    srcs = [],
-    hdrs = ["grpc_namespace_compat.h"],
-    deps = [
-        "@grpc//:grpc++_unsecure",
-    ],
-)
-
 cc_library(
     name = "grpc_serialization_traits",
     srcs = [],
@@ -263,6 +270,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:base_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime:worker_interface",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index d998d51058c5e3178a015770b40f6f637ccf8088..e2016e824c0bf504af4c624cad253963b223eb35 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -49,75 +49,77 @@ MasterService::Stub::Stub(
     const std::shared_ptr< ::grpc::ChannelInterface>& channel)
     : channel_(channel),
       rpcmethod_CreateSession_(grpcMasterService_method_names[0],
-                               ::grpc::RpcMethod::NORMAL_RPC, channel),
+                               ::grpc::internal::RpcMethod::NORMAL_RPC,
+                               channel),
       rpcmethod_ExtendSession_(grpcMasterService_method_names[1],
-                               ::grpc::RpcMethod::NORMAL_RPC, channel),
+                               ::grpc::internal::RpcMethod::NORMAL_RPC,
+                               channel),
       rpcmethod_PartialRunSetup_(grpcMasterService_method_names[2],
-                                 ::grpc::RpcMethod::NORMAL_RPC, channel),
+                                 ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                 channel),
       rpcmethod_RunStep_(grpcMasterService_method_names[3],
-                         ::grpc::RpcMethod::NORMAL_RPC, channel),
+                         ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_CloseSession_(grpcMasterService_method_names[4],
-                              ::grpc::RpcMethod::NORMAL_RPC, channel),
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_ListDevices_(grpcMasterService_method_names[5],
-                             ::grpc::RpcMethod::NORMAL_RPC, channel),
+                             ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_Reset_(grpcMasterService_method_names[6],
-                       ::grpc::RpcMethod::NORMAL_RPC, channel) {}
+                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
 
 ::grpc::Status MasterService::Stub::CreateSession(
     ::grpc::ClientContext* context, const CreateSessionRequest& request,
     CreateSessionResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_CreateSession_,
-                                   context, request, response);
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_CreateSession_, context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::ExtendSession(
     ::grpc::ClientContext* context, const ExtendSessionRequest& request,
     ExtendSessionResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_ExtendSession_,
-                                   context, request, response);
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ExtendSession_, context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::PartialRunSetup(
     ::grpc::ClientContext* context, const PartialRunSetupRequest& request,
     PartialRunSetupResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_PartialRunSetup_,
-                                   context, request, response);
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_PartialRunSetup_, context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::RunStep(::grpc::ClientContext* context,
                                             const RunStepRequest& request,
                                             RunStepResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_RunStep_, context,
-                                   request, response);
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_RunStep_,
+                                             context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::CloseSession(
     ::grpc::ClientContext* context, const CloseSessionRequest& request,
     CloseSessionResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_CloseSession_,
-                                   context, request, response);
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_CloseSession_, context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::ListDevices(
     ::grpc::ClientContext* context, const ListDevicesRequest& request,
     ListDevicesResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_ListDevices_,
-                                   context, request, response);
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ListDevices_, context, request, response);
 }
 
 ::grpc::Status MasterService::Stub::Reset(::grpc::ClientContext* context,
                                           const ResetRequest& request,
                                           ResetResponse* response) {
-  return ::grpc::BlockingUnaryCall(channel_.get(), rpcmethod_Reset_, context,
-                                   request, response);
+  return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_Reset_,
+                                             context, request, response);
 }
 
 MasterService::AsyncService::AsyncService() {
   for (int i = 0; i < 7; ++i) {
-    AddMethod(new ::grpc::RpcServiceMethod(
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
-        ::grpc::RpcMethod::NORMAL_RPC,
-        nullptr));
+        ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
     ::grpc::Service::MarkMethodAsync(i);
   }
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 131de2863f95e86d519c381ef8e100a80fa6561a..412395c52635d5c3cda95dddea50f7cd2d8c8e4f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "grpc++/impl/codegen/stub_options.h"
 #include "grpc++/impl/codegen/sync_stream.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_namespace_compat.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -108,13 +107,13 @@ class MasterService final {
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
-    const ::grpc::RpcMethod rpcmethod_CreateSession_;
-    const ::grpc::RpcMethod rpcmethod_ExtendSession_;
-    const ::grpc::RpcMethod rpcmethod_PartialRunSetup_;
-    const ::grpc::RpcMethod rpcmethod_RunStep_;
-    const ::grpc::RpcMethod rpcmethod_CloseSession_;
-    const ::grpc::RpcMethod rpcmethod_ListDevices_;
-    const ::grpc::RpcMethod rpcmethod_Reset_;
+    const ::grpc::internal::RpcMethod rpcmethod_CreateSession_;
+    const ::grpc::internal::RpcMethod rpcmethod_ExtendSession_;
+    const ::grpc::internal::RpcMethod rpcmethod_PartialRunSetup_;
+    const ::grpc::internal::RpcMethod rpcmethod_RunStep_;
+    const ::grpc::internal::RpcMethod rpcmethod_CloseSession_;
+    const ::grpc::internal::RpcMethod rpcmethod_ListDevices_;
+    const ::grpc::internal::RpcMethod rpcmethod_Reset_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index a94f75418ebc0438261ca11dd2cbe417d1b38195..b3b05408b15e20ceb934267ccb66134133aff2fd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
@@ -36,36 +37,17 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Overload of GrpcParseProto so we can decode a TensorResponse without
-// extra copying.
-bool GrpcParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst) {
-  struct ByteSource : public TensorResponse::Source {
-    const ::grpc::ByteBuffer* buffer;
-    GrpcByteBufferSource src;
-    bool ok;
-
-    ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() override {
-      ok = src.Init(*buffer);
-      return &src;
-    }
-  };
-  ByteSource bs;
-  bs.buffer = &src;
-  return dst->ParseFrom(&bs).ok() && bs.ok;
-}
-
 class GrpcRemoteWorker : public WorkerInterface {
  public:
-  explicit GrpcRemoteWorker(GrpcCounter* live_rpc_counter,
-                            SharedGrpcChannelPtr channel,
+  explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
                             WorkerCacheLogger* logger)
-      : counter_(live_rpc_counter),
-        channel_(std::move(channel)),
+      : channel_(std::move(channel)),
         stub_(channel_),
         cq_(completion_queue),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
+        deleteworkersession_(Method(GrpcWorkerMethod::kDeleteWorkerSession)),
         registergraph_(Method(GrpcWorkerMethod::kRegisterGraph)),
         deregistergraph_(Method(GrpcWorkerMethod::kDeregisterGraph)),
         rungraph_(Method(GrpcWorkerMethod::kRunGraph)),
@@ -90,6 +72,12 @@ class GrpcRemoteWorker : public WorkerInterface {
     IssueRequest(request, response, createworkersession_, std::move(done));
   }
 
+  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+                                DeleteWorkerSessionResponse* response,
+                                StatusCallback done) override {
+    IssueRequest(request, response, deleteworkersession_, std::move(done));
+  }
+
   void RegisterGraphAsync(const RegisterGraphRequest* request,
                           RegisterGraphResponse* response,
                           StatusCallback done) override {
@@ -194,132 +182,31 @@ class GrpcRemoteWorker : public WorkerInterface {
   }
 
  private:
-  // Object allocated per active RPC.
-  template <class ResponseMessage>
-  class RPCState : public GrpcClientCQTag {
-   public:
-    RPCState(GrpcCounter* counter, ::grpc::GenericStub* stub,
-             ::grpc::CompletionQueue* cq, const ::grpc::string& method,
-             const protobuf::Message& request, ResponseMessage* response,
-             StatusCallback done, CallOptions* call_opts)
-        : counter_(counter), call_opts_(call_opts), done_(std::move(done)) {
-      // TODO(sanjay): The counter will no longer be needed once we
-      // get a GenericStub API which allows us to manage an entire
-      // RPC with a single completion event instead of four events.
-      counter_->Increment();
-      // The initialization and recovery protocols rely on blocking
-      // until we get a response.
-      context_.set_fail_fast(false);
-      if (call_opts) {
-        call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
-      }
-
-      failure_.store(false);
-      remaining_callbacks_.store(4);  // Init/Read/Write/Finish callbacks
-      response_ = response;
-      GrpcUnparseProto(request, &request_buf_);
-      // TODO(sanjay): When new enough grpc is available, enable the following:
-      //   context_.set_initial_metadata_corked(true);
-      // We can then skip the extra state transition for init callback.
-      call_ = std::move(stub->Call(&context_, method, cq, this));
-      call_initialized_.Notify();
-    }
-
-    // Called multiple times: when init done, read done, write done, call done.
-    void OnCompleted(bool ok) override {
-      if (!ok) failure_.store(true);
-      const int old_count = remaining_callbacks_.fetch_sub(1);
-      if (old_count > 1) {
-        if (old_count == 4) {
-          // Init callback finished.  Issue remaining ops.
-
-          // Annoyingly enough, the way the generic call API works is
-          // inherently racy.  We can get the following sequence of events:
-          //  1. stub->Call() starts.
-          //  2. some stuff happens inside grpc
-          //  3. grpc delivers the completion event
-          //  4. tensorflow event handling thread calls init metadata callback
-          //  5. stub->Call() finishes
-          //  6. the result of stub->Call() is stored in call_
-          // We are currently inside the callback and therefore need to
-          // wait for step 6 to finish before attempting to touch call_.
-          call_initialized_.WaitForNotification();
-
-          if (ok) {
-            // TODO(sanjay): Use WriteLast() when grpc version we are using
-            // is new enough.
-            call_->Write(request_buf_, this);
-            call_->Read(&response_buf_, this);
-          } else {
-            // Skip Write and Read.
-            remaining_callbacks_.fetch_sub(2);
-          }
-          call_->Finish(&status_, this);
-        }
-        // Still waiting for some more callbacks to finish.
-        return;
-      } else {  // old_count == 1, i.e., all callbacks have finished
-        // Last callback finished; clean up.
-        if (call_opts_) {
-          call_opts_->ClearCancelCallback();
-        }
-        Status s = FromGrpcStatus(status_);
-        if (s.ok() && failure_.load()) {
-          s.Update(errors::Internal("callback error"));
-        }
-        if (s.ok() && !GrpcParseProto(response_buf_, response_)) {
-          s.Update(errors::Internal("could not parse rpc response"));
-        }
-        if (!s.ok()) {
-          VLOG(2) << "Call returned with non-ok status: " << s;
-        }
-        done_(s);
-        counter_->Decrement();
-        delete this;
-      }
-    }
-
-   private:
-    GrpcCounter* const counter_;
-    CallOptions* call_opts_;
-    ::grpc::ClientContext context_;
-    std::unique_ptr<::grpc::GenericClientAsyncReaderWriter> call_;
-    ResponseMessage* response_;
-    ::grpc::ByteBuffer request_buf_;
-    ::grpc::ByteBuffer response_buf_;
-    ::grpc::Status status_;
-    StatusCallback done_;
-    std::atomic<bool> failure_;
-    std::atomic<int> remaining_callbacks_;
-    Notification call_initialized_;
-  };
-
   // Utility method for issuing a generic asynchronous request. The
   // given callback, `done`, will be called when the RPC completes.
   void IssueRequest(const protobuf::Message* request,
                     protobuf::Message* response, const ::grpc::string& method,
                     StatusCallback done, CallOptions* call_opts = nullptr) {
-    new RPCState<protobuf::Message>(counter_, &stub_, cq_, method, *request,
-                                    response, std::move(done), call_opts);
+    new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
+                                    std::move(done), call_opts);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
                     CallOptions* call_opts = nullptr) {
-    new RPCState<TensorResponse>(counter_, &stub_, cq_, method, *request,
-                                 response, std::move(done), call_opts);
+    new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
+                                 std::move(done), call_opts);
   }
 
   // Helper function for initializing the RpcMethod objects below.
   const char* Method(GrpcWorkerMethod id) { return GrpcWorkerMethodName(id); }
 
-  GrpcCounter* const counter_;
   SharedGrpcChannelPtr channel_;
   ::grpc::GenericStub stub_;
-
   ::grpc::CompletionQueue* cq_;
 
   const ::grpc::string getstatus_;
   const ::grpc::string createworkersession_;
+  const ::grpc::string deleteworkersession_;
   const ::grpc::string registergraph_;
   const ::grpc::string deregistergraph_;
   const ::grpc::string rungraph_;
@@ -335,12 +222,10 @@ class GrpcRemoteWorker : public WorkerInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker);
 };
 
-WorkerInterface* NewGrpcRemoteWorker(GrpcCounter* live_rpc_counter,
-                                     SharedGrpcChannelPtr channel,
+WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(live_rpc_counter, std::move(channel),
-                              completion_queue, logger);
+  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index 174dfcc7072f49c3831b74a90f602ebcfd87b453..8ad41335409e0a7f7576134ed12b1a233aa341e0 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -26,12 +26,10 @@ class CompletionQueue;
 
 namespace tensorflow {
 
-class GrpcCounter;
 class WorkerCacheLogger;
 class WorkerInterface;
 
-WorkerInterface* NewGrpcRemoteWorker(GrpcCounter* live_rpc_counter,
-                                     SharedGrpcChannelPtr channel,
+WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
                                      WorkerCacheLogger* logger);
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f80bdfb70d0f3054b35a17ee34ec53655ccccc1
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -0,0 +1,99 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
+
+#include <utility>
+
+#include "grpc++/generic/generic_stub.h"
+#include "grpc++/grpc++.h"
+
+#include "tensorflow/core/distributed_runtime/call_options.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/platform/notification.h"
+
+namespace tensorflow {
+
+// Object allocated per active RPC.
+template <class Response>
+class RPCState : public GrpcClientCQTag {
+ public:
+  // Default behavior is to set fail_fast = False and handle timeouts manually.
+  RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
+           const ::grpc::string& method, const protobuf::Message& request,
+           Response* response, StatusCallback done, CallOptions* call_opts)
+      : RPCState(stub, cq, method, request, response, std::move(done),
+                 call_opts, /*fail_fast=*/false, /*timeout_in_ms=*/0) {}
+
+  template <typename Request>
+  RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
+           const ::grpc::string& method, const Request& request,
+           Response* response, StatusCallback done, CallOptions* call_opts,
+           bool fail_fast, int64 timeout_in_ms)
+      : call_opts_(call_opts), done_(std::move(done)) {
+    context_.set_fail_fast(fail_fast);
+    if (timeout_in_ms > 0) {
+      context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
+    }
+
+    if (call_opts) {
+      call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
+    }
+
+    response_ = response;
+    GrpcMaybeUnparseProto(request, &request_buf_);
+    call_ =
+        std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
+    call_->StartCall();
+    call_->Finish(&response_buf_, &status_, this);
+  }
+
+  void OnCompleted(bool ok) override {
+    if (call_opts_) {
+      call_opts_->ClearCancelCallback();
+    }
+    Status s = FromGrpcStatus(status_);
+    if (s.ok() && !ok) {
+      // Since this function is only being used for processing the response
+      // to Finish for client-side unary calls, ok should never be false
+      s.Update(errors::Internal("unexpected ok value at rpc completion"));
+    }
+    if (s.ok() && !GrpcMaybeParseProto(response_buf_, response_)) {
+      s.Update(errors::Internal("could not parse rpc response"));
+    }
+    if (!s.ok()) {
+      VLOG(2) << "Call returned with non-ok status: " << s;
+    }
+    done_(s);
+    delete this;
+  }
+
+ private:
+  CallOptions* call_opts_;
+  ::grpc::ClientContext context_;
+  std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
+  Response* response_;
+  ::grpc::ByteBuffer request_buf_;
+  ::grpc::ByteBuffer response_buf_;
+  ::grpc::Status status_;
+  StatusCallback done_;
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_STATE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
index 5639691804f58f8dfaa0a2d0eba5e1095ffb1534..e51894b4c756b6f4cfc09fe0adf57e06cb22ee0f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
@@ -214,22 +214,13 @@ void EncodeTensorToByteBuffer(bool is_dead, const Tensor& val,
 
     if (tensor_data_is_large) {
       // (E) Encode tensor data, but by sharing backing store
-
-      // TODO(vpai): Use the pure C++ ::grpc::Slice constructor that uses
-      // grpc_slice_new_with_user_data once TensorFlow pins a version of gRPC
-      // that includes https://github.com/grpc/grpc/pull/12065
-
       const TensorBuffer* buf = DMAHelper::buffer(&val);
       buf->Ref();
       slices[1] = ::grpc::Slice(
-          grpc_slice_new_with_user_data(
-              const_cast<void*>(static_cast<const void*>(tdata.data())),
-              tdata.size(),
-              [](void* backing) {
-                static_cast<TensorBuffer*>(backing)->Unref();
-              },
-              const_cast<TensorBuffer*>(buf)),
-          ::grpc::Slice::STEAL_REF);
+          const_cast<void*>(static_cast<const void*>(tdata.data())),
+          tdata.size(),
+          [](void* backing) { static_cast<TensorBuffer*>(backing)->Unref(); },
+          const_cast<TensorBuffer*>(buf));
       num_slices += 1;
     }
     size_t total_bytes = 0;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index 00d911a5829465e79dea62b6bb07f641e76e54bc..c80728544b089016aa58ed9e4db7275eac03fd4a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 
 namespace tensorflow {
 
@@ -77,7 +78,8 @@ grpc::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
   return byte_count_;
 }
 
-void GrpcUnparseProto(const protobuf::Message& src, grpc::ByteBuffer* dst) {
+void GrpcMaybeUnparseProto(const protobuf::Message& src,
+                           grpc::ByteBuffer* dst) {
   // TODO(sanjay): For bigger protos, serialize into a ZeroCopyOutputStream.
   ::grpc::Slice s(src.ByteSizeLong());
   src.SerializeWithCachedSizesToArray(
@@ -86,31 +88,51 @@ void GrpcUnparseProto(const protobuf::Message& src, grpc::ByteBuffer* dst) {
   dst->Swap(&buffer);
 }
 
-bool GrpcParseProto(const grpc::ByteBuffer& src, protobuf::Message* dst) {
+// GrpcMaybeUnparseProto from a string simply copies the string to the
+// ByteBuffer.
+void GrpcMaybeUnparseProto(const string& src, grpc::ByteBuffer* dst) {
+  ::grpc::Slice s(src.data(), src.size());
+  ::grpc::ByteBuffer buffer(&s, 1);
+  dst->Swap(&buffer);
+}
+
+bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, protobuf::Message* dst) {
   GrpcByteBufferSource stream;
   if (!stream.Init(src)) return false;
   return dst->ParseFromZeroCopyStream(&stream);
 }
 
-void GrpcCounter::Increment() {
-  mutex_lock l(mu_);
-  counter_++;
+// Overload of GrpcParseProto so we can decode a TensorResponse without
+// extra copying.  This overload is used by the RPCState class in
+// grpc_state.h.
+bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst) {
+  struct ByteSource : public TensorResponse::Source {
+    const ::grpc::ByteBuffer* buffer;
+    GrpcByteBufferSource src;
+    bool ok;
+
+    ::tensorflow::protobuf::io::ZeroCopyInputStream* contents() override {
+      ok = src.Init(*buffer);
+      return &src;
+    }
+  };
+  ByteSource bs;
+  bs.buffer = &src;
+  return dst->ParseFrom(&bs).ok() && bs.ok;
 }
 
-void GrpcCounter::Decrement() {
-  mutex_lock l(mu_);
-  DCHECK_GT(counter_, 0);
-  counter_--;
-  if (counter_ == 0) {
-    empty_.notify_all();
+// GrpcMaybeParseProto into a string simply copies bytes into the string.
+bool GrpcMaybeParseProto(const grpc::ByteBuffer& src, string* dst) {
+  dst->clear();
+  dst->reserve(src.Length());
+  std::vector<::grpc::Slice> slices;
+  if (!src.Dump(&slices).ok()) {
+    return false;
   }
-}
-
-void GrpcCounter::WaitUntilUnused() {
-  mutex_lock l(mu_);
-  while (counter_ != 0) {
-    empty_.wait(l);
+  for (const ::grpc::Slice& s : slices) {
+    dst->append(reinterpret_cast<const char*>(s.begin()), s.size());
   }
+  return true;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 64bc960536d3f99058c33e50fae09b58c5663382..ac0a33a2b9cbe2ba415a0f6cd7d94aee1fb142ac 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -21,16 +21,37 @@ limitations under the License.
 #include "grpc++/grpc++.h"
 #include "grpc++/impl/codegen/proto_utils.h"
 #include "grpc++/support/byte_buffer.h"
+#include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
+constexpr char kStreamRemovedMessage[] = "Stream removed";
+
+// Identify if the given grpc::Status corresponds to an HTTP stream removed
+// error (see chttp2_transport.cc).
+//
+// When auto-reconnecting to a remote TensorFlow worker after it restarts, gRPC
+// can return an UNKNOWN error code with a "Stream removed" error message.
+// This should not be treated as an unrecoverable error.
+//
+// N.B. This is dependent on the error message from grpc remaining consistent.
+inline bool IsStreamRemovedError(const ::grpc::Status& s) {
+  return !s.ok() && s.error_code() == ::grpc::StatusCode::UNKNOWN &&
+         s.error_message() == kStreamRemovedMessage;
+}
+
 inline Status FromGrpcStatus(const ::grpc::Status& s) {
   if (s.ok()) {
     return Status::OK();
   } else {
+    // Convert "UNKNOWN" stream removed errors into unavailable, to allow
+    // for retry upstream.
+    if (IsStreamRemovedError(s)) {
+      return Status(tensorflow::error::UNAVAILABLE, s.error_message());
+    }
     return Status(static_cast<tensorflow::error::Code>(s.error_code()),
                   s.error_message());
   }
@@ -50,10 +71,20 @@ typedef std::shared_ptr<::grpc::Channel> SharedGrpcChannelPtr;
 inline string GrpcIdKey() { return "tf-rpc"; }
 
 // Serialize src and store in *dst.
-void GrpcUnparseProto(const protobuf::Message& src, ::grpc::ByteBuffer* dst);
+void GrpcMaybeUnparseProto(const protobuf::Message& src,
+                           ::grpc::ByteBuffer* dst);
 
 // Parse contents of src and initialize *dst with them.
-bool GrpcParseProto(const ::grpc::ByteBuffer& src, protobuf::Message* dst);
+bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, protobuf::Message* dst);
+
+// Specialization for TensorResponse
+bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, TensorResponse* dst);
+
+// Copy string src to grpc buffer *dst.
+void GrpcMaybeUnparseProto(const string& src, ::grpc::ByteBuffer* dst);
+
+// Copy grpc buffer src to string *dst.
+bool GrpcMaybeParseProto(const ::grpc::ByteBuffer& src, string* dst);
 
 // A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
 class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
@@ -73,29 +104,6 @@ class GrpcByteBufferSource : public ::grpc::protobuf::io::ZeroCopyInputStream {
   ::grpc::protobuf::int64 byte_count_;
 };
 
-// GrpcCounter is used to delay shutdown until all active RPCs are done.
-class GrpcCounter {
- public:
-  GrpcCounter() {}
-
-  GrpcCounter(const GrpcCounter&) = delete;
-  GrpcCounter& operator=(const GrpcCounter&) = delete;
-
-  // Increment the count of live RPCs.
-  void Increment();
-
-  // Decrement the count of live RPCs.
-  void Decrement();
-
-  // Wait until count of live RPCs is zero.
-  void WaitUntilUnused();
-
- private:
-  mutex mu_;
-  condition_variable empty_;
-  int counter_ = 0;
-};
-
 }  // namespace tensorflow
 
 #endif  // THIRD_PARTY_TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_UTIL_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
index 1a98f1d887a6e1a6a0da331d357c40b7ae27241c..5356fb36e4bb214513b5da8b1c7ac841af8db045 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util_test.cc
@@ -67,7 +67,20 @@ TEST(GrpcProto, Unparse) {
   proto.add_container("hello");
   proto.add_container("world");
   grpc::ByteBuffer buf;
-  GrpcUnparseProto(proto, &buf);
+  GrpcMaybeUnparseProto(proto, &buf);
+  CleanupAllRequest parsed;
+  ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
+  ASSERT_EQ(proto.DebugString(), parsed.DebugString());
+}
+
+TEST(GrpcProto, UnparseToString) {
+  CleanupAllRequest proto;
+  proto.add_container("hello");
+  proto.add_container("world");
+  string str;
+  CHECK(proto.SerializeToString(&str));
+  grpc::ByteBuffer buf;
+  GrpcMaybeUnparseProto(str, &buf);
   CleanupAllRequest parsed;
   ASSERT_TRUE(parsed.ParseFromString(ToString(buf)));
   ASSERT_EQ(proto.DebugString(), parsed.DebugString());
@@ -90,7 +103,33 @@ TEST(GrpcProto, Parse) {
     CleanupAllRequest proto = MakeProto(c.length);
     ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
     CleanupAllRequest parsed;
-    ASSERT_TRUE(GrpcParseProto(src, &parsed)) << c.length << " " << c.slices;
+    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed))
+        << c.length << " " << c.slices;
+    ASSERT_EQ(proto.DebugString(), parsed.DebugString());
+  }
+}
+
+TEST(GrpcProto, ParseFromString) {
+  // Test with serialization broken up into a bunch of slices.
+  struct Case {
+    int length;
+    int slices;
+  };
+  for (Case c : std::vector<Case>{
+           {0, 1},
+           {20, 1},
+           {100, 1},
+           {1 << 20, 1},
+           {100, 5},
+           {10000, 50},
+       }) {
+    CleanupAllRequest proto = MakeProto(c.length);
+    ::grpc::ByteBuffer src = MakeBuffer(proto.SerializeAsString(), c.slices);
+    string parsed_str;
+    CleanupAllRequest parsed;
+    ASSERT_TRUE(GrpcMaybeParseProto(src, &parsed_str))
+        << c.length << " " << c.slices;
+    ASSERT_TRUE(parsed.ParseFromString(parsed_str));
     ASSERT_EQ(proto.DebugString(), parsed.DebugString());
   }
 }
@@ -101,7 +140,7 @@ static void BM_UnparseGrpc(int iters, int size) {
   testing::StartTiming();
   for (int i = 0; i < iters; i++) {
     grpc::ByteBuffer buf;
-    GrpcUnparseProto(proto, &buf);
+    GrpcMaybeUnparseProto(proto, &buf);
   }
   testing::StopTiming();
 }
@@ -128,7 +167,7 @@ static void BM_ParseGrpc(int iters, int size, int num_slices) {
   testing::StartTiming();
 
   for (int i = 0; i < iters; i++) {
-    CHECK(GrpcParseProto(buf, &proto));
+    CHECK(GrpcMaybeParseProto(buf, &proto));
   }
 
   testing::StopTiming();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index 06695db77905d64dfb60c39ef879e409e3cc8f9a..a7b93e04607fe2dbb9bd87b372441607b5a19b0c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -51,9 +51,6 @@ class GrpcWorkerCache : public WorkerCachePartial {
 
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
-    // Wait until all live rpcs are done since otherwise the completion
-    // queue shutdown will interfere with rpc operation.
-    live_rpc_counter_.WaitUntilUnused();
     completion_queue_.Shutdown();
     delete polling_thread_;  // Blocks until thread exits.
     delete channel_cache_;
@@ -69,8 +66,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
     } else {
       SharedGrpcChannelPtr channel = channel_cache_->FindWorkerChannel(target);
       if (!channel) return nullptr;
-      return NewGrpcRemoteWorker(&live_rpc_counter_, channel,
-                                 &completion_queue_, &logger_);
+      return NewGrpcRemoteWorker(channel, &completion_queue_, &logger_);
     }
   }
 
@@ -94,7 +90,6 @@ class GrpcWorkerCache : public WorkerCachePartial {
  private:
   const string local_target_;
   WorkerInterface* const local_worker_;  // Not owned.
-  GrpcCounter live_rpc_counter_;
   GrpcChannelCache* channel_cache_;  // Owned.
   ::grpc::CompletionQueue completion_queue_;
   Thread* polling_thread_;  // Owned.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 4ee5ae090174ce8986a471ad4a79147c0ca74419..eee93ec65726b416fdf8d4fe8a339c0fc3bf2d48 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -114,6 +114,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
     // types.
     ENQUEUE_REQUEST(GetStatus, false);
     ENQUEUE_REQUEST(CreateWorkerSession, false);
+    ENQUEUE_REQUEST(DeleteWorkerSession, false);
     ENQUEUE_REQUEST(CleanupAll, false);
     ENQUEUE_REQUEST(RegisterGraph, false);
     ENQUEUE_REQUEST(DeregisterGraph, false);
@@ -192,6 +193,16 @@ class GrpcWorkerService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(CreateWorkerSession, false);
   }
 
+  void DeleteWorkerSessionHandler(
+      WorkerCall<DeleteWorkerSessionRequest, DeleteWorkerSessionResponse>*
+          call) {
+    Schedule([this, call]() {
+      Status s = worker_->DeleteWorkerSession(&call->request, &call->response);
+      call->SendResponse(ToGrpcStatus(s));
+    });
+    ENQUEUE_REQUEST(DeleteWorkerSession, false);
+  }
+
   void CleanupAllHandler(
       WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
     Schedule([this, call]() {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
index 80a2f89337c6914dd871c4df346016d70d0f4093..05a9db10d3c379cae3926cf375d36d039538c5f5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
@@ -32,6 +32,8 @@ const char* GrpcWorkerMethodName(GrpcWorkerMethod id) {
       return "/tensorflow.WorkerService/GetStatus";
     case GrpcWorkerMethod::kCreateWorkerSession:
       return "/tensorflow.WorkerService/CreateWorkerSession";
+    case GrpcWorkerMethod::kDeleteWorkerSession:
+      return "/tensorflow.WorkerService/DeleteWorkerSession";
     case GrpcWorkerMethod::kRegisterGraph:
       return "/tensorflow.WorkerService/RegisterGraph";
     case GrpcWorkerMethod::kDeregisterGraph:
@@ -58,9 +60,9 @@ namespace grpc {
 
 WorkerService::AsyncService::AsyncService() {
   for (int i = 0; i < kGrpcNumWorkerMethods; ++i) {
-    AddMethod(new ::grpc::RpcServiceMethod(
+    AddMethod(new ::grpc::internal::RpcServiceMethod(
         GrpcWorkerMethodName(static_cast<GrpcWorkerMethod>(i)),
-        ::grpc::RpcMethod::NORMAL_RPC, nullptr));
+        ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
     ::grpc::Service::MarkMethodAsync(i);
   }
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index c8a8b5778e8ad98f9237d0b7f4f04f19beb1ac11..fb23f8631fd17a7533fde01cde9453dc8ea8505a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,7 +26,6 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_namespace_compat.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -111,6 +110,7 @@ namespace tensorflow {
 enum class GrpcWorkerMethod {
   kGetStatus,
   kCreateWorkerSession,
+  kDeleteWorkerSession,
   kRegisterGraph,
   kDeregisterGraph,
   kRunGraph,
diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc
index 0b7fed79cd8fc8f987858beb957b64b461b6272a..7d308bb723a71e23482b6f52fa6d8fa53f89dda8 100644
--- a/tensorflow/core/distributed_runtime/server_lib.cc
+++ b/tensorflow/core/distributed_runtime/server_lib.cc
@@ -24,7 +24,7 @@ namespace tensorflow {
 
 namespace {
 mutex* get_server_factory_lock() {
-  static mutex server_factory_lock;
+  static mutex server_factory_lock(LINKER_INITIALIZED);
   return &server_factory_lock;
 }
 
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index b97749dc41e46e500da2e656406a6b0362013969..fabcbd00f5e59a68a8db54c441dcc74377c44617 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -20,7 +20,10 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
+#include "tensorflow/core/distributed_runtime/worker_cache_wrapper.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
 
 namespace tensorflow {
 
@@ -29,7 +32,10 @@ SessionMgr::SessionMgr(
     std::unique_ptr<WorkerCacheInterface> default_worker_cache,
     WorkerCacheFactory worker_cache_factory)
     : worker_env_(worker_env),
-      legacy_session_("", default_worker_name, std::move(default_worker_cache),
+      default_worker_cache_(std::move(default_worker_cache)),
+      legacy_session_("", default_worker_name,
+                      std::unique_ptr<WorkerCacheInterface>(
+                          new WorkerCacheWrapper(default_worker_cache_.get())),
                       std::unique_ptr<DeviceMgr>(worker_env->device_mgr),
                       std::unique_ptr<GraphMgr>(
                           new GraphMgr(worker_env, worker_env->device_mgr))),
@@ -41,7 +47,8 @@ string SessionMgr::WorkerNameFromServerDef(const ServerDef& server_def) {
 }
 
 Status SessionMgr::CreateSession(const string& session,
-                                 const ServerDef& server_def) {
+                                 const ServerDef& server_def,
+                                 bool isolate_session_state) {
   mutex_lock l(mu_);
   if (session.empty()) {
     return errors::InvalidArgument("Session must be non-empty.");
@@ -50,12 +57,18 @@ Status SessionMgr::CreateSession(const string& session,
   const string worker_name = WorkerNameFromServerDef(server_def);
 
   WorkerCacheInterface* worker_cache = nullptr;
-  TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+  if (server_def.cluster().job().empty()) {
+    worker_cache = new WorkerCacheWrapper(default_worker_cache_.get());
+  } else {
+    TF_RETURN_IF_ERROR(worker_cache_factory_(server_def, &worker_cache));
+  }
 
+  CHECK(!worker_env_->local_devices.empty())
+      << "The WorkerEnv must have at least one device in `local_devices`.";
   std::vector<Device*> renamed_devices;
   for (Device* d : worker_env_->local_devices) {
-    renamed_devices.push_back(
-        RenamedDevice::NewRenamedDevice(worker_name, d, false));
+    renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
+        worker_name, d, false, isolate_session_state));
   }
   std::unique_ptr<DeviceMgr> device_mgr(new DeviceMgr(renamed_devices));
 
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index c44bca7b7a407957b1a36d7659f2b35ea0b30d07..d85b6c305941014fb52c4b4da6d646a707054c3a 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -45,7 +45,8 @@ class SessionMgr {
   ~SessionMgr() {}
 
   // Allocates state for a new session.
-  Status CreateSession(const string& session, const ServerDef& server_def);
+  Status CreateSession(const string& session, const ServerDef& server_def,
+                       bool isolate_session_state);
 
   // Locates the worker session for a given session handle
   WorkerSession* WorkerSessionForSession(const string& session);
@@ -71,6 +72,7 @@ class SessionMgr {
   // legacy_session_ is deleted. Further, we must ensure that WorkerSession's
   // device_mgr is deleted after WorkerSession's graph_mgr.
 
+  std::unique_ptr<WorkerCacheInterface> default_worker_cache_;
   WorkerSession legacy_session_;
 
   const WorkerCacheFactory worker_cache_factory_;
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 7132f123a5943d0680743f3cc3bc17470f49d65d..ffe4809f2b10398ca4c7dc503dd82236cbc8dd18 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -22,14 +22,36 @@ limitations under the License.
 
 namespace tensorflow {
 
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
+
+  static std::unique_ptr<Device> MakeCPU(const string& name) {
+    DeviceAttributes device_attributes;
+    device_attributes.set_name(name);
+    device_attributes.set_device_type(DeviceType("FakeCPU").type());
+    return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+  }
+};
+
 class SessionMgrTest : public ::testing::Test {
  protected:
   SessionMgrTest()
-      : mgr_(&env_, "/job:mnist/replica:0/task:0",
-             std::unique_ptr<WorkerCacheInterface>(),
-             factory_),
-        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {}
+      : device_(FakeDevice::MakeCPU(
+            "/job:mnist/replica:0/task:0/device:fakecpu:0")),
+        mgr_(&env_, "/job:mnist/replica:0/task:0",
+             std::unique_ptr<WorkerCacheInterface>(), factory_),
+        legacy_session_(mgr_.WorkerSessionForSession("novel_session_id")) {
+    env_.local_devices = {device_.get()};
+  }
 
+  std::unique_ptr<Device> device_;
   WorkerEnv env_;
   SessionMgr::WorkerCacheFactory factory_ =
       [](const ServerDef& server_def, WorkerCacheInterface** worker_cache) {
@@ -42,14 +64,48 @@ class SessionMgrTest : public ::testing::Test {
 
 TEST_F(SessionMgrTest, CreateSessionSimple) {
   ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+
   string session_handle = "test_session_handle";
-  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def));
+  TF_EXPECT_OK(mgr_.CreateSession(session_handle, server_def, true));
   WorkerSession* session = mgr_.WorkerSessionForSession(session_handle);
   EXPECT_NE(nullptr, session) << "Session for " << session_handle << "was null";
   EXPECT_NE(mgr_.LegacySession(), session);
   TF_EXPECT_OK(mgr_.DeleteSession(session_handle));
 }
 
+TEST_F(SessionMgrTest, CreateSessionIsolateSessionState) {
+  ServerDef server_def;
+  server_def.set_job_name("worker");
+  server_def.set_task_index(3);
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_1", server_def, false));
+  WorkerSession* session_1 = mgr_.WorkerSessionForSession("handle_1");
+  std::vector<Device*> devices_1 = session_1->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_1.size());
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_2", server_def, false));
+  WorkerSession* session_2 = mgr_.WorkerSessionForSession("handle_2");
+  std::vector<Device*> devices_2 = session_2->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_2.size());
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_3", server_def, true));
+  WorkerSession* session_3 = mgr_.WorkerSessionForSession("handle_3");
+  std::vector<Device*> devices_3 = session_3->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_3.size());
+
+  TF_EXPECT_OK(mgr_.CreateSession("handle_4", server_def, true));
+  WorkerSession* session_4 = mgr_.WorkerSessionForSession("handle_4");
+  std::vector<Device*> devices_4 = session_4->device_mgr->ListDevices();
+  EXPECT_EQ(1, devices_4.size());
+
+  EXPECT_EQ(devices_1[0]->resource_manager(), devices_2[0]->resource_manager());
+  EXPECT_NE(devices_1[0]->resource_manager(), devices_3[0]->resource_manager());
+  EXPECT_NE(devices_1[0]->resource_manager(), devices_4[0]->resource_manager());
+  EXPECT_NE(devices_3[0]->resource_manager(), devices_4[0]->resource_manager());
+}
+
 TEST_F(SessionMgrTest, LegacySession) {
   ServerDef server_def;
   string session_handle = "";
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index 94d54a2b16bb38c44f656455749579c364bb6424..fe2d1a12934dde814344b70f52fbc972f74347e0 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 
 #include "google/protobuf/any.pb.h"
+
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index b7c57937368544549fd9f460916b4145526a7fe5..6cd92f5fe7a9edaef1ed7db0926281d1a91cdcf2 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -44,7 +44,15 @@ void Worker::CreateWorkerSessionAsync(const CreateWorkerSessionRequest* request,
                                       CreateWorkerSessionResponse* response,
                                       StatusCallback done) {
   Status s = env_->session_mgr->CreateSession(request->session_handle(),
-                                              request->server_def());
+                                              request->server_def(),
+                                              request->isolate_session_state());
+  done(s);
+}
+
+void Worker::DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+                                      DeleteWorkerSessionResponse* response,
+                                      StatusCallback done) {
+  Status s = env_->session_mgr->DeleteSession(request->session_handle());
   done(s);
 }
 
@@ -132,7 +140,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
     return;
   }
   StepStatsCollector* collector = nullptr;
-  if (request->exec_opts().record_timeline() ||
+  if (request->exec_opts().report_tensor_allocations_upon_oom() ||
+      request->exec_opts().record_timeline() ||
       request->exec_opts().record_costs()) {
     collector = new StepStatsCollector(response->mutable_step_stats());
     // TODO(mrry,pbar): GPU tracing for distributed steps.
diff --git a/tensorflow/core/distributed_runtime/worker.h b/tensorflow/core/distributed_runtime/worker.h
index 07300338c3871f2d85ae5a50595f1996bcc77f67..c62347926fa11c135b6116d17f6545007e9f6115 100644
--- a/tensorflow/core/distributed_runtime/worker.h
+++ b/tensorflow/core/distributed_runtime/worker.h
@@ -52,6 +52,10 @@ class Worker : public WorkerInterface {
                                 CreateWorkerSessionResponse* response,
                                 StatusCallback done) override;
 
+  void DeleteWorkerSessionAsync(const DeleteWorkerSessionRequest* request,
+                                DeleteWorkerSessionResponse* response,
+                                StatusCallback done) override;
+
   void RegisterGraphAsync(const RegisterGraphRequest* request,
                           RegisterGraphResponse* response,
                           StatusCallback done) override;
diff --git a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..43c3b6285b9d1a76d5207537ccd1343928c59010
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+
+namespace tensorflow {
+
+class WorkerCacheWrapper : public WorkerCacheInterface {
+ public:
+  WorkerCacheWrapper(WorkerCacheInterface* wrapped) : wrapped_(wrapped) {}
+
+  // Updates *workers with strings naming the remote worker tasks to
+  // which open channels have been established.
+  virtual void ListWorkers(std::vector<string>* workers) const {
+    return wrapped_->ListWorkers(workers);
+  }
+
+  // If "target" names a remote task for which an RPC channel exists
+  // or can be constructed, returns a pointer to a WorkerInterface object
+  // wrapping that channel. The returned value must be destroyed by
+  // calling `this->ReleaseWorker(target, ret)`
+  // TODO(mrry): rename this to GetOrCreateWorker() or something that
+  // makes it more obvious that this method returns a potentially
+  // shared object.
+  virtual WorkerInterface* CreateWorker(const string& target) {
+    return wrapped_->CreateWorker(target);
+  }
+
+  // Release a worker previously returned by this->CreateWorker(target).
+  //
+  // TODO(jeff,sanjay): Consider moving target into WorkerInterface.
+  // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a
+  //                    per-rpc-subsystem WorkerInterface creator.
+  virtual void ReleaseWorker(const string& target, WorkerInterface* worker) {
+    return wrapped_->ReleaseWorker(target, worker);
+  }
+
+  // Set *locality with the DeviceLocality of the specified remote device
+  // within its local environment.  Returns true if *locality
+  // was set, using only locally cached data.  Returns false
+  // if status data for that device was not available.  Never blocks.
+  virtual bool GetDeviceLocalityNonBlocking(const string& device,
+                                            DeviceLocality* locality) {
+    return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
+  }
+
+  // Set *locality with the DeviceLocality of the specified remote device
+  // within its local environment.  Callback gets Status::OK if *locality
+  // was set.
+  virtual void GetDeviceLocalityAsync(const string& device,
+                                      DeviceLocality* locality,
+                                      StatusCallback done) {
+    return wrapped_->GetDeviceLocalityAsync(device, locality, std::move(done));
+  }
+
+  // Start/stop logging activity.
+  virtual void SetLogging(bool active) { wrapped_->SetLogging(active); }
+
+  // Discard any saved log data.
+  virtual void ClearLogs() { wrapped_->ClearLogs(); }
+
+  // Return logs for the identified step in *ss.  Any returned data will no
+  // longer be stored.
+  virtual bool RetrieveLogs(int64 step_id, StepStats* ss) {
+    return wrapped_->RetrieveLogs(step_id, ss);
+  }
+
+ private:
+  WorkerCacheInterface* wrapped_;  // Not owned.
+};
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_WORKER_CACHE_WRAPPER_H_
diff --git a/tensorflow/core/distributed_runtime/worker_interface.h b/tensorflow/core/distributed_runtime/worker_interface.h
index c9db28ec67f86d469c16427aa9343a2a1d36c0e7..4c58bf41a461160a6ea258aee207fffff01aa99d 100644
--- a/tensorflow/core/distributed_runtime/worker_interface.h
+++ b/tensorflow/core/distributed_runtime/worker_interface.h
@@ -44,6 +44,10 @@ class WorkerInterface {
       const CreateWorkerSessionRequest* request,
       CreateWorkerSessionResponse* response, StatusCallback done) = 0;
 
+  virtual void DeleteWorkerSessionAsync(
+      const DeleteWorkerSessionRequest* request,
+      DeleteWorkerSessionResponse* response, StatusCallback done) = 0;
+
   virtual void RegisterGraphAsync(const RegisterGraphRequest* request,
                                   RegisterGraphResponse* response,
                                   StatusCallback done) = 0;
@@ -118,6 +122,11 @@ class WorkerInterface {
     return CallAndWait(&ME::CreateWorkerSessionAsync, request, response);
   }
 
+  Status DeleteWorkerSession(const DeleteWorkerSessionRequest* request,
+                             DeleteWorkerSessionResponse* response) {
+    return CallAndWait(&ME::DeleteWorkerSessionAsync, request, response);
+  }
+
   Status RegisterGraph(const RegisterGraphRequest* request,
                        RegisterGraphResponse* response) {
     return CallAndWait(&ME::RegisterGraphAsync, request, response);
diff --git a/tensorflow/core/framework/bfloat16.cc b/tensorflow/core/framework/bfloat16.cc
index a5ac0e1a8df1ce0e6e622ae62d2ee8012fff58b7..0efe43fde2dadd42aa03d3bf2968d2cbfb113e8d 100644
--- a/tensorflow/core/framework/bfloat16.cc
+++ b/tensorflow/core/framework/bfloat16.cc
@@ -21,13 +21,13 @@ void FloatToBFloat16(const float* src, bfloat16* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p += 2, q++, size--) {  
-      *q = p[0];  
-    }  
+    for (; size != 0; p += 2, q++, size--) {
+      *q = p[0];
+    }
 #else
-    for (; size != 0; p += 2, q++, size--) {  
-     *q = p[1];  
-    }  
+    for (; size != 0; p += 2, q++, size--) {
+     *q = p[1];
+    }
 #endif
 }
 
@@ -35,15 +35,15 @@ void BFloat16ToFloat(const bfloat16* src, float* dst, int64 size) {
   const uint16_t* p = reinterpret_cast<const uint16_t*>(src);
   uint16_t* q = reinterpret_cast<uint16_t*>(dst);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    for (; size != 0; p++, q += 2, size--) {  
-      q[0] = *p;  
-      q[1] = 0;  
+    for (; size != 0; p++, q += 2, size--) {
+      q[0] = *p;
+      q[1] = 0;
+    }
+#else
+    for (; size != 0; p++, q += 2, size--) {
+      q[0] = 0;
+      q[1] = *p;
     }
-#else  
-    for (; size != 0; p++, q += 2, size--) {  
-      q[0] = 0;  
-      q[1] = *p;  
-    } 
 #endif
 }
 
diff --git a/tensorflow/core/framework/bfloat16.h b/tensorflow/core/framework/bfloat16.h
index b936e899d4ce71de91af0934ccec982013dde658..968c18bdd2159fee4eb6982c62697951d79b706c 100644
--- a/tensorflow/core/framework/bfloat16.h
+++ b/tensorflow/core/framework/bfloat16.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"
 
-#if defined(PLATFORM_WINDOWS)  
-#include "tensorflow/core/platform/windows/cpu_info.h"  
-#endif  
+#if defined(PLATFORM_WINDOWS)
+#include "tensorflow/core/platform/windows/cpu_info.h"
+#endif
 
 // Compact 16-bit encoding of floating point numbers. This representation uses
 // 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa.  It
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index af4e6a4411633ff7b4ddde504d35729c56f058fa..17e6209f8e5ad5240dfc8ca1def75c178da45c27 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/bfloat16.h"
 
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -27,6 +29,66 @@ TEST(Bfloat16Test, Simple) {
   EXPECT_EQ(0x4140, a.value);
 }
 
+float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa,
+                    uint32_t low_mantissa) {
+  return bit_cast<float>((sign << 31) + (exponent << 23) +
+                         (high_mantissa << 16) + low_mantissa);
+}
+
+struct Bfloat16TestParam {
+  float input;
+  float expected;
+};
+
+class Bfloat16Test : public ::testing::Test,
+                     public ::testing::WithParamInterface<Bfloat16TestParam> {};
+
+TEST_P(Bfloat16Test, TruncateTest) {
+  bfloat16 a(GetParam().input);
+  if (std::isnan(GetParam().input)) {
+    EXPECT_TRUE(std::isnan(float(a)) || std::isinf(float(a)));
+    return;
+  }
+  EXPECT_EQ(GetParam().expected, float(a));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    Bfloat16Test_Instantiation, Bfloat16Test,
+    ::testing::Values(
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b1111010111000011),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b1111010111000011),
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000001),
+            BinaryToFloat(0, 0b11111111, 0b0000000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b11111111, 0b1111111, 0b1111111111111111),
+            BinaryToFloat(0, 0b11111111, 0b1111111, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b1100000000000000),
+            BinaryToFloat(1, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0100000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b10000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b00000000, 0b1001000, 0b1000000000000000),
+            BinaryToFloat(0, 0b00000000, 0b1001000, 0b0000000000000000)},
+        Bfloat16TestParam{
+            BinaryToFloat(0, 0b00000000, 0b1111111, 0b1100000000000000),
+            BinaryToFloat(0, 0b00000000, 0b1111111, 0b0000000000000000)}));
+
 TEST(Bfloat16Test, Conversion) {
   float a[100];
   for (int i = 0; i < 100; ++i) {
@@ -43,6 +105,17 @@ TEST(Bfloat16Test, Conversion) {
   }
 }
 
+TEST(Bfloat16Test, Epsilon) {
+  EXPECT_LT(1.0f, static_cast<float>(bfloat16::epsilon() + bfloat16(1.0f)));
+  EXPECT_EQ(1.0f, static_cast<float>((bfloat16::epsilon() / bfloat16(2.0f)) +
+                                     bfloat16(1.0f)));
+}
+
+TEST(Bfloat16Test, Negate) {
+  EXPECT_EQ(-3.0f, static_cast<float>(-bfloat16(3.0f)));
+  EXPECT_EQ(4.5f, static_cast<float>(-bfloat16(-4.5f)));
+}
+
 static void BM_FloatToBFloat16(int iters) {
   testing::StopTiming();
   static const int N = 32 << 20;
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 315c99d32bf855d5f0941f0e5c76bb0548208257..036e3473b14fbdba10cb850e0c04b53745933db6 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -397,6 +397,15 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(
       CheckFormatConstraintsOnShape(data_format, filter_shape, "filter", c));
 
+  std::vector<int32> dilations;
+  TF_RETURN_IF_ERROR(c->GetAttr("dilations", &dilations));
+
+  if (dilations.size() != 4) {
+    return errors::InvalidArgument(
+        "Conv2D requires the dilation attribute to contain 4 values, but got: ",
+        dilations.size());
+  }
+
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
 
@@ -410,6 +419,8 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
 
   const int32 stride_rows = GetTensorDim(strides, data_format, 'H');
   const int32 stride_cols = GetTensorDim(strides, data_format, 'W');
+  const int32 dilation_rows = GetTensorDim(dilations, data_format, 'H');
+  const int32 dilation_cols = GetTensorDim(dilations, data_format, 'W');
 
   DimensionHandle batch_size_dim;
   DimensionHandle input_depth_dim;
@@ -447,12 +458,12 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
   DimensionHandle output_rows, output_cols;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[0],
-                                                   filter_rows_dim, stride_rows,
-                                                   padding, &output_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDims(c, input_spatial_dims[1],
-                                                   filter_cols_dim, stride_cols,
-                                                   padding, &output_cols));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, input_spatial_dims[0], filter_rows_dim, dilation_rows, stride_rows,
+      padding, &output_rows));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
+      c, input_spatial_dims[1], filter_cols_dim, dilation_cols, stride_cols,
+      padding, &output_cols));
 
   ShapeHandle output_shape;
   TF_RETURN_IF_ERROR(
@@ -1232,6 +1243,8 @@ Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
         dims.push_back(dim_y);
       } else if (c->Value(dim_y) == 1) {
         dims.push_back(dim_x);
+      } else if (dim_y.SameHandle(dim_x)) {
+        dims.push_back(dim_x);
       } else {
         dims.push_back(c->UnknownDim());
       }
@@ -1305,6 +1318,9 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
 
 Status ScatterNdUpdateShape(InferenceContext* c) {
   ShapeHandle input_shape = c->input(0);
+  if (c->input_handle_shapes_and_types(0) != nullptr) {
+    input_shape = (*c->input_handle_shapes_and_types(0))[0].shape;
+  }
   ShapeHandle indices_shape;
   TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
   ShapeHandle updates_shape;
@@ -1359,7 +1375,9 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
     }
   }
 
-  c->set_output(0, input_shape);
+  if (c->input_handle_shapes_and_types(0) == nullptr) {
+    c->set_output(0, input_shape);
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index ec9746b2af1ed0da348fbe7459c5d93d842b25d9..5f3e5ad45731750bfd73181c41cd029f23aab55f 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -423,6 +423,15 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
                     .Finalize(&op.node_def));
   };
 
+  // Invalid rank for input
+  INFER_ERROR("must be rank 4", op, "[4,4];[2,1,1,1]");
+  // Invalid rank for filter
+  INFER_ERROR("must be rank 4", op, "[1,4,4,1];[2,1,1]");
+
+  // Invalid value for strides
+  set_op({{1, 1, 0, 1}}, "VALID", "NHWC", "HWIO");
+  INFER_ERROR("must be > 0", op, "[1,2,2,1];[1,1,1,1]");
+
   // 1x1 filter
   set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO");
   INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
@@ -443,11 +452,6 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   set_op({{1, 1, 2, 1}}, "VALID", "NHWC", "HWIO");
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
 
-  // Invalid rank for input
-  INFER_ERROR("must be rank 4", op, "[4,4];[2,1,1,1]");
-  // Invalid rank for filter
-  INFER_ERROR("must be rank 4", op, "[1,4,4,1];[2,1,1]");
-
   // Unknown dims in the critical fields lead to partial inference.
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
   INFER_OK(op, "[1,?,4,1];[2,1,1,1]", "[d0_0,?,2,d1_3]");
@@ -538,6 +542,98 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,2,2,d1_3]");
 }
 
+TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
+  ShapeInferenceTestOp op("Conv2D");
+  auto set_op = [&op](const std::vector<int32>& dilations,
+                      const std::vector<int32>& strides, const string& padding,
+                      const string& data_format) {
+    TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
+                    .Input("input", 0, DT_FLOAT)
+                    .Input("filter", 0, DT_FLOAT)
+                    .Attr("dilations", dilations)
+                    .Attr("strides", strides)
+                    .Attr("padding", padding)
+                    .Attr("data_format", data_format)
+                    .Finalize(&op.node_def));
+  };
+
+  // Invalid rank for dilation
+  set_op({{1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_ERROR("contain 4 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Invalid value for dilation
+  set_op({{1, 0, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_ERROR("must be >= 1", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Tests for NHWC
+  // 1x1 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 1x1 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,2,4,d1_3]");
+
+  // 1x1 filter, 2x1 dilations, 2x2 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 2, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 3x3 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,1,3,d1_3]");
+
+  // 3x3 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 2, 1, 1}}, {{1, 2, 1, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,1,3,d1_3]");
+
+  // 3x3 filter, 1x2 dilations, 2x2 strides
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "VALID", "NHWC");
+  INFER_OK(op, "[1,5,5,1];[3,3,1,1]", "[d0_0,2,1,d1_3]");
+
+  // Tests for NCHW
+  // 1x1 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,2,2];[1,1,1,1]", "[d0_0,d1_3,2,2]");
+
+  // 1x1 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,4,4];[1,1,1,1]", "[d0_0,d1_3,2,4]");
+
+  // 1x1 filter, 2x1 dilations, 2x2 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 2}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,4,4];[1,1,1,1]", "[d0_0,d1_3,2,2]");
+
+  // 3x3 filter, 2x1 dilations, 1x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 1, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,1,3]");
+
+  // 3x3 filter, 2x1 dilations, 2x1 strides
+  set_op({{1, 1, 2, 1}}, {{1, 1, 2, 1}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,1,3]");
+
+  // 3x3 filter, 1x2 dilations, 2x2 strides
+  set_op({{1, 1, 1, 2}}, {{1, 1, 2, 2}}, "VALID", "NCHW");
+  INFER_OK(op, "[1,1,5,5];[3,3,1,1]", "[d0_0,d1_3,2,1]");
+
+  // Some tests for "SAME" padding
+
+  // 4x4 input, 1x1 filter, 2x1 dilations, 1x1 stride
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // 3x3 input, 2x2 filter, 2x2 dilations, 1x1 stride
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 1x2 dilations, 2x2 stride
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,2,2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+}
+
 TEST(CommonShapeFnsTest, Conv3DShapeTest) {
   ShapeInferenceTestOp op("Conv3D");
   auto set_op = [&op](const std::vector<int32>& strides,
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 33bd5d250cd6b5df8c933e3f353efd9a1eee592c..1838a8ad02d2bd5522ce3162fea53e3f5afc0309 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -145,6 +145,12 @@ class DeviceBase {
     return gpu_device_info_;
   }
 
+  // The preferred thread pool for this device. If it is nullptr, the system
+  // automatically assigns a thread pool for execution.
+  virtual thread::ThreadPool* tensorflow_device_thread_pool() {
+    return device_thread_pool_;
+  }
+
   // Does not take ownership.
   void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d) {
     eigen_cpu_device_ = d;
@@ -215,10 +221,17 @@ class DeviceBase {
     return errors::Internal("Device does not implement MakeTensorFromProto()");
   }
 
+ protected:
+  // Does not take ownership.
+  void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
+    device_thread_pool_ = thread_pool;
+  }
+
  private:
   Env* const env_;
   CpuWorkerThreads* cpu_worker_threads_ = nullptr;
   GpuDeviceInfo* gpu_device_info_ = nullptr;
+  thread::ThreadPool* device_thread_pool_ = nullptr;
   Eigen::ThreadPoolDevice* eigen_cpu_device_ = nullptr;
 #ifdef TENSORFLOW_USE_SYCL
   Eigen::SyclDevice* eigen_sycl_device_ = nullptr;
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 305b140a446171ddc4b249c97967057aa3e00152..1a579ab63125ff5abc2f76d06187482234a54b9c 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -243,13 +243,24 @@ uint64 FunctionDefHash(const FunctionDef& fdef);
 // address spaces.
 string Canonicalize(const string& funcname, AttrSlice attrs);
 
+class CallFrameInterface {
+ public:
+  virtual ~CallFrameInterface() {}
+
+  virtual size_t num_args() const = 0;
+  virtual size_t num_retvals() const = 0;
+
+  virtual Status GetArg(int index, Tensor* val) const = 0;
+  virtual Status SetRetval(int index, const Tensor& val) = 0;
+};
+
 // Represents a function call frame. I.e., the data structure used to
 // pass arguments to a function and retrieve its results.
 //
 // Runtime must arrange accesses to one FunctionCallFrame s.t.
 //   1. SetArgs() happens before any GetArg();
 //   2. GetRetvals happens after all SetRetval();
-class FunctionCallFrame {
+class FunctionCallFrame : public CallFrameInterface {
  public:
   FunctionCallFrame(DataTypeSlice arg_types, DataTypeSlice ret_types);
   ~FunctionCallFrame();
@@ -259,9 +270,12 @@ class FunctionCallFrame {
   Status GetRetvals(std::vector<Tensor>* rets) const;
   Status ConsumeRetvals(std::vector<Tensor>* rets);
 
+  size_t num_args() const override { return arg_types_.size(); }
+  size_t num_retvals() const override { return ret_types_.size(); }
+
   // Callee methods.
-  Status GetArg(int index, Tensor* val) const;
-  Status SetRetval(int index, const Tensor& val);
+  Status GetArg(int index, Tensor* val) const override;
+  Status SetRetval(int index, const Tensor& val) override;
 
  private:
   DataTypeVector arg_types_;
@@ -408,6 +422,9 @@ class FunctionLibraryRuntime {
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
                              Handle* handle) = 0;
 
+  // Releases state associated with the handle.
+  virtual Status ReleaseHandle(Handle handle) = 0;
+
   // Returns the function body for the instantiated function given its
   // handle 'h'. Returns nullptr if "h" is not found.
   //
@@ -453,6 +470,8 @@ class FunctionLibraryRuntime {
   virtual void Run(const Options& opts, Handle handle,
                    gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
                    DoneCallback done) = 0;
+  virtual void Run(const Options& opts, Handle handle,
+                   CallFrameInterface* call_frame, DoneCallback done) = 0;
 
   // Creates a "kernel" for the given node def "ndef".
   //
diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc
index f825335300881a0bf506ec461b6e6313fefe8cdd..b9e33b148f71cd6b1856cf55436a7e73df9df059 100644
--- a/tensorflow/core/framework/load_library.cc
+++ b/tensorflow/core/framework/load_library.cc
@@ -45,7 +45,7 @@ struct Library {
 // perform initialization again, so the OpList would be empty.
 Status LoadLibrary(const char* library_filename, void** result,
                    const void** buf, size_t* len) {
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   static std::unordered_map<string, Library> loaded_libs;
   Env* env = Env::Default();
   Library library;
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index f039497f13bc2118a024a123446a52420e2f3cf5..477184022df4bb7e4d329cc5ed09572f9dbe9585 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -243,6 +243,10 @@ DEFINE_GET_ATTR(Tensor, tensor, "tensor", emplace_back, t, Tensor t;
 DEFINE_GET_ATTR(NameAttrList, func, "func", emplace_back, v, ;);
 #undef DEFINE_GET_ATTR
 
+bool HasNodeAttr(const NodeDef& node_def, StringPiece attr_name) {
+  return node_def.attr().find(attr_name.ToString()) != node_def.attr().end();
+}
+
 static const string& kEmptyString = *new string();
 
 const string& GetNodeAttrString(const AttrSlice& attrs, StringPiece attr_name) {
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 523b5382954f5b7ae2bf2420e72ead67f4baa994..f6f28aac4811d30b845191735536b389e41bf259 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -157,6 +157,9 @@ class AttrSlice {
   const AttrValueMap* attrs_;
 };
 
+// Return true if the attr with the name attr_name is defined in node_def.
+bool HasNodeAttr(const NodeDef& node_def, StringPiece attr_name);
+
 // Look up the attr with name attr_name and set *value to its value.  If no
 // attr with attr_name is found in node_def, or the attr does not have
 // a matching type, a non-ok status will be returned.
diff --git a/tensorflow/core/framework/numeric_types.h b/tensorflow/core/framework/numeric_types.h
index a630bee38d8825ff8cb405ef36be05f8e9368629..7e8aeb5001c68c7fa807ebf6de68b3ac6bb77b88 100644
--- a/tensorflow/core/framework/numeric_types.h
+++ b/tensorflow/core/framework/numeric_types.h
@@ -44,7 +44,12 @@ typedef Eigen::QUInt16 quint16;
 // see framework/bfloat16.h for description.
 struct bfloat16 {
   EIGEN_DEVICE_FUNC bfloat16() {}
+
   EIGEN_DEVICE_FUNC explicit bfloat16(const float v) {
+    if (Eigen::numext::isnan(v)) {
+      value = NAN_VALUE;
+      return;
+    }
     const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     value = p[0];
@@ -53,20 +58,132 @@ struct bfloat16 {
 #endif
   }
 
+  template <class T>
+  explicit EIGEN_DEVICE_FUNC bfloat16(const T& val)
+      : bfloat16(static_cast<float>(val)) {}
+
+  EIGEN_DEVICE_FUNC explicit operator float() const {
+    float result;
+
+    uint16_t* q = reinterpret_cast<uint16_t*>(&result);
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    q[0] = value;
+    q[1] = 0;
+#else
+    q[0] = 0;
+    q[1] = value;
+#endif
+    return result;
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator bool() const {
+    return static_cast<bool>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator Eigen::half() const {
+    return static_cast<Eigen::half>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator short() const {
+    return static_cast<short>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator int() const {
+    return static_cast<int>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator long() const {
+    return static_cast<long>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator char() const {
+    return static_cast<char>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator signed char() const {
+    return static_cast<signed char>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator unsigned char() const {
+    return static_cast<unsigned char>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator unsigned int() const {
+    return static_cast<unsigned int>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator unsigned long() const {
+    return static_cast<unsigned long>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator unsigned long long() const {
+    return static_cast<unsigned long long>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator long long() const {
+    return static_cast<long long>(float(*this));
+  }
+
+  EIGEN_DEVICE_FUNC explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+  static bfloat16 epsilon() {
+    bfloat16 x;
+    x.value = 0x3c00;  // 0x1.0p-7
+    return x;
+  }
+
   uint16_t value;
+
+  // A value that represents "not a number".
+  static const uint16_t NAN_VALUE = 0x7FC0;
 };
 
+inline bfloat16 operator+(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) + static_cast<float>(b));
+}
+inline bfloat16 operator-(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) - static_cast<float>(b));
+}
+inline bfloat16 operator*(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) * static_cast<float>(b));
+}
+inline bfloat16 operator/(bfloat16 a, bfloat16 b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+inline bfloat16 operator-(bfloat16 a) {
+  a.value ^= 0x8000;
+  return a;
+}
+inline bool operator<(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+inline bool operator<=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+inline bool operator==(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) == static_cast<float>(b);
+}
+inline bool operator!=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+inline bool operator>(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+inline bool operator>=(bfloat16 a, bfloat16 b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+
 }  // end namespace tensorflow
 
 namespace Eigen {
 template <>
 struct NumTraits<tensorflow::bfloat16> : GenericNumTraits<uint16_t> {};
 
-EIGEN_STRONG_INLINE bool operator==(const tensorflow::bfloat16 a,
-                                    const tensorflow::bfloat16 b) {
-  return a.value == b.value;
-}
-
+using ::tensorflow::operator==;
+using ::tensorflow::operator!=;
 }  // namespace Eigen
 
 #ifdef COMPILER_MSVC
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index 4f5a1f80a025744f4b2189aa3216304a36b99044..fadb60d744217daa0c569601c437146a70f9b4d5 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -63,26 +63,32 @@ Status OpRegistry::LookUp(const string& op_type_name,
   const OpRegistrationData* res = nullptr;
 
   bool first_call = false;
+  bool first_unregistered = false;
   {  // Scope for lock.
     mutex_lock lock(mu_);
     first_call = MustCallDeferred();
     res = gtl::FindWithDefault(registry_, op_type_name, nullptr);
+
+    static bool unregistered_before = false;
+    first_unregistered = !unregistered_before && (res == nullptr);
+    if (first_unregistered) {
+      unregistered_before = true;
+    }
     // Note: Can't hold mu_ while calling Export() below.
   }
   if (first_call) {
     TF_QCHECK_OK(ValidateKernelRegistrations(*this));
   }
   if (res == nullptr) {
-    static bool first_unregistered = true;
     if (first_unregistered) {
       OpList op_list;
       Export(true, &op_list);
       if (VLOG_IS_ON(3)) {
-         LOG(INFO) << "All registered Ops:";
-         for (const auto& op : op_list.op())
-            LOG(INFO) << SummarizeOpDef(op);
+        LOG(INFO) << "All registered Ops:";
+        for (const auto& op : op_list.op()) {
+          LOG(INFO) << SummarizeOpDef(op);
+        }
       }
-      first_unregistered = false;
     }
     Status status =
         errors::NotFound("Op type not registered '", op_type_name,
diff --git a/tensorflow/core/framework/op_def_builder_test.cc b/tensorflow/core/framework/op_def_builder_test.cc
index c1511ebe340d99fc67f588596e028cca92e23250..9b24e3aa00425321eda2e196b1e7b243a552c730 100644
--- a/tensorflow/core/framework/op_def_builder_test.cc
+++ b/tensorflow/core/framework/op_def_builder_test.cc
@@ -124,22 +124,23 @@ TEST_F(OpDefBuilderTest, AttrWithRestrictions) {
       "attr: { name: 'a' type: 'type' allowed_values { list { type: "
       "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
       "DT_UINT16, DT_INT8, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, "
-      "DT_QINT32, DT_UINT32, DT_UINT64] } } }");
+      "DT_QINT32, DT_UINT32, DT_UINT64, DT_BFLOAT16] } } }");
   ExpectSuccess(
       b().Attr("a:{numbertype, variant}"),
       "attr: { name: 'a' type: 'type' allowed_values { list { type: "
       "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
       "DT_UINT16, DT_INT8, DT_COMPLEX64, DT_COMPLEX128, DT_QINT8, DT_QUINT8, "
-      "DT_QINT32, DT_UINT32, DT_UINT64, DT_VARIANT] } } }");
+      "DT_QINT32, DT_UINT32, DT_UINT64, DT_BFLOAT16, DT_VARIANT] } } }");
   ExpectSuccess(b().Attr("a:realnumbertype"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, "
-                "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64] } } }");
+                "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64, "
+                "DT_BFLOAT16] } } }");
   ExpectSuccess(b().Attr("a:{realnumbertype,  variant , string, }"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, "
                 "DT_INT16, DT_UINT16, DT_INT8, DT_UINT32, DT_UINT64, "
-                "DT_VARIANT, DT_STRING] } } }");
+                "DT_BFLOAT16, DT_VARIANT, DT_STRING] } } }");
   ExpectSuccess(b().Attr("a:quantizedtype"),
                 "attr: { name: 'a' type: 'type' allowed_values { list { type: "
                 "[DT_QINT8, DT_QUINT8, DT_QINT32, DT_QINT16, DT_QUINT16]} } }");
@@ -216,12 +217,14 @@ TEST_F(OpDefBuilderTest, AttrListOfRestricted) {
       b().Attr("a:list(realnumbertype)"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
       "[DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
-      "DT_UINT16, DT_INT8, DT_HALF, DT_UINT32, DT_UINT64] } } }");
+      "DT_UINT16, DT_INT8, DT_HALF, DT_BFLOAT16, DT_UINT32, DT_UINT64"
+      "] } } }");
   ExpectSuccess(
       b().Attr("a:list({realnumbertype, variant})"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
       "[DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8, DT_INT16, "
-      "DT_UINT16, DT_INT8, DT_HALF, DT_UINT32, DT_UINT64, DT_VARIANT] } } }");
+      "DT_UINT16, DT_INT8, DT_HALF, DT_BFLOAT16, DT_UINT32, DT_UINT64, "
+      "DT_VARIANT] } } }");
   ExpectSuccess(
       b().Attr("a:list(quantizedtype)"),
       "attr: { name: 'a' type: 'list(type)' allowed_values { list { type: "
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 2f737a0f16985f7e08fb5306243b0543b6c347a0..29feda499fd2646a00c1f5bc9fc7223e9f134af9 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -161,6 +161,15 @@ OpDef::AttrDef* FindAttrMutable(StringPiece name, OpDef* op_def) {
   return nullptr;
 }
 
+const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def) {
+  for (int i = 0; i < op_def.input_arg_size(); ++i) {
+    if (op_def.input_arg(i).name() == name) {
+      return &op_def.input_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 #define VALIDATE(EXPR, ...)                                          \
   do {                                                               \
     if (!(EXPR)) {                                                   \
@@ -323,7 +332,7 @@ Status CheckOpDeprecation(const OpDef& op_def, int graph_def_version) {
           ". ", dep.explanation(), ".");
     } else {
       // Warn only once for each op name, and do it in a threadsafe manner.
-      static mutex mu;
+      static mutex mu(LINKER_INITIALIZED);
       static std::unordered_set<string> warned;
       bool warn;
       {
diff --git a/tensorflow/core/framework/op_def_util.h b/tensorflow/core/framework/op_def_util.h
index c329e4627cc8c592d411e9b95c49809034ee2949..f9661dceddc1a3de694024dddb9afce1cae8680c 100644
--- a/tensorflow/core/framework/op_def_util.h
+++ b/tensorflow/core/framework/op_def_util.h
@@ -43,6 +43,10 @@ Status ValidateAttrValue(const AttrValue& attr_value,
 const OpDef::AttrDef* FindAttr(StringPiece name, const OpDef& op_def);
 OpDef::AttrDef* FindAttrMutable(StringPiece name, OpDef* op_def);
 
+// Searches op_def for input argument with the indicated name.
+// Returns nullptr if no such attr is found.
+const OpDef::ArgDef* FindInputArg(StringPiece name, const OpDef& op_def);
+
 // Produce a human-readable version of an op_def that is more concise
 // than a text-format proto.  Excludes descriptions.
 string SummarizeOpDef(const OpDef& op_def);
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 1e93e9be0955c9d62588e009e5a6d899ce33698d..acff74070da92cc7f298560b7bb81a812924cb0f 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -84,7 +84,7 @@ static bool SplitAt(char split_ch, StringPiece* orig,
   auto pos = orig->find(split_ch);
   if (pos == StringPiece::npos) {
     *before_split = *orig;
-    orig->clear();
+    *orig = StringPiece();
     return false;
   } else {
     *before_split = orig->substr(0, pos);
@@ -236,7 +236,7 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
         unescaped.push_back('\n');
       }
       strings::StrAppend(&unescaped, line);
-      line.clear();
+      line = StringPiece();
     }
 
     // Escape what we extracted and then output it in quotes.
@@ -281,6 +281,9 @@ static void StringReplace(const string& from, const string& to, string* s) {
     } else {
       split.push_back(s->substr(pos, found - pos));
       pos = found + from.size();
+      if (pos == s->size()) {  // handle case where `from` is at the very end.
+        split.push_back("");
+      }
     }
   }
   // Join the pieces back together with a new delimiter.
@@ -316,6 +319,36 @@ static void RenameInDocs(const string& from, const string& to, OpDef* op_def) {
   }
 }
 
+static void RenameInDocs(const string& from, const string& to,
+                         ApiDef* api_def) {
+  const string from_quoted = strings::StrCat("`", from, "`");
+  const string to_quoted = strings::StrCat("`", to, "`");
+  for (int i = 0; i < api_def->in_arg_size(); ++i) {
+    if (!api_def->in_arg(i).description().empty()) {
+      StringReplace(from_quoted, to_quoted,
+                    api_def->mutable_in_arg(i)->mutable_description());
+    }
+  }
+  for (int i = 0; i < api_def->out_arg_size(); ++i) {
+    if (!api_def->out_arg(i).description().empty()) {
+      StringReplace(from_quoted, to_quoted,
+                    api_def->mutable_out_arg(i)->mutable_description());
+    }
+  }
+  for (int i = 0; i < api_def->attr_size(); ++i) {
+    if (!api_def->attr(i).description().empty()) {
+      StringReplace(from_quoted, to_quoted,
+                    api_def->mutable_attr(i)->mutable_description());
+    }
+  }
+  if (!api_def->summary().empty()) {
+    StringReplace(from_quoted, to_quoted, api_def->mutable_summary());
+  }
+  if (!api_def->description().empty()) {
+    StringReplace(from_quoted, to_quoted, api_def->mutable_description());
+  }
+}
+
 const OpGenOverride* OpGenOverrideMap::ApplyOverride(OpDef* op_def) const {
   // Look up
   const auto iter = map_.find(op_def->name());
@@ -521,6 +554,7 @@ Status MergeApiDefs(ApiDef* base_api_def, const ApiDef& new_api_def) {
           ". All elements in arg_order override must match base arg_order: ",
           str_util::Join(base_api_def->arg_order(), ", "));
     }
+
     base_api_def->clear_arg_order();
     std::copy(
         new_api_def.arg_order().begin(), new_api_def.arg_order().end(),
@@ -595,19 +629,42 @@ Status ApiDefMap::LoadApiDef(const string& api_def_file_contents) {
   ApiDefs api_defs;
   protobuf::TextFormat::ParseFromString(contents, &api_defs);
   for (const auto& api_def : api_defs.op()) {
-    // Check if the op definition is already loaded.
+    // Check if the op definition is loaded. If op definition is not
+    // loaded, then we just skip this ApiDef.
     if (map_.find(api_def.graph_op_name()) != map_.end()) {
       // Overwrite current api def with data in api_def.
       TF_RETURN_IF_ERROR(MergeApiDefs(&map_[api_def.graph_op_name()], api_def));
-    } else {
-      return errors::FailedPrecondition(
-          "Unexpected ApiDef override: ", api_def.graph_op_name(),
-          " is not defined in base ApiDef.");
     }
   }
   return Status::OK();
 }
 
+void ApiDefMap::UpdateDocs() {
+  for (auto& name_and_api_def : map_) {
+    auto& api_def = name_and_api_def.second;
+    CHECK_GT(api_def.endpoint_size(), 0);
+    const string canonical_name = api_def.endpoint(0).name();
+    if (api_def.graph_op_name() != canonical_name) {
+      RenameInDocs(api_def.graph_op_name(), canonical_name, &api_def);
+    }
+    for (const auto& in_arg : api_def.in_arg()) {
+      if (in_arg.name() != in_arg.rename_to()) {
+        RenameInDocs(in_arg.name(), in_arg.rename_to(), &api_def);
+      }
+    }
+    for (const auto& out_arg : api_def.out_arg()) {
+      if (out_arg.name() != out_arg.rename_to()) {
+        RenameInDocs(out_arg.name(), out_arg.rename_to(), &api_def);
+      }
+    }
+    for (const auto& attr : api_def.attr()) {
+      if (attr.name() != attr.rename_to()) {
+        RenameInDocs(attr.name(), attr.rename_to(), &api_def);
+      }
+    }
+  }
+}
+
 const tensorflow::ApiDef* ApiDefMap::GetApiDef(const string& name) const {
   return gtl::FindOrNull(map_, name);
 }
diff --git a/tensorflow/core/framework/op_gen_lib.h b/tensorflow/core/framework/op_gen_lib.h
index efb287477bedde9bfbdef8e318bf6804e79f1ac5..1ede3af8d7cf8f591ba3927f7fc99d646629109d 100644
--- a/tensorflow/core/framework/op_gen_lib.h
+++ b/tensorflow/core/framework/op_gen_lib.h
@@ -106,6 +106,12 @@ class ApiDefMap {
   // passed to the constructor.
   Status LoadApiDef(const string& api_def_file_contents);
 
+  // Updates ApiDef docs. For example, if ApiDef renames an argument
+  // or attribute, applies these renames to descriptions as well.
+  // UpdateDocs should only be called once after all ApiDefs are loaded
+  // since it replaces original op names.
+  void UpdateDocs();
+
   // Look up ApiDef proto based on the given graph op name.
   // If graph op name is not in this ApiDefMap, returns nullptr.
   //
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
index da9b4dfbb1738c855c0bfc4752853d5d501d80a8..857b1c8dbcac66899f98bb4f2ef87f65f7442f6b 100644
--- a/tensorflow/core/framework/op_gen_lib_test.cc
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -410,8 +410,8 @@ op {
 
   ApiDefMap api_map(op_list);
   TF_CHECK_OK(api_map.LoadApiDef(kTestApiDef));
-  auto status = api_map.LoadApiDef(api_def1);
-  ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
+  TF_CHECK_OK(api_map.LoadApiDef(api_def1));
+  ASSERT_EQ(nullptr, api_map.GetApiDef("different_testop"));
 }
 
 TEST(OpGenLibTest, ApiDefInvalidArgOrder) {
@@ -455,5 +455,62 @@ op {
   status = api_map.LoadApiDef(api_def3);
   ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
 }
+
+TEST(OpGenLibTest, ApiDefUpdateDocs) {
+  const string op_list1 = R"(op {
+  name: "testop"
+  input_arg {
+    name: "arg_a"
+    description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  output_arg {
+    name: "arg_c"
+    description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  attr {
+    name: "attr_a"
+    description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  description: "`arg_a`, `arg_c`, `attr_a`, `testop`"
+}
+)";
+
+  const string api_def1 = R"(
+op {
+  graph_op_name: "testop"
+  endpoint {
+    name: "testop2"
+  }
+  in_arg {
+    name: "arg_a"
+    rename_to: "arg_aa"
+  }
+  out_arg {
+    name: "arg_c"
+    rename_to: "arg_cc"
+    description: "New description: `arg_a`, `arg_c`, `attr_a`, `testop`"
+  }
+  attr {
+    name: "attr_a"
+    rename_to: "attr_aa"
+  }
+}
+)";
+  OpList op_list;
+  protobuf::TextFormat::ParseFromString(op_list1, &op_list);  // NOLINT
+  ApiDefMap api_map(op_list);
+  TF_CHECK_OK(api_map.LoadApiDef(api_def1));
+  api_map.UpdateDocs();
+
+  const string expected_description =
+      "`arg_aa`, `arg_cc`, `attr_aa`, `testop2`";
+  EXPECT_EQ(expected_description, api_map.GetApiDef("testop")->description());
+  EXPECT_EQ(expected_description,
+            api_map.GetApiDef("testop")->in_arg(0).description());
+  EXPECT_EQ("New description: " + expected_description,
+            api_map.GetApiDef("testop")->out_arg(0).description());
+  EXPECT_EQ(expected_description,
+            api_map.GetApiDef("testop")->attr(0).description());
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 30e3b7ef59599ce69cc5383f1443d2bdf3e20cf9..4d410809e77bd6ba7cd24f78c0ef2f97fa54e588 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -192,6 +192,10 @@ OpKernelConstruction::OpKernelConstruction(
       graph_def_version_(graph_def_version),
       status_(status) {}
 
+bool OpKernelConstruction::HasAttr(StringPiece attr_name) const {
+  return HasNodeAttr(def(), attr_name);
+}
+
 void OpKernelConstruction::SetStatus(const Status& status) {
   status_->Update(status);
 }
@@ -622,8 +626,10 @@ Status OpKernelContext::allocate_tensor(
   Tensor new_tensor(a, type, shape, logged_attr);
 
   if (!new_tensor.IsInitialized()) {
-    return errors::ResourceExhausted("OOM when allocating tensor with shape",
-                                     shape.DebugString());
+    return errors::ResourceExhausted(
+        "OOM when allocating tensor with shape", shape.DebugString(),
+        " and type ", DataTypeString(type), " on ", params_->device->name(),
+        " by allocator ", a->Name());
   }
   if (params_->log_memory) {
     LogMemory::RecordTensorAllocation(params_->op_kernel->name(),
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 7eec84e26c758cc48eefc49d0b616100fe458247..3a9a6121c05b02e0f7724dc77adbddca22f0ff19 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -61,7 +61,7 @@ class TensorSliceReaderCacheWrapper;
 }  // namespace checkpoint
 
 class AsyncOpKernel;
-class FunctionCallFrame;
+class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
 class OpKernelContext;       // declared below
@@ -301,6 +301,9 @@ class OpKernelConstruction {
   template <class T>
   Status GetAttr(StringPiece attr_name, T* value) const;
 
+  // Return true if the attr_name is defined in def().
+  bool HasAttr(StringPiece attr_name) const;
+
   // Return the device type.
   const DeviceType& device_type() const { return device_type_; }
 
@@ -545,7 +548,7 @@ class OpKernelContext {
     FrameAndIter frame_iter;
 
     // Function call supports.
-    FunctionCallFrame* call_frame = nullptr;
+    CallFrameInterface* call_frame = nullptr;
     FunctionLibraryRuntime* function_library = nullptr;
     std::function<void(std::function<void()>)>* runner = nullptr;
     StepStatsCollector* stats_collector = nullptr;
@@ -927,7 +930,7 @@ class OpKernelContext {
   //
   // If this kernel invocation is within a function execution,
   // call_frame() returns the call frame for the function call.
-  FunctionCallFrame* call_frame() const { return params_->call_frame; }
+  CallFrameInterface* call_frame() const { return params_->call_frame; }
 
   // If not nullptr, the kernel invoke functions defined in the
   // library. E.g., CHECK_NOTNULL(function_library())->Run("Foo", ...).
@@ -1489,10 +1492,12 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
 // }
 
 #define OP_REQUIRES(CTX, EXP, STATUS) \
-  if (!TF_PREDICT_TRUE(EXP)) {        \
-    (CTX)->CtxFailure((STATUS));      \
-    return;                           \
-  }
+  do {                                \
+    if (!TF_PREDICT_TRUE(EXP)) {      \
+      (CTX)->CtxFailure((STATUS));    \
+      return;                         \
+    }                                 \
+  } while (0)
 
 #define OP_REQUIRES_OK(CTX, ...)          \
   do {                                    \
@@ -1504,11 +1509,13 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
   } while (0)
 
 #define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK) \
-  if (!TF_PREDICT_TRUE(EXP)) {                        \
-    (CTX)->CtxFailure((STATUS));                      \
-    (CALLBACK)();                                     \
-    return;                                           \
-  }
+  do {                                                \
+    if (!TF_PREDICT_TRUE(EXP)) {                      \
+      (CTX)->CtxFailure((STATUS));                    \
+      (CALLBACK)();                                   \
+      return;                                         \
+    }                                                 \
+  } while (0)
 
 #define OP_REQUIRES_OK_ASYNC(CTX, STATUS, CALLBACK) \
   do {                                              \
diff --git a/tensorflow/core/framework/register_types.h b/tensorflow/core/framework/register_types.h
index c31ab18cc12f699d9295b0688e59db775be6b5d8..4bb37e4f6ede54b96f34963890b56ae8774edced 100644
--- a/tensorflow/core/framework/register_types.h
+++ b/tensorflow/core/framework/register_types.h
@@ -87,7 +87,8 @@ limitations under the License.
 
 #elif defined(__ANDROID_TYPES_FULL__)
 
-// Only half, float, int32, int64, bool, and quantized types are supported.
+// Only string, half, float, int32, int64, bool, and quantized types
+// supported.
 #define TF_CALL_float(m) m(float)
 #define TF_CALL_double(m)
 #define TF_CALL_int32(m) m(::tensorflow::int32)
@@ -96,7 +97,7 @@ limitations under the License.
 #define TF_CALL_int16(m)
 
 #define TF_CALL_int8(m)
-#define TF_CALL_string(m)
+#define TF_CALL_string(m) m(string)
 #define TF_CALL_resource(m)
 #define TF_CALL_variant(m)
 #define TF_CALL_complex64(m)
diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc
index a9e4c1cfb16d3114d301bc79d23b11b8139f7fa5..90756a4f2fceb366f2ec0eb991adc31dcf884d99 100644
--- a/tensorflow/core/framework/rendezvous.cc
+++ b/tensorflow/core/framework/rendezvous.cc
@@ -36,15 +36,15 @@ namespace tensorflow {
 Rendezvous::ParsedKey& Rendezvous::ParsedKey::operator=(const ParsedKey& b) {
   const char* b_base = b.buf_.data();
   buf_ = b.buf_;
-  src_device.set(buf_.data() + (b.src_device.data() - b_base),
-                 b.src_device.size());
+  src_device = StringPiece(buf_.data() + (b.src_device.data() - b_base),
+                           b.src_device.size());
   src = b.src;
   src_incarnation = b.src_incarnation;
-  dst_device.set(buf_.data() + (b.dst_device.data() - b_base),
-                 b.dst_device.size());
+  dst_device = StringPiece(buf_.data() + (b.dst_device.data() - b_base),
+                           b.dst_device.size());
   dst = b.dst;
-  edge_name.set(buf_.data() + (b.edge_name.data() - b_base),
-                b.edge_name.size());
+  edge_name = StringPiece(buf_.data() + (b.edge_name.data() - b_base),
+                          b.edge_name.size());
   return *this;
 }
 
@@ -104,9 +104,9 @@ Status Rendezvous::ParseKey(StringPiece key, ParsedKey* out) {
       strings::HexStringToUint64(parts[1], &out->src_incarnation) &&
       DeviceNameUtils::ParseFullName(parts[2], &out->dst) &&
       !parts[3].empty()) {
-    out->src_device.set(parts[0].data(), parts[0].size());
-    out->dst_device.set(parts[2].data(), parts[2].size());
-    out->edge_name.set(parts[3].data(), parts[3].size());
+    out->src_device = StringPiece(parts[0].data(), parts[0].size());
+    out->dst_device = StringPiece(parts[2].data(), parts[2].size());
+    out->edge_name = StringPiece(parts[3].data(), parts[3].size());
     return Status::OK();
   }
   return errors::InvalidArgument("Invalid  rendezvous key: ", key);
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index ffa235d15c09a2a621558c6941aac7485e4bf737..c13f13a126f148fa6d23dcb80c2fae8e8ecbcf3c 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -342,8 +342,8 @@ Status InferenceContext::WithRank(ShapeHandle shape, int64 rank,
     for (int i = 0; i < rank; ++i) {
       dims.push_back(UnknownDim());
     }
-    *out = shape_manager_.MakeShape(dims);
-    return Status::OK();
+    ShapeHandle shp = shape_manager_.MakeShape(dims);
+    return Merge(shape, shp, out);
   }
   *out = nullptr;
 
@@ -357,13 +357,10 @@ Status InferenceContext::WithRankAtLeast(ShapeHandle shape, int64 rank,
     return errors::InvalidArgument("Rank cannot exceed kint32max");
   }
   const int32 existing = Rank(shape);
-  if (existing >= rank) {
+  if (existing >= rank || existing == kUnknownRank) {
     *out = shape;
     return Status::OK();
   }
-  if (existing == kUnknownRank) {
-    return ReturnUnknownShape(out);
-  }
   *out = nullptr;
   return errors::InvalidArgument("Shape must be at least rank ", rank,
                                  " but is rank ", existing);
@@ -375,10 +372,7 @@ Status InferenceContext::WithRankAtMost(ShapeHandle shape, int64 rank,
     return errors::InvalidArgument("Rank cannot exceed kint32max");
   }
   const int32 existing = Rank(shape);
-  if (existing == kUnknownRank) {
-    return ReturnUnknownShape(out);
-  }
-  if (existing <= rank) {
+  if (existing <= rank || existing == kUnknownRank) {
     *out = shape;
     return Status::OK();
   }
@@ -395,34 +389,52 @@ Status InferenceContext::WithValue(DimensionHandle dim, int64 value,
     return Status::OK();
   }
   if (existing == kUnknownDim) {
-    *out = MakeDim(value);
-    return Status::OK();
+    DimensionHandle d = MakeDim(value);
+    return Merge(dim, d, out);
   }
   *out = nullptr;
   return errors::InvalidArgument("Dimension must be ", value, " but is ",
                                  existing);
 }
 
-void InferenceContext::Relax(DimensionHandle d0, DimensionHandle d1,
+void InferenceContext::Relax(DimensionHandle d_old, DimensionHandle d_new,
                              DimensionHandle* out) {
-  if (d0.SameHandle(d1)) {
-    *out = d0;
-  } else if (!ValueKnown(d0) || !ValueKnown(d1)) {
-    *out = UnknownDim();
-  } else if (Value(d0) == Value(d1)) {
-    *out = d0;
+  if (d_old.SameHandle(d_new)) {
+    *out = d_old;
+  } else if (!ValueKnown(d_old) && !ValueKnown(d_new)) {
+    // The node will be fed by the dimension d_new instead of d_old: any
+    // equality assertion between d_old and other input dimension on this node
+    // may not be true anymore, so forget them all.
+    ForgetMerges();
+    // Return the new shape handle to force the relaxation to propagate to the
+    // fanout of the context.
+    *out = d_new;
+  } else if (!ValueKnown(d_new)) {
+    ForgetMerges();
+    *out = d_new;
+  } else if (Value(d_old) == Value(d_new)) {
+    // Return the old shape handle. This will stop the relaxation in the fanout
+    // of the context.
+    *out = d_old;
   } else {
+    // Return a new handle that encodes a different unknown dim.
+    ForgetMerges();
     *out = UnknownDim();
   }
 }
 
 Status InferenceContext::Merge(DimensionHandle d0, DimensionHandle d1,
                                DimensionHandle* out) {
-  if (d0.SameHandle(d1) || !ValueKnown(d1)) {
+  if (d0.SameHandle(d1)) {
     *out = d0;
     return Status::OK();
+  } else if (!ValueKnown(d1)) {
+    *out = d0;
+    merged_dims_.emplace_back(d0, d1);
+    return Status::OK();
   } else if (!ValueKnown(d0)) {
     *out = d1;
+    merged_dims_.emplace_back(d0, d1);
     return Status::OK();
   } else if (Value(d0) == Value(d1)) {
     *out = d0;
@@ -458,55 +470,63 @@ Status InferenceContext::MergePrefix(ShapeHandle s, ShapeHandle prefix,
   return Status::OK();
 }
 
-void InferenceContext::Relax(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out) {
-  if (s0.SameHandle(s1)) {
-    *out = s0;
+void InferenceContext::Relax(ShapeHandle s_old, ShapeHandle s_new,
+                             ShapeHandle* out) {
+  if (s_old.SameHandle(s_new)) {
+    *out = s_old;
     return;
-  } else if (!RankKnown(s0) || !RankKnown(s1)) {
-    *out = UnknownShape();
+  } else if (!RankKnown(s_new) || !s_old.IsSet()) {
+    ForgetMerges();
+    *out = s_new;
     return;
   }
 
-  const int32 rank = Rank(s0);
-  if (rank != Rank(s1)) {
+  const int32 rank = Rank(s_old);
+  if (rank != Rank(s_new)) {
+    ForgetMerges();
     *out = UnknownShape();
     return;
   }
 
-  bool return_s0 = true;
+  bool return_s_old = true;
   for (int i = 0; i < rank; ++i) {
-    auto d0 = Dim(s0, i);
-    auto d1 = Dim(s1, i);
+    auto d0 = Dim(s_old, i);
+    auto d1 = Dim(s_new, i);
     if (d0.SameHandle(d1)) continue;
 
     auto v0 = Value(d0);
     auto v1 = Value(d1);
     if (v0 == kUnknownDim || v1 == kUnknownDim || v0 != v1) {
-      return_s0 = false;
+      return_s_old = false;
       break;
     }
   }
-  if (return_s0) {
-    *out = s0;
+  if (return_s_old) {
+    *out = s_old;
     return;
   }
 
   // Relax dims.
   std::vector<DimensionHandle> dims(rank);
   for (int i = 0; i < rank; ++i) {
-    // Invariant for relax was checked earlier, so CHECK is ok.
-    Relax(Dim(s0, i), Dim(s1, i), &dims[i]);
+    Relax(Dim(s_old, i), Dim(s_new, i), &dims[i]);
   }
+  ForgetMerges();
   *out = MakeShape(dims);
 }
 
 Status InferenceContext::Merge(ShapeHandle s0, ShapeHandle s1,
                                ShapeHandle* out) {
-  if (s0.SameHandle(s1) || !RankKnown(s1)) {
+  if (s0.SameHandle(s1)) {
+    *out = s0;
+    return Status::OK();
+  } else if (!RankKnown(s1)) {
     *out = s0;
+    merged_shapes_.emplace_back(s0, s1);
     return Status::OK();
   } else if (!RankKnown(s0)) {
     *out = s1;
+    merged_shapes_.emplace_back(s0, s1);
     return Status::OK();
   }
 
@@ -534,11 +554,15 @@ Status InferenceContext::Merge(ShapeHandle s0, ShapeHandle s1,
       return_s1 = false;
     } else if (v0 != v1) {
       *out = nullptr;
-      return errors::InvalidArgument("Dimension ", i,
-                                     " in both shapes must be equal, but are ",
-                                     Value(d0), " and ", Value(d1));
+      return errors::InvalidArgument(
+          "Dimension ", i, " in both shapes must be equal, but are ", Value(d0),
+          " and ", Value(d1), ". Shapes are ", DebugString(s0), " and ",
+          DebugString(s1), ".");
     }
   }
+
+  merged_shapes_.emplace_back(s0, s1);
+
   if (return_s0 || return_s1) {
     *out = return_s0 ? s0 : s1;
     return Status::OK();
@@ -550,7 +574,14 @@ Status InferenceContext::Merge(ShapeHandle s0, ShapeHandle s1,
     // Invariant for merge was checked earlier, so CHECK is ok.
     TF_CHECK_OK(Merge(Dim(s0, i), Dim(s1, i), &dims[i]));
   }
-  return ReturnCreatedShape(dims, out);
+
+  Status s = ReturnCreatedShape(dims, out);
+  if (s.ok()) {
+    // Merge the new shape with s0. Since s0 and s1 are merged, this implies
+    // that s1 and out are also merged.
+    merged_shapes_.emplace_back(s0, *out);
+  }
+  return s;
 }
 
 Status InferenceContext::Subshape(ShapeHandle s, int64 start,
@@ -884,7 +915,7 @@ Status InferenceContext::Add(DimensionHandle first, DimensionOrConstant second,
   if (first_value == 0) {
     *out = MakeDim(second);
   } else if (second_value == 0) {
-    *out = MakeDim(first);
+    *out = first;
   } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
     *out = UnknownDim();
   } else {
@@ -909,7 +940,7 @@ Status InferenceContext::Subtract(DimensionHandle first,
   const int64 second_value = Value(second);
   // Special cases.
   if (second_value == 0) {
-    *out = MakeDim(first);
+    *out = first;
   } else if (first_value == kUnknownDim || second_value == kUnknownDim) {
     *out = UnknownDim();
   } else {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index d1b610d68257cd037d1d0bdde2d9364237ae54d8..4a4ef12635f867fccb594d50a2c9e8f3059ce337 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -31,6 +31,7 @@ class ShapeRefinerTest;
 
 namespace grappler {
 class GraphProperties;
+class SymbolicShapeManager;
 }
 
 namespace shape_inference {
@@ -55,13 +56,14 @@ class Dimension {
 class DimensionHandle {
  public:
   DimensionHandle() {}
+  bool SameHandle(DimensionHandle d) const { return ptr_ == d.ptr_; }
+  std::size_t Handle() const { return reinterpret_cast<std::size_t>(ptr_); }
 
  private:
   DimensionHandle(const Dimension* dim) { ptr_ = dim; }
 
-  const Dimension* operator->() { return ptr_; }
+  const Dimension* operator->() const { return ptr_; }
   bool IsSet() const { return ptr_ != nullptr; }
-  bool SameHandle(DimensionHandle d) const { return ptr_ == d.ptr_; }
 
   const Dimension* ptr_ = nullptr;
 
@@ -71,6 +73,8 @@ class DimensionHandle {
   friend class ShapeInferenceTestutil;
   friend class ::tensorflow::ShapeRefinerTest;
   friend class ShapeManager;
+  friend class ::tensorflow::grappler::GraphProperties;
+  friend class ::tensorflow::grappler::SymbolicShapeManager;
 
   // Intentionally copyable.
 };
@@ -87,6 +91,7 @@ class Shape {
 
   friend class InferenceContext;
   friend class ShapeManager;
+  friend class ::tensorflow::grappler::SymbolicShapeManager;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Shape);
 };
@@ -94,12 +99,13 @@ class Shape {
 class ShapeHandle {
  public:
   ShapeHandle() {}
+  bool SameHandle(ShapeHandle s) const { return ptr_ == s.ptr_; }
+  std::size_t Handle() const { return reinterpret_cast<std::size_t>(ptr_); }
 
  private:
   ShapeHandle(const Shape* shape) { ptr_ = shape; }
-  const Shape* operator->() { return ptr_; }
+  const Shape* operator->() const { return ptr_; }
   bool IsSet() const { return ptr_ != nullptr; }
-  bool SameHandle(ShapeHandle s) const { return ptr_ == s.ptr_; }
 
   const Shape* ptr_ = nullptr;
 
@@ -108,6 +114,7 @@ class ShapeHandle {
   friend class ShapeInferenceTestutil;
   friend class ::tensorflow::ShapeRefinerTest;
   friend class ShapeManager;
+  friend class ::tensorflow::grappler::SymbolicShapeManager;
 
   // Intentionally copyable.
 };
@@ -230,24 +237,19 @@ class InferenceContext {
   // - For any one dimension, if the values for that dimension in both shapes
   //   are known, then the values must match.
   // - If one shape has equal or more information than the other shape in every
-  //   dimension, the shape with more information will be returned. Otherwise a
-  //   new shape holding the combined information of the input shapes will be
-  //   returned.
+  //   dimension, the new shape will become the shape with more information.
   // - Example: merging [2,?] and [?,2] results in [2,2]
   // - Example: [2,2] cannot be merged with [1,2]
   //
   // This requires idx to be in the [0, num_inputs) range. If the merge is
-  // successful and the new shape differs from the old one, store the new shape
-  // and return true. Return false otherwise.
+  // successful, return true. Return false otherwise.
   bool MergeInput(int idx, ShapeHandle shape) {
     ShapeHandle new_shape;
-    if (!Merge(inputs_[idx], shape, &new_shape).ok() ||
-        inputs_[idx].SameHandle(new_shape)) {
-      return false;
-    }
+    if (!Merge(inputs_[idx], shape, &new_shape).ok()) return false;
     inputs_[idx] = new_shape;
     return true;
   }
+
   // Relax the stored shape of the input in position idx with <shape> according
   // to the following rules:
   //
@@ -334,22 +336,28 @@ class InferenceContext {
     if (s->rank_ == kUnknownRank) {
       return UnknownDim();
     }
+    return DimKnownRank(s, idx);
+  }
+  // As above, but asserts that the rank of the shape is known.
+  static DimensionHandle DimKnownRank(ShapeHandle s, int64 idx) {
+    CHECK_NE(s->rank_, kUnknownRank);
     if (idx < 0) {
       return s->dims_[s->dims_.size() + idx];
     }
     return s->dims_[idx];
   }
-  int32 Rank(ShapeHandle s) const {
+
+  static int32 Rank(ShapeHandle s) {
     DCHECK(s.IsSet());
     return s.IsSet() ? s->rank_ : kUnknownRank;
   }
-  bool RankKnown(ShapeHandle s) const {
+  static bool RankKnown(ShapeHandle s) {
     return (s.IsSet() && (Rank(s) != kUnknownRank));
   }
-  inline int64 Value(DimensionOrConstant d) const {
+  static inline int64 Value(DimensionOrConstant d) {
     return d.dim.IsSet() ? d.dim->value_ : d.val;
   }
-  inline bool ValueKnown(DimensionOrConstant d) const {
+  static inline bool ValueKnown(DimensionOrConstant d) {
     return Value(d) != kUnknownDim;
   }
 
@@ -601,6 +609,14 @@ class InferenceContext {
 
   int graph_def_version() const { return graph_def_version_; }
 
+  const std::vector<std::pair<ShapeHandle, ShapeHandle>>& MergedShapes() const {
+    return merged_shapes_;
+  }
+  const std::vector<std::pair<DimensionHandle, DimensionHandle>>& MergedDims()
+      const {
+    return merged_dims_;
+  }
+
  private:
   // Creates and stores shapes for use in InferenceContext.
   class ShapeManager {
@@ -662,14 +678,17 @@ class InferenceContext {
   // Adds additional context to the given status.
   Status AttachContext(const Status& status);
 
-  // Relaxes <d0> and <d1> and returns the relaxed dimension in <*out>. If <d0>
-  // and <d1> have incompatible values, returns an error.
+  // Relaxes an existing value <d_old> with a new value <d_new> and returns the
+  // relaxed dimension in <*out>. If <d_old> and <d_new> have incompatible
+  // values, returns an error.
   //
-  // Note that <*out> may be set to <d0> or <d1>.
-  void Relax(DimensionHandle d0, DimensionHandle d1, DimensionHandle* out);
-  // Relaxes <s0> and <s1> and returns the relaxed shape in <*out>. See
-  // 'RelaxInput' function for full details and examples.
-  void Relax(ShapeHandle s0, ShapeHandle s1, ShapeHandle* out);
+  // Note that <*out> may be set to <d_old> or <d_new>.
+  void Relax(DimensionHandle d_old, DimensionHandle d_new,
+             DimensionHandle* out);
+  // Relaxes an existing shape <s_old> with a new shape <s_new> and returns the
+  // relaxed shape in <*out>. See 'RelaxInput' function for full details and
+  // examples.
+  void Relax(ShapeHandle s_old, ShapeHandle s_new, ShapeHandle* out);
 
   // Used to implement MergeInputHandleShapesAndTypes and
   // MergeOutputHandleShapesAndTypes.
@@ -682,6 +701,12 @@ class InferenceContext {
       const std::vector<ShapeAndType>& shapes_and_types,
       std::vector<ShapeAndType>* to_update) TF_MUST_USE_RESULT;
 
+  // Forget all the previous merged shapes and dims.
+  void ForgetMerges() {
+    merged_shapes_.clear();
+    merged_dims_.clear();
+  }
+
   ShapeManager shape_manager_;
 
   // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from
@@ -717,6 +742,13 @@ class InferenceContext {
   // constructor is removed.
   Status construction_status_;
 
+  // Pair of shape or dim handles that are equivalent, ie that represent the
+  // same underlying shape of dimension. Note that for each pair at least one of
+  // the handles must contain an unknown shape, since we don't keep track of
+  // known shapes or dims here.
+  std::vector<std::pair<ShapeHandle, ShapeHandle>> merged_shapes_;
+  std::vector<std::pair<DimensionHandle, DimensionHandle>> merged_dims_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(InferenceContext);
 };
 
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index d36ff5822bae15bf0eb44c5f657afe3de09a1020..a9b63ca60e4574bb0d59c4b939ac157e62f317e8 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -359,11 +359,11 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
   // WithRankAtMost on a shape with unknown dimensionality always succeeds.
   EXPECT_TRUE(c.WithRankAtMost(in0, 1, &s1).ok());
   EXPECT_EQ("?", c.DebugString(s1));
-  EXPECT_FALSE(SameHandle(in0, s1));
+  EXPECT_TRUE(SameHandle(in0, s1));
 
   EXPECT_TRUE(c.WithRankAtMost(in0, 2, &s2).ok());
   EXPECT_EQ("?", c.DebugString(s2));
-  EXPECT_FALSE(SameHandle(s1, s2));
+  EXPECT_TRUE(SameHandle(s1, s2));
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
@@ -398,11 +398,11 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
   // WithRankAtLeast on a shape with unknown dimensionality always succeeds.
   EXPECT_TRUE(c.WithRankAtLeast(in0, 1, &s1).ok());
   EXPECT_EQ("?", c.DebugString(s1));
-  EXPECT_FALSE(SameHandle(in0, s1));
+  EXPECT_TRUE(SameHandle(in0, s1));
 
   EXPECT_TRUE(c.WithRankAtLeast(in0, 2, &s2).ok());
   EXPECT_EQ("?", c.DebugString(s2));
-  EXPECT_FALSE(SameHandle(s1, s2));
+  EXPECT_TRUE(SameHandle(s1, s2));
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
@@ -485,18 +485,33 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_TRUE(c.Merge(d_unknown, d_unknown_b, &out).ok());
   EXPECT_TRUE(SameHandle(d_unknown, out));
 
-  // Merging with self returns self.
+  auto merged_dims = c.MergedDims();
+  ASSERT_EQ(3, merged_dims.size());
+  EXPECT_TRUE(merged_dims[0].first.SameHandle(d2));
+  EXPECT_TRUE(merged_dims[0].second.SameHandle(d_unknown));
+  EXPECT_TRUE(merged_dims[1].first.SameHandle(d_unknown));
+  EXPECT_TRUE(merged_dims[1].second.SameHandle(d2));
+  EXPECT_TRUE(merged_dims[2].first.SameHandle(d_unknown));
+  EXPECT_TRUE(merged_dims[2].second.SameHandle(d_unknown_b));
+
+  // Merging with self is a no-op and returns self.
   EXPECT_TRUE(c.Merge(d2, d2, &out).ok());
   EXPECT_TRUE(SameHandle(d2, out));
   EXPECT_TRUE(c.Merge(d_unknown, d_unknown, &out).ok());
   EXPECT_TRUE(SameHandle(d_unknown, out));
 
-  // Merging equal values returns first one.
+  merged_dims = c.MergedDims();
+  EXPECT_EQ(3, merged_dims.size());
+
+  // Merging equal values is a no op and returns first one.
   EXPECT_TRUE(c.Merge(d2, d2_b, &out).ok());
   EXPECT_TRUE(SameHandle(d2, out));
   EXPECT_TRUE(c.Merge(d2_b, d2, &out).ok());
   EXPECT_TRUE(SameHandle(d2_b, out));
 
+  merged_dims = c.MergedDims();
+  EXPECT_EQ(3, merged_dims.size());
+
   // Merging unequal values is an error.
   EXPECT_TRUE(
       StringPiece(c.Merge(d2, d1, &out).ToString())
@@ -510,6 +525,9 @@ TEST_F(ShapeInferenceTest, MergeDim) {
               "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
+
+  merged_dims = c.MergedDims();
+  EXPECT_EQ(3, merged_dims.size());
 }
 
 TEST_F(ShapeInferenceTest, RelaxDim) {
@@ -526,9 +544,10 @@ TEST_F(ShapeInferenceTest, RelaxDim) {
   auto d_unknown_b = c.Dim(c.input(0), 4);
   DimensionHandle out;
 
-  // Relaxing anything with unknown returns a new unknown.
+  // Relaxing anything with unknown returns a new unknown or the existing
+  // unknown.
   Relax(&c, d2, d_unknown, &out);
-  EXPECT_FALSE(SameHandle(d_unknown, out));
+  EXPECT_TRUE(SameHandle(d_unknown, out));
   EXPECT_FALSE(SameHandle(d_unknown_b, out));
   EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
   Relax(&c, d_unknown, d2, &out);
@@ -536,7 +555,7 @@ TEST_F(ShapeInferenceTest, RelaxDim) {
   EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
   Relax(&c, d_unknown, d_unknown_b, &out);
   EXPECT_FALSE(SameHandle(d_unknown, out));
-  EXPECT_FALSE(SameHandle(d_unknown_b, out));
+  EXPECT_TRUE(SameHandle(d_unknown_b, out));
   EXPECT_EQ(InferenceContext::kUnknownDim, c.Value(out));
 
   // Relaxing with self returns self.
@@ -584,7 +603,7 @@ TEST_F(ShapeInferenceTest, RelaxShape) {
   EXPECT_EQ("?", c.DebugString(out));
   Relax(&c, s_unknown, s_unknown_b, &out);
   EXPECT_FALSE(SameHandle(s_unknown, out));
-  EXPECT_FALSE(SameHandle(s_unknown_b, out));
+  EXPECT_TRUE(SameHandle(s_unknown_b, out));
   EXPECT_EQ("?", c.DebugString(out));
 
   // Relaxing with self returns self.
@@ -605,7 +624,7 @@ TEST_F(ShapeInferenceTest, RelaxShape) {
   Relax(&c, s_u_2, s_1_u, &out);
   EXPECT_EQ("[?,?]", c.DebugString(out));
   EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
-  EXPECT_FALSE(SameHandle(c.Dim(s_1_u, 1), c.Dim(out, 1)));
+  EXPECT_TRUE(SameHandle(c.Dim(s_1_u, 1), c.Dim(out, 1)));
   auto s_u1 = c.UnknownShapeOfRank(1);
   auto s_u2 = c.UnknownShapeOfRank(1);
   Relax(&c, s_u1, s_u2, &out);
@@ -619,7 +638,7 @@ TEST_F(ShapeInferenceTest, RelaxShape) {
   EXPECT_EQ("[?,?]", c.DebugString(out));
   out = s_unknown;
   Relax(&c, s_1_3, s_u_2, &out);
-  EXPECT_FALSE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
+  EXPECT_TRUE(SameHandle(c.Dim(s_u_2, 0), c.Dim(out, 0)));
   EXPECT_EQ("[?,?]", c.DebugString(out));
   out = s_unknown;
 
@@ -652,10 +671,22 @@ TEST_F(ShapeInferenceTest, MergeShape) {
   EXPECT_TRUE(c.Merge(s_unknown, s_unknown_b, &out).ok());
   EXPECT_TRUE(SameHandle(s_unknown, out));
 
+  auto merged_shapes = c.MergedShapes();
+  ASSERT_EQ(3, merged_shapes.size());
+  EXPECT_TRUE(merged_shapes[0].first.SameHandle(s_unknown));
+  EXPECT_TRUE(merged_shapes[0].second.SameHandle(s_1_2));
+  EXPECT_TRUE(merged_shapes[1].first.SameHandle(s_u_2));
+  EXPECT_TRUE(merged_shapes[1].second.SameHandle(s_unknown));
+  EXPECT_TRUE(merged_shapes[2].first.SameHandle(s_unknown));
+  EXPECT_TRUE(merged_shapes[2].second.SameHandle(s_unknown_b));
+
   // Merging with self returns self.
   EXPECT_TRUE(c.Merge(s_1_2, s_1_2, &out).ok());
   EXPECT_TRUE(SameHandle(out, s_1_2));
 
+  merged_shapes = c.MergedShapes();
+  EXPECT_EQ(3, merged_shapes.size());
+
   // Merging where one of the inputs is the right answer - return that input.
   out = ShapeHandle();
   EXPECT_TRUE(c.Merge(s_1_2, s_u_2, &out).ok());
@@ -664,6 +695,13 @@ TEST_F(ShapeInferenceTest, MergeShape) {
   EXPECT_TRUE(c.Merge(s_u_2, s_1_2, &out).ok());
   EXPECT_TRUE(SameHandle(s_1_2, out));
 
+  merged_shapes = c.MergedShapes();
+  ASSERT_EQ(5, merged_shapes.size());
+  EXPECT_TRUE(merged_shapes[3].first.SameHandle(s_1_2));
+  EXPECT_TRUE(merged_shapes[3].second.SameHandle(s_u_2));
+  EXPECT_TRUE(merged_shapes[4].first.SameHandle(s_u_2));
+  EXPECT_TRUE(merged_shapes[4].second.SameHandle(s_1_2));
+
   // Merging where neither input is the right answer.
   EXPECT_TRUE(c.Merge(s_u_2, s_1_u, &out).ok());
   EXPECT_FALSE(SameHandle(out, s_u_2));
@@ -672,11 +710,23 @@ TEST_F(ShapeInferenceTest, MergeShape) {
   EXPECT_TRUE(SameHandle(c.Dim(s_1_u, 0), c.Dim(out, 0)));
   EXPECT_TRUE(SameHandle(c.Dim(s_u_2, 1), c.Dim(out, 1)));
 
+  merged_shapes = c.MergedShapes();
+  ASSERT_EQ(7, merged_shapes.size());
+  EXPECT_TRUE(merged_shapes[5].first.SameHandle(s_u_2));
+  EXPECT_TRUE(merged_shapes[5].second.SameHandle(s_1_u));
+  EXPECT_TRUE(merged_shapes[6].first.SameHandle(s_u_2));
+  EXPECT_TRUE(merged_shapes[6].second.SameHandle(out));
+
   auto s_u1 = c.UnknownShapeOfRank(1);
   auto s_u2 = c.UnknownShapeOfRank(1);
   TF_EXPECT_OK(c.Merge(s_u1, s_u2, &out));
   EXPECT_TRUE(SameHandle(s_u1, out));
 
+  merged_shapes = c.MergedShapes();
+  ASSERT_EQ(8, merged_shapes.size());
+  EXPECT_TRUE(merged_shapes[7].first.SameHandle(s_u1));
+  EXPECT_TRUE(merged_shapes[7].second.SameHandle(s_u2));
+
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
   EXPECT_TRUE(
@@ -701,6 +751,9 @@ TEST_F(ShapeInferenceTest, MergeShape) {
               "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
+
+  merged_shapes = c.MergedShapes();
+  EXPECT_EQ(8, merged_shapes.size());
 }
 
 TEST_F(ShapeInferenceTest, MergePrefix) {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 3a7df6a4781ba9b1f98a9a9918bfb7ae0b655599..c195623b279a4275ab2646483851ec3a65a1f0d4 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -42,6 +42,9 @@ class TensorCApi;
 class TensorDescription;
 class TensorProto;
 class VariantTensorData;
+namespace batch_util {
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+}  // namespace batch_util
 
 /// @ingroup core
 /// Represents an n-dimensional array of values.
@@ -487,6 +490,9 @@ class Tensor {
   template <typename Device, typename T>
   friend Status PrepareToUpdateVariable(
       OpKernelContext* ctx, Tensor* tensor);  // For access to RefCountIsOne().
+  friend Status batch_util::CopyElementToSlice(
+      Tensor element, Tensor* parent,
+      int64 index);                // For access to RefCountIsOne().
   friend class NumpyTensorBuffer;  // For access to the private constructor
                                    // taking the buffer.
 
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 06c576c7d41e5bf48f9db6754e5814142632a371..d8a9c0bac5b950157044dae07771b6733481ac9e 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -359,7 +359,8 @@ Status TensorShapeOld::IsValidShape(const TensorShapeProto& proto) {
   for (const auto& d : proto.dim()) {
     if (d.size() < 0) {
       return errors::InvalidArgument("Shape ", DebugString(proto),
-                                     " has negative dimensions");
+                                     " has negative dimensions; ",
+                                     "perhaps an un-fed placeholder?");
     }
     num_elements *= d.size();
     if (num_elements > kMaxElements) {
diff --git a/tensorflow/core/framework/tracking_allocator.cc b/tensorflow/core/framework/tracking_allocator.cc
index db996e31b0c82a28f48e9c6605e24d003c801274..239dfd13ec2e45acb0a65700f2a8882c61fc03b3 100644
--- a/tensorflow/core/framework/tracking_allocator.cc
+++ b/tensorflow/core/framework/tracking_allocator.cc
@@ -183,6 +183,17 @@ gtl::InlinedVector<AllocRecord, 4> TrackingAllocator::GetRecordsAndUnRef() {
   return allocations;
 }
 
+gtl::InlinedVector<AllocRecord, 4> TrackingAllocator::GetCurrentRecords() {
+  gtl::InlinedVector<AllocRecord, 4> allocations;
+  {
+    mutex_lock lock(mu_);
+    for (const AllocRecord& alloc : allocations_) {
+      allocations.push_back(alloc);
+    }
+  }
+  return allocations;
+}
+
 bool TrackingAllocator::UnRef() {
   CHECK_GE(ref_, 1);
   --ref_;
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index d10b0cca51d36a18f19e761a2b8ebb0468b0928f..a6c26c89e51f1fec01886672b91f863ee36bedc8 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -85,6 +85,8 @@ class TrackingAllocator : public Allocator {
   // deallocated. After this call completes and all allocated pointers
   // have been deallocated the wrapper will delete itself.
   gtl::InlinedVector<AllocRecord, 4> GetRecordsAndUnRef();
+  // Returns a copy of allocation records collected so far.
+  gtl::InlinedVector<AllocRecord, 4> GetCurrentRecords();
 
  protected:
   ~TrackingAllocator() override {}
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index faae19585d9dd2bc5f351772af93723daaa3b8be..b082dfbd031cde572ed255a19a767c855cc56611 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -206,18 +206,18 @@ string DataTypeSliceString(const DataTypeSlice types) {
 }
 
 DataTypeVector AllTypes() {
-  return {DT_FLOAT,   DT_DOUBLE, DT_INT32,  DT_UINT8,     DT_INT16,
-          DT_UINT16,  DT_INT8,   DT_STRING, DT_COMPLEX64, DT_COMPLEX128,
-          DT_INT64,   DT_BOOL,   DT_QINT8,  DT_QUINT8,    DT_QINT16,
-          DT_QUINT16, DT_QINT32, DT_HALF,   DT_RESOURCE,  DT_VARIANT,
-          DT_UINT32,  DT_UINT64};
+  return {DT_FLOAT,   DT_DOUBLE, DT_INT32,   DT_UINT8,     DT_INT16,
+          DT_UINT16,  DT_INT8,   DT_STRING,  DT_COMPLEX64, DT_COMPLEX128,
+          DT_INT64,   DT_BOOL,   DT_QINT8,   DT_QUINT8,    DT_QINT16,
+          DT_QUINT16, DT_QINT32, DT_HALF,    DT_RESOURCE,  DT_VARIANT,
+          DT_UINT32,  DT_UINT64, DT_BFLOAT16};
 }
 
 #if !defined(IS_MOBILE_PLATFORM) || defined(SUPPORT_SELECTIVE_REGISTRATION)
 
 DataTypeVector RealNumberTypes() {
-  return {DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64,  DT_UINT8, DT_INT16,
-          DT_INT8,  DT_UINT16, DT_HALF,  DT_UINT32, DT_UINT64};
+  return {DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64,  DT_UINT8,  DT_INT16,
+          DT_INT8,  DT_UINT16, DT_HALF,  DT_UINT32, DT_UINT64, DT_BFLOAT16};
 }
 
 DataTypeVector QuantizedTypes() {
@@ -227,14 +227,14 @@ DataTypeVector QuantizedTypes() {
 DataTypeVector RealAndQuantizedTypes() {
   return {DT_FLOAT,  DT_DOUBLE,  DT_INT32,  DT_INT64, DT_UINT8,
           DT_UINT16, DT_UINT16,  DT_INT8,   DT_QINT8, DT_QUINT8,
-          DT_QINT16, DT_QUINT16, DT_QINT32, DT_HALF};
+          DT_QINT16, DT_QUINT16, DT_QINT32, DT_HALF,  DT_BFLOAT16};
 }
 
 DataTypeVector NumberTypes() {
-  return {DT_FLOAT,     DT_DOUBLE,     DT_INT64,  DT_INT32,
-          DT_UINT8,     DT_UINT16,     DT_INT16,  DT_INT8,
-          DT_COMPLEX64, DT_COMPLEX128, DT_QINT8,  DT_QUINT8,
-          DT_QINT32,    DT_HALF,       DT_UINT32, DT_UINT64};
+  return {DT_FLOAT,  DT_DOUBLE,  DT_INT64,  DT_INT32,     DT_UINT8,
+          DT_UINT16, DT_INT16,   DT_INT8,   DT_COMPLEX64, DT_COMPLEX128,
+          DT_QINT8,  DT_QUINT8,  DT_QINT32, DT_HALF,      DT_UINT32,
+          DT_UINT64, DT_BFLOAT16};
 }
 
 #elif defined(__ANDROID_TYPES_FULL__)
@@ -306,6 +306,28 @@ bool DataTypeCanUseMemcpy(DataType dt) {
   }
 }
 
+bool DataTypeIsFloating(DataType dt) {
+  switch (dt) {
+    case DT_HALF:
+    case DT_BFLOAT16:
+    case DT_FLOAT:
+    case DT_DOUBLE:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool DataTypeIsComplex(DataType dt) {
+  switch (dt) {
+    case DT_COMPLEX64:
+    case DT_COMPLEX128:
+      return true;
+    default:
+      return false;
+  }
+}
+
 bool DataTypeIsQuantized(DataType dt) {
   switch (dt) {
     case DT_QINT8:
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index dc53ed41780d90448872b1bd98e97f5e16d49592..652985658a20b094ac582466972a62b9f1e287a2 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -222,6 +222,12 @@ static_assert(IsValidDataType<int32>::value, "Incorrect impl for int32");
 
 bool DataTypeCanUseMemcpy(DataType dt);
 
+// Returns true iff 'dt' is a real, non-quantized floating point type.
+bool DataTypeIsFloating(DataType dt);
+
+// Returns true iff 'dt' is a complex type.
+bool DataTypeIsComplex(DataType dt);
+
 bool DataTypeIsQuantized(DataType dt);
 
 // Is the dtype nonquantized integral?
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index bc57740469f96fee28de1cea8920cc0431511db1..5ddc9865633623561760bbcb06d1edf4eecec7a6 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -130,6 +130,13 @@ TEST(TypesTest, QuantizedTypes) {
   EXPECT_FALSE(DataTypeIsQuantized(DT_BFLOAT16));
 }
 
+TEST(TypesTest, ComplexTypes) {
+  EXPECT_TRUE(DataTypeIsComplex(DT_COMPLEX64));
+  EXPECT_TRUE(DataTypeIsComplex(DT_COMPLEX128));
+  EXPECT_FALSE(DataTypeIsComplex(DT_FLOAT));
+  EXPECT_FALSE(DataTypeIsComplex(DT_DOUBLE));
+}
+
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
diff --git a/tensorflow/core/framework/variant_op_copy_test.cc b/tensorflow/core/framework/variant_op_copy_test.cc
index 205f2a8370501aeb60a013a8123605ece83da3e4..85e014f80434d2a2de2851d2cb361f4b0a0c9433 100644
--- a/tensorflow/core/framework/variant_op_copy_test.cc
+++ b/tensorflow/core/framework/variant_op_copy_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
@@ -108,12 +109,17 @@ class CreateTestVariantOp : public OpKernel {
  public:
   explicit CreateTestVariantOp(OpKernelConstruction* c) : OpKernel(c) {}
   void Compute(OpKernelContext* c) override {
+    // Take the scalar tensor fed as input, and emit a Tensor
+    // containing 10 Variants (StoredTensorValues), both containing
+    // the input tensor.
     const Tensor& stored_t = c->input(0);
     Tensor* out;
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &out));
+    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({10}), &out));
     StoredTensorValue store{stored_t};
     auto t = out->flat<Variant>();
-    t(0) = store;
+    for (int i = 0; i < 10; ++i) {
+      t(i) = store;
+    }
     CHECK_EQ("StoredTensorValue", t(0).TypeName());
   }
 };
@@ -175,7 +181,7 @@ TEST(VariantOpCopyTest, CreateConstOnCPU) {
   TF_ASSERT_OK(root.status());
   ClientSession session(root);
   std::vector<Tensor> outputs;
-  TF_EXPECT_OK(session.Run({create_const}, &outputs));
+  TF_CHECK_OK(session.Run({create_const}, &outputs));
   EXPECT_EQ(1, outputs.size());
   EXPECT_EQ(DT_VARIANT, outputs[0].dtype());
   EXPECT_EQ(0, outputs[0].dims());
@@ -212,7 +218,7 @@ TEST(VariantOpCopyTest, CreateConstOnGPU) {
 
   int copy_to_gpu_before = *GetCopyCPUToGPUCounter();
   int copy_to_cpu_before = *GetCopyGPUToCPUCounter();
-  TF_EXPECT_OK(session.Run({create_const}, &outputs));
+  TF_CHECK_OK(session.Run({create_const}, &outputs));
   int copy_to_cpu_after = *GetCopyGPUToCPUCounter();
   int copy_to_gpu_after = *GetCopyCPUToGPUCounter();
 
@@ -261,7 +267,7 @@ TEST(VariantOpCopyTest, CreateConstOnGPUFailsGracefully) {
 TEST(VariantOpCopyTest, CreateCopyCPUToCPU) {
   Scope root = Scope::NewRootScope().WithDevice("/cpu:0");
   Tensor t_42(DT_INT32, TensorShape({}));
-  t_42.scalar<int32>()() = 42;
+  t_42.flat<int32>()(0) = 42;
   Output create_op = CreateTestVariant(root, t_42);
   Output identity = ops::Identity(root, create_op);
 
@@ -269,14 +275,17 @@ TEST(VariantOpCopyTest, CreateCopyCPUToCPU) {
 
   ClientSession session(root);
   std::vector<Tensor> outputs;
-  TF_EXPECT_OK(session.Run({create_op, identity}, &outputs));
+  TF_CHECK_OK(session.Run({create_op, identity}, &outputs));
   EXPECT_EQ(2, outputs.size());
-  const Variant& r1 = outputs[1].scalar<Variant>()();
-
-  EXPECT_EQ("StoredTensorValue", r1.TypeName());
-  const StoredTensorValue* v1 = r1.get<StoredTensorValue>();
-  EXPECT_NE(v1, nullptr);
-  EXPECT_EQ(42, v1->stored.scalar<int32>()());
+  EXPECT_EQ(10, outputs[1].dim_size(0));
+  auto output = outputs[1].flat<Variant>();
+  for (int i = 0; i < 10; ++i) {
+    const Variant& r1 = output(i);
+    EXPECT_EQ("StoredTensorValue", r1.TypeName());
+    const StoredTensorValue* v1 = r1.get<StoredTensorValue>();
+    EXPECT_NE(v1, nullptr);
+    EXPECT_EQ(42, v1->stored.scalar<int32>()());
+  }
 }
 
 TEST(VariantOpCopyTest, CreateCopyCPUToCPUString) {
@@ -290,14 +299,17 @@ TEST(VariantOpCopyTest, CreateCopyCPUToCPUString) {
 
   ClientSession session(root);
   std::vector<Tensor> outputs;
-  TF_EXPECT_OK(session.Run({create_op, identity}, &outputs));
+  TF_CHECK_OK(session.Run({create_op, identity}, &outputs));
   EXPECT_EQ(2, outputs.size());
-  const Variant& r1 = outputs[1].scalar<Variant>()();
-
-  EXPECT_EQ("StoredTensorValue", r1.TypeName());
-  const StoredTensorValue* v1 = r1.get<StoredTensorValue>();
-  EXPECT_NE(v1, nullptr);
-  EXPECT_EQ("hi", v1->stored.scalar<string>()());
+  EXPECT_EQ(10, outputs[1].dim_size(0));
+  auto output = outputs[1].flat<Variant>();
+  for (int i = 0; i < 10; ++i) {
+    const Variant& r1 = output(i);
+    EXPECT_EQ("StoredTensorValue", r1.TypeName());
+    const StoredTensorValue* v1 = r1.get<StoredTensorValue>();
+    EXPECT_NE(v1, nullptr);
+    EXPECT_EQ("hi", v1->stored.scalar<string>()());
+  }
 }
 
 TEST(VariantOpCopyTest, CreateCopyCPUToGPU) {
@@ -318,7 +330,7 @@ TEST(VariantOpCopyTest, CreateCopyCPUToGPU) {
   int copy_to_cpu_before = *GetCopyGPUToCPUCounter();
   // Force the identity to run on GPU, and then the data to be copied
   // back to CPU for the final output.
-  TF_EXPECT_OK(session.Run({create_op, identity}, &outputs));
+  TF_CHECK_OK(session.Run({create_op, identity}, &outputs));
   int copy_to_cpu_after = *GetCopyGPUToCPUCounter();
   int copy_to_gpu_after = *GetCopyCPUToGPUCounter();
 
@@ -326,12 +338,15 @@ TEST(VariantOpCopyTest, CreateCopyCPUToGPU) {
   EXPECT_GT(copy_to_gpu_after - copy_to_gpu_before, 0);
 
   EXPECT_EQ(2, outputs.size());
-  const Variant& r1 = outputs[1].scalar<Variant>()();
-
-  EXPECT_EQ("StoredTensorValue", r1.TypeName());
-  const StoredTensorValue* v1 = r1.get<StoredTensorValue>();
-  EXPECT_NE(v1, nullptr);
-  EXPECT_EQ(42, v1->stored.scalar<int32>()());
+  EXPECT_EQ(10, outputs[1].dim_size(0));
+  auto output = outputs[1].flat<Variant>();
+  for (int i = 0; i < 10; ++i) {
+    const Variant& r1 = output(i);
+    EXPECT_EQ("StoredTensorValue", r1.TypeName());
+    const StoredTensorValue* v1 = r1.get<StoredTensorValue>();
+    EXPECT_NE(v1, nullptr);
+    EXPECT_EQ(42, v1->stored.scalar<int32>()());
+  }
 }
 
 TEST(VariantOpCopyTest, CreateCopyCPUToGPUStringFailsSafely) {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 831dbd3dffe33db3b5fab2ca8feb4225121bc0c7..13f6908cae1ed1b1964bf827dce0fcb2bee4e6d1 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -145,9 +145,8 @@ class UnaryVariantOpRegistry {
   static std::unordered_set<string>* PersistentStringStorage();
 
  private:
-  std::unordered_map<StringPiece, VariantShapeFn, StringPiece::Hasher>
-      shape_fns;
-  std::unordered_map<StringPiece, VariantDecodeFn, StringPiece::Hasher>
+  std::unordered_map<StringPiece, VariantShapeFn, StringPieceHasher> shape_fns;
+  std::unordered_map<StringPiece, VariantDecodeFn, StringPieceHasher>
       decode_fns;
 
   // Map std::pair<Direction, type_name> to function.
@@ -159,7 +158,7 @@ class UnaryVariantOpRegistry {
       ret = Hash64Combine(ret, sp_hasher_(std::get<1>(x)));
       return ret;
     }
-    StringPiece::Hasher sp_hasher_;
+    StringPieceHasher sp_hasher_;
   };
 
   std::unordered_map<std::pair<VariantDeviceCopyDirection, StringPiece>,
@@ -177,7 +176,7 @@ class UnaryVariantOpRegistry {
       ret = Hash64Combine(ret, sp_hasher_(std::get<2>(x)));
       return ret;
     }
-    StringPiece::Hasher sp_hasher_;
+    StringPieceHasher sp_hasher_;
   };
   std::unordered_map<std::tuple<VariantUnaryOp, StringPiece, StringPiece>,
                      VariantUnaryOpFn, TupleHash>
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 6ef51aa7dfcd48f840f80040f068a766a33ff5bf..4652fbe40691a01e0567c7df2fba0ca2ea482fe1 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -83,13 +83,16 @@ void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
   ReverseDFSFrom(g, {g.sink_node()}, enter, leave, stable_comparator);
 }
 
-void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
-                    const std::function<void(Node*)>& enter,
-                    const std::function<void(Node*)>& leave,
-                    const NodeComparator& stable_comparator) {
+namespace {
+
+template <typename T>
+void ReverseDFSFromHelper(const Graph& g, gtl::ArraySlice<T> start,
+                          const std::function<void(T)>& enter,
+                          const std::function<void(T)>& leave,
+                          const NodeComparator& stable_comparator) {
   // Stack of work to do.
   struct Work {
-    Node* node;
+    T node;
     bool leave;  // Are we entering or leaving n?
   };
   std::vector<Work> stack(start.size());
@@ -102,7 +105,7 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
     Work w = stack.back();
     stack.pop_back();
 
-    Node* n = w.node;
+    T n = w.node;
     if (w.leave) {
       leave(n);
       continue;
@@ -117,7 +120,7 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
 
     gtl::iterator_range<NeighborIter> nodes = n->in_nodes();
 
-    auto add_work = [&visited, &stack](Node* out) {
+    auto add_work = [&visited, &stack](T out) {
       if (!visited[out->id()]) {
         // Note; we must not mark as visited until we actually process it.
         stack.push_back(Work{out, false});
@@ -125,22 +128,38 @@ void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
     };
 
     if (stable_comparator) {
-      std::vector<Node*> nodes_sorted;
-      for (Node* in : nodes) {
+      std::vector<T> nodes_sorted;
+      for (T in : nodes) {
         nodes_sorted.emplace_back(in);
       }
       std::sort(nodes_sorted.begin(), nodes_sorted.end(), stable_comparator);
-      for (Node* in : nodes_sorted) {
+      for (T in : nodes_sorted) {
         add_work(in);
       }
     } else {
-      for (Node* in : nodes) {
+      for (T in : nodes) {
         add_work(in);
       }
     }
   }
 }
 
+}  // namespace
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                    const std::function<void(const Node*)>& enter,
+                    const std::function<void(const Node*)>& leave,
+                    const NodeComparator& stable_comparator) {
+  ReverseDFSFromHelper(g, start, enter, leave, stable_comparator);
+}
+
+void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
+                    const std::function<void(Node*)>& enter,
+                    const std::function<void(Node*)>& leave,
+                    const NodeComparator& stable_comparator) {
+  ReverseDFSFromHelper(g, start, enter, leave, stable_comparator);
+}
+
 void GetPostOrder(const Graph& g, std::vector<Node*>* order,
                   const NodeComparator& stable_comparator) {
   order->clear();
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 5bb6041d98b6aebd3036b68fffeed32afda85e50..ac4a099013b67e0d256a9310495e4b585eb40e0a 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -69,6 +69,10 @@ extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<Node*> start,
                            const std::function<void(Node*)>& enter,
                            const std::function<void(Node*)>& leave,
                            const NodeComparator& stable_comparator = {});
+extern void ReverseDFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                           const std::function<void(const Node*)>& enter,
+                           const std::function<void(const Node*)>& leave,
+                           const NodeComparator& stable_comparator = {});
 
 // Stores in *order the post-order numbering of all nodes
 // in graph found via a depth first search starting at the source node.
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index a908a4843ca0a3fadeca088f8019d2a1cb228cb4..8afa4971ad054b31eeb63d0dadaa1a2937c47a6e 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
-typedef std::unordered_map<StringPiece, int32, StringPiece::Hasher>
+typedef std::unordered_map<StringPiece, int32, StringPieceHasher>
     NodeNameToCostIdMap;
 
 class StepStats;
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 87c41186d546e397dbd5c46e7946f1ee5325f84a..fd1b5d33b93d0e2685cd7a909bbcc9909d7d3f87 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -453,6 +453,21 @@ const Edge* Graph::AddControlEdge(Node* source, Node* dest,
   return AddEdge(source, kControlSlot, dest, kControlSlot);
 }
 
+void Graph::RemoveControlEdge(const Edge* e) {
+  if (!e->src_->IsSource() && !e->dst_->IsSink()) {
+    e->dst_->MaybeCopyOnWrite();
+    std::string e_src_name = strings::StrCat("^", e->src_->name());
+    auto* inputs = e->dst_->props_->node_def.mutable_input();
+    for (auto it = inputs->begin(); it != inputs->end(); ++it) {
+      if (*it == e_src_name) {
+        inputs->erase(it);
+        break;
+      }
+    }
+  }
+  RemoveEdge(e);
+}
+
 Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
                          int dst_index) {
   TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index c5dde722fa6b12f4f1635efe5ba1b9069b86901e..b620127d9072a845721f97112f4bad107412b06f 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -451,6 +451,10 @@ class Graph {
   // REQUIRES: The edge must exist.
   void RemoveEdge(const Edge* edge);
 
+  // Removes control edge `edge` from the graph. Note that this also updates
+  // the corresponding NodeDef to reflect the change.
+  // REQUIRES: The control edge must exist.
+  void RemoveControlEdge(const Edge* e);
   // Updates the input to a node.  The existing edge to `dst` is removed and an
   // edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
   // is also updated.
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 8fe4f535fbb8c1a93fd06c5858ad2095d50f6808..6e72d739189058b44dd6c57f4d5af648855bd4a5 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -68,24 +68,30 @@ class GraphConstructor {
     Options(const GraphConstructorOptions& in)  // NOLINT(runtime/explicit)
         : allow_internal_ops(in.allow_internal_ops),
           expect_device_spec(in.expect_device_spec),
-          importing(false) {}
+          importing(false),
+          validate_colocation_constraints(false) {}
     Options(const ImportGraphDefOptions& in)  // NOLINT(runtime/explicit)
         : allow_internal_ops(false),
           expect_device_spec(false),
           prefix(in.prefix.empty() || StringPiece(in.prefix).ends_with("/")
                      ? in.prefix
                      : in.prefix + "/"),
+          uniquify_names(in.uniquify_names),
+          uniquify_prefix(in.uniquify_prefix),
           input_map(in.input_map),
           skip_mapped_nodes(in.skip_mapped_nodes),
           control_dependencies(in.control_dependencies),
           return_tensors(in.return_tensors),
           return_nodes(in.return_nodes),
-          importing(true) {}
+          importing(true),
+          validate_colocation_constraints(in.validate_colocation_constraints) {}
 
     bool allow_internal_ops;
     bool expect_device_spec;
 
     string prefix;
+    bool uniquify_names;
+    bool uniquify_prefix;
     std::map<TensorId, TensorId> input_map;
     bool skip_mapped_nodes;
     std::vector<string> control_dependencies;
@@ -101,6 +107,7 @@ class GraphConstructor {
     // applicable to ConvertGraphDefToGraph as well, so make an attempt to
     // remove this.
     bool importing;
+    bool validate_colocation_constraints;
   };
 
   typedef gtl::ArraySlice<const NodeDef*> NodeDefSlice;
@@ -139,6 +146,7 @@ class GraphConstructor {
         library_(library),
         g_(g),
         original_versions_(g->versions()),
+        prefix_(opts.prefix),
         refiner_(refiner),
         return_tensors_(return_tensors),
         return_nodes_(return_nodes),
@@ -154,6 +162,7 @@ class GraphConstructor {
     TF_RETURN_IF_ERROR(UpdateVersionDef());
     TF_RETURN_IF_ERROR(PopulateReturnTensors());
     TF_RETURN_IF_ERROR(PopulateReturnNodes());
+    UpdateUniquifiedColocationNames();
     FixupSourceAndSinkEdges(g_);
     return Status::OK();
   }
@@ -190,6 +199,29 @@ class GraphConstructor {
   void AddPrefixToNodeDef(const std::vector<bool>& input_already_exists,
                           NodeDef* node_def);
 
+  // Modifies `node_def` if its name isn't unique, or if any of its inputs'
+  // names have been uniquified. This must be called in topological order on all
+  // nodes.
+  void UniquifyNames(const std::vector<bool>& input_already_exists,
+                     NodeDef* node_def);
+
+  // Updates any constructed nodes' colocation group names if the name has been
+  // updated by UniquifyNames. This is called after all the nodes have been
+  // constructed so all the names have been uniquified if necessary.
+  void UpdateUniquifiedColocationNames();
+
+  // Returns true if `name` already exists in `g_` (either as a node name or
+  // prefix).
+  bool NameExistsInGraph(StringPiece name);
+
+  // Returns true if `name` already exists in the GraphDef being imported
+  // (either as a node name or prefix).
+  bool NameExistsInGraphDef(StringPiece name);
+
+  // Returns a unique version of `original_name`, or `original_name` if it's
+  // already unique in the graph.
+  string FindUniqueName(StringPiece original_name);
+
   // From constructor
   const Options opts_;
   const NodeDefSlice node_defs_;
@@ -198,6 +230,9 @@ class GraphConstructor {
   Graph* g_;
   const VersionDef original_versions_;
 
+  // A copy of opts_.prefix, possibly uniquified.
+  string prefix_;
+
   ShapeRefiner* refiner_;
 
   // May be null. Not owned.
@@ -222,10 +257,20 @@ class GraphConstructor {
   };
   // TODO(vrv): Profile this data structure to see if we should use an
   // alternative implementation of std::unordered_map.
-  std::unordered_map<StringPiece, NodeInfo, StringPiece::Hasher> gdef_nodes_;
+  std::unordered_map<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
+
+  // Prefixes already used in the GraphDef being imported.
+  std::unordered_set<StringPiece, StringPieceHasher> gdef_prefixes_;
 
-  // Mapping from node name to the existing node in g_
-  std::unordered_map<StringPiece, Node*, StringPiece::Hasher> existing_nodes_;
+  // Mapping from node name to the existing node in g_.
+  std::unordered_map<StringPiece, Node*, StringPieceHasher> existing_nodes_;
+
+  // Prefixes already used in the graph.
+  std::unordered_set<StringPiece, StringPieceHasher> existing_prefixes_;
+
+  // Imported node names that have been uniquified. The key is the original
+  // name, the value is the new unique name.
+  std::unordered_map<string, string> uniquified_names_;
 
   // Index of NodeDefs in node_defs_ with all inputs already converted.
   std::vector<int> ready_;
@@ -279,8 +324,19 @@ bool NodeNameInValues(const std::vector<string>& control_dependencies,
                    node_name) != control_dependencies.end();
 }
 
+// Adds any prefixes of `node_name` (not including the full name itself) to
+// `prefixes`.
+void AddPrefixes(StringPiece node_name,
+                 std::unordered_set<StringPiece, StringPieceHasher>* prefixes) {
+  size_t idx = -1;
+  while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) {
+    prefixes->insert(node_name.substr(0, idx));
+  }
+}
+
 Status GraphConstructor::EnsureNoNameCollisions() {
   existing_nodes_.reserve(g_->num_nodes());
+  // Populate existing_nodes_ and existing_prefixes_.
   for (Node* n : g_->nodes()) {
     bool already_exists = !existing_nodes_.insert({n->name(), n}).second;
     if (already_exists) {
@@ -296,31 +352,31 @@ Status GraphConstructor::EnsureNoNameCollisions() {
             n->name(), "'");
       }
     }
+    AddPrefixes(n->name(), &existing_prefixes_);
   }
-  if (opts_.prefix.empty() && opts_.importing) {
+  if (prefix_.empty() && opts_.importing && !opts_.uniquify_names) {
     for (const NodeDef* n : node_defs_) {
       const string& name = n->name();
-      if (existing_nodes_.find(name) != existing_nodes_.end()) {
-        return errors::InvalidArgument("Node '", name,
+      if (NameExistsInGraph(name)) {
+        return errors::InvalidArgument("Node name '", name,
                                        "' already exists in the Graph");
       }
     }
-  } else if (!opts_.prefix.empty()) {
-    // Importing nodes with a prefix. No nodes should exist with the same
-    // prefix.
-    StringPiece prefix_no_slash(opts_.prefix);
+  } else if (!prefix_.empty()) {
+    StringPiece prefix_no_slash(prefix_);
     prefix_no_slash.remove_suffix(1);
     if (!IsValidNodeName(prefix_no_slash, false)) {
-      return errors::InvalidArgument("Imported node name prefix '",
-                                     opts_.prefix,
+      return errors::InvalidArgument("Imported node name prefix '", prefix_,
                                      "' would lead to invalid node names");
     }
-    for (const Node* n : g_->nodes()) {
-      if (StringPiece(n->name()).starts_with(opts_.prefix)) {
-        return errors::InvalidArgument(
-            "Import node name prefix conflicts with names of nodes already in "
-            "the Graph, such as '",
-            n->name(), "'");
+    if (NameExistsInGraph(prefix_no_slash)) {
+      if (opts_.uniquify_prefix) {
+        prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/");
+      } else {
+        return errors::InvalidArgument("Import node name prefix '",
+                                       prefix_no_slash,
+                                       "' conflicts with "
+                                       "name already used in the graph");
       }
     }
   }
@@ -355,7 +411,7 @@ Status GraphConstructor::ValidateInputMapAndControlDependencies() {
 }
 
 Status GraphConstructor::BuildNodeIndex() {
-  // Validate the node names and add them to gdef_nodes_.
+  // Validate the node names and add them to gdef_nodes_ and gdef_prefixes_.
   for (int n = 0; n < node_defs_.size(); ++n) {
     const NodeDef& node_def = *node_defs_[n];
     if (!IsValidNodeName(node_def.name(), opts_.allow_internal_ops)) {
@@ -390,6 +446,8 @@ Status GraphConstructor::BuildNodeIndex() {
             "': Control dependencies must come after regular dependencies");
       }
     }
+    // Update gdef_prefixes_.
+    AddPrefixes(node_def.name(), &gdef_prefixes_);
   }
   return Status::OK();
 }
@@ -418,6 +476,7 @@ Status GraphConstructor::InitFromEdges() {
   // Parse the inputs for each node.
   for (int n = 0; n < num_nodes; ++n) {
     const NodeDef& node_def = *node_defs_[n];
+    int pending_count = node_def.input_size();
     if (IsMerge(node_def)) {
       // Cycles in the graph are only allowed for while loops. A while loop is
       // identified by an edge from a NextIteration node to a Merge node. For
@@ -438,35 +497,41 @@ Status GraphConstructor::InitFromEdges() {
         }
       }
       if (has_loop_back_edge) {
-        pending_count_.push_back(num_control_edges + 1);
-      } else {
-        pending_count_.push_back(node_def.input_size());
+        pending_count = num_control_edges + 1;
       }
-    } else {
-      pending_count_.push_back(node_def.input_size());
-    }
-    if (node_def.input_size() == 0) {
-      ready_.push_back(n);
-      continue;
     }
     for (int i = 0; i < node_def.input_size(); ++i) {
       StringPiece input_name = node_def.input(i);
       TensorId id(ParseTensorName(input_name));
-      auto iter = gdef_nodes_.find(id.first);
-      if (iter == gdef_nodes_.end()) {
-        return errors::InvalidArgument("Node '", node_def.name(),
-                                       "': Unknown input node '",
-                                       node_def.input(i), "'");
+      if (opts_.input_map.count(id) == 0) {
+        // If an input is not mapped, then the input should appear in the graph
+        // being imported.
+        auto iter = gdef_nodes_.find(id.first);
+        if (iter == gdef_nodes_.end()) {
+          return errors::InvalidArgument("Node '", node_def.name(),
+                                         "': Unknown input node '",
+                                         node_def.input(i), "'");
+        }
+        outputs_[iter->second.gdef_index].push_back(n);
+      } else {
+        // This input is mapped to an existing edge. Therefore this input is
+        // as good as being already processed.
+        --pending_count;
+        DCHECK_GE(pending_count, 0);
       }
-      outputs_[iter->second.gdef_index].push_back(n);
     }
+    if (pending_count == 0) {
+      ready_.push_back(n);
+    }
+    pending_count_.push_back(pending_count);
   }
   return Status::OK();
 }
 
 Status GraphConstructor::ValidateColocationConstraints(
     const NodeDef& node_def) {
-  if (!opts_.importing) return Status::OK();
+  if (!opts_.validate_colocation_constraints || !opts_.importing)
+    return Status::OK();
   const auto iter = node_def.attr().find(kColocationAttrName);
   if (iter == node_def.attr().end()) return Status::OK();
   for (const string& c : iter->second.list().s()) {
@@ -535,15 +600,36 @@ Status GraphConstructor::ValidateShape(Node* node) {
       const string& op = node->type_string();
       const std::vector<string> whitelist = {
           // To be removed after 2017/03/08.
-          "RandomShuffleQueue", "PaddingFIFOQueue", "FIFOQueue",
-          "PriorityQueue", "QueueSize", "Stack", "Barrier", "BarrierReadySize",
-          "BarrierIncompleteSize", "HashTable", "MutableHashTable",
-          "MutableHashTableOfTensors", "Mutex", "CuckooTable", "IndexTable",
-          "WholeFileReader", "TextLineReader", "FixedLengthRecordReader",
-          "TFRecordReader", "IdentityReader", "RefSwitch", "RefEnter",
-          "RefNextIteration", "RefMerge", "RefIdentity", "LMDBReader",
+          "RandomShuffleQueue",
+          "PaddingFIFOQueue",
+          "FIFOQueue",
+          "PriorityQueue",
+          "QueueSize",
+          "Stack",
+          "Barrier",
+          "BarrierReadySize",
+          "BarrierIncompleteSize",
+          "HashTable",
+          "MutableHashTable",
+          "MutableHashTableOfTensors",
+          "Mutex",
+          "CuckooTable",
+          "IndexTable",
+          "WholeFileReader",
+          "TextLineReader",
+          "FixedLengthRecordReader",
+          "TFRecordReader",
+          "IdentityReader",
+          "RefSwitch",
+          "RefEnter",
+          "RefNextIteration",
+          "RefMerge",
+          "RefIdentity",
+          "LMDBReader",
           // To be removed after 2017/04/24.
-          "ConditionalAccumulator", "SparseConditionalAccumulator", "Table",
+          "ConditionalAccumulator",
+          "SparseConditionalAccumulator",
+          "Table",
       };
       if (std::find(whitelist.begin(), whitelist.end(), op) ==
           whitelist.end()) {
@@ -663,19 +749,18 @@ void GraphConstructor::AddControlDependencies(
 
 void GraphConstructor::AddPrefixToNodeDef(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
-  const string& prefix = opts_.prefix;
-  if (prefix.empty()) return;
-  node_def->set_name(strings::StrCat(prefix, node_def->name()));
+  if (prefix_.empty()) return;
+  node_def->set_name(strings::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
     StringPiece input(node_def->input(i));
     // Skip remapped inputs (which already exist in g_ and are not being
-    // imported)
+    // imported).
     if (input_already_exists[i]) continue;
     if (input.Consume("^")) {
-      node_def->set_input(i, strings::StrCat("^", prefix, input));
+      node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
-      node_def->set_input(i, strings::StrCat(prefix, input));
+      node_def->set_input(i, strings::StrCat(prefix_, input));
     }
   }
   // Update names of colocation groups
@@ -685,12 +770,85 @@ void GraphConstructor::AddPrefixToNodeDef(
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
       if (v.Consume(kColocationGroupPrefix)) {
-        list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix, v));
+        list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
+      }
+    }
+  }
+}
+
+void GraphConstructor::UniquifyNames(
+    const std::vector<bool>& input_already_exists, NodeDef* node_def) {
+  if (NameExistsInGraph(node_def->name())) {
+    string old_name = node_def->name();
+    node_def->set_name(FindUniqueName(node_def->name()));
+    uniquified_names_[old_name] = node_def->name();
+    // Note that we don't have to update gdef_nodes_ or gdef_prefixes_ with
+    // `name` because we guarantee the original NodeDef names are unique,
+    // meaning we won't generate this name again.
+  }
+  for (int i = 0; i < node_def->input_size(); ++i) {
+    // Skip remapped inputs (which already exist in g_ and are not being
+    // imported).
+    if (input_already_exists[i]) continue;
+    TensorId id = ParseTensorName(node_def->input(i));
+    // We require that UniquifyNames() is called on all NodeDefs in topological
+    // order. This guarantees that node_def's inputs will already be uniquified
+    // if necessary.
+    auto iter = uniquified_names_.find(id.first.ToString());
+    if (iter == uniquified_names_.end()) continue;
+    id.first = iter->second;
+    node_def->set_input(i, id.ToString());
+  }
+}
+
+void GraphConstructor::UpdateUniquifiedColocationNames() {
+  for (const auto& pair : gdef_nodes_) {
+    Node* node = pair.second.node;
+    if (node == nullptr) continue;
+    std::vector<string> coloc_values;
+    Status status =
+        GetNodeAttr(node->attrs(), kColocationAttrName, &coloc_values);
+    if (!status.ok()) continue;
+    bool updated = false;
+    for (int i = 0; i < coloc_values.size(); ++i) {
+      StringPiece val(coloc_values[i]);
+      if (val.Consume(kColocationGroupPrefix)) {
+        const auto& name_pair = uniquified_names_.find(val.ToString());
+        if (name_pair == uniquified_names_.end()) continue;
+        updated = true;
+        coloc_values[i] =
+            strings::StrCat(kColocationGroupPrefix, name_pair->second);
       }
     }
+    if (updated) {
+      node->AddAttr(kColocationAttrName, coloc_values);
+    }
   }
 }
 
+bool GraphConstructor::NameExistsInGraph(StringPiece name) {
+  if (existing_nodes_.find(name) != existing_nodes_.end()) return true;
+  if (existing_prefixes_.find(name) != existing_prefixes_.end()) return true;
+  return false;
+}
+
+bool GraphConstructor::NameExistsInGraphDef(StringPiece name) {
+  if (gdef_nodes_.find(name) != gdef_nodes_.end()) return true;
+  if (gdef_prefixes_.find(name) != gdef_prefixes_.end()) return true;
+  return false;
+}
+
+string GraphConstructor::FindUniqueName(StringPiece original_name) {
+  string name = original_name.ToString();
+  int count = 0;
+  // Check that any generated names don't collide with imported NodeDefs (as
+  // well as nodes in g_).
+  while (NameExistsInGraph(name) || (count > 0 && NameExistsInGraphDef(name))) {
+    name = strings::StrCat(original_name, "_", ++count);
+  }
+  return name;
+}
+
 Status GraphConstructor::IsNodeFullyMapped(const NodeDef& node_def,
                                            bool* is_node_mapped) {
   const OpDef* op_def;
@@ -825,7 +983,11 @@ Status GraphConstructor::Convert() {
 
     Node* node;
     if (opts_.importing) {
-      AddPrefixToNodeDef(input_already_exists, &imported_node_def);
+      if (!prefix_.empty()) {
+        AddPrefixToNodeDef(input_already_exists, &imported_node_def);
+      } else if (opts_.uniquify_names) {
+        UniquifyNames(input_already_exists, &imported_node_def);
+      }
       TF_RETURN_IF_ERROR(ModifyNodeDefForImport(&imported_node_def));
     }
     TF_RETURN_IF_ERROR(MakeNode(*node_def, &node));
diff --git a/tensorflow/core/graph/graph_constructor.h b/tensorflow/core/graph/graph_constructor.h
index a3644788788544728193e4f648fa562e1275ffdc..b4dd2ba51a6f731caf5fe3e79bdca84291268ea2 100644
--- a/tensorflow/core/graph/graph_constructor.h
+++ b/tensorflow/core/graph/graph_constructor.h
@@ -54,13 +54,28 @@ extern Status ConvertNodeDefsToGraph(const GraphConstructorOptions& opts,
 
 // Options for calling ImportGraphDef().
 struct ImportGraphDefOptions {
-  ImportGraphDefOptions() : skip_mapped_nodes(false) {}
+  ImportGraphDefOptions()
+      : uniquify_names(false),
+        uniquify_prefix(false),
+        skip_mapped_nodes(false) {}
 
   // Name prefix to use for nodes imported from the GraphDef.  For example, if
   // prefix="animals" and GraphDef contains a node "bunny" then the node will be
-  // named "animals/bunny" in *g.
+  // named "animals/bunny" in *g. Must not be already used as a node name or
+  // prefix in the graph.
   string prefix;
 
+  // If true, imported node names will be modified if their name already exists
+  // in the graph. If false, conflicting names will be treated as an error. Note
+  // that this option has no effect if `prefix` is specified, since `prefix`
+  // will guarantee all node names are unique.
+  bool uniquify_names;
+
+  // If true, `prefix` will be modified if it already exists as a node name or
+  // prefix in the graph. If false, a conflicting prefix will be treated as an
+  // error. This option has no effect if `prefix` isn't specified.
+  bool uniquify_prefix;
+
   // Maps tensors in `gdef` to existing tensors in `g`. Inputs in `gdef`
   // corresponding to `input_map` keys will be remapped to the nodes in `g`
   // corresponding to the values.
@@ -112,6 +127,9 @@ struct ImportGraphDefOptions {
   // TODO(skyewm): make this work with `skip_mapped_nodes` if there's a need.
   std::vector<string> return_nodes;
 
+  // If true, checks that all colocation constraints are nodes in the GraphDef.
+  bool validate_colocation_constraints = true;
+
   // TODO(ashankar): Enable handling of GraphDefs produced by newer binaries
   // with ops that are not defined in the binary calling ImportGraphDef.
   // Similar to the producer_op_list argument to import_graph_def in the
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 5242c56ce6de63fbe1d03e596cc6123844be4a50..9be3de23881860995ba3727e37b3861e56504ed4 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -1475,6 +1475,43 @@ TEST_F(GraphConstructorTest, ImportGraphDef_InputMapUnusedKeys) {
   EXPECT_EQ(results.unused_input_map_keys, expected_unused_keys);
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDef_InputMapWithUnboundInput) {
+  ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
+
+  // Populate graph with node we'll use in input map
+  ExpectOK("node { name: 'input' op: 'TestInput' }", ImportGraphDefOptions(),
+           &refiner);
+
+  // Create input_map and use it to import more nodes
+  ImportGraphDefOptions opts;
+  opts.input_map[TensorId("new_input", 0)] = TensorId("input", 1);
+  opts.input_map[TensorId("new_input", 1)] = TensorId("input", 0);
+
+  // new_input exists in input_map but not in the graph being imported.
+  ExpectOK(
+      R"EOF(
+      node { name: 't1' op: 'TestMul' input: [ 'new_input:0', 'new_input:1' ] }
+      node { name: 't2' op: 'TestMul' input: [ 't1:0', 't1:0' ] }
+      )EOF",
+      opts, &refiner);
+
+  EXPECT_TRUE(HasNode("input"));
+  EXPECT_TRUE(HasNode("t1"));
+  EXPECT_TRUE(HasNode("t2"));
+  EXPECT_FALSE(HasNode("new_input"));
+
+  EXPECT_TRUE(HasEdge("input", 1, "t1", 0));
+  EXPECT_TRUE(HasEdge("input", 0, "t1", 1));
+  // Test that t2 is unaffected
+  EXPECT_TRUE(HasEdge("t1", 0, "t2", 0));
+
+  // Check that t1's NodeDef is consistent with graph
+  Node* t1 = FindNode("t1");
+  ASSERT_EQ(t1->requested_inputs().size(), 2);
+  ASSERT_EQ(t1->requested_inputs()[0], "input:1");
+  ASSERT_EQ(t1->requested_inputs()[1], "input:0");
+}
+
 TEST_F(GraphConstructorTest, ImportGraphDef_SkipMappedNodes_FullyMapped) {
   ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
 
@@ -1731,6 +1768,218 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ReturnNodesErrors) {
                "currently supported"});
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames) {
+  ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
+
+  const char* graph_def_str =
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }";
+
+  // Initial import
+  ImportGraphDefOptions opts;
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("A");
+  opts.return_nodes.push_back("B");
+  ImportGraphDefResults results;
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A");
+
+  // Repeat the same import
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_1");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_1:0");
+
+  // Repeat the same import again
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_2");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_2:0");
+
+  // Import with an already-used prefix
+  opts.prefix = "A";
+  opts.uniquify_prefix = true;
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_3/A");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A_3/B");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_3/A");
+
+  // Create B_3 node to keep the A/B numbering in sync
+  opts = ImportGraphDefOptions();
+  ExpectOK("node { name: 'B_3' op: 'TestInput' }");
+
+  // Import with existing de-duped node names
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("A_1");
+  opts.return_nodes.push_back("B_1");
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A_1' op: 'TestInput' }"
+      "node { name: 'B_1' op: 'TestOneInputTwoOutputs' input: ['A_1:0'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_1_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_1_1");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A_1_1:0");
+
+  // Import with node names that must be de-duped from names and prefixes that
+  // exist in both the existing graph and the GraphDef being imported.
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("A");
+  opts.return_nodes.push_back("A_4");
+  opts.return_nodes.push_back("B");
+  opts.return_nodes.push_back("B_4/B");
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'A_4' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }"
+      "node { name: 'B_4/B' op: 'TestOneInputTwoOutputs' input: ['A_4'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 4);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_5");
+  EXPECT_EQ(results.return_nodes[1]->name(), "A_4");
+  EXPECT_EQ(results.return_nodes[2]->name(), "B_5");
+  EXPECT_EQ(results.return_nodes[2]->def().input(0), "A_5:0");
+  EXPECT_EQ(results.return_nodes[3]->name(), "B_4/B");
+  EXPECT_EQ(results.return_nodes[3]->def().input(0), "A_4");
+
+  // Create node with prefix and then import node with same name
+  ExpectOK("node { name: 'foo/abc' op: 'ABC' }");
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("foo");
+  results = ImportGraphDefResults();
+  ExpectOK("node { name: 'foo' op: 'TestInput' }", opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 1);
+  EXPECT_EQ(results.return_nodes[0]->name(), "foo_1");
+
+  // Imported nodes can't conflict with intermediate name (but can conflict with
+  // outer name)
+  ExpectOK("node { name: 'outer/inner/abc' op: 'ABC' }");
+
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("outer");
+  opts.return_nodes.push_back("inner");
+  opts.return_nodes.push_back("abc");
+  opts.return_nodes.push_back("outer/inner");
+  opts.return_nodes.push_back("outer/inner/abc");
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'outer' op: 'TestInput' }"
+      "node { name: 'inner' op: 'TestInput' }"
+      "node { name: 'abc' op: 'TestInput' }"
+      "node { name: 'outer/inner' op: 'TestInput' }"
+      "node { name: 'outer/inner/abc' op: 'TestInput' }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 5);
+  EXPECT_EQ(results.return_nodes[0]->name(), "outer_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "inner");
+  EXPECT_EQ(results.return_nodes[2]->name(), "abc");
+  EXPECT_EQ(results.return_nodes[3]->name(), "outer/inner_1");
+  EXPECT_EQ(results.return_nodes[4]->name(), "outer/inner/abc_1");
+
+  // Import with input map containing conflicting names
+  opts = ImportGraphDefOptions();
+  opts.uniquify_names = true;
+  opts.input_map[TensorId("A", 0)] = TensorId("A", 0);
+  opts.input_map[TensorId("B", 0)] = TensorId("B", 0);
+  opts.return_nodes.push_back("A");
+  opts.return_nodes.push_back("B");
+  results = ImportGraphDefResults();
+  ExpectOK(graph_def_str, opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_6");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_6");
+  EXPECT_EQ(results.return_nodes[1]->def().input(0), "A:0");
+}
+
+TEST_F(GraphConstructorTest, ImportGraphDef_UniquifyNames_ColocationGroups) {
+  ShapeRefiner refiner(TF_GRAPH_DEF_VERSION, graph_.op_registry());
+
+  // Create nodes 'A' and 'b"
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A'] }");
+
+  // Check that colocation groups are updated
+  ImportGraphDefOptions opts;
+  opts.uniquify_names = true;
+  opts.return_nodes.push_back("A");
+  opts.return_nodes.push_back("B");
+  ImportGraphDefResults results;
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] "
+      "       attr { key: '_class' value { list { s:'loc:@A' } } } }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_1");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_1");
+  const AttrValue* class_attr =
+      results.return_nodes[1]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@A_1");
+
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_2");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_2");
+  class_attr = results.return_nodes[0]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_2");
+
+  results = ImportGraphDefResults();
+  ExpectOK(
+      "node { name: 'A' op: 'TestInput' "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }"
+      "node { name: 'B' op: 'TestOneInputTwoOutputs' input: ['A:0'] "
+      "       attr { key: '_class' value { list { s:'loc:@B' } } } }",
+      opts, &refiner, &results);
+
+  ASSERT_EQ(results.return_nodes.size(), 2);
+  EXPECT_EQ(results.return_nodes[0]->name(), "A_3");
+  EXPECT_EQ(results.return_nodes[1]->name(), "B_3");
+  class_attr = results.return_nodes[0]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_3");
+  class_attr = results.return_nodes[1]->attrs().Find(kColocationAttrName);
+  ASSERT_TRUE(class_attr != nullptr);
+  ASSERT_EQ(class_attr->list().s_size(), 1);
+  EXPECT_EQ(class_attr->list().s(0), "loc:@B_3");
+}
+
 TEST_F(GraphConstructorTest, ImportGraphDef_WithCycle) {
   // Test graph produced in python using:
   /*
@@ -2157,7 +2406,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) {
   } while (0)
 
   EXPECT_IMPORT_FAILURE(def, opts,
-                        "Node 'scope/A' already exists in the Graph");
+                        "Node name 'scope/A' already exists in the Graph");
 
   GraphDef bad_def;
   ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
@@ -2240,7 +2489,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) {
   ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
       "node{name:'scope/A' op:'TestParams'}", &bad_def));
   EXPECT_IMPORT_FAILURE(bad_def, opts,
-                        "Node 'scope/A' already exists in the Graph");
+                        "Node name 'scope/A' already exists in the Graph");
 
   parsed = protobuf::TextFormat::ParseFromString(
       R"EOF(
@@ -2848,5 +3097,20 @@ versions {
   EXPECT_EQ(17, refiner.graph_def_version());
 }
 
+TEST_F(GraphConstructorTest, ImportGraphDef_ValidateColationConstraints) {
+  GraphDef def;
+  ASSERT_TRUE(protobuf::TextFormat::ParseFromString(
+      "node { name: 'A' op: 'TestInput' attr { key: '_class' value { list { "
+      "s:'loc:@missing' } } } }",
+      &def));
+  ImportGraphDefOptions options;
+  // TODO(yaozhang): Extend ExpectError to check error type and use ExpectError
+  // and ExpectOK to replace the code below.
+  Status s = ImportGraphDef(options, def, &graph_, nullptr);
+  EXPECT_TRUE(errors::IsInvalidArgument(s)) << s;
+  options.validate_colocation_constraints = false;
+  TF_EXPECT_OK(ImportGraphDef(options, def, &graph_, nullptr));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index 4d9fe1dee977ca1c3341805be31f462ef472d4cc..a2c0c4d553e7229ae7e0f116691d8f717fe77f87 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -99,6 +99,10 @@ class GraphDefBuilder {
     // Use this to skip processing that may depend on prior results.
     bool HaveError() const { return status_ != nullptr && !status_->ok(); }
 
+    // Returns a string representation of the status associated with *this.
+    // Returns the string `"OK"` if the status doesn't have any error.
+    string StatusToString() const { return status_->ToString(); }
+
     // Given the Op type name, return a name for a node of that type.
     // Uses the value set in WithName() if that has been called.  Otherwise,
     // returns a name built out of the Op type name.
@@ -165,6 +169,20 @@ class GraphDefBuilder {
   // by name), and makes sure the resulting graph is valid.
   Status ToGraph(Graph* graph) const;
 
+  // Adds the function and gradient definitions in `fdef_lib` to this graph's op
+  // registry. Ignores duplicate functions, and returns a bad status if an
+  // imported function differs from an existing function or op with the same
+  // name.
+  Status AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
+    return graph_.AddFunctionLibrary(fdef_lib);
+  }
+
+  // Returns whether a user-defined function with `name` already exists in the
+  // graph.
+  bool HasFunction(const string& name) {
+    return graph_.flib_def().Find(name) != nullptr;
+  }
+
  private:
   Graph graph_;
   Status status_;
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index b9e3cba03501cde0079283c9d9030f420103b5dc..add80eda23d7887fb06902c0b123c03db8f4cccf 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -117,7 +117,7 @@ DataType EdgeType(const Edge* e) {
   }
 }
 
-// Return true iff we need to add a same device send/recv for 'edge'.
+// Return true iff we need to add the same device send/recv for 'edge'.
 bool NeedSameDeviceSendRecv(const Edge* edge, const GraphInfo& info) {
   if (edge->IsControlEdge()) {
     return false;
@@ -1116,7 +1116,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
         // before the data is available.
         AddInput(real_recv, send->name(), Graph::kControlSlot);
       } else if (control_flow_edge != nullptr) {
-        // Redirect control edge to the real recv since this is not a same
+        // Redirect control edge to the real recv since this is not the same
         // device send/recv.
         --num_control_flow_edges;
         AddInput(real_recv, control_flow_edge->src()->name(),
@@ -1152,7 +1152,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     // Add control edges from 'ref_control_inputs' to 'ref_recvs'.
     // NOTE(yuanbyu): Adding these control edges should not introduce
     // deadlocks. 'dst' has implicit "read" nodes that, when we split
-    // across devices, are made explicit; Retargettig the dependencies
+    // across devices, are made explicit; Retargeting the dependencies
     // to 'dst' to those nodes would not introduce cycles if there isn't
     // one before the transformation.
     // NOTE(yuanbyu): This may impact performance because it defers the
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e5d57facaa7c6ca3e0dc26a886110e3c0097c98a..e2ce0ba046f26b69bdb8f427afeb480727977844 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -118,6 +118,23 @@ class GraphTest : public ::testing::Test {
     LOG(FATAL) << name;
   }
 
+  bool ControlEdgeExistsInGraphOrNodeDef(const Node* src, const Node* dst) {
+    for (const Edge* e : dst->in_edges()) {
+      if (e->IsControlEdge() && e->src() == src &&
+          e->src_output() == Graph::kControlSlot &&
+          e->dst_input() == Graph::kControlSlot) {
+        return true;
+      }
+    }
+    std::string control_edge_name = strings::StrCat("^", src->name());
+    for (int i = 0; i < dst->def().input_size(); ++i) {
+      if (dst->def().input(i) == control_edge_name) {
+        return true;
+      }
+    }
+    return false;
+  }
+
   Graph graph_;
 
  private:
@@ -458,8 +475,8 @@ TEST_F(GraphTest, AddControlEdge) {
   EXPECT_TRUE(edge == nullptr);
   EXPECT_EQ(b->def().input_size(), 2);
 
-  // Can add redundant control edge with create_duplicate.
-  edge = graph_.AddControlEdge(a, b, /*create_duplicate=*/true);
+  // Can add redundant control edge with allow_duplicates.
+  edge = graph_.AddControlEdge(a, b, /*allow_duplicates=*/true);
   EXPECT_TRUE(edge != nullptr);
   // create_duplicate causes the NodeDef not to be updated.
   ASSERT_EQ(b->def().input_size(), 2);
@@ -477,6 +494,47 @@ TEST_F(GraphTest, AddControlEdge) {
   EXPECT_EQ(b->def().input_size(), 2);
 }
 
+TEST_F(GraphTest, RemoveControlEdge) {
+  FromGraphDef(
+      "node { name: 'A' op: 'OneOutput' }"
+      "node { name: 'B' op: 'OneInputTwoOutputs' input: [ 'A:0' ] }"
+      "node { name: 'C' op: 'NoOp' } ");
+  Node* a = FindNode("A");
+  Node* b = FindNode("B");
+  Node* c = FindNode("C");
+
+  // Add a control edge.
+  const Edge* edge_1 = graph_.AddControlEdge(c, a);
+  const Edge* edge_2 = graph_.AddControlEdge(a, b);
+  ASSERT_TRUE(edge_1 != nullptr);
+  ASSERT_TRUE(edge_2 != nullptr);
+
+  ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(c, a));
+  ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
+
+  graph_.RemoveControlEdge(edge_1);
+  ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
+  ASSERT_TRUE(ControlEdgeExistsInGraphOrNodeDef(a, b));
+
+  graph_.RemoveControlEdge(edge_2);
+  ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
+  ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(a, b));
+
+  // Test removing a duplicate control edge.
+  // Note that unless allow_duplicates is true, the duplicate edge
+  // will not be added. That's why we expect edge_4 to be a null
+  // pointer. We are not testing with allow_duplicates set to true,
+  // as that is a highly unlikely use case that does not make much
+  // sense.
+  const Edge* edge_3 = graph_.AddControlEdge(c, a);
+  const Edge* edge_4 = graph_.AddControlEdge(c, a);
+  ASSERT_TRUE(edge_3 != nullptr);
+  ASSERT_TRUE(edge_4 == nullptr);
+
+  graph_.RemoveControlEdge(edge_3);
+  ASSERT_TRUE(!ControlEdgeExistsInGraphOrNodeDef(c, a));
+}
+
 TEST_F(GraphTest, UpdateEdge) {
   // Build a little graph
   Node* a = FromNodeDef("A", "OneOutput", 0);
@@ -511,6 +569,13 @@ TEST_F(GraphTest, UpdateEdge) {
   EXPECT_EQ(
       s.error_message(),
       "Node 'A' (type: 'OneOutput', num of outputs: 1) does not have output 1");
+
+  // Update a's 1st input which is out of range.
+  s = graph_.UpdateEdge(c, 0, a, 0);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(
+      s.error_message(),
+      "Node 'A' (type: 'OneOutput', num of inputs: 0) does not have input 0");
 }
 
 TEST_F(GraphTest, InputEdges) {
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index cb32d643347c106cdd552522c789bec45709a606..3df981437afed760744ef870fd542d7abdd6e25d 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -21,107 +21,102 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
-// Since our ops are going to produce and also consume N addition tensors
-// (Mkl) for N Tensorflow tensors, we can have following different
-// orderings among these 2N tensors.
-//
-// E.g., for Tensorflow tensors A, B, and C, our ops will produce and
-// consume A_m, B_m, and C_m additionally.
-//
-// INTERLEAVED: in this case 2N tensors are interleaved. So for above
-//              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
-//
-// CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
-//             by N Mkl tensors. So for above example, the ordering looks
-//             like: A, B, C, A_m, B_m, C_m
-//
-// Following APIs map index of original Tensorflow tensors to their
-// appropriate position based on selected ordering. For contiguous ordering,
-// we need to know the total number of tensors (parameter total).
-//
-typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
-// NOTE: Currently, we use contiguous ordering. If you change this, then you
-// would need to change Mkl op definitions in nn_ops.cc.
-static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
+  // Since our ops are going to produce and also consume N addition tensors
+  // (Mkl) for N Tensorflow tensors, we can have following different
+  // orderings among these 2N tensors.
+  //
+  // E.g., for Tensorflow tensors A, B, and C, our ops will produce and
+  // consume A_m, B_m, and C_m additionally.
+  //
+  // INTERLEAVED: in this case 2N tensors are interleaved. So for above
+  //              example, the ordering looks like: A, A_m, B, B_m, C, C_m.
+  //
+  // CONTIGUOUS: in thi case N Tensorflow tensors are contiguous followed
+  //             by N Mkl tensors. So for above example, the ordering looks
+  //             like: A, B, C, A_m, B_m, C_m
+  //
+  // Following APIs map index of original Tensorflow tensors to their
+  // appropriate position based on selected ordering. For contiguous ordering,
+  // we need to know the total number of tensors (parameter total).
+  //
+  typedef enum { TENSORS_INTERLEAVED, TENSORS_CONTIGUOUS } MklTfTensorOrdering;
+  // NOTE: Currently, we use contiguous ordering. If you change this, then you
+  // would need to change Mkl op definitions in nn_ops.cc.
+  static MklTfTensorOrdering kTensorOrdering = TENSORS_CONTIGUOUS;
 
-// Get index of MetaData tensor from index 'n' of Data tensor.
-inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
-  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-    // For interleaved ordering, Mkl tensor follows immediately after
-    // Tensorflow tensor.
-    return n + 1;
-  } else {
-    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-    // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
-    return n + total_tensors / 2;
+  // Get index of MetaData tensor from index 'n' of Data tensor.
+  inline int DataIndexToMetaDataIndex(int n, int total_tensors) {
+    if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+      // For interleaved ordering, Mkl tensor follows immediately after
+      // Tensorflow tensor.
+      return n + 1;
+    } else {
+      CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+      // For contiguous ordering, Mkl tensor is n+total_tensors / 2 away.
+      return n + total_tensors / 2;
+    }
   }
-}
 
-int inline GetTensorDataIndex(int n, int total_tensors) {
-  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
-    return 2 * n;  // index corresponding to nth input/output tensor
-  } else {
-    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
-    return n;
-  }
-}
+  int inline GetTensorDataIndex(int n, int total_tensors) {
+      if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+        return 2 * n;  // index corresponding to nth input/output tensor
+      } else {
+        CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+        return n;
+      }
+    }
 
-int inline GetTensorMetaDataIndex(int n, int total_tensors) {
-  // Get index for TensorData first and then use mapping function
-  // to get TensorMetaData index from TensorData index.
-  int tidx = GetTensorDataIndex(n, total_tensors);
-  return DataIndexToMetaDataIndex(tidx, total_tensors);
-}
+  int inline GetTensorMetaDataIndex(int n, int total_tensors) {
+      // Get index for TensorData first and then use mapping function
+      // to get TensorMetaData index from TensorData index.
+      int tidx = GetTensorDataIndex(n, total_tensors);
+      return DataIndexToMetaDataIndex(tidx, total_tensors);
+    }
 
 namespace mkl_op_registry {
-static const char* kMklOpLabel = "MklOp";
-static const char* kMklOpLabelPattern = "label='MklOp'";
-
-// Get the name of Mkl op from original TensorFlow op
-// We prefix 'Mkl' to the original op to get Mkl op.
-inline string GetMklOpName(const string& name) {
+  static const char* kMklOpLabel = "MklOp";
+  static const char* kMklOpLabelPattern = "label='MklOp'";
   // Prefix that we add to Tensorflow op name to construct Mkl op name.
-  const char* const kMklOpPrefix = "_Mkl";
-  return string(kMklOpPrefix) + name;
-}
+  static const char* const kMklOpPrefix = "_Mkl";
 
-// Check whether opname with type T is registered as MKL-compliant.
-//
-// @input: name of the op
-// @input: T datatype to be used for checking op
-// @return: true if opname is registered as Mkl op; false otherwise
-static inline bool IsMklOp(const std::string& op_name, DataType T) {
-  string kernel = KernelsRegisteredForOp(op_name);
-  bool result =
-      kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
-  if (result) {
-    VLOG(1) << "mkl_op_registry::" << op_name << " is " << kMklOpLabel;
+  // Get the name of Mkl op from original TensorFlow op
+  // We prefix 'Mkl' to the original op to get Mkl op.
+  inline string GetMklOpName(const string& name) {
+    return string(kMklOpPrefix) + name;
   }
-  return result;
-}
 
-// Check whether opname with type T is registered as MKL-compliant and
-// is element-wise.
-//
-// @input: name of the op
-// @input: T datatype to be used for checking op
-// @return: true if opname is registered as element-wise Mkl op;
-// false otherwise
-static inline bool IsMklElementWiseOp(const std::string& op_name, DataType T) {
-  if (!IsMklOp(op_name, T)) {
-    return false;
+  // Check whether opname with type T is registered as MKL-compliant.
+  //
+  // @input: name of the op
+  // @input: T datatype to be used for checking op
+  // @return: true if opname is registered as Mkl op; false otherwise
+  static inline bool IsMklOp(const std::string& op_name, DataType T) {
+    string kernel = KernelsRegisteredForOp(op_name);
+    bool result =
+        kernel.find(kMklOpLabelPattern) != string::npos && (T == DT_FLOAT);
+    return result;
   }
 
-  bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
-                 0 == op_name.compare(GetMklOpName("Sub")) ||
-                 0 == op_name.compare(GetMklOpName("Mul")) ||
-                 0 == op_name.compare(GetMklOpName("Maximum")) ||
-                 0 == op_name.compare(GetMklOpName("SquaredDifference")));
+  // Check whether opname with type T is registered as MKL-compliant and
+  // is element-wise.
+  //
+  // @input: name of the op
+  // @input: T datatype to be used for checking op
+  // @return: true if opname is registered as element-wise Mkl op;
+  // false otherwise
+  static inline bool IsMklElementWiseOp(const std::string& op_name,
+    DataType T) {
+    if (!IsMklOp(op_name, T)) {
+      return false;
+    }
+    bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
+                    0 == op_name.compare(GetMklOpName("Sub")) ||
+                    0 == op_name.compare(GetMklOpName("Mul")) ||
+                    0 == op_name.compare(GetMklOpName("Maximum")) ||
+                    0 == op_name.compare(GetMklOpName("SquaredDifference")));
 
-  VLOG(1) << "mkl_op_registry::" << op_name
-          << " is elementwise MKL op: " << result;
-  return result;
-}
+    return result;
+  }
 }  // namespace mkl_op_registry
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index f4c9073deee0f51faaf853f01e6b40866682aa8c..3beca1e5d2922424972baf564e6b4601a9b3ee5b 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -37,11 +37,13 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/graph/mkl_layout_pass.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
 
 namespace tensorflow {
 
+#ifndef INTEL_MKL_DNN
+
 // This pass implements rewriting of graph to support following scenarios:
 // (A) Merging nodes in the graph
 // (B) Rewriting a node in the graph to a new node
@@ -2213,6 +2215,2087 @@ Status MklLayoutRewritePass::Run(
   return Status::OK();
 }
 
+#else  // INTEL_MKL_DNN
+
+// This pass implements rewriting of graph to support following scenarios:
+// (A) Merging nodes in the graph
+// (B) Rewriting a node in the graph to a new node
+//     Rewrite happens under following scenario:
+//     - Propagating Mkl layout as an additional output tensor
+//        (we will loosely call a tensor that carries Mkl layout as Mkl tensor
+//         henceforth.) from every Mkl supported NN layer.
+//
+// Example of A : Merging nodes in the graph
+// -----------------------------------------
+// Currently, we merge Conv2D+AddBias together. Consider Conv2D and BiasAdd as:
+//
+//           O = Conv2D(A, B)
+//           P = BiasAdd(O, C)
+//
+// We merge them into Conv2DWithBias as:
+//           P = _MklConv2DWithBias(A, A_m, B, B_m, C, C_m)
+//
+// The meaning of A_m, B_m and C_m is explained in B.1.
+//
+// Merge rules:
+//  - The merge for Conv2D and BiasAdd happens when the output of Conv2D _only_
+//    goes to BiasAdd.
+//  - Also, the intersection of attributes of both the nodes must have same
+//    values.
+//  - Both the nodes must have been assigned to same device (if any).
+//
+// Example of B.1 : Rewriting nodes to Mkl nodes
+// ---------------------------------------------
+// Consider a Relu node. Current definition of Relu node looks like:
+//
+//           O = Relu(A)
+//
+// Relu has 1 input (A), and 1 output (O).
+//
+// This rewrite pass will generate a new graph node for Relu (new node is
+// called MklRelu) as:
+//
+//          O, O_m = MklRelu(A, A_m)
+//
+// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). Here input A is
+// same as input A of Relu; output O is same as output O of Relu. O_m is the
+// additional output tensor that will be set by MklRelu, and it represents
+// Mkl tensor corresponding to O -- in other words, O_m is some kind of
+// metadata for O. A_m is additional input of Relu, and it represents metadata
+// for A - as O_m is metadata for O, A_m is metadata for A. MklRelu receives
+// this metadata from previous node in the graph.
+//
+// When a previous node in the graph is an Mkl node, A_m will represent a valid
+// Mkl tensor. But when a previous node is not an Mkl node, A_m will represent
+// a dummy Mkl tensor.
+//
+// Rewriting rules:
+//  - Selection of a node for rewriting happens by registering the op type of
+//    the node with the rewriting pass. If the op type is not registered, then
+//    all nodes of this op type will not be rewritten.
+//  - Number of inputs after rewriting:
+//      Since for every input Tensorflow tensor, the rewritten node gets Mkl
+//      tensor(s), rewritten node gets 2*N inputs, where N is the number of
+//      inputs for the original node.
+//  - Number of outputs after rewriting:
+//      Since for every output Tensorflow tensor, the rewritten node generates
+//      Mkl tensor(s), the rewritten node generates 2*N outputs, where N is the
+//      number of outputs of the original node.
+//  - Ordering of Tensorflow tensors and Mkl tensors:
+//      Since every rewritten node generates twice the number of inputs and
+//      outputs, one could imagine various orderings among Tensorflow tensors
+//      and Mkl tensors. E.g., assume an op 'Conv2D' that takes (A, B) as
+//      inputs, then the new op '_MklConv2D' can take inputs A, B, A_m and B_m
+//      in A, A_m, B, B_m order or it can also take them in A, B, A_m, B_m
+//      order. Among N inputs one can get N! permutations.
+//
+//      So the question is: which order do we follow? We support 2 types of
+//      orderings: (1) interleaved, and (2) contiguous. Interleaved ordering
+//      follows an intuitive order where an Mkl tensor follows the
+//      corresponding Tensorflow tensor immediately. In the context of the
+//      above example, it will be: A, A_m, B, B_m. Note that the ordering rule
+//      applies to both the inputs and outputs. Contiguous ordering means
+//      all the Tensorflow tensors are contiguous followed by all the Mkl
+//      tensors. We use contiguous ordering as default.
+//
+// Graph rewrite algorithm:
+//      Algorithm: Graph Rewrite
+//      Input: Graph G, Names of the nodes to rewrite and their new names
+//      Output: Modified Graph G' if the nodes are modified, G otherwise.
+//      Start:
+//        N = Topological_Sort(G) // N is a set of nodes in toposort order.
+//        foreach node n in N
+//        do
+//          if (Is_MKL_Op(n))  // Can this node accept an Mkl layout as input.
+//          then
+//            E = set of <incoming edge and its src_output slot> of n
+//            E' = {}   // a new set of edges for rewritten node
+//            foreach <e,s> in E
+//            do
+//              E' U {<e,s>}  // First copy edge which generates Tensorflow
+//                            // tensor as it is
+//              m = Source node of edge e
+//              if Is_Rewritten(m)  // Did we rewrite this node in this pass?
+//              then
+//                E' U {<m,s+1>}    // If yes, then m will generate an Mkl
+//                                  // tensor as an additional output.
+//              else
+//                d = Generate_Dummy_Mkl_Tensor()  // If not, generate a dummy
+//                                                 // Mkl tensor.
+//                E' U {<d,0>}  // The dummy Mkl tensor has only 1 output slot.
+//              fi
+//            done
+//            n' = Build_New_Node(G,new_name,E')
+//            Mark_Rewritten(n')  // Mark the new node as being rewritten.
+//          fi
+//        done
+//
+//      Explanation:
+//        For graph rewrite, we visit nodes of the input graph in the
+//        topological sort order. With this ordering, we visit nodes in the
+//        top-to-bottom fashion. We need this order because while visiting a
+//        node we want that all of its input nodes are visited and rewritten if
+//        applicable. This is because if we need to rewrite a given node
+//        then all of its input nodes need to be fixed (in other words they
+//        cannot be deleted later.)
+//
+//        While visiting a node, we first check if the op type of the node is
+//        an Mkl op. If it is, then we rewrite that node after constructing
+//        new inputs to the node. If the op type of the node is not Mkl op,
+//        then we do not rewrite that node.
+//
+// Handling workspace propagation for certain ops:
+//
+//        Certain backward ops in MKL (MaxPool, LRN and BatchNorm) require
+//        passing of a workspace from their respective forward ops. Workspace
+//        tensors provide memory for storing results of intermediate operations
+//        which are helpful in backward propagation. TensorFlow does not have
+//        a notion of a workspace and as a result does not allow producing
+//        additional outputs from these forward ops. For these ops, we need
+//        to add 2 extra edges between forward ops and their corresponding
+//        backward ops - the first extra edge carries a workspace tensor and
+//        the second one carries an Mkl tensor for the workspace tensor.
+//
+//        Example:
+//
+//        Typical graph for MaxPool and its gradient looks like:
+//
+//        A = MaxPool(T)
+//        B = MaxPoolGrad(X, A, Y)
+//
+//        We will transform this graph to propagate the workspace as:
+//        (with the contiguous ordering)
+//
+//        A, W, A_m, W_m = MklMaxPool(T, T_m)
+//        B, B_m = MklMaxPoolGrad(X, A, Y, W, X_m, A_m, Y_m, W_m)
+//
+//        Here W is the workspace tensor. Transformed tensor names with the
+//        suffix _m are Mkl tensors, and this transformation has been done
+//        using the algorithm discussed earlier. The transformation for
+//        workspace propagation only adds extra outputs (W, W_m) for a forward
+//        op and connects them to the corresponding backward ops.
+//
+//        Terms:
+//
+//        Forward op name = name of the op in the forward pass
+//          where a workspace tensor originates (MaxPool in this example)
+//        Backward op name = name of the op in the backward pass that receives
+//          a workspace tensor from the forward op (MaxPoolGrad in the example)
+//        Slot = Position of the output or input slot that will be
+//               used by the workspace tensor (1 for MklMaxPool as W is the 2nd
+//               output of MaxPool (0 is 1st); 3 for MklMaxPoolGrad)
+//
+//        Question:
+//
+//        How do we associate a backward op to a forward op? There can be more
+//        than one op with the exact same name.
+//
+//        In this example, we associate MaxPoolGrad with MaxPool. But there
+//        could be more than one MaxPool ops. To solve this problem, we look
+//        for _direct_ edge between a forward op and a backward op (tensor A is
+//        flowing along this edge in the example).
+//
+//        How do we transform forward and backward ops when there is no direct
+//        edge between them? In such a case, we generate dummy tensors for
+//        workspace tensors. For the example, transformation of MaxPool will
+//        be exactly same as it would be when there is a direct edge between
+//        the forward and the backward op --- it is just that MaxPool won't
+//        generate any workspace tensor. For MaxPoolGrad, the transformation
+//        will also be same, but instead of connecting W and W_m with the
+//        outputs of MaxPool, we will produce dummy tensors for them, and we
+//        will set workspace_enabled attribute to false.
+//
+class MklLayoutRewritePass : public GraphOptimizationPass {
+ public:
+  MklLayoutRewritePass() {
+    // NOTE: names are alphabetically sorted.
+    csinfo_.addn = "AddN";
+    csinfo_.avg_pool = "AvgPool";
+    csinfo_.avg_pool_grad = "AvgPoolGrad";
+    csinfo_.bias_add = "BiasAdd";
+    csinfo_.bias_add_grad = "BiasAddGrad";
+    csinfo_.concat = "Concat";
+    csinfo_.concatv2 = "ConcatV2";
+    csinfo_.conv2d = "Conv2D";
+    csinfo_.conv2d_with_bias = "__MklDummyConv2DWithBias";
+    csinfo_.conv2d_grad_input = "Conv2DBackpropInput";
+    csinfo_.conv2d_grad_filter = "Conv2DBackpropFilter";
+    csinfo_.conv2d_grad_filter_with_bias =
+                              "__MklDummyConv2DBackpropFilterWithBias";
+    csinfo_.fused_batch_norm = "FusedBatchNorm";
+    csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.identity = "Identity";
+    csinfo_.lrn = "LRN";
+    csinfo_.lrn_grad = "LRNGrad";
+    csinfo_.matmul = "MatMul";
+    csinfo_.max_pool = "MaxPool";
+    csinfo_.max_pool_grad = "MaxPoolGrad";
+    csinfo_.mkl_conv2d = "_MklConv2D";
+    csinfo_.mkl_conv2d_grad_input = "_MklConv2DBackpropInput";
+    csinfo_.mkl_conv2d_grad_filter = "_MklConv2DBackpropFilter";
+    csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
+    csinfo_.mkl_conv2d_grad_filter_with_bias =
+                                   "_MklConv2DBackpropFilterWithBias";
+    csinfo_.relu = "Relu";
+    csinfo_.relu_grad = "ReluGrad";
+    csinfo_.tanh       = "Tanh";
+    csinfo_.tanh_grad  = "TanhGrad";
+    csinfo_.reshape = "Reshape";
+    csinfo_.softmax = "Softmax";
+    csinfo_.split = "Split";
+    // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
+    // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
+    // MklInputConversion op is added before it.
+    csinfo_.add = "Add";
+    csinfo_.maximum = "Maximum";
+    csinfo_.mul = "Mul";
+    csinfo_.squared_difference = "SquaredDifference";
+    csinfo_.sub = "Sub";
+    // End - element-wise ops. See note above.
+
+    // NOTE: names are alphabetically sorted.
+    rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn),
+                      CopyAttrsAddN, AddNRewrite});
+    rinfo_.push_back({csinfo_.add,
+                      mkl_op_registry::GetMklOpName(csinfo_.add),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.avg_pool_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.concat,
+                      mkl_op_registry::GetMklOpName(csinfo_.concat),
+                      CopyAttrsConcat, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.concatv2,
+                      mkl_op_registry::GetMklOpName(csinfo_.concatv2),
+                      CopyAttrsConcatV2, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_with_bias,
+                      csinfo_.mkl_conv2d_with_bias,
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_filter_with_bias,
+                      csinfo_.mkl_conv2d_grad_filter_with_bias,
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.conv2d_grad_input,
+                      mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_input),
+                      CopyAttrsConv2D, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_batch_norm,
+                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_batch_norm_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
+                      CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.identity,
+                      mkl_op_registry::GetMklOpName(csinfo_.identity),
+                      CopyAttrsDataType, AlwaysRewrite});
+    /*
+    rinfo_.push_back({csinfo_.lrn,
+                      mkl_op_registry::GetMklOpName(csinfo_.lrn),
+                      CopyAttrsLRN, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.lrn_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
+                      CopyAttrsLRN, AlwaysRewrite});
+    */
+    rinfo_.push_back({csinfo_.max_pool,
+                      mkl_op_registry::GetMklOpName(csinfo_.max_pool),
+                      CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
+    rinfo_.push_back({csinfo_.max_pool_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad),
+                      CopyAttrsPooling, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.maximum,
+                      mkl_op_registry::GetMklOpName(csinfo_.maximum),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.mul,
+                      mkl_op_registry::GetMklOpName(csinfo_.mul),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.relu_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.relu_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.tanh,
+                      mkl_op_registry::GetMklOpName(csinfo_.tanh),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.tanh_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.tanh_grad),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.reshape,
+                      mkl_op_registry::GetMklOpName(csinfo_.reshape),
+                      CopyAttrsReshape, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.softmax,
+                      mkl_op_registry::GetMklOpName(csinfo_.softmax),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.squared_difference,
+                      mkl_op_registry::GetMklOpName(csinfo_.squared_difference),
+                      CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.sub,
+                      mkl_op_registry::GetMklOpName(csinfo_.sub),
+                      CopyAttrsDataType, AlwaysRewrite});
+
+    // Add info about which ops to add workspace edge to and the slots.
+    wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3});
+    wsinfo_.push_back({csinfo_.max_pool, csinfo_.max_pool_grad, 0, 1, 1, 3});
+
+    // Add a rule for merging nodes
+    minfo_.push_back({csinfo_.conv2d, csinfo_.bias_add,
+                      csinfo_.conv2d_with_bias,
+                      GetConv2DOrBiasAdd});
+
+    minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
+                      csinfo_.conv2d_grad_filter_with_bias,
+                      GetConv2DBackpropFilterOrBiasAddGrad});
+  }
+
+  // Standard interface to run pass
+  Status Run(const GraphOptimizationPassOptions& options);
+
+  // Helper function which does most of heavy lifting for rewriting
+  // Mkl nodes to propagate Mkl tensor as additional output
+  //
+  // Extracts common functionality between Run public interface and
+  // test interface.
+  //
+  // @return true, if and only if graph is mutated; false otherwise.
+  bool RunPass(std::unique_ptr<Graph>* g);
+
+  /// Structure to specify the name of an original node, its new name after
+  /// rewrite, the number of inputs to the original node, the function to
+  /// be used to copy attributes for the op, and the rule (if any) which
+  /// must hold for rewriting the node
+  typedef struct {
+    string name;      // Original name of op of the node in the graph
+    string new_name;  // New name of the op of the node in the graph
+    // A function handler to copy attributes from an old node to a new node.
+    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    // A rule under which to rewrite this node
+    std::function<bool(const Node*)> rewrite_rule;
+  } RewriteInfo;
+
+  /// Structure to specify a forward op, a backward op, and the slot numbers
+  /// in the forward and backward ops where we will add a workspace edge.
+  typedef struct {
+    string fwd_op;    // Name of a forward op in the graph
+    string bwd_op;    // Name of a backward op in the graph
+    int fwd_slot;     // Output slot in the forward op node where actual
+                      // output tensor resides
+    int bwd_slot;     // Input slot in the backward op node where actual
+                      // input tensor resides
+    int ws_fwd_slot;  // Output slot in the forward op node where workspace
+                      // edge is added
+    int ws_bwd_slot;  // Input slot in the backward op node where workspace
+                      // edge is added
+  } WorkSpaceInfo;
+
+  /// Structure to specify information used in node merge of 2 operators
+  typedef struct {
+    string op1;       // Node string for one operator.
+    string op2;       // Node string for second operator.
+    string new_node;  // Name of the node after merge
+    // Function that enables user of the node merger to specify how to find
+    // second operator given the first operator.
+    std::function<Node*(const Node*)> get_node_to_be_merged;
+  } MergeInfo;
+
+  /// Structure to store all constant strings
+  /// NOTE: names are alphabetically sorted.
+  typedef struct {
+    string addn;
+    string add;
+    string avg_pool;
+    string avg_pool_grad;
+    string bias_add;
+    string bias_add_grad;
+    string concat;
+    string concatv2;
+    string conv2d;
+    string conv2d_with_bias;
+    string conv2d_grad_input;
+    string conv2d_grad_filter;
+    string conv2d_grad_filter_with_bias;
+    string fused_batch_norm;
+    string fused_batch_norm_grad;
+    string identity;
+    string lrn;
+    string lrn_grad;
+    string matmul;
+    string max_pool;
+    string max_pool_grad;
+    string maximum;
+    string mkl_conv2d;
+    string mkl_conv2d_grad_input;
+    string mkl_conv2d_grad_filter;
+    string mkl_conv2d_grad_filter_with_bias;
+    string mkl_conv2d_with_bias;
+    string mul;
+    string relu;
+    string relu_grad;
+    string tanh;
+    string tanh_grad;
+    string reshape;
+    string softmax;
+    string split;
+    string squared_difference;
+    string sub;
+  } ConstStringsInfo;
+
+ private:
+  /// Maintain info about nodes to rewrite
+  std::vector<RewriteInfo> rinfo_;
+
+  /// Maintain info about nodes to add workspace edge
+  std::vector<WorkSpaceInfo> wsinfo_;
+
+  /// Maintain info about nodes to be merged
+  std::vector<MergeInfo> minfo_;
+
+  /// Maintain structure of constant strings
+  static ConstStringsInfo csinfo_;
+
+ private:
+  // Is OpDef::ArgDef a list type? It could be N * T or list(type).
+  // Refer to opdef.proto for details of list type.
+  inline bool ArgIsList(const OpDef::ArgDef& arg) const {
+    return !arg.type_list_attr().empty() || !arg.number_attr().empty();
+  }
+
+  // Get length of a list in 'n' if 'arg' is of list type. Refer to
+  // description of ArgIsList for definition of list type.
+  inline int GetTensorListLength(const OpDef::ArgDef& arg, Node* n) {
+    CHECK_EQ(ArgIsList(arg), true);
+    int N = 0;
+    const string attr_name = !arg.type_list_attr().empty()
+                                 ? arg.type_list_attr()
+                                 : arg.number_attr();
+    if (!arg.type_list_attr().empty()) {
+      std::vector<DataType> value;
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &value));
+      N = value.size();
+    } else {
+      TF_CHECK_OK(GetNodeAttr(n->def(), attr_name, &N));
+    }
+    return N;
+  }
+
+  // Can op represented by node 'n' run on DEVICE_CPU?
+  // Op can run on CPU with MKL if the runtime assigned device or the
+  // user requested device contains device CPU, or both are empty.
+  bool CanOpRunOnCPUDevice(const Node* n) {
+    bool result = true;
+    string reason;
+
+    // Substring that should be checked for in device name for CPU device.
+    const char* const kCPUDeviceSubStr = "CPU";
+
+    // If Op has been specifically assigned to a non-CPU device, then No.
+    if (!n->assigned_device_name().empty() &&
+        !StringPiece(n->assigned_device_name()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "Op has been assigned a runtime device that is not CPU.";
+    }
+
+    // If user has specifically assigned this op to a non-CPU device, then No.
+    if (!n->def().device().empty() &&
+        !StringPiece(n->def().device()).contains(kCPUDeviceSubStr)) {
+      result = false;
+      reason = "User has assigned a device that is not CPU.";
+    }
+
+    if (result == false) {
+      VLOG(1) << "MklLayoutRewritePass: Skipping rewriting of the node "
+              << n->type_string() << ", reason: " << reason;
+    }
+
+    // Otherwise Yes.
+    return result;
+  }
+
+  // Return a node that can be merged with input node 'n'
+  //
+  // @return pointer to the node if we can find such a
+  // node. Otherwise, it returns nullptr.
+  Node* CheckForNodeMerge(const Node* n) const;
+
+  // Merge node 'm' with node 'n'.
+  // Currently, we merge (1) Conv2D with BiasAdd, and (2) BiasAddGrad with
+  // Conv2DBackpropFilter.
+  //
+  // Input nodes m and n may be deleted if the call to
+  // this function is successful. Attempt to use the pointers
+  // after the call to function may result in undefined behaviors.
+  //
+  // @input g - input graph, m - graph node, n - graph node to be merged with m
+  // @return Status::OK(), if merging is successful and supported.
+  //         Returns appropriate Status error code otherwise.
+  //         Graph is updated in case nodes are merged. Otherwise, it is
+  //         not updated.
+  Status MergeNode(std::unique_ptr<Graph>* g, Node* m, Node* n);
+
+  // Helper function to merge different nodes
+  Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+  Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
+                                                  Node* m, Node* n);
+
+  // Find BiasAdd or Conv2D node that can be merged with input node 'm'.
+  // If input 'm' is BiasAdd, then check if there exists Conv2D node that can be
+  // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd
+  // node that can be merged with 'm'.
+  static Node* GetConv2DOrBiasAdd(const Node* m) {
+    CHECK_NOTNULL(m);
+    Node* n = nullptr;
+
+    if (m->type_string() == csinfo_.bias_add) {
+      // If a is BiasAdd, then Conv2D is 0th input of BiasAdd.
+      TF_CHECK_OK(m->input_node(0, &n));
+    } else {
+      CHECK_EQ(m->type_string(), csinfo_.conv2d);
+      // Go over all output edges and search for BiasAdd Node.
+      // 0th input of BiasAdd is Conv2D.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.bias_add &&
+            e->dst_input() == 0) {
+          n = e->dst();
+          break;
+        }
+      }
+    }
+
+    if (n == nullptr) {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Conv2D and BiasAdd node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
+
+  // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
+  // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
+  // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
+  // then check if there exists Conv2DBackpropFilter node that can be merged
+  // with 'm'.
+  //
+  // Graph that will allow us to connect Conv2DBackpropFilter with BiasAddGrad
+  // would look like:
+  //
+  // _ = Conv2DBackpropFilter(F, _, G)
+  // _ = BiasAddGrad(G)
+  //
+  // So 1st input of BiasAddGrad connects with 3rd input of
+  // Conv2DBackpropFilter and vice versa.
+  static Node* GetConv2DBackpropFilterOrBiasAddGrad(const Node* m) {
+    CHECK_NOTNULL(m);
+    Node* n = nullptr;
+
+    if (m->type_string() == csinfo_.bias_add_grad) {
+      // Get 1st input 'g' of BiasAddGrad.
+      Node* g = nullptr;
+      TF_CHECK_OK(m->input_node(0, &g));
+      // Now traverse all outgoing edges from g that have destination node as
+      // Conv2DBackpropFilter.
+      for (const Edge* e : g->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.conv2d_grad_filter &&
+            e->dst_input() == 2 /* 3rd input of BackpropFilter */) {
+          n = e->dst();
+          break;
+        }
+      }
+    } else {
+      CHECK_EQ(m->type_string(), csinfo_.conv2d_grad_filter);
+      // Get 3rd input 'g' of Conv2DBackpropFilter.
+      Node* g = nullptr;
+      TF_CHECK_OK(m->input_node(2, &g));
+      // Now traverse all outgoing edges from g that have destination node as
+      // BiasAddGrad.
+      for (const Edge* e : g->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.bias_add_grad &&
+            e->dst_input() == 0 /* 1st input of BiasAddGrad */) {
+          n = e->dst();
+          break;
+        }
+      }
+    }
+
+    if (n == nullptr) {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Conv2DBackpropFilter and BiasAddGrad node for merging. "
+              << "Input node: " << m->DebugString();
+    }
+    return n;
+  }
+
+  // Check if the node 'n' has any applicable rewrite rule
+  // We check for 2 scenarios for rewrite.
+  //
+  // @return RewriteInfo* for the applicable rewrite rule
+  const RewriteInfo* CheckForNodeRewrite(const Node* n) const;
+
+  // Default rewrite rule to be used in scenario 1 for rewrite.
+  // @return - true (since we want to always rewrite)
+  static bool AlwaysRewrite(const Node* n) {
+    return true;
+  }
+
+  // Check if we are performing pooling on depth or batch. If it is, then we
+  // do not rewrite MaxPool node to Mkl version.
+  // @return - true (if it is not a depth/batch wise pooling case);
+  //           false otherwise.
+  static bool NonDepthBatchWisePoolRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    string data_format_str;
+    TensorFormat data_format;
+    std::vector<int32> ksize, strides;
+    CHECK_EQ(GetNodeAttr(n->def(), "ksize", &ksize).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "strides", &strides).ok(), true);
+    CHECK_EQ(GetNodeAttr(n->def(), "data_format", &data_format_str).ok(),
+             true);
+    CHECK_EQ(FormatFromString(data_format_str, &data_format), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (GetTensorDim(ksize,   data_format, 'N') == 1 &&
+        GetTensorDim(strides, data_format, 'N') == 1 &&
+        GetTensorDim(ksize,   data_format, 'C') == 1 &&
+        GetTensorDim(strides, data_format, 'C') == 1) {
+      return true;
+    }
+
+    return false;
+  }
+
+  static bool AddNRewrite(const Node* n) {
+    CHECK_NOTNULL(n);
+
+    int num;
+    CHECK_EQ(GetNodeAttr(n->def(), "N", &num).ok(), true);
+
+    // Condition that specifies non-batch-wise and non-depth-wise pooling.
+    if (num == 2) {
+      return true;
+    }
+
+    return false;
+  }
+
+  // Rewrites input node to a new node specified by its matching rewrite info.
+  //
+  // Method first searches matching rewrite info for input node and then
+  // uses that info to rewrite.
+  //
+  // Input node may be deleted in case of rewrite. Attempt to use the node
+  // after the call can result in undefined behaviors.
+  //
+  // @input  g - input graph, n - Node to be rewritten,
+  //         ri - matching rewriteinfo
+  // @return Status::OK(), if the input node is rewritten;
+  //         Returns appropriate Status error code otherwise.
+  //         Graph is updated in case the input node is rewritten.
+  //         Otherwise, it is not updated.
+  Status RewriteNode(std::unique_ptr<Graph>* g, Node* n, const RewriteInfo* ri);
+
+  // Get nodes that will feed a list of TF tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of TF tensors
+  // @output output_nodes - the list of new nodes creating TF tensors
+  //
+  // @return None
+  void GetNodesProducingTFTensorList(
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+      int* input_idx, int list_length,
+      std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get nodes that will feed a list of Mkl tensors to the new
+  // node that we are constructing.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting
+  // @input inputs - inputs to old node that we are using for constructing
+  //                 new inputs,
+  // @input input_idx - the index in the 'inputs' vector pointing to the
+  //                    current input that we have processed so far
+  // @output input_idx - index will be incremented by the number of nodes
+  //                     from 'inputs' that are processed
+  // @input list_length - The expected length of list of Mkl tensors
+  // @output output_nodes - the list of new nodes creating Mkl tensors
+  //
+  // @return None
+  void GetNodesProducingMklTensorList(std::unique_ptr<Graph>* g,
+    Node* orig_node, const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes);
+
+  // Get a node that will feed an Mkl tensor to the new
+  // node that we are constructing. The output node could be (1) 'n'
+  // if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+  // if 'n' is not an Mkl layer.
+  //
+  // @input g - input graph,
+  // @input orig_node - Original node that we are rewriting,
+  // @input n - Node based on which we are creating Mkl node,
+  // @input n_output_slot - the output slot of node 'n'
+  //            which is feeding to the node that we are constructing
+  // @output mkl_node - the new node that will feed Mkl tensor
+  // @output mkl_node_output_slot - the slot number of mkl_node that
+  //                                will feed the tensor
+  // @return None
+  void GetNodeProducingMklTensor(std::unique_ptr<Graph>* g, Node* orig_node,
+    Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'old_node'. Inputs to 'nb' are
+  // set up in contiguous fashion. 'workspace_tensors' carry graph nodes
+  // producing workspace edges if 'are_workspace_tensors_available' is true.
+  // Otherwise, 'workspace_tensors' is empty vector.
+  //
+  // For details, refer to 'Ordering of inputs after rewriting' section in the
+  // documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  int SetUpContiguousInputs(
+      std::unique_ptr<Graph>* g,
+      const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+      NodeBuilder* nb, Node* old_node,
+      std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+      bool are_workspace_tensors_available);
+
+  // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb'
+  // in graph 'g'. Original node is input in 'orig_node'.
+  //
+  // For details, refer to 'Ordering of Tensorflow tensors and Mkl tensors'
+  // section in the documentation above.
+  //
+  // Returns Status::OK() if setting up inputs is successful, otherwise
+  // returns appropriate status code.
+  Status SetUpInputs(std::unique_ptr<Graph>* g,
+                     const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+                     NodeBuilder* nb, Node* orig_node);
+
+  // Add workspace edge on the input or output side of Node 'orig_node' by using
+  // NodeBuilder 'nb' for the new node provided. If 'orig_node' does not dictate
+  // adding workspace edge then do not add it. Workspace Tensorflow and Mkl
+  // tensors, if they need to be added, will be set into these tensors.
+  // If we set workspace tensors, then are_ws_tensors_added should be true.
+  void AddWorkSpaceEdgeIfNeeded(std::unique_ptr<Graph>* g, Node* orig_node,
+                                NodeBuilder* nb,
+                                std::vector<NodeBuilder::NodeOut>* ws_tensors,
+                                bool* are_ws_tensors_added);
+
+  // Functions specific to operators to copy attributes
+  // We need operator-specific function to copy attributes because the framework
+  // does not provide any generic function for it.
+  // NOTE: names are alphabetically sorted.
+  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsConv2D(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+
+  // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
+  // using node for original node 'orig_node' and return it in '*out'.
+  // TODO(nhasabni) We should move this to mkl_util.h
+  void GetDummyMklTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                             Node* orig_node);
+  void GetDummyWorkspaceTensorNode(std::unique_ptr<Graph>* g, Node** out,
+                                   Node* orig_node);
+};
+
+MklLayoutRewritePass::ConstStringsInfo MklLayoutRewritePass::csinfo_;
+
+// We register Mkl rewrite pass for phase 1 in post partitioning group.
+// We register it here so that we get a complete picture of all users of Mkl
+// nodes. Do not change the ordering of the Mkl passes.
+const OptimizationPassRegistry::Grouping kMklLayoutRewritePassGroup =
+    OptimizationPassRegistry::POST_PARTITIONING;
+REGISTER_OPTIMIZATION(kMklLayoutRewritePassGroup, 1, MklLayoutRewritePass);
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for creating new node
+//////////////////////////////////////////////////////////////////////////
+
+static void FillInputs(const Node* n,
+                       gtl::InlinedVector<Node*, 4>* control_edges,
+                       gtl::InlinedVector<std::pair<Node*, int>, 4>* in) {
+  control_edges->clear();
+  for (const Edge* e : n->in_edges()) {
+    if (e->IsControlEdge()) {
+      control_edges->push_back(e->src());
+    } else {
+      (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output());
+    }
+  }
+  std::sort(control_edges->begin(), control_edges->end());
+  if (n->op_def().is_commutative()) {
+    // For commutative inputs, we sort the input by the input Node*
+    // to get a canonical ordering (so that add(a,b) and add(b, a) will
+    // hash to the same value if is_commutative is true for 'add').
+    std::sort(in->begin(), in->end());
+  }
+}
+
+void MklLayoutRewritePass::GetNodesProducingTFTensorList(
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs, int* input_idx,
+    int list_length, std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If input node 'n' is just producing a single tensor at
+    // output slot 'slot' then we just add that single node.
+    output_nodes->push_back(NodeBuilder::NodeOut(n, slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr<Graph>* g,
+                                                 Node** out, Node* orig_node) {
+  // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent
+  // dummy Mkl tensor. 8 = 2*size_t.
+  const DataType dt = DataTypeToEnum<uint8>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           8);
+  TensorShape dummy_shape({8});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+               .Attr("value", proto)
+               .Attr("dtype", dt)
+               .Device(orig_node->def().device())  // We place this node on
+                                                   // the same device as the
+                                                   // device of the original
+                                                   // node.
+               .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::GetNodesProducingMklTensorList(
+    std::unique_ptr<Graph>* g,
+    Node* orig_node,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& inputs,
+    int* input_idx, int list_length,
+    std::vector<NodeBuilder::NodeOut>* output_nodes) {
+  CHECK_LT(*input_idx, inputs.size());
+  CHECK_GT(list_length, 0);
+  CHECK_NOTNULL(output_nodes);
+  output_nodes->reserve(list_length);
+
+  while (list_length != 0) {
+    CHECK_GT(list_length, 0);
+    CHECK_LT(*input_idx, inputs.size());
+    Node* n = inputs[*input_idx].first;
+    int slot = inputs[*input_idx].second;
+    // If 'n' is producing a single tensor, then create a single Mkl tensor
+    // node.
+    Node* mkl_node = nullptr;
+    int mkl_node_output_slot = 0;
+    GetNodeProducingMklTensor(g, orig_node, n, slot, &mkl_node,
+                              &mkl_node_output_slot);
+    output_nodes->push_back(NodeBuilder::NodeOut(mkl_node,
+                                                mkl_node_output_slot));
+    (*input_idx)++;
+    list_length--;
+  }
+}
+
+// Get an input node that will feed Mkl tensor to the new
+// node that we are constructing. An input node could be (1) 'n'
+// if it is Mkl layer, or (2) a dummy node producing dummy Mkl tensor
+// if 'n' is not an Mkl layer.
+void MklLayoutRewritePass::GetNodeProducingMklTensor(std::unique_ptr<Graph>* g,
+    Node* orig_node, Node* n,
+    int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) {
+  CHECK_NOTNULL(n);
+  CHECK_NOTNULL(mkl_node);
+  CHECK_NOTNULL(mkl_node_output_slot);
+
+  // If this is an MKL op, then it will create extra output for MKL layout.
+  DataType T;
+  if (GetNodeAttr(n->def(), "T", &T).ok() &&
+      mkl_op_registry::IsMklOp(n->type_string(), T)) {
+    // If this is an MKL op, then it will generate an edge that will receive
+    // Mkl tensor from a node.
+    // output slot number for Mkl tensor would be N+slot number of TensorFlow
+    // tensor, where N is total number of TensorFlow tensors.
+    *mkl_node = n;
+    *mkl_node_output_slot =
+        GetTensorMetaDataIndex(n_output_slot, n->num_outputs());
+  } else {
+    // If we have not visited the node and rewritten it, then we need
+    // to create a dummy node that will feed a dummy Mkl tensor to this node.
+    // DummyMklTensor node has no input and generates only 1 output
+    // (dummy Mkl tensor) as output slot number 0.
+    GetDummyMklTensorNode(g, mkl_node, orig_node);
+    CHECK_NOTNULL(*mkl_node);
+    *mkl_node_output_slot = 0;
+  }
+}
+
+int MklLayoutRewritePass::SetUpContiguousInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node,
+    std::vector<NodeBuilder::NodeOut>* workspace_tensors,
+    bool are_workspace_tensors_available) {
+  CHECK_NOTNULL(workspace_tensors);
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+
+  // TODO(nhasabni): Temporary solution to connect filter input of
+  // BackpropInput with the converted filter from Conv2D.
+  bool do_connect_conv2d_backprop_input_filter = false;
+  Node* conv2d_node = nullptr;
+  // Filter node is 2nd input (slot index 1) of Conv2D.
+  int kConv2DFilterInputSlotIdx = 1;
+  int kConv2DBackpropInputFilterInputSlotIdx = 1;
+  int kConv2DFilterOutputSlotIdx = 1;
+  if (old_node->type_string() == csinfo_.conv2d_grad_input) {
+    // We need to find Conv2D node from Conv2DBackpropInput.
+    // For that let's first find filter node that is 2nd input (slot 1)
+    // of BackpropInput.
+    Node* filter_node = nullptr;
+    old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node);
+    CHECK_NOTNULL(filter_node);
+
+    // Now check which nodes receive from filter_node. Filter feeds as
+    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
+    for (const Edge* e : filter_node->out_edges()) {
+      if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
+          e->dst_input() == kConv2DFilterInputSlotIdx
+          /* filter is 2nd input of Conv2D and _MklConv2D. */) {
+        if (conv2d_node != nullptr) {
+          VLOG(1) << "MklLayoutRewritePass: unusual case of same filter"
+                  << " feeding multiple Conv2D nodes: "
+                  << filter_node->DebugString();
+          // We will not connect filter input of Conv2DBackpropInput
+          // to be safe here.
+          do_connect_conv2d_backprop_input_filter = false;
+          break;
+        } else {
+          conv2d_node = e->dst();
+          do_connect_conv2d_backprop_input_filter = true;
+        }
+      }
+    }
+  }
+
+  // Number of input slots to original op
+  // Input slots are represented by .Input() calls in REGISTER_OP.
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  // Actual number of inputs can be greater than or equal to number
+  // of Input slots because inputs of type list could be unfolded.
+  CHECK_GE(old_node_inputs.size(), old_node_input_slots);
+  int nn_slot_idx = 0;  // slot index for inputs of new node
+
+  // Let's copy all inputs (TF tensors) of original node to new node.
+  int iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingTFTensorList(old_node_inputs, &iidx, N,
+                                    &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      // Special case for connecting filter input of Conv2DBackpropInput
+      if (do_connect_conv2d_backprop_input_filter &&
+          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+        nb->Input(conv2d_node, kConv2DFilterOutputSlotIdx);
+      } else {
+        nb->Input(old_node_inputs[iidx].first, old_node_inputs[iidx].second);
+      }
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Tensorflow tensor for
+  // workspace here because Tensorflow tensor for workspace is the
+  // last tensor in the list of Tensorflow tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Tensorflow tensor
+    nb->Input((*workspace_tensors)[0].node, (*workspace_tensors)[0].index);
+    nn_slot_idx++;
+  }
+
+  // Let's now setup all Mkl inputs to a new node.
+  // Number of Mkl inputs must be same as number of TF inputs.
+  iidx = 0;
+  for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) {
+    // An input slot could be a single tensor or a list. We need
+    // to handle this case accordingly.
+    CHECK_LT(iidx, old_node_inputs.size());
+    const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx);
+    if (ArgIsList(arg)) {
+      std::vector<NodeBuilder::NodeOut> new_node_inputs;
+      int N = GetTensorListLength(arg, old_node);
+      GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx,
+                                     N, &new_node_inputs);
+      nb->Input(new_node_inputs);
+      nn_slot_idx++;
+    } else {
+      Node* mkl_node = nullptr;
+      int mkl_node_output_slot = 0;
+      // Special case for connecting filter input of Conv2DBackpropInput
+      if (do_connect_conv2d_backprop_input_filter &&
+          iidx == kConv2DBackpropInputFilterInputSlotIdx) {
+        GetNodeProducingMklTensor(g, old_node, conv2d_node,
+                                  kConv2DFilterOutputSlotIdx, &mkl_node,
+                                  &mkl_node_output_slot);
+      } else {
+        GetNodeProducingMklTensor(g, old_node, old_node_inputs[iidx].first,
+                                  old_node_inputs[iidx].second, &mkl_node,
+                                  &mkl_node_output_slot);
+      }
+      nb->Input(mkl_node, mkl_node_output_slot);
+      iidx++;
+      nn_slot_idx++;
+    }
+  }
+
+  // If workspace tensors are available for this op and we are using
+  // contiguous ordering then we need to add Mkl tensor for
+  // workspace here because Mkl tensor for workspace is the
+  // last tensor in the list of Mkl tensors.
+  if (are_workspace_tensors_available) {
+    CHECK_EQ(workspace_tensors->size(), 2);
+    // Mkl tensor
+    nb->Input((*workspace_tensors)[1].node, (*workspace_tensors)[1].index);
+    nn_slot_idx++;
+  }
+
+  return nn_slot_idx;
+}
+
+Status MklLayoutRewritePass::SetUpInputs(
+    std::unique_ptr<Graph>* g,
+    const gtl::InlinedVector<std::pair<Node*, int>, 4>& old_node_inputs,
+    NodeBuilder* nb, Node* old_node) {
+  // Let's check if we need to add workspace tensors for this node.
+  // We add workspace edge only for MaxPool, LRN and BatchNorm.
+  std::vector<NodeBuilder::NodeOut> workspace_tensors;
+  bool are_workspace_tensors_available = false;
+  AddWorkSpaceEdgeIfNeeded(g, old_node, nb, &workspace_tensors,
+                           &are_workspace_tensors_available);
+
+  int new_node_input_slots = 0;
+  if (kTensorOrdering == MklTfTensorOrdering::TENSORS_INTERLEAVED) {
+    // TODO(nhasabni): implement this function just for same of completion.
+    // We do not use interleaved ordering right now.
+    return Status(
+        error::Code::UNIMPLEMENTED,
+        "Interleaved ordering of tensors is currently not supported.");
+  } else {
+    CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+    new_node_input_slots = SetUpContiguousInputs(
+        g, old_node_inputs, nb, old_node, &workspace_tensors,
+        are_workspace_tensors_available);
+  }
+
+  // Sanity check
+  int old_node_input_slots = old_node->op_def().input_arg_size();
+  if (!are_workspace_tensors_available) {
+    // If we are not adding workspace tensors for this op, then the total
+    // number of input slots to the new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors.
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2);
+  } else {
+    // If we are adding workspace tensors for this op, then the total
+    // The total number of input slots to new node _must_ be 2 times the number
+    // of input slots to the original node: N original Tensorflow tensors and
+    // N for Mkl tensors corresponding to each Tensorflow tensors plus 2
+    // (for workspace Tensorflow tensor and workspace Mkl tensor).
+    CHECK_EQ(new_node_input_slots, old_node_input_slots * 2 + 2);
+  }
+
+  return Status::OK();
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions related to workspace pass
+//////////////////////////////////////////////////////////////////////////
+
+// TODO(nhasabni) We should move this to mkl_util.h.
+void MklLayoutRewritePass::GetDummyWorkspaceTensorNode(
+    std::unique_ptr<Graph>* g, Node** out, Node* orig_node) {
+  // We use a tensor of shape {1} and value 0 to represent
+  // dummy float tensor. We need this as a dummy workspace tensor.
+  // Workspace tensor has type float.
+  const DataType dt = DataTypeToEnum<float>::v();
+  TensorProto proto;
+  proto.set_dtype(dt);
+  float zero[1] = {0};
+  proto.set_tensor_content(const_cast<const void*>(static_cast<void*>(&zero)),
+                           4);
+  TensorShape dummy_shape({1});
+  dummy_shape.AsProto(proto.mutable_tensor_shape());
+  TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const")
+                .Attr("value", proto)
+                .Attr("dtype", dt)
+                .Device(orig_node->def().device())  // We place this node on
+                                                    // same the device as the
+                                                    // device of the original
+                                                    // node.
+                .Finalize(&**g, out));
+
+  // If number of inputs to the original node is > 0, then we add
+  // control dependency between 1st input (index 0) of the original node and
+  // the dummy Mkl node. This is needed because control-flow ops such as Enter,
+  // Merge, etc, require frame_name of the dummy Mkl node to be same as the
+  // rewritten node. Adding control edge between 1st input of the original node
+  // and the dummy Mkl node ensures that the dummy node is in the same frame
+  // as the original node. Choosing 1st input is not necessary - any input of
+  // the original node is fine because all the inputs of a node are always in
+  // the same frame.
+  if (orig_node->num_inputs() > 0) {
+    Node* orig_input0 = nullptr;
+    TF_CHECK_OK(orig_node->input_node(0,
+                                      const_cast<const Node**>(&orig_input0)));
+    CHECK_NOTNULL((*g)->AddControlEdge(orig_input0, *out));
+  }
+
+  (*out)->set_assigned_device_name(orig_node->assigned_device_name());
+}
+
+void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
+    std::unique_ptr<Graph>* g, Node* orig_node, NodeBuilder* nb,
+    std::vector<NodeBuilder::NodeOut>* ws_tensors, bool* are_ws_tensors_added) {
+  bool workspace_edge_added = false;  // Default initializer
+  CHECK_NOTNULL(are_ws_tensors_added);
+  *are_ws_tensors_added = false;  // Default initializer
+
+  DataType T;
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  for (auto ws : wsinfo_) {
+    if (orig_node->type_string() == ws.fwd_op &&
+        mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+          orig_node->type_string()), T)) {
+      // If this op is a fwd op, then we need to check if there is an
+      // edge from this node's fwd_slot to bwdop's bwd_slot. If there is
+      // an edge, then we just add an attribute on this node for setting
+      // workspace_passed to true. We don't add actual workspace edge
+      // in this node. Actual workspace edge gets added in the backward
+      // op for this node.
+      for (const Edge* e : orig_node->out_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            e->dst()->type_string() == ws.bwd_op &&
+            e->dst_input() == ws.bwd_slot) {
+          nb->Attr("workspace_enabled", true);
+          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+                  << orig_node->type_string();
+          workspace_edge_added = true;
+          // We found the edge that we were looking for, so break.
+          break;
+        }
+      }
+
+      if (!workspace_edge_added) {
+        // If we are here, then we did not find backward operator for this
+        // node.
+        nb->Attr("workspace_enabled", false);
+      }
+    } else if (orig_node->type_string() == ws.bwd_op &&
+               mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+                                          orig_node->type_string()), T)) {
+      // If this op is a bwd op, then we need to add workspace edge and
+      // it's Mkl tensor edge between its corresponding fwd op and this
+      // op. Corresponding fwd op is specified in 'fwd_op' field of
+      // workspace info. fwd_slot and bwd_slot in workspace info specify
+      // an edge between which slots connect forward and backward op.
+      // Once all these criteria match, we add a workspace edge between
+      // ws_fwd_slot and ws_bwd_slot. Its corresponding Mkl tensor is
+      // determined by interleaved/contiguous ordering. Function
+      // DataIndexToMetaDataIndex tells us the location of Mkl tensor
+      // from the location of the Tensorflow tensor.
+      for (const Edge* e : orig_node->in_edges()) {
+        if (e->src_output() == ws.fwd_slot &&
+            // We would have rewritten the forward op, so we need to use
+            // GetMklOpName call to get its Mkl name.
+            e->src()->type_string() == mkl_op_registry::GetMklOpName(
+                                                          ws.fwd_op) &&
+            e->dst_input() == ws.bwd_slot) {
+          nb->Attr("workspace_enabled", true);
+          CHECK_NOTNULL(ws_tensors);
+          // Add workspace edge between fwd op and bwd op.
+          ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot));
+          // Add Mkl tensor edge for workspace edge between fwd op and bwd op.
+          ws_tensors->push_back(NodeBuilder::NodeOut(
+              e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot,
+                                                 e->src()->num_outputs())));
+          *are_ws_tensors_added = true;
+          // In terms of input ordering, we add these calls to add Input
+          // here because workspace edge (and its Mkl tensor) is the last
+          // edge in the fwdop and bwdop. So all inputs before workspace
+          // tensor have been added by SetUpInputs function.
+          VLOG(1) << "MklLayoutRewritePass: workspace_enabled for "
+                  << orig_node->type_string();
+          workspace_edge_added = true;
+          // We found the edge that we were looking for, so break.
+          break;
+        }
+      }
+
+      // If we are here means we did not find fwd op that feeds to this
+      // bwd op. So in this case, we need to generate dummy tensors for
+      // workspace input and Mkl tensor for workspace, and set
+      // workspace_enabled to false.
+      if (!workspace_edge_added) {
+        nb->Attr("workspace_enabled", false);
+        Node* dmt_ws = nullptr;      // Dummy tensor for workspace
+        Node* dmt_mkl_ws = nullptr;  // Dummy Mkl tensor for workspace
+        GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node);
+        GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node);
+        CHECK_NOTNULL(dmt_ws);
+        CHECK_NOTNULL(dmt_mkl_ws);
+        CHECK_NOTNULL(ws_tensors);
+        // We add dummy tensor as workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0));
+        // We add dummy tensor as Mkl tensor for workspace tensor.
+        ws_tensors->push_back(NodeBuilder::NodeOut(dmt_mkl_ws, 0));
+        *are_ws_tensors_added = true;
+        VLOG(1) << "MklLayoutRewritePass: dummy workspace_enabled for "
+                << orig_node->type_string();
+      }
+    } else {
+      // If this node does not match any workspace info, then we do not
+      // do anything special for workspace propagation for it.
+    }
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Op-specific functions to copy attributes from old node to new node
+//////////////////////////////////////////////////////////////////////////
+
+void MklLayoutRewritePass::CopyAttrsConv2D(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  bool use_cudnn_on_gpu;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+}
+
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
+                                         NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
+                                                NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  std::vector<int32> strides;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
+                                        NodeBuilder* nb) {
+  DataType T;
+  int depth_radius;
+  float bias;
+  float alpha;
+  float beta;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("depth_radius", depth_radius);
+  nb->Attr("bias", bias);
+  nb->Attr("alpha", alpha);
+  nb->Attr("beta", beta);
+}
+
+void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
+                                            NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> ksize, strides;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("ksize", ksize);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+}
+
+void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  DataType Tshape;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("Tshape", Tshape);
+}
+
+void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
+                                          NodeBuilder* nb) {
+  DataType T;
+  string data_format;
+  int num_split;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_split", num_split);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
+                                           NodeBuilder* nb) {
+  DataType T;
+  int N;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+}
+
+void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
+                                             NodeBuilder* nb) {
+  DataType T;
+  int N;
+  DataType tidx;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("N", N);
+  nb->Attr("Tidx", tidx);
+}
+
+void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
+                                                   NodeBuilder* nb) {
+  DataType T;
+  float epsilon;
+  string data_format;
+  bool is_training;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("data_format", data_format);
+  nb->Attr("is_training", is_training);
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions related to node merge pass
+//////////////////////////////////////////////////////////////////////////
+
+Node* MklLayoutRewritePass::CheckForNodeMerge(const Node* a) const {
+  // TODO(nhasabni) Add check for type of node similar to CheckForNodeRewrite
+  // once we support BiasAddGrad as Mkl layer.
+
+  // Search for all matching mergeinfo.
+  // We allow more than one match for extensibility.
+  std::vector<const MergeInfo*> matching_mi;
+  for (auto mi = minfo_.cbegin(); mi != minfo_.cend(); ++mi) {
+    if (a->type_string() == mi->op1 || a->type_string() == mi->op2) {
+      matching_mi.push_back(&*mi);
+    }
+  }
+
+  for (const MergeInfo* mi : matching_mi) {
+    // Get the operand with which 'a' can be merged.
+    Node* b = nullptr;
+    if ((b = mi->get_node_to_be_merged(a)) == nullptr) {
+      continue;
+    }
+
+    // Get the control edges and input of node
+    const int N_in = a->num_inputs();
+    gtl::InlinedVector<Node*, 4> a_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> a_in(N_in);
+    FillInputs(a, &a_control_edges, &a_in);
+
+    const int B_in = b->num_inputs();
+    gtl::InlinedVector<Node*, 4> b_control_edges;
+    gtl::InlinedVector<std::pair<Node*, int>, 4> b_in(B_in);
+    FillInputs(b, &b_control_edges, &b_in);
+
+    // Shouldn't merge if a and b have different control edges.
+    if (a_control_edges != b_control_edges) {
+      continue;
+    } else {
+      // We found a match.
+      return b;
+    }
+  }
+
+  return nullptr;
+}
+
+Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
+                                                    Node* m, Node* n) {
+  CHECK_EQ(((m->type_string() == csinfo_.bias_add &&
+             n->type_string() == csinfo_.conv2d)) ||
+           ((n->type_string() == csinfo_.bias_add &&
+             m->type_string() == csinfo_.conv2d)), true);
+
+  // If 'm' is BiasAdd, then 'n' is Conv2D. Since Conv2D feeds BiasAdd,
+  // BiasAdd is successor node, and Conv2D predecessor node.
+  Node* pred = m->type_string() == csinfo_.bias_add ? n : m;
+  Node* succ = m->type_string() == csinfo_.bias_add ? m : n;
+
+  // 1. Get all attributes from input nodes.
+  DataType T_pred, T_succ;
+  string padding;
+  std::vector<int32> strides;
+  string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gnu;
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+  TF_CHECK_OK(
+      GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // We check to ensure that data formats of both succ and pred are same.
+  // We expect them to be same, so we can enforce this as assert.
+  // But assert can be too strict, so we enforce this as a check.
+  // If the check fails, then we do not merge two nodes.
+  // We also do same check for devices.
+  if (data_format_pred != data_format_succ || T_pred != T_succ ||
+      pred->assigned_device_name() != succ->assigned_device_name() ||
+      pred->def().device() != succ->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "data_format or T attribute or devices of Conv2D and "
+                  "BiasAdd do not match. Will skip node merge optimization");
+  }
+
+  const int succ_num = succ->num_inputs();
+  gtl::InlinedVector<Node*, 4> succ_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+  FillInputs(succ, &succ_control_edges, &succ_in);
+
+  const int pred_num = pred->num_inputs();
+  gtl::InlinedVector<Node*, 4> pred_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+  FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Conv2D only feeds to BiasAdd (some other operator is
+  // not expecting output of Conv2D). If this is not the case, then we cannot
+  // merge Conv2D with BiasAdd.
+  const int kFirstOutputSlot = 0;
+  for (const Edge* e : pred->out_edges()) {
+    if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+      return Status(error::Code::INVALID_ARGUMENT,
+                    "Conv2D does not feed to BiasAdd, or "
+                    "it feeds BiasAdd but has multiple outputs. "
+                    "Will skip node merge optimization");
+    }
+  }
+
+  // 2. Get inputs from both the nodes.
+  // Find the 2 inputs from the conv and the bias from the add Bias.
+  // Get operand 0, 1 of conv2D.
+  CHECK_EQ(pred->in_edges().size(), 2);  // Conv2D must have 2 inputs.
+  // Get operand 1 of add_bias
+  // BiasAdd must have 2 inputs: Conv, bias
+  CHECK_EQ(succ->in_edges().size(), 2);
+
+  // We will use the node name of BiasAdd as the name of new node
+  // Build new node. We use same name as original node, but change the op
+  // name.
+  NodeBuilder nb(succ->name(), csinfo_.conv2d_with_bias);
+  nb.Input(pred_in[0].first, pred_in[0].second);  // In1 of Conv2D
+  // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 of Conv2D
+  // In1 of BiasAdd is same as output of Conv2D.
+  nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
+
+  // Copy attributes from Conv2D to Conv2DWithBias.
+  CopyAttrsConv2D(const_cast<const Node*>(pred), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(succ->def().device());
+
+  // Create node.
+  Node* new_node;
+  nb.Finalize(&**g, &new_node);
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+  // node are already copied in BuildNode. We handle control edges now.
+  for (const Edge* e : pred->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+  for (const Edge* e : succ->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'pred' node.
+  for (const Edge* e : pred->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'succ' node.
+  for (const Edge* e : succ->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      // BiasAdd has only 1 output (at slot 0) and merged node also has only 1
+      // output (at slot 0).
+      const int kConv2DWithBiasOutputSlot = 0;
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot,
+                                    e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use pred or succ as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(pred->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+          << ", and node: " << succ->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(succ);
+  (*g)->RemoveNode(pred);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
+    std::unique_ptr<Graph>* g, Node* m, Node* n) {
+  CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
+             n->type_string() == csinfo_.conv2d_grad_filter)) ||
+           ((n->type_string() == csinfo_.bias_add_grad &&
+             m->type_string() == csinfo_.conv2d_grad_filter)), true);
+
+  // If 'm' is BiasAddGrad, then 'n' is BackpropFilter.
+  Node* badd = m->type_string() == csinfo_.bias_add_grad ? m : n;
+  Node* fltr = m->type_string() == csinfo_.bias_add_grad ? n : m;
+
+  // Sanity check for attributes from input nodes.
+  DataType T_b, T_f;
+  string data_format_b, data_format_f;
+  TF_CHECK_OK(GetNodeAttr(badd->def(), "T", &T_b));
+  TF_CHECK_OK(GetNodeAttr(fltr->def(), "T", &T_f));
+  TF_CHECK_OK(GetNodeAttr(badd->def(), "data_format", &data_format_b));
+  TF_CHECK_OK(GetNodeAttr(fltr->def(), "data_format", &data_format_f));
+  if (data_format_b != data_format_f || T_b != T_f ||
+      badd->assigned_device_name() != fltr->assigned_device_name() ||
+      badd->def().device() != fltr->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "data_format or T attribute or devices of "
+                  "Conv2DBackpropFilter and BiasAddGrad do not match. "
+                  "Will skip node merge optimization");
+  }
+
+  // We will use the node name of Conv2DBackpropFilter as the name of new node.
+  // This is because BackpropFilterWithBias is going to emit bias output also.
+  NodeBuilder nb(fltr->name(), csinfo_.conv2d_grad_filter_with_bias);
+  // Since Conv2DBackpropFilterWithBias has same number of inputs as
+  // Conv2DBackpropFilter, we can just copy input edges directly. We dont need
+  // to copy any data input of BiasAddGrad because that input also goes to
+  // Conv2DBackpropFilter.
+  const int fltr_ins = fltr->num_inputs();
+  gtl::InlinedVector<Node*, 4> fltr_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> fltr_in_edges(fltr_ins);
+  FillInputs(fltr, &fltr_control_edges, &fltr_in_edges);
+  for (int idx = 0; idx < fltr_ins; idx++) {
+    nb.Input(fltr_in_edges[idx].first, fltr_in_edges[idx].second);
+  }
+
+  // Copy attributes from Conv2DBackpropFilter.
+  CopyAttrsConv2D(const_cast<const Node*>(fltr), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(fltr->def().device());
+
+  // Create node.
+  Node* new_node;
+  nb.Finalize(&**g, &new_node);
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from BiasAddGrad node and Conv2DBackpropFilter node to
+  // new 'new_node' node are already copied in BuildNode. We handle control
+  // edges now.
+  for (const Edge* e : badd->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+  for (const Edge* e : fltr->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'badd' node.
+  // Conv2DBackpropFilter has 1 output -- filter_grad.
+  // Conv2DBackpropFilterWithBias has 2 outputs -- filter_grad and
+  // bias_grad. But filter_grad is at same slot number (0) in both the
+  // nodes. bias_grad is at slot number 1 in Conv2DBackpropFilterWithBias, while
+  // it is at slot number 0 in BiasAddGrad.
+  const int kMergedNodeFilterGradOutputIdx = 0;
+  const int kMergedNodeBiasGradOutputIdx = 1;
+
+  for (const Edge* e : badd->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx,
+                                  e->dst(), e->dst_input()));
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'fltr' node.
+  for (const Edge* e : fltr->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx,
+                                  e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use badd or fltr as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(badd->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << badd->DebugString()
+          << ", and node: " << fltr->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(badd);
+  (*g)->RemoveNode(fltr);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
+                                       Node* n) {
+  CHECK_NOTNULL(m);
+  CHECK_NOTNULL(n);
+
+  if (((m->type_string() == csinfo_.bias_add &&
+        n->type_string() == csinfo_.conv2d)) ||
+      ((n->type_string() == csinfo_.bias_add &&
+        m->type_string() == csinfo_.conv2d))) {
+    return this->MergeConv2DWithBiasAdd(g, m, n);
+  }
+
+  if (((m->type_string() == csinfo_.bias_add_grad &&
+        n->type_string() == csinfo_.conv2d_grad_filter)) ||
+      ((n->type_string() == csinfo_.bias_add_grad &&
+        m->type_string() == csinfo_.conv2d_grad_filter))) {
+    return this->MergeConv2DBackpropFilterWithBiasAddGrad(g, m, n);
+  }
+
+  return Status(error::Code::UNIMPLEMENTED,
+                "Unimplemented case for node merge optimization.");
+}
+
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for node rewrite
+//////////////////////////////////////////////////////////////////////////
+
+Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
+                                         Node* orig_node,
+                                         const RewriteInfo* ri) {
+  CHECK_NOTNULL(ri);
+  CHECK_NOTNULL(orig_node);
+
+  VLOG(1) << "MklLayoutRewritePass: Original node:" << orig_node->DebugString();
+
+  // Get all inputs.
+  int num_inputs = orig_node->in_edges().size();
+
+  // Drop count for control edges from inputs
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      num_inputs--;
+    }
+  }
+
+  gtl::InlinedVector<Node*, 4> control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> inputs(num_inputs);
+  FillInputs(orig_node, &control_edges, &inputs);
+
+  // Build new node. We use same name as original node, but change the op name.
+  NodeBuilder nb(orig_node->name().c_str(), ri->new_name.c_str());
+  // Copy user-specified device assigned to original node to new node.
+  nb.Device(orig_node->def().device());
+  // Set up new inputs to the rewritten node.
+  Status s = SetUpInputs(g, inputs, &nb, orig_node);
+  if (s != Status::OK()) {
+    return s;
+  }
+
+  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
+  // Set the Mkl layer label for this op.
+  nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
+
+  // Finalize graph and get new node.
+  Node* new_node = nullptr;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  CHECK_NOTNULL(new_node);
+
+  // Incoming data edges from 'orig_node' node to new 'new_node' node are
+  // already copied in BuildNode. We need to handle control edges now.
+  for (const Edge* e : orig_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(e->src(), new_node));
+    }
+  }
+
+  // Copy outgoing edges from 'orig_node' node to new
+  // 'new_node' node, since the output also follows same ordering among
+  // Tensorflow tensors and Mkl tensors. We need to connect Tensorflow
+  // tensors appropriately. Specifically, nth output of the original node
+  // will become 2*nth output of the Mkl node for the interleaved ordering
+  // of the tensors. For the contiguous ordering of the tensors, it will be n.
+  // GetTensorDataIndex provides this mapping function.
+  for (const Edge* e : orig_node->out_edges()) {
+    if (e->IsControlEdge()) {
+      CHECK_NOTNULL((*g)->AddControlEdge(new_node, e->dst()));
+    } else {
+      CHECK_NOTNULL((*g)->AddEdge(new_node, GetTensorDataIndex(e->src_output(),
+                            e->src()->num_outputs()),
+                    e->dst(), e->dst_input()));
+    }
+  }
+
+  // Copy the runtime device assigned from original code to new node.
+  new_node->set_assigned_device_name(orig_node->assigned_device_name());
+
+  // Delete original node and mark new node as rewritten.
+  (*g)->RemoveNode(orig_node);
+
+  VLOG(1) << "MklLayoutRewritePass: New node:" << new_node->DebugString();
+  return Status::OK();
+}
+
+const MklLayoutRewritePass::RewriteInfo*
+MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
+  CHECK_NOTNULL(n);
+
+  // First check if node along with its type is supported by MKL layer.
+  // We do not want to rewrite an op into Mkl op if types are not supported.
+  // E.g., MklRelu does not support INT32. So we cannot rewrite Relu to
+  // MklRelu if type is INT32.
+  DataType T;
+  if (!GetNodeAttr(n->def(), "T", &T).ok()) {
+    return nullptr;
+  }
+
+  // We make an exception for __MklDummyConv2DWithBias and
+  // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
+  // names.
+  if (n->type_string() != csinfo_.conv2d_with_bias &&
+      n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
+      !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(
+                                        n->type_string()), T)) {
+      return nullptr;
+  }
+
+  // For elementwise node, we reuse the Eigen implementation and pass the MKL
+  // metadata tensor through so we can avoid conversions. However, if all
+  // incoming edges are in TF format, we don't need all this overhead, so
+  // replace the elementwise node only if at least one of its parents is a MKL
+  // node.
+  //
+  // Identity nodes can also skip replacement if they are not being served by
+  // any MKL nodes.
+  //
+  // TODO(vrane): Add implementation for element-wise ops that doesn't reuse
+  // eigen code to reduce cross-library dependency.
+  VLOG(1) << "ELEMENTWISE: checking op: " << n->type_string();
+  if (mkl_op_registry::IsMklElementWiseOp(
+        mkl_op_registry::GetMklOpName(n->type_string()), T) ||
+      n->type_string().find("Identity") != string::npos) {
+    VLOG(1) << "ELEMENTWISE: op is elementwise: " << n->type_string();
+    bool incoming_mkl_edge = false;
+    int num_parent = 0;
+    for (auto parent : n->in_edges()) {
+      if (mkl_op_registry::IsMklOp(parent->src()->type_string(), T)) {
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is MKL op: "
+                << parent->src()->type_string();
+        incoming_mkl_edge = true;
+        break;
+      } else {
+        VLOG(1) << "ELEMENTWISE: parent " << num_parent++ << " is NON-MKL op: "
+                << parent->src()->type_string();
+      }
+    }
+    if (incoming_mkl_edge == false) {
+      VLOG(1) << "ELEMENTWISE: Skipping replacement of elementwise node which has no MKL "
+                 "parents.";
+      return nullptr;
+    } else {
+      VLOG(1) << "ELEMENTWISE: Replacing elementwise node " << n->type_string() <<
+        " which has MKL parents";
+    }
+  }
+
+  // We now check if rewrite rule applies for this op. If rewrite rule passes
+  // for this op, then we rewrite it to Mkl op.
+  // Find matching RewriteInfo and then check that rewrite rule applies.
+  for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) {
+    if (n->type_string().compare(ri->name) == 0 &&
+        ri->rewrite_rule(n)) {
+      return &*ri;
+    }
+  }
+
+  // Else return not found.
+  return nullptr;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//              Run function for the pass
+///////////////////////////////////////////////////////////////////////////////
+
+bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
+  bool result = false;
+  CHECK_NOTNULL(g);
+
+  DumpGraph("Before running MklLayoutRewritePass", &**g);
+
+  std::vector<Node*> order;
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    Node* m = nullptr;
+    if ((m = CheckForNodeMerge(n)) != nullptr && CanOpRunOnCPUDevice(m)) {
+      // Check if the node 'n' can be merged with any other node. If it can
+      // be 'm' contains the node with which it can be merged.
+      string n1_name = n->name();
+      string n2_name = m->name();
+
+      VLOG(1) << "MklLayoutRewritePass: Scheduled nodes " << n1_name << " and "
+              << n2_name << " for merging";
+
+      if (MergeNode(g, n, m) == Status::OK()) {
+        VLOG(1) << "MklLayoutRewritePass: Merged nodes " << n1_name << " and "
+                << n2_name;
+        result = true;
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
+
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    const RewriteInfo* ri = nullptr;
+    // We will first search if node is to be rewritten.
+    if ((ri = CheckForNodeRewrite(n)) != nullptr) {
+      string node_name = n->name();
+      string op_name = n->type_string();
+
+      VLOG(1) << "MklLayoutRewritePass: Scheduled node " << node_name
+              << " with op " << op_name << " for rewrite using"
+              << " layout optimization.";
+
+      if (RewriteNode(g, n, ri) == Status::OK()) {
+        VLOG(1) << "MklLayoutRewritePass: rewrote node " << node_name
+                << " with op " << op_name << " for Mkl layout optimization.";
+        result = true;
+      }
+    }
+  }
+
+  DumpGraph("After running MklLayoutRewritePass(NodeMerge+Rewrite)", &**g);
+
+  return result;
+}
+
+bool RunMklLayoutRewritePass(std::unique_ptr<Graph>* g) {
+  return MklLayoutRewritePass().RunPass(g);
+}
+
+Status MklLayoutRewritePass::Run(
+  const GraphOptimizationPassOptions& options) {
+  if (options.graph == nullptr && options.partition_graphs == nullptr) {
+    return Status::OK();
+  }
+
+  auto process_graph = [&](std::unique_ptr<Graph>* g) {
+    // Get the ownership of a graph
+    std::unique_ptr<Graph>* ng = std::move(g);
+    RunPass(ng);
+    // Return the ownership of a graph back
+    g->reset(ng->release());
+  };
+
+  if (kMklLayoutRewritePassGroup !=
+      OptimizationPassRegistry::POST_PARTITIONING) {
+    // For any pre-partitioning phase, a graph is stored in options.graph.
+    process_graph(options.graph);
+  } else {
+    // For post partitioning phase, graphs are stored in
+    // options.partition_graphs.
+    for (auto& pg : *options.partition_graphs) {
+      process_graph(&pg.second);
+    }
+  }
+
+  return Status::OK();
+}
+#endif  // INTEL_MKL_DNN
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index abc63e4f35aa9fd6f1df127741ae6d10f49024b9..75f7ca2d4d7ce7c86858a40fe34fed6aa707c9e5 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -37,6 +37,9 @@ limitations under the License.
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
+
+#ifndef INTEL_MKL_DNN
+
 namespace {
 
 const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
@@ -1881,6 +1884,1627 @@ static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
 BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
 
 }  // namespace
+
+#else  // INTEL_MKL_DNN
+
+namespace {
+
+const char kCPUDevice[] = "/job:a/replica:0/task:0/device:CPU:0";
+const char kGPUDevice[] = "/job:a/replica:0/task:0/device:GPU:0";
+
+static void InitGraph(const string& s, Graph* graph,
+                      const string& device = kCPUDevice) {
+  GraphDef graph_def;
+
+  auto parser = protobuf::TextFormat::Parser();
+  //  parser.AllowRelaxedWhitespace(true);
+  CHECK(parser.MergeFromString(s, &graph_def)) << s;
+  GraphConstructorOptions opts;
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph));
+
+  for (Node* node : graph->nodes()) {
+    node->set_assigned_device_name(device);
+  }
+}
+
+class MklLayoutPassTest : public ::testing::Test {
+ public:
+  MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+
+  void InitGraph(const string& s, const string& device = kCPUDevice) {
+    ::tensorflow::InitGraph(s, &graph_, device);
+    original_ = CanonicalGraphString(&graph_);
+  }
+
+  static bool IncludeNode(const Node* n) { return n->IsOp(); }
+
+  static string EdgeId(const Node* n, int index) {
+    if (index == 0) {
+      return n->name();
+    } else if (index == Graph::kControlSlot) {
+      return strings::StrCat(n->name(), ":control");
+    } else {
+      return strings::StrCat(n->name(), ":", index);
+    }
+  }
+
+  string CanonicalGraphString(Graph* g) {
+    std::vector<string> nodes;
+    std::vector<string> edges;
+    for (const Node* n : g->nodes()) {
+      if (IncludeNode(n)) {
+        nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")"));
+      }
+    }
+    for (const Edge* e : g->edges()) {
+      if (IncludeNode(e->src()) && IncludeNode(e->dst())) {
+        edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->",
+                                        EdgeId(e->dst(), e->dst_input())));
+      }
+    }
+    // Canonicalize
+    std::sort(nodes.begin(), nodes.end());
+    std::sort(edges.begin(), edges.end());
+    return strings::StrCat(str_util::Join(nodes, ";"), "|",
+                           str_util::Join(edges, ";"));
+  }
+
+  string DoMklLayoutOptimizationPass() {
+    string before = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "Before MKL layout rewrite pass: " << before;
+
+    std::unique_ptr<Graph>* ug = new std::unique_ptr<Graph>(&graph_);
+    RunMklLayoutRewritePass(ug);
+
+    string result = CanonicalGraphString(&graph_);
+    LOG(ERROR) << "After MKL layout rewrite pass:  " << result;
+    return result;
+  }
+
+  const string& OriginalGraph() const { return original_; }
+
+  Graph graph_;
+  string original_;
+};
+
+REGISTER_OP("Input").Output("o: float").SetIsStateful();
+REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
+REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
+REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
+REGISTER_OP("_MklInput2").Output("o: uint8")
+                        .Output("o1: uint8").SetIsStateful();
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to node merge optiimization
+/////////////////////////////////////////////////////////////////////
+
+TEST_F(MklLayoutPassTest, Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Zeta);D(Zeta)|"
+            "A->C;A->D;B->C:1;B->D:1");
+}
+
+// Test set 1: Conv2D + AddBias
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Zeta(E,Y)
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Positive) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklConv2DWithBias);Y(Input);Z(Zeta)|A->E;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+
+// Graph contains only Conv2D, no AddBias.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_NoAddBias) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);DMT/_0(Const);DMT/_1(Const)|"
+            "A->C;A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;"
+            "DMT/_0->C:2;DMT/_1->C:3");
+}
+
+// Conv2D output does not go to BiasAdd.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D', 'E'] }");  // Output of _MklConv2D does not go to BiasAdd.
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(Input);F(BiasAdd)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;D->F;DMT/_0->C:2;DMT/_1->C:3;"
+            "E->F:1");
+}
+
+// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Zeta).
+// Merge should not be done in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_Dataflow2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D', 'E'] }"  // Conv2D has two outputs.
+                              // No merge should happen.
+      "node { name: 'G' op: 'Zeta'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(Input);F(BiasAdd);G(Zeta)|A->C;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;B->C:1;C->G;"
+            "D->F;DMT/_0->C:2;DMT/_1->C:3;E->F:1;E->G:1");
+}
+
+// data_format attribute value mismatch. Merge should not be done
+// in such case.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_Negative_AttrMismatch) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHCW' } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);E(BiasAdd)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->E;D->E:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+// Test set 2: BiasAddGrad + Conv2DBackpropFilter fusion tests
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilterWithBias);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter. But nodes do not match
+// criteria for rewrite. So rewrite should not happen. 3rd input of
+// Conv2DBackpropFilter is different than input to BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['A'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion, but nodes do not match criteria for fusion.
+// Different input formats.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " input: ['A'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);"
+            "D(_MklConv2DBackpropFilter);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(BiasAddGrad)|A->D;A->E;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;C->D:2;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// BiasAddGrad fusion in the presence of BackpropFilter only. Fusion is done
+// before node rewrite. Check this ordering.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackpropFilterFusion_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: '_MklConv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['E', 'F', 'A', 'M', 'N', 'O'] }"
+      "node { name: 'H' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Zeta);F(Int32Input);G(_MklConv2DBackpropFilter);H(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput);O(_MklInput)|A->D;A->E:1;A->G:2;B->D:1;"
+            "C->D:2;D->E;E->G;E->H;F->G:1;M->D:3;M->G:3;N->D:4;N->G:4;O->D:5;"
+            "O->G:5");
+}
+
+// C=Conv2D(A,B); E=BiasAdd(C,D); Y=Zeta(E,X);
+// G=Conv2DBackpropInput(F,B,E)
+// This is a case of node rewrite followed by node merge followed by connecting
+// filter output of Conv2DWithBias to filter input of Conv2DBackpropInput.
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'Y' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'X']}"
+      "node { name: 'F' op: 'Int32Input'}"
+      "node { name: 'G' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['F', 'B', 'E']}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['G', 'X']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2DWithBias);F(Int32Input);"
+            "G(_MklConv2DBackpropInput);X(Input);Y(Zeta);Z(Zeta)|"
+            "A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:1;D->E:2;DMT/_0->E:3;"
+            "DMT/_1->E:4;DMT/_2->E:5;DMT/_3->G:3;E->G:2;E->Y;E:1->G:1;E:2->G:5;"
+            "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node to Mkl node
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op; No Mkl layer on the input and on the output.
+// We will generate dummy Mkl tensor as 2nd input of Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
+// have 2 outputs, both of which will be inputs to next Conv2D.
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(_MklConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->C;A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->C:1;C->D:1;C->E;"
+            "C:2->D:3;D->E:1;DMT/_0->C:2;DMT/_1->C:3;DMT/_2->D:2");
+}
+
+// Conv2D with INT32 which is not supported by Mkl
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
+  InitGraph(
+      "node { name: 'A' op: 'HalfInput'}"
+      "node { name: 'B' op: 'HalfInput'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_HALF } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_HALF } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(HalfInput);B(HalfInput);C(Conv2D);D(Zeta)|"
+            "A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Polygamma'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'MatMul'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'transpose_a'      value { b: false } }"
+      " attr { key: 'transpose_b'      value { b: false } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
+            "M->C:2;N->C:3");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+            "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "B->E:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
+            "G:control->DMT/_4:control;H->I:1");
+}
+
+// Concat with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
+            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+}
+
+// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(_MklConcatV2);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D:2;B->D;B:1->D:1;"
+            "B:control->DMT/_0:control;B:control->DMT/_1:control;"
+            "B:control->DMT/_2:control;C->E;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// ConcatV2 with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcatV2);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->E:1;C->F;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "D->F:1;DMT/_0->F:2;DMT/_1->F:3;DMT/_2->E:2;DMT/_3->E:3;"
+            "DMT/_4->H:5;E->H;E:2->H:3;E:control->DMT/_4:control;F->H:1;"
+            "F:2->H:4;G->H:2;H->I:1");
+}
+
+// ConcatV2 with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['E', 'F', 'G']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcatV2);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:4;DMT/_3->H:5;E->H;E:2->H:3;"
+            "E:control->DMT/_2:control;E:control->DMT/_3:control;F->H:1;"
+            "G->H:2;H->I:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Int32Input);B(Input);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+            "DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPoolAvgPoolGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'I' op: 'Int32Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'AvgPoolGrad' "
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['I', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklAvgPool);C(_MklAvgPoolGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const);I(Int32Input)|A->B;A->D;A:control->DMT/_0:control;"
+            "B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;DMT/_1->C:2;I->C;"
+            "I:control->DMT/_1:control");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNormGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNormGrad);G(Zeta)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Input);"
+            "F(_MklFusedBatchNorm);G(Zeta)|A->F;A->G;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;"
+            "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;"
+            "E->F:4;F->G:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+//  Unit tests related to rewriting node for workspace edges
+/////////////////////////////////////////////////////////////////////
+
+/* Test LRN->MaxPool->MaxPoolGrad->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, MaxPoolLRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B'] }"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['B', 'C', 'D'] }"
+      "node { name: 'F' op: 'Input'}"
+      "node { name: 'G' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['E', 'F', 'B'] }"
+      "node { name: 'H' op: 'Input'}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['H', 'G'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+      "A(Input);B(_MklLRN);C(_MklMaxPool);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklMaxPoolGrad);F(Input);G(_MklLRNGrad);H(Input);"
+      "I(Zeta)|A->B;A:control->DMT/_0:control;B->C;B->E;B->G:2;B:1->G:3;"
+      "B:2->C:1;B:2->E:4;B:2->G:6;B:3->G:7;B:control->DMT/_1:control;C->E:1;"
+      "C:1->E:3;C:2->E:5;C:3->E:7;D->E:2;DMT/_0->B:1;DMT/_1->E:6;DMT/_2->G:5;"
+      "E->G;E:1->G:4;E:control->DMT/_2:control;F->G:1;G->I:1;H->I");
+}
+
+/* Test LRN->LRNGrad replacement by workspace nodes. */
+TEST_F(MklLayoutPassTest, LRN_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklLRNGrad);F(Zeta)|"
+            "A->B;A:control->DMT/_0:control;B->E:2;B:1->E:3;B:2->E:6;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:1;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:5;E->F:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Zeta);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+/* Test LRN->LRNGrad replacement when only one of them is present. */
+TEST_F(MklLayoutPassTest, LRN_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklLRNGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+/* Test LRN->LRNGrad negative case, where single LRN feeds
+   2 LRNGrad nodes at different slots. */
+TEST_F(MklLayoutPassTest, LRN_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LRN'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'D', 'B'] }"
+      "node { name: 'F' op: 'LRNGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'        value { f: 0.001 } }"
+      " attr { key: 'beta'         value { f: 0.75 } }"
+      " attr { key: 'bias'         value { f: 1.0 } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'depth_radius' value { i: 2 } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['E', 'F'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklLRN);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);DMT/_5(Const);"
+            "DMT/_6(Const);E(_MklLRNGrad);F(_MklLRNGrad);G(Zeta)|A->B;"
+            "A:control->DMT/_0:control;B->E:2;"
+            "B->F:1;B:1->E:3;B:2->E:6;B:2->F:5;B:3->E:7;C->E;C->F;"
+            "C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "C:control->DMT/_3:control;C:control->DMT/_4:control;"
+            "C:control->DMT/_5:control;C:control->DMT/_6:control;"
+            "D->E:1;D->F:2;DMT/_0->B:1;DMT/_1->F:3;DMT/_2->F:7;DMT/_3->F:4;"
+            "DMT/_4->F:6;DMT/_5->E:4;DMT/_6->E:5;E->G;F->G:1");
+}
+
+/* Test MaxPool->MaxPoolGrad replacement by workspace+rewrite nodes. */
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['C', 'B', 'D'] }"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Input);D(Input);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(_MklMaxPoolGrad);F(Zeta)|"
+            "A->B;A:control->DMT/_0:control;B->E:1;B:1->E:3;B:2->E:5;B:3->E:7;"
+            "C->E;C->F;C:control->DMT/_1:control;C:control->DMT/_2:control;"
+            "D->E:2;DMT/_0->B:1;DMT/_1->E:4;DMT/_2->E:6;E->F:1");
+}
+
+// Test MaxPool>MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPool node but workspace edges will not
+// be present.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklMaxPool);C(Zeta);DMT/_0(Const)|"
+            "A->B;A->C;A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+// Test MaxPoolGrad replacement when only one of them is present.
+// In this case, we will rewrite MaxPoolGrad and for workspace tensor and
+// its Mkl part, we will generate dummy tensor.
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'MaxPoolGrad'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:3, i:3} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:2, i:2} } }"
+      " input: ['A', 'B', 'C'] }"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklMaxPoolGrad);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;"
+            "A:control->DMT/_4:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:7;DMT/_2->D:4;DMT/_3->D:5;DMT/_4->D:6");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative4) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative5) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:2, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NCHW)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative6) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:2, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative7) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 2, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for batch-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative8) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 2, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative9) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:2} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Test MaxPool handling for depth-wise pooling (NHWC)
+// No rewrite should take place in such case
+TEST_F(MklLayoutPassTest, NodeWorkspace_MaxPool_Negative10) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:2} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+// Single Conv2D Op on GPU device
+// No rewrite should happen
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Conv2D);D(Zeta)|A->C;B->C:1;B->D;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DBackprop_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'O' op: '_MklInput'}"
+      "node { name: 'D' op: '_MklConv2DWithBias'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C', 'M', 'N', 'O']}"
+      "node { name: 'E' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['D', 'A']}"
+      "node { name: 'F' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['E'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(_MklConv2DWithBias);"
+            "E(Zeta);F(BiasAddGrad);M(_MklInput);N(_MklInput);"
+            "O(_MklInput)|A->D;A->E:1;B->D:1;C->D:2;D->E;E->F;"
+            "M->D:3;N->D:4;O->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(Conv2DBackpropFilter);E(Zeta)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Relu);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(ReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_MaxPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'MaxPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(MaxPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'AvgPool'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NHWC' } }"
+      " attr { key: 'ksize'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'      value { s: 'VALID' } }"
+      " attr { key: 'strides'      value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(AvgPool);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(Concat);E(Zeta)|A->D;"
+            "B->D:1;B:1->D:2;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'ConcatV2'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tidx'             value { type: DT_INT32 } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['B:0', 'B:1', 'A']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Const);B(InputList);C(Input);D(ConcatV2);E(Zeta)|"
+            "A->D:2;B->D;B:1->D:1;C->E;D->E:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNorm_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: 'FusedBatchNorm'"
+      " attr { key: 'T'            value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'  value { s: 'NCHW' } }"
+      " attr { key: 'epsilon'      value { f: 0.0001 } }"
+      " attr { key: 'is_training'  value { b: true } }"
+      " input: ['A', 'B', 'C', 'D', 'E'] }"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'F'] }", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);E(Input);"
+            "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;"
+            "E->F:4;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_DeviceTest) {
+  CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}", kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Input);E(BiasAdd);"
+            "M(_MklInput);N(_MklInput);Y(Input);Z(Zeta)|A->C;"
+            "B->C:1;C->E;D->E:1;E->Z;M->C:2;N->C:3;Y->Z:1");
+}
+
+/////////////////////////////////////////////////////////////////////
+
+static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
+  testing::StopTiming();
+  string s;
+  for (int in = 0; in < 10; in++) {
+    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+  }
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  for (int op = 0; op < op_nodes; op++) {
+    s += strings::Printf(
+        "node { name: 'op%04d' op: 'Zeta' attr { key: 'T' value { "
+        "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
+        op, rnd.Uniform(10), rnd.Uniform(10));
+  }
+
+  bool first = true;
+  while (iters > 0) {
+    Graph* graph = new Graph(OpRegistry::Global());
+    InitGraph(s, graph);
+    int N = graph->num_node_ids();
+    if (first) {
+      testing::SetLabel(strings::StrCat("Per graph node.  Nodes: ", N));
+      first = false;
+    }
+    {
+      testing::StartTiming();
+      std::unique_ptr<Graph> ug(graph);
+      RunMklLayoutRewritePass(&ug);
+      testing::StopTiming();
+    }
+    iters -= N;  // Our benchmark units are individual graph nodes,
+                 // not whole graphs
+    // delete graph;
+  }
+}
+BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000);
+
+}  // namespace
+
+#endif  // INTEL_MKL_DNN
+
 }  // namespace tensorflow
 
 #endif /* INTEL_MKL */
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index fe4588389e76d657b6eb07ffb24ce8a886c2eb4d..599bb88f015bfc035b7666747571a652a954139d 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -33,8 +33,8 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/graph/mkl_tfconversion_pass.h"
+#include "tensorflow/core/graph/mkl_graph_util.h"
 
 namespace tensorflow {
 
@@ -68,7 +68,7 @@ namespace tensorflow {
 // take place before we hit the op. For this, we add a new op before each
 // element-wise MKL op to deal with the inputs, called _MklInputConversion.
 // This pass has been enhanced to add this capability.
-// 
+//
 // The _MklInputConversion op will check the inputs to the elementwise op and
 // make sure that either both are in MKL format or both are in TF format,
 // depending on their initial state and whether broadcast is needed or not.
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index b74fa2127e4a4f539e008d96970045904757030e..cb0fc8a1547a8498aa0bd089a2c9395119de2789 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -41,8 +41,8 @@ const uint32 kAllowedInputs = 2;
 const float kEMADecay = 0.999;
 
 // Node types to rewrite. Insert quantize_and_dequantize op for their inputs.
-const std::unordered_set<string, StringPiece::Hasher> nodes_to_rewrite{
-    "MatMul", "Conv2D"};
+const auto* nodes_to_rewrite =
+    new std::unordered_set<string, StringPieceHasher>{"MatMul", "Conv2D"};
 
 // Contains necessary parameters to convert an edge.
 struct EdgeToConvert {
@@ -563,7 +563,7 @@ Status ProcessTargetEdges(Graph* graph, const string& quant_op_type,
                           const std::vector<EdgeToConvert>& target_edges) {
   // Remember previously converted ops to avoid duplicated conversion on the
   // same input.
-  std::unordered_map<string, Node*, StringPiece::Hasher> name_index;
+  std::unordered_map<string, Node*, StringPieceHasher> name_index;
   std::vector<Node*> added_variables;
   for (const EdgeToConvert edge : target_edges) {
     Node* convert_node;
@@ -602,7 +602,8 @@ Status DoQuantizeTraining(int32 num_bits, const string& quant_op_type,
   int potential_input = 0;
   std::vector<EdgeToConvert> target_edges;
   for (Node* node : graph->nodes()) {
-    if (nodes_to_rewrite.find(node->type_string()) != nodes_to_rewrite.end() &&
+    if (nodes_to_rewrite->find(node->type_string()) !=
+            nodes_to_rewrite->end() &&
         !IsGradientNode(graph, node)) {
       // Find out which types are the inputs and convert them accordingly.
       // 1. Const/Variable OP: This is quantized as signed tensors with no given
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index 8ccc27914bce325469b0e73deacf6a3c44a55246..3c1f8870f57f6d585f795cc92c320927e1a29315 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -71,7 +71,7 @@ Status RewriteGraphForExecution(
     const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata);
 
-typedef std::unordered_map<StringPiece, Node*, StringPiece::Hasher> NameIndex;
+typedef std::unordered_map<StringPiece, Node*, StringPieceHasher> NameIndex;
 
 // Augment "*g" by adding special "fetch" nodes that connect to the
 // tensor outputs specified in "fetch_outputs" to retrieve the output
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 678f8da298e3c852ea1a4a3cdb53b03dec3ecb87..99f1318072220d397870794cf3d2643d64b9696e 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -21,6 +21,9 @@ cc_library(
     hdrs = ["op_types.h"],
     visibility = ["//visibility:public"],
     deps = [
+        ":utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -43,6 +46,7 @@ tf_cc_test(
     srcs = ["utils_test.cc"],
     deps = [
         ":utils",
+        "//tensorflow/cc:cc_ops",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -66,6 +70,31 @@ tf_cuda_library(
     ],
 )
 
+cc_library(
+    name = "graph_view",
+    srcs = ["graph_view.cc"],
+    hdrs = ["graph_view.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "graph_view_test",
+    srcs = ["graph_view_test.cc"],
+    deps = [
+        ":graph_view",
+        ":grappler_item",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
+
 cc_library(
     name = "grappler_item",
     srcs = [
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index ead44de1e2fa960808412f4e8d55dbe38d5b5242..01a618ed7775eee64ce40e283394c09622353157 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -35,6 +35,10 @@ void Cluster::SetNumWarmupSteps(int num_steps) {
       num_steps);
 }
 
+int Cluster::NumWarmupSteps() const {
+  return options_.config.graph_options().build_cost_model_after();
+}
+
 void Cluster::DisableDetailedStats(bool disable) {
   if (disable) {
     options_.config.mutable_graph_options()->set_build_cost_model(0);
@@ -57,7 +61,7 @@ void Cluster::DisableOptimizer(bool disable) {
     // Disable Grappler optimizations.
     auto rewriter_config =
         options_.config.mutable_graph_options()->mutable_rewrite_options();
-    rewriter_config->set_optimize_tensor_layout(false);
+    rewriter_config->set_layout_optimizer(RewriterConfig::OFF);
     rewriter_config->set_disable_model_pruning(true);
     rewriter_config->set_constant_folding(RewriterConfig::OFF);
     rewriter_config->set_memory_optimization(RewriterConfig::NO_MEM_OPT);
diff --git a/tensorflow/core/grappler/clusters/cluster.h b/tensorflow/core/grappler/clusters/cluster.h
index 616ab6ffdcc1e62c4c56f6826a8a5852d51b00d7..d7af50f7dc7e21db189118d84f3181a4e99563b8 100644
--- a/tensorflow/core/grappler/clusters/cluster.h
+++ b/tensorflow/core/grappler/clusters/cluster.h
@@ -64,6 +64,9 @@ class Cluster {
   // before Provision().
   void SetNumWarmupSteps(int num_steps);
 
+  // Returns the number of warmup steps.
+  int NumWarmupSteps() const;
+
   // Disable the collection of detailed statistics. Must be called
   // before Provision().
   void DisableDetailedStats(bool disable);
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index 1a6fad41828c1cc3eaa0d78d12d984dcf5b59692..b39d8c752669f84e763dd13f269f5bd30b7ee3f2 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -31,20 +31,13 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-static std::atomic<bool> already_created(false);
+static std::atomic<bool> already_provisioned(false);
 
 SingleMachine::SingleMachine(int timeout_s, int num_cpu_cores, int num_gpus)
     : Cluster(timeout_s),
       num_gpus_(num_gpus),
       expected_init_time_s_(0),
       closing_(false) {
-  // This is really ugly: to avoid leaking variables, we need to reset the tf
-  // session every time we're done processing a grappler item. However,
-  // variables are global, and therefore we can't have more than 1 session alive
-  // at a time. This check detects when more that one cluster is created.
-  CHECK(!already_created);
-  already_created = true;
-
   VLOG(1) << "Number of CPU cores: " << num_cpu_cores
           << " Number of GPUs: " << num_gpus;
   thread_pool_.reset(new thread::ThreadPool(
@@ -71,17 +64,20 @@ SingleMachine::~SingleMachine() {
   // Reset the thread-pool so that there are no outstanding Session::Run(...)s
   // when we delete the session.
   thread_pool_.reset();
-
-  CHECK(already_created);
-  already_created = false;
 }
 
 Status SingleMachine::Provision() {
-  Status status = ResetSession();
-  if (!status.ok()) {
-    return status;
+  // This is really ugly: to avoid leaking variables, we need to reset the tf
+  // session every time we're done processing a grappler item. However,
+  // variables are global, and therefore we can't have more than 1 session alive
+  // at a time. This check detects when more that one cluster is provisioned.
+  if (already_provisioned) {
+    return errors::Unavailable(
+        "Can't provision more than one single cluster at a time");
   }
 
+  TF_RETURN_IF_ERROR(ResetSession());
+
   DeviceProperties attr = GetLocalCPUInfo();
   devices_["/job:localhost/replica:0/task:0/cpu:0"] = GetLocalCPUInfo();
 
@@ -92,6 +88,7 @@ Status SingleMachine::Provision() {
     VLOG(1) << "Adding GPU device " << device_name;
     devices_[device_name] = GetLocalGPUInfo(i);
   }
+  already_provisioned = true;
   return Status::OK();
 }
 
@@ -108,27 +105,12 @@ Status SingleMachine::Initialize(const GrapplerItem& item) {
 }
 
 Status SingleMachine::Shutdown() {
-  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+  TF_RETURN_IF_ERROR(ShutdownSession());
+
+  mutex_lock l(this->last_graph_mu_);
+  last_graph_ = nullptr;
+  already_provisioned = false;
 
-  // Delete the threadpool: this ensures that all the pending closures complete
-  // before we return. Note that if TF deadlocked on us, the closures will
-  // never complete, and the call to thread_pool_.reset() will never return:
-  // therefore we need to delete the threadpool with the background thread.
-  // That thread itself will also never complete, so the user should
-  // abort the process to avoid leaking too many resources.
-  auto n = std::make_shared<Notification>();
-  Env::Default()->SchedClosure([this, n]() {
-    thread_pool_.reset();
-    n->Notify();
-  });
-  int64 timeout_us = 1000000ll * timeout_s_;
-  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
-  if (!notified) {
-    // Let the caller know that we can't shutdown the session properly since
-    // there are calls to Session::Run() still running.
-    return errors::Unavailable("The session is still running graphs after ",
-                               timeout_s_, " seconds");
-  }
   return Status::OK();
 }
 
@@ -230,7 +212,7 @@ Status SingleMachine::RunWithTimeout(
 }
 
 Status SingleMachine::CloseSession(bool use_timeout) {
-  if (!session_) {
+  if (!session_ || !thread_pool_) {
     return Status::OK();
   }
 
@@ -274,12 +256,38 @@ Status SingleMachine::CloseSession(bool use_timeout) {
   return Status::OK();
 }
 
+Status SingleMachine::ShutdownSession() {
+  TF_RETURN_IF_ERROR(CloseSession(true /*use_timeout*/));
+
+  // Delete the threadpool: this ensures that all the pending closures complete
+  // before we return. Note that if TF deadlocked on us, the closures will
+  // never complete, and the call to thread_pool_.reset() will never return:
+  // therefore we need to delete the threadpool with the background thread.
+  // That thread itself will also never complete, so the user should
+  // abort the process to avoid leaking too many resources.
+  auto n = std::make_shared<Notification>();
+  Env::Default()->SchedClosure([this, n]() {
+    thread_pool_.reset();
+    n->Notify();
+  });
+  int64 timeout_us = 1000000ll * timeout_s_;
+  const bool notified = WaitForNotificationWithTimeout(n.get(), timeout_us);
+  if (!notified) {
+    // Let the caller know that we can't shutdown the session properly since
+    // there are calls to Session::Run() still running.
+    return errors::Unavailable("The session is still running graphs after ",
+                               timeout_s_, " seconds");
+  }
+
+  return Status::OK();
+}
+
 Status SingleMachine::ResetSession() {
   if (session_) {
     LOG(INFO) << "Cleaning up previous session";
 
     // Make sure the session is properly closed
-    TF_RETURN_IF_ERROR(Shutdown());
+    TF_RETURN_IF_ERROR(ShutdownSession());
 
     // Destroying the object deletes all its variables as well. This is only
     // true for DirectSession.
diff --git a/tensorflow/core/grappler/clusters/single_machine.h b/tensorflow/core/grappler/clusters/single_machine.h
index d3efbe3c614580d0502874412697cd5719e28be5..be005a95091de5bca6e193d571dfd2f64dcf095c 100644
--- a/tensorflow/core/grappler/clusters/single_machine.h
+++ b/tensorflow/core/grappler/clusters/single_machine.h
@@ -49,6 +49,7 @@ class SingleMachine : public Cluster {
                         RunMetadata* run_metadata, int64 timeout_s);
   Status ResetSession();
   Status CloseSession(bool use_timeout);
+  Status ShutdownSession();
   void MergeCosts(CostGraphDef* graph_costs, const CostGraphDef& init_costs,
                   const CostGraphDef& queue_costs);
 
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index f6c325c2a4bb1877f07fbfd034755ff501344f48..df936efad104dd92595bcc7d325e964347b86cb8 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -48,6 +48,9 @@ class SingleMachineTest : public ::testing::Test {
   }
 
   void TearDown() override {
+    if (cluster_) {
+      TF_CHECK_OK(cluster_->Shutdown());
+    }
     cluster_.reset();
   }
 
@@ -178,8 +181,7 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   // With optimizations turned on, some nodes could have been optimized away,
   // and the cost model could be partial. Restart the cluster with optimizations
   // disabled and make sure we have all the information we're looking for.
-  cluster_.reset();
-  cluster_.reset(new SingleMachine(5, 3, 0));
+  TF_CHECK_OK(cluster_->Shutdown());
   cluster_->DisableOptimizer(true);
   TF_CHECK_OK(cluster_->Provision());
 
@@ -324,7 +326,7 @@ static void RunInfiniteTFLoop() {
 
 TEST_F(SingleMachineTest, InfiniteLoops) {
   // The RunInfiniteTFLoop function creates its own cluster.
-  cluster_.reset();
+  TF_CHECK_OK(cluster_->Shutdown());
 
   EXPECT_EXIT(RunInfiniteTFLoop(), ::testing::ExitedWithCode(0), ".*");
 }
@@ -578,7 +580,8 @@ TEST_F(SingleMachineTest, ReleaseMemoryAfterDestruction) {
   EXPECT_EQ(device_memory.size(), 1);
   EXPECT_GT(device_memory.begin()->second.bytes_in_use, 0);
 
-  // Reset cluster_ would release all memory.
+  // Shutting down the cluster_ would release all memory.
+  TF_CHECK_OK(cluster_->Shutdown());
   cluster_.reset();
   std::unordered_map<string, AllocatorStats> device_memory_after;
   TF_CHECK_OK(GetDeviceMemoryStats(options, &device_memory_after));
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 257e8e8d0417e70c673255450451cb2be2edf32a..d6ce72639ca3293b057efe70661df4d71dfad437 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -50,6 +50,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
     ],
 )
@@ -100,6 +101,7 @@ tf_cc_test(
     args = ["--heap_check=local"],  # The GPU tracer leaks memory
     deps = [
         ":graph_memory",
+        "//tensorflow/cc:cc_ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
@@ -131,8 +133,8 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = [
         ":op_performance_data_cc",
-        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
@@ -305,6 +307,7 @@ cc_library(
         ":virtual_placer",
         ":virtual_scheduler",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index d1f3e36aa8164c4a80537b8affc324503af5488b..1c2c1713834a11d0a7c85247e9a7e4cdf779c592 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -102,8 +102,14 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   Costs summary;
   TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9156), summary.execution_time);
-  EXPECT_FALSE(summary.inaccurate);
+  EXPECT_EQ(Costs::NanoSeconds(9150), summary.execution_time);
+
+  // Make this estimate accurate:
+  // TODO(http://b/70031255): Accurate estimator for RandomUniform op needed
+  // TODO(http://b/70031363): Accurate estimator for Softmax needed
+  //
+  // Change to EXPECT_FALSE when the above TODOs are done:
+  EXPECT_TRUE(summary.inaccurate);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_memory.cc b/tensorflow/core/grappler/costs/graph_memory.cc
index 0adec584a88b607e83915c9fb81d4e9c08758772..6022c47e8f689c6d9f262caae0c5e86f4cf6fb82 100644
--- a/tensorflow/core/grappler/costs/graph_memory.cc
+++ b/tensorflow/core/grappler/costs/graph_memory.cc
@@ -39,6 +39,7 @@ Status GraphMemory::InferDynamically(Cluster* cluster) {
   if (!cluster->DetailedStatsEnabled()) {
     return errors::Unavailable("Detailed stats collection must be enabled");
   }
+
   TF_RETURN_IF_ERROR(cluster->Initialize(item_));
   RunMetadata metadata;
   TF_RETURN_IF_ERROR(
@@ -163,6 +164,7 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
         live->memory_used = output.tensor_description()
                                 .allocation_description()
                                 .allocated_bytes();
+
         // Allocations typically take place at the very beginning of the op
         // execution.
         live->allocation_time =
@@ -185,7 +187,10 @@ void GraphMemory::InferFromTrace(const StepStats& timeline) {
       for (const string& input : node->input()) {
         int position;
         string input_node = ParseNodeName(input, &position);
-
+        if (position < 0) {
+          // Skip control dependencies
+          continue;
+        }
         LiveTensor* live = FindOrCreateLiveTensor(
             input_node, position, &live_tensors,
             &live_tensors_per_device[node_placement[input_node]]);
diff --git a/tensorflow/core/grappler/costs/graph_memory_test.cc b/tensorflow/core/grappler/costs/graph_memory_test.cc
index e4d0cf7813c63a7388301ba51f9edbe17a551902..6f3522b068bdb74eb98d3e6071d4d4b2e21c9ff6 100644
--- a/tensorflow/core/grappler/costs/graph_memory_test.cc
+++ b/tensorflow/core/grappler/costs/graph_memory_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/graph_memory.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/platform/test.h"
@@ -133,6 +134,39 @@ TEST_F(GraphMemoryTest, MultiDevice) {
   EXPECT_EQ(gpu_expected, gpu_tensors);
 }
 
+TEST_F(GraphMemoryTest, CtrlDependencies) {
+  // Build a simple graph with a control dependency.
+  Scope s = Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a").WithDevice("/CPU:0"), 10.0f, {3});
+  Output v =
+      ops::Variable(s.WithOpName("v").WithDevice("/CPU:0"), {3}, DT_FLOAT);
+  Output assign =
+      ops::Assign(s.WithOpName("assign").WithDevice("/CPU:0"), v, a);
+  ops::NoOp init(
+      s.WithOpName("init").WithDevice("/CPU:0").WithControlDependencies(
+          assign));
+
+  GrapplerItem item;
+  item.fetch.push_back("init");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphMemory memory(item);
+  Status status = memory.InferStatically(devices_);
+  TF_CHECK_OK(status);
+
+  const GraphMemory::MemoryUsage& mem = memory.GetPeakMemoryUsage("/CPU:0");
+  EXPECT_EQ(36, mem.used_memory);
+  std::set<string> tensors;
+  for (const auto& t : mem.live_tensors) {
+    tensors.insert(strings::StrCat(t.node, ":", t.output_id));
+  }
+  std::set<string> expected;
+  expected.insert("a:0");
+  expected.insert("v:0");
+  expected.insert("assign:0");
+  EXPECT_EQ(expected, tensors);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index e9cb2ee09d52d5438c80d4601623c47eaf973a8c..0453ceb6d180de4ea9af86e676efde7716c0297c 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -22,100 +22,234 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
+namespace {
 
+using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeAndType;
 using shape_inference::ShapeHandle;
 
-namespace {
+template <typename Handle>
+struct HashHandle {
+  std::size_t operator()(const Handle& h) const { return h.Handle(); }
+};
+template <typename Handle>
+struct CompareHandle {
+  bool operator()(const Handle& h1, const Handle& h2) const {
+    return h1.SameHandle(h2);
+  }
+};
 
-// If a Merge node has a NextIteration node as an input then that input will
-// try to forward an UnknownShape at graph construction time. However, the
-// Merge shape function will always propagate an UnknownShape if any of its
-// inputs are UnknownShapes. So we need to ignore the input from NextIteration
-// nodes to propagate any known shape from the Merge node.
-Status ShapeOfMergeNode(const Node* node, InferenceContext* c) {
-  ShapeHandle out = c->input(0);
-  if (!c->RankKnown(out)) {
-    out = c->UnknownShape();
-  } else {
-    int32 rank = c->Rank(out);
-    for (const Edge* e : node->in_edges()) {
-      if (e->src()->IsNextIteration() || e->dst_input() <= 0) {
-        continue;
-      }
-      ShapeHandle input = c->input(e->dst_input());
-      if (!c->RankKnown(input) || c->Rank(input) != rank) {
-        out = c->UnknownShape();
-        break;
-      }
+template <typename Handle>
+struct HandleToObject {};
+template <>
+struct HandleToObject<ShapeHandle> {
+  typedef ShapeHandle Object;
 
-      for (int d = 0; d < rank; ++d) {
-        if (c->Value(c->Dim(input, d)) != c->Value(c->Dim(out, d))) {
-          TF_RETURN_IF_ERROR(c->ReplaceDim(out, d, c->UnknownDim(), &out));
-        }
-      }
+  static ShapeHandle Unknown() { return ShapeHandle(); }
+};
+
+template <>
+struct HandleToObject<DimensionHandle> {
+  typedef int64 Object;
+
+  static int64 Unknown() { return -1; }
+};
+
+template <typename Handle>
+struct Processor {};
+
+template <>
+struct Processor<ShapeHandle> {
+  // Extract the shape or dim denoted by the handle.
+  void ExtractValue(ShapeHandle h, ShapeHandle* result) { *result = h; }
+  // Merge the shapes or dims.
+  Status Merge(ShapeHandle h1, ShapeHandle h2, ShapeHandle* result) {
+    if (InferenceContext::RankKnown(*result)) {
+      // The result was initialized in a previous merge to a shape of known
+      // rank, make sure we preserve that information.
+      return Status::OK();
+    }
+    if (InferenceContext::RankKnown(h1)) {
+      *result = h1;
+    } else {
+      *result = h2;
     }
+    return Status::OK();
   }
-  c->set_output(0, out);
-  c->set_output(1, c->Scalar());
-  return Status::OK();
-}
+};
 
-// Manually propagate the input shape for Enter nodes and update any Merge node
-// outputs.
-Status UpdateEnter(ShapeRefiner* shape_refiner, const Node* node, bool relax,
-                   std::queue<const Node*>* new_shapes) {
-  auto enter_ctx = shape_refiner->GetContext(node);
-  CHECK_NE(enter_ctx, nullptr);
-  for (int i = 0; i < enter_ctx->num_outputs(); i++) {
-    TF_RETURN_IF_ERROR(shape_refiner->SetShape(node, i, enter_ctx->input(0)));
-  }
-  for (const Edge* e : node->out_edges()) {
-    Node* dst = e->dst();
-    if (dst->IsMerge()) {
-      bool updated = false;
-      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(dst, relax, &updated));
-      if (!updated) {
-        continue;
+template <>
+struct Processor<DimensionHandle> {
+  // Assign a negative id to unknown dimensions, starting at -2 (the -1 id
+  // reserved by TensorFlow).
+  void ExtractValue(DimensionHandle d, int64* result) {
+    if (!InferenceContext::ValueKnown(d)) {
+      *result = -counter;
+      counter++;
+    } else {
+      int64 val = InferenceContext::Value(d);
+      if (val >= 0) {
+        *result = val;
+      } else {
+        // A shape inference function generated an invalid dimension handle.
+        // Use a symbolic dimension to encode this.
+        *result = -counter;
+        counter++;
       }
-      InferenceContext* merge_ctx = shape_refiner->GetContext(dst);
-      CHECK_NE(merge_ctx, nullptr);
-      TF_RETURN_IF_ERROR(ShapeOfMergeNode(dst, merge_ctx));
-      new_shapes->push(dst);
     }
   }
-  return Status::OK();
-}
 
-// Propagates the shapes in the transitive fan-out of <new_shapes>.
-Status PropagateShapes(ShapeRefiner* shape_refiner, bool relax,
-                       std::queue<const Node*>* new_shapes) {
-  while (!new_shapes->empty()) {
-    const Node* n = new_shapes->front();
-    new_shapes->pop();
-    for (const Node* fanout : n->out_nodes()) {
-      bool updated = false;
-      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(fanout, relax, &updated));
-      if (fanout->IsEnter()) {
-        TF_RETURN_IF_ERROR(
-            UpdateEnter(shape_refiner, fanout, relax, new_shapes));
-      } else if (updated) {
-        // We want to avoid propagating through loops on the merge pass because
-        // the shapes are not guaranteed to converge.
-        if (!relax && fanout->IsNextIteration()) {
-          continue;
-        }
-        new_shapes->push(fanout);
+  // Merge the dimensions d1 and d2. Return the known shape if there is one,
+  // otherwise look for a symbolic shape. If there is no symbolic shape and no
+  // known shape, the shape if fully unknown so return -1.
+  Status Merge(DimensionHandle d1, DimensionHandle d2, int64* result) {
+    const int64 dim1 = InferenceContext::Value(d1);
+    const int64 dim2 = InferenceContext::Value(d2);
+
+    if (dim1 >= 0 && dim2 >= 0) {
+      CHECK_EQ(dim1, dim2);
+      return RefineDim(dim1, result);
+    } else if (dim1 >= 0 && dim2 < 0) {
+      return RefineDim(dim1, result);
+    } else if (dim1 < 0 && dim2 >= 0) {
+      return RefineDim(dim2, result);
+    } else if (dim1 < -1) {
+      return RefineDim(dim1, result);
+    } else if (dim2 < -1) {
+      return RefineDim(dim2, result);
+    } else {
+      CHECK_EQ(dim1, dim2);
+      CHECK_EQ(-1, dim1);
+      return RefineDim(-1, result);
+    }
+    return Status::OK();
+  }
+
+ private:
+  Status RefineDim(int64 dim, int64* result) {
+    if (*result >= 0) {
+      if (!(*result == dim || dim < 0)) {
+        return errors::InvalidArgument("Inconsistent dimensions detected");
       }
+    } else if (dim >= 0) {
+      *result = dim;
+    } else if (dim < *result) {
+      *result = dim;
+    }
+    return Status::OK();
+  }
+
+  int64 counter = 2;
+};
+
+// Traditional Disjoint-Set datastructure with path compression.
+// (https://en.wikipedia.org/wiki/Disjoint-set_data_structure)
+template <typename Handle>
+class DisjointSet {
+ public:
+  DisjointSet(const Processor<Handle>& processor) : processor_(processor) {}
+  ~DisjointSet() {
+    for (auto rep : nodes_) {
+      delete rep.second;
     }
   }
+
+  Status Merge(Handle x, Handle y);
+  const typename HandleToObject<Handle>::Object GetMergedValue(Handle value);
+
+ private:
+  // All the handles that belong to the same set are part of the same tree, and
+  // utimately represented by the root of that tree.
+  struct Rep {
+    // Parent in the tree used to encode the set.
+    Rep* parent;
+    // Rank in the tree, used to figure out how to compress the path to the root
+    // of the tree.
+    int rank;
+    // The handle.
+    typename HandleToObject<Handle>::Object value;
+  };
+
+  // Create a new set for the value if none exists, or return its representative
+  // node otherwise.
+  Rep* Find(Handle value);
+
+ private:
+  Processor<Handle> processor_;
+  std::unordered_map<Handle, Rep*, HashHandle<Handle>, CompareHandle<Handle>>
+      nodes_;
+};
+
+template <typename Handle>
+const typename HandleToObject<Handle>::Object
+DisjointSet<Handle>::GetMergedValue(Handle value) {
+  Rep* rep = Find(value);
+  if (!rep) {
+    // We don't know anything about this handle.
+    return HandleToObject<Handle>::Unknown();
+  }
+  return rep->value;
+}
+
+template <typename Handle>
+Status DisjointSet<Handle>::Merge(Handle x, Handle y) {
+  Rep* x_root = Find(x);
+  Rep* y_root = Find(y);
+
+  // x and y are already in the same set
+  if (x_root == y_root) {
+    return Status::OK();
+  }
+  // x and y are not in same set, so we merge them
+  // Use the occasion to strengthen what we know about the handle by merging the
+  // information about the 2 subsets.
+  if (x_root->rank < y_root->rank) {
+    TF_RETURN_IF_ERROR(processor_.Merge(y, x, &y_root->value));
+    x_root->parent = y_root;
+  } else if (x_root->rank > y_root->rank) {
+    TF_RETURN_IF_ERROR(processor_.Merge(x, y, &x_root->value));
+    y_root->parent = x_root;
+  } else {
+    TF_RETURN_IF_ERROR(processor_.Merge(x, y, &x_root->value));
+    // Arbitrarily make one root the new parent
+    y_root->parent = x_root;
+    x_root->rank = x_root->rank + 1;
+  }
   return Status::OK();
 }
 
+template <typename Handle>
+typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
+  auto it = nodes_.find(value);
+  if (it == nodes_.end()) {
+    // This is the first time we process this handle, create an entry for it.
+    Rep* node = new Rep;
+    node->parent = node;
+    node->rank = 0;
+    processor_.ExtractValue(value, &node->value);
+    nodes_[value] = node;
+    return node;
+  }
+  // Return the representative for the set, which is the root of the tree. Apply
+  // path compression to speedup future queries.
+  Rep* node = it->second;
+  Rep* root = node->parent;
+  while (root != root->parent) {
+    root = root->parent;
+  }
+  while (node->parent != root) {
+    Rep* next = node->parent;
+    node->parent = root;
+    node = next;
+  }
+  return root;
+}
+
 bool IsQueue(const Node& node) {
   StringPiece type(node.type_string());
   return type.ends_with("QueueV2");
@@ -131,26 +265,380 @@ bool IsEnterWithQueue(const Node& node) {
   return false;
 }
 
-}  // namespace
-
-void GraphProperties::Relax(InferenceContext* c, ShapeHandle s0, ShapeHandle s1,
-                            ShapeHandle* out) {
-  c->Relax(s0, s1, out);
+bool HasAnyUnknownDimensions(const TensorShapeProto& proto) {
+  if (proto.unknown_rank()) {
+    return true;
+  }
+  for (const auto& dim : proto.dim()) {
+    if (dim.size() < 0) {
+      return true;
+    }
+  }
+  return false;
 }
 
-bool GraphProperties::SameDefinedShape(InferenceContext* c, ShapeHandle s0,
-                                       ShapeHandle s1) {
-  return ShapeRefiner::SameDefinedShape(c, s0, s1);
-}
+void VerboseLogUnknownDimensionSources(
+    const Graph& graph,
+    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+        input_properties_map,
+    const std::map<string, std::vector<OpInfo::TensorProperties>>&
+        output_properties_map) {
+  if (!VLOG_IS_ON(2)) {
+    return;
+  }
+
+  VLOG(2) << "Nodes with known inputs, but with unknown output dimensions:";
+
+  // Find all nodes in the graph for which we
+  // do not have any unknown dimensions in their inputs, but
+  // we have some unknown dimensions in their outputs.
+  std::map<string, int> op_to_count;
+  for (const Node* const node : graph.nodes()) {
+    if (node->num_outputs() == 0) {
+      continue;
+    }
+
+    const auto& input_properties = input_properties_map.at(node->name());
+    const auto& output_properties = output_properties_map.at(node->name());
+
+    bool has_unknown_inputs = false;
+    for (int i = 0; i < node->num_inputs(); ++i) {
+      if (HasAnyUnknownDimensions(input_properties[i].shape())) {
+        has_unknown_inputs = true;
+        break;
+      }
+    }
+
+    if (has_unknown_inputs) {
+      continue;
+    }
 
-bool GraphProperties::IsUpdatedShapesOrTypes(
-    InferenceContext* c, const std::vector<ShapeAndType>& existing,
-    const std::vector<ShapeAndType>& updated) {
-  return ShapeRefiner::IsUpdatedShapesOrTypes(c, existing, updated);
+    for (int i = 0; i < node->num_outputs(); ++i) {
+      if (HasAnyUnknownDimensions(output_properties[i].shape())) {
+        string inputs = "input_shapes=[";
+        for (int i = 0; i < node->num_inputs(); ++i) {
+          inputs +=
+              PartialTensorShape::DebugString(input_properties[i].shape());
+        }
+        inputs += "]";
+
+        string outputs = "output_shapes=[";
+        for (int i = 0; i < node->num_outputs(); ++i) {
+          outputs +=
+              PartialTensorShape::DebugString(output_properties[i].shape());
+        }
+        outputs += "]";
+
+        VLOG(2) << "Node: " << node->name() << ", Op: " << node->def().op()
+                << ", " << inputs << ", " << outputs;
+
+        op_to_count[node->def().op()]++;
+
+        // don't log again for this node
+        break;
+      }
+    }
+  }
+  VLOG(2) << "Op types with known inputs, but with unknown output dimensions "
+          << "(format: <op_type> (<count>)):";
+  for (const auto& p : op_to_count) {
+    VLOG(2) << p.first << " (" << p.second << ")";
+  }
 }
 
+}  // namespace
+
+// Queue of nodes to process. Nodes can be enqueued in any order, but will be
+// dequeued in (roughly) topological order. Propagating shapes following a
+// topological ordering isn't required for correctness but helps speed things up
+// since it avoids processing the same node multiple times as its inputs
+// information is refined.
+class TopoQueue {
+ public:
+  void push(const Node* n) { queue_.insert(n); }
+  const Node* pop() {
+    CHECK(!empty());
+    auto it = queue_.begin();
+    const Node* n = *it;
+    queue_.erase(it);
+    return n;
+  }
+
+  bool empty() const { return queue_.empty(); }
+  std::size_t size() const { return queue_.size(); }
+
+ private:
+  // Graph nodes are created in (roughly) topological order. Therefore we can
+  // use their id to ensure they're sorted topologically.
+  struct CompareNodes {
+    bool operator()(const Node* lhs, const Node* rhs) const {
+      return lhs->id() > rhs->id();
+    }
+  };
+  std::set<const Node*, CompareNodes> queue_;
+};
+
+// Merge and relax symbolic shapes.
+// Each symbolic shape or dimension is represented by a handle. Unlike the TF
+// shape refiner which creates new handles every time it processes an unknown
+// shape/dimension, the symbolic shape refiner assigns a specific handle to each
+// unknown shape/dimension of a given node.
+class SymbolicShapeRefiner {
+ public:
+  explicit SymbolicShapeRefiner(ShapeRefiner* shape_refiner)
+      : shape_refiner_(shape_refiner) {}
+
+  InferenceContext* GetContext(const Node* node) {
+    return shape_refiner_->GetContext(node);
+  }
+  Status UpdateNode(const Node* node, bool relax, bool* refined) {
+    return shape_refiner_->UpdateNode(node, relax, refined);
+  }
+  Status SetUnknownShape(const Node* node, int output_port) {
+    shape_inference::ShapeHandle shape =
+        GetUnknownOutputShape(node, output_port);
+    InferenceContext* ctx = GetContext(node);
+    if (ctx == nullptr) {
+      return errors::InvalidArgument("Missing context");
+    }
+    ctx->set_output(output_port, shape);
+    return Status::OK();
+  }
+
+  struct ShapeId {
+    const Node* node;
+    int port_id;
+    bool operator==(const ShapeId& other) const {
+      return node == other.node && port_id == other.port_id;
+    }
+  };
+  struct HashShapeId {
+    std::size_t operator()(const ShapeId& shp) const {
+      return std::hash<const Node*>{}(shp.node) + shp.port_id;
+    }
+  };
+
+  struct DimId {
+    const Node* node;
+    int port_id;
+    int dim_index;
+    bool operator==(const DimId& other) const {
+      return node == other.node && port_id == other.port_id &&
+             dim_index == other.dim_index;
+    }
+  };
+
+  struct HashDimId {
+    std::size_t operator()(const DimId& dim) const {
+      return std::hash<const Node*>{}(dim.node) + dim.port_id + dim.dim_index;
+    }
+  };
+
+  // Compute the shape of the tensors outputed by node 'node' at output port
+  // 'port_index' as the intersection of shape1 and shape2.
+  ShapeHandle OutputAsIntersection(const Node* node, int port_index,
+                                   ShapeHandle shape1, ShapeHandle shape2) {
+    if (shape1.SameHandle(shape2)) {
+      return shape1;
+    }
+    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    ShapeHandle merged = shape1;
+    if (!ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
+      // Return either one since they're expected to represent the same value.
+      return shape1;
+    } else if (!ctx->RankKnown(shape2) && ctx->RankKnown(shape1)) {
+      return shape1;
+    } else if (ctx->RankKnown(shape2) && !ctx->RankKnown(shape1)) {
+      return shape2;
+    } else {
+      const int rank = ctx->Rank(shape1);
+      if (ctx->Rank(shape2) != rank) {
+        // We detected an inconsistency, return an unknown shape. This can
+        // happen in the fanout of a merge node since during the initial
+        // propagation we optimistically assume that all the inputs to the merge
+        // node have the same shape.
+        return GetUnknownOutputShape(node, port_index);
+      }
+      for (int d = 0; d < rank; ++d) {
+        if (!ctx->Dim(shape1, d).SameHandle(ctx->Dim(shape2, d))) {
+          if (ctx->Value(ctx->Dim(shape1, d)) !=
+              ctx->Value(ctx->Dim(shape2, d))) {
+            DimensionHandle new_dim;
+            if (ctx->Value(ctx->Dim(shape1, d)) < 0) {
+              new_dim = ctx->Dim(shape2, d);
+            } else if (ctx->Value(ctx->Dim(shape2, d)) < 0) {
+              new_dim = ctx->Dim(shape1, d);
+            } else {
+              new_dim = GetUnknownOutputDim(node, port_index, d);
+            }
+            TF_CHECK_OK(ctx->ReplaceDim(merged, d, new_dim, &merged));
+          }
+        }
+      }
+    }
+    return merged;
+  }
+
+  // Compute the shape of the tensors outputed by node 'node' at output port
+  // 'port_index' as the union of shape1 and shape2.
+  ShapeHandle OutputAsUnion(const Node* node, int port_index,
+                            ShapeHandle shape1, ShapeHandle shape2) {
+    if (shape1.SameHandle(shape2)) {
+      return shape1;
+    }
+    InferenceContext* ctx = shape_refiner_->GetContext(node);
+    ShapeHandle relaxed = shape1;
+    const int rank = ctx->Rank(shape1);
+    if (!ctx->RankKnown(shape2) || ctx->Rank(shape2) != rank) {
+      relaxed = GetUnknownOutputShape(node, port_index);
+    } else {
+      for (int d = 0; d < rank; ++d) {
+        if (!ctx->Dim(shape1, d).SameHandle(ctx->Dim(shape2, d))) {
+          int64 val1 = ctx->Value(ctx->Dim(shape1, d));
+          int64 val2 = ctx->Value(ctx->Dim(shape2, d));
+          if (val1 != val2 || (val1 < 0 && val2 < 0)) {
+            DimensionHandle new_dim = GetUnknownOutputDim(node, port_index, d);
+            TF_CHECK_OK(ctx->ReplaceDim(relaxed, d, new_dim, &relaxed));
+          }
+        }
+      }
+    }
+    return relaxed;
+  }
+
+  bool EquivalentShapes(ShapeHandle s1, ShapeHandle s2) const {
+    if (s1.SameHandle(s2)) {
+      return true;
+    }
+    if (InferenceContext::Rank(s1) != InferenceContext::Rank(s2)) {
+      return false;
+    }
+    if (!InferenceContext::RankKnown(s1) && !InferenceContext::RankKnown(s2)) {
+      return true;
+    }
+    const int rank = InferenceContext::Rank(s1);
+    for (int i = 0; i < rank; ++i) {
+      if (!InferenceContext::DimKnownRank(s1, i).SameHandle(
+              InferenceContext::DimKnownRank(s2, i))) {
+        int64 val1 =
+            InferenceContext::Value(InferenceContext::DimKnownRank(s1, i));
+        int64 val2 =
+            InferenceContext::Value(InferenceContext::DimKnownRank(s2, i));
+        if (val1 >= 0 && val2 >= 0 && val1 == val2) {
+          continue;
+        }
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool EquivalentShapesAndTypes(const std::vector<ShapeAndType>& st1,
+                                const std::vector<ShapeAndType>& st2) const {
+    if (st1.size() != st2.size()) {
+      return false;
+    }
+    for (int i = 0; i < st1.size(); ++i) {
+      const ShapeAndType& s1 = st1[i];
+      const ShapeAndType& s2 = st2[i];
+      if (s1.dtype != s2.dtype) {
+        return false;
+      }
+      if (!EquivalentShapes(s1.shape, s2.shape)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ private:
+  // Return the one ShapeHandle used to denote a fully unknown shape for a node
+  // output.
+  ShapeHandle GetUnknownOutputShape(const Node* node, int index) {
+    ShapeId id{node, index};
+    auto it = unknown_shapes_.find(id);
+    if (it != unknown_shapes_.end()) {
+      return it->second;
+    }
+    InferenceContext* c = shape_refiner_->GetContext(node);
+    ShapeHandle shp = c->UnknownShape();
+    unknown_shapes_[id] = shp;
+    return shp;
+  }
+  // Return the one ShapeHandle used to denote a fully unknown dimension for a
+  // node output.
+  DimensionHandle GetUnknownOutputDim(const Node* node, int index, int dim_id) {
+    DimId id{node, index, dim_id};
+    auto it = unknown_dims_.find(id);
+    if (it != unknown_dims_.end()) {
+      return it->second;
+    }
+    InferenceContext* c = shape_refiner_->GetContext(node);
+    DimensionHandle dim = c->UnknownDim();
+    unknown_dims_[id] = dim;
+    return dim;
+  }
+
+  ShapeRefiner* shape_refiner_;
+
+  std::unordered_map<ShapeId, ShapeHandle, HashShapeId> unknown_shapes_;
+  std::unordered_map<DimId, DimensionHandle, HashDimId> unknown_dims_;
+};
+
+// Keep track of shapes and dimensions in a graph.
+// In particular, use disjoint sets to track equivalence between shapes and
+// dims, and consolidate the information globally.
+class SymbolicShapeManager {
+ public:
+  SymbolicShapeManager() : shapes_(shape_processor_), dims_(dim_processor_) {}
+
+  Status Merge(ShapeHandle s1, ShapeHandle s2) {
+    if (!s1.IsSet() || !s2.IsSet()) {
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(shapes_.Merge(s1, s2));
+    if (InferenceContext::Rank(s1) > 0 && InferenceContext::Rank(s2) > 0) {
+      CHECK_EQ(InferenceContext::Rank(s1), InferenceContext::Rank(s2));
+      for (int i = 0; i < InferenceContext::Rank(s1); ++i) {
+        TF_RETURN_IF_ERROR(dims_.Merge(InferenceContext::DimKnownRank(s1, i),
+                                       InferenceContext::DimKnownRank(s2, i)));
+      }
+    }
+    return Status::OK();
+  }
+  Status Merge(DimensionHandle d1, DimensionHandle d2) {
+    if (!d1.IsSet() || !d2.IsSet()) {
+      return Status::OK();
+    }
+    return dims_.Merge(d1, d2);
+  }
+
+  void AsTensorProperties(const ShapeHandle& shape, const DataType& type,
+                          OpInfo::TensorProperties* properties) {
+    properties->set_dtype(type);
+    ShapeHandle actual_shape = shapes_.GetMergedValue(shape);
+    if (!InferenceContext::RankKnown(actual_shape)) {
+      properties->mutable_shape()->set_unknown_rank(true);
+    } else {
+      for (int j = 0; j < InferenceContext::Rank(actual_shape); ++j) {
+        shape_inference::DimensionHandle dim =
+            InferenceContext::DimKnownRank(actual_shape, j);
+        int64 d = dims_.GetMergedValue(dim);
+        properties->mutable_shape()->add_dim()->set_size(d);
+      }
+    }
+  }
+
+ private:
+  Processor<ShapeHandle> shape_processor_;
+  DisjointSet<shape_inference::ShapeHandle> shapes_;
+  Processor<DimensionHandle> dim_processor_;
+  DisjointSet<shape_inference::DimensionHandle> dims_;
+};
+
 Status GraphProperties::MergeEnqueueShapesAndTypes(
-    const std::vector<ShapeAndType>& shapes_and_types, InferenceContext* qctx,
+    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
     return errors::InvalidArgument(
@@ -166,13 +654,14 @@ Status GraphProperties::MergeEnqueueShapesAndTypes(
                                      DataTypeString(b.dtype));
     }
 
-    TF_RETURN_IF_ERROR(qctx->Merge(a.shape, b.shape, &b.shape));
+    b.shape = shape_refiner->OutputAsIntersection(qnode, i, a.shape, b.shape);
   }
   return Status::OK();
 }
 
 Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
-    const std::vector<ShapeAndType>& shapes_and_types, InferenceContext* qctx,
+    SymbolicShapeRefiner* shape_refiner, const Node* qnode,
+    const std::vector<ShapeAndType>& shapes_and_types,
     std::vector<ShapeAndType>* queue_shapes_and_types) {
   if (shapes_and_types.size() != queue_shapes_and_types->size()) {
     return errors::InvalidArgument(
@@ -188,12 +677,246 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
                                      DataTypeString(b.dtype));
     }
 
-    Relax(qctx, a.shape, b.shape, &b.shape);
+    b.shape = shape_refiner->OutputAsUnion(qnode, i, a.shape, b.shape);
+  }
+  return Status::OK();
+}
+
+// If a Merge node has a NextIteration node as an input then that input will
+// try to forward an UnknownShape at graph construction time. However, the
+// Merge shape function will always propagate an UnknownShape if any of its
+// inputs are UnknownShapes. So we need to ignore the input from NextIteration
+// nodes to propagate any known shape from the Merge node.
+Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
+                                        const Node* node, bool relax,
+                                        TopoQueue* new_shapes) {
+  InferenceContext* c = shape_refiner->GetContext(node);
+  CHECK_NE(c, nullptr);
+
+  ShapeHandle out;
+  bool out_initialized = false;
+  for (const Edge* e : node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    // Skip back edges during the initial propagation phase. This is equivalent
+    // to assuming that all the inputs to the merge nodes are fed by the same
+    // shape, and will be corrected as needed in the relaxation phase.
+    if (!relax && e->src()->IsNextIteration()) {
+      continue;
+    }
+
+    InferenceContext* in = shape_refiner->GetContext(e->src());
+    ShapeHandle input = in->output(e->src_output());
+    if (relax) {
+      c->RelaxInput(e->dst_input(), input);
+    } else {
+      c->MergeInput(e->dst_input(), input);
+    }
+    if (!out_initialized) {
+      out_initialized = true;
+      out = input;
+      continue;
+    }
+    if (relax) {
+      out = shape_refiner->OutputAsUnion(node, 0, input, out);
+    } else {
+      out = shape_refiner->OutputAsIntersection(node, 0, input, out);
+    }
+  }
+
+  if (!shape_refiner->EquivalentShapes(out, c->output(0))) {
+    c->set_output(0, out);
+    c->set_output(1, c->Scalar());
+    new_shapes->push(node);
+  }
+
+  return Status::OK();
+}
+
+Status GraphProperties::OverwriteFedPorts(
+    SymbolicShapeRefiner* shape_refiner,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const Node* node, TopoQueue* new_shapes) const {
+  auto it = fed_ports.find(node->name());
+  Status status;
+  if (it != fed_ports.end()) {
+    // It is possible to feed node output ports with tensors of any shape: as a
+    // result, the shape of a fed port is completely unknown.
+    for (const int output_port : it->second) {
+      status.Update(shape_refiner->SetUnknownShape(node, output_port));
+    }
+    new_shapes->push(node);
+  }
+  return status;
+}
+
+// Manually propagate the input shape for Enter nodes and update any Merge node
+// outputs.
+Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
+                                    const Node* node, bool relax,
+                                    TopoQueue* new_shapes) {
+  auto enter_ctx = shape_refiner->GetContext(node);
+  CHECK_NE(enter_ctx, nullptr);
+
+  for (const Edge* e : node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    InferenceContext* in = shape_refiner->GetContext(e->src());
+    ShapeHandle input = in->output(e->src_output());
+    if (!enter_ctx->output(0).SameHandle(input)) {
+      if (relax) {
+        enter_ctx->RelaxInput(0, input);
+      } else {
+        enter_ctx->MergeInput(0, input);
+      }
+      enter_ctx->set_output(0, input);
+      new_shapes->push(node);
+    }
+  }
+  return Status::OK();
+}
+
+Status GraphProperties::UpdateShapes(
+    SymbolicShapeRefiner* shape_refiner, bool relax,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    const Node* n, TopoQueue* new_shapes) const {
+  if (n->IsEnter()) {
+    // The Enter shape function always forwards an UnknownShape, so do the right
+    // thing here.
+    TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, relax, new_shapes));
+  } else if (n->IsMerge()) {
+    // Properly handle merge nodes.
+    TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, relax, new_shapes));
+  } else {
+    // Rely on regular TF shape refinement for all the other nodes.
+    bool updated = false;
+    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(n, relax, &updated));
+    if (updated) {
+      // We want to avoid propagating through loops on the merge pass because
+      // the shapes are not guaranteed to converge.
+      if (relax || !n->IsNextIteration()) {
+        new_shapes->push(n);
+      }
+    }
   }
+  // Nodes can be fed with any shape. The TensorFlow shape inference code can't
+  // handle this properly, so overwrite its behavior here.
+  return OverwriteFedPorts(shape_refiner, fed_ports, n, new_shapes);
+}
+
+// Propagates the shapes in the transitive fan-out of <new_shapes>.
+Status GraphProperties::PropagateShapes(
+    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
+    const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
+        resources,
+    const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+    int num_loops) const {
+  // Limit the number of iterations to prevent infinite loops in the presence of
+  // incorrect shape functions. The algoritm should converge in at most
+  // num_nested_loops^2 * max_rank. We approximate max_rank with the constant 4.
+  // The same applies to resources.
+  VLOG(1) << "Propagating (relax=" << relax << ") " << new_shapes->size()
+          << " new shapes through " << num_loops << " loops and "
+          << resources.size() << " resources" << std::endl;
+
+  const int64 max_loop_length = item_.graph.node_size();
+  const int64 max_rank = 4;
+  const int64 max_loop_iterations =
+      max_rank * max_loop_length * std::max<int64>(1, num_loops * num_loops);
+  const int64 num_queues = resources.size();
+  const int64 max_resource_iterations = num_queues * num_queues * max_rank;
+
+  int64 num_resource_iterations = 0;
+  do {
+    int64 num_loop_iterations = 0;
+    while (!new_shapes->empty() &&
+           num_loop_iterations++ < max_loop_iterations) {
+      const Node* n = new_shapes->pop();
+      for (const Edge* e : n->out_edges()) {
+        if (!e->IsControlEdge()) {
+          const Node* fanout = e->dst();
+          TF_RETURN_IF_ERROR(UpdateShapes(shape_refiner, relax, fed_ports,
+                                          fanout, new_shapes));
+        }
+      }
+    }
+
+    for (const auto& resource : resources) {
+      // Resources need special handling: since the enqueue nodes are in the
+      // fanout of the queues, we need to manually propagate the shapes from
+      // enqueue node to the corresponding queue.
+      TF_RETURN_IF_ERROR(UpdateResource(resource.first, resource.second,
+                                        shape_refiner, relax, new_shapes));
+    }
+  } while (!new_shapes->empty() &&
+           num_resource_iterations++ < max_resource_iterations);
+
+  if (!new_shapes->empty()) {
+    return errors::Internal("Shape inference failed to converge");
+  }
+
   return Status::OK();
 }
 
-Status GraphProperties::InferStatically() {
+Status GraphProperties::UpdateResource(
+    const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
+    SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes) {
+  // Proceed only if qnode is a queue or an Enter with queue input.
+  if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
+    return Status::OK();
+  }
+  auto qctx = shape_refiner->GetContext(qnode);
+  if (!qctx) {
+    return Status::OK();
+  }
+  auto* queue_handle_data = qctx->output_handle_shapes_and_types(0);
+
+  // Merge all inputs into the enqueue node, regardless of which phase we
+  // are in.
+  std::vector<ShapeAndType> queue_shapes_and_types;
+  if (queue_handle_data) {
+    queue_shapes_and_types = *queue_handle_data;
+  }
+  for (const auto& node : queue_inputs) {
+    auto ctx = shape_refiner->GetContext(node);
+    if (!ctx) {
+      continue;
+    }
+    // TODO(bsteiner): handle EnqueueMany as well.
+    if (node->type_string().find("Enqueue") != std::string::npos &&
+        node->type_string().find("EnqueueMany") == std::string::npos) {
+      std::vector<ShapeAndType> shapes_and_types;
+      for (int i = 1; i < ctx->num_inputs(); ++i) {
+        shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
+      }
+      if (queue_shapes_and_types.empty()) {
+        queue_shapes_and_types = shapes_and_types;
+      } else {
+        if (relax) {
+          TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
+              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
+        } else {
+          TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
+              shape_refiner, qnode, shapes_and_types, &queue_shapes_and_types));
+        }
+      }
+    }
+  }
+
+  if (queue_handle_data == nullptr ||
+      !shape_refiner->EquivalentShapesAndTypes(*queue_handle_data,
+                                               queue_shapes_and_types)) {
+    qctx->set_output_handle_shapes_and_types(0, queue_shapes_and_types);
+
+    new_shapes->push(qnode);
+  }
+
+  return Status::OK();
+}
+
+Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   Graph graph(OpRegistry::Global());
   FunctionLibraryDefinition function_library(graph.op_registry(),
                                              item_.graph.library());
@@ -202,14 +925,30 @@ Status GraphProperties::InferStatically() {
   shape_refiner.set_disable_constant_propagation(true);
   shape_refiner.set_function_library_for_shape_inference(&function_library);
   ImportGraphDefOptions options;
+  // Graph optimization happens at the late stage of graph execution,
+  // when colocation constraints are already validated previously and
+  // the device placement of nodes has also completed, so there
+  // is no need to validate colocation constraints again.
+  options.validate_colocation_constraints = false;
   Status s = ImportGraphDef(options, item_.graph, &graph, &shape_refiner);
   TF_RETURN_IF_ERROR(s);
 
+  std::unordered_map<string, std::unordered_set<int>> fed_ports;
+  if (!assume_valid_feeds) {
+    for (const auto& feed : item_.feed) {
+      int port_index = 0;
+      string node_name = ParseNodeName(feed.first, &port_index);
+      fed_ports[node_name].insert(port_index);
+    }
+  }
+
   // List the resources and the nodes using them. Also collect the Enter and
   // Merge nodes.
   std::unordered_map<const Node*, std::unordered_set<const Node*>> resources;
   std::unordered_set<const Node*> enter_nodes;
   std::unordered_set<const Node*> merge_nodes;
+  std::unordered_set<const Node*> fed_nodes;
+  int num_loops = 0;
   for (const Node* const node : graph.nodes()) {
     for (int i = 0; i < node->num_inputs(); ++i) {
       if (node->input_type(i) == DataType::DT_RESOURCE) {
@@ -220,182 +959,81 @@ Status GraphProperties::InferStatically() {
     }
     if (node->IsEnter()) {
       enter_nodes.insert(node);
+    } else if (node->IsMerge()) {
+      merge_nodes.insert(node);
     } else if (node->IsNextIteration()) {
-      for (const Node* output : node->out_nodes()) {
-        if (output->IsMerge()) {
-          merge_nodes.insert(output);
-        }
-      }
+      ++num_loops;
     }
-
-    // Infer output shape for Restore op.
-    if (node->op_def().name() == "Restore" ||
-        node->op_def().name() == "RestoreV2" ||
-        node->op_def().name() == "RestoreSlice") {
-      auto ctx = shape_refiner.GetContext(node);
-      for (const Edge* out_edge : node->out_edges()) {
-        const Node* output = out_edge->dst();
-        int output_idx = out_edge->src_output();
-        if (output_idx < 0) {
-          continue;
-        }
-        if (!ctx->FullyDefined(ctx->output(output_idx)) &&
-            output->op_def().name() == "Assign") {
-          if (!output->attrs().Find("validate_shape") ||
-              !output->attrs().Find("validate_shape")->b()) {
-            continue;
-          }
-          auto output_ctx = shape_refiner.GetContext(output);
-          if (output_ctx->FullyDefined(output_ctx->output(0))) {
-            ctx->set_output(output_idx, output_ctx->output(0));
-            output_ctx->MergeInput(1, output_ctx->output(0));
-          } else {
-            const Node* var;
-            TF_CHECK_OK(node->input_node(0, &var));
-            if (node->IsVariable()) {
-              auto var_ctx = shape_refiner.GetContext(var);
-              CHECK(var_ctx->FullyDefined(var_ctx->output(0)));
-              ctx->set_output(output_idx, var_ctx->output(0));
-              output_ctx->MergeInput(1, var_ctx->output(0));
-            }
-          }
-        }
-      }
+    if (fed_ports.find(node->name()) != fed_ports.end()) {
+      fed_nodes.insert(node);
     }
   }
 
-  // Propagate the initial shapes of Enter nodes manually (the Enter shape
-  // function always forwards an UnknownShape).
-  std::queue<const Node*> new_shapes;
-  for (const Node* node : enter_nodes) {
-    TF_RETURN_IF_ERROR(
-        UpdateEnter(&shape_refiner, node, false /* relax */, &new_shapes));
-  }
-  TF_RETURN_IF_ERROR(
-      PropagateShapes(&shape_refiner, false /* relax */, &new_shapes));
+  SymbolicShapeRefiner refiner(&shape_refiner);
 
   // We propagate shapes through the graph in two phases. In the first phase, we
-  // exclusively merge shapes but we do not propagate shapes through loops. Then
-  // on the second phase, we exclusively relax shapes and propagate shapes
-  // through loops until reaching fixed point.
+  // exclusively merge shapes but we do not propagate shapes through the
+  // backedge of loops (i.e. the NextIteration node). Then on the second phase,
+  // we exclusively relax shapes and propagate shapes through loops until
+  // reaching fixed point.
   for (int relax = 0; relax < 2; relax++) {
-    // We don't update Merge nodes with the input of NextIteration nodes on the
-    // merge pass. So we do that at the beginning of the relax pass instead.
-    if (relax) {
-      bool updated = false;
-      for (const Node* node : merge_nodes) {
-        TF_RETURN_IF_ERROR(
-            shape_refiner.UpdateNode(node, false /* relax */, &updated));
-      }
+    TopoQueue new_shapes;
+    // Force the propagation of shapes of Enter nodes manually (the Enter shape
+    // function always forwards an UnknownShape).
+    for (const Node* node : enter_nodes) {
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
     }
+    // Seed the propagation of shapes through merge nodes.
+    for (const Node* node : merge_nodes) {
+      TF_RETURN_IF_ERROR(
+          UpdateShapes(&refiner, relax, fed_ports, node, &new_shapes));
+    }
+    // Also seed the propagation of shapes in the fanout of fed nodes.
+    for (const Node* node : fed_nodes) {
+      TF_RETURN_IF_ERROR(
+          OverwriteFedPorts(&refiner, fed_ports, node, &new_shapes));
+    }
+    // Propagate shapes normally.
+    TF_RETURN_IF_ERROR(PropagateShapes(&refiner, relax, &new_shapes, resources,
+                                       fed_ports, num_loops));
+  }
 
-    bool done = true;
-    do {
-      if (relax) {
-        // Propagate shapes through any loops in the graph by relaxing.
-        for (const Node* node : merge_nodes) {
-          new_shapes.push(node);
-        }
-        TF_RETURN_IF_ERROR(PropagateShapes(&shape_refiner, relax, &new_shapes));
-      }
-
-      // If we found a resource, try to propagate the shapes through it.
-      new_shapes = std::queue<const Node*>();
-      for (const auto& resource_data : resources) {
-        const Node* qnode = resource_data.first;
-        // Proceed only if qnode is a queue or an Enter with queue input.
-        if (!IsQueue(*qnode) && !IsEnterWithQueue(*qnode)) {
-          continue;
-        }
-        auto qctx = shape_refiner.GetContext(qnode);
-        if (!qctx) {
-          continue;
-        }
-
-        // Check to see if the shape is fully defined.
-        auto* queue_handle_data = qctx->output_handle_shapes_and_types(0);
-        if (queue_handle_data != nullptr) {
-          bool fully_defined = true;
-          for (const auto& shape_and_type : *queue_handle_data) {
-            if (!qctx->FullyDefined(shape_and_type.shape) ||
-                shape_and_type.dtype == DT_INVALID) {
-              fully_defined = false;
-            }
-          }
-          // If we are merging, then we are done. If we are relaxing, then we
-          // could potentially propagate a less specific shape.
-          if (fully_defined && !relax) {
-            continue;
-          }
-        }
-
-        // Merge all inputs into the enqueue node, regardless of which phase we
-        // are in.
-        std::vector<ShapeAndType> queue_shapes_and_types;
-        for (const auto& node : resource_data.second) {
-          auto ctx = shape_refiner.GetContext(node);
-          if (!ctx) {
-            continue;
-          }
-          // TODO(bsteiner): handle EnqueueMany as well.
-          if (node->type_string().find("Enqueue") != std::string::npos &&
-              node->type_string().find("EnqueueMany") == std::string::npos) {
-            std::vector<ShapeAndType> shapes_and_types;
-            for (int i = 1; i < ctx->num_inputs(); ++i) {
-              shapes_and_types.push_back({ctx->input(i), node->input_type(i)});
-            }
-
-            if (queue_shapes_and_types.empty()) {
-              queue_shapes_and_types = shapes_and_types;
-            } else {
-              TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-                  shapes_and_types, qctx, &queue_shapes_and_types));
-            }
-          }
-        }
-        // Combine the input shapes with the existing output shape. We either
-        // merge or relax depending on which phase we are in.
-        if (queue_handle_data != nullptr) {
-          if (relax) {
-            TF_RETURN_IF_ERROR(RelaxEnqueueShapesAndMergeTypes(
-                *queue_handle_data, qctx, &queue_shapes_and_types));
-          } else {
-            TF_RETURN_IF_ERROR(MergeEnqueueShapesAndTypes(
-                *queue_handle_data, qctx, &queue_shapes_and_types));
-          }
-        }
-        // Set the output ShapeAndType handles. If we successfully update the
-        // resource node, add its fan-out to the queue.
-        const std::vector<ShapeAndType>* outputs =
-            qctx->output_handle_shapes_and_types(0);
-        std::vector<ShapeAndType> existing_outputs;
-        if (outputs) {
-          existing_outputs = *outputs;
-        }
-        if (!queue_shapes_and_types.empty()) {
-          if (!relax && qctx->MergeOutputHandleShapesAndTypes(
-                            0, queue_shapes_and_types)) {
-            new_shapes.push(qnode);
-          } else if (relax && qctx->RelaxOutputHandleShapesAndMergeTypes(
-                                  0, queue_shapes_and_types)) {
-            if (IsUpdatedShapesOrTypes(
-                    qctx, existing_outputs,
-                    *qctx->output_handle_shapes_and_types(0))) {
-              new_shapes.push(qnode);
-            }
-          }
-        }
+  // Track shapes globally across the graph.
+  SymbolicShapeManager shape_manager;
+  bool found_error = false;
+  for (const Node* const node : graph.nodes()) {
+    auto node_ctx = shape_refiner.GetContext(node);
+    if (!node_ctx) {
+      continue;
+    }
+    // Skip any information that comes from fed nodes.
+    if (fed_ports.find(node->name()) != fed_ports.end()) {
+      continue;
+    }
+    for (const auto& merged_shapes : node_ctx->MergedShapes()) {
+      if (!shape_manager.Merge(merged_shapes.first, merged_shapes.second)
+               .ok()) {
+        found_error = true;
+        break;
       }
-      // Propagate the shapes in the transitive fan-out of the queue.
-      done = new_shapes.empty();
-      if (!done) {
-        TF_RETURN_IF_ERROR(PropagateShapes(&shape_refiner, relax, &new_shapes));
+    }
+    for (const auto& merged_dims : node_ctx->MergedDims()) {
+      if (!shape_manager.Merge(merged_dims.first, merged_dims.second).ok()) {
+        found_error = true;
+        break;
       }
-    } while (!done);
+    }
+    if (found_error) {
+      // The shapes aren't consistent, we can't infer safely: discard all the
+      // information discovered so far.
+      shape_manager = SymbolicShapeManager();
+      break;
+    }
   }
 
   for (const Node* const node : graph.nodes()) {
-    VLOG(1) << "<Node> " << node->name();
+    VLOG(3) << "Filling in graph properties for node: " << node->name();
     auto ctx = shape_refiner.GetContext(node);
     if (!ctx) {
       continue;
@@ -411,10 +1049,13 @@ Status GraphProperties::InferStatically() {
 
       input_properties.resize(ctx->num_inputs());
       for (int i = 0; i < ctx->num_inputs(); ++i) {
-        FillTensorPropertiesFromContext(ctx->input(i), node->input_type(i), ctx,
-                                        &input_properties[i]);
+        shape_manager.AsTensorProperties(ctx->input(i), node->input_type(i),
+                                         &input_properties[i]);
       }
       for (const auto& edge : node->in_edges()) {
+        if (edge->IsControlEdge()) {
+          continue;
+        }
         if (!edge->src()->IsConstant()) {
           continue;
         }
@@ -438,12 +1079,16 @@ Status GraphProperties::InferStatically() {
 
       output_properties.resize(ctx->num_outputs());
       for (int i = 0; i < ctx->num_outputs(); ++i) {
-        FillTensorPropertiesFromContext(ctx->output(i), node->output_type(i),
-                                        ctx, &output_properties[i]);
+        shape_manager.AsTensorProperties(ctx->output(i), node->output_type(i),
+                                         &output_properties[i]);
       }
     }
   }
 
+  // Help trace the unknown dimensions to their origins.
+  VerboseLogUnknownDimensionSources(graph, input_properties_,
+                                    output_properties_);
+
   return Status::OK();
 }
 
@@ -458,7 +1103,7 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
   return InferFromCostGraph(metadata.cost_graph());
 }
 
-Status GraphProperties::AnnotateOutputShapes(GraphDef* output_graph_def) {
+Status GraphProperties::AnnotateOutputShapes(GraphDef* output_graph_def) const {
   *output_graph_def = item_.graph;
   for (int i = 0; i < output_graph_def->node_size(); i++) {
     auto node = output_graph_def->mutable_node(i);
@@ -473,6 +1118,9 @@ Status GraphProperties::AnnotateOutputShapes(GraphDef* output_graph_def) {
 }
 
 Status GraphProperties::InferFromCostGraph(const CostGraphDef& cost_graph) {
+  if (cost_graph.node_size() == 0) {
+    LOG(WARNING) << "cost_graph is empty: nothing can be inferred!";
+  }
   std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
   std::unordered_map<string, const NodeDef*> name_to_node;  // Empty
   for (auto& node : cost_graph.node()) {
@@ -531,20 +1179,5 @@ GraphProperties::GetOutputProperties(const string& node_name) const {
   return missing_properties_;
 }
 
-void GraphProperties::FillTensorPropertiesFromContext(
-    const ShapeHandle& shape, const DataType& type, InferenceContext* ctx,
-    OpInfo::TensorProperties* properties) {
-  properties->set_dtype(type);
-  if (!ctx->RankKnown(shape)) {
-    properties->mutable_shape()->set_unknown_rank(true);
-  } else {
-    for (int j = 0; j < ctx->Rank(shape); ++j) {
-      shape_inference::DimensionHandle dim = ctx->Dim(shape, j);
-      int64 d = ctx->Value(dim);
-      properties->mutable_shape()->add_dim()->set_size(d);
-    }
-  }
-}
-
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index 5649788be5bfcb94308e84f449b400de9bd76ca2..6fc53a7f2e7da7bae7b6f49c7b32291c981fef53 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -26,22 +26,38 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+class SymbolicShapeRefiner;
+class TopoQueue;
+
 // A TensorFlow model to optimize.
 // Models are represented by the combination of a graph, one of more fetch
 // nodes, and potentially a set of nodes to feed.
 class GraphProperties {
  public:
-  // Factory method for creating a GrapplerShapes from a MetaGraphDef.
-  // Returns nullptr if the given meta_graph cannot be converted.
   explicit GraphProperties(const GrapplerItem& item) : item_(item) {}
 
-  Status InferStatically();
+  // Infer the shapes through abstract interpretation. Feed information can be
+  // incorrect so it should be discarded to ensure correctness of the analysis.
+  // However, it can help infer shapes in the fanout of fed nodes (even though
+  // the correctness of these shapes can't be guaranteed), so in some cases
+  // (such as simulation or scheduling) it makes sense of keep these shapes.
+  Status InferStatically(bool assume_valid_feeds);
+  // Infer the shape by running the graph on the specified cluster and recording
+  // the shapes of the processed tensors.
   Status InferDynamically(Cluster* cluster);
+  // Extract the properties from a cost graph. For testing only since there is
+  // no way to ensure that the cost graph match the item.
   Status InferFromCostGraph(const CostGraphDef& cost_graph);
 
   // Stores `item_.graph` with the inferred output shapes to `output_graph_def`.
-  Status AnnotateOutputShapes(GraphDef* output_graph_def);
-
+  Status AnnotateOutputShapes(GraphDef* output_graph_def) const;
+
+  // Return the properties of node inputs/outputs, including data types and
+  // shapes. Note that the dimensions in the shapes can be negative. We use the
+  // -1 value to denote that we don't know anything about a dimension. We use
+  // values strictly less than -1 to encode symbolic dimensions: although we
+  // don't know the actual value of the symbolic dimension, we know that all the
+  // dimensions denoted by the same negative value are the equal.
   bool HasInputProperties(const string& name) const;
   bool HasOutputProperties(const string& name) const;
   const std::vector<OpInfo::TensorProperties>& GetInputProperties(
@@ -51,42 +67,64 @@ class GraphProperties {
 
   static void FillTensorPropertiesFromContext(
       const shape_inference::ShapeHandle&, const DataType&,
-      shape_inference::InferenceContext*, OpInfo::TensorProperties*);
+      shape_inference::InferenceContext*,
+      std::unordered_map<const shape_inference::Dimension*, int>* dim_ids,
+      OpInfo::TensorProperties*);
 
  private:
-  // Inputs
-  GrapplerItem item_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
-  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
-  const std::vector<OpInfo::TensorProperties> missing_properties_;
-
   // Merges shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
-  Status MergeEnqueueShapesAndTypes(
+  static Status MergeEnqueueShapesAndTypes(
+      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
-      shape_inference::InferenceContext* qctx,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
   // Relaxes shapes <shapes_and_types>, determined from an EnqueueV2 node, into
   // <*queue_shapes_and_types>.
-  Status RelaxEnqueueShapesAndMergeTypes(
+  static Status RelaxEnqueueShapesAndMergeTypes(
+      SymbolicShapeRefiner* shape_refiner, const Node* qnode,
       const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
-      shape_inference::InferenceContext* qctx,
       std::vector<shape_inference::ShapeAndType>* queue_shapes_and_types);
 
-  // This gives access to private function of InferenceContext.
-  static void Relax(shape_inference::InferenceContext* c,
-                    shape_inference::ShapeHandle s0,
-                    shape_inference::ShapeHandle s1,
-                    shape_inference::ShapeHandle* out);
-
-  // These give access to private functions of ShapeRefiner.
-  static bool SameDefinedShape(shape_inference::InferenceContext* c,
-                               shape_inference::ShapeHandle s0,
-                               shape_inference::ShapeHandle s1);
-  static bool IsUpdatedShapesOrTypes(
-      shape_inference::InferenceContext* c,
-      const std::vector<shape_inference::ShapeAndType>& existing,
-      const std::vector<shape_inference::ShapeAndType>& updated);
+  // Update the shapes for qnode. If output shapes of qnode have changed,
+  // enqueue its fanout in 'new_shapes'.
+  static Status UpdateResource(
+      const Node* qnode, const std::unordered_set<const Node*>& queue_inputs,
+      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes);
+
+  // Update the output shapes of a Merge node, and enqueue its fanout in
+  // new_shapes if needed.
+  static Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
+                                const Node* node, bool relax,
+                                TopoQueue* new_shapes);
+  // Process the Enter node, and enqueue its fanout in new_shapes if needed.
+  static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
+                            const Node* node, bool relax,
+                            TopoQueue* new_shapes);
+  // Process a node that is used to feed the model.
+  Status OverwriteFedPorts(
+      SymbolicShapeRefiner* shape_refiner,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const Node* node, TopoQueue* new_shapes) const;
+  // Update the shapes for node 'n'. If output shapes for n have changed,
+  // enqueue its fanout in 'new_shapes'.
+  Status UpdateShapes(
+      SymbolicShapeRefiner* shape_refiner, bool relax,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const Node* n, TopoQueue* new_shapes) const;
+  // Propagate the shapes for the nodes enqueued in new_shapes and their
+  // transitive fanout until a fixed point is reached.
+  Status PropagateShapes(
+      SymbolicShapeRefiner* shape_refiner, bool relax, TopoQueue* new_shapes,
+      const std::unordered_map<const Node*, std::unordered_set<const Node*>>&
+          resources,
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      int num_loops) const;
+
+  // Data members
+  GrapplerItem item_;
+  std::map<string, std::vector<OpInfo::TensorProperties>> input_properties_;
+  std::map<string, std::vector<OpInfo::TensorProperties>> output_properties_;
+  const std::vector<OpInfo::TensorProperties> missing_properties_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 134db5ec5a9ee11949c4ee6f869839e842089740..5f2ac0c652e601e88e6285358d959c2f6c6d59fe 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/inputs/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -42,7 +43,10 @@ class GraphPropertiesTest : public ::testing::Test {
     TF_CHECK_OK(cluster_->Provision());
   }
 
-  void TearDown() override { cluster_.reset(); }
+  void TearDown() override {
+    TF_CHECK_OK(cluster_->Shutdown());
+    cluster_.reset();
+  }
 
  protected:
   // Returns a string form of <p>, suitable for comparing type and shape.
@@ -54,7 +58,8 @@ class GraphPropertiesTest : public ::testing::Test {
     } else {
       strings::StrAppend(&s, "[");
       for (int i = 0; i < p.shape().dim_size(); ++i) {
-        strings::StrAppend(&s, i == 0 ? "" : ",", p.shape().dim(i).size());
+        strings::StrAppend(&s, i == 0 ? "" : ",",
+                           std::max<int64>(p.shape().dim(i).size(), -1));
       }
       strings::StrAppend(&s, "]");
     }
@@ -71,7 +76,7 @@ TEST_F(GraphPropertiesTest, StaticProperties) {
   CHECK(fake_input.NextItem(&item));
 
   GraphProperties properties(item);
-  Status s = properties.InferStatically();
+  Status s = properties.InferStatically(true);
   TF_CHECK_OK(s);
 
   for (const auto& node : item.graph.node()) {
@@ -177,7 +182,7 @@ TEST_F(GraphPropertiesTest, Variables) {
 
   {
     GraphProperties static_properties(item);
-    TF_CHECK_OK(static_properties.InferStatically());
+    TF_CHECK_OK(static_properties.InferStatically(false));
 
     const auto props = static_properties.GetOutputProperties("Var");
     EXPECT_EQ(1, props.size());
@@ -217,7 +222,7 @@ TEST_F(GraphPropertiesTest, VarHandles) {
                   .Finalize(item.graph.add_node()));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props = properties.GetOutputProperties("VarRead");
   EXPECT_EQ(1, props.size());
@@ -284,7 +289,7 @@ TEST_F(GraphPropertiesTest, Queues) {
   TF_CHECK_OK(root.ToGraphDef(&item.graph));
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props1 = properties.GetOutputProperties("Dequeue1");
   ASSERT_EQ(1, props1.size());
@@ -294,10 +299,9 @@ TEST_F(GraphPropertiesTest, Queues) {
   ASSERT_EQ(1, props2.size());
   EXPECT_EQ("float: [3,7]", PropToString(props2[0]));
 
-  // The dequeue3 op shape is unknown.
   const auto props3 = properties.GetOutputProperties("Dequeue3");
   ASSERT_EQ(1, props3.size());
-  EXPECT_EQ("float: ?", PropToString(props3[0]));
+  EXPECT_EQ("float: [3,7]", PropToString(props3[0]));
 
   // The dequeue3 op shape is unknown. The square2 op shape is known. Verify
   // that we merge the 2 properly to determine the shape of the data coming out
@@ -334,7 +338,7 @@ TEST_F(GraphPropertiesTest, MergeWithoutLoops) {
                                  "merge_without_loops.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"cond/Merge", "cond/concat", "cond/concat_1"};
   std::vector<string> expected_outputs{"float: [-1,-1,1]", "float: [2,1,1]",
@@ -361,7 +365,7 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
   /*
      with tf.Graph().as_default():
        i0 = tf.constant(0)
-       m0 = tf.ones([2, 2])
+       m0 = tf.placeholder([-1, 2])
        c = lambda i, m: i < 10
        b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
        r = tf.while_loop(
@@ -376,7 +380,7 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
                                  "while_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
                             "while/Exit_1"};
@@ -386,6 +390,14 @@ TEST_F(GraphPropertiesTest, WhileLoop) {
     EXPECT_EQ(DT_FLOAT, prop.dtype());
     EXPECT_EQ("float: [-1,2]", PropToString(prop));
   }
+
+  // The loop outputs batch dim should be different from the input batch dim
+  // since we concatenated along the batch dim.
+  auto shape_in = properties.GetOutputProperties("ones").at(0).shape();
+  auto shape_out = properties.GetOutputProperties("while/Exit_1").at(0).shape();
+  EXPECT_GE(-2, shape_in.dim(0).size());
+  EXPECT_GE(-2, shape_out.dim(0).size());
+  EXPECT_NE(shape_in.dim(0).size(), shape_out.dim(0).size());
 }
 
 TEST_F(GraphPropertiesTest, NestedLoop) {
@@ -426,7 +438,7 @@ TEST_F(GraphPropertiesTest, NestedLoop) {
                                  "nested_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -489,7 +501,7 @@ TEST_F(GraphPropertiesTest, LoopsAndQueues) {
                                  "loops_and_queues.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -547,7 +559,7 @@ TEST_F(GraphPropertiesTest, LoopsAndResourceVars) {
                                  "loops_and_resource_vars.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> outer_nodes{"while/Merge_1", "while/NextIteration_1",
                                   "while/Exit_1"};
@@ -599,7 +611,7 @@ TEST_F(GraphPropertiesTest, QueuesAndLoops) {
                                  "queues_and_loops.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   std::vector<string> nodes{"while/Merge_1", "while/NextIteration_1",
                             "while/Exit_1"};
@@ -648,7 +660,7 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape) {
   item.fetch.push_back("init_restore");
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto restore_props = properties.GetOutputProperties("restore");
   const OpInfo::TensorProperties& restore_prop = restore_props[0];
@@ -676,8 +688,8 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape) {
 
 TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output var =
-      ops::Variable(s.WithOpName("var"), TensorShape(), DataType::DT_FLOAT);
+  Output var = ops::Variable(s.WithOpName("var"), PartialTensorShape(),
+                             DataType::DT_FLOAT);
   Output var2 = ops::Variable(s.WithOpName("var2"), TensorShape({128, 256}),
                               DataType::DT_FLOAT);
   Output filename =
@@ -695,7 +707,7 @@ TEST_F(GraphPropertiesTest, InferRestoreOpShape_WithTwoNodesShareSameOutput) {
   item.fetch.push_back("init2");
 
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
 
   const auto props = properties.GetOutputProperties("restore");
   const OpInfo::TensorProperties& prop = props[0];
@@ -723,7 +735,7 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
-  TF_CHECK_OK(properties.InferStatically());
+  TF_CHECK_OK(properties.InferStatically(false));
   const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
   const OpInfo::TensorProperties& prop = props[0];
   EXPECT_EQ(DT_FLOAT, prop.dtype());
@@ -731,6 +743,182 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
   EXPECT_EQ(2, prop.shape().dim_size());
   EXPECT_EQ(1, prop.shape().dim(0).size());
   EXPECT_EQ(2, prop.shape().dim(1).size());
+
+  PartialTensorShape shape(prop.shape());
+  EXPECT_TRUE(shape.IsFullyDefined());
+  EXPECT_FALSE(shape.unknown_rank());
+}
+
+TEST_F(GraphPropertiesTest, SymbolicShapes) {
+  // Build a simple graph with placeholders of unknown dimensions. These
+  // dimensions will be encoded symbolically.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output b =
+      ops::Placeholder(s.WithOpName("b"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1})));
+  Output c = ops::Identity(s.WithOpName("c"), a);
+  Output d = ops::Identity(s.WithOpName("d"), b);
+  Output e = ops::Add(s.WithOpName("e"), c, d);
+  Output f = ops::Add(s.WithOpName("f"), a, c);
+
+  Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
+  Output g = ops::Shape(s.WithOpName("g"), c);
+  Output h = ops::Fill(s.WithOpName("h"), g, zero);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
+  const auto shape_c = properties.GetOutputProperties("c").at(0).shape();
+  EXPECT_EQ(2, shape_a.dim_size());
+  EXPECT_EQ(shape_a.dim_size(), shape_c.dim_size());
+  EXPECT_GE(-2, shape_a.dim(0).size());
+  EXPECT_EQ(shape_a.dim(0).size(), shape_c.dim(0).size());
+  EXPECT_GE(-2, shape_a.dim(1).size());
+  EXPECT_EQ(shape_a.dim(1).size(), shape_c.dim(1).size());
+
+  PartialTensorShape shape(shape_a);
+  EXPECT_FALSE(shape.IsFullyDefined());
+  EXPECT_FALSE(shape.unknown_rank());
+
+  const auto shape_b = properties.GetOutputProperties("b").at(0).shape();
+  const auto shape_d = properties.GetOutputProperties("d").at(0).shape();
+  EXPECT_EQ(1, shape_b.dim_size());
+  EXPECT_EQ(shape_b.dim_size(), shape_d.dim_size());
+  EXPECT_GE(-2, shape_b.dim(0).size());
+  EXPECT_NE(shape_a.dim(0).size(), shape_b.dim(0).size());
+  EXPECT_EQ(shape_b.dim(0).size(), shape_d.dim(0).size());
+
+  const auto shape_e = properties.GetOutputProperties("e").at(0).shape();
+  ASSERT_EQ(2, shape_e.dim_size());
+  EXPECT_EQ(shape_e.dim(0).size(), shape_c.dim(0).size());
+  EXPECT_NE(shape_e.dim(1).size(), shape_c.dim(1).size());
+  EXPECT_NE(shape_e.dim(0).size(), shape_d.dim(0).size());
+
+  const auto shape_f = properties.GetOutputProperties("f").at(0).shape();
+  ASSERT_EQ(2, shape_f.dim_size());
+  EXPECT_EQ(shape_f.dim(0).size(), shape_a.dim(0).size());
+  EXPECT_EQ(shape_f.dim(1).size(), shape_a.dim(1).size());
+
+  const auto shape_h = properties.GetOutputProperties("h").at(0).shape();
+  ASSERT_EQ(2, shape_f.dim_size());
+  EXPECT_EQ(shape_h.dim(0).size(), shape_c.dim(0).size());
+  EXPECT_EQ(shape_h.dim(1).size(), shape_c.dim(1).size());
+}
+
+TEST_F(GraphPropertiesTest, DoNotValidateColocationConstraints) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 1.0f, {1});
+  Output b = ops::Const(s.WithOpName("b"), 2.0f, {1});
+  Output c = ops::Const(s.WithOpName("c").ColocateWith(a), 3.0f, {1});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  // Create a graph with node a removed (say by some graph optimization
+  // pass), noting that node c is colocated with a. This is fine as it
+  // is in the late stage of graph execution, the colocation constraints have
+  // been validated previously and the device placement of nodes has completed.
+  GraphDef optimized_graph;
+  for (const auto& node : item.graph.node()) {
+    if (node.name() != "a") {
+      *optimized_graph.add_node() = node;
+    }
+  }
+  item.graph.Swap(&optimized_graph);
+  GraphProperties properties(item);
+  // This function should return OK, since it doesn't validate the colocation
+  // constraints internally.
+  TF_EXPECT_OK(properties.InferStatically(false));
+}
+
+TEST_F(GraphPropertiesTest, ShapeTracking) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output b =
+      ops::Placeholder(s.WithOpName("b"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1})));
+  Output zero = ops::Const(s.WithOpName("zero"), 0.0f, {});
+  auto shp = ops::ShapeN(s.WithOpName("shapes"), {a, b});
+  Output o1 = ops::Fill(s.WithOpName("o1"), shp[0], zero);
+  Output o2 = ops::Fill(s.WithOpName("o2"), shp[1], zero);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+  const auto shape_a = properties.GetOutputProperties("a").at(0).shape();
+  const auto shape_b = properties.GetOutputProperties("b").at(0).shape();
+  const auto shape_o1 = properties.GetOutputProperties("o1").at(0).shape();
+  const auto shape_o2 = properties.GetOutputProperties("o2").at(0).shape();
+  EXPECT_EQ(shape_a.DebugString(), shape_o1.DebugString());
+  EXPECT_EQ(shape_b.DebugString(), shape_o2.DebugString());
+}
+
+TEST_F(GraphPropertiesTest, FedNodes) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false,
+                                          cluster_->GetDeviceNames());
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  {
+    // Conservative shape analysis: the shape of fed ports should be unknown
+    GraphProperties properties(item);
+    Status s = properties.InferStatically(false);
+    TF_CHECK_OK(s);
+    for (const auto& node : item.graph.node()) {
+      if (node.op() == "Const") {
+        continue;
+      }
+      const auto in_props = properties.GetInputProperties(node.name());
+      EXPECT_EQ(1, in_props.size());
+      const OpInfo::TensorProperties& in_prop = in_props[0];
+      const auto out_props = properties.GetOutputProperties(node.name());
+      EXPECT_EQ(1, out_props.size());
+      const OpInfo::TensorProperties& out_prop = out_props[0];
+
+      if (node.name() == "x") {
+        // x is fed: its input should have a known shape, while its output
+        // doesn't
+        EXPECT_FALSE(in_prop.shape().unknown_rank());
+        EXPECT_EQ(1, in_prop.shape().dim_size());
+        EXPECT_EQ(2, in_prop.shape().dim(0).size());
+        EXPECT_TRUE(out_prop.shape().unknown_rank());
+      } else if (node.op() == "Square" || node.op() == "AddN") {
+        // These nodes are in the fanout of x: their shapes should be unknown.
+        EXPECT_TRUE(in_prop.shape().unknown_rank());
+        EXPECT_TRUE(out_prop.shape().unknown_rank());
+      }
+    }
+  }
+  {
+    // Optimistic shape analysis: the shape of fed ports should be derived from
+    // the shape of the fanin.
+    GraphProperties properties(item);
+    Status s = properties.InferStatically(true);
+    TF_CHECK_OK(s);
+    for (const auto& node : item.graph.node()) {
+      if (node.op() == "Square" || node.op() == "AddN") {
+        const auto in_props = properties.GetInputProperties(node.name());
+        EXPECT_EQ(1, in_props.size());
+        const OpInfo::TensorProperties& in_prop = in_props[0];
+        EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+        EXPECT_FALSE(in_prop.shape().unknown_rank());
+        EXPECT_EQ(2, in_prop.shape().dim_size());
+        const auto out_props = properties.GetOutputProperties(node.name());
+        EXPECT_EQ(1, out_props.size());
+        const OpInfo::TensorProperties& out_prop = out_props[0];
+        EXPECT_EQ(in_prop.DebugString(), out_prop.DebugString());
+      }
+    }
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt
index c11833bd1a73a6680b666027398cfd77f335aeff..fbc3659d9a796420f4be5a948b986d457e004cb4 100644
--- a/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop.pbtxt
@@ -21,7 +21,7 @@ node {
 }
 node {
   name: "ones"
-  op: "Const"
+  op: "PlaceholderV2"
   attr {
     key: "dtype"
     value {
@@ -29,19 +29,15 @@ node {
     }
   }
   attr {
-    key: "value"
+    key: "shape"
     value {
-      tensor {
-        dtype: DT_FLOAT
-        tensor_shape {
-          dim {
-            size: 2
-          }
-          dim {
-            size: 2
-          }
+      shape {
+        dim {
+          size: -1
+        }
+        dim {
+          size: 2
         }
-        float_val: 1.0
       }
     }
   }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index a2fa847df21a540e2bc0169c887d74dcb40a559d..6bc136a3f89c9a1dbfd4be15c143d4c893897494 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -25,11 +25,13 @@ namespace tensorflow {
 namespace grappler {
 
 constexpr int kOpsPerMac = 2;
+constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
+constexpr char kPlaceholder[] = "Placeholder";
 constexpr char kIdentity[] = "Identity";
 constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
@@ -98,7 +100,7 @@ TensorShapeProto MaybeGetMinimumShape(const TensorShapeProto& original_shape,
     }
   } else {
     for (int i = 0; i < shape.dim_size(); i++) {
-      if (shape.dim(i).size() == -1) {
+      if (shape.dim(i).size() < 0) {
         *found_unknown_shapes = true;
         VLOG(2) << "Use minimum dim size 1 because the shape is unknown.";
         // The size of each dimension is at least 1, if unknown.
@@ -159,6 +161,9 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
+      {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+
+      {kPlaceholder, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kRefIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kStopGradient, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -167,9 +172,10 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kReshape, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kRecv, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kSend, wrap(&OpLevelCostEstimator::PredictNoOp)},
+      {kConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kVariable, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kVariableV2, wrap(&OpLevelCostEstimator::PredictNoOp)},
-      {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
@@ -221,6 +227,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
                      Eigen::internal::scalar_square_op<float>>::Cost},
       {"Tanh", Eigen::internal::functor_traits<
                    Eigen::internal::scalar_tanh_op<float>>::Cost},
+      {"Relu", Eigen::internal::functor_traits<
+                   Eigen::internal::scalar_max_op<float>>::Cost},
       {"Sigmoid", Eigen::internal::functor_traits<
                       Eigen::internal::scalar_sigmoid_op<float>>::Cost},
       {"Sign", Eigen::internal::functor_traits<
@@ -283,8 +291,10 @@ Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
     if (elementwise_ops_.find(op_features.op()) != elementwise_ops_.end()) {
       return PredictCwiseOp(op_context);
     }
-    VLOG(1) << "Missing implementation for op: " << op_features.op();
-    return DummyExecutionTime(op_context);
+
+    VLOG(1) << "Missing accurate estimator for op: " << op_features.op();
+
+    return PredictCostOfAnUnknownOp(op_context);
   }
 
   std::function<Costs(const OpContext&)> estimator = it->second;
@@ -324,7 +334,8 @@ OpLevelCostEstimator::DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
       // Maxwell
       cores_per_multiprocessor = 128;
     } else {
-      // Pascal
+      // Pascal (compute capability version 6) and Volta (compute capability
+      // version 7)
       cores_per_multiprocessor = 64;
     }
     gflops = device.num_cores() * device.frequency() * 1e-3 *
@@ -365,19 +376,27 @@ Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
   }
 
   int op_cost = 1;
+  bool is_known_elementwise_op = false;
   auto it = elementwise_ops_.find(op_features.op());
   if (it != elementwise_ops_.end()) {
     op_cost = it->second;
+    is_known_elementwise_op = true;
+  } else {
+    LOG(WARNING) << "Not a cwise op: " << op_features.op();
   }
+
   Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_features);
-  costs.inaccurate = found_unknown_shapes;
+  if (found_unknown_shapes || !is_known_elementwise_op) {
+    costs.inaccurate = true;
+  }
   return costs;
 }
 
-Costs OpLevelCostEstimator::DummyExecutionTime(
+Costs OpLevelCostEstimator::PredictCostOfAnUnknownOp(
     const OpContext& op_context) const {
-  // Use CwiseOp time as an estimation
-  auto costs = PredictCwiseOp(op_context);
+  // Don't assume the operation is cwise, return cost based on input/output size
+  // and admit that it is inaccurate...
+  auto costs = PredictOpCountBasedCost(0, op_context.op_info);
   costs.inaccurate = true;
   return costs;
 }
@@ -390,11 +409,11 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
           << " Execution Time (ns):" << compute_cost.count();
 
   bool found_unknown_shapes = false;
-  double total_input_size =
+  const double total_input_size =
       CalculateInputSize(op_features, &found_unknown_shapes);
-  double total_output_size =
+  const double total_output_size =
       CalculateOutputSize(op_features, &found_unknown_shapes);
-  double total_io_size = total_input_size + total_output_size;
+  const double total_io_size = total_input_size + total_output_size;
 
   Costs::NanoSeconds memory_cost(
       std::ceil(total_io_size / device_perf.gb_per_sec));
@@ -509,7 +528,12 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
     bool* found_unknown_shapes) const {
   double ops = 0;
 
-  // first matrix
+  if (op_features.inputs_size() < 2) {
+    LOG(ERROR) << "Need 2 inputs but got " << op_features.inputs_size();
+    *found_unknown_shapes = true;
+    return 0;
+  }
+
   auto& a_matrix = op_features.inputs(0);
   auto& b_matrix = op_features.inputs(1);
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 3a8385dd732d1747eca690339e098d741f68effc..c6f23ee0aad3ff71b5ba8dc9017c5c1b3fd8605d 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -45,11 +45,11 @@ class OpLevelCostEstimator {
   // Returns basic device performance info.
   virtual DeviceInfo GetDeviceInfo(const DeviceProperties& device) const;
 
-  // For operations for which we haven't yet built estimates, returns a dummy
-  // value based on input size.
-  Costs DummyExecutionTime(const OpContext& op_context) const;
+  // Predict cost of an op for which no accurate estimator is defined.
+  Costs PredictCostOfAnUnknownOp(const OpContext& op_context) const;
 
-  // Naive cost estimate based on operations divided by device ops/sec.
+  // Naive cost estimate based on operations divided by device ops/sec,
+  // and input/output tensor sizes.
   Costs PredictOpCountBasedCost(double operations,
                                 const OpInfo& op_features) const;
 
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index f19be4a0ee53609fa0196405da4ecb8b94fa39e6..60fc783472d2b6a1d50eb52e912da1fccbe8cf08 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -167,8 +167,8 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
 TEST_F(OpLevelCostEstimatorTest, DummyExecutionTime) {
   auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
-  EXPECT_EQ(Costs::Duration(2200), cost.execution_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(2000), cost.execution_time);
   EXPECT_TRUE(cost.inaccurate);
 }
 
@@ -176,7 +176,7 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(true);
   auto cost = PredictCosts(DescribeOp("Dummy", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
-  EXPECT_EQ(Costs::Duration(200), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
   EXPECT_EQ(Costs::Duration(2000), cost.execution_time);  // max(2000, 200)
   EXPECT_TRUE(cost.inaccurate);
   SetComputeMemoryOverlap(false);  // Set it back to default.
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index d5625ae58f82000144da2ef0e95a0f36cb52cd03..1554aeb3c0737b6cb83c4d3807955092ea93b31e 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -43,6 +43,9 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  if (right.inaccurate) {
+    result.inaccurate = true;
+  }
   if (right.max_memory != kMemoryUnknown) {
     result.max_memory += right.max_memory;
   }
@@ -122,7 +125,7 @@ Status VirtualScheduler::Init() {
   // Construct graph properties.
   Status status;
   if (use_static_shapes_) {
-    status = graph_properties_.InferStatically();
+    status = graph_properties_.InferStatically(true);
   } else {
     status = graph_properties_.InferDynamically(cluster_);
   }
@@ -154,6 +157,16 @@ Status VirtualScheduler::Init() {
     name_to_node[node->name()] = node;
   }
 
+  // TODO(dyoon): Instead of identifying _Send node here manually, add _Send
+  // to _Recv as control dependency when creating GrapplerItem.
+  std::unordered_map<string, const NodeDef*> name_to_send;
+  for (const auto& node : graph.node()) {
+    if (node.op() == "_Send") {
+      const auto& attr = node.attr();
+      name_to_send[attr.at("tensor_name").s()] = &node;
+    }
+  }
+
   // To reuse _Recv ops.
   std::unordered_map<RecvNodeDescriptor, const NodeDef*, RecvNodeDescritorHash,
                      RecvNodeDescriptorEqual>
@@ -164,7 +177,17 @@ Status VirtualScheduler::Init() {
   for (const auto* curr_node : nodes) {
     auto& curr_node_state = GetNodeStateOrCreateIt(curr_node);
     const string curr_node_device = DeviceName(curr_node);
-    for (const string& input_node_name : curr_node->input()) {
+    std::vector<string> inputs;
+    if (IsRecv(*curr_node)) {
+      const auto& attr = curr_node->attr();
+      const NodeDef* send = name_to_send[attr.at("tensor_name").s()];
+      inputs = {send->name()};
+    } else {
+      for (const string& input : curr_node->input()) {
+        inputs.push_back(input);
+      }
+    }
+    for (const string& input_node_name : inputs) {
       // Note that input_node_name may be in <prefix><node_name>:<port_num>
       // format, where <prefix> (e.g., "^" for control dependency) and
       // ":<port_num>" may be omitted. NodeName() extracts only the node_name.
@@ -219,7 +242,7 @@ Status VirtualScheduler::Init() {
     // Default case: node without inputs are ready at time 0.
     const bool has_no_inputs = curr_node->input().empty();
 
-    if (given_as_feed || has_no_inputs) {
+    if (!IsRecv(*curr_node) && (given_as_feed || has_no_inputs)) {
       curr_node_state.time_ready = Costs::Duration();
       ready_nodes_->AddNode(curr_node);
       VLOG(3) << "Added ready node: " << curr_node->name();
@@ -254,7 +277,10 @@ void VirtualScheduler::MaybeUpdateInputOutput(const NodeDef* node) {
   // This method is called when NodeState is created and adds input and output
   // properties for a few exceptional cases that GraphProperties cannot provide
   // input/output properties.
-  if (IsSend(*node) || IsRecv(*node)) {
+  if ((IsSend(*node) || IsRecv(*node)) && node->attr().count(kAttrInputSrc)) {
+    // _Send and _Recv ops created from VirtualScheduler have kAttrInputSrc
+    // attr; normal _Send and _Recv ops (from the input graph) do not have that
+    // attr.
     auto& node_state = node_map_[node];
     auto& inputs = node_state.input_properties;
     auto& outputs = node_state.output_properties;
@@ -515,7 +541,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   string node_description = GetOpDescription(op_context.op_info);
   op_counts_[node_description] += 1;
   op_costs_[node_description] =
-      node_costs.execution_time.asMicroSeconds().count();
+      std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+                     !node_costs.inaccurate);
 
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
   op_cost = CombineCosts(op_cost, node_costs);
@@ -624,8 +651,10 @@ Costs VirtualScheduler::Summary() const {
   for (const auto& op_cost_pair : op_to_cost_) {
     const auto& op = op_cost_pair.first;
     const auto& cost = op_cost_pair.second.execution_time.count();
+    const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
-      VLOG(1) << " + " << op << " : " << cost;
+      VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
+              << cost;
     }
   }
 
@@ -654,10 +683,10 @@ Costs VirtualScheduler::Summary() const {
     critical_path_costs.estimated_max_memory_per_device[name] =
         max_memory_usage;
 
+    const Costs::NanoSeconds wall_time_ns = state.GetCurrTime();
     VLOG(1) << "Device = " << name
             << ", num_nodes = " << state.nodes_executed.size()
-            << ", execution_time = " << state.GetCurrTime().count()
-            << ", memory usage: "
+            << ", wall_time_ns = " << wall_time_ns.count() << ", memory usage: "
             << "persistent = "
             << strings::HumanReadableNumBytes(persistent_memory_usage)
             << ", peak = "
@@ -675,9 +704,17 @@ Costs VirtualScheduler::Summary() const {
       op_to_memory[node->op()] +=
           CalculateOutputSize(node_map_.at(node).output_properties, port);
     }
+    Costs::NanoSeconds total_compute_time_ns;
+    bool is_total_cost_accurate = true;
     for (const auto& op_cost_pair : state.op_to_cost) {
       const auto& op = op_cost_pair.first;
       const auto& cost = op_cost_pair.second.execution_time.count();
+      total_compute_time_ns += op_cost_pair.second.execution_time;
+      const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
+      if (!is_op_cost_accurate) {
+        is_total_cost_accurate = false;
+      }
+
       int64 op_mem_usage = 0;
       auto it = op_to_memory.find(op);
       if (it != op_to_memory.end()) {
@@ -689,12 +726,22 @@ Costs VirtualScheduler::Summary() const {
                                : 0.0;
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
-        VLOG(1) << " + " << op << " : " << cost << " ("
-                << strings::HumanReadableNumBytes(op_mem_usage) << " ["
-                << mem_usage_percent << "%] "
+        VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
+                << cost << " (" << strings::HumanReadableNumBytes(op_mem_usage)
+                << " [" << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
     }
+
+    int utilization = 0;
+    if (wall_time_ns.count() > 0) {
+      utilization = total_compute_time_ns.count() * 100 / wall_time_ns.count();
+    }
+    VLOG(1) << "Device = " << name << ", total_compute_time_ns = "
+            << (is_total_cost_accurate ? "" : "~")
+            << total_compute_time_ns.count()
+            << ", utilization = " << utilization << "%";
+
     if (critical_path_costs.execution_time <= state.GetCurrTime()) {
       critical_path_costs = state.device_costs;
     }
@@ -704,8 +751,11 @@ Costs VirtualScheduler::Summary() const {
     // Also log the op description and their corresponding counts.
     VLOG(2) << "Node description, counts, cost:";
     for (const auto& item : op_counts_) {
+      int cost;
+      bool is_cost_accurate;
+      std::tie(cost, is_cost_accurate) = op_costs_.at(item.first);
       VLOG(2) << "Node: " << item.first << ", Count: " << item.second
-              << ", Individual Cost: " << op_costs_.at(item.first);
+              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost;
     }
   }
 
@@ -718,8 +768,7 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
   if (metadata != nullptr) {
     StepStats* stepstats = metadata->mutable_step_stats();
     for (const auto& device : device_) {
-      GraphDef* device_partition_graph =
-          metadata->mutable_partition_graphs()->Add();
+      GraphDef* device_partition_graph = metadata->add_partition_graphs();
       DeviceStepStats* device_stepstats = stepstats->add_dev_stats();
       device_stepstats->set_device(device.first);
       for (const auto& node_def : device.second.nodes_executed) {
@@ -770,7 +819,7 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) {
         mem_stats->set_host_persistent_memory_size(host_persistent_memory_size);
         mem_stats->set_device_persistent_memory_size(
             device_persistent_memory_size);
-        *device_partition_graph->mutable_node()->Add() = *node_def;
+        *device_partition_graph->add_node() = *node_def;
       }
     }
   }
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index c9a032d5f867d380005b69c17c28c037c33aaa31..3018e3509a2aec96eb4be631e87a77443512d2d8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -138,7 +138,10 @@ class FIFOManager : public ReadyNodeManager {
   FIFOManager() : ReadyNodeManager() {}
   ~FIFOManager() override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
-  const NodeDef* GetCurrNode() override { return nodes_.front(); }
+  const NodeDef* GetCurrNode() override {
+    CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
+    return nodes_.front();
+  }
   void RemoveCurrNode() override { nodes_.pop_front(); }
   bool Empty() const override { return nodes_.empty(); }
 
@@ -156,18 +159,23 @@ class LIFOManager : public ReadyNodeManager {
   ~LIFOManager() override {}
   void AddNode(const NodeDef* node) override { nodes_.push_back(node); }
   const NodeDef* GetCurrNode() override {
-    curr_pos_ = nodes_.end();
-    curr_pos_--;
-    return nodes_.back();
+    CHECK(!nodes_.empty()) << "GetCurrNode(), but there's no ready node";
+    if (curr_pos_ == nodes_.end()) {
+      curr_pos_ = --(nodes_.rbegin().base());  // Last one in the list.
+    }
+    // Once curr_pos_ is set to a valid entry in the list, we keep using the
+    // cached curr_pos_ until RemoveCurrNode() is called. AddNode() will not
+    // change the GetCurrNode() return value.
+    return *curr_pos_;
   }
   void RemoveCurrNode() override {
-    if (curr_pos_ != nodes_.end()) {
-      nodes_.erase(curr_pos_);
-    } else if (!nodes_.empty()) {
-      nodes_.pop_back();
-    }
-    curr_pos_ = nodes_.end();
-    curr_pos_--;
+    // Make sure we have curr_pos_ ready to be removed.
+    GetCurrNode();
+    // Note curr_pos_ may not be pointing the last element if some nodes are
+    // added.
+    nodes_.erase(curr_pos_);
+
+    curr_pos_ = nodes_.end();  // Reset curr_pos_.
   }
   bool Empty() const override { return nodes_.empty(); }
 
@@ -322,7 +330,10 @@ class VirtualScheduler {
 
   // Stats:
   std::map<string, int> op_counts_;  // Op counts with key with input shape.
-  std::map<string, int> op_costs_;   // Individual op costs (with input shapes).
+  // Individual op costs (with input shapes).
+  // Boolean field for whether the cost is accurate.
+  std::map<string, std::pair<int, bool>> op_costs_;
+
   Costs graph_costs_;                // Graph cost.
   std::map<string, Costs> op_to_cost_;  // Per-op cost.
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index d291a0430885cf7ec5f5e6d8c7c1a782ab934149..412b494be730c21bf8b3d8bd791cc42dcbf15794 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -265,6 +265,127 @@ class VirtualSchedulerTest : public ::testing::Test {
     dependency_["z4"] = {"bn"};
   }
 
+  void CreateGrapplerItemWithSendRecv() {
+    const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+        }
+        float_val: 3.1415
+      }
+    }
+  }
+}
+node {
+  name: "Send"
+  op: "_Send"
+  input: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "client_terminated"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "test"
+    }
+  }
+}
+node {
+  name: "Recv"
+  op: "_Recv"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "client_terminated"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "recv_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device"
+    value {
+      s: "/job:localhost/replica:0/task:0/device:CPU:0"
+    }
+  }
+  attr {
+    key: "send_device_incarnation"
+    value {
+      i: 0
+    }
+  }
+  attr {
+    key: "tensor_name"
+    value {
+      s: "test"
+    }
+  }
+  attr {
+    key: "tensor_type"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 24
+}
+    )EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"Recv"};
+  }
+
   // A simple while loop
   void CreateGrapplerItemWithLoop() {
     // Test graph produced in python using:
@@ -743,6 +864,7 @@ versions {
     do {
       OpContext op_context = scheduler_->GetCurrNode();
       ops_executed[op_context.name] = op_context;
+      std::cout << op_context.name << std::endl;
 
       Costs node_costs = SimplePredictCosts(op_context);
 
@@ -816,6 +938,18 @@ versions {
     ExpectSetEq(expected, nodes_at_peak_mem_usage);
   }
 
+  // Helper method for checking nodes dependency.
+  void ValidateDependencyChain(
+      const std::unordered_map<string, int64>& start_times,
+      const std::vector<string>& nodes_in_dependency_order) {
+    int64 prev_node_time = -1;
+    for (const auto& node : nodes_in_dependency_order) {
+      int64 curr_node_time = start_times.at(node);
+      EXPECT_GE(curr_node_time, prev_node_time);
+      prev_node_time = curr_node_time;
+    }
+  }
+
   // Helper method for converting shape vector to TensorProperty.
   OpInfo::TensorProperties ShapeToTensorProperty(
       const std::vector<int> shape, const DataType& data_type) const {
@@ -911,11 +1045,15 @@ TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleFIFOManager) {
   manager.RemoveCurrNode();
   EXPECT_EQ("Node2", manager.GetCurrNode()->name());
   manager.AddNode(&node5_);
+  // GetCurrNode()  should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
   EXPECT_EQ("Node3", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
   EXPECT_EQ("Node4", manager.GetCurrNode()->name());
   manager.AddNode(&node6_);
+  EXPECT_EQ("Node4", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
   EXPECT_EQ("Node5", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
@@ -988,11 +1126,15 @@ TEST_F(VirtualSchedulerTest, AddAndRemoveMultipleLIFOManager) {
   manager.RemoveCurrNode();
   EXPECT_EQ("Node3", manager.GetCurrNode()->name());
   manager.AddNode(&node5_);
+  // GetCurrNode()  should return the same node even if some nodes are added,
+  // until RemoveCurrNode() is called.
+  EXPECT_EQ("Node3", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
   EXPECT_EQ("Node5", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
   EXPECT_EQ("Node2", manager.GetCurrNode()->name());
   manager.AddNode(&node6_);
+  EXPECT_EQ("Node2", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
   EXPECT_EQ("Node6", manager.GetCurrNode()->name());
   manager.RemoveCurrNode();
@@ -1059,7 +1201,7 @@ TEST_F(VirtualSchedulerTest, GetCurrNodeFirstReadyManager) {
   // should return it.
   EXPECT_EQ("Node6", manager.GetCurrNode()->name());
   // Now insret a few other nodes, but their time_ready's are even smaller than
-  // that of Node6. Befor calling RemoveCurrNode(), GetCurrNode() should return
+  // that of Node6. Before calling RemoveCurrNode(), GetCurrNode() should return
   // the same node, Node6, in this case.
 
   NodeDef node7;
@@ -1383,19 +1525,18 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   RunMetadata metadata;
   scheduler_->Summary(&metadata);
 
-  // Nodes in topological order (each node takes 1 usec) and possible start
-  // time usec:
-  // * const, ones: 0, 1 usec
-  // * while/Enter, while/Enter_1: 2, 3 usec
-  // * while/Merge, while/Merge_1: 4, 5 usec
-  // * while/Less/y: 6 usec
-  // * while/Less: 7 usec
-  // * while/LoopCond: 8 usec
-  // * while/Switch, while/Switch_1: 9, 10 usec
-  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1: 11 - 14 usec
-  // * while/add/y, while/concat/Axis: 15, 16 usec
-  // * while/add, while/concat: 17, 18 usec
-  // * while/NextIteration, while/NextIteration_1: 19, 20 usec
+  // Nodes in topological order:
+  // * const, ones
+  // * while/Enter, while/Enter_1
+  // * while/Merge, while/Merge_1
+  // * while/Less/y
+  // * while/Less
+  // * while/LoopCond
+  // * while/Switch, while/Switch_1
+  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1
+  // * while/add/y, while/concat/axis
+  // * while/add, while/concat
+  // * while/NextIteration, while/NextIteration_1
 
   int num_next_iteration = 0;
   int num_next_iteration_1 = 0;
@@ -1405,45 +1546,23 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   int64 next_iter_1_start_micro;
   int64 exit_start_micro;
   int64 exit_1_start_micro;
+
+  std::unordered_map<string, int64> start_times;
   for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
     for (const auto& stats : device_step_stats.node_stats()) {
-      std::cout << stats.DebugString() << std::endl;
-      // Start micro for while/Less/y, while/Less, and while/LoopCond are fixed
-      // regardless of scheduling method.
-      if (stats.node_name() == "while/Less/y") {
-        EXPECT_EQ(6, stats.all_start_micros());
-      } else if (stats.node_name() == "while/Less") {
-        EXPECT_EQ(7, stats.all_start_micros());
-      } else if (stats.node_name() == "while/LoopCond") {
-        EXPECT_EQ(8, stats.all_start_micros());
-      } else if (stats.node_name() == "while/NextIteration") {
+      start_times[stats.node_name()] = stats.all_start_micros();
+      if (stats.node_name() == "while/NextIteration") {
         ++num_next_iteration;
-        // Start time can be either 19 or 20 depending on how the scheduler
-        // picks a node among ready nodes.
         next_iter_start_micro = stats.all_start_micros();
-        EXPECT_LE(19, next_iter_start_micro);
-        EXPECT_GE(20, next_iter_start_micro);
       } else if (stats.node_name() == "while/NextIteration_1") {
         ++num_next_iteration_1;
-        // Start time can be either 19 or 20 depending on how the scheduler
-        // picks a node among ready nodes.
         next_iter_1_start_micro = stats.all_start_micros();
-        EXPECT_LE(19, next_iter_1_start_micro);
-        EXPECT_GE(20, next_iter_1_start_micro);
       } else if (stats.node_name() == "while/Exit") {
         ++num_exit;
-        // Start time can be between 11 and 14 (inclusive) depending on how
-        // the scheduler picks a node among ready nodes.
         exit_start_micro = stats.all_start_micros();
-        EXPECT_LE(11, exit_start_micro);
-        EXPECT_GE(14, exit_start_micro);
       } else if (stats.node_name() == "while/Exit_1") {
         ++num_exit_1;
-        // Start time can be between 11 and 14 (inclusive) depending on how
-        // the scheduler picks a node among ready nodes.
         exit_1_start_micro = stats.all_start_micros();
-        EXPECT_LE(11, exit_1_start_micro);
-        EXPECT_GE(14, exit_1_start_micro);
       }
     }
   }
@@ -1459,6 +1578,30 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   // different, so should be those of while/Exit and while/Exit_1.
   EXPECT_NE(next_iter_start_micro, next_iter_1_start_micro);
   EXPECT_NE(exit_start_micro, exit_1_start_micro);
+
+  // Check dependency among the nodes; no matter what scheduling mechanism we
+  // use, the scheduled ops should follow these depedency chains.
+  // Note that currently, VirtualScheduler executes while/Merge twice; hence,
+  // we're not testing dependency chains related to while/Merge.
+  // TODO(dyoon): after fixing while loop behavior correctly (run nodes in the
+  // order of Enter, Merge, ...loop condition ..., ... loop body ...,
+  // NextIteration, Merge, ... loop condition ..., Exit), re-enable dependency
+  // chaing test w/ Merge nodes.
+  ValidateDependencyChain(
+      start_times,
+      {"Const", "while/Enter",  // "while/Merge",
+       "while/Less/y", "while/Less", "while/LoopCond", "while/Switch",
+       "while/Identity", "while/add/y", "while/add", "while/NextIteration"});
+  // ValidateDependencyChain(start_times, {"while/Merge", "while/Less"});
+  ValidateDependencyChain(start_times,
+                          {"ones", "while/Enter_1",  // "while/Merge_1",
+                           "while/Switch_1", "while/Identity_1", "while/concat",
+                           "while/NextIteration_1"});
+  ValidateDependencyChain(start_times, {"while/Switch", "while/Exit"});
+  ValidateDependencyChain(
+      start_times, {"while/Identity", "while/concat/axis", "while/concat"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add"});
+  ValidateDependencyChain(start_times, {"while/Switch_1", "while/Exit_1"});
 }
 
 TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
@@ -1530,5 +1673,54 @@ TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
   EXPECT_EQ(get_output_size(recv_op_names[-1]), 4);
   EXPECT_EQ(get_output_size(send_op_names[-1]), 4);
 }
+
+TEST_F(VirtualSchedulerTest, GraphWithSendRecv) {
+  // Init.
+  CreateGrapplerItemWithSendRecv();
+  InitScheduler();
+
+  // Run the scheduler.
+  auto ops_executed = RunScheduler("");
+
+  EXPECT_GT(ops_executed.count("Const"), 0);
+  EXPECT_GT(ops_executed.count("Send"), 0);
+  EXPECT_GT(ops_executed.count("Recv"), 0);
+}
+
+TEST_F(VirtualSchedulerTest, GraphWithSendRecvDifferentDevice) {
+  // Init.
+  CreateGrapplerItemWithSendRecv();
+  // Change Recv node's device so that Send and Recv are placed on different
+  // devices.
+  auto& graph = grappler_item_->graph;
+  const string recv_device = kCPU1;
+  for (int i = 0; i < graph.node_size(); i++) {
+    auto* node = graph.mutable_node(i);
+    if (node->name() == "Recv") {
+      node->set_device(recv_device);
+      auto* attr = node->mutable_attr();
+      (*attr)["recv_device"].set_s(recv_device);
+    } else if (node->name() == "Send") {
+      auto* attr = node->mutable_attr();
+      (*attr)["recv_device"].set_s(recv_device);
+    }
+  }
+  InitScheduler();
+
+  // Run the scheduler.
+  auto ops_executed = RunScheduler("");
+
+  // Expect Const, Send, Recv, and VirtualScheduler created Send and Recv ops.
+  EXPECT_GT(ops_executed.count("Const"), 0);
+  EXPECT_GT(ops_executed.count("Send"), 0);
+  EXPECT_GT(ops_executed.count("Send_Send_0_from_/job_localhost/replica_0/"
+                               "task_0/cpu_0_to_/job_localhost"
+                               "/replica_0/task_0/cpu_1"),
+            0);
+  EXPECT_GT(ops_executed.count(
+                "Recv_Send_0_on_/job_localhost/replica_0/task_0/cpu_1"),
+            0);
+  EXPECT_GT(ops_executed.count("Recv"), 0);
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bf8a98a722a1bb87ecf9c3c625a16145d74f9b01
--- /dev/null
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -0,0 +1,122 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+GraphView::GraphView(GraphDef* graph) : graph_(graph) {
+  for (int i = 0; i < graph_->node_size(); i++) {
+    auto node = graph_->mutable_node(i);
+    auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+    // Check that the graph doesn't contain multiple nodes with the same name.
+    CHECK(rslt.second);
+  }
+  for (NodeDef& node : *graph_->mutable_node()) {
+    for (int i = 0; i < node.input_size(); ++i) {
+      OutputPort fanin;
+      string fanin_name = ParseNodeName(node.input(i), &fanin.port_id);
+      fanin.node = nodes_[fanin_name];
+
+      InputPort input;
+      input.node = &node;
+      if (fanin.port_id < 0) {
+        input.port_id = -1;
+      } else {
+        input.port_id = i;
+      }
+
+      fanouts_[fanin].insert(input);
+    }
+  }
+}
+
+NodeDef* GraphView::GetNode(const string& node_name) const {
+  auto it = nodes_.find(node_name);
+  if (it == nodes_.end()) {
+    return nullptr;
+  }
+  return it->second;
+}
+
+GraphView::InputPort GraphView::GetInputPort(const string& node_name,
+                                             int port_id) const {
+  InputPort result;
+  result.node = GetNode(node_name);
+  // TODO(bsteiner): verify that the node has at least port_id input ports
+  result.port_id = port_id;
+  return result;
+}
+
+GraphView::OutputPort GraphView::GetOutputPort(const string& node_name,
+                                               int port_id) const {
+  OutputPort result;
+  result.node = GetNode(node_name);
+  // TODO(bsteiner): verify that the node has at least port_id output ports
+  result.port_id = port_id;
+  return result;
+}
+
+const std::unordered_set<GraphView::InputPort, GraphView::HashPort>&
+GraphView::GetFanout(const GraphView::OutputPort& port) const {
+  auto it = fanouts_.find(port);
+  if (it == fanouts_.end()) {
+    return empty_set_;
+  }
+  return it->second;
+}
+
+const std::unordered_set<GraphView::OutputPort, GraphView::HashPort>
+GraphView::GetFanin(const GraphView::InputPort& port) const {
+  std::unordered_set<GraphView::OutputPort, GraphView::HashPort> result;
+  if (port.port_id >= 0) {
+    result.insert(GetRegularFanin(port));
+  } else {
+    for (int i = port.node->input_size() - 1; i >= 0; --i) {
+      OutputPort fanin;
+      string fanin_name = ParseNodeName(port.node->input(i), &fanin.port_id);
+      if (fanin.port_id < 0) {
+        auto it = nodes_.find(fanin_name);
+        if (it != nodes_.end()) {
+          fanin.node = it->second;
+          result.insert(fanin);
+        }
+      } else {
+        break;
+      }
+    }
+  }
+  return result;
+}
+
+const GraphView::OutputPort GraphView::GetRegularFanin(
+    const GraphView::InputPort& port) const {
+  CHECK_LE(0, port.port_id);
+  OutputPort fanin;
+  string fanin_name =
+      ParseNodeName(port.node->input(port.port_id), &fanin.port_id);
+  auto it = nodes_.find(fanin_name);
+  if (it == nodes_.end()) {
+    fanin.node = nullptr;
+  } else {
+    fanin.node = it->second;
+  }
+  return fanin;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..a24310ad1a40b7e84e2fa67686c1bf0575ac5881
--- /dev/null
+++ b/tensorflow/core/grappler/graph_view.h
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_GRAPPLER_GRAPH_VIEW_H_
+#define TENSORFLOW_GRAPPLER_GRAPH_VIEW_H_
+
+#include <unordered_map>
+#include <unordered_set>
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A utility class to simplify the traversal of a GraphDef.
+class GraphView {
+ public:
+  struct Port {
+    NodeDef* node;
+    int port_id;
+
+    bool operator==(const Port& other) const {
+      return node == other.node && port_id == other.port_id;
+    }
+  };
+  struct InputPort : public Port {};
+  struct OutputPort : public Port {};
+
+  struct HashPort {
+    std::size_t operator()(const Port& port) const {
+      return reinterpret_cast<std::size_t>(port.node) + port.port_id;
+    }
+  };
+
+  explicit GraphView(GraphDef* graph);
+  NodeDef* GetNode(const string& node_name) const;
+  // Get the specified input port. Note that the special '-1' port_id can be
+  // used to access the controlling nodes (i.e. the nodes connected to node_name
+  // through an incoming control dependency).
+  InputPort GetInputPort(const string& node_name, int port_id) const;
+  // Get the specified input port. Note that the special '-1' port_id can be
+  // used to access the controlled nodes (i.e. the nodes connected to node_name
+  // through an outgoing control dependency).
+
+  // Special case: regular (i.e. non-control) ports can only have one fanin.
+  OutputPort GetOutputPort(const string& node_name, int port_id) const;
+
+  const std::unordered_set<InputPort, HashPort>& GetFanout(
+      const OutputPort& port) const;
+  const std::unordered_set<OutputPort, HashPort> GetFanin(
+      const InputPort& port) const;
+  const OutputPort GetRegularFanin(const InputPort& port) const;
+
+ private:
+  GraphDef* graph_;
+  std::unordered_map<string, NodeDef*> nodes_;
+  std::unordered_set<InputPort, HashPort> empty_set_;
+  std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
+                     HashPort>
+      fanouts_;
+  std::unordered_map<NodeDef*, std::unordered_set<NodeDef*>> controlled_nodes_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_GRAPPLER_GRAPH_VIEW_H_
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..15bed07d017a18d53973da012e7add4085380a74
--- /dev/null
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class GraphViewTest : public ::testing::Test {};
+
+TEST_F(GraphViewTest, BasicGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 2, 2, false, {"/CPU:0", "/GPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  GraphView graph(&item.graph);
+
+  GraphView::InputPort input = graph.GetInputPort("AddN", 0);
+  EXPECT_EQ("AddN", input.node->name());
+  EXPECT_EQ(0, input.port_id);
+  GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+  EXPECT_EQ("Square", fanin.node->name());
+  EXPECT_EQ(0, fanin.port_id);
+
+  input = graph.GetInputPort("AddN", 1);
+  EXPECT_EQ("AddN", input.node->name());
+  EXPECT_EQ(1, input.port_id);
+  fanin = graph.GetRegularFanin(input);
+  EXPECT_EQ("Square_1", fanin.node->name());
+  EXPECT_EQ(0, fanin.port_id);
+
+  GraphView::OutputPort output = graph.GetOutputPort("AddN", 0);
+  EXPECT_EQ("AddN", output.node->name());
+  EXPECT_EQ(0, output.port_id);
+  EXPECT_EQ(2, graph.GetFanout(output).size());
+  for (auto fanout : graph.GetFanout(output)) {
+    if (fanout.node->name() == "AddN_2" || fanout.node->name() == "AddN_3") {
+      EXPECT_EQ(0, fanout.port_id);
+    } else {
+      // Invalid fanout
+      EXPECT_FALSE(true);
+    }
+  }
+}
+
+TEST_F(GraphViewTest, ControlDependencies) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Square(s.WithOpName("b"), {a});
+  Output c = ops::Sqrt(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(a), {b, c});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  GraphView::OutputPort output = graph.GetOutputPort("a", -1);
+  EXPECT_EQ("a", output.node->name());
+  EXPECT_EQ(-1, output.port_id);
+  auto fanout = graph.GetFanout(output);
+  EXPECT_EQ(1, fanout.size());
+  EXPECT_EQ("d", (*fanout.begin()).node->name());
+  EXPECT_EQ(-1, (*fanout.begin()).port_id);
+
+  output = graph.GetOutputPort("a", 0);
+  EXPECT_EQ("a", output.node->name());
+  EXPECT_EQ(0, output.port_id);
+  fanout = graph.GetFanout(output);
+  EXPECT_EQ(1, fanout.size());
+  EXPECT_EQ("b", (*fanout.begin()).node->name());
+  EXPECT_EQ(0, (*fanout.begin()).port_id);
+
+  GraphView::InputPort input = graph.GetInputPort("d", -1);
+  EXPECT_EQ("d", input.node->name());
+  EXPECT_EQ(-1, input.port_id);
+  auto fanin = graph.GetFanin(input);
+  EXPECT_EQ(1, fanin.size());
+  EXPECT_EQ("a", (*fanin.begin()).node->name());
+  EXPECT_EQ(-1, (*fanin.begin()).port_id);
+
+  input = graph.GetInputPort("d", 0);
+  EXPECT_EQ("d", input.node->name());
+  EXPECT_EQ(0, input.port_id);
+  fanin = graph.GetFanin(input);
+  EXPECT_EQ(1, fanin.size());
+  EXPECT_EQ("b", (*fanin.begin()).node->name());
+  EXPECT_EQ(0, (*fanin.begin()).port_id);
+
+  input = graph.GetInputPort("d", 1);
+  EXPECT_EQ("d", input.node->name());
+  EXPECT_EQ(1, input.port_id);
+  fanin = graph.GetFanin(input);
+  EXPECT_EQ(1, fanin.size());
+  EXPECT_EQ("c", (*fanin.begin()).node->name());
+  EXPECT_EQ(0, (*fanin.begin()).port_id);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 94412eb1980d63f6193bc8ffb513db10ffdb5fac..149f6fc7353b3c96e9d780c20697873c15bccaa8 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -71,9 +72,11 @@ std::vector<const NodeDef*> GrapplerItem::MainVariables() const {
 std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
   std::unordered_set<string> result;
   for (const string& f : fetch) {
+    VLOG(1) << "Add fetch " << f;
     result.insert(NodeName(f));
   }
   for (const auto& f : feed) {
+    VLOG(1) << "Add feed " << f.first;
     result.insert(NodeName(f.first));
   }
   for (const auto& node : init_ops) {
@@ -117,8 +120,13 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
     bool* ill_formed) {
   *ill_formed = false;
   std::unordered_map<string, const NodeDef*> name_to_node;
+  std::unordered_map<string, const NodeDef*> name_to_send;
   for (const auto& node : graph.node()) {
     name_to_node[node.name()] = &node;
+    if (node.op() == "_Send") {
+      const auto& attr = node.attr();
+      name_to_send[attr.at("tensor_name").s()] = &node;
+    }
   }
 
   std::vector<const NodeDef*> queue;
@@ -150,6 +158,15 @@ std::vector<const NodeDef*> ComputeTransitiveFanin(
       }
       queue.push_back(in);
     }
+    if (node->op() == "_Recv") {
+      const auto& attr = node->attr();
+      const NodeDef* send = name_to_send[attr.at("tensor_name").s()];
+      if (send) {
+        queue.push_back(send);
+      }
+      // Subgraph after partitioning may have either _Send or _Recv, not both.
+      // So, we do not set ill_formed for missing _Send.
+    }
   }
   return result;
 }
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 3f6183b6f1ecb92dcc99abccacda74ceaf72cce0..ca3c1a666726065eba2f5cae88c2a77e6cc5594e 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -126,9 +126,6 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   graph_ctor_opts.allow_internal_ops = true;
   graph_ctor_opts.expect_device_spec = false;
   std::unique_ptr<Graph> graphptr(new Graph(function_library));
-  // Populate default attrs to the NodeDefs in the GraphDef.
-  TF_RETURN_IF_ERROR(
-      AddDefaultAttrsToGraphDef(&graph_def, *graphptr->op_registry(), 0));
 
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, graphptr.get()));
@@ -138,7 +135,10 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   optimizer.Optimize(flr, env, devices[0], &graphptr, /*shape_map=*/nullptr);
   graphptr->ToGraphDef(output_graph_def);
 
-  return Status::OK();
+  // The default values of attributes might have been stripped by the optimizer.
+  // Add them back.
+  return AddDefaultAttrsToGraphDef(output_graph_def, *graphptr->op_registry(),
+                                   0);
 }
 
 // Applies the same graph pruning logic to the graph as Session.Run in TF.
@@ -173,7 +173,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
                  << ", skipping this input.";
       return nullptr;
     }
-    LOG(INFO) << "Will use feed node " << feed_name;
+    VLOG(1) << "Will use feed node " << feed_name;
     new_item->feed.emplace_back(feed_name, Tensor());
   }
 
@@ -188,7 +188,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
                      << ", skipping this input";
           return nullptr;
         }
-        LOG(INFO) << "Will use fetch node " << name;
+        VLOG(1) << "Will use fetch node " << name;
         new_item->fetch.push_back(name);
       }
     }
@@ -297,7 +297,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
   }
 
   for (auto& node : *new_item->graph.mutable_node()) {
-    if (IsPlaceholder(node)) {
+    if (IsPlaceholder(node) && node.op() != "PlaceholderWithDefault") {
       if (node.attr().count("dtype") == 0) {
         LOG(ERROR) << "Unknown type for placeholder " << node.name()
                    << ", skipping this input";
@@ -449,6 +449,15 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
     new_item->save_restore_loc_tensor = saver.filename_tensor_name();
   }
 
+  // Instantiate all the missing attributes with their default values.
+  Status attr_status =
+      AddDefaultAttrsToGraphDef(&new_item->graph, *OpRegistry::Global(), 0);
+  if (!attr_status.ok()) {
+    LOG(ERROR) << "Failed to instantiate default attribute values: "
+               << attr_status.error_message();
+    return nullptr;
+  }
+
   // Optimize the graph (function inlining, l1 optimizations, etc).
   VLOG(1) << "Number of nodes in graph before OptimizeGraph: "
           << new_item->graph.node_size();
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index acb849814217b97600a9cfc6b730838e0733f86b..ac94c3f81e8d1906bb844841034984e9d38f283f 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -13,24 +13,63 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <unordered_set>
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-bool IsAddN(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "AddN";
+bool IsAdd(const NodeDef& node) {
+  return node.op() == "Add" || node.op() == "AddV2";
+}
+
+bool IsAddN(const NodeDef& node) { return node.op() == "AddN"; }
+
+bool IsAnyDiv(const NodeDef& node) {
+  return node.op() == "RealDiv" || node.op() == "Div" ||
+         node.op() == "FloorDiv" || node.op() == "TruncateDiv";
+}
+
+bool IsAvgPoolGrad(const NodeDef& node) { return node.op() == "AvgPoolGrad"; }
+
+bool IsAssert(const NodeDef& node) { return node.op() == "Assert"; }
+
+bool IsBiasAdd(const NodeDef& node) {
+  return node.op() == "BiasAdd" || node.op() == "BiasAddV1";
+}
+
+bool IsBiasAddGrad(const NodeDef& node) { return node.op() == "BiasAddGrad"; }
+
+bool IsConcatOffset(const NodeDef& node) { return node.op() == "ConcatOffset"; }
+
+bool IsConstant(const NodeDef& node) { return node.op() == "Const"; }
+
+bool IsConv2D(const NodeDef& node) { return node.op() == "Conv2D"; }
+
+bool IsConv2DBackpropFilter(const NodeDef& node) {
+  return node.op() == "Conv2DBackpropFilter";
 }
 
-bool IsConcat(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "Concat" || op == "ConcatV2";
+bool IsConv2DBackpropInput(const NodeDef& node) {
+  return node.op() == "Conv2DBackpropInput";
 }
 
-bool IsConstant(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "Const";
+bool IsDepthwiseConv2dNative(const NodeDef& node) {
+  return node.op() == "DepthwiseConv2dNative";
+}
+
+bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node) {
+  return node.op() == "DepthwiseConv2dNativeBackpropFilter";
+}
+
+bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node) {
+  return node.op() == "DepthwiseConv2dNativeBackpropInput";
 }
 
 bool IsDequeueOp(const NodeDef& node) {
@@ -40,6 +79,8 @@ bool IsDequeueOp(const NodeDef& node) {
          op == "QueueDequeueUpToV2" || op == "QueueDequeueUpTo";
 }
 
+bool IsDiv(const NodeDef& node) { return node.op() == "Div"; }
+
 bool IsEnter(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Enter" || op == "RefEnter";
@@ -50,36 +91,50 @@ bool IsExit(const NodeDef& node) {
   return op == "Exit" || op == "RefExit";
 }
 
+bool IsFloorMod(const NodeDef& node) { return node.op() == "FloorMod"; }
+
+bool IsFusedBatchNormGradV1(const NodeDef& node) {
+  return node.op() == "FusedBatchNormGrad";
+}
+
 bool IsIdentity(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Identity" || op == "RefIdentity";
 }
 
+bool IsMatMul(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MatMul" || op == "BatchMatMul" || op == "QuantizedMatMul" ||
+         op == "SparseMatMul";
+}
+
 bool IsMerge(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Merge" || op == "RefMerge";
 }
 
-bool IsNoOp(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "NoOp";
-}
+bool IsMul(const NodeDef& node) { return node.op() == "Mul"; }
+
+bool IsNoOp(const NodeDef& node) { return node.op() == "NoOp"; }
 
 bool IsNextIteration(const NodeDef& node) {
   const auto& op = node.op();
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
+bool IsPad(const NodeDef& node) { return node.op() == "Pad"; }
+
 bool IsPlaceholder(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Placeholder" || op == "PlaceholderV2" ||
          op == "PlaceholderWithDefault";
 }
 
-bool IsRecv(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "_Recv";
-}
+bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
+
+bool IsReluGrad(const NodeDef& node) { return node.op() == "ReluGrad"; }
+
+bool IsRecv(const NodeDef& node) { return node.op() == "_Recv"; }
 
 bool IsReduction(const NodeDef& node) {
   const auto& op = node.op();
@@ -94,31 +149,108 @@ bool IsRestore(const NodeDef& node) {
           node.op() == "RestoreSlice");
 }
 
-bool IsSend(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "_Send";
+bool IsSend(const NodeDef& node) { return node.op() == "_Send"; }
+
+bool IsSlice(const NodeDef& node) { return node.op() == "Slice"; }
+
+bool IsSplit(const NodeDef& node) { return node.op() == "Split"; }
+
+bool IsSquaredDifference(const NodeDef& node) {
+  return node.op() == "SquaredDifference";
 }
 
+bool IsSqueeze(const NodeDef& node) { return node.op() == "Squeeze"; }
+
 bool IsStopGradient(const NodeDef& node) {
   const auto& op = node.op();
   return op == "StopGradient" || op == "PreventGradient";
 }
 
+bool IsSub(const NodeDef& node) { return node.op() == "Sub"; }
+
+bool IsSum(const NodeDef& node) { return node.op() == "Sum"; }
+
 bool IsSwitch(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Switch" || op == "RefSwitch";
 }
 
-bool IsTranspose(const NodeDef& node) {
-  const auto op = node.op();
-  return op == "Transpose";
-}
+bool IsTranspose(const NodeDef& node) { return node.op() == "Transpose"; }
 
 bool IsVariable(const NodeDef& node) {
-  const auto op = node.op();
+  const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
          op == "VarHandleOp" || op == "ReadVariableOp";
 }
 
-}  // end namespace grappler
+namespace {
+bool GetBoolAttr(const NodeDef& node, const string& name) {
+  return node.attr().count(name) > 0 && node.attr().at(name).b();
+}
+}  // namespace
+
+bool IsFreeOfSideEffect(const NodeDef& node) {
+  // Placeholders must be preserved to keep the graph feedable.
+  if (IsPlaceholder(node)) {
+    return false;
+  }
+  const OpDef* op_def = nullptr;
+  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+  if (!status.ok()) {
+    return false;
+  }
+  if (op_def->is_stateful()) {
+    return false;
+  }
+  // Nodes such as Assign or AssignAdd modify one of their inputs.
+  for (const auto& input : op_def->input_arg()) {
+    if (input.is_ref()) {
+      return false;
+    }
+  }
+  // Some nodes do in-place updates on regular tensor inputs.
+  if (GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace")) {
+    return false;
+  }
+  return true;
+}
+
+bool ModifiesFrameInfo(const NodeDef& node) {
+  return IsEnter(node) || IsExit(node) || IsNextIteration(node);
+}
+
+#define OPDEF_PROPERTY_HELPER(PROPERTY_CAP, PROPERTY)                      \
+  bool Is##PROPERTY_CAP(const NodeDef& node) {                             \
+    if (node.op() == "Add") {                                              \
+      /* Workaround for "Add" not being marked is_commutative and */       \
+      /* is_aggregate. (See cl/173915048). */                              \
+      const auto type = GetDataTypeFromAttr(node, "T");                    \
+      return type != DT_INVALID && type != DT_STRING;                      \
+    }                                                                      \
+    const OpDef* op_def = nullptr;                                         \
+    Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def); \
+    return status.ok() && op_def->is_##PROPERTY();                         \
+  }
+
+OPDEF_PROPERTY_HELPER(Aggregate, aggregate)
+OPDEF_PROPERTY_HELPER(Commutative, commutative)
+
+bool IsInvolution(const NodeDef& node) {
+  const std::unordered_set<string> involution_ops{
+      "Conj", "Reciprocal", "Invert", "Neg", "LogicalNot"};
+  return involution_ops.count(node.op()) > 0;
+}
+
+bool IsValuePreserving(const NodeDef& node) {
+  if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
+    return true;
+  }
+  const std::unordered_set<string> value_preserving_ops{
+      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
+      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
+      "ExpandDims", "Squeeze"};
+  return value_preserving_ops.count(node.op()) > 0;
+}
+
+}  // namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 0de954fcb45366e40bb89d3704fec496cd514b41..b8031e011cf8e77ef635dd4685459c7997e28e1c 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -17,31 +17,78 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OP_TYPES_H_
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
+bool IsAdd(const NodeDef& node);
 bool IsAddN(const NodeDef& node);
-bool IsConcat(const NodeDef& node);
+bool IsAnyDiv(const NodeDef& node);
+bool IsAvgPoolGrad(const NodeDef& node);
+bool IsAssert(const NodeDef& node);
+bool IsBiasAdd(const NodeDef& node);
+bool IsBiasAddGrad(const NodeDef& node);
+bool IsConcatOffset(const NodeDef& node);
 bool IsConstant(const NodeDef& node);
+bool IsConv2D(const NodeDef& node);
+bool IsConv2DBackpropFilter(const NodeDef& node);
+bool IsConv2DBackpropInput(const NodeDef& node);
+bool IsDepthwiseConv2dNative(const NodeDef& node);
+bool IsDepthwiseConv2dNativeBackpropFilter(const NodeDef& node);
+bool IsDepthwiseConv2dNativeBackpropInput(const NodeDef& node);
 bool IsDequeueOp(const NodeDef& node);
+bool IsDiv(const NodeDef& node);
 bool IsEnter(const NodeDef& node);
 bool IsExit(const NodeDef& node);
+bool IsFloorMod(const NodeDef& node);
+bool IsFusedBatchNormGradV1(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
 bool IsMerge(const NodeDef& node);
+bool IsMul(const NodeDef& node);
+bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
+bool IsPad(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
+bool IsRealDiv(const NodeDef& node);
+bool IsReluGrad(const NodeDef& node);
 bool IsRecv(const NodeDef& node);
 bool IsReduction(const NodeDef& node);
 bool IsReshape(const NodeDef& node);
 bool IsRestore(const NodeDef& node);
 bool IsSend(const NodeDef& node);
+bool IsSlice(const NodeDef& node);
+bool IsSplit(const NodeDef& node);
+bool IsSquaredDifference(const NodeDef& node);
+bool IsSqueeze(const NodeDef& node);
 bool IsStopGradient(const NodeDef& node);
+bool IsSub(const NodeDef& node);
+bool IsSum(const NodeDef& node);
 bool IsSwitch(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 
+// Return true if the op is an aggregation (e.g. Add, AddN).
+// Returns false if it could not be determined to be so.
+bool IsAggregate(const NodeDef& node);
+
+// Return true if the op is commutative (e.g. Mul, Add).
+// Returns false if it could not be determined to be so.
+bool IsCommutative(const NodeDef& node);
+
+bool IsFreeOfSideEffect(const NodeDef& node);
+bool ModifiesFrameInfo(const NodeDef& node);
+
+// Returns true if the op is an element-wise involution, i.e. if it is its
+// own inverse such that f(f(x)) == x.
+bool IsInvolution(const NodeDef& node);
+
+// Returns true if the op in node only rearranges the order of elements in its
+// first input tensor and possible changes its shape. More precisely, this
+// function returns true if the op commutes with all element-wise operations.
+bool IsValuePreserving(const NodeDef& node);
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 74030908fe7d1d4e6a1c4f9e47a5404ce68651ab..e557adc2111608f6a77f292275200a8b5dfab9ec 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -96,6 +96,7 @@ cc_library(
         ":graph_optimizer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
@@ -112,6 +113,7 @@ tf_cc_test(
     deps = [
         ":constant_folding",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
@@ -161,6 +163,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":constant_folding",
         ":graph_optimizer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -170,6 +173,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:frame",
     ],
 )
 
@@ -191,6 +195,47 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "dependency_optimizer",
+    srcs = ["dependency_optimizer.cc"],
+    hdrs = [
+        "dependency_optimizer.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":constant_folding",
+        ":graph_optimizer",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:topological_sort",
+    ],
+)
+
+tf_cc_test(
+    name = "dependency_optimizer_test",
+    size = "small",
+    srcs = ["dependency_optimizer_test.cc"],
+    deps = [
+        ":constant_folding",
+        ":dependency_optimizer",
+        ":model_pruner",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:topological_sort",
+    ],
+)
+
 cc_library(
     name = "model_pruner",
     srcs = ["model_pruner.cc"],
@@ -235,9 +280,11 @@ cc_library(
         ":graph_rewriter",
         ":static_schedule",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
@@ -268,6 +315,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:devices",
@@ -276,6 +324,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/costs:virtual_placer",
         "//tensorflow/core/grappler/utils:frame",
     ],
 )
@@ -286,12 +335,18 @@ tf_cc_test(
     deps = [
         ":layout_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
     ],
 )
 
@@ -306,6 +361,7 @@ cc_library(
         ":arithmetic_optimizer",
         ":auto_parallel",
         ":constant_folding",
+        ":dependency_optimizer",
         ":graph_optimizer",
         ":layout_optimizer",
         ":memory_optimizer",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 78b55237d1e665a296e945dafe0454afe722632e..d6bc8614f91af85229c2ebb8b7040a218c594c81 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -14,31 +14,78 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+
+#include <algorithm>
+#include <limits>
 #include <unordered_map>
 #include <unordered_set>
+#include <vector>
+
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
+
+using tensorflow::strings::StrCat;
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-static bool IsInvolution(const NodeDef& node) {
-  const std::unordered_set<string> involution_ops = {"Conj", "Reciprocal",
-                                                     "Neg", "LogicalNot"};
-  return involution_ops.count(node.op()) > 0;
+template <typename T>
+bool SafeSetTensorValue(double value, Tensor* tensor) {
+  using RealType = typename Eigen::NumTraits<T>::Real;
+  if (value > std::numeric_limits<RealType>::max() ||
+      value < std::numeric_limits<RealType>::min()) {
+    return false;
+  }
+  tensor->flat<T>()(0) = static_cast<T>(value);
+  return true;
+}
+
+#define HANDLE_CASE(DTYPE)                                          \
+  case DTYPE:                                                       \
+    if (!SafeSetTensorValue<EnumToDataType<DTYPE>::Type>(           \
+            static_cast<double>(value), tensor)) {                  \
+      return errors::InvalidArgument("Cannot store value ", value,  \
+                                     " in tensor of type " #DTYPE); \
+    }                                                               \
+    break
+
+Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
+  switch (dtype) {
+    //    HANDLE_CASE(DT_HALF);
+    HANDLE_CASE(DT_FLOAT);
+    HANDLE_CASE(DT_DOUBLE);
+    HANDLE_CASE(DT_UINT8);
+    HANDLE_CASE(DT_INT8);
+    HANDLE_CASE(DT_UINT16);
+    HANDLE_CASE(DT_INT16);
+    HANDLE_CASE(DT_INT32);
+    HANDLE_CASE(DT_INT64);
+    HANDLE_CASE(DT_COMPLEX64);
+    HANDLE_CASE(DT_COMPLEX128);
+    default:
+      return errors::InvalidArgument("Unexpected type ", DataTypeString(dtype));
+  }
+  return Status::OK();
 }
 
-bool AreInversePermutations(gtl::ArraySlice<int32> a,
-                            gtl::ArraySlice<int32> b) {
+template <typename T>
+bool AreInversePermutations(const std::vector<T>& a, const std::vector<T>& b) {
   if (a.size() != b.size()) {
     return false;
   }
@@ -50,77 +97,121 @@ bool AreInversePermutations(gtl::ArraySlice<int32> a,
   return true;
 }
 
-// Extract int32 values from a Const op to `int32_values`. Returns true if
-// succeeds.
-bool Int32ValuesFromNode(const NodeDef& node, std::vector<int>* int32_values) {
+// Extract values from a Const op to `values`. Returns true if succeeds.
+template <typename T>
+bool ValuesFromConstNode(const NodeDef& node, std::vector<T>* values) {
   if (node.op() != "Const") {
     return false;
   }
 
-  if (node.attr().at("dtype").type() != DT_INT32) {
+  if (node.attr().at("dtype").type() != DataTypeToEnum<T>::value) {
     return false;
   }
 
   // TensorProto represents the content of the tensor in either <type>_val or
   // tensor_content.
   const TensorProto& tensor = node.attr().at("value").tensor();
-  if (tensor.int_val_size() > 0 && tensor.has_tensor_shape()) {
+  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
+      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
+
+  if (!tensor_values->empty() && tensor.has_tensor_shape()) {
     // When tensor_shape is set, theoretically the representation of the data
-    // could be compressed. So, before copying int_val to the returned vector,
+    // could be compressed. So, before copying values to the returned vector,
     // make sure no compression happens.
     const TensorShapeProto& shape = tensor.tensor_shape();
-    if (shape.dim_size() == 1 && shape.dim(0).size() == tensor.int_val_size()) {
-      int32_values->insert(int32_values->end(), tensor.int_val().begin(),
-                           tensor.int_val().end());
+    if (shape.dim_size() == 1 && shape.dim(0).size() == tensor_values->size()) {
+      values->insert(values->end(), tensor_values->begin(),
+                     tensor_values->end());
+      return true;
     }
-    return true;
   }
 
   const auto tensor_content_size = tensor.tensor_content().size();
   if (tensor_content_size > 0) {
-    CHECK_EQ(0, tensor_content_size % sizeof(int32))
+    CHECK_EQ(0, tensor_content_size % sizeof(T))
         << "tensor_content_size (" << tensor_content_size
-        << ") is not a multiple of " << sizeof(int32);
-    int32_values->resize(tensor_content_size / sizeof(int32));
+        << ") is not a multiple of " << sizeof(T);
+    values->resize(tensor_content_size / sizeof(T));
     port::CopyToArray(tensor.tensor_content(),
-                      reinterpret_cast<char*>(int32_values->data()));
+                      reinterpret_cast<char*>(values->data()));
     return true;
   }
 
   return false;
 }
 
-bool SimplyReordersData(const NodeDef& node) {
-  return node.op() == "Transpose";
+template <typename T>
+bool IsInnerMatrixTranspose(const std::vector<T>& perm) {
+  const T n = perm.size();
+  if (n < 2) {
+    return false;
+  }
+  for (T i = 0; i < n - 2; ++i) {
+    if (perm[i] != i) {
+      return false;
+    }
+  }
+  return perm[n - 1] == n - 2 && perm[n - 2] == n - 1;
 }
 
-// Returns the data type in attribute `attr_name` of `node`. If that attribute
-// doesn't exist, returns DT_INVALID.
-DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
-  if (!node.attr().count(attr_name)) {
-    return DT_INVALID;
+bool IsInnerMatrixTransposeNode(const NodeDef& transpose_node,
+                                const NodeMap* node_map) {
+  if (transpose_node.op() != "Transpose" &&
+      transpose_node.op() != "ConjugateTranspose") {
+    return false;
   }
-  const auto& attr = node.attr().at(attr_name);
-  if (attr.value_case() != AttrValue::kType) {
-    return DT_INVALID;
+  const NodeDef* perm_node = node_map->GetNode(transpose_node.input(1));
+  std::vector<int> perm32;
+  if (ValuesFromConstNode(*perm_node, &perm32)) {
+    return IsInnerMatrixTranspose(perm32);
   }
-  return attr.type();
+  std::vector<int64> perm64;
+  if (ValuesFromConstNode(*perm_node, &perm64)) {
+    return IsInnerMatrixTranspose(perm64);
+  }
+  return false;
 }
 
-bool IsCommutative(const OpDef& op, const NodeDef& input1) {
-  if (op.name() == "Add") {
-    // Workaround for "Add" not being marked is_commutative and is_aggregate.
-    // (See cl/173915048).
-    const auto type = GetDataTypeFromAttr(input1, "T");
-    return type != DT_INVALID && type != DT_STRING;
+bool MaybeAddControlInput(const string& new_input, NodeDef* node,
+                          GraphDef* graph, NodeMap* node_map) {
+  bool already_exists = false;
+  for (const string& input : node->input()) {
+    if (input == new_input || AsControlDependency(input) == new_input) {
+      already_exists = true;
+      break;
+    }
+  }
+  if (!already_exists) {
+    const string ctrl_dep =
+        ConstantFolding::AddControlDependency(new_input, graph, node_map);
+    node->add_input(ctrl_dep);
+    node_map->AddOutput(NodeName(new_input), node->name());
   }
-  return op.is_commutative();
+  return !already_exists;
+}
+
+int CopyControlInputs(const NodeDef& from, NodeDef* to, GraphDef* graph,
+                      NodeMap* node_map) {
+  int num_copied = 0;
+  for (const string& input : from.input()) {
+    if (IsControlInput(input) &&
+        MaybeAddControlInput(input, to, graph, node_map)) {
+      ++num_copied;
+    }
+  }
+  return num_copied;
 }
 
 void SetDataTypeToAttr(DataType dtype, const string& attr_name, NodeDef* node) {
   (*node->mutable_attr())[attr_name].set_type(dtype);
 }
 
+void FlipBooleanAttr(const string& attr_name, NodeDef* node) {
+  const bool old_value =
+      !node->attr().count(attr_name) ? false : node->attr().at(attr_name).b();
+  (*node->mutable_attr())[attr_name].set_b(!old_value);
+}
+
 string SourceDataTypeAttrName(const NodeDef& node) {
   if (node.op() == "Bitcast") {
     return "T";
@@ -162,6 +253,30 @@ bool IsNumberType(DataType dtype) {
 
 const char kOutputShapesAttr[] = "_output_shapes";
 
+PartialTensorShape GetInputShape(const string& input, const NodeMap& node_map) {
+  int output_pos;
+  string node_name = ParseNodeName(input, &output_pos);
+  const NodeDef* input_node = node_map.GetNode(node_name);
+  return input_node->attr().at(kOutputShapesAttr).list().shape(output_pos);
+}
+
+bool ShapesEqual(const string& input_x, const string& input_y,
+                 const NodeMap& node_map) {
+  PartialTensorShape x_shape = GetInputShape(input_x, node_map);
+  PartialTensorShape y_shape = GetInputShape(input_y, node_map);
+  if (x_shape.unknown_rank() || y_shape.unknown_rank() ||
+      x_shape.dims() != y_shape.dims()) {
+    return false;
+  }
+  for (int i = 0; i < x_shape.dims(); ++i) {
+    if (x_shape.dim_size(i) == -1 || y_shape.dim_size(i) == -1 ||
+        x_shape.dim_size(i) != y_shape.dim_size(i)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
@@ -208,6 +323,18 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
   return true;
 }
 
+NodeDef* GetTailOfValuePreservingChain(
+    const NodeDef& node, const NodeMap& node_map,
+    const std::unordered_set<string>& nodes_to_preserve) {
+  auto is_value_preserving_non_branching = [&](const NodeDef& node) {
+    return IsValuePreserving(node) &&
+           NumNonControlOutputs(node, node_map) == 1 &&
+           nodes_to_preserve.count(node.name()) == 0;
+  };
+  return GetTailOfChain(node, node_map, /*follow_control_input=*/false,
+                        is_value_preserving_non_branching);
+}
+
 }  // namespace
 
 class UniqueNodes {
@@ -264,10 +391,7 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   }
 
   // Compare inputs.
-  const OpDef* op_def = nullptr;
-  Status status = OpRegistry::Global()->LookUpOpDef(node1.op(), &op_def);
-  const bool is_commutative = status.ok() && IsCommutative(*op_def, node1);
-  if (is_commutative) {
+  if (IsCommutative(node1)) {
     std::vector<string> inputs1(node1.input().begin(), node1.input().end());
     std::vector<string> inputs2(node2.input().begin(), node2.input().end());
     std::sort(inputs1.begin(), inputs1.end());
@@ -316,45 +440,53 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   return true;
 }
 
+NodeDef* ArithmeticOptimizer::AddNode(const string& name,
+                                      const NodeDef* node_to_copy) {
+  NodeDef* new_node = optimized_graph_->add_node();
+  const string name_with_prefix =
+      AddPrefixToNodeName(name, kArithmeticOptimizer);
+  node_map_->AddNode(NodeName(name_with_prefix), new_node);
+  if (node_to_copy != nullptr) {
+    *new_node = *node_to_copy;
+  }
+  new_node->set_name(name_with_prefix);
+  return new_node;
+}
+
+bool ArithmeticOptimizer::OptimizedNodeExists(const string& name) {
+  const string name_with_prefix =
+      AddPrefixToNodeName(name, kArithmeticOptimizer);
+  return node_map_->NodeExists(name_with_prefix);
+}
+
 bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
   }
-  if (IsEnter(node) || IsExit(node) || IsPlaceholder(node)) {
+  if (IsEnter(node) || IsExit(node)) {
     return false;
   }
   if (node.device().find("SPU") != string::npos) {
     return false;
   }
-  const OpDef* op_def = nullptr;
-  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
-  if (!status.ok()) {
-    return false;
-  }
-  if (op_def->is_stateful()) {
-    return false;
-  }
-  // Don't consolidate ops such as AssignAdd
-  for (const auto& input : op_def->input_arg()) {
-    if (input.is_ref()) {
-      return false;
-    }
+  // Workaround for Assert mistakenly being labeled as stateful.
+  if (IsAssert(node)) {
+    return true;
   }
-  return true;
+  return IsFreeOfSideEffect(node);
 }
 
-void ArithmeticOptimizer::DedupComputations(GraphDef* optimized_graph) const {
-  NodeMap map(optimized_graph);
+void ArithmeticOptimizer::DedupComputations() {
   bool stop = true;
   std::set<int> duplicates;
   do {
     stop = true;
     UniqueNodes nodes;
-    for (int i = 0; i < optimized_graph->node_size(); ++i) {
+    for (int i = 0; i < optimized_graph_->node_size(); ++i) {
       if (duplicates.find(i) != duplicates.end()) {
         continue;
       }
-      NodeDef* node = optimized_graph->mutable_node(i);
+      NodeDef* node = optimized_graph_->mutable_node(i);
       if (!CanDedup(*node)) {
         continue;
       }
@@ -362,20 +494,21 @@ void ArithmeticOptimizer::DedupComputations(GraphDef* optimized_graph) const {
       if (rep == node) {
         continue;
       }
-      const std::set<NodeDef*>& fanouts = map.GetOutputs(node->name());
+      const std::set<NodeDef*>& fanouts = node_map_->GetOutputs(node->name());
       for (NodeDef* fanout : fanouts) {
         for (string& name : *fanout->mutable_input()) {
           int position;
-          string nodename = ParseNodeName(name, &position);
+          const string nodename = ParseNodeName(name, &position);
           if (nodename == node->name()) {
+            // Update name in-place.
             if (position > 0) {
-              name = strings::StrCat(rep->name(), ":", position);
+              name = StrCat(rep->name(), ":", position);
             } else if (position == 0) {
               name = rep->name();
             } else {
-              name = strings::StrCat("^", rep->name());
+              name = StrCat("^", rep->name());
             }
-            map.AddOutput(rep->name(), fanout->name());
+            node_map_->AddOutput(rep->name(), fanout->name());
           }
         }
       }
@@ -385,44 +518,88 @@ void ArithmeticOptimizer::DedupComputations(GraphDef* optimized_graph) const {
   } while (!stop);
 
   // Delete duplicates
-  if (!duplicates.empty()) {
-    int last = optimized_graph->node_size() - 1;
+  if (fetch_nodes_known_ && !duplicates.empty()) {
+    int last = optimized_graph_->node_size() - 1;
     for (auto it = duplicates.rbegin(); it != duplicates.rend(); ++it) {
       int index = *it;
-      optimized_graph->mutable_node()->SwapElements(index, last);
+      optimized_graph_->mutable_node()->SwapElements(index, last);
       last--;
     }
-    optimized_graph->mutable_node()->DeleteSubrange(last + 1,
-                                                    duplicates.size());
+    optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
+                                                     duplicates.size());
+    // Rebuild the NodeMap which was invalidated by the node  swapping above.
+    node_map_.reset(new NodeMap(optimized_graph_));
+  }
+}
+
+void ArithmeticOptimizer::AddFrameControlDeps(
+    const NodeDef* old_node, const std::vector<NodeDef*>& new_nodes,
+    const string& source_for_ctrl_dep,
+    const std::vector<NodeDef*>& sinks_for_control_dep) {
+  const auto frame_it = frame_map_.find(old_node);
+  if (frame_it != frame_map_.end()) {
+    for (auto node : new_nodes) {
+      frame_map_.emplace(node, frame_it->second);
+    }
+    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
+      const string ctrl_dep = ConstantFolding::AddControlDependency(
+          source_for_ctrl_dep, optimized_graph_, node_map_.get());
+      for (auto node : sinks_for_control_dep) {
+        MaybeAddControlInput(ctrl_dep, node, optimized_graph_, node_map_.get());
+      }
+    }
   }
 }
 
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
-    const NodeDef* node, GraphDef* graph_def, NodeMap* node_map,
-    std::vector<const NodeDef*>* new_nodes) const {
+    const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
   // Remove involutions applied twice.
   if (IsInvolution(*node)) {
-    // An involution is a function f(x) that is its own inverse,
-    // i.e. f(f(x)) = x.
-    const NodeDef* input = node_map->GetNode(node->input(0));
-    if (input->op() == node->op()) {
-      return input->input(0);
+    // An involution is an element-wise function f(x) that is its own inverse,
+    // i.e. f(f(x)) = x. If we can find a chain of ops
+    //   f->op1->op2->...opn->f
+    // where op1 through opn preserve the values of their inputs, we can remove
+    // the two instances of the involution from the graph, since they cancel
+    // each other.
+    NodeDef* tail =
+        GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_);
+    NodeDef* involution = node_map_->GetNode(tail->input(0));
+    if (involution->op() == node->op()) {
+      // Skip both *node and *involution since they cancel each other.
+      if (tail == node) {
+        // The two nodes to eliminate are adjacent.
+        return involution->input(0);
+      } else {
+        tail->set_input(0, involution->input(0));
+        node_map_->UpdateInput(tail->name(), involution->name(),
+                               involution->input(0));
+        return node->input(0);
+      }
     }
   }
 
   // Remove inverse transposes.
   if (node->op() == "Transpose" || node->op() == "ConjugateTranspose") {
-    const NodeDef* input = node_map->GetNode(node->input(0));
+    NodeDef* input = node_map_->GetNode(node->input(0));
     if (input->op() == node->op()) {
-      const NodeDef* node_perm = node_map->GetNode(node->input(1));
-      const NodeDef* input_perm = node_map->GetNode(input->input(1));
+      const NodeDef* node_perm = node_map_->GetNode(node->input(1));
+      const NodeDef* input_perm = node_map_->GetNode(input->input(1));
+      // Try 32-bit indices.
       std::vector<int> node_perm_values;
       std::vector<int> input_perm_values;
-      if (Int32ValuesFromNode(*node_perm, &node_perm_values) &&
-          Int32ValuesFromNode(*input_perm, &input_perm_values) &&
+      if (ValuesFromConstNode(*node_perm, &node_perm_values) &&
+          ValuesFromConstNode(*input_perm, &input_perm_values) &&
           AreInversePermutations(node_perm_values, input_perm_values)) {
         return input->input(0);
       }
+      // Try 64-bit indices.
+      std::vector<int64> node_perm_values64;
+      std::vector<int64> input_perm_values64;
+      if (ValuesFromConstNode(*node_perm, &node_perm_values64) &&
+          ValuesFromConstNode(*input_perm, &input_perm_values64) &&
+          AreInversePermutations(node_perm_values64, input_perm_values64)) {
+        return input->input(0);
+      }
     }
   }
 
@@ -443,14 +620,14 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     //      ^      |
     //      |      |
     //    input ---+
-    NodeDef* reshape = node_map->GetNode(node->name());
+    NodeDef* reshape = node_map_->GetNode(node->name());
     int output_pos = 0;
     string input_node_name = ParseNodeName(node->input(0), &output_pos);
-    const NodeDef* input = node_map->GetNode(input_node_name);
+    const NodeDef* input = node_map_->GetNode(input_node_name);
     if (input->op() == "Reshape") {
       reshape->set_input(0, input->input(0));
-      node_map->UpdateInput(reshape->name(), input->name(), input->input(0));
-      new_nodes->push_back(reshape);
+      node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
+      nodes_to_simplify->PushBack(reshape);
       return reshape->name();
     }
 
@@ -490,35 +667,31 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                                          &device) &&
         (StringPiece(device).contains(DEVICE_CPU) ||
          StringPiece(device).contains(DEVICE_GPU))) {
-      const NodeDef* cast = node_map->GetNode(transpose->input(0));
+      const NodeDef* cast = node_map_->GetNode(transpose->input(0));
       if (cast->op() == "Cast") {
-        const NodeDef* input = node_map->GetNode(cast->input(0));
+        const NodeDef* input = node_map_->GetNode(cast->input(0));
         const DataType src_type = GetSourceDataType(*cast);
         const DataType dst_type = GetDestinationDataType(*cast);
         if (IsNumberType(src_type) && IsNumberType(dst_type) &&
             DataTypeSize(src_type) < DataTypeSize(dst_type)) {
-          NodeDef* new_transpose = graph_def->add_node();
-          *new_transpose = *transpose;
-          new_transpose->set_name(transpose->name() + "_" +
-                                  DataTypeString(src_type));
+          NodeDef* new_transpose =
+              AddNode(StrCat(transpose->name(), "_", DataTypeString(src_type)),
+                      transpose);
           (*new_transpose->mutable_attr())["T"].set_type(src_type);
-          node_map->AddNode(new_transpose->name(), new_transpose);
-
           new_transpose->set_input(0, cast->input(0));
-          node_map->AddOutput(input->name(), new_transpose->name());
-          node_map->AddOutput(NodeName(new_transpose->input(1)),
-                              new_transpose->name());
-
-          NodeDef* new_cast = graph_def->add_node();
-          *new_cast = *cast;
-          new_cast->set_name(cast->name() + "_new");
-          node_map->AddNode(new_cast->name(), new_cast);
+          node_map_->AddOutput(input->name(), new_transpose->name());
+          node_map_->AddOutput(NodeName(new_transpose->input(1)),
+                               new_transpose->name());
 
+          NodeDef* new_cast = AddNode(StrCat(cast->name(), "_new"), cast);
           new_cast->set_input(0, new_transpose->name());
-          node_map->AddOutput(new_transpose->name(), new_cast->name());
+          node_map_->AddOutput(new_transpose->name(), new_cast->name());
+
+          nodes_to_simplify->PushBack(new_transpose);
+          //  Add frame dependencies that the original node might have had.
+          AddFrameControlDeps(node, {new_transpose, new_cast},
+                              new_transpose->input(0), {new_transpose});
 
-          new_nodes->push_back(new_transpose);
-          new_nodes->push_back(new_cast);
           return new_cast->name();
         }
       }
@@ -526,20 +699,20 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   }
 
   if (node->op() == "Bitcast") {
-    NodeDef* bitcast = node_map->GetNode(node->name());
+    NodeDef* bitcast = node_map_->GetNode(node->name());
     // Bypass bitcasts whose source type and destination type are equal.
     if (GetSourceDataType(*bitcast) == GetDestinationDataType(*bitcast)) {
       return bitcast->input(0);
     }
 
-    const NodeDef* operand = node_map->GetNode(bitcast->input(0));
+    const NodeDef* operand = node_map_->GetNode(bitcast->input(0));
     if (operand->op() == bitcast->op()) {
       // Bitcast(Bitcast(x, type1), type2) => Bitcast(x, type2)
       bitcast->set_input(0, operand->input(0));
       SetSourceDataType(GetSourceDataType(*operand), bitcast);
-      node_map->UpdateInput(bitcast->name(), bitcast->input(0),
-                            operand->input(0));
-      new_nodes->push_back(bitcast);
+      node_map_->UpdateInput(bitcast->name(), bitcast->input(0),
+                             operand->input(0));
+      nodes_to_simplify->PushBack(bitcast);
       return bitcast->name();
     }
   }
@@ -581,29 +754,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   // Conv?DBackpropInput.
   if (node->op() == "Conv2D" || node->op() == "Conv3D") {
     NodeDef* conv = const_cast<NodeDef*>(node);
-    const NodeDef* weights = node_map->GetNode(NodeName(conv->input(1)));
+    const NodeDef* weights = node_map_->GetNode(NodeName(conv->input(1)));
     // Fold the multiply to conv only when the weights are constant, so the
     // multiply can be constant-folded. TODO(jingyue): When the weights aren't
     // constant, this should also help performance a bit and memory usage a lot,
     // since the weights tend to be smaller than the activations.
     if (weights->op() == "Const") {
-      const NodeDef* source = node_map->GetNode(node->input(0));
-      while (SimplyReordersData(*source) &&
-             node_map->GetOutputs(source->name()).size() == 1 &&
-             // Do not skip over preserved nodes, because folding will change
-             // the results of these skipped data-reordering nodes.
-             // TODO(jingyue): A more elegant way is to copy this chain of
-             // data-reordering nodes and modify only the copy.
-             !nodes_to_preserve_.count(source->name())) {
-        source = node_map->GetNode(source->input(0));
-      }
+      const NodeDef* source = node_map_->GetNode(
+          GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_)
+              ->input(0));
       if (source->op() == "Mul" &&
-          node_map->GetOutputs(source->name()).size() == 1) {
+          node_map_->GetOutputs(source->name()).size() == 1) {
         const NodeDef* mul = source;
         // `scale` is the scalar multiplier, and `other` is the other operand.
         // TODO(jingyue): handle the case where `scale` is 0-th operand.
-        const NodeDef* scale = node_map->GetNode(mul->input(1));
-        const NodeDef* other = node_map->GetNode(mul->input(0));
+        const NodeDef* scale = node_map_->GetNode(mul->input(1));
+        const NodeDef* other = node_map_->GetNode(mul->input(0));
         if (scale->op() == "Const" && scale->attr().at("dtype").type() ==
                                           weights->attr().at("dtype").type()) {
           const TensorProto& scale_tensor = scale->attr().at("value").tensor();
@@ -611,36 +777,36 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
           if (scale_tensor.has_tensor_shape() &&
               scale_tensor.tensor_shape().dim_size() == 0) {
             // Create new node `scaled_weights`.
-            NodeDef* scaled_weights = graph_def->add_node();
-            scaled_weights->set_name(weights->name() + "_scaled");
+            NodeDef* scaled_weights = AddNode(
+                StrCat(weights->name(), "_scaled_", conv->name()), nullptr);
             scaled_weights->set_op("Mul");
             scaled_weights->set_device(weights->device());
             (*scaled_weights->mutable_attr())["T"] =
                 weights->attr().at("dtype");
-            node_map->AddNode(scaled_weights->name(), scaled_weights);
-            new_nodes->push_back(scaled_weights);
+            nodes_to_simplify->PushBack(scaled_weights);
 
             // Link in its inputs.
             scaled_weights->add_input(conv->input(1));
-            node_map->AddOutput(weights->name(), scaled_weights->name());
+            node_map_->AddOutput(weights->name(), scaled_weights->name());
             scaled_weights->add_input(mul->input(1));
-            node_map->AddOutput(scale->name(), scaled_weights->name());
+            node_map_->AddOutput(scale->name(), scaled_weights->name());
+            AddFrameControlDeps(node, {scaled_weights}, "", {});
 
             // Update `conv`'s weights to `scaled_weights`.
             conv->set_input(1, scaled_weights->name());
-            node_map->UpdateInput(conv->name(), weights->name(),
-                                  scaled_weights->name());
-            new_nodes->push_back(conv);
+            node_map_->UpdateInput(conv->name(), weights->name(),
+                                   scaled_weights->name());
+            nodes_to_simplify->PushBack(conv);
 
             // Update `mul`'s consumer to bypass `mul` because it's folded to
             // the weights.
-            CHECK_EQ(node_map->GetOutputs(mul->name()).size(), 1);
+            CHECK_EQ(node_map_->GetOutputs(mul->name()).size(), 1);
             NodeDef* consumer_of_mul =
-                *node_map->GetOutputs(mul->name()).begin();
+                *node_map_->GetOutputs(mul->name()).begin();
             consumer_of_mul->set_input(0, mul->input(0));
-            node_map->UpdateInput(consumer_of_mul->name(), mul->name(),
-                                  other->name());
-            new_nodes->push_back(consumer_of_mul);
+            node_map_->UpdateInput(consumer_of_mul->name(), mul->name(),
+                                   other->name());
+            nodes_to_simplify->PushBack(consumer_of_mul);
             return conv->name();
           }
         }
@@ -648,61 +814,266 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     }
   }
 
-  return "";
-}
+  if (node->op() == "Mul" && node->input(0) == node->input(1) &&
+      !OptimizedNodeExists(StrCat(node->name(), "_square"))) {
+    NodeDef* new_square_node =
+        AddNode(strings::StrCat(node->name(), "_square"), node);
+    new_square_node->set_op("Square");
+    for (int i = 1; i < new_square_node->input_size(); ++i) {
+      new_square_node->set_input(i - 1, new_square_node->input(i));
+    }
+    new_square_node->mutable_input()->RemoveLast();
+    return new_square_node->name();
+  }
 
-namespace {
-// A vector with a set. The set stores the same elements as the vector, and
-// quickly answers whether a value is in the vector. Duplicated elements are not
-// allowed for now.
-template <class T>
-class SetVector {
- public:
-  void PushBack(const T& value) {
-    CHECK(!Exists(value)) << "Value " << value << " is already in the set.";
-    set_.insert(value);
-    vector_.push_back(value);
+  if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
+    // Discard aggregate nodes with a single input.
+    if (node->input_size() == 1) {
+      return node->input(0);
+    }
+
+    // Try to rewrite aggregations of N >= 2 identical terms (possibly due
+    // to deduping or other rewrites) so we can get rid of the sum entirely.
+    // The expression (using AddN as an example of an aggregate op):
+    //   AddN(x, x, x, ... ,x)
+    //        <-- N terms -->
+    // can be rewritten to
+    //   Mul(Const(N), x))
+    //
+    bool all_equal = true;
+    int num_inputs = 1;
+    for (int i = 1; i < node->input_size(); ++i) {
+      if (IsControlInput(node->input(i))) {
+        break;
+      }
+      ++num_inputs;
+      if (node->input(i) != node->input(0)) {
+        all_equal = false;
+        break;
+      }
+    }
+    const string mul_node_name = StrCat(node->name(), "_mul");
+    if (all_equal && !OptimizedNodeExists(mul_node_name)) {
+      // 1. Create constant node with value N.
+      const auto type = GetDataTypeFromAttr(*node, "T");
+      Tensor t(type, TensorShape({}));
+      Status status = SetTensorValue(type, num_inputs, &t);
+      if (!status.ok()) {
+        LOG(WARNING) << "Failed to create const node: "
+                     << status.error_message();
+        return "";
+      }
+      TensorValue value(&t);
+      NodeDef* new_const_node =
+          AddNode(StrCat(node->name(), "_const"), nullptr);
+      *new_const_node =
+          ConstantFolding::CreateNodeDef(new_const_node->name(), value);
+      new_const_node->set_device(node->device());
+      nodes_to_simplify->PushBack(new_const_node);
+
+      // 2. Replace the aggregate node with Mul(Const(N), x).
+      NodeDef* new_mul_node = AddNode(mul_node_name, nullptr);
+      new_mul_node->set_op("Mul");
+      new_mul_node->set_device(node->device());
+      SetDataTypeToAttr(type, "T", new_mul_node);
+      new_mul_node->add_input(new_const_node->name());
+      node_map_->AddOutput(new_const_node->name(), new_mul_node->name());
+      new_mul_node->add_input(node->input(0));
+      node_map_->AddOutput(node->input(0), new_mul_node->name());
+
+      CopyControlInputs(*node, new_mul_node, optimized_graph_, node_map_.get());
+      AddFrameControlDeps(node, {new_const_node, new_mul_node}, node->input(0),
+                          {new_const_node});
+      return new_mul_node->name();
+    }
   }
 
-  T PopBack() {
-    T back = vector_.back();
-    set_.erase(back);
-    vector_.pop_back();
-    return back;
+  // Use the commutativity and (left- and right-) distributive property of
+  // multiplication over addition to hoist common factors out of aggregate nodes
+  // where all the inputs are Mul nodes. This pattern occurs frequently in
+  // regularization terms for the gradients during training.
+  // For example, we can rewrite an expression of the form:
+  //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
+  // to the following:
+  //   Mul(x, AddN(y1, y2, y3, ... yn))
+  if (opt_level_ == RewriterConfig::AGGRESSIVE && IsAggregate(*node) &&
+      NumNonControlInputs(*node) > 1 &&
+      !OptimizedNodeExists(StrCat(node->name(), "_hoist_add"))) {
+    // Determine the set of common factors if the input nodes are all Mul nodes.
+    std::set<string> common_factors;
+    for (int i = 0; i < node->input_size(); ++i) {
+      if (i > 0 && common_factors.empty()) {
+        break;
+      }
+      if (IsControlInput(node->input(i))) {
+        break;
+      }
+      const NodeDef* input = node_map_->GetNode(node->input(i));
+      if (input->op() == "Mul") {
+        std::set<string> factors_i{input->input(0), input->input(1)};
+        if (i == 0) {
+          std::swap(common_factors, factors_i);
+        } else {
+          std::set<string> intersection;
+          std::set_intersection(
+              factors_i.begin(), factors_i.end(), common_factors.begin(),
+              common_factors.end(),
+              std::inserter(intersection, intersection.begin()));
+          std::swap(common_factors, intersection);
+        }
+      } else {
+        common_factors.clear();
+      }
+    }
+    if (common_factors.size() == 1) {
+      const string& common_factor = *common_factors.begin();
+
+      // Gather up the non-shared factors (the y's in the example).
+      // Unless the aggregation is Add, we have to make sure that all the y's
+      // have the same shape since the other aggregation ops do not support
+      // broadcasting.
+      std::vector<string> unique_factors;
+      unique_factors.reserve(node->input_size());
+      bool shapes_match = true;
+      for (int i = 0; i < node->input_size() && shapes_match; ++i) {
+        const string& input = node->input(i);
+        if (IsControlInput(input)) {
+          break;
+        }
+        const NodeDef* mul_node = node_map_->GetNode(input);
+        const int unique_factor_index =
+            mul_node->input(0) == common_factor ? 1 : 0;
+        unique_factors.push_back(mul_node->input(unique_factor_index));
+        if (i > 0 && !IsAdd(*node)) {
+          shapes_match = ShapesEqual(unique_factors.front(),
+                                     unique_factors.back(), *node_map_);
+        }
+      }
+
+      if (shapes_match) {
+        // 1. Use a copy of the first Mul node for the outer multiplication.
+        NodeDef* new_mul_node = AddNode(StrCat(node->name(), "_hoist_mul"),
+                                        node_map_->GetNode(node->input(0)));
+        NodeDef* new_add_node =
+            AddNode(StrCat(node->name(), "_hoist_add"), node);
+        new_mul_node->set_device(node->device());
+        new_mul_node->set_input(0, common_factor);
+        node_map_->AddOutput(common_factor, new_mul_node->name());
+        new_mul_node->set_input(1, new_add_node->name());
+        node_map_->AddOutput(new_add_node->name(), new_mul_node->name());
+
+        // 2. Hoist non-shared factors up into the new AddN node.
+        nodes_to_simplify->PushBack(new_add_node);
+        for (int i = 0; i < node->input_size(); ++i) {
+          const string& input = node->input(i);
+          if (IsControlInput(input)) {
+            break;
+          }
+          new_add_node->set_input(i, unique_factors[i]);
+        }
+
+        // 3. Add frame dependencies that the original node might have had.
+        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
+                            {new_add_node});
+
+        return new_mul_node->name();
+      }
+    }
   }
 
-  bool Exists(const T& value) const { return set_.count(value); }
+  // Fold Transpose into matrix multiplication.
+  if ((node->op() == "MatMul" || node->op() == "SparseMatMul" ||
+       node->op() == "BatchMatMul") &&
+      !OptimizedNodeExists(StrCat(node->name(), "_fused"))) {
+    const NodeDef* a = node_map_->GetNode(node->input(0));
+    const NodeDef* b = node_map_->GetNode(node->input(1));
+    bool is_complex = false;
+    if (node->op() != "SparseMatMul") {
+      const DataType type = GetDataTypeFromAttr(*node, "T");
+      is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    }
+    const std::set<string> foldable_transpose_ops =
+        !is_complex ? std::set<string>{"ConjugateTranspose", "Transpose"}
+                    : (node->op() == "BatchMatMul"
+                           ? std::set<string>{"ConjugateTranspose"}
+                           : std::set<string>{"Transpose"});
+    const bool a_is_foldable = foldable_transpose_ops.count(a->op()) > 0 &&
+                               IsInnerMatrixTransposeNode(*a, node_map_.get());
+    const bool b_is_foldable = foldable_transpose_ops.count(b->op()) > 0 &&
+                               IsInnerMatrixTransposeNode(*b, node_map_.get());
+    if (a_is_foldable || b_is_foldable) {
+      NodeDef* new_op = AddNode(StrCat(node->name(), "_fused"), node);
+      if (a_is_foldable) {
+        const string attr_a =
+            node->op() == "BatchMatMul" ? "adj_x" : "transpose_a";
+        FlipBooleanAttr(attr_a, new_op);
+        new_op->set_input(0, a->input(0));
+        node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
+        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op});
+      }
+      if (b_is_foldable) {
+        const string attr_b =
+            node->op() == "BatchMatMul" ? "adj_y" : "transpose_b";
+        FlipBooleanAttr(attr_b, new_op);
+        new_op->set_input(1, b->input(0));
+        node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
+        if (!a_is_foldable) {
+          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op});
+        }
+      }
+    }
+  }
 
-  bool Empty() const { return vector_.empty(); }
+  // Fold Conj into Transpose or ConjugateTranspose.
+  if ((node->op() == "Conj" || node->op() == "Transpose" ||
+       node->op() == "ConjugateTranspose") &&
+      !OptimizedNodeExists(StrCat(node->name(), "_fused"))) {
+    const NodeDef* input = node_map_->GetNode(node->input(0));
+    const NodeDef* transpose_op = node->op() == "Conj" ? input : node;
+    const NodeDef* conj_op = node->op() == "Conj" ? node : input;
+
+    if ((transpose_op->op() == "Transpose" ||
+         transpose_op->op() == "ConjugateTranspose") &&
+        conj_op->op() == "Conj") {
+      NodeDef* new_op = AddNode(StrCat(node->name(), "_fused"), transpose_op);
+      // Flip the type of transpose op to absorb the conjugation.
+      new_op->set_op(transpose_op->op() == "Transpose" ? "ConjugateTranspose"
+                                                       : "Transpose");
+      new_op->set_input(0, input->input(0));
+      node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
+      AddFrameControlDeps(node, {new_op}, "", {});
+      return new_op->name();
+    }
+  }
 
- private:
-  std::unordered_set<T> set_;
-  std::vector<T> vector_;
-};
-}  // namespace
+  return "";
+}
 
-void ArithmeticOptimizer::SimplifyArithmeticOps(
-    GraphDef* optimized_graph) const {
-  NodeMap node_map(optimized_graph);
-  SetVector<const NodeDef*> nodes_to_simplify;
-  for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    nodes_to_simplify.PushBack(optimized_graph->mutable_node()->Mutable(i));
+Status ArithmeticOptimizer::SimplifyArithmeticOps() {
+  SetVector<NodeDef*> nodes_to_simplify;
+  nodes_to_simplify.Reserve(optimized_graph_->node_size());
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    nodes_to_simplify.PushBack(optimized_graph_->mutable_node(i));
   }
   while (!nodes_to_simplify.Empty()) {
     const NodeDef* node = nodes_to_simplify.PopBack();
-    std::vector<const NodeDef*> new_nodes;
     const string simplified_tensor =
-        TrySimplifyAndReplaceUses(node, optimized_graph, &node_map, &new_nodes);
+        TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
     if (simplified_tensor.empty()) {
       continue;
     }
 
     if (NodeName(simplified_tensor) != node->name()) {
-      // When `node` is simplifed to another node rather than in-place, the
+      // Always consider simplified_tensor for further optimizations.
+      NodeDef* simplified_node = node_map_->GetNode(simplified_tensor);
+      if (simplified_node != nullptr) {
+        nodes_to_simplify.PushBack(simplified_node);
+      }
+      // When `node` is simplified to another node rather than in-place, the
       // consumers of `node` are already redirected to `simplified_tensor`.
       // Re-push the consumers into `nodes_to_simplify` for further
       // optimizations.
-      std::set<NodeDef*> consumers = node_map.GetOutputs(node->name());
+      std::set<NodeDef*> consumers = node_map_->GetOutputs(node->name());
       for (NodeDef* consumer : consumers) {
         // Update `consumer`'s use of `node` to `input`'s operand.
         for (int i = 0; i < consumer->input_size(); ++i) {
@@ -715,39 +1086,44 @@ void ArithmeticOptimizer::SimplifyArithmeticOps(
                      ? AsControlDependency(NodeName(simplified_tensor))
                      : simplified_tensor);
           }
-          VLOG(2) << "Update input " << consumer->input(i) << " of "
-                  << consumer->name() << " to " << simplified_tensor;
         }
-        node_map.UpdateInput(consumer->name(), node->name(), simplified_tensor);
-        if (!nodes_to_simplify.Exists(consumer)) {
-          nodes_to_simplify.PushBack(consumer);
-        }
-      }
-    }
-    for (const NodeDef* new_node : new_nodes) {
-      if (!nodes_to_simplify.Exists(new_node)) {
-        nodes_to_simplify.PushBack(new_node);
+        node_map_->UpdateInput(consumer->name(), node->name(),
+                               simplified_tensor);
+        nodes_to_simplify.PushBack(consumer);
       }
     }
   }
+  return Status::OK();
 }
 
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
-  nodes_to_preserve_ = item.NodesToPreserve();
+  optimized_graph_ = optimized_graph;
+  *optimized_graph_ = item.graph;
 
-  GraphProperties graph_properties(item);
-  TF_RETURN_IF_ERROR(graph_properties.InferStatically());
-  TF_RETURN_IF_ERROR(graph_properties.AnnotateOutputShapes(optimized_graph));
+  // Set up helper data structures.
+  nodes_to_preserve_ = item.NodesToPreserve();
+  fetch_nodes_known_ = !item.fetch.empty();
+  node_map_.reset(new NodeMap(optimized_graph_));
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map_, &num_frames));
+  // Shapes are only needed in aggressive mode.
+  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+    graph_properties_.reset(new GraphProperties(item));
+    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
+    TF_RETURN_IF_ERROR(
+        graph_properties_->AnnotateOutputShapes(optimized_graph_));
+  }
 
-  DedupComputations(optimized_graph);
-  SimplifyArithmeticOps(optimized_graph);
+  // Perform the optimizations.
+  DedupComputations();
+  TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
 
   // Clear output shapes.
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    optimized_graph->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
+    optimized_graph_->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 53cec11ff63d6fb256d1420b5d7ea48c7dde3cfe..ec269792386189e5a590a99af020803810f36b1a 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -17,13 +17,17 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_H_
 
 #include <unordered_set>
+#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
 
+constexpr char kArithmeticOptimizer[] = "ArithmeticOptimizer";
+
 // Optimize TF computations by reducing the arithmetic complexity required to
 // run a model.
 class ArithmeticOptimizer : public GraphOptimizer {
@@ -42,11 +46,32 @@ class ArithmeticOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
+  // Returns true is a node with given name and the optimizer prefix already
+  // exists.
+  bool OptimizedNodeExists(const string& name);
+
+  // Creates a new node in the graph, prefixed with "ArithmeticOptimizer/",
+  // updates node_map_, and optionally copies *node_to_copy into the new
+  // node, if node_to_copy is not nullptr.
+  NodeDef* AddNode(const string& name, const NodeDef* node_to_copy);
+
+  // Returns true if it is safe to dedup node from the graph.
   bool CanDedup(const NodeDef& node) const;
-  void DedupComputations(GraphDef* optimized_graph) const;
+
+  // Dedup redundant nodes in the graph.
+  void DedupComputations();
+
+  // Fix frame dependencies by adding control dependencies from old_input to
+  // nodes in new_nodes_for_control_dep, and update frame_map for all nodes in
+  // new_nodes.
+  void AddFrameControlDeps(const NodeDef* old_node,
+                           const std::vector<NodeDef*>& new_nodes,
+                           const string& source_for_ctrl_dep,
+                           const std::vector<NodeDef*>& sinks_for_control_dep);
+
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
-  void SimplifyArithmeticOps(GraphDef* optimized_graph) const;
+  Status SimplifyArithmeticOps();
   // Tries to simplify the expression that roots at `node` and replaces the uses
   // of `node` to the simplified expression. Returns the name of the simplified
   // tensor (e.g. "split:1") or an emtpy string if no simplification is
@@ -62,13 +87,17 @@ class ArithmeticOptimizer : public GraphOptimizer {
   // TODO(jingyue): This interface is not suitable for optimizing nodes with
   // multiple output tensors. We should pass in a tensor name instead of a
   // NodeDef.
-  string TrySimplifyAndReplaceUses(
-      const NodeDef* node, GraphDef* graph_def, NodeMap* node_map,
-      std::vector<const NodeDef*>* new_nodes) const;
-
-  std::unordered_set<string> nodes_to_preserve_;
+  string TrySimplifyAndReplaceUses(const NodeDef* node,
+                                   SetVector<NodeDef*>* nodes_to_simplify);
 
   RewriterConfig::Toggle opt_level_;
+
+  bool fetch_nodes_known_;
+  std::unordered_set<string> nodes_to_preserve_;
+  std::unique_ptr<NodeMap> node_map_;
+  FrameMap frame_map_;
+  std::unique_ptr<GraphProperties> graph_properties_;
+  GraphDef* optimized_graph_;  // Not owned.
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 61c8b82ea0fe70ce7d1463a646024546affed501..da4263ff421d348645d33489428c1edc0bbdf9a0 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -28,6 +28,25 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+string OptimizedName(const string& name) {
+  return AddPrefixToNodeName(name, kArithmeticOptimizer);
+}
+
+void VerifyGraphsMatch(const GraphDef& original_graph,
+                       const GraphDef& optimized_graph, int line) {
+  EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << line;
+  for (int i = 0; i < original_graph.node_size(); ++i) {
+    const NodeDef& original = original_graph.node(i);
+    const NodeDef& optimized = optimized_graph.node(i);
+    EXPECT_EQ(original.name(), optimized.name()) << line;
+    EXPECT_EQ(original.op(), optimized.op()) << line;
+    EXPECT_EQ(original.input_size(), optimized.input_size()) << line;
+    for (int j = 0; j < original.input_size(); ++j) {
+      EXPECT_EQ(original.input(j), optimized.input(j)) << line;
+    }
+  }
+}
+
 class ArithmeticOptimizerTest : public ::testing::Test {};
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -38,75 +57,131 @@ TEST_F(ArithmeticOptimizerTest, NoOp) {
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status s = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(s);
-
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
-  for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& original = item.graph.node(i);
-    const NodeDef& optimized = output.node(i);
-    EXPECT_EQ(original.name(), optimized.name());
-    EXPECT_EQ(original.op(), optimized.op());
-    EXPECT_EQ(original.input_size(), optimized.input_size());
-    for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j));
-    }
-  }
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  VerifyGraphsMatch(item.graph, output, __LINE__);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c1 = ops::Const(s.WithOpName("c1"), {3.14, 2.7}, {1, 2});
   Output c2 = ops::Const(s.WithOpName("c2"), {3.14, 2.7}, {1, 2});
-  Output add = ops::Add(s.WithOpName("add"), c1, c2);
+  Output div = ops::Div(s.WithOpName("div"), c1, c2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
 
   EXPECT_EQ(2, output.node_size());
   const NodeDef& new_c1 = output.node(0);
   EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_add = output.node(1);
-  EXPECT_EQ("add", new_add.name());
-  EXPECT_EQ(2, new_add.input_size());
-  EXPECT_EQ("c1", new_add.input(0));
-  EXPECT_EQ("c1", new_add.input(1));
+  const NodeDef& new_div = output.node(1);
+  EXPECT_EQ("div", new_div.name());
+  EXPECT_EQ(2, new_div.input_size());
+  EXPECT_EQ("c1", new_div.input(0));
+  EXPECT_EQ("c1", new_div.input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output p = ops::Placeholder(s, DT_BOOL, ops::Placeholder::Shape({}));
+  Output c = ops::Const(s.WithOpName("c"), {3.14, 2.7}, {1, 2});
+  auto check1 = ops::CheckNumerics(s.WithOpName("check1"), c, "foo");
+  auto check2 = ops::CheckNumerics(s.WithOpName("check2"), c, "foo");
+  auto assert1 = ops::Assert(s.WithOpName("assert1"), p, {c});
+  auto assert2 = ops::Assert(s.WithOpName("assert2"), p, {c});
+  Output div = ops::Div(s.WithOpName("div").WithControlDependencies(
+                            {assert1.operation, assert2.operation}),
+                        check1, check2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  const NodeDef& new_div = output.node(3);
+  EXPECT_EQ(4, new_div.input_size());
+  EXPECT_EQ("check1", new_div.input(0));
+  EXPECT_EQ("check1", new_div.input(1));
+  EXPECT_EQ("^assert1", new_div.input(2));
+  EXPECT_EQ("^assert1", new_div.input(3));
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c1 = ops::Const(s.WithOpName("c1"), {1.0f, 2.0f}, {1, 2});
   Output c2 = ops::Const(s.WithOpName("c2"), {3.0f, 4.0f}, {1, 2});
-  Output add1 = ops::Add(s.WithOpName("add1"), c1, c2);
-  Output add2 = ops::Add(s.WithOpName("add2"), c2, c1);
-  Output add3 = ops::Add(s.WithOpName("add3"), add1, add2);
+  Output mul1 = ops::Mul(s.WithOpName("mul1"), c1, c2);
+  Output mul2 = ops::Mul(s.WithOpName("mul2"), c2, c1);
+  Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div"};
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
 
   EXPECT_EQ(4, output.node_size());
   const NodeDef& new_c1 = output.node(0);
   EXPECT_EQ("c1", new_c1.name());
   const NodeDef& new_c2 = output.node(1);
   EXPECT_EQ("c2", new_c2.name());
-  const NodeDef& new_add1 = output.node(2);
-  EXPECT_EQ("add1", new_add1.name());
-  EXPECT_EQ(2, new_add1.input_size());
-  EXPECT_EQ("c1", new_add1.input(0));
-  EXPECT_EQ("c2", new_add1.input(1));
-  const NodeDef& new_add3 = output.node(3);
-  EXPECT_EQ("add3", new_add3.name());
-  EXPECT_EQ(2, new_add3.input_size());
-  EXPECT_EQ("add1", new_add3.input(0));
-  EXPECT_EQ("add1", new_add3.input(1));
+  const NodeDef& new_mul1 = output.node(2);
+  EXPECT_EQ("mul1", new_mul1.name());
+  EXPECT_EQ(2, new_mul1.input_size());
+  EXPECT_EQ("c1", new_mul1.input(0));
+  EXPECT_EQ("c2", new_mul1.input(1));
+  const NodeDef& new_div1 = output.node(3);
+  EXPECT_EQ("div1", new_div1.name());
+  EXPECT_EQ(2, new_div1.input_size());
+  EXPECT_EQ("mul1", new_div1.input(0));
+  EXPECT_EQ("mul1", new_div1.input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MulToSquare) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output d = ops::Const(s.WithOpName("d"), {3.0f, 4.0f}, {1, 2});
+  Output mul = ops::Mul(s.WithControlDependencies(d).WithOpName("mul"), c, c);
+  Output id = ops::Identity(s.WithOpName("id"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ("id", output.node(3).name());
+  EXPECT_EQ(OptimizedName("mul_square"), output.node(3).input(0));
+  EXPECT_EQ("Square", output.node(4).op());
+  EXPECT_EQ(OptimizedName("mul_square"), output.node(4).name());
+  EXPECT_EQ(2, output.node(4).input_size());
+  EXPECT_EQ("c", output.node(4).input(0));
+  EXPECT_EQ("^d", output.node(4).input(1));
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
@@ -131,6 +206,405 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   EXPECT_EQ("c", output.node(5).input(0));
 }
 
+TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output recip1 = ops::Reciprocal(s.WithOpName("recip1"), c);
+  Output id1 = ops::Identity(s.WithOpName("id1"), recip1);
+  Output squeeze = ops::Squeeze(s.WithOpName("squeeze"), id1);
+  Output recip2 = ops::Reciprocal(s.WithOpName("recip2"), squeeze);
+  Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(6, output.node_size());
+  EXPECT_EQ("squeeze", output.node(5).input(0));
+  EXPECT_EQ("c", output.node(2).input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output recip1 = ops::Reciprocal(s.WithOpName("recip1"), c);
+  Output id1 = ops::Identity(s.WithOpName("id1"), recip1);
+  Output squeeze = ops::Squeeze(s.WithOpName("squeeze"), id1);
+  Output recip2 = ops::Reciprocal(
+      s.WithOpName("recip2").WithControlDependencies(squeeze), c);
+  Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The optimizer should be a noop.
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    const NodeDef& original = item.graph.node(i);
+    const NodeDef& optimized = output.node(i);
+    EXPECT_EQ(original.name(), optimized.name());
+    EXPECT_EQ(original.op(), optimized.op());
+    EXPECT_EQ(original.input_size(), optimized.input_size());
+    for (int j = 0; j < original.input_size(); ++j) {
+      EXPECT_EQ(original.input(j), optimized.input(j));
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  Output add = ops::Add(s.WithOpName("add"), x, x);
+  Output id = ops::Identity(s.WithOpName("id"), add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  const NodeDef& new_const = output.node(3);
+  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
+  EXPECT_EQ("^x", new_const.input(0));
+  EXPECT_EQ(std::string("\0\0\0@", 4),
+            new_const.attr().at("value").tensor().tensor_content());
+  const NodeDef& new_mul = output.node(4);
+  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
+  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
+  EXPECT_EQ("x", new_mul.input(1));
+  const NodeDef& new_id = output.node(2);
+  EXPECT_EQ("id", new_id.name());
+  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::Const(s.WithOpName("x"), {3.0f, 4.0f}, {1, 2});
+  Output add = ops::Add(s.WithOpName("add").WithControlDependencies(y), x, x);
+  Output id = ops::Identity(s.WithOpName("id"), add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(6, output.node_size());
+  const NodeDef& new_const = output.node(4);
+  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
+  EXPECT_EQ("^x", new_const.input(0));
+  EXPECT_EQ(std::string("\0\0\0@", 4),
+            new_const.attr().at("value").tensor().tensor_content());
+  const NodeDef& new_mul = output.node(5);
+  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
+  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
+  EXPECT_EQ("x", new_mul.input(1));
+  EXPECT_EQ("^y", new_mul.input(2));
+  const NodeDef& new_id = output.node(3);
+  EXPECT_EQ("id", new_id.name());
+  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
+  // Test case from b/69059093.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output p = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({10, 10}));
+  Output add = ops::Add(s.WithOpName("Add"), p, p);
+  Output add1 = ops::Add(s.WithOpName("Add_1"), p, p);
+  Output add4 = ops::Add(s.WithOpName("Add_4"), add, add1);
+  Output add5 = ops::Add(s.WithOpName("Add_5"), add, add1);
+  Output add6 = ops::Add(s.WithOpName("Add_6"), add4, add5);
+  Output id = ops::Identity(s.WithOpName("id"), add6);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  const std::vector<string> devices{
+      "/device:CPU:0", "/device:GPU:0", "/device:CPU:0", "/device:GPU:1",
+      "/device:CPU:0", "/device:CPU:0", "/device:CPU:0",
+  };
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device(devices[i]);
+  }
+  ArithmeticOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(17, output.node_size());
+  // The graph gets optimized to
+  // Mul(p,
+  //     Add(Add(Const(2), Const(2)),
+  //         Add(Const(2), Const(2))))
+  EXPECT_EQ(17, output.node_size());
+  for (const auto& node : output.node()) {
+    if ("id" == node.name()) {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ(OptimizedName("Add_6_hoist_mul"), node.input(0));
+    } else if (OptimizedName("Add_6_hoist_mul") == node.name()) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("Placeholder", node.input(0));
+      EXPECT_EQ(OptimizedName("Add_6_hoist_add"), node.input(1));
+    } else if (OptimizedName("Add_6_hoist_add") == node.name()) {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ(OptimizedName("Add_4_hoist_add"), node.input(0));
+      EXPECT_EQ(OptimizedName("Add_5_hoist_add"), node.input(1));
+      EXPECT_EQ("^Placeholder", node.input(2));
+    } else if (OptimizedName("Add_4_hoist_add") == node.name()) {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ(OptimizedName("Add_const"), node.input(0));
+      EXPECT_EQ(OptimizedName("Add_1_const"), node.input(1));
+      EXPECT_EQ("^Placeholder", node.input(2));
+    } else if (OptimizedName("Add_5_hoist_add") == node.name()) {
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ(OptimizedName("Add_const"), node.input(0));
+      EXPECT_EQ(OptimizedName("Add_1_const"), node.input(1));
+      EXPECT_EQ("^Placeholder", node.input(2));
+    } else if (OptimizedName("Add_const") == node.name()) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^Placeholder", node.input(0));
+    } else if (OptimizedName("Add_1_const") == node.name()) {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^Placeholder", node.input(0));
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, HoistFactor) {
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+      Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+      Output y1 = ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+      Output y2 = matching_shapes
+                      ? ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2})
+                      : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+      Output mul1 = ops::Mul(s.WithOpName("mul1"), x, y1);
+      Output mul2 = ops::Mul(s.WithOpName("mul2"), y2, x);
+      Output id =
+          use_addn ? ops::Identity(s.WithOpName("id"),
+                                   ops::AddN(s.WithOpName("add"), {mul1, mul2}))
+                   : ops::Identity(s.WithOpName("id"),
+                                   ops::Add(s.WithOpName("add"), mul1, mul2));
+
+      GrapplerItem item;
+      TF_CHECK_OK(s.ToGraphDef(&item.graph));
+      ArithmeticOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+      GraphDef output;
+      Status status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+      // Run the optimizer twice to make sure the rewrite is idempotent.
+      item.graph.Swap(&output);
+      status = optimizer.Optimize(nullptr, item, &output);
+      TF_EXPECT_OK(status);
+
+      if (use_addn && !matching_shapes) {
+        VerifyGraphsMatch(item.graph, output, __LINE__);
+      } else {
+        EXPECT_EQ(9, output.node_size());
+        const NodeDef& new_add = output.node(8);
+        EXPECT_EQ(OptimizedName("add_hoist_add"), new_add.name());
+        EXPECT_EQ("y1", new_add.input(0));
+        EXPECT_EQ("y2", new_add.input(1));
+        const NodeDef& new_mul = output.node(7);
+        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_mul.name());
+        EXPECT_EQ("x", new_mul.input(0));
+        EXPECT_EQ(OptimizedName("add_hoist_add"), new_mul.input(1));
+        const NodeDef& new_id = output.node(6);
+        EXPECT_EQ("id", new_id.name());
+        EXPECT_EQ(OptimizedName("add_hoist_mul"), new_id.input(0));
+      }
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output z = ops::Complex(s.WithOpName("z"), re, im);
+  Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
+  Output conj = ops::Conj(s.WithOpName("conj"), z);
+  Output transp = ops::Transpose(s.WithOpName("trans"), conj, perm);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(7, output.node_size());
+  EXPECT_EQ(OptimizedName("trans_fused"), output.node(6).name());
+  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
+  EXPECT_EQ("z", output.node(6).input(0));
+  EXPECT_EQ("perm", output.node(6).input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output z = ops::Complex(s.WithOpName("z"), re, im);
+  Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
+  Output conj = ops::Conj(s.WithOpName("conj"), z);
+  Output transp =
+      ops::ConjugateTranspose(s.WithOpName("conjugate_trans"), conj, perm);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(7, output.node_size());
+  EXPECT_EQ(OptimizedName("conjugate_trans_fused"), output.node(6).name());
+  EXPECT_EQ("Transpose", output.node(6).op());
+  EXPECT_EQ("z", output.node(6).input(0));
+  EXPECT_EQ("perm", output.node(6).input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output z = ops::Complex(s.WithOpName("z"), re, im);
+  Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
+  Output trans = ops::Transpose(s.WithOpName("trans"), z, perm);
+  Output conj = ops::Conj(s.WithOpName("conj"), trans);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(7, output.node_size());
+  EXPECT_EQ(OptimizedName("conj_fused"), output.node(6).name());
+  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
+  EXPECT_EQ("z", output.node(6).input(0));
+  EXPECT_EQ("perm", output.node(6).input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
+  for (const string matmul_type : {"MatMul", "SparseMatMul", "BatchMatMul"}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+    Output b = ops::Const(s.WithOpName("b"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
+    Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
+    Output trans_a = ops::Transpose(s.WithOpName("trans_a"), a, perm);
+    Output trans_b = ops::Transpose(s.WithOpName("trans_b"), b, perm);
+    if (matmul_type == "MatMul") {
+      Output matmul = ops::MatMul(s.WithOpName("matmul"), trans_a, trans_b);
+    } else if (matmul_type == "SparseMatMul") {
+      Output matmul =
+          ops::SparseMatMul(s.WithOpName("matmul"), trans_a, trans_b);
+    } else if (matmul_type == "BatchMatMul") {
+      Output matmul =
+          ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
+    }
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+    ArithmeticOptimizer optimizer;
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+    // Run the optimizer twice to make sure the rewrite is idempotent.
+    item.graph.Swap(&output);
+    status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(7, output.node_size());
+    EXPECT_EQ(OptimizedName("matmul_fused"), output.node(6).name());
+    EXPECT_EQ("a", output.node(6).input(0));
+    EXPECT_EQ("b", output.node(6).input(1));
+    if (matmul_type == "BatchMatMul") {
+      EXPECT_TRUE(output.node(6).attr().at("adj_x").b());
+      EXPECT_TRUE(output.node(6).attr().at("adj_y").b());
+    } else {
+      EXPECT_TRUE(output.node(6).attr().at("transpose_a").b());
+      EXPECT_TRUE(output.node(6).attr().at("transpose_b").b());
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output re_a =
+      ops::Const(s.WithOpName("re_a"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im_a =
+      ops::Const(s.WithOpName("im_a"), {-1.0f, -2.0f, -3.0f, -4.0f}, {2, 2});
+  Output re_b =
+      ops::Const(s.WithOpName("re_b"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
+  Output im_b =
+      ops::Const(s.WithOpName("im_b"), {-5.0f, -6.0f, -7.0f, -8.0f}, {2, 2});
+  Output a = ops::Complex(s.WithOpName("a"), re_a, im_a);
+  Output b = ops::Complex(s.WithOpName("b"), re_b, im_b);
+  Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
+  Output trans_a = ops::ConjugateTranspose(s.WithOpName("trans_a"), a, perm);
+  Output trans_b = ops::ConjugateTranspose(s.WithOpName("trans_b"), b, perm);
+  Output matmul = ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(11, output.node_size());
+  EXPECT_EQ(OptimizedName("matmul_fused"), output.node(10).name());
+  EXPECT_EQ("a", output.node(10).input(0));
+  EXPECT_EQ("b", output.node(10).input(1));
+  EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
+  EXPECT_TRUE(output.node(10).attr().at("adj_y").b());
+}
+
 TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
@@ -157,10 +631,6 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   item.graph = output;
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
-  for (const auto& node : output.node()) {
-    LOG(INFO) << node.DebugString();
-  }
-
   EXPECT_EQ(0, std::count_if(
                    output.node().begin(), output.node().end(),
                    [](const NodeDef& node) { return node.op() == "Reshape"; }));
@@ -186,10 +656,6 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   item.graph = output;
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
-  for (const auto& node : output.node()) {
-    LOG(INFO) << node.DebugString();
-  }
-
   EXPECT_EQ(1, std::count_if(
                    output.node().begin(), output.node().end(),
                    [](const NodeDef& node) { return node.op() == "Reshape"; }));
@@ -578,10 +1044,11 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   NodeMap node_map(&output);
   const NodeDef* inputs_node = CHECK_NOTNULL(node_map.GetNode("Placeholder"));
   const NodeDef* transpose_node =
-      CHECK_NOTNULL(node_map.GetNode("Transpose_uint8"));
-  const NodeDef* cast_node = CHECK_NOTNULL(node_map.GetNode("Cast_new"));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Transpose_uint8")));
+  const NodeDef* cast_node =
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Cast_new")));
   const NodeDef* weights_node =
-      CHECK_NOTNULL(node_map.GetNode("weights_scaled"));
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
   const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
 
   EXPECT_EQ(output.node_size(), 7);
@@ -591,6 +1058,50 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   EXPECT_EQ(conv_node->input(1), weights_node->name());
 }
 
+TEST_F(ArithmeticOptimizerTest, OptimizeMultipleMulTransposeConv) {
+  // This unit test exercises optimization of folding mul into conv for
+  // multiple nodes in the graph.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/gpu:0");
+
+  GrapplerItem item;
+  Output conv[2];
+
+  for (int i = 0; i < 2; ++i) {
+    Output inputs =
+        ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({8, 3, 28, 28}));
+    Output mul = ops::Mul(s, inputs, ops::Const(s, 1.0f / 255.0f));
+    Output weights = ops::Const(s.WithOpName("weights"),
+                                Input::Initializer(127.0f, {5, 5, 3, 16}));
+    conv[i] = ops::Conv2D(s, mul, weights, {1, 1, 1, 1}, "VALID",
+                          ops::Conv2D::DataFormat("NCHW"));
+  }
+  Output outputs = ops::Add(s.WithOpName("outputs"), conv[0], conv[1]);
+
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+
+  item.graph = output;
+  TF_EXPECT_OK(
+      ConstantFolding(/*cpu_device=*/nullptr).Optimize(nullptr, item, &output));
+
+  item.graph = output;
+  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+
+  NodeMap node_map(&output);
+  const NodeDef* weights_node =
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
+  const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
+
+  const NodeDef* weights_node_1 =
+      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D_1")));
+  const NodeDef* conv_node_1 = CHECK_NOTNULL(node_map.GetNode("Conv2D_1"));
+  EXPECT_EQ(conv_node->input(1), weights_node->name());
+  EXPECT_EQ(conv_node_1->input(1), weights_node_1->name());
+}
+
 TEST_F(ArithmeticOptimizerTest, CombineBitcasts) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index faea843c69bff03354d1250af1c0b89e88189fd1..d90fe5704007fcff4f23fa84a0c0e858beca0da3 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -30,12 +30,16 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/bcast.h"
+#include "tensorflow/core/util/saved_tensor_slice_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -94,14 +98,52 @@ class DeviceSimple : public DeviceBase {
   std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
 };
 
+template <typename T>
+bool AllValuesAre(const TensorProto& tensor, const T& value) {
+  // TensorProto represents the content of the tensor in either <type>_val or
+  // tensor_content.
+  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
+      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
+  if (!tensor_values->empty()) {
+    for (const T& tensor_value : *tensor_values) {
+      if (tensor_value != value) {
+        return false;
+      }
+    }
+    return true;
+  }
+  const auto tensor_content_size = tensor.tensor_content().size();
+  if (tensor_content_size > 0) {
+    CHECK_EQ(0, tensor_content_size % sizeof(T));
+    std::vector<T> raw_values(tensor_content_size / sizeof(T));
+    port::CopyToArray(tensor.tensor_content(),
+                      reinterpret_cast<char*>(raw_values.data()));
+    for (int i = 0; i < tensor_content_size / sizeof(T); ++i) {
+      if (raw_values[i] != value) {
+        return false;
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 }  // namespace
-ConstantFolding::ConstantFolding(DeviceBase* cpu_device)
-    : cpu_device_(cpu_device) {
+
+ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
+                                 DeviceBase* cpu_device)
+    : opt_level_(opt_level), cpu_device_(cpu_device) {
   resource_mgr_.reset(new ResourceMgr());
 }
 
-string ConstantFolding::AddControlDependency(const string& input_name) {
-  const NodeDef* node = node_map_->GetNode(input_name);
+ConstantFolding::ConstantFolding(DeviceBase* cpu_device)
+    : ConstantFolding(RewriterConfig::ON, cpu_device) {}
+
+// static
+string ConstantFolding::AddControlDependency(const string& input_name,
+                                             GraphDef* graph,
+                                             NodeMap* node_map) {
+  const NodeDef* node = node_map->GetNode(input_name);
   if (!IsSwitch(*node)) {
     return AsControlDependency(*node);
   } else {
@@ -111,10 +153,9 @@ string ConstantFolding::AddControlDependency(const string& input_name) {
     // dependency is only triggered when the corresponding output is triggered.
     // We start by looking for an identity node connected to the output of the
     // switch node, and use it to anchor the control dependency.
-    auto outputs = node_map_->GetOutputs(node->name());
+    auto outputs = node_map->GetOutputs(node->name());
     for (const NodeDef* node : outputs) {
       if (IsIdentity(*node)) {
-        CHECK_EQ(1, node->input_size());
         if (IsSameInput(node->input(0), input_name)) {
           return AsControlDependency(*node);
         }
@@ -122,114 +163,423 @@ string ConstantFolding::AddControlDependency(const string& input_name) {
     }
     // We haven't found an existing node where we can anchor the control
     // dependency: add a new identity node.
-    int position = 0;
-    string ctrl_dep_name = ParseNodeName(input_name, &position);
-    strings::StrAppend(&ctrl_dep_name, "_", position);
+    int port = 0;
+    string ctrl_dep_name = ParseNodeName(input_name, &port);
+    strings::StrAppend(&ctrl_dep_name, "_", port);
     ctrl_dep_name = AddPrefixToNodeName(ctrl_dep_name, kConstantFoldingCtrl);
     const DataType output_type = node->attr().at("T").type();
 
-    NodeDef* added_node = graph_.add_node();
+    NodeDef* added_node = graph->add_node();
     added_node->set_name(ctrl_dep_name);
     added_node->set_op("Identity");
     added_node->set_device(node->device());
 
     (*added_node->mutable_attr())["T"].set_type(output_type);
     *added_node->add_input() = input_name;
-    node_map_->AddNode(added_node->name(), added_node);
-    node_map_->AddOutput(node->name(), added_node->name());
+    node_map->AddNode(added_node->name(), added_node);
+    node_map->AddOutput(node->name(), added_node->name());
     return AsControlDependency(*added_node);
   }
 }
 
-Status ConstantFolding::MaterializeShapes(const GrapplerItem& item,
-                                          const GraphProperties& properties) {
+Status ConvertShapeToConstant(const string& op, const DataType& type,
+                              const PartialTensorShape& shp, Tensor* value) {
+  if (op == "Shape" || op == "ShapeN") {
+    *value = Tensor(type, TensorShape({shp.dims()}));
+    for (int i = 0; i < shp.dims(); ++i) {
+      if (type == DT_INT32) {
+        if (shp.dim_size(i) >= INT_MAX) {
+          return Status(error::INVALID_ARGUMENT, "Invalid dimension size");
+        }
+        value->flat<int32>()(i) = shp.dim_size(i);
+      } else {
+        value->flat<int64>()(i) = shp.dim_size(i);
+      }
+    }
+  } else if (op == "Size") {
+    int64 size = 1;
+    for (int i = 0; i < shp.dims(); ++i) {
+      size *= shp.dim_size(i);
+    }
+    *value = Tensor(type, TensorShape({}));
+    if (type == DT_INT32) {
+      if (size >= INT_MAX) {
+        return Status(error::INVALID_ARGUMENT, "Invalid dimension size");
+      }
+      value->flat<int32>()(0) = size;
+    } else {
+      value->flat<int64>()(0) = size;
+    }
+  } else {
+    *value = Tensor(type, TensorShape({}));
+    if (type == DT_INT32) {
+      if (shp.dims() >= INT_MAX) {
+        return Status(error::INVALID_ARGUMENT, "Invalid dimension size");
+      }
+      value->flat<int32>()(0) = shp.dims();
+    } else {
+      value->flat<int64>()(0) = shp.dims();
+    }
+  }
+  return Status::OK();
+}
+
+bool ConstantFolding::IsReallyConstant(const NodeDef& node) const {
+  if (!IsConstant(node)) {
+    return false;
+  }
+  // If the node is fed it's not constant anymore.
+  return feed_nodes_.find(node.name()) == feed_nodes_.end();
+}
+
+Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
   // We may add some nodes to the graph to encode control dependencies: there is
   // no need to process these, so only iterate over the nodes of the input
   // graph.
-  const int node_count = graph_.node_size();
+  const int node_count = graph_->node_size();
   for (int i = 0; i < node_count; ++i) {
-    NodeDef& node = *graph_.mutable_node(i);
+    NodeDef& node = *graph_->mutable_node(i);
     const string op = node.op();
-    if (op != "Shape" && op != "Size" && op != "Rank") {
+    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
       continue;
     }
-    std::vector<OpInfo::TensorProperties> output =
-        properties.GetOutputProperties(node.name());
-    CHECK_EQ(1, output.size());
-    const DataType type = output[0].dtype();
-    CHECK(type == DT_INT32 || type == DT_INT64);
 
-    std::vector<OpInfo::TensorProperties> input =
+    const std::vector<OpInfo::TensorProperties>& output =
+        properties.GetOutputProperties(node.name());
+    const std::vector<OpInfo::TensorProperties>& input =
         properties.GetInputProperties(node.name());
-    CHECK_EQ(1, input.size());
-
-    const TensorShapeProto shape = input[0].shape();
-    // Materialize the shapes using constants whenever possible.
-    PartialTensorShape shp(shape);
-    if (shp.IsFullyDefined() || (!shp.unknown_rank() && op == "Rank")) {
-      bool valid = true;
-      Tensor value(type);
-      if (op == "Shape") {
-        value = Tensor(type, TensorShape({shp.dims()}));
-        for (int i = 0; i < shp.dims(); ++i) {
-          if (type == DT_INT32) {
-            if (shp.dim_size(i) >= INT_MAX) {
-              valid = false;
-              break;
-            }
-            value.flat<int32>()(i) = shp.dim_size(i);
-          } else {
-            value.flat<int64>()(i) = shp.dim_size(i);
-          }
-        }
-      } else if (op == "Size") {
-        int64 size = 1;
-        for (int i = 0; i < shp.dims(); ++i) {
-          size *= shp.dim_size(i);
+    if (input.empty() || output.empty()) {
+      continue;
+    }
+    if (op == "Shape" || op == "Size" || op == "Rank") {
+      CHECK_EQ(1, output.size());
+      CHECK_EQ(1, input.size());
+    }
+    CHECK_EQ(input.size(), output.size());
+
+    for (int j = 0; j < output.size(); ++j) {
+      const DataType type = output[j].dtype();
+      CHECK(type == DT_INT32 || type == DT_INT64);
+      const TensorShapeProto shape = input[j].shape();
+      // Materialize the shapes using constants whenever possible.
+      PartialTensorShape shp(shape);
+      if (shp.IsFullyDefined() || (!shp.unknown_rank() && op == "Rank")) {
+        Tensor value(type);
+        auto status = ConvertShapeToConstant(op, type, shp, &value);
+        if (!status.ok()) {
+          continue;
         }
-        value = Tensor(type, TensorShape({}));
-        if (type == DT_INT32) {
-          if (size >= INT_MAX) {
-            valid = false;
-          } else {
-            value.flat<int32>()(0) = size;
-          }
+        // We rewrite the existing node for the first const output and
+        // create new nodes for the remaining const outputs (Note that ShapeN
+        // could have multiple outputs).
+        if (op == "Shape" || op == "Size" || op == "Rank") {
+          // Replace the node with the corresponding constant.
+          node.set_op("Const");
+          node.clear_attr();
+          (*node.mutable_attr())["dtype"].set_type(type);
+          value.AsProtoTensorContent(
+              (*node.mutable_attr())["value"].mutable_tensor());
+
+          // Turn the data input into a control dependency: this is needed to
+          // ensure that the constant value will only be run in the
+          // cases where the shape/rank/size would have been run in
+          // the original graph. Additional inputs are extra control
+          string ctrl_dep =
+              AddControlDependency(node.input(0), graph_, node_map_.get());
+          node.set_input(0, ctrl_dep);
+          node_map_->AddOutput(NodeName(ctrl_dep), node.name());
         } else {
-          value.flat<int64>()(0) = size;
-        }
-      } else {
-        value = Tensor(type, TensorShape({}));
-        if (type == DT_INT32) {
-          if (shp.dims() >= INT_MAX) {
-            valid = false;
-          } else {
-            value.flat<int32>()(0) = shp.dims();
+          auto outputs = node_map_->GetOutputs(node.name());
+          for (const auto& output : outputs) {
+            for (int k = 0; k < output->input_size(); ++k) {
+              int port;
+              string node_name = ParseNodeName(output->input(k), &port);
+              if (node_name == node.name() && port == j) {
+                // Create a const node as ShapeN's output if not already.
+                string const_name =
+                    AddPrefixToNodeName(strings::StrCat(node.name(), "-", j),
+                                        kConstantFoldingConst);
+                if (node_map_->GetNode(const_name) == nullptr) {
+                  NodeDef* added_node = graph_->add_node();
+                  added_node->set_name(const_name);
+                  added_node->set_op("Const");
+                  added_node->set_device(node.device());
+                  node_map_->AddNode(added_node->name(), added_node);
+                  (*added_node->mutable_attr())["dtype"].set_type(type);
+                  value.AsProtoTensorContent(
+                      (*added_node->mutable_attr())["value"].mutable_tensor());
+                  // We add a control dependency to the original ShapeN node,
+                  // so that the node will only be run if all inputs of the
+                  // original ShapeN node are run.
+                  string ctrl_dep = AddControlDependency(node.name(), graph_,
+                                                         node_map_.get());
+                  *added_node->add_input() = ctrl_dep;
+                  node_map_->AddOutput(NodeName(ctrl_dep), added_node->name());
+                }
+                node_map_->UpdateInput(output->name(),
+                                       NodeName(output->input(k)), const_name);
+                *output->mutable_input(k) = const_name;
+              }
+            }
           }
-        } else {
-          value.flat<int64>()(0) = shp.dims();
         }
       }
+    }
+  }
+  return Status::OK();
+}
 
-      if (valid) {
-        // Replace the node with the corresponding constant.
-        node.set_op("Const");
-        node.clear_attr();
-        (*node.mutable_attr())["dtype"].set_type(type);
-        value.AsProtoTensorContent(
-            (*node.mutable_attr())["value"].mutable_tensor());
+namespace {
+bool ShapesEqual(const TensorShapeProto& shape1,
+                 const TensorShapeProto& shape2) {
+  if (shape1.unknown_rank() || shape2.unknown_rank()) {
+    return false;
+  }
+  if (shape1.dim_size() != shape2.dim_size()) {
+    return false;
+  }
+  for (int i = 0; i < shape1.dim_size(); ++i) {
+    if (shape1.dim(i).size() != shape2.dim(i).size()) {
+      return false;
+    }
+    if (shape1.dim(i).size() == -1 || shape2.dim(i).size() == -1) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ExtractShape(const NodeDef& shape_node, const GraphProperties& properties,
+                  BCast::Vec* shape, int64* min_id) {
+  if (shape_node.op() == "Shape") {
+    const std::vector<OpInfo::TensorProperties>& prop1 =
+        properties.GetInputProperties(shape_node.name());
+    if (prop1.size() != 1) {
+      return false;
+    }
+    const TensorShapeProto& shp = prop1[0].shape();
+    if (shp.unknown_rank()) {
+      return false;
+    }
+    for (const auto& dim : shp.dim()) {
+      shape->push_back(dim.size());
+      *min_id = std::min<int64>(*min_id, dim.size());
+    }
+  } else {
+    const TensorProto& raw_val = shape_node.attr().at("value").tensor();
+    if (raw_val.dtype() != DT_INT64 && raw_val.dtype() != DT_INT32) {
+      return false;
+    }
+    Tensor value(raw_val.dtype(), raw_val.tensor_shape());
+    if (!value.FromProto(raw_val)) {
+      return false;
+    }
+    for (int j = 0; j < value.NumElements(); ++j) {
+      if (raw_val.dtype() == DT_INT64) {
+        shape->push_back(value.vec<int64>()(j));
+      } else {
+        shape->push_back(value.vec<int>()(j));
+      }
+    }
+  }
+  return true;
+}
+}  // namespace
+
+Status ConstantFolding::MaterializeBroadcastGradientArgs(
+    const NodeDef& node, const GraphProperties& properties) {
+  const NodeDef* shape_node1 = node_map_->GetNode(node.input(0));
+  const NodeDef* shape_node2 = node_map_->GetNode(node.input(1));
+  if (shape_node1 == nullptr ||
+      (shape_node1->op() != "Shape" && !IsReallyConstant(*shape_node1)) ||
+      shape_node2 == nullptr ||
+      (shape_node2->op() != "Shape" && !IsReallyConstant(*shape_node2))) {
+    return Status::OK();
+  }
+
+  int64 min_id = 0;
+  BCast::Vec shape1;
+  if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
+    return Status::OK();
+  }
+  BCast::Vec shape2;
+  if (!ExtractShape(*shape_node2, properties, &shape2, &min_id)) {
+    return Status::OK();
+  }
+  // A value of -1 means we don't known anything about the dimension. Replace
+  // the -1 values with unique dimension ids since we don't want two '-1'
+  // dimensions to be considered equal.
+  for (auto& id : shape1) {
+    if (id == -1) {
+      id = --min_id;
+    }
+  }
+  for (auto& id : shape2) {
+    if (id == -1) {
+      id = --min_id;
+    }
+  }
+  BCast bcast(shape1, shape2);
+  if (!bcast.IsValid()) {
+    return Status::OK();
+  }
+  BCast::Vec reduce_dims[2];
+  reduce_dims[0] = bcast.grad_x_reduce_idx();
+  reduce_dims[1] = bcast.grad_y_reduce_idx();
+
+  const DataType type = node.attr().at("T").type();
+  NodeDef* out[2];
+  for (int j = 0; j < 2; ++j) {
+    if (!reduce_dims[j].empty()) {
+      // This is the case when a tensor dimension of 1 is matched against an
+      // unknown dimension. The unknown dimension could also be equal to 1, in
+      // which case there would be no reduction.
+      out[j] = nullptr;
+    } else {
+      string const_name = AddPrefixToNodeName(
+          strings::StrCat(node.name(), "-", j), kConstantFoldingConst);
+      out[j] = node_map_->GetNode(const_name);
+      if (out[j] == nullptr) {
+        out[j] = graph_->add_node();
+        Tensor value(type, TensorShape({0}));
+        *out[j] = CreateNodeDef(const_name, TensorValue(&value));
+        out[j]->set_device(node.device());
+        node_map_->AddNode(const_name, out[j]);
+        string ctrl_dep =
+            AddControlDependency(node.name(), graph_, node_map_.get());
+        *out[j]->add_input() = ctrl_dep;
+        node_map_->AddOutput(NodeName(ctrl_dep), const_name);
+      }
+    }
+  }
 
-        // Turn the data input into a control dependency: this is needed to
-        // ensure that the constant value will only be generated in the cases
-        // where the shape/rank/size would have been generated in the original
-        // graph. Additional inputs are extra control dependencies that we
-        // preserve.
-        CHECK_LE(1, node.input_size());
-        string ctrl_dep = AddControlDependency(node.input(0));
-        node.set_input(0, ctrl_dep);
-        node_map_->AddOutput(NodeName(ctrl_dep), node.name());
+  const std::set<NodeDef*> outputs = node_map_->GetOutputs(node.name());
+  for (NodeDef* output : outputs) {
+    for (int k = 0; k < output->input_size(); ++k) {
+      int port;
+      string node_name = ParseNodeName(output->input(k), &port);
+      if (node_name == node.name() && port >= 0 && port < 2 && out[port]) {
+        *output->mutable_input(k) = out[port]->name();
+        node_map_->UpdateInput(output->name(), node_name, out[port]->name());
       }
     }
   }
+
+  return Status::OK();
+}
+
+Status ConstantFolding::MaterializeReductionIndices(
+    NodeDef* node, const GraphProperties& properties) {
+  if (node->input_size() < 2) {
+    return Status::OK();
+  }
+  const NodeDef* indices = node_map_->GetNode(node->input(1));
+  if (!indices || IsReallyConstant(*indices)) {
+    // The reduction indices are already constant, there's nothing to do.
+    return Status::OK();
+  }
+
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      properties.GetInputProperties(node->name());
+  if (input_props.size() != 2) {
+    return Status::OK();
+  }
+  const OpInfo::TensorProperties& input_prop = input_props[0];
+  if (input_prop.shape().unknown_rank()) {
+    // We can't do anything if we don't know the rank of the input.
+    return Status::OK();
+  }
+  const int rank = input_prop.shape().dim_size();
+  if (rank == 0) {
+    // Unexpected graph, don't try to change it.
+    return Status::OK();
+  }
+  const std::vector<OpInfo::TensorProperties>& output_props =
+      properties.GetOutputProperties(node->name());
+  if (output_props.size() != 1) {
+    return Status::OK();
+  }
+  const bool keep_dims =
+      node->attr().count("keep_dims") && node->attr().at("keep_dims").b();
+  const OpInfo::TensorProperties& output_prop = output_props[0];
+  PartialTensorShape output_shape(output_prop.shape());
+  if (output_shape.num_elements() != 1) {
+    bool full_reduction = false;
+    for (const NodeDef* fanout : node_map_->GetOutputs(node->name())) {
+      if (!IsReshape(*fanout) && !keep_dims) {
+        // Depending on how it's setup, a full reduction will generate a tensor
+        // of shape [], [1], [1, 1], [1, 1, ...]. If keep_dims isn't true, we
+        // rely on the existence of a reshape node following the reduction to
+        // ensure that the fanout is fed a scalar of the right shape.
+        return Status::OK();
+      }
+      const std::vector<OpInfo::TensorProperties>& reshape_props =
+          properties.GetOutputProperties(fanout->name());
+      if (reshape_props.size() != 1) {
+        return Status::OK();
+      }
+      const OpInfo::TensorProperties& reshape_prop = reshape_props[0];
+      PartialTensorShape shape(reshape_prop.shape());
+      if (shape.num_elements() != 1) {
+        return Status::OK();
+      } else {
+        full_reduction = true;
+      }
+    }
+    if (!full_reduction) {
+      return Status::OK();
+    }
+  }
+
+  const OpInfo::TensorProperties& reduction_prop = input_props[1];
+  DataType dtype = reduction_prop.dtype();
+  if (dtype != DT_INT32 && dtype != DT_INT64) {
+    return Status::OK();
+  }
+  // We know it's a full reduction. We can generate the set of indices to
+  // reduce.
+  string const_name =
+      AddPrefixToNodeName(strings::StrCat(node->name(), "-reduction_indices"),
+                          kConstantFoldingConst);
+  if (node_map_->GetNode(const_name)) {
+    return Status::OK();
+  }
+  NodeDef* reduction_indices = graph_->add_node();
+  Tensor value(dtype, TensorShape({rank}));
+  for (int i = 0; i < rank; ++i) {
+    if (dtype == DT_INT32) {
+      value.vec<int32>()(i) = i;
+    } else {
+      value.vec<int64>()(i) = i;
+    }
+  }
+  *reduction_indices = CreateNodeDef(const_name, TensorValue(&value));
+  reduction_indices->set_device(node->device());
+  string ctrl_dep =
+      AddControlDependency(node->input(1), graph_, node_map_.get());
+  *reduction_indices->add_input() = ctrl_dep;
+  node_map_->AddNode(const_name, reduction_indices);
+  node_map_->AddOutput(NodeName(ctrl_dep), const_name);
+
+  node->set_input(1, reduction_indices->name());
+  node_map_->UpdateInput(node->name(), indices->name(),
+                         reduction_indices->name());
+
+  return Status::OK();
+}
+
+Status ConstantFolding::MaterializeConstants(
+    const GraphProperties& properties) {
+  const int node_count = graph_->node_size();
+  for (int i = 0; i < node_count; ++i) {
+    NodeDef& node = *graph_->mutable_node(i);
+    const string& op = node.op();
+    if (op == "BroadcastGradientArgs") {
+      TF_RETURN_IF_ERROR(MaterializeBroadcastGradientArgs(node, properties));
+    } else if (IsReduction(node)) {
+      TF_RETURN_IF_ERROR(MaterializeReductionIndices(&node, properties));
+    }
+  }
   return Status::OK();
 }
 
@@ -238,24 +588,23 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (node.input().empty()) {
     return false;
   }
-
   // Skips nodes that must be preserved except whitelisted nodes.
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end() &&
       nodes_whitelist_.find(node.name()) == nodes_whitelist_.end()) {
     return false;
   }
-
-  // Skips ops that don't benefit from folding.
-  const string& op = node.op();
-  // Skip constants, they're already folded
-  if (op == "Const") {
+  // Skip control flow nodes, they can't be folded
+  if (ModifiesFrameInfo(node)) {
     return false;
   }
-  // Skip constrol flow nodes, they can't be folded
-  if (op == "Enter" || op == "RefEnter" || op == "Exit" || op == "RefExit" ||
-      op == "NextIteration" || op == "RefNextIteration") {
+  // Skip constants, they're already folded
+  if (IsConstant(node)) {
     return false;
   }
+
+  // Skips ops that don't benefit from folding.
+  const string& op = node.op();
+
   if (op.find("Placeholder") == 0) {
     return false;
   }
@@ -309,7 +658,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
     if (!input_node) {
       return false;
     }
-    bool is_const = IsConstant(*input_node);
+    bool is_const = IsReallyConstant(*input_node);
     if (!is_const && !is_merge) {
       return false;
     }
@@ -327,6 +676,38 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   return true;
 }
 
+namespace {
+
+#define SET_TENSOR_VAL_CASE(DTYPE, TYPE, NAME)     \
+  case DTYPE:                                      \
+    t->add_##NAME##_val(static_cast<TYPE>(value)); \
+    break;
+
+Status CreateConstantTensorAttrValue(DataType type, double value,
+                                     const TensorShapeProto& shape,
+                                     AttrValue* attr_tensor) {
+  TensorProto* t = attr_tensor->mutable_tensor();
+  t->set_dtype(type);
+  *t->mutable_tensor_shape() = shape;
+  switch (type) {
+    SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
+    SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
+    SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
+    SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
+    SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
+    SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
+    SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+    SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
+    default:
+      return errors::InvalidArgument("Unsupported type: ", type);
+  }
+  return Status::OK();
+}
+
+#undef SET_TENSOR_CAL_CASE
+}  // namespace
+
+// static
 NodeDef ConstantFolding::CreateNodeDef(const string& name,
                                        const TensorValue& tensor) {
   NodeDef node;
@@ -366,6 +747,14 @@ NodeDef ConstantFolding::CreateNodeDef(const string& name,
       POPULATE_TENSOR_PROTO(tensor, t, int64, int64)
     } else if (tensor->dtype() == DT_INT32) {
       POPULATE_TENSOR_PROTO(tensor, t, int32, int)
+    } else if (tensor->dtype() == DT_INT16) {
+      POPULATE_TENSOR_PROTO(tensor, t, int16, int)
+    } else if (tensor->dtype() == DT_INT8) {
+      POPULATE_TENSOR_PROTO(tensor, t, int8, int)
+    } else if (tensor->dtype() == DT_UINT8) {
+      POPULATE_TENSOR_PROTO(tensor, t, uint8, int)
+    } else if (tensor->dtype() == DT_BOOL) {
+      POPULATE_TENSOR_PROTO(tensor, t, bool, bool)
     }
   }
   if (optimized) {
@@ -427,14 +816,14 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   });
 
   for (const auto& input : node.input()) {
-    int position = 0;
-    ParseNodeName(input, &position);
-    if (position < 0) {
+    int port = 0;
+    ParseNodeName(input, &port);
+    if (port < 0) {
       // Control dependency
       break;
     }
     const NodeDef* input_node = node_map_->GetNode(input);
-    if (!IsConstant(*input_node)) {
+    if (!IsReallyConstant(*input_node)) {
       return Status(error::INVALID_ARGUMENT,
                     strings::StrCat("Can't fold ", node.name(), ", its ", input,
                                     " isn't constant"));
@@ -488,7 +877,7 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
         continue;
       }
       NodeDef* input_node = node_map_->GetNode(input);
-      if (!IsConstant(*input_node)) {
+      if (!IsReallyConstant(*input_node)) {
         continue;
       }
       bool valid_input = true;
@@ -539,13 +928,13 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
       auto outputs = node_map_->GetOutputs(node->name());
       for (auto& output : outputs) {
         for (int i = 0; i < output->input_size(); i++) {
-          int position;
-          string node_name = ParseNodeName(output->input(i), &position);
+          int port;
+          string node_name = ParseNodeName(output->input(i), &port);
           if (node_name == node->name()) {
-            if (position == 0) {
+            if (port == 0) {
               *output->mutable_input(i) = const_out->name();
               node_map_->AddOutput(const_out->name(), output->name());
-            } else if (position == 1) {
+            } else if (port == 1) {
               *output->mutable_input(i) = const_index->name();
               node_map_->AddOutput(const_index->name(), output->name());
             } else {
@@ -630,10 +1019,10 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
     auto outputs = node_map_->GetOutputs(node->name());
     for (const auto& output : outputs) {
       for (int i = 0; i < output->input_size(); i++) {
-        int position;
-        string node_name = ParseNodeName(output->input(i), &position);
+        int port;
+        string node_name = ParseNodeName(output->input(i), &port);
         if (node_name == node->name()) {
-          if (position < 0) {
+          if (port < 0) {
             // Propagate control dependencies if possible. If not, we'll just
             // preserve the existing control dependencies.
             if (constant_output != nullptr) {
@@ -641,17 +1030,17 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
                                      constant_output->name());
               *output->mutable_input(i) = AsControlDependency(*constant_output);
             }
-          } else if (position < const_nodes.size() &&
-                     !const_nodes[position].name().empty()) {
+          } else if (port < const_nodes.size() &&
+                     !const_nodes[port].name().empty()) {
             // Replace alive outputs with the corresponding constant.
             node_map_->UpdateInput(output->name(), NodeName(output->input(i)),
-                                   const_nodes[position].name());
-            *output->mutable_input(i) = const_nodes[position].name();
+                                   const_nodes[port].name());
+            *output->mutable_input(i) = const_nodes[port].name();
           } else {
             // Leave this edge alone.
-            VLOG(1) << "Preserving edge from " << node->name() << ":"
-                    << position << "[" << node->op() << "] to "
-                    << output->name() << ":" << i << "[" << output->op() << "]";
+            VLOG(1) << "Preserving edge from " << node->name() << ":" << port
+                    << "[" << node->op() << "] to " << output->name() << ":"
+                    << i << "[" << output->op() << "]";
           }
         }
       }
@@ -669,8 +1058,8 @@ Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph) {
 Status ConstantFolding::FoldGraph(GraphDef* output) {
   std::unordered_set<string> processed_nodes;
   std::deque<NodeDef*> queue;
-  for (int i = 0; i < graph_.node_size(); i++) {
-    auto node = graph_.mutable_node(i);
+  for (int i = 0; i < graph_->node_size(); i++) {
+    auto node = graph_->mutable_node(i);
     if (IsFoldable(*node)) {
       queue.push_back(node);
     }
@@ -683,6 +1072,7 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
     }
     // We need to record a copy of output nodes before FoldNode() modifies it.
     std::set<NodeDef*> outputs = node_map_->GetOutputs(node->name());
+
     Status s = FoldNode(node, output);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
@@ -709,7 +1099,7 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   output->mutable_node()->DeleteSubrange(last + 1,
                                          output->node_size() - last - 1);
 
-  for (const auto& node : graph_.node()) {
+  for (const auto& node : graph_->node()) {
     // If no fetch nodes is provided, we conservatively
     // keep all nodes in the original graph in case users need to fetch
     // their values.
@@ -730,7 +1120,7 @@ bool ConstantFolding::IsSimplifiableReduction(const NodeDef& node) const {
   if (IsReduction(node)) {
     CHECK_LE(2, node.input_size());
     const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
-    if (IsConstant(*reductions_indices)) {
+    if (IsReallyConstant(*reductions_indices)) {
       TensorVector output;
       Status s = EvaluateNode(*reductions_indices, TensorVector(), &output);
       if (!s.ok()) {
@@ -754,7 +1144,7 @@ bool ConstantFolding::IsSimplifiableReshape(
   }
   CHECK_LE(2, node.input_size());
   const NodeDef* new_shape = node_map_->GetNode(node.input(1));
-  if (!IsConstant(*new_shape)) {
+  if (!IsReallyConstant(*new_shape)) {
     return false;
   }
   TensorVector outputs;
@@ -804,58 +1194,270 @@ bool ConstantFolding::IsSimplifiableReshape(
   return shape.IsCompatibleWith(new_dims);
 }
 
+#define IS_VALUE_CASE(DTYPE, VALUE)                   \
+  case DTYPE:                                         \
+    return AllValuesAre<EnumToDataType<DTYPE>::Type>( \
+        node.attr().at("value").tensor(), EnumToDataType<DTYPE>::Type(VALUE))
+
+#define IS_ONES_CASE(TYPE) IS_VALUE_CASE(TYPE, 1)
+#define IS_ZEROS_CASE(TYPE) IS_VALUE_CASE(TYPE, 0)
+
+bool ConstantFolding::IsOnes(const NodeDef& node) const {
+  if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
+    return false;
+  }
+  if (node.op() == "OnesLike") {
+    return true;
+  }
+  if (node.op() != "Const") {
+    return false;
+  }
+  const auto dtype = node.attr().at("dtype").type();
+  switch (dtype) {
+    //    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_FLOAT);
+    IS_ONES_CASE(DT_DOUBLE);
+    IS_ONES_CASE(DT_UINT8);
+    IS_ONES_CASE(DT_INT8);
+    IS_ONES_CASE(DT_UINT16);
+    IS_ONES_CASE(DT_INT16);
+    IS_ONES_CASE(DT_INT32);
+    IS_ONES_CASE(DT_INT64);
+    IS_ONES_CASE(DT_COMPLEX64);
+    IS_ONES_CASE(DT_COMPLEX128);
+    default:
+      LOG(ERROR) << "Unexpected type " << DataTypeString(dtype);
+      return false;
+  }
+  return false;
+}
+
+bool ConstantFolding::IsZeros(const NodeDef& node) const {
+  if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
+    return false;
+  }
+  if (node.op() == "ZerosLike") {
+    return true;
+  }
+  if (!IsConstant(node)) {
+    return false;
+  }
+  const auto dtype = node.attr().at("dtype").type();
+  switch (dtype) {
+    //    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_FLOAT);
+    IS_ZEROS_CASE(DT_DOUBLE);
+    IS_ZEROS_CASE(DT_UINT8);
+    IS_ZEROS_CASE(DT_INT8);
+    IS_ZEROS_CASE(DT_UINT16);
+    IS_ZEROS_CASE(DT_INT16);
+    IS_ZEROS_CASE(DT_INT32);
+    IS_ZEROS_CASE(DT_INT64);
+    IS_ZEROS_CASE(DT_COMPLEX64);
+    IS_ZEROS_CASE(DT_COMPLEX128);
+    default:
+      LOG(ERROR) << "Unexpected type " << DataTypeString(dtype);
+      return false;
+  }
+  return false;
+}
+
+void ConstantFolding::ReplaceOperationWithIdentity(int input_to_forward,
+                                                   NodeDef* node) {
+  node->set_op("Identity");
+  // Propagate the designated input through the identity.
+  node->mutable_input()->SwapElements(0, input_to_forward);
+  // Add all other inputs as control dependencies.
+  for (int i = 1; i < node->input_size(); ++i) {
+    node->set_input(i, AsControlDependency(node->input(i)));
+  }
+  graph_modified_ = true;
+}
+
+void ConstantFolding::ReplaceDivisionOfOnesByReciprocal(NodeDef* node) {
+  node->set_op("Reciprocal");
+  node->mutable_input()->SwapElements(0, 1);
+  node->set_input(1, AsControlDependency(node->input(1)));
+  graph_modified_ = true;
+}
+
+Status ConstantFolding::ReplaceOperationWithConstant(
+    double value, const TensorShapeProto& shape, NodeDef* node) {
+  AttrValue tensor_attr;
+  AttrValue dtype_attr = node->attr().at("T");
+  TF_RETURN_IF_ERROR(CreateConstantTensorAttrValue(dtype_attr.type(), value,
+                                                   shape, &tensor_attr));
+  node->clear_attr();
+  node->mutable_attr()->insert({"dtype", dtype_attr});
+  node->mutable_attr()->insert({"value", tensor_attr});
+  node->set_op("Const");
+  // Convert all inputs to control dependencies.
+  for (int i = 0; i < node->input_size(); ++i) {
+    if (IsControlInput(node->input(i))) {
+      break;
+    }
+    node->set_input(i, AsControlDependency(node->input(i)));
+  }
+  graph_modified_ = true;
+  return Status::OK();
+}
+
 Status ConstantFolding::SimplifyGraph(GraphDef* output,
-                                      const GraphProperties& properties) {
-  for (auto& node : *output->mutable_node()) {
-    if (IsSimplifiableReduction(node)) {
+                                      const GraphProperties& properties,
+                                      bool use_shape_info) {
+  const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE;
+  for (int i = 0; i < output->node_size(); ++i) {
+    NodeDef* node = output->mutable_node(i);
+    if (IsSimplifiableReduction(*node)) {
       // Replace the reduction node with an identity node, that can be further
       // optimized by the model pruner.
-      const NodeDef* reductions_indices = node_map_->GetNode(node.input(1));
       DataType output_type;
-      if (node.attr().count("T") > 0) {
-        output_type = node.attr().at("T").type();
+      if (node->attr().count("T") > 0) {
+        output_type = node->attr().at("T").type();
       } else {
         // This is an 'any' or 'all' reduction. The output is always boolean.
         output_type = DT_BOOL;
       }
-      node.set_op("Identity");
-      node.clear_attr();
-      (*node.mutable_attr())["T"].set_type(output_type);
-      if (node.input_size() > 2) {
-        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+      node->set_op("Identity");
+      node->clear_attr();
+      (*node->mutable_attr())["T"].set_type(output_type);
+      *node->mutable_input(1) = AsControlDependency(node->input(1));
+      continue;
+    }
+    const bool safe_to_use_shapes =
+        use_shape_info && (feed_nodes_.empty() || is_aggressive);
+    if (safe_to_use_shapes && IsSimplifiableReshape(*node, properties)) {
+      DataType output_type = node->attr().at("T").type();
+      node->set_op("Identity");
+      node->clear_attr();
+      (*node->mutable_attr())["T"].set_type(output_type);
+      *node->mutable_input(1) = AsControlDependency(node->input(1));
+      continue;
+    }
+
+    const bool is_mul = IsMul(*node);
+    const bool is_matmul = IsMatMul(*node);
+    const bool is_add = IsAdd(*node) || IsBiasAdd(*node);
+    const bool is_sub = IsSub(*node);
+    const bool is_any_div = IsAnyDiv(*node);
+    // Simplify multiplication by ones or zeros, and addition/subtraction of
+    // zeros.
+    if (use_shape_info &&
+        (is_mul || is_matmul || is_add || is_sub || is_any_div) &&
+        properties.HasInputProperties(node->name()) &&
+        properties.HasOutputProperties(node->name())) {
+      const NodeDef* x = node_map_->GetNode(node->input(0));
+      const NodeDef* y = node_map_->GetNode(node->input(1));
+      if (x == nullptr || y == nullptr) {
+        return errors::InvalidArgument("Invalid inputs to node: ",
+                                       node->DebugString());
+      }
+      const TensorShapeProto& output_shape =
+          properties.GetOutputProperties(node->name())[0].shape();
+
+      // Simplify element-wise  multiplication by ones or addition/subtraction
+      // of zeros.
+      const TensorShapeProto& y_shape =
+          properties.GetInputProperties(node->name())[1].shape();
+      const bool x_is_zero = IsZeros(*x);
+      const bool x_is_one = IsOnes(*x);
+      const bool y_matches_output_shape = ShapesEqual(output_shape, y_shape);
+      if (y_matches_output_shape &&
+          ((is_mul && x_is_one) || (is_add && x_is_zero))) {
+        // TODO(rmlarsen): Handle subtraction 0 - y.
+        // 1 * y = y or 0 + y = y.
+        ReplaceOperationWithIdentity(1, node);
+        continue;
       }
-      node.mutable_input()->RemoveLast();
-      for (const auto& input : reductions_indices->input()) {
-        DCHECK(IsControlInput(input));
-        *node.add_input() = input;
+
+      // Replace 1 / y with Reciprocal op.
+      if (y_matches_output_shape && is_any_div && x_is_one) {
+        ReplaceDivisionOfOnesByReciprocal(node);
+        continue;
+      }
+
+      const TensorShapeProto& x_shape =
+          properties.GetInputProperties(node->name())[0].shape();
+      const bool y_is_zero = IsZeros(*y);
+      const bool y_is_one = IsOnes(*y);
+      const bool x_matches_output_shape = ShapesEqual(output_shape, x_shape);
+      if (x_matches_output_shape &&
+          (((is_mul || is_any_div) && y_is_one) ||
+           ((is_add || is_sub) && y_is_zero && is_aggressive))) {
+        // x * 1 = x or x / 1 = x or x +/- 0 = x
+        ReplaceOperationWithIdentity(0, node);
+        continue;
+      }
+
+      // Simplify multiplication and matmul by zeros.
+      // Also optimize zeros divided by a tensor, but only if we are in
+      // aggressive mode, since we might get rid of divisions by zero.
+      bool optimize_zeros_divided_by_y =
+          is_any_div && x_is_zero && is_aggressive;
+      if ((x_is_zero || y_is_zero) &&
+          (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
+        const PartialTensorShape shp(output_shape);
+        if (shp.IsFullyDefined()) {
+          TF_RETURN_IF_ERROR(
+              ReplaceOperationWithConstant(0, output_shape, node));
+          continue;
+        }
+        // Even if an input shape is only partially known, we may known that it
+        // matches the output shape and thus forward the corresponding zero
+        // input.
+        if ((is_mul || is_any_div) && x_is_zero && x_matches_output_shape) {
+          ReplaceOperationWithIdentity(0, node);
+          continue;
+        } else if (is_mul && y_is_zero && y_matches_output_shape) {
+          ReplaceOperationWithIdentity(1, node);
+          continue;
+        }
       }
     }
-    // It's possible to feed a placeholder with a tensor that doesn't have the
-    // proper shape, and reshape this tensor later on. Therefore only remove
-    // reshapes in graphs that don't have placeholders.
-    if (IsSimplifiableReshape(node, properties)) {
-      const NodeDef* new_shape = node_map_->GetNode(node.input(1));
-      DataType output_type = node.attr().at("T").type();
-      node.set_op("Identity");
-      node.clear_attr();
-      (*node.mutable_attr())["T"].set_type(output_type);
-      if (node.input_size() > 2) {
-        node.mutable_input()->SwapElements(1, node.input_size() - 1);
+
+    // Strength reduce floating point division by a constant Div(x, const) to
+    // multiplication by the reciprocal Mul(x, Reciprocal(const)). This in turn
+    // will be constant folded to Mul(x, 1.0/const).
+    if (node->input_size() >= 2 && (IsRealDiv(*node) || IsDiv(*node))) {
+      const string& const_input = node->input(1);
+      const NodeDef* denom = node_map_->GetNode(const_input);
+      CHECK(denom != nullptr);
+      if (!IsReallyConstant(*denom)) {
+        continue;
       }
-      node.mutable_input()->RemoveLast();
-      for (const auto& input : new_shape->input()) {
-        DCHECK(IsControlInput(input));
-        *node.add_input() = input;
+      if (node->attr().count("T") == 0) {
+        continue;
+      }
+      DataType type = node->attr().at("T").type();
+      if (IsDiv(*node) && !DataTypeIsFloating(type)) {
+        continue;
       }
+      // Insert new reciprocal op and change node from Div to Mul.
+      NodeDef* reciprocal_node = output->add_node();
+      reciprocal_node->set_name(AddPrefixToNodeName(
+          strings::StrCat(node->name(), "_recip"), kConstantFoldingConst));
+      reciprocal_node->set_op("Reciprocal");
+      reciprocal_node->set_device(node->device());
+      node->set_op("Mul");
+      // Re-wire inputs and outputs.
+      reciprocal_node->add_input(const_input);
+      (*reciprocal_node->mutable_attr())["T"].set_type(type);
+      node->set_input(1, reciprocal_node->name());
+      node_map_->AddNode(reciprocal_node->name(), reciprocal_node);
+      node_map_->UpdateInput(node->name(), const_input,
+                             reciprocal_node->name());
+      node_map_->AddOutput(NodeName(const_input), reciprocal_node->name());
+      graph_modified_ = true;
     }
   }
+
   return Status::OK();
 }
 
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
                                             GraphDef* output) {
-  node_map_.reset(new NodeMap(&graph_));
+  node_map_.reset(new NodeMap(graph_));
   nodes_whitelist_.clear();
   // Fold fetch nodes iff it has a single fanout. Note that if a fetch node
   // has a single fanout, it would be rewritten as a constant with the same
@@ -865,38 +1467,38 @@ Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
   // new names, and as a result users would not be able to fetch the node any
   // more with the original node name.
   for (const auto& fetch : item.fetch) {
-    auto fetch_node = node_map_->GetNode(fetch);
-    if (NumOutputs(*fetch_node) == 1) {
+    const NodeDef* fetch_node = node_map_->GetNode(fetch);
+    if (fetch_node && NumOutputs(*fetch_node) == 1) {
       nodes_whitelist_.insert(fetch_node->name());
     }
   }
 
   GraphProperties properties(item);
-  bool has_feed = !item.feed.empty();
-  if (!has_feed) {
-    // Only use static shape information when there is no feed in the
-    // graph. That's because it's possible to feed a placeholder with a tensor
-    // of any shape, which could make the static information inconsistent with
-    // the shapes actually fed.
-    Status s = properties.InferStatically();
-    if (!s.ok()) {
-      VLOG(1) << "Failed to infer graph shapes: " << s;
-    } else {
-      TF_RETURN_IF_ERROR(MaterializeShapes(item, properties));
-    }
+  // It's possible to feed a placeholder with a tensor of any shape: make sure
+  // that the shape inference deals with this conservatively unless we're in
+  // aggressive mode.
+  const bool assume_valid_feeds = opt_level_ == RewriterConfig::AGGRESSIVE;
+  Status s = properties.InferStatically(assume_valid_feeds);
+  const bool can_use_shape_info = s.ok();
+
+  if (can_use_shape_info) {
+    TF_RETURN_IF_ERROR(MaterializeShapes(properties));
+    TF_RETURN_IF_ERROR(MaterializeConstants(properties));
   }
 
   TF_RETURN_IF_ERROR(FoldGraph(output));
+  node_map_.reset(new NodeMap(output));
+  TF_RETURN_IF_ERROR(SimplifyGraph(output, properties, can_use_shape_info));
 
-  if (!has_feed) {
-    TF_RETURN_IF_ERROR(SimplifyGraph(output, properties));
-  }
   return Status::OK();
 }
 
 Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
   nodes_to_preserve_ = item.NodesToPreserve();
+  for (const auto& feed : item.feed) {
+    feed_nodes_.insert(NodeName(feed.first));
+  }
 
   if (cpu_device_ == nullptr) {
     owned_device_.reset(new DeviceSimple());
@@ -907,13 +1509,15 @@ Status ConstantFolding::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   GrapplerItem item_to_optimize = item;
   *output = item.graph;
+  int64 node_count;
   do {
-    graph_.Swap(output);
-    item_to_optimize.graph = graph_;
+    graph_modified_ = false;
+    item_to_optimize.graph.Swap(output);
+    graph_ = &item_to_optimize.graph;
     *output = GraphDef();
+    node_count = graph_->node_size();
     TF_RETURN_IF_ERROR(RunOptimizationPass(cluster, item_to_optimize, output));
-  } while (output->node_size() < graph_.node_size());
-
+  } while (graph_modified_ || output->node_size() != node_count);
   *output->mutable_library() = item.graph.library();
   *output->mutable_versions() = item.graph.versions();
 
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index b115e51dbfda3a65d8913433de77f5d24fbbef78..db281dc98dae14db7b6c671b4bcb3cf73a37b069 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -32,7 +33,12 @@ const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl";
 // Constant folding optimization for a graph.
 class ConstantFolding : public GraphOptimizer {
  public:
+  static NodeDef CreateNodeDef(const string& name, const TensorValue& tensor);
+  static string AddControlDependency(const string& input_name, GraphDef* graph,
+                                     NodeMap* node_map);
+
   ConstantFolding(DeviceBase* cpu_device);
+  ConstantFolding(RewriterConfig::Toggle opt_level, DeviceBase* cpu_device);
 
   ~ConstantFolding() override {}
 
@@ -45,13 +51,17 @@ class ConstantFolding : public GraphOptimizer {
                 const GraphDef& optimize_output, double result) override;
 
  private:
-  string AddControlDependency(const string& input_name);
-  Status MaterializeShapes(const GrapplerItem& item,
-                           const GraphProperties& properties);
+  bool IsReallyConstant(const NodeDef& node) const;
 
-  bool IsFoldable(const NodeDef& node) const;
+  Status MaterializeShapes(const GraphProperties& properties);
 
-  NodeDef CreateNodeDef(const string& name, const TensorValue& tensor);
+  Status MaterializeBroadcastGradientArgs(const NodeDef& node,
+                                          const GraphProperties& properties);
+  Status MaterializeReductionIndices(NodeDef* node,
+                                     const GraphProperties& properties);
+
+  Status MaterializeConstants(const GraphProperties& properties);
+  bool IsFoldable(const NodeDef& node) const;
 
   Status EvaluateNode(const NodeDef& node,
                       const gtl::InlinedVector<TensorValue, 4>& inputs,
@@ -62,26 +72,37 @@ class ConstantFolding : public GraphOptimizer {
 
   Status FoldNode(NodeDef* node, GraphDef* output_graph);
 
+  bool IsOnes(const NodeDef& node) const;
+  bool IsZeros(const NodeDef& node) const;
+  void ReplaceOperationWithIdentity(int input_to_forward, NodeDef* node);
+  Status ReplaceOperationWithConstant(double value,
+                                      const TensorShapeProto& shape,
+                                      NodeDef* node);
+  void ReplaceDivisionOfOnesByReciprocal(NodeDef* node);
   Status FoldGraph(GraphDef* output);
 
   bool IsSimplifiableReduction(const NodeDef& node) const;
   bool IsSimplifiableReshape(const NodeDef& node,
                              const GraphProperties& properties) const;
-  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties);
+  Status SimplifyGraph(GraphDef* output, const GraphProperties& properties,
+                       bool use_shape_info);
 
   Status RunOptimizationPass(Cluster* cluster, const GrapplerItem& item,
                              GraphDef* output);
 
   // Points to an externally provided device or to owned_device_;
+  RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
   std::unique_ptr<DeviceBase> owned_device_;
 
   std::unique_ptr<ResourceMgr> resource_mgr_;
-  GraphDef graph_;
+  GraphDef* graph_;
   std::unique_ptr<NodeMap> node_map_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unordered_set<string> nodes_whitelist_;
+  std::unordered_set<string> feed_nodes_;
   bool has_fetch_;
+  bool graph_modified_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 183d783b55bd03ea999150ddd4f4fc92e1f34b1c..813d0cdcb0d856adfd8e7c6bd72724413b435163 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -76,11 +77,407 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(ConstantFoldingTest, NeutralElement) {
+  for (bool use_const : {true, false}) {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Output a = ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({3, 2})));
+    Output b = ops::Placeholder(s.WithOpName("b"), DT_FLOAT,
+                                ops::Placeholder::Shape(TensorShape({2, 3})));
+    Output bias = ops::Placeholder(s.WithOpName("bias"), DT_FLOAT,
+                                   ops::Placeholder::Shape(TensorShape({2})));
+    Output zeros = !use_const ? ops::ZerosLike(s.WithOpName("zeros"), x)
+                              : ops::Const(s.WithOpName("zeros"), 0.0f, {2, 2});
+    Output zeros_1d = ops::Const(s.WithOpName("zeros_1d"), 0.0f, {2});
+    Output ones = !use_const ? ops::OnesLike(s.WithOpName("ones"), x)
+                             : ops::Const(s.WithOpName("ones"), 1.0f, {2, 2});
+    Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+    Output mul2 = ops::Mul(s.WithOpName("mul2"), zeros, y);
+    Output mul3 = ops::Mul(s.WithOpName("mul3"), x, ones);
+    Output mul4 = ops::Mul(s.WithOpName("mul4"), ones, y);
+    Output mul5 = ops::Mul(s.WithOpName("mul5"), x, zeros_1d);
+    Output mul6 = ops::Mul(s.WithOpName("mul6"), zeros_1d, y);
+    Output div1 = ops::Div(s.WithOpName("div1"), x, ones);
+    Output div2 = ops::Div(s.WithOpName("div2"), ones, y);
+    Output matmul1 = ops::MatMul(s.WithOpName("matmul1"), x, zeros);
+    Output matmul2 = ops::MatMul(s.WithOpName("matmul2"), zeros, y);
+    Output matmul3 = ops::MatMul(s.WithOpName("matmul3"), a, zeros);
+    Output matmul4 = ops::MatMul(s.WithOpName("matmul4"), zeros, b);
+    Output add1 = ops::Add(s.WithOpName("add1"), x, zeros);
+    Output add2 = ops::Add(s.WithOpName("add2"), zeros, y);
+    Output bias_add1 = ops::BiasAdd(s.WithOpName("bias_add1"), x, zeros_1d);
+    Output bias_add2 = ops::BiasAdd(s.WithOpName("bias_add2"), zeros, bias);
+    Output sub1 = ops::Sub(s.WithOpName("sub1"), x, zeros);
+    Output sub2 = ops::Sub(s.WithOpName("sub2"), zeros, y);
+    Output addn =
+        ops::AddN(s.WithOpName("addn"),
+                  {mul1, mul2, mul3, mul4, mul5, mul6, div1, div2, matmul1,
+                   matmul2, add1, add2, bias_add1, bias_add2, sub1, sub2});
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"addn", "matmul3", "matmul4"};
+
+    ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                              nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(27, output.node_size());
+    for (int i = 0; i < output.node_size(); ++i) {
+      const NodeDef& node = output.node(i);
+      const string& name = node.name();
+      if (name == "mul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "mul2") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "mul3") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "mul4") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "mul5") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros_1d", node.input(1));
+      } else if (name == "mul6") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros_1d", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "div1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "div2") {
+        EXPECT_EQ("Reciprocal", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      } else if (name == "matmul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "matmul2") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^y", node.input(1));
+      } else if (name == "matmul3") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^a", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(3, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(2, t.tensor_shape().dim(1).size());
+      } else if (name == "matmul4") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^zeros", node.input(0));
+        EXPECT_EQ("^b", node.input(1));
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(3, t.tensor_shape().dim(1).size());
+      } else if (name == "add1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "add2") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("y", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "bias_add1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros_1d", node.input(1));
+      } else if (name == "bias_add2") {
+        // We don't eliminate this one, because it requires broadcasting.
+        EXPECT_EQ("BiasAdd", node.op());
+        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ("bias", node.input(1));
+      } else if (name == "sub1") {
+        EXPECT_EQ("Identity", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "sub2") {
+        // We don't handle this case yet.
+        EXPECT_EQ("Sub", node.op());
+        EXPECT_EQ("zeros", node.input(0));
+        EXPECT_EQ("y", node.input(1));
+      }
+      const std::set<string> square_zero_const{"mul1", "mul2",    "mul5",
+                                               "mul6", "matmul1", "matmul2"};
+      if (square_zero_const.count(name) > 0) {
+        TensorProto t = node.attr().at("value").tensor();
+        EXPECT_EQ(1, t.float_val_size());
+        EXPECT_EQ(0, t.float_val(0));
+        EXPECT_EQ(2, t.tensor_shape().dim_size());
+        EXPECT_EQ(2, t.tensor_shape().dim(0).size());
+        EXPECT_EQ(2, t.tensor_shape().dim(1).size());
+      }
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output cf_half = ops::Const(s.WithOpName("cf_half"), 0.5f, {1});
+  Output xf = ops::Placeholder(s.WithOpName("xf"), DT_FLOAT,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output xi = ops::Placeholder(s.WithOpName("xi"), DT_INT32,
+                               ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output ci = ops::Const(s.WithOpName("ci"), 2, {1});
+  Output cf = ops::Const(s.WithOpName("cf"), 2.0f, {1});
+  Output div_i = ops::Div(s.WithOpName("div_i"), xi, ci);
+  Output div_f = ops::Div(s.WithOpName("div_f"), xf, cf);
+  Output realdiv = ops::RealDiv(s.WithOpName("realdiv"), xf, cf);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"div_f", "div_i", "realdiv"};
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(8, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (name == "div_i") {
+      // Integer division is unchanged.
+      EXPECT_EQ("Div", node.op());
+      EXPECT_EQ("xi", node.input(0));
+      EXPECT_EQ("ci", node.input(1));
+    } else if (name == "div_f") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("ConstantFolding/div_f_recip", node.input(1));
+    } else if (name == "realdiv") {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ("xf", node.input(0));
+      EXPECT_EQ("ConstantFolding/realdiv_recip", node.input(1));
+    } else if (name == "ConstantFolding/div_f_recip") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(DT_FLOAT, t.dtype());
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(1, t.tensor_shape().dim(0).size());
+    } else if (name == "ConstantFolding/realdiv_recip") {
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      TensorProto t = node.attr().at("value").tensor();
+      EXPECT_EQ(DT_FLOAT, t.dtype());
+      EXPECT_EQ(1, t.tensor_shape().dim_size());
+      EXPECT_EQ(1, t.tensor_shape().dim(0).size());
+    }
+  }
+
+  // Check that the reciprocals have the expected value.
+  std::vector<string> fetch = {"cf_half"};
+  auto tensor_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(fetch.size(), tensor_expected.size());
+  fetch = {"ConstantFolding/div_f_recip", "ConstantFolding/realdiv_recip"};
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(fetch.size(), tensors.size());
+  for (int i = 0; i < fetch.size(); i++) {
+    test::ExpectTensorEqual<float>(tensor_expected[0], tensors[i]);
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x_known =
+      ops::Placeholder(s.WithOpName("x_known"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({2, 2})));
+  Output x_partially_known =
+      ops::Placeholder(s.WithOpName("x_partially_unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output x_unknown = ops::Placeholder(s.WithOpName("x_unknown"), DT_FLOAT);
+  Output zeros_known = ops::ZerosLike(s.WithOpName("zeros_known"), x_known);
+  Output zeros_partially_known =
+      ops::ZerosLike(s.WithOpName("zeros_partially_known"), x_partially_known);
+  Output zeros_unknown =
+      ops::ZerosLike(s.WithOpName("zeros_unknown"), x_unknown);
+
+  // Multiplies without any additional ops to supply the output shape.
+  int count = 0;
+  std::vector<Output> muls;
+  std::unordered_set<string> not_converted;
+  std::unordered_set<string> to_const;
+  std::unordered_set<string> to_identity;
+  for (const auto* x : {&x_known, &x_partially_known, &x_unknown}) {
+    for (const auto* zeros :
+         {&zeros_known, &zeros_partially_known, &zeros_unknown}) {
+      const string name = strings::StrCat("mul_", count++);
+      muls.push_back(ops::Mul(s.WithOpName(name), *x, *zeros));
+      if (x == &x_partially_known && zeros == &zeros_partially_known) {
+        to_identity.insert(name);
+      } else if (x == &x_unknown || zeros == &zeros_unknown) {
+        not_converted.insert(name);
+      } else {
+        to_const.insert(name);
+      }
+    }
+  }
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+
+  EXPECT_EQ(15, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (to_const.count(name) > 0) {
+      EXPECT_EQ("Const", node.op()) << node.name();
+    } else if (to_identity.count(name) > 0) {
+      EXPECT_EQ("Identity", node.op()) << node.name();
+    } else if (not_converted.count(name) > 0) {
+      EXPECT_EQ("Mul", node.op()) << node.name();
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output known_shape = ops::Const(s.WithOpName("known_shape"), 0.0f, {2, 2});
+  Output x_partially_known =
+      ops::Placeholder(s.WithOpName("x_partially_unknown"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output x_unknown = ops::Placeholder(s.WithOpName("x_unknown"), DT_FLOAT);
+  Output zeros_partially_known =
+      ops::ZerosLike(s.WithOpName("zeros_partially_known"), x_partially_known);
+  Output zeros_unknown =
+      ops::ZerosLike(s.WithOpName("zeros_unknown"), x_unknown);
+
+  // If at least one of the inputs to AddN has a known shape, shape inference
+  // will propagate the shape back to the inputs of AddN, making the
+  // output shapes of all its inputs known
+  std::vector<Output> muls_deduced_output_shape;
+  std::unordered_set<string> to_const;
+  int count = 0;
+  for (const auto& x : {x_partially_known, x_unknown}) {
+    for (const auto& zeros : {zeros_partially_known, zeros_unknown}) {
+      const string name = strings::StrCat("mul_", count++);
+      muls_deduced_output_shape.push_back(
+          ops::Mul(s.WithOpName(name), x, zeros));
+      to_const.insert(name);
+    }
+  }
+  // We add a known shape as input to AddN to propagate it back to the
+  // multiplies above, which means they can all be turned into Const nodes.
+  muls_deduced_output_shape.push_back(known_shape);
+  Output addn1 = ops::AddN(s.WithOpName("addn1"), muls_deduced_output_shape);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
+                            nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  LOG(INFO) << output.DebugString();
+
+  EXPECT_EQ(10, output.node_size());
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    const string& name = node.name();
+    if (to_const.count(name) > 0) {
+      EXPECT_EQ("Const", node.op()) << node.name();
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_TRUE(IsControlInput(node.input(0)));
+      EXPECT_TRUE(IsControlInput(node.input(1)));
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, CreateConstNodes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+#define MAKE_TEST_GRAPH(TYPE)                                               \
+  Output TYPE##_const =                                                     \
+      ops::Const(s.WithOpName(#TYPE "_const"), static_cast<TYPE>(10), {5}); \
+  Output TYPE##_mul =                                                       \
+      ops::Mul(s.WithOpName(#TYPE "_mul"), TYPE##_const, TYPE##_const);     \
+  Output TYPE##_id = ops::Identity(s.WithOpName(#TYPE "_id"), TYPE##_mul)
+
+  MAKE_TEST_GRAPH(float);
+  MAKE_TEST_GRAPH(double);
+  MAKE_TEST_GRAPH(int64);
+  MAKE_TEST_GRAPH(int32);
+  MAKE_TEST_GRAPH(int16);
+  MAKE_TEST_GRAPH(int8);
+  MAKE_TEST_GRAPH(uint8);
+#undef MAKE_TEST_GRAPH
+
+  Output bool_const = ops::Const(s.WithOpName("bool_const"), true, {5});
+  Output bool_and =
+      ops::LogicalAnd(s.WithOpName("bool_and"), bool_const, bool_const);
+  Output bool_id = ops::Identity(s.WithOpName("bool_id"), bool_and);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  ConstantFolding fold(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(24, output.node_size());
+  for (const NodeDef& node : output.node()) {
+#define CHECK_RESULT(TYPE, FIELD)                                             \
+  if (node.name() == #TYPE "_mul") {                                          \
+    EXPECT_EQ(5,                                                              \
+              node.attr().at("value").tensor().tensor_shape().dim(0).size()); \
+    EXPECT_EQ(1, node.attr().at("value").tensor().FIELD##_val_size());        \
+    EXPECT_EQ(10 * 10, node.attr().at("value").tensor().FIELD##_val(0));      \
+  }
+
+    CHECK_RESULT(float, float);
+    CHECK_RESULT(double, double);
+    CHECK_RESULT(int64, int64);
+    CHECK_RESULT(int32, int);
+    CHECK_RESULT(int16, int);
+    CHECK_RESULT(int8, int);
+    CHECK_RESULT(uint8, int);
+#undef CHECK_RESULT
+
+    if (node.name() == "bool_and") {
+      EXPECT_EQ(5,
+                node.attr().at("value").tensor().tensor_shape().dim(0).size());
+      EXPECT_EQ(1, node.attr().at("value").tensor().bool_val_size());
+      EXPECT_EQ(true && true, node.attr().at("value").tensor().bool_val(0));
+    }
+  }
+}
+
 TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   // Build a simple graph with a few trivially prunable ops.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-  Output a = ops::Const(s.WithOpName("a"), 10, {3});
+  Output a = ops::Const(s.WithOpName("a"), 10, {5});
   auto b = ops::Unique(s.WithOpName("b"), {a});
   Output c = ops::Identity(s.WithOpName("c"), {b.y});
   Output d = ops::Identity(s.WithOpName("d"), {b.idx});
@@ -421,6 +818,64 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
   EXPECT_EQ(3, found);
 }
 
+TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output v1 = ops::Variable(scope.WithOpName("v1"), {3, -1}, DT_FLOAT);
+  Output v2 = ops::Variable(scope.WithOpName("v2"), {}, DT_FLOAT);
+  Output v3 = ops::Variable(scope.WithOpName("v3"), {4, 6}, DT_FLOAT);
+  auto s = ops::ShapeN(scope.WithOpName("s"), {v1, v2, v3});
+  Output i1a = ops::Identity(scope.WithOpName("i1a"), s[0]);
+  Output i1b = ops::Identity(scope.WithOpName("i1b"), s[0]);
+  Output i2a = ops::Identity(scope.WithOpName("i2a"), s[1]);
+  Output i2b = ops::Identity(scope.WithOpName("i2b"), s[1]);
+  Output i2c = ops::Identity(scope.WithOpName("i2c"), s[1]);
+  Output i3a = ops::Identity(scope.WithOpName("i3a"), s[2]);
+  Output i3b = ops::Identity(scope.WithOpName("i3b"), s[2]);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding fold(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  int found = 0;
+  for (const auto& node : output.node()) {
+    EXPECT_NE(AddPrefixToNodeName("s-0", kConstantFoldingConst), node.name());
+    EXPECT_NE(AddPrefixToNodeName("s-1", kConstantFoldingConst), node.name());
+    if (node.name() == "i1a" || node.name() == "i1b") {
+      ++found;
+      EXPECT_EQ("s", node.input(0));
+    }
+    if (node.name() == "i2a" || node.name() == "i2b" || node.name() == "i2c") {
+      ++found;
+      EXPECT_EQ("s:1", node.input(0));
+    }
+    if (node.name() == "i3a" || node.name() == "i3b") {
+      ++found;
+      EXPECT_EQ(AddPrefixToNodeName("s-2", kConstantFoldingConst),
+                node.input(0));
+    }
+    if (node.name() == "s") {
+      ++found;
+      EXPECT_EQ("ShapeN", node.op());
+      EXPECT_EQ("v1", node.input(0));
+      EXPECT_EQ("v2", node.input(1));
+      EXPECT_EQ("v3", node.input(2));
+    }
+    if (node.name() == AddPrefixToNodeName("s-2", kConstantFoldingConst)) {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ("^s", node.input(0));
+      Tensor value;
+      CHECK(value.FromProto(node.attr().at("value").tensor()));
+      EXPECT_EQ(4, value.flat<int>()(0));
+      EXPECT_EQ(6, value.flat<int>()(1));
+    }
+  }
+  EXPECT_EQ(9, found);
+}
+
 TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
   ops::Variable v_in(scope.WithOpName("v_in"), {3}, DT_FLOAT);
@@ -676,7 +1131,7 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("v", node.input(0));
-      EXPECT_EQ("^v", node.input(1));
+      EXPECT_EQ("^i", node.input(1));
     }
   }
   EXPECT_TRUE(found);
@@ -735,20 +1190,20 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(3, node.input_size());
       EXPECT_EQ("v1", node.input(0));
-      EXPECT_EQ("^d1", node.input(1));
-      EXPECT_EQ("^v1", node.input(2));
+      EXPECT_EQ("^i1", node.input(1));
+      EXPECT_EQ("^d1", node.input(2));
     } else if (node.name() == "r3") {
       ++found;
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("v3", node.input(0));
-      EXPECT_EQ("^v3", node.input(1));
+      EXPECT_EQ("^i3", node.input(1));
     } else if (node.name() == "r4") {
       ++found;
       EXPECT_EQ("Identity", node.op());
       ASSERT_EQ(2, node.input_size());
       EXPECT_EQ("v4", node.input(0));
-      EXPECT_EQ("^v4", node.input(1));
+      EXPECT_EQ("^i4", node.input(1));
     } else if (node.name() == "r2") {
       ++found;
       EXPECT_EQ("Reshape", node.op());
@@ -780,6 +1235,129 @@ TEST_F(ConstantFoldingTest, Packing) {
   // size needed to naively encode 1000 floats folded twice).
   EXPECT_GT(8000, output.ByteSizeLong());
 }
+
+TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output b = ops::Square(s.WithOpName("b"), a);
+  Output c = ops::Mul(s.WithOpName("c"), a, b);
+  Output d = ops::Shape(s.WithOpName("d"), a);
+  Output e = ops::Shape(s.WithOpName("e"), b);
+
+  auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
+  Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
+  Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
+
+  Output g = ops::Placeholder(s.WithOpName("g"), DT_FLOAT,
+                              ops::Placeholder::Shape(PartialTensorShape({1})));
+  Output h = ops::Shape(s.WithOpName("h"), g);
+  auto i = ops::internal::BroadcastGradientArgs(s.WithOpName("i"), d, h);
+  Output p1 = ops::Identity(s.WithOpName("p1"), i.r0);
+  Output p2 = ops::Identity(s.WithOpName("p2"), i.r1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding fold(RewriterConfig::AGGRESSIVE, nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Run a second time to make sure the optimization is idempotent.
+  item.graph.Swap(&output);
+  status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "o1") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-0", node.input(0));
+    } else if (node.name() == "o2") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-1", node.input(0));
+    } else if (node.name() == "ConstantFolding/f-0") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^f", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "ConstantFolding/f-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^f", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "p1") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/i-0", node.input(0));
+    } else if (node.name() == "p2") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("i:1", node.input(0));
+    } else if (node.name() == "ConstantFolding/i-0") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^i", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    }
+  }
+  EXPECT_EQ(7, found);
+}
+
+TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output input =
+      ops::Placeholder(s.WithOpName("input"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({-1, -1})));
+  Output indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  Output sum = ops::Sum(s.WithOpName("sum"), input, indices);
+  Output size = ops::Const(s.WithOpName("size"), 1, {1});
+  Output reshape = ops::Reshape(s.WithOpName("reshape"), sum, size);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("reshape");
+
+  ConstantFolding fold(RewriterConfig::AGGRESSIVE, nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Run a second time to make sure the optimization is idempotent.
+  item.graph.Swap(&output);
+  status = fold.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "ConstantFolding/sum-reduction_indices") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ("^indices", node.input(0));
+      EXPECT_EQ(2, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "sum") {
+      ++found;
+      EXPECT_EQ("ConstantFolding/sum-reduction_indices", node.input(1));
+    } else if (node.name() == "indices") {
+      ++found;
+    }
+  }
+  EXPECT_EQ(3, found);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
+
+//  LocalWords:  NewRootScope
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..77525cc788e7b227b6467d3fbc28131e7dde304c
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -0,0 +1,446 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
+
+#include <unordered_set>
+
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+int RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
+  int num_removed = 0;
+  int pos = 0;
+  while (pos < node->input_size()) {
+    if (node->input(pos) == input) {
+      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+      node_map->RemoveOutput(NodeName(input), node->name());
+    } else {
+      ++pos;
+    }
+    ++num_removed;
+  }
+  return num_removed;
+}
+
+// Remove dulicate control inputs.
+void PruneControlInputs(NodeDef* node) {
+  std::unordered_set<string> inputs;
+  int pos = 0;
+  while (pos < node->input_size()) {
+    const string& input = node->input(pos);
+    // TODO(rmlarsen): Remove control inputs that also appears as a regular
+    // inputs. Currently, doing so breaks testControlFlowStrictness in
+    // python/framework/function_test.
+    //    if (!inputs.insert(NodeName(input)).second && IsControlInput(input)) {
+    if (IsControlInput(input) && !inputs.insert(input).second) {
+      VLOG(1) << "**** Removing duplicate control input: " << input
+              << " from node " << node->DebugString();
+      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+    } else {
+      ++pos;
+    }
+  }
+}
+
+}  // namespace
+
+bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) {
+  if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
+  if (!fetch_nodes_known_ || NumNonControlOutputs(node, *node_map_) > 0) {
+    // The output values of this node may be needed.
+    return false;
+  }
+  if (IsMerge(node) || IsSwitch(node)) {
+    return false;
+  }
+  if (ModifiesFrameInfo(node)) {
+    return false;
+  }
+  if (!IsFreeOfSideEffect(node)) {
+    return false;
+  }
+  if (node.op() == "ControlTrigger") {
+    return false;
+  }
+  if (node.op().rfind("Submodel", 0) == 0) {
+    return false;
+  }
+  const OpDef* op_def = nullptr;
+  Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+  if (!status.ok() || op_def->output_arg_size() == 0) {
+    return false;
+  }
+
+  // TODO(rmlarsen): We have to skip Identity nodes to make an obsolete test in
+  // python/training/session_manager_test.py pass. See if we can fix or get rid
+  // of that test.
+  const std::unordered_set<string> do_not_rewrite_ops{
+      "Assert", "CheckNumerics",         "Identity",    "_Retval",
+      "_Arg",   "_ParallelConcatUpdate", "_TPUExecute", "_TPUCompile"};
+  return do_not_rewrite_ops.find(node.op()) == do_not_rewrite_ops.end();
+}
+
+void DependencyOptimizer::OptimizeNode(int node_idx,
+                                       SetVector<int>* nodes_to_simplify,
+                                       std::set<int>* nodes_to_delete) {
+  NodeDef* node = optimized_graph_->mutable_node(node_idx);
+
+  // Constant nodes with no input control dependency are always executed early,
+  // so we can prune all their output control dependencies.
+  if (IsConstant(*node) && node->input_size() == 0) {
+    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node->name());
+    for (NodeDef* fanout : output_nodes) {
+      bool optimize_fanout = false;
+      bool data_connection = false;
+      for (int i = fanout->input_size() - 1; i >= 0; --i) {
+        int pos;
+        string input_name = ParseNodeName(fanout->input(i), &pos);
+        if (input_name == node->name()) {
+          if (pos < 0) {
+            fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
+            fanout->mutable_input()->RemoveLast();
+            optimize_fanout = true;
+          } else {
+            data_connection = true;
+          }
+        }
+      }
+      if (optimize_fanout) {
+        nodes_to_simplify->PushBack(node_to_idx_[fanout]);
+        if (!data_connection) {
+          node_map_->RemoveOutput(node->name(), fanout->name());
+        }
+      }
+    }
+    if (node_map_->GetOutputs(node->name()).empty() && fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_to_idx_[node]);
+    }
+
+    return;
+  }
+
+  // Change ops that only have control dependencies as outputs to NoOps.
+  if (node->op() != "NoOp" && SafeToConvertToNoOp(*node)) {
+    VLOG(1) << "***** Replacing  " << node->name() << " (" << node->op()
+            << ") with NoOp.";
+    // The outputs of this node are not consumed. Replace its inputs with
+    // control dependencies and replace the op itself with the NoOp op.
+    std::unordered_set<string> ctrl_inputs;
+    int pos = 0;
+    while (pos < node->input_size()) {
+      const string old_input = node->input(pos);
+      if (IsControlInput(old_input)) {
+        if (!ctrl_inputs.insert(old_input).second) {
+          // We found a duplicate control input. Remove it.
+          node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+          node->mutable_input()->RemoveLast();
+        } else {
+          ++pos;
+        }
+        continue;
+      }
+      const string ctrl_input = ConstantFolding::AddControlDependency(
+          old_input, optimized_graph_, node_map_.get());
+      if (ctrl_inputs.insert(ctrl_input).second) {
+        node->set_input(pos, ctrl_input);
+        node_map_->UpdateInput(node->name(), old_input, ctrl_input);
+        const NodeDef* old_input_node = node_map_->GetNode(old_input);
+        nodes_to_simplify->PushBack(node_to_idx_[old_input_node]);
+      }
+      ++pos;
+    }
+    node->set_op("NoOp");
+    node->clear_attr();
+  }
+
+  // Remove NoOp nodes if their fan-in or fan-out is less than 2.
+  // The non-trivial rewrites take the following form:
+  //
+  // Case a)
+  //    x --^> +------+                x --^> +---+
+  //    y --^> | NoOp | --^> a   ==>   y --^> | a |
+  //    ...    |      |                  ...  |   |
+  //    z --^> +------+                z --^> +---+
+  //
+  // Case b)
+  //           +------+ --^> a         +---+ --^> a
+  //    x --^> | NoOp | --^> b  ==>    | x | --^> b
+  //           |      | ...            |   | ...
+  //           +------+ --^> c         +---+ --^> c
+  if (node->op() == "NoOp") {
+    const auto output_nodes = node_map_->GetOutputs(node->name());
+    const int num_outputs = output_nodes.size();
+    const int num_inputs = node->input_size();
+
+    if (num_inputs * num_outputs > num_inputs + num_outputs) {
+      return;
+    }
+    VLOG(1) << "***** Rerouting input around " << node->name();
+    std::vector<NodeDef*> input_nodes;
+    for (int i = 0; i < num_inputs; ++i) {
+      NodeDef* tmp = node_map_->GetNode(node->input(i));
+      CHECK_NE(tmp, nullptr);
+      input_nodes.push_back(tmp);
+    }
+
+    for (auto consumer : output_nodes) {
+      bool updated_consumer = false;
+      VLOG(1) << "***** Considering consumer  " << consumer->name() << "\n"
+              << consumer->DebugString();
+      for (int i = 0; i < num_inputs; ++i) {
+        const NodeDef* input = input_nodes[i];
+        // Forward dependency from input to consumer if it doesn't already
+        // depend on it.
+        if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
+          consumer->add_input(AsControlDependency(input->name()));
+          updated_consumer = true;
+          node_map_->AddOutput(input->name(), consumer->name());
+          nodes_to_simplify->PushBack(node_to_idx_[input]);
+        }
+      }
+      // Remove dependency on node from consumer.
+      updated_consumer |= RemoveInput(
+          consumer, AsControlDependency(node->name()), node_map_.get());
+      if (updated_consumer) {
+        VLOG(1) << "***** Updated consumer  " << consumer->name() << " ("
+                << consumer->op() << ")";
+        nodes_to_simplify->PushBack(node_to_idx_[consumer]);
+      }
+    }
+
+    node_map_->RemoveOutputs(node->name());
+    if (fetch_nodes_known_ &&
+        nodes_to_preserve_.find(node->name()) == nodes_to_preserve_.end()) {
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_idx);
+
+      // Unconnect the node from its inputs to enable further optimizations.
+      node_map_->RemoveInputs(node->name());
+      node->clear_input();
+    }
+  }
+}
+
+void DependencyOptimizer::CleanControlInputs() {
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    PruneControlInputs(optimized_graph_->mutable_node(i));
+  }
+}
+
+void DependencyOptimizer::DeleteNodes(const std::set<int>& nodes_to_delete) {
+  int last = optimized_graph_->node_size() - 1;
+  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
+    const int index = *it;
+    optimized_graph_->mutable_node()->SwapElements(index, last);
+    last--;
+  }
+  optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
+                                                   nodes_to_delete.size());
+  // Rebuild the NodeMap which was invalidated by the node swapping above.
+  node_map_.reset(new NodeMap(optimized_graph_));
+  BuildNodeToIdx();
+}
+
+Status DependencyOptimizer::OptimizeDependencies() {
+  SetVector<int> nodes_to_simplify;
+  std::set<int> nodes_to_delete;
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    const NodeDef& node = optimized_graph_->node(i);
+    if (node.op() == "NoOp" || IsConstant(node) || SafeToConvertToNoOp(node)) {
+      nodes_to_simplify.PushBack(i);
+    }
+  }
+  while (!nodes_to_simplify.Empty()) {
+    OptimizeNode(nodes_to_simplify.PopBack(), &nodes_to_simplify,
+                 &nodes_to_delete);
+  }
+
+  if (fetch_nodes_known_) {
+    VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
+            << optimized_graph_->node_size() << " nodes.";
+    DeleteNodes(nodes_to_delete);
+  }
+  return Status::OK();
+}
+
+Status DependencyOptimizer::TransitiveReduction() {
+  // PRECONDITION: optimized_graph_ must be sorted topologically.
+  const int num_nodes = optimized_graph_->node_size();
+  // Set up a compressed version of the graph to save a constant factor in the
+  // expensive algorithm below. Also cache the set of control outputs and the
+  // highest index of a target of any control output from each node.
+  int num_controls = 0;
+  std::vector<gtl::InlinedVector<int, 4>> inputs(num_nodes);
+  std::vector<gtl::InlinedVector<std::pair<int, int>, 2>> control_outputs(
+      num_nodes);
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = optimized_graph_->node(node_idx);
+    if (ModifiesFrameInfo(node)) {
+      // Ignore nodes that modify frame info.
+      continue;
+    }
+    for (int input_slot = 0; input_slot < node.input_size(); ++input_slot) {
+      const string& input = node.input(input_slot);
+      const NodeDef* input_node = node_map_->GetNode(input);
+      if (ModifiesFrameInfo(*input_node)) {
+        // Ignore edges from nodes that modify frame info.
+        continue;
+      }
+      const int input_node_idx = node_to_idx_[input_node];
+      inputs[node_idx].push_back(input_node_idx);
+      if (IsControlInput(input)) {
+        ++num_controls;
+        control_outputs[input_node_idx].emplace_back(node_idx, input_slot);
+      }
+    }
+  }
+
+  // Run the longest path in DAG algorithm for each source node that has control
+  // outputs. If, for any target node of a control output, there exists a path
+  // of length > 1, we can drop that control dependency.
+  int num_controls_removed = 0;
+  std::vector<int> longest_distance(num_nodes);
+  for (int source = 0; source < num_nodes; ++source) {
+    int highest_control_target = -1;
+    for (const auto& control_output : control_outputs[source]) {
+      if (control_output.first > highest_control_target) {
+        highest_control_target = control_output.first;
+      }
+    }
+    if (highest_control_target < source) {
+      continue;
+    }
+    std::fill(longest_distance.begin() + source,
+              longest_distance.begin() + highest_control_target + 1, 0);
+    for (int target = source + 1; target <= highest_control_target; ++target) {
+      for (int input : inputs[target]) {
+        // If the input node is before source in the topo order, no path
+        // source -> input -> target can exits and we can skip it.
+        if (input >= source) {
+          // If source -> input -> target is longer than the longest
+          // path so far from source -> target, update the longest_distance.
+          int candidate_longest_distance = longest_distance[input] + 1;
+          if (candidate_longest_distance > longest_distance[target]) {
+            longest_distance[target] = candidate_longest_distance;
+          }
+        }
+      }
+    }
+
+    // If the longest path from the source to the target of a control dependency
+    // is longer than 1, there exists an alternate path, and we can eliminate
+    // the control dependency since it is redundant.
+    for (const auto& control_output : control_outputs[source]) {
+      const int target = control_output.first;
+      if (longest_distance[target] > 1) {
+        const int input_slot = control_output.second;
+        // We modify the node inplace here. This is safe because there can
+        // only be one control edge from a given source to a given target.
+        const NodeDef& source_node = optimized_graph_->node(source);
+        NodeDef* target_node = optimized_graph_->mutable_node(target);
+        target_node->mutable_input()->SwapElements(
+            input_slot, target_node->input_size() - 1);
+        node_map_->RemoveOutput(source_node.name(), target_node->name());
+        target_node->mutable_input()->RemoveLast();
+        ++num_controls_removed;
+      }
+    }
+  }
+  VLOG(1) << "Removed " << num_controls_removed << " out of " << num_controls
+          << " control dependencies";
+  return Status::OK();
+}
+
+void DependencyOptimizer::BuildNodeToIdx() {
+  // Set up &node -> index map.
+  node_to_idx_.clear();
+  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
+    const NodeDef& node = optimized_graph_->node(i);
+    node_to_idx_[&node] = i;
+  }
+}
+
+Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                     GraphDef* optimized_graph) {
+  optimized_graph_ = optimized_graph;
+  *optimized_graph_ = item.graph;
+  nodes_to_preserve_ = item.NodesToPreserve();
+  fetch_nodes_known_ = !item.fetch.empty();
+
+  VLOG(1) << "Graph before optimization:\n" << optimized_graph_->DebugString();
+  CleanControlInputs();
+  const int num_iterations = opt_level_ == RewriterConfig::AGGRESSIVE ? 2 : 1;
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    Status topo_sort_status;
+    if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+      // Prepare the graph for transitive reduction if enabled.
+      topo_sort_status = TopologicalSort(optimized_graph_);
+    }
+
+    node_map_.reset(new NodeMap(optimized_graph_));
+    BuildNodeToIdx();
+
+    // Remove redundant control dependencies, iteration 1.
+    if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+      if (topo_sort_status.ok()) {
+        TF_RETURN_IF_ERROR(TransitiveReduction());
+      } else {
+        LOG(ERROR) << topo_sort_status.error_message();
+      }
+      VLOG(1) << "Graph after transitive reduction:\n"
+              << optimized_graph_->DebugString();
+    }
+
+    // Turn nodes without non-control outputs into NoOps, prune NoOps.
+    TF_RETURN_IF_ERROR(OptimizeDependencies());
+    VLOG(1) << "Graph after NoOp conversion & pruning:\n"
+            << optimized_graph_->DebugString();
+  }
+  VLOG(1) << "Graph after optimization:\n" << optimized_graph_->DebugString();
+
+  return Status::OK();
+}
+
+void DependencyOptimizer::Feedback(Cluster* /*cluster*/,
+                                   const GrapplerItem& /*item*/,
+                                   const GraphDef& /*optimized_graph*/,
+                                   double /*result*/) {
+  // Nothing to do for DependencyOptimizer.
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f6f418bee69cc86d8865bccd266803ade2ef2c1
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -0,0 +1,76 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
+
+#include <unordered_set>
+#include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Optimize TF computations by removing control dependencies or re-arranging
+// them to shorten the critical path for a model step or enable other
+// optimizations, such as removing nodes that are effectively noops.
+class DependencyOptimizer : public GraphOptimizer {
+ public:
+  DependencyOptimizer() : opt_level_(RewriterConfig::ON) {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
+  ~DependencyOptimizer() override {}
+
+  string name() const override { return "dependency_optimizer"; };
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* optimized_graph) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimized_graph, double result) override;
+
+ private:
+  // Returns true if it is safe to convert node to NoOp.
+  bool SafeToConvertToNoOp(const NodeDef& node);
+  // Removes all duplicate control dependencies.
+  void CleanControlInputs();
+  // Builds a map from the &optimized_graph_->node(i) to i.
+  void BuildNodeToIdx();
+  // Removes the given set of nodes from the graph.
+  void DeleteNodes(const std::set<int>& nodes_to_delete);
+  // Tries to optimize the node with the given index, possibly additional
+  // optimizations by inserting nodes in nodes_to_simplify, and pruning nodes by
+  // inserting them in nodes_to_delete.
+  void OptimizeNode(int node_idx, SetVector<int>* nodes_to_simplify,
+                    std::set<int>* nodes_to_delete);
+  // Eliminates redundant control dependencies by computing the transitive
+  // reduction of the graph.
+  Status TransitiveReduction();
+  // Main driver of dependency optimizations.
+  Status OptimizeDependencies();
+
+  RewriterConfig::Toggle opt_level_;
+  bool fetch_nodes_known_;
+  std::unordered_set<string> nodes_to_preserve_;
+  std::unique_ptr<NodeMap> node_map_;
+  std::unordered_map<const NodeDef*, int> node_to_idx_;
+  GraphDef* optimized_graph_;  // Not owned.
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e17a8eb1cf140eec14a35b129416402f8b785b2f
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -0,0 +1,263 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class DependencyOptimizerTest : public ::testing::Test {};
+
+void VerifyGraphsEqual(const GraphDef& original_graph,
+                       const GraphDef& optimized_graph, const string& func) {
+  EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
+  for (int i = 0; i < original_graph.node_size(); ++i) {
+    const NodeDef& original = original_graph.node(i);
+    const NodeDef& optimized = optimized_graph.node(i);
+    EXPECT_EQ(original.name(), optimized.name()) << func;
+    EXPECT_EQ(original.op(), optimized.op()) << func;
+    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    for (int j = 0; j < original.input_size(); ++j) {
+      EXPECT_EQ(original.input(j), optimized.input(j)) << func;
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, NoOp) {
+  // This trivial graph is so basic there's nothing to optimize.
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
+TEST_F(DependencyOptimizerTest, DependenciesDrivenByConstants) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  Output y = ops::Const(s.WithOpName("y"), {1.0f, 2.0f}, {1, 2});
+  Output z = ops::Const(s.WithOpName("z"), {1.0f, 2.0f}, {1, 2});
+  Output add = ops::Add(s.WithOpName("add"), x, y);
+  Output id1 =
+      ops::Identity(s.WithOpName("id1").WithControlDependencies(x), add);
+  Output id2 = ops::Identity(
+      s.WithOpName("id2").WithControlDependencies(y).WithControlDependencies(z),
+      add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("id1");
+  item.fetch.push_back("id2");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The 'z' node should have been optimized away leaving only 5 nodes.
+  EXPECT_EQ(5, output.node_size());
+
+  for (const NodeDef& node : item.graph.node()) {
+    if (node.name() == "id1" || node.name() == "id2") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("add", node.input(0));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, ChangeToNoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  Output add = ops::Add(s.WithOpName("add"), x, y);
+  Output id1 =
+      ops::Identity(s.WithOpName("id1").WithControlDependencies(add), x);
+  Output id2 =
+      ops::Identity(s.WithOpName("id2").WithControlDependencies(add), y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("id1");
+  item.fetch.push_back("id2");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    const NodeDef& node = item.graph.node(i);
+    if (node.name() == "add") {
+      EXPECT_EQ("NoOp", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    } else if (node.name() == "id1") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^y", node.input(1));
+    } else if (node.name() == "id2") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+    }
+  }
+}
+
+// TODO(rmlarsen): Add test to make sure we skip Switch and Merge.
+TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  Output add = ops::Add(s.WithOpName("add"), x, y);
+  Output id1 =
+      ops::Identity(s.WithOpName("id1").WithControlDependencies(add), x);
+  Output id2 =
+      ops::Identity(s.WithOpName("id2").WithControlDependencies(add), y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  TF_CHECK_OK(TopologicalSort(&item.graph));
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
+TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s, {1, 2}, DT_FLOAT);
+  auto noop1 = ops::NoOp(s);
+  auto noop2 = ops::NoOp(s.WithControlDependencies(x));
+  Output id = ops::Identity(s.WithControlDependencies({noop1.operation}), x);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "NoOp" || node.name() == "NoOp_1") {
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Identity") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("RandomUniform", node.input(0));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
+  // NoOp with a single input- and two output dependencies.
+  auto noop = ops::NoOp(s.WithControlDependencies(x));
+  // NoOp with a two input- and a single output dependency.
+  auto noop_1 =
+      ops::NoOp(s.WithControlDependencies(x).WithControlDependencies(y));
+  Output id = ops::Identity(s.WithControlDependencies({noop.operation}), x);
+  Output id_1 = ops::Identity(
+      s.WithControlDependencies({noop.operation, noop_1.operation}), y);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+  item.fetch.push_back("Identity_1");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "NoOp" || node.name() == "NoOp_1") {
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Identity") {
+      EXPECT_EQ("x", node.input(0));
+    } else if (node.name() == "Identity_1") {
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  Output x = ops::Square(s.WithOpName("x"), c);
+  Output id1 = ops::Identity(s.WithOpName("id1"), x);
+  Output id2 =
+      ops::Identity(s.WithOpName("id2").WithControlDependencies({x}), id1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("id2");
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(4, output.node_size());
+  EXPECT_EQ("id2", output.node(3).name());
+  EXPECT_EQ(1, output.node(3).input_size());
+  EXPECT_EQ("id1", output.node(3).input(0));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index b364446ad76bcc068ca4622067b92219e217c689..e9112baaff2aa1dfb3e0fbcceb05cd73fe565934 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <deque>
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -20,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -28,57 +28,118 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
+namespace {
 
-const char kConcatConst[] = "LayoutOptimizerConcatConst";
+const char kPrefix[] = "LayoutOptimizer";
+const char kDataFormatOp[] = "LayoutOptimizerDataFormatOp";
 const char kPermNHWCToNCHW[] = "LayoutOptimizerPermConstNHWCToNCHW";
 const char kPermNCHWToNHWC[] = "LayoutOptimizerPermConstNCHWToNHWC";
-const char kGatherAxisConst[] = "LayoutOptimizerGatherAxisConst";
 const char kTransposeNHWCToNCHW[] = "LayoutOptimizerTransposeNHWCToNCHW";
 const char kTransposeNCHWToNHWC[] = "LayoutOptimizerTransposeNCHWToNHWC";
-const char kPermVecNHWCToNCHW[] = "LayoutOptimizerPermVecNHWCToNCHW";
 const char kReshapeNHWCToNCHW[] = "LayoutOptimizerReshapeNHWCToNCHW";
 const char kReshapeConst[] = "LayoutOptimizerReshapeConst";
 const char kReductionConst[] = "LayoutOptimizerReductionConst";
 
 std::set<string> GetOpsFormatSupported() {
-  std::set<string> ops_format_supported = {"AvgPool",
-                                           "AvgPoolGrad",
-                                           "Conv2D",
-                                           "Conv2DBackpropFilter",
-                                           "Conv2DBackpropInput",
-                                           "BiasAdd",
-                                           "BiasAddGrad",
-                                           "FusedBatchNorm",
-                                           "FusedBatchNormGrad",
-                                           "FusedConv2DBiasActivation",
-                                           "MaxPool",
-                                           "MaxPoolGrad"};
+  std::set<string> ops_format_supported = {
+      "AvgPool",
+      "AvgPoolGrad",
+      "Conv2D",
+      "Conv2DBackpropFilter",
+      "Conv2DBackpropInput",
+      "BiasAdd",
+      "BiasAddGrad",
+      "DepthwiseConv2dNative",
+      "DepthwiseConv2dNativeBackpropInput",
+      "DepthwiseConv2dNativeBackpropFilter",
+      "FusedBatchNorm",
+      "FusedBatchNormGrad",
+      "FusedConv2DBiasActivation",
+      "MaxPool",
+      "MaxPoolGrad",
+      "SpaceToDepth",
+      "DepthToSpace"};
   return ops_format_supported;
 }
 
+// TODO(yaozhang): enable SumProcessor with auto-tuning. Currently disabled
+// because of the worse performance in some cases.
 std::set<string> GetOpsFormatAgnostic() {
   std::set<string> ops_format_agnostic = {"Add",
                                           "AddN",
+                                          "Acos",
+                                          "Acosh",
+                                          "Asin",
+                                          "Asinh",
+                                          "Atan",
+                                          "Atanh",
+                                          "Ceil",
+                                          "Cos",
+                                          "Cosh",
                                           "Concat",
                                           "ConcatV2",
+                                          "Digamma",
+                                          "Erf",
+                                          "Erfc",
+                                          "Exp",
+                                          "Expm1",
                                           "Floor",
                                           "Identity",
+                                          "Inv",
+                                          "InvGrad",
+                                          "IsFinite",
+                                          "IsInf",
+                                          "IsNan",
+                                          "Lgamma",
+                                          "Log",
+                                          "Log1p",
                                           "Mul",
                                           "Neg",
+                                          "Pad",
                                           "RealDiv",
+                                          "Reciprocal",
+                                          "ReciprocalGrad",
                                           "Relu",
+                                          "Relu6",
                                           "ReluGrad",
+                                          "Rint",
+                                          "Sigmoid",
+                                          "SigmoidGrad",
+                                          "Sign",
+                                          "Sin",
+                                          "Sinh",
                                           "Slice",
+                                          "Split",
+                                          "Round",
+                                          "Rsqrt",
+                                          "RsqrtGrad",
+                                          "Sqrt",
+                                          "SqrtGrad",
+                                          "Square",
                                           "SquaredDifference",
                                           "Squeeze",
-                                          "Sub"};
+                                          /*"Sum",*/ "Sub",
+                                          "Tan",
+                                          "Tanh",
+                                          "TanhGrad"};
   return ops_format_agnostic;
 }
 
+bool IsNodeByLayoutOptimizer(const string& node_name) {
+  const string prefix_pattern = kPrefix;
+  string prefix = node_name.substr(0, prefix_pattern.length());
+  if (prefix.compare(prefix_pattern) == 0) {
+    return true;
+  }
+  return false;
+}
+
 bool IsNodeNHWCToNCHW(const string& node_name) {
   const string transpose_node_prefix = kTransposeNHWCToNCHW;
   string prefix = node_name.substr(0, transpose_node_prefix.length());
@@ -97,10 +158,30 @@ bool IsNodeNCHWToNHWC(const string& node_name) {
   return false;
 }
 
+bool IsConcat(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Concat" || op == "ConcatV2";
+}
+
+bool IsConcatV1(const NodeDef& node) {
+  const auto op = node.op();
+  return op == "Concat";
+}
+
+bool IsMaxPoolGradV1(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPoolGrad";
+}
+
 class GraphProcessor {
  public:
-  GraphProcessor(GraphDef* graph, NodeMap* node_map)
-      : graph_(graph), node_map_(node_map) {}
+  GraphProcessor(const VirtualPlacer& virtual_placer,
+                 const std::unordered_set<string>& nodes_to_preserve,
+                 GraphDef* graph, NodeMap* node_map)
+      : virtual_placer_(virtual_placer),
+        nodes_to_preserve_(nodes_to_preserve),
+        graph_(graph),
+        node_map_(node_map) {}
 
  protected:
   NodeDef* AddNodePermConst(const string& name, const string& device,
@@ -109,7 +190,6 @@ class GraphProcessor {
     node_map_->AddNode(name, node);
     node->set_name(name);
     node->set_op("Const");
-    node->set_device(device);
     AttrValue attr_data_type;
     attr_data_type.set_type(DT_INT32);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -120,6 +200,13 @@ class GraphProcessor {
     }
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
+    string device_name;
+    if (device.empty()) {
+      device_name = virtual_placer_.get_canonical_device_name(*node);
+    } else {
+      device_name = device;
+    }
+    node->set_device(device_name);
     return node;
   }
 
@@ -129,7 +216,6 @@ class GraphProcessor {
     node_map_->AddNode(name, node);
     node->set_name(name);
     node->set_op("Const");
-    node->set_device(device);
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     node->mutable_attr()->insert({"dtype", attr_data_type});
@@ -138,43 +224,48 @@ class GraphProcessor {
     tensor.scalar<int>()() = value;
     tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
     node->mutable_attr()->insert({"value", attr_tensor});
-    return node;
-  }
-
-  NodeDef* AddNodeReductionConst(const string& name, const string& device) {
-    NodeDef* node = graph_->add_node();
-    node_map_->AddNode(name, node);
-    node->set_name(name);
-    node->set_op("Const");
-    node->set_device(device);
-    AttrValue attr_data_type;
-    attr_data_type.set_type(DT_INT32);
-    node->mutable_attr()->insert({"dtype", attr_data_type});
-
-    AttrValue attr_tensor;
-    Tensor tensor(DT_INT32, TensorShape({3}));
-    std::vector<int> axis = {0, 2, 3};
-    for (int i = 0; static_cast<size_t>(i) < axis.size(); i++) {
-      tensor.flat<int>()(i) = axis[i];
+    string device_name;
+    if (device.empty()) {
+      device_name = virtual_placer_.get_canonical_device_name(*node);
+    } else {
+      device_name = device;
     }
-    tensor.AsProtoTensorContent(attr_tensor.mutable_tensor());
-    node->mutable_attr()->insert({"value", attr_tensor});
+    node->set_device(device_name);
     return node;
   }
 
+  const VirtualPlacer& virtual_placer_;
+  const std::unordered_set<string>& nodes_to_preserve_;
   GraphDef* graph_;
   NodeMap* node_map_;
+};
 
- private:
+struct OptimizeContext {
+  OptimizeContext(GraphDef* graph, NodeDef* node, NodeMap* node_map,
+                  const VirtualPlacer& virtual_placer,
+                  const std::unordered_set<string>& nodes_to_preserve,
+                  bool is_in_frame)
+      : graph(graph),
+        node(node),
+        node_map(node_map),
+        virtual_placer(virtual_placer),
+        nodes_to_preserve(nodes_to_preserve),
+        is_in_frame(is_in_frame) {}
+  GraphDef* graph;
+  NodeDef* node;
+  NodeMap* node_map;
+  const VirtualPlacer& virtual_placer;
+  const std::unordered_set<string>& nodes_to_preserve;
+  bool is_in_frame;
 };
 
 class NodeProcessor : public GraphProcessor {
  public:
-  NodeProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                bool is_in_frame)
-      : GraphProcessor(graph, node_map),
-        node_(node),
-        is_in_frame_(is_in_frame) {}
+  explicit NodeProcessor(const OptimizeContext& opt_cxt)
+      : GraphProcessor(opt_cxt.virtual_placer, opt_cxt.nodes_to_preserve,
+                       opt_cxt.graph, opt_cxt.node_map),
+        node_(opt_cxt.node),
+        is_in_frame_(opt_cxt.is_in_frame) {}
   virtual ~NodeProcessor() {}
   virtual Status ConvertNode() {
     if (ShouldProcess()) {
@@ -224,8 +315,30 @@ class NodeProcessor : public GraphProcessor {
     return Status::OK();
   }
 
+  bool MustPreserve() const {
+    return nodes_to_preserve_.find(node_->name()) != nodes_to_preserve_.end();
+  }
+
   virtual bool ShouldProcess() const {
-    return IsNHWC() && IsDimsFour(*node_) && HasOutputs();
+    return !MustPreserve() && IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
+           IsOnGPU();
+  }
+
+  virtual bool IsOnGPU() const {
+    string device_name;
+    if (node_->device().empty()) {
+      device_name = virtual_placer_.get_canonical_device_name(*node_);
+    } else {
+      device_name = node_->device();
+    }
+    string device;
+    string not_used;
+    if (DeviceNameUtils::SplitDeviceName(device_name, &not_used, &device) &&
+        (StringPiece(str_util::Lowercase(device)))
+            .contains(str_util::Lowercase(DEVICE_GPU))) {
+      return true;
+    }
+    return false;
   }
 
   void UpdateAttrDataFormat() {
@@ -277,17 +390,57 @@ class NodeProcessor : public GraphProcessor {
     if (!success) {
       LOG(ERROR) << "Failed to parse TensorProto.";
     }
-    int c = tensor.flat<int>()(3);
-    tensor.flat<int>()(3) = tensor.flat<int>()(2);
-    tensor.flat<int>()(2) = tensor.flat<int>()(1);
-    tensor.flat<int>()(1) = c;
-    tensor.AsProtoTensorContent(
-        node->mutable_attr()->at({"value"}).mutable_tensor());
+    if (tensor.dims() == 0) {
+      int value = tensor.scalar<int>()();
+      value = (value >= 0) ? value : value + 4;
+      if (value == 1 || value == 2) {
+        value = value + 1;
+      } else if (value == 3) {
+        value = 1;
+      }
+      tensor.scalar<int>()() = value;
+    } else if (tensor.dims() == 1) {
+      if (tensor.flat<int>().size() == 4) {
+        int c = tensor.flat<int>()(3);
+        tensor.flat<int>()(3) = tensor.flat<int>()(2);
+        tensor.flat<int>()(2) = tensor.flat<int>()(1);
+        tensor.flat<int>()(1) = c;
+      } else if (tensor.flat<int>().size() == 3) {
+        tensor.flat<int>()(0) = 0;
+        tensor.flat<int>()(1) = 2;
+        tensor.flat<int>()(2) = 3;
+      } else {
+        return Status(error::INVALID_ARGUMENT,
+                      strings::StrCat("Unsupported tensor size: ",
+                                      tensor.flat<int>().size()));
+      }
+    } else if (tensor.dims() == 2) {
+      for (int i = 0; i < 2; i++) {
+        int c = tensor.matrix<int>()(3, i);
+        tensor.matrix<int>()(3, i) = tensor.matrix<int>()(2, i);
+        tensor.matrix<int>()(2, i) = tensor.matrix<int>()(1, i);
+        tensor.matrix<int>()(1, i) = c;
+      }
+    } else {
+      return Status(
+          error::INVALID_ARGUMENT,
+          strings::StrCat("Unsupported dimension size: ", tensor.dims()));
+    }
+    if (tensor.dims() == 0) {
+      tensor.AsProtoField(node->mutable_attr()->at({"value"}).mutable_tensor());
+    } else {
+      tensor.AsProtoTensorContent(
+          node->mutable_attr()->at({"value"}).mutable_tensor());
+    }
     return Status::OK();
   }
 
   Status UpdateAttrValueOfInput(int input_index) {
     auto input_node = node_map_->GetNode(node_->input(input_index));
+    // We created a copy of the node, so that we don't modify the original node,
+    // which might be used elsewhere. Note that this copy also copies the
+    // control dependency input in the case this node is inside a loop,
+    // to ensure added_node is in the same frame with node_.
     NodeDef* added_node = graph_->add_node();
     *added_node = *input_node;
     string base_name = strings::StrCat(node_->name(), "-", input_node->name());
@@ -304,6 +457,14 @@ class NodeProcessor : public GraphProcessor {
     return input_pos;
   }
 
+  virtual std::set<int> GetOutputPos() const {
+    // For most nodes, no need to process control nodes or nodes that use an
+    // output other than the first output: only the first output is of
+    // 4D NCHW/NHWC format and thus relevant here.
+    std::set<int> output_pos = {0};
+    return output_pos;
+  }
+
   NodeDef* AddNodeTranspose(const string& node_name, const string& input_name,
                             const string& const_name, DataType data_type,
                             const TensorShapeProto& input_shape,
@@ -369,37 +530,28 @@ class NodeProcessor : public GraphProcessor {
     auto outputs = node_map_->GetOutputs(node_->name());
     string const_name = GetOrAddNodePermNCHWToNHWC();
     for (const auto& output : outputs) {
-      string base_name = strings::StrCat(node_->name(), "-", output->name());
-      string node_name =
-          AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-");
-      // TODO(yaozhang): handle the rare case where node A is connected to more
-      // than one input of node B.
-      auto it = std::find_if(output->mutable_input()->begin(),
-                             output->mutable_input()->end(),
-                             [this](const string& input) {
-                               string node_name = NodeName(input);
-                               return node_name.compare(node_->name()) == 0;
-                             });
-      if (it == output->mutable_input()->end()) {
-        return Status(error::INVALID_ARGUMENT,
-                      strings::StrCat("Expect ", node_->name(),
-                                      " to be an input of ", output->name()));
-      }
-      int output_pos = NodePosition(*it);
-      // No need to process control nodes or nodes that use an output
-      // other than the first output: only the first output is of 4D NCHW/NHWC
-      // format and thus relevant here.
-      if (output_pos != 0) {
-        continue;
+      for (int i = 0; i < output->input_size(); i++) {
+        auto& input = *output->mutable_input(i);
+        int input_port;
+        string input_name = ParseNodeName(input, &input_port);
+        auto output_pos = GetOutputPos();
+        if (input_name == node_->name() &&
+            output_pos.find(input_port) != output_pos.end()) {
+          string base_name =
+              strings::StrCat(node_->name(), "-", output->name(), "-", i);
+          string node_name =
+              AddPrefixToNodeName(base_name, kTransposeNCHWToNHWC, "-");
+          TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
+          TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
+          AddNodeTranspose(
+              node_name, input, const_name, node_->attr().at("T").type(),
+              node_->attr().at("_output_shapes").list().shape(0), false);
+          input = node_name;
+          node_map_->AddOutput(node_->name(), node_name);
+          node_map_->AddOutput(node_name, output->name());
+        }
       }
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "_output_shapes"));
-      AddNodeTranspose(
-          node_name, node_->name(), const_name, node_->attr().at("T").type(),
-          node_->attr().at("_output_shapes").list().shape(0), false);
-      *it = node_name;
-      node_map_->UpdateOutput(node_->name(), output->name(), node_name);
-      node_map_->AddOutput(node_name, output->name());
+      node_map_->RemoveOutput(node_->name(), output->name());
     }
     return Status::OK();
   }
@@ -428,6 +580,28 @@ class NodeProcessor : public GraphProcessor {
     return const_node;
   }
 
+  void AddNodeDataFormatOp(const string& op, int input_pos, DataType dtype) {
+    NodeDef* added_node = graph_->add_node();
+    added_node->set_name(
+        strings::StrCat(kDataFormatOp, "_", node_->name(), "_", input_pos));
+    added_node->set_op(op);
+    node_map_->AddNode(added_node->name(), added_node);
+    added_node->set_device(node_->device());
+    AttrValue attr_data_type;
+    attr_data_type.set_type(dtype);
+    added_node->mutable_attr()->insert({"T", attr_data_type});
+    AttrValue attr_format;
+    attr_format.set_s("NHWC");
+    added_node->mutable_attr()->insert({"src_format", attr_format});
+    attr_format.set_s("NCHW");
+    added_node->mutable_attr()->insert({"dst_format", attr_format});
+    *added_node->add_input() = node_->input(input_pos);
+    *node_->mutable_input(input_pos) = added_node->name();
+    node_map_->UpdateOutput(added_node->input(0), node_->name(),
+                            added_node->name());
+    node_map_->AddOutput(added_node->name(), node_->name());
+  }
+
   NodeDef* node_;
   bool is_in_frame_;
 
@@ -468,9 +642,8 @@ class NodeProcessor : public GraphProcessor {
 
 class AvgPoolGradProcessor : public NodeProcessor {
  public:
-  AvgPoolGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                       bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit AvgPoolGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -482,12 +655,17 @@ class AvgPoolGradProcessor : public NodeProcessor {
 
 class BiasAddGradProcessor : public NodeProcessor {
  public:
-  BiasAddGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                       bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit BiasAddGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
+    if (MustPreserve()) {
+      return false;
+    }
+    if (!IsOnGPU()) {
+      return false;
+    }
     auto input = node_map_->GetNode(node_->input(0));
     if (input) {
       if ((IsNHWC() && IsDimsFour(*input)) || IsNodeNCHWToNHWC(input->name())) {
@@ -502,14 +680,13 @@ class BiasAddGradProcessor : public NodeProcessor {
 
 class Conv2DProcessor : public NodeProcessor {
  public:
-  Conv2DProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                  bool no_gemm, bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame), no_gemm_(no_gemm) {}
+  Conv2DProcessor(const OptimizeContext& opt_cxt, bool no_gemm)
+      : NodeProcessor(opt_cxt), no_gemm_(no_gemm) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
-           (!IsGemmUsed() || no_gemm_);
+    return !MustPreserve() && IsNHWC() && IsDimsFour(*node_) && HasOutputs() &&
+           (!IsGemmUsed() || no_gemm_) && IsOnGPU();
   }
 
   TensorShapeProto GetShape(const string& input_name) const {
@@ -572,10 +749,8 @@ class Conv2DProcessor : public NodeProcessor {
 
 class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
  public:
-  Conv2DBackpropFilterProcessor(GraphDef* graph, NodeDef* node,
-                                NodeMap* node_map, bool no_gemm,
-                                bool is_in_frame)
-      : Conv2DProcessor(graph, node, node_map, no_gemm, is_in_frame) {}
+  Conv2DBackpropFilterProcessor(const OptimizeContext& opt_cxt, bool no_gemm)
+      : Conv2DProcessor(opt_cxt, no_gemm) {}
 
  protected:
   bool IsGemmUsed() const override {
@@ -598,10 +773,8 @@ class Conv2DBackpropFilterProcessor : public Conv2DProcessor {
 
 class Conv2DBackpropInputProcessor : public Conv2DProcessor {
  public:
-  Conv2DBackpropInputProcessor(GraphDef* graph, NodeDef* node,
-                               NodeMap* node_map, bool no_gemm,
-                               bool is_in_frame)
-      : Conv2DProcessor(graph, node, node_map, no_gemm, is_in_frame) {}
+  Conv2DBackpropInputProcessor(const OptimizeContext& opt_cxt, bool no_gemm)
+      : Conv2DProcessor(opt_cxt, no_gemm) {}
 
  protected:
   bool IsGemmUsed() const override {
@@ -615,27 +788,47 @@ class Conv2DBackpropInputProcessor : public Conv2DProcessor {
     return input_pos;
   }
 
-  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(0); }
+  Status CustomizedProcessing() override {
+    auto input_size_node = node_map_->GetNode(node_->input(0));
+    if (IsConstant(*input_size_node)) {
+      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(0));
+    } else {
+      AddNodeDataFormatOp("DataFormatVecPermute", 0, DT_INT32);
+    }
+    return Status::OK();
+  }
 };
 
 class FusedBatchNormGradProcessor : public NodeProcessor {
  public:
-  FusedBatchNormGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                              bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit FusedBatchNormGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
+  bool ShouldProcess() const override {
+    return NodeProcessor::ShouldProcess() && IsTraining();
+  }
+
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos = {0, 1};
     return input_pos;
   }
+
+ private:
+  bool IsTraining() const {
+    if (node_->attr().find("is_training") != node_->attr().end()) {
+      if (node_->attr().at("is_training").b()) {
+        return true;
+      }
+    }
+    return false;
+  }
 };
 
 class MaxPoolGradProcessor : public NodeProcessor {
  public:
-  MaxPoolGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                       bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit MaxPoolGradProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -646,42 +839,69 @@ class MaxPoolGradProcessor : public NodeProcessor {
 
 class AgnosticNodeProcessor : public NodeProcessor {
  public:
-  AgnosticNodeProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                        bool is_in_frame)
-      : NodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit AgnosticNodeProcessor(const OptimizeContext& opt_cxt)
+      : NodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC();
+    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() && IsOnGPU();
   }
 
   bool IsNodeAfterNCHWToNHWC() const {
     std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
-    auto node = node_map_->GetNode(node_->name());
-    while (node->input_size() > 0) {
-      int data_input_pos = 0;
-      if (node->op().compare("Concat") == 0) {
-        data_input_pos = 1;
-      }
-      node = node_map_->GetNode(node->input(data_input_pos));
-      if (IsNodeNCHWToNHWC(node->name())) {
+    std::deque<NodeDef*> queue;
+    auto first_node_pos = DataInputPos(*node_);
+    for (const auto& pos : first_node_pos) {
+      auto input_node = node_map_->GetNode(node_->input(pos));
+      queue.push_back(input_node);
+    }
+    // The code will exit this while loop in one iteration in most cases, as the
+    // graph is already topologically sorted.
+    while (!queue.empty()) {
+      NodeDef* current_node = queue.front();
+      queue.pop_front();
+      if (IsNodeNCHWToNHWC(current_node->name())) {
         return true;
       }
-      bool connected =
-          ops_format_agnostic.find(node->name()) != ops_format_agnostic.end();
-      if (!connected) {
-        return false;
+      // We only continue searching if the path is connected through
+      // format-agnostic nodes.
+      if (ops_format_agnostic.find(current_node->op()) !=
+          ops_format_agnostic.end()) {
+        auto current_node_pos = DataInputPos(*current_node);
+        for (const auto& pos : current_node_pos) {
+          auto input_node = node_map_->GetNode(current_node->input(pos));
+          queue.push_back(input_node);
+        }
       }
     }
     return false;
   }
+
+ private:
+  std::vector<int> DataInputPos(const NodeDef& node) const {
+    std::vector<int> pos;
+    if (IsSplit(node)) {
+      return {1};
+    }
+    if (IsConcatV1(node)) {
+      return {1};
+    }
+    if (IsAdd(node) || IsMul(node) || IsRealDiv(node) ||
+        IsSquaredDifference(node) || IsSub(node)) {
+      return {0, 1};
+    }
+    if (node.input_size() > 0 && !IsControlInput(node.input(0))) {
+      return {0};
+    }
+    return {};
+  }
 };
 
 class AddNProcessor : public AgnosticNodeProcessor {
  public:
-  AddNProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit AddNProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   std::vector<int> GetInputPos() const override {
@@ -696,43 +916,47 @@ class AddNProcessor : public AgnosticNodeProcessor {
 
 class BinaryOpProcessor : public AgnosticNodeProcessor {
  public:
-  BinaryOpProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                    bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {
-    is_4d_with_vector_ = Is4DOperateWithVector();
-  }
+  explicit BinaryOpProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           (Is4DOperateWithND(4) || Is4DOperateWithScalar() ||
-            Is4DOperateWithVector());
+    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() &&
+           (IsNDOperateWithMD(4, 0) || IsNDOperateWithMD(4, 1) ||
+            IsNDOperateWithMD(4, 4) || IsNDOperateWithMD(0, 4) ||
+            IsNDOperateWithMD(1, 4)) &&
+           IsOnGPU();
   }
 
   std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0};
-    if (Is4DOperateWithND(4)) {
+    std::vector<int> input_pos;
+    auto input0 = node_map_->GetNode(node_->input(0));
+    auto input1 = node_map_->GetNode(node_->input(1));
+    if (IsDimsFour(*input0)) {
+      input_pos.push_back(0);
+    }
+    if (IsDimsFour(*input1)) {
       input_pos.push_back(1);
     }
     return input_pos;
   }
 
-  bool Is4DOperateWithND(int n) const {
+  bool IsDimsFour(const NodeDef& node) const {
+    return NodeProcessor::IsDimsFour(node) || IsNodeNCHWToNHWC(node.name());
+  }
+
+  bool IsNDOperateWithMD(int n, int m) const {
     auto input0 = node_map_->GetNode(node_->input(0));
     auto input1 = node_map_->GetNode(node_->input(1));
     if (input0 && input1) {
-      return (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
-             ((n == 4)
-                  ? (IsDimsFour(*input1) || IsNodeNCHWToNHWC(input1->name()))
-                  : IsDimsN(*input1, n));
+      bool input0_is_n = (n == 4) ? IsDimsFour(*input0) : IsDimsN(*input0, n);
+      bool input1_is_m = (m == 4) ? IsDimsFour(*input1) : IsDimsN(*input1, m);
+      return input0_is_n && input1_is_m;
     }
     return false;
   }
 
-  bool Is4DOperateWithScalar() const { return Is4DOperateWithND(0); }
-
-  bool Is4DOperateWithVector() const { return Is4DOperateWithND(1); }
-
   NodeDef* AddNodeShapeConst(const string& name, int num_channels) {
     NodeDef* node = graph_->add_node();
     node_map_->AddNode(name, node);
@@ -776,55 +1000,52 @@ class BinaryOpProcessor : public AgnosticNodeProcessor {
   }
 
   Status CustomizedProcessing() override {
-    if (is_4d_with_vector_) {
-      string base_name = strings::StrCat(node_->name(), "-", node_->input(1));
+    int vector_index = -1;
+    if (IsNDOperateWithMD(4, 1)) {
+      vector_index = 1;
+    } else if (IsNDOperateWithMD(1, 4)) {
+      vector_index = 0;
+    }
+    if (vector_index != -1) {
+      string base_name =
+          strings::StrCat(node_->name(), "-", node_->input(vector_index));
       string reshape_node_name =
           AddPrefixToNodeName(base_name, kReshapeNHWCToNCHW, "-");
       string shape_const_node_name =
           AddPrefixToNodeName(base_name, kReshapeConst, "-");
-      auto input_node = node_map_->GetNode(node_->input(1));
+      auto input_node = node_map_->GetNode(node_->input(vector_index));
       TF_RETURN_IF_ERROR(HasAttribute(*input_node, "_output_shapes"));
       int vector_size =
           input_node->attr().at("_output_shapes").list().shape(0).dim(0).size();
       AddNodeShapeConst(shape_const_node_name, vector_size);
       TF_RETURN_IF_ERROR(HasAttribute(*node_, "T"));
-      AddNodeReshape(reshape_node_name, node_->input(1), shape_const_node_name,
-                     node_->attr().at("T").type());
+      AddNodeReshape(reshape_node_name, node_->input(vector_index),
+                     shape_const_node_name, node_->attr().at("T").type());
       node_map_->AddOutput(shape_const_node_name, reshape_node_name);
-      node_map_->UpdateOutput(node_->input(1), node_->name(),
+      node_map_->UpdateOutput(node_->input(vector_index), node_->name(),
                               reshape_node_name);
       node_map_->AddOutput(reshape_node_name, node_->name());
-      *node_->mutable_input(1) = reshape_node_name;
+      *node_->mutable_input(vector_index) = reshape_node_name;
     }
     return Status::OK();
   }
-
- private:
-  bool is_4d_with_vector_;
 };
 
 class ConcatProcessor : public AgnosticNodeProcessor {
  public:
-  ConcatProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                  bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {
+  explicit ConcatProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {
     // For Concat,  the concat axis is the first input; for ConcatV2,
     // the last input.
-    axis_node_pos_ =
-        (node_->op().compare("Concat") == 0) ? 0 : (node_->input_size() - 1);
+    axis_node_pos_ = (IsConcatV1(*node_)) ? 0 : (node_->input_size() - 1);
   }
 
  protected:
-  bool ShouldProcess() const override {
-    return IsDimsFour(*node_) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           IsAlongDimC();
-  }
-
   std::vector<int> GetInputPos() const override {
     std::vector<int> input_pos;
-    int start = (node_->op().compare("Concat") == 0) ? 1 : 0;
-    int end = (node_->op().compare("Concat") == 0) ? node_->input_size()
-                                                   : (node_->input_size() - 1);
+    int start = (IsConcatV1(*node_)) ? 1 : 0;
+    int end =
+        (IsConcatV1(*node_)) ? node_->input_size() : (node_->input_size() - 1);
     for (int i = start; i < end; i++) {
       input_pos.push_back(i);
     }
@@ -832,227 +1053,101 @@ class ConcatProcessor : public AgnosticNodeProcessor {
   }
 
   Status CustomizedProcessing() override {
-    string concat_const_name = GetOrAddNodeConcatConst();
-    node_map_->AddOutput(concat_const_name, node_->name());
-    *node_->mutable_input(axis_node_pos_) = concat_const_name;
-    return Status::OK();
-  }
-
-  bool IsAlongDimC() const {
-    auto axis_node = node_map_->GetNode(node_->input(axis_node_pos_));
-    if (axis_node->attr().find("value") != axis_node->attr().end()) {
-      return axis_node->attr().at("value").tensor().int_val(0) == 3;
-    }
-    return false;
-  }
-
-  int axis_node_pos_;
-
- private:
-  NodeDef* AddNodeConcatConst(const string& suffix, const string& depended_node,
-                              const string& device) {
-    auto const_node = AddNodeConstScalar(
-        strings::StrCat(kConcatConst, "-", suffix), device, DT_INT32, 1);
-    // This is to ensure the concat node and the const node are
-    // in the same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
-
-  string GetOrAddNodeConcatConst() {
-    string const_name;
-    if (is_in_frame_) {
-      int value_node_pos = (axis_node_pos_ == 0) ? 1 : 0;
-      auto const_node = AddNodeConcatConst(
-          node_->name(), NodeName(node_->input(value_node_pos)),
-          node_->device());
-      const_name = const_node->name();
+    auto dim_node = node_map_->GetNode(node_->input(axis_node_pos_));
+    if (IsConstant(*dim_node)) {
+      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(axis_node_pos_));
     } else {
-      const_name = kConcatConst;
+      DataType dtype =
+          (IsSplit(*node_)) ? DT_INT32 : node_->attr().at("Tidx").type();
+      AddNodeDataFormatOp("DataFormatDimMap", axis_node_pos_, dtype);
     }
-    return const_name;
-  }
-};
-
-class ReluGradProcessor : public AgnosticNodeProcessor {
- public:
-  ReluGradProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                    bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
-
- protected:
-  std::vector<int> GetInputPos() const override {
-    std::vector<int> input_pos = {0, 1};
-    return input_pos;
+    return Status::OK();
   }
+  int axis_node_pos_;
 };
 
-class SliceProcessor : public AgnosticNodeProcessor {
+class PadProcessor : public AgnosticNodeProcessor {
  public:
-  SliceProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                 bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit PadProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
-  Status CustomizedProcessing() override {
-    // Skip the first input, which is the data to be sliced.
-    for (int i = 1; i < node_->input_size(); i++) {
-      string base_name = strings::StrCat(node_->name(), "-input", i);
-      string node_name =
-          AddPrefixToNodeName(base_name, kPermVecNHWCToNCHW, "-");
-      TF_RETURN_IF_ERROR(HasAttribute(*node_, "Index"));
-      AddNodePermVec(node_name, node_->input(i),
-                     node_->attr().at("Index").type(), true);
-      node_map_->UpdateOutput(node_->input(i), node_->name(), node_name);
-      node_map_->AddOutput(node_name, node_->name());
-      *node_->mutable_input(i) = node_name;
-    }
-    return Status::OK();
+  bool ShouldProcess() const override {
+    return !MustPreserve() && IsDimsFour(*node_) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() && PaddingSupported() && IsOnGPU();
   }
+  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(1); }
 
  private:
-  NodeDef* AddNodeGatherAxisConst(const string& suffix,
-                                  const string& depended_node,
-                                  const string& device) {
-    auto const_node = AddNodeConstScalar(
-        strings::StrCat(kGatherAxisConst, "-", suffix), device, DT_INT32, 0);
-    // This is to ensure the Slice node and the const node are
-    // in the same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
-
-  string GetOrAddNodeGatherAxisConst() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodeGatherAxisConst(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kGatherAxisConst;
+  bool PaddingSupported() const {
+    auto pad_const = node_map_->GetNode(node_->input(1));
+    bool is_const = IsConstant(*pad_const);
+    bool is_4D = false;
+    if (HasAttribute(*pad_const, "value").ok()) {
+      Tensor tensor;
+      if (tensor.FromProto(pad_const->mutable_attr()->at({"value"}).tensor())) {
+        if (tensor.dims() == 2) {
+          if (tensor.dim_size(0) == 4 && tensor.dim_size(1) == 2) {
+            is_4D = true;
+          }
+        }
+      }
     }
-    return const_name;
+    return is_const && is_4D;
   }
+};
 
-  string GetOrAddNodePermNHWCToNCHW() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodePermNHWCToNCHW(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kPermNHWCToNCHW;
-    }
-    return const_name;
+class SplitProcessor : public ConcatProcessor {
+ public:
+  explicit SplitProcessor(const OptimizeContext& opt_cxt)
+      : ConcatProcessor(opt_cxt) {
+    axis_node_pos_ = 0;
   }
 
-  string GetOrAddNodePermNCHWToNHWC() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodePermNCHWToNHWC(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kPermNCHWToNHWC;
-    }
-    return const_name;
+ protected:
+  std::vector<int> GetInputPos() const override {
+    std::vector<int> input_pos = {1};
+    return input_pos;
   }
 
-  void AddNodePermVec(const string& node_name, const string& input_name,
-                      DataType data_type, bool NHWCToNCHW) {
-    NodeDef* node = graph_->add_node();
-    node_map_->AddNode(node_name, node);
-    node->set_name(node_name);
-    *node->add_input() = input_name;
-    *node->add_input() = NHWCToNCHW ? GetOrAddNodePermNHWCToNCHW()
-                                    : GetOrAddNodePermNCHWToNHWC();
-    *node->add_input() = GetOrAddNodeGatherAxisConst();
-    node->set_op("GatherV2");
-
-    AttrValue attr_type_indices;
-    attr_type_indices.set_type(DT_INT32);
-    node->mutable_attr()->insert({"Tindices", attr_type_indices});
-
-    AttrValue attr_type_axis;
-    attr_type_axis.set_type(DT_INT32);
-    node->mutable_attr()->insert({"Taxis", attr_type_axis});
-
-    AttrValue attr_type_params;
-    attr_type_params.set_type(data_type);
-    node->mutable_attr()->insert({"Tparams", attr_type_params});
-
-    AttrValue attr_validate;
-    attr_validate.set_b(true);
-    node->mutable_attr()->insert({"validate_indices", attr_validate});
+  std::set<int> GetOutputPos() const override {
+    std::set<int> output_pos{0};
+    if (HasAttribute(*node_, "num_split").ok()) {
+      for (int i = 1; i < node_->attr().at("num_split").i(); i++) {
+        output_pos.insert(i);
+      }
+    }
+    return output_pos;
   }
 };
 
-// Specialized SliceProcessor, used if the second and third input are const
-// nodes, which could be the case if a constant folding pass is applied
-// before this optimization.
-class SliceProcessorConst : public AgnosticNodeProcessor {
+class ReluGradProcessor : public AgnosticNodeProcessor {
  public:
-  SliceProcessorConst(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                      bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit ReluGradProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
-  Status CustomizedProcessing() override {
-    // Skip the first input, which is the data to be sliced.
-    for (int i = 1; i < node_->input_size(); i++) {
-      TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(i));
-    }
-    return Status::OK();
+  std::vector<int> GetInputPos() const override {
+    std::vector<int> input_pos = {0, 1};
+    return input_pos;
   }
 };
 
-// Specialized SliceProcessor, used if the second input is ConcatOffset. An
-// example use case is in the gradient computation of Concat for InceptionV3.
-class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
+class SliceProcessor : public AgnosticNodeProcessor {
  public:
-  SliceProcessorConcatOffset(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                             bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SliceProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   Status CustomizedProcessing() override {
-    auto maybe_concatoffset_node =
-        node_map_->GetNode(NodeName(node_->input(1)));
-    if (maybe_concatoffset_node->op() == "ConcatOffset") {
-      auto maybe_axis_node =
-          node_map_->GetNode(maybe_concatoffset_node->input(0));
-      NodeDef* axis_node;
-      if (maybe_axis_node->op() == "Const") {
-        axis_node = maybe_axis_node;
-        // A FloorMod node might be added between ConcatOffset and the concat
-        // dimension const node to handle a negative dimension index -1, meaning
-        // the last dimension, which is consistent with the python's notation
-        // for negative index.
-      } else if (maybe_axis_node->op() == "FloorMod") {
-        axis_node = node_map_->GetNode(maybe_axis_node->input(0));
+    // Skip the first input, which is the data to be sliced.
+    for (int i = 1; i < node_->input_size(); i++) {
+      auto index_node = node_map_->GetNode(node_->input(i));
+      if (IsConstant(*index_node)) {
+        TF_RETURN_IF_ERROR(UpdateAttrValueOfInput(i));
       } else {
-        return Status(error::INVALID_ARGUMENT,
-                      strings::StrCat("Expect either Const or FloorMod for the "
-                                      "input 1 of ConcatOffset"));
-      }
-      // Need to process if the channel is at dimension 3, which indicates the
-      // NHWC format is being used. As multiple Slice nodes may share the same
-      // ConcatOffset node, the NHWC to NCHW conversion may have already
-      // been performed when processing other Slice nodes.
-      TF_RETURN_IF_ERROR(HasAttribute(*axis_node, "value"));
-      int concat_dim = axis_node->attr().at("value").tensor().int_val(0);
-      if (concat_dim == -1 || concat_dim == 3) {
-        // Update the dimension order for shape input nodes. Note that the input
-        // 2 of Slice also shares one of the shape nodes.
-        for (int i = 1; i < maybe_concatoffset_node->input_size(); i++) {
-          auto shape_node =
-              node_map_->GetNode(maybe_concatoffset_node->input(i));
-          TF_RETURN_IF_ERROR(UpdateAttrValue(shape_node));
-        }
-        // Set the channel dimension to 1, as we have converted the vector
-        // element order from NHWC to NCHW.
-        axis_node->mutable_attr()->at("value").mutable_tensor()->set_int_val(0,
-                                                                             1);
+        AddNodeDataFormatOp("DataFormatVecPermute", i,
+                            node_->attr().at("Index").type());
       }
     }
     return Status::OK();
@@ -1061,14 +1156,14 @@ class SliceProcessorConcatOffset : public AgnosticNodeProcessor {
 
 class SqueezeProcessor : public AgnosticNodeProcessor {
  public:
-  SqueezeProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-                   bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SqueezeProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
-    return IsDimsN(*node_, 2) && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
-           IsInputConvertible() && IsAlongDimHW();
+    return !MustPreserve() && IsDimsN(*node_, 2) && HasOutputs() &&
+           IsNodeAfterNCHWToNHWC() && IsInputConvertible() && IsAlongDimHW() &&
+           IsOnGPU();
   }
 
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
@@ -1111,34 +1206,34 @@ class SqueezeProcessor : public AgnosticNodeProcessor {
 
 class SumProcessor : public AgnosticNodeProcessor {
  public:
-  SumProcessor(GraphDef* graph, NodeDef* node, NodeMap* node_map,
-               bool is_in_frame)
-      : AgnosticNodeProcessor(graph, node, node_map, is_in_frame) {}
+  explicit SumProcessor(const OptimizeContext& opt_cxt)
+      : AgnosticNodeProcessor(opt_cxt) {}
 
  protected:
   bool ShouldProcess() const override {
     auto input0 = node_map_->GetNode(node_->input(0));
-    return HasOutputs() && IsNodeAfterNCHWToNHWC() &&
+    return !MustPreserve() && HasOutputs() && IsNodeAfterNCHWToNHWC() &&
            (IsDimsFour(*input0) || IsNodeNCHWToNHWC(input0->name())) &&
-           IsAlongDimNHW();
+           IsAlongDimNHW() && IsOnGPU();
   }
 
   Status AddLayoutTransposeToOutputs() override { return Status::OK(); }
 
-  Status CustomizedProcessing() override {
-    node_map_->AddOutput(kReductionConst, node_->name());
-    *node_->mutable_input(1) = GetOrAddNodeReductionConst();
-    return Status::OK();
-  }
+  Status CustomizedProcessing() override { return UpdateAttrValueOfInput(1); }
 
  private:
   bool IsAlongDimNHW() const {
-    NodeDef* node = node_map_->GetNode(node_->input(1));
+    NodeDef* reduction_indices = node_map_->GetNode(node_->input(1));
+    if (!IsConstant(*reduction_indices)) {
+      return false;
+    }
     Tensor tensor;
-    if (node->attr().find({"value"}) == node->attr().end()) {
+    if (reduction_indices->attr().find({"value"}) ==
+        reduction_indices->attr().end()) {
       return false;
     }
-    auto success = tensor.FromProto(node->attr().at({"value"}).tensor());
+    auto success =
+        tensor.FromProto(reduction_indices->attr().at({"value"}).tensor());
     if (!success) {
       LOG(ERROR) << "Failed to parse TensorProto.";
       return false;
@@ -1152,119 +1247,82 @@ class SumProcessor : public AgnosticNodeProcessor {
     }
     return false;
   }
-
-  NodeDef* AddNodeReductionConst(const string& suffix,
-                                 const string& depended_node,
-                                 const string& device) {
-    auto const_node = GraphProcessor::AddNodeReductionConst(
-        strings::StrCat(kReductionConst, "-", suffix), device);
-    // This is to ensure the Sum node and the const node are in the
-    // same frame.
-    *const_node->add_input() = AsControlDependency(depended_node);
-    return const_node;
-  }
-
-  string GetOrAddNodeReductionConst() {
-    string const_name;
-    if (is_in_frame_) {
-      auto const_node = AddNodeReductionConst(
-          node_->name(), NodeName(node_->input(0)), node_->device());
-      const_name = const_node->name();
-    } else {
-      const_name = kReductionConst;
-    }
-    return const_name;
-  }
-};
-
-struct TuningConfig {
-  // If true, do not use the NHWC GEMM implementation. When filter size is
-  // one or filter size is equal to input image size,
-  // the NHWC implementation of Conv2D, Conv2DBackpropInput, and
-  // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
-  // usually faster than the NCHW implementation. The downside is that this
-  // might result in more non-cancellable layout conversion nodes (implemented
-  // by the Transpose op).
-  bool no_gemm;
 };
 
 class DataLayoutOptimizer : GraphProcessor {
  public:
-  explicit DataLayoutOptimizer(const string& default_device, GraphDef* graph,
-                               NodeMap* node_map, TuningConfig config)
-      : GraphProcessor(graph, node_map),
-        default_device_(default_device),
+  explicit DataLayoutOptimizer(
+      const VirtualPlacer& virtual_placer,
+      const LayoutOptimizer::TuningConfig& config,
+      const std::unordered_set<string>& nodes_to_preserve, GraphDef* graph,
+      NodeMap* node_map)
+      : GraphProcessor(virtual_placer, nodes_to_preserve, graph, node_map),
         config_(config) {}
 
   Status Optimize() {
-    LOG(INFO) << "Number of nodes for original graph: " << graph_->node_size();
+    VLOG(1) << "Number of nodes for original graph: " << graph_->node_size();
     TF_RETURN_IF_ERROR(Expand());
-    LOG(INFO) << "Number of nodes after Expand: " << graph_->node_size();
+    VLOG(1) << "Number of nodes after Expand: " << graph_->node_size();
     TF_RETURN_IF_ERROR(Collapse());
-    LOG(INFO) << "Number of nodes after Collapse: " << graph_->node_size();
+    VLOG(1) << "Number of nodes after Collapse: " << graph_->node_size();
     return Status::OK();
   }
 
  private:
   NodeDef* AddNodePermNHWCToNCHW() {
-    return AddNodePermConst(kPermNHWCToNCHW, default_device_, {0, 3, 1, 2});
+    return AddNodePermConst(kPermNHWCToNCHW, "", {0, 3, 1, 2});
   }
 
   NodeDef* AddNodePermNCHWToNHWC() {
-    return AddNodePermConst(kPermNCHWToNHWC, default_device_, {0, 2, 3, 1});
-  }
-
-  NodeDef* AddNodeConcatConst() {
-    return AddNodeConstScalar(kConcatConst, default_device_, DT_INT32, 1);
-  }
-
-  NodeDef* AddNodeGatherAxisConst() {
-    return AddNodeConstScalar(kGatherAxisConst, default_device_, DT_INT32, 0);
-  }
-
-  NodeDef* AddNodeReductionConst() {
-    return GraphProcessor::AddNodeReductionConst(kReductionConst,
-                                                 default_device_);
+    return AddNodePermConst(kPermNCHWToNHWC, "", {0, 2, 3, 1});
   }
 
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
   Status Expand() {
     int node_size_original = graph_->node_size();
     std::unordered_map<const NodeDef*, std::vector<int>> frames;
-    IdentifyFrames(*graph_, &frames);
+    int num_frames;
+    TF_RETURN_IF_ERROR(IdentifyFrames(*graph_, &frames, &num_frames));
 
     // This is the first pass where we expand the nodes which support NCHW.
     std::set<string> ops_format_supported = GetOpsFormatSupported();
     for (int i = 0; i < node_size_original; i++) {
+      if (IsNodeByLayoutOptimizer(graph_->node(i).name())) {
+        return Status(error::INVALID_ARGUMENT,
+                      "The graph is already optimized by layout optimizer.");
+      }
       if (ops_format_supported.find(graph_->node(i).op()) !=
           ops_format_supported.end()) {
         auto node = graph_->mutable_node(i);
         bool is_in_frame = !frames[node].empty();
+        OptimizeContext opt_cxt(graph_, node, node_map_, virtual_placer_,
+                                nodes_to_preserve_, is_in_frame);
         std::unique_ptr<NodeProcessor> node_processor;
-        if (node->op().compare("AvgPoolGrad") == 0) {
+        if (IsAvgPoolGrad(*node)) {
+          node_processor.reset(new AvgPoolGradProcessor(opt_cxt));
+        } else if (IsBiasAddGrad(*node)) {
+          node_processor.reset(new BiasAddGradProcessor(opt_cxt));
+        } else if (IsConv2D(*node)) {
+          node_processor.reset(new Conv2DProcessor(opt_cxt, config_.no_gemm));
+        } else if (IsConv2DBackpropFilter(*node)) {
           node_processor.reset(
-              new AvgPoolGradProcessor(graph_, node, node_map_, is_in_frame));
-        } else if (node->op().compare("BiasAddGrad") == 0) {
+              new Conv2DBackpropFilterProcessor(opt_cxt, config_.no_gemm));
+        } else if (IsConv2DBackpropInput(*node)) {
           node_processor.reset(
-              new BiasAddGradProcessor(graph_, node, node_map_, is_in_frame));
-        } else if (node->op().compare("Conv2D") == 0) {
-          node_processor.reset(new Conv2DProcessor(
-              graph_, node, node_map_, config_.no_gemm, is_in_frame));
-        } else if (node->op().compare("Conv2DBackpropFilter") == 0) {
-          node_processor.reset(new Conv2DBackpropFilterProcessor(
-              graph_, node, node_map_, config_.no_gemm, is_in_frame));
-        } else if (node->op().compare("Conv2DBackpropInput") == 0) {
-          node_processor.reset(new Conv2DBackpropInputProcessor(
-              graph_, node, node_map_, config_.no_gemm, is_in_frame));
-        } else if (node->op().compare("FusedBatchNormGrad") == 0) {
-          node_processor.reset(new FusedBatchNormGradProcessor(
-              graph_, node, node_map_, is_in_frame));
-        } else if (node->op().compare("MaxPoolGrad") == 0) {
+              new Conv2DBackpropInputProcessor(opt_cxt, config_.no_gemm));
+        } else if (IsDepthwiseConv2dNative(*node)) {
+          node_processor.reset(new Conv2DProcessor(opt_cxt, true));
+        } else if (IsDepthwiseConv2dNativeBackpropFilter(*node)) {
           node_processor.reset(
-              new MaxPoolGradProcessor(graph_, node, node_map_, is_in_frame));
+              new Conv2DBackpropFilterProcessor(opt_cxt, true));
+        } else if (IsDepthwiseConv2dNativeBackpropInput(*node)) {
+          node_processor.reset(new Conv2DBackpropInputProcessor(opt_cxt, true));
+        } else if (IsFusedBatchNormGradV1(*node)) {
+          node_processor.reset(new FusedBatchNormGradProcessor(opt_cxt));
+        } else if (IsMaxPoolGradV1(*node)) {
+          node_processor.reset(new MaxPoolGradProcessor(opt_cxt));
         } else {
-          node_processor.reset(
-              new NodeProcessor(graph_, node, node_map_, is_in_frame));
+          node_processor.reset(new NodeProcessor(opt_cxt));
         }
         TF_RETURN_IF_ERROR(node_processor->ConvertNode());
       }
@@ -1276,55 +1334,36 @@ class DataLayoutOptimizer : GraphProcessor {
     if (graph_->node_size() > node_size_original) {
       NodeDef* n = AddNodePermNHWCToNCHW();
       n = AddNodePermNCHWToNHWC();
-      n = AddNodeConcatConst();
-      n = AddNodeGatherAxisConst();
-      n = AddNodeReductionConst();
       std::set<string> ops_format_agnostic = GetOpsFormatAgnostic();
       for (int i = 0; i < graph_->node_size(); i++) {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
             ops_format_agnostic.end()) {
           auto node = graph_->mutable_node(i);
           bool is_in_frame = !frames[node].empty();
+          OptimizeContext opt_cxt(graph_, node, node_map_, virtual_placer_,
+                                  nodes_to_preserve_, is_in_frame);
           std::unique_ptr<NodeProcessor> node_processor;
-          if (node->op().compare("AddN") == 0) {
-            node_processor.reset(
-                new AddNProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Add") == 0 ||
-                     node->op().compare("Mul") == 0 ||
-                     node->op().compare("RealDiv") == 0 ||
-                     node->op().compare("SquaredDifference") == 0 ||
-                     node->op().compare("Sub") == 0) {
-            node_processor.reset(
-                new BinaryOpProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Concat") == 0 ||
-                     node->op().compare("ConcatV2") == 0) {
-            node_processor.reset(
-                new ConcatProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("ReluGrad") == 0) {
-            node_processor.reset(
-                new ReluGradProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Slice") == 0) {
-            auto input1 = node_map_->GetNode(NodeName(node->input(1)));
-            auto input2 = node_map_->GetNode(NodeName(node->input(2)));
-            if (input1->op() == "ConcatOffset") {
-              node_processor.reset(new SliceProcessorConcatOffset(
-                  graph_, node, node_map_, is_in_frame));
-            } else if (input1->op() == "Const" && input2->op() == "Const") {
-              node_processor.reset(new SliceProcessorConst(
-                  graph_, node, node_map_, is_in_frame));
-            } else {
-              node_processor.reset(
-                  new SliceProcessor(graph_, node, node_map_, is_in_frame));
-            }
-          } else if (node->op().compare("Squeeze") == 0) {
-            node_processor.reset(
-                new SqueezeProcessor(graph_, node, node_map_, is_in_frame));
-          } else if (node->op().compare("Sum") == 0) {
-            node_processor.reset(
-                new SumProcessor(graph_, node, node_map_, is_in_frame));
+          if (IsAddN(*node)) {
+            node_processor.reset(new AddNProcessor(opt_cxt));
+          } else if (IsAdd(*node) || IsMul(*node) || IsRealDiv(*node) ||
+                     IsSquaredDifference(*node) || IsSub(*node)) {
+            node_processor.reset(new BinaryOpProcessor(opt_cxt));
+          } else if (IsConcat(*node)) {
+            node_processor.reset(new ConcatProcessor(opt_cxt));
+          } else if (IsPad(*node)) {
+            node_processor.reset(new PadProcessor(opt_cxt));
+          } else if (IsReluGrad(*node)) {
+            node_processor.reset(new ReluGradProcessor(opt_cxt));
+          } else if (IsSlice(*node)) {
+            node_processor.reset(new SliceProcessor(opt_cxt));
+          } else if (IsSplit(*node)) {
+            node_processor.reset(new SplitProcessor(opt_cxt));
+          } else if (IsSqueeze(*node)) {
+            node_processor.reset(new SqueezeProcessor(opt_cxt));
+          } else if (IsSum(*node)) {
+            node_processor.reset(new SumProcessor(opt_cxt));
           } else {
-            node_processor.reset(new AgnosticNodeProcessor(
-                graph_, node, node_map_, is_in_frame));
+            node_processor.reset(new AgnosticNodeProcessor(opt_cxt));
           }
           TF_RETURN_IF_ERROR(node_processor->ConvertNode());
         }
@@ -1371,8 +1410,7 @@ class DataLayoutOptimizer : GraphProcessor {
     return Status::OK();
   }
 
-  string default_device_;
-  TuningConfig config_;
+  const LayoutOptimizer::TuningConfig& config_;
 };
 
 int GetNumTranspose(const GraphDef& graph) {
@@ -1382,60 +1420,69 @@ int GetNumTranspose(const GraphDef& graph) {
       number++;
     }
   }
-  LOG(INFO) << "Number of Transpose nodes: " << number;
+  VLOG(1) << "Number of Transpose nodes: " << number;
   return number;
 }
 
+int GetNumGPUs(const Cluster& cluster) {
+  auto devices = cluster.GetDevices();
+  int num_gpus = 0;
+  for (const auto& device : devices) {
+    if (device.second.type() == "GPU") {
+      if (device.second.environment().find("architecture") !=
+          device.second.environment().end()) {
+        const string arch = device.second.environment().at("architecture");
+        // TODO(yaozhang): Enable for Volta GPUs (compute capability version 7).
+        if (arch < "7") {
+          num_gpus++;
+        }
+      }
+    }
+  }
+  return num_gpus;
+}
+}  // namespace
+
+Status LayoutOptimizer::Tune(const GrapplerItem& item,
+                             const GraphProperties& graph_properties,
+                             const TuningConfig& config, GraphDef* output) {
+  auto status = graph_properties.AnnotateOutputShapes(output);
+  if (!status.ok()) {
+    *output = item.graph;
+    return status;
+  }
+  NodeMap node_map(output);
+  DataLayoutOptimizer layout_optimizer(*virtual_placer_, config,
+                                       nodes_to_preserve_, output, &node_map);
+  status = layout_optimizer.Optimize();
+  return status;
+}
+
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
-  if (num_gpus_ == 0) {
-    num_gpus_ = GetNumAvailableGPUs();
-  }
-  if (num_gpus_ < 1) {
+  if (GetNumGPUs(*cluster) < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     *output = item.graph;
     return Status::OK();
   }
 
+  virtual_placer_.reset(new VirtualPlacer(cluster));
+  nodes_to_preserve_ = item.NodesToPreserve();
   GraphProperties graph_properties(item);
-  auto status = graph_properties.InferStatically();
-  if (!status.ok()) {
-    *output = item.graph;
-    return status;
-  }
-  status = graph_properties.AnnotateOutputShapes(output);
+  auto status = graph_properties.InferStatically(false);
   if (!status.ok()) {
     *output = item.graph;
     return status;
   }
 
   TuningConfig config;
-  config.no_gemm = false;
-  string default_device = "/job:localhost/replica:0/task:0/cpu:0";
-  if (cluster) {
-    if (!cluster->GetDevices().empty()) {
-      default_device = cluster->GetDevices().begin()->first;
-    }
-  }
-  std::unique_ptr<NodeMap> node_map(new NodeMap(output));
-  std::unique_ptr<DataLayoutOptimizer> layout_optimizer(
-      new DataLayoutOptimizer(default_device, output, node_map.get(), config));
-  status = layout_optimizer->Optimize();
-  // This is based on an empirical observation that if the introduced Transpose
-  // nodes is more than 30, not using GEMM implementation would result in better
-  // performance.
-  if (status.ok() && GetNumTranspose(*output) > 30) {
-    config.no_gemm = true;
-    node_map.reset(new NodeMap(output));
-    layout_optimizer.reset(new DataLayoutOptimizer(default_device, output,
-                                                   node_map.get(), config));
-    status = layout_optimizer->Optimize();
-  }
-
+  config.no_gemm = true;
+  // TODO(yaozhang): Enable tuning with various TuningConfig choices wtih
+  // the measurement-based estimator.
+  status = Tune(item, graph_properties, config, output);
   if (!status.ok()) {
     *output = item.graph;
   }
-
   return status;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.h b/tensorflow/core/grappler/optimizers/layout_optimizer.h
index 1bd6f9544b1da87fc86201aef67f151cd06c7124..357205828ddea3f35a6dd202606a5b59d8baa5a5 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.h
@@ -16,11 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
 #define TENSORFLOW_GRAPPLER_OPTIMIZERS_LAYOUT_OPTIMIZER_H_
 
+#include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 
 namespace tensorflow {
 namespace grappler {
-
 // Convert the NHWC layout to NCHW for Conv-related ops on GPUs.
 class LayoutOptimizer : public GraphOptimizer {
  public:
@@ -29,8 +30,16 @@ class LayoutOptimizer : public GraphOptimizer {
 
   string name() const override { return "layout"; };
 
-  // This is for testing only.
-  void set_num_gpus(int num_gpus) { num_gpus_ = num_gpus; };
+  struct TuningConfig {
+    // If true, do not use the NHWC GEMM implementation. When filter size is
+    // one or filter size is equal to input image size,
+    // the NHWC implementation of Conv2D, Conv2DBackpropInput, and
+    // Conv2DBackpropFilter will use a specialized GEMM implementation, which is
+    // usually faster than the NCHW implementation. The downside is that this
+    // might result in more non-cancellable layout conversion nodes (implemented
+    // by the Transpose op).
+    bool no_gemm;
+  };
 
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* output) override;
@@ -39,7 +48,10 @@ class LayoutOptimizer : public GraphOptimizer {
                 const GraphDef& optimize_output, double result) override;
 
  private:
-  int num_gpus_ = 0;
+  std::unique_ptr<VirtualPlacer> virtual_placer_;
+  std::unordered_set<string> nodes_to_preserve_;
+  Status Tune(const GrapplerItem& item, const GraphProperties& graph_properties,
+              const TuningConfig& config, GraphDef* output);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 7ebc9aaf1c18607e45e1b70a46552aec94bf35d7..6e1f47f0d37558f7ff73a5ff3fd9e762cf71307a 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -17,10 +17,12 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -28,9 +30,21 @@ namespace {
 
 class LayoutOptimizerTest : public ::testing::Test {
  protected:
+  void SetUp() override {
+    DeviceProperties device_properties;
+    device_properties.set_type("GPU");
+    device_properties.mutable_environment()->insert({"architecture", "6"});
+    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+  }
+
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
                       const string& padding) {
-    int batch_size = 128;
+    return SimpleConv2D(s, input_size, filter_size, padding, "");
+  }
+
+  Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
+                      const string& padding, const string& device) {
+    int batch_size = 8;
     int input_height = input_size;
     int input_width = input_size;
     int input_depth = 3;
@@ -50,13 +64,19 @@ class LayoutOptimizerTest : public ::testing::Test {
     Output filter =
         ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
 
-    Output conv = ops::Conv2D(s->WithOpName("Conv2D"), input, filter,
-                              {1, stride, stride, 1}, padding);
+    Output conv = ops::Conv2D(s->WithOpName("Conv2D").WithDevice(device), input,
+                              filter, {1, stride, stride, 1}, padding);
     return conv;
   }
 
   Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
                                    int filter_size, const string& padding) {
+    return SimpleConv2DBackpropInput(s, input_size, filter_size, padding, true);
+  }
+
+  Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
+                                   int filter_size, const string& padding,
+                                   bool const_input_size) {
     int batch_size = 128;
     int input_height = input_size;
     int input_width = input_size;
@@ -86,11 +106,18 @@ class LayoutOptimizerTest : public ::testing::Test {
     Output output =
         ops::Const(s->WithOpName("Output"), Input::Initializer(output_data));
 
-    Output conv_backprop_input = ops::Conv2DBackpropInput(
-        s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
-        {1, stride, stride, 1}, padding);
-    TensorShape input_shape(
-        {batch_size, input_height, input_width, input_depth});
+    Output conv_backprop_input;
+    Output input_sizes_i =
+        ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
+    if (const_input_size) {
+      conv_backprop_input = ops::Conv2DBackpropInput(
+          s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
+          {1, stride, stride, 1}, padding);
+    } else {
+      conv_backprop_input = ops::Conv2DBackpropInput(
+          s->WithOpName("Conv2DBackpropInput"), input_sizes_i, filter, output,
+          {1, stride, stride, 1}, padding);
+    }
     return conv_backprop_input;
   }
 
@@ -99,6 +126,38 @@ class LayoutOptimizerTest : public ::testing::Test {
     CHECK(tensor.FromProto(node.attr().at({"value"}).tensor()));
     return tensor;
   }
+
+  Output SimpleFusedBatchNormGrad(tensorflow::Scope* s, bool is_training) {
+    int batch_size = 16;
+    int input_height = 8;
+    int input_width = 8;
+    int input_channels = 3;
+    TensorShape shape({batch_size, input_height, input_width, input_channels});
+    Tensor data(DT_FLOAT, shape);
+    test::FillIota<float>(&data, 1.0f);
+    Output x = ops::Const(s->WithOpName("Input"), Input::Initializer(data));
+    Output y_backprop =
+        ops::Const(s->WithOpName("YBackprop"), Input::Initializer(data));
+
+    TensorShape shape_vector({input_channels});
+    Tensor data_vector(DT_FLOAT, shape_vector);
+    test::FillIota<float>(&data_vector, 2.0f);
+    Output scale =
+        ops::Const(s->WithOpName("Scale"), Input::Initializer(data_vector));
+    Output reserve1 =
+        ops::Const(s->WithOpName("Reserve1"), Input::Initializer(data_vector));
+    Output reserve2 =
+        ops::Const(s->WithOpName("Reserve2"), Input::Initializer(data_vector));
+
+    ops::FusedBatchNormGrad::Attrs attrs;
+    attrs.is_training_ = is_training;
+    auto output =
+        ops::FusedBatchNormGrad(s->WithOpName("FusedBatchNormGrad"), y_backprop,
+                                x, scale, reserve1, reserve2, attrs);
+    return output.x_backprop;
+  }
+
+  std::unique_ptr<VirtualCluster> virtual_cluster_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -108,9 +167,9 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   string input_name = AddPrefixToNodeName("Conv2DBackpropInput-InputSizes",
                                           "LayoutOptimizer", "-");
@@ -125,6 +184,28 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   test::ExpectTensorEqual<int>(input_sizes_expected, input_sizes);
 }
 
+TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME", false);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv2d_backprop_node = node_map.GetNode("Conv2DBackpropInput");
+  CHECK(conv2d_backprop_node);
+  EXPECT_EQ(conv2d_backprop_node->input(0),
+            "LayoutOptimizerDataFormatOp_Conv2DBackpropInput_0");
+  auto input_sizes_node =
+      node_map.GetNode("LayoutOptimizerDataFormatOp_Conv2DBackpropInput_0");
+  CHECK(input_sizes_node);
+  EXPECT_EQ(input_sizes_node->input(0), "InputSizesIdentity");
+  EXPECT_EQ(input_sizes_node->op(), "DataFormatVecPermute");
+}
+
 TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv = SimpleConv2D(&s, 2, 1, "SAME");
@@ -132,9 +213,8 @@ TEST_F(LayoutOptimizerTest, FilterSizeIsOne) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_FALSE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
@@ -147,9 +227,8 @@ TEST_F(LayoutOptimizerTest, FilterSizeNotOne) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_FALSE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
@@ -162,9 +241,8 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithValidPadding) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_FALSE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input"));
@@ -177,9 +255,8 @@ TEST_F(LayoutOptimizerTest, EqualSizeWithSamePadding) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_TRUE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input-0"));
@@ -192,14 +269,639 @@ TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   LayoutOptimizer optimizer;
-  optimizer.set_num_gpus(1);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
   NodeMap node_map(&output);
   EXPECT_TRUE(
       node_map.GetNode("LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Input-0"));
 }
 
+TEST_F(LayoutOptimizerTest, Pad) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), {1, 2, 3, 4, 5, 6, 7, 8}, {4, 2});
+  auto p = ops::Pad(s.WithOpName("p"), conv, c);
+  auto o = ops::Identity(s.WithOpName("o"), p);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+
+  auto pad = node_map.GetNode("p");
+  EXPECT_EQ(pad->input(0), "Conv2D");
+
+  auto pad_const = node_map.GetNode("LayoutOptimizer-p-c");
+  EXPECT_TRUE(pad_const);
+  EXPECT_TRUE(pad_const->attr().find("value") != pad_const->attr().end());
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(pad_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4, 2});
+  test::FillValues<int>(&tensor_expected, {1, 2, 7, 8, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, Connectivity) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto i1 = ops::Identity(s.WithOpName("i1"), conv);
+  auto i2 = ops::Identity(s.WithOpName("i2"), i1);
+  auto i3 = ops::Identity(s.WithOpName("i3"), i2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  // Make the graph not in topological order to test the handling of multi-hop
+  // connectivity (here we say two nodes are connected if all nodes in the
+  // middle are layout agnostic). If the graph is already in topological order,
+  // the problem is easier, where layout optimizer only needs to check
+  // single-hop connectivity.
+  NodeMap node_map_original(&item.graph);
+  auto node_i1 = node_map_original.GetNode("i1");
+  auto node_i2 = node_map_original.GetNode("i2");
+  node_i2->Swap(node_i1);
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map_output(&output);
+  auto node_i2_output = node_map_output.GetNode("i2");
+  // Layout optimizer should process i2, as it detects i2 is connected with the
+  // Conv2D node two hops away. Similarly i1 is processed as well, as i1 is
+  // directly connected to the Conv2D node. The two added transposes between
+  // i1 and i2 should cancel each other, and as a result i2 is directly
+  // connected to i1.
+  EXPECT_EQ(node_i2_output->input(0), "i1");
+}
+
+TEST_F(LayoutOptimizerTest, ConnectivityBinaryOpWithInputScalarAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto i1 = ops::Identity(s.WithOpName("i1"), conv);
+  auto i2 = ops::Identity(s.WithOpName("i2"), i1);
+  auto scalar_sub = ops::Const(s.WithOpName("scalar_sub"), 3.0f, {});
+  auto sub = ops::Sub(s.WithOpName("sub"), scalar_sub, i2);
+  auto i3 = ops::Identity(s.WithOpName("i3"), sub);
+  auto i4 = ops::Identity(s.WithOpName("i4"), i3);
+  auto i5 = ops::Identity(s.WithOpName("i5"), i4);
+  auto scalar_mul = ops::Const(s.WithOpName("scalar_mul"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar_mul, i5);
+  auto i6 = ops::Identity(s.WithOpName("i6"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  // Make the graph not in topological order to test the handling of multi-hop
+  // connectivity (here we say two nodes are connected if all nodes in the
+  // middle are layout agnostic). If the graph is already in topological order,
+  // the problem is easier, where layout optimizer only needs to check
+  // single-hop connectivity.
+  NodeMap node_map_original(&item.graph);
+  auto node_i1 = node_map_original.GetNode("i1");
+  auto node_mul = node_map_original.GetNode("mul");
+  node_mul->Swap(node_i1);
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map_output(&output);
+  auto mul_node = node_map_output.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "scalar_mul");
+  EXPECT_EQ(mul_node->input(1), "i5");
+}
+
+TEST_F(LayoutOptimizerTest, PreserveFetch) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto i = ops::Identity(s.WithOpName("i"), conv);
+  GrapplerItem item;
+  item.fetch.push_back("Conv2D");
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, EmptyDevice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NCHW");
+}
+
+TEST_F(LayoutOptimizerTest, GPUDevice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv =
+      SimpleConv2D(&s, 3, 2, "VALID", "/job:w/replica:0/task:0/device:gpu:0");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NCHW");
+}
+
+TEST_F(LayoutOptimizerTest, CPUDeviceLowercase) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv =
+      SimpleConv2D(&s, 3, 2, "VALID", "/job:w/replica:0/task:0/device:cpu:0");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, CPUDeviceUppercase) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID", "/CPU:0");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("Conv2D");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, FusedBatchNormGradTrainingTrue) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x_backprop = SimpleFusedBatchNormGrad(&s, true);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {x_backprop});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("FusedBatchNormGrad");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NCHW");
+}
+
+TEST_F(LayoutOptimizerTest, FusedBatchNormGradTrainingFalse) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x_backprop = SimpleFusedBatchNormGrad(&s, false);
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {x_backprop});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto conv_node = node_map.GetNode("FusedBatchNormGrad");
+  EXPECT_EQ(conv_node->attr().at({"data_format"}).s(), "NHWC");
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimC) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 3, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimH) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 1, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 2);
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimW) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 2, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 3);
+}
+
+TEST_F(LayoutOptimizerTest, SplitDimN) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 0, {});
+  auto split = ops::Split(s.WithOpName("split"), c, conv, 2);
+  auto i = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizer-split-c");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto split_const = node_map.GetNode("LayoutOptimizer-split-c");
+  EXPECT_EQ(split_const->op(), "Const");
+  EXPECT_EQ(split_const->attr().at({"value"}).tensor().int_val(0), 0);
+}
+
+TEST_F(LayoutOptimizerTest, SplitNonConstDim) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto c = ops::Const(s.WithOpName("c"), 0, {});
+  auto i1 = ops::Identity(s.WithOpName("i1"), c);
+  auto split = ops::Split(s.WithOpName("split"), i1, conv, 2);
+  auto i2 = ops::Identity(s.WithOpName("i"), split[0]);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto split_node = node_map.GetNode("split");
+  EXPECT_EQ(split_node->input(0), "LayoutOptimizerDataFormatOp_split_0");
+  EXPECT_EQ(split_node->input(1), "Conv2D");
+  auto map_node = node_map.GetNode("LayoutOptimizerDataFormatOp_split_0");
+  EXPECT_EQ(map_node->op(), "DataFormatDimMap");
+  EXPECT_EQ(map_node->input(0), "i1");
+}
+
+TEST_F(LayoutOptimizerTest, SplitSamePortToMultipleInputsOfSameNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 3);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat =
+      ops::Concat(s.WithOpName("concat"), {split[1], split[1], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split:1");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "split:1");
+  EXPECT_EQ(concat_node->input(3), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimH) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 1);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 2);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatNonConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 1);
+  auto i = ops::Identity(s.WithOpName("i"), axis);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, i);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizerDataFormatOp_concat_2");
+  auto concat_dim = node_map.GetNode("LayoutOptimizerDataFormatOp_concat_2");
+  EXPECT_EQ(concat_dim->op(), "DataFormatDimMap");
+  EXPECT_EQ(concat_dim->input(0), "i");
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimW) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 2);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 3);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimN) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 0);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 0);
+}
+
+TEST_F(LayoutOptimizerTest, ConcatDimC) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto axis = ops::Const(s.WithOpName("axis"), 3);
+  auto split = ops::Split(s.WithOpName("split"), axis, conv, 2);
+  auto concat = ops::Concat(s.WithOpName("concat"), {split[0], split[1]}, axis);
+  auto o = ops::Identity(s.WithOpName("o"), concat);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto concat_node = node_map.GetNode("concat");
+  EXPECT_EQ(concat_node->input(0), "split");
+  EXPECT_EQ(concat_node->input(1), "split:1");
+  EXPECT_EQ(concat_node->input(2), "LayoutOptimizer-concat-axis");
+  auto concat_dim = node_map.GetNode("LayoutOptimizer-concat-axis");
+  EXPECT_EQ(concat_dim->attr().at({"value"}).tensor().int_val(0), 1);
+}
+
+TEST_F(LayoutOptimizerTest, Sum) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto reduction_indices =
+      ops::Const(s.WithOpName("reduction_indices"), {0, 1, 2}, {3});
+  auto sum = ops::Sum(s.WithOpName("sum"), conv, reduction_indices);
+  auto o = ops::Identity(s.WithOpName("o"), sum);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  // TODO(yaozhang): enable SumProcessor with auto-tuning. Currently disabled
+  // because of the worse performance in some cases.
+  /*
+  NodeMap node_map(&output);
+  auto sum_node = node_map.GetNode("sum");
+  EXPECT_EQ(sum_node->input(0), "Conv2D");
+  EXPECT_EQ(sum_node->input(1), "LayoutOptimizer-sum-reduction_indices");
+  auto sum_const = node_map.GetNode("LayoutOptimizer-sum-reduction_indices");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(sum_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {3});
+  test::FillValues<int>(&tensor_expected, {0, 2, 3});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+  */
+}
+
+TEST_F(LayoutOptimizerTest, MulScalarAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto scalar = ops::Const(s.WithOpName("scalar"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar, conv);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "scalar");
+  EXPECT_EQ(mul_node->input(1), "Conv2D");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndScalar) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto scalar = ops::Const(s.WithOpName("scalar"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, scalar);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "scalar");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto i = ops::Identity(s.WithOpName("i"), conv);
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, i);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "i");
+}
+
+TEST_F(LayoutOptimizerTest, Mul4DAndVector) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), {3.0f, 7.0f}, {2});
+  auto mul = ops::Mul(s.WithOpName("mul"), conv, vector);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "Conv2D");
+  EXPECT_EQ(mul_node->input(1), "LayoutOptimizerReshapeNHWCToNCHW-mul-vector");
+  auto mul_const = node_map.GetNode("LayoutOptimizerReshapeConst-mul-vector");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(mul_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&tensor_expected, {1, 2, 1, 1});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, MulVectorAnd4D) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 3, 2, "VALID");
+  auto vector = ops::Const(s.WithOpName("vector"), {3.0f, 7.0f}, {2});
+  auto mul = ops::Mul(s.WithOpName("mul"), vector, conv);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto mul_node = node_map.GetNode("mul");
+  EXPECT_EQ(mul_node->input(0), "LayoutOptimizerReshapeNHWCToNCHW-mul-vector");
+  EXPECT_EQ(mul_node->input(1), "Conv2D");
+  auto mul_const = node_map.GetNode("LayoutOptimizerReshapeConst-mul-vector");
+  Tensor tensor;
+  EXPECT_TRUE(
+      tensor.FromProto(mul_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&tensor_expected, {1, 2, 1, 1});
+  test::ExpectTensorEqual<int>(tensor_expected, tensor);
+}
+
+TEST_F(LayoutOptimizerTest, SliceConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto begin = ops::Const(s.WithOpName("begin"), {0, 2, 3, 1}, {4});
+  auto size = ops::Const(s.WithOpName("size"), {4, 1, 2, 4}, {4});
+  auto slice = ops::Slice(s.WithOpName("slice"), conv, begin, size);
+  auto o = ops::Identity(s.WithOpName("o"), slice);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto slice_node = node_map.GetNode("slice");
+  EXPECT_EQ(slice_node->input(0), "Conv2D");
+  EXPECT_EQ(slice_node->input(1), "LayoutOptimizer-slice-begin");
+  EXPECT_EQ(slice_node->input(2), "LayoutOptimizer-slice-size");
+
+  auto begin_const = node_map.GetNode("LayoutOptimizer-slice-begin");
+  Tensor begin_tensor;
+  EXPECT_TRUE(begin_tensor.FromProto(
+      begin_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor begin_tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&begin_tensor_expected, {0, 1, 2, 3});
+  test::ExpectTensorEqual<int>(begin_tensor_expected, begin_tensor);
+
+  auto size_const = node_map.GetNode("LayoutOptimizer-slice-size");
+  Tensor size_tensor;
+  EXPECT_TRUE(size_tensor.FromProto(
+      size_const->mutable_attr()->at({"value"}).tensor()));
+  Tensor size_tensor_expected(DT_INT32, {4});
+  test::FillValues<int>(&size_tensor_expected, {4, 4, 1, 2});
+  test::ExpectTensorEqual<int>(size_tensor_expected, size_tensor);
+}
+
+TEST_F(LayoutOptimizerTest, SliceNonConst) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 5, 2, "VALID");
+  auto begin = ops::Const(s.WithOpName("begin"), {0, 2, 3, 1}, {4});
+  auto ibegin = ops::Identity(s.WithOpName("ibegin"), begin);
+  auto size = ops::Const(s.WithOpName("size"), {4, 1, 2, 4}, {4});
+  auto isize = ops::Identity(s.WithOpName("isize"), size);
+  auto slice = ops::Slice(s.WithOpName("slice"), conv, ibegin, isize);
+  auto o = ops::Identity(s.WithOpName("o"), slice);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto slice_node = node_map.GetNode("slice");
+  EXPECT_EQ(slice_node->input(0), "Conv2D");
+  EXPECT_EQ(slice_node->input(1), "LayoutOptimizerDataFormatOp_slice_1");
+  EXPECT_EQ(slice_node->input(2), "LayoutOptimizerDataFormatOp_slice_2");
+  auto perm1 = node_map.GetNode("LayoutOptimizerDataFormatOp_slice_1");
+  EXPECT_EQ(perm1->op(), "DataFormatVecPermute");
+  EXPECT_EQ(perm1->input(0), "ibegin");
+  auto perm2 = node_map.GetNode("LayoutOptimizerDataFormatOp_slice_2");
+  EXPECT_EQ(perm1->op(), "DataFormatVecPermute");
+  EXPECT_EQ(perm2->input(0), "isize");
+}
+
+TEST_F(LayoutOptimizerTest, DoNotApplyOptimizerTwice) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto scalar =
+      ops::Const(s.WithOpName("LayoutOptimizerAlreadyApplied"), 3.0f, {});
+  auto mul = ops::Mul(s.WithOpName("mul"), scalar, scalar);
+  auto o = ops::Identity(s.WithOpName("o"), mul);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  EXPECT_TRUE(errors::IsInvalidArgument(status));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index c33a7cb894f94b510b66d36612ee359286f639c7..1420fdb6feaab32a250f2837f829a695edbabefc 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -24,7 +24,9 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
@@ -47,12 +49,11 @@ const char* kRecomputeHint = "_recompute_hint";
 // TODO(allenl): Replace this list with a cost model.
 std::unordered_set<string> GetCheapToRecomputeOps() {
   std::unordered_set<string> cheap_ops = {
-      "Add",  "AddN",     "BiasAdd",           "Cast",
-      "Fill", "FloorDiv", "FloorMod",          "FusedBatchNorm",
-      "Mul",  "Neg",      "RealDiv",           "Reciprocal",
-      "Relu", "Relu6",    "Reshape",           "Rsqrt",
-      "Sqrt", "Square",   "SquaredDifference", "Sub",
-      "Tile", "Transpose"};
+      "Add",      "AddN",       "BiasAdd",        "Cast",   "Fill",
+      "FloorDiv", "FloorMod",   "FusedBatchNorm", "Mul",    "Neg",
+      "RealDiv",  "Reciprocal", "Relu",           "Relu6",  "Reshape",
+      "Rsqrt",    "Sigmoid",    "Sqrt",           "Square", "SquaredDifference",
+      "Sub",      "Tile",       "Transpose"};
   return cheap_ops;
 }
 
@@ -418,7 +419,7 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
   // We don't use the results of this topological sort until later, but this
   // call invalidates all NodeDef pointers, so it needs to be done before we
   // start collecting those.
-  TopologicalSort(graph);
+  TF_CHECK_OK(TopologicalSort(graph));
   NodeMap node_map(graph);
   std::vector<RecomputedSubGraph> recomputed_subgraphs;
   // Do not recompute nodes which are fed, since the recomputed node would not
@@ -431,14 +432,16 @@ void RecomputationRewritingPass(RewriterConfig::MemOptType optimization_level,
       [&recomputation_targets_name_prefix](const NodeDef& node) {
         // Nodes whose inputs we may want to recompute. Typically targets will
         // be gradients (recomputation_targets_name_prefix="gradients/"),
-        // although the prefix is configurable since gradients may be created in
-        // a name scope.
+        // although the prefix is configurable since gradients may be created
+        // in a name scope.
         // TODO(allenl): Use a static schedule
         // (grappler::EstimateEarliestExecutionTimes) to recompute only nodes
         // whose outputs will sit around for a while.
         return node.name().find(recomputation_targets_name_prefix) == 0;
       };
-  if (optimization_level == RewriterConfig::HEURISTICS) {
+
+  if (optimization_level == RewriterConfig::RECOMPUTATION_HEURISTICS ||
+      optimization_level == RewriterConfig::HEURISTICS) {
     // TODO(allenl): Handle ResNet-like architectures better. Right now all of
     // the cheap forward ops get grouped into a single subgraph which must
     // execute before gradients start executing (unless layers are manually
@@ -602,6 +605,81 @@ static const NodeDef* FindSwapTrigger(
   return nullptr;
 }
 
+static void IdentifySwappingCandidates(Cluster* cluster,
+                                       const GrapplerItem& item,
+                                       GraphDef* optimized_graph) {
+  GraphMemory memory(item);
+  const std::unordered_map<string, DeviceProperties>& devices =
+      cluster->GetDevices();
+  if (!memory.InferStatically(devices).ok()) {
+    return;
+  }
+
+  for (const auto& device : devices) {
+    const string& name = device.first;
+    const DeviceProperties& prop = device.second;
+    if (prop.type() != "GPU") {
+      continue;
+    }
+    if (prop.memory_size() <= 0) {
+      continue;
+    }
+    const GraphMemory::MemoryUsage& mem_usage = memory.GetPeakMemoryUsage(name);
+    if (mem_usage.used_memory <= prop.memory_size()) {
+      continue;
+    }
+    int64 required_savings = mem_usage.used_memory - prop.memory_size();
+    // TODO(bsteiner): sort the tensors by how long they're live.
+
+    std::unordered_map<const NodeDef*, Costs::NanoSeconds> execution_times;
+    if (!EstimateEarliestExecutionTimes(item, cluster, &execution_times).ok()) {
+      return;
+    }
+    GraphView graph(optimized_graph);
+    for (const auto& live_tensor : mem_usage.live_tensors) {
+      if (live_tensor.deallocation_time - live_tensor.allocation_time <=
+          Costs::Duration(1e6)) {
+        // Not enough time to swap.
+        continue;
+      }
+      if (live_tensor.memory_used <= 1024) {
+        // Don't bother with small tensors.
+        continue;
+      }
+      Costs::NanoSeconds execution_time(-1);
+      GraphView::InputPort fanout_to_swap;
+      GraphView::OutputPort port =
+          graph.GetOutputPort(live_tensor.node, live_tensor.output_id);
+      for (GraphView::InputPort input : graph.GetFanout(port)) {
+        auto it = execution_times.find(input.node);
+        if (it != execution_times.end()) {
+          if (it->second > execution_time) {
+            fanout_to_swap = input;
+            execution_time = it->second;
+          }
+        }
+      }
+      // Annotate the fanout to request the tensor to be swapped if it's not
+      // already been done.
+      AttrValue& val = (*fanout_to_swap.node->mutable_attr())["_swap_to_host"];
+      bool found = false;
+      for (int port_id : val.list().i()) {
+        if (port_id == fanout_to_swap.port_id) {
+          found = true;
+          break;
+        }
+      }
+      if (!found) {
+        val.mutable_list()->add_i(fanout_to_swap.port_id);
+        required_savings -= live_tensor.memory_used;
+        if (required_savings < 0) {
+          break;
+        }
+      }
+    }
+  }
+}
+
 Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
@@ -610,6 +688,10 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                              recomputation_targets_name_prefix_,
                              optimized_graph, item);
 
+  if (optimization_level_ == RewriterConfig::SWAPPING_HEURISTICS) {
+    IdentifySwappingCandidates(cluster, item, optimized_graph);
+  }
+
   // Figure out what needs to be swapped;
   std::unordered_map<NodeDef*, SwapInfo> nodes_to_swap;
   for (auto& node : *optimized_graph->mutable_node()) {
@@ -634,7 +716,7 @@ Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   {
     // Estimate the size of the data to swap for each node.
     GraphProperties properties(item);
-    TF_RETURN_IF_ERROR(properties.InferStatically());
+    TF_RETURN_IF_ERROR(properties.InferStatically(true));
     for (auto& swap : nodes_to_swap) {
       const NodeDef* node = swap.first;
       std::vector<OpInfo::TensorProperties> props =
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 0b6eff4f5b97cc5f39589b57d927b75746be218d..6fa4731a863cea9d6124e379641682030ca80bed 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -153,7 +153,7 @@ TEST_F(RecomputeSubgraphTest, MultiNode) {
   pre_transform_node_map.GetNode("BN")->set_op("FusedBatchNorm");
   pre_transform_node_map.GetNode("ReLU")->set_op("Relu");
 
-  MemoryOptimizer optimizer(RewriterConfig::HEURISTICS);
+  MemoryOptimizer optimizer(RewriterConfig::RECOMPUTATION_HEURISTICS);
   GraphDef first_pass_output;
   Status first_pass_status =
       optimizer.Optimize(nullptr, item, &first_pass_output);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 1174a390f354c696ad762e6b1bda4b2c65261791..0d0b947c8ab515ace0d606e55dbc45039101f33e 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
@@ -53,6 +54,10 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     graph_optimizer.reset(
         new AutoParallel(cfg_.auto_parallel().num_replicas()));
   }
+  if (optimizer == "dependency") {
+    graph_optimizer.reset(
+        new DependencyOptimizer(cfg_.dependency_optimization()));
+  }
   return graph_optimizer;
 }
 
@@ -64,14 +69,18 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(new ModelPruner()));
     }
     if (cfg_.constant_folding() != RewriterConfig::OFF) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new ConstantFolding(cpu_device_)));
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
     }
     if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new ArithmeticOptimizer(cfg_.arithmetic_optimization())));
     }
-    if (cfg_.optimize_tensor_layout()) {
+    if (cfg_.dependency_optimization() != RewriterConfig::OFF) {
+      optimizers.push_back(std::unique_ptr<GraphOptimizer>(
+          new DependencyOptimizer(cfg_.dependency_optimization())));
+    }
+    if (cfg_.layout_optimizer() == RewriterConfig::ON) {
       optimizers.push_back(
           std::unique_ptr<GraphOptimizer>(new LayoutOptimizer()));
     }
@@ -92,9 +101,9 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
           new AutoParallel(cfg_.auto_parallel().num_replicas())));
     }
   } else {
-    std::set<string> available_optimizers = {"pruning",      "constfold",
-                                             "layout",       "memory",
-                                             "autoparallel", "arithmetic"};
+    std::set<string> available_optimizers = {
+        "pruning",      "constfold",  "layout",    "memory",
+        "autoparallel", "arithmetic", "dependency"};
     for (const auto& optimizer : cfg_.optimizers()) {
       if (available_optimizers.find(optimizer) != available_optimizers.end()) {
         optimizers.push_back(NewOptimizer(optimizer));
@@ -110,35 +119,79 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   bool already_optimized = false;
   for (const auto& optimizer : optimizers) {
     if (!already_optimized) {
-      TF_RETURN_IF_ERROR(optimizer->Optimize(cluster, item, optimized_graph));
-      already_optimized = true;
+      auto status = optimizer->Optimize(cluster, item, optimized_graph);
+      string result;
+      if (!status.ok()) {
+        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
+                << ". Return status: " << status.ToString();
+        result = status.ToString();
+      } else {
+        already_optimized = true;
+        result = strings::StrCat(
+            "OK. "
+            "Graph size before: ",
+            item.graph.node_size(),
+            ". Graph size after: ", optimized_graph->node_size());
+      }
+      result_.push_back(std::make_pair(optimizer->name(), result));
+      VLOG(1) << "Optimizer " << optimizer->name()
+              << " return status: " << result;
     } else {
       GrapplerItem optimized_item(item, std::move(*optimized_graph));
-      TF_RETURN_IF_ERROR(
-          optimizer->Optimize(cluster, optimized_item, optimized_graph));
+      auto status =
+          optimizer->Optimize(cluster, optimized_item, optimized_graph);
+      string result;
+      if (!status.ok()) {
+        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
+                << ". Return status: " << status.ToString();
+        optimized_graph->Swap(&optimized_item.graph);
+        result = status.ToString();
+      } else {
+        result = strings::StrCat(
+            "OK. "
+            "Graph size before: ",
+            optimized_item.graph.node_size(),
+            ". Graph size after: ", optimized_graph->node_size());
+      }
+      result_.push_back(std::make_pair(optimizer->name(), result));
+      VLOG(1) << "Optimizer " << optimizer->name()
+              << " return status: " << result;
     }
   }
-  TopologicalSort(optimized_graph);
 
-  // Make sure that the optimizers preserved the graph version and library.
-  DCHECK_GE(optimized_graph->library().function_size(),
-            item.graph.library().function_size());
-  DCHECK_GE(optimized_graph->library().gradient_size(),
-            item.graph.library().gradient_size());
-  DCHECK_EQ(optimized_graph->versions().producer(),
-            item.graph.versions().producer());
+  if (already_optimized) {
+    TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
+    // Make sure that the optimizers preserved the graph version and library.
+    DCHECK_GE(optimized_graph->library().function_size(),
+              item.graph.library().function_size());
+    DCHECK_GE(optimized_graph->library().gradient_size(),
+              item.graph.library().gradient_size());
+    DCHECK_EQ(optimized_graph->versions().producer(),
+              item.graph.versions().producer());
+  } else {
+    *optimized_graph = item.graph;
+  }
 
   return Status::OK();
 }
 
+void MetaOptimizer::PrintResult() {
+  for (const auto& result : result_) {
+    LOG(INFO) << "Return status of optimizer " << result.first << ": "
+              << result.second;
+  }
+}
+
 void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
                              const GraphDef& pruned_graph, double result) {
   // Nothing to do for MetaOptimizer.
 }
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
-  return !cfg.disable_model_pruning() || cfg.optimize_tensor_layout() ||
+  return !cfg.disable_model_pruning() ||
+         cfg.layout_optimizer() == RewriterConfig::ON ||
          cfg.constant_folding() != RewriterConfig::OFF ||
+         cfg.dependency_optimization() != RewriterConfig::OFF ||
          cfg.arithmetic_optimization() != RewriterConfig::OFF ||
          cfg.auto_parallel().enable() || cfg.memory_optimization() > 1 ||
          !cfg.optimizers().empty();
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index b00886b964b91e78672f6023c8d5d43ce989de49..382cfe51d42439691fcedd5b765c9ef13e055ae5 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -37,6 +37,8 @@ class MetaOptimizer : public GraphOptimizer {
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override;
 
+  void PrintResult();
+
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
 
@@ -44,6 +46,7 @@ class MetaOptimizer : public GraphOptimizer {
   std::unique_ptr<GraphOptimizer> NewOptimizer(const string& optimizer);
   DeviceBase* const cpu_device_;  // may be NULL
   RewriterConfig cfg_;
+  std::vector<std::pair<string, string>> result_;
 };
 
 bool MetaOptimizerEnabled(const RewriterConfig& cfg);
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index b9df196f83b0e986a3eb4ed4c470c5520e7d611f..c9bec7890e6af008859d21555fb7ed74451c72c6 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -26,16 +26,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-int NumNonControlInputs(const NodeDef& node) {
-  int num_inputs = node.input_size();
-  for (int i = 0; i < node.input_size(); ++i) {
-    if (!node.input(i).empty() && node.input(i)[0] == '^') {
-      num_inputs--;
-    }
-  }
-  return num_inputs;
-}
-
 bool IsTrivialOp(const NodeDef& node) {
   // Remove the stop gradient nodes since they serve no purpose once the graph
   // is built. Also remove Identity ops.
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
index 6ce6deef2ceacdfe44b49659109e432b87739f97..450e85340796fdde9afdfebbd0eb9a724cb9440a 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -86,7 +86,7 @@ Status EstimateEarliestExecutionTimes(
   name_map.clear();
 
   GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(true));
   OpLevelCostEstimator estimator;
   VirtualPlacer placer(cluster);
 
@@ -154,7 +154,7 @@ Status EstimateRequiredTimes(
     }
   }
   GraphProperties properties(item);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(true));
   OpLevelCostEstimator estimator;
   VirtualPlacer placer(cluster);
 
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index 5de593358727bf8b1f247c0fb9ec8f52b2819e4c..08580d92842377c2dd999950b2e01bef01e2fee6 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -64,17 +64,17 @@ TEST_F(StaticScheduleTest, BasicGraph) {
     if (time.first->name() == "Const/Const") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(250002), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(1500005), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(1500004), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(2750008), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(2750007), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(4000011), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(4000010), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(5250014), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(5250013), time.second);
     } else if (time.first->name() == "y") {
-      EXPECT_EQ(Costs::NanoSeconds(6500017), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(6500013), time.second);
     }
   }
 }
@@ -110,13 +110,13 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
     if (time.first->name() == "a") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "b") {
-      EXPECT_EQ(Costs::NanoSeconds(12500026), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
     } else if (time.first->name() == "c") {
-      EXPECT_EQ(Costs::NanoSeconds(12500027), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
     } else if (time.first->name() == "d") {
-      EXPECT_EQ(Costs::NanoSeconds(12500028), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
     } else if (time.first->name() == "e") {
-      EXPECT_EQ(Costs::NanoSeconds(25000053), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
     }
   }
 }
@@ -142,17 +142,17 @@ TEST_F(StaticScheduleTest, RequiredTimes) {
 
   for (auto time : required_times) {
     if (time.first->name() == "Const/Const") {
-      EXPECT_EQ(Costs::NanoSeconds(-6500016), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-6500012), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(-6250015), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-6250012), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(-5000012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-5000009), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(-3750009), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-3750006), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(-2500006), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-2500003), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(-1250003), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-1250000), time.second);
     } else if (time.first->name() == "y") {
       EXPECT_EQ(Costs::NanoSeconds(0), time.second);
     }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 63145b4e0711dc328fcee282fa9d25d12e33c261..fc80772360a71e63c618ca4b2f697a92883196eb 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -27,20 +29,29 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-NodeMap::NodeMap(GraphDef* graph) : graph_(graph) {
-  for (int i = 0; i < graph_->node_size(); i++) {
-    auto node = graph_->mutable_node(i);
-    auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+NodeMap::NodeMap(GraphDef* graph) {
+  CHECK(graph != nullptr);
+  for (int i = 0; i < graph->node_size(); i++) {
+    NodeDef* node = graph->mutable_node(i);
+    const string& node_name = node->name();
+    auto rslt = nodes_.emplace(node_name, node);
     // Check that the graph doesn't contain multiple nodes with the same name.
-    CHECK(rslt.second);
+    if (!rslt.second) {
+      LOG(WARNING) << "Duplicated node in the graph: " << node_name;
+    }
     for (const auto& input : node->input()) {
-      outputs_[NodeName(input)].insert(nodes_[node->name()]);
+      outputs_[NodeName(input)].insert(nodes_[node_name]);
     }
   }
 }
 
+void NodeMap::RemoveNode(const string& name) {
+  nodes_.erase(NodeName(name));
+  outputs_.erase(NodeName(name));
+}
+
 NodeDef* NodeMap::GetNode(const string& name) const {
-  string node_name = NodeName(name);
+  const string node_name = NodeName(name);
   auto it = nodes_.find(node_name);
   if (it == nodes_.end()) {
     return nullptr;
@@ -48,6 +59,11 @@ NodeDef* NodeMap::GetNode(const string& name) const {
   return it->second;
 }
 
+bool NodeMap::NodeExists(const string& name) const {
+  const string node_name = NodeName(name);
+  return nodes_.find(node_name) != nodes_.end();
+}
+
 const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
   auto it = outputs_.find(node_name);
   if (it == outputs_.end()) {
@@ -56,27 +72,27 @@ const std::set<NodeDef*>& NodeMap::GetOutputs(const string& node_name) const {
   return it->second;
 }
 
-void NodeMap::AddNode(const string& name, NodeDef* node) {
-  auto ret = nodes_.insert(std::make_pair(name, node));
-  CHECK(ret.second) << "Pair (" << name << "," << node
-                    << ") is not inserted because a same key already exists.";
+void NodeMap::AddNode(const string& node_name, NodeDef* node) {
+  auto ret = nodes_.emplace(node_name, CHECK_NOTNULL(node));
+  CHECK(ret.second) << "Pair (" << node_name << "," << node
+                    << ") is not inserted because the same key already exists.";
 }
 
 void NodeMap::AddOutput(const string& node_name, const string& output_name) {
-  auto output_node = nodes_[output_name];
+  auto output_node = nodes_[NodeName(output_name)];
   CHECK(output_node) << "Output node " << output_name
                      << " is missing in NodeMap.";
   outputs_[node_name].insert(output_node);
 }
 
 void NodeMap::RemoveOutput(const string& node_name, const string& output_name) {
-  outputs_[node_name].erase(nodes_[output_name]);
+  outputs_[node_name].erase(nodes_[NodeName(output_name)]);
 }
 
 void NodeMap::UpdateInput(const string& node_name, const string& old_input_name,
                           const string& new_input_name) {
-  RemoveOutput(old_input_name, node_name);
-  AddOutput(new_input_name, node_name);
+  RemoveOutput(NodeName(old_input_name), node_name);
+  AddOutput(NodeName(new_input_name), node_name);
 }
 
 void NodeMap::RemoveInputs(const string& node_name) {
@@ -94,14 +110,14 @@ void NodeMap::UpdateOutput(const string& node_name,
                            const string& old_output_name,
                            const string& new_output_name) {
   std::set<NodeDef*>& outputs = outputs_[node_name];
-  outputs.erase(nodes_[old_output_name]);
-  outputs.insert(nodes_[new_output_name]);
+  outputs.erase(nodes_[NodeName(old_output_name)]);
+  outputs.insert(nodes_[NodeName(new_output_name)]);
 }
 
 OutputMap::OutputMap(GraphDef* graph) : graph_(graph) {
   for (int i = 0; i < graph_->node_size(); i++) {
     auto node = graph_->mutable_node(i);
-    auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+    auto rslt = nodes_.emplace(node->name(), node);
     // Check that the graph doesn't contain multiple nodes with the same name.
     CHECK(rslt.second);
     for (const auto& input : node->input()) {
@@ -219,8 +235,11 @@ string AsControlDependency(const NodeDef& node) {
   return strings::StrCat("^", node.name());
 }
 
-string AsControlDependency(const string& node) {
-  return strings::StrCat("^", node);
+string AsControlDependency(const string& node_name) {
+  CHECK(!node_name.empty());
+  return (!node_name.empty() && node_name[0] == '^')
+             ? node_name
+             : strings::StrCat("^", node_name);
 }
 
 int NumOutputs(const NodeDef& node) {
@@ -242,5 +261,160 @@ int NumOutputs(const NodeDef& node) {
   return num_outputs;
 }
 
+int NumNonControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  for (const string& input : node.input()) {
+    if (IsControlInput(input)) {
+      --num_inputs;
+    }
+  }
+  return num_inputs;
+}
+
+int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map) {
+  int num_outputs = 0;
+  for (const NodeDef* output : node_map.GetOutputs(node.name())) {
+    for (const string& node_as_input : output->input()) {
+      if (IsControlInput(node_as_input)) {
+        break;
+      }
+      if (NodeName(node_as_input) == node.name()) {
+        ++num_outputs;
+      }
+    }
+  }
+  return num_outputs;
+}
+
+// Returns the data type in attribute `attr_name` of `node`. If that attribute
+// doesn't exist, returns DT_INVALID.
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name) {
+  if (!node.attr().count(attr_name)) {
+    return DT_INVALID;
+  }
+  const auto& attr = node.attr().at(attr_name);
+  if (attr.value_case() != AttrValue::kType) {
+    return DT_INVALID;
+  }
+  return attr.type();
+}
+
+NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
+                        bool follow_control_input,
+                        const std::function<bool(const NodeDef&)>& pred_fn) {
+  const NodeDef* current = &source;
+  const NodeDef* next = current;
+  while (next == &source || (next != nullptr && pred_fn(*next))) {
+    current = next;
+    if (current->input_size() == 0 ||
+        (!follow_control_input && IsControlInput(current->input(0)))) {
+      break;
+    }
+    next = node_map.GetNode(current->input(0));
+    if (next == nullptr) {
+      LOG(ERROR) << "Node not found: " << current->input(0);
+    }
+  }
+  return const_cast<NodeDef*>(current);
+}
+
+// Every permutation is a product of one or more cycles. Iterate over the cycles
+// in the permutation, and convert each of those into a product of
+// transpositions (swaps): https://en.wikipedia.org/wiki/Cyclic_permutation
+void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
+                         bool invert_permutation) {
+  CHECK_EQ(graph->node_size(), permutation->size());
+  std::vector<int> inv_perm(permutation->size(), 0);
+  if (invert_permutation) {
+    for (size_t n = 0; n < permutation->size(); ++n) {
+      inv_perm[(*permutation)[n]] = n;
+    }
+    permutation->swap(inv_perm);
+  }
+  for (std::size_t n = 0; n + 1 < permutation->size(); ++n) {
+    while (n != (*permutation)[n]) {
+      std::size_t r = (*permutation)[n];
+      graph->mutable_node()->SwapElements(n, r);
+      std::swap((*permutation)[n], (*permutation)[r]);
+    }
+  }
+}
+
+namespace {
+template <typename T>
+inline void STLSortAndRemoveDuplicates(T* v) {
+  std::sort(v->begin(), v->end());
+  v->erase(std::unique(v->begin(), v->end()), v->end());
+}
+}  // namespace
+
+Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
+                                   bool dedup_outputs) {
+  const int num_nodes = graph.node_size();
+  inputs_.clear();
+  inputs_.resize(num_nodes);
+  outputs_.clear();
+  outputs_.resize(num_nodes);
+  name_to_index_.clear();
+  name_to_index_.reserve(num_nodes);
+  index_to_name_.clear();
+  index_to_name_.reserve(num_nodes);
+
+  // Build map from name to index and vice versa.
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    name_to_index_.emplace(node.name(), node_idx);
+    index_to_name_.push_back(node.name());
+  }
+
+  // Build forward and reverse adjacency lists.
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    inputs_[node_idx].reserve(node.input_size());
+    for (const string& input : node.input()) {
+      auto it = name_to_index_.find(NodeName(input));
+      if (it == name_to_index_.end()) {
+        return errors::InvalidArgument("Non-existent input ", input,
+                                       " for node ", node.name());
+      }
+      const int input_idx = it->second;
+      inputs_[node_idx].push_back(input_idx);
+      outputs_[input_idx].push_back(node_idx);
+    }
+    if (dedup_inputs) {
+      // Dedup the input list while it's still hot in cache.
+      STLSortAndRemoveDuplicates(&inputs_[node_idx]);
+    }
+  }
+
+  // Dedup outputs.
+  if (dedup_outputs) {
+    for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+      STLSortAndRemoveDuplicates(&outputs_[node_idx]);
+    }
+  }
+  return Status::OK();
+}
+
+string SimpleGraphView::PrintToString() const {
+  string str;
+  for (int i = 0; i < num_nodes(); ++i) {
+    strings::StrAppend(&str, "Node ", i, "'", node_name(i), "'\n", "Inputs: [");
+    for (int input : inputs(i)) {
+      strings::StrAppend(&str, input, " '", node_name(input), "', ");
+    }
+    strings::StrAppend(&str, "]\n", "Outputs: [");
+    for (int j = 0; j < outputs(i).size(); ++j) {
+      const int output = outputs(i)[j];
+      if (j > 0) {
+        strings::StrAppend(&str, ", ");
+      }
+      strings::StrAppend(&str, output, " '", node_name(output), "'");
+    }
+    strings::StrAppend(&str, "]\n");
+  }
+  return str;
+}
+
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index f9fb418140833af4b1805ff3b02b5666d886407b..476ab8b51afcee839d8f30378d2fa00ed8406cc7 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -17,12 +17,16 @@ limitations under the License.
 #define TENSORFLOW_GRAPPLER_UTILS_H_
 
 #include <functional>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
 
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -30,12 +34,16 @@ namespace grappler {
 // A utility class to lookup a node and its outputs by node name.
 class NodeMap {
  public:
+  // Note: The NodeMap will store pointers to nodes in graph, which may become
+  // invalid if graph is changed.
   explicit NodeMap(GraphDef* graph);
   NodeDef* GetNode(const string& name) const;
+  bool NodeExists(const string& name) const;
   const std::set<NodeDef*>& GetOutputs(const string& node_name) const;
   // This method doesn't record the outputs of the added node; the outputs need
   // to be explicitly added by the AddOutput method.
   void AddNode(const string& name, NodeDef* node);
+  void RemoveNode(const string& name);
   void UpdateInput(const string& node_name, const string& old_input_name,
                    const string& new_input_name);
   void AddOutput(const string& node_name, const string& output_name);
@@ -46,8 +54,7 @@ class NodeMap {
                     const string& new_output_name);
 
  private:
-  GraphDef* graph_;
-  std::set<NodeDef*> empty_set_;
+  const std::set<NodeDef*> empty_set_;
   std::unordered_map<string, NodeDef*> nodes_;
   std::unordered_map<string, std::set<NodeDef*>> outputs_;
 };
@@ -68,6 +75,39 @@ class OutputMap {
   std::unordered_map<string, std::unordered_map<NodeDef*, int>> outputs_;
 };
 
+// A vector with a set. The set stores the same elements as the vector, and
+// quickly answers whether a value is in the vector. Duplicated elements are not
+// allowed for now.
+template <class T>
+class SetVector {
+ public:
+  // Returns false if value already existed in the set, true otherwise.
+  bool PushBack(const T& value) {
+    if (!set_.insert(value).second) {
+      return false;
+    }
+    vector_.push_back(value);
+    return true;
+  }
+
+  T PopBack() {
+    T back = vector_.back();
+    set_.erase(back);
+    vector_.pop_back();
+    return back;
+  }
+
+  bool Exists(const T& value) const { return set_.find(value) != set_.end(); }
+
+  bool Empty() const { return vector_.empty(); }
+
+  void Reserve(int64 size) { vector_.reserve(size); }
+
+ private:
+  std::unordered_set<T> set_;
+  std::vector<T> vector_;
+};
+
 // True iff 'name' refers to a control inputs, i.e. a node name prefixed with
 // the ^ character.
 bool IsControlInput(const string& name);
@@ -109,10 +149,70 @@ string AsControlDependency(const NodeDef& node);
 // for control dependency, given a node name
 string AsControlDependency(const string& node);
 
-// Returns the number of outputs of a node. Note that some of the outputs may be
-// unconnected.
+// Returns the number of outputs of a node according to its OpDef. Note that
+// some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node);
 
+// Number of connected non-control inputs.
+int NumNonControlInputs(const NodeDef& node);
+
+// Number of connected non-control outputs.
+int NumNonControlOutputs(const NodeDef& node, const NodeMap& node_map);
+
+// Returns the data type in attribute `attr_name` of `node`. If that attribute
+// doesn't exist, returns DT_INVALID.
+DataType GetDataTypeFromAttr(const NodeDef& node, const string& attr_name);
+
+// Returns the last node in the simple chain starting at source and traversing
+// through the input(0) edge from each node as long as the next node satisfies
+// the predicate given in pred_fn. If no nodes satisfy the predicate, &source
+// will be returned. Example: For the chain
+//    source <- a <- b <- ... <- y <- z
+// where
+//    pred_fn(a) = pred_fn(b) = ... = pred_fn(y) = true,
+//    pred_fn(z) = false,
+// the return value will be a pointer to y.
+NodeDef* GetTailOfChain(const NodeDef& source, const NodeMap& node_map,
+                        bool follow_control_input,
+                        const std::function<bool(const NodeDef&)>& pred_fn);
+
+// Permute the nodes of graph in place according to the permutation.
+void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
+                         bool invert_permutation);
+
+class SimpleGraphView {
+ public:
+  Status Initialize(const GraphDef& graph) {
+    return Initialize(graph, true, true);
+  }
+  Status Initialize(const GraphDef& graph, bool dedup_inputs,
+                    bool dedup_outputs);
+
+  inline int num_nodes() const { return index_to_name_.size(); }
+  inline const int index(const string& node_name) const {
+    const auto& it = name_to_index_.find(node_name);
+    DCHECK(it != name_to_index_.end());
+    return it == name_to_index_.end() ? -1 : it->second;
+  }
+  inline const string& node_name(int node_idx) const {
+    return index_to_name_[node_idx];
+  }
+  inline const gtl::InlinedVector<int, 4>& inputs(int node_idx) const {
+    return inputs_[node_idx];
+  }
+  inline const gtl::InlinedVector<int, 2>& outputs(int node_idx) const {
+    return outputs_[node_idx];
+  }
+
+  string PrintToString() const;
+
+ private:
+  std::vector<string> index_to_name_;
+  std::unordered_map<string, int> name_to_index_;
+  std::vector<gtl::InlinedVector<int, 4>> inputs_;
+  std::vector<gtl::InlinedVector<int, 2>> outputs_;
+};
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index bb161bf9a4c29cb1c684044393db11302159736a..534f7a063fe90bf72f8a2afba7ae8f75b8472a36 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -53,6 +53,7 @@ cc_library(
     hdrs = ["topological_sort.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:op_types",
@@ -78,6 +79,7 @@ cc_library(
     hdrs = ["frame.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:op_types",
diff --git a/tensorflow/core/grappler/utils/frame.cc b/tensorflow/core/grappler/utils/frame.cc
index 7655d0bee5a7fcd78b3896147f8eed82ad9d5bcf..df5f4ff7cf38dbc7ab3038346cd4ea65031c8227 100644
--- a/tensorflow/core/grappler/utils/frame.cc
+++ b/tensorflow/core/grappler/utils/frame.cc
@@ -20,27 +20,32 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace grappler {
 
-int IdentifyFrames(
-    const GraphDef& graph,
-    std::unordered_map<const NodeDef*, std::vector<int>>* frames) {
+Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
+                      int* num_frames) {
   NodeMap node_map(const_cast<GraphDef*>(&graph));
+  return IdentifyFramesWithNodeMap(graph, node_map, frame_map, num_frames);
+}
+
+Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
+                                 FrameMap* frame_map, int* num_frames) {
   std::deque<std::pair<const NodeDef*, std::vector<int>>> ready_nodes;
   for (const NodeDef& node : graph.node()) {
     if (node.input_size() == 0) {
       std::vector<int> empty;
       ready_nodes.emplace_back(&node, empty);
-      (*frames)[&node] = empty;
+      (*frame_map)[&node] = empty;
     }
   }
   std::map<string, int> name_to_id;
   while (!ready_nodes.empty()) {
     auto ready_node = ready_nodes.front();
     for (const auto& fanout : node_map.GetOutputs(ready_node.first->name())) {
-      if (frames->count(fanout) < 1) {
+      if (frame_map->count(fanout) < 1) {
         std::vector<int> frame_ids = ready_node.second;
         if (IsExit(*ready_node.first)) {
           frame_ids.pop_back();
@@ -59,9 +64,9 @@ int IdentifyFrames(
           frame_ids.push_back(id);
         }
         ready_nodes.emplace_back(fanout, frame_ids);
-        (*frames)[fanout] = frame_ids;
+        (*frame_map)[fanout] = frame_ids;
       } else {
-        auto frame_ids_fanout = (*frames)[fanout];
+        auto frame_ids_fanout = (*frame_map)[fanout];
         auto frame_ids_node = ready_node.second;
         if (IsEnter(*fanout)) {
           frame_ids_fanout.pop_back();
@@ -69,12 +74,17 @@ int IdentifyFrames(
         if (IsExit(*ready_node.first)) {
           frame_ids_node.pop_back();
         }
-        CHECK(frame_ids_node == frame_ids_fanout);
+        if (frame_ids_node != frame_ids_fanout) {
+          return errors::InvalidArgument(
+              "Invalid graph: Frame ids for node ", ready_node.first->name(),
+              " does not match frame ids for it's fanout.");
+        }
       }
     }
     ready_nodes.pop_front();
   }
-  return name_to_id.size();
+  *num_frames = name_to_id.size();
+  return Status::OK();
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/frame.h b/tensorflow/core/grappler/utils/frame.h
index d9e046a969a4582af7716a213c6c773e5d9a155e..be726ae795769609769709746ce7bb74f849e37a 100644
--- a/tensorflow/core/grappler/utils/frame.h
+++ b/tensorflow/core/grappler/utils/frame.h
@@ -18,16 +18,24 @@ limitations under the License.
 
 #include <unordered_map>
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
+using FrameMap = std::unordered_map<const NodeDef*, std::vector<int>>;
+
 // Returns the number of frames present in the graph, and populates
 // the 'frames' argument with the collection of frames (denoted by their
 // frame ids) in the outermost-to-innermost order. Frame ids are arbitrary.
-int IdentifyFrames(
-    const GraphDef& graph,
-    std::unordered_map<const NodeDef*, std::vector<int>>* frames);
+Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
+                      int* num_frames);
+
+// As above, but use an existing NodeMap for graph instead of building it
+// from scratch.
+Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
+                                 FrameMap* frame_map, int* num_frames);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame_test.cc b/tensorflow/core/grappler/utils/frame_test.cc
index 30673eed7a983bf641d3b86b83311f94367da287..df76083fc3a0334172ac93998e0b549a2c723431 100644
--- a/tensorflow/core/grappler/utils/frame_test.cc
+++ b/tensorflow/core/grappler/utils/frame_test.cc
@@ -78,7 +78,8 @@ TEST_F(IdentifyFramesTest, NestedLoop) {
   *graph.add_node() = CreateNode("17", {"16"});
 
   std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames = IdentifyFrames(graph, &frames);
+  int num_frames;
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}},      {"1", {0}},     {"2", {0}},     {"3", {0}},
       {"4", {0}},     {"5", {0}},     {"6", {0}},     {"7", {0, 1}},
@@ -108,7 +109,8 @@ TEST_F(IdentifyFramesTest, MultipleInputsToEnter) {
   *graph.add_node() = CreateNode("3", "Exit", {"2"});
 
   std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames = IdentifyFrames(graph, &frames);
+  int num_frames;
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {}}, {"2", {0}}, {"3", {0}}};
   EXPECT_EQ(num_frames, 1);
@@ -135,7 +137,8 @@ TEST_F(IdentifyFramesTest, ExitOutput) {
   *graph.add_node() = CreateNode("4", {"2", "3"});
 
   std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames = IdentifyFrames(graph, &frames);
+  int num_frames;
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {}}, {"4", {}}};
   EXPECT_EQ(num_frames, 1);
@@ -167,7 +170,8 @@ TEST_F(IdentifyFramesTest, MultipleEnterNodes) {
   *graph.add_node() = CreateNode("9", "Exit", {"7"});
 
   std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames = IdentifyFrames(graph, &frames);
+  int num_frames;
+  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {0}}, {"4", {0}},
       {"5", {}}, {"6", {0}}, {"7", {0}}, {"8", {0}}, {"9", {0}}};
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 77d4702d21e75b1689875eb17fbd2cda41aa1ba8..8d8ff4da3a8df5a2868f1a3a0ac6a5d0c2fd66ad 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -19,61 +19,56 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
-void TopologicalSort(GraphDef* graph) {
-  OutputMap output_map(graph);
-  std::vector<NodeDef*> ready_nodes;
-  ready_nodes.reserve(graph->node_size());
+Status TopologicalSort(GraphDef* graph) {
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(*graph));
+
+  std::vector<int> ready_nodes;
+  ready_nodes.reserve(graph_view.num_nodes());
+
   int front = 0;
   int back = 0;
-  std::unordered_map<const NodeDef*, int> ready_inputs;
-  for (int i = 0; i < graph->node_size(); i++) {
-    auto node = graph->mutable_node(i);
-    if (node->input_size() == 0) {
-      ready_nodes.push_back(node);
+  std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
+  for (int i = 0; i < graph_view.num_nodes(); i++) {
+    if (graph_view.inputs(i).empty()) {
+      ready_nodes.push_back(i);
       back++;
     }
-    if (IsMerge(*node)) {
-      ready_inputs[node] = 0;
-      for (const auto& input : node->input()) {
-        if (IsNextIteration(*output_map.GetNode(input))) {
-          ready_inputs[node]++;
+    if (IsMerge(graph->node(i))) {
+      for (int input : graph_view.inputs(i)) {
+        if (IsNextIteration(graph->node(input))) {
+          num_ready_inputs[i]++;
         }
       }
-    } else {
-      ready_inputs[node] = 0;
     }
   }
 
   while (front != back) {
-    auto ready_node = ready_nodes[front];
-    for (const auto& fanout_pair : output_map.GetOutputs(ready_node->name())) {
-      auto fanout = fanout_pair.first;
-      ready_inputs[fanout] += fanout_pair.second;
-      if (ready_inputs[fanout] == fanout->input_size()) {
+    int ready_node = ready_nodes[front];
+    for (int fanout : graph_view.outputs(ready_node)) {
+      ++num_ready_inputs[fanout];
+      if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
         ready_nodes.push_back(fanout);
-        back++;
+        ++back;
       }
     }
-    front++;
+    ++front;
   }
 
-  if (back == graph->node_size()) {
-    GraphDef new_graph;
-    new_graph.mutable_node()->Reserve(graph->node_size());
-    for (int i = 0; i < graph->node_size(); i++) {
-      auto new_node = new_graph.add_node();
-      new_node->Swap(ready_nodes[i]);
-    }
-    graph->mutable_node()->Swap(new_graph.mutable_node());
-  } else {
-    LOG(ERROR) << "The graph couldn't be sorted in topological order.";
+  if (back != graph_view.num_nodes()) {
+    return errors::InvalidArgument(
+        "The graph couldn't be sorted in topological order.");
   }
+
+  PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
+  return Status::OK();
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index d4d8034ef577a0282dbce161aed8ba440bf248ab..f2c9bbfa4ebce373a4fa80f399ce3d2b59a576f4 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -17,12 +17,13 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // Sort a graph in topological order.
-void TopologicalSort(GraphDef* graph);
+Status TopologicalSort(GraphDef* graph);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index dc99cb1052ce9db3035401a2cd75e838281fb748..c96f15b0e8424d70e8dd1393cf254b52f69200d2 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -51,7 +52,7 @@ TEST_F(TopologicalSortTest, NoLoop) {
   *graph.add_node() = CreateNode("5", {});
   *graph.add_node() = CreateNode("4", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -67,7 +68,7 @@ TEST_F(TopologicalSortTest, WithLoop) {
   *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2", "3", "4", "5"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -82,7 +83,7 @@ TEST_F(TopologicalSortTest, WithIllegalLoop) {
   *graph.add_node() = CreateNode("3", {"2"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  EXPECT_FALSE(TopologicalSort(&graph).ok());
   std::vector<string> order = {"2", "3", "1"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -94,13 +95,34 @@ TEST_F(TopologicalSortTest, DuplicatedInputs) {
   *graph.add_node() = CreateNode("2", {"1", "1"});
   *graph.add_node() = CreateNode("1", {});
 
-  TopologicalSort(&graph);
+  TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2"};
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 }
 
+TEST_F(TopologicalSortTest, Idempotent) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("1", {});
+  *graph.add_node() = CreateNode("2", {});
+  *graph.add_node() = CreateNode("3", {"1", "2"});
+  *graph.add_node() = CreateNode("4", {"1", "3"});
+  *graph.add_node() = CreateNode("5", {"2", "3"});
+
+  TF_EXPECT_OK(TopologicalSort(&graph));
+  std::vector<string> order = {"1", "2", "3", "4", "5"};
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+
+  // Run topo sort again to verify that it is idenpotent.
+  TF_EXPECT_OK(TopologicalSort(&graph));
+  for (int i = 0; i < order.size(); i++) {
+    EXPECT_EQ(graph.node(i).name(), order[i]);
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 3193b3ec4a60c2aa0627edcaccb58b654af462c5..77371c399e5fc7321f7c2b271aae32ce9655244b 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -181,6 +182,73 @@ TEST_F(UtilsTest, NumOutputs) {
   EXPECT_EQ(1, NumOutputs(CreateDequeueNode()));
 }
 
+TEST_F(UtilsTest, AsControlDependency) {
+  NodeDef node;
+  node.set_name("foo");
+  EXPECT_EQ("^foo", AsControlDependency(node));
+  EXPECT_EQ("^foo", AsControlDependency(node.name()));
+  EXPECT_EQ("^foo", AsControlDependency("^foo"));
+}
+
+TEST_F(UtilsTest, GetTailOfChain) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output c0 = ops::Const(s.WithOpName("c0"), {1.0f, 2.0f}, {1, 2});
+  Output c1 = ops::Const(s.WithOpName("c1"), {3.0f, 4.0f}, {1, 2});
+  // Add a node with only connected by control output.
+  Output neg0 = ops::Neg(s.WithOpName("neg0"), c1);
+  // Add a node with two outputs.
+  Output neg1 =
+      ops::Neg(s.WithControlDependencies(neg0).WithOpName("neg1"), c0);
+  Output neg2 = ops::Neg(s.WithOpName("neg2"), neg1);
+  Output id1 = ops::Identity(s.WithOpName("id1"), neg2);
+  Output id2 = ops::Identity(s.WithOpName("id2"), neg1);
+  auto noop = ops::NoOp(s.WithControlDependencies(neg0).WithOpName("noop"));
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  LOG(INFO) << graph.DebugString();
+
+  ASSERT_EQ("c0", graph.node(0).name());
+  ASSERT_EQ("c1", graph.node(1).name());
+  ASSERT_EQ("neg0", graph.node(2).name());
+  ASSERT_EQ("neg1", graph.node(3).name());
+  ASSERT_EQ("neg2", graph.node(4).name());
+  ASSERT_EQ("id1", graph.node(5).name());
+  ASSERT_EQ("id2", graph.node(6).name());
+  ASSERT_EQ("noop", graph.node(7).name());
+
+  NodeMap node_map(&graph);
+  auto is_neg = [&](const NodeDef& node) { return node.op() == "Neg"; };
+  // We walk backwards, starting as "id1", so tail should be "neg1".
+  NodeDef* tail = GetTailOfChain(graph.node(5), node_map,
+                                 /*follow_control_input=*/false, is_neg);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("neg1", tail->name());
+
+  // We stop at branching nodes, so tail should be "neg2".
+  auto is_neg_and_non_branching = [&](const NodeDef& node) {
+    return node.op() == "Neg" && NumNonControlOutputs(node, node_map) == 1;
+  };
+  tail =
+      GetTailOfChain(graph.node(5), node_map,
+                     /*follow_control_input=*/false, is_neg_and_non_branching);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("neg2", tail->name());
+
+  // We walk backwards, starting from "noop", also following control inputs,
+  // so tail should be "neg0".
+  tail = GetTailOfChain(graph.node(7), node_map,
+                        /*follow_control_input=*/true, is_neg);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("neg0", tail->name());
+
+  // We walk backwards, starting from "noop", not following control inputs,
+  // so tail should be "noop" itself.
+  tail = GetTailOfChain(graph.node(7), node_map,
+                        /*follow_control_input=*/false, is_neg);
+  EXPECT_NE(tail, nullptr);
+  EXPECT_EQ("noop", tail->name());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 1cb7c97be4f9fa86de499641f6c8165049625cde..a3262bf06aa57324112decaf6227bf56724e5dde 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -269,13 +269,11 @@ cc_library(
 cc_library(
     name = "conv_ops_gpu_hdrs",
     hdrs = ["conv_ops_gpu.h"],
-    deps = ["//third_party/eigen3"],
 )
 
 cc_library(
     name = "gpu_util_hdrs",
     hdrs = ["gpu_utils.h"],
-    deps = ["//third_party/eigen3"],
 )
 
 tf_cc_test(
@@ -291,6 +289,17 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "reshape_util",
+    srcs = ["reshape_util.cc"],
+    hdrs = ["reshape_util.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_cc_test(
     name = "variable_ops_test",
     size = "small",
@@ -329,6 +338,7 @@ cc_library(
     srcs = ["queue_base.cc"],
     hdrs = ["queue_base.h"],
     deps = [
+        ":batch_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -350,6 +360,7 @@ cc_library(
     srcs = ["priority_queue.cc"],
     hdrs = ["priority_queue.h"],
     deps = [
+        ":batch_util",
         ":queue_base",
         ":typed_queue",
         "//tensorflow/core:framework",
@@ -576,6 +587,7 @@ cc_library(
         ":extract_image_patches_op",
         ":gather_nd_op",
         ":gather_op",
+        ":guarantee_const_op",
         ":identity_n_op",
         ":identity_op",
         ":inplace_ops",
@@ -593,6 +605,7 @@ cc_library(
         ":reverse_sequence_op",
         ":shape_ops",
         ":slice_op",
+        ":snapshot_op",
         ":split_op",
         ":split_v_op",
         ":strided_slice_op",
@@ -622,6 +635,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS,
 )
 
+tf_kernel_library(
+    name = "guarantee_const_op",
+    prefix = "guarantee_const_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "constant_op",
     prefix = "constant_op",
@@ -783,6 +802,12 @@ tf_kernel_library(
     deps = ARRAY_DEPS + [":strided_slice_op"],
 )
 
+tf_kernel_library(
+    name = "snapshot_op",
+    prefix = "snapshot_op",
+    deps = ARRAY_DEPS,
+)
+
 tf_kernel_library(
     name = "split_op",
     gpu_srcs = ["cuda_device_array.h"],
@@ -916,6 +941,25 @@ tf_cc_test(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "bincount_op_test",
+    size = "small",
+    srcs = ["bincount_op_test.cc"],
+    deps = [
+        ":bincount_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "constant_op_test",
     size = "small",
@@ -1161,6 +1205,25 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "guarantee_const_op_test",
+    size = "small",
+    srcs = ["guarantee_const_op_test.cc"],
+    deps = [
+        ":guarantee_const_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "identity_op_test",
     size = "small",
@@ -1562,7 +1625,10 @@ tf_kernel_library(
 tf_kernel_library(
     name = "random_shuffle_queue_op",
     prefix = "random_shuffle_queue_op",
-    deps = DATA_FLOW_DEPS + ["//tensorflow/core:protos_all_cc"],
+    deps = DATA_FLOW_DEPS + [
+        ":batch_util",
+        "//tensorflow/core:protos_all_cc",
+    ],
 )
 
 tf_kernel_library(
@@ -1601,7 +1667,10 @@ DYNAMIC_DEPS = [
 tf_kernel_library(
     name = "dynamic_partition_op",
     prefix = "dynamic_partition_op",
-    deps = DYNAMIC_DEPS,
+    deps = DYNAMIC_DEPS + [
+        ":fill_functor",
+        ":gather_functor",
+    ] + if_cuda(["@cub_archive//:cub"]),
 )
 
 tf_kernel_library(
@@ -1671,7 +1740,7 @@ tf_kernel_library(
     ],
 )
 
-tf_cc_tests(
+tf_cuda_cc_tests(
     name = "dynamic_op_test",
     size = "small",
     srcs = [
@@ -1682,6 +1751,7 @@ tf_cc_tests(
         ":data_flow",
         ":ops_testutil",
         ":ops_util",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -1712,6 +1782,7 @@ cc_library(
     hdrs = ["fifo_queue.h"],
     visibility = ["//visibility:private"],
     deps = [
+        ":batch_util",
         ":queue_base",
         ":typed_queue",
         "//tensorflow/core:framework",
@@ -1726,6 +1797,7 @@ cc_library(
     hdrs = ["padding_fifo_queue.h"],
     visibility = ["//visibility:private"],
     deps = [
+        ":batch_util",
         ":fifo_queue",
         ":queue_base",
         ":typed_queue",
@@ -2255,6 +2327,7 @@ cc_library(
         ":cholesky_grad",
         ":cholesky_op",
         ":determinant_op",
+        ":matrix_exponential_op",
         ":matrix_inverse_op",
         ":matrix_solve_ls_op",
         ":matrix_solve_op",
@@ -2321,6 +2394,12 @@ tf_kernel_library(
     ]) + LINALG_DEPS,
 )
 
+tf_kernel_library(
+    name = "matrix_exponential_op",
+    prefix = "matrix_exponential_op",
+    deps = LINALG_DEPS,
+)
+
 tf_kernel_library(
     name = "self_adjoint_eig_op",
     prefix = "self_adjoint_eig_op",
@@ -2535,8 +2614,13 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "batch_matmul_op",
+    srcs = [] + if_mkl([
+        "mkl_batch_matmul_op.cc",
+    ]),
     prefix = "batch_matmul_op",
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+    ]),
 )
 
 tf_kernel_library(
@@ -2547,8 +2631,9 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "bucketize_op",
+    gpu_srcs = ["cuda_device_array.h"],
     prefix = "bucketize_op",
-    deps = MATH_DEPS,
+    deps = ARRAY_DEPS,
 )
 
 tf_kernel_library(
@@ -3035,6 +3120,7 @@ cc_library(
         ":batch_norm_op",
         ":bias_op",
         ":conv_ops",
+        ":data_format_ops",
         ":depthwise_conv_grad_op",
         ":depthwise_conv_op",
         ":dilation_ops",
@@ -3072,6 +3158,12 @@ tf_kernel_library(
     deps = NN_DEPS,
 )
 
+tf_kernel_library(
+    name = "data_format_ops",
+    prefix = "data_format_ops",
+    deps = NN_DEPS,
+)
+
 tf_kernel_library(
     name = "bias_op",
     prefix = "bias_op",
@@ -3149,7 +3241,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
-    ],
+    ] + if_cuda(["@cub_archive//:cub"]),
 )
 
 tf_kernel_library(
@@ -3652,7 +3744,9 @@ tf_kernel_library(
 tf_kernel_library(
     name = "sparse_reshape_op",
     prefix = "sparse_reshape_op",
-    deps = SPARSE_DEPS,
+    deps = SPARSE_DEPS + [
+        ":reshape_util",
+    ],
 )
 
 tf_kernel_library(
@@ -3700,7 +3794,10 @@ tf_kernel_library(
 tf_kernel_library(
     name = "serialize_sparse_op",
     prefix = "serialize_sparse_op",
-    deps = SPARSE_DEPS + ["//tensorflow/core:protos_all_cc"],
+    deps = SPARSE_DEPS + [
+        ":reshape_util",
+        "//tensorflow/core:protos_all_cc",
+    ],
 )
 
 tf_kernel_library(
@@ -3855,6 +3952,8 @@ tf_kernel_library(
         "scatter_nd_op_cpu_impl_3.cc",
         "scatter_nd_op_cpu_impl_4.cc",
         "scatter_nd_op_cpu_impl_5.cc",
+        "scatter_nd_op_cpu_impl_6.cc",
+        "scatter_nd_op_cpu_impl_7.cc",
     ],
     hdrs = [
         "scatter_nd_op.h",
@@ -3864,7 +3963,11 @@ tf_kernel_library(
         "scatter_nd_op.h",
         "scatter_nd_op_gpu.cu.cc",
     ],
-    deps = STATE_DEPS + [":dense_update_functor"],
+    deps = STATE_DEPS + [
+        ":dense_update_functor",
+        ":training_op_helpers",
+        ":variable_ops",
+    ],
 )
 
 tf_kernel_library(
@@ -4336,6 +4439,7 @@ filegroup(
     name = "mobile_srcs",
     srcs = [
         "avgpooling_op.h",
+        "batch_util.h",
         "bounds_check.h",
         "cwise_ops.h",
         "cwise_ops_common.h",
@@ -4413,6 +4517,17 @@ filegroup(
         "fill_functor.h",
         "function_ops.cc",
         "gather_functor.h",
+        "gather_nd_op.cc",
+        "gather_nd_op.h",
+        "gather_nd_op_cpu_impl.h",
+        "gather_nd_op_cpu_impl_0.cc",
+        "gather_nd_op_cpu_impl_1.cc",
+        "gather_nd_op_cpu_impl_2.cc",
+        "gather_nd_op_cpu_impl_3.cc",
+        "gather_nd_op_cpu_impl_4.cc",
+        "gather_nd_op_cpu_impl_5.cc",
+        "gather_nd_op_cpu_impl_6.cc",
+        "gather_nd_op_cpu_impl_7.cc",
         "gather_op.cc",
         "identity_n_op.cc",
         "identity_n_op.h",
@@ -4500,12 +4615,17 @@ filegroup(
         "control_flow_ops.h",
         "conv_2d.h",
         "conv_ops.h",
+        "data_format_ops.h",
         "depthtospace_op.h",
         "depthwise_conv_op.h",
         "fake_quant_ops_functor.h",
         "fused_batch_norm_op.h",
         "gemm_functors.h",
         "image_resizer_state.h",
+        "initializable_lookup_table.h",
+        "lookup_table_init_op.h",
+        "lookup_table_op.h",
+        "lookup_util.h",
         "maxpooling_op.h",
         "mfcc.h",
         "mfcc_dct.h",
@@ -4518,10 +4638,12 @@ filegroup(
         "reduction_ops_common.h",
         "relu_op.h",
         "relu_op_functor.h",
+        "reshape_util.h",
         "resize_bilinear_op.h",
         "resize_nearest_neighbor_op.h",
         "reverse_op.h",
         "save_restore_tensor.h",
+        "segment_reduction_ops.h",
         "softplus_op.h",
         "softsign_op.h",
         "spacetobatch_functor.h",
@@ -4571,6 +4693,8 @@ filegroup(
         "cwise_op_div.cc",
         "cwise_op_equal_to_1.cc",
         "cwise_op_equal_to_2.cc",
+        "cwise_op_not_equal_to_1.cc",
+        "cwise_op_not_equal_to_2.cc",
         "cwise_op_exp.cc",
         "cwise_op_floor.cc",
         "cwise_op_floor_div.cc",
@@ -4579,6 +4703,7 @@ filegroup(
         "cwise_op_greater_equal.cc",
         "cwise_op_invert.cc",
         "cwise_op_isfinite.cc",
+        "cwise_op_isnan.cc",
         "cwise_op_left_shift.cc",
         "cwise_op_less.cc",
         "cwise_op_less_equal.cc",
@@ -4604,6 +4729,7 @@ filegroup(
         "cwise_op_squared_difference.cc",
         "cwise_op_sub.cc",
         "cwise_op_tanh.cc",
+        "data_format_ops.cc",
         "decode_wav_op.cc",
         "deep_conv2d.cc",
         "deep_conv2d.h",
@@ -4612,6 +4738,7 @@ filegroup(
         "encode_wav_op.cc",
         "fake_quant_ops.cc",
         "fifo_queue.cc",
+        "fifo_queue_op.cc",
         "fused_batch_norm_op.cc",
         "population_count_op.cc",
         "population_count_op.h",
@@ -4629,13 +4756,18 @@ filegroup(
 filegroup(
     name = "android_extended_ops_group2",
     srcs = [
+        "batch_util.cc",
         "batchtospace_op.cc",
         "ctc_decoder_ops.cc",
         "decode_bmp_op.cc",
         "depthtospace_op.cc",
         "dynamic_stitch_op.cc",
         "in_topk_op.cc",
+        "initializable_lookup_table.cc",
         "logging_ops.cc",
+        "lookup_table_init_op.cc",
+        "lookup_table_op.cc",
+        "lookup_util.cc",
         "lrn_op.cc",
         "maxpooling_op.cc",
         "mfcc.cc",
@@ -4663,6 +4795,7 @@ filegroup(
         "reduction_ops_prod.cc",
         "reduction_ops_sum.cc",
         "relu_op.cc",
+        "reshape_util.cc",
         "resize_bilinear_op.cc",
         "resize_nearest_neighbor_op.cc",
         "restore_op.cc",
@@ -4670,12 +4803,15 @@ filegroup(
         "save_op.cc",
         "save_restore_tensor.cc",
         "save_restore_v2_ops.cc",
+        "segment_reduction_ops.cc",
         "session_ops.cc",
         "softplus_op.cc",
         "softsign_op.cc",
         "spacetobatch_functor.cc",
         "spacetobatch_op.cc",
         "spacetodepth_op.cc",
+        "sparse_fill_empty_rows_op.cc",
+        "sparse_reshape_op.cc",
         "sparse_to_dense_op.cc",
         "spectrogram.cc",
         "spectrogram_op.cc",
@@ -4698,6 +4834,7 @@ filegroup(
         "training_ops.cc",
         "transpose_functor_cpu.cc",
         "transpose_op.cc",
+        "unique_op.cc",
         "warn_about_ints.cc",
         "where_op.cc",
         "xent_op.cc",
@@ -4959,7 +5096,6 @@ tf_cc_test(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//third_party/eigen3",
     ],
@@ -5002,7 +5138,6 @@ tf_cc_binary(
             "//tensorflow/cc:client_session",
             "//tensorflow/core:framework",
             "//tensorflow/core:tensor_testutil",
-            "//tensorflow/core:test_main",
         ],
     }),
 )
@@ -5062,7 +5197,6 @@ cc_binary(
             "//tensorflow/core:tensor_testutil",
             "//tensorflow/core:tensorflow",
             "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
         ],
     }),
 )
@@ -5086,7 +5220,6 @@ tf_cc_test(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5107,7 +5240,6 @@ tf_cc_test(
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5151,7 +5283,6 @@ cc_binary(
             "//tensorflow/core:image_ops_op_lib",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
             "//tensorflow/core:testlib",
         ],
     }),
@@ -5265,7 +5396,6 @@ cc_binary(
             ":quantized_ops",
             "//tensorflow/core:framework",
             "//tensorflow/core:tensor_testutil",
-            "//tensorflow/core:test_main",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:test",
         ],
@@ -5291,7 +5421,6 @@ tf_cc_test(
         "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5403,7 +5532,6 @@ cc_binary(
         "//conditions:default": [
             "//tensorflow/core:framework",
             "//tensorflow/core:tensor_testutil",
-            "//tensorflow/core:test_main",
         ],
     }),
 )
@@ -5424,7 +5552,6 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
@@ -5719,15 +5846,46 @@ tf_mkl_kernel_library(
     ],
 )
 
+cc_library(
+    name = "stats_aggregator",
+    hdrs = ["stats_aggregator.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_aggregator_ops",
+    srcs = ["stats_aggregator_ops.cc"],
+    deps = [
+        ":stats_aggregator",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "batch_util",
+    srcs = ["batch_util.cc"],
+    hdrs = ["batch_util.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "dataset",
     srcs = ["dataset.cc"],
     hdrs = ["dataset.h"],
     deps = [
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/util/tensor_bundle",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -5777,6 +5935,7 @@ tf_kernel_library(
     name = "batch_dataset_op",
     srcs = ["batch_dataset_op.cc"],
     deps = [
+        ":batch_util",
         ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -5948,6 +6107,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -5999,6 +6159,31 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "stats_dataset_ops",
+    srcs = ["stats_dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        ":stats_aggregator",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "random_dataset_op",
+    srcs = ["random_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "range_dataset_op",
     srcs = ["range_dataset_op.cc"],
@@ -6051,6 +6236,7 @@ tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
     deps = [
+        ":batch_util",
         ":dataset",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -6124,6 +6310,7 @@ tf_kernel_library(
     deps = [
         ":dataset",
         ":ops_util",
+        ":stats_aggregator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
@@ -6165,6 +6352,7 @@ tf_kernel_library(
         ":parallel_interleave_dataset_op",
         ":parallel_map_dataset_op",
         ":prefetch_dataset_op",
+        ":random_dataset_op",
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
@@ -6173,6 +6361,8 @@ tf_kernel_library(
         ":skip_dataset_op",
         ":sparse_tensor_slice_dataset_op",
         ":sql_dataset_ops",
+        ":stats_aggregator_ops",
+        ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
         ":tensor_slice_dataset_op",
@@ -6185,11 +6375,11 @@ cc_library(
     srcs = ["summary_interface.cc"],
     hdrs = ["summary_interface.h"],
     deps = [
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
     ],
 )
 
@@ -6211,8 +6401,12 @@ tf_kernel_library(
     srcs = ["summary_kernels.cc"],
     deps = [
         ":summary_interface",
+        "//tensorflow/contrib/tensorboard/db:summary_db_writer",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:summary_ops_op_lib",
+        "//tensorflow/core/lib/db:sqlite",
     ],
 )
 
@@ -6230,3 +6424,31 @@ filegroup(
     ),
     visibility = ["//tensorflow:__subpackages__"],
 )
+
+# Library to link with when compiling the cwise_op kernels directly,
+# e.g. for selective registration.
+# should not be linked by projects that also link the cwise_op library.
+cc_library(
+    name = "cwise_lib",
+    srcs = [
+        "cwise_ops_common.cc",
+        "meta_support.cc",
+        "quantization_utils.cc",
+    ],
+    hdrs = [
+        "cwise_ops.h",
+        "cwise_ops_common.h",
+        "cwise_ops_gpu_common.cu.h",
+        "cwise_ops_gpu_gradients.cu.h",
+        "cwise_ops_gradients.h",
+        "meta_support.h",
+        "quantization_utils.h",
+    ],
+    deps = [
+        ":bounds_check",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+        "@gemmlowp//:gemmlowp",
+    ],
+)
diff --git a/tensorflow/core/kernels/avgpooling_op.cc b/tensorflow/core/kernels/avgpooling_op.cc
index af629d0de8c01029eeff2c9a32733fa229513942..f9180236933d04d707eb1744de3993b9396b3dfa 100644
--- a/tensorflow/core/kernels/avgpooling_op.cc
+++ b/tensorflow/core/kernels/avgpooling_op.cc
@@ -153,7 +153,8 @@ class AvgPoolingOp<GPUDevice, T> : public UnaryOp<T> {
     if (data_format_ == FORMAT_NCHW) {
       DnnPoolingOp<T>::Compute(
           context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
-          stride_, padding_, data_format_, tensor_in, output_shape);
+          stride_, padding_, data_format_, tensor_in, output_shape,
+          /*propagate_nans=*/false);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context,
@@ -408,7 +409,7 @@ class AvgPoolingGradOp<GPUDevice, T> : public OpKernel {
     DnnPoolingGradOp<T>::Compute(
         context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
         stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-        output_shape);
+        output_shape, /*propagate_nans=*/false);
   }
 
  private:
@@ -532,7 +533,7 @@ class AvgPoolingGradOpCustomGPUKernel : public OpKernel {
       DnnPoolingGradOp<T>::Compute(
           context, perftools::gputools::dnn::PoolingMode::kAverage, ksize_,
           stride_, padding_, data_format_, nullptr, nullptr, out_backprop,
-          output_shape);
+          output_shape, /*propagate_nans=*/false);
     }
   }
 
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 3b880a963538b05789e73a9100ec5d5472d3c249..d0bbea9fe27856cc0dedb4570d285bd872741099 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -161,9 +161,11 @@ class Barrier : public ResourceBase {
         component_shape.InsertDim(0, insertion_size);
         Tensor component(ready_tuples[0][i].dtype(), component_shape);
         for (int b = 0; b < insertion_size; ++b) {
-          OP_REQUIRES_OK_ASYNC(ctx, QueueBase::CopyElementToSlice(
-                                        ready_tuples[b][i], &component, b),
-                               callback);
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              batch_util::CopyElementToSlice(std::move(ready_tuples[b][i]),
+                                             &component, b),
+              callback);
         }
         insert_tuple.push_back(component);
       }
diff --git a/tensorflow/core/kernels/batch_dataset_op.cc b/tensorflow/core/kernels/batch_dataset_op.cc
index 04a41451ea5720f32b588fe98fa4e2ccf31828f9..d5f2fd4c194e8056f919441fda570cc7dd81ea96 100644
--- a/tensorflow/core/kernels/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/batch_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
 
 namespace tensorflow {
 
@@ -38,14 +39,14 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
         ctx, batch_size > 0,
         errors::InvalidArgument("Batch size must be greater than zero."));
 
-    *output = new Dataset(batch_size, input);
+    *output = new Dataset(ctx, batch_size, input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(int64 batch_size, const DatasetBase* input)
-        : batch_size_(batch_size), input_(input) {
+    Dataset(OpKernelContext* ctx, int64 batch_size, const DatasetBase* input)
+        : GraphDatasetBase(ctx), batch_size_(batch_size), input_(input) {
       input_->Ref();
 
       // NOTE(mrry): Currently we implement "batch up to" semantics. If
@@ -79,45 +80,19 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       return strings::StrCat("BatchDatasetOp(", batch_size_, ")::Dataset");
     }
 
-   private:
-    // Copies element into the index^th slice of parent (in the 0th dimension).
-    //
-    // TODO(mrry): Reconcile this method with the similar method in
-    // the queue implementation.
-    template <typename T>
-    static Status HandleElementToSlice(const Tensor& element, Tensor* parent,
-                                       int64 index) {
-      if (element.NumElements() !=
-          (parent->NumElements() / parent->dim_size(0))) {
-        TensorShape chip_shape = parent->shape();
-        chip_shape.RemoveDim(0);
-        return errors::InvalidArgument(
-            "HandleElementToSlice Cannot copy slice: number of elements does "
-            "not match. Shapes are: [element]: ",
-            element.shape().DebugString(),
-            ", [parent slice]: ", chip_shape.DebugString());
-      }
-      auto parent_as_matrix = parent->flat_outer_dims<T>();
-      parent_as_matrix.chip(index, 0) = element.flat<T>();
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* batch_size = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, batch_size}, output));
       return Status::OK();
     }
 
-    // Copies element into the index^th slice of parent (in the 0th dimension).
-    static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
-                                     int64 index) {
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleElementToSlice<T>(element, parent, index); \
-  }
-
-      switch (element.dtype()) {
-        TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-#undef HANDLE_TYPE
-        default:
-          return errors::Unimplemented(
-              "CopyElementToSlice Unhandled data type: ", element.dtype());
-      }
-    }
+   private:
 
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -131,9 +106,13 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
         // Each row of `batch_elements` is a tuple of tensors from the
         // input iterator.
         std::vector<std::vector<Tensor>> batch_elements;
-        batch_elements.reserve(dataset()->batch_size_);
         {
           mutex_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          batch_elements.reserve(dataset()->batch_size_);
           *end_of_sequence = false;
           for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
                ++i) {
@@ -142,6 +121,8 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
                                                     end_of_sequence));
             if (!*end_of_sequence) {
               batch_elements.emplace_back(std::move(batch_element_tuple));
+            } else {
+              input_impl_.reset();
             }
           }
         }
@@ -170,8 +151,19 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           // Build the output tuple component by copying one slice
           // from each input element in the batch.
           for (size_t i = 0; i < num_batch_elements; ++i) {
-            TF_RETURN_IF_ERROR(CopyElementToSlice(
-                batch_elements[i][component_index], &batch_component, i));
+            if (batch_elements[i][component_index].shape() !=
+                first_element.shape()) {
+              return errors::InvalidArgument(
+                  "Cannot batch tensors with different shapes in component ",
+                  component_index, ". First element had shape ",
+                  first_element.shape().DebugString(), " and element ", i,
+                  " had shape ",
+                  batch_elements[i][component_index].shape().DebugString(),
+                  ".");
+            }
+            TF_RETURN_IF_ERROR(batch_util::CopyElementToSlice(
+                std::move(batch_elements[i][component_index]), &batch_component,
+                i));
           }
           out_tensors->emplace_back(std::move(batch_component));
         }
@@ -179,6 +171,29 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/batch_matmul_op_complex.cc b/tensorflow/core/kernels/batch_matmul_op_complex.cc
index a58ec027262a0c2fab729d2c434098d2795d1d62..96216764fd46971db47b6a11be622cef63e5d103 100644
--- a/tensorflow/core/kernels/batch_matmul_op_complex.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_complex.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+#if !defined(INTEL_MKL)
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_complex128(REGISTER_BATCH_MATMUL_CPU);
+#endif
 
 #if GOOGLE_CUDA
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_GPU);
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 1900ed8e31483a84e216ea54bd08e6a4558bbfcb..8d155ca62b297a4bf59f62159d6b62b01f777721 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 namespace tensorflow {
 
+#if !defined(INTEL_MKL)
 TF_CALL_float(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_double(REGISTER_BATCH_MATMUL_CPU);
+#endif
 TF_CALL_half(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_int32(REGISTER_BATCH_MATMUL_CPU);
 
diff --git a/tensorflow/core/kernels/batch_util.cc b/tensorflow/core/kernels/batch_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f2df95e2d55ac93f8a934010244dcbd1dcd28c8
--- /dev/null
+++ b/tensorflow/core/kernels/batch_util.cc
@@ -0,0 +1,119 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/batch_util.h"
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace batch_util {
+
+namespace {
+
+Status ValidateInput(const Tensor& parent, const Tensor& element, int64 index) {
+  DCHECK_NE(parent.dim_size(0), 0);
+  DCHECK_GE(index, 0);
+  if (element.NumElements() != (parent.NumElements() / parent.dim_size(0))) {
+    TensorShape chip_shape = parent.shape();
+    chip_shape.RemoveDim(0);
+    return errors::Internal(
+        "ValidateInput Cannot perform copy: number of elements does not match. "
+        " Shapes are: [element]: ",
+        element.shape().DebugString(),
+        ", [parent slice]: ", chip_shape.DebugString());
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status HandleElementToSlice(Tensor element, Tensor* parent, int64 index,
+                            bool /* can_move */) {
+  parent->flat_outer_dims<T>().chip(index, 0) = element.flat<T>();
+  return Status::OK();
+}
+
+template <>
+Status HandleElementToSlice<string>(Tensor element, Tensor* parent, int64 index,
+                                    bool can_move) {
+  auto parent_as_matrix = parent->flat_outer_dims<string>();
+  auto element_flat = element.flat<string>();
+  if (can_move) {
+    for (int64 i = 0; i < element.NumElements(); ++i) {
+      parent_as_matrix(index, i) = std::move(element_flat(i));
+    }
+  } else {
+    parent_as_matrix.chip(index, 0) = element_flat;
+  }
+  return Status::OK();
+}
+
+// TODO(jsimsa): Add HandleElementToSlice<variant> specialization that moves
+// the data when possible.
+
+template <typename T>
+static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
+                                   int64 index) {
+  element->flat<T>() = parent.flat_outer_dims<T>().chip(index, 0);
+  return Status::OK();
+}
+
+}  // namespace
+
+// Copies element into the index^th slice of parent (in the 0th dimension).
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(*parent, element, index));
+
+  bool can_move = element.RefCountIsOne();
+#define HANDLE_TYPE(T)                                                \
+  case DataTypeToEnum<T>::value: {                                    \
+    return HandleElementToSlice<T>(std::move(element), parent, index, \
+                                   can_move);                         \
+  }
+
+  switch (element.dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+    TF_CALL_variant(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
+                                   element.dtype());
+  }
+}
+
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) {
+  TF_RETURN_IF_ERROR(ValidateInput(parent, *element, index));
+
+#define HANDLE_TYPE(T)                                      \
+  case DataTypeToEnum<T>::value: {                          \
+    return HandleSliceToElement<T>(parent, element, index); \
+  }
+
+  switch (parent.dtype()) {
+    TF_CALL_ALL_TYPES(HANDLE_TYPE);
+    TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+    TF_CALL_variant(HANDLE_TYPE);
+#undef HANDLE_TYPE
+    default:
+      return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
+                                   element->dtype());
+  }
+}
+
+}  // namespace batch_util
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batch_util.h b/tensorflow/core/kernels/batch_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..b066e2a5748e6c2e0a63ef7e27a528be99067b83
--- /dev/null
+++ b/tensorflow/core/kernels/batch_util.h
@@ -0,0 +1,38 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace batch_util {
+
+// Copies element into the index^th slice of parent (in the 0th dimension).
+//
+// NOTE(mrry): The `element` argument is taken by value. Use `std::move()`
+// to move the `element` argument into this function, and the implementation
+// may be able to optimize the copy to a move. This is particularly important
+// for DT_STRING tensors.
+Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
+
+// Copies the index^th slice of parent (in the 0th dimension) into element.
+Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index);
+
+}  // namespace batch_util
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_BATCH_UTIL_H_
diff --git a/tensorflow/core/kernels/bincount_op.cc b/tensorflow/core/kernels/bincount_op.cc
index 1cd5943ef3a21d5b18291403c1d6845d0aeb8079..890fa3121bbf719e7aa0d3e2d715ca6449af136b 100644
--- a/tensorflow/core/kernels/bincount_op.cc
+++ b/tensorflow/core/kernels/bincount_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include "tensorflow/core/kernels/bincount_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
@@ -27,46 +28,37 @@ namespace tensorflow {
 
 using thread::ThreadPool;
 
-template <typename T>
-class BincountOp : public OpKernel {
- public:
-  explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor& arr_t = ctx->input(0);
-    const Tensor& size_tensor = ctx->input(1);
-    const Tensor& weights_t = ctx->input(2);
-    int32 size = size_tensor.scalar<int32>()();
-    OP_REQUIRES(
-        ctx, size >= 0,
-        errors::InvalidArgument("size (", size, ") must be non-negative"));
-    const bool has_weights = weights_t.NumElements() > 0;
-    OP_REQUIRES(ctx, !(has_weights && arr_t.shape() != weights_t.shape()),
-                errors::InvalidArgument(
-                    "If weights are passed, they must have the same shape (" +
-                    weights_t.shape().DebugString() + ") as arr (" +
-                    arr_t.shape().DebugString() + ")"));
-    const auto arr = arr_t.flat<int32>();
-    const auto weights = weights_t.flat<T>();
+namespace functor {
+
+template <typename T>
+struct BincountFunctor<CPUDevice, T> {
+  static Status Compute(OpKernelContext* context,
+                        const typename TTypes<int32, 1>::ConstTensor& arr,
+                        const typename TTypes<T, 1>::ConstTensor& weights,
+                        typename TTypes<T, 1>::Tensor& output) {
+    int size = output.size();
 
     Tensor all_nonneg_t;
-    OP_REQUIRES_OK(ctx,
-                   ctx->allocate_temp(DT_BOOL, TensorShape({}), &all_nonneg_t,
-                                      AllocatorAttributes()));
-    all_nonneg_t.scalar<bool>().device(ctx->eigen_cpu_device()) =
+    TF_RETURN_IF_ERROR(context->allocate_temp(
+        DT_BOOL, TensorShape({}), &all_nonneg_t, AllocatorAttributes()));
+    all_nonneg_t.scalar<bool>().device(context->eigen_cpu_device()) =
         (arr >= 0).all();
-    OP_REQUIRES(ctx, all_nonneg_t.scalar<bool>()(),
-                errors::InvalidArgument("Input arr must be non-negative!"));
+    if (!all_nonneg_t.scalar<bool>()()) {
+      return errors::InvalidArgument("Input arr must be non-negative!");
+    }
 
     // Allocate partial output bin sums for each worker thread. Worker ids in
     // ParallelForWithWorkerId range from 0 to NumThreads() inclusive.
     ThreadPool* thread_pool =
-        ctx->device()->tensorflow_cpu_worker_threads()->workers;
+        context->device()->tensorflow_cpu_worker_threads()->workers;
     const int64 num_threads = thread_pool->NumThreads() + 1;
     Tensor partial_bins_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(weights_t.dtype(),
-                                           TensorShape({num_threads, size}),
-                                           &partial_bins_t));
+    TF_RETURN_IF_ERROR(context->allocate_temp(DataTypeToEnum<T>::value,
+                                              TensorShape({num_threads, size}),
+                                              &partial_bins_t));
     auto partial_bins = partial_bins_t.matrix<T>();
     partial_bins.setZero();
     thread_pool->ParallelForWithWorkerId(
@@ -75,7 +67,7 @@ class BincountOp : public OpKernel {
           for (int64 i = start_ind; i < limit_ind; i++) {
             int32 value = arr(i);
             if (value < size) {
-              if (has_weights) {
+              if (weights.size()) {
                 partial_bins(worker_id, value) += weights(i);
               } else {
                 // Complex numbers don't support "++".
@@ -84,25 +76,63 @@ class BincountOp : public OpKernel {
             }
           }
         });
-    TensorShape output_shape({size});
-    Tensor* output_t;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_t));
+
     // Sum the partial bins along the 0th axis.
     Eigen::array<int, 1> reduce_dims({0});
-    output_t->flat<T>().device(ctx->eigen_cpu_device()) =
-        partial_bins.sum(reduce_dims);
+    output.device(context->eigen_cpu_device()) = partial_bins.sum(reduce_dims);
+    return Status::OK();
+  }
+};
+
+}  // namespace functor
+
+template <typename Device, typename T>
+class BincountOp : public OpKernel {
+ public:
+  explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& arr_t = ctx->input(0);
+    const Tensor& size_tensor = ctx->input(1);
+    const Tensor& weights_t = ctx->input(2);
+
+    int32 size = size_tensor.scalar<int32>()();
+    OP_REQUIRES(
+        ctx, size >= 0,
+        errors::InvalidArgument("size (", size, ") must be non-negative"));
+
+    const auto arr = arr_t.flat<int32>();
+    const auto weights = weights_t.flat<T>();
+    Tensor* output_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({size}), &output_t));
+    auto output = output_t->flat<T>();
+    OP_REQUIRES_OK(ctx, functor::BincountFunctor<Device, T>::Compute(
+                            ctx, arr, weights, output));
   }
 };
 
-#define REGISTER(TYPE)                                               \
+#define REGISTER_KERNELS(type)                                       \
   REGISTER_KERNEL_BUILDER(                                           \
-      Name("Bincount").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      BincountOp<TYPE>)
+      Name("Bincount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      BincountOp<CPUDevice, type>)
+
+TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_KERNELS(type)                            \
+  REGISTER_KERNEL_BUILDER(Name("Bincount")                \
+                              .Device(DEVICE_GPU)         \
+                              .HostMemory("size")         \
+                              .TypeConstraint<type>("T"), \
+                          BincountOp<GPUDevice, type>)
 
-TF_CALL_NUMBER_TYPES(REGISTER);
+TF_CALL_int32(REGISTER_KERNELS);
+TF_CALL_float(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
 
-// TODO(ringwalt): Add a GPU implementation. We probably want to take a
-// different approach, e.g. threads in a warp each taking a pass over the same
-// data, and each thread summing a single bin.
+#endif  // GOOGLE_CUDA
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/bincount_op.h b/tensorflow/core/kernels/bincount_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd3d560cd12a4afefa2c58f19fdfee44b8ed2684
--- /dev/null
+++ b/tensorflow/core/kernels/bincount_op.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_BINCOUNT_OP_H_
+#define TENSORFLOW_BINCOUNT_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <typename Device, typename T>
+struct BincountFunctor {
+  static Status Compute(OpKernelContext* context,
+                        const typename TTypes<int32, 1>::ConstTensor& arr,
+                        const typename TTypes<T, 1>::ConstTensor& weights,
+                        typename TTypes<T, 1>::Tensor& output);
+};
+
+}  // end namespace functor
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_BINCOUNT_OP_H_
diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6074b3e1f6f29fbb05b3adff29518b35a2df3b4f
--- /dev/null
+++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "external/cub_archive/cub/device/device_histogram.cuh"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bincount_op.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+
+template <typename T>
+struct BincountFunctor<GPUDevice, T> {
+  static Status Compute(OpKernelContext* context,
+                        const typename TTypes<int32, 1>::ConstTensor& arr,
+                        const typename TTypes<T, 1>::ConstTensor& weights,
+                        typename TTypes<T, 1>::Tensor& output) {
+    if (weights.size() != 0) {
+      return errors::InvalidArgument(
+          "Weights should not be passed as it should be "
+          "handled by unsorted_segment_sum");
+    }
+    if (output.size() == 0) {
+      return Status::OK();
+    }
+    // In case weight.size() == 0, use CUB
+    size_t temp_storage_bytes = 0;
+    const int32* d_samples = arr.data();
+    T* d_histogram = output.data();
+    int num_levels = output.size() + 1;
+    int32 lower_level = 0;
+    int32 upper_level = output.size();
+    int num_samples = arr.size();
+    const cudaStream_t& stream = GetCudaStream(context);
+
+    // The first HistogramEven is to obtain the temp storage size required
+    // with d_temp_storage = NULL passed to the call.
+    auto err = cub::DeviceHistogram::HistogramEven(
+        /* d_temp_storage */ NULL,
+        /* temp_storage_bytes */ temp_storage_bytes,
+        /* d_samples */ d_samples,
+        /* d_histogram */ d_histogram,
+        /* num_levels */ num_levels,
+        /* lower_level */ lower_level,
+        /* upper_level */ upper_level,
+        /* num_samples */ num_samples,
+        /* stream */ stream);
+    if (err != cudaSuccess) {
+      return errors::Internal(
+          "Could not launch HistogramEven to get temp storage: ",
+          cudaGetErrorString(err), ".");
+    }
+    Tensor temp_storage;
+    TF_RETURN_IF_ERROR(context->allocate_temp(
+        DataTypeToEnum<int8>::value,
+        TensorShape({static_cast<int64>(temp_storage_bytes)}), &temp_storage));
+
+    void* d_temp_storage = temp_storage.flat<int8>().data();
+    // The second HistogramEven is to actual run with d_temp_storage
+    // allocated with temp_storage_bytes.
+    err = cub::DeviceHistogram::HistogramEven(
+        /* d_temp_storage */ d_temp_storage,
+        /* temp_storage_bytes */ temp_storage_bytes,
+        /* d_samples */ d_samples,
+        /* d_histogram */ d_histogram,
+        /* num_levels */ num_levels,
+        /* lower_level */ lower_level,
+        /* upper_level */ upper_level,
+        /* num_samples */ num_samples,
+        /* stream */ stream);
+    if (err != cudaSuccess) {
+      return errors::Internal(
+          "Could not launch HistogramEven: ", cudaGetErrorString(err), ".");
+    }
+    return Status::OK();
+  }
+};
+
+}  // end namespace functor
+
+#define REGISTER_GPU_SPEC(type) \
+  template struct functor::BincountFunctor<GPUDevice, type>;
+
+TF_CALL_int32(REGISTER_GPU_SPEC);
+TF_CALL_float(REGISTER_GPU_SPEC);
+#undef REGISTER_GPU_SPEC
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/bincount_op_test.cc b/tensorflow/core/kernels/bincount_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb04b40637a67e5398514f4cdf62ea960a70bf7c
--- /dev/null
+++ b/tensorflow/core/kernels/bincount_op_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+static Graph* Bincount(int arr_size, int nbins) {
+  Graph* g = new Graph(OpRegistry::Global());
+
+  Tensor arr(DT_INT32, TensorShape({arr_size}));
+  arr.flat<int32>() = arr.flat<int32>().setRandom().abs();
+
+  Tensor size(DT_INT32, TensorShape({static_cast<int32>(1)}));
+  size.flat<int32>()(0) = static_cast<int32>(nbins);
+
+  Tensor weights(DT_INT32, TensorShape({0}));
+
+  Node* node;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Bincount")
+                  .Input(test::graph::Constant(g, arr))
+                  .Input(test::graph::Constant(g, size))
+                  .Input(test::graph::Constant(g, weights))
+                  .Attr("T", DT_INT32)
+                  .Finalize(g, &node));
+  return g;
+}
+
+#define BM_BincountDev(K, NBINS, type)                             \
+  static void BM_Bincount##_##type##_##K##_##NBINS(int iters) {    \
+    testing::ItemsProcessed(static_cast<int64>(iters) * K * 1024); \
+    test::Benchmark(#type, Bincount(K * 1024, NBINS)).Run(iters);  \
+  }                                                                \
+  BENCHMARK(BM_Bincount##_##type##_##K##_##NBINS);
+
+BM_BincountDev(32, 1000, cpu);
+BM_BincountDev(32, 2000, cpu);
+BM_BincountDev(32, 5000, cpu);
+BM_BincountDev(64, 1000, cpu);
+BM_BincountDev(64, 2000, cpu);
+BM_BincountDev(64, 5000, cpu);
+BM_BincountDev(128, 1000, cpu);
+BM_BincountDev(128, 2000, cpu);
+BM_BincountDev(128, 5000, cpu);
+
+BM_BincountDev(32, 1000, gpu);
+BM_BincountDev(32, 2000, gpu);
+BM_BincountDev(32, 5000, gpu);
+BM_BincountDev(64, 1000, gpu);
+BM_BincountDev(64, 2000, gpu);
+BM_BincountDev(64, 5000, gpu);
+BM_BincountDev(128, 1000, gpu);
+BM_BincountDev(128, 2000, gpu);
+BM_BincountDev(128, 5000, gpu);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/bucketize_op.cc b/tensorflow/core/kernels/bucketize_op.cc
index 93c2d01221f3b1d36fefa7742762025b96cc5387..4e4b6d52154cd1bacc621535f7dd9c56045a3c57 100644
--- a/tensorflow/core/kernels/bucketize_op.cc
+++ b/tensorflow/core/kernels/bucketize_op.cc
@@ -15,15 +15,42 @@ limitations under the License.
 
 // See docs in ../ops/math_ops.cc.
 
-#include <algorithm>
-#include <vector>
-
+#include "tensorflow/core/kernels/bucketize_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+namespace functor {
+
 template <typename T>
+struct BucketizeFunctor<CPUDevice, T> {
+  // PRECONDITION: boundaries_vector must be sorted.
+  static Status Compute(OpKernelContext* context,
+                        const typename TTypes<T, 1>::ConstTensor& input,
+                        const std::vector<float>& boundaries_vector,
+                        typename TTypes<int32, 1>::Tensor& output) {
+    const int N = input.size();
+    for (int i = 0; i < N; i++) {
+      auto first_bigger_it = std::upper_bound(
+          boundaries_vector.begin(), boundaries_vector.end(), input(i));
+      output(i) = first_bigger_it - boundaries_vector.begin();
+    }
+
+    return Status::OK();
+  }
+};
+
+}  // namespace functor
+
+template <typename Device, typename T>
 class BucketizeOp : public OpKernel {
  public:
   explicit BucketizeOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -34,36 +61,42 @@ class BucketizeOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input_tensor = context->input(0);
-    auto input = input_tensor.flat<T>();
+    const auto input = input_tensor.flat<T>();
+
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                      &output_tensor));
     auto output = output_tensor->template flat<int32>();
-
-    const int N = input.size();
-    for (int i = 0; i < N; i++) {
-      output(i) = CalculateBucketIndex(input(i));
-    }
+    OP_REQUIRES_OK(context, functor::BucketizeFunctor<Device, T>::Compute(
+                                context, input, boundaries_, output));
   }
 
  private:
-  int32 CalculateBucketIndex(const T value) {
-    auto first_bigger_it =
-        std::upper_bound(boundaries_.begin(), boundaries_.end(), value);
-    return first_bigger_it - boundaries_.begin();
-  }
   std::vector<float> boundaries_;
 };
 
 #define REGISTER_KERNEL(T)                                         \
   REGISTER_KERNEL_BUILDER(                                         \
       Name("Bucketize").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      BucketizeOp<T>);
+      BucketizeOp<CPUDevice, T>);
+
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(int64);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+#define REGISTER_KERNEL(T)                                         \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("Bucketize").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      BucketizeOp<GPUDevice, T>);
 
 REGISTER_KERNEL(int32);
 REGISTER_KERNEL(int64);
 REGISTER_KERNEL(float);
 REGISTER_KERNEL(double);
 #undef REGISTER_KERNEL
+#endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/bucketize_op.h b/tensorflow/core/kernels/bucketize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8e461beb941f8092234d02306b683fdda2df451
--- /dev/null
+++ b/tensorflow/core/kernels/bucketize_op.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_BUCKETIZE_OP_H_
+#define TENSORFLOW_BUCKETIZE_OP_H_
+
+#include <vector>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename Device, typename T>
+struct BucketizeFunctor {
+  static Status Compute(OpKernelContext* context,
+                        const typename TTypes<T, 1>::ConstTensor& input,
+                        const std::vector<float>& boundaries_vector,
+                        typename TTypes<int32, 1>::Tensor& output);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_BUCKETIZE_OP_H_
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..325dee793b3eef4e045e2b3d5ad2f96dbf3943d8
--- /dev/null
+++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
@@ -0,0 +1,101 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/bucketize_op.h"
+#include "tensorflow/core/kernels/cuda_device_array.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename T>
+__global__ void BucketizeCustomKernel(
+    const int32 size_in, const T* in, const int32 size_boundaries,
+    CudaDeviceArrayStruct<float> boundaries_array, int32* out) {
+  const float* boundaries = GetCudaDeviceArrayOnDevice(&boundaries_array);
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in[i];
+    int32 bucket = 0;
+    int32 count = size_boundaries;
+    while (count > 0) {
+      int32 l = bucket;
+      int32 step = count / 2;
+      l += step;
+      if (!(value < static_cast<T>(boundaries[l]))) {
+        bucket = ++l;
+        count -= step + 1;
+      } else {
+        count = step;
+      }
+    }
+    out[i] = bucket;
+  }
+}
+
+namespace functor {
+
+template <typename T>
+struct BucketizeFunctor<GPUDevice, T> {
+  // PRECONDITION: boundaries_vector must be sorted.
+  static Status Compute(OpKernelContext* context,
+                        const typename TTypes<T, 1>::ConstTensor& input,
+                        const std::vector<float>& boundaries_vector,
+                        typename TTypes<int32, 1>::Tensor& output) {
+    const GPUDevice& d = context->eigen_device<GPUDevice>();
+
+    CudaDeviceArrayOnHost<float> boundaries_array(context,
+                                                  boundaries_vector.size());
+    TF_RETURN_IF_ERROR(boundaries_array.Init());
+    for (int i = 0; i < boundaries_vector.size(); ++i) {
+      boundaries_array.Set(i, boundaries_vector[i]);
+    }
+    TF_RETURN_IF_ERROR(boundaries_array.Finalize());
+
+    CudaLaunchConfig config = GetCudaLaunchConfig(input.size(), d);
+    BucketizeCustomKernel<T>
+        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+            input.size(), input.data(), boundaries_vector.size(),
+            boundaries_array.data(), output.data());
+
+    return Status::OK();
+  }
+};
+}  // namespace functor
+
+#define REGISTER_GPU_SPEC(type) \
+  template struct functor::BucketizeFunctor<GPUDevice, type>;
+
+REGISTER_GPU_SPEC(int32);
+REGISTER_GPU_SPEC(int64);
+REGISTER_GPU_SPEC(float);
+REGISTER_GPU_SPEC(double);
+#undef REGISTER_GPU_SPEC
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/captured_function.cc b/tensorflow/core/kernels/captured_function.cc
index 00cdc1eff2d3003cb55e868389033f8504e01588..5ef331e5921731a6e44db01f99187d0e3588d608 100644
--- a/tensorflow/core/kernels/captured_function.cc
+++ b/tensorflow/core/kernels/captured_function.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_handle.pb_text.h"
 #include "tensorflow/core/kernels/dataset.h"
 #include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -113,70 +114,220 @@ Status CapturedFunction::Create(
   FunctionLibraryRuntime::Handle f_handle;
   TF_RETURN_IF_ERROR(
       lib->Instantiate(func.name(), AttrSlice(&func.attr()), &f_handle));
+  const FunctionBody* fbody = lib->GetFunctionBody(f_handle);
+  if (fbody == nullptr) {
+    return errors::Internal("Failed to instantiate function body.");
+  }
 
   out_function->reset(new CapturedFunction(
       device, std::move(device_mgr), std::move(flib_def), std::move(pflr), lib,
-      f_handle, std::move(captured_inputs)));
+      f_handle, std::move(captured_inputs), fbody->ret_types));
   return Status::OK();
 }
 
+namespace {
+class CallFrameBase : public CallFrameInterface {
+ public:
+  explicit CallFrameBase(DataTypeSlice ret_types)
+      : ret_types_(ret_types), retvals_(ret_types.size()) {}
+
+  // Caller methods.
+  Status ConsumeRetvals(std::vector<Tensor>* retvals) {
+    retvals->reserve(retvals_.size());
+    int i = 0;
+    for (auto&& val : retvals_) {
+      if (!val) {
+        return errors::Internal("No return value for index ", i, ".");
+      }
+      retvals->emplace_back(std::move(val.value()));
+      ++i;
+    }
+    return Status::OK();
+  }
+
+  size_t num_retvals() const override { return retvals_.size(); }
+
+  // Callee methods.
+  Status SetRetval(int index, const Tensor& val) override {
+    if (index < retvals_.size() && val.dtype() == ret_types_[index] &&
+        !retvals_[index]) {
+      retvals_[index] = val;
+      return Status::OK();
+    } else if (index >= retvals_.size()) {
+      return errors::InvalidArgument("Return value ", index,
+                                     " is out of range.");
+    } else if (val.dtype() != ret_types_[index]) {
+      return errors::InvalidArgument("Expected type ",
+                                     DataTypeString(ret_types_[index]),
+                                     " for return value ", index, " but got ",
+                                     DataTypeString(val.dtype()), ".");
+    } else {
+      return errors::Internal("Attempted to set return value ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  DataTypeSlice ret_types_;
+  std::vector<gtl::optional<Tensor>> retvals_;
+  TF_DISALLOW_COPY_AND_ASSIGN(CallFrameBase);
+};
+
+class OwnedArgsCallFrame : public CallFrameBase {
+ public:
+  OwnedArgsCallFrame(std::vector<Tensor>&& args,
+                     const std::vector<Tensor>* captured_inputs,
+                     DataTypeSlice ret_types)
+      : CallFrameBase(ret_types),
+        args_(std::move(args)),
+        captured_inputs_(captured_inputs) {}
+
+  size_t num_args() const override {
+    return args_.size() + captured_inputs_->size();
+  }
+
+  // Callee methods.
+  Status GetArg(int index, Tensor* val) const override {
+    if (index < args_.size() && args_[index].IsInitialized()) {
+      // TODO(mrry): Consider making `CallFrameInterface::GetArg` non-const in
+      // order to be able to `std::move(args_[index])` into `*val`.
+      *val = args_[index];
+      return Status::OK();
+    } else if (index < args_.size() + captured_inputs_->size()) {
+      *val = (*captured_inputs_)[index - args_.size()];
+      return Status::OK();
+    } else if (index >= args_.size() + captured_inputs_->size()) {
+      return errors::InvalidArgument("Argument ", index, " is out of range.");
+    } else {
+      return errors::Internal("Attempted to get argument ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  std::vector<Tensor> args_;
+  const std::vector<Tensor>* const captured_inputs_;  // Not owned.
+};
+
+class BorrowedArgsCallFrame : public CallFrameBase {
+ public:
+  BorrowedArgsCallFrame(const std::vector<Tensor>& args,
+                        const std::vector<Tensor>* captured_inputs,
+                        DataTypeSlice ret_types)
+      : CallFrameBase(ret_types),
+        args_(args),
+        captured_inputs_(captured_inputs) {}
+
+  size_t num_args() const override {
+    return args_.size() + captured_inputs_->size();
+  }
+
+  // Callee methods.
+  Status GetArg(int index, Tensor* val) const override {
+    if (index < args_.size() && args_[index].IsInitialized()) {
+      *val = args_[index];
+      return Status::OK();
+    } else if (index < args_.size() + captured_inputs_->size()) {
+      *val = (*captured_inputs_)[index - args_.size()];
+      return Status::OK();
+    } else if (index >= args_.size() + captured_inputs_->size()) {
+      return errors::InvalidArgument("Argument ", index, " is out of range.");
+    } else {
+      return errors::Internal("Attempted to get argument ", index,
+                              " more than once.");
+    }
+  }
+
+ private:
+  const std::vector<Tensor>& args_;                   // Not owned.
+  const std::vector<Tensor>* const captured_inputs_;  // Not owned.
+};
+
+}  // namespace
+
 Status CapturedFunction::Run(FunctionLibraryRuntime::Options f_opts,
-                             gtl::ArraySlice<Tensor> args,
+                             std::vector<Tensor>&& args,
                              std::vector<Tensor>* rets) {
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
+  auto c_mgr = new CancellationManager;
+  auto frame =
+      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
+  f_opts.cancellation_manager = c_mgr;
   Notification n;
   Status s;
-  auto done_callback = [&n, &s](Status func_status) {
-    s.Update(func_status);
-    n.Notify();
-  };
+  lib_->Run(f_opts, f_handle_, frame,
+            [rets, c_mgr, frame, &n, &s](Status func_status) {
+              delete c_mgr;
+              s.Update(func_status);
+              if (s.ok()) {
+                s = frame->ConsumeRetvals(rets);
+              }
+              delete frame;
+              n.Notify();
+            });
+  n.WaitForNotification();
+  return s;
+}
+
+Status CapturedFunction::RunWithBorrowedArgs(
+    FunctionLibraryRuntime::Options f_opts, const std::vector<Tensor>& args,
+    std::vector<Tensor>* rets) {
   // TODO(mrry): Add cancellation manager support to IteratorContext
   // so that we can cancel running map functions. The local
   // cancellation manager here is created so that we can run kernels
-  // (such as queue kernels) that depend on the non-nullness
+  // (such as queue kernels) that depend on the non-nullness of
   // `OpKernelContext::cancellation_manager()`, but additional effort
   // will be required to plumb it through the `IteratorContext`.
-  CancellationManager c_mgr;
-  f_opts.cancellation_manager = &c_mgr;
-  RunHelper(std::move(f_opts), args, rets, std::move(done_callback));
+  auto c_mgr = new CancellationManager;
+  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  f_opts.cancellation_manager = c_mgr;
+  Notification n;
+  Status s;
+  lib_->Run(f_opts, f_handle_, &frame,
+            [rets, c_mgr, &frame, &n, &s](Status func_status) {
+              delete c_mgr;
+              s.Update(func_status);
+              if (s.ok()) {
+                s = frame.ConsumeRetvals(rets);
+              }
+              n.Notify();
+            });
   n.WaitForNotification();
   return s;
 }
 
 void CapturedFunction::RunAsync(FunctionLibraryRuntime::Options f_opts,
-                                gtl::ArraySlice<Tensor> args,
+                                std::vector<Tensor>&& args,
                                 std::vector<Tensor>* rets,
                                 FunctionLibraryRuntime::DoneCallback done) {
+  // TODO(mrry): Add cancellation manager support to IteratorContext
+  // so that we can cancel running map functions. The local
+  // cancellation manager here is created so that we can run kernels
+  // (such as queue kernels) that depend on the non-nullness of
+  // `OpKernelContext::cancellation_manager()`, but additional effort
+  // will be required to plumb it through the `IteratorContext`.
   auto c_mgr = new CancellationManager;
+  auto frame =
+      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
   f_opts.cancellation_manager = c_mgr;
-  FunctionLibraryRuntime::DoneCallback wrapped_done = std::bind(
-      [c_mgr](FunctionLibraryRuntime::DoneCallback done,
-              // Begin unbound arguments.
-              Status s) {
-        delete c_mgr;
-        done(s);
-      },
-      std::move(done), std::placeholders::_1);
-  RunHelper(std::move(f_opts), args, rets, std::move(wrapped_done));
-}
-
-void CapturedFunction::RunHelper(FunctionLibraryRuntime::Options f_opts,
-                                 gtl::ArraySlice<Tensor> args,
-                                 std::vector<Tensor>* rets,
-                                 FunctionLibraryRuntime::DoneCallback done) {
-  // TODO(mrry): Implement a synchronous version of
-  // FunctionLibraryRuntime::Run() that avoids a context switch for small
-  // functions.
-  if (captured_inputs_.empty()) {
-    lib_->Run(f_opts, f_handle_, args, rets, std::move(done));
-  } else {
-    std::vector<Tensor> args_with_captured;
-    args_with_captured.reserve(args.size() + captured_inputs_.size());
-    args_with_captured.insert(args_with_captured.end(), args.begin(),
-                              args.end());
-    args_with_captured.insert(args_with_captured.end(),
-                              captured_inputs_.begin(), captured_inputs_.end());
-    lib_->Run(f_opts, f_handle_, args_with_captured, rets, std::move(done));
-  }
+  lib_->Run(f_opts, f_handle_, frame,
+            std::bind(
+                [rets, c_mgr, frame](FunctionLibraryRuntime::DoneCallback done,
+                                     // Begin unbound arguments.
+                                     Status s) {
+                  delete c_mgr;
+                  if (s.ok()) {
+                    s = frame->ConsumeRetvals(rets);
+                  }
+                  delete frame;
+                  done(s);
+                },
+                std::move(done), std::placeholders::_1));
 }
 
 CapturedFunction::CapturedFunction(
@@ -184,13 +335,14 @@ CapturedFunction::CapturedFunction(
     std::unique_ptr<FunctionLibraryDefinition> flib_def,
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
     FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
-    std::vector<Tensor> captured_inputs)
+    std::vector<Tensor> captured_inputs, DataTypeSlice ret_types)
     : device_(device),
       device_mgr_(std::move(device_mgr)),
       flib_def_(std::move(flib_def)),
       pflr_(std::move(pflr)),
       lib_(lib),
       f_handle_(f_handle),
-      captured_inputs_(std::move(captured_inputs)) {}
+      captured_inputs_(std::move(captured_inputs)),
+      ret_types_(ret_types) {}
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/captured_function.h b/tensorflow/core/kernels/captured_function.h
index 55d337d707549d73813349b43bda5fcf7c809e97..c10472dde03b6c0bc89da74ba7bff90a537e287a 100644
--- a/tensorflow/core/kernels/captured_function.h
+++ b/tensorflow/core/kernels/captured_function.h
@@ -60,17 +60,38 @@ class CapturedFunction {
                        std::vector<Tensor> captured_inputs,
                        std::unique_ptr<CapturedFunction>* out_function);
 
-  Status Run(FunctionLibraryRuntime::Options f_opts,
-             gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets);
-
+  // Synchronously runs the captured function on the given `args`, and stores
+  // the results in `*rets`. This method takes ownership of the tensors in
+  // `args`, in order to be able to deallocate them as early as possible.
+  // Use `RunWithBorrowedArgs()` if the caller needs to retain ownership of
+  // the `args`.
+  Status Run(FunctionLibraryRuntime::Options f_opts, std::vector<Tensor>&& args,
+             std::vector<Tensor>* rets);
+
+  // Synchronously runs the captured function on the given `args`, and stores
+  // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
+  // possible.
+  Status RunWithBorrowedArgs(FunctionLibraryRuntime::Options f_opts,
+                             const std::vector<Tensor>& args,
+                             std::vector<Tensor>* rets);
+
+  // Asynchronously runs the captured function on the given `args`, stores
+  // the results in `*rets`, and calls the given `done` callback when the
+  // function returns. This method takes ownership of the tensors in `args`,
+  // in order to be able to deallocate them as early as possible.
   void RunAsync(FunctionLibraryRuntime::Options f_opts,
-                gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
+                std::vector<Tensor>&& args, std::vector<Tensor>* rets,
                 FunctionLibraryRuntime::DoneCallback done);
 
-  const Device* device() const { return device_; }
-
+  // Returns a borrowed pointer to the `ResourceManager` used when this
+  // function is run.
   ResourceMgr* resource_manager() const { return device_->resource_manager(); }
 
+  // Returns that additional captured inputs that will be passed to the function
+  // when `Run*()` is called.
+  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
+
+  // Returns a step ID for use when running a `CapturedFunction`.
   static int64 generate_step_id() {
     // Choose a step ID that is guaranteed not to clash with any
     // Session-generated step ID. DirectSession only generates
@@ -86,11 +107,8 @@ class CapturedFunction {
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                    FunctionLibraryRuntime* lib,
                    FunctionLibraryRuntime::Handle f_handle,
-                   std::vector<Tensor> captured_inputs);
-
-  void RunHelper(FunctionLibraryRuntime::Options f_opts,
-                 gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
-                 FunctionLibraryRuntime::DoneCallback done);
+                   std::vector<Tensor> captured_inputs,
+                   DataTypeSlice ret_types);
 
   Device* const device_;  // owned by device_mgr_.
   const std::unique_ptr<DeviceMgr> device_mgr_;
@@ -99,6 +117,7 @@ class CapturedFunction {
   FunctionLibraryRuntime* const lib_;  // owned by pflr_.
   const FunctionLibraryRuntime::Handle f_handle_;
   const std::vector<Tensor> captured_inputs_;
+  DataTypeSlice ret_types_;  // owned by pflr_.
 
   TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
 };
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 7d3e0cbe3dc88477db8dbe048386f5f1a5971c74..8fedf2c271c2caf60a83fb1f4146dd94821c4643 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -128,10 +128,10 @@ struct scalar_cast_op<::tensorflow::bfloat16, float> {
     float ret;
     uint16_t* p = reinterpret_cast<uint16_t*>(&ret);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    p[0] = a.value;  
-    p[1] = 0;  
-#else  
-    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");  
+    p[0] = a.value;
+    p[1] = 0;
+#else
+    static_assert(::tensorflow::port::kLittleEndian, "Not a little endian system!");
     p[0] = 0;
     p[1] = a.value;
 #endif
diff --git a/tensorflow/core/kernels/check_numerics_op.cc b/tensorflow/core/kernels/check_numerics_op.cc
index 56cb50d2d181deb15570bfb269ae5ead03d20030..534527c6bdc9ab971cd4c6001dcef8ee59a13a8d 100644
--- a/tensorflow/core/kernels/check_numerics_op.cc
+++ b/tensorflow/core/kernels/check_numerics_op.cc
@@ -168,10 +168,10 @@ class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
                      abnormal_detected_host, context, done]() {
       ::perftools::gputools::cuda::ScopedActivateExecutorContext
           scoped_activation{stream->parent()};
-
       auto abnormal_detected_host_flat = abnormal_detected_host.flat<int>();
       int is_nan = abnormal_detected_host_flat(0);
       int is_inf = abnormal_detected_host_flat(1);
+      abnormal_detected_ref.Unref();
       if (is_nan || is_inf) {
         string status;
         LOG(ERROR) << "abnormal_detected_host @"
diff --git a/tensorflow/core/kernels/concat_lib_cpu.cc b/tensorflow/core/kernels/concat_lib_cpu.cc
index 258ce1545607c026d2c2985ef0760c32728fa17f..743e3acfd5c415a72eb70690f9692c961733c34f 100644
--- a/tensorflow/core/kernels/concat_lib_cpu.cc
+++ b/tensorflow/core/kernels/concat_lib_cpu.cc
@@ -73,12 +73,17 @@ REGISTER(quint16)
 REGISTER(qint16)
 REGISTER(qint32)
 REGISTER(bfloat16)
+TF_CALL_variant(REGISTER)
 
-#if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
-// Primarily used for SavedModel support on mobile.
-REGISTER(string);
+#if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION) && \
+    !defined(__ANDROID_TYPES_FULL__)
+    // Primarily used for SavedModel support on mobile. Registering it here only
+    // if __ANDROID_TYPES_FULL__ is not defined (which already registers string)
+    // to avoid duplicate registration.
+    REGISTER(string);
 #endif  // defined(IS_MOBILE_PLATFORM) &&
-        // !defined(SUPPORT_SELECTIVE_REGISTRATION)
+        // !defined(SUPPORT_SELECTIVE_REGISTRATION) &&
+        // !defined(__ANDROID_TYPES_FULL__)
 
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T>
diff --git a/tensorflow/core/kernels/concatenate_dataset_op.cc b/tensorflow/core/kernels/concatenate_dataset_op.cc
index 711c234129f7ca52667ca49600c35e2c8005652c..ad78ba01869a862d496d66b8dcac1243cf09fe84 100644
--- a/tensorflow/core/kernels/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/concatenate_dataset_op.cc
@@ -79,13 +79,13 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
     string DebugString() override { return "ConcatenateDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(input_, &input_graph));
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph));
       Node* to_concatenate_graph = nullptr;
       TF_RETURN_IF_ERROR(
-          b->AddParentDataset(to_concatenate_, &to_concatenate_graph));
+          b->AddParentDataset(ctx, to_concatenate_, &to_concatenate_graph));
       TF_RETURN_IF_ERROR(
           b->AddDataset(this, {input_graph, to_concatenate_graph}, output));
       return Status::OK();
@@ -104,6 +104,10 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         while (i_ < 2) {
           TF_RETURN_IF_ERROR(
               input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -140,7 +144,9 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
         } else if (i_ == 2) {
           input_impl_.reset();
         }
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        }
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 05ee855daee8a7ffe4730ec4a18c65a7bd91733a..27db6ee78533c59f26f538bc59956e50c6111ee7 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -162,10 +162,12 @@ class ConditionalAccumulatorBase : public ResourceBase {
  * function can get an indication that a failure has occurred.
 */
 #define OP_REQUIRES_BOOLEAN(CTX, EXP, STATUS) \
-  if (!TF_PREDICT_TRUE(EXP)) {                \
-    (CTX)->CtxFailure((STATUS));              \
-    return false;                             \
-  }
+  do {                                        \
+    if (!TF_PREDICT_TRUE(EXP)) {              \
+      (CTX)->CtxFailure((STATUS));            \
+      return false;                           \
+    }                                         \
+  } while (0)
 
 #define OP_REQUIRES_OK_BOOLEAN(CTX, STATUS) \
   do {                                      \
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index 3d2bb57aff6b7c4a1de2f9221aea4b384fea45c3..1791c510966771f89d029dbc36a231d97daf2eff 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -194,7 +194,23 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current Eigen and libxsmm implementations do not "
+                    "yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -262,6 +278,7 @@ class Conv2DFastBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -290,7 +307,23 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current libxsmm and customized CPU implementations do "
+                    "not yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -459,6 +492,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -510,10 +544,30 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    int stride_h = GetTensorDim(strides_, data_format_, 'H');
+    int stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
@@ -546,13 +600,16 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     // do not support striding on the batch or depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
-              stride_rows, stride_cols, padding_, filter_backprop,
-              data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              filter_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   bool use_cudnn_;
@@ -566,38 +623,46 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 template <typename T>
 void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& out_backprop, const Tensor& input, int row_stride,
-    int col_stride, const Padding& padding, Tensor* filter_backprop,
-    TensorFormat data_format) {
+    const Tensor& out_backprop, const Tensor& input, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* filter_backprop, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
 
+  std::vector<int32> dilations(4, 1);
+  dilations[GetTensorDimIndex(data_format, 'H')] = row_dilation;
+  dilations[GetTensorDimIndex(data_format, 'W')] = col_dilation;
+
   std::vector<int32> strides(4, 1);
   strides[GetTensorDimIndex(data_format, 'H')] = row_stride;
   strides[GetTensorDimIndex(data_format, 'W')] = col_stride;
   TensorShape filter_shape = filter_backprop->shape();
 
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
                           "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
                           input.shape(), filter_shape, out_backprop.shape(),
-                          strides, padding, data_format, &dims));
+                          dilations, strides, padding, data_format, &dims));
 
+  // TODO(yangzihao): The padding computations should be done in
+  // GetWindowedOutputSize() functions.
   const int padding_rows =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
                                      dims.spatial_dims[0].stride +
-                                 dims.spatial_dims[0].filter_size -
-                                 dims.spatial_dims[0].input_size);
+                                 (dims.spatial_dims[0].filter_size - 1) *
+                                     dims.spatial_dims[0].dilation +
+                                 1 - dims.spatial_dims[0].input_size);
   const int padding_cols =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
                                      dims.spatial_dims[1].stride +
-                                 dims.spatial_dims[1].filter_size -
-                                 dims.spatial_dims[1].input_size);
+                                 (dims.spatial_dims[1].filter_size - 1) *
+                                     dims.spatial_dims[1].dilation +
+                                 1 - dims.spatial_dims[1].input_size);
 
   // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
   // calling it when that is true. Remove this check when (if?) cuDNN starts
@@ -730,7 +795,9 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride)
+  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
+      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
+      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -821,6 +888,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.out_depth,                        // out_depths
       {{dims.spatial_dims[0].filter_size,    // filter_rows
         dims.spatial_dims[1].filter_size}},  // filter_cols
+      {{dims.spatial_dims[0].dilation,       // dilation_rows
+        dims.spatial_dims[1].dilation}},     // dilation_cols
       {{dims.spatial_dims[0].stride,         // stride_rows
         dims.spatial_dims[1].stride}},       // stride_cols
       {{padding_rows,                        // padding_rows
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index d28f6b4d107647f8e2dc232dc5477cd7ee37f696..736241a029353b5872e243ce9205ff6cde2285d9 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -198,7 +198,23 @@ class Conv2DFastBackpropInputOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] && dilations_[3]),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current Eigen and libxsmm implementations do not "
+                    "yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -268,6 +284,7 @@ class Conv2DFastBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -296,7 +313,23 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
+    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
+                errors::InvalidArgument(
+                    "Current libxsmm and customized CPU implementations do "
+                    "not yet support dilation rates larger than 1."));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -532,6 +565,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
@@ -586,10 +620,30 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
                                         "specify 4 dimensions"));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    int stride_h = GetTensorDim(strides_, data_format_, 'H');
+    int stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
+    int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
@@ -622,12 +676,16 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
     // do not support striding on the batch or depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter,
-              stride_rows, stride_cols, padding_, in_backprop, data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              in_backprop, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
   bool use_cudnn_;
@@ -641,39 +699,48 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 template <typename T>
 void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& out_backprop, const Tensor& filter, int row_stride,
-    int col_stride, const Padding& padding, Tensor* in_backprop,
-    TensorFormat data_format) {
+    const Tensor& out_backprop, const Tensor& filter, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* in_backprop, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
 
   std::vector<int32> strides(4, 1);
-  strides[GetTensorDimIndex(data_format, 'H')] = row_stride;
-  strides[GetTensorDimIndex(data_format, 'W')] = col_stride;
+  std::vector<int32> dilations(4, 1);
+  auto input_h = GetTensorDimIndex(data_format, 'H');
+  auto input_w = GetTensorDimIndex(data_format, 'W');
+  strides[input_h] = row_stride;
+  strides[input_w] = col_stride;
+  dilations[input_h] = row_dilation;
+  dilations[input_w] = col_dilation;
   TensorShape input_shape = in_backprop->shape();
 
   const TensorShape& filter_shape = filter.shape();
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensions(
+  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
                           "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2,
                           input_shape, filter_shape, out_backprop.shape(),
-                          strides, padding, data_format, &dims));
+                          dilations, strides, padding, data_format, &dims));
 
+  // TODO(yangzihao): The padding computations should be done in
+  // GetWindowedOutputSize() functions.
   const int padding_rows =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
                                      dims.spatial_dims[0].stride +
-                                 dims.spatial_dims[0].filter_size -
-                                 dims.spatial_dims[0].input_size);
+                                 (dims.spatial_dims[0].filter_size - 1) *
+                                     dims.spatial_dims[0].dilation +
+                                 1 - dims.spatial_dims[0].input_size);
   const int padding_cols =
       (padding == VALID)
           ? 0
           : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
                                      dims.spatial_dims[1].stride +
-                                 dims.spatial_dims[1].filter_size -
-                                 dims.spatial_dims[1].input_size);
+                                 (dims.spatial_dims[1].filter_size - 1) *
+                                     dims.spatial_dims[1].dilation +
+                                 1 - dims.spatial_dims[1].input_size);
 
   // TODO(keveman): cuDNN only supports equal padding on both sides, so only
   // calling it when that is true. Remove this check when (if?) cuDNN starts
@@ -789,7 +856,9 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       .set_input_feature_map_count(dims.in_depth)
       .set_output_feature_map_count(dims.out_depth);
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(dims.spatial_dims[0].stride)
+  conv_desc.set_vertical_dilation_rate(dims.spatial_dims[0].dilation)
+      .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
+      .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -875,6 +944,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       dims.out_depth,                        // out_depths
       {{dims.spatial_dims[0].filter_size,    // filter_rows
         dims.spatial_dims[1].filter_size}},  // filter_cols
+      {{dims.spatial_dims[0].dilation,       // dilation_rows
+        dims.spatial_dims[1].dilation}},     // dilation_cols
       {{dims.spatial_dims[0].stride,         // stride_rows
         dims.spatial_dims[1].stride}},       // stride_cols
       {{padding_rows,                        // padding_rows
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index e068fb86848f93a4c826e1b19fc85790ab2500a4..535586d53ac916808a22a6ea55577b3be43321f9 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -175,15 +175,17 @@ template <typename Device, typename T>
 struct LaunchConv2DBackpropInputOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
-                  int row_stride, int col_stride, const Padding& padding,
-                  Tensor* in_backprop, TensorFormat data_format);
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding, Tensor* in_backprop,
+                  TensorFormat data_format);
 };
 
 template <typename Device, typename T>
 struct LaunchConv2DBackpropFilterOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
-                  int row_stride, int col_stride, const Padding& padding,
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 
@@ -191,8 +193,9 @@ struct LaunchConv2DBackpropFilterOp {
 template <typename T>
 struct LaunchConv2DBackpropInputOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -200,7 +203,8 @@ template <typename T>
 struct LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& input,
-                  int row_stride, int col_stride, const Padding& padding,
+                  int row_dilation, int col_dilation, int row_stride,
+                  int col_stride, const Padding& padding,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 21f5cb17168f7b3ed8f7129be4d4cf5ee7ba7cef..3650ab53b2533e3c95a764ead2d1318c4006c9e7 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -236,6 +236,7 @@ class Conv3DBackpropInputOp : public OpKernel {
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("Conv3DBackpropInputV2").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       Conv3DBackpropInputOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 #undef REGISTER_CPU_KERNEL
@@ -383,6 +384,7 @@ class Conv3DBackpropFilterOp : public OpKernel {
                               .Device(DEVICE_CPU)                             \
                               .TypeConstraint<T>("T"),                        \
                           Conv3DBackpropFilterOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 #undef REGISTER_CPU_KERNEL
@@ -409,6 +411,7 @@ namespace functor {
       const std::array<int, 3>& padding_right,                        \
       typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
 
+DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
@@ -642,6 +645,9 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
+        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+        // conv is supported.
+        /*dilation=*/{{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1008,6 +1014,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         {{input_size[0], input_size[1], input_size[2]}},
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
+        {{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{padding_planes, padding_rows, padding_cols}},
         dtype,
@@ -1098,22 +1105,27 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
   bool cudnn_use_autotune_;
 };
 
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<float>("T"),
-    Conv3DBackpropInputOp<GPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2")
-                            .Device(DEVICE_GPU)
-                            .TypeConstraint<float>("T")
-                            .HostMemory("input_sizes"),
-                        Conv3DBackpropInputOp<GPUDevice, float>);
-REGISTER_KERNEL_BUILDER(
-    Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<float>("T"),
-    Conv3DBackpropFilterOp<GPUDevice, float>);
-REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2")
-                            .Device(DEVICE_GPU)
-                            .TypeConstraint<float>("T")
-                            .HostMemory("filter_sizes"),
-                        Conv3DBackpropFilterOp<GPUDevice, float>);
+#define REGISTER_GPU_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("Conv3DBackpropInput").Device(DEVICE_GPU).TypeConstraint<T>("T"),  \
+      Conv3DBackpropInputOp<GPUDevice, T>);                                   \
+  REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropInputV2")                       \
+                              .Device(DEVICE_GPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .HostMemory("input_sizes"),                     \
+                          Conv3DBackpropInputOp<GPUDevice, T>);               \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("Conv3DBackpropFilter").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      Conv3DBackpropFilterOp<GPUDevice, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("Conv3DBackpropFilterV2")                      \
+                              .Device(DEVICE_GPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .HostMemory("filter_sizes"),                    \
+                          Conv3DBackpropFilterOp<GPUDevice, T>);
+TF_CALL_half(REGISTER_GPU_KERNEL);
+TF_CALL_float(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index bb67113fb003ea58e2fb12ae6d79f02251cd3c3d..ba40c428e4612af15b477ac259de6f43a7ab8012 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -112,7 +112,8 @@ struct LaunchGeneric {
 template <typename T>
 struct LaunchConv2DOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
+                  const Tensor& input, const Tensor& filter,
+                  int /*row_dilation*/, int /*col_dilation*/, int row_stride,
                   int col_stride, const Padding& padding, Tensor* output,
                   TensorFormat data_format) {
     if (data_format != FORMAT_NHWC) {
@@ -133,8 +134,10 @@ class LaunchDeepConvOp {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
-                  Tensor* output, TensorFormat data_format) {
+                  int /*out_cols*/, int /*out_depth*/, int /*dilation_rows*/,
+                  int /*dilation_cols*/, int /*stride_rows*/,
+                  int /*stride_cols*/, Tensor* /*output*/,
+                  TensorFormat /*data_format*/) {
     return false;
   }
 };
@@ -147,9 +150,11 @@ class LaunchDeepConvOp<CPUDevice, float> {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
+                  int out_cols, int out_depth, int dilation_rows,
+                  int dilation_cols, int stride_rows, int stride_cols,
                   Tensor* output, TensorFormat data_format) {
-    if (data_format != FORMAT_NHWC ||
+    if (data_format != FORMAT_NHWC || dilation_rows != 1 ||
+        dilation_cols != 1 ||
         !CanUseDeepConv2D(stride_rows, stride_cols, filter_rows, filter_cols,
                           in_depth, out_depth, out_rows, out_cols)) {
       return false;
@@ -187,7 +192,8 @@ class LaunchXsmmConvOp {
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
                   int out_cols, int out_depth, int stride_rows, int stride_cols,
-                  Tensor* output, TensorFormat data_format) {
+                  int dilation_rows, int dilation_cols, Tensor* output,
+                  TensorFormat data_format) {
     return false;
   }
 };
@@ -199,7 +205,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
                   const Tensor& filter, int batch, int input_rows,
                   int input_cols, int in_depth, int filter_rows,
                   int filter_cols, int pad_rows, int pad_cols, int out_rows,
-                  int out_cols, int out_depth, int stride_rows, int stride_cols,
+                  int out_cols, int out_depth, int dilation_rows,
+                  int dilation_cols, int stride_rows, int stride_cols,
                   Tensor* output, TensorFormat data_format) {
     auto num_threads =
         ctx->device()->tensorflow_cpu_worker_threads()->num_threads;
@@ -228,11 +235,8 @@ class LaunchXsmmConvOp<CPUDevice, float> {
     desc.options = LIBXSMM_DNN_CONV_OPTION_WU_EXT_FILTER_REDUCE_OVERWRITE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
 
-    if (!CanUseXsmmConv2D(desc, data_format)) {
-      return false;
-    }
-
-    if (!CanUseXsmmConv2D(desc, data_format)) {
+    if (dilation_rows != 1 || dilation_cols != 1 ||
+        !CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
 
@@ -251,6 +255,7 @@ template <typename Device, typename T>
 class Conv2DOp : public BinaryOp<T> {
  public:
   explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
@@ -259,15 +264,35 @@ class Conv2DOp : public BinaryOp<T> {
     OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
+    OP_REQUIRES(context, dilations_.size() == 4,
+                errors::InvalidArgument("Sliding window dilations field must "
+                                        "specify 4 dimensions"));
     OP_REQUIRES(context, strides_.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
     const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
     const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+    const int64 stride_h = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, stride_n == 1 && stride_c == 1,
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    OP_REQUIRES(context, stride_h > 0 && stride_w > 0,
+                errors::InvalidArgument(
+                    "Row and column strides should be larger than 0."));
+
+    const int64 dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+    const int64 dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+    const int64 dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+    const int64 dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+    OP_REQUIRES(context, dilation_n == 1 && dilation_c == 1,
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
+    OP_REQUIRES(
+        context, dilation_h > 0 && dilation_w > 0,
+        errors::InvalidArgument("Dilated rates should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
@@ -334,18 +359,22 @@ class Conv2DOp : public BinaryOp<T> {
                 errors::InvalidArgument("batch is too large"));
     const int batch = static_cast<int>(batch_raw);
 
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
+    // For now we take the stride and dilation from the second and third
+    // dimensions only (we do not support striding or dilation on the batch or
+    // depth dimension).
     const int stride_rows = GetTensorDim(strides_, data_format_, 'H');
     const int stride_cols = GetTensorDim(strides_, data_format_, 'W');
 
+    const int dilation_rows = GetTensorDim(dilations_, data_format_, 'H');
+    const int dilation_cols = GetTensorDim(dilations_, data_format_, 'W');
+
     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeV2(
+                                input_rows, filter_rows, dilation_rows,
+                                stride_rows, padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context, GetWindowedOutputSizeV2(
+                                input_cols, filter_cols, dilation_cols,
+                                stride_cols, padding_, &out_cols, &pad_cols));
     TensorShape out_shape =
         ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
 
@@ -361,6 +390,8 @@ class Conv2DOp : public BinaryOp<T> {
             << ", filter_rows = " << filter_rows
             << ", stride_rows = " << stride_rows
             << ", stride_cols = " << stride_cols
+            << ", dilation_rows = " << dilation_rows
+            << ", dilation_cols = " << dilation_cols
             << ", out_depth = " << out_depth;
 
     // If there is nothing to compute, return.
@@ -372,7 +403,8 @@ class Conv2DOp : public BinaryOp<T> {
     if (LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
-            out_depth, stride_rows, stride_cols, output, data_format_)) {
+            out_depth, dilation_rows, dilation_cols, stride_rows, stride_cols,
+            output, data_format_)) {
       return;
     }
 #endif
@@ -380,15 +412,18 @@ class Conv2DOp : public BinaryOp<T> {
     if (LaunchDeepConvOp<Device, T>::Run(
             context, input, filter, batch, input_rows, input_cols, in_depth,
             filter_rows, filter_cols, pad_rows, pad_cols, out_rows, out_cols,
-            out_depth, stride_rows, stride_cols, output, data_format_)) {
+            out_depth, dilation_rows, dilation_cols, stride_rows, stride_cols,
+            output, data_format_)) {
       return;
     }
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
-              stride_rows, stride_cols, padding_, output, data_format_);
+              dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
+              output, data_format_);
   }
 
  private:
+  std::vector<int32> dilations_;
   std::vector<int32> strides_;
   bool use_cudnn_;
   Padding padding_;
@@ -443,9 +478,9 @@ typedef AutoTuneSingleton<ConvAutoTuneGroup, ConvParameters,
 template <typename T>
 void LaunchConv2DOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-    const Tensor& input_param, const Tensor& filter, int row_stride,
-    int col_stride, const Padding& padding, Tensor* output,
-    TensorFormat data_format) {
+    const Tensor& input_param, const Tensor& filter, int row_dilation,
+    int col_dilation, int row_stride, int col_stride, const Padding& padding,
+    Tensor* output, TensorFormat data_format) {
   using perftools::gputools::dnn::AlgorithmConfig;
   using perftools::gputools::dnn::AlgorithmDesc;
   using perftools::gputools::dnn::ProfileResult;
@@ -461,8 +496,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
 
   Tensor input = input_param;
 
-  if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
-      col_stride == 1 && data_format == FORMAT_NHWC) {
+  if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_dilation == 1 &&
+      col_dilation == 1 && row_stride == 1 && col_stride == 1 &&
+      data_format == FORMAT_NHWC) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
     const uint64 k = filter.dim_size(2);
@@ -487,7 +523,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     }
     return;
   } else if (filter.dim_size(0) == input.dim_size(1) &&
-             filter.dim_size(1) == input.dim_size(2) && padding == VALID &&
+             filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 &&
+             col_dilation == 1 && padding == VALID &&
              data_format == FORMAT_NHWC) {
     // The input data and filter have the same height/width, so call cublas
     // directly.
@@ -530,17 +567,19 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   const int64 patch_cols = filter.dim_size(1);
   if (padding == SAME) {
     // Total padding on rows and cols is
-    // Pr = (R' - 1) * S + Kr - R
-    // Pc = (C' - 1) * S + Kc - C
+    // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
+    // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
     // where (R', C') are output dimensions, (R, C) are input dimensions, S
-    // is stride, (Kr, Kc) are filter dimensions.
+    // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
     // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
     // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
     // we pad more on the right and bottom than on the top and left.
     padding_rows =
-        std::max<int>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+        std::max<int>(0, (out_rows - 1) * row_stride +
+                             (patch_rows - 1) * row_dilation + 1 - in_rows);
     padding_cols =
-        std::max<int>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+        std::max<int>(0, (out_cols - 1) * col_stride +
+                             (patch_cols - 1) * col_dilation + 1 - in_cols);
     const bool rows_odd = (padding_rows % 2 != 0);
     const bool cols_odd = (padding_cols % 2 != 0);
     if (rows_odd || cols_odd) {
@@ -605,7 +644,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       .set_input_feature_map_count(filter.dim_size(2))
       .set_output_feature_map_count(filter.dim_size(3));
   perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
-  conv_desc.set_vertical_filter_stride(row_stride)
+  conv_desc.set_vertical_dilation_rate(row_dilation)
+      .set_horizontal_dilation_rate(col_dilation)
+      .set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
@@ -652,6 +693,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       out_depths,        // out_depths
       {{patch_rows,      // filter_rows
         patch_cols}},    // filter_cols
+      {{row_dilation,    // dilation_rows
+        col_dilation}},  // dilation_cols
       {{row_stride,      // stride_rows
         col_stride}},    // stride_cols
       {{padding_rows,    // padding_rows
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index e29271dff278afbc1ff2c947c161824615640b66..09a3b78776c8bf114ccd42866bc7aded92c463b5 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -34,8 +34,9 @@ class OpKernelContext;
 template <typename Device, typename T>
 struct LaunchConv2DOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -43,8 +44,9 @@ struct LaunchConv2DOp {
 template <typename T>
 struct LaunchConv2DOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
-                  const Tensor& input, const Tensor& filter, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* output,
+                  const Tensor& input, const Tensor& filter, int row_dilation,
+                  int col_dilation, int row_stride, int col_stride,
+                  const Padding& padding, Tensor* output,
                   TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 8a89d564def3e90ed2227d97dcc1e88d794940ee..21c84b2a0ed15eaada88e308e1761dcb58cb07b3 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -145,6 +145,7 @@ class Conv3DOp : public BinaryOp<T> {
   REGISTER_KERNEL_BUILDER(                                      \
       Name("Conv3D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       Conv3DOp<CPUDevice, T>);
+TF_CALL_half(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 TF_CALL_double(REGISTER_CPU_KERNEL);
 #undef REGISTER_CPU_KERNEL
@@ -376,6 +377,9 @@ struct LaunchConvOp<GPUDevice, T> {
         {{in_planes, in_rows, in_cols}},
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
+        // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+        // conv is supported.
+        /*dilation=*/{{1, 1, 1}},
         {{strides[0], strides[1], strides[2]}},
         {{pad_planes, pad_rows, pad_cols}},
         dtype,
@@ -482,12 +486,16 @@ namespace functor {
       const std::array<int, 3>& padding_right,                        \
       typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
 
+DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
 #undef DECLARE_GPU_SPEC
 
 }  // namespace functor
 
 // Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(
+    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    Conv3DOp<GPUDevice, Eigen::half>);
 REGISTER_KERNEL_BUILDER(
     Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv3DOp<GPUDevice, float>);
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index c852dc9991c2e879c8fa6a64b2bd8b5141606409..6f82698596260d0fa9ce3198b5fc3eec18c86c98 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -91,13 +91,14 @@ class ConvParameters {
   using SpatialArray = gtl::InlinedVector<int64, 3>;
   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                  int64 out_depths, const SpatialArray& filter,
-                 const SpatialArray& stride, const SpatialArray& padding,
-                 DataType dtype, int device_id)
+                 const SpatialArray& dilation, const SpatialArray& stride,
+                 const SpatialArray& padding, DataType dtype, int device_id)
       : batch_(batch),
         in_depths_(in_depths),
         out_depths_(out_depths),
         in_(in),
         filter_(filter),
+        dilation_(dilation),
         stride_(stride),
         padding_(padding),
         dtype_(dtype),
@@ -107,6 +108,7 @@ class ConvParameters {
     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, out_depths);
     for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
+    for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
     hash_code_ = Hash64Combine(hash_code_, dtype);
@@ -128,6 +130,7 @@ class ConvParameters {
         "(", str_util::Join(in_, ", "), "), ",
         out_depths_, ", ",
         "(", str_util::Join(filter_, ", "), "), ",
+        "(", str_util::Join(dilation_, ", "), "), ",
         "(", str_util::Join(stride_, ", "), "), ",
         "(", str_util::Join(padding_, ", "), "), ",
         dtype_, ", ",
@@ -154,11 +157,11 @@ class ConvParameters {
  protected:
   using ParameterDataType =
       std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
-                 SpatialArray, DataType, int>;
+                 SpatialArray, SpatialArray, DataType, int>;
 
   ParameterDataType get_data_as_tuple() const {
     return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
-                           stride_, padding_, dtype_, device_id_);
+                           dilation_, stride_, padding_, dtype_, device_id_);
   }
 
   uint64 hash_code_;
@@ -169,6 +172,7 @@ class ConvParameters {
   int64 out_depths_;
   SpatialArray in_;
   SpatialArray filter_;
+  SpatialArray dilation_;
   SpatialArray stride_;
   SpatialArray padding_;
   DataType dtype_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 88ba4330500e4bd51b680782b073f202f3cf6797..666bca265c95febf3753e71bf010a7caf95c0541 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -43,6 +43,8 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
       128,       // out_depths
       {{3,       // filter_rows
         3}},     // filter_cols
+      {{1,       // dilation_rows
+        1}},     // dilation_cols
       {{1,       // stride_rows
         1}},     // stride_cols
       {{0,       // padding_rows
@@ -60,6 +62,8 @@ TEST(ConvParameters, WinogradNonfusedAlgoSize) {
       768,       // out_depths
       {{3,       // filter_rows
         3}},     // filter_cols
+      {{1,       // dilation_rows
+        1}},     // dilation_cols
       {{1,       // stride_rows
         1}},     // stride_cols
       {{0,       // padding_rows
@@ -346,4 +350,118 @@ TEST_F(FusedResizePadConvOpTest, ResizeAndPadSymmetricComparativeLarge) {
                           "SYMMETRIC", 1, "SAME");
 }
 
+class ConvOpTest : public OpsTestBase {
+ protected:
+  void HandwrittenConv() {
+    const int stride = 1;
+    TF_EXPECT_OK(NodeDefBuilder("conv_op", "Conv2D")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("T", DT_FLOAT)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "SAME")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    const int depth = 1;
+    const int image_width = 4;
+    const int image_height = 3;
+    const int image_batch_count = 1;
+    // The image matrix is:
+    // |  1 |  2 |  3 |  4 |
+    // |  5 |  6 |  7 |  8 |
+    // |  9 | 10 | 11 | 12 |
+    Tensor image(DT_FLOAT,
+                 {image_batch_count, image_height, image_width, depth});
+    test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+    // The filter matrix is:
+    // | 1 | 4 | 7 |
+    // | 2 | 5 | 8 |
+    // | 3 | 6 | 9 |
+    const int filter_size = 3;
+    const int filter_count = 1;
+    Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+    test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+    AddInputFromArray<float>(image.shape(), image.flat<float>());
+    AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
+    // the input set to zero because we're using the 'SAME' padding mode.
+    // The calculations behind the expected output are:
+    // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
+    // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
+    // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
+    // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
+    // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
+    // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
+    // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
+    // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
+    // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
+    // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
+    // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
+    // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
+    // This means we should end up with this matrix:
+    // |  105  |  150  |  183  |   95  |
+    // |  235  |  312  |  357  |  178  |
+    // |  187  |  234  |  261  |  121  |
+    const int expected_width = image_width;
+    const int expected_height = image_height * filter_count;
+    Tensor expected(DT_FLOAT, TensorShape({image_batch_count, expected_height,
+                                           expected_width, filter_count}));
+    test::FillValues<float>(
+        &expected, {105, 150, 183, 95, 235, 312, 357, 178, 187, 234, 261, 121});
+    const Tensor& output = *GetOutput(0);
+    test::ExpectTensorNear<float>(expected, output, 1e-5);
+  }
+
+  void AnisotropicStrides() {
+    const int stride_width = 3;
+    const int stride_height = 1;
+    TF_EXPECT_OK(NodeDefBuilder("conv_op", "Conv2D")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("T", DT_FLOAT)
+                     .Attr("strides", {1, stride_height, stride_width, 1})
+                     .Attr("padding", "VALID")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    const int depth = 1;
+    const int image_width = 6;
+    const int image_height = 3;
+    const int image_batch_count = 1;
+    Tensor image(DT_FLOAT,
+                 {image_batch_count, image_height, image_width, depth});
+    test::FillValues<float>(&image, {
+                                        3, 2, 1, -1, -2, -3,  //
+                                        4, 3, 2, -2, -3, -4,  //
+                                        5, 4, 3, -3, -4, -5,  //
+                                    });
+    const int filter_size = 2;
+    const int filter_count = 1;
+    Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+    test::FillValues<float>(&filter, {
+                                         1, 2,  //
+                                         3, 4,  //
+                                     });
+
+    AddInputFromArray<float>(image.shape(), image.flat<float>());
+    AddInputFromArray<float>(filter.shape(), filter.flat<float>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    const int expected_width = 2;
+    const int expected_height = 2;
+    Tensor expected(DT_FLOAT, TensorShape({image_batch_count, expected_height,
+                                           expected_width, filter_count}));
+    test::FillValues<float>(&expected, {31, -23, 41, -33});
+    const Tensor& output = *GetOutput(0);
+    test::ExpectTensorNear<float>(expected, output, 1e-5);
+  }
+};
+
+TEST_F(ConvOpTest, HandwrittenConv) { HandwrittenConv(); }
+
+TEST_F(ConvOpTest, AnisotropicStride) { AnisotropicStrides(); }
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acosh.cc b/tensorflow/core/kernels/cwise_op_acosh.cc
index 7bdd8d22a3e9f7b064f147ee31b562b68f281c65..39c8814073382566bc3551fdf6d5afc7f1ef0012 100644
--- a/tensorflow/core/kernels/cwise_op_acosh.cc
+++ b/tensorflow/core/kernels/cwise_op_acosh.cc
@@ -20,16 +20,8 @@ namespace tensorflow {
 REGISTER4(UnaryOp, CPU, "Acosh", functor::acosh, float, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Acosh")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::acosh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Acosh", functor::acosh, float, double);
 #endif // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_asinh.cc b/tensorflow/core/kernels/cwise_op_asinh.cc
index e0644323c0052e50e766cdb49c6ed172b88cd326..0aec6aac3442a98309e352cf1431b920a87f62fe 100644
--- a/tensorflow/core/kernels/cwise_op_asinh.cc
+++ b/tensorflow/core/kernels/cwise_op_asinh.cc
@@ -1,10 +1,10 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+  /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+  http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -20,17 +20,9 @@ namespace tensorflow {
 REGISTER4(UnaryOp, CPU, "Asinh", functor::asinh, float, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Asinh")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::asinh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Asinh", functor::asinh, float, double);
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Asinh", functor::asinh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_atanh.cc b/tensorflow/core/kernels/cwise_op_atanh.cc
index 058f5140c5bed7f312bf220d665da8628ca657e1..7b688db4c585b0f8d92f289cae598a78df7e379c 100644
--- a/tensorflow/core/kernels/cwise_op_atanh.cc
+++ b/tensorflow/core/kernels/cwise_op_atanh.cc
@@ -20,17 +20,9 @@ namespace tensorflow {
 REGISTER4(UnaryOp, CPU, "Atanh", functor::atanh, float, double,
           complex64, complex128);
 
-#if TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(                                            \
-                          Name("Atanh")                               \
-                          .Device(DEVICE_SYCL)                        \
-                          .TypeConstraint<TYPE>("T"),                 \
-                          UnaryOp<SYCLDevice, functor::atanh<TYPE>>);
-REGISTER_SYCL_KERNEL(float);
-REGISTER_SYCL_KERNEL(double);
-#undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYC
+#ifdef TENSORFLOW_USE_SYCL
+REGISTER2(UnaryOp, SYCL, "Atanh", functor::atanh, float, double);
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Atanh", functor::atanh, float, double);
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_and.cc b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
index 017a2182dcff0f0121dd6343f1c012802cdf28d1..5a6cf4bad1609cebc0fded4d212e50fb19d22558 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_and.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                      \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_or.cc b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
index 36f45fe92dfce44c68a778b6c719c45d24bcaa90..201a10198a629b26429393c5c04404175399df73 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_or.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                     \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
index 36432d851d99f20706b7e7f8535e6ac241b00937..2a7cd2699596a7ace6afd5ce688ff2e186650336 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 
 #if TENSORFLOW_USE_SYCL
 #define REGISTER_SYCL_KERNEL(TYPE)                                      \
@@ -30,13 +30,15 @@ REGISTER_SYCL_KERNEL(int32);
 REGISTER_SYCL_KERNEL(int64);
 REGISTER_SYCL_KERNEL(uint8);
 REGISTER_SYCL_KERNEL(uint16);
+REGISTER_SYCL_KERNEL(uint32);
+REGISTER_SYCL_KERNEL(uint64);
 #undef REGISTER_SYCL_KERNEL
 
 #endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
-REGISTER6(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64, uint8, uint16);
+REGISTER8(BinaryOp, GPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
+          int64, uint8, uint16, uint32, uint64);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
index 27f973c90d73a1d7828ce180254363a0b7b4be76..3fbf69c114d3c546eafb9f6c504568a649c52e59 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_and.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_and, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_and, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
index a34c3a52cd6253527c67d2d1f8c1498756ff5be8..8bcb82266a2d3567c0f8d79b2fdccd5916b2ecbb 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_or.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_or, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_or, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
index a4531ab7c6f283f8e732dbc87b3c64d93a8a5bef..e62a87aba44eea0fc5b1cf13a74ddfed2ef294b6 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_bitwise_xor.cu.cc
@@ -19,7 +19,8 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY6(bitwise_xor, int8, int16, int32, int64, uint8, uint16);
+DEFINE_BINARY8(bitwise_xor, int8, int16, int32, int64, uint8, uint16, uint32,
+               uint64);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 89487419ee9132320613b88950aab138f34512f4..d32185b6bf48f7b6d49f355c0653004310bde533 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -49,7 +49,11 @@ template <typename T>
 struct scalar_asinh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+#if EIGEN_HAS_CXX11_MATH
+    return numext::asinh(a);
+#else
     return std::asinh(a);
+#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -61,7 +65,11 @@ template <typename T>
 struct scalar_acosh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+#if EIGEN_HAS_CXX11_MATH
+    return numext::acosh(a);
+#else
     return std::acosh(a);
+#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -73,7 +81,11 @@ template <typename T>
 struct scalar_atanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
+#if EIGEN_HAS_CXX11_MATH
+    return numext::atanh(a);
+#else
     return std::atanh(a);
+#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -676,7 +688,9 @@ struct sub : base<T, Eigen::internal::scalar_difference_op<T>> {
 };
 
 template <typename T>
-struct mul : base<T, Eigen::internal::scalar_product_op<T>> {};
+struct mul : base<T, Eigen::internal::scalar_product_op<T>> {
+  static const bool use_bcast_optimization = true;
+};
 
 template <typename T>
 struct div : base<T, Eigen::internal::scalar_quotient_op<T>> {};
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d427eddf3f7cca928bbb04427f6c53765eaa70f
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -0,0 +1,164 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/nn_ops.cc.
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/data_format_ops.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+class DataFormatDimMapOp : public OpKernel {
+ public:
+  explicit DataFormatDimMapOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string src_format;
+    OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    string dst_format;
+    OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(
+        context, src_format == "NHWC",
+        errors::InvalidArgument(strings::StrCat(
+            "Current implementation doesn't support source data format ",
+            src_format)));
+    OP_REQUIRES(context, dst_format == "NCHW",
+                errors::InvalidArgument(strings::StrCat(
+                    "Current implementation doesn't support dst data format ",
+                    dst_format)));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(
+        context, input.dims() == 0,
+        errors::InvalidArgument("input must be a scalar, but got shape ",
+                                input.shape().DebugString()));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
+                                           input.scalar<T>(),
+                                           output->scalar<T>());
+  }
+};
+
+template <typename Device, typename T>
+class DataFormatVecPermuteOp : public OpKernel {
+ public:
+  explicit DataFormatVecPermuteOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string src_format;
+    OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    string dst_format;
+    OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(
+        context, src_format == "NHWC",
+        errors::InvalidArgument(strings::StrCat(
+            "Current implementation doesn't support source data format ",
+            src_format)));
+    OP_REQUIRES(context, dst_format == "NCHW",
+                errors::InvalidArgument(strings::StrCat(
+                    "Current implementation doesn't support dst data format ",
+                    dst_format)));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(
+        context, input.dims() == 1,
+        errors::InvalidArgument("input must be a vector, but got shape ",
+                                input.shape().DebugString()));
+    OP_REQUIRES(
+        context, input.NumElements() == 4,
+        errors::InvalidArgument("input must be of size 4, but got shape ",
+                                input.shape().DebugString()));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    functor::DataFormatVecPermute<Device, T>()(
+        context->eigen_device<Device>(), input.vec<T>(), output->vec<T>());
+  }
+};
+
+#define REGISTER_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DataFormatDimMap").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DataFormatDimMapOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#define REGISTER_KERNEL(T)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("DataFormatVecPermute").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
+      DataFormatVecPermuteOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                  \
+  template <>                                                \
+  void DataFormatDimMap<GPUDevice, T>::operator()(           \
+      const GPUDevice& d, typename TTypes<T>::ConstScalar x, \
+      typename TTypes<T>::Scalar y);                         \
+  extern template struct DataFormatDimMap<GPUDevice, T>;
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
+#undef DECLARE_GPU_SPEC
+
+#define DECLARE_GPU_SPEC(T)                               \
+  template <>                                             \
+  void DataFormatVecPermute<GPUDevice, T>::operator()(    \
+      const GPUDevice& d, typename TTypes<T>::ConstVec x, \
+      typename TTypes<T>::Vec y);                         \
+  extern template struct DataFormatVecPermute<GPUDevice, T>;
+#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
+TF_CALL_int32(DECLARE_GPU_SPECS);
+TF_CALL_int64(DECLARE_GPU_SPECS);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_GPU_KERNEL(T)                                            \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DataFormatDimMap").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DataFormatDimMapOp<GPUDevice, T>);
+TF_CALL_int32(REGISTER_GPU_KERNEL);
+TF_CALL_int64(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+
+#define REGISTER_GPU_KERNEL(T)                                                \
+  REGISTER_KERNEL_BUILDER(                                                    \
+      Name("DataFormatVecPermute").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DataFormatVecPermuteOp<GPUDevice, T>);
+TF_CALL_int32(REGISTER_GPU_KERNEL);
+TF_CALL_int64(REGISTER_GPU_KERNEL);
+#undef REGISTER_GPU_KERNEL
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..54798cc6abf345f192735beb0fe9d484130ed3f6
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -0,0 +1,72 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+#define TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
+// Functor definition for data format dim mapping ops, must be compilable
+// by nvcc.
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+namespace functor {
+
+// Functor used by DataFormatDimMapOP to do the computations.
+template <typename Device, typename T>
+struct DataFormatDimMap {
+  void operator()(const Device& d, typename TTypes<T>::ConstScalar x,
+                  typename TTypes<T>::Scalar y) {
+    auto zero = x.constant(0);
+    auto one = x.constant(1);
+    auto three = x.constant(3);
+    auto four = x.constant(4);
+    auto x_mod = (x + four) % 4;
+    auto is_zero = (x_mod == zero);
+    auto is_three = (x_mod == three);
+    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
+  }
+};
+
+template <typename T>
+struct VecPermute {
+  Eigen::DSizes<Eigen::DenseIndex, 1> dimensions(
+      typename TTypes<T>::ConstVec input) const {
+    Eigen::DSizes<Eigen::DenseIndex, 1> result;
+    result[0] = input.dimension(0);
+    return result;
+  }
+  template <typename Output, typename Device>
+  void eval(typename TTypes<T>::ConstVec input, Output& output,
+            const Device& d) const {
+    output.template chip<0>(0).device(d) = input.template chip<0>(0);
+    output.template chip<0>(1).device(d) = input.template chip<0>(3);
+    output.template chip<0>(2).device(d) = input.template chip<0>(1);
+    output.template chip<0>(3).device(d) = input.template chip<0>(2);
+  }
+};
+
+// Functor used by DataFormatVecPermuteOp to do the computations.
+template <typename Device, typename T>
+struct DataFormatVecPermute {
+  void operator()(const Device& d, typename TTypes<T>::ConstVec x,
+                  typename TTypes<T>::Vec y) {
+    y.device(d) = x.customOp(VecPermute<T>());
+  }
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_DATA_FORMAT_OPS_H_
diff --git a/tensorflow/core/kernels/data_format_ops_gpu.cu.cc b/tensorflow/core/kernels/data_format_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..38ce7c28fea662cea7004c47a46c0031875e3c36
--- /dev/null
+++ b/tensorflow/core/kernels/data_format_ops_gpu.cu.cc
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/kernels/data_format_ops.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+template struct functor::DataFormatDimMap<GPUDevice, int32>;
+template struct functor::DataFormatDimMap<GPUDevice, int64>;
+template struct functor::DataFormatVecPermute<GPUDevice, int32>;
+template struct functor::DataFormatVecPermute<GPUDevice, int64>;
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dataset.cc b/tensorflow/core/kernels/dataset.cc
index 0414875a5d52d487ea2cf521fa7c1158f77c7326..09721297873968f1cf14307d23fa712dd5e9b27d 100644
--- a/tensorflow/core/kernels/dataset.cc
+++ b/tensorflow/core/kernels/dataset.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/dataset.h"
 
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/node_builder.h"
+
 namespace tensorflow {
 
 namespace {
@@ -70,6 +73,143 @@ class DatasetVariantWrapper {
 
 }  // namespace
 
+Status GraphDefBuilderWrapper::AddDataset(
+    const GraphDatasetBase* dataset,
+    const std::vector<std::pair<size_t, Node*>>& inputs,
+    const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+    Node** output) {
+  const string& op_type_name = dataset->op_name();
+  std::unique_ptr<const GraphDefBuilder::Options> opts(
+      new GraphDefBuilder::Options(b_->opts()));
+  // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
+  // attributes defined. It will be nice to have a consistent pattern.
+  bool has_output_types_attr = HasAttr(op_type_name, "output_types");
+  bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
+  if (has_output_shapes_attr) {
+    opts.reset(new GraphDefBuilder::Options(
+        opts->WithAttr("output_shapes", dataset->output_shapes())));
+  }
+  if (has_output_types_attr) {
+    opts.reset(new GraphDefBuilder::Options(
+        opts->WithAttr("output_types", dataset->output_dtypes())));
+  }
+  for (auto attr : attrs) {
+    opts.reset(
+        new GraphDefBuilder::Options(opts->WithAttr(attr.first, attr.second)));
+  }
+  if (opts->HaveError()) {
+    return errors::Internal("AddDataset: Failed to build Options with error ",
+                            opts->StatusToString());
+  }
+  NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
+                           opts->op_registry());
+  {
+    size_t total_size = inputs.size() + list_inputs.size();
+    auto inputs_iter = inputs.begin();
+    auto list_inputs_iter = list_inputs.begin();
+    for (int i = 0; i < total_size; i++) {
+      if (inputs_iter != inputs.end() && inputs_iter->first == i) {
+        node_builder.Input(NodeBuilder::NodeOut(inputs_iter->second));
+        inputs_iter++;
+      } else if (list_inputs_iter != list_inputs.end() &&
+                 list_inputs_iter->first == i) {
+        std::vector<NodeBuilder::NodeOut> nodeout_inputs;
+        nodeout_inputs.reserve(list_inputs_iter->second.size());
+        for (Node* n : list_inputs_iter->second) {
+          nodeout_inputs.emplace_back(n);
+        }
+        node_builder.Input(nodeout_inputs);
+        list_inputs_iter++;
+      } else {
+        return errors::InvalidArgument("No input found for index ", i);
+      }
+    }
+  }
+  *output = opts->FinalizeBuilder(&node_builder);
+  if (*output == nullptr) {
+    return errors::Internal("AddDataset: Failed to build ", op_type_name,
+                            " op with error ", opts->StatusToString());
+  }
+  return Status::OK();
+}
+
+Status GraphDefBuilderWrapper::AddFunction(OpKernelContext* ctx,
+                                           const string& function_name) {
+  if (b_->HasFunction(function_name)) {
+    LOG(INFO) << "Function with name " << function_name << "already exists in"
+              << " the graph. It will not be added again.";
+    return Status::OK();
+  }
+  TF_RETURN_IF_ERROR(EnsureFunctionIsStateless(ctx, function_name));
+  const FunctionLibraryDefinition* flib_def =
+      ctx->function_library()->GetFunctionLibraryDefinition();
+  const FunctionDef* f_def = flib_def->Find(function_name);
+  if (f_def == nullptr) {
+    return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                   function_name, " in the registry.");
+  }
+  FunctionDefLibrary def;
+  *def.add_function() = *f_def;
+  const string gradient_func = flib_def->FindGradient(function_name);
+  if (!gradient_func.empty()) {
+    GradientDef* g_def = def.add_gradient();
+    g_def->set_function_name(function_name);
+    g_def->set_gradient_func(gradient_func);
+  }
+  TF_RETURN_IF_ERROR(b_->AddFunctionLibrary(def));
+
+  // Recursively add functions in inputs of function_name.
+  for (const NodeDef& node_def : f_def->node_def()) {
+    const OpRegistrationData* op_reg_data = nullptr;
+    TF_RETURN_IF_ERROR(flib_def->LookUp(node_def.op(), &op_reg_data));
+    if (op_reg_data->is_function_op) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, op_reg_data->op_def.name()));
+    }
+    // Recursively add functions in attrs of this NodeDef.
+    for (const auto& pair : node_def.attr()) {
+      TF_RETURN_IF_ERROR(AddAttrFunctions(pair.second, ctx));
+    }
+  }
+
+  // Recursively add functions in attrs of function_name.
+  for (auto iter = f_def->attr().begin(); iter != f_def->attr().end(); iter++) {
+    TF_RETURN_IF_ERROR(AddAttrFunctions(iter->second, ctx));
+  }
+  return Status::OK();
+}
+
+void GraphDefBuilderWrapper::AddTensorInternal(const Tensor& val,
+                                               Node** output) {
+  *output = ops::SourceOp(
+      "Const",
+      b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
+}
+
+bool GraphDefBuilderWrapper::HasAttr(const string& op_type_name,
+                                     const string& attr_name) const {
+  const OpDef* op_def = nullptr;
+  Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
+  if (!s.ok() || op_def == nullptr) {
+    return false;
+  }
+  return HasAttr(op_def, attr_name);
+}
+
+Status GraphDatasetBase::Serialize(OpKernelContext* ctx,
+                                   string* serialized_graph_def,
+                                   string* output_node) const {
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* node = nullptr;
+  TF_RETURN_IF_ERROR(AsGraphDefInternal(ctx, &db, &node));
+  *output_node = node->name();
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  graph_def.SerializeToString(serialized_graph_def);
+  return Status::OK();
+}
+
 Status GetDatasetFromVariantTensor(const Tensor& tensor,
                                    DatasetBase** out_dataset) {
   if (!(tensor.dtype() == DT_VARIANT ||
@@ -126,7 +266,6 @@ void BinaryDatasetOpKernel::MakeDataset(OpKernelContext* ctx,
   MakeDataset(ctx, input, another_input, output);
 }
 
-const char IteratorBase::kIteratorExhausted[] = "ITERATOR_EXHAUSTED";
 const char GraphDatasetBase::kDatasetGraphKey[] = "_DATASET_GRAPH";
 const char GraphDatasetBase::kDatasetGraphOutputNodeKey[] =
     "_DATASET_GRAPH_OUTPUT_NODE";
diff --git a/tensorflow/core/kernels/dataset.h b/tensorflow/core/kernels/dataset.h
index e0ffe268dd6630f60c375f0d6a7dc4ff62b06dc2..504a88a309860e357eabb047124be04d81affd21 100644
--- a/tensorflow/core/kernels/dataset.h
+++ b/tensorflow/core/kernels/dataset.h
@@ -17,20 +17,18 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/tensor_bundle/naming.h"
-#include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
 
 // Polymorphic datasets should support all primitive TensorFlow
 // types. Use this macro to expand `m(T)` once for each primitive type
@@ -39,14 +37,13 @@ limitations under the License.
 
 namespace tensorflow {
 
-class ResourceMgr;
-
 // Interface for reading values from a key-value store.
 // Used for restoring iterator state.
 class IteratorStateReader {
  public:
   virtual Status ReadScalar(StringPiece key, int64* val) = 0;
   virtual Status ReadScalar(StringPiece key, string* val) = 0;
+  virtual Status ReadTensor(StringPiece key, Tensor* val) = 0;
   virtual bool Contains(StringPiece key) = 0;
 
   virtual ~IteratorStateReader() {}
@@ -58,10 +55,17 @@ class IteratorStateWriter {
  public:
   virtual Status WriteScalar(StringPiece key, const int64 val) = 0;
   virtual Status WriteScalar(StringPiece key, const string& val) = 0;
+  virtual Status WriteTensor(StringPiece key, const Tensor& val) = 0;
 
   virtual ~IteratorStateWriter() {}
 };
 
+// Forward declarations to avoid introducing a dependency on headers in
+// "tensorflow/core/graph/...".
+class GraphDefBuilder;
+class GraphDatasetBase;
+class Node;
+
 // Wrapper around GraphDefBuilder. Used to serialize Dataset graph.
 class GraphDefBuilderWrapper {
  public:
@@ -86,6 +90,7 @@ class GraphDefBuilderWrapper {
   // `*output` contains a pointer to the output `Node`. It is guaranteed to be
   // non-null if the method returns with an OK status.
   // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
+  // TODO(shivaniagrawal): Consider changing to gtl::ArraySlice?
   template <typename T>
   Status AddVector(const std::vector<T>& val, Node** output) {
     Tensor val_t = Tensor(DataTypeToEnum<T>::v(),
@@ -112,6 +117,11 @@ class GraphDefBuilderWrapper {
     return Status::OK();
   }
 
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs, Node** output) {
+    return AddDataset(dataset, inputs, {}, output);
+  }
+
   // Adds a node corresponding to the `DatasetType` to the Graph.
   // Return value of `DatasetType::op_name()` is used as the op type for the
   // node.
@@ -120,86 +130,77 @@ class GraphDefBuilderWrapper {
   // `*output` contains a pointer to the output `Node`. It is guaranteed to be
   // non-null if the method returns with an OK status.
   // The returned Node pointer is owned by the backing Graph of GraphDefBuilder.
-  template <class DatasetType>
-  Status AddDataset(const DatasetType* dataset,
-                    std::vector<NodeBuilder::NodeOut> inputs, Node** output) {
-    const string& op_type_name = dataset->op_name();
-    std::unique_ptr<const GraphDefBuilder::Options> opts(
-        new GraphDefBuilder::Options(b_->opts()));
-    // TODO(srbs|mrry): Not all datasets have output_types and output_shapes
-    // attributes defined. It will be nice to have a consistent pattern.
-    bool has_output_types_attr = HasAttr(op_type_name, "output_types");
-    bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
-    if (has_output_shapes_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_shapes", dataset->output_shapes())));
-    }
-    if (has_output_types_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_types", dataset->output_dtypes())));
-    }
-    if (opts->HaveError()) {
-      return errors::Internal("AddDataset: Error building Options.");
+  Status AddDataset(const GraphDatasetBase* dataset,
+                    const std::vector<Node*>& inputs,
+                    const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+                    Node** output) {
+    std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
+    for (int i = 0; i < inputs.size(); i++) {
+      enumerated_inputs[i] = std::make_pair(i, inputs[i]);
     }
-    NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
-                             opts->op_registry());
-    for (auto node_out : inputs) {
-      node_builder.Input(node_out);
-    }
-    *output = opts->FinalizeBuilder(&node_builder);
-    if (*output == nullptr) {
-      return errors::Internal("AddDataset: Failed to build ", op_type_name,
-                              " op.");
-    }
-    return Status::OK();
+    return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
   }
 
-  // TODO(shivaniagrawal): Single method for AddDataset for
-  // NodeOut/ArrraySlice<NodeOut>
-  template <class DatasetType>
-  Status AddDatasetWithInputAsList(const DatasetType* dataset,
-                                   gtl::ArraySlice<NodeBuilder::NodeOut> input,
-                                   Node** output) {
-    const string& op_type_name = dataset->op_name();
-    std::unique_ptr<const GraphDefBuilder::Options> opts(
-        new GraphDefBuilder::Options(b_->opts()));
-    bool has_output_types_attr = HasAttr(op_type_name, "output_types");
-    bool has_output_shapes_attr = HasAttr(op_type_name, "output_shapes");
-    if (has_output_shapes_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_shapes", dataset->output_shapes())));
-    }
-    if (has_output_types_attr) {
-      opts.reset(new GraphDefBuilder::Options(
-          opts->WithAttr("output_types", dataset->output_dtypes())));
-    }
-    if (opts->HaveError()) {
-      return errors::Internal("AddDataset: Error building Options.");
+  Status AddDataset(
+      const GraphDatasetBase* dataset,
+      const std::vector<std::pair<size_t, Node*>>& inputs,
+      const std::vector<std::pair<size_t, gtl::ArraySlice<Node*>>>& list_inputs,
+      const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
+      Node** output);
+
+  // Adds a user-defined function with name `function_name` to the graph and
+  // recursively adds all functions it references. If a function with a matching
+  // name has already been added, returns with OK status. If a user-defined with
+  // name `function_name` is not found in the FunctionLibraryDefinition, returns
+  // an InvalidArgumentError. If the function with name `function_name` or any
+  // of its dependent functions are stateful, returns an InvalidArgument error.
+  Status AddFunction(OpKernelContext* ctx, const string& function_name);
+
+  template <typename T>
+  void BuildAttrValue(const T& value, AttrValue* attr) {
+    SetAttrValue(value, attr);
+  }
+
+ private:
+  void AddTensorInternal(const Tensor& val, Node** output);
+
+  Status EnsureFunctionIsStateless(OpKernelContext* ctx,
+                                   const string& function_name) const {
+    const FunctionLibraryDefinition* lib_def =
+        ctx->function_library()->GetFunctionLibraryDefinition();
+    const FunctionDef* function_def = lib_def->Find(function_name);
+    if (!function_def) {
+      return errors::InvalidArgument("Unable to find FunctionDef for ",
+                                     function_name, " in registry.");
     }
-    NodeBuilder node_builder(opts->GetNameForOp(op_type_name), op_type_name,
-                             opts->op_registry());
-    node_builder.Input(input);
-    *output = opts->FinalizeBuilder(&node_builder);
-    if (*output == nullptr) {
-      return errors::Internal("AddDataset: Failed to build ", op_type_name,
-                              " op.");
+    for (const NodeDef& node_def : function_def->node_def()) {
+      const OpDef* op_def;
+      TF_RETURN_IF_ERROR(lib_def->LookUpOpDef(node_def.op(), &op_def));
+      // TODO(b/65524810): Hack to allow functions to capture Dataset op
+      // nodes needed for FlatMap. Currently, source datasets nodes have been
+      // marked stateful to avoid constant folding since we do not have a
+      // good way of serializing them.
+      if (IsOpWhitelisted(op_def)) {
+        continue;
+      }
+      if (op_def->is_stateful()) {
+        return errors::InvalidArgument(
+            "Op[name: ", node_def.name(), ", type: ", node_def.op(), "] ",
+            "in function ", function_name, " is stateful. ",
+            "Saving stateful functions is not supported yet.");
+      }
     }
     return Status::OK();
   }
 
- private:
-  void AddTensorInternal(const Tensor& val, Node** output) {
-    *output = ops::SourceOp(
-        "Const",
-        b_->opts().WithAttr("dtype", val.dtype()).WithAttr("value", val));
+  bool IsOpWhitelisted(const OpDef* op_def) const {
+    return StringPiece(op_def->name()).ends_with("Dataset") &&
+           HasAttr(op_def, "output_shapes");
   }
 
-  bool HasAttr(const string& op_type_name, const string& attr_name) {
-    const OpDef* op_def = nullptr;
-    Status s = b_->opts().op_registry()->LookUpOpDef(op_type_name, &op_def);
-    if (!s.ok() || op_def == nullptr) {
-      return false;
-    }
+  bool HasAttr(const string& op_type_name, const string& attr_name) const;
+
+  bool HasAttr(const OpDef* op_def, const string& attr_name) const {
     for (auto attr : op_def->attr()) {
       if (attr.name() == attr_name) {
         return true;
@@ -208,9 +209,22 @@ class GraphDefBuilderWrapper {
     return false;
   }
 
+  Status AddAttrFunctions(const AttrValue& attr_value, OpKernelContext* ctx) {
+    if (attr_value.has_func()) {
+      TF_RETURN_IF_ERROR(AddFunction(ctx, attr_value.func().name()));
+    } else if (attr_value.has_list()) {
+      for (const NameAttrList& name_attr_list : attr_value.list().func()) {
+        TF_RETURN_IF_ERROR(AddFunction(ctx, name_attr_list.name()));
+      }
+    }
+    return Status::OK();
+  }
+
   GraphDefBuilder* b_;
 };
 
+class StatsAggregator;
+
 // A cut-down version of OpKernelContext for running computations in
 // iterators. Note that we cannot simply use OpKernelContext here
 // because we might run computation in an iterator whose lifetime is
@@ -220,44 +234,47 @@ class GraphDefBuilderWrapper {
 // TODO(mrry): We will probably need to support more of
 // OpKernelContext here. For example, should allocation be handled by
 // the IteratorContext?
-// TODO(mrry): We will need to fabricate step IDs for calls to ops
-// that are not nested within a particular step.
 // TODO(mrry): We're making some daring assumptions about the lifetime
-// of the FunctionLibraryRuntime and runner passed in here. Once
-// created, a FunctionLibraryRuntime should stay alive for the
-// remainder of a session, so we copy the pointer. A runner will be
-// deleted when the original step ends, but all existing runners only
-// close over session-lifetime (or longer-lived) state, so we can make
-// a copy of the function. There's nothing in the definition of either
-// class to guarantee that what we are doing is safe. We should
-// formalize the properties here.
+// of the runner passed in here. A runner will be deleted when the original
+// step ends, but all existing runners only close over session-lifetime (or
+// longer-lived) state, so we can make a copy of the function. There's nothing
+// in the definition of the API from which we took the runner to guarantee that
+// what we are doing is safe. We should formalize the properties here.
 class IteratorContext {
  public:
   struct Params {
     // Interface to operating system functionality.
     Env* env;
 
-    // The step being executed.
-    int64 step_id = 0;
-
-    // Shared resources accessible by this iterator invocation.
-    ResourceMgr* resource_manager = nullptr;
-
     // Function call support.
     std::function<void(std::function<void()>)> runner = nullptr;
+
+    // A function that returns the current `StatsAggregator` instance to be
+    // used when recording statistics about the iterator.
+    //
+    // NOTE(mrry): This is somewhat awkward, because (i) the `StatsAggregator`
+    // is a property of the `IteratorResource` (which this class does not know
+    // about), and (ii) it can change after the `IteratorContext` has been
+    // created. Better suggestions are welcome!
+    std::function<std::shared_ptr<StatsAggregator>()> stats_aggregator_getter =
+        nullptr;
   };
 
   explicit IteratorContext(Params params) : params_(std::move(params)) {}
 
   Env* env() const { return params_.env; }
 
-  int64 step_id() const { return params_.step_id; }
-
   std::function<void(std::function<void()>)>* runner() {
     return &params_.runner;
   }
 
-  ResourceMgr* resource_manager() const { return params_.resource_manager; }
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    if (params_.stats_aggregator_getter) {
+      return params_.stats_aggregator_getter();
+    } else {
+      return nullptr;
+    }
+  }
 
  private:
   Params params_;
@@ -298,28 +315,15 @@ class IteratorBase {
   virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
 
   // Saves the state of this iterator.
-  virtual Status Save(IteratorStateWriter* writer) {
-    if (is_exhausted_) {
-      LOG(INFO) << "Iterator exhausted.";
-      return writer->WriteScalar(kIteratorExhausted, kIteratorExhausted);
-    } else {
-      return SaveInternal(writer);
-    }
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
+    return SaveInternal(writer);
   }
 
   // Restores the state of this iterator.
   virtual Status Restore(OpKernelContext* ctx, IteratorStateReader* reader) {
-    if (reader->Contains(kIteratorExhausted)) {
-      LOG(INFO) << "Iterator exhausted. Nothing to restore.";
-      is_exhausted_ = true;
-      return Status::OK();
-    } else {
-      return RestoreInternal(ctx, reader);
-    }
+    return RestoreInternal(ctx, reader);
   }
 
-  static const char kIteratorExhausted[];
-
  protected:
   // This is needed so that sub-classes of IteratorBase can call
   // `SaveInternal` on their parent iterators, e.g., in
@@ -347,8 +351,6 @@ class IteratorBase {
                                  IteratorStateReader* reader) {
     return errors::Unimplemented("RestoreInternal");
   }
-
-  bool is_exhausted_ = false;  // Whether the iterator has been exhausted.
 };
 
 // Represents a (potentially infinite) range of outputs, where each
@@ -384,7 +386,7 @@ class DatasetBase : public core::RefCounted {
   virtual string DebugString() = 0;
 
   // Serializes the dataset and writes it to the `writer`.
-  virtual Status Save(IteratorStateWriter* writer) const {
+  virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
     return errors::Unimplemented("DatasetBase::Save");
   }
 
@@ -396,11 +398,18 @@ class DatasetBase : public core::RefCounted {
   class DatasetGraphDefBuilder : public GraphDefBuilderWrapper {
    public:
     DatasetGraphDefBuilder(GraphDefBuilder* b) : GraphDefBuilderWrapper(b) {}
-    Status AddParentDataset(const DatasetBase* dataset, Node** output) {
-      return dataset->AsGraphDefInternal(this, output);
+    Status AddParentDataset(OpKernelContext* ctx, const DatasetBase* dataset,
+                            Node** output) {
+      return dataset->AsGraphDefInternal(ctx, this, output);
     }
   };
 
+  virtual Status AsGraphDefInternal(OpKernelContext* ctx,
+                                    DatasetGraphDefBuilder* b,
+                                    Node** node) const {
+    return AsGraphDefInternal(b, node);
+  }
+
   virtual Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                                     Node** node) const {
     return errors::Unimplemented("AsGraphDefInternal");
@@ -415,10 +424,11 @@ class GraphDatasetBase : public DatasetBase {
 
   const string op_name() const { return op_name_; }
 
-  Status Save(IteratorStateWriter* writer) const override {
+  Status Save(OpKernelContext* ctx,
+              IteratorStateWriter* writer) const override {
     string serialized_graph_def;
     string output_node;
-    TF_RETURN_IF_ERROR(Serialize(&serialized_graph_def, &output_node));
+    TF_RETURN_IF_ERROR(Serialize(ctx, &serialized_graph_def, &output_node));
     TF_RETURN_IF_ERROR(
         writer->WriteScalar(kDatasetGraphKey, serialized_graph_def));
     TF_RETURN_IF_ERROR(
@@ -434,17 +444,8 @@ class GraphDatasetBase : public DatasetBase {
   static const char kDatasetGraphOutputNodeKey[];
 
  private:
-  Status Serialize(string* serialized_graph_def, string* output_node) const {
-    GraphDefBuilder b;
-    DatasetGraphDefBuilder db(&b);
-    Node* node = nullptr;
-    TF_RETURN_IF_ERROR(AsGraphDefInternal(&db, &node));
-    *output_node = node->name();
-    GraphDef graph_def;
-    TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-    graph_def.SerializeToString(serialized_graph_def);
-    return Status::OK();
-  }
+  Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
+                   string* output_node) const;
 
   const string op_name_;
 };
@@ -484,16 +485,12 @@ class DatasetIterator : public IteratorBase {
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) final {
     port::Tracing::TraceMe activity(params_.prefix);
-    if (is_exhausted_) {
-      *end_of_sequence = true;
-      return Status::OK();
-    }
     return GetNextInternal(ctx, out_tensors, end_of_sequence);
   }
 
-  Status Save(IteratorStateWriter* writer) final {
-    TF_RETURN_IF_ERROR(dataset()->Save(writer));
-    return IteratorBase::Save(writer);
+  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) final {
+    TF_RETURN_IF_ERROR(dataset()->Save(ctx, writer));
+    return IteratorBase::Save(ctx, writer);
   }
 
  protected:
diff --git a/tensorflow/core/kernels/dataset_utils.cc b/tensorflow/core/kernels/dataset_utils.cc
index cd58c8091211ae75265f6cfecb65746965f98d2f..3ce64504d083df4e7208dc76e5066568d1c6f97e 100644
--- a/tensorflow/core/kernels/dataset_utils.cc
+++ b/tensorflow/core/kernels/dataset_utils.cc
@@ -32,12 +32,13 @@ Status MakeIteratorFromInputElement(
   // is always 0, so a negative random step ID should suffice.
   opts.step_id = CapturedFunction::generate_step_id();
   ScopedStepContainer step_container(
-      opts.step_id, [captured_func, ctx](const string& name) {
+      opts.step_id, [captured_func](const string& name) {
         captured_func->resource_manager()->Cleanup(name).IgnoreError();
       });
   opts.step_container = &step_container;
   std::vector<Tensor> return_values;
-  TF_RETURN_IF_ERROR(captured_func->Run(opts, input_element, &return_values));
+  TF_RETURN_IF_ERROR(
+      captured_func->RunWithBorrowedArgs(opts, input_element, &return_values));
 
   if (!(return_values.size() == 1 && return_values[0].dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(return_values[0].shape()))) {
diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h
index 2c2105312119fccd0f2be95d989f56388fc18ab4..381add3fb3bd57ebf068212cdd32a640bf60dd9b 100644
--- a/tensorflow/core/kernels/debug_ops.h
+++ b/tensorflow/core/kernels/debug_ops.h
@@ -185,7 +185,7 @@ class BaseDebugOp : public OpKernel {
       if (!status.ok()) {
         LOG(ERROR) << "Debug node of watch key "
                    << debug_watch_key_->debug_node_name
-                   << "failed to publish debug tensor data to all URLs "
+                   << " failed to publish debug tensor data to all URLs "
                    << str_util::Join(debug_urls_, ", ")
                    << ", due to: " << status.error_message();
       }
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index 086369a9f127143a6dfd71e10b1abffd54c8a191..c778278e8fbbec67a0255ea7d257c19da4f3612f 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -34,8 +34,10 @@ class DecodeBmpOp : public OpKernel {
   explicit DecodeBmpOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("channels", &channels_));
     OP_REQUIRES(
-        context, channels_ == 0 || channels_ == 3 || channels_ == 4,
-        errors::InvalidArgument("channels must be 0, 3 or 4, got ", channels_));
+        context,
+        channels_ == 0 || channels_ == 1 || channels_ == 3 || channels_ == 4,
+        errors::InvalidArgument("channels must be 0, 1, 3 or 4, got ",
+                                channels_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -47,6 +49,12 @@ class DecodeBmpOp : public OpKernel {
     // Start decoding image to get shape details
     const StringPiece input = contents.scalar<string>()();
 
+    OP_REQUIRES(context, (32 <= input.size()),
+                errors::InvalidArgument("Incomplete bmp content, requires at "
+                                        "least 32 bytes to find the header "
+                                        "size, width, height, and bpp, got ",
+                                        input.size(), " bytes"));
+
     const uint8* img_bytes = reinterpret_cast<const uint8*>(input.data());
     const int32 header_size = internal::SubtleMustCopy(
         *(reinterpret_cast<const int32*>(img_bytes + 10)));
@@ -66,11 +74,27 @@ class DecodeBmpOp : public OpKernel {
       channels_ = bpp / 8;
     }
 
-    // Current implementation only supports 3 or 4 channel
+    // Current implementation only supports 1, 3 or 4 channel
     // bitmaps.
-    OP_REQUIRES(context, (channels_ == 3 || channels_ == 4),
+    OP_REQUIRES(context, (channels_ == 1 || channels_ == 3 || channels_ == 4),
                 errors::InvalidArgument(
-                    "Number of channels must be 3 or 4, was ", channels_));
+                    "Number of channels must be 1, 3 or 4, was ", channels_));
+
+    // there may be padding bytes when the width is not a multiple of 4 bytes
+    // 8 * channels == bits per pixel
+    const int row_size = (8 * channels_ * width + 31) / 32 * 4;
+
+    const int last_pixel_offset =
+        header_size + (abs(height) - 1) * row_size + (width - 1) * channels_;
+
+    // [expected file size] = [last pixel offset] + [last pixel size=channels]
+    const int expected_file_size = last_pixel_offset + channels_;
+
+    OP_REQUIRES(
+        context, (expected_file_size <= input.size()),
+        errors::InvalidArgument("Incomplete bmp content, requires at least ",
+                                expected_file_size, " bytes, got ",
+                                input.size(), " bytes"));
 
     // if height is negative, data layout is top down
     // otherwise, it's bottom up
@@ -84,25 +108,23 @@ class DecodeBmpOp : public OpKernel {
 
     const uint8* bmp_pixels = &img_bytes[header_size];
 
-    Decode(bmp_pixels, output->flat<uint8>().data(), width, abs(height),
-           channels_, top_down);
+    Decode(bmp_pixels, row_size, output->flat<uint8>().data(), width,
+           abs(height), channels_, top_down);
   }
 
-  uint8* Decode(const uint8* input, uint8* const output, const int width,
-                const int height, const int channles, bool top_down);
+  uint8* Decode(const uint8* input, const int row_size, uint8* const output,
+                const int width, const int height, const int channles,
+                bool top_down);
 
  private:
   int channels_;
 };
 REGISTER_KERNEL_BUILDER(Name("DecodeBmp").Device(DEVICE_CPU), DecodeBmpOp);
 
-uint8* DecodeBmpOp::Decode(const uint8* input, uint8* const output,
-                           const int width, const int height,
-                           const int channels, bool top_down) {
-  // there may be padding bytes when the width is not a multiple of 4 bytes
-  // 8 * channels == bits per pixel
-  int row_size = (8 * channels * width + 31) / 32 * 4;
-
+uint8* DecodeBmpOp::Decode(const uint8* input, const int row_size,
+                           uint8* const output, const int width,
+                           const int height, const int channels,
+                           bool top_down) {
   for (int i = 0; i < height; i++) {
     int src_pos;
     int dst_pos;
@@ -117,6 +139,9 @@ uint8* DecodeBmpOp::Decode(const uint8* input, uint8* const output,
       dst_pos = (i * width + j) * channels;
 
       switch (channels) {
+        case 1:
+          output[dst_pos] = input[src_pos];
+          break;
         case 3:
           // BGR -> RGB
           output[dst_pos] = input[src_pos + 2];
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 9804d7d38e1a811ea30136697a11d085e3533552..9347978d515b9244dde2b50b2fcfaa3c91ab9c94 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -231,7 +231,8 @@ static void CopyOutputBackpropRegion(const DepthwiseArgs& args,
       }
       // Pad to vector-register width (if needed).
       for (int64 d = 0; d < pad_size; ++d) {
-        buffer[buf_base + vectorized_size + scalar_size + d] = 0;
+        buffer[buf_base + vectorized_size + scalar_size + d] =
+            static_cast<T>(0);
       }
     }
   }
@@ -297,7 +298,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
 
   for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
     // Reset accumulator.
-    auto vaccum = Eigen::internal::pset1<Packet>(0);
+    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
     for (int j = 0; j < filter_spatial_size; ++j) {
       // Calculate index.
       const int64 index = i + j * padded_filter_inner_dim_size;
@@ -318,7 +319,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
   }
 
   if (output_scalar_size > 0) {
-    auto vaccum = Eigen::internal::pset1<Packet>(0);
+    auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
     for (int j = 0; j < filter_spatial_size; ++j) {
       const int64 index =
           output_vectorized_size + j * padded_filter_inner_dim_size;
@@ -346,7 +347,7 @@ static void ComputeBackpropInput(const DepthwiseArgs& args,
   if (depth_multiplier > 1) {
     for (int64 d = 0; d < in_depth; ++d) {
       const int64 index = d * args.depth_multiplier;
-      T accum = 0;
+      T accum = static_cast<T>(0);
       for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
         const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm);
         accum += Eigen::internal::predux(v);
@@ -510,6 +511,8 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
 
 #if GOOGLE_CUDA
 
+extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice,
+                                                          Eigen::half>;
 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
 
@@ -884,6 +887,8 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
 
 #if GOOGLE_CUDA
 
+extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice,
+                                                           Eigen::half>;
 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index bbeeaf789544a45ced75148064be0b39c7457053..a5fd07fbe177f2206ef9b6b3252556211b9e3905 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -94,7 +94,7 @@ struct DepthwiseConv2DKernel {
 
     for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
       // Reset accumulator.
-      auto vaccum = Eigen::internal::pset1<Packet>(0);
+      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
       for (int j = 0; j < filter_spatial_size; ++j) {
         // Calculate index.
         const int64 index = i + j * padded_filter_inner_dim_size;
@@ -115,7 +115,7 @@ struct DepthwiseConv2DKernel {
     }
 
     if (output_scalar_size > 0) {
-      auto vaccum = Eigen::internal::pset1<Packet>(0);
+      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
       for (int j = 0; j < filter_spatial_size; ++j) {
         const int64 index =
             output_vectorized_size + j * padded_filter_inner_dim_size;
@@ -246,6 +246,7 @@ extern template class LaunchConv2DOp<CPUDevice, float>;
 #if GOOGLE_CUDA
 
 // Extern template instantiated in depthwise_conv_op_gpu.cc.
+extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
 extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
 extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;
 
@@ -372,8 +373,11 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
     // If in_depth==1, this operation is just a standard convolution, so
     // invoke that op.
     if (std::is_same<T, float>::value && in_depth == 1) {
+      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
+      // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
-                stride_, stride_, padding_, output, data_format_);
+                /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
+                padding_, output, data_format_);
       return;
     }
 
@@ -419,12 +423,17 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
       Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
       DepthwiseConv2dNativeOp<CPUDevice, T>);
 
+TF_CALL_half(REGISTER_CPU_KERNEL);
 TF_CALL_float(REGISTER_CPU_KERNEL);
 #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
 TF_CALL_double(REGISTER_CPU_KERNEL);
 #endif
 
 #if GOOGLE_CUDA
+REGISTER_KERNEL_BUILDER(
+    Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
+    DepthwiseConv2dNativeOp<GPUDevice, Eigen::half>);
+
 REGISTER_KERNEL_BUILDER(
     Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     DepthwiseConv2dNativeOp<GPUDevice, float>);
diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h
index aa5b5c76f6ac13d7d1dbc5bfb62710cde538621a..097a9f5bfad4f1cf0232b0bb31cf6f88fdb5696b 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.h
+++ b/tensorflow/core/kernels/depthwise_conv_op.h
@@ -158,7 +158,8 @@ struct DepthwiseFilterPadOp {
       }
       // Pad the remainder of output to vector-register boundary.
       for (int64 j = 0; j < pad_size; ++j) {
-        padded_filter[output_base + vectorized_size + scalar_size + j] = 0;
+        padded_filter[output_base + vectorized_size + scalar_size + j] =
+            static_cast<T>(0);
       }
     }
   }
@@ -266,7 +267,7 @@ struct DepthwiseInputCopyOp {
 
           // Pad the remainder of the output to vector register boundary.
           for (int64 d = 0; d < output_pad_size; ++d) {
-            in_buf[d] = 0;
+            in_buf[d] = static_cast<T>(0);
           }
           in_buf += output_pad_size;
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index ecfe51d599c8b53c32a908bad893a6d91bc6f4d4..903aac5d68baeb8c37b009a54863a084dcb75147 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -105,7 +105,7 @@ __global__ void __launch_bounds__(1024, 2)
     const int input_row_end = input_row_start + filter_rows;
     const int input_col_end = input_col_start + filter_cols;
 
-    T sum = 0;
+    T sum = static_cast<T>(0);
 
     const int input_offset_temp = in_rows * OB;
     if (input_row_start >= 0 && input_col_start >= 0 &&
@@ -258,8 +258,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
     __syncthreads();
 
     if (depth_in_range) {
-      T sum1 = 0;
-      T sum2 = 0;
+      T sum1 = static_cast<T>(0);
+      T sum2 = static_cast<T>(0);
       int shared_offset = data_idx;
       const T* filter_ptr = filter_read_offset + shared_data;
       UNROLL for (int r = 0; r < filter_rows; ++r) {
@@ -369,7 +369,7 @@ __global__ void __launch_bounds__(1024, 2)
     const int input_row_end = input_row_start + filter_rows;
     const int input_col_end = input_col_start + filter_cols;
 
-    T sum = 0;
+    T sum = static_cast<T>(0);
     if (input_row_start >= 0 && input_col_start >= 0 &&
         input_row_end < in_rows && input_col_end < in_cols) {
       // Loop that doesn't need to check for boundary conditions.
@@ -529,8 +529,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
     __syncthreads();
 
     if (slice_in_range) {
-      T sum1 = 0;
-      T sum2 = 0;
+      T sum1 = static_cast<T>(0);
+      T sum2 = static_cast<T>(0);
       int shared_offset = data_idx;
       const T* filter_ptr = filter_read_offset + shared_data;
       UNROLL for (int r = 0; r < filter_rows; ++r) {
@@ -710,6 +710,7 @@ void LaunchDepthwiseConvOp<GPUDevice, T>::operator()(OpKernelContext* ctx,
                   "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
 }
 
+template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
 template struct LaunchDepthwiseConvOp<GPUDevice, float>;
 template struct LaunchDepthwiseConvOp<GPUDevice, double>;
 
@@ -744,7 +745,7 @@ __global__ void __launch_bounds__(640, 2)
     const int in_r = (thread_id / in_depth / in_cols) % in_rows;
     const int b = thread_id / in_depth / in_cols / in_rows;
 
-    T sum = 0;
+    T sum = static_cast<T>(0);
 
     const int out_r_start =
         tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
@@ -810,7 +811,7 @@ __global__ void __launch_bounds__(640, 2)
     const int in_d = (thread_id / in_cols / in_rows) % in_depth;
     const int b = thread_id / in_depth / in_cols / in_rows;
 
-    T sum = 0;
+    T sum = static_cast<T>(0);
     const int out_d_start = in_d * depth_multiplier;
     const int out_d_end = out_d_start + depth_multiplier;
 
@@ -919,6 +920,7 @@ void LaunchDepthwiseConvBackpropInputOp<GPUDevice, T>::operator()(
                                "utGPULaunch failed"));
 }
 
+template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, Eigen::half>;
 template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>;
 template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>;
 
@@ -1631,6 +1633,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T>::operator()(
                                "terGPULaunch failed"));
 }
 
+template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, Eigen::half>;
 template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>;
 template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>;
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/diag_op.cc b/tensorflow/core/kernels/diag_op.cc
index be862b82f1b311e3e46bbe27de9921bb548fa0b6..86fa7dce36afff121dc6ff0642f45c809bc63a3d 100644
--- a/tensorflow/core/kernels/diag_op.cc
+++ b/tensorflow/core/kernels/diag_op.cc
@@ -108,7 +108,7 @@ class DiagPartOp : public OpKernel {
 };
 
 // Implementation of the functor specialization for CPU.
-// 
+//
 // According to the diagonal definition,
 // `output[i1,..., ik, i1,..., ik] = input[i1,..., ik]`,
 //
@@ -116,7 +116,7 @@ class DiagPartOp : public OpKernel {
 // pointer can be represent by coordinate [i1,..., ik],
 // where `index = i1*(s2*...*sk) + i2*(s3*...*sk) +... + ik`
 //
-// Let new_index is the offset of output's pointer with coordinate 
+// Let new_index is the offset of output's pointer with coordinate
 // [i1,..., ik, i1,..., ik], then we have
 // `new_index = i1*(s2*...sk*s1*...*sk) + i2*(s3*...*sk*s1*...*sk) +... + \
 //              ik*(s1*...*sk) + i1*(s2*...*sk) + i2*(s3*...*sk) +... + ik
diff --git a/tensorflow/core/kernels/diag_op_gpu.cu.cc b/tensorflow/core/kernels/diag_op_gpu.cu.cc
index 684f00ea61d136a3ed75d6a6b19f7eff02c30d1e..d3c529d784e3a9ba4a793cd98cff9eb5e74d6090 100644
--- a/tensorflow/core/kernels/diag_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/diag_op_gpu.cu.cc
@@ -33,7 +33,7 @@ __global__ void DiagCudaKernel(const int num_threads,
                                const T* in,
                                T* out) {
   CUDA_1D_KERNEL_LOOP(index, num_threads) {
-    // Fill the diagonal elements or set to zero in other place. 
+    // Fill the diagonal elements or set to zero in other place.
     if (index % (1 + size) == 0) {
       out[index] = in[index / (1 + size)];
     } else {
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb58b13f382970c60b551f448243a2b75e30df3
--- /dev/null
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -0,0 +1,465 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The algorithm for dynamic partition has the following steps:
+// 1. Let N be the size of partitions. We initialize a new vector indices_in
+//    with the values 0, 1, 2, ..., N-1.
+// 2. We apply cub::DeviceRadixSort::SortPairs to the key - value pairs given
+//    by partitions and indices_in. This will result in two new vectors
+//    partitions_out and indices_out, with partitions_out sorted.
+// 3. The first dimension of outputs[i] is equal to the number of i-values in
+//    partitions_out. We determine it in two steps:
+//    - apply cub::DeviceReduce::ReduceByKey to count how many times each value
+//      appears in partitions_out,
+//    - move the results to partition_count. This handles missing values
+//      (corresponding to empty parts).
+// 4. Because partition_count is on the GPU, we bring it asynchronously to
+//    the CPU. Then we can allocate the output tensors.
+// 5. Finally, we use indices_out and the gather functor to collect the output.
+//    This works, because for each interval of i-values, indices_out points
+//    to the slices which should form output[i].
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "external/cub_archive/cub/device/device_radix_sort.cuh"
+#include "external/cub_archive/cub/device/device_reduce.cuh"
+#include "external/cub_archive/cub/iterator/constant_input_iterator.cuh"
+#include "external/cub_archive/cub/thread/thread_operators.cuh"
+#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/gather_functor_gpu.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/transform_output_iterator.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+
+template <typename T>
+__global__ void RangeInitKernel(const T start, const T delta, const int32 size,
+                                T* out) {
+  CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
+}
+
+__global__ void MoveValuesKernel(const int32* keys, const int32* values,
+                                 const int32* size, int32 out_size,
+                                 int32* out) {
+  int32 N = min(ldg(size), out_size);
+  CUDA_1D_KERNEL_LOOP(i, N) {
+    int32 key = ldg(keys + i);
+    int32 value = ldg(values + i);
+    if (FastBoundsCheck(key, out_size)) out[key] = value;
+  }
+}
+
+// Initialize out with range start, start + delta, start + 2 * delta, ...
+// This is needed because tf.range has no GPU implementation.
+template <typename T>
+void RangeInit(const GPUDevice& d, const T start, const T delta,
+               const int32 size, typename TTypes<T>::Flat out) {
+  CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
+  RangeInitKernel<
+      T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+      start, delta, size, out.data());
+}
+
+// Given *num_runs pairs (key, value), this function moves the value
+// corresponding to key i at position i in the array out.
+void MoveValues(const GPUDevice& d, int32* keys, int32* values, int32* num_runs,
+                int32 out_size, int32* out) {
+  // Because num_runs is located on the GPU, we can not access it directly.
+  // So we launch the kernel with size = out_size.
+  // This is valid for correct inputs, because then out_size >= *num_runs.
+  // For wrong inputs, we may have out_size < *num_runs. In this case we will
+  // only handle the first out_size values.
+  CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
+  MoveValuesKernel<<<config.block_count, config.thread_per_block, 0,
+                     d.stream()>>>(keys, values, num_runs, out_size, out);
+}
+
+template <typename T>
+void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
+                      T* out, int64 gather_dim_size, int64 indices_size,
+                      int64 slice_size, int64 out_size) {
+  CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
+  GatherOpKernel<
+      T, int32,
+      true><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+      params, indices, out, gather_dim_size, indices_size, slice_size,
+      out_size);
+}
+
+struct IdentityOp {
+  __device__ int32 __forceinline__ operator()(const int32& a) const {
+    return a;
+  }
+};
+
+// Define an output iterator that only allows assignment to
+// positions between [base, base + limit).
+class BoundedOutputIterator
+    : public TransformOutputIterator<int32, int32, IdentityOp> {
+ private:
+  int32 limit;
+  int32* base;
+
+  struct BoundedReference : Reference {
+    int32 limit;
+    int32* base;
+    // Constructor
+    __host__ __device__ __forceinline__
+    BoundedReference(int32* ptr, int32* base, IdentityOp op, int32 limit)
+        : Reference(ptr, op), limit(limit), base(base) {}
+
+    // Assignment
+    __host__ __device__ __forceinline__ int32 operator=(int32 val) {
+      if (ptr - base < limit && ptr - base >= 0) *ptr = val;
+      return val;
+    }
+  };
+
+ public:
+  typedef BoundedOutputIterator self_type;
+  typedef BoundedReference reference;
+
+  __host__ __device__ __forceinline__ BoundedOutputIterator(int32* ptr,
+                                                            IdentityOp op,
+                                                            int32 size)
+      : TransformOutputIterator(ptr, op), limit(size), base(ptr) {}
+
+  __host__ __device__ __forceinline__
+  BoundedOutputIterator(int32* ptr, int32* base, IdentityOp op, int32 size)
+      : TransformOutputIterator(ptr, op), limit(size), base(base) {}
+
+  // Indirection
+  __host__ __device__ __forceinline__ reference operator*() const {
+    return BoundedReference(ptr, base, conversion_op, limit);
+  }
+
+  // Array subscript
+  __host__ __device__ __forceinline__ reference operator[](int32 n) const {
+    return BoundedReference(ptr + n, base, conversion_op, limit);
+  }
+
+  // Addition
+  __host__ __device__ __forceinline__ self_type operator+(int32 n) const {
+    self_type retval(ptr + n, base, conversion_op, limit);
+    return retval;
+  }
+
+  // Subtraction
+  __host__ __device__ __forceinline__ self_type operator-(int32 n) const {
+    self_type retval(ptr - n, base, conversion_op, limit);
+    return retval;
+  }
+};
+
+}  // namespace
+
+// The current implementation has memory cost on GPU
+// I + P + max(3N + R + P, O + N), where:
+// I - the size of the input
+// N - the size of the partitions tensor
+// R - the temporary storage used by cub::RadixSort, about 2N
+// P - the number of partitions
+// O - the size of the output
+// So roughly the cost is I + P + max(5N, O + N).
+template <typename T>
+class DynamicPartitionOpGPU : public AsyncOpKernel {
+ public:
+  explicit DynamicPartitionOpGPU(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("num_partitions", &num_partitions_));
+    OP_REQUIRES(c, num_partitions_ >= 1,
+                errors::InvalidArgument("num_partitions must be at least 1"));
+  }
+
+  void AllocateTempSpace(OpKernelContext* c, int32 N, Tensor* indices_in,
+                         Tensor* partitions_out, Tensor* indices_out,
+                         DoneCallback done) {
+    int32 M = std::max(N, num_partitions_);
+    // indices_in will be made slightly larger to accommodate
+    // later computations.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({M}), indices_in), done);
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({N}), partitions_out), done);
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({N}), indices_out), done);
+  }
+
+  void AllocateOutputs(OpKernelContext* c, const Tensor* data,
+                       const Tensor* partitions, const Tensor* partition_count,
+                       OpOutputList* Tout, DoneCallback done) {
+    auto e_part_count = partition_count->flat<int32>();
+    // Allocate output tensors of the right size
+    OP_REQUIRES_OK_ASYNC(c, c->output_list("outputs", Tout), done);
+    for (int p = 0; p < num_partitions_; p++) {
+      TensorShape shape;
+      shape.AddDim(e_part_count(p));
+      for (int i = partitions->dims(); i < data->dims(); i++) {
+        shape.AddDim(data->dim_size(i));
+      }
+      Tensor* out;
+      OP_REQUIRES_OK_ASYNC(c, Tout->allocate(p, shape, &out), done);
+    }
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) {
+    const Tensor& data = c->input(0);
+    const Tensor& partitions = c->input(1);
+
+    OP_REQUIRES_ASYNC(
+        c, TensorShapeUtils::StartsWith(data.shape(), partitions.shape()),
+        errors::InvalidArgument("data.shape must start with partitions.shape, ",
+                                "got data.shape = ", data.shape().DebugString(),
+                                ", partitions.shape = ",
+                                partitions.shape().DebugString()),
+        done);
+
+    Tensor partition_count;
+
+    // We must handle the case of empty partitions separately,
+    // because kernels don't work with 0-sized tensors.
+    if (partitions.NumElements() == 0) {
+      AllocatorAttributes alloc_attr;
+      alloc_attr.set_on_host(true);
+      OP_REQUIRES_OK_ASYNC(
+          c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                              &partition_count, alloc_attr),
+          done);
+      auto e_part_count = partition_count.flat<int32>();
+      for (int i = 0; i < num_partitions_; i++) e_part_count(i) = 0;
+      OpOutputList outputs;
+      this->AllocateOutputs(c, &data, &partitions, &partition_count, &outputs,
+                            done);
+      if (c->status().ok()) done();
+      return;
+    }
+
+    // Prepare for counting.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                            &partition_count),
+        done);
+    Tensor indices_out;
+    // Count how many times each partition index occurs.
+    // Also sort the info in partitions and output it in indices_out,
+    // in preparation for the next step.
+    this->CountAndSortParts(c, &partitions, &partition_count, &indices_out,
+                            done);
+    if (!c->status().ok()) return;
+
+    // In order to allocate the output tensor we have to move partition_count
+    // to CPU.
+    auto* stream = c->op_device_context()->stream();
+    OP_REQUIRES_ASYNC(c, stream, errors::Internal("No GPU stream available."),
+                      done);
+    Tensor cpu_tensor;
+    AllocatorAttributes alloc_attr;
+    alloc_attr.set_on_host(true);
+    alloc_attr.set_gpu_compatible(true);
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(partition_count.dtype(), partition_count.shape(),
+                            &cpu_tensor, alloc_attr),
+        done);
+    perftools::gputools::DeviceMemoryBase wrapped(
+        partition_count.flat<int32>().data(), num_partitions_ * sizeof(int32));
+    const bool status =
+        stream
+            ->ThenMemcpy(cpu_tensor.flat<int32>().data(), wrapped,
+                         num_partitions_ * sizeof(int32))
+            .ok();
+    OP_REQUIRES_ASYNC(
+        c, status,
+        errors::Internal("Failed to launch copy from device to host."), done);
+
+    // Keep a reference to partition_count so that the buffer
+    // is not deallocated at the end of the function, before
+    // memcpy is completed.
+    TensorReference partition_ref(partition_count);
+    auto wrapped_callback = [this, c, &data, &partitions, indices_out,
+                             partition_ref, cpu_tensor, done]() {
+      OpOutputList outputs;
+      this->AllocateOutputs(c, &data, &partitions, &cpu_tensor, &outputs, done);
+      if (!c->status().ok()) {
+        partition_ref.Unref();
+        return;
+      }
+      int32 N = partitions.NumElements();
+      int64 slice_size = data.NumElements() / N;
+      this->GatherSlices(c, &data, &indices_out, N, slice_size, outputs);
+      partition_ref.Unref();
+      done();
+    };
+
+    c->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
+        stream, wrapped_callback);
+  }
+
+ protected:
+  void RadixSort(OpKernelContext* c, const Tensor* partitions,
+                 Tensor* indices_in, Tensor* partitions_out,
+                 Tensor* indices_out, DoneCallback done) {
+    int32 N = partitions->NumElements();
+    const GPUDevice& device = c->eigen_device<GPUDevice>();
+    const cudaStream_t& cu_stream = GetCudaStream(c);
+
+    // Initialize the indices_in tensor using the Range GPU kernel.
+    RangeInit(device, 0, 1, N, indices_in->flat<int32>());
+    // Obtain the pointers to inner buffers.
+    const int32* partitions_ptr = partitions->flat<int32>().data();
+    int32* partitions_out_ptr = partitions_out->flat<int32>().data();
+    int32* indices_in_ptr = indices_in->flat<int32>().data();
+    int32* indices_out_ptr = indices_out->flat<int32>().data();
+    // Determine temporary device storage requirements.
+    Tensor cub_temp_storage;
+    size_t temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(
+        NULL, temp_storage_bytes, partitions_ptr, partitions_out_ptr,
+        indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
+    // Allocate temporary storage.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &cub_temp_storage),
+        done);
+    // Radix-sort the partition information.
+    cub::DeviceRadixSort::SortPairs(
+        cub_temp_storage.flat<int8>().data(), temp_storage_bytes,
+        partitions_ptr, partitions_out_ptr, indices_in_ptr, indices_out_ptr, N,
+        0, sizeof(int32) * 8, cu_stream);
+  }  // At this point cub_temp_storage will be marked for deallocation.
+
+  void CountAndSortParts(OpKernelContext* c, const Tensor* partitions,
+                         Tensor* partition_count, Tensor* indices_out,
+                         DoneCallback done) {
+    const GPUDevice& device = c->eigen_device<GPUDevice>();
+    const cudaStream_t& cu_stream = GetCudaStream(c);
+    int32 N = partitions->NumElements();
+    Tensor indices_in;
+    Tensor partitions_out;
+    Tensor aggregates_out;
+
+    // Allocate memory for Radix-Sort.
+    this->AllocateTempSpace(c, N, &indices_in, &partitions_out, indices_out,
+                            done);
+    if (!c->status().ok()) return;
+    this->RadixSort(c, partitions, &indices_in, &partitions_out, indices_out,
+                    done);
+    if (!c->status().ok()) return;
+    // We will now apply a reduce operation to count how many times
+    // each index appears in partitions.
+
+    // Zero-out the partition_count tensor.
+    functor::SetZeroFunctor<GPUDevice, int32> zero_functor;
+    zero_functor(device, partition_count->flat<int32>());
+    // Allocate memory for aggregates_out.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({num_partitions_}),
+                            &aggregates_out),
+        done);
+    // Obtain the pointers to inner buffers.
+    int32* keys_in_ptr = partitions_out.flat<int32>().data();
+    // Here we reuse the indices_in tensor for the unique keys output.
+    int32* unique_out_ptr = indices_in.flat<int32>().data();
+    int32* aggregates_out_ptr = aggregates_out.flat<int32>().data();
+    // We wrap the pointers in bounded output iterators to guard against
+    // wrong inputs (more than num_partitions distinct indices).
+    IdentityOp id_op;
+    BoundedOutputIterator unique_out_it(unique_out_ptr, id_op, num_partitions_);
+    BoundedOutputIterator aggregates_out_it(aggregates_out_ptr, id_op,
+                                            num_partitions_);
+
+    cub::ConstantInputIterator<int32> values_in(1);
+    cub::Sum reduction_op;
+
+    // Allocate space on GPU for the number of runs. This is required by CUB.
+    Tensor num_runs;
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(DT_INT32, TensorShape({1}), &num_runs), done);
+    int32* num_runs_ptr = num_runs.flat<int32>().data();
+
+    // Determine temporary device storage requirements
+    Tensor cub_temp_storage;
+    size_t temp_storage_bytes = 0;
+    cub::DeviceReduce::ReduceByKey(NULL, temp_storage_bytes, keys_in_ptr,
+                                   unique_out_it, values_in, aggregates_out_it,
+                                   num_runs_ptr, reduction_op, N, cu_stream);
+    // Allocate temporary storage.
+    OP_REQUIRES_OK_ASYNC(
+        c, c->allocate_temp(
+               DT_INT8, TensorShape({static_cast<int64>(temp_storage_bytes)}),
+               &cub_temp_storage),
+        done);
+    // Run reduce-by-key. The effect is that we count how many times
+    // each index appears in partitions. The distinct indices are stored
+    // in unique_out, while the count is stored in aggregates_out.
+    // The total number of distinct indices is stored in num_runs.
+    cub::DeviceReduce::ReduceByKey(cub_temp_storage.flat<int8>().data(),
+                                   temp_storage_bytes, keys_in_ptr,
+                                   unique_out_it, values_in, aggregates_out_it,
+                                   num_runs_ptr, reduction_op, N, cu_stream);
+    // We are not done yet. unique_out only contains the indices that appeared
+    // at least once in partitions. We move each value from aggregates_out
+    // to the corresponding position in partition_count. This will handle
+    // possibly empty parts.
+    MoveValues(device, unique_out_ptr, aggregates_out_ptr, num_runs_ptr,
+               num_partitions_, partition_count->flat<int32>().data());
+  }  // At this point indices_in, partitions_out, aggregates_out
+     // and cub_temp_storage will be marked for deallocation.
+
+  void GatherSlices(OpKernelContext* c, const Tensor* data,
+                    const Tensor* indices, int32 N, int64 slice_size,
+                    OpOutputList& outs) {
+    const GPUDevice& device = c->eigen_device<GPUDevice>();
+    const int32* ind_base = indices->flat<int32>().data();
+    const T* data_base = data->flat<T>().data();
+
+    for (int p = 0; p < num_partitions_; p++) {
+      int32 indices_size = outs[p]->dim_size(0);
+      int64 out_size = outs[p]->NumElements();
+      T* out_base = outs[p]->flat<T>().data();
+      if (out_size > 0)
+        CallGatherKernel<T>(device, data_base, ind_base, out_base, N,
+                            indices_size, slice_size, out_size);
+      ind_base += indices_size;
+    }
+  }
+
+  int32 num_partitions_;
+};
+
+#define REGISTER_DYNAMIC_PARTITION_GPU(T)                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DynamicPartition").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      DynamicPartitionOpGPU<T>)
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_DYNAMIC_PARTITION_GPU);
+TF_CALL_complex64(REGISTER_DYNAMIC_PARTITION_GPU);
+TF_CALL_complex128(REGISTER_DYNAMIC_PARTITION_GPU);
+#undef REGISTER_DYNAMIC_PARTITION_GPU
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
index 0e8fbc0a67bfddaa1e3df32224fbeea3ace40b6f..9a7ed0af217b1c31fa14917f10128bb229b18dfd 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -23,10 +24,14 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace {
@@ -153,5 +158,58 @@ TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
       << s;
 }
 
+Node* DynamicPartitionNode(Graph* g, Node* in0, Node* in1, int num_partitions) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "DynamicPartition")
+                  .Input(in0)
+                  .Input(in1)
+                  .Attr("num_partitions", num_partitions)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
+template <typename T>
+static Graph* DynamicPartition(int num_partitions, int dim) {
+  Graph* g = new Graph(OpRegistry::Global());
+  // Always use a 128MB buffer.
+  const int kRows = ((128 << 20) / sizeof(T)) / dim;
+  Tensor data(DataTypeToEnum<T>::value, TensorShape({kRows, dim}));
+  data.flat<T>().setRandom();
+
+  random::PhiloxRandom philox(301, 17);
+  random::SimplePhilox rnd(&philox);
+  Tensor partitions(DT_INT32, TensorShape({kRows}));
+  for (int i = 0; i < kRows; i++) {
+    partitions.flat<int32>()(i) = rnd.Uniform(num_partitions);
+  }
+  DynamicPartitionNode(g, test::graph::Constant(g, data),
+                       test::graph::Constant(g, partitions), num_partitions);
+  return g;
+}
+
+#define BM_DYNAMIC_PARTITION(DEVICE, T, num)                            \
+  static void BM_##DEVICE##_dynpart_##T##_##num(int iters, int dim) {   \
+    const int64 items = ((128 << 20) / sizeof(T));                      \
+    const int64 tot = static_cast<int64>(iters) * items;                \
+    testing::ItemsProcessed(tot);                                       \
+    testing::UseRealTime();                                             \
+    test::Benchmark(#DEVICE, DynamicPartition<T>(num, dim)).Run(iters); \
+  }                                                                     \
+  BENCHMARK(BM_##DEVICE##_dynpart_##T##_##num)->Arg(1)->Arg(256)
+
+BM_DYNAMIC_PARTITION(cpu, float, 2);
+BM_DYNAMIC_PARTITION(cpu, float, 100);
+BM_DYNAMIC_PARTITION(cpu, double, 2);
+BM_DYNAMIC_PARTITION(cpu, double, 100);
+BM_DYNAMIC_PARTITION(cpu, complex64, 2);
+BM_DYNAMIC_PARTITION(cpu, complex64, 100);
+
+BM_DYNAMIC_PARTITION(gpu, float, 2);
+BM_DYNAMIC_PARTITION(gpu, float, 100);
+BM_DYNAMIC_PARTITION(gpu, double, 2);
+BM_DYNAMIC_PARTITION(gpu, double, 100);
+BM_DYNAMIC_PARTITION(gpu, complex64, 2);
+BM_DYNAMIC_PARTITION(gpu, complex64, 100);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index a6cd0078eed86ba2e6d5a50324e351a95daf1856..7aaad6e6c7a48617d1a6cbc679eebc2297828f75 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -128,8 +128,15 @@ struct FakeQuantWithMinMaxVarsFunctor {
                   ConstScalar<float> min, ConstScalar<float> max,
                   const int quant_min, const int quant_max,
                   Flat<float> outputs) {
+    const float min_val = min();
+    const float max_val = max();
+    // If min and max are both zero, we should just return zero.
+    if (min_val == 0.0f && max_val == 0.0f) {
+      outputs.device(d) = outputs.constant(0.0f);
+      return;
+    }
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min(), max(), quant_min, quant_max, &nudged_min, &nudged_max,
+    Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max,
           &nudged_scale);
     const auto nudged_scale_repl = inputs.constant(nudged_scale);
 
@@ -151,8 +158,17 @@ struct FakeQuantWithMinMaxVarsGradientFunctor {
                   const int quant_max, Flat<float> backprops_wrt_input,
                   Scalar<float> backprop_wrt_min,
                   Scalar<float> backprop_wrt_max) {
+    const float min_val = min();
+    const float max_val = max();
+    // If min and max are both zero, we propagate everything to inputs.
+    if (min_val == 0.0f && max_val == 0.0f) {
+      backprops_wrt_input.device(d) = gradients;
+      backprop_wrt_min.device(d) = backprop_wrt_min.constant(0.0f);
+      backprop_wrt_max.device(d) = backprop_wrt_max.constant(0.0f);
+      return;
+    }
     float nudged_min, nudged_max, nudged_scale;
-    Nudge(min(), max(), quant_min, quant_max, &nudged_min, &nudged_max,
+    Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max,
           &nudged_scale);
 
     const auto between_min_max =
@@ -185,8 +201,16 @@ struct FakeQuantWithMinMaxVarsPerChannelFunctor {
                   ConstVec<float> min, ConstVec<float> max, const int quant_min,
                   const int quant_max, TTypes<float>::Matrix outputs) {
     for (Index i = 0; i < min.size(); ++i) {
+      const float min_val = min(i);
+      const float max_val = max(i);
+      // If min and max are both zero, we should just return zero.
+      if (min_val == 0.0f && max_val == 0.0f) {
+        auto chip = outputs.chip<1>(i);
+        chip.device(d) = chip.constant(0.0f);
+        continue;
+      }
       float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), quant_min, quant_max, &nudged_min, &nudged_max,
+      Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max,
             &nudged_scale);
       const auto clamped =
           inputs.chip<1>(i).cwiseMin(nudged_max).cwiseMax(nudged_min);
@@ -212,11 +236,22 @@ struct FakeQuantWithMinMaxVarsPerChannelGradientFunctor {
                   TTypes<float>::Matrix backprops_wrt_input,
                   Vec<float> backprop_wrt_min, Vec<float> backprop_wrt_max) {
     for (Index i = 0; i < min.size(); ++i) {
-      float nudged_min, nudged_max, nudged_scale;
-      Nudge(min(i), max(i), quant_min, quant_max, &nudged_min, &nudged_max,
-            &nudged_scale);
+      const float min_val = min(i);
+      const float max_val = max(i);
       const auto gradients_chip = gradients.chip<1>(i);
       const auto inputs_chip = inputs.chip<1>(i);
+      // If min and max are both zero, we propagate everything to inputs.
+      if (min_val == 0.0f && max_val == 0.0f) {
+        backprops_wrt_input.chip<1>(i).device(d) = gradients_chip;
+        auto min_chip = backprop_wrt_min.chip<0>(i);
+        auto max_chip = backprop_wrt_max.chip<0>(i);
+        min_chip.device(d) = min_chip.constant(0.0f);
+        max_chip.device(d) = max_chip.constant(0.0f);
+        continue;
+      }
+      float nudged_min, nudged_max, nudged_scale;
+      Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max,
+            &nudged_scale);
 
       const auto between_min_max =
           (inputs_chip >= nudged_min && inputs_chip <= nudged_max)
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 8dd3f055bc7d6fc317ca80ba4e0418dcd5e05a98..5953db14768fd4e8d6c8537a2bea91c2ca211b17 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -476,6 +476,12 @@ TEST_F(QuantOpsTest, WithArgsGradient_4Bits_NarrowRange) {
   ExpectClose(expected, *output);
 }
 
+TEST_F(QuantOpsTest, WithVars_ZeroMinAndMax) {
+  RunTestFakeQuantWithMinMaxVars(8, false, 0.0f, 0.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+}
+
 TEST_F(QuantOpsTest, WithVarsNoNudging_RegularRange) {
   // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
   // Original zero point: 40, no nudging necessary.
@@ -637,6 +643,47 @@ TEST_F(QuantOpsTest, WithVarsNudgedZero15_4Bits_NarrowRange) {
                                  {-7.0f, -7.0f, -7.0f, -6.5f, 0.0f, 0.0f});
 }
 
+TEST_F(QuantOpsTest, WithVarsGradient_ZeroMinAndMax) {
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsGradient")
+                   .Attr("narrow_range", false)
+                   .Input(FakeInput(DT_FLOAT))  // gradients
+                   .Input(FakeInput(DT_FLOAT))  // inputs
+                   .Input(FakeInput(DT_FLOAT))  // min
+                   .Input(FakeInput(DT_FLOAT))  // max
+                   .Finalize(node_def()));
+  TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({2, 3}));
+  // Downstream inputs.
+  AddInputFromArray<float>(TensorShape({2, 3}),
+                           {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+  // Min.
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  // Max.
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+
+  // Tested code.
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  auto in_flat = GetInput(0).flat<float>();
+  FillValues<float>(
+      &expected_bprop_wrt_input,
+      {in_flat(0), in_flat(1), in_flat(2), in_flat(3), in_flat(4), in_flat(5)});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({}));
+  expected_bprop_wrt_min.flat<float>()(0) = 0.0f;
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({}));
+  expected_bprop_wrt_max.flat<float>()(0) = 0.0f;
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
+}
+
 TEST_F(QuantOpsTest, WithVarsGradient_RegularRange) {
   // Original quantization range: [-0.5 / 4 + 0 / 4, -0.5 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.5, nudged to 1.
@@ -815,6 +862,13 @@ TEST_F(QuantOpsTest, WithVarsGradient_4Bits_NarrowRange) {
   ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
 }
 
+TEST_F(QuantOpsTest, WithVarsPerChannel_ZeroMinAndMax) {
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, false, TensorShape({4}), {0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 0.0f, 0.0f, 0.0f}, TensorShape({4}), {0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 0.0f, 0.0f, 0.0f});
+}
+
 TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedDown_RegularRange) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
@@ -1166,6 +1220,45 @@ TEST_F(QuantOpsTest, WithVarsPerChannelDim4NudgedUp_4Bits_NarrowRange) {
   // clang-format on
 }
 
+TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedDown_ZeroMinAndMax) {
+  TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannelGradient")
+                   .Attr("narrow_range", false)
+                   .Input(FakeInput(DT_FLOAT))  // gradients
+                   .Input(FakeInput(DT_FLOAT))  // inputs
+                   .Input(FakeInput(DT_FLOAT))  // min
+                   .Input(FakeInput(DT_FLOAT))  // max
+                   .Finalize(node_def()));
+  TF_EXPECT_OK(InitOp());
+  // Upstream gradients.
+  AddRandomInput(TensorShape({4}));
+  // Downstream inputs.
+  AddInputFromArray<float>(TensorShape({4}), {0.0, 0.0, 0.0, 0.0f});
+  // Min.
+  AddInputFromArray<float>(TensorShape({4}), {0.0, 0.0, 0.0, 0.0f});
+  // Max.
+  AddInputFromArray<float>(TensorShape({4}), {0.0, 0.0, 0.0, 0.0f});
+
+  // Tested code.
+  TF_ASSERT_OK(RunOpKernel());
+
+  Tensor* output_bprop_wrt_input = GetOutput(0);
+  Tensor expected_bprop_wrt_input(allocator(), DT_FLOAT, TensorShape({4}));
+  auto grad_flat = GetInput(0).flat<float>();
+  FillValues<float>(&expected_bprop_wrt_input,
+                    {grad_flat(0), grad_flat(1), grad_flat(2), grad_flat(3)});
+  ExpectClose(expected_bprop_wrt_input, *output_bprop_wrt_input);
+
+  Tensor* output_bprop_wrt_min = GetOutput(1);
+  Tensor expected_bprop_wrt_min(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_min, {0.0f, 0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_min, *output_bprop_wrt_min);
+
+  Tensor* output_bprop_wrt_max = GetOutput(2);
+  Tensor expected_bprop_wrt_max(allocator(), DT_FLOAT, TensorShape({4}));
+  FillValues<float>(&expected_bprop_wrt_max, {0.0f, 0.0f, 0.0f, 0.0f});
+  ExpectClose(expected_bprop_wrt_max, *output_bprop_wrt_max);
+}
+
 TEST_F(QuantOpsTest, WithVarsPerChannelDim1GradientNudgedDown_RegularRange) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
diff --git a/tensorflow/core/kernels/fifo_queue.cc b/tensorflow/core/kernels/fifo_queue.cc
index ea86b04762d52bd1debe80c2d404cff7bd276406..82ec87911985abe714490ad74fa19105f850b536 100644
--- a/tensorflow/core/kernels/fifo_queue.cc
+++ b/tensorflow/core/kernels/fifo_queue.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -94,7 +95,7 @@ Status FIFOQueue::GetElementComponentFromBatch(const FIFOQueue::Tuple& tuple,
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -329,8 +330,8 @@ void FIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                   const int64 index = attempt->tuple[0].dim_size(0) -
                                       attempt->elements_requested;
                   for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(CopyElementToSlice(
-                        tuple[i], &attempt->tuple[i], index));
+                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                        std::move(tuple[i]), &attempt->tuple[i], index));
                     if (!attempt->context->status().ok()) return kComplete;
                   }
                   tuple.clear();
diff --git a/tensorflow/core/kernels/filter_dataset_op.cc b/tensorflow/core/kernels/filter_dataset_op.cc
index a69040b3bba34f08aede66e1f97c3e7092978ae3..04427d296c215c9a373b3cd7c809de827125fca6 100644
--- a/tensorflow/core/kernels/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/filter_dataset_op.cc
@@ -51,17 +51,21 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
                                                  std::move(other_arguments),
                                                  &captured_func));
 
-    *output = new Dataset(input, std::move(captured_func));
+    *output = new Dataset(ctx, input, func_, std::move(captured_func));
   }
 
  private:
   const int graph_def_version_;
 
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func)
-        : input_(input), captured_func_(std::move(captured_func)) {
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
+          captured_func_(std::move(captured_func)) {
       input_->Ref();
     }
 
@@ -82,6 +86,35 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "FilterDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {{0, input_graph_node}}, {{1, other_arguments}},
+          {{"predicate", f}, {"Targuments", other_arguments_types_attr}},
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -98,16 +131,25 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
         // non-deterministic order.
         bool matched;
         do {
-          TF_RETURN_IF_ERROR(
-              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          {
+            tf_shared_lock l(mu_);
+            if (!input_impl_) {
+              *end_of_sequence = true;
+              return Status::OK();
+            }
+            TF_RETURN_IF_ERROR(
+                input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+          }
           if (*end_of_sequence) {
+            mutex_lock l(mu_);
+            input_impl_.reset();
             return Status::OK();
           }
 
           FunctionLibraryRuntime::Options opts;
           opts.step_id = CapturedFunction::generate_step_id();
           ScopedStepContainer step_container(
-              opts.step_id, [this, ctx](const string& name) {
+              opts.step_id, [this](const string& name) {
                 dataset()
                     ->captured_func_->resource_manager()
                     ->Cleanup(name)
@@ -120,7 +162,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
           Notification n;
           Status ret;
           std::vector<Tensor> result;
-          ret = dataset()->captured_func_->Run(opts, *out_tensors, &result);
+          ret = dataset()->captured_func_->RunWithBorrowedArgs(
+              opts, *out_tensors, &result);
 
           if (!ret.ok()) {
             return ret;
@@ -139,11 +182,34 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty")))
+          input_impl_.reset();
+        else
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
   };
 
diff --git a/tensorflow/core/kernels/flat_map_dataset_op.cc b/tensorflow/core/kernels/flat_map_dataset_op.cc
index e62a43e94cc277dd8880d13ed22a25909e705b30..8fe8489371832cdc2b9e17829f2b3971e35c7b48 100644
--- a/tensorflow/core/kernels/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/flat_map_dataset_op.cc
@@ -54,18 +54,21 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
                                                  std::move(other_arguments),
                                                  &captured_func));
 
-    *output = new Dataset(input, std::move(captured_func), output_types_,
-                          output_shapes_);
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
           output_shapes_(output_shapes) {
@@ -90,6 +93,37 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "FlatMapDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},         // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -102,6 +136,10 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
         do {
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
           if (current_element_iterator_) {
             // We are currently precessing a mapped element, so try to get the
             // next subelement.
@@ -120,26 +158,113 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
           }
 
           // Get the next element from the input dataset.
-          std::vector<Tensor> args;
-          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &args, end_of_sequence));
+          captured_func_inputs_.clear();
+          TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &captured_func_inputs_,
+                                                  end_of_sequence));
           if (*end_of_sequence) {
+            input_impl_.reset();
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
-              ctx, args, element_index_++, dataset()->captured_func_.get(),
-              prefix(), &current_element_iterator_));
+          TF_RETURN_IF_ERROR(BuildCurrentElementIteratorLocked(ctx));
         } while (true);
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("element_index"), element_index_));
+          if (current_element_iterator_) {
+            TF_RETURN_IF_ERROR(
+                writer->WriteScalar(full_name("captured_func_inputs_size"),
+                                    captured_func_inputs_.size()));
+            for (int i = 0; i < captured_func_inputs_.size(); i++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(strings::StrCat("captured_func_inputs[", i, "]")),
+                  captured_func_inputs_[i]));
+            }
+            TF_RETURN_IF_ERROR(SaveParent(writer, current_element_iterator_));
+          } else {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name("current_element_iterator_uninitialized"), ""));
+          }
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("exhausted"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        input_impl_.reset();
+        element_index_ = 0;
+        current_element_iterator_.reset();
+        captured_func_inputs_.clear();
+        if (!reader->Contains(full_name("exhausted"))) {
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+          {
+            int64 temp;
+            TF_RETURN_IF_ERROR(
+                reader->ReadScalar(full_name("element_index"), &temp));
+            element_index_ = temp;
+          }
+          if (!reader->Contains(
+                  full_name("current_element_iterator_uninitialized"))) {
+            size_t captured_func_inputs_size;
+            {
+              int64 temp;
+              TF_RETURN_IF_ERROR(reader->ReadScalar(
+                  full_name("captured_func_inputs_size"), &temp));
+              captured_func_inputs_size = static_cast<size_t>(temp);
+            }
+            captured_func_inputs_.reserve(captured_func_inputs_size);
+            for (int i = 0; i < captured_func_inputs_size; i++) {
+              captured_func_inputs_.emplace_back();
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(strings::StrCat("captured_func_inputs[", i, "]")),
+                  &captured_func_inputs_.back()));
+            }
+            element_index_--;
+            TF_RETURN_IF_ERROR(BuildCurrentElementIteratorLocked(ctx));
+            TF_RETURN_IF_ERROR(
+                RestoreParent(ctx, reader, current_element_iterator_));
+          }
+        }
+        return Status::OK();
+      }
+
      private:
+      Status BuildCurrentElementIteratorLocked(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        return dataset::MakeIteratorFromInputElement(
+            ctx, captured_func_inputs_, element_index_++,
+            dataset()->captured_func_.get(), prefix(),
+            &current_element_iterator_);
+      }
+
+      Status BuildCurrentElementIteratorLocked(OpKernelContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        IteratorContext iter_ctx(std::move(params));
+        return BuildCurrentElementIteratorLocked(&iter_ctx);
+      }
+
       mutex mu_;
       size_t element_index_ GUARDED_BY(mu_) = 0;
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> current_element_iterator_ GUARDED_BY(mu_);
+      std::vector<Tensor> captured_func_inputs_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index f2290e87a5fdac44629ed6b81c8661cf74c2054e..9382ff7847fcbe8a7e9de4af56eac7774036042f 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -318,7 +318,7 @@ class RemoteCallOp : public AsyncOpKernel {
     if (opts.source_device != target_device) {
       opts.remote_execution = true;
     }
-    opts.rendezvous = ctx->rendezvous();
+    opts.create_rendezvous = true;
     std::vector<Tensor> args;
     args.reserve(arguments.size());
     for (const Tensor& argument : arguments) {
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 0ecb829f348f47cb71a91099655267f209d5547d..1688674eb784369ae8fbb2622695561cb5bebcae 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -54,25 +54,20 @@ struct FusedBatchNorm<CPUDevice, T, U> {
                   Tensor* batch_var_output, Tensor* saved_mean_output,
                   Tensor* saved_var_output, TensorFormat tensor_format,
                   bool is_training) {
-    // Currently U is ignored, since we only support the case where T and U are
-    // both float32.
-    // TODO(reedwm): Add float16 support, use U, and remove these asserts.
-    static_assert(std::is_same<T, float>::value, "T currently must be float.");
-    static_assert(std::is_same<U, float>::value, "U currently must be float.");
     OP_REQUIRES(context, tensor_format == FORMAT_NHWC,
                 errors::Internal("The CPU implementation of FusedBatchNorm "
                                  "only supports NHWC tensor format for now."));
     typename TTypes<T, 4>::ConstTensor x(x_input.tensor<T, 4>());
-    typename TTypes<T>::ConstVec scale(scale_input.vec<T>());
-    typename TTypes<T>::ConstVec offset(offset_input.vec<T>());
-    typename TTypes<T>::ConstVec estimated_mean(estimated_mean_input.vec<T>());
-    typename TTypes<T>::ConstVec estimated_variance(
-        estimated_variance_input.vec<T>());
+    typename TTypes<U>::ConstVec scale(scale_input.vec<U>());
+    typename TTypes<U>::ConstVec offset(offset_input.vec<U>());
+    typename TTypes<U>::ConstVec estimated_mean(estimated_mean_input.vec<U>());
+    typename TTypes<U>::ConstVec estimated_variance(
+        estimated_variance_input.vec<U>());
     typename TTypes<T, 4>::Tensor y(y_output->tensor<T, 4>());
-    typename TTypes<T>::Vec batch_mean(batch_mean_output->vec<T>());
-    typename TTypes<T>::Vec batch_var(batch_var_output->vec<T>());
-    typename TTypes<T>::Vec saved_mean(saved_mean_output->vec<T>());
-    typename TTypes<T>::Vec saved_var(saved_var_output->vec<T>());
+    typename TTypes<U>::Vec batch_mean(batch_mean_output->vec<U>());
+    typename TTypes<U>::Vec batch_var(batch_var_output->vec<U>());
+    typename TTypes<U>::Vec saved_mean(saved_mean_output->vec<U>());
+    typename TTypes<U>::Vec saved_var(saved_var_output->vec<U>());
 
     const CPUDevice& d = context->eigen_device<CPUDevice>();
 
@@ -93,15 +88,15 @@ struct FusedBatchNorm<CPUDevice, T, U> {
     bcast_spec.set(0, rest_size);
 #endif
 
-    auto x_rest_by_depth = x.reshape(rest_by_depth);
+    auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<U>();
     const int rest_size_minus_one = (rest_size > 1) ? (rest_size - 1) : 1;
-    T rest_size_inv = static_cast<T>(1.0f / static_cast<T>(rest_size));
+    U rest_size_inv = static_cast<U>(1.0f / static_cast<U>(rest_size));
     // This adjustment is for Bessel's correction
-    T rest_size_adjust =
-        static_cast<T>(rest_size) / static_cast<T>(rest_size_minus_one);
+    U rest_size_adjust =
+        static_cast<U>(rest_size) / static_cast<U>(rest_size_minus_one);
 
-    Eigen::Tensor<T, 1, Eigen::RowMajor> mean(depth);
-    Eigen::Tensor<T, 1, Eigen::RowMajor> variance(depth);
+    Eigen::Tensor<U, 1, Eigen::RowMajor> mean(depth);
+    Eigen::Tensor<U, 1, Eigen::RowMajor> variance(depth);
     if (is_training) {
       mean.device(d) = (x_rest_by_depth.sum(reduce_dims) * rest_size_inv);
       batch_mean.device(d) = mean;
@@ -129,7 +124,7 @@ struct FusedBatchNorm<CPUDevice, T, U> {
     auto x_shifted =
         x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec);
 
-    y.reshape(rest_by_depth).device(d) = x_shifted;
+    y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>();
   }
 };
 
@@ -138,7 +133,7 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
   void operator()(OpKernelContext* context, const Tensor& y_backprop_input,
                   const Tensor& x_input, const Tensor& scale_input,
                   const Tensor& mean_input, const Tensor& variance_input,
-                  T epsilon, Tensor* x_backprop_output,
+                  U epsilon, Tensor* x_backprop_output,
                   Tensor* scale_backprop_output, Tensor* offset_backprop_output,
                   TensorFormat tensor_format) {
     OP_REQUIRES(context, tensor_format == FORMAT_NHWC,
@@ -147,12 +142,12 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
     typename TTypes<T, 4>::ConstTensor y_backprop(
         y_backprop_input.tensor<T, 4>());
     typename TTypes<T, 4>::ConstTensor x(x_input.tensor<T, 4>());
-    typename TTypes<T>::ConstVec scale(scale_input.vec<T>());
-    typename TTypes<T>::ConstVec mean(mean_input.vec<T>());
-    typename TTypes<T>::ConstVec variance(variance_input.vec<T>());
+    typename TTypes<U>::ConstVec scale(scale_input.vec<U>());
+    typename TTypes<U>::ConstVec mean(mean_input.vec<U>());
+    typename TTypes<U>::ConstVec variance(variance_input.vec<U>());
     typename TTypes<T, 4>::Tensor x_backprop(x_backprop_output->tensor<T, 4>());
-    typename TTypes<T>::Vec scale_backprop(scale_backprop_output->vec<T>());
-    typename TTypes<T>::Vec offset_backprop(offset_backprop_output->vec<T>());
+    typename TTypes<U>::Vec scale_backprop(scale_backprop_output->vec<U>());
+    typename TTypes<U>::Vec offset_backprop(offset_backprop_output->vec<U>());
 
     // Note: the following formulas are used to compute the gradients for
     // back propagation.
@@ -181,8 +176,8 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
     bcast_spec.set(0, rest_size);
 #endif
 
-    auto x_rest_by_depth = x.reshape(rest_by_depth);
-    T rest_size_inv = static_cast<T>(1.0f / static_cast<T>(rest_size));
+    auto x_rest_by_depth = x.reshape(rest_by_depth).template cast<U>();
+    U rest_size_inv = static_cast<U>(1.0f / static_cast<U>(rest_size));
 
     auto x_mean_rest_by_depth =
         mean.reshape(one_by_depth).broadcast(bcast_spec);
@@ -192,7 +187,8 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
         coef0.eval().reshape(one_by_depth).broadcast(bcast_spec);
     auto x_scaled = x_centered * coef0_rest_by_depth;
 
-    auto y_backprop_rest_by_depth = y_backprop.eval().reshape(rest_by_depth);
+    auto y_backprop_rest_by_depth =
+        y_backprop.eval().reshape(rest_by_depth).template cast<U>();
     scale_backprop.device(d) =
         (y_backprop_rest_by_depth * x_scaled).sum(reduce_dims);
     auto y_backprop_sum = y_backprop_rest_by_depth.sum(reduce_dims);
@@ -214,7 +210,7 @@ struct FusedBatchNormGrad<CPUDevice, T, U> {
                      .reshape(one_by_depth)
                      .broadcast(bcast_spec);
     x_backprop.reshape(rest_by_depth).device(d) =
-        coef1 * (y_backprop_centered - x_centered * coef2);
+        (coef1 * (y_backprop_centered - x_centered * coef2)).template cast<T>();
   }
 };
 
@@ -689,6 +685,18 @@ REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2")
                             .TypeConstraint<float>("U"),
                         FusedBatchNormGradOp<CPUDevice, float, float>);
 
+REGISTER_KERNEL_BUILDER(Name("FusedBatchNormV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Eigen::half>("T")
+                            .TypeConstraint<float>("U"),
+                        FusedBatchNormOp<CPUDevice, Eigen::half, float>);
+
+REGISTER_KERNEL_BUILDER(Name("FusedBatchNormGradV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Eigen::half>("T")
+                            .TypeConstraint<float>("U"),
+                        FusedBatchNormGradOp<CPUDevice, Eigen::half, float>);
+
 #if GOOGLE_CUDA
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.h b/tensorflow/core/kernels/fused_batch_norm_op.h
index 38b24d7011b450a2717cbefba9d0396ba4366f0f..3af104bf954257b260215d6a79b0a365227d7b23 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.h
+++ b/tensorflow/core/kernels/fused_batch_norm_op.h
@@ -92,26 +92,28 @@ struct FusedBatchNormFreezeGrad {
     // offset_backprop  = sum(y_backprop)
     // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var + epsilon))
     // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
-    offset_backprop.device(d) = y_backprop.reshape(rest_by_depth)
-                                    .template cast<U>()
-                                    .sum(reduction_axis);
+
+    auto y_backprop_rest_by_depth =
+        y_backprop.reshape(rest_by_depth).template cast<U>();
+    auto input_rest_by_depth = input.reshape(rest_by_depth).template cast<U>();
+
+    offset_backprop.device(d) = y_backprop_rest_by_depth.sum(reduction_axis);
 
     // scratch1 = rsqrt(pop_var + epsilon)
     scratch1.device(d) = (pop_var + pop_var.constant(epsilon)).rsqrt();
 
     // scratch2 = sum(y_backprop * (x - mean))
     scratch2.device(d) =
-        (y_backprop.reshape(rest_by_depth).template cast<U>() *
-         (input.reshape(rest_by_depth).template cast<U>() -
+        (y_backprop_rest_by_depth *
+         (input_rest_by_depth -
           pop_mean.reshape(one_by_depth).broadcast(rest_by_one)))
             .sum(reduction_axis);
 
     x_backprop.reshape(rest_by_depth).device(d) =
-        (y_backprop.reshape(rest_by_depth).template cast<U>() *
-         ((scratch1 * scale)
-              .eval()
-              .reshape(one_by_depth)
-              .broadcast(rest_by_one)))
+        (y_backprop_rest_by_depth * ((scratch1 * scale)
+                                         .eval()
+                                         .reshape(one_by_depth)
+                                         .broadcast(rest_by_one)))
             .template cast<T>();
     scale_backprop.device(d) = scratch2 * scratch1;
   }
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 5dc74d720ab22e2f2f10baf8309b59661740184f..7e5a9e1ec5aac26706d95646a29539dd0f4be2ed 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -176,10 +176,12 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params,
       PARAMS_CASE(3);
       PARAMS_CASE(4);
       PARAMS_CASE(5);
+      PARAMS_CASE(6);
+      PARAMS_CASE(7);
 #undef PARAMS_CASE
       default:
         return errors::InvalidArgument(
-            "Only indices.shape[-1] values between 1 and 5 "
+            "Only indices.shape[-1] values between 1 and 7 "
             "are currently supported.  Requested rank: ",
             indices_nd);
     }
@@ -218,7 +220,9 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 2); \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 3); \
   DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 4); \
-  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 5);
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 5); \
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 6); \
+  DECLARE_GPU_SPECS_INDEX_NDIM(T, Index, 7);
 
 #define DECLARE_GPU_SPECS(T)         \
   DECLARE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2aec872448ec02581faf95e30844e5e1e80cd277
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_6.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 6
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc b/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9222cb07695cb1c05b12da59b0c0bbc96bebb388
--- /dev/null
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl_7.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 7
+#include "tensorflow/core/kernels/gather_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
index ed5240c20abd247404cb926dd9a455af901c0d7c..b03efc684ffca4abde99b31952983aad5f805ee3 100644
--- a/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/gather_nd_op_gpu.cu.cc
@@ -111,7 +111,9 @@ struct GatherNdSlice<GPUDevice, T, Index, IXDIM> {
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 2); \
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 3); \
   DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 4); \
-  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 5);
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 5); \
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 6); \
+  DEFINE_GPU_SPECS_INDEX_NDIM(T, Index, 7);
 
 #define DEFINE_GPU_SPECS(T)         \
   DEFINE_GPU_SPECS_INDEX(T, int32); \
diff --git a/tensorflow/core/kernels/generate_vocab_remapping_op.cc b/tensorflow/core/kernels/generate_vocab_remapping_op.cc
index 247c1f24577212c2b7cc5bb44ba9a883636389ea..2b97677e3859002bee07f073b1de1a19c806c6ae 100644
--- a/tensorflow/core/kernels/generate_vocab_remapping_op.cc
+++ b/tensorflow/core/kernels/generate_vocab_remapping_op.cc
@@ -41,6 +41,8 @@ class GenerateVocabRemappingOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->GetAttr("new_vocab_offset", &new_vocab_offset_));
     OP_REQUIRES_OK(context, context->GetAttr("num_new_vocab", &num_new_vocab_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("old_vocab_size", &old_vocab_size_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -92,16 +94,14 @@ class GenerateVocabRemappingOp : public OpKernel {
     lookup::HashTable<string, int64>* old_vocab_table =
         new lookup::HashTable<string, int64>(context, this);
     core::ScopedUnref unref_old(old_vocab_table);
-    // Note: we pass -1 (unknown) for vocab_size, which is supposed to be the
-    // total elements in file.  This is different from num_new_vocab_, which
-    // accounts for partitioning.
-    OP_REQUIRES_OK(context, lookup::InitializeTableFromTextFile(
-                                old_vocab_filename,
-                                -1,  // vocab_size
-                                kUnusedLookupDelim,
-                                -2,  // key_index, use the whole line/token.
-                                -1,  // value_index, use the line number.
-                                context->env(), old_vocab_table));
+    // Note: If old_vocab_size_ is -1 (unknown), we retrieve all elements in
+    // file (see TextFileLineIterator).
+    OP_REQUIRES_OK(context,
+                   lookup::InitializeTableFromTextFile(
+                       old_vocab_filename, old_vocab_size_, kUnusedLookupDelim,
+                       -2,  // key_index, use the whole line/token.
+                       -1,  // value_index, use the line number.
+                       context->env(), old_vocab_table));
 
     // Fill out new_ids = [new_vocab_offset, new_vocab_offset + 1, ...,
     //                     new_vocab_offset + num_new_vocab_]
@@ -165,6 +165,7 @@ class GenerateVocabRemappingOp : public OpKernel {
  private:
   int new_vocab_offset_;
   int num_new_vocab_;
+  int old_vocab_size_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("GenerateVocabRemapping").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/group_by_window_dataset_op.cc b/tensorflow/core/kernels/group_by_window_dataset_op.cc
index 8644bcf9b509b7aaf335791b583ad8e82073f471..c70a92d0d6437403c1514cfa777e319769601853 100644
--- a/tensorflow/core/kernels/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/group_by_window_dataset_op.cc
@@ -169,7 +169,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               opts.step_id = CapturedFunction::generate_step_id();
               opts.runner = ctx->runner();
               ScopedStepContainer step_container(
-                  opts.step_id, [this, ctx](const string& name) {
+                  opts.step_id, [this](const string& name) {
                     dataset()
                         ->captured_key_func_->resource_manager()
                         ->Cleanup(name)
@@ -180,8 +180,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               // Run the key function on the input element to identify its
               // group.
               std::vector<Tensor> key_func_output;
-              TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Run(
-                  opts, next_input_element, &key_func_output));
+              TF_RETURN_IF_ERROR(
+                  dataset()->captured_key_func_->RunWithBorrowedArgs(
+                      opts, next_input_element, &key_func_output));
 
               if (key_func_output.size() != 1 ||
                   key_func_output[0].dtype() != DT_INT64 ||
@@ -198,7 +199,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 opts2.step_id = CapturedFunction::generate_step_id();
                 opts2.runner = ctx->runner();
                 ScopedStepContainer step_container2(
-                    opts2.step_id, [this, ctx](const string& name) {
+                    opts2.step_id, [this](const string& name) {
                       dataset()
                           ->captured_window_size_func_->resource_manager()
                           ->Cleanup(name)
@@ -210,7 +211,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 // window size.
                 std::vector<Tensor> window_size_func_output;
                 TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run(
-                    opts2, key_func_output, &window_size_func_output));
+                    opts2, std::move(key_func_output),
+                    &window_size_func_output));
 
                 if (window_size_func_output.size() != 1 ||
                     window_size_func_output[0].dtype() != DT_INT64 ||
@@ -257,7 +259,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         opts.step_id = CapturedFunction::generate_step_id();
         opts.runner = ctx->runner();
         ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
+            opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_reduce_func_->resource_manager()
                   ->Cleanup(name)
@@ -282,8 +284,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
             {std::move(key_arg), std::move(group_dataset_arg)});
         std::vector<Tensor> return_values;
 
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_reduce_func_->Run(opts, args, &return_values));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
+            opts, std::move(args), &return_values));
 
         if (!(return_values.size() == 1 &&
               return_values[0].dtype() == DT_VARIANT &&
diff --git a/tensorflow/core/kernels/guarantee_const_op.cc b/tensorflow/core/kernels/guarantee_const_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de3a2a1148b7e7922a08cfce159fb05ccdb9fe30
--- /dev/null
+++ b/tensorflow/core/kernels/guarantee_const_op.cc
@@ -0,0 +1,47 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace {
+
+// Refer to the Op description for detailed comments.
+class GuaranteeConstOp : public OpKernel {
+ public:
+  explicit GuaranteeConstOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const DataType input_dtype = ctx->input_dtype(0);
+    OP_REQUIRES(ctx, input_dtype != DT_RESOURCE,
+                errors::InvalidArgument(
+                    "Input tensor cannot be a resource variable handle."));
+    const Tensor& input_tensor = ctx->input(0);
+    Tensor* output = nullptr;
+    if (!ctx->forward_input_to_output_with_shape(0, 0, input_tensor.shape(),
+                                                 &output)) {
+      ctx->set_output(0, input_tensor);
+    }
+  }
+
+  bool IsExpensive() override { return false; }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GuaranteeConst").Device(DEVICE_CPU),
+                        GuaranteeConstOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/guarantee_const_op_test.cc b/tensorflow/core/kernels/guarantee_const_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..01461fbb8c22a2bfb9669bef680759ecab324a61
--- /dev/null
+++ b/tensorflow/core/kernels/guarantee_const_op_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+class GuaranteeConstOpTest : public OpsTestBase {
+ protected:
+  Status Init(DataType input_type) {
+    TF_CHECK_OK(NodeDefBuilder("op", "GuaranteeConst")
+                    .Input(FakeInput(input_type))
+                    .Finalize(node_def()));
+    return InitOp();
+  }
+};
+
+TEST_F(GuaranteeConstOpTest, Int32Success_6) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_INT32, TensorShape({6}));
+  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, Int32Success_2_3) {
+  TF_ASSERT_OK(Init(DT_INT32));
+  AddInputFromArray<int32>(TensorShape({2, 3}), {1, 2, 3, 4, 5, 6});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_INT32, TensorShape({2, 3}));
+  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, StringSuccess) {
+  TF_ASSERT_OK(Init(DT_STRING));
+  AddInputFromArray<string>(TensorShape({6}), {"A", "b", "C", "d", "E", "f"});
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_STRING, TensorShape({6}));
+  test::FillValues<string>(&expected, {"A", "b", "C", "d", "E", "f"});
+  test::ExpectTensorEqual<string>(expected, *GetOutput(0));
+}
+
+TEST_F(GuaranteeConstOpTest, ResourceInputError) {
+  TF_ASSERT_OK(Init(DT_RESOURCE));
+  AddResourceInput("", "resource", new Var(DT_INT32));
+  const auto status = RunOpKernel();
+  ASSERT_EQ(error::INVALID_ARGUMENT, status.code());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/ignore_errors_dataset_op.cc
index 568e7ade0ef5e4bc9648ffcfdc7e40cdc01d11a0..8cf263d87fed601ed987e5d13909dd433391e5bd 100644
--- a/tensorflow/core/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/ignore_errors_dataset_op.cc
@@ -32,13 +32,14 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    *output = new Dataset(input);
+    *output = new Dataset(ctx, input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    explicit Dataset(const DatasetBase* input) : input_(input) {
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
       input_->Ref();
     }
 
@@ -59,6 +60,15 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "IgnoreErrorsDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -69,16 +79,49 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
-        while (!s.ok()) {
-          out_tensors->clear();
-          s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        {
+          tf_shared_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+          while (!s.ok()) {
+            out_tensors->clear();
+            s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+          }
+        }
+        if (*end_of_sequence) {
+          mutex_lock l(mu_);
+          input_impl_.reset();
         }
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty")))
+          input_impl_.reset();
+        else
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/immutable_constant_op_test.cc b/tensorflow/core/kernels/immutable_constant_op_test.cc
index b318c9c79a5c253a9d243f6dd8fcb698f09fa45e..b3814331ee7f42a63af93cb35e943463724cf5a6 100644
--- a/tensorflow/core/kernels/immutable_constant_op_test.cc
+++ b/tensorflow/core/kernels/immutable_constant_op_test.cc
@@ -147,8 +147,8 @@ Status CreateTempFile(Env* env, float value, uint64 size, string* filename) {
   std::unique_ptr<WritableFile> file;
   TF_RETURN_IF_ERROR(env->NewWritableFile(*filename, &file));
   for (uint64 i = 0; i < size; ++i) {
-    StringPiece sp;
-    sp.set(&value, sizeof(value));
+    StringPiece sp(static_cast<char*>(static_cast<void*>(&value)),
+                   sizeof(value));
     TF_RETURN_IF_ERROR(file->Append(sp));
   }
   TF_RETURN_IF_ERROR(file->Close());
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 01ae5a83c1eec9eb4ccb74841555b5bb1b6cd60f..7728ba850c94aa79feb31d137712692df0f89176 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -52,6 +52,7 @@ Status DoParallelConcat(const CPUDevice& d, const Tensor& value, int32 loc,
     return DoParallelConcatUpdate<CPUDevice, type>(d, value, loc, output);
     TF_CALL_NUMBER_TYPES(CASE);
     TF_CALL_string(CASE);
+    TF_CALL_variant(CASE);
 #undef CASE
     default:
       return errors::InvalidArgument("Unsupported data type: ", value.dtype());
diff --git a/tensorflow/core/kernels/interleave_dataset_op.cc b/tensorflow/core/kernels/interleave_dataset_op.cc
index c01d1c7cbb0c460cd5facf7b2a1b3b8af9abe6bc..833e8cb9c5a8529d827abd00560c18d937b0f0fc 100644
--- a/tensorflow/core/kernels/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/interleave_dataset_op.cc
@@ -73,18 +73,22 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
                                                  std::move(other_arguments),
                                                  &captured_func));
 
-    *output = new Dataset(input, std::move(captured_func), cycle_length,
-                          block_length, output_types_, output_shapes_);
+    *output =
+        new Dataset(ctx, input, func_, std::move(captured_func), cycle_length,
+                    block_length, output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
             int64 block_length, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
           captured_func_(std::move(captured_func)),
           cycle_length_(cycle_length),
           block_length_(block_length),
@@ -110,13 +114,47 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "InterleaveDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node}, {2, cycle_length_node}, {3, block_length_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
-            current_elements_(params.dataset->cycle_length_) {}
+            current_elements_(params.dataset->cycle_length_),
+            args_list_(params.dataset->cycle_length_) {}
 
       void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         block_index_ = 0;
@@ -150,18 +188,19 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
             // We have reached the end of the current element, so move
             // on to the next element in the cycle.
             current_elements_[cycle_index_].reset();
+            args_list_[cycle_index_].clear();
             --num_open_;
             AdvanceToNextInCycle();
           } else if (!end_of_input_) {
             // Get the next element from the input dataset, and create
             // an iterator from it.
-            std::vector<Tensor> args;
-            TF_RETURN_IF_ERROR(
-                input_impl_->GetNext(ctx, &args, &end_of_input_));
+            TF_RETURN_IF_ERROR(input_impl_->GetNext(
+                ctx, &args_list_[cycle_index_], &end_of_input_));
             if (!end_of_input_) {
               TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
-                  ctx, args, cycle_index_, dataset()->captured_func_.get(),
-                  prefix(), &current_elements_[cycle_index_]));
+                  ctx, args_list_[cycle_index_], cycle_index_,
+                  dataset()->captured_func_.get(), prefix(),
+                  &current_elements_[cycle_index_]));
               ++num_open_;
             }
           } else {
@@ -173,11 +212,100 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("cycle_index"), cycle_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_index"), block_index_));
+        if (end_of_input_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("end_of_input"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("num_open"), num_open_));
+        TF_RETURN_IF_ERROR(SaveCurrentElements(writer));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        int64 cycle_index;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("cycle_index"), &cycle_index));
+        cycle_index_ = size_t(cycle_index);
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("block_index"), &block_index_));
+        if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
+        int64 num_open;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("num_open"), &num_open));
+        num_open_ = size_t(num_open);
+        TF_RETURN_IF_ERROR(RestoreCurrentElements(ctx, reader));
+        return Status::OK();
+      }
+
      private:
+      Status SaveCurrentElements(IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        for (int idx = 0; idx < current_elements_.size(); idx++) {
+          if (current_elements_[idx]) {
+            TF_RETURN_IF_ERROR(SaveParent(writer, current_elements_[idx]));
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("args_size[", idx, "]")),
+                args_list_[idx].size()));
+            for (int i = 0; i < args_list_[idx].size(); i++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
+                  args_list_[idx][i]));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreCurrentElements(OpKernelContext* ctx,
+                                    IteratorStateReader* reader)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        IteratorContext::Params params;
+        params.env = ctx->env();
+        params.runner = *(ctx->runner());
+        IteratorContext iter_ctx(std::move(params));
+        for (int idx = 0; idx < current_elements_.size(); idx++) {
+          if (reader->Contains(
+                  full_name(strings::StrCat("args_size[", idx, "]")))) {
+            int64 args_size;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("args_size[", idx, "]")),
+                &args_size));
+            args_list_[idx].resize(args_size);
+            for (int i = 0; i < args_size; i++) {
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
+                  &args_list_[idx][i]));
+            }
+            TF_RETURN_IF_ERROR(dataset::MakeIteratorFromInputElement(
+                &iter_ctx, args_list_[idx], idx,
+                dataset()->captured_func_.get(), prefix(),
+                &current_elements_[idx]));
+            TF_RETURN_IF_ERROR(
+                RestoreParent(ctx, reader, current_elements_[idx]));
+          } else {
+            current_elements_[idx].reset();
+          }
+        }
+        return Status::OK();
+      }
+
       mutex mu_;
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<std::unique_ptr<IteratorBase>> current_elements_
           GUARDED_BY(mu_);
+      std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(mu_);
       size_t cycle_index_ GUARDED_BY(mu_) = 0;
       int64 block_index_ GUARDED_BY(mu_) = 0;
       bool end_of_input_ GUARDED_BY(mu_) = false;
@@ -185,6 +313,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const int64 cycle_length_;
     const int64 block_length_;
diff --git a/tensorflow/core/kernels/iterator_ops.cc b/tensorflow/core/kernels/iterator_ops.cc
index d8bcd09842c51f30e499de7c3a2d58c08036a202..439775157bc936d44845e7b175e62c2fc088e6cf 100644
--- a/tensorflow/core/kernels/iterator_ops.cc
+++ b/tensorflow/core/kernels/iterator_ops.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/dataset.h"
-
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/framework/iterator.pb.h"
@@ -22,7 +20,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/kernels/dataset.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/stats_aggregator.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -79,10 +79,12 @@ Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
 class IteratorResource : public ResourceBase {
  public:
   IteratorResource(const DataTypeVector& output_dtypes,
-                   const std::vector<PartialTensorShape>& output_shapes)
+                   const std::vector<PartialTensorShape>& output_shapes,
+                   const int graph_def_version)
       : iterator_(nullptr),
         output_dtypes_(output_dtypes),
-        output_shapes_(output_shapes) {}
+        output_shapes_(output_shapes),
+        graph_def_version_(graph_def_version) {}
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
@@ -97,10 +99,10 @@ class IteratorResource : public ResourceBase {
     }
   }
 
-  Status Save(IteratorStateWriter* writer) {
+  Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
     if (captured_iterator) {
-      return captured_iterator->Save(writer);
+      return captured_iterator->Save(ctx, writer);
     } else {
       return errors::FailedPrecondition(
           "Save() failed because the iterator has not been initialized. "
@@ -125,8 +127,21 @@ class IteratorResource : public ResourceBase {
     TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
     std::vector<Tensor> outputs;
     GraphRunner graph_runner(ctx->env());
-    TF_RETURN_IF_ERROR(graph_runner.Run(&graph, ctx->function_library(), {},
-                                        {output_node}, &outputs));
+
+    // Build a new FLR that knows about the functions in the graph.
+    std::unique_ptr<FunctionLibraryDefinition> flib_def(
+        new FunctionLibraryDefinition(
+            *ctx->function_library()->GetFunctionLibraryDefinition()));
+    TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
+    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(
+        new ProcessFunctionLibraryRuntime(nullptr, ctx->env(),
+                                          graph_def_version_, flib_def.get(),
+                                          {}, nullptr));
+    FunctionLibraryRuntime* lib =
+        pflr->GetFLR(ProcessFunctionLibraryRuntime::kDefaultFLRDevice);
+
+    TF_RETURN_IF_ERROR(
+        graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
     TF_RETURN_IF_ERROR(set_iterator(dataset->MakeIterator("Iterator")));
@@ -154,6 +169,16 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
+  void set_stats_aggregator(std::shared_ptr<StatsAggregator> stats_aggregator) {
+    mutex_lock l(mu_);
+    stats_aggregator_ = std::move(stats_aggregator);
+  }
+
+  std::shared_ptr<StatsAggregator> stats_aggregator() {
+    tf_shared_lock l(mu_);
+    return stats_aggregator_;
+  }
+
   string DebugString() override { return "Iterator resource"; }
 
   const DataTypeVector& output_dtypes() const { return output_dtypes_; }
@@ -164,8 +189,11 @@ class IteratorResource : public ResourceBase {
 
  private:
   std::shared_ptr<IteratorBase> iterator_;
+  mutex mu_;
+  std::shared_ptr<StatsAggregator> stats_aggregator_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
+  const int graph_def_version_;
 };
 
 // Helper class for reading data from a VariantTensorData object.
@@ -188,6 +216,10 @@ class VariantTensorDataReader : public IteratorStateReader {
     return ReadScalarInternal(key, val);
   }
 
+  Status ReadTensor(StringPiece key, Tensor* val) override {
+    return ReadTensorInternal(key, val);
+  }
+
   bool Contains(StringPiece key) override {
     return map_.find(key.ToString()) != map_.end();
   }
@@ -217,6 +249,14 @@ class VariantTensorDataReader : public IteratorStateReader {
     return Status::OK();
   }
 
+  Status ReadTensorInternal(StringPiece key, Tensor* val) {
+    if (map_.find(key.ToString()) == map_.end()) {
+      return errors::NotFound(key);
+    }
+    *val = data_->tensors(map_[key.ToString()]);
+    return Status::OK();
+  }
+
   std::map<string, size_t> map_;
   const VariantTensorData* data_;  // Not owned.
   Status status_;
@@ -236,6 +276,10 @@ class VariantTensorDataWriter : public IteratorStateWriter {
     return WriteScalarInternal(key, val);
   }
 
+  Status WriteTensor(StringPiece key, const Tensor& val) override {
+    return WriteTensorInternal(key, val);
+  }
+
   // Writes the metadata to `data_`.
   Status Flush() {
     string metadata;
@@ -249,15 +293,19 @@ class VariantTensorDataWriter : public IteratorStateWriter {
  private:
   template <typename T>
   Status WriteScalarInternal(StringPiece key, const T& val) {
+    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+    val_t.scalar<T>()() = val;
+    return WriteTensorInternal(key, val_t);
+  }
+
+  Status WriteTensorInternal(StringPiece key, const Tensor& val) {
     // Write key to the metadata proto. This gets written to `data_`
     // when `Flush()` is called. We do this lazily to avoid multiple
     // serialization calls.
     metadata_proto_.add_keys(key.ToString());
 
     // Update tensors.
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    *(data_->add_tensors()) = std::move(val_t);
+    *(data_->add_tensors()) = val;
     return Status::OK();
   }
 
@@ -299,11 +347,12 @@ class IteratorStateVariant {
   }
   // Initializes this object with the current state of the iterator so
   // that it can be written on the next call to Encode().
-  Status InitializeFromIterator(IteratorResource* iterator_resource) {
+  Status InitializeFromIterator(OpKernelContext* ctx,
+                                IteratorResource* iterator_resource) {
     data_.reset(new VariantTensorData());
     data_->set_type_name(TypeName());
     VariantTensorDataWriter writer(data_.get());
-    TF_RETURN_IF_ERROR(iterator_resource->Save(&writer));
+    TF_RETURN_IF_ERROR(iterator_resource->Save(ctx, &writer));
     TF_RETURN_IF_ERROR(writer.Flush());
     return Status::OK();
   }
@@ -355,7 +404,8 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(IteratorStateVariant,
 class IteratorHandleOp : public ResourceOpKernel<IteratorResource> {
  public:
   explicit IteratorHandleOp(OpKernelConstruction* ctx)
-      : ResourceOpKernel<IteratorResource>(ctx) {
+      : ResourceOpKernel<IteratorResource>(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_dtypes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
   }
@@ -363,7 +413,8 @@ class IteratorHandleOp : public ResourceOpKernel<IteratorResource> {
  private:
   Status CreateResource(IteratorResource** ret) override
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    *ret = new IteratorResource(output_dtypes_, output_shapes_);
+    *ret = new IteratorResource(output_dtypes_, output_shapes_,
+                                graph_def_version_);
     return Status::OK();
   }
 
@@ -378,6 +429,7 @@ class IteratorHandleOp : public ResourceOpKernel<IteratorResource> {
  private:
   DataTypeVector output_dtypes_;
   std::vector<PartialTensorShape> output_shapes_;
+  const int graph_def_version_;
 };
 
 class MakeIteratorOp : public OpKernel {
@@ -407,8 +459,6 @@ class ToSingleElementOp : public OpKernel {
 
     IteratorContext::Params params;
     params.env = ctx->env();
-    params.step_id = ctx->step_id();
-    params.resource_manager = ctx->resource_manager();
     params.runner = *(ctx->runner());
     IteratorContext iter_ctx(std::move(params));
 
@@ -442,7 +492,8 @@ class OneShotIteratorOp : public AsyncOpKernel {
             ctx->env(), ThreadOptions(),
             strings::StrCat("one_shot_iterator_initialization_thread_",
                             SanitizeThreadSuffix(name())),
-            1 /* num_threads */, false /* low_latency_hint */))
+            1 /* num_threads */, false /* low_latency_hint */)),
+        graph_def_version_(ctx->graph_def_version())
 
   {
     string shared_name;
@@ -526,7 +577,8 @@ class OneShotIteratorOp : public AsyncOpKernel {
         ctx->resource_manager()->LookupOrCreate<IteratorResource>(
             cinfo->container(), cinfo->name(), iterator,
             [this](IteratorResource** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-              *ret = new IteratorResource(output_dtypes_, output_shapes_);
+              *ret = new IteratorResource(output_dtypes_, output_shapes_,
+                                          graph_def_version_);
               return Status::OK();
             }));
 
@@ -584,7 +636,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
     return Status::OK();
   }
 
-  void ProduceOutput(OpKernelContext* ctx, DoneCallback done) {
+  void ProduceOutput(OpKernelContext* ctx, const DoneCallback& done) {
     Tensor* handle;
     OP_REQUIRES_OK_ASYNC(ctx, ctx->allocate_output(0, TensorShape({}), &handle),
                          done);
@@ -616,6 +668,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
   Status initialization_status_ GUARDED_BY(mu_);
   std::vector<std::pair<OpKernelContext*, DoneCallback>> done_callbacks_
       GUARDED_BY(mu_);
+  const int graph_def_version_;
 };
 
 class IteratorGetNextOp : public AsyncOpKernel {
@@ -644,8 +697,9 @@ class IteratorGetNextOp : public AsyncOpKernel {
 
       IteratorContext::Params params;
       params.env = ctx->env();
-      params.step_id = ctx->step_id();
-      params.resource_manager = ctx->resource_manager();
+      params.stats_aggregator_getter = [iterator]() {
+        return iterator->stats_aggregator();
+      };
       params.runner = *(ctx->runner());
       IteratorContext iter_ctx(std::move(params));
 
@@ -767,11 +821,11 @@ class SerializeIteratorOp : public OpKernel {
     IteratorResource* iterator_resource;
     OP_REQUIRES_OK(
         ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
-    iterator_resource->Unref();
+    core::ScopedUnref unref_iterator(iterator_resource);
     Tensor* variant_t;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &variant_t));
     IteratorStateVariant v;
-    OP_REQUIRES_OK(ctx, v.InitializeFromIterator(iterator_resource));
+    OP_REQUIRES_OK(ctx, v.InitializeFromIterator(ctx, iterator_resource));
     variant_t->scalar<Variant>()() = v;
   }
 };
@@ -797,6 +851,31 @@ class DeserializeIteratorOp : public OpKernel {
   }
 };
 
+class IteratorSetStatsAggregatorOp : public OpKernel {
+ public:
+  explicit IteratorSetStatsAggregatorOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    IteratorResource* iterator_resource;
+    OP_REQUIRES_OK(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator_resource));
+    core::ScopedUnref unref_iterator(iterator_resource);
+
+    StatsAggregatorResource* stats_aggregator_resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 1),
+                                       &stats_aggregator_resource));
+    core::ScopedUnref unref_stats_aggregator(stats_aggregator_resource);
+    // TODO(mrry): Consider allowing multiple StatsAggregator ops to
+    // subscribe to updates, and/or unsubscribing.
+    OP_REQUIRES(ctx, !iterator_resource->stats_aggregator(),
+                errors::FailedPrecondition(
+                    "Iterator already associated with a StatsAggregator"));
+    iterator_resource->set_stats_aggregator(
+        stats_aggregator_resource->stats_aggregator());
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
                         MakeIteratorOp);
@@ -814,6 +893,8 @@ REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
 REGISTER_KERNEL_BUILDER(Name("DeserializeIterator").Device(DEVICE_CPU),
                         DeserializeIteratorOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorSetStatsAggregator").Device(DEVICE_CPU),
+                        IteratorSetStatsAggregatorOp);
 
 }  // namespace
 
diff --git a/tensorflow/core/kernels/lmdb_reader_op.cc b/tensorflow/core/kernels/lmdb_reader_op.cc
index 3bb07301b5adc8a2a30990fbf2dff24c70705d63..31a427f2c90ad8a321d6004bf7ef85772d8e951f 100755
--- a/tensorflow/core/kernels/lmdb_reader_op.cc
+++ b/tensorflow/core/kernels/lmdb_reader_op.cc
@@ -36,7 +36,7 @@ class LMDBReader : public ReaderBase {
 
   Status OnWorkStartedLocked() override {
     MDB_CHECK(mdb_env_create(&mdb_env_));
-    int flags = MDB_RDONLY | MDB_NOTLS;
+    int flags = MDB_RDONLY | MDB_NOTLS | MDB_NOLOCK;
 
     // Check if the LMDB filename is actually a file instead of a directory.
     // If so, set appropriate flags so we can open it.
@@ -57,10 +57,13 @@ class LMDBReader : public ReaderBase {
     if (mdb_env_ != nullptr) {
       if (mdb_cursor_) {
         mdb_cursor_close(mdb_cursor_);
+        mdb_cursor_ = nullptr;
       }
-      mdb_txn_abort(mdb_txn_);
       mdb_dbi_close(mdb_env_, mdb_dbi_);
+      mdb_txn_abort(mdb_txn_);
       mdb_env_close(mdb_env_);
+      mdb_txn_ = nullptr;
+      mdb_dbi_ = 0;
       mdb_env_ = nullptr;
     }
     return Status::OK();
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e774c771b8e28c1a3c19cfafb6e7597c81e4eb5c..418d9dcc610c98bb1e7135b29d929fd17478fcd1 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -823,6 +823,7 @@ REGISTER_KERNEL(int64, int64);
 REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
 REGISTER_KERNEL(string, bool);
+REGISTER_KERNEL(int32, int32);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/map_and_batch_dataset_op.cc
index 620efdb7781e677c94af4946033e02955ee412f3..65a3ee2a64ae2a05166020d303a87d4095bf5f84 100644
--- a/tensorflow/core/kernels/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/map_and_batch_dataset_op.cc
@@ -14,13 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/captured_function.h"
 #include "tensorflow/core/kernels/dataset.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tracing.h"
@@ -239,48 +239,67 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         // to unblock a consumer.
         FunctionLibraryRuntime::Options opts;
         opts.step_id = CapturedFunction::generate_step_id();
-        ScopedStepContainer* step_container = new ScopedStepContainer(
-            opts.step_id, [this, ctx](const string& name) {
+        ScopedStepContainer* step_container =
+            new ScopedStepContainer(opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_func_->resource_manager()
                   ->Cleanup(name)
                   .IgnoreError();
             });
         opts.step_container = step_container;
-        opts.runner = ctx->runner();
-        dataset()->captured_func_->RunAsync(
-            opts, input_element, &result->return_values,
-            [this, result, step_container, batch_result,
-             offset](Status ret_status) {
-              delete step_container;
-              result->status.Update(ret_status);
-              if (ret_status.ok()) {
-                EnsureOutputAllocated(batch_result, result->return_values);
-                const size_t num_components = result->return_values.size();
-                for (size_t i = 0; i < num_components; ++i) {
-                  Tensor tensor = result->return_values[i];
-                  Tensor* batch = &(batch_result->output)[i];
-                  if (tensor.NumElements() !=
-                      (batch->NumElements() / batch->dim_size(0))) {
-                    TensorShape batch_shape = batch->shape();
-                    batch_shape.RemoveDim(0);
-                    result->status.Update(errors::InvalidArgument(
-                        "Cannot add tensor to the batch: number of "
-                        "elements does not match. Shapes are: [tensor]: ",
-                        tensor.shape().DebugString(),
-                        ", [batch]: ", batch_shape.DebugString()));
-                    break;
-                  }
-                  Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                      *dataset()->device_, tensor, offset, batch);
-                  if (!copy_status.ok()) {
-                    result->status.Update(copy_status);
-                    break;
-                  }
-                }
-              }
-              batch_result->counter->DecrementCount();
-            });
+        std::function<void(std::function<void()>)>* runner =
+            new std::function<void(std::function<void()>)>(*ctx->runner());
+        opts.runner = runner;
+        (*ctx->runner())(std::bind(
+            [=](std::vector<Tensor> input_element) {
+              dataset()->captured_func_->RunAsync(
+                  opts, std::move(input_element), &result->return_values,
+                  [this, step_container, runner, result, batch_result,
+                   offset](Status ret_status) {
+                    delete step_container;
+                    delete runner;
+                    result->status.Update(ret_status);
+                    if (ret_status.ok()) {
+                      EnsureOutputAllocated(batch_result,
+                                            result->return_values);
+                      const size_t num_components =
+                          result->return_values.size();
+                      for (size_t i = 0; i < num_components; ++i) {
+                        const Tensor& tensor = result->return_values[i];
+                        Tensor* batch = &(batch_result->output)[i];
+                        if (tensor.NumElements() !=
+                            (batch->NumElements() / batch->dim_size(0))) {
+                          TensorShape batch_shape = batch->shape();
+                          batch_shape.RemoveDim(0);
+                          result->status.Update(errors::InvalidArgument(
+                              "Cannot add tensor to the batch: number of "
+                              "elements does not match. Shapes are: [tensor]: ",
+                              tensor.shape().DebugString(),
+                              ", [batch]: ", batch_shape.DebugString()));
+                          break;
+                        }
+                        // TODO(mrry): Add a version of DoParallelConcat that
+                        // allows us to move `tensor` where possible, to speed
+                        // up string tensor batching.
+                        Status copy_status =
+                            ::tensorflow::functor::DoParallelConcat(
+                                *dataset()->device_, tensor, offset, batch);
+                        if (!copy_status.ok()) {
+                          result->status.Update(copy_status);
+                          break;
+                        }
+                      }
+                    }
+                    // NOTE(mrry): We clear the return values here to release
+                    // any memory associated with them and to paralellize the
+                    // destruction of the tensors (which can be surprisingly
+                    // expensive for map functions with large numbers of return
+                    // values).
+                    result->return_values.clear();
+                    batch_result->counter->DecrementCount();
+                  });
+            },
+            std::move(input_element)));
       }
 
       void StartInvocationBatch(IteratorContext* ctx, int64 batch_index)
@@ -297,7 +316,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         for (size_t i = 0; i < dataset()->batch_size_; ++i) {
           size_t index = ComputeInvocationIndex(batch_index, i);
           InvocationResult* result = &invocation_results_[index];
-          *result = InvocationResult();
+          // Reset the state of `result`.
+          // NOTE(mrry): `result->return_values` were cleared when the previous
+          // invocation completed.
+          result->status = Status::OK();
         }
         // Start individual invocations.
         for (size_t i = 0; i < dataset()->batch_size_; ++i) {
diff --git a/tensorflow/core/kernels/map_dataset_op.cc b/tensorflow/core/kernels/map_dataset_op.cc
index ac458701fe2f4e20dae0d2eb908b330b4551d537..f1b44beb7d9a30c0711e76fe8be110b2a91e2ca0 100644
--- a/tensorflow/core/kernels/map_dataset_op.cc
+++ b/tensorflow/core/kernels/map_dataset_op.cc
@@ -53,18 +53,21 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
                                                  std::move(other_arguments),
                                                  &captured_func));
 
-    *output = new Dataset(input, std::move(captured_func), output_types_,
-                          output_shapes_);
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          output_types_, output_shapes_);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input,
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : input_(input),
+        : GraphDatasetBase(ctx),
+          input_(input),
+          func_(func),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
           output_shapes_(output_shapes) {
@@ -88,6 +91,37 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "MapDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+
+      DataTypeVector other_arguments_types(
+          captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments(
+          captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
+          {std::make_pair(1, other_arguments)},         // Tensor list inputs.
+          {std::make_pair("f", f),
+           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -112,7 +146,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         FunctionLibraryRuntime::Options opts;
         opts.step_id = CapturedFunction::generate_step_id();
         ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
+            opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_func_->resource_manager()
                   ->Cleanup(name)
@@ -122,7 +156,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         opts.runner = ctx->runner();
         // TODO(mrry): Avoid blocking a threadpool thread. We will need to
         // stack-rip the iterators and use async kernels.
-        Status s = dataset()->captured_func_->Run(opts, args, out_tensors);
+        Status s =
+            dataset()->captured_func_->Run(opts, std::move(args), out_tensors);
         if (errors::IsOutOfRange(s)) {
           // `f` may deliberately raise `errors::OutOfRange` to indicate
           // that we should terminate the iteration early.
@@ -133,11 +168,24 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
      private:
       const std::unique_ptr<IteratorBase> input_impl_;
     };
 
     const DatasetBase* const input_;
+    const NameAttrList func_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 7b5a464b7222fdfc13568d56ea40fd228e22a33e..bdc3b5778f0bc74d7e594ea371d73a113ab781ec 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 
 namespace tensorflow {
 namespace {
@@ -36,16 +37,14 @@ namespace {
 // Partial Ordering Comparator for Tensor keys containing scalar int64's
 struct KeyTensorLess {
   bool operator()(const Tensor& lhs, const Tensor& rhs) const {
-    return std::less<int64>{}(lhs.scalar<int64>()(),
-                              rhs.scalar<int64>()());
+    return std::less<int64>{}(lhs.scalar<int64>()(), rhs.scalar<int64>()());
   }
 };
 
 // Key Equality operator for Tensor keys containing scalar int64's
 struct KeyTensorEqual {
   bool operator()(const Tensor& lhs, const Tensor& rhs) const {
-    return std::equal_to<int64>{}(lhs.scalar<int64>()(),
-                                  rhs.scalar<int64>()());
+    return std::equal_to<int64>{}(lhs.scalar<int64>()(), rhs.scalar<int64>()());
   }
 };
 
@@ -93,24 +92,23 @@ class StagingMap : public ResourceBase {
 
  private:
   // Private variables
-  DataTypeVector dtypes_;
-  std::size_t capacity_;
-  std::size_t memory_limit_;
-  std::size_t current_bytes_;
-  std::mutex mu_;
-  std::condition_variable not_empty_;
-  std::condition_variable full_;
-  IncompleteType incomplete_;
-  MapType map_;
+  DataTypeVector dtypes_ GUARDED_BY(mu_);
+  std::size_t capacity_ GUARDED_BY(mu_);
+  std::size_t memory_limit_ GUARDED_BY(mu_);
+  std::size_t current_bytes_ GUARDED_BY(mu_);
+  tensorflow::mutex mu_;
+  tensorflow::condition_variable not_empty_;
+  tensorflow::condition_variable full_;
+  IncompleteType incomplete_ GUARDED_BY(mu_);
+  MapType map_ GUARDED_BY(mu_);
 
  private:
   // private methods
 
   // If map is configured for bounded capacity, notify
   // waiting inserters that space is now available
-  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
+  void notify_inserters_if_bounded() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     if (has_capacity() || has_memory_limit()) {
-      lock->unlock();
       // Notify all inserters. The removal of an element
       // may make memory available for many inserters
       // to insert new elements
@@ -120,23 +118,29 @@ class StagingMap : public ResourceBase {
 
   // Notify all removers waiting to extract values
   // that data is now available
-  void notify_removers(std::unique_lock<std::mutex>* lock) {
-    lock->unlock();
+  void notify_removers() {
     // Notify all removers. This is because they are
     // waiting for specific keys to appear in the map
     // so we don't know which one to wake up.
     not_empty_.notify_all();
   }
 
-  bool has_capacity() const { return capacity_ > 0; }
+  bool has_capacity() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return capacity_ > 0;
+  }
 
-  bool has_memory_limit() const { return memory_limit_ > 0; }
+  bool has_memory_limit() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return memory_limit_ > 0;
+  }
 
-  bool would_exceed_memory_limit(std::size_t bytes) const {
-    return bytes + current_bytes_ > memory_limit_;
+  bool would_exceed_memory_limit(std::size_t bytes) const
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return has_memory_limit() && bytes + current_bytes_ > memory_limit_;
   }
 
-  bool is_capacity_full() const { return map_.size() >= capacity_; }
+  bool is_capacity_full() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    return has_capacity() && map_.size() >= capacity_;
+  }
 
   // Get number of bytes in the tuple
   std::size_t get_tuple_bytes(const Tuple& tuple) {
@@ -157,7 +161,8 @@ class StagingMap : public ResourceBase {
   }
 
   // Check that the index is within bounds
-  Status check_index(const Tensor& key, std::size_t index) {
+  Status check_index(const Tensor& key, std::size_t index)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     if (index >= dtypes_.size()) {
       return Status(errors::InvalidArgument(
           "Index '", index, "' for key '", key.scalar<int64>()(),
@@ -169,7 +174,7 @@ class StagingMap : public ResourceBase {
 
   Status copy_or_move_tensors(OptionalTuple* map_tuple, const Tensor& key,
                               const Tensor& indices, Tuple* output,
-                              bool copy = false) {
+                              bool copy = false) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     auto findices = indices.flat<int>();
 
     // Return values at specified indices
@@ -201,11 +206,12 @@ class StagingMap : public ResourceBase {
   // Check that the optional value at the specified index
   // is uninitialized
   Status check_index_uninitialized(const Tensor& key, std::size_t index,
-                                   const OptionalTuple& tuple) {
+                                   const OptionalTuple& tuple)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     if (tuple[index].has_value()) {
-      return Status(errors::InvalidArgument("The tensor for index '",
-        index, "' for key '", key.scalar<int64>()(),
-        "' was already initialized '", dtypes_.size(), "'."));
+      return Status(errors::InvalidArgument(
+          "The tensor for index '", index, "' for key '", key.scalar<int64>()(),
+          "' was already initialized '", dtypes_.size(), "'."));
     }
 
     return Status::OK();
@@ -228,7 +234,7 @@ class StagingMap : public ResourceBase {
   }
 
   // Check bytes are within memory limits memory limits
-  Status check_memory_limit(std::size_t bytes) {
+  Status check_memory_limit(std::size_t bytes) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     if (has_memory_limit() && bytes > memory_limit_) {
       return Status(errors::ResourceExhausted(
           "Attempted to insert tensors with combined size of '", bytes,
@@ -241,8 +247,8 @@ class StagingMap : public ResourceBase {
 
   // Insert incomplete data into the Barrier
   Status put_incomplete(const KeyType& key, const Tensor& indices,
-                        OptionalTuple* tuple,
-                        std::unique_lock<std::mutex>* lock) {
+                        OptionalTuple* tuple, tensorflow::mutex_lock* lock)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     auto findices = indices.flat<int>();
 
     // Search for the key in our incomplete set
@@ -252,11 +258,9 @@ class StagingMap : public ResourceBase {
     std::size_t tuple_bytes = get_tuple_bytes(*tuple);
     TF_RETURN_IF_ERROR(check_memory_limit(tuple_bytes));
 
-    if (has_memory_limit()) {
-      full_.wait(*lock, [tuple_bytes, this]() {
-        // Stop waiting if we don't exceed the memory limit
-        return !would_exceed_memory_limit(tuple_bytes);
-      });
+    // Wait until we don't exceed the memory limit
+    while (would_exceed_memory_limit(tuple_bytes)) {
+      full_.wait(*lock);
     }
 
     // This key isn't present in the incomplete set
@@ -282,8 +286,7 @@ class StagingMap : public ResourceBase {
     // Found an entry in the incomplete index
     // Update with given data and insert complete entries
     // into the main map
-    else
-    {
+    else {
       // Reference existing incomplete tuple
       OptionalTuple& present = it->second;
 
@@ -312,7 +315,7 @@ class StagingMap : public ResourceBase {
         // Remove from incomplete
         incomplete_.erase(it);
 
-        TF_RETURN_IF_ERROR(put_complete(key, &insert_tuple, lock));
+        TF_RETURN_IF_ERROR(put_complete(key, &insert_tuple));
       }
     }
 
@@ -320,12 +323,12 @@ class StagingMap : public ResourceBase {
   }
 
   // Does the insertion into the actual staging area
-  Status put_complete(const KeyType& key, OptionalTuple* tuple,
-                      std::unique_lock<std::mutex>* lock) {
+  Status put_complete(const KeyType& key, OptionalTuple* tuple)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
     // Insert key and tuples into the map
     map_.insert({key, std::move(*tuple)});
 
-    notify_removers(lock);
+    notify_removers();
 
     return Status::OK();
   }
@@ -340,7 +343,7 @@ class StagingMap : public ResourceBase {
         current_bytes_(0) {}
 
   Status put(KeyType* key, const Tensor* indices, OptionalTuple* tuple) {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
 
     // Sanity check the indices
     TF_RETURN_IF_ERROR(check_index_ordering(*indices));
@@ -354,22 +357,13 @@ class StagingMap : public ResourceBase {
     // Check that tuple_bytes fits within the memory limit
     TF_RETURN_IF_ERROR(check_memory_limit(tuple_bytes));
 
-    // If map capacity is bounded wait until map is not full
-    if (has_capacity() || has_memory_limit()) {
-      full_.wait(lock, [tuple_bytes, this]() {
-        // If there's a memory limit, check if there's space for insertion
-        bool memory_limit_valid =
-            has_memory_limit() ? !would_exceed_memory_limit(tuple_bytes) : true;
-        // If we're configured for capacity check if there's space for insertion
-        bool capacity_valid = has_capacity() ? !is_capacity_full() : true;
-
-        // Stop waiting upon success for both conditions
-        return memory_limit_valid && capacity_valid;
-      });
+    // Wait until there's space for insertion.
+    while (would_exceed_memory_limit(tuple_bytes) || is_capacity_full()) {
+      full_.wait(lock);
     }
 
     // Do the put operation
-    TF_RETURN_IF_ERROR(put_complete(*key, tuple, &lock));
+    TF_RETURN_IF_ERROR(put_complete(*key, tuple));
 
     // Update the current size
     current_bytes_ += tuple_bytes;
@@ -378,7 +372,7 @@ class StagingMap : public ResourceBase {
   }
 
   Status get(const KeyType* key, const Tensor* indices, Tuple* tuple) {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
 
     // Sanity check the indices
     TF_RETURN_IF_ERROR(check_index_ordering(*indices));
@@ -386,8 +380,9 @@ class StagingMap : public ResourceBase {
     typename MapType::iterator it;
 
     // Wait until the element with the requested key is present
-    not_empty_.wait(
-        lock, [&, this]() { return (it = map_.find(*key)) != map_.end(); });
+    while ((it = map_.find(*key)) == map_.end()) {
+      not_empty_.wait(lock);
+    }
 
     TF_RETURN_IF_ERROR(
         copy_or_move_tensors(&it->second, *key, *indices, tuple, true));
@@ -399,7 +394,7 @@ class StagingMap : public ResourceBase {
   }
 
   Status pop(const KeyType* key, const Tensor* indices, Tuple* tuple) {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
 
     // Sanity check the indices
     TF_RETURN_IF_ERROR(check_index_ordering(*indices));
@@ -407,8 +402,9 @@ class StagingMap : public ResourceBase {
     typename MapType::iterator it;
 
     // Wait until the element with the requested key is present
-    not_empty_.wait(
-        lock, [&, this]() { return (it = map_.find(*key)) != map_.end(); });
+    while ((it = map_.find(*key)) == map_.end()) {
+      not_empty_.wait(lock);
+    }
 
     TF_RETURN_IF_ERROR(
         copy_or_move_tensors(&it->second, *key, *indices, tuple));
@@ -422,19 +418,21 @@ class StagingMap : public ResourceBase {
     // Update bytes in the Staging Area
     current_bytes_ -= get_tuple_bytes(*tuple);
 
-    notify_inserters_if_bounded(&lock);
+    notify_inserters_if_bounded();
 
     return Status::OK();
   }
 
   Status popitem(KeyType* key, const Tensor* indices, Tuple* tuple) {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
 
     // Sanity check the indices
     TF_RETURN_IF_ERROR(check_index_ordering(*indices));
 
     // Wait until map is not empty
-    not_empty_.wait(lock, [this]() { return !this->map_.empty(); });
+    while (this->map_.empty()) {
+      not_empty_.wait(lock);
+    }
 
     // Move from the first element and erase it
 
@@ -454,29 +452,29 @@ class StagingMap : public ResourceBase {
     // Update bytes in the Staging Area
     current_bytes_ -= get_tuple_bytes(*tuple);
 
-    notify_inserters_if_bounded(&lock);
+    notify_inserters_if_bounded();
 
     return Status::OK();
   }
 
   Status clear() {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
     map_.clear();
     incomplete_.clear();
     current_bytes_ = 0;
 
-    notify_inserters_if_bounded(&lock);
+    notify_inserters_if_bounded();
 
     return Status::OK();
   }
 
   std::size_t incomplete_size() {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
     return incomplete_.size();
   }
 
   std::size_t size() {
-    std::unique_lock<std::mutex> lock(mu_);
+    tensorflow::mutex_lock lock(mu_);
     return map_.size();
   }
 
@@ -539,10 +537,9 @@ class MapStageOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("MapStage").Device(DEVICE_CPU),
-                      MapStageOp<false>);
+REGISTER_KERNEL_BUILDER(Name("MapStage").Device(DEVICE_CPU), MapStageOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapStage").Device(DEVICE_CPU),
-                      MapStageOp<true>);
+                        MapStageOp<true>);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(
@@ -553,7 +550,7 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapStage")
                             .HostMemory("indices")
                             .Device(DEVICE_GPU),
                         MapStageOp<true>);
-#endif // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("MapStage")
@@ -601,30 +598,34 @@ class MapUnstageOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapUnstage").Device(DEVICE_CPU),
-                            MapUnstageOp<false>);
+                        MapUnstageOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstage").Device(DEVICE_CPU),
-                            MapUnstageOp<true>);
+                        MapUnstageOp<true>);
 
 #if GOOGLE_CUDA
 REGISTER_KERNEL_BUILDER(Name("MapUnstage")
-                        .HostMemory("key")
-                        .HostMemory("indices")
-                        .Device(DEVICE_GPU), MapUnstageOp<false>);
+                            .HostMemory("key")
+                            .HostMemory("indices")
+                            .Device(DEVICE_GPU),
+                        MapUnstageOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstage")
-                        .HostMemory("key")
-                        .HostMemory("indices")
-                        .Device(DEVICE_GPU), MapUnstageOp<true>);
+                            .HostMemory("key")
+                            .HostMemory("indices")
+                            .Device(DEVICE_GPU),
+                        MapUnstageOp<true>);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("MapUnstage")
-                        .HostMemory("key")
-                        .HostMemory("indices")
-                        .Device(DEVICE_SYCL), MapUnstageOp<false>);
+                            .HostMemory("key")
+                            .HostMemory("indices")
+                            .Device(DEVICE_SYCL),
+                        MapUnstageOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstage")
-                        .HostMemory("key")
-                        .HostMemory("indices")
-                        .Device(DEVICE_SYCL), MapUnstageOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+                            .HostMemory("key")
+                            .HostMemory("indices")
+                            .Device(DEVICE_SYCL),
+                        MapUnstageOp<true>);
+#endif  // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
 class MapPeekOp : public OpKernel {
@@ -682,7 +683,7 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapPeek")
                             .HostMemory("indices")
                             .Device(DEVICE_SYCL),
                         MapPeekOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
 class MapUnstageNoKeyOp : public OpKernel {
@@ -715,7 +716,7 @@ class MapUnstageNoKeyOp : public OpKernel {
                                 " vs. ", indices_tensor->NumElements()));
 
     for (std::size_t i = 0; i < tuple.size(); ++i) {
-      ctx->set_output(i+1, tuple[i]);
+      ctx->set_output(i + 1, tuple[i]);
     }
   }
 };
@@ -749,7 +750,7 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapUnstageNoKey")
                             .HostMemory("indices")
                             .Device(DEVICE_SYCL),
                         MapUnstageNoKeyOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
 class MapSizeOp : public OpKernel {
@@ -770,23 +771,24 @@ class MapSizeOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_CPU),
-                        MapSizeOp<false>);
+REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_CPU), MapSizeOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapSize").Device(DEVICE_CPU),
                         MapSizeOp<true>);
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_GPU)
-                        .HostMemory("size"), MapSizeOp<false>);
-REGISTER_KERNEL_BUILDER(Name("OrderedMapSize").Device(DEVICE_GPU)
-                        .HostMemory("size"), MapSizeOp<true>);
+REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_GPU).HostMemory("size"),
+                        MapSizeOp<false>);
+REGISTER_KERNEL_BUILDER(
+    Name("OrderedMapSize").Device(DEVICE_GPU).HostMemory("size"),
+    MapSizeOp<true>);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_SYCL)
-                        .HostMemory("size"), MapSizeOp<false>);
-REGISTER_KERNEL_BUILDER(Name("OrderedMapSize").Device(DEVICE_SYCL)
-                        .HostMemory("size"), MapSizeOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("MapSize").Device(DEVICE_SYCL).HostMemory("size"),
+                        MapSizeOp<false>);
+REGISTER_KERNEL_BUILDER(
+    Name("OrderedMapSize").Device(DEVICE_SYCL).HostMemory("size"),
+    MapSizeOp<true>);
+#endif  // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
 class MapIncompleteSizeOp : public OpKernel {
@@ -813,17 +815,21 @@ REGISTER_KERNEL_BUILDER(Name("OrderedMapIncompleteSize").Device(DEVICE_CPU),
                         MapIncompleteSizeOp<true>);
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MapIncompleteSize").Device(DEVICE_GPU)
-                        .HostMemory("size"), MapIncompleteSizeOp<false>);
-REGISTER_KERNEL_BUILDER(Name("OrderedMapIncompleteSize").Device(DEVICE_GPU)
-                        .HostMemory("size"), MapIncompleteSizeOp<true>);
+REGISTER_KERNEL_BUILDER(
+    Name("MapIncompleteSize").Device(DEVICE_GPU).HostMemory("size"),
+    MapIncompleteSizeOp<false>);
+REGISTER_KERNEL_BUILDER(
+    Name("OrderedMapIncompleteSize").Device(DEVICE_GPU).HostMemory("size"),
+    MapIncompleteSizeOp<true>);
 #endif
 #ifdef TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("MapIncompleteSize").Device(DEVICE_SYCL)
-                        .HostMemory("size"), MapIncompleteSizeOp<false>);
-REGISTER_KERNEL_BUILDER(Name("OrderedMapIncompleteSize").Device(DEVICE_SYCL)
-                        .HostMemory("size"), MapIncompleteSizeOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(
+    Name("MapIncompleteSize").Device(DEVICE_SYCL).HostMemory("size"),
+    MapIncompleteSizeOp<false>);
+REGISTER_KERNEL_BUILDER(
+    Name("OrderedMapIncompleteSize").Device(DEVICE_SYCL).HostMemory("size"),
+    MapIncompleteSizeOp<true>);
+#endif  // TENSORFLOW_USE_SYCL
 
 template <bool Ordered>
 class MapClearOp : public OpKernel {
@@ -839,14 +845,12 @@ class MapClearOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_CPU),
-                        MapClearOp<false>);
+REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_CPU), MapClearOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapClear").Device(DEVICE_CPU),
                         MapClearOp<true>);
 
 #if GOOGLE_CUDA
-REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_GPU),
-                        MapClearOp<false>);
+REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_GPU), MapClearOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapClear").Device(DEVICE_GPU),
                         MapClearOp<true>);
 #endif
@@ -855,7 +859,7 @@ REGISTER_KERNEL_BUILDER(Name("MapClear").Device(DEVICE_SYCL),
                         MapClearOp<false>);
 REGISTER_KERNEL_BUILDER(Name("OrderedMapClear").Device(DEVICE_SYCL),
                         MapClearOp<true>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matrix_exponential_op.cc b/tensorflow/core/kernels/matrix_exponential_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4cc3f32f7e4a727fa2d9ec3c21a3750111f46392
--- /dev/null
+++ b/tensorflow/core/kernels/matrix_exponential_op.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/unsupported/Eigen/MatrixFunctions"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+
+namespace tensorflow {
+
+template <class Scalar>
+class MatrixExponentialOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit MatrixExponentialOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const ConstMatrixMap& input = inputs[0];
+    if (input.rows() == 0) return;
+    using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    Matrix tmp = input;
+    outputs->at(0) = tmp.exp();
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(MatrixExponentialOp);
+};
+
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<float>), float);
+REGISTER_LINALG_OP("MatrixExponential", (MatrixExponentialOp<double>), double);
+REGISTER_LINALG_OP("MatrixExponential",
+                   (MatrixExponentialOp<complex64>), complex64);
+REGISTER_LINALG_OP("MatrixExponential",
+                   (MatrixExponentialOp<complex128>), complex128);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index e2cf605811f94bb8abe8cd064b4b707f25aaa88b..2eefadad4949fd8d78f6a27533ce0385c38d9c69 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/maxpooling_op.h"
 
 #include <vector>
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -34,9 +33,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/pooling_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/env_var.h"
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -358,6 +359,8 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
 
     use_dnn_ = CanUseCudnn();
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -405,7 +408,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
       DnnPoolingGradOp<T>::Compute(
           context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
           stride, padding_, data_format_, &tensor_in, &tensor_out, out_backprop,
-          output_shape);
+          output_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPoolGrad only supports NHWC format";
@@ -420,6 +423,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
   bool use_dnn_;
+  bool propagate_nans_;
 };
 
 #endif  // GOOGLE_CUDA
@@ -884,6 +888,9 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
     OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
+
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -902,14 +909,15 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
     Tensor* argmax = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));
 
-    LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in,
-                                                  output, argmax);
+    LaunchMaxPoolingWithArgmax<Device, T>::launch(
+        context, params, tensor_in, output, argmax, propagate_nans_);
   }
 
  private:
   std::vector<int32> ksize_;
   std::vector<int32> stride_;
   Padding padding_;
+  bool propagate_nans_;
 };
 
 template <typename Device, typename T>
@@ -1045,6 +1053,9 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
     use_dnn_ = CanUseCudnn();
+
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1068,9 +1079,10 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
-          stride_, padding_, data_format_, tensor_in, out_shape);
+      DnnPoolingOp<T>::Compute(context,
+                               perftools::gputools::dnn::PoolingMode::kMaximum,
+                               ksize_, stride_, padding_, data_format_,
+                               tensor_in, out_shape, propagate_nans_);
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
@@ -1079,7 +1091,7 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                                                            tensor_in, output);
       } else if (data_format_ == FORMAT_NHWC) {
         LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
-                                                  output);
+                                                  output, propagate_nans_);
       } else {
         LOG(FATAL) << "MaxPool currently only supports the following (layout, "
                       "type) combinations: (NHWC, non-qint8), "
@@ -1098,6 +1110,7 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
   bool use_dnn_;
+  bool propagate_nans_;
 };
 
 template <typename T>
@@ -1127,6 +1140,8 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
     }
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     use_dnn_ = CanUseCudnn();
+    TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_MAXPOOL_NANPROP", false,
+                                   &propagate_nans_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -1168,16 +1183,17 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
         ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                         params.out_width, params.depth);
     if (use_dnn_ && data_format_ == FORMAT_NCHW) {
-      DnnPoolingOp<T>::Compute(
-          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize,
-          stride, padding_, data_format_, tensor_in, out_shape);
+      DnnPoolingOp<T>::Compute(context,
+                               perftools::gputools::dnn::PoolingMode::kMaximum,
+                               ksize, stride, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
     } else {
       CHECK(data_format_ == FORMAT_NHWC)
           << "Non-Cudnn MaxPool only supports NHWC format";
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
       LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
-                                                output);
+                                                output, propagate_nans_);
     }
   }
 
@@ -1187,18 +1203,20 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
   bool use_dnn_;
+  bool propagate_nans_;
 };
 
 template <typename T>
 struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
-                     const Tensor& input, Tensor* output) {
+                     const Tensor& input, Tensor* output, bool propagate_nans) {
     bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
         input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
         params.tensor_in_cols, params.depth, params.out_height,
         params.out_width, params.window_rows, params.window_cols,
         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
-        output->flat<T>().data(), nullptr, context->eigen_gpu_device());
+        output->flat<T>().data(), nullptr, context->eigen_gpu_device(),
+        propagate_nans);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolForwardNoMask"));
@@ -1209,7 +1227,8 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
 template <typename T>
 struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
   static void launch(OpKernelContext* context, const PoolParameters& params,
-                     const Tensor& input, Tensor* output, Tensor* argmax) {
+                     const Tensor& input, Tensor* output, Tensor* argmax,
+                     bool propagate_nans) {
     bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()(
         input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
         params.tensor_in_cols, params.depth, params.out_height,
@@ -1217,7 +1236,7 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
         params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
         output->flat<T>().data(),
         reinterpret_cast<int64*>(argmax->flat<int64>().data()),
-        context->eigen_gpu_device());
+        context->eigen_gpu_device(), propagate_nans);
     if (!status) {
       context->SetStatus(
           errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 26f5274804585378984d951abbefda0804c0b8a5..f8daaca4c94aada5dbae5e5582f0da075b7222d5 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -29,6 +29,15 @@ limitations under the License.
 
 namespace tensorflow {
 namespace {
+template <bool propagate_nans, typename dtype>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
+  if (propagate_nans) {
+    return !(a <= b);
+  } else {
+    return a > b;
+  }
+}
+
 // This is Yangqing's custom kernel for the maxpooling operation. There are
 // three functions: MaxPoolForwardNCHW and MaxPoolForwardNHWC are the two
 // forward functions, dealing with the forward case. MaxPoolBackward is the
@@ -51,7 +60,7 @@ namespace {
 // const int output_size = batch * channels * pooled_height * pooled_width;
 // MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
 //                      kThreadsPerBlock, 0, cuda_stream>>>(...);
-template <typename dtype>
+template <bool propagate_nans, typename dtype>
 __global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
                                    const int channels, const int height,
                                    const int width, const int pooled_height,
@@ -77,7 +86,7 @@ __global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = c * height * width + h * width + w;
-        if (bottom_data_n[idx] > maxval) {
+        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
           maxidx = idx;
           maxval = bottom_data_n[idx];
         }
@@ -126,7 +135,7 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
   }
 }
 
-template <typename dtype>
+template <bool propagate_nans, typename dtype>
 __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
                                    const int height, const int width,
                                    const int channels, const int pooled_height,
@@ -153,7 +162,7 @@ __global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = (h * width + w) * channels + c;
-        if (bottom_data_n[idx] > maxval) {
+        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
           maxidx = idx;
           maxval = bottom_data_n[idx];
         }
@@ -390,15 +399,24 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
     const int channels, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, const int pad_t, const int pad_l, T* top_data,
-    int64* mask, const Eigen::GpuDevice& d) {
+    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
-
-  MaxPoolForwardNHWC<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
-                       kThreadsPerBlock, 0, d.stream()>>>(
-      output_size, bottom_data, height, width, channels, pooled_height,
-      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
-      top_data, mask);
+  if (propagate_nans) {
+    MaxPoolForwardNHWC<true>
+        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+           kThreadsPerBlock, 0, d.stream()>>>(
+            output_size, bottom_data, height, width, channels, pooled_height,
+            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+            top_data, mask);
+  } else {
+    MaxPoolForwardNHWC<false>
+        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
+           kThreadsPerBlock, 0, d.stream()>>>(
+            output_size, bottom_data, height, width, channels, pooled_height,
+            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
+            top_data, mask);
+  }
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 34203797cf01af244e656a4078ed226fb9e25d4e..38ebb34248012976346b5f25472a75dfe5575aa3 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -39,7 +39,7 @@ struct MaxPoolForwardWithOptionalArgmax {
                   const int pooled_width, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
                   const int pad_t, const int pad_l, T* top_data, int64* mask,
-                  const Eigen::GpuDevice& d);
+                  const Eigen::GpuDevice& d, bool propagate_nans);
 };
 
 struct MaxPoolForwardNoMask_NCHW_VECT_C {
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index b29feb003242500548d1a4bf83a31c8c2d1c57d0..9fed01189fc3bfde4ad1e23ea8fda0c76311b3bc 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -82,7 +82,7 @@ gemmlowp::WorkersPool* GetWorkersPool() {
 }
 
 mutex& GetMutex() {
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   return mu;
 }
 
diff --git a/tensorflow/core/kernels/mfcc.h b/tensorflow/core/kernels/mfcc.h
index c39f10499091f0b5c6c74a3e70a812169b84c807..0d5d9fb90f8bd137aea5d7f3b8c08dfcd1495c18 100644
--- a/tensorflow/core/kernels/mfcc.h
+++ b/tensorflow/core/kernels/mfcc.h
@@ -33,10 +33,11 @@ class Mfcc {
   bool Initialize(int input_length,
                   double input_sample_rate);
 
-  // Input is a single magnitude spectrogram frame. The input spectrum
-  // is filtered into bands using a triangular mel filterbank and a
-  // discrete cosine transform (DCT) of the values is taken. Output is
-  // populated with the lowest dct_coefficient_count of these values.
+  // Input is a single squared-magnitude spectrogram frame. The input spectrum
+  // is converted to linear magnitude and weighted into bands using a
+  // triangular mel filterbank, and a discrete cosine transform (DCT) of the
+  // values is taken. Output is populated with the lowest dct_coefficient_count
+  // of these values.
   void Compute(const std::vector<double>& spectrogram_frame,
                std::vector<double>* output) const;
 
diff --git a/tensorflow/core/kernels/mfcc_mel_filterbank.h b/tensorflow/core/kernels/mfcc_mel_filterbank.h
index 33ea1bdb5bc3e2a2326913c99f2f6713bd82f096..a766a20cbca4a7772a62a2701334c87a5ed57531 100644
--- a/tensorflow/core/kernels/mfcc_mel_filterbank.h
+++ b/tensorflow/core/kernels/mfcc_mel_filterbank.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Basic class for applying a mel-scale filterbank to an input.
+// Basic class for applying a mel-scale mapping to a power spectrum.
 
 #ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_MFCC_MEL_FILTERBANK_H_
@@ -32,8 +32,9 @@ class MfccMelFilterbank {
                   double lower_frequency_limit,
                   double upper_frequency_limit);
 
-  // Takes a magnitude spectrogram slice as input, computes a
-  // traingular mel filterbank and places the result in output.
+  // Takes a squared-magnitude spectrogram slice as input, computes a
+  // triangular-mel-weighted linear-magnitude filterbank, and places the result
+  // in output.
   void Compute(const std::vector<double>& input,
                std::vector<double>* output) const;
 
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index 935eb81dd05897b49446cc285222a946be3d2931..9aabbbdb6b4d9041ec2d8dffc0cb69199306dba1 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include <numeric>
-
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -29,10 +28,17 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
-namespace tensorflow {
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::stream;
+using mkldnn::sum;
+#endif
 
+namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklAddNOp : public OpKernel {
  public:
@@ -41,17 +47,18 @@ class MklAddNOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     const int num = ctx->num_inputs();
     OP_REQUIRES(ctx, num / 2 == 2,
-                errors::InvalidArgument("Only additions of two arguments "
+                errors::InvalidArgument("Only additions of two tensors "
                                         "supported by MKL. Num inputs: ",
                                         num));
 
     MklAddNOpContext mkl_context;
-    const Tensor& input0 = MklGetInput(ctx, 0);
-    GetMklShape(ctx, 0, &(mkl_context.input1_shape));
+    size_t src1_idx = 0, src2_idx = 1;
+    const Tensor& input0 = MklGetInput(ctx, src1_idx);
+    GetMklShape(ctx, src1_idx, &(mkl_context.input1_shape));
     bool input1_in_mkl_format = mkl_context.input1_shape.IsMklTensor();
 
-    const Tensor& input1 = MklGetInput(ctx, 1);
-    GetMklShape(ctx, 1, &(mkl_context.input2_shape));
+    const Tensor& input1 = MklGetInput(ctx, src2_idx);
+    GetMklShape(ctx, src2_idx, &(mkl_context.input2_shape));
     bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();
 
     // handle the case of a scalar
@@ -59,13 +66,12 @@ class MklAddNOp : public OpKernel {
       const TensorShape& o_shape = input0.shape();
       Tensor* out_tensor = nullptr;
       mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
+      AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
                                 mkl_context.output_shape);
       float user_i1 = (input0.scalar<T>()());
-      ;
       float user_i2 = (input1.scalar<T>()());
-      ;
-      out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
+      out_tensor->scalar<T>()() =
+          std::plus<float>{}(user_i1, user_i2);
       return;
     }
 
@@ -82,8 +88,8 @@ class MklAddNOp : public OpKernel {
       if (o_shape.num_elements() == 0) {
         Tensor* out_tensor = nullptr;
         mkl_context.output_shape.SetMklTensor(false);
-        AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
-                                  mkl_context.output_shape);
+        AllocateOutputSetMklShape(ctx, src1_idx, &out_tensor, o_shape,
+                                 mkl_context.output_shape);
         return;
       }
     }
@@ -92,9 +98,9 @@ class MklAddNOp : public OpKernel {
     mkl_context.in_strides = new size_t[mkl_context.in_dims];
     // Generate size, stride for input if input is in MKL format.
     if (input1_in_mkl_format || input2_in_mkl_format) {
-      const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
-                                          ? &mkl_context.input1_shape
-                                          : &mkl_context.input2_shape;
+      const MklShape* tmp_mkl_shape =
+        (input1_in_mkl_format) ? &mkl_context.input1_shape :
+        &mkl_context.input2_shape;
       for (int i = 0; i < mkl_context.in_dims; i++) {
         mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
         mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
@@ -110,7 +116,6 @@ class MklAddNOp : public OpKernel {
             mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
       }
     }
-
     std::vector<float> coeff(2, 1.0);
     mkl_context.MklCreateInputLayouts(ctx);
     CHECK_EQ(dnnSumCreate_F32(&mkl_context.Eltwise, mkl_context.attributes, 2,
@@ -127,7 +132,7 @@ class MklAddNOp : public OpKernel {
      mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);
 
      mkl_context.output_shape.SetTfLayout(
-         mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
+        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
      if (input1_in_mkl_format == true) {
       mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
       mkl_context.input1_shape.GetTfToMklDimMap());
@@ -139,12 +144,12 @@ class MklAddNOp : public OpKernel {
                         mkl_context.output_shape.GetMklLayout())) /
                     sizeof(T));
 
-     AllocateOutputSetMklShape(ctx, 0, &output, tf_shape,
+     AllocateOutputSetMklShape(ctx, src1_idx, &output, tf_shape,
                               mkl_context.output_shape);
     } else {
      const TensorShape& o_shape = input1.shape();
      mkl_context.output_shape.SetMklTensor(false);
-     AllocateOutputSetMklShape(ctx, 0, &output, o_shape,
+     AllocateOutputSetMklShape(ctx, src1_idx, &output, o_shape,
                                 mkl_context.output_shape);
     }
 
@@ -172,16 +177,18 @@ class MklAddNOp : public OpKernel {
     void MklCreateInputLayouts(OpKernelContext* context) {
       bool input1_in_mkl_format = input1_shape.IsMklTensor();
       if (!input1_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
+            E_SUCCESS);
       } else {
         lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
       }
 
       bool input2_in_mkl_format = input2_shape.IsMklTensor();
       if (!input2_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
+        CHECK_EQ(
+            dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
+            E_SUCCESS);
       } else {
         lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
       }
@@ -257,8 +264,8 @@ class MklAddNOp : public OpKernel {
       bool input2_in_mkl_format = input2_shape.IsMklTensor();
       dnnDelete_F32(Eltwise);
       if (!input1_in_mkl_format || !input2_in_mkl_format) {
-        delete[] in_sizes;
-        delete[] in_strides;
+         delete [] in_sizes;
+         delete [] in_strides;
       }
       if (!input1_in_mkl_format) {
          dnnLayoutDelete_F32(lt_input1);
@@ -270,6 +277,151 @@ class MklAddNOp : public OpKernel {
   } MklAddNOpContext;
 };
 
+#else  // INTEL_MKL_DNN
+template <typename Device, typename T>
+class MklAddNOp : public OpKernel {
+ public:
+  ~MklAddNOp() {}
+  explicit MklAddNOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const int num = ctx->num_inputs();
+    // Only additions of 2 input tensors is supported now
+    OP_REQUIRES(ctx, num / 2 == 2,
+                errors::InvalidArgument("Only additions of two tensors "
+                                        "supported by MKL. Num inputs: ",
+                                        num));
+
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      size_t src1_idx = 0, src2_idx = 1;
+      const Tensor& src1_tensor = MklGetInput(ctx, src1_idx);
+      const Tensor& src2_tensor = MklGetInput(ctx, src2_idx);
+
+      MklDnnShape src1_mkl_shape, src2_mkl_shape;
+      GetMklShape(ctx, src1_idx, &src1_mkl_shape);
+      GetMklShape(ctx, src2_idx, &src2_mkl_shape);
+      bool input1_in_mkl_format = src1_mkl_shape.IsMklTensor();
+      bool input2_in_mkl_format = src2_mkl_shape.IsMklTensor();
+      int src1_dims_size = input1_in_mkl_format?
+       src1_mkl_shape.GetDimension(): src1_tensor.dims();
+      int src2_dims_size = input2_in_mkl_format?
+       src2_mkl_shape.GetDimension(): src2_tensor.dims();
+
+      if (!input1_in_mkl_format && src1_dims_size == 0) {
+         Tensor* dst_tensor = nullptr;
+         MklShape mkl_shape_dst;
+         mkl_shape_dst.SetMklTensor(false);
+         AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+         src1_tensor.shape(), mkl_shape_dst);
+         float user_i1 = (src1_tensor.scalar<T>()());
+         float user_i2 = (src2_tensor.scalar<T>()());
+         dst_tensor->scalar<T>()() =
+           std::plus<float>{}(user_i1, user_i2);
+         return;
+       }
+
+      // If there is nothing to compute, return.
+      if (!input1_in_mkl_format && !input2_in_mkl_format) {
+        if (src1_tensor.shape().num_elements() == 0) {
+           Tensor* dst_tensor = nullptr;
+           MklShape mkl_shape_dst;
+           mkl_shape_dst.SetMklTensor(false);
+           AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor,
+           src1_tensor.shape(), mkl_shape_dst);
+           return;
+        }
+      }
+
+      // element-wise add operator for tensor input1 and tensor input2
+      std::vector<double> coeff(2, 1.0);
+      MklDnnData<T> src1(&cpu_engine);
+      MklDnnData<T> src2(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      int tmp_size = input1_in_mkl_format ? src2_dims_size: src1_dims_size;
+      memory::dims dims(tmp_size);
+      memory::dims strides(tmp_size);
+      memory::desc md1({}, memory::data_undef, memory::format_undef);
+      memory::desc md2({}, memory::data_undef, memory::format_undef);
+
+      if ( input1_in_mkl_format || input2_in_mkl_format ) {
+        if ( input1_in_mkl_format ) {
+          md1 = src1_mkl_shape.GetMklLayout();
+          md2 = md1;
+          dst.SetUsrMem(md1);
+        } else {
+          md2 = src2_mkl_shape.GetMklLayout();
+          md1 = md2;
+          dst.SetUsrMem(md2);
+        }
+      } else {
+         dims = TFShapeToMklDnnDims(src1_tensor.shape());
+         strides = CalculateTFStrides(dims);
+         md1 = MklDnnData<T>::CreateBlockedMemDesc(dims, strides);
+         md2 = md1;
+         dst.SetUsrMem(dims, strides);
+      }
+
+      std::vector<memory::primitive_desc> srcs_pd;
+
+      src1.SetUsrMem(md1, &src1_tensor);
+      auto mpd1 = src1.GetUsrMemPrimDesc();
+      srcs_pd.push_back(mpd1);
+
+      src2.SetUsrMem(md2, &src2_tensor);
+      auto mpd2 = src2.GetUsrMemPrimDesc();
+      srcs_pd.push_back(mpd2);
+
+      std::vector<primitive::at> inputs;
+      inputs.push_back(src1.GetOpMem());
+      inputs.push_back(src2.GetOpMem());
+      auto output_pd = dst.GetUsrMemPrimDesc();
+      Tensor* dst_tensor = nullptr;
+      auto sum_pd = sum::primitive_desc(dst.GetUsrMemDesc(), coeff, srcs_pd);
+      auto sum_op = sum(sum_pd, inputs, dst.GetOpMem());
+      if ( input2_in_mkl_format || input1_in_mkl_format ) {
+         MklDnnShape output_mkl_shape;
+         output_mkl_shape.SetMklTensor(true);
+         output_mkl_shape.SetMklLayout(&output_pd);
+         output_mkl_shape.SetElemType(MklDnnType<T>());
+         if ( input1_in_mkl_format ) {
+          output_mkl_shape.SetTfLayout(src1_dims_size,
+          src1_mkl_shape.GetSizesAsMklDnnDims(),
+          src1_mkl_shape.GetTfDataFormat());
+         } else {
+          output_mkl_shape.SetTfLayout(src2_dims_size,
+          src2_mkl_shape.GetSizesAsMklDnnDims(),
+          src2_mkl_shape.GetTfDataFormat());
+         }
+         TensorShape output_tf_shape;
+         output_tf_shape.AddDim((output_pd.get_size() / sizeof(T))
+         + (output_pd.get_size()%sizeof(T) == 0 ? 0 : 1));
+         AllocateOutputSetMklShape(ctx, src1_idx, &dst_tensor, output_tf_shape,
+                                output_mkl_shape);
+      } else {
+         MklShape mkl_shape_dst;
+         mkl_shape_dst.SetMklTensor(false);
+         AllocateOutputSetMklShape(ctx, src1_idx,
+         &dst_tensor, src1_tensor.shape(), mkl_shape_dst);
+      }
+
+      dst.SetUsrMemDataHandle(dst_tensor);
+      std::vector<primitive> net;
+      net.push_back(sum_op);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) +
+                       ", in file " + string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+      OP_REQUIRES_OK(ctx, errors::Aborted("Operation received an exception:",
+                                            error_msg));
+    }
+  }
+};
+
+#endif
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklAddN")                          \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index d90baee069c17e9b25169dcb2650681f6103f9b1..d751a70fc86b40d8ca656322484848cf906359fd 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -24,10 +24,25 @@
 
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::error;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::padding_kind;
+using mkldnn::engine;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklAvgPoolingOp : public OpKernel {
  public:
@@ -132,7 +147,7 @@ class MklAvgPoolingOp : public OpKernel {
         E_SUCCESS);
 
     mkl_context.MklCleanup();
-  }
+  }  // Compute
 
  private:
   typedef struct {
@@ -411,7 +426,293 @@ class MklAvgPoolingGradOp : public OpKernel {
   std::vector<int32> stride_;
   Padding padding_;
   TensorFormat data_format_;
-};
+};  // MklAvgPoolingGradOp
+
+
+#else  // INTEL_MKL_DNN is defined
+
+template <typename Device, typename T>
+class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+  explicit MklAvgPoolingOp(OpKernelConstruction* context)
+  : MklPoolingForwardOpBase<T>(context) {
+    // Workspace is an MKLDNN construct that is only used in Max Pooling.
+    // So set workspace_enabled_ to false.
+    this->workspace_enabled_ = false;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& input_tensor = MklGetInput(context,
+              this->kInputTensorIndexInput);
+      MklDnnShape dnn_shape_input;
+      GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+      this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> dnn_data_input(&cpu_engine);
+      MklDnnData<T> dnn_data_output(&cpu_engine);
+
+      // initialize variables for the pooling op
+      MklPoolParameters pool_params;
+      // Get the input tensor and initialize the pooling parameters
+      this->ConfigureInput(context, dnn_shape_input,
+                          input_tensor, &pool_params,
+                          &dnn_data_input);
+      OP_REQUIRES_OK(context, context->status());
+
+      // Declare output tensor
+      Tensor* output_tensor = nullptr;
+      memory::dims output_dims_mkl_order;
+      this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If input is in Mkl layout, then just get the memory format from it
+      // directly, instead of using input data_format to AvgPool.
+      if (dnn_shape_input.IsMklTensor()) {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                static_cast<memory::format>(dnn_data_input.GetUsrMemDesc()
+                    .data.format));
+
+      } else {
+          dnn_data_output.SetUsrMem(output_dims_mkl_order,
+              this->data_format_mkldnn_);
+      }
+
+        // describe the memory layout
+      dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      // 3. create a pooling primitive descriptor
+      auto pool_desc = pooling_forward::desc(prop_kind::forward,
+              algorithm::pooling_avg_exclude_padding,
+              dnn_data_input.GetUsrMemDesc(),
+              dnn_data_output.GetUsrMemDesc(),
+              memory::dims({  pool_params.row_stride,
+                              pool_params.col_stride}),
+              memory::dims({  pool_params.window_rows,
+                              pool_params.window_cols}),
+              memory::dims({  static_cast<int>(pool_params.pad_top),
+                              static_cast<int>(pool_params.pad_left)}),
+              memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                              static_cast<int>(pool_params.pad_right)}),
+              TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_prim_desc = pooling_forward::primitive_desc(pool_desc,
+                                                 cpu_engine);
+
+      this->AllocateOutputTensor(context, pool_prim_desc, output_dims_mkl_order,
+                            this->data_format_mkldnn_, &output_tensor);
+      CHECK_NOTNULL(output_tensor);
+
+      OP_REQUIRES_OK(context, context->status());
+      dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+      this->PrepareAndExecuteNet(pool_prim_desc,
+                                &dnn_data_input,
+                                &dnn_data_output);
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+                        ", message: " + string(e.message) +
+                        ", in file " + string(__FILE__) + ":" +
+                        std::to_string(__LINE__);
+        OP_REQUIRES_OK(context,
+                        errors::Aborted("Operation received an exception:",
+                                         error_msg));
+    }
+  }  // Compute
+};  // MklAvgPoolingOp
+
+//-----------------------------------------------------------------------------
+
+template <class Device, class T>
+class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+  explicit MklAvgPoolingGradOp(OpKernelConstruction* context)
+      : MklPoolingBackwardOpBase<T>(context) {
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnShape original_input_mkl_shape, input_gradient_mkl_shape;
+      const Tensor& tensor_in_shape = MklGetInput(context,
+          kInputTensorIndexInputShape);
+      const Tensor& input_gradient_tensor = MklGetInput(context,
+          kInputTensorIndexInputGradient);
+      GetMklShape(context, kInputTensorIndexInputShape,
+            &original_input_mkl_shape);
+      GetMklShape(context, kInputTensorIndexInputGradient,
+            &input_gradient_mkl_shape);
+
+
+      SanityCheckInputs(context, tensor_in_shape,
+                        input_gradient_tensor,
+                        original_input_mkl_shape,
+                        input_gradient_mkl_shape);
+      if (!context->status().ok()) return;
+
+      // Used to allocate output_diff_src/diff_src
+      // and create pool_fwd mdm desc
+      // 0. Input("orig_input_shape: int32") //NOT a T Tensor!
+      // 1. Input("grad: T")
+
+      MklDnnData<T> input_gradient_diff_dst(&cpu_engine);
+      MklDnnData<T> output_diff_src(&cpu_engine);
+      Tensor* output_tensor_diff_src = nullptr;
+      TensorShape original_input_shape;
+      MklPoolParameters pool_params;
+      memory::dims output_dims_mkl_order, original_input_dims_nchw;
+      // Configure the original input memory descriptor
+      memory::desc original_input_md = ConfigureOriginalInput(context,
+                                      tensor_in_shape,
+                                      original_input_mkl_shape,
+                                      &original_input_dims_nchw,
+                                      &pool_params,
+                                      &original_input_shape);
+
+      // configure the original output memory descriptor
+      // by definition, the shape of the original output is the same
+      // as the shape of the gradient diff_dst
+      memory::desc original_output_md = this->ConfigureOriginalOutput(
+                pool_params, input_gradient_mkl_shape, output_dims_mkl_order);
+
+      memory::desc target_diff_dst_md = this->ConfigureInputGradient(
+                                    input_gradient_mkl_shape,
+                                    input_gradient_tensor,
+                                    &input_gradient_diff_dst,
+                                    original_output_md);
+      // The shape of the output diff src needs to be the same shape as the
+      // original input. But we will set its format to be same as the format of
+      // input gradient. We won't use format of original input since it will
+      // always be in Tensorflow layout (given that AvgPoolGrad gets shape of
+      // the input rather than actual input).
+      output_diff_src.SetUsrMem(original_input_dims_nchw,
+                                static_cast<memory::format>(
+                                  target_diff_dst_md.data.format));
+
+      // Create the forward pooling primitive descriptor so we can reference it
+      // in the backward pooling primitive descriptor
+      auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
+              algorithm::pooling_avg_exclude_padding,
+              original_input_md,
+              original_output_md,
+              memory::dims({  pool_params.row_stride,
+                              pool_params.col_stride}),
+              memory::dims({  pool_params.window_rows,
+                              pool_params.window_cols}),
+              memory::dims({  static_cast<int>(pool_params.pad_top),
+                              static_cast<int>(pool_params.pad_left)}),
+              memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                              static_cast<int>(pool_params.pad_right)}),
+              TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_fwd_prim_desc
+              = pooling_forward::primitive_desc(pool_fwd_desc,
+                                                  cpu_engine);
+
+      auto pool_bkwd_desc = pooling_backward::desc(
+              algorithm::pooling_avg_exclude_padding,
+              output_diff_src.GetUsrMemDesc(),
+              target_diff_dst_md,
+              memory::dims({  pool_params.row_stride,
+                              pool_params.col_stride}),
+              memory::dims({  pool_params.window_rows,
+                              pool_params.window_cols}),
+              memory::dims({  static_cast<int>(pool_params.pad_top),
+                              static_cast<int>(pool_params.pad_left)}),
+              memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                              static_cast<int>(pool_params.pad_right)}),
+              TFPaddingToMklDnnPadding(this->padding_));
+      auto pool_bkwd_prim_desc
+                = pooling_backward::primitive_desc(pool_bkwd_desc,
+                                              cpu_engine,
+                                              pool_fwd_prim_desc);
+      this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+                      original_input_dims_nchw,
+                      this->data_format_mkldnn_,
+                      &output_tensor_diff_src);
+
+      output_diff_src.SetUsrMemDataHandle(output_tensor_diff_src);
+
+      this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
+                          &input_gradient_diff_dst,
+                          &output_diff_src,
+                          memory::primitive_desc(
+                              target_diff_dst_md,
+                              cpu_engine));
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                      ", message: " + string(e.message) +
+                      ", in file " + string(__FILE__) + ":" +
+                      std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                      errors::Aborted("Compute received an exception:",
+                                      error_msg));
+    }
+  }  // Compute
+
+ private:
+  // 0. Input("orig_input_shape: int32")
+  // 1. Input("grad: T")
+  const int kInputTensorIndexInputShape = 0;
+  const int kInputTensorIndexInputGradient = 1;
+
+  memory::desc ConfigureOriginalInput(OpKernelContext* context,
+        const Tensor& tensor_original_input_shape,
+        const MklDnnShape& original_input_mkl_shape,
+        memory::dims* original_input_dims_mkl_order,
+        MklPoolParameters* pool_params,
+        TensorShape* input_tensor_shape) {
+    CHECK_NOTNULL(original_input_dims_mkl_order);
+    CHECK_NOTNULL(pool_params);
+    CHECK_NOTNULL(input_tensor_shape);
+    // For AvgPoolGrad, we only get the size of the original input because
+    // The original data is irrelvant.
+    auto shape_vec = tensor_original_input_shape.vec<int32>();
+    for (int64 i = 0; i < tensor_original_input_shape.NumElements(); ++i) {
+      input_tensor_shape->AddDim(shape_vec(i));
+    }
+
+    return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+                                              context,
+                                              tensor_original_input_shape,
+                                              original_input_mkl_shape,
+                                              original_input_dims_mkl_order,
+                                              pool_params,
+                                              *input_tensor_shape);
+}
+
+  void SanityCheckInputs(OpKernelContext* context,
+                        const Tensor& tensor_in_shape,
+                        const Tensor& input_gradient_tensor,
+                        const MklDnnShape& original_input_mkl_shape,
+                        const MklDnnShape& input_gradient_mkl_shape) {
+    if (!original_input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, tensor_in_shape.dims() == 1 &&
+          tensor_in_shape.NumElements() == 4,
+          errors::InvalidArgument("original input shape must be "
+                "1-dimensional and 4 elements"));
+    } else {
+      OP_REQUIRES(context, original_input_mkl_shape.GetDimension() == 1 &&
+          original_input_mkl_shape.DimSize(0) == 4,
+          errors::InvalidArgument("original input shape must be "
+                "1-dimensional and 4 elements"));
+    }
+
+    if (!input_gradient_mkl_shape.IsMklTensor()) {
+      // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
+      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("Gradient shape must be "
+                              "4-dimensional"));
+    } else {
+      OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
+          errors::InvalidArgument("Gradient shape must be "
+                              "4-dimensional"));
+    }
+  }
+};  // MklAvgPoolingGradOp
+
+
+
+#endif  // INTEL_MKL_DNN
 
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Device(DEVICE_CPU)
@@ -427,3 +728,4 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
 
 }  // namespace tensorflow
 #endif  // INTEL_MKL
+
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9fee94f946555480fce8acf904a7909622404524
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -0,0 +1,239 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/math_ops.cc.
+
+// This file uses MKL CBLAS batched xGEMM for acceleration of TF Batch
+// Matrix-Matrix Multiplication (MatMul) operations.
+// We currently register this kernel only for MKL supported data
+// types (float, double, complex64, complex128). The macro INTEL_MKL is defined
+// by the build system only when MKL is chosen as an option at configure stage
+// and when it is undefined at build time, this file becomes an empty
+// compilation unit
+
+#define EIGEN_USE_THREADS
+
+#if defined(INTEL_MKL)
+#include <vector>
+#include "mkl_cblas.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#define MKL_Complex8 tensorflow::complex64
+#define MKL_Complex16 tensorflow::complex128
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename Scalar>
+class BatchMatMulMkl : public OpKernel {
+ public:
+  explicit BatchMatMulMkl(OpKernelConstruction *context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_));
+    OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_));
+  }
+
+  virtual ~BatchMatMulMkl() {}
+
+  void Compute(OpKernelContext *ctx) override {
+    const Tensor &lhs = ctx->input(0);
+    const Tensor &rhs = ctx->input(1);
+    OP_REQUIRES(ctx, lhs.dims() == rhs.dims(),
+                errors::InvalidArgument("lhs and rhs has different ndims: ",
+                                        lhs.shape().DebugString(), " vs. ",
+                                        rhs.shape().DebugString()));
+    const int ndims = lhs.dims();
+    OP_REQUIRES(
+        ctx, ndims >= 2,
+        errors::InvalidArgument("lhs and rhs ndims must be >= 2: ", ndims));
+    TensorShape out_shape;
+    for (int i = 0; i < ndims - 2; ++i) {
+      OP_REQUIRES(ctx, lhs.dim_size(i) == rhs.dim_size(i),
+                  errors::InvalidArgument(
+                      "lhs.dim(", i, ") and rhs.dim(", i,
+                      ") must be the same: ", lhs.shape().DebugString(), " vs ",
+                      rhs.shape().DebugString()));
+      out_shape.AddDim(lhs.dim_size(i));
+    }
+    auto batch_size = (ndims == 2) ? 1 : out_shape.num_elements();
+    auto lhs_rows = lhs.dim_size(ndims - 2);
+    auto lhs_cols = lhs.dim_size(ndims - 1);
+    auto rhs_rows = rhs.dim_size(ndims - 2);
+    auto rhs_cols = rhs.dim_size(ndims - 1);
+    if (adj_x_) std::swap(lhs_rows, lhs_cols);
+    if (adj_y_) std::swap(rhs_rows, rhs_cols);
+    OP_REQUIRES(ctx, lhs_cols == rhs_rows,
+                errors::InvalidArgument(
+                    "lhs mismatch rhs shape: ", lhs_cols, " vs. ", rhs_rows,
+                    ": ", lhs.shape().DebugString(), " ",
+                    rhs.shape().DebugString(), " ", adj_x_, " ", adj_y_));
+    out_shape.AddDim(lhs_rows);
+    out_shape.AddDim(rhs_cols);
+    Tensor *out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+    if (out->NumElements() == 0) {
+      return;
+    }
+    if (lhs.NumElements() == 0 || rhs.NumElements() == 0) {
+      functor::SetZeroFunctor<Device, Scalar> f;
+      f(ctx->eigen_device<Device>(), out->flat<Scalar>());
+      return;
+    }
+
+    auto rhs_reshaped = rhs.template flat_inner_dims<Scalar, 3>();
+    auto lhs_reshaped = lhs.template flat_inner_dims<Scalar, 3>();
+    auto out_reshaped = out->template flat_inner_dims<Scalar, 3>();
+    const uint64 M = lhs_reshaped.dimension(adj_x_ ? 2 : 1);
+    const uint64 K = lhs_reshaped.dimension(adj_x_ ? 1 : 2);
+    const uint64 N = rhs_reshaped.dimension(adj_y_ ? 1 : 2);
+
+    std::vector<MKL_INT> m_array(batch_size, M);
+    std::vector<MKL_INT> n_array(batch_size, N);
+    std::vector<MKL_INT> k_array(batch_size, K);
+    std::vector<MKL_INT> lda_array(batch_size, adj_x_ ? M : K);
+    std::vector<MKL_INT> ldb_array(batch_size, adj_y_ ? K : N);
+    std::vector<MKL_INT> ldc_array(batch_size, N);
+    std::vector<MKL_INT> group_size(1, batch_size);
+    std::vector<const Scalar *> a_array;
+    std::vector<const Scalar *> b_array;
+    std::vector<Scalar *> c_array;
+    a_array.reserve(batch_size);
+    b_array.reserve(batch_size);
+    c_array.reserve(batch_size);
+    for (int64 i = 0; i < batch_size; i++) {
+      a_array.push_back(&lhs_reshaped(i, 0, 0));
+      b_array.push_back(&rhs_reshaped(i, 0, 0));
+      c_array.push_back(&out_reshaped(i, 0, 0));
+    }
+
+    MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, &m_array[0], &n_array[0],
+                      &k_array[0], &a_array[0], &lda_array[0], &b_array[0],
+                      &ldb_array[0], &c_array[0], &ldc_array[0], 1,
+                      &group_size[0]);
+  }
+
+ private:
+  bool adj_x_;
+  bool adj_y_;
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const float **A_Array, const MKL_INT *lda_Array,
+                         const float **B_Array, const MKL_INT *ldb_Array,
+                         float **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_Array(
+        group_size[0], TransA ? CblasTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_Array(
+        group_size[0], TransB ? CblasTrans : CblasNoTrans);
+    std::vector<float> alpha_Array(group_size[0], 1.0);
+    std::vector<float> beta_Array(group_size[0], 0.0);
+    cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], M_Array,
+                      N_Array, K_Array, &alpha_Array[0], A_Array, lda_Array,
+                      B_Array, ldb_Array, &beta_Array[0], C_Array, ldc_Array,
+                      group_count, group_size);
+  }
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const double **A_Array, const MKL_INT *lda_Array,
+                         const double **B_Array, const MKL_INT *ldb_Array,
+                         double **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_array(
+        group_size[0], TransA ? CblasTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_array(
+        group_size[0], TransB ? CblasTrans : CblasNoTrans);
+    std::vector<double> alpha_Array(group_size[0], 1.0);
+    std::vector<double> beta_Array(group_size[0], 0.0);
+    cblas_dgemm_batch(Layout, &TransA_array[0], &TransB_array[0], M_Array,
+                      N_Array, K_Array, &alpha_Array[0], A_Array, lda_Array,
+                      B_Array, ldb_Array, &beta_Array[0], C_Array, ldc_Array,
+                      group_count, group_size);
+  }
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const MKL_Complex8 **A_Array, const MKL_INT *lda_Array,
+                         const MKL_Complex8 **B_Array, const MKL_INT *ldb_Array,
+                         MKL_Complex8 **C_Array, const MKL_INT *ldc_Array,
+                         const MKL_INT group_count, const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_array(
+        group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_array(
+        group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
+    std::vector<MKL_Complex8> alpha_Array(group_size[0], {1.0f, 0.0f});
+    std::vector<MKL_Complex8> beta_Array(group_size[0], {0.0f, 0.0f});
+    cblas_cgemm_batch(
+        Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
+        static_cast<const void *>(&alpha_Array[0]),
+        reinterpret_cast<const void **>(A_Array), lda_Array,
+        reinterpret_cast<const void **>(B_Array), ldb_Array,
+        static_cast<const void *>(&beta_Array[0]),
+        reinterpret_cast<void **>(C_Array), ldc_Array, group_count, group_size);
+  }
+
+  void MklCblasGemmBatch(const CBLAS_LAYOUT Layout, const bool TransA,
+                         const bool TransB, const MKL_INT *M_Array,
+                         const MKL_INT *N_Array, const MKL_INT *K_Array,
+                         const MKL_Complex16 **A_Array,
+                         const MKL_INT *lda_Array,
+                         const MKL_Complex16 **B_Array,
+                         const MKL_INT *ldb_Array, MKL_Complex16 **C_Array,
+                         const MKL_INT *ldc_Array, const MKL_INT group_count,
+                         const MKL_INT *group_size) {
+    std::vector<CBLAS_TRANSPOSE> TransA_array(
+        group_size[0], TransA ? CblasConjTrans : CblasNoTrans);
+    std::vector<CBLAS_TRANSPOSE> TransB_array(
+        group_size[0], TransB ? CblasConjTrans : CblasNoTrans);
+    std::vector<MKL_Complex16> alpha_Array(group_size[0], {1.0f, 0.0f});
+    std::vector<MKL_Complex16> beta_Array(group_size[0], {0.0f, 0.0f});
+    cblas_zgemm_batch(
+        Layout, &TransA_array[0], &TransB_array[0], M_Array, N_Array, K_Array,
+        static_cast<const void *>(&alpha_Array[0]),
+        reinterpret_cast<const void **>(A_Array), lda_Array,
+        reinterpret_cast<const void **>(B_Array), ldb_Array,
+        static_cast<const void *>(&beta_Array[0]),
+        reinterpret_cast<void **>(C_Array), ldc_Array, group_count, group_size);
+  }
+};
+
+#define REGISTER_BATCH_MATMUL_MKL(TYPE)                                 \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("BatchMatMul").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+      BatchMatMulMkl<CPUDevice, TYPE>)
+
+TF_CALL_float(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_double(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL);
+
+}  // end namespace tensorflow
+#endif
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index e6673b2ffb7dc4a2e0127c363b4402c98a023b17..d0175dfd715bcdd2cc89fe8ca5eb7d60410f6562 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -1,11 +1,8 @@
 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -33,11 +30,22 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::concat;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// List of TensorShape objects. Used in Concat/Split layers.
+typedef std::vector<TensorShape> TensorShapeList;
+
 enum AxisArgumentName { NAME_IS_AXIS, NAME_IS_CONCAT_DIM };
 
+
 // TODO(intelft) Check if we can reuse existing EigenConcatOp using Mutable
 // reference inputs.
 // --------------------------------------------------------------------------
@@ -55,6 +63,8 @@ class EigenConcatBaseOp : public OpKernel {
   // we need to have empty Compute because Compute is pure virtual function.
   void Compute(OpKernelContext* c) {}
 
+#ifndef INTEL_MKL_DNN
+
   void Compute(OpKernelContext* c, const std::vector<Tensor>& values) {
     const Tensor* concat_dim_tensor;
     const char* axis_attribute_name =
@@ -139,8 +149,89 @@ class EigenConcatBaseOp : public OpKernel {
       ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
     }
   }
+
+#else  // MKL_DNN
+
+void Compute(OpKernelContext* c, const std::vector<Tensor>& values,
+                        const TensorShapeList& input_shapes) {
+    const Tensor* concat_dim_tensor;
+    const char* axis_attribute_name =
+        AxisArgName == NAME_IS_AXIS
+            ? "axis"
+            : AxisArgName == NAME_IS_CONCAT_DIM ? "concat_dim" : "<invalid>";
+    OP_REQUIRES_OK(c, c->input(axis_attribute_name, &concat_dim_tensor));
+    OP_REQUIRES(c, IsLegacyScalar(concat_dim_tensor->shape()),
+                errors::InvalidArgument(
+                    axis_attribute_name,
+                    " tensor should be a scalar integer, but got shape ",
+                    concat_dim_tensor->shape().DebugString()));
+    const int32 concat_dim =
+        internal::SubtleMustCopy(concat_dim_tensor->scalar<int32>()());
+    // Instead of accessing values from context, we use input to Compute.
+    const int N = values.size();
+    const int input_dims = input_shapes[0].dims();
+    const TensorShape& input_shape = input_shapes[0];
+
+    int32 axis = concat_dim < 0 ? concat_dim + input_dims : concat_dim;
+    OP_REQUIRES(c,
+                (0 <= axis && axis < input_dims) ||
+                    (allow_legacy_scalars() && concat_dim == 0),
+                errors::InvalidArgument(
+                    "ConcatOp : Expected concatenating dimensions in the range "
+                    "[",
+                    -input_dims, ", ", input_dims, "), but got ", concat_dim));
+    // Note that we reduce the concat of n-dimensional tensors into a two
+    // dimensional concat. Assuming the dimensions of any input/output
+    // tensor are {x0, x1,...,xn-1, y0, y1,...,ym-1}, where the concat is along
+    // the dimension indicated with size y0, we flatten it to {x, y}, where y =
+    // Prod_i(yi) and x = ((n > 0) ? Prod_i(xi) : 1).
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(N);
+    int64 inputs_flat_dim0 = 1;
+    for (int d = 0; d < axis; ++d) {
+      inputs_flat_dim0 *= input_shape.dim_size(d);
+    }
+    int64 output_concat_dim = 0;
+    const bool input_is_scalar = IsLegacyScalar(input_shape);
+    for (int i = 0; i < N; ++i) {
+      const auto in = values[i];
+      const bool in_is_scalar = IsLegacyScalar(input_shapes[i]);
+      OP_REQUIRES(
+          c, (input_shapes[i].dims() == input_dims) ||
+              (input_is_scalar && in_is_scalar),
+          errors::InvalidArgument(
+              "ConcatOp : Ranks of all input tensors should match: shape[0] = ",
+              input_shape.DebugString(), " vs. shape[", i,
+              "] = ", input_shapes[i].DebugString()));
+      if (in.NumElements() > 0) {
+        int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0;
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            in.shaped<T, 2>({inputs_flat_dim0, inputs_flat_dim1})));
+      }
+      output_concat_dim += input_shapes[i].dims() > 0 ?
+                           input_shapes[i].dim_size(axis) : 1;
+    }
+
+    TensorShape output_shape(input_shape);
+    if (output_shape.dims() == 0) {
+      output_shape.AddDim(output_concat_dim);
+    } else {
+      output_shape.set_dim(axis, output_concat_dim);
+    }
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() > 0) {
+      int64 output_dim1 = output->NumElements() / inputs_flat_dim0;
+      auto output_flat = output->shaped<T, 2>({inputs_flat_dim0, output_dim1});
+      ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+    }
+  }
+
+#endif
 };
 
+#ifndef INTEL_MKL_DNN
+
 // --------------------------------------------------------------------------
 //                      Mkl Concat Op
 // --------------------------------------------------------------------------
@@ -327,6 +418,7 @@ class MklConcatOp : public OpKernel {
     OP_REQUIRES_OK(context, context->status());
   }
 
+
  private:
   typedef struct {
     TensorFormat data_format;
@@ -435,8 +527,284 @@ class MklConcatOp : public OpKernel {
         mkl_tensor->flat<uint8>().data(),
         mkl_tensor->flat<uint8>().size() * sizeof(uint8));
   }
+
+  // overloading methods with input shapes as a list of TensorShape's
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++) {
+      converted_values.push_back(values[i]);
+    }
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values);
+
+    // Set dummy Mkl tensor as output Mkl tensor for this op.
+    MklShape mkl_tensor_mkl_shape;
+    mkl_tensor_mkl_shape.SetMklTensor(false);
+    mkl_tensor_mkl_shape.SetDimensions(4);
+    Tensor* mkl_tensor = nullptr;
+    TensorShape mkl_tensor_tf_shape;
+    mkl_tensor_tf_shape.AddDim(
+        SIZE_OF_MKL_SERIAL_DATA(mkl_tensor_mkl_shape.GetDimension()));
+    int tf_output_index = 0;
+    context->allocate_output(
+        GetTensorMetaDataIndex(tf_output_index, context->num_outputs()),
+        mkl_tensor_tf_shape, &mkl_tensor);
+    mkl_tensor_mkl_shape.SerializeMklShape(
+        mkl_tensor->flat<uint8>().data(),
+        mkl_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
 };
 
+#else
+
+// --------------------------------------------------------------------------
+//                      Mkl Concat Op
+// --------------------------------------------------------------------------
+
+template <typename Device, typename T, AxisArgumentName AxisArgName>
+class MklConcatOp : public OpKernel {
+ private:
+  TensorFormat data_format_;
+  EigenConcatBaseOp<Device, T, AxisArgName> eigen_concat_op_;
+
+ public:
+  typedef std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>
+      ConstMatrixVector;
+
+  explicit MklConcatOp(OpKernelConstruction* c)
+      : OpKernel(c), eigen_concat_op_(c) {}
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      OpInputList input_tensors;
+      GetMklInputList(context, "values", &input_tensors);
+      const int N = input_tensors.size();
+
+      // Get Tensor shapes.
+      std::vector<MklDnnShape> input_shapes(N);
+      GetMklShapeList(context, "values", &input_shapes);
+
+      const Tensor& concat_dim_tensor = (AxisArgName == NAME_IS_CONCAT_DIM)
+                    ? MklGetInput(context, 0) : MklGetInput(context, N);
+      // Sanity checks
+      OP_REQUIRES(context, IsLegacyScalar(concat_dim_tensor.shape()),
+        errors::InvalidArgument(
+            "Concat dim tensor should be a scalar integer, but got shape ",
+            concat_dim_tensor.shape().DebugString()));
+      int32 concat_dim = internal::SubtleMustCopy(
+                           concat_dim_tensor.scalar<int32>()());
+      if (concat_dim < 0) concat_dim = N + concat_dim;
+
+      // check that ranks of all tensors match
+      // and that their shapes match except for concat_dim.
+      int i = 0;
+      bool invoke_eigen = false;
+      bool are_all_mkl_inputs = true, are_all_tf_inputs = true;
+      const TensorShape expected_shape = input_shapes[0].IsMklTensor() ?
+                                         input_shapes[0].GetTfShape() :
+                                         input_tensors[0].shape();
+      size_t expected_dims = expected_shape.dims();
+      for (auto& s : input_shapes) {
+        if (s == expected_shape) {++i; continue;}
+
+        TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
+                      input_tensors[i].shape();
+        size_t s_dims = s_shape.dims();
+
+        OP_REQUIRES(context, s_dims == expected_dims,
+                  errors::InvalidArgument(
+                      "_MklConcatOp : Ranks of all input tensors should match:"
+                      " input dimensions = ",
+                      s_dims, " vs. expected rank = ", expected_dims));
+
+        for (int d = 0; d < expected_dims; ++d) {
+          if (d == concat_dim) continue;
+
+          size_t expected_size = expected_shape.dim_size(d);
+          size_t s_size = s_shape.dim_size(d);
+          OP_REQUIRES(
+            context, expected_size == s_size,
+            errors::InvalidArgument("_MklConcatOp : Dimensions of inputs "
+                    "should match: shape[0][", d, "]= ", expected_size,
+                    " vs. shape[", i, "][", d, "] = ", s_size));
+        }
+
+        if (s.IsMklTensor())
+          are_all_tf_inputs = false;
+        else
+          are_all_mkl_inputs = false;
+
+        if (s_dims != 4) invoke_eigen = true;
+        ++i;
+      }
+
+      // All inputs are not in one format (TF or MKL). This is mixed input case.
+      // We can potentially optimize this case by converting all TF inputs
+      // to Mkl format. But currently, we fall to Eigen for this case.
+      // It may be possible to convert inputs that in TF format to Mkl
+      // format and avoid calling eigen version.
+      if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
+
+      // Temporary fallback to Eigen until MKLDNN Concat performance
+      // is improved. To be removed.
+      invoke_eigen = true;
+
+      // Call Eigen library
+      if (invoke_eigen) {
+        TensorShapeList tf_input_shapes;
+        i = 0;
+        for (auto& s : input_shapes) {
+          TensorShape s_shape = s.IsMklTensor() ? s.GetTfShape() :
+                                input_tensors[i].shape();
+          tf_input_shapes.push_back(s_shape);
+          ++i;
+        }
+        CallEigenVersion(context, input_tensors, tf_input_shapes);
+        return;
+      }
+
+      memory::dims dst_dims;
+      if (are_all_mkl_inputs)
+        dst_dims = TFShapeToMklDnnDims(input_shapes[0].GetTfShape());
+      else
+        // When all the inputs are in Tensorflow format, we don't know
+        // what is the input data format. In that case, we just use
+        // output format that is same as input formats.
+        dst_dims = TFShapeToMklDnnDims(input_tensors[0].shape());
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<MklDnnData<T>> srcs(N, MklDnnData<T>(&cpu_engine));
+      int64 dst_concat_dim_size = 0;
+      for (int k =0; k < N; k++) {
+        bool is_mkl_tensor = input_shapes[k].IsMklTensor();
+        memory::dims src_dims;
+
+        // Same comment as dst_dims for src_dims.
+        src_dims = (is_mkl_tensor) ?
+                   TFShapeToMklDnnDims(input_shapes[k].GetTfShape()) :
+                   TFShapeToMklDnnDims(input_tensors[k].shape());
+
+        dst_concat_dim_size += src_dims[concat_dim];
+        auto src_md = is_mkl_tensor ? input_shapes[k].GetMklLayout() :
+          // It does not matter what data format we use here (NHWC or NCHW).
+          // We just need to ensure that output of Concat uses same data format
+          // as input.
+                  memory::desc(src_dims, MklDnnType<T>(), memory::format::nhwc);
+
+        srcs[k].SetUsrMem(src_md, &input_tensors[k]);
+        auto src_mpd = srcs[k].GetUsrMemPrimDesc();
+        srcs_pd.push_back(src_mpd);
+      }
+      dst_dims[concat_dim] = dst_concat_dim_size;
+
+      MklDnnData<T> dst(&cpu_engine);
+      memory::desc dst_md({}, memory::data_undef, memory::format_undef);
+      memory::dims dst_dims_in_nchw;
+      if (are_all_mkl_inputs) {
+        // Since we are passing a specific format for destination,
+        // we need to have dst_dims in MklDnn order (NCHW).
+        auto orig_tf_format = input_shapes[0].GetTfDataFormat();
+        dst_dims_in_nchw = MklDnnDimsInNCHW(dst_dims,
+                               MklDnnDataFormatToTFDataFormat(orig_tf_format));
+        // We will set the output in the same format as input to avoid layout
+        // conversions.
+        // Currently we are setting dst format same as input format.
+        // See if we can make this choice in a better way.
+        dst_md = memory::desc(dst_dims_in_nchw, MklDnnType<T>(),
+                 (memory::format) input_shapes[0].GetMklLayout().data.format);
+      } else {
+        // Again, format does not matter here. We just need to make it same as
+        // input format.
+        dst_md = memory::desc(dst_dims, MklDnnType<T>(), memory::format::nhwc);
+      }
+
+      std::vector<primitive::at> inputs;
+      for (int k=0; k < input_tensors.size(); k++)
+        inputs.push_back(srcs[k].GetOpMem());
+
+      // If all inputs are in MKL format, then meaning of concat_dim needs to
+      // change. Value of concat_dim is tied to input Tensorflow data format
+      // (NHWC or NCHW). MklDnn dimensions are in NCHW order. So if Tensorflow
+      // tensors are in NCHW order, then concat_dim semantics is preserved.
+      // But ifinput tensors are in NHWC order, then semantics need to change.
+      // E.g., if we are concatinating over Channel (dimension 3 for NHWC),
+      // then since MklDnn order is NCHW, concat_dim needs to be 1.
+      if (are_all_mkl_inputs)
+        concat_dim = input_shapes[0].TfDimIdx(concat_dim);
+
+      auto concat_pd = concat::primitive_desc(dst_md, concat_dim, srcs_pd);
+
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      Tensor* dst_tensor = nullptr;
+      if (are_all_mkl_inputs) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = concat_pd.dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dst_dims.size(), dst_dims_in_nchw,
+                                  input_shapes[0].GetTfDataFormat());
+        tf_shape_dst.AddDim((dst_pd.get_size() / sizeof(T)));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = MklDnnDimsToTFShape(dst_dims);
+      }
+      AllocateOutputSetMklShape(context, 0, &dst_tensor,
+                                tf_shape_dst, dnn_shape_dst);
+      CHECK_NOTNULL(dst_tensor);
+
+      dst_md = dnn_shape_dst.IsMklTensor() ?
+               dnn_shape_dst.GetMklLayout() : dst_md;
+      dst.SetUsrMem(dst_md, dst_tensor);
+
+      auto concat_op = concat(concat_pd, inputs, dst.GetOpMem());
+      std::vector<primitive> net;
+      net.push_back(concat_op);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+               ", message: " + string(e.message) + ", in file " +
+               string(__FILE__) + ":" + std::to_string(__LINE__);
+        OP_REQUIRES_OK(context, errors::Aborted(
+                "Operation received an exception:", error_msg));
+    }
+  }
+
+  void CallEigenVersion(OpKernelContext* context, const OpInputList& values,
+                        const TensorShapeList& input_shapes) {
+    CHECK_EQ(values.size(), input_shapes.size());
+
+    std::vector<Tensor> converted_values;
+    for (int i = 0; i < input_shapes.size(); i++)
+      converted_values.push_back(values[i]);
+
+    // Call Eigen concat.
+    eigen_concat_op_.Compute(context, converted_values, input_shapes);
+
+    // Set output Mkl tensor for this op.
+    MklDnnShape dnn_shape_output;
+    dnn_shape_output.SetMklTensor(false);
+    dnn_shape_output.SetDimensions(4);
+    Tensor* output_tensor = nullptr;
+    TensorShape tf_shape_output;
+    tf_shape_output.AddDim(
+        dnn_shape_output.GetSerializeBufferSize());
+    context->allocate_output(
+        GetTensorMetaDataIndex(0, context->num_outputs()),
+        tf_shape_output, &output_tensor);
+    dnn_shape_output.SerializeMklDnnShape(
+        output_tensor->flat<uint8>().data(),
+        output_tensor->flat<uint8>().size() * sizeof(uint8));
+  }
+};
+
+#endif
+
 /* Use optimized concat for float type only */
 #define REGISTER_MKL_CPU(type)                                              \
   REGISTER_KERNEL_BUILDER(Name("_MklConcat")                                \
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 9080bf7be89c4d194499c75f16aed5ca536b2f48..793fa24d992723c10317b01a70134dcd4d5066db 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -45,13 +45,10 @@ limitations under the License.
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::prop_kind;
 using mkldnn::stream;
-
+using mkldnn::prop_kind;
 using mkldnn::convolution_backward_weights;
-using mkldnn::convolution_direct;
-using mkldnn::convolution_forward;
-
+using mkldnn::memory;
 #endif
 
 namespace tensorflow {
@@ -426,181 +423,229 @@ class MklConv2DCustomBackpropFilterOp : public OpKernel {
   TensorFormat data_format_;
 };
 
+#define REGISTER_MKL_FILTER_KERNELS(T)                              \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+              MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
+#undef REGISTER_MKL_FILTER_KERNELS
+
 #else
 
-template <typename Device, class T>
-class MklConv2DCustomBackpropFilterOp : public OpKernel {
+template <typename Device, class T, bool biasEnabled>
+class MklConv2DCustomBackpropFilterOp :
+  public MklConv2DBackpropCommonOp<Device, T> {
  public:
   explicit MklConv2DCustomBackpropFilterOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
+      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+  ~MklConv2DCustomBackpropFilterOp() {}
 
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ private:
+  void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                         const MklDnnShape& filter_mkl_shape,
+                         const MklDnnShape& obp_mkl_shape) {
+    CHECK(!filter_mkl_shape.IsMklTensor())
+      << "Conv2DBackpropFilter: filter should not be in MKL Layout";
   }
 
-  void Compute(OpKernelContext* context) override {
-    try {
-      auto cpu_engine = engine(engine::cpu, 0);
+  size_t GetInputTensorIndexWithSizes() { return 1; /* filter index */ }
 
-      MklDnnData<T> input(&cpu_engine);
-      MklDnnData<T> outbackprop(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
+  TensorShape MakeInputTfShape(OpKernelContext* context,
+                               const Tensor& input_tensor) {
+    size_t input_idx = 0;
+    return GetTfShape(context, input_idx);
+  }
 
-      // Input tensors
-      const Tensor& input_tensor = MklGetInput(context, 0);
-      const Tensor& filter_tensor = MklGetInput(context, 1);
-      const Tensor& obp_tensor = MklGetInput(context, 2);  // Outbackprop
+  TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                const Tensor& filter_tensor) {
+    TensorShape filter_tf_shape;
+    CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(
+             filter_tensor.vec<int32>(), &filter_tf_shape).ok(), true);
+    return filter_tf_shape;
+  }
 
-      // Generate input shapes.
-      TensorShape filter_shape;
-      OP_REQUIRES(
-          context, TensorShapeUtils::IsVector(filter_tensor.shape()),
-          errors::InvalidArgument(
-              "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
-              filter_tensor.dims()));
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  filter_tensor.vec<int32>(), &filter_shape));
-      TensorShape input_shape = input_tensor.shape();
-      TensorShape obp_shape = obp_tensor.shape();
-
-      // By default, all dims are in MKL order. Only dims in TF order
-      // are those with prefix tf_order.
-      memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
-      memory::dims fwd_output_dims_tf_order;
-
-      // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(
-          input_shape, filter_shape, &fwd_input_dims, &fwd_filter_dims,
-          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
-          &padding_r);
-      if (!context->status().ok()) return;
-
-      // Create Convolution forward descriptor since Convolution backward
-      // API needs it. For that, we first need to create input, filter
-      // and output memory descriptors.
-      auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
-      auto fwd_src_md =
-          memory::desc(fwd_input_dims, MklDnnType<T>(), mkl_data_format);
-      auto fwd_filter_md =
-          memory::desc(fwd_filter_dims, MklDnnType<T>(), memory::format::hwio);
-      auto fwd_out_md =
-          memory::desc(fwd_output_dims, MklDnnType<T>(), mkl_data_format);
-      auto fwd_desc = convolution_forward::desc(
-          prop_kind::forward, convolution_direct, fwd_src_md, fwd_filter_md,
-          fwd_out_md, strides, padding_l, padding_r,
-          TFPaddingToMklDnnPadding(padding_));
-      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
-      // Allocate output tensor and shape
-      // TODO(nhasabni): Update this when support for MKL layout is added.
-      // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
-      TensorShape tf_output_shape(filter_shape);
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      Tensor* output_tensor = nullptr;
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
-
-      // Create memory for user data.
-      // Describe how the inputs and outputs of Convolution look like. Also
-      // specify buffers containing actual input and output data.
-      // Although input shape required is in MKL-DNN order, the layout is
-      // Tensorflow's layout (NHWC or NCHW depending on data format).
-      input.SetUsrMem(fwd_input_dims, mkl_data_format, &input_tensor);
-      // Outbackprop shape is NHWC or NCHW depending on data format. Since
-      // GetInputSizeInMklOrder function returns size in that order we just use
-      // use that function directly.
-      conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
-      if (!context->status().ok()) return;
-      outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
-      // Although output shape required is in MKL-DNN order,
-      // layout is Tensorflow's filter layout (HWIO)
-      // Shape of output of Conv2DBackpropInput is same as shape of filter.
-      memory::dims bwd_output_dims = fwd_filter_dims;
-      output.SetUsrMem(bwd_output_dims, memory::format::hwio, output_tensor);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      input.SetOpMemDesc(fwd_input_dims, memory::format::any);
-      outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
-      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
-      // Create convolution backward weights primitive.
-      auto bwd_desc = convolution_backward_weights::desc(
-          convolution_direct, input.GetOpMemDesc(), output.GetOpMemDesc(),
-          outbackprop.GetOpMemDesc(), strides, padding_l, padding_r,
-          TFPaddingToMklDnnPadding(padding_));
-
-      auto bwd_pd = convolution_backward_weights::primitive_desc(
-          bwd_desc, cpu_engine, fwd_pd);
-
-      PrepareAndExecutePrimitive(bwd_pd, &input, &outbackprop, &output);
-    } catch (mkldnn::error& e) {
-      string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) + ", in file " +
-                         string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) {
+    // Shape of output of Conv2DBackpropFilter is same as shape of filter.
+    return fwd_filter_dims;
+  }
+
+  memory::format GetOutputFormat(const memory::format data_format) {
+    // Output layout is Tensorflow's filter layout (HWIO).
+    return memory::format::hwio;
+  }
+
+  void CreatePrimitive(OpKernelContext* context,
+                       const engine& cpu_engine,
+                       const convolution_forward::primitive_desc& conv_fwd_pd,
+                       MklDnnData<T>* input, MklDnnData<T>* filter,
+                       MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+                       Tensor** output_tensor,
+                       const memory::dims& strides,
+                       const memory::dims& padding_l,
+                       const memory::dims& padding_r,
+                       padding_kind padding,
+                       const memory::dims& bwd_output_dims,
+                       memory::format bwd_output_format) {
+    CHECK_NOTNULL(context);
+    CHECK_NOTNULL(input);
+    CHECK_NOTNULL(filter);
+    CHECK_NOTNULL(outbackprop);
+    CHECK_NOTNULL(output);
+    CHECK_NOTNULL(output_tensor);
+
+    MklDnnData<T>* bias_grad = nullptr;
+    int depth = 0;
+    if (biasEnabled) {
+      // Data structure for bias_grad
+      bias_grad = new MklDnnData<T> (&cpu_engine);
+      TensorShape obp_tf_shape = GetTfShape(context, 2);
+      depth = (MklConv2DBackpropCommonOp<Device, T>::GetTFDataFormat()
+                == FORMAT_NCHW) ?
+          obp_tf_shape.dim_size(1) : obp_tf_shape.dim_size(3);
+      memory::dims bias_grad_dims = {depth};
+      bias_grad->SetOpMemDesc(bias_grad_dims, memory::format::x);
+    }
+
+    // Create convolution backward weights primitive.
+    auto bwd_desc = (biasEnabled && (bias_grad != nullptr))?
+        convolution_backward_weights::desc(convolution_direct,
+                                input->GetOpMemDesc(), output->GetOpMemDesc(),
+                                bias_grad->GetOpMemDesc(),
+                                outbackprop->GetOpMemDesc(), strides, padding_l,
+                                padding_r, padding) :
+        convolution_backward_weights::desc(convolution_direct,
+                          input->GetOpMemDesc(), output->GetOpMemDesc(),
+                          outbackprop->GetOpMemDesc(), strides, padding_l,
+                          padding_r, padding);
+
+    auto bwd_pd = convolution_backward_weights::primitive_desc(bwd_desc,
+                                                            cpu_engine,
+                                                            conv_fwd_pd);
+
+    // Allocate output tensor.
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+                         bwd_output_format, output_tensor);
+
+    CHECK_NOTNULL(*output_tensor);
+    // Set buffer handle using allocated output tensor.
+    output->SetUsrMemDataHandle(*output_tensor);
+
+    if (biasEnabled && (bias_grad != nullptr)) {
+      // Allocate bias_grad tensor
+      TensorShape bias_grad_shape({depth});
+      Tensor* bias_grad_tensor = nullptr;
+      AllocateBiasGradTensor(context, bias_grad_shape, &bias_grad_tensor);
+      memory::dims bias_grad_dims = {depth};
+      // Since Bias is 1D, we use format::x from MKLDNN to represent it.
+      auto bias_grad_md = memory::desc({bias_grad_dims}, MklDnnType<T>(),
+                                       memory::format::x);
+      bias_grad->SetUsrMem(bias_grad_md, bias_grad_tensor);
+      bias_grad->SetUsrMemDataHandle(bias_grad_tensor);
+    }
+
+    if (biasEnabled && (bias_grad != nullptr)) {
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output, bias_grad);
+    } else {
+      PrepareAndExecutePrimitive(bwd_pd, input, outbackprop, output);
     }
   }
 
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
+  // Allocate output tensor.
+  void AllocateOutputTensor(OpKernelContext* context,
+                  const convolution_backward_weights::primitive_desc& conv_pd,
+                  const memory::dims& output_dims_mkl_order,
+                  memory::format output_tf_format, Tensor** output_tensor) {
+      CHECK_NOTNULL(output_tensor);
+
+      // For BackpropFilter, we convert the output tensor back in Tensorflow
+      // layout. Because typically, BackpropFilter is the last operator in the
+      // graph that emit filter gradient that is provided to ApplyGradient
+      // method to update the filter. But it may be possible to eliminate this
+      // by forwarding filter in MKL layout if we support ApplyGradient method
+      // for MKL layout propagation.
+      MklDnnShape output_mkl_shape;
+      output_mkl_shape.SetMklTensor(false);
+      // output_dims_mkl_order is in OIHW format.
+      // Allocate shape of TF tensor in HWIO format.
+      TensorShape output_tf_shape({output_dims_mkl_order[MklDnnDims::Dim_H],
+                                   output_dims_mkl_order[MklDnnDims::Dim_W],
+                                   output_dims_mkl_order[MklDnnDims::Dim_I],
+                                   output_dims_mkl_order[MklDnnDims::Dim_O]});
+      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                                output_mkl_shape);
+  }
+
+  // Allocate tensor for bias grad
+  void AllocateBiasGradTensor(OpKernelContext* context,
+                              const TensorShape& bias_grad_shape,
+                              Tensor** bias_grad_tensor) {
+    CHECK_NOTNULL(bias_grad_tensor);
+
+    MklDnnShape bias_grad_mkl_shape;
+    bias_grad_mkl_shape.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, 1, bias_grad_tensor, bias_grad_shape,
+                              bias_grad_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-      const convolution_backward_weights::primitive_desc& conv_pd,
-      MklDnnData<T>* input, MklDnnData<T>* obp, MklDnnData<T>* output) {
+                  const convolution_backward_weights::primitive_desc& conv_pd,
+                  MklDnnData<T>* input, MklDnnData<T>* obp,
+                  MklDnnData<T>* output, MklDnnData<T>* bias_grad = nullptr) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     input->CheckReorderToOpMem(conv_pd.src_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
+    // For BackpropFilter, we convert the output tensor back in Tensorflow
+    // layout.
     bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-        conv_pd.diff_weights_primitive_desc());
+                                      conv_pd.diff_weights_primitive_desc());
 
-    net.push_back(convolution_backward_weights(
-        conv_pd, input->GetOpMem(), obp->GetOpMem(), output->GetOpMem()));
+    if (biasEnabled && (bias_grad != nullptr)) {
+      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+                                      obp->GetOpMem(), output->GetOpMem(),
+                                      bias_grad->GetOpMem()));
+    } else {
+      net.push_back(convolution_backward_weights(conv_pd, input->GetOpMem(),
+                                      obp->GetOpMem(), output->GetOpMem()));
+    }
 
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
     if (output_reorder_required) {
       output->InsertReorderToUserMem(&net);
     }
 
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
-#endif
 
 #define REGISTER_MKL_FILTER_KERNELS(T)                              \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")          \
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DCustomBackpropFilterOp<CPUDevice, T>);
+              MklConv2DCustomBackpropFilterOp<CPUDevice, T, false>);\
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")  \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+              MklConv2DCustomBackpropFilterOp<CPUDevice, T, true>); \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")  \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+              MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
+
+#endif  // INTEL_MKL_DNN
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 4b6bf92e426c5b4ae726797e815ccba87231649a..df51df963881b33c08fbd6486574e5e5f8c3d2ff 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,8 +23,6 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -43,16 +41,15 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::prop_kind;
 using mkldnn::stream;
-
+using mkldnn::prop_kind;
 using mkldnn::convolution_backward_data;
-using mkldnn::convolution_direct;
-using mkldnn::convolution_forward;
 #endif
 
 namespace tensorflow {
@@ -362,169 +359,132 @@ class MklConv2DCustomBackpropInputOp : public OpKernel {
 #else
 
 template <typename Device, class T>
-class MklConv2DCustomBackpropInputOp : public OpKernel {
+class MklConv2DCustomBackpropInputOp :
+  public MklConv2DBackpropCommonOp<Device, T> {
  public:
-  ~MklConv2DCustomBackpropInputOp() {}
   explicit MklConv2DCustomBackpropInputOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format_str;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    int stride_n = GetTensorDim(strides_, data_format_, 'N');
-    int stride_c = GetTensorDim(strides_, data_format_, 'C');
-    OP_REQUIRES(
-        context, (stride_n == 1 && stride_c == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
+      : MklConv2DBackpropCommonOp<Device, T>(context) { }
+  ~MklConv2DCustomBackpropInputOp() {}
+
+ private:
+  void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                         const MklDnnShape& filter_mkl_shape,
+                         const MklDnnShape& obp_mkl_shape) {
+    // Tensor that feeds to 'Input' slot of BackpropInput is always just a shape
+    // of the Tensor and never an actual tensor. So it will never be in MKL
+    // layout.
+    CHECK(!input_mkl_shape.IsMklTensor())
+      << "Conv2DBackpropInput: input should not be in MKL Layout";
+  }
+
+  size_t GetInputTensorIndexWithSizes() { return 0; /* input index */ }
 
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  TensorShape MakeInputTfShape(OpKernelContext* context,
+                               const Tensor& input_tensor) {
+    TensorShape input_tf_shape;
+    CHECK_EQ(TensorShapeUtils::IsVector(input_tensor.shape()), true);
+    CHECK_EQ(TensorShapeUtils::MakeShape(input_tensor.vec<int32>(),
+                                         &input_tf_shape).ok(), true);
+    return input_tf_shape;
   }
 
-  void Compute(OpKernelContext* context) override {
-    try {
-      auto cpu_engine = engine(engine::cpu, 0);
-
-      MklDnnData<T> filter(&cpu_engine);
-      MklDnnData<T> outbackprop(&cpu_engine);
-      MklDnnData<T> output(&cpu_engine);
-
-      // Input tensors
-      const Tensor& input_tensor = MklGetInput(context, 0);
-      const Tensor& filter_tensor = MklGetInput(context, 1);
-      const Tensor& obp_tensor = MklGetInput(context, 2);  // Outbackprop
-
-      // Generate input shape.
-      TensorShape input_shape;
-      OP_REQUIRES(
-          context, TensorShapeUtils::IsVector(input_tensor.shape()),
-          errors::InvalidArgument(
-              "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
-              input_tensor.dims()));
-      OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                  input_tensor.vec<int32>(), &input_shape));
-      TensorShape filter_shape = filter_tensor.shape();
-      TensorShape obp_shape = obp_tensor.shape();
-
-      // By default, all dims are in MKL order. Only dims in TF order
-      // are those with prefix tf_order.
-      memory::dims obp_dims, fwd_input_dims, fwd_filter_dims;
-      memory::dims padding_l, padding_r, strides, fwd_output_dims;
-      memory::dims fwd_output_dims_tf_order;
-
-      // Get forward convolution parameters.
-      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(
-          input_shape, filter_shape, &fwd_input_dims, &fwd_filter_dims,
-          &strides, &fwd_output_dims_tf_order, &fwd_output_dims, &padding_l,
-          &padding_r);
-      if (!context->status().ok()) return;
-
-      // Create Convolution forward descriptor since Convolution backward
-      // API needs it. For that, we first need to create input, filter
-      // and output memory descriptors.
-      auto mkl_data_format = TFDataFormatToMklDnnDataFormat(data_format_);
-      auto fwd_src_md =
-          memory::desc(fwd_input_dims, MklDnnType<T>(), mkl_data_format);
-      auto fwd_filter_md =
-          memory::desc(fwd_filter_dims, MklDnnType<T>(), memory::format::hwio);
-      auto fwd_out_md =
-          memory::desc(fwd_output_dims, MklDnnType<T>(), mkl_data_format);
-      auto fwd_desc = convolution_forward::desc(
-          prop_kind::forward, convolution_direct, fwd_src_md, fwd_filter_md,
-          fwd_out_md, strides, padding_l, padding_r,
-          TFPaddingToMklDnnPadding(padding_));
-      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
-
-      // Allocate output tensor and shape
-      // TODO(nhasabni): Update this when support for MKL layout is added.
-      // Shape of output of Conv2DBackpropInput is same as 'input' of Conv2D.
-      TensorShape tf_output_shape(input_shape);
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      Tensor* output_tensor = nullptr;
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
-
-      // Create memory for user data.
-      // Describe how the inputs and outputs of Convolution look like. Also
-      // specify buffers containing actual input and output data.
-      // Although input shape required is in MKL-DNN order, the layout is
-      // Tensorflow's layout (NHWC or NCHW depending on data format).
-      // Although filter shape (filter_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (HWIO).
-      // Shape of Conv2DBackpropInput's filter is same as that of Conv2D filter.
-      filter.SetUsrMem(fwd_filter_dims, memory::format::hwio, &filter_tensor);
-      // Outbackprop shape is NHWC or NCHW depending on data format. Since
-      // GetInputSizeInMklOrder function returns size in that order we just use
-      // use that function directly.
-      conv_utl.GetInputSizeInMklOrder(obp_shape, &obp_dims);
-      if (!context->status().ok()) return;
-      outbackprop.SetUsrMem(obp_dims, mkl_data_format, &obp_tensor);
-      // Although output shape required is in MKL-DNN order,
-      // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
-      // Shape of output of Conv2DBackpropInput is same as shape of 'input'
-      // of Conv2D.
-      memory::dims bwd_output_dims = fwd_input_dims;
-      output.SetUsrMem(bwd_output_dims, mkl_data_format, output_tensor);
-
-      // Create memory descriptors for convolution data w/ no specified format.
-      filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
-      outbackprop.SetOpMemDesc(obp_dims, memory::format::any);
-      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
-
-      // Create convolution backward data primitive.
-      auto bwd_desc = convolution_backward_data::desc(
-          convolution_direct, output.GetOpMemDesc(), filter.GetOpMemDesc(),
-          outbackprop.GetOpMemDesc(), strides, padding_l, padding_r,
-          TFPaddingToMklDnnPadding(padding_));
-
-      auto bwd_pd = convolution_backward_data::primitive_desc(
-          bwd_desc, cpu_engine, fwd_pd);
-
-      PrepareAndExecutePrimitive(bwd_pd, &filter, &outbackprop, &output);
-    } catch (mkldnn::error& e) {
-      string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) + ", in file " +
-                         string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
-    }
+  TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                const Tensor& filter_tensor) {
+    size_t filter_idx = 1;
+    return GetTfShape(context, filter_idx);
   }
 
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) {
+    // Output Shape of Conv2DBackpropInput is same as shape of Conv2D 'input'.
+    return fwd_input_dims;
+  }
+
+  memory::format GetOutputFormat(const memory::format data_format) {
+    // Output layout is Tensorflow's layout in data format order.
+    return data_format;
+  }
+
+  void CreatePrimitive(OpKernelContext* context,
+                       const engine& cpu_engine,
+                       const convolution_forward::primitive_desc& conv_fwd_pd,
+                       MklDnnData<T>* input, MklDnnData<T>* filter,
+                       MklDnnData<T>* outbackprop, MklDnnData<T>* output,
+                       Tensor** output_tensor,
+                       const memory::dims& strides,
+                       const memory::dims& padding_l,
+                       const memory::dims& padding_r,
+                       padding_kind padding,
+                       const memory::dims& bwd_output_dims,
+                       memory::format bwd_output_format) {
+    CHECK_NOTNULL(context);
+    CHECK_NOTNULL(input);
+    CHECK_NOTNULL(filter);
+    CHECK_NOTNULL(outbackprop);
+    CHECK_NOTNULL(output);
+    CHECK_NOTNULL(output_tensor);
+
+    // Create convolution backward data primitive.
+    auto bwd_desc = convolution_backward_data::desc(convolution_direct,
+                      output->GetOpMemDesc(), filter->GetOpMemDesc(),
+                      outbackprop->GetOpMemDesc(), strides, padding_l,
+                      padding_r, padding);
+
+    auto bwd_pd = convolution_backward_data::primitive_desc(bwd_desc,
+                                                          cpu_engine,
+                                                          conv_fwd_pd);
+
+
+    // Allocate output tensor in TensorFlow and MKL layout.
+    AllocateOutputTensor(context, bwd_pd, bwd_output_dims,
+                         bwd_output_format, output_tensor);
+    CHECK_NOTNULL(*output_tensor);
+    // Set buffer handle using allocated output tensor.
+    output->SetUsrMemDataHandle(*output_tensor);
+
+    PrepareAndExecutePrimitive(bwd_pd, filter, outbackprop, output);
+  }
+
+  // Allocate output tensor.
+  void AllocateOutputTensor(OpKernelContext* context,
+                  const convolution_backward_data::primitive_desc& conv_pd,
+                  const memory::dims& output_dims_mkl_order,
+                  memory::format output_tf_format, Tensor** output_tensor) {
+      CHECK_NOTNULL(output_tensor);
+
+      // Output primitive descriptor for backward data is diff_src.
+      auto dst_pd = conv_pd.diff_src_primitive_desc();
+
+      // Allocate shape of Mkl tensor.
+      MklDnnShape output_mkl_shape;
+      output_mkl_shape.SetMklTensor(true);
+      output_mkl_shape.SetMklLayout(&dst_pd);
+      output_mkl_shape.SetElemType(MklDnnType<T>());
+      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                   output_dims_mkl_order, output_tf_format);
+
+      // Allocate shape of TF tensor.
+      TensorShape output_tf_shape;
+      output_tf_shape.AddDim(dst_pd.get_size() / sizeof(T));
+
+      AllocateOutputSetMklShape(context, 0, output_tensor, output_tf_shape,
+                                output_mkl_shape);
+  }
 
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecutePrimitive(
-      const convolution_backward_data::primitive_desc& conv_pd,
-      MklDnnData<T>* filter, MklDnnData<T>* obp, MklDnnData<T>* output) {
+                  const convolution_backward_data::primitive_desc& conv_pd,
+                  MklDnnData<T>* filter, MklDnnData<T>* obp,
+                  MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
     // add it to the net before convolution.
     std::vector<primitive> net;
     filter->CheckReorderToOpMem(conv_pd.weights_primitive_desc(), &net);
     obp->CheckReorderToOpMem(conv_pd.diff_dst_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
-    bool output_reorder_required =
-        output->PrepareReorderToUserMemIfReq(conv_pd.diff_src_primitive_desc());
-
-    net.push_back(convolution_backward_data(
-        conv_pd, obp->GetOpMem(), filter->GetOpMem(), output->GetOpMem()));
-
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
-    if (output_reorder_required) {
-      output->InsertReorderToUserMem(&net);
-    }
+    net.push_back(convolution_backward_data(conv_pd, obp->GetOpMem(),
+                                    filter->GetOpMem(), output->GetOpMem()));
 
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 369f632fb4623347e5a808443397ecec87b6cfa8..04268f23bb3e07f8eb9ba66957ca00a09b1e6d5d 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <string.h>
 #include <map>
-#include <string>
 #include <vector>
+#include <string>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -40,17 +40,19 @@ limitations under the License.
 #include "tensorflow/core/util/tensor_format.h"
 
 #include "tensorflow/core/util/mkl_util.h"
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
+
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
 
-using mkldnn::prop_kind;
 using mkldnn::stream;
+using mkldnn::prop_kind;
 
-using mkldnn::convolution_direct;
 using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
 namespace tensorflow {
@@ -288,10 +290,8 @@ class MklConv2DOp : public OpKernel {
     mkl_filter_output_mkl_shape.SetMklLayout(mkl_context.prim_fwd,
                                              dnnResourceFilter);
 
-    size_t filter_sizes[4] = {static_cast<size_t>(filter.dim_size(0)),
-                              static_cast<size_t>(filter.dim_size(1)),
-                              static_cast<size_t>(filter.dim_size(2)),
-                              static_cast<size_t>(filter.dim_size(3))};
+    size_t filter_sizes[4] = {filter.dim_size(0), filter.dim_size(1),
+                              filter.dim_size(2), filter.dim_size(3)};
     mkl_filter_output_mkl_shape.SetTfLayout(filter.dims(), filter_sizes,
                                             mkl_context.filter_strides);
 
@@ -514,6 +514,12 @@ class MklConv2DOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, src_idx);
       const Tensor& filter_tensor = MklGetInput(context, filter_idx);
 
+      MklDnnShape src_mkl_shape, filter_mkl_shape;
+      GetMklShape(context, src_idx, &src_mkl_shape);
+      GetMklShape(context, filter_idx, &filter_mkl_shape);
+      CHECK(!filter_mkl_shape.IsMklTensor())
+        << "Conv2D filter should not be in MKL Layout";
+
       MklDnnData<T> src(&cpu_engine);
       MklDnnData<T> filter(&cpu_engine);
       MklDnnData<T> output(&cpu_engine);
@@ -523,64 +529,57 @@ class MklConv2DOp : public OpKernel {
 
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
-      conv_utl.GetConvFwdSizesInMklOrder(
-          src_tensor.shape(), filter_tensor.shape(), &src_dims, &filter_dims,
-          &strides, &output_dims_tf_order, &output_dims_mkl_order, &padding_l,
-          &padding_r);
+      auto src_tf_shape = GetTfShape(context, src_idx);
+      auto filter_tf_shape = GetTfShape(context, filter_idx);
+      conv_utl.GetConvFwdSizesInMklOrder(src_tf_shape, filter_tf_shape,
+                                         &src_dims, &filter_dims, &strides,
+                                         &output_dims_tf_order,
+                                         &output_dims_mkl_order, &padding_l,
+                                         &padding_r);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
-      TensorShape tf_output_shape(
-          {output_dims_tf_order[0], output_dims_tf_order[1],
-           output_dims_tf_order[2], output_dims_tf_order[3]});
-      Tensor* output_tensor = nullptr;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &output_tensor, tf_output_shape,
-                                mkl_output_mkl_shape);
+      TensorShape output_tf_shape = MklDnnDimsToTFShape(output_dims_tf_order);
 
       // Forward filter in TF format from input at index 1 to output at index 1.
       ForwardTfTensorInToOut(context, 1, 1);
 
-      if (tf_output_shape.num_elements() == 0) {
+      // Corner cases: output with 0 elements and 0 batch size.
+      Tensor* output_tensor = nullptr;
+      if (output_tf_shape.num_elements() == 0 ||
+          output_dims_tf_order[0] == 0) {
         // TODO(jbobba): Verify correctness here
         //               Need semantics for Null MKL tensor
+        MklDnnShape output_mkl_shape;
+        output_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 0, &output_tensor, src_tf_shape,
+                                output_mkl_shape);
         return;
       }
 
-      // Corner case to handle 0 batch size.
-      if (output_dims_tf_order[0] == 0) {
-        // Nothing to do, allocate output tensor and return
-        // TODO(nhasabni): remove this code later once serialization
-        // in MKL-DNN is supported.
-        AllocateOutputSetMklShape(context, 0, &output_tensor,
-                                  src_tensor.shape(), mkl_output_mkl_shape);
-        return;
-      } else {
-        // Otherwise regular output tensor allocation
-        // Allocate output tensor.
-      }
-      CHECK_NOTNULL(output_tensor);
-
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      // Although input shape (src_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (NHWC or NCHW depending on data
-      // format).
-      src.SetUsrMem(src_dims, TFDataFormatToMklDnnDataFormat(data_format_),
-                    const_cast<void*>(
-                        static_cast<const void*>(src_tensor.flat<T>().data())));
+      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input Tf layout. For TF layout, although input shape
+      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
+      // layout (NHWC or NCHW depending on data format).
+      auto src_md = src_mkl_shape.IsMklTensor()
+                    ? src_mkl_shape.GetMklLayout()
+                    : memory::desc(src_dims, MklDnnType<T>(), tf_fmt);
+      src.SetUsrMem(src_md, &src_tensor);
       // Although filter shape (filter_dims) required is in MKL-DNN order,
       // the layout is Tensorflow's layout (HWIO).
-      filter.SetUsrMem(filter_dims, memory::format::hwio,
-                       const_cast<void*>(static_cast<const void*>(
-                           filter_tensor.flat<T>().data())));
-      // Although output shape (output_dims) required is in MKL-DNN order,
-      // layout is Tensorflow's layout (NHWC or NCHW depending on data format).
-      output.SetUsrMem(output_dims_mkl_order,
-                       TFDataFormatToMklDnnDataFormat(data_format_),
-                       output_tensor->flat<T>().data());
+      auto filter_md = filter_mkl_shape.IsMklTensor()
+                    ? filter_mkl_shape.GetMklLayout()
+          : memory::desc(filter_dims, MklDnnType<T>(), memory::format::hwio);
+      filter.SetUsrMem(filter_md, &filter_tensor);
+      // Set output shape (output_dims) required in MKL-DNN order.
+      // Currently, we set output layout as Tensorflow's layout (NHWC or NCHW
+      // depending on data format). But later we propagate Mkl layout of the
+      // output to the next op directly.
+      output.SetUsrMem(output_dims_mkl_order, tf_fmt);
 
       // Create memory descriptors for convolution data w/ no specified format.
       src.SetOpMemDesc(src_dims, memory::format::any);
@@ -593,38 +592,44 @@ class MklConv2DOp : public OpKernel {
         memory::dims bias_size;
         conv_utl.GetBiasSizeInMklOrder(2 /* bias idx */, &bias_size);
         const Tensor& bias_tensor = MklGetInput(context, 2);
-        bias.SetUsrMem(bias_size, memory::format::x,
-                       const_cast<void*>(static_cast<const void*>(
-                           bias_tensor.flat<T>().data())));
+        bias.SetUsrMem(bias_size, memory::format::x, &bias_tensor);
         bias.SetOpMemDesc(bias_size, memory::format::any);
 
         // Create convolution primitive with Bias.
-        auto conv_desc = convolution_forward::desc(
-            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
-            filter.GetOpMemDesc(), bias.GetOpMemDesc(), output.GetOpMemDesc(),
-            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc =
-            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        auto conv_desc = convolution_forward::desc(prop_kind::forward,
+            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
+            bias.GetOpMemDesc(), output.GetOpMemDesc(), strides,
+            padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
+                                                                cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc,
+                             output_dims_mkl_order, tf_fmt, &output_tensor);
+        // Set data handle for output.
+        output.SetUsrMemDataHandle(output_tensor);
         PrepareAndExecuteNet(conv_prim_desc, &src, &filter, &bias, &output);
       } else {
         // Create convolution primitive without Bias.
-        auto conv_desc = convolution_forward::desc(
-            prop_kind::forward, convolution_direct, src.GetOpMemDesc(),
-            filter.GetOpMemDesc(), output.GetOpMemDesc(), strides, padding_l,
-            padding_r, TFPaddingToMklDnnPadding(padding_));
-
-        auto conv_prim_desc =
-            convolution_forward::primitive_desc(conv_desc, cpu_engine);
+        auto conv_desc = convolution_forward::desc(prop_kind::forward,
+            convolution_direct, src.GetOpMemDesc(), filter.GetOpMemDesc(),
+            output.GetOpMemDesc(), strides, padding_l, padding_r,
+            TFPaddingToMklDnnPadding(padding_));
+
+        auto conv_prim_desc = convolution_forward::primitive_desc(conv_desc,
+                                                                cpu_engine);
+        AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order,
+                             tf_fmt, &output_tensor);
+        // Set data handle for output.
+        output.SetUsrMemDataHandle(output_tensor);
         PrepareAndExecuteNet(conv_prim_desc, &src, &filter, nullptr, &output);
       }
-    } catch (mkldnn::error& e) {
+    } catch (mkldnn::error &e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + std::string(e.message) + ", in file " +
-                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
-      OP_REQUIRES_OK(
-          context,
-          errors::Aborted("Operation received an exception:", error_msg));
+                       ", message: " + std::string(e.message) +
+                       ", in file " + std::string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+        errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
@@ -633,43 +638,56 @@ class MklConv2DOp : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
 
+  // Allocate output tensor.
+  void AllocateOutputTensor(
+                  OpKernelContext* context,
+                  const convolution_forward::primitive_desc& conv_prim_desc,
+                  const memory::dims& output_dims_mkl_order,
+                  memory::format output_tf_format, Tensor** output_tensor) {
+      CHECK_NOTNULL(output_tensor);
+      auto dst_pd = conv_prim_desc.dst_primitive_desc();
+
+      // Allocate shape of Mkl tensor.
+      MklDnnShape output_mkl_shape;
+      output_mkl_shape.SetMklTensor(true);
+      output_mkl_shape.SetMklLayout(&dst_pd);
+      output_mkl_shape.SetElemType(MklDnnType<T>());
+      output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                                   output_dims_mkl_order, output_tf_format);
+
+      // Allocate shape of TF tensor.
+      TensorShape output_tf_shape;
+      output_tf_shape.AddDim((dst_pd.get_size() / sizeof(T)));
+
+      const int kOutputSlotIdx = 0;
+      AllocateOutputSetMklShape(context, kOutputSlotIdx, output_tensor,
+                                output_tf_shape, output_mkl_shape);
+  }
+
   // Prepare and execute net - checks for input and output reorders.
   void PrepareAndExecuteNet(
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      MklDnnData<T>* src, MklDnnData<T>* filter, MklDnnData<T>* bias,
-      MklDnnData<T>* output) {
+                  const convolution_forward::primitive_desc& conv_prim_desc,
+                  MklDnnData<T>* src, MklDnnData<T>* filter,
+                  MklDnnData<T>* bias, MklDnnData<T>* output) {
     // Create reorders between user layout and MKL layout if it is needed and
-    // add it to the net before convolution.
+    // add it to the net before convolution. No need to check for output
+    // reorder as we propagate output layout to the next layer.
     std::vector<primitive> net;
     src->CheckReorderToOpMem(conv_prim_desc.src_primitive_desc(), &net);
     filter->CheckReorderToOpMem(conv_prim_desc.weights_primitive_desc(), &net);
 
-    // Memory for output of convolution. Since we may need reorder on the
-    // output side, we will prepare reorder primitive in case output
-    // reorder to user memory is required.
-    bool output_reorder_required = output->PrepareReorderToUserMemIfReq(
-        conv_prim_desc.dst_primitive_desc());
-
     // Create convolution primitive and add it to net.
     if (bias) {
       CHECK_EQ(biasEnabled, true);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                        filter->GetOpMem(), bias->GetOpMem(),
-                                        output->GetOpMem()));
+                                    filter->GetOpMem(), bias->GetOpMem(),
+                                    output->GetOpMem()));
     } else {
       CHECK_EQ(biasEnabled, false);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
-                                        filter->GetOpMem(),
-                                        output->GetOpMem()));
-    }
-
-    // Insert reorder primitive in the net for output reorder if reorder is
-    // required.
-    if (output_reorder_required) {
-      output->InsertReorderToUserMem(&net);
+                                    filter->GetOpMem(), output->GetOpMem()));
     }
 
-    // Handle output reorder
     stream(stream::kind::eager).submit(net).wait();
   }
 };
@@ -686,7 +704,12 @@ class MklConv2DOp : public OpKernel {
                               .Device(DEVICE_CPU)                   \
                               .TypeConstraint<T>("T")               \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConv2DOp<CPUDevice, T, true>);
+                          MklConv2DOp<CPUDevice, T, true>);         \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index e29af19ca9b911fd187f9586eab03c24db2a06f6..47a9b4bfc734dab5786b42f6e7118a798f790345 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
-#include <limits>
 #include <vector>
+#include <limits>
 
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -26,8 +26,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -41,6 +41,12 @@ limitations under the License.
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+
+using mkldnn::convolution_forward;
+using mkldnn::convolution_direct;
 #endif
 
 namespace tensorflow {
@@ -49,15 +55,15 @@ namespace tensorflow {
 
 class MklDnnConvUtil {
  protected:
-  OpKernelContext *context_;  // We don't own this.
+  OpKernelContext* context_;  // We don't own this.
   std::vector<int32> strides_;
   Padding padding_;
   TensorFormat data_format_;
 
  public:
-  MklDnnConvUtil(OpKernelContext *context, const std::vector<int32> &strides,
-                 Padding pad, TensorFormat fm)
-      : context_(context), strides_(strides), padding_(pad), data_format_(fm) {}
+  MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
+                 Padding pad, TensorFormat fm) : context_(context),
+    strides_(strides), padding_(pad), data_format_(fm) {}
 
   virtual ~MklDnnConvUtil() { context_ = nullptr; }
 
@@ -75,14 +81,14 @@ class MklDnnConvUtil {
   // requires input in NCHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void GetInputSizeInMklOrder(const TensorShape &input_shape,
-                                             memory::dims *input_dims) {
-#define CHECK_BOUNDS(val, err_msg)                                     \
-  do {                                                                 \
-    OP_REQUIRES(context_,                                              \
-                FastBoundsCheck(val, std::numeric_limits<int>::max()), \
-                errors::InvalidArgument(err_msg));                     \
-  } while (0)
+  virtual inline void
+  GetInputSizeInMklOrder(const TensorShape& input_shape,
+                         memory::dims *input_dims) {
+  #define CHECK_BOUNDS(val, err_msg) do {                     \
+    OP_REQUIRES(context_, FastBoundsCheck(val,                \
+                            std::numeric_limits<int>::max()), \
+                errors::InvalidArgument(err_msg));            \
+  }while(0)
 
     CHECK_NOTNULL(input_dims);
 
@@ -105,10 +111,16 @@ class MklDnnConvUtil {
     CHECK_BOUNDS(input_batch_raw, "Input batch too large");
     int input_batch = static_cast<int>(input_batch_raw);
 
-#undef CHECK_BOUNDS
+  #undef CHECK_BOUNDS
 
     // MKL-DNN always requires input in NCHW format.
-    *input_dims = {input_batch, input_depth, input_rows, input_cols};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_N] = input_batch;
+    mkldnn_sizes[MklDnnDims::Dim_C] = input_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = input_rows;
+    mkldnn_sizes[MklDnnDims::Dim_W] = input_cols;
+
+    *input_dims = mkldnn_sizes;
   }
 
   // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
@@ -125,9 +137,10 @@ class MklDnnConvUtil {
   // forward gets actual tensor as input).
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void GetFilterSizeInMklOrder(const TensorShape &input_shape,
-                                              const TensorShape &filter_shape,
-                                              memory::dims *filter_dims) {
+  virtual inline void
+  GetFilterSizeInMklOrder(const TensorShape& input_shape,
+                          const TensorShape& filter_shape,
+                          memory::dims *filter_dims) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == 4,
@@ -135,18 +148,17 @@ class MklDnnConvUtil {
                                         filter_shape.DebugString()));
 
     for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(context_,
-                  FastBoundsCheck(filter_shape.dim_size(i),
-                                  std::numeric_limits<int>::max()),
-                  errors::InvalidArgument("filter too large"));
+      OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i),
+                                           std::numeric_limits<int>::max()),
+                errors::InvalidArgument("filter too large"));
     }
 
     int input_depth = GetTensorDim(input_shape, data_format_, 'C');
 
-    OP_REQUIRES(context_, input_depth == filter_shape.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", input_depth,
-                    " vs ", filter_shape.dim_size(2)));
+    OP_REQUIRES(
+        context_, input_depth == filter_shape.dim_size(2),
+        errors::InvalidArgument("input and filter must have the same depth: ",
+                                input_depth, " vs ", filter_shape.dim_size(2)));
 
     // TF filter is always in (rows, cols, in_depth, out_depth) order.
     int filter_rows = static_cast<int>(filter_shape.dim_size(0));
@@ -156,32 +168,38 @@ class MklDnnConvUtil {
 
     // MKL-DNN always needs filter in OIHW format.
     // OIHW = (out_depth, in_depth, rows, cols)
-    *filter_dims = {out_depth, in_depth, filter_rows, filter_cols};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
+    mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+    mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+    *filter_dims = mkldnn_sizes;
   }
 
   // Calculate Convolution filter size in MKL-DNN order. MKL-DNN
   // requires filter in OIHW format. Function does not return anything.
   // But errors arising from sanity checks are returned in context's
   // status.
-  virtual inline void GetFilterSizeInMklOrder(size_t src_index,
-                                              size_t filter_index,
-                                              memory::dims *filter_dims) {
+  virtual inline void
+  GetFilterSizeInMklOrder(size_t src_index, size_t filter_index,
+                          memory::dims *filter_dims) {
     CHECK_NOTNULL(filter_dims);
-    const Tensor &input = MklGetInput(context_, src_index);
-    const Tensor &filter = MklGetInput(context_, filter_index);
-    GetFilterSizeInMklOrder(input.shape(), filter.shape(), filter_dims);
+    GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
+                            GetTfShape(context_, filter_index),
+                            filter_dims);
   }
 
   // Calculate Bias size for 2D Convolution. Function does not return
   // anything, but sets error in context status.
-  virtual inline void GetBiasSizeInMklOrder(size_t bias_index,
-                                            memory::dims *bias_dims) {
-    const Tensor &bias = MklGetInput(context_, bias_index);
+  virtual inline void
+  GetBiasSizeInMklOrder(size_t bias_index, memory::dims *bias_dims) {
+    const Tensor& bias = MklGetInput(context_, bias_index);
     OP_REQUIRES(context_, bias.dims() == 1,
                 errors::InvalidArgument("bias must be 1-dimensional: ",
                                         bias.shape().DebugString()));
 
-    *bias_dims = {static_cast<int>(bias.dim_size(0))};
+    *bias_dims = { static_cast<int>(bias.dim_size(0)) };
   }
 
   // Function to calculate output and padding size for 2D convolution.
@@ -193,11 +211,13 @@ class MklDnnConvUtil {
   // status is returned via context status.
   //
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
-  virtual inline void GetOutputAndPadSizeInMklOrder(
-      const TensorShape &input_shape, const TensorShape &filter_shape,
-      const memory::dims &strides, memory::dims *output_dims_tf_order,
-      memory::dims *output_dims_mkl_order, memory::dims *pad_l,
-      memory::dims *pad_r) {
+  virtual inline void
+  GetOutputAndPadSizeInMklOrder(const TensorShape& input_shape,
+                                const TensorShape& filter_shape,
+                                const memory::dims& strides,
+                                memory::dims *output_dims_tf_order,
+                                memory::dims *output_dims_mkl_order,
+                                memory::dims *pad_l, memory::dims *pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -223,21 +243,25 @@ class MklDnnConvUtil {
     int64 out_rows = 0, out_cols = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
 
-    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
-                                 input_rows, filter_rows, stride_rows, padding_,
-                                 &out_rows, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
-                                 input_cols, filter_cols, stride_cols, padding_,
-                                 &out_cols, &pad_left, &pad_right));
+    OP_REQUIRES_OK(context_,
+            GetWindowedOutputSizeVerbose(input_rows, filter_rows, stride_rows,
+                                 padding_, &out_rows, &pad_top, &pad_bottom));
+    OP_REQUIRES_OK(context_,
+            GetWindowedOutputSizeVerbose(input_cols, filter_cols, stride_cols,
+                                 padding_, &out_cols, &pad_left, &pad_right));
 
     // Tensorflow output is in data_format order. (NHWC or NCHW)
-    TensorShape out_shape =
-        ShapeFromFormat(data_format_, out_batch, out_rows, out_cols, out_depth);
+    TensorShape out_shape = ShapeFromFormat(data_format_, out_batch,
+                                            out_rows, out_cols, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
     // MKL-DNN always needs output in NCHW format.
-    *output_dims_mkl_order = {out_batch, out_depth, static_cast<int>(out_rows),
-                              static_cast<int>(out_cols)};
+    std::vector<int> mkldnn_sizes(4, -1);
+    mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
+    mkldnn_sizes[MklDnnDims::Dim_C] = out_depth;
+    mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
+    mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
+    *output_dims_mkl_order = mkldnn_sizes;
 
     // Now handle padding. MKL-DNN uses asymetric padding.
     *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
@@ -248,25 +272,27 @@ class MklDnnConvUtil {
   // See comment on GetConvOutputAndPadSizeInMklOrder for parameters.
   //
   // Function does not return anything, but sets error in context status.
-  inline void GetOutputAndPadSizeInMklOrder(
-      size_t src_index, size_t filter_index, const memory::dims &strides,
-      memory::dims *output_dims_tf_order, memory::dims *output_dims_mkl_order,
-      memory::dims *pad_l, memory::dims *pad_r) {
+  inline void
+  GetOutputAndPadSizeInMklOrder(size_t src_index, size_t filter_index,
+                                const memory::dims& strides,
+                                memory::dims *output_dims_tf_order,
+                                memory::dims *output_dims_mkl_order,
+                                memory::dims *pad_l, memory::dims *pad_r) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
-    const Tensor &input = MklGetInput(context_, src_index);
-    const Tensor &filter = MklGetInput(context_, filter_index);
+    auto input_tf_shape = GetTfShape(context_, src_index);
+    auto filter_tf_shape = GetTfShape(context_, filter_index);
 
-    OP_REQUIRES(context_, input.dims() == 4,
+    OP_REQUIRES(context_, input_tf_shape.dims() == 4,
                 errors::InvalidArgument("input must be 4-dimensional",
-                                        input.shape().DebugString()));
+                                        input_tf_shape.DebugString()));
 
-    GetOutputAndPadSizeInMklOrder(input.shape(), filter.shape(), strides,
-                                  output_dims_tf_order, output_dims_mkl_order,
-                                  pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
+                                  strides, output_dims_tf_order,
+                                  output_dims_mkl_order, pad_l, pad_r);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -275,12 +301,15 @@ class MklDnnConvUtil {
   // also calculates strides and paddings for 2D Convolution.
   //
   // Function does not return anything, but sets error in context status.
-  inline void GetConvFwdSizesInMklOrder(
-      const TensorShape &input_shape, const TensorShape &filter_shape,
-      memory::dims *input_dims, memory::dims *filter_dims,
-      memory::dims *strides, memory::dims *output_dims_tf_order,
-      memory::dims *output_dims_mkl_order, memory::dims *pad_l,
-      memory::dims *pad_r) {
+  inline void GetConvFwdSizesInMklOrder(const TensorShape& input_shape,
+                                        const TensorShape& filter_shape,
+                                        memory::dims *input_dims,
+                                        memory::dims *filter_dims,
+                                        memory::dims *strides,
+                                        memory::dims *output_dims_tf_order,
+                                        memory::dims *output_dims_mkl_order,
+                                        memory::dims *pad_l,
+                                        memory::dims *pad_r) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -295,14 +324,238 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetOutputAndPadSizeInMklOrder(input_shape, filter_shape, *strides,
-                                  output_dims_tf_order, output_dims_mkl_order,
+                                  output_dims_tf_order,
+                                  output_dims_mkl_order,
                                   pad_l, pad_r);
     if (!context_->status().ok()) return;
   }
 };
 
+/////////////////////////////////////////////////////////////////////
+///  Common class that implements Conv2DBackpropFilter and Input
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, class T>
+class MklConv2DBackpropCommonOp :  public OpKernel {
+ public:
+  ~MklConv2DBackpropCommonOp() {}
+  explicit MklConv2DBackpropCommonOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format_str;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    int stride_n = GetTensorDim(strides_, data_format_, 'N');
+    int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    OP_REQUIRES(
+        context, (stride_n == 1 && stride_c == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      // Prepare common tensors for Conv2DBackpropInput and
+      // Conv2DBackpropFilter.
+      MklDnnData<T> input(&cpu_engine);
+      MklDnnData<T> filter(&cpu_engine);
+      MklDnnData<T> outbackprop(&cpu_engine);
+      MklDnnData<T> output(&cpu_engine);
+
+      // Input tensors
+      const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
+      const Tensor& input_tensor = MklGetInput(context, kInputIdx);
+      const Tensor& filter_tensor = MklGetInput(context, kFilterIdx);
+      const Tensor& outbprop_tensor = MklGetInput(context, kOutbpropIdx);
+
+      MklDnnShape input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape;
+      GetMklShape(context, kInputIdx, &input_mkl_shape);
+      GetMklShape(context, kFilterIdx, &filter_mkl_shape);
+      GetMklShape(context, kOutbpropIdx, &outbprop_mkl_shape);
+      // Allow operator-specific sanity checking of shapes.
+      ValidateMklShapes(input_mkl_shape, filter_mkl_shape, outbprop_mkl_shape);
+
+      // Allow operator-specific generation of shapes.
+      // E.g., Conv2DBackpropFilter gets filter as filter_sizes. It is a
+      // tensor containing shape of filter. So filter.shape() is not
+      // a correct way to get filter shape. These operator-specific calls
+      // allow this class to handle this case.
+      TensorShape input_tf_shape = MakeInputTfShape(context, input_tensor);
+      TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor);
+      TensorShape outbprop_tf_shape = GetTfShape(context, kOutbpropIdx);
+
+      // By default, all dims are in MKL order. Only dims in TF order
+      // are those with prefix tf_order.
+      memory::dims outbprop_dims, fwd_input_dims, fwd_filter_dims;
+      memory::dims padding_l, padding_r, strides, fwd_output_dims;
+      memory::dims fwd_output_dims_tf_order;
+
+      // Get forward convolution parameters.
+      MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_);
+      conv_utl.GetConvFwdSizesInMklOrder(input_tf_shape, filter_tf_shape,
+                                         &fwd_input_dims, &fwd_filter_dims,
+                                         &strides,
+                                         &fwd_output_dims_tf_order,
+                                         &fwd_output_dims,
+                                         &padding_l, &padding_r);
+      if (!context->status().ok()) return;
+
+      // Create Convolution forward descriptor since Convolution backward
+      // API needs it. For that, we first need to create input, filter
+      // and output memory descriptors.
+      auto tf_fmt = TFDataFormatToMklDnnDataFormat(data_format_);
+      // If input is in MKL layout, then simply grab input layout; otherwise,
+      // construct input TF layout. For TF layout, although input shape
+      // required is in MKL-DNN order, the layout is Tensorflow's layout
+      // (NHWC or NCHW depending on data format).
+      auto fwd_input_md = input_mkl_shape.IsMklTensor() ?
+                          input_mkl_shape.GetMklLayout() :
+                       memory::desc(fwd_input_dims, MklDnnType<T>(), tf_fmt);
+      // If filter is in MKL layout, then simply grab filter layout; otherwise
+      // construct filter in TF layout. For TF layout, filter is in HWIO format.
+      auto fwd_filter_md = filter_mkl_shape.IsMklTensor() ?
+                          filter_mkl_shape.GetMklLayout() :
+                          memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                                       memory::format::hwio);
+      // Tensorflow Output of Conv2D is in data_format order.
+      auto fwd_out_md = memory::desc(fwd_output_dims, MklDnnType<T>(), tf_fmt);
+      auto fwd_desc = convolution_forward::desc(prop_kind::forward,
+            convolution_direct, fwd_input_md, fwd_filter_md, fwd_out_md,
+            strides, padding_l, padding_r, TFPaddingToMklDnnPadding(padding_));
+      auto fwd_pd = convolution_forward::primitive_desc(fwd_desc, cpu_engine);
+
+      // Create memory for user data. Describe how the inputs and outputs of
+      // Convolution look like. Also specify buffers containing actual input
+      // and output data.
+
+      // Since this is a common class for both Conv2DBackpropFilter and
+      // Conv2DBackpropInput, we skip SetUsrMem call for input tensor (for
+      // Conv2DBackpropInput) and for filter tensor (for
+      // conv2DBackpropFilter) depending on which tensor is int32 type.
+      size_t input_with_sizes = GetInputTensorIndexWithSizes();
+      if (input_with_sizes != kInputIdx) {
+        // Shape of Conv2DBackpropFilter's input is same as Conv2D input.
+        input.SetUsrMem(fwd_input_md, &input_tensor);
+      } else if (input_with_sizes != kFilterIdx) {
+        // Shape of Conv2DBackpropInput's filter is same as Conv2D filter.
+        filter.SetUsrMem(fwd_filter_md, &filter_tensor);
+      }
+
+      conv_utl.GetInputSizeInMklOrder(outbprop_tf_shape, &outbprop_dims);
+      if (!context->status().ok()) return;
+      if (outbprop_mkl_shape.IsMklTensor()) {
+        // If outbackprop is in Mkl layout, then simply grab it.
+        auto outbprop_md = outbprop_mkl_shape.GetMklLayout();
+        outbackprop.SetUsrMem(outbprop_md, &outbprop_tensor);
+      } else {
+        // If outbackprop is in TensorFlow layout, then we need to create memory
+        // descriptor for it. Outbackprop shape is data format order.
+        outbackprop.SetUsrMem(outbprop_dims, tf_fmt, &outbprop_tensor);
+      }
+
+      // Operator specific call to get output shape and data_format.
+      auto bwd_output_dims = GetOutputDims(fwd_input_dims, fwd_filter_dims);
+      auto bwd_output_format = GetOutputFormat(tf_fmt);
+      output.SetUsrMem(bwd_output_dims, bwd_output_format);
+
+      // Create memory descriptors for convolution data w/ no specified format.
+      input.SetOpMemDesc(fwd_input_dims, memory::format::any);
+      filter.SetOpMemDesc(fwd_filter_dims, memory::format::any);
+      outbackprop.SetOpMemDesc(outbprop_dims, memory::format::any);
+      output.SetOpMemDesc(bwd_output_dims, memory::format::any);
+
+      // Operator-specific call to create and execute primitive.
+      Tensor* output_tensor = nullptr;
+      CreatePrimitive(context, cpu_engine, fwd_pd, &input, &filter,
+                      &outbackprop, &output, &output_tensor,
+                      strides, padding_l, padding_r,
+                      TFPaddingToMklDnnPadding(padding_),
+                      bwd_output_dims, bwd_output_format);
+    } catch (mkldnn::error &e) {
+     string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) +
+                       ", in file " + string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+     OP_REQUIRES_OK(context, errors::Aborted("Operation received an exception:",
+                                            error_msg));
+    }
+  }
+
+  /// Pure virtual function to allow operator to check for validity of input
+  /// shapes. Function asserts that input shapes are valid.
+  virtual void ValidateMklShapes(const MklDnnShape& input_mkl_shape,
+                                 const MklDnnShape& filter_mkl_shape,
+                                 const MklDnnShape& outbprop_mkl_shape) = 0;
+
+  /// Operator-specific function that returns index of input that is
+  /// representing input sizes. For Conv2DBackpropFilter it returns 1 since
+  /// filter for this operator is filter shape. For Conv2DBackpropInput it
+  /// returns 0 (for input).
+  virtual size_t GetInputTensorIndexWithSizes() = 0;
+
+  /// Get TensorFlow shape of input tensor.
+  virtual TensorShape MakeInputTfShape(OpKernelContext* context,
+                                      const Tensor& input_tensor) = 0;
+
+  /// Get TensorFlow shape of filter tensor.
+  virtual TensorShape MakeFilterTfShape(OpKernelContext* context,
+                                       const Tensor& filter_tensor) = 0;
+
+  /// Get shape of output in MKL-DNN order. Computes shape of output from
+  /// input shape (fwd_input_dims) and filter shape (fwd_filter_dims).
+  virtual
+  const memory::dims& GetOutputDims(const memory::dims& fwd_input_dims,
+                                    const memory::dims& fwd_filter_dims) = 0;
+
+  /// Get data_format of output in MKL-DNN order. If output data format is
+  /// same as input data format, then it simply returns value of data_format
+  /// parameter as it is.
+  virtual memory::format GetOutputFormat(const memory::format data_format) = 0;
+
+  /// Create and execute the primitive storing output in the output_tensor.
+  virtual void CreatePrimitive(OpKernelContext* context,
+    const engine& cpu_engine,
+    const convolution_forward::primitive_desc& conv_fwd_pd,
+    MklDnnData<T>* input, MklDnnData<T>* filter, MklDnnData<T>* outbackprop,
+    MklDnnData<T>* output, Tensor** output_tensor, const memory::dims& strides,
+    const memory::dims& padding_l, const memory::dims& padding_r,
+    padding_kind padding, const memory::dims& bwd_output_dims,
+    memory::format bwd_output_format) = 0;
+
+  // Get the data_format {NCHW, NHWC}
+  TensorFormat GetTFDataFormat () { return data_format_; }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  TensorFormat data_format_;
+};
 #endif  // INTEL_MKL_DNN
 
+/////////////////////////////////////////////////////////////////////
+///  Dummy Mkl op that is just used for operators that are intermediate
+///  output of node fusion in the graph
+/////////////////////////////////////////////////////////////////////
+
+template <typename Device, typename T>
+class MklDummyOp : public OpKernel {
+ public:
+  ~MklDummyOp() {}
+
+  explicit MklDummyOp(OpKernelConstruction* context) :
+    OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TF_CHECK_OK(errors::Unimplemented("This is a dummy op."
+                                      "It should not have been invoked."));
+  }
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index bc9e906c39a9a7f5f4b2ae83afc6774aecb38c48..a761562a4b9966d3dbd8bede2f64e6eb0546b42e 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -25,10 +25,24 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::use_scale_shift;
+using mkldnn::use_global_stats;
+using mkldnn::batch_normalization_forward;
+using mkldnn::batch_normalization_backward;
+#endif
+
 // TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
+
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklFusedBatchNormOp : public OpKernel {
  public:
@@ -46,7 +60,6 @@ class MklFusedBatchNormOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     MklFusedBatchNormOpContext mkl_context;
-
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& scale = MklGetInput(context, 1);
     const Tensor& shift = MklGetInput(context, 2);
@@ -55,6 +68,7 @@ class MklFusedBatchNormOp : public OpKernel {
 
     GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
     bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
+
     if (!input_in_mkl_format) {
       OP_REQUIRES(context, input.dims() == 4,
                   errors::InvalidArgument("input must be 4-dimensional",
@@ -69,10 +83,12 @@ class MklFusedBatchNormOp : public OpKernel {
     OP_REQUIRES(context, est_mean.dims() == 1,
                 errors::InvalidArgument("estimated_mean must be 1-dimensional",
                                         est_mean.shape().DebugString()));
+
     OP_REQUIRES(
         context, est_variance.dims() == 1,
         errors::InvalidArgument("estimated_variance must be 1-dimensional",
                                 est_variance.shape().DebugString()));
+
     if (is_training_) {
       OP_REQUIRES(context, est_mean.dim_size(0) == 0,
                   errors::InvalidArgument("estimated_mean empty for training",
@@ -258,7 +274,6 @@ class MklFusedBatchNormOp : public OpKernel {
             E_SUCCESS);
       }
     }
-
     void MklPrepareContextInputs(OpKernelContext* context,
                                  Tensor* mkl_tmp_input_buf_tensor,
                                  Tensor* mkl_tmp_scale_shift_buf_tensor) {
@@ -325,15 +340,6 @@ class MklFusedBatchNormOp : public OpKernel {
   } MklFusedBatchNormOpContext;
 };
 
-#define REGISTER_MKL_CPU(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklFusedBatchNormOp<CPUDevice, T>);
-TF_CALL_float(REGISTER_MKL_CPU);
-#undef REGISTER_MKL_CPU
-
 template <typename Device, typename T>
 class MklFusedBatchNormGradOp : public OpKernel {
  public:
@@ -595,7 +601,7 @@ class MklFusedBatchNormGradOp : public OpKernel {
       mkl_res_batchnorm_bwd[dnnResourceSrc] =
           (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
 
-      bool mkl_convert_out_backprop;
+     bool mkl_convert_out_backprop;
       dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
       dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
       void* mkl_buf_converted_out_backprop = nullptr;
@@ -675,6 +681,628 @@ class MklFusedBatchNormGradOp : public OpKernel {
     }
   } MklFusedBatchNormGradOpContext;
 };
+#endif
+
+#ifdef INTEL_MKL_DNN
+
+template <typename Device, typename T>
+class MklFusedBatchNormOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t src_index = 0;    // index of src input tensor
+      const size_t scale_index = 1;  // index of scale tensor
+      const size_t shift_index = 2;  // index of shift tensor
+      const size_t mean_index = 3;   // index of est_mean tensor
+      const size_t var_index = 4;    // index of est_variance tensor
+
+      const Tensor& src_tensor          = MklGetInput(context, src_index);
+      const Tensor& scale_tensor        = MklGetInput(context, scale_index);
+      const Tensor& shift_tensor        = MklGetInput(context, shift_index);
+      const Tensor& est_mean_tensor     = MklGetInput(context, mean_index);
+      const Tensor& est_variance_tensor = MklGetInput(context, var_index);
+
+      MklDnnShape dnn_shape_src;
+      GetMklShape(context, src_index, &dnn_shape_src);
+
+      if (dnn_shape_src.IsMklTensor()) {
+        OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        src_tensor.shape().DebugString()));
+      } else {
+        OP_REQUIRES(context, src_tensor.dims() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        src_tensor.shape().DebugString()));
+      }
+      OP_REQUIRES(context, scale_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "scale must be 1-dimensional",
+                      scale_tensor.shape().DebugString()));
+      OP_REQUIRES(context, shift_tensor.dims() == 1,
+                  errors::InvalidArgument("offset must be 1-dimensional",
+                                        shift_tensor.shape().DebugString()));
+      OP_REQUIRES(context, est_mean_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "estimated_mean must be 1-dimensional",
+                      est_mean_tensor.shape().DebugString()));
+      OP_REQUIRES(context, est_variance_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "estimated_variance must be 1-dimensional",
+                      est_variance_tensor.shape().DebugString()));
+
+      if (is_training_) {
+        OP_REQUIRES(context, est_mean_tensor.dim_size(0) == 0,
+                    errors::InvalidArgument(
+                        "estimated_mean must be empty for training",
+                        est_mean_tensor.shape().DebugString()));
+        OP_REQUIRES(context, est_variance_tensor.dim_size(0) == 0,
+                    errors::InvalidArgument(
+                        "estimated_variance must be empty for training",
+                        est_variance_tensor.shape().DebugString()));
+      }
+
+      if (dnn_shape_src.IsMklTensor())
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      else
+        ExtractParams(context);
+
+      // Indices of output tensors
+      const size_t dst_index = 0;
+      const size_t batch_mean_index = 1;
+      const size_t batch_variance_index = 2;
+      const size_t saved_mean_index = 3;
+      const size_t saved_variance_index = 4;
+
+      // allocate batch mean output tensor
+      Tensor* batch_mean_tensor = nullptr;
+      MklDnnShape mkl_shape_batch_mean;
+      mkl_shape_batch_mean.SetMklTensor(false);
+      AllocateOutputSetMklShape(context,
+                                batch_mean_index,
+                                &batch_mean_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_batch_mean);
+      CHECK_NOTNULL(batch_mean_tensor);
+
+      // Batch variance
+      Tensor* batch_variance_tensor = nullptr;
+      MklDnnShape mkl_shape_batch_variance;
+      mkl_shape_batch_variance.SetMklTensor(false);
+      AllocateOutputSetMklShape(context,
+                                batch_variance_index,
+                                &batch_variance_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_batch_variance);
+      CHECK_NOTNULL(batch_variance_tensor);
+
+      if (is_training_)
+        SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor);
+      else
+        SetMeanVariance(est_mean_tensor, est_variance_tensor);
+
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      memory::format format_m;
+      if (dnn_shape_src.IsMklTensor()) {
+        if (dnn_shape_src.IsTensorInNCHWFormat()) {
+          format_m = memory::format::nchw;
+        } else {
+          format_m = memory::format::nhwc;
+        }
+      } else {
+        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+      }
+
+      // set src primitive
+      memory::dims src_dims;
+      if (dnn_shape_src.IsMklTensor()) {
+        src_dims = TFShapeToMklDnnDimsInNCHW(dnn_shape_src.GetTfShape(),
+                                             tensor_format_);
+      } else {
+        src_dims = TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+                                             tensor_format_);
+      }
+
+      auto src_md = dnn_shape_src.IsMklTensor()
+                    ? dnn_shape_src.GetMklLayout()
+                    : memory::desc(src_dims, MklDnnType<T>(), format_m);
+      src.SetUsrMem(src_md, &src_tensor);
+
+      // set weights primitive
+      // MKL-DNN packs scale & shift as "weights":
+      // <scale>...<scale><shift>...<shift>
+      auto weights_desc = memory::desc({2, depth_},
+                                       MklDnnType<T>(),
+                                       memory::format::nc);
+      auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+      auto weights_m = memory(weights_pd);
+      T* weights_data = reinterpret_cast<T*>(
+                        weights_m.get_data_handle());
+      T* scale_tf = reinterpret_cast<T*>(
+                    const_cast<T*>(scale_tensor.flat<T>().data()));
+      T* shift_tf = reinterpret_cast<T*>(
+                    const_cast<T*>(shift_tensor.flat<T>().data()));
+
+      for (int k=0; k < depth_; k++) {
+        weights_data[k] = scale_tf[k];
+        weights_data[k + depth_] = shift_tf[k];
+      }
+
+      // Mean and variance (without Bessel's correction) saved for backward
+      // computation to serve as pre-computed mean and variance.
+      Tensor* saved_mean_tensor = nullptr;
+      MklDnnShape mkl_shape_saved_mean;
+      mkl_shape_saved_mean.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, saved_mean_index,
+                                &saved_mean_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_saved_mean);
+      CHECK_NOTNULL(saved_mean_tensor);
+
+      Tensor* saved_variance_tensor = nullptr;
+      MklDnnShape mkl_shape_saved_variance;
+      mkl_shape_saved_variance.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, saved_variance_index,
+                                &saved_variance_tensor,
+                                scale_tensor.shape(),
+                                mkl_shape_saved_variance);
+      CHECK_NOTNULL(saved_variance_tensor);
+
+      // set mean primitive
+      auto mean_desc = memory::desc({1, depth_},
+                                    MklDnnType<T>(),
+                                    memory::format::nc);
+      auto mean_pd = memory::primitive_desc(mean_desc, cpu_engine);
+      char* saved_mean_data_tf = reinterpret_cast<char*>
+                                 (saved_mean_tensor->flat<T>().data());
+      std::memcpy(saved_mean_data_tf,
+                  reinterpret_cast<char*>(mean_values_),
+                  depth_*sizeof(T));
+      auto mean_m = memory(mean_pd,
+                           reinterpret_cast<void*>(saved_mean_data_tf));
+
+      // set variance primitive
+      auto variance_desc = memory::desc({1, depth_},
+                                    MklDnnType<T>(),
+                                    memory::format::nc);
+      auto variance_pd = memory::primitive_desc(variance_desc, cpu_engine);
+      char* saved_variance_data_tf = reinterpret_cast<char*>
+                  (saved_variance_tensor->flat<T>().data());
+      std::memcpy(saved_variance_data_tf,
+                  reinterpret_cast<char*>(variance_values_),
+                  depth_*sizeof(T));
+      auto variance_m = memory(variance_pd, saved_variance_data_tf);
+
+      prop_kind pk = (is_training_) ?
+                     prop_kind::forward_training :
+                     prop_kind::forward_scoring;
+      auto bnrm_fwd_desc = batch_normalization_forward::desc(
+                               pk, src.GetUsrMemDesc(), epsilon_,
+                               is_training_ ? use_scale_shift :
+                               (use_scale_shift | use_global_stats));
+      auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+                             bnrm_fwd_desc, cpu_engine);
+
+      // allocate dst tensor
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      Tensor* dst_tensor = nullptr;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = bnrm_fwd_pd.dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+                                  src_dims, format_m);
+        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                                tf_shape_dst, dnn_shape_dst);
+
+      // Output of batchnorm has same shape as input.
+      dst.SetUsrMem(src_md, dst_tensor);
+
+      primitive bnrm_fwd_op;
+      if (is_training_) {
+        bnrm_fwd_op = batch_normalization_forward(
+                          bnrm_fwd_pd,
+                          src.GetOpMem(),
+                          weights_m,
+                          dst.GetOpMem(),
+                          mean_m,
+                          variance_m);
+      } else {
+        bnrm_fwd_op = batch_normalization_forward(
+                          bnrm_fwd_pd,
+                          src.GetOpMem(),
+                          mean_m,
+                          variance_m,
+                          (const primitive::at) weights_m,
+                          dst.GetOpMem());
+      }
+      std::vector<primitive> net;
+      net.push_back(bnrm_fwd_op);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // copy batch_mean data
+      T* batch_mean_data_tf = reinterpret_cast<T*>(
+                                batch_mean_tensor->flat<T>().data());
+      std::memcpy(reinterpret_cast<char*>(batch_mean_data_tf),
+                  reinterpret_cast<char*>(mean_m.get_data_handle()),
+                  depth_*sizeof(T));
+
+      // copy batch_variance data with Bessel's correction
+      // if training mode is on
+      float adjust_factor = 1.0;
+      if (is_training_) {
+        size_t orig_size = src_dims[0] * src_dims[2] * src_dims[3];
+        size_t adjust_size = orig_size - 1;
+        adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
+      }
+      T* batch_variance_data_tf = reinterpret_cast<T*>(
+                                  batch_variance_tensor->flat<T>().data());
+      for (int k=0; k < depth_; k++)
+        batch_variance_data_tf[k] =
+            (reinterpret_cast<T*>(variance_m.get_data_handle()))[k]
+            * adjust_factor;
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) +
+                         ", in file " + string(__FILE__) + ":" +
+                         std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                     errors::Aborted("Operation received an exception:",
+                     error_msg));
+    }
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  bool is_training_;
+  T* mean_values_;
+  T* variance_values_;
+  size_t depth_;          // batch normalization is done for per channel.
+
+  void ExtractParams(OpKernelContext* context) {
+    const Tensor& input = MklGetInput(context, 0);
+    depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+  }
+
+  void SetMeanVariance(const Tensor& mean, const Tensor& variance) {
+    mean_values_ = reinterpret_cast<T*>(
+                       const_cast<T*>(mean.flat<T>().data()));
+    variance_values_ = reinterpret_cast<T*>(
+                       const_cast<T*>(variance.flat<T>().data()));
+  }
+};
+
+
+template <typename Device, typename T>
+class MklFusedBatchNormGradOp : public OpKernel {
+ public:
+  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    float epsilon;
+    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
+    epsilon_ = T(epsilon);
+    string tensor_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
+    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+
+      const size_t diff_dst_index = 0;  // index of diff_dst tensor
+      const size_t src_index = 1;       // index of src input tensor
+      const size_t scale_index = 2;     // index of scale tensor
+      const size_t mean_index = 3;      // index of saved_mean tensor
+      const size_t variance_index = 4;  // index of saved_variance tensor
+      const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+      const Tensor& src_tensor = MklGetInput(context, src_index);
+      const Tensor& scale_tensor = MklGetInput(context, scale_index);
+      const Tensor& saved_mean_tensor = MklGetInput(context, mean_index);
+      const Tensor& saved_variance_tensor = MklGetInput(context,
+                                            variance_index);
+
+      MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+      GetMklShape(context, src_index, &dnn_shape_src);
+      GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+      if (dnn_shape_diff_dst.IsMklTensor()) {
+        OP_REQUIRES(context, dnn_shape_diff_dst.GetDimension() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        diff_dst_tensor.shape().DebugString()));
+      } else {
+        OP_REQUIRES(context, diff_dst_tensor.dims() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        diff_dst_tensor.shape().DebugString()));
+      }
+
+      if (dnn_shape_src.IsMklTensor()) {
+        OP_REQUIRES(context, dnn_shape_src.GetDimension() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                         src_tensor.shape().DebugString()));
+      } else {
+        OP_REQUIRES(context, src_tensor.dims() == 4,
+                    errors::InvalidArgument(
+                        "input must be 4-dimensional",
+                        src_tensor.shape().DebugString()));
+      }
+
+      OP_REQUIRES(context, scale_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "scale must be 1-dimensional",
+                      scale_tensor.shape().DebugString()));
+      OP_REQUIRES(context, saved_mean_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "saved mean must be 1-dimensional",
+                       saved_mean_tensor.shape().DebugString()));
+
+      OP_REQUIRES(context, saved_variance_tensor.dims() == 1,
+                  errors::InvalidArgument(
+                      "saved variance must be 1-dimensional",
+                      saved_variance_tensor.shape().DebugString()));
+
+      if (dnn_shape_src.IsMklTensor())
+        depth_ = dnn_shape_src.DimSize(MklDnnDims::Dim_C);
+      else
+        ExtractParams(context);
+
+      memory::format format_m;
+      if (dnn_shape_src.IsMklTensor()) {
+        if (dnn_shape_src.IsTensorInNCHWFormat())
+          format_m = memory::format::nchw;
+        else
+          format_m = memory::format::nhwc;
+      } else {
+        format_m = TFDataFormatToMklDnnDataFormat(tensor_format_);
+      }
+
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> mean(&cpu_engine);
+      MklDnnData<T> variance(&cpu_engine);
+      MklDnnData<T> diff_dst(&cpu_engine);
+      MklDnnData<T> diff_src(&cpu_engine);
+
+      memory::dims src_dims, diff_dst_dims;
+      if (dnn_shape_src.IsMklTensor())
+        src_dims = TFShapeToMklDnnDimsInNCHW(
+                       dnn_shape_src.GetTfShape(), tensor_format_);
+      else
+        src_dims = TFShapeToMklDnnDimsInNCHW(
+                       src_tensor.shape(), tensor_format_);
+
+      if (dnn_shape_diff_dst.IsMklTensor())
+        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+                            dnn_shape_diff_dst.GetTfShape(),
+                            tensor_format_);
+      else
+        diff_dst_dims = TFShapeToMklDnnDimsInNCHW(
+                            diff_dst_tensor.shape(),
+                            tensor_format_);
+
+      // set src and diff_dst primitives
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        if (dnn_shape_src.IsMklTensor()) {
+          src_md = dnn_shape_src.GetMklLayout();
+          diff_dst_md = src_md;
+        } else {
+          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+          src_md = diff_dst_md;
+        }
+      } else {
+        src_md =  memory::desc(src_dims, MklDnnType<T>(), format_m);
+        diff_dst_md = src_md;
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+      diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+      // weights -- DNN packs scales/shifts as weights in order of
+      // scale, ..., scale, shift, ..., shift
+      auto weights_desc = memory::desc({2, depth_},
+                                       MklDnnType<T>(),
+                                       memory::format::nc);
+      auto weights_pd = memory::primitive_desc(weights_desc, cpu_engine);
+      auto weights_m = memory(weights_pd);
+      T* weights_data = reinterpret_cast<T*>(weights_m.get_data_handle());
+      T* scale_tf = reinterpret_cast<T*>(const_cast<T*>
+                                        (scale_tensor.flat<T>().data()));
+      for (int k=0; k < depth_; k++) {
+        weights_data[k] = scale_tf[k];
+        weights_data[k + depth_] = 0;
+      }
+
+      // set mean primitive
+      memory::dims mv_dims = GetMeanVarianceDims();
+      mean.SetUsrMem(mv_dims,
+                     memory::format::nc,
+                     const_cast<void*>(static_cast<const void*>
+                     (saved_mean_tensor.flat<T>().data())));
+      mean.SetOpMemDesc(mv_dims, memory::format::nc);
+
+      // set variance primitive
+      variance.SetUsrMem(mv_dims,  memory::format::nc,
+                         const_cast<void*>(static_cast<const void*>
+                         (saved_variance_tensor.flat<T>().data())));
+      variance.SetOpMemDesc(mv_dims, memory::format::nc);
+
+      // set diff_weight primitive
+      auto diff_weights_desc = memory::desc(
+                                 {2, depth_},
+                                 MklDnnType<T>(),
+                                 memory::format::nc);
+      auto diff_weights_pd = memory::primitive_desc(
+                                diff_weights_desc,
+                                cpu_engine);
+      auto diff_weights_m = memory(diff_weights_pd);
+
+      auto bnrm_fwd_desc = batch_normalization_forward::desc(
+                                prop_kind::forward_training,
+                                src.GetUsrMemDesc(),
+                                epsilon_,
+                                use_scale_shift);
+      auto bnrm_fwd_pd = batch_normalization_forward::primitive_desc(
+                                bnrm_fwd_desc,
+                                cpu_engine);
+
+      // Indices of output tensors
+      const size_t diff_src_index = 0;    // index of diff_src tensor
+      const size_t diff_scale_index = 1;  // index of diff_scale tensor
+      const size_t diff_shift_index = 2;  // index of diff_shift tensor
+      const size_t p1_index = 3;  // index of 1st placeholder tensor
+      const size_t p2_index = 4;  // index of 2nd placeholder tensor
+
+      // allocate diff_src tensor
+      MklDnnShape dnn_shape_diff_src;
+      TensorShape tf_shape_diff_src;
+      Tensor* diff_src_tensor = nullptr;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_diff_src.SetMklTensor(true);
+        auto diff_src_pd = bnrm_fwd_pd.dst_primitive_desc();
+        dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+        dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+        dnn_shape_diff_src.SetTfLayout(
+                              dnn_shape_src.GetDimension(),
+                              src_dims,
+                              format_m);
+        dnn_shape_diff_src.SetTfDimOrder(
+                              dnn_shape_src.GetDimension(),
+                              tensor_format_);
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_diff_src.SetMklTensor(false);
+        tf_shape_diff_src = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                                tf_shape_diff_src, dnn_shape_diff_src);
+
+      diff_src.SetUsrMem(src_md, diff_src_tensor);
+
+      prop_kind pk = prop_kind::backward;
+      auto bnrm_bwd_desc = batch_normalization_backward::desc(
+                               pk,
+                               diff_src.GetUsrMemDesc(),
+                               src.GetUsrMemDesc(),
+                               epsilon_,
+                               use_scale_shift);
+      auto bnrm_bwd_pd = batch_normalization_backward::primitive_desc(
+                               bnrm_bwd_desc,
+                               cpu_engine,
+                               bnrm_fwd_pd);
+
+      auto bnrm_bwd_op = batch_normalization_backward(
+                               bnrm_bwd_pd,
+                               src.GetOpMem(),
+                               mean.GetOpMem(),
+                               variance.GetOpMem(),
+                               diff_dst.GetOpMem(),
+                               weights_m,
+                               diff_src.GetOpMem(),
+                               diff_weights_m);
+
+      std::vector<primitive> net;
+      net.push_back(bnrm_bwd_op);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // separate out scale and shift grad and copy to individual tensors
+      const TensorShape& tf_shape_scale_shift = scale_tensor.shape();
+      Tensor* diff_scale_tensor = nullptr;
+      MklDnnShape mkl_shape_diff_scale;
+      mkl_shape_diff_scale.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, diff_scale_index, &diff_scale_tensor,
+                                tf_shape_scale_shift, mkl_shape_diff_scale);
+
+      Tensor* diff_shift_tensor = nullptr;
+      MklDnnShape mkl_shape_diff_shift;
+      mkl_shape_diff_shift.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, diff_shift_index, &diff_shift_tensor,
+                                tf_shape_scale_shift, mkl_shape_diff_shift);
+
+      // copy data: diff_scale and diff_shift
+      T* diff_weights_data_dnn = reinterpret_cast<T*>
+                                 (diff_weights_m.get_data_handle());
+      float* diff_scale_data_tf = const_cast<float*>(
+             static_cast<const float*>(diff_scale_tensor->flat<T>().data()));
+      float* diff_shift_data_tf = const_cast<float*>(
+             static_cast<const float*>(diff_shift_tensor->flat<T>().data()));
+      for (int i = 0; i < depth_; i++) {
+        diff_scale_data_tf[i] = diff_weights_data_dnn[i];
+        diff_shift_data_tf[i] = diff_weights_data_dnn[i + depth_];
+      }
+
+      // Placeholders for estimated_mean and estimated_variance, which are
+      // used for inference and thus not needed here for gradient computation.
+      Tensor* p1_tensor = nullptr, *p2_tensor = nullptr;
+      MklDnnShape mkl_shape_p;
+      mkl_shape_p.SetMklTensor(false);
+      AllocateOutputSetMklShape(context, p1_index, &p1_tensor,
+                                TensorShape({}), mkl_shape_p);
+      AllocateOutputSetMklShape(context, p2_index, &p2_tensor,
+                                TensorShape({}), mkl_shape_p);
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                          ", message: " + string(e.message) +
+                          ", in file " + string(__FILE__) + ":" +
+                          std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                     errors::Aborted("Operation received an exception:",
+                     error_msg));
+    }
+  }
+
+ private:
+  T epsilon_;
+  TensorFormat tensor_format_;
+  int depth_;             // batch normalization is done for per channel.
+
+  void ExtractParams(OpKernelContext* context) {
+      const Tensor& input = MklGetInput(context, 0);
+      depth_ = static_cast<int>(GetTensorDim(input, tensor_format_, 'C'));
+  }
+
+  memory::dims GetMeanVarianceDims() {
+    return memory::dims({1, depth_});
+  }
+};
+
+#endif
+
+#define REGISTER_MKL_CPU(T)                                         \
+  REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklFusedBatchNormOp<CPUDevice, T>);
+TF_CALL_float(REGISTER_MKL_CPU);
+#undef REGISTER_MKL_CPU
 
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNormGrad")            \
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index f31e7afd46873a02c10277283862a7e5e2384803..9ee27ee21c8d23c8ce314a7687ac9b79a1d9ea30 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -28,8 +28,15 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklIdentityOp : public OpKernel {
  public:
@@ -50,6 +57,32 @@ class MklIdentityOp : public OpKernel {
   bool IsExpensive() override { return false; }
 };
 
+#else
+
+template <typename Device, typename T>
+class MklIdentityOp : public OpKernel {
+ public:
+  explicit MklIdentityOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    MklDnnShape dnn_shape_input;
+    const int kInputIdx = 0, kOutputIdx = 0;
+    GetMklShape(context, kInputIdx, &dnn_shape_input);
+
+    if (dnn_shape_input.IsMklTensor()) {
+      ForwardMklTensorInToOut(context, kInputIdx, kOutputIdx);
+    } else {
+      ForwardTfTensorInToOut(context, kInputIdx, kOutputIdx);
+    }
+  }
+
+  // TensorFlow's IdentityOp has the following member function, so kept it
+  // as it is.
+  bool IsExpensive() override { return false; }
+};
+
+#endif
+
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklIdentity")                      \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index b58e44e39800c8c047d5557ab3c84113bb78d3ca..001834b13bdd64ffd0d536897fbc4a170c4c4117 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -31,6 +31,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/mkl_tfconv_op.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
@@ -44,15 +50,16 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 // else if both inputs are in mkl format:
 //   if both have the same shape:
 //     pass the inputs through to the output
-// 	else:
-// 		convert both to TF
+//   else:
+//     convert both to TF
 // else if one is TF and one is MKL:
-// 	if broadcast is needed:
-// 		convert the MKL format input to TF format
-// 	else:
-// 		convert the TF format input to MKL format
+//   if broadcast is needed:
+//     convert the MKL format input to TF format
+//   else:
+//     convert the TF format input to MKL format
 ///////////////////////////////////////////////////////////
 
+#ifndef INTEL_MKL_DNN
 template <typename Device, typename T>
 class MklInputConversionOp : public OpKernel {
  public:
@@ -242,6 +249,199 @@ class MklInputConversionOp : public OpKernel {
   bool has_avx512f_ = false;
 };
 
+#else
+
+template <typename Device, typename T>
+class MklInputConversionOp : public OpKernel {
+ public:
+  explicit MklInputConversionOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES_OK(context, context->GetAttr("T", &op_data_type));
+    has_avx512f_ = port::TestCPUFeature(port::CPUFeature::AVX512F);
+  }
+
+ private:
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    MklDnnShape input_shape_0;
+    GetMklShape(context, 0, &input_shape_0);
+
+    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    MklDnnShape input_shape_1;
+    GetMklShape(context, 1, &input_shape_1);
+
+    bool tf_shapes_are_same = context->input(0).shape() ==
+                              context->input(1).shape();
+
+    VLOG(1) << "MklInputConversionOp: Input shapes are "
+            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
+            << context->input(0).shape().DebugString() << " and "
+            << context->input(1).shape().DebugString();
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // if both inputs are in TF format, just copy input tensors to output.
+    if (!input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+      VLOG(1) << "MklInputConversionOp: No conversion needed, "
+              << "copying TF inputs to output";
+
+      ForwardTfTensorInToOut(context, 0, 0);
+      ForwardTfTensorInToOut(context, 1, 1);
+      return;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // If both inputs are in MKL format
+    if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+      // If both have the same shape, pass them through
+      if (tf_shapes_are_same) {
+        VLOG(1) << "MklInputConversionOp: No conversion needed, "
+                << "copying MKL inputs with identical shapes to output";
+
+        ForwardMklTensorInToOut(context, 0, 0);
+        ForwardMklTensorInToOut(context, 1, 1);
+        return;
+      }
+
+      // Sanity check
+      bool mkl_shapes_are_same = input_shape_0 == input_shape_1;
+      if (mkl_shapes_are_same) {
+        CHECK(false) << "MklInputConversionOp: Unexpected: TF shapes are "
+                        "different but MKL shapes are same";
+      }
+
+      // Both have different shapes, so broadcast will be necessary.
+      // Convert to TF and pass both tensors through (we can't do broadcast
+      // with MKL tensors)
+      VLOG(1) << "MklInputConversionOp: Broadcast needed, "
+              << "converted MKL inputs to TF format";
+
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_, 0);
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_, 1);
+      SetDummyMklShapeOutput(context, 0);
+      SetDummyMklShapeOutput(context, 1);
+      return;
+    }
+
+    // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    // One input is MKL and one is TF. If no broadcast is needed, convert
+    // the TF tensor to MKL, otherwise convert the MKL tensor to TF format
+    VLOG(1) << "MklInputConversionOp: Inputs in different formats (MKL/TF)";
+
+    const Tensor* mkl_tensor;
+    const MklDnnShape* mkl_shape;
+    const Tensor* tf_tensor;
+    MklDnnShape* tf_mkl_shape;
+    uint mkl_tensor_index;
+    uint tf_tensor_index;
+    if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
+      mkl_tensor = &input_tensor_0;
+      mkl_shape = &input_shape_0;
+      mkl_tensor_index = 0;
+      tf_tensor = &input_tensor_1;
+      tf_mkl_shape = &input_shape_1;
+      tf_tensor_index = 1;
+    } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
+      mkl_tensor = &input_tensor_1;
+      mkl_shape = &input_shape_1;
+      mkl_tensor_index = 1;
+      tf_tensor = &input_tensor_0;
+      tf_mkl_shape = &input_shape_0;
+      tf_tensor_index = 0;
+    } else {
+      CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
+                      "shapes for MKL "
+                   << "element-wise op";
+    }
+
+    // Broadcast is needed if the shapes are not the same
+    bool broadcast_needed;
+
+    size_t in0_size = 1;
+    for (size_t i = 0; i < mkl_shape->GetDimension(); ++i)
+      in0_size *= mkl_shape->TfDimSize(i);
+
+    size_t in1_size = 1;
+    for (size_t i = 0; i < tf_tensor->shape().dims(); ++i)
+      in1_size *= tf_tensor->shape().dim_size(i);
+
+    broadcast_needed = (in0_size != in1_size);
+
+    if (!broadcast_needed) {
+      // Both shapes are same, convert the TF input to MKL
+      VLOG(1) << "MklInputConversionOp: No broadcast needed.";
+      VLOG(1) << "MklInputConversionOp: Converting input " << tf_tensor_index
+              << " to MKL format";
+
+      // Create MklDnnShape for output Mkl tensor.
+      Tensor* tensor_out;
+      MklDnnShape mkl_output_mkl_shape;
+      mkl_output_mkl_shape.SetMklTensor(true);
+      mkl_output_mkl_shape.SetElemType(MklDnnType<T>());
+      mkl_output_mkl_shape.SetTfLayout(mkl_shape->GetDimension(),
+                                       mkl_shape->GetSizesAsMklDnnDims(),
+                                       mkl_shape->GetTfDataFormat());
+      // ** Temporarily borrow the layout from the MKL input **
+      auto output_mkl_md = mkl_shape->GetMklLayout();
+      mkl_output_mkl_shape.SetMklLayout(&output_mkl_md);
+
+      // Create output Mkl tensor
+      AllocateOutputSetMklShape(context, tf_tensor_index, &tensor_out,
+                                mkl_tensor->shape(), mkl_output_mkl_shape);
+
+      // Create MklDnnData object for input tensor. Input tensor is in
+      // Tensorflow layout.
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> tf_input(&cpu_engine);
+      auto input_tf_md = mkl_output_mkl_shape.GetTfLayout();
+      tf_input.SetUsrMem(input_tf_md, &tf_tensor);
+
+      // Create reorder between tensorflow layout and Mkl layout.
+      std::vector<primitive> net;
+      CHECK_EQ(tf_input.CheckReorderToOpMem(memory::primitive_desc(
+                                            output_mkl_md, cpu_engine),
+                                            tensor_out, &net),
+               true);
+      stream(stream::kind::eager).submit(net).wait();
+
+      // -- The tensor in MKL format passes through --
+      ForwardMklTensorInToOut(context, mkl_tensor_index, mkl_tensor_index);
+    } else {
+      // Broadcast is needed, so convert the MKL input to TF
+      VLOG(1) << "MklInputConversionOp: Broadcast needed.";
+      VLOG(1) << "MklInputConversionOp: Converting input " << mkl_tensor_index
+              << " to TF format";
+      MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
+                                           op_data_type, has_avx512f_,
+                                           mkl_tensor_index);
+      SetDummyMklShapeOutput(context, mkl_tensor_index);
+
+      // The tensor in TF format passes through
+      ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
+    }
+
+    VLOG(1) << "MklInputConversionOp: Shapes (output): "
+            << context->mutable_output(0)->shape().DebugString() << " and "
+            << context->mutable_output(1)->shape().DebugString();
+
+    VLOG(1) << "MklInputConversion completed successfully.";
+  }
+
+ private:
+  /// Data format of the operation
+  string data_format_str;
+
+  /// Data type of the operation
+  DataType op_data_type;
+
+  /// CPUIDInfo
+  bool has_avx512f_ = false;
+};
+
+#endif
+
 ///////////////////////////////////////////////////////////
 //               Register kernel
 ///////////////////////////////////////////////////////////
@@ -253,7 +453,10 @@ class MklInputConversionOp : public OpKernel {
                               .Label(mkl_op_registry::kMklOpLabel), \
                           MklInputConversionOp<CPUDevice, T>);
 
-TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+// TODO(nhasabni): We cannot support all number types since MklDnn does
+// not support types.
+// TF_CALL_NUMBER_TYPES(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index aa08e93924c588cfb5b4a22a20055e5c74a43b3a..227765e46d649eb0637f8e31a2ea4a0bf90f0c1a 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc. This opkernel uses MKL library, create MKL
 // layout and primitives, use MKL dnn primitives to compute local
 // response normalization
-
+#undef INTEL_MKL
 #ifdef INTEL_MKL
 
 #define EIGEN_USE_THREADS
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 846bb5710ded92c303567e4078c49a56b3746706..de4d7d2e729e0b1dec876ec6f7915acd88bf9167 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -16,17 +16,32 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 #define EIGEN_USE_THREADS
-
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
+#ifdef INTEL_MKL_DNN
+#include <algorithm>
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::error;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::padding_kind;
+using mkldnn::engine;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
+// For now, MKL-ML is default. So making MKL-DNN not a default choice.
+#ifndef INTEL_MKL_DNN
+
 // An implementation of MaxPooling (forward).
 template <typename Device, typename T>
 class MklMaxPoolingOp : public OpKernel {
@@ -475,8 +490,348 @@ class MklMaxPoolingGradOp : public OpKernel {
   TensorFormat data_format_;
 
   bool workspace_enabled_;
+};  // MklMaxPoolingGradOp
+
+#else  // INTEL_MKL_DNN is defined
+
+// An implementation of MaxPooling (forward).
+template <typename Device, typename T>
+class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
+ public:
+  explicit MklMaxPoolingOp(OpKernelConstruction* context)
+            : MklPoolingForwardOpBase<T>(context) {
+    // In Max Pooling, MKLDNN does not allow passing workspace as NULL.
+    // So we set workspace_enabled_ to true.
+    this->workspace_enabled_ = true;
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const Tensor& input_tensor = MklGetInput(context,
+                this->kInputTensorIndexInput);
+      MklDnnShape dnn_shape_input;
+      GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input);
+      this->SanityCheckInput(context, input_tensor, dnn_shape_input);
+      if (!context->status().ok()) return;
+
+      MklDnnData<T> dnn_data_input(&cpu_engine);
+      MklDnnData<T> dnn_data_output(&cpu_engine);
+      MklDnnData<T> dnn_data_wksp(&cpu_engine);
+
+      // initialize variables for the pooling op
+      MklPoolParameters pool_params;
+      // Get the input tensor and initialize the pooling parameters
+      this->ConfigureInput(context, dnn_shape_input,
+                        input_tensor, &pool_params,
+                        &dnn_data_input);
+      OP_REQUIRES_OK(context, context->status());
+
+      // Declare output tensor
+      Tensor* output_tensor = nullptr;
+      memory::dims output_dims_mkl_order;
+      this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+      // If input is in Mkl layout, then just get the memory format from it
+      // directly, instead of using input data_format to MaxPool.
+      if (dnn_shape_input.IsMklTensor()) {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  static_cast<memory::format>(
+              dnn_data_input.GetUsrMemDesc().data.format));
+      } else {
+        dnn_data_output.SetUsrMem(output_dims_mkl_order,
+                                  this->data_format_mkldnn_);
+      }
+
+      // describe the memory layout; let mkl-dnn choose the best for the op
+      dnn_data_output.SetOpMemDesc(output_dims_mkl_order, memory::format::any);
+
+      auto pool_desc = pooling_forward::desc(prop_kind::forward,
+            algorithm::pooling_max,
+            dnn_data_input.GetUsrMemDesc(),
+            dnn_data_output.GetUsrMemDesc(),
+            memory::dims({  pool_params.row_stride,
+                            pool_params.col_stride}),
+            memory::dims({  pool_params.window_rows,
+                            pool_params.window_cols}),
+            memory::dims({  static_cast<int>(pool_params.pad_top),
+                            static_cast<int>(pool_params.pad_left)}),
+            memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                            static_cast<int>(pool_params.pad_right)}),
+            TFPaddingToMklDnnPadding(this->padding_));
+        auto pool_fwd_desc = pooling_forward::primitive_desc(pool_desc,
+            cpu_engine);
+
+      this->AllocateOutputTensor(context, pool_fwd_desc, output_dims_mkl_order,
+                            this->data_format_mkldnn_, &output_tensor);
+      OP_REQUIRES_OK(context, context->status());
+      dnn_data_output.SetUsrMemDataHandle(output_tensor);
+
+      AllocateWorkspaceTensor(context, pool_fwd_desc, &dnn_data_wksp);
+      OP_REQUIRES_OK(context, context->status());
+
+      this->PrepareAndExecuteNet(pool_fwd_desc, &dnn_data_input,
+                        &dnn_data_output, &dnn_data_wksp);
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+                        ", message: " + string(e.message) +
+                        ", in file " + string(__FILE__) + ":" +
+                        std::to_string(__LINE__);
+        OP_REQUIRES_OK(context,
+                        errors::Aborted("Compute received an exception:",
+                                         error_msg));
+    }
+  }  // Compute
+
+ private:
+    const int kOutputTensorIndexWorkspace = 1;
+
+    void AllocateWorkspaceTensor(OpKernelContext* context,
+                const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+                MklDnnData<T>* dnn_data_wksp) {
+        CHECK_NOTNULL(dnn_data_wksp);
+        Tensor* workspace_tensor = nullptr;
+        memory::primitive_desc workspace_pd
+                    = pool_fwd_prim_desc.workspace_primitive_desc();
+        size_t workspace_t_elems = this->GetNumTElements(workspace_pd);
+        MklDnnShape workspace_mkl_shape;
+        workspace_mkl_shape.SetMklTensor(false);
+        TensorShape workspace_tf_shape;
+        workspace_tf_shape.AddDim(workspace_t_elems);
+        AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace,
+                                &workspace_tensor,
+                                workspace_tf_shape, workspace_mkl_shape);
+        CHECK_NOTNULL(workspace_tensor);
+        dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor);
+    }
 };
 
+// The operation to compute MaxPool gradients.
+// It takes three inputs:
+//   - The original input tensor
+//   - The original output tensor
+//   - Backprop tensor for output
+// It produces one output: backprop tensor for input.
+template <class Device, class T>
+class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
+ public:
+  explicit MklMaxPoolingGradOp(OpKernelConstruction* context)
+      : MklPoolingBackwardOpBase<T>(context) {
+  }
+
+  void Compute(OpKernelContext* context) override {
+    try {
+        auto cpu_engine = engine(engine::cpu, 0);
+        const Tensor& orig_input_tensor = MklGetInput(context,
+            kInputTensorIndexOrigInput);
+        const Tensor& orig_output_tensor = MklGetInput(context,
+            kInputTensorIndexOrigOutput);
+        const Tensor& grad_tensor = MklGetInput(context,
+            kInputTensorIndexGradient);
+        const Tensor& workspace_tensor = MklGetInput(context,
+            kInputTensorIndexWorkspace);
+        MklDnnShape orig_input_mkl_shape,
+                    orig_output_mkl_shape,
+                    grad_mkl_shape,
+                    workspace_mkl_shape;
+        GetMklShape(context, kInputTensorIndexOrigInput,
+            &orig_input_mkl_shape);
+        GetMklShape(context, kInputTensorIndexOrigOutput,
+            &orig_output_mkl_shape);
+        GetMklShape(context, kInputTensorIndexGradient,
+            &grad_mkl_shape);
+        GetMklShape(context, kInputTensorIndexWorkspace,
+            &workspace_mkl_shape);
+
+        SanityCheckInputs(context,
+                            orig_input_tensor, orig_output_tensor,
+                            grad_tensor, workspace_tensor,
+                            orig_input_mkl_shape, orig_output_mkl_shape,
+                            grad_mkl_shape, workspace_mkl_shape);
+        if (!context->status().ok()) return;
+
+        MklDnnData<T> grad_dnn_data(&cpu_engine);
+        MklDnnData<T> workspace_dnn_data(&cpu_engine);
+        MklDnnData<T> output_dnn_data(&cpu_engine);
+        Tensor* output_tensor = nullptr;
+        MklPoolParameters pool_params;
+        TensorShape orig_input_shape;
+        memory::dims output_dims_mkl_order, orig_input_dims_mkl_order;
+        memory::desc original_input_md = ConfigureOriginalInput(context,
+                                orig_input_tensor,
+                                orig_input_mkl_shape,
+                                &orig_input_dims_mkl_order,
+                                &pool_params,
+                                &orig_input_shape);
+
+        memory::desc original_output_md = this->ConfigureOriginalOutput(
+                                pool_params,
+                                orig_output_mkl_shape,
+                                output_dims_mkl_order);
+
+        memory::desc target_diff_dst_md =  this->ConfigureInputGradient(
+                                        grad_mkl_shape,
+                                        grad_tensor,
+                                        &grad_dnn_data,
+                                        original_output_md);
+
+        output_dnn_data.SetUsrMem(original_input_md);
+
+        // Create the forward pooling primitive descriptor so we can
+        // pass it as a hint to the backward pooling primitive descriptor
+        auto pool_fwd_desc = pooling_forward::desc(prop_kind::forward,
+                algorithm::pooling_max,
+                original_input_md,
+                original_output_md,
+                memory::dims({  pool_params.row_stride,
+                                pool_params.col_stride}),
+                memory::dims({  pool_params.window_rows,
+                                pool_params.window_cols}),
+                memory::dims({  static_cast<int>(pool_params.pad_top),
+                                static_cast<int>(pool_params.pad_left)}),
+                memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                                static_cast<int>(pool_params.pad_right)}),
+                TFPaddingToMklDnnPadding(this->padding_));
+        auto pool_fwd_prim_desc
+                = pooling_forward::primitive_desc(pool_fwd_desc,
+                                                    cpu_engine);
+
+        auto pool_bkwd_desc = pooling_backward::desc(
+                algorithm::pooling_max,
+                output_dnn_data.GetUsrMemDesc(),
+                target_diff_dst_md,
+                memory::dims({  pool_params.row_stride,
+                                pool_params.col_stride}),
+                memory::dims({  pool_params.window_rows,
+                                pool_params.window_cols}),
+                memory::dims({  static_cast<int>(pool_params.pad_top),
+                                static_cast<int>(pool_params.pad_left)}),
+                memory::dims({  static_cast<int>(pool_params.pad_bottom),
+                                static_cast<int>(pool_params.pad_right)}),
+                TFPaddingToMklDnnPadding(this->padding_));
+        auto pool_bkwd_prim_desc
+            = pooling_backward::primitive_desc(pool_bkwd_desc,
+                                                cpu_engine,
+                                                pool_fwd_prim_desc);
+
+        this->AllocateOutputTensor(context, pool_bkwd_prim_desc,
+            orig_input_dims_mkl_order,
+            this->data_format_mkldnn_,
+            &output_tensor);
+        output_dnn_data.SetUsrMemDataHandle(output_tensor);
+
+        ConfigureWorkspace(workspace_tensor,
+                pool_fwd_prim_desc.workspace_primitive_desc(),
+                &workspace_dnn_data);
+        this->PrepareAndExecuteNet(pool_bkwd_prim_desc,
+                            &grad_dnn_data,
+                            &output_dnn_data,
+                            memory::primitive_desc(
+                                target_diff_dst_md,
+                                cpu_engine),
+                            &workspace_dnn_data);
+    } catch (mkldnn::error &e) {
+        string error_msg = "Status: " + std::to_string(e.status) +
+                        ", message: " + string(e.message) +
+                        ", in file " + string(__FILE__) + ":" +
+                        std::to_string(__LINE__);
+        OP_REQUIRES_OK(context,
+                        errors::Aborted("Compute received an exception:",
+                                         error_msg));
+    }
+  }  // Compute
+
+ private:
+    // .Input("orig_input: T")
+    // .Input("orig_output: T")
+    // .Input("grad: T")
+    // .Input("workspace: T")
+    const int kInputTensorIndexOrigInput = 0;
+    const int kInputTensorIndexOrigOutput = 1;
+    const int kInputTensorIndexGradient = 2;
+    const int kInputTensorIndexWorkspace = 3;
+    //  Output("output: T") in Base Class
+
+    memory::desc ConfigureOriginalInput(OpKernelContext* context,
+                                const Tensor& tensor_original_input,
+                                const MklDnnShape& original_input_mkl_shape,
+                                memory::dims* original_input_dims_mkl_order,
+                                MklPoolParameters* pool_params,
+                                TensorShape* input_tensor_shape) {
+        *input_tensor_shape = tensor_original_input.shape();
+        return MklPoolingBackwardOpBase<T>::ConfigureOriginalInput(
+                                        context,
+                                        tensor_original_input,
+                                        original_input_mkl_shape,
+                                        original_input_dims_mkl_order,
+                                        pool_params,
+                                        *input_tensor_shape);
+    }
+
+    void ConfigureWorkspace(const Tensor& workspace_tensor,
+                        memory::primitive_desc workspace_pd,
+                        MklDnnData<T> *workspace_dnn_data) {
+        CHECK_NOTNULL(workspace_dnn_data);
+
+        workspace_dnn_data->SetUsrMem(workspace_pd, &workspace_tensor);
+    }
+
+    void SanityCheckInputs(OpKernelContext* context,
+                            const Tensor& orig_input_tensor,
+                            const Tensor& orig_output_tensor,
+                            const Tensor& grad_tensor,
+                            const Tensor& workspace_tensor,
+                            const MklDnnShape& orig_input_mkl_shape,
+                            const MklDnnShape& orig_output_mkl_shape,
+                            const MklDnnShape& grad_mkl_shape,
+                            const MklDnnShape& workspace_mkl_shape) {
+        if (!orig_input_mkl_shape.IsMklTensor()) {
+            OP_REQUIRES(context, orig_input_tensor.dims() == 4,
+                errors::InvalidArgument("Original input shape must be "
+                "4-dimensional"));
+        } else {
+            OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
+                    errors::InvalidArgument("Original input shape must be "
+                    "4-dimensional"));
+        }
+        if (!orig_output_mkl_shape.IsMklTensor()) {
+            OP_REQUIRES(context, orig_output_tensor.dims() == 4,
+                errors::InvalidArgument("Original output must be "
+                        "4-dimensional"));
+        } else {
+            OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
+                    errors::InvalidArgument("Original output must be "
+                    "4-dimensional"));
+        }
+        if (!grad_mkl_shape.IsMklTensor()) {
+            OP_REQUIRES(context, grad_tensor.dims() == 4,
+                errors::InvalidArgument("Gradient must be 4-dimensional"));
+        } else {
+            OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
+                    errors::InvalidArgument("Gradient must be "
+                    "4-dimensional"));
+        }
+        if (this->workspace_enabled_){
+            // The workspace should not be an MKL tensor
+            OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
+                    errors::InvalidArgument("Workspace tensor should not"
+                                            " be an MKL Tensor."));
+            // It should only have one dimension
+            OP_REQUIRES(context, workspace_tensor.dims() == 1,
+                    errors::InvalidArgument("Workspace tensor must be "
+                                "1-dimensional"));
+        } else {
+            OP_REQUIRES(context, this->workspace_enabled_,
+                    errors::Unimplemented("MKL-DNN Max Pooling does not "
+                                "yet support the use case "
+                                "where MaxPoolGrad is called without first"
+                                " calling MaxPool."));
+        }
+    }
+};  // MklMaxPoolingGradOp
+
+#endif  // INTEL_MKL_DNN
+
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 65e8852cfb11a2dd78395860a7ca7b2cc550be34..f7cadffd39c11bdedaca6a07e48f222e7ac5e0cb 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -14,10 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #ifdef INTEL_MKL
+
 #include <vector>
+#include <limits>
 #include "tensorflow/core/kernels/mkl_pooling_ops_common.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
@@ -39,6 +42,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
   Init(context, ksize, stride, padding, data_format);
 }
 
+#ifndef INTEL_MKL_DNN
 // Initialization for MKL format
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -53,7 +57,22 @@ void MklPoolParameters::Init(OpKernelContext* context,
 
   Init(context, ksize, stride, padding, data_format);
 }
+#else
+// Initialization for MKL format
+void MklPoolParameters::Init(OpKernelContext* context,
+                             const std::vector<int32>& ksize,
+                             const std::vector<int32>& stride, Padding padding,
+                             TensorFormat data_format,
+                             const MklDnnShape* mklInputShape) {
+  // Get the input sizes
+  depth = mklInputShape->GetDimension('C');
+  tensor_in_cols = mklInputShape->GetDimension('W');
+  tensor_in_rows = mklInputShape->GetDimension('H');
+  tensor_in_batch = mklInputShape->GetDimension('N');
 
+  Init(context, ksize, stride, padding, data_format);
+}
+#endif  // INTEL_MKL_DNN
 // Common Initialization for TensorFlow and MKL formats
 void MklPoolParameters::Init(OpKernelContext* context,
                              const std::vector<int32>& ksize,
@@ -80,7 +99,7 @@ void MklPoolParameters::Init(OpKernelContext* context,
                   "MaxPooling supports exactly one of pooling across depth "
                   "or pooling across width/height."));
 
-  if (depth_window == 1) {
+  if (depth_window == 1) {  // we are pooling in the H and W
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_rows, window_rows, row_stride,
                                 padding, &out_height, &pad_top, &pad_bottom));
@@ -88,7 +107,21 @@ void MklPoolParameters::Init(OpKernelContext* context,
     OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                 tensor_in_cols, window_cols, col_stride,
                                 padding, &out_width, &pad_left, &pad_right));
-  } else {
+#ifdef INTEL_MKL_DNN
+    // TF can work with int64, but mkldnn only supports int32
+    // Fail if the height or width are greater than MAX_INT
+
+    OP_REQUIRES(context, FastBoundsCheck(out_height,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("output height is too large"));
+
+    OP_REQUIRES(context, FastBoundsCheck(out_width,
+                                         std::numeric_limits<int>::max()),
+                errors::InvalidArgument("output width is too large"));
+
+#endif
+    out_depth = depth;  // output will have the same depth as the input
+  } else {  // we are pooling in the depth dimension
     // Our current version of depthwise max pooling does not support
     // any padding, and expects the depth_window to equal the depth
     // stride (no overlapping).
@@ -109,7 +142,6 @@ void MklPoolParameters::Init(OpKernelContext* context,
                 errors::Unimplemented("Depthwise max pooling is currently "
                                       "only implemented for CPU devices."));
 
-    pad_depth = 0;
     out_depth = depth / depth_window;
   }
 }
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 92ea2beb25aa1fd4cab7fd787b04c4d086ca1b05..d33e91a15dcba948ad5279ea848b5d1a7cd9b119 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -18,9 +18,18 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 #include <vector>
+#include <string>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::memory;
+using mkldnn::pooling_forward;
+using mkldnn::pooling_backward;
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -51,14 +60,28 @@ struct MklPoolParameters {
   int pad_depth;
 
   TensorFormat data_format;
+  MklPoolParameters()
+    : depth(0)
+    , tensor_in_cols(0), tensor_in_rows(0), tensor_in_batch(0)
+    , window_rows(0), window_cols(0), depth_window(0)
+    , row_stride(0), col_stride(0), depth_stride(0)
+    , out_height(0), out_width(0), out_depth(0)
+    , pad_left(0), pad_right(0), pad_top(0), pad_bottom(0), pad_depth(0)
+    , data_format(TensorFormat::FORMAT_NCHW) {}
 
   // Updates context->status if there is an invalid input.
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const TensorShape& tensor_in_shape);
+#ifndef INTEL_MKL_DNN
   void Init(OpKernelContext* context, const std::vector<int32>& ksize,
             const std::vector<int32>& stride, Padding padding,
             TensorFormat data_format, const MklShape* mkl_in_shape);
+#else
+  void Init(OpKernelContext* context, const std::vector<int32>& ksize,
+            const std::vector<int32>& stride, Padding padding,
+            TensorFormat data_format, const MklDnnShape* mkl_in_shape);
+#endif
 
  private:
   // Common initialization for TensorFlow and MKL formats
@@ -67,6 +90,325 @@ struct MklPoolParameters {
             TensorFormat data_format);
 };
 
+#ifdef INTEL_MKL_DNN
+
+template <class T>
+class MklPoolingOpBase : public OpKernel {
+ public:
+  explicit MklPoolingOpBase(OpKernelConstruction* context)
+            : OpKernel(context)
+            , workspace_enabled_(false) {
+      string data_format;
+      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+      OP_REQUIRES(context,
+            FormatFromString(data_format, &this->data_format_tf_),
+            errors::InvalidArgument("Invalid data format"));
+      this->data_format_mkldnn_
+                = TFDataFormatToMklDnnDataFormat(this->data_format_tf_);
+      OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
+      OP_REQUIRES(context, this->ksize_.size() == 4,
+                  errors::InvalidArgument("Sliding window ksize field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("strides", &this->stride_));
+      OP_REQUIRES(context, this->stride_.size() == 4,
+                  errors::InvalidArgument("Sliding window strides field must "
+                                          "specify 4 dimensions"));
+      OP_REQUIRES_OK(context, context->GetAttr("padding", &this->padding_));
+      OP_REQUIRES(context, this->ksize_[0] == 1 && this->stride_[0] == 1,
+                  errors::Unimplemented("Pooling is not yet supported on the "
+                                        "batch dimension."));
+
+      // We may not get this attribute for this node if it does not go through
+      // graph rewrite pass. So we do not check for error while retrieving this
+      // attribute value.
+      context->GetAttr("workspace_enabled", &this->workspace_enabled_);
+    }
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  // Calculate output shape of pooling op in MKL-DNN and TensorFlow order.
+  // MKL-DNN uses NCHW for output order. But TensorFlow output will be in
+  // NHWC or NCHW format depending on data format. Function expects
+  // output height and output width to have already been int32
+  // bounds-checked
+  void GetOutputDims(const MklPoolParameters& mkl_pool_params,
+                    memory::dims* output_dims_mkl_order) {
+    // MKL-DNN always needs output in NCHW format.
+    *output_dims_mkl_order = { mkl_pool_params.tensor_in_batch,
+                              mkl_pool_params.out_depth,
+                              static_cast<int>(mkl_pool_params.out_height),
+                              static_cast<int>(mkl_pool_params.out_width)};
+  }
+
+  void InitMklPoolParameters(OpKernelContext* context,
+                      MklPoolParameters* pool_params,
+                      const MklDnnShape& original_input_mkl_shape,
+                      const TensorShape& input_tensor_shape) {
+    if (!original_input_mkl_shape.IsMklTensor()) {
+      pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+          this->data_format_tf_, input_tensor_shape);
+    } else {
+      pool_params->Init(context, this->ksize_, this->stride_, this->padding_,
+          this->data_format_tf_, &original_input_mkl_shape);
+    }
+  }
+
+  // Checks to make sure that the memory we need to allocate
+  // is a multiple of sizeof(T)
+  // returns the number of elements
+  size_t GetNumTElements(const memory::primitive_desc& pd) {
+    size_t num_bytes = pd.get_size();
+    size_t ret_val = num_bytes / sizeof(T);
+    if ( num_bytes % sizeof(T) != 0 ) {
+        ret_val++;
+    }
+    return ret_val;
+  }
+
+
+  std::vector<int32> ksize_;
+  std::vector<int32> stride_;
+  Padding padding_;
+  TensorFormat data_format_tf_;
+  memory::format data_format_mkldnn_;
+  bool workspace_enabled_;
+};
+
+template <class T>
+class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
+ public:
+  explicit MklPoolingForwardOpBase<T>(OpKernelConstruction* context)
+      : MklPoolingOpBase<T>(context) {}
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  void ConfigureInput(OpKernelContext* context,
+                    const MklDnnShape& input_mkl_shape,
+                    const Tensor& input_tensor,
+                    MklPoolParameters* pool_params,
+                    MklDnnData<T>* dnn_data_input) {
+    CHECK_NOTNULL(pool_params);
+    CHECK_NOTNULL(dnn_data_input);
+    TensorShape input_tensor_shape = input_tensor.shape();
+    memory::desc input_md = input_mkl_shape.IsMklTensor()
+                        ? input_mkl_shape.GetMklLayout()
+                        : memory::desc(
+                              TFShapeToMklDnnDimsInNCHW(
+                                  input_tensor_shape, this->data_format_tf_),
+                              MklDnnType<T>(),
+                              this->data_format_mkldnn_);
+    dnn_data_input->SetUsrMem(input_md, &input_tensor);
+    this->InitMklPoolParameters(context, pool_params,
+                      input_mkl_shape, input_tensor_shape);
+  }
+
+  void AllocateOutputTensor(OpKernelContext* context,
+            const pooling_forward::primitive_desc& pool_fwd_prim_desc,
+            const memory::dims output_dims_mkl_order,
+            const memory::format& output_tf_format,
+            Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd = pool_fwd_prim_desc.dst_primitive_desc();
+
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                              output_dims_mkl_order,
+                              output_tf_format);
+    TensorShape output_tf_shape;
+
+    // only allocate enough space for the elements we need.
+    output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
+                            output_tensor,
+                            output_tf_shape, output_mkl_shape);
+    CHECK_NOTNULL(*output_tensor);
+  }
+
+  void PrepareAndExecuteNet(
+                  const pooling_forward::primitive_desc& pool_fwd_desc,
+                  const MklDnnData<T>* src,
+                  MklDnnData<T>* dst,
+                  MklDnnData<T>* wksp = nullptr) {
+    std::vector<primitive> net;
+
+    // Create pooling primitive and add it to net
+    if (wksp != nullptr) {
+        net.push_back(pooling_forward(pool_fwd_desc,
+                        src->GetOpMem(),
+                        dst->GetOpMem(),
+                        wksp->GetOpMem()));
+    } else {
+        net.push_back(pooling_forward(pool_fwd_desc,
+            src->GetOpMem(),
+            dst->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+
+  void SanityCheckInput(OpKernelContext* context,
+                  const Tensor& input_tensor,
+                  const MklDnnShape& input_mkl_shape) {
+    if (!input_mkl_shape.IsMklTensor()) {
+      OP_REQUIRES(context, input_tensor.dims() == 4,
+          errors::InvalidArgument("Input must be 4-dimensional"));
+    } else {
+        OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4,
+                errors::InvalidArgument("Input shape must be "
+                "4-dimensional"));
+    }
+  }
+  // .Input("value: T")
+  // .Output("output: T")
+  const int kInputTensorIndexInput = 0;
+  const int kOutputTensorIndexOutput = 0;
+};  // MklPoolingForwardBaseOp
+
+
+template <class T>
+class MklPoolingBackwardOpBase : public MklPoolingOpBase<T> {
+ public:
+  explicit MklPoolingBackwardOpBase<T>(OpKernelConstruction* context)
+          : MklPoolingOpBase<T>(context) { }
+  void Compute(OpKernelContext* context) override = 0;
+
+ protected:
+  const int kOutputTensorIndexOutput = 0;
+
+  void AllocateOutputTensor(OpKernelContext* context,
+            const pooling_backward::primitive_desc& pool_bkwd_prim_desc,
+            const memory::dims output_dims_mkl_order,
+            const memory::format& output_tf_format,
+            Tensor** output_tensor) {
+    CHECK_NOTNULL(output_tensor);
+    memory::primitive_desc dst_pd
+                = pool_bkwd_prim_desc.diff_src_primitive_desc();
+    MklDnnShape output_mkl_shape;
+    output_mkl_shape.SetMklTensor(true);
+    output_mkl_shape.SetMklLayout(&dst_pd);
+    output_mkl_shape.SetElemType(MklDnnType<T>());
+    output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(),
+                              output_dims_mkl_order,
+                              output_tf_format);
+
+    TensorShape output_tf_shape;
+    output_tf_shape.AddDim(this->GetNumTElements(dst_pd));
+    AllocateOutputSetMklShape(context, kOutputTensorIndexOutput,
+                            output_tensor,
+                            output_tf_shape, output_mkl_shape);
+    CHECK_NOTNULL(*output_tensor);
+  }
+
+  void PrepareAndExecuteNet(
+    const pooling_backward::primitive_desc& pool_bkwd_desc,
+    MklDnnData<T>* input_gradient_diff_dst,
+    MklDnnData<T>* output_diff_src,
+    const memory::primitive_desc& target_diff_dst_pd,
+    const MklDnnData<T>* workspace = nullptr) {
+
+    std::vector<primitive> net;
+
+    // If the input gradient isn't in the same format as the output
+    // reorder it to the same format as the output
+    input_gradient_diff_dst->CheckReorderToOpMem(
+            target_diff_dst_pd,
+            &net);
+
+    // Create pooling primitive and add it to net
+    if (nullptr == workspace) {
+      net.push_back(pooling_backward(pool_bkwd_desc,
+                              input_gradient_diff_dst->GetOpMem(),
+                              output_diff_src->GetOpMem()));
+    } else {
+      net.push_back(pooling_backward(pool_bkwd_desc,
+                                  input_gradient_diff_dst->GetOpMem(),
+                                  workspace->GetOpMem(),
+                                  output_diff_src->GetOpMem()));
+    }
+    stream(stream::kind::eager).submit(net).wait();
+  }
+
+  // Max Pooling and Avg Pooling have slightly different implementations
+  // Takes the Tensor containing original input data and the original
+  // mkl Dnn Shape and populates other data
+  memory::desc ConfigureOriginalInput(OpKernelContext* context,
+                              const Tensor& tensor_original_input_shape,
+                              const MklDnnShape& original_input_mkl_shape,
+                              memory::dims* original_input_dims_nchw,
+                              MklPoolParameters* pool_params,
+                              const TensorShape& input_tensor_shape) {
+    CHECK_NOTNULL(original_input_dims_nchw);
+    CHECK_NOTNULL(pool_params);
+    this->InitMklPoolParameters(context, pool_params,
+                          original_input_mkl_shape,
+                          input_tensor_shape);
+
+    *original_input_dims_nchw
+          = original_input_mkl_shape.IsMklTensor()
+          ? original_input_mkl_shape.GetSizesAsMklDnnDims()
+          : TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
+        this->data_format_tf_);
+
+    return  original_input_mkl_shape.IsMklTensor()
+      ? original_input_mkl_shape.GetMklLayout()
+      : memory::desc(*original_input_dims_nchw,
+                      MklDnnType<T>(),
+                      this->data_format_mkldnn_);
+  }
+
+  memory::desc ConfigureOriginalOutput(const MklPoolParameters& pool_params,
+                                const MklDnnShape& original_output_mkl_shape,
+                                      memory::dims output_dims_mkl_order) {
+    this->GetOutputDims(pool_params, &output_dims_mkl_order);
+
+    return original_output_mkl_shape.IsMklTensor()
+            ? original_output_mkl_shape.GetMklLayout()
+            : memory::desc(output_dims_mkl_order,
+                         MklDnnType<T>(),
+                         this->data_format_mkldnn_);
+  }
+
+  memory::desc ConfigureInputGradient(
+        const MklDnnShape& input_gradient_mkl_shape,
+        const Tensor& input_gradient_tensor,
+        MklDnnData<T>* input_gradient_dnn_data,
+        const memory::desc& original_output_md) {
+    // Configure the gradient as is
+    memory::desc original_input_grad_md
+          = input_gradient_mkl_shape.IsMklTensor()
+          ? input_gradient_mkl_shape.GetMklLayout()
+          : memory::desc(TFShapeToMklDnnDimsInNCHW(
+                    input_gradient_tensor.shape(),
+                    this->data_format_tf_),
+                    MklDnnType<T>(), this->data_format_mkldnn_);
+
+    input_gradient_dnn_data->SetUsrMem(original_input_grad_md,
+                &input_gradient_tensor);
+
+    // Check to see if input grad diff dst is in the right format
+    // Create a new memory descriptor with the same shape as the
+    // original, but the format of the other tensors.
+    memory::format original_output_format =
+            static_cast<memory::format>(original_output_md.data.format);
+    bool grad_reorder_needed = input_gradient_dnn_data->IsReorderNeeded(
+                                    original_output_format);
+    memory::dims diff_dst_dims = input_gradient_mkl_shape.IsMklTensor()
+        ? input_gradient_mkl_shape.GetSizesAsMklDnnDims()
+        : TFShapeToMklDnnDimsInNCHW(input_gradient_tensor.shape(),
+                    this->data_format_tf_);
+    memory::desc target_diff_dst_md = memory::desc(diff_dst_dims,
+        MklDnnType<T>(), original_output_format);
+
+    return grad_reorder_needed
+            ? target_diff_dst_md
+            : original_input_grad_md;
+  }
+};
+#endif  // INTEL_MKL_DNN
+
 //-------------------------------------------------------------------
 // Utility functions
 
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 86a77d769a52d7592d15627b504ae60278b45058..45bdd0ad5cbab6c806f6c008f0d2642c4845cbc2 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -28,6 +28,19 @@ limitations under the License.
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+
+using mkldnn::stream;
+using mkldnn::prop_kind;
+using mkldnn::algorithm;
+using mkldnn::relu_forward;
+using mkldnn::relu_backward;
+using mkldnn::eltwise_relu;
+using mkldnn::eltwise_elu;
+using mkldnn::eltwise_tanh;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -45,6 +58,8 @@ struct MklReluHelpers {
   }
 };
 
+#ifndef INTEL_MKL_DNN
+
 template <typename Device, typename T>
 class MklReluOp : public OpKernel {
  public:
@@ -59,6 +74,7 @@ class MklReluOp : public OpKernel {
     GetMklShape(context, 0, &mkl_context.input_shape);
     void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
     bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
+
     if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
       const TensorShape& o_shape = input.shape();
       Tensor* out_tensor = nullptr;
@@ -164,6 +180,7 @@ class MklReluOp : public OpKernel {
   } MklReluOpContext;
 };
 
+
 template <typename Device, typename T>
 class MklReluGradOp : public OpKernel {
  public:
@@ -189,18 +206,18 @@ class MklReluGradOp : public OpKernel {
       const Tensor& a = MklGetInput(context, 1);
       void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
       void* mkl_buffer_convert = nullptr;
+
       dnnPrimitive_t cv_input_to_grad = nullptr;
 
-      // if input and grad are not in the same layout, do a conversion between
-      // them.
+      // if input and grad are not in the same layout,
+      // do a conversion between them.
       if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
         AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
                        &mkl_buffer_convert);
         CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input,
                    lt_grad), E_SUCCESS);
         CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert),
-                 E_SUCCESS);
+                                          mkl_buffer_convert), E_SUCCESS);
         relu_res[dnnResourceSrc] = mkl_buffer_convert;
         dnnDelete_F32(cv_input_to_grad);
       } else {
@@ -246,7 +263,6 @@ class MklReluGradOp : public OpKernel {
 };
 
 template <typename Device, typename T>
-
 void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   MklReluGradOpContext mkl_context;
   const Tensor& g = MklGetInput(context, 0);
@@ -264,20 +280,21 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
       !MklReluHelpers::ValidateSameSize(context, g, a))
     return;
   Tensor* output = nullptr;
-  if (!input_is_mkl && !grad_is_mkl &&
-      !a.dims()) {  // handle the case of a scalar
-    // Allocate space for g and
+
+  if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
+    // handle the scalar case
     const TensorShape& g_shape = g.shape();
     mkl_context.output_shape.SetMklTensor(false);
     AllocateOutputSetMklShape(context, 0, &output, g_shape,
                               mkl_context.output_shape);
+
     void* out_o = static_cast<void*>(output->flat<T>().data());
     (static_cast<T*>(out_o))[0] =
         (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
     return;
   }
 
-  // Generate size, stride for input if input/grad is in MKL format.
+  // generate size, stride for input if input/grad is in mkl format.
   if (grad_is_mkl || input_is_mkl) {
     const MklShape* tmp_mkl_shape =
         (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
@@ -308,21 +325,20 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   float negative_slope = 0.0;
   CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
                                      mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope),
-           E_SUCCESS);
+                                     negative_slope), E_SUCCESS);
   Tensor mkl_tmp_input_buf_tensor;
   mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
 
   if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are MKL leave it in MKL*/
+      grad_is_mkl) { /*if  grad or input are mkl leave it in mkl*/
     TensorShape tf_shape;
     mkl_context.output_shape.SetMklTensor(true);
     mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
                                           dnnResourceDiffSrc);
     mkl_context.output_shape.SetTfLayout(
         mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // If input_is_mkl or grad_is_mkl, then we copy strides and sizes from Mkl
-    // shape of one that is in MKL layout.
+    // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
+    // shape of one that is in mkl layout.
     if (grad_is_mkl == true) {
       mkl_context.output_shape.SetTfDimOrder(
           mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
@@ -332,11 +348,9 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
     }
 
     tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
+                    mkl_context.output_shape.GetMklLayout())) / sizeof(T));
     AllocateOutputSetMklShape(context, 0, &output, tf_shape,
                               mkl_context.output_shape);
-
   } else {
     const TensorShape& o_shape = g.shape();
     mkl_context.output_shape.SetMklTensor(false);
@@ -347,13 +361,430 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
   mkl_context.relu_res[dnnResourceDiffSrc] =
       static_cast<void*>(output->flat<T>().data());
 
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
-           E_SUCCESS);
+  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd,
+                          mkl_context.relu_res),
+                          E_SUCCESS);
   mkl_context.MklCleanup();
 }
 
-/* Register DNN kernels for supported operations and supported types - right now
- * it is only Relu and f32*/
+
+#else  // INTEL_MKL_DNN
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluOpBase : public OpKernel {
+ public:
+  ~MklReluOpBase() {}
+
+  explicit MklReluOpBase(OpKernelConstruction* context) : OpKernel(context) {
+  }
+
+  virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+  void Compute(OpKernelContext* context) override {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      const size_t src_index = 0;  // index of src input tensor
+      const size_t dst_index = 0;  // index of dst output tensor
+      const Tensor& src_tensor = MklGetInput(context, src_index);
+      MklDnnShape dnn_shape_src;
+      GetMklShape(context, src_index, &dnn_shape_src);
+
+      Tensor* dst_tensor = nullptr;
+      if (src_tensor.dims() == 0) {
+        Compute_Scalar(context);
+        return;
+      }
+
+      // Create relu primitive.
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> dst(&cpu_engine);
+
+      // Set DNN primitive - src
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor()) {
+        src_md = dnn_shape_src.GetMklLayout();
+      } else {
+        auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+        auto src_strides = CalculateTFStrides(src_dims);
+        // Create blocked memory descriptor
+        src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+
+      T alpha = 0, beta = 0;
+      std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+          // Operator memory descriptor is same as user memory descriptor.
+                                              alg_kind, src.GetUsrMemDesc(),
+                                              alpha, beta);
+      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
+                                                         cpu_engine));
+
+      // allocate dst tensor
+      MklDnnShape dnn_shape_dst;
+      TensorShape tf_shape_dst;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_dst.SetMklTensor(true);
+        auto dst_pd = relu_fwd_pd->dst_primitive_desc();
+        dnn_shape_dst.SetMklLayout(&dst_pd);
+        dnn_shape_dst.SetElemType(MklDnnType<T>());
+        dnn_shape_dst.SetTfLayout(dnn_shape_src.GetDimension(),
+                                  dnn_shape_src.GetSizesAsMklDnnDims(),
+                                  dnn_shape_src.GetTfDataFormat());
+        tf_shape_dst.AddDim(dst_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_dst.SetMklTensor(false);
+        tf_shape_dst = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, dst_index, &dst_tensor, tf_shape_dst,
+                                dnn_shape_dst);
+
+      // Destination memory descriptor is same as source memory descriptor.
+      auto dst_md = src_md;
+      dst.SetUsrMem(dst_md, dst_tensor);
+
+      // execute net
+      std::vector<primitive> net;
+      auto relu_fwd = relu_forward(*relu_fwd_pd, src.GetOpMem(),
+                                   dst.GetOpMem());
+      net.push_back(relu_fwd);
+      stream(stream::kind::eager).submit(net).wait();
+    } catch (mkldnn::error &e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + string(e.message) +
+                         ", in file " + string(__FILE__) + ":" +
+                         std::to_string(__LINE__);
+      OP_REQUIRES_OK(context,
+                     errors::Aborted("Operation received an exception:",
+                        error_msg));
+    }
+  }
+};
+
+
+template <typename Device, typename T, algorithm alg_kind>
+class MklReluGradOpBase : public OpKernel {
+ public:
+  ~MklReluGradOpBase() {}
+
+  explicit MklReluGradOpBase(OpKernelConstruction* context) :
+    OpKernel(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) = 0;
+
+  void Compute(OpKernelContext* context)  {
+    try {
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> src(&cpu_engine);
+      MklDnnData<T> diff_dst(&cpu_engine);
+      MklDnnData<T> diff_src(&cpu_engine);
+
+      const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+      const size_t src_index = 1;       // index of src input tensor
+      const size_t diff_src_index = 0;  // index of diff_src output tensor
+
+      const Tensor& src_tensor      = MklGetInput(context, src_index);
+      const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+      Tensor* diff_src_tensor       = nullptr;
+
+      MklDnnShape dnn_shape_src, dnn_shape_diff_dst;
+      GetMklShape(context, src_index, &dnn_shape_src);
+      GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+      int src_dims_size = src_tensor.dims();
+      if (src_dims_size == 0) {
+        Compute_Scalar(context);
+        return;
+      }
+
+      // Set DNN primitives for src & diff_dst
+      memory::desc src_md({}, memory::data_undef, memory::format_undef);
+      memory::desc diff_dst_md({}, memory::data_undef, memory::format_undef);
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
+        if (dnn_shape_diff_dst.IsMklTensor()) {
+          diff_dst_md = dnn_shape_diff_dst.GetMklLayout();
+          src_md = diff_dst_md;
+        } else {
+          src_md = dnn_shape_src.GetMklLayout();
+          diff_dst_md = src_md;
+        }
+      } else {
+        auto src_dims = TFShapeToMklDnnDims(src_tensor.shape());
+        auto src_strides = CalculateTFStrides(src_dims);
+        src_md = MklDnnData<T>::CreateBlockedMemDesc(src_dims, src_strides);
+        diff_dst_md = src_md;
+      }
+      src.SetUsrMem(src_md, &src_tensor);
+      diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
+
+      T alpha = 0, beta = 0;
+      std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+      auto relu_fwd_desc = relu_forward::desc(prop_kind::forward_training,
+                                              alg_kind, src_md, alpha, beta);
+      relu_fwd_pd.reset(new relu_forward::primitive_desc(relu_fwd_desc,
+                                                         cpu_engine));
+      auto relu_bwd_desc = relu_backward::desc(alg_kind, diff_dst_md, src_md,
+                                                alpha, beta);
+      auto relu_bwd_pd  = relu_backward::primitive_desc(relu_bwd_desc,
+                                                cpu_engine, *relu_fwd_pd);
+
+      // allocate diff_src tensor
+      MklDnnShape dnn_shape_diff_src;
+      TensorShape tf_shape_diff_src;
+      if (dnn_shape_src.IsMklTensor()) {
+        dnn_shape_diff_src.SetMklTensor(true);
+        auto diff_src_pd = relu_bwd_pd.diff_src_primitive_desc();
+        dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
+        dnn_shape_diff_src.SetElemType(MklDnnType<T>());
+        dnn_shape_diff_src.SetTfLayout(dnn_shape_src.GetDimension(),
+                                       dnn_shape_src.GetSizesAsMklDnnDims(),
+                                       dnn_shape_src.GetTfDataFormat());
+        tf_shape_diff_src.AddDim(diff_src_pd.get_size()/sizeof(T));
+      } else {
+        dnn_shape_diff_src.SetMklTensor(false);
+        tf_shape_diff_src = src_tensor.shape();
+      }
+      AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                                 tf_shape_diff_src, dnn_shape_diff_src);
+
+      // diff_src memory descriptor is same as diff_dst memory descriptor.
+      auto diff_src_md = diff_dst_md;
+      diff_src.SetUsrMem(diff_src_md, diff_src_tensor);
+
+      PrepareAndExecuteNet(relu_bwd_pd, &src, &diff_src, &diff_dst);
+     } catch (mkldnn::error &e) {
+       string error_msg = "Status: " + std::to_string(e.status) +
+                          ", message: " + string(e.message) +
+                          ", in file " + string(__FILE__) + ":" +
+                          std::to_string(__LINE__);
+       OP_REQUIRES_OK(context,
+                      errors::Aborted("Operation received an exception:",
+                                      error_msg));
+    }
+  }
+
+  void PrepareAndExecuteNet(const relu_backward::primitive_desc& relu_prim_desc,
+                  MklDnnData<T>* src, MklDnnData<T>* diff_src, MklDnnData<T>*
+                  diff_dst) {
+    std::vector<primitive> net;
+    net.push_back(relu_backward(relu_prim_desc, src->GetOpMem(),
+                                diff_dst->GetOpMem(), diff_src->GetOpMem()));
+    stream(stream::kind::eager).submit(net).wait();
+  }
+};
+
+
+template <typename Device, typename T>
+class MklReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklReluOp() {}
+
+  explicit MklReluOp(OpKernelConstruction* context) :
+  MklReluOpBase<Device, T, eltwise_relu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i = static_cast<void*>(const_cast<T*>(
+                         src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    (static_cast<T*>(out_o))[0] =
+              std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklReluGradOp() {}
+
+  explicit MklReluGradOp(OpKernelConstruction* context) :
+  MklReluGradOpBase<Device, T, eltwise_relu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    int src_dims_size = src_tensor.dims();
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    void* user_g =
+          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
+                                  ((static_cast<T*>(user_i))[0] > 0);
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklEluOp : public MklReluOpBase<Device, T, eltwise_elu> {
+ public:
+  ~MklEluOp() {}
+
+  explicit MklEluOp(OpKernelConstruction* context) :
+  MklReluOpBase<Device, T, eltwise_elu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i = static_cast<void*>(const_cast<T*>(
+                         src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    // return exp(feature) - 1 if feature > 0; feature otherwise
+    T feature = (static_cast<T*>(user_i))[0];
+    if (feature < 0)
+      (static_cast<T*>(out_o))[0] = std::exp(feature);
+    else
+      (static_cast<T*>(out_o))[0] = feature;
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklEluGradOp : public MklReluGradOpBase<Device, T, eltwise_elu> {
+ public:
+  ~MklEluGradOp() {}
+
+  explicit MklEluGradOp(OpKernelConstruction* context) :
+  MklReluGradOpBase<Device, T, eltwise_elu>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    int src_dims_size = src_tensor.dims();
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    void* user_g =
+          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    // gradient of elu(x) = 1 if x > 0; elu(x) + 1 otherwise
+    T feature = (static_cast<T*>(user_i))[0];
+    if (feature > 0) {
+      (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0];
+    } else {
+      T elu = std::exp(feature) - 1;
+      (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] * (elu + 1);
+    }
+  }
+};
+
+template <typename Device, typename T>
+class MklTanhOp : public MklReluOpBase<Device, T, eltwise_tanh> {
+ public:
+  ~MklTanhOp() {}
+
+  explicit MklTanhOp(OpKernelConstruction* context) :
+  MklReluOpBase<Device, T, eltwise_tanh>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    void* user_i = static_cast<void*>(const_cast<T*>(
+                         src_tensor.flat<T>().data()));
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    void* out_o = static_cast<void*>(dst_tensor->flat<T>().data());
+    // tanh(x) = (e^x - e^(-x))/ (e^x + e^(-x))
+    T feature = (static_cast<T*>(user_i))[0];
+    T e1 = std::exp(feature);
+    T e2 = std::exp(-feature);
+    (static_cast<T*>(out_o))[0] = (e1 - e2)/(e1 + e2);
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklTanhGradOp : public MklReluGradOpBase<Device, T, eltwise_tanh> {
+ public:
+  ~MklTanhGradOp() {}
+
+  explicit MklTanhGradOp(OpKernelConstruction* context) :
+  MklReluGradOpBase<Device, T, eltwise_tanh>(context) {}
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor    = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    int src_dims_size = src_tensor.dims();
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    void* out_o = static_cast<void*>(diff_src_tensor->flat<T>().data());
+    void* user_i =
+          static_cast<void*>(const_cast<T*>(src_tensor.flat<T>().data()));
+    // gradient of tanh(x) = 1 - tanh(x)^2
+    T feature = (static_cast<T*>(user_i))[0];
+    T e1 = std::exp(feature);
+    T e2 = std::exp(-feature);
+    T tanh = (e1 - e2)/(e1 + e2);
+    void* user_g =
+          static_cast<void*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
+    (static_cast<T*>(out_o))[0] = (static_cast<T*>(user_g))[0] *
+                                  (1 - tanh * tanh);
+  }
+};
+
+#endif
+
+// register dnn kernels for supported operations and supported types
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
   REGISTER_KERNEL_BUILDER(Name("_MklRelu")                          \
                               .Device(DEVICE_CPU)                   \
@@ -367,6 +798,38 @@ void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
+#ifdef INTEL_MKL_DNN
+
+// register dnn kernels for supported operations and supported types
+#define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklElu")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklEluOp<CPUDevice, type>);              \
+  REGISTER_KERNEL_BUILDER(Name("_MklEluGrad")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklEluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES);
+
+#define REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES(type)             \
+  REGISTER_KERNEL_BUILDER(Name("_MklTanh")                          \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklTanhOp<CPUDevice, type>);              \
+  REGISTER_KERNEL_BUILDER(Name("_MklTanhGrad")                      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklTanhGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
+
+#endif
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
+
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 5e985824750befb702f8fa7a59d699f853f40267..11c92ebdb41c559f10fb851c9684c0dc3d93d21e 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -28,6 +28,11 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+#include "mkldnn.hpp"
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 template <typename Device, typename T>
@@ -35,6 +40,7 @@ class MklReshapeOp : public OpKernel {
  public:
   explicit MklReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
+#ifndef INTEL_MKL_DNN
   void Compute(OpKernelContext* context) override {
     const Tensor& input = MklGetInput(context, 0);
     const Tensor& sizes = MklGetInput(context, 1);
@@ -129,7 +135,183 @@ class MklReshapeOp : public OpKernel {
     }
   }
 
+#else
+
  private:
+  // When the input tensor is in MKL layout and we are reshaping the tensor to a
+  // different shape than its actual shape, then we use MKLDNN reorder primitive
+  // to put tensor back in Tensorflow layout. But we can skip this reordering
+  // some times. This function checks for all such cases.
+  bool SkipReorder(const MklDnnShape& mkl_shape_input,
+                   const TensorShape& reshape_to) {
+    CHECK_EQ(mkl_shape_input.IsMklTensor(), true);
+    bool ret = false;
+
+    // If Tensorflow's data format and the underlying format maintained by
+    // MKLDNN are equivalent (both are NHWC or both are NCHW), then we can
+    // safely return true.
+    auto input_mkl_md = mkl_shape_input.GetMklLayout();
+    if (mkl_shape_input.GetTfDataFormat() == input_mkl_md.data.format) {
+      ret = true;
+    }
+
+    return ret;
+  }
+
+ public:
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input_tensor = MklGetInput(context, 0);
+    const Tensor& sizes = MklGetInput(context, 1);
+
+    MklDnnShape mkl_shape_input;
+    GetMklShape(context, kInputSlotIdx, &mkl_shape_input);
+    bool input_in_mkl_format = mkl_shape_input.IsMklTensor();
+    const int64 nelems = input_in_mkl_format ?
+                         mkl_shape_input.GetTfShape().num_elements()
+                         : input_tensor.NumElements();
+
+    // Preliminary validation of sizes.
+    OP_REQUIRES(context, IsLegacyVector(sizes.shape()),
+                errors::InvalidArgument("sizes input must be 1-D, not shape ",
+                                        sizes.shape().DebugString()));
+
+    // Compute the output shape.  Determine product of specified
+    // dimensions, and find the index of the unspecified one.
+    TensorShape shape;
+    int64 product = 1;
+    int unknown_index = -1;
+    switch (sizes.dtype()) {
+      case DT_INT32:
+        OP_REQUIRES_OK(context, ValidateSizes<int32>(sizes, &product,
+                                                     &unknown_index, &shape));
+        break;
+      case DT_INT64:
+        OP_REQUIRES_OK(context, ValidateSizes<int64>(sizes, &product,
+                                                     &unknown_index, &shape));
+        break;
+      default:
+        context->CtxFailure(errors::InvalidArgument(
+            "desired shape must be a DT_INT32 or DT_INT64 vector, not a ",
+            DataTypeString(sizes.dtype())));
+        return;
+    }
+    if (unknown_index != -1) {
+      OP_REQUIRES(
+          context, product > 0,
+          errors::InvalidArgument("Reshape cannot infer the missing input size "
+                                  "for an empty tensor unless all specified "
+                                  "input sizes are non-zero"));
+      const int64 missing = nelems / product;
+      OP_REQUIRES(
+          context, product * missing == nelems,
+          errors::InvalidArgument(
+              "Input to reshape is a tensor with ", nelems,
+              " values, but the requested shape requires a multiple of ",
+              product));
+      shape.set_dim(unknown_index, missing);
+    }
+    OP_REQUIRES(context, shape.num_elements() == nelems,
+                errors::InvalidArgument("Input to reshape is a tensor with ",
+                                        nelems,
+                                        " values, but the requested shape has ",
+                                        shape.num_elements()));
+
+    if (input_in_mkl_format) {
+      TensorShape& shape_to = shape;
+      TensorShape shape_from = mkl_shape_input.GetTfShape();
+      if (shape_from == shape_to) {
+        CopyMklTensorInToOut(context, kInputSlotIdx, kOutputSlotIdx);
+        return;
+      } else {
+        try {
+          auto cpu_engine = engine(engine::cpu, 0);
+          MklDnnData<T> dnn_data_input(&cpu_engine);
+          // Reshape is just a logical view change operation for a tensor.
+          // It does not change underlying layout. But MKLDNN may maintain
+          // tensor data in different layout than that specified by Tensorflow.
+          // If MKLDNN maintains input tensor in different layout than that
+          // specified by Tensorflow, we will need to reorder tensor and then
+          // put it in the shape expected by Tensorflow. But if MKLDNN has
+          // maintained input tensor in the same layout as it is expected by
+          // Tensorflow, we don't need to reorder tensor contents, we just
+          // need to update MklDnnShape object associated with the input
+          // tensor to reflect the shape change expected by reshape.
+          if (!SkipReorder(mkl_shape_input, shape_to)) {
+              // If dimensions that are being expanded or collapsed are not
+              // maintained contiguously by MKLDNN, then we use reorder.
+
+              // Get Mkl layout of input tensor.
+              auto input_mkl_md = mkl_shape_input.GetMklLayout();
+              // Set input Mkl layout as the user layout.
+              dnn_data_input.SetUsrMem(input_mkl_md, &input_tensor);
+              // Get expected Tensorflow layout of input tensor.
+              auto output_tf_md = mkl_shape_input.GetTfLayout();
+              auto output_tf_pd = memory::primitive_desc(output_tf_md,
+                                                         cpu_engine);
+
+              Tensor* output_tensor = nullptr;
+              MklShape mkl_shape_output;
+              mkl_shape_output.SetMklTensor(false);
+              // We allocate output tensor in the shape expected by Reshape.
+              AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
+                                        shape_to, mkl_shape_output);
+
+              // Insert reorder between Mkl layout and TensorFlow layout.
+              std::vector<primitive> net;
+              CHECK_EQ(dnn_data_input.CheckReorderToOpMem(output_tf_pd,
+                       output_tensor, &net), true);
+              stream(stream::kind::eager).submit(net).wait();
+              return;
+          } else {
+            // If dimensions that are being expanded or collapsed are
+            // maintained contiguously by MKLDNN, then we skip reorder, just
+            // update MklDnnShape object for the tensorflow tensor, and forward
+            // Tensorflow tensor as it is to the output.
+            auto output_dims = TFShapeToMklDnnDims(shape_to);
+            auto output_strides = CalculateTFStrides(output_dims);
+            auto output_tf_md = MklDnnData<T>::CreateBlockedMemDesc(output_dims,
+                                                               output_strides);
+            auto output_tf_pd = memory::primitive_desc(output_tf_md,
+                                                       cpu_engine);
+
+            // Set MklDnnShape
+            MklDnnShape mkl_shape_output;
+            mkl_shape_output.SetMklTensor(true);
+            mkl_shape_output.SetMklLayout(&output_tf_pd);
+            mkl_shape_output.SetElemType(MklDnnType<T>());
+            mkl_shape_output.SetTfLayout(output_dims.size(), output_dims,
+                                         memory::format::blocked);
+
+            // We now simply forward input Mkl tensor to output and change its
+            // output MklDnnShape object.
+            ForwardMklTensorInToOutWithMklShape(context, kInputSlotIdx,
+                                              kOutputSlotIdx, mkl_shape_output);
+            return;
+          }
+        } catch (mkldnn::error &e) {
+          string error_msg = "Status: " + std::to_string(e.status) +
+                       ", message: " + string(e.message) +
+                       ", in file " + string(__FILE__) + ":" +
+                       std::to_string(__LINE__);
+          OP_REQUIRES_OK(context,
+                   errors::Aborted("Operation received an exception:",
+                      error_msg));
+        }
+      }
+    } else {
+      // If input tensor is not in Mkl format, then just copy Tensorflow tensor
+      // to output with specified shape.
+      CopyTfTensorInToOutWithShape(context, kInputSlotIdx, kOutputSlotIdx,
+                                   shape);
+    }
+  }
+
+#endif  // INTEL_MKL_DNN
+
+ private:
+  const int kInputSlotIdx = 0;
+  const int kOutputSlotIdx = 0;
+
   template <typename Tshape>
   Status ValidateSizes(const Tensor& sizes, int64* product, int* unknown_index,
                        TensorShape* shape) {
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index a240ee44fb014555b467ff2a920604dcc425972d..c4d5a45d3caff0f59b1ecc61f95dd26fe16fd06b 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifdef INTEL_MKL
-
 #ifndef TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
 #define TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
 
+#ifdef INTEL_MKL
+
 #include <algorithm>
 #include <vector>
 #include "tensorflow/core/framework/numeric_op.h"
@@ -35,6 +35,10 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
+#ifdef INTEL_MKL_DNN
+using mkldnn::stream;
+#endif
+
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
@@ -57,6 +61,71 @@ class MklToTfOp : public OpKernel {
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
 
+#ifdef INTEL_MKL_DNN
+  static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
+                             string data_format_str, DataType op_data_type,
+                             bool has_avx512f, uint input_number) {
+    try {
+      // Check that input tensor is in MKL format.
+      const Tensor& input_tensor = MklGetInput(context, input_number);
+      MklDnnShape input_shape;
+      GetMklShape(context, input_number, &input_shape);
+
+      // if input is already in Tf format, then copy input tensor to output.
+      if (!input_shape.IsMklTensor()) {
+        context->set_output(input_number, input_tensor);
+        VLOG(1) << "MKLToTFConversion: No conversion needed, "
+                << "copying input to output";
+        return;
+      }
+
+      // Check that input data type is same as operator data type and that it
+      // is same as output data type.
+      DataType input_data_type = op_kernel->input_type(input_number);
+      DataType output_data_type = op_kernel->output_type(input_number);
+      CHECK_EQ(op_data_type, input_data_type);
+      CHECK_EQ(op_data_type, output_data_type);
+
+      auto cpu_engine = engine(engine::cpu, 0);
+      MklDnnData<T> input(&cpu_engine);
+
+      // Get Mkl layout of input tensor.
+      auto input_mkl_md = input_shape.GetMklLayout();
+      // Get TensorFlow layout of input tensor. Expected output of conversion
+      // has same layout as Tensorflow layout of input tensor.
+      auto output_tf_md = input_shape.GetTfLayout();
+      auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine);
+      // Set input Mkl layout as the user layout.
+      input.SetUsrMem(input_mkl_md, &input_tensor);
+
+      // Allocate output tensor.
+      TensorShape output_shape = input_shape.GetTfShape();
+      Tensor* output_tensor = NULL;
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  input_number, output_shape, &output_tensor));
+      CHECK_NOTNULL(output_tensor);
+
+      // Do we need to reorder Mkl layout into TensorFlow layout?
+      if (input.IsReorderNeeded(output_tf_pd)) {
+        // Insert reorder between Mkl layout and TensorFlow layout.
+        std::vector<primitive> net;
+        CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, output_tensor, &net),
+                 true);
+        stream(stream::kind::eager).submit(net).wait();
+      } else {
+        // If not, just forward input tensor to output tensor.
+        CHECK(output_tensor->CopyFrom(input_tensor, output_shape));
+      }
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+#else
   static void ConvertMklToTf(OpKernel* op_kernel, OpKernelContext* context,
                              string data_format_str, DataType op_data_type,
                              bool has_avx512f, uint input_number) {
@@ -91,8 +160,8 @@ class MklToTfOp : public OpKernel {
 
     // Allocate output tensor.
     Tensor* output_tensor = NULL;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(input_number, output_shape, &output_tensor));
+    OP_REQUIRES_OK(context, context->allocate_output(input_number, output_shape,
+                                                     &output_tensor));
 
     dnnLayout_t output_layout =
         static_cast<dnnLayout_t>(input_shape.GetTfLayout());
@@ -106,6 +175,7 @@ class MklToTfOp : public OpKernel {
                                      output_buffer);
     VLOG(1) << "MKLToTFConversion complete successfully.";
   }
+#endif
 
  private:
   /// Data format of the operation
@@ -132,5 +202,5 @@ class MklToTfOp : public OpKernel {
 TF_CALL_NUMBER_TYPES(REGISTER_CPU);
 #undef REGISTER_CPU
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
 #endif  // INTEL_MKL
+#endif  // TENSORFLOW_CORE_KERNELS_MKL_TFCONV_OP_H_
diff --git a/tensorflow/core/kernels/multinomial_op.cc b/tensorflow/core/kernels/multinomial_op.cc
index 8c0109f5c87ce5f73621a1683471bbcb8a936ea4..d086abb24760f1ab946605fd422a4fd0d5fc866d 100644
--- a/tensorflow/core/kernels/multinomial_op.cc
+++ b/tensorflow/core/kernels/multinomial_op.cc
@@ -40,7 +40,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 struct MultinomialFunctor {
   void operator()(OpKernelContext* ctx, const Device& d,
                   typename TTypes<T>::ConstMatrix logits,
@@ -49,11 +49,11 @@ struct MultinomialFunctor {
                   typename TTypes<float>::Flat scratch, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output);
+                  typename TTypes<OutputType>::Matrix output);
 };
 
-template <typename T>
-struct MultinomialFunctor<CPUDevice, T> {
+template <typename T, typename OutputType>
+struct MultinomialFunctor<CPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const CPUDevice& d,
                   typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<float>::Flat /* noises */,
@@ -61,7 +61,7 @@ struct MultinomialFunctor<CPUDevice, T> {
                   typename TTypes<float>::Flat /* scratch */, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output) {
+                  typename TTypes<OutputType>::Matrix output) {
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
 
     // The implementation only parallelizes by batch.
@@ -128,7 +128,7 @@ struct MultinomialFunctor<CPUDevice, T> {
 }  // namespace functor
 
 // Samples from a multinomial distribution.
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 class MultinomialOp : public OpKernel {
  public:
   explicit MultinomialOp(OpKernelConstruction* context) : OpKernel(context) {
@@ -195,11 +195,11 @@ class MultinomialOp : public OpKernel {
       if (std::is_same<Device, CPUDevice>::value) num_samples_ceil_4 *= 2;
       auto rng =
           generator_.ReserveRandomOutputs(batch_size * num_samples_ceil_4, 256);
-      functor::MultinomialFunctor<Device, T>()(
+      functor::MultinomialFunctor<Device, T, OutputType>()(
           ctx, ctx->eigen_device<Device>(), logits_t.matrix<T>(),
           noises.flat<float>(), scores.flat<float>(), scratch.flat<float>(),
           batch_size, num_classes, num_samples, rng,
-          samples_t->matrix<int64>());
+          samples_t->matrix<OutputType>());
     }
   }
 
@@ -209,10 +209,17 @@ class MultinomialOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(MultinomialOp);
 };
 
-#define REGISTER(TYPE)                                                  \
-  REGISTER_KERNEL_BUILDER(                                              \
-      Name("Multinomial").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
-      MultinomialOp<CPUDevice, TYPE>);
+#define REGISTER(TYPE)                                                   \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT32), \
+                          MultinomialOp<CPUDevice, TYPE, int32>);        \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT64), \
+                          MultinomialOp<CPUDevice, TYPE, int64>);
 
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
@@ -220,12 +227,20 @@ TF_CALL_double(REGISTER);
 #undef REGISTER
 
 #if GOOGLE_CUDA
-#define REGISTER(TYPE)                                    \
-  REGISTER_KERNEL_BUILDER(Name("Multinomial")             \
-                              .Device(DEVICE_GPU)         \
-                              .HostMemory("num_samples")  \
-                              .TypeConstraint<TYPE>("T"), \
-                          MultinomialOp<GPUDevice, TYPE>)
+#define REGISTER(TYPE)                                                   \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_GPU)                        \
+                              .HostMemory("num_samples")                 \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT32), \
+                          MultinomialOp<GPUDevice, TYPE, int32>)         \
+  REGISTER_KERNEL_BUILDER(Name("Multinomial")                            \
+                              .Device(DEVICE_GPU)                        \
+                              .HostMemory("num_samples")                 \
+                              .TypeConstraint<TYPE>("T")                 \
+                              .TypeConstraint("output_dtype", DT_INT64), \
+                          MultinomialOp<GPUDevice, TYPE, int64>)
+
 TF_CALL_half(REGISTER);
 TF_CALL_float(REGISTER);
 TF_CALL_double(REGISTER);
diff --git a/tensorflow/core/kernels/multinomial_op.h b/tensorflow/core/kernels/multinomial_op.h
index af5e81f219c802857fd6d5eb27e4962cc890a058..6e41060aa414b0611dd7dca31374444f8dd364ec 100644
--- a/tensorflow/core/kernels/multinomial_op.h
+++ b/tensorflow/core/kernels/multinomial_op.h
@@ -21,7 +21,7 @@ namespace tensorflow {
 namespace functor {
 
 // Generic helper functor for the Multinomial Op.
-template <typename Device, typename T>
+template <typename Device, typename T, typename OutputType>
 struct MultinomialFunctor;
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
index 19b4f3ca559f56d93fae203df77f0ef35718db1b..5cc5877cceb19320023423d35a352c5ba3db13e2 100644
--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -37,20 +37,22 @@ using GPUDevice = Eigen::GpuDevice;
 
 // Kernel for Multinomial op.  Data is interpreted to have the following shapes:
 //   scores: [B, S, C];  maxima: [B, S];  output: [B, S].
+template <typename OutputType>
 __global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
                                   const int32 num_samples, const float* scores,
-                                  const float* maxima, int64* output) {
+                                  const float* maxima, OutputType* output) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     const int maxima_idx = index / num_classes;
     if (ldg(maxima + maxima_idx) == ldg(scores + index)) {
-      CudaAtomicMax(reinterpret_cast<uint64*>(output + maxima_idx),
-                    static_cast<uint64>(index % num_classes));
+      using UnsignedOutputType = typename std::make_unsigned<OutputType>::type;
+      CudaAtomicMax(reinterpret_cast<UnsignedOutputType*>(output + maxima_idx),
+                    static_cast<UnsignedOutputType>(index % num_classes));
     }
   }
 }
 
-template <typename T>
-struct MultinomialFunctor<GPUDevice, T> {
+template <typename T, typename OutputType>
+struct MultinomialFunctor<GPUDevice, T, OutputType> {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
                   typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<float>::Flat noises,
@@ -58,7 +60,7 @@ struct MultinomialFunctor<GPUDevice, T> {
                   typename TTypes<float>::Flat maxima, int batch_size,
                   int num_classes, int num_samples,
                   const random::PhiloxRandom& gen,
-                  typename TTypes<int64>::Matrix output) {
+                  typename TTypes<OutputType>::Matrix output) {
     // Uniform, [0, 1).
     typedef random::UniformDistribution<random::PhiloxRandom, float> Dist;
     functor::FillPhiloxRandom<GPUDevice, Dist>()(ctx, d, gen, noises.data(),
@@ -111,11 +113,17 @@ struct MultinomialFunctor<GPUDevice, T> {
 };
 
 // Explicit instantiation of the GPU functors.
-template struct MultinomialFunctor<GPUDevice, Eigen::half>;
-template struct MultinomialFunctor<GPUDevice, float>;
-template struct MultinomialFunctor<GPUDevice, double>;
-template struct MultinomialFunctor<GPUDevice, int32>;
-template struct MultinomialFunctor<GPUDevice, int64>;
+template struct MultinomialFunctor<GPUDevice, Eigen::half, int32>;
+template struct MultinomialFunctor<GPUDevice, float, int32>;
+template struct MultinomialFunctor<GPUDevice, double, int32>;
+template struct MultinomialFunctor<GPUDevice, int32, int32>;
+template struct MultinomialFunctor<GPUDevice, int64, int32>;
+
+template struct MultinomialFunctor<GPUDevice, Eigen::half, int64>;
+template struct MultinomialFunctor<GPUDevice, float, int64>;
+template struct MultinomialFunctor<GPUDevice, double, int64>;
+template struct MultinomialFunctor<GPUDevice, int32, int64>;
+template struct MultinomialFunctor<GPUDevice, int64, int64>;
 
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 0db7c63b8b6a25f1d495dd937d49ec9d0615ab0a..a841291ddd7d4f64b0ab2b611c59307f4d11150f 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -653,6 +653,8 @@ BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
 // Benchmarks with different stride and padding options.
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7);
 BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);
+BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 3, 3, 1, SAME, conv9);
+BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 5, 5, 1, SAME, conv10);
 
 #define BM_ConvFloatDepthwiseBk(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, LABEL) \
   static void BM_ConvFloatDepthwiseBkInCPU1_##LABEL(int iters) {               \
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index d3d1b56c9d568487c768f1b1620d2880a3afc531..93ef5127789048b85740e276f76f97e7b46e8368 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -98,6 +98,19 @@ gtl::InlinedVector<T, 8> ComputeStride(const TensorShape& shape) {
   return strides;
 }
 
+// Helper to compute 'strides' given an Eigen TensorDimensions
+template <typename T, typename EigenDimensions>
+gtl::InlinedVector<T, 8> ComputeEigenStrides(const EigenDimensions& shape) {
+  const int ndims = shape.rank();
+  gtl::InlinedVector<T, 8> strides(ndims);
+  T stride = 1;
+  for (int i = ndims - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= static_cast<T>(shape[i]);
+  }
+  return strides;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_OPS_UTIL_H_
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 814128d99ac2acb4a10cfcb2907edb735eaca382..61675930135473b2e347522b39aa7bdd71f73673 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -140,6 +140,7 @@ class PackOp : public OpKernel {
 TF_CALL_ALL_TYPES(REGISTER_PACK);
 TF_CALL_QUANTIZED_TYPES(REGISTER_PACK);
 TF_CALL_bfloat16(REGISTER_PACK);
+TF_CALL_variant(REGISTER_PACK);
 
 #if defined(IS_MOBILE_PLATFORM) && !defined(SUPPORT_SELECTIVE_REGISTRATION)
 // Primarily used for SavedModel support on mobile.
diff --git a/tensorflow/core/kernels/padded_batch_dataset_op.cc b/tensorflow/core/kernels/padded_batch_dataset_op.cc
index cfc77690b568a3223ca33f359f47fe22de9b35ff..cef5bde1567fefdfdf5a8d989aaa9ef56657c704 100644
--- a/tensorflow/core/kernels/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/padded_batch_dataset_op.cc
@@ -181,16 +181,18 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
       padding_values.push_back(tensor::DeepCopy(padding_value_t));
     }
 
-    *output = new Dataset(batch_size, std::move(padded_shapes),
+    *output = new Dataset(ctx, batch_size, std::move(padded_shapes),
                           std::move(padding_values), input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(int64 batch_size, std::vector<PartialTensorShape> padded_shapes,
+    Dataset(OpKernelContext* ctx, int64 batch_size,
+            std::vector<PartialTensorShape> padded_shapes,
             std::vector<Tensor> padding_values, const DatasetBase* input)
-        : batch_size_(batch_size),
+        : GraphDatasetBase(ctx),
+          batch_size_(batch_size),
           padded_shapes_(std::move(padded_shapes)),
           padding_values_(std::move(padding_values)),
           input_(input) {
@@ -232,6 +234,47 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* batch_size = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
+
+      std::vector<Node*> padded_shapes;
+      padded_shapes.reserve(padded_shapes_.size());
+      for (int i = 0; i < padded_shapes_.size(); i++) {
+        Node* node;
+        Tensor t(DT_INT64, TensorShape({padded_shapes_[i].dims()}));
+        for (int j = 0; j < padded_shapes_[i].dims(); j++) {
+          t.vec<int64>()(j) = padded_shapes_[i].dim_size(j);
+        }
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        padded_shapes.emplace_back(node);
+      }
+
+      std::vector<Node*> padding_values;
+      padding_values.reserve(padding_values_.size());
+      for (const Tensor& t : padding_values_) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        padding_values.emplace_back(node);
+      }
+
+      AttrValue output_types;
+      b->BuildAttrValue(output_dtypes(), &output_types);
+
+      AttrValue N;
+      b->BuildAttrValue<int64>(padded_shapes_.size(), &N);
+
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {{0, input_graph_node}, {1, batch_size}},
+                        {{2, padded_shapes}, {3, padding_values}},
+                        {{"Toutput_types", output_types}, {"N", N}}, output));
+      return Status::OK();
+    }
+
    private:
     // Copies element into the index^th slice of parent (in the 0th dimension).
     //
@@ -248,17 +291,25 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         // Each row of `batch_elements` is a tuple of tensors from the
         // input iterator.
         std::vector<std::vector<Tensor>> batch_elements;
-        batch_elements.reserve(dataset()->batch_size_);
         {
           mutex_lock l(mu_);
-          *end_of_sequence = false;
-          for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
-               ++i) {
-            std::vector<Tensor> batch_element_tuple;
-            TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
-                                                    end_of_sequence));
-            if (!*end_of_sequence) {
-              batch_elements.push_back(std::move(batch_element_tuple));
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          } else {
+            *end_of_sequence = false;
+            batch_elements.reserve(dataset()->batch_size_);
+            for (int i = 0; i < dataset()->batch_size_ && !*end_of_sequence;
+                 ++i) {
+              std::vector<Tensor> batch_element_tuple;
+              TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &batch_element_tuple,
+                                                      end_of_sequence));
+              if (!*end_of_sequence) {
+                batch_elements.push_back(std::move(batch_element_tuple));
+              }
+            }
+            if (*end_of_sequence) {
+              input_impl_.reset();
             }
           }
         }
@@ -347,6 +398,28 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("exhausted"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("exhausted"))) {
+          input_impl_.reset();
+        } else {
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        }
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index d0f7683f3dd8d520339dfd132af8a101da3abd5a..9d35ecb66c00e0cf7a2298a9d324c910ed33c7cc 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/padding_fifo_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -156,7 +157,7 @@ void PaddingFIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                 // Finished.  Allocate attempt->tuple and
                 // copy from attempt->tuples to attempt->tuple.
                 attempt->tuple.reserve(num_components());
-                const std::vector<Tuple>& tuples = attempt->tuples;
+                std::vector<Tuple>& tuples = attempt->tuples;
 
                 std::vector<bool> dynamic_shape;
                 const int64 batch_size = tuples.size();
@@ -206,8 +207,10 @@ void PaddingFIFOQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                       attempt->context->SetStatus(CopyElementToLargerSlice(
                           tuples[index][i], &attempt->tuple[i], index));
                     } else {
-                      attempt->context->SetStatus(CopyElementToSlice(
-                          tuples[index][i], &attempt->tuple[i], index));
+                      attempt->context->SetStatus(
+                          batch_util::CopyElementToSlice(
+                              std::move(tuples[index][i]), &attempt->tuple[i],
+                              index));
                     }
                     if (!attempt->context->status().ok()) return kComplete;
                   }
diff --git a/tensorflow/core/kernels/parallel_map_dataset_op.cc b/tensorflow/core/kernels/parallel_map_dataset_op.cc
index 2be87f4bde6f28596213433fe287d351ccf0c721..5ba1ad222d8a7fb1ed6594b864402f39a15cf7ae 100644
--- a/tensorflow/core/kernels/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/parallel_map_dataset_op.cc
@@ -195,8 +195,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
           FunctionLibraryRuntime::Options opts;
           opts.step_id = CapturedFunction::generate_step_id();
-          ScopedStepContainer* step_container = new ScopedStepContainer(
-              opts.step_id, [this, ctx](const string& name) {
+          ScopedStepContainer* step_container =
+              new ScopedStepContainer(opts.step_id, [this](const string& name) {
                 dataset()
                     ->captured_func_->resource_manager()
                     ->Cleanup(name)
@@ -205,7 +205,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           opts.step_container = step_container;
           opts.runner = ctx->runner();
           dataset()->captured_func_->RunAsync(
-              opts, input_element, &result->return_values,
+              opts, std::move(input_element), &result->return_values,
               [result, step_container, result_index](Status ret_status) {
                 delete step_container;
                 result->status.Update(ret_status);
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 7dee751c4f3035c968b5cc69103fc1fb78bf4285..ac90f67ce0bb8d9acffc3868acffc1cdfbe0f492 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -143,7 +143,7 @@ void DnnPoolingOp<T>::Compute(
     perftools::gputools::dnn::PoolingMode pooling_mode,
     const std::vector<int32>& size, const std::vector<int32>& stride,
     Padding padding, TensorFormat data_format, const Tensor& tensor_in,
-    const TensorShape& tensor_out_shape) {
+    const TensorShape& tensor_out_shape, bool propagate_nans) {
   Tensor* tensor_out = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_out_shape, &tensor_out));
@@ -188,7 +188,8 @@ void DnnPoolingOp<T>::Compute(
       .set_vertical_stride(params.row_stride)
       .set_horizontal_stride(params.col_stride)
       .set_vertical_padding(params.pad_rows)
-      .set_horizontal_padding(params.pad_cols);
+      .set_horizontal_padding(params.pad_cols)
+      .set_propagate_nans(propagate_nans);
 
   perftools::gputools::dnn::BatchDescriptor input_desc;
   input_desc.set_count(params.tensor_in_batch)
@@ -237,7 +238,7 @@ void DnnPoolingGradOp<T>::Compute(
     const std::vector<int32>& size, const std::vector<int32>& stride,
     Padding padding, TensorFormat data_format, const Tensor* tensor_in,
     const Tensor* tensor_out, const Tensor& out_backprop,
-    const TensorShape& tensor_in_shape) {
+    const TensorShape& tensor_in_shape, bool propagate_nans) {
   CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
         (tensor_in && tensor_out))
       << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
@@ -327,7 +328,8 @@ void DnnPoolingGradOp<T>::Compute(
       .set_vertical_stride(params.row_stride)
       .set_horizontal_stride(params.col_stride)
       .set_vertical_padding(params.pad_rows)
-      .set_horizontal_padding(params.pad_cols);
+      .set_horizontal_padding(params.pad_cols)
+      .set_propagate_nans(propagate_nans);
 
   perftools::gputools::dnn::BatchDescriptor orig_output_desc;
   orig_output_desc.set_count(params.tensor_in_batch)
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
index b594f39fadf689ba6e5ed7163396bddee49c9246..14584565857087b5c2479ff9d5ad513bd283a4a7 100644
--- a/tensorflow/core/kernels/pooling_ops_common_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -44,7 +44,7 @@ class DnnPoolingOp {
                       const std::vector<int32>& size,
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor& tensor_in,
-                      const TensorShape& tensor_out_shape);
+                      const TensorShape& tensor_out_shape, bool propagate_nans);
 };
 
 // A helper class that launch the cudnn pooling backward operations.
@@ -60,7 +60,7 @@ class DnnPoolingGradOp {
                       const std::vector<int32>& stride, Padding padding,
                       TensorFormat data_format, const Tensor* tensor_in,
                       const Tensor* tensor_out, const Tensor& out_backprop,
-                      const TensorShape& tensor_in_shape);
+                      const TensorShape& tensor_in_shape, bool propagate_nans);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/prefetch_dataset_op.cc b/tensorflow/core/kernels/prefetch_dataset_op.cc
index a7aac508eb3f76a588f9fc39b761e33222a37041..b02269f525a8bec3b6ddb01a5039316a7c47a309 100644
--- a/tensorflow/core/kernels/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/prefetch_dataset_op.cc
@@ -14,9 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include <deque>
 
-#include "tensorflow/core/kernels/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
 
@@ -36,31 +37,17 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
     int64 buffer_size;
     OP_REQUIRES_OK(
         ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
+    OP_REQUIRES(ctx, buffer_size > 0,
+                errors::InvalidArgument("buffer_size must be > 0"));
 
-    // TODO(mrry): It seems unnatural to capture the params from *this
-    // kernel's* OpKernelContext, although the captured values should
-    // be the same for any kernel in the same session. Consider adding
-    // an IteratorContext* argument to Dataset::MakeIterator(), and
-    // threading the context information through that
-    // way. Alternatively, provide a session-scoped context that will
-    // provide this information to all users in the same session (and
-    // that will have the appropriate lifetime).
-    IteratorContext::Params params;
-    params.env = ctx->env();
-    params.resource_manager = ctx->resource_manager();
-    params.runner = *(ctx->runner());
-
-    *output = new Dataset(input, buffer_size, std::move(params));
+    *output = new Dataset(ctx, input, buffer_size);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(const DatasetBase* input, int64 buffer_size,
-            IteratorContext::Params ctx_params)
-        : input_(input),
-          buffer_size_(buffer_size),
-          ctx_params_(std::move(ctx_params)) {
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
+        : GraphDatasetBase(ctx), input_(input), buffer_size_(buffer_size) {
       input_->Ref();
     }
 
@@ -81,6 +68,18 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "PrefetchDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* buffer_size = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, buffer_size}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -135,7 +134,10 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
 
             // Wake the prefetch thread, in case it has been waiting
             // for space in the buffer.
-            cond_var_.notify_one();
+            // Also wake up threads from other calls to GetNext.
+            // TODO(mrry): Consider using different condition variables
+            // for GetNext and Prefetch.
+            cond_var_.notify_all();
             return s;
           } else if (prefetch_thread_finished_) {
             *end_of_sequence = true;
@@ -144,6 +146,69 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // Acquire both locks to ensure that the prefetch thread and
+        // all GetNext threads are blocked.
+        mutex_lock parent_l(parent_mu_);
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
+        for (size_t i = 0; i < buffer_.size(); i++) {
+          auto& buffer_element = buffer_[i];
+          TF_RETURN_IF_ERROR(WriteStatus(writer, i, buffer_element.status));
+          if (buffer_element.status.ok()) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat("buffer[", i, "].size")),
+                buffer_element.value.size()));
+            for (size_t j = 0; j < buffer_element.value.size(); j++) {
+              TF_RETURN_IF_ERROR(writer->WriteTensor(
+                  strings::StrCat("buffer[", i, "][", j, "]"),
+                  buffer_element.value[j]));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock parent_l(parent_mu_);
+        mutex_lock l(mu_);
+        buffer_.clear();
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        size_t buffer_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("buffer_size"), &temp));
+          buffer_size = static_cast<size_t>(temp);
+        }
+        for (size_t i = 0; i < buffer_size; i++) {
+          buffer_.emplace_back();
+          auto& buffer_element = buffer_.back();
+          TF_RETURN_IF_ERROR(ReadStatus(reader, i, &buffer_element.status));
+          if (buffer_element.status.ok()) {
+            size_t value_size;
+            {
+              int64 temp;
+              TF_RETURN_IF_ERROR(reader->ReadScalar(
+                  full_name(strings::StrCat("buffer[", i, "].size")), &temp));
+              value_size = static_cast<size_t>(temp);
+            }
+            buffer_element.value.reserve(value_size);
+            for (size_t j = 0; j < value_size; j++) {
+              buffer_element.value.emplace_back();
+              TF_RETURN_IF_ERROR(reader->ReadTensor(
+                  strings::StrCat("buffer[", i, "][", j, "]"),
+                  &buffer_element.value.back()));
+            }
+          }
+        }
+        return Status::OK();
+      }
+
      private:
       // A buffer element comprises a status and (if that status is
       // OK) a vector of tensors, representing an element of the input dataset.
@@ -187,6 +252,12 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           }
 
           // 2. Read the next element.
+          // Acquire the parent lock since we will be reading an element
+          // from the input iterator. Note that we do not wish to release
+          // this lock till we have added the fetched element to the
+          // `buffer_` else there will be local state that may be missed
+          // by SaveInternal.
+          mutex_lock parent_l(parent_mu_);
           bool end_of_sequence;
           BufferElement buffer_element;
           buffer_element.status = input_impl_->GetNext(
@@ -207,8 +278,50 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status WriteStatus(IteratorStateWriter* writer, size_t index,
+                         const Status& status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            CodeKey(index), static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
+                                                 status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatus(IteratorStateReader* reader, size_t index,
+                        Status* status) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(ErrorMessageKey(index), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      string CodeKey(size_t index) {
+        return full_name(strings::StrCat("status[", index, "].code"));
+      }
+
+      string ErrorMessageKey(size_t index) {
+        return full_name(strings::StrCat("status[", index, "].error_message"));
+      }
+
+      // This mutex is used to ensure exclusivity between multiple threads
+      // reading/writing this iterator's local state.
       mutex mu_;
-      const std::unique_ptr<IteratorBase> input_impl_;
+      // This mutex is used to ensure exclusivity between multiple threads
+      // accessing the parent iterator. We keep this separate from `mu_` to
+      // allow prefetching to run in parallel with GetNext calls.
+      mutex parent_mu_ ACQUIRED_BEFORE(mu_);
+      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
       condition_variable cond_var_;
       std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
@@ -218,7 +331,6 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
 
     const DatasetBase* const input_;
     const int64 buffer_size_;
-    const IteratorContext::Params ctx_params_;
   };
 };
 
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 4c406fc1ed9f86477a7c0eb7c88f7dd7833f796c..bab94f7f0ad1fd7609761aaabc4f76ae6eafeb7b 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/priority_queue.h"
 #include "tensorflow/core/kernels/queue_base.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -122,7 +123,7 @@ Status PriorityQueue::GetElementComponentFromBatch(
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -358,8 +359,8 @@ void PriorityQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
               const int index =
                   attempt->tuple[0].dim_size(0) - attempt->elements_requested;
               for (int i = 0; i < num_components(); ++i) {
-                attempt->context->SetStatus(
-                    CopyElementToSlice(tuple[i], &attempt->tuple[i], index));
+                attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                    std::move(tuple[i]), &attempt->tuple[i], index));
                 if (!attempt->context->status().ok()) return kComplete;
               }
               tuple.clear();
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index eae303b85e44f87fb7e895902d02ff225f13def5..a73581fbbc1e9db4af621b109496088ba2c7c7de 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -910,42 +910,41 @@ void TestComputeLerp4xAll() {
 
 }  // namespace tensorflow
 
-#if defined(__ANDROID__)
-int main(int argc, char** argv) {
-#define RUN_TEST(t)            \
-  LOG(INFO) << "Test: " << #t; \
-  tensorflow::t();
-#else
 #define RUN_TEST(t) \
   TEST(QuantizationUtilsTest, t) { tensorflow::t(); }
-#endif
 
-  RUN_TEST(TestFloatToQuantized);
-  RUN_TEST(TestQuantizedToFloat);
-  RUN_TEST(TestAvoidBias);
-  RUN_TEST(TestRequantizeInNewRange);
-  RUN_TEST(TestRequantizeInNewRangeRealData);
-  RUN_TEST(TestRequantizeInNewRange32To8Bit);
-  RUN_TEST(TestRequantizeManyInNewRange32To8Bit);
-  RUN_TEST(TestRequantizeManyInNewRange32To8BitUsingEigen);
-  RUN_TEST(TestRequantizeManyInNewRange32To8BitEigenVsNonEigen);
-  RUN_TEST(TestRequantizeManyInNewRange32To8BitSignedEigenVsNonEigen);
-  RUN_TEST(TestFloatTensorToQuantized);
-  RUN_TEST(TestRequantizeManyInNewRange8To32Bit);
-  RUN_TEST(TestFloatToQuantizedInPlaceUsingEigen);
-  RUN_TEST(TestOverflowWithEigen);
-  RUN_TEST(TestQuantizedTensorToFloat);
-  RUN_TEST(TestQuantizedToFloatInPlaceUsingEigen);
+RUN_TEST(TestFloatToQuantized);
+RUN_TEST(TestQuantizedToFloat);
+RUN_TEST(TestAvoidBias);
+RUN_TEST(TestRequantizeInNewRange);
+RUN_TEST(TestRequantizeInNewRangeRealData);
+RUN_TEST(TestRequantizeInNewRange32To8Bit);
+RUN_TEST(TestRequantizeManyInNewRange32To8Bit);
+RUN_TEST(TestRequantizeManyInNewRange32To8BitUsingEigen);
+RUN_TEST(TestRequantizeManyInNewRange32To8BitEigenVsNonEigen);
+RUN_TEST(TestRequantizeManyInNewRange32To8BitSignedEigenVsNonEigen);
+RUN_TEST(TestFloatTensorToQuantized);
+RUN_TEST(TestRequantizeManyInNewRange8To32Bit);
+RUN_TEST(TestFloatToQuantizedInPlaceUsingEigen);
+RUN_TEST(TestOverflowWithEigen);
+RUN_TEST(TestQuantizedTensorToFloat);
+RUN_TEST(TestQuantizedToFloatInPlaceUsingEigen);
 
 #if defined(__ANDROID__)
+
+RUN_TEST(BenchmarkRequantizeManyInNewRange);
+
 #ifdef QUANTIZATION_UTILS_USE_NEON
-  RUN_TEST(TestDivide64x2PowAll);
-  RUN_TEST(TestComputeLerp4xAll);
-#endif
 
-  tensorflow::BenchmarkRequantizeManyInNewRange();
+RUN_TEST(TestDivide64x2PowAll);
+RUN_TEST(TestComputeLerp4xAll);
+
+#endif  // QUANTIZATION_UTILS_USE_NEON
+
+#endif  // __ANDROID__
 
-  LOG(INFO) << "All tests complete.";
-  return 0;
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif
diff --git a/tensorflow/core/kernels/quantized_add_op.cc b/tensorflow/core/kernels/quantized_add_op.cc
index 8be0c567987df803455e05a40c6a6d82458dca59..337c8e5c17863cc06fba5828605ba5db85b22c31 100644
--- a/tensorflow/core/kernels/quantized_add_op.cc
+++ b/tensorflow/core/kernels/quantized_add_op.cc
@@ -489,7 +489,7 @@ class QuantizedAddOp : public OpKernel {
     // adding zero leaves the result unchanged, and to contain the largest of
     // the two input values with some room to spare.
     const float smallest_min = std::min(min_x, min_y);
-    const float largest_max = std::min(max_x, max_y);
+    const float largest_max = std::max(max_x, max_y);
     const float biggest_range =
         std::max(std::abs(smallest_min), std::abs(largest_max));
     const float output_range = (biggest_range * (1 << 14));
diff --git a/tensorflow/core/kernels/quantized_add_op_test.cc b/tensorflow/core/kernels/quantized_add_op_test.cc
index 74d16b282dff492f5493027390c39ee514f6f4c7..90bd145ad0c9b1da8805ecac7c49bd94c1db22ed 100644
--- a/tensorflow/core/kernels/quantized_add_op_test.cc
+++ b/tensorflow/core/kernels/quantized_add_op_test.cc
@@ -276,10 +276,10 @@ void BenchmarkVectorPlusTensor() {
   TimeAdd({100000, 100}, {100}, 1);
 }
 
-#if !defined(__ANDROID__)
+}  // end namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedAddOpTest, t) { t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
 
 RUN_TEST(TestManualScalar);
 RUN_TEST(TestManualVector);
@@ -288,24 +288,16 @@ RUN_TEST(TestScalar);
 RUN_TEST(TestVector);
 RUN_TEST(TestVectorPlusTensor);
 
-#undef RUN_TEST
+#if defined(__ANDROID__)
 
-#endif  // __ANDROID__
+RUN_TEST(BenchmarkTensorScalar);
+RUN_TEST(BenchmarkVector);
+RUN_TEST(BenchmarkVectorPlusTensor);
 
-}  // end namespace tensorflow
+#endif  // __ANDROID__
 
-#if defined(__ANDROID__)
 int main(int argc, char** argv) {
-  LOG(INFO) << "TestManualScalar:";
-  tensorflow::TestManualScalar();
-  LOG(INFO) << "TestManualVector:";
-  tensorflow::TestManualVector();
-  LOG(INFO) << "TestManualVectorPlusTensor:";
-  tensorflow::TestManualVectorPlusTensor();
-  tensorflow::BenchmarkTensorScalar();
-  tensorflow::BenchmarkVector();
-  tensorflow::BenchmarkVectorPlusTensor();
-  LOG(INFO) << "All tests complete";
-  return 0;
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif  // __ANDROID__
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index ee573f1bb805107299fed89df211275a1e81c35d..d67f1ab3ec28934bc08c11997a8b2f448c30ad91 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -174,13 +174,13 @@ class QuantizedConcatOp : public OpKernel {
     OP_REQUIRES(context, (input_mins.size() == N),
                 errors::InvalidArgument(
                     "QuantizedConcatOp : Expected mins input list length ",
-                    input_mins.size(), " to equal values length ", N))
+                    input_mins.size(), " to equal values length ", N));
     OpInputList input_maxes;
     OP_REQUIRES_OK(context, context->input_list("input_maxes", &input_maxes));
     OP_REQUIRES(context, (input_maxes.size() == N),
                 errors::InvalidArgument(
                     "QuantizedConcatOp : Expected maxes input list length ",
-                    input_maxes.size(), " to equal values length ", N))
+                    input_maxes.size(), " to equal values length ", N));
     const int input_dims = values[0].dims();
     const TensorShape& input_shape = values[0].shape();
     OP_REQUIRES(
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 3b0764bb9bf9ff00c71173c53cdb78b6ab3ac6ca..1921b83d12c0688a96bad0c561080a0189e49bbe 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -268,6 +268,13 @@ class Im2ColConvFunctor {
     Im2ColBufferResource<T1, chunk_value_count>* im2col_buffer_resource;
     std::function<Status(Im2ColBufferResource<T1, chunk_value_count>**)>
         creator = [](Im2ColBufferResource<T1, chunk_value_count>** resource) {
+#ifdef _MSC_VER
+          // MSVC complains about the capture of chunk_value_count which oddly
+          // works fine in conv_ops_using_gemm.cc for example.
+          // Define chunk_value_count inside the lambda for now.
+          const int64 chunk_value_count =
+              (kMaxChunkSize + (sizeof(T1) - 1)) / sizeof(T1);
+#endif
           *resource = new Im2ColBufferResource<T1, chunk_value_count>();
           return Status::OK();
         };
@@ -457,6 +464,19 @@ class QuantizedConv2DOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
+    std::vector<int32> dilations;
+    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations));
+    OP_REQUIRES(context, dilations.size() == 4,
+                errors::InvalidArgument("Dilations field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(context, dilations[1] == 1 && dilations[2] == 1,
+                errors::InvalidArgument(
+                    "Current implementation only supports dilated rate as 1 "
+                    "in the row and column dimensions."));
+    OP_REQUIRES(context, (dilations[0] == 1 && dilations[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "dilations in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
   }
 
diff --git a/tensorflow/core/kernels/quantized_instance_norm_test.cc b/tensorflow/core/kernels/quantized_instance_norm_test.cc
index 29d8dbc0dfcb48b83a1ec0da9085208ffb35c656..d2b15ee20bb89a28c9d7f8398435352107eb4d79 100644
--- a/tensorflow/core/kernels/quantized_instance_norm_test.cc
+++ b/tensorflow/core/kernels/quantized_instance_norm_test.cc
@@ -173,10 +173,10 @@ void TestClamp() {
   Expect(input_tensor, -10.0f, 10.0f, true, 0.0f, 1.0f);
 }
 
-#if !defined(__ANDROID__)
+}  // end namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedInstanceNormTest, t) { t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
 
 RUN_TEST(TestBasic);
 RUN_TEST(TestZeroInput);
@@ -184,19 +184,8 @@ RUN_TEST(TestMaxInput);
 RUN_TEST(TestOutputRangeGiven);
 RUN_TEST(TestClamp);
 
-#undef RUN_TEST
-
-#endif  // __ANDROID__
-
-}  // end namespace tensorflow
-
-#if defined(__ANDROID__)
 int main(int argc, char** argv) {
-  tensorflow::TestBasic();
-  tensorflow::TestZeroInput();
-  tensorflow::TestMaxInput();
-  tensorflow::TestOutputRangeGiven();
-  tensorflow::TestClamp();
-  return 0;
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif  // __ANDROID__
diff --git a/tensorflow/core/kernels/quantized_mul_op_test.cc b/tensorflow/core/kernels/quantized_mul_op_test.cc
index 45d6c51444a9981bd7567a6de8aaeb3f2c1720af..5f858eb8ce03be7d130649f814db5f1f9c68f18c 100644
--- a/tensorflow/core/kernels/quantized_mul_op_test.cc
+++ b/tensorflow/core/kernels/quantized_mul_op_test.cc
@@ -276,10 +276,10 @@ void BenchmarkVectorTimesTensor() {
   TimeMul({100000, 100}, {100}, 100);
 }
 
-#if !defined(__ANDROID__)
+}  // end namespace tensorflow
 
 #define RUN_TEST(t) \
-  TEST(QuantizedMulOpTest, t) { t(); }
+  TEST(QuantizedAddOpTest, t) { tensorflow::t(); }
 
 RUN_TEST(TestManualScalar);
 RUN_TEST(TestManualVector);
@@ -288,24 +288,16 @@ RUN_TEST(TestScalar);
 RUN_TEST(TestVector);
 RUN_TEST(TestVectorTimesTensor);
 
-#undef RUN_TEST
+#if defined(__ANDROID__)
 
-#endif  // __ANDROID__
+RUN_TEST(BenchmarkTensorScalar);
+RUN_TEST(BenchmarkVector);
+RUN_TEST(BenchmarkVectorTimesTensor);
 
-}  // end namespace tensorflow
+#endif  // __ANDROID__
 
-#if defined(__ANDROID__)
 int main(int argc, char** argv) {
-  LOG(INFO) << "TestManualScalar:";
-  tensorflow::TestManualScalar();
-  LOG(INFO) << "TestManualVector:";
-  tensorflow::TestManualVector();
-  LOG(INFO) << "TestManualVectorTimesTensor:";
-  tensorflow::TestManualVectorTimesTensor();
-  tensorflow::BenchmarkTensorScalar();
-  tensorflow::BenchmarkVector();
-  tensorflow::BenchmarkVectorTimesTensor();
-  LOG(INFO) << "All tests complete";
-  return 0;
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif  // __ANDROID__
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
index 8d3d7105a4a67637cffde4d6a66157789c9e2bdb..e6133415d0f5c143acad25ee6e681820e956cca8 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
@@ -373,22 +373,20 @@ void RunBenchmarkResizeBilinearTwoDims() {
 
 }  // namespace tensorflow
 
-#if defined(__ANDROID__)
-int main(int argc, char** argv) {
-#define RUN_TEST(t)            \
-  LOG(INFO) << "Test: " << #t; \
-  tensorflow::t();
-#else
 #define RUN_TEST(t) \
   TEST(QuantizationResizeBilenarTest, t) { tensorflow::t(); }
-#endif
 
-  RUN_TEST(TestResizeBilinearOneDim);
-  RUN_TEST(TestResizeBilinearTwoDims);
+RUN_TEST(TestResizeBilinearOneDim);
+RUN_TEST(TestResizeBilinearTwoDims);
 
 #if defined(__ANDROID__)
-  RUN_TEST(RunBenchmarkResizeBilinearTwoDims);
-  LOG(INFO) << "All tests complete.";
-  return 0;
+
+RUN_TEST(RunBenchmarkResizeBilinearTwoDims);
+
+#endif  // __ANDROID__
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
-#endif
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 8a9af39e1f7af5483bc72023915dfd408907a99a..330d161c32bc1a48b671765cacc21618545fa71a 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -46,25 +47,6 @@ Status HandleSliceToElement(const Tensor& parent, Tensor* element,
   return Status::OK();
 }
 
-template <DataType DT>
-Status HandleElementToSlice(const Tensor& element, Tensor* parent, int index) {
-  typedef typename EnumToDataType<DT>::Type T;
-  DCHECK_NE(parent->dim_size(0), 0);
-  DCHECK_GE(index, 0);
-  if (element.NumElements() != (parent->NumElements() / parent->dim_size(0))) {
-    TensorShape chip_shape = parent->shape();
-    chip_shape.RemoveDim(0);
-    return errors::Internal(
-        "HandleElementToSlice Cannot copy slice: number of elements does not "
-        "match.  Shapes are: [element]: ",
-        element.shape().DebugString(), ", [parent slice]: ",
-        chip_shape.DebugString());
-  }
-  auto parent_as_matrix = parent->flat_outer_dims<T>();
-  parent_as_matrix.chip(index, 0) = element.flat<T>();
-  return Status::OK();
-}
-
 }  // namespace
 
 QueueBase::QueueBase(int32 capacity, const DataTypeVector& component_dtypes,
@@ -354,63 +336,13 @@ void QueueBase::FlushUnlocked() {
 
 Status QueueBase::CopySliceToElement(const Tensor& parent, Tensor* element,
                                      int64 index) {
-#define HANDLE_TYPE(DT)                                                   \
-  if (parent.dtype() == DT) {                                             \
-    TF_RETURN_IF_ERROR(HandleSliceToElement<DT>(parent, element, index)); \
-    return Status::OK();                                                  \
-  }
-  HANDLE_TYPE(DT_FLOAT);
-  HANDLE_TYPE(DT_HALF);
-  HANDLE_TYPE(DT_DOUBLE);
-  HANDLE_TYPE(DT_INT32);
-  HANDLE_TYPE(DT_UINT8);
-  HANDLE_TYPE(DT_INT16);
-  HANDLE_TYPE(DT_INT8);
-  HANDLE_TYPE(DT_STRING);
-  HANDLE_TYPE(DT_COMPLEX64);
-  HANDLE_TYPE(DT_COMPLEX128);
-  HANDLE_TYPE(DT_INT64);
-  HANDLE_TYPE(DT_BOOL);
-  HANDLE_TYPE(DT_QINT8);
-  HANDLE_TYPE(DT_QUINT8);
-  HANDLE_TYPE(DT_QINT32);
-  HANDLE_TYPE(DT_QINT16);
-  HANDLE_TYPE(DT_QUINT16);
-  HANDLE_TYPE(DT_UINT16);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("CopySliceToElement Unhandled data type: ",
-                               parent.dtype());
+  return batch_util::CopySliceToElement(parent, element, index);
 }
 
-// Static method
+/* static */
 Status QueueBase::CopyElementToSlice(const Tensor& element, Tensor* parent,
                                      int64 index) {
-#define HANDLE_TYPE(DT)                                                   \
-  if (element.dtype() == DT) {                                            \
-    TF_RETURN_IF_ERROR(HandleElementToSlice<DT>(element, parent, index)); \
-    return Status::OK();                                                  \
-  }
-  HANDLE_TYPE(DT_FLOAT);
-  HANDLE_TYPE(DT_HALF);
-  HANDLE_TYPE(DT_DOUBLE);
-  HANDLE_TYPE(DT_INT32);
-  HANDLE_TYPE(DT_UINT8);
-  HANDLE_TYPE(DT_INT16);
-  HANDLE_TYPE(DT_INT8);
-  HANDLE_TYPE(DT_STRING);
-  HANDLE_TYPE(DT_COMPLEX64);
-  HANDLE_TYPE(DT_COMPLEX128);
-  HANDLE_TYPE(DT_INT64);
-  HANDLE_TYPE(DT_BOOL);
-  HANDLE_TYPE(DT_QINT8);
-  HANDLE_TYPE(DT_QUINT8);
-  HANDLE_TYPE(DT_QINT32);
-  HANDLE_TYPE(DT_QINT16);
-  HANDLE_TYPE(DT_QUINT16);
-  HANDLE_TYPE(DT_UINT16);
-#undef HANDLE_TYPE
-  return errors::Unimplemented("CopyElementToSlice Unhandled data type: ",
-                               element.dtype());
+  return batch_util::CopyElementToSlice(element, parent, index);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index c101fb35791eafa109f1a360fe63051398d48de5..5fb1c92f9422cb6cc1e6adb6e8e0a03a80acc767 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -79,6 +79,9 @@ class QueueBase : public QueueInterface {
                                    int64 index);
 
   // Copies element into the index^th slice (in the first dimension) of parent.
+  // NOTE(mrry): This method is deprecated. Use
+  // `tensorflow::batch_util::CopySliceToElement()` defined in
+  // "./batch_util.h" instead.
   static Status CopyElementToSlice(const Tensor& element, Tensor* parent,
                                    int64 index);
 
diff --git a/tensorflow/core/kernels/queue_ops.cc b/tensorflow/core/kernels/queue_ops.cc
index d51dc4ecb00f9501d544dbbbfbd4e92ebf515682..17831b74370bcd21cf7772f0ea6809ee840511c3 100644
--- a/tensorflow/core/kernels/queue_ops.cc
+++ b/tensorflow/core/kernels/queue_ops.cc
@@ -429,7 +429,7 @@ class QueueIsClosedOp : public QueueOpKernel {
  public:
   explicit QueueIsClosedOp(OpKernelConstruction* context)
      : QueueOpKernel(context) {}
- 
+
  protected:
   void ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                     DoneCallback callback) override {
diff --git a/tensorflow/core/kernels/random_dataset_op.cc b/tensorflow/core/kernels/random_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03d481a593f8b4dc537662ea50bedecd045dd5c8
--- /dev/null
+++ b/tensorflow/core/kernels/random_dataset_op.cc
@@ -0,0 +1,154 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class RandomDatasetOp : public DatasetOpKernel {
+ public:
+  explicit RandomDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    int64 seed;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed", &seed));
+
+    int64 seed2;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "seed2", &seed2));
+
+    // By TensorFlow convention, passing 0 for both seeds indicates
+    // that the shuffling should be seeded non-deterministically.
+    if (seed == 0 && seed2 == 0) {
+      seed = random::New64();
+      seed2 = random::New64();
+    }
+
+    *output = new Dataset(ctx, seed, seed2);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 seed, int64 seed2)
+        : GraphDatasetBase(ctx), seed_(seed), seed2_(seed2) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Random")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_INT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override {
+      return strings::StrCat("RandomDatasetOp(", seed_, ", ", seed2_,
+                             ")::Dataset");
+    }
+
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {seed, seed2}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            parent_generator_(dataset()->seed_, dataset()->seed2_),
+            generator_(&parent_generator_) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        Tensor value_tensor(cpu_allocator(), DT_INT64, {});
+        value_tensor.scalar<int64>()() = Random();
+        out_tensors->emplace_back(std::move(value_tensor));
+        *end_of_sequence = false;
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
+                                               num_random_samples_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
+                                              &num_random_samples_));
+        parent_generator_ =
+            random::PhiloxRandom(dataset()->seed_, dataset()->seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+        return Status::OK();
+      }
+
+     private:
+      random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random()
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        num_random_samples_++;
+        auto out = generator_();
+        return out;
+      }
+      mutex mu_;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    };
+
+    const int64 seed_;
+    const int64 seed2_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("RandomDataset").Device(DEVICE_CPU),
+                        RandomDatasetOp);
+
+}  // namespace
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index a37c757865177d87ea0c47dc325be32f711007eb..55a8b9c9b67455483689a135306017bed8974ade 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -577,7 +577,7 @@ struct FillPhiloxRandomKernel<Distribution, false> {
     const size_t kGroupSize = Distribution::kResultElementCount;
 
     const size_t item_id = item.get_global(0);
-    const size_t total_item_count = item.get_global_range(0);
+    const size_t total_item_count = item.get_global_range();
     size_t offset = item_id * kGroupSize;
     gen_.Skip(item_id);
 
@@ -633,7 +633,7 @@ struct FillPhiloxRandomKernel<Distribution, true> {
                                                 PhiloxRandom::kResultElementCount;
 
     const size_t item_id = item.get_global(0);
-    const size_t total_item_count = item.get_global_range(0);
+    const size_t total_item_count = item.get_global_range();
     size_t group_index = item_id;
     size_t offset = group_index * kGroupSize;
 
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 30bbbd4aed6924972f914c42eb8b0a7b9239f7ae..e9695cfde30945c9c99db85f33e44030e5d45054 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/batch_util.h"
 #include "tensorflow/core/kernels/queue_op.h"
 #include "tensorflow/core/kernels/typed_queue.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -170,7 +171,7 @@ Status RandomShuffleQueue::GetElementComponentFromBatch(
   TF_RETURN_IF_ERROR(ctx->allocate_persistent(
       tuple[component].dtype(), element_shape, out_tensor, &element_access));
   TF_RETURN_IF_ERROR(
-      CopySliceToElement(tuple[component], element_access, index));
+      batch_util::CopySliceToElement(tuple[component], element_access, index));
   return Status::OK();
 }
 
@@ -407,8 +408,8 @@ void RandomShuffleQueue::TryDequeueMany(int num_elements, OpKernelContext* ctx,
                   const int index = attempt->tuple[0].dim_size(0) -
                                     attempt->elements_requested;
                   for (int i = 0; i < num_components(); ++i) {
-                    attempt->context->SetStatus(CopyElementToSlice(
-                        tuple[i], &attempt->tuple[i], index));
+                    attempt->context->SetStatus(batch_util::CopyElementToSlice(
+                        std::move(tuple[i]), &attempt->tuple[i], index));
                     if (!attempt->context->status().ok()) return kComplete;
                   }
                   tuple.clear();
diff --git a/tensorflow/core/kernels/range_dataset_op.cc b/tensorflow/core/kernels/range_dataset_op.cc
index 7adfcc4f8d29c67007ae08a621fd5bef0eddd498..e7ae840fc7d023cda8c11ecd1f7cde3842a9da00 100644
--- a/tensorflow/core/kernels/range_dataset_op.cc
+++ b/tensorflow/core/kernels/range_dataset_op.cc
@@ -99,7 +99,6 @@ class RangeDatasetOp : public DatasetOpKernel {
         if ((dataset()->step_ > 0 && next_ >= dataset()->stop_) ||
             (dataset()->step_ < 0 && next_ <= dataset()->stop_)) {
           *end_of_sequence = true;
-          is_exhausted_ = true;
           return Status::OK();
         }
         Tensor value_tensor(cpu_allocator(), DT_INT64, {});
diff --git a/tensorflow/core/kernels/reader_dataset_ops.cc b/tensorflow/core/kernels/reader_dataset_ops.cc
index 39ef92a5dec0def5ae51e41feac38f1257693376..d942ddc4a7b9042038c6b7a2a52e46c1bf45b2a9 100644
--- a/tensorflow/core/kernels/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/reader_dataset_ops.cc
@@ -402,7 +402,6 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           // Iteration ends when there are no more files to process.
           if (current_file_index_ == dataset()->filenames_.size()) {
             *end_of_sequence = true;
-            is_exhausted_ = true;
             return Status::OK();
           }
 
@@ -512,15 +511,18 @@ class TFRecordDatasetOp : public DatasetOpKernel {
                 errors::InvalidArgument(
                     "`buffer_size` must be >= 0 (0 == no buffering)"));
 
-    *output = new Dataset(std::move(filenames), compression_type, buffer_size);
+    *output =
+        new Dataset(ctx, std::move(filenames), compression_type, buffer_size);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    explicit Dataset(std::vector<string> filenames,
+    explicit Dataset(OpKernelContext* ctx, std::vector<string> filenames,
                      const string& compression_type, int64 buffer_size)
-        : filenames_(std::move(filenames)),
+        : GraphDatasetBase(ctx),
+          filenames_(std::move(filenames)),
+          compression_type_(compression_type),
           options_(io::RecordReaderOptions::CreateRecordReaderOptions(
               compression_type)) {
       if (buffer_size > 0) {
@@ -547,6 +549,20 @@ class TFRecordDatasetOp : public DatasetOpKernel {
 
     string DebugString() override { return "TFRecordDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* filenames = nullptr;
+      TF_RETURN_IF_ERROR(b->AddVector(filenames_, &filenames));
+      Node* compression_type = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(compression_type_, &compression_type));
+      Node* buffer_size = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(options_.buffer_size, &buffer_size));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {filenames, compression_type, buffer_size}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -572,8 +588,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
 
             // We have reached the end of the current file, so maybe
             // move on to next file.
-            reader_.reset();
-            file_.reset();
+            ResetStreamsLocked();
             ++current_file_index_;
           }
 
@@ -583,17 +598,64 @@ class TFRecordDatasetOp : public DatasetOpKernel {
             return Status::OK();
           }
 
-          // Actually move on to next file.
-          const string& next_filename =
-              dataset()->filenames_[current_file_index_];
-          TF_RETURN_IF_ERROR(
-              ctx->env()->NewRandomAccessFile(next_filename, &file_));
-          reader_.reset(
-              new io::SequentialRecordReader(file_.get(), dataset()->options_));
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
         } while (true);
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("current_file_index"),
+                                               current_file_index_));
+
+        if (reader_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("offset"), reader_->TellOffset()));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        ResetStreamsLocked();
+        int64 current_file_index;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("current_file_index"),
+                                              &current_file_index));
+        current_file_index_ = size_t(current_file_index);
+        if (reader->Contains(full_name("offset"))) {
+          int64 offset;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("offset"), &offset));
+          TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env()));
+          TF_RETURN_IF_ERROR(reader_->SeekOffset(offset));
+        }
+        return Status::OK();
+      }
+
      private:
+      // Sets up reader streams to read from the file at `current_file_index_`.
+      Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (current_file_index_ >= dataset()->filenames_.size()) {
+          return errors::InvalidArgument(
+              "current_file_index_:", current_file_index_,
+              " >= filenames_.size():", dataset()->filenames_.size());
+        }
+
+        // Actually move on to next file.
+        const string& next_filename =
+            dataset()->filenames_[current_file_index_];
+        TF_RETURN_IF_ERROR(env->NewRandomAccessFile(next_filename, &file_));
+        reader_.reset(
+            new io::SequentialRecordReader(file_.get(), dataset()->options_));
+        return Status::OK();
+      }
+
+      // Resets all reader streams.
+      void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        reader_.reset();
+        file_.reset();
+      }
+
       mutex mu_;
       size_t current_file_index_ GUARDED_BY(mu_) = 0;
 
@@ -604,6 +666,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
     };
 
     const std::vector<string> filenames_;
+    const string compression_type_;
     io::RecordReaderOptions options_;
   };
 };
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
index 807ac0a4567790ef3fb95b4c12a91a1562f83fa7..5c537c5b9c75afef2b8f4ea5446f3d4012ed0cbb 100644
--- a/tensorflow/core/kernels/reduction_ops_min.cc
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -50,6 +50,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .TypeConstraint<int64>("Tidx")                                       \
           .HostMemory("reduction_indices"),                                    \
       ReductionOp<GPUDevice, type, int64, Eigen::internal::MinReducer<type>>);
+REGISTER_GPU_KERNELS(Eigen::half);
 REGISTER_GPU_KERNELS(float);
 REGISTER_GPU_KERNELS(double);
 
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index 9bbe993a2f93e522688738abaf41a518e95ef871..fe8ea59f1be521166d0e42295e79d1bb5a242750 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -174,6 +174,11 @@ static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
 }
 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
 
+static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
+  ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
+}
+BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);
+
 static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
   ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
 }
diff --git a/tensorflow/core/kernels/repeat_dataset_op.cc b/tensorflow/core/kernels/repeat_dataset_op.cc
index 9813e99a70bc51e725a2974e759f3708d4f9b4d3..3d977a0fa38be77ac812cb12aade2af20b871fb8 100644
--- a/tensorflow/core/kernels/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/repeat_dataset_op.cc
@@ -73,10 +73,10 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
     string DebugString() override { return "RepeatDatasetOp::Dataset"; }
 
    protected:
-    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                               Node** output) const override {
       Node* input_graph_node = nullptr;
-      TF_RETURN_IF_ERROR(b->AddParentDataset(input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
       Node* count = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
       TF_RETURN_IF_ERROR(
@@ -95,6 +95,15 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
         *end_of_sequence = true;
         return Status::OK();
       }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return Status::OK();
+      }
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        return Status::OK();
+      }
     };
 
     class FiniteIterator : public DatasetIterator<Dataset> {
@@ -108,6 +117,10 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         while (i_ < dataset()->count_) {
           TF_RETURN_IF_ERROR(
               input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -118,7 +131,6 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
           input_impl_ = dataset()->input_->MakeIterator(prefix());
         }
         *end_of_sequence = true;
-        is_exhausted_ = true;
         input_impl_.reset();
         return Status::OK();
       }
@@ -127,7 +139,12 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
-        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        }
         return Status::OK();
       }
 
@@ -135,7 +152,11 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
         TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
-        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
         return Status::OK();
       }
 
@@ -183,6 +204,29 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
         } while (true);
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("uninitialized"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("uninitialized"))) {
+          input_impl_.reset();
+        } else {
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        }
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/reshape_util.cc b/tensorflow/core/kernels/reshape_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4188ad233ea8f826fda28ee891a54ee9bd1156e3
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_util.cc
@@ -0,0 +1,149 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <numeric>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_util.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/util/sparse/sparse_tensor.h"
+
+namespace tensorflow {
+
+void Reshape(OpKernelContext *context, const Tensor &input_indices_in,
+             const Tensor &input_shape_in, const Tensor &target_shape_in,
+             int output_indices_idx, int output_shape_idx) {
+  OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_indices_in.shape()),
+              errors::InvalidArgument(
+                  "Input indices should be a matrix but received shape ",
+                  input_indices_in.shape().DebugString()));
+  OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
+              errors::InvalidArgument(
+                  "Input shape should be a vector but received shape ",
+                  input_shape_in.shape().DebugString()));
+  OP_REQUIRES(context, TensorShapeUtils::IsVector(target_shape_in.shape()),
+              errors::InvalidArgument(
+                  "Target shape should be a vector but received shape ",
+                  target_shape_in.shape().DebugString()));
+
+  const int64 input_rank = input_shape_in.NumElements();
+  const int64 output_rank = target_shape_in.NumElements();
+  const TensorShape input_shape(input_shape_in.vec<int64>());
+  const int64 dense_size = input_shape.num_elements();
+  const int64 nnz = input_indices_in.shape().dim_size(0);
+
+  // Compute the output shape. Determine product of specified dimensions, and
+  // find the index of the unspecified one.
+  TensorShape output_shape;
+  int64 product = 1;
+  int unknown_index = -1;
+  auto target_shape = target_shape_in.vec<int64>();
+  for (int d = 0; d < output_rank; ++d) {
+    const int64 size = target_shape(d);
+    if (size == -1) {
+      OP_REQUIRES(
+          context, unknown_index == -1,
+          errors::InvalidArgument("only one output dimension may be -1, "
+                                  "not both ",
+                                  unknown_index, " and ", d));
+      unknown_index = d;
+      output_shape.AddDim(1);
+    } else {
+      OP_REQUIRES(context, size >= 0,
+                  errors::InvalidArgument("size ", d,
+                                          " must be non-negative, not ", size));
+      product *= size;
+      output_shape.AddDim(size);
+    }
+  }
+  if (unknown_index != -1) {
+    OP_REQUIRES(
+        context, product > 0,
+        errors::InvalidArgument("reshape cannot infer the missing "
+                                "input size for an empty tensor unless all "
+                                "specified input sizes are non-zero"));
+    const int64 missing = dense_size / product;
+    OP_REQUIRES(
+        context, product * missing == dense_size,
+        errors::InvalidArgument(
+            "Input to reshape is a SparseTensor with ", dense_size,
+            " dense values, but the requested shape requires a multiple of ",
+            product));
+    output_shape.set_dim(unknown_index, missing);
+  }
+
+  OP_REQUIRES(
+      context, output_shape.num_elements() == dense_size,
+      errors::InvalidArgument("Input to reshape is a tensor with ", dense_size,
+                              " dense values, but the requested shape has ",
+                              output_shape.num_elements()));
+
+  // Optimize for reshaping to the same shape.
+  if (input_shape == output_shape) {
+    context->set_output(output_indices_idx, input_indices_in);
+    context->set_output(output_shape_idx, input_shape_in);
+    return;
+  }
+
+  gtl::InlinedVector<int64, 8> input_strides(input_rank);
+  input_strides[input_rank - 1] = 1;
+  for (int d = input_rank - 2; d >= 0; --d) {
+    input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1);
+  }
+
+  gtl::InlinedVector<int64, 8> output_strides(output_rank);
+  output_strides[output_rank - 1] = 1;
+  for (int d = output_rank - 2; d >= 0; --d) {
+    output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
+  }
+
+  Tensor *result_indices = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(output_indices_idx,
+                                          TensorShape({nnz, output_rank}),
+                                          &result_indices));
+  auto input_ind = input_indices_in.matrix<int64>();
+  auto output_ind = result_indices->matrix<int64>();
+  for (int i = 0; i < nnz; ++i) {
+    int64 id = 0;
+    for (int j = 0; j < input_rank; ++j) {
+      id += input_ind(i, j) * input_strides[j];
+    }
+    for (int j = 0; j < output_rank; ++j) {
+      output_ind(i, j) = id / output_strides[j];
+      id %= output_strides[j];
+    }
+  }
+
+  Tensor *result_shape = nullptr;
+  OP_REQUIRES_OK(context, context->allocate_output(output_shape_idx,
+                                                   TensorShape({output_rank}),
+                                                   &result_shape));
+  auto output_shape_vec = result_shape->vec<int64>();
+  for (int j = 0; j < output_shape.dims(); ++j) {
+    output_shape_vec(j) = output_shape.dim_size(j);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reshape_util.h b/tensorflow/core/kernels/reshape_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed583afd13824eff789ea556045507fb4cff44e6
--- /dev/null
+++ b/tensorflow/core/kernels/reshape_util.h
@@ -0,0 +1,31 @@
+
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Reshapes the input indices and input shape to the target shape.
+void Reshape(OpKernelContext *context, const Tensor &input_indices_in,
+             const Tensor &input_shape_in, const Tensor &target_shape_in,
+             int output_indices_idx, int output_shape_idx);
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_RESHAPE_UTIL_H_
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 4f2afa5257966e525f0191adb04b925417e3dde2..7ac34d1c62376f40f9d30397cad71233db9468dc 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -35,7 +36,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 namespace {
 
@@ -43,7 +44,7 @@ namespace {
 // NUM_CHANNELS can be <= 0 to compute it dynamically from <input>
 // Otherwise, it must equal input.dim_size(2) and is used as a compile-time
 // constant.
-template <int NUM_CHANNELS>
+template <typename T, int NUM_CHANNELS>
 void ReverseRows(OpKernelContext* context, const Tensor& input,
                  Tensor* result) {
   auto work = [&input, result](int64 start, int64 end) {
@@ -53,8 +54,8 @@ void ReverseRows(OpKernelContext* context, const Tensor& input,
     const int64 row_size = inner_size * middle_size;
     DCHECK_EQ(input.dim_size(2), inner_size);
 
-    const int32* in_ptr = input.bit_casted_tensor<int32, 3>().data();
-    int32* out_ptr = result->bit_casted_tensor<int32, 3>().data();
+    const T* in_ptr = input.bit_casted_tensor<T, 3>().data();
+    T* out_ptr = result->bit_casted_tensor<T, 3>().data();
 
     in_ptr += start * row_size;
     out_ptr += start * row_size;
@@ -64,7 +65,7 @@ void ReverseRows(OpKernelContext* context, const Tensor& input,
       int remaining = middle_size;
       while (remaining > 0) {
         out_ptr -= inner_size;
-        memcpy(out_ptr, in_ptr, inner_size * sizeof(float));
+        memcpy(out_ptr, in_ptr, inner_size * sizeof(T));
         in_ptr += inner_size;
         --remaining;
       }
@@ -81,6 +82,48 @@ void ReverseRows(OpKernelContext* context, const Tensor& input,
         std::move(work));
 }
 
+template <typename T>
+struct data_type_can_memcpy {
+  static constexpr bool value =
+      std::is_same<T, uint8>::value || std::is_same<T, int8>::value ||
+      std::is_same<T, bool>::value || std::is_same<T, uint16>::value ||
+      std::is_same<T, int16>::value || std::is_same<T, Eigen::half>::value ||
+      std::is_same<T, int32>::value || std::is_same<T, float>::value ||
+      std::is_same<T, int64>::value || std::is_same<T, double>::value ||
+      std::is_same<T, complex64>::value || std::is_same<T, complex128>::value;
+};
+
+template <typename T, int NUM_CHANNELS>
+typename std::enable_if<data_type_can_memcpy<T>::value>::type
+DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
+                    Tensor* result) {
+  if (sizeof(T) == 1) {
+    static_assert(sizeof(uint8) == 1, "uint8 must be 1 byte.");
+    ReverseRows<uint8, NUM_CHANNELS>(context, input, result);
+  } else if (sizeof(T) == 2) {
+    static_assert(sizeof(uint16) == 2, "uint16 must be 2 bytes");
+    ReverseRows<uint16, NUM_CHANNELS>(context, input, result);
+  } else if (sizeof(T) == 4) {
+    static_assert(sizeof(uint32) == 4, "uint32 must be 4 bytes");
+    ReverseRows<uint32, NUM_CHANNELS>(context, input, result);
+  } else if (sizeof(T) == 8) {
+    static_assert(sizeof(uint64) == 8, "uint64 must be 8 bytes");
+    ReverseRows<uint64, NUM_CHANNELS>(context, input, result);
+  } else if (sizeof(T) == 16) {
+    static_assert(sizeof(complex128) == 16, "complex128 must be 16 bytes");
+    ReverseRows<complex128, NUM_CHANNELS>(context, input, result);
+  } else {
+    context->CtxFailure(
+        errors::InvalidArgument("%s has unexpected size of %d bytes",
+                                DataTypeString(input.dtype()), sizeof(T)));
+  }
+}
+
+template <typename T, int NUM_CHANNELS>
+typename std::enable_if<!data_type_can_memcpy<T>::value>::type
+DoHandleReverseCase(OpKernelContext* context, const Tensor& input,
+                    Tensor* result) {}
+
 }  // namespace
 
 template <typename Device, typename T, int NDIMS>
@@ -91,15 +134,14 @@ void HandleReverseCase(OpKernelContext* context,
 
   // Use optimized reverse if possible.
   if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
-      std::is_same<T, float>::value && (!dims(0) && dims(1) && !dims(2))) {
+      data_type_can_memcpy<T>::value && (!dims(0) && dims(1) && !dims(2))) {
     if (input.dim_size(2) == 3) {
-      ReverseRows<3>(context, input, result);
+      DoHandleReverseCase<T, 3>(context, input, result);
     } else {
-      ReverseRows<-1>(context, input, result);
+      DoHandleReverseCase<T, -1>(context, input, result);
     }
     return;
   }
-
   typename Eigen::array<bool, NDIMS> axes_di;
   for (int i = 0; i < NDIMS; i++) {
     axes_di[i] = dims(i);
@@ -168,11 +210,11 @@ void HandleReverseV2Case(OpKernelContext* context,
 
   // Use optimized reverse if possible.
   if (NDIMS == 3 && std::is_same<Device, CPUDevice>::value &&
-      std::is_same<T, float>::value && (!axes[0] && axes[1] && !axes[2])) {
+      data_type_can_memcpy<T>::value && (!axes[0] && axes[1] && !axes[2])) {
     if (input.dim_size(2) == 3) {
-      ReverseRows<3>(context, input, result);
+      DoHandleReverseCase<T, 3>(context, input, result);
     } else {
-      ReverseRows<-1>(context, input, result);
+      DoHandleReverseCase<T, -1>(context, input, result);
     }
     return;
   }
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index 9829e40fe85656d1fa0f59787c419c59190c0aea..e8285fb0e24842b37415be9aaa62afa152897d22 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -46,69 +46,132 @@ class ReverseOpTest : public OpsTestBase {
                      .Finalize(node_def()));
     TF_ASSERT_OK(InitOp());
   }
+
+  template <typename T>
+  void Reverse_0() {
+    MakeOp(DataTypeToEnum<T>::value);
+    AddInputFromArray<T>(TensorShape({}), {3});
+    AddInputFromArray<bool>(TensorShape({}), {true});
+    TF_ASSERT_OK(RunOpKernel());
+
+    Tensor* output = GetOutput(0);
+    Tensor expected(allocator(), DataTypeToEnum<T>::value, TensorShape({}));
+    expected.scalar<T>() = expected.scalar<T>().constant(3);
+    test::ExpectTensorEqual<T>(expected, *output);
+  }
+
+  template <typename T>
+  void Reverse_234() {
+    MakeOp(DataTypeToEnum<T>::value);
+    // Feed and run
+    // [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
+    //  [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]
+    AddInputFromArray<T>(TensorShape({2, 3, 4}),
+                         {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                          12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+    AddInputFromArray<bool>(TensorShape({3}), {true, false, true});
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Check the new state of the input
+    Tensor* params_tensor = GetOutput(0);
+    Tensor expected(allocator(), DataTypeToEnum<T>::value,
+                    TensorShape({2, 3, 4}));
+    // Should become
+    // [[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
+    //  [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]
+    test::FillValues<T>(&expected,
+                        {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20,
+                         3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8});
+    test::ExpectTensorEqual<T>(expected, *params_tensor);
+  }
+
+  template <typename T>
+  void Reverse_1234() {
+    MakeOp(DataTypeToEnum<T>::value);
+    // Feed and run
+    // [[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
+    //   [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]]
+    AddInputFromArray<T>(TensorShape({1, 2, 3, 4}),
+                         {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                          12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23});
+    AddInputFromArray<bool>(TensorShape({4}), {true, true, false, true});
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Check the new state of the input
+    Tensor* params_tensor = GetOutput(0);
+    Tensor expected(allocator(), DataTypeToEnum<T>::value,
+                    TensorShape({1, 2, 3, 4}));
+    // Should become
+    // [[[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
+    //   [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]]
+    test::FillValues<T>(&expected,
+                        {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20,
+                         3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8});
+    test::ExpectTensorEqual<T>(expected, *params_tensor);
+  }
 };
 
-TEST_F(ReverseOpTest, Reverse_0) {
-  MakeOp(DT_FLOAT);
-  AddInputFromArray<float>(TensorShape({}), {3});
-  AddInputFromArray<bool>(TensorShape({}), {true});
-  TF_ASSERT_OK(RunOpKernel());
+TEST_F(ReverseOpTest, Reverse_0_uint8) { Reverse_0<uint8>(); }
 
-  Tensor* output = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({}));
-  expected.scalar<float>() = expected.scalar<float>().constant(3.f);
-  test::ExpectTensorEqual<float>(expected, *output);
-}
+TEST_F(ReverseOpTest, Reverse_0_int8) { Reverse_0<int8>(); }
 
-TEST_F(ReverseOpTest, Reverse_234) {
-  MakeOp(DT_FLOAT);
-
-  // Feed and run
-  // [[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
-  //  [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]
-  AddInputFromArray<float>(TensorShape({2, 3, 4}),
-                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-                            15, 16, 17, 18, 19, 20, 21, 22, 23});
-  AddInputFromArray<bool>(TensorShape({3}), {true, false, true});
-
-  TF_ASSERT_OK(RunOpKernel());
-
-  // Check the new state of the input
-  Tensor* params_tensor = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
-  // Should become
-  // [[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
-  //  [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]
-  test::FillValues<float>(
-      &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7,
-                  6, 5, 4, 11, 10, 9, 8});
-  test::ExpectTensorEqual<float>(expected, *params_tensor);
-}
+TEST_F(ReverseOpTest, Reverse_0_uint16) { Reverse_0<uint16>(); }
 
-TEST_F(ReverseOpTest, Reverse_1234) {
-  MakeOp(DT_FLOAT);
-
-  // Feed and run
-  // [[[[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]
-  //   [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]]]
-  AddInputFromArray<float>(TensorShape({1, 2, 3, 4}),
-                           {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-                            15, 16, 17, 18, 19, 20, 21, 22, 23});
-  AddInputFromArray<bool>(TensorShape({4}), {true, true, false, true});
-
-  TF_ASSERT_OK(RunOpKernel());
-
-  // Check the new state of the input
-  Tensor* params_tensor = GetOutput(0);
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 2, 3, 4}));
-  // Should become
-  // [[[[15, 14, 13, 12], [19, 18, 17, 16], [23, 22, 21, 20]]
-  //   [[3, 2, 1, 0], [7, 6, 5, 4], [11, 10, 9, 8]]]]
-  test::FillValues<float>(
-      &expected, {15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 3, 2, 1, 0, 7,
-                  6, 5, 4, 11, 10, 9, 8});
-  test::ExpectTensorEqual<float>(expected, *params_tensor);
-}
+TEST_F(ReverseOpTest, Reverse_0_int16) { Reverse_0<int16>(); }
+
+TEST_F(ReverseOpTest, Reverse_0_float) { Reverse_0<float>(); }
+
+TEST_F(ReverseOpTest, Reverse_0_int32) { Reverse_0<int32>(); }
+
+TEST_F(ReverseOpTest, Reverse_0_int64) { Reverse_0<int64>(); }
+
+TEST_F(ReverseOpTest, Reverse_0_double) { Reverse_0<double>(); }
+
+TEST_F(ReverseOpTest, Reverse_0_complex64) { Reverse_0<complex64>(); }
+
+TEST_F(ReverseOpTest, Reverse_0_complex128) { Reverse_0<complex128>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_uint8) { Reverse_234<uint8>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_int8) { Reverse_234<int8>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_uint16) { Reverse_234<uint16>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_int16) { Reverse_234<int16>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_float) { Reverse_234<float>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_int32) { Reverse_234<int32>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_int64) { Reverse_234<int64>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_double) { Reverse_234<double>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_complex64) { Reverse_234<complex64>(); }
+
+TEST_F(ReverseOpTest, Reverse_234_complex128) { Reverse_234<complex128>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_uint8) { Reverse_1234<uint8>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_int8) { Reverse_1234<int8>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_uint16) { Reverse_1234<uint16>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_int16) { Reverse_1234<int16>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_float) { Reverse_1234<float>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_int32) { Reverse_1234<int32>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_int64) { Reverse_1234<int64>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_double) { Reverse_1234<double>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_complex64) { Reverse_1234<complex64>(); }
+
+TEST_F(ReverseOpTest, Reverse_1234_complex128) { Reverse_1234<complex128>(); }
 
 static SessionOptions GetOptions(int intra_threads) {
   SessionOptions opts;
@@ -119,10 +182,11 @@ static SessionOptions GetOptions(int intra_threads) {
 
 // Creates a Graph which "reduce"s a 3D float tensor of "num" elements
 // into a scalar.
+template <typename T>
 static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
   Graph* g = new Graph(OpRegistry::Global());
-  Tensor data(DT_FLOAT, shape);
-  data.flat<float>().setRandom();
+  Tensor data(DataTypeToEnum<T>::value, shape);
+  data.flat<T>().setRandom();
   Tensor axes(DT_INT32, TensorShape({1}));
   axes.flat<int32>()(0) = reverse_axis;
   test::graph::Reverse(g, test::graph::Constant(g, data),
@@ -130,81 +194,149 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
   return g;
 }
 
+template <typename T>
 static void RunReverseRowsBenchmark(int iters, int outer_dim, int middle_dim,
                                     int intra_threads, int channels) {
   SessionOptions opts = GetOptions(intra_threads);
   TensorShape shape{outer_dim, middle_dim, channels};
   const int64 num_items = static_cast<int64>(iters) * shape.num_elements();
   testing::ItemsProcessed(num_items);
-  testing::BytesProcessed(num_items * sizeof(float));
+  testing::BytesProcessed(num_items * sizeof(T));
   testing::UseRealTime();
-  test::Benchmark("cpu", Reverse(shape, 1), &opts).Run(iters);
+  test::Benchmark("cpu", Reverse<T>(shape, 1), &opts).Run(iters);
 }
 
-static void BM_ReverseRowsOf1Channel_1T(int iters, int outer_dim,
-                                        int middle_dim) {
-  RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */,
-                          1 /* channels */);
+static void BM_ReverseRowsOf1Channel_1T_float(int iters, int outer_dim,
+                                              int middle_dim) {
+  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+                                 1 /* intra_threads */, 1 /* channels */);
 }
 
-BENCHMARK(BM_ReverseRowsOf1Channel_1T)
+BENCHMARK(BM_ReverseRowsOf1Channel_1T_float)
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);
 
-static void BM_ReverseRowsOf1Channel_4T(int iters, int outer_dim,
-                                        int middle_dim) {
-  RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */,
-                          1 /* channels */);
+static void BM_ReverseRowsOf1Channel_1T_uint8(int iters, int outer_dim,
+                                              int middle_dim) {
+  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+                                 1 /* intra_threads */, 1 /* channels */);
 }
 
-BENCHMARK(BM_ReverseRowsOf1Channel_4T)
+BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8)
+    ->ArgPair(288, 288)
+    ->ArgPair(1024, 1024)
+    ->ArgPair(10 * 1024, 1024);
+
+static void BM_ReverseRowsOf1Channel_4T_float(int iters, int outer_dim,
+                                              int middle_dim) {
+  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+                                 4 /* intra_threads */, 1 /* channels */);
+}
+
+BENCHMARK(BM_ReverseRowsOf1Channel_4T_float)
+    ->ArgPair(288, 288)
+    ->ArgPair(1024, 1024)
+    ->ArgPair(10 * 1024, 1024);
+
+static void BM_ReverseRowsOf1Channel_4T_uint8(int iters, int outer_dim,
+                                              int middle_dim) {
+  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+                                 4 /* intra_threads */, 1 /* channels */);
+}
+
+BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8)
+    ->ArgPair(288, 288)
+    ->ArgPair(1024, 1024)
+    ->ArgPair(10 * 1024, 1024);
+
+static void BM_ReverseRowsOf3Channels_1T_float(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+                                 1 /* intra_threads */, 3 /* channels */);
+}
+
+BENCHMARK(BM_ReverseRowsOf3Channels_1T_float)
+    ->ArgPair(288, 288)
+    ->ArgPair(30, 30)
+    ->ArgPair(1024, 1024)
+    ->ArgPair(10 * 1024, 1024);
+
+static void BM_ReverseRowsOf3Channels_1T_uint8(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+                                 1 /* intra_threads */, 3 /* channels */);
+}
+
+BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8)
+    ->ArgPair(288, 288)
+    ->ArgPair(30, 30)
+    ->ArgPair(1024, 1024)
+    ->ArgPair(10 * 1024, 1024);
+
+static void BM_ReverseRowsOf3Channels_4T_float(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+                                 4 /* intra_threads */, 3 /* channels */);
+}
+
+BENCHMARK(BM_ReverseRowsOf3Channels_4T_float)
+    ->ArgPair(288, 288)
+    ->ArgPair(30, 30)
+    ->ArgPair(1024, 1024)
+    ->ArgPair(10 * 1024, 1024);
+
+static void BM_ReverseRowsOf3Channels_4T_uint8(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+                                 4 /* intra_threads */, 3 /* channels */);
+}
+BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8)
     ->ArgPair(288, 288)
+    ->ArgPair(30, 30)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);
 
-static void BM_ReverseRowsOf3Channels_1T(int iters, int outer_dim,
-                                         int middle_dim) {
-  RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */,
-                          3 /* channels */);
+static void BM_ReverseRowsOf4Channels_1T_float(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+                                 1 /* intra_threads */, 4 /* channels */);
 }
 
-BENCHMARK(BM_ReverseRowsOf3Channels_1T)
+BENCHMARK(BM_ReverseRowsOf4Channels_1T_float)
     ->ArgPair(288, 288)
-    ->ArgPair(224, 224)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);
 
-static void BM_ReverseRowsOf3Channels_4T(int iters, int outer_dim,
-                                         int middle_dim) {
-  RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */,
-                          3 /* channels */);
+static void BM_ReverseRowsOf4Channels_1T_uint8(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+                                 1 /* intra_threads */, 4 /* channels */);
 }
 
-BENCHMARK(BM_ReverseRowsOf3Channels_4T)
+BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8)
     ->ArgPair(288, 288)
-    ->ArgPair(224, 224)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);
 
-static void BM_ReverseRowsOf4Channels_1T(int iters, int outer_dim,
-                                         int middle_dim) {
-  RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 1 /* intra_threads */,
-                          4 /* channels */);
+static void BM_ReverseRowsOf4Channels_4T_float(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<float>(iters, outer_dim, middle_dim,
+                                 4 /* intra_threads */, 4 /* channels */);
 }
 
-BENCHMARK(BM_ReverseRowsOf4Channels_1T)
+BENCHMARK(BM_ReverseRowsOf4Channels_4T_float)
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);
 
-static void BM_ReverseRowsOf4Channels_4T(int iters, int outer_dim,
-                                         int middle_dim) {
-  RunReverseRowsBenchmark(iters, outer_dim, middle_dim, 4 /* intra_threads */,
-                          4 /* channels */);
+static void BM_ReverseRowsOf4Channels_4T_uint8(int iters, int outer_dim,
+                                               int middle_dim) {
+  RunReverseRowsBenchmark<uint8>(iters, outer_dim, middle_dim,
+                                 4 /* intra_threads */, 4 /* channels */);
 }
 
-BENCHMARK(BM_ReverseRowsOf4Channels_4T)
+BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8)
     ->ArgPair(288, 288)
     ->ArgPair(1024, 1024)
     ->ArgPair(10 * 1024, 1024);
diff --git a/tensorflow/core/kernels/scan_dataset_op.cc b/tensorflow/core/kernels/scan_dataset_op.cc
index 76c219f1ae6352f047035b1bfd3231689d0d3771..d0ba210a0c85c5ec0de8399f59ad0d13331d2c8a 100644
--- a/tensorflow/core/kernels/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/scan_dataset_op.cc
@@ -132,7 +132,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         FunctionLibraryRuntime::Options opts;
         opts.step_id = CapturedFunction::generate_step_id();
         ScopedStepContainer step_container(
-            opts.step_id, [this, ctx](const string& name) {
+            opts.step_id, [this](const string& name) {
               dataset()
                   ->captured_func_->resource_manager()
                   ->Cleanup(name)
@@ -143,8 +143,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         std::vector<Tensor> state_and_output;
         state_and_output.reserve(dataset()->state_types_.size() +
                                  output_dtypes().size());
-        Status s =
-            dataset()->captured_func_->Run(opts, args, &state_and_output);
+        Status s = dataset()->captured_func_->Run(opts, std::move(args),
+                                                  &state_and_output);
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 484932ab0157dee1685b2b90a6c013c11dac061d..3a95dd1773398509e81a514f07fd79f5cb9a0928 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/scatter_nd_op.h"
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -28,6 +29,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -83,7 +86,10 @@ class ScatterNdUpdateOp : public OpKernel {
     const DataType dt = DataTypeToEnum<T>::v();
     const DataType dt_ref = DataTypeToEnum<T>::ref();
     const DataType index_t = DataTypeToEnum<Index>::v();
-    if (IsRefType(c->input_type(0))) {
+    dtype_ = c->input_type(0);
+    if (c->input_type(0) == DT_RESOURCE) {
+      // TODO(apassos): what to validate here?
+    } else if (IsRefType(c->input_type(0))) {
       OP_REQUIRES_OK(c, c->MatchSignature({dt_ref, index_t, dt}, {dt_ref}));
       OP_REQUIRES_OK(c, c->GetAttr("use_locking", &use_exclusive_lock_));
     } else {
@@ -93,7 +99,16 @@ class ScatterNdUpdateOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    if (use_exclusive_lock_) {
+    if (dtype_ == DT_RESOURCE) {
+      if (use_exclusive_lock_) {
+        Var* v;
+        OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+        mutex_lock m(*v->mu());
+        DoCompute(c);
+      } else {
+        DoCompute(c);
+      }
+    } else if (use_exclusive_lock_) {
       // If we're here, it means the input type is a ref.
       DCHECK(IsRefType(c->input_dtype(0)));
       // Hold mutex while we apply updates
@@ -105,6 +120,7 @@ class ScatterNdUpdateOp : public OpKernel {
   }
 
  private:
+  DataType dtype_;
   bool use_exclusive_lock_;
 
   void DoCompute(OpKernelContext* c) {
@@ -113,7 +129,20 @@ class ScatterNdUpdateOp : public OpKernel {
     Tensor params;
     TensorShape params_shape;
 
-    if (IsRefType(c->input_dtype(0))) {
+    if (dtype_ == DT_RESOURCE) {
+      Var* v;
+      OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+      Tensor* t = v->tensor();
+      if (!use_exclusive_lock_) {
+        // We're not holding the lock in the outer scope so need it here.
+        mutex_lock m(*v->mu());
+        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
+      } else {
+        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
+      }
+      params = *t;
+      params_shape = params.shape();
+    } else if (IsRefType(c->input_dtype(0))) {
       params = c->mutable_input(0, use_exclusive_lock_);
       params_shape = params.shape();
       c->forward_ref_input_to_ref_output(0, 0);
@@ -159,6 +188,16 @@ class ScatterNdUpdateOp : public OpKernel {
           .TypeConstraint<index_type>("Tindices"),                           \
       ScatterNdUpdateOp<dev##Device, type, index_type, op>)
 
+#define REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, index_type, \
+                                                         dev, name, op)    \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name(name)                                                           \
+          .Device(DEVICE_##dev)                                            \
+          .TypeConstraint<type>("T")                                       \
+          .TypeConstraint<index_type>("Tindices")                          \
+          .HostMemory("ref"),                                              \
+      ScatterNdUpdateOp<dev##Device, type, index_type, op>)
+
 #define REGISTER_SCATTER_ND_KERNEL(type, dev, name)         \
   REGISTER_SCATTER_ND_KERNEL_INDEX(type, int32, dev, name); \
   REGISTER_SCATTER_ND_KERNEL_INDEX(type, int64, dev, name)
@@ -167,6 +206,11 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, op); \
   REGISTER_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
 
+#define REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(type, dev, name, op)    \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int32, dev, name, \
+                                                   op);                    \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL_INDEX(type, int64, dev, name, op)
+
 #define REGISTER_SCATTER_ND_ADD_SUB(type, dev)                            \
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdAdd",            \
                                     scatter_nd_op::UpdateOp::ADD);        \
@@ -178,9 +222,11 @@ class ScatterNdUpdateOp : public OpKernel {
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
 
-#define REGISTER_SCATTER_ND_UPDATE(type, dev)                     \
-  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdUpdate", \
-                                    scatter_nd_op::UpdateOp::ASSIGN);
+#define REGISTER_SCATTER_ND_UPDATE(type, dev)                         \
+  REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdUpdate",     \
+                                    scatter_nd_op::UpdateOp::ASSIGN); \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                         \
+      type, dev, "ResourceScatterNdUpdate", scatter_nd_op::UpdateOp::ASSIGN);
 
 // Registers CPU kernels.
 #define REGISTER_SCATTER_ND_ADD_SUB_CPU(type) \
@@ -281,8 +327,7 @@ Status ValidateUpdateShape(const TensorShape& params_shape,
 }
 
 template <typename Index>
-Status PrepareAndValidateInputs(OpKernelContext* c,
-                                const TensorShape& params_shape,
+Status PrepareAndValidateInputs(const TensorShape& params_shape,
                                 const Tensor& indices, const Tensor& updates,
                                 int64* slice_dim, Index* num_updates,
                                 Index* slice_size) {
@@ -396,7 +441,7 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
   Index num_updates;
   Index slice_size;
   TF_RETURN_IF_ERROR(PrepareAndValidateInputs<Index>(
-      c, shape, indices, updates, &slice_dim, &num_updates, &slice_size));
+      shape, indices, updates, &slice_dim, &num_updates, &slice_size));
 
   IndexFlattener<Device, Index> index_flattener;
   auto indices_flat = index_flattener(c, indices);
@@ -442,6 +487,8 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
       PARAMS_CASE(3);
       PARAMS_CASE(4);
       PARAMS_CASE(5);
+      PARAMS_CASE(6);
+      PARAMS_CASE(7);
 #undef PARAMS_CASE
       default:
         return errors::InvalidArgument(
@@ -480,7 +527,9 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
-  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5);
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
   DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d98412e2551b5eacb9190838b922cadd26d7aaf2
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
@@ -0,0 +1,18 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 6
+#include "tensorflow/core/kernels/scatter_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a008b55603c060953015a463cf49f5768bde637a
--- /dev/null
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
@@ -0,0 +1,19 @@
+
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define CPU_PROVIDED_IXDIM 7
+#include "tensorflow/core/kernels/scatter_nd_op_cpu_impl.h"
+#undef CPU_PROVIDED_IXDIM
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index 0eb3cf32dd33705cffe4c37dbe91eb0ffc31563a..31f74671cabdabce2884fcae61a6e56dbfdefe8b 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -136,7 +136,9 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 2); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 3); \
   DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 4); \
-  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5);
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 5); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 6); \
+  DECLARE_GPU_SPECS_INDEX_OP_IXDIM(T, Index, op, 7);
 
 #define DECLARE_GPU_SPECS_INDEX(T, Index)                                \
   DECLARE_GPU_SPECS_INDEX_OP(T, Index, scatter_nd_op::UpdateOp::ASSIGN); \
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 4302a68a185e66ee3a8f10e92e58d93df4979800..3ef1cd1e062b5f5abecca2f4f788e3fed20e33e9 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -376,6 +376,9 @@ struct UnsortedSegmentSumFunctor<CPUDevice, T, Index>
     auto data_flat = typename TTypes<T, 2>::ConstTensor(data, N, data_size / N);
     for (int64 i = 0; i < N; ++i) {
       Index j = internal::SubtleMustCopy(segment_ids(i));
+      if (j < 0) {
+        continue;
+      }
       OP_REQUIRES(ctx, FastBoundsCheck(j, output_rows),
                   errors::InvalidArgument(
                       "segment_ids", SliceDebugString(segment_ids_shape, i),
@@ -550,10 +553,11 @@ class SparseSegmentReductionOpBase : public OpKernel {
  public:
   explicit SparseSegmentReductionOpBase(OpKernelConstruction* context,
                                         bool is_mean, bool is_sqrtn,
-                                        T default_value)
+                                        bool has_num_segments, T default_value)
       : OpKernel(context),
         is_mean_(is_mean),
         is_sqrtn_(is_sqrtn),
+        has_num_segments_(has_num_segments),
         default_value_(default_value) {}
 
   void Compute(OpKernelContext* context) override {
@@ -561,6 +565,19 @@ class SparseSegmentReductionOpBase : public OpKernel {
     const Tensor& indices = context->input(1);
     const Tensor& segment_ids = context->input(2);
 
+    Index output_rows = -1;
+    if (has_num_segments_) {
+      const Tensor& num_segments = context->input(3);
+
+      OP_REQUIRES(
+          context, num_segments.shape().dims() == 0,
+          errors::InvalidArgument("num_segments should be a scalar, not shape ",
+                                  num_segments.shape().DebugString()));
+      output_rows = internal::SubtleMustCopy(num_segments.scalar<int32>()());
+      OP_REQUIRES(context, output_rows >= 0,
+                  errors::InvalidArgument("segment ids must be >= 0"));
+    }
+
     OP_REQUIRES(context, TensorShapeUtils::IsVector(indices.shape()),
                 errors::InvalidArgument("indices should be a vector."));
     OP_REQUIRES(context, TensorShapeUtils::IsVector(segment_ids.shape()),
@@ -578,10 +595,17 @@ class SparseSegmentReductionOpBase : public OpKernel {
     const auto segment_vec = segment_ids.vec<OutputRow>();
     // Note that the current implementation assumes that segment_vec values are
     // sorted.
-    const OutputRow output_rows =
+    const OutputRow last_segment_id_plus_one =
         num_indices > 0
             ? internal::SubtleMustCopy(segment_vec(num_indices - 1)) + 1
             : 0;
+    if (has_num_segments_) {
+      OP_REQUIRES(
+          context, output_rows >= last_segment_id_plus_one,
+          errors::InvalidArgument("segment ids must be < num_segments"));
+    } else {
+      output_rows = last_segment_id_plus_one;
+    }
     OP_REQUIRES(context, output_rows >= 0,
                 errors::InvalidArgument("segment ids must be >= 0"));
 
@@ -643,11 +667,20 @@ class SparseSegmentReductionOpBase : public OpKernel {
                       indices_vec(start + bad_offset), " out of range [0, ",
                       input_flat.dimension(0), ")"));
 
-      if (end >= num_indices) break;
       start = end;
       ++end;
       uninitialized_index = out_index + 1;
       out_index = next_index;
+      if (end > num_indices) break;
+    }
+
+    // Fill the gap at the end with the default value.
+    if (uninitialized_index < output_rows) {
+      Eigen::DSizes<Eigen::DenseIndex, 2> gap_slice_shape(
+          output_rows - uninitialized_index, num_col);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Unaligned>
+          gap_slice(&output_flat(uninitialized_index, 0), gap_slice_shape);
+      gap_slice.setConstant(default_value_);
     }
   }
 
@@ -783,6 +816,7 @@ class SparseSegmentReductionOpBase : public OpKernel {
 
   const bool is_mean_;
   const bool is_sqrtn_;
+  const bool has_num_segments_;
   const T default_value_;
 };
 
@@ -791,9 +825,20 @@ class SparseSegmentReductionMeanOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionMeanOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, true /*is_mean*/,
-                                                false /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, true /*is_mean*/, false /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionMeanWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionMeanWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, true /*is_mean*/, false /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
 };
 
 template <typename Device, class T>
@@ -801,9 +846,20 @@ class SparseSegmentReductionSqrtNOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionSqrtNOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/,
-                                                true /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, true /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
+};
+
+template <typename Device, class T>
+class SparseSegmentReductionSqrtNWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionSqrtNWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, true /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
 };
 
 template <typename Device, class T>
@@ -811,37 +867,65 @@ class SparseSegmentReductionSumOp
     : public SparseSegmentReductionOpBase<Device, T> {
  public:
   explicit SparseSegmentReductionSumOp(OpKernelConstruction* context)
-      : SparseSegmentReductionOpBase<Device, T>(context, false /*is_mean*/,
-                                                false /*is_sqrtn*/,
-                                                T(0) /* default_value */) {}
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, false /*is_sqrtn*/,
+            false /* has_num_segments */, T(0) /* default_value */) {}
 };
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSum")            \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionSumOp<CPUDevice, type>);
+template <typename Device, class T>
+class SparseSegmentReductionSumWithNumSegmentsOp
+    : public SparseSegmentReductionOpBase<Device, T> {
+ public:
+  explicit SparseSegmentReductionSumWithNumSegmentsOp(
+      OpKernelConstruction* context)
+      : SparseSegmentReductionOpBase<Device, T>(
+            context, false /*is_mean*/, false /*is_sqrtn*/,
+            true /* has_num_segments */, T(0) /* default_value */) {}
+};
 
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSum")                       \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<type>("T")                 \
+                              .TypeConstraint<int32>("Tidx"),            \
+                          SparseSegmentReductionSumOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("SparseSegmentSumWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<type>("T")                                     \
+          .TypeConstraint<int32>("Tidx"),                                \
+      SparseSegmentReductionSumWithNumSegmentsOp<CPUDevice, type>);
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS);
 #undef REGISTER_CPU_SPARSE_KERNELS
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentMean")           \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionMeanOp<CPUDevice, type>);
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                 \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentMean")                       \
+                              .Device(DEVICE_CPU)                         \
+                              .TypeConstraint<type>("T")                  \
+                              .TypeConstraint<int32>("Tidx"),             \
+                          SparseSegmentReductionMeanOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("SparseSegmentMeanWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<type>("T")                                      \
+          .TypeConstraint<int32>("Tidx"),                                 \
+      SparseSegmentReductionMeanWithNumSegmentsOp<CPUDevice, type>);
 REGISTER_CPU_SPARSE_KERNELS(float);
 REGISTER_CPU_SPARSE_KERNELS(double);
 #undef REGISTER_CPU_SPARSE_KERNELS
 
-#define REGISTER_CPU_SPARSE_KERNELS(type)                     \
-  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSqrtN")          \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("T")      \
-                              .TypeConstraint<int32>("Tidx"), \
-                          SparseSegmentReductionSqrtNOp<CPUDevice, type>);
+#define REGISTER_CPU_SPARSE_KERNELS(type)                                  \
+  REGISTER_KERNEL_BUILDER(Name("SparseSegmentSqrtN")                       \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<type>("T")                   \
+                              .TypeConstraint<int32>("Tidx"),              \
+                          SparseSegmentReductionSqrtNOp<CPUDevice, type>); \
+  REGISTER_KERNEL_BUILDER(                                                 \
+      Name("SparseSegmentSqrtNWithNumSegments")                            \
+          .Device(DEVICE_CPU)                                              \
+          .TypeConstraint<type>("T")                                       \
+          .TypeConstraint<int32>("Tidx"),                                  \
+      SparseSegmentReductionSqrtNWithNumSegmentsOp<CPUDevice, type>);
 REGISTER_CPU_SPARSE_KERNELS(float);
 REGISTER_CPU_SPARSE_KERNELS(double);
 #undef REGISTER_CPU_SPARSE_KERNELS
@@ -886,9 +970,10 @@ class SparseSegmentGradOpBase : public OpKernel {
 
     // Note that similar to SparseSegmentMean, we assume that segment_vec is
     // already sorted and has non-negative values.
-    const SegmentId num_segments =
+    const SegmentId num_segments = input.dim_size(0);
+    const SegmentId last_segment_id_plus_one =
         internal::SubtleMustCopy(segment_vec(N - 1)) + 1;
-    OP_REQUIRES(context, input.dim_size(0) == num_segments,
+    OP_REQUIRES(context, last_segment_id_plus_one <= num_segments,
                 errors::InvalidArgument("Invalid number of segments"));
 
     // Compute scaling factors for input.
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index 412c1d601d3116b7de5ee09afe1e4f1d0253b349..b10bea72ba89e7089e0668389995c629644b534d 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -30,14 +30,14 @@ namespace functor {
 #ifdef GOOGLE_CUDA
 typedef Eigen::GpuDevice GPUDevice;
 // Functor for SegmentSumGPUOp.
-// 'output_rows': the number of output segments (unique segment ids in
+// output_rows: the number of output segments (unique segment ids in
 //                'segment_ids').
-// 'segment_ids_shape': shape of 'segment_ids' tensor.
-// 'segment_ids': unsorted map from input to output segment ids at which to
+// segment_ids_shape: shape of 'segment_ids' tensor.
+// segment_ids: unsorted map from input to output segment ids at which to
 //                perform segment sum operation.
-// 'data_size': size of input data tensor.
-// 'data': input data tensor.
-// 'output': output reshaped to {output_rows, output.size/output_rows}
+// data_size: size of input data tensor.
+// data: input data tensor.
+// output: output reshaped to {output_rows, output.size/output_rows}
 template <typename T, typename Index>
 struct SegmentSumFunctor {
   void operator()(OpKernelContext* ctx, const GPUDevice& d,
@@ -61,14 +61,14 @@ struct UnsortedSegmentBaseFunctor{
 };
 
 // Functor for UnsortedSegmentSumOp.
-// 'output_rows': the number of output segments (unique segment ids in
+// output_rows: the number of output segments (unique segment ids in
 //                'segment_ids').
-// 'segment_ids_shape': shape of 'segment_ids' tensor.
-// 'segment_ids': unsorted map from input to output segment ids at which to
+// segment_ids_shape: shape of 'segment_ids' tensor.
+// segment_ids: unsorted map from input to output segment ids at which to
 //                perform segment sum operation.
-// 'data_size': size of input data tensor.
-// 'data': input data tensor.
-// 'output': output reshaped to {output_rows, output.size/output_rows}
+// data_size: size of input data tensor.
+// data: input data tensor.
+// output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
 struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
@@ -79,14 +79,14 @@ struct UnsortedSegmentSumFunctor: public UnsortedSegmentBaseFunctor<Device, T, I
 };
 
 // Functor for UnsortedSegmentMaxOp.
-// 'output_rows': the number of output segments (unique segment ids in
+// output_rows: the number of output segments (unique segment ids in
 //                'segment_ids').
-// 'segment_ids_shape': shape of 'segment_ids' tensor.
-// 'segment_ids': unsorted map from input to output segment ids at which to
+// segment_ids_shape: shape of 'segment_ids' tensor.
+// segment_ids: unsorted map from input to output segment ids at which to
 //                perform segment sum operation.
-// 'data_size': size of input data tensor.
-// 'data': input data tensor.
-// 'output': output reshaped to {output_rows, output.size/output_rows}
+// data_size: size of input data tensor.
+// data: input data tensor.
+// output: output reshaped to {output_rows, output.size/output_rows}
 template <typename Device, typename T, typename Index>
 struct UnsortedSegmentMaxFunctor: public UnsortedSegmentBaseFunctor<Device, T, Index> {
   void operator()(OpKernelContext* ctx, const Device& d,
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc
index 9c242052f7ccb0b44720b09dd00ef7db0a982a4b..206fd40fa68c3158fa60b7651d40121ab1344bbd 100644
--- a/tensorflow/core/kernels/sendrecv_ops.cc
+++ b/tensorflow/core/kernels/sendrecv_ops.cc
@@ -91,9 +91,9 @@ void SendOp::Compute(OpKernelContext* ctx) {
   if (frame_iter == FrameAndIter(0, 0)) {
     // Use the cached rendezvous key.
     VLOG(2) << "Send " << parsed_key_.buf_;
-    OP_REQUIRES_OK(ctx,
-                   ctx->rendezvous()->Send(parsed_key_, args, ctx->input(0),
+    ctx->SetStatus(ctx->rendezvous()->Send(parsed_key_, args, ctx->input(0),
                                            ctx->is_input_dead()));
+    return;
   } else {
     Rendezvous::ParsedKey in_loop_parsed;
     GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_);
@@ -101,9 +101,9 @@ void SendOp::Compute(OpKernelContext* ctx) {
     OP_REQUIRES_OK(ctx,
                    Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed));
 
-    OP_REQUIRES_OK(ctx,
-                   ctx->rendezvous()->Send(in_loop_parsed, args, ctx->input(0),
+    ctx->SetStatus(ctx->rendezvous()->Send(in_loop_parsed, args, ctx->input(0),
                                            ctx->is_input_dead()));
+    return;
   }
 }
 
@@ -142,17 +142,12 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
   }
 }
 
-void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
-  OP_REQUIRES(
-      ctx, ctx->rendezvous() != nullptr,
-      errors::Internal("Op kernel context needs to provide a rendezvous."));
-
-  Rendezvous::Args args;
-  args.device_context = ctx->op_device_context();
-  args.alloc_attrs = ctx->output_alloc_attr(0);
+namespace {
+Rendezvous::DoneCallback make_recv_callback(OpKernelContext* ctx,
+                                            AsyncOpKernel::DoneCallback done) {
   using namespace std::placeholders;
-  Rendezvous::DoneCallback done_cb = std::bind(
-      [ctx](DoneCallback done,
+  return std::bind(
+      [ctx](AsyncOpKernel::DoneCallback done,
             // Begin unbound arguments.
             const Status& s, const Rendezvous::Args& send_args,
             const Rendezvous::Args& recv_args, const Tensor& val,
@@ -170,19 +165,31 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         done();
       },
       std::move(done), _1, _2, _3, _4, _5);
+}
+}  // namespace
+
+void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
+  OP_REQUIRES(
+      ctx, ctx->rendezvous() != nullptr,
+      errors::Internal("Op kernel context needs to provide a rendezvous."));
+
+  Rendezvous::Args args;
+  args.device_context = ctx->op_device_context();
+  args.alloc_attrs = ctx->output_alloc_attr(0);
 
   FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_);
   if (frame_iter == FrameAndIter(0, 0)) {
     VLOG(2) << "Recv " << parsed_key_.buf_;
-    ctx->rendezvous()->RecvAsync(parsed_key_, args, std::move(done_cb));
+    ctx->rendezvous()->RecvAsync(parsed_key_, args,
+                                 make_recv_callback(ctx, std::move(done)));
   } else {
     Rendezvous::ParsedKey in_loop_parsed;
     GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_);
     VLOG(2) << "Recv " << in_loop_parsed.buf_;
     OP_REQUIRES_OK_ASYNC(
         ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done);
-
-    ctx->rendezvous()->RecvAsync(in_loop_parsed, args, std::move(done_cb));
+    ctx->rendezvous()->RecvAsync(in_loop_parsed, args,
+                                 make_recv_callback(ctx, std::move(done)));
   }
 }
 
diff --git a/tensorflow/core/kernels/serialize_sparse_op.cc b/tensorflow/core/kernels/serialize_sparse_op.cc
index 2c7ad5bab08c403351f8a832c5ffe5bdbf4e860e..61e40caef99c019914fc331bee5d8beab0883f41 100644
--- a/tensorflow/core/kernels/serialize_sparse_op.cc
+++ b/tensorflow/core/kernels/serialize_sparse_op.cc
@@ -27,22 +27,31 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_util.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
 
 namespace tensorflow {
 
 using sparse::SparseTensor;
 
+template <typename T>
 class SerializeSparseOp : public OpKernel {
  public:
   explicit SerializeSparseOp(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  Status Initialize(Tensor* result);
+  Status Serialize(const Tensor& input, T* result);
+
   void Compute(OpKernelContext* context) override {
     const Tensor* input_indices;
     const Tensor* input_values;
     const Tensor* input_shape;
+
     OP_REQUIRES_OK(context, context->input("sparse_indices", &input_indices));
     OP_REQUIRES_OK(context, context->input("sparse_values", &input_values));
     OP_REQUIRES_OK(context, context->input("sparse_shape", &input_shape));
@@ -61,34 +70,75 @@ class SerializeSparseOp : public OpKernel {
                     "Input shape should be a vector but received shape ",
                     input_shape->shape().DebugString()));
 
-    TensorProto proto_indices;
-    TensorProto proto_values;
-    TensorProto proto_shape;
-
-    input_indices->AsProtoTensorContent(&proto_indices);
-    input_values->AsProtoTensorContent(&proto_values);
-    input_shape->AsProtoTensorContent(&proto_shape);
-
-    Tensor serialized_sparse(DT_STRING, TensorShape({3}));
-    auto serialized_sparse_t = serialized_sparse.vec<string>();
+    Tensor serialized_sparse;
+    OP_REQUIRES_OK(context, Initialize(&serialized_sparse));
 
-    serialized_sparse_t(0) = proto_indices.SerializeAsString();
-    serialized_sparse_t(1) = proto_values.SerializeAsString();
-    serialized_sparse_t(2) = proto_shape.SerializeAsString();
+    auto serialized_sparse_t = serialized_sparse.vec<T>();
+    OP_REQUIRES_OK(context, Serialize(*input_indices, &serialized_sparse_t(0)));
+    OP_REQUIRES_OK(context, Serialize(*input_values, &serialized_sparse_t(1)));
+    OP_REQUIRES_OK(context, Serialize(*input_shape, &serialized_sparse_t(2)));
 
     context->set_output(0, serialized_sparse);
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("SerializeSparse").Device(DEVICE_CPU),
-                        SerializeSparseOp);
+template <>
+Status SerializeSparseOp<string>::Initialize(Tensor* result) {
+  *result = Tensor(DT_STRING, TensorShape({3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeSparseOp<string>::Serialize(const Tensor& input,
+                                            string* result) {
+  TensorProto proto;
+  input.AsProtoTensorContent(&proto);
+  *result = proto.SerializeAsString();
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("SerializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("out_type"),
+                        SerializeSparseOp<string>);
+
+template <>
+Status SerializeSparseOp<Variant>::Initialize(Tensor* result) {
+  *result = Tensor(DT_VARIANT, TensorShape({3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeSparseOp<Variant>::Serialize(const Tensor& input,
+                                             Variant* result) {
+  *result = input;
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("SerializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Variant>("out_type"),
+                        SerializeSparseOp<Variant>);
 
 template <typename T>
-class SerializeManySparseOp : public OpKernel {
+class SerializeManySparseOpBase : public OpKernel {
  public:
-  explicit SerializeManySparseOp(OpKernelConstruction* context)
+  explicit SerializeManySparseOpBase(OpKernelConstruction* context)
       : OpKernel(context) {}
 
+  void Compute(OpKernelContext* context) override {}
+
+ protected:
+  Status Initialize(const int64 n, Tensor* result);
+  Status Serialize(const Tensor& input, T* result);
+};
+
+template <typename T, typename U>
+class SerializeManySparseOp : public SerializeManySparseOpBase<U> {
+ public:
+  explicit SerializeManySparseOp(OpKernelConstruction* context)
+      : SerializeManySparseOpBase<U>(context) {}
+
   void Compute(OpKernelContext* context) override {
     const Tensor* input_indices;
     const Tensor* input_values;
@@ -126,37 +176,31 @@ class SerializeManySparseOp : public OpKernel {
 
     auto input_shape_t = input_shape->vec<int64>();
     const int64 N = input_shape_t(0);
-
-    Tensor serialized_sparse(DT_STRING, TensorShape({N, 3}));
-    auto serialized_sparse_t = serialized_sparse.matrix<string>();
+    Tensor serialized_sparse;
+    OP_REQUIRES_OK(context, this->Initialize(N, &serialized_sparse));
+    auto serialized_sparse_t = serialized_sparse.matrix<U>();
 
     OP_REQUIRES_OK(context, input_st.IndicesValid());
 
-    // We can generate the output shape proto string now, for all
-    // minibatch entries.
-    Tensor output_shape(DT_INT64, {rank - 1});
-    auto output_shape_t = output_shape.vec<int64>();
-    for (int d = 1; d < rank; d++) output_shape_t(d - 1) = input_shape_t(d);
-    TensorProto proto_shape;
-    output_shape.AsProtoTensorContent(&proto_shape);
-    const string proto_shape_string = proto_shape.SerializeAsString();
-
+    // Initialize output with empty values and the proper shapes.
     Tensor output_blank_indices(DT_INT64, {0, rank - 1});
-    Tensor output_blank_values(DataTypeToEnum<T>::value, {0});
-    TensorProto proto_blank_indices;
-    TensorProto proto_blank_values;
-    output_blank_indices.AsProtoTensorContent(&proto_blank_indices);
-    output_blank_values.AsProtoTensorContent(&proto_blank_values);
+    U serialized_indices;
+    OP_REQUIRES_OK(context,
+                   this->Serialize(output_blank_indices, &serialized_indices));
+    serialized_sparse_t.template chip<1>(0).setConstant(serialized_indices);
 
-    const string proto_blank_indices_string =
-        proto_blank_indices.SerializeAsString();
-    const string proto_blank_values_string =
-        proto_blank_values.SerializeAsString();
+    Tensor output_blank_values(DataTypeToEnum<T>::value, {0});
+    U serialized_values;
+    OP_REQUIRES_OK(context,
+                   this->Serialize(output_blank_values, &serialized_values));
+    serialized_sparse_t.template chip<1>(1).setConstant(serialized_values);
 
-    // Initialize output with empty values and the proper shapes.
-    serialized_sparse_t.chip<1>(0).setConstant(proto_blank_indices_string);
-    serialized_sparse_t.chip<1>(1).setConstant(proto_blank_values_string);
-    serialized_sparse_t.chip<1>(2).setConstant(proto_shape_string);
+    Tensor output_shape(DT_INT64, {rank - 1});
+    auto output_shape_t = output_shape.vec<int64>();
+    for (int d = 1; d < rank; d++) output_shape_t(d - 1) = input_shape_t(d);
+    U serialized_shape;
+    OP_REQUIRES_OK(context, this->Serialize(output_shape, &serialized_shape));
+    serialized_sparse_t.template chip<1>(2).setConstant(serialized_shape);
 
     // Get groups by minibatch dimension
     sparse::GroupIterable minibatch = input_st.group({0});
@@ -185,208 +229,328 @@ class SerializeManySparseOp : public OpKernel {
         output_values_t(i) = values(i);
       }
 
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      output_indices.AsProtoTensorContent(&proto_indices);
-      output_values.AsProtoTensorContent(&proto_values);
-
-      serialized_sparse_t(b, 0) = proto_indices.SerializeAsString();
-      serialized_sparse_t(b, 1) = proto_values.SerializeAsString();
+      OP_REQUIRES_OK(
+          context, this->Serialize(output_indices, &serialized_sparse_t(b, 0)));
+      OP_REQUIRES_OK(
+          context, this->Serialize(output_values, &serialized_sparse_t(b, 1)));
     }
 
     context->set_output(0, serialized_sparse);
   }
 };
 
-#define REGISTER_KERNELS(type)                            \
-  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")     \
-                              .Device(DEVICE_CPU)         \
-                              .TypeConstraint<type>("T"), \
-                          SerializeManySparseOp<type>)
+template <>
+Status SerializeManySparseOpBase<string>::Initialize(const int64 n,
+                                                     Tensor* result) {
+  *result = Tensor(DT_STRING, TensorShape({n, 3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeManySparseOpBase<string>::Serialize(const Tensor& input,
+                                                    string* result) {
+  TensorProto proto;
+  input.AsProtoTensorContent(&proto);
+  *result = proto.SerializeAsString();
+  return Status::OK();
+}
+
+#define REGISTER_KERNELS(type)                                     \
+  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")              \
+                              .Device(DEVICE_CPU)                  \
+                              .TypeConstraint<type>("T")           \
+                              .TypeConstraint<string>("out_type"), \
+                          SerializeManySparseOp<type, string>)
+
+TF_CALL_ALL_TYPES(REGISTER_KERNELS);
+#undef REGISTER_KERNELS
+
+template <>
+Status SerializeManySparseOpBase<Variant>::Initialize(const int64 n,
+                                                      Tensor* result) {
+  *result = Tensor(DT_VARIANT, TensorShape({n, 3}));
+  return Status::OK();
+}
+
+template <>
+Status SerializeManySparseOpBase<Variant>::Serialize(const Tensor& input,
+                                                     Variant* result) {
+  *result = input;
+  return Status::OK();
+}
+
+#define REGISTER_KERNELS(type)                                      \
+  REGISTER_KERNEL_BUILDER(Name("SerializeManySparse")               \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .TypeConstraint<Variant>("out_type"), \
+                          SerializeManySparseOp<type, Variant>)
 
 TF_CALL_ALL_TYPES(REGISTER_KERNELS);
 #undef REGISTER_KERNELS
 
 template <typename T>
-class DeserializeManySparseOp : public OpKernel {
+class DeserializeSparseOp : public OpKernel {
  public:
-  explicit DeserializeManySparseOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+  explicit DeserializeSparseOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+  }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& serialized_sparse = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(serialized_sparse.shape()),
+    const int ndims = serialized_sparse.shape().dims();
+
+    OP_REQUIRES(
+        context, ndims > 0,
+        errors::InvalidArgument("Serialized sparse should have non-zero rank ",
+                                serialized_sparse.shape().DebugString()));
+
+    OP_REQUIRES(context, serialized_sparse.shape().dim_size(ndims - 1) == 3,
                 errors::InvalidArgument(
-                    "Serialized sparse should be a matrix but received shape ",
+                    "Serialized sparse should have 3 as the last dimension ",
                     serialized_sparse.shape().DebugString()));
-    OP_REQUIRES(
-        context, serialized_sparse.shape().dim_size(1) == 3,
-        errors::InvalidArgument(
-            "Serialize sparse should have 3 columns but received shape ",
-            serialized_sparse.shape().DebugString()));
 
-    int num_sparse_tensors = serialized_sparse.shape().dim_size(0);
+    int num_sparse_tensors = 1;
+    for (int i = 0; i < ndims - 1; ++i) {
+      num_sparse_tensors *= serialized_sparse.shape().dim_size(i);
+    }
 
     OP_REQUIRES(
         context, num_sparse_tensors > 0,
-        errors::InvalidArgument("Must have at least 1 serialized SparseTensor, "
-                                "but input matrix has 0 rows"));
+        errors::InvalidArgument(
+            "Serialized sparse should have at least 1 serialized tensor, "
+            "but has a zero dimension ",
+            serialized_sparse.shape().DebugString()));
 
-    std::vector<Tensor> indices_to_concat;
-    std::vector<Tensor> values_to_concat;
-    std::vector<TensorShape> shapes_to_concat;
+    if (num_sparse_tensors == 0 && serialized_sparse.shape().dims() == 1) {
+      // Special case with a single sparse tensor. We can avoid data
+      // motion in the Concat and Reshape.
+      const auto& serialized_sparse_t = serialized_sparse.vec<T>();
+
+      Tensor output_indices;
+      Tensor output_values;
+      Tensor output_shape;
+      OP_REQUIRES_OK(context,
+                     this->GetAndValidateSparseTensor(
+                         serialized_sparse_t(0), serialized_sparse_t(1),
+                         serialized_sparse_t(2), dtype_, 0 /* index */,
+                         &output_indices, &output_values, &output_shape));
+      context->set_output(0, output_indices);
+      context->set_output(1, output_values);
+      context->set_output(2, output_shape);
+      return;
+    }
 
-    const auto& serialized_sparse_t = serialized_sparse.matrix<string>();
+    std::vector<Tensor> indices;
+    std::vector<Tensor> values;
+    TensorShape shape;
+    indices.reserve(num_sparse_tensors);
+    values.reserve(num_sparse_tensors);
 
+    const auto& serialized_sparse_t = serialized_sparse.flat_inner_dims<T, 2>();
     for (int i = 0; i < num_sparse_tensors; ++i) {
-      Tensor output_indices(DT_INT64);
-      Tensor output_values(DataTypeToEnum<T>::value);
-      Tensor output_shape(DT_INT64);
-      TensorProto proto_indices;
-      TensorProto proto_values;
-      TensorProto proto_shape;
-
-      OP_REQUIRES(context, ParseProtoUnlimited(&proto_indices,
-                                               serialized_sparse_t(i, 0)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 0]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_values, serialized_sparse_t(i, 1)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 1]"));
-      OP_REQUIRES(context,
-                  ParseProtoUnlimited(&proto_shape, serialized_sparse_t(i, 2)),
-                  errors::InvalidArgument("Could not parse serialized_sparse[",
-                                          i, ", 2]"));
-
-      OP_REQUIRES(context, output_indices.FromProto(proto_indices),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 0] (indices)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsMatrix(output_indices.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 1] to represent an index matrix but received shape ",
-                      output_indices.shape().DebugString()));
-      OP_REQUIRES(context, output_values.FromProto(proto_values),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 1] (values)"));
-      OP_REQUIRES(context, TensorShapeUtils::IsVector(output_values.shape()),
-                  errors::InvalidArgument(
-                      "Expected serialized_sparse[", i,
-                      ", 1] to represent a values vector but received shape ",
-                      output_values.shape().DebugString()));
-      OP_REQUIRES(context, output_shape.FromProto(proto_shape),
-                  errors::InvalidArgument(
-                      "Could not construct Tensor serialized_sparse[", i,
-                      ", 2] (shape)"));
-      OP_REQUIRES(
-          context, TensorShapeUtils::IsVector(output_shape.shape()),
-          errors::InvalidArgument("Expected serialized_sparse[", i,
-                                  ", 1] to be a shape vector but its shape is ",
-                                  output_shape.shape().DebugString()));
-
-      OP_REQUIRES(
-          context, DataTypeToEnum<T>::value == output_values.dtype(),
-          errors::InvalidArgument(
-              "Requested SparseTensor of type ",
-              DataTypeString(DataTypeToEnum<T>::value), " but SparseTensor[", i,
-              "].values.dtype() == ", DataTypeString(output_values.dtype())));
-
+      Tensor output_indices;
+      Tensor output_values;
+      Tensor output_shape;
+      OP_REQUIRES_OK(context,
+                     this->GetAndValidateSparseTensor(
+                         serialized_sparse_t(i, 0), serialized_sparse_t(i, 1),
+                         serialized_sparse_t(i, 2), dtype_, i, &output_indices,
+                         &output_values, &output_shape));
       int64 num_entries = output_indices.dim_size(0);
-      OP_REQUIRES(context, num_entries == output_values.dim_size(0),
-                  errors::InvalidArgument(
-                      "Expected row counts of SparseTensor[", i,
-                      "].indices and SparseTensor[", i,
-                      "].values to match but they do not: ", num_entries,
-                      " vs. ", output_values.dim_size(0)));
       int rank = output_indices.dim_size(1);
-      OP_REQUIRES(
-          context, rank == output_shape.dim_size(0),
-          errors::InvalidArgument("Expected column counts of SparseTensor[", i,
-                                  "].indices to match size of SparseTensor[", i,
-                                  "].shape "
-                                  "but they do not: ",
-                                  rank, " vs. ", output_shape.dim_size(0)));
 
       // Now we expand each SparseTensors' indices and shape by
       // prefixing a dimension
-      Tensor expanded_indices(
-          DT_INT64, TensorShape({num_entries, 1 + output_indices.dim_size(1)}));
-      Tensor expanded_shape(DT_INT64,
-                            TensorShape({1 + output_shape.dim_size(0)}));
+      Tensor expanded_indices(DT_INT64, TensorShape({num_entries, 1 + rank}));
       const auto& output_indices_t = output_indices.matrix<int64>();
-      const auto& output_shape_t = output_shape.vec<int64>();
       auto expanded_indices_t = expanded_indices.matrix<int64>();
-      auto expanded_shape_t = expanded_shape.vec<int64>();
       expanded_indices_t.chip<1>(0).setZero();
       Eigen::DSizes<Eigen::DenseIndex, 2> indices_start(0, 1);
       Eigen::DSizes<Eigen::DenseIndex, 2> indices_sizes(num_entries, rank);
       expanded_indices_t.slice(indices_start, indices_sizes) = output_indices_t;
+
+      Tensor expanded_shape(DT_INT64, TensorShape({1 + rank}));
+      const auto& output_shape_t = output_shape.vec<int64>();
+      auto expanded_shape_t = expanded_shape.vec<int64>();
       expanded_shape_t(0) = 1;
       std::copy_n(&output_shape_t(0), rank, &expanded_shape_t(1));
 
       TensorShape expanded_tensor_shape(expanded_shape.vec<int64>());
 
-      indices_to_concat.push_back(expanded_indices);
-      values_to_concat.push_back(output_values);
-      shapes_to_concat.push_back(expanded_tensor_shape);
-    }
-
-    int rank = -1;
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      if (rank < 0) rank = shapes_to_concat[i].dims();
-      OP_REQUIRES(context, rank == shapes_to_concat[i].dims(),
-                  errors::InvalidArgument(
-                      "Inconsistent rank across SparseTensors: rank prior to "
-                      "SparseTensor[",
-                      i, "] was: ", rank, " but rank of SparseTensor[", i,
-                      "] is: ", shapes_to_concat[i].dims()));
-    }
-
-    // SparseTensor::Concat requires consistent shape for all but the
-    // primary order dimension (dimension 0 in this case).  So we get
-    // the maximum value across all the input SparseTensors for each
-    // dimension and use that.
-    TensorShape preconcat_shape(shapes_to_concat[0]);
-    for (int i = 0; i < num_sparse_tensors; ++i) {
-      for (int d = 0; d < rank; ++d) {
-        preconcat_shape.set_dim(d, std::max(preconcat_shape.dim_size(d),
-                                            shapes_to_concat[i].dim_size(d)));
+      indices.push_back(expanded_indices);
+      values.push_back(output_values);
+      if (i == 0) {
+        shape = expanded_tensor_shape;
+      } else {
+        OP_REQUIRES(
+            context, shape.dims() == expanded_tensor_shape.dims(),
+            errors::InvalidArgument(
+                "Inconsistent shape across SparseTensors: rank prior to "
+                "SparseTensor[",
+                i, "] was: ", shape.dims() - 1, " but rank of SparseTensor[", i,
+                "] is: ", expanded_tensor_shape.dims() - 1));
+        for (int j = 1; j < shape.dims(); ++j) {
+          // NOTE(mrry): For compatibility with the implementations of
+          // DeserializeManySparse, and many ops that generate
+          // SparseTensors to batch that do not have a fixed
+          // dense_shape (e.g. `tf.parse_single_example()`), we
+          // compute the maximum in each dimension to find the
+          // smallest dense_shape that bounds all of the input
+          // SparseTensors.
+          shape.set_dim(j, std::max(shape.dim_size(j),
+                                    expanded_tensor_shape.dim_size(j)));
+        }
       }
     }
 
     // Dimension 0 is the primary dimension.
+    int rank = shape.dims();
     gtl::InlinedVector<int64, 8> std_order(rank);
     std::iota(std_order.begin(), std_order.end(), 0);
 
-    std::vector<SparseTensor> tensors_to_concat;
-    tensors_to_concat.reserve(num_sparse_tensors);
+    std::vector<SparseTensor> tensors;
+    tensors.reserve(num_sparse_tensors);
     for (int i = 0; i < num_sparse_tensors; ++i) {
-      tensors_to_concat.emplace_back(indices_to_concat[i], values_to_concat[i],
-                                     preconcat_shape, std_order);
+      tensors.emplace_back(indices[i], values[i], shape, std_order);
     }
 
-    SparseTensor output = SparseTensor::Concat<T>(tensors_to_concat);
+    gtl::optional<SparseTensor> maybe_output;
+#define HANDLE_TYPE(T)                               \
+  case DataTypeToEnum<T>::value: {                   \
+    maybe_output = SparseTensor::Concat<T>(tensors); \
+    break;                                           \
+  }
 
-    Tensor final_output_shape(DT_INT64, TensorShape({output.dims()}));
+    switch (dtype_) {
+      TF_CALL_ALL_TYPES(HANDLE_TYPE);
+      TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE);
+      TF_CALL_variant(HANDLE_TYPE);
+#undef HANDLE_TYPE
+      default:
+        OP_REQUIRES(context, false,
+                    errors::Unimplemented(
+                        "DeserializeSparse Unhandled data type: ", dtype_));
+    }
+    DCHECK(maybe_output);
+    SparseTensor& output = maybe_output.value();
 
+    // Compute the input shape for the reshape operation.
+    Tensor input_shape(DT_INT64, TensorShape({output.dims()}));
     std::copy_n(output.shape().data(), output.dims(),
-                final_output_shape.vec<int64>().data());
+                input_shape.vec<int64>().data());
+
+    // Compute the target shape for the reshape operation.
+    Tensor target_shape(DT_INT64, TensorShape({ndims + output.dims() - 2}));
+    for (int i = 0; i < ndims - 1; ++i) {
+      target_shape.vec<int64>()(i) = serialized_sparse.shape().dim_size(i);
+    }
+    for (int i = 0; i < output.dims() - 1; ++i) {
+      target_shape.vec<int64>()(i + ndims - 1) = output.shape().data()[i + 1];
+    }
 
-    context->set_output(0, output.indices());
+    Tensor output_indices;
+    Tensor output_shape;
+    Reshape(context, output.indices(), input_shape, target_shape,
+            0 /* output indices index */, 2 /* output shape index */);
     context->set_output(1, output.values());
-    context->set_output(2, final_output_shape);
   }
+
+ protected:
+  Status Deserialize(const T& serialized, Tensor* result);
+
+  Status GetAndValidateSparseTensor(
+      const T& serialized_indices, const T& serialized_values,
+      const T& serialized_shape, DataType values_dtype, int index,
+      Tensor* output_indices, Tensor* output_values, Tensor* output_shape) {
+    // Deserialize and validate the indices.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_indices, output_indices));
+    if (!TensorShapeUtils::IsMatrix(output_indices->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 0] to represent an index matrix but received shape ",
+          output_indices->shape().DebugString());
+    }
+    int64 num_entries = output_indices->dim_size(0);
+    int rank = output_indices->dim_size(1);
+
+    // Deserialize and validate the values.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_values, output_values));
+    if (!TensorShapeUtils::IsVector(output_values->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 1] to represent a values vector but received shape ",
+          output_values->shape().DebugString());
+    }
+    if (values_dtype != output_values->dtype()) {
+      return errors::InvalidArgument(
+          "Requested SparseTensor of type ", DataTypeString(values_dtype),
+          " but SparseTensor[", index,
+          "].values.dtype() == ", DataTypeString(output_values->dtype()));
+    }
+    if (num_entries != output_values->dim_size(0)) {
+      return errors::InvalidArgument(
+          "Expected row counts of SparseTensor[", index,
+          "].indices and SparseTensor[", index,
+          "].values to match but they do not: ", num_entries, " vs. ",
+          output_values->dim_size(0));
+    }
+
+    // Deserialize and validate the shape.
+    TF_RETURN_IF_ERROR(this->Deserialize(serialized_shape, output_shape));
+    if (!TensorShapeUtils::IsVector(output_shape->shape())) {
+      return errors::InvalidArgument(
+          "Expected serialized_sparse[", index,
+          ", 1] to be a shape vector but its shape is ",
+          output_shape->shape().DebugString());
+    }
+    if (rank != output_shape->dim_size(0)) {
+      return errors::InvalidArgument("Expected column counts of SparseTensor[",
+                                     index,
+                                     "].indices to match size of SparseTensor[",
+                                     index, "].shape but they do not: ", rank,
+                                     " vs. ", output_shape->dim_size(0));
+    }
+    return Status::OK();
+  }
+
+  DataType dtype_;
 };
 
-#define REGISTER_KERNELS(type)                                \
-  REGISTER_KERNEL_BUILDER(Name("DeserializeManySparse")       \
-                              .Device(DEVICE_CPU)             \
-                              .TypeConstraint<type>("dtype"), \
-                          DeserializeManySparseOp<type>)
+template <>
+Status DeserializeSparseOp<string>::Deserialize(const string& serialized,
+                                                Tensor* result) {
+  TensorProto proto;
+  if (!ParseProtoUnlimited(&proto, serialized)) {
+    return errors::InvalidArgument("Could not parse serialized proto");
+  }
+  Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    return errors::InvalidArgument("Could not construct tensor from proto");
+  }
+  *result = tensor;
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<string>("Tserialized"),
+                        DeserializeSparseOp<string>)
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeManySparse").Device(DEVICE_CPU),
+                        DeserializeSparseOp<string>)
+
+template <>
+Status DeserializeSparseOp<Variant>::Deserialize(const Variant& serialized,
+                                                 Tensor* result) {
+  *result = *serialized.get<Tensor>();
+  return Status::OK();
+}
+
+REGISTER_KERNEL_BUILDER(Name("DeserializeSparse")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<Variant>("Tserialized"),
+                        DeserializeSparseOp<Variant>)
 
-TF_CALL_ALL_TYPES(REGISTER_KERNELS);
-#undef REGISTER_KERNELS
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index 721f9b949b13d8b48d65e28a4a4f5653b74b1344..28a39bae3ffb8bebcc9dce97d85e1126ca954882 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -341,7 +341,12 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .Device(DEVICE_CPU)
                             .HostMemory("dim")
                             .TypeConstraint<int32>("Tdim"),
-                        ExpandDimsOp);
+                        ExpandDimsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+                            .Device(DEVICE_CPU)
+                            .HostMemory("dim")
+                            .TypeConstraint<int64>("Tdim"),
+                        ExpandDimsOp<int64>);
 
 #if GOOGLE_CUDA
 #define REGISTER_GPU_KERNEL(type)                            \
@@ -350,7 +355,13 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                               .TypeConstraint<type>("T")     \
                               .TypeConstraint<int32>("Tdim") \
                               .HostMemory("dim"),            \
-                          ExpandDimsOp);
+                          ExpandDimsOp<int32>);              \
+  REGISTER_KERNEL_BUILDER(Name("ExpandDims")                 \
+                              .Device(DEVICE_GPU)            \
+                              .TypeConstraint<type>("T")     \
+                              .TypeConstraint<int64>("Tdim") \
+                              .HostMemory("dim"),            \
+                          ExpandDimsOp<int64>);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_bool(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
@@ -362,7 +373,15 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("input")
                             .HostMemory("dim")
                             .HostMemory("output"),
-                        ExpandDimsOp);
+                        ExpandDimsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("Tdim")
+                            .HostMemory("input")
+                            .HostMemory("dim")
+                            .HostMemory("output"),
+                        ExpandDimsOp<int64>);
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -372,7 +391,13 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                               .TypeConstraint<type>("T")     \
                               .TypeConstraint<int32>("Tdim") \
                               .HostMemory("dim"),            \
-                          ExpandDimsOp);
+                          ExpandDimsOp<int32>);              \
+  REGISTER_KERNEL_BUILDER(Name("ExpandDims")                 \
+                              .Device(DEVICE_SYCL)           \
+                              .TypeConstraint<type>("T")     \
+                              .TypeConstraint<int64>("Tdim") \
+                              .HostMemory("dim"),            \
+                          ExpandDimsOp<int64>);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_SYCL_KERNEL);
 TF_CALL_bool(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
@@ -384,7 +409,15 @@ REGISTER_KERNEL_BUILDER(Name("ExpandDims")
                             .HostMemory("input")
                             .HostMemory("dim")
                             .HostMemory("output"),
-                        ExpandDimsOp);
+                        ExpandDimsOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("ExpandDims")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int64>("Tdim")
+                            .HostMemory("input")
+                            .HostMemory("dim")
+                            .HostMemory("output"),
+                        ExpandDimsOp<int64>);
 #endif  // TENSORFLOW_USE_SYCL
 
 // Squeeze ---------------------------------------
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index ac607f4e8b8ec05e23b90b74b1dbcc8aa3f2cc2a..8d9d0ea84612b51bdcd597698b89e3b8ffb8a915 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -145,6 +145,7 @@ class SizeOp : public OpKernel {
   bool IsExpensive() override { return false; }
 };
 
+template <typename Tdim>
 class ExpandDimsOp : public OpKernel {
  public:
   explicit ExpandDimsOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -153,7 +154,7 @@ class ExpandDimsOp : public OpKernel {
     OP_REQUIRES(ctx, ctx->input(0).dtype() != DT_VARIANT,
                 errors::InvalidArgument("ExpandDims on Variant not supported"));
 
-    int32 dim = ctx->input(1).flat<int32>()(0);
+    Tdim dim = ctx->input(1).flat<Tdim>()(0);
     OP_REQUIRES(
         ctx, (dim >= -1 - ctx->input(0).dims() && dim <= ctx->input(0).dims()),
         errors::InvalidArgument("Tried to expand dim index ", dim,
@@ -175,7 +176,7 @@ class ExpandDimsOp : public OpKernel {
     }
 
     // Clamp to the end if needed.
-    dim = std::min<int32>(dim, existing_dims_size);
+    dim = std::min<Tdim>(dim, existing_dims_size);
     new_shape.emplace(new_shape.begin() + dim, 1);
     const TensorShape output_shape(new_shape);
 
@@ -234,10 +235,10 @@ class SqueezeOp : public OpKernel {
       if (!wrapped_squeeze_dims.empty()) {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
-                      errors::InvalidArgument(
-                          "Tried to explicitly squeeze "
-                          "dimension ",
-                          i, " but dimension was not 1: ", existing_dim));
+                      errors::InvalidArgument("Tried to explicitly squeeze "
+                                              "dimension ",
+                                              i, " but dimension was not 1: ",
+                                              existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
diff --git a/tensorflow/core/kernels/shuffle_dataset_op.cc b/tensorflow/core/kernels/shuffle_dataset_op.cc
index c7c670deba273faa0a46cf7bf09ade97d1a40a40..72facb3a0d0cc13a559b3d8005592e19b97fed6f 100644
--- a/tensorflow/core/kernels/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/shuffle_dataset_op.cc
@@ -60,18 +60,19 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
     }
 
     if (reshuffle_each_iteration_) {
-      *output = new ReshufflingDataset(input, buffer_size, seed, seed2);
+      *output = new ReshufflingDataset(ctx, input, buffer_size, seed, seed2);
     } else {
-      *output = new FixedSeedDataset(input, buffer_size, seed, seed2);
+      *output = new FixedSeedDataset(ctx, input, buffer_size, seed, seed2);
     }
   }
 
  private:
   // Abstract base dataset that implements a shuffling iterator.
-  class ShuffleDatasetBase : public DatasetBase {
+  class ShuffleDatasetBase : public GraphDatasetBase {
    public:
-    ShuffleDatasetBase(const DatasetBase* input, int64 buffer_size)
-        : input_(input), buffer_size_(buffer_size) {
+    ShuffleDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
+                       int64 buffer_size)
+        : GraphDatasetBase(ctx), input_(input), buffer_size_(buffer_size) {
       input_->Ref();
     }
 
@@ -91,6 +92,8 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
       explicit Iterator(const Params& params, int64 seed, int64 seed2)
           : DatasetIterator<ShuffleDatasetBase>(params),
             input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            seed_(seed),
+            seed2_(seed2),
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {
         buffer_.reserve(params.dataset->buffer_size_);
@@ -102,8 +105,7 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
         mutex_lock l(mu_);
         int64 start_micros = ctx->env()->NowMicros();
         int64 num_log_entries = 0;
-        while (!end_of_input_sequence_ &&
-               buffer_.size() < dataset()->buffer_size_) {
+        while (input_impl_ && buffer_.size() < dataset()->buffer_size_) {
           if (ctx->env()->NowMicros() >
               ((num_log_entries + 1) * kLogIntervalMicros) + start_micros) {
             num_log_entries++;
@@ -111,10 +113,13 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
                       << buffer_.size() << " of " << dataset()->buffer_size_;
           }
           std::vector<Tensor> input_element;
+          bool end_of_input_sequence;
           TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
-                                                  &end_of_input_sequence_));
-          if (!end_of_input_sequence_) {
+                                                  &end_of_input_sequence));
+          if (!end_of_input_sequence) {
             buffer_.emplace_back(std::move(input_element));
+          } else {
+            input_impl_.reset();
           }
         }
         if (num_log_entries > 0) {
@@ -125,25 +130,116 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
           *end_of_sequence = false;
           // Choose an element to produce uniformly at random, and
           // swap the last element into its place in the buffer.
-          int64 index = generator_() % buffer_.size();
+          int64 index = Random() % buffer_.size();
           *out_tensors = std::move(buffer_[index]);
           std::swap(buffer_[index], buffer_.back());
           buffer_.pop_back();
         } else {
-          DCHECK(end_of_input_sequence_);
+          DCHECK(input_impl_ == nullptr);
           *end_of_sequence = true;
         }
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+
+        // Save the tensors in the buffer.
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("buffer_size"), buffer_.size()));
+        for (size_t i = 0; i < buffer_.size(); i++) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("buffer_", i, "_size")),
+              buffer_[i].size()));
+          for (size_t j = 0; j < buffer_[i].size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat("buffer_", i, "_", j)),
+                buffer_[i][j]));
+          }
+        }
+
+        // Save state needed to restore the random number generators.
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
+                                               num_random_samples_));
+
+        // Save input iterator if it hasn't been exhausted else write
+        // "end_of_input_sequence".
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("end_of_input_sequence"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        buffer_.clear();
+
+        // Restore the buffer.
+        size_t buffer_size;
+        {
+          int64 temp;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("buffer_size"), &temp));
+          buffer_size = static_cast<size_t>(temp);
+        }
+        buffer_.reserve(buffer_size);
+        for (size_t i = 0; i < buffer_size; i++) {
+          int64 list_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat("buffer_", i, "_size")), &list_size));
+          buffer_.emplace_back(std::vector<Tensor>(list_size));
+          for (int j = 0; j < list_size; j++) {
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat("buffer_", i, "_", j)),
+                &buffer_[i][j]));
+          }
+        }
+
+        // Restore the random number generators.
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
+                                              &num_random_samples_));
+        ResetRngs();
+
+        // Restore the input iterator if it wasn't already exhausted.
+        if (!reader->Contains(full_name("end_of_input_sequence"))) {
+          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
      private:
+      random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random()
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        num_random_samples_++;
+        auto out = generator_();
+        return out;
+      }
+
+      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Reset the generators based on the current iterator seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
       mutex mu_;
       std::vector<std::vector<Tensor>> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      bool end_of_input_sequence_ GUARDED_BY(mu_) = false;
+      const int64 seed_ GUARDED_BY(mu_);
+      const int64 seed2_ GUARDED_BY(mu_);
       random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
       random::SingleSampleAdapter<random::PhiloxRandom> generator_
           GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
     };
 
     const DatasetBase* const input_;
@@ -154,9 +250,9 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
   // created from it. Used when `reshuffle_each_iteration` is true.
   class ReshufflingDataset : public ShuffleDatasetBase {
    public:
-    ReshufflingDataset(const DatasetBase* input, int64 buffer_size, int64 seed,
-                       int64 seed2)
-        : ShuffleDatasetBase(input, buffer_size),
+    ReshufflingDataset(OpKernelContext* ctx, const DatasetBase* input,
+                       int64 buffer_size, int64 seed, int64 seed2)
+        : ShuffleDatasetBase(ctx, input, buffer_size),
           seed_(seed),
           seed2_(seed2),
           parent_generator_(seed, seed2),
@@ -181,6 +277,7 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
           iterator_seed2));
     }
 
+   private:
     const int64 seed_;
     const int64 seed2_;
     mutable mutex mu_;
@@ -193,9 +290,11 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
   // Used when `reshuffle_each_iteration` is false.
   class FixedSeedDataset : public ShuffleDatasetBase {
    public:
-    FixedSeedDataset(const DatasetBase* input, int64 buffer_size, int64 seed,
-                     int64 seed2)
-        : ShuffleDatasetBase(input, buffer_size), seed_(seed), seed2_(seed) {}
+    FixedSeedDataset(OpKernelContext* ctx, const DatasetBase* input,
+                     int64 buffer_size, int64 seed, int64 seed2)
+        : ShuffleDatasetBase(ctx, input, buffer_size),
+          seed_(seed),
+          seed2_(seed) {}
 
     string DebugString() override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
@@ -208,6 +307,29 @@ class ShuffleDatasetOp : public UnaryDatasetOpKernel {
           {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* buffer_size = nullptr;
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      AttrValue reshuffle_each_iteration;
+
+      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      b->BuildAttrValue(false, &reshuffle_each_iteration);
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, buffer_size, seed, seed2},  // Inputs
+          {std::make_pair("reshuffle_each_iteration",
+                          reshuffle_each_iteration)},  // Attrs
+          output));
+      return Status::OK();
+    }
+
+   private:
     const int64 seed_;
     const int64 seed2_;
   };
diff --git a/tensorflow/core/kernels/skip_dataset_op.cc b/tensorflow/core/kernels/skip_dataset_op.cc
index 52a6116a7cbf15bd68b5c6045e21143affe8d2b0..1fe49271e299f042b9dc88a30d88d3d26a9e65f2 100644
--- a/tensorflow/core/kernels/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/skip_dataset_op.cc
@@ -35,14 +35,14 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
     int64 count;
     OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
 
-    *output = new Dataset(count, input);
+    *output = new Dataset(ctx, count, input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(int64 count, const DatasetBase* input)
-        : count_(count), input_(input) {
+    Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
+        : GraphDatasetBase(ctx), count_(count), input_(input) {
       input_->Ref();
     }
 
@@ -71,6 +71,18 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "SkipDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* count = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, count}, output));
+      return Status::OK();
+    }
+
    private:
     class EmptyIterator : public DatasetIterator<Dataset> {
      public:
@@ -82,6 +94,16 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         *end_of_sequence = true;
         return Status::OK();
       }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        return Status::OK();
+      }
     };
 
     class FiniteIterator : public DatasetIterator<Dataset> {
@@ -96,6 +118,11 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
 
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
         // Keep calling GetNext().  TODO(vrv): Figure out a way to
         // skip records without reading, perhaps by adding an
         // interface to iterator.
@@ -116,6 +143,34 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
         // Return GetNext() on the underlying iterator.
         TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors,
                                                 end_of_sequence));
+        if (*end_of_sequence) {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/slice_op.h b/tensorflow/core/kernels/slice_op.h
index db7eded745eb0d3c880dc46d164aad31b2531829..0362a021336f633b88a666c68f42fa5082f4f66d 100644
--- a/tensorflow/core/kernels/slice_op.h
+++ b/tensorflow/core/kernels/slice_op.h
@@ -24,6 +24,7 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
+
 template <typename Device, typename T, int NDIMS>
 struct Slice {
   void operator()(const Device& d, typename TTypes<T, NDIMS>::Tensor output,
diff --git a/tensorflow/core/kernels/snapshot_op.cc b/tensorflow/core/kernels/snapshot_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50157d5d48f93bfe61cbac95246123ef0a7d446e
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/snapshot_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_CPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<CPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+#if TENSORFLOW_USE_SYCL
+typedef Eigen::SyclDevice SyclDevice;
+#define REGISTER_SYCL_KERNEL(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("Snapshot").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<SyclDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_SYCL_KERNEL);
+
+#undef REGISTER_SYCL_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/snapshot_op.h b/tensorflow/core/kernels/snapshot_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c79893b49661519515a7b4a537ff3caeceba2be
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op.h
@@ -0,0 +1,49 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+#define TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+
+template <typename Device, typename Scalar>
+class SnapshotOp : public OpKernel {
+ public:
+  explicit SnapshotOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input.shape(), &output));
+    const Device& device = context->eigen_device<Device>();
+    device.memcpy(output->template flat<Scalar>().data(),
+                  input.template flat<Scalar>().data(),
+                  input.NumElements() * sizeof(Scalar));
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_SNAPSHOT_OP_H_
diff --git a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52070be838d65d21813dfe097db9c395ef5a8448
--- /dev/null
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if GOOGLE_CUDA
+
+// See docs in ../ops/array_ops.cc.
+#include "tensorflow/core/kernels/snapshot_op.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/types.h"
+
+namespace tensorflow {
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER_KERNEL(TYPE)                                        \
+  REGISTER_KERNEL_BUILDER(                                           \
+      Name("Snapshot").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      SnapshotOp<GPUDevice, TYPE>);
+
+TF_CALL_POD_TYPES(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index 1f38bdce8c3a8f70e89efe62ad6c6f385bb5dfc0..d3a267ed877eedf8ed3845ebd11255f0690b3106 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -64,23 +64,21 @@ struct SoftmaxEigenImpl {
     one_by_class.set(1, num_classes);
 #endif
     // shifted_logits = logits - max(logits along classes);
-    auto shifted_logits = (logits -
-                           logits.maximum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
+    auto shifted_logits = (logits - logits.maximum(along_class)
+                                        .eval()
+                                        .reshape(batch_by_one)
+                                        .broadcast(one_by_class));
     if (log) {
       // Calculate the log of the softmax
       // softmax = logits - max(logits along classes);
       softmax.device(d) = shifted_logits;
       // softmax = softmax - log(sum(exp(softmax along classes)));
-      softmax.device(d) = (softmax -
-                           softmax.exp()
-                               .sum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .log()
-                               .broadcast(one_by_class));
+      softmax.device(d) = (softmax - softmax.exp()
+                                         .sum(along_class)
+                                         .log()
+                                         .eval()
+                                         .reshape(batch_by_one)
+                                         .broadcast(one_by_class));
     } else {
       // NOTE(touts): If you modify this implementation please run
       // the BM_ImageNetSoftmaxFwd benchmark in nn_ops_test.cc.
@@ -88,12 +86,11 @@ struct SoftmaxEigenImpl {
       // softmax = exp(logits - max(logits along classes));
       softmax.device(d) = shifted_logits.exp();
       // softmax = softmax * (1 / sum(softmax along classes));
-      softmax.device(d) = (softmax *
-                           softmax.sum(along_class)
-                               .inverse()
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
+      softmax.device(d) = (softmax * softmax.sum(along_class)
+                                         .inverse()
+                                         .eval()
+                                         .reshape(batch_by_one)
+                                         .broadcast(one_by_class));
     }
   }
 };
diff --git a/tensorflow/core/kernels/sparse_matmul_op_test.cc b/tensorflow/core/kernels/sparse_matmul_op_test.cc
index a0c54805e2f348362f496cd77c16508d66671ada..f815ca9e344664c4c95befccb88e750eb99d0eaf 100644
--- a/tensorflow/core/kernels/sparse_matmul_op_test.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op_test.cc
@@ -284,12 +284,12 @@ class SparseMatmulOpTest : public ::testing::Test {
       uint16_t* data3_bfloat16_p =
           reinterpret_cast<uint16_t*>(data3_bfloat16) + i;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            data3_p[1] = 0;  
-            data3_bfloat16_p[0] = data3_p[0];  
+            data3_p[1] = 0;
+            data3_bfloat16_p[0] = data3_p[0];
 #else
-            data3_p[0] = 0;  
-            data3_bfloat16_p[0] = data3_p[1];  
-#endif  
+            data3_p[0] = 0;
+            data3_bfloat16_p[0] = data3_p[1];
+#endif
     }
   }
 
diff --git a/tensorflow/core/kernels/sparse_reshape_op.cc b/tensorflow/core/kernels/sparse_reshape_op.cc
index f0f353871d0449c08492ddb0a2fc3db27b245a9d..939d404aa442e6d3384d46f19cc54771cb53a27b 100644
--- a/tensorflow/core/kernels/sparse_reshape_op.cc
+++ b/tensorflow/core/kernels/sparse_reshape_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/reshape_util.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 
 namespace tensorflow {
@@ -33,124 +34,10 @@ class SparseReshapeOp : public OpKernel {
   explicit SparseReshapeOp(OpKernelConstruction* context) : OpKernel(context) {}
 
   void Compute(OpKernelContext* context) override {
-    const Tensor& input_ind_in = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_ind_in.shape()),
-                errors::InvalidArgument(
-                    "Input indices should be a matrix but received shape ",
-                    input_ind_in.shape().DebugString()));
-
-    const Tensor& input_shape_in = context->input(1);
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
-                errors::InvalidArgument(
-                    "Input shape should be a vector but received shape ",
-                    input_shape_in.shape().DebugString()));
-
-    const Tensor& new_shape_in = context->input(2);
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(new_shape_in.shape()),
-                errors::InvalidArgument(
-                    "New shape should be a vector but received shape ",
-                    new_shape_in.shape().DebugString()));
-
-    const int64 input_rank = input_shape_in.NumElements();
-    const int64 output_rank = new_shape_in.NumElements();
-
-    const TensorShape input_shape(input_shape_in.vec<int64>());
-    const int64 dense_size = input_shape.num_elements();
-
-    const int64 nnz = input_ind_in.shape().dim_size(0);
-
-    // Compute the output shape.  Determine product of specified
-    // dimensions, and find the index of the unspecified one. Largely the
-    // same calculation as reshape_op
-    TensorShape output_shape;
-    int64 product = 1;
-    int unknown_index = -1;
-    auto new_shape = new_shape_in.vec<int64>();
-    for (int d = 0; d < output_rank; ++d) {
-      const int64 size = new_shape(d);
-      if (size == -1) {
-        OP_REQUIRES(
-            context, unknown_index == -1,
-            errors::InvalidArgument("only one output shape size may be -1, "
-                                    "not both ",
-                                    unknown_index, " and ", d));
-        unknown_index = d;
-        output_shape.AddDim(1);
-      } else {
-        OP_REQUIRES(context, size >= 0,
-                    errors::InvalidArgument(
-                        "size ", d, " must be non-negative, not ", size));
-        output_shape.AddDim(size);
-        product *= size;
-      }
-    }
-    if (unknown_index != -1) {
-      OP_REQUIRES(
-          context, product > 0,
-          errors::InvalidArgument("SparseReshape cannot infer the missing "
-                                  "input size for an empty tensor unless all "
-                                  "specified input sizes are non-zero"));
-      const int64 missing = dense_size / product;
-      OP_REQUIRES(
-          context, product * missing == dense_size,
-          errors::InvalidArgument(
-              "Input to reshape is a SparseTensor with ", dense_size,
-              " dense values, but the requested shape requires a multiple of ",
-              product));
-      output_shape.set_dim(unknown_index, missing);
-    }
-
-    OP_REQUIRES(context, output_shape.num_elements() == dense_size,
-                errors::InvalidArgument("Input to reshape is a tensor with ",
-                                        dense_size,
-                                        " dense values, but the "
-                                        "requested shape has ",
-                                        output_shape.num_elements()));
-
-    // Optimize for reshaping to the same shape.
-    if (input_shape == output_shape) {
-      context->set_output(0, input_ind_in);
-      context->set_output(1, input_shape_in);
-      return;
-    }
-
-    gtl::InlinedVector<int64, 8> input_strides(input_rank);
-    input_strides[input_rank - 1] = 1;
-    for (int d = input_rank - 2; d >= 0; --d) {
-      input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1);
-    }
-
-    gtl::InlinedVector<int64, 8> output_strides(output_rank);
-    output_strides[output_rank - 1] = 1;
-    for (int d = output_rank - 2; d >= 0; --d) {
-      output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
-    }
-
-    Tensor* output_ind_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({nnz, output_rank}),
-                                            &output_ind_out));
-    auto input_ind = input_ind_in.matrix<int64>();
-    auto output_ind = output_ind_out->matrix<int64>();
-    for (int i = 0; i < nnz; ++i) {
-      int64 id = 0;
-      for (int j = 0; j < input_rank; ++j) {
-        id += input_ind(i, j) * input_strides[j];
-      }
-      for (int j = 0; j < output_rank; ++j) {
-        output_ind(i, j) = id / output_strides[j];
-        id %= output_strides[j];
-      }
-    }
-
-    Tensor* output_shape_out = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(1, TensorShape({output_rank}),
-                                            &output_shape_out));
-    auto output_shape_vec = output_shape_out->vec<int64>();
-    for (int j = 0; j < output_shape.dims(); ++j) {
-      output_shape_vec(j) = output_shape.dim_size(j);
-    }
+    Tensor output_indices;
+    Tensor output_shape;
+    Reshape(context, context->input(0), context->input(1), context->input(2),
+            0 /* output indices index */, 1 /* output shape index */);
   }
 };
 
diff --git a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
index 97240a066bca49e31dbf54fc09a6a6d549a81ae1..de5ab1a3678b981a95de533dc2f59cc16dd7705c 100644
--- a/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_slice_dataset_op.cc
@@ -29,10 +29,12 @@ namespace {
 // description of the following op.
 
 template <typename T>
-class Dataset : public DatasetBase {
+class Dataset : public GraphDatasetBase {
  public:
-  explicit Dataset(const sparse::SparseTensor& sparse_tensor)
-      : sparse_tensor_(sparse_tensor),
+  explicit Dataset(OpKernelContext* ctx,
+                   const sparse::SparseTensor& sparse_tensor)
+      : GraphDatasetBase(ctx),
+        sparse_tensor_(sparse_tensor),
         dtypes_({DT_INT64, sparse_tensor.dtype(), DT_INT64}),
         shapes_({{-1, sparse_tensor.dims() - 1},
                  {-1},
@@ -53,6 +55,27 @@ class Dataset : public DatasetBase {
     return "SparseTensorSliceDatasetOp::Dataset";
   }
 
+ protected:
+  Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                            Node** output) const override {
+    Node* indices_node;
+    TF_RETURN_IF_ERROR(b->AddTensor(sparse_tensor_.indices(), &indices_node));
+    Node* value_node;
+    TF_RETURN_IF_ERROR(b->AddTensor(sparse_tensor_.values(), &value_node));
+    Node* dense_shape_node;
+    std::vector<int64> dense_shape;
+    dense_shape.reserve(sparse_tensor_.shape().size());
+    for (int i = 0; i < sparse_tensor_.shape().size(); i++)
+      dense_shape.emplace_back(sparse_tensor_.shape()[i]);
+    TF_RETURN_IF_ERROR(b->AddVector(dense_shape, &dense_shape_node));
+    AttrValue val_dtype;
+    b->BuildAttrValue(sparse_tensor_.dtype(), &val_dtype);
+    TF_RETURN_IF_ERROR(
+        b->AddDataset(this, {indices_node, value_node, dense_shape_node},
+                      {{"Tvalues", val_dtype}}, output));
+    return Status::OK();
+  }
+
  private:
   class Iterator : public DatasetIterator<Dataset<T>> {
    public:
@@ -106,7 +129,6 @@ class Dataset : public DatasetBase {
 
         ++iter_;
       }
-
       if (i_ == next_non_empty_i_) {
         // The current position is non-empty in the input
         // `SparseTensor`, and we have already read the value from the
@@ -129,6 +151,42 @@ class Dataset : public DatasetBase {
       return Status::OK();
     }
 
+   protected:
+    Status SaveInternal(IteratorStateWriter* writer) override {
+      mutex_lock l(mu_);
+      TF_RETURN_IF_ERROR(writer->WriteScalar(Iterator::full_name("i"), i_));
+      TF_RETURN_IF_ERROR(
+          writer->WriteScalar(Iterator::full_name("iter_loc"), iter_.loc()));
+      TF_RETURN_IF_ERROR(writer->WriteScalar(
+          Iterator::full_name("next_non_empty_i_"), next_non_empty_i_));
+      if (i_ <= next_non_empty_i_) {
+        TF_RETURN_IF_ERROR(writer->WriteTensor(
+            Iterator::full_name("next_indices_"), next_indices_));
+        TF_RETURN_IF_ERROR(writer->WriteTensor(
+            Iterator::full_name("next_values_"), next_values_));
+      }
+      return Status::OK();
+    }
+
+    Status RestoreInternal(OpKernelContext* ctx,
+                           IteratorStateReader* reader) override {
+      mutex_lock l(mu_);
+      TF_RETURN_IF_ERROR(reader->ReadScalar(Iterator::full_name("i"), &i_));
+      int64 iter_loc;
+      TF_RETURN_IF_ERROR(
+          reader->ReadScalar(Iterator::full_name("iter_loc"), &iter_loc));
+      iter_ = group_iterable_.at(iter_loc);
+      TF_RETURN_IF_ERROR(reader->ReadScalar(
+          Iterator::full_name("next_non_empty_i_"), &next_non_empty_i_));
+      if (i_ <= next_non_empty_i_) {
+        TF_RETURN_IF_ERROR(reader->ReadTensor(
+            Iterator::full_name("next_indices_"), &next_indices_));
+        TF_RETURN_IF_ERROR(reader->ReadTensor(
+            Iterator::full_name("next_values_"), &next_values_));
+      }
+      return Status::OK();
+    }
+
    private:
     const int64 num_elements_;
 
@@ -198,7 +256,7 @@ class SparseTensorSliceDatasetOp : public DatasetOpKernel {
     sparse::SparseTensor sparse_tensor(
         *indices, *values, TensorShape(dense_shape->vec<int64>()), std_order);
 
-    *output = new Dataset<T>(sparse_tensor);
+    *output = new Dataset<T>(ctx, sparse_tensor);
   }
 
  private:
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index f6fb0a121d8336a1abd624103e33e3ed8869f0d2..88fcf542fb0cc726b228be34d0fe7b92663ce95d 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -50,9 +50,18 @@ class StatelessRandomOpBase : public OpKernel {
     if (shape.num_elements() == 0) return;
 
     // Grab the two seeds
-    const auto seed = seed_t.flat<int64>();
-    const uint64 seed0 = internal::SubtleMustCopy(seed(0));
-    const uint64 seed1 = internal::SubtleMustCopy(seed(1));
+    uint64 seed0;
+    uint64 seed1;
+    if (context->input_dtype(1) == DT_INT32) {
+      const auto seed = seed_t.flat<int32>();
+      seed0 = internal::SubtleMustCopy(seed(0));
+      seed1 = internal::SubtleMustCopy(seed(1));
+    } else {
+      CHECK_EQ(DT_INT64, context->input_dtype(1));
+      const auto seed = seed_t.flat<int64>();
+      seed0 = internal::SubtleMustCopy(seed(0));
+      seed1 = internal::SubtleMustCopy(seed(1));
+    }
 
     // Scramble the seeds so that the user doesn't need to worry about which
     // part of the seed needs to be strong.
diff --git a/tensorflow/core/kernels/stats_aggregator.h b/tensorflow/core/kernels/stats_aggregator.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f602c5f3bf4dc275538ae7884f9f552c71fc65a
--- /dev/null
+++ b/tensorflow/core/kernels/stats_aggregator.h
@@ -0,0 +1,84 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_KERNELS_STATS_AGGREGATOR_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_STATS_AGGREGATOR_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace tensorflow {
+
+class Summary;
+
+// A `StatsAggregator` accumulates statistics incrementally. A
+// `StatsAggregator` can accumulate multiple different statistics, distinguished
+// by a string name.
+//
+// The class currently supports accumulating `Histogram` objects, and we expect
+// to add other methods in future.
+//
+// NOTE(mrry): `StatsAggregator` is a virtual interface because we anticipate
+// that many different implementations will the same interface. For example, the
+// current implementation in "stats_aggregator_ops.cc" is a simple in-memory
+// implementation that integrates with the pull-based summary API, and we may
+// add implementations that work with the push-based `SummaryWriterInterface`,
+// as well as custom monitoring services.
+class StatsAggregator {
+ public:
+  virtual ~StatsAggregator() {}
+
+  // Add the given `values` to the histogram with the given `name`. Each
+  // element of `values` will be treated as a separate sample in the histogram.
+  virtual void AddToHistogram(const string& name,
+                              gtl::ArraySlice<double> values) = 0;
+
+  // Stores a protocol buffer representation of the aggregator state in the
+  // given `out_summary`.
+  // TODO(mrry): Consider separating this method from the `StatsAggregator`
+  // interface. It is possible that not all implementations will support
+  // encoding their state as a protocol buffer.
+  virtual void EncodeToProto(Summary* out_summary) = 0;
+};
+
+// A `StatsAggregatorResource` wraps a shareable `StatsAggregator` as a resource
+// in the TensorFlow resource manager.
+//
+// NOTE(mrry): This class is separate from `StatsAggregator` in order to
+// simplify the memory management of the shared object. Most users of
+// `StatsAggregator` interact with a `std::shared_ptr<StatsAggregator>` whereas
+// the `ResourceBase` API requires explicit reference counting.
+class StatsAggregatorResource : public ResourceBase {
+ public:
+  // Creates a new resource from the given `stats_aggregator`.
+  StatsAggregatorResource(std::unique_ptr<StatsAggregator> stats_aggregator)
+      : stats_aggregator_(stats_aggregator.release()) {}
+
+  // Returns the wrapped `StatsAggregator`.
+  std::shared_ptr<StatsAggregator> stats_aggregator() const {
+    return stats_aggregator_;
+  }
+
+  string DebugString() { return "StatsAggregatorResource"; }
+
+ private:
+  const std::shared_ptr<StatsAggregator> stats_aggregator_;
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_KERNELS_STATS_AGGREGATOR_H_
diff --git a/tensorflow/core/kernels/stats_aggregator_ops.cc b/tensorflow/core/kernels/stats_aggregator_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..037ec64a83b58fd0f32789cd7560317959529225
--- /dev/null
+++ b/tensorflow/core/kernels/stats_aggregator_ops.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/stats_aggregator.h"
+
+#include <memory>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_op_kernel.h"
+#include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace {
+
+class StatsAggregatorImpl : public StatsAggregator {
+ public:
+  StatsAggregatorImpl() {}
+
+  void AddToHistogram(const string& name,
+                      gtl::ArraySlice<double> values) override {
+    mutex_lock l(mu_);
+    histogram::Histogram& histogram = histograms_[name];
+    for (double value : values) {
+      histogram.Add(value);
+    }
+  }
+
+  void EncodeToProto(Summary* out_summary) override {
+    mutex_lock l(mu_);
+    for (const auto& pair : histograms_) {
+      const string& name = pair.first;
+      const histogram::Histogram& histogram = pair.second;
+
+      Summary::Value* value = out_summary->add_value();
+      value->set_tag(name);
+      histogram.EncodeToProto(value->mutable_histo(),
+                              true /* preserve_zero_buckets */);
+    }
+  }
+
+ private:
+  mutex mu_;
+  std::unordered_map<string, histogram::Histogram> histograms_ GUARDED_BY(mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(StatsAggregatorImpl);
+};
+
+class StatsAggregatorHandleOp
+    : public ResourceOpKernel<StatsAggregatorResource> {
+ public:
+  explicit StatsAggregatorHandleOp(OpKernelConstruction* ctx)
+      : ResourceOpKernel<StatsAggregatorResource>(ctx) {}
+
+ private:
+  Status CreateResource(StatsAggregatorResource** ret) override
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    *ret = new StatsAggregatorResource(
+        std::unique_ptr<StatsAggregator>(new StatsAggregatorImpl));
+    return Status::OK();
+  }
+
+  Status VerifyResource(StatsAggregatorResource* resource) override {
+    return Status::OK();
+  }
+};
+
+class StatsAggregatorSummaryOp : public OpKernel {
+ public:
+  explicit StatsAggregatorSummaryOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& resource_handle_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
+                errors::InvalidArgument("resource_handle must be a scalar"));
+
+    StatsAggregatorResource* resource;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &resource));
+    core::ScopedUnref unref_iterator(resource);
+
+    Tensor* summary_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &summary_t));
+    Summary summary;
+    resource->stats_aggregator()->EncodeToProto(&summary);
+    summary_t->scalar<string>()() = summary.SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("StatsAggregatorHandle").Device(DEVICE_CPU),
+                        StatsAggregatorHandleOp);
+REGISTER_KERNEL_BUILDER(Name("StatsAggregatorSummary").Device(DEVICE_CPU),
+                        StatsAggregatorSummaryOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/stats_dataset_ops.cc b/tensorflow/core/kernels/stats_dataset_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7b1853aba61d7eeabceeebe76187535567509252
--- /dev/null
+++ b/tensorflow/core/kernels/stats_dataset_ops.cc
@@ -0,0 +1,181 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/dataset.h"
+#include "tensorflow/core/kernels/stats_aggregator.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+namespace {
+
+// This op defines a `Dataset` that passes through its input elements and
+// records the latency of producing each element in the context's
+// `StatsAggregator`.
+//
+// TODO(mrry): It is likely that many *StatsDatasetOp kernels will have the
+// same or similar structure. We should abstract the common boilerplate into
+// a base case and/or investigate how to make general-purpose *StatsDatasetOp
+// kernels that use TensorFlow functions to represent their logic. For example,
+// if the performance were adequate, we might replace this kernel with an
+// implementation that executes functions before and after the `GetNext()` call
+// on the input, each executing an op that gets the current time and performing
+// the subtraction.
+class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit LatencyStatsDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    *output = new Dataset(input, std::move(tag));
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(const DatasetBase* input, string tag)
+        : input_(input), tag_(std::move(tag)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::LatencyStats")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override { return "LatencyStatsDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        uint64 start = ctx->env()->NowMicros();
+        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        uint64 end = ctx->env()->NowMicros();
+        auto stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator && !*end_of_sequence) {
+          ctx->stats_aggregator()->AddToHistogram(
+              dataset()->tag_, {static_cast<double>(end - start)});
+        }
+        return s;
+      }
+
+     private:
+      const std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const string tag_;
+  };
+};
+
+class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit BytesProducedStatsDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    *output = new Dataset(input, std::move(tag));
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    explicit Dataset(const DatasetBase* input, string tag)
+        : input_(input), tag_(std::move(tag)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::BytesProducedStats")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() override {
+      return "BytesProducedStatsDatasetOp::Dataset";
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        auto stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator && s.ok() && !*end_of_sequence) {
+          size_t total_bytes = 0;
+          for (const Tensor& t : *out_tensors) {
+            total_bytes += t.TotalBytes();
+          }
+          ctx->stats_aggregator()->AddToHistogram(
+              dataset()->tag_, {static_cast<double>(total_bytes)});
+        }
+        return s;
+      }
+
+     private:
+      const std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const string tag_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("LatencyStatsDataset").Device(DEVICE_CPU),
+                        LatencyStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("BytesProducedStatsDataset").Device(DEVICE_CPU),
+                        BytesProducedStatsDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 8fc40db3cc22060eb18b64c2246188925626b8bf..73b6d4cf6a212d3f09a6955cb8a138d2aec58b75 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -427,6 +427,7 @@ REGISTER_STRIDED_SLICE(bfloat16);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
+TF_CALL_int64(REGISTER_GPU);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
index a8487f49f4488269e058c6b7ee94d0f82aeb5270..8ca27e3b920e7c0cd36343d0c9db5a6098b6bede 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
@@ -53,6 +53,7 @@ typedef Eigen::GpuDevice GPUDevice;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
 TF_CALL_complex64(DEFINE_GPU_KERNELS);
 TF_CALL_complex128(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
 DEFINE_GPU_KERNELS(int32);
 
 #undef DEFINE_GPU_KERNELS
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index de6514757242c1e1079752427b444e31a80bc5ef..afe3a051e64cbff2040d32e95c5a4aacb2decbd1 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -284,6 +284,7 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_N_GPU);
 TF_CALL_complex64(DECLARE_FOR_N_GPU);
 TF_CALL_complex128(DECLARE_FOR_N_GPU);
 DECLARE_FOR_N_GPU(int32);
+DECLARE_FOR_N_GPU(int64);
 #endif  // END GOOGLE_CUDA
 
 TF_CALL_ALL_TYPES(DECLARE_FOR_N_CPU);
@@ -299,6 +300,7 @@ DECLARE_FOR_N_CPU(bfloat16);
 TF_CALL_SYCL_PROXY_TYPES(PREVENT_FOR_N_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_FOR_N_SYCL);
 DECLARE_FOR_N_SYCL(int32);
+DECLARE_FOR_N_SYCL(int64);
 
 #undef DECLARE_FOR_N_SYCL
 #endif // TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 5c72c9e1ae71ec162960abf38572260d5be36db8..743f11315042af94cfe41cecf52d145ae69f8209 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -66,7 +66,7 @@ class SubstrOp : public OpKernel {
         for (size_t i = 0; i < input_tensor.NumElements(); ++i) {
           string in = input(i);
           OP_REQUIRES(
-              context, FastBoundsCheck(pos, in.size()),
+              context, FastBoundsCheck(pos, in.size() + 1),
               errors::InvalidArgument("pos ", pos, " out of range for string",
                                       "b'", in, "' at index ", i));
           output(i) = in.substr(pos, len);
@@ -80,7 +80,7 @@ class SubstrOp : public OpKernel {
           const T pos = tensorflow::internal::SubtleMustCopy(pos_flat(i));
           const T len = tensorflow::internal::SubtleMustCopy(len_flat(i));
           OP_REQUIRES(
-              context, FastBoundsCheck(pos, in.size()),
+              context, FastBoundsCheck(pos, in.size() + 1),
               errors::InvalidArgument("pos ", pos, " out of range for string",
                                       "b'", in, "' at index ", i));
           output(i) = in.substr(pos, len);
@@ -146,7 +146,7 @@ class SubstrOp : public OpKernel {
             const T pos = tensorflow::internal::SubtleMustCopy(pos_bcast(i));
             const T len = tensorflow::internal::SubtleMustCopy(len_bcast(i));
             OP_REQUIRES(
-                context, FastBoundsCheck(pos, input_bcast(i).size()),
+                context, FastBoundsCheck(pos, input_bcast(i).size() + 1),
                 errors::InvalidArgument("pos ", pos, " out of range for string",
                                         "b'", in, "' at index ", i));
             output(i) = in.substr(pos, len);
@@ -197,7 +197,7 @@ class SubstrOp : public OpKernel {
                   tensorflow::internal::SubtleMustCopy(pos_bcast(i, j));
               const T len =
                   tensorflow::internal::SubtleMustCopy(len_bcast(i, j));
-              OP_REQUIRES(context, FastBoundsCheck(pos, in.size()),
+              OP_REQUIRES(context, FastBoundsCheck(pos, in.size() + 1),
                           errors::InvalidArgument(
                               "pos ", pos, " out of range for ", "string b'",
                               in, "' at index (", i, ", ", j, ")"));
diff --git a/tensorflow/core/kernels/summary_interface.cc b/tensorflow/core/kernels/summary_interface.cc
index 313137ae4957a086be57b490fe1a5f6f95e93f0f..97c0c2c099cfceaa98a577d9642710020621e7e6 100644
--- a/tensorflow/core/kernels/summary_interface.cc
+++ b/tensorflow/core/kernels/summary_interface.cc
@@ -16,7 +16,7 @@ limitations under the License.
 
 #include <utility>
 
-#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/summary.pb.h"
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/png/png_io.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
 #include "tensorflow/core/util/events_writer.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
@@ -228,7 +229,7 @@ class SummaryWriterImpl : public SummaryWriterInterface {
     }
     mutex_lock ml(mu_);
     events_writer_ =
-        xla::MakeUnique<EventsWriter>(io::JoinPath(logdir, "events"));
+        tensorflow::MakeUnique<EventsWriter>(io::JoinPath(logdir, "events"));
     if (!events_writer_->InitWithSuffix(filename_suffix)) {
       return errors::Unknown("Could not initialize events writer.");
     }
@@ -257,7 +258,9 @@ class SummaryWriterImpl : public SummaryWriterInterface {
     Summary::Value* v = e->mutable_summary()->add_value();
     t.AsProtoTensorContent(v->mutable_tensor());
     v->set_tag(tag);
-    v->mutable_metadata()->ParseFromString(serialized_metadata);
+    if (!serialized_metadata.empty()) {
+      v->mutable_metadata()->ParseFromString(serialized_metadata);
+    }
     return WriteEvent(std::move(e));
   }
 
@@ -391,6 +394,15 @@ class SummaryWriterImpl : public SummaryWriterInterface {
     return WriteEvent(std::move(e));
   }
 
+  Status WriteGraph(int64 global_step,
+                    std::unique_ptr<GraphDef> graph) override {
+    std::unique_ptr<Event> e{new Event};
+    e->set_step(global_step);
+    e->set_wall_time(GetWallTime());
+    graph->SerializeToString(e->mutable_graph_def());
+    return WriteEvent(std::move(e));
+  }
+
   Status WriteEvent(std::unique_ptr<Event> event) override {
     mutex_lock ml(mu_);
     queue_.emplace_back(std::move(event));
diff --git a/tensorflow/core/kernels/summary_interface.h b/tensorflow/core/kernels/summary_interface.h
index ccf3459e56b690522f9551d9c1fed4e649455814..da1c28709fb35372b1f0b28faba757a23bcd9ac4 100644
--- a/tensorflow/core/kernels/summary_interface.h
+++ b/tensorflow/core/kernels/summary_interface.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/util/event.pb.h"
 
@@ -46,6 +47,9 @@ class SummaryWriterInterface : public ResourceBase {
   virtual Status WriteAudio(int64 global_step, Tensor t, const string& tag,
                             int max_outputs_, float sample_rate) = 0;
 
+  virtual Status WriteGraph(int64 global_step,
+                            std::unique_ptr<GraphDef> graph) = 0;
+
   virtual Status WriteEvent(std::unique_ptr<Event> e) = 0;
 };
 
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index cfa707de715ba41ad4f5eb2ab1732324bb1c222c..f092afe66ca1a9130410904a2c1158cfc3a8ac70 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -13,9 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/kernels/summary_interface.h"
+#include "tensorflow/core/lib/db/sqlite.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
@@ -46,6 +50,33 @@ class CreateSummaryFileWriterOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
                         CreateSummaryFileWriterOp);
 
+class CreateSummaryDbWriterOp : public OpKernel {
+ public:
+  explicit CreateSummaryDbWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* tmp;
+    OP_REQUIRES_OK(ctx, ctx->input("db_uri", &tmp));
+    const string db_uri = tmp->scalar<string>()();
+    OP_REQUIRES_OK(ctx, ctx->input("experiment_name", &tmp));
+    const string experiment_name = tmp->scalar<string>()();
+    OP_REQUIRES_OK(ctx, ctx->input("run_name", &tmp));
+    const string run_name = tmp->scalar<string>()();
+    OP_REQUIRES_OK(ctx, ctx->input("user_name", &tmp));
+    const string user_name = tmp->scalar<string>()();
+    SummaryWriterInterface* s;
+    auto db = Sqlite::Open(db_uri);
+    OP_REQUIRES_OK(ctx, db.status());
+    db.ValueOrDie()->UseWriteAheadLogWithReducedDurabilityIfPossible();
+    OP_REQUIRES_OK(
+        ctx, CreateSummaryDbWriter(std::move(db.ValueOrDie()), experiment_name,
+                                   run_name, user_name, ctx->env(), &s));
+    OP_REQUIRES_OK(ctx, CreateResource(ctx, HandleFromInput(ctx, 0), s));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
+                        CreateSummaryDbWriterOp);
+
 class FlushSummaryWriterOp : public OpKernel {
  public:
   explicit FlushSummaryWriterOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -81,8 +112,8 @@ class WriteSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("summary_metadata", &tmp));
@@ -91,13 +122,33 @@ class WriteSummaryOp : public OpKernel {
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
 
-    OP_REQUIRES_OK(ctx,
-                   s->WriteTensor(global_step, *t, tag, serialized_metadata));
+    OP_REQUIRES_OK(ctx, s->WriteTensor(step, *t, tag, serialized_metadata));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteSummary").Device(DEVICE_CPU),
                         WriteSummaryOp);
 
+class ImportEventOp : public OpKernel {
+ public:
+  explicit ImportEventOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    SummaryWriterInterface* s;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
+    core::ScopedUnref unref(s);
+    const Tensor* t;
+    OP_REQUIRES_OK(ctx, ctx->input("event", &t));
+    std::unique_ptr<Event> event{new Event};
+    if (!ParseProtoUnlimited(event.get(), t->scalar<string>()())) {
+      ctx->CtxFailureWithWarning(
+          errors::DataLoss("Bad tf.Event binary proto tensor string"));
+      return;
+    }
+    OP_REQUIRES_OK(ctx, s->WriteEvent(std::move(event)));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("ImportEvent").Device(DEVICE_CPU), ImportEventOp);
+
 class WriteScalarSummaryOp : public OpKernel {
  public:
   explicit WriteScalarSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
@@ -107,15 +158,15 @@ class WriteScalarSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
 
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("value", &t));
 
-    OP_REQUIRES_OK(ctx, s->WriteScalar(global_step, *t, tag));
+    OP_REQUIRES_OK(ctx, s->WriteScalar(step, *t, tag));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteScalarSummary").Device(DEVICE_CPU),
@@ -130,15 +181,15 @@ class WriteHistogramSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
 
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("values", &t));
 
-    OP_REQUIRES_OK(ctx, s->WriteHistogram(global_step, *t, tag));
+    OP_REQUIRES_OK(ctx, s->WriteHistogram(step, *t, tag));
   }
 };
 REGISTER_KERNEL_BUILDER(Name("WriteHistogramSummary").Device(DEVICE_CPU),
@@ -159,8 +210,8 @@ class WriteImageSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
     const Tensor* bad_color;
@@ -173,8 +224,7 @@ class WriteImageSummaryOp : public OpKernel {
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
 
-    OP_REQUIRES_OK(
-        ctx, s->WriteImage(global_step, *t, tag, max_images_, *bad_color));
+    OP_REQUIRES_OK(ctx, s->WriteImage(step, *t, tag, max_images_, *bad_color));
   }
 
  private:
@@ -196,8 +246,8 @@ class WriteAudioSummaryOp : public OpKernel {
     OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
     core::ScopedUnref unref(s);
     const Tensor* tmp;
-    OP_REQUIRES_OK(ctx, ctx->input("global_step", &tmp));
-    const int64 global_step = tmp->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("step", &tmp));
+    const int64 step = tmp->scalar<int64>()();
     OP_REQUIRES_OK(ctx, ctx->input("tag", &tmp));
     const string& tag = tmp->scalar<string>()();
     OP_REQUIRES_OK(ctx, ctx->input("sample_rate", &tmp));
@@ -206,8 +256,8 @@ class WriteAudioSummaryOp : public OpKernel {
     const Tensor* t;
     OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
 
-    OP_REQUIRES_OK(
-        ctx, s->WriteAudio(global_step, *t, tag, max_outputs_, sample_rate));
+    OP_REQUIRES_OK(ctx,
+                   s->WriteAudio(step, *t, tag, max_outputs_, sample_rate));
   }
 
  private:
@@ -218,4 +268,28 @@ class WriteAudioSummaryOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("WriteAudioSummary").Device(DEVICE_CPU),
                         WriteAudioSummaryOp);
 
+class WriteGraphSummaryOp : public OpKernel {
+ public:
+  explicit WriteGraphSummaryOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    SummaryWriterInterface* s;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &s));
+    core::ScopedUnref unref(s);
+    const Tensor* t;
+    OP_REQUIRES_OK(ctx, ctx->input("step", &t));
+    const int64 step = t->scalar<int64>()();
+    OP_REQUIRES_OK(ctx, ctx->input("tensor", &t));
+    std::unique_ptr<GraphDef> graph{new GraphDef};
+    if (!ParseProtoUnlimited(graph.get(), t->scalar<string>()())) {
+      ctx->CtxFailureWithWarning(
+          errors::DataLoss("Bad tf.GraphDef binary proto tensor string"));
+      return;
+    }
+    OP_REQUIRES_OK(ctx, s->WriteGraph(step, std::move(graph)));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("WriteGraphSummary").Device(DEVICE_CPU),
+                        WriteGraphSummaryOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/take_dataset_op.cc b/tensorflow/core/kernels/take_dataset_op.cc
index c3f33d663cd9ba084cb47472218818bdeb8aabab..7a6d20d6c7cb5a9bc5142e877c5c0c5285c1fd90 100644
--- a/tensorflow/core/kernels/take_dataset_op.cc
+++ b/tensorflow/core/kernels/take_dataset_op.cc
@@ -35,14 +35,14 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
     // Create a new TakeDatasetOp::Dataset, and return it as the output.
     int64 count;
     OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "count", &count));
-    *output = new Dataset(count, input);
+    *output = new Dataset(ctx, count, input);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    Dataset(int64 count, const DatasetBase* input)
-        : count_(count), input_(input) {
+    Dataset(OpKernelContext* ctx, int64 count, const DatasetBase* input)
+        : GraphDatasetBase(ctx), count_(count), input_(input) {
       input_->Ref();
     }
 
@@ -72,6 +72,18 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() override { return "TakeDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* count = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(count_, &count));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, count}, output));
+      return Status::OK();
+    }
+
    private:
     class EmptyIterator : public DatasetIterator<Dataset> {
      public:
@@ -83,6 +95,16 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
         *end_of_sequence = true;
         return Status::OK();
       }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        return Status::OK();
+      }
     };
 
     class FiniteIterator : public DatasetIterator<Dataset> {
@@ -96,6 +118,10 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);  // TODO(mrry): Make locking less conservative.
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         while (i_ < dataset()->count_) {
           TF_RETURN_IF_ERROR(
               input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -110,6 +136,31 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
      private:
       mutex mu_;
       int64 i_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 6882a8a0e5fc49809468d73a1f828403092a96fb..90b71e370c474f8d7a94a47278601fdb7f3dabe0 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -138,8 +138,9 @@ class TensorArray : public ResourceBase {
   // users to construct this many Tensors for storage in a TensorArray.
   TensorArray(const string& key, const DataType& dtype, const Tensor& handle,
               int32 N, const PartialTensorShape& element_shape,
-              bool dynamic_size, bool multiple_writes_aggregate, bool is_grad,
-              int32 marked_size, bool clear_after_read)
+              bool identical_element_shapes, bool dynamic_size,
+              bool multiple_writes_aggregate, bool is_grad, int32 marked_size,
+              bool clear_after_read)
       : key_(key),
         dtype_(dtype),
         handle_(handle),
@@ -151,6 +152,7 @@ class TensorArray : public ResourceBase {
         is_grad_(is_grad),
         marked_size_(marked_size),
         element_shape_(element_shape),
+        identical_element_shapes_(identical_element_shapes),
         tensors_(N) {}
 
   // Write PersistentTensor 'value' to index 'index'.
@@ -320,6 +322,8 @@ class TensorArray : public ResourceBase {
     return !gradients_disallowed_;
   }
 
+  bool HasIdenticalElementShapes() const { return identical_element_shapes_; }
+
   // Copy the TensorShapes from another TensorArray into this one.
   // The sizes of the two TensorArrays must match and this one
   // may not have any entries filled in.  This performs a "soft copy",
@@ -379,7 +383,7 @@ class TensorArray : public ResourceBase {
 
   // Multiple writes to the same index will result in summation of the
   // values (used by backprop)
-  bool multiple_writes_aggregate_;
+  const bool multiple_writes_aggregate_;
 
   // If multiple Writes were attempted (e.g. via attribute
   // multiple_writes_aggregate), then gradients are disallowed.
@@ -387,10 +391,10 @@ class TensorArray : public ResourceBase {
 
   // After a read at an index, clear away its PersistentTensor to
   // release memory.
-  bool clear_after_read_;
+  const bool clear_after_read_;
 
   // True iff this is a gradient tensor array.
-  bool is_grad_;
+  const bool is_grad_;
 
   // The size of the TensorArray after a (legacy) unpack or split is performed.
   // -1 if there has been no unpack or split performed on the TensorArray.
@@ -400,6 +404,13 @@ class TensorArray : public ResourceBase {
   // known at all.
   PartialTensorShape element_shape_ GUARDED_BY(mu_);
 
+  // Whether all elements in the TensorArray have identical shapes.
+  // This allows certain behaviors, like dynamically checking for
+  // consistent shapes on write, and being able to fill in properly
+  // shaped zero tensors on stack -- even if the initial element_shape
+  // was not fully defined.
+  const bool identical_element_shapes_;
+
   // TensorAndState is used to keep track of the PersistentTensors
   // stored in the TensorArray, along with their shapes, and a boolean
   // that determines whether they have already been read or not.
@@ -463,6 +474,8 @@ Status TensorArray::LockedWriteOrAggregate(OpKernelContext* ctx,
         " which is incompatible with the TensorArray's inferred element "
         "shape: ",
         element_shape_.DebugString(), " (consider setting infer_shape=False).");
+  } else if (identical_element_shapes_ && !element_shape_.IsFullyDefined()) {
+    element_shape_ = PartialTensorShape(value_t->shape().dim_sizes());
   }
 
   if (t.read) {
@@ -537,30 +550,33 @@ Status TensorArray::LockedRead(OpKernelContext* ctx, const int32 index,
                                    " but array size is: ", tensors_.size());
   }
   size_t index_t = static_cast<size_t>(index);
-  if (is_grad_ && (index_t >= tensors_.size() || !tensors_[index].written)) {
+  if ((is_grad_ && (index_t >= tensors_.size() || !tensors_[index].written)) ||
+      (!is_grad_ && (index_t < tensors_.size() && !tensors_[index].written))) {
     // Special case returning zeros if this is a gradient read that happens
     // after a stop_gradients call with dynamic forward TensorArrays.
     // There is sometimes a race condition where the gradient is not
     // written due to stop_gradients, but is later read.
     TensorShape element_shape;
-    if (index_t < tensors_.size() && tensors_[index].shape.dims() > 0) {
+    if (is_grad_ && index_t < tensors_.size() &&
+        tensors_[index].shape.dims() > 0) {
+      // A gradient TensorArray has more specific gradient information
+      // available for each entry.  A forward TensorArray must rely on
+      // the global element_shape_ to fill in zeros on read.
       element_shape = tensors_[index].shape;
     } else if (!element_shape_.IsFullyDefined()) {
       return errors::InvalidArgument(
           "TensorArray ", handle_.vec<string>()(1),
-          ": Could not read from gradient TensorArray index ", index,
+          ": Could not read from TensorArray index ", index,
           ".  Furthermore, the element shape is not fully defined: ",
           element_shape_.DebugString(),
-          ".  "
-          "It is likely you are working with a resizeable TensorArray and "
-          "stop_gradients "
-          "is not allowing the gradients to be written.  If you set the full "
-          "element_shape "
-          "property on the forward TensorArray, the proper all-zeros tensor "
-          "will be "
-          "returned instead of incurring this error.");
+          ".  It is possible you are working with a resizeable TensorArray and "
+          "stop_gradients is not allowing the gradients to be written.  If you "
+          "set the full "
+          "element_shape property on the forward TensorArray, the proper "
+          "all-zeros tensor "
+          "will be returned instead of incurring this error.");
     } else {
-      DCHECK(element_shape_.AsTensorShape(&element_shape));
+      element_shape_.AsTensorShape(&element_shape);  // Always succeeds.
     }
     if (index_t >= tensors_.size()) {
       // Fill in tensors_ up to index to have known shape.
@@ -578,13 +594,6 @@ Status TensorArray::LockedRead(OpKernelContext* ctx, const int32 index,
 
   TensorAndState& t = tensors_[index];
 
-  if (!t.written) {
-    return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
-                                   ": Could not read from TensorArray index ",
-                                   index,
-                                   " because it has not yet been written to.");
-  }
-
   if (t.cleared) {
     return errors::InvalidArgument("TensorArray ", handle_.vec<string>()(1),
                                    ": Could not read index ", index,
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index 2191e4e8c5fccdaa6ad769e444b7568616c84e8e..cca6d0e35f2ee11d2a97f68581dd6f8dc87d929d 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -162,6 +162,14 @@ class TensorArrayOp : public TensorArrayCreationOp {
     OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
     OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
     OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_));
+    // The HasAttr check is for backwards compatibility with older op
+    // versions which do not have this attribute.
+    if (context->HasAttr("identical_element_shapes")) {
+      OP_REQUIRES_OK(context, context->GetAttr("identical_element_shapes",
+                                               &identical_element_shapes_));
+    } else {
+      identical_element_shapes_ = false;
+    }
     OP_REQUIRES_OK(context,
                    context->GetAttr("clear_after_read", &clear_after_read_));
     OP_REQUIRES_OK(context,
@@ -196,8 +204,9 @@ class TensorArrayOp : public TensorArrayCreationOp {
 
     TensorArray* tensor_array = new TensorArray(
         key, dtype_, *tensor_array_output_handle, size, element_shape_,
-        dynamic_size_, false /* multiple_writes_aggregate */,
-        false /* is_grad */, -1 /* marked_size */, clear_after_read_);
+        identical_element_shapes_, dynamic_size_,
+        false /* multiple_writes_aggregate */, false /* is_grad */,
+        -1 /* marked_size */, clear_after_read_);
 
     TF_RETURN_IF_ERROR(
         rm->Create(ctx->step_container()->name(), key, tensor_array));
@@ -210,6 +219,7 @@ class TensorArrayOp : public TensorArrayCreationOp {
  private:
   DataType dtype_;
   PartialTensorShape element_shape_;
+  bool identical_element_shapes_;
   bool dynamic_size_;
   bool clear_after_read_;
   string tensor_array_name_;  // The name used to create the TensorArray.
@@ -322,7 +332,8 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
                     output_handle](TensorArray** ret) -> Status {
       *ret = new TensorArray(
           key, tensor_array->ElemType(), *tensor_array_output_handle,
-          array_size, tensor_array->ElemShape(), false /* dynamic_size */,
+          array_size, tensor_array->ElemShape(),
+          tensor_array->HasIdenticalElementShapes(), false /* dynamic_size */,
           true /* multiple_writes_aggregate */, true /* is_grad */,
           marked_size /* marked_size */, true /* close_after_read */);
       TF_RETURN_IF_ERROR((*ret)->CopyShapesFrom(tensor_array));
@@ -1003,8 +1014,9 @@ class TensorArrayUnpackOrScatterOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
     TensorShape element_shape(tensor_value->shape());
 
-    OP_REQUIRES(ctx, FastBoundsCheck(element_shape.dim_size(0),
-                                     std::numeric_limits<int32>::max()),
+    OP_REQUIRES(ctx,
+                FastBoundsCheck(element_shape.dim_size(0),
+                                std::numeric_limits<int32>::max()),
                 errors::InvalidArgument("tensor dim0 too large to unpack"));
 
     OP_REQUIRES(
@@ -1204,8 +1216,9 @@ class TensorArraySplitOp : public OpKernel {
                 errors::InvalidArgument(
                     "Expected lengths to be a vector, received shape: ",
                     tensor_lengths->shape().DebugString()));
-    OP_REQUIRES(ctx, FastBoundsCheck(tensor_lengths->NumElements(),
-                                     std::numeric_limits<int32>::max()),
+    OP_REQUIRES(ctx,
+                FastBoundsCheck(tensor_lengths->NumElements(),
+                                std::numeric_limits<int32>::max()),
                 errors::InvalidArgument(
                     "Expected lengths to have < max int32 entries"));
 
diff --git a/tensorflow/core/kernels/tensor_dataset_op.cc b/tensorflow/core/kernels/tensor_dataset_op.cc
index db7c94732873d88c343e52036a91c3da0f549f81..5cf99311885df24a8d7adf0ef566cd1df28fdee5 100644
--- a/tensorflow/core/kernels/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/tensor_dataset_op.cc
@@ -70,15 +70,17 @@ class TensorDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> components;
+      std::vector<Node*> components;
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
         TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
         components.emplace_back(node);
       }
-      TF_RETURN_IF_ERROR(
-          b->AddDatasetWithInputAsList(this, components, output));
+      AttrValue dtypes;
+      b->BuildAttrValue(dtypes_, &dtypes);
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {}, {{0, components}},
+                                       {{"Toutput_types", dtypes}}, output));
       return Status::OK();
     }
 
diff --git a/tensorflow/core/kernels/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/tensor_slice_dataset_op.cc
index fd36bf524ce2570c2af94d4daafea7d0f2ad189a..86f8f436d42dd1d6a7c3a90cb16a931de1f8d478 100644
--- a/tensorflow/core/kernels/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/tensor_slice_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/batch_util.h"
 
 namespace tensorflow {
 
@@ -86,54 +87,21 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      std::vector<NodeBuilder::NodeOut> components;
+      std::vector<Node*> components;
       components.reserve(tensors_.size());
       for (const Tensor& t : tensors_) {
         Node* node;
         TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
         components.emplace_back(node);
       }
-      TF_RETURN_IF_ERROR(
-          b->AddDatasetWithInputAsList(this, components, output));
+      AttrValue dtypes;
+      b->BuildAttrValue(dtypes_, &dtypes);
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {}, {{0, components}},
+                                       {{"Toutput_types", dtypes}}, output));
       return Status::OK();
     }
 
    private:
-    template <typename T>
-    static Status HandleSliceToElement(const Tensor& parent, Tensor* element,
-                                       int64 index) {
-      DCHECK_NE(parent.dim_size(0), 0);
-      DCHECK_GE(index, 0);
-      if (element->NumElements() !=
-          (parent.NumElements() / parent.dim_size(0))) {
-        TensorShape chip_shape = parent.shape();
-        chip_shape.RemoveDim(0);
-        return errors::Internal(
-            "HandleSliceToElement Cannot copy slice: number of elements does "
-            "not match.  Shapes are: [element]: ",
-            element->shape().DebugString(), ", [parent slice]: ",
-            chip_shape.DebugString());
-      }
-      auto parent_as_matrix = parent.flat_outer_dims<T>();
-      element->flat<T>() = parent_as_matrix.chip(index, 0);
-      return Status::OK();
-    }
-
-    static Status CopySliceToElement(const Tensor& parent, Tensor* element,
-                                     int64 index) {
-#define HANDLE_TYPE(T)                                      \
-  case DataTypeToEnum<T>::value: {                          \
-    return HandleSliceToElement<T>(parent, element, index); \
-  }
-
-      switch (parent.dtype()) {
-        TF_CALL_DATASET_TYPES(HANDLE_TYPE);
-        default:
-          return errors::Unimplemented(
-              "CopySliceToElement Unhandled data type: ", element->dtype());
-      }
-    }
-
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
@@ -152,7 +120,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
             const Tensor& t = dataset()->tensors_[i];
             Tensor t_slice(cpu_allocator(), t.dtype(),
                            TensorShape(dataset()->shapes_[i].dim_sizes()));
-            TF_RETURN_IF_ERROR(CopySliceToElement(t, &t_slice, i_));
+            TF_RETURN_IF_ERROR(batch_util::CopySliceToElement(t, &t_slice, i_));
             out_tensors->emplace_back(std::move(t_slice));
           }
           ++i_;
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
index 5a36e7567beb16e447de28d3cf930fbd29f6c078..84a5060fc3cd17c09b905d606dba62bbaa7f1373 100644
--- a/tensorflow/core/kernels/tile_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu.cu.cc
@@ -90,6 +90,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct Tile<GPUDevice, T, int32>; \
   template struct Tile<GPUDevice, T, int64>;
 
+TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_int16(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
 TF_CALL_int64(DEFINE_TYPE);
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index fa5afe6a31b0c660151070f5cd2e1d5be280adc5..68cdae3249a070caeb77ce944be2c32791e4245c 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -222,6 +222,7 @@ TF_CALL_complex128(HANDLE_TYPE_NAME_CPU);
 TF_CALL_string(HANDLE_TYPE_NAME_CPU);
 
 #if GOOGLE_CUDA
+TF_CALL_bool(HANDLE_TYPE_NAME_GPU);
 TF_CALL_float(HANDLE_TYPE_NAME_GPU);
 TF_CALL_double(HANDLE_TYPE_NAME_GPU);
 TF_CALL_int16(HANDLE_TYPE_NAME_GPU);
@@ -534,7 +535,7 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                         TileGradientOp<CPUDevice, int64>);
 
 #if GOOGLE_CUDA
-#define REGISTER_GPU(type)                                         \
+#define REGISTER_GPU_TILE(type)                                    \
   REGISTER_KERNEL_BUILDER(Name("Tile")                             \
                               .Device(DEVICE_GPU)                  \
                               .TypeConstraint<type>("T")           \
@@ -546,7 +547,9 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                               .TypeConstraint<type>("T")           \
                               .TypeConstraint<int64>("Tmultiples") \
                               .HostMemory("multiples"),            \
-                          TileOp<GPUDevice, int64>);               \
+                          TileOp<GPUDevice, int64>);
+
+#define REGISTER_GPU_TILE_GRAD(type)                               \
   REGISTER_KERNEL_BUILDER(Name("TileGrad")                         \
                               .Device(DEVICE_GPU)                  \
                               .TypeConstraint<type>("T")           \
@@ -560,6 +563,11 @@ REGISTER_KERNEL_BUILDER(Name("TileGrad")
                               .HostMemory("multiples"),            \
                           TileGradientOp<GPUDevice, int64>);
 
+#define REGISTER_GPU(type) \
+  REGISTER_GPU_TILE(type); \
+  REGISTER_GPU_TILE_GRAD(type);
+
+TF_CALL_bool(REGISTER_GPU_TILE);
 TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 TF_CALL_half(REGISTER_GPU);
@@ -568,6 +576,8 @@ TF_CALL_int32(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU)
 
+#undef REGISTER_GPU_TILE
+#undef REGISTER_GPU_TILE_GRAD
 #undef REGISTER_GPU
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 98dfa5a3dd8ee02c077d6924ca19e90838c42074..38e77ab60fb7126bcdedc09bfe9e2ec7de88c0ad 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -15,12 +15,13 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "tensorflow/core/kernels/training_ops.h"
 #include <algorithm>
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 #ifdef TENSORFLOW_USE_SYCL
@@ -75,9 +76,9 @@ struct ApplyAdadelta<CPUDevice, T> {
         accum * rho() + grad.square() * (static_cast<T>(1) - rho());
     const auto update =
         (accum_update + epsilon()).sqrt() * (accum + epsilon()).rsqrt() * grad;
+    var.device(d) -= update * lr();
     accum_update.device(d) =
         accum_update * rho() + update.square() * (static_cast<T>(1) - rho());
-    var.device(d) -= update * lr();
   }
 };
 
@@ -361,6 +362,37 @@ struct ApplyCenteredRMSProp<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAddSign<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    auto sign_gm = grad.sign() * m.sign();
+    var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
+  }
+};
+
+template <typename T>
+struct ApplyPowerSign<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar logbase,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    auto sign_gm = grad.sign() * m.sign();
+    auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
+    var.device(d) -= lr() * grad_scale * grad;
+  }
+};
+
 }  // namespace functor
 
 template <typename Device, typename T>
@@ -504,8 +536,9 @@ class ApplyAdadeltaOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    if (use_exclusive_lock_) {
-      mutex_lock l1(*GetTrainingVariableMutex(ctx, 0));
+    mutex* mu = GetTrainingVariableMutex(ctx, 0);
+    if (use_exclusive_lock_ && mu != nullptr) {
+      mutex_lock l1(*mu);
       // Don't try to acquire a lock on the second ref as they share the same
       // mutex.
       //
@@ -650,15 +683,21 @@ class SparseApplyAdadeltaOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
   }
 
-  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    mutex* mu_var = GetTrainingVariableMutex(ctx, 0);
+  void Compute(OpKernelContext* ctx) override {
+    mutex* mu = GetTrainingVariableMutex(ctx, 0);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
     //
     // mutex* mu_accum = ctx->input_ref_mutex(1);
-    if (use_exclusive_lock_) {
-      mu_var->lock();
+    if (use_exclusive_lock_ && mu != nullptr) {
+      mutex_lock ml(*mu);
+      DoCompute(ctx);
+    } else {
+      DoCompute(ctx);
     }
+  }
+
+  void DoCompute(OpKernelContext* ctx) {
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
                             ctx, 0, use_exclusive_lock_, true, &var));
@@ -752,16 +791,13 @@ class SparseApplyAdadeltaOp : public OpKernel {
         const auto update =
             (accum_update_ + accum_update_.constant(epsilon_scalar)).sqrt() *
             (accum_ + accum_.constant(epsilon_scalar)).rsqrt() * grad_;
+        auto v = var_flat.template chip<0>(index);
+        v -= update * update.constant(lr_scalar);
         accum_update_ =
             accum_update_ * accum_update_.constant(rho_scalar) +
             update.square() * update.constant(static_cast<T>(1) - rho_scalar);
-        auto v = var_flat.template chip<0>(index);
-        v -= update * update.constant(lr_scalar);
       }
     }
-    if (use_exclusive_lock_) {
-      mu_var->unlock();
-    }
 
     MaybeForwardRefInputToRefOutput(ctx, 0, 0);
   }
@@ -3243,4 +3279,220 @@ REGISTER_KERNELS(double, int64);
 
 #undef REGISTER_KERNELS
 
+
+template <typename Device, typename T>
+class ApplyAddSignOp : public OpKernel {
+ public:
+  explicit ApplyAddSignOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& alpha = ctx->input(3);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha.shape()),
+                errors::InvalidArgument("alpha is not a scalar: ",
+                                        alpha.shape().DebugString()));
+    const Tensor& sign_decay = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(alpha.shape()),
+                errors::InvalidArgument("sign_decay is not a scalar: ",
+                                        sign_decay.shape().DebugString()));
+    const Tensor& beta = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta.shape()),
+                errors::InvalidArgument("beta is not a scalar: ",
+                                        beta.shape().DebugString()));
+    const Tensor& grad = ctx->input(6);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAddSign<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), lr.scalar<T>(), alpha.scalar<T>(),
+        sign_decay.scalar<T>(), beta.scalar<T>(), grad.flat<T>());
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                        \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("ApplyAddSign").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyAddSignOp<D##Device, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAddSign")                \
+                              .Device(DEVICE_##D)                     \
+                              .HostMemory("var")                      \
+                              .HostMemory("m")                        \
+                              .TypeConstraint<T>("T"),                \
+                          ApplyAddSignOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyAddSign<GPUDevice, T>::operator()(                            \
+      const GPUDevice& d,                                                 \
+      typename TTypes<T>::Flat var,                                       \
+      typename TTypes<T>::Flat m,                                         \
+      typename TTypes<T>::ConstScalar lr,                                 \
+      typename TTypes<T>::ConstScalar alpha,                              \
+      typename TTypes<T>::ConstScalar sign_decay,                         \
+      typename TTypes<T>::ConstScalar beta,                               \
+      typename TTypes<T>::ConstFlat grad);                                \
+  extern template struct ApplyAddSign<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+
+template <typename Device, typename T>
+class ApplyPowerSignOp : public OpKernel {
+ public:
+  explicit ApplyPowerSignOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& logbase = ctx->input(3);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(logbase.shape()),
+                errors::InvalidArgument("logbase is not a scalar: ",
+                                        logbase.shape().DebugString()));
+    const Tensor& sign_decay = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(logbase.shape()),
+                errors::InvalidArgument("sign_decay is not a scalar: ",
+                                        sign_decay.shape().DebugString()));
+    const Tensor& beta = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta.shape()),
+                errors::InvalidArgument("beta is not a scalar: ",
+                                        beta.shape().DebugString()));
+    const Tensor& grad = ctx->input(6);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyPowerSign<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), lr.scalar<T>(), logbase.scalar<T>(),
+        sign_decay.scalar<T>(), beta.scalar<T>(), grad.flat<T>());
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                          \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ApplyPowerSign").Device(DEVICE_##D).TypeConstraint<T>("T"), \
+      ApplyPowerSignOp<D##Device, T>);                                  \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyPowerSign")                \
+                              .Device(DEVICE_##D)                       \
+                              .HostMemory("var")                        \
+                              .HostMemory("m")                          \
+                              .TypeConstraint<T>("T"),                  \
+                          ApplyPowerSignOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyPowerSign<GPUDevice, T>::operator()(                          \
+      const GPUDevice& d,                                                 \
+      typename TTypes<T>::Flat var,                                       \
+      typename TTypes<T>::Flat m,                                         \
+      typename TTypes<T>::ConstScalar lr,                                 \
+      typename TTypes<T>::ConstScalar logbase,                            \
+      typename TTypes<T>::ConstScalar sign_decay,                         \
+      typename TTypes<T>::ConstScalar beta,                               \
+      typename TTypes<T>::ConstFlat grad);                                \
+  extern template struct ApplyPowerSign<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index 99a714e0a27cd66b3e53ab732fd1c8929b91e106..7ee956053abd320058963a8cc0bffa1fdc2e085c 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -161,6 +161,29 @@ struct ApplyCenteredRMSProp {
                   typename TTypes<T>::ConstScalar epsilon,
                   typename TTypes<T>::ConstFlat grad);
 };
+
+template <typename Device, typename T>
+struct ApplyAddSign {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
+template <typename Device, typename T>
+struct ApplyPowerSign {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar logbase,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 3678b96e98f49994089487a833c9a0b4d662041e..d443a6b3c1d0b548e915216adbc05549a66eaeda 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -70,11 +70,11 @@ struct ApplyAdadelta<GPUDevice, T> {
     const auto update =
         (accum_update + epsilon.reshape(single).broadcast(bcast)).sqrt() *
         (accum + epsilon.reshape(single).broadcast(bcast)).rsqrt() * grad;
+    var.device(d) -= update * lr.reshape(single).broadcast(bcast);
     accum_update.device(d) =
         accum_update * rho.reshape(single).broadcast(bcast) +
         update.square() *
             (grad.constant(T(1)) - rho.reshape(single).broadcast(bcast));
-    var.device(d) -= update * lr.reshape(single).broadcast(bcast);
   }
 };
 
@@ -193,6 +193,71 @@ struct ApplyCenteredRMSProp<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAddSign<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar alpha,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+
+    // The following is the GPU equivalent of the CPU version:
+    // m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    const auto one = static_cast<T>(1.0);
+    auto beta_bcast = beta.reshape(single).broadcast(bcast);
+    auto one_minus_beta =
+        (beta.constant(one) - beta).reshape(single).broadcast(bcast);
+    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+
+    // The following is the GPU equivalent of the CPU version:
+    // var.device(d) -= lr() * (alpha() + sign_decay() * sign_gm) * grad;
+    auto sign_gm = grad.sign() * m.sign();
+    auto lr_bcast = lr.reshape(single).broadcast(bcast);
+    auto alpha_bcast = alpha.reshape(single).broadcast(bcast);
+    auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
+    var.device(d) -=
+        lr_bcast * (alpha_bcast + sign_decay_bcast * sign_gm) * grad;
+  }
+};
+
+template <typename T>
+struct ApplyPowerSign<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar logbase,
+                  typename TTypes<T>::ConstScalar sign_decay,
+                  typename TTypes<T>::ConstScalar beta,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+
+    // The following is the GPU equivalent of the CPU version:
+    // m.device(d) = m * beta() + grad * (static_cast<T>(1) - beta());
+    const auto one = static_cast<T>(1.0);
+    auto beta_bcast = beta.reshape(single).broadcast(bcast);
+    auto one_minus_beta =
+        (beta.constant(one) - beta).reshape(single).broadcast(bcast);
+    m.device(d) =  m * beta_bcast + grad * one_minus_beta;
+
+    // The following is the GPU equivalent of the CPU version:
+    // auto grad_scale = (logbase() * sign_decay() * sign_gm).exp();
+    // var.device(d) -= lr() * grad_scale * grad;
+    auto sign_gm = grad.sign() * m.sign();
+    auto lr_bcast = lr.reshape(single).broadcast(bcast);
+    auto logbase_bcast = logbase.reshape(single).broadcast(bcast);
+    auto sign_decay_bcast = sign_decay.reshape(single).broadcast(bcast);
+    auto grad_scale =  (logbase_bcast * sign_decay_bcast * sign_gm).exp();
+    var.device(d) -= lr_bcast * grad_scale * grad;
+  }
+};
+
 }  // namespace functor
 
 template struct functor::ApplyGradientDescent<GPUDevice, Eigen::half>;
@@ -222,6 +287,15 @@ template struct functor::ApplyRMSProp<GPUDevice, double>;
 template struct functor::ApplyCenteredRMSProp<GPUDevice, Eigen::half>;
 template struct functor::ApplyCenteredRMSProp<GPUDevice, float>;
 template struct functor::ApplyCenteredRMSProp<GPUDevice, double>;
+
+template struct functor::ApplyAddSign<GPUDevice, Eigen::half>;
+template struct functor::ApplyAddSign<GPUDevice, float>;
+template struct functor::ApplyAddSign<GPUDevice, double>;
+
+template struct functor::ApplyPowerSign<GPUDevice, Eigen::half>;
+template struct functor::ApplyPowerSign<GPUDevice, float>;
+template struct functor::ApplyPowerSign<GPUDevice, double>;
+
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index 4b1c9eb8bb6f06b5827dafa423f83a8e0184dcd6..ffa7f87c9efda0e3288b9fb06d0c9d1a3dcba277 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -233,4 +233,78 @@ static void BM_RMSProp(int iters, int params) {
 }
 BENCHMARK(BM_RMSProp)->Arg(128 << 10)->Arg(256 << 10);
 
+static void AddSign(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, m, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto lr = Scalar(g, 0.01);
+    auto alpha = Scalar(g, 0.1);
+    auto sign_decay = Scalar(g, 0.9);
+    auto beta = Scalar(g, 0.8);
+    auto grad = Random(g, n);
+    test::graph::Multi(g, "ApplyAddSign",
+                       {var, m, lr, alpha, sign_decay, beta, grad});
+    *train_g = g;
+  }
+}
+
+static void BM_AddSign(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  AddSign(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_AddSign)->Arg(128 << 10)->Arg(256 << 10);
+
+static void PowerSign(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, m, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto lr = Scalar(g, 0.01);
+    auto logbase = Scalar(g, 2);
+    auto sign_decay = Scalar(g, 0.9);
+    auto beta = Scalar(g, 0.8);
+    auto grad = Random(g, n);
+    test::graph::Multi(g, "ApplyPowerSign",
+                       {var, m, lr, logbase, sign_decay, beta, grad});
+    *train_g = g;
+  }
+}
+
+static void BM_PowerSign(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  PowerSign(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_PowerSign)->Arg(128 << 10)->Arg(256 << 10);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 20f0edf309a0a61f306ebc6321577830203f7764..96c051c636e54b671fec259d38218dcf7cc0837c 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -31,13 +31,14 @@ limitations under the License.
 
 namespace tensorflow {
 
-// inv = InvertPermutationOp(T<int32> p) takes a permutation of
+// inv = InvertPermutationOp(T<int32/int64> p) takes a permutation of
 // integers 0, 1, ..., n - 1 and returns the inverted
 // permutation of p. I.e., inv[p[i]] == i, for i in [0 .. n).
 //
-// REQUIRES: input is a vector of int32.
+// REQUIRES: input is a vector of int32 or int64.
 // REQUIRES: input is a permutation of 0, 1, ..., n-1.
 
+template <typename T>
 class InvertPermutationOp : public OpKernel {
  public:
   explicit InvertPermutationOp(OpKernelConstruction* context)
@@ -48,20 +49,19 @@ class InvertPermutationOp : public OpKernel {
     OP_REQUIRES(
         context, TensorShapeUtils::IsVector(input.shape()),
         errors::InvalidArgument("invert_permutation expects a 1D vector."));
-    auto Tin = input.vec<int32>();
+    auto Tin = input.vec<T>();
     OP_REQUIRES(context,
                 FastBoundsCheck(Tin.size(), std::numeric_limits<int32>::max()),
                 errors::InvalidArgument("permutation of nonnegative int32s "
                                         "must have <= int32 max elements"));
-    const int32 N =
-        static_cast<int32>(Tin.size());  // Safe: bounds-checked above.
+    const T N = static_cast<T>(Tin.size());  // Safe: bounds-checked above.
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
-    auto Tout = output->vec<int32>();
+    auto Tout = output->vec<T>();
     std::fill_n(Tout.data(), N, -1);
     for (int i = 0; i < N; ++i) {
-      const int32 d = internal::SubtleMustCopy(Tin(i));
+      const T d = internal::SubtleMustCopy(Tin(i));
       OP_REQUIRES(context, FastBoundsCheck(d, N),
                   errors::InvalidArgument(d, " is not between 0 and ", N));
       OP_REQUIRES(context, Tout(d) == -1,
@@ -73,14 +73,23 @@ class InvertPermutationOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(
     Name("InvertPermutation").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
-    InvertPermutationOp);
+    InvertPermutationOp<int32>);
+REGISTER_KERNEL_BUILDER(
+    Name("InvertPermutation").Device(DEVICE_CPU).TypeConstraint<int64>("T"),
+    InvertPermutationOp<int64>);
 
 REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                             .Device(DEVICE_GPU)
                             .TypeConstraint<int32>("T")
                             .HostMemory("x")
                             .HostMemory("y"),
-                        InvertPermutationOp);
+                        InvertPermutationOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("x")
+                            .HostMemory("y"),
+                        InvertPermutationOp<int64>);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
@@ -88,7 +97,13 @@ REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
                             .TypeConstraint<int32>("T")
                             .HostMemory("x")
                             .HostMemory("y"),
-                        InvertPermutationOp);
+                        InvertPermutationOp<int32>);
+REGISTER_KERNEL_BUILDER(Name("InvertPermutation")
+                            .Device(DEVICE_SYCL)
+                            .TypeConstraint<int64>("T")
+                            .HostMemory("x")
+                            .HostMemory("y"),
+                        InvertPermutationOp<int64>);
 #endif  // TENSORFLOW_USE_SYCL
 
 namespace {
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 701c5f6d2b3817e74e52ef41cecf177f6974254d..d087784c8a0bd2a53438af4582754b2d47620545 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <functional>
 #include <unordered_map>
 #include <utility>
 
@@ -21,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
 
 namespace tensorflow {
 
@@ -33,8 +35,6 @@ class UniqueOp : public OpKernel {
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
-                errors::InvalidArgument("unique expects a 1D vector."));
     // TODO(dga):  Make unique polymorphic for returning int32 and int64
     // vectors to support large tensors.
     OP_REQUIRES(context,
@@ -42,31 +42,102 @@ class UniqueOp : public OpKernel {
                 errors::InvalidArgument(
                     "unique does not support input tensors larger than ",
                     std::numeric_limits<int32>::max(), " elements"));
-    auto Tin = input.vec<T>();
-    const int64 N = static_cast<int64>(Tin.size());
+
+    int64 axis = 0;
+    std::vector<int64> new_sizes{1, input.NumElements(), 1};
+    if (context->num_inputs() == 1) {
+      OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
+                  errors::InvalidArgument("unique expects a 1D vector."));
+    } else {
+      // In case of UniqueV2, the axis is a 1D vector. The purpose is
+      // to allow specifying either "no axis" or "axis". The `[]` means
+      // "no axis", while `[x]` means `axis = x`.
+      const Tensor& axis_tensor = context->input(1);
+      OP_REQUIRES(context, TensorShapeUtils::IsVector(axis_tensor.shape()),
+                  errors::InvalidArgument("axis expects a 1D vector."));
+      OP_REQUIRES(
+          context, axis_tensor.NumElements() <= 1,
+          errors::InvalidArgument(
+              "axis does not support input tensors larger than 1 elements"));
+      if (axis_tensor.NumElements() == 0) {
+        OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()),
+                    errors::InvalidArgument("unique expects a 1D vector."));
+      } else {
+        auto axis_vec = axis_tensor.vec<int64>();
+        axis = axis_vec(0);
+        axis = axis < 0 ? axis + input.dims() : axis;
+        OP_REQUIRES(context, 0 <= axis && axis < input.dims(),
+                    errors::InvalidArgument("axis has to be between [0, ",
+                                            input.dims(), ")"));
+        if (axis > 0) {
+          for (int64 i = 0; i < axis; i++) {
+            new_sizes[0] *= input.dim_size(i);
+          }
+        }
+        new_sizes[1] = input.dim_size(axis);
+        if (axis + 1 < input.dims()) {
+          for (int64 i = axis + 1; i < input.dims(); i++) {
+            new_sizes[2] *= input.dim_size(i);
+          }
+        }
+      }
+    }
+
+    auto Tin = input.shaped<T, 3>(new_sizes);
 
     Tensor* idx = nullptr;
-    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                {0}, 1, input.shape(), &idx));
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                1, TensorShape({Tin.dimension(1)}), &idx));
     auto idx_vec = idx->template vec<TIndex>();
 
-    std::unordered_map<T, TIndex> uniq;
-    uniq.reserve(2 * N);
-    for (int64 i = 0, j = 0; i < N; ++i) {
-      auto it = uniq.insert(std::make_pair(Tin(i), j));
+    auto hash_fn = [&Tin](const int64& key) -> unsigned long {
+      size_t h = 0;
+      for (int64 i = 0; i < Tin.dimension(0); i++) {
+        for (int64 j = 0; j < Tin.dimension(2); j++) {
+          h = Hash64Combine(h, hash<T>{}(Tin(i, key, j)));
+        }
+      }
+      return h;
+    };
+
+    auto equal_to_fn = [&Tin](const int64& lhs, const int64& rhs) {
+      for (int64 i = 0; i < Tin.dimension(0); i++) {
+        for (int64 j = 0; j < Tin.dimension(2); j++) {
+          if (Tin(i, lhs, j) != Tin(i, rhs, j)) {
+            return false;
+          }
+        }
+      }
+      return true;
+    };
+
+    std::unordered_map<int64, int64, decltype(hash_fn), decltype(equal_to_fn)>
+        uniq(0, hash_fn, equal_to_fn);
+
+    uniq.reserve(2 * Tin.dimension(1));
+
+    for (int64 i = 0, j = 0; i < Tin.dimension(1); ++i) {
+      auto it = uniq.insert(std::make_pair(i, j));
       idx_vec(i) = it.first->second;
       if (it.second) {
         ++j;
       }
     }
+
     int64 uniq_size = static_cast<int64>(uniq.size());
+    new_sizes[1] = uniq_size;
+    TensorShape output_shape(input.shape());
+    output_shape.set_dim(axis, uniq_size);
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                0, TensorShape({uniq_size}), &output));
-    auto output_vec = output->template vec<T>();
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto Tout = output->shaped<T, 3>(new_sizes);
 
     for (auto it : uniq) {
-      output_vec(it.second) = it.first;
+      for (int64 i = 0; i < Tin.dimension(0); i++) {
+        for (int64 j = 0; j < Tin.dimension(2); j++) {
+          Tout(i, it.second, j) = Tin(i, it.first, j);
+        }
+      }
     }
 
     if (num_outputs() > 2) {
@@ -74,7 +145,7 @@ class UniqueOp : public OpKernel {
                                   2, TensorShape({uniq_size}), &output));
       auto count_output_vec = output->template vec<TIndex>();
       count_output_vec.setZero();
-      for (int64 i = 0; i < N; ++i) {
+      for (int64 i = 0; i < Tin.dimension(1); ++i) {
         count_output_vec(idx_vec(i))++;
       }
     }
@@ -92,6 +163,16 @@ class UniqueOp : public OpKernel {
                               .TypeConstraint<type>("T")         \
                               .TypeConstraint<int64>("out_idx"), \
                           UniqueOp<type, int64>);                \
+  REGISTER_KERNEL_BUILDER(Name("UniqueV2")                       \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int32>("out_idx"), \
+                          UniqueOp<type, int32>);                \
+  REGISTER_KERNEL_BUILDER(Name("UniqueV2")                       \
+                              .Device(DEVICE_CPU)                \
+                              .TypeConstraint<type>("T")         \
+                              .TypeConstraint<int64>("out_idx"), \
+                          UniqueOp<type, int64>);                \
   REGISTER_KERNEL_BUILDER(Name("UniqueWithCounts")               \
                               .Device(DEVICE_CPU)                \
                               .TypeConstraint<type>("T")         \
@@ -176,5 +257,5 @@ REGISTER_KERNEL_BUILDER(Name("Unique")
                             .HostMemory("y")
                             .HostMemory("idx"),
                         UniqueOp<int64, int64>);
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 36b8ff09d7381a0b8bbb8b6f8d71b14e47fa4663..1b7079dcbae34de683951979cbf692d954a966ee 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -23,6 +23,177 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Resource stored by variables in the resource manager
+// (legacy, ref-style version).
+class LegacyVar : public ResourceBase {
+ public:
+  explicit LegacyVar(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  LegacyVar(const LegacyVar&) = delete;
+  LegacyVar& operator=(const LegacyVar&) = delete;
+
+  mutex* mu() { return &mu_; }
+  Tensor* tensor() { return &tensor_; }
+
+  string DebugString() override {
+    return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
+                           tensor_.shape().DebugString());
+  }
+
+ private:
+  mutex mu_;
+  Tensor tensor_;
+
+  ~LegacyVar() override {}
+};
+
+VariableOp::VariableOp(OpKernelConstruction* context) : OpKernel(context) {
+  OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+  dtype_ = RemoveRefType(context->output_type(0));
+}
+
+void VariableOp::Compute(OpKernelContext* ctx) {
+  mutex_lock l(init_mu_);
+  if (!initialized_) {
+    OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                    true /* use name() */));
+    initialized_ = true;
+  }
+  auto creator = [this](LegacyVar** var) {
+    *var = new LegacyVar(dtype_);
+    (*var)->tensor()->set_shape(shape_);
+    return Status::OK();
+  };
+  LegacyVar* var;
+  OP_REQUIRES_OK(ctx, cinfo_.resource_manager()->LookupOrCreate<LegacyVar>(
+                          cinfo_.container(), cinfo_.name(), &var, creator));
+  // Output a reference to our tensor, so it may be updated.
+  //
+  // As long as the resource manager hasn't been cleared the ref we return
+  // here is valid because it owns a ref on var.
+  ctx->set_output_ref(0, var->mu(), var->tensor());
+  if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+    if (ctx->allocate_on_host(attr)) {
+      ctx->record_host_persistent_memory_allocation(
+          var->tensor()->AllocatedBytes());
+    } else {
+      ctx->record_device_persistent_memory_allocation(
+          var->tensor()->AllocatedBytes());
+    }
+  }
+  var->Unref();
+}
+
+class TemporaryVariableOp : public OpKernel {
+ public:
+  explicit TemporaryVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+    // Variable name defaults to op name if not specified explicitly.
+    if (var_name_.empty()) var_name_ = name();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    Status s;
+    ResourceMgr* rm = context->resource_manager();
+    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+    auto* tmp_var = new TmpVar;
+    OP_REQUIRES(context, tmp_var,
+                errors::ResourceExhausted("Could not allocate TmpVar."));
+    tmp_var->name = var_name_;
+    s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
+    if (!s.ok()) tmp_var->Unref();
+    OP_REQUIRES_OK(context, s);
+    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
+                                       var_name_, tmp_var));
+    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
+    if (context->track_allocations()) {
+      AllocatorAttributes attr;
+      if (context->allocate_on_host(attr)) {
+        context->record_host_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      } else {
+        context->record_device_persistent_memory_allocation(
+            tmp_var->val.AllocatedBytes());
+      }
+    }
+  }
+
+ private:
+  // Refcounted temporary variable resource.
+  friend class DestroyTemporaryVariableOp;
+  struct TmpVar : public ResourceBase {
+    mutex mu;
+    Tensor val;
+    string name;
+    string DebugString() override { return name; }
+    ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
+  };
+
+  TensorShape shape_;
+  DataType dtype_;
+  string var_name_;
+};
+
+class DestroyTemporaryVariableOp : public OpKernel {
+ public:
+  explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES(context, IsRefType(context->input_type(0)),
+                errors::InvalidArgument("lhs input needs to be a ref type"));
+    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
+    OP_REQUIRES(context, !var_name_.empty(),
+                errors::InvalidArgument("Missing var_name attribute"));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
+    // their execution before this DestroyTemporaryVariable op executes.
+    // This is typically achieved using control dependencies.
+    CHECK(IsRefType(context->input_dtype(0)));
+    Tensor tmpvar = context->mutable_input(0, false);
+    context->set_output(0, tmpvar);
+    ResourceMgr* rm = context->resource_manager();
+    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
+    OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
+                                context->step_container()->name(), var_name_));
+    if (context->track_allocations()) {
+      if (context->allocate_on_host(AllocatorAttributes())) {
+        context->record_host_persistent_memory_allocation(
+            -static_cast<int64>(tmpvar.AllocatedBytes()));
+      } else {
+        context->record_device_persistent_memory_allocation(
+            -static_cast<int64>(tmpvar.AllocatedBytes()));
+      }
+    }
+  }
+
+ private:
+  string var_name_;
+};
+
+class IsVariableInitializedOp : public OpKernel {
+ public:
+  explicit IsVariableInitializedOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get a mutable input tensor of the Ref input.
+    const Tensor& input_tensor = context->mutable_input(0, false);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+    auto output_tensor = output->tensor<bool, 0>();
+    bool result = input_tensor.IsInitialized();
+    output_tensor() = result;
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("Variable").Device(DEVICE_CPU), VariableOp);
 REGISTER_KERNEL_BUILDER(Name("VariableV2").Device(DEVICE_CPU), VariableOp);
 REGISTER_KERNEL_BUILDER(Name("TemporaryVariable").Device(DEVICE_CPU),
@@ -33,30 +204,30 @@ REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized").Device(DEVICE_CPU),
                         IsVariableInitializedOp);
 
 #ifdef TENSORFLOW_USE_SYCL
-#define REGISTER_SYCL_KERNEL(type)                                         \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("Variable").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),  \
-      VariableOp);                                                         \
-  REGISTER_KERNEL_BUILDER(                                                 \
-      Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),\
-      VariableOp);                                                         \
-  REGISTER_KERNEL_BUILDER(Name("TemporaryVariable")                        \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("dtype"),              \
-                          TemporaryVariableOp);                            \
-  REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable")                 \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("T"),                  \
-                          DestroyTemporaryVariableOp);                     \
-  REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized")                    \
-                              .Device(DEVICE_SYCL)                         \
-                              .TypeConstraint<type>("dtype")               \
-                              .HostMemory("is_initialized"),               \
+#define REGISTER_SYCL_KERNEL(type)                                          \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("Variable").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"),   \
+      VariableOp);                                                          \
+  REGISTER_KERNEL_BUILDER(                                                  \
+      Name("VariableV2").Device(DEVICE_SYCL).TypeConstraint<type>("dtype"), \
+      VariableOp);                                                          \
+  REGISTER_KERNEL_BUILDER(Name("TemporaryVariable")                         \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("dtype"),               \
+                          TemporaryVariableOp);                             \
+  REGISTER_KERNEL_BUILDER(Name("DestroyTemporaryVariable")                  \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("T"),                   \
+                          DestroyTemporaryVariableOp);                      \
+  REGISTER_KERNEL_BUILDER(Name("IsVariableInitialized")                     \
+                              .Device(DEVICE_SYCL)                          \
+                              .TypeConstraint<type>("dtype")                \
+                              .HostMemory("is_initialized"),                \
                           IsVariableInitializedOp);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SYCL_KERNEL);
 #undef REGISTER_SYCL_KERNEL
-#endif // TENSORFLOW_USE_SYCL
+#endif  // TENSORFLOW_USE_SYCL
 
 #if GOOGLE_CUDA
 // Only register 'Variable' on GPU for the subset of types also supported by
diff --git a/tensorflow/core/kernels/variable_ops.h b/tensorflow/core/kernels/variable_ops.h
index 355140d44c5c53c8496d5bd2b3028e9ae9b3940b..83134bad378bfef18c3e93be5cc3c6b70ab4f523 100644
--- a/tensorflow/core/kernels/variable_ops.h
+++ b/tensorflow/core/kernels/variable_ops.h
@@ -27,10 +27,16 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager.
+// Resource stored by variables in the resource manager
+// (new, resource-style version).
 class Var : public ResourceBase {
  public:
   explicit Var(DataType dtype) : tensor_(dtype) {}
+  // Not copyable or movable.
+  Var(const Var&) = delete;
+  Var& operator=(const Var&) = delete;
+
+  // TODO(ebrevdo): Use LockSet instead of exposing mu.
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
@@ -44,52 +50,12 @@ class Var : public ResourceBase {
   Tensor tensor_;
 
   ~Var() override {}
-  TF_DISALLOW_COPY_AND_ASSIGN(Var);
 };
 
 class VariableOp : public OpKernel {
  public:
-  explicit VariableOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
-    dtype_ = RemoveRefType(context->output_type(0));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    mutex_lock l(init_mu_);
-    if (!initialized_) {
-      OP_REQUIRES_OK(
-          ctx,
-          cinfo_.Init(ctx->resource_manager(), def(), true /* use name() */));
-      initialized_ = true;
-    }
-    auto creator = [this](Var** var) {
-      *var = new Var(dtype_);
-      (*var)->tensor()->set_shape(shape_);
-      return Status::OK();
-    };
-    Var* var;
-    OP_REQUIRES_OK(ctx,
-                   cinfo_.resource_manager()->LookupOrCreate<Var>(
-                       cinfo_.container(), cinfo_.name(), &var, creator));
-    // Output a reference to our tensor, so it may be updated.
-    //
-    // As long as the resource manager hasn't been cleared the ref we return
-    // here is valid because it owns a ref on var.
-    ctx->set_output_ref(0, var->mu(), var->tensor());
-    if (ctx->track_allocations() && var->tensor()->IsInitialized()) {
-      AllocatorAttributes attr;
-      attr.set_gpu_compatible(true);
-      attr.set_nic_compatible(true);
-      if (ctx->allocate_on_host(attr)) {
-        ctx->record_host_persistent_memory_allocation(
-            var->tensor()->AllocatedBytes());
-      } else {
-        ctx->record_device_persistent_memory_allocation(
-            var->tensor()->AllocatedBytes());
-      }
-    }
-    var->Unref();
-  }
+  explicit VariableOp(OpKernelConstruction* context);
+  void Compute(OpKernelContext* ctx) override;
 
  private:
   DataType dtype_;
@@ -102,112 +68,6 @@ class VariableOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(VariableOp);
 };
 
-class TemporaryVariableOp : public OpKernel {
- public:
-  explicit TemporaryVariableOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("shape", &shape_));
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
-    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
-    // Variable name defaults to op name if not specified explicitly.
-    if (var_name_ == "") var_name_ = name();
-  }
-
-  void Compute(OpKernelContext* context) override {
-    Status s;
-    ResourceMgr* rm = context->resource_manager();
-    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
-    auto* tmp_var = new TmpVar;
-    OP_REQUIRES(context, tmp_var,
-                errors::ResourceExhausted("Could not allocate TmpVar."));
-    tmp_var->name = var_name_;
-    s = context->allocate_temp(dtype_, shape_, &tmp_var->val);
-    if (!s.ok()) tmp_var->Unref();
-    OP_REQUIRES_OK(context, s);
-    OP_REQUIRES_OK(context, rm->Create(context->step_container()->name(),
-                                       var_name_, tmp_var));
-    context->set_output_ref(0, &tmp_var->mu, &tmp_var->val);
-    if (context->track_allocations()) {
-      AllocatorAttributes attr;
-      if (context->allocate_on_host(attr)) {
-        context->record_host_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      } else {
-        context->record_device_persistent_memory_allocation(
-            tmp_var->val.AllocatedBytes());
-      }
-    }
-  }
-
- private:
-  // Refcounted temporary variable resource.
-  friend class DestroyTemporaryVariableOp;
-  struct TmpVar : public ResourceBase {
-    mutex mu;
-    Tensor val;
-    string name;
-    string DebugString() override { return name; }
-    ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
-  };
-
-  TensorShape shape_;
-  DataType dtype_;
-  string var_name_;
-};
-
-class DestroyTemporaryVariableOp : public OpKernel {
- public:
-  explicit DestroyTemporaryVariableOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    OP_REQUIRES(context, IsRefType(context->input_type(0)),
-                errors::InvalidArgument("lhs input needs to be a ref type"))
-    OP_REQUIRES_OK(context, context->GetAttr("var_name", &var_name_));
-    OP_REQUIRES(context, var_name_ != "",
-                errors::InvalidArgument("Missing var_name attribute"));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // NOTE(pbar): All other mutators of the Tensor Ref *must* have completed
-    // their execution before this DestroyTemporaryVariable op executes.
-    // This is typically achieved using control dependencies.
-    CHECK(IsRefType(context->input_dtype(0)));
-    Tensor tmpvar = context->mutable_input(0, false);
-    context->set_output(0, tmpvar);
-    ResourceMgr* rm = context->resource_manager();
-    OP_REQUIRES(context, rm, errors::Internal("No per-step resource manager."));
-    OP_REQUIRES_OK(context, rm->Delete<TemporaryVariableOp::TmpVar>(
-                                context->step_container()->name(), var_name_));
-    if (context->track_allocations()) {
-      if (context->allocate_on_host(AllocatorAttributes())) {
-        context->record_host_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      } else {
-        context->record_device_persistent_memory_allocation(
-            -static_cast<int64>(tmpvar.AllocatedBytes()));
-      }
-    }
-  }
-
- private:
-  string var_name_;
-};
-
-class IsVariableInitializedOp : public OpKernel {
- public:
-  IsVariableInitializedOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    // Get a mutable input tensor of the Ref input.
-    const Tensor& input_tensor = context->mutable_input(0, false);
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape({}), &output));
-    auto output_tensor = output->tensor<bool, 0>();
-    bool result = input_tensor.IsInitialized();
-    output_tensor() = result;
-  }
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_VARIABLE_OPS_H_
diff --git a/tensorflow/core/kernels/xsmm_conv2d_test.cc b/tensorflow/core/kernels/xsmm_conv2d_test.cc
index 381ea39b77c26e16ca8727d23dfd90c46b9e4b9a..e29470124674636a0e125a5cd1b856a467f4c6f0 100644
--- a/tensorflow/core/kernels/xsmm_conv2d_test.cc
+++ b/tensorflow/core/kernels/xsmm_conv2d_test.cc
@@ -73,7 +73,7 @@ LIBXSMM_INLINE void naive_copy_KCRS_to_RSCK(const float* kcrs, Tensor  &rsck, in
   LIBXSMM_VLA_DECL(4, const float,  input, kcrs, C, R, S);
   int r, s, c, k;
   auto output =  rsck.flat<float>();
- 
+
   for ( r = 0; r < R; r++ ) {
     for ( s = 0; s < S; s++ ) {
       for ( c = 0; c < C; c++ ) {
@@ -94,14 +94,14 @@ LIBXSMM_INLINE void zero_buf(float* buf, long size) {
     buf[i] = 0.0f;
   }
 }
- 
+
 LIBXSMM_INLINE void copy_buf(Tensor &dst,float *src,long size) {
   long  i;
   auto output =  dst.flat<float>();
-  for (i = 0; i < size; ++i) 
+  for (i = 0; i < size; ++i)
           output(i) = src[i];
 }
- 
+
 LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
 {
   int i;
@@ -110,7 +110,7 @@ LIBXSMM_INLINE void init_buf(float* buf, long size, int initPos, int initOne)
     buf[i] = (float)((initOne != 0) ? 1.0 : ((initPos != 0) ? drand48() : (0.05 - drand48()/10.0)));
   }
 }
- 
+
 
 
 LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float* output, const float* filter)
@@ -138,11 +138,11 @@ LIBXSMM_INLINE void naive_conv_fp(naive_conv_t* param, const float* input, float
   int stride_w  = param->stride_w;
   /* loop counters */
   int img, ofm, ifm, oj, oi, ij, ii, kj, ki;
- 
+
   LIBXSMM_VLA_DECL(4,       float, output_t, output + (pad_w_out * ofwp + pad_h_out), nOfm, ofhp, ofwp);
   LIBXSMM_VLA_DECL(4, const float,  input_t,  input + (pad_w_in * ifwp + pad_h_in), nIfm, ifhp, ifwp);
   LIBXSMM_VLA_DECL(4, const float, filter_t, filter, nIfm, kh, kw);
- 
+
   for (img = 0; img < nImg; ++img) {
     for (ofm = 0; ofm < nOfm; ++ofm) {
       for (ifm = 0; ifm < nIfm; ++ifm) {
@@ -172,7 +172,7 @@ void RunXsmmVsGeneric() {}
 class XsmmConv2DTest : public OpsTestBase {
  protected:
   void MakeOp(int stride) {
-  
+
     TF_CHECK_OK(NodeDefBuilder("xsmm", "Conv2D")
                       .Input(FakeInput(DT_FLOAT))
                       .Input(FakeInput(DT_FLOAT))
@@ -184,7 +184,7 @@ class XsmmConv2DTest : public OpsTestBase {
     TF_ASSERT_OK(InitOp());
   }
 };
- 
+
 TEST_F(XsmmConv2DTest, Basic) {
      MakeOp(1);
 
@@ -206,13 +206,13 @@ TEST_F(XsmmConv2DTest, Basic) {
      int stride_h = stride;
      int pad_h = pad;
      int pad_w = pad;
- 
+
      int pad_h_in = pad_h;
      int pad_w_in = pad_w;
- 
+
      int pad_h_out = 0;
      int pad_w_out = 0;
- 
+
   /* deriving some values for naive code */
      int ofh = (ifh + 2 * pad_h - kh) / stride_h + 1;
      int ofw = (ifw + 2 * pad_w - kw) / stride_w + 1;
@@ -223,7 +223,7 @@ TEST_F(XsmmConv2DTest, Basic) {
 
 
     //Initialization of Filter and Image
-    
+
     /* allocate data */
      float *naive_input           = (float*)libxsmm_aligned_scratch( nImg*nIfm*ifhp*ifwp*sizeof(float), 2097152);
      float *naive_output          = (float*)libxsmm_aligned_scratch( nImg*nOfm*ofhp*ofwp*sizeof(float), 2097152);
@@ -232,21 +232,21 @@ TEST_F(XsmmConv2DTest, Basic) {
      init_buf(naive_input,          nImg*nIfm*ifhp*ifwp, 0, 0);
      zero_buf(naive_output,         nImg*nOfm*ofhp*ofwp);
      init_buf(naive_filter,         nOfm*nIfm*kh*kw, 0, 0);
-        
+
 
      Tensor image(DT_FLOAT,
                  {nImg, ifhp, ifwp, nIfm});
- 
- 
+
+
      Tensor filter(DT_FLOAT, {kh,kw,nIfm,nOfm});
- 
+
 
      naive_copy_NCHW_to_NHWC(naive_input, image, nImg, ifhp, ifwp, nIfm);
-     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm); 
+     naive_copy_KCRS_to_RSCK(naive_filter, filter, kh, kw, nIfm, nOfm);
 
 
     //Run naive convolution
-    
+
      naive_conv_t naive_param;
 
      naive_param.nImg = nImg;
@@ -274,8 +274,8 @@ TEST_F(XsmmConv2DTest, Basic) {
 
      naive_conv_fp(&naive_param, naive_input, naive_output, naive_filter);
 
- 
- 
+
+
      AddInputFromArray<float>(image.shape(), image.flat<float>());
      AddInputFromArray<float>(filter.shape(), filter.flat<float>());
 
@@ -283,7 +283,7 @@ TEST_F(XsmmConv2DTest, Basic) {
 
      //Run Op (TF)
      TF_ASSERT_OK(RunOpKernel());
- 
+
      // Check the output.
      Tensor expected(DT_FLOAT, {nImg,ofhp,ofwp, nOfm});
      naive_copy_NCHW_to_NHWC(naive_output, expected, nImg, ofhp, ofwp, nOfm);
@@ -329,15 +329,15 @@ TEST(XsmmConv2DTest, Basic) {
     desc.fuse_ops = LIBXSMM_DNN_CONV_FUSE_NONE;
     desc.options = LIBXSMM_DNN_CONV_OPTION_NONE;
     desc.datatype = LIBXSMM_DNN_DATATYPE_F32;
- 
+
     if (!CanUseXsmmConv2D(desc, data_format)) {
       return false;
     }
- 
+
     auto input_ptr = input.template flat<float>().data();
     auto filter_ptr = filter.template flat<float>().data();
     auto output_ptr = output->template flat<float>().data();
- 
+
     bool success = functor::XsmmFwdConv2D<CPUDevice, float>()(
         ctx, desc, input_ptr, filter_ptr, output_ptr);
     return success;
diff --git a/tensorflow/core/kernels/zip_dataset_op.cc b/tensorflow/core/kernels/zip_dataset_op.cc
index a80b9edbe468b658c6b5a85b4c3c28be581fa75f..31e5737f627d551c10a755bc26b8879ef25c5b16 100644
--- a/tensorflow/core/kernels/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/zip_dataset_op.cc
@@ -35,14 +35,15 @@ class ZipDatasetOp : public DatasetOpKernel {
       OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(i), &input));
       inputs.push_back(input);
     }
-    *output = new Dataset(inputs);
+    *output = new Dataset(ctx, inputs);
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphDatasetBase {
    public:
-    explicit Dataset(const std::vector<DatasetBase*>& inputs)
-        : inputs_(inputs) {
+    explicit Dataset(OpKernelContext* ctx,
+                     const std::vector<DatasetBase*>& inputs)
+        : GraphDatasetBase(ctx), inputs_(inputs) {
       for (const auto& input : inputs_) {
         input->Ref();
         for (DataType dt : input->output_dtypes()) {
@@ -76,6 +77,21 @@ class ZipDatasetOp : public DatasetOpKernel {
 
     string DebugString() override { return "ZipDatasetOp::Dataset"; }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      std::vector<Node*> input_graph_nodes;
+      input_graph_nodes.reserve(inputs_.size());
+      for (const auto& input : inputs_) {
+        Node* input_node;
+        TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input, &input_node));
+        input_graph_nodes.emplace_back(input_node);
+      }
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {}, {std::make_pair(0, input_graph_nodes)}, {}, output));
+      return Status::OK();
+    }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
@@ -93,6 +109,10 @@ class ZipDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
+        if (input_impls_.empty()) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         out_tensors->clear();
         out_tensors->reserve(dataset()->output_dtypes().size());
         for (const auto& input_impl : input_impls_) {
@@ -100,12 +120,41 @@ class ZipDatasetOp : public DatasetOpKernel {
           TF_RETURN_IF_ERROR(
               input_impl->GetNext(ctx, &input_tensors, end_of_sequence));
           if (*end_of_sequence) {
-            return Status::OK();
+            break;
           }
           out_tensors->insert(out_tensors->end(), input_tensors.begin(),
                               input_tensors.end());
         }
-        *end_of_sequence = false;
+        if (*end_of_sequence) {
+          out_tensors->clear();
+          input_impls_.clear();
+        }
+        return Status::OK();
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impls_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        } else {
+          for (auto& input_impl : input_impls_)
+            TF_RETURN_IF_ERROR(SaveParent(writer, input_impl));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(OpKernelContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty"))) {
+          input_impls_.clear();
+        } else {
+          DCHECK_EQ(input_impls_.size(), dataset()->inputs_.size());
+          for (auto& input_impl : input_impls_)
+            TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl));
+        }
         return Status::OK();
       }
 
diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc
index 2a04f7bd39df98a97ec7ed0f82dfdfbd8222a2da..55e481d0e60a004f2baebdcac444dd7e7cf93e66 100644
--- a/tensorflow/core/lib/core/arena.cc
+++ b/tensorflow/core/lib/core/arena.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mem.h"
@@ -113,24 +114,11 @@ void Arena::MakeNewBlock(const uint32 alignment) {
   CHECK(SatisfyAlignment(alignment));
 }
 
-// The following simple numeric routines also exist in util/math/mathutil.h
-// but we don't want to depend on that library.
-
-// Euclid's algorithm for Greatest Common Denominator.
-static uint32 GCD(uint32 x, uint32 y) {
-  while (y != 0) {
-    uint32 r = x % y;
-    x = y;
-    y = r;
-  }
-  return x;
-}
-
 static uint32 LeastCommonMultiple(uint32 a, uint32 b) {
   if (a > b) {
-    return (a / GCD(a, b)) * b;
+    return (a / MathUtil::GCD<uint32>(a, b)) * b;
   } else if (a < b) {
-    return (b / GCD(b, a)) * a;
+    return (b / MathUtil::GCD<uint32>(b, a)) * a;
   } else {
     return a;
   }
diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 984f4404ce2c6b35611c0db470d127fb92d1e5be..29b727fc4463d933ceeb402c5dd92f3ea5b8a62a 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-size_t StringPiece::Hasher::operator()(StringPiece s) const {
+size_t StringPieceHasher::operator()(StringPiece s) const {
   return Hash64(s.data(), s.size());
 }
 
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 7d258b36c5ef320e2951d8a5f8ae5b6c17c1fe12..caa9642774bebec05a28b7a0c2ea71d18d6ebd1a 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -35,12 +35,14 @@ limitations under the License.
 
 namespace tensorflow {
 
+struct StringPieceHasher;
+
 class StringPiece {
  public:
   typedef size_t size_type;
 
   // Create an empty slice.
-  StringPiece() : data_(""), size_(0) {}
+  StringPiece() : data_(nullptr), size_(0) {}
 
   // Create a slice that refers to d[0,n-1].
   StringPiece(const char* d, size_t n) : data_(d), size_(n) {}
@@ -51,11 +53,6 @@ class StringPiece {
   // Create a slice that refers to s[0,strlen(s)-1]
   StringPiece(const char* s) : data_(s), size_(strlen(s)) {}
 
-  void set(const void* data, size_t len) {
-    data_ = reinterpret_cast<const char*>(data);
-    size_ = len;
-  }
-
   // Return a pointer to the beginning of the referenced data
   const char* data() const { return data_; }
 
@@ -79,12 +76,6 @@ class StringPiece {
     return data_[n];
   }
 
-  // Change this slice to refer to an empty array
-  void clear() {
-    data_ = "";
-    size_ = 0;
-  }
-
   // Drop the first "n" bytes from this slice.
   void remove_prefix(size_t n) {
     assert(n <= size());
@@ -114,10 +105,6 @@ class StringPiece {
 
   StringPiece substr(size_t pos, size_t n = npos) const;
 
-  struct Hasher {
-    size_t operator()(StringPiece arg) const;
-  };
-
   // Return a string that contains the copy of the referenced data.
   std::string ToString() const { return std::string(data_, size_); }
 
@@ -144,6 +131,10 @@ class StringPiece {
   // Intentionally copyable
 };
 
+struct StringPieceHasher {
+  size_t operator()(StringPiece s) const;
+};
+
 inline bool operator==(StringPiece x, StringPiece y) {
   return ((x.size() == y.size()) &&
           (memcmp(x.data(), y.data(), x.size()) == 0));
diff --git a/tensorflow/core/lib/core/stringpiece_test.cc b/tensorflow/core/lib/core/stringpiece_test.cc
index ad70d418732af94ecc162f8ef096796138ebbcb7..8f17b85b6d7941d7084ce4e142de4ad33f1e8202 100644
--- a/tensorflow/core/lib/core/stringpiece_test.cc
+++ b/tensorflow/core/lib/core/stringpiece_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+
+#include <unordered_map>
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -63,4 +65,74 @@ TEST(StringPiece, Contains) {
   EXPECT_TRUE(!a.contains(d));
 }
 
+TEST(StringPieceHasher, Equality) {
+  StringPieceHasher hasher;
+
+  StringPiece s1("foo");
+  StringPiece s2("bar");
+  StringPiece s3("baz");
+  StringPiece s4("zot");
+
+  EXPECT_TRUE(hasher(s1) != hasher(s2));
+  EXPECT_TRUE(hasher(s1) != hasher(s3));
+  EXPECT_TRUE(hasher(s1) != hasher(s4));
+  EXPECT_TRUE(hasher(s2) != hasher(s3));
+  EXPECT_TRUE(hasher(s2) != hasher(s4));
+  EXPECT_TRUE(hasher(s3) != hasher(s4));
+
+  EXPECT_TRUE(hasher(s1) == hasher(s1));
+  EXPECT_TRUE(hasher(s2) == hasher(s2));
+  EXPECT_TRUE(hasher(s3) == hasher(s3));
+  EXPECT_TRUE(hasher(s4) == hasher(s4));
+}
+
+TEST(StringPieceHasher, HashMap) {
+  string s1("foo");
+  string s2("bar");
+  string s3("baz");
+
+  StringPiece p1(s1);
+  StringPiece p2(s2);
+  StringPiece p3(s3);
+
+  std::unordered_map<StringPiece, int, StringPieceHasher> map;
+
+  map.insert(std::make_pair(p1, 0));
+  map.insert(std::make_pair(p2, 1));
+  map.insert(std::make_pair(p3, 2));
+  EXPECT_EQ(map.size(), 3);
+
+  bool found[3] = {false, false, false};
+  for (auto const& val : map) {
+    int x = val.second;
+    EXPECT_TRUE(x >= 0 && x < 3);
+    EXPECT_TRUE(!found[x]);
+    found[x] = true;
+  }
+  EXPECT_EQ(found[0], true);
+  EXPECT_EQ(found[1], true);
+  EXPECT_EQ(found[2], true);
+
+  auto new_iter = map.find("zot");
+  EXPECT_TRUE(new_iter == map.end());
+
+  new_iter = map.find("bar");
+  EXPECT_TRUE(new_iter != map.end());
+
+  map.erase(new_iter);
+  EXPECT_EQ(map.size(), 2);
+
+  found[0] = false;
+  found[1] = false;
+  found[2] = false;
+  for (const auto& iter : map) {
+    int x = iter.second;
+    EXPECT_TRUE(x >= 0 && x < 3);
+    EXPECT_TRUE(!found[x]);
+    found[x] = true;
+  }
+  EXPECT_EQ(found[0], true);
+  EXPECT_EQ(found[1], false);
+  EXPECT_EQ(found[2], true);
+}
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index 251d58817e729898475e087707f924b533e346da..b89b74b8dec396ae5ecfef3a927c60d22cc06c1e 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -30,7 +30,7 @@ class ThreadPool {
   // Constructs a pool that contains "num_threads" threads with specified
   // "name". env->StartThread() is used to create individual threads with the
   // given ThreadOptions. If "low_latency_hint" is true the thread pool
-  // implementation may use it as a hint that lower latency if preferred at the
+  // implementation may use it as a hint that lower latency is preferred at the
   // cost of higher CPU usage, e.g. by letting one or more idle threads spin
   // wait. Conversely, if the threadpool is used to schedule high-latency
   // operations like I/O the hint should be set to false.
diff --git a/tensorflow/core/lib/db/sqlite.cc b/tensorflow/core/lib/db/sqlite.cc
index 701655f622a7ec0288f1cb53818877e65839643e..23361e64312a00658077d197650b0f9561bec40b 100644
--- a/tensorflow/core/lib/db/sqlite.cc
+++ b/tensorflow/core/lib/db/sqlite.cc
@@ -18,15 +18,36 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
+
+void ExecuteOrLog(Sqlite* db, const char* sql) {
+  Status s = db->Prepare(sql).StepAndReset();
+  if (!s.ok()) {
+    LOG(WARNING) << s.ToString();
+  }
+}
+
+string ExecuteOrEmpty(Sqlite* db, const char* sql) {
+  auto stmt = db->Prepare(sql);
+  bool is_done = false;
+  if (stmt.Step(&is_done).ok() && !is_done) {
+    return stmt.ColumnString(0);
+  }
+  return "";
+}
+
+}  // namespace
 
 /* static */
 xla::StatusOr<std::shared_ptr<Sqlite>> Sqlite::Open(const string& uri) {
   sqlite3* sqlite = nullptr;
-  Status s = MakeStatus(sqlite3_open(uri.c_str(), &sqlite));
-  if (s.ok()) {
-    return std::shared_ptr<Sqlite>(new Sqlite(sqlite));
-  }
-  return s;
+  TF_RETURN_IF_ERROR(MakeStatus(sqlite3_open(uri.c_str(), &sqlite)));
+  Sqlite* db = new Sqlite(sqlite, uri);
+  // This is the SQLite default since 2016. However it's good to set
+  // this anyway, since we might get linked against an older version of
+  // the library, and it's pretty much impossible to change later.
+  ExecuteOrLog(db, "PRAGMA page_size=4096");
+  return std::shared_ptr<Sqlite>(db);
 }
 
 /* static */ Status Sqlite::MakeStatus(int resultCode) {
@@ -75,7 +96,7 @@ xla::StatusOr<std::shared_ptr<Sqlite>> Sqlite::Open(const string& uri) {
   }
 }
 
-Sqlite::Sqlite(sqlite3* db) : db_(db) {}
+Sqlite::Sqlite(sqlite3* db, const string& uri) : db_(db), uri_(uri) {}
 
 Sqlite::~Sqlite() {
   // close_v2 doesn't care if a stmt hasn't been GC'd yet
@@ -97,6 +118,30 @@ Status Sqlite::Close() {
   return s;
 }
 
+void Sqlite::UseWriteAheadLogWithReducedDurabilityIfPossible() {
+  // TensorFlow summaries are intensively write-heavy, cf. most apps.
+  // This pragma loves writes and means that TensorBoard can read the
+  // database even as the training job inserts stuff. In other words,
+  // this makes SQLite almost as powerful as MySQL or PostgreSQL.
+  // https://www.sqlite.org/wal.html
+  string journal = ExecuteOrEmpty(this, "PRAGMA journal_mode=wal");
+  if (journal != "wal") {
+    LOG(WARNING) << "Failed to set journal_mode=wal because SQLite wants "
+                 << uri_ << " to be in '" << journal << "' mode, which might "
+                 << "be bad since WAL is important for the performance of "
+                 << "write-intensive apps. This might only happen for memory "
+                 << "databases or old versions of SQLite, but is definitely "
+                 << "worth fixing if that's not the case";
+  } else {
+    // This setting means we might lose transactions due to power loss,
+    // but the database can't become corrupted. In exchange, we get the
+    // the performance of a NoSQL database. This is a trade-off most data
+    // scientists would consider acceptable.
+    // https://www.sqlite.org/pragma.html#pragma_synchronous
+    ExecuteOrLog(this, "PRAGMA synchronous=NORMAL");
+  }
+}
+
 SqliteStatement Sqlite::Prepare(const string& sql) {
   sqlite3_stmt* stmt = nullptr;
   int rc = sqlite3_prepare_v2(db_, sql.c_str(), sql.size() + 1, &stmt, nullptr);
diff --git a/tensorflow/core/lib/db/sqlite.h b/tensorflow/core/lib/db/sqlite.h
index 774852efea7b494406c89960654b1acdca1f4ac9..12840bd42bab80c10210c8c87968835136b2d5ea 100644
--- a/tensorflow/core/lib/db/sqlite.h
+++ b/tensorflow/core/lib/db/sqlite.h
@@ -15,7 +15,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_LIB_DB_SQLITE_H_
 #define TENSORFLOW_CORE_LIB_DB_SQLITE_H_
 
-#include <stddef.h>
+#include <cstddef>
 #include <memory>
 #include <utility>
 
@@ -69,6 +69,13 @@ class Sqlite {
   /// beforehand. This is a no-op if already closed
   Status Close();
 
+  /// \brief Enables WAL mode with less fsync or log a warning.
+  ///
+  /// The synchronous pragma is only set to NORMAL if WAL mode was
+  /// successfully enabled. This must be called immediately after
+  /// creating the object.
+  void UseWriteAheadLogWithReducedDurabilityIfPossible();
+
   /// \brief Creates SQLite statement.
   ///
   /// Call result.status() to determine whether or not this operation
@@ -78,8 +85,9 @@ class Sqlite {
   SqliteStatement Prepare(const string& sql);
 
  private:
-  explicit Sqlite(sqlite3* db);
+  explicit Sqlite(sqlite3* db, const string& uri);
   sqlite3* db_;
+  string uri_;
   TF_DISALLOW_COPY_AND_ASSIGN(Sqlite);
 };
 
@@ -103,7 +111,7 @@ class SqliteStatement {
   SqliteStatement& operator=(SqliteStatement&& other);
 
   /// \brief Returns true if statement is not empty.
-  operator bool() const { return stmt_ != nullptr; }
+  explicit operator bool() const { return stmt_ != nullptr; }
 
   /// \brief Returns SQLite result code state.
   ///
diff --git a/tensorflow/core/lib/gtl/stl_util.h b/tensorflow/core/lib/gtl/stl_util.h
index cda72a579da0f76abe6c37c724f76c307890f224..ffeca4e88a93936ee6a1711afec735d97d04172e 100644
--- a/tensorflow/core/lib/gtl/stl_util.h
+++ b/tensorflow/core/lib/gtl/stl_util.h
@@ -29,48 +29,23 @@ limitations under the License.
 namespace tensorflow {
 namespace gtl {
 
-// Returns a mutable char* pointing to a string's internal buffer, which may not
-// be null-terminated. Returns NULL for an empty string. If not non-null,
-// writing through this pointer will modify the string.
-//
-// string_as_array(&str)[i] is valid for 0 <= i < str.size() until the
-// next call to a string method that invalidates iterators.
-//
-// In C++11 you may simply use &str[0] to get a mutable char*.
-//
-// Prior to C++11, there was no standard-blessed way of getting a mutable
-// reference to a string's internal buffer. The requirement that string be
-// contiguous is officially part of the C++11 standard [string.require]/5.
-// According to Matt Austern, this should already work on all current C++98
-// implementations.
-inline char* string_as_array(string* str) {
-  return str->empty() ? NULL : &*str->begin();
-}
-
-// Returns the T* array for the given vector, or NULL if the vector was empty.
-//
-// Note: If you know the array will never be empty, you can use &*v.begin()
-// directly, but that is may dump core if v is empty. This function is the most
-// efficient code that will work, taking into account how our STL is actually
-// implemented. THIS IS NON-PORTABLE CODE, so use this function instead of
-// repeating the nonportable code everywhere. If our STL implementation changes,
-// we will need to change this as well.
+// Returns a char* pointing to the beginning of a string's internal buffer.
+// The result is a valid "null-terminated byte string", even if *str is empty.
+// Up to C++14 it is not valid to *write* to the null terminator; as of C++17,
+// it is valid to write zero to the null terminator (but not any other value).
+inline char* string_as_array(string* str) { return &*str->begin(); }
+
+// The following vector_as_array functions return raw pointers to the underlying
+// data buffer. The return value is unspecified (but valid) if the input range
+// is empty.
 template <typename T, typename Allocator>
 inline T* vector_as_array(std::vector<T, Allocator>* v) {
-#if defined NDEBUG && !defined _GLIBCXX_DEBUG
-  return &*v->begin();
-#else
-  return v->empty() ? NULL : &*v->begin();
-#endif
+  return v->data();
 }
-// vector_as_array overload for const std::vector<>.
+
 template <typename T, typename Allocator>
 inline const T* vector_as_array(const std::vector<T, Allocator>* v) {
-#if defined NDEBUG && !defined _GLIBCXX_DEBUG
-  return &*v->begin();
-#else
-  return v->empty() ? NULL : &*v->begin();
-#endif
+  return v->data();
 }
 
 // Like str->resize(new_size), except any new characters added to "*str" as a
diff --git a/tensorflow/core/lib/io/block.cc b/tensorflow/core/lib/io/block.cc
index 1fa26d91470843b1491002822c781341a00ac6d0..4c30486cc4973e76540f67994170cf2898d37c90 100644
--- a/tensorflow/core/lib/io/block.cc
+++ b/tensorflow/core/lib/io/block.cc
@@ -199,7 +199,7 @@ class Block::Iter : public Iterator {
     restart_index_ = num_restarts_;
     status_ = errors::DataLoss("bad entry in block");
     key_.clear();
-    value_.clear();
+    value_ = StringPiece();
   }
 
   bool ParseNextKey() {
diff --git a/tensorflow/core/lib/io/path.cc b/tensorflow/core/lib/io/path.cc
index d93dd0296e4f28e024600110eee45153ea9c9cbd..83f15e134d6f60c65a7523458353ffd62345b7cc 100644
--- a/tensorflow/core/lib/io/path.cc
+++ b/tensorflow/core/lib/io/path.cc
@@ -14,8 +14,22 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/lib/io/path.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#if !defined(PLATFORM_WINDOWS)
+#include <unistd.h>
+#endif
+
+#include <vector>
+
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 namespace io {
@@ -60,8 +74,7 @@ std::pair<StringPiece, StringPiece> SplitPath(StringPiece uri) {
 
   auto pos = path.rfind('/');
 #ifdef PLATFORM_WINDOWS
-  if (pos == StringPiece::npos)
-    pos = path.rfind('\\');
+  if (pos == StringPiece::npos) pos = path.rfind('\\');
 #endif
   // Handle the case with no '/' in 'path'.
   if (pos == StringPiece::npos)
@@ -112,7 +125,7 @@ StringPiece Extension(StringPiece path) {
 
 string CleanPath(StringPiece unclean_path) {
   string path = unclean_path.ToString();
-  const char *src = path.c_str();
+  const char* src = path.c_str();
   string::iterator dst = path.begin();
 
   // Check for absolute path and determine initial backtrack limit.
@@ -229,5 +242,52 @@ string CreateURI(StringPiece scheme, StringPiece host, StringPiece path) {
   return strings::StrCat(scheme, "://", host, path);
 }
 
+// Returns a unique number every time it is called.
+int64 UniqueId() {
+  static mutex mu(LINKER_INITIALIZED);
+  static int64 id = 0;
+  mutex_lock l(mu);
+  return ++id;
+}
+
+string GetTempFilename(const string& extension) {
+#if defined(PLATFORM_WINDOWS) || defined(__ANDROID__)
+  LOG(FATAL) << "GetTempFilename is not implemented in this platform.";
+#else
+  for (const char* dir : std::vector<const char*>(
+           {getenv("TEST_TMPDIR"), getenv("TMPDIR"), getenv("TMP"), "/tmp"})) {
+    if (!dir || !dir[0]) {
+      continue;
+    }
+    struct stat statbuf;
+    if (!stat(dir, &statbuf) && S_ISDIR(statbuf.st_mode)) {
+      // UniqueId is added here because mkstemps is not as thread safe as it
+      // looks. https://github.com/tensorflow/tensorflow/issues/5804 shows
+      // the problem.
+      string tmp_filepath;
+      int fd;
+      if (extension.length()) {
+        tmp_filepath = io::JoinPath(
+            dir, strings::StrCat("tmp_file_tensorflow_", UniqueId(), "_XXXXXX.",
+                                 extension));
+        fd = mkstemps(&tmp_filepath[0], extension.length() + 1);
+      } else {
+        tmp_filepath = io::JoinPath(
+            dir,
+            strings::StrCat("tmp_file_tensorflow_", UniqueId(), "_XXXXXX"));
+        fd = mkstemp(&tmp_filepath[0]);
+      }
+      if (fd < 0) {
+        LOG(FATAL) << "Failed to create temp file.";
+      } else {
+        close(fd);
+        return tmp_filepath;
+      }
+    }
+  }
+  LOG(FATAL) << "No temp directory found.";
+#endif
+}
+
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/path.h b/tensorflow/core/lib/io/path.h
index 955098f5b5ea38dd34c01c9913881933a2b9bd41..47bb2b998d637099b3ab788f7ce274f83e4fc646 100644
--- a/tensorflow/core/lib/io/path.h
+++ b/tensorflow/core/lib/io/path.h
@@ -20,10 +20,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 
 namespace tensorflow {
-class StringPiece;
 namespace io {
 namespace internal {
-string JoinPathImpl(std::initializer_list<StringPiece> paths);
+string JoinPathImpl(std::initializer_list<tensorflow::StringPiece> paths);
 }
 
 // Utility routines for processing filenames
@@ -50,20 +49,20 @@ string JoinPath(const T&... args) {
 #endif /* SWIG */
 
 // Return true if path is absolute.
-bool IsAbsolutePath(StringPiece path);
+bool IsAbsolutePath(tensorflow::StringPiece path);
 
 // Returns the part of the path before the final "/".  If there is a single
 // leading "/" in the path, the result will be the leading "/".  If there is
 // no "/" in the path, the result is the empty prefix of the input.
-StringPiece Dirname(StringPiece path);
+tensorflow::StringPiece Dirname(tensorflow::StringPiece path);
 
 // Returns the part of the path after the final "/".  If there is no
 // "/" in the path, the result is the same as the input.
-StringPiece Basename(StringPiece path);
+tensorflow::StringPiece Basename(tensorflow::StringPiece path);
 
 // Returns the part of the basename of path after the final ".".  If
 // there is no "." in the basename, the result is empty.
-StringPiece Extension(StringPiece path);
+tensorflow::StringPiece Extension(tensorflow::StringPiece path);
 
 // Collapse duplicate "/"s, resolve ".." and "." path elements, remove
 // trailing "/".
@@ -72,7 +71,7 @@ StringPiece Extension(StringPiece path);
 // invoke any system calls (getcwd(2)) in order to resolve relative
 // paths with respect to the actual working directory.  That is, this is purely
 // string manipulation, completely independent of process state.
-string CleanPath(StringPiece path);
+string CleanPath(tensorflow::StringPiece path);
 
 // Populates the scheme, host, and path from a URI. scheme, host, and path are
 // guaranteed by this function to point into the contents of uri, even if
@@ -82,12 +81,16 @@ string CleanPath(StringPiece path);
 // - If the URI is invalid, scheme and host are set to empty strings and the
 //   passed string is assumed to be a path
 // - If the URI omits the path (e.g. file://host), then the path is left empty.
-void ParseURI(StringPiece uri, StringPiece* scheme, StringPiece* host,
-              StringPiece* path);
+void ParseURI(tensorflow::StringPiece uri, tensorflow::StringPiece* scheme,
+              tensorflow::StringPiece* host, tensorflow::StringPiece* path);
 
 // Creates a URI from a scheme, host, and path. If the scheme is empty, we just
 // return the path.
-string CreateURI(StringPiece scheme, StringPiece host, StringPiece path);
+string CreateURI(tensorflow::StringPiece scheme, tensorflow::StringPiece host,
+                 tensorflow::StringPiece path);
+
+// Creates a temporary file name with an extension.
+string GetTempFilename(const string& extension);
 
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/proto_encode_helper.h b/tensorflow/core/lib/io/proto_encode_helper.h
index 5d30dda90172e0f69ea1512b228d9fb95e9a6d39..f70e1cbaabf8383d255f5d339d65a7958bf67596 100644
--- a/tensorflow/core/lib/io/proto_encode_helper.h
+++ b/tensorflow/core/lib/io/proto_encode_helper.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_LIB_IO_PROTO_ENCODE_HELPER_H_
 
 #include "tensorflow/core/lib/core/coding.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 // A helper class for appending various kinds of values in protocol
@@ -24,7 +25,6 @@ limitations under the License.
 // a buffer and a maximum size guarantee for the number of bytes they
 // will add to this buffer.
 namespace tensorflow {
-class StringPiece;
 namespace io {
 
 class ProtoEncodeHelper {
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index c3b87ee5bf02f70bc19b0b67dc90e7ae5886b465..403c82818ef3293a1dc027d362eb766906d0e94a 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -196,6 +196,19 @@ Status RecordReader::ReadRecord(uint64* offset, string* record) {
   return Status::OK();
 }
 
+Status RecordReader::SkipNBytes(uint64 offset) {
+#if !defined(IS_SLIM_BUILD)
+  if (zlib_input_stream_) {
+    TF_RETURN_IF_ERROR(zlib_input_stream_->SkipNBytes(offset));
+  } else {
+#endif
+    if (options_.buffer_size > 0) {
+      TF_RETURN_IF_ERROR(input_stream_->SkipNBytes(offset));
+    }
+  }
+  return Status::OK();
+}
+
 SequentialRecordReader::SequentialRecordReader(
     RandomAccessFile* file, const RecordReaderOptions& options)
     : underlying_(file, options), offset_(0) {}
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index e4f6a5b492104501564fa0e6ad495b4dcdfd8fff..62dd2efb792988c4197cf7172b25ac34cdd77ed9 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -74,6 +74,10 @@ class RecordReader {
   // sequential.
   Status ReadRecord(uint64* offset, string* record);
 
+  // Skip the records till "offset". Returns OK on success,
+  // OUT_OF_RANGE for end of file, or something else for an error.
+  Status SkipNBytes(uint64 offset);
+
  private:
   Status ReadChecksummed(uint64 offset, size_t n, StringPiece* result,
                          string* storage);
@@ -107,6 +111,21 @@ class SequentialRecordReader {
     return underlying_.ReadRecord(&offset_, record);
   }
 
+  // Returns the current offset in the file.
+  uint64 TellOffset() { return offset_; }
+
+  // Seek to this offset within the file and set this offset as the current
+  // offset. Trying to seek backward will throw error.
+  Status SeekOffset(uint64 offset) {
+    if (offset < offset_)
+      return errors::InvalidArgument(
+          "Trying to seek offset: ", offset,
+          " which is less than the current offset: ", offset_);
+    TF_RETURN_IF_ERROR(underlying_.SkipNBytes(offset - offset_));
+    offset_ = offset;
+    return Status::OK();
+  }
+
  private:
   RecordReader underlying_;
   uint64 offset_ = 0;
diff --git a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
index be1fa22c69c27a5c57e3c397076a66dfe05eb035..3c310167326721e8f569ab6148622517aaf82ce5 100644
--- a/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
+++ b/tensorflow/core/lib/io/snappy/snappy_outputbuffer.cc
@@ -161,7 +161,7 @@ Status SnappyOutputBuffer::Deflate() {
   }
 
   // Write length of compressed block to output buffer.
-  char* compressed_length_array = new char[4];
+  char compressed_length_array[4];
   std::fill(compressed_length_array, compressed_length_array + 4, 0);
   for (int i = 0; i < 4; i++) {
     // Little endian.
@@ -173,7 +173,6 @@ Status SnappyOutputBuffer::Deflate() {
   TF_RETURN_IF_ERROR(AddToOutputBuffer(output.data(), output.size()));
   next_in_ += avail_in_;
   avail_in_ = 0;
-  delete[] compressed_length_array;
 
   return Status::OK();
 }
diff --git a/tensorflow/core/lib/math/math_util.h b/tensorflow/core/lib/math/math_util.h
index 6f279865e7b361d7b0d2c402747c7b3476e63448..9e71598622b1f2c9aa026472a60e7c341c95a336 100644
--- a/tensorflow/core/lib/math/math_util.h
+++ b/tensorflow/core/lib/math/math_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LIB_MATH_MATH_UTIL_H_
 #define TENSORFLOW_LIB_MATH_MATH_UTIL_H_
 
+#include <type_traits>
+
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -59,6 +61,9 @@ class MathUtil {
   template <typename IntegralType, bool ceil>
   static IntegralType CeilOrFloorOfRatio(IntegralType numerator,
                                          IntegralType denominator);
+
+  template <typename IntegralType>
+  static IntegralType GCD(IntegralType x, IntegralType y);
 };
 
 // ---- CeilOrFloorOfRatio ----
@@ -107,6 +112,18 @@ IntegralType MathUtil::CeilOrFloorOfRatio(IntegralType numerator,
   }
 }
 
+template <typename IntegralType>
+IntegralType MathUtil::GCD(IntegralType a, IntegralType b) {
+  static_assert(std::is_unsigned<IntegralType>::value,
+                "signed GCD not supported!");
+  while (b != 0) {
+    IntegralType r = a % b;
+    a = b;
+    b = r;
+  }
+  return a;
+}
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_LIB_MATH_MATH_UTIL_H_
diff --git a/tensorflow/core/lib/math/math_util_test.cc b/tensorflow/core/lib/math/math_util_test.cc
index eaf8c31a431728d6f728abeb50e14c443bce6d85..a96e5467c3f7bb55f9716cefd62d95b1d7bd46ab 100644
--- a/tensorflow/core/lib/math/math_util_test.cc
+++ b/tensorflow/core/lib/math/math_util_test.cc
@@ -195,4 +195,33 @@ TEST(MathUtil, CeilOfRatio) {
 #endif
 }
 
+struct GCDTestCase {
+  unsigned int x;
+  unsigned int y;
+  unsigned int gcd;
+};
+
+TEST(MathUtil, GCD) {
+  std::vector<GCDTestCase> testcases({
+      {10, 20, 10},  //
+      {27, 8, 1},    //
+      {4, 3, 1},     //
+      {6, 8, 2},     //
+      {5, 0, 5},     //
+      {5, 5, 5},     //
+      {0, 0, 0}      //
+  });
+
+  for (const auto& tc : testcases) {
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint32>(tc.x, tc.y));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint32>(tc.y, tc.x));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint64>(tc.x, tc.y));
+    EXPECT_EQ(tc.gcd, MathUtil::GCD<uint64>(tc.y, tc.x));
+  }
+
+  const uint64 biggish_prime = 1666666667;
+  EXPECT_EQ(biggish_prime,
+            MathUtil::GCD<uint64>(biggish_prime * 3, biggish_prime * 4));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/collected_metrics.h b/tensorflow/core/lib/monitoring/collected_metrics.h
index 3dde55342ef1b1a1923eb29568839105a4356315..acdb0d86edb1a15631c324afe9d535e0660c4b98 100644
--- a/tensorflow/core/lib/monitoring/collected_metrics.h
+++ b/tensorflow/core/lib/monitoring/collected_metrics.h
@@ -87,6 +87,8 @@ struct Point {
   // The actual metric value, dependent on the value_type enum.
   ValueType value_type;
   int64 int64_value;
+  string string_value;
+  bool bool_value;
   HistogramProto histogram_value;
 
   // start_timestamp and end_timestamp indicate the time period over which this
diff --git a/tensorflow/core/lib/monitoring/collection_registry.h b/tensorflow/core/lib/monitoring/collection_registry.h
index 2eff468436793d31483b5a0af4398a89a7626936..2c8e250c5631ee8a56d6871c1a61ef17efc97c82 100644
--- a/tensorflow/core/lib/monitoring/collection_registry.h
+++ b/tensorflow/core/lib/monitoring/collection_registry.h
@@ -218,6 +218,18 @@ inline void CollectValue(const int64& value, Point* const point) {
   point->int64_value = value;
 }
 
+template <>
+inline void CollectValue(const string& value, Point* const point) {
+  point->value_type = ValueType::kString;
+  point->string_value = value;
+}
+
+template <>
+inline void CollectValue(const bool& value, Point* const point) {
+  point->value_type = ValueType::kBool;
+  point->bool_value = value;
+}
+
 template <>
 inline void CollectValue(const HistogramProto& value, Point* const point) {
   point->value_type = ValueType::kHistogram;
@@ -315,13 +327,13 @@ void MetricCollector<metric_kind, Value, NumLabels>::CollectValue(
     const std::array<string, NumLabels>& labels, const Value& value) {
   point_set_->points.emplace_back(new Point());
   auto* const point = point_set_->points.back().get();
-  const std::vector<StringPiece> label_descriptions =
+  const std::vector<string> label_descriptions =
       metric_def_->label_descriptions();
   point->labels.reserve(NumLabels);
   for (int i = 0; i < NumLabels; ++i) {
     point->labels.push_back({});
     auto* const label = &point->labels.back();
-    label->name = label_descriptions[i].ToString();
+    label->name = label_descriptions[i];
     label->value = labels[i];
   }
   internal::CollectValue(value, point);
diff --git a/tensorflow/core/lib/monitoring/collection_registry_test.cc b/tensorflow/core/lib/monitoring/collection_registry_test.cc
index 5b9c1006900f01a126466fb8b8f243666d77cdbd..ca25f508da9635f02941c99c768947927fd97493 100644
--- a/tensorflow/core/lib/monitoring/collection_registry_test.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/lib/monitoring/collection_registry.h"
 
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/monitoring/gauge.h"
 #include "tensorflow/core/lib/monitoring/sampler.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -176,6 +177,96 @@ TEST(CollectMetricsTest, Counter) {
   }
 }
 
+TEST(CollectMetricsTest, Gauge) {
+  auto string_gauge_with_labels =
+      std::unique_ptr<Gauge<string, 2>>(Gauge<string, 2>::New(
+          "/tensorflow/test/string_gauge_with_labels",
+          "String gauge with labels.", "MyLabel0", "MyLabel1"));
+  auto inteter_gauge_without_labels = std::unique_ptr<Gauge<int64, 0>>(
+      Gauge<int64, 0>::New("/tensorflow/test/integer_gauge_without_labels",
+                           "Integer gauge without labels."));
+
+  string_gauge_with_labels->GetCell("Label00", "Label10")->Set("test1");
+  string_gauge_with_labels->GetCell("Label01", "Label11")->Set("test2");
+  inteter_gauge_without_labels->GetCell()->Set(7);
+
+  for (const bool collect_metric_descriptors : {true, false}) {
+    SCOPED_TRACE(strings::StrCat("collect_metric_descriptors: ",
+                                 collect_metric_descriptors));
+
+    auto* collection_registry = CollectionRegistry::Default();
+    CollectionRegistry::CollectMetricsOptions options;
+    options.collect_metric_descriptors = collect_metric_descriptors;
+    const std::unique_ptr<CollectedMetrics> collected_metrics =
+        collection_registry->CollectMetrics(options);
+
+    if (collect_metric_descriptors) {
+      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+
+      const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
+          "/tensorflow/test/string_gauge_with_labels");
+      EXPECT_EQ("/tensorflow/test/string_gauge_with_labels", ld.name);
+      EXPECT_EQ("String gauge with labels.", ld.description);
+      ASSERT_EQ(2, ld.label_names.size());
+      EXPECT_EQ("MyLabel0", ld.label_names[0]);
+      EXPECT_EQ("MyLabel1", ld.label_names[1]);
+      EXPECT_EQ(MetricKind::kGauge, ld.metric_kind);
+      EXPECT_EQ(ValueType::kString, ld.value_type);
+
+      const MetricDescriptor& ud = *collected_metrics->metric_descriptor_map.at(
+          "/tensorflow/test/integer_gauge_without_labels");
+      EXPECT_EQ("/tensorflow/test/integer_gauge_without_labels", ud.name);
+      EXPECT_EQ("Integer gauge without labels.", ud.description);
+      ASSERT_EQ(0, ud.label_names.size());
+      EXPECT_EQ(MetricKind::kGauge, ud.metric_kind);
+      EXPECT_EQ(ValueType::kInt64, ud.value_type);
+    } else {
+      EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
+    }
+
+    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+
+    const PointSet& lps = *collected_metrics->point_set_map.at(
+        "/tensorflow/test/string_gauge_with_labels");
+    EXPECT_EQ("/tensorflow/test/string_gauge_with_labels", lps.metric_name);
+    ASSERT_EQ(2, lps.points.size());
+    ASSERT_EQ(2, lps.points[0]->labels.size());
+    EXPECT_EQ("MyLabel0", lps.points[0]->labels[0].name);
+    EXPECT_EQ("Label00", lps.points[0]->labels[0].value);
+    EXPECT_EQ("MyLabel1", lps.points[0]->labels[1].name);
+    EXPECT_EQ("Label10", lps.points[0]->labels[1].value);
+    EXPECT_EQ(ValueType::kString, lps.points[0]->value_type);
+    EXPECT_EQ("test1", lps.points[0]->string_value);
+    EXPECT_LT(0, lps.points[0]->start_timestamp_millis);
+    EXPECT_LT(0, lps.points[0]->end_timestamp_millis);
+    EXPECT_GE(lps.points[0]->end_timestamp_millis,
+              lps.points[0]->start_timestamp_millis);
+    ASSERT_EQ(2, lps.points[1]->labels.size());
+    EXPECT_EQ("MyLabel0", lps.points[1]->labels[0].name);
+    EXPECT_EQ("Label01", lps.points[1]->labels[0].value);
+    EXPECT_EQ("MyLabel1", lps.points[1]->labels[1].name);
+    EXPECT_EQ("Label11", lps.points[1]->labels[1].value);
+    EXPECT_EQ(ValueType::kString, lps.points[1]->value_type);
+    EXPECT_EQ("test2", lps.points[1]->string_value);
+    EXPECT_LT(0, lps.points[1]->start_timestamp_millis);
+    EXPECT_LT(0, lps.points[1]->end_timestamp_millis);
+    EXPECT_GE(lps.points[1]->end_timestamp_millis,
+              lps.points[1]->start_timestamp_millis);
+
+    const PointSet& ups = *collected_metrics->point_set_map.at(
+        "/tensorflow/test/integer_gauge_without_labels");
+    EXPECT_EQ("/tensorflow/test/integer_gauge_without_labels", ups.metric_name);
+    ASSERT_EQ(1, ups.points.size());
+    EXPECT_EQ(0, ups.points[0]->labels.size());
+    EXPECT_EQ(ValueType::kInt64, ups.points[0]->value_type);
+    EXPECT_EQ(7, ups.points[0]->int64_value);
+    EXPECT_LT(0, ups.points[0]->start_timestamp_millis);
+    EXPECT_LT(0, ups.points[0]->end_timestamp_millis);
+    EXPECT_GE(ups.points[0]->end_timestamp_millis,
+              ups.points[0]->start_timestamp_millis);
+  }
+}
+
 void EqHistograms(const Histogram& expected,
                   const HistogramProto& actual_proto) {
   Histogram actual;
diff --git a/tensorflow/core/lib/monitoring/counter.h b/tensorflow/core/lib/monitoring/counter.h
index 4b84e9d928c2bbae71b5ceb37638102f1cfae21b..7240348a9b764e3092f71da4bce9a953c08e7900 100644
--- a/tensorflow/core/lib/monitoring/counter.h
+++ b/tensorflow/core/lib/monitoring/counter.h
@@ -48,7 +48,7 @@ namespace monitoring {
 // This class is thread-safe.
 class CounterCell {
  public:
-  CounterCell(const int64 value) : value_(value) {}
+  CounterCell(int64 value) : value_(value) {}
   ~CounterCell() {}
 
   // Atomically increments the value by step.
diff --git a/tensorflow/core/lib/monitoring/gauge.h b/tensorflow/core/lib/monitoring/gauge.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec978a91935890cb0563f39ba0e6554a03d7c86e
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/gauge.h
@@ -0,0 +1,244 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
+
+// We replace this implementation with a null implementation for mobile
+// platforms.
+#include "tensorflow/core/platform/platform.h"
+#ifdef IS_MOBILE_PLATFORM
+#include "tensorflow/core/lib/monitoring/mobile_gauge.h"
+#else
+
+#include <array>
+#include <atomic>
+#include <map>
+
+#include "tensorflow/core/lib/monitoring/collection_registry.h"
+#include "tensorflow/core/lib/monitoring/metric_def.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace monitoring {
+
+// GaugeCell stores each value of a gauge.
+//
+// A cell can be passed off to a module which may repeatedly update it without
+// needing further map-indexing computations. This improves both encapsulation
+// (separate modules can own a cell each, without needing to know about the map
+// to which both cells belong) and performance (since map indexing and
+// associated locking are both avoided).
+//
+// This class is thread-safe.
+template <typename T>
+class GaugeCell {
+ public:
+  explicit GaugeCell(const T& value) : value_(value) {}
+  ~GaugeCell() {}
+
+  // Atomically sets the value.
+  void Set(const T& value) LOCKS_EXCLUDED(mu_);
+
+  // Retrieves the current value.
+  T value() const LOCKS_EXCLUDED(mu_);
+
+ private:
+  T value_ GUARDED_BY(mu_);
+  mutable mutex mu_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+};
+
+// Explicit specialization of GaugeCell<int64>. Compared to the primary
+// template, it uses atomic values as opposed to mutex. This class is
+// thread-safe.
+template <>
+class GaugeCell<int64> {
+ public:
+  explicit GaugeCell(int64 value) : value_(value) {}
+  ~GaugeCell() {}
+
+  // Atomically sets the value.
+  void Set(int64 value);
+
+  // Retrieves the current value.
+  int64 value() const;
+
+ private:
+  std::atomic<int64> value_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+};
+
+// Explicit specialization of GaugeCell<bool>. Compared to the primary
+// template, it uses atomic values as opposed to mutex. This class is
+// thread-safe.
+template <>
+class GaugeCell<bool> {
+ public:
+  explicit GaugeCell(bool value) : value_(value) {}
+  ~GaugeCell() {}
+
+  // Atomically sets the value.
+  void Set(bool value);
+
+  // Retrieves the current value.
+  bool value() const;
+
+ private:
+  std::atomic<bool> value_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+};
+
+// A stateful class for updating a gauge-like metric. Allowed ValueType are
+// int64, string and bool.
+//
+// This class encapsulates a set of values (or a single value for a label-less
+// metric). Each value is identified by a tuple of labels. The class allows the
+// user to set each value.
+//
+// Gauge allocates storage and maintains a cell for each value. You can
+// retrieve an individual cell using a label-tuple and update it separately.
+// This improves performance since operations related to retrieval, like
+// map-indexing and locking, are avoided.
+//
+// This class is thread-safe.
+template <typename ValueType, int NumLabels>
+class Gauge {
+ public:
+  ~Gauge() {
+    // Deleted here, before the metric_def is destroyed.
+    registration_handle_.reset();
+  }
+
+  // Creates the metric based on the metric-definition arguments.
+  //
+  // Example:
+  //
+  // auto* string_gauge_with_label = Gauge<string,1>::New(
+  //   "/tensorflow/string_gauge_with_label",
+  //   "String gauge with one label.", "MyLabelName");
+  //
+  // auto* integer_gauge = Gauge<int64, 0>::New("/tensorflow/integer_gauge",
+  //   "Integer gauge")
+  //
+  // auto* bool_gauge = Gauge<bool, 0>::New("/tensorflow/bool_gauge",
+  //   "Bool gauge")
+  template <typename... MetricDefArgs>
+  static Gauge* New(MetricDefArgs&&... metric_def_args);
+
+  // Retrieves the cell for the specified labels, creating it on demand if not
+  // already present.
+  template <typename... Labels>
+  GaugeCell<ValueType>* GetCell(const Labels&... labels) LOCKS_EXCLUDED(mu_);
+
+ private:
+  explicit Gauge(
+      const MetricDef<MetricKind::kGauge, ValueType, NumLabels>& metric_def)
+      : metric_def_(metric_def),
+        registration_handle_(CollectionRegistry::Default()->Register(
+            &metric_def_, [&](MetricCollectorGetter getter) {
+              auto metric_collector = getter.Get(&metric_def_);
+
+              mutex_lock l(mu_);
+              for (const auto& cell : cells_) {
+                metric_collector.CollectValue(cell.first, cell.second.value());
+              }
+            })) {}
+
+  mutable mutex mu_;
+
+  // The metric definition. This will be used to identify the metric when we
+  // register it for collection.
+  const MetricDef<MetricKind::kGauge, ValueType, NumLabels> metric_def_;
+
+  std::unique_ptr<CollectionRegistry::RegistrationHandle> registration_handle_;
+
+  using LabelArray = std::array<string, NumLabels>;
+  std::map<LabelArray, GaugeCell<ValueType> > cells_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Gauge);
+};
+
+////
+//  Implementation details follow. API readers may skip.
+////
+template <typename T>
+void GaugeCell<T>::Set(const T& value) {
+  mutex_lock l(mu_);
+  value_ = value;
+}
+
+template <typename T>
+T GaugeCell<T>::value() const {
+  mutex_lock l(mu_);
+  return value_;
+}
+
+inline void GaugeCell<int64>::Set(int64 value) { value_ = value; }
+
+inline int64 GaugeCell<int64>::value() const { return value_; }
+
+inline void GaugeCell<bool>::Set(bool value) { value_ = value; }
+
+inline bool GaugeCell<bool>::value() const { return value_; }
+
+template <typename ValueType, int NumLabels>
+template <typename... MetricDefArgs>
+Gauge<ValueType, NumLabels>* Gauge<ValueType, NumLabels>::New(
+    MetricDefArgs&&... metric_def_args) {
+  static_assert(std::is_same<ValueType, int64>::value ||
+                    std::is_same<ValueType, string>::value ||
+                    std::is_same<ValueType, bool>::value,
+                "Gauge only allows int64 and string types.");
+  return new Gauge<ValueType, NumLabels>(
+      MetricDef<MetricKind::kGauge, ValueType, NumLabels>(
+          std::forward<MetricDefArgs>(metric_def_args)...));
+}
+
+template <typename ValueType, int NumLabels>
+template <typename... Labels>
+GaugeCell<ValueType>* Gauge<ValueType, NumLabels>::GetCell(
+    const Labels&... labels) LOCKS_EXCLUDED(mu_) {
+  // Provides a more informative error message than the one during array
+  // construction below.
+  static_assert(
+      sizeof...(Labels) == NumLabels,
+      "Mismatch between Gauge<ValueType, NumLabels> and number of labels "
+      "provided in GetCell(...).");
+
+  const LabelArray& label_array = {{labels...}};
+  mutex_lock l(mu_);
+  const auto found_it = cells_.find(label_array);
+  if (found_it != cells_.end()) {
+    return &(found_it->second);
+  }
+  return &(cells_
+               .emplace(std::piecewise_construct,
+                        std::forward_as_tuple(label_array),
+                        std::forward_as_tuple(ValueType()))
+               .first->second);
+}
+
+}  // namespace monitoring
+}  // namespace tensorflow
+
+#endif  // IS_MOBILE_PLATFORM
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_GAUGE_H_
diff --git a/tensorflow/core/lib/monitoring/gauge_test.cc b/tensorflow/core/lib/monitoring/gauge_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8f673db38928b96bd4f97cbb72c1007fdc9e9bb
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/gauge_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/monitoring/gauge.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace monitoring {
+namespace {
+
+auto* gauge_with_labels = Gauge<int64, 1>::New(
+    "/tensorflow/test/gauge_with_labels", "Gauge with one label.", "MyLabel");
+
+TEST(LabeledGaugeTest, InitializedWithZero) {
+  EXPECT_EQ(0, gauge_with_labels->GetCell("Empty")->value());
+}
+
+TEST(LabeledGaugeTest, GetCell) {
+  auto* cell = gauge_with_labels->GetCell("GetCellOp");
+  EXPECT_EQ(0, cell->value());
+
+  cell->Set(1);
+  EXPECT_EQ(1, cell->value());
+
+  auto* same_cell = gauge_with_labels->GetCell("GetCellOp");
+  EXPECT_EQ(1, same_cell->value());
+
+  same_cell->Set(10);
+  EXPECT_EQ(10, cell->value());
+  EXPECT_EQ(10, same_cell->value());
+}
+
+auto* gauge_without_labels = Gauge<int64, 0>::New(
+    "/tensorflow/test/gauge_without_labels", "Gauge without any labels.");
+
+TEST(UnlabeledGaugeTest, InitializedWithZero) {
+  EXPECT_EQ(0, gauge_without_labels->GetCell()->value());
+}
+
+TEST(UnlabeledGaugeTest, GetCell) {
+  auto* cell = gauge_without_labels->GetCell();
+  EXPECT_EQ(0, cell->value());
+
+  cell->Set(1);
+  EXPECT_EQ(1, cell->value());
+
+  auto* same_cell = gauge_without_labels->GetCell();
+  EXPECT_EQ(1, same_cell->value());
+
+  same_cell->Set(10);
+  EXPECT_EQ(10, cell->value());
+  EXPECT_EQ(10, same_cell->value());
+}
+
+auto* string_gauge = Gauge<string, 0>::New("/tensorflow/test/string_gauge",
+                                           "Gauge of string value.");
+
+TEST(GaugeOfStringValue, InitializedWithEmptyString) {
+  EXPECT_EQ("", string_gauge->GetCell()->value());
+}
+
+TEST(GaugeOfStringValue, GetCell) {
+  auto* cell = string_gauge->GetCell();
+  EXPECT_EQ("", cell->value());
+
+  cell->Set("foo");
+  EXPECT_EQ("foo", cell->value());
+
+  auto* same_cell = string_gauge->GetCell();
+  EXPECT_EQ("foo", cell->value());
+
+  same_cell->Set("bar");
+  EXPECT_EQ("bar", cell->value());
+  EXPECT_EQ("bar", same_cell->value());
+}
+
+auto* bool_gauge =
+    Gauge<bool, 0>::New("/tensorflow/test/bool_gauge", "Gauge of bool value.");
+
+TEST(GaugeOfBoolValue, InitializedWithFalseValue) {
+  EXPECT_EQ(false, bool_gauge->GetCell()->value());
+}
+
+TEST(GaugeOfBoolValue, GetCell) {
+  auto* cell = bool_gauge->GetCell();
+  EXPECT_EQ(false, cell->value());
+
+  cell->Set(true);
+  EXPECT_EQ(true, cell->value());
+
+  auto* same_cell = bool_gauge->GetCell();
+  EXPECT_EQ(true, cell->value());
+
+  same_cell->Set(false);
+  EXPECT_EQ(false, cell->value());
+  EXPECT_EQ(false, same_cell->value());
+}
+
+}  // namespace
+}  // namespace monitoring
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/metric_def.h b/tensorflow/core/lib/monitoring/metric_def.h
index 116a73823d789a01b5782fe771400b355592c80d..f046842618a03f7a161a11d3b493b71be50ad988 100644
--- a/tensorflow/core/lib/monitoring/metric_def.h
+++ b/tensorflow/core/lib/monitoring/metric_def.h
@@ -28,38 +28,21 @@ namespace monitoring {
 // The different metric kinds available.
 //
 // Gauge indicates that the metric's values are instantaneous measurements of a
-// (typically) continuously varying quantity. Examples: a process's current heap
-// size, a queue's current length.
+// (typically) continuously varying value. Examples: a process's current heap
+// size, a queue's current length, the name of the binary used by a process,
+// whether a task is complete.
 //
 // Cumulative indicates that the metric's values represent non-negative changes
 // over specified time periods. Example: the number of rpc calls to a service.
 enum class MetricKind : int { kGauge = 0, kCumulative };
 
 // The type of the metric values.
-enum class ValueType : int { kInt64 = 0, kHistogram };
+enum class ValueType : int { kInt64 = 0, kHistogram, kString, kBool };
 
 // Everything in the internal namespace is implementation details. Do not depend
 // on this.
 namespace internal {
 
-// Ensures that the string is a compile-time string literal.
-class StringLiteral {
- public:
-  // We allow implicit conversions here on purpose.
-  template <int N>
-  StringLiteral(const char (&data)[N]) : literal_(data, N - 1) {}
-
-  // This ctor will be called for non-literals, causing compile-time failure.
-  template <typename NotStringLiteral>
-  StringLiteral(const NotStringLiteral& not_string_literal) = delete;
-
-  // Implicit conversion to StringPiece.
-  operator StringPiece() const { return literal_; }
-
- private:
-  const StringPiece literal_;
-};
-
 template <typename Value>
 ValueType GetValueType();
 
@@ -73,6 +56,16 @@ inline ValueType GetValueType<HistogramProto>() {
   return ValueType::kHistogram;
 }
 
+template <>
+inline ValueType GetValueType<string>() {
+  return ValueType::kString;
+}
+
+template <>
+inline ValueType GetValueType<bool>() {
+  return ValueType::kBool;
+}
+
 }  // namespace internal
 
 // Abstract base class for a metric definition.
@@ -92,7 +85,7 @@ class AbstractMetricDef {
 
   StringPiece description() const { return description_; }
 
-  const std::vector<StringPiece> label_descriptions() const {
+  const std::vector<string>& label_descriptions() const {
     return label_descriptions_;
   }
 
@@ -100,23 +93,21 @@ class AbstractMetricDef {
   template <MetricKind kind, typename Value, int NumLabels>
   friend class MetricDef;
 
-  AbstractMetricDef(
-      const MetricKind kind, const ValueType value_type,
-      const internal::StringLiteral name,
-      const internal::StringLiteral description,
-      const std::vector<internal::StringLiteral>& label_descriptions)
+  AbstractMetricDef(const MetricKind kind, const ValueType value_type,
+                    const StringPiece name, const StringPiece description,
+                    const std::vector<string>& label_descriptions)
       : kind_(kind),
         value_type_(value_type),
-        name_(name),
-        description_(description),
-        label_descriptions_(std::vector<StringPiece>(
-            label_descriptions.begin(), label_descriptions.end())) {}
+        name_(name.ToString()),
+        description_(description.ToString()),
+        label_descriptions_(std::vector<string>(label_descriptions.begin(),
+                                                label_descriptions.end())) {}
 
   const MetricKind kind_;
   const ValueType value_type_;
-  const StringPiece name_;
-  const StringPiece description_;
-  const std::vector<StringPiece> label_descriptions_;
+  const string name_;
+  const string description_;
+  const std::vector<string> label_descriptions_;
 };
 
 // Metric definition.
@@ -124,15 +115,18 @@ class AbstractMetricDef {
 // A metric is defined by its kind, value-type, name, description and the
 // description of its labels.
 //
-// NOTE: We allow only string literals for the name, description and label
-// descriptions because these should be fixed at compile-time and shouldn't be
-// dynamic.
+// NOTE: Name, description, and label descriptions should be logically static,
+// but do not have to live for the lifetime of the MetricDef.
+//
+// By "logically static", we mean that they should never contain dynamic
+// information, but is static for the lifetime of the MetricDef, and
+// in-turn the metric; they do not need to be compile-time constants.
+// This allows for e.g. prefixed metrics in a CLIF wrapped environment.
 template <MetricKind metric_kind, typename Value, int NumLabels>
 class MetricDef : public AbstractMetricDef {
  public:
   template <typename... LabelDesc>
-  MetricDef(const internal::StringLiteral name,
-            const internal::StringLiteral description,
+  MetricDef(const StringPiece name, const StringPiece description,
             const LabelDesc&... label_descriptions)
       : AbstractMetricDef(metric_kind, internal::GetValueType<Value>(), name,
                           description, {label_descriptions...}) {
diff --git a/tensorflow/core/lib/monitoring/metric_def_test.cc b/tensorflow/core/lib/monitoring/metric_def_test.cc
index dc07a08e4feaed1045b379e2795733cb0d4f2024..66973b6b5f646218269ac5da286ceb6667d170fc 100644
--- a/tensorflow/core/lib/monitoring/metric_def_test.cc
+++ b/tensorflow/core/lib/monitoring/metric_def_test.cc
@@ -41,6 +41,24 @@ TEST(MetricDefTest, Simple) {
   EXPECT_EQ("LabelName", metric_def1.label_descriptions()[0]);
 }
 
+TEST(MetricDefTest, StringsPersist) {
+  // Ensure string attributes of the metric are copied into the metric
+  string name = "/tensorflow/metric0";
+  string description = "test description";
+  string label_description = "test label description";
+  const MetricDef<MetricKind::kCumulative, int64, 1> metric_def(
+      name, description, label_description);
+
+  // Mutate the strings
+  name[4] = 'A';
+  description[4] = 'B';
+  label_description[4] = 'C';
+
+  EXPECT_NE(name, metric_def.name());
+  EXPECT_NE(description, metric_def.description());
+  EXPECT_NE(label_description, metric_def.label_descriptions()[0]);
+}
+
 }  // namespace
 }  // namespace monitoring
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/monitoring/mobile_gauge.h b/tensorflow/core/lib/monitoring/mobile_gauge.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac13ad35c020a45770e8acd7cd0820cbc2ac8cf4
--- /dev/null
+++ b/tensorflow/core/lib/monitoring/mobile_gauge.h
@@ -0,0 +1,72 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Null implementation of the Gauge metric for mobile platforms.
+
+#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+#define THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
+
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace monitoring {
+
+// GaugeCell which has a null implementation.
+template <typename T>
+class GaugeCell {
+ public:
+ public:
+  GaugeCell() {}
+  ~GaugeCell() {}
+
+  void Set(const T& value) {}
+  T value() const { return T(); }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(GaugeCell);
+};
+
+// Gauge which has a null implementation.
+template <typename ValueType, int NumLabels>
+class Gauge {
+ public:
+  ~Gauge() {}
+
+  template <typename... MetricDefArgs>
+  static Gauge* New(MetricDefArgs&&... metric_def_args) {
+    static_assert(std::is_same<ValueType, int64>::value ||
+                      std::is_same<ValueType, string>::value,
+                  "Gauge only allows int64 and string types.");
+    return new Gauge();
+  }
+
+  template <typename... Labels>
+  GaugeCell<ValueType>* GetCell(const Labels&... labels) {
+    return &default_gauge_cell_;
+  }
+
+ private:
+  Gauge() {}
+
+  GaugeCell<ValueType> default_gauge_cell_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Gauge);
+};
+
+}  // namespace monitoring
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_CORE_LIB_MONITORING_MOBILE_GAUGE_H_
diff --git a/tensorflow/core/lib/monitoring/sampler.h b/tensorflow/core/lib/monitoring/sampler.h
index 5a4d49d5d404de6ee709af271dfc5483bc3ee2a1..c7a05428e2dced68ce3dc165616837084916f49d 100644
--- a/tensorflow/core/lib/monitoring/sampler.h
+++ b/tensorflow/core/lib/monitoring/sampler.h
@@ -159,9 +159,10 @@ class Sampler {
   // Registration handle with the CollectionRegistry.
   std::unique_ptr<CollectionRegistry::RegistrationHandle> registration_handle_;
 
-  // We use a std::map here because we give out pointers to the SamplerCells,
-  // which need to remain valid even after more cells.
   using LabelArray = std::array<string, NumLabels>;
+  // we need a container here that guarantees pointer stability of the value,
+  // namely, the pointer of the value should remain valid even after more cells
+  // are inserted.
   std::map<LabelArray, SamplerCell> cells_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(Sampler);
diff --git a/tensorflow/core/lib/random/random.cc b/tensorflow/core/lib/random/random.cc
index 723c1100f8e49f31e1e656649472eb72cec790a9..82dc82950730aa58abc52528bfdf496284634336 100644
--- a/tensorflow/core/lib/random/random.cc
+++ b/tensorflow/core/lib/random/random.cc
@@ -33,14 +33,14 @@ std::mt19937_64 InitRngWithDefaultSeed() { return std::mt19937_64(); }
 
 uint64 New64() {
   static std::mt19937_64* rng = InitRngWithRandomSeed();
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   mutex_lock l(mu);
   return (*rng)();
 }
 
 uint64 New64DefaultSeed() {
   static std::mt19937_64 rng = InitRngWithDefaultSeed();
-  static mutex mu;
+  static mutex mu(LINKER_INITIALIZED);
   mutex_lock l(mu);
   return rng();
 }
diff --git a/tensorflow/core/lib/random/random_distributions.cc b/tensorflow/core/lib/random/random_distributions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..57a7cc0866ae1832e5697f2f61fd60753933b7c0
--- /dev/null
+++ b/tensorflow/core/lib/random/random_distributions.cc
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/random/distribution_sampler.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+
+namespace tensorflow {
+namespace random {
+template <>
+void SingleSampleAdapter<PhiloxRandom>::SkipFromGenerator(uint64 num_skips) {
+  // Use the O(1) PhiloxRandom::Skip instead of the default O(N) impl.
+  generator_->Skip(num_skips);
+}
+}  // namespace random
+}  // namespace tensorflow
diff --git a/tensorflow/core/lib/random/random_distributions.h b/tensorflow/core/lib/random/random_distributions.h
index c15a6436d6ea90b8c71668cb18ad12208022de48..0e281403f8748ffbb7dbfac888cd2303c0a7253f 100644
--- a/tensorflow/core/lib/random/random_distributions.h
+++ b/tensorflow/core/lib/random/random_distributions.h
@@ -219,7 +219,37 @@ class SingleSampleAdapter {
     return unused_results_[used_result_index_++];
   }
 
+  PHILOX_DEVICE_INLINE
+  void Skip(uint64 num_skips) {
+    if (!num_skips) {
+      return;
+    }
+    int num_unused_results = kNativeElementCount - used_result_index_;
+    if (num_skips <= num_unused_results) {
+      used_result_index_ += num_skips;
+      return;
+    }
+    num_skips -= num_unused_results;
+    used_result_index_ = kNativeElementCount;
+    SkipFromGenerator(num_skips / kNativeElementCount);
+    num_skips = num_skips % kNativeElementCount;
+    if (num_skips) {
+      unused_results_ = (*generator_)();
+      used_result_index_ = num_skips;
+    }
+  }
+
  private:
+  // This implementation iteratively skips over `num_skips` samples
+  // from `generator_`. There is an O(1) implementation for PhiloxRandom
+  // in random_distributions.cc.
+  PHILOX_DEVICE_INLINE
+  void SkipFromGenerator(uint64 num_skips) {
+    while (num_skips--) {
+      (*generator_)();
+    }
+  }
+
   Generator* generator_;
   typename Generator::ResultType unused_results_;
   int used_result_index_;
diff --git a/tensorflow/core/lib/random/random_distributions_test.cc b/tensorflow/core/lib/random/random_distributions_test.cc
index 28ff5bf6e8e4d9db6a0c7baef616edba97f56521..bd574cba2f38ee23aca3dda68b9def6025bdd36e 100644
--- a/tensorflow/core/lib/random/random_distributions_test.cc
+++ b/tensorflow/core/lib/random/random_distributions_test.cc
@@ -280,6 +280,72 @@ TEST(PhiloxRandomTest, RandomParametersDoubleMomentsTest) {
   RandomParametersMomentsTest<double>(1 << 20, 40, strides, kZLimit);
 }
 
+class MockGenerator {
+ public:
+  explicit MockGenerator(uint64 seed) : counter_(seed) {}
+  using ResultType = std::vector<uint32>;
+  using ResultElementType = uint32;
+  static const int kResultElementCount = 1;
+  ResultType operator()() {
+    ResultType result;
+    result.push_back(counter_++);
+    return result;
+  }
+
+ private:
+  uint32 counter_;
+};
+
+template <typename T>
+void SingleSampleAdapterSkipTest() {
+  std::vector<uint64> skips(10);
+  std::vector<uint64> skip_afters(10);
+  std::iota(skips.begin(), skips.end(), 0);
+  std::iota(skip_afters.begin(), skip_afters.end(), 0);
+  uint64 total_samples = 100;
+  uint64 seed = GetTestSeed();
+
+  for (uint64 skip : skips) {
+    for (uint64 skip_after : skip_afters) {
+      // Baseline rngs.
+      T parent_gen(seed);
+      SingleSampleAdapter<T> gen(&parent_gen);
+
+      // Rng on which Skip() is performed.
+      T parent_gen_to_skip(seed);
+      SingleSampleAdapter<T> gen_to_skip(&parent_gen_to_skip);
+
+      // Skip over `skip_after` samples from both `gen` and `gen_to_skip`.
+      int cur = 0;
+      for (; cur < skip_after; cur++) {
+        gen();
+        gen_to_skip();
+      }
+
+      // Skip over `skip_` samples from `gen` iteratively.
+      for (; cur < skip_after + skip; cur++) {
+        gen();
+      }
+
+      // Skip over `skip_` samples from `gen_to_skip` by calling `Skip()`.
+      gen_to_skip.Skip(skip);
+
+      // Assert that they produce same outputs afterwards.
+      for (; cur < total_samples; cur++) {
+        ASSERT_EQ(gen(), gen_to_skip());
+      }
+    }
+  }
+}
+
+TEST(SingleSampleAdapterTest, PhiloxRandomSkip) {
+  SingleSampleAdapterSkipTest<PhiloxRandom>();
+}
+
+TEST(SingleSampleAdapterTest, MockGeneratorSkip) {
+  SingleSampleAdapterSkipTest<MockGenerator>();
+}
+
 }  // namespace
 }  // namespace random
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 302a6967e3a4eb355d5a5f10548f0d946b1db354..f5822fad8e3d3b8559d19c79ee2885e580ea3e11 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -81,10 +81,12 @@ T locale_independent_strtonum(const char* str, const char** endptr) {
   // number was outside the range, the stringstream sets the fail flag, but
   // returns the +/-max() value, whereas strto{f,d} functions return +/-INF.
   if (s.fail()) {
-    if (result == std::numeric_limits<T>::max()) {
+    if (result == std::numeric_limits<T>::max() ||
+        result == std::numeric_limits<T>::infinity()) {
       result = std::numeric_limits<T>::infinity();
       s.clear(s.rdstate() & ~std::ios::failbit);
-    } else if (result == -std::numeric_limits<T>::max()) {
+    } else if (result == -std::numeric_limits<T>::max() ||
+               result == -std::numeric_limits<T>::infinity()) {
       result = -std::numeric_limits<T>::infinity();
       s.clear(s.rdstate() & ~std::ios::failbit);
     }
diff --git a/tensorflow/core/lib/strings/ordered_code.h b/tensorflow/core/lib/strings/ordered_code.h
index ce823c3f872a73702c00460248b483e24f09364c..91870cfec6322a56c8917261d336e56dbca7aea7 100644
--- a/tensorflow/core/lib/strings/ordered_code.h
+++ b/tensorflow/core/lib/strings/ordered_code.h
@@ -39,11 +39,11 @@ limitations under the License.
 #define TENSORFLOW_LIB_STRINGS_ORDERED_CODE_H__
 
 #include <string>
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
-class StringPiece;
 
 namespace strings {
 
diff --git a/tensorflow/core/lib/strings/str_util.cc b/tensorflow/core/lib/strings/str_util.cc
index 8509c9a0417621f9c9550c6af92dcbf4b7075347..d28857803d7ef1edd66ae6c1a6b81a7ed1dbce85 100644
--- a/tensorflow/core/lib/strings/str_util.cc
+++ b/tensorflow/core/lib/strings/str_util.cc
@@ -84,15 +84,32 @@ inline int hex_digit_to_int(char c) {
   return x & 0xf;
 }
 
-bool CUnescapeInternal(StringPiece source, char* dest,
+bool CUnescapeInternal(StringPiece source, string* dest,
                        string::size_type* dest_len, string* error) {
-  char* d = dest;
   const char* p = source.data();
   const char* end = source.end();
   const char* last_byte = end - 1;
 
+  // We are going to write the result to dest with its iterator. If our string
+  // implementation uses copy-on-write, this will trigger a copy-on-write of
+  // dest's buffer; that is, dest will be assigned a new buffer.
+  //
+  // Note that the following way is NOT a legal way to modify a string's
+  // content:
+  //
+  //  char* d = const_cast<char*>(dest->data());
+  //
+  // This won't trigger copy-on-write of the string, and so is dangerous when
+  // the buffer is shared.
+  auto d = dest->begin();
+
   // Small optimization for case where source = dest and there's no escaping
-  while (p == d && p < end && *p != '\\') p++, d++;
+  if (source.data() == dest->data()) {
+    while (p < end && *p != '\\') {
+      p++;
+      d++;
+    }
+  }
 
   while (p < end) {
     if (*p != '\\') {
@@ -192,7 +209,7 @@ bool CUnescapeInternal(StringPiece source, char* dest,
       p++;  // read past letter we escaped
     }
   }
-  *dest_len = d - dest;
+  *dest_len = d - dest->begin();
   return true;
 }
 
@@ -215,8 +232,7 @@ bool SplitAndParseAsInts(StringPiece text, char delim,
 bool CUnescape(StringPiece source, string* dest, string* error) {
   dest->resize(source.size());
   string::size_type dest_size;
-  if (!CUnescapeInternal(source, const_cast<char*>(dest->data()), &dest_size,
-                         error)) {
+  if (!CUnescapeInternal(source, dest, &dest_size, error)) {
     return false;
   }
   dest->erase(dest_size);
@@ -407,11 +423,11 @@ bool ConsumeNonWhitespace(StringPiece* s, StringPiece* val) {
   }
   const size_t n = p - s->data();
   if (n > 0) {
-    val->set(s->data(), n);
+    *val = StringPiece(s->data(), n);
     s->remove_prefix(n);
     return true;
   } else {
-    val->clear();
+    *val = StringPiece();
     return false;
   }
 }
diff --git a/tensorflow/core/lib/strings/str_util_test.cc b/tensorflow/core/lib/strings/str_util_test.cc
index 5c735a87a39d2b7583da208edd9af35dad33c55e..d5909d17aaa7e401cf8028346783e638af47a168 100644
--- a/tensorflow/core/lib/strings/str_util_test.cc
+++ b/tensorflow/core/lib/strings/str_util_test.cc
@@ -43,6 +43,19 @@ TEST(CUnescape, Basic) {
   EXPECT_EQ("\320hi\200", ExpectCUnescapeSuccess("\\320hi\\200"));
 }
 
+TEST(CUnescape, HandlesCopyOnWriteStrings) {
+  string dest = "hello";
+  string read = dest;
+  // For std::string, read and dest now share the same buffer.
+
+  string error;
+  StringPiece source = "llohe";
+  // CUnescape is going to write "llohe" to dest, so dest's buffer will be
+  // reallocated, and read's buffer remains untouched.
+  EXPECT_TRUE(str_util::CUnescape(source, &dest, &error));
+  EXPECT_EQ("hello", read);
+}
+
 TEST(StripTrailingWhitespace, Basic) {
   string test;
   test = "hello";
diff --git a/tensorflow/core/lib/strings/strcat.cc b/tensorflow/core/lib/strings/strcat.cc
index 46a45a66783af3444589cd66eab16c427ae1b890..5b1cff486dba46ab761762b3076610e60d636711 100644
--- a/tensorflow/core/lib/strings/strcat.cc
+++ b/tensorflow/core/lib/strings/strcat.cc
@@ -45,7 +45,7 @@ AlphaNum::AlphaNum(Hex hex) {
     value >>= 4;
     mask >>= 4;
   } while (mask != 0);
-  piece_.set(writer, end - writer);
+  piece_ = StringPiece(writer, end - writer);
 }
 
 // ----------------------------------------------------------------------
diff --git a/tensorflow/core/ops/array_grad.cc b/tensorflow/core/ops/array_grad.cc
index 325dbc48835d2f975ecd2530486be239fdcf96c6..38bd851da89357238360dcb3dd465b5e4f6a5fdd 100644
--- a/tensorflow/core/ops/array_grad.cc
+++ b/tensorflow/core/ops/array_grad.cc
@@ -333,6 +333,25 @@ Status TransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Transpose", TransposeGrad);
 
+Status ConjugateTransposeGrad(const AttrSlice& attrs, FunctionDef* g) {
+  *g = FDH::Define(
+      // Arg defs
+      {"x: T", "p: int32", "dy: T"},
+      // Ret val defs
+      {"dx: T", "dp: int32"},
+      // Attr defs
+      {"T: type"},
+      // Nodes
+      {
+          {{"q"}, "InvertPermutation", {"p"}, {}},
+          {{"dx"}, "ConjugateTranspose", {"dy", "q"}, {{"T", "$T"}}},
+          {{"dp"}, "ZerosLike", {"p"}, {{"T", DT_INT32}}},
+      });
+  VLOG(1) << "ConjugateTransposeGrad " << DebugString(*g);
+  return Status::OK();
+}
+REGISTER_OP_GRADIENT("ConjugateTranspose", ConjugateTransposeGrad);
+
 Status ReverseGrad(const AttrSlice& attrs, FunctionDef* g) {
   *g = FDH::Define(
       // Arg defs
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index cdf370399c0cf892f5003bddc99ae2ac259cad22..5a31f433cee88e8ef6ecf6dcc85d735997a9805a 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -706,6 +706,26 @@ memory_region_name: Name of readonly memory region used by the tensor, see
   NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
 )doc");
 
+REGISTER_OP("GuaranteeConst")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      return UnchangedShape(c);
+    })
+    // We don't want this to be optimized away.
+    .SetIsStateful()
+    .Doc(R"(
+Gives a guarantee to the TF runtime that the input tensor is a constant.
+
+The runtime is then free to make optimizations based on this.
+
+Only accepts value typed tensors as inputs and rejects resource variable handles
+as input.
+
+Returns the input tensor without modification.
+)");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("ZerosLike")
     .Input("x: T")
@@ -723,7 +743,9 @@ y: a tensor of the same shape and type as x but filled with zeros.
 REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "int64, complex64, complex128, bool}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns a tensor of ones with the same shape and type as x.
@@ -736,7 +758,7 @@ y: a tensor of the same shape and type as x but filled with ones.
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -774,7 +796,7 @@ diagonal: Rank k tensor where k is at most 1.
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {float, double, int32, int64, complex64, complex128}")
+    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -1057,9 +1079,8 @@ REGISTER_OP("Reverse")
     .Input("dims: bool")
     .Output("output: T")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, float, "
-        "double, complex64, "
-        "complex128, string}")
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, "
+        "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle dims;
@@ -1135,9 +1156,8 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, float, "
-        "double, complex64, "
-        "complex128, string}")
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       ShapeHandle axis;
@@ -1702,6 +1722,20 @@ REGISTER_OP("Identity")
 Return a tensor with the same shape and contents as the input tensor or value.
 )Doc");
 
+REGISTER_OP("Snapshot")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    })
+    .Doc(R"Doc(Returns a copy of the input tensor.)Doc");
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklIdentity")
     .Input("input: T")
@@ -1832,7 +1866,7 @@ this operation.
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("message: string")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
@@ -2031,6 +2065,46 @@ y: 1-D.
 idx: 1-D.
 )doc");
 
+REGISTER_OP("UniqueV2")
+    .Input("x: T")
+    .Input("axis: int64")
+    .Output("y: T")
+    .Output("idx: out_idx")
+    .Attr("T: type")
+    .Attr("out_idx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Vector(InferenceContext::kUnknownDim));
+      c->set_output(1, c->input(0));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Finds unique elements in a 1-D tensor.
+
+This operation returns a tensor `y` containing all of the unique elements of `x`
+sorted in the same order that they occur in `x`. This operation also returns a
+tensor `idx` the same size as `x` that contains the index of each value of `x`
+in the unique output `y`. In other words:
+
+`y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+
+For example:
+
+```
+# tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+y, idx = unique(x)
+y ==> [1, 2, 4, 7, 8]
+idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+```
+
+
+x: A `Tensor`.
+axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+  find the unique elements.
+y: A `Tensor`. Unique elements along the `axis` of `Tensor` x.
+idx: A 1-D Tensor. Has the same type as x that contains the index of each
+  value of x in the output y.
+)doc");
+
 // --------------------------------------------------------------------------
 REGISTER_OP("UniqueWithCounts")
     .Input("x: T")
@@ -2378,6 +2452,7 @@ REGISTER_OP("Slice")
           TF_RETURN_IF_ERROR(
               c->WithRank(begin_value, c->Rank(sizes_value), &begin_value));
           std::vector<DimensionHandle> dims;
+          dims.reserve(c->Rank(sizes_value));
           for (int i = 0; i < c->Rank(sizes_value); ++i) {
             dims.emplace_back(c->Dim(sizes_value, i));
           }
@@ -4184,7 +4259,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
@@ -4328,7 +4403,7 @@ with the following options:
   "NHWC": `[ batch, height, width, channels ]`
   "NCHW": `[ batch, channels, height, width ]`
   "NCHW_VECT_C":
-      `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
+      `qint8 [ batch, channels / 4, height, width, 4 ]`
 
 It is useful to consider the operation as transforming a 6-D Tensor.
 e.g. for data_format = NHWC,
@@ -4522,12 +4597,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {float, double, int64, int32, uint8, uint16, int8, int16,"
+        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
         " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
         " half}")
     .Attr(
-        "type: {float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
+        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
         " half}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
@@ -4739,7 +4814,7 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Deprecated(22, "Replaced by QuantizeAndDequantizeV2")
     .Doc(R"doc(
@@ -4755,7 +4830,7 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -4834,7 +4909,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 94eb120175555d8d51b9be1ff98676a9dc4fff07..c8ea443613656b418dd88fc4a1b9343101d754eb 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -158,6 +158,13 @@ TEST(ArrayOpsTest, UnchangedShapes_ShapeFn) {
   INFER_OK(op, "[1,2,?,4,5];?;?", "in0");
 }
 
+TEST(ArrayOpsTest, GuaranteeConst_ShapeFn) {
+  ShapeInferenceTestOp op("GuaranteeConst");
+  INFER_OK(op, "?", "in0");
+  INFER_OK(op, "[]", "in0");
+  INFER_OK(op, "[1,2,?,4,5]", "in0");
+}
+
 TEST(ArrayOpsTest, Identity_ShapeFnHandles) {
   const char* op_name = "Identity";
   ShapeInferenceTestOp op(op_name);
@@ -514,7 +521,7 @@ TEST(ArrayOpsTest, MatrixSetDiag_ShapeFn) {
   INFER_ERROR("Dimensions must be equal, but are 2 and 3", op, "[2,3];[3]");
 
   // Output matches input.
-  INFER_OK(op, "?;?", "?");
+  INFER_OK(op, "?;?", "in0");
   INFER_OK(op, "[1,2,2];[1,2]", "in0");
   INFER_OK(op, "[1,2,3];?", "in0");
   INFER_OK(op, "[1,3,2];?", "in0");
@@ -1612,7 +1619,7 @@ TEST(ArrayOpsTest, UnchangedWithQuantizationScalars_ShapeFn) {
 TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannel) {
   ShapeInferenceTestOp op("FakeQuantWithMinMaxVarsPerChannel");
 
-  INFER_OK(op, "?;?;?", "?");
+  INFER_OK(op, "?;?;?", "in0");
   INFER_OK(op, "[?];?;?", "in0");
   INFER_OK(op, "[1,?,3];[3];[3]", "in0");
   INFER_OK(op, "[3];[3];[3]", "in0");
@@ -1631,7 +1638,7 @@ TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannel) {
 TEST(ArrayOpsTest, FakeQuantWithMinMaxVarsPerChannelGradient) {
   ShapeInferenceTestOp op("FakeQuantWithMinMaxVarsPerChannelGradient");
 
-  INFER_OK(op, "?;?;?;?", "?;[?];[?]");
+  INFER_OK(op, "?;?;?;?", "in0;[?];[?]");
   INFER_OK(op, "[3];[3];[3];[3]", "in0;in3;in3");
   INFER_OK(op, "[1,3];[1,3];[3];[3]", "in0;in3;in3");
   INFER_OK(op, "[1,2,3,4];[1,2,3,4];[4];[4]", "in0;in3;in3");
diff --git a/tensorflow/core/ops/checkpoint_ops.cc b/tensorflow/core/ops/checkpoint_ops.cc
index b49d7b4d40f6a79e966c0ab8f2ef8cedb004b27c..08b00c8255c8e44cea9a2e0d4c97378ecc3bb998 100644
--- a/tensorflow/core/ops/checkpoint_ops.cc
+++ b/tensorflow/core/ops/checkpoint_ops.cc
@@ -22,6 +22,7 @@ REGISTER_OP("GenerateVocabRemapping")
     .Input("old_vocab_file: string")
     .Attr("new_vocab_offset: int >= 0")
     .Attr("num_new_vocab: int >= 0")
+    .Attr("old_vocab_size: int >= -1 = -1")
     .Output("remapping: int64")
     .Output("num_present: int32")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -43,7 +44,11 @@ Given a path to new and old vocabulary files, returns a remapping Tensor of
 length `num_new_vocab`, where `remapping[i]` contains the row number in the old
 vocabulary that corresponds to row `i` in the new vocabulary (starting at line
 `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-in the new vocabulary is not in the old vocabulary.  `num_vocab_offset` enables
+in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+default value of -1.
+
+`num_vocab_offset` enables
 use in the partitioned variable case, and should generally be set through
 examining partitioning info.  The format of the files should be a text file,
 with each line containing a single entity within the vocabulary.
@@ -69,6 +74,8 @@ new_vocab_file: Path to the new vocab file.
 old_vocab_file: Path to the old vocab file.
 new_vocab_offset: How many entries into the new vocab file to start reading.
 num_new_vocab: Number of entries in the new vocab file to remap.
+old_vocab_size: Number of entries in the old vocab file to consider.  If -1,
+  use the entire old vocabulary.
 remapping: A Tensor of length num_new_vocab where the element at index i
   is equal to the old ID that maps to the new ID i.  This element is -1 for any
   new ID that is not found in the old vocabulary.
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index f385ef54f1c3a3ca22f49427b75e55c9e240936b..c7a296d9381b5263617ae9cb014856f234733fd9 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -1536,6 +1536,75 @@ op {
     }
   }
 }
+op {
+  name: "ApplyAddSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyCenteredRMSProp"
   input_arg {
@@ -2228,6 +2297,75 @@ op {
     }
   }
 }
+op {
+  name: "ApplyPowerSign"
+  input_arg {
+    name: "var"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "ApplyProximalAdagrad"
   input_arg {
@@ -6059,6 +6197,33 @@ op {
     type: "list(float)"
   }
 }
+op {
+  name: "BytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -8960,6 +9125,57 @@ op {
     }
   }
 }
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
 op {
   name: "DecodeGif"
   input_arg {
@@ -9811,6 +10027,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+}
 op {
   name: "DestroyResourceOp"
   input_arg {
@@ -13292,6 +13531,44 @@ op {
     has_minimum: true
   }
 }
+op {
+  name: "GenerateVocabRemapping"
+  input_arg {
+    name: "new_vocab_file"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "old_vocab_file"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "remapping"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_present"
+    type: DT_INT32
+  }
+  attr {
+    name: "new_vocab_offset"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_new_vocab"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "old_vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+}
 op {
   name: "GetSessionHandle"
   input_arg {
@@ -14889,6 +15166,18 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "IteratorSetStatsAggregator"
+  input_arg {
+    name: "iterator_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stats_aggregator_handle"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
 op {
   name: "IteratorToStringHandle"
   input_arg {
@@ -15080,6 +15369,33 @@ op {
     }
   }
 }
+op {
+  name: "LatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "LearnedUnigramCandidateSampler"
   input_arg {
@@ -16601,6 +16917,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatrixInverse"
   input_arg {
@@ -21558,6 +21897,52 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "PriorityQueue"
   output_arg {
@@ -23968,6 +24353,60 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RandomPoisson"
+  input_arg {
+    name: "shape"
+    type_attr: "S"
+  }
+  input_arg {
+    name: "rate"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 25
+  }
+  is_stateful: true
+}
 op {
   name: "RandomPoissonV2"
   input_arg {
@@ -26128,57 +26567,122 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "ResourceApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAdagradDA"
   input_arg {
@@ -26232,6 +26736,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -26245,21 +26751,25 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagradDA"
+  name: "ResourceApplyAdam"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "m"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "v"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
     type_attr: "T"
   }
   input_arg {
@@ -26267,16 +26777,20 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -26297,8 +26811,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -26382,6 +26894,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -26445,6 +26964,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -26465,7 +26986,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ResourceApplyAddSign"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -26474,32 +26995,20 @@ op {
     name: "m"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "v"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "beta1_power"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "beta2_power"
-    type_attr: "T"
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "beta1"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "beta2"
+    name: "sign_decay"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "beta"
     type_attr: "T"
   }
   input_arg {
@@ -26537,13 +27046,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   is_stateful: true
 }
 op {
@@ -27172,6 +27674,69 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyPowerSign"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyProximalAdagrad"
   input_arg {
@@ -38183,6 +38748,40 @@ op {
     }
   }
 }
+op {
+  name: "StatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "StopGradient"
   input_arg {
@@ -40010,6 +40609,63 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayV3"
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+  attr {
+    name: "dynamic_size"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "clear_after_read"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "identical_element_shapes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "tensor_array_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayWrite"
   input_arg {
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 3b1ed217ce1b444b0601d5a1b1d599489ee33644..ac2dc601f1f6b48905f1269b8726ac30ba5dda67 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -1346,6 +1346,7 @@ REGISTER_OP("TensorArrayV3")
     .Attr("element_shape: shape = { unknown_rank: true }")
     .Attr("dynamic_size: bool = false")
     .Attr("clear_after_read: bool = true")
+    .Attr("identical_element_shapes: bool = false")
     .Attr("tensor_array_name: string = ''")
     .Output("handle: resource")
     .Output("flow: float")
@@ -1374,6 +1375,12 @@ dynamic_size: A boolean that determines whether writes to the TensorArray
 clear_after_read: If true (default), Tensors in the TensorArray are cleared
   after being read.  This disables multiple read semantics but allows early
   release of memory.
+identical_element_shapes: If true (default is false), then all
+  elements in the TensorArray will be expected to have have identical shapes.
+  This allows certain behaviors, like dynamically checking for
+  consistent shapes on write, and being able to fill in properly
+  shaped zero tensors on stack -- even if the element_shape attribute
+  is not fully defined.
 tensor_array_name: Overrides the name used for the temporary tensor_array
   resource. Default value is the name of the 'TensorArray' op (which
   is guaranteed unique).
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index f5122139645e2d3360bdcdbde29335ccaca79fbb..be415313473c17c1eba34d83d164dae0d3c927cf 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -151,6 +151,28 @@ REGISTER_OP("IgnoreErrorsDataset")
 Creates a dataset that contains the elements of `input_dataset` ignoring errors.
 )doc");
 
+REGISTER_OP("BytesProducedStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+)doc");
+
+REGISTER_OP("LatencyStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Records the latency of producing `input_dataset` elements in a StatsAggregator.
+)doc");
+
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -447,6 +469,24 @@ stop: corresponds to stop in python's xrange().
 step: corresponds to step in python's xrange().
 )doc");
 
+REGISTER_OP("RandomDataset")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Creates a Dataset that returns pseudorandom numbers.
+
+seed: A scalar seed for the random number generator. If either seed or
+  seed2 is set to be non-zero, the random number generator is seeded
+  by the given seed.  Otherwise, a random seed is used.
+seed2: A second scalar seed to avoid seed collision.
+)doc");
+
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -744,4 +784,29 @@ serialized: A variant tensor storing the state of the iterator contained in the
   resource.
 )doc");
 
+REGISTER_OP("StatsAggregatorHandle")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Doc(R"doc(
+Creates a statistics manager resource.
+)doc");
+
+REGISTER_OP("IteratorSetStatsAggregator")
+    .Input("iterator_handle: resource")
+    .Input("stats_aggregator_handle: resource")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Associates the given iterator with the given statistics aggregator.
+)doc");
+
+REGISTER_OP("StatsAggregatorSummary")
+    .Input("iterator: resource")
+    .Output("summary: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Doc(R"doc(
+Produces a summary of any statistics recorded by the given statistics manager.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index c3f80064150ba0dcce1173de1d02142cf3dc6621..13fbd2fa515c5a7e0ec06cdc4c585f4dc691a928 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -818,8 +818,8 @@ bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
 bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
 height of the underlying image.
 
-For example, if an image is 100 x 200 pixels (height x width) and the bounding 
-box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of 
+For example, if an image is 100 x 200 pixels (height x width) and the bounding
+box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
 the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
 
 Parts of the bounding box may fall outside the image.
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 4851619f833beff71e1976e63a60cd81fb78eff8..53e2360d2321a21c658f5abb87bfbc78e2564f26 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -282,6 +282,33 @@ Equivalent to np.linalg.inv
 @end_compatibility
 )doc");
 
+REGISTER_OP("MatrixExponential")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: {double, float, complex64, complex128}")
+    .SetShapeFn(BatchUnchangedSquareShapeFn)
+    .Doc(R"doc(
+Computes the matrix exponential of one or more square matrices:
+
+exp(A) = \sum_{n=0}^\infty A^n/n!
+
+The exponential is computed using a combination of the scaling and squaring
+method and the Pade approximation. Details can be founds in:
+Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices. The output is a tensor of the same shape as the input
+containing the exponential for all input submatrices `[..., :, :]`.
+
+input: Shape is `[..., M, M]`.
+output: Shape is `[..., M, M]`.
+
+@compatibility(scipy)
+Equivalent to scipy.linalg.expm
+@end_compatibility
+)doc");
+
 REGISTER_OP("Cholesky")
     .Input("input: T")
     .Output("output: T")
diff --git a/tensorflow/core/ops/logging_ops.cc b/tensorflow/core/ops/logging_ops.cc
index 11cb9861a395ce39974b4b36453578957e9efb3b..e6995821df700ef6d6a736645e4d18c961b089a8 100644
--- a/tensorflow/core/ops/logging_ops.cc
+++ b/tensorflow/core/ops/logging_ops.cc
@@ -43,7 +43,7 @@ REGISTER_OP("Print")
     .Output("output: T")
     .SetIsStateful()
     .Attr("T: type")
-    .Attr("U: list(type)")
+    .Attr("U: list(type) >= 0")
     .Attr("message: string = ''")
     .Attr("first_n: int = -1")
     .Attr("summarize: int = 3")
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 7b10af9f44dad7c9a28a7c37d57b3b5a69cc36a1..8ea170ba14355d06cc6cd19f306674000fe3bda3 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -85,7 +85,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, float, double, int32, complex64, complex128}")
+    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -184,7 +184,7 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes the absolute value of a tensor.
@@ -210,29 +210,31 @@ value is computed as \\( \sqrt{a^2 + b^2}\\).
 )doc");
 
 // Declares cwise unary operations signature: 't -> 't
-#define UNARY()                                                              \
-  Input("x: T")                                                              \
-      .Output("y: T")                                                        \
-      .Attr("T: {half, float, double, int32, int64, complex64, complex128}") \
+#define UNARY()                                                          \
+  Input("x: T")                                                          \
+      .Output("y: T")                                                    \
+      .Attr(                                                             \
+          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_REAL()                    \
-  Input("x: T")                         \
-      .Output("y: T")                   \
-      .Attr("T: {half, float, double}") \
+#define UNARY_REAL()                              \
+  Input("x: T")                                   \
+      .Output("y: T")                             \
+      .Attr("T: {half, bfloat16, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_COMPLEX()                                        \
-  Input("x: T")                                                \
-      .Output("y: T")                                          \
-      .Attr("T: {half, float, double, complex64, complex128}") \
+#define UNARY_COMPLEX()                                                  \
+  Input("x: T")                                                          \
+      .Output("y: T")                                                    \
+      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
-#define UNARY_GRADIENT_COMPLEX()                               \
-  Input("y: T")                                                \
-      .Input("dy: T")                                          \
-      .Output("z: T")                                          \
-      .Attr("T: {half, float, double, complex64, complex128}") \
+#define UNARY_GRADIENT_COMPLEX()                                         \
+  Input("y: T")                                                          \
+      .Input("dy: T")                                                    \
+      .Output("z: T")                                                    \
+      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 REGISTER_OP("Neg")
@@ -481,7 +483,7 @@ Computes atan of x element-wise.
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns which elements of x are NaN.
@@ -494,7 +496,7 @@ Equivalent to np.isnan
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns which elements of x are Inf.
@@ -507,7 +509,7 @@ Equivalent to np.isinf
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns which elements of x are finite.
@@ -520,7 +522,9 @@ Equivalent to np.isfinite
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns an element-wise indication of the sign of a number.
@@ -533,7 +537,7 @@ For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns element-wise largest integer not greater than x.
@@ -542,7 +546,7 @@ Returns element-wise largest integer not greater than x.
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns element-wise smallest integer in not less than x.
@@ -551,7 +555,7 @@ Returns element-wise smallest integer in not less than x.
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Returns element-wise integer closest to x.
@@ -569,22 +573,23 @@ rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
-#define BINARY_MORE()                                                       \
-  Input("x: T").Input("y: T").Output("z: T").Attr(                          \
-      "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, " \
-      "complex64, complex128}")
+#define BINARY_MORE()                                                          \
+  Input("x: T").Input("y: T").Output("z: T").Attr(                             \
+      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "int64, complex64, complex128}")
 
-#define BINARY_FEWER()                             \
-  Input("x: T").Input("y: T").Output("z: T").Attr( \
-      "T: {half, float, double, int32, int64, complex64, complex128}")
+#define BINARY_FEWER()                                               \
+  Input("x: T").Input("y: T").Output("z: T").Attr(                   \
+      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "complex128}")
 
 REGISTER_OP("Add")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
-        "complex128, string}")
+        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "complex64, complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns x + y element-wise.
@@ -600,8 +605,8 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, float, double, uint8, int8, int16, int32, int64, complex64, "
-        "complex128}")
+        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
     .SetIsCommutative()
@@ -757,7 +762,7 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
@@ -788,7 +793,7 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64}")
+    .Attr("T: {half, bfloat16, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
@@ -802,7 +807,7 @@ REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
+    .Attr("T: {int32, int64, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns element-wise remainder of division. This emulates C semantics in that
@@ -817,7 +822,7 @@ REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
+    .Attr("T: {int32, int64, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
@@ -832,7 +837,7 @@ REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, float, double}")
+    .Attr("T: {int32, int64, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Returns element-wise remainder of division. This emulates C semantics in that
@@ -847,7 +852,9 @@ REGISTER_OP("Pow")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Computes the power of one value to another.
@@ -946,7 +953,7 @@ REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .Doc(R"doc(
 Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
@@ -1064,15 +1071,15 @@ Returns the truth value of (x >= y) element-wise.
 
 // --------------------------------------------------------------------------
 
-#define EQUALITY_COMPARISON()                                           \
-  Input("x: T")                                                         \
-      .Input("y: T")                                                    \
-      .Output("z: bool")                                                \
-      .SetIsCommutative()                                               \
-      .Attr(                                                            \
-          "T: {half, float, double, uint8, int8, int16, int32, int64, " \
-          "complex64, "                                                 \
-          "quint8, qint8, qint32, string, bool, complex128}")           \
+#define EQUALITY_COMPARISON()                                              \
+  Input("x: T")                                                            \
+      .Input("y: T")                                                       \
+      .Output("z: bool")                                                   \
+      .SetIsCommutative()                                                  \
+      .Attr(                                                               \
+          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "int64, complex64, quint8, qint8, qint32, string, bool, "        \
+          "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
 
 REGISTER_OP("Equal")
@@ -1291,7 +1298,7 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, float, double, int32, complex64, complex128}")
+    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
     .SetShapeFn(shape_inference::MatMulShape)
     .Doc(R"doc(
 Multiply the matrix "a" by the matrix "b".
@@ -1625,6 +1632,45 @@ Status SparseSegmentReductionGradShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+Status SparseSegmentReductionWithNumSegmentsShapeFn(InferenceContext* c) {
+  ShapeHandle data_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &data_shape));
+
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &indices_shape));
+
+  ShapeHandle segment_ids_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &segment_ids_shape));
+
+  ShapeHandle num_segments_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &num_segments_shape));
+
+  // indices and segment_ids should merge cleanly.
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->Merge(indices_shape, segment_ids_shape, &unused));
+
+  ShapeHandle subshape;
+  TF_RETURN_IF_ERROR(c->Subshape(data_shape, 1, &subshape));
+
+  ShapeHandle out;
+  const Tensor* dim0 = c->input_tensor(3);
+  if (dim0 == nullptr) {
+    // We don't have the value at inference time, so the output
+    // shape is unknown.
+    TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(InferenceContext::kUnknownDim),
+                                      subshape, &out));
+  } else {
+    auto dim0_value = dim0->scalar<int32>()();
+    if (dim0_value < 0) {
+      return errors::InvalidArgument(
+          "Cannot specify a negative value for num_segments");
+    }
+    TF_RETURN_IF_ERROR(c->Concatenate(c->Vector(dim0_value), subshape, &out));
+  }
+  c->set_output(0, out);
+  return Status::OK();
+}
+
 Status UnsortedSegmentReductionShapeFn(InferenceContext* c) {
   ShapeHandle s_data = c->input(0);
   ShapeHandle s_segment_ids = c->input(1);
@@ -1811,10 +1857,11 @@ output: Has same shape as data, except for dimension 0 which
 REGISTER_OP("UnsortedSegmentSum")
     .Input("data: T")
     .Input("segment_ids: Tindices")
-    .Input("num_segments: int32")
+    .Input("num_segments: Tnumsegments")
     .Output("output: T")
     .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn)
     .Doc(R"doc(
 Computes the sum along segments of a tensor.
@@ -1829,6 +1876,8 @@ need not be sorted and need not cover all values in the full
 range of valid values.
 
 If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+If the given segment ID `i` is negative, the value is dropped and will not be
+added to the sum of the segment.
 
 `num_segments` should equal the number of distinct segment IDs.
 
@@ -1847,10 +1896,11 @@ output: Has same shape as data, except for the first `segment_ids.rank`
 REGISTER_OP("UnsortedSegmentMax")
     .Input("data: T")
     .Input("segment_ids: Tindices")
-    .Input("num_segments: int32")
+    .Input("num_segments: Tnumsegments")
     .Output("output: T")
     .Attr("T: realnumbertype")
     .Attr("Tindices: {int32,int64}")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn)
     .Doc(R"doc(
 Computes the Max along segments of a tensor.
@@ -1879,6 +1929,7 @@ output: Has same shape as data, except for dimension 0 which
 has size `num_segments`.
 
 )doc");
+
 REGISTER_OP("SparseSegmentSum")
     .Input("data: T")
     .Input("indices: Tidx")
@@ -1927,6 +1978,56 @@ output: Has same shape as data, except for dimension 0 which
   has size `k`, the number of segments.
 )doc");
 
+REGISTER_OP("SparseSegmentSumWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: realnumbertype")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn)
+    .Doc(R"doc(
+Computes the sum along sparse segments of a tensor.
+
+Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+tf.sparse_segment_sum_with_num_segments(
+    c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+# => [[0 0 0 0]
+#     [0 0 0 0]
+#     [0 0 0 0]]
+
+tf.sparse_segment_sum_with_num_segments(c,
+                                        tf.constant([0, 1]),
+                                        tf.constant([0, 2],
+                                        num_segments=4))
+# => [[ 1  2  3  4]
+#     [ 0  0  0  0]
+#     [-1 -2 -3 -4]
+#     [ 0  0  0  0]]
+```
+
+indices: A 1-D tensor. Has same rank as `segment_ids`.
+
+segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+
+num_segments: Should equal the number of distinct segment IDs.
+
+output: Has same shape as data, except for dimension 0 which
+  has size `num_segments`.
+)doc");
+
 REGISTER_OP("SparseSegmentMean")
     .Input("data: T")
     .Input("indices: Tidx")
@@ -1953,6 +2054,35 @@ output: Has same shape as data, except for dimension 0 which
 
 )doc");
 
+REGISTER_OP("SparseSegmentMeanWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn)
+    .Doc(R"doc(
+Computes the mean along sparse segments of a tensor.
+
+Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+indices: A 1-D tensor. Has same rank as `segment_ids`.
+
+segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+
+num_segments: Should equal the number of distinct segment IDs.
+
+output: Has same shape as data, except for dimension 0 which has size
+    `num_segments`.
+)doc");
+
 REGISTER_OP("SparseSegmentMeanGrad")
     .Input("grad: T")
     .Input("indices: Tidx")
@@ -1999,6 +2129,38 @@ output: Has same shape as data, except for dimension 0 which
 
 )doc");
 
+REGISTER_OP("SparseSegmentSqrtNWithNumSegments")
+    .Input("data: T")
+    .Input("indices: Tidx")
+    .Input("segment_ids: int32")
+    .Input("num_segments: Tnumsegments")
+    .Output("output: T")
+    .Attr("T: {float, double}")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .Attr("Tnumsegments: {int32,int64} = DT_INT32")
+    .SetShapeFn(SparseSegmentReductionWithNumSegmentsShapeFn)
+    .Doc(R"doc(
+Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+
+N is the size of the segment being reduced.
+
+Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+misisng, the `output` tensor at that position will be zeroed.
+
+Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+segments.
+
+indices: A 1-D tensor. Has same rank as `segment_ids`.
+
+segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+
+num_segments: Should equal the number of distinct segment IDs.
+
+output: Has same shape as data, except for dimension 0 which
+  has size `k`, the number of segments.
+
+)doc");
+
 REGISTER_OP("SparseSegmentSqrtNGrad")
     .Input("grad: T")
     .Input("indices: Tidx")
@@ -2101,7 +2263,7 @@ REGISTER_OP("Range")
     .Input("limit: Tidx")
     .Input("delta: Tidx")
     .Output("output: Tidx")
-    .Attr("Tidx: {float, double, int32, int64} = DT_INT32")
+    .Attr("Tidx: {bfloat16, float, double, int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_WITH_CONTEXT_IF_ERROR(c->WithRank(c->input(0), 0, &unused),
@@ -2156,7 +2318,7 @@ REGISTER_OP("LinSpace")
     .Input("stop: T")
     .Input("num: Tidx")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
@@ -2329,11 +2491,25 @@ REGISTER_OP("Cross")
     .Input("b: T")
     .Output("product: T")
     .Attr("T: realnumbertype")
-    // TODO(cwhipkey): implement these shape inference constraints here:
-    // * Both inputs have the same shape.
-    // * Input rank >= 1.
-    // * input_shape[-1] == 3.
-    .SetShapeFn(shape_inference::UnchangedShape)
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle a_shape;
+      ShapeHandle b_shape;
+      // * Input rank >= 1.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &a_shape));
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &b_shape));
+
+      // * Both inputs have the same shape.
+      TF_RETURN_IF_ERROR(c->Merge(a_shape, b_shape, &a_shape));
+
+      // * input_shape[-1] == 3.
+      if (c->RankKnown(a_shape)) {
+        int rank = c->Rank(a_shape);
+        auto dim = c->Dim(a_shape, rank - 1);
+        TF_RETURN_IF_ERROR(c->WithValue(dim, 3, &dim));
+      }
+      c->set_output(0, a_shape);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Compute the pairwise cross product.
 
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 28f9969de56c93556f4746acae1a2887c27b5b98..ca3772e6f89805b70f05f1c9fd5e36ee99f2d510 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -515,4 +515,15 @@ TEST(MathOpstest, RequantizationRange_ShapeFn) {
   INFER_ERROR("must be rank 0", op, "?;?;[2]");
 }
 
+TEST(MathOpsTest, Cross_ShapeFn) {
+  ShapeInferenceTestOp op("Cross");
+
+  INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "[];[]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but", op, "[3];[5]");
+  INFER_ERROR("Dimension must be 3 but", op, "[3,5];[3,5]");
+
+  INFER_OK(op, "?;?", "in0");
+  INFER_OK(op, "[?];[?]", "in0");
+  INFER_OK(op, "[1,?,3];[?,?,?]", "in0");
+}
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index de059a3e7ef2f4a732df27bff86cad79edd53541..0e91572d0eb7274dcf1e44a61d1dbfcb12eaccae 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -73,7 +73,7 @@ REGISTER_OP("AvgPool")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::AvgPoolShape)
     .Doc(R"doc(
 Performs average pooling on the input.
@@ -101,7 +101,7 @@ REGISTER_OP("AvgPoolGrad")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -300,7 +300,7 @@ REGISTER_OP("FusedBatchNormV2")
     .Output("batch_variance: U")
     .Output("reserve_space_1: U")
     .Output("reserve_space_2: U")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
@@ -359,7 +359,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
 y_backprop: A 4D Tensor for the gradient with respect to y.
 x: A 4D Tensor for input data.
 scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: When is_training is True, a 1D Tensor for the computed batch 
+reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
                  mean to be reused in gradient computation. When is_training is
                  False, a 1D Tensor for the population mean to be reused in both
                  1st and 2nd order gradient computation.
@@ -393,7 +393,7 @@ REGISTER_OP("FusedBatchNormGradV2")
     .Output("offset_backprop: U")
     .Output("reserve_space_3: U")
     .Output("reserve_space_4: U")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("U: {float}")
     .Attr("epsilon: float = 0.0001")
     .Attr("data_format: string = 'NHWC'")
@@ -407,7 +407,7 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
 y_backprop: A 4D Tensor for the gradient with respect to y.
 x: A 4D Tensor for input data.
 scale: A 1D Tensor for scaling factor, to scale the normalized x.
-reserve_space_1: When is_training is True, a 1D Tensor for the computed batch 
+reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
                  mean to be reused in gradient computation. When is_training is
                  False, a 1D Tensor for the population mean to be reused in both
                  1st and 2nd order gradient computation.
@@ -508,11 +508,12 @@ REGISTER_OP("Conv2D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Computes a 2-D convolution given 4-D `input` and `filter` tensors.
@@ -546,7 +547,7 @@ filter: A 4-D tensor of shape
 output: A 4-D tensor. The dimension order is determined by the value of
     `data_format`, see below for details.
 strides: 1-D tensor of length 4.  The stride of the sliding window for each
-  dimension of `input`. The dimension order is determined by the value of
+    dimension of `input`. The dimension order is determined by the value of
     `data_format`, see below for details.
 padding: The type of padding algorithm to use.
 data_format: Specify the data format of the input and output data. With the
@@ -554,6 +555,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 )doc");
 
 REGISTER_OP("Conv2DBackpropInput")
@@ -561,11 +567,12 @@ REGISTER_OP("Conv2DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -589,10 +596,15 @@ padding: The type of padding algorithm to use.
 output: 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
   w.r.t. the input of the convolution.
 data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
+  default format "NHWC", the data is stored in the order of:
+      [batch, in_height, in_width, in_channels].
+  Alternatively, the format could be "NCHW", the data storage order of:
+      [batch, in_channels, in_height, in_width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 )doc");
 
 // TODO(jeff): Instead of 'use_cudnn_for_gpu', maybe we should have a
@@ -603,11 +615,12 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {half, float}")
+    .Attr("T: {half, bfloat16, float}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -632,10 +645,15 @@ output: 4-D with shape
   `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
   the `filter` input of the convolution.
 data_format: Specify the data format of the input and output data. With the
-    default format "NHWC", the data is stored in the order of:
-        [batch, in_height, in_width, in_channels].
-    Alternatively, the format could be "NCHW", the data storage order of:
-        [batch, in_channels, in_height, in_width].
+  default format "NHWC", the data is stored in the order of:
+      [batch, in_height, in_width, in_channels].
+  Alternatively, the format could be "NCHW", the data storage order of:
+      [batch, in_channels, in_height, in_width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 )doc");
 
 namespace {
@@ -733,6 +751,40 @@ Status CommonFusedConvCalculations(InferenceContext* c, bool has_resize) {
 
 }  // namespace
 
+REGISTER_OP("DataFormatDimMap")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .Attr("src_format: string = 'NHWC'")
+    .Attr("dst_format: string = 'NCHW'")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Returns the dimension index in the destination data format given the one in
+the source data format.
+
+x: Scalar. Dimension index in source data format. Must be in the range [-4, 4).
+y: Scalar. Dimension index in destination data format.
+src_format: source data format.
+dst_format: destination data format.
+)doc");
+
+REGISTER_OP("DataFormatVecPermute")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: {int32, int64} = DT_INT32")
+    .Attr("src_format: string = 'NHWC'")
+    .Attr("dst_format: string = 'NCHW'")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Returns the permuted vector in the destination data format given the one in
+the source data format.
+
+x: Vector in source data format. Must be of size 4.
+y: Vector in destination data format. Must be of size 4.
+src_format: source data format.
+dst_format: destination data format.
+)doc");
+
 REGISTER_OP("FusedResizeAndPadConv2D")
     .Input("input: T")
     .Input("size: int32")
@@ -819,10 +871,11 @@ REGISTER_OP("DepthwiseConv2dNative")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn(shape_inference::DepthwiseConv2DNativeShape)
     .Doc(R"doc(
 Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
@@ -845,7 +898,6 @@ for k in 0..in_channels-1
 
 Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
 horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-
 strides: 1-D of length 4.  The stride of the sliding window for each dimension
   of `input`.
 padding: The type of padding algorithm to use.
@@ -854,6 +906,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 )doc");
 
 REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
@@ -861,10 +918,11 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -892,6 +950,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 output: 4-D with shape according to `data_format`.  For example, if
   `data_format` is 'NHWC', output shape is `[batch, in_height,
   in_width, in_channels]`.  Gradient w.r.t. the input of the
@@ -903,10 +966,11 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -935,6 +999,11 @@ data_format: Specify the data format of the input and output data. With the
         [batch, height, width, channels].
     Alternatively, the format could be "NCHW", the data storage order of:
         [batch, channels, height, width].
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+  `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+  element on that dimension. The dimension order is determined by the value of
+  `data_format`, see above for details. Dilations in the batch and depth
+  dimensions must be 1.
 output: 4-D with shape
   `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
   the `filter` input of the convolution.
@@ -945,10 +1014,11 @@ REGISTER_OP("Conv3D")
     .Input("input: T")
     .Input("filter: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn(shape_inference::Conv3DShape)
     .Doc(R"doc(
 Computes a 3-D convolution given 5-D `input` and `filter` tensors.
@@ -970,6 +1040,11 @@ data_format: The data format of the input and output data. With the
         [batch, in_depth, in_height, in_width, in_channels].
     Alternatively, the format could be "NCDHW", the data storage order is:
         [batch, in_channels, in_depth, in_height, in_width].
+dilations: 1-D tensor of length 5.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 )doc");
 
 REGISTER_OP("Conv3DBackpropInput")
@@ -977,7 +1052,7 @@ REGISTER_OP("Conv3DBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropInputV2")
@@ -1003,7 +1078,7 @@ REGISTER_OP("Conv3DBackpropFilter")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Deprecated(10, "Use Conv3DBackpropFilterV2")
@@ -1032,10 +1107,11 @@ REGISTER_OP("Conv3DBackpropInputV2")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1061,6 +1137,11 @@ data_format: The data format of the input and output data. With the
         [batch, in_depth, in_height, in_width, in_channels].
     Alternatively, the format could be "NCDHW", the data storage order is:
         [batch, in_channels, in_depth, in_height, in_width].
+dilations: 1-D tensor of length 5.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 
 )doc");
 
@@ -1069,10 +1150,11 @@ REGISTER_OP("Conv3DBackpropFilterV2")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
@@ -1098,6 +1180,11 @@ data_format: The data format of the input and output data. With the
         [batch, in_depth, in_height, in_width, in_channels].
     Alternatively, the format could be "NCDHW", the data storage order is:
         [batch, in_channels, in_depth, in_height, in_width].
+dilations: 1-D tensor of length 5.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 
 )doc");
 
@@ -1110,7 +1197,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D average pooling on the input.
@@ -1137,7 +1224,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float, double}")
+    .Attr("T: {bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -1172,7 +1259,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float}")
+    .Attr("T: {bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape)
     .Doc(R"doc(
 Performs 3D max pooling on the input.
@@ -1200,8 +1287,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {float} = DT_FLOAT")
-    .Attr("TInput: {float} = DT_FLOAT")
+    .Attr("T: {bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     })
@@ -1266,7 +1353,7 @@ data_format: The data format of the input and output data. With the
 REGISTER_OP("L2Loss")
     .Input("t: T")
     .Output("output: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::ScalarShape)
     .Doc(R"doc(
 L2 Loss.
@@ -1288,7 +1375,7 @@ REGISTER_OP("LRN")
     .Attr("bias: float = 1.0")
     .Attr("alpha: float = 1.0")
     .Attr("beta: float = 0.5")
-    .Attr("T: {float, half} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 4);
     })
@@ -1323,7 +1410,7 @@ REGISTER_OP("LRNGrad")
     .Attr("bias: float = 1.0")
     .Attr("alpha: float = 1.0")
     .Attr("beta: float = 0.5")
-    .Attr("T: {float, half} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &s));  // input_grads
@@ -1349,8 +1436,8 @@ output: The gradients for LRN.
 
 REGISTER_OP("MaxPool")
     .Attr(
-        "T: {float, double, int32, int64, uint8, int16, int8, uint16, "
-        "half, qint8} = DT_FLOAT")
+        "T: {half, bfloat16, float, double, int32, int64, uint8, int16, int8, "
+        "uint16, qint8} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
     .Attr("strides: list(int) >= 4")
     .Attr(GetPaddingAttrString())
@@ -1376,8 +1463,8 @@ output: The max pooled output tensor.
 
 REGISTER_OP("MaxPoolV2")
     .Attr(
-        "T: {float, double, int32, int64, uint8, int16, int8, uint16, "
-        "half, qint8} = DT_FLOAT")
+        "T: {half, bfloat16, float, double, int32, int64, uint8, int16, int8, "
+        "uint16, qint8} = DT_FLOAT")
     .Attr(GetPaddingAttrString())
     .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
     .Input("input: T")
@@ -1860,7 +1947,7 @@ backprops: The gradients:
 REGISTER_OP("Elu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
@@ -1873,7 +1960,7 @@ REGISTER_OP("EluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Computes gradients for the exponential linear (Elu) operation.
@@ -1887,7 +1974,7 @@ backprops: The gradients: `gradients * (outputs + 1)` if outputs < 0,
 REGISTER_OP("Selu")
     .Input("features: T")
     .Output("activations: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
@@ -1900,7 +1987,7 @@ REGISTER_OP("SeluGrad")
     .Input("gradients: T")
     .Input("outputs: T")
     .Output("backprops: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
     .Doc(R"doc(
 Computes gradients for the scaled exponential linear (Selu) operation.
@@ -1962,7 +2049,7 @@ backprops: The gradients: `gradients / (1 + abs(features)) ** 2`.
 REGISTER_OP("Softmax")
     .Input("logits: T")
     .Output("softmax: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
     })
@@ -1982,7 +2069,7 @@ softmax: Same shape as `logits`.
 REGISTER_OP("LogSoftmax")
     .Input("logits: T")
     .Output("logsoftmax: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
     })
@@ -2004,7 +2091,7 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
     .Input("labels: T")
     .Output("loss: T")
     .Output("backprop: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
@@ -2033,7 +2120,7 @@ REGISTER_OP("SparseSoftmaxCrossEntropyWithLogits")
     .Input("labels: Tlabels")
     .Output("loss: T")
     .Output("backprop: T")
-    .Attr("T: {half, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("Tlabels: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle features;
@@ -2290,7 +2377,7 @@ REGISTER_OP("NthElement")
       return Status::OK();
     })
     .Doc(R"doc(
-Finds values of the `n`-th order statistic for the last dmension.
+Finds values of the `n`-th order statistic for the last dimension.
 
 If the input is a vector (rank-1), finds the entries which is the nth-smallest
 value in the vector and outputs their values as scalar tensor.
@@ -2613,6 +2700,7 @@ REGISTER_OP("QuantizedConv2D")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::Conv2DShape(c));
       ShapeHandle unused;
@@ -2641,7 +2729,11 @@ min_filter: The float value that the lowest quantized filter value represents.
 max_filter: The float value that the highest quantized filter value represents.
 min_output: The float value that the lowest quantized output value represents.
 max_output: The float value that the highest quantized output value represents.
-
+dilations: 1-D tensor of length 4.  The dilation factor for each dimension of
+    `input`. If set to k > 1, there will be k-1 skipped cells between each
+    filter element on that dimension. The dimension order is determined by the
+    value of `data_format`, see above for details. Dilations in the batch and
+    depth dimensions must be 1.
 )doc");
 
 REGISTER_OP("QuantizedMaxPool")
@@ -2866,6 +2958,25 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyConv2DWithBias")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("bias: T")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Doc(R"doc(
+Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
+does not perform anything. It is just created as an intermediate output of
+merging Conv2D and BiasAdd.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DWithBias")
     .Input("input: T")
     .Input("filter: T")
@@ -2919,6 +3030,88 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyConv2DBackpropFilterWithBias")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Output("output: T")
+    .Output("bias_grad: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape;
+      // Fetch the data_format attribute, which may not exist.
+      string data_format;
+      Status s = c->GetAttr("data_format", &data_format);
+
+      if (s.ok() && data_format == "NCHW") {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+      } else {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+      }
+      ShapeHandle sh;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+      TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+      c->set_output(0, sh);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Dummy node that enables fusing Conv2DBackpropFilter and BiasAddGrad operator
+for MKL. This node does not perform anything. It is just created as an
+intermediate output of merging Conv2DBackpropFilter and BiasAddGrad.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklConv2DBackpropFilterWithBias")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter_size: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("bias_grad: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_bias_grad: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input_shape;
+      // Fetch the data_format attribute, which may not exist.
+      string data_format;
+      Status s = c->GetAttr("data_format", &data_format);
+
+      if (s.ok() && data_format == "NCHW") {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -3)));
+      } else {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape));
+        c->set_output(1, c->Vector(c->Dim(input_shape, -1)));
+      }
+      ShapeHandle sh;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &sh));
+      TF_RETURN_IF_ERROR(c->WithRank(sh, 4, &sh));
+      c->set_output(0, sh);
+      return Status::OK();
+    })
+    .Doc(R"doc(
+MKL version of Conv2DBackpropFilterWithBias. Uses MKL DNN APIs to compute the
+gradients of convolution with respect to the filter.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -2995,6 +3188,78 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklElu")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Elu operator. Uses MKL DNN APIs to implement Elu operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklEluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of EluGrad operator. Uses MKL DNN APIs to compute Elu
+gradients for Elu operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklSoftmax")
+    .Input("logits: T")
+    .Input("mkl_logits: uint8")
+    .Output("softmax: T")
+    .Output("mkl_softmax: uint8")
+    .Attr("T: {half, float, double}")
+    .SetShapeFn([](InferenceContext* c) {
+      return shape_inference::UnchangedShapeWithRankAtLeast(c, 1);
+    })
+    .Doc(R"doc(
+MKL version of ReluGrad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for Relu operation.
+)doc");
+
+REGISTER_OP("_MklTanh")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of Tanh operator. Uses MKL DNN APIs to implement Tanh operator.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklTanhGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: realnumbertype")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of TanhGrad operator. Uses MKL DNN APIs to compute tanh
+gradients for Tanh operation.
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklMaxPool")
     .Attr("T: {float, half} = DT_FLOAT")
     .Attr("ksize: list(int) >= 4")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 4017a46521335c5f645ac9a95f4fde4d86cb642c..9c41957ae6aa4ae1a893f09b6e5282a123831e38 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -1103,6 +1103,86 @@ op {
   summary: "Update \'*var\' according to the Adam algorithm."
   description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
 }
+op {
+  name: "ApplyAddSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- (alpha + sign_decay * sign(g) *sign(m)) * g\nvariable <- variable - lr_t * update"
+}
 op {
   name: "ApplyCenteredRMSProp"
   input_arg {
@@ -1506,6 +1586,86 @@ op {
   summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
   description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
 }
+op {
+  name: "ApplyPowerSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type_attr: "T"
+    is_ref: true
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "out"
+    description: "Same as \"var\"."
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g\nvariable <- variable - lr_t * update"
+}
 op {
   name: "ApplyProximalAdagrad"
   input_arg {
@@ -4270,6 +4430,34 @@ op {
   summary: "Bucketizes \'input\' based on \'boundaries\'."
   description: "For example, if the inputs are\n    boundaries = [0, 10, 100]\n    input = [[-5, 10000]\n             [150,   10]\n             [5,    100]]\n\nthen the output will be\n    output = [[0, 3]\n              [3, 2]\n              [1, 3]]"
 }
+op {
+  name: "BytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
+}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -5261,6 +5449,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5327,6 +5516,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5382,6 +5572,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5447,6 +5638,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5502,6 +5694,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -6412,6 +6605,7 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
         type: DT_STRING
@@ -7130,6 +7324,32 @@ op {
   summary: "Deserialize and concatenate `SparseTensors` from a serialized minibatch."
   description: "The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where\n`N` is the minibatch size and the rows correspond to packed outputs of\n`SerializeSparse`.  The ranks of the original `SparseTensor` objects\nmust all match.  When the final `SparseTensor` is created, it has rank one\nhigher than the ranks of the incoming `SparseTensor` objects\n(they have been concatenated along a new row dimension).\n\nThe output `SparseTensor` object\'s shape values for all dimensions but the\nfirst are the max across the input `SparseTensor` objects\' shape values\nfor the corresponding dimensions.  Its first shape value is `N`, the minibatch\nsize.\n\nThe input `SparseTensor` objects\' indices are assumed ordered in\nstandard lexicographic order.  If this is not the case, after this\nstep run `SparseReorder` to restore index ordering.\n\nFor example, if the serialized input is a `[2 x 3]` matrix representing two\noriginal `SparseTensor` objects:\n\n    index = [ 0]\n            [10]\n            [20]\n    values = [1, 2, 3]\n    shape = [50]\n\nand\n\n    index = [ 2]\n            [10]\n    values = [4, 5]\n    shape = [30]\n\nthen the final deserialized `SparseTensor` will be:\n\n    index = [0  0]\n            [0 10]\n            [0 20]\n            [1  2]\n            [1 10]\n    values = [1, 2, 3, 4, 5]\n    shape = [2 50]"
 }
+op {
+  name: "DeserializeSparse"
+  input_arg {
+    name: "serialized_sparse"
+    description: "The serialized `SparseTensor` objects. The last dimension\nmust have 3 columns."
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "sparse_values"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "sparse_shape"
+    type: DT_INT64
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    description: "The `dtype` of the serialized `SparseTensor` objects."
+  }
+  summary: "Deserialize `SparseTensor` objects."
+}
 op {
   name: "DestroyResourceOp"
   input_arg {
@@ -10076,8 +10296,18 @@ op {
     description: "Number of entries in the new vocab file to remap."
     has_minimum: true
   }
+  attr {
+    name: "old_vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    description: "Number of entries in the old vocab file to consider.  If -1,\nuse the entire old vocabulary."
+    has_minimum: true
+    minimum: -1
+  }
   summary: "Given a path to new and old vocabulary files, returns a remapping Tensor of"
-  description: "length `num_new_vocab`, where `remapping[i]` contains the row number in the old\nvocabulary that corresponds to row `i` in the new vocabulary (starting at line\n`new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`\nin the new vocabulary is not in the old vocabulary.  `num_vocab_offset` enables\nuse in the partitioned variable case, and should generally be set through\nexamining partitioning info.  The format of the files should be a text file,\nwith each line containing a single entity within the vocabulary.\n\nFor example, with `new_vocab_file` a text file containing each of the following\nelements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],\n`num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be\n`[0, -1, 2]`.\n\nThe op also returns a count of how many entries in the new vocabulary\nwere present in the old vocabulary, which is used to calculate the number of\nvalues to initialize in a weight matrix remapping\n\nThis functionality can be used to remap both row vocabularies (typically,\nfeatures) and column vocabularies (typically, classes) from TensorFlow\ncheckpoints.  Note that the partitioning logic relies on contiguous vocabularies\ncorresponding to div-partitioned variables.  Moreover, the underlying remapping\nuses an IndexTable (as opposed to an inexact CuckooTable), so client code should\nuse the corresponding index_table_from_file() as the FeatureColumn framework\ndoes (as opposed to tf.feature_to_id(), which uses a CuckooTable)."
+  description: "length `num_new_vocab`, where `remapping[i]` contains the row number in the old\nvocabulary that corresponds to row `i` in the new vocabulary (starting at line\n`new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`\nin the new vocabulary is not in the old vocabulary.  The old vocabulary is\nconstrained to the first `old_vocab_size` entries if `old_vocab_size` is not the\ndefault value of -1.\n\n`num_vocab_offset` enables\nuse in the partitioned variable case, and should generally be set through\nexamining partitioning info.  The format of the files should be a text file,\nwith each line containing a single entity within the vocabulary.\n\nFor example, with `new_vocab_file` a text file containing each of the following\nelements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],\n`num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be\n`[0, -1, 2]`.\n\nThe op also returns a count of how many entries in the new vocabulary\nwere present in the old vocabulary, which is used to calculate the number of\nvalues to initialize in a weight matrix remapping\n\nThis functionality can be used to remap both row vocabularies (typically,\nfeatures) and column vocabularies (typically, classes) from TensorFlow\ncheckpoints.  Note that the partitioning logic relies on contiguous vocabularies\ncorresponding to div-partitioned variables.  Moreover, the underlying remapping\nuses an IndexTable (as opposed to an inexact CuckooTable), so client code should\nuse the corresponding index_table_from_file() as the FeatureColumn framework\ndoes (as opposed to tf.feature_to_id(), which uses a CuckooTable)."
 }
 op {
   name: "GetSessionHandle"
@@ -11464,6 +11694,19 @@ op {
   summary: "Gets the next output from the given iterator."
   is_stateful: true
 }
+op {
+  name: "IteratorSetStatsAggregator"
+  input_arg {
+    name: "iterator_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stats_aggregator_handle"
+    type: DT_RESOURCE
+  }
+  summary: "Associates the given iterator with the given statistics aggregator."
+  is_stateful: true
+}
 op {
   name: "IteratorToStringHandle"
   input_arg {
@@ -11660,6 +11903,34 @@ op {
   }
   summary: "Gradients for Local Response Normalization."
 }
+op {
+  name: "LatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
+}
 op {
   name: "LearnedUnigramCandidateSampler"
   input_arg {
@@ -13117,6 +13388,33 @@ op {
   summary: "Returns the batched diagonal part of a batched tensor."
   description: "This operation returns a tensor with the `diagonal` part\nof the batched `input`. The `diagonal` part is computed as follows:\n\nAssume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a\ntensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:\n\n`diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.\n\nThe input must be at least a matrix.\n\nFor example:\n\n```\n# \'input\' is [[[1, 0, 0, 0]\n               [0, 2, 0, 0]\n               [0, 0, 3, 0]\n               [0, 0, 0, 4]],\n              [[5, 0, 0, 0]\n               [0, 6, 0, 0]\n               [0, 0, 7, 0]\n               [0, 0, 0, 8]]]\n\nand input.shape = (2, 4, 4)\n\ntf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]\n\nwhich has shape (2, 4)\n```"
 }
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    description: "Shape is `[..., M, M]`."
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    description: "Shape is `[..., M, M]`.\n\n@compatibility(scipy)\nEquivalent to scipy.linalg.expm\n@end_compatibility"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  summary: "Computes the matrix exponential of one or more square matrices:"
+  description: "exp(A) = \\sum_{n=0}^\\infty A^n/n!\n\nThe exponential is computed using a combination of the scaling and squaring\nmethod and the Pade approximation. Details can be founds in:\nNicholas J. Higham, \"The scaling and squaring method for the matrix exponential\nrevisited,\" SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.\n\nThe input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions\nform square matrices. The output is a tensor of the same shape as the input\ncontaining the exponential for all input submatrices `[..., :, :]`."
+}
 op {
   name: "MatrixInverse"
   input_arg {
@@ -15422,7 +15720,7 @@ op {
       }
     }
   }
-  summary: "Finds values of the `n`-th order statistic for the last dmension."
+  summary: "Finds values of the `n`-th order statistic for the last dimension."
   description: "If the input is a vector (rank-1), finds the entries which is the nth-smallest\nvalue in the vector and outputs their values as scalar tensor.\n\nFor matrices (resp. higher rank input), computes the entries which is the\nnth-smallest value in each row (resp. vector along the last dimension). Thus,\n\n    values.shape = input.shape[:-1]"
 }
 op {
@@ -16939,7 +17237,6 @@ op {
     name: "U"
     type: "list(type)"
     has_minimum: true
-    minimum: 1
   }
   attr {
     name: "message"
@@ -19365,17 +19662,14 @@ op {
   name: "RandomPoisson"
   input_arg {
     name: "shape"
-    description: "1-D integer tensor. Shape of independent samples to draw from each\ndistribution described by the shape parameters given in rate."
     type_attr: "S"
   }
   input_arg {
     name: "rate"
-    description: "A tensor in which each scalar is a \"rate\" parameter describing the\nassociated poisson distribution."
     type_attr: "dtype"
   }
   output_arg {
     name: "output"
-    description: "A tensor with shape `shape + shape(rate)`. Each slice\n`[:, ..., :, i0, i1, ...iN]` contains the samples drawn for\n`rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of\nrate."
     type_attr: "dtype"
   }
   attr {
@@ -19384,7 +19678,6 @@ op {
     default_value {
       i: 0
     }
-    description: "If either `seed` or `seed2` are set to be non-zero, the random number\ngenerator is seeded by the given seed.  Otherwise, it is seeded by a\nrandom seed."
   }
   attr {
     name: "seed2"
@@ -19392,7 +19685,6 @@ op {
     default_value {
       i: 0
     }
-    description: "A second seed to avoid seed collision."
   }
   attr {
     name: "S"
@@ -19415,8 +19707,11 @@ op {
       }
     }
   }
-  summary: "Outputs random values from the Poisson distribution(s) described by rate."
-  description: "This op uses two algorithms, depending on rate. If rate >= 10, then\nthe algorithm by Hormann is used to acquire samples via\ntransformation-rejection.\nSee http://www.sciencedirect.com/science/article/pii/0167668793909974.\n\nOtherwise, Knuth\'s algorithm is used to acquire samples via multiplying uniform\nrandom variables.\nSee Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer\nProgramming, Volume 2. Addison Wesley"
+  summary: "Use RandomPoissonV2 instead."
+  deprecation {
+    version: 25
+    explanation: "Replaced by RandomPoissonV2"
+  }
   is_stateful: true
 }
 op {
@@ -21643,6 +21938,79 @@ op {
   description: "lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)\nm_t <- beta1 * m_{t-1} + (1 - beta1) * g_t\nv_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t\nvariable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)"
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAddSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "alpha"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- (alpha + sign_decay * sign(g) *sign(m)) * g\nvariable <- variable - lr_t * update"
+  is_stateful: true
+}
 op {
   name: "ResourceApplyCenteredRMSProp"
   input_arg {
@@ -22008,6 +22376,79 @@ op {
   description: "want to use Nesterov momentum.\n\naccum = accum * momentum + grad\nvar -= lr * accum"
   is_stateful: true
 }
+op {
+  name: "ResourceApplyPowerSign"
+  input_arg {
+    name: "var"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    description: "Should be from a Variable()."
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    description: "Scaling factor. Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "logbase"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sign_decay"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta"
+    description: "Must be a scalar."
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    description: "The gradient."
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If `True`, updating of the var and m tensors is\nprotected by a lock; otherwise the behavior is undefined, but may exhibit less\ncontention."
+  }
+  summary: "Update \'*var\' according to the AddSign update."
+  description: "m_t <- beta1 * m_{t-1} + (1 - beta1) * g\nupdate <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g\nvariable <- variable - lr_t * update"
+  is_stateful: true
+}
 op {
   name: "ResourceApplyProximalAdagrad"
   input_arg {
@@ -29918,6 +30359,42 @@ op {
   summary: "Outputs deterministic pseudorandom values from a truncated normal distribution."
   description: "The generated values follow a normal distribution with mean 0 and standard\ndeviation 1, except that values whose magnitude is more than 2 standard\ndeviations from the mean are dropped and re-picked.\n\nThe outputs are a deterministic function of `shape` and `seed`."
 }
+op {
+  name: "StatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  summary: "Creates a statistics manager resource."
+  is_stateful: true
+}
+op {
+  name: "StatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  summary: "Produces a summary of any statistics recorded by the given statistics manager."
+  is_stateful: true
+}
 op {
   name: "StopGradient"
   input_arg {
@@ -31734,6 +32211,14 @@ op {
     }
     description: "If true (default), Tensors in the TensorArray are cleared\nafter being read.  This disables multiple read semantics but allows early\nrelease of memory."
   }
+  attr {
+    name: "identical_element_shapes"
+    type: "bool"
+    default_value {
+      b: false
+    }
+    description: "If true (default is false), then all\nelements in the TensorArray will be expected to have have identical shapes.\nThis allows certain behaviors, like dynamically checking for\nconsistent shapes on write, and being able to fill in properly\nshaped zero tensors on stack -- even if the element_shape attribute\nis not fully defined."
+  }
   attr {
     name: "tensor_array_name"
     type: "string"
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index eee1ed1d2aa20c3d340d49ad83a69c9963ff4ef1..31d9c82e537d170bb13aa381c4a0a47feb98172b 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -29,7 +29,7 @@ REGISTER_OP("RandomUniform")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -87,7 +87,7 @@ REGISTER_OP("RandomStandardNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -115,7 +115,7 @@ REGISTER_OP("ParameterizedTruncatedNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -145,7 +145,7 @@ REGISTER_OP("TruncatedNormal")
     .Output("output: dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
-    .Attr("dtype: {half,float,double}")
+    .Attr("dtype: {half,bfloat16,float,double}")
     .Attr("T: {int32, int64}")
     .SetShapeFn(shape_inference::RandomShape)
     .Doc(R"doc(
@@ -201,10 +201,11 @@ REGISTER_OP("Multinomial")
     .SetIsStateful()
     .Input("logits: T")
     .Input("num_samples: int32")
-    .Output("output: int64")
+    .Output("output: output_dtype")
     .Attr("seed: int = 0")
     .Attr("seed2: int = 0")
     .Attr("T: realnumbertype")
+    .Attr("output_dtype: {int32, int64} = DT_INT64")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle logits_shape;
       ShapeHandle unused;
@@ -265,8 +266,6 @@ output: A tensor with shape `shape + shape(alpha)`. Each slice
   `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
 )doc");
 
-// TODO(dhananayn): Deprecate RandomPoisson and switch over to RandomPoissonV2
-// after forward compatibility period has passed.
 REGISTER_OP("RandomPoisson")
     .SetIsStateful()
     .Input("shape: S")
@@ -283,32 +282,9 @@ REGISTER_OP("RandomPoisson")
       c->set_output(0, out);
       return Status::OK();
     })
+    .Deprecated(25, "Replaced by RandomPoissonV2")
     .Doc(R"doc(
-Outputs random values from the Poisson distribution(s) described by rate.
-
-This op uses two algorithms, depending on rate. If rate >= 10, then
-the algorithm by Hormann is used to acquire samples via
-transformation-rejection.
-See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-
-Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-random variables.
-See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-Programming, Volume 2. Addison Wesley
-
-shape: 1-D integer tensor. Shape of independent samples to draw from each
-  distribution described by the shape parameters given in rate.
-rate: A tensor in which each scalar is a "rate" parameter describing the
-  associated poisson distribution.
-seed: If either `seed` or `seed2` are set to be non-zero, the random number
-  generator is seeded by the given seed.  Otherwise, it is seeded by a
-  random seed.
-seed2: A second seed to avoid seed collision.
-
-output: A tensor with shape `shape + shape(rate)`. Each slice
-  `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-  `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
-  rate.
+Use RandomPoissonV2 instead.
 )doc");
 
 REGISTER_OP("RandomPoissonV2")
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index cdfbec85cf1194d02c81cb4a3d66563dc85dfa57..bf9e673e8e46381fa655f37eff4a08b3f3dca38b 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -204,7 +204,10 @@ Status VariableShapeShapeFn(InferenceContext* c) {
   if (handle_data == nullptr || handle_data->empty()) {
     return errors::InvalidArgument("Handle doesn't have shape information.");
   }
-  c->set_output(0, (*handle_data)[0].shape);
+  ShapeHandle var_shape = (*handle_data)[0].shape;
+  int64 rank = c->RankKnown(var_shape) ? c->Rank(var_shape)
+                                       : InferenceContext::kUnknownDim;
+  c->set_output(0, c->Vector(rank));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/ops/script_ops.cc b/tensorflow/core/ops/script_ops.cc
index 8197327b562c5296e4bcbe43ce9ca81696dedf8b..c7c594a999a87682e36de6af54e7d7ede4486ca9 100644
--- a/tensorflow/core/ops/script_ops.cc
+++ b/tensorflow/core/ops/script_ops.cc
@@ -51,4 +51,18 @@ REGISTER_OP("PyFuncStateless")
 A stateless version of PyFunc.
 )doc");
 
+REGISTER_OP("EagerPyFunc")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("token: string")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >=0")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Eagerly executes a python function to compute func(input)->output. The
+semantics of the input, output, and attributes are the same as those for
+PyFunc.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index 646c37958662b1791af6d54e914d20d058feef6c..99f61a3054563bcf66757bbd1496bb1ee1ae7a3f 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -190,7 +190,8 @@ REGISTER_OP("SerializeSparse")
     .Input("sparse_values: T")
     .Input("sparse_shape: int64")
     .Attr("T: type")
-    .Output("serialized_sparse: string")
+    .Output("serialized_sparse: out_type")
+    .Attr("out_type: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
@@ -200,11 +201,13 @@ REGISTER_OP("SerializeSparse")
       return Status::OK();
     })
     .Doc(R"doc(
-Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 
 sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
 sparse_values: 1-D.  The `values` of the `SparseTensor`.
 sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+out_type: The `dtype` to use for serialization; the supported types are `string`
+  (default) and `variant`.
 )doc");
 
 REGISTER_OP("SerializeManySparse")
@@ -212,7 +215,8 @@ REGISTER_OP("SerializeManySparse")
     .Input("sparse_values: T")
     .Input("sparse_shape: int64")
     .Attr("T: type")
-    .Output("serialized_sparse: string")
+    .Output("serialized_sparse: out_type")
+    .Attr("out_type: {string, variant} = DT_STRING")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &unused));
@@ -222,7 +226,7 @@ REGISTER_OP("SerializeManySparse")
       return Status::OK();
     })
     .Doc(R"doc(
-Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 
 The `SparseTensor` must have rank `R` greater than 1, and the first dimension
 is treated as the minibatch dimension.  Elements of the `SparseTensor`
@@ -235,14 +239,83 @@ The minibatch size `N` is extracted from `sparse_shape[0]`.
 sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
 sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
 sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+out_type: The `dtype` to use for serialization; the supported types are `string`
+  (default) and `variant`.
+)doc");
+
+REGISTER_OP("DeserializeSparse")
+    .Input("serialized_sparse: Tserialized")
+    .Output("sparse_indices: int64")
+    .Output("sparse_values: dtype")
+    .Output("sparse_shape: int64")
+    .Attr("dtype: type")
+    .Attr("Tserialized: {string, variant} = DT_STRING")
+    .SetShapeFn([](InferenceContext* c) {
+      // serialized sparse is [?, ..., ?, 3] vector.
+      DimensionHandle unused;
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(c->input(0), -1), 3, &unused));
+      c->set_output(0, c->Matrix(InferenceContext::kUnknownDim,
+                                 InferenceContext::kUnknownDim));
+      c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
+      c->set_output(2, c->Vector(InferenceContext::kUnknownDim));
+      return Status::OK();
+    })
+    .Doc(R"doc(
+Deserialize `SparseTensor` objects.
+
+The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+the last dimension stores serialized `SparseTensor` objects and the other N
+dimensions (N >= 0) correspond to a batch. The ranks of the original
+`SparseTensor` objects must all match. When the final `SparseTensor` is
+created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+the sparse tensors have been concatenated along new dimensions, one for each
+batch.
+
+The output `SparseTensor` object's shape values for the original dimensions
+are the max across the input `SparseTensor` objects' shape values for the
+corresponding dimensions. The new dimensions match the size of the batch.
+
+The input `SparseTensor` objects' indices are assumed ordered in
+standard lexicographic order.  If this is not the case, after this
+step run `SparseReorder` to restore index ordering.
+
+For example, if the serialized input is a `[2 x 3]` matrix representing two
+original `SparseTensor` objects:
+
+    index = [ 0]
+            [10]
+            [20]
+    values = [1, 2, 3]
+    shape = [50]
+
+and
+
+    index = [ 2]
+            [10]
+    values = [4, 5]
+    shape = [30]
+
+then the final deserialized `SparseTensor` will be:
+
+    index = [0  0]
+            [0 10]
+            [0 20]
+            [1  2]
+            [1 10]
+    values = [1, 2, 3, 4, 5]
+    shape = [2 50]
+
+serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+  must have 3 columns.
+dtype: The `dtype` of the serialized `SparseTensor` objects.
 )doc");
 
 REGISTER_OP("DeserializeManySparse")
     .Input("serialized_sparse: string")
-    .Attr("dtype: type")
     .Output("sparse_indices: int64")
     .Output("sparse_values: dtype")
     .Output("sparse_shape: int64")
+    .Attr("dtype: type")
     .SetShapeFn([](InferenceContext* c) {
       // serialized sparse is [?,3] matrix.
       ShapeHandle serialized_sparse;
diff --git a/tensorflow/core/ops/spectral_ops_test.cc b/tensorflow/core/ops/spectral_ops_test.cc
index 0f8a3e6ef1366b2de08ee352bc54d1bf874a6bed..b1c5e95fc5ce25496d18202182cc418496349bb6 100644
--- a/tensorflow/core/ops/spectral_ops_test.cc
+++ b/tensorflow/core/ops/spectral_ops_test.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 TEST(MathOpsTest, FFT_ShapeFn) {
   for (const auto* op_name : {"FFT", "IFFT"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 1 but is rank 0", op, "[]");
     INFER_OK(op, "[?]", "in0");
     INFER_OK(op, "[1]", "in0");
@@ -31,7 +31,7 @@ TEST(MathOpsTest, FFT_ShapeFn) {
 
   for (const auto* op_name : {"FFT2D", "IFFT2D"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
     INFER_OK(op, "[?,1]", "in0");
     INFER_OK(op, "[1,2]", "in0");
@@ -40,7 +40,7 @@ TEST(MathOpsTest, FFT_ShapeFn) {
 
   for (const auto* op_name : {"FFT3D", "IFFT3D"}) {
     ShapeInferenceTestOp op(op_name);
-    INFER_OK(op, "?", "?");
+    INFER_OK(op, "?", "in0");
     INFER_ERROR("Shape must be at least rank 3 but is rank 2", op, "[1,2]");
     INFER_OK(op, "[?,1,?]", "in0");
     INFER_OK(op, "[1,2,3]", "in0");
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index da5f091e9f1988721b1947ad812851e0322efa9e..5b1f5d2477d662ca911f9d1aca6d495f1d63eb7e 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -513,6 +513,62 @@ output_ref: Same as ref. Returned as a convenience for operations that want to
   use the updated values after the update is done.
 )doc");
 
+REGISTER_OP("ResourceScatterNdUpdate")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape)
+    .Doc(R"doc(
+Applies sparse `updates` to individual values or slices within a given
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to update 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_update(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 11, 3, 10, 9, 6, 7, 12]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+
+ref: A resource handle. Must be from a VarHandleOp.
+indices: A Tensor. Must be one of the following types: int32, int64.
+  A tensor of indices into ref.
+updates: A Tensor. Must have the same type as ref. A tensor of updated
+  values to add to ref.
+use_locking: An optional bool. Defaults to True. If True, the assignment will
+  be protected by a lock; otherwise the behavior is undefined,
+  but may exhibit less contention.
+)doc");
+
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/stateless_random_ops.cc b/tensorflow/core/ops/stateless_random_ops.cc
index 7c00fdb99fb59a37751c4cb1797f7c51c801d3af..3e1f8781fcd7718e3443b0b4bee5ea5d33980524 100644
--- a/tensorflow/core/ops/stateless_random_ops.cc
+++ b/tensorflow/core/ops/stateless_random_ops.cc
@@ -38,10 +38,11 @@ static Status StatelessShape(shape_inference::InferenceContext* context) {
 #define REGISTER_STATELESS_OP(name)                  \
   REGISTER_OP(name)                                  \
       .Input("shape: T")                             \
-      .Input("seed: int64")                          \
+      .Input("seed: Tseed")                          \
       .Output("output: dtype")                       \
       .Attr("dtype: {half,float,double} = DT_FLOAT") \
       .Attr("T: {int32, int64} = DT_INT32")          \
+      .Attr("Tseed: {int32, int64} = DT_INT64")      \
       .SetShapeFn(StatelessShape)
 
 // This op is exposed through contrib/stateless only.  The interface may change.
diff --git a/tensorflow/core/ops/summary_ops.cc b/tensorflow/core/ops/summary_ops.cc
index f778b48797263e50e132ac369e70432276b7e8fb..aa7458f903cf76af660c04149ff50ac899987eac 100644
--- a/tensorflow/core/ops/summary_ops.cc
+++ b/tensorflow/core/ops/summary_ops.cc
@@ -38,6 +38,7 @@ REGISTER_OP("CreateSummaryFileWriter")
     .Input("max_queue: int32")
     .Input("flush_millis: int32")
     .Input("filename_suffix: string")
+    .SetShapeFn(shape_inference::NoOutputs)
     .Doc(R"doc(
 Creates a summary file writer accessible by the given resource handle.
 
@@ -49,6 +50,33 @@ flush_millis: How often, in milliseconds, to flush the pending events and
 filename_suffix: Every event file's name is suffixed with this suffix.
 )doc");
 
+REGISTER_OP("CreateSummaryDbWriter")
+    .Input("writer: resource")
+    .Input("db_uri: string")
+    .Input("experiment_name: string")
+    .Input("run_name: string")
+    .Input("user_name: string")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Creates summary database writer accessible by given resource handle.
+
+This can be used to write tensors from the execution graph directly
+to a database. Only SQLite is supported right now. This function
+will create the schema if it doesn't exist. Entries in the Users,
+Experiments, and Runs tables will be created automatically if they
+don't already exist.
+
+writer: Handle to SummaryWriter resource to overwrite.
+db_uri: For example "file:/tmp/foo.sqlite".
+experiment_name: Can't contain ASCII control characters or <>. Case
+  sensitive. If empty, then the Run will not be associated with any
+  Experiment.
+run_name: Can't contain ASCII control characters or <>. Case sensitive.
+  If empty, then each Tag will not be associated with any Run.
+user_name: Must be valid as both a DNS label and Linux username. If
+  empty, then the Experiment will not be associated with any User.
+)doc");
+
 REGISTER_OP("FlushSummaryWriter")
     .Input("writer: resource")
     .SetShapeFn(shape_inference::NoOutputs)
@@ -72,7 +100,7 @@ writer: A handle to the summary writer resource.
 
 REGISTER_OP("WriteSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tensor: T")
     .Input("tag: string")
     .Input("summary_metadata: string")
@@ -82,16 +110,30 @@ REGISTER_OP("WriteSummary")
 Outputs a `Summary` protocol buffer with a tensor.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tensor: A tensor to serialize.
 tag: The summary's tag.
 summary_metadata: Serialized SummaryMetadata protocol buffer containing
  plugin-related metadata for this summary.
 )doc");
 
+REGISTER_OP("ImportEvent")
+    .Input("writer: resource")
+    .Input("event: string")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Outputs a `tf.Event` protocol buffer.
+
+When CreateSummaryDbWriter is being used, this op can be useful for
+importing data from event logs.
+
+writer: A handle to a summary writer.
+event: A string containing a binary-encoded tf.Event proto.
+)doc");
+
 REGISTER_OP("WriteScalarSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("value: T")
     .Attr("T: realnumbertype")
@@ -102,14 +144,14 @@ Writes a `Summary` protocol buffer with scalar values.
 The input `tag` and `value` must have the scalars.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Tag for the summary.
 value: Value for the summary.
 )doc");
 
 REGISTER_OP("WriteHistogramSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("values: T")
     .Attr("T: realnumbertype = DT_FLOAT")
@@ -124,14 +166,14 @@ has one summary value containing a histogram for `values`.
 This op reports an `InvalidArgument` error if any value is not finite.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Scalar.  Tag to use for the `Summary.Value`.
 values: Any shape. Values to use to build the histogram.
 )doc");
 
 REGISTER_OP("WriteImageSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("tensor: T")
     .Input("bad_color: uint8")
@@ -176,7 +218,7 @@ replaced by this tensor in the output image.  The default value is the color
 red.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Scalar. Used to build the `tag` attribute of the summary values.
 tensor: 4-D of shape `[batch_size, height, width, channels]` where
   `channels` is 1, 3, or 4.
@@ -186,7 +228,7 @@ bad_color: Color to use for pixels with non-finite values.
 
 REGISTER_OP("WriteAudioSummary")
     .Input("writer: resource")
-    .Input("global_step: int64")
+    .Input("step: int64")
     .Input("tag: string")
     .Input("tensor: float")
     .Input("sample_rate: float")
@@ -208,11 +250,24 @@ build the `tag` of the summary values:
    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 
 writer: A handle to a summary writer.
-global_step: The step to write the summary for.
+step: The step to write the summary for.
 tag: Scalar. Used to build the `tag` attribute of the summary values.
 tensor: 2-D of shape `[batch_size, frames]`.
 sample_rate: The sample rate of the signal in hertz.
 max_outputs: Max number of batch elements to generate audio for.
 )doc");
 
+REGISTER_OP("WriteGraphSummary")
+    .Input("writer: resource")
+    .Input("step: int64")
+    .Input("tensor: string")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
+
+writer: Handle of `SummaryWriter`.
+step: The step to write the summary for.
+tensor: A scalar string of the serialized tf.GraphDef proto.
+)doc");
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 6f06b87d589dd4c8f41b375642da01ef37be5e67..405318caf20183ce267e84cd2554ed8c77a5b409 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -22,6 +22,48 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+const char kAddSignCommonDocStr[] = R"doc(
+Update '*var' according to the AddSign update.
+
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+variable <- variable - lr_t * update
+
+var: Should be from a Variable().
+m: Should be from a Variable().
+lr: Scaling factor. Must be a scalar.
+sign_decay: Must be a scalar.
+alpha: Must be a scalar.
+beta: Must be a scalar.
+grad: The gradient.
+)doc";
+
+const char kPowerSignCommonDocStr[] = R"doc(
+Update '*var' according to the AddSign update.
+
+m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+variable <- variable - lr_t * update
+
+var: Should be from a Variable().
+m: Should be from a Variable().
+lr: Scaling factor. Must be a scalar.
+logbase: Must be a scalar.
+sign_decay: Must be a scalar.
+beta: Must be a scalar.
+grad: The gradient.
+)doc";
+
+const char kOutDocStr[] = R"doc(
+out: Same as "var".
+)doc";
+
+const char kLockDocStr[] = R"doc(
+use_locking: If `True`, updating of the var and m tensors is
+  protected by a lock; otherwise the behavior is undefined, but may exhibit less
+  contention.
+)doc";
+
 static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) {
   auto* handle_data = c->input_handle_shapes_and_types(input);
   if (handle_data != nullptr && !handle_data->empty() &&
@@ -1796,4 +1838,99 @@ use_locking: If `True`, updating of the var, mg, ms, and mom tensors is
   contention.
 )doc");
 
+static Status ApplyAddSignShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // alpha
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // sign_decay
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ApplyAddSign")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("lr: T")
+    .Input("alpha: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAddSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kAddSignCommonDocStr, kOutDocStr, kLockDocStr));
+
+REGISTER_OP("ResourceApplyAddSign")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("lr: T")
+    .Input("alpha: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAddSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kAddSignCommonDocStr, kLockDocStr));
+
+static Status ApplyPowerSignShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));       // logbase
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // sign_delay
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 6 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ApplyPowerSign")
+    .Input("var: Ref(T)")
+    .Input("m: Ref(T)")
+    .Input("lr: T")
+    .Input("logbase: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Output("out: Ref(T)")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyPowerSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kPowerSignCommonDocStr, kOutDocStr, kLockDocStr));
+
+REGISTER_OP("ResourceApplyPowerSign")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("lr: T")
+    .Input("logbase: T")
+    .Input("sign_decay: T")
+    .Input("beta: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyPowerSignShapeFn(c, /*sparse=*/false);
+    })
+    .Doc(strings::StrCat(kPowerSignCommonDocStr, kLockDocStr));
+
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops_test.cc b/tensorflow/core/ops/training_ops_test.cc
index 92d5ad99645404d50c114df6b9a45d4af64a6481..de4e3cd9e70014ea9b29d4d473d94c0abb52eabc 100644
--- a/tensorflow/core/ops/training_ops_test.cc
+++ b/tensorflow/core/ops/training_ops_test.cc
@@ -332,4 +332,38 @@ TEST(TrainingOpsTest, SparseApplyRMSProp_ShapeFn) {
   INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;?;?;[?];?;?");
 }
 
+TEST(TrainingOpsTest, ApplyAddSign_ShapeFn) {
+  ShapeInferenceTestOp op("ApplyAddSign");
+
+  // Output is a merge of inputs 0, 1, and 6 (var, ms, and grad).
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[];[];[];[?,?,2]", "[d0_0,d1_1,d6_2]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[2];[];[];[];[];[1]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[1];[];[];[];[];[2]");
+
+  // lr, alpha, sign_decay, and beta must be scalars.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;[?];?;?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;[?];?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;[?];?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;?;[?];?");
+}
+
+TEST(TrainingOpsTest, ApplyPowerSign_ShapeFn) {
+  ShapeInferenceTestOp op("ApplyPowerSign");
+
+  // Output is a merge of inputs 0, 1, and 6 (var, ms, and grad).
+  INFER_OK(op, "[1,?,?];[?,2,?];[];[];[];[];[?,?,2]", "[d0_0,d1_1,d6_2]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[2];[];[];[];[];[1]");
+  INFER_ERROR("Dimension 0 in both shapes must be equal, but are 1 and 2", op,
+              "[1];[1];[];[];[];[];[2]");
+
+  // lr, logbase, sign_decay, and beta must be scalars.
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;[?];?;?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;[?];?;?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;[?];?;?");
+  INFER_ERROR("Shape must be rank 0 but is rank 1", op, "?;?;?;?;?;[?];?");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 901fb79d6aa3df8a21df5a4f60f798bd6c00d720..aaeccc8324bea5237f2e4e2dea07ce630a8f5beb 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -10,6 +10,7 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
+    "tf_copts",
 )
 
 filegroup(
@@ -29,6 +30,7 @@ filegroup(
 cc_library(
     name = "expiring_lru_cache",
     hdrs = ["expiring_lru_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
@@ -37,20 +39,35 @@ cc_library(
     name = "file_block_cache",
     srcs = ["file_block_cache.cc"],
     hdrs = ["file_block_cache.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = ["//tensorflow/core:lib"],
 )
 
+cc_library(
+    name = "gcs_dns_cache",
+    srcs = ["gcs_dns_cache.cc"],
+    hdrs = ["gcs_dns_cache.h"],
+    copts = tf_copts(),
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        ":http_request",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "gcs_file_system",
     srcs = ["gcs_file_system.cc"],
     hdrs = ["gcs_file_system.h"],
+    copts = tf_copts(),
     linkstatic = 1,  # Needed since alwayslink is broken in bazel b/27630669
     visibility = ["//visibility:public"],
     deps = [
         ":curl_http_request",
         ":expiring_lru_cache",
         ":file_block_cache",
+        ":gcs_dns_cache",
         ":google_auth_provider",
         ":http_request",
         ":retrying_file_system",
@@ -66,6 +83,7 @@ cc_library(
 cc_library(
     name = "http_request",
     hdrs = ["http_request.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:framework_headers_lib",
@@ -77,6 +95,7 @@ cc_library(
     name = "curl_http_request",
     srcs = ["curl_http_request.cc"],
     hdrs = ["curl_http_request.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":http_request",
@@ -92,6 +111,7 @@ cc_library(
     hdrs = [
         "http_request_fake.h",
     ],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
@@ -109,6 +129,7 @@ cc_library(
         "auth_provider.h",
         "google_auth_provider.h",
     ],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
@@ -124,6 +145,7 @@ cc_library(
     name = "now_seconds_env",
     testonly = 1,
     hdrs = ["now_seconds_env.h"],
+    copts = tf_copts(),
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:lib",
@@ -139,6 +161,7 @@ cc_library(
     hdrs = [
         "oauth_client.h",
     ],
+    copts = tf_copts(),
     deps = [
         ":curl_http_request",
         ":http_request",
@@ -157,6 +180,7 @@ cc_library(
     hdrs = [
         "retrying_utils.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
@@ -171,6 +195,7 @@ cc_library(
     hdrs = [
         "retrying_file_system.h",
     ],
+    copts = tf_copts(),
     deps = [
         ":retrying_utils",
         "//tensorflow/core:framework_headers_lib",
@@ -186,6 +211,7 @@ cc_library(
     hdrs = [
         "time_util.h",
     ],
+    copts = tf_copts(),
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_internal",
@@ -231,6 +257,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "gcs_dns_cache_test",
+    size = "small",
+    srcs = ["gcs_dns_cache_test.cc"],
+    deps = [
+        ":gcs_dns_cache",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "curl_http_request_test",
     size = "small",
diff --git a/tensorflow/core/platform/cloud/curl_http_request.cc b/tensorflow/core/platform/cloud/curl_http_request.cc
index e2d935f35eb5134baff6364125df4b8c79205867..6575ee8c976094c4cb5b6908c3b17a9612472596 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request.cc
@@ -117,6 +117,10 @@ class LibCurlProxy : public LibCurl {
   }
 
   void curl_free(void* p) override { ::curl_free(p); }
+
+  const char* curl_easy_strerror(CURLcode errornum) override {
+    return ::curl_easy_strerror(errornum);
+  }
 };
 }  // namespace
 
@@ -131,6 +135,9 @@ CurlHttpRequest::~CurlHttpRequest() {
   if (curl_headers_) {
     libcurl_->curl_slist_free_all(curl_headers_);
   }
+  if (resolve_list_) {
+    libcurl_->curl_slist_free_all(resolve_list_);
+  }
   if (put_body_) {
     fclose(put_body_);
   }
@@ -192,6 +199,7 @@ Status CurlHttpRequest::SetUri(const string& uri) {
   TF_RETURN_IF_ERROR(CheckInitialized());
   TF_RETURN_IF_ERROR(CheckNotSent());
   is_uri_set_ = true;
+  uri_ = uri;
   libcurl_->curl_easy_setopt(curl_, CURLOPT_URL, uri.c_str());
   return Status::OK();
 }
@@ -212,6 +220,17 @@ Status CurlHttpRequest::AddHeader(const string& name, const string& value) {
   return Status::OK();
 }
 
+Status CurlHttpRequest::AddResolveOverride(const string& hostname, int64 port,
+                                           const string& ip_addr) {
+  TF_RETURN_IF_ERROR(CheckInitialized());
+  TF_RETURN_IF_ERROR(CheckNotSent());
+  // Resolve values are hostname:port:IP.add.ress
+  resolve_list_ = libcurl_->curl_slist_append(
+      resolve_list_,
+      strings::StrCat(hostname, ":", port, ":", ip_addr).c_str());
+  return Status::OK();
+}
+
 Status CurlHttpRequest::AddAuthBearerHeader(const string& auth_token) {
   TF_RETURN_IF_ERROR(CheckInitialized());
   TF_RETURN_IF_ERROR(CheckNotSent());
@@ -376,6 +395,9 @@ Status CurlHttpRequest::Send() {
   if (curl_headers_) {
     libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTPHEADER, curl_headers_);
   }
+  if (resolve_list_) {
+    libcurl_->curl_easy_setopt(curl_, CURLOPT_RESOLVE, resolve_list_);
+  }
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERDATA,
                              reinterpret_cast<void*>(this));
   libcurl_->curl_easy_setopt(curl_, CURLOPT_HEADERFUNCTION,
@@ -512,11 +534,36 @@ int CurlHttpRequest::ProgressCallback(void* this_object, curl_off_t dltotal,
   }
 
   if (now - that->last_progress_timestamp_ > kInactivityTimeoutSeconds) {
+    double lookup_time = -1;
+    const auto lookup_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_NAMELOOKUP_TIME, &lookup_time);
+
+    double connect_time = -1;
+    const auto connect_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_CONNECT_TIME, &connect_time);
+
+    double pretransfer_time = -1;
+    const auto pretransfer_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_PRETRANSFER_TIME, &pretransfer_time);
+
+    double starttransfer_time = -1;
+    const auto starttransfer_time_status = that->libcurl_->curl_easy_getinfo(
+        that->curl_, CURLINFO_PRETRANSFER_TIME, &starttransfer_time);
+
     LOG(ERROR) << "The transmission  of request " << this_object
-               << " has been stuck at " << current_progress << " of "
-               << dltotal + ultotal << " bytes for "
-               << now - that->last_progress_timestamp_
-               << " seconds and will be aborted.";
+               << " (URI: " << that->uri_ << ") has been stuck at "
+               << current_progress << " of " << dltotal + ultotal
+               << " bytes for " << now - that->last_progress_timestamp_
+               << " seconds and will be aborted. CURL timing information: "
+               << "lookup time: " << lookup_time << " ("
+               << that->libcurl_->curl_easy_strerror(lookup_time_status)
+               << "), connect time: " << connect_time << " ("
+               << that->libcurl_->curl_easy_strerror(connect_time_status)
+               << "), pre-transfer time: " << pretransfer_time << " ("
+               << that->libcurl_->curl_easy_strerror(pretransfer_time_status)
+               << "), start-transfer time: " << starttransfer_time << " ("
+               << that->libcurl_->curl_easy_strerror(starttransfer_time_status)
+               << ")";
     return 1;  // Will abort the request.
   }
 
diff --git a/tensorflow/core/platform/cloud/curl_http_request.h b/tensorflow/core/platform/cloud/curl_http_request.h
index c7a555de10c12e78c5bc1e034de6e7752e304281..b2a5870cf7c0e67713309535cf5b1896f69d99d3 100644
--- a/tensorflow/core/platform/cloud/curl_http_request.h
+++ b/tensorflow/core/platform/cloud/curl_http_request.h
@@ -71,6 +71,9 @@ class CurlHttpRequest : public HttpRequest {
   /// Sets a request header.
   Status AddHeader(const string& name, const string& value) override;
 
+  Status AddResolveOverride(const string& hostname, int64 port,
+                            const string& ip_addr) override;
+
   /// Sets the 'Authorization' header to the value of 'Bearer ' + auth_token.
   Status AddAuthBearerHeader(const string& auth_token) override;
 
@@ -146,6 +149,7 @@ class CurlHttpRequest : public HttpRequest {
   std::vector<char>* response_buffer_ = nullptr;
   CURL* curl_ = nullptr;
   curl_slist* curl_headers_ = nullptr;
+  curl_slist* resolve_list_ = nullptr;
 
   std::vector<char> default_response_buffer_;
 
@@ -164,6 +168,9 @@ class CurlHttpRequest : public HttpRequest {
   bool is_method_set_ = false;
   bool is_sent_ = false;
 
+  // Store the URI to help disambiguate requests when errors occur.
+  string uri_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(CurlHttpRequest);
 };
 
@@ -201,6 +208,8 @@ class LibCurl {
   virtual void curl_slist_free_all(curl_slist* list) = 0;
   virtual char* curl_easy_escape(CURL* curl, const char* str, int length) = 0;
   virtual void curl_free(void* p) = 0;
+
+  virtual const char* curl_easy_strerror(CURLcode errornum) = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/curl_http_request_test.cc b/tensorflow/core/platform/cloud/curl_http_request_test.cc
index 6c0f0818527fdc2610d2f54a965db23a636a98c7..2d3e46edaf8eeaa4ad240b43c23f264a68d9835f 100644
--- a/tensorflow/core/platform/cloud/curl_http_request_test.cc
+++ b/tensorflow/core/platform/cloud/curl_http_request_test.cc
@@ -219,6 +219,10 @@ class FakeLibCurl : public LibCurl {
   }
   void curl_free(void* p) override { port::Free(p); }
 
+  const char* curl_easy_strerror(CURLcode errornum) override {
+    return "<unimplemented>";
+  }
+
   // Variables defining the behavior of this fake.
   string response_content_;
   uint64 response_code_;
@@ -263,7 +267,6 @@ TEST(CurlHttpRequestTest, GetRequest) {
 
   std::vector<char> scratch;
   scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
-  StringPiece result;
   scratch.reserve(100);
 
   TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
@@ -594,7 +597,6 @@ TEST(CurlHttpRequestTest, ErrorReturnsNoResponse) {
 
   std::vector<char> scratch;
   scratch.insert(scratch.begin(), kTestContent.begin(), kTestContent.end());
-  StringPiece result;
   scratch.reserve(100);
 
   TF_EXPECT_OK(http_request.SetUri("http://www.testuri.com"));
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache.h b/tensorflow/core/platform/cloud/expiring_lru_cache.h
index 4fe4234e2231e1da8a6ffaf59f4b327be35d406b..3fc23a4306eb96e85099bd63c9c83c6663fe7e3c 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache.h
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache.h
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow {
 
 /// \brief An LRU cache of string keys and arbitrary values, with configurable
-/// max item age and max entries.
+/// max item age (in seconds) and max entries.
 ///
 /// This class is thread safe.
 template <typename T>
@@ -48,16 +48,7 @@ class ExpiringLRUCache {
       return;
     }
     mutex_lock lock(mu_);
-    lru_list_.push_front(key);
-    Entry entry{env_->NowSeconds(), value, lru_list_.begin()};
-    auto insert = cache_.insert(std::make_pair(key, entry));
-    if (!insert.second) {
-      lru_list_.erase(insert.first->second.lru_iterator);
-      insert.first->second = entry;
-    } else if (max_entries_ > 0 && cache_.size() > max_entries_) {
-      cache_.erase(lru_list_.back());
-      lru_list_.pop_back();
-    }
+    InsertLocked(key, value);
   }
 
   /// Look up the entry with key `key` and copy it to `value` if found. Returns
@@ -68,19 +59,33 @@ class ExpiringLRUCache {
       return false;
     }
     mutex_lock lock(mu_);
-    auto it = cache_.find(key);
-    if (it == cache_.end()) {
-      return false;
+    return LookupLocked(key, value);
+  }
+
+  typedef std::function<Status(const string&, T*)> ComputeFunc;
+
+  /// Look up the entry with key `key` and copy it to `value` if found. If not
+  /// found, call `compute_func`. If `compute_func` returns successfully, store
+  /// a copy of the output parameter in the cache, and another copy in `value`.
+  Status LookupOrCompute(const string& key, T* value,
+                         const ComputeFunc& compute_func) {
+    if (max_age_ == 0) {
+      return compute_func(key, value);
     }
-    lru_list_.erase(it->second.lru_iterator);
-    if (env_->NowSeconds() - it->second.timestamp > max_age_) {
-      cache_.erase(it);
-      return false;
+
+    // Note: we hold onto mu_ for the rest of this function. In practice, this
+    // is okay, as stat requests are typically fast, and concurrent requests are
+    // often for the same file. Future work can split this up into one lock per
+    // key if this proves to be a significant performance bottleneck.
+    mutex_lock lock(mu_);
+    if (LookupLocked(key, value)) {
+      return Status::OK();
     }
-    *value = it->second.value;
-    lru_list_.push_front(it->first);
-    it->second.lru_iterator = lru_list_.begin();
-    return true;
+    Status s = compute_func(key, value);
+    if (s.ok()) {
+      InsertLocked(key, *value);
+    }
+    return s;
   }
 
   /// Accessors for cache parameters.
@@ -99,6 +104,36 @@ class ExpiringLRUCache {
     std::list<string>::iterator lru_iterator;
   };
 
+  bool LookupLocked(const string& key, T* value) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return false;
+    }
+    lru_list_.erase(it->second.lru_iterator);
+    if (env_->NowSeconds() - it->second.timestamp > max_age_) {
+      cache_.erase(it);
+      return false;
+    }
+    *value = it->second.value;
+    lru_list_.push_front(it->first);
+    it->second.lru_iterator = lru_list_.begin();
+    return true;
+  }
+
+  void InsertLocked(const string& key, const T& value)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    lru_list_.push_front(key);
+    Entry entry{env_->NowSeconds(), value, lru_list_.begin()};
+    auto insert = cache_.insert(std::make_pair(key, entry));
+    if (!insert.second) {
+      lru_list_.erase(insert.first->second.lru_iterator);
+      insert.first->second = entry;
+    } else if (max_entries_ > 0 && cache_.size() > max_entries_) {
+      cache_.erase(lru_list_.back());
+      lru_list_.pop_back();
+    }
+  }
+
   /// The maximum age of entries in the cache, in seconds. A value of 0 means
   /// that no entry is ever placed in the cache.
   const uint64 max_age_;
diff --git a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
index bf9bfcd67eb0e2b05dd796002b9de03ca2011a92..8f8d5744a4576991c0056bfefeb30c4bc58549e0 100644
--- a/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
+++ b/tensorflow/core/platform/cloud/expiring_lru_cache_test.cc
@@ -88,5 +88,69 @@ TEST(ExpiringLRUCacheTest, MaxEntries) {
   EXPECT_EQ(value, 5);
 }
 
+TEST(ExpiringLRUCacheTest, LookupOrCompute) {
+  // max_age of 0 means we should always compute.
+  uint64 num_compute_calls = 0;
+  ExpiringLRUCache<int>::ComputeFunc compute_func =
+      [&num_compute_calls](const string& key, int* value) {
+        *value = num_compute_calls;
+        num_compute_calls++;
+        return Status::OK();
+      };
+  ExpiringLRUCache<int> cache1(0, 4);
+
+  int value = -1;
+  TF_EXPECT_OK(cache1.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 0);
+  EXPECT_EQ(num_compute_calls, 1);
+  // re-read the same value, expect another lookup
+  TF_EXPECT_OK(cache1.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 1);
+  EXPECT_EQ(num_compute_calls, 2);
+
+  // Define a new cache with max_age > 0 and verify correct behavior.
+  ExpiringLRUCache<int> cache2(2, 4);
+  num_compute_calls = 0;
+  value = -1;
+
+  // Read our first value
+  TF_EXPECT_OK(cache2.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 0);
+  EXPECT_EQ(num_compute_calls, 1);
+  // Re-read, exepct no additional function compute_func calls.
+  TF_EXPECT_OK(cache2.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 0);
+  EXPECT_EQ(num_compute_calls, 1);
+
+  // Read a sequence of additional values, eventually evicting "a".
+  TF_EXPECT_OK(cache2.LookupOrCompute("b", &value, compute_func));
+  EXPECT_EQ(value, 1);
+  EXPECT_EQ(num_compute_calls, 2);
+  TF_EXPECT_OK(cache2.LookupOrCompute("c", &value, compute_func));
+  EXPECT_EQ(value, 2);
+  EXPECT_EQ(num_compute_calls, 3);
+  TF_EXPECT_OK(cache2.LookupOrCompute("d", &value, compute_func));
+  EXPECT_EQ(value, 3);
+  EXPECT_EQ(num_compute_calls, 4);
+  TF_EXPECT_OK(cache2.LookupOrCompute("e", &value, compute_func));
+  EXPECT_EQ(value, 4);
+  EXPECT_EQ(num_compute_calls, 5);
+  // Verify the other values remain in the cache.
+  TF_EXPECT_OK(cache2.LookupOrCompute("b", &value, compute_func));
+  EXPECT_EQ(value, 1);
+  EXPECT_EQ(num_compute_calls, 5);
+  TF_EXPECT_OK(cache2.LookupOrCompute("c", &value, compute_func));
+  EXPECT_EQ(value, 2);
+  EXPECT_EQ(num_compute_calls, 5);
+  TF_EXPECT_OK(cache2.LookupOrCompute("d", &value, compute_func));
+  EXPECT_EQ(value, 3);
+  EXPECT_EQ(num_compute_calls, 5);
+
+  // Re-read "a", ensure it is re-computed.
+  TF_EXPECT_OK(cache2.LookupOrCompute("a", &value, compute_func));
+  EXPECT_EQ(value, 5);
+  EXPECT_EQ(num_compute_calls, 6);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/file_block_cache.cc b/tensorflow/core/platform/cloud/file_block_cache.cc
index a05c18c06948aa835da7452451cb649df7a66943..e1afc7b308e740769abca5d95fde34c004df75ee 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache.cc
@@ -16,79 +16,137 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include <cstring>
 #include <memory>
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 
-std::shared_ptr<FileBlockCache::Block> FileBlockCache::Lookup(const Key& key) {
-  mutex_lock lock(mu_);
-  auto entry = block_map_.find(key);
-  if (entry == block_map_.end()) {
-    return std::shared_ptr<Block>();
-  }
-  // If we're enforcing max staleness and the block is stale, remove all of the
-  // file's cached blocks so we reload them.
-  if (max_staleness_ > 0 &&
-      env_->NowSeconds() - entry->second->timestamp > max_staleness_) {
-    RemoveFile_Locked(key.first);
-    return std::shared_ptr<Block>();
+bool FileBlockCache::BlockNotStale(const std::shared_ptr<Block>& block) {
+  mutex_lock l(block->mu);
+  if (block->state != FetchState::FINISHED) {
+    return true;  // No need to check for staleness.
   }
-  return entry->second;
+  if (max_staleness_ == 0) return true;  // Not enforcing staleness.
+  return env_->NowSeconds() - block->timestamp <= max_staleness_;
 }
 
-std::shared_ptr<FileBlockCache::Block> FileBlockCache::Insert(
-    const Key& key, std::shared_ptr<Block> block) {
+std::shared_ptr<FileBlockCache::Block> FileBlockCache::Lookup(const Key& key) {
   mutex_lock lock(mu_);
   auto entry = block_map_.find(key);
   if (entry != block_map_.end()) {
-    // Use the block that's already in the cache.
-    return entry->second;
-  }
-  // Sanity check to detect interrupted reads leading to partial blocks: a
-  // partial block must have a higher key than the highest existing key in the
-  // block map for the file. Note that since this check relies on the existence
-  // of a cached block with a higher key, some incomplete reads may still go
-  // undetected (if their key happens to be higher than anything in the cache).
-  if (block->data.size() < block_size_ && !block_map_.empty()) {
-    Key fmax = std::make_pair(key.first, std::numeric_limits<size_t>::max());
-    auto fcmp = block_map_.upper_bound(fmax);
-    if (fcmp != block_map_.begin() && key < (--fcmp)->first) {
-      // We expected to read a full block at this position.
-      return std::shared_ptr<Block>();
+    if (BlockNotStale(entry->second)) {
+      return entry->second;
+    } else {
+      // Remove the stale block and continue.
+      RemoveFile_Locked(key.first);
     }
   }
-  // Add the block to the cache (with necessary bookkeeping).
+
+  // Insert a new empty block, setting the bookkeeping to sentinel values
+  // in order to update them as appropriate.
+  auto new_entry = std::make_shared<Block>();
   lru_list_.push_front(key);
   lra_list_.push_front(key);
-  block->lru_iterator = lru_list_.begin();
-  block->lra_iterator = lra_list_.begin();
-  block->timestamp = env_->NowSeconds();
-  cache_size_ += block->data.size();
-  block_map_.emplace(std::make_pair(key, block));
-  return block;
+  new_entry->lru_iterator = lru_list_.begin();
+  new_entry->lra_iterator = lra_list_.begin();
+  new_entry->timestamp = env_->NowSeconds();
+  block_map_.emplace(std::make_pair(key, new_entry));
+  return new_entry;
 }
 
-// Remove blocks from the cache until there is space for a full sized block.
+// Remove blocks from the cache until we do not exceed our maximum size.
 void FileBlockCache::Trim() {
-  mutex_lock lock(mu_);
-  while (!lru_list_.empty() && cache_size_ + block_size_ > max_bytes_) {
+  while (!lru_list_.empty() && cache_size_ > max_bytes_) {
     RemoveBlock(block_map_.find(lru_list_.back()));
   }
 }
 
 /// Move the block to the front of the LRU list if it isn't already there.
-void FileBlockCache::UpdateLRU(const Key& key,
-                               const std::shared_ptr<Block>& block) {
+Status FileBlockCache::UpdateLRU(const Key& key,
+                                 const std::shared_ptr<Block>& block) {
   mutex_lock lock(mu_);
   if (block->timestamp == 0) {
     // The block was evicted from another thread. Allow it to remain evicted.
-    return;
+    return Status::OK();
   }
   if (block->lru_iterator != lru_list_.begin()) {
     lru_list_.erase(block->lru_iterator);
     lru_list_.push_front(key);
     block->lru_iterator = lru_list_.begin();
   }
+
+  // Check for inconsistent state. If there is a block later in the same file
+  // in the cache, and our current block is not block size, this likely means
+  // we have inconsistent state within the cache. Note: it's possible some
+  // incomplete reads may still go undetected.
+  if (block->data.size() < block_size_) {
+    Key fmax = std::make_pair(key.first, std::numeric_limits<size_t>::max());
+    auto fcmp = block_map_.upper_bound(fmax);
+    if (fcmp != block_map_.begin() && key < (--fcmp)->first) {
+      return errors::Internal("Block cache contents are inconsistent.");
+    }
+  }
+
+  Trim();
+
+  return Status::OK();
+}
+
+Status FileBlockCache::MaybeFetch(const Key& key,
+                                  const std::shared_ptr<Block>& block) {
+  bool downloaded_block = false;
+  auto reconcile_state =
+      gtl::MakeCleanup([this, &downloaded_block, &key, &block] {
+        // Perform this action in a cleanup callback to avoid locking mu_ after
+        // locking block->mu.
+        if (downloaded_block) {
+          mutex_lock l(mu_);
+          // Do not update state if the block is already to be evicted.
+          if (block->timestamp != 0) {
+            cache_size_ += block->data.size();
+            // Put to beginning of LRA list.
+            lra_list_.erase(block->lra_iterator);
+            lra_list_.push_front(key);
+            block->lra_iterator = lra_list_.begin();
+            block->timestamp = env_->NowSeconds();
+          }
+        }
+      });
+  // Loop until either block content is successfully fetched, or our request
+  // encounters an error.
+  mutex_lock l(block->mu);
+  Status status = Status::OK();
+  while (true) {
+    switch (block->state) {
+      case FetchState::ERROR:
+        TF_FALLTHROUGH_INTENDED;
+      case FetchState::CREATED:
+        block->state = FetchState::FETCHING;
+        block->mu.unlock();  // Release the lock while making the API call.
+        status.Update(
+            block_fetcher_(key.first, key.second, block_size_, &block->data));
+        block->mu.lock();  // Reacquire the lock immediately afterwards
+        if (status.ok()) {
+          downloaded_block = true;
+          block->state = FetchState::FINISHED;
+        } else {
+          block->state = FetchState::ERROR;
+        }
+        block->cond_var.notify_all();
+        return status;
+      case FetchState::FETCHING:
+        block->cond_var.wait_for(l, std::chrono::seconds(60));
+        if (block->state == FetchState::FINISHED) {
+          return Status::OK();
+        }
+        // Re-loop in case of errors.
+        break;
+      case FetchState::FINISHED:
+        return Status::OK();
+    }
+  }
+  return errors::Internal(
+      "Control flow should never reach the end of FileBlockCache::Fetch.");
 }
 
 Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
@@ -114,22 +172,18 @@ Status FileBlockCache::Read(const string& filename, size_t offset, size_t n,
     // Look up the block, fetching and inserting it if necessary, and update the
     // LRU iterator for the key and block.
     std::shared_ptr<Block> block = Lookup(key);
-    if (!block) {
-      Trim();
-      auto fetch = std::make_shared<Block>();
-      auto status = block_fetcher_(filename, pos, block_size_, &fetch->data);
-      if (!(block = Insert(key, fetch))) {
-        return errors::Internal("File contents are inconsistent");
-      }
-    }
-    UpdateLRU(key, block);
+    DCHECK(block) << "No block for key " << key.first << "@" << key.second;
+    TF_RETURN_IF_ERROR(MaybeFetch(key, block));
+    TF_RETURN_IF_ERROR(UpdateLRU(key, block));
     // Copy the relevant portion of the block into the result buffer.
     const auto& data = block->data;
     if (offset >= pos + data.size()) {
       // The requested offset is at or beyond the end of the file. This can
       // happen if `offset` is not block-aligned, and the read returns the last
       // block in the file, which does not extend all the way out to `offset`.
-      return errors::OutOfRange("EOF at offset ", offset);
+      return errors::OutOfRange("EOF at offset ", offset, " in file ", filename,
+                                " at position ", pos, "with data size ",
+                                data.size());
     }
     auto begin = data.begin();
     if (offset > pos) {
@@ -190,11 +244,11 @@ void FileBlockCache::RemoveFile_Locked(const string& filename) {
 }
 
 void FileBlockCache::RemoveBlock(BlockMap::iterator entry) {
-  lru_list_.erase(entry->second->lru_iterator);
-  lra_list_.erase(entry->second->lra_iterator);
   // This signals that the block is removed, and should not be inadvertently
   // reinserted into the cache in UpdateLRU.
   entry->second->timestamp = 0;
+  lru_list_.erase(entry->second->lru_iterator);
+  lra_list_.erase(entry->second->lra_iterator);
   cache_size_ -= entry->second->data.size();
   block_map_.erase(entry);
 }
diff --git a/tensorflow/core/platform/cloud/file_block_cache.h b/tensorflow/core/platform/cloud/file_block_cache.h
index b45d2260957858163585ae845a3867b0c01f3d0f..36dbf9db83238fa05e3b010c2a73cb823623f54b 100644
--- a/tensorflow/core/platform/cloud/file_block_cache.h
+++ b/tensorflow/core/platform/cloud/file_block_cache.h
@@ -115,11 +115,35 @@ class FileBlockCache {
   /// The file block cache key is a {filename, offset} pair.
   typedef std::pair<string, size_t> Key;
 
+  /// \brief The state of a block.
+  ///
+  /// A block begins in the CREATED stage. The first thread will attempt to read
+  /// the block from the filesystem, transitioning the state of the block to
+  /// FETCHING. After completing, if the read was successful the state should
+  /// be FINISHED. Otherwise the state should be ERROR. A subsequent read can
+  /// re-fetch the block if the state is ERROR.
+  enum class FetchState {
+    CREATED,
+    FETCHING,
+    FINISHED,
+    ERROR,
+  };
+
   /// \brief A block of a file.
   ///
   /// A file block consists of the block data, the block's current position in
-  /// the LRU cache, and the timestamp (seconds since epoch) at which the block
-  /// was cached.
+  /// the LRU cache, the timestamp (seconds since epoch) at which the block
+  /// was cached, a coordination lock, and state & condition variables.
+  ///
+  /// Thread safety:
+  /// The iterator and timestamp fields should only be accessed while holding
+  /// the block-cache-wide mu_ instance variable. The state variable should only
+  /// be accessed while holding the Block's mu lock. The data vector should only
+  /// be accessed after state == FINISHED, and it should never be modified.
+  ///
+  /// In order to prevent deadlocks, never grab the block-cache-wide mu_ lock
+  /// AFTER grabbing any block's mu lock. It is safe to grab mu without locking
+  /// mu_.
   struct Block {
     /// The block data.
     std::vector<char> data;
@@ -129,6 +153,12 @@ class FileBlockCache {
     std::list<Key>::iterator lra_iterator;
     /// The timestamp (seconds since epoch) at which the block was cached.
     uint64 timestamp;
+    /// Mutex to guard state variable
+    mutex mu;
+    /// The state of the block.
+    FetchState state GUARDED_BY(mu) = FetchState::CREATED;
+    /// Wait on cond_var if state is FETCHING.
+    condition_variable cond_var;
   };
 
   /// \brief The block map type for the file block cache.
@@ -139,19 +169,20 @@ class FileBlockCache {
   /// Prune the cache by removing files with expired blocks.
   void Prune() LOCKS_EXCLUDED(mu_);
 
+  bool BlockNotStale(const std::shared_ptr<Block>& block)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   /// Look up a Key in the block cache.
   std::shared_ptr<Block> Lookup(const Key& key) LOCKS_EXCLUDED(mu_);
 
-  /// Insert a block in the block cache with the given key.
-  std::shared_ptr<FileBlockCache::Block> Insert(const Key& key,
-                                                std::shared_ptr<Block> block)
+  Status MaybeFetch(const Key& key, const std::shared_ptr<Block>& block)
       LOCKS_EXCLUDED(mu_);
 
   /// Trim the block cache to make room for another entry.
-  void Trim() LOCKS_EXCLUDED(mu_);
+  void Trim() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  /// Update LRU and LRA iterators for the block at `key`.
-  void UpdateLRU(const Key& key, const std::shared_ptr<Block>& block)
+  /// Update the LRU iterator for the block at `key`.
+  Status UpdateLRU(const Key& key, const std::shared_ptr<Block>& block)
       LOCKS_EXCLUDED(mu_);
 
   /// Remove all blocks of a file, with mu_ already held.
@@ -179,6 +210,9 @@ class FileBlockCache {
 
   /// The LRA (least recently added) list of block keys. The front of the list
   /// identifies the most recently added block.
+  ///
+  /// Note: blocks are added to lra_list_ only after they have successfully been
+  /// fetched from the underlying block store.
   std::list<Key> lra_list_ GUARDED_BY(mu_);
 
   /// The combined number of bytes in all of the cached blocks.
diff --git a/tensorflow/core/platform/cloud/file_block_cache_test.cc b/tensorflow/core/platform/cloud/file_block_cache_test.cc
index 5fa738b45292f3683a2f79aee00de1aa9da619d4..081b32af64636105925240da70bf050cdec2c4b9 100644
--- a/tensorflow/core/platform/cloud/file_block_cache_test.cc
+++ b/tensorflow/core/platform/cloud/file_block_cache_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/cloud/now_seconds_env.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/notification.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -435,5 +436,39 @@ TEST(FileBlockCacheTest, ParallelReads) {
   // executed, or 10 seconds have passed).
 }
 
+TEST(FileBlockCacheTest, CoalesceConcurrentReads) {
+  // Concurrent reads to the same file blocks should be de-duplicated.
+  const size_t block_size = 16;
+  int num_requests = 0;
+  Notification notification;
+  auto fetcher = [&num_requests, &notification, block_size](
+                     const string& filename, size_t offset, size_t n,
+                     std::vector<char>* out) {
+    EXPECT_EQ(n, block_size);
+    EXPECT_EQ(offset, 0);
+    num_requests++;
+    out->resize(n, 'x');
+    notification.Notify();
+    // Wait for other thread to issue read.
+    Env::Default()->SleepForMicroseconds(100000);  // 0.1 secs
+    return Status::OK();
+  };
+  FileBlockCache cache(block_size, block_size, 0, fetcher);
+  // Fork off thread for parallel read.
+  std::unique_ptr<Thread> concurrent(
+      Env::Default()->StartThread({}, "concurrent", [&cache] {
+        std::vector<char> out;
+        TF_EXPECT_OK(cache.Read("", 0, block_size / 2, &out));
+        EXPECT_EQ(out.size(), block_size / 2);
+      }));
+  EXPECT_TRUE(WaitForNotificationWithTimeout(&notification, 10000))
+      << "Timeout waiting for concurrent thread to start.";
+  std::vector<char> out;
+  TF_EXPECT_OK(cache.Read("", block_size / 2, block_size / 2, &out));
+  EXPECT_EQ(out.size(), block_size / 2);
+
+  EXPECT_EQ(1, num_requests);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.cc b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..840f2b21cde99d37d8b567d0d77c02e7ff31e6e6
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.cc
@@ -0,0 +1,148 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
+#ifndef _WIN32
+#include <arpa/inet.h>
+#include <netdb.h>
+#else
+#include <winsock2.h>
+#include <ws2tcpip.h>
+#include <Windows.h>
+#endif
+#include <sys/types.h>
+
+namespace tensorflow {
+
+namespace {
+
+constexpr char kStorageHost[] = "storage.googleapis.com";
+constexpr char kWwwHost[] = "www.googleapis.com";
+
+inline void print_getaddrinfo_error(const string& name, int error_code) {
+#ifndef _WIN32
+  if (error_code == EAI_SYSTEM) {
+    LOG(ERROR) << "Error resolving " << name
+               << " (EAI_SYSTEM): " << strerror(errno);
+  } else {
+    LOG(ERROR) << "Error resolving " << name << ": "
+               << gai_strerror(error_code);
+  }
+#else
+  // TODO:WSAGetLastError is better than gai_strerror
+  LOG(ERROR) << "Error resolving " << name << ": " << gai_strerror(error_code);
+#endif
+}
+}  // namespace
+
+GcsDnsCache::GcsDnsCache(Env* env, int64 refresh_rate_secs)
+    : env_(env), refresh_rate_secs_(refresh_rate_secs) {}
+
+Status GcsDnsCache::AnnotateRequest(HttpRequest* request) {
+  // TODO(saeta): Blacklist failing IP addresses.
+  mutex_lock l(mu_);
+  if (!started_) {
+    DCHECK(!worker_) << "Worker thread already exists!";
+    // Perform DNS resolutions to warm the cache.
+    std::vector<string> www_addresses = ResolveName(kWwwHost);
+    std::vector<string> storage_addresses = ResolveName(kStorageHost);
+    www_addresses.swap(www_addresses_);
+    storage_addresses.swap(storage_addresses_);
+
+    // Note: we opt to use a thread instead of a delayed closure.
+    worker_.reset(env_->StartThread(
+        {}, "gcs_dns_worker", std::bind(&GcsDnsCache::WorkerThread, this)));
+    started_ = true;
+  }
+  if (!storage_addresses_.empty()) {
+    std::uniform_int_distribution<> storage_dist(0,
+                                                 storage_addresses_.size() - 1);
+    size_t index = storage_dist(random_);
+    TF_RETURN_IF_ERROR(request->AddResolveOverride(kStorageHost, 443,
+                                                   storage_addresses_[index]));
+  } else {
+    LOG(WARNING) << "No IP addresses available for " << kStorageHost;
+  }
+  if (!www_addresses_.empty()) {
+    std::uniform_int_distribution<> www_dist(0, www_addresses_.size() - 1);
+    size_t index = www_dist(random_);
+    TF_RETURN_IF_ERROR(
+        request->AddResolveOverride(kWwwHost, 443, www_addresses_[index]));
+  } else {
+    LOG(WARNING) << "No IP addresses available for " << kWwwHost;
+  }
+  return Status::OK();
+}
+
+/* static */ std::vector<string> GcsDnsCache::ResolveName(const string& name) {
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_INET;  // Only use IPv4 for now.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo* result = nullptr;
+  int return_code = getaddrinfo(name.c_str(), nullptr, &hints, &result);
+
+  std::vector<string> output;
+  if (return_code == 0) {
+    for (const addrinfo* i = result; i != nullptr; i = i->ai_next) {
+      if (i->ai_family != AF_INET || i->ai_addr->sa_family != AF_INET) {
+        LOG(WARNING) << "Non-IPv4 address returned. ai_family: " << i->ai_family
+                     << ". sa_family: " << i->ai_addr->sa_family << ".";
+        continue;
+      }
+      char buf[INET_ADDRSTRLEN];
+      void* address_ptr =
+          &(reinterpret_cast<sockaddr_in*>(i->ai_addr)->sin_addr);
+      const char* formatted = nullptr;
+      if ((formatted = inet_ntop(i->ai_addr->sa_family, address_ptr, buf,
+                                 INET_ADDRSTRLEN)) == nullptr) {
+        LOG(ERROR) << "Error converting response to IP address for " << name
+                   << ": " << strerror(errno);
+      } else {
+        output.emplace_back(buf);
+      }
+    }
+  } else {
+    print_getaddrinfo_error(name, return_code);
+  }
+  if (result != nullptr) {
+    freeaddrinfo(result);
+  }
+  return output;
+}
+
+void GcsDnsCache::WorkerThread() {
+  while (true) {
+    {
+      // Don't immediately re-resolve the addresses.
+      mutex_lock l(mu_);
+      if (cancelled_) return;
+      cond_var_.wait_for(l, std::chrono::seconds(refresh_rate_secs_));
+      if (cancelled_) return;
+    }
+    // Resolve DNS values
+    std::vector<string> www_addresses = ResolveName(kWwwHost);
+    std::vector<string> storage_addresses = ResolveName(kStorageHost);
+
+    {
+      mutex_lock l(mu_);
+      // Update instance variables.
+      www_addresses.swap(www_addresses_);
+      storage_addresses.swap(storage_addresses_);
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache.h b/tensorflow/core/platform/cloud/gcs_dns_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a4d3847a5ac82b1ced742a20ca18ba84bf6fa7c
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+#define THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
+
+#include <random>
+
+#include "tensorflow/core/platform/cloud/http_request.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+const int64 kDefaultRefreshRateSecs = 60;
+
+// DnsCache is a userspace DNS cache specialized for the GCS filesystem.
+//
+// Some environments have unreliable DNS resolvers. DnsCache ameliorates the
+// situation by radically reducing the number of DNS requests by performing
+// 2 DNS queries per minute (by default) on a background thread. Updated cache
+// entries are used to override curl's DNS resolution processes.
+class GcsDnsCache {
+ public:
+  // Default no-argument constructor.
+  GcsDnsCache() : GcsDnsCache(kDefaultRefreshRateSecs) {}
+
+  // Constructs a GcsDnsCache with the specified refresh rate.
+  GcsDnsCache(int64 refresh_rate_secs)
+      : GcsDnsCache(Env::Default(), refresh_rate_secs) {}
+
+  GcsDnsCache(Env* env, int64 refresh_rate_secs);
+
+  ~GcsDnsCache() {
+    mutex_lock l(mu_);
+    cancelled_ = true;
+    cond_var_.notify_one();
+  }
+
+  // Annotate the given HttpRequest with resolve overrides from the cache.
+  Status AnnotateRequest(HttpRequest* request);
+
+ private:
+  static std::vector<string> ResolveName(const string& name);
+  void WorkerThread();
+
+  // Define a friend class for testing.
+  friend class GcsDnsCacheTest;
+
+  mutex mu_;
+  Env* env_;
+  condition_variable cond_var_;
+  std::default_random_engine random_ GUARDED_BY(mu_);
+  bool started_ GUARDED_BY(mu_) = false;
+  bool cancelled_ GUARDED_BY(mu_) = false;
+  std::vector<string> www_addresses_ GUARDED_BY(mu_);
+  std::vector<string> storage_addresses_ GUARDED_BY(mu_);
+  std::unique_ptr<Thread> worker_ GUARDED_BY(mu_);  // After mutable vars.
+  const int64 refresh_rate_secs_;
+};
+
+}  // namespace tensorflow
+
+#endif  // THIRD_PARTY_TENSORFLOW_PLATNFORM_CLOUD_DNS_CACHE_H_
diff --git a/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d1a108f30dd0461a1cd08dd217badbdf24fc400
--- /dev/null
+++ b/tensorflow/core/platform/cloud/gcs_dns_cache_test.cc
@@ -0,0 +1,114 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class TestHttpRequest : public HttpRequest {
+ public:
+  Status Init() override { return Status::OK(); }
+  Status SetUri(const string& uri) override { return Status::OK(); }
+  Status SetRange(uint64 start, uint64 end) override { return Status::OK(); }
+  Status AddHeader(const string& name, const string& value) override {
+    return Status::OK();
+  }
+  Status AddResolveOverride(const string& hostname, int64 port,
+                            const string& ip_addr) override {
+    EXPECT_EQ(port, 443) << "Unexpected port set for hostname: " << hostname;
+    auto itr = resolve_overrides_.find(hostname);
+    EXPECT_EQ(itr, resolve_overrides_.end())
+        << "Hostname " << hostname << "already in map: " << itr->second;
+
+    resolve_overrides_.insert(
+        std::map<string, string>::value_type(hostname, ip_addr));
+    return Status::OK();
+  }
+
+  Status AddAuthBearerHeader(const string& auth_token) override {
+    return Status::OK();
+  }
+
+  Status SetDeleteRequest() override { return Status::OK(); }
+
+  Status SetPutFromFile(const string& body_filepath, size_t offset) override {
+    return Status::OK();
+  }
+  Status SetPutEmptyBody() override { return Status::OK(); }
+
+  Status SetPostFromBuffer(const char* buffer, size_t size) override {
+    return Status::OK();
+  }
+  Status SetPostEmptyBody() override { return Status::OK(); }
+
+  Status SetResultBuffer(std::vector<char>* out_buffer) override {
+    return Status::OK();
+  }
+
+  string GetResponseHeader(const string& name) const override { return ""; }
+  uint64 GetResponseCode() const override { return 0; }
+  Status Send() override { return Status::OK(); }
+  string EscapeString(const string& str) override { return ""; }
+
+  std::map<string, string> resolve_overrides_;
+};
+
+// Friend class for testing.
+//
+// It is written this way (as opposed to using FRIEND_TEST) to avoid a
+// non-test-time dependency on gunit.
+class GcsDnsCacheTest : public ::testing::Test {
+ protected:
+  void ResolveNameTest() {
+    auto response = GcsDnsCache::ResolveName("www.googleapis.com");
+    EXPECT_LT(1, response.size()) << str_util::Join(response, ", ");
+  }
+
+  void AnnotateRequestTest() {
+    GcsDnsCache d;
+    {
+      mutex_lock l(d.mu_);
+      d.started_ = true;  // Avoid creating a thread.
+      d.www_addresses_ = {"192.168.1.1"};
+      d.storage_addresses_ = {"172.134.1.1"};
+    }
+
+    TestHttpRequest req;
+    Status s = d.AnnotateRequest(&req);
+    EXPECT_TRUE(s.ok()) << s;
+    EXPECT_EQ("192.168.1.1", req.resolve_overrides_["www.googleapis.com"]);
+    EXPECT_EQ("172.134.1.1", req.resolve_overrides_["storage.googleapis.com"]);
+  }
+
+  void SuccessfulCleanupTest() {
+    // Create a DnsCache object, start the worker thread, ensure it cleans up in
+    // a timely manner.
+    GcsDnsCache d;
+    TestHttpRequest req;
+    Status s = d.AnnotateRequest(&req);
+    EXPECT_TRUE(s.ok()) << s;
+  }
+};
+
+// This sends a DNS name resolution request, thus it is flaky.
+// TEST_F(GcsDnsCacheTest, ResolveName) { ResolveNameTest(); }
+
+TEST_F(GcsDnsCacheTest, AnnotateRequest) { AnnotateRequestTest(); }
+
+TEST_F(GcsDnsCacheTest, SuccessfulCleanup) { SuccessfulCleanupTest(); }
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index e82aebad0b011dfdec25f2e1c9b7b0098e72d3ad..c44cad9fc86f566523117fa10d1ecd878682ab85 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -22,6 +22,9 @@ limitations under the License.
 #include <cstring>
 #include <fstream>
 #include <vector>
+#ifdef _WIN32
+#include <io.h>  //for _mktemp
+#endif
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -29,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
@@ -39,6 +43,12 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
+#ifdef _WIN32
+#ifdef DeleteFile
+#undef DeleteFile
+#endif
+#endif
+
 namespace tensorflow {
 
 namespace {
@@ -89,17 +99,30 @@ constexpr char kMatchingPathsCacheMaxEntries[] =
 constexpr size_t kMatchingPathsCacheDefaultMaxEntries = 1024;
 // The file statistics returned by Stat() for directories.
 const FileStatistics DIRECTORY_STAT(0, 0, true);
+// Some environments exhibit unreliable DNS resolution. Set this environment
+// variable to a positive integer describing the frequency used to refresh the
+// userspace DNS cache.
+constexpr char kResolveCacheSecs[] = "GCS_RESOLVE_REFRESH_SECS";
 
+// TODO: DO NOT use a hardcoded path
 Status GetTmpFilename(string* filename) {
   if (!filename) {
     return errors::Internal("'filename' cannot be nullptr.");
   }
+#ifndef _WIN32
   char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
   int fd = mkstemp(buffer);
   if (fd < 0) {
     return errors::Internal("Failed to create a temporary file.");
   }
   close(fd);
+#else
+  char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
+  char* ret = _mktemp(buffer);
+  if (ret == nullptr) {
+    return errors::Internal("Failed to create a temporary file.");
+  }
+#endif
   *filename = buffer;
   return Status::OK();
 }
@@ -247,7 +270,7 @@ class GcsRandomAccessFile : public RandomAccessFile {
   /// The implementation of reads with an LRU block cache. Thread safe.
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
-    result->clear();
+    *result = StringPiece();
     std::vector<char> out;
     TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, &out));
     std::memcpy(scratch, out.data(), std::min(out.size(), n));
@@ -287,6 +310,7 @@ class GcsWritableFile : public WritableFile {
         file_cache_erase_(std::move(file_cache_erase)),
         sync_needed_(true),
         initial_retry_delay_usec_(initial_retry_delay_usec) {
+    // TODO: to make it safer, outfile_ should be constructed from an FD
     if (GetTmpFilename(&tmp_content_filename_).ok()) {
       outfile_.open(tmp_content_filename_,
                     std::ofstream::binary | std::ofstream::app);
@@ -411,7 +435,7 @@ class GcsWritableFile : public WritableFile {
       return errors::Internal("'size' cannot be nullptr");
     }
     const auto tellp = outfile_.tellp();
-    if (tellp == -1) {
+    if (tellp == static_cast<std::streampos>(-1)) {
       return errors::Internal(
           "Could not get the size of the internal temporary file.");
     }
@@ -434,8 +458,8 @@ class GcsWritableFile : public WritableFile {
     std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
     TF_RETURN_IF_ERROR(request->Init());
     TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
-        kGcsUploadUriBase, "b/", bucket_, "/o?uploadType=resumable&name=",
-        request->EscapeString(object_))));
+        kGcsUploadUriBase, "b/", bucket_,
+        "/o?uploadType=resumable&name=", request->EscapeString(object_))));
     TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
     TF_RETURN_IF_ERROR(request->AddHeader("X-Upload-Content-Length",
                                           std::to_string(file_size)));
@@ -624,6 +648,12 @@ GcsFileSystem::GcsFileSystem()
   }
   matching_paths_cache_.reset(new ExpiringLRUCache<std::vector<string>>(
       matching_paths_cache_max_age, matching_paths_cache_max_entries));
+
+  int64 resolve_frequency_secs;
+  if (GetEnvVar(kResolveCacheSecs, strings::safe_strto64,
+                &resolve_frequency_secs)) {
+    dns_cache_.reset(new GcsDnsCache(resolve_frequency_secs));
+  }
 }
 
 GcsFileSystem::GcsFileSystem(
@@ -678,8 +708,31 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
   TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
   TF_RETURN_IF_ERROR(request->SetRange(offset, offset + n - 1));
   TF_RETURN_IF_ERROR(request->SetResultBuffer(out));
+
+  if (dns_cache_) {
+    TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
+  }
+
   TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading gs://",
                                   bucket, "/", object);
+
+  VLOG(1) << "Successful read of gs://" << bucket << "/" << object << " @ "
+          << offset << " of size: " << out->size();
+
+  if (out->size() < block_size()) {
+    // Check stat cache to see if we encountered an interrupted read.
+    FileStatistics stat;
+    if (stat_cache_->Lookup(filename, &stat)) {
+      if (offset + out->size() < stat.length) {
+        return errors::Internal(strings::Printf(
+            "File contents are inconsistent for file: %s @ %lu.",
+            filename.c_str(), offset));
+      }
+      VLOG(2) << "Successful integrity check for: gs://" << bucket << "/"
+              << object << " @ " << offset;
+    }
+  }
+
   return Status::OK();
 }
 
@@ -799,48 +852,61 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
   if (!stat) {
     return errors::Internal("'stat' cannot be nullptr.");
   }
-  if (stat_cache_->Lookup(fname, stat)) {
-    if (stat->is_directory) {
-      return errors::NotFound(fname, " is a directory.");
-    } else {
-      return Status::OK();
-    }
-  }
   if (object.empty()) {
-    return errors::InvalidArgument("'object' must be a non-empty string.");
+    return errors::InvalidArgument(strings::Printf(
+        "'object' must be a non-empty string. (File: %s)", fname.c_str()));
+  }
+
+  StatCache::ComputeFunc compute_func =
+      [this, &bucket, &object](const string& fname, FileStatistics* stat) {
+        string auth_token;
+        TF_RETURN_IF_ERROR(
+            AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+
+        std::vector<char> output_buffer;
+        std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
+        TF_RETURN_IF_ERROR(request->Init());
+        TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
+            kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object),
+            "?fields=size%2Cupdated")));
+        TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
+        TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+
+        if (dns_cache_) {
+          TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
+        }
+        TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(),
+                                        " when reading metadata of gs://",
+                                        bucket, "/", object);
+
+        StringPiece response_piece =
+            StringPiece(output_buffer.data(), output_buffer.size());
+        Json::Value root;
+        TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
+
+        // Parse file size.
+        TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &(stat->length)));
+
+        // Parse file modification time.
+        string updated;
+        TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
+        TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
+
+        VLOG(1) << "Stat of: gs://" << bucket << "/" << object << " -- "
+                << " length: " << stat->length
+                << "; mtime_nsec: " << stat->mtime_nsec
+                << "; updated: " << updated;
+
+        stat->is_directory = false;
+        return Status::OK();
+      };
+
+  TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(fname, stat, compute_func));
+  if (stat->is_directory) {
+    return errors::NotFound(fname, " is a directory.");
+  } else {
+    return Status::OK();
   }
-
-  string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
-
-  std::vector<char> output_buffer;
-  std::unique_ptr<HttpRequest> request(http_request_factory_->Create());
-  TF_RETURN_IF_ERROR(request->Init());
-  TF_RETURN_IF_ERROR(request->SetUri(strings::StrCat(
-      kGcsUriBase, "b/", bucket, "/o/", request->EscapeString(object),
-      "?fields=size%2Cupdated")));
-  TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
-  TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      request->Send(), " when reading metadata of gs://", bucket, "/", object);
-
-  StringPiece response_piece =
-      StringPiece(output_buffer.data(), output_buffer.size());
-  Json::Value root;
-  TF_RETURN_IF_ERROR(ParseJson(response_piece, &root));
-
-  // Parse file size.
-  TF_RETURN_IF_ERROR(GetInt64Value(root, "size", &(stat->length)));
-
-  // Parse file modification time.
-  string updated;
-  TF_RETURN_IF_ERROR(GetStringValue(root, "updated", &updated));
-  TF_RETURN_IF_ERROR(ParseRfc3339Time(updated, &(stat->mtime_nsec)));
-
-  stat->is_directory = false;
-  stat_cache_->Insert(fname, *stat);
-
-  return Status::OK();
 }
 
 Status GcsFileSystem::BucketExists(const string& bucket, bool* result) {
@@ -872,19 +938,30 @@ Status GcsFileSystem::FolderExists(const string& dirname, bool* result) {
   if (!result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
+  StatCache::ComputeFunc compute_func = [this](const string& dirname,
+                                               FileStatistics* stat) {
+    std::vector<string> children;
+    TF_RETURN_IF_ERROR(
+        GetChildrenBounded(dirname, 1, &children, true /* recursively */,
+                           true /* include_self_directory_marker */));
+    if (!children.empty()) {
+      *stat = DIRECTORY_STAT;
+      return Status::OK();
+    } else {
+      return errors::InvalidArgument("Not a directory!");
+    }
+  };
   FileStatistics stat;
-  if (stat_cache_->Lookup(dirname, &stat)) {
+  Status s = stat_cache_->LookupOrCompute(dirname, &stat, compute_func);
+  if (s.ok()) {
     *result = stat.is_directory;
     return Status::OK();
   }
-  std::vector<string> children;
-  TF_RETURN_IF_ERROR(
-      GetChildrenBounded(dirname, 1, &children, true /* recursively */,
-                         true /* include_self_directory_marker */));
-  if ((*result = !children.empty())) {
-    stat_cache_->Insert(dirname, DIRECTORY_STAT);
+  if (errors::IsInvalidArgument(s)) {
+    *result = false;
+    return Status::OK();
   }
-  return Status::OK();
+  return s;
 }
 
 Status GcsFileSystem::GetChildren(const string& dirname,
@@ -896,33 +973,35 @@ Status GcsFileSystem::GetChildren(const string& dirname,
 
 Status GcsFileSystem::GetMatchingPaths(const string& pattern,
                                        std::vector<string>* results) {
-  if (matching_paths_cache_->Lookup(pattern, results)) {
-    return Status::OK();
-  }
-  results->clear();
-  // Find the fixed prefix by looking for the first wildcard.
-  const string& fixed_prefix =
-      pattern.substr(0, pattern.find_first_of("*?[\\"));
-  const string& dir = io::Dirname(fixed_prefix).ToString();
-  if (dir.empty()) {
-    return errors::InvalidArgument("A GCS pattern doesn't have a bucket name: ",
-                                   pattern);
-  }
-  std::vector<string> all_files;
+  MatchingPathsCache::ComputeFunc compute_func =
+      [this](const string& pattern, std::vector<string>* results) {
+        results->clear();
+        // Find the fixed prefix by looking for the first wildcard.
+        const string& fixed_prefix =
+            pattern.substr(0, pattern.find_first_of("*?[\\"));
+        const string& dir = io::Dirname(fixed_prefix).ToString();
+        if (dir.empty()) {
+          return errors::InvalidArgument(
+              "A GCS pattern doesn't have a bucket name: ", pattern);
+        }
+        std::vector<string> all_files;
+        TF_RETURN_IF_ERROR(GetChildrenBounded(
+            dir, UINT64_MAX, &all_files, true /* recursively */,
+            false /* include_self_directory_marker */));
+
+        const auto& files_and_folders = AddAllSubpaths(all_files);
+
+        // Match all obtained paths to the input pattern.
+        for (const auto& path : files_and_folders) {
+          const string& full_path = io::JoinPath(dir, path);
+          if (Env::Default()->MatchPath(full_path, pattern)) {
+            results->push_back(full_path);
+          }
+        }
+        return Status::OK();
+      };
   TF_RETURN_IF_ERROR(
-      GetChildrenBounded(dir, UINT64_MAX, &all_files, true /* recursively */,
-                         false /* include_self_directory_marker */));
-
-  const auto& files_and_folders = AddAllSubpaths(all_files);
-
-  // Match all obtained paths to the input pattern.
-  for (const auto& path : files_and_folders) {
-    const string& full_path = io::JoinPath(dir, path);
-    if (Env::Default()->MatchPath(full_path, pattern)) {
-      results->push_back(full_path);
-    }
-  }
-  matching_paths_cache_->Insert(pattern, *results);
+      matching_paths_cache_->LookupOrCompute(pattern, results, compute_func));
   return Status::OK();
 }
 
@@ -959,12 +1038,12 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
       uri = strings::StrCat(uri, "&delimiter=%2F");
     }
     if (!object_prefix.empty()) {
-      uri = strings::StrCat(uri, "&prefix=",
-                            request->EscapeString(object_prefix));
+      uri = strings::StrCat(uri,
+                            "&prefix=", request->EscapeString(object_prefix));
     }
     if (!nextPageToken.empty()) {
-      uri = strings::StrCat(uri, "&pageToken=",
-                            request->EscapeString(nextPageToken));
+      uri = strings::StrCat(
+          uri, "&pageToken=", request->EscapeString(nextPageToken));
     }
     if (max_results - retrieved_results < kGetChildrenDefaultPageSize) {
       uri =
@@ -973,6 +1052,11 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
     TF_RETURN_IF_ERROR(request->SetUri(uri));
     TF_RETURN_IF_ERROR(request->AddAuthBearerHeader(auth_token));
     TF_RETURN_IF_ERROR(request->SetResultBuffer(&output_buffer));
+
+    if (dns_cache_) {
+      TF_RETURN_IF_ERROR(dns_cache_->AnnotateRequest(request.get()));
+    }
+
     TF_RETURN_WITH_CONTEXT_IF_ERROR(request->Send(), " when reading ", dirname);
     Json::Value root;
     StringPiece response_piece =
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index 36a1d42fdef728acc1ff4bbe55dd30ace210a762..4b4853c838abb2d2cc1a6cf68877a0dedcbcc15c 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/auth_provider.h"
 #include "tensorflow/core/platform/cloud/expiring_lru_cache.h"
 #include "tensorflow/core/platform/cloud/file_block_cache.h"
+#include "tensorflow/core/platform/cloud/gcs_dns_cache.h"
 #include "tensorflow/core/platform/cloud/http_request.h"
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -141,6 +142,7 @@ class GcsFileSystem : public FileSystem {
   std::unique_ptr<AuthProvider> auth_provider_;
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
   std::unique_ptr<FileBlockCache> file_block_cache_;
+  std::unique_ptr<GcsDnsCache> dns_cache_;
 
   using StatCache = ExpiringLRUCache<FileStatistics>;
   std::unique_ptr<StatCache> stat_cache_;
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 911176365f462c6b3da88d274040c933343adaf9..7614ec4d7f01369eff1b21141818c673154b7542 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -127,12 +127,7 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
            "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
            "Auth Token: fake_token\n"
            "Range: 18-26\n",
-           ""),
-       new FakeHttpRequest(
-           "Uri: https://storage.googleapis.com/bucket/random_access.txt\n"
-           "Auth Token: fake_token\n"
-           "Range: 0-8\n",
-           "012345678")});
+           "")});
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),
       std::unique_ptr<HttpRequest::Factory>(
@@ -182,8 +177,8 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_WithBlockCache) {
               file->Read(20, 10, &result, scratch).code());
     EXPECT_TRUE(result.empty());
 
-    // The beginning of the file has been evicted from the LRU cache.  This will
-    // result in another request. The buffer size is still 15.
+    // The beginning of the file should still be in the LRU cache. There should
+    // not be another request. The buffer size is still 15.
     TF_EXPECT_OK(file->Read(0, 4, &result, scratch));
   }
 
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index f6fd8373cd593da3afdb159640b9cd29fcb795b5..d77f439c5acaa1712ce1f203bafa003aafa6e7c9 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/google_auth_provider.h"
+#ifndef _WIN32
 #include <pwd.h>
-#include <sys/types.h>
 #include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
 #include <fstream>
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/core/platform/cloud/http_request.h b/tensorflow/core/platform/cloud/http_request.h
index 8182b63d5b26ead82125c94c2ceaddc3ff9d394e..02d9e9054ad3b22f3cd15cf7b24d917184db264b 100644
--- a/tensorflow/core/platform/cloud/http_request.h
+++ b/tensorflow/core/platform/cloud/http_request.h
@@ -64,6 +64,14 @@ class HttpRequest {
   /// Sets a request header.
   virtual Status AddHeader(const string& name, const string& value) = 0;
 
+  /// Sets a DNS resolve mapping (to skip DNS resolution).
+  ///
+  /// Note: because GCS is available over HTTPS, we cannot replace the hostname
+  /// in the URI with an IP address, as that will cause the certificate check
+  /// to fail.
+  virtual Status AddResolveOverride(const string& hostname, int64 port,
+                                    const string& ip_addr) = 0;
+
   /// Sets the 'Authorization' header to the value of 'Bearer ' + auth_token.
   virtual Status AddAuthBearerHeader(const string& auth_token) = 0;
 
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index c700b97dc95f85400f9a8c214ea1ccc2b1a3e436..3c2830ccd92acdeaa205063ab4867b0c47d567d4 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -14,9 +14,13 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/platform/cloud/oauth_client.h"
+#ifndef _WIN32
 #include <pwd.h>
 #include <sys/types.h>
 #include <unistd.h>
+#else
+#include <sys/types.h>
+#endif
 #include <fstream>
 #include <openssl/bio.h>
 #include <openssl/evp.h>
diff --git a/tensorflow/core/platform/cloud/time_util.cc b/tensorflow/core/platform/cloud/time_util.cc
index 2f8643f3c7f39c53566d481c078d8f71b44bbedd..0587a65c299778b95ccdec86e03c9f5dca8ec878 100644
--- a/tensorflow/core/platform/cloud/time_util.cc
+++ b/tensorflow/core/platform/cloud/time_util.cc
@@ -18,6 +18,9 @@ limitations under the License.
 #include <cmath>
 #include <cstdio>
 #include <ctime>
+#ifdef _WIN32
+#define timegm _mkgmtime
+#endif
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e4518a8e2fdfd5a4a23c86a4b287b6f9c7183ef8..948334d27ba420097d0ea686153638fc45d63606 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -399,13 +399,13 @@ def tf_env_time_srcs():
 def tf_additional_cupti_wrapper_deps():
   return ["//tensorflow/core/platform/default/gpu:cupti_wrapper"]
 
-def tf_additional_gpu_tracer_srcs():
-  return ["platform/default/gpu_tracer.cc"]
+def tf_additional_device_tracer_srcs():
+  return ["platform/default/device_tracer.cc"]
 
-def tf_additional_gpu_tracer_cuda_deps():
+def tf_additional_device_tracer_cuda_deps():
   return []
 
-def tf_additional_gpu_tracer_deps():
+def tf_additional_device_tracer_deps():
   return []
 
 def tf_additional_libdevice_data():
@@ -436,14 +436,16 @@ def tf_kernel_tests_linkstatic():
   return 0
 
 def tf_additional_lib_defines():
+  """Additional defines needed to build TF libraries."""
   return select({
       "//tensorflow:with_jemalloc_linux_x86_64": ["TENSORFLOW_USE_JEMALLOC"],
       "//tensorflow:with_jemalloc_linux_ppc64le":["TENSORFLOW_USE_JEMALLOC"],
       "//conditions:default": [],
-  })
+  }) + if_not_mobile(["TENSORFLOW_USE_ABSL"])
 
 def tf_additional_lib_deps():
-  return if_static(
+  """Additional dependencies needed to build TF libraries."""
+  return if_not_mobile(["@com_google_absl//absl/base:base"]) + if_static(
       ["@nsync//:nsync_cpp"],
       ["@nsync//:nsync_headers"]
   ) + select({
@@ -456,16 +458,24 @@ def tf_additional_lib_deps():
 
 def tf_additional_core_deps():
   return select({
+      "//tensorflow:with_gcp_support_android_override": [],
+      "//tensorflow:with_gcp_support_ios_override": [],
       "//tensorflow:with_gcp_support": [
           "//tensorflow/core/platform/cloud:gcs_file_system",
       ],
       "//conditions:default": [],
   }) + select({
+      "//tensorflow:with_hdfs_support_windows_override": [],
+      "//tensorflow:with_hdfs_support_android_override": [],
+      "//tensorflow:with_hdfs_support_ios_override": [],
       "//tensorflow:with_hdfs_support": [
           "//tensorflow/core/platform/hadoop:hadoop_file_system",
       ],
       "//conditions:default": [],
   }) + select({
+      "//tensorflow:with_s3_support_windows_override": [],
+      "//tensorflow:with_s3_support_android_override": [],
+      "//tensorflow:with_s3_support_ios_override": [],
       "//tensorflow:with_s3_support": [
           "//tensorflow/core/platform/s3:s3_file_system",
       ],
@@ -475,9 +485,9 @@ def tf_additional_core_deps():
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_op_deps():
   return select({
-      "//tensorflow:windows": [],
-      "//tensorflow:android": [],
-      "//tensorflow:ios": [],
+      "//tensorflow:with_gcp_support_windows_override": [],
+      "//tensorflow:with_gcp_support_android_override": [],
+      "//tensorflow:with_gcp_support_ios_override": [],
       "//tensorflow:with_gcp_support": [
         "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
       ],
@@ -487,9 +497,9 @@ def tf_additional_cloud_op_deps():
 # TODO(jart, jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
   return select({
-      "//tensorflow:windows": [],
-      "//tensorflow:android": [],
-      "//tensorflow:ios": [],
+      "//tensorflow:with_gcp_support_windows_override": [],
+      "//tensorflow:with_gcp_support_android_override": [],
+      "//tensorflow:with_gcp_support_ios_override": [],
       "//tensorflow:with_gcp_support": [
         "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
       ],
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index f746b15fee801d92e5378eef23f975add531e9fb..f2fadb45589a8b44d29db045ca4585b578c5301d 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -12,6 +12,7 @@ load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("@local_config_sycl//sycl:platform.bzl", "sycl_library_path")
+load("@local_config_sycl//sycl:build_defs.bzl", "if_ccpp")
 
 cc_library(
     name = "gtest",
@@ -194,17 +195,16 @@ cc_library(
 
 cc_library(
     name = "sycl",
-    data = [
+    data = if_ccpp([
         "@local_config_sycl//sycl:{}".format(sycl_library_path("ComputeCpp")),
-    ],
-    linkopts = select({
-        "//conditions:default": [
-            "-Wl,-rpath,../local_config_sycl/sycl/lib",
-        ],
-    }),
-    deps = [
-        "@local_config_sycl//sycl:syclrt",
-    ],
+    ]),
+    linkopts = if_ccpp([
+        "-Wl,-rpath,../local_config_sycl/sycl/lib",
+    ]),
+    deps = if_ccpp(
+        ["@local_config_sycl//sycl:syclrt"],
+        ["@local_config_sycl//sycl:sycl_headers"],
+    ),
 )
 
 filegroup(
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index caeed0aa4a32213e490dd0a05adadeff847d14df..09029a4b256beceeb69c735c15bb1587cb1e06ac 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -10,46 +10,51 @@ def tf_sycl_tests_tags():
 
 def tf_additional_plugin_deps():
   return select({
-      "//tensorflow:with_xla_support": ["//tensorflow/compiler/jit"],
+      str(Label("//tensorflow:with_xla_support")): [
+          str(Label("//tensorflow/compiler/jit"))
+      ],
       "//conditions:default": [],
   })
 
 def tf_additional_xla_deps_py():
   return []
 
+def tf_additional_grpc_deps_py():
+  return []
+
 def tf_additional_license_deps():
   return select({
-      "//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"],
+      str(Label("//tensorflow:with_xla_support")): ["@llvm//:LICENSE.TXT"],
       "//conditions:default": [],
   })
 
 def tf_additional_verbs_deps():
   return select({
-      "//tensorflow:with_verbs_support": [
-          "//tensorflow/contrib/verbs:verbs_server_lib",
-          "//tensorflow/contrib/verbs:grpc_verbs_client",
-      ], 
+      str(Label("//tensorflow:with_verbs_support")): [
+          str(Label("//tensorflow/contrib/verbs:verbs_server_lib")),
+          str(Label("//tensorflow/contrib/verbs:grpc_verbs_client")),
+      ],
       "//conditions:default": [],
   })
 
 def tf_additional_mpi_deps():
   return select({
-      "//tensorflow:with_mpi_support": [
-          "//tensorflow/contrib/mpi:mpi_server_lib",
+      str(Label("//tensorflow:with_mpi_support")): [
+          str(Label("//tensorflow/contrib/mpi:mpi_server_lib")),
       ],
       "//conditions:default": [],
   })
 
 def tf_additional_gdr_deps():
   return select({
-      "//tensorflow:with_gdr_support": [
-          "//tensorflow/contrib/gdr:gdr_server_lib",
+      str(Label("//tensorflow:with_gdr_support")): [
+          str(Label("//tensorflow/contrib/gdr:gdr_server_lib")),
       ],
       "//conditions:default": [],
   })
 
 def if_static(extra_deps, otherwise=[]):
   return select({
-      "//tensorflow:framework_shared_object": otherwise,
+      str(Label("//tensorflow:framework_shared_object")): otherwise,
       "//conditions:default": extra_deps,
   })
diff --git a/tensorflow/core/platform/default/gpu_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
similarity index 93%
rename from tensorflow/core/platform/default/gpu_tracer.cc
rename to tensorflow/core/platform/default/device_tracer.cc
index e52e37ad7120c70e2319a591eb94999fdabbd6cb..f4b0f16393d70521386ad49fbf010591e5afb08c 100644
--- a/tensorflow/core/platform/default/gpu_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/gpu_tracer.h"
+#include "tensorflow/core/platform/device_tracer.h"
 
 #if GOOGLE_CUDA
 
@@ -101,7 +101,7 @@ const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
 }  // namespace
 
 namespace tensorflow {
-namespace gputracer {
+namespace devicetracer {
 
 // Forward declaration.
 class CUPTIManager;
@@ -286,14 +286,14 @@ CUPTIManager *GetCUPTIManager() {
 // for the duration of the CUPTI API callback.
 TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
-class GPUTracerImpl : public GPUTracer,
-                      public CUPTIClient,
-                      public port::Tracing::Engine {
+class DeviceTracerImpl : public DeviceTracer,
+                         public CUPTIClient,
+                         public port::Tracing::Engine {
  public:
-  GPUTracerImpl();
-  ~GPUTracerImpl() override;
+  DeviceTracerImpl();
+  ~DeviceTracerImpl() override;
 
-  // GPUTracer interface:
+  // DeviceTracer interface:
   Status Start() override;
   Status Stop() override;
   Status Collect(StepStatsCollector *collector) override;
@@ -319,9 +319,6 @@ class GPUTracerImpl : public GPUTracer,
     // We don't do anything with 'TraceMe' regions yet.
     return nullptr;
   }
-  Tracer *StartTracing(StringPiece label) {
-    return StartTracing(label, /*is_expensive=*/true);
-  }
 
  protected:
   // This callback is used exclusively by CUPTIManager.
@@ -351,7 +348,7 @@ class GPUTracerImpl : public GPUTracer,
   };
 
   // This is the subscriber callback which is invoked directly by CUPTI.
-  // The 'userdata' argument will be a pointer to the active 'GPUTracerImpl'.
+  // The 'userdata' argument will be a pointer to the active 'DeviceTracerImpl'.
   static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
                                    CUpti_CallbackId cbid, const void *cbdata);
 
@@ -378,28 +375,28 @@ class GPUTracerImpl : public GPUTracer,
   uint64_t start_timestamp_ GUARDED_BY(mu_);
   uint64_t end_timestamp_ GUARDED_BY(mu_);
 
-  TF_DISALLOW_COPY_AND_ASSIGN(GPUTracerImpl);
+  TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl);
 };
 
-GPUTracerImpl::GPUTracerImpl() {
-  VLOG(1) << "GPUTracer created.";
+DeviceTracerImpl::DeviceTracerImpl() {
+  VLOG(1) << "DeviceTracer created.";
   cupti_manager_ = GetCUPTIManager();
   CHECK(cupti_manager_);
   cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
   enabled_ = false;
 }
 
-GPUTracerImpl::~GPUTracerImpl() {
+DeviceTracerImpl::~DeviceTracerImpl() {
   // Unregister the CUPTI callbacks if needed to prevent them from accessing
   // freed memory.
   Stop().IgnoreError();
 }
 
-Status GPUTracerImpl::Start() {
-  VLOG(1) << "GPUTracer::Start";
+Status DeviceTracerImpl::Start() {
+  VLOG(1) << "DeviceTracer::Start";
   mutex_lock l(mu_);
   if (enabled_) {
-    return errors::FailedPrecondition("GPUTracer is already enabled.");
+    return errors::FailedPrecondition("DeviceTracer is already enabled.");
   }
   // There can only be one CUPTI subscriber.  If we can't create one then
   // there is another trace in progress (possibly by external code).
@@ -454,8 +451,8 @@ Status GPUTracerImpl::Start() {
   return Status::OK();
 }
 
-Status GPUTracerImpl::Stop() {
-  VLOG(1) << "GPUTracer::Stop";
+Status DeviceTracerImpl::Stop() {
+  VLOG(1) << "DeviceTracer::Stop";
   mutex_lock l(mu_);
   if (!enabled_) {
     return Status::OK();
@@ -469,20 +466,20 @@ Status GPUTracerImpl::Stop() {
   return Status::OK();
 }
 
-void GPUTracerImpl::AddCorrelationId(uint32 correlation_id,
-                                     const string &name) {
+void DeviceTracerImpl::AddCorrelationId(uint32 correlation_id,
+                                        const string &name) {
   VLOG(2) << correlation_id << " : " << name;
   mutex_lock l(trace_mu_);
   if (correlations_.size() >= kMaxRecords) return;
   correlations_.emplace(correlation_id, name);
 }
 
-/*static*/ void GPUTracerImpl::ApiCallback(void *userdata,
-                                           CUpti_CallbackDomain domain,
-                                           CUpti_CallbackId cbid,
-                                           const void *cbdata) {
+/*static*/ void DeviceTracerImpl::ApiCallback(void *userdata,
+                                              CUpti_CallbackDomain domain,
+                                              CUpti_CallbackId cbid,
+                                              const void *cbdata) {
   auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
-  GPUTracerImpl *tracer = reinterpret_cast<GPUTracerImpl *>(userdata);
+  DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
   VLOG(2) << "ApiCallback " << domain << ":" << cbid
           << " func: " << cbInfo->functionName;
 
@@ -536,7 +533,7 @@ void GPUTracerImpl::AddCorrelationId(uint32 correlation_id,
   }
 }
 
-void GPUTracerImpl::ActivityCallback(const CUpti_Activity &record) {
+void DeviceTracerImpl::ActivityCallback(const CUpti_Activity &record) {
   VLOG(2) << "ActivityCallback " << record.kind;
   mutex_lock l(trace_mu_);
   switch (record.kind) {
@@ -573,10 +570,10 @@ void GPUTracerImpl::ActivityCallback(const CUpti_Activity &record) {
   }
 }
 
-Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
+Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
   mutex_lock l(mu_);
   if (enabled_) {
-    return errors::FailedPrecondition("GPUTracer is still enabled.");
+    return errors::FailedPrecondition("DeviceTracer is still enabled.");
   }
 
   // TODO(pbar) Handle device IDs and prefix properly.
@@ -633,10 +630,10 @@ Status GPUTracerImpl::Collect(StepStatsCollector *collector) {
   return Status::OK();
 }
 
-}  // namespace gputracer
+}  // namespace devicetracer
 
-std::unique_ptr<GPUTracer> CreateGPUTracer() {
-  std::unique_ptr<GPUTracer> tracer(new gputracer::GPUTracerImpl());
+std::unique_ptr<DeviceTracer> CreateDeviceTracer() {
+  std::unique_ptr<DeviceTracer> tracer(new devicetracer::DeviceTracerImpl());
   return tracer;
 }
 
@@ -646,7 +643,7 @@ std::unique_ptr<GPUTracer> CreateGPUTracer() {
 
 namespace tensorflow {
 
-std::unique_ptr<GPUTracer> CreateGPUTracer() { return nullptr; }
+std::unique_ptr<DeviceTracer> CreateDeviceTracer() { return nullptr; }
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/default/notification.h b/tensorflow/core/platform/default/notification.h
index 6a214dbd0acb993839039e81b56563e86e08e4db..5c401b74772063b21a0a0e02e0cad5587071ab0c 100644
--- a/tensorflow/core/platform/default/notification.h
+++ b/tensorflow/core/platform/default/notification.h
@@ -73,7 +73,7 @@ class Notification {
   }
 
   mutex mu_;                    // protects mutations of notified_
-  condition_variable cv_;       // signalled when notified_ becomes non-zero
+  condition_variable cv_;       // signaled when notified_ becomes non-zero
   std::atomic<bool> notified_;  // mutations under mu_
 };
 
diff --git a/tensorflow/core/platform/default/thread_annotations.h b/tensorflow/core/platform/default/thread_annotations.h
index c52c2294c71fc2a5f74fadfd28bb45425dc9fdaf..a6aa5b1b5e3e6d2ac507b847ad1455617538bcbc 100644
--- a/tensorflow/core/platform/default/thread_annotations.h
+++ b/tensorflow/core/platform/default/thread_annotations.h
@@ -50,7 +50,7 @@ limitations under the License.
 // a shared variable is guarded by some unspecified mutex, for use in rare
 // cases where a valid mutex expression cannot be specified.
 #define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
-#define GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(guarded)
+#define GUARDED_VAR  // no-op
 
 // Document if the memory location pointed to by a pointer should be guarded
 // by a mutex when dereferencing the pointer.  PT_GUARDED_VAR is analogous to
@@ -60,7 +60,7 @@ limitations under the License.
 // guarded by mu2, q should be annotated as follows:
 //     int *q GUARDED_BY(mu1) PT_GUARDED_BY(mu2);
 #define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x))
-#define PT_GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded)
+#define PT_GUARDED_VAR  // no-op
 
 // Document the acquisition order between locks that can be held
 // simultaneously by a thread. For any two locks that need to be annotated
diff --git a/tensorflow/core/platform/gpu_tracer.h b/tensorflow/core/platform/device_tracer.h
similarity index 69%
rename from tensorflow/core/platform/gpu_tracer.h
rename to tensorflow/core/platform/device_tracer.h
index 3373d974e3815939989b5abd3fa294025082212b..d0f86a51030710cb97d2c962c460eaf87b9931d4 100644
--- a/tensorflow/core/platform/gpu_tracer.h
+++ b/tensorflow/core/platform/device_tracer.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
-#define TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
+#ifndef TENSORFLOW_CORE_PLATFORM_DEVICE_TRACER_H_
+#define TENSORFLOW_CORE_PLATFORM_DEVICE_TRACER_H_
 
 #include <memory>
 
@@ -24,16 +24,16 @@ namespace tensorflow {
 
 class StepStatsCollector;
 
-// 'GPUTracer' is an interface for collecting low-level execution timings
-// of GPU computation and DMA transfers.
+// 'DeviceTracer' is an interface for collecting low-level execution timings
+// of hardware accelerator (e.g. GPU) computation and DMA transfers.
 //
 // Typical usage pattern is as follows:
 //
-// GPUTracer* tracer = CreateGPUTracer();
+// DeviceTracer* tracer = CreateDeviceTracer();
 // if (tracer) {
 //   tracer->Start();
 //
-//   ... perform some GPU computations.
+//   ... perform some computations on a hardware accelerator.
 //
 //   tracer->Stop();
 //
@@ -44,23 +44,23 @@ class StepStatsCollector;
 //
 // Notes:
 // Tracing is not supported on all plaforms.  On platforms
-// with no GPU tracing support, 'CreateGPUTracer' will return 'nullptr'.
-// On most plaforms, GPU tracing will be a system-wide activity and
-// a single 'GPUTracer' will collect activity from all GPUs.
+// with no tracing support, 'CreateDeviceTracer' will return 'nullptr'.
+// On most plaforms, hardware tracing will be a system-wide activity and
+// a single 'DeviceTracer' will collect activity from all devices.
 // It is also common that only a single tracer may be active at any
 // given time.  The 'Start' method will return an error if tracing is
 // already in progress elsewhere.
 //
-class GPUTracer {
+class DeviceTracer {
  public:
-  virtual ~GPUTracer() {}
+  virtual ~DeviceTracer() {}
 
-  // Start GPU tracing.
+  // Start device tracing.
   // Note that only a single trace can be active, in which case this
   // methods will return an 'Unavailable' error.
   virtual Status Start() = 0;
 
-  // Stop GPU tracing.
+  // Stop device tracing.
   // It is safe to call 'Stop' on a tracer which is not enabled.
   virtual Status Stop() = 0;
 
@@ -70,10 +70,10 @@ class GPUTracer {
   virtual Status Collect(StepStatsCollector* collector) = 0;
 };
 
-// Creates a platform-specific GPUTracer.
+// Creates a platform-specific DeviceTracer.
 // Returns 'nullptr' on platforms where tracing is not supported.
-std::unique_ptr<GPUTracer> CreateGPUTracer();
+std::unique_ptr<DeviceTracer> CreateDeviceTracer();
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_PLATFORM_GPU_TRACER_H_
+#endif  // TENSORFLOW_CORE_PLATFORM_DEVICE_TRACER_H_
diff --git a/tensorflow/core/platform/gpu_tracer_test.cc b/tensorflow/core/platform/device_tracer_test.cc
similarity index 84%
rename from tensorflow/core/platform/gpu_tracer_test.cc
rename to tensorflow/core/platform/device_tracer_test.cc
index ce2985fd47c6de819aedd78a047815edb0e29e86..c0c08dabacbcb9fdbbfd9bdbe16bcfaea7328507 100644
--- a/tensorflow/core/platform/gpu_tracer_test.cc
+++ b/tensorflow/core/platform/device_tracer_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/gpu_tracer.h"
+#include "tensorflow/core/platform/device_tracer.h"
 
 #include <map>
 #include <memory>
@@ -50,7 +50,7 @@ std::unique_ptr<Session> CreateSession() {
   return std::unique_ptr<Session>(NewSession(options));
 }
 
-class GPUTracerTest : public ::testing::Test {
+class DeviceTracerTest : public ::testing::Test {
  public:
   void Initialize(std::initializer_list<float> a_values) {
     Graph graph(OpRegistry::Global());
@@ -84,10 +84,10 @@ class GPUTracerTest : public ::testing::Test {
 
  protected:
   void ExpectFailure(const Status& status, error::Code code) {
-    EXPECT_FALSE(status.ok());
+    EXPECT_FALSE(status.ok()) << status.ToString();
     if (!status.ok()) {
       LOG(INFO) << "Status message: " << status.error_message();
-      EXPECT_EQ(code, status.code());
+      EXPECT_EQ(code, status.code()) << status.ToString();
     }
   }
 
@@ -97,22 +97,22 @@ class GPUTracerTest : public ::testing::Test {
   GraphDef def_;
 };
 
-TEST_F(GPUTracerTest, StartStop) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, StartStop) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   TF_EXPECT_OK(tracer->Start());
   TF_EXPECT_OK(tracer->Stop());
 }
 
-TEST_F(GPUTracerTest, StopBeforeStart) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, StopBeforeStart) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   TF_EXPECT_OK(tracer->Stop());
   TF_EXPECT_OK(tracer->Stop());
 }
 
-TEST_F(GPUTracerTest, CollectBeforeStart) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, CollectBeforeStart) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   StepStats stats;
   StepStatsCollector collector(&stats);
@@ -120,8 +120,8 @@ TEST_F(GPUTracerTest, CollectBeforeStart) {
   EXPECT_EQ(stats.dev_stats_size(), 0);
 }
 
-TEST_F(GPUTracerTest, CollectBeforeStop) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, CollectBeforeStop) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
   TF_EXPECT_OK(tracer->Start());
   StepStats stats;
@@ -131,9 +131,9 @@ TEST_F(GPUTracerTest, CollectBeforeStop) {
   TF_EXPECT_OK(tracer->Stop());
 }
 
-TEST_F(GPUTracerTest, StartTwoTracers) {
-  std::unique_ptr<GPUTracer> tracer1(CreateGPUTracer());
-  std::unique_ptr<GPUTracer> tracer2(CreateGPUTracer());
+TEST_F(DeviceTracerTest, StartTwoTracers) {
+  std::unique_ptr<DeviceTracer> tracer1(CreateDeviceTracer());
+  std::unique_ptr<DeviceTracer> tracer2(CreateDeviceTracer());
   if (!tracer1 || !tracer2) return;
 
   TF_EXPECT_OK(tracer1->Start());
@@ -144,9 +144,9 @@ TEST_F(GPUTracerTest, StartTwoTracers) {
   TF_EXPECT_OK(tracer2->Stop());
 }
 
-TEST_F(GPUTracerTest, RunWithTracer) {
-  // On non-GPU platforms, we may not support GPUTracer.
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, RunWithTracer) {
+  // On non-GPU platforms, we may not support DeviceTracer.
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
 
   Initialize({3, 2, -1, 0});
@@ -172,8 +172,8 @@ TEST_F(GPUTracerTest, RunWithTracer) {
   EXPECT_FLOAT_EQ(5.0, mat(0, 0));
 }
 
-TEST_F(GPUTracerTest, TraceToStepStatsCollector) {
-  std::unique_ptr<GPUTracer> tracer(CreateGPUTracer());
+TEST_F(DeviceTracerTest, TraceToStepStatsCollector) {
+  std::unique_ptr<DeviceTracer> tracer(CreateDeviceTracer());
   if (!tracer) return;
 
   Initialize({3, 2, -1, 0});
@@ -198,10 +198,10 @@ TEST_F(GPUTracerTest, TraceToStepStatsCollector) {
   collector.Finalize();
   // Depending on whether this runs on CPU or GPU, we will have a
   // different number of devices.
-  EXPECT_GE(stats.dev_stats_size(), 1);
+  EXPECT_GE(stats.dev_stats_size(), 1) << "Saw stats: " << stats.DebugString();
 }
 
-TEST_F(GPUTracerTest, RunWithTraceOption) {
+TEST_F(DeviceTracerTest, RunWithTraceOption) {
   Initialize({3, 2, -1, 0});
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
diff --git a/tensorflow/core/platform/posix/error.cc b/tensorflow/core/platform/posix/error.cc
index e9baad5422694bb01f8d5e2e61114e723f693bf7..cda6d7d8f9d6ad3e7f2c8fa56cc99a8dbe07fa00 100644
--- a/tensorflow/core/platform/posix/error.cc
+++ b/tensorflow/core/platform/posix/error.cc
@@ -72,7 +72,7 @@ error::Code ErrnoToCode(int err_number) {
     case EBUSY:       // Device or resource busy
     case ECHILD:      // No child processes
     case EISCONN:     // Socket is connected
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
     case ENOTBLK:     // Block device required
 #endif
     case ENOTCONN:    // The socket is not connected
@@ -94,7 +94,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENODATA:  // No message is available on the STREAM read queue
     case ENOMEM:   // Not enough space
     case ENOSR:    // No STREAM resources
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
     case EUSERS:   // Too many users
 #endif
       code = error::RESOURCE_EXHAUSTED;
@@ -111,7 +111,7 @@ error::Code ErrnoToCode(int err_number) {
     case EPFNOSUPPORT:     // Protocol family not supported
 #endif
     case EPROTONOSUPPORT:  // Protocol not supported
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
     case ESOCKTNOSUPPORT:  // Socket type not supported
 #endif
     case EXDEV:            // Improper link
@@ -131,7 +131,8 @@ error::Code ErrnoToCode(int err_number) {
     case ENETUNREACH:   // Network unreachable
     case ENOLCK:        // No locks available
     case ENOLINK:       // Link has been severed
-#if !(defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32))
+#if !(defined(__APPLE__) || defined(__FreeBSD__) || defined(_WIN32) || \
+      defined(__HAIKU__))
     case ENONET:  // Machine is not on the network
 #endif
       code = error::UNAVAILABLE;
@@ -156,7 +157,7 @@ error::Code ErrnoToCode(int err_number) {
     case ENOEXEC:      // Exec format error
     case ENOMSG:       // No message of the desired type
     case EPROTO:       // Protocol error
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__HAIKU__)
     case EREMOTE:      // Object is remote
 #endif
       code = error::UNKNOWN;
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 93a59348c8a5be1d7399f35aad8a4468a03d1f2b..614ee00b0133976e9fe49caf7c75a01194e10237 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -17,11 +17,16 @@ limitations under the License.
 #include "jemalloc/jemalloc.h"
 #endif
 
+#ifdef TENSORFLOW_USE_ABSL
+#include "absl/base/internal/sysinfo.h"
+#endif
+
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/snappy.h"
 #include "tensorflow/core/platform/types.h"
+
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
 #endif
@@ -32,7 +37,8 @@ limitations under the License.
 #ifdef TF_USE_SNAPPY
 #include "snappy.h"
 #endif
-#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__)
+#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) || \
+    defined(__HAIKU__)
 #include <thread>
 #endif
 
@@ -56,7 +62,8 @@ int NumSchedulableCPUs() {
   }
   perror("sched_getaffinity");
 #endif
-#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__)
+#if (defined(__APPLE__) && defined(__MACH__)) || defined(__FreeBSD__) || \
+    defined(__HAIKU__)
   unsigned int count = std::thread::hardware_concurrency();
   if (count > 0) return static_cast<int>(count);
 #endif
@@ -157,8 +164,11 @@ bool Snappy_Uncompress(const char* input, size_t length, char* output) {
 string Demangle(const char* mangled) { return mangled; }
 
 double NominalCPUFrequency() {
-  // TODO(yuefengz): implement it for this platform.
+#ifdef TENSORFLOW_USE_ABSL
+  return absl::base_internal::NominalCPUFrequency();
+#else
   return 1.0;
+#endif
 }
 
 }  // namespace port
diff --git a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
index fb1955edde2abfd3fe5267e1319ea128138ee092..12dc9c58b38d01f6efc5644193fbf38b0e70c8d1 100644
--- a/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
+++ b/tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.cc
@@ -118,9 +118,10 @@ int64 AndroidArmV7ACpuUtilsHelper::ReadCpuFrequencyFile(
   const int retval = fscanf(fp, "%lld", &freq_in_khz);
   if (retval < 0) {
     LOG(WARNING) << "Failed to \"" << file_path << "\"";
+    fclose(fp);
     return INVALID_CPU_FREQUENCY;
   }
-  pclose(fp);
+  fclose(fp);
   return freq_in_khz * 1000;  // The file contains cpu frequency in khz
 }
 
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 51c85592bf43bdfb68c4ba90d19d28582560d6d4..682ad97eec3b3ffd0c69120e5de359ee50c9048e 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
 
 #include <aws/core/Aws.h>
@@ -38,7 +38,7 @@ static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
 static const int kS3GetChildrenMaxKeys = 100;
 
 Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
-  static mutex cfg_lock;
+  static mutex cfg_lock(LINKER_INITIALIZED);
   static bool init(false);
   static Aws::Client::ClientConfiguration cfg;
 
@@ -49,9 +49,15 @@ Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
     if (endpoint) {
       cfg.endpointOverride = Aws::String(endpoint);
     }
-    const char* region = getenv("S3_REGION");
+    const char* region = getenv("AWS_REGION");
     if (region) {
       cfg.region = Aws::String(region);
+    } else {
+      // TODO (yongtang): `S3_REGION` should be deprecated after 2.0.
+      const char* region = getenv("S3_REGION");
+      if (region) {
+        cfg.region = Aws::String(region);
+      }
     }
     const char* use_https = getenv("S3_USE_HTTPS");
     if (use_https) {
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index bb8e902efc25420ce1b7beb00a1911500c627a00..8f7bff1bb020ee501c982d5d0761d36537993e63 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -174,6 +174,14 @@ class Tracing::Engine {
   virtual Tracer* StartTracing(string&& label, bool is_expensive) {
     return StartTracing(StringPiece(label), is_expensive);
   }
+
+  // Backwards compatibility one arg variants (assume is_expensive=true).
+  Tracer* StartTracing(StringPiece label) {
+    return StartTracing(label, /*is_expensive=*/true);
+  }
+  Tracer* StartTracing(string&& label) {
+    return StartTracing(StringPiece(label), /*is_expensive=*/true);
+  }
 };
 
 // This class permits a user to apply annotation on kernels and memcpys
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 8ca26fa5dcf944cdd2c106233324c03f38f7a13f..9e628b10651423a7ce05392e675453c87f8b6c8c 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -48,7 +48,7 @@ bazel-bin/tensorflow/python/profiler/profiler_ui \
 # Create options to profile the time and memory information.
 builder = tf.profiler.ProfileOptionBuilder
 opts = builder(builder.time_and_memory()).order_by('micros').build()
-# Create a profiling context, set constructor argument `trace_steps`, 
+# Create a profiling context, set constructor argument `trace_steps`,
 # `dump_steps` to empty for explicit control.
 with tf.contrib.tfprof.ProfileContext('/tmp/train_dir',
                                       trace_steps=[],
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
index 4c73e372e3bd9f24c83bdc0d3b8d98b5f8b03f11..dd12f76d6fa9a71b78a672a687b96a985641283b 100644
--- a/tensorflow/core/profiler/g3doc/options.md
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -60,11 +60,14 @@ Currently, profiler only tracks the allocation of memory. As a result, the
 accumulated memory request is uaually larger than the peak memory of the overall
 model.
 
-bytes: The memory allocations requested by the operation.
-peak_bytes: The peak requested memory (not de-allocated) by the operation.
-residual_bytes: The memory requested by the operation and not de-allocated
+It's recommended to generate timeline to see the allocator memory usage over
+time.
+
+`bytes`: The memory allocations requested by the operation.
+`peak_bytes`: The peak requested memory (not de-allocated) by the operation.
+`residual_bytes`: The memory requested by the operation and not de-allocated
                 when Compute finishes.
-output_bytes: The memory output by the operation. It's not necessarily requested
+`output_bytes`: The memory output by the operation. It's not necessarily requested
               by the current operation. For example, it can be a tensor
               forwarded from input to output, with in-place mutation.
 
diff --git a/tensorflow/core/profiler/g3doc/profiler_ui.jpg b/tensorflow/core/profiler/g3doc/profiler_ui.jpg
index 36aa94502a8c3de7915fb0e388c861cd706c3af8..77346e61ae971725e163c561a813bb6c0153ad89 100644
Binary files a/tensorflow/core/profiler/g3doc/profiler_ui.jpg and b/tensorflow/core/profiler/g3doc/profiler_ui.jpg differ
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 671b65d708f57713d984331de73ddf305675b792..2945c9510f1c91474a0a998541e394143a0490be 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -80,10 +80,15 @@ void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
 
 void ExecStep::AddMemoryStats(const string& dev,
                               const NodeExecStats& step_stat) {
-  if (exec_.memory_intialized()) {
+  ExecMemory exec_mem;
+  if (step_stat.all_start_micros() > 0) {
+    exec_mem.set_memory_micros(step_stat.all_start_micros() +
+                               step_stat.op_end_rel_micros());
+  } else {
+    fprintf(stderr, "%s has no start time, skipping\n",
+            step_stat.node_name().c_str());
     return;
   }
-  exec_.set_memory_intialized(true);
 
   int accelerator_allocator_cnt = 0;
   for (const auto& mem : step_stat.memory()) {
@@ -93,14 +98,12 @@ void ExecStep::AddMemoryStats(const string& dev,
       continue;
     }
     ++accelerator_allocator_cnt;
-    exec_.set_allocator_bytes_in_use(
-        std::max(static_cast<int64>(exec_.allocator_bytes_in_use()),
+    exec_mem.set_allocator_bytes_in_use(
+        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                  static_cast<int64>(mem.allocator_bytes_in_use())));
-    Allocation allocation;
     for (const auto& alloc : mem.allocation_records()) {
-      allocation.add_allocation_records()->MergeFrom(alloc);
+      allocations_.push_back(alloc);
     }
-    allocations_.push_back(allocation);
   }
   if (accelerator_allocator_cnt > 1) {
     fprintf(stderr, "found %d gpu allocator for 1 node\n",
@@ -121,24 +124,47 @@ void ExecStep::AddMemoryStats(const string& dev,
       uint64 output_ptr =
           output.tensor_description().allocation_description().ptr();
       total_output_bytes += output_bytes;
-      output_memory_[output.slot()] = std::make_pair(output_bytes, output_ptr);
+
+      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
+      mem.set_ptr(output_ptr);
+      mem.set_bytes(output_bytes);
     }
   }
-  exec_.set_output_bytes(total_output_bytes);
+  exec_mem.set_output_bytes(total_output_bytes);
 
   if (step_stat.has_memory_stats()) {
-    exec_.set_host_temp_bytes(exec_.host_temp_bytes() +
-                              step_stat.memory_stats().host_temp_memory_size());
-    exec_.set_host_persistent_bytes(
-        exec_.host_persistent_bytes() +
+    exec_mem.set_host_temp_bytes(
+        exec_mem.host_temp_bytes() +
+        step_stat.memory_stats().host_temp_memory_size());
+    exec_mem.set_host_persistent_bytes(
+        exec_mem.host_persistent_bytes() +
         step_stat.memory_stats().host_persistent_memory_size());
-    exec_.set_accelerator_temp_bytes(
-        exec_.accelerator_temp_bytes() +
+    exec_mem.set_accelerator_temp_bytes(
+        exec_mem.accelerator_temp_bytes() +
         step_stat.memory_stats().device_temp_memory_size());
-    exec_.set_accelerator_persistent_bytes(
-        exec_.accelerator_persistent_bytes() +
+    exec_mem.set_accelerator_persistent_bytes(
+        exec_mem.accelerator_persistent_bytes() +
         step_stat.memory_stats().device_persistent_memory_size());
   }
+
+  // TODO(xpan): Make this more accurate:
+  // High level: Memory tracking is suspicous and requires large scale
+  // clean up.
+  // Investigte the memory usage difference between CPU/GPU with OpViewTest.
+  //
+  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
+  //    discuss OpKernelContext-related allocations.
+  // 2. allocate_output calls allocate_tensor, which is properly tracked in
+  //    'NodeExecStats.memory'.
+  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
+  //    in 'NodeExecStats.memory_stats'.
+  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
+  //    in 'NodeExecStats.memory'. However, there is no way to count it as
+  //    persistent now.
+  // 5. record_xxx_persistent is called when allocate_persistent
+  //    is not used and hence tracks some complementary bytes. It appears in
+  //    'NodeExecStats.memory_stats'. It's suspicious. But we should
+  //    use it now since it covers constant op.
   int64 residual_bytes = 0;
   int64 requested_bytes = 0;
   int64 peak_bytes = 0;
@@ -147,9 +173,20 @@ void ExecStep::AddMemoryStats(const string& dev,
     requested_bytes += mem.total_bytes();
     peak_bytes += mem.peak_bytes();
   }
-  exec_.set_requested_bytes(requested_bytes);
-  exec_.set_residual_bytes(residual_bytes);
-  exec_.set_peak_bytes(peak_bytes);
+  residual_bytes += exec_mem.host_persistent_bytes() +
+                    exec_mem.accelerator_persistent_bytes();
+  requested_bytes += exec_mem.host_persistent_bytes() +
+                     exec_mem.accelerator_persistent_bytes() +
+                     exec_mem.host_temp_bytes() +
+                     exec_mem.accelerator_temp_bytes();
+  peak_bytes += exec_mem.host_persistent_bytes() +
+                exec_mem.accelerator_persistent_bytes() +
+                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();
+
+  exec_mem.set_requested_bytes(requested_bytes);
+  exec_mem.set_residual_bytes(residual_bytes);
+  exec_mem.set_peak_bytes(peak_bytes);
+  memory_execs_.emplace_back(exec_mem);
 }
 
 void TFGraphNode::AddStepStat(int64 step, const string& device,
@@ -251,5 +288,8 @@ bool IsPlacedOnAccelerator(const string& device) {
   return device.find("gpu") != device.npos ||
          device.find("sycl") != device.npos;
 }
+bool IsPlacedOnCPU(const string& device) {
+  return device.find("cpu") != device.npos;
+}
 }  // namespace tfprof
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h
index e2d0563a0747d7bec74ce3aeb9d5995f47cff915..5bc2ea3c42210991a01aea1ea731aa3b4da83acc 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.h
+++ b/tensorflow/core/profiler/internal/tfprof_node.h
@@ -109,7 +109,6 @@ class ExecStep {
       const {
     return cpu_execs_;
   }
-
   int64 all_start_micros() const { return exec_.all_start_micros(); }
   int64 latest_end_micros() const { return exec_.latest_end_micros(); }
   int64 lastest_schedule_end_micros() const {
@@ -121,27 +120,73 @@ class ExecStep {
     }
     return ret;
   }
-
-  int64 requested_bytes() const { return exec_.requested_bytes(); }
-  int64 peak_bytes() const { return exec_.peak_bytes(); }
-  int64 residual_bytes() const { return exec_.residual_bytes(); }
-  int64 output_bytes() const { return exec_.output_bytes(); }
+  int64 requested_bytes() const {
+    int64 requested_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      requested_bytes += exec.requested_bytes();
+    }
+    return requested_bytes;
+  }
+  int64 peak_bytes() const {
+    int64 peak_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      peak_bytes += exec.peak_bytes();
+    }
+    return peak_bytes;
+  }
+  int64 residual_bytes() const {
+    int64 residual_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      residual_bytes += exec.residual_bytes();
+    }
+    return residual_bytes;
+  }
+  int64 output_bytes() const {
+    int64 output_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      output_bytes += exec.output_bytes();
+    }
+    return output_bytes;
+  }
   int64 accelerator_temp_bytes() const {
-    return exec_.accelerator_temp_bytes();
+    int64 accelerator_temp_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      accelerator_temp_bytes += exec.accelerator_temp_bytes();
+    }
+    return accelerator_temp_bytes;
+  }
+  int64 host_temp_bytes() const {
+    int64 host_temp_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      host_temp_bytes += exec.host_temp_bytes();
+    }
+    return host_temp_bytes;
   }
-  int64 host_temp_bytes() const { return exec_.host_temp_bytes(); }
   int64 accelerator_persistent_bytes() const {
-    return exec_.accelerator_persistent_bytes();
+    int64 accelerator_persistent_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      accelerator_persistent_bytes += exec.accelerator_persistent_bytes();
+    }
+    return accelerator_persistent_bytes;
   }
-  int64 host_persistent_bytes() const { return exec_.host_persistent_bytes(); }
-  const std::map<int32, std::pair<int64, uint64>>& output_memory() const {
-    return output_memory_;
+  int64 host_persistent_bytes() const {
+    int64 host_persistent_bytes = 0;
+    for (const ExecMemory& exec : memory_execs_) {
+      host_persistent_bytes += exec.host_persistent_bytes();
+    }
+    return host_persistent_bytes;
   }
-  int64 allocator_bytes_in_use() const {
-    return exec_.allocator_bytes_in_use();
+  std::map<int64, int64> allocator_bytes_in_use() const {
+    std::map<int64, int64> bytes_in_use;
+    for (const ExecMemory& exec : memory_execs_) {
+      bytes_in_use[exec.memory_micros()] = exec.allocator_bytes_in_use();
+    }
+    return bytes_in_use;
   }
 
-  const std::vector<Allocation>& allocations() const { return allocations_; }
+  const std::vector<AllocationRecord>& allocations() const {
+    return allocations_;
+  }
 
   const ExecProfile& ToProto() {
     exec_.mutable_accelerator_execs()->clear();
@@ -169,19 +214,15 @@ class ExecStep {
     for (const string& d : devices_) {
       exec_.add_devices(d);
     }
-
-    exec_.mutable_output_memory()->clear();
-    for (const auto& mem : output_memory_) {
-      auto& mem_pb = (*exec_.mutable_output_memory())[mem.first];
-      mem_pb.set_bytes(mem.second.first);
-      mem_pb.set_ptr(mem.second.second);
-    }
-
     exec_.mutable_allocations()->Clear();
     for (const auto& r : allocations_) {
       exec_.add_allocations()->MergeFrom(r);
     }
 
+    exec_.mutable_memory_execs()->Clear();
+    for (const auto& m : memory_execs_) {
+      exec_.add_memory_execs()->MergeFrom(m);
+    }
     return exec_;
   }
 
@@ -197,6 +238,7 @@ class ExecStep {
     op_execs_.clear();
 
     allocations_.clear();
+    memory_execs_.clear();
 
     for (const auto& exec_time : exec_.accelerator_execs()) {
       auto& exec = accelerator_execs_[exec_time.first];
@@ -214,15 +256,12 @@ class ExecStep {
         op_exec.push_back(std::make_pair(p.int64_values(0), p.int64_values(1)));
       }
     }
-    for (const auto& output_mem : exec_.output_memory()) {
-      auto& mem = output_memory_[output_mem.first];
-      mem.first = output_mem.second.bytes();
-      mem.second = output_mem.second.ptr();
-    }
-
     for (const auto& r : exec_.allocations()) {
       allocations_.push_back(r);
     }
+    for (const auto& m : exec_.memory_execs()) {
+      memory_execs_.push_back(m);
+    }
   }
 
  private:
@@ -237,14 +276,15 @@ class ExecStep {
   std::map<string, std::vector<std::pair<int64, int64>>> cpu_execs_;
   // combines accelerator_execs_ and cpu_execs_.
   std::map<string, std::vector<std::pair<int64, int64>>> op_execs_;
+  // Each ExecMemory corresponds to one scheduling of the op. Normally,
+  // there are multiple schedulings in while_loop.
+  std::vector<ExecMemory> memory_execs_;
   // All devices the op is associated with (e.g. gpu:0 (scheduling),
   // gpu:0:stream:xx (kernel exec), cpu:0 host)
   std::set<string> devices_;
-  // output_idx -> {output_bytes, memory_ptr}
-  std::map<int32, std::pair<int64, uint64>> output_memory_;
 
   // The history of accelerator allocations and deallocations of this step.
-  std::vector<Allocation> allocations_;
+  std::vector<AllocationRecord> allocations_;
 };
 
 #define GRAPH_NODE_BYTES(type)             \
@@ -593,34 +633,20 @@ class TFGraphNode {
   int64 accelerator_persistent_bytes() const {
     int64 persistent_bytes = 0;
     for (const auto& exec : execs_) {
-      persistent_bytes += exec.second.accelerator_persistent_bytes();
+      persistent_bytes = std::max(persistent_bytes,
+                                  exec.second.accelerator_persistent_bytes());
     }
     return persistent_bytes;
   }
-  int64 host_persistent_bytes(int64 step) const {
+  const std::map<int64, int64> allocator_bytes_in_use(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) {
-      return 0;
-    }
-    return exec->second.host_persistent_bytes();
-  }
-  const std::map<int32, std::pair<int64, uint64>>& output_memory(
-      int64 step) const {
-    auto exec = execs_.find(step);
-    if (exec == execs_.end()) {
-      return empty_output_memory_;
-    }
-    return exec->second.output_memory();
-  }
-  int64 allocator_bytes_in_use(int64 step) const {
-    auto exec = execs_.find(step);
-    if (exec == execs_.end()) {
-      return 0;
+      return empty_bytes_in_use_;
     }
     return exec->second.allocator_bytes_in_use();
   }
 
-  const std::vector<Allocation>& allocations(int64 step) const {
+  const std::vector<AllocationRecord>& allocations(int64 step) const {
     auto exec = execs_.find(step);
     if (exec == execs_.end()) {
       return empty_allocations_;
@@ -725,9 +751,9 @@ class TFGraphNode {
   std::map<int64, ExecStep> execs_;
 
   // Placeholder for empty cases.
-  std::map<int32, std::pair<int64, uint64>> empty_output_memory_;
+  std::map<int64, int64> empty_bytes_in_use_;
   std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_;
-  std::vector<Allocation> empty_allocations_;
+  std::vector<AllocationRecord> empty_allocations_;
 };
 
 class TFMultiGraphNode {
@@ -880,6 +906,7 @@ class TFMultiGraphNode {
   std::map<string, const TFGraphNode*> nodes_;
 };
 
+bool IsPlacedOnCPU(const string& device);
 bool IsPlacedOnAccelerator(const string& device);
 bool CountAsAcceleratorTime(const string& device);
 bool CountAsCPUTime(const string& device);
diff --git a/tensorflow/core/profiler/internal/tfprof_op.cc b/tensorflow/core/profiler/internal/tfprof_op.cc
index c04b0ea0c62b83ec2cff177f2eb1cc6d5e5d21c4..5a8429d4893effc8bbfa0bf69e18b4a182e9a5df 100644
--- a/tensorflow/core/profiler/internal/tfprof_op.cc
+++ b/tensorflow/core/profiler/internal/tfprof_op.cc
@@ -109,7 +109,6 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     fprintf(stderr, "Only 'code' view supports pprof output now.\n");
     return root_.get();
   }
-
   if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
     root_->formatted_str = FormatNode(root_.get(), root_.get(), opts);
   }
@@ -130,7 +129,6 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     nodes.push_back(n.second.get());
   }
   nodes = SortNodes(nodes, opts);
-
   // pre keeps track of previous visited node.
   OpNode* pre = nullptr;
   std::vector<OpNode*> account_nodes;
@@ -166,10 +164,6 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
       (*it)->AddSelfToTotalStats();
       if (pre) (*it)->AggregateTotalStats(pre);
     }
-    if (pre) {
-      (*it)->mutable_proto()->add_children()->MergeFrom(pre->proto());
-      pre->mutable_proto()->clear_children();
-    }
     pre = *it;
   }
   if (opts.account_displayed_op_only) {
@@ -178,11 +172,6 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
       root_->AggregateTotalStats(pre);
     }
   }
-  if (pre) {
-    root_->mutable_proto()->add_children()->MergeFrom(pre->proto());
-    pre->mutable_proto()->clear_children();
-  }
-
   if (opts.output_type == kOutput[1] || opts.output_type == kOutput[2]) {
     string display_str = FormatLegend(opts);
     for (OpNode* node : show_nodes) {
@@ -192,6 +181,13 @@ const ShowMultiNode* TFOp::ShowInternal(const Options& opts,
     // TODO(xpan): Is it the right choice?
     root_->formatted_str = display_str;
   }
+  // Populate the chidren field.
+  auto* pre_pb = root_->mutable_proto();
+  for (auto& show_node : show_nodes) {
+    pre_pb->clear_children();
+    pre_pb->add_children()->Swap(show_node->mutable_proto());
+    pre_pb = pre_pb->mutable_children(0);
+  }
   return root_.get();
 }
 
diff --git a/tensorflow/core/profiler/internal/tfprof_show_test.cc b/tensorflow/core/profiler/internal/tfprof_show_test.cc
index 1f19f8c322a15a726ce354ecf991ea902788d97b..98773ae19ea424fc1d3ca01572d9535367a41321 100644
--- a/tensorflow/core/profiler/internal/tfprof_show_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_show_test.cc
@@ -105,12 +105,13 @@ TEST_F(TFProfShowTest, DumpScopeMode) {
       "node name | # parameters | # float_ops | requested bytes | peak bytes | "
       "residual bytes | output bytes | total execution time | accelerator "
       "execution time | cpu execution time\n_TFProfRoot (--/451 params, --/0 "
-      "flops, --/0B, --/0B, --/0B, --/2.56KB, --/13us, --/0us, --/13us)\n  DW "
-      "(3x3x3x6, 162/162 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
-      "1.28KB/1.28KB, 2us/2us, 0us/0us, 2us/2us)\n  DW2 (2x2x6x12, 288/288 "
-      "params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, 1.28KB/1.28KB, 11us/11us, "
-      "0us/0us, 11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, "
-      "0B/0B, 0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
+      "flops, --/2.56KB, --/2.56KB, --/2.56KB, --/2.56KB, --/13us, --/0us, "
+      "--/13us)\n  DW (3x3x3x6, 162/162 params, 0/0 flops, 1.28KB/1.28KB, "
+      "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 2us/2us, 0us/0us, "
+      "2us/2us)\n  DW2 (2x2x6x12, 288/288 params, 0/0 flops, 1.28KB/1.28KB, "
+      "1.28KB/1.28KB, 1.28KB/1.28KB, 1.28KB/1.28KB, 11us/11us, 0us/0us, "
+      "11us/11us)\n  ScalarW (1, 1/1 params, 0/0 flops, 0B/0B, 0B/0B, 0B/0B, "
+      "0B/0B, 0us/0us, 0us/0us, 0us/0us)\n",
       dump_str);
 
   EXPECT_EQ(dump_str, TestToFromProto("scope", opts));
@@ -178,22 +179,22 @@ TEST_F(TFProfShowTest, DumpOpMode) {
   EXPECT_EQ(
       "nodename|requestedbytes|totalexecutiontime|acceleratorexecutiontime|"
       "cpuexecutiontime|#parameters|#float_ops|opoccurrence(run|defined)|"
-      "inputshapes\nVariableV20B(0.00%,0.00%),13us(100.00%,0.26%),0us(100.00%,"
-      "0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_ops(100.00%"
-      ",0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:13us\n\nAdd0B("
-      "0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99.71%,0.00%),"
-      "0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_type:0:1,"
-      "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,\t1:1\t("
-      "run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t(run*0|"
-      "defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0.00%),"
-      "0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_ops("
-      "100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
+      "inputshapes\nVariableV22.56KB(100.00%,8.40%),13us(100.00%,0.26%),0us("
+      "100.00%,0.00%),13us(100.00%,0.29%),451params(100.00%,100.00%),0float_"
+      "ops(100.00%,0.00%),2|3\n\ninput_type:\t(run*2|defined*3)\texec_time:"
+      "13us\n\nAdd0B(0.00%,0.00%),0us(99.74%,0.00%),0us(100.00%,0.00%),0us(99."
+      "71%,0.00%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),0|3\n\ninput_"
+      "type:0:1,\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:2x2x6x12,"
+      "\t1:1\t(run*0|defined*1)\texec_time:0us\ninput_type:0:3x3x3x6,\t1:1\t("
+      "run*0|defined*1)\texec_time:0us\n\nAssign0B(0.00%,0.00%),0us(99.74%,0."
+      "00%),0us(100.00%,0.00%),0us(99.71%,0.00%),0params(0.00%,0.00%),0float_"
+      "ops(100.00%,0.00%),0|3\n\ninput_type:0:1,\t1:1\t(run*0|defined*1)\texec_"
       "time:0us\ninput_type:0:2x2x6x12,\t1:2x2x6x12\t(run*0|defined*1)\texec_"
       "time:0us\ninput_type:0:3x3x3x6,\t1:3x3x3x6\t(run*0|defined*1)\texec_"
       "time:0us\n\nConst0B(0.00%,0.00%),2us(99.74%,0.04%),0us(100.00%,0.00%),"
       "2us(99.71%,0.04%),0params(0.00%,0.00%),0float_ops(100.00%,0.00%),1|"
-      "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D14.59KB("
-      "100.00%,100.00%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
+      "10\n\ninput_type:\t(run*1|defined*10)\texec_time:2us\n\nConv2D27.90KB("
+      "91.60%,91.60%),4.89ms(99.70%,98.87%),404us(100.00%,100.00%),4.49ms(99."
       "67%,98.77%),0params(0.00%,0.00%),10.44kfloat_ops(100.00%,100.00%),2|"
       "2\n\ninput_type:0:2x3x3x6,\t1:2x2x6x12\t(run*1|defined*1)\texec_time:"
       "597us\ninput_type:0:2x6x6x3,\t1:3x3x3x6\t(run*1|defined*1)\texec_time:4."
diff --git a/tensorflow/core/profiler/internal/tfprof_stats_test.cc b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
index 2f2101d76bfd4c0741fff0eb9762444cd8b6fd92..b86a83cb1bb5fd42437692ea9aec240275c26ed8 100644
--- a/tensorflow/core/profiler/internal/tfprof_stats_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_stats_test.cc
@@ -89,21 +89,27 @@ TEST_F(TFProfStatsTest, CustomOpType) {
 
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
-      "451\nchildren {\n  name: \"DW\"\n  exec_micros: 2\n  parameters: 162\n  "
-      "total_exec_micros: 2\n  total_parameters: 162\n  devices: "
+      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+      "2560\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  exec_micros: "
+      "2\n  requested_bytes: 1280\n  parameters: 162\n  total_exec_micros: 2\n "
+      " total_requested_bytes: 1280\n  total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 2\n  "
       "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 11\n  parameters: "
-      "288\n  total_exec_micros: 11\n  total_parameters: 288\n  devices: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 11\n  requested_bytes: 1280\n  parameters: 288\n  "
+      "total_exec_micros: 11\n  total_requested_bytes: 1280\n  "
+      "total_parameters: 288\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 11\n  "
       "total_cpu_exec_micros: 11\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"ScalarW\"\n  parameters: 1\n  "
-      "total_parameters: 1\n  total_definition_count: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"ScalarW\"\n  "
+      "parameters: 1\n  total_parameters: 1\n  total_definition_count: "
       "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
-      "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+      "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+      "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -119,21 +125,27 @@ TEST_F(TFProfStatsTest, CheckPointOpType) {
 
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
-      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_parameters: "
-      "451\nchildren {\n  name: \"DW\"\n  exec_micros: 2\n  parameters: 162\n  "
-      "total_exec_micros: 2\n  total_parameters: 162\n  devices: "
+      "name: \"_TFProfRoot\"\ntotal_exec_micros: 13\ntotal_requested_bytes: "
+      "2560\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  exec_micros: "
+      "2\n  requested_bytes: 1280\n  parameters: 162\n  total_exec_micros: 2\n "
+      " total_requested_bytes: 1280\n  total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 2\n  "
       "total_cpu_exec_micros: 2\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"DW2\"\n  exec_micros: 11\n  parameters: "
-      "288\n  total_exec_micros: 11\n  total_parameters: 288\n  devices: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"DW2\"\n  "
+      "exec_micros: 11\n  requested_bytes: 1280\n  parameters: 288\n  "
+      "total_exec_micros: 11\n  total_requested_bytes: 1280\n  "
+      "total_parameters: 288\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  cpu_exec_micros: 11\n  "
       "total_cpu_exec_micros: 11\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  output_bytes: 1280\n  total_output_bytes: "
-      "1280\n}\nchildren {\n  name: \"ScalarW\"\n  parameters: 1\n  "
-      "total_parameters: 1\n  total_definition_count: "
+      "total_definition_count: 1\n  peak_bytes: 1280\n  residual_bytes: 1280\n "
+      " output_bytes: 1280\n  total_peak_bytes: 1280\n  total_residual_bytes: "
+      "1280\n  total_output_bytes: 1280\n}\nchildren {\n  name: \"ScalarW\"\n  "
+      "parameters: 1\n  total_parameters: 1\n  total_definition_count: "
       "1\n}\ntotal_cpu_exec_micros: 13\ntotal_run_count: "
-      "2\ntotal_definition_count: 3\ntotal_output_bytes: 2560\n",
+      "2\ntotal_definition_count: 3\ntotal_peak_bytes: "
+      "2560\ntotal_residual_bytes: 2560\ntotal_output_bytes: 2560\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -150,7 +162,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: "
       "\"DW/Initializer/random_normal/mul\"\n  children {\n    name: "
       "\"DW/Initializer/random_normal/RandomStandardNormal\"\n    children {\n "
       "     name: \"DW/Initializer/random_normal/shape\"\n      "
@@ -166,7 +178,7 @@ TEST_F(TFProfStatsTest, TestGraph) {
       "4\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
       "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
       "6\ntotal_definition_count: 32\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -181,9 +193,9 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: \"Conv2D\"\n  "
-      "exec_micros: 4292\n  requested_bytes: 9472\n  total_exec_micros: 4292\n "
-      " total_requested_bytes: 9472\n  devices: "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: \"Conv2D\"\n  "
+      "exec_micros: 4292\n  requested_bytes: 18176\n  total_exec_micros: "
+      "4292\n  total_requested_bytes: 18176\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 5832\n  "
       "total_float_ops: 5832\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 6\n      "
@@ -194,11 +206,11 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "6\n      }\n    }\n  }\n  accelerator_exec_micros: 226\n  "
       "cpu_exec_micros: 4066\n  total_accelerator_exec_micros: 226\n  "
       "total_cpu_exec_micros: 4066\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 5888\n  residual_bytes: 768\n  "
-      "output_bytes: 768\n  total_peak_bytes: 5888\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 14592\n  residual_bytes: 768\n "
+      " output_bytes: 768\n  total_peak_bytes: 14592\n  total_residual_bytes: "
       "768\n  total_output_bytes: 768\n}\nchildren {\n  name: \"Conv2D_1\"\n  "
-      "exec_micros: 597\n  requested_bytes: 5120\n  total_exec_micros: 597\n  "
-      "total_requested_bytes: 5120\n  devices: "
+      "exec_micros: 597\n  requested_bytes: 9728\n  total_exec_micros: 597\n  "
+      "total_requested_bytes: 9728\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 4608\n  "
       "total_float_ops: 4608\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 3\n      "
@@ -209,12 +221,12 @@ TEST_F(TFProfStatsTest, TestFloatOps) {
       "12\n      }\n    }\n  }\n  accelerator_exec_micros: 178\n  "
       "cpu_exec_micros: 419\n  total_accelerator_exec_micros: 178\n  "
       "total_cpu_exec_micros: 419\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 4096\n  residual_bytes: 512\n  "
-      "output_bytes: 512\n  total_peak_bytes: 4096\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 8704\n  residual_bytes: 512\n  "
+      "output_bytes: 512\n  total_peak_bytes: 8704\n  total_residual_bytes: "
       "512\n  total_output_bytes: 512\n}\ntotal_float_ops: "
       "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
       "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -231,9 +243,9 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 597\ntotal_requested_bytes: "
-      "5120\nchildren {\n  name: \"Conv2D_1\"\n  exec_micros: 597\n  "
-      "requested_bytes: 5120\n  total_exec_micros: 597\n  "
-      "total_requested_bytes: 5120\n  devices: "
+      "9728\nchildren {\n  name: \"Conv2D_1\"\n  exec_micros: 597\n  "
+      "requested_bytes: 9728\n  total_exec_micros: 597\n  "
+      "total_requested_bytes: 9728\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  float_ops: 4608\n  "
       "total_float_ops: 4608\n  input_shapes {\n    key: 0\n    value {\n      "
       "dim {\n        size: 2\n      }\n      dim {\n        size: 3\n      "
@@ -244,12 +256,12 @@ TEST_F(TFProfStatsTest, TestAccountShownNameOnly) {
       "12\n      }\n    }\n  }\n  accelerator_exec_micros: 178\n  "
       "cpu_exec_micros: 419\n  total_accelerator_exec_micros: 178\n  "
       "total_cpu_exec_micros: 419\n  run_count: 1\n  total_run_count: 1\n  "
-      "total_definition_count: 1\n  peak_bytes: 4096\n  residual_bytes: 512\n  "
-      "output_bytes: 512\n  total_peak_bytes: 4096\n  total_residual_bytes: "
+      "total_definition_count: 1\n  peak_bytes: 8704\n  residual_bytes: 512\n  "
+      "output_bytes: 512\n  total_peak_bytes: 8704\n  total_residual_bytes: "
       "512\n  total_output_bytes: 512\n}\ntotal_float_ops: "
       "4608\ntotal_accelerator_exec_micros: 178\ntotal_cpu_exec_micros: "
       "419\ntotal_run_count: 1\ntotal_definition_count: 2\ntotal_peak_bytes: "
-      "4096\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
+      "8704\ntotal_residual_bytes: 512\ntotal_output_bytes: 512\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 
@@ -265,8 +277,9 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
   GraphNodeProto expected;
   CHECK(protobuf::TextFormat::ParseFromString(
       "name: \"_TFProfRoot\"\ntotal_exec_micros: 4945\ntotal_requested_bytes: "
-      "14592\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  "
-      "exec_micros: 2\n  parameters: 162\n  total_exec_micros: 2\n  "
+      "30464\ntotal_parameters: 451\nchildren {\n  name: \"DW\"\n  "
+      "exec_micros: 2\n  requested_bytes: 1280\n  parameters: 162\n  "
+      "total_exec_micros: 2\n  total_requested_bytes: 1280\n  "
       "total_parameters: 162\n  devices: "
       "\"/job:localhost/replica:0/task:0/gpu:0\"\n  tensor_value {\n    dtype: "
       "DT_FLOAT\n    value_double: -0.000534315\n    value_double: "
@@ -351,11 +364,13 @@ TEST_F(TFProfStatsTest, TestShowTensorValue) {
       "value_double: 0.000374641\n    value_double: -0.00149603\n    "
       "value_double: -0.000317367\n    value_double: -0.000417829\n  }\n  "
       "cpu_exec_micros: 2\n  total_cpu_exec_micros: 2\n  run_count: 1\n  "
-      "total_run_count: 1\n  total_definition_count: 10\n  output_bytes: "
-      "1280\n  total_output_bytes: 1280\n}\ntotal_float_ops: "
-      "10440\ntotal_accelerator_exec_micros: 404\ntotal_cpu_exec_micros: "
-      "4541\ntotal_run_count: 6\ntotal_definition_count: 35\ntotal_peak_bytes: "
-      "9984\ntotal_residual_bytes: 1280\ntotal_output_bytes: 4864\n",
+      "total_run_count: 1\n  total_definition_count: 10\n  peak_bytes: 1280\n  "
+      "residual_bytes: 1280\n  output_bytes: 1280\n  total_peak_bytes: 1280\n  "
+      "total_residual_bytes: 1280\n  total_output_bytes: "
+      "1280\n}\ntotal_float_ops: 10440\ntotal_accelerator_exec_micros: "
+      "404\ntotal_cpu_exec_micros: 4541\ntotal_run_count: "
+      "6\ntotal_definition_count: 35\ntotal_peak_bytes: "
+      "25856\ntotal_residual_bytes: 3840\ntotal_output_bytes: 4864\n",
       &expected));
   EXPECT_EQ(expected.DebugString(), root.DebugString());
 }
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline.cc b/tensorflow/core/profiler/internal/tfprof_timeline.cc
index bdb000747db72900d748c22140ca38e571db6691..b0dd8ce5e0f046325a309060b19467b7c1494568 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline.cc
@@ -153,10 +153,8 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
 
   std::map<int64, int64> allocs;
   for (const auto& alloc : node->node->allocations(step)) {
-    for (const auto& r : alloc.allocation_records()) {
-      allocs[r.alloc_micros()] += r.alloc_bytes();
-      dev.tracked_allocations[r.alloc_micros()] += r.alloc_bytes();
-    }
+    allocs[alloc.alloc_micros()] += alloc.alloc_bytes();
+    dev.tracked_allocations[alloc.alloc_micros()] += alloc.alloc_bytes();
   }
   dev.tracked_allocations[0] += node->node->accelerator_persistent_bytes();
   allocs[0] += node->node->accelerator_persistent_bytes();
@@ -167,9 +165,9 @@ void MemoryTracker::TrackNode(int64 step, const GraphNode* node) {
     last += it->second;
     aggregate_allocs[it->first] = last;
   }
-  int64 end_micros = node->node->lastest_schedule_end_micros(step);
-  if (end_micros > 0 && node->node->allocator_bytes_in_use(step) > 0) {
-    dev.allocations[end_micros] = node->node->allocator_bytes_in_use(step);
+  for (const auto& bytes_in_use : node->node->allocator_bytes_in_use(step)) {
+    if (bytes_in_use.first <= 0) continue;
+    dev.allocations[bytes_in_use.first] = bytes_in_use.second;
   }
 }
 
@@ -265,6 +263,10 @@ void Timeline::GenerateGraphTimeline(const std::vector<GraphNode*>& gnodes) {
     }
   }
   for (const auto& dev : mem_tracker_.devices()) {
+    if (IsPlacedOnCPU(dev.first)) {
+      // TODO(xpan): Maybe also support CPU allocator memory tracking.
+      continue;
+    }
     int64 pid = AllocatePID();
     chrome_formatter_.EmitPID(GetMemoryLaneName(dev.first), pid);
     int64 pid2 = AllocatePID();
diff --git a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
index 91eac0cf7617eba54f6938fb893192d2a8fe2eaf..6a7ab01029a4dd1bc26f81b1d3e739812130fcd1 100644
--- a/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
+++ b/tensorflow/core/profiler/internal/tfprof_timeline_test.cc
@@ -71,7 +71,7 @@ TEST_F(TFProfTimelineTest, GraphView) {
 
   string dump_str;
   TF_CHECK_OK(ReadFileToString(Env::Default(), dump_file + "_0", &dump_str));
-  EXPECT_EQ(7932146665024565912ull, Hash64(dump_str));
+  EXPECT_EQ(16556121177519539380ull, Hash64(dump_str));
 }
 
 TEST_F(TFProfTimelineTest, ScopeView) {
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index a5e513aa21c56e605681aaf7e5d46815a820cec7..b280242df18272b63c7b6a683e70db6c2e315c4d 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -266,7 +266,18 @@ int Run(int argc, char** argv) {
   linenoiseSetCompletionCallback(completion);
   linenoiseHistoryLoad(".tfprof_history.txt");
 
-  for (char* line = nullptr; (line = linenoise("tfprof> ")) != nullptr;) {
+  bool looped = false;
+  while (true) {
+    char* line = linenoise("tfprof> ");
+    if (line == nullptr) {
+      if (!looped) {
+        fprintf(stderr,
+                "Cannot start interative shell, "
+                "use 'bazel-bin' instead of 'bazel run'.\n");
+      }
+      break;
+    }
+    looped = true;
     string line_s = line;
     free(line);
 
diff --git a/tensorflow/core/profiler/tfprof_log.proto b/tensorflow/core/profiler/tfprof_log.proto
index f92301133a3102a2e4233326dd811169e1ecd105..0bf1b477ed855e6ff877faa780d25a08e85ea1e5 100644
--- a/tensorflow/core/profiler/tfprof_log.proto
+++ b/tensorflow/core/profiler/tfprof_log.proto
@@ -90,10 +90,6 @@ message ProfileNode {
   map<int64, ExecProfile> execs = 12;
 }
 
-message Allocation {
-  repeated AllocationRecord allocation_records = 1;
-}
-
 message ExecProfile {
   // Can be larger than 1 if run multiple times in loop.
   int64 run_count = 1;
@@ -110,34 +106,42 @@ message ExecProfile {
   // For cpu, vector size can be larger than 1 if in tf.while_loop.
   map<string, ExecTime> cpu_execs = 5;
 
-  map<int32, Memory> output_memory = 17;
+  // Each entry to memory information of a scheduling of the node.
+  // Normally, there will be multiple entries in while_loop.
+  repeated ExecMemory memory_execs = 7;
+  // The allocation and deallocation times and sizes throughout execution.
+  repeated AllocationRecord allocations = 11;
+  // The devices related to this execution.
+  repeated string devices = 6;
+}
 
-  repeated Allocation allocations = 18;
+message ExecTime {
+  repeated Tuple times = 1;
+}
 
-  repeated string devices = 6;
+message ExecMemory {
+  // This is the timestamp when the memory information was tracked.
+  int64 memory_micros = 1;
+  // NOTE: Please don't depend on the following 4 fields yet. Due to
+  // TensorFlow internal tracing issues, the numbers can be quite wrong.
+  // TODO(xpan): Fix the TensorFlow internal tracing.
+  int64 host_temp_bytes = 2;
+  int64 host_persistent_bytes = 3;
+  int64 accelerator_temp_bytes = 4;
+  int64 accelerator_persistent_bytes = 5;
 
   // Total bytes requested by the op.
-  int64 requested_bytes = 7;
+  int64 requested_bytes = 6;
   // Total bytes requested by the op and released before op end.
-  int64 peak_bytes = 8;
+  int64 peak_bytes = 7;
   // Total bytes requested by the op and not released after op end.
-  int64 residual_bytes = 9;
+  int64 residual_bytes = 8;
   // Total bytes output by the op (not necessarily requested by the op).
-  int64 output_bytes = 10;
-  // Total temporary bytes allocated and released by the op.
-  int64 host_temp_bytes = 11;
-  // Total persistent bytes (e.g. variable) allocated by the op.
-  int64 host_persistent_bytes = 12;
-  int64 accelerator_temp_bytes = 13;
-  int64 accelerator_persistent_bytes = 14;
+  int64 output_bytes = 9;
   // The total number of bytes currently allocated by the allocator if >0.
-  int64 allocator_bytes_in_use = 15;
-
-  bool memory_intialized = 16;
-}
-
-message ExecTime {
-  repeated Tuple times = 1;
+  int64 allocator_bytes_in_use = 10;
+  // The memory of each output of the operation.
+  map<int32, Memory> output_memory = 11;
 }
 
 message Tuple {
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 145311b59d9c9455bfe78fe83a005231e306c62e..1916316245063bd6e8903573a961295f3b79bcf6 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -303,7 +303,11 @@ message ConfigProto {
   // Optional list of all workers to use in this session.
   ClusterDef cluster_def = 14;
 
-  // Next: 15
+  // If true, any resources such as Variables used in the session will not be
+  // shared with other sessions.
+  bool isolate_session_state = 15;
+
+  // Next: 16
 };
 
 // Options for a single Run() call.
@@ -331,6 +335,13 @@ message RunOptions {
   // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
   DebugOptions debug_options = 6;
 
+  // When enabled, causes tensor alllocation information to be included in
+  // the error message when the Run() call fails because the allocator ran
+  // out of memory (OOM).
+  //
+  // Enabling this option can slow down the Run() call.
+  bool report_tensor_allocations_upon_oom = 7;
+
   reserved 4;
 }
 
diff --git a/tensorflow/core/protobuf/debug.proto b/tensorflow/core/protobuf/debug.proto
index 136c627e25f33cb9b4ff2de7725406c0f800a5b1..56983f3b7d464f88cebe608ac15882f04f27b003 100644
--- a/tensorflow/core/protobuf/debug.proto
+++ b/tensorflow/core/protobuf/debug.proto
@@ -60,3 +60,25 @@ message DebugOptions {
   // step count.
   int64 global_step = 10;
 }
+
+message DebuggedSourceFile {
+  // The host name on which a source code file is located.
+  string host = 1;
+
+  // Path to the source code file.
+  string file_path = 2;
+
+  // The timestamp at which the source code file is last modified.
+  int64 last_modified = 3;
+
+  // Byte size of the file.
+  int64 bytes = 4;
+
+  // Line-by-line content of the source code file.
+  repeated string lines = 5;
+}
+
+message DebuggedSourceFiles {
+  // A collection of source code files.
+  repeated DebuggedSourceFile source_files = 1;
+}
diff --git a/tensorflow/core/protobuf/device_properties.proto b/tensorflow/core/protobuf/device_properties.proto
index 9b1497c710d40c4c5a989f80ae0d98ee2a2dc3a8..3bd301590034847369fb18c95b75baf5221f979f 100644
--- a/tensorflow/core/protobuf/device_properties.proto
+++ b/tensorflow/core/protobuf/device_properties.proto
@@ -49,3 +49,8 @@ message DeviceProperties {
   // Memory bandwidth in KB/s
   int64 bandwidth = 13;
 }
+
+message NamedDevice {
+  string name = 1;
+  DeviceProperties properties = 2;
+}
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index d67088311b24cc2889c1b1e6b8770a2e146b6e1a..96b55ce04ba9b791dd841cd6d2325d57aa199b8f 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -30,11 +30,13 @@ message RewriterConfig {
   }
 
   // Optimize tensor layouts
-  bool optimize_tensor_layout = 1;
+  Toggle layout_optimizer = 1;
   // Fold constants (default is ON)
   Toggle constant_folding = 3;
   // Arithmetic optimizations (default is ON)
   Toggle arithmetic_optimization = 7;
+  // Control dependency optimizations (default is ON).
+  Toggle dependency_optimization = 8;
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
@@ -46,9 +48,12 @@ message RewriterConfig {
     // Driven by manual op-level annotations.
     MANUAL = 2;
     // Driven by heuristics. The behavior of these heuristics is subject to
-    // change. Currently includes an experimental recomputation
-    // heuristic. Manual annotations are respected, but additional nodes are
+    // change. Currently includes an experimental recomputation and swapping
+    // heuristics. Manual annotations are respected, but additional nodes are
     // selected automatically.
+    SWAPPING_HEURISTICS = 4;
+    RECOMPUTATION_HEURISTICS = 5;
+    // Use any combination of swapping and recomputation heuristics.
     HEURISTICS = 3;
   }
   // Configures memory optimization passes through the meta-optimizer. Has no
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 137f9bc216dcd0edc9c967a17c65710f5619edb6..385e2dd163b8c668357ea9fabd1dee7d9a675729 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -59,11 +59,31 @@ message CreateWorkerSessionRequest {
 
   // Defines the configuration of a TensorFlow worker.
   ServerDef server_def = 2;
+
+  // If true, any resources such as Variables used in the session will not be
+  // shared with other sessions.
+  bool isolate_session_state = 3;
 }
 
 message CreateWorkerSessionResponse {
 }
 
+////////////////////////////////////////////////////////////////////////////////
+//
+// DeleteSession method request/response messages
+//
+// Deletes all worker-side state associated with the given session handle.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message DeleteWorkerSessionRequest {
+  // Sessions are identified by a given handle.
+  string session_handle = 1;
+}
+
+message DeleteWorkerSessionResponse {
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // RegisterGraph method request/response messages
@@ -169,6 +189,7 @@ message ExecutorOpts {
   bool record_costs = 1;
   bool record_timeline = 3;
   bool record_partition_graphs = 4;
+  bool report_tensor_allocations_upon_oom = 5;
 };
 
 message RunGraphRequest {
diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto
index 3de9e48b78e33758292157a5a428840362ee9f55..e1bfb04d7c53a593a6e5d547962b75af6fba4bb9 100644
--- a/tensorflow/core/protobuf/worker_service.proto
+++ b/tensorflow/core/protobuf/worker_service.proto
@@ -43,6 +43,10 @@ service WorkerService {
   rpc CreateWorkerSession(CreateWorkerSessionRequest)
       returns (CreateWorkerSessionResponse);
 
+  // See worker.proto for details.
+  rpc DeleteWorkerSession(DeleteWorkerSessionRequest)
+      returns (DeleteWorkerSessionResponse);
+
   // See worker.proto for details.
   rpc RegisterGraph(RegisterGraphRequest) returns (RegisterGraphResponse);
 
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 95ada559fddc5d6e87ca5778e7dfc2a5119c41c0..ec077c42837e517f94955956ed75430b7a3d0a30 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
@@ -90,6 +90,7 @@ limitations under the License.
 // 23. Remove NonMaxSuppression in favor of NonMaxSuppressionV2.
 // 24. Deprecate lookup ops (v1) ops in favor of v2 (30may2017)
 // 25. Deprecate stack (v1) ops in favor of v2 (2017/6/15).
+// 25. Deprecate RandomPoisson (v1) ops in favor of v2 (2017/10/25).
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
@@ -116,7 +117,7 @@ extern const char* tf_compiler_version();
 // The git commit designator when tensorflow was built
 // If no git repository, this will be "internal".
 extern const char* tf_git_version();
-// Value of the _GLIBCXX_USE_CXX11_ABI flag, or -1 if it's not set.
+// Value of the _GLIBCXX_USE_CXX11_ABI flag, or 0 if it's not set.
 extern const int tf_cxx11_abi_flag();
 
 #endif  // TENSORFLOW_CORE_PUBLIC_VERSION_H_
diff --git a/tensorflow/core/user_ops/fact.cc b/tensorflow/core/user_ops/fact.cc
index c512275506436d54829b355dbbd9711115d364b3..800008e0b884bee3bcd94c1d90be3d7b2a636615 100644
--- a/tensorflow/core/user_ops/fact.cc
+++ b/tensorflow/core/user_ops/fact.cc
@@ -18,27 +18,27 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
-using namespace tensorflow;
-
 REGISTER_OP("Fact")
     .Output("fact: string")
     .Doc(R"doc(
 Output a fact about factorials.
 )doc");
 
-class FactOp : public OpKernel {
+class FactOp : public tensorflow::OpKernel {
  public:
-  explicit FactOp(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit FactOp(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {}
 
-  void Compute(OpKernelContext* context) override {
+  void Compute(tensorflow::OpKernelContext* context) override {
     // Output a scalar string.
-    Tensor* output_tensor = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, TensorShape(), &output_tensor));
+    tensorflow::Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0, tensorflow::TensorShape(), &output_tensor));
+    using tensorflow::string;
     auto output = output_tensor->template scalar<string>();
 
     output() = "0! == 1";
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("Fact").Device(DEVICE_CPU), FactOp);
+REGISTER_KERNEL_BUILDER(Name("Fact").Device(tensorflow::DEVICE_CPU), FactOp);
diff --git a/tensorflow/core/util/bcast.cc b/tensorflow/core/util/bcast.cc
index 47e6ddb3d82daac7983341f49a9616fdc0888694..1eab7e3d024c181f260500686b9127dd76dbe206 100644
--- a/tensorflow/core/util/bcast.cc
+++ b/tensorflow/core/util/bcast.cc
@@ -68,9 +68,7 @@ BCast::BCast(const Vec& sx, const Vec& sy, const bool fewer_dims_optimization) {
       // Output shape.
       State curr = UNKNOWN;
       const int64 x_i = x[i];  // i-th dimension of x.
-      CHECK_GE(x_i, 0);
       const int64 y_i = y[i];  // i-th dimension of y.
-      CHECK_GE(y_i, 0);
       int64 o_i;   // i-th dimension of the output.
       int64 bx_i;  // i-th broadcast for x.
       int64 by_i;  // i-th broadcast for y.
diff --git a/tensorflow/core/util/cuda_kernel_helper.h b/tensorflow/core/util/cuda_kernel_helper.h
index 8315f208e735ec1e879528bef9c8d53419a0303d..cf11f419a4effd868fa9c933240acb9a05bfa355 100644
--- a/tensorflow/core/util/cuda_kernel_helper.h
+++ b/tensorflow/core/util/cuda_kernel_helper.h
@@ -374,6 +374,16 @@ __device__ __host__ inline Eigen::half ldg(const Eigen::half* address) {
 #endif
 }
 
+template <>
+__device__ __host__ inline bool ldg(const bool* address) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return *reinterpret_cast<const bool*>(
+      __ldg(reinterpret_cast<const char*>(address)));
+#else
+  return *address;
+#endif
+}
+
 // CUDA provides atomic ops, but not for all types.  We provide wrappers
 // for some ops and provide implementation for all reasonable types.
 #define CUDA_ATOMIC_WRAPPER(op, T) \
@@ -742,6 +752,12 @@ __device__ EIGEN_ALWAYS_INLINE T CudaShuffleDown(unsigned mask, T value,
   return __shfl_down_sync(mask, value, delta, width);
 }
 
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleDown(
+    unsigned mask, Eigen::half value, int delta, int width = warpSize) {
+  return Eigen::half(
+      __shfl_down_sync(mask, static_cast<uint16>(value), delta, width));
+}
+
 // Variant of the (undocumented) version from the CUDA SDK, but using unsigned
 // instead of float for lo and hi (which is incorrect with ftz, for example).
 // A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
@@ -764,6 +780,12 @@ __device__ EIGEN_ALWAYS_INLINE T CudaShuffleXor(unsigned mask, T value,
   return __shfl_xor_sync(mask, value, laneMask, width);
 }
 
+__device__ EIGEN_ALWAYS_INLINE Eigen::half CudaShuffleXor(
+    unsigned mask, Eigen::half value, int laneMask, int width = warpSize) {
+  return Eigen::half(
+      __shfl_xor_sync(mask, static_cast<uint16>(value), laneMask, width));
+}
+
 // Variant of the (undocumented) version from the CUDA SDK, but using unsigned
 // instead of float for lo and hi (which is incorrect with ftz, for example).
 // A bug has been filed with NVIDIA and will be fixed in the next CUDA release.
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index 2d797c855a5dee1a99178046e96902b172def23e..90c3fed2e82715c9824a0ca7411bb1ed233fe06c 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -116,7 +116,6 @@ bool DeviceNameUtils::ParseFullName(StringPiece fullname, ParsedName* p) {
   if (fullname == "/") {
     return true;
   }
-  StringPiece tmp;
   while (!fullname.empty()) {
     bool progress = false;
     if (str_util::ConsumePrefix(&fullname, "/job:")) {
diff --git a/tensorflow/core/util/env_var.cc b/tensorflow/core/util/env_var.cc
index d4e89b966efd28e5be3d271b18e50935304067ad..c844850179235ca5ad1d43f853abb774c8402867 100644
--- a/tensorflow/core/util/env_var.cc
+++ b/tensorflow/core/util/env_var.cc
@@ -60,4 +60,15 @@ Status ReadInt64FromEnvVar(StringPiece env_var_name, int64 default_val,
       tf_env_var_val, ". Use the default value: ", default_val));
 }
 
+Status ReadStringFromEnvVar(StringPiece env_var_name, StringPiece default_val,
+                            string* value) {
+  const char* tf_env_var_val = getenv(env_var_name.ToString().c_str());
+  if (tf_env_var_val != nullptr) {
+    *value = tf_env_var_val;
+  } else {
+    *value = default_val.ToString();
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/env_var.h b/tensorflow/core/util/env_var.h
index ec661f1d81bfc1d1f09c19b3d8ea87ff0cb94b22..47f9ff3a3bd421202f0f27b3a1180eebdef9a954 100644
--- a/tensorflow/core/util/env_var.h
+++ b/tensorflow/core/util/env_var.h
@@ -21,20 +21,25 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Return a boolean into "value" from the environmental variable "env_var_name".
-// If it is unset, the default value is used.
-// A string "0" or a case insensitive "false" is interpreted as false.
-// A string "1" or a case insensitive "true" is interpreted as true.
-// Otherwise, an error status is returned.
+// Returns a boolean into "value" from the environmental variable
+// "env_var_name". If it is unset, the default value is used. A string "0" or a
+// case insensitive "false" is interpreted as false. A string "1" or a case
+// insensitive "true" is interpreted as true. Otherwise, an error status is
+// returned.
 Status ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val,
                           bool* value);
 
-// Return an int64 into "value" from the environmental variable "env_var_name".
+// Returns an int64 into "value" from the environmental variable "env_var_name".
 // If it is unset, the default value is used.
 // If the string cannot be parsed into int64, an error status is returned.
 Status ReadInt64FromEnvVar(StringPiece env_var_name, int64 default_val,
                            int64* value);
 
+// Returns a string into "value" from the environmental variable "env_var_name".
+// If it is unset, the default value is used.
+Status ReadStringFromEnvVar(StringPiece env_var_name, StringPiece default_val,
+                            string* value);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_UTIL_ENV_VAR_H_
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index e077e94cf879ce69596b302a74b78705deb48e10..a0f43d2d4a745722d2095b6817c9156415c78127 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -58,12 +58,13 @@ class RandomAccessFileFromMemmapped : public RandomAccessFile {
   Status Read(uint64 offset, size_t to_read, StringPiece* result,
               char* scratch) const override {
     if (offset >= length_) {
-      result->set(scratch, 0);
+      *result = StringPiece(scratch, 0);
       return Status(error::OUT_OF_RANGE, "Read after file end");
     }
     const uint64 region_left =
         std::min(length_ - offset, static_cast<uint64>(to_read));
-    result->set(reinterpret_cast<const uint8*>(data_) + offset, region_left);
+    *result =
+        StringPiece(reinterpret_cast<const char*>(data_) + offset, region_left);
     return (region_left == to_read)
                ? Status::OK()
                : Status(error::OUT_OF_RANGE, "Read less bytes than requested");
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 1bfa4f83a30efacfa4725e6b1eaa7bdf069e1ab7..2caf5fc56dafb5a8879db8026a78bc7bf46346a4 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -24,10 +24,9 @@ limitations under the License.
 #include "mkl_dnn_types.h"
 #include "mkl_service.h"
 #include "mkl_trans.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-
-#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/graph/mkl_graph_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -38,6 +37,12 @@ limitations under the License.
 
 #ifdef INTEL_MKL_DNN
 #include "mkldnn.hpp"
+
+using mkldnn::engine;
+using mkldnn::memory;
+using mkldnn::padding_kind;
+using mkldnn::primitive;
+using mkldnn::reorder;
 #endif
 
 // The file contains a number of utility classes and functions used by MKL
@@ -51,6 +56,14 @@ namespace tensorflow {
 // Tensorflow tensor.
 
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
+typedef enum {
+  Dim_N = 0,
+  Dim_C = 1,
+  Dim_H = 2,
+  Dim_W = 3,
+  Dim_O = 0,
+  Dim_I = 1
+} MklDnnDims;
 
 class MklShape {
  public:
@@ -143,7 +156,9 @@ class MklShape {
   size_t GetDimension() const { return dimension_; }
   const size_t* GetSizes() const { return sizes_; }
   int64 dim_size(int index) const { return sizes_[index]; }
-  int64 tf_dim_size(int index) const { return sizes_[tf_to_mkl_dim_map_[index]]; }
+  int64 tf_dim_size(int index) const {
+    return sizes_[tf_to_mkl_dim_map_[index]];
+  }
   const size_t* GetStrides() const { return strides_; }
   const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; }
   size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; }
@@ -309,9 +324,345 @@ class MklShape {
       nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
+#ifdef INTEL_MKL_DNN
+
+// Forward decl
+TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
+memory::dims CalculateTFStrides(const memory::dims& dims_tf_order);
+memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+                                        const memory::dims& strides,
+                                        memory::data_type dtype);
+
+class MklDnnShape {
+ private:
+  typedef struct {
+    /// Flag to indicate if the tensor is an  MKL tensor or not
+    bool is_mkl_tensor_ = false;
+    /// Number of dimensions in Tensorflow format
+    size_t dimension_ = 0;
+    /// Required by MKLDNN for conversions
+    mkldnn_dims_t sizes_;  // Required by MKL for conversions
+    memory::format tf_data_format_ = memory::format::format_undef;
+    memory::data_type T_ = memory::data_type::data_undef;
+    // MKL layout
+    mkldnn_memory_desc_t mkl_md_;
+    /// TF dimension corresponding to this MKL dimension
+    mkldnn_dims_t map_;
+  } MklShapeData;
+  MklShapeData data_;
+
+  typedef std::remove_extent<mkldnn_dims_t>::type mkldnn_dim_t;
+#define INVALID_DIM_SIZE -1
+
+ public:
+  MklDnnShape() {
+    for (size_t i = 0; i < sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+         ++i) {
+      data_.sizes_[i] = -1;
+    }
+    for (size_t i = 0; i < sizeof(data_.map_) / sizeof(data_.map_[0]); ++i) {
+      data_.map_[i] = -1;
+    }
+  }
+
+  ~MklDnnShape() {}
+  TF_DISALLOW_COPY_AND_ASSIGN(MklDnnShape);  // Cannot copy
+
+  /// Helper function to compare memory::desc objects for MklDnn.
+  /// May be this should go into MklDnn directly.
+  inline bool CompareMklDnnLayouts(const memory::desc& md1,
+                                   const memory::desc& md2) const {
+    mkldnn_memory_desc_t mdd1 = md1.data;
+    mkldnn_memory_desc_t mdd2 = md2.data;
+    const char* d1 = reinterpret_cast<const char*>(&mdd1);
+    const char* d2 = reinterpret_cast<const char*>(&mdd2);
+
+    size_t md_size = sizeof(mdd1);
+    for (size_t i = 0; i < md_size; i++) {
+      if (*d1++ != *d2++) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /// Equality function for MklDnnShape objects
+  /// @return true if both are equal; false otherwise.
+  inline bool operator == (const MklDnnShape& input_shape) const {
+    if (this->IsMklTensor() != input_shape.IsMklTensor()) {
+      return false;
+    }
+
+    // If input tensors are in Mkl layout, then we check for dimensions and
+    // sizes.
+    if (this->IsMklTensor()) {
+      return this->GetTfShape() == input_shape.GetTfShape() &&
+             CompareMklDnnLayouts(this->GetMklLayout(),
+                                  input_shape.GetMklLayout());
+    }
+
+    return true;
+  }
+
+  /// Equality operator for MklDnnShape and TFShape.
+  /// Returns: true if TF shapes for both are the same, false otherwise
+  inline bool operator == (const TensorShape& input_shape) const {
+    if (!this->IsMklTensor()) {
+      return false;
+    }
+
+    return this->GetTfShape() == input_shape;
+  }
+
+  inline const bool IsMklTensor() const { return data_.is_mkl_tensor_; }
+  inline void SetMklTensor(bool is_mkl_tensor) {
+    data_.is_mkl_tensor_ = is_mkl_tensor;
+  }
+
+  inline void SetDimensions(const size_t dimension) {
+    data_.dimension_ = dimension;
+  }
+  inline size_t GetDimension(char dimension) const {
+    int index = GetMklDnnTensorDimIndex(dimension);
+    CHECK(index >= 0 && index < this->GetDimension())
+      << "Invalid index from the dimension: " << index << ", " << dimension;
+    return this->DimSize(index);
+  }
+
+  inline int32 GetMklDnnTensorDimIndex(char dimension) const {
+    switch (dimension) {
+      case 'N':
+        return MklDnnDims::Dim_N;
+      case 'C':
+        return MklDnnDims::Dim_C;
+      case 'H':
+        return MklDnnDims::Dim_H;
+      case 'W':
+        return MklDnnDims::Dim_W;
+      default:
+        LOG(FATAL) << "Invalid dimension: " << dimension;
+        return -1;  // Avoid compiler warning about missing return value
+    }
+  }
+
+  inline size_t GetDimension() const { return data_.dimension_; }
+  inline const int* GetSizes() const {
+    return reinterpret_cast<const int*>(&data_.sizes_[0]);
+  }
+
+  // Returns an mkldnn::memory::dims object that contains the sizes of this
+  // MklDnnShape object.
+  inline memory::dims GetSizesAsMklDnnDims() const {
+    memory::dims retVal;
+    if (data_.is_mkl_tensor_) {
+      size_t dimensions = sizeof(data_.sizes_) / sizeof(data_.sizes_[0]);
+      for (size_t i = 0; i < dimensions; i++) {
+        if (data_.sizes_[i] != INVALID_DIM_SIZE)
+          retVal.push_back(data_.sizes_[i]);
+      }
+    } else {
+      CHECK_EQ(data_.is_mkl_tensor_, true);
+    }
+    return retVal;
+  }
+
+  inline int64 DimSize(int index) const {
+    CHECK_LT(index, sizeof(data_.sizes_) / sizeof(data_.sizes_[0]));
+    return data_.sizes_[index];
+  }
+
+  /// Return TensorShape that describes the Tensorflow shape of the tensor
+  /// represented by this MklShape.
+  inline TensorShape GetTfShape() const {
+    CHECK_EQ(data_.is_mkl_tensor_, true);
+
+    std::vector<int32> shape(data_.dimension_, -1);
+    if (data_.tf_data_format_ != memory::format::blocked) {
+      for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+        shape[idx] = data_.sizes_[TfDimIdx(idx)];
+      }
+    } else {
+      // If Tensorflow shape is in Blocked format, then we don't have dimension
+      // map for it. So we just create Tensorflow shape from sizes in the
+      // specified order.
+      for (size_t idx = 0; idx < data_.dimension_; ++idx) {
+        shape[idx] = data_.sizes_[idx];
+      }
+    }
+
+    TensorShape ts;
+    bool ret = TensorShapeUtils::MakeShape(shape, &ts).ok();
+    CHECK_EQ(ret, true);
+    return ts;
+  }
+
+  inline void SetElemType(memory::data_type dt) { data_.T_ = dt; }
+  inline const memory::data_type GetElemType() { return data_.T_; }
+
+  inline void SetMklLayout(memory::primitive_desc* pd) {
+    CHECK_NOTNULL(pd);
+    data_.mkl_md_ = pd->desc().data;
+  }
+
+  inline void SetMklLayout(memory::desc* md) {
+    CHECK_NOTNULL(md);
+    data_.mkl_md_ = md->data;
+  }
+
+  inline const memory::desc GetMklLayout() const {
+    return memory::desc(data_.mkl_md_);
+  }
+
+  inline memory::format GetTfDataFormat() const {
+    return data_.tf_data_format_;
+  }
+  /// We don't create primitive_descriptor for TensorFlow layout now.
+  /// We use lazy evaluation and create it only when needed. Input format can
+  /// also be Blocked format.
+  inline void SetTfLayout(size_t dims, const memory::dims& sizes,
+                          memory::format format) {
+    CHECK_EQ(dims, sizes.size());
+    data_.dimension_ = dims;
+    for (size_t ii = 0; ii < dims; ii++) {
+      data_.sizes_[ii] = sizes[ii];
+    }
+    data_.tf_data_format_ = format;
+    if (format != memory::format::blocked) {
+      SetTfDimOrder(dims, format);
+    }
+  }
+
+  inline const memory::desc GetTfLayout() const {
+    memory::dims dims;
+    for (size_t ii = 0; ii < data_.dimension_; ii++) {
+      dims.push_back(data_.sizes_[ii]);
+    }
+
+    // Create Blocked memory desc if input TF format was set like that.
+    if (data_.tf_data_format_ == memory::format::blocked) {
+      auto strides = CalculateTFStrides(dims);
+      return CreateBlockedMemDescHelper(dims, strides, data_.T_);
+    } else {
+      return memory::desc(dims, data_.T_, data_.tf_data_format_);
+    }
+  }
+
+  inline const memory::desc GetCurLayout() const {
+    return IsMklTensor() ? GetMklLayout() : GetTfLayout();
+  }
+
+  // nhasabni - I've removed SetTfDimOrder that was setting default order in
+  // case of MKL-ML. We don't need a case of default dimension order because
+  // when an operator that does not get data_format attribute gets all inputs
+  // in Tensorflow format, it will produce output in Tensorflow format.
+  inline void SetTfDimOrder(const size_t dimension, const mkldnn_dims_t map) {
+    CHECK(dimension == data_.dimension_);
+    for (size_t ii = 0; ii < dimension; ii++) {
+      data_.map_[ii] = map[ii];
+    }
+  }
+
+  inline void SetTfDimOrder(const size_t dimension, TensorFormat data_format) {
+    // TODO(nhasabni): Why do we restrict this to 4D?
+    CHECK_EQ(dimension, 4);
+    CHECK(dimension == data_.dimension_);
+    data_.map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDnnDims::Dim_W;
+    data_.map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDnnDims::Dim_H;
+    data_.map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDnnDims::Dim_C;
+    data_.map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDnnDims::Dim_N;
+  }
+
+  inline void SetTfDimOrder(const size_t dimension, memory::format format) {
+    TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format);
+    SetTfDimOrder(dimension, data_format);
+  }
+
+  inline const mkldnn_dim_t* GetTfToMklDimMap() const { return &data_.map_[0]; }
+  inline size_t TfDimIdx(int index) const { return data_.map_[index]; }
+  inline int64 TfDimSize(int index) const {
+    return data_.sizes_[TfDimIdx(index)];
+  }
+
+  /// Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  /// corresponds to MKL's Channel dimension.
+  inline bool IsMklChannelDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_C;
+  }
+  /// Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  /// corresponds to MKL's Batch dimension.
+  inline bool IsMklBatchDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_N;
+  }
+  /// Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  /// corresponds to MKL's Width dimension.
+  inline bool IsMklWidthDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_W;
+  }
+  /// Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd'
+  /// corresponds to MKL's Height dimension.
+  inline bool IsMklHeightDim(int d) const {
+    return TfDimIdx(d) == MklDnnDims::Dim_H;
+  }
+
+  /// Check if the TF-Mkl dimension ordering map specifies if the input
+  /// tensor is in NCHW format.
+  inline bool IsTensorInNCHWFormat() const {
+    TensorFormat data_format = FORMAT_NCHW;
+    return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) &&
+            IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) &&
+            IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) &&
+            IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W')));
+  }
+
+  /// Check if the TF-Mkl dimension ordering map specifies if the input
+  /// tensor is in NHWC format.
+  inline bool IsTensorInNHWCFormat() const {
+    TensorFormat data_format = FORMAT_NHWC;
+    return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) &&
+            IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) &&
+            IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) &&
+            IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W')));
+  }
+
+  /// The following methods are used for serializing and de-serializing the
+  /// contents of the mklshape object.
+  /// The data is serialized in this order
+  /// is_mkl_tensor_ : dimension_ : sizes_ : map_: format_ : T_ : mkl_pd_;
+
+  /// Size of buffer to hold the serialized object, the size is computed by
+  /// following above mentioned order
+  inline size_t GetSerializeBufferSize() const { return sizeof(MklShapeData); }
+
+  void SerializeMklDnnShape(unsigned char* buf, size_t buf_size) const {
+    CHECK(buf_size >= GetSerializeBufferSize())
+        << "Buffer size is too small to SerializeMklDnnShape";
+    *reinterpret_cast<MklShapeData*>(buf) = data_;
+  }
+
+  void DeSerializeMklDnnShape(const unsigned char* buf, size_t buf_size) {
+    // Make sure buffer holds at least is_mkl_tensor_.
+    CHECK(buf_size >= sizeof(data_.is_mkl_tensor_))
+        << "Buffer size is too small in DeSerializeMklDnnShape";
+
+    const bool is_mkl_tensor = *reinterpret_cast<const bool*>(buf);
+    if (is_mkl_tensor) {  // If it is an MKL Tensor then read the rest
+      CHECK(buf_size >= GetSerializeBufferSize())
+          << "Buffer size is too small in DeSerializeMklDnnShape";
+      data_ = *reinterpret_cast<const MklShapeData*>(buf);
+    }
+  }
+};
+
+#endif
+
 // List of MklShape objects. Used in Concat/Split layers.
+
 typedef std::vector<MklShape> MklShapeList;
 
+#ifdef INTEL_MKL_DNN
+typedef std::vector<MklDnnShape> MklDnnShapeList;
+#endif
+
 // Check if all tensors specified by MklShapes are MKL tensors.
 inline bool AreAllMklTensors(const MklShapeList& shapes) {
   for (auto& s : shapes) {
@@ -322,6 +673,7 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
+#ifndef INTEL_MKL_DNN
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -346,6 +698,19 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 
   return output_tensor;
 }
+#else
+template <typename T>
+inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
+                             const MklDnnShape& mkl_shape) {
+  Tensor output_tensor;
+  TensorShape output_shape;
+
+  TF_CHECK_OK(Status(error::Code::UNIMPLEMENTED,
+                     "Unimplemented conversion function"));
+
+  return output_tensor;
+}
+#endif
 
 // Get the MKL shape from the second string tensor
 inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
@@ -359,6 +724,19 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
           sizeof(uint8));
 }
 
+#ifdef INTEL_MKL_DNN
+inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
+  mklshape->DeSerializeMklDnnShape(
+      ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
+          .flat<uint8>()
+          .data(),
+      ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
+              .flat<uint8>()
+              .size() *
+          sizeof(uint8));
+}
+#endif
+
 // Gets the actual input
 inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) {
   return ctext->input(GetTensorDataIndex(n, ctext->num_inputs()));
@@ -370,6 +748,9 @@ inline void GetMklInputList(OpKernelContext* ctext, StringPiece name,
   ctext->input_list(name, input_tensors);
 }
 
+
+#ifndef INTEL_MKL_DNN
+
 inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
                             MklShapeList* mkl_shapes) {
   OpInputList input_mkl_tensors;
@@ -382,6 +763,42 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
   }
 }
 
+#else
+
+inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name,
+                            MklDnnShapeList* mkl_shapes) {
+  OpInputList input_mkl_tensors;
+  GetMklInputList(ctext, strings::StrCat("mkl_", name), &input_mkl_tensors);
+
+  for (int i = 0; i < input_mkl_tensors.size(); i++) {
+    (*mkl_shapes)[i].DeSerializeMklDnnShape(
+        input_mkl_tensors[i].flat<uint8>().data(),
+        input_mkl_tensors[i].flat<uint8>().size() * sizeof(uint8));
+  }
+}
+
+#endif
+
+#ifdef INTEL_MKL_DNN
+/// Get shape of input tensor pointed by 'input_idx' in TensorShape format.
+/// If the input tensor is in MKL layout, then obtains TensorShape from
+/// MklShape.
+inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx) {
+  // Sanity check.
+  CHECK_NOTNULL(context);
+  CHECK_LT(input_idx, context->num_inputs());
+
+  MklDnnShape input_mkl_shape;
+  GetMklShape(context, input_idx, &input_mkl_shape);
+  if (input_mkl_shape.IsMklTensor()) {
+    return input_mkl_shape.GetTfShape();
+  } else {
+    const Tensor& t = MklGetInput(context, input_idx);
+    return t.shape();
+  }
+}
+#endif
+
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -397,6 +814,23 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
+#ifdef INTEL_MKL_DNN
+// Allocate the second output tensor that will contain
+// the MKL shape serialized
+inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
+                                      const MklDnnShape& mkl_shape) {
+  Tensor* second_tensor = nullptr;
+  TensorShape second_shape;
+  second_shape.AddDim(mkl_shape.GetSerializeBufferSize());
+  OP_REQUIRES_OK(ctext, ctext->allocate_output(
+                            GetTensorMetaDataIndex(n, ctext->num_outputs()),
+                            second_shape, &second_tensor));
+  mkl_shape.SerializeMklDnnShape(
+      second_tensor->flat<uint8>().data(),
+      second_tensor->flat<uint8>().size() * sizeof(uint8));
+}
+#endif
+
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -417,9 +851,43 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
+#ifdef INTEL_MKL_DNN
+// Allocate the output tensor, create a second output tensor that will contain
+// the MKL shape serialized
+inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
+                                      Tensor** output,
+                                      const TensorShape& tf_shape,
+                                      const MklDnnShape& mkl_shape) {
+  Tensor* second_tensor = nullptr;
+  TensorShape second_shape;
+  second_shape.AddDim(mkl_shape.GetSerializeBufferSize());
+  OP_REQUIRES_OK(
+      ctext, ctext->allocate_output(GetTensorDataIndex(n, ctext->num_outputs()),
+                                    tf_shape, output));
+  OP_REQUIRES_OK(ctext, ctext->allocate_output(
+                            GetTensorMetaDataIndex(n, ctext->num_outputs()),
+                            second_shape, &second_tensor));
+  mkl_shape.SerializeMklDnnShape(
+      second_tensor->flat<uint8>().data(),
+      second_tensor->flat<uint8>().size() * sizeof(uint8));
+}
+#endif
+
 // Allocates a temp tensor and returns the data buffer for temporary storage.
 // Currently
-// we only support F32, will need to templatize if other types are added
+#ifdef INTEL_MKL_DNN
+template <typename T>
+inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
+                           const memory::primitive_desc& pd, void** buf_out) {
+  TensorShape tf_shape;
+
+  tf_shape.AddDim(pd.get_size() / sizeof(T) + 1);
+  OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
+                                                 tf_shape, tensor_out));
+  *buf_out = static_cast<void*>(tensor_out->flat<T>().data());
+}
+#endif
+
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            dnnLayout_t lt_buff, void** buf_out) {
   TensorShape tf_shape;
@@ -526,6 +994,7 @@ inline void CopyMklTensorInToOut(OpKernelContext* context,
   context->set_output(idx_meta_out, meta_output);
 }
 
+#ifndef INTEL_MKL_DNN
 inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
                                          int idx_in, int idx_out,
                                          const TensorShape& shape) {
@@ -543,6 +1012,27 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
   CHECK(output.CopyFrom(data, shape));
   context->set_output(idx_data_out, output);
 }
+#else
+inline void CopyTfTensorInToOutWithShape(OpKernelContext* context,
+                                         int idx_in, int idx_out,
+                                         const TensorShape& shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  const Tensor& data = context->input(idx_data_in);
+  MklDnnShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape_output);
+  Tensor output(data.dtype());
+  // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...)
+  CHECK(output.CopyFrom(data, shape));
+  context->set_output(idx_data_out, output);
+}
+#endif
+
+#ifndef INTEL_MKL_DNN
 
 inline void ForwardTfTensorInToOut(OpKernelContext* context,
                                   int idx_in, int idx_out) {
@@ -561,6 +1051,27 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context,
   }
 }
 
+#else
+
+inline void ForwardTfTensorInToOut(OpKernelContext* context,
+                                  int idx_in, int idx_out) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  MklDnnShape dnn_shape_output;
+  dnn_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_out, dnn_shape_output);
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+
+#endif
+
 inline void ForwardMklTensorInToOut(OpKernelContext* context,
                                    int idx_in, int idx_out) {
   int num_inputs = context->num_inputs();
@@ -579,6 +1090,25 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context,
   }
 }
 
+#ifdef INTEL_MKL_DNN
+inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
+                                             int idx_in, int idx_out,
+                                             const MklDnnShape& mkl_shape) {
+  int num_inputs = context->num_inputs();
+  int num_outputs = context->num_outputs();
+  int idx_data_in = GetTensorDataIndex(idx_in, num_inputs);
+  int idx_data_out = GetTensorDataIndex(idx_out, num_outputs);
+
+  AllocateOutputSetMklShape(context, idx_out, mkl_shape);
+
+  if (IsRefType(context->input_dtype(idx_data_in))) {
+    context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out);
+  } else {
+    context->set_output(idx_data_out, context->input(idx_data_in));
+  }
+}
+#endif
+
 // Forward the MKL shape ONLY (used in elementwise and other ops where
 // we call the eigen implementation and MKL shape is not used)
 inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
@@ -602,6 +1132,10 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
 
+#ifndef INTEL_MKL_DNN
+// We don't need these functions in MKLDNN. We have defined equality operator
+// on MklDnnShape class directly.
+
 // Checks if the TF shape for both MKL tensors is the same or not
 // Returns: true if both TF shapes are the same, false otherwise
 inline bool MklCompareShapes(const MklShape* input_shape_0,
@@ -668,7 +1202,10 @@ inline bool MklCompareShapes(const TensorShape* input_shape_0,
 
   return true;
 }
+#endif
 
+// These functions do not compile with MKL-DNN since mkl.h is missing.
+// We may need to remove them later.
 // TODO(intel_tf): Remove this routine when faster MKL layout conversion is
 // out.
 inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) {
@@ -707,12 +1244,6 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
 
 #ifdef INTEL_MKL_DNN
 
-using mkldnn::engine;
-using mkldnn::memory;
-using mkldnn::padding_kind;
-using mkldnn::primitive;
-using mkldnn::reorder;
-
 /// Return MKL-DNN data type (memory::data_type) for input type T
 ///
 /// @input None
@@ -742,6 +1273,22 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
   return memory::format::format_undef;
 }
 
+/// Map MKL-DNN data format to TensorFlow's data format
+///
+/// @input: memory::format
+/// @return: Tensorflow data format corresponding to memory::format
+///          Fails with an error if invalid data format.
+inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) {
+  if (format == memory::format::nhwc) return FORMAT_NHWC;
+  else if (format == memory::format::nchw) return FORMAT_NCHW;
+  TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT,
+                     "Unsupported data format"));
+
+  // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure
+  // that we don't come here.
+  return FORMAT_NHWC;
+}
+
 /// Map TensorShape object into memory::dims required by MKL-DNN
 ///
 /// This function will simply map input TensorShape into MKL-DNN dims
@@ -753,7 +1300,7 @@ inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) {
 /// @return memory::dims corresponding to TensorShape
 inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) {
   memory::dims dims(shape.dims());
-  for (unsigned int d = 0; d < shape.dims(); ++d) {
+  for (int d = 0; d < shape.dims(); ++d) {
     dims[d] = shape.dim_size(d);
   }
   return dims;
@@ -783,11 +1330,102 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape,
   return memory::dims({n, c, h, w});
 }
 
+/// Overloaded version of function above. Input parameters are
+/// self-explanatory.
+inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims,
+                                     TensorFormat format) {
+  // Check validity of format.
+  CHECK_NE(TFDataFormatToMklDnnDataFormat(format),
+           memory::format::format_undef);
+
+  int n = in_dims[GetTensorDimIndex(format, 'N')];
+  int c = in_dims[GetTensorDimIndex(format, 'C')];
+  int h = in_dims[GetTensorDimIndex(format, 'H')];
+  int w = in_dims[GetTensorDimIndex(format, 'W')];
+
+  // MKL-DNN requires dimensions in NCHW format.
+  return memory::dims({n, c, h, w});
+}
+
+/// Map MklDnn memory::dims object into TensorShape object.
+///
+/// This function will simply map input shape in MKL-DNN memory::dims format
+/// in Tensorflow's TensorShape object by perserving dimension order.
+///
+/// @input MKL-DNN memory::dims object
+/// @output TensorShape corresponding to memory::dims
+inline TensorShape MklDnnDimsToTFShape(const memory::dims& dims) {
+  std::vector<int32> shape(dims.size(), -1);
+  for (int d = 0; d < dims.size(); d++) {
+    shape[d] = dims[d];
+  }
+
+  TensorShape ret;
+  CHECK_EQ(TensorShapeUtils::MakeShape(shape, &ret).ok(), true);
+  return ret;
+}
+
+/// Function to calculate strides given tensor shape in Tensorflow order
+/// E.g., if dims_tf_order is {1, 2, 3, 4}, then as per Tensorflow convention,
+/// dimesion with size 1 is outermost dimension; while dimension with size 4 is
+/// innermost dimension. So strides for this tensor would be {4 * 3 * 2,
+/// 4 * 3, 4, 1}, i.e., {24, 12, 4, 1}.
+///
+/// @input Tensorflow shape in memory::dims type
+/// @return memory::dims containing strides for the tensor.
+inline memory::dims CalculateTFStrides(const memory::dims& dims_tf_order) {
+  CHECK_GT(dims_tf_order.size(), 0);
+  memory::dims strides(dims_tf_order.size());
+  int last_dim_idx = dims_tf_order.size() - 1;
+  strides[last_dim_idx] = 1;
+  for (int d = last_dim_idx - 1; d >= 0; d--) {
+    strides[d] = strides[d + 1] * dims_tf_order[d + 1];
+  }
+  return strides;
+}
+
 inline padding_kind TFPaddingToMklDnnPadding(Padding pad) {
   // MKL-DNN only supports zero padding.
   return padding_kind::zero;
 }
 
+/// Helper function to create memory descriptor in Blocked format
+///
+/// @input: Tensor dimensions
+/// @input: strides corresponding to dimensions. One can use utility
+///         function such as CalculateTFStrides to compute strides
+///         for given dimensions.
+/// @return: memory::desc object corresponding to blocked memory format
+///          for given dimensions and strides.
+inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim,
+                                               const memory::dims& strides,
+                                               memory::data_type dtype) {
+  CHECK_EQ(dim.size(), strides.size());
+
+  // We have to construct memory descriptor in a C style. This is not at all
+  // ideal but MKLDNN does not offer any API to construct descriptor in
+  // blocked format except a copy constructor that accepts
+  // mkldnn_memory_desc_t.
+  mkldnn_memory_desc_t md;
+  md.primitive_kind = mkldnn_memory;
+  md.ndims = dim.size();
+  md.format = mkldnn_blocked;
+  md.data_type = memory::convert_to_c(dtype);
+
+  for (size_t i = 0; i < dim.size(); i++) {
+    md.layout_desc.blocking.block_dims[i] = 1;
+    md.layout_desc.blocking.strides[1][i] = 1;
+    md.layout_desc.blocking.strides[0][i] = strides[i];
+    md.layout_desc.blocking.padding_dims[i] = dim[i];
+    md.layout_desc.blocking.offset_padding_to_data[i] = 0;
+    md.dims[i] = dim[i];
+  }
+  md.layout_desc.blocking.offset_padding = 0;
+
+  return memory::desc(md);
+}
+
+
 /*
  * Class to represent all the resources corresponding to a tensor in TensorFlow
  * that are required to execute an operation (such as Convolution).
@@ -821,7 +1459,7 @@ class MklDnnData {
     delete (op_md_);
   }
 
-  void* GetTensorBuffer(const Tensor* tensor) {
+  inline void* GetTensorBuffer(const Tensor* tensor) const {
     CHECK_NOTNULL(tensor);
     return const_cast<void*>(
         static_cast<const void*>(tensor->flat<T>().data()));
@@ -835,35 +1473,61 @@ class MklDnnData {
   /// an operation. E.g., filter of Conv2D is of shape {1, 2, 3, 4}, and
   /// memory format HWIO, and the buffer that contains actual values is
   /// pointed by data_buffer.
-  void SetUsrMem(memory::dims dim, memory::format fm, void* data_buffer) {
-    CHECK_NOTNULL(data_buffer);
-    CHECK_NOTNULL(cpu_engine_);
-    // TODO(nhasabni): can we remove dynamic memory allocation?
-    user_memory_ =
-        new memory(memory::primitive_desc(
-                       memory::desc(dim, MklDnnType<T>(), fm), *cpu_engine_),
-                   data_buffer);
+  inline void SetUsrMem(const memory::dims& dim, memory::format fm,
+                        void* data_buffer = nullptr) {
+    auto md = memory::desc(dim, MklDnnType<T>(), fm);
+    SetUsrMem(md, data_buffer);
   }
 
-  void SetUsrMem(memory::dims dim, memory::format fm, const Tensor* tensor) {
+  inline void SetUsrMem(const memory::dims& dim, memory::format fm,
+                        const Tensor* tensor) {
     CHECK_NOTNULL(tensor);
     SetUsrMem(dim, fm, GetTensorBuffer(tensor));
   }
 
+  /// Helper function to create memory descriptor in Blocked format
+  ///
+  /// @input: Tensor dimensions
+  /// @input: strides corresponding to dimensions. One can use utility
+  ///         function such as CalculateTFStrides to compute strides
+  ///         for given dimensions.
+  /// @return: memory::desc object corresponding to blocked memory format
+  ///          for given dimensions and strides.
+  static inline memory::desc CreateBlockedMemDesc(const memory::dims& dim,
+                                                 const memory::dims& strides) {
+    return CreateBlockedMemDescHelper(dim, strides, MklDnnType<T>());
+  }
+
+  /// A version of SetUsrMem call that allows user to create memory in blocked
+  /// format. So in addition to accepting dimensions, it also accepts strides.
+  /// This allows user to create memory for tensor in a format that is not
+  /// supported by MKLDNN. E.g., MKLDNN does not support tensor format for 6
+  /// dimensional tensor as a native format. But by using blocked format, a user
+  /// can create memory for 6D tensor.
+  inline void SetUsrMem(const memory::dims& dim, const memory::dims& strides,
+                        void* data_buffer = nullptr) {
+    CHECK_EQ(dim.size(), strides.size());
+    auto blocked_md = MklDnnData<T>::CreateBlockedMemDesc(dim, strides);
+    SetUsrMem(blocked_md, data_buffer);
+  }
+
+  inline void SetUsrMem(const memory::dims& dim, const memory::dims& strides,
+                        const Tensor* tensor) {
+    CHECK_NOTNULL(tensor);
+    SetUsrMem(dim, strides, GetTensorBuffer(tensor));
+  }
+
   /// A version of function to set user memory primitive that accepts memory
   /// descriptor directly, instead of accepting dimensions and format. This
   /// function is more generic that the one above, but the function above is
   /// sufficient in most cases.
-  void SetUsrMem(memory::desc md, void* data_buffer) {
-    CHECK_NOTNULL(data_buffer);
-    CHECK_NOTNULL(cpu_engine_);
-    // TODO(nhasabni): can we remove dynamic memory allocation?
-    user_memory_ =
-        new memory(memory::primitive_desc(md, *cpu_engine_), data_buffer);
+  inline void SetUsrMem(const memory::desc& md, void* data_buffer = nullptr) {
+    auto pd = memory::primitive_desc(md, *cpu_engine_);
+    SetUsrMem(pd, data_buffer);
   }
 
   /// A version of SetUsrMem with memory descriptor and tensor
-  void SetUsrMem(memory::desc md, const Tensor* tensor) {
+  inline void SetUsrMem(const memory::desc& md, const Tensor* tensor) {
     CHECK_NOTNULL(tensor);
     SetUsrMem(md, GetTensorBuffer(tensor));
   }
@@ -872,41 +1536,61 @@ class MklDnnData {
   /// descriptor directly, instead of accepting dimensions and format. This
   /// function is more generic that the one above, but the function above is
   /// sufficient in most cases.
-  void SetUsrMem(memory::primitive_desc pd, void* data_buffer) {
-    CHECK_NOTNULL(data_buffer);
+  inline void SetUsrMem(const memory::primitive_desc& pd,
+                        void* data_buffer = nullptr) {
     CHECK_NOTNULL(cpu_engine_);
     // TODO(nhasabni): can we remove dynamic memory allocation?
-    user_memory_ = new memory(pd, data_buffer);
+    if (data_buffer) {
+      user_memory_ = new memory(pd, data_buffer);
+    } else {
+      user_memory_ = new memory(pd);
+    }
   }
 
   /// A version of SetUsrMem with primitive descriptor and tensor
-  void SetUsrMem(memory::primitive_desc pd, const Tensor* tensor) {
+  inline void SetUsrMem(const memory::primitive_desc& pd,
+                        const Tensor* tensor) {
     CHECK_NOTNULL(tensor);
     SetUsrMem(pd, GetTensorBuffer(tensor));
   }
 
   /// Get function for user memory primitive.
-  const memory* GetUsrMem() const { return user_memory_; }
+  inline const memory* GetUsrMem() const { return user_memory_; }
 
   /// Get function for primitive descriptor of user memory primitive.
-  const memory::primitive_desc GetUsrMemPrimDesc() const {
+  inline const memory::primitive_desc GetUsrMemPrimDesc() const {
     CHECK_NOTNULL(user_memory_);
     return user_memory_->get_primitive_desc();
   }
 
+
   /// Get function for descriptor of user memory.
-  memory::desc GetUsrMemDesc() {
+  inline memory::desc GetUsrMemDesc() {
     // This is ugly. Why MKL-DNN does not provide desc() method of const type??
     const memory::primitive_desc pd = GetUsrMemPrimDesc();
     return const_cast<memory::primitive_desc*>(&pd)->desc();
   }
 
   /// Get function for data buffer of user memory primitive.
-  void* GetUsrMemDataHandle() const {
+  inline void* GetUsrMemDataHandle() const {
     CHECK_NOTNULL(user_memory_);
     return user_memory_->get_data_handle();
   }
 
+  /// Set function for data buffer of user memory primitive.
+  inline void* SetUsrMemDataHandle(void* data_buffer) {
+    CHECK_NOTNULL(user_memory_);
+    CHECK_NOTNULL(data_buffer);
+    return user_memory_->set_data_handle(data_buffer);
+  }
+
+  /// Set function for data buffer of user memory primitive.
+  inline void SetUsrMemDataHandle(const Tensor* tensor) {
+    CHECK_NOTNULL(user_memory_);
+    CHECK_NOTNULL(tensor);
+    user_memory_->set_data_handle(GetTensorBuffer(tensor));
+  }
+
   /// Get the memory primitive for input and output of an op. If inputs
   /// to an op require reorders, then this function returns memory primitive
   /// for reorder. Otherwise, it will return memory primitive for user memory.
@@ -915,7 +1599,7 @@ class MklDnnData {
   /// execute Conv2D, we need memory primitive for I and F. Buf if reorder is
   /// required for I and F (say I_r is reorder primitive for I; F_r is reorder
   /// primitive for F), then we need I_r and F_r to perform Conv2D.
-  const memory& GetOpMem() const {
+  inline const memory& GetOpMem() const {
     return reorder_memory_ ? *reorder_memory_ : *user_memory_;
   }
 
@@ -923,13 +1607,43 @@ class MklDnnData {
   /// format. E.g., For Conv2D, the dimensions would be same as user dimensions
   /// but memory::format would be mkldnn::any because we want MKL-DNN to choose
   /// best layout/format for given input dimensions.
-  void SetOpMemDesc(const memory::dims& dim, memory::format fm) {
+  inline void SetOpMemDesc(const memory::dims& dim, memory::format fm) {
     // TODO(nhasabni): can we remove dynamic memory allocation?
     op_md_ = new memory::desc(dim, MklDnnType<T>(), fm);
   }
 
   /// Get function for memory descriptor for an operation
-  const memory::desc& GetOpMemDesc() const { return *op_md_; }
+  inline const memory::desc& GetOpMemDesc() const { return *op_md_; }
+
+  /// Predicate that checks if we need to reorder user's memory into memory
+  /// pointed by op_pd.
+  ///
+  /// @input: op_pd - memory primitive descriptor of the given input of an
+  ///               operation
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool IsReorderNeeded(const memory::primitive_desc& op_pd) const {
+    CHECK_NOTNULL(user_memory_);
+    return op_pd != user_memory_->get_primitive_desc();
+  }
+
+  /// Predicate that checks if we need to reorder user's memory into memory
+  /// based on the provided format.
+  ///
+  /// @input: target_format - memory format of the given input of an
+  ///               operation
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool IsReorderNeeded(const memory::format& target_format) const {
+    CHECK_NOTNULL(user_memory_);
+    return target_format != user_memory_->get_primitive_desc().desc().data.format;
+  }
+
+  /// Function to create a reorder from memory pointed by from to memory pointed
+  /// by to. Returns created primitive.
+  inline primitive CreateReorder(const memory* from, const memory* to) const {
+    CHECK_NOTNULL(from);
+    CHECK_NOTNULL(to);
+    return reorder(*from, *to);
+  }
 
   /// Function to handle input reordering
   ///
@@ -945,19 +1659,62 @@ class MklDnnData {
   ///               operation
   /// @input: net - net to which to add reorder primitive in case it is needed.
   /// @return: true in case reorder of input is needed; false, otherwise.
-  bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
-                           std::vector<primitive>* net) {
+  inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
+                                  std::vector<primitive>* net) {
     CHECK_NOTNULL(net);
     CHECK_NOTNULL(user_memory_);
-    if (op_pd != user_memory_->get_primitive_desc()) {
+    if (IsReorderNeeded(op_pd)) {
       // TODO(nhasabni): can we remove dynamic memory allocation?
       reorder_memory_ = new memory(op_pd);
-      net->push_back(reorder(*user_memory_, *reorder_memory_));
+      net->push_back(CreateReorder(user_memory_, reorder_memory_));
+      return true;
+    }
+    return false;
+  }
+
+  /// Overloaded version of above function that accepts memory buffer
+  /// where output of reorder needs to be stored.
+  ///
+  /// @input: op_pd - memory primitive descriptor of the given input of an
+  ///               operation
+  /// @reorder_data_handle - memory buffer where output of reorder needs to be
+  ///                        stored. Primitive does not check if buffer is
+  ///                        enough size to write.
+  /// @input: net - net to which to add reorder primitive in case it is needed.
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
+                                  void* reorder_data_handle,
+                                  std::vector<primitive>* net) {
+    CHECK_NOTNULL(net);
+    CHECK_NOTNULL(reorder_data_handle);
+    CHECK_NOTNULL(user_memory_);
+    if (IsReorderNeeded(op_pd)) {
+      // TODO(nhasabni): can we remove dynamic memory allocation?
+      reorder_memory_ = new memory(op_pd, reorder_data_handle);
+      net->push_back(CreateReorder(user_memory_, reorder_memory_));
       return true;
     }
     return false;
   }
 
+  /// Another overloaded version of CheckReorderToOpMem that accepts Tensor
+  /// where output of reorder needs to be stored.
+  ///
+  /// @input: op_pd - memory primitive descriptor of the given input of an
+  ///               operation
+  /// @reorder_tensor - Tensor whose buffer is to be used to store output of
+  ///                   reorder. Primitive does not check if buffer is
+  ///                   enough size to write.
+  /// @input: net - net to which to add reorder primitive in case it is needed.
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd,
+                                  Tensor* reorder_tensor,
+                                  std::vector<primitive>* net) {
+    CHECK_NOTNULL(net);
+    CHECK_NOTNULL(reorder_tensor);
+    return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor), net);
+  }
+
   /// Function to handle output reorder
   ///
   /// This function performs very similar functionality as input reordering
@@ -970,9 +1727,10 @@ class MklDnnData {
   ///
   /// @input memory primitive descriptor for the given output of an operation
   /// @return: true in case reorder of output is needed; false, otherwise.
-  bool PrepareReorderToUserMemIfReq(const memory::primitive_desc& op_pd) {
+  inline bool PrepareReorderToUserMemIfReq(
+      const memory::primitive_desc& op_pd) {
     CHECK_NOTNULL(user_memory_);
-    if (op_pd != user_memory_->get_primitive_desc()) {
+    if (IsReorderNeeded(op_pd)) {
       // TODO(nhasabni): can we remove dynamic memory allocation?
       reorder_memory_ = new memory(op_pd);
       return true;
@@ -987,11 +1745,11 @@ class MklDnnData {
   /// to the user-specified output buffer.
   ///
   /// @input: net - net to which to add reorder primitive
-  void InsertReorderToUserMem(std::vector<primitive>* net) {
+  inline void InsertReorderToUserMem(std::vector<primitive>* net) {
     CHECK_NOTNULL(net);
     CHECK_NOTNULL(user_memory_);
     CHECK_NOTNULL(reorder_memory_);
-    net->push_back(reorder(*reorder_memory_, *user_memory_));
+    net->push_back(CreateReorder(reorder_memory_, user_memory_));
   }
 };
 
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8b73eadb40046518179fcaaa5c244aa7f3d52ebe
--- /dev/null
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -0,0 +1,91 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+#include "tensorflow/core/util/mkl_util.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+#ifdef INTEL_MKL_DNN
+
+TEST(MklUtilTest, MklDnnTfShape) {
+  auto cpu_engine = engine(engine::cpu, 0);
+  MklDnnData<float> a(&cpu_engine);
+
+  const int N = 1, C = 2, H = 3, W = 4;
+  memory::dims a_dims = {N, C, H, W};
+  MklDnnShape a_mkldnn_shape;
+  a_mkldnn_shape.SetMklTensor(true);
+  // Create TF layout in NCHW.
+  a_mkldnn_shape.SetTfLayout(a_dims.size(), a_dims, memory::format::nchw);
+  TensorShape a_tf_shape_nchw({N, C, H, W});
+  TensorShape a_tf_shape_nhwc({N, H, W, C});
+  TensorShape a_mkldnn_tf_shape = a_mkldnn_shape.GetTfShape();
+  // Check that returned shape is in NCHW format.
+  EXPECT_EQ(a_tf_shape_nchw, a_mkldnn_tf_shape);
+  EXPECT_NE(a_tf_shape_nhwc, a_mkldnn_tf_shape);
+
+  memory::dims b_dims = {N, C, H, W};
+  MklDnnShape b_mkldnn_shape;
+  b_mkldnn_shape.SetMklTensor(true);
+  // Create TF layout in NHWC.
+  b_mkldnn_shape.SetTfLayout(b_dims.size(), b_dims, memory::format::nhwc);
+  TensorShape b_tf_shape_nhwc({N, H, W, C});
+  TensorShape b_tf_shape_nchw({N, C, H, W});
+  TensorShape b_mkldnn_tf_shape = b_mkldnn_shape.GetTfShape();
+  // Check that returned shape is in NHWC format.
+  EXPECT_EQ(b_tf_shape_nhwc, b_mkldnn_tf_shape);
+  EXPECT_NE(b_tf_shape_nchw, b_mkldnn_tf_shape);
+}
+
+TEST(MklUtilTest, MklDnnBlockedFormatTest) {
+  // Let's create 2D tensor of shape {3, 4} with 3 being innermost dimension
+  // first (case 1) and then it being outermost dimension (case 2).
+  auto cpu_engine = engine(engine::cpu, 0);
+
+  // Setting for case 1
+  MklDnnData<float> a(&cpu_engine);
+  memory::dims dim1 = {3, 4};
+  memory::dims strides1 = {1, 3};
+  a.SetUsrMem(dim1, strides1);
+
+  memory::desc a_md1 = a.GetUsrMemDesc();
+  EXPECT_EQ(a_md1.data.ndims, 2);
+  EXPECT_EQ(a_md1.data.dims[0], 3);
+  EXPECT_EQ(a_md1.data.dims[1], 4);
+  EXPECT_EQ(a_md1.data.format, mkldnn_blocked);
+
+  // Setting for case 2
+  MklDnnData<float> b(&cpu_engine);
+  memory::dims dim2 = {3, 4};
+  memory::dims strides2 = {4, 1};
+  b.SetUsrMem(dim2, strides2);
+
+  memory::desc b_md2 = b.GetUsrMemDesc();
+  EXPECT_EQ(b_md2.data.ndims, 2);
+  EXPECT_EQ(b_md2.data.dims[0], 3);
+  EXPECT_EQ(b_md2.data.dims[1], 4);
+  EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
+}
+
+#endif  // INTEL_MKL_DNN
+}  // namespace
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/util/ptr_util.h b/tensorflow/core/util/ptr_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..f902b3ffa12f16c7ef44691073f3d6bff4c7dc9d
--- /dev/null
+++ b/tensorflow/core/util/ptr_util.h
@@ -0,0 +1,80 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PTR_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_PTR_UTIL_H_
+
+// Utility functions for pointers.
+
+#include <stddef.h>
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace tensorflow {
+
+namespace helper {
+
+// Trait to select overloads and return types for MakeUnique.
+template <typename T>
+struct MakeUniqueResult {
+  using scalar = std::unique_ptr<T>;
+};
+template <typename T>
+struct MakeUniqueResult<T[]> {
+  using array = std::unique_ptr<T[]>;
+};
+template <typename T, size_t N>
+struct MakeUniqueResult<T[N]> {
+  using invalid = void;
+};
+
+}  // namespace helper
+
+// Transfers ownership of a raw pointer to a std::unique_ptr of deduced type.
+// Example:
+//   X* NewX(int, int);
+//   auto x = WrapUnique(NewX(1, 2));  // 'x' is std::unique_ptr<X>.
+//
+// WrapUnique is useful for capturing the output of a raw pointer factory.
+// However, prefer 'MakeUnique<T>(args...) over 'WrapUnique(new T(args...))'.
+//   auto x = WrapUnique(new X(1, 2));  // works, but nonideal.
+//   auto x = MakeUnique<X>(1, 2);  // safer, standard, avoids raw 'new'.
+//
+// Note: Cannot wrap pointers to array of unknown bound (i.e. U(*)[]).
+template <typename T>
+std::unique_ptr<T> WrapUnique(T* ptr) {
+  static_assert(!std::is_array<T>::value || std::extent<T>::value != 0,
+                "types T[0] or T[] are unsupported");
+  return std::unique_ptr<T>(ptr);
+}
+
+template <typename T, typename... Args>
+typename helper::MakeUniqueResult<T>::scalar MakeUnique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+// Overload for array of unknown bound.
+// The allocation of arrays needs to use the array form of new,
+// and cannot take element constructor arguments.
+template <typename T>
+typename helper::MakeUniqueResult<T>::array MakeUnique(size_t n) {
+  return std::unique_ptr<T>(new typename std::remove_extent<T>::type[n]());
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PTR_UTIL_H_
diff --git a/tensorflow/core/util/semver_test.cc b/tensorflow/core/util/semver_test.cc
index 0647f670c71915608ac67d80a0b222658569a16a..fdc34fa58bdebf529e3c9b2771b274e5fe6f6d50 100644
--- a/tensorflow/core/util/semver_test.cc
+++ b/tensorflow/core/util/semver_test.cc
@@ -39,7 +39,7 @@ bool ConsumeDotSeparatedIdentifiers(StringPiece* s, const string& prefix,
   for (i = 0; i < s->size() && IsDotOrIdentifierChar((*s)[i]); ++i) {
     // Intentionally empty
   }
-  val->set(s->data(), i);
+  *val = StringPiece(s->data(), i);
   s->remove_prefix(i);
   return i > 0;
 }
diff --git a/tensorflow/core/util/sparse/group_iterator.h b/tensorflow/core/util/sparse/group_iterator.h
index 99f3eafc604d29d2c6a9b732e89dab068f45e613..c0fce207e7a22028818abe1dcd9827434b1e4fcf 100644
--- a/tensorflow/core/util/sparse/group_iterator.h
+++ b/tensorflow/core/util/sparse/group_iterator.h
@@ -83,6 +83,11 @@ class GroupIterable {
   class IteratorStep;
 
   IteratorStep begin() { return IteratorStep(this, 0); }
+  IteratorStep at(int64 loc) {
+    CHECK(loc >= 0 && loc <= ix_.dim_size(0))
+        << "loc provided must lie between 0 and " << ix_.dim_size(0);
+    return IteratorStep(this, loc);
+  }
   IteratorStep end() { return IteratorStep(this, ix_.dim_size(0)); }
 
   template <typename TIX>
@@ -109,6 +114,7 @@ class GroupIterable {
     IteratorStep& operator++();    // prefix ++
     IteratorStep operator++(int);  // postfix ++
     Group operator*() const { return Group(iter_, loc_, next_loc_); }
+    int64 loc() const { return loc_; }
 
    private:
     GroupIterable* iter_;
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 0ea74c38b1916f777eaaf7b0907b614e680ea6e7..e816c282c81a8a3cf661b03ee7597ccfd2658648 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -69,6 +69,21 @@ class SparseTensor {
     CHECK_EQ(shape.size(), dims_) << "Shape rank must be SparseTensor rank.";
   }
 
+  SparseTensor(const SparseTensor& other)
+      : SparseTensor(other.ix_, other.vals_, other.shape_, other.order_) {}
+
+  SparseTensor(SparseTensor&& other)
+      : SparseTensor(std::move(other.ix_), std::move(other.vals_),
+                     std::move(other.shape_), std::move(other.order_)) {}
+
+  SparseTensor& operator=(const SparseTensor& other) {
+    ix_ = other.ix_;
+    vals_ = other.vals_;
+    shape_ = other.shape_;
+    order_ = other.order_;
+    return *this;
+  }
+
   std::size_t num_entries() const { return ix_.dim_size(0); }
 
   int dims() const { return shape_.size(); }
diff --git a/tensorflow/core/util/transform_output_iterator.h b/tensorflow/core/util/transform_output_iterator.h
index 1640791ad1729a57283ab5f2b91b7734c9447d8f..059206c75b97d8bbb64a663a207717387409c04b 100644
--- a/tensorflow/core/util/transform_output_iterator.h
+++ b/tensorflow/core/util/transform_output_iterator.h
@@ -24,7 +24,7 @@ namespace tensorflow {
 template <typename StoreType, typename InputType, typename ConversionOp,
           typename OffsetT = ptrdiff_t>
 class TransformOutputIterator {
- private:
+ protected:
   // Proxy object
   struct Reference {
     StoreType* ptr;
diff --git a/tensorflow/core/util/use_cudnn.cc b/tensorflow/core/util/use_cudnn.cc
index 3862f01ea1c397919f9224fddccc20609ecd6bb6..d7d03f151e2228f4dd09f0385d4a3d33a1452290 100644
--- a/tensorflow/core/util/use_cudnn.cc
+++ b/tensorflow/core/util/use_cudnn.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/env_var.h"
 
@@ -26,7 +27,7 @@ namespace tensorflow {
     bool value;                                                            \
     Status status = ReadBoolFromEnvVar(#flag_name, default_value, &value); \
     if (!status.ok()) {                                                    \
-      LOG(ERROR) << status.error_message();                                \
+      LOG(ERROR) << status;                                                \
     }                                                                      \
     return value;                                                          \
   }
@@ -37,4 +38,24 @@ ADD_CUDNN_FLAG(CudnnDisableConv1x1Optimization,
                TF_CUDNN_DISABLE_CONV_1X1_OPTIMIZATION, false);
 
 #undef ADD_CUDNN_FLAG
+
+FP16ConvMode CudnnConvComputeMode() {
+  string value;
+  Status status = ReadStringFromEnvVar("TF_FP16_CONV_MODE", "accurate", &value);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  string lowercase_value = str_util::Lowercase(value);
+  if (lowercase_value == "accurate") {
+    return FP16ConvMode::kAccurate;
+  } else if (lowercase_value == "fast") {
+    return FP16ConvMode::kFast;
+  } else {
+    LOG(ERROR) << "FP16ConvMode only supports two modes, ACCURATE and FAST. "
+                  "Got unknown mode: "
+               << value;
+  }
+  return FP16ConvMode::kAccurate;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/use_cudnn.h b/tensorflow/core/util/use_cudnn.h
index 5c7d70649672851285a55f3bd235f4e4e0edd73a..a39a032e3f4de0985f446a8c0ae6e00fc56c7249 100644
--- a/tensorflow/core/util/use_cudnn.h
+++ b/tensorflow/core/util/use_cudnn.h
@@ -13,16 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// The utility to check whether we have Cudnn dependency.
+// The utility to check Cudnn dependency and set Cudnn-related flags.
 
 #ifndef TENSORFLOW_UTIL_USE_CUDNN_H_
 #define TENSORFLOW_UTIL_USE_CUDNN_H_
 
 namespace tensorflow {
 
+// FP16ConvMode: The mode to set the internal compute type for cudnn convolution
+// when the input data type is float16. Two types of modes are supported:
+//   kAccurate: Always use float32 as the internal compute type.
+//   kFast: Include both float32 and float16 compute type in the autotune.
+enum class FP16ConvMode {
+  kAccurate = 1,
+  kFast = 2,
+};
+
 bool CanUseCudnn();
 bool CudnnUseAutotune();
 bool CudnnDisableConv1x1Optimization();
+FP16ConvMode CudnnConvComputeMode();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md
index d41818e10c924de21781c352e1a1db252b19c2ff..8818177a288ef16ac1907a20ab563ee3d871f7fd 100644
--- a/tensorflow/docs_src/about/uses.md
+++ b/tensorflow/docs_src/about/uses.md
@@ -5,7 +5,7 @@ This page highlights TensorFlow models in real world use.
 
 ## Model zoo
 
-Please visit our collection of TensorFlow models in the 
+Please visit our collection of TensorFlow models in the
 [TensorFlow Zoo](https://github.com/tensorflow/models).
 
 If you have built a model with TensorFlow, please consider publishing it in
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
index 1ef72d7b4465bce086a25dcace775f870a4f18ee..fc5d5d70d7ebf42c16294c84c2cc3f8381dae236 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
@@ -1,47 +1 @@
 # BayesFlow Entropy (contrib)
-[TOC]
-
-Entropy Ops.
-
-## Background
-
-Common Shannon entropy, the Evidence Lower BOund (ELBO), KL divergence, and more
-all have information theoretic use and interpretations.  They are also often
-used in variational inference.  This library brings together `Ops` for
-estimating them, e.g. using Monte Carlo expectations.
-
-## Examples
-
-Example of fitting a variational posterior with the ELBO.
-
-```python
-# We start by assuming knowledge of the log of a joint density p(z, x) over
-# latent variable z and fixed measurement x.  Since x is fixed, the Python
-# function does not take x as an argument.
-def log_joint(z):
-  theta = tf.Variable(0.)  # Trainable variable that helps define log_joint.
-  ...
-
-# Next, define a Normal distribution with trainable parameters.
-q = distributions.Normal(mu=tf.Variable(0.), sigma=tf.Variable(1.))
-
-# Now, define a loss function (negative ELBO) that, when minimized, will adjust
-# mu, sigma, and theta, increasing the ELBO, which we hope will both reduce the
-# KL divergence between q(z) and p(z | x), and increase p(x).  Note that we
-# cannot guarantee both, but in general we expect both to happen.
-elbo = entropy.elbo_ratio(log_p, q, n=10)
-loss = -elbo
-
-# Minimize the loss
-train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
-tf.global_variables_initializer().run()
-for step in range(100):
-  train_op.run()
-```
-
-## Ops
-
-*   @{tf.contrib.bayesflow.entropy.elbo_ratio}
-*   @{tf.contrib.bayesflow.entropy.entropy_shannon}
-*   @{tf.contrib.bayesflow.entropy.renyi_ratio}
-*   @{tf.contrib.bayesflow.entropy.renyi_alpha}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
index 2b575340690b18b7ef9e8fc25493ee08429180ba..d855787ae695f115368ab76671182f3a6e490411 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
@@ -1,8 +1 @@
 # BayesFlow Stochastic Graph (contrib)
-[TOC]
-
-Classes and helper functions for Stochastic Computation Graphs.
-
-## Stochastic Computation Graph Helper Functions
-
-*   @{tf.contrib.bayesflow.stochastic_graph.surrogate_loss}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
index e90f58a82224e4d6eb674acbceb159aa0b4daa3f..1cc1ac5d7e670a243f1dcda6ef8c59b6c6d8de2d 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
@@ -1,24 +1,3 @@
 # BayesFlow Stochastic Tensors (contrib)
 [TOC]
 
-Classes and helper functions for creating Stochastic Tensors.
-
-`StochasticTensor` objects wrap `Distribution` objects.  Their
-values may be samples from the underlying distribution, or the distribution
-mean (as governed by `value_type`).  These objects provide a `loss`
-method for use when sampling from a non-reparameterized distribution.
-The `loss`method is used in conjunction with `stochastic_graph.surrogate_loss`
-to produce a single differentiable loss in stochastic graphs having
-both continuous and discrete stochastic nodes.
-
-## Stochastic Tensor Classes
-
-*   @{tf.contrib.bayesflow.stochastic_tensor.BaseStochasticTensor}
-*   @{tf.contrib.bayesflow.stochastic_tensor.StochasticTensor}
-
-## Stochastic Tensor Value Types
-
-*   @{tf.contrib.bayesflow.stochastic_tensor.MeanValue}
-*   @{tf.contrib.bayesflow.stochastic_tensor.SampleValue}
-*   @{tf.contrib.bayesflow.stochastic_tensor.value_type}
-*   @{tf.contrib.bayesflow.stochastic_tensor.get_current_value_type}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
index e6070b9aea6356fa5630ba7c093f6a0a59022acf..8f08c09c8fbbc9b5b6ab8612f140f4b7ca7d8b73 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
@@ -2,10 +2,3 @@
 [TOC]
 
 Variational inference.
-
-## Ops
-
-*   @{tf.contrib.bayesflow.variational_inference.elbo}
-*   @{tf.contrib.bayesflow.variational_inference.elbo_with_log_joint}
-*   @{tf.contrib.bayesflow.variational_inference.ELBOForms}
-*   @{tf.contrib.bayesflow.variational_inference.register_prior}
diff --git a/tensorflow/docs_src/api_guides/python/image.md b/tensorflow/docs_src/api_guides/python/image.md
index a2c8c3c3c92e2acf177da104304746fb34281de7..051e4547ee6900ded85ae18fb80b51db1eacb009 100644
--- a/tensorflow/docs_src/api_guides/python/image.md
+++ b/tensorflow/docs_src/api_guides/python/image.md
@@ -19,6 +19,7 @@ Note: The PNG encode and decode Ops support RGBA, but the conversions Ops
 presently only support RGB, HSV, and GrayScale. Presently, the alpha channel has
 to be stripped from the image and re-attached using slicing ops.
 
+*   @{tf.image.decode_bmp}
 *   @{tf.image.decode_gif}
 *   @{tf.image.decode_jpeg}
 *   @{tf.image.encode_jpeg}
diff --git a/tensorflow/docs_src/api_guides/python/input_dataset.md b/tensorflow/docs_src/api_guides/python/input_dataset.md
index 2798d76be988e5b340ebcb717910d63201e7caf8..94c89c37d520fd1c1ec65fedc813a7b348120913 100644
--- a/tensorflow/docs_src/api_guides/python/input_dataset.md
+++ b/tensorflow/docs_src/api_guides/python/input_dataset.md
@@ -1,4 +1,4 @@
-# `Dataset` Input Pipeline
+# Dataset Input Pipeline
 [TOC]
 
 @{tf.data.Dataset} allows you to build complex input pipelines. See the
diff --git a/tensorflow/docs_src/api_guides/python/meta_graph.md b/tensorflow/docs_src/api_guides/python/meta_graph.md
index fa4cee87007cfd77663e74956fcfe0f15c55c52c..0eff9000931666dce742358a290f25bb2b5a7b16 100644
--- a/tensorflow/docs_src/api_guides/python/meta_graph.md
+++ b/tensorflow/docs_src/api_guides/python/meta_graph.md
@@ -221,15 +221,9 @@ Here are some of the typical usage models:
     # Addes loss and train.
     labels = tf.constant(0, tf.int32, shape=[100], name="labels")
     batch_size = tf.size(labels)
-    labels = tf.expand_dims(labels, 1)
-    indices = tf.expand_dims(tf.range(0, batch_size), 1)
-    concated = tf.concat([indices, labels], 1)
-    onehot_labels = tf.sparse_to_dense(
-        concated, tf.stack([batch_size, 10]), 1.0, 0.0)
     logits = tf.get_collection("logits")[0]
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
-        labels=onehot_labels, logits=logits, name="xentropy")
-    loss = tf.reduce_mean(cross_entropy, name="xentropy_mean")
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
+                                                  logits=logits)
 
     tf.summary.scalar('loss', loss)
     # Creates the gradient descent optimizer with the given learning rate.
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 75dbb04e7df6f5fef00363bab548fc04bd3c9694..8e6fd1cff93332b84f552c18f627ba05dc67103e 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -73,7 +73,7 @@ The total padding applied along the height and width is computed as:
       pad_along_width = max(filter_width - strides[2], 0)
     else:
       pad_along_width = max(filter_width - (in_width % strides[2]), 0)
-    
+
 Finally, the padding on the top, bottom, left and right are:
 
     pad_top = pad_along_height // 2
@@ -226,6 +226,8 @@ TensorFlow provides several operations that help you perform classification.
 *   @{tf.nn.softmax}
 *   @{tf.nn.log_softmax}
 *   @{tf.nn.softmax_cross_entropy_with_logits}
+*   @{tf.nn.softmax_cross_entropy_with_logits_v2} - identical to the base
+    version, except it allows gradient propagation into the labels.
 *   @{tf.nn.sparse_softmax_cross_entropy_with_logits}
 *   @{tf.nn.weighted_cross_entropy_with_logits}
 
@@ -351,7 +353,7 @@ p_i = max(s\cdot (n_o - 1) + k - n_i, 0)
 \end{equation}
 
 Remember that, for `'SAME'` padding,
-\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\), as mentioned above. 
+\\(n_o = \left \lceil{\frac{n_i}{s}}\right \rceil\\), as mentioned above.
 We need to analyze in detail two cases:
 
 - \\(n_i \text{ mod } s = 0\\)
diff --git a/tensorflow/docs_src/api_guides/python/reading_data.md b/tensorflow/docs_src/api_guides/python/reading_data.md
index b3ebaa0f0a3645256d4e92632a10a53e4eb243cb..f316cce953da9b425463feffa317b6bf292694e4 100644
--- a/tensorflow/docs_src/api_guides/python/reading_data.md
+++ b/tensorflow/docs_src/api_guides/python/reading_data.md
@@ -1,11 +1,11 @@
 # Reading data
 
 Note: The preferred way to feed data into a tensorflow program is using the
-@{$datasets$Datasets API}.
+@{$datasets$`tf.data` API}.
 
 There are four methods of getting data into a TensorFlow program:
 
-*   `Dataset` API: Easily construct a complex input pipeline. (preferred method)
+*   `tf.data` API: Easily construct a complex input pipeline. (preferred method)
 *   Feeding: Python code provides the data when running each step.
 *   `QueueRunner`: a queue-based input pipeline reads the data from files
     at the beginning of a TensorFlow graph.
@@ -14,26 +14,27 @@ There are four methods of getting data into a TensorFlow program:
 
 [TOC]
 
-## Dataset API
+## `tf.data` API
 
 See the @{$datasets$programmer's guide} for an in-depth explanation of
-@{tf.data.Dataset}. The `Dataset` API allows you to extract and preprocess data
-from different input/file formats, and apply transformations such as batch,
-shuffle, and map to the dataset. This is an improved version of the old input
-methods, feeding and `QueueRunner`.
+@{tf.data.Dataset}. The `tf.data` API enables you to extract and preprocess data
+from different input/file formats, and apply transformations such as batching,
+shuffling, and mapping functions over the dataset. This is an improved version
+of the old input methods---feeding and `QueueRunner`---which are described
+below for historical purposes.
 
 ## Feeding
 
+Warning: "Feeding" is the least efficient way to feed data into a TensorFlow
+program and should only be used for small experiments and debugging.
+
 TensorFlow's feed mechanism lets you inject data into any Tensor in a
-computation graph. A python computation can thus feed data directly into the
+computation graph. A Python computation can thus feed data directly into the
 graph.
 
 Supply feed data through the `feed_dict` argument to a run() or eval() call
 that initiates computation.
 
-Warning: "Feeding" is the least efficient way to feed data into a tensorflow
-program and should only be used for small experiments and debugging.
-
 ```python
 with tf.Session():
   input = tf.placeholder(tf.float32)
@@ -55,6 +56,10 @@ and is described in the @{$mechanics$MNIST tutorial}.
 
 ## `QueueRunner`
 
+Warning: This section discusses implementing input pipelines using the
+queue-based APIs which can be cleanly replaced by the @{$datasets$`tf.data`
+API}.
+
 A typical queue-based pipeline for reading records from files has the following stages:
 
 1.  The list of filenames
@@ -66,9 +71,6 @@ A typical queue-based pipeline for reading records from files has the following
 7.  *Optional* preprocessing
 8.  Example queue
 
-Warning: This section discusses implementing input pipelines using the
-queue-based APIs which can be cleanly replaced by the @{$datasets$Datasets API}.
-
 ### Filenames, shuffling, and epoch limits
 
 For the list of filenames, use either a constant string Tensor (like
@@ -173,14 +175,25 @@ For example,
 [`tensorflow/examples/how_tos/reading_data/convert_to_records.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/convert_to_records.py)
 converts MNIST data to this format.
 
-To read a file of TFRecords, use
-@{tf.TFRecordReader} with
-the @{tf.parse_single_example}
-decoder. The `parse_single_example` op decodes the example protocol buffers into
-tensors. An MNIST example using the data produced by `convert_to_records` can be
-found in
-[`tensorflow/examples/how_tos/reading_data/fully_connected_reader.py`](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py),
-which you can compare with the `fully_connected_feed` version.
+The recommended way to read a TFRecord file is with a @{tf.data.TFRecordDataset}, [as in this example](https://www.tensorflow.org/code/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py):
+
+``` python
+    dataset = tf.data.TFRecordDataset(filename)
+    dataset = dataset.repeat(num_epochs)
+
+    # map takes a python function and applies it to every sample
+    dataset = dataset.map(decode)
+```
+
+To acomplish the same task with a queue based input pipeline requires the following code 
+(using the same `decode` function from the above example): 
+
+``` python
+  filename_queue = tf.train.string_input_producer([filename], num_epochs=num_epochs)
+  reader = tf.TFRecordReader()
+  _, serialized_example = reader.read(filename_queue)
+  image,label = decode(serialized_example)
+```
 
 ### Preprocessing
 
@@ -499,7 +512,7 @@ You can have the train and eval in the same graph in the same process, and share
 their trained variables or layers. See @{$variables$the shared variables tutorial}.
 
 To support the single-graph approach
-@{$programmers_guide/datasets$Datasets} also supplies
+@{$programmers_guide/datasets$`tf.data`} also supplies
 @{$programmers_guide/datasets#creating_an_iterator$advanced iterator types} that
 that allow the user to change the input pipeline without rebuilding the graph or
 session.
diff --git a/tensorflow/docs_src/api_guides/python/threading_and_queues.md b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
index ab95ce0af9d9f40cd32a41db988a7d48f2b0040f..8ad4c4c07512d04d1df43062954f2e64b1d8e177 100644
--- a/tensorflow/docs_src/api_guides/python/threading_and_queues.md
+++ b/tensorflow/docs_src/api_guides/python/threading_and_queues.md
@@ -3,7 +3,7 @@
 Note: In versions of TensorFlow before 1.2, we recommended using multi-threaded,
 queue-based input pipelines for performance. Beginning with TensorFlow 1.4,
 however, we recommend using the `tf.data` module instead. (See
-[Datasets](datasets) for details. In TensorFlow 1.2 and 1.3, the module was
+@{$datasets$Datasets} for details. In TensorFlow 1.2 and 1.3, the module was
 called `tf.contrib.data`.) The `tf.data` module offers an easier-to-use
 interface for constructing efficient input pipelines. Furthermore, we've stopped
 developing the old multi-threaded, queue-based input pipelines.  We've retained
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 77d4e0caece4b50222c6e8abdd7ebba006159f26..003e0a25ecd7c6afcc42aed08bd5d91f7c85a9bb 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -10,10 +10,10 @@ particular, this document explains the following:
 
 You can view TensorFlow documentation on https://www.tensorflow.org, and you
 can view and edit the raw files on
-[GitHub](https://www.tensorflow.org/code/tensorflow/docs_src/). 
+[GitHub](https://www.tensorflow.org/code/tensorflow/docs_src/).
 We're publishing our docs on GitHub so everybody can contribute. Whatever gets
 checked in to `tensorflow/docs_src` will be published soon after on
-https://www.tensorflow.org. 
+https://www.tensorflow.org.
 
 Republishing TensorFlow documentation in different forms is absolutely allowed,
 but we are unlikely to accept other documentation formats (or the tooling to
@@ -237,7 +237,7 @@ If a module is accidentally imported, it typically breaks the doc generator
 even if the doc generator succeeds, unwanted symbols may show up in the
 docs. Check the generated docs to make sure that all symbols that are documented
 are expected. If there are symbols that shouldn’t be there, you have the
-following options for dealing with them: 
+following options for dealing with them:
 
 - Private symbols and imports
 - The `remove_undocumented` filter
diff --git a/tensorflow/docs_src/community/style_guide.md b/tensorflow/docs_src/community/style_guide.md
index 40a75a4736d0e28e3c1822f5941fcb855f939f46..a4c4e2674ee78b2248323a0275a737d6417c5f99 100644
--- a/tensorflow/docs_src/community/style_guide.md
+++ b/tensorflow/docs_src/community/style_guide.md
@@ -162,7 +162,7 @@ operation.
              it's present in the scope.
 
 * Layers that behave differently during training should take:
-  - `is_training`: `bool` indicator to conditionally choose different 
+  - `is_training`: `bool` indicator to conditionally choose different
                    computation paths (e.g. using `tf.cond`) during execution.
 
 Example:
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
index 33740de5d5af11cb6a8f1f6d57baa4c0e0dbefff..a3abf2550757e825ae2d023018def919de1bcd8f 100644
--- a/tensorflow/docs_src/community/welcome.md
+++ b/tensorflow/docs_src/community/welcome.md
@@ -65,5 +65,5 @@ please read the following list carefully:
     [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
     on GitHub.  For example, use the issue tracker to request a
     new operation in TensorFlow.
-    
+
 
diff --git a/tensorflow/docs_src/deploy/hadoop.md b/tensorflow/docs_src/deploy/hadoop.md
index 7592cf828beb1f45a60ecc58b7fbfc5f4c4308ab..c4471562b9e64dda2fade7759e06fb8eecd09f5c 100644
--- a/tensorflow/docs_src/deploy/hadoop.md
+++ b/tensorflow/docs_src/deploy/hadoop.md
@@ -32,8 +32,8 @@ be set:
     source ${HADOOP_HOME}/libexec/hadoop-config.sh
     ```
 
-*   **LD_LIBRARY_PATH**: To include the path to libjvm.so, and optionally the path 
-    to libhdfs.so if your Hadoop distribution does not install libhdfs.so in 
+*   **LD_LIBRARY_PATH**: To include the path to libjvm.so, and optionally the path
+    to libhdfs.so if your Hadoop distribution does not install libhdfs.so in
     `$HADOOP_HDFS_HOME/lib/native`. On Linux:
 
     ```shell
diff --git a/tensorflow/docs_src/extend/add_filesys.md b/tensorflow/docs_src/extend/add_filesys.md
index ea3a6fe53af3e960eaccb4f7b6836364244fbe05..f0591b7b7d8af478db067ecd3bdd949e75d813c9 100644
--- a/tensorflow/docs_src/extend/add_filesys.md
+++ b/tensorflow/docs_src/extend/add_filesys.md
@@ -32,9 +32,10 @@ Note that TensorFlow already includes many filesystem implementations, such as:
 
     Note: NFS filesystems often mount as a POSIX interface, and so standard
     TensorFlow can work on top of NFS-mounted remote filesystems.
-    
+
 *   HDFS - the Hadoop File System
 *   GCS - Google Cloud Storage filesystem
+*   S3 - Amazon Simple Storage Service filesystem
 *   A "memory-mapped-file" filesystem
 
 The rest of this guide describes how to implement a custom filesystem.
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 15d6d77f5ef21572daa23ed291f24e06574e6aa0..c52279b212f46215125a20815f97b07b012a5513 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -341,9 +341,9 @@ Assuming you have `g++` installed, here is the sequence of commands you can use
 to compile your op into a dynamic library.
 
 ```bash
-TF_INC=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
-TF_LIB=$(python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
-g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC -I$TF_INC -I$TF_INC/external/nsync/public -L$TF_LIB -ltensorflow_framework -O2
+TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
+TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
+g++ -std=c++11 -shared zero_out.cc -o zero_out.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2
 ```
 
 On Mac OS X, the additional flag "-undefined dynamic_lookup" is required when
@@ -451,17 +451,17 @@ Now that you know how to build a basic (and somewhat restricted) op and
 implementation, we'll look at some of the more complicated things you will
 typically need to build into your op. This includes:
 
-*   [Conditional checks and validation](#conditional_checks_and_validation)
-*   [Op registration](#op_registration)
+*   [Conditional checks and validation](#conditional-checks-and-validation)
+*   [Op registration](#op-registration)
     *   [Attrs](#attrs)
-    *   [Attr types](#attr_types)
+    *   [Attr types](#attr-types)
     *   [Polymorphism](#polymorphism)
-    *   [Inputs and outputs](#inputs_and_outputs)
-    *   [Backwards compatibility](#backwards_compatibility)
-*   [GPU support](#gpu_support)
-    *   [Compiling the kernel for the GPU device](#compiling_the_kernel_for_the_gpu_device)
-*   [Implement the gradient in Python](#implement_the_gradient_in_python)
-*   [Shape functions in C++](#shape_functions_in_c)
+    *   [Inputs and outputs](#inputs-and-outputs)
+    *   [Backwards compatibility](#backwards-compatibility)
+*   [GPU support](#gpu-support)
+    *   [Compiling the kernel for the GPU device](#compiling-the-kernel-for-the-gpu-device)
+*   [Implement the gradient in Python](#implement-the-gradient-in-python)
+*   [Shape functions in C++](#shape-functions-in-c)
 
 ### Conditional checks and validation
 
@@ -1228,10 +1228,10 @@ into a single dynamically loadable library:
 
 ```bash
 nvcc -std=c++11 -c -o cuda_op_kernel.cu.o cuda_op_kernel.cu.cc \
--I $TF_INC -I$TF_INC/external/nsync/public -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
+  ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC
 
 g++ -std=c++11 -shared -o cuda_op_kernel.so cuda_op_kernel.cc \
-cuda_op_kernel.cu.o -I $TF_INC -I$TF_INC/external/nsync/public -fPIC -lcudart -L$TF_LIB -ltensorflow_framework
+  cuda_op_kernel.cu.o ${TF_CFLAGS[@]} -fPIC -lcudart ${TF_LFLAGS[@]}
 ```
 
 `cuda_op_kernel.so` produced above can be loaded as usual in Python, using the
diff --git a/tensorflow/docs_src/extend/estimators.md b/tensorflow/docs_src/extend/estimators.md
index 7e6507c5840fe621aeb91842c9a83554e568db99..96fc9fae4720b5d29ff94bffe8f30e40aada0a27 100644
--- a/tensorflow/docs_src/extend/estimators.md
+++ b/tensorflow/docs_src/extend/estimators.md
@@ -515,7 +515,7 @@ using `mean_squared_error()` (in bold):
   loss = tf.losses.mean_squared_error(labels, predictions)</strong>
   ...</code></pre>
 
-See the @{$python/contrib.losses$API guide} for a
+See the @{tf.losses$API guide} for a
 full list of loss functions and more details on supported arguments and usage.
 
 Supplementary metrics for evaluation can be added to an `eval_metric_ops` dict.
@@ -694,5 +694,5 @@ For additional reference materials on building `Estimator`s, see the following
 sections of the API guides:
 
 *   @{$python/contrib.layers$Layers}
-*   @{$python/contrib.losses$Losses}
+*   @{tf.losses$Losses}
 *   @{$python/contrib.layers#optimization$Optimization}
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index 3f30b9a8c243728f6dd2a47ffa0b35fb92ee68fe..00b168c6be96a158c3be69fbcefbf941c0fbbe4d 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -20,7 +20,7 @@ TensorFlow:
 
 Python is currently the only language supported by TensorFlow's API stability
 promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
-plus community support for [Haskell](https://github.com/tensorflow/haskell) and 
+plus community support for [Haskell](https://github.com/tensorflow/haskell) and
 [Rust](https://github.com/tensorflow/rust).  If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
new file mode 100644
index 0000000000000000000000000000000000000000..ae9e107e56210079197d73b3cfd7cc4f64e51743
--- /dev/null
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -0,0 +1,572 @@
+
+# Creating Custom Estimators
+This document introduces custom Estimators. In particular, this document
+demonstrates how to create a custom @{tf.estimator.Estimator$Estimator} that
+mimics the behavior of the pre-made Estimator
+@{tf.estimator.DNNClassifier$`DNNClassifier`} in solving the Iris problem. See
+the @{$get_started/estimator$Pre-Made Estimators chapter} for details.
+
+If you are feeling impatient, feel free to compare and contrast the following
+full programs:
+
+* Iris implemented with the [pre-made DNNClassifier Estimator](https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py).
+* Iris implemented with a [custom Estimator](https://github.com/tensorflow/models/blob/master/samples/core/get_started/custom_estimator.py).
+
+## Pre-made vs. custom
+
+As the following figure shows, pre-made Estimators are subclasses of the
+@{tf.estimator.Estimator} base class, while custom Estimators are an instance
+of tf.estimator.Estimator:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="Premade estimators are sub-classes of `Estimator`. Custom Estimators are usually (direct) instances of `Estimator`"
+  src="../images/custom_estimators/estimator_types.png">
+</div>
+<div style="text-align: center">
+Pre-made and custom Estimators are all Estimators.
+</div>
+
+Pre-made Estimators are fully baked. Sometimes though, you need more control
+over an Estimator's behavior.  That's where custom Estimators come in. You can
+create a custom Estimator to do just about anything. If you want hidden layers
+connected in some unusual fashion, write a custom Estimator. If you want to
+calculate a unique
+[metric](https://developers.google.com/machine-learning/glossary/#metric)
+for your model, write a custom Estimator.  Basically, if you want an Estimator
+optimized for your specific problem, write a custom Estimator.
+
+A model function (or `model_fn`) implements the ML algorithm. The
+only difference between working with pre-made Estimators and custom Estimators
+is:
+
+* With pre-made Estimators, someone already wrote the model function for you.
+* With custom Estimators, you must write the model function.
+
+Your model function could implement a wide range of algorithms, defining all
+sorts of hidden layers and metrics.  Like input functions, all model functions
+must accept a standard group of input parameters and return a standard group of
+output values. Just as input functions can leverage the Dataset API, model
+functions can leverage the Layers API and the Metrics API.
+
+Let's see how to solve the Iris problem with a custom Estimator. A quick
+reminder--here's the organization of the Iris model that we're trying to mimic:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
+  src="../images/custom_estimators/full_network.png">
+</div>
+<div style="text-align: center">
+Our implementation of Iris contains four features, two hidden layers,
+and a logits output layer.
+</div>
+
+## Write an Input function
+
+In our custom Estimator implementation, we'll reuse the input function we used
+in the pre-made Estimator implementation. Namely:
+
+```python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+This input function builds an input pipeline that yields batches of
+`(features, labels)` pairs, where `features` is a dictionary features.
+
+## Create feature columns
+
+<!-- TODO(markdaoust): link to feature_columns when it exists-->
+As detailed in @{$get_started/estimator$Premade Estimators}, you must define
+your model's feature columns to specify how the model should use each feature.
+Whether working with pre-made Estimators or custom Estimators, you define
+feature columns in the same fashion.
+
+The following code creates a simple `numeric_column` for each input feature,
+indicating that the value of the input feature should be used directly as an
+input to the model:
+
+```python
+# Feature columns describe how to use the input.
+my_feature_columns = []
+for key in train_x.keys():
+    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
+```
+
+## Write a model function
+
+The model function we'll use has the following call signature:
+
+```python
+def my_model_fn(
+   features, # This is batch_features from input_fn
+   labels,   # This is batch_labels from input_fn
+   mode,     # An instance of tf.estimator.ModeKeys
+   params):  # Additional configuration
+```
+
+The first two arguments are the batches of features and labels returned from
+the input function; that is, `features` and `labels` are the handles to the
+data your model will use. The `mode` argument indicates whether the caller is
+requesting training, predicting, or evaluation.
+
+The caller may pass `params` to an Estimator's constructor. The `params` passed
+to the constructor become the `params` passed to `model_fn`.
+
+```python
+    # Build 2 hidden layer DNN with 10, 10 units respectively.
+    classifier = tf.estimator.Estimator(
+        model_fn=my_model,
+        params={
+            'feature_columns': my_feature_columns,
+            # Two hidden layers of 10 nodes each.
+            'hidden_units': [10, 10],
+            # The model must choose between 3 classes.
+            'n_classes': 3,
+        })
+```
+
+To implement a typical model function, you must do the following:
+
+* (Define the model)[#define_the_model].
+* Specify additional calculations for each of
+  the [three different modes](#modes):
+  * [Predict](#predict)
+  * [Evaluate](#evaluate)
+  * [Train](#train)
+
+## Define the model
+
+The basic deep neural network model must define the following three sections:
+
+* An [input layer](https://developers.google.com/machine-learning/glossary/#input_layer)
+* One or more [hidden layers](https://developers.google.com/machine-learning/glossary/#hidden_layer)
+* An [output layer](https://developers.google.com/machine-learning/glossary/#output_layer)
+
+### Define the input layer
+
+Call @{tf.feature_column.input_layer} to convert your feature dictionary and
+feature columns into input for your model. For example:
+
+```python
+    # Use `input_layer` to apply the feature columns.
+    net = tf.feature_column.input_layer(features, params['feature_columns'])
+```
+
+The preceding line applies the transformations defined by your feature columns,
+creating the input layer of our model.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="A diagram of the input layer, in this case a 1:1 mapping from raw-inputs to features."
+  src="../images/custom_estimators/input_layer.png">
+</div>
+
+
+### Hidden Layers
+
+If you are creating a deep neural network, you must define one or more hidden
+layers. The Layers API provides a rich set of functions to define all types of
+hidden layers, including convolutional, pooling, and dropout layers. For Iris,
+we're simply going to call @{tf.layers.dense} to create hidden layers, with
+dimensions defined by `params['hidden_layers']`. In a `dense` layer each node
+is connected to every node in the preceding layer.  Here's the relevant code:
+
+``` python
+    # Build the hidden layers, sized according to the 'hidden_units' param.
+    for units in params['hidden_units']:
+        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
+```
+* The `units` parameter defines the number of output neurons in a given layer.
+* The `activation` parameter defines the [activation function](https://developers.google.com/machine-learning/glossary/#a) —
+  [Relu](https://developers.google.com/machine-learning/glossary/#ReLU) in this
+  case.
+
+The variable `net` here signifies the current top layer of the network. During
+the first iteration, `net` signifies the input layer. On each loop iteration
+`tf.layers.dense` creates a new layer, which takes the previous layer as its
+input. So, the loop uses `net` to pass the previously created layer as input
+to the layer being created.
+
+After creating two hidden layers, our network looks as follows. For
+simplicity, the figure only shows four hidden units in each layer.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="The input layer with two hidden layers added."
+  src="../images/custom_estimators/add_hidden_layer.png">
+</div>
+
+Note that @{tf.layers.dense} provides many additional capabilities, including
+the ability to set a multitude of regularization parameters. For the sake of
+simplicity, though, we're going to simply accept the default values of the
+other parameters.
+
+### Output Layer
+
+We'll define the output layer by calling @{tf.layers.dense} yet again, this
+time without an activation function:
+
+```python
+    # Compute logits (1 per class).
+    logits = tf.layers.dense(net, params['n_classes'], activation=None)
+```
+
+Here, `net` signifies the final hidden layer. Therefore, the full set of layers
+is now connected as follows:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="A logit output layer connected to the top hidden layer"
+  src="../images/custom_estimators/add_logits.png">
+</div>
+<div style="text-align: center">
+The final hidden layer feeds into the output layer.
+</div>
+
+When defining an output layer, the `units` parameter specifies the number of
+outputs. So, by setting `units` to `params['n_classes']`, the model produces
+one output value per class. Each element of the output vector will contains the
+score, or "logit", calculated to the associated class of Iris: Setosa,
+Versicolor, or Virginica, respectively.
+
+Later on, these logits will be transformed into probabilities by the
+@{tf.nn.softmax} function.
+
+## Implement training, evaluation, and prediction {modes}
+
+The final step in creating a model function is to write branching code that
+implements prediction, evaluation, and training.
+
+The model function gets invoked whenever someone calls the Estimator's `train`,
+`evaluate`, or `predict` methods. Recall that the signature for the model
+function looks like this:
+
+``` python
+def my_model_fn(
+   features, # This is batch_features from input_fn
+   labels,   # This is batch_labels from input_fn
+   mode):    # An instance of tf.estimator.ModeKeys, see below
+```
+
+Focus on that third argument, mode. As the following table shows, when someone
+calls train, evaluate, or predict, the Estimator framework invokes your model
+function with the mode parameter set as follows:
+
+| Estimator method                 |    Estimator Mode |
+|:---------------------------------|:------------------|
+|@{tf.estimator.Estimator.train$`train()`} |@{tf.estimator.ModeKeys.TRAIN$`ModeKeys.TRAIN`} |
+|@{tf.estimator.Estimator.evaluate$`evaluate()`}  |@{tf.estimator.ModeKeys.EVAL$`ModeKeys.EVAL`}      |
+|@{tf.estimator.Estimator.predict$`predict()`}|@{tf.estimator.ModeKeys.PREDICT$`ModeKeys.PREDICT`} |
+
+For example, suppose you instantiate a custom Estimator to generate an object
+named `classifier`. Then, you make the following call:
+
+``` python
+classifier = tf.estimator.Estimator(...)
+classifier.train(input_fn=lambda: my_input_fn(FILE_TRAIN, True, 500))
+```
+The Estimator framework then calls your model function with mode set to
+`ModeKeys.TRAIN`.
+
+Your model function must provide code to handle all three of the mode values.
+For each mode value, your code must return an instance of
+`tf.estimator.EstimatorSpec`, which contains the information the caller
+requires. Let's examine each mode.
+
+### Predict
+
+When the Estimator's `predict` method is called, the `model_fn` receives
+`mode = ModeKeys.PREDICT`. In this case, the model function must return a
+`tf.estimator.EstimatorSpec` containing the prediction.
+
+The model must have been trained prior to making a prediction. The trained model
+is stored on disk in the `model_dir` directory established when you
+instantiated the Estimator.
+
+The code to generate the prediction for this model looks as follows:
+
+```python
+# Compute predictions.
+predicted_classes = tf.argmax(logits, 1)
+if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class_ids': predicted_classes[:, tf.newaxis],
+        'probabilities': tf.nn.softmax(logits),
+        'logits': logits,
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+```
+The prediction dictionary contains everything that your model returns when run
+in prediction mode.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="Additional outputs added to the output layer."
+  src="../images/custom_estimators/full_network.png">
+</div>
+
+The `predictions` holds the following three key/value pairs:
+
+*   `class_ids` holds the class id (0, 1, or 2) representing the model's
+    prediction of the most likely species for this example.
+*   `probabilities` holds the three probabilities (in this example, 0.02, 0.95,
+    and 0.03)
+*   `logit` holds the raw logit values (in this example, -1.3, 2.6, and -0.9)
+
+We return that dictionary to the caller via the `predictions` parameter of the
+@{tf.estimator.EstimatorSpec}. The Estimator's
+@{tf.estimator.Estimator.predict$`predict`} method will yield these
+dictionaries.
+
+### Calculate the loss
+
+For both [training](#train) and [evaluation](#evaluate) we need to calculate the
+model's loss. This is the
+[objective](https://developers.google.com/machine-learning/glossary/#objective)
+that will be optimized.
+
+We can calculate the loss by calling @{tf.losses.sparse_softmax_cross_entropy}.
+The value returned by this function will be lowest, approximately 0,
+probability of the correct class (at index `label`) is near 1.0. The loss value
+returned is progressively larger as the probability of the correct class
+decreases.
+
+This function returns the average over the whole batch.
+
+```python
+    # Compute loss.
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+```
+
+### Evaluate
+
+When the Estimator's `evaluate` method is called, the `model_fn` receives
+`mode = ModeKeys.EVAL`. In this case, the model function must return a
+`tf.estimator.EstimatorSpec` containing the model's loss and optionally one
+or more metrics.
+
+Although returning metrics is optional, most custom Estimators do return at
+least one metric. TensorFlow provides a Metrics module @{tf.metrics} to
+calculate common metrics.  For brevity's sake, we'll only return accuracy. The
+@{tf.metrics.accuracy} function compares our predictions against the
+true values, that is, against the labels provided by the input function. The
+@{tf.metrics.accuracy} function requires the labels and predictions to have the
+same shape. Here's the call to @{tf.metrics.accuracy}:
+
+``` python
+    # Compute evaluation metrics.
+    accuracy = tf.metrics.accuracy(labels=labels,
+                                   predictions=predicted_classes,
+                                   name='acc_op')
+```
+
+The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for evaluation
+typically contains the following information:
+
+* `loss`, which is the model's loss
+* `eval_metric_ops`, which is an optional dictionary of metrics.
+
+So, we'll create a dictionary containing our sole metric. If we had calculated
+other metrics, we would have added them as additional key/value pairs to that
+same dictionary.  Then, we'll pass that dictionary in the `eval_metric_ops`
+argument of `tf.estimator.EstimatorSpec`. Here's the code:
+
+```python
+    metrics = {'accuracy': accuracy}
+    tf.summary.scalar('accuracy', accuracy[1])
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+        return tf.estimator.EstimatorSpec(
+            mode, loss=loss, eval_metric_ops=metrics)
+```
+
+The @{tf.summary.scalar} will make accuracy available to TensorBoard (more on
+this later).
+
+### Train
+
+When the Estimator's `train` method is called, the `model_fn` is called
+with `mode = ModeKeys.TRAIN`. In this case, the model function must return an
+`EstimatorSpec` that contains the loss and a training operation.
+
+Building the training operation will require an optimizer. We will use
+@{tf.train.AdagradOptimizer} because we're mimicking the `DNNClassifier`, which
+also uses `Adagrad` by default. The `tf.train` package provides many other
+optimizers—feel free to experiment with them.
+
+Here is the code that builds the optimizer:
+
+``` python
+  # Instantiate an optimizer.
+  optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
+```
+
+Next, we train the model using the optimizer's
+@{tf.train.Optimizer.minimize$`minimize`} method on the loss we calculated
+earlier.
+
+The `minimize` method also takes a `global_step` parameter. TensorFlow uses this
+parameter to count the number of training steps that have been processed
+(to know when to end a training run). Furthermore, the `global_step` is
+essential for TensorBoard graphs to work correctly. Simply call
+@{tf.train.get_global_step} and pass the result to the `global_step`
+argument of `minimize`.
+
+Here's the code to train the model:
+
+``` python
+  # Train the model by establishing an objective, which is to
+  # minimize loss using that optimizer.
+  train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+```
+
+The @{tf.estimator.EstimatorSpec$`EstimatorSpec`} returned for training
+must have the following fields set:
+
+* `loss`, which contains the value of the loss function.
+* `train_op`, which executes a training step.
+
+Here's our code to call `EstimatorSpec`:
+
+```python
+    # Return training information.
+    return tf.estimator.EstimatorSpec(
+        mode=tf.estimator.ModeKeys.TRAIN,
+        loss=loss,
+        train_op=train_op)
+```
+
+The model function is now complete.
+
+## The custom Estimator
+
+Instantiate the custom Estimator through the Estimator base class as follows:
+
+```python
+    # Build 2 hidden layer DNN with 10, 10 units respectively.
+    classifier = tf.estimator.Estimator(
+        model_fn=my_model,
+        params={
+            'feature_columns': my_feature_columns,
+            # Two hidden layers of 10 nodes each.
+            'hidden_units': [10, 10],
+            # The model must choose between 3 classes.
+            'n_classes': 3,
+        })
+```
+Here the `params` dictionary serves the same purpose as the key-word
+arguments of `DNNClassifier`; that is, the `params` dictionary lets you
+configure your Estimator without modifying the code in the `model_fn`.
+
+The rest of the code to train, evaluate, and generate predictions using our
+Estimator is the same as for the pre-made `DNNClassifier`. For example, the
+following line will train the model:
+
+```python
+    # Train the Model.
+    classifier.train(
+        input_fn=lambda:train_input_fn(train_x, train_y, args.batch_size),
+        steps=args.train_steps)
+```
+
+## TensorBoard
+
+You can view training results for your custom Estimator in TensorBoard. To see
+this reporting, start TensorBoard from your command line as follows:
+
+```bsh
+# Replace PATH with the actual path passed as model_dir
+tensorboard --logdir=PATH
+```
+
+Then, open TensorBoard by browsing to: [http://localhost:6006](http://localhost:6006)
+
+All the pre-made Estimators automatically log a lot of information to
+TensorBoard. With custom Estimators, however, TensorBoard only provides one
+default log (a graph of the loss) plus the information you explicitly tell
+TensorBoard to log. For the custom Estimator you just created, TensorBoard
+generates the following:
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="height:260px"
+  alt="Accuracy, steps/second, and loss 'scalar' graphs from tensorboard"
+  src="../images/custom_estimators/tensorboard.png">
+</div>
+<div style="text-align: center">
+TensorBoard displays three graphs.
+</div>
+
+In brief, here's what the three graphs tell you:
+
+* global_step/sec: A performance indicator showing how many batches (gradient
+  updates) we processed per second as the model trains.
+
+* loss: The loss reported.
+
+* accuracy: The accuracy is recorded by the following two lines:
+
+  * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+  * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
+
+These tensorboard graphs are one of the main reasons it's important to pass a
+`global_step` to your optimizer's `minimize` method. The model can't record
+the x-coordinate for these graphs without it.
+
+Note the following in the `my_accuracy` and `loss` graphs:
+
+* The orange line represents training.
+* The blue dot represents evaluation.
+
+During training, summaries (the orange line) are recorded periodically as
+batches are processed, which is why it becomes a graph spanning x-axis range.
+
+By contrast, evaluation produces only a single point on the graph for each call
+to `evaluate`. This point contains the average over the entire evaluation call.
+This has no width on the graph as it is evaluated entirely from the model state
+at a particular training step (from a single checkpoint).
+
+As suggested in the following figure, you may see and also selectively
+disable/enable the reporting using the controls on the left side.
+
+<div style="width:100%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="margin:auto;display:block;"
+  alt="Check-boxes allowing the user to select which runs are shown."
+  src="../images/custom_estimators/select_run.jpg">
+</div>
+<div style="text-align: center">
+Enable or disable reporting.
+</div>
+
+
+## Summary
+
+Although pre-made Estimators can be an effective way to quickly create new
+models, you will often need the additional flexibility that custom Estimators
+provide. Fortunately, pre-made and custom Estimators follow the same
+programming model. The only practical difference is that you must write a model
+function for custom Estimators; everything else is the same.
+
+For more details, be sure to check out:
+
+* The
+[official TensorFlow implementation of MNIST](https://github.com/tensorflow/models/tree/master/official/mnist),
+which uses a custom estimator.
+
+* The TensorFlow
+[official models repository](https://github.com/tensorflow/models/tree/master/official),
+which contains more curated examples using custom estimators.
+
+* This [TensorBoard video](https://youtu.be/eBbEDRsCmv4), which introduces
+TensorBoard.
+
+
diff --git a/tensorflow/docs_src/get_started/feature_columns.md b/tensorflow/docs_src/get_started/feature_columns.md
new file mode 100644
index 0000000000000000000000000000000000000000..f9537927b7e8d779b2a5e61b1362cb15769b3c78
--- /dev/null
+++ b/tensorflow/docs_src/get_started/feature_columns.md
@@ -0,0 +1,570 @@
+# Feature Columns
+
+This document details feature columns. Think of **feature columns** as the
+intermediaries between raw data and Estimators. Feature columns are very rich,
+enabling you to transform a diverse range of raw data into formats that
+Estimators can use, allowing easy experimentation.
+
+In @{$get_started/estimator$Premade Estimators}, we used the premade Estimator,
+@{tf.estimator.DNNClassifier$`DNNClassifier`} to train a model to predict
+different types of Iris flowers from four input features. That example created
+only numerical feature columns (of type @{tf.feature_column.numeric_column}).
+Although numerical feature columns model the lengths of petals and sepals
+effectively, real world data sets contain all kinds of features, many of which
+are non-numerical.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/feature_cloud.jpg">
+</div>
+<div style="text-align: center">
+Some real-world features (such as, longitude) are numerical, but many are not.
+</div>
+
+## Input to a Deep Neural Network
+
+What kind of data can a deep neural network operate on? The answer
+is, of course, numbers (for example, `tf.float32`). After all, every neuron in
+a neural network performs multiplication and addition operations on weights and
+input data. Real-life input data, however, often contains non-numerical
+(categorical) data. For example, consider a `product_class` feature that can
+contain the following three non-numerical values:
+
+* `kitchenware`
+* `electronics`
+* `sports`
+
+ML models generally represent categorical values as simple vectors in which a
+1 represents the presence of a value and a 0 represents the absence of a value.
+For example, when `product_class` is set to `sports`, an ML model would usually
+represent `product_class` as  `[0, 0, 1]`, meaning:
+
+* `0`: `kitchenware` is absent
+* `0`: `electronics` is absent
+* `1`: `sports` is present
+
+So, although raw data can be numerical or categorical, an ML model represents
+all features as numbers.
+
+## Feature Columns
+
+As the following figure suggests, you specify the input to a model through the
+`feature_columns` argument of an Estimator (`DNNClassifier` for Iris).
+Feature Columns bridge input data (as returned by `input_fn`) with your model.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/inputs_to_model_bridge.jpg">
+</div>
+<div style="text-align: center">
+Feature columns bridge raw data with the data your model needs.
+</div>
+
+To create feature columns, call functions from the
+@{tf.feature_column} module. This document explains nine of the functions in
+that module. As the following figure shows, all nine functions return either a
+Categorical-Column or a Dense-Column object, except `bucketized_column`, which
+inherits from both classes:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/some_constructors.jpg">
+</div>
+<div style="text-align: center">
+Feature column methods fall into two main categories and one hybrid category.
+</div>
+
+Let's look at these functions in more detail.
+
+### Numeric column
+
+The Iris classifier calls the @{tf.feature_column.numeric_column} function for
+all input features:
+
+  * `SepalLength`
+  * `SepalWidth`
+  * `PetalLength`
+  * `PetalWidth`
+
+Although `tf.numeric_column` provides optional arguments, calling
+`tf.numeric_column` without any arguments, as follows, is a fine way to specify
+a numerical value with the default data type (`tf.float32`) as input to your
+model:
+
+```python
+# Defaults to a tf.float32 scalar.
+numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength")
+```
+
+To specify a non-default numerical data type, use the `dtype` argument. For
+example:
+
+``` python
+# Represent a tf.float64 scalar.
+numeric_feature_column = tf.feature_column.numeric_column(key="SepalLength",
+                                                          dtype=tf.float64)
+```
+
+By default, a numeric column creates a single value (scalar). Use the shape
+argument to specify another shape. For example:
+
+<!--TODO(markdaoust) link to full example-->
+```python
+# Represent a 10-element vector in which each cell contains a tf.float32.
+vector_feature_column = tf.feature_column.numeric_column(key="Bowling",
+                                                         shape=10)
+
+# Represent a 10x5 matrix in which each cell contains a tf.float32.
+matrix_feature_column = tf.feature_column.numeric_column(key="MyMatrix",
+                                                         shape=[10,5])
+```
+### Bucketized column
+
+Often, you don't want to feed a number directly into the model, but instead
+split its value into different categories based on numerical ranges.  To do so,
+create a @{tf.feature_column.bucketized_column$bucketized column}. For
+example, consider raw data that represents the year a house was built. Instead
+of representing that year as a scalar numeric column, we could split the year
+into the following four buckets:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/bucketized_column.jpg">
+</div>
+<div style="text-align: center">
+Dividing year data into four buckets.
+</div>
+
+The model will represent the buckets as follows:
+
+|Date Range |Represented as... |
+|:----------|:-----------------|
+|< 1960               | [1, 0, 0, 0] |
+|>= 1960 but < 1980   | [0, 1, 0, 0] |
+|>= 1980 but < 2000   | [0, 0, 1, 0] |
+|> 2000               | [0, 0, 0, 1] |
+
+Why would you want to split a number—a perfectly valid input to your
+model—into a categorical value? Well, notice that the categorization splits a
+single input number into a four-element vector. Therefore, the model now can
+learn _four individual weights_ rather than just one; four weights creates a
+richer model than one weight. More importantly, bucketizing enables the model
+to clearly distinguish between different year categories since only one of the
+elements is set (1) and the other three elements are cleared (0). When we just
+use a single number (a year) as input, the model can only learn a linear
+relationship. So, bucketing provides the model with additional flexibility that
+the model can use to learn.
+
+The following code demonstrates how to create a bucketized feature:
+
+<!--TODO(markdaoust) link to full example - housing price grid?-->
+```python
+# First, convert the raw input to a numeric column.
+numeric_feature_column = tf.feature_column.numeric_column("Year")
+
+# Then, bucketize the numeric column on the years 1960, 1980, and 2000.
+bucketized_feature_column = tf.feature_column.bucketized_column(
+    source_column = numeric_feature_column,
+    boundaries = [1960, 1980, 2000])
+```
+Note that specifying a _three_-element boundaries vector creates a
+_four_-element bucketized vector.
+
+
+### Categorical identity column
+
+**Categorical identity columns** can be seen as a special case of bucketized
+columns. In traditional bucketized columns, each bucket represents a range of
+values (for example, from 1960 to 1979). In a categorical identity column, each
+bucket represents a single, unique integer. For example, let's say you want to
+represent the integer range `[0, 4)`.  That is, you want to represent the
+integers 0, 1, 2, or 3. In this case, the categorical identity mapping looks
+like this:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_identity.jpg">
+</div>
+<div style="text-align: center">
+A categorical identity column mapping. Note that this is a one-hot
+encoding, not a binary numerical encoding.
+</div>
+
+As with bucketized columns, a model can learn a separate weight for each class
+in a categorical identity column. For example, instead of using a string to
+represent the `product_class`, let's represent each class with a unique integer
+value. That is:
+
+* `0="kitchenware"`
+* `1="electronics"`
+* `2="sport"`
+
+Call @{tf.feature_column.categorical_column_with_identity} to implement a
+categorical identity column. For example:
+
+``` python
+# Create categorical output for an integer feature named "my_feature_b",
+# The values of my_feature_b must be >= 0 and < num_buckets
+identity_feature_column = tf.feature_column.categorical_column_with_identity(
+    key='my_feature_b',
+    num_buckets=4) # Values [0, 4)
+
+# In order for the preceding call to work, the input_fn() must return
+# a dictionary containing 'my_feature_b' as a key. Furthermore, the values
+# assigned to 'my_feature_b' must belong to the set [0, 4).
+def input_fn():
+    ...
+    return ({ 'my_feature_a':[7, 9, 5, 2], 'my_feature_b':[3, 1, 2, 2] },
+            [Label_values])
+```
+
+### Categorical vocabulary column
+
+We cannot input strings directly to a model. Instead, we must first map strings
+to numeric or categorical values. Categorical vocabulary columns provide a good
+way to represent strings as a one-hot vector. For example:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_vocabulary.jpg">
+</div>
+<div style="text-align: center">
+Mapping string values to vocabulary columns.
+</div>
+
+As you can see, categorical vocabulary columns are kind of an enum version of
+categorical identity columns. TensorFlow provides two different functions to
+create categorical vocabulary columns:
+
+* @{tf.feature_column.categorical_column_with_vocabulary_list}
+* @{tf.feature_column.categorical_column_with_vocabulary_file}
+
+`categorical_column_with_vocabulary_list` maps each string to an integer based
+on an explicit vocabulary list. For example:
+
+```python
+# Given input "feature_name_from_input_fn" which is a string,
+# create a categorical feature by mapping the input to one of
+# the elements in the vocabulary list.
+vocabulary_feature_column =
+    tf.feature_column.categorical_column_with_vocabulary_list(
+        key="a feature returned by input_fn()",
+        vocabulary_list=["kitchenware", "electronics", "sports"])
+```
+
+The preceding function is pretty straightforward, but it has a significant
+drawback. Namely, there's way too much typing when the vocabulary list is long.
+For these cases, call
+`tf.feature_column.categorical_column_with_vocabulary_file` instead, which lets
+you place the vocabulary words in a separate file. For example:
+
+```python
+
+# Given input "feature_name_from_input_fn" which is a string,
+# create a categorical feature to our model by mapping the input to one of
+# the elements in the vocabulary file
+vocabulary_feature_column =
+    tf.feature_column.categorical_column_with_vocabulary_file(
+        key="a feature returned by input_fn()",
+        vocabulary_file="product_class.txt",
+        vocabulary_size=3)
+```
+
+`product_class.txt` should contain one line for each vocabulary element. In our
+case:
+
+```None
+kitchenware
+electronics
+sports
+```
+
+### Hashed Column
+
+So far, we've worked with a naively small number of categories. For example,
+our product_class example has only 3 categories. Often though, the number of
+categories can be so big that it's not possible to have individual categories
+for each vocabulary word or integer because that would consume too much memory.
+For these cases, we can instead turn the question around and ask, "How many
+categories am I willing to have for my input?"  In fact, the
+@{tf.feature_column.categorical_column_with_hash_bucket} function enables you
+to specify the number of categories. For this type of feature column the model
+calculates a hash value of the input, then puts it into one of
+the `hash_bucket_size` categories using the modulo operator, as in the following
+pseudocode:
+
+```python
+# pseudocode
+feature_id = hash(raw_feature) % hash_buckets_size
+```
+
+The code to create the `feature_column` might look something like this:
+
+``` python
+hashed_feature_column =
+    tf.feature_column.categorical_column_with_hash_bucket(
+        key = "some_feature",
+        hash_buckets_size = 100) # The number of categories
+```
+At this point, you might rightfully think: "This is crazy!" After all, we are
+forcing the different input values to a smaller set of categories. This means
+that two probably unrelated inputs will be mapped to the same
+category, and consequently mean the same thing to the neural network. The
+following figure illustrates this dilemma, showing that kitchenware and sports
+both get assigned to category (hash bucket) 12:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/hashed_column.jpg">
+</div>
+<div style="text-align: center">
+Representing data with hash buckets.
+</div>
+
+As with many counterintuitive phenomena in machine learning, it turns out that
+hashing often works well in practice. That's because hash categories provide
+the model with some separation. The model can use additional features to further
+separate kitchenware from sports.
+
+### Crossed column
+
+Combining features into a single feature, better known as
+[feature crosses](https://developers.google.com/machine-learning/glossary/#feature_cross),
+enables the model to learn separate weights for each combination of
+features.
+
+More concretely, suppose we want our model to calculate real estate prices in
+Atlanta, GA. Real-estate prices within this city vary greatly depending on
+location. Representing latitude and longitude as separate features isn't very
+useful in identifying real-estate location dependencies; however, crossing
+latitude and longitude into a single feature can pinpoint locations. Suppose we
+represent Atlanta as a grid of 100x100 rectangular sections, identifying each
+of the 10,000 sections by a feature cross of latitude and longitude. This
+feature cross enables the model to train on pricing conditions related to each
+individual section, which is a much stronger signal than latitude and longitude
+alone.
+
+The following figure shows our plan, with the latitude & longitude values for
+the corners of the city in red text:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/Atlanta.jpg">
+</div>
+<div style="text-align: center">
+Map of Atlanta. Imagine this map divided into 10,000 sections of
+equal size.
+</div>
+
+For the solution, we used a combination of the `bucketized_column` we looked at
+earlier, with the @{tf.feature_column.crossed_column} function.
+
+<!--TODO(markdaoust) link to full example-->
+
+``` python
+def make_dataset(latitude, longitude, labels):
+    assert latitude.shape == longitude.shape == labels.shape
+
+    features = {'latitude': latitude.flatten(),
+                'longitude': longitude.flatten()}
+    labels=labels.flatten()
+
+    return tf.data.Dataset.from_tensor_slices((features, labels))
+
+
+# Bucketize the latitude and longitude usig the `edges`
+latitude_bucket_fc = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column('latitude'),
+    list(atlanta.latitude.edges))
+
+longitude_bucket_fc = tf.feature_column.bucketized_column(
+    tf.feature_column.numeric_column('longitude'),
+    list(atlanta.longitude.edges))
+
+# Cross the bucketized columns, using 5000 hash bins.
+crossed_lat_lon_fc = tf.feature_column.crossed_column(
+    [latitude_bucket_fc, longitude_bucket_fc], 5000)
+
+fc = [
+    latitude_bucket_fc,
+    longitude_bucket_fc,
+    crossed_lat_lon_fc]
+
+# Build and train the Estimator.
+est = tf.estimator.LinearRegressor(fc, ...)
+```
+
+You may create a feature cross from either of the following:
+
+* Feature names; that is, names from the `dict` returned from `input_fn`.
+* Any categorical column, except `categorical_column_with_hash_bucket`
+  (since `crossed_column` hashes the input).
+
+When the feature columns `latitude_bucket_fc` and `longitude_bucket_fc` are
+crossed, TensorFlow will create `(latitude_fc, longitude_fc)` pairs for each
+example. This would produce a full grid of possibilities as follows:
+
+``` None
+ (0,0),  (0,1)...  (0,99)
+ (1,0),  (1,1)...  (1,99)
+   ...     ...       ...
+(99,0), (99,1)...(99, 99)
+```
+
+Except that a full grid would only be tractable for inputs with limited
+vocabularies. Instead of building this, potentially huge, table of inputs,
+the `crossed_column` only builds the number requested by the `hash_bucket_size`
+argument. The feature column assigns an example to a index by running a hash
+function on the tuple of inputs, followed by a modulo operation with
+`hash_bucket_size`.
+
+As discussed earlier, performing the
+hash and modulo function limits the number of categories, but can cause category
+collisions; that is, multiple (latitude, longitude) feature crosses will end
+up in the same hash bucket. In practice though, performing feature crosses
+still adds significant value to the learning capability of your models.
+
+Somewhat counterintuitively, when creating feature crosses, you typically still
+should include the original (uncrossed) features in your model (as in the
+preceding code snippet). The independent latitude and longitude features help the
+model distinguish between examples where a hash collision has occured in the
+crossed feature.
+
+## Indicator and embedding columns
+
+Indicator columns and embedding columns never work on features directly, but
+instead take categorical columns as input.
+
+When using an indicator column, we're telling TensorFlow to do exactly what
+we've seen in our categorical product_class example. That is, an
+**indicator column** treats each category as an element in a one-hot vector,
+where the matching category has value 1 and the rest have 0s:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/categorical_column_with_identity.jpg">
+</div>
+<div style="text-align: center">
+Representing data in indicator columns.
+</div>
+
+Here's how you create an indicator column by calling
+@{tf.feature_column.indicator_column}:
+
+``` python
+categorical_column = ... # Create any type of categorical column.
+
+# Represent the categorical column as an indicator column.
+indicator_column = tf.feature_column.indicator_column(categorical_column)
+```
+
+Now, suppose instead of having just three possible classes, we have a million.
+Or maybe a billion. For a number of reasons, as the number of categories grow
+large, it becomes infeasible to train a neural network using indicator columns.
+
+We can use an embedding column to overcome this limitation. Instead of
+representing the data as a one-hot vector of many dimensions, an
+**embedding column** represents that data as a lower-dimensional, ordinary
+vector in which each cell can contain any number, not just 0 or 1. By
+permitting a richer palette of numbers for every cell, an embedding column
+contains far fewer cells than an indicator column.
+
+Let's look at an example comparing indicator and embedding columns. Suppose our
+input examples consists of different words from a limited palette of only 81
+words. Further suppose that the data set provides provides the following input
+words in 4 separate examples:
+
+* `"dog"`
+* `"spoon"`
+* `"scissors"`
+* `"guitar"`
+
+In that case, the following figure illustrates the processing path for
+embedding columns or indicator columns.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/feature_columns/embedding_vs_indicator.jpg">
+</div>
+<div style="text-align: center">
+An embedding column stores categorical data in a lower-dimensional
+vector than an indicator column. (We just placed random numbers into the
+embedding vectors; training determines the actual numbers.)
+</div>
+
+When an example is processed, one of the `categorical_column_with...` functions
+maps the example string to a numerical categorical value. For example, a
+function maps "spoon" to `[32]`. (The 32 comes from our imagination—the actual
+values depend on the mapping function.) You may then represent these numerical
+categorical values in either of the following two ways:
+
+* As an indicator column. A function converts each numeric categorical value
+  into an 81-element vector (because our palette consists of 81 words), placing
+  a 1 in the index of the categorical value (0, 32, 79, 80) and a 0 in all the
+  other positions.
+
+* As an embedding column. A function uses the numerical categorical values
+  `(0, 32, 79, 80)` as indices to a lookup table. Each slot in that lookup table
+  contains a 3-element vector.
+
+How do the values in the embeddings vectors magically get assigned? Actually,
+the assignments happen during training. That is, the model learns the best way
+to map your input numeric categorical values to the embeddings vector value in
+order to solve your problem. Embedding columns increase your model's
+capabilities, since an embeddings vector learns new relationships between
+categories from the training data.
+
+Why is the embedding vector size 3 in our example? Well, the following "formula"
+provides a general rule of thumb about the number of embedding dimensions:
+
+```python
+embedding_dimensions =  number_of_categories**0.25
+```
+
+That is, the embedding vector dimension should be the 4th root of the number of
+categories. Since our vocabulary size in this example is 81, the recommended
+number of dimensions is 3:
+
+``` python
+3 =  81**0.25
+```
+Note that this is just a general guideline; you can set the number of embedding
+dimensions as you please.
+
+Call @{tf.feature_column.embedding_column} to create an `embedding_column` as
+suggested by the following snippet:
+
+``` python
+categorical_column = ... # Create any categorical column
+
+# Represent the categorical column as an embedding column.
+# This means creating a one-hot vector with one element for each category.
+embedding_column = tf.feature_column.embedding_column(
+    categorical_column=categorical_column,
+    dimension=dimension_of_embedding_vector)
+```
+
+@{$programmers_guide/embedding$Embeddings} is a significant topic within machine
+learning. This information was just to get you started using them as feature
+columns.
+
+## Passing feature columns to Estimators
+
+As the following list indicates, not all Estimators permit all types of
+`feature_columns` argument(s):
+
+* @{tf.estimator.LinearClassifier$`LinearClassifier`} and
+  @{tf.estimator.LinearRegressor$`LinearRegressor`}: Accept all types of
+  feature column.
+* @{tf.estimator.DNNClassifier$`DNNClassifier`} and
+  @{tf.estimator.DNNRegressor$`DNNRegressor`}: Only accept dense columns. Other
+  column types must be wrapped in either an `indicator_column` or
+  `embedding_column`.
+* @{tf.estimator.DNNLinearCombinedClassifier$`DNNLinearCombinedClassifier`} and
+  @{tf.estimator.DNNLinearCombinedRegressor$`DNNLinearCombinedRegressor`}:
+    * The `linear_feature_columns` argument accepts any feature column type.
+    * The `dnn_feature_columns` argument only accepts dense columns.
+
+## Other Sources
+
+For more examples on feature columns, view the following:
+
+* The @{$wide_and_deep$Wide & Deep Tutorial}
+* [Examples](https://github.com/tensorflow/models/tree/master/samples/cookbook/regression)
+  of DNNs and linear models that use feature columns.
+
+To learn more about embeddings, see the following:
+
+* [Deep Learning, NLP, and representations](http://colah.github.io/posts/2014-07-NLP-RNNs-Representations/)
+  (Chris Olah's blog)
+* The TensorFlow [Embedding Projector](http://projector.tensorflow.org)
diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md
index 8409962744c71eb226af8d859922729b35bf6ad3..231108215ac73bc9ab87a896b3441a7da5f2b507 100644
--- a/tensorflow/docs_src/get_started/get_started.md
+++ b/tensorflow/docs_src/get_started/get_started.md
@@ -272,7 +272,7 @@ train = optimizer.minimize(loss)
 ```
 
 ```python
-sess.run(init) # reset values to incorrect defaults.
+sess.run(init) # reset variables to incorrect defaults.
 for i in range(1000):
   sess.run(train, {x: [1, 2, 3, 4], y: [0, -1, -2, -3]})
 
@@ -317,7 +317,7 @@ y_train = [0, -1, -2, -3]
 # training loop
 init = tf.global_variables_initializer()
 sess = tf.Session()
-sess.run(init) # reset values to wrong
+sess.run(init) # initialize variables with incorrect defaults.
 for i in range(1000):
   sess.run(train, {x: x_train, y: y_train})
 
@@ -330,8 +330,8 @@ When run, it produces
 W: [-0.9999969] b: [ 0.99999082] loss: 5.69997e-11
 ```
 
-Notice that the loss is a very small number (very close to zero). If you run 
-this program, your loss may not be exactly the same as the aforementioned loss 
+Notice that the loss is a very small number (very close to zero). If you run
+this program, your loss may not be exactly the same as the aforementioned loss
 because the model is initialized with pseudorandom values.
 
 This more complicated program can still be visualized in TensorBoard
@@ -383,7 +383,7 @@ train_input_fn = tf.estimator.inputs.numpy_input_fn(
 eval_input_fn = tf.estimator.inputs.numpy_input_fn(
     {"x": x_eval}, y_eval, batch_size=4, num_epochs=1000, shuffle=False)
 
-# We can invoke 1000 training steps by invoking the  method and passing the
+# We can invoke 1000 training steps by invoking the method and passing the
 # training data set.
 estimator.train(input_fn=input_fn, steps=1000)
 
diff --git a/tensorflow/docs_src/get_started/input_fn.md b/tensorflow/docs_src/get_started/input_fn.md
index 9d3af5d96a94d3f55dc82e64459b558630e6e7f0..24bfdbdd2e91a6d87a5ab1ec2ba264d90ef8e148 100644
--- a/tensorflow/docs_src/get_started/input_fn.md
+++ b/tensorflow/docs_src/get_started/input_fn.md
@@ -191,7 +191,7 @@ import pandas as pd
 
 def get_input_fn_from_pandas(data_set, num_epochs=None, shuffle=True):
   return tf.estimator.inputs.pandas_input_fn(
-      x=pdDataFrame(...),
+      x=pd.DataFrame(...),
       y=pd.Series(...),
       num_epochs=num_epochs,
       shuffle=shuffle)
@@ -211,8 +211,8 @@ def get_input_fn_from_numpy(data_set, num_epochs=None, shuffle=True):
 ### A Neural Network Model for Boston House Values
 
 In the remainder of this tutorial, you'll write an input function for
-preprocessing a subset of Boston housing data pulled from the [UCI Housing Data
-Set](https://archive.ics.uci.edu/ml/datasets/Housing) and use it to feed data to
+preprocessing a subset of Boston housing data pulled from the UCI Housing Data
+Set and use it to feed data to
 a neural network regressor for predicting median house values.
 
 The [Boston CSV data sets](#setup) you'll use to train your neural network
@@ -267,8 +267,8 @@ tf.logging.set_verbosity(tf.logging.INFO)
 
 Define the column names for the data set in `COLUMNS`. To distinguish features
 from the label, also define `FEATURES` and `LABEL`. Then read the three CSVs
-(@{tf.train},
-@{tf.test}, and
+([train](http://download.tensorflow.org/data/boston_train.csv),
+[test](http://download.tensorflow.org/data/boston_test.csv), and
 [predict](http://download.tensorflow.org/data/boston_predict.csv)) into _pandas_
 `DataFrame`s:
 
@@ -292,7 +292,7 @@ prediction_set = pd.read_csv("boston_predict.csv", skipinitialspace=True,
 Next, create a list of `FeatureColumn`s for the input data, which formally
 specify the set of features to use for training. Because all features in the
 housing data set contain continuous values, you can create their
-`FeatureColumn`s using the `tf.contrib.layers.real_valued_column()` function:
+`FeatureColumn`s using the `tf.feature_column.numeric_column()` function:
 
 ```python
 feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]
diff --git a/tensorflow/docs_src/get_started/mnist/beginners.md b/tensorflow/docs_src/get_started/mnist/beginners.md
index 38c467ddc32c9ca21432cc7fe74a594446804293..c419ca87c363bc6c4507f70c25d1293e27612253 100644
--- a/tensorflow/docs_src/get_started/mnist/beginners.md
+++ b/tensorflow/docs_src/get_started/mnist/beginners.md
@@ -347,11 +347,10 @@ over all the examples in the batch.
 
 Note that in the source code, we don't use this formulation, because it is
 numerically unstable.  Instead, we apply
-`tf.nn.softmax_cross_entropy_with_logits` on the unnormalized logits (e.g., we
-call `softmax_cross_entropy_with_logits` on `tf.matmul(x, W) + b`), because this
-more numerically stable function internally computes the softmax activation.  In
-your code, consider using `tf.nn.softmax_cross_entropy_with_logits`
-instead.
+`tf.losses.sparse_softmax_cross_entropy` on the unnormalized logits (e.g., we
+call `sparse_softmax_cross_entropy` on the output of `tf.matmul(x, W) + b`),
+because this more numerically stable function internally computes the softmax
+activation.
 
 Now that we know what we want our model to do, it's very easy to have TensorFlow
 train it to do so.  Because TensorFlow knows the entire graph of your
diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md
index 27fae45b5b0b4126132556cfac312fbb3c4f515a..dac00498e12d180d88fae0dc405dcda013441eff 100644
--- a/tensorflow/docs_src/get_started/mnist/mechanics.md
+++ b/tensorflow/docs_src/get_started/mnist/mechanics.md
@@ -47,7 +47,7 @@ training folder and then unpack that data to return a dictionary of `DataSet`
 instances.
 
 ```python
-data_sets = input_data.read_data_sets(FLAGS.train_dir, FLAGS.fake_data)
+data_sets = input_data.read_data_sets(FLAGS.input_data_dir, FLAGS.fake_data)
 ```
 
 **NOTE**: The `fake_data` flag is used for unit-testing purposes and may be
@@ -167,20 +167,15 @@ Finally, the `logits` tensor that will contain the output is returned.
 The `loss()` function further builds the graph by adding the required loss
 ops.
 
-First, the values from the `labels_placeholder` are converted to 64-bit integers. Then, a @{tf.nn.sparse_softmax_cross_entropy_with_logits} op is added to automatically produce 1-hot labels from the `labels_placeholder` and compare the output logits from the `inference()` function with those 1-hot labels.
+First, the values from the `labels_placeholder` are converted to 64-bit
+integers. Then, a @{tf.losses.sparse_softmax_cross_entropy} op is used to
+calculate the batch's average cross entropy, of the `inference()` result,
+compared to the labels.
 
 ```python
 labels = tf.to_int64(labels)
-cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-    labels=labels, logits=logits, name='xentropy')
-```
-
-It then uses @{tf.reduce_mean}
-to average the cross entropy values across the batch dimension (the first
-dimension) as the total loss.
-
-```python
-loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
+cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+    labels=labels, logits=logits)
 ```
 
 And the tensor that will then contain the loss value is returned.
@@ -369,7 +364,7 @@ may be instantiated to write the events files, which
 contain both the graph itself and the values of the summaries.
 
 ```python
-summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)
+summary_writer = tf.summary.FileWriter(FLAGS.log_dir, sess.graph)
 ```
 
 Lastly, the events file will be updated with new summary values every time the
@@ -403,7 +398,7 @@ method will periodically be called to write a checkpoint file to the training
 directory with the current values of all the trainable variables.
 
 ```python
-saver.save(sess, FLAGS.train_dir, global_step=step)
+saver.save(sess, checkpoint_file, global_step=step)
 ```
 
 At some later point in the future, training might be resumed by using the
@@ -411,7 +406,7 @@ At some later point in the future, training might be resumed by using the
 method to reload the model parameters.
 
 ```python
-saver.restore(sess, FLAGS.train_dir)
+saver.restore(sess, checkpoint_file)
 ```
 
 ## Evaluate the Model
diff --git a/tensorflow/docs_src/get_started/mnist/pros.md b/tensorflow/docs_src/get_started/mnist/pros.md
index 4933dd28cd37e695a10ab28832f26a613589d01a..c52e960bb34f53643bb2f8973595245e40932128 100644
--- a/tensorflow/docs_src/get_started/mnist/pros.md
+++ b/tensorflow/docs_src/get_started/mnist/pros.md
@@ -49,7 +49,7 @@ these two lines of code which will download and read in the data automatically:
 
 ```python
 from tensorflow.examples.tutorials.mnist import input_data
-mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
+mnist = input_data.read_data_sets('MNIST_data')
 ```
 
 Here `mnist` is a lightweight class which stores the training, validation, and
@@ -172,8 +172,7 @@ between the target and the softmax activation function applied to the model's
 prediction.  As in the beginners tutorial, we use the stable formulation:
 
 ```python
-cross_entropy = tf.reduce_mean(
-    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y))
 ```
 
 Note that `tf.nn.softmax_cross_entropy_with_logits` internally applies the
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
new file mode 100644
index 0000000000000000000000000000000000000000..ff839fd040167dc16087311666ff25da2088c519
--- /dev/null
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -0,0 +1,425 @@
+
+# Getting Started with TensorFlow
+
+This document introduces the TensorFlow programming environment and shows you
+how to write the Iris classification problem in TensorFlow.
+
+Prior to reading this document, do the following:
+
+* [Install TensorFlow](install/index.md).
+* If you installed TensorFlow with virtualenv or Anaconda, activate your
+  TensorFlow environment.
+* To keep the data import simple, our Iris example uses Pandas. You can
+  install Pandas with:
+
+      `pip install pandas`
+
+## Getting the sample code
+
+Take the following steps to get the sample code for this program:
+
+1. Clone the TensorFlow Models repository from github by entering the following
+   command:
+
+       `git clone https://github.com/tensorflow/models`
+
+1. Change directory within that branch to the location containing the examples
+   used in this document:
+
+       `cd models/samples/core/get_started/`
+
+The program described in this document is called `premade_estimator.py`.
+
+### Running the program
+
+You run TensorFlow programs as you would run any Python program. For example:
+
+``` bsh
+python premade_estimator.py
+```
+
+The program should output training logs and some predictions against a test
+set. For example, the first line in the following output shows that the model
+thinks there is a 99.6% chance that the first example in the test set is a
+Sentosa. Since the test set `expected "Setosa"`, this appears to be a good
+prediction.
+
+``` None
+...
+Prediction is "Sentosa" (99.6%), expected "Setosa"
+
+Prediction is "Versicolor" (99.8%), expected "Versicolor"
+
+Prediction is "Virginica" (97.9%), expected "Virginica"
+```
+
+If the program generates errors instead of answers, ask yourself the following
+questions:
+
+* Did you install TensorFlow properly?
+* Are you using the correct version of tensorflow?
+* Did you activate the environment you installed TensorFlow in? (This is
+  only relevant in certain installation environments.)
+
+## The programming stack
+
+Before getting into the details of the program itself, let's investigate the
+programming environment. As the following illustration shows, TensorFlow
+provides a programming stack consisting of multiple API layers:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/tensorflow_programming_environment.png">
+</div>
+<div style="text-align: center">
+The TensorFlow Programming Environment
+</div>
+
+We strongly recommend writing TensorFlow programs with the following APIs:
+
+* Estimators, which represent a complete model. The Estimator API provides
+  methods to train the model, to judge the model's accuracy, and to generate
+  predictions.
+* Datasets, which build a data input pipeline. The Dataset API has methods to
+  load and manipulate data, and feed it into your model. The Datasets API meshes
+  well with the Estimators API.
+
+## Classifying irises: an overview
+
+The sample program in this document builds and tests a model that
+classifies Iris flowers into three different species based on the size of their
+[sepals](https://en.wikipedia.org/wiki/Sepal) and
+[petals](https://en.wikipedia.org/wiki/Petal).
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="Petal geometry compared for three iris species: Iris setosa, Iris virginica, and Iris versicolor"
+  src="../images/iris_three_species.jpg">
+</div>
+**From left to right,
+[*Iris setosa*](https://commons.wikimedia.org/w/index.php?curid=170298) (by
+[Radomil](https://commons.wikimedia.org/wiki/User:Radomil), CC BY-SA 3.0),
+[*Iris versicolor*](https://commons.wikimedia.org/w/index.php?curid=248095) (by
+[Dlanglois](https://commons.wikimedia.org/wiki/User:Dlanglois), CC BY-SA 3.0),
+and [*Iris virginica*](https://www.flickr.com/photos/33397993@N05/3352169862)
+(by [Frank Mayfield](https://www.flickr.com/photos/33397993@N05), CC BY-SA
+2.0).**
+
+### The data set
+
+The Iris data set contains four features and one label.  The four features
+identify the following botanical characteristics of individual Iris flowers:
+
+* sepal length
+* sepal width
+* petal length
+* petal width
+
+Our model will represent these features as float32 numerical data.
+
+The label identifies the Iris species, which must be one of the following:
+
+* Iris setosa (0)
+* Iris versicolor (1)
+* Iris virginica (2)
+
+Our model will represent the label as `int32` categorical data.
+
+The following table shows three examples in the data set:
+
+|sepal length | sepal width | petal length | petal width| species (label) |
+|------------:|------------:|-------------:|-----------:|:---------------:|
+|         5.1 |         3.3 |          1.7 |        0.5 |   0 (Sentosa)   |
+|         5.0 |         2.3 |          3.3 |        1.0 |   1 (versicolor)|
+|         6.4 |         2.8 |          5.6 |        2.2 |   2 (virginica) |
+
+### The algorithm
+
+The program trains a Deep Neural Network classifier model having the following
+topology:
+
+* 2 hidden layers.
+* Each hidden layer contains 10 nodes.
+
+The following figure illustrates the features, hidden layers, and predictions
+(not all of the nodes in the hidden layers are shown):
+
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="A diagram of the network architecture: Inputs, 2 hidden layers, and outputs"
+  src="../images/iris_model.png">
+</div>
+<div style="text-align: center">
+The Model.
+</div>
+
+### Inference
+
+Running the trained model on an unlabeled example yields three predictions,
+namely, the likelihood that this flower is the given Iris species. The sum of
+those output predictions will be 1.0. For example, the prediction on an
+unlabeled example might be something like the following:
+
+* 0.03 for Iris Setosa
+* 0.95 for Iris Versicolor
+* 0.02 for Iris Virginica
+
+The preceding prediction indicates a 95% probability that the given unlabeled
+example is an Iris Versicolor.
+
+## Overview of programming with Estimators
+
+An Estimator is TensorFlow's high level representation of a complete model. It
+handles the details of initialization, logging, saving and restoring, and many
+other features so you can concentrate on your model. For more details see
+@{$programmers_guide/estimators}.
+
+An "Estimator" is any class derived from @{tf.estimator.Estimator}. TensorFlow
+provides a collection of
+[pre-made Estimators](https://developers.google.com/machine-learning/glossary/#pre-made_Estimator)
+(for example, `LinearRegressor`) to implement common ML algorithms. Beyond
+those, you may write your own
+[custom Estimators](https://developers.google.com/machine-learning/glossary/#custom_Estimator).
+We recommend using pre-made Estimators when just getting started with
+TensorFlow. After gaining expertise with the pre-made Estimators, we recommend
+optimizing your model by creating your own custom Estimators.
+
+To write a TensorFlow program based on pre-made Estimators, you must perform the
+following tasks:
+
+* Create one or more input functions.
+* Define the model's feature columns.
+* Instantiate an Estimator, specifying the feature columns and various
+  hyperparameters.
+* Call one or more methods on the Estimator object, passing the appropriate
+  input function as the source of the data.
+
+Let's see how those tasks are implemented in Iris.
+
+## Create input functions
+
+You must create input functions to supply data for training,
+evaluating, and prediction.
+
+An **input function** is a function that returns the following two-element
+tuple:
+
+* "features" - A Python dictionary in which:
+    * Each key is the name of a feature.
+    * Each value is an array containing all of that feature's values.
+* "label" - An array containing the values of the label for every example.
+
+Just to demonstrate the format of the input function here's a simple
+implementation:
+
+```python
+def input_evaluation_set():
+    features = {'SepalLength': np.array([6.4, 5.0]),
+                'SepalWidth':  np.array([2.8, 2.3]),
+                'PetalLength': np.array([5.6, 3.3]),
+                'PetalWidth':  np.array([2.2, 1.0])}
+    labels = np.array([2, 1])
+    return features, labels
+```
+
+Your input function may generate the "features" dictionary and "label" list any
+way you like. However, we recommend using TensorFlow's Dataset API, which can
+deftly parse all sorts of data. At a high-level, the Datasets API consists of
+the following classes:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%"
+  alt="A diagram showing subclasses of the Dataset class"
+  src="../images/dataset_classes.png">
+</div>
+
+
+Where:
+
+* Dataset: Base class containing methods to create and transform datasets. Also
+  allows you to initialize a dataset from data in memory, or from a Python
+  generator.
+* TextLineDataset: Reads lines from text files.
+* TFRecordDataset: Reads records from TFRecord files.
+* FixedLengthRecordDataset: Reads fixed size records from binary files.
+* Iterator: Provides a way to access one data set element at a time.
+
+The Dataset API can handle a lot of common cases for you. For example,
+using the Dataset API, you can easily read in records from a large collection
+of files in parallel and join them into a single stream.
+
+To keep things simple in this example we are going to load the data with pandas, and build our input pipeline from this in-memory data.
+
+Here is the input function used for training in this program:
+
+``` python
+def train_input_fn(features, labels, batch_size):
+    """An input function for training"""
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
+
+    # Build the Iterator, and return the read end of the pipeline.
+    return dataset.make_one_shot_iterator().get_next()
+```
+
+## Define the Feature Columns
+
+A [**Feature Column**](https://developers.google.com/machine-learning/glossary/#feature_columns)
+is an object describing how the model should use raw input features from the
+features dictionary. When you build an Estimator model, you pass it a list of
+feature columns that describes each of the features you want the model to use.
+
+These objects are created by functions in the @{tf.feature_column} module. `tf.feature_column` methods provide many different ways to represent data.
+
+For Iris, the 4 raw features are numeric values, so we'll build a list of
+feature columns, to tell the Estimator model to represent each of the four
+features as 32-bit floating-point values. Therefore, the code to create the
+Feature Column is simply:
+
+```python
+# Feature columns describe how to use the input.
+my_feature_columns = []
+for key in train_x.keys():
+    my_feature_columns.append(tf.feature_column.numeric_column(key=key))
+```
+
+Feature Columns can be far more sophisticated than those we're showing here.
+<!--TODO(markdaoust) add link to feature_columns doc when it exists.-->
+
+Now that we have the description of how we want the model to represent the raw
+features, we can build the estimator.
+
+
+## Instantiate an Estimator
+
+The Iris problem is a classic classifier problem. Fortunately, TensorFlow
+provides several pre-made classifier Estimators, including:
+
+* @{tf.estimator.DNNClassifier}—for deep models that perform multi-class
+  classification.
+* @{tf.estimator.DNNLinearCombinedClassifier}—for wide-n-deep models.
+* @{tf.estimator.LinearClassifier}—for linear models that feed results into
+  binary classifiers.
+
+For the Iris problem, `tf.estimator.DNNClassifier` seems like the best choice.
+Here's how we instantiated this Estimator:
+
+```python
+# Build 2 hidden layer DNN with 10, 10 units respectively.
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    # Two hidden layers of 10 nodes each.
+    hidden_units=[10, 10],
+    # The model must choose between 3 classes.
+    n_classes=3)
+```
+
+## Train, Evaluate, and Predict
+
+Now that we have an Estimator object, we can call methods to do the following:
+
+* Train the model.
+* Evaluate the trained model.
+* Use the trained model to make predictions.
+
+### Train the model
+
+Train the model by calling the Estimator's `train` method as follows:
+
+```python
+# Train the Model.
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, args.batch_size),
+    steps=args.train_steps)
+```
+
+Here we wrap up our `input_fn` call in a [`lambda`](https://docs.python.org/3/tutorial/controlflow.html)
+to allow the Estimator to call it, at the correct time, with no arguments.
+The `steps` argument tells the method to stop training after a number of
+training steps.
+
+### Evaluate the trained model
+
+Now that the model has been trained, we can get some statistics on its
+performance. The following code block evaluates the accuracy of the trained
+model on the test data:
+
+```python
+# Evaluate the model.
+eval_result = classifier.evaluate(
+    input_fn=lambda:eval_input_fn(test_x, test_y, args.batch_size))
+
+print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
+```
+
+Note how unlike our call to the `train` method, we did not pass the `steps`
+argument to evaluate. Our `eval_input_fn` doesn't use the `repeat` method on
+the dataset, so evaluation just runs to the end of the data.
+
+Running this code yields the following output (or something similar):
+
+```none
+Test set accuracy: 0.967
+```
+
+### Making predictions (inferring) from the trained model
+
+We now have a trained model that produces good evaluation results.
+We can now use the trained model to predict the species of an Iris flower
+based on some unlabeled measurments. As with training and evaluation, we make
+predictions using a single function call:
+
+```python
+# Generate predictions from the model
+expected = ['Setosa', 'Versicolor', 'Virginica']
+predict_x = {
+    'SepalLength': [5.1, 5.9, 6.9],
+    'SepalWidth': [3.3, 3.0, 3.1],
+    'PetalLength': [1.7, 4.2, 5.4],
+    'PetalWidth': [0.5, 1.5, 2.1],
+}
+
+predictions = classifier.predict(
+    input_fn=lambda:eval_input_fn(predict_x, batch_size=args.batch_size))
+```
+
+The `predict` method returns a Python iterable, yielding a dictionary of
+prediction results for each example. The following code prints a few
+predictions and their probabilities:
+
+
+``` python
+for pred_dict, expec in zip(predictions, expected):
+    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
+
+    class_id = pred_dict['class_ids'][0]
+    probability = pred_dict['probabilities'][class_id]
+    print(template.format(SPECIES[class_id], 100 * probability, expec))
+```
+
+Running the preceding code yields the following output:
+
+``` None
+...
+Prediction is "Sentosa" (99.6%), expected "Setosa"
+
+Prediction is "Versicolor" (99.8%), expected "Versicolor"
+
+Prediction is "Virginica" (97.9%), expected "Virginica"
+```
+
+## Next
+
+Now that you've gotten started writing TensorFlow programs.
+
+* For more on Datasets, see the
+  @{$programmers_guide/datasets$Programmer's guide} and
+  @{tf.data$reference documentation}.
+* For more on Estimators, see the
+  @{$programmers_guide/estimators$Programmer's guide} and
+  @{tf.estimator$reference documentation}.
+<!--TODO(markdaoust) add links to next get_started section when it exists.-->
+
diff --git a/tensorflow/docs_src/get_started/saving_models.md b/tensorflow/docs_src/get_started/saving_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..056263c1575aa596ede6ab7dc545ce6610e23554
--- /dev/null
+++ b/tensorflow/docs_src/get_started/saving_models.md
@@ -0,0 +1,237 @@
+# Checkpoints
+
+This document examines how to save and restore TensorFlow models built with
+Estimators. TensorFlow provides two model formats:
+
+*   checkpoints, which is a format dependent on the code that created
+    the model.
+*   SavedModel, which is a format independent of the code that created
+    the model.
+
+This document focuses on checkpoints. For details on SavedModel, see the
+@{$saved_model$Saving and Restoring} chapter of the
+*TensorFlow Programmer's Guide*.
+
+
+## Sample code
+
+This document relies on the same Iris classification example detailed in
+<!-- TODO (barryr): fill in link when module settles down. --> 
+@{$premade_estimators$Getting Started with TensorFlow}.
+To download and access the example, invoke the following two commands:
+
+```shell
+git clone https://github.com/tensorflow/models/
+cd models/samples/core/get_started
+```
+
+Most of the code snippets in this document are minor variations
+on `premade_estimator.py`.
+
+
+## Saving partially-trained models
+
+Estimators automatically write the following to disk:
+
+*   **checkpoints**, which are versions of the model created during training.
+*   **event files**, which contain information that
+    [TensorBoard](https://developers.google.com/machine-learning/glossary/#TensorBoard)
+    uses to create visualizations.
+
+To specify the top-level directory in which the Estimator stores its
+information, assign a value to the optional `model_dir` argument of any
+Estimator's constructor.  For example, the following code sets the `model_dir`
+argument to the `models/iris` directory:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris')
+```
+
+Suppose you call the Estimator's `train` method. For example:
+
+
+```python
+classifier.train(
+        input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+                steps=200)
+```
+
+As suggested by the following diagrams, the first call to `train`
+adds checkpoints and other files to the `model_dir` directory:
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/first_train_calls.png">
+</div>
+<div style="text-align: center">
+The first call to train().
+</div>
+
+
+To see the objects in the created `model_dir` directory on a
+UNIX-based system, just call `ls` as follows:
+
+```none
+$ ls -1 models/iris
+checkpoint
+events.out.tfevents.timestamp.hostname
+graph.pbtxt
+model.ckpt-1.data-00000-of-00001
+model.ckpt-1.index
+model.ckpt-1.meta
+model.ckpt-200.data-00000-of-00001
+model.ckpt-200.index
+model.ckpt-200.meta
+```
+
+The preceding `ls` command shows that the Estimator created checkpoints
+at steps 1 (the start of training) and 200 (the end of training).
+
+
+### Default checkpoint directory
+
+If you don't specify `model_dir` in an Estimator's constructor, the Estimator
+writes checkpoint files to a temporary directory chosen by Python's
+[tempfile.mkdtemp](https://docs.python.org/3/library/tempfile.html#tempfile.mkdtemp)
+function. For example, the following Estimator constructor does *not* specify
+the `model_dir` argument:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3)
+
+print(classifier.model_dir)
+```
+
+The `tempfile.mkdtemp` function picks a secure, temporary directory
+appropriate for your operating system. For example, a typical temporary
+directory on macOS might be something like the following:
+
+```None
+/var/folders/0s/5q9kfzfj3gx2knj0vj8p68yc00dhcr/T/tmpYm1Rwa
+```
+
+### Checkpointing Frequency
+
+By default, the Estimator saves
+[checkpoints](https://developers.google.com/machine-learning/glossary/#checkpoint)
+in the `model_dir` according to the following schedule:
+
+*   Writes a checkpoint every 10 minutes (600 seconds).
+*   Writes a checkpoint when the `train` method starts (first iteration)
+    and completes (final iteration).
+*   Retains only the 5 most recent checkpoints in the directory.
+
+You may alter the default schedule by taking the following steps:
+
+1.  Create a @{tf.estimator.RunConfig$`RunConfig`} object that defines the
+    desired schedule.
+2.  When instantiating the Estimator, pass that `RunConfig` object to the
+    Estimator's `config` argument.
+
+For example, the following code changes the checkpointing schedule to every
+20 minutes and retains the 10 most recent checkpoints:
+
+```python
+my_checkpointing_config = tf.estimator.RunConfig(
+    save_checkpoints_secs = 20*60,  # Save checkpoints every 20 minutes.
+    keep_checkpoint_max = 10,       # Retain the 10 most recent checkpoints.
+)
+
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris',
+    config=my_checkpointing_config)
+```
+
+## Restoring your model
+
+The first time you call an Estimator's `train` method, TensorFlow saves a
+checkpoint to the `model_dir`. Each subsequent call to the Estimator's
+`train`, `eval`, or `predict` method causes the following:
+
+1.  The Estimator builds the model's
+    [graph](https://developers.google.com/machine-learning/glossary/#graph)
+    by running the `model_fn()`.  (For details on the `model_fn()`, see
+    @{$custom_estimators$Creating Custom Estimators.})
+2.  The Estimator initializes the weights of the new model from the data
+    stored in the most recent checkpoint.
+
+In other words, as the following illustration suggests, once checkpoints
+exist, TensorFlow rebuilds the model each time you call `train()`,
+`evaluate()`, or `predict()`.
+
+<div style="width:80%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="../images/subsequent_calls.png">
+</div>
+<div style="text-align: center">
+Subsequent calls to train(), evaluate(), or predict()
+</div>
+
+
+### Avoiding a bad restoration
+
+Restoring a model's state from a checkpoint only works if the model
+and checkpoint are compatible.  For example, suppose you trained a
+`DNNClassifier` Estimator containing two hidden layers,
+each having 10 nodes:
+
+```python
+classifier = tf.estimator.DNNClassifier(
+    feature_columns=feature_columns,
+    hidden_units=[10, 10],
+    n_classes=3,
+    model_dir='models/iris')
+
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+        steps=200)
+```
+
+After training (and, therefore, after creating checkpoints in `models/iris`),
+imagine that you changed the number of neurons in each hidden layer from 10 to
+20 and then attempted to retrain the model:
+
+``` python
+classifier2 = tf.estimator.DNNClassifier(
+    feature_columns=my_feature_columns,
+    hidden_units=[20, 20],  # Change the number of neurons in the model.
+    n_classes=3,
+    model_dir='models/iris')
+
+classifier.train(
+    input_fn=lambda:train_input_fn(train_x, train_y, batch_size=100),
+        steps=200)
+```
+
+Since the state in the checkpoint is incompatible with the model described
+in `classifier2`, retraining fails with the following error:
+
+```None
+...
+InvalidArgumentError (see above for traceback): tensor_name =
+dnn/hiddenlayer_1/bias/t_0/Adagrad; shape in shape_and_slice spec [10]
+does not match the shape stored in checkpoint: [20]
+```
+
+To run experiments in which you train and compare slightly different
+versions of a model, save a copy of the code that created each
+`model-dir`, possibly by creating a separate git branch for each version.
+This separation will keep your checkpoints recoverable.
+
+## Summary
+
+Checkpoints provide an easy automatic mechanism for storing and restoring
+models created by Estimators.  See the @{$saved_model$Saving and Restoring}
+chapter of the *TensorFlow Programmer's Guide* for details on:
+
+*   Saving and restoring models created by low-level TensorFlow APIs.
+*   Saving and restoring models in the SavedModel format, which is a
+    language-neutral, recoverable, serialization format.
diff --git a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
index ce5db079ba3a502ffdec96191b03a8b951ac3db6..32f387ae8e0da0ef3d6f6cad62001a7e9f99961b 100644
--- a/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
+++ b/tensorflow/docs_src/get_started/summaries_and_tensorboard.md
@@ -137,12 +137,10 @@ with tf.name_scope('cross_entropy'):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the
-  # raw outputs of the nn_layer above, and then average across
-  # the batch.
-  diff = tf.nn.softmax_cross_entropy_with_logits(targets=y_, logits=y)
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the
+  # raw logit outputs of the nn_layer above.
   with tf.name_scope('total'):
-    cross_entropy = tf.reduce_mean(diff)
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
 tf.summary.scalar('cross_entropy', cross_entropy)
 
 with tf.name_scope('train'):
diff --git a/tensorflow/docs_src/install/index.md b/tensorflow/docs_src/install/index.md
index eddbfe9e31e6a3618908ab1966e548873dc6d267..c4fc882ddd43eed8fd1c8562f6ac89a7dd68535d 100644
--- a/tensorflow/docs_src/install/index.md
+++ b/tensorflow/docs_src/install/index.md
@@ -2,9 +2,11 @@
 
 We've built and tested TensorFlow on the following 64-bit laptop/desktop
 operating systems:
+
   * MacOS X 10.11 (El Capitan) or later.
   * Ubuntu 14.04 or later
   * Windows 7 or later.
+
 Although you might be able to install TensorFlow on other laptop or desktop
 systems, we only support (and only fix issues in) the preceding configurations.
 
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 3a153e81145676aab7e9f95f9d1c78fa7531a2cc..df622c6ac57907122e4d236e3623d947dc35ac58 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index df43255896eb5084431be8336a5778d17607fd3f..8b3da49a0d4bca1b2bc2293520e0b946a7727c88 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index f7f2c3cdc71787a9ce93e323a29b07e6e6a7779d..6eb81582491899c9c278c41fb39ae21d7fc3f4a9 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.4.0-rc1</version>
+  <version>1.4.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.4.0-rc1</version>
+                 <version>1.4.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -124,7 +124,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -143,7 +143,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -151,10 +151,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0.zip).
   3. Extract this .zip file.
 
 
@@ -202,7 +202,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.4.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.4.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -216,11 +216,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.4.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.4.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.4.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.4.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 414ab7b1f7def3d43b717c628979b291fd9244f0..28b04bab9561a050aee2acb4bb8b472a86c12b95 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -51,15 +51,15 @@ must be installed on your system:
     <pre>
     $ <b>sudo apt-get install cuda-command-line-tools</b>
     </pre>
-    
+
     and add its path to your `LD_LIBRARY_PATH` environment variable:
 
-    <pre> 
-    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b> 
+    <pre>
+    $ <b>export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64</b>
     </pre>
 
     For CUDA Toolkit <= 7.5 do:
-    
+
     <pre>
     $ <b>sudo apt-get install libcupti-dev</b>
     </pre>
@@ -188,7 +188,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -293,7 +293,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -480,7 +480,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -648,14 +648,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,14 +667,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -686,14 +686,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -705,14 +705,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.4.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.4.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 9a95710bfa78ab546bc9e7ff1c8bd33ccd8b23c8..3afd0aec0f372d885b5ce7b2587998432f7e8af7 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -79,22 +79,23 @@ Take the following steps to install TensorFlow with Virtualenv:
   4. Activate the Virtualenv environment by issuing one of the
      following commands:
 
-     <pre>$ <b>source ~/tensorflow/bin/activate</b>      # If using bash, sh, ksh, or zsh
-    $ <b>source ~/tensorflow/bin/activate.csh</b>  # If using csh or tcsh </pre>
+     <pre>$ <b>cd <i>targetDirectory</i></b>
+    $ <b>source ./bin/activate</b>      # If using bash, sh, ksh, or zsh
+    $ <b>source ./bin/activate.csh</b>  # If using csh or tcsh </pre>
 
      The preceding `source` command should change your prompt to the following:
 
-     <pre> (tensorflow)$ </pre>
+     <pre> (<i>targetDirectory</i>)$ </pre>
 
   5. Ensure pip ≥8.1 is installed:
 
-     <pre> (tensorflow)$ <b>easy_install -U pip</b></pre>
+     <pre> (<i>targetDirectory</i>)$ <b>easy_install -U pip</b></pre>
 
   6. Issue one of the following commands to install TensorFlow and all the
      packages that TensorFlow requires into the active Virtualenv environment:
 
-     <pre> (tensorflow)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
-     (tensorflow)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
+     <pre> (<i>targetDirectory</i>)$ <b>pip install --upgrade tensorflow</b>      # for Python 2.7
+     (<i>targetDirectory</i>)$ <b>pip3 install --upgrade tensorflow</b>     # for Python 3.n
 
   7. Optional. If Step 6 failed (typically because you invoked a pip version
      lower than 8.1), install TensorFlow in the active
@@ -114,7 +115,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -128,16 +129,18 @@ to confirm that the installation worked properly.
 
 Note that you must activate the Virtualenv environment each time you
 use TensorFlow in a new shell.  If the Virtualenv environment is not
-currently active (that is, the prompt is not `(tensorflow)`, invoke
+currently active (that is, the prompt is not `(<i>targetDirectory</i>)`, invoke
 one of the following commands:
 
-<pre>$ <b>source ~/tensorflow/bin/activate</b>      # bash, sh, ksh, or zsh
-$ <b>source ~/tensorflow/bin/activate.csh</b>  # csh or tcsh </pre>
+<pre>$ <b>cd <i>targetDirectory</i></b>
+$ <b>source ./bin/activate</b>      # If using bash, sh, ksh, or zsh
+$ <b>source ./bin/activate.csh</b>  # If using csh or tcsh </pre>
+
 
 Your prompt will transform to the following to indicate that your
 tensorflow environment is active:
 
-<pre> (tensorflow)$ </pre>
+<pre> (<i>targetDirectory</i>)$ </pre>
 
 When the Virtualenv environment is active, you may run
 TensorFlow programs from this shell.
@@ -145,7 +148,7 @@ TensorFlow programs from this shell.
 When you are done using TensorFlow, you may deactivate the
 environment by issuing the following command:
 
-<pre> (tensorflow)$ <b>deactivate</b> </pre>
+<pre> (<i>targetDirectory</i>)$ <b>deactivate</b> </pre>
 
 The prompt will revert back to your default prompt (as defined by `PS1`).
 
@@ -235,7 +238,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -331,20 +334,20 @@ Take the following steps to install TensorFlow in an Anaconda environment:
   3. Activate the conda environment by issuing the following command:
 
      <pre>$ <b>source activate tensorflow</b>
-     (tensorflow)$  # Your prompt should change</pre>
+     (<i>targetDirectory</i>)$  # Your prompt should change</pre>
 
   4. Issue a command of the following format to install
      TensorFlow inside your conda environment:
 
-     <pre>(tensorflow)<b>$ pip install --ignore-installed --upgrade</b> <i>TF_PYTHON_URL</i></pre>
+     <pre>(<i>targetDirectory</i>)<b>$ pip install --ignore-installed --upgrade</b> <i>TF_PYTHON_URL</i></pre>
 
      where <i>TF_PYTHON_URL</i> is the
      [URL of the TensorFlow Python package](#the_url_of_the_tensorflow_python_package).
      For example, the following command installs the CPU-only version of
      TensorFlow for Python 2.7:
 
-     <pre> (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl</b></pre>
+     <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -517,7 +520,7 @@ This section documents the relevant values for Mac OS installations.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py2-none-any.whl
 </pre>
 
 
@@ -525,7 +528,7 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.4.0-py3-none-any.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 6d0dcdcd4ae7f884b4afbd4803aebeb672a955d1..e187b0e51c21d8ef219cef84e2fc7195c5675c67 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -138,12 +138,12 @@ The following NVIDIA <i>software</i> must be installed on your system:
     `LD_LIBRARY_PATH` environment variable as described in the
     NVIDIA documentation.
   * The NVIDIA drivers associated with NVIDIA's Cuda Toolkit.
-  * cuDNN (>= v3). We recommend version 5.1. For details, see
+  * cuDNN (>= v3). We recommend version 6.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn),
     particularly the description of appending the appropriate pathname
     to your `LD_LIBRARY_PATH` environment variable.
 
-Finally, you must also install `libcupti` which for Cuda Toolkit >= 8.0 you do via 
+Finally, you must also install `libcupti` which for Cuda Toolkit >= 8.0 you do via
 
 <pre> $ <b>sudo apt-get install cuda-command-line-tools</b> </pre>
 
@@ -355,10 +355,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.4.0rc1 on Linux:
+for TensorFlow 1.4.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.4.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -441,14 +441,25 @@ Stack Overflow and specify the `tensorflow` tag.
   <td>Invoking `python` or `ipython` generates the following error:
   <pre>ImportError: cannot import name pywrap_tensorflow</pre></td>
 </tr>
+
+<tr>
+  <td><a href="https://stackoverflow.com/questions/45276830">45276830</a></td>
+  <td><pre>external/local_config_cc/BUILD:50:5: in apple_cc_toolchain rule
+  @local_config_cc//:cc-compiler-darwin_x86_64: Xcode version must be specified
+  to use an Apple CROSSTOOL.</pre>
+  </td>
+</tr>
+
 </table>
 
 ## Tested source configurations
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.4.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.4.0rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.5.4</td><td>6</td><td>8</td></tr>
+ <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.5</td><td>5.1</td><td>8</td></tr>
 <tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr>
@@ -460,7 +471,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.4.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
+ <tr><td>tensorflow-1.3.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.5</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.1.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.1.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.4.2</td><td>5.1</td><td>8</td></tr>
@@ -471,8 +483,10 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.4.0rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.4.0rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.4.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.4.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
+<tr><td>tensorflow-1.3.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.3.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>6</td><td>8</td></tr>
 <tr><td>tensorflow-1.2.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.2.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>5.1</td><td>8</td></tr>
 <tr><td>tensorflow-1.1.0</td><td>CPU</td><td>3.5</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/install/install_windows.md b/tensorflow/docs_src/install/install_windows.md
index 4098ee5b2e51521c9c77dadc9dbf0eb6f6c78235..8d0eb7966fdf17be1c259627a64803f0a392943a 100644
--- a/tensorflow/docs_src/install/install_windows.md
+++ b/tensorflow/docs_src/install/install_windows.md
@@ -36,7 +36,7 @@ installed on your system:
     Ensure that you append the relevant Cuda pathnames to the `%PATH%`
     environment variable as described in the NVIDIA documentation.
   * The NVIDIA drivers associated with CUDA Toolkit 8.0.
-  * cuDNN v6.1. For details, see
+  * cuDNN v6.0. For details, see
     [NVIDIA's documentation](https://developer.nvidia.com/cudnn).
     Note that cuDNN is typically installed in a different location from the
     other CUDA DLLs. Ensure that you add the directory where you installed
@@ -84,7 +84,7 @@ install it now:
   * [Python 3.5.x 64-bit from python.org](https://www.python.org/downloads/release/python-352/)
   * [Python 3.6.x 64-bit from python.org](https://www.python.org/downloads/release/python-362/)
 
--TensorFlow supports Python 3.5.x and 3.6.x on Windows.
+TensorFlow supports Python 3.5.x and 3.6.x on Windows.
 Note that Python 3 comes with the pip3 package manager, which is the
 program you'll use to install TensorFlow.
 
@@ -98,7 +98,6 @@ To install the GPU version of TensorFlow, enter the following command:
 
 <pre>C:\> <b>pip3 install --upgrade tensorflow-gpu</b></pre>
 
-
 ## Installing with Anaconda
 
 **The Anaconda installation is community supported, not officially supported.**
@@ -219,6 +218,11 @@ ImportError: cannot import name 'descriptor'</pre>
   </td>
 </tr>
 
+<tr>
+  <td><a href="https://stackoverflow.com/q/38896424">38896424</a></td>
+  <td>
+  <pre>Could not find a version that satisfies the requirement tensorflow</pre>
+  </td>
+</tr>
 
 </table>
-
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 030cd0d051103e0d4bf903663d6fb7300c884b18..b5a1d5d7d1bf3b456ab24165e273969bdbd7bfca 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -66,7 +66,7 @@ them.
 
 ## Adding TensorFlow to your apps using Android Studio
 
-To add TensorFlow to your own apps on Android, the simplest way is to add the 
+To add TensorFlow to your own apps on Android, the simplest way is to add the
 following lines to your Gradle build file:
 
     allprojects {
@@ -74,7 +74,7 @@ following lines to your Gradle build file:
             jcenter()
         }
 	}
-											
+
     dependencies {
         compile 'org.tensorflow:tensorflow-android:+'
     }
diff --git a/tensorflow/docs_src/mobile/index.md b/tensorflow/docs_src/mobile/index.md
index a6f1422f6f170fee1a24fa12f62fc03d60632666..419ae7094a180fb166eb5b00cc382773b95b91f4 100644
--- a/tensorflow/docs_src/mobile/index.md
+++ b/tensorflow/docs_src/mobile/index.md
@@ -1,238 +1,36 @@
-# Building Mobile Apps with TensorFlow
-
-TensorFlow was designed from the ground up to be a good deep learning solution
-for mobile platforms like Android and iOS. This guide is to help you understand
-how to integrate TensorFlow into your mobile apps effectively and efficiently.
-
-## About this Guide
-
-This guide is aimed at developers who have a TensorFlow model that’s
-successfully working in a desktop environment, and who want to integrate it into
-a mobile application. Here are the main challenges you’ll face during that
-process:
-
-- Understanding how to use Tensorflow for mobile.
-- Building TensorFlow for your platform.
-- Integrating the TensorFlow library into your application.
-- Preparing your model file for mobile deployment.
-- Optimizing for latency, RAM usage, model file size, and binary size.
-
-## Why run TensorFlow on mobile?
-
-Traditionally, deep learning has been associated with data centers and giant
-clusters of high-powered GPU machines. However, it can be very expensive and
-time-consuming to send all of the data a device has access to across a network
-connection. Running on mobile makes it possible to deliver very interactive
-applications in a way that’s not possible when you have to wait for a network
-round trip.
-
-Here are some common use cases for on-device deep learning:
-
-### Speech Recognition
-
-There are a lot of interesting applications that can be built with a
-speech-driven interface, and many of these require on-device processing. Most of
-the time a user isn’t giving commands, and so streaming audio continuously to a
-remote server would be a waste of bandwidth, since it would mostly be silence or
-background noises. To solve this problem it’s common to have a small neural
-network running on-device @{$tutorials/audio_recognition$listening out for a
-particular keyword}. Once that keyword has been spotted, the rest of the
-conversation can be transmitted over to the server for further processing if
-more computing power is needed.
-
-### Image Recognition
-
-It can be very useful for a mobile app to be able to make sense of a camera
-image. If your users are taking photos, recognizing what’s in them can help your
-camera apps apply appropriate filters, or label the photos so they’re easily
-findable. It’s important for embedded applications too, since you can use image
-sensors to detect all sorts of interesting conditions, whether it’s spotting
-endangered animals in the wild
-or
-[reporting how late your train is running](https://svds.com/tensorflow-image-recognition-raspberry-pi/).
-
-TensorFlow comes with several examples of recognizing the types of objects
-inside images along with a variety of different pre-trained models, and they can
-all be run on mobile devices. You can try out
-our
-[Tensorflow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html#0) and
-[Tensorflow for Poets 2: Optimize for Mobile](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/index.html#0) codelabs to
-see how to take a pretrained model and run some very fast and lightweight
-training to teach it to recognize specific objects, and then optimize it to
-run on mobile.
-
-### Object Localization
-
-Sometimes it’s important to know where objects are in an image as well as what
-they are. There are lots of augmented reality use cases that could benefit a
-mobile app, such as guiding users to the right component when offering them
-help fixing their wireless network or providing informative overlays on top of
-landscape features. Embedded applications often need to count objects that are
-passing by them, whether it’s pests in a field of crops, or people, cars and
-bikes going past a street lamp.
-
-TensorFlow offers a pretrained model for drawing bounding boxes around people
-detected in images, together with tracking code to follow them over time. The
-tracking is especially important for applications where you’re trying to count
-how many objects are present over time, since it gives you a good idea when a
-new object enters or leaves the scene. We have some sample code for this
-available for Android [on
-Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android),
-and also a [more general object detection
-model](https://github.com/tensorflow/models/tree/master/object_detection/README.md)
-available as well.
-
-### Gesture Recognition
-
-It can be useful to be able to control applications with hand or other
-gestures, either recognized from images or through analyzing accelerometer
-sensor data. Creating those models is beyond the scope of this guide, but
-TensorFlow is an effective way of deploying them.
-
-### Optical Character Recognition
-
-Google Translate’s live camera view is a great example of how effective
-interactive on-device detection of text can be.
-
-<div class="video-wrapper">
-  <iframe class="devsite-embedded-youtube-video" data-video-id="06olHmcJjS0"
-            data-autohide="1" data-showinfo="0" frameborder="0" allowfullscreen>
-  </iframe>
-</div>
-
-There are multiple steps involved in recognizing text in images. You first have
-to identify the areas where the text is present, which is a variation on the
-object localization problem, and can be solved with similar techniques. Once you
-have an area of text, you then need to interpret it as letters, and then use a
-language model to help guess what words they represent. The simplest way to
-estimate what letters are present is to segment the line of text into individual
-letters, and then apply a simple neural network to the bounding box of each. You
-can get good results with the kind of models used for MNIST, which you can find
-in TensorFlow’s tutorials, though you may want a higher-resolution input.  A
-more advanced alternative is to use an LSTM model to process a whole line of
-text at once, with the model itself handling the segmentation into different
-characters.
-
-### Translation
-
-Translating from one language to another quickly and accurately, even if you
-don’t have a network connection, is an important use case. Deep networks are
-very effective at this sort of task, and you can find descriptions of a lot of
-different models in the literature. Often these are sequence-to-sequence
-recurrent models where you’re able to run a single graph to do the whole
-translation, without needing to run separate parsing stages.
-
-### Text Classification
-
-If you want to suggest relevant prompts to users based on what they’re typing or
-reading, it can be very useful to understand the meaning of the text. This is
-where text classification comes in. Text classification is an umbrella term
-that covers everything from sentiment analysis to topic discovery. You’re likely
-to have your own categories or labels that you want to apply, so the best place
-to start is with an example
-like
-[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/skip_thoughts/),
-and then train on your own examples.
-
-### Voice Synthesis
-
-A synthesized voice can be a great way of giving users feedback or aiding
-accessibility, and recent advances such as
-[WaveNet](https://deepmind.com/blog/wavenet-generative-model-raw-audio/) show
-that deep learning can offer very natural-sounding speech.
-
-## How does it fit with the cloud?
-
-These examples of use cases give an idea of how on-device networks can
-complement cloud services. Cloud has a great deal of computing power in a
-controlled environment, but running on devices can offer higher interactivity.
-In situations where the cloud is unavailable, or your cloud capacity is limited,
-you can provide an offline experience, or reduce cloud workload by processing
-easy cases on device.
-
-Doing on-device computation can also signal when it's time to switch to working
-on the cloud. A good example of this is hotword detection in speech. Since
-devices are able to constantly listen out for the keywords, this then triggers a
-lot of traffic to cloud-based speech recognition once one is recognised. Without
-the on-device component, the whole application wouldn’t be feasible, and this
-pattern exists across several other applications as well. Recognizing that some
-sensor input is interesting enough for further processing makes a lot of
-interesting products possible.
-
-## What hardware and software should you have?
-
-TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
-supported operating systems and instructions to install TensorFlow, see
-@{$install$Installing Tensorflow}.
-
-Some of the scripts in this guide require you to compile TensorFlow from source,
-so you’ll need more than just `pip install` to work through all the sample code.
-
-To try out the mobile examples, you’ll need a device set up for development,
-using
-either [Android Studio](https://developer.android.com/studio/install.html),
-or [XCode](https://developer.apple.com/xcode/) if you're developing for iOS.
-
-## What should you do before you get started?
-
-Before thinking about how to get your solution on mobile:
-
-1. Determine whether your problem is solvable by mobile machine learning
-2. Create a labelled dataset to define your problem
-3. Pick an effective model for the problem
-
-We'll discuss these in more detail below.
-
-### Is your problem solvable by mobile machine learning?
-
-Once you have an idea of the problem you want to solve, you need to make a plan
-of how to build your solution. The most important first step is making sure that
-your problem is actually solvable, and the best way to do that is to mock it up
-using humans in the loop.
-
-For example, if you want to drive a robot toy car using voice commands, try
-recording some audio from the device and listen back to it to see if you can
-make sense of what’s being said. Often you’ll find there are problems in the
-capture process, such as the motor drowning out speech or not being able to hear
-at a distance, and you should tackle these problems before investing in the
-modeling process.
-
-Another example would be giving photos taken from your app to people see if they
-can classify what’s in them, in the way you’re looking for. If they can’t do
-that (for example, trying to estimate calories in food from photos may be
-impossible because all white soups look the same), then you’ll need to redesign
-your experience to cope with that. A good rule of thumb is that if a human can’t
-handle the task then it will be difficult to train a computer to do better.
-
-### Create a labelled dataset
-
-After you’ve solved any fundamental issues with your use case, you need to
-create a labeled dataset to define what problem you’re trying to solve. This
-step is extremely important, moreso than picking which model to use. You want it
-to be as representative as possible of your actual use case, since the model
-will only be effective at the task you teach it. It’s also worth investing in
-tools to make labeling the data as efficient and accurate as possible. For
-example, if you’re able to switch from having to click a button on a web
-interface to simple keyboard shortcuts, you may be able to speed up the
-generation process a lot. You should also start by doing the initial labeling
-yourself, so you can learn about the difficulties and likely errors, and
-possibly change your labeling or data capture process to avoid them. Once you
-and your team are able to consistently label examples (that is once you
-generally agree on the same labels for most examples), you can then try and
-capture your knowledge in a manual and teach external raters how to run the same
-process.
-
-### Pick an effective model
-
-The next step is to pick an effective model to use. You might be able to avoid
-training a model from scratch if someone else has already implemented a model
-similar to what you need; we have a repository of models implemented in
-TensorFlow [on Github](https://github.com/tensorflow/models) that you can look
-through. Lean towards the simplest model you can find, and try to get started as
-soon as you have even a small amount of labelled data, since you’ll get the best
-results when you’re able to iterate quickly. The shorter the time it takes to
-try training a model and running it in s real application, the better overall
-results you’ll see. It’s common for an algorithm to get great training accuracy
-numbers but then fail to be useful within a real application because there’s a
-mismatch between the dataset and real usage. Prototype end-to-end usage as soon
-as possible to create a consistent user experience.
+# Overview
+
+TensorFlow was designed to be a good deep learning solution for mobile
+platforms. Currently we have two solutions for deploying machine learning
+applications on mobile and embedded devices:
+@{$mobile/mobile_intro$TensorFlow for Mobile} and @{$mobile/tflite$TensorFlow Lite}.
+
+## TensorFlow Lite versus TensorFlow Mobile
+
+Here are a few of the differences between the two:
+
+- TensorFlow Lite is an evolution of TensorFlow Mobile.  In most cases, apps
+  developed with TensorFlow Lite will have a smaller binary size, fewer
+  dependencies, and better performance.
+
+- TensorFlow Lite is in developer preview, so not all use cases are covered yet.
+  We expect you to use TensorFlow Mobile to cover production cases.
+
+- TensorFlow Lite supports only a limited set of operators, so not all models
+  will work on it by default. TensorFlow for Mobile has a fuller set of
+  supported functionality.
+
+TensorFlow Lite provides better performance and a small binary size on mobile
+platforms as well as the ability to leverage hardware acceleration if available
+on their platforms. In addition, it has many fewer dependencies so it can be
+built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
+also allows targeting accelerators through the [Neural Networks
+API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+
+TensorFlow Lite currently has coverage for a limited set of operators. While
+TensorFlow for Mobile supports only a constrained set of ops by default, in
+principle if you use an arbitrary operator in TensorFlow, it can be customized
+to build that kernel. Thus use cases which are not currently supported by
+TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
+evolves, it will gain additional operators, and the decision will be easier to
+make.
diff --git a/tensorflow/docs_src/mobile/ios_build.md b/tensorflow/docs_src/mobile/ios_build.md
index 2e6d3bf90e739aa3dce2a8dfb2568383b68b0282..4c84a1214a26eeb90c1b6a186a369212377b06cd 100644
--- a/tensorflow/docs_src/mobile/ios_build.md
+++ b/tensorflow/docs_src/mobile/ios_build.md
@@ -24,7 +24,7 @@ If you'd like to add TensorFlow capabilities to your own app, do the following:
 
 - Open `YourProjectName.xcworkspace` and add your code.
 
-- In your app's **Build Settings**, make sure to add `$(inherited)` to the 
+- In your app's **Build Settings**, make sure to add `$(inherited)` to the
   **Other Linker Flags**, and **Header Search Paths** sections.
 
 ## Running the Samples
@@ -98,7 +98,7 @@ There are three demo applications for iOS, all defined in Xcode projects inside
 
 ## Building the TensorFlow iOS libraries from source
 
-While Cocapods is the quickest and easiest way of getting started, you sometimes
+While Cocoapods is the quickest and easiest way of getting started, you sometimes
 need more flexibility to determine which parts of TensorFlow your app should be
 shipped with. For such cases, you can build the iOS libraries from the
 sources. [This
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
index 347c07d2330fb0da7e5c9f287ddba16524e4ec34..4d2c3b62341717d90d6e4afabd105d7fd7a7866d 100644
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ b/tensorflow/docs_src/mobile/leftnav_files
@@ -1,8 +1,11 @@
-### TensorFlow for Mobile
 index.md
+### TensorFlow Lite
+tflite/index.md
+>>>
+### TensorFlow Mobile
+mobile_intro.md
 android_build.md
 ios_build.md
-#raspi_build.md  until this section gets rewritten, or TFLite takes over
 linking_libs.md
 prepare_models.md
 optimizing.md
diff --git a/tensorflow/docs_src/mobile/mobile_intro.md b/tensorflow/docs_src/mobile/mobile_intro.md
new file mode 100644
index 0000000000000000000000000000000000000000..17dbf1c3e6ad89768529864ba884274a51b3dfb2
--- /dev/null
+++ b/tensorflow/docs_src/mobile/mobile_intro.md
@@ -0,0 +1,247 @@
+# Introduction to TensorFlow Mobile
+
+TensorFlow was designed from the ground up to be a good deep learning solution
+for mobile platforms like Android and iOS. This mobile guide should help you
+understand how machine learning can work on mobile platforms and how to
+integrate TensorFlow into your mobile apps effectively and efficiently.
+
+## About this Guide
+
+This guide is aimed at developers who have a TensorFlow model that’s
+successfully working in a desktop environment, who want to integrate it into
+a mobile application, and cannot use TensorFlow Lite. Here are the
+main challenges you’ll face during that process:
+
+- Understanding how to use Tensorflow for mobile.
+- Building TensorFlow for your platform.
+- Integrating the TensorFlow library into your application.
+- Preparing your model file for mobile deployment.
+- Optimizing for latency, RAM usage, model file size, and binary size.
+
+## Common use cases for mobile machine learning
+
+**Why run TensorFlow on mobile?**
+
+Traditionally, deep learning has been associated with data centers and giant
+clusters of high-powered GPU machines. However, it can be very expensive and
+time-consuming to send all of the data a device has access to across a network
+connection. Running on mobile makes it possible to deliver very interactive
+applications in a way that’s not possible when you have to wait for a network
+round trip.
+
+Here are some common use cases for on-device deep learning:
+
+### Speech Recognition
+
+There are a lot of interesting applications that can be built with a
+speech-driven interface, and many of these require on-device processing. Most of
+the time a user isn’t giving commands, and so streaming audio continuously to a
+remote server would be a waste of bandwidth, since it would mostly be silence or
+background noises. To solve this problem it’s common to have a small neural
+network running on-device @{$tutorials/audio_recognition$listening out for a particular keyword}.
+Once that keyword has been spotted, the rest of the
+conversation can be transmitted over to the server for further processing if
+more computing power is needed.
+
+### Image Recognition
+
+It can be very useful for a mobile app to be able to make sense of a camera
+image. If your users are taking photos, recognizing what’s in them can help your
+camera apps apply appropriate filters, or label the photos so they’re easily
+findable. It’s important for embedded applications too, since you can use image
+sensors to detect all sorts of interesting conditions, whether it’s spotting
+endangered animals in the wild
+or
+[reporting how late your train is running](https://svds.com/tensorflow-image-recognition-raspberry-pi/).
+
+TensorFlow comes with several examples of recognizing the types of objects
+inside images along with a variety of different pre-trained models, and they can
+all be run on mobile devices. You can try out
+our
+[Tensorflow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/index.html#0) and
+[Tensorflow for Poets 2: Optimize for Mobile](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/index.html#0) codelabs to
+see how to take a pretrained model and run some very fast and lightweight
+training to teach it to recognize specific objects, and then optimize it to
+run on mobile.
+
+### Object Localization
+
+Sometimes it’s important to know where objects are in an image as well as what
+they are. There are lots of augmented reality use cases that could benefit a
+mobile app, such as guiding users to the right component when offering them
+help fixing their wireless network or providing informative overlays on top of
+landscape features. Embedded applications often need to count objects that are
+passing by them, whether it’s pests in a field of crops, or people, cars and
+bikes going past a street lamp.
+
+TensorFlow offers a pretrained model for drawing bounding boxes around people
+detected in images, together with tracking code to follow them over time. The
+tracking is especially important for applications where you’re trying to count
+how many objects are present over time, since it gives you a good idea when a
+new object enters or leaves the scene. We have some sample code for this
+available for Android [on
+Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android),
+and also a [more general object detection
+model](https://github.com/tensorflow/models/tree/master/research/object_detection/README.md)
+available as well.
+
+### Gesture Recognition
+
+It can be useful to be able to control applications with hand or other
+gestures, either recognized from images or through analyzing accelerometer
+sensor data. Creating those models is beyond the scope of this guide, but
+TensorFlow is an effective way of deploying them.
+
+### Optical Character Recognition
+
+Google Translate’s live camera view is a great example of how effective
+interactive on-device detection of text can be.
+
+<div class="video-wrapper">
+  <iframe class="devsite-embedded-youtube-video" data-video-id="06olHmcJjS0"
+            data-autohide="1" data-showinfo="0" frameborder="0" allowfullscreen>
+  </iframe>
+</div>
+
+There are multiple steps involved in recognizing text in images. You first have
+to identify the areas where the text is present, which is a variation on the
+object localization problem, and can be solved with similar techniques. Once you
+have an area of text, you then need to interpret it as letters, and then use a
+language model to help guess what words they represent. The simplest way to
+estimate what letters are present is to segment the line of text into individual
+letters, and then apply a simple neural network to the bounding box of each. You
+can get good results with the kind of models used for MNIST, which you can find
+in TensorFlow’s tutorials, though you may want a higher-resolution input.  A
+more advanced alternative is to use an LSTM model to process a whole line of
+text at once, with the model itself handling the segmentation into different
+characters.
+
+### Translation
+
+Translating from one language to another quickly and accurately, even if you
+don’t have a network connection, is an important use case. Deep networks are
+very effective at this sort of task, and you can find descriptions of a lot of
+different models in the literature. Often these are sequence-to-sequence
+recurrent models where you’re able to run a single graph to do the whole
+translation, without needing to run separate parsing stages.
+
+### Text Classification
+
+If you want to suggest relevant prompts to users based on what they’re typing or
+reading, it can be very useful to understand the meaning of the text. This is
+where text classification comes in. Text classification is an umbrella term
+that covers everything from sentiment analysis to topic discovery. You’re likely
+to have your own categories or labels that you want to apply, so the best place
+to start is with an example
+like
+[Skip-Thoughts](https://github.com/tensorflow/models/tree/master/research/skip_thoughts/),
+and then train on your own examples.
+
+### Voice Synthesis
+
+A synthesized voice can be a great way of giving users feedback or aiding
+accessibility, and recent advances such as
+[WaveNet](https://deepmind.com/blog/wavenet-generative-model-raw-audio/) show
+that deep learning can offer very natural-sounding speech.
+
+## Mobile machine learning and the cloud
+
+These examples of use cases give an idea of how on-device networks can
+complement cloud services. Cloud has a great deal of computing power in a
+controlled environment, but running on devices can offer higher interactivity.
+In situations where the cloud is unavailable, or your cloud capacity is limited,
+you can provide an offline experience, or reduce cloud workload by processing
+easy cases on device.
+
+Doing on-device computation can also signal when it's time to switch to working
+on the cloud. A good example of this is hotword detection in speech. Since
+devices are able to constantly listen out for the keywords, this then triggers a
+lot of traffic to cloud-based speech recognition once one is recognized. Without
+the on-device component, the whole application wouldn’t be feasible, and this
+pattern exists across several other applications as well. Recognizing that some
+sensor input is interesting enough for further processing makes a lot of
+interesting products possible.
+
+## What hardware and software should you have?
+
+TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
+supported operating systems and instructions to install TensorFlow, see
+@{$install$Installing Tensorflow}.
+
+Note that some of the sample code we provide for mobile TensorFlow requires you
+to compile TensorFlow from source, so you’ll need more than just `pip install`
+to work through all the sample code.
+
+To try out the mobile examples, you’ll need a device set up for development,
+using
+either [Android Studio](https://developer.android.com/studio/install.html),
+or [XCode](https://developer.apple.com/xcode/) if you're developing for iOS.
+
+## What should you do before you get started?
+
+Before thinking about how to get your solution on mobile:
+
+1. Determine whether your problem is solvable by mobile machine learning
+2. Create a labelled dataset to define your problem
+3. Pick an effective model for the problem
+
+We'll discuss these in more detail below.
+
+### Is your problem solvable by mobile machine learning?
+
+Once you have an idea of the problem you want to solve, you need to make a plan
+of how to build your solution. The most important first step is making sure that
+your problem is actually solvable, and the best way to do that is to mock it up
+using humans in the loop.
+
+For example, if you want to drive a robot toy car using voice commands, try
+recording some audio from the device and listen back to it to see if you can
+make sense of what’s being said. Often you’ll find there are problems in the
+capture process, such as the motor drowning out speech or not being able to hear
+at a distance, and you should tackle these problems before investing in the
+modeling process.
+
+Another example would be giving photos taken from your app to people see if they
+can classify what’s in them, in the way you’re looking for. If they can’t do
+that (for example, trying to estimate calories in food from photos may be
+impossible because all white soups look the same), then you’ll need to redesign
+your experience to cope with that. A good rule of thumb is that if a human can’t
+handle the task then it will be difficult to train a computer to do better.
+
+### Create a labelled dataset
+
+After you’ve solved any fundamental issues with your use case, you need to
+create a labeled dataset to define what problem you’re trying to solve. This
+step is extremely important, moreso than picking which model to use. You want it
+to be as representative as possible of your actual use case, since the model
+will only be effective at the task you teach it. It’s also worth investing in
+tools to make labeling the data as efficient and accurate as possible. For
+example, if you’re able to switch from having to click a button on a web
+interface to simple keyboard shortcuts, you may be able to speed up the
+generation process a lot. You should also start by doing the initial labeling
+yourself, so you can learn about the difficulties and likely errors, and
+possibly change your labeling or data capture process to avoid them. Once you
+and your team are able to consistently label examples (that is once you
+generally agree on the same labels for most examples), you can then try and
+capture your knowledge in a manual and teach external raters how to run the same
+process.
+
+### Pick an effective model
+
+The next step is to pick an effective model to use. You might be able to avoid
+training a model from scratch if someone else has already implemented a model
+similar to what you need; we have a repository of models implemented in
+TensorFlow [on Github](https://github.com/tensorflow/models) that you can look
+through. Lean towards the simplest model you can find, and try to get started as
+soon as you have even a small amount of labelled data, since you’ll get the best
+results when you’re able to iterate quickly. The shorter the time it takes to
+try training a model and running it in s real application, the better overall
+results you’ll see. It’s common for an algorithm to get great training accuracy
+numbers but then fail to be useful within a real application because there’s a
+mismatch between the dataset and real usage. Prototype end-to-end usage as soon
+as possible to create a consistent user experience.
+
+## Next Steps
+
+We suggest you get started by building one of our demos for
+@{$mobile/android_build$Android} or @{$mobile/ios_build$iOS}.
diff --git a/tensorflow/docs_src/mobile/optimizing.md b/tensorflow/docs_src/mobile/optimizing.md
index 1da8be5689c9ac4f5d0bfdd364c8da653618f654..44cacff5dbbcb0685044c342184464b47a8ed090 100644
--- a/tensorflow/docs_src/mobile/optimizing.md
+++ b/tensorflow/docs_src/mobile/optimizing.md
@@ -57,7 +57,7 @@ get one inference every two seconds.
 
 Having this estimate helps you plan for what you’ll be able to realistically
 achieve on a device. If the model is using too many ops, then there are a lot of
-opportunities to optimize the architecture to reduce that number. 
+opportunities to optimize the architecture to reduce that number.
 
 Advanced techniques include [SqueezeNet](https://arxiv.org/abs/1602.07360)
 and [MobileNet](https://arxiv.org/abs/1704.04861), which are architectures
@@ -115,7 +115,7 @@ If you look at the resulting file size, you should see that it’s about a quart
 of the original at 23MB.
 
 Another transform is `round_weights`, which doesn't make the file smaller, but it
-makes the file compressable to about the same size as when `quantize_weights` is
+makes the file compressible to about the same size as when `quantize_weights` is
 used. This is particularly useful for mobile development, taking advantage of
 the fact that app bundles are compressed before they’re downloaded by consumers.
 
@@ -278,7 +278,7 @@ The run above was on your desktop, but the tool also works on Android, which is
 where it’s most useful for mobile development. Here’s an example command line to
 run it on a 64-bit ARM device:
 
-    bazel build -c opt --config=android_arm64 \ 
+    bazel build -c opt --config=android_arm64 \
     tensorflow/tools/benchmark:benchmark_model
     adb push bazel-bin/tensorflow/tools/benchmark/benchmark_model /data/local/tmp
     adb push /tmp/tensorflow_inception_graph.pb /data/local/tmp/
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index c5a560e074e3fd51708b6867d12426297decf6ae..360ee302aa96bc3a0b65eab7b39c3dacf56b42c0 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -131,9 +131,9 @@ needs to understand which parts of the graph are actually needed, and which are
 artifacts of the training process, like summarization ops. Only ops that
 contribute to calculating the given output nodes will be kept. If you know how
 your graph is going to be used, these should just be the names of the nodes you
-pass into `Session::Run()` as your fetch targets. The easiest way to find the 
+pass into `Session::Run()` as your fetch targets. The easiest way to find the
 node names is to inspect the Node objects while building your graph in python.
-Inspecting your graph in TensorBoard is another simple way.  You can get some 
+Inspecting your graph in TensorBoard is another simple way.  You can get some
 suggestions on likely outputs by running the [`summarize_graph` tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/graph_transforms/README.md#inspecting-graphs).
 
 Because the output format for TensorFlow has changed over time, there are a
@@ -164,7 +164,7 @@ The trickiest part of this process is figuring out the names of the nodes you
 want to use as inputs and outputs during inference.  You'll need these anyway
 once you start to run inference, but you also need them here so that the
 transform can calculate which nodes are not needed on the inference-only
-path. These may not be obvious from the training code. The easiest way to 
+path. These may not be obvious from the training code. The easiest way to
 determine the node name is to explore the graph with TensorBoard.
 
 Remember that mobile applications typically gather their data from sensors and
@@ -187,9 +187,9 @@ output nodes.
 If you’ve just been given a frozen `GraphDef` file, and are not sure about the
 contents, try using the `summarize_graph` tool to print out information
 about the inputs and outputs it finds from the graph structure. Here’s an
-example with the original Inception v3 file: 
+example with the original Inception v3 file:
 
-    bazel run tensorflow/tools/graph_transforms:summarize_graph -- 
+    bazel run tensorflow/tools/graph_transforms:summarize_graph --
     --in_graph=tensorflow_inception_graph.pb
 
 Once you have an idea of what the input and output nodes are, you can feed them
@@ -259,7 +259,7 @@ on how to do this, and also see @{$mobile/optimizing#binary_size$Optimizing} for
 more on reducing your binary size.
 
 ### Locate the implementation
-   
+
 Operations are broken into two parts. The first is the op definition, which
 declares the signature of the operation, which inputs, outputs, and attributes
 it has. These take up very little space, and so all are included by default. The
@@ -267,7 +267,7 @@ implementations of the op computations are done in kernels, which live in the
 `tensorflow/core/kernels` folder. You need to compile the C++ file containing
 the kernel implementation of the op you need into the library. To figure out
 which file that is, you can search for the operation name in the source
-files. 
+files.
 
 [Here’s an example search in github](https://github.com/search?utf8=%E2%9C%93&q=repo%3Atensorflow%2Ftensorflow+extension%3Acc+path%3Atensorflow%2Fcore%2Fkernels+REGISTER+Mul&type=Code&ref=searchresults).
 
@@ -296,6 +296,6 @@ complains about missing header files, add the .h’s that are needed into
 the
 [`android_extended_ops`](https://www.tensorflow.org/code/tensorflow/core/kernels/BUILD#L3525) target.
 
-If you’re using a makefile targetting iOS, Raspberry Pi, etc, go to
+If you’re using a makefile targeting iOS, Raspberry Pi, etc, go to
 [`tensorflow/contrib/makefile/tf_op_files.txt`](https://www.tensorflow.org/code/tensorflow/contrib/makefile/tf_op_files.txt) and
 add the right implementation files there.
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..49d93669a2808159a87538ab1191def5ed9ab9d4
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -0,0 +1,202 @@
+# Introduction to TensorFlow Lite
+
+TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
+devices. It enables on-device machine learning inference with low latency and a
+small binary size. TensorFlow Lite also supports hardware acceleration with the
+[Android Neural Networks
+API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+
+TensorFlow Lite uses many techniques for achieving low latency such as
+optimizing the kernels for mobile apps, pre-fused activations, and quantized
+kernels that allow smaller and faster (fixed-point math) models.
+
+Most of our TensorFlow Lite documentation is [on
+Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite)
+for the time being.
+
+## What does TensorFlow Lite contain?
+
+TensorFlow Lite supports a set of core operators, both quantized and
+float, which have been tuned for mobile platforms. They incorporate pre-fused
+activations and biases to further enhance performance and quantized
+accuracy. Additionally, TensorFlow Lite also supports using custom operations in
+models.
+
+TensorFlow Lite defines a new model file format, based on
+[FlatBuffers](https://google.github.io/flatbuffers/). FlatBuffers is an
+open-sourced, efficient cross platform serialization library. It is similar to
+[protocol buffers](https://developers.google.com/protocol-buffers/?hl=en), but
+the primary difference is that FlatBuffers does not need a parsing/unpacking
+step to a secondary representation before you can access data, often coupled
+with per-object memory allocation. Also, the code footprint of FlatBuffers is an
+order of magnitude smaller than protocol buffers.
+
+TensorFlow Lite has a new mobile-optimized interpreter, which has the key goals
+of keeping apps lean and fast. The interpreter uses a static graph ordering and
+a custom (less-dynamic) memory allocator to ensure minimal load, initialization,
+and execution latency.
+
+TensorFlow Lite provides an interface to leverage hardware acceleration, if
+available on the device. It does so via the Android Neural Networks library,
+released as part of Android O-MR1.
+
+## Why do we need a new mobile-specific library?
+
+Machine Learning is changing the computing paradigm, and we see an emerging
+trend of new use cases on mobile and embedded devices. Consumer expectations are
+also trending toward natural, human-like interactions with their devices, driven
+by the camera and voice interaction models.
+
+There are several factors which are fueling interest in this domain:
+
+- Innovation at the silicon layer is enabling new possibilities for hardware
+  acceleration, and frameworks such as the Android Neural Networks API make it
+  easy to leverage these.
+
+- Recent advances in real-time computer-vision and spoken language understanding
+  have led to mobile-optimized benchmark models being open sourced
+  (e.g. MobileNets, SqueezeNet).
+
+- Widely-available smart appliances create new possibilities for
+  on-device intelligence.
+
+- Interest in stronger user data privacy paradigms where user data does not need
+  to leave the mobile device.
+
+- Ability to serve ‘offline’ use cases, where the device does not need to be
+  connected to a network.
+
+We believe the next wave of machine learning applications will have significant
+processing on mobile and embedded devices.
+
+## TensorFlow Lite developer preview highlights
+
+TensorFlow Lite is available as a developer preview and includes the
+following:
+
+- A set of core operators, both quantized and float, many of which have been
+  tuned for mobile platforms.  These can be used to create and run custom
+  models.  Developers can also write their own custom operators and use them in
+  models.
+
+- A new [FlatBuffers](https://google.github.io/flatbuffers/)-based
+  model file format.
+
+- On-device interpreter with kernels optimized for faster execution on mobile.
+
+- TensorFlow converter to convert TensorFlow-trained models to the TensorFlow
+  Lite format.
+
+- Smaller in size: TensorFlow Lite is smaller than 300KB when all supported
+  operators are linked and less than 200KB when using only the operators needed
+  for supporting InceptionV3 and Mobilenet.
+
+- **Pre-tested models:**
+
+    All of the following models are guaranteed to work out of the box:
+
+    - Inception V3, a popular model for detecting the the dominant objects
+      present in an image.
+
+    - [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md),
+      a family of mobile-first computer vision models designed to effectively
+      maximize accuracy while being mindful of the restricted resources for an
+      on-device or embedded application. They are small, low-latency, low-power
+      models parameterized to meet the resource constraints of a variety of use
+      cases. They can be built upon for classification, detection, embeddings
+      and segmentation. MobileNet models are smaller but [lower in
+      accuracy](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+      than Inception V3.
+
+    - On Device Smart Reply, an on-device model which provides one-touch
+      replies for an incoming text message by suggesting contextually relevant
+      messages. The model was built specifically for memory constrained devices
+      such as watches & phones and it has been successfully used to surface
+      [Smart Replies on Android
+      Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+      to all first-party and third-party apps.
+
+- Quantized versions of the MobileNet model, which runs faster than the
+  non-quantized (float) version on CPU.
+
+- New Android demo app to illustrate the use of TensorFlow Lite with a quantized
+  MobileNet model for object classification.
+
+- Java and C++ API support
+
+Note: This is a developer release, and it’s likely that there will be changes in
+the API in upcoming versions. We do not guarantee backward or forward
+compatibility with this release.
+
+## Getting Started
+
+We recommend you try out TensorFlow Lite with the pre-tested models indicated
+above. If you have an existing mode, you will need to test whether your model is
+compatible with both the converter and the supported operator set.  To test your
+model, see the [documentation on
+GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite).
+
+### Retrain Inception-V3 or MobileNet for a custom data set
+
+The pre-trained models mentioned above have been trained on the ImageNet data
+set, which consists of 1000 predefined classes. If those classes are not
+relevant or useful for your use case, you will need to retrain those
+models. This technique is called transfer learning, which starts with a model
+that has been already trained on a problem and will then be retrained on a
+similar problem. Deep learning from scratch can take days, but transfer learning
+can be done fairly quickly. In order to do this, you'll need to generate your
+custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through this process step-by-step. The retraining code supports
+retraining for both floating point and quantized inference.
+
+## TensorFlow Lite Architecture
+
+The following diagram shows the architectural design of TensorFlow Lite:
+
+<img src = "/images/tflite-architecture.jpg">
+
+Starting with a trained TensorFlow model on disk, you'll convert that model to
+the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
+Converter. Then you can use that converted file in your mobile application.
+
+Deploying the TensorFlow Lite model file uses:
+
+- Java API: A convenience wrapper around the C++ API on Android.
+
+- C++ API: Loads the TensorFlow Lite Model File and invokes the Interpreter. The
+  same library is available on both Android and iOS.
+
+- Interpreter: Executes the model using a set of kernels. The interpreter
+  supports selective kernel loading; without kernels it is only 100KB, and 300KB
+  with all the kernels loaded. This is a significant reduction from the 1.5M
+  required by TensorFlow Mobile.
+
+- On select Android devices, the Interpreter will use the Android Neural
+  Networks API for hardware acceleration, or default to CPU execution if none
+  are available.
+
+You can also implement custom kernels using the C++ API that can be used by the
+Interpreter.
+
+## Future Work
+
+In future releases, TensorFlow Lite will support more models and built-in
+operators, contain performance improvements for both fixed point and floating
+point models, improvements to the tools to enable easier developer workflows and
+support for other smaller devices and more. As we continue development, we hope
+that TensorFlow Lite will greatly simplify the developer experience of targeting
+a model for small devices.
+
+Future plans include using specialized machine learning hardware to get the best
+possible performance for a particular model on a particular device.
+
+## Next Steps
+
+For the developer preview, most of our documentation is on GitHub. Please take a
+look at the [TensorFlow Lite
+repository](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite)
+on GitHub for more information and for code samples, demo applications, and
+more.
+
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index da556bd8483b9bfcd753d6201ed401eaca9933f2..17f71a6d7705c75e7322932cc652ec6728c8c626 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -127,7 +127,7 @@ Reading large numbers of small files significantly impacts I/O performance.
 One approach to get maximum I/O throughput is to preprocess input data into
 larger (~100MB) `TFRecord` files. For smaller data sets (200MB-1GB), the best
 approach is often to load the entire data set into memory. The document
-[Downloading and converting to TFRecord format](https://github.com/tensorflow/models/tree/master/research/slim#Data)
+[Downloading and converting to TFRecord format](https://github.com/tensorflow/models/tree/master/research/slim#downloading-and-converting-to-tfrecord-format)
 includes information and scripts for creating `TFRecords` and this
 [script](https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator/generate_cifar10_tfrecords.py)
 converts the CIFAR-10 data set into `TFRecords`.
diff --git a/tensorflow/docs_src/performance/performance_models.md b/tensorflow/docs_src/performance/performance_models.md
index fcda19e74c676856d9479ab3560e419a141bb7ce..359b0e904dba1aea92f30604ff3b8abb81d432b1 100644
--- a/tensorflow/docs_src/performance/performance_models.md
+++ b/tensorflow/docs_src/performance/performance_models.md
@@ -29,8 +29,8 @@ implementation is made up of 3 stages:
 
 The dominant part of each stage is executed in parallel with the other stages
 using `data_flow_ops.StagingArea`. `StagingArea` is a queue-like operator
-similar to @{tf.FIFOQueue}. The difference is that `StagingArea`  does not 
-guarantee FIFO ordering, but offers simpler functionality and can be executed 
+similar to @{tf.FIFOQueue}. The difference is that `StagingArea`  does not
+guarantee FIFO ordering, but offers simpler functionality and can be executed
 on both CPU and GPU in parallel with other stages. Breaking the input pipeline
 into 3 stages that operate independently in parallel is scalable and takes full
 advantage of large multi-core environments. The rest of this section details
@@ -344,7 +344,7 @@ executing the main script
     `alexnet`.
 *   **`num_gpus`**: Number of GPUs to use.
 *   **`data_dir`**: Path to data to process. If not set, synthetic data is used.
-    To use Imagenet data use these
+    To use ImageNet data use these
     [instructions](https://github.com/tensorflow/models/tree/master/research/inception#getting-started)
     as a starting point.
 *   **`batch_size`**: Batch size for each GPU.
diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md
index 8dbf0d0446f41b26489912734bc11704e61efeab..ca3bddf758cf64e7c580f9babfe559ae23708705 100644
--- a/tensorflow/docs_src/performance/xla/broadcasting.md
+++ b/tensorflow/docs_src/performance/xla/broadcasting.md
@@ -33,11 +33,11 @@ In Numpy, this is called [broadcasting]
 
 ## Principles
 
-XLA is a low-level infrastructure with a XLA language this is as strict and
-explicit as possible, avoiding implicit and "magical" features that may make
-some computations slightly easier to define, at the cost of more assumptions
-baked into user code that will be difficult to change in the long term. If
-necessary, implicit and magical features can be added in client-level wrappers.
+The XLA language is as strict and explicit as possible, avoiding implicit and
+"magical" features. Such features may make some computations slightly easier to
+define, at the cost of more assumptions baked into user code that will be
+difficult to change in the long term. If necessary, implicit and magical
+features can be added in client-level wrappers.
 
 In regards to broadcasting, explicit broadcasting specifications on operations
 between arrays of different ranks is required. This is different from Numpy,
diff --git a/tensorflow/docs_src/performance/xla/developing_new_backend.md b/tensorflow/docs_src/performance/xla/developing_new_backend.md
index 28010ff1b785813e15c56d4bb5c26b0bcedce3d9..74ea15bb2bac2014257f0b1719820f7ee313b66b 100644
--- a/tensorflow/docs_src/performance/xla/developing_new_backend.md
+++ b/tensorflow/docs_src/performance/xla/developing_new_backend.md
@@ -62,11 +62,11 @@ If it is not possible to utilize LLVM, then the best option is to implement a
 new backend for XLA for the desired hardware. This option requires the most
 effort. The classes that need to be implemented are as follows:
 
-*   [StreamExecutor](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
+*   [`StreamExecutor`](https://www.tensorflow.org/code/tensorflow/stream_executor/stream_executor.h):
     For many devices not all methods of `StreamExecutor` are needed. See
     existing `StreamExecutor` implementations for details.
-*   [xla::Compiler](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
-    This class encapsulates the compilation of a HLO computation into an
+*   [`xla::Compiler`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/compiler.h):
+    This class encapsulates the compilation of an HLO computation into an
     `xla::Executable`.
 *   [`xla::Executable`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/executable.h):
     This class is used to launch a compiled computation on the platform.
diff --git a/tensorflow/docs_src/performance/xla/index.md b/tensorflow/docs_src/performance/xla/index.md
index 19045b45d92a2ca42c3943bc0662ca42bd0c2c24..a8847830740302a0de6f57cb3b7a0d6c7e096d32 100644
--- a/tensorflow/docs_src/performance/xla/index.md
+++ b/tensorflow/docs_src/performance/xla/index.md
@@ -65,18 +65,19 @@ The following diagram shows the compilation process in XLA:
   <img src="https://www.tensorflow.org/images/how-does-xla-work.png">
 </div>
 
-XLA comes with several optimizations and analyzes that are target-independent,
-such as [CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
+XLA comes with several optimizations and analysis passes that are
+target-independent, such as
+[CSE](https://en.wikipedia.org/wiki/Common_subexpression_elimination),
 target-independent operation fusion, and buffer analysis for allocating runtime
 memory for the computation.
 
 After the target-independent step, XLA sends the HLO computation to a backend.
-The backend can perform further HLO-level analyzes and optimizations, this time
-with target specific information and needs in mind. For example, the XLA GPU
-backend may perform operation fusion beneficial specifically for the GPU
-programming model and determine how to partition the computation into streams.
-At this stage, backends may also pattern-match certain operations or
-combinations thereof to optimized library calls.
+The backend can perform further HLO-level optimizations, this time with target
+specific information and needs in mind. For example, the XLA GPU backend may
+perform operation fusion beneficial specifically for the GPU programming model
+and determine how to partition the computation into streams. At this stage,
+backends may also pattern-match certain operations or combinations thereof to
+optimized library calls.
 
 The next step is target-specific code generation. The CPU and GPU backends
 included with XLA use [LLVM](http://llvm.org) for low-level IR, optimization,
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 91c0d5b8c60725d5d979c5fcde0d30d0ff098491..d6f05f81bfeefc4dafeacdb7b30d484d302aae39 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -13,6 +13,175 @@ arbitrary-dimensional array. For convenience, special cases have more specific
 and familiar names; for example a *vector* is a 1-dimensional array and a
 *matrix* is a 2-dimensional array.
 
+## BatchNormGrad
+
+See also
+[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h)
+and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Calculates gradients of batch norm.
+
+<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------  | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+:                 :                         : normalized (x)                   :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
+| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\sigma^2\\))                 :
+| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
+:                 :                         : `BatchNormTraining`              :
+:                 :                         : (\\( \nabla y\\))                :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension in    |
+:                 :                         : `operand`                        :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the gradients with
+respect to `operand`, `offset` and `scale` across all the other dimensions. The
+`feature_index` must be a valid index for the feature dimension in `operand`.
+
+The three gradients are defined by the following formulas:
+
+\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
+
+\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
+
+\\( \nabla \beta = sum(\nabla y) \\)
+
+The inputs `mean` and `variance` represents moments value
+across batch and spatial dimensions.
+
+The output type is a tuple of three handles:
+
+|Outputs       | Type                    | Semantics                           |
+|------------- | ----------------------- | ------------------------------------|
+|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `operand`                           :
+|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `offset`                            :
+|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
+:              :                         : `scale`                             :
+
+## BatchNormInference
+
+See also
+[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
+[the original batch normalization paper](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                       |
+| --------------  | ----------------------- | ------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
+:                 :                         : normalized                      :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
+| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
+| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
+| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
+| `epsilon`       | `float`                 | Epsilon value                   |
+| `feature_index` | `int64`                 | Index to feature dimension in   |
+:                 :                         : `operand`                       :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
+computing `mean` and `variance` for each batch. It uses the input `mean` and
+`variance` instead as estimated values. The purpose of this op is to reduce
+latency in inference, hence the name `BatchNormInference`.
+
+The output is an n-dimensional, normalized array with the same shape as input
+`operand`.
+
+## BatchNormTraining
+
+See also
+[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
+[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
+for a detailed description of the algorithm.
+
+Normalizes an array across batch and spatial dimensions.
+
+<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
+:                 :                         : normalized                       :
+| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\gamma\\))                   :
+| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
+:                 :                         : (\\(\beta\\ )                    :
+| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
+| `feature_index` | `int64`                 | Index to feature dimension       |
+:                 :                         : in `operand`                     :
+
+For each feature in the feature dimension (`feature_index` is the index for the
+feature dimension in `operand`), the operation calculates the mean and variance
+across all the other dimensions and uses the mean and variance to normalize each
+element in `operand`. The `feature_index` must be a valid index for the feature
+dimension in `operand`.
+
+The algorithm goes as follows for each batch in `operand` \\(x\\) that
+contains `m` elements with `w` and `h` as the size of spatial dimensions (
+assuming `operand` is an 4 dimensional array):
+
+- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
+\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
+
+- Calculates batch variance \\(\sigma^2_l\\):
+\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
+
+- Normalizes, scales and shifts:
+\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
+
+The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
+
+The output type is a tuple of three `ComputationDataHandle`s:
+
+| Outputs      | Type                    | Semantics                            |
+| ------------ | ----------------------- | -------------------------------------|
+| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
+:              :                         : shape as input `operand` (y)         :
+| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
+| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
+
+The `batch_mean` and `batch_var` are moments calculated across the batch and
+spatial dimensions using the formulas above.
+
+## BitcastConvertType
+
+See also
+[`ComputationBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
+operation from a data shape to a target shape. The dimensions must match, and
+the conversion is an element-wise one; e.g. `s32` elements become `f32` elements
+via bitcast routine. Bitcast is implemented as a low-level cast, so machines
+with different floating point representations will give different results.
+
+<b> `BitcastConvertType(operand, new_element_type)` </b>
+
+Arguments          | Type                    | Semantics
+------------------ | ----------------------- | ---------------------------
+`operand`          | `ComputationDataHandle` | array of type T with dims D
+`new_element_type` | `PrimitiveType`         | type U
+
+The dimensions of the operand and the target shape must match. The bit-width of
+the source and destination element types must be equal. The source
+and destination element types must not be tuples.
+
 ## Broadcast
 
 See also
@@ -75,14 +244,14 @@ Clamps an operand to within the range between a minimum and maximum value.
 | `computation` | `Computation`           | computation of type `T_0, T_1,   |
 :               :                         : ..., T_N -> S` with N parameters :
 :               :                         : of arbitrary type                :
-| `operand`     | `ComputationDataHandle` | array of type T                  |
 | `min`         | `ComputationDataHandle` | array of type T                  |
+| `operand`     | `ComputationDataHandle` | array of type T                  |
 | `max`         | `ComputationDataHandle` | array of type T                  |
 
 Given an operand and minimum and maximum values, returns the operand if it is in
 the range between the minimum and maximum, else returns the minimum value if the
 operand is below this range or the maximum value if the operand is above this
-range.  That is, `clamp(x, a, b) =  max(min(x, a), b)`.
+range.  That is, `clamp(a, x, b) =  max(min(a, x), b)`.
 
 All three arrays must be the same shape. Alternately, as a restricted form of
 [broadcasting](broadcasting.md), `min` and/or `max` can be a scalar of type `T`.
@@ -94,7 +263,7 @@ let operand: s32[3] = {-1, 5, 9};
 let min: s32 = 0;
 let max: s32 = 6;
 ==>
-Clamp(operand, min, max) = s32[3]{0, 5, 6};
+Clamp(min, operand, max) = s32[3]{0, 5, 6};
 ```
 
 ## Collapse
@@ -217,40 +386,34 @@ Diagram:
   <img style="width:100%" src="https://www.tensorflow.org/images/ops_concatenate.png">
 </div>
 
-## ConvertElementType
-
-See also
-[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-Similar to an element-wise `static_cast` in C++, performs an element-wise
-conversion operation from a data shape to a target shape. The dimensions must
-match, and the conversion is an element-wise one; e.g. `s32` elements become
-`f32` elements via an `s32`-to-`f32` conversion routine.
+## Conditional
 
-<b> `ConvertElementType(operand, new_element_type)` </b>
+See also [`ComputationBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-Arguments          | Type                    | Semantics
------------------- | ----------------------- | ---------------------------
-`operand`          | `ComputationDataHandle` | array of type T with dims D
-`new_element_type` | `PrimitiveType`         | type U
+<b> `Conditional(pred, true_operand, true_computation, false_operand,
+    false_computation)` </b>
 
-If the dimensions of the operand and the target shape do not match, or an
-invalid conversion is requested (e.g. to/from a tuple) an error will be
-produced.
+| Arguments           | Type                    | Semantics                   |
+| ------------------- | ----------------------- | --------------------------- |
+| `pred`              | `ComputationDataHandle` | Scalar of type `PRED`       |
+| `true_operand`      | `ComputationDataHandle` | Argument of type `T_0`      |
+| `true_computation`  | `Computation`           | Computation of type `T_0 -> |
+:                     :                         : S`                          :
+| `false_operand`     | `ComputationDataHandle` | Argument of type `T_1`      |
+| `false_computation` | `Computation`           | Computation of type `T_1 -> |
+:                     :                         : S`                          :
 
-A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
-conversion routine such as round-to-nearest-even.
+Executes `true_computation` if `pred` is `true`, `false_computation` if `pred`
+is `false`, and returns the result.
 
-> Note: The precise float-to-int and visa-versa conversions are currently
-> unspecified, but may become additional arguments to the convert operation in
-> the future.  Not all possible conversions have been implemented for all
->targets.
+The `true_computation` must take in a single argument of type `T_0` and will be
+invoked with `true_operand` which must be of the same type. The
+`false_computation` must take in a single argument of type `T_1` and will be
+invoked with `false_operand` which must be of the same type. The type of the
+returned value of `true_computation` and `false_computation` must be the same.
 
-```
-let a: s32[3] = {0, 1, 2};
-let b: f32[3] = convert(a, f32);
-then b == f32[3]{0.0, 1.0, 2.0}
-```
+Note that only one of `true_computation` and `false_computation` will be
+executed depending on the value of `pred`.
 
 ## Conv (convolution)
 
@@ -374,6 +537,40 @@ for (b, oz, oy, ox) {  // output coordinates
 }
 ```
 
+## ConvertElementType
+
+See also
+[`ComputationBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Similar to an element-wise `static_cast` in C++, performs an element-wise
+conversion operation from a data shape to a target shape. The dimensions must
+match, and the conversion is an element-wise one; e.g. `s32` elements become
+`f32` elements via an `s32`-to-`f32` conversion routine.
+
+<b> `ConvertElementType(operand, new_element_type)` </b>
+
+Arguments          | Type                    | Semantics
+------------------ | ----------------------- | ---------------------------
+`operand`          | `ComputationDataHandle` | array of type T with dims D
+`new_element_type` | `PrimitiveType`         | type U
+
+The dimensions of the operand and the target shape must match. The source and
+destination element types must not be tuples.
+
+A conversion such as `T=s32` to `U=f32` will perform a normalizing int-to-float
+conversion routine such as round-to-nearest-even.
+
+> Note: The precise float-to-int and visa-versa conversions are currently
+> unspecified, but may become additional arguments to the convert operation in
+> the future.  Not all possible conversions have been implemented for all
+>targets.
+
+```
+let a: s32[3] = {0, 1, 2};
+let b: f32[3] = convert(a, f32);
+then b == f32[3]{0.0, 1.0, 2.0}
+```
+
 ## CrossReplicaSum
 
 See also
@@ -388,9 +585,9 @@ Computes a sum across replicas.
 | `operand`    | `ComputationDataHandle` | Array to sum across replicas.      |
 
 The output shape is the same as the input shape. For example, if there are two
-replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.1)`
+replicas and the operand has the value `(1.0, 2.5)` and `(3.0, 5.25)`
 respectively on the two replicas, then the output value from this op will be
-`(4.0, 7.6)` on both replicas.
+`(4.0, 7.75)` on both replicas.
 
 Computing the result of CrossReplicaSum requires having one input from each
 replica, so if one replica executes a CrossReplicaSum node more times than
@@ -490,6 +687,213 @@ contracted dimensions of `lhs` and `rhs` must be of the same size. In practice,
 it can be used to perform dot products between vectors, vector/matrix
 multiplications or matrix/matrix multiplications.
 
+## DotGeneral
+
+See also
+[`ComputationBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+<b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
+
+| Arguments | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| `lhs`     | `ComputationDataHandle` | array of type T
+| `rhs`     | `ComputationDataHandle` | array of type T
+| `dimension_numbers` | `DotDimensionNumbers` | array of type T
+
+As Dot, but allows contracting and batch dimension numbers to be specified for
+both the 'lhs' and 'rhs'.
+
+| DotDimensionNumbers Fields | Type                    | Semantics
+| --------- | ----------------------- | ---------------
+| 'lhs_contracting_dimensions' | repeated int64 | 'lhs' contracting dimension numbers |
+| 'rhs_contracting_dimensions' | repeated int64 | 'rhs' contracting dimension numbers |
+| 'lhs_batch_dimensions' | repeated int64 | 'lhs' batch dimension numbers |
+| 'rhs_batch_dimensions' | repeated int64 | 'rhs' batch dimension numbers |
+
+DotGeneral performs the sum of products over contracting dimensions specified
+in 'dimension_numbers'.
+
+Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
+to be the same, but must be listed in the same order in both
+'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
+
+Example with contracting dimension numbers:
+
+```
+lhs = { {1.0, 2.0, 3.0},
+        {4.0, 5.0, 6.0} }
+
+rhs = { {1.0, 1.0, 1.0},
+        {2.0, 2.0, 2.0} }
+
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(1);
+dnums.add_rhs_contracting_dimensions(1);
+
+DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
+                                 {15.0, 30.0} }
+```
+
+Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
+dimension number, must be listed in the same order in both arrays, and must
+have the same dimension sizes.
+
+Example with batch dimension numbers (batch size 2, 2x2 matrices):
+
+```
+lhs = { { {1.0, 2.0},
+          {3.0, 4.0} },
+        { {5.0, 6.0},
+          {7.0, 8.0} } }
+
+rhs = { { {1.0, 0.0},
+          {0.0, 1.0} },
+        { {1.0, 0.0},
+          {0.0, 1.0} } }
+
+DotDimensionNumbers dnums;
+dnums.add_lhs_contracting_dimensions(2);
+dnums.add_rhs_contracting_dimensions(1);
+dnums.add_lhs_batch_dimensions(0);
+dnums.add_rhs_batch_dimensions(0);
+
+DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
+                                   {3.0, 4.0} },
+                                 { {5.0, 6.0},
+                                   {7.0, 8.0} } }
+```
+
+| Input                               | Output            | Semantics        |
+| ----------------------------------- | ----------------- | ---------------- |
+| [b0, m, k] `dot` [b0, k, n]         | [b0, m, n]        |  batch matmul    |
+| [b0, b1, m, k] `dot` [b0, b1, k, n] | [b0, b1, m, n]    |  batch matmul    |
+
+## DynamicSlice
+
+See also
+[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+DynamicSlice extracts a sub-array from the input array at dynamic
+`start_indices`. The size of the slice in each dimension is passed in
+`size_indices`, which specify the end point of exclusive slice intervals in each
+dimension: [start, start + size). The shape of `start_indices` must be rank ==
+1, with dimension size equal to the rank of `operand`.
+Note: handling of out-of-bounds slice indices (generated by incorrect runtime
+calculation of 'start_indices') is currently implementation-defined. Currently,
+slice indices are computed modulo input dimension sizes to prevent out-of-bound
+array accesses, but this behavior may change in future implementations.
+
+<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
+| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
+:                 :                         : containing the starting indices  :
+:                 :                         : of the slice for each dimension. :
+:                 :                         : Value must be greater than or    :
+:                 :                         : equal to zero.                   :
+| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
+:                 :                         : the slice size for each          :
+:                 :                         : dimension. Each value must be    :
+:                 :                         : strictly greater than zero, and  :
+:                 :                         : start + size must be less than   :
+:                 :                         : or equal to the size of the      :
+:                 :                         : dimension to avoid wrapping      :
+:                 :                         : modulo dimension size.           :
+
+1-dimensional example:
+
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let s = {2}
+
+DynamicSlice(a, s, {2}) produces:
+  {2.0, 3.0}
+```
+
+2-dimensional example:
+
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let s = {2, 1}
+
+DynamicSlice(b, s, {2, 2}) produces:
+  { { 7.0,  8.0},
+    {10.0, 11.0} }
+```
+## DynamicUpdateSlice
+
+See also
+[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+DynamicUpdateSlice generates a result which is the value of the input array
+`operand`, with a slice `update` overwritten at `start_indices`.
+The shape of `update` determines the shape of the sub-array of the result which
+is updated.
+The shape of `start_indices` must be rank == 1, with dimension size equal to
+the rank of `operand`.
+Note: handling of out-of-bounds slice indices (generated by incorrect runtime
+calculation of 'start_indices') is currently implementation-defined. Currently,
+slice indices are computed modulo update dimension sizes to prevent out-of-bound
+array accesses, but this behavior may change in future implementations.
+
+<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
+
+| Arguments       | Type                    | Semantics                        |
+| --------------- | ----------------------- | -------------------------------- |
+| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
+| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
+:                 :                         : containing the slice update.     :
+:                 :                         : Each dimension of update shape    :
+:                 :                         : must be strictly greater than    :
+:                 :                         : zero, and start + update must be :
+:                 :                         : less than operand size for each  :
+:                 :                         : dimension to avoid generating    :
+:                 :                         : out-of-bounds update indices.    :
+| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
+:                 :                         : containing the starting indices  :
+:                 :                         : of the slice for each dimension. :
+:                 :                         : Value must be greater than or    :
+:                 :                         : equal to zero.                   :
+
+1-dimensional example:
+
+```
+let a = {0.0, 1.0, 2.0, 3.0, 4.0}
+let u = {5.0, 6.0}
+let s = {2}
+
+DynamicUpdateSlice(a, u, s) produces:
+  {0.0, 1.0, 5.0, 6.0, 4.0}
+```
+
+2-dimensional example:
+
+```
+let b =
+ { {0.0,  1.0,  2.0},
+   {3.0,  4.0,  5.0},
+   {6.0,  7.0,  8.0},
+   {9.0, 10.0, 11.0} }
+let u =
+ { {12.0,  13.0},
+   {14.0,  15.0},
+   {16.0,  17.0} }
+
+let s = {1, 1}
+
+DynamicUpdateSlice(b, u, s) produces:
+ { {0.0,  1.0,  2.0},
+   {3.0, 12.0, 13.0},
+   {6.0, 14.0, 15.0},
+   {9.0, 16.0, 17.0} }
+```
+
 ## Element-wise binary arithmetic operations
 
 See also
@@ -547,7 +951,7 @@ floating-point types.
 <b> `Op(lhs, rhs)` </b>
 
 Where `Op` is one of `Eq` (equal-to), `Ne` (not equal-to), `Ge`
-(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Le`
+(greater-or-equal-than), `Gt` (greater-than), `Le` (less-or-equal-than), `Lt`
 (less-than).
 
 Arguments | Type                    | Semantics
@@ -602,170 +1006,19 @@ if and only if the corresponding input element is finite.
 
 <b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
 
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
-
-using the comparison operator of the element type of `operand`.
-
-<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
-
-
-Arguments | Type                    | Semantics
---------- | ----------------------- | ---------------------------
-`operand` | `ComputationDataHandle` | The operand to the function
-
-The function is applied to each element in the `operand` array, resulting in an
-array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
-
-
-## BatchNormTraining
-
-See also
-[`ComputationBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h) and
-[`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
-for a detailed description of the algorithm.
-
-<b> Warning: Not implemented on GPU backend yet. </b>
-
-Normalizes an array across batch and spatial dimensions.
-
-<b> `BatchNormTraining(operand, scale, offset, epsilon, feature_index)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized                       :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `offset`        | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\beta\\ )                    :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension       |
-:                 :                         : in `operand`                     :
-
-
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and use the mean and variance to normalize each
-element in `operand`. If an invalid `feature_index` is passed, an error is
-produced.
-
-The algorithm goes as follows for each batch in `operand` \\(x\\) that
-contains `m` elements with `w` and `h` as the size of spatial dimensions (
-assuming `operand` is an 4 dimensional array):
-
-- Calculates batch mean \\(\mu_l\\) for each feature `l` in feature dimension:
-\\(\mu_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h x_{ijkl}\\)
-
-- Calculates batch variance \\(\sigma^2_l\\):
-\\(\sigma^2_l=\frac{1}{mwh}\sum_{i=1}^m\sum_{j=1}^w\sum_{k=1}^h (x_{ijkl} - \mu_l)^2\\)
-
-- Normalizes, scales and shifts:
-\\(y_{ijkl}=\frac{\gamma_l(x_{ijkl}-\mu_l)}{\sqrt[2]{\sigma^2_l+\epsilon}}+\beta_l\\)
-
-The epsilon value, usually a small number, is added to avoid divide-by-zero errors.
-
-The output type is a tuple of three ComputationDataHandles:
-
-| Outputs      | Type                    | Semantics                            |
-| ------------ | ----------------------- | -------------------------------------|
-| `output`     | `ComputationDataHandle` | n dimensional array with the same    |
-:              :                         : shape as input `operand` (y)         :
-| `batch_mean` | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))      |
-| `batch_var`  | `ComputationDataHandle` | 1 dimensional array (\\(\sigma^2\\)) |
-
-The `batch_mean` and `batch_var` are moments calculated across the batch and
-spatial dimensions using the formulars above.
-
-## BatchNormInference
-
-See also
-[`ComputationBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-<b> Warning: Not implemented yet. </b>
-
-Normalizes an array across batch and spatial dimensions.
-
-<b> `BatchNormInference(operand, scale, offset, mean, variance, epsilon, feature_index)` </b>
-
-| Arguments       | Type                    | Semantics                       |
-| --------------  | ----------------------- | ------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be       |
-:                 :                         : normalized                      :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array             |
-| `offset`        | `ComputationDataHandle` | 1 dimensional array             |
-| `mean`          | `ComputationDataHandle` | 1 dimensional array             |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array             |
-| `epsilon`       | `float`                 | Epsilon value                   |
-| `feature_index` | `int64`                 | Index to feature dimension in   |
-:                 :                         : `operand`                       :
-
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the mean and variance
-across all the other dimensions and use the mean and variance to normalize each
-element in `operand`. If an invalid `feature_index` is passed, an error is
-produced.
-
-`BatchNormInference`  is equivalent to calling `BatchNormTraining` without
-computing `mean` and `variance` for each batch. It uses the input `mean` and
-`variance` instead as estimated values. The purpose of this op is to reduce
-latency in inference, hence the name `BatchNormInference`.
-
-The output is a n dimensional, normalized array with the same shape as input
-`operand`.
-
-## BatchNormGrad
-
-See also
-[`ComputationBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-<b> Warning: Not implemented yet. </b>
-
-Calculates gradients of batch norm.
-
-<b> `BatchNormGrad(operand, scale, mean, variance, grad_output, epsilon, feature_index)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------  | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | n dimensional array to be        |
-:                 :                         : normalized (x)                   :
-| `scale`         | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\gamma\\))                   :
-| `mean`          | `ComputationDataHandle` | 1 dimensional array (\\(\mu\\))  |
-| `variance`      | `ComputationDataHandle` | 1 dimensional array              |
-:                 :                         : (\\(\sigma^2\\))                 :
-| `grad_output`   | `ComputationDataHandle` | Gradients passed to              |
-:                 :                         : `BatchNormTraining`              :
-:                 :                         : (\\( \nabla y\\))                :
-| `epsilon`       | `float`                 | Epsilon value (\\(\epsilon\\))   |
-| `feature_index` | `int64`                 | Index to feature dimension in    |
-:                 :                         : `operand`                        :
-
-For each feature in the feature dimension (`feature_index` is the index for the
-feature dimension in `operand`), the operation calculates the gradients with
-respect to `operand`, `offset` and `scale` across all the other dimensions. If
-an invalid `feature_index` is passed, an error is produced.
-
-The three gradients are defined by the following formulas:
-
-\\( \nabla x = \nabla y * \gamma * \sqrt{\sigma^2+\epsilon} \\)
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
 
-\\( \nabla \gamma = sum(\nabla y * (x - \mu) * \sqrt{\sigma^2 + \epsilon}) \\)
+using the comparison operator of the element type of `operand`.
 
-\\( \nabla \beta = sum(\nabla y) \\)
+<b>`Tanh(operand)`</b> Element-wise hyperbolic tangent `x -> tanh(x)`.
 
-The inputs `mean` and `variance` represents moments value
-across batch and spatial dimensions.
 
-The output type is a tuple of three ComputationDataHandles:
+Arguments | Type                    | Semantics
+--------- | ----------------------- | ---------------------------
+`operand` | `ComputationDataHandle` | The operand to the function
 
-|Outputs       | Type                    | Semantics                           |
-|------------- | ----------------------- | ------------------------------------|
-|`grad_operand`| `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `operand`                           :
-|`grad_offset` | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `offset`                            :
-|`grad_scale`  | `ComputationDataHandle` | gradient with respect to input      |
-:              :                         : `scale`                             :
+The function is applied to each element in the `operand` array, resulting in an
+array with the same shape. It is allowed for `operand` to be a scalar (rank 0).
 
 
 ## GetTupleElement
@@ -808,8 +1061,7 @@ device, interpreting the data as the given shape and its layout, and returns a
 `ComputationDataHandle` of the data. Multiple Infeed operations are allowed in a
 computation, but there must be a total order among the Infeed operations. For
 example, two Infeeds in the code below have a total order since there is a
-dependency between the while loops. The compiler issues an error if there isn't
-a total order.
+dependency between the while loops.
 
 ```
 result1 = while (condition, init = init_value) {
@@ -901,6 +1153,40 @@ are all 0. Figure below shows examples of different `edge_padding` and
   <img style="width:100%" src="https://www.tensorflow.org/images/ops_pad.png">
 </div>
 
+## Recv
+
+See also
+[`ComputationBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+<b> `Recv(shape, channel_handle)` </b>
+
+| Arguments        | Type            | Semantics                            |
+| ---------------- | --------------- | ------------------------------------ |
+| `shape`          | `Shape`         | shape of the data to receive         |
+| `channel_handle` | `ChannelHandle` | unique identifier for each send/recv pair |
+
+Receives data of the given shape from a `Send` instruction in another
+computation that shares the same channel handle. Returns a
+ComputationDataHandle for the received data.
+
+The client API of `Recv` operation represents synchronous communication.
+However, the instruction is internally decomposed into 2 HLO instructions
+(`Recv` and `RecvDone`) to enable asynchronous data transfers. See also
+[`HloInstruction::CreateRecv` and `HloInstruction::CreateRecvDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
+
+<b>`Recv(const Shape& shape, int64 channel_id)`</b>
+
+Allocates resources required to receive data from a `Send` instruction with the
+same channel_id. Returns a context for the allocated resources, which is used
+by a following `RecvDone` instruction to wait for the completion of the data
+transfer. The context is a tuple of {receive buffer (shape), request identifier
+(U32)} and it can only be used by a `RecvDone` instruction.
+
+<b> `RecvDone(HloInstruction context)` </b>
+
+Given a context created by a `Recv` instruction, waits for the data transfer to
+complete and returns the received data.
+
 ## Reduce
 
 See also
@@ -1054,7 +1340,6 @@ must have a non-negative number of mantissa bits.  The number of exponent or
 mantissa bits may exceed the corresponding value for type `T`; the corresponding
 portion of the conversion is then simply a no-op.
 
-
 ## ReduceWindow
 
 See also
@@ -1297,6 +1582,57 @@ is implementation-defined.
 :           :                         : limit of interval                 :
 | `shape`   | `Shape`                 | Output shape of type T            |
 
+## Select
+
+See also
+[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+
+Constructs an output array from elements of two input arrays, based on the
+values of a predicate array.
+
+<b> `Select(pred, on_true, on_false)` </b>
+
+Arguments  | Type                    | Semantics
+---------- | ----------------------- | ------------------
+`pred`     | `ComputationDataHandle` | array of type PRED
+`on_true`  | `ComputationDataHandle` | array of type T
+`on_false` | `ComputationDataHandle` | array of type T
+
+The arrays `on_true` and `on_false` must have the same shape. This is also the
+shape of the output array. The array `pred` must have the same dimensionality as
+`on_true` and `on_false`, with the `PRED` element type.
+
+For each element `P` of `pred`, the corresponding element of the output array is
+taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
+value of `P` is `false`. As a restricted form of [broadcasting]
+(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
+output array is taken wholly from `on_true` if `pred` is `true`, and from
+`on_false` if `pred` is `false`.
+
+Example with non-scalar `pred`:
+
+```
+let pred: PRED[4] = {true, false, false, true};
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
+```
+
+Example with scalar `pred`:
+
+```
+let pred: PRED = true;
+let v1: s32[4] = {1, 2, 3, 4};
+let v2: s32[4] = {100, 200, 300, 400};
+==>
+Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
+```
+
+Selections between tuples are supported. Tuples are considered to be scalar
+types for this purpose. If `on_true` and `on_false` are tuples (which must have
+the same shape!) then `pred` has to be a scalar of type `PRED`.
+
 ## SelectAndScatter
 
 See also
@@ -1378,56 +1714,60 @@ non-deterministic. Therefore, the `scatter` function should not be overly
 sensitive to reassociation. See the discussion about associativity in the
 context of [`Reduce`](#reduce) for more details.
 
-## Select
+## Send
 
 See also
-[`ComputationBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
+[`ComputationBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
 
-Constructs an output array from elements of two input arrays, based on the
-values of a predicate array.
+<b> `Send(operand, channel_handle)` </b>
 
-<b> `Select(pred, on_true, on_false)` </b>
+| Arguments        | Type                    | Semantics                        |
+| ---------------- | ----------------------- | -------------------------------- |
+| `operand`        | `ComputationDataHandle` | data to send (array of type T)   |
+| `channel_handle` | `ChannelHandle`         | unique identifier for each send/recv pair |
 
-Arguments  | Type                    | Semantics
----------- | ----------------------- | ------------------
-`pred`     | `ComputationDataHandle` | array of type PRED
-`on_true`  | `ComputationDataHandle` | array of type T
-`on_false` | `ComputationDataHandle` | array of type T
+Sends the given operand data to a `Recv` instruction in another computation
+that shares the same channel handle. Does not return any data.
 
-The arrays `on_true` and `on_false` must have the same shape. This is also the
-shape of the output array. The array `pred` must have the same dimensionality as
-`on_true` and `on_false`, with the `PRED` element type.
+Similar to the `Recv` operation, the client API of `Send` operation represents
+synchronous communication, and is internally decomposed into 2 HLO instructions
+(`Send` and `SendDone`) to enable asynchronous data transfers. See also
+[`HloInstruction::CreateSend` and `HloInstruction::CreateSendDone`](https://www.tensorflow.org/code/tensorflow/compiler/xla/service/hlo_instruction.h).
 
-For each element `P` of `pred`, the corresponding element of the output array is
-taken from `on_true` if the value of `P` is `true`, and from `on_false` if the
-value of `P` is `false`. As a restricted form of [broadcasting]
-(broadcasting.md), `pred` can be a scalar of type `PRED`. In this case, the
-output array is taken wholly from `on_true` if `pred` is `true`, and from
-`on_false` if `pred` is `false`.
+<b>`Send(HloInstruction operand, int64 channel_id)`</b>
 
-Example with non-scalar `pred`:
+Initiates an asynchronous transfer of the operand to the resources allocated by
+the `Recv` instruction with the same channel id. Returns a context, which is
+used by a following `SendDone` instruction to wait for the completion of the
+data transfer. The context is a tuple of {operand (shape), request identifier
+(U32)} and it can only be used by a `SendDone` instruction.
 
-```
-let pred: PRED[4] = {true, false, false, true};
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 200, 300, 4};
-```
+<b> `SendDone(HloInstruction context)` </b>
 
-Example with scalar `pred`:
+Given a context created by a `Send` instruction, waits for the data transfer to
+complete.  The instruction does not return any data.
 
-```
-let pred: PRED = true;
-let v1: s32[4] = {1, 2, 3, 4};
-let v2: s32[4] = {100, 200, 300, 400};
-==>
-Select(pred, v1, v2) = s32[4]{1, 2, 3, 4};
-```
+<b> Scheduling of channel instructions </b>
 
-Selections between tuples are supported. Tuples are considered to be scalar
-types for this purpose. If `on_true` and `on_false` are tuples (which must have
-the same shape!) then `pred` has to be a scalar of type `PRED`.
+The execution order of the 4 instructions for each channel (`Recv`, `RecvDone`,
+`Send`, `SendDone`) is as below.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:70%" src="../../images/send_recv_order.png">
+</div>
+
+* `Recv` happens before `Send`
+* `Send` happens before `RecvDone`
+* `Recv` happens before `RecvDone`
+* `Send` happens before `SendDone`
+
+When the backend compilers generate a linear schedule for each computation that
+communicates via channel instructions, there must not be cycles across the
+computations. For example, below schedules lead to deadlocks.
+
+<div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="../../images/send_recv_schedule.png">
+</div>
 
 ## Slice
 
@@ -1481,132 +1821,6 @@ Slice(b, {2, 1}, {4, 3}) produces:
     {10.0, 11.0} }
 ```
 
-## DynamicSlice
-
-See also
-[`ComputationBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-DynamicSlice extracts a sub-array from the input array at dynamic
-`start_indices`. The size of the slice in each dimension is passed in
-`size_indices`, which specify the end point of exclusive slice intervals in each
-dimension: [start, start + size). The shape of `start_indices` must be rank ==
-1, with dimension size equal to the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo input dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
-
-<b> `DynamicSlice(operand, start_indices, size_indices)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-| `size_indices`  | `ArraySlice<int64>`     | List of N integers containing    |
-:                 :                         : the slice size for each          :
-:                 :                         : dimension. Each value must be    :
-:                 :                         : strictly greater than zero, and  :
-:                 :                         : start + size must be less than   :
-:                 :                         : or equal to the size of the      :
-:                 :                         : dimension to avoid wrapping      :
-:                 :                         : modulo dimension size.           :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let s = {2}
-
-DynamicSlice(a, s, {2}) produces:
-  {2.0, 3.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let s = {2, 1}
-
-DynamicSlice(b, s, {2, 2}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
-```
-## DynamicUpdateSlice
-
-See also
-[`ComputationBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/computation_builder.h).
-
-DynamicUpdateSlice generates a result which is the value of the input array
-`operand`, with a slice `update` overwritten at `start_indices`.
-The shape of `update` determines the shape of the sub-array of the result which
-is updated.
-The shape of `start_indices` must be rank == 1, with dimension size equal to
-the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo update dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
-
-<b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
-
-| Arguments       | Type                    | Semantics                        |
-| --------------- | ----------------------- | -------------------------------- |
-| `operand`       | `ComputationDataHandle` | N dimensional array of type T    |
-| `update`        | `ComputationDataHandle` | N dimensional array of type T    |
-:                 :                         : containing the slice update.     :
-:                 :                         : Each dimension of update shape    :
-:                 :                         : must be strictly greater than    :
-:                 :                         : zero, and start + update must be :
-:                 :                         : less than operand size for each  :
-:                 :                         : dimension to avoid generating    :
-:                 :                         : out-of-bounds update indices.    :
-| `start_indices` | `ComputationDataHandle` | Rank 1 array of N integers       |
-:                 :                         : containing the starting indices  :
-:                 :                         : of the slice for each dimension. :
-:                 :                         : Value must be greater than or    :
-:                 :                         : equal to zero.                   :
-
-1-dimensional example:
-
-```
-let a = {0.0, 1.0, 2.0, 3.0, 4.0}
-let u = {5.0, 6.0}
-let s = {2}
-
-DynamicUpdateSlice(a, u, s) produces:
-  {0.0, 1.0, 5.0, 6.0, 4.0}
-```
-
-2-dimensional example:
-
-```
-let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
-let u =
- { {12.0,  13.0},
-   {14.0,  15.0},
-   {16.0,  17.0} }
-
-let s = {1, 1}
-
-DynamicUpdateSlice(b, u, s) produces:
- { {0.0,  1.0,  2.0},
-   {3.0, 12.0, 13.0},
-   {6.0, 14.0, 15.0},
-   {9.0, 16.0, 17.0} }
-```
-
 ## Sort
 
 See also
diff --git a/tensorflow/docs_src/programmers_guide/datasets.md b/tensorflow/docs_src/programmers_guide/datasets.md
index f458cbcef228b60fcce095a9326b5ea36494cde3..308cbad376468b4ae29b8e321ec8ce85c102cd47 100644
--- a/tensorflow/docs_src/programmers_guide/datasets.md
+++ b/tensorflow/docs_src/programmers_guide/datasets.md
@@ -1,16 +1,16 @@
 # Importing Data
 
-The @{tf.data.Dataset$`Dataset`} API enables you to build complex input pipelines from
+The `tf.data` API enables you to build complex input pipelines from
 simple, reusable pieces. For example, the pipeline for an image model might
 aggregate data from files in a distributed file system, apply random
 perturbations to each image, and merge randomly selected images into a batch
 for training. The pipeline for a text model might involve extracting symbols
 from raw text data, converting them to embedding identifiers with a lookup
-table, and batching together sequences of different lengths. The `Dataset` API
+table, and batching together sequences of different lengths. The `tf.data` API
 makes it easy to deal with large amounts of data, different data formats, and
 complicated transformations.
 
-The `Dataset` API introduces two new abstractions to TensorFlow:
+The `tf.data` API introduces two new abstractions to TensorFlow:
 
 * A `tf.data.Dataset` represents a sequence of elements, in which
   each element contains one or more `Tensor` objects. For example, in an image
@@ -121,7 +121,7 @@ dataset3 = dataset3.filter(lambda x, (y, z): ...)
 ### Creating an iterator
 
 Once you have built a `Dataset` to represent your input data, the next step is to
-create an `Iterator` to access elements from that dataset.  The `Dataset` API
+create an `Iterator` to access elements from that dataset.  The `tf.data` API
 currently supports the following iterators, in increasing level of
 sophistication:
 
@@ -190,8 +190,8 @@ validation_dataset = tf.data.Dataset.range(50)
 # A reinitializable iterator is defined by its structure. We could use the
 # `output_types` and `output_shapes` properties of either `training_dataset`
 # or `validation_dataset` here, because they are compatible.
-iterator = Iterator.from_structure(training_dataset.output_types,
-                                   training_dataset.output_shapes)
+iterator = tf.data.Iterator.from_structure(training_dataset.output_types,
+                                           training_dataset.output_shapes)
 next_element = iterator.get_next()
 
 training_init_op = iterator.make_initializer(training_dataset)
@@ -379,7 +379,7 @@ sess.run(iterator.initializer, feed_dict={features_placeholder: features,
 
 ### Consuming TFRecord data
 
-The `Dataset` API supports a variety of file formats so that you can process
+The `tf.data` API supports a variety of file formats so that you can process
 large datasets that do not fit in memory. For example, the TFRecord file format
 is a simple record-oriented binary format that many TensorFlow applications use
 for training data. The `tf.data.TFRecordDataset` class enables you to
@@ -628,7 +628,7 @@ TODO(mrry): Add this section.
 
 ### Processing multiple epochs
 
-The `Dataset` API offers two main ways to process multiple epochs of the same
+The `tf.data` API offers two main ways to process multiple epochs of the same
 data.
 
 The simplest way to iterate over a dataset in multiple epochs is to use the
@@ -693,7 +693,7 @@ dataset = dataset.repeat()
 The @{tf.train.MonitoredTrainingSession} API simplifies many aspects of running
 TensorFlow in a distributed setting. `MonitoredTrainingSession` uses the
 @{tf.errors.OutOfRangeError} to signal that training has completed, so to use it
-with the `Dataset` API, we recommend using
+with the `tf.data` API, we recommend using
 `Dataset.make_one_shot_iterator()`. For example:
 
 ```python
@@ -735,7 +735,7 @@ def dataset_input_fn():
     parsed = tf.parse_single_example(record, keys_to_features)
 
     # Perform additional preprocessing on the parsed data.
-    image = tf.decode_jpeg(parsed["image_data"])
+    image = tf.image.decode_jpeg(parsed["image_data"])
     image = tf.reshape(image, [299, 299, 1])
     label = tf.cast(parsed["label"], tf.int32)
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 36a016e880213a5305805247a95a19ad954e2c92..1a32882121efb1aa906bf6fb846194709d0f700e 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -9,11 +9,19 @@ lets you view the internal structure and states of running TensorFlow graphs
 during training and inference, which is difficult to debug with general-purpose
 debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
 
-> NOTE: The system requirements of tfdbg on supported external platforms include
-> the following. On Mac OS X, the `ncurses` library is required. It can be
-> installed with `brew install homebrew/dupes/ncurses`. On Windows, `pyreadline`
-> is required. If you use Anaconda3, you can install it with a command
+> NOTE: TensorFlow debugger uses a
+> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
+> text user interface. On Mac OS X, the `ncurses` library is required and can
+> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
+> isn't as well supported, so a
+> [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
+> be used with tfdbg by installing `pyreadline` with pip.
+> If you use Anaconda3, you can install it with a command
 > such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
+> Unofficial Windows curses packages can be downloaded
+> [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
+> installed using `pip install <your_version>.whl`, however curses on Windows
+> may not work as reliably as curses on Linux or Mac.
 
 This tutorial demonstrates how to use the **tfdbg** command-line interface
 (CLI) to debug the appearance of [`nan`s](https://en.wikipedia.org/wiki/NaN)
@@ -149,6 +157,7 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `pt <tensor>[slicing]` | Print a subarray of tensor, using [numpy](http://www.numpy.org/)-style array slicing. | `pt hidden/Relu:0[0:50,:]` |
 | | `-a` | Print the entirety of a large tensor, without using ellipses. (May take a long time for large tensors.) | `pt -a hidden/Relu:0[0:50,:]` |
 | | `-r <range>` | Highlight elements falling into specified numerical range. Multiple ranges can be used in conjunction. | `pt hidden/Relu:0 -a -r [[-inf,-1],[1,inf]]` |
+| | `-n <number>` | Print dump corresponding to specified 0-based dump number. Required for tensors with multiple dumps. | `pt -n 0 hidden/Relu:0` |
 | | `-s` | Include a summary of the numeric values of the tensor (applicable only to non-empty tensors with Boolean and numeric types such as `int*` and `float*`.) | `pt -s hidden/Relu:0[0:50,:]` |
 | **`@[coordinates]`** | | Navigate to specified element in `pt` output. | `@[10,0]` or `@10,0` |
 | **`/regex`** | |  [less](https://linux.die.net/man/1/less)-style search for given regular expression. | `/inf` |
@@ -166,10 +175,12 @@ Try the following commands at the `tfdbg>` prompt (referencing the code at
 | | `-r` | List the inputs to node, recursively (the input tree.) | `li -r hidden/Relu:0` |
 | | `-d <max_depth>` | Limit recursion depth under the `-r` mode. | `li -r -d 3 hidden/Relu:0` |
 | | `-c` | Include control inputs. | `li -c -r hidden/Relu:0` |
+| | `-t` | Show op types of input nodes. | `li -t -r hidden/Relu:0` |
 | **`lo`** | | **List output recipients of node** | |
 | | `-r` | List the output recipients of node, recursively (the output tree.) | `lo -r hidden/Relu:0` |
 | | `-d <max_depth>` | Limit recursion depth under the `-r` mode. | `lo -r -d 3 hidden/Relu:0` |
 | | `-c` | Include recipients via control edges. | `lo -c -r hidden/Relu:0` |
+| | `-t` | Show op types of recipient nodes. | `lo -t -r hidden/Relu:0` |
 | **`ls`** | | **List Python source files involved in node creation.** | |
 | | `-p <path_pattern>` | Limit output to source files matching given regular-expression path pattern. | `ls -p .*debug_mnist.*` |
 | | `-n` | Limit output to node names matching given regular-expression pattern. | `ls -n Softmax.*` |
@@ -381,7 +392,7 @@ diff = -(y_ * tf.log(y))
 to the built-in, numerically-stable implementation of softmax cross-entropy:
 
 ```python
-diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits)
+diff = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=logits)
 ```
 
 Rerun with the `--debug` flag as follows:
@@ -509,8 +520,12 @@ model.fit(...)  # This will break into the TFDBG CLI.
 
 ## Debugging tf-slim with TFDBG
 
-TFDBG currently supports only training with
+TFDBG supports debugging of training and evaluation with
 [tf-slim](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim).
+As detailed below, training and evaluation require slightly different debugging
+workflows.
+
+### Debugging training in tf-slim
 To debug the training process, provide `LocalCLIDebugWrapperSession` to the
 `session_wrapper` argument of `slim.learning.train()`. For example:
 
@@ -519,13 +534,31 @@ import tensorflow as tf
 from tensorflow.python import debug as tf_debug
 
 # ... Code that creates the graph and the train_op ...
-tf.contrib.slim.learning_train(
+tf.contrib.slim.learning.train(
     train_op,
     logdir,
     number_of_steps=10,
     session_wrapper=tf_debug.LocalCLIDebugWrapperSession)
 ```
 
+### Debugging evaluation in tf-slim
+To debug the evaluation process, provide `LocalCLIDebugHook` to the
+`hooks` argument of `slim.evaluation.evaluate_once()`. For example:
+
+``` python
+import tensorflow as tf
+from tensorflow.python import debug as tf_debug
+
+# ... Code that creates the graph and the eval and final ops ...
+tf.contrib.slim.evaluation.evaluate_once(
+    '',
+    checkpoint_path,
+    logdir,
+    eval_op=my_eval_op,
+    final_op=my_value_op,
+    hooks=[tf_debug.LocalCLIDebugHook()])
+```
+
 ## Offline Debugging of Remotely-Running Sessions
 
 Often, your model is running on a remote machine or a process that you don't
diff --git a/tensorflow/docs_src/programmers_guide/estimators.md b/tensorflow/docs_src/programmers_guide/estimators.md
index d465679817b030fe65f038750d1006d9749ad748..8b6cbbcd170efaa101af93e72c1ec24191e5759d 100644
--- a/tensorflow/docs_src/programmers_guide/estimators.md
+++ b/tensorflow/docs_src/programmers_guide/estimators.md
@@ -166,11 +166,29 @@ keras_inception_v3 = tf.keras.applications.inception_v3.InceptionV3(weights=None
 keras_inception_v3.compile(optimizer=tf.keras.optimizers.SGD(lr=0.0001, momentum=0.9),
                           loss='categorical_crossentropy',
                           metric='accuracy')
-# Create an Estimator from the compiled Keras model.
+# Create an Estimator from the compiled Keras model. Note the initial model
+# state of the keras model is preserved in the created Estimator.
 est_inception_v3 = tf.keras.estimator.model_to_estimator(keras_model=keras_inception_v3)
-# Treat the derived Estimator as you would any other Estimator. For example,
-# the following derived Estimator calls the train method:
-est_inception_v3.train(input_fn=my_training_set, steps=2000)
+
+# Treat the derived Estimator as you would with any other Estimator.
+# First, recover the input name(s) of Keras model, so we can use them as the
+# feature column name(s) of the Estimator input function:
+keras_inception_v3.input_names  # print out: ['input_1']
+# Once we have the input name(s), we can create the input function, for example,
+# for input(s) in the format of numpy ndarray:
+train_input_fn = tf.estimator.inputs.numpy_input_fn(
+    x={"input_1": train_data},
+    y=train_labels,
+    num_epochs=1,
+    shuffle=False)
+# To train, we call Estimator's train function:
+est_inception_v3.train(input_fn=train_input_fn, steps=2000)
 ```
+Note that the names of feature columns and labels of a keras estimator come from
+the corresponding compiled keras model. For example, the input key names for
+@{$get_started/input_fn} in above `est_inception_v3` estimator can be obtained
+from `keras_inception_v3.input_names`, and similarly, the predicted output
+names can be obtained from `keras_inception_v3.output_names`.
+
 For more details, please refer to the documentation for
 @{tf.keras.estimator.model_to_estimator}.
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index c08043835a3c575939d170c52f7f28efb5868c21..984058297f9ae1ad25ea4c0ef036f0477a6ac024 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -329,7 +329,7 @@ described below.
 * **`graph`.** By default, a new @{tf.Session} will be bound to---and only able
   to run operations in---the current default graph. If you are using multiple
   graphs in your program (see [Programming with multiple
-  graphs](programming-with-multiple-graphs) for more details), you can specify
+  graphs](#programming_with_multiple_graphs) for more details), you can specify
   an explicit @{tf.Graph} when you construct the session.
 
 * **`config`.** This argument allows you to specify a @{tf.ConfigProto} that
diff --git a/tensorflow/docs_src/programmers_guide/saved_model.md b/tensorflow/docs_src/programmers_guide/saved_model.md
index 6bc2cbb9e30b7dabd84c1659823fe6c1fe0bf2c5..54693f3d4d356da93e6e31595d04ed58e173e061 100644
--- a/tensorflow/docs_src/programmers_guide/saved_model.md
+++ b/tensorflow/docs_src/programmers_guide/saved_model.md
@@ -33,7 +33,7 @@ roughly speaking, map variable names to tensor values.
 
 Create a `Saver` with `tf.train.Saver()` to manage all variables in the
 model. For example, the following snippet demonstrates how to call the
-`tf.train.Saver.save` method to save variables to a checkpoint file:
+`tf.train.Saver.save` method to save variables to checkpoint files:
 
 ```python
 # Create some variables.
@@ -58,7 +58,7 @@ with tf.Session() as sess:
   dec_v2.op.run()
   # Save the variables to disk.
   save_path = saver.save(sess, "/tmp/model.ckpt")
-  print("Model saved in file: %s" % save_path)
+  print("Model saved in path: %s" % save_path)
 ```
 
 
@@ -66,10 +66,10 @@ with tf.Session() as sess:
 ### Restoring variables
 
 The `tf.train.Saver` object not only saves variables to checkpoint files, it
-also restores variables.  Note that when you restore variables from a file you
-do not have to initialize them beforehand. For example, the following snippet
-demonstrates how to call the `tf.train.Saver.restore` method to restore
-variables from a checkpoint file:
+also restores variables. Note that when you restore variables you do not have
+to initialize them beforehand. For example, the following snippet demonstrates
+how to call the `tf.train.Saver.restore` method to restore variables from the
+checkpoint files:
 
 ```python
 tf.reset_default_graph()
@@ -92,6 +92,12 @@ with tf.Session() as sess:
   print("v2 : %s" % v2.eval())
 ```
 
+Notes:
+
+*  There is not a physical file called "/tmp/model.ckpt". It is the **prefix**
+   of filenames created for the checkpoint. Users only interact with the
+   prefix instead of physical checkpoint files.
+
 
 ### Choosing which variables to save and restore
 
@@ -160,7 +166,7 @@ Notes:
 
 ### Inspect variables in a checkpoint
 
-We can quickly inspect variables in a checkpoint with the 
+We can quickly inspect variables in a checkpoint with the
 [`inspect_checkpoint`](https://www.tensorflow.org/code/tensorflow/python/tools/inspect_checkpoint.py) library.
 
 Continuing from the save/restore examples shown earlier:
@@ -238,7 +244,7 @@ For example, the following code suggests a typical way to use
 ```python
 export_dir = ...
 ...
-builder = tf.saved_model_builder.SavedModelBuilder(export_dir)
+builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
 with tf.Session(graph=tf.Graph()) as sess:
   ...
   builder.add_meta_graph_and_variables(sess,
diff --git a/tensorflow/docs_src/programmers_guide/tensors.md b/tensorflow/docs_src/programmers_guide/tensors.md
index d6f80430cdbb133a486db69bd30a1fae151e3378..47d4db2a568c9f8009982e44a85e44f0250860c1 100644
--- a/tensorflow/docs_src/programmers_guide/tensors.md
+++ b/tensorflow/docs_src/programmers_guide/tensors.md
@@ -29,8 +29,8 @@ Some types of tensors are special, and these will be covered in other
 units of the Programmer's guide. The main ones are:
 
   * `tf.Variable`
-  * `tf.Constant`
-  * `tf.Placeholder`
+  * `tf.constant`
+  * `tf.placeholder`
   * `tf.SparseTensor`
 
 With the exception of `tf.Variable`, the value of a tensor is immutable, which
@@ -43,8 +43,8 @@ generating a random number.
 
 The **rank** of a `tf.Tensor` object is its number of dimensions. Synonyms for
 rank include **order** or **degree** or **n-dimension**.
-Note that rank in TensorFlow is not the same as matrix rank in mathematics. 
-As the following table shows, each rank in TensorFlow corresponds to a 
+Note that rank in TensorFlow is not the same as matrix rank in mathematics.
+As the following table shows, each rank in TensorFlow corresponds to a
 different mathematical entity:
 
 Rank | Math entity
@@ -56,7 +56,7 @@ Rank | Math entity
 n | n-Tensor (you get the idea)
 
 
-### Rank 0 
+### Rank 0
 
 The following snippet demonstrates creating a few rank 0 variables:
 
@@ -64,7 +64,7 @@ The following snippet demonstrates creating a few rank 0 variables:
 mammal = tf.Variable("Elephant", tf.string)
 ignition = tf.Variable(451, tf.int16)
 floating = tf.Variable(3.14159265359, tf.float64)
-its_complicated = tf.Variable((12.3, -4.85), tf.complex64)
+its_complicated = tf.Variable(12.3 - 4.85j, tf.complex64)
 ```
 
 Note: A string is treated as a single item in TensorFlow, not as a sequence of
@@ -79,7 +79,7 @@ initial value. For example:
 mystr = tf.Variable(["Hello"], tf.string)
 cool_numbers  = tf.Variable([3.14159, 2.71828], tf.float32)
 first_primes = tf.Variable([2, 3, 5, 7, 11], tf.int32)
-its_very_complicated = tf.Variable([(12.3, -4.85), (7.5, -6.23)], tf.complex64)
+its_very_complicated = tf.Variable([12.3 - 4.85j, 7.5 - 6.23j], tf.complex64)
 ```
 
 
@@ -108,7 +108,7 @@ my_image = tf.zeros([10, 299, 299, 3])  # batch x height x width x color
 ### Getting a `tf.Tensor` object's rank
 
 To determine the rank of a `tf.Tensor` object, call the `tf.rank` method.
-For example, the following method programmatically determines the rank 
+For example, the following method programmatically determines the rank
 of the `tf.Tensor` defined in the previous section:
 
 ```python
@@ -275,8 +275,8 @@ Graphs and Sessions for more information).
 
 Sometimes it is not possible to evaluate a `tf.Tensor` with no context because
 its value might depend on dynamic information that is not available. For
-example, tensors that depend on `Placeholder`s can't be evaluated without
-providing a value for the `Placeholder`.
+example, tensors that depend on `placeholder`s can't be evaluated without
+providing a value for the `placeholder`.
 
 ``` python
 p = tf.placeholder(tf.float32)
diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md
index f310b89380473a613540e880ff1dddc3c3406f14..bac385c02cee8906a25a1ae37eae7eec94151276 100644
--- a/tensorflow/docs_src/programmers_guide/variables.md
+++ b/tensorflow/docs_src/programmers_guide/variables.md
@@ -37,7 +37,7 @@ You may optionally specify the `dtype` and initializer to `tf.get_variable`. For
 example:
 
 ``` python
-my_int_variable = tf.get_variable("my_int_variable", [1, 2, 3], dtype=tf.int32, 
+my_int_variable = tf.get_variable("my_int_variable", [1, 2, 3], dtype=tf.int32,
   initializer=tf.zeros_initializer)
 ```
 
@@ -45,7 +45,7 @@ TensorFlow provides many convenient initializers. Alternatively, you may
 initialize a `tf.Variable` to have the value of a `tf.Tensor`. For example:
 
 ``` python
-other_variable = tf.get_variable("other_variable", dtype=tf.int32, 
+other_variable = tf.get_variable("other_variable", dtype=tf.int32,
   initializer=tf.constant([23, 42]))
 ```
 
@@ -66,13 +66,13 @@ By default every `tf.Variable` gets placed in the following two collections:
 multiple devices,
  * `tf.GraphKeys.TRAINABLE_VARIABLES`--- variables for which TensorFlow will
    calculate gradients.
- 
+
 If you don't want a variable to be trainable, add it to the
 `tf.GraphKeys.LOCAL_VARIABLES` collection instead. For example, the following
 snippet demonstrates how to add a variable named `my_local` to this collection:
 
 ``` python
-my_local = tf.get_variable("my_local", shape=(), 
+my_local = tf.get_variable("my_local", shape=(),
 collections=[tf.GraphKeys.LOCAL_VARIABLES])
 ```
 
@@ -80,8 +80,8 @@ Alternatively, you can specify `trainable=False` as an argument to
 `tf.get_variable`:
 
 ``` python
-my_non_trainable = tf.get_variable("my_non_trainable", 
-                                   shape=(), 
+my_non_trainable = tf.get_variable("my_non_trainable",
+                                   shape=(),
                                    trainable=False)
 ```
 
@@ -126,7 +126,7 @@ cluster_spec = {
     "ps": ["ps0:2222", "ps1:2222"],
     "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]}
 with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
-  v = tf.get_variable("v", shape=[20, 20])  # this variable is placed 
+  v = tf.get_variable("v", shape=[20, 20])  # this variable is placed
                                             # in the parameter server
                                             # by the replica_device_setter
 ```
@@ -142,7 +142,7 @@ high-level frameworks such as `tf.contrib.slim`, `tf.estimator.Estimator` and
 Explicit initialization is otherwise useful because it allows you not to rerun
 potentially expensive initializers when reloading a model from a checkpoint as
 well as allowing determinism when randomly-initialized variables are shared in a
-distributed setting. 
+distributed setting.
 
 To initialize all trainable variables in one go, before training starts, call
 `tf.global_variables_initializer()`. This function returns a single operation
@@ -205,7 +205,7 @@ methods:
 v = tf.get_variable("v", shape=(), initializer=tf.zeros_initializer())
 assignment = v.assign_add(1)
 tf.global_variables_initializer().run()
-assignment.run()
+sess.run(assignment)  # or assignment.op.run()
 ```
 
 Most TensorFlow optimizers have specialized ops that efficiently update the
diff --git a/tensorflow/docs_src/tutorials/deep_cnn.md b/tensorflow/docs_src/tutorials/deep_cnn.md
index 6f802fd106d0e7cc8b2049af2548c51803b43195..679754020470dddfcffa76e62ca8f55a439ec4f5 100644
--- a/tensorflow/docs_src/tutorials/deep_cnn.md
+++ b/tensorflow/docs_src/tutorials/deep_cnn.md
@@ -195,9 +195,8 @@ The usual method for training a network to perform N-way classification is
 aka. *softmax regression*. Softmax regression applies a
 @{tf.nn.softmax$softmax} nonlinearity to the
 output of the network and calculates the
-@{tf.nn.softmax_cross_entropy_with_logits$cross-entropy}
-between the normalized predictions and a
-@{tf.sparse_to_dense$1-hot encoding} of the label.
+@{tf.nn.sparse_softmax_cross_entropy_with_logits$cross-entropy}
+between the normalized predictions and the label index.
 For regularization, we also apply the usual
 @{tf.nn.l2_loss$weight decay} losses to all learned
 variables.  The objective function for the model is the sum of the cross entropy
diff --git a/tensorflow/docs_src/tutorials/image_recognition.md b/tensorflow/docs_src/tutorials/image_recognition.md
index ddb771700a03d0d4f60ff3d26afbef9d861b5691..32257f87d6662f44536f45510b6a7c82628de2ff 100644
--- a/tensorflow/docs_src/tutorials/image_recognition.md
+++ b/tensorflow/docs_src/tutorials/image_recognition.md
@@ -5,7 +5,7 @@ tell apart a lion and a jaguar, read a sign, or recognize a human's face.
 But these are actually hard problems to solve with a computer: they only
 seem easy because our brains are incredibly good at understanding images.
 
-In the last few years the field of machine learning has made tremendous
+In the last few years, the field of machine learning has made tremendous
 progress on addressing these difficult problems. In particular, we've
 found that a kind of model called a deep
 [convolutional neural network](https://colah.github.io/posts/2014-07-Conv-Nets-Modular/)
@@ -42,7 +42,7 @@ For example, here are the results from [AlexNet] classifying some images:
 To compare models, we examine how often the model fails to predict the
 correct answer as one of their top 5 guesses -- termed "top-5 error rate".
 [AlexNet] achieved by setting a top-5 error rate of 15.3% on the 2012
-validation data set; [Inception (GoogLeNet)] achieved 6.67%; 
+validation data set; [Inception (GoogLeNet)] achieved 6.67%;
 [BN-Inception-v2] achieved 4.9%; [Inception-v3] reaches 3.46%.
 
 > How well do humans do on ImageNet Challenge? There's a [blog post] by
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index 5708b272780f14c5cab6863078ad3a5f466f0daf..52e6980e0070cdc6d03275c891283c25df4b31a1 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -14,10 +14,11 @@ laptop, without requiring a GPU. This tutorial will show you how to run the
 example script on your own images, and will explain some of the options you have
 to help control the training process.
 
-Note: This version of the tutorial mainly uses bazel. A bazel free version is
-also available
+Note: A version of this tutorial is also available
 [as a codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0).
 
+Before you start, you must @{$install$install tensorflow}.
+
 [TOC]
 
 ## Training on Flowers
@@ -38,26 +39,31 @@ curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
 tar xzf flower_photos.tgz
 ```
 
-Once you have the images, you can build the retrainer like this, from the root
-of your TensorFlow source directory:
+Once you have the images, you can clone the tensorflow repository using the
+following command (these examples are not included in the installation):
 
 ```sh
-bazel build tensorflow/examples/image_retraining:retrain
+git clone https://github.com/tensorflow/tensorflow
+```
+
+Then checkout the version of the tensorflow repository matching your
+installation and this tutorial as follows:
+
+``` sh
+cd tensorflow
+git checkout {version}
 ```
 
-If you have a machine which supports
-[the AVX instruction set](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions)
-(common in x86 CPUs produced in the last few years) you can improve the running
-speed of the retraining by building for that architecture, like this (after choosing appropriate options in `configure`):
+In the simplest cases the retrainer can then be run like this:
 
 ```sh
-bazel build --config opt tensorflow/examples/image_retraining:retrain
+python tensorflow/examples/image_retraining/retrain.py --image_dir ~/flower_photos
 ```
 
-The retrainer can then be run like this:
+The script has many other options. You can get a full listing with:
 
 ```sh
-bazel-bin/tensorflow/examples/image_retraining/retrain --image_dir ~/flower_photos
+python tensorflow/examples/image_retraining/retrain.py -h
 ```
 
 This script loads the pre-trained Inception v3 model, removes the old top layer,
@@ -149,26 +155,28 @@ can read in, so you can start using your new model immediately. Since you've
 replaced the top layer, you will need to specify the new name in the script, for
 example with the flag `--output_layer=final_result` if you're using label_image.
 
-Here's an example of how to build and run the label_image example with your
+Here's an example of how to run the label_image example with your
 retrained graphs:
 
 ```sh
-bazel build tensorflow/examples/image_retraining:label_image && \
-bazel-bin/tensorflow/examples/image_retraining/label_image \
+python tensorflow/examples/label_image/label_image.py \
 --graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---output_layer=final_result:0 \
+--input_layer=Mul \
+--output_layer=final_result \
+--input_mean=128 --input_std=128 \
 --image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
 ```
 
 You should see a list of flower labels, in most cases with daisy on top
 (though each retrained model may be slightly different). You can replace the
-`--image` parameter with your own images to try those out, and use the C++ code
-as a template to integrate with your own applications.
+`--image` parameter with your own images to try those out.
 
 If you'd like to use the retrained model in your own Python program, then the
 above
-[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/image_retraining/label_image.py)
-is a reasonable starting point.
+[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/label_image/label_image.py)
+is a reasonable starting point. The `label_image`
+directory also contains C++ code which you can use as a template to integrate
+tensorflow with your own applications.
 
 If you find the default Inception v3 model is too large or slow for your
 application, take a look at the [Other Model Architectures section](/tutorials/image_retraining#other_model_architectures)
@@ -372,3 +380,18 @@ programs, you'll need to feed in an image of the specified size converted to a
 float range into the 'input' tensor. Typically 24-bit images are in the range
 [0,255], and you must convert them to the [-1,1] float range expected by the
 model with the formula  `(image - 128.)/128.`.
+
+The default arguments for the `label_image` script are set for Inception V3.
+To use it with a MobileNet, specify the above normalization parameters as
+`input_mean` and `input_std` on the command line. You also must specify the
+image size that your model expects, as follows:
+
+```sh
+python tensorflow/examples/label_image/label_image.py \
+--graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
+--input_layer=input \
+--output_layer=final_result:0 \
+--input_height=224 --input_width=224 \
+--input_mean=128 --input_std=128 \
+--image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
+```
diff --git a/tensorflow/docs_src/tutorials/index.md b/tensorflow/docs_src/tutorials/index.md
index a34dbd69569be9cd234e98009ed148080fbbdb70..6e24f47882712591981a891b56e903ef85deceb7 100644
--- a/tensorflow/docs_src/tutorials/index.md
+++ b/tensorflow/docs_src/tutorials/index.md
@@ -46,6 +46,10 @@ The following tutorials focus on linear models:
   * @{$audio_recognition$Simple Audio Recognition}, which shows how to
     build a basic speech recognition network.
 
+The following tutorial covers building a classification model for sequences:
+
+  * ${$recurrent_quickdraw$Classifying Drawings using Recurrent Neural Networks}
+
 Although TensorFlow specializes in machine learning, you may also use
 TensorFlow to solve other kinds of math problems.  For example:
 
diff --git a/tensorflow/docs_src/tutorials/layers.md b/tensorflow/docs_src/tutorials/layers.md
index e808a3677f2a3e89597ef82cc86dd3646775d693..7c2029c4428b298ffb393d12af704a96b368f723 100644
--- a/tensorflow/docs_src/tutorials/layers.md
+++ b/tensorflow/docs_src/tutorials/layers.md
@@ -169,9 +169,7 @@ def cnn_model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/docs_src/tutorials/leftnav_files b/tensorflow/docs_src/tutorials/leftnav_files
index 5a5d6ca558867e1c8f3dca221a98ca7c0a7ee986..e612961ae05b6d8542cf0cd6d2064a7f972dc7cd 100644
--- a/tensorflow/docs_src/tutorials/leftnav_files
+++ b/tensorflow/docs_src/tutorials/leftnav_files
@@ -6,6 +6,7 @@ layers.md
 deep_cnn.md
 word2vec.md
 recurrent.md
+recurrent_quickdraw.md
 seq2seq.md
 linear.md
 wide.md
diff --git a/tensorflow/docs_src/tutorials/linear.md b/tensorflow/docs_src/tutorials/linear.md
index a6517549c3635fb5dd251f3c3b7b8f876ab4e922..d333d01279067de47819410795505f731e14fed3 100644
--- a/tensorflow/docs_src/tutorials/linear.md
+++ b/tensorflow/docs_src/tutorials/linear.md
@@ -175,7 +175,7 @@ the name of a `FeatureColumn`. Each key's value is a tensor containing the
 values of that feature for all data instances. See
 @{$input_fn$Building Input Functions with tf.estimator} for a
 more comprehensive look at input functions, and `input_fn` in the
-[linear models tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py)
+[linear models tutorial code](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
 for an example implementation of an input function.
 
 The input function is passed to the `train()` and `evaluate()` calls that
diff --git a/tensorflow/docs_src/tutorials/recurrent_quickdraw.md b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
new file mode 100644
index 0000000000000000000000000000000000000000..7306b4bf568397470ff3e52a7aa83e75208b1af9
--- /dev/null
+++ b/tensorflow/docs_src/tutorials/recurrent_quickdraw.md
@@ -0,0 +1,410 @@
+# Recurrent Neural Networks for Drawing Classification
+
+[Quick, Draw!]: http://quickdraw.withgoogle.com
+
+[Quick, Draw!] is a game where a player is challenged to draw a number of
+objects and see if a computer can recognize the drawing.
+
+The recognition in [Quick, Draw!] is performed by a classifier that takes the
+user input, given as a sequence of strokes of points in x and y, and recognizes
+the object category that the user tried to draw.
+
+In this tutorial we'll show how to build an RNN-based recognizer for this
+problem. The model will use a combination of convolutional layers, LSTM layers,
+and a softmax output layer to classify the drawings:
+
+<center> ![RNN model structure](../images/quickdraw_model.png) </center>
+
+The figure above shows the structure of the model that we will build in this
+tutorial. The input is a drawing that is encoded as a sequence of strokes of
+points in x, y, and n, where n indicates whether a the point is the first point
+in a new stroke.
+
+Then, a series of 1-dimensional convolutions is applied. Then LSTM layers are
+applied and the sum of the outputs of all LSTM steps is fed into a softmax layer
+to make a classification decision among the classes of drawings that we know.
+
+This tutorial uses the data from actual [Quick, Draw!] games [that is publicly
+available](https://quickdraw.withgoogle.com/data). This dataset contains of 50M
+drawings in 345 categories.
+
+## Run the tutorial code
+
+To try the code for this tutorial:
+
+1.  @{$install$Install TensorFlow} if you haven't already.
+1.  Download the [tutorial code]
+(https://github.com/tensorflow/models/tree/master/tutorials/rnn/quickdraw/train_model.py).
+1.  [Download the data](#download-the-data) in `TFRecord` format from
+    [here](http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz) and unzip it. More details about [how to
+    obtain the original Quick, Draw!
+    data](#optional-download-the-full-quick-draw-data) and [how to convert that
+    to `TFRecord` files](#optional-converting-the-data) is available below.
+
+1.  Execute the tutorial code with the following command to train the RNN-based
+    model described in this tutorial. Make sure to adjust the paths to point to
+    the unzipped data from the download in step 3.
+
+```shell
+  python train_model.py \
+    --training_data=rnn_tutorial_data/training.tfrecord-?????-of-????? \
+    --eval_data=rnn_tutorial_data/eval.tfrecord-?????-of-????? \
+    --classes_file=rnn_tutorial_data/training.tfrecord.classes
+```
+
+## Tutorial details
+
+### Download the data
+
+We make the data that we use in this tutorial available as `TFRecord` files
+containing `TFExamples`. You can download the data from here:
+
+http://download.tensorflow.org/data/quickdraw_tutorial_dataset_v1.tar.gz
+
+Alternatively you can download the original data in `ndjson` format from the
+Google cloud and convert it to the `TFRecord` files containing `TFExamples`
+yourself as described in the next section.
+
+### Optional: Download the full Quick Draw Data
+
+The full [Quick, Draw!](https://quickdraw.withgoogle.com)
+[dataset](https://quickdraw.withgoogle.com/data) is available on Google Cloud
+Storage as [ndjson](http://ndjson.org/) files separated by category. You can
+[browse the list of files in Cloud
+Console](https://console.cloud.google.com/storage/quickdraw_dataset).
+
+To download the data we recommend using
+[gsutil](https://cloud.google.com/storage/docs/gsutil_install#install) to
+download the entire dataset. Note that the original .ndjson files require
+downloading ~22GB.
+
+Then use the following command to check that your gsutil installation works and
+that you can access the data bucket:
+
+```shell
+gsutil ls -r "gs://quickdraw_dataset/full/simplified/*"
+```
+
+which will output a long list of files like the following:
+
+```shell
+gs://quickdraw_dataset/full/simplified/The Eiffel Tower.ndjson
+gs://quickdraw_dataset/full/simplified/The Great Wall of China.ndjson
+gs://quickdraw_dataset/full/simplified/The Mona Lisa.ndjson
+gs://quickdraw_dataset/full/simplified/aircraft carrier.ndjson
+...
+```
+
+Then create a folder and download the dataset there.
+
+```shell
+mkdir rnn_tutorial_data
+cd rnn_tutorial_data
+gsutil -m cp "gs://quickdraw_dataset/full/simplified/*" .
+```
+
+This download will take a while and download a bit more than 23GB of data.
+
+### Optional: Converting the data
+
+To convert the `ndjson` files to
+@{$python/python_io#tfrecords_format_details$TFRecord} files containing
+${tf.train.Example} protos run the following command.
+
+```shell
+   python create_dataset.py --ndjson_path rnn_tutorial_data \
+      --output_path rnn_tutorial_data
+```
+
+This will store the data in 10 shards of
+@{$python/python_io#tfrecords_format_details$TFRecord} files with 10000 items
+per class for the training data and 1000 items per class as eval data.
+
+This conversion process is described in more detail in the following.
+
+The original QuickDraw data is formatted as `ndjson` files where each line
+contains a JSON object like the following:
+
+```json
+{"word":"cat",
+ "countrycode":"VE",
+ "timestamp":"2017-03-02 23:25:10.07453 UTC",
+ "recognized":true,
+ "key_id":"5201136883597312",
+ "drawing":[
+   [
+     [130,113,99,109,76,64,55,48,48,51,59,86,133,154,170,203,214,217,215,208,186,176,162,157,132],
+     [72,40,27,79,82,88,100,120,134,152,165,184,189,186,179,152,131,114,100,89,76,0,31,65,70]
+   ],[
+     [76,28,7],
+     [136,128,128]
+   ],[
+     [76,23,0],
+     [160,164,175]
+   ],[
+     [87,52,37],
+     [175,191,204]
+   ],[
+     [174,220,246,251],
+     [134,132,136,139]
+   ],[
+     [175,255],
+     [147,168]
+   ],[
+     [171,208,215],
+     [164,198,210]
+   ],[
+     [130,110,108,111,130,139,139,119],
+     [129,134,137,144,148,144,136,130]
+   ],[
+     [107,106],
+     [96,113]
+   ]
+ ]
+}
+```
+
+For our purpose of building a classifier we only care about the fields "`word`"
+and "`drawing`". While parsing the ndjson files, we process them line by line
+using a function that converts the strokes from the `drawing` field into a
+tensor of size `[number of points, 3]` containing the differences of consecutive
+points. This function also returns the class name as a string.
+
+```python
+def parse_line(ndjson_line):
+  """Parse an ndjson line and return ink (as np array) and classname."""
+  sample = json.loads(ndjson_line)
+  class_name = sample["word"]
+  inkarray = sample["drawing"]
+  stroke_lengths = [len(stroke[0]) for stroke in inkarray]
+  total_points = sum(stroke_lengths)
+  np_ink = np.zeros((total_points, 3), dtype=np.float32)
+  current_t = 0
+  for stroke in inkarray:
+    for i in [0, 1]:
+      np_ink[current_t:(current_t + len(stroke[0])), i] = stroke[i]
+    current_t += len(stroke[0])
+    np_ink[current_t - 1, 2] = 1  # stroke_end
+  # Preprocessing.
+  # 1. Size normalization.
+  lower = np.min(np_ink[:, 0:2], axis=0)
+  upper = np.max(np_ink[:, 0:2], axis=0)
+  scale = upper - lower
+  scale[scale == 0] = 1
+  np_ink[:, 0:2] = (np_ink[:, 0:2] - lower) / scale
+  # 2. Compute deltas.
+  np_ink = np_ink[1:, 0:2] - np_ink[0:-1, 0:2]
+  return np_ink, class_name
+```
+
+Since we want the data to be shuffled for writing we read from each of the
+category files in random order and write to a random shard.
+
+For the training data we read the first 10000 items for each class and for the
+eval data we read the next 1000 items for each class.
+
+This data is then reformatted into a tensor of shape `[num_training_samples,
+max_length, 3]`. Then we determine the bounding box of the original drawing in
+screen coordinates and normalize the size such that the drawing has unit height.
+
+<center> ![Size normalization](../images/quickdraw_sizenormalization.png) </center>
+
+Finally, we compute the differences between consecutive points and store these
+as a `VarLenFeature` in a
+[tensorflow.Example](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+under the key `ink`. In addition we store the `class_index` as a single entry
+`FixedLengthFeature` and the `shape` of the `ink` as a `FixedLengthFeature` of
+length 2.
+
+### Defining the model
+
+To define the model we create a new `Estimator`. If you want to read more about
+estimators, we recommend @{$extend/estimators$this tutorial}.
+
+To build the model, we:
+
+1.  reshape the input back into the original shape - where the mini batch is
+    padded to the maximal length of its contents. In addition to the ink data we
+    also have the lengths for each example and the target class. This happens in
+    the function [`_get_input_tensors`](#-get-input-tensors).
+
+1.  pass the input through to a series of convolution layers in
+    [`_add_conv_layers`](#-add-conv-layers).
+
+1.  pass the output of the convolutions into a series of bidirectional LSTM
+    layers in [`_add_rnn_layers`](#-add-rnn-layers). At the end of that, the
+    outputs for each time step are summed up to have a compact, fixed length
+    embedding of the input.
+
+1.  classify this embedding using a softmax layer in
+    [`_add_fc_layers`](#-add-fc-layers).
+
+In code this looks like:
+
+```python
+inks, lengths, targets = _get_input_tensors(features, targets)
+convolved = _add_conv_layers(inks)
+final_state = _add_rnn_layers(convolved, lengths)
+logits =_add_fc_layers(final_state)
+```
+
+### _get_input_tensors
+
+To obtain the input features we first obtain the shape from the features dict
+and then create a 1D tensor of size `[batch_size]` containing the lengths of the
+input sequences. The ink is stored as a SparseTensor in the features dict which
+we convert into a dense tensor and then reshape to be `[batch_size, ?, 3]`. And
+finally, if targets were passed in we make sure they are stored as a 1D tensor
+of size `[batch_size]`
+
+In code this looks like this:
+
+```python
+shapes = features["shape"]
+lengths = tf.squeeze(
+    tf.slice(shapes, begin=[0, 0], size=[params["batch_size"], 1]))
+inks = tf.reshape(
+    tf.sparse_tensor_to_dense(features["ink"]),
+    [params["batch_size"], -1, 3])
+if targets is not None:
+  targets = tf.squeeze(targets)
+```
+
+### _add_conv_layers
+
+The desired number of convolution layers and the lengths of the filters is
+configured through the parameters `num_conv` and `conv_len` in the `params`
+dict.
+
+The input is a sequence where each point has dimensionality 3. We are going to
+use 1D convolutions where we treat the 3 input features as channels. That means
+that the input is a `[batch_size, length, 3]` tensor and the output will be a
+`[batch_size, length, number_of_filters]` tensor.
+
+```python
+convolved = inks
+for i in range(len(params.num_conv)):
+  convolved_input = convolved
+  if params.batch_norm:
+    convolved_input = tf.layers.batch_normalization(
+        convolved_input,
+        training=(mode == tf.estimator.ModeKeys.TRAIN))
+  # Add dropout layer if enabled and not first convolution layer.
+  if i > 0 and params.dropout:
+    convolved_input = tf.layers.dropout(
+        convolved_input,
+        rate=params.dropout,
+        training=(mode == tf.estimator.ModeKeys.TRAIN))
+  convolved = tf.layers.conv1d(
+      convolved_input,
+      filters=params.num_conv[i],
+      kernel_size=params.conv_len[i],
+      activation=None,
+      strides=1,
+      padding="same",
+      name="conv1d_%d" % i)
+return convolved, lengths
+```
+
+### _add_rnn_layers
+
+We pass the output from the convolutions into bidirectional LSTM layers for
+which we use a helper function from contrib.
+
+```python
+outputs, _, _ = contrib_rnn.stack_bidirectional_dynamic_rnn(
+    cells_fw=[cell(params.num_nodes) for _ in range(params.num_layers)],
+    cells_bw=[cell(params.num_nodes) for _ in range(params.num_layers)],
+    inputs=convolved,
+    sequence_length=lengths,
+    dtype=tf.float32,
+    scope="rnn_classification")
+```
+
+see the code for more details and how to use `CUDA` accelerated implementations.
+
+To create a compact, fixed-length embedding, we sum up the output of the LSTMs.
+We first zero out the regions of the batch where the sequences have no data.
+
+```python
+mask = tf.tile(
+    tf.expand_dims(tf.sequence_mask(lengths, tf.shape(outputs)[1]), 2),
+    [1, 1, tf.shape(outputs)[2]])
+zero_outside = tf.where(mask, outputs, tf.zeros_like(outputs))
+outputs = tf.reduce_sum(zero_outside, axis=1)
+```
+
+### _add_fc_layers
+
+The embedding of the input is passed into a fully connected layer which we then
+use as a softmax layer.
+
+```python
+tf.layers.dense(final_state, params.num_classes)
+```
+
+### Loss, predictions, and optimizer
+
+Finally, we need to add a loss, a training op, and predictions to create the
+`ModelFn`:
+
+```python
+cross_entropy = tf.reduce_mean(
+    tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=targets, logits=logits))
+# Add the optimizer.
+train_op = tf.contrib.layers.optimize_loss(
+    loss=cross_entropy,
+    global_step=tf.train.get_global_step(),
+    learning_rate=params.learning_rate,
+    optimizer="Adam",
+    # some gradient clipping stabilizes training in the beginning.
+    clip_gradients=params.gradient_clipping_norm,
+    summaries=["learning_rate", "loss", "gradients", "gradient_norm"])
+predictions = tf.argmax(logits, axis=1)
+return model_fn_lib.ModelFnOps(
+    mode=mode,
+    predictions={"logits": logits,
+                 "predictions": predictions},
+    loss=cross_entropy,
+    train_op=train_op,
+    eval_metric_ops={"accuracy": tf.metrics.accuracy(targets, predictions)})
+```
+
+### Training and evaluating the model
+
+To train and evaluate the model we can rely on the functionalities of the
+`Estimator` APIs and easily run training and evaluation with the `Experiment`
+APIs:
+
+```python
+  estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      model_dir=output_dir,
+      config=config,
+      params=model_params)
+  # Train the model.
+  tf.contrib.learn.Experiment(
+      estimator=estimator,
+      train_input_fn=get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.TRAIN,
+          tfrecord_pattern=FLAGS.training_data,
+          batch_size=FLAGS.batch_size),
+      train_steps=FLAGS.steps,
+      eval_input_fn=get_input_fn(
+          mode=tf.contrib.learn.ModeKeys.EVAL,
+          tfrecord_pattern=FLAGS.eval_data,
+          batch_size=FLAGS.batch_size),
+      min_eval_frequency=1000)
+```
+
+Note that this tutorial is just a quick example on a relatively small dataset to
+get you familiar with the APIs of recurrent neural networks and estimators. Such
+models can be even more powerful if you try them on a large dataset.
+
+When training the model for 1M steps you can expect to get an accuracy of
+approximately of approximately 70% on the top-1 candidate. Note that this
+accuracy is sufficient to build the quickdraw game because of the game dynamics
+the user will be able to adjust their drawing until it is ready. Also, the game
+does not use the top-1 candidate only but accepts a drawing as correct if the
+target category shows up with a score better than a fixed threshold.
diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md
index ba16e12a723938b7d9a18681aeb9a1a361a319b1..68dda1f2222b4175cd891d727065c93da6a5e68f 100644
--- a/tensorflow/docs_src/tutorials/wide.md
+++ b/tensorflow/docs_src/tutorials/wide.md
@@ -2,9 +2,9 @@
 
 In this tutorial, we will use the tf.estimator API in TensorFlow to solve a
 binary classification problem: Given census data about a person such as age,
-gender, education and occupation (the features), we will try to predict whether
-or not the person earns more than 50,000 dollars a year (the target label). We
-will train a **logistic regression** model, and given an individual's
+education, marital status, and occupation (the features), we will try to predict
+whether or not the person earns more than 50,000 dollars a year (the target
+label). We will train a **logistic regression** model, and given an individual's
 information our model will output a number between 0 and 1, which can be
 interpreted as the probability that the individual has an annual income of over
 50,000 dollars.
@@ -15,31 +15,16 @@ To try the code for this tutorial:
 
 1.  @{$install$Install TensorFlow} if you haven't already.
 
-2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+2.  Download [the tutorial code](https://github.com/tensorflow/models/tree/master/official/wide_deep/).
 
-3.  Install the pandas data analysis library. tf.estimator doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3. Execute the data download script we provide to you:
 
-    a. Get `pip`:
-
-        # Ubuntu/Linux 64-bit
-        $ sudo apt-get install python-pip python-dev
-
-        # macOS
-        $ sudo easy_install pip
-        $ sudo easy_install --upgrade six
-
-    b. Use `pip` to install pandas:
-
-        $ pip install -U pandas
-
-    If you have trouble installing pandas, consult the
-    [instructions](https://pandas.pydata.org/pandas-docs/stable/install.html)
-    on the pandas site.
+        $ python data_download.py
 
 4. Execute the tutorial code with the following command to train the linear
 model described in this tutorial:
 
-        $ python wide_n_deep_tutorial.py --model_type=wide
+        $ python wide_deep.py --model_type=wide
 
 Read on to find out how this code builds its linear model.
 
@@ -47,51 +32,23 @@ Read on to find out how this code builds its linear model.
 
 The dataset we'll be using is the
 [Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/Census+Income).
-You can download the
-[training data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data)
-and [test data](https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test)
-manually or use code like this:
-
-```python
-import tempfile
-import urllib
-train_file = tempfile.NamedTemporaryFile()
-test_file = tempfile.NamedTemporaryFile()
-urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
-urllib.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)
-```
-
-Once the CSV files are downloaded, let's read them into
-[Pandas](https://pandas.pydata.org/) dataframes.
-
-```python
-import pandas as pd
-CSV_COLUMNS = [
-    "age", "workclass", "fnlwgt", "education", "education_num",
-    "marital_status", "occupation", "relationship", "race", "gender",
-    "capital_gain", "capital_loss", "hours_per_week", "native_country",
-    "income_bracket"]
-df_train = pd.read_csv(train_file.name, names=CSV_COLUMNS, skipinitialspace=True)
-df_test = pd.read_csv(test_file.name, names=CSV_COLUMNS, skipinitialspace=True, skiprows=1)
-```
+We have provided
+[data_download.py](https://github.com/tensorflow/models/tree/master/official/wide_deep/data_download.py)
+which downloads the code and performs some additional cleanup.
 
 Since the task is a binary classification problem, we'll construct a label
 column named "label" whose value is 1 if the income is over 50K, and 0
-otherwise.
-
-```python
-train_labels = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-test_labels = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
-```
+otherwise. For reference, see `input_fn` in
+[wide_deep.py](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py).
 
 Next, let's take a look at the dataframe and see which columns we can use to
 predict the target label. The columns can be grouped into two types—categorical
 and continuous columns:
 
 *   A column is called **categorical** if its value can only be one of the
-    categories in a finite set. For example, the native country of a person
-    (U.S., India, Japan, etc.) or the education level (high school, college,
-    etc.) are categorical columns.
+    categories in a finite set. For example, the relationship status of a person
+    (wife, husband, unmarried, etc.) or the education level (high school,
+    college, etc.) are categorical columns.
 *   A column is called **continuous** if its value can be any numerical value in
     a continuous range. For example, the capital gain of a person (e.g. $14,084)
     is a continuous column.
@@ -127,7 +84,7 @@ Here's a list of columns available in the Census Income dataset:
 :                :             : individual.                       :
 | income         | Categorical | ">50K" or "<=50K", meaning        |
 :                :             : whether the person makes more     :
-:                :             : than $50,000 annually.           :
+:                :             : than $50,000 annually.            :
 
 ## Converting Data into Tensors
 
@@ -136,50 +93,56 @@ Input Builder function. This builder function will not be called until it is
 later passed to tf.estimator.Estimator methods such as `train` and `evaluate`.
 The purpose of this function is to construct the input data, which is
 represented in the form of @{tf.Tensor}s or @{tf.SparseTensor}s.
-In more detail, the Input Builder function returns the following as a pair:
+In more detail, the input builder function returns the following as a pair:
 
-1.  `feature_cols`: A dict from feature column names to `Tensors` or
+1.  `features`: A dict from feature column names to `Tensors` or
     `SparseTensors`.
-2.  `label`: A `Tensor` containing the label column.
+2.  `labels`: A `Tensor` containing the label column.
 
-The keys of the `feature_cols` will be used to construct columns in the
-next section. Because we want to call the `train` and `evaluate` methods with
+The keys of the `features` will be used to construct columns in the next
+section. Because we want to call the `train` and `evaluate` methods with
 different data, we define a method that returns an input function based on the
 given data. Note that the returned input function will be called while
 constructing the TensorFlow graph, not while running the graph. What it is
 returning is a representation of the input data as the fundamental unit of
 TensorFlow computations, a `Tensor` (or `SparseTensor`).
 
-We use the `tf.estimator.inputs.pandas_input_fn` method to create an input
-function from pandas dataframes.
-Each continuous column in the train or test dataframe
-will be converted into a `Tensor`, which in general is a good format to
-represent dense data. For categorical data, we must represent the data as a
-`SparseTensor`. This data format is good for representing sparse data.
-Another more advanced way to represent input data would be to
-construct an @{$python/io_ops#inputs-and-readers$Inputs And Readers}
-that represents a file or other data source, and iterates through the file as
-TensorFlow runs the graph.
+Each continuous column in the train or test data will be converted into a
+`Tensor`, which in general is a good format to represent dense data. For
+categorical data, we must represent the data as a `SparseTensor`. This data
+format is good for representing sparse data. Our `input_fn` uses the `tf.data`
+API, which makes it easy to apply transformations to our dataset:
 
 ```python
-def input_fn(data_file, num_epochs, shuffle):
-  """Input builder function."""
-  df_data = pd.read_csv(
-      tf.gfile.Open(data_file),
-      names=CSV_COLUMNS,
-      skipinitialspace=True,
-      engine="python",
-      skiprows=1)
-  # remove NaN elements
-  df_data = df_data.dropna(how="any", axis=0)
-  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
-  return tf.estimator.inputs.pandas_input_fn(
-      x=df_data,
-      y=labels,
-      batch_size=100,
-      num_epochs=num_epochs,
-      shuffle=shuffle,
-      num_threads=5)
+def input_fn(data_file, num_epochs, shuffle, batch_size):
+  """Generate an input function for the Estimator."""
+  assert tf.gfile.Exists(data_file), (
+      '%s not found. Please make sure you have either run data_download.py or '
+      'set both arguments --train_data and --test_data.' % data_file)
+
+  def parse_csv(value):
+    print('Parsing', data_file)
+    columns = tf.decode_csv(value, record_defaults=_CSV_COLUMN_DEFAULTS)
+    features = dict(zip(_CSV_COLUMNS, columns))
+    labels = features.pop('income_bracket')
+    return features, tf.equal(labels, '>50K')
+
+  # Extract lines from input files using the Dataset API.
+  dataset = tf.data.TextLineDataset(data_file)
+
+  if shuffle:
+    dataset = dataset.shuffle(buffer_size=_SHUFFLE_BUFFER)
+
+  dataset = dataset.map(parse_csv, num_parallel_calls=5)
+
+  # We call repeat after shuffling, rather than before, to prevent separate
+  # epochs from blending together.
+  dataset = dataset.repeat(num_epochs)
+  dataset = dataset.batch(batch_size)
+
+  iterator = dataset.make_one_shot_iterator()
+  features, labels = iterator.get_next()
+  return features, labels
 ```
 
 ## Selecting and Engineering Features for the Model
@@ -198,13 +161,15 @@ To define a feature column for a categorical feature, we can create a
 `CategoricalColumn` using the tf.feature_column API. If you know the set of all
 possible feature values of a column and there are only a few of them, you can
 use `categorical_column_with_vocabulary_list`. Each key in the list will get
-assigned an auto-incremental ID starting from 0. For example, for the `gender`
-column we can assign the feature string "Female" to an integer ID of 0 and
-"Male" to 1 by doing:
+assigned an auto-incremental ID starting from 0. For example, for the
+`relationship` column we can assign the feature string "Husband" to an integer
+ID of 0 and "Not-in-family" to 1, etc., by doing:
 
 ```python
-gender = tf.feature_column.categorical_column_with_vocabulary_list(
-    "gender", ["Female", "Male"])
+relationship = tf.feature_column.categorical_column_with_vocabulary_list(
+    'relationship', [
+        'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
+        'Other-relative'])
 ```
 
 What if we don't know the set of possible values in advance? Not a problem. We
@@ -212,7 +177,7 @@ can use `categorical_column_with_hash_bucket` instead:
 
 ```python
 occupation = tf.feature_column.categorical_column_with_hash_bucket(
-    "occupation", hash_bucket_size=1000)
+    'occupation', hash_bucket_size=1000)
 ```
 
 What will happen is that each possible value in the feature column `occupation`
@@ -241,29 +206,29 @@ We'll do the similar trick to define the other categorical features:
 
 ```python
 education = tf.feature_column.categorical_column_with_vocabulary_list(
-    "education", [
-        "Bachelors", "HS-grad", "11th", "Masters", "9th",
-        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
-        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
-        "Preschool", "12th"
-    ])
+    'education', [
+        'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
+        'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
+        '5th-6th', '10th', '1st-4th', 'Preschool', '12th'])
+
 marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
-    "marital_status", [
-        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
-        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
-    ])
+    'marital_status', [
+        'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
+        'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])
+
 relationship = tf.feature_column.categorical_column_with_vocabulary_list(
-    "relationship", [
-        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
-        "Other-relative"
-    ])
+    'relationship', [
+        'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
+        'Other-relative'])
+
 workclass = tf.feature_column.categorical_column_with_vocabulary_list(
-    "workclass", [
-        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
-        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
-    ])
-native_country = tf.feature_column.categorical_column_with_hash_bucket(
-    "native_country", hash_bucket_size=1000)
+    'workclass', [
+        'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
+        'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])
+
+# To show an example of hashing:
+occupation = tf.feature_column.categorical_column_with_hash_bucket(
+    'occupation', hash_bucket_size=1000)
 ```
 
 ### Base Continuous Feature Columns
@@ -272,11 +237,11 @@ Similarly, we can define a `NumericColumn` for each continuous feature column
 that we want to use in the model:
 
 ```python
-age = tf.feature_column.numeric_column("age")
-education_num = tf.feature_column.numeric_column("education_num")
-capital_gain = tf.feature_column.numeric_column("capital_gain")
-capital_loss = tf.feature_column.numeric_column("capital_loss")
-hours_per_week = tf.feature_column.numeric_column("hours_per_week")
+age = tf.feature_column.numeric_column('age')
+education_num = tf.feature_column.numeric_column('education_num')
+capital_gain = tf.feature_column.numeric_column('capital_gain')
+capital_loss = tf.feature_column.numeric_column('capital_loss')
+hours_per_week = tf.feature_column.numeric_column('hours_per_week')
 ```
 
 ### Making Continuous Features Categorical through Bucketization
@@ -322,7 +287,7 @@ columns** to the model.
 
 ```python
 education_x_occupation = tf.feature_column.crossed_column(
-    ["education", "occupation"], hash_bucket_size=1000)
+    ['education', 'occupation'], hash_bucket_size=1000)
 ```
 
 We can also create a `CrossedColumn` over more than two columns. Each
@@ -332,7 +297,7 @@ or even another `CrossColumn`. Here's an example:
 
 ```python
 age_buckets_x_education_x_occupation = tf.feature_column.crossed_column(
-    [age_buckets, "education", "occupation"], hash_bucket_size=1000)
+    [age_buckets, 'education', 'occupation'], hash_bucket_size=1000)
 ```
 
 ## Defining The Logistic Regression Model
@@ -352,20 +317,18 @@ added to the `feature_columns` field of a model:
 
 ```python
 base_columns = [
-    gender, native_country, education, occupation, workclass, relationship,
+    education, marital_status, relationship, workclass, occupation,
     age_buckets,
 ]
 crossed_columns = [
     tf.feature_column.crossed_column(
-        ["education", "occupation"], hash_bucket_size=1000),
+        ['education', 'occupation'], hash_bucket_size=1000),
     tf.feature_column.crossed_column(
-        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
-    tf.feature_column.crossed_column(
-        ["native_country", "occupation"], hash_bucket_size=1000)
+        [age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
 ]
 
 model_dir = tempfile.mkdtemp()
-m = tf.estimator.LinearClassifier(
+model = tf.estimator.LinearClassifier(
     model_dir=model_dir, feature_columns=base_columns + crossed_columns)
 ```
 
@@ -377,34 +340,29 @@ in `model_dir`.
 ## Training and Evaluating Our Model
 
 After adding all the features to the model, now let's look at how to actually
-train the model. Training a model is just a one-liner using the tf.estimator
-API:
+train the model. Training a model is just a single command using the
+tf.estimator API:
 
 ```python
-# set num_epochs to None to get infinite stream of data.
-m.train(
-    input_fn=input_fn(train_file.name, num_epochs=None, shuffle=True),
-    steps=train_steps)
+model.train(input_fn=lambda: input_fn(train_data, num_epochs, True, batch_size))
 ```
 
 After the model is trained, we can evaluate how good our model is at predicting
 the labels of the holdout data:
 
 ```python
-results = m.evaluate(
-    input_fn=input_fn(test_file.name, num_epochs=1, shuffle=False),
-    steps=None)
-print("model directory = %s" % model_dir)
+results = model.evaluate(input_fn=lambda: input_fn(
+    test_data, 1, False, batch_size))
 for key in sorted(results):
-  print("%s: %s" % (key, results[key]))
+  print('%s: %s' % (key, results[key]))
 ```
 
-The first line of the output should be something like `accuracy: 0.83557522`,
-which means the accuracy is 83.6%. Feel free to try more features and
-transformations and see if you can do even better!
+The first line of the final output should be something like
+`accuracy: 0.83557522`, which means the accuracy is 83.6%. Feel free to try more
+features and transformations and see if you can do even better!
 
 If you'd like to see a working end-to-end example, you can download our
-[example code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+[example code](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
 and set the `model_type` flag to `wide`.
 
 ## Adding Regularization to Prevent Overfitting
@@ -421,12 +379,12 @@ In the Linear Model library, you can add L1 and L2 regularizations to the model
 as:
 
 ```
-m = tf.estimator.LinearClassifier(
+model = tf.estimator.LinearClassifier(
     model_dir=model_dir, feature_columns=base_columns + crossed_columns,
     optimizer=tf.train.FtrlOptimizer(
-      learning_rate=0.1,
-      l1_regularization_strength=1.0,
-      l2_regularization_strength=1.0))
+        learning_rate=0.1,
+        l1_regularization_strength=1.0,
+        l2_regularization_strength=1.0))
 ```
 
 One important difference between L1 and L2 regularization is that L1
@@ -447,17 +405,17 @@ you a desirable model size.
 Finally, let's take a minute to talk about what the Logistic Regression model
 actually looks like in case you're not already familiar with it. We'll denote
 the label as \\(Y\\), and the set of observed features as a feature vector
-\\(\mathbf{x}=[x_1, x_2, ..., x_d]\\). We define \\(Y=1\\) if an individual earned >
-50,000 dollars and \\(Y=0\\) otherwise. In Logistic Regression, the probability of
-the label being positive (\\(Y=1\\)) given the features \\(\mathbf{x}\\) is given
-as:
+\\(\mathbf{x}=[x_1, x_2, ..., x_d]\\). We define \\(Y=1\\) if an individual
+earned > 50,000 dollars and \\(Y=0\\) otherwise. In Logistic Regression, the
+probability of the label being positive (\\(Y=1\\)) given the features
+\\(\mathbf{x}\\) is given as:
 
 $$ P(Y=1|\mathbf{x}) = \frac{1}{1+\exp(-(\mathbf{w}^T\mathbf{x}+b))}$$
 
-where \\(\mathbf{w}=[w_1, w_2, ..., w_d]\\) are the model weights for the features
-\\(\mathbf{x}=[x_1, x_2, ..., x_d]\\). \\(b\\) is a constant that is often called
-the **bias** of the model. The equation consists of two parts—A linear model and
-a logistic function:
+where \\(\mathbf{w}=[w_1, w_2, ..., w_d]\\) are the model weights for the
+features \\(\mathbf{x}=[x_1, x_2, ..., x_d]\\). \\(b\\) is a constant that is
+often called the **bias** of the model. The equation consists of two parts—A
+linear model and a logistic function:
 
 *   **Linear Model**: First, we can see that \\(\mathbf{w}^T\mathbf{x}+b = b +
     w_1x_1 + ... +w_dx_d\\) is a linear model where the output is a linear
@@ -465,16 +423,17 @@ a logistic function:
     prediction one would make without observing any features. The model weight
     \\(w_i\\) reflects how the feature \\(x_i\\) is correlated with the positive
     label. If \\(x_i\\) is positively correlated with the positive label, the
-    weight \\(w_i\\) increases, and the probability \\(P(Y=1|\mathbf{x})\\) will be
-    closer to 1. On the other hand, if \\(x_i\\) is negatively correlated with the
-    positive label, then the weight \\(w_i\\) decreases and the probability
-    \\(P(Y=1|\mathbf{x})\\) will be closer to 0.
+    weight \\(w_i\\) increases, and the probability \\(P(Y=1|\mathbf{x})\\) will
+    be closer to 1. On the other hand, if \\(x_i\\) is negatively correlated
+    with the positive label, then the weight \\(w_i\\) decreases and the
+    probability \\(P(Y=1|\mathbf{x})\\) will be closer to 0.
 
 *   **Logistic Function**: Second, we can see that there's a logistic function
-    (also known as the sigmoid function) \\(S(t) = 1/(1+\exp(-t))\\) being applied
-    to the linear model. The logistic function is used to convert the output of
-    the linear model \\(\mathbf{w}^T\mathbf{x}+b\\) from any real number into the
-    range of \\([0, 1]\\), which can be interpreted as a probability.
+    (also known as the sigmoid function) \\(S(t) = 1/(1+\exp(-t))\\) being
+    applied to the linear model. The logistic function is used to convert the
+    output of the linear model \\(\mathbf{w}^T\mathbf{x}+b\\) from any real
+    number into the range of \\([0, 1]\\), which can be interpreted as a
+    probability.
 
 Model training is an optimization problem: The goal is to find a set of model
 weights (i.e. model parameters) to minimize a **loss function** defined over the
diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md
index 16f7925e8dd546a0c5407b31c6ae9b8cf0bd0853..44677a810bc5c253c198d81fae2be723c4f8ae4e 100644
--- a/tensorflow/docs_src/tutorials/wide_and_deep.md
+++ b/tensorflow/docs_src/tutorials/wide_and_deep.md
@@ -1,13 +1,12 @@
 # TensorFlow Wide & Deep Learning Tutorial
 
-In the previous @{$wide$TensorFlow Linear Model Tutorial},
-we trained a logistic regression model to predict the probability that the
-individual has an annual income of over 50,000 dollars using the
+In the previous @{$wide$TensorFlow Linear Model Tutorial}, we trained a logistic
+regression model to predict the probability that the individual has an annual
+income of over 50,000 dollars using the
 [Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/Census+Income).
-TensorFlow is
-great for training deep neural networks too, and you might be thinking which one
-you should choose—Well, why not both? Would it be possible to combine the
-strengths of both in one model?
+TensorFlow is great for training deep neural networks too, and you might be
+thinking which one you should choose—well, why not both? Would it be possible to
+combine the strengths of both in one model?
 
 In this tutorial, we'll introduce how to use the tf.estimator API to jointly
 train a wide linear model and a deep feed-forward neural network. This approach
@@ -40,33 +39,18 @@ To try the code for this tutorial:
 
 1.  @{$install$Install TensorFlow} if you haven't already.
 
-2.  Download [the tutorial code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+2.  Download [the tutorial code](https://github.com/tensorflow/models/tree/master/official/wide_deep/).
 
-3.  Install the pandas data analysis library. tf.estimator doesn't require pandas, but it does support it, and this tutorial uses pandas. To install pandas:
+3. Execute the data download script we provide to you:
 
-    a. Get `pip`:
+        $ python data_download.py
 
-        # Ubuntu/Linux 64-bit
-        $ sudo apt-get install python-pip python-dev
+4. Execute the tutorial code with the following command to train the wide and
+deep model described in this tutorial:
 
-        # Mac OS X
-        $ sudo easy_install pip
-        $ sudo easy_install --upgrade six
+        $ python wide_deep.py
 
-    b. Use `pip` to install pandas:
-
-        $ sudo pip install pandas
-
-    If you have trouble installing pandas, consult the
-    [instructions](https://pandas.pydata.org/pandas-docs/stable/install.html)
-    on the pandas site.
-
-4. Execute the tutorial code with the following command to train the linear
-model described in this tutorial:
-
-        $ python wide_n_deep_tutorial.py --model_type=wide_n_deep
-
-Read on to find out how this code builds its linear model.
+Read on to find out how this code builds its model.
 
 
 ## Define Base Feature Columns
@@ -78,43 +62,37 @@ part and the deep part of the model.
 ```python
 import tensorflow as tf
 
-gender = tf.feature_column.categorical_column_with_vocabulary_list(
-    "gender", ["Female", "Male"])
+# Continuous columns
+age = tf.feature_column.numeric_column('age')
+education_num = tf.feature_column.numeric_column('education_num')
+capital_gain = tf.feature_column.numeric_column('capital_gain')
+capital_loss = tf.feature_column.numeric_column('capital_loss')
+hours_per_week = tf.feature_column.numeric_column('hours_per_week')
+
 education = tf.feature_column.categorical_column_with_vocabulary_list(
-    "education", [
-        "Bachelors", "HS-grad", "11th", "Masters", "9th",
-        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
-        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
-        "Preschool", "12th"
-    ])
+    'education', [
+        'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
+        'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
+        '5th-6th', '10th', '1st-4th', 'Preschool', '12th'])
+
 marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
-    "marital_status", [
-        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
-        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
-    ])
+    'marital_status', [
+        'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
+        'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'])
+
 relationship = tf.feature_column.categorical_column_with_vocabulary_list(
-    "relationship", [
-        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
-        "Other-relative"
-    ])
+    'relationship', [
+        'Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried',
+        'Other-relative'])
+
 workclass = tf.feature_column.categorical_column_with_vocabulary_list(
-    "workclass", [
-        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
-        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
-    ])
+    'workclass', [
+        'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov',
+        'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'])
 
 # To show an example of hashing:
 occupation = tf.feature_column.categorical_column_with_hash_bucket(
-    "occupation", hash_bucket_size=1000)
-native_country = tf.feature_column.categorical_column_with_hash_bucket(
-    "native_country", hash_bucket_size=1000)
-
-# Continuous base columns.
-age = tf.feature_column.numeric_column("age")
-education_num = tf.feature_column.numeric_column("education_num")
-capital_gain = tf.feature_column.numeric_column("capital_gain")
-capital_loss = tf.feature_column.numeric_column("capital_loss")
-hours_per_week = tf.feature_column.numeric_column("hours_per_week")
+    'occupation', hash_bucket_size=1000)
 
 # Transformations.
 age_buckets = tf.feature_column.bucketized_column(
@@ -128,20 +106,20 @@ columns:
 
 ```python
 base_columns = [
-    gender, native_country, education, occupation, workclass, relationship,
+    education, marital_status, relationship, workclass, occupation,
     age_buckets,
 ]
 
 crossed_columns = [
     tf.feature_column.crossed_column(
-        ["education", "occupation"], hash_bucket_size=1000),
-    tf.feature_column.crossed_column(
-        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
+        ['education', 'occupation'], hash_bucket_size=1000),
     tf.feature_column.crossed_column(
-        ["native_country", "occupation"], hash_bucket_size=1000)
+        [age_buckets, 'education', 'occupation'], hash_bucket_size=1000),
 ]
 ```
 
+You can also see the @{$wide$TensorFlow Linear Model Tutorial} for more details.
+
 Wide models with crossed feature columns can memorize sparse interactions
 between features effectively. That being said, one limitation of crossed feature
 columns is that they do not generalize to feature combinations that have not
@@ -158,36 +136,35 @@ concatenated with the continuous features, and then fed into the hidden layers
 of a neural network in the forward pass. The embedding values are initialized
 randomly, and are trained along with all other model parameters to minimize the
 training loss. If you're interested in learning more about embeddings, check out
-the TensorFlow tutorial on
-[Vector Representations of Words](https://www.tensorflow.org/versions/r0.9/tutorials/word2vec/index.html),
-or [Word Embedding](https://en.wikipedia.org/wiki/Word_embedding) on Wikipedia.
+the TensorFlow tutorial on @{$word2vec$Vector Representations of Words} or
+[Word embedding](https://en.wikipedia.org/wiki/Word_embedding) on Wikipedia.
 
 Another way to represent categorical columns to feed into a neural network is
-via a multi-hot representation. This is often appropriate for categorical
-columns with only a few possible values. E.g. for the gender column, `"Male"`
-can be represented as `[1, 0]` and `"Female"` as `[0, 1]`. This is a fixed
-representation, whereas embeddings are more flexible and calculated at training
-time.
+via a one-hot or multi-hot representation. This is often appropriate for
+categorical columns with only a few possible values. As an example of a one-hot
+representation, for the relationship column, `"Husband"` can be represented as
+[1, 0, 0, 0, 0, 0], and `"Not-in-family"` as [0, 1, 0, 0, 0, 0], etc. This is a
+fixed representation, whereas embeddings are more flexible and calculated at
+training time.
 
 We'll configure the embeddings for the categorical columns using
 `embedding_column`, and concatenate them with the continuous columns.
-We also use `indicator_column` to create multi-hot representation of some
+We also use `indicator_column` to create multi-hot representations of some
 categorical columns.
 
 ```python
 deep_columns = [
-    tf.feature_column.indicator_column(workclass),
-    tf.feature_column.indicator_column(education),
-    tf.feature_column.indicator_column(gender),
-    tf.feature_column.indicator_column(relationship),
-    # To show an example of embedding
-    tf.feature_column.embedding_column(native_country, dimension=8),
-    tf.feature_column.embedding_column(occupation, dimension=8),
     age,
     education_num,
     capital_gain,
     capital_loss,
     hours_per_week,
+    tf.feature_column.indicator_column(workclass),
+    tf.feature_column.indicator_column(education),
+    tf.feature_column.indicator_column(marital_status),
+    tf.feature_column.indicator_column(relationship),
+    # To show an example of embedding
+    tf.feature_column.embedding_column(occupation, dimension=8),
 ]
 ```
 
@@ -221,11 +198,9 @@ handled for you under the hood, so you simply need to create a
 `DNNLinearCombinedClassifier`:
 
 ```python
-import tempfile
-model_dir = tempfile.mkdtemp()
-m = tf.estimator.DNNLinearCombinedClassifier(
-    model_dir=model_dir,
-    linear_feature_columns=crossed_columns,
+model = tf.estimator.DNNLinearCombinedClassifier(
+    model_dir='/tmp/census_model',
+    linear_feature_columns=base_columns + crossed_columns,
     dnn_feature_columns=deep_columns,
     dnn_hidden_units=[100, 50])
 ```
@@ -233,88 +208,32 @@ m = tf.estimator.DNNLinearCombinedClassifier(
 ## Training and Evaluating The Model
 
 Before we train the model, let's read in the Census dataset as we did in the
-@{$wide$TensorFlow Linear Model tutorial}. The code for
-input data processing is provided here again for your convenience:
+@{$wide$TensorFlow Linear Model tutorial}. See `data_download.py` as well as
+`input_fn` within
+[`wide_deep.py`](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py).
+
+After reading in the data, you can train and evaluate the model:
 
 ```python
-import pandas as pd
-import urllib
-
-# Define the column names for the data sets.
-CSV_COLUMNS = [
-    "age", "workclass", "fnlwgt", "education", "education_num",
-    "marital_status", "occupation", "relationship", "race", "gender",
-    "capital_gain", "capital_loss", "hours_per_week", "native_country",
-    "income_bracket"
-]
+# Train and evaluate the model every `FLAGS.epochs_per_eval` epochs.
+for n in range(FLAGS.train_epochs // FLAGS.epochs_per_eval):
+  model.train(input_fn=lambda: input_fn(
+      FLAGS.train_data, FLAGS.epochs_per_eval, True, FLAGS.batch_size))
 
-def maybe_download(train_data, test_data):
-  """Maybe downloads training data and returns train and test file names."""
-  if train_data:
-    train_file_name = train_data
-  else:
-    train_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
-        train_file.name)  # pylint: disable=line-too-long
-    train_file_name = train_file.name
-    train_file.close()
-    print("Training data is downloaded to %s" % train_file_name)
-
-  if test_data:
-    test_file_name = test_data
-  else:
-    test_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
-        test_file.name)  # pylint: disable=line-too-long
-    test_file_name = test_file.name
-    test_file.close()
-    print("Test data is downloaded to %s"% test_file_name)
-
-  return train_file_name, test_file_name
-
-def input_fn(data_file, num_epochs, shuffle):
-  """Input builder function."""
-  df_data = pd.read_csv(
-      tf.gfile.Open(data_file),
-      names=CSV_COLUMNS,
-      skipinitialspace=True,
-      engine="python",
-      skiprows=1)
-  # remove NaN elements
-  df_data = df_data.dropna(how="any", axis=0)
-  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
-  return tf.estimator.inputs.pandas_input_fn(
-      x=df_data,
-      y=labels,
-      batch_size=100,
-      num_epochs=num_epochs,
-      shuffle=shuffle,
-      num_threads=5)
-```
+  results = model.evaluate(input_fn=lambda: input_fn(
+      FLAGS.test_data, 1, False, FLAGS.batch_size))
 
-After reading in the data, you can train and evaluate the model:
+  # Display evaluation metrics
+  print('Results at epoch', (n + 1) * FLAGS.epochs_per_eval)
+  print('-' * 30)
 
-```python
-# set num_epochs to None to get infinite stream of data.
-m.train(
-    input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
-    steps=train_steps)
-# set steps to None to run evaluation until all data consumed.
-results = m.evaluate(
-    input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
-    steps=None)
-print("model directory = %s" % model_dir)
-for key in sorted(results):
-  print("%s: %s" % (key, results[key]))
+  for key in sorted(results):
+    print('%s: %s' % (key, results[key]))
 ```
 
-The first line of the output should be something like `accuracy: 0.84429705`. We
-can see that the accuracy was improved from about 83.6% using a wide-only linear
-model to about 84.4% using a Wide & Deep model. If you'd like to see a working
-end-to-end example, you can download our
-[example code](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py).
+The final output accuracy should be somewhere around 85.5%. If you'd like to
+see a working end-to-end example, you can download our
+[example code](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py).
 
 Note that this tutorial is just a quick example on a small dataset to get you
 familiar with the API. Wide & Deep Learning will be even more powerful if you
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 79202a38d7199033a9fefa8c6ba71e383aa0bf19..30a26d13c5734c5cf4a3b565c793db3e093c8271 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -126,6 +126,10 @@ the Android NDK and SDK must be installed on your system.
 2.  The Android NDK is required to build the native (C/C++) TensorFlow code. The
     current recommended version is 14b, which may be found
     [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
+
+      * NDK 16, the revision released in November 2017, is **incompatible** with
+        Bazel. See [here](https://github.com/tensorflow/tensorflow/issues/14918).
+
 3.  The Android SDK and build tools may be obtained
     [here](https://developer.android.com/tools/revisions/build-tools.html), or
     alternatively as part of [Android
@@ -133,8 +137,16 @@ the Android NDK and SDK must be installed on your system.
     23 is required to build the TF Android demo (though it will run on API >= 21
     devices).
 
+      - The Android Studio SDK Manager's NDK installer will install the latest
+        revision of the NDK, which is **incompatible** with Bazel. You'll need
+        to download an older version manually, as (2) suggests.
+
 ##### Edit WORKSPACE
 
+NOTE: As long as you have the SDK and NDK installed, the `./configure` script
+will create these rules for you. Answer "Yes" when the script asks to
+automatically configure the `./WORKSPACE`.
+
 The Android entries in
 [`<workspace_root>/WORKSPACE`](../../../WORKSPACE#L19-L36) must be uncommented
 with the paths filled in appropriately depending on where you installed the NDK
@@ -156,7 +168,7 @@ download-models.gradle.
 
 **Optional**: If you wish to place the models in your assets manually, remove
 all of the `model_files` entries from the `assets` list in `tensorflow_demo`
-found in the `[BUILD](BUILD)` file. Then download and extract the archives
+found in the [`BUILD`](BUILD#L92) file. Then download and extract the archives
 yourself to the `assets` directory in the source tree:
 
 ```bash
diff --git a/tensorflow/examples/android/build.gradle b/tensorflow/examples/android/build.gradle
index 48f566f825d2714fe5970531e3d9c9f0f7ca940e..f7bdf8b816a8191770bc1ad59b890041b8e39912 100644
--- a/tensorflow/examples/android/build.gradle
+++ b/tensorflow/examples/android/build.gradle
@@ -28,8 +28,8 @@ buildscript {
     }
 
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.0'
-        classpath 'org.apache.httpcomponents:httpclient:4.5.2'
+        classpath 'com.android.tools.build:gradle:3.0.1'
+        classpath 'org.apache.httpcomponents:httpclient:4.5.4'
     }
 }
 
@@ -75,7 +75,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 23
-    buildToolsVersion "25.0.2"
+    buildToolsVersion '26.0.2'
 
     if (nativeBuildSystem == 'cmake') {
         defaultConfig {
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..13372aef5e24af05341d49695ee84e5f9b594659
Binary files /dev/null and b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000000000000000000000000000000000000..bd9ee87db3742e9f8c62df2ec9a7852550d9bbc9
--- /dev/null
+++ b/tensorflow/examples/android/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Sat Nov 18 15:06:47 CET 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.1-all.zip
diff --git a/tensorflow/examples/android/gradlew b/tensorflow/examples/android/gradlew
new file mode 100644
index 0000000000000000000000000000000000000000..9d82f78915133e1c35a6ea51252590fb38efac2f
--- /dev/null
+++ b/tensorflow/examples/android/gradlew
@@ -0,0 +1,160 @@
+#!/usr/bin/env bash
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+esac
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
+function splitJvmOpts() {
+    JVM_OPTS=("$@")
+}
+eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
+JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
+
+exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
diff --git a/tensorflow/examples/android/gradlew.bat b/tensorflow/examples/android/gradlew.bat
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b282aa6885fb573c106b3551f7275c5f17e8e
--- /dev/null
+++ b/tensorflow/examples/android/gradlew.bat
@@ -0,0 +1,90 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windowz variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+if "%@eval[2+2]" == "4" goto 4NT_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+goto execute
+
+:4NT_args
+@rem Get arguments from the 4NT Shell from JP Software
+set CMD_LINE_ARGS=%$
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
index 7e57c17467ec86833c5e0521d333502242ba4092..8bd4abb154a8f8c74f2195d4acbb99d3d5d498ea 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -37,6 +37,7 @@ import android.os.HandlerThread;
 import android.os.Trace;
 import android.util.Size;
 import android.view.KeyEvent;
+import android.view.Surface;
 import android.view.WindowManager;
 import android.widget.Toast;
 import java.nio.ByteBuffer;
@@ -332,8 +333,12 @@ public abstract class CameraActivity extends Activity
           continue;
         }
 
-        useCamera2API = isHardwareLevelSupported(characteristics,
-            CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        // Fallback to camera1 API for internal cameras that don't have full support.
+        // This should help with legacy situations where using the camera2 API causes
+        // distorted or otherwise broken previews.
+        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+            || isHardwareLevelSupported(characteristics, 
+                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
         LOGGER.i("Camera API lv2?: %s", useCamera2API);
         return cameraId;
       }
@@ -426,6 +431,19 @@ public abstract class CameraActivity extends Activity
     }
   }
 
+  protected int getScreenOrientation() {
+    switch (getWindowManager().getDefaultDisplay().getRotation()) {
+      case Surface.ROTATION_270:
+        return 270;
+      case Surface.ROTATION_180:
+        return 180;
+      case Surface.ROTATION_90:
+        return 90;
+      default:
+        return 0;
+    }
+  }
+
   protected abstract void processImage();
 
   protected abstract void onPreviewSizeChosen(final Size size, final int rotation);
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java b/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java
index eabc724f7fd93136c49c31adc4f096865ab1c8a5..07995febaf5caab65dd4dfcc262ccf3750cfa303 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/Classifier.java
@@ -100,7 +100,7 @@ public interface Classifier {
   List<Recognition> recognizeImage(Bitmap bitmap);
 
   void enableStatLogging(final boolean debug);
-  
+
   String getStatString();
 
   void close();
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
index b29fa1546c38b6cd37b1cd3777a49b6d9a27b5f1..e2c394dde92cf89b91e8988f2ee55b5afbee8d67 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -27,6 +27,7 @@ import android.os.SystemClock;
 import android.util.Size;
 import android.util.TypedValue;
 import android.view.Display;
+import android.view.Surface;
 import java.util.List;
 import java.util.Vector;
 import org.tensorflow.demo.OverlayView.DrawCallback;
@@ -123,12 +124,8 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
     previewWidth = size.getWidth();
     previewHeight = size.getHeight();
 
-    final Display display = getWindowManager().getDefaultDisplay();
-    final int screenOrientation = display.getRotation();
-
-    LOGGER.i("Sensor orientation: %d, Screen orientation: %d", rotation, screenOrientation);
-
-    sensorOrientation = rotation + screenOrientation;
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
 
     LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
     rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
index 3c80a2ae3c95644ec62e0927efeb230eaaf213dd..7882d87c1cf1846b229cf8d819389145720fcb1b 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -30,6 +30,7 @@ import android.os.SystemClock;
 import android.util.Size;
 import android.util.TypedValue;
 import android.view.Display;
+import android.view.Surface;
 import android.widget.Toast;
 import java.io.IOException;
 import java.util.LinkedList;
@@ -168,12 +169,8 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
     previewWidth = size.getWidth();
     previewHeight = size.getHeight();
 
-    final Display display = getWindowManager().getDefaultDisplay();
-    final int screenOrientation = display.getRotation();
-
-    LOGGER.i("Sensor orientation: %d, Screen orientation: %d", rotation, screenOrientation);
-
-    sensorOrientation = rotation + screenOrientation;
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
 
     LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
     rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
index 5629f179c4558bc66222b029bdae731badb5e2ef..a3c694cddcaf4e5f60558269105ac761c074d431 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -300,6 +300,10 @@ public class ImageUtils {
     final Matrix matrix = new Matrix();
 
     if (applyRotation != 0) {
+      if (applyRotation % 90 != 0) {
+        LOGGER.w("Rotation of %d % 90 != 0", applyRotation);
+      }
+
       // Translate so center of image is at origin.
       matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
 
diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
index aae0a4b62a95b2de873fc868eae18ad2db2827f0..2fe2ba539edc84e80baf36b6d1ac1e192bc92163 100644
--- a/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
+++ b/tensorflow/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -161,15 +161,16 @@ public class MultiBoxTracker {
   }
 
   public synchronized void draw(final Canvas canvas) {
-    // TODO(andrewharp): This may not work for non-90 deg rotations.
+    final boolean rotated = sensorOrientation % 180 == 90;
     final float multiplier =
-        Math.min(canvas.getWidth() / (float) frameHeight, canvas.getHeight() / (float) frameWidth);
+        Math.min(canvas.getHeight() / (float) (rotated ? frameWidth : frameHeight),
+                 canvas.getWidth() / (float) (rotated ? frameHeight : frameWidth));
     frameToCanvasMatrix =
         ImageUtils.getTransformationMatrix(
             frameWidth,
             frameHeight,
-            (int) (multiplier * frameHeight),
-            (int) (multiplier * frameWidth),
+            (int) (multiplier * (rotated ? frameHeight : frameWidth)),
+            (int) (multiplier * (rotated ? frameWidth : frameHeight)),
             sensorOrientation,
             false);
     for (final TrackedRecognition recognition : trackedObjects) {
diff --git a/tensorflow/examples/how_tos/reading_data/convert_to_records.py b/tensorflow/examples/how_tos/reading_data/convert_to_records.py
index d14c1f7c86b7b3893b5574850a6b52abae6f7ffb..c89e83956322cb87a4cf41c6b7172f03d941b429 100644
--- a/tensorflow/examples/how_tos/reading_data/convert_to_records.py
+++ b/tensorflow/examples/how_tos/reading_data/convert_to_records.py
@@ -52,17 +52,19 @@ def convert_to(data_set, name):
 
   filename = os.path.join(FLAGS.directory, name + '.tfrecords')
   print('Writing', filename)
-  writer = tf.python_io.TFRecordWriter(filename)
-  for index in range(num_examples):
-    image_raw = images[index].tostring()
-    example = tf.train.Example(features=tf.train.Features(feature={
-        'height': _int64_feature(rows),
-        'width': _int64_feature(cols),
-        'depth': _int64_feature(depth),
-        'label': _int64_feature(int(labels[index])),
-        'image_raw': _bytes_feature(image_raw)}))
-    writer.write(example.SerializeToString())
-  writer.close()
+  with tf.python_io.TFRecordWriter(filename) as writer:
+    for index in range(num_examples):
+      image_raw = images[index].tostring()
+      example = tf.train.Example(
+          features=tf.train.Features(
+              feature={
+                  'height': _int64_feature(rows),
+                  'width': _int64_feature(cols),
+                  'depth': _int64_feature(depth),
+                  'label': _int64_feature(int(labels[index])),
+                  'image_raw': _bytes_feature(image_raw)
+              }))
+      writer.write(example.SerializeToString())
 
 
 def main(unused_argv):
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index a9ed02dd1a60ad79c2943212155bad864a750a99..9db8835d925ca426513dd01716fa223384db8213 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -45,9 +45,7 @@ TRAIN_FILE = 'train.tfrecords'
 VALIDATION_FILE = 'validation.tfrecords'
 
 
-def read_and_decode(filename_queue):
-  reader = tf.TFRecordReader()
-  _, serialized_example = reader.read(filename_queue)
+def decode(serialized_example):
   features = tf.parse_single_example(
       serialized_example,
       # Defaults are not specified since both keys are required.
@@ -60,22 +58,26 @@ def read_and_decode(filename_queue):
   # length mnist.IMAGE_PIXELS) to a uint8 tensor with shape
   # [mnist.IMAGE_PIXELS].
   image = tf.decode_raw(features['image_raw'], tf.uint8)
-  image.set_shape([mnist.IMAGE_PIXELS])
+  image.set_shape((mnist.IMAGE_PIXELS))
 
+  # Convert label from a scalar uint8 tensor to an int32 scalar.
+  label = tf.cast(features['label'], tf.int32)
+  
+  return image, label
+
+def augment(image, label):
   # OPTIONAL: Could reshape into a 28x28 image and apply distortions
   # here.  Since we are not applying any distortions in this
   # example, and the next step expects the image to be flattened
   # into a vector, we don't bother.
+  return image, label
 
+def normalize(image, label):
   # Convert from [0, 255] -> [-0.5, 0.5] floats.
   image = tf.cast(image, tf.float32) * (1. / 255) - 0.5
 
-  # Convert label from a scalar uint8 tensor to an int32 scalar.
-  label = tf.cast(features['label'], tf.int32)
-
   return image, label
 
-
 def inputs(train, batch_size, num_epochs):
   """Reads input data num_epochs times.
 
@@ -91,31 +93,32 @@ def inputs(train, batch_size, num_epochs):
       in the range [-0.5, 0.5].
     * labels is an int32 tensor with shape [batch_size] with the true label,
       a number in the range [0, mnist.NUM_CLASSES).
-    Note that an tf.train.QueueRunner is added to the graph, which
-    must be run using e.g. tf.train.start_queue_runners().
+
+    This function creates a one_shot_iterator, meaning that it will only iterate
+    over the dataset once. On the other hand there is no special initialization
+    required.
   """
   if not num_epochs: num_epochs = None
   filename = os.path.join(FLAGS.train_dir,
                           TRAIN_FILE if train else VALIDATION_FILE)
 
   with tf.name_scope('input'):
-    filename_queue = tf.train.string_input_producer(
-        [filename], num_epochs=num_epochs)
+    # TFRecordDataset opens a protobuf and reads entries line by line
+    # could also be [list, of, filenames]
+    dataset = tf.data.TFRecordDataset(filename)
+    dataset = dataset.repeat(num_epochs)
 
-    # Even when reading in multiple threads, share the filename
-    # queue.
-    image, label = read_and_decode(filename_queue)
+    # map takes a python function and applies it to every sample
+    dataset = dataset.map(decode)
+    dataset = dataset.map(augment)
+    dataset = dataset.map(normalize)
 
-    # Shuffle the examples and collect them into batch_size batches.
-    # (Internally uses a RandomShuffleQueue.)
-    # We run this in two threads to avoid being a bottleneck.
-    images, sparse_labels = tf.train.shuffle_batch(
-        [image, label], batch_size=batch_size, num_threads=2,
-        capacity=1000 + 3 * batch_size,
-        # Ensures a minimum amount of shuffling of examples.
-        min_after_dequeue=1000)
+    #the parameter is the queue size
+    dataset = dataset.shuffle(1000 + 3 * batch_size)
+    dataset = dataset.batch(batch_size)
 
-    return images, sparse_labels
+    iterator = dataset.make_one_shot_iterator()
+  return iterator.get_next()
 
 
 def run_training():
@@ -124,16 +127,16 @@ def run_training():
   # Tell TensorFlow that the model will be built into the default Graph.
   with tf.Graph().as_default():
     # Input images and labels.
-    images, labels = inputs(train=True, batch_size=FLAGS.batch_size,
-                            num_epochs=FLAGS.num_epochs)
+    image_batch, label_batch = inputs(train=True, batch_size=FLAGS.batch_size,
+                               num_epochs=FLAGS.num_epochs)
 
     # Build a Graph that computes predictions from the inference model.
-    logits = mnist.inference(images,
+    logits = mnist.inference(image_batch,
                              FLAGS.hidden1,
                              FLAGS.hidden2)
 
     # Add to the Graph the loss calculation.
-    loss = mnist.loss(logits, labels)
+    loss = mnist.loss(logits, label_batch)
 
     # Add to the Graph operations that train the model.
     train_op = mnist.training(loss, FLAGS.learning_rate)
@@ -143,47 +146,33 @@ def run_training():
                        tf.local_variables_initializer())
 
     # Create a session for running operations in the Graph.
-    sess = tf.Session()
-
-    # Initialize the variables (the trained variables and the
-    # epoch counter).
-    sess.run(init_op)
-
-    # Start input enqueue threads.
-    coord = tf.train.Coordinator()
-    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
-
-    try:
-      step = 0
-      while not coord.should_stop():
-        start_time = time.time()
-
-        # Run one step of the model.  The return values are
-        # the activations from the `train_op` (which is
-        # discarded) and the `loss` op.  To inspect the values
-        # of your ops or variables, you may include them in
-        # the list passed to sess.run() and the value tensors
-        # will be returned in the tuple from the call.
-        _, loss_value = sess.run([train_op, loss])
-
-        duration = time.time() - start_time
-
-        # Print an overview fairly often.
-        if step % 100 == 0:
-          print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
+    with tf.Session() as sess:
+      # Initialize the variables (the trained variables and the
+      # epoch counter).
+      sess.run(init_op)
+      try:
+        step = 0
+        while True: #train until OutOfRangeError
+          start_time = time.time()
+
+          # Run one step of the model.  The return values are
+          # the activations from the `train_op` (which is
+          # discarded) and the `loss` op.  To inspect the values
+          # of your ops or variables, you may include them in
+          # the list passed to sess.run() and the value tensors
+          # will be returned in the tuple from the call.
+          _, loss_value = sess.run([train_op, loss])
+
+          duration = time.time() - start_time
+
+          # Print an overview fairly often.
+          if step % 100 == 0:
+            print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value,
                                                      duration))
-        step += 1
-    except tf.errors.OutOfRangeError:
-      print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
-    finally:
-      # When done, ask the threads to stop.
-      coord.request_stop()
-
-    # Wait for threads to finish.
-    coord.join(threads)
-    sess.close()
-
-
+          step += 1
+      except tf.errors.OutOfRangeError:
+        print('Done training for %d epochs, %d steps.' % (FLAGS.num_epochs, step))
+      
 def main(_):
   run_training()
 
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
index c8c136ac14c9745d269952bdf32e7360eb83336c..9f9244a74c4d073cc67b7c8252b0bcff86e9400f 100644
--- a/tensorflow/examples/image_retraining/BUILD
+++ b/tensorflow/examples/image_retraining/BUILD
@@ -25,23 +25,10 @@ py_binary(
     ],
 )
 
-py_binary(
-    name = "label_image",
-    srcs = [
-        "label_image.py",
-    ],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
-)
-
 py_test(
     name = "retrain_test",
     size = "small",
     srcs = [
-        "label_image.py",
         "retrain.py",
         "retrain_test.py",
     ],
@@ -51,7 +38,6 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":label_image",
         ":retrain",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:framework_test_lib",
diff --git a/tensorflow/examples/image_retraining/README.md b/tensorflow/examples/image_retraining/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a49525c6eff003f2c7acb592f213285e627eb51
--- /dev/null
+++ b/tensorflow/examples/image_retraining/README.md
@@ -0,0 +1,12 @@
+retrain.py is an example script that shows how one can adapt a pretrained
+network for other classification problems. A detailed overview of this script
+can be found at:
+https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0
+
+The script also shows how one can train layers
+with quantized weights and activations instead of taking a pre-trained floating
+point model and then quantizing weights and activations.
+The output graphdef produced by this script is compatible with the TensorFlow
+Lite Optimizing Converter and can be converted to TFLite format.
+
+
diff --git a/tensorflow/examples/image_retraining/label_image.py b/tensorflow/examples/image_retraining/label_image.py
deleted file mode 100644
index de2713fc10272b4fc45613cd1bca26c4bec6c441..0000000000000000000000000000000000000000
--- a/tensorflow/examples/image_retraining/label_image.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Simple image classification with Inception.
-
-Run image classification with your model.
-
-This script is usually used with retrain.py found in this same
-directory.
-
-This program creates a graph from a saved GraphDef protocol buffer,
-and runs inference on an input JPEG image. You are required
-to pass in the graph file and the txt file.
-
-It outputs human readable strings of the top 5 predictions along with
-their probabilities.
-
-Change the --image_file argument to any jpg image to compute a
-classification of that image.
-
-Example usage:
-python label_image.py --graph=retrained_graph.pb
-  --labels=retrained_labels.txt
-  --image=flower_photos/daisy/54377391_15648e8d18.jpg
-
-NOTE: To learn to use this file and retrain.py, please see:
-
-https://codelabs.developers.google.com/codelabs/tensorflow-for-poets
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import sys
-
-import tensorflow as tf
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    '--image', required=True, type=str, help='Absolute path to image file.')
-parser.add_argument(
-    '--num_top_predictions',
-    type=int,
-    default=5,
-    help='Display this many predictions.')
-parser.add_argument(
-    '--graph',
-    required=True,
-    type=str,
-    help='Absolute path to graph file (.pb)')
-parser.add_argument(
-    '--labels',
-    required=True,
-    type=str,
-    help='Absolute path to labels file (.txt)')
-parser.add_argument(
-    '--output_layer',
-    type=str,
-    default='final_result:0',
-    help='Name of the result operation')
-parser.add_argument(
-    '--input_layer',
-    type=str,
-    default='DecodeJpeg/contents:0',
-    help='Name of the input operation')
-
-
-def load_image(filename):
-  """Read in the image_data to be classified."""
-  return tf.gfile.FastGFile(filename, 'rb').read()
-
-
-def load_labels(filename):
-  """Read in labels, one label per line."""
-  return [line.rstrip() for line in tf.gfile.GFile(filename)]
-
-
-def load_graph(filename):
-  """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
-    graph_def = tf.GraphDef()
-    graph_def.ParseFromString(f.read())
-    tf.import_graph_def(graph_def, name='')
-
-
-def run_graph(image_data, labels, input_layer_name, output_layer_name,
-              num_top_predictions):
-  with tf.Session() as sess:
-    # Feed the image_data as input to the graph.
-    #   predictions will contain a two-dimensional array, where one
-    #   dimension represents the input image count, and the other has
-    #   predictions per class
-    softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name)
-    predictions, = sess.run(softmax_tensor, {input_layer_name: image_data})
-
-    # Sort to show labels in order of confidence
-    top_k = predictions.argsort()[-num_top_predictions:][::-1]
-    for node_id in top_k:
-      human_string = labels[node_id]
-      score = predictions[node_id]
-      print('%s (score = %.5f)' % (human_string, score))
-
-    return 0
-
-
-def main(argv):
-  """Runs inference on an image."""
-  if argv[1:]:
-    raise ValueError('Unused Command Line Args: %s' % argv[1:])
-
-  if not tf.gfile.Exists(FLAGS.image):
-    tf.logging.fatal('image file does not exist %s', FLAGS.image)
-
-  if not tf.gfile.Exists(FLAGS.labels):
-    tf.logging.fatal('labels file does not exist %s', FLAGS.labels)
-
-  if not tf.gfile.Exists(FLAGS.graph):
-    tf.logging.fatal('graph file does not exist %s', FLAGS.graph)
-
-  # load image
-  image_data = load_image(FLAGS.image)
-
-  # load labels
-  labels = load_labels(FLAGS.labels)
-
-  # load graph, which is stored in the default session
-  load_graph(FLAGS.graph)
-
-  run_graph(image_data, labels, FLAGS.input_layer, FLAGS.output_layer,
-            FLAGS.num_top_predictions)
-
-
-if __name__ == '__main__':
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=sys.argv[:1]+unparsed)
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 3549891461e74d96ea4a5aa98f929ddde7e62692..ec22684eaf63700c608c6ce45f22941555246b99 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -69,11 +69,18 @@ to validate that you have gathered good training data, but if you want to deploy
 on resource-limited platforms, you can try the `--architecture` flag with a
 Mobilenet model. For example:
 
+Run floating-point version of mobilenet:
 ```bash
 python tensorflow/examples/image_retraining/retrain.py \
     --image_dir ~/flower_photos --architecture mobilenet_1.0_224
 ```
 
+Run quantized version of mobilenet:
+```bash
+python tensorflow/examples/image_retraining/retrain.py \
+    --image_dir ~/flower_photos/   --architecture mobilenet_1.0_224_quantized
+```
+
 There are 32 different Mobilenet models to choose from, with a variety of file
 size and latency options. The first number can be '1.0', '0.75', '0.50', or
 '0.25' to control the size, and the second controls the input image size, either
@@ -107,6 +114,7 @@ import numpy as np
 from six.moves import urllib
 import tensorflow as tf
 
+from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import gfile
@@ -271,6 +279,7 @@ def create_model_graph(model_info):
   """
   with tf.Graph().as_default() as graph:
     model_path = os.path.join(FLAGS.model_dir, model_info['model_file_name'])
+    print('Model path: ', model_path)
     with gfile.FastGFile(model_path, 'rb') as f:
       graph_def = tf.GraphDef()
       graph_def.ParseFromString(f.read())
@@ -337,7 +346,10 @@ def maybe_download_and_extract(data_url):
     statinfo = os.stat(filepath)
     tf.logging.info('Successfully downloaded', filename, statinfo.st_size,
                     'bytes.')
-  tarfile.open(filepath, 'r:gz').extractall(dest_directory)
+    print('Extracting file from ', filepath)
+    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
+  else:
+    print('Not extracting or downloading files, model already present in disk')
 
 
 def ensure_dir_exists(dir_name):
@@ -527,10 +539,8 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
           sess, image_lists, label_name, image_index, image_dir, category,
           bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
           resized_input_tensor, bottleneck_tensor, architecture)
-      ground_truth = np.zeros(class_count, dtype=np.float32)
-      ground_truth[label_index] = 1.0
       bottlenecks.append(bottleneck)
-      ground_truths.append(ground_truth)
+      ground_truths.append(label_index)
       filenames.append(image_name)
   else:
     # Retrieve all bottlenecks.
@@ -543,10 +553,8 @@ def get_random_cached_bottlenecks(sess, image_lists, how_many, category,
             sess, image_lists, label_name, image_index, image_dir, category,
             bottleneck_dir, jpeg_data_tensor, decoded_image_tensor,
             resized_input_tensor, bottleneck_tensor, architecture)
-        ground_truth = np.zeros(class_count, dtype=np.float32)
-        ground_truth[label_index] = 1.0
         bottlenecks.append(bottleneck)
-        ground_truths.append(ground_truth)
+        ground_truths.append(label_index)
         filenames.append(image_name)
   return bottlenecks, ground_truths, filenames
 
@@ -598,10 +606,8 @@ def get_random_distorted_bottlenecks(
     bottleneck_values = sess.run(bottleneck_tensor,
                                  {resized_input_tensor: distorted_image_data})
     bottleneck_values = np.squeeze(bottleneck_values)
-    ground_truth = np.zeros(class_count, dtype=np.float32)
-    ground_truth[label_index] = 1.0
     bottlenecks.append(bottleneck_values)
-    ground_truths.append(ground_truth)
+    ground_truths.append(label_index)
   return bottlenecks, ground_truths
 
 
@@ -733,7 +739,7 @@ def variable_summaries(var):
 
 
 def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
-                           bottleneck_tensor_size):
+                           bottleneck_tensor_size, quantize_layer):
   """Adds a new softmax and fully-connected layer for training.
 
   We need to retrain the top layer to identify our new classes, so this function
@@ -745,10 +751,12 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
 
   Args:
     class_count: Integer of how many categories of things we're trying to
-    recognize.
+        recognize.
     final_tensor_name: Name string for the new final node that produces results.
     bottleneck_tensor: The output of the main CNN graph.
     bottleneck_tensor_size: How many entries in the bottleneck vector.
+    quantize_layer: Boolean, specifying whether the newly added layer should be
+        quantized.
 
   Returns:
     The tensors for the training and cross entropy results, and tensors for the
@@ -760,9 +768,8 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
         shape=[None, bottleneck_tensor_size],
         name='BottleneckInputPlaceholder')
 
-    ground_truth_input = tf.placeholder(tf.float32,
-                                        [None, class_count],
-                                        name='GroundTruthInput')
+    ground_truth_input = tf.placeholder(
+        tf.int64, [None], name='GroundTruthInput')
 
   # Organizing the following ops as `final_training_ops` so they're easier
   # to see in TensorBoard
@@ -771,25 +778,47 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor,
     with tf.name_scope('weights'):
       initial_value = tf.truncated_normal(
           [bottleneck_tensor_size, class_count], stddev=0.001)
-
       layer_weights = tf.Variable(initial_value, name='final_weights')
+      if quantize_layer:
+        quantized_layer_weights = quant_ops.MovingAvgQuantize(
+            layer_weights, is_training=True)
+        variable_summaries(quantized_layer_weights)
 
       variable_summaries(layer_weights)
     with tf.name_scope('biases'):
       layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
+      if quantize_layer:
+        quantized_layer_biases = quant_ops.MovingAvgQuantize(
+            layer_biases, is_training=True)
+        variable_summaries(quantized_layer_biases)
+
       variable_summaries(layer_biases)
+
     with tf.name_scope('Wx_plus_b'):
-      logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
-      tf.summary.histogram('pre_activations', logits)
+      if quantize_layer:
+        logits = tf.matmul(bottleneck_input,
+                           quantized_layer_weights) + quantized_layer_biases
+        logits = quant_ops.MovingAvgQuantize(
+            logits,
+            init_min=-32.0,
+            init_max=32.0,
+            is_training=True,
+            num_bits=8,
+            narrow_range=False,
+            ema_decay=0.5)
+        tf.summary.histogram('pre_activations', logits)
+      else:
+        logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
+        tf.summary.histogram('pre_activations', logits)
 
   final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
+
   tf.summary.histogram('activations', final_tensor)
 
   with tf.name_scope('cross_entropy'):
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
         labels=ground_truth_input, logits=logits)
-    with tf.name_scope('total'):
-      cross_entropy_mean = tf.reduce_mean(cross_entropy)
+
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
 
   with tf.name_scope('train'):
@@ -814,8 +843,7 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
   with tf.name_scope('accuracy'):
     with tf.name_scope('correct_prediction'):
       prediction = tf.argmax(result_tensor, 1)
-      correct_prediction = tf.equal(
-          prediction, tf.argmax(ground_truth_tensor, 1))
+      correct_prediction = tf.equal(prediction, ground_truth_tensor)
     with tf.name_scope('accuracy'):
       evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', evaluation_step)
@@ -825,6 +853,7 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):
 def save_graph_to_file(sess, graph, graph_file_name):
   output_graph_def = graph_util.convert_variables_to_constants(
       sess, graph.as_graph_def(), [FLAGS.final_tensor_name])
+
   with gfile.FastGFile(graph_file_name, 'wb') as f:
     f.write(output_graph_def.SerializeToString())
   return
@@ -858,6 +887,7 @@ def create_model_info(architecture):
     ValueError: If architecture name is unknown.
   """
   architecture = architecture.lower()
+  is_quantized = False
   if architecture == 'inception_v3':
     # pylint: disable=line-too-long
     data_url = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
@@ -902,19 +932,28 @@ def create_model_info(architecture):
             architecture)
         return None
       is_quantized = True
-    data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
-    data_url += version_string + '_' + size_string + '_frozen.tgz'
-    bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
+
+    if is_quantized:
+      data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
+      data_url += version_string + '_' + size_string + '_quantized_frozen.tgz'
+      bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
+      resized_input_tensor_name = 'Placeholder:0'
+      model_dir_name = ('mobilenet_v1_' + version_string + '_' + size_string +
+                        '_quantized_frozen')
+      model_base_name = 'quantized_frozen_graph.pb'
+
+    else:
+      data_url = 'http://download.tensorflow.org/models/mobilenet_v1_'
+      data_url += version_string + '_' + size_string + '_frozen.tgz'
+      bottleneck_tensor_name = 'MobilenetV1/Predictions/Reshape:0'
+      resized_input_tensor_name = 'input:0'
+      model_dir_name = 'mobilenet_v1_' + version_string + '_' + size_string
+      model_base_name = 'frozen_graph.pb'
+
     bottleneck_tensor_size = 1001
     input_width = int(size_string)
     input_height = int(size_string)
     input_depth = 3
-    resized_input_tensor_name = 'input:0'
-    if is_quantized:
-      model_base_name = 'quantized_graph.pb'
-    else:
-      model_base_name = 'frozen_graph.pb'
-    model_dir_name = 'mobilenet_v1_' + version_string + '_' + size_string
     model_file_name = os.path.join(model_dir_name, model_base_name)
     input_mean = 127.5
     input_std = 127.5
@@ -933,6 +972,7 @@ def create_model_info(architecture):
       'model_file_name': model_file_name,
       'input_mean': input_mean,
       'input_std': input_std,
+      'quantize_layer': is_quantized,
   }
 
 
@@ -1028,7 +1068,7 @@ def main(_):
     (train_step, cross_entropy, bottleneck_input, ground_truth_input,
      final_tensor) = add_final_training_ops(
          len(image_lists.keys()), FLAGS.final_tensor_name, bottleneck_tensor,
-         model_info['bottleneck_tensor_size'])
+         model_info['bottleneck_tensor_size'], model_info['quantize_layer'])
 
     # Create the operations we need to evaluate the accuracy of our new layer.
     evaluation_step, prediction = add_evaluation_step(
@@ -1128,7 +1168,7 @@ def main(_):
     if FLAGS.print_misclassified_test_images:
       tf.logging.info('=== MISCLASSIFIED TEST IMAGES ===')
       for i, test_filename in enumerate(test_filenames):
-        if predictions[i] != test_ground_truth[i].argmax():
+        if predictions[i] != test_ground_truth[i]:
           tf.logging.info('%70s  %s' %
                           (test_filename,
                            list(image_lists.keys())[predictions[i]]))
diff --git a/tensorflow/examples/image_retraining/retrain_test.py b/tensorflow/examples/image_retraining/retrain_test.py
index 467c15d0de5520c4f737ea3e9628a6d027388f14..8b8dd45fd72e3d29bdb7f6291cc53b912adf3644 100644
--- a/tensorflow/examples/image_retraining/retrain_test.py
+++ b/tensorflow/examples/image_retraining/retrain_test.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import tensorflow as tf
 import os
 
-from tensorflow.examples.image_retraining import label_image
 from tensorflow.examples.image_retraining import retrain
 from tensorflow.python.framework import test_util
 
@@ -71,48 +70,26 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
   def testAddFinalTrainingOps(self, flags_mock):
     with tf.Graph().as_default():
       with tf.Session() as sess:
-        bottleneck = tf.placeholder(
-            tf.float32, [1, 1024],
-            name='bottleneck')
-        retrain.add_final_training_ops(5, 'final', bottleneck, 1024)
+        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
+        # Test creating final training op with quantization
+        retrain.add_final_training_ops(5, 'final', bottleneck, 1024, False)
+        self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
+
+  @tf.test.mock.patch.object(retrain, 'FLAGS', learning_rate=0.01)
+  def testAddFinalTrainingOpsQuantized(self, flags_mock):
+    with tf.Graph().as_default():
+      with tf.Session() as sess:
+        bottleneck = tf.placeholder(tf.float32, [1, 1024], name='bottleneck')
+        # Test creating final training op with quantization
+        retrain.add_final_training_ops(5, 'final', bottleneck, 1024, True)
         self.assertIsNotNone(sess.graph.get_tensor_by_name('final:0'))
 
   def testAddEvaluationStep(self):
     with tf.Graph().as_default():
       final = tf.placeholder(tf.float32, [1], name='final')
-      gt = tf.placeholder(tf.float32, [1], name='gt')
+      gt = tf.placeholder(tf.int64, [1], name='gt')
       self.assertIsNotNone(retrain.add_evaluation_step(final, gt))
 
-  def testLabelImage(self):
-
-    image_filename = ('../label_image/data/grace_hopper.jpg')
-
-    # Load some default data
-    label_path = os.path.join(tf.resource_loader.get_data_files_path(),
-                              'data/labels.txt')
-    labels = label_image.load_labels(label_path)
-    self.assertEqual(len(labels), 3)
-
-    image_path = os.path.join(tf.resource_loader.get_data_files_path(),
-                              image_filename)
-
-    image = label_image.load_image(image_path)
-    self.assertEqual(len(image), 61306)
-
-    # Create trivial graph; note that the two nodes don't meet
-    with tf.Graph().as_default():
-      jpeg = tf.constant(image)
-      # Input node that doesn't lead anywhere.
-      tf.image.decode_jpeg(jpeg, name='DecodeJpeg')
-
-      # Output node, that always outputs a constant.
-      tf.constant([[10, 30, 5]], name='final')
-
-      # As label_image outputs via print, we assume that
-      # if it returns, everything is OK.
-      result = label_image.run_graph(image, labels, jpeg, 'final:0', 3)
-      self.assertEqual(result, 0)
-
   def testAddJpegDecoding(self):
     with tf.Graph().as_default():
       jpeg_data, mul_image = retrain.add_jpeg_decoding(10, 10, 3, 0, 255)
@@ -130,5 +107,12 @@ class ImageRetrainingTest(test_util.TensorFlowTestCase):
     self.assertIsNotNone(model_info)
     self.assertEqual(299, model_info['input_width'])
 
+  def testCreateModelInfoQuantized(self):
+    # Test for mobilenet_quantized
+    model_info = retrain.create_model_info('mobilenet_1.0_224')
+    self.assertIsNotNone(model_info)
+    self.assertEqual(224, model_info['input_width'])
+
+
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/examples/ios/README.md b/tensorflow/examples/ios/README.md
index 7d2eb870be2c23bf52cd335f7b3b4cb7f4baac52..5bdaeb43ce143e36e78cfe301fd9b59e8b85b034 100644
--- a/tensorflow/examples/ios/README.md
+++ b/tensorflow/examples/ios/README.md
@@ -6,7 +6,7 @@ This folder contains examples of how to build applications for iOS devices using
  - You'll need Xcode 7.3 or later.
 
  - There are currently three examples: simple, benchmark, and camera. For now,
-   you can download the sample code by cloning the main tensorflow repository 
+   you can download the sample code by cloning the main tensorflow repository
    (we are planning to make the samples available as a separate repository
    later).
 
@@ -48,8 +48,8 @@ open tf_simple_example.xcworkspace # obs, not the .xcodeproj directory
 ### Troubleshooting
 
  - Make sure you use the TensorFlow-experimental pod (and not TensorFlow).
-  
- - The TensorFlow-experimental pod is current about ~450MB. The reason it is 
+
+ - The TensorFlow-experimental pod is current about ~450MB. The reason it is
    so big is because we are bundling multiple platforms, and the pod includes
    all TensorFlow functionality (e.g. operations). The final app size after
    build is substantially smaller though (~25MB). Working with the complete
@@ -91,7 +91,7 @@ target 'YourProjectName'
    open up the Xcode project in the `camera` subfolder. Once you build and run
    that, you should get a live camera view that you can point at objects to get
    real-time recognition results.
-   
+
 ### Troubleshooting
 
 If you're hitting problems, here's a checklist of common things to investigate:
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index 23a42a60ba476701b42f846095aadc8acd0e9b2f..aba7f600b53cf8286d46ee70823a0a425944076f 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -113,13 +113,6 @@ py_binary(
     ],
 )
 
-py_binary(
-    name = "wide_n_deep_tutorial",
-    srcs = ["wide_n_deep_tutorial.py"],
-    srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
-)
-
 py_binary(
     name = "mnist",
     srcs = ["mnist.py"],
@@ -153,7 +146,6 @@ sh_test(
         ":text_classification_character_cnn",
         ":text_classification_character_rnn",
         ":text_classification_cnn",
-        ":wide_n_deep_tutorial",
     ],
     tags = [
         "manual",
diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md
index 70d9db85ee5b48a75c7f6829ce6a6b22ff097535..b74a8f39d98123d3e7ca6d5bbeb0a4b806097670 100644
--- a/tensorflow/examples/learn/README.md
+++ b/tensorflow/examples/learn/README.md
@@ -23,7 +23,7 @@ processing (`pip install -U pandas`).
 
 ## Specialized Models
 * [Building a Random Forest Model](https://www.tensorflow.org/code/tensorflow/examples/learn/random_forest_mnist.py)
-* [Building a Wide & Deep Model](https://www.tensorflow.org/code/tensorflow/examples/learn/wide_n_deep_tutorial.py)
+* [Building a Wide & Deep Model](https://github.com/tensorflow/models/tree/master/official/wide_deep/wide_deep.py)
 * [Building a Residual Network Model](https://www.tensorflow.org/code/tensorflow/examples/learn/resnet.py)
 
 ## Text classification
diff --git a/tensorflow/examples/learn/examples_test.sh b/tensorflow/examples/learn/examples_test.sh
index b8763de471c90a3f1d4067606222f7a7ecd2959d..ef5e8a5de25068a74b1f3ea9c3b2ce87aa470f89 100755
--- a/tensorflow/examples/learn/examples_test.sh
+++ b/tensorflow/examples/learn/examples_test.sh
@@ -56,4 +56,3 @@ test text_classification_builtin_rnn_model --test_with_fake_data
 test text_classification_character_cnn --test_with_fake_data
 test text_classification_character_rnn --test_with_fake_data
 test text_classification_cnn --test_with_fake_data
-test wide_n_deep_tutorial
diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
index 0a50b3ba87d70a58794bc35009dc76de2cb71d1e..03e60972aa660fad4af8d3535e31463c96f7c69b 100644
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@@ -11,7 +11,10 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-"""Example of DNNClassifier for Iris plant dataset."""
+"""Example of DNNClassifier for Iris plant dataset.
+
+This example uses APIs in Tensorflow 1.4 or above.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 072357e51c418ae1163debe29516c31ccc367386..4a219694d10ef075e0e0403cdd7ed100c39ddadd 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -46,12 +46,8 @@ def my_model(features, labels, mode):
     }
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  onehot_labels = tf.one_hot(labels, 3, 1, 0)
   # Compute loss.
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op with exponentially decaying learning rate.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/iris_custom_model.py b/tensorflow/examples/learn/iris_custom_model.py
index 471a99ba76dd8012ba3b1a519d5d07fb378f89e7..c6bdb86ba52b9715b977909d9b7d0fbc59161a53 100644
--- a/tensorflow/examples/learn/iris_custom_model.py
+++ b/tensorflow/examples/learn/iris_custom_model.py
@@ -47,12 +47,8 @@ def my_model(features, labels, mode):
     }
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-  # Convert the labels to a one-hot tensor of shape (length of features, 3) and
-  # with a on-value of 1 for each one-hot vector of length 3.
-  onehot_labels = tf.one_hot(labels, 3, 1, 0)
   # Compute loss.
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/mnist.py b/tensorflow/examples/learn/mnist.py
index 88425ea0d0bf72fb7e7d9cbab27da023f3ade122..98819b20bfea5021d52e2c50b004bccdaf1f25e7 100644
--- a/tensorflow/examples/learn/mnist.py
+++ b/tensorflow/examples/learn/mnist.py
@@ -77,9 +77,7 @@ def conv_model(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
   # Compute loss.
-  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/multiple_gpu.py b/tensorflow/examples/learn/multiple_gpu.py
index a294950a386a7207858bbcff345f14de44ffb9ca..3bad22ddf66b7981930637d64cc8653e3fb29cdf 100644
--- a/tensorflow/examples/learn/multiple_gpu.py
+++ b/tensorflow/examples/learn/multiple_gpu.py
@@ -65,12 +65,8 @@ def my_model(features, labels, mode):
       }
       return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
-    # Convert the labels to a one-hot tensor of shape (length of features, 3)
-    # and with a on-value of 1 for each one-hot vector of length 3.
-    onehot_labels = tf.one_hot(labels, 3, 1, 0)
     # Compute loss.
-    loss = tf.losses.softmax_cross_entropy(
-        onehot_labels=onehot_labels, logits=logits)
+    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
     # Create training op.
     if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/resnet.py b/tensorflow/examples/learn/resnet.py
index 1e0966475b01d067330dc4797032d561857fd208..9542e552504580a6614f8bd2f43c38dfa795750f 100755
--- a/tensorflow/examples/learn/resnet.py
+++ b/tensorflow/examples/learn/resnet.py
@@ -151,9 +151,7 @@ def res_net_model(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, predictions=predictions)
 
   # Compute loss.
-  onehot_labels = tf.one_hot(tf.cast(labels, tf.int32), N_DIGITS, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Create training op.
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py
index ba89c532be5fa0e13a2dcb1f7894be4c631507d7..eb117c39a122f4f6c108dd18f8f8035edf05eaa1 100644
--- a/tensorflow/examples/learn/text_classification.py
+++ b/tensorflow/examples/learn/text_classification.py
@@ -46,9 +46,7 @@ def estimator_spec_for_softmax_classification(
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_character_cnn.py b/tensorflow/examples/learn/text_classification_character_cnn.py
index 363ff003628e03be40c1be6b7b32e12a07533047..afda170e2a9c1b0281fdd3d7ed210a1bfcd4481b 100644
--- a/tensorflow/examples/learn/text_classification_character_cnn.py
+++ b/tensorflow/examples/learn/text_classification_character_cnn.py
@@ -88,9 +88,7 @@ def char_cnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_character_rnn.py b/tensorflow/examples/learn/text_classification_character_rnn.py
index 86adc056add508c309b3a5b93e58e9c195995642..15733821fb17eb17269fea295020f6690bb62854 100644
--- a/tensorflow/examples/learn/text_classification_character_rnn.py
+++ b/tensorflow/examples/learn/text_classification_character_rnn.py
@@ -59,9 +59,7 @@ def char_rnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/text_classification_cnn.py b/tensorflow/examples/learn/text_classification_cnn.py
index be262285a3a7aa0d6b9430a2226b448fe674cd7f..9e21aee87f629835222ab367dc3ed55863f553e4 100644
--- a/tensorflow/examples/learn/text_classification_cnn.py
+++ b/tensorflow/examples/learn/text_classification_cnn.py
@@ -87,9 +87,7 @@ def cnn_model(features, labels, mode):
             'prob': tf.nn.softmax(logits)
         })
 
-  onehot_labels = tf.one_hot(labels, MAX_LABEL, 1, 0)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
   if mode == tf.estimator.ModeKeys.TRAIN:
     optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
     train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
diff --git a/tensorflow/examples/learn/wide_n_deep_tutorial.py b/tensorflow/examples/learn/wide_n_deep_tutorial.py
deleted file mode 100644
index e447b3e24e75f0596423babfe438dc908393b7cc..0000000000000000000000000000000000000000
--- a/tensorflow/examples/learn/wide_n_deep_tutorial.py
+++ /dev/null
@@ -1,249 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Example code for TensorFlow Wide & Deep Tutorial using TF.Learn API."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import shutil
-import sys
-import tempfile
-
-import pandas as pd
-from six.moves import urllib
-import tensorflow as tf
-
-
-CSV_COLUMNS = [
-    "age", "workclass", "fnlwgt", "education", "education_num",
-    "marital_status", "occupation", "relationship", "race", "gender",
-    "capital_gain", "capital_loss", "hours_per_week", "native_country",
-    "income_bracket"
-]
-
-gender = tf.feature_column.categorical_column_with_vocabulary_list(
-    "gender", ["Female", "Male"])
-education = tf.feature_column.categorical_column_with_vocabulary_list(
-    "education", [
-        "Bachelors", "HS-grad", "11th", "Masters", "9th",
-        "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
-        "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
-        "Preschool", "12th"
-    ])
-marital_status = tf.feature_column.categorical_column_with_vocabulary_list(
-    "marital_status", [
-        "Married-civ-spouse", "Divorced", "Married-spouse-absent",
-        "Never-married", "Separated", "Married-AF-spouse", "Widowed"
-    ])
-relationship = tf.feature_column.categorical_column_with_vocabulary_list(
-    "relationship", [
-        "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
-        "Other-relative"
-    ])
-workclass = tf.feature_column.categorical_column_with_vocabulary_list(
-    "workclass", [
-        "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
-        "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
-    ])
-
-# To show an example of hashing:
-occupation = tf.feature_column.categorical_column_with_hash_bucket(
-    "occupation", hash_bucket_size=1000)
-native_country = tf.feature_column.categorical_column_with_hash_bucket(
-    "native_country", hash_bucket_size=1000)
-
-# Continuous base columns.
-age = tf.feature_column.numeric_column("age")
-education_num = tf.feature_column.numeric_column("education_num")
-capital_gain = tf.feature_column.numeric_column("capital_gain")
-capital_loss = tf.feature_column.numeric_column("capital_loss")
-hours_per_week = tf.feature_column.numeric_column("hours_per_week")
-
-# Transformations.
-age_buckets = tf.feature_column.bucketized_column(
-    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
-
-# Wide columns and deep columns.
-base_columns = [
-    gender, education, marital_status, relationship, workclass, occupation,
-    native_country, age_buckets,
-]
-
-crossed_columns = [
-    tf.feature_column.crossed_column(
-        ["education", "occupation"], hash_bucket_size=1000),
-    tf.feature_column.crossed_column(
-        [age_buckets, "education", "occupation"], hash_bucket_size=1000),
-    tf.feature_column.crossed_column(
-        ["native_country", "occupation"], hash_bucket_size=1000)
-]
-
-deep_columns = [
-    tf.feature_column.indicator_column(workclass),
-    tf.feature_column.indicator_column(education),
-    tf.feature_column.indicator_column(gender),
-    tf.feature_column.indicator_column(relationship),
-    # To show an example of embedding
-    tf.feature_column.embedding_column(native_country, dimension=8),
-    tf.feature_column.embedding_column(occupation, dimension=8),
-    age,
-    education_num,
-    capital_gain,
-    capital_loss,
-    hours_per_week,
-]
-
-
-FLAGS = None
-
-
-def maybe_download(train_data, test_data):
-  """Maybe downloads training data and returns train and test file names."""
-  if train_data:
-    train_file_name = train_data
-  else:
-    train_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
-        train_file.name)  # pylint: disable=line-too-long
-    train_file_name = train_file.name
-    train_file.close()
-    print("Training data is downloaded to %s" % train_file_name)
-
-  if test_data:
-    test_file_name = test_data
-  else:
-    test_file = tempfile.NamedTemporaryFile(delete=False)
-    urllib.request.urlretrieve(
-        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
-        test_file.name)  # pylint: disable=line-too-long
-    test_file_name = test_file.name
-    test_file.close()
-    print("Test data is downloaded to %s"% test_file_name)
-
-  return train_file_name, test_file_name
-
-
-def build_estimator(model_dir, model_type):
-  """Build an estimator."""
-  if model_type == "wide":
-    m = tf.estimator.LinearClassifier(
-        model_dir=model_dir, feature_columns=base_columns + crossed_columns)
-  elif model_type == "deep":
-    m = tf.estimator.DNNClassifier(
-        model_dir=model_dir,
-        feature_columns=deep_columns,
-        hidden_units=[100, 50])
-  else:
-    m = tf.estimator.DNNLinearCombinedClassifier(
-        model_dir=model_dir,
-        linear_feature_columns=crossed_columns,
-        dnn_feature_columns=deep_columns,
-        dnn_hidden_units=[100, 50])
-  return m
-
-
-def input_fn(data_file, num_epochs, shuffle):
-  """Returns an `input_fn` required by Estimator train/evaluate.
-
-  Args:
-    data_file: The file path to the dataset.
-    num_epochs: Number of epochs to iterate over data. If `None`, `input_fn`
-      will generate infinite stream of data.
-    shuffle: bool, whether to read the data in random order.
-  """
-  df_data = pd.read_csv(
-      tf.gfile.Open(data_file),
-      names=CSV_COLUMNS,
-      skipinitialspace=True,
-      engine="python",
-      skiprows=1)
-  # remove NaN elements
-  df_data = df_data.dropna(how="any", axis=0)
-  labels = df_data["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
-
-  return tf.estimator.inputs.pandas_input_fn(
-      x=df_data,
-      y=labels,
-      batch_size=100,
-      num_epochs=num_epochs,
-      shuffle=shuffle,
-      num_threads=1)
-
-
-def main(_):
-  tf.logging.set_verbosity(tf.logging.INFO)
-
-  train_file_name, test_file_name = maybe_download(FLAGS.train_data,
-                                                   FLAGS.test_data)
-
-  # Specify file path below if want to find the output easily
-  model_dir = FLAGS.model_dir if FLAGS.model_dir else tempfile.mkdtemp()
-
-  estimator = build_estimator(model_dir, FLAGS.model_type)
-
-  # `tf.estimator.TrainSpec`, `tf.estimator.EvalSpec`, and
-  # `tf.estimator.train_and_evaluate` API are available in TF 1.4.
-  train_spec = tf.estimator.TrainSpec(
-      input_fn=input_fn(train_file_name, num_epochs=None, shuffle=True),
-      max_steps=FLAGS.train_steps)
-
-  eval_spec = tf.estimator.EvalSpec(
-      input_fn=input_fn(test_file_name, num_epochs=1, shuffle=False),
-      # set steps to None to run evaluation until all data consumed.
-      steps=None)
-
-  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
-
-  # Manual cleanup
-  shutil.rmtree(model_dir)
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.register("type", "bool", lambda v: v.lower() == "true")
-  parser.add_argument(
-      "--model_dir",
-      type=str,
-      default="",
-      help="Base directory for output models."
-  )
-  parser.add_argument(
-      "--model_type",
-      type=str,
-      default="wide_n_deep",
-      help="Valid model types: {'wide', 'deep', 'wide_n_deep'}."
-  )
-  parser.add_argument(
-      "--train_steps",
-      type=int,
-      default=2000,
-      help="Number of training steps."
-  )
-  parser.add_argument(
-      "--train_data",
-      type=str,
-      default="",
-      help="Path to the training data."
-  )
-  parser.add_argument(
-      "--test_data",
-      type=str,
-      default="",
-      help="Path to the test data."
-  )
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/speech_commands/input_data.py b/tensorflow/examples/speech_commands/input_data.py
index 6d75fbb92b2a5e3bfa8369e0c6f354b4d8fc0074..e7db9cddf02daf9a32d3ed859ee9bd35b2cae838 100644
--- a/tensorflow/examples/speech_commands/input_data.py
+++ b/tensorflow/examples/speech_commands/input_data.py
@@ -240,7 +240,8 @@ class AudioProcessor(object):
     # Look through all the subfolders to find audio samples
     search_path = os.path.join(self.data_dir, '*', '*.wav')
     for wav_path in gfile.Glob(search_path):
-      word = re.search('.*/([^/]+)/.*.wav', wav_path).group(1).lower()
+      _, word = os.path.split(os.path.dirname(wav_path))
+      word = word.lower()
       # Treat the '_background_noise_' folder as a special case, since we expect
       # it to contain long audio samples we mix in to improve training.
       if word == BACKGROUND_NOISE_DIR_NAME:
@@ -416,8 +417,7 @@ class AudioProcessor(object):
       sess: TensorFlow session that was active when processor was created.
 
     Returns:
-      List of sample data for the transformed samples, and list of labels in
-      one-hot form.
+      List of sample data for the transformed samples, and list of label indexes
     """
     # Pick one of the partitions to choose samples from.
     candidates = self.data_index[mode]
@@ -427,7 +427,7 @@ class AudioProcessor(object):
       sample_count = max(0, min(how_many, len(candidates) - offset))
     # Data and labels will be populated and returned.
     data = np.zeros((sample_count, model_settings['fingerprint_size']))
-    labels = np.zeros((sample_count, model_settings['label_count']))
+    labels = np.zeros(sample_count)
     desired_samples = model_settings['desired_samples']
     use_background = self.background_data and (mode == 'training')
     pick_deterministically = (mode != 'training')
@@ -482,7 +482,7 @@ class AudioProcessor(object):
       # Run the graph to produce the output audio.
       data[i - offset, :] = sess.run(self.mfcc_, feed_dict=input_dict).flatten()
       label_index = self.word_to_index[sample['label']]
-      labels[i - offset, label_index] = 1
+      labels[i - offset] = label_index
     return data, labels
 
   def get_unprocessed_data(self, how_many, model_settings, mode):
diff --git a/tensorflow/examples/speech_commands/models.py b/tensorflow/examples/speech_commands/models.py
index 82d6a94ea1b16c37f855c21cc4d184ad7cac9d0e..ab611f414a8afa1f08b955918071b04ae0ef88db 100644
--- a/tensorflow/examples/speech_commands/models.py
+++ b/tensorflow/examples/speech_commands/models.py
@@ -326,7 +326,7 @@ def create_low_latency_conv_model(fingerprint_input, model_settings,
   first_filter_height = input_time_size
   first_filter_count = 186
   first_filter_stride_x = 1
-  first_filter_stride_y = 4
+  first_filter_stride_y = 1
   first_weights = tf.Variable(
       tf.truncated_normal(
           [first_filter_height, first_filter_width, 1, first_filter_count],
diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py
index a54bcbdb3238933a76b8605649b89a49d8997579..a4e80041f82191d7c58a3e52c929340eb604ec9d 100644
--- a/tensorflow/examples/speech_commands/train.py
+++ b/tensorflow/examples/speech_commands/train.py
@@ -133,7 +133,7 @@ def main(_):
 
   # Define loss and optimizer
   ground_truth_input = tf.placeholder(
-      tf.float32, [None, label_count], name='groundtruth_input')
+      tf.int64, [None], name='groundtruth_input')
 
   # Optionally we can add runtime checks to spot when NaNs or other symptoms of
   # numerical errors start occurring during training.
@@ -144,9 +144,8 @@ def main(_):
 
   # Create the back propagation and training evaluation machinery in the graph.
   with tf.name_scope('cross_entropy'):
-    cross_entropy_mean = tf.reduce_mean(
-        tf.nn.softmax_cross_entropy_with_logits(
-            labels=ground_truth_input, logits=logits))
+    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
+        labels=ground_truth_input, logits=logits)
   tf.summary.scalar('cross_entropy', cross_entropy_mean)
   with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
     learning_rate_input = tf.placeholder(
@@ -154,13 +153,13 @@ def main(_):
     train_step = tf.train.GradientDescentOptimizer(
         learning_rate_input).minimize(cross_entropy_mean)
   predicted_indices = tf.argmax(logits, 1)
-  expected_indices = tf.argmax(ground_truth_input, 1)
-  correct_prediction = tf.equal(predicted_indices, expected_indices)
-  confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
+  correct_prediction = tf.equal(predicted_indices, ground_truth_input)
+  confusion_matrix = tf.confusion_matrix(
+      ground_truth_input, predicted_indices, num_classes=label_count)
   evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', evaluation_step)
 
-  global_step = tf.contrib.framework.get_or_create_global_step()
+  global_step = tf.train.get_or_create_global_step()
   increment_global_step = tf.assign(global_step, global_step + 1)
 
   saver = tf.train.Saver(tf.global_variables())
diff --git a/tensorflow/examples/tutorials/deepdream/README.md b/tensorflow/examples/tutorials/deepdream/README.md
index 3a715f622488d260834db6b35a0da4c7ccdcd9c0..403e4b34f9bbf161cb2aad614f352679443595d4 100644
--- a/tensorflow/examples/tutorials/deepdream/README.md
+++ b/tensorflow/examples/tutorials/deepdream/README.md
@@ -2,7 +2,7 @@
 
 by [Alexander Mordvintsev](mailto:moralex@google.com)
 
-This directory contains Jupyter notebook that demonstrates a number of Convolutional Neural Network 
+This directory contains Jupyter notebook that demonstrates a number of Convolutional Neural Network
 image generation techniques implemented with TensorFlow:
 
 - visualizing individual feature channels and their combinations to explore the space of patterns learned by the neural network (see [GoogLeNet](http://storage.googleapis.com/deepdream/visualz/tensorflow_inception/index.html) and [VGG16](http://storage.googleapis.com/deepdream/visualz/vgg16/index.html) galleries)
@@ -11,8 +11,8 @@ image generation techniques implemented with TensorFlow:
 - using Laplacian Pyramid Gradient Normalization to produce smooth and colorful visuals at low cost
 - generating DeepDream-like images with TensorFlow
 
-You can view "deepdream.ipynb" directly on GitHub. Note that GitHub Jupyter notebook preview removes 
-embedded graph visualizations. You can still see them online 
+You can view "deepdream.ipynb" directly on GitHub. Note that GitHub Jupyter notebook preview removes
+embedded graph visualizations. You can still see them online
 [using nbviewer](http://nbviewer.jupyter.org/github/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/deepdream/deepdream.ipynb)
 service.
 
@@ -23,5 +23,5 @@ In order to run the notebook locally, the following dependencies must be install
 - NumPy
 - Jupyter Notebook
 
-To open the notebook, run `ipython notebook` command in this directory, and 
+To open the notebook, run `ipython notebook` command in this directory, and
 select 'deepdream.ipynb' in the opened browser window.
diff --git a/tensorflow/examples/tutorials/layers/cnn_mnist.py b/tensorflow/examples/tutorials/layers/cnn_mnist.py
index 2124843fcb21d0c4a28ef9a11aba012a5a116e84..1e8d7d05e1c6af08d788857e74c04134333d019c 100644
--- a/tensorflow/examples/tutorials/layers/cnn_mnist.py
+++ b/tensorflow/examples/tutorials/layers/cnn_mnist.py
@@ -97,9 +97,7 @@ def cnn_model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
 
   # Calculate Loss (for both TRAIN and EVAL modes)
-  onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=10)
-  loss = tf.losses.softmax_cross_entropy(
-      onehot_labels=onehot_labels, logits=logits)
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
   # Configure the Training Op (for TRAIN mode)
   if mode == tf.estimator.ModeKeys.TRAIN:
diff --git a/tensorflow/examples/tutorials/mnist/mnist.py b/tensorflow/examples/tutorials/mnist/mnist.py
index 3585043a2a9f1920422c50cd60ce18fcfa646419..7cedd0e264f35ac4ab924c93032b019e2aae78cf 100644
--- a/tensorflow/examples/tutorials/mnist/mnist.py
+++ b/tensorflow/examples/tutorials/mnist/mnist.py
@@ -94,9 +94,7 @@ def loss(logits, labels):
     loss: Loss tensor of type float.
   """
   labels = tf.to_int64(labels)
-  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
-      labels=labels, logits=logits, name='xentropy')
-  return tf.reduce_mean(cross_entropy, name='xentropy_mean')
+  return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
 
 
 def training(loss, learning_rate):
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
index a4dbab5123d49ee97445a5921a14bd1764593025..1e0294db27bc675870afceca77a2cdcd4b3f5ad3 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -125,27 +125,27 @@ def bias_variable(shape):
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # Build the graph for the deep net
   y_conv, keep_prob = deepnn(x)
 
   with tf.name_scope('loss'):
-    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
-                                                            logits=y_conv)
+    cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+        labels=y_, logits=y_conv)
   cross_entropy = tf.reduce_mean(cross_entropy)
 
   with tf.name_scope('adam_optimizer'):
     train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
 
   with tf.name_scope('accuracy'):
-    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
+    correct_prediction = tf.equal(tf.argmax(y_conv, 1), y_)
     correct_prediction = tf.cast(correct_prediction, tf.float32)
   accuracy = tf.reduce_mean(correct_prediction)
 
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax.py b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
index addd2d3810219f70ffb5f7c919f01de35dd816d9..fb3ac942039e670fb5ca975c5d9835ba065190a2 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax.py
@@ -34,7 +34,7 @@ FLAGS = None
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
@@ -43,7 +43,7 @@ def main(_):
   y = tf.matmul(x, W) + b
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # The raw formulation of cross-entropy,
   #
@@ -52,10 +52,9 @@ def main(_):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
   # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
   train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
   sess = tf.InteractiveSession()
@@ -66,7 +65,7 @@ def main(_):
     sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
   # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                       y_: mnist.test.labels}))
diff --git a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
index eaff05913af756c6ab0bf80e8f0893b1d239d60d..e89317494f9b7171a93b2706d9d612d456ddf937 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_softmax_xla.py
@@ -32,7 +32,7 @@ FLAGS = None
 
 def main(_):
   # Import data
-  mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+  mnist = input_data.read_data_sets(FLAGS.data_dir)
 
   # Create the model
   x = tf.placeholder(tf.float32, [None, 784])
@@ -41,7 +41,7 @@ def main(_):
   y = tf.matmul(x, w) + b
 
   # Define loss and optimizer
-  y_ = tf.placeholder(tf.float32, [None, 10])
+  y_ = tf.placeholder(tf.int64, [None])
 
   # The raw formulation of cross-entropy,
   #
@@ -50,10 +50,9 @@ def main(_):
   #
   # can be numerically unstable.
   #
-  # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
-  # outputs of 'y', and then average across the batch.
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+  # So here we use tf.losses.sparse_softmax_cross_entropy on the raw
+  # logit outputs of 'y', and then average across the batch.
+  cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
   train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
 
   config = tf.ConfigProto()
@@ -86,7 +85,7 @@ def main(_):
       sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
 
   # Test trained model
-  correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+  correct_prediction = tf.equal(tf.argmax(y, 1), y_)
   accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   print(sess.run(accuracy,
                  feed_dict={x: mnist.test.images,
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index c401d09df8ca5132178ab31e3b14b3a5cf98e70d..7967e22d6a0319a530cb2f00e54872f022ac0095 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -38,7 +38,6 @@ FLAGS = None
 def train():
   # Import data
   mnist = input_data.read_data_sets(FLAGS.data_dir,
-                                    one_hot=True,
                                     fake_data=FLAGS.fake_data)
 
   sess = tf.InteractiveSession()
@@ -47,7 +46,7 @@ def train():
   # Input placeholders
   with tf.name_scope('input'):
     x = tf.placeholder(tf.float32, [None, 784], name='x-input')
-    y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')
+    y_ = tf.placeholder(tf.int64, [None], name='y-input')
 
   with tf.name_scope('input_reshape'):
     image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
@@ -117,12 +116,12 @@ def train():
     #
     # can be numerically unstable.
     #
-    # So here we use tf.nn.softmax_cross_entropy_with_logits on the
-    # raw outputs of the nn_layer above, and then average across
+    # So here we use tf.losses.sparse_softmax_cross_entropy on the
+    # raw logit outputs of the nn_layer above, and then average across
     # the batch.
-    diff = tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y)
     with tf.name_scope('total'):
-      cross_entropy = tf.reduce_mean(diff)
+      cross_entropy = tf.losses.sparse_softmax_cross_entropy(
+          labels=y_, logits=y)
   tf.summary.scalar('cross_entropy', cross_entropy)
 
   with tf.name_scope('train'):
@@ -131,7 +130,7 @@ def train():
 
   with tf.name_scope('accuracy'):
     with tf.name_scope('correct_prediction'):
-      correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
+      correct_prediction = tf.equal(tf.argmax(y, 1), y_)
     with tf.name_scope('accuracy'):
       accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
   tf.summary.scalar('accuracy', accuracy)
diff --git a/tensorflow/examples/udacity/1_notmnist.ipynb b/tensorflow/examples/udacity/1_notmnist.ipynb
index 39674e1aa49ad70216b778444d2448d89f44d952..dffe5d37c64c33fe3d5ce632ad4671abe0b6f673 100644
--- a/tensorflow/examples/udacity/1_notmnist.ipynb
+++ b/tensorflow/examples/udacity/1_notmnist.ipynb
@@ -46,13 +46,13 @@
         "# These are all the modules we'll be using later. Make sure you can import them\n",
         "# before proceeding further.\n",
         "from __future__ import print_function\n",
+        "import imageio\n",
         "import matplotlib.pyplot as plt\n",
         "import numpy as np\n",
         "import os\n",
         "import sys\n",
         "import tarfile\n",
         "from IPython.display import display, Image\n",
-        "from scipy import ndimage\n",
         "from sklearn.linear_model import LogisticRegression\n",
         "from six.moves.urllib.request import urlretrieve\n",
         "from six.moves import cPickle as pickle\n",
@@ -325,13 +325,13 @@
         "  for image in image_files:\n",
         "    image_file = os.path.join(folder, image)\n",
         "    try:\n",
-        "      image_data = (ndimage.imread(image_file).astype(float) - \n",
+        "      image_data = (imageio.imread(image_file).astype(float) - \n",
         "                    pixel_depth / 2) / pixel_depth\n",
         "      if image_data.shape != (image_size, image_size):\n",
         "        raise Exception('Unexpected image shape: %s' % str(image_data.shape))\n",
         "      dataset[num_images, :, :] = image_data\n",
         "      num_images = num_images + 1\n",
-        "    except IOError as e:\n",
+        "    except (IOError, ValueError) as e:\n",
         "      print('Could not read:', image_file, ':', e, '- it\\'s ok, skipping.')\n",
         "    \n",
         "  dataset = dataset[0:num_images, :, :]\n",
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index 6faad294c2df59f480ed15e7cf3f216311d553bc..f80c56d1c181edcb26c93c01bf9ba4e486c6d146 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -43,15 +43,15 @@ In addition, you may need to pass `--memory=8g` as an extra argument to
 `docker-machine` is a tool to provision and manage docker hosts, it supports multiple platform (ex. aws, gce, azure, virtualbox, ...). To create a new virtual machine locally with built-in docker engine, you can use
 
     docker-machine create -d virtualbox --virtualbox-memory 8196 tensorflow
-    
+
 `-d` means the driver for the cloud platform, supported drivers listed [here](https://docs.docker.com/machine/drivers/). Here we use virtualbox to create a new virtual machine locally. `tensorflow` means the name of the virtual machine, feel free to use whatever you like. You can use
 
     docker-machine ip tensorflow
-    
+
 to get the ip of the new virtual machine. To switch from default virtual machine to a new one (here we use tensorflow), type
 
     eval $(docker-machine env tensorflow)
-    
+
 Note that `docker-machine env tensorflow` outputs some environment variables such like `DOCKER_HOST`. Then your docker client is now connected to the docker host in virtual machine `tensorflow`
 
 * **I'm getting a TLS connection error.**
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index c69a3596378f7cffb087e32461134b308f518792..4a429837b7b997f0f6571060280a9a15543b9f54 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -53,7 +53,8 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
   //  - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
   //  - Reshapes the tensor so that it's [height, width, 1] for imaging.
   //  - Encodes it as a PNG stream and saves it out to a file.
-  Output file_reader = ReadFile(root.WithOpName("input_wav"), input_wav);
+  Output file_reader =
+      tensorflow::ops::ReadFile(root.WithOpName("input_wav"), input_wav);
   DecodeWav wav_decoder =
       DecodeWav(root.WithOpName("wav_decoder"), file_reader);
   Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
@@ -69,10 +70,10 @@ tensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
   Output expand_dims =
       ExpandDims(root.WithOpName("expand_dims"), cast, expand_dims_const);
   Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
-                           Squeeze::Attrs().SqueezeDims({0}));
+                           Squeeze::Attrs().Axis({0}));
   Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
-  WriteFile file_writer =
-      WriteFile(root.WithOpName("output_image"), output_image, png_encoder);
+  tensorflow::ops::WriteFile file_writer = tensorflow::ops::WriteFile(
+      root.WithOpName("output_image"), output_image, png_encoder);
   tensorflow::GraphDef graph;
   TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
 
diff --git a/tensorflow/g3doc/README.txt b/tensorflow/g3doc/README.txt
index 6eaf1e1bda1e6c43df96195a682961cd28dc177b..ed648f8b6b8895010be84becd4fda25ded5859fb 100644
--- a/tensorflow/g3doc/README.txt
+++ b/tensorflow/g3doc/README.txt
@@ -7,7 +7,7 @@ Documentation (on Github, tensorflow.org, and anywhere else we decide to
 serve it from) is now generated from the files in
 tensorflow/docs_src/ (for tutorials and other guides) and
 TensorFlow source code (for the API reference pages). If you see a problem with
-API reference, edit the code comments in the appropriate language. If you see a 
+API reference, edit the code comments in the appropriate language. If you see a
 problem with our other docs, edit the files in docs_src.
 
 To preview the results of your changes, or generate an offline copy of
diff --git a/tensorflow/go/android.go b/tensorflow/go/android.go
new file mode 100644
index 0000000000000000000000000000000000000000..3db3ddfec5cc16dbb47bc847513989dfd3810ea3
--- /dev/null
+++ b/tensorflow/go/android.go
@@ -0,0 +1,20 @@
+// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build android
+
+package tensorflow
+
+// #cgo LDFLAGS: -landroid -llog -lm -lz -ldl
+import "C"
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index 46c600eab17c6c467d0b3a3312f848541f382e80..fc087d9d995dfe031e61fd0fa15d649c2ee35cc9 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -20,6 +20,25 @@ package tensorflow
 //
 // #include <stdlib.h>
 // #include <string.h>
+//
+// void TF_SetAttrShapeList_Helper(TF_OperationDescription* desc,
+//                                 const char* attr_name,
+//                                 const int64_t* flat_dims,
+//                                 const int* num_dims,
+//                                 int num_shapes) {
+//  const int64_t** dims =
+//    (const int64_t**)malloc(sizeof(const int64_t*) * num_shapes);
+//  int i = 0;
+//  for (i = 0; i < num_shapes; i++) {
+//    dims[i] = flat_dims;
+//    if (num_dims[i] > 0) {
+//      // flat_dims will be NULL iff num_shapes is 0 or all elements in num_dims are <= 0.
+//      flat_dims += num_dims[i];
+//    }
+//  }
+//  TF_SetAttrShapeList(desc, attr_name, dims, num_dims, num_shapes);
+//  free(dims);
+// }
 import "C"
 
 import (
@@ -114,6 +133,20 @@ func (g *Graph) Operation(name string) *Operation {
 	return &Operation{cop, g}
 }
 
+// Operations returns a list of all operations in the graph
+func (g *Graph) Operations() []Operation {
+	var pos C.size_t = 0
+	ops := []Operation{}
+	for {
+		cop := C.TF_GraphNextOperation(g.c, &pos)
+		if cop == nil {
+			break
+		}
+		ops = append(ops, Operation{cop, g})
+	}
+	return ops
+}
+
 // OpSpec is the specification of an Operation to be added to a Graph
 // (using Graph.AddOperation).
 type OpSpec struct {
@@ -289,41 +322,37 @@ func setAttr(cdesc *C.TF_OperationDescription, status *status, name string, valu
 			return fmt.Errorf("bad value for attribute %q: %v", name, err)
 		}
 	case Shape:
-		ndims, dims := cshape(value)
+		ndims := C.int(value.NumDimensions())
 		var dimsp *C.int64_t
 		if ndims > 0 {
+			dims := make([]C.int64_t, ndims)
+			for i, d := range value.dims {
+				dims[i] = C.int64_t(d)
+			}
 			dimsp = &dims[0]
 		}
 		C.TF_SetAttrShape(cdesc, cAttrName, dimsp, ndims)
 	case []Shape:
-		ndims := make([]C.int, len(value))
-		dims := make([][]C.int64_t, len(value))
-		dimsp := make([]*C.int64_t, len(value))
-		for i, s := range value {
-			ndims[i], dims[i] = cshape(s)
-			if ndims[i] > 0 {
-				dimsp[i] = &dims[i][0]
-			}
-		}
-		if len(value) > 0 {
-			C.TF_SetAttrShapeList(cdesc, cAttrName, &dimsp[0], &ndims[0], C.int(len(value)))
-		} else {
+		if len(value) == 0 {
 			C.TF_SetAttrShapeList(cdesc, cAttrName, nil, nil, 0)
+		} else {
+			var flatDims []C.int64_t
+			ndims := make([]C.int, len(value))
+			for i, s := range value {
+				nd := s.NumDimensions()
+				ndims[i] = C.int(nd)
+				for _, d := range s.dims {
+					flatDims = append(flatDims, C.int64_t(d))
+				}
+			}
+			var flatDimsp *C.int64_t
+			if len(flatDims) > 0 {
+				flatDimsp = &flatDims[0]
+			}
+			C.TF_SetAttrShapeList_Helper(cdesc, cAttrName, flatDimsp, &ndims[0], C.int(len(value)))
 		}
 	default:
 		return fmt.Errorf("attribute %q has a type (%T) which is not valid for operation attributes", name, value)
 	}
 	return nil
 }
-
-func cshape(s Shape) (C.int, []C.int64_t) {
-	ndims := C.int(s.NumDimensions())
-	if ndims < 0 {
-		return -1, nil
-	}
-	dims := make([]C.int64_t, ndims)
-	for i, s := range s.dims {
-		dims[i] = C.int64_t(s)
-	}
-	return ndims, dims
-}
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index c3120bc720308402b22884f29b7ff87ef035874b..b8d65c54f697153ad236f5e27d9f27d048c3a22e 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -29,10 +29,26 @@ func hasOperations(g *Graph, ops ...string) error {
 			missing = append(missing, op)
 		}
 	}
-	if len(missing) == 0 {
-		return nil
+	if len(missing) != 0 {
+		return fmt.Errorf("Graph does not have the operations %v", missing)
 	}
-	return fmt.Errorf("Graph does not have the operations %v", missing)
+
+	inList := map[string]bool{}
+	for _, op := range g.Operations() {
+		inList[op.Name()] = true
+	}
+
+	for _, op := range ops {
+		if !inList[op] {
+			missing = append(missing, op)
+		}
+	}
+
+	if len(missing) != 0 {
+		return fmt.Errorf("Operations %v are missing from graph.Operations()", missing)
+	}
+
+	return nil
 }
 
 func TestGraphWriteToAndImport(t *testing.T) {
diff --git a/tensorflow/go/op/op_test.go b/tensorflow/go/op/op_test.go
index 2451ba360699a7ac24f64209339e7b4f92ffb548..842dee9ffe396c44cfa26bbc7fd34a598e62bf89 100644
--- a/tensorflow/go/op/op_test.go
+++ b/tensorflow/go/op/op_test.go
@@ -58,3 +58,76 @@ func TestAddOperationFailure(t *testing.T) {
 	_ = resize.Shape()
 	t.Errorf("resize.Shape() should have paniced since the underlying Operation was not created")
 }
+
+func TestShapeAttribute(t *testing.T) {
+	s := NewScope()
+	x := Placeholder(s.SubScope("x"), tf.Int32, PlaceholderShape(tf.MakeShape(1)))
+	y := Placeholder(s.SubScope("y"), tf.Int32, PlaceholderShape(tf.Shape{}))
+	z := Add(s, x, y)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	value, err := tf.NewTensor([]int32{7})
+	if err != nil {
+		t.Fatal(err)
+	}
+	feeds := map[tf.Output]*tf.Tensor{
+		x: value,
+		y: value,
+	}
+	fetched, err := sess.Run(feeds, []tf.Output{z}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got, want := len(fetched), 1; got != want {
+		t.Fatalf("Fetched %d tensors, expected %d", got, want)
+	}
+	if got, want := fetched[0].Value().([]int32), []int32{14}; len(got) != len(want) || len(got) != 1 || got[0] != want[0] {
+		t.Fatalf("Got %v, want %v", got, want)
+	}
+}
+
+func TestDataset(t *testing.T) {
+	var (
+		s = NewScope()
+
+		// The use of a non-scalar here is inspired by
+		// https://github.com/tensorflow/tensorflow/issues/14891
+		c       = Const(s, []int32{21718, 31415})
+		types   = []tf.DataType{c.DataType()}
+		shapes  = []tf.Shape{c.Shape()}
+		dataset = TensorDataset(s, []tf.Output{c}, shapes)
+
+		iterator = Iterator(s, "", "", types, shapes)
+		next     = IteratorGetNext(s, iterator, types, shapes)
+		init     = MakeIterator(s, dataset, iterator)
+	)
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err := sess.Run(nil, nil, []*tf.Operation{init}); err != nil {
+		t.Fatal(err)
+	}
+	results, err := sess.Run(nil, next, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	got := results[0].Value().([]int32)
+	if len(got) != 2 || got[0] != 21718 || got[1] != 31415 {
+		t.Errorf("Got %v, want {21718, 31415}", got)
+	}
+	if _, err := sess.Run(nil, next, nil); err == nil {
+		t.Errorf("Expected sess.Run() to fail since the iterator should have reached the end of the dataset")
+	}
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index f3160969630db48b0c6562f1d143c188c1116564..664e37d3a15ef250e3ef90b3201504c108c5c55b 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -44,19 +44,42 @@ func makeOutputList(op *tf.Operation, start int, output string) ([]tf.Output, in
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tag: Tag for the summary.
 //	value: Value for the summary.
 //
 // Returns the created operation.
-func WriteScalarSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
+func WriteScalarSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteScalarSummary",
 		Input: []tf.Input{
-			writer, global_step, tag, value,
+			writer, step, tag, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Outputs a `tf.Event` protocol buffer.
+//
+// When CreateSummaryDbWriter is being used, this op can be useful for
+// importing data from event logs.
+//
+// Arguments:
+//	writer: A handle to a summary writer.
+//	event: A string containing a binary-encoded tf.Event proto.
+//
+// Returns the created operation.
+func ImportEvent(scope *Scope, writer tf.Output, event tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ImportEvent",
+		Input: []tf.Input{
+			writer, event,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -66,21 +89,21 @@ func WriteScalarSummary(scope *Scope, writer tf.Output, global_step tf.Output, t
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tensor: A tensor to serialize.
 //	tag: The summary's tag.
 //	summary_metadata: Serialized SummaryMetadata protocol buffer containing
 // plugin-related metadata for this summary.
 //
 // Returns the created operation.
-func WriteSummary(scope *Scope, writer tf.Output, global_step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) {
+func WriteSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output, tag tf.Output, summary_metadata tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteSummary",
 		Input: []tf.Input{
-			writer, global_step, tensor, tag, summary_metadata,
+			writer, step, tensor, tag, summary_metadata,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -2124,19 +2147,19 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tag: Scalar.  Tag to use for the `Summary.Value`.
 //	values: Any shape. Values to use to build the histogram.
 //
 // Returns the created operation.
-func WriteHistogramSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
+func WriteHistogramSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
 		Type: "WriteHistogramSummary",
 		Input: []tf.Input{
-			writer, global_step, tag, values,
+			writer, step, tag, values,
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -2658,21 +2681,6 @@ func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 	return scope.AddOperation(opspec)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
-}
-
 // SpaceToDepthAttr is an optional argument to SpaceToDepth.
 type SpaceToDepthAttr func(optionalAttr)
 
@@ -5311,6 +5319,21 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged
 	return op.Output(0)
 }
 
+// Produces a summary of any statistics recorded by the given statistics manager.
+func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
 type FIFOQueueV2Attr func(optionalAttr)
 
@@ -5927,6 +5950,23 @@ func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.
 	return scope.AddOperation(opspec)
 }
 
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "LatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Concatenates tensors along one dimension.
 //
 // Arguments:
@@ -6123,6 +6163,43 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
+type StatsAggregatorHandleAttr func(optionalAttr)
+
+// StatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a statistics manager resource.
+func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatsAggregatorHandle",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
 type CropAndResizeGradBoxesAttr func(optionalAttr)
 
@@ -7996,99 +8073,106 @@ func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
-
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+// Computes the matrix exponential of one or more square matrices:
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes second-order gradients of the maxpooling function.
+// exp(A) = \sum_{n=0}^\infty A^n/n!
+//
+// The exponential is computed using a combination of the scaling and squaring
+// method and the Pade approximation. Details can be founds in:
+// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.expm
+// @end_compatibility
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
-type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
-// If not specified, defaults to -6
-func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["min"] = value
+// Merges summaries.
+//
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
+//
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
+//
+// Arguments:
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
-// If not specified, defaults to 6
-func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["max"] = value
+	opspec := tf.OpSpec{
+		Type: "MergeSummary",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Compute gradients for a FakeQuantWithMinMaxArgs operation.
+// Outputs a `Summary` protocol buffer with audio.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
-// `gradients * (inputs >= min && inputs <= max)`.
-func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8097,9 +8181,9 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxArgsGradient",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			gradients, inputs,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -8107,184 +8191,37 @@ func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs t
 	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
+
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the input.
 //
 // Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
-		Input: []tf.Input{
-			input, grad, argmax,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
-//
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
-		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
-
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs 3D average pooling on the input.
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Produces the max pool of the input tensor for quantized types.
-//
-// Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
-		Input: []tf.Input{
-			input, min_input, max_input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8738,32 +8675,6 @@ func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
-//
-// This is a deprecated version of BiasAdd and will be soon removed.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
-//
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
-		Input: []tf.Input{
-			value, bias,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // EncodeJpegAttr is an optional argument to EncodeJpeg.
 type EncodeJpegAttr func(optionalAttr)
 
@@ -11114,13 +11025,13 @@ func WriteAudioSummaryMaxOutputs(value int64) WriteAudioSummaryAttr {
 //
 // Arguments:
 //	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
+//	step: The step to write the summary for.
 //	tag: Scalar. Used to build the `tag` attribute of the summary values.
 //	tensor: 2-D of shape `[batch_size, frames]`.
 //	sample_rate: The sample rate of the signal in hertz.
 //
 // Returns the created operation.
-func WriteAudioSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
+func WriteAudioSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...WriteAudioSummaryAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11131,7 +11042,7 @@ func WriteAudioSummary(scope *Scope, writer tf.Output, global_step tf.Output, ta
 	opspec := tf.OpSpec{
 		Type: "WriteAudioSummary",
 		Input: []tf.Input{
-			writer, global_step, tag, tensor, sample_rate,
+			writer, step, tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -13141,69 +13052,292 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
-	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
-		Input: []tf.Input{
-			t, m, v, beta, gamma,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
+// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
+type WriteImageSummaryAttr func(optionalAttr)
 
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["max_images"] = value
 	}
 }
 
-// Performs max pooling on the input.
-//
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Writes a `Summary` protocol buffer with images.
+//
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
+//
+// Arguments:
+//	writer: A handle to a summary writer.
+//	step: The step to write the summary for.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
+//	bad_color: Color to use for pixels with non-finite values.
+//
+// Returns the created operation.
+func WriteImageSummary(scope *Scope, writer tf.Output, step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WriteImageSummary",
+		Input: []tf.Input{
+			writer, step, tag, tensor, bad_color,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Pads a tensor with zeros.
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the number of elements in the given queue.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueSizeV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Outputs a `Summary` protocol buffer with a histogram.
+//
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
+//
+// This op reports an `InvalidArgument` error if any value is not finite.
+//
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramSummary",
+		Input: []tf.Input{
+			tag, values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that emits the lines of one or more text files.
+//
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TextLineDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of records this Reader has produced.
+//
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumRecordsProducedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes exponential of x - 1 element-wise.
+//
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Expm1",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Batch normalization.
+//
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+//
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	opspec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalization",
+		Input: []tf.Input{
+			t, m, v, beta, gamma,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
 // Returns The max pooled output tensor.
 func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
@@ -13297,35 +13431,6 @@ func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataTyp
 	return key, values
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
-//
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
-//
-// Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MergeSummary",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Read an element from the TensorArray into output `value`.
 //
 // Arguments:
@@ -13999,8 +14104,73 @@ func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
 //	col_pooling_sequence: column pooling sequence, form pooling region with
 // row_pooling sequence.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Serves as a control trigger for scheduling.
+//
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
+
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14009,14 +14179,13 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Reorders a SparseTensor into the canonical, row-major ordering.
@@ -14164,6 +14333,133 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the QR decompositions of one or more matrices.
+//
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+//
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Qr",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+//
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+//
+// Arguments:
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AudioSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Reverses specific dimensions of a tensor.
 //
 // NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
@@ -14684,6 +14980,24 @@ func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.O
 	return op.Output(0), op.Output(1)
 }
 
+// Returns x - y element-wise.
+//
+// *NOTE*: `Sub` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Get the value of the tensor specified by its handle.
 //
 // Arguments:
@@ -14879,6 +15193,21 @@ func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
 	}
 }
 
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
+
 // TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
 //
 // value: Overrides the name used for the temporary tensor_array
@@ -16791,6 +17120,29 @@ func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Deserialize `SparseTensor` from a (serialized) string 3-vector (1-D `Tensor`)
+//
+// object.
+//
+// Arguments:
+//	serialized_sparse: 1-D, The serialized `SparseTensor` object. Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` object.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeSparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // Elementwise computes the bitwise XOR of `x` and `y`.
 //
 // The result will have those bits set, that are different in `x` and `y`. The
@@ -17526,54 +17878,6 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
-
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of average pooling function.
-//
-// Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Inverse fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform over the
@@ -17815,14 +18119,63 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "Cumprod",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
+
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			x, axis,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
@@ -17963,6 +18316,32 @@ func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Outpu
 	return op.Output(0), op.Output(1)
 }
 
+// Adds `bias` to `value`.
+//
+// This is a deprecated version of BiasAdd and will be soon removed.
+//
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
+//
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
 type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
@@ -18064,270 +18443,107 @@ func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	}
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
-
-// CumsumExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
-
-// CumsumReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
-//
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Cumsum",
-		Input: []tf.Input{
-			x, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// WriteImageSummaryAttr is an optional argument to WriteImageSummary.
-type WriteImageSummaryAttr func(optionalAttr)
-
-// WriteImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func WriteImageSummaryMaxImages(value int64) WriteImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
-
-// Writes a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
-//
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
-//
-// Arguments:
-//	writer: A handle to a summary writer.
-//	global_step: The step to write the summary for.
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
-//	bad_color: Color to use for pixels with non-finite values.
-//
-// Returns the created operation.
-func WriteImageSummary(scope *Scope, writer tf.Output, global_step tf.Output, tag tf.Output, tensor tf.Output, bad_color tf.Output, optional ...WriteImageSummaryAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "WriteImageSummary",
-		Input: []tf.Input{
-			writer, global_step, tag, tensor, bad_color,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
-//
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pad",
-		Input: []tf.Input{
-			input, paddings,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the number of elements in the given queue.
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	handle: The handle to a queue.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			handle,
+			features, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
+//
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			tag, values,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -18445,12 +18661,32 @@ func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
+
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
+//
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+	return func(m optionalAttr) {
+		m["old_vocab_size"] = value
+	}
+}
+
 // Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
 // length `num_new_vocab`, where `remapping[i]` contains the row number in the old
 // vocabulary that corresponds to row `i` in the new vocabulary (starting at line
 // `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  `num_vocab_offset` enables
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
+//
+// `num_vocab_offset` enables
 // use in the partitioned variable case, and should generally be set through
 // examining partitioning info.  The format of the files should be a text file,
 // with each line containing a single entity within the vocabulary.
@@ -18481,11 +18717,14 @@ func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output,
 // Returns A Tensor of length num_new_vocab where the element at index i
 // is equal to the old ID that maps to the new ID i.  This element is -1 for any
 // new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64) (remapping tf.Output, num_present tf.Output) {
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
@@ -18891,29 +19130,180 @@ func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output
 	opspec := tf.OpSpec{
 		Type: "Substr",
 		Input: []tf.Input{
-			input, pos, len,
+			input, pos, len,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
+
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessRandomNormal",
+		Input: []tf.Input{
+			shape, seed,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
+
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
+//
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
+//
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueWithCounts",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
+
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+//
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
+//
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
+//
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+//
+// Arguments:
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RestoreSlice",
+		Input: []tf.Input{
+			file_pattern, tensor_name, shape_and_slice,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
 // value: The type of the output.
 // If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
 	return func(m optionalAttr) {
 		m["dtype"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // The outputs are a deterministic function of `shape` and `seed`.
 //
@@ -18922,7 +19312,7 @@ func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 //	seed: 2 seeds (shape [2]).
 //
 // Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18931,7 +19321,7 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
 			shape, seed,
 		},
@@ -19088,6 +19478,22 @@ func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value
 	return op.Output(0)
 }
 
+// Associates the given iterator with the given statistics aggregator.
+//
+// Returns the created operation.
+func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IteratorSetStatsAggregator",
+		Input: []tf.Input{
+			iterator_handle, stats_aggregator_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
 type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
@@ -19192,10 +19598,6 @@ func Abs(scope *Scope, x tf.Output) (y tf.Output) {
 type RandomPoissonAttr func(optionalAttr)
 
 // RandomPoissonSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
 // If not specified, defaults to 0
 func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
@@ -19204,8 +19606,6 @@ func RandomPoissonSeed(value int64) RandomPoissonAttr {
 }
 
 // RandomPoissonSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
 func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
@@ -19213,28 +19613,9 @@ func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-//
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
-//
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+// Use RandomPoissonV2 instead.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`. The dtype of the output matches the dtype of
-// rate.
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
 func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -19852,6 +20233,119 @@ func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Creates a dataset that skips `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SkipDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
+//
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
+//
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Inverse real-valued fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
@@ -20272,85 +20766,30 @@ func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Outp
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
-// If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
-	}
-}
-
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
-//
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
-//
-// Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
-//
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dt": dt}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
-		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
-		},
-		Attrs: attrs,
+		m["signed_input"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
-
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["range_given"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// Quantizes then dequantizes a tensor.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20359,9 +20798,9 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			shape, seed,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -20369,123 +20808,123 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["data_format"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
-//
-// For example:
-//
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
-// ```
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	x: 1-D.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "QuantizedMaxPool",
 		Input: []tf.Input{
-			input_dataset, count,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
 
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["data_format"] = value
 	}
 }
 
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
+// Computes gradients of average pooling function.
 //
-// For example:
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			real, imag,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -20493,42 +20932,69 @@ func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAt
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
+// Writes a `GraphDef` protocol buffer to a `SummaryWriter`.
+//
+// Arguments:
+//	writer: Handle of `SummaryWriter`.
+//	step: The step to write the summary for.
+//	tensor: A scalar string of the serialized tf.GraphDef proto.
+//
+// Returns the created operation.
+func WriteGraphSummary(scope *Scope, writer tf.Output, step tf.Output, tensor tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "WriteGraphSummary",
+		Input: []tf.Input{
+			writer, step, tensor,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
+// Computes second-order gradients of the maxpooling function.
 //
-// For example:
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			input,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -20536,79 +21002,92 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
-//
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// FakeQuantWithMinMaxArgsGradientAttr is an optional argument to FakeQuantWithMinMaxArgsGradient.
+type FakeQuantWithMinMaxArgsGradientAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxArgsGradientMin sets the optional min attribute to value.
+// If not specified, defaults to -6
+func FakeQuantWithMinMaxArgsGradientMin(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["min"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+}
+
+// FakeQuantWithMinMaxArgsGradientMax sets the optional max attribute to value.
+// If not specified, defaults to 6
+func FakeQuantWithMinMaxArgsGradientMax(value float32) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["max"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
-	if scope.Err() != nil {
-		return
+// FakeQuantWithMinMaxArgsGradientNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxArgsGradientNumBits(value int64) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+}
+
+// FakeQuantWithMinMaxArgsGradientNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxArgsGradientNarrowRange(value bool) FakeQuantWithMinMaxArgsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Compute gradients for a FakeQuantWithMinMaxArgs operation.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxArgs operation.
+//
+// Returns Backpropagated gradients below the FakeQuantWithMinMaxArgs operation:
+// `gradients * (inputs >= min && inputs <= max)`.
+func FakeQuantWithMinMaxArgsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, optional ...FakeQuantWithMinMaxArgsGradientAttr) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "FakeQuantWithMinMaxArgsGradient",
 		Input: []tf.Input{
-			x,
+			gradients, inputs,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x - y element-wise.
+// Computes gradients of the maxpooling function.
 //
-// *NOTE*: `Sub` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Sub",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			x, y,
+			input, grad, argmax,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -22421,6 +22900,39 @@ func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input t
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// Creates summary database writer accessible by given resource handle.
+//
+// This can be used to write tensors from the execution graph directly
+// to a database. Only SQLite is supported right now. This function
+// will create the schema if it doesn't exist. Entries in the Users,
+// Experiments, and Runs tables will be created automatically if they
+// don't already exist.
+//
+// Arguments:
+//	writer: Handle to SummaryWriter resource to overwrite.
+//	db_uri: For example "file:/tmp/foo.sqlite".
+//	experiment_name: Can't contain ASCII control characters or <>. Case
+// sensitive. If empty, then the Run will not be associated with any
+// Experiment.
+//	run_name: Can't contain ASCII control characters or <>. Case sensitive.
+// If empty, then each Tag will not be associated with any Run.
+//	user_name: Must be valid as both a DNS label and Linux username. If
+// empty, then the Experiment will not be associated with any User.
+//
+// Returns the created operation.
+func CreateSummaryDbWriter(scope *Scope, writer tf.Output, db_uri tf.Output, experiment_name tf.Output, run_name tf.Output, user_name tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "CreateSummaryDbWriter",
+		Input: []tf.Input{
+			writer, db_uri, experiment_name, run_name, user_name,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
 type HistogramFixedWidthAttr func(optionalAttr)
 
@@ -23133,6 +23645,101 @@ func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
+
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
+//
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_random_seed"] = value
+	}
+}
+
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
+//
+// value: Shifts the list of files after the list is randomly
+// shuffled.
+// If not specified, defaults to 0
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_shuffle_shift_ratio"] = value
+	}
+}
+
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+//
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
+	}
+}
+
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+//
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
+	}
+}
+
+// RecordInputBatchSize sets the optional batch_size attribute to value.
+//
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["batch_size"] = value
+	}
+}
+
+// Emits randomized records.
+//
+// Arguments:
+//	file_pattern: Glob pattern for the data files.
+//
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RecordInput",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Rounds the values of a tensor to the nearest integer, element-wise.
+//
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Round",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Generates values in an interval.
 //
 // A sequence of `num` evenly-spaced values are generated beginning at `start`.
@@ -24019,7 +24626,7 @@ func NthElementReverse(value bool) NthElementAttr {
 	}
 }
 
-// Finds values of the `n`-th order statistic for the last dmension.
+// Finds values of the `n`-th order statistic for the last dimension.
 //
 // If the input is a vector (rank-1), finds the entries which is the nth-smallest
 // value in the vector and outputs their values as scalar tensor.
@@ -24479,177 +25086,67 @@ func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num
 	return op.Output(0)
 }
 
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Add",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
-//
-// See also `SaveSlices`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
-//
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Save",
-		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
-//
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
-//
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns x + y element-wise.
+//
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "Add",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Returns x + y element-wise.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AddV2",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+// Saves the input tensors to disk.
 //
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// See also `SaveSlices`.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "Save",
 		Input: []tf.Input{
-			tag, tensor,
+			filename, tensor_names, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // BiasAddAttr is an optional argument to BiasAdd.
@@ -24820,111 +25317,107 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 	return op.Output(0)
 }
 
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
-
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_random_seed"] = value
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Maximum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
+
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: Shifts the list of files after the list is randomly
-// shuffled.
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
+		m["seed"] = value
 	}
 }
 
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
+		m["seed2"] = value
 	}
 }
 
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+// Generates labels for candidate sampling with a log-uniform distribution.
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_parallelism"] = value
-	}
-}
-
-// RecordInputBatchSize sets the optional batch_size attribute to value.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["batch_size"] = value
-	}
-}
-
-// Emits randomized records.
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	file_pattern: Glob pattern for the data files.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Rounds the values of a tensor to the nearest integer, element-wise.
-//
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			x,
+			true_classes,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// Returns the truth value of (x < y) element-wise.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "Less",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -24933,89 +25426,119 @@ func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
+// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
+type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
+// value: The bitwidth of the quantization; between 2 and 8, inclusive.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["num_bits"] = value
+	}
+}
+
+// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
+//
+// value: Whether to quantize into 2^num_bits - 1 distinct values.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
+	}
+}
+
+// Compute gradients for a FakeQuantWithMinMaxVars operation.
+//
+// Arguments:
+//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
+//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
+// min, max: Quantization interval, scalar floats.
+//
+//
+//
+// Returns Backpropagated gradients w.r.t. inputs:
+// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
+// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
+// `sum(gradients * (inputs > max))`.
+func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FakeQuantWithMinMaxVarsGradient",
+		Input: []tf.Input{
+			gradients, inputs, min, max,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			true_classes,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
 // [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "Minimum",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -26886,127 +27409,6 @@ func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, val
 	return scope.AddOperation(opspec)
 }
 
-// FakeQuantWithMinMaxVarsGradientAttr is an optional argument to FakeQuantWithMinMaxVarsGradient.
-type FakeQuantWithMinMaxVarsGradientAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsGradientNumBits sets the optional num_bits attribute to value.
-//
-// value: The bitwidth of the quantization; between 2 and 8, inclusive.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsGradientNumBits(value int64) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// FakeQuantWithMinMaxVarsGradientNarrowRange sets the optional narrow_range attribute to value.
-//
-// value: Whether to quantize into 2^num_bits - 1 distinct values.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsGradientNarrowRange(value bool) FakeQuantWithMinMaxVarsGradientAttr {
-	return func(m optionalAttr) {
-		m["narrow_range"] = value
-	}
-}
-
-// Compute gradients for a FakeQuantWithMinMaxVars operation.
-//
-// Arguments:
-//	gradients: Backpropagated gradients above the FakeQuantWithMinMaxVars operation.
-//	inputs: Values passed as inputs to the FakeQuantWithMinMaxVars operation.
-// min, max: Quantization interval, scalar floats.
-//
-//
-//
-// Returns Backpropagated gradients w.r.t. inputs:
-// `gradients * (inputs >= min && inputs <= max)`.Backpropagated gradients w.r.t. min parameter:
-// `sum(gradients * (inputs < min))`.Backpropagated gradients w.r.t. max parameter:
-// `sum(gradients * (inputs > max))`.
-func FakeQuantWithMinMaxVarsGradient(scope *Scope, gradients tf.Output, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsGradientAttr) (backprops_wrt_input tf.Output, backprop_wrt_min tf.Output, backprop_wrt_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVarsGradient",
-		Input: []tf.Input{
-			gradients, inputs, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
 // Arguments:
@@ -27051,57 +27453,3 @@ func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_s
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
-
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
-	}
-}
-
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, sample_rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/go/operation_test.go b/tensorflow/go/operation_test.go
index 7cba043af29ca75fd8df95397116717f13ef8e31..40c951ab8c13f43e2063b9f9cfadcd44a6da72fe 100644
--- a/tensorflow/go/operation_test.go
+++ b/tensorflow/go/operation_test.go
@@ -123,6 +123,14 @@ func TestOutputDataTypeAndShape(t *testing.T) {
 			[]int64{2, 3},
 			Double,
 		},
+		{ // Matrix of Uint64
+			[][]uint64{
+				{1, 2, 3},
+				{4, 5, 6},
+			},
+			[]int64{2, 3},
+			Uint64,
+		},
 	}
 	for idx, test := range testdata {
 		t.Run(fmt.Sprintf("#%d Value %T", idx, test.Value), func(t *testing.T) {
diff --git a/tensorflow/go/tensor.go b/tensorflow/go/tensor.go
index 36a74c008132c3320259ab70ab5c778156d17733..2d25c04dc9b1d0bc2ae831f98c0879e73a6bfafa 100644
--- a/tensorflow/go/tensor.go
+++ b/tensorflow/go/tensor.go
@@ -101,7 +101,7 @@ func NewTensor(value interface{}) (*Tensor, error) {
 			return nil, bug("NewTensor incorrectly calculated the size of a tensor with type %v and shape %v as %v bytes instead of %v", dataType, shape, nbytes, buf.Len())
 		}
 	} else {
-		e := stringEncoder{offsets: buf, data: raw[nflattened*8 : len(raw)], status: newStatus()}
+		e := stringEncoder{offsets: buf, data: raw[nflattened*8:], status: newStatus()}
 		if err := e.encode(reflect.ValueOf(value), shape); err != nil {
 			return nil, err
 		}
@@ -207,6 +207,9 @@ func (t *Tensor) WriteContentsTo(w io.Writer) (int64, error) {
 func tensorData(c *C.TF_Tensor) []byte {
 	// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
 	cbytes := C.TF_TensorData(c)
+	if cbytes == nil {
+		return nil
+	}
 	length := int(C.TF_TensorByteSize(c))
 	slice := (*[1 << 30]byte)(unsafe.Pointer(cbytes))[:length:length]
 	return slice
@@ -267,7 +270,7 @@ func typeOf(dt DataType, shape []int64) reflect.Type {
 		}
 	}
 	if ret == nil {
-		panic(bug("DataType %v is not supported", dt))
+		panic(bug("DataType %v is not supported (see https://www.tensorflow.org/code/tensorflow/core/framework/types.proto)", dt))
 	}
 	for range shape {
 		ret = reflect.SliceOf(ret)
@@ -310,7 +313,7 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error {
 		if err := w.WriteByte(b); err != nil {
 			return err
 		}
-	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
 		if err := binary.Write(w, nativeEndian, v.Interface()); err != nil {
 			return err
 		}
@@ -325,6 +328,14 @@ func encodeTensor(w *bytes.Buffer, v reflect.Value, shape []int64) error {
 			}
 		}
 
+		// Optimisation: if only one dimension is left we can use binary.Write() directly for this slice
+		if len(shape) == 1 && v.Len() > 0 {
+			switch v.Index(0).Kind() {
+			case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+				return binary.Write(w, nativeEndian, v.Interface())
+			}
+		}
+
 		subShape := shape[1:]
 		for i := 0; i < v.Len(); i++ {
 			err := encodeTensor(w, v.Index(i), subShape)
@@ -349,7 +360,7 @@ func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.
 			return err
 		}
 		ptr.Elem().SetBool(b == 1)
-	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+	case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
 		if err := binary.Read(r, nativeEndian, ptr.Interface()); err != nil {
 			return err
 		}
@@ -357,6 +368,15 @@ func decodeTensor(r *bytes.Reader, shape []int64, typ reflect.Type, ptr reflect.
 	case reflect.Slice:
 		val := reflect.Indirect(ptr)
 		val.Set(reflect.MakeSlice(typ, int(shape[0]), int(shape[0])))
+
+		// Optimization: if only one dimension is left we can use binary.Read() directly for this slice
+		if len(shape) == 1 && val.Len() > 0 {
+			switch val.Index(0).Kind() {
+			case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128:
+				return binary.Read(r, nativeEndian, val.Interface())
+			}
+		}
+
 		for i := 0; i < val.Len(); i++ {
 			if err := decodeTensor(r, shape[1:], typ.Elem(), val.Index(i).Addr()); err != nil {
 				return err
diff --git a/tensorflow/go/tensor_test.go b/tensorflow/go/tensor_test.go
index 35bd2fd9a54a95d06f6d6c411aa74de9ebb9ea7a..793c36dd4db28fc5fdb713095c6d1d6713367a7a 100644
--- a/tensorflow/go/tensor_test.go
+++ b/tensorflow/go/tensor_test.go
@@ -34,11 +34,15 @@ func TestNewTensor(t *testing.T) {
 		{nil, int64(5)},
 		{nil, uint8(5)},
 		{nil, uint16(5)},
+		{nil, uint32(5)},
+		{nil, uint64(5)},
 		{nil, float32(5)},
 		{nil, float64(5)},
 		{nil, complex(float32(5), float32(6))},
 		{nil, complex(float64(5), float64(6))},
 		{nil, "a string"},
+		{[]int64{1}, []uint32{1}},
+		{[]int64{1}, []uint64{1}},
 		{[]int64{2}, []bool{true, false}},
 		{[]int64{1}, []float64{1}},
 		{[]int64{1}, [1]float64{1}},
@@ -71,11 +75,6 @@ func TestNewTensor(t *testing.T) {
 		// native ints not supported
 		int(5),
 		[]int{5},
-		// uint32 and uint64 are not supported in TensorFlow
-		uint32(5),
-		[]uint32{5},
-		uint64(5),
-		[]uint64{5},
 		// Mismatched dimensions
 		[][]float32{{1, 2, 3}, {4}},
 		// Mismatched dimensions. Should return "mismatched slice lengths" error instead of "BUG"
@@ -244,3 +243,23 @@ func BenchmarkNewTensor(b *testing.B) {
 	)
 	b.Run("[150528]", func(b *testing.B) { benchmarkNewTensor(b, vector) })
 }
+
+func benchmarkDecodeTensor(b *testing.B, t *Tensor) {
+	for i := 0; i < b.N; i++ {
+		_ = t.Value()
+	}
+}
+
+func BenchmarkDecodeTensor(b *testing.B) {
+	var (
+		// Some sample sizes from the Inception image labeling model.
+		// Where input tensors correspond to a 224x224 RGB image
+		// flattened into a vector.
+		vector [224 * 224 * 3]int32
+	)
+	t, err := NewTensor(vector)
+	if err != nil {
+		b.Fatalf("(%v, %v)", t, err)
+	}
+	b.Run("[150528]", func(b *testing.B) { benchmarkDecodeTensor(b, t) })
+}
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index 5bd5b9a388ff334fe78d5f148ca0fc8176378bb2..ab7f60d03dfd0423fae383f8a5213f6ffbbbdf2d 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -17,6 +17,7 @@ XLINT_OPTS = [
     "-Xlint:all",
     "-Xlint:-serial",
     "-Xlint:-try",
+    "-Xlint:-classfile", # see b/32750402, go/javac-warnings#classfile
 ]
 
 # The bazel errorprone plugin currently only enables default errorChecks
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 371457087616b43ae81eb3b72b46b189f2ff5c19..d365c39ef4a5b10f45f6045567082724510fab54 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0-rc1</version>
+    <version>1.4.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 9f7eb402530bf57fa8745b47696920b51d9fe5e5..0111fc62a4d6bfb27e51fd40778edf37f8c2e501 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0-rc1</version>
+    <version>1.4.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index fac0a8bc260f1a502d905f743225a4963209dbaa..06042216b4612e4a55f712b8f941b53c2bdf1daf 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.4.0-rc1</version>
+  <version>1.4.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 135ee0f2d2a3954fdf5789545093c209ba2fad0c..2c9d76b563377c3fc4ecede0460ef4e53e27b417 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0-rc1</version>
+    <version>1.4.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 771482ba641167484482cbbcac83f22e56ff728d..474a9adb9ae6cbedcc8f67abb0431710f2ecbef9 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.4.0-rc1</version>
+    <version>1.4.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/perl/tftypes-runall.pl b/tensorflow/java/src/gen/perl/tftypes-runall.pl
index a451ce92aa272ece591e71f85d08bd08acd6430e..65fe3b150667e3a5ed73e6bbd7e9da74157b72d8 100644
--- a/tensorflow/java/src/gen/perl/tftypes-runall.pl
+++ b/tensorflow/java/src/gen/perl/tftypes-runall.pl
@@ -1,13 +1,13 @@
 #!/usr/bin/perl
 #
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/tensorflow/java/src/gen/perl/tftypes.pl b/tensorflow/java/src/gen/perl/tftypes.pl
index 115723ac8a8553966dc0906031c1962007ee6a82..c7c62e916f4860aa16503ae098eed9e90e9150e4 100644
--- a/tensorflow/java/src/gen/perl/tftypes.pl
+++ b/tensorflow/java/src/gen/perl/tftypes.pl
@@ -1,13 +1,13 @@
 #!/usr/bin/perl
 #
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -156,7 +156,7 @@ for (my $i = 1; $i <= $#info; $i++) {
                            ."   *  String elements are sequences of bytes from the last array dimension.\n";
             }
 
-    
+
             my $intro = ($trank > 0)
                 ?  "Creates a rank-$trank tensor of {\@code $jtype} elements."
                 :  "Creates a scalar tensor containing a single {\@code $jtype} element.";
diff --git a/tensorflow/java/src/gen/resources/Tensors.java.tmpl b/tensorflow/java/src/gen/resources/Tensors.java.tmpl
index 98e15885594ed4dd06201a7252817cb66d8bc0ba..e615524c8e59f056b1aeac322afdff1739cd90bf 100644
--- a/tensorflow/java/src/gen/resources/Tensors.java.tmpl
+++ b/tensorflow/java/src/gen/resources/Tensors.java.tmpl
@@ -11,7 +11,7 @@ public final class Tensors {
   private Tensors() {}
 
   /** Creates a scalar String tensor using the default, UTF-8 encoding.
-   * 
+   *
    *  @param data  The string to put into the new scalar tensor.
    */
   public static Tensor<String> create(String data) {
@@ -19,7 +19,7 @@ public final class Tensors {
   }
 
   /** Creates a scalar String tensor using a specified encoding.
-   * 
+   *
    *  @param charset The encoding from String to bytes.
    *  @param data    The string to put into the new scalar tensor.
    */
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Input.java b/tensorflow/java/src/main/java/org/tensorflow/Input.java
deleted file mode 100644
index 13bc463e7d6a991858332a353681b24fff417547..0000000000000000000000000000000000000000
--- a/tensorflow/java/src/main/java/org/tensorflow/Input.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-package org.tensorflow;
-
-/**
- * Interface implemented by operands of a TensorFlow operation.
- *
- * <p>Example usage:
- *
- * <pre>{@code
- * // The "decodeJpeg" operation can be used as input to the "cast" operation
- * Input decodeJpeg = ops.image().decodeJpeg(...);
- * ops.math().cast(decodeJpeg, DataType.FLOAT);
- *
- * // The output "y" of the "unique" operation can be used as input to the "cast" operation
- * Output y = ops.array().unique(...).y();
- * ops.math().cast(y, DataType.FLOAT);
- *
- * // The "split" operation can be used as input list to the "concat" operation
- * Iterable<? extends Input> split = ops.array().split(...);
- * ops.array().concat(0, split);
- * }</pre>
- */
-public interface Input<T> {
-
-  /**
-   * Returns the symbolic handle of a tensor.
-   *
-   * <p>Inputs to TensorFlow operations are outputs of another TensorFlow operation. This method is
-   * used to obtain a symbolic handle that represents the computation of the input.
-   *
-   * @see OperationBuilder#addInput(Output)
-   */
-  Output<T> asOutput();
-}
diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
index 2b431eebf5f3c66a9924ca28d221ddf3574eff75..499757e8cf4d6166e425d801ce20335bd8ad83e8 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java
@@ -43,7 +43,6 @@ final class NativeLibrary {
   private static final boolean DEBUG =
       System.getProperty("org.tensorflow.NativeLibrary.DEBUG") != null;
   private static final String JNI_LIBNAME = "tensorflow_jni";
-  private static final String FRAMEWORK_LIBNAME = "tensorflow_framework";
 
   public static void load() {
     if (isLoaded() || tryLoadLibrary()) {
@@ -59,12 +58,15 @@ final class NativeLibrary {
     }
     // Native code is not present, perhaps it has been packaged into the .jar file containing this.
     // Extract the JNI library itself
-    final String jniResourceName = makeResourceName(JNI_LIBNAME);
+    final String jniLibName = System.mapLibraryName(JNI_LIBNAME);
+    final String jniResourceName = makeResourceName(jniLibName);
     log("jniResourceName: " + jniResourceName);
     final InputStream jniResource =
         NativeLibrary.class.getClassLoader().getResourceAsStream(jniResourceName);
     // Extract the JNI's dependency
-    final String frameworkResourceName = makeResourceName(FRAMEWORK_LIBNAME);
+    final String frameworkLibName =
+        maybeAdjustForMacOS(System.mapLibraryName("tensorflow_framework"));
+    final String frameworkResourceName = makeResourceName(frameworkLibName);
     log("frameworkResourceName: " + frameworkResourceName);
     final InputStream frameworkResource =
         NativeLibrary.class.getClassLoader().getResourceAsStream(frameworkResourceName);
@@ -88,12 +90,15 @@ final class NativeLibrary {
       tempPath.deleteOnExit();
       final String tempDirectory = tempPath.toString();
       if (frameworkResource != null) {
-        extractResource(frameworkResource, FRAMEWORK_LIBNAME, tempDirectory);
+        extractResource(frameworkResource, frameworkLibName, tempDirectory);
       } else {
-        log(frameworkResourceName + " not found. This is fine assuming " + jniResourceName
-            + " is not built to depend on it.");
+        log(
+            frameworkResourceName
+                + " not found. This is fine assuming "
+                + jniResourceName
+                + " is not built to depend on it.");
       }
-      System.load(extractResource(jniResource, JNI_LIBNAME, tempDirectory));
+      System.load(extractResource(jniResource, jniLibName, tempDirectory));
     } catch (IOException e) {
       throw new UnsatisfiedLinkError(
           String.format(
@@ -121,9 +126,27 @@ final class NativeLibrary {
     }
   }
 
+  private static String maybeAdjustForMacOS(String libFilename) {
+    if (!System.getProperty("os.name").contains("OS X")) {
+      return libFilename;
+    }
+    // This is macOS, and the TensorFlow release process might have setup dependencies on
+    // libtensorflow_framework.so instead of libtensorflow_framework.dylib. Adjust for that.
+    final ClassLoader cl = NativeLibrary.class.getClassLoader();
+    if (cl.getResource(makeResourceName(libFilename)) != null) {
+      return libFilename;
+    }
+    // liftensorflow_framework.dylib not found, try libtensorflow_framework.so
+    final String suffix = ".dylib";
+    if (!libFilename.endsWith(suffix)) {
+      return libFilename;
+    }
+    return libFilename.substring(0, libFilename.length() - suffix.length()) + ".so";
+  }
+
   private static String extractResource(
       InputStream resource, String resourceName, String extractToDirectory) throws IOException {
-    final File dst = new File(extractToDirectory, System.mapLibraryName(resourceName));
+    final File dst = new File(extractToDirectory, resourceName);
     dst.deleteOnExit();
     final String dstPath = dst.toString();
     log("extracting native library to: " + dstPath);
@@ -157,9 +180,7 @@ final class NativeLibrary {
   }
 
   private static String makeResourceName(String baseName) {
-    return "org/tensorflow/native/"
-        + String.format("%s-%s/", os(), architecture())
-        + System.mapLibraryName(baseName);
+    return "org/tensorflow/native/" + String.format("%s-%s/", os(), architecture()) + baseName;
   }
 
   private static long copy(InputStream src, File dstFile) throws IOException {
diff --git a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
index 9a1b7592b38dde469c0ac48f35614545c4af2729..a24150484e83dcccf3e1869155569431969b74cf 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/OperationBuilder.java
@@ -265,6 +265,36 @@ public final class OperationBuilder {
     return this;
   }
 
+  public OperationBuilder setAttr(String name, Shape[] value) {
+    int[] numDimensions = new int[value.length];
+    int totalNumDimensions = 0;
+    for (int idx = 0; idx < value.length; ++idx) {
+      int n = value[idx].numDimensions();
+      numDimensions[idx] = n;
+      if (n > 0) {
+        totalNumDimensions += n;
+      }
+    }
+    // Flatten the shapes into a single array to avoid too much overhead in the
+    // native part
+    long[] shapes = new long[totalNumDimensions];
+    int shapeIdx = 0;
+    for (Shape shape : value) {
+      if (shape.numDimensions() > 0) {
+        for (long dim : shape.asArray()) {
+          shapes[shapeIdx++] = dim;
+        }
+      }
+    }
+    Graph.Reference r = graph.ref();
+    try {
+      setAttrShapeList(unsafeNativeHandle, name, shapes, numDimensions);
+    } finally {
+      r.close();
+    }
+    return this;
+  }
+
   public OperationBuilder setAttr(String name, String[] value) {
     Charset utf8 = Charset.forName("UTF-8");
     Object[] objects = new Object[value.length];
@@ -297,8 +327,6 @@ public final class OperationBuilder {
 
   // The names of all the setAttr* family functions below correspond to the C library types, not the
   // Java library types. Roughly, setAttrFoo calls the TensorFlow C library function: TF_SetAttrFoo.
-  // TODO(ashankar):
-  // - setAttrShapeList: Which would take in a long[][]
 
   private static native void setAttrString(long handle, String name, byte[] value);
 
@@ -324,5 +352,8 @@ public final class OperationBuilder {
 
   private static native void setAttrShape(long handle, String name, long[] shape, int numDims);
 
+  private static native void setAttrShapeList(
+      long handle, String name, long[] shapes, int[] numDims);
+
   private static native void setAttrStringList(long handle, String name, Object[] value);
 }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Shape.java b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
index 9aa92be111c09bfb687822c20264afe07266e356..d533c3d480f0d70e36e2f1a1bcb2f5bdc61e4861 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Shape.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Shape.java
@@ -77,6 +77,24 @@ public final class Shape {
     return shape[i];
   }
 
+  @Override
+  public int hashCode() {
+    return Arrays.hashCode(shape);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof Shape && Arrays.equals(this.shape, ((Shape) obj).shape)) {
+      return !hasUnknownDimension();
+    }
+
+    return super.equals(obj);
+  }
+
   /** Succinct description of the shape meant for debugging. */
   @Override
   public String toString() {
@@ -98,4 +116,18 @@ public final class Shape {
   }
 
   private long[] shape;
+
+  private boolean hasUnknownDimension() {
+    if (shape == null) {
+      return true;
+    }
+
+    for (long dimension : shape) {
+      if (dimension == -1) {
+        return true;
+      }
+    }
+
+    return false;
+  }
 }
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.cc b/tensorflow/java/src/main/native/operation_builder_jni.cc
index e03be7b1103d5507310c3423e537b6809083e6c3..55d214a7c4b81a01e48121214e91397626652f11 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.cc
+++ b/tensorflow/java/src/main/native/operation_builder_jni.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/java/src/main/native/operation_builder_jni.h"
 
+#include <cstring>
 #include <memory>
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/java/src/main/native/exception_jni.h"
@@ -262,6 +263,41 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShape(
   env->ReleaseStringUTFChars(name, cname);
 }
 
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShapeList(
+    JNIEnv* env, jclass clazz, jlong handle, jstring name, jlongArray shapes,
+    jintArray num_dims) {
+  TF_OperationDescription* d = requireHandle(env, handle);
+  if (d == nullptr) return;
+  std::unique_ptr<int64_t[]> cshapes;
+  std::unique_ptr<int64_t* []> cdims;
+  std::unique_ptr<int[]> cnum_dims;
+  const int num_dims_length = env->GetArrayLength(num_dims);
+  if (num_dims_length > 0) {
+    const int shapes_length = env->GetArrayLength(shapes);
+    cshapes.reset(new int64_t[shapes_length]);
+    cdims.reset(new int64_t*[num_dims_length]);
+    cnum_dims.reset(new int[num_dims_length]);
+    jlong* shapes_elems =
+        static_cast<jlong*>(env->GetPrimitiveArrayCritical(shapes, nullptr));
+    std::memcpy(cshapes.get(), shapes_elems, shapes_length << 3);
+    env->ReleasePrimitiveArrayCritical(shapes, shapes_elems, JNI_ABORT);
+    int64_t* cshapes_ptr = cshapes.get();
+    jint* num_dims_elems =
+        static_cast<jint*>(env->GetPrimitiveArrayCritical(num_dims, nullptr));
+    for (int i = 0; i < num_dims_length; ++i) {
+      cnum_dims[i] = static_cast<int>(num_dims_elems[i]);
+      cdims[i] = cshapes_ptr;
+      if (cnum_dims[i] > 0) {
+        cshapes_ptr += cnum_dims[i];
+      }
+    }
+    env->ReleasePrimitiveArrayCritical(num_dims, num_dims_elems, JNI_ABORT);
+  }
+  const char* cname = env->GetStringUTFChars(name, nullptr);
+  TF_SetAttrShapeList(d, cname, cdims.get(), cnum_dims.get(), num_dims_length);
+  env->ReleaseStringUTFChars(name, cname);
+}
+
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrStringList(
     JNIEnv* env, jclass object, jlong handle, jstring name,
     jobjectArray values) {
diff --git a/tensorflow/java/src/main/native/operation_builder_jni.h b/tensorflow/java/src/main/native/operation_builder_jni.h
index 2e72bd68da5ad5915ba8268971a2f96961a45972..cf0abe4829b8c559d029f8c59108027a4dad4648 100644
--- a/tensorflow/java/src/main/native/operation_builder_jni.h
+++ b/tensorflow/java/src/main/native/operation_builder_jni.h
@@ -169,6 +169,14 @@ JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrTensorList(
 JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShape(
     JNIEnv *, jclass, jlong, jstring, jlongArray, jint);
 
+/*
+ * Class:     org_tensorflow_OperationBuilder
+ * Method:    setAttrShapeList
+ * Signature: (JLjava/lang/String;[J[I)V
+ */
+JNIEXPORT void JNICALL Java_org_tensorflow_OperationBuilder_setAttrShapeList(
+    JNIEnv *, jclass, jlong, jstring, jlongArray, jintArray);
+
 /*
  * Class:     org_tensorflow_OperationBuilder
  * Method:    setAttrStringList
diff --git a/tensorflow/java/src/main/native/session_jni.cc b/tensorflow/java/src/main/native/session_jni.cc
index e26367ea00423c42fde348051a5d736852bfc137..2cd542d3c9be536a42037e9ef533ed629dd3ac9f 100644
--- a/tensorflow/java/src/main/native/session_jni.cc
+++ b/tensorflow/java/src/main/native/session_jni.cc
@@ -223,9 +223,8 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run(
   jbyteArray ret = nullptr;
   if (run_metadata != nullptr) {
     ret = env->NewByteArray(run_metadata->length);
-    jbyte* elems = env->GetByteArrayElements(ret, nullptr);
-    memcpy(elems, run_metadata->data, run_metadata->length);
-    env->ReleaseByteArrayElements(ret, elems, JNI_COMMIT);
+    env->SetByteArrayRegion(ret, 0, run_metadata->length,
+                            reinterpret_cast<const jbyte*>(run_metadata->data));
   }
   TF_DeleteStatus(status);
   return ret;
diff --git a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
index 6dc233987bb035d280766c44d75f3d4b920c40ef..0a4a8cf4e3f65311ba887b4d47bc79080bfd5382 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/OperationBuilderTest.java
@@ -148,6 +148,19 @@ public class OperationBuilderTest {
     }
   }
 
+  @Test
+  public void setAttrShapeList() {
+    // Those shapes match tensors ones, so no exception is thrown
+    testSetAttrShapeList(new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2)});
+    try {
+      // Those shapes do not match tensors ones, exception is thrown
+      testSetAttrShapeList(new Shape[] {Shape.make(2, 2), Shape.make(2, 2, 2, 2)});
+      fail("Shapes are incompatible and an exception was expected");
+    } catch (IllegalArgumentException e) {
+      // expected
+    }
+  }
+
   @Test
   public void addControlInput() {
     try (Graph g = new Graph();
@@ -175,6 +188,30 @@ public class OperationBuilderTest {
     }
   }
 
+  private static void testSetAttrShapeList(Shape[] shapes) {
+    try (Graph g = new Graph();
+        Session s = new Session(g)) {
+      int[][] matrix = new int[][] {{0, 0}, {0, 0}};
+      Output<?> queue =
+          g.opBuilder("FIFOQueue", "queue")
+              .setAttr("component_types", new DataType[] {DataType.INT32, DataType.INT32})
+              .setAttr("shapes", shapes)
+              .build()
+              .output(0);
+      assertTrue(hasNode(g, "queue"));
+      Output<Integer> c1 = TestUtil.constant(g, "const1", matrix);
+      Output<Integer> c2 = TestUtil.constant(g, "const2", new int[][][] {matrix, matrix});
+      Operation enqueue =
+          g.opBuilder("QueueEnqueue", "enqueue")
+              .addInput(queue)
+              .addInputList(new Output<?>[] {c1, c2})
+              .build();
+      assertTrue(hasNode(g, "enqueue"));
+
+      s.runner().addTarget(enqueue).run();
+    }
+  }
+
   private static boolean hasNode(Graph g, String name) {
     return g.operation(name) != null;
   }
diff --git a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
index 3b027700c5dad12a844553246274f083ec840822..313c09e1e40a9bf4e79933ff2a9ca1d3ce58473e 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/ShapeTest.java
@@ -16,6 +16,7 @@ limitations under the License.
 package org.tensorflow;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -77,4 +78,27 @@ public class ShapeTest {
       assertEquals(5, n.shape().size(1));
     }
   }
+
+  @Test
+  public void equalsWorksCorrectly() {
+    assertEquals(Shape.scalar(), Shape.scalar());
+    assertEquals(Shape.make(1, 2, 3), Shape.make(1, 2, 3));
+
+    assertNotEquals(Shape.make(1, 2), null);
+    assertNotEquals(Shape.make(1, 2), new Object());
+    assertNotEquals(Shape.make(1, 2, 3), Shape.make(1, 2, 4));
+
+    assertNotEquals(Shape.unknown(), Shape.unknown());
+    assertNotEquals(Shape.make(-1), Shape.make(-1));
+    assertNotEquals(Shape.make(1, -1, 3), Shape.make(1, -1, 3));
+  }
+
+  @Test
+  public void hashCodeIsAsExpected() {
+    assertEquals(Shape.make(1, 2, 3, 4).hashCode(), Shape.make(1, 2, 3, 4).hashCode());
+    assertEquals(Shape.scalar().hashCode(), Shape.scalar().hashCode());
+    assertEquals(Shape.unknown().hashCode(), Shape.unknown().hashCode());
+
+    assertNotEquals(Shape.make(1, 2).hashCode(), Shape.make(1, 3).hashCode());
+  }
 }
diff --git a/tensorflow/leakr_file_type_recipe.ftrcp b/tensorflow/leakr_file_type_recipe.ftrcp
deleted file mode 100644
index 0521a084c7663a07c9a6e8cf72567497a9517345..0000000000000000000000000000000000000000
--- a/tensorflow/leakr_file_type_recipe.ftrcp
+++ /dev/null
@@ -1,30 +0,0 @@
-name: "TensorFlow filetype recipes"
-desc: "Copybara leakr checks, used by copy.bara.sky."
-
-file_config:{
-  name: "Image labels text file skip"
-  desc: "Generic text files."
-  pattern: ".*labels.txt"
-  compression: COMPRESSION_NONE
-  scan_mode: SCAN_SKIP
-  file_group: FG_PLAIN_TEXT_GENERIC
-}
-
-file_config:{
-  name: "[Mediafiles] Graphics"
-  desc: "All media files that are images, graphics and icons."
-  ext: "bmp"
-  ext: "gif"
-  ext: "icns"
-  ext: "ico"
-  ext: "jpeg"
-  ext: "jpg"
-  ext: "png"
-  ext: "svg"
-  ext: "tga"
-  ext: "tiff"
-  ext: "webp"
-  compression: COMPRESSION_NONE
-  scan_mode: SCAN_SKIP
-  file_group: FG_MEDIA_GRAPHICS
-}
\ No newline at end of file
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 02e88f4888f7813162894c24770b932b49cd454a..3566a36dddf3b273cf3623454ffd81203006ac93 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -5,7 +5,10 @@ package(
     default_visibility = [
         "//engedu/ml/tf_from_scratch:__pkg__",
         "//tensorflow:internal",
+        "//tensorflow/contrib/lite/toco/python:__pkg__",
         "//tensorflow_models:__subpackages__",
+        # TODO(aselle): to pass open source test.
+        "//bazel_pip/tensorflow/contrib/lite/toco/python:__pkg__",
     ],
 )
 
@@ -45,6 +48,7 @@ py_library(
         "//tensorflow/compiler/aot/tests:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/contrib/learn:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/contrib/learn/python/learn/datasets:__pkg__",  # TODO(b/34059704): remove when fixed
+        "//tensorflow/contrib/lite/toco/python:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python/debug:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python/tools:__pkg__",  # TODO(b/34059704): remove when fixed
         "//tensorflow/tools/api/generator:__pkg__",
@@ -130,6 +134,7 @@ py_library(
         ":pywrap_tensorflow",
         ":util",
         "//tensorflow/core:protos_all_py",
+        "@absl_py//absl/flags",
         "@six_archive//:six",
     ],
 )
@@ -174,10 +179,7 @@ tf_py_test(
     size = "small",
     srcs = ["platform/app_test.py"],
     additional_deps = [":platform"],
-    tags = [
-        "manual",
-        "notap",
-    ],
+    tags = ["notap"],
 )
 
 cc_library(
@@ -222,11 +224,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "bfloat16_lib",
+    srcs = ["lib/core/bfloat16.cc"],
+    hdrs = ["lib/core/bfloat16.h"],
+    deps = [
+        ":numpy_lib",
+        ":safe_ptr",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "ndarray_tensor_bridge",
     srcs = ["lib/core/ndarray_tensor_bridge.cc"],
     hdrs = ["lib/core/ndarray_tensor_bridge.h"],
     deps = [
+        ":bfloat16_lib",
         ":numpy_lib",
         "//tensorflow/c:c_api",
         "//tensorflow/core:lib",
@@ -263,10 +279,15 @@ cc_library(
     deps = [
         ":ndarray_tensor_bridge",
         ":numpy_lib",
+        ":py_util",
+        ":safe_ptr",
+        "//tensorflow/c:tf_status_helper",
+        "//tensorflow/c/eager:c_api",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:script_ops_op_lib",
+        "//tensorflow/python/eager:pywrap_tfe_lib",
         "//third_party/py/numpy:headers",
         "//util/python:python_headers",
     ],
@@ -288,6 +309,7 @@ cc_library(
     srcs = ["lib/core/ndarray_tensor.cc"],
     hdrs = ["lib/core/ndarray_tensor.h"],
     deps = [
+        ":bfloat16_lib",
         ":ndarray_tensor_bridge",
         ":numpy_lib",
         ":safe_ptr",
@@ -304,6 +326,7 @@ cc_library(
     hdrs = ["lib/core/py_seq_tensor.h"],
     deps = [
         ":numpy_lib",
+        ":py_util",
         ":safe_ptr",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -311,6 +334,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_util",
+    srcs = ["lib/core/py_util.cc"],
+    hdrs = ["lib/core/py_util.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:script_ops_op_lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "py_record_reader_lib",
     srcs = ["lib/io/py_record_reader.cc"],
@@ -443,6 +477,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:op_gen_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/python/eager:python_eager_op_gen",
     ],
@@ -580,6 +615,7 @@ py_library(
     srcs = ["framework/dtypes.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":pywrap_tensorflow",
         "//tensorflow/core:protos_all_py",
     ],
 )
@@ -670,6 +706,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":c_api_util",
+        ":control_flow_util",
         ":device",
         ":dtypes",
         ":op_def_registry",
@@ -1252,7 +1289,10 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "control_flow_ops_gen",
-    visibility = ["//learning/brain/python/ops:__pkg__"],
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/python/kernel_tests:__pkg__",
+    ],
     deps = [
         "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:no_op_op_lib",
@@ -1520,6 +1560,7 @@ py_library(
     deps = [
         ":control_flow_ops",
         ":control_flow_ops_gen",
+        ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
@@ -1536,6 +1577,7 @@ py_library(
         ":array_ops_gen",
         ":constant_op",
         ":control_flow_ops_gen",
+        ":control_flow_util",
         ":data_flow_ops_gen",
         ":dtypes",
         ":framework_ops",
@@ -1544,12 +1586,22 @@ py_library(
         ":platform",
         ":sparse_tensor",
         ":tensor_array_ops",
+        ":tf_should_use",
         ":util",
         "//tensorflow/core:protos_all_py",
         "@six_archive//:six",
     ],
 )
 
+py_library(
+    name = "control_flow_util",
+    srcs = ["ops/control_flow_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":platform",
+    ],
+)
+
 py_library(
     name = "ctc_ops",
     srcs = ["ops/ctc_ops.py"],
@@ -1623,6 +1675,7 @@ py_library(
         ":bitwise_ops",
         ":control_flow_grad",
         ":control_flow_ops",
+        ":control_flow_util",
         ":framework",
         ":framework_for_generated_wrappers",
         ":functional_ops",
@@ -1833,6 +1886,7 @@ py_library(
         ":control_flow_ops",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        ":tf_should_use",
     ],
 )
 
@@ -2307,7 +2361,7 @@ py_library(
         ":math_ops",
         ":tensor_shape",
         ":tensor_util",
-        ":util",
+        ":tf_should_use",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -2344,6 +2398,7 @@ py_library(
         ":math_ops",
         ":state_ops",
         ":tensor_shape",
+        ":tf_should_use",
         ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
@@ -2572,7 +2627,7 @@ cuda_py_test(
         ":nn_grad",
         "//third_party/py/numpy",
     ],
-    shard_count = 4,
+    shard_count = 16,
 )
 
 cuda_py_test(
@@ -2727,6 +2782,7 @@ py_library(
         ["util/**/*.py"],
         exclude = [
             "util/example_parser*",
+            "util/tf_should_use.py",
             "util/**/*_test.py",
         ],
     ),
@@ -2789,6 +2845,17 @@ py_test(
     ],
 )
 
+py_library(
+    name = "tf_should_use",
+    srcs = ["util/tf_should_use.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":util",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "tf_should_use_test",
     size = "small",
@@ -2796,7 +2863,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":client_testlib",
-        ":util",
+        ":tf_should_use",
     ],
 )
 
@@ -2986,6 +3053,7 @@ tf_py_wrap_cc(
         "grappler/item.i",
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
+        "lib/core/bfloat16.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -3004,6 +3072,7 @@ tf_py_wrap_cc(
         "util/util.i",
     ],
     deps = [
+        ":bfloat16_lib",
         ":cost_analyzer_lib",
         ":model_analyzer_lib",
         ":cpp_python_util",
@@ -3025,7 +3094,9 @@ tf_py_wrap_cc(
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/clusters:single_machine",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/costs:graph_memory",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core:lib",
@@ -3076,130 +3147,124 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_test",
     size = "small",
     srcs = ["training/server_lib_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_multiple_containers_test",
     size = "small",
     srcs = ["training/server_lib_multiple_containers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_clear_container_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_clear_container_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_clear_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_clear_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_same_variables_no_clear_test",
     size = "small",
     srcs = ["training/server_lib_same_variables_no_clear_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
-py_test(
+tf_py_test(
     name = "server_lib_sparse_job_test",
     size = "small",
     srcs = ["training/server_lib_sparse_job_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":data_flow_ops",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
 )
 
 cuda_py_test(
@@ -3219,6 +3284,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Test flaky due to port collisions.
         "oss_serial",
@@ -3237,6 +3303,7 @@ tf_py_test(
         ":training",
         ":variables",
     ],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Test flaky due to port collisions.
         "notsan",  # data race due to b/62910646
@@ -3267,17 +3334,11 @@ tf_cuda_library(
     alwayslink = 1,
 )
 
-py_test(
+tf_py_test(
     name = "session_test",
     size = "small",
     srcs = ["client/session_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":control_flow_ops",
@@ -3295,20 +3356,19 @@ py_test(
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",  # testInteractivePlacePrunedGraph fails on invalid assumption about GPU ops.
+        "no_windows",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_clusterspec_prop_test",
     size = "small",
     srcs = ["client/session_clusterspec_prop_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_oss",
-        "no_pip_gpu",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
@@ -3323,36 +3383,40 @@ py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_oss",
+        "no_pip",
+        "no_pip_gpu",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_list_devices_test",
     size = "small",
     srcs = ["client/session_list_devices_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-        "no_pip_gpu",
-        "notsan",  # data race due to b/62910646
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":framework",
         ":framework_test_lib",
         ":platform_test",
         ":training",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_pip_gpu",
+        "notsan",  # data race due to b/62910646
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "session_partial_run_test",
     size = "small",
     srcs = ["client/session_partial_run_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_gpu",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":errors",
@@ -3365,6 +3429,11 @@ py_test(
         ":util",
         "@six_archive//:six",
     ],
+    grpc_enabled = True,
+    tags = [
+        "no_gpu",
+        "no_windows",
+    ],
 )
 
 cuda_py_test(
@@ -3397,6 +3466,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["lib/core/bfloat16_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":lib",
+        ":pywrap_tensorflow",
+    ],
+)
+
 py_test(
     name = "file_io_test",
     size = "small",
@@ -3550,7 +3631,9 @@ cuda_py_test(
         "//third_party/py/numpy",
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
     ],
+    tags = ["multi_gpu"],
 )
 
 py_test(
@@ -3607,20 +3690,18 @@ cuda_py_test(
         ":training",
         ":variables",
     ],
+    grpc_enabled = True,
     main = "training/session_manager_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "supervisor_test",
     size = "small",
     srcs = ["training/supervisor_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":errors",
-        ":extra_py_tests_deps",
         ":framework",
         ":framework_for_generated_wrappers",
         ":io_ops",
@@ -3631,6 +3712,8 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
+    grpc_enabled = True,
+    tags = ["no_windows"],
 )
 
 py_test(
@@ -3638,7 +3721,10 @@ py_test(
     size = "small",
     srcs = ["training/basic_session_run_hooks_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+        "notsan",  # intermittent races on a few percent of runs
+    ],
     deps = [
         ":client",
         ":client_testlib",
@@ -3833,15 +3919,15 @@ py_library(
     deps = [
         ":array_ops",
         ":control_flow_ops",
-        ":framework",
         ":framework_for_generated_wrappers",
-        ":init_ops",
+        ":platform",
+        ":tensor_util",
         ":util",
         ":variable_scope",
         ":variables",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
         "//third_party/py/numpy",
-        "@six_archive//:six",
     ],
 )
 
@@ -3852,12 +3938,14 @@ py_library(
         "layers/core.py",
         "layers/layers.py",
         "layers/maxout.py",
+        "layers/network.py",
         "layers/normalization.py",
         "layers/pooling.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
+        ":array_ops_gen",
         ":control_flow_ops",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -3865,12 +3953,18 @@ py_library(
         ":layers_base",
         ":math_ops",
         ":nn",
+        ":nn_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":resource_variable_ops_gen",
         ":standard_ops",
+        ":state_ops",
         ":training",
         ":util",
         ":variable_scope",
         ":variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/estimator:util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3883,14 +3977,36 @@ py_test(
     main = "layers/base_test.py",
     srcs_version = "PY2AND3",
     deps = [
+        ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":init_ops",
         ":layers",
+        ":layers_base",
         ":math_ops",
         ":random_ops",
         ":variable_scope",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_test(
+    name = "layers_network_test",
+    size = "small",
+    srcs = ["layers/network_test.py"],
+    main = "layers/network_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":layers",
+        ":layers_base",
+        ":sparse_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -4211,6 +4327,7 @@ cuda_py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    grpc_enabled = True,
     main = "client/session_benchmark.py",
 )
 
@@ -4294,7 +4411,10 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [":pywrap_tensorflow_internal"],
+    deps = [
+        ":pywrap_tensorflow_internal",
+        ":tf_cluster",
+    ],
 )
 
 py_test(
@@ -4350,8 +4470,17 @@ cuda_py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":array_ops",
+        ":constant_op",
+        ":dtypes",
+        ":functional_ops",
+        ":layers",
+        ":math_ops",
         ":nn",
+        ":ops",
         ":random_ops",
+        ":tf_cluster",
+        ":tf_optimizer",
+        ":training",
         "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
     ],
@@ -4364,7 +4493,11 @@ py_library(
         "grappler/cost_analyzer.py",
     ],
     srcs_version = "PY2AND3",
-    deps = [":pywrap_tensorflow_internal"],
+    deps = [
+        ":pywrap_tensorflow_internal",
+        ":tf_cluster",
+        ":tf_item",
+    ],
 )
 
 py_binary(
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index 2d8625933f9ea4ab3bedf8d3157430d821f3e584..48b03fab0fd639768b3e8bcfcb38429c1e536ecc 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -27,4 +27,8 @@ def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
     deps=deps,
     require_shape_functions=require_shape_functions,
     generated_target_name=name,
+    api_def_srcs = [
+        "//tensorflow/core:base_api_def",
+        "//tensorflow/core:python_api_def",
+    ],
   )
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 759c36ad72e922671288b0d57fe9e442b915c144..1481a4d035cbc63aa655be6c4d441e6f6741e118 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -126,6 +126,12 @@ _REGISTERED_EXPANSIONS = [
      lambda feed: [feed])]
 # pylint: enable=g-long-lambda
 
+
+def _convert_to_numpy_obj(numpy_dtype, obj):
+  """Explicitly convert obj based on numpy type except for string type."""
+  return numpy_dtype(obj) if numpy_dtype is not object else str(obj)
+
+
 def register_session_run_conversion_functions(tensor_type, fetch_function,
     feed_function=None, feed_function_for_partial_run=None):
   """Register fetch and feed conversion functions for `tf.Session.run()`.
@@ -1072,12 +1078,14 @@ class BaseSession(SessionInterface):
                             'strings, lists, numpy ndarrays, or TensorHandles.')
 
           subfeed_dtype = subfeed_t.dtype.as_numpy_dtype
-          if isinstance(subfeed_val,
-                        int) and subfeed_dtype(subfeed_val) != subfeed_val:
+          if isinstance(subfeed_val, int) and _convert_to_numpy_obj(
+              subfeed_dtype, subfeed_val) != subfeed_val:
             raise TypeError(
-                'Type of feed value ' + str(subfeed_val) + ' is not'
-                ' compatible with Tensor type ' + str(subfeed_dtype) + '.'
-                ' Try explicitly setting the type of the feed tensor'
+                'Type of feed value ' + str(subfeed_val) + ' with type ' +
+                str(type(subfeed_val)) +
+                ' is not compatible with Tensor type ' +
+                str(subfeed_dtype) +
+                '. Try explicitly setting the type of the feed tensor'
                 ' to a larger type (e.g. int64).')
 
           is_tensor_handle_feed = isinstance(subfeed_val,
@@ -1160,9 +1168,6 @@ class BaseSession(SessionInterface):
       TypeError: If `fetches` or `feed_list` cannot be interpreted
         as arguments to @{tf.Session.run}.
     """
-    assert not self._created_with_new_api, ('session.make_callable() doesn\'t '
-                                            'work with C API')
-
     if feed_list is not None:
       if not isinstance(feed_list, (list, tuple)):
         raise TypeError('`feed_list` must be a list or tuple.')
@@ -1184,12 +1189,18 @@ class BaseSession(SessionInterface):
 
     # Create a fetch handler to take care of the structure of fetches.
     fetch_handler = _FetchHandler(self._graph, fetches, {})
-    fetch_list_as_strings = _name_list(fetch_handler.fetches())
-    target_list_as_strings = _name_list(fetch_handler.targets())
+    if self._created_with_new_api:
+      # pylint: disable=protected-access
+      fetch_list = [t._as_tf_output() for t in fetch_handler.fetches()]
+      target_list = [op._c_op for op in fetch_handler.targets()]
+      # pylint: enable=protected-access
+    else:
+      fetch_list = _name_list(fetch_handler.fetches())
+      target_list = _name_list(fetch_handler.targets())
 
     def _callable_template_with_options_and_metadata(
-        fetch_list_as_strings,
-        target_list_as_strings,
+        fetch_list,
+        target_list,
         fetch_handler,
         options=None,
         run_metadata=None):
@@ -1199,9 +1210,14 @@ class BaseSession(SessionInterface):
       run_metadata_ptr = tf_session.TF_NewBuffer() if run_metadata else None
       try:
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(
-              self._session, options_ptr, {}, fetch_list_as_strings,
-              target_list_as_strings, status, run_metadata_ptr)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, options_ptr, {}, fetch_list, target_list,
+                run_metadata_ptr, status)
+          else:
+            results = tf_session.TF_Run(
+                self._session, options_ptr, {}, fetch_list, target_list, status,
+                run_metadata_ptr)
           if fetch_handler:
             results = fetch_handler.build_results(self, results)
           else:
@@ -1218,27 +1234,35 @@ class BaseSession(SessionInterface):
 
     if accept_options:
       return functools.partial(
-          _callable_template_with_options_and_metadata, fetch_list_as_strings,
-          target_list_as_strings, fetch_handler)
+          _callable_template_with_options_and_metadata, fetch_list,
+          target_list, fetch_handler)
     elif isinstance(fetches, ops.Operation):
       # Special case for fetching a single operation, because the
       # function will have no return value.
-      assert not fetch_list_as_strings
-      assert len(target_list_as_strings) == 1
+      assert not fetch_list
+      assert len(target_list) == 1
       def _single_operation_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_Run(self._session, None, {}, [],
-                            target_list_as_strings, status, None)
+          if self._created_with_new_api:
+            tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, [], target_list, None, status)
+          else:
+            tf_session.TF_Run(
+                self._session, None, {}, [], target_list, status, None)
       return _single_operation_run
     elif isinstance(fetches, ops.Tensor):
       # Special case for fetching a single tensor, because the
       # function can return the result of `TF_Run()` directly.
-      assert len(fetch_list_as_strings) == 1
-      assert not target_list_as_strings
+      assert len(fetch_list) == 1
+      assert not target_list
       def _single_tensor_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(self._session, None, {},
-                                      fetch_list_as_strings, [], status, None)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, fetch_list, [], None, status)
+          else:
+            results = tf_session.TF_Run(
+                self._session, None, {}, fetch_list, [], status, None)
         return results[0]
       return _single_tensor_run
     else:
@@ -1246,9 +1270,12 @@ class BaseSession(SessionInterface):
       # results for us.
       def _fetch_handler_run():
         with errors.raise_exception_on_not_ok_status() as status:
-          results = tf_session.TF_Run(self._session, None, {},
-                                      fetch_list_as_strings,
-                                      target_list_as_strings, status, None)
+          if self._created_with_new_api:
+            results = tf_session.TF_SessionRun_wrapper(
+                self._session, None, {}, fetch_list, target_list, None, status)
+          else:
+            results = tf_session.TF_Run(
+                self._session, None, {}, fetch_list, target_list, status, None)
         return fetch_handler.build_results(self, results)
       return _fetch_handler_run
 
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index b77912b4f7469602e84d96af094727a8f51d48e6..c85b22eb156407fcb78302c43b9cb17b8f6b5e06 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -169,7 +170,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     # BaseRemoteRendezvous::SameWorkerRecvDone that means the test doesn't
     # actually capture the motivating bug unless run on a GPU machine.
     #
-    # Example error message (before bugfix -- linebreaks added because  lint):
+    # Example error message (before bugfix -- line breaks added because  lint):
     #
     # W0718 17:14:41.521534  190121 device_mgr.cc:107] Unknown device:
     #     /job:worker/replica:0/task:0/device:CPU:0 all devices:
@@ -415,6 +416,48 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
               node_stats.node_name.startswith('Const')
           ]), run_metadata)
 
+  def testClusterSpecPropagationIsolation(self):
+    """Test that two sessions using ClusterSpec propagation are isolated."""
+    server = server_lib.Server.create_local_server()
+    init_value = array_ops.placeholder(dtypes.int32, shape=[])
+    v = variables.Variable(init_value)
+
+    cluster_def = cluster_pb2.ClusterDef()
+    job = cluster_def.job.add()
+    job.name = 'worker'
+    job.tasks[0] = server.target[len('grpc://'):]
+    config = config_pb2.ConfigProto(cluster_def=cluster_def)
+
+    sess1 = session.Session(server.target, config=config)
+    sess2 = session.Session(server.target, config=config)
+
+    # Initially, the variable is uninitialized in both sessions.
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess1.run(v)
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess2.run(v)
+
+    # An update in sess1 should be visible in sess1 only.
+    sess1.run(v.initializer, feed_dict={init_value: 37})
+    self.assertEqual(37, sess1.run(v))
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess2.run(v)
+
+    # An update in sess2 should be visible in sess2 only.
+    sess2.run(v.initializer, feed_dict={init_value: 86})
+    self.assertEqual(37, sess1.run(v))
+    self.assertEqual(86, sess2.run(v))
+
+    # Closing sess2 has no effect on the state of sess1.
+    sess2.close()
+    self.assertEqual(37, sess1.run(v))
+
+    # Subsequent sessions will not see the state of existing sessions.
+    sess3 = session.Session(server.target, config=config)
+    self.assertEqual(37, sess1.run(v))
+    with self.assertRaises(errors.FailedPreconditionError):
+      sess3.run(v)
+
   @test_util.disable_c_api  # Partial runs don't work with C API
   def testClusterSpecPropagationPartialRun(self):
     """Test successful partial run with ClusterSpec propagation."""
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 584b1abe55c0df09afad0c432837646e75beb653..5a7413c12e9db92cb85d54a69602753ff6476425 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -39,7 +39,6 @@ class SessionListDevicesTestMethods(object):
       devices = sess.list_devices()
       self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
-      self.assertGreaterEqual(1, len(devices), devices)
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
@@ -65,7 +64,6 @@ class SessionListDevicesTestMethods(object):
       devices = sess.list_devices()
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
-      self.assertGreaterEqual(1, len(devices), devices)
 
   def testListDevicesClusterSpecPropagation(self):
     server1 = server_lib.Server.create_local_server()
@@ -84,7 +82,6 @@ class SessionListDevicesTestMethods(object):
           '/job:worker/replica:0/task:0/device:CPU:0' in device_names)
       self.assertTrue(
           '/job:worker/replica:0/task:1/device:CPU:0' in device_names)
-      self.assertGreaterEqual(2, len(devices), devices)
 
 
 class SessionListDevicesTest(SessionListDevicesTestMethods,
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 6ecf0fc6c7b5d55d9f0f139f67f69efa0d51daf1..6a389b078a54adea18bedb1e0412835c0e997a7f 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -199,11 +199,11 @@ class PartialRunTestMethods(object):
   def testPartialRunSetupNoFeedsPassed(self):
     sess = session.Session()
     r1 = constant_op.constant([6.0])
-   
+
     h = sess.partial_run_setup([r1])
     result1 = sess.partial_run(h, r1)
     self.assertEqual([6.0], result1)
-      
+
   def testPartialRunDirect(self):
     self.RunTestPartialRun(session.Session())
 
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 6b45a5f3134a8a60445413d6afba3b2d6b8eb87e..a563f5ef4aa245bbca5077d6f382f8cfec77441d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -28,6 +28,8 @@ import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.framework import types_pb2
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
@@ -55,13 +57,13 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
-ops._USE_C_API = True
 
 # NOTE(mrry): Dummy shape registration for ops used in the tests, since they
 # don't have C++ op registrations on which to attach C++ shape fns.
 ops.RegisterShape('ConstructionFails')(common_shapes.unknown_shape)
 
 
+@test_util.with_c_api
 class SessionTest(test_util.TensorFlowTestCase):
 
   def testUseExistingGraph(self):
@@ -163,8 +165,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         # Run with a bogus handle.
         s.partial_run('foo', r1, feed_dict={a: 1, b: 2})
 
-  @test_util.disable_c_api  # No shape registration for 'ConstructionFails'
   def testOpConstructionErrorPayload(self):
+    if ops._USE_C_API: return  # No shape registration for 'ConstructionFails'
+
     with session.Session():
       failing_op = ops.get_default_graph().create_op(
           'ConstructionFails', [], [], name='f')
@@ -206,7 +209,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaises(TypeError):
         s.run({'a': a, 'b': None})
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchSingleton(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -229,7 +231,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       res = sess.run(a.op)  # An op, not a tensor.
       self.assertEqual(None, res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchList(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -245,7 +246,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue(isinstance(res, list))
       self.assertEqual([42.0, None, 44.0, 42.0, None], res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchTuple(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -259,7 +259,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue(isinstance(res, tuple))
       self.assertEqual((42.0, None, 44.0, 42.0), res)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFetchNamedTuple(self):
     # pylint: disable=invalid-name
     ABC = collections.namedtuple('ABC', ['a', 'b', 'c'])
@@ -1176,7 +1175,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(b_val, [[2.0, 2.0, 2.0]])
       self.assertAllEqual(a2_val, [[1.0, 1.0]])
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testFeedAndFetch(self):
     with session.Session() as sess:
       for dtype in [dtypes.float16,
@@ -1223,7 +1221,6 @@ class SessionTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(np_array, out_v)
           self.assertAllEqual(np_array, feed_v)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableOnTensorWithRunOptions(self):
     with session.Session() as sess:
       a = constant_op.constant(42.0)
@@ -1236,7 +1233,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(42.0, res)
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableOnOperationWithRunOptions(self):
     with session.Session() as sess:
       a = variables.Variable(42.0)
@@ -1251,7 +1247,6 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(43.0, sess.run(a))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
-  @test_util.disable_c_api  # session.make_callable() doesn't work with C API
   def testMakeCallableWithFeedListAndRunOptions(self):
     with session.Session() as sess:
       ph = array_ops.placeholder(dtypes.float32)
@@ -1459,6 +1454,9 @@ class SessionTest(test_util.TensorFlowTestCase):
         self.assertEquals(len(run_metadata.step_stats.dev_stats), 1)
 
   def testFeedShapeCompatibility(self):
+    # TODO(nolivia): C API doesn't yet handle marking nodes as not feedable.
+    if ops._USE_C_API: return
+
     with session.Session() as sess:
       some_tensor = constant_op.constant([2.0, 2.0, 2.0, 2.0])
       new_shape = constant_op.constant([2, 2])
@@ -1583,7 +1581,6 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(enqueue_op)
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
-  @test_util.disable_c_api  # set_device does not work with C API
   def testRegisterFetchAndFeedConversionFunctions(self):
     class SquaredTensor(object):
       def __init__(self, tensor):
@@ -1733,15 +1730,159 @@ class SessionTest(test_util.TensorFlowTestCase):
       result = sess.run(f)
       self.assertEqual(result, 2.0)
 
-  @test_util.disable_c_api  # functions don't work with C API
   def testAddFunctionToSession(self):
     self.runTestAddFunctionToSession()
 
-  @test_util.disable_c_api  # functions don't work with C API
   def testAddFunctionToGrpcSession(self):
     server = server_lib.Server.create_local_server()
     self.runTestAddFunctionToSession(server.target)
 
+  def testAutoConvertAndCheckData(self):
+    with self.test_session() as sess:
+      a = array_ops.placeholder(dtype=dtypes.string)
+      with self.assertRaisesRegexp(
+          TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+        sess.run(a, feed_dict={a: 1})
+
+class GraphMutationTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._original_use_c_api_value = ops._USE_C_API
+    ops._USE_C_API = True
+    super(GraphMutationTest, self).setUp()
+
+  def tearDown(self):
+    ops._USE_C_API = self._original_use_c_api_value
+    super(GraphMutationTest, self).tearDown()
+
+  def testUpdateInputAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(3.0, sess.run(c))
+      c.op._update_input(1, a)  # pylint: disable=protected-access
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'add.*was changed by updating input tensor after it was run'):
+        sess.run(c)
+
+      # Check that running the graph with a new session is fine
+      with session.Session(graph=g) as sess2:
+        self.assertAllEqual(2.0, sess2.run(c))
+
+  def testSetDeviceAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(3.0, sess.run(c))
+      c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'add.*was changed by setting device after it was run'):
+        sess.run(c)
+
+  def testSetAttrAfterRunning(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0, dtype=dtypes.float32)
+      b = math_ops.cast(a, dtypes.float64)
+
+    with session.Session(graph=g) as sess:
+      self.assertAllEqual(1.0, sess.run(b))
+      b.op._set_attr('DstT',
+                     attr_value_pb2.AttrValue(type=types_pb2.DT_FLOAT))
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          'Cast.*was changed by setting attribute after it was run'):
+        sess.run(b)
+
+  def testRunModifyRun(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+      with session.Session(graph=g) as sess:
+        self.assertAllEqual(3.0, sess.run(c))
+
+        d = b + c
+        d.op._update_input(0, a)  # pylint: disable=protected-access
+        self.assertAllEqual(3.0, sess.run(c))
+        self.assertAllEqual(4.0, sess.run(d))
+
+  def testRunModifyRunTwoSessions(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+      with session.Session(graph=g) as sess1:
+        with session.Session(graph=g) as sess2:
+          self.assertAllEqual(3.0, sess1.run(c))
+          self.assertAllEqual(3.0, sess2.run(c))
+
+          d = b + c
+          d.op._update_input(0, a)  # pylint: disable=protected-access
+          self.assertAllEqual(3.0, sess2.run(c))
+          self.assertAllEqual(4.0, sess2.run(d))
+
+          d.op._update_input(0, b)  # pylint: disable=protected-access
+          self.assertAllEqual(3.0, sess1.run(c))
+          self.assertAllEqual(5.0, sess1.run(d))
+
+          with self.assertRaisesRegexp(
+              errors.FailedPreconditionError,
+              'add.*was changed by updating input tensor after it was run'):
+            sess2.run(c)
+
+  def testTwoSessionsOneRunBeforeModification(self):
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess1:
+      with session.Session(graph=g) as sess2:
+        sess1.run(c)
+
+        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess1.run(c)
+
+        # sess2 was not run before modification
+        self.assertAllEqual(3.0, sess2.run(c))
+
+  def testTwoSessionsBothRunBeforeModification(self):
+    with ops.Graph().as_default() as g, ops.device('/cpu:0'):
+      a = constant_op.constant(1.0)
+      b = constant_op.constant(2.0)
+      c = a + b
+
+    with session.Session(graph=g) as sess1:
+      with session.Session(graph=g) as sess2:
+        sess1.run(c)
+        sess2.run(c)
+
+        c.op._set_device('/cpu:0')  # pylint: disable=protected-access
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess1.run(c)
+
+        with self.assertRaisesRegexp(
+            errors.FailedPreconditionError,
+            'add.*was changed by setting device after it was run'):
+          sess2.run(c)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 40731aba7d4ed8bb281191d719b3ddfcd2db1ddc..f57c5d73bcdd9be7086a93f42eaa8c8eb95c13a9 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -24,6 +24,49 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
 
+// Helper function to convert a Python list of Tensors to a C++ vector of
+// TF_Outputs.
+//
+// Returns true if successful. Otherwise, returns false and sets error_msg.
+bool PyTensorListToVector(PyObject* py_tensor_list,
+                          std::vector<TF_Output>* vec,
+                          string* error_msg) {
+  if (!PyList_Check(py_tensor_list)) {
+    *error_msg = "expected Python list.";
+    return false;
+  }
+  size_t size = PyList_Size(py_tensor_list);
+  for (int i = 0; i < size; ++i) {
+    PyObject* item = PyList_GetItem(py_tensor_list, i);
+    TF_Output* input_ptr;
+    if (!SWIG_IsOK(SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
+                                   SWIGTYPE_p_TF_Output, 0))) {
+      *error_msg = "expected Python list of wrapped TF_Output objects. "
+          "Found python list of something else.";
+      return false;
+    }
+    vec->push_back(*input_ptr);
+  }
+  return true;
+}
+
+// Helper function to convert a TF_Output to a wrapped TF_Output Python object.
+PyObject* CreateWrappedTFOutput(TF_Output tf_output) {
+  // We used heap-allocated pointers in the Python runtime (this is what SWIG
+  // generates by default for functions returning TF_Output).
+  TF_Output* tf_output_ptr = new TF_Output(tf_output);
+  // Use SWIG_POINTER_OWN so the TF_Output* is deleted by Python.
+  return SWIG_NewPointerObj(tf_output_ptr, SWIGTYPE_p_TF_Output,
+                            SWIG_POINTER_OWN);
+}
+
+// Helper function to convert a TF_Operation to a wrapped TF_Operation Python
+// object.
+PyObject* CreateWrappedTFOperation(TF_Operation* tf_operation) {
+  // No flags since operation is owned by TF_Graph.
+  return SWIG_NewPointerObj(tf_operation, SWIGTYPE_p_TF_Operation, 0);
+}
+
 %}
 
 %include "tensorflow/python/client/tf_sessionrun_wrapper.i"
@@ -98,8 +141,26 @@ tensorflow::ImportNumpy();
   }
 
   for (size_t i = 0; i < $1.size(); ++i) {
-    PyList_SET_ITEM($result, i, SWIG_NewPointerObj(
-                            $1[i], SWIGTYPE_p_TF_Operation, 0));
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation($1[i]));
+  }
+}
+
+%ignore TF_OperationOutputConsumers;
+%unignore TF_OperationOutputConsumers_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetOutputConsumers_wrapper;
+
+// Build a Python list of unicode strings and return it. (Operation names are
+// always represented as unicode.)
+%typemap(out) std::vector<const char*>
+tensorflow::TF_OperationOutputConsumers_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyUnicode_FromString($1[i]));
   }
 }
 
@@ -115,16 +176,10 @@ tensorflow::ImportNumpy();
     SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
   }
 
-  // Unwrap the generated SwigValueWrapper<std::vector<TF_Output>> via &
-  std::vector<TF_Output>* tf_outputs = &$1;
-  for (size_t i = 0; i < $1.size(); ++i) {
-    // We used wrapped heap-allocated pointers in the Python runtime (this is
-    // what SWIG generates by default for functions returning TF_Output).
-    TF_Output* tf_output_ptr = new TF_Output((*tf_outputs)[i]);
-    // Use SWIG_POINTER_OWN so the TF_Output* is deleted by Python.
-    PyList_SET_ITEM($result, i,
-                    SWIG_NewPointerObj(tf_output_ptr, SWIGTYPE_p_TF_Output,
-                                       SWIG_POINTER_OWN));
+  // Unwrap the generated SwigValueWrapper<std::vector<TF_Output>>
+  const std::vector<TF_Output>& tf_outputs = $1;
+  for (size_t i = 0; i < tf_outputs.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOutput(tf_outputs[i]));
   }
 }
 
@@ -268,34 +323,6 @@ tensorflow::ImportNumpy();
       reinterpret_cast<const char*>($1.data), $1.length);
 }
 
-%inline %{
-// Helper function to convert a Python list of Tensors to a C++ vector of
-// TF_Outputs.
-//
-// Returns true if successful. Otherwise, returns false and sets error_msg.
-bool PyTensorListToVector(PyObject* py_tensor_list,
-                          std::vector<TF_Output>* vec,
-                          string* error_msg) {
-  if (!PyList_Check(py_tensor_list)) {
-    *error_msg = "expected Python list.";
-    return false;
-  }
-  size_t size = PyList_Size(py_tensor_list);
-  for (int i = 0; i < size; ++i) {
-    PyObject* item = PyList_GetItem(py_tensor_list, i);
-    TF_Output* input_ptr;
-    if (!SWIG_IsOK(SWIG_ConvertPtr(item, reinterpret_cast<void**>(&input_ptr),
-                                   SWIGTYPE_p_TF_Output, 0))) {
-      *error_msg = "expected Python list of wrapped TF_Output objects. "
-          "Found python list of something else.";
-      return false;
-    }
-    vec->push_back(*input_ptr);
-  }
-  return true;
-}
-%}
-
 // Converts input Python list of wrapped TF_Outputs into a single array
 %typemap(in) (const TF_Output* inputs, int num_inputs)
     (std::vector<TF_Output> inputs) {
@@ -307,6 +334,62 @@ bool PyTensorListToVector(PyObject* py_tensor_list,
   $2 = inputs.size();
 }
 
+// Typemaps for TF_ImportGraphDefResultsReturnOutputs
+%typemap(in, numinputs=0) (int* num_outputs, TF_Output** outputs)
+     (int num_outputs, TF_Output* outputs) {
+  $1 = &num_outputs;
+  $2 = &outputs;
+}
+
+%typemap(argout) (int* num_outputs, TF_Output** outputs) {
+  $result = PyList_New(*$1);
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+  int num_outputs = *$1;
+  TF_Output* outputs = *$2;
+  for (int i = 0; i < num_outputs; ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOutput(outputs[i]));
+  }
+}
+
+// Typemaps for TF_ImportGraphDefResultsReturnOperations
+%typemap(in, numinputs=0) (int* num_opers, TF_Operation*** opers)
+     (int num_opers, TF_Operation** opers) {
+  $1 = &num_opers;
+  $2 = &opers;
+}
+
+%typemap(argout) (int* num_opers, TF_Operation*** opers) {
+  $result = PyList_New(*$1);
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+  int num_opers = *$1;
+  TF_Operation** opers = *$2;
+  for (int i = 0; i < num_opers; ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation(opers[i]));
+  }
+}
+
+// Typemaps for TF_GraphNextOperation().
+%typemap(in) size_t* pos (size_t pos) {
+  pos = PyLong_AsUnsignedLong($input);
+  $1 = &pos;
+}
+
+// Returns a (TF_Operation*, int pos) tuple.
+%typemap(argout) size_t* pos {
+  PyObject* new_result = PyTuple_New(2);
+  if (!new_result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create tuple");
+  }
+  // Steals $result reference
+  PyTuple_SET_ITEM(new_result, 0, $result);
+  PyTuple_SET_ITEM(new_result, 1, PyLong_FromSize_t(*$1));
+  $result = new_result;
+}
+
 // TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
 // skip for now
 %ignore TF_WhileParams;
@@ -433,6 +516,84 @@ def TF_Reset(target, containers=None, config=None):
   }
 }
 
+// Typemaps for TF_GraphGetTensorShapeHelper.
+
+// Convert from C++ integer vector to Python list of ints.
+%typemap(out) tensorflow::gtl::InlinedVector<int64_t, 6>
+     tensorflow::TF_GraphGetTensorShapeHelper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyInt_FromLong($1[i]));
+  }
+}
+
+%typemap(in, numinputs=0) bool* unknown_shape (bool temp) {
+  $1=&temp;
+}
+
+// Returns a (list(int), bool) tuple.
+%typemap(argout) bool* unknown_shape {
+  PyObject* new_result = PyTuple_New(2);
+  if (!new_result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create tuple");
+  }
+  // Steals $result reference
+  PyTuple_SET_ITEM(new_result, 0, $result);
+  PyTuple_SET_ITEM(new_result, 1, PyBool_FromLong(*$1));
+  $result = new_result;
+}
+
+%unignore tensorflow;
+%unignore TF_GraphGetTensorShapeHelper;
+%ignore TF_GraphGetTensorShape;
+
+// We use TF_GraphSetTensorShape_wrapper instead of
+// TF_GraphSetTensorShape
+%ignore TF_GraphSetTensorShape;
+%unignore tensorflow;
+%unignore TF_GraphSetTensorShape_wrapper;
+
+// $input is a Python list of ints to a vector<int> for TF_GraphSetTensorShape_wrapper
+%typemap(in) (const std::vector<int64_t>& dims)
+    (std::vector<int64_t> dims_local){
+  if ($input != Py_None) {
+    if (!PyList_Check($input)) {
+      SWIG_exception_fail(SWIG_TypeError, tensorflow::strings::Printf(
+              "$symname: expected list but got %s ", Py_TYPE($input)->tp_name).c_str());
+    }
+    size_t size = PyList_Size($input);
+    for (int i = 0; i < size; ++i) {
+      PyObject* item = PyList_GetItem($input, i);
+      dims_local.push_back(PyInt_AsLong(item));
+    }
+    $1 = &dims_local;
+  } else {
+    $1 = nullptr;
+  }
+}
+
+// We use TF_GraphGetTensorShape_wrapper instead of
+// TF_GraphGetTensorShape
+%ignore TF_GraphGetTensorShape;
+%unignore tensorflow;
+%unignore TF_GraphGetTensorShape_wrapper;
+
+// Build a Python list of ints and return it.
+%typemap(out) std::vector<int64_t> tensorflow::TF_GraphGetTensorShape_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, PyInt_FromLong($1[i]));
+  }
+}
+
 %include "tensorflow/python/client/tf_session_helper.h"
 
 %unignoreall
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index f5472f316dcd86c3bb15a042b68c51b4f04b4b10..a00fade7ac36f76f2fdedf23b8fbdaf1429c2590 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -299,6 +299,33 @@ string EqualGraphDefWrapper(const string& actual, const string& expected) {
   return EqualGraphDef(actual_def, expected_def, &diff) ? "" : diff;
 }
 
+// Return value set to 6 inlined elements so it fits in a 64-byte cache line.
+tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
+    TF_Graph* graph, TF_Output output, TF_Status* out_status,
+    bool* unknown_shape) {
+  // Allocate a single variable for holding the result for RVO.
+  tensorflow::gtl::InlinedVector<int64_t, 6> result;
+  *unknown_shape = false;
+  int num_dims = TF_GraphGetTensorNumDims(graph, output, out_status);
+  if (TF_GetCode(out_status) != TF_OK) {
+    return result;
+  }
+  // If shape is unknown, set boolean and return.
+  if (num_dims == -1) {
+    *unknown_shape = true;
+    return result;
+  }
+
+  // If shape is a scalar, avoid another C call and just return {}.
+  if (num_dims == 0) {
+    return result;
+  }
+
+  result.resize(num_dims);
+  TF_GraphGetTensorShape(graph, output, result.data(), num_dims, out_status);
+  return result;
+}
+
 void TF_SessionPRunSetup_wrapper(TF_Session* session,
                                  const std::vector<TF_Output>& inputs,
                                  const std::vector<TF_Output>& outputs,
@@ -347,6 +374,19 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<const char*> TF_OperationOutputConsumers_wrapper(
+    TF_Output oper_out) {
+  int num_consumers = TF_OperationOutputNumConsumers(oper_out);
+  std::vector<TF_Input> consumers(num_consumers);
+  TF_OperationOutputConsumers(oper_out, consumers.data(), num_consumers);
+
+  std::vector<const char*> consumer_names(num_consumers);
+  for (int i = 0; i < num_consumers; ++i) {
+    consumer_names[i] = TF_OperationName(consumers[i].oper);
+  }
+  return consumer_names;
+}
+
 TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
@@ -380,4 +420,23 @@ TF_Function* TF_GraphToFunction_wrapper(
                             opts, description, out_status);
 }
 
+void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
+                                    const std::vector<int64_t>& dims,
+                                    bool unknown_shape, TF_Status* status) {
+  if (unknown_shape) {
+    TF_GraphSetTensorShape(graph, output, nullptr, -1, status);
+    return;
+  }
+  TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
+}
+
+std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
+                                                    TF_Output output,
+                                                    int num_dims,
+                                                    TF_Status* status) {
+  std::vector<int64_t> dims(num_dims);
+  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
+  return dims;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 0aca61a2b69752634d23511084721f94911a9ac4..3a8506de4de9c765c755b9ac3a8b81c2f57b0865 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -97,6 +97,16 @@ void TF_Reset_wrapper(const TF_SessionOptions* opt,
 // for no difference.
 string EqualGraphDefWrapper(const string& actual, const string& expected);
 
+// Gets shape from C API Graph object.
+//
+// If shape is known, returns shape vector where -1 means "unknown
+// dimension".  Sets unknown_shape to false.
+//
+// If shape is unknown, sets unknown_shape to true.
+tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
+    TF_Graph* graph, TF_Output output, TF_Status* out_status,
+    bool* unknown_shape);
+
 // Runs the graph associated with the session starting with the supplied inputs.
 // On success, `py_outputs` is populated with a numpy ndarray for each output
 // (the caller must decref these ndarrays, although this will likely be handled
@@ -150,6 +160,11 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the op names of the consumers of `oper_out`. The returned strings
+// have the lifetime of the underlying TF_Graph.
+std::vector<const char*> TF_OperationOutputConsumers_wrapper(
+    TF_Output oper_out);
+
 // `opers` equaling NULL are converted to `nopers = -1`.
 // `output_names` must be empty or have the same length as `outputs`.
 TF_Function* TF_GraphToFunction_wrapper(
@@ -158,6 +173,20 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* out_status);
+
+// Set the shape of output. If unknown is true, `num_dims` must be set to
+// -1 and `dims` is set to nullptr.
+void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
+                                    const std::vector<int64_t>& dims,
+                                    bool unknown_shape, TF_Status* status);
+
+// Return the shape of output. `num_dims` should be the output of
+// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
+std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
+                                                    TF_Output output,
+                                                    int num_dims,
+                                                    TF_Status* status);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_PYTHON_CLIENT_TF_SESSION_HELPER_H_
diff --git a/tensorflow/python/client/timeline.py b/tensorflow/python/client/timeline.py
index f3ba4244cecd5407f3c8bd2e164a424049901001..1e96ac5ed48368a7c44c06112fab1745cd678f16 100644
--- a/tensorflow/python/client/timeline.py
+++ b/tensorflow/python/client/timeline.py
@@ -275,7 +275,7 @@ class _TensorTracker(object):
       name:  The name of the Tensor as a string.
       object_id:  Chrome Trace object identifier assigned for this Tensor.
       timestamp:  The creation timestamp of this event as a long integer.
-      pid:  Process identifier of the assicaiated device, as an integer.
+      pid:  Process identifier of the associated device, as an integer.
       allocator:  Name of the allocator used to create the Tensor.
       num_bytes:  Number of bytes allocated (long integer).
 
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index b5ee8120fd3ff60028a5c99643e5d96890ec16d0..239f9b0d5923451f3967eca572b1db099d463466 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -18,9 +18,9 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 
 @@Dataset
 @@Iterator
-@@TFRecordDataset
 @@FixedLengthRecordDataset
 @@TextLineDataset
+@@TFRecordDataset
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5fb389cf92818c7a464cf4a4479d86377185d5cf
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -0,0 +1,378 @@
+# Tests of TensorFlow kernels written using the Python API.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "sycl_py_test")
+
+tf_py_test(
+    name = "batch_dataset_op_test",
+    size = "small",
+    srcs = ["batch_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "dataset_constructor_op_test",
+    size = "small",
+    srcs = ["dataset_constructor_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+    tags = [
+        "manual",
+        "nomac",  # b/62040583
+    ],
+)
+
+tf_py_test(
+    name = "dataset_from_generator_op_test",
+    size = "small",
+    srcs = ["dataset_from_generator_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+tf_py_test(
+    name = "filter_dataset_op_test",
+    size = "small",
+    srcs = ["filter_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "flat_map_dataset_op_test",
+    size = "small",
+    srcs = ["flat_map_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+    grpc_enabled = True,
+)
+
+tf_py_test(
+    name = "list_files_dataset_op_test",
+    size = "small",
+    srcs = ["list_files_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "interleave_dataset_op_test",
+    size = "small",
+    srcs = ["interleave_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "map_dataset_op_test",
+    size = "small",
+    srcs = ["map_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "prefetch_dataset_op_test",
+    size = "small",
+    srcs = ["prefetch_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "reader_dataset_ops_test",
+    size = "small",
+    srcs = ["reader_dataset_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "sequence_dataset_op_test",
+    size = "small",
+    srcs = ["sequence_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shuffle_dataset_op_test",
+    size = "small",
+    srcs = ["shuffle_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "shard_dataset_op_test",
+    size = "small",
+    srcs = ["shard_dataset_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "cache_dataset_op_test",
+    size = "small",
+    srcs = ["cache_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
+
+tf_py_test(
+    name = "zip_dataset_op_test",
+    size = "small",
+    srcs = ["zip_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "concatenate_dataset_op_test",
+    size = "small",
+    srcs = ["concatenate_dataset_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+tf_py_test(
+    name = "iterator_ops_test",
+    size = "small",
+    srcs = ["iterator_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+    ],
+    grpc_enabled = True,
+)
+
+tf_py_test(
+    name = "iterator_ops_cluster_test",
+    size = "small",
+    srcs = ["iterator_ops_cluster_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "no_oss",  # Test flaky due to port collisions.
+        "no_windows",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
similarity index 70%
rename from tensorflow/python/kernel_tests/batch_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index 7cffa861ca41371c639ed94e12fec1f814fb883d..53c8be1d1dc8b2f23b4faef7d64350edffede34a 100644
--- a/tensorflow/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -100,6 +101,112 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(init_op, feed_dict={count: 14, batch_size: 0})
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
+        5).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testBatchSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
+        5).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(2):
+        actual = sess.run(get_next)
+        expected_indices = []
+        expected_values = []
+        for j in range(5):
+          for k in range(i * 5 + j):
+            expected_indices.append([j, k])
+            expected_values.append(i * 5 + j)
+        expected = sparse_tensor.SparseTensorValue(
+            indices=expected_indices,
+            values=expected_values,
+            dense_shape=[5, (i + 1) * 5 - 1])
+        self.assertTrue(sparse_tensor.is_sparse(actual))
+        self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
+        2).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      actual = sess.run(get_next)
+      expected = sparse_tensor.SparseTensorValue(
+          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
+                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
+          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+          dense_shape=[2, 5, 1])
+      self.assertTrue(sparse_tensor.is_sparse(actual))
+      self.assertSparseValuesEqual(actual, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testBatchShapeError(self):
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    iterator = (dataset_ops.Dataset.from_generator(generator, dtypes.float32,
+                                                   output_shapes=[None])
+                .batch(3)
+                .make_initializable_iterator())
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Cannot batch tensors with different shapes in component 0. "
+          r"First element had shape \[3\] and element 2 had shape \[4\]."):
+        sess.run(next_element)
+
   def testPaddedBatchDataset(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
     padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
@@ -225,6 +332,14 @@ class BatchDatasetTest(test.TestCase):
       self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
       self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
 
+  def testPaddedBatchSparseError(self):
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    with self.assertRaises(TypeError):
+      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/cache_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/concatenate_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
similarity index 62%
rename from tensorflow/python/kernel_tests/dataset_constructor_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
index b51d483b5b6611d9596e59fd750c496bbb9c67d3..85ff228eb2838522d7a8264d14a79c918aba4b75 100644
--- a/tensorflow/python/kernel_tests/dataset_constructor_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -35,8 +36,8 @@ from tensorflow.python.platform import test
 
 class DatasetConstructorTest(test.TestCase):
 
-  def testTensorDataset(self):
-    """Test an dataset that represents a single tuple of tensors."""
+  def testFromTensors(self):
+    """Test a dataset that represents a single tuple of tensors."""
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
 
     iterator = (dataset_ops.Dataset.from_tensors(components)
@@ -55,8 +56,76 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDataset(self):
-    """Test an dataset that represents the slices from a tuple of tensors."""
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testFromTensorsSparse(self):
+    """Test a dataset that represents a single tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0]),
+        dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorsMixed(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0]]),
+                      values=np.array([0]),
+                      dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensors(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape)
+        if sparse_tensor.is_sparse(c) else c.shape for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      results = sess.run(get_next)
+      for component, result_component in zip(components, results):
+        if sparse_tensor.is_sparse(component):
+          self.assertSparseValuesEqual(component, result_component)
+        else:
+          self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlices(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
     components = (
         np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
             np.array([[12], [13], [14], [15]]), 22),
@@ -80,7 +149,127 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testTensorSliceDatasetWithDict(self):
+  def testFromTensorSlicesSparse(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
+        [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(expected[i], results):
+          self.assertSparseValuesEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesMixed(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (np.tile(np.array([[1], [2], [3]]), 20),
+                  np.tile(np.array([[12], [13], [14]]), 22),
+                  np.array([37.0, 38.0, 39.0]),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
+                      values=np.array([0, 0, 0]),
+                      dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape[1:])
+        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
+    ], [shape for shape in iterator.output_shapes])
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      expected = [
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[0]]),
+               values=np.array([1]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[1]]),
+               values=np.array([2]),
+               dense_shape=np.array([3]))),
+          (sparse_tensor.SparseTensorValue(
+              indices=np.array([[0]]),
+              values=np.array([0]),
+              dense_shape=np.array([1])),
+           sparse_tensor.SparseTensorValue(
+               indices=np.array([[2]]),
+               values=np.array([3]),
+               dense_shape=np.array([3]))),
+      ]
+      for i in range(3):
+        results = sess.run(get_next)
+        for component, result_component in zip(
+            (zip(*components[:3])[i] + expected[i]), results):
+          if sparse_tensor.is_sparse(component):
+            self.assertSparseValuesEqual(component, result_component)
+          else:
+            self.assertAllEqual(component, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testFromTensorSlicesWithDict(self):
     components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
     iterator = (dataset_ops.Dataset.from_tensor_slices(components)
                 .make_initializable_iterator())
@@ -101,7 +290,7 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-  def testSparseTensorSliceDataset(self):
+  def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
     iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
diff --git a/tensorflow/python/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/dataset_from_generator_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
diff --git a/tensorflow/python/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
similarity index 81%
rename from tensorflow/python/kernel_tests/filter_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
index 489c0375f9d2210d0543c66deda14e9ea3473e5c..b9258b720edd4ecd620c61eed18f6f975cb7f439 100644
--- a/tensorflow/python/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
@@ -124,6 +125,37 @@ class FilterDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
+            lambda x, i: x).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(5):
+        actual = sess.run(get_next)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..350234a8396a7e2d69cd016010aee4227fe222b7
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
@@ -0,0 +1,151 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class FlatMapDatasetTest(test.TestCase):
+
+  # pylint: disable=g-long-lambda
+  def testFlatMapDataset(self):
+    repeats = [1, 2, 3, 4, 5, 0, 1]
+    components = np.array(repeats, dtype=np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in repeats:
+        for _ in range(i):
+          self.assertEqual(i, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testNestedFlatMapDataset(self):
+    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
+    components = np.array(repeats, dtype=np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
+                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
+                            .repeat(y))).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for row in repeats:
+        for i in row:
+          for _ in range(i):
+            self.assertEqual(i, sess.run(get_next))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSharedResourceNestedFlatMapDataset(self):
+    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
+    components = np.array(repeats, dtype=np.int64)
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices(components)
+        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
+                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
+                            .repeat(y))).make_initializable_iterator(
+                                shared_name="shared_flat_map_iterator"))
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    # Create two concurrent sessions that share the same iterator
+    # resource on the same server, and verify that a random
+    # interleaving of `Session.run(get_next)` calls on the two
+    # sessions yields the expected result.
+    server = server_lib.Server.create_local_server()
+    with session.Session(server.target) as sess1:
+      with session.Session(server.target) as sess2:
+        for _ in range(3):
+          sess = random.choice([sess1, sess2])
+          sess.run(init_op)
+          for row in repeats:
+            for i in row:
+              for _ in range(i):
+                sess = random.choice([sess1, sess2])
+                self.assertEqual(i, sess.run(get_next))
+
+        with self.assertRaises(errors.OutOfRangeError):
+          sess = random.choice([sess1, sess2])
+          sess.run(get_next)
+
+  def testMapDict(self):
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
+                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
+                          .repeat(d["bar"]))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        for _ in range(i ** 2):
+          self.assertEqual(i * 2, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+  # pylint: enable=g-long-lambda
+
+  def testSparse(self):
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _flat_map_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        for j in range(2):
+          expected = [i, 0] if j % 2 == 0 else [0, -i]
+          self.assertAllEqual(expected, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
similarity index 69%
rename from tensorflow/python/kernel_tests/flat_map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
index 76d568a0d9e1a7b0b1de5744bd78ad53bd1baea7..28cb50c00208f95e64bb11ae80656383b1f41e1e 100644
--- a/tensorflow/python/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
@@ -18,110 +18,14 @@ from __future__ import division
 from __future__ import print_function
 
 import itertools
-import random
 
-import numpy as np
-
-from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import server_lib
-
-
-class FlatMapDatasetTest(test.TestCase):
-
-  # pylint: disable=g-long-lambda
-  def testFlatMapDataset(self):
-    repeats = [1, 2, 3, 4, 5, 0, 1]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in repeats:
-        for _ in range(i):
-          self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for row in repeats:
-        for i in row:
-          for _ in range(i):
-            self.assertEqual(i, sess.run(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSharedResourceNestedFlatMapDataset(self):
-    repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
-    components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    # Create two concurrent sessions that share the same iterator
-    # resource on the same server, and verify that a random
-    # interleaving of `Session.run(get_next)` calls on the two
-    # sessions yields the expected result.
-    server = server_lib.Server.create_local_server()
-    with session.Session(server.target) as sess1:
-      with session.Session(server.target) as sess2:
-        for _ in range(3):
-          sess = random.choice([sess1, sess2])
-          sess.run(init_op)
-          for row in repeats:
-            for i in row:
-              for _ in range(i):
-                sess = random.choice([sess1, sess2])
-                self.assertEqual(i, sess.run(get_next))
-
-        with self.assertRaises(errors.OutOfRangeError):
-          sess = random.choice([sess1, sess2])
-          sess.run(get_next)
-
-  def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
-                          .repeat(d["bar"]))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for _ in range(i ** 2):
-          self.assertEqual(i * 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-  # pylint: enable=g-long-lambda
 
 
 class InterleaveDatasetTest(test.TestCase):
@@ -272,6 +176,31 @@ class InterleaveDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0], [1, 1]], values=(i * [1, -1]), dense_shape=[2, 2])
+
+    def _interleave_fn(x):
+      return dataset_ops.Dataset.from_tensor_slices(
+          sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+            _interleave_fn, cycle_length=1).make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        for j in range(2):
+          expected = [i, 0] if j % 2 == 0 else [0, -i]
+          self.assertAllEqual(expected, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/iterator_ops_cluster_test.py
rename to tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
diff --git a/tensorflow/python/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
similarity index 91%
rename from tensorflow/python/kernel_tests/iterator_ops_test.py
rename to tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 2128ef4ae17668309af96c4fb21837cb7659a122..23c6d7385f8d4a12019fa514f349f2598d9629de 100644
--- a/tensorflow/python/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import warnings
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -39,6 +41,7 @@ from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -58,6 +61,15 @@ class IteratorTest(test.TestCase):
     with self.assertRaisesRegexp(LookupError, "No gradient defined"):
       gradients_impl.gradients(value, [component, side])
 
+  def testCapturingStateInOneShotRaisesException(self):
+    var = variables.Variable(37.0, name="myvar")
+    dataset = (dataset_ops.Dataset.from_tensor_slices([0.0, 1.0, 2.0])
+               .map(lambda x: x + var))
+    with self.assertRaisesRegexp(
+        ValueError, r"`Dataset.make_one_shot_iterator\(\)` does not support "
+        "datasets that capture stateful objects.+myvar"):
+      dataset.make_one_shot_iterator()
+
   def testOneShotIterator(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -386,6 +398,34 @@ class IteratorTest(test.TestCase):
         sess.run(next_element,
                  feed_dict={handle_placeholder: iterator_4_handle})
 
+  def testIteratorStringHandleReuseTensorObject(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
+    one_shot_iterator = dataset.make_one_shot_iterator()
+    initializable_iterator = dataset.make_initializable_iterator()
+    structure_iterator = iterator_ops.Iterator.from_structure(
+        dataset.output_types)
+
+    created_ops = len(ops.get_default_graph().get_operations())
+
+    self.assertIs(one_shot_iterator.string_handle(),
+                  one_shot_iterator.string_handle())
+    self.assertIs(initializable_iterator.string_handle(),
+                  initializable_iterator.string_handle())
+    self.assertIs(structure_iterator.string_handle(),
+                  structure_iterator.string_handle())
+
+    # Assert that getting the (default) string handle creates no ops.
+    self.assertEqual(created_ops, len(ops.get_default_graph().get_operations()))
+
+    # Specifying an explicit name will create a new op.
+    handle_with_name = one_shot_iterator.string_handle(name="foo")
+    self.assertEqual("foo", handle_with_name.op.name)
+    self.assertIsNot(one_shot_iterator.string_handle(), handle_with_name)
+
+    handle_with_same_name = one_shot_iterator.string_handle(name="foo")
+    self.assertEqual("foo_1", handle_with_same_name.op.name)
+    self.assertIsNot(handle_with_name, handle_with_same_name)
+
   def testIteratorStringHandleError(self):
     dataset_int_scalar = (dataset_ops.Dataset.from_tensor_slices([1, 2,
                                                                   3]).repeat())
@@ -595,6 +635,18 @@ class IteratorTest(test.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(restore_op)
 
+  def testRepeatedGetNextWarning(self):
+    iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      for _ in range(100):
+        iterator.get_next()
+    self.assertEqual(100 - iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD,
+                     len(w))
+    for warning in w:
+      self.assertTrue(
+          iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/list_files_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
similarity index 83%
rename from tensorflow/python/kernel_tests/map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 757191363c27d96f7b5adb488957e162a06fa4b4..ad6bbc043db9e44ec7893cd9ae29898a8c7fedaa 100644
--- a/tensorflow/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -19,13 +19,17 @@ from __future__ import print_function
 
 from collections import namedtuple
 import threading
+import time
 
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
@@ -33,6 +37,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
@@ -279,9 +284,8 @@ class MapDatasetTest(test.TestCase):
     with self.test_session() as sess:
       sess.run(table.init)
       sess.run(init_op)
-
-      print(sess.run(get_next))
-      print(sess.run(get_next))
+      sess.run(get_next)
+      sess.run(get_next)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -542,6 +546,119 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def assertSparseValuesEqual(self, a, b):
+    self.assertAllEqual(a.indices, b.indices)
+    self.assertAllEqual(a.values, b.values)
+    self.assertAllEqual(a.dense_shape, b.dense_shape)
+
+  def testSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    iterator = (dataset_ops.Dataset.range(10)
+                .map(_sparse)
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        actual = sess.run(get_next)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertSparseValuesEqual(actual, _sparse(i))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testSparseChain(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1]))
+
+    def _check(i):
+      self.assertTrue(sparse_tensor.is_sparse(i))
+      return sparse_ops.sparse_concat(0, [i, i])
+
+    iterator = (
+        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for i in range(10):
+        actual = sess.run(get_next)
+        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
+        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+class MapDatasetBenchmark(test.Benchmark):
+
+  def benchmarkChainOfMaps(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      with ops.Graph().as_default():
+        dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+        for _ in range(chain_length):
+          dataset = dataset.map(lambda x: x)
+        iterator = dataset.make_one_shot_iterator()
+        next_element = iterator.get_next()
+
+        with session.Session() as sess:
+          for _ in range(5):
+            sess.run(next_element.op)
+          deltas = []
+          for _ in range(100):
+            start = time.time()
+            for _ in range(100):
+              sess.run(next_element.op)
+            end = time.time()
+            deltas.append(end - start)
+
+          median_wall_time = np.median(deltas) / 100
+          print("Map dataset chain length: %d Median wall time: %f"
+                % (chain_length, median_wall_time))
+          self.report_benchmark(
+              iters=1000, wall_time=median_wall_time,
+              name="benchmark_map_dataset_chain_latency_%d" % chain_length)
+
+  def benchmarkMapFanOut(self):
+    fan_outs = [1, 2, 5, 10, 20, 50, 100]
+    for fan_out in fan_outs:
+      with ops.Graph().as_default():
+        dataset = dataset_ops.Dataset.from_tensors(
+            tuple(0 for _ in range(fan_out))).repeat(None).map(lambda *xs: xs)
+        iterator = dataset.make_one_shot_iterator()
+        next_element = iterator.get_next()
+
+        with session.Session() as sess:
+          for _ in range(5):
+            sess.run(next_element[0].op)
+          deltas = []
+          for _ in range(100):
+            start = time.time()
+            for _ in range(100):
+              sess.run(next_element[0].op)
+            end = time.time()
+            deltas.append(end - start)
+
+          median_wall_time = np.median(deltas) / 100
+          print("Map dataset fan out: %d Median wall time: %f"
+                % (fan_out, median_wall_time))
+          self.report_benchmark(
+              iters=1000, wall_time=median_wall_time,
+              name="benchmark_map_dataset_fan_out_%d" % fan_out)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..646324cb95df6fc1fa0a901ebdccc8d4ef74a66c
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
@@ -0,0 +1,59 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test PrefetchDataset."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class PrefetchDatasetTest(test.TestCase):
+
+  def testBufferSize(self):
+    buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(10).prefetch(
+        buffer_size=buffer_size).make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op, feed_dict={buffer_size: 5})
+      for m in range(10):
+        self.assertEqual(m, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testInvalidBufferSize(self):
+    buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.Dataset.range(10).prefetch(
+        buffer_size=buffer_size).make_initializable_iterator()
+    init_op = iterator.initializer
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "buffer_size"):
+      with self.test_session() as sess:
+        sess.run(init_op, feed_dict={buffer_size: 0})
+
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "buffer_size"):
+      with self.test_session() as sess:
+        sess.run(init_op, feed_dict={buffer_size: -5})
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/range_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/range_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/reader_dataset_ops_test.py
rename to tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
diff --git a/tensorflow/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/sequence_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/shard_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/shard_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/shuffle_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
diff --git a/tensorflow/python/kernel_tests/zip_dataset_op_test.py b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
similarity index 100%
rename from tensorflow/python/kernel_tests/zip_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 5140510409fb9849fb81ee8920564193e869364a..695d3ef7904b160a46e8755b84b2955c7a0fa882 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -21,7 +21,9 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
         "//third_party/py/numpy",
     ],
 )
@@ -50,6 +52,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
     ],
 )
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 343f316281b862c8523ec2cf0375a5ba9e9520ca..76398beaa8d238d6348237ba648699678bee31ba 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -22,9 +22,11 @@ import collections
 import threading
 
 import numpy as np
+import six
 
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -38,6 +40,7 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.util import deprecation
 
 
 class Dataset(object):
@@ -94,18 +97,21 @@ class Dataset(object):
     iterator_resource = gen_dataset_ops.iterator(
         container="",
         shared_name=shared_name,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     with ops.colocate_with(iterator_resource):
-      initializer = gen_dataset_ops.make_iterator(
-          self._as_variant_tensor(), iterator_resource)
+      initializer = gen_dataset_ops.make_iterator(self._as_variant_tensor(),
+                                                  iterator_resource)
     return iterator_ops.Iterator(iterator_resource, initializer,
-                                 self.output_types, self.output_shapes)
+                                 self.output_types, self.output_shapes,
+                                 self.output_classes)
 
   def make_one_shot_iterator(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
 
-    **N.B.** The returned iterator will be initialized automatically.
+    Note: The returned iterator will be initialized automatically.
     A "one-shot" iterator does not currently support re-initialization.
 
     Returns:
@@ -124,14 +130,40 @@ class Dataset(object):
     def _make_dataset():
       return self._as_variant_tensor()  # pylint: disable=protected-access
 
-    _make_dataset.add_to_graph(ops.get_default_graph())
+    try:
+      _make_dataset.add_to_graph(ops.get_default_graph())
+    except ValueError as err:
+      if "Cannot capture a stateful node" in str(err):
+        raise ValueError(
+            "Failed to create a one-shot iterator for a dataset. "
+            "`Dataset.make_one_shot_iterator()` does not support datasets that "
+            "capture stateful objects, such as a `Variable` or `LookupTable`. "
+            "In these cases, use `Dataset.make_initializable_iterator()`. "
+            "(Original error: %s)" % err)
+      else:
+        six.reraise(ValueError, err)
 
     return iterator_ops.Iterator(
         gen_dataset_ops.one_shot_iterator(
             dataset_factory=_make_dataset,
-            output_types=nest.flatten(self.output_types),
-            output_shapes=nest.flatten(self.output_shapes)), None,
-        self.output_types, self.output_shapes)
+            output_types=nest.flatten(
+                sparse.as_dense_types(self.output_types, self.output_classes)),
+            output_shapes=nest.flatten(
+                sparse.as_dense_shapes(self.output_shapes,
+                                       self.output_classes))), None,
+        self.output_types, self.output_shapes, self.output_classes)
+
+  @abc.abstractproperty
+  def output_classes(self):
+    """Returns the class of each component of an element of this dataset.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    raise NotImplementedError("Dataset.output_classes")
 
   @abc.abstractproperty
   def output_shapes(self):
@@ -187,6 +219,7 @@ class Dataset(object):
     return TensorSliceDataset(tensors)
 
   @staticmethod
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
   def from_sparse_tensor_slices(sparse_tensor):
     """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
 
@@ -253,6 +286,23 @@ class Dataset(object):
     sess.run(value)  # (2, array([1, 1]))
     ```
 
+    NOTE: The current implementation of `Dataset.from_generator()` uses
+    @{tf.py_func} and inherits the same constraints. In particular, it
+    requires the `Dataset`- and `Iterator`-related operations to be placed
+    on a device in the same process as the Python program that called
+    `Dataset.from_generator()`. The body of `generator` will not be
+    serialized in a `GraphDef`, and you should not use this method if you
+    need to serialize your model and restore it in a different environment.
+
+    NOTE: If `generator` depends on mutable global variables or other external
+    state, be aware that the runtime may invoke `generator` multiple times
+    (in order to support repeating the `Dataset`) and at any time
+    between the call to `Dataset.from_generator()` and the production of the
+    first element from the generator. Mutating global variables or external
+    state can cause undefined behavior, and we recommend that you explicitly
+    cache any external state in `generator` before calling
+    `Dataset.from_generator()`.
+
     Args:
       generator: A callable object that takes no arguments and returns an
         object that supports the `iter()` protocol.
@@ -323,8 +373,8 @@ class Dataset(object):
         # pylint: disable=protected-access
         ret_arrays = [
             script_ops.FuncRegistry._convert(ret, dtype=dtype.as_numpy_dtype)
-            for ret, dtype in zip(nest.flatten_up_to(output_types, values),
-                                  flattened_types)
+            for ret, dtype in zip(
+                nest.flatten_up_to(output_types, values), flattened_types)
         ]
         # pylint: enable=protected-access
 
@@ -518,11 +568,14 @@ class Dataset(object):
   def repeat(self, count=None):
     """Repeats this dataset `count` times.
 
+    NOTE: If this dataset is a function of global state (e.g. a random number
+    generator), then different repetitions may produce different elements.
+
     Args:
       count: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
-        number of times the elements of this dataset should be repeated. The
-        default behavior (if `count` is `None` or `-1`) is for the elements to
-        be repeated indefinitely.
+        number of times the dataset should be repeated. The default behavior
+        (if `count` is `None` or `-1`) is for the dataset be repeated
+        indefinitely.
 
     Returns:
       A `Dataset`.
@@ -858,25 +911,37 @@ class TensorDataset(Dataset):
     """See `Dataset.from_tensors()` for details."""
     super(TensorDataset, self).__init__()
     with ops.name_scope("tensors"):
-      self._tensors = nest.pack_sequence_as(tensors, [
-          ops.convert_to_tensor(t, name="component_%d" % i)
+      tensors = nest.pack_sequence_as(tensors, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
+              t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
 
+    self._tensors = sparse.serialize_sparse_tensors(tensors)
+    self._output_classes = sparse.get_classes(tensors)
+    self._output_shapes = nest.pack_sequence_as(
+        tensors, [t.get_shape() for t in nest.flatten(tensors)])
+    self._output_types = nest.pack_sequence_as(
+        tensors, [t.dtype for t in nest.flatten(tensors)])
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_dataset(
         nest.flatten(self._tensors),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
-    return nest.pack_sequence_as(self._tensors,
-                                 [t.shape for t in nest.flatten(self._tensors)])
+    return self._output_shapes
 
   @property
   def output_types(self):
-    return nest.pack_sequence_as(self._tensors,
-                                 [t.dtype for t in nest.flatten(self._tensors)])
+    return self._output_types
 
 
 class TensorSliceDataset(Dataset):
@@ -886,32 +951,41 @@ class TensorSliceDataset(Dataset):
     """See `Dataset.from_tensor_slices()` for details."""
     super(TensorSliceDataset, self).__init__()
     with ops.name_scope("tensors"):
-      flat_tensors = [
-          ops.convert_to_tensor(t, name="component_%d" % i)
+      tensors = nest.pack_sequence_as(tensors, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else ops.convert_to_tensor(
+              t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
-      ]
+      ])
+      flat_tensors = nest.flatten(tensors)
 
-    self._tensors = nest.pack_sequence_as(tensors, flat_tensors)
     batch_dim = flat_tensors[0].get_shape()[0]
     for t in flat_tensors[1:]:
       batch_dim.assert_is_compatible_with(t.get_shape()[0])
+    self._tensors = sparse.serialize_many_sparse_tensors(tensors)
+    self._output_classes = sparse.get_classes(tensors)
+    self._output_shapes = nest.pack_sequence_as(
+        tensors, [t.get_shape()[1:] for t in nest.flatten(tensors)])
+    self._output_types = nest.pack_sequence_as(
+        tensors, [t.dtype for t in nest.flatten(tensors)])
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_slice_dataset(
         nest.flatten(self._tensors),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
-    return nest.pack_sequence_as(self._tensors, [
-        tensor_shape.TensorShape(t.shape[1:])
-        for t in nest.flatten(self._tensors)
-    ])
+    return self._output_shapes
 
   @property
   def output_types(self):
-    return nest.pack_sequence_as(self._tensors,
-                                 [t.dtype for t in nest.flatten(self._tensors)])
+    return self._output_types
 
 
 class SparseTensorSliceDataset(Dataset):
@@ -929,6 +1003,10 @@ class SparseTensorSliceDataset(Dataset):
         self._sparse_tensor.indices, self._sparse_tensor.values,
         self._sparse_tensor.dense_shape)
 
+  @property
+  def output_classes(self):
+    return (ops.Tensor, ops.Tensor, ops.Tensor)
+
   @property
   def output_shapes(self):
     indices_shape = self._sparse_tensor.indices.get_shape()
@@ -936,8 +1014,8 @@ class SparseTensorSliceDataset(Dataset):
     rank = (indices_shape[1] - 1).merge_with(shape_shape[0] - 1)
     num_values = tensor_shape.Dimension(None)
     return (tensor_shape.TensorShape([num_values, rank]),
-            tensor_shape.TensorShape([num_values]), tensor_shape.TensorShape(
-                [rank]))
+            tensor_shape.TensorShape([num_values]),
+            tensor_shape.TensorShape([rank]))
 
   @property
   def output_types(self):
@@ -978,17 +1056,23 @@ class ZipDataset(Dataset):
         ])
     # pylint: enable=protected-access
 
+  @property
+  def output_classes(self):
+    return nest.pack_sequence_as(
+        self._datasets,
+        [ds.output_classes for ds in nest.flatten(self._datasets)])
+
   @property
   def output_shapes(self):
-    return nest.pack_sequence_as(self._datasets, [
-        ds.output_shapes for ds in nest.flatten(self._datasets)
-    ])
+    return nest.pack_sequence_as(
+        self._datasets,
+        [ds.output_shapes for ds in nest.flatten(self._datasets)])
 
   @property
   def output_types(self):
-    return nest.pack_sequence_as(self._datasets, [
-        ds.output_types for ds in nest.flatten(self._datasets)
-    ])
+    return nest.pack_sequence_as(
+        self._datasets,
+        [ds.output_types for ds in nest.flatten(self._datasets)])
 
 
 class ConcatenateDataset(Dataset):
@@ -1014,10 +1098,16 @@ class ConcatenateDataset(Dataset):
     return gen_dataset_ops.concatenate_dataset(
         self._input_dataset._as_variant_tensor(),
         self._dataset_to_concatenate._as_variant_tensor(),
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
     # pylint: enable=protected-access
 
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
   @property
   def output_shapes(self):
     return nest.pack_sequence_as(self._input_dataset.output_shapes, [
@@ -1049,8 +1139,14 @@ class RepeatDataset(Dataset):
     return gen_dataset_ops.repeat_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         count=self._count,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1093,8 +1189,14 @@ class RangeDataset(Dataset):
         start=self._start,
         stop=self._stop,
         step=self._step,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return ops.Tensor
 
   @property
   def output_shapes(self):
@@ -1119,8 +1221,14 @@ class CacheDataset(Dataset):
     return gen_dataset_ops.cache_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         filename=self._filename,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1134,14 +1242,44 @@ class CacheDataset(Dataset):
 class ShuffleDataset(Dataset):
   """A `Dataset` that randomly shuffles the elements of its input."""
 
-  def __init__(self, input_dataset, buffer_size, seed=None,
-               reshuffle_each_iteration=None):
-    """See `Dataset.shuffle()` for details."""
+  def __init__(self,
+               input_dataset,
+               buffer_size,
+               seed=None,
+               reshuffle_each_iteration=None,
+               seed2=None):
+    """Randomly shuffles the elements of this dataset.
+
+    Args:
+      input_dataset: The input dataset.
+      buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
+        number of elements from this dataset from which the new
+        dataset will sample.
+      seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
+        random seed that will be used to create the distribution. See
+        @{tf.set_random_seed} for behavior.
+      reshuffle_each_iteration: (Optional.) A boolean, which if true indicates
+        that the dataset should be pseudorandomly reshuffled each time it is
+        iterated over. (Defaults to `True`.)
+      seed2: (Optional.) A `tf.int64` scalar `tf.Tensor` used to avoid seed
+        collision. Users should generally not need to specify this. This is
+        supposed to be used when both the seeds for the Dataset op need to be
+        manually specified. If not None, seed must also be non-None.
+
+    Returns:
+      A `Dataset`.
+
+    Raises:
+      ValueError: if invalid arguments are provided.
+    """
     super(ShuffleDataset, self).__init__()
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
-    seed, seed2 = random_seed.get_seed(seed)
+    if seed2 is None:
+      seed, seed2 = random_seed.get_seed(seed)
+    elif seed is None:
+      raise ValueError("seed must be non-None if seed2 is non-None.")
     if seed is None:
       self._seed = constant_op.constant(0, dtype=dtypes.int64, name="seed")
     else:
@@ -1163,8 +1301,14 @@ class ShuffleDataset(Dataset):
         seed=self._seed,
         seed2=self._seed2,
         reshuffle_each_iteration=self._reshuffle_each_iteration,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1188,8 +1332,14 @@ class TakeDataset(Dataset):
     return gen_dataset_ops.take_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         count=self._count,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1213,8 +1363,14 @@ class SkipDataset(Dataset):
     return gen_dataset_ops.skip_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         count=self._count,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1232,15 +1388,21 @@ class BatchDataset(Dataset):
     """See `Dataset.batch()` for details."""
     super(BatchDataset, self).__init__()
     self._input_dataset = input_dataset
-    self._batch_size = ops.convert_to_tensor(batch_size, dtype=dtypes.int64,
-                                             name="batch_size")
+    self._batch_size = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         batch_size=self._batch_size,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1300,11 +1462,16 @@ class PaddedBatchDataset(Dataset):
   def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
     """See `Dataset.batch()` for details."""
     super(PaddedBatchDataset, self).__init__()
+    if sparse.any_sparse(input_dataset.output_classes):
+      # TODO(b/63669786): support batching of sparse tensors
+      raise TypeError(
+          "Batching of padded sparse tensors is not currently supported")
     self._input_dataset = input_dataset
-    self._batch_size = ops.convert_to_tensor(batch_size, dtype=dtypes.int64,
-                                             name="batch_size")
-    padding_values = (padding_values if padding_values is not None else
-                      self._default_padding(input_dataset))
+    self._batch_size = ops.convert_to_tensor(
+        batch_size, dtype=dtypes.int64, name="batch_size")
+    padding_values = (
+        padding_values
+        if padding_values is not None else self._default_padding(input_dataset))
     self._padded_shapes = nest.map_structure_up_to(
         input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes)
     self._padding_values = nest.map_structure_up_to(
@@ -1330,7 +1497,12 @@ class PaddedBatchDataset(Dataset):
             for s in nest.flatten(self._padded_shapes)
         ],
         padding_values=nest.flatten(self._padding_values),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1359,18 +1531,25 @@ class MapDataset(Dataset):
     super(MapDataset, self).__init__()
     self._input_dataset = input_dataset
 
+    self._output_classes = None
     self._output_shapes = None
     self._output_types = None
 
-    @function.Defun(*nest.flatten(input_dataset.output_types))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         ret = map_func(*nested_args)
       else:
@@ -1389,14 +1568,24 @@ class MapDataset(Dataset):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Extract shape information from the returned values.
-      flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
+      # Convert any `SparseTensorValue`s to `SparseTensor`s.
+      ret = nest.pack_sequence_as(ret, [
+          sparse_tensor_lib.SparseTensor.from_value(t)
+          if sparse_tensor_lib.is_sparse(t) else t for t in nest.flatten(ret)
+      ])
+
+      self._output_classes = sparse.get_classes(ret)
       self._output_shapes = nest.pack_sequence_as(
-          ret, [t.get_shape() for t in flattened_ret])
+          ret, [t.get_shape() for t in nest.flatten(ret)])
       self._output_types = nest.pack_sequence_as(
-          ret, [t.dtype for t in flattened_ret])
+          ret, [t.dtype for t in nest.flatten(ret)])
 
-      return flattened_ret
+      # Serialize any sparse tensors and convert result to tensors.
+      ret = nest.pack_sequence_as(ret, [
+          ops.convert_to_tensor(t)
+          for t in nest.flatten(sparse.serialize_sparse_tensors(ret))
+      ])
+      return nest.flatten(ret)
 
     self._map_func = tf_map_func
     self._map_func.add_to_graph(ops.get_default_graph())
@@ -1407,8 +1596,14 @@ class MapDataset(Dataset):
         input_t,
         self._map_func.captured_inputs,
         f=self._map_func,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -1437,8 +1632,10 @@ class ParallelMapDataset(MapDataset):
         self._map_func.captured_inputs,
         f=self._map_func,
         num_parallel_calls=self._num_parallel_calls,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     # pylint: enable=protected-access
 
 
@@ -1450,15 +1647,21 @@ class FlatMapDataset(Dataset):
     super(FlatMapDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(*nest.flatten(input_dataset.output_types))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         dataset = map_func(*nested_args)
       else:
@@ -1467,6 +1670,7 @@ class FlatMapDataset(Dataset):
       if not isinstance(dataset, Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
 
@@ -1480,8 +1684,14 @@ class FlatMapDataset(Dataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._map_func.captured_inputs,
         f=self._map_func,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -1501,15 +1711,21 @@ class InterleaveDataset(Dataset):
     super(InterleaveDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(*nest.flatten(input_dataset.output_types))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_map_func(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         dataset = map_func(*nested_args)
       else:
@@ -1518,6 +1734,7 @@ class InterleaveDataset(Dataset):
       if not isinstance(dataset, Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
 
@@ -1526,10 +1743,10 @@ class InterleaveDataset(Dataset):
     self._map_func = tf_map_func
     self._map_func.add_to_graph(ops.get_default_graph())
 
-    self._cycle_length = ops.convert_to_tensor(cycle_length, dtype=dtypes.int64,
-                                               name="cycle_length")
-    self._block_length = ops.convert_to_tensor(block_length, dtype=dtypes.int64,
-                                               name="block_length")
+    self._cycle_length = ops.convert_to_tensor(
+        cycle_length, dtype=dtypes.int64, name="cycle_length")
+    self._block_length = ops.convert_to_tensor(
+        block_length, dtype=dtypes.int64, name="block_length")
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.interleave_dataset(
@@ -1538,8 +1755,14 @@ class InterleaveDataset(Dataset):
         self._cycle_length,
         self._block_length,
         f=self._map_func,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -1558,15 +1781,21 @@ class FilterDataset(Dataset):
     super(FilterDataset, self).__init__()
     self._input_dataset = input_dataset
 
-    @function.Defun(*nest.flatten(input_dataset.output_types))
+    @function.Defun(*nest.flatten(
+        sparse.as_dense_types(input_dataset.output_types,
+                              input_dataset.output_classes)))
     def tf_predicate(*args):
       """A wrapper for Defun that facilitates shape inference."""
       # Pass in shape information from the input_dataset.
-      for arg, shape in zip(args, nest.flatten(input_dataset.output_shapes)):
+      dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes,
+                                            input_dataset.output_classes)
+      for arg, shape in zip(args, nest.flatten(dense_shapes)):
         arg.set_shape(shape)
 
       nested_args = nest.pack_sequence_as(input_dataset.output_types, args)
-
+      nested_args = sparse.deserialize_sparse_tensors(
+          nested_args, input_dataset.output_types, input_dataset.output_shapes,
+          input_dataset.output_classes)
       if _should_unpack_args(nested_args):
         ret = predicate(*nested_args)
       else:
@@ -1587,8 +1816,14 @@ class FilterDataset(Dataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         other_arguments=self._predicate.captured_inputs,
         predicate=self._predicate,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)),
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
@@ -1606,15 +1841,21 @@ class PrefetchDataset(Dataset):
     """See `Dataset.prefetch()` for details."""
     super(PrefetchDataset, self).__init__()
     self._input_dataset = input_dataset
-    self._buffer_size = ops.convert_to_tensor(buffer_size, dtype=dtypes.int64,
-                                              name="buffer_size")
+    self._buffer_size = ops.convert_to_tensor(
+        buffer_size, dtype=dtypes.int64, name="buffer_size")
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.prefetch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
-        output_shapes=nest.flatten(self.output_shapes),
-        output_types=nest.flatten(self.output_types))
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
 
   @property
   def output_shapes(self):
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d4f05a055a22838749c411887c17cc047c3ddaac..0cbdb3ab19d8f1b966a867dfcf709c1a4a49b871 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -17,18 +17,41 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 
 
+# NOTE(mrry): It is legitimate to call `Iterator.get_next()` multiple
+# times, e.g. when you are distributing different elements to multiple
+# devices in a single step. However, a common pitfall arises when
+# users call `Iterator.get_next()` in each iteration of their training
+# loop. `Iterator.get_next()` adds ops to the graph, and executing
+# each op allocates resources (including threads); as a consequence,
+# invoking it in every iteration of a training loop causes slowdown
+# and eventual resource exhaustion. To guard against this outcome, we
+# log a warning when the number of uses crosses a threshold of suspicion.
+GET_NEXT_CALL_WARNING_THRESHOLD = 32
+
+GET_NEXT_CALL_WARNING_MESSAGE = (
+    "An unusually high number of `Iterator.get_next()` calls was detected. "
+    "This often indicates that `Iterator.get_next()` is being called inside "
+    "a training loop, which will cause gradual slowdown and eventual resource "
+    "exhaustion. If this is the case, restructure your code to call "
+    "`next_element = iterator.get_next() once outside the loop, and use "
+    "`next_element` inside the loop.")
+
+
 class Iterator(object):
   """Represents the state of iterating through a `Dataset`."""
 
   def __init__(self, iterator_resource, initializer, output_types,
-               output_shapes):
+               output_shapes, output_classes):
     """Creates a new iterator from the given iterator resource.
 
     Note: Most users will not call this initializer directly, and will
@@ -41,17 +64,27 @@ class Iterator(object):
       initializer: A `tf.Operation` that should be run to initialize this
         iterator.
       output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of an element of this iterator.
+        each component of an element of this dataset.
       output_shapes: A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset.
+      output_classes: A nested structure of Python `type` object corresponding
+        to each
+        component of an element of this iterator.
     """
     self._iterator_resource = iterator_resource
     self._initializer = initializer
+    self._output_classes = output_classes
     self._output_types = output_types
     self._output_shapes = output_shapes
+    self._string_handle = gen_dataset_ops.iterator_to_string_handle(
+        self._iterator_resource)
+    self._get_next_call_count = 0
 
   @staticmethod
-  def from_structure(output_types, output_shapes=None, shared_name=None):
+  def from_structure(output_types,
+                     output_shapes=None,
+                     shared_name=None,
+                     output_classes=None):
     """Creates a new, uninitialized `Iterator` with the given structure.
 
     This iterator-constructing method can be used to create an iterator that
@@ -99,13 +132,16 @@ class Iterator(object):
 
     Args:
       output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of an element of this iterator.
+        each component of an element of this dataset.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset. If
         omitted, each component will have an unconstrainted shape.
       shared_name: (Optional.) If non-empty, this iterator will be shared under
         the given name across multiple sessions that share the same devices
         (e.g. when using a remote server).
+      output_classes: (Optional.) A nested structure of Python `type` objects
+        corresponding to each component of an element of this iterator. If
+        omitted, each component is assumed to be of type `tf.Tensor`.
 
     Returns:
       An `Iterator`.
@@ -121,6 +157,8 @@ class Iterator(object):
     else:
       output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
     if shared_name is None:
       shared_name = ""
@@ -129,10 +167,14 @@ class Iterator(object):
         shared_name=shared_name,
         output_types=nest.flatten(output_types),
         output_shapes=nest.flatten(output_shapes))
-    return Iterator(iterator_resource, None, output_types, output_shapes)
+    return Iterator(iterator_resource, None, output_types, output_shapes,
+                    output_classes)
 
   @staticmethod
-  def from_string_handle(string_handle, output_types, output_shapes=None):
+  def from_string_handle(string_handle,
+                         output_types,
+                         output_shapes=None,
+                         output_classes=None):
     """Creates a new, uninitialized `Iterator` based on the given handle.
 
     This method allows you to define a "feedable" iterator where you can choose
@@ -166,10 +208,13 @@ class Iterator(object):
       string_handle: A scalar `tf.Tensor` of type `tf.string` that evaluates
         to a handle produced by the `Iterator.string_handle()` method.
       output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of an element of this iterator.
+        each component of an element of this dataset.
       output_shapes: (Optional.) A nested structure of `tf.TensorShape` objects
         corresponding to each component of an element of this dataset. If
         omitted, each component will have an unconstrainted shape.
+      output_classes: (Optional.) A nested structure of Python `type` objects
+        corresponding to each component of an element of this iterator. If
+        omitted, each component is assumed to be of type `tf.Tensor`.
 
     Returns:
       An `Iterator`.
@@ -181,13 +226,16 @@ class Iterator(object):
     else:
       output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
+    if output_classes is None:
+      output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
     iterator_resource = gen_dataset_ops.iterator_from_string_handle(
         string_handle,
         output_types=nest.flatten(output_types),
         output_shapes=nest.flatten(output_shapes))
-    return Iterator(iterator_resource, None, output_types, output_shapes)
+    return Iterator(iterator_resource, None, output_types, output_shapes,
+                    output_classes)
 
   @property
   def initializer(self):
@@ -224,6 +272,13 @@ class Iterator(object):
     with ops.name_scope(name, "make_initializer") as name:
       nest.assert_same_structure(self._output_types, dataset.output_types)
       nest.assert_same_structure(self._output_shapes, dataset.output_shapes)
+      for iterator_class, dataset_class in zip(
+          nest.flatten(self._output_classes),
+          nest.flatten(dataset.output_classes)):
+        if iterator_class is not dataset_class:
+          raise TypeError(
+              "Expected output classes %r but got dataset with output class %r."
+              % (self._output_classes, dataset.output_classes))
       for iterator_dtype, dataset_dtype in zip(
           nest.flatten(self._output_types), nest.flatten(dataset.output_types)):
         if iterator_dtype != dataset_dtype:
@@ -231,8 +286,8 @@ class Iterator(object):
               "Expected output types %r but got dataset with output types %r." %
               (self._output_types, dataset.output_types))
       for iterator_shape, dataset_shape in zip(
-          nest.flatten(self._output_shapes),
-          nest.flatten(dataset.output_shapes)):
+          nest.flatten(self._output_shapes), nest.flatten(
+              dataset.output_shapes)):
         if not iterator_shape.is_compatible_with(dataset_shape):
           raise TypeError("Expected output shapes compatible with %r but got "
                           "dataset with output shapes %r." %
@@ -250,13 +305,24 @@ class Iterator(object):
     Returns:
       A nested structure of `tf.Tensor` objects.
     """
-    return nest.pack_sequence_as(
-        self._output_types,
-        gen_dataset_ops.iterator_get_next(
-            self._iterator_resource,
-            output_types=nest.flatten(self._output_types),
-            output_shapes=nest.flatten(self._output_shapes),
-            name=name))
+    self._get_next_call_count += 1
+    if self._get_next_call_count > GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(GET_NEXT_CALL_WARNING_MESSAGE)
+
+    return sparse.deserialize_sparse_tensors(
+        nest.pack_sequence_as(self._output_types,
+                              gen_dataset_ops.iterator_get_next(
+                                  self._iterator_resource,
+                                  output_types=nest.flatten(
+                                      sparse.as_dense_types(
+                                          self._output_types,
+                                          self._output_classes)),
+                                  output_shapes=nest.flatten(
+                                      sparse.as_dense_shapes(
+                                          self._output_shapes,
+                                          self._output_classes)),
+                                  name=name)), self._output_types,
+        self._output_shapes, self._output_classes)
 
   def string_handle(self, name=None):
     """Returns a string-valued `tf.Tensor` that represents this iterator.
@@ -267,8 +333,23 @@ class Iterator(object):
     Returns:
       A scalar `tf.Tensor` of type `tf.string`.
     """
-    return gen_dataset_ops.iterator_to_string_handle(
-        self._iterator_resource, name=name)
+    if name is None:
+      return self._string_handle
+    else:
+      return gen_dataset_ops.iterator_to_string_handle(
+          self._iterator_resource, name=name)
+
+  @property
+  def output_classes(self):
+    """Returns the class of each component of an element of this iterator.
+
+    The expected values are `tf.Tensor` and `tf.SparseTensor`.
+
+    Returns:
+      A nested structure of Python `type` objects corresponding to each
+      component of an element of this dataset.
+    """
+    return self._output_classes
 
   @property
   def output_shapes(self):
@@ -276,7 +357,7 @@ class Iterator(object):
 
     Returns:
       A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this iterator.
+      component of an element of this dataset.
     """
     return self._output_shapes
 
@@ -286,6 +367,6 @@ class Iterator(object):
 
     Returns:
       A nested structure of `tf.DType` objects corresponding to each component
-      of an element of this iterator.
+      of an element of this dataset.
     """
     return self._output_types
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 531716581ffbc2daeac8eb41c24a848bf5fbb7ad..c6fb8531aea13850524e6b9a83911d7afe950395 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -70,6 +70,10 @@ class TextLineDataset(Dataset):
     return gen_dataset_ops.text_line_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
   @property
   def output_shapes(self):
     return tensor_shape.scalar()
@@ -110,6 +114,10 @@ class TFRecordDataset(Dataset):
     return gen_dataset_ops.tf_record_dataset(
         self._filenames, self._compression_type, self._buffer_size)
 
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
   @property
   def output_shapes(self):
     return tensor_shape.TensorShape([])
@@ -159,6 +167,10 @@ class FixedLengthRecordDataset(Dataset):
         self._filenames, self._header_bytes, self._record_bytes,
         self._footer_bytes, self._buffer_size)
 
+  @property
+  def output_classes(self):
+    return ops.Tensor
+
   @property
   def output_shapes(self):
     return tensor_shape.scalar()
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index a2b80590bacb0b159bcfe94cbe203be237279a20..f7d7fe98d3eca10b6481e3c0f7d08b42e95ef81a 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -31,6 +31,37 @@ py_test(
     ],
 )
 
+py_library(
+    name = "sparse",
+    srcs = ["sparse.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nest",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "sparse_test",
+    size = "small",
+    srcs = ["sparse_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nest",
+        ":sparse",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index 421513cafc6b480e22a8799926b93287c85dfe7f..2455395635c4c8fa5d157a38d4e7a118f554fd9f 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -17,17 +17,22 @@
 """## Functions for working with arbitrarily nested sequences of elements.
 
 NOTE(mrry): This fork of the `tensorflow.python.util.nest` module
-makes two changes:
+makes three changes:
 
 1. It adds support for dictionaries as a level of nesting in nested structures.
 2. It removes support for lists as a level of nesting in nested structures.
+3. It adds support for `SparseTensorValue` as an atomic element.
 
-The motivation for this change is twofold:
+The motivation for this change is threefold:
 
 1. Many input-processing functions (e.g. `tf.parse_example()`) return
    dictionaries, and we would like to support them natively in datasets.
 2. It seems more natural for lists to be treated (e.g. in Dataset constructors)
    as tensors, rather than lists of (lists of...) tensors.
+3. This is needed because `SparseTensorValue` is implemented as a `namedtuple`
+   that would normally be flattened and we want to be able to create sparse
+   tensor from `SparseTensorValue's similarly to creating tensors from numpy
+   arrays.
 """
 
 from __future__ import absolute_import
@@ -38,6 +43,7 @@ import collections as _collections
 
 import six as _six
 
+from tensorflow.python.framework import sparse_tensor as _sparse_tensor
 from tensorflow.python.util.all_util import remove_undocumented
 
 
@@ -87,6 +93,8 @@ def _yield_value(iterable):
     # corresponding `OrderedDict` to pack it back).
     for key in _sorted(iterable):
       yield iterable[key]
+  elif isinstance(iterable, _sparse_tensor.SparseTensorValue):
+    yield iterable
   else:
     for value in iterable:
       yield value
@@ -116,8 +124,9 @@ def is_sequence(seq):
     True if the sequence is a not a string or list and is a
     collections.Sequence.
   """
-  return (isinstance(seq, (_collections.Sequence, dict))
-          and not isinstance(seq, (list, _six.string_types)))
+  return (isinstance(seq, (_collections.Sequence, dict)) and
+          not isinstance(seq, _sparse_tensor.SparseTensorValue) and
+          not isinstance(seq, (list, _six.string_types)))
 
 
 def flatten(nest):
@@ -367,6 +376,16 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
           "structure has length %s, while shallow structure has length %s."
           % (len(input_tree), len(shallow_tree)))
 
+    if check_types and isinstance(shallow_tree, dict):
+      if set(input_tree) != set(shallow_tree):
+        raise ValueError(
+            "The two structures don't have the same keys. Input "
+            "structure has keys %s, while shallow structure has keys %s." %
+            (list(_six.iterkeys(input_tree)),
+             list(_six.iterkeys(shallow_tree))))
+      input_tree = list(_six.iteritems(input_tree))
+      shallow_tree = list(_six.iteritems(shallow_tree))
+
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
                                check_types=check_types)
diff --git a/tensorflow/python/data/util/nest_test.py b/tensorflow/python/data/util/nest_test.py
index 6416e2850d55af8f60d416959410bef7d5329d71..90dd7dfe7775b2f10611e5579784fbda63fc9669 100644
--- a/tensorflow/python/data/util/nest_test.py
+++ b/tensorflow/python/data/util/nest_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -86,7 +87,7 @@ class NestTest(test.TestCase):
         ordered_reconstruction)
     self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
 
-  def testFlattenAndPack_withDicts(self):
+  def testFlattenAndPackWithDicts(self):
     # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
     named_tuple = collections.namedtuple("A", ("b", "c"))
     mess = (
@@ -132,6 +133,17 @@ class NestTest(test.TestCase):
     self.assertIsInstance(unflattened_ordered_dict, collections.OrderedDict)
     self.assertEqual(list(unflattened_ordered_dict.keys()), ["b", "a"])
 
+  def testFlattenSparseValue(self):
+    st = sparse_tensor.SparseTensorValue([[0]], [0], [1])
+    single_value = st
+    list_of_values = [st, st, st]
+    nest_of_values = ((st), ((st), (st)))
+    dict_of_values = {"foo": st, "bar": st, "baz": st}
+    self.assertEqual([st], nest.flatten(single_value))
+    self.assertEqual([[st, st, st]], nest.flatten(list_of_values))
+    self.assertEqual([st, st, st], nest.flatten(nest_of_values))
+    self.assertEqual([st, st, st], nest.flatten(dict_of_values))
+
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertFalse(nest.is_sequence([1, 3, [4, 5]]))
@@ -143,6 +155,8 @@ class NestTest(test.TestCase):
     self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
     self.assertFalse(nest.is_sequence(np.ones((4, 5))))
     self.assertTrue(nest.is_sequence({"foo": 1, "bar": 2}))
+    self.assertFalse(
+        nest.is_sequence(sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 
   def testAssertSameStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
@@ -254,6 +268,15 @@ class NestTest(test.TestCase):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
     nest.assert_shallow_structure(inp_ab2, inp_ab1, check_types=False)
 
+    inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
+    inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
+    expected_message = (
+        r"The two structures don't have the same keys. Input "
+        r"structure has keys \['c'\], while shallow structure has "
+        r"keys \['d'\].")
+    with self.assertRaisesRegexp(ValueError, expected_message):
+      nest.assert_shallow_structure(inp_ab2, inp_ab1)
+
   def testFlattenUpTo(self):
     input_tree = (((2, 2), (3, 3)), ((4, 9), (5, 5)))
     shallow_tree = ((True, True), (False, True))
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ebcb4ea81b23b60dc46bae78bfa792f4a8ab6d8
--- /dev/null
+++ b/tensorflow/python/data/util/sparse.py
@@ -0,0 +1,152 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python dataset sparse tensor utility functitons."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import sparse_ops
+
+
+def any_sparse(classes):
+  """Checks for sparse tensor.
+
+  Args:
+    classes: a structure of objects that identify the dataset item classes
+
+  Returns:
+    `True` if `classes` contains a sparse tensor type and `False` otherwise.
+  """
+  return any([c is sparse_tensor.SparseTensor for c in nest.flatten(classes)])
+
+
+def as_dense_shapes(shapes, classes):
+  """Converts sparse tensor shapes to their physical shapes.
+
+  Args:
+    shapes: a structure of shapes to convert.
+    classes: a structure of objects that identify the dataset item classes
+
+  Returns:
+    a structure matching the nested structure of `shapes`, containing
+    `tensor_shape.unknown_shape()` at positions where `classes` contains
+    `tf.SparseTensor` and matching contents of `shapes` otherwise
+  """
+  ret = nest.pack_sequence_as(shapes, [
+      tensor_shape.unknown_shape() if c is sparse_tensor.SparseTensor else shape
+      for shape, c in zip(nest.flatten(shapes), nest.flatten(classes))
+  ])
+  return ret
+
+
+def as_dense_types(types, classes):
+  """Converts sparse tensor types to `dtypes.variant`.
+
+  Args:
+    types: a structure of types to convert.
+    classes: a structure of objects that identify the dataset item classes
+
+  Returns:
+    a structure matching the nested structure of `types`, containing
+    `dtypes.variant` at positions where `classes` contains `tf.SparseTensor` and
+    matching contents of `types` otherwise
+  """
+  ret = nest.pack_sequence_as(types, [
+      dtypes.variant if c is sparse_tensor.SparseTensor else ty
+      for ty, c in zip(nest.flatten(types), nest.flatten(classes))
+  ])
+  return ret
+
+
+def deserialize_sparse_tensors(tensors, types, shapes, classes):
+  """Deserializes sparse tensors.
+
+  Args:
+    tensors: a structure of tensors to deserialize.
+    types: a structure that holds information about types of `tensors`
+    shapes: a structure that holds information about shapes of `tensors`
+    classes: a structure of objects that identify the dataset item classes
+
+  Returns:
+    `tensors` with any serialized sparse tensors replaced by their deserialized
+    version.
+  """
+  ret = nest.pack_sequence_as(types, [
+      sparse_ops.deserialize_sparse(tensor, dtype=ty, rank=shape.ndims)
+      if c is sparse_tensor.SparseTensor else tensor
+      for (tensor, ty, shape, c) in zip(
+          nest.flatten(tensors), nest.flatten(types), nest.flatten(shapes),
+          nest.flatten(classes))
+  ])
+  return ret
+
+
+def get_classes(tensors):
+  """Gets classes for a structure of tensors.
+
+  Args:
+    tensors: the tensor structure to get classes for.
+
+  Returns:
+    a structure matching the nested structure of `tensors`, containing
+    `tf.SparseTensor` at positions where `tensors` contains a sparse tensor and
+    `tf.Tensor` otherwise
+  """
+  return nest.pack_sequence_as(tensors, [
+      sparse_tensor.SparseTensor
+      if isinstance(tensor, sparse_tensor.SparseTensor) else ops.Tensor
+      for tensor in nest.flatten(tensors)
+  ])
+
+
+def serialize_many_sparse_tensors(tensors):
+  """Serializes many sparse tensors into a batch.
+
+  Args:
+    tensors: a tensor structure to serialize.
+
+  Returns:
+    `tensors` with any sparse tensors replaced by the serialized batch.
+  """
+
+  ret = nest.pack_sequence_as(tensors, [
+      sparse_ops.serialize_many_sparse(tensor, out_type=dtypes.variant)
+      if sparse_tensor.is_sparse(tensor) else tensor
+      for tensor in nest.flatten(tensors)
+  ])
+  return ret
+
+
+def serialize_sparse_tensors(tensors):
+  """Serializes sparse tensors.
+
+  Args:
+    tensors: a tensor structure to serialize.
+
+  Returns:
+    `tensors` with any sparse tensors replaced by the their serialized version.
+  """
+
+  ret = nest.pack_sequence_as(tensors, [
+      sparse_ops.serialize_sparse(tensor, out_type=dtypes.variant)
+      if isinstance(tensor, sparse_tensor.SparseTensor) else tensor
+      for tensor in nest.flatten(tensors)
+  ])
+  return ret
diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d49b3ff34bd0ebd6beef1bea168dad22059317be
--- /dev/null
+++ b/tensorflow/python/data/util/sparse_test.py
@@ -0,0 +1,359 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utilities working with arbitrarily nested structures."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import test
+
+
+class SparseTest(test.TestCase):
+
+  def testAnySparse(self):
+    test_cases = (
+        {
+            "classes": (),
+            "expected": False
+        },
+        {
+            "classes": (ops.Tensor),
+            "expected": False
+        },
+        {
+            "classes": (((ops.Tensor))),
+            "expected": False
+        },
+        {
+            "classes": (ops.Tensor, ops.Tensor),
+            "expected": False
+        },
+        {
+            "classes": (ops.Tensor, sparse_tensor.SparseTensor),
+            "expected": True
+        },
+        {
+            "classes": (sparse_tensor.SparseTensor, sparse_tensor.SparseTensor),
+            "expected":
+                True
+        },
+        {
+            "classes": (sparse_tensor.SparseTensor, ops.Tensor),
+            "expected": True
+        },
+        {
+            "classes": (((sparse_tensor.SparseTensor))),
+            "expected": True
+        },
+    )
+    for test_case in test_cases:
+      self.assertEqual(
+          sparse.any_sparse(test_case["classes"]), test_case["expected"])
+
+  def assertShapesEqual(self, a, b):
+    for a, b in zip(nest.flatten(a), nest.flatten(b)):
+      self.assertEqual(a.ndims, b.ndims)
+      if a.ndims is None:
+        continue
+      for c, d in zip(a.as_list(), b.as_list()):
+        self.assertEqual(c, d)
+
+  def testAsDenseShapes(self):
+    test_cases = (
+        {
+            "types": (),
+            "classes": (),
+            "expected": ()
+        },
+        {
+            "types": tensor_shape.scalar(),
+            "classes": ops.Tensor,
+            "expected": tensor_shape.scalar()
+        },
+        {
+            "types": tensor_shape.scalar(),
+            "classes": sparse_tensor.SparseTensor,
+            "expected": tensor_shape.unknown_shape()
+        },
+        {
+            "types": (tensor_shape.scalar()),
+            "classes": (ops.Tensor),
+            "expected": (tensor_shape.scalar())
+        },
+        {
+            "types": (tensor_shape.scalar()),
+            "classes": (sparse_tensor.SparseTensor),
+            "expected": (tensor_shape.unknown_shape())
+        },
+        {
+            "types": (tensor_shape.scalar(), ()),
+            "classes": (ops.Tensor, ()),
+            "expected": (tensor_shape.scalar(), ())
+        },
+        {
+            "types": ((), tensor_shape.scalar()),
+            "classes": ((), ops.Tensor),
+            "expected": ((), tensor_shape.scalar())
+        },
+        {
+            "types": (tensor_shape.scalar(), ()),
+            "classes": (sparse_tensor.SparseTensor, ()),
+            "expected": (tensor_shape.unknown_shape(), ())
+        },
+        {
+            "types": ((), tensor_shape.scalar()),
+            "classes": ((), sparse_tensor.SparseTensor),
+            "expected": ((), tensor_shape.unknown_shape())
+        },
+        {
+            "types": (tensor_shape.scalar(), (), tensor_shape.scalar()),
+            "classes": (ops.Tensor, (), ops.Tensor),
+            "expected": (tensor_shape.scalar(), (), tensor_shape.scalar())
+        },
+        {
+            "types": (tensor_shape.scalar(), (), tensor_shape.scalar()),
+            "classes": (sparse_tensor.SparseTensor, (),
+                        sparse_tensor.SparseTensor),
+            "expected": (tensor_shape.unknown_shape(), (),
+                         tensor_shape.unknown_shape())
+        },
+        {
+            "types": ((), tensor_shape.scalar(), ()),
+            "classes": ((), ops.Tensor, ()),
+            "expected": ((), tensor_shape.scalar(), ())
+        },
+        {
+            "types": ((), tensor_shape.scalar(), ()),
+            "classes": ((), sparse_tensor.SparseTensor, ()),
+            "expected": ((), tensor_shape.unknown_shape(), ())
+        },
+    )
+    for test_case in test_cases:
+      self.assertShapesEqual(
+          sparse.as_dense_shapes(test_case["types"], test_case["classes"]),
+          test_case["expected"])
+
+  def testAsDenseTypes(self):
+    test_cases = (
+        {
+            "types": (),
+            "classes": (),
+            "expected": ()
+        },
+        {
+            "types": dtypes.int32,
+            "classes": ops.Tensor,
+            "expected": dtypes.int32
+        },
+        {
+            "types": dtypes.int32,
+            "classes": sparse_tensor.SparseTensor,
+            "expected": dtypes.variant
+        },
+        {
+            "types": (dtypes.int32),
+            "classes": (ops.Tensor),
+            "expected": (dtypes.int32)
+        },
+        {
+            "types": (dtypes.int32),
+            "classes": (sparse_tensor.SparseTensor),
+            "expected": (dtypes.variant)
+        },
+        {
+            "types": (dtypes.int32, ()),
+            "classes": (ops.Tensor, ()),
+            "expected": (dtypes.int32, ())
+        },
+        {
+            "types": ((), dtypes.int32),
+            "classes": ((), ops.Tensor),
+            "expected": ((), dtypes.int32)
+        },
+        {
+            "types": (dtypes.int32, ()),
+            "classes": (sparse_tensor.SparseTensor, ()),
+            "expected": (dtypes.variant, ())
+        },
+        {
+            "types": ((), dtypes.int32),
+            "classes": ((), sparse_tensor.SparseTensor),
+            "expected": ((), dtypes.variant)
+        },
+        {
+            "types": (dtypes.int32, (), dtypes.int32),
+            "classes": (ops.Tensor, (), ops.Tensor),
+            "expected": (dtypes.int32, (), dtypes.int32)
+        },
+        {
+            "types": (dtypes.int32, (), dtypes.int32),
+            "classes": (sparse_tensor.SparseTensor, (),
+                        sparse_tensor.SparseTensor),
+            "expected": (dtypes.variant, (), dtypes.variant)
+        },
+        {
+            "types": ((), dtypes.int32, ()),
+            "classes": ((), ops.Tensor, ()),
+            "expected": ((), dtypes.int32, ())
+        },
+        {
+            "types": ((), dtypes.int32, ()),
+            "classes": ((), sparse_tensor.SparseTensor, ()),
+            "expected": ((), dtypes.variant, ())
+        },
+    )
+    for test_case in test_cases:
+      self.assertEqual(
+          sparse.as_dense_types(test_case["types"], test_case["classes"]),
+          test_case["expected"])
+
+  def testGetClasses(self):
+    s = sparse_tensor.SparseTensor(indices=[[0]], values=[1], dense_shape=[1])
+    d = ops.Tensor
+    t = sparse_tensor.SparseTensor
+    test_cases = (
+        {
+            "classes": (),
+            "expected": ()
+        },
+        {
+            "classes": s,
+            "expected": t
+        },
+        {
+            "classes": constant_op.constant([1]),
+            "expected": d
+        },
+        {
+            "classes": (s),
+            "expected": (t)
+        },
+        {
+            "classes": (constant_op.constant([1])),
+            "expected": (d)
+        },
+        {
+            "classes": (s, ()),
+            "expected": (t, ())
+        },
+        {
+            "classes": ((), s),
+            "expected": ((), t)
+        },
+        {
+            "classes": (constant_op.constant([1]), ()),
+            "expected": (d, ())
+        },
+        {
+            "classes": ((), constant_op.constant([1])),
+            "expected": ((), d)
+        },
+        {
+            "classes": (s, (), constant_op.constant([1])),
+            "expected": (t, (), d)
+        },
+        {
+            "classes": ((), s, ()),
+            "expected": ((), t, ())
+        },
+        {
+            "classes": ((), constant_op.constant([1]), ()),
+            "expected": ((), d, ())
+        },
+    )
+    for test_case in test_cases:
+      self.assertEqual(
+          sparse.get_classes(test_case["classes"]), test_case["expected"])
+
+  def assertSparseValuesEqual(self, a, b):
+    if not isinstance(a, sparse_tensor.SparseTensor):
+      self.assertFalse(isinstance(b, sparse_tensor.SparseTensor))
+      self.assertEqual(a, b)
+      return
+    self.assertTrue(isinstance(b, sparse_tensor.SparseTensor))
+    with self.test_session():
+      self.assertAllEqual(a.eval().indices, b.eval().indices)
+      self.assertAllEqual(a.eval().values, b.eval().values)
+      self.assertAllEqual(a.eval().dense_shape, b.eval().dense_shape)
+
+  def testSerializeDeserialize(self):
+    test_cases = (
+        (),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+        sparse_tensor.SparseTensor(
+            indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0], [3, 4]], values=[1, -1], dense_shape=[4, 5]),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
+        ((),
+         sparse_tensor.SparseTensor(
+             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+    )
+    for expected in test_cases:
+      classes = sparse.get_classes(expected)
+      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
+                                  classes)
+      types = nest.map_structure(lambda _: dtypes.int32, classes)
+      actual = sparse.deserialize_sparse_tensors(
+          sparse.serialize_sparse_tensors(expected), types, shapes,
+          sparse.get_classes(expected))
+      nest.assert_same_structure(expected, actual)
+      for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
+        self.assertSparseValuesEqual(a, e)
+
+  def testSerializeManyDeserialize(self):
+    test_cases = (
+        (),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+        sparse_tensor.SparseTensor(
+            indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
+        sparse_tensor.SparseTensor(
+            indices=[[0, 0], [3, 4]], values=[1, -1], dense_shape=[4, 5]),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+        (sparse_tensor.SparseTensor(
+            indices=[[0, 0]], values=[1], dense_shape=[1, 1]), ()),
+        ((),
+         sparse_tensor.SparseTensor(
+             indices=[[0, 0]], values=[1], dense_shape=[1, 1])),
+    )
+    for expected in test_cases:
+      classes = sparse.get_classes(expected)
+      shapes = nest.map_structure(lambda _: tensor_shape.TensorShape(None),
+                                  classes)
+      types = nest.map_structure(lambda _: dtypes.int32, classes)
+      actual = sparse.deserialize_sparse_tensors(
+          sparse.serialize_many_sparse_tensors(expected), types, shapes,
+          sparse.get_classes(expected))
+      nest.assert_same_structure(expected, actual)
+      for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
+        self.assertSparseValuesEqual(a, e)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 68b97ddbe3048b7aef18fcf8cc2b41ee545ee55f..789771508e2deaa7dfca1f80853e0d4d0aeb10d8 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -31,6 +31,7 @@ py_library(
         ":debug_graphs",
         ":debug_utils",
         ":grpc_debug_server",
+        ":grpc_debug_test_server",
         ":hooks",
         ":local_cli_wrapper",
         "//tensorflow/python:util",
@@ -45,6 +46,7 @@ py_library(
         ":grpc_debug_test_server",
         ":offline_analyzer",
         ":session_debug_testlib",
+        ":source_remote",
     ] + if_not_windows([
         ":debug_examples",
     ]),
@@ -110,6 +112,17 @@ py_library(
     ],
 )
 
+py_library(
+    name = "source_remote",
+    srcs = ["lib/source_remote.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":debug_service_pb2_grpc",
+        "//tensorflow/core/debug:debug_service_proto_py",
+        "//tensorflow/python/profiler:tfprof_logger",
+    ],
+)
+
 py_library(
     name = "stepper",
     srcs = ["lib/stepper.py"],
@@ -515,6 +528,32 @@ py_test(
     ],
 )
 
+py_test(
+    name = "source_remote_test",
+    size = "small",
+    srcs = ["lib/source_remote_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+        "nomac",
+        "oss_serial",
+    ],
+    deps = [
+        ":grpc_debug_test_server",
+        ":source_remote",
+        ":source_utils",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "stepper_test",
     size = "small",
@@ -924,6 +963,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     data = ["//tensorflow/tools/dist_test/server:grpc_tensorflow_server"],
+    grpc_enabled = True,
     tags = [
         "no_oss",  # Incompatible with bazel_pip.
         "no_windows",
diff --git a/tensorflow/python/debug/__init__.py b/tensorflow/python/debug/__init__.py
index 821350ee907c46aaa52b5f47ca763f34458eeb3e..34da44b60df9dbda836d6c91089c5ee90f11c584 100644
--- a/tensorflow/python/debug/__init__.py
+++ b/tensorflow/python/debug/__init__.py
@@ -30,6 +30,8 @@ See the @{$python/tfdbg} guide.
 @@GrpcDebugWrapperSession
 @@LocalCLIDebugHook
 @@LocalCLIDebugWrapperSession
+@@TensorBoardDebugHook
+@@TensorBoardDebugWrapperSession
 @@WatchOptions
 
 @@reconstruct_non_debug_graph_def
@@ -60,9 +62,11 @@ from tensorflow.python.debug.lib.debug_utils import watch_graph_with_blacklists
 from tensorflow.python.debug.wrappers.dumping_wrapper import DumpingDebugWrapperSession
 from tensorflow.python.debug.wrappers.framework import WatchOptions
 from tensorflow.python.debug.wrappers.grpc_wrapper import GrpcDebugWrapperSession
+from tensorflow.python.debug.wrappers.grpc_wrapper import TensorBoardDebugWrapperSession
 from tensorflow.python.debug.wrappers.hooks import DumpingDebugHook
 from tensorflow.python.debug.wrappers.hooks import GrpcDebugHook
 from tensorflow.python.debug.wrappers.hooks import LocalCLIDebugHook
+from tensorflow.python.debug.wrappers.hooks import TensorBoardDebugHook
 from tensorflow.python.debug.wrappers.local_cli_wrapper import LocalCLIDebugWrapperSession
 
 from tensorflow.python.util import all_util as _all_util
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index a7c1d3539943b019cdfbfb4a32e2f55dc11b81cb..847f9ec401499abb8ec4f310fa4d5118b2afca7b 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -54,7 +54,9 @@ def _cli_config_from_temp_file():
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
       disable_model_pruning=True,
-      constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
+      constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 4d7332453055f4bd799a904724a2b93170a43153..c4b13a1045dac4966b0e841155a2932216881d34 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -558,8 +558,7 @@ class DebugDumpDir(object):
     for root, _, files in gfile.Walk(device_root):
       for f in files:
         if _is_graph_file(f):
-          self._dump_graph_file_paths[device_name] = os.path.join(
-              device_root, root, f)
+          self._dump_graph_file_paths[device_name] = os.path.join(root, f)
         else:
           datum = self._dump_file_name_to_datum(root, f)
           self._dump_tensor_data[device_name].append(datum)
diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
index 442dfb7b3f52e74d3bbbc36391e7ec052365a017..bd00f738610627a4b3bc7c61476164188a7b460c 100644
--- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
+++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
@@ -22,6 +22,7 @@ import tempfile
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.debug.lib import debug_data
 from tensorflow.python.debug.lib import debug_graphs
@@ -41,6 +42,12 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
   _OP_TYPE_BLACKLIST = (
       "_Send", "_Recv", "_HostSend", "_HostRecv", "_Retval")
 
+  def _no_rewrite_session_config(self):
+    rewriter_config = rewriter_config_pb2.RewriterConfig(
+        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+    graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+    return config_pb2.ConfigProto(graph_options=graph_options)
+
   def setUp(self):
     super(ReconstructNonDebugGraphTest, self).setUp()
     self._dump_dir = tempfile.mkdtemp()
@@ -136,7 +143,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
           sess, c, expected_output=400.0)
 
   def testReonstructGraphWithCond(self):
-    with session.Session() as sess:
+    with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = variables.Variable(10.0, name="x")
       y = variables.Variable(20.0, name="y")
       cond = control_flow_ops.cond(
@@ -157,7 +164,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       self._compareOriginalAndReconstructedGraphDefs(sess, loop)
 
   def testReconstructGraphWithGradients(self):
-    with session.Session() as sess:
+    with session.Session(config=self._no_rewrite_session_config()) as sess:
       u = variables.Variable(12.0, name="u")
       v = variables.Variable(30.0, name="v")
       x = constant_op.constant(1.1, name="x")
diff --git a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
index 98adc3284b94afc8190f7ee4240d7c5fbf37b4b5..16573eab6f0e61c12020c4becb72369c38f05b42 100755
--- a/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
+++ b/tensorflow/python/debug/lib/debug_service_pb2_grpc.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 import grpc
 
 from tensorflow.core.debug import debug_service_pb2 as tensorflow_dot_core_dot_debug_dot_debug__service__pb2
+from tensorflow.core.protobuf import debug_pb2 as tensorflow_dot_core_dot_protobuf_dot_debug__pb2
 from tensorflow.core.util import event_pb2 as tensorflow_dot_core_dot_util_dot_event__pb2
 
 
@@ -42,6 +43,16 @@ class EventListenerStub(object):
         request_serializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.SerializeToString,
         response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
         )
+    self.SendTracebacks = channel.unary_unary(
+        '/tensorflow.EventListener/SendTracebacks',
+        request_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.CallTraceback.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
+    self.SendSourceFiles = channel.unary_unary(
+        '/tensorflow.EventListener/SendSourceFiles',
+        request_serializer=tensorflow_dot_core_dot_protobuf_dot_debug__pb2.DebuggedSourceFiles.SerializeToString,
+        response_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.FromString,
+        )
 
 
 class EventListenerServicer(object):
@@ -62,6 +73,20 @@ class EventListenerServicer(object):
     context.set_details('Method not implemented!')
     raise NotImplementedError('Method not implemented!')
 
+  def SendTracebacks(self, request, context):
+    """Send the tracebacks of ops in a Python graph definition.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
+  def SendSourceFiles(self, request, context):
+    """Send a collection of source code files being debugged.
+    """
+    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+    context.set_details('Method not implemented!')
+    raise NotImplementedError('Method not implemented!')
+
 
 def add_EventListenerServicer_to_server(servicer, server):
   rpc_method_handlers = {
@@ -70,6 +95,16 @@ def add_EventListenerServicer_to_server(servicer, server):
           request_deserializer=tensorflow_dot_core_dot_util_dot_event__pb2.Event.FromString,
           response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
       ),
+      'SendTracebacks': grpc.unary_unary_rpc_method_handler(
+          servicer.SendTracebacks,
+          request_deserializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.CallTraceback.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
+      'SendSourceFiles': grpc.unary_unary_rpc_method_handler(
+          servicer.SendSourceFiles,
+          request_deserializer=tensorflow_dot_core_dot_protobuf_dot_debug__pb2.DebuggedSourceFiles.FromString,
+          response_serializer=tensorflow_dot_core_dot_debug_dot_debug__service__pb2.EventReply.SerializeToString,
+      ),
   }
   generic_handler = grpc.method_handlers_generic_handler(
       'tensorflow.EventListener', rpc_method_handlers)
diff --git a/tensorflow/python/debug/lib/grpc_debug_server.py b/tensorflow/python/debug/lib/grpc_debug_server.py
index 5ab910fb0c9d89bc31a15ecbec48516f07a02979..1b559f1f27538364d8e12339d321e41d33c52590 100644
--- a/tensorflow/python/debug/lib/grpc_debug_server.py
+++ b/tensorflow/python/debug/lib/grpc_debug_server.py
@@ -458,3 +458,36 @@ class EventListenerBaseServicer(debug_service_pb2_grpc.EventListenerServicer):
         `debug_op` as a `str`.
     """
     return list(self._gated_grpc_debug_watches)
+
+  def SendTracebacks(self, request, context):
+    """Base implementation of the handling of SendTracebacks calls.
+
+    The base implementation does nothing with the incoming request.
+    Override in an implementation of the server if necessary.
+
+    Args:
+      request: A `CallTraceback` proto, containing information about the
+        type (e.g., graph vs. eager execution) and source-code traceback of the
+        call and (any) associated `tf.Graph`s.
+      context: Server context.
+
+    Returns:
+      A `EventReply` proto.
+    """
+    return debug_service_pb2.EventReply()
+
+  def SendSourceFiles(self, request, context):
+    """Base implementation of the handling of SendSourceFiles calls.
+
+    The base implementation does nothing with the incoming request.
+    Override in an implementation of the server if necessary.
+
+    Args:
+      request: A `DebuggedSourceFiles` proto, containing the path, content, size
+        and last-modified timestamp of source files.
+      context: Server context.
+
+    Returns:
+      A `EventReply` proto.
+    """
+    return debug_service_pb2.EventReply()
diff --git a/tensorflow/python/debug/lib/grpc_debug_test_server.py b/tensorflow/python/debug/lib/grpc_debug_test_server.py
index 76e45c0bedbb463c872bfca466c6991c9d459e49..a637677d7d092152cd58c20b45520fad97eb90ff 100644
--- a/tensorflow/python/debug/lib/grpc_debug_test_server.py
+++ b/tensorflow/python/debug/lib/grpc_debug_test_server.py
@@ -238,6 +238,15 @@ class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
         self, server_port,
         functools.partial(EventListenerTestStreamHandler, dump_dir, self))
 
+    # Members for storing the graph ops traceback and source files.
+    self._call_types = []
+    self._call_keys = []
+    self._origin_stacks = []
+    self._origin_id_to_strings = []
+    self._graph_tracebacks = []
+    self._graph_versions = []
+    self._source_files = None
+
   def _initialize_toggle_watch_state(self, toggle_watches):
     self._toggle_watches = toggle_watches
     self._toggle_watch_state = dict()
@@ -259,6 +268,97 @@ class EventListenerTestServicer(grpc_debug_server.EventListenerBaseServicer):
     self.core_metadata_json_strings = []
     self.partition_graph_defs = []
     self.debug_tensor_values = collections.defaultdict(list)
+    self._call_types = []
+    self._call_keys = []
+    self._origin_stacks = []
+    self._origin_id_to_strings = []
+    self._graph_tracebacks = []
+    self._graph_versions = []
+    self._source_files = None
+
+  def SendTracebacks(self, request, context):
+    self._call_types.append(request.call_type)
+    self._call_keys.append(request.call_key)
+    self._origin_stacks.append(request.origin_stack)
+    self._origin_id_to_strings.append(request.origin_id_to_string)
+    self._graph_tracebacks.append(request.graph_traceback)
+    self._graph_versions.append(request.graph_version)
+    return debug_service_pb2.EventReply()
+
+  def SendSourceFiles(self, request, context):
+    self._source_files = request
+    return debug_service_pb2.EventReply()
+
+  def query_op_traceback(self, op_name):
+    """Query the traceback of an op.
+
+    Args:
+      op_name: Name of the op to query.
+
+    Returns:
+      The traceback of the op, as a list of 3-tuples:
+        (filename, lineno, function_name)
+
+    Raises:
+      ValueError: If the op cannot be found in the tracebacks received by the
+        server so far.
+    """
+    for op_log_proto in self._graph_tracebacks:
+      for log_entry in op_log_proto.log_entries:
+        if log_entry.name == op_name:
+          return self._code_def_to_traceback(log_entry.code_def,
+                                             op_log_proto.id_to_string)
+    raise ValueError(
+        "Op '%s' does not exist in the tracebacks received by the debug "
+        "server.")
+
+  def query_origin_stack(self):
+    """Query the stack of the origin of the execution call.
+
+    Returns:
+      A `list` of all tracebacks. Each item corresponds to an execution call,
+        i.e., a `SendTracebacks` request. Each item is a `list` of 3-tuples:
+        (filename, lineno, function_name).
+    """
+    ret = []
+    for stack, id_to_string in zip(
+        self._origin_stacks, self._origin_id_to_strings):
+      ret.append(self._code_def_to_traceback(stack, id_to_string))
+    return ret
+
+  def query_call_types(self):
+    return self._call_types
+
+  def query_call_keys(self):
+    return self._call_keys
+
+  def query_graph_versions(self):
+    return self._graph_versions
+
+  def query_source_file_line(self, file_path, lineno):
+    """Query the content of a given line in a source file.
+
+    Args:
+      file_path: Path to the source file.
+      lineno: Line number as an `int`.
+
+    Returns:
+      Content of the line as a string.
+
+    Raises:
+      ValueError: If no source file is found at the given file_path.
+    """
+    for source_file_proto in self._source_files.source_files:
+      if source_file_proto.file_path == file_path:
+        return source_file_proto.lines[lineno - 1]
+    raise ValueError(
+        "Source file at path %s has not been received by the debug server",
+        file_path)
+
+  def _code_def_to_traceback(self, code_def, id_to_string):
+    return [(id_to_string[trace.file_id],
+             trace.lineno,
+             id_to_string[trace.function_id]) for trace in code_def.traces]
 
 
 def start_server_on_separate_thread(dump_to_filesystem=True,
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index aa5314dda590a6f7d8289e370e3aa04f3dfda1b8..1a6bedbbcbf94eb95e49d43e2d03c85b53bebb7b 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -38,7 +38,8 @@ class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
 
   def _no_rewrite_session_config(self):
     rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True)
+        disable_model_pruning=True,
+        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
     graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
     return config_pb2.ConfigProto(graph_options=graph_options)
 
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index fd958367cb527601cde2156484c9f31c90a9fda3..99781bd9d900eaa848b79c8a5868d37895de43f2 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -53,7 +53,8 @@ from tensorflow.python.training import monitored_session
 
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
-      disable_model_pruning=True)
+      disable_model_pruning=True,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
@@ -247,10 +248,24 @@ class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
     self.assertEqual(
         14, len(dump.get_tensors("v/read", 0, "DebugNumericSummary")[0]))
 
-  def testConstructGrpcDebugHookWithGrpcInUrlRaisesValueError(self):
-    """Tests that the hook raises an error if the URL starts with grpc://."""
-    with self.assertRaises(ValueError):
-      hooks.GrpcDebugHook(["grpc://foo:42"])
+  def testTensorBoardDebugHooWorks(self):
+    u = variables.Variable(2.1, name="u")
+    v = variables.Variable(20.0, name="v")
+    w = math_ops.multiply(u, v, name="w")
+
+    sess = session.Session(config=no_rewrite_session_config())
+    sess.run(u.initializer)
+    sess.run(v.initializer)
+
+    grpc_debug_hook = hooks.TensorBoardDebugHook(
+        ["localhost:%d" % self._server_port])
+    sess = monitored_session._HookedSession(sess, [grpc_debug_hook])
+
+    self.assertAllClose(42.0, sess.run(w))
+
+  def testConstructGrpcDebugHookWithOrWithouGrpcInUrlWorks(self):
+    hooks.GrpcDebugHook(["grpc://foo:42424"])
+    hooks.GrpcDebugHook(["foo:42424"])
 
 
 class LargeGraphAndLargeTensorsDebugTest(test_util.TensorFlowTestCase):
@@ -683,6 +698,56 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
           # to disable the breakpoint at delta:0:DebugIdentity.
           self.assertSetEqual(set(), self._server_1.breakpoints)
 
+  def testTensorBoardDebuggerWrapperToggleBreakpointsWorks(self):
+    with session.Session(config=no_rewrite_session_config()) as sess:
+      v_1 = variables.Variable(50.0, name="v_1")
+      v_2 = variables.Variable(-50.0, name="v_2")
+      delta_1 = constant_op.constant(5.0, name="delta_1")
+      delta_2 = constant_op.constant(-5.0, name="delta_2")
+      inc_v_1 = state_ops.assign_add(v_1, delta_1, name="inc_v_1")
+      inc_v_2 = state_ops.assign_add(v_2, delta_2, name="inc_v_2")
+
+      sess.run([v_1.initializer, v_2.initializer])
+
+      # The TensorBoardDebugWrapperSession should add a DebugIdentity debug op
+      # with attribute gated_grpc=True for every tensor in the graph.
+      sess = grpc_wrapper.TensorBoardDebugWrapperSession(
+          sess, self._debug_server_url_1)
+
+      for i in xrange(4):
+        self._server_1.clear_data()
+
+        if i in (0, 2):
+          # Enable breakpoint at delta_[1,2]:0:DebugIdentity in runs 0 and 2.
+          self._server_1.request_watch(
+              "delta_1", 0, "DebugIdentity", breakpoint=True)
+          self._server_1.request_watch(
+              "delta_2", 0, "DebugIdentity", breakpoint=True)
+        else:
+          # Disable the breakpoint in runs 1 and 3.
+          self._server_1.request_unwatch("delta_1", 0, "DebugIdentity")
+          self._server_1.request_unwatch("delta_2", 0, "DebugIdentity")
+
+        output = sess.run([inc_v_1, inc_v_2])
+        self.assertAllClose([50.0 + 5.0 * (i + 1), -50 - 5.0 * (i + 1)], output)
+
+        if i in (0, 2):
+          # During runs 0 and 2, the server should have received the published
+          # debug tensor delta:0:DebugIdentity. The breakpoint should have been
+          # unblocked by EventReply reponses from the server.
+          self.assertAllClose(
+              [5.0],
+              self._server_1.debug_tensor_values["delta_1:0:DebugIdentity"])
+          self.assertAllClose(
+              [-5.0],
+              self._server_1.debug_tensor_values["delta_2:0:DebugIdentity"])
+          # After the runs, the server should have properly registered the
+          # breakpoints.
+        else:
+          # After the end of runs 1 and 3, the server has received the requests
+          # to disable the breakpoint at delta:0:DebugIdentity.
+          self.assertSetEqual(set(), self._server_1.breakpoints)
+
   def testGetGrpcDebugWatchesReturnsCorrectAnswer(self):
     with session.Session() as sess:
       v = variables.Variable(50.0, name="v")
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 3b9a5d07c25200a179114bad954b38c4836525b3..20a40018bf9c67c5b743963489c8fc5616efa2db 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -57,7 +57,9 @@ from tensorflow.python.training import gradient_descent
 
 def no_rewrite_session_config():
   rewriter_config = rewriter_config_pb2.RewriterConfig(
-      disable_model_pruning=True)
+      disable_model_pruning=True,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
   return config_pb2.ConfigProto(graph_options=graph_options)
 
@@ -837,7 +839,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
       self.assertIsNone(dump.find_some_path("delta", "v"))
 
   def testCausalityCheckOnDumpsDetectsWrongTemporalOrder(self):
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_name = "testDumpCausalityCheck/u"
       v_name = "testDumpCausalityCheck/v"
       w_name = "testDumpCausalityCheck/w"
@@ -962,7 +964,7 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase):
   def testOutputSlotWithoutOutgoingEdgeCanBeWatched(self):
     """Test watching output slots not attached to any outgoing edges."""
 
-    with session.Session() as sess:
+    with session.Session(config=no_rewrite_session_config()) as sess:
       u_init_val = np.array([[5.0, 3.0], [-1.0, 0.0]])
       u = constant_op.constant(u_init_val, shape=[2, 2], name="u")
 
diff --git a/tensorflow/python/debug/lib/source_remote.py b/tensorflow/python/debug/lib/source_remote.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d10d5a8d11aadcb7c13e498265a4a00dbc8a1fc
--- /dev/null
+++ b/tensorflow/python/debug/lib/source_remote.py
@@ -0,0 +1,205 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Communicating tracebacks and source code with debug server."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import socket
+
+import grpc
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.core.protobuf import debug_pb2
+from tensorflow.python.debug.lib import debug_service_pb2_grpc
+from tensorflow.python.debug.lib import source_utils
+from tensorflow.python.platform import gfile
+from tensorflow.python.profiler import tfprof_logger
+
+
+def _load_debugged_source_file(file_path, source_file_proto):
+  file_stat = gfile.Stat(file_path)
+  source_file_proto.host = socket.gethostname()
+  source_file_proto.file_path = file_path
+  source_file_proto.last_modified = file_stat.mtime_nsec
+  source_file_proto.bytes = file_stat.length
+  try:
+    with gfile.Open(file_path, "r") as f:
+      source_lines = f.readlines()
+      for line in source_lines:
+        source_file_proto.lines.append(line.strip())
+  except IOError:
+    pass
+
+
+def _string_to_id(string, string_to_id):
+  if string not in string_to_id:
+    string_to_id[string] = len(string_to_id)
+  return string_to_id[string]
+
+
+def _format_origin_stack(origin_stack, call_traceback_proto):
+  """Format a traceback stack for a `CallTraceback` proto.
+
+  Args:
+    origin_stack: The stack list as returned by `traceback.extract_stack()`.
+    call_traceback_proto: A `CallTraceback` proto whose fields are to be
+      populated.
+  """
+  string_to_id = dict()
+  string_to_id[None] = 0
+  for frame in origin_stack:
+    file_path, lineno, func_name, line_text = frame
+    call_traceback_proto.origin_stack.traces.add(
+        file_id=_string_to_id(file_path, string_to_id),
+        lineno=lineno,
+        function_id=_string_to_id(func_name, string_to_id),
+        line_id=_string_to_id(line_text, string_to_id))
+
+  id_to_string = call_traceback_proto.origin_id_to_string
+  for key, value in string_to_id.items():
+    id_to_string[value] = key if key is not None else ""
+
+
+def _source_file_paths_outside_tensorflow_py_library(code_defs, id_to_string):
+  """Extract source file paths outside TensorFlow Python library.
+
+  Args:
+    code_defs: An iterable of `CodeDef` protos, i.e., an iterable of stack
+      traces.
+    id_to_string: A proto map from integer ids to strings.
+
+  Returns:
+    An iterable of source file paths outside the TensorFlow Python library.
+  """
+  file_ids = set()
+  for code_def in code_defs:
+    for trace in code_def.traces:
+      file_ids.add(trace.file_id)
+  non_tf_files = (id_to_string[file_id] for file_id in file_ids)
+  non_tf_files = (
+      f for f in non_tf_files
+      if not source_utils.guess_is_tensorflow_py_library(f) and gfile.Exists(f))
+  return non_tf_files
+
+
+def _send_call_tracebacks(destinations,
+                          origin_stack,
+                          is_eager_execution=False,
+                          call_key=None,
+                          graph=None,
+                          send_source=True):
+  """Send the tracebacks of a TensorFlow execution call.
+
+  To gRPC debug server(s). This applies to graph execution (`tf.Session.run()`)
+  calls and eager execution calls.
+
+  If `send_source`, also sends the underlying source files outside the
+  TensorFlow library.
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+      `CallTraceback` proto payload will be sent to all the destinations.
+    origin_stack: The traceback stack for the origin of the execution call. For
+      graph execution, this is the traceback of the `tf.Session.run()`
+      invocation. For eager execution, this is the traceback of the Python
+      line that executes the eager opertion.
+    is_eager_execution: (`bool`) whether an eager execution call (i.e., not a
+      `tf.Session.run` or derived methods) is being sent.
+    call_key: The key of the execution call, as a string. For graph execution,
+      this is a string describing the feeds, fetches (and targets) names of the
+      `tf.Session.run` call. For eager execution, this is ignored.
+    graph: A Python `tf.Graph` object (i.e., *not* a `tf.GraphDef`), which
+      contains op tracebacks, if applicable.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  if not isinstance(destinations, list):
+    destinations = [destinations]
+
+  call_type = (debug_service_pb2.CallTraceback.EAGER_EXECUTION
+               if is_eager_execution
+               else debug_service_pb2.CallTraceback.GRAPH_EXECUTION)
+  graph_traceback = tfprof_logger.merge_default_with_oplog(
+      graph, add_trainable_var=False) if graph else None
+  call_traceback = debug_service_pb2.CallTraceback(
+      call_type=call_type, call_key=call_key, graph_traceback=graph_traceback,
+      graph_version=graph.version if graph else None)
+
+  _format_origin_stack(origin_stack, call_traceback)
+
+  if send_source:
+    source_file_paths = set()
+    source_file_paths.update(_source_file_paths_outside_tensorflow_py_library(
+        (log_entry.code_def for log_entry
+         in call_traceback.graph_traceback.log_entries),
+        call_traceback.graph_traceback.id_to_string))
+    source_file_paths.update(_source_file_paths_outside_tensorflow_py_library(
+        [call_traceback.origin_stack], call_traceback.origin_id_to_string))
+
+    debugged_source_files = debug_pb2.DebuggedSourceFiles()
+    for file_path in source_file_paths:
+      _load_debugged_source_file(
+          file_path, debugged_source_files.source_files.add())
+
+  for destination in destinations:
+    channel = grpc.insecure_channel(destination)
+    stub = debug_service_pb2_grpc.EventListenerStub(channel)
+    stub.SendTracebacks(call_traceback)
+    if send_source:
+      stub.SendSourceFiles(debugged_source_files)
+
+
+def send_graph_tracebacks(destinations,
+                          run_key,
+                          origin_stack,
+                          graph,
+                          send_source=True):
+  """Send the tracebacks of a graph execution call to debug server(s).
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+      `CallTraceback` proto payload will be sent to all the destinations.
+    run_key: A string describing the feeds, fetches (and targets) names of the
+      `tf.Session.run` call.
+    origin_stack: The traceback of the `tf.Session.run()` invocation.
+    graph: A Python `tf.Graph` object (i.e., *not* a `tf.GraphDef`), which
+      contains op tracebacks.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  _send_call_tracebacks(
+      destinations, origin_stack, is_eager_execution=False, call_key=run_key,
+      graph=graph, send_source=send_source)
+
+
+def send_eager_tracebacks(destinations,
+                          origin_stack,
+                          send_source=True):
+  """Send the tracebacks of an eager execution call to debug server(s).
+
+  Args:
+    destinations: gRPC destination addresses, a `str` or a `list` of `str`s,
+      e.g., "localhost:4242". If a `list`, gRPC requests containing the same
+    origin_stack: The traceback of the eager operation invocation.
+    send_source: Whether the source files involved in the op tracebacks but
+      outside the TensorFlow library are to be sent.
+  """
+  _send_call_tracebacks(
+      destinations, origin_stack, is_eager_execution=True,
+      send_source=send_source)
diff --git a/tensorflow/python/debug/lib/source_remote_test.py b/tensorflow/python/debug/lib/source_remote_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c4517f681dbd5414de6d4df269356db3a4b654d
--- /dev/null
+++ b/tensorflow/python/debug/lib/source_remote_test.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Unit tests for source_remote."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import traceback
+
+from tensorflow.core.debug import debug_service_pb2
+from tensorflow.python.client import session
+from tensorflow.python.debug.lib import grpc_debug_test_server
+from tensorflow.python.debug.lib import source_remote
+from tensorflow.python.debug.lib import source_utils
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import tf_inspect
+
+
+def line_number_above():
+  return tf_inspect.stack()[1][2] - 1
+
+
+class SendTracebacksTest(test_util.TensorFlowTestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    test_util.TensorFlowTestCase.setUpClass()
+    (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
+     cls._server_thread,
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+    cls._server_address = "localhost:%d" % cls._server_port
+    (cls._server_port_2, cls._debug_server_url_2, cls._server_dump_dir_2,
+     cls._server_thread_2,
+     cls._server_2) = grpc_debug_test_server.start_server_on_separate_thread()
+    cls._server_address_2 = "localhost:%d" % cls._server_port_2
+    cls._curr_file_path = os.path.normpath(os.path.abspath(__file__))
+
+  @classmethod
+  def tearDownClass(cls):
+    # Stop the test server and join the thread.
+    cls._server.stop_server().wait()
+    cls._server_thread.join()
+    cls._server_2.stop_server().wait()
+    cls._server_thread_2.join()
+    test_util.TensorFlowTestCase.tearDownClass()
+
+  def tearDown(self):
+    ops.reset_default_graph()
+    self._server.clear_data()
+    self._server_2.clear_data()
+    super(SendTracebacksTest, self).tearDown()
+
+  def _findFirstTraceInsideTensorFlowPyLibrary(self, op):
+    """Find the first trace of an op that belongs to the TF Python library."""
+    for trace in op.traceback:
+      if source_utils.guess_is_tensorflow_py_library(trace[0]):
+        return trace
+
+  def testSendGraphTracebacksToSingleDebugServer(self):
+    this_func_name = "testSendGraphTracebacksToSingleDebugServer"
+    with session.Session() as sess:
+      a = variables.Variable(21.0, name="a")
+      a_lineno = line_number_above()
+      b = variables.Variable(2.0, name="b")
+      b_lineno = line_number_above()
+      math_ops.add(a, b, name="x")
+      x_lineno = line_number_above()
+
+      send_stack = traceback.extract_stack()
+      send_lineno = line_number_above()
+      source_remote.send_graph_tracebacks(
+          self._server_address, "dummy_run_key", send_stack, sess.graph)
+
+      tb = self._server.query_op_traceback("a")
+      self.assertIn((self._curr_file_path, a_lineno, this_func_name), tb)
+      tb = self._server.query_op_traceback("b")
+      self.assertIn((self._curr_file_path, b_lineno, this_func_name), tb)
+      tb = self._server.query_op_traceback("x")
+      self.assertIn((self._curr_file_path, x_lineno, this_func_name), tb)
+
+      self.assertIn(
+          (self._curr_file_path, send_lineno, this_func_name),
+          self._server.query_origin_stack()[-1])
+
+      self.assertEqual(
+          "a = variables.Variable(21.0, name=\"a\")",
+          self._server.query_source_file_line(__file__, a_lineno))
+      # Files in the TensorFlow code base shouldn not have been sent.
+      tf_trace_file_path = self._findFirstTraceInsideTensorFlowPyLibrary(a.op)
+      with self.assertRaises(ValueError):
+        self._server.query_source_file_line(tf_trace_file_path, 0)
+      self.assertEqual([debug_service_pb2.CallTraceback.GRAPH_EXECUTION],
+                       self._server.query_call_types())
+      self.assertEqual(["dummy_run_key"], self._server.query_call_keys())
+      self.assertEqual(
+          [sess.graph.version], self._server.query_graph_versions())
+
+  def testSendGraphTracebacksToTwoDebugServers(self):
+    this_func_name = "testSendGraphTracebacksToTwoDebugServers"
+    with session.Session() as sess:
+      a = variables.Variable(21.0, name="two/a")
+      a_lineno = line_number_above()
+      b = variables.Variable(2.0, name="two/b")
+      b_lineno = line_number_above()
+      x = math_ops.add(a, b, name="two/x")
+      x_lineno = line_number_above()
+
+      send_traceback = traceback.extract_stack()
+      send_lineno = line_number_above()
+      source_remote.send_graph_tracebacks(
+          [self._server_address, self._server_address_2],
+          "dummy_run_key", send_traceback, sess.graph)
+
+      servers = [self._server, self._server_2]
+      for server in servers:
+        tb = server.query_op_traceback("two/a")
+        self.assertIn((self._curr_file_path, a_lineno, this_func_name), tb)
+        tb = server.query_op_traceback("two/b")
+        self.assertIn((self._curr_file_path, b_lineno, this_func_name), tb)
+        tb = server.query_op_traceback("two/x")
+        self.assertIn((self._curr_file_path, x_lineno, this_func_name), tb)
+
+        self.assertIn(
+            (self._curr_file_path, send_lineno, this_func_name),
+            server.query_origin_stack()[-1])
+
+        self.assertEqual(
+            "x = math_ops.add(a, b, name=\"two/x\")",
+            server.query_source_file_line(__file__, x_lineno))
+        tf_trace_file_path = self._findFirstTraceInsideTensorFlowPyLibrary(x.op)
+        with self.assertRaises(ValueError):
+          server.query_source_file_line(tf_trace_file_path, 0)
+        self.assertEqual([debug_service_pb2.CallTraceback.GRAPH_EXECUTION],
+                         server.query_call_types())
+        self.assertEqual(["dummy_run_key"], server.query_call_keys())
+        self.assertEqual([sess.graph.version], server.query_graph_versions())
+
+  def testSendEagerTracebacksToSingleDebugServer(self):
+    this_func_name = "testSendEagerTracebacksToSingleDebugServer"
+    send_traceback = traceback.extract_stack()
+    send_lineno = line_number_above()
+    source_remote.send_eager_tracebacks(self._server_address, send_traceback)
+
+    self.assertEqual([debug_service_pb2.CallTraceback.EAGER_EXECUTION],
+                     self._server.query_call_types())
+    self.assertIn((self._curr_file_path, send_lineno, this_func_name),
+                  self._server.query_origin_stack()[-1])
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/debug/lib/stepper.py b/tensorflow/python/debug/lib/stepper.py
index 1fa0b3dba2b547bf1d311e42e1005a8e501f9829..c27b3f51cddb51654b1ff5a35fd7d689fc4109c4 100644
--- a/tensorflow/python/debug/lib/stepper.py
+++ b/tensorflow/python/debug/lib/stepper.py
@@ -80,7 +80,7 @@ class NodeStepper(object):
   when they are required as data dependencies.
 
   The temporary directories are automatically clean when the NodeStepper
-  instance exits as a context mananger.
+  instance exits as a context manager.
 
   Once the tracing is complete, it will issue a run() call on the
   underlying session, using the aforementioned feed_dict prepared by the input
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 863af0b924b91572299aaadf371ea62d77c2f2d5..9a3d0efabfeec50171ef91415271c824d6c34588 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -56,6 +56,7 @@ class StepperTest(test_util.TensorFlowTestCase):
 
     rewriter_config = rewriter_config_pb2.RewriterConfig(
         disable_model_pruning=True,
+        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
         constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
     graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
     config = config_pb2.ConfigProto(graph_options=graph_options)
@@ -590,6 +591,7 @@ class StepperAssignAddTest(test_util.TensorFlowTestCase):
 
     rewriter_config = rewriter_config_pb2.RewriterConfig(
         disable_model_pruning=True,
+        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
         constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
     graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
     config = config_pb2.ConfigProto(graph_options=graph_options)
@@ -722,6 +724,7 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
     rewriter_config = rewriter_config_pb2.RewriterConfig(
         disable_model_pruning=True,
+        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
         constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
     graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
     config = config_pb2.ConfigProto(graph_options=graph_options)
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index d987ba84b55d6b35e90c5b137714f3eab3ce674c..acea9433e22203d56f4ceb6cd92b681e35876a09 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -111,6 +111,20 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual(repr(self.inc_v), dump.run_fetches_info)
     self.assertEqual(repr(None), dump.run_feed_keys_info)
 
+  def testDumpingOnASingleRunWorksWithRelativePathForDebugDumpDir(self):
+    sess = dumping_wrapper.DumpingDebugWrapperSession(
+        self.sess, session_root=self.session_root, log_usage=False)
+    sess.run(self.inc_v)
+    dump_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
+    cwd = os.getcwd()
+    try:
+      os.chdir(self.session_root)
+      dump = debug_data.DebugDumpDir(
+          os.path.relpath(dump_dirs[0], self.session_root))
+      self.assertAllClose([10.0], dump.get_tensors("v", 0, "DebugIdentity"))
+    finally:
+      os.chdir(cwd)
+
   def testDumpingOnASingleRunWithFeedDictWorks(self):
     sess = dumping_wrapper.DumpingDebugWrapperSession(
         self.sess, session_root=self.session_root, log_usage=False)
@@ -350,12 +364,14 @@ class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
         thread_name_filter=r"MainThread$")
 
     self.assertAllClose(1.0, sess.run(self.delta))
+    child_thread_result = []
     def child_thread_job():
-      sess.run(sess.run(self.eta))
+      child_thread_result.append(sess.run(self.eta))
 
     thread = threading.Thread(name="ChildThread", target=child_thread_job)
     thread.start()
     thread.join()
+    self.assertAllClose([-1.4], child_thread_result)
 
     dump_dirs = glob.glob(os.path.join(self.session_root, "run_*"))
     self.assertEqual(1, len(dump_dirs))
diff --git a/tensorflow/python/debug/wrappers/framework.py b/tensorflow/python/debug/wrappers/framework.py
index 4e243cb6c9649a24009a0c9ac501c59eaac3bd79..909150eb6aa21b45af39f7cbfd6248c701ae1fb5 100644
--- a/tensorflow/python/debug/wrappers/framework.py
+++ b/tensorflow/python/debug/wrappers/framework.py
@@ -706,7 +706,8 @@ class BaseDebugWrapperSession(session.SessionInterface):
         exec_type, exec_value, exec_tb)
 
   def __del__(self):
-    self._sess.__del__()
+    if hasattr(self._sess, "__del__"):
+      self._sess.__del__()
 
   def close(self):
     self._sess.close()
diff --git a/tensorflow/python/debug/wrappers/grpc_wrapper.py b/tensorflow/python/debug/wrappers/grpc_wrapper.py
index 4062016607c8a56eb275fe4712a47c84bc7ed01c..16b2018b41343331b9549f1b616fc7bd023a54c9 100644
--- a/tensorflow/python/debug/wrappers/grpc_wrapper.py
+++ b/tensorflow/python/debug/wrappers/grpc_wrapper.py
@@ -38,7 +38,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
       sess: The TensorFlow `Session` object being wrapped.
       grpc_debug_server_addresses: (`str` or `list` of `str`) Single or a list
         of the gRPC debug server addresses, in the format of
-        <host:port>, without the "grpc://" prefix. For example:
+        <host:port>, with or without the "grpc://" prefix. For example:
           "localhost:7000",
           ["localhost:7000", "192.168.0.2:8000"]
       watch_fn: (`Callable`) A Callable that can be used to define per-run
@@ -62,8 +62,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
 
     if isinstance(grpc_debug_server_addresses, str):
       self._grpc_debug_server_urls = [
-          self._GRPC_URL_PREFIX + grpc_debug_server_addresses
-      ]
+          self._normalize_grpc_url(grpc_debug_server_addresses)]
     elif isinstance(grpc_debug_server_addresses, list):
       self._grpc_debug_server_urls = []
       for address in grpc_debug_server_addresses:
@@ -71,7 +70,7 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
           raise TypeError(
               "Expected type str in list grpc_debug_server_addresses, "
               "received type %s" % type(address))
-        self._grpc_debug_server_urls.append(self._GRPC_URL_PREFIX + address)
+        self._grpc_debug_server_urls.append(self._normalize_grpc_url(address))
     else:
       raise TypeError(
           "Expected type str or list in grpc_debug_server_addresses, "
@@ -93,3 +92,37 @@ class GrpcDebugWrapperSession(framework.NonInteractiveDebugWrapperSession):
     """
 
     return self._grpc_debug_server_urls
+
+  def _normalize_grpc_url(self, address):
+    return (self._GRPC_URL_PREFIX + address
+            if not address.startswith(self._GRPC_URL_PREFIX) else address)
+
+
+class TensorBoardDebugWrapperSession(GrpcDebugWrapperSession):
+  """A tfdbg Session wrapper that can be used with TensroBoard Debugger Plugin.
+
+  This wrapper is the same as `GrpcDebugWrapperSession`, except that it uses a
+    predefined `watch_fn` that
+    1) uses `DebugIdentity` debug ops with the `gated_grpc` attribute set to
+        `True` to allow the interactive enabling and disabling of tensor
+       breakpoints.
+    2) watches all tensors in the graph.
+  This saves the need for the user to define a `watch_fn`.
+  """
+
+  def __init__(self,
+               sess,
+               grpc_debug_server_addresses,
+               thread_name_filter=None,
+               log_usage=True):
+    def _gated_grpc_watch_fn(fetches, feeds):
+      del fetches, feeds  # Unused.
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity(gated_grpc=true)"])
+
+    super(TensorBoardDebugWrapperSession, self).__init__(
+        sess,
+        grpc_debug_server_addresses,
+        watch_fn=_gated_grpc_watch_fn,
+        thread_name_filter=thread_name_filter,
+        log_usage=log_usage)
diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py
index 4efa97973eb893a0105ca6abce6d306c1f6867d8..430669962484211e1d07555a605b85bf149465e5 100644
--- a/tensorflow/python/debug/wrappers/hooks.py
+++ b/tensorflow/python/debug/wrappers/hooks.py
@@ -27,9 +27,6 @@ from tensorflow.python.debug.wrappers import grpc_wrapper
 from tensorflow.python.debug.wrappers import local_cli_wrapper
 from tensorflow.python.training import session_run_hook
 
-# The prefix for GRPC endpoint URLs.
-_GRPC_ENDPOINT_PREFIX = "grpc://"
-
 
 class LocalCLIDebugHook(session_run_hook.SessionRunHook):
   """Command-line-interface debugger hook.
@@ -249,8 +246,8 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     Args:
       grpc_debug_server_addresses: (`list` of `str`) A list of the gRPC debug
-        server addresses, in the format of <host:port>, without the "grpc://"
-        prefix. For example: ["localhost:7000", "192.168.0.2:8000"]
+        server addresses, in the format of <host:port>, with or without the
+        "grpc://" prefix. For example: ["localhost:7000", "192.168.0.2:8000"]
       watch_fn: A function that allows for customizing which ops to watch at
         which specific steps. See doc of
         `dumping_wrapper.DumpingDebugWrapperSession.__init__` for details.
@@ -258,23 +255,14 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
         wrapper session will be active. See doc of `BaseDebugWrapperSession` for
         more details.
       log_usage: (bool) Whether usage is to be logged.
-
-    Raises:
-      ValueError: if any debugger server addresses start with grpc://.
     """
-
-    for address in grpc_debug_server_addresses:
-      if address.startswith(_GRPC_ENDPOINT_PREFIX):
-        raise ValueError(
-            ("Debug server address %r starts with %r. It should not because "
-             "the hook already automatically adds the prefix.") % (
-                 address, _GRPC_ENDPOINT_PREFIX))
-
-    # A wrapper session responsible for GRPC communication.
     self._grpc_debug_wrapper_session = None
     self._thread_name_filter = thread_name_filter
+    self._grpc_debug_server_addresses = (
+        grpc_debug_server_addresses
+        if isinstance(grpc_debug_server_addresses, list)
+        else [grpc_debug_server_addresses])
 
-    self._grpc_debug_server_addresses = grpc_debug_server_addresses
     self._watch_fn = watch_fn
     self._log_usage = log_usage
 
@@ -315,3 +303,31 @@ class GrpcDebugHook(session_run_hook.SessionRunHook):
 
     return session_run_hook.SessionRunArgs(
         None, feed_dict=None, options=run_options)
+
+
+class TensorBoardDebugHook(GrpcDebugHook):
+  """A tfdbg hook that can be used with TensorBoard Debugger Plugin.
+
+  This hook is the same as `GrpcDebugHook`, except that it uses a predefined
+    `watch_fn` that
+    1) uses `DebugIdentity` debug ops with the `gated_grpc` attribute set to
+        `True`, to allow the interactive enabling and disabling of tensor
+       breakpoints.
+    2) watches all tensors in the graph.
+  This saves the need for the user to define a `watch_fn`.
+  """
+
+  def __init__(self,
+               grpc_debug_server_addresses,
+               thread_name_filter=None,
+               log_usage=True):
+    def _gated_grpc_watch_fn(fetches, feeds):
+      del fetches, feeds  # Unused.
+      return framework.WatchOptions(
+          debug_ops=["DebugIdentity(gated_grpc=true)"])
+
+    super(TensorBoardDebugHook, self).__init__(
+        grpc_debug_server_addresses,
+        watch_fn=_gated_grpc_watch_fn,
+        thread_name_filter=thread_name_filter,
+        log_usage=log_usage)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index f5b946ec263c40bd62261297ef55ffa52cb2c042..f470e181200f19d672cced3ea21d05aa2eee0bea 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -3,6 +3,10 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/tools/test:performance.bzl",
+    "tf_py_logged_benchmark",
+)
 
 cc_library(
     name = "pywrap_tfe_lib",
@@ -10,11 +14,16 @@ cc_library(
         "pywrap_tensor.cc",
         "pywrap_tfe_src.cc",
     ],
-    hdrs = ["pywrap_tfe.h"],
+    hdrs = [
+        "pywrap_tensor.h",
+        "pywrap_tfe.h",
+    ],
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_internal",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/c/eager:tape",
         "//tensorflow/core:lib",
         "//tensorflow/python:ndarray_tensor",
@@ -52,7 +61,6 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":context",
-        ":memory_trace",
         "//tensorflow/python:errors",
         "//tensorflow/python:pywrap_tensorflow",
     ],
@@ -79,12 +87,6 @@ py_library(
     visibility = ["//tensorflow:internal"],
 )
 
-py_library(
-    name = "memory_trace",
-    srcs = ["memory_trace.py"],
-    srcs_version = "PY2AND3",
-)
-
 cuda_py_test(
     name = "tensor_test",
     srcs = ["tensor_test.py"],
@@ -108,6 +110,7 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:training",
@@ -142,6 +145,7 @@ cuda_py_test(
         ":test",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
     ],
 )
 
@@ -213,6 +217,7 @@ cc_library(
         ":python_eager_op_gen",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:op_gen_lib",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -356,22 +361,26 @@ py_library(
     ],
 )
 
-py_test(
+cuda_py_test(
     name = "benchmarks_test",
     srcs = ["benchmarks_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":function",
         ":test",
+        "//third_party/py/numpy",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
     ],
 )
 
+tf_py_logged_benchmark(
+    name = "benchmarks",
+    target = "//tensorflow/python/eager:benchmarks_test",
+)
+
 py_test(
     name = "tape_test",
     srcs = ["tape_test.py"],
@@ -408,6 +417,7 @@ cuda_py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
     ],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 6f7f2117be6ffaa426b1ff59c18140c0cc9e552a..dc1142705abb80abe3729aa42b44f2ca1e97d31f 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -120,6 +120,7 @@ _tracing = False
 # gradient function registration site, to be less error-prone
 # TODO(apassos) add ops other than those in nn_grad and math_grad
 _ops_which_dont_need_outputs = set([
+    "Identity",
     "MatMul",
     "Conv2DBackpropInput",
     "Conv2DBackpropFilter",
@@ -195,6 +196,7 @@ _ops_which_dont_need_outputs = set([
 ])
 
 _ops_which_dont_need_inputs = set([
+    "Identity",
     "Softmax",
     "LogSoftmax",
     "BiasAdd",
@@ -303,6 +305,7 @@ def implicit_val_and_grad(f):
   is not known ahead of time.
 
   Example:
+
   ```python
   dense_layer = tf.layers.Dense(1)
   def loss(x, y):
@@ -348,9 +351,9 @@ def implicit_val_and_grad(f):
         raise ValueError("Cannot differentiate a function that returns None; "
                          "did you forget to return a value from {}?".format(
                              f.__name__))
-      variables = tape.top_tape_watched_variables()
     finally:
       popped_tape = tape.pop_tape()
+      variables = popped_tape.watched_variables()
     sources = [x.handle for x in variables]
 
     if not sources:
@@ -376,6 +379,7 @@ def implicit_grad(f):
   is not known ahead of time.
 
   Example:
+
   ```python
   dense_layer = tf.layers.Dense(1)
   def loss(x, y):
@@ -536,7 +540,7 @@ def _ensure_unique_tensor_objects(parameter_positions, args):
     if i in parameter_positions:
       tid = ops.tensor_id(t)
       if tid in s:
-        args[i] = args[i]._dup()  # pylint: disable=protected-access
+        args[i] = gen_array_ops.identity(args[i])
       else:
         s.add(tid)
   return args
@@ -657,14 +661,14 @@ def make_vjp(f, params=None):
       for i in parameter_positions:
         sources.append(args[i])
         tape.watch(args[i])
-        result = f(*args)
-        if result is None:
-          raise ValueError("Cannot differentiate a function that returns None; "
-                           "did you forget to return a value from {}?".format(
-                               f.__name__))
-        flat_result = nest.flatten(result)
-        flat_result = [gen_array_ops.identity(x) for x in flat_result]
-        result = nest.pack_sequence_as(result, flat_result)
+      result = f(*args)
+      if result is None:
+        raise ValueError("Cannot differentiate a function that returns None; "
+                         "did you forget to return a value from {}?".format(
+                             f.__name__))
+      flat_result = nest.flatten(result)
+      flat_result = [gen_array_ops.identity(x) for x in flat_result]
+      result = nest.pack_sequence_as(result, flat_result)
     finally:
       t = tape.pop_tape()
     def vjp(dy=None):
@@ -727,12 +731,32 @@ def _num_elements(grad):
   raise ValueError("`grad` not a Tensor or IndexedSlices.")
 
 
+_last_shape_dtype = [None, None]
+_last_zero = [None]
+
+
+def _fast_fill(value, shape, dtype):
+  return array_ops.fill(shape, constant_op.constant(value, dtype=dtype))
+
+
+def _zeros(shape, dtype):
+  """Wraps array_ops.zeros to cache last zero for a given shape and dtype."""
+  if [shape, dtype] != _last_shape_dtype:
+    _last_shape_dtype[:] = [shape, dtype]
+    _last_zero[0] = _fast_fill(0, shape, dtype)
+  return _last_zero[0]
+
+
+def _ones(shape, dtype):
+  return _fast_fill(1, shape, dtype)
+
+
 _default_vspace = imperative_grad.VSpace(
     num_elements_fn=_num_elements,
     aggregate_fn=_aggregate_grads,
     tensor_id=ops.tensor_id,
-    zeros=array_ops.zeros,
-    ones_like=lambda x: ops.convert_to_tensor(array_ops.ones_like(x)))
+    zeros=_zeros,
+    ones=_ones)
 
 
 class GradientTape(object):
@@ -774,13 +798,41 @@ class GradientTape(object):
   grad = g.gradient(y, [x])[0]
   assert grad.numpy() == 6.0
   ```
+
+  By default, the resources held by a GradientTape are released as soon as
+  GradientTape.gradient() method is called. However, if one need to compute
+  multiple gradients over the same computation, she can create a persistent
+  GradientTape. Persistent tapes allow multiple calls to the gradient() method
+  and release resources when the tape object is destructed.
+
+  Example usage:
+
+  ```python
+  with tfe.GradientTape(persistent=True) as g:
+    x = tf.constant(3.0)
+    g.watch(x)
+    y = x * x
+    z = y * y
+  dz_dx = g.gradient(z, [x])[0]
+  assert dz_dx.numpy() == 108.0   # 4*x^3 at x = 3
+  dy_dx = g.gradient(y, [x])[0]
+  assert dy_dx.numpy() == 6.0
+  del g  # Drop the reference to the tape
   """
 
-  def __init__(self):
+  def __init__(self, persistent=False):
+    """Creates a new GradientTape.
+
+    Args:
+      persistent: Boolean controlling whether a persistent gradient tape
+        is created. Must be True or False.
+
+    """
     self._tape = None
+    self._persistent = persistent
 
   def __enter__(self):
-    tape.push_new_tape()
+    tape.push_new_tape(persistent=self._persistent)
     return self
 
   def __exit__(self, typ, value, traceback):
@@ -814,12 +866,14 @@ class GradientTape(object):
        than once.
     """
     if self._tape is None:
-      raise RuntimeError("GradientTape.gradient can only be called once, and "
+      raise RuntimeError("GradientTape.gradient can only be called once "
+                         "on non-persistent tapes, and "
                          "only when the context manager has exited.")
     sources = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
                else x
                for x in sources]
     grad = imperative_grad.imperative_grad(
         _default_vspace, self._tape, [target], sources)
-    self.tape = None
+    if not self._persistent:
+      self._tape = None
     return grad
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index ed54b8e12e74d2187cef6383fa77c7a8280c6d73..90c0e47ff91e9dc422c7cdc20f28b2fc00d78da0 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -24,7 +24,6 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import custom_gradient
-from tensorflow.python.eager import imperative_grad
 from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
@@ -41,7 +40,6 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.training import training
-from tensorflow.python.util import compat
 
 
 class BackpropTest(test.TestCase):
@@ -103,6 +101,18 @@ class BackpropTest(test.TestCase):
     grad_fn = backprop.gradients_function(f)
     self.assertAllEqual(2., grad_fn(1., dy=2.)[0])
 
+  def testErrors(self):
+
+    @custom_gradient.custom_gradient
+    def f(x):
+      def grad(_):
+        raise RuntimeError('x')
+      return x, grad
+
+    # TODO(apassos) raise the right error here
+    with self.assertRaises(RuntimeError):
+      backprop.gradients_function(f)(constant_op.constant(1.0))
+
   def testImplicitGradOverEmbeddingLookup(self):
     batch_size = 8
     embedding_size = 512
@@ -204,6 +214,19 @@ class BackpropTest(test.TestCase):
 
     self.assertAllEqual(gradgrad(constant_op.constant(0.0))[0], 1.0)
 
+  def testStopGradient(self):
+    grad = backprop.gradients_function(
+        lambda x: array_ops.stop_gradient(math_ops.argmax(x)))
+    self.assertAllEqual(grad([0.0])[0], None)
+
+  def testArgmax(self):
+    def argmax(x):
+      i = math_ops.argmax(x)
+      return array_ops.stop_gradient(i)
+
+    grad = backprop.gradients_function(argmax)
+    self.assertAllEqual(grad([0.0])[0], None)
+
   def testGPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
@@ -293,6 +316,48 @@ class BackpropTest(test.TestCase):
     grad = g.gradient(y, [x])[0]
     self.assertEqual(grad.numpy(), 6.0)
 
+  def testGradientTapeGradientCalledMultipleTimes(self):
+    with backprop.GradientTape() as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = x * x
+      z = y * y
+    g.gradient(z, [x])
+    with self.assertRaisesRegexp(
+        RuntimeError, 'GradientTape.gradient can only be called once'):
+      g.gradient(y, [x])
+
+  def testPersistentTape(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = x * x
+      z = y * y
+    dz_dx = g.gradient(z, [x])[0]
+    self.assertEqual(dz_dx.numpy(), 4*3*3*3)
+    dy_dx = g.gradient(y, [x])[0]
+    self.assertEqual(dy_dx.numpy(), 2*3)
+    del g
+
+  def testPersistentNestedTape(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = x * x
+      with backprop.GradientTape(persistent=True) as gg:
+        gg.watch(y)
+        z = 2 * y
+      for _ in range(2):
+        inner_grad = gg.gradient(z, [y])[0]
+        self.assertEqual(inner_grad.numpy(), 2.0)
+      y += inner_grad
+      del gg
+    grad = g.gradient(y, [x])[0]
+    self.assertEqual(grad.numpy(), 6.0)
+    grad = g.gradient(z, [x])[0]
+    self.assertEqual(grad.numpy(), 12.0)
+    del g
+
   def testGradientTapeVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0, name='v')
     with backprop.GradientTape() as g:
@@ -483,48 +548,6 @@ class BackpropTest(test.TestCase):
         initial_value=1., name='testSameObjectForMultipleArguments.Variable')
     self.assertAllEqual([1., 1.], np_g(v, v))
 
-  def testEarlyGradAggregation(self):
-    # Needs to be a list so mutations by the callback affect this function.
-    add_n = []
-    def callback(op_type, unused_1, unused_2, unused_3, unused_4):
-      if compat.as_bytes(op_type) == compat.as_bytes('AddN'):
-        add_n.append(1)
-    context.context().add_post_execution_callback(callback)
-
-    v = resource_variable_ops.ResourceVariable(constant_op.constant(2.0),
-                                               name='v')
-    def fn():
-      outputs = []
-      for _ in range(20):
-        outputs.append(v * constant_op.constant(2.0))
-      return math_ops.add_n(outputs)
-
-    # By default the aggregation count is 2.
-    _ = backprop.implicit_grad(fn)()[0][1]
-    self.assertEqual(len(add_n), 2)
-    del add_n[:]
-
-    # Reduce the aggregation limit, cause the backprop to do some
-    # early aggregation.
-    # pylint: disable=protected-access
-    old_cnt = imperative_grad._MIN_AGGREGATE_COUNT
-    old_bytes = imperative_grad._MIN_AGGREGATE_BYTES
-    imperative_grad._MIN_AGGREGATE_COUNT = 10
-    imperative_grad._MIN_AGGREGATE_BYTES = 1
-    _ = backprop.implicit_grad(fn)()
-    self.assertEqual(len(add_n), 6)
-    del add_n[:]
-
-    # Aggregation is also limited by the memory.
-    imperative_grad._MIN_AGGREGATE_BYTES = 10000
-    _ = backprop.implicit_grad(fn)()
-    self.assertEqual(len(add_n), 2)
-
-    imperative_grad._MIN_AGGREGATE_COUNT = old_cnt
-    imperative_grad._MIN_AGGREGATE_BYTES = old_bytes
-    # pylint: enable=protected-access
-    context.context().clear_post_execution_callbacks()
-
   def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
 
     @custom_gradient.custom_gradient
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index ebc9e346c068911bfa1c8d1e8d90ded9267d669c..9849f0f322eff2d909e7396158539a9663b95f29 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -12,21 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Benchmarks for low-level eager execution primitives.
+r"""Benchmarks for low-level eager execution primitives.
 
-Packaged as a test to ensure that this code is exercised by continuous
-integration tests. To get numbers:
+To run CPU benchmarks:
+  bazel run -c opt benchmarks_test -- --benchmarks=.
 
-  bazel build -c opt :benchmarks_test &&
-  ./bazel-bin/tensorflow/python/eager/benchmarks_test --iters=0
+To run GPU benchmarks:
+  bazel run --config=cuda -c opt --copt="-mavx" benchmarks_test -- \
+    --benchmarks=.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-import contextlib
-import sys
 import time
 
 import numpy as np
@@ -39,137 +37,333 @@ from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 
-FLAGS = None
-
-
-@contextlib.contextmanager
-def timer(label, iters=30000):
-  start = time.time()
-  yield xrange(iters)
-  end = time.time()
-  t = (end - start) * 1e6 / iters
-  print("%-40s took %.2fus (%d iterations)" % (label, t, iters))
-
-
-def benchmark_create_tensor(n):
-  """Benchmark overheads of creating a Tensor object."""
-
-  def label(s):
-    return "{:20s}".format(s)
-
-  with timer(label("np.array([[3.0]])"), iters=n) as iters:
-    for _ in iters:
-      np.array([[3.0]])
-
-  ctx = context.context()
-  handle = ctx._handle
-  device = ctx.device_name
-  # May be warmup GPU.
-  ops.EagerTensor([[3.0]], context=handle, device=device)
-
-  # float32
-  dtype = dtypes.float32.as_datatype_enum
-  three = [[3.0]]
-  with timer(label("EagerTensor([[3.0]])"), iters=n) as iters:
-    for _ in iters:
-      ops.EagerTensor(three, context=handle, device=device, dtype=dtype)
-
-  np_3 = np.array([[3.0]], dtype=np.float32)
-  with timer(label("EagerTensor(np.array([[3.0]]))"), iters=n) as iters:
-    for _ in iters:
-      ops.EagerTensor(np_3, context=handle, device=device, dtype=dtype)
-
-  # int32.
-  # This is interesting since int32 will be kept on host memory for the GPU
-  # case.
-  dtype = dtypes.int32.as_datatype_enum
-  three = [[3]]
-  with timer(label("EagerTensor([[3]])"), iters=n) as iters:
-    for _ in iters:
-      ops.EagerTensor(three, context=handle, device=device, dtype=dtype)
-
-  np_3 = np.array([[3]], dtype=np.int32)
-  with timer(label("EagerTensor(np.array([[3]]))"), iters=n) as iters:
-    for _ in iters:
-      ops.EagerTensor(np_3, context=handle, device=device, dtype=dtype)
-
-
-def benchmark_matmul(shape, n, use_gpu=False):
-  """Benchmark for matrix multiplication using tf.matmul."""
-  transpose_b = (shape[0] != shape[1])
-  m = random_ops.random_uniform(shape)
-  if use_gpu:
-    m = m.gpu()
-    # Warm up the GPU - the very first kernel invocation
-    # seems to require a bunch of setup.
-    math_ops.matmul(m, m, transpose_b=transpose_b)
-
-  def label(s):
-    return "MatMul {}: {:30s}".format(shape, s)
-
-  if not use_gpu:
+
+CPU = "/device:CPU:0"
+GPU = "/device:GPU:0"
+
+
+class MicroBenchmarks(test.Benchmark):
+
+  def __init__(self):
+    # used for multiply benchmarks
+    self._m_2 = random_ops.random_uniform([2])
+
+    # used for matmul benchmarks
+    self._m_2_by_2 = random_ops.random_uniform((2, 2))
+    self._m_100_by_784 = random_ops.random_uniform((100, 784))
+    self._num_iters_2_by_2 = 30000
+    self._num_iters_100_by_784 = 1000
+
+  def _run(self, func, num_iters):
+    # call func to maybe warm up the GPU
+    func()
+    start = time.time()
+    for _ in xrange(num_iters):
+      func()
+    end = time.time()
+    mean_us = (end - start) * 1e6 / num_iters
+    self.report_benchmark(iters=num_iters, wall_time=mean_us,
+                          extras={"examples_per_sec": num_iters/(end-start)})
+
+  def benchmark_create_np_array(self):
+    func = lambda: np.array([3.0])
+    self._run(func, 30000)
+
+  def _benchmark_create_tensor(self, value, dtype, device):
+    """Benchmark overheads of creating a Tensor object."""
+    ctx = context.context()
+    handle = ctx._handle
+    if device == GPU:
+      # Warmup the GPU
+      ops.EagerTensor(value, context=handle, device=device)
+
+    def func():
+      ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+    self._run(func, 30000)
+
+  def benchmark_create_float_tensor_from_list_CPU(self):
+    self._benchmark_create_tensor([[3.0]], dtypes.float32.as_datatype_enum, CPU)
+
+  def benchmark_create_float_tensor_from_np_array_CPU(self):
+    self._benchmark_create_tensor(
+        np.array([[3.0]], dtype=np.float32), dtypes.float32.as_datatype_enum,
+        CPU)
+
+  def benchmark_create_int32_tensor_from_list_CPU(self):
+    self._benchmark_create_tensor([[3]], dtypes.int32.as_datatype_enum, CPU)
+
+  def benchmark_create_int32_tensor_from_np_array_CPU(self):
+    self._benchmark_create_tensor(
+        np.array([[3]], dtype=np.int32), dtypes.int32.as_datatype_enum, CPU)
+
+  def benchmark_create_float_tensor_from_list_GPU(self):
+    if not context.num_gpus():
+      return
+    self._benchmark_create_tensor([[3.0]], dtypes.float32.as_datatype_enum, GPU)
+
+  def benchmark_create_float_tensor_from_np_array_GPU(self):
+    if not context.num_gpus():
+      return
+    self._benchmark_create_tensor(
+        np.array([[3.0]], dtype=np.float32), dtypes.float32.as_datatype_enum,
+        GPU)
+
+  def benchmark_create_int32_tensor_from_list_GPU(self):
+    # int32's are kept on host memory even when executing on GPU.
+    if not context.num_gpus():
+      return
+    self._benchmark_create_tensor([[3]], dtypes.int32.as_datatype_enum, GPU)
+
+  def benchmark_create_int32_tensor_from_np_array_GPU(self):
+    # int32's are kept on host memory even when executing on GPU.
+    if not context.num_gpus():
+      return
+    self._benchmark_create_tensor(
+        np.array([[3]], dtype=np.int32), dtypes.int32.as_datatype_enum, GPU)
+
+  def _benchmark_np_multiply(self, m, num_iters):
+    a = m.cpu().numpy()
+    func = lambda: a * a
+    self._run(func, num_iters)
+
+  def _benchmark_tf_multiply(self, m, num_iters):
+    func = lambda: m * m
+    self._run(func, num_iters)
+
+  def _benchmark_tf_multiply_op(self, m, num_iters):
+    func = lambda: math_ops.multiply(m, m)
+    self._run(func, num_iters)
+
+  def benchmark_np_multiply(self):
+    self._benchmark_np_multiply(self._m_2, 30000)
+
+  def benchmark_tf_multiply_CPU(self):
+    with context.device(CPU):
+      m = self._m_2.cpu()
+      self._benchmark_tf_multiply(m, 30000)
+
+  def benchmark_tf_multiply_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2.gpu()
+      self._benchmark_tf_multiply(m, 30000)
+
+  def benchmark_tf_multiply_op_CPU(self):
+    with context.device(CPU):
+      m = self._m_2.cpu()
+      self._benchmark_tf_multiply_op(m, 30000)
+
+  def benchmark_tf_multiply_op_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2.gpu()
+      self._benchmark_tf_multiply_op(m, 30000)
+
+  def benchmark_tf_identity(self):
+    m = self._m_2
+    self._run(lambda: gen_array_ops.identity(m), 30000)
+
+  def benchmark_tfe_py_execute_identity(self):
+    m = self._m_2
+    ctx_handle = context.context()._handle
+    attrs = ("T", self._m_2.dtype.as_datatype_enum)
+    inputs = [m]
+
+    def f():
+      pywrap_tensorflow.TFE_Py_Execute(
+          ctx_handle, None, "Identity", inputs, attrs, 1)
+
+    self._run(f, 30000)
+
+  def benchmark_tf_gradient_function_identity(self):
+    m = self._m_2
+    self._run(
+        lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
+        30000)
+
+  def benchmark_tf_gradient_forward_identity(self):
+    with backprop.GradientTape() as tape:
+      m = self._m_2
+      tape.watch(m)
+      self._run(lambda: gen_array_ops.identity(m), 30000)
+
+  def benchmark_tf_gradient_tape_push_pop(self):
+
+    def f():
+      with backprop.GradientTape():
+        pass
+    self._run(f, 30000)
+
+  def benchmark_tf_gradient_function_no_op(self):
+    m = self._m_2
+    self._run(
+        lambda: backprop.gradients_function(lambda x: x, [0])(m),
+        30000)
+
+  def _benchmark_np_matmul(self, m, transpose_b, num_iters):
     a = m.cpu().numpy()
     b = a.T if transpose_b else a
-    with timer(label("np.dot"), iters=n) as iters:
-      for _ in iters:
-        np.dot(a, b)
+    func = lambda: np.dot(a, b)
+    self._run(func, num_iters)
 
-  with timer(label("tf.matmul"), iters=n) as iters:
-    for _ in iters:
-      math_ops.matmul(m, m, transpose_b=transpose_b)
+  def _benchmark_tf_matmul(self, m, transpose_b, num_iters):
+    func = lambda: math_ops.matmul(m, m, transpose_b=transpose_b)
+    self._run(func, num_iters)
 
-  with timer(label("gen_math_ops.mat_mul"), iters=n) as iters:
-    for _ in iters:
+  def _benchmark_gen_math_ops_matmul(self, m, transpose_b, num_iters):
+    def func():
       gen_math_ops._mat_mul(m, m, transpose_b=transpose_b)
+    self._run(func, num_iters)
+
+  def _benchmark_tfe_py_execute_matmul(self, m, transpose_b, num_iters):
+    inputs = [m, m]
+    # pylint: disable=protected-access
+    ctx_handle = context.context()._handle
+    # pylint: enable=protected-access
+    attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
+             m.dtype.as_datatype_enum)
+    def func():
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "MatMul", inputs,
+                                       attrs, 1)
+
+    self._run(func, num_iters)
+
+  def _benchmark_defun_matmul(self, m, transpose_b, num_iters):
+    f = function.defun(math_ops.matmul)
+    func = lambda: f(m, m, transpose_b)
+    self._run(func, num_iters)
+
+  # Benchmarks for A^2, A of dimension 2 by 2.
+  def benchmark_np_matmul_2_by_2(self):
+    self._benchmark_np_matmul(
+        self._m_2_by_2, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_tf_matmul_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_tf_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_gen_math_ops_matmul_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_gen_math_ops_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_tfe_py_execute_matmul_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_tfe_py_execute_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_defun_matmul_2_by_2_CPU(self):
+    with context.device(CPU):
+      m = self._m_2_by_2.cpu()
+      self._benchmark_defun_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_tf_matmul_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_tf_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_gen_math_ops_matmul_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_gen_math_ops_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_tfe_py_execute_matmul_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_tfe_py_execute_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  def benchmark_defun_matmul_2_by_2_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_2_by_2.gpu()
+      self._benchmark_defun_matmul(
+          m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
+  # Benchmarks for AA.T, A of dimension 100 by 784.
+  def benchmark_np_matmul_100_by_784(self):
+    self._benchmark_np_matmul(
+        self._m_100_by_784,
+        transpose_b=True,
+        num_iters=self._num_iters_100_by_784)
+
+  def benchmark_tf_matmul_100_by_784_CPU(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_tf_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
+  def benchmark_gen_math_ops_matmul_100_by_784_CPU(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_gen_math_ops_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
+  def benchmark_tfe_py_execute_matmul_100_by_784_CPU(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_tfe_py_execute_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
+  def benchmark_defun_matmul_100_by_784_CPU(self):
+    with context.device(CPU):
+      m = self._m_100_by_784.cpu()
+      self._benchmark_defun_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
+  def benchmark_tf_matmul_100_by_784_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_100_by_784.gpu()
+      self._benchmark_tf_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
+  def benchmark_gen_math_ops_matmul_100_by_784_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_100_by_784.gpu()
+      self._benchmark_gen_math_ops_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
+  def benchmark_tfe_py_execute_matmul_100_by_784_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_100_by_784.gpu()
+      self._benchmark_tfe_py_execute_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
-  inputs = [m, m]
-  # pylint: disable=protected-access
-  ctx_handle = context.context()._handle
-  # pylint: enable=protected-access
-  attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
-           m.dtype.as_datatype_enum)
-  with timer(label("TFE_Py_Execute"), iters=n) as iters:
-    for _ in iters:
-      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "MatMul",
-                                       inputs, attrs, 1)
-
-  f = function.defun(math_ops.matmul)
-  with timer(label("defun(tf.matmul)"), iters=n) as iters:
-    for _ in iters:
-      f(m, m, transpose_b=transpose_b)
-
-
-class BenchmarksTest(test_util.TensorFlowTestCase):
-
-  def testBenchmarks(self):
-    # This isn't actually a test, but benchmarks packaged as a test
-    # so that continuous integration runs catch any breakages.
-    print(context.context())
-    benchmark_create_tensor(FLAGS.iters or 30000)
-    benchmark_matmul([2, 2], FLAGS.iters or 30000)
-    benchmark_matmul([100, 28 * 28], FLAGS.iters or 1000)
-
-    if context.context().num_gpus() > 0:
-      print("---- RUNNING ON GPU NOW ----")
-      with context.device("/device:GPU:0"):
-        benchmark_create_tensor(FLAGS.iters or 30000)
-      benchmark_matmul([2, 2], FLAGS.iters or 30000, use_gpu=True)
-      benchmark_matmul([100, 28 * 28], FLAGS.iters or 1000, use_gpu=True)
+  def benchmark_defun_matmul_100_by_784_GPU(self):
+    if not context.num_gpus():
+      return
+    with context.device(GPU):
+      m = self._m_100_by_784.gpu()
+      self._benchmark_defun_matmul(
+          m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  # Default iterations to 1 to keep continuos integration test times low.
-  parser.add_argument(
-      "--iters",
-      type=int,
-      default=1,
-      help="Number of iterators for each test. None or 0 for auto-selection")
-  FLAGS, unparsed = parser.parse_known_args()
-  sys.argv = [sys.argv[0]] + unparsed
   test.main()
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 92f4e15c054bd8cf3886b8c22e414abdfccbdae5..415416cfae61352626b84fe1a99a6345f4758a7e 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -288,6 +288,21 @@ class Context(object):
     self._initialize_handle_and_devices()
     return self._num_gpus
 
+  def add_function(self, fn):
+    """Add a function definition to the context.
+
+    Once added, the function (identified by its name) can be executed like any
+    other operation.
+
+    Args:
+      fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
+    """
+    with errors.raise_exception_on_not_ok_status() as status:
+      pywrap_tensorflow.TFE_ContextAddFunction(
+          self._handle,  # pylint: disable=protected-access
+          fn,
+          status)
+
   def add_function_def(self, fdef):
     """Add a function definition to the context.
 
diff --git a/tensorflow/python/eager/core.py b/tensorflow/python/eager/core.py
index 3f3d38b9510ace1f277017ff7d0b1de205b87f40..483b7172107838a0069831f2347b0c644c05c000 100644
--- a/tensorflow/python/eager/core.py
+++ b/tensorflow/python/eager/core.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.eager import memory_trace
 from tensorflow.python.framework import errors
 
 # Trace of execution and memory usage.
@@ -48,28 +47,3 @@ class _NotOkStatusException(Exception):
 
 
 pywrap_tensorflow.TFE_Py_RegisterExceptionClass(_NotOkStatusException)
-
-
-def enable_tracing():
-  """Enables tracing of execution and memory usage.
-
-  WARNING: tracing is not thread-safe.
-  """
-  # TODO(alive): Add code example in doc string.
-  global _active_trace
-  _active_trace = memory_trace.MemoryTrace()
-
-
-def flush_trace():
-  """Flushes the active trace, if it exists.
-
-  WARNING: tracing is not thread-safe.
-  """
-  # TODO(alive): Add code example in doc string.
-  if _active_trace is not None:
-    _active_trace.flush_trace()
-
-
-def active_trace():
-  """Returns the current global active trace of execution and memory usage."""
-  return _active_trace
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 983c1ea73e59ecdad8def57fc8af36798e2d3c57..306cf07aabe1c214d02da5f077a57043cc1f4089 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
 
 
-def execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
+def quick_execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
   """Execute a TensorFlow operation.
 
   Args:
@@ -47,8 +47,7 @@ def execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
     name: Customized name for the operation.
 
   Returns:
-    None if there are no outputs, a single Tensor object if there is one output
-    and a list of Tensor objects if there are multiple outputs.
+    List of output Tensor objects. The list is empty if there are no outputs
 
   Raises:
     An exception on error.
@@ -65,24 +64,22 @@ def execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
     else:
       message = e.message
     six.raise_from(core._status_to_exception(e.code, message), None)
-
-  # TODO(alive, cais): Use the execution callback mechanism.
-  if core.active_trace() is not None:
-    for t in tensors:
-      core.active_trace().record_tensor(op_name,
-                                        ops.tensor_id(t),
-                                        t.device,
-                                        t.shape.num_elements())
   # pylint: enable=protected-access
+  return tensors
+
 
-  # TODO(cais): Optimize this, perhaps by replacing this execute function with
-  # a different one when there are execution callback(s).
+def execute_with_callbacks(op_name, num_outputs, inputs, attrs, ctx, name=None):
+  """Monkey-patch to execute to enable execution callbacks."""
+  tensors = quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
   for callback in ctx.post_execution_callbacks:
     callback(op_name, name, attrs, inputs, tensors)
 
   return tensors
 
 
+execute = quick_execute
+
+
 def record_gradient(unused_op_name, unused_inputs, unused_attrs, unused_results,
                     unused_name):
   """Import backprop if you want gradients recorded."""
@@ -169,8 +166,11 @@ def make_tensor(v, arg_name):
 def args_to_matching_eager(l, ctx, default_dtype=None):
   """Convert sequence `l` to eager same-type Tensors."""
   EagerTensor = ops.EagerTensor  # pylint: disable=invalid-name
-  if all(isinstance(x, EagerTensor) for x in l):
-    return l[0].dtype, l
+  for x in l:
+    if not isinstance(x, EagerTensor):
+      break
+  else:  # note: intentional for-else
+    return l[0]._datatype_enum(), l  # pylint: disable=protected-access
   # TODO(josh11b): Could we do a better job if we also passed in the
   # allowed dtypes when that was known?
 
@@ -194,7 +194,7 @@ def args_to_matching_eager(l, ctx, default_dtype=None):
   else:
     ret = [internal_convert_to_tensor(t, dtype, ctx=ctx) for t in l]
 
-  return dtype, ret
+  return dtype.as_datatype_enum, ret
 
 
 def convert_to_mixed_eager_tensors(values, ctx):
@@ -203,7 +203,7 @@ def convert_to_mixed_eager_tensors(values, ctx):
           t, context=ctx._handle, device=ctx.device_name)  # pylint: disable=protected-access
       for t in values
   ]
-  types = [t.dtype for t in v]
+  types = [t._datatype_enum() for t in v]  # pylint: disable=protected-access
   return types, v
 
 
@@ -241,5 +241,5 @@ def args_to_mixed_eager_tensors(lists, ctx):
       for j in range(len(lists)):
         lists_ret[j].append(
             ops.internal_convert_to_tensor(lists[j][i], dtype=dtype, ctx=ctx))
-    types.append(dtype)
+    types.append(dtype.as_datatype_enum)
   return types, lists_ret
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 6b0e7f5c3f966f06fb4795eee09d6972910220e6..2f1654dda499583fe4766cbe2e330399defc96fd 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
+from tensorflow.python.eager import execute
 from tensorflow.python.platform import tf_logging as logging
 
 _DEFAULT_CALLBACK_ACTION = "raise"
@@ -249,6 +250,7 @@ def add_execution_callback(callback):
       `outputs` is the `list` of output `Tensor`(s) from the op.
        Return value(s) from the callback are ignored.
   """
+  execute.execute = execute.execute_with_callbacks
   context.get_default_context().add_post_execution_callback(callback)
 
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index b1b1de0c41efe351e3972d5c01e8b83fe3c3fccf..cadabb3a247416a3bcb6f9512720a61d8e567cb2 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -25,15 +25,19 @@ import threading
 
 import numpy as np
 
+from tensorflow.core.framework import function_pb2
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import graph_to_function_def
+from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
@@ -47,26 +51,41 @@ _scoped_captures = threading.local()
 _scoped_captures.tensors = None
 
 
-def make_function_def(graph, operations, inputs, outputs):
-  """Makes function def where accesses to resources are serialized."""
-  last_op_using_resource_tensor = {}
-
-  # TODO(apassos) probably control flow has to be handled delicately here as in
-  # if a resource is accessed inside a control flow context we need the control
-  # dependency to point to something outside the context which is guaranteed to
-  # happen after the access.
-  #
-  # TODO(apassos) this should do some form of alias analysis as ops which
-  # forward the resources such as Identity and Switch can cause serialization to
-  # fail.
-  for op in operations:
-    for t in op.inputs:
-      if t.dtype == dtypes.resource:
-        if t.name in last_op_using_resource_tensor:
-          op._add_control_input(last_op_using_resource_tensor[t.name])  # pylint: disable=protected-access
-        last_op_using_resource_tensor[t.name] = op
-  return graph_to_function_def.graph_to_function_def(
-      graph, operations, inputs, outputs)
+def make_function_def(name, graph, operations, inputs, outputs):
+  """Makes FunctionDef proto and defined function.
+
+  Args:
+    name: the function name
+    graph: the graph from which to build the function
+    operations: the operations in the function body
+    inputs: tensors to be used as function arguments
+    outputs: tensors to be returned from the function
+
+  Returns:
+   fdef: a FunctionDef protocol buffer for the function
+   fn: a wrapped TF_Function for the function
+  """
+  with errors.raise_exception_on_not_ok_status() as status:
+    fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+        graph._c_graph,  # pylint: disable=protected-access
+        compat.as_str(name),
+        False,
+        [o._c_op for o in operations],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+        [],
+        None,
+        compat.as_str(""),
+        status)
+  # TODO(apassos) avoid creating a FunctionDef (specially to grab the signature,
+  # but also in general it's nice not to depend on it.
+  with c_api_util.tf_buffer() as buffer_:
+    with errors.raise_exception_on_not_ok_status() as status:
+      pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+    proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
+  fdef = function_pb2.FunctionDef()
+  fdef.ParseFromString(compat.as_bytes(proto_data))
+  return fdef, fn
 
 
 @contextlib.contextmanager
@@ -85,7 +104,7 @@ def capture_value(tensor_map, value, dtype, name):
   if captured_value is None:
     captured_value = graph_placeholder(
         dtype=dtype or value.dtype, shape=value.shape, name=name)
-    if captured_value.dtype == dtypes.resource:
+    if captured_value.dtype == dtypes_module.resource:
       captured_value._handle_data = value._handle_data  # pylint: disable=protected-access
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
@@ -120,11 +139,23 @@ def _convert_to_graph_tensor(value, dtype=None, name=None, as_ref=False):
 
 
 class CapturingGraph(ops.Graph):
+  """Graph used when constructing eager functions."""
 
   def __init__(self, captures):
     super(CapturingGraph, self).__init__()
     self._building_function = True
     self.captures = captures
+    # Map from resource tensor name to last op (in program order) which uses
+    # this tensor. Used to enforce that execution order matches program order
+    # for resource tensors.
+    self._last_op_using_resource_tensor = {}
+
+  # TODO(apassos) remove once the C API is used by default.
+  def _use_c_api_hack(self):
+    return True
+
+  def clear_resource_control_flow_state(self):
+    self._last_op_using_resource_tensor = {}
 
   def create_op(
       self,
@@ -137,12 +168,31 @@ class CapturingGraph(ops.Graph):
       op_def=None,
       compute_shapes=True,
       compute_device=True):
+    # TODO(apassos) probably control flow has to be handled delicately here as
+    # in if a resource is accessed inside a control flow context we need the
+    # control dependency to point to something outside the context which is
+    # guaranteed to happen after the access.
+    #
+    # TODO(apassos) this should do some form of alias analysis as ops which
+    # forward the resources such as Identity and Switch can cause serialization
+    # to fail.
+    resource_inputs = set()
+    control_inputs = set()
     for i, inp in enumerate(inputs):
       if inp.graph is not self:
         inputs[i] = capture_value(self.captures, inp, inp.dtype, inp.op.name)
-    return super(CapturingGraph, self).create_op(
-        op_type, inputs, dtypes, input_types, name, attrs, op_def,
-        compute_shapes, compute_device)
+      inp = inputs[i]
+      if inp.dtype == dtypes_module.resource:
+        if inp.name in self._last_op_using_resource_tensor:
+          control_inputs.add(self._last_op_using_resource_tensor[inp.name])
+        resource_inputs.add(inp.name)
+    with self.control_dependencies(list(control_inputs)):
+      op = super(CapturingGraph, self).create_op(
+          op_type, inputs, dtypes, input_types, name, attrs, op_def,
+          compute_shapes, compute_device)
+    for name in resource_inputs:
+      self._last_op_using_resource_tensor[name] = op
+    return op
 
 
 # TODO(apassos): it'd be really nice if we could scope this registration.
@@ -196,14 +246,20 @@ def _inference_name(n):
   return "__inference_%s_%s" % (n, ops.uid())
 
 
+# TODO(apassos) get rid of this by splitting framework.function._DefinedFunction
+# so it doesn't have the definition-generating logic and is just a container for
+# an already-defined function.
 class _DefinedFunction(object):
   """Mocks the interface of tf _DefinedFunction."""
 
-  def __init__(self, fdef):
+  def __init__(self, fdef, fn):
     self.definition = fdef
     self.name = fdef.signature.name
+    self.signature = fdef.signature
     self.grad_func_name = None
     self.python_grad_func = None
+    self._c_func = fn
+    self._grad_func = None
 
 
 def _map_sequence_obj_to_idx(sequence):
@@ -211,7 +267,7 @@ def _map_sequence_obj_to_idx(sequence):
   return {id(x): i for i, x in enumerate(sequence)}
 
 
-class _GraphModeFunction(object):
+class GraphModeFunction(object):
   """Callable object representing a graph-mode function.
 
   Args:
@@ -232,10 +288,20 @@ class _GraphModeFunction(object):
       func_outputs structure.
     output_shapes: List of shapes of all tensors which are output by the
       internal function.
+    variables: (optional) List of variables to watch during function execution.
   """
 
-  def __init__(self, input_placeholders, extra_inputs, fdef, graph, operations,
-               func_outputs, func_outputs_to_fdef_outputs, output_shapes):
+  def __init__(self,
+               input_placeholders,
+               extra_inputs,
+               fdef,
+               fn,
+               graph,
+               operations,
+               func_outputs,
+               func_outputs_to_fdef_outputs,
+               output_shapes,
+               variables=None):
     assert len(input_placeholders) == len(fdef.signature.input_arg), "%s %s" % (
         len(input_placeholders), len(fdef.signature.input_arg))
     self._input_placeholders = input_placeholders
@@ -243,7 +309,7 @@ class _GraphModeFunction(object):
     self._graph = graph
     self._has_backprop = False
     self._func_name = fdef.signature.name
-    self._fdef = _DefinedFunction(fdef)
+    self._fdef = _DefinedFunction(fdef, fn)
     self._num_outputs = len(fdef.signature.output_arg)
     self._ops = operations
     self._func_outputs = func_outputs
@@ -251,6 +317,11 @@ class _GraphModeFunction(object):
         func_outputs, (ops.Tensor, type(None))) else list(func_outputs)
     self._returns_to_fedf_outputs = func_outputs_to_fdef_outputs
     self._output_shapes = output_shapes
+    self._variables = variables if variables is not None else []
+
+  @property
+  def variables(self):
+    return self._variables
 
   def _compute_backprop(self):
     """Computes the backprop function object for this function."""
@@ -258,38 +329,45 @@ class _GraphModeFunction(object):
     with self._graph.as_default(), context.graph_mode():
       c = _CapturingContext()
       with c:
-        filtered_outputs = [
-            x for x in self._returns if x is not None
-        ]
+        filtered_outputs = [x for x in self._returns if x is not None]
         self._out_grad_placeholders = [
-            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs
-        ]
+            graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
         in_gradients = gradients_impl.gradients(
             filtered_outputs,
             self._input_placeholders,
             grad_ys=self._out_grad_placeholders)
-        shapes = [x.shape for x in in_gradients if x is not None]
+        shapes = tuple(x.shape for x in in_gradients if x is not None)
     captures = list(sorted(c.captured_tensors, key=lambda x: x.name))
-    forward_function_def = make_function_def(
-        self._graph, self._ops, self._input_placeholders,
+    forward_name = _forward_name(self._func_name)
+    forward_function_def, forward_fn = make_function_def(
+        forward_name, self._graph, self._ops, self._input_placeholders,
         filtered_outputs + captures)
-    self._forward_fdef = _DefinedFunction(forward_function_def)
-    _register_with_name(_forward_name(self._func_name), forward_function_def)
-    backward_outputs = [x for x in in_gradients if x is not None]
+    self._forward_fdef = _DefinedFunction(forward_function_def, forward_fn)
+    _register(forward_fn)
+    backward_outputs = tuple(x for x in in_gradients if x is not None)
     all_inputs = self._out_grad_placeholders + captures
-    backward_function_def = make_function_def(
-        self._graph, [x.op for x in self._out_grad_placeholders
-                     ] + list(sorted(c.known_ops, key=lambda x: x.name)),
+    # Excluding input ops from the body as we do not intend to execute these
+    # operations when the function is executed.
+    all_ignored_ops = frozenset(x.op for x in all_inputs)
+    # Enforce a deterministic order of operations in the generated graph. This
+    # means rerunning the function-defining code will always define the same
+    # function, which is useful if we serialize this etc.
+    fdef_ops = tuple(x for x in sorted(c.known_ops, key=lambda x: x.name)
+                     if x not in all_ignored_ops)
+    bname = _backward_name(self._func_name)
+    backward_function_def, backward_fn = make_function_def(
+        bname, self._graph, fdef_ops,
         all_inputs, backward_outputs)
-    _register_with_name(_backward_name(self._func_name), backward_function_def)
-    self._backward_function = _GraphModeFunction(
-        all_inputs, [], backward_function_def, self._graph, c.known_ops,
-        in_gradients, _map_sequence_obj_to_idx(backward_outputs), shapes)
+    _register(backward_fn)
+    self._backward_function = GraphModeFunction(
+        all_inputs, [], backward_function_def, backward_fn, self._graph,
+        c.known_ops, in_gradients, _map_sequence_obj_to_idx(backward_outputs),
+        shapes)
 
   def _backprop_call(self, args):
     """Calls the wrapped function and records the result on a tape."""
     all_args = args + self._extra_inputs
-    signature = self._forward_fdef.definition.signature
+    signature = self._forward_fdef.signature
     ctx = context.context()
     if ctx.in_graph_mode():
       g = ops.get_default_graph()
@@ -300,7 +378,7 @@ class _GraphModeFunction(object):
         return ops.internal_convert_to_tensor(x, ctx=ctx)
       op = g.create_op(
           signature.name, [make_tensor(x) for x in all_args],
-          [dtypes.DType(x.type) for x in signature.output_arg],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
           op_def=signature,
           name="FunctionCall",
           compute_shapes=False)
@@ -332,10 +410,12 @@ class _GraphModeFunction(object):
 
   def __call__(self, *args):
     """Executes the passed function in eager mode."""
-    tensor_inputs = [
-        x for x in nest.flatten(args)
-        if isinstance(x, ops.Tensor)
-    ]
+    for v in self._variables:
+      if v._trainable:  # pylint: disable=protected-access
+        tape.watch_variable(v)
+
+    tensor_inputs = [x for x in nest.flatten(args)
+                     if isinstance(x, ops.Tensor)]
     if tape.should_record(tensor_inputs) or tape.should_record(
         self._extra_inputs):
       if not self._has_backprop:
@@ -354,7 +434,7 @@ class _GraphModeFunction(object):
       args = list(tensor_inputs) + self._extra_inputs
       op = g.create_op(
           signature.name, [ops.convert_to_tensor(x) for x in args],
-          [dtypes.DType(x.type) for x in signature.output_arg],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
           op_def=signature,
           name="FunctionCall",
           compute_shapes=False)
@@ -407,9 +487,15 @@ def _get_defun_inputs(args):
 
 def _defun_internal(name, func, args, kwds):
   """Defines and returns graph-mode version of func."""
+  container_prefix = ops.get_default_graph()._container_prefix  # pylint: disable=protected-access
   with context.graph_mode():
     captures = {}
     tmp_graph = CapturingGraph(captures)
+    # Inherit the container prefix, since this is used for error checking when
+    # isolating eager execution (the container prefix at creation must match the
+    # container prefix when used, and variables accessed in the defun will be
+    # used in the outside context).
+    tmp_graph._container_prefix = container_prefix  # pylint: disable=protected-access
     # Copy the graph collections to ensure summaries and other things work. This
     # lets the function access (but not mutate) collections of the containing
     # graph, such as the global step and the summary writer collections.
@@ -421,7 +507,11 @@ def _defun_internal(name, func, args, kwds):
       func_inputs = _get_defun_inputs(args)
 
       with capture_tensors(captures):
-        func_outputs = func(*func_inputs, **kwds)
+        tape.push_new_tape()
+        try:
+          func_outputs = func(*func_inputs, **kwds)
+        finally:
+          variables = tape.pop_tape().watched_variables()
       ids = list(sorted(captures.keys()))
       if ids:
         extra_inputs, extra_placeholders = zip(* [captures[x] for x in ids])
@@ -429,27 +519,36 @@ def _defun_internal(name, func, args, kwds):
         extra_inputs = []
         extra_placeholders = []
       outputs_list = nest.flatten(func_outputs)
-      output_shapes = [x.shape for x in outputs_list if x is not None]
+      output_shapes = tuple(x.shape for x in outputs_list if x is not None)
 
-  flat_inputs = [
-      x for x in nest.flatten(func_inputs) if isinstance(x, ops.Tensor)
-  ]
+  flat_inputs = [x for x in nest.flatten(func_inputs)
+                 if isinstance(x, ops.Tensor)]
   all_inputs = flat_inputs + list(extra_placeholders)
-
+  all_ignored_ops = frozenset(x.op for x in all_inputs)
   func_def_outputs = [x for x in outputs_list if x is not None]
-  inference_function_def = make_function_def(
-      tmp_graph, tmp_graph.get_operations(), all_inputs, func_def_outputs)
+  fname = _inference_name(name)
+  operations = tuple(x for x in tmp_graph.get_operations()
+                     if x not in all_ignored_ops)
+  inference_function_def, fn = make_function_def(
+      fname, tmp_graph, operations, all_inputs, func_def_outputs)
   # Register any other functions defined in the graph
   # TODO(ashankar): Oh lord, forgive me for this lint travesty.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
     # TODO(ashankar): What about the gradient registry?
-    _register_with_name(f.name, f.definition)
-  _register_with_name(_inference_name(name), inference_function_def)
-
-  return _GraphModeFunction(
-      all_inputs, extra_inputs, inference_function_def, tmp_graph,
-      tmp_graph.get_operations(), func_outputs,
-      _map_sequence_obj_to_idx(func_def_outputs), output_shapes)
+    _register(f._c_func)  # pylint: disable=protected-access
+  _register(fn)
+
+  return GraphModeFunction(
+      all_inputs,
+      extra_inputs,
+      inference_function_def,
+      fn,
+      tmp_graph,
+      operations,
+      func_outputs,
+      _map_sequence_obj_to_idx(func_def_outputs),
+      output_shapes,
+      variables=variables)
 
 
 # Defun uses this instead of Tensor as a cache key. Using dtype because
@@ -471,10 +570,9 @@ def _cache_key(x):
   return x
 
 
-def _register_with_name(name, fdef):
-  """Registers the function `fdef` with the name `name`."""
-  fdef.signature.name = name
-  context.context().add_function_def(fdef)
+def _register(fn):
+  """Registers the function `fn`."""
+  context.context().add_function(fn)
 
 
 # TODO(apassos): better error messages for non-hashable arguments.
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 243efccac44be1fbba8a00be6683029fc5105a95..c55f2f1d5957cabfaf3bae617d88dca55f7b8e4b 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -62,13 +62,51 @@ class FunctionTest(test.TestCase):
     @function.defun
     def step():
       def inner():
-        tape.watch_variable(v)
         return v * v
 
       return backprop.implicit_grad(inner)()[0][0]
 
     self.assertAllEqual(step(), 2.0)
 
+  def testDefunReadVariable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @function.defun
+    def f():
+      return v.read_value()
+
+    self.assertEqual(1.0, float(f()))
+
+  def testDefunAssignAddVariable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @function.defun
+    def f():
+      v.assign_add(2.0)
+      return v.read_value()
+
+    self.assertEqual(3.0, float(f()))
+
+  def testDefunDifferentiable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @function.defun
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testDefunCanBeDifferentiatedTwice(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @function.defun
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+    # Ensure that v is watched again.
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
   def testGraphModeCaptureVariable(self):
     with context.graph_mode(), self.test_session() as sess:
 
diff --git a/tensorflow/python/eager/gen_op.bzl b/tensorflow/python/eager/gen_op.bzl
index 1c99d342befaf04112ac83aeecce2b122eb361c5..8bc1d6c10a60b89a026cb34dbf6fd98d29e909c2 100644
--- a/tensorflow/python/eager/gen_op.bzl
+++ b/tensorflow/python/eager/gen_op.bzl
@@ -10,7 +10,9 @@ def tfe_gen_op_wrapper_py(name,
                           out=None,
                           visibility=None,
                           deps=[],
-                          generated_target_name=None):
+                          generated_target_name=None,
+                          # ApiDefs will be loaded in the order specified in this list.
+                          api_def_srcs=[]):
   """Generate an eager-mode Python op wrapper for an op library."""
   # Construct a cc_binary containing the specified ops.
   tool_name = "gen_" + name + "_py_wrappers_cc"
@@ -30,11 +32,25 @@ def tfe_gen_op_wrapper_py(name,
   if not out:
     out = "gen_" + name + ".py"
 
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          "$$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   native.genrule(
       name=name + "_pygenrule",
       outs=[out],
+      srcs=api_def_srcs,
       tools=[tool_name] + tf_binary_additional_srcs(),
-      cmd=("$(location " + tool_name + ")  > $@"))
+      cmd=("$(location " + tool_name + ") " + api_def_args_str + " > $@"))
 
   # Make a py_library out of the generated python file.
   if not generated_target_name:
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index a7f1061d18bf905caf97decc5375c3996215ec5b..3da100d8008dd53244bc66a72f62b05cd99503e1 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -165,32 +165,6 @@ class _VariableCapturingScope(object):
       yield
 
 
-class _FunctionObject(function._GraphModeFunction):  # pylint: disable=protected-access
-  """Captured graph-mode function with read-only variables.
-
-  Calling this function object will read the current values of the variables and
-  pass them to the graph mode function, which will use them as constants.
-  """
-
-  def __init__(self, variables, placeholder_inputs, extra_inputs, fdef,
-               graph, operations, outputs, func_outputs_to_fdef_outputs,
-               output_shapes):
-    self._variables = variables
-    super(_FunctionObject, self).__init__(
-        placeholder_inputs,
-        extra_inputs,
-        fdef,
-        graph,
-        operations,
-        outputs,
-        func_outputs_to_fdef_outputs,
-        output_shapes)
-
-  @property
-  def variables(self):
-    return [x.variable for x in self._variables]
-
-
 class _InitializingFunctionObject(object):
   """Responsible for deciding which version of func-to-object to call.
 
@@ -247,7 +221,9 @@ def _get_graph_callable_inputs(shape_and_dtypes):
       ret.append(_get_graph_callable_inputs(x))
     else:
       raise errors.InvalidArgumentError(
-          None, None, "shape_and_dtypes not ShapeAndDtype, type: %s " % type(x))
+          None, None, "Expected the argument to @graph_callable to be a "
+          "(possibly nested) list or tuple of ShapeAndDtype objects, "
+          "but got an object of type: %s" % type(x))
 
   return tuple(ret) if isinstance(shape_and_dtypes, tuple) else ret
 
@@ -267,7 +243,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
 
   Args:
     func: The tfe Python function to compile.
-    shape_and_dtypes: A list of type ShapeAndDtype.
+    shape_and_dtypes: A possibly nested list or tuple of ShapeAndDtype objects.
 
   Raises:
     ValueError: If any one of func's outputs is not a Tensor.
@@ -320,6 +296,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
       # Call the function again, now replacing usages of variables with
       # placeholders. This assumes the variable capturing scope created above
       # knows about all variables.
+      tmp_graph.clear_resource_control_flow_state()
       with variable_captures.capturing_scope(), function.capture_tensors(
           captures):
         captured_outputs = func(*func_inputs)
@@ -341,7 +318,9 @@ def _graph_callable_internal(func, shape_and_dtypes):
   placeholder_inputs = flat_inputs+ list(extra_placeholders)
 
   func_def_outputs = [x for x in outputs_list if isinstance(x, tf_ops.Tensor)]
-  initializer_function_def = function.make_function_def(
+  initialization_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
+  initializer_function_def, initializer_fn = function.make_function_def(
+      initialization_name,
       tmp_graph,
       initializing_operations,
       placeholder_inputs,
@@ -350,13 +329,13 @@ def _graph_callable_internal(func, shape_and_dtypes):
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register_with_name(f.name, f.definition)  # pylint: disable=protected-access
-  function._register_with_name(function._inference_name(func.__name__),  # pylint: disable=protected-access
-                               initializer_function_def)
-  initializer_function = function._GraphModeFunction(  # pylint: disable=protected-access
+    function._register(f._c_func)  # pylint: disable=protected-access
+  function._register(initializer_fn)  # pylint: disable=protected-access
+  initializer_function = function.GraphModeFunction(
       placeholder_inputs,
       extra_inputs,
       initializer_function_def,
+      initializer_fn,
       tmp_graph,
       initializing_operations,
       func_outputs,
@@ -365,23 +344,26 @@ def _graph_callable_internal(func, shape_and_dtypes):
 
   capture_func_def_outputs = [
       x for x in captured_outlist if isinstance(x, tf_ops.Tensor)]
-  captured_function_def = function.make_function_def(
+  captured_function_name = function._inference_name(func.__name__)  # pylint: disable=protected-access
+  captured_function_def, capturing_fn = function.make_function_def(
+      captured_function_name,
       tmp_graph,
       capturing_operations,
       placeholder_inputs,
       capture_func_def_outputs)
-  function._register_with_name(function._inference_name(func.__name__),  # pylint: disable=protected-access
-                               captured_function_def)
-  captured_function = _FunctionObject(
-      sorted_variables,
+  function._register(capturing_fn)  # pylint: disable=protected-access
+
+  captured_function = function.GraphModeFunction(
       placeholder_inputs,
       extra_inputs,
       captured_function_def,
+      capturing_fn,
       tmp_graph,
       capturing_operations,
       captured_outputs,
       function._map_sequence_obj_to_idx(capture_func_def_outputs),  # pylint: disable=protected-access
-      output_shapes)
+      output_shapes,
+      variables=[x.variable for x in sorted_variables])
 
   return _InitializingFunctionObject(captured_function, initializer_function,
                                      shape_and_dtypes)
@@ -430,9 +412,10 @@ def graph_callable(shape_and_dtypes):
   ret = foo(tfe.Tensor(2.0))  # `ret` here now is a Tensor with value 9.0.
   ```
   Args:
-    shape_and_dtypes: A list of type ShapeAndDtype that specifies shape and type
-      information for each of the callable's arguments. The length of this list
-      must be equal to the number of arguments accepted by the wrapped function.
+    shape_and_dtypes: A possibly nested list or tuple of ShapeAndDtype objects
+      that specifies shape and type information for each of the callable's
+      arguments. The length of this list must be equal to the number of
+      arguments accepted by the wrapped function.
 
   Returns:
     A callable graph object.
diff --git a/tensorflow/python/eager/graph_callable_test.py b/tensorflow/python/eager/graph_callable_test.py
index 548e16a909f8fe846ea6d5a7a33c4247c5d90054..b9e6ca2a93ac6ff02b741051234dbdd8a55bf12b 100644
--- a/tensorflow/python/eager/graph_callable_test.py
+++ b/tensorflow/python/eager/graph_callable_test.py
@@ -152,7 +152,6 @@ class GraphCallableTest(test.TestCase):
     self.assertAllEqual(5, f(constant_op.constant(2)))
 
   def testNestedFunction(self):
-
     # TensorFlow function (which is what would be used in TensorFlow graph
     # construction).
     @function.Defun(dtypes.int32, dtypes.int32)
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index c87719f84abf22f4dee775ab61309d1b18129e07..837cad974ac6555ef2b13d1a1a5e0e5f5166b01d 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -20,114 +20,13 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.python.eager import tape as tape_module
-
-
-# Terminology:
-#
-#  - op: a possibly composite operation, which has an entry in the tape
-#  - target: dy in dx/dy
-#  - source: dx in dx/dy
-#  - tensor: one of the many inputs or outputs of an operation
-#
-# Below here we do the gradient algorithm. It works as follows:
-#
-# First we filter the tape to just the subset of operations we want to
-# differentiate. In the process of doing so we count how many times each Tensor
-# is used as an input to an op (so we know when we're done computing gradients
-# for that Tensor). We also count, for each tape entry, how many of its output
-# Tensors need gradients to be computed (Tensors which are not used do not need
-# any gradients to be computed).
-#
-# Finally, we start a backprop stack with a set of tape entries for which we
-# have all gradients available. This set usually is a subset of the set of
-# targets (not all since targets which have outputs in the tape will not have
-# gradients available initially).
-#
-# Then we repeatedly pop an entry from the stack, run its backprop, and update
-# the gradients of its inputs. Once we have computed all gradients for a single
-# input we can mark this input as done, and this can trigger adding an entry to
-# the stack if all outputs of that entry are now done.
-#
-# When the stack is empty we have gradients for all tensors we're interested in.
-def _prepare_backprop(vspace, target, tensor_to_op, op_to_entry, id_sources):
-  """Filters the tape to only include relevant entries and counts tensor usages.
-
-  Args:
-    vspace: information about the space we're differentiating in.
-    target: the target to optimize.
-    tensor_to_op: Map from tensor id to key in op_to_entry that produced it.
-    op_to_entry: Map from op id to a tape.TapeEntry object
-    id_sources: the ids of the sources wrt the gradient is being taken.
-
-  Returns:
-    usage counts (how many entries downstream from a tensor use it)
-    op_to_entry_map: entry map (a filtered tape, with only the relevant
-     entries),
-    missing: map from tensor id to how many downstream gradients still need
-     to be computed before this tensor's gradient can be computed.
-  """
-  tensor_stack = [vspace.tensor_id(x) for x in target]
-  tensor_usage_counts = {}
-  o_to_e = {}  # Copy of just the bits we need from op_to_entry
-  while tensor_stack:
-    t = tensor_stack.pop()
-    op = tensor_to_op.get(t, None)
-    # op is None or -1 if the tensor is a source (i.e. was watched directly)
-    if op is None or op == -1 or op in o_to_e:
-      continue
-    op_trace = tape_module.TapeEntry(*op_to_entry[op])
-    o_to_e[op] = op_trace
-    for it in op_trace.input_ids:
-      if it in tensor_usage_counts:
-        tensor_usage_counts[it] += 1
-      else:
-        tensor_usage_counts[it] = 1
-        if it not in id_sources and it in tensor_to_op:
-          tensor_stack.append(it)
-  op_missing_tensor_counts = collections.defaultdict(int)
-  for t in tensor_usage_counts:
-    if t in tensor_to_op and tensor_to_op[t] is not None:
-      op_missing_tensor_counts[tensor_to_op[t]] += 1
-  return tensor_usage_counts, o_to_e, op_missing_tensor_counts
-
-
-def _initialize_backprop_stack(op_to_entry, op_missing_tensor):
-  """Returns the set of tape entries which are available for backprop."""
-  ready_ops = []
-  for op in op_to_entry:
-    if op not in op_missing_tensor:
-      ready_ops.append(op)
-  return ready_ops
-
-
-def _initial_gradients(vspace, target, output_gradients, tensor_usage_counts):
-  """Computes the initial gradients for each Tensor."""
-  # Initialize the backprop stack
-  gradients = collections.defaultdict(list)
-  for i, t in enumerate(target):
-    if vspace.tensor_id(t) in tensor_usage_counts:
-      # Can't provide a gradient of something we're trying to differentiate
-      assert output_gradients is None or output_gradients[i] is None
-    else:
-      if output_gradients is None or output_gradients[i] is None:
-        out_grad = vspace.ones_like(t)
-      else:
-        out_grad = output_gradients[i]
-      gradients[vspace.tensor_id(t)].append(out_grad)
-  return gradients
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors
 
 
 VSpace = collections.namedtuple(
     "VSpace",
-    ["aggregate_fn", "num_elements_fn", "tensor_id", "zeros", "ones_like"])
-
-
-# If over MIN_AGGREGATE_COUNT gradients are accumulated and the total
-# memory consumption is over MIN_AGGREGATE_BYTES, do an early aggregation
-# so as to release the gradient tensor to save memory.
-_MIN_AGGREGATE_COUNT = 4
-_MIN_AGGREGATE_BYTES = 128 * 1024 * 1024
+    ["aggregate_fn", "num_elements_fn", "tensor_id", "zeros", "ones"])
 
 
 def imperative_grad(
@@ -161,89 +60,6 @@ def imperative_grad(
      or if only non-differentiable functions of the source were used in the
      computation of target.
   """
-  tensor_to_op, op_to_entry = tape.export()
-  # This overwrites the op_to_entry variable, which will release all memory used
-  # to keep traces that are irrelevant to the gradient computation we're doing
-  # here.
-  id_sources = [vspace.tensor_id(t) for t in sources]
-  tensor_usage_counts, op_to_entry, op_missing_tensor = _prepare_backprop(
-      vspace, target, tensor_to_op, op_to_entry, id_sources)
-  ready_ops = _initialize_backprop_stack(op_to_entry, op_missing_tensor)
-  gradients = _initial_gradients(vspace, target, output_gradients,
-                                 tensor_usage_counts)
-  gradients_size = dict()
-  # Now exhaust the backprop stack
-  while ready_ops:
-    op = ready_ops.pop()
-    op_trace = op_to_entry.pop(op)
-    out_gradients = [gradients.pop(t, None) for t in op_trace.output_ids]
-
-    # Cache the last used zero tensor. We reuse it if the next one
-    # we need is of the same shape and dtype. This is very helpful in
-    # large splits and should have negligible overhead in other cases.
-    last_shape_and_dtype = None
-    last_zeros = None
-    for i in range(len(out_gradients)):
-      if out_gradients[i] is None:
-        # TODO(apassos) this should be in the right device
-        none_indices = _grad_fn_accepts_none_for_indices.get(
-            op_trace.op_type, None)
-        if none_indices is None or i not in none_indices:
-          shape_and_dtype = op_trace.output_shape_and_dtype[i]
-          if shape_and_dtype != last_shape_and_dtype:
-            last_shape_and_dtype = shape_and_dtype
-            last_zeros = vspace.zeros(*shape_and_dtype)
-          out_gradients[i] = last_zeros
-      else:
-        out_gradients[i] = vspace.aggregate_fn(out_gradients[i])
-
-    in_gradients = op_trace.backward_function(*(out_gradients))
-    for i, t in enumerate(op_trace.input_ids):
-      if in_gradients[i] is not None:
-        t_grads = gradients.setdefault(t, [])
-        t_grads.append(in_gradients[i])
-        if len(t_grads) >= _MIN_AGGREGATE_COUNT:
-          if t not in gradients_size:
-            gradients_size[t] = vspace.num_elements_fn(t_grads[-1])
-          size = gradients_size[t]
-
-          if len(t_grads) * size * 4 > _MIN_AGGREGATE_BYTES:
-            t_grads[:] = [vspace.aggregate_fn(t_grads)]
-      if tensor_usage_counts.get(t, 0) > 0:
-        tensor_usage_counts[t] -= 1
-        if (t in tensor_to_op
-            and tensor_usage_counts[t] == 0
-            and t not in id_sources):
-          in_op = tensor_to_op[t]
-          if in_op is None or in_op == -1:
-            continue
-          if op_missing_tensor.get(in_op, 0) > 0:
-            op_missing_tensor[in_op] -= 1
-            if op_missing_tensor.get(in_op, 0) == 0:
-              ready_ops.append(in_op)
-  result = []
-  for i, s in enumerate(sources):
-    g = gradients.get(vspace.tensor_id(s), None)
-    if g is None:
-      result.append(None)
-    else:
-      result.append(vspace.aggregate_fn(g))
-  return result
-
-
-# TODO(agarwal): use an automatic mechanism for handling None arguments to
-# gradient functions.
-# Some gradient functions can accept None arguments for gradients. The following
-# maps the operation name to the indices at which the corresponding gradient
-# function can accept None values.
-# e.g. FusedBatchNorm outputs 5 values and hence receives 5 gradient values
-# during backprop. However the gradient function uses only the first of those
-# values and ignores the rest. The entry, "FusedBatchNorm": [1, 2, 3, 4],
-# indicates that only the gradient corresponding to index 0 is used, and the
-# gradient values at indices 1-4 are ignored (and hence can be None). The
-# backprop algorithm can then leverage this by not constructing zeros to
-# pass for those indices.
-_grad_fn_accepts_none_for_indices = {
-    "SoftmaxCrossEntropyWithLogits": [1],
-    "FusedBatchNorm": [1, 2, 3, 4]
-}
+  with errors.raise_exception_on_not_ok_status() as status:
+    return pywrap_tensorflow.TFE_Py_TapeGradient(
+        tape._tape, vspace, target, sources, output_gradients, status)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/memory_trace.py b/tensorflow/python/eager/memory_trace.py
deleted file mode 100644
index 094bcab9e2eb17ab33c26e85f9bd675d8d893ef9..0000000000000000000000000000000000000000
--- a/tensorflow/python/eager/memory_trace.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility to trace per-device memory consumption across time over execution."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-TraceEntry = collections.namedtuple(
-    "TraceEntry", ["op_name", "tensor_id", "mem_usage", "device", "size"])
-TensorData = collections.namedtuple(
-    "TensorData", ["op_name", "tensor_size", "device"])
-
-
-class MemoryTrace(object):
-  """Records a trace of memory usage over operation execution."""
-
-  def __init__(self):
-
-    self.trace = []
-    self.tensor_to_data = {}
-    self.current_device_mem_usage = collections.defaultdict(int)
-
-  def record_tensor(self, op_name, tensor_id, device, size):
-    self.current_device_mem_usage[device] += size
-    self.tensor_to_data[tensor_id] = TensorData(op_name, size, device)
-    self.trace.append(TraceEntry(op_name,
-                                 tensor_id,
-                                 dict(self.current_device_mem_usage.items()),
-                                 device,
-                                 size))
-
-  def delete_tensor(self, tensor_id):
-    if tensor_id not in self.tensor_to_data:
-      return
-    data = self.tensor_to_data.pop(tensor_id, None)
-    if data is None: return
-    self.current_device_mem_usage[data.device] -= data.tensor_size
-    self.trace.append(TraceEntry(data.op_name,
-                                 tensor_id,
-                                 dict(self.current_device_mem_usage.items()),
-                                 data.device,
-                                 -data.tensor_size))
-
-  def flush_trace(self):
-    """Prints the formatted trace recorded so far."""
-    longest_op_name = max(len(t.op_name) for t in self.trace)
-    longest_op_name = max(longest_op_name, len("op_name"))
-    longest_heap_size = max(max(len(str(d)) for d in t.mem_usage)
-                            for t in self.trace)
-    longest_heap_size = max(longest_heap_size, len("d0"))
-    longest_id_len = max(len(str(t.tensor_id)) for t in self.trace)
-    longest_id_len = max(longest_id_len, 2)
-    first_line = []
-    first_line.append("+/-")
-    first_line.append("op_name".ljust(longest_op_name))
-    first_line.append("id".ljust(longest_id_len))
-    for i in range(len(self.current_device_mem_usage)):
-      first_line.append(("d"+str(i)).ljust(longest_heap_size))
-    first_line.append("size")
-    print(" | ".join(first_line))
-    for t in self.trace:
-      line = []
-      if t.size > 0:
-        line.append("+  ")
-      else:
-        line.append("-  ")
-      line.append(t.op_name.ljust(longest_op_name))
-      line.append(str(t.tensor_id).ljust(longest_id_len))
-      for d in t.mem_usage:
-        line.append(str(d).ljust(longest_heap_size))
-      line.append(str(t.size))
-      print(" | ".join(line))
-    self.trace = []
-    print()
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index e86073d6b21e031ea4974f514e1401fd0211c962..48dcb4830ccf4eda649c939c81f88a10750b23da 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -30,8 +30,10 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 
 
@@ -321,6 +323,13 @@ class OpsTest(test_util.TensorFlowTestCase):
   def testIdentity(self):
     self.assertAllEqual(2, array_ops.identity(2))
 
+  def testIdentityOnVariable(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found')
+    with context.device('/gpu:0'):
+      v = resource_variable_ops.ResourceVariable(True)
+    self.assertAllEqual(True, array_ops.identity(v))
+
   def testIncompatibleSetShape(self):
     x = constant_op.constant(1)
     with self.assertRaises(ValueError):
@@ -345,6 +354,13 @@ class OpsTest(test_util.TensorFlowTestCase):
     with self.assertRaises(TypeError):
       float(x)
 
+  def testFormatString(self):
+    x = constant_op.constant(3.1415)
+    self.assertEqual('3.14', '{:.2f}'.format(x))
+
+  def testNoOpIsNone(self):
+    self.assertTrue(control_flow_ops.no_op() is None)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index e57488cb6408cf43ddf33850f5160cb89548b8fd..90a8779ff845b2fd63d1ba1019e8601fef257e42 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <stdio.h>
 #include <sstream>
 #include <unordered_map>
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
@@ -98,10 +99,20 @@ string TensorPBString(const TensorProto& pb) {
   return strings::StrCat("\"\"\"", ProtoShortDebugString(pb), "\"\"\"");
 }
 
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
 class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
  public:
-  GenEagerPythonOp(const OpDef& op_def, const string& function_name)
-      : python_op_gen_internal::GenPythonOp(op_def, function_name) {
+  GenEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                   const string& function_name)
+      : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
     op_name_ = function_name_;
     op_name_.Consume("_");
   }
@@ -139,8 +150,9 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   std::unordered_map<string, string> attr_expressions_;
 };
 
-string GetEagerPythonOp(const OpDef& op_def, const string& function_name) {
-  return GenEagerPythonOp(op_def, function_name).Code();
+string GetEagerPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                        const string& function_name) {
+  return GenEagerPythonOp(op_def, api_def, function_name).Code();
 }
 
 string GenEagerPythonOp::FlattenInputs(
@@ -161,14 +173,14 @@ string GenEagerPythonOp::FlattenInputs(
       } else if (inputs_state == WAS_LIST_INPUT) {
         strings::StrAppend(&inputs, " + ");
       }
-      strings::StrAppend(&inputs, "list(", param_names_[i], ")");
+      strings::StrAppend(&inputs, "list(", param_names_[i].GetRenameTo(), ")");
       inputs_state = WAS_LIST_INPUT;
       if (output_sizes != nullptr) {
         if (!arg.number_attr().empty()) {
           output_sizes->emplace_back(AttrVarName(arg.number_attr(), nullptr));
         } else {
           output_sizes->emplace_back(
-              strings::StrCat("len(", param_names_[i], ")"));
+              strings::StrCat("len(", param_names_[i].GetRenameTo(), ")"));
         }
       }
     } else {
@@ -179,7 +191,7 @@ string GenEagerPythonOp::FlattenInputs(
       } else {
         strings::StrAppend(&inputs, "[");
       }
-      strings::StrAppend(&inputs, param_names_[i]);
+      strings::StrAppend(&inputs, param_names_[i].GetRenameTo());
       inputs_state = WAS_SOLO_INPUT;
       if (output_sizes != nullptr) output_sizes->emplace_back();
     }
@@ -192,15 +204,21 @@ string GenEagerPythonOp::FlattenInputs(
 }
 
 string GenEagerPythonOp::Code() {
+  if (api_def_.visibility() == ApiDef::SKIP) {
+    return "";
+  }
   // This has all the input args followed by those attrs that don't have
   // defaults.
-  std::vector<string> args_no_default;
+  std::vector<python_op_gen_internal::ParamNames> params_no_default;
   // The parameters with defaults (these have to be listed after those without).
   // No input args are included, just attrs.
-  std::vector<std::pair<string, string>> args_with_defaults;
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg(op_def_.input_arg(i));
-    args_no_default.push_back(arg.name());
+  std::vector<std::pair<python_op_gen_internal::ParamNames, string>>
+      params_with_default;
+
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       AddAttrForArg(arg.type_attr(), i);
     } else if (!arg.type_list_attr().empty()) {
@@ -212,31 +230,39 @@ string GenEagerPythonOp::Code() {
   }
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
     // Do not add inferred attrs to the Python function signature.
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
-      if (attr.has_default_value()) {
+      if (api_def_attr.has_default_value()) {
         if (attr.type() == "tensor") {
-          args_with_defaults.emplace_back(
-              attr.name(),
-              strings::StrCat("_execute.make_tensor(",
-                              TensorPBString(attr.default_value().tensor()),
-                              ", \"", attr.name(), "\")"));
+          params_with_default.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat(
+                  "_execute.make_tensor(",
+                  TensorPBString(api_def_attr.default_value().tensor()), ", \"",
+                  api_def_attr.rename_to(), "\")"));
         } else if (attr.type() == "list(tensor)") {
           std::vector<string> pbtxt;
-          for (const auto& pb : attr.default_value().list().tensor()) {
+          for (const auto& pb : api_def_attr.default_value().list().tensor()) {
             pbtxt.emplace_back(TensorPBString(pb));
           }
-          args_with_defaults.emplace_back(
-              attr.name(),
-              strings::StrCat("[_execute.make_tensor(_pb, \"", attr.name(),
-                              "\") for _pb in ", VectorToTuple(pbtxt), "]"));
+          params_with_default.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              strings::StrCat("[_execute.make_tensor(_pb, \"",
+                              api_def_attr.rename_to(), "\") for _pb in ",
+                              VectorToTuple(pbtxt), "]"));
         } else {
-          args_with_defaults.emplace_back(
-              attr.name(), python_op_gen_internal::AttrValueToPython(
-                               attr.type(), attr.default_value(), "_dtypes."));
+          params_with_default.emplace_back(
+              python_op_gen_internal::ParamNames(api_def_attr.name(),
+                                                 api_def_attr.rename_to()),
+              python_op_gen_internal::AttrValueToPython(
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
       } else {
-        args_no_default.push_back(attr.name());
+        params_no_default.emplace_back(api_def_attr.name(),
+                                       api_def_attr.rename_to());
       }
     }
   }
@@ -244,34 +270,37 @@ string GenEagerPythonOp::Code() {
   // Save the list of attr parameters (attrs that won't be inferred),
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
-  // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(args_no_default.size() - op_def_.input_arg_size() +
-                 args_with_defaults.size());
-  attrs_.insert(attrs_.end(),
-                args_no_default.begin() + op_def_.input_arg_size(),
-                args_no_default.end());
-  for (const auto& a : args_with_defaults) {
-    attrs_.push_back(a.first);
+  // from the end of params_no_default, and adding params_no_default.
+  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
+                 params_with_default.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
+    attrs_.push_back(params_no_default[i].GetName());
+  }
+  for (const auto& p : params_with_default) {
+    attrs_.push_back(p.first.GetName());
+  }
+
+  param_names_.reserve(params_no_default.size() + params_with_default.size());
+  param_names_.insert(param_names_.begin(), params_no_default.begin(),
+                      params_no_default.end());
+  for (const auto& param_and_default : params_with_default) {
+    param_names_.push_back(param_and_default.first);
   }
 
-  param_names_.reserve(args_no_default.size() + args_with_defaults.size());
   string parameters;
-  for (const string& name : args_no_default) {
+  for (const auto& param : params_no_default) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    const string param = python_op_gen_internal::AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const auto& name_default : args_with_defaults) {
+  for (const auto& param_and_default : params_with_default) {
     if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
-    const string param =
-        python_op_gen_internal::AvoidPythonReserved(name_default.first);
-    strings::StrAppend(&parameters, param, "=", name_default.second);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param_and_default.first.GetRenameTo(), "=",
+                       param_and_default.second);
   }
   if (!parameters.empty()) strings::StrAppend(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
 
+  AddExport();
   AddDefLine(parameters);
   AddDocStringDescription();
   AddDocStringArgs();
@@ -294,25 +323,26 @@ string GenEagerPythonOp::Code() {
         // inputs are lists and have the same length.
         for (auto iter = arg_list->second.begin();
              iter != arg_list->second.end(); ++iter) {
-          const string& arg_name = param_names_[*iter];
-          ExpectListArg(arg_name);
+          const string& arg_api_name = param_names_[*iter].GetRenameTo();
+          ExpectListArg(arg_api_name);
           if (iter == arg_list->second.begin()) {
-            AddInferredAttr(attr.name(), strings::StrCat("len(", arg_name, ")"),
+            AddInferredAttr(attr.name(),
+                            strings::StrCat("len(", arg_api_name, ")"),
                             &result_, &attr_expressions_);
           } else {
             const auto& attr_var = attr_expressions_[attr.name()];
-            strings::StrAppend(&result_, "  if len(", arg_name,
+            strings::StrAppend(&result_, "  if len(", arg_api_name,
                                ") != ", attr_var,
                                ":\n"
                                "    raise ValueError(\n"
                                "        \"List argument '",
-                               arg_name, "' to '", op_name_,
+                               arg_api_name, "' to '", op_name_,
                                "' Op with length %d \"\n"
                                "        \"must match length %d of argument '",
                                inferred_attrs_[attr.name()],
                                "'.\" %\n"
                                "        (len(",
-                               arg_name, "), ", attr_var, "))\n");
+                               arg_api_name, "), ", attr_var, "))\n");
           }
         }
       }
@@ -322,65 +352,76 @@ string GenEagerPythonOp::Code() {
   // Values for non-inferred attrs.
   for (int i = 0; i < attrs_.size(); ++i) {
     const string& attr_name = attrs_[i];
-    const string& param = param_names_[i + op_def_.input_arg_size()];
+    const auto& param = param_names_[i + op_def_.input_arg_size()];
     const auto& attr = *FindAttr(attr_name, op_def_);
+    const string& attr_api_name = param.GetRenameTo();
     StringPiece attr_type = attr.type();
-    attr_expressions_[attr_name] = param;
-    const int default_index = i - (attrs_.size() - args_with_defaults.size());
+    attr_expressions_[attr_name] = attr_api_name;
+    const int default_index = i - (attrs_.size() - params_with_default.size());
     if (default_index >= 0) {
-      const string& default_value = args_with_defaults[default_index].second;
-      strings::StrAppend(&result_, "  if ", param, " is None:\n");
-      strings::StrAppend(&result_, "    ", param, " = ", default_value, "\n");
+      const string& default_value = params_with_default[default_index].second;
+      strings::StrAppend(&result_, "  if ", attr_api_name, " is None:\n");
+      strings::StrAppend(&result_, "    ", attr_api_name, " = ", default_value,
+                         "\n");
     }
     if (attr_type.starts_with("list(")) {
-      ExpectListArg(param);
+      ExpectListArg(attr_api_name);
     }
 
     if (attr_type == "string") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_str(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name, " = _execute.make_str(",
+                         attr_api_name, ", \"", attr_api_name, "\")\n");
     } else if (attr_type == "list(string)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_str(_s, \"",
-                         param, "\") for _s in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_str(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
     } else if (attr_type == "int") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_int(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name, " = _execute.make_int(",
+                         attr_api_name, ", \"", attr_api_name, "\")\n");
     } else if (attr_type == "list(int)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_int(_i, \"",
-                         param, "\") for _i in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_int(_i, \"", attr_api_name,
+                         "\") for _i in ", attr_api_name, "]\n");
     } else if (attr_type == "float") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_float(",
-                         param, ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_float(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(float)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_float(_f, \"", param,
-                         "\") for _f in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_float(_f, \"", attr_api_name,
+                         "\") for _f in ", attr_api_name, "]\n");
     } else if (attr_type == "bool") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_bool(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_bool(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(bool)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_bool(_b, \"",
-                         param, "\") for _b in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_bool(_b, \"", attr_api_name,
+                         "\") for _b in ", attr_api_name, "]\n");
     } else if (attr_type == "type") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_type(", param,
-                         ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_type(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(type)") {
-      strings::StrAppend(&result_, "  ", param, " = [_execute.make_type(_t, \"",
-                         param, "\") for _t in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_type(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
     } else if (attr_type == "shape") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_shape(",
-                         param, ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_shape(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(shape)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_shape(_s, \"", param,
-                         "\") for _s in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_shape(_s, \"", attr_api_name,
+                         "\") for _s in ", attr_api_name, "]\n");
     } else if (attr_type == "tensor") {
-      strings::StrAppend(&result_, "  ", param, " = _execute.make_tensor(",
-                         param, ", \"", param, "\")\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = _execute.make_tensor(", attr_api_name, ", \"",
+                         attr_api_name, "\")\n");
     } else if (attr_type == "list(tensor)") {
-      strings::StrAppend(&result_, "  ", param,
-                         " = [_execute.make_tensor(_t, \"", param,
-                         "\") for _t in ", param, "]\n");
+      strings::StrAppend(&result_, "  ", attr_api_name,
+                         " = [_execute.make_tensor(_t, \"", attr_api_name,
+                         "\") for _t in ", attr_api_name, "]\n");
     } else if (attr_type != "func") {
       return strings::StrCat("# No definition for ", function_name_,
                              " since we don't support attrs with type\n"
@@ -481,16 +522,20 @@ string GenEagerPythonOp::Code() {
 
   bool eager_allowed = true;
   string ref_arg;
-  for (const auto& arg : op_def_.input_arg()) {
+  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
+    const auto& arg = op_def_.input_arg(i);
     if (arg.is_ref()) {
       eager_allowed = false;
-      ref_arg = arg.name();
+      DCHECK_EQ(op_def_.input_arg(i).name(), api_def_.in_arg(i).name());
+      ref_arg = api_def_.in_arg(i).rename_to();
     }
   }
-  for (const auto& arg : op_def_.output_arg()) {
+  for (int i = 0; i < op_def_.output_arg_size(); ++i) {
+    const auto& arg = op_def_.output_arg(i);
     if (arg.is_ref()) {
       eager_allowed = false;
-      ref_arg = arg.name();
+      DCHECK_EQ(op_def_.output_arg(i).name(), api_def_.out_arg(i).name());
+      ref_arg = api_def_.out_arg(i).rename_to();
     }
   }
 
@@ -528,6 +573,8 @@ string GenEagerPythonOp::Code() {
       strings::StrAppend(&result_, "  _result = _", op_def_.name(),
                          "Output._make(_result)\n");
     }
+  } else {
+    strings::StrAppend(&result_, "    _result = None\n");
   }
   strings::StrAppend(&result_, "  return _result\n\n");
   return prelude_ + result_;
@@ -548,6 +595,7 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
   // Figure out values for inferred attrs, and cast to eager tensors.
   for (int i = 0; i < op_def_.attr_size(); ++i) {
     const auto& attr(op_def_.attr(i));
+    const auto& api_def_attr(api_def_.attr(i));
     auto arg_list = attr_to_args_.find(attr.name());
     if (arg_list != attr_to_args_.end()) {
       if (attr.type() == "type") {
@@ -560,14 +608,15 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           strings::StrAppend(
               &conversion, ", ",
               python_op_gen_internal::AttrValueToPython(
-                  attr.type(), attr.default_value(), "_dtypes."));
+                  attr.type(), api_def_attr.default_value(), "_dtypes."));
         }
         strings::StrAppend(&conversion, ")");
         const string var_name = AttrVarName(attr.name(), &attr_expressions_);
         if (output_sizes.size() == 1) {
           // Avoid creating a temporary variable in the case where
           // we can easily assign to the right value directly.
-          const string inputs_var = param_names_[arg_list->second.front()];
+          const string inputs_var =
+              param_names_[arg_list->second.front()].GetRenameTo();
           if (output_sizes.front().empty()) {
             strings::StrAppend(&result_, "    ", var_name, ", (", inputs_var,
                                ",) = ", conversion, "\n");
@@ -584,13 +633,11 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           Unflatten("    ", output_sizes, inputs_var, &result_);
           std::vector<string> p;
           for (int j : arg_list->second) {
-            p.emplace_back(param_names_[j]);
+            p.emplace_back(param_names_[j].GetRenameTo());
           }
           strings::StrAppend(&result_, "    ", VectorToTuple(p), " = ",
                              inputs_var, "\n");
         }
-        strings::StrAppend(&result_, "    ", var_name, " = ", var_name,
-                           ".as_datatype_enum\n");
       } else if (attr.type() == "list(type)") {
         // NOTE: We ignore default values for these attrs, since it is
         // unclear how you would use it, and the one use case is
@@ -605,21 +652,18 @@ void GenEagerPythonOp::AddEagerInferredAttrs() {
           std::vector<string> lists;
           for (auto iter = arg_list->second.begin();
                iter != arg_list->second.end(); ++iter) {
-            lists.push_back(param_names_[*iter]);
+            lists.push_back(param_names_[*iter].GetRenameTo());
           }
           inputs_var = VectorToTuple(lists);
           conversion = "_execute.args_to_mixed_eager_tensors";
         } else {
           // For one list(tensor) argument, we just convert every
           // element of the list to an eager tensor.
-          inputs_var = param_names_[arg_list->second.front()];
+          inputs_var = param_names_[arg_list->second.front()].GetRenameTo();
           conversion = "_execute.convert_to_mixed_eager_tensors";
         }
         strings::StrAppend(&result_, "    ", var_name, ", ", inputs_var, " = ",
                            conversion, "(", inputs_var, ", _ctx)\n");
-        strings::StrAppend(&result_, "    ", var_name,
-                           " = [_t.as_datatype_enum for _t in ", var_name,
-                           "]\n");
       }
     }
   }
@@ -630,7 +674,7 @@ void GenEagerPythonOp::AddEagerInputCasts() {
   for (int i = 0; i < op_def_.input_arg_size(); ++i) {
     const auto& arg(op_def_.input_arg(i));
     if (!arg.type_attr().empty() || !arg.type_list_attr().empty()) continue;
-    const string& param = param_names_[i];
+    const string& param = param_names_[i].GetRenameTo();
     const string fn = arg.number_attr().empty() ? "" : "n_";
     const string dtype =
         python_op_gen_internal::DataTypeToPython(arg.type(), "_dtypes.");
@@ -667,7 +711,7 @@ void GenEagerPythonOp::AddEagerExecute(const string& num_outputs_expr) {
                      WordWrap(return_prefix, return_args, kRightMargin), "\n");
 }
 
-string GetEagerPythonOps(const OpList& ops,
+string GetEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                          const std::vector<string>& hidden_ops,
                          bool require_shapes,
                          const string& source_file_name = "") {
@@ -703,6 +747,7 @@ from tensorflow.python.framework import common_shapes as _common_shapes
 from tensorflow.python.framework import op_def_registry as _op_def_registry
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework import op_def_library as _op_def_library
+from tensorflow.python.util.tf_export import tf_export
 
 )");
 
@@ -732,7 +777,9 @@ from tensorflow.python.framework import op_def_library as _op_def_library
       continue;
     }
 
-    strings::StrAppend(&result, GetEagerPythonOp(op_def, function_name));
+    const auto* api_def = api_defs.GetApiDef(op_def.name());
+    strings::StrAppend(&result,
+                       GetEagerPythonOp(op_def, *api_def, function_name));
 
     if (!require_shapes) {
       strings::StrAppend(&result, "_ops.RegisterShape(\"", op_def.name(),
@@ -765,19 +812,21 @@ from tensorflow.python.framework import op_def_library as _op_def_library
 
 }  // namespace
 
-void PrintEagerPythonOps(const OpList& ops,
+void PrintEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                          const std::vector<string>& hidden_ops,
                          bool require_shapes, const string& source_file_name) {
-  printf("%s",
-         GetEagerPythonOps(ops, hidden_ops, require_shapes, source_file_name)
-             .c_str());
+  printf("%s", GetEagerPythonOps(ops, api_defs, hidden_ops, require_shapes,
+                                 source_file_name)
+                   .c_str());
 }
 
 string GetEagerPythonWrappers(const char* op_list_buf, size_t op_list_len) {
   string op_list_str(op_list_buf, op_list_len);
   OpList ops;
   ops.ParseFromString(op_list_str);
-  return GetEagerPythonOps(ops, {}, false);
+
+  ApiDefMap api_def_map(ops);
+  return GetEagerPythonOps(ops, api_def_map, {}, false);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/python/eager/python_eager_op_gen.h b/tensorflow/python/eager/python_eager_op_gen.h
index 250623850f2c04d5deb0924cc4043226e089d425..f9dfdf0408f2ea0cf72631e67266ec445b98a868 100644
--- a/tensorflow/python/eager/python_eager_op_gen.h
+++ b/tensorflow/python/eager/python_eager_op_gen.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -26,7 +27,7 @@ namespace tensorflow {
 // in the output. Prints the output to stdout.
 // Optional fourth argument is the name of the original C++ source file
 // where the ops' REGISTER_OP() calls reside.
-void PrintEagerPythonOps(const OpList& ops,
+void PrintEagerPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                          const std::vector<string>& hidden_ops,
                          bool require_shapes,
                          const string& source_file_name = "");
diff --git a/tensorflow/python/eager/python_eager_op_gen_main.cc b/tensorflow/python/eager/python_eager_op_gen_main.cc
index 9e4aa97ccc751fb022c92335dbe584540b950b6b..05351bd8b115ae07482b82166974e86758bc7712 100644
--- a/tensorflow/python/eager/python_eager_op_gen_main.cc
+++ b/tensorflow/python/eager/python_eager_op_gen_main.cc
@@ -20,15 +20,34 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 
 namespace tensorflow {
 namespace {
 
-void PrintAllPythonOps(const std::vector<string>& hidden_ops) {
+void PrintAllPythonOps(const std::vector<string>& hidden_ops,
+                       const std::vector<string>& api_def_dirs) {
   OpList ops;
   OpRegistry::Global()->Export(false, &ops);
-  PrintEagerPythonOps(ops, hidden_ops, true /* require_shapes */);
+
+  ApiDefMap api_def_map(ops);
+  if (!api_def_dirs.empty()) {
+    Env* env = Env::Default();
+
+    for (const auto& api_def_dir : api_def_dirs) {
+      std::vector<string> api_files;
+      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
+                                        &api_files));
+      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
+    }
+    api_def_map.UpdateDocs();
+  }
+
+  PrintEagerPythonOps(ops, api_def_map, hidden_ops, true /* require_shapes */);
 }
 
 }  // namespace
@@ -37,8 +56,15 @@ void PrintAllPythonOps(const std::vector<string>& hidden_ops) {
 int main(int argc, char* argv[]) {
   tensorflow::port::InitMain(argv[0], &argc, &argv);
 
+  // Usage:
+  //   python_eager_op_gen_main api_def_dir1,api_def_dir2,...
   if (argc == 1) {
-    tensorflow::PrintAllPythonOps({});
+    tensorflow::PrintAllPythonOps({}, {});
+  } else if (argc == 2) {
+    const std::vector<tensorflow::string> api_def_dirs =
+        tensorflow::str_util::Split(argv[1], ",",
+                                    tensorflow::str_util::SkipEmpty());
+    tensorflow::PrintAllPythonOps({}, api_def_dirs);
   } else {
     return -1;
   }
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 3adaea2c7913be134b0573780ddb881c219604e0..91192fea62dd3b0f94350a9b25ce8568e248e7e3 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/python/lib/core/py_seq_tensor.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
+#include "tensorflow/python/eager/pywrap_tensor.h"
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
 #include "tensorflow/c/c_api.h"
@@ -329,24 +330,9 @@ void EagerTensor_dealloc(EagerTensor* self) {
   // We have the global interpreter lock, so use this chance to perform delayed
   // refcount decrements.
   tensorflow::ClearDecrefCache();
-  PyObject* id = PyLong_FromLongLong(self->id);
-  PyObject* func = PyObject_GetAttrString(reinterpret_cast<PyObject*>(self),
-                                          "_delete_trace");
+  auto id = self->id;
   Py_TYPE(self)->tp_free(self);
-  self = nullptr;
-  // Note that we run `func` after calling `tp_free`. Otherwise calling that
-  // function can potentially trigger garbage collection that observes `self`
-  // in this half deleted state and crashes.
-  // Note that `func` is a staticmethod and does not need `self` to be around
-  // for running.
-  // We clear (and later restore) any errors that have already been set. Else
-  // these erorrs may appear randomly as part of the function execution.
-  PyObject *a, *b, *c;
-  PyErr_Fetch(&a, &b, &c);
-  PyObject_CallFunctionObjArgs(func, id, nullptr);
-  PyErr_Restore(a, b, c);
-  Py_DECREF(func);
-  Py_DECREF(id);
+  TFE_Py_TapeStackDeleteTrace(id);
 }
 
 // Getter for `_id`.
@@ -573,7 +559,7 @@ bool EagerTensor_CheckExact(const PyObject* o) {
   return Py_TYPE(o) == EagerTensorType;
 }
 
-TFE_TensorHandle* EagerTensorHandle(const PyObject* o) {
+TFE_TensorHandle* EagerTensor_Handle(const PyObject* o) {
   return reinterpret_cast<const EagerTensor*>(o)->handle;
 }
 
@@ -594,6 +580,11 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) {
   return reinterpret_cast<PyObject*>(t);
 }
 
+tensorflow::int64 EagerTensor_id(const PyObject* tensor) {
+  CHECK(EagerTensor_CheckExact(tensor));
+  return reinterpret_cast<const EagerTensor*>(tensor)->id;
+}
+
 PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   if (!PyType_Check(base_class)) {
     PyErr_SetString(
@@ -657,3 +648,69 @@ PyObject* TFE_Py_InitEagerTensor(PyObject* base_class) {
   EagerTensorType->tp_dictoffset = 0;
   return reinterpret_cast<PyObject*>(EagerTensorType);
 }
+
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim) {
+  if (!PyList_Check(tensor_list)) {
+    PyErr_SetString(PyExc_TypeError,
+                    tensorflow::strings::StrCat(
+                        "tensor_list argument must be a list. Got \"",
+                        Py_TYPE(tensor_list)->tp_name, "\"")
+                        .c_str());
+    return nullptr;
+  }
+  if (slice_dim < 0) {
+    PyErr_SetString(
+        PyExc_ValueError,
+        tensorflow::strings::StrCat("Slice dimension must be non-negative. "
+                                    "Got ",
+                                    slice_dim)
+            .c_str());
+    return nullptr;
+  }
+
+  Py_ssize_t num_tensors = PyList_Size(tensor_list);
+  int64_t num_tensors_int = static_cast<int64_t>(num_tensors);
+  auto tensor = tensorflow::make_safe(TF_AllocateTensor(
+      TF_INT32, &num_tensors_int, /*num_dims=*/1, /*len=*/4 * num_tensors_int));
+  int32_t* data = reinterpret_cast<int32_t*>(TF_TensorData(tensor.get()));
+  for (Py_ssize_t i = 0; i < num_tensors; ++i) {
+    PyObject* tensor_obj = PyList_GET_ITEM(tensor_list, i);
+    if (!EagerTensor_CheckExact(tensor_obj)) {
+      PyErr_SetString(PyExc_TypeError,
+                      tensorflow::strings::StrCat(
+                          "Expected a list of EagerTensors but "
+                          "element ",
+                          i, " has type \"", Py_TYPE(tensor_obj)->tp_name, "\"")
+                          .c_str());
+      return nullptr;
+    }
+
+    EagerTensor* t = reinterpret_cast<EagerTensor*>(tensor_obj);
+    TFE_TensorHandle* handle = t->handle;
+    if (slice_dim >= TFE_TensorHandleNumDims(handle)) {
+      PyErr_SetString(PyExc_IndexError,
+                      tensorflow::strings::StrCat(
+                          "Slice dimension (", slice_dim,
+                          ") must be smaller than rank of all "
+                          "tensors, but tensor at index ",
+                          i, " has rank ", TFE_TensorHandleNumDims(handle))
+                          .c_str());
+      return nullptr;
+    }
+    int64_t dim = TFE_TensorHandleDim(handle, slice_dim);
+    data[i] = dim;
+  }
+
+  auto status = tensorflow::make_safe(TF_NewStatus());
+  TFE_TensorHandle* handle = TFE_NewTensorHandle(tensor.get(), status.get());
+  if (TF_GetCode(status.get()) != TF_OK) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        tensorflow::strings::StrCat("Failed to construct new tensor handle: ",
+                                    TF_Message(status.get()))
+            .c_str());
+    return nullptr;
+  }
+
+  return EagerTensorFromHandle(handle);
+}
diff --git a/tensorflow/python/eager/pywrap_tensor.h b/tensorflow/python/eager/pywrap_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa1efdd1b81cca9df0088c4cecedfe52f258d2bc
--- /dev/null
+++ b/tensorflow/python/eager/pywrap_tensor.h
@@ -0,0 +1,25 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
+#define TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
+
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/lib/core/numpy.h"
+
+bool EagerTensor_CheckExact(const PyObject* o);
+tensorflow::int64 EagerTensor_id(const PyObject* tensor);
+
+#endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TENSOR_H_
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 9834095c87a20cf5def393cc3c8f60d513dfa511..a33b17ada6f94e43ac16696c502be4b885e9d33a 100644
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -81,28 +81,67 @@ bool EagerTensor_CheckExact(const PyObject* o);
 PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle);
 
 // Extracts the handle inside EagerTensor object `o`. Returns nullptr on error.
-TFE_TensorHandle* EagerTensorHandle(const PyObject* o);
+TFE_TensorHandle* EagerTensor_Handle(const PyObject* o);
 
 // Creates the `EagerTensor` class by subclassing `base_class` and returns the
 // newly created type, or nullptr on error.
 PyObject* TFE_Py_InitEagerTensor(PyObject* base_class);
 
-PyObject* TFE_Py_NewTape();
-PyObject* TFE_Py_TapeShouldRecord(PyObject* py_tape, PyObject* tensors);
-void TFE_Py_TapeWatch(PyObject* tape, tensorflow::int64 tensor_id);
-void TFE_Py_TapeDeleteTrace(PyObject* tape, tensorflow::int64 tensor_id);
-
-// Records an operation in the gradient tape. `tape` should point to an object
-// returned by TFE_Py_NewTape. op_type is a string for the operation type, used
-// in the backprop code. output_tensors should be a list of python ops.Tensor
-// objects. input_tensor_ids should be a list of python integers with the ids of
-// the input tensors of the recorded operation. backward_function should be the
-// function to be called during backprop to, given the gradients of the output
-// tensors, produce the gradients of the input tensors.
-void TFE_Py_TapeRecordOperation(PyObject* tape, PyObject* op_type,
-                                PyObject* output_tensors,
-                                PyObject* input_tensor_ids,
-                                PyObject* backward_function);
-PyObject* TFE_Py_TapeExport(PyObject* tape);
+// Pushes a new tape into the thread-local stack.
+// `persistent` must be a PyBool_Type, i.e either Py_True or Py_False
+void TFE_Py_TapeStackPushNew(PyObject* persistent);
+
+// Pops the tape from the top of the stack and returns it.
+PyObject* TFE_Py_TapeStackPop();
+
+// Pushes an existing tape onto the stack.
+void TFE_Py_TapeStackPush(PyObject* tape);
+
+// Returns true if the tape stack is empty.
+PyObject* TFE_Py_TapeStackIsEmpty();
+
+PyObject* TFE_Py_TapeStackShouldRecord(PyObject* tensors);
+void TFE_Py_TapeStackWatch(PyObject* tensor);
+void TFE_Py_TapeStackDeleteTrace(tensorflow::int64 tensor_id);
+
+// Records an operation in the gradient tape stack.type is a string for the
+// operation type, used in the backprop code. output_tensors should be a list of
+// python ops.Tensor objects. input_tensor_ids should be a list of python
+// integers with the ids of the input tensors of the recorded
+// operation. backward_function should be the function to be called during
+// backprop to, given the gradients of the output tensors, produce the gradients
+// of the input tensors.
+void TFE_Py_TapeStackRecordOperation(PyObject* op_type,
+                                     PyObject* output_tensors,
+                                     PyObject* input_tensor_ids,
+                                     PyObject* backward_function);
+
+// Watches the given variable object on the given tape.
+void TFE_Py_TapeStackWatchVariable(PyObject* variable);
+
+// Computes a gradient based on information recorded on the tape.`tape` must
+// have been produced by TFE_Py_NewTape. `vspace` must be a
+// imperative_grad.py:VSpace named tuple. `target` and `sources` must be python
+// lists of Tensor objects. `output_gradients` is either None or a python list
+// of either Tensor or None, and if not None should have the same length as
+// target.
+PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
+                              PyObject* target, PyObject* sources,
+                              PyObject* output_gradients, TF_Status* status);
+
+// Returns the set of variables watched by the given tape.
+PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape);
+
+// Returns an EagerTensor of dimension [len(`tensor_list`)] containing
+// the `slice_dim`'th dimension of each tensor in `tensor_list`. In other words,
+// TFE_Py_TensorShapeSlice takes a slice of dimensions of tensors in
+// `tensor_list`. For example, if `tensor_list` contains tensors of with shapes
+// [1, 2, 3], [4, 5], [6, 7, 8, 9], TFE_Py_TensorShapeSlice called with
+// `slice_dim` equal to 1 will return [2, 5, 7].
+// On error, returns nullptr and sets python exception.
+// REQUIRES: `tensor_list` is a python list of EagerTensors
+// REQUIRES: `slice_dim` is non-negative and smaller than the rank of all
+//   tensors in `tensor_list`.
+PyObject* TFE_Py_TensorShapeSlice(PyObject* tensor_list, int slice_dim);
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 7456eb10f867e797e32e314159b70b3e06b3d01d..b52d71dc6c5dbb67c04b10dc69d34a8943094f0d 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -13,13 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <thread>
+
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
 #include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/tape.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/eager/pywrap_tensor.h"
 
 using tensorflow::string;
 
@@ -440,10 +445,59 @@ void TFE_DeleteContextCapsule(PyObject* context) {
   TF_DeleteStatus(status);
 }
 
+static tensorflow::int64 MakeInt(PyObject* integer) {
+#if PY_MAJOR_VERSION >= 3
+  return PyLong_AsLong(integer);
+#else
+  return PyInt_AsLong(integer);
+#endif
+}
+
+static tensorflow::int64 FastTensorId(PyObject* tensor) {
+  if (EagerTensor_CheckExact(tensor)) {
+    return EagerTensor_id(tensor);
+  }
+  PyObject* id_field = PyObject_GetAttrString(tensor, "_id");
+  if (id_field == nullptr) {
+    return -1;
+  }
+  tensorflow::int64 id = MakeInt(id_field);
+  Py_DECREF(id_field);
+  return id;
+}
+
+class GradientTape
+    : public tensorflow::eager::GradientTape<PyObject, PyObject> {
+ public:
+  explicit GradientTape(bool persistent)
+      : tensorflow::eager::GradientTape<PyObject, PyObject>(persistent) {}
+
+  void WatchVariable(PyObject* v) {
+    watched_variables_.insert(v);
+    Py_INCREF(v);
+    PyObject* handle = PyObject_GetAttrString(v, "handle");
+    if (handle == nullptr) {
+      return;
+    }
+    tensorflow::int64 id = FastTensorId(handle);
+    Py_DECREF(handle);
+    if (!PyErr_Occurred()) {
+      this->Watch(id);
+    }
+  }
+
+  const std::unordered_set<PyObject*> WatchedVariables() {
+    return watched_variables_;
+  }
+
+ private:
+  std::unordered_set<PyObject*> watched_variables_;
+};
+
 typedef struct {
   PyObject_HEAD
       /* Type-specific fields go here. */
-      tensorflow::eager::GradientTape* tape;
+      GradientTape* tape;
 } TFE_Py_Tape;
 
 static void TFE_Py_Tape_Delete(PyObject* tape) {
@@ -474,20 +528,62 @@ static PyTypeObject TFE_Py_Tape_Type = {
     "TFE_Py_Tape objects",                        /* tp_doc */
 };
 
-PyObject* TFE_Py_NewTape() {
+// xcode 7 doesn't define thread_local, so for compatibility we implement our
+// own. TODO(apassos) remove once we can deprecate xcode 7.
+#ifndef __APPLE__
+std::vector<TFE_Py_Tape*>* GetTapeStack() {
+  thread_local std::vector<TFE_Py_Tape*> tape_stack;
+  return &tape_stack;
+}
+#else
+static tensorflow::mutex stack_mu(tensorflow::LINKER_INITIALIZED);
+static std::unordered_map<std::thread::id, std::vector<TFE_Py_Tape*>*>*
+    tape_stack GUARDED_BY(stack_mu) = nullptr;
+std::vector<TFE_Py_Tape*>* GetTapeStack() {
+  tensorflow::mutex_lock ml(stack_mu);
+  if (tape_stack == nullptr) {
+    tape_stack =
+        new std::unordered_map<std::thread::id, std::vector<TFE_Py_Tape*>*>;
+  }
+  auto it = tape_stack->find(std::this_thread::get_id());
+  if (it != tape_stack->end()) {
+    return it->second;
+  }
+  return tape_stack
+      ->emplace(std::this_thread::get_id(), new std::vector<TFE_Py_Tape*>)
+      .first->second;
+}
+#endif
+
+void TFE_Py_TapeStackPushNew(PyObject* persistent) {
   TFE_Py_Tape_Type.tp_new = PyType_GenericNew;
-  if (PyType_Ready(&TFE_Py_Tape_Type) < 0) return nullptr;
+  if (PyType_Ready(&TFE_Py_Tape_Type) < 0) return;
   TFE_Py_Tape* tape = PyObject_NEW(TFE_Py_Tape, &TFE_Py_Tape_Type);
-  tape->tape = new tensorflow::eager::GradientTape();
-  return reinterpret_cast<PyObject*>(tape);
+  tape->tape = new GradientTape(persistent == Py_True);
+  GetTapeStack()->push_back(tape);
 }
 
-static tensorflow::int64 MakeInt(PyObject* integer) {
-#if PY_MAJOR_VERSION >= 3
-  return PyLong_AsLong(integer);
-#else
-  return PyInt_AsLong(integer);
-#endif
+void TFE_Py_TapeStackPush(PyObject* tape) {
+  Py_INCREF(tape);
+  GetTapeStack()->push_back(reinterpret_cast<TFE_Py_Tape*>(tape));
+}
+
+PyObject* TFE_Py_TapeStackIsEmpty() {
+  if (GetTapeStack()->empty()) {
+    Py_RETURN_TRUE;
+  }
+  Py_RETURN_FALSE;
+}
+
+PyObject* TFE_Py_TapeStackPop() {
+  auto* stack = GetTapeStack();
+  if (stack->empty()) {
+    PyErr_SetString(PyExc_RuntimeError, "tape stack is empty.");
+    return nullptr;
+  }
+  TFE_Py_Tape* top = stack->back();
+  stack->pop_back();
+  return reinterpret_cast<PyObject*>(top);
 }
 
 static std::vector<tensorflow::int64> MakeIntList(PyObject* list) {
@@ -514,23 +610,54 @@ static std::vector<tensorflow::int64> MakeIntList(PyObject* list) {
   return tensor_ids;
 }
 
-PyObject* TFE_Py_TapeShouldRecord(PyObject* py_tape, PyObject* tensors) {
-  TFE_Py_Tape* tape = reinterpret_cast<TFE_Py_Tape*>(py_tape);
-  return PyBool_FromLong(tape->tape->ShouldRecord(MakeIntList(tensors)));
+PyObject* TFE_Py_TapeStackShouldRecord(PyObject* tensors) {
+  if (tensors == Py_None) {
+    Py_RETURN_FALSE;
+  }
+  auto* stack = GetTapeStack();
+  if (stack->empty()) {
+    Py_RETURN_FALSE;
+  }
+  PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
+  if (seq == nullptr) {
+    return nullptr;
+  }
+  int len = PySequence_Fast_GET_SIZE(seq);
+  // TODO(apassos) consider not building a list and changing the API to check
+  // each tensor individually.
+  std::vector<tensorflow::int64> tensor_ids;
+  tensor_ids.reserve(len);
+  for (int i = 0; i < len; ++i) {
+    PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
+    tensor_ids.push_back(FastTensorId(item));
+  }
+  Py_DECREF(seq);
+  for (TFE_Py_Tape* tape : *stack) {
+    if (tape->tape->ShouldRecord(tensor_ids)) {
+      Py_RETURN_TRUE;
+    }
+  }
+  Py_RETURN_FALSE;
 }
 
-void TFE_Py_TapeWatch(PyObject* tape, tensorflow::int64 tensor_id) {
-  reinterpret_cast<TFE_Py_Tape*>(tape)->tape->Watch(tensor_id);
+void TFE_Py_TapeStackWatch(PyObject* tensor) {
+  tensorflow::int64 tensor_id = FastTensorId(tensor);
+  if (PyErr_Occurred()) {
+    return;
+  }
+  for (TFE_Py_Tape* tape : *GetTapeStack()) {
+    tape->tape->Watch(tensor_id);
+  }
 }
 
-// TODO(apassos) have a fast path for eager tensors here which gets information
-// from the handle instead of from the python object, and use this only for the
-// case of graph tensors.
 static tensorflow::eager::TapeTensor TapeTensorFromTensor(PyObject* tensor) {
-  PyObject* id_field = PyObject_GetAttrString(tensor, "_id");
-  tensorflow::int64 id = MakeInt(id_field);
-  Py_DECREF(id_field);
-  if (PyErr_Occurred() != nullptr) {
+  if (EagerTensor_CheckExact(tensor)) {
+    TFE_TensorHandle* t = EagerTensor_Handle(tensor);
+    tensorflow::int64 id = EagerTensor_id(tensor);
+    return tensorflow::eager::TapeTensor{id, t->t.dtype(), t->t.shape()};
+  }
+  tensorflow::int64 id = FastTensorId(tensor);
+  if (PyErr_Occurred()) {
     return tensorflow::eager::TapeTensor{
         id, static_cast<tensorflow::DataType>(0), tensorflow::TensorShape({})};
   }
@@ -563,11 +690,52 @@ static tensorflow::eager::TapeTensor TapeTensorFromTensor(PyObject* tensor) {
   return tensorflow::eager::TapeTensor{id, dtype, shape};
 }
 
-void TFE_Py_TapeRecordOperation(PyObject* tape, PyObject* op_type,
-                                PyObject* output_tensors,
-                                PyObject* input_tensor_ids,
-                                PyObject* backward_function) {
-  std::vector<tensorflow::int64> input_ids = MakeIntList(input_tensor_ids);
+std::vector<tensorflow::int64> MakeTensorIDList(PyObject* tensors) {
+  PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
+  if (seq == nullptr) {
+    return {};
+  }
+  int len = PySequence_Fast_GET_SIZE(seq);
+  std::vector<tensorflow::int64> list;
+  list.reserve(len);
+  for (int i = 0; i < len; ++i) {
+    PyObject* tensor = PySequence_Fast_GET_ITEM(seq, i);
+    list.push_back(FastTensorId(tensor));
+    if (PyErr_Occurred()) {
+      Py_DECREF(seq);
+      return list;
+    }
+  }
+  Py_DECREF(seq);
+  return list;
+}
+
+void TFE_Py_TapeStackWatchVariable(PyObject* variable) {
+  for (TFE_Py_Tape* tape : *GetTapeStack()) {
+    tape->tape->WatchVariable(variable);
+  }
+}
+
+PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape) {
+  const std::unordered_set<PyObject*>& watched_variables =
+      reinterpret_cast<TFE_Py_Tape*>(tape)->tape->WatchedVariables();
+  PyObject* result = PySet_New(nullptr);
+  for (PyObject* variable : watched_variables) {
+    PySet_Add(result, variable);
+    Py_DECREF(variable);
+  }
+  return result;
+}
+
+void TFE_Py_TapeStackRecordOperation(PyObject* op_type,
+                                     PyObject* output_tensors,
+                                     PyObject* input_tensors,
+                                     PyObject* backward_function) {
+  auto* stack = GetTapeStack();
+  if (stack->empty()) {
+    return;
+  }
+  std::vector<tensorflow::int64> input_ids = MakeTensorIDList(input_tensors);
   std::vector<tensorflow::eager::TapeTensor> output_info;
   PyObject* seq = PySequence_Fast(output_tensors,
                                   "expected a sequence of integer tensor ids");
@@ -582,74 +750,253 @@ void TFE_Py_TapeRecordOperation(PyObject* tape, PyObject* op_type,
     }
   }
   Py_DECREF(seq);
-  Py_INCREF(backward_function);
-  reinterpret_cast<TFE_Py_Tape*>(tape)->tape->RecordOperation(
-      PyBytes_AsString(op_type), output_info, input_ids, backward_function,
-      [backward_function]() { Py_DECREF(backward_function); });
-}
-
-void TFE_Py_TapeDeleteTrace(PyObject* tape, tensorflow::int64 tensor_id) {
-  reinterpret_cast<TFE_Py_Tape*>(tape)->tape->DeleteTrace(tensor_id);
-}
-
-// TODO(apassos) when backprop.py moves to C most of this exporting logic can
-// disappear.
-PyObject* TFE_Py_TapeExport(PyObject* tape) {
-  std::pair<tensorflow::eager::TensorTape, tensorflow::eager::OpTape> exported =
-      reinterpret_cast<TFE_Py_Tape*>(tape)->tape->Export();
-  PyObject* tensor_tape = PyDict_New();
-  for (const auto& pair : exported.first) {
-    PyObject* tid = PyLong_FromLong(pair.first);
-    PyObject* opid = PyLong_FromLong(pair.second);
-    PyDict_SetItem(tensor_tape, tid, opid);
-    Py_DECREF(tid);
-    Py_DECREF(opid);
-  }
-
-  PyObject* op_tape = PyDict_New();
-  for (const auto& pair : exported.second) {
-    PyObject* opid = PyLong_FromLong(pair.first);
-    const auto& entry = pair.second;
-    PyObject* op_type = PyBytes_FromString(entry.op_type.c_str());
-    PyObject* output_ids = PyList_New(entry.output_tensor_info.size());
-    for (int i = 0; i < entry.output_tensor_info.size(); ++i) {
-      PyObject* tid = PyLong_FromLong(entry.output_tensor_info[i].id);
-      PyList_SET_ITEM(output_ids, i, tid);
+  string op_type_str;
+  if (PyBytes_Check(op_type)) {
+    op_type_str = PyBytes_AsString(op_type);
+  } else if (PyUnicode_Check(op_type)) {
+#if PY_MAJOR_VERSION >= 3
+    op_type_str = PyUnicode_AsUTF8(op_type);
+#else
+    PyObject* py_str = PyUnicode_AsUTF8String(op_type);
+    if (py_str == nullptr) return;
+    op_type_str = PyBytes_AS_STRING(py_str);
+    Py_DECREF(py_str);
+#endif
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, "op_type should be a string.");
+    return;
+  }
+
+  for (TFE_Py_Tape* tape : *stack) {
+    Py_INCREF(backward_function);
+    tape->tape->RecordOperation(
+        op_type_str, output_info, input_ids, backward_function,
+        [backward_function]() { Py_DECREF(backward_function); });
+  }
+}
+
+void TFE_Py_TapeStackDeleteTrace(tensorflow::int64 tensor_id) {
+  for (TFE_Py_Tape* tape : *GetTapeStack()) {
+    tape->tape->DeleteTrace(tensor_id);
+  }
+}
+
+class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyObject> {
+ public:
+  explicit PyVSpace(PyObject* py_vspace) : py_vspace_(py_vspace) {}
+
+  tensorflow::Status Initialize() {
+    num_elements_ = PyObject_GetAttrString(py_vspace_, "num_elements_fn");
+    if (num_elements_ == nullptr) {
+      return tensorflow::errors::InvalidArgument("invalid vspace");
+    }
+    aggregate_fn_ = PyObject_GetAttrString(py_vspace_, "aggregate_fn");
+    if (aggregate_fn_ == nullptr) {
+      return tensorflow::errors::InvalidArgument("invalid vspace");
+    }
+    zeros_ = PyObject_GetAttrString(py_vspace_, "zeros");
+    if (zeros_ == nullptr) {
+      return tensorflow::errors::InvalidArgument("invalid vspace");
+    }
+    ones_ =
+        PyObject_GetAttrString(reinterpret_cast<PyObject*>(py_vspace_), "ones");
+    if (ones_ == nullptr) {
+      return tensorflow::errors::InvalidArgument("invalid vspace");
+    }
+    return tensorflow::Status::OK();
+  }
+
+  ~PyVSpace() override {
+    Py_XDECREF(num_elements_);
+    Py_XDECREF(aggregate_fn_);
+    Py_XDECREF(zeros_);
+    Py_XDECREF(ones_);
+  }
+
+  tensorflow::int64 NumElements(PyObject* tensor) const final {
+    PyObject* arglist =
+        Py_BuildValue("(O)", reinterpret_cast<PyObject*>(tensor));
+    PyObject* result = PyEval_CallObject(num_elements_, arglist);
+    tensorflow::int64 r = MakeInt(result);
+    Py_DECREF(result);
+    Py_DECREF(arglist);
+    return r;
+  }
+
+  PyObject* AggregateGradients(
+      tensorflow::gtl::ArraySlice<PyObject*> gradient_tensors) const final {
+    PyObject* list = PyList_New(gradient_tensors.size());
+    for (int i = 0; i < gradient_tensors.size(); ++i) {
+      // Note: stealing a reference to the gradient tensors.
+      CHECK(gradient_tensors[i] != nullptr);
+      CHECK(gradient_tensors[i] != Py_None);
+      PyList_SET_ITEM(list, i,
+                      reinterpret_cast<PyObject*>(gradient_tensors[i]));
+    }
+    PyObject* arglist = Py_BuildValue("(O)", list);
+    CHECK(arglist != nullptr);
+    PyObject* result = PyEval_CallObject(aggregate_fn_, arglist);
+    Py_DECREF(arglist);
+    Py_DECREF(list);
+    return result;
+  }
+
+  PyObject* Zeros(tensorflow::TensorShape shape,
+                  tensorflow::DataType dtype) const final {
+    PyObject* py_shape = PyTuple_New(shape.dims());
+    for (int i = 0; i < shape.dims(); ++i) {
+      PyTuple_SET_ITEM(py_shape, i, PyLong_FromLong(shape.dim_size(i)));
+    }
+    PyObject* py_dtype = PyLong_FromLong(static_cast<int>(dtype));
+    PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
+    PyObject* result = PyEval_CallObject(zeros_, arg_list);
+    Py_DECREF(arg_list);
+    Py_DECREF(py_dtype);
+    Py_DECREF(py_shape);
+    return reinterpret_cast<PyObject*>(result);
+  }
+
+  PyObject* Ones(tensorflow::TensorShape shape,
+                 tensorflow::DataType dtype) const final {
+    PyObject* py_shape = PyTuple_New(shape.dims());
+    for (int i = 0; i < shape.dims(); ++i) {
+      PyTuple_SET_ITEM(py_shape, i, PyLong_FromLong(shape.dim_size(i)));
+    }
+    PyObject* py_dtype = PyLong_FromLong(static_cast<int>(dtype));
+    PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
+    PyObject* result = PyEval_CallObject(ones_, arg_list);
+    Py_DECREF(arg_list);
+    Py_DECREF(py_dtype);
+    Py_DECREF(py_shape);
+    return result;
+  }
+
+  tensorflow::Status CallBackwardFunction(
+      PyObject* backward_function,
+      tensorflow::gtl::ArraySlice<PyObject*> output_gradients,
+      std::vector<PyObject*>* result) const final {
+    PyObject* grads = PyTuple_New(output_gradients.size());
+    for (int i = 0; i < output_gradients.size(); ++i) {
+      if (output_gradients[i] == nullptr) {
+        Py_INCREF(Py_None);
+        PyTuple_SET_ITEM(grads, i, Py_None);
+      } else {
+        PyTuple_SET_ITEM(grads, i,
+                         reinterpret_cast<PyObject*>(output_gradients[i]));
+      }
     }
-    PyObject* input_ids = PyList_New(entry.input_tensor_id.size());
-    for (int i = 0; i < entry.input_tensor_id.size(); ++i) {
-      PyObject* tid = PyLong_FromLong(entry.input_tensor_id[i]);
-      PyList_SET_ITEM(input_ids, i, tid);
+    PyObject* py_result = PyEval_CallObject(
+        reinterpret_cast<PyObject*>(backward_function), grads);
+    Py_DECREF(grads);
+    if (py_result == nullptr) {
+      return tensorflow::errors::Internal("gradient function threw exceptions");
     }
-    PyObject* backward_function =
-        reinterpret_cast<PyObject*>(entry.backward_function);
-    PyObject* output_shape_and_dtype =
-        PyList_New(entry.output_tensor_info.size());
-    for (int i = 0; i < entry.output_tensor_info.size(); ++i) {
-      const tensorflow::TensorShape& shape = entry.output_tensor_info[i].shape;
-      PyObject* shape_list = PyList_New(shape.dims());
-      for (int j = 0; j < shape.dims(); ++j) {
-        PyList_SET_ITEM(shape_list, j, PyLong_FromLong(shape.dim_size(j)));
+    result->clear();
+    PyObject* seq =
+        PySequence_Fast(py_result, "expected a sequence of gradients");
+    if (seq == nullptr) {
+      return tensorflow::errors::InvalidArgument(
+          "gradient function did not return a list");
+    }
+    int len = PySequence_Fast_GET_SIZE(seq);
+    VLOG(1) << "Gradient length is " << len;
+    result->reserve(len);
+    for (int i = 0; i < len; ++i) {
+      PyObject* item = PySequence_Fast_GET_ITEM(seq, i);
+      if (item == Py_None) {
+        result->push_back(nullptr);
+      } else {
+        Py_INCREF(item);
+        result->push_back(item);
       }
-      PyObject* type_enum = PyLong_FromLong(entry.output_tensor_info[i].dtype);
-      PyObject* tuple = PyTuple_Pack(2, shape_list, type_enum);
-      Py_DECREF(shape_list);
-      Py_DECREF(type_enum);
-      PyList_SET_ITEM(output_shape_and_dtype, i, tuple);
     }
-    PyObject* opinfo = PyTuple_Pack(5, op_type, output_ids, input_ids,
-                                    backward_function, output_shape_and_dtype);
-    Py_DECREF(op_type);
-    Py_DECREF(output_ids);
-    Py_DECREF(input_ids);
+    Py_DECREF(seq);
+    Py_DECREF(py_result);
+    return tensorflow::Status::OK();
+  }
+
+  void ReleaseBackwardFunction(PyObject* backward_function) const final {
     Py_DECREF(backward_function);
-    Py_DECREF(output_shape_and_dtype);
-    PyDict_SetItem(op_tape, opid, opinfo);
-    Py_DECREF(opid);
-    Py_DECREF(opinfo);
-  }
-  PyObject* retval = PyTuple_Pack(2, tensor_tape, op_tape);
-  Py_DECREF(tensor_tape);
-  Py_DECREF(op_tape);
-  return retval;
+  }
+
+  void DeleteGradient(PyObject* tensor) const final { Py_XDECREF(tensor); }
+
+ private:
+  PyObject* py_vspace_;
+
+  PyObject* num_elements_;
+  PyObject* aggregate_fn_;
+  PyObject* zeros_;
+  PyObject* ones_;
+};
+
+std::vector<PyObject*> MakeTensorList(PyObject* tensors) {
+  PyObject* seq = PySequence_Fast(tensors, "expected a sequence");
+  if (seq == nullptr) {
+    return {};
+  }
+  int len = PySequence_Fast_GET_SIZE(seq);
+  std::vector<PyObject*> list;
+  list.reserve(len);
+  for (int i = 0; i < len; ++i) {
+    list.push_back(PySequence_Fast_GET_ITEM(seq, i));
+  }
+  Py_DECREF(seq);
+  return list;
+}
+
+
+PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
+                              PyObject* target, PyObject* sources,
+                              PyObject* output_gradients, TF_Status* status) {
+  PyVSpace c_vspace(vspace);
+  if (!c_vspace.Initialize().ok()) {
+    return nullptr;
+  }
+
+  std::vector<tensorflow::int64> target_vec = MakeTensorIDList(target);
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
+  std::vector<tensorflow::int64> sources_vec = MakeTensorIDList(sources);
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
+  std::vector<PyObject*> outgrad_vec;
+  if (output_gradients != Py_None) {
+    outgrad_vec = MakeTensorList(output_gradients);
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
+    for (PyObject* tensor : outgrad_vec) {
+      // Calling the backward function will eat a reference to the tensors in
+      // outgrad_vec, so we need to increase their reference count.
+      Py_INCREF(tensor);
+    }
+  }
+  TFE_Py_Tape* tape_obj = reinterpret_cast<TFE_Py_Tape*>(tape);
+  std::vector<PyObject*> result;
+  status->status = tape_obj->tape->ComputeGradient(
+      c_vspace, target_vec, sources_vec, outgrad_vec, &result);
+  if (!status->status.ok()) {
+    if (PyErr_Occurred()) {
+      // Do not propagate the erroneous status as that would swallow the
+      // exception which caused the problem.
+      status->status = tensorflow::Status::OK();
+    }
+    return nullptr;
+  }
+  if (!result.empty()) {
+    PyObject* py_result = PyList_New(result.size());
+    for (int i = 0; i < result.size(); ++i) {
+      if (result[i] == nullptr) {
+        Py_INCREF(Py_None);
+        result[i] = Py_None;
+      }
+      PyList_SET_ITEM(py_result, i, reinterpret_cast<PyObject*>(result[i]));
+    }
+    return py_result;
+  }
+  Py_INCREF(Py_None);
+  return Py_None;
 }
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index c16aa8c2f7eb48002acd354b20f8ca06febcc6f7..14b5238f74039ec23bd197699de68c4c0254e8d3 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -18,116 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import contextlib
-import threading
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.util import compat
-
-
-def tid(tensor):
-  return tensor._id  # pylint: disable=protected-access
-
-
-class TapeEntry(
-    collections.namedtuple("TapeEntry", [
-        "op_type",
-        "output_ids", "input_ids", "backward_function",
-        "output_shape_and_dtype",
-    ])):
-  """Entry in the gradient tape.
-
-  Represents the execution of one op or function, with instructions for doing
-  its backward pass and useful information for it.
-
-  Args:
-   output_ids: tensor_id(t) for each output tensor T
-   input_ids: tensor_id(t) for each input tensor T
-   backward_function: function to be called with the downstream gradients and
-    side outputs as arguments which computes the backward pass.
-   output_shape_and_dtype: a list of (shape_tuple, dtype) for every output
-    tensor_id
-  """
-
-
-def _tensor_shape(t):
-  return t._shape_tuple()  # pylint: disable=protected-access
 
 
 class Tape(object):
   """Represents a gradient propagation trace."""
 
-  def __init__(self):
-    self._tape = pywrap_tensorflow.TFE_Py_NewTape()
-    self._watched_variables = set()
-
-  def should_record(self, tensors):
-    """Returns true if any tensor should be recorded.
-
-    Args:
-      tensors: some tensors.
-
-    Returns:
-      True if any of the tensors is in the tape.
-    """
-    return pywrap_tensorflow.TFE_Py_TapeShouldRecord(
-        self._tape, [x._id  for x in tensors])  # pylint: disable=protected-access
-
-  def watch(self, tensor):
-    """Adds a tensor to the tape."""
-    pywrap_tensorflow.TFE_Py_TapeWatch(self._tape, tid(tensor))
+  def __init__(self, tape):
+    self._tape = tape
 
-  def watch_variable(self, v):
-    self._watched_variables.add(v)
-    self.watch(v.handle)
+  def watched_variables(self):
+    return pywrap_tensorflow.TFE_Py_TapeWatchedVariables(self._tape)
 
-  def record_operation(self, op_type, output_tensors, input_tensors,
-                       backward_function):
-    """Records an operation in the tape."""
-    pywrap_tensorflow.TFE_Py_TapeRecordOperation(
-        self._tape,
-        compat.as_bytes(op_type),
-        output_tensors,
-        [x._id for x in input_tensors],  # pylint: disable=protected-access
-        backward_function)
 
-  def _delete_tensor_id(self, i):
-    pywrap_tensorflow.TFE_Py_TapeDeleteTrace(self._tape, i)
-
-  def delete_trace(self, tensor_id):
-    """Deletes any trace we have for this tensor."""
-    self._delete_tensor_id(tensor_id)
-
-  def export(self):
-    """Exports the internal state of this tape.
-
-    Returns:
-      tensor_tape: a map from tensor_id(tensor) to <identifier for op>
-       responsible for generating that tensor.
-      op_tape: a map from <identifier for op> to TapeEntry for that op.
-    """
-    return pywrap_tensorflow.TFE_Py_TapeExport(self._tape)
-
-
-class _TapeStack(threading.local):
-
-  def __init__(self):
-    super(_TapeStack, self).__init__()
-    self._stack = []
-
-  @property
-  def stack(self):
-    return self._stack
-
-
-# The global tape stack.
-_tape_stack = _TapeStack()
-
-
-def push_new_tape():
+def push_new_tape(persistent=False):
   """Pushes a new tape onto the tape stack."""
-  _tape_stack.stack.append(Tape())
+  pywrap_tensorflow.TFE_Py_TapeStackPushNew(persistent)
 
 
 def watch(tensor):
@@ -136,8 +44,7 @@ def watch(tensor):
   Args:
     tensor: tensor to be watched.
   """
-  for t in _tape_stack.stack:
-    t.watch(tensor)
+  pywrap_tensorflow.TFE_Py_TapeStackWatch(tensor)
 
 
 def watch_variable(variable):
@@ -146,53 +53,42 @@ def watch_variable(variable):
   Args:
     variable: variable to be watched.
   """
-  for t in _tape_stack.stack:
-    t.watch_variable(variable)
+  pywrap_tensorflow.TFE_Py_TapeStackWatchVariable(variable)
 
 
 def pop_tape():
   """Pops the top tape in the stack, if any."""
-  if _tape_stack.stack:
-    return _tape_stack.stack.pop()
-  return None
+  return Tape(pywrap_tensorflow.TFE_Py_TapeStackPop())
 
 
 @contextlib.contextmanager
 def stop_recording():
-  old = _tape_stack.stack
-  _tape_stack._stack = []  # pylint: disable=protected-access
+  stack = []
+  while not pywrap_tensorflow.TFE_Py_TapeStackIsEmpty():
+    stack.append(pop_tape()._tape)  # pylint: disable=protected-access
   try:
     yield
   finally:
-    _tape_stack._stack = old  # pylint: disable=protected-access
+    for tape in reversed(stack):
+      pywrap_tensorflow.TFE_Py_TapeStackPush(tape)
 
 
 def should_record(tensors):
   """Returns true if any tape in the stack watches any of these tensors."""
-  if not _tape_stack.stack:
-    return False
-  return any(x.should_record(tensors) for x in _tape_stack.stack)
+  return pywrap_tensorflow.TFE_Py_TapeStackShouldRecord(tensors)
 
 
 def record_operation(op_type, output_tensors, input_tensors, backward_function):
   """Records the operation on all tapes in the stack."""
-  for t in _tape_stack.stack:
-    t.record_operation(op_type, output_tensors,
-                       input_tensors,
-                       backward_function)
+  pywrap_tensorflow.TFE_Py_TapeStackRecordOperation(
+      op_type, output_tensors, input_tensors, backward_function)
 
 
 def delete_trace(tensor_id):
   """Deletes traces for this Tensor from all tapes in the stack."""
-  for t in _tape_stack.stack:
-    t.delete_trace(tensor_id)
-
-
-def top_tape_watched_variables():
-  t = _tape_stack.stack[-1]
-  return t._watched_variables  # pylint: disable=protected-access
+  pywrap_tensorflow.TFE_Py_TapeStackDeleteTrace(tensor_id)
 
 
 def could_possibly_record():
   """Returns True if any tape is active."""
-  return len(_tape_stack.stack) > 0  # pylint: disable=g-explicit-length-test
+  return not pywrap_tensorflow.TFE_Py_TapeStackIsEmpty()
diff --git a/tensorflow/python/eager/tape_test.py b/tensorflow/python/eager/tape_test.py
index c97cb62125741ccdec495d925651a3559bd5fb9c..b490bac66db03b0a61a8852f45f1f558cccaf121 100644
--- a/tensorflow/python/eager/tape_test.py
+++ b/tensorflow/python/eager/tape_test.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import custom_gradient
-from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -166,25 +165,6 @@ class TapeTest(test.TestCase):
     g, = backprop.gradients_function(fn, [0])(t)
     self.assertAllEqual(g, 1.0)
 
-  def testTapeGC(self):
-    # TODO(apassos) figure out how to test this without using tape internal
-    # APIs.
-    tape.push_new_tape()
-
-    def f():
-      x = constant_op.constant(1.0)
-      tape.watch(x)
-      x = gradient_is_constant(x)
-      x = gradient_is_constant(x)
-      x = gradient_is_constant(x)
-
-    f()
-    t = tape.pop_tape()
-    tensor_tape, op_tape = t.export()
-    self.assertEqual(len(tensor_tape), 1)  # The watched tensor will remain on
-                                           # the tape
-    self.assertEqual(len(op_tape), 0)  # No operations should remain on the tape
-
   def testCustomGradientGraphMode(self):
     with context.graph_mode(), self.test_session():
 
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 2b7b5c727a2b246209629e3d293d2364b7706235..727f80efb4a5aa5b5d9bee72aed1c56c3649d3bc 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -22,6 +22,7 @@ import copy
 
 import numpy as np
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import test
@@ -105,6 +106,11 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t = _create_tensor(n)
     self.assertAllEqual([[1, 2], [3, 4]], t)
 
+  def testNumpyArrayDtype(self):
+    tensor = constant_op.constant([1.0, 2.0, 3.0])
+    numpy_tensor = np.asarray(tensor, dtype=np.int32)
+    self.assertAllEqual(numpy_tensor, [1, 2, 3])
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
@@ -207,6 +213,12 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t_np = t.numpy()
     self.assertTrue(np.all(t_np == t_np_orig), "%s vs %s" % (t_np, t_np_orig))
 
+  def testIterateOverTensor(self):
+    l = [[1, 2], [3, 4]]
+    t = _create_tensor(l)
+    for list_element, tensor_element in zip(l, t):
+      self.assertAllEqual(list_element, tensor_element.numpy())
+
   def testStringTensorOnGPU(self):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
@@ -216,5 +228,96 @@ class TFETensorTest(test_util.TensorFlowTestCase):
         _create_tensor("test string")
 
 
+class TFETensorUtilTest(test_util.TensorFlowTestCase):
+
+  def testListOfThree(self):
+    t1 = _create_tensor([[1, 2], [3, 4], [5, 6]], dtype=dtypes.int32)
+    t2 = _create_tensor([[1, 2, 5], [3, 4, 5]], dtype=dtypes.int32)
+    t3 = _create_tensor([[1], [3], [5], [6]], dtype=dtypes.int32)
+
+    r = pywrap_tensorflow.TFE_Py_TensorShapeSlice([t1, t2, t3], 0)
+    self.assertAllEqual(np.array([3, 2, 4]), r.numpy())
+
+    r = pywrap_tensorflow.TFE_Py_TensorShapeSlice([t1, t2, t3], 1)
+    self.assertAllEqual(np.array([2, 3, 1]), r.numpy())
+
+  def testEmptyTensorList(self):
+    a = pywrap_tensorflow.TFE_Py_TensorShapeSlice([], 0)
+    self.assertTrue(isinstance(a, ops.EagerTensor))
+    self.assertEqual(0, a.numpy().size)
+
+  def testTensorListContainsNonTensors(self):
+    t1 = _create_tensor([1, 2], dtype=dtypes.int32)
+
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"Expected a list of EagerTensors but element 1 has type \"str\""):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t1, "abc"], 0)
+
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"Expected a list of EagerTensors but element 0 has type \"int\""):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([2, t1], 0)
+
+  def testTensorListNotList(self):
+    t1 = _create_tensor([1, 2], dtype=dtypes.int32)
+
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"tensor_list argument must be a list. Got \"EagerTensor\""):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice(t1, -2)
+
+    with self.assertRaisesRegexp(
+        TypeError,
+        r"tensor_list argument must be a list. Got \"tuple\""):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice((t1,), -2)
+
+  def testNegativeSliceDim(self):
+    t1 = _create_tensor([1, 2], dtype=dtypes.int32)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Slice dimension must be non-negative. Got -2"):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t1], -2)
+
+  def testUnicode(self):
+    self.assertEqual(constant_op.constant(u"asdf").numpy(), b"asdf")
+
+  def testSliceDimOutOfRange(self):
+    t1 = _create_tensor([[1, 2], [3, 4], [5, 6]], dtype=dtypes.int32)
+    t2 = _create_tensor([1, 2], dtype=dtypes.int32)
+    t3 = _create_tensor(2, dtype=dtypes.int32)
+
+    with self.assertRaisesRegexp(
+        IndexError,
+        r"Slice dimension \(2\) must be smaller than rank of all tensors, "
+        "but tensor at index 0 has rank 2"):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t1], 2)
+
+    with self.assertRaisesRegexp(
+        IndexError,
+        r"Slice dimension \(1\) must be smaller than rank of all tensors, "
+        "but tensor at index 0 has rank 1"):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t2], 1)
+
+    with self.assertRaisesRegexp(
+        IndexError,
+        r"Slice dimension \(1\) must be smaller than rank of all tensors, "
+        "but tensor at index 1 has rank 1"):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t1, t2], 1)
+
+    with self.assertRaisesRegexp(
+        IndexError,
+        r"Slice dimension \(0\) must be smaller than rank of all tensors, "
+        "but tensor at index 0 has rank 0"):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t3], 0)
+
+    with self.assertRaisesRegexp(
+        IndexError,
+        r"Slice dimension \(0\) must be smaller than rank of all tensors, "
+        "but tensor at index 2 has rank 0"):
+      pywrap_tensorflow.TFE_Py_TensorShapeSlice([t2, t1, t3], 0)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 13fbfe9f5377cc8d8b475b385217ac958a8026b4..e062e1fbfe64df2c5e6068b6f748e885b9b493a6 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -25,6 +25,7 @@ py_library(
     srcs = ["estimator_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":baseline",
         ":dnn",
         ":dnn_linear_combined",
         ":estimator",
@@ -186,6 +187,69 @@ py_test(
     ],
 )
 
+py_library(
+    name = "baseline",
+    srcs = ["canned/baseline.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":head",
+        ":model_fn",
+        ":optimizers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "baseline_test",
+    size = "medium",
+    srcs = ["canned/baseline_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "noasan",  # test flakily times out in asan mode.
+        "notsan",  # b/67510291
+    ],
+    deps = [
+        ":baseline",
+        ":estimator",
+        ":export_export",
+        ":metric_keys",
+        ":numpy_io",
+        ":pandas_io",
+        ":run_config",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["canned/dnn.py"],
@@ -370,6 +434,7 @@ py_library(
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/data",
         "//tensorflow/python/saved_model:builder",
         "//tensorflow/python/saved_model:tag_constants",
         "//third_party/py/numpy",
@@ -537,6 +602,7 @@ py_library(
         ":prediction_keys",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
+        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lookup_ops",
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
new file mode 100644
index 0000000000000000000000000000000000000000..96e4ecd29fbcd4f4335077e9f81c5704ae2b9bec
--- /dev/null
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -0,0 +1,349 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Baseline estimators.
+
+Baseline estimators are bias-only estimators that can be used for debugging
+and as simple baselines.
+
+Example:
+
+```
+# Build BaselineClassifier
+classifier = BaselineClassifier(n_classes=3)
+
+# Input builders
+def input_fn_train: # returns x, y (where y represents label's class index).
+  pass
+
+def input_fn_eval: # returns x, y (where y represents label's class index).
+  pass
+
+# Fit model.
+classifier.train(input_fn=input_fn_train)
+
+# Evaluate cross entropy between the test and train labels.
+loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
+
+# predict outputs the probability distribution of the classes as seen in
+# training.
+predictions = classifier.predict(new_samples)
+```
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import training_util
+
+# The default learning rate of 0.3 is a historical artifact of the initial
+# implementation, but seems a reasonable choice.
+_LEARNING_RATE = 0.3
+
+
+def _get_weight_column_key(weight_column):
+  if weight_column is None:
+    return None
+  if isinstance(weight_column, six.string_types):
+    return weight_column
+  if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
+    raise TypeError('Weight column must be either a string or _NumericColumn.'
+                    ' Given type: {}.'.format(type(weight_column)))
+  return weight_column.key()
+
+
+def _baseline_logit_fn_builder(num_outputs, weight_column=None):
+  """Function builder for a baseline logit_fn.
+
+  Args:
+    num_outputs: Number of outputs for the model.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+       weights. It will be multiplied by the loss of the example.
+  Returns:
+    A logit_fn (see below).
+  """
+
+  def baseline_logit_fn(features):
+    """Baseline model logit_fn.
+
+    The baseline model simply learns a bias, so the output logits are a
+    `Variable` with one weight for each output that learns the bias for the
+    corresponding output.
+
+    Args:
+      features: The first item returned from the `input_fn` passed to `train`,
+        `evaluate`, and `predict`. This should be a single `Tensor` or dict with
+        `Tensor` values.
+    Returns:
+      A `Tensor` representing the logits.
+    """
+    size_checks = []
+    batch_size = None
+
+    weight_column_key = _get_weight_column_key(weight_column)
+
+    # The first dimension is assumed to be a batch size and must be consistent
+    # among all of the features.
+    for key, feature in features.items():
+      # Skip weight_column to ensure we don't add size checks to it.
+      # These would introduce a dependency on the weight at serving time.
+      if key == weight_column_key:
+        continue
+      first_dim = array_ops.shape(feature)[0]
+      if batch_size is None:
+        batch_size = first_dim
+      else:
+        size_checks.append(check_ops.assert_equal(batch_size, first_dim))
+
+    with ops.control_dependencies(size_checks):
+      with variable_scope.variable_scope('baseline'):
+        bias = variable_scope.get_variable('bias', shape=[num_outputs],
+                                           initializer=init_ops.Zeros)
+        return math_ops.multiply(bias, array_ops.ones([batch_size,
+                                                       num_outputs]))
+
+  return baseline_logit_fn
+
+
+def _baseline_model_fn(features, labels, mode, head, optimizer,
+                       weight_column=None, config=None):
+  """Model_fn for baseline models.
+
+  Args:
+    features: `Tensor` or dict of `Tensor` (depends on data passed to `train`).
+    labels: `Tensor` of labels that are compatible with the `Head` instance.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `Head` instance.
+    optimizer: String, `tf.Optimizer` object, or callable that creates the
+      optimizer to use for training. If not specified, will use `FtrlOptimizer`
+      with a default learning rate of 0.3.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+       weights. It will be multiplied by the loss of the example.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Raises:
+    KeyError: If weight column is specified but not present.
+    ValueError: If features is an empty dictionary.
+
+  Returns:
+    An `EstimatorSpec` instance.
+  """
+  del config  # Unused.
+
+  logit_fn = _baseline_logit_fn_builder(head.logits_dimension, weight_column)
+  logits = logit_fn(features)
+
+  def train_op_fn(loss):
+    opt = optimizers.get_optimizer_instance(
+        optimizer, learning_rate=_LEARNING_RATE)
+    return opt.minimize(loss, global_step=training_util.get_global_step())
+
+  return head.create_estimator_spec(
+      features=features,
+      mode=mode,
+      logits=logits,
+      labels=labels,
+      train_op_fn=train_op_fn)
+
+
+class BaselineClassifier(estimator.Estimator):
+  """A classifier that can establish a simple baseline.
+
+  This classifier ignores feature values and will learn to predict the average
+  value of each label. For single-label problems, this will predict the
+  probability distribution of the classes as seen in the labels. For multi-label
+  problems, this will predict the fraction of examples that are positive for
+  each class.
+
+  Example:
+
+  ```python
+
+  # Build BaselineClassifier
+  classifier = BaselineClassifier(n_classes=3)
+
+  # Input builders
+  def input_fn_train: # returns x, y (where y represents label's class index).
+    pass
+
+  def input_fn_eval: # returns x, y (where y represents label's class index).
+    pass
+
+  # Fit model.
+  classifier.train(input_fn=input_fn_train)
+
+  # Evaluate cross entropy between the test and train labels.
+  loss = classifier.evaluate(input_fn=input_fn_eval)["loss"]
+
+  # predict outputs the probability distribution of the classes as seen in
+  # training.
+  predictions = classifier.predict(new_samples)
+
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+     `key=weight_column` whose value is a `Tensor`.
+  """
+
+  def __init__(self,
+               model_dir=None,
+               n_classes=2,
+               weight_column=None,
+               label_vocabulary=None,
+               optimizer='Ftrl',
+               config=None):
+    """Initializes a BaselineClassifier instance.
+
+    Args:
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      n_classes: number of label classes. Default is binary classification.
+        It must be greater than 1. Note: Class labels are integers representing
+        the class index (i.e. values from 0 to n_classes-1). For arbitrary
+        label values (e.g. string labels), convert to class indices first.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+         weights. It will be multiplied by the loss of the example.
+      label_vocabulary: Optional list of strings with size `[n_classes]`
+        defining the label vocabulary. Only supported for `n_classes` > 2.
+      optimizer: String, `tf.Optimizer` object, or callable that creates the
+        optimizer to use for training. If not specified, will use
+        `FtrlOptimizer` with a default learning rate of 0.3.
+      config: `RunConfig` object to configure the runtime settings.
+    Returns:
+      A `BaselineClassifier` estimator.
+
+    Raises:
+      ValueError: If `n_classes` < 2.
+    """
+    if n_classes == 2:
+      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
+          weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    else:
+      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
+          n_classes, weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _baseline_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          optimizer=optimizer,
+          weight_column=weight_column,
+          config=config)
+    super(BaselineClassifier, self).__init__(
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config)
+
+
+class BaselineRegressor(estimator.Estimator):
+  """A regressor that can establish a simple baseline.
+
+  This regressor ignores feature values and will learn to predict the average
+  value of each label.
+
+  Example:
+
+  ```python
+
+  # Build BaselineRegressor
+  regressor = BaselineRegressor()
+
+  # Input builders
+  def input_fn_train: # returns x, y (where y is the label).
+    pass
+
+  def input_fn_eval: # returns x, y (where y is the label).
+    pass
+
+  # Fit model.
+  regressor.train(input_fn=input_fn_train)
+
+  # Evaluate squared-loss between the test and train targets.
+  loss = regressor.evaluate(input_fn=input_fn_eval)["loss"]
+
+  # predict outputs the mean value seen during training.
+  predictions = regressor.predict(new_samples)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+    otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+     `key=weight_column` whose value is a `Tensor`.
+  """
+
+  def __init__(self,
+               model_dir=None,
+               label_dimension=1,
+               weight_column=None,
+               optimizer='Ftrl',
+               config=None):
+    """Initializes a BaselineRegressor instance.
+
+    Args:
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      label_dimension: Number of regression targets per example. This is the
+        size of the last dimension of the labels and logits `Tensor` objects
+        (typically, these have shape `[batch_size, label_dimension]`).
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+         weights. It will be multiplied by the loss of the example.
+      optimizer: String, `tf.Optimizer` object, or callable that creates the
+        optimizer to use for training. If not specified, will use
+        `FtrlOptimizer` with a default learning rate of 0.3.
+      config: `RunConfig` object to configure the runtime settings.
+    Returns:
+      A `BaselineRegressor` estimator.
+    """
+
+    head = head_lib._regression_head_with_mean_squared_error_loss(  # pylint: disable=protected-access
+        label_dimension=label_dimension,
+        weight_column=weight_column)
+    def _model_fn(features, labels, mode, config):
+      return _baseline_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          optimizer=optimizer,
+          config=config)
+    super(BaselineRegressor, self).__init__(
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config)
diff --git a/tensorflow/python/estimator/canned/baseline_test.py b/tensorflow/python/estimator/canned/baseline_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..96639e88ea4a07e14121049d78f07e03fcb22156
--- /dev/null
+++ b/tensorflow/python/estimator/canned/baseline_test.py
@@ -0,0 +1,1545 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for baseline.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import os
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.estimator.canned import baseline
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.estimator.inputs import pandas_io
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import queue_runner
+from tensorflow.python.training import saver
+
+
+try:
+  # pylint: disable=g-import-not-at-top
+  import pandas as pd
+  HAS_PANDAS = True
+except IOError:
+  # Pandas writes a temporary file during import. If it fails, don't use pandas.
+  HAS_PANDAS = False
+except ImportError:
+  HAS_PANDAS = False
+
+# pylint rules which are disabled by default for test files.
+# pylint: disable=invalid-name,protected-access,missing-docstring
+
+# Names of variables created by model.
+BIAS_NAME = 'baseline/bias'
+
+
+def assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=('Condition expected =~ actual did not hold element-wise:'
+              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
+              'rtol = ', rtol,),
+        name=scope)
+
+
+def save_variables_to_ckpt(model_dir):
+  init_all_op = [variables.global_variables_initializer()]
+  with tf_session.Session() as sess:
+    sess.run(init_all_op)
+    saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt'))
+
+
+def queue_parsed_features(feature_map):
+  tensors_to_enqueue = []
+  keys = []
+  for key, tensor in six.iteritems(feature_map):
+    keys.append(key)
+    tensors_to_enqueue.append(tensor)
+  queue_dtypes = [x.dtype for x in tensors_to_enqueue]
+  input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
+  queue_runner.add_queue_runner(
+      queue_runner.QueueRunner(input_queue,
+                               [input_queue.enqueue(tensors_to_enqueue)]))
+  dequeued_tensors = input_queue.dequeue()
+  return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
+
+
+def sorted_key_dict(unsorted_dict):
+  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
+
+
+def sigmoid(x):
+  return 1 / (1 + np.exp(-1.0 * x))
+
+
+def _baseline_regressor_fn(*args, **kwargs):
+  return baseline.BaselineRegressor(*args, **kwargs)
+
+
+def _baseline_classifier_fn(*args, **kwargs):
+  return baseline.BaselineClassifier(*args, **kwargs)
+
+
+# Tests for Baseline Regressor.
+
+
+# TODO(b/36813849): Add tests with dynamic shape inputs using placeholders.
+class BaselineRegressorEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def test_evaluation_for_simple_data(self):
+    with ops.Graph().as_default():
+      variables.Variable([13.0], name=BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
+    eval_metrics = baseline_regressor.evaluate(
+        input_fn=lambda: ({'age': ((1,),)}, ((10.,),)), steps=1)
+
+    # Logit is bias = 13, while label is 10. Loss is 3**2 = 9.
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 9.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_batch(self):
+    """Tests evaluation for batch_size==2."""
+    with ops.Graph().as_default():
+      variables.Variable([13.0], name=BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
+    eval_metrics = baseline_regressor.evaluate(
+        input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1)
+
+    # Logit is bias = 13, while label is 10.
+    # Loss per example is 3**2 = 9.
+    # Training loss is the sum over batch = 9 + 9 = 18
+    # Average loss is the average over batch = 9
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 18.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_weights(self):
+    """Tests evaluation with weights."""
+    with ops.Graph().as_default():
+      variables.Variable([13.0], name=BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    def _input_fn():
+      features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))}
+      labels = ((10.,), (10.,))
+      return features, labels
+
+    baseline_regressor = _baseline_regressor_fn(
+        weight_column='weights',
+        model_dir=self._model_dir)
+    eval_metrics = baseline_regressor.evaluate(input_fn=_input_fn, steps=1)
+
+    # Logit is bias = 13, while label is 10.
+    # Loss per example is 3**2 = 9.
+    # Training loss is the weighted sum over batch = 9 + 2*9 = 27
+    # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9
+    self.assertDictEqual({
+        metric_keys.MetricKeys.LOSS: 27.,
+        metric_keys.MetricKeys.LOSS_MEAN: 9.,
+        ops.GraphKeys.GLOBAL_STEP: 100
+    }, eval_metrics)
+
+  def test_evaluation_for_multi_dimensions(self):
+    label_dim = 2
+    with ops.Graph().as_default():
+      variables.Variable([46.0, 58.0], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_regressor = _baseline_regressor_fn(
+        label_dimension=label_dim,
+        model_dir=self._model_dir)
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'age': np.array([[2., 4., 5.]]),
+        },
+        y=np.array([[46., 58.]]),
+        batch_size=1,
+        num_epochs=None,
+        shuffle=False)
+    eval_metrics = baseline_regressor.evaluate(input_fn=input_fn, steps=1)
+
+    self.assertItemsEqual(
+        (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN,
+         ops.GraphKeys.GLOBAL_STEP), eval_metrics.keys())
+
+    # Logit is bias which is [46, 58]
+    self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS])
+
+
+class BaselineRegressorPredictTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def test_1d(self):
+    """Tests predict when all variables are one-dimensional."""
+    with ops.Graph().as_default():
+      variables.Variable([.2], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': np.array([[2.]])},
+        y=None,
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False)
+    predictions = baseline_regressor.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # x * weight + bias = 2. * 10. + .2 = 20.2
+    self.assertAllClose([[.2]], predicted_scores)
+
+  def testMultiDim(self):
+    """Tests predict when all variables are multi-dimenstional."""
+    batch_size = 2
+    label_dimension = 3
+    with ops.Graph().as_default():
+      variables.Variable(  # shape=[label_dimension]
+          [.2, .4, .6], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_regressor = _baseline_regressor_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        # x shape=[batch_size, x_dim]
+        x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predictions = baseline_regressor.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # score = bias, shape=[batch_size, label_dimension]
+    self.assertAllClose([[0.2, 0.4, 0.6], [0.2, 0.4, 0.6]],
+                        predicted_scores)
+
+
+class BaselineRegressorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
+                          input_dimension, label_dimension, prediction_length):
+    feature_columns = [
+        feature_column_lib.numeric_column('x', shape=(input_dimension,))
+    ]
+    est = _baseline_regressor_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    # learn y = x
+    est.train(train_input_fn, steps=200)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array(
+        [x['predictions'] for x in est.predict(predict_input_fn)])
+    self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    prediction_length = batch_size
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=input_dimension,
+        label_dimension=label_dimension,
+        prediction_length=prediction_length)
+
+  def test_pandas_input_fn(self):
+    """Tests complete flow with pandas_input_fn."""
+    if not HAS_PANDAS:
+      return
+
+    # Pandas DataFrame natually supports 1 dim data only.
+    label_dimension = 1
+    input_dimension = label_dimension
+    batch_size = 10
+    data = np.array([1., 2., 3., 4.], dtype=np.float32)
+    x = pd.DataFrame({'x': data})
+    y = pd.Series(data)
+    prediction_length = 4
+
+    train_input_fn = pandas_io.pandas_input_fn(
+        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
+    eval_input_fn = pandas_io.pandas_input_fn(
+        x=x, y=y, batch_size=batch_size, shuffle=False)
+    predict_input_fn = pandas_io.pandas_input_fn(
+        x=x, batch_size=batch_size, shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=input_dimension,
+        label_dimension=label_dimension,
+        prediction_length=prediction_length)
+
+  def test_input_fn_from_parse_example(self):
+    """Tests complete flow with input_fn constructed from parse_example."""
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    prediction_length = batch_size
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+
+    serialized_examples = []
+    for datum in data:
+      example = example_pb2.Example(features=feature_pb2.Features(
+          feature={
+              'x':
+                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                      value=datum)),
+              'y':
+                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                      value=datum[:label_dimension])),
+          }))
+      serialized_examples.append(example.SerializeToString())
+
+    feature_spec = {
+        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
+        'y': parsing_ops.FixedLenFeature([label_dimension], dtypes.float32),
+    }
+
+    def _train_input_fn():
+      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
+      features = queue_parsed_features(feature_map)
+      labels = features.pop('y')
+      return features, labels
+
+    def _eval_input_fn():
+      feature_map = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features = queue_parsed_features(feature_map)
+      labels = features.pop('y')
+      return features, labels
+
+    def _predict_input_fn():
+      feature_map = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features = queue_parsed_features(feature_map)
+      features.pop('y')
+      return features, None
+
+    self._test_complete_flow(
+        train_input_fn=_train_input_fn,
+        eval_input_fn=_eval_input_fn,
+        predict_input_fn=_predict_input_fn,
+        input_dimension=input_dimension,
+        label_dimension=label_dimension,
+        prediction_length=prediction_length)
+
+
+class BaselineRegressorTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s:0' % BIAS_NAME
+    ]
+
+    def _minimize(loss, global_step=None, var_list=None):
+      trainable_vars = var_list or ops.get_collection(
+          ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(expected_var_names,
+                            [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        if global_step is not None:
+          return state_ops.assign_add(global_step, 1).op
+        return control_flow_ops.no_op()
+      assert_loss = assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        if global_step is not None:
+          return state_ops.assign_add(global_step, 1).op
+        return control_flow_ops.no_op()
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def _assert_checkpoint(self,
+                         label_dimension,
+                         expected_global_step,
+                         expected_bias=None):
+    shapes = {
+        name: shape
+        for (name, shape) in checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(expected_global_step,
+                     checkpoint_utils.load_variable(self._model_dir,
+                                                    ops.GraphKeys.GLOBAL_STEP))
+
+    self.assertEqual([label_dimension], shapes[BIAS_NAME])
+    if expected_bias is not None:
+      self.assertEqual(expected_bias,
+                       checkpoint_utils.load_variable(self._model_dir,
+                                                      BIAS_NAME))
+
+  def testFromScratchWithDefaultOptimizer(self):
+    # Create BaselineRegressor.
+    label = 5.
+    age = 17
+    baseline_regressor = _baseline_regressor_fn(model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    baseline_regressor.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self._assert_checkpoint(label_dimension=1, expected_global_step=num_steps)
+
+  def testTrainWithOneDimLabel(self):
+    label_dimension = 1
+    batch_size = 20
+    est = _baseline_regressor_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+    data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
+    self.assertEqual((batch_size,), data_rank_1.shape)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'age': data_rank_1},
+        y=data_rank_1,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+    self._assert_checkpoint(label_dimension=1, expected_global_step=200)
+
+  def testTrainWithOneDimWeight(self):
+    label_dimension = 1
+    batch_size = 20
+    est = _baseline_regressor_fn(
+        label_dimension=label_dimension,
+        weight_column='w',
+        model_dir=self._model_dir)
+
+    data_rank_1 = np.linspace(0., 2., batch_size, dtype=np.float32)
+    self.assertEqual((batch_size,), data_rank_1.shape)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'age': data_rank_1,
+           'w': data_rank_1},
+        y=data_rank_1,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+    self._assert_checkpoint(label_dimension=1, expected_global_step=200)
+
+  def testFromScratch(self):
+    # Create BaselineRegressor.
+    label = 5.
+    age = 17
+    # loss = (logits - label)^2 = (0 - 5.)^2 = 25.
+    mock_optimizer = self._mock_optimizer(expected_loss=25.)
+    baseline_regressor = _baseline_regressor_fn(
+        model_dir=self._model_dir,
+        optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    baseline_regressor.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        label_dimension=1,
+        expected_global_step=num_steps,
+        expected_bias=[0.])
+
+  def testFromCheckpoint(self):
+    # Create initial checkpoint.
+    bias = 7.0
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable([bias], name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    # logits = bias = 6.
+    # loss = (logits - label)^2 = (7 - 5)^2 = 4
+    mock_optimizer = self._mock_optimizer(expected_loss=4.)
+    baseline_regressor = _baseline_regressor_fn(
+        model_dir=self._model_dir,
+        optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    baseline_regressor.train(
+        input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        label_dimension=1,
+        expected_global_step=initial_global_step + num_steps,
+        expected_bias=[bias])
+
+  def testFromCheckpointMultiBatch(self):
+    # Create initial checkpoint.
+    bias = 5.0
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable([bias], name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step,
+          name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    # logits = bias
+    # logits[0] = 5.
+    # logits[1] = 5.
+    # loss = sum(logits - label)^2 = (5 - 5)^2 + (5 - 3)^2 = 4
+    mock_optimizer = self._mock_optimizer(expected_loss=4.)
+    baseline_regressor = _baseline_regressor_fn(
+        model_dir=self._model_dir,
+        optimizer=mock_optimizer)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    baseline_regressor.train(
+        input_fn=lambda: ({'age': ((17,), (15,))}, ((5.,), (3.,))),
+        steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        label_dimension=1,
+        expected_global_step=initial_global_step + num_steps,
+        expected_bias=bias)
+
+
+# Tests for Baseline Classifier.
+
+
+class BaselineClassifierTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s:0' % BIAS_NAME
+    ]
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def _assert_checkpoint(
+      self, n_classes, expected_global_step, expected_bias=None):
+    logits_dimension = n_classes if n_classes > 2 else 1
+
+    shapes = {
+        name: shape for (name, shape) in
+        checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        expected_global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    self.assertEqual([logits_dimension], shapes[BIAS_NAME])
+    if expected_bias is not None:
+      self.assertAllEqual(expected_bias,
+                          checkpoint_utils.load_variable(
+                              self._model_dir, BIAS_NAME))
+
+  def _testFromScratchWithDefaultOptimizer(self, n_classes):
+    label = 0
+    age = 17
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self._assert_checkpoint(n_classes, num_steps)
+
+  def testBinaryClassesFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=2)
+
+  def testMultiClassesFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=4)
+
+  def _testTrainWithTwoDimsLabel(self, n_classes):
+    batch_size = 20
+
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+    data_rank_1 = np.array([0, 1])
+    data_rank_2 = np.array([[0], [1]])
+    self.assertEqual((2,), data_rank_1.shape)
+    self.assertEqual((2, 1), data_rank_2.shape)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'age': data_rank_1},
+        y=data_rank_2,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+    self._assert_checkpoint(n_classes, 200)
+
+  def testBinaryClassesTrainWithTwoDimsLabel(self):
+    self._testTrainWithTwoDimsLabel(n_classes=2)
+
+  def testMultiClassesTrainWithTwoDimsLabel(self):
+    self._testTrainWithTwoDimsLabel(n_classes=4)
+
+  def _testTrainWithOneDimLabel(self, n_classes):
+    batch_size = 20
+
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+    data_rank_1 = np.array([0, 1])
+    self.assertEqual((2,), data_rank_1.shape)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'age': data_rank_1},
+        y=data_rank_1,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+    self._assert_checkpoint(n_classes, 200)
+
+  def testBinaryClassesTrainWithOneDimLabel(self):
+    self._testTrainWithOneDimLabel(n_classes=2)
+
+  def testMultiClassesTrainWithOneDimLabel(self):
+    self._testTrainWithOneDimLabel(n_classes=4)
+
+  def _testTrainWithTwoDimsWeight(self, n_classes):
+    batch_size = 20
+
+    est = baseline.BaselineClassifier(
+        weight_column='w',
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+    data_rank_1 = np.array([0, 1])
+    data_rank_2 = np.array([[0], [1]])
+    self.assertEqual((2,), data_rank_1.shape)
+    self.assertEqual((2, 1), data_rank_2.shape)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'age': data_rank_1, 'w': data_rank_2}, y=data_rank_1,
+        batch_size=batch_size, num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+    self._assert_checkpoint(n_classes, 200)
+
+  def testBinaryClassesTrainWithTwoDimsWeight(self):
+    self._testTrainWithTwoDimsWeight(n_classes=2)
+
+  def testMultiClassesTrainWithTwoDimsWeight(self):
+    self._testTrainWithTwoDimsWeight(n_classes=4)
+
+  def _testTrainWithOneDimWeight(self, n_classes):
+    batch_size = 20
+
+    est = baseline.BaselineClassifier(
+        weight_column='w',
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+    data_rank_1 = np.array([0, 1])
+    self.assertEqual((2,), data_rank_1.shape)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'age': data_rank_1, 'w': data_rank_1}, y=data_rank_1,
+        batch_size=batch_size, num_epochs=None,
+        shuffle=True)
+    est.train(train_input_fn, steps=200)
+    self._assert_checkpoint(n_classes, 200)
+
+  def testBinaryClassesTrainWithOneDimWeight(self):
+    self._testTrainWithOneDimWeight(n_classes=2)
+
+  def testMultiClassesTrainWithOneDimWeight(self):
+    self._testTrainWithOneDimWeight(n_classes=4)
+
+  def _testFromScratch(self, n_classes):
+    label = 1
+    age = 17
+    # For binary classifier:
+    #   loss = sigmoid_cross_entropy(logits, label) where logits=0 (weights are
+    #   all zero initially) and label = 1 so,
+    #      loss = 1 * -log ( sigmoid(logits) ) = 0.69315
+    # For multi class classifier:
+    #   loss = cross_entropy(logits, label) where logits are all 0s (weights are
+    #   all zero initially) and label = 1 so,
+    #      loss = 1 * -log ( 1.0 / n_classes )
+    # For this particular test case, as logits are same, the formula
+    # 1 * -log ( 1.0 / n_classes ) covers both binary and multi class cases.
+    mock_optimizer = self._mock_optimizer(
+        expected_loss=-1 * math.log(1.0/n_classes))
+
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    est.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        n_classes,
+        expected_global_step=num_steps,
+        expected_bias=[0.] if n_classes == 2 else [.0] * n_classes)
+
+  def testBinaryClassesFromScratch(self):
+    self._testFromScratch(n_classes=2)
+
+  def testMultiClassesFromScratch(self):
+    self._testFromScratch(n_classes=4)
+
+  def _testFromCheckpoint(self, n_classes):
+    # Create initial checkpoint.
+    label = 1
+    age = 17
+    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    # For binary classifier:
+    #   logits = bias = -1.
+    #   loss = sigmoid_cross_entropy(logits, label)
+    #   so, loss = 1 * -log ( sigmoid(-1) ) = 1.3133
+    # For multi class classifier:
+    #   loss = cross_entropy(logits, label)
+    #   where logits = bias and label = 1
+    #   so, loss = 1 * -log ( softmax(logits)[1] )
+    if n_classes == 2:
+      expected_loss = 1.3133
+    else:
+      logits = bias
+      logits_exp = np.exp(logits)
+      softmax = logits_exp / logits_exp.sum()
+      expected_loss = -1 * math.log(softmax[label])
+
+    mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
+
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    est.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        n_classes,
+        expected_global_step=initial_global_step + num_steps,
+        expected_bias=bias)
+
+  def testBinaryClassesFromCheckpoint(self):
+    self._testFromCheckpoint(n_classes=2)
+
+  def testMultiClassesFromCheckpoint(self):
+    self._testFromCheckpoint(n_classes=4)
+
+  def _testFromCheckpointFloatLabels(self, n_classes):
+    """Tests float labels for binary classification."""
+    # Create initial checkpoint.
+    if n_classes > 2:
+      return
+    label = 0.8
+    age = 17
+    bias = [-1.0]
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    # logits = bias = -1.
+    # loss = sigmoid_cross_entropy(logits, label)
+    # => loss = -0.8 * log(sigmoid(-1)) -0.2 * log(sigmoid(+1)) = 1.1132617
+    mock_optimizer = self._mock_optimizer(expected_loss=1.1132617)
+
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    est.train(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+  def testBinaryClassesFromCheckpointFloatLabels(self):
+    self._testFromCheckpointFloatLabels(n_classes=2)
+
+  def testMultiClassesFromCheckpointFloatLabels(self):
+    self._testFromCheckpointFloatLabels(n_classes=4)
+
+  def _testFromCheckpointMultiBatch(self, n_classes):
+    # Create initial checkpoint.
+    label = [1, 0]
+    age = [17, 18.5]
+    # For binary case, the expected weight has shape (1,1). For multi class
+    # case, the shape is (1, n_classes). In order to test the weights, set
+    # weights as 2.0 * range(n_classes).
+    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    # For binary classifier:
+    #   logits = bias
+    #   logits[0] = -1.
+    #   logits[1] = -1.
+    #   loss = sigmoid_cross_entropy(logits, label)
+    #   so, loss[0] = 1 * -log ( sigmoid(-1) ) = 1.3133
+    #       loss[1] = (1 - 0) * -log ( 1- sigmoid(-1) ) = 0.3132
+    # For multi class classifier:
+    #   loss = cross_entropy(logits, label)
+    #   where logits = bias and label = [1, 0]
+    #   so, loss = 1 * -log ( softmax(logits)[label] )
+    if n_classes == 2:
+      expected_loss = (1.3133 + 0.3132)
+    else:
+      # Expand logits since batch_size=2
+      logits = bias * np.ones(shape=(2, 1))
+      logits_exp = np.exp(logits)
+      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
+      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
+      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
+      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
+      expected_loss = expected_loss_0 + expected_loss_1
+
+    mock_optimizer = self._mock_optimizer(expected_loss=expected_loss)
+
+    est = baseline.BaselineClassifier(
+        n_classes=n_classes,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+
+    # Train for a few steps, and validate optimizer and final checkpoint.
+    num_steps = 10
+    est.train(
+        input_fn=lambda: ({'age': (age)}, (label)),
+        steps=num_steps)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+    self._assert_checkpoint(
+        n_classes,
+        expected_global_step=initial_global_step + num_steps,
+        expected_bias=bias)
+
+  def testBinaryClassesFromCheckpointMultiBatch(self):
+    self._testFromCheckpointMultiBatch(n_classes=2)
+
+  def testMultiClassesFromCheckpointMultiBatch(self):
+    self._testFromCheckpointMultiBatch(n_classes=4)
+
+
+class BaselineClassifierEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def _test_evaluation_for_simple_data(self, n_classes):
+    label = 1
+    age = 1.
+
+    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
+
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(
+          100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    est = _baseline_classifier_fn(
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(
+        input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=1)
+
+    if n_classes == 2:
+      # Binary classes: loss = -log(sigmoid(-1)) = 1.3133
+      # Prediction = sigmoid(-1) = 0.2689
+      expected_metrics = {
+          metric_keys.MetricKeys.LOSS: 1.3133,
+          ops.GraphKeys.GLOBAL_STEP: 100,
+          metric_keys.MetricKeys.LOSS_MEAN: 1.3133,
+          metric_keys.MetricKeys.ACCURACY: 0.,
+          metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
+          metric_keys.MetricKeys.LABEL_MEAN: 1.,
+          metric_keys.MetricKeys.ACCURACY_BASELINE: 1,
+          metric_keys.MetricKeys.AUC: 0.,
+          metric_keys.MetricKeys.AUC_PR: 1.,
+      }
+    else:
+      # Multi classes: loss = 1 * -log ( softmax(logits)[label] )
+      logits = bias
+      logits_exp = np.exp(logits)
+      softmax = logits_exp / logits_exp.sum()
+      expected_loss = -1 * math.log(softmax[label])
+
+      expected_metrics = {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          ops.GraphKeys.GLOBAL_STEP: 100,
+          metric_keys.MetricKeys.LOSS_MEAN: expected_loss,
+          metric_keys.MetricKeys.ACCURACY: 0.,
+      }
+
+    self.assertAllClose(sorted_key_dict(expected_metrics),
+                        sorted_key_dict(eval_metrics), rtol=1e-3)
+
+  def test_binary_classes_evaluation_for_simple_data(self):
+    self._test_evaluation_for_simple_data(n_classes=2)
+
+  def test_multi_classes_evaluation_for_simple_data(self):
+    self._test_evaluation_for_simple_data(n_classes=4)
+
+  def _test_evaluation_batch(self, n_classes):
+    """Tests evaluation for batch_size==2."""
+    label = [1, 0]
+    age = [17., 18.]
+    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    est = _baseline_classifier_fn(
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(
+        input_fn=lambda: ({'age': (age)}, (label)), steps=1)
+
+    if n_classes == 2:
+      # Logits are (-1., -1.) labels are (1, 0).
+      # Loss is
+      #   loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
+      #   loss for row 2: (1 - 0) * -log(1 - sigmoid(-1)) = 0.3132
+      # Prediction = sigmoid(-1) = 0.2689
+      expected_loss = 1.3133 + 0.3132
+
+      expected_metrics = {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          ops.GraphKeys.GLOBAL_STEP: 100,
+          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
+          metric_keys.MetricKeys.ACCURACY: 0.5,
+          metric_keys.MetricKeys.PREDICTION_MEAN: 0.2689,
+          metric_keys.MetricKeys.LABEL_MEAN: 0.5,
+          metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
+          metric_keys.MetricKeys.AUC: 0.5,
+          metric_keys.MetricKeys.AUC_PR: 0.75,
+      }
+    else:
+      # Expand logits since batch_size=2
+      logits = bias * np.ones(shape=(2, 1))
+      logits_exp = np.exp(logits)
+      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
+      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
+      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
+      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
+      expected_loss = expected_loss_0 + expected_loss_1
+
+      expected_metrics = {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          ops.GraphKeys.GLOBAL_STEP: 100,
+          metric_keys.MetricKeys.LOSS_MEAN: expected_loss / 2,
+          metric_keys.MetricKeys.ACCURACY: 0.5,
+      }
+
+    self.assertAllClose(sorted_key_dict(expected_metrics),
+                        sorted_key_dict(eval_metrics), rtol=1e-3)
+
+  def test_binary_classes_evaluation_batch(self):
+    self._test_evaluation_batch(n_classes=2)
+
+  def test_multi_classes_evaluation_batch(self):
+    self._test_evaluation_batch(n_classes=4)
+
+  def _test_evaluation_weights(self, n_classes):
+    """Tests evaluation with weights."""
+
+    label = [1, 0]
+    age = [17., 18.]
+    weights = [1., 2.]
+    # For binary case, the expected weight has shape (1,1). For multi class
+    # case, the shape is (1, n_classes). In order to test the weights, set
+    # weights as 2.0 * range(n_classes).
+    bias = [-1.0] if n_classes == 2 else [-1.0] * n_classes
+    initial_global_step = 100
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(
+          initial_global_step, name=ops.GraphKeys.GLOBAL_STEP,
+          dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    est = _baseline_classifier_fn(
+        n_classes=n_classes,
+        weight_column='w',
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(
+        input_fn=lambda: ({'age': (age), 'w': (weights)}, (label)), steps=1)
+
+    if n_classes == 2:
+      # Logits are (-1., -1.) labels are (1, 0).
+      # Loss is
+      #   loss for row 1: 1 * -log(sigmoid(-1)) = 1.3133
+      #   loss for row 2: (1 - 0) * -log(1 - sigmoid(-1)) = 0.3132
+      #   weights = [1., 2.]
+      expected_loss = 1.3133 * 1. + 0.3132 * 2.
+      loss_mean = expected_loss / (1.0 + 2.0)
+      label_mean = np.average(label, weights=weights)
+      logits = [-1, -1]
+      logistics = sigmoid(np.array(logits))
+      predictions_mean = np.average(logistics, weights=weights)
+
+      expected_metrics = {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          ops.GraphKeys.GLOBAL_STEP: 100,
+          metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
+          metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
+          metric_keys.MetricKeys.PREDICTION_MEAN: predictions_mean,
+          metric_keys.MetricKeys.LABEL_MEAN: label_mean,
+          metric_keys.MetricKeys.ACCURACY_BASELINE: (
+              max(label_mean, 1-label_mean)),
+          metric_keys.MetricKeys.AUC: 0.5,
+          metric_keys.MetricKeys.AUC_PR: 2. / (1. + 2.),
+      }
+    else:
+      # Multi classes: unweighted_loss = 1 * -log ( soft_max(logits)[label] )
+      # Expand logits since batch_size=2
+      logits = bias * np.ones(shape=(2, 1))
+      logits_exp = np.exp(logits)
+      softmax_row_0 = logits_exp[0] / logits_exp[0].sum()
+      softmax_row_1 = logits_exp[1] / logits_exp[1].sum()
+      expected_loss_0 = -1 * math.log(softmax_row_0[label[0]])
+      expected_loss_1 = -1 * math.log(softmax_row_1[label[1]])
+      loss_mean = np.average([expected_loss_0, expected_loss_1],
+                             weights=weights)
+      expected_loss = loss_mean * np.sum(weights)
+
+      expected_metrics = {
+          metric_keys.MetricKeys.LOSS: expected_loss,
+          ops.GraphKeys.GLOBAL_STEP: 100,
+          metric_keys.MetricKeys.LOSS_MEAN: loss_mean,
+          metric_keys.MetricKeys.ACCURACY: 2. / (1. + 2.),
+      }
+
+    self.assertAllClose(sorted_key_dict(expected_metrics),
+                        sorted_key_dict(eval_metrics), rtol=1e-3)
+
+  def test_binary_classes_evaluation_weights(self):
+    self._test_evaluation_weights(n_classes=2)
+
+  def test_multi_classes_evaluation_weights(self):
+    self._test_evaluation_weights(n_classes=4)
+
+
+class BaselineClassifierPredictTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def _testPredictions(self, n_classes, label_vocabulary, label_output_fn):
+    """Tests predict when all variables are one-dimensional."""
+    age = 1.
+
+    bias = [10.0] if n_classes == 2 else [10.0] * n_classes
+
+    with ops.Graph().as_default():
+      variables.Variable(bias, name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    est = _baseline_classifier_fn(
+        label_vocabulary=label_vocabulary,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'age': np.array([[age]])},
+        y=None,
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False)
+    predictions = list(est.predict(input_fn=predict_input_fn))
+
+    if n_classes == 2:
+      scalar_logits = bias[0]
+      two_classes_logits = [0, scalar_logits]
+      two_classes_logits_exp = np.exp(two_classes_logits)
+      softmax = two_classes_logits_exp / two_classes_logits_exp.sum()
+
+      expected_predictions = {
+          'class_ids': [1],
+          'classes': [label_output_fn(1)],
+          'logistic': [sigmoid(np.array(scalar_logits))],
+          'logits': [scalar_logits],
+          'probabilities': softmax,
+      }
+    else:
+      onedim_logits = np.array(bias)
+      class_ids = onedim_logits.argmax()
+      logits_exp = np.exp(onedim_logits)
+      softmax = logits_exp / logits_exp.sum()
+      expected_predictions = {
+          'class_ids': [class_ids],
+          'classes': [label_output_fn(class_ids)],
+          'logits': onedim_logits,
+          'probabilities': softmax,
+      }
+
+    self.assertEqual(1, len(predictions))
+    # assertAllClose cannot handle byte type.
+    self.assertEqual(expected_predictions['classes'], predictions[0]['classes'])
+    expected_predictions.pop('classes')
+    predictions[0].pop('classes')
+    self.assertAllClose(sorted_key_dict(expected_predictions),
+                        sorted_key_dict(predictions[0]))
+
+  def testBinaryClassesWithoutLabelVocabulary(self):
+    n_classes = 2
+    self._testPredictions(n_classes,
+                          label_vocabulary=None,
+                          label_output_fn=lambda x: ('%s' % x).encode())
+
+  def testBinaryClassesWithLabelVocabulary(self):
+    n_classes = 2
+    self._testPredictions(
+        n_classes,
+        label_vocabulary=['class_vocab_{}'.format(i)
+                          for i in range(n_classes)],
+        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
+
+  def testMultiClassesWithoutLabelVocabulary(self):
+    n_classes = 4
+    self._testPredictions(
+        n_classes,
+        label_vocabulary=None,
+        label_output_fn=lambda x: ('%s' % x).encode())
+
+  def testMultiClassesWithLabelVocabulary(self):
+    n_classes = 4
+    self._testPredictions(
+        n_classes,
+        label_vocabulary=['class_vocab_{}'.format(i)
+                          for i in range(n_classes)],
+        label_output_fn=lambda x: ('class_vocab_%s' % x).encode())
+
+
+class BaselineClassifierIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(self, n_classes, train_input_fn, eval_input_fn,
+                          predict_input_fn, input_dimension, prediction_length):
+    feature_columns = [
+        feature_column_lib.numeric_column('x', shape=(input_dimension,))
+    ]
+    est = _baseline_classifier_fn(
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    # learn y = x
+    est.train(train_input_fn, steps=200)
+
+    # EVALUTE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array(
+        [x['classes'] for x in est.predict(predict_input_fn)])
+    self.assertAllEqual((prediction_length, 1), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def _test_numpy_input_fn(self, n_classes):
+    """Tests complete flow with numpy_input_fn."""
+    input_dimension = 4
+    batch_size = 10
+    prediction_length = batch_size
+    data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, input_dimension)
+    target = np.array([1] * batch_size)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=target,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=target,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+
+    self._test_complete_flow(
+        n_classes=n_classes,
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=input_dimension,
+        prediction_length=prediction_length)
+
+  def test_binary_classes_numpy_input_fn(self):
+    self._test_numpy_input_fn(n_classes=2)
+
+  def test_multi_classes_numpy_input_fn(self):
+    self._test_numpy_input_fn(n_classes=4)
+
+  def _test_pandas_input_fn(self, n_classes):
+    """Tests complete flow with pandas_input_fn."""
+    if not HAS_PANDAS:
+      return
+
+    # Pandas DataFrame natually supports 1 dim data only.
+    input_dimension = 1
+    batch_size = 10
+    data = np.array([1., 2., 3., 4.], dtype=np.float32)
+    target = np.array([1, 0, 1, 0], dtype=np.int32)
+    x = pd.DataFrame({'x': data})
+    y = pd.Series(target)
+    prediction_length = 4
+
+    train_input_fn = pandas_io.pandas_input_fn(
+        x=x, y=y, batch_size=batch_size, num_epochs=None, shuffle=True)
+    eval_input_fn = pandas_io.pandas_input_fn(
+        x=x, y=y, batch_size=batch_size, shuffle=False)
+    predict_input_fn = pandas_io.pandas_input_fn(
+        x=x, batch_size=batch_size, shuffle=False)
+
+    self._test_complete_flow(
+        n_classes=n_classes,
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=input_dimension,
+        prediction_length=prediction_length)
+
+  def test_binary_classes_pandas_input_fn(self):
+    self._test_pandas_input_fn(n_classes=2)
+
+  def test_multi_classes_pandas_input_fn(self):
+    self._test_pandas_input_fn(n_classes=4)
+
+  def _test_input_fn_from_parse_example(self, n_classes):
+    """Tests complete flow with input_fn constructed from parse_example."""
+    input_dimension = 2
+    batch_size = 10
+    prediction_length = batch_size
+    data = np.linspace(0., 2., batch_size * input_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, input_dimension)
+    target = np.array([1] * batch_size, dtype=np.int64)
+
+    serialized_examples = []
+    for x, y in zip(data, target):
+      example = example_pb2.Example(features=feature_pb2.Features(
+          feature={
+              'x':
+                  feature_pb2.Feature(float_list=feature_pb2.FloatList(
+                      value=x)),
+              'y':
+                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                      value=[y])),
+          }))
+      serialized_examples.append(example.SerializeToString())
+
+    feature_spec = {
+        'x': parsing_ops.FixedLenFeature([input_dimension], dtypes.float32),
+        'y': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+
+    def _train_input_fn():
+      feature_map = parsing_ops.parse_example(serialized_examples, feature_spec)
+      features = queue_parsed_features(feature_map)
+      labels = features.pop('y')
+      return features, labels
+
+    def _eval_input_fn():
+      feature_map = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features = queue_parsed_features(feature_map)
+      labels = features.pop('y')
+      return features, labels
+
+    def _predict_input_fn():
+      feature_map = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features = queue_parsed_features(feature_map)
+      features.pop('y')
+      return features, None
+
+    self._test_complete_flow(
+        n_classes=n_classes,
+        train_input_fn=_train_input_fn,
+        eval_input_fn=_eval_input_fn,
+        predict_input_fn=_predict_input_fn,
+        input_dimension=input_dimension,
+        prediction_length=prediction_length)
+
+  def test_binary_classes_input_fn_from_parse_example(self):
+    self._test_input_fn_from_parse_example(n_classes=2)
+
+  def test_multi_classes_input_fn_from_parse_example(self):
+    self._test_input_fn_from_parse_example(n_classes=4)
+
+
+# Tests for Baseline logit_fn.
+
+
+class BaselineLogitFnTest(test.TestCase):
+
+  def test_basic_logit_correctness(self):
+    """baseline_logit_fn simply returns the bias variable."""
+    with ops.Graph().as_default():
+      logit_fn = baseline._baseline_logit_fn_builder(num_outputs=2)
+      logits = logit_fn(features={'age': [[23.], [31.]]})
+      with variable_scope.variable_scope('baseline', reuse=True):
+        bias_var = variable_scope.get_variable('bias')
+      with tf_session.Session() as sess:
+        sess.run([variables.global_variables_initializer()])
+        self.assertAllClose([[0., 0.], [0., 0.]], logits.eval())
+        sess.run(bias_var.assign([10., 5.]))
+        self.assertAllClose([[10., 5.], [10., 5.]], logits.eval())
+
+
+if __name__ == '__main__':
+  test.main()
+
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 8e90fd4ec61c72cd12c3bb2c69c31cd465903cc7..6f94b2288b999b8d4d3d9f6cb2b3cb4945c39e0d 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -48,8 +48,9 @@ def _dnn_logit_fn_builder(units, hidden_units, feature_columns, activation_fn,
   """Function builder for a dnn logit_fn.
 
   Args:
-    units: An int indicating the dimension of the logit layer, or a list of ints
-      to build multiple logits in the MultiHead case.
+    units: An int indicating the dimension of the logit layer.  In the
+      MultiHead case, this should be the sum of all component Heads' logit
+      dimensions.
     hidden_units: Iterable of integer number of hidden units per layer.
     feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
     activation_fn: Activation function applied to each layer.
@@ -61,10 +62,10 @@ def _dnn_logit_fn_builder(units, hidden_units, feature_columns, activation_fn,
     A logit_fn (see below).
 
   Raises:
-    ValueError: If units is not an int or a list.
+    ValueError: If units is not an int.
   """
-  if not (isinstance(units, int) or isinstance(units, list)):
-    raise ValueError('units must be an int or list.  Given type: {}'.format(
+  if not isinstance(units, int):
+    raise ValueError('units must be an int.  Given type: {}'.format(
         type(units)))
 
   def dnn_logit_fn(features, mode):
@@ -101,29 +102,14 @@ def _dnn_logit_fn_builder(units, hidden_units, feature_columns, activation_fn,
           net = core_layers.dropout(net, rate=dropout, training=True)
       _add_hidden_layer_summary(net, hidden_layer_scope.name)
 
-    if isinstance(units, int):
-      with variable_scope.variable_scope(
-          'logits', values=(net,)) as logits_scope:
-        logits = core_layers.dense(
-            net,
-            units=units,
-            activation=None,
-            kernel_initializer=init_ops.glorot_uniform_initializer(),
-            name=logits_scope)
-      _add_hidden_layer_summary(logits, logits_scope.name)
-    else:
-      logits = []
-      for head_index, logits_dimension in enumerate(units):
-        with variable_scope.variable_scope(
-            'logits_head_{}'.format(head_index), values=(net,)) as logits_scope:
-          these_logits = core_layers.dense(
-              net,
-              units=logits_dimension,
-              activation=None,
-              kernel_initializer=init_ops.glorot_uniform_initializer(),
-              name=logits_scope)
-        _add_hidden_layer_summary(these_logits, logits_scope.name)
-        logits.append(these_logits)
+    with variable_scope.variable_scope('logits', values=(net,)) as logits_scope:
+      logits = core_layers.dense(
+          net,
+          units=units,
+          activation=None,
+          kernel_initializer=init_ops.glorot_uniform_initializer(),
+          name=logits_scope)
+    _add_hidden_layer_summary(logits, logits_scope.name)
     return logits
 
   return dnn_logit_fn
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 0d5cee0c660bcd4b7ddf109d8677ef045264e713..3ffca14261386b156771906fda80914971ea1c68 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -24,8 +24,6 @@ import tempfile
 
 import numpy as np
 import six
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.client import session as tf_session
@@ -84,37 +82,25 @@ def assert_close(expected, actual, rtol=1e-04, message='', name='assert_close'):
         name=scope)
 
 
-def create_checkpoint(weights_and_biases, global_step, model_dir, num_logits=1):
+def create_checkpoint(weights_and_biases, global_step, model_dir):
   """Create checkpoint file with provided model weights.
 
   Args:
     weights_and_biases: Iterable of tuples of weight and bias values.
     global_step: Initial global step to save in checkpoint.
     model_dir: Directory into which checkpoint is saved.
-    num_logits: Number of logits trailing in weights_and_biases.
   """
   weights, biases = zip(*weights_and_biases)
   model_weights = {}
 
   # Hidden layer weights.
-  for i in range(0, len(weights) - num_logits):
+  for i in range(0, len(weights) - 1):
     model_weights[HIDDEN_WEIGHTS_NAME_PATTERN % i] = weights[i]
     model_weights[HIDDEN_BIASES_NAME_PATTERN % i] = biases[i]
 
   # Output layer weights.
-  for logit_ind in xrange(num_logits):
-    # Iteration is reversed.
-    reverse_logit_ind = num_logits - logit_ind - 1
-    logits_weight_name = (
-        LOGITS_WEIGHTS_NAME if num_logits == 1
-        else LOGITS_WEIGHTS_NAME.replace(
-            'logits', 'logits_head_{}'.format(reverse_logit_ind)))
-    logits_bias_name = (
-        LOGITS_BIASES_NAME if num_logits == 1
-        else LOGITS_BIASES_NAME.replace(
-            'logits', 'logits_head_{}'.format(reverse_logit_ind)))
-    model_weights[logits_weight_name] = weights[-(logit_ind + 1)]
-    model_weights[logits_bias_name] = biases[-(logit_ind + 1)]
+  model_weights[LOGITS_WEIGHTS_NAME] = weights[-1]
+  model_weights[LOGITS_BIASES_NAME] = biases[-1]
 
   with ops.Graph().as_default():
     # Create model variables.
@@ -496,7 +482,7 @@ class BaseDNNLogitFnTest(object):
       shutil.rmtree(self._model_dir)
 
   def _test_logits(self, mode, hidden_units, logits_dimension, inputs,
-                   expected_logits, multi_logit=False):
+                   expected_logits):
     """Tests that the expected logits are calculated."""
     with ops.Graph().as_default():
       # Global step needed for MonitoredSession, which is in turn used to
@@ -522,12 +508,7 @@ class BaseDNNLogitFnTest(object):
             features={'age': constant_op.constant(inputs)}, mode=mode)
         with monitored_session.MonitoredTrainingSession(
             checkpoint_dir=self._model_dir) as sess:
-          if multi_logit:
-            for expected_logit, obtained_logit in zip(expected_logits,
-                                                      sess.run(logits)):
-              self.assertAllClose(expected_logit, obtained_logit)
-          else:
-            self.assertAllClose(expected_logits, sess.run(logits))
+          self.assertAllClose(expected_logits, sess.run(logits))
 
   def test_one_dim_logits(self):
     """Tests one-dimensional logits.
@@ -553,35 +534,6 @@ class BaseDNNLogitFnTest(object):
           inputs=[[10.]],
           expected_logits=[[-2.08]])
 
-  def test_multihead_logits(self):
-    """Tests returning list of logits for MultiHead case.
-
-    input_layer = [[10]]
-    hidden_layer_0 = [[relu(0.6*10 +0.1), relu(0.5*10 -0.1)]] = [[6.1, 4.9]]
-    hidden_layer_1 = [[relu(1*6.1 -0.8*4.9 +0.2), relu(0.8*6.1 -1*4.9 -0.1)]]
-                   = [[relu(2.38), relu(-0.12)]] = [[2.38, 0]]
-    logits_1 = [[-1*2.38 + 1*0 + 0.3]] = [[-2.08]]
-    logits_2 = [[-1*2.38 + 1*0 + 0.3, -2*2.38 + 2*0 + 0.5]] = [[-2.08, -4.26]]
-    """
-    base_global_step = 100
-    create_checkpoint(
-        (([[.6, .5]], [.1, -.1]), ([[1., .8], [-.8, -1.]], [.2, -.2]),
-         ([[-1.], [1.]], [.3]),  # First logit weights (1d head).
-         ([[-1., -2.], [1., 2.]], [.3, .5])),  # Second logit weights (2d head).
-        base_global_step,
-        self._model_dir, num_logits=2)
-    for mode in [
-        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
-        model_fn.ModeKeys.PREDICT
-    ]:
-      self._test_logits(
-          mode,
-          hidden_units=(2, 2),
-          logits_dimension=[1, 2],
-          inputs=[[10.]],
-          expected_logits=[[[-2.08]], [[-2.08, -4.26]]],
-          multi_logit=True)
-
   def test_multi_dim_logits(self):
     """Tests multi-dimensional logits.
 
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 18806db5ebea042acb3c88403af4986be012a656..fa5d02c4767f9c21e7d0a3a2dad917f3cbf22c02 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_lib
@@ -52,8 +53,12 @@ _REGRESS_SERVING_KEY = 'regression'
 _PREDICT_SERVING_KEY = 'predict'
 
 
-LossAndLabels = collections.namedtuple('LossAndLabels',
-                                       ['unweighted_loss', 'processed_labels'])
+# A LossSpec contains
+# * a scalar `Tensor` representing weighted, sum-reduced loss
+# * a scalar `Tensor` representing the sum of example weights
+# * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
+LossSpec = collections.namedtuple(
+    'LossSpec', ['weighted_sum_loss', 'example_weight_sum', 'processed_labels'])
 
 
 def _summary_key(head_name, val):
@@ -112,7 +117,7 @@ class _Head(object):
       update_op = tf.contrib.layers.optimize_loss(optimizer=sync,
                                                   loss=estimator_spec.loss, ...)
       hooks = [sync.make_session_run_hook(is_chief)]
-      ... upate train_op and hooks in EstimatorSpec and return
+      ... update train_op and hooks in EstimatorSpec and return
     ```
   """
   __metaclass__ = abc.ABCMeta
@@ -153,9 +158,13 @@ class _Head(object):
       labels: Labels `Tensor`, or `dict` of same.
 
     Returns:
-      A LossAndLabels that contains the `Tensor` representing the loss and
-      possibly processed labels (e.g. vocabulary lookup, shape manipulation,
-      etc.), to be extendable in the future.
+      A LossSpec that contains
+      * the scalar `Tensor` representing weighted, sum-reduced loss
+      * the scalar `Tensor` representing the sum of example weights
+      * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
+        etc.)
+
+      To be extendable in the future.
     """
     raise NotImplementedError('Calling an abstract method.')
 
@@ -168,7 +177,7 @@ class _Head(object):
     + All args must be passed via name.
 
     Args:
-      features: Input `dict` of `Tensor` objects.
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
       mode: Estimator's `ModeKeys`.
       logits: logits `Tensor` to be used by the head.
       labels: Labels `Tensor`, or `dict` of same.
@@ -184,46 +193,60 @@ class _Head(object):
     raise NotImplementedError('Calling an abstract method.')
 
 
-def _maybe_expand_dim(tensor):
-  """Expand the dim of `tensor` with static rank 1."""
-  with ops.name_scope(None, 'maybe_expand_dim', (tensor,)):
-    static_shape = tensor.shape
-    if static_shape is None:
-      return tensor
-
-    return (array_ops.expand_dims(tensor, -1) if static_shape.ndims == 1
-            else tensor)
+def _check_dense_labels_match_logits_and_reshape(
+    labels, logits, expected_labels_dimension):
+  """Checks that labels shape matches logits and reshapes if needed.
 
+  Consider logits of shape [D0, D1, ... DN, logits_dimension]. Then labels
+  shape must be [D0, D1, ... DN, expected_labels_dimension].
+  If expected_labels_dimension=1, labels could be [D0, D1, ... DN] and this
+  method reshapes them to [D0, D1, ... DN, 1].
 
-def _check_and_reshape_dense_labels(labels, expected_labels_dimension):
-  """Checks dense labels type and shape and reshapes to 2D Tensor."""
+  Args:
+    labels: labels Tensor.
+    logits: logits Tensor.
+    expected_labels_dimension: Integer.
+  Returns:
+    Validated and reshaped labels Tensor.
+  Raises:
+    ValueError: If labels is a SparseTensor.
+    ValueError: If labels shape is statically defined and fails validation.
+    OpError: If labels shape is not statically defined and fails validation.
+  """
   if labels is None:
     raise ValueError(
         'You must provide a labels Tensor. Given: None. '
         'Suggested troubleshooting steps: Check that your data contain '
         'your label feature. Check that your input_fn properly parses and '
         'returns labels.')
-  with ops.name_scope(None, 'labels', (labels,)) as scope:
+  with ops.name_scope(None, 'labels', (labels, logits)) as scope:
     labels = sparse_tensor.convert_to_tensor_or_sparse_tensor(labels)
     if isinstance(labels, sparse_tensor.SparseTensor):
       raise ValueError(
           'SparseTensor labels are not supported. '
-          'labels must be a Tensor of shape [batch_size, %s]. '
+          'labels must be a Tensor of shape [D0, D1, ..., DN, %s], '
+          'e.g. [batch_size, %s]. '
           'Suggested Fix (1): Check the label feature in your data. '
           'Each example must contain %s value(s). If not, your choice of label '
           'was probably incorrect. '
           'Suggested Fix (2): In your input_fn, use '
           'tf.sparse_tensor_to_dense() to turn labels into a Tensor.'
-          '' % (expected_labels_dimension, expected_labels_dimension))
-    labels = _maybe_expand_dim(labels)
+          '' % (expected_labels_dimension, expected_labels_dimension,
+                expected_labels_dimension))
+    if (labels.shape.ndims is not None and logits.shape.ndims is not None and
+        labels.shape.ndims == logits.shape.ndims - 1):
+      labels = array_ops.expand_dims(labels, -1)
     labels_shape = array_ops.shape(labels)
-    err_msg = 'labels shape must be [batch_size, {}]'.format(
-        expected_labels_dimension)
-    assert_rank = check_ops.assert_rank(labels, 2, message=err_msg)
+    logits_shape = array_ops.shape(logits)
+    err_msg = (
+        'labels shape must be [D0, D1, ... DN, {}]. '
+        'Suggested Fix: check your n_classes argument to the estimator '
+        'and/or the shape of your label.'.format(expected_labels_dimension))
+    assert_rank = check_ops.assert_rank_at_least(labels, 2, message=err_msg)
     with ops.control_dependencies([assert_rank]):
       static_shape = labels.shape
-      if static_shape is not None:
-        dim1 = static_shape[1]
+      if static_shape.ndims is not None:
+        dim1 = static_shape[-1]
         if (dim1 is not None) and (dim1 != expected_labels_dimension):
           raise ValueError(
               'Mismatched label shape. '
@@ -231,31 +254,113 @@ def _check_and_reshape_dense_labels(labels, expected_labels_dimension):
               'Suggested Fix: check your n_classes argument to the estimator '
               'and/or the shape of your label.' %
               (expected_labels_dimension, dim1))
+      expected_labels_shape = array_ops.concat(
+          [logits_shape[:-1], [expected_labels_dimension]], axis=0)
       assert_dimension = check_ops.assert_equal(
-          expected_labels_dimension, labels_shape[1], message=err_msg)
+          expected_labels_shape, labels_shape, message=err_msg,
+          data=['expected_labels_shape: ', expected_labels_shape,
+                'labels_shape: ', labels_shape])
       with ops.control_dependencies([assert_dimension]):
         return array_ops.identity(labels, name=scope)
 
 
-def _check_logits(logits, expected_logits_dimension):
-  """Check logits type and shape."""
+def _get_weights_and_check_match_logits(
+    features, weight_column, logits, allow_per_logit_weights=False):
+  """Fetches weights from features and checks that the shape matches logits.
+
+  Consider logits of shape [D0, D1, ... DN, logits_dimension]. Weights shape
+  can be either:
+  * [D0, D1, ... DN, logits_dimension] if `allow_per_logit_weights=True`.
+  * [D0, D1, ... DN, 1]
+  * [D0, D1, ... DN]: In this case, weights is reshaped into
+    [D0, D1, ... DN, 1] to work with weight broadcasting rules.
+
+  Args:
+    features: The features dict that contains weights.
+    weight_column: The weight column. If not given, this method returns 1.
+    logits: logits Tensor.
+    allow_per_logit_weights: Boolean. Whether we allow weights along the logits
+      dimension, namely shape `[D0, D1, ... DN, logits_dimension]`.
+  Returns:
+    Validated and reshaped weights Tensor.
+  Raises:
+    ValueError: If the weights `Tensor` cannot be cast into float.
+  """
+  if allow_per_logit_weights:
+    err_msg = (
+        'weights shape must be [D0, D1, ... DN], [D0, D1, ... DN, 1] or '
+        '[D0, D1, ... DN, logits_dimension]')
+  else:
+    err_msg = (
+        'weights shape must be [D0, D1, ... DN] or [D0, D1, ... DN, 1]')
+  with ops.name_scope(
+      None, 'weights',
+      values=tuple(six.itervalues(features)) + (logits,)) as scope:
+    # Fetch the weights.
+    if weight_column is None:
+      return 1.
+    if isinstance(weight_column, six.string_types):
+      weight_column = feature_column_lib.numeric_column(
+          key=weight_column, shape=(1,))
+    if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
+      raise TypeError('Weight column must be either a string or _NumericColumn.'
+                      ' Given type: {}.'.format(type(weight_column)))
+    weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
+        feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
+    if not (weights.dtype.is_floating or weights.dtype.is_integer):
+      raise ValueError('Weight column should be castable to float. '
+                       'Given dtype: {}'.format(weights.dtype))
+    weights = math_ops.to_float(weights, name='weights')
+
+    # Validate the weights shape.
+    weights_shape = array_ops.shape(weights, name='weights_shape')
+    logits_shape = array_ops.shape(logits, name='logits_shape')
+    if (weights.shape.ndims is not None and logits.shape.ndims is not None and
+        weights.shape.ndims == logits.shape.ndims - 1):
+      assert_dimension = check_ops.assert_equal(
+          logits_shape[:-1], weights_shape, message=err_msg,
+          data=['logits_shape: ', logits_shape,
+                'weights_shape: ', weights_shape])
+      with ops.control_dependencies([assert_dimension]):
+        return array_ops.expand_dims(weights, -1, name=scope)
+    supported_weights_shape = array_ops.concat([logits_shape[:-1], [1]], axis=0)
+    if allow_per_logit_weights:
+      condition = math_ops.reduce_any(
+          [math_ops.reduce_all(math_ops.equal(logits_shape, weights_shape)),
+           math_ops.reduce_all(math_ops.equal(
+               supported_weights_shape, weights_shape))])
+      assert_dimension = control_flow_ops.Assert(
+          condition=condition,
+          data=[err_msg, 'logits_shape: ', logits_shape,
+                'weights_shape: ', weights_shape])
+    else:
+      assert_dimension = check_ops.assert_equal(
+          supported_weights_shape, weights_shape, message=err_msg,
+          data=['logits_shape: ', logits_shape,
+                'weights_shape: ', weights_shape])
+    with ops.control_dependencies([assert_dimension]):
+      return array_ops.identity(weights, name=scope)
+
+
+def _check_logits_final_dim(logits, expected_logits_dimension):
+  """Checks that logits shape is [D0, D1, ... DN, logits_dimension]."""
   with ops.name_scope(None, 'logits', (logits,)) as scope:
     logits = math_ops.to_float(logits)
     logits_shape = array_ops.shape(logits)
-    assert_rank = check_ops.assert_rank(
+    assert_rank = check_ops.assert_rank_at_least(
         logits, 2, data=[logits_shape],
-        message='logits shape must be [batch_size, logits_dimension]')
+        message='logits shape must be [D0, D1, ... DN, logits_dimension]')
     with ops.control_dependencies([assert_rank]):
       static_shape = logits.shape
-      if static_shape is not None:
-        dim1 = static_shape[1]
-        if (dim1 is not None) and (dim1 != expected_logits_dimension):
+      if static_shape.ndims is not None and static_shape[-1] is not None:
+        if static_shape[-1] != expected_logits_dimension:
           raise ValueError(
-              'logits shape must be [batch_size, logits_dimension], got %s.' %
-              (static_shape,))
+              'logits shape must be [D0, D1, ... DN, logits_dimension], '
+              'got %s.' % (static_shape,))
+        return logits
       assert_dimension = check_ops.assert_equal(
-          expected_logits_dimension, logits_shape[1], data=[logits_shape],
-          message='logits shape must be [batch_size, logits_dimension]')
+          expected_logits_dimension, logits_shape[-1], data=[logits_shape],
+          message='logits shape must be [D0, D1, ... DN, logits_dimension]')
       with ops.control_dependencies([assert_dimension]):
         return array_ops.identity(logits, name=scope)
 
@@ -357,7 +462,20 @@ def _multi_class_head_with_softmax_cross_entropy_loss(n_classes,
                                                       name=None):
   """Creates a '_Head' for multi class classification.
 
-  This head expects to be fed integer labels specifying the class index.
+  The head expects `logits` with shape `[D0, D1, ... DN, n_classes]`.
+  In many applications, the shape is `[batch_size, n_classes]`.
+
+  `labels` must be a dense `Tensor` with shape matching `logits`, namely
+  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
+  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
+  `labels` must be an integer `Tensor` with values specifying the class index.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
+
+  The loss is the weighted sum over the input dimensions. Namely, if the input
+  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
+  `batch_size`.
 
   Args:
     n_classes: Number of classes, must be greater than 2 (for 2 classes, use
@@ -413,18 +531,25 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
   def logits_dimension(self):
     return self._n_classes
 
-  def _eval_metric_ops(self, labels, class_ids, weights, unweighted_loss):
+  def _eval_metric_ops(self, labels, class_ids, weights, weighted_sum_loss,
+                       example_weight_sum):
     """Returns the Eval metric ops."""
     with ops.name_scope(
         None, 'metrics',
-        (labels, class_ids, weights, unweighted_loss)):
+        (labels, class_ids, weights, weighted_sum_loss, example_weight_sum)):
       keys = metric_keys.MetricKeys
       metric_ops = {
           # Estimator already adds a metric for loss.
           # TODO(xiejw): Any other metrics?
           _summary_key(self._name, keys.LOSS_MEAN):
               metrics_lib.mean(
-                  unweighted_loss, weights=weights, name=keys.LOSS_MEAN),
+                  # Both values and weights here are reduced, scalar Tensors.
+                  # values is the actual mean we want -- weights represents the
+                  # total weight of the batch and is needed to calculate
+                  # update_op over many batches.
+                  values=(weighted_sum_loss / example_weight_sum),
+                  weights=example_weight_sum,
+                  name=keys.LOSS_MEAN),
           _summary_key(self._name, keys.ACCURACY):
               metrics_lib.accuracy(
                   labels=labels,
@@ -452,27 +577,55 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
 
   def create_loss(self, features, mode, logits, labels):
     """See `Head`."""
-    del mode, features  # Unused for this head.
-    label_ids = self._label_ids(_check_and_reshape_dense_labels(labels, 1))
+    del mode  # Unused for this head.
+    logits = ops.convert_to_tensor(logits)
+    labels = _check_dense_labels_match_logits_and_reshape(
+        labels=labels, logits=logits, expected_labels_dimension=1)
+    label_ids = self._label_ids(labels)
     unweighted_loss = losses.sparse_softmax_cross_entropy(
         labels=label_ids, logits=logits, reduction=losses.Reduction.NONE)
     # Restore the squeezed dim, so unweighted_loss matches the weights shape.
-    return LossAndLabels(
-        unweighted_loss=array_ops.expand_dims(unweighted_loss, axis=(1,)),
+    unweighted_loss = array_ops.expand_dims(unweighted_loss, axis=-1)
+    weights = _get_weights_and_check_match_logits(
+        features=features, weight_column=self._weight_column, logits=logits)
+    weighted_sum_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+    # _weights() can return 1.
+    example_weight_sum = math_ops.reduce_sum(
+        weights * array_ops.ones_like(unweighted_loss))
+    return LossSpec(
+        weighted_sum_loss=weighted_sum_loss,
+        example_weight_sum=example_weight_sum,
         processed_labels=label_ids)
 
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
-    """See `Head`."""
+    """Returns an `EstimatorSpec`.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` with shape `[D0, D1, ... DN, logits_dimension]`.
+        For many applications, the shape is `[batch_size, logits_dimension]`.
+      labels: Labels integer or string `Tensor` with shape matching `logits`,
+        namely `[D0, D1, ... DN, 1]`. `labels` is required argument when `mode`
+        equals `TRAIN` or `EVAL`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns
+        `train_op`. Required in TRAIN mode.
+    Returns:
+      `EstimatorSpec`.
+    Raises:
+      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+    """
     with ops.name_scope(self._name, 'head'):
-      logits = _check_logits(logits, self.logits_dimension)
+      logits = _check_logits_final_dim(logits, self.logits_dimension)
 
       # Predict.
       pred_keys = prediction_keys.PredictionKeys
       with ops.name_scope(None, 'predictions', (logits,)):
-        # class_ids's shape is [batch_size]
-        class_ids = math_ops.argmax(logits, 1, name=pred_keys.CLASS_IDS)
-        class_ids = array_ops.expand_dims(class_ids, axis=(1,))
+        # class_ids's shape is [D0, D1, ... DN].
+        class_ids = math_ops.argmax(logits, axis=-1, name=pred_keys.CLASS_IDS)
+        class_ids = array_ops.expand_dims(class_ids, axis=-1)
         if self._label_vocabulary:
           table = lookup_ops.index_to_string_table_from_tensor(
               vocabulary_list=self._label_vocabulary,
@@ -502,22 +655,20 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
                 _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
             })
 
-      # Eval.
-      unweighted_loss, label_ids = self.create_loss(
+      weighted_sum_loss, example_weight_sum, label_ids = self.create_loss(
           features=features, mode=mode, logits=logits, labels=labels)
-      weights = _weights(features, self._weight_column)
-      training_loss = losses.compute_weighted_loss(
-          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+      # Eval.
       if mode == model_fn.ModeKeys.EVAL:
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=weighted_sum_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=label_ids,
                 class_ids=class_ids,
-                unweighted_loss=unweighted_loss,
-                weights=weights))
+                weights=_weights(features, self._weight_column),
+                weighted_sum_loss=weighted_sum_loss,
+                example_weight_sum=example_weight_sum))
 
       # Train.
       if train_op_fn is None:
@@ -525,26 +676,37 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
     with ops.name_scope(''):
       summary.scalar(
           _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          training_loss)
+          weighted_sum_loss)
       summary.scalar(
           _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
+          weighted_sum_loss / example_weight_sum)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=weighted_sum_loss,
+        train_op=train_op_fn(weighted_sum_loss))
 
 
 def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
     weight_column=None, thresholds=None, label_vocabulary=None, name=None):
-  """Creates a `Head` for single label binary classification.
+  """Creates a `_Head` for single label binary classification.
 
   This head uses `sigmoid_cross_entropy_with_logits` loss.
 
-  This head expects to be fed float labels of shape `(batch_size, 1)`.
+  The head expects `logits` with shape `[D0, D1, ... DN, 1]`.
+  In many applications, the shape is `[batch_size, 1]`.
+
+  `labels` must be a dense `Tensor` with shape matching `logits`, namely
+  `[D0, D1, ... DN, 1]`. If `label_vocabulary` given, `labels` must be a string
+  `Tensor` with values from the vocabulary. If `label_vocabulary` is not given,
+  `labels` must be float `Tensor` with values in the interval `[0, 1]`.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, or `[D0, D1, ... DN, 1]`.
+
+  The loss is the weighted sum over the input dimensions. Namely, if the input
+  labels have shape `[batch_size, 1]`, the loss is the weighted sum over
+  `batch_size`.
 
   Args:
     weight_column: A string or a `_NumericColumn` created by
@@ -565,7 +727,7 @@ def _binary_logistic_head_with_sigmoid_cross_entropy_loss(
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
 
   Returns:
-    An instance of `Head` for binary classification.
+    An instance of `_Head` for binary classification.
 
   Raises:
     ValueError: if `thresholds` contains a value outside of `(0, 1)`.
@@ -608,16 +770,11 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
   def logits_dimension(self):
     return 1
 
-  def _eval_metric_ops(self,
-                       labels,
-                       logits,
-                       logistic,
-                       class_ids,
-                       unweighted_loss,
-                       weights=None):
-    with ops.name_scope(
-        None, 'metrics',
-        (labels, logits, logistic, class_ids, unweighted_loss, weights)):
+  def _eval_metric_ops(self, labels, logits, logistic, class_ids, weights,
+                       weighted_sum_loss, example_weight_sum):
+    with ops.name_scope(None, 'metrics',
+                        (labels, logits, logistic, class_ids, weights,
+                         weighted_sum_loss, example_weight_sum)):
       keys = metric_keys.MetricKeys
       labels_mean = _indicator_labels_mean(
           labels=labels, weights=weights, name=keys.LABEL_MEAN)
@@ -625,7 +782,13 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
           # Estimator already adds a metric for loss.
           _summary_key(self._name, keys.LOSS_MEAN):
               metrics_lib.mean(
-                  unweighted_loss, weights=weights, name=keys.LOSS_MEAN),
+                  # Both values and weights here are reduced, scalar Tensors.
+                  # values is the actual mean we want -- weights represents the
+                  # total weight of the batch and is needed to calculate
+                  # update_op over many batches.
+                  values=(weighted_sum_loss / example_weight_sum),
+                  weights=example_weight_sum,
+                  name=keys.LOSS_MEAN),
           _summary_key(self._name, keys.ACCURACY):
               metrics_lib.accuracy(
                   labels=labels,
@@ -686,17 +849,28 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
 
   def create_loss(self, features, mode, logits, labels):
     """See `Head`."""
-    del mode, features  # Unused for this head.
-    labels = _check_and_reshape_dense_labels(labels, self.logits_dimension)
+    del mode  # Unused for this head.
+    logits = ops.convert_to_tensor(logits)
+    labels = _check_dense_labels_match_logits_and_reshape(
+        labels=labels, logits=logits, expected_labels_dimension=1)
     if self._label_vocabulary is not None:
       labels = lookup_ops.index_table_from_tensor(
           vocabulary_list=tuple(self._label_vocabulary),
           name='class_id_lookup').lookup(labels)
     labels = math_ops.to_float(labels)
     labels = _assert_range(labels, 2)
-    return LossAndLabels(
-        unweighted_loss=nn.sigmoid_cross_entropy_with_logits(
-            labels=labels, logits=logits),
+    unweighted_loss = nn.sigmoid_cross_entropy_with_logits(
+        labels=labels, logits=logits)
+    weights = _get_weights_and_check_match_logits(
+        features=features, weight_column=self._weight_column, logits=logits)
+    weighted_sum_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+    # _weights() can return 1.
+    example_weight_sum = math_ops.reduce_sum(
+        weights * array_ops.ones_like(unweighted_loss))
+    return LossSpec(
+        weighted_sum_loss=weighted_sum_loss,
+        example_weight_sum=example_weight_sum,
         processed_labels=labels)
 
   def create_estimator_spec(
@@ -706,14 +880,16 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
     with ops.name_scope(self._name, 'head'):
       with ops.name_scope(None, 'predictions', (logits,)):
         pred_keys = prediction_keys.PredictionKeys
-        logits = _check_logits(logits, self.logits_dimension)
+        logits = _check_logits_final_dim(logits, self.logits_dimension)
         logistic = math_ops.sigmoid(logits, name=pred_keys.LOGISTIC)
         two_class_logits = array_ops.concat(
-            (array_ops.zeros_like(logits), logits), 1, name='two_class_logits')
+            (array_ops.zeros_like(logits), logits),
+            axis=-1, name='two_class_logits')
         probabilities = nn.softmax(
             two_class_logits, name=pred_keys.PROBABILITIES)
-        class_ids = array_ops.reshape(
-            math_ops.argmax(two_class_logits, axis=1), (-1, 1), name='classes')
+        class_ids = math_ops.argmax(
+            two_class_logits, axis=-1, name=pred_keys.CLASS_IDS)
+        class_ids = array_ops.expand_dims(class_ids, axis=-1)
         if self._label_vocabulary:
           table = lookup_ops.index_to_string_table_from_tensor(
               vocabulary_list=self._label_vocabulary,
@@ -743,24 +919,26 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
                 _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
             })
 
+      (weighted_sum_loss, example_weight_sum,
+       processed_labels) = self.create_loss(
+           features=features, mode=mode, logits=logits, labels=labels)
+
       # Eval.
-      unweighted_loss, processed_labels = self.create_loss(
-          features=features, mode=mode, logits=logits, labels=labels)
-      weights = _weights(features, self._weight_column)
-      training_loss = losses.compute_weighted_loss(
-          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
       if mode == model_fn.ModeKeys.EVAL:
+        weights = _get_weights_and_check_match_logits(
+            features=features, weight_column=self._weight_column, logits=logits)
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=weighted_sum_loss,
             eval_metric_ops=self._eval_metric_ops(
                 labels=processed_labels,
                 logits=logits,
                 logistic=logistic,
                 class_ids=class_ids,
-                unweighted_loss=unweighted_loss,
-                weights=weights))
+                weights=weights,
+                weighted_sum_loss=weighted_sum_loss,
+                example_weight_sum=example_weight_sum))
 
       # Train.
       if train_op_fn is None:
@@ -768,23 +946,36 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
     with ops.name_scope(''):
       summary.scalar(
           _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          training_loss)
+          weighted_sum_loss)
       summary.scalar(
           _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
+          weighted_sum_loss / example_weight_sum)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=weighted_sum_loss,
+        train_op=train_op_fn(weighted_sum_loss))
 
 
 def _regression_head_with_mean_squared_error_loss(weight_column=None,
                                                   label_dimension=1,
                                                   name=None):
-  """Creates a `_Head` for regression using the mean squared loss.
+  """Creates a `_Head` for regression using the `mean_squared_error` loss.
+
+  The loss is the weighted sum over all input dimensions. Namely, if the input
+  labels have shape `[batch_size, label_dimension]`, the loss is the weighted
+  sum over both `batch_size` and `label_dimension`.
+
+  The head expects `logits` with shape `[D0, D1, ... DN, label_dimension]`.
+  In many applications, the shape is `[batch_size, label_dimension]`.
+
+  The `labels` shape must match `logits`, namely
+  `[D0, D1, ... DN, label_dimension]`. If `label_dimension=1`, shape
+  `[D0, D1, ... DN]` is also supported.
+
+  If `weight_column` is specified, weights must be of shape
+  `[D0, D1, ... DN]`, `[D0, D1, ... DN, 1]` or
+  `[D0, D1, ... DN, label_dimension]`.
 
   Args:
     weight_column: A string or a `_NumericColumn` created by
@@ -827,20 +1018,50 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
 
   def create_loss(self, features, mode, logits, labels):
     """See `Head`."""
-    del mode, features  # Unused for this head.
-    labels = _check_and_reshape_dense_labels(labels, self._logits_dimension)
+    del mode  # Unused for this head.
+    logits = ops.convert_to_tensor(logits)
+    labels = _check_dense_labels_match_logits_and_reshape(
+        labels=labels, logits=logits,
+        expected_labels_dimension=self._logits_dimension)
     labels = math_ops.to_float(labels)
-    return LossAndLabels(
-        unweighted_loss=losses.mean_squared_error(
-            labels=labels, predictions=logits, reduction=losses.Reduction.NONE),
+    unweighted_loss = losses.mean_squared_error(
+        labels=labels, predictions=logits, reduction=losses.Reduction.NONE)
+    weights = _get_weights_and_check_match_logits(
+        features=features, weight_column=self._weight_column, logits=logits,
+        allow_per_logit_weights=True)
+    weighted_sum_loss = losses.compute_weighted_loss(
+        unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+    # _weights() can return 1.
+    example_weight_sum = math_ops.reduce_sum(
+        weights * array_ops.ones_like(unweighted_loss))
+    return LossSpec(
+        weighted_sum_loss=weighted_sum_loss,
+        example_weight_sum=example_weight_sum,
         processed_labels=labels)
 
   def create_estimator_spec(
       self, features, mode, logits, labels=None, train_op_fn=None):
-    """See `Head`."""
+    """Returns an `EstimatorSpec`.
+
+    Args:
+      features: Input `dict` of `Tensor` or `SparseTensor` objects.
+      mode: Estimator's `ModeKeys`.
+      logits: logits `Tensor` with shape `[D0, D1, ... DN, logits_dimension]`.
+        For many applications, the shape is `[batch_size, logits_dimension]`.
+      labels: Labels `Tensor` with shape matching `logits`, namely
+        `[D0, D1, ... DN, logits_dimension]`. When `logits_dimension=1`, shape
+        `[D0, D1, ... DN]` is also supported. `labels` is required argument when
+        `mode` equals `TRAIN` or `EVAL`.
+      train_op_fn: Function that takes a scalar loss `Tensor` and returns
+        `train_op`. Required in TRAIN mode.
+    Returns:
+      `EstimatorSpec`.
+    Raises:
+      ValueError: If `train_op_fn` is `None` in TRAIN mode.
+    """
     # Predict.
     with ops.name_scope(self._name, 'head'):
-      logits = _check_logits(logits, self._logits_dimension)
+      logits = _check_logits_final_dim(logits, self._logits_dimension)
       predictions = {prediction_keys.PredictionKeys.PREDICTIONS: logits}
       if mode == model_fn.ModeKeys.PREDICT:
         regression_output = export_output.RegressionOutput(value=logits)
@@ -853,22 +1074,26 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
                 _PREDICT_SERVING_KEY: export_output.PredictOutput(predictions)
             })
 
-      # Eval.
-      unweighted_loss, _ = self.create_loss(
+      weighted_sum_loss, example_weight_sum, _ = self.create_loss(
           features=features, mode=mode, logits=logits, labels=labels)
-      weights = _weights(features, self._weight_column)
-      training_loss = losses.compute_weighted_loss(
-          unweighted_loss, weights=weights, reduction=losses.Reduction.SUM)
+
+      # Eval.
       if mode == model_fn.ModeKeys.EVAL:
         # Estimator already adds a metric for loss.
         eval_metric_ops = {
-            metric_keys.MetricKeys.LOSS_MEAN: metrics_lib.mean(
-                unweighted_loss, weights=weights)
+            _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN):
+                metrics_lib.mean(
+                    # Both values and weights here are reduced, scalar Tensors.
+                    # values is the actual mean we want -- weights represents
+                    # the total weight of the batch and is needed to calculate
+                    # update_op over many batches.
+                    values=(weighted_sum_loss / example_weight_sum),
+                    weights=example_weight_sum)
         }
         return model_fn.EstimatorSpec(
             mode=model_fn.ModeKeys.EVAL,
             predictions=predictions,
-            loss=training_loss,
+            loss=weighted_sum_loss,
             eval_metric_ops=eval_metric_ops)
 
       # Train.
@@ -877,38 +1102,38 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
     with ops.name_scope(''):
       summary.scalar(
           _summary_key(self._name, metric_keys.MetricKeys.LOSS),
-          training_loss)
+          weighted_sum_loss)
       summary.scalar(
           _summary_key(self._name, metric_keys.MetricKeys.LOSS_MEAN),
-          losses.compute_weighted_loss(
-              unweighted_loss, weights=weights,
-              reduction=losses.Reduction.MEAN))
+          weighted_sum_loss / example_weight_sum)
     return model_fn.EstimatorSpec(
         mode=model_fn.ModeKeys.TRAIN,
         predictions=predictions,
-        loss=training_loss,
-        train_op=train_op_fn(training_loss))
+        loss=weighted_sum_loss,
+        train_op=train_op_fn(weighted_sum_loss))
 
 
-def _assert_range(labels, n_classes):
+def _assert_range(labels, n_classes, message=None):
   with ops.name_scope(None, 'assert_range', (labels,)):
     assert_less = check_ops.assert_less(
         labels,
         ops.convert_to_tensor(n_classes, dtype=labels.dtype),
-        message='Label IDs must < n_classes')
+        message=message or 'Label IDs must < n_classes')
     assert_greater = check_ops.assert_non_negative(
-        labels, message='Label IDs must >= 0')
+        labels, message=message or 'Label IDs must >= 0')
     with ops.control_dependencies((assert_less, assert_greater)):
       return array_ops.identity(labels)
 
 
+# TODO(b/69000400): Delete this method.
 def _weights(features, weight_column):
   """Fetches weights from features."""
   with ops.name_scope(None, 'weights', values=features.values()):
     if weight_column is None:
       return 1.
     if isinstance(weight_column, six.string_types):
-      weight_column = feature_column_lib.numeric_column(key=weight_column)
+      weight_column = feature_column_lib.numeric_column(
+          key=weight_column, shape=(1,))
     if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
       raise TypeError('Weight column must be either a string or _NumericColumn.'
                       ' Given type: {}.'.format(type(weight_column)))
@@ -917,5 +1142,4 @@ def _weights(features, weight_column):
     if not (weights.dtype.is_floating or weights.dtype.is_integer):
       raise ValueError('Weight column should be castable to float. '
                        'Given dtype: {}'.format(weights.dtype))
-    weights = _maybe_expand_dim(math_ops.to_float(weights, name='weights'))
-    return weights
+    return math_ops.to_float(weights, name='weights')
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index 22f27a8d5a0ea08e532c57ceb61d689b46cb9ec7..f3afd84125d8758fec61d9afc08a64a0210c1f6d 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -149,14 +149,16 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
-        unweighted_loss.eval({
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[2 2\]'):
+        weighted_sum_loss.eval({
             logits_placeholder: logits_2x3,
             labels_placeholder: labels_2x2
         })
@@ -201,21 +203,21 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
 
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42.,),))},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
       with self.assertRaisesOpError('Label IDs must < n_classes'):
-        unweighted_loss.eval({
+        weighted_sum_loss.eval({
             labels_placeholder: labels_2x1_with_large_id,
             logits_placeholder: logits_2x3
         })
 
     with self.test_session():
       with self.assertRaisesOpError('Label IDs must >= 0'):
-        unweighted_loss.eval({
+        weighted_sum_loss.eval({
             labels_placeholder: labels_2x1_with_negative_id,
             logits_placeholder: logits_2x3
         })
@@ -262,16 +264,16 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.int64)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
       with self.assertRaisesRegexp(
-          errors.OpError,
-          'logits and labels must have the same first dimension'):
-        unweighted_loss.eval({
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[3 1\]'):
+        weighted_sum_loss.eval({
             labels_placeholder: values_3x1,
             logits_placeholder: values_2x3
         })
@@ -381,17 +383,20 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = np.array(((1,), (1,)), dtype=np.int64)
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_unreduced_loss = np.array(((10.0,), (0,),))
+    expected_weighted_sum_loss = 10.
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -479,16 +484,19 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = [[b'iroh'], [b'iroh']]
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_unreduced_loss = np.array(((10.0,), (0,),))
-    unweighted_loss, _ = head.create_loss(
+    expected_weighted_sum_loss = 10.
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_eval_with_label_vocabulary(self):
     n_classes = 3
@@ -584,16 +592,19 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     features = {'x': np.array(((42,),), dtype=np.int32)}
 
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_unreduced_loss = np.array(((10.0,), (0,),))
-    unweighted_loss, _ = head.create_loss(
+    expected_weighted_sum_loss = 10.
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -705,8 +716,11 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     }
 
     # loss = cross_entropy(labels, logits) = [10, 10, 0].
-    expected_unreduced_loss = np.array(((10.0,), (10.0,), (0.0,),))
-    unweighted_loss, _ = head.create_loss(
+    # weighted sum loss = 1 * 10 + 2 * 10 + 3 * 0 = 30.
+    expected_weighted_sum_loss = 30.
+    # example weight sum = 1 + 2 + 3
+    expected_example_weight_sum = 6.
+    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -714,7 +728,15 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
+      self.assertAllClose(
+          expected_example_weight_sum,
+          example_weight_sum.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_train_with_one_dim_label_and_weights(self):
     n_classes = 3
@@ -781,16 +803,19 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
     labels = [[b'iroh'], [b'iroh']]
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # loss = cross_entropy(labels, logits) = [10, 0].
-    expected_unreduced_loss = np.array(((10.0,), (0,),))
-    unweighted_loss, _ = head.create_loss(
+    expected_weighted_sum_loss = 10.
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_train_with_vocabulary(self):
     n_classes = 3
@@ -874,6 +899,158 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
               expected_loss / np.sum(weights_3x1),
       }, summary_str, tol)
 
+  def test_multi_dim_weighted_train_create_loss(self):
+    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, weight_column='weights')
+
+    logits = np.array([[[10, 0, 0], [12, 0, 0]],
+                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
+    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+
+    # loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
+    # weighted_sum_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
+    expected_weighted_sum_loss = 55.5
+    expected_example_weight_sum = np.sum(weights)
+    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(
+          expected_weighted_sum_loss, weighted_sum_loss.eval(),
+          rtol=1e-2, atol=1e-2)
+      self.assertAllClose(
+          expected_example_weight_sum, example_weight_sum.eval())
+
+  def test_multi_dim_weighted_train(self):
+    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, weight_column='weights')
+
+    logits = np.array([[[10, 0, 0], [12, 0, 0]],
+                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
+    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    expected_train_result = 'my_train_op'
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=2)])
+
+    # loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
+    # weighted_sum_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
+    expected_loss = 55.5
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
+  def test_multi_dim_train_weights_wrong_inner_dim(self):
+    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 1]."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, weight_column='weights')
+    logits = np.array([[[10, 0, 0], [12, 0, 0]],
+                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
+    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
+    weights = np.array([[1.], [2.]], dtype=np.float32)
+    def _no_op_train_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_no_op_train_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
+        spec.loss.eval()
+
+  def test_multi_dim_train_weights_wrong_outer_dim(self):
+    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2, 3]."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, weight_column='weights')
+    logits = np.array([[[10, 0, 0], [12, 0, 0]],
+                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
+    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
+    weights = np.array([[[1., 1.1, 1.2], [1.5, 1.6, 1.7]],
+                        [[2., 2.1, 2.2], [2.5, 2.6, 2.7]]])
+    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    def _no_op_train_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights_placeholder},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_no_op_train_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \]\s\[2 2 3\]\s\[weights_shape: \]\s\[2 2 3\]'):
+        spec.loss.eval({weights_placeholder: weights})
+
+  def test_multi_dim_weighted_eval(self):
+    """Logits of shape [2, 2, 2], labels [2, 2, 1], weights [2, 2]."""
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
+        n_classes=3, weight_column='weights')
+    logits = np.array([[[10, 0, 0], [12, 0, 0]],
+                       [[0, 10, 0], [0, 15, 0]]], dtype=np.float32)
+    labels = np.array([[[0], [1]], [[1], [2]]], dtype=np.int64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = cross_entropy(labels, logits) = [[0, 12], [0, 15]].
+    # weighted_sum_loss = 1*0 + 1.5*12 + 2*0 + 2.5*15 = 55.5
+    expected_loss = 55.5
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_loss / np.sum(weights),
+        keys.ACCURACY: (1.*1. + 1.5*0. + 2.*1. + 2.5*0.) / np.sum(weights),
+    }
+
+    # Assert predictions, loss, and metrics.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      loss, metrics = sess.run((spec.loss, update_ops))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
+          rtol=tol, atol=tol)
+
 
 class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
 
@@ -935,14 +1112,16 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features={'x': np.array(((42.,),))},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
-        unweighted_loss.eval({
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[2 2\]'):
+        weighted_sum_loss.eval({
             logits_placeholder: logits_2x1,
             labels_placeholder: labels_2x2
         })
@@ -974,20 +1153,24 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     # Dynamic shape.
     labels_placeholder = array_ops.placeholder(dtype=dtypes.float32)
     logits_placeholder = array_ops.placeholder(dtype=dtypes.float32)
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features={'x': values_2x1},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-        unweighted_loss.eval({
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[3 1\] \[labels_shape: \] \[2 1\]'):
+        weighted_sum_loss.eval({
             labels_placeholder: values_2x1,
             logits_placeholder: values_3x1
         })
     with self.test_session():
-      with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-        unweighted_loss.eval({
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[2 1\] \[labels_shape: \] \[3 1\]'):
+        weighted_sum_loss.eval({
             labels_placeholder: values_3x1,
             logits_placeholder: values_2x1
         })
@@ -1071,17 +1254,20 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     features = {'x': np.array(((42,),), dtype=np.int32)}
 
     # loss = cross_entropy(labels, logits) = [0, 41].
-    expected_unreduced_loss = np.array(((0.,), (41.,),))
+    expected_weighted_sum_loss = 41.
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -1172,14 +1358,14 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     labels = [[b'iroh'], [b'iroh']]
     features = {'x': np.array(((42,),), dtype=np.int32)}
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(np.array(((0.,), (41.,),)), unweighted_loss.eval())
+      self.assertAllClose(41., weighted_sum_loss.eval())
 
   def test_eval_with_vocabulary_list(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
@@ -1214,17 +1400,21 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     # probabilities = [1/(1 + exp(1)), 1/(1 + exp(-1))] = [0.269, 0.731]
     # loss = -ln(probabilities[label[i]])) = [-ln(0.269), -ln(0.731)]
     #      = [1.31304389, 0.31334182]
-    expected_unreduced_loss = np.array(((1.31304389,), (0.31334182,),))
+    # weighted sum loss = 1.62638571
+    expected_weighted_sum_loss = 1.62638571
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_eval_with_thresholds(self):
     thresholds = [0.25, 0.5, 0.75]
@@ -1288,16 +1478,16 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     labels = np.array(((1,), (1,),), dtype=np.float64)
     features = {'x': np.array(((42,),), dtype=np.float32)}
     # loss = cross_entropy(labels, logits) = [0, 41].
-    expected_unreduced_loss = np.array(((0.,), (41.,),))
+    expected_weighted_sum_loss = 41.
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_unreduced_loss, unweighted_loss.eval())
+      self.assertAllClose(expected_weighted_sum_loss, weighted_sum_loss.eval())
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -1407,17 +1597,21 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     #      = [-0.8 * log(sigmoid(0.5)) -0.2 * log(sigmoid(-0.5)),
     #         -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))]
     #      = [0.57407698418, 0.67435524446]
-    expected_unreduced_loss = np.array(((0.57407698418,), (0.67435524446,),))
+    # weighted sum loss = 0.57407698418 + 0.67435524446
+    expected_weighted_sum_loss = 1.24843222864
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_float_labels_train(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
@@ -1463,17 +1657,21 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     #      = [-0.8 * log(sigmoid(0.5)) -0.2 * log(sigmoid(-0.5)),
     #         -0.4 * log(sigmoid(-0.3)) -0.6 * log(sigmoid(0.3))]
     #      = [0.57407698418, 0.67435524446]
-    expected_unreduced_loss = np.array(((0.57407698418,), (0.67435524446,),))
+    # weighted sum loss = 0.57407698418 + 0.67435524446
+    expected_weighted_sum_loss = 1.24843222864
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_float_labels_eval(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
@@ -1606,9 +1804,12 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
         'label_weights': weights_rank_1,
     }
     # losses = cross_entropy(labels, logits) = [0, 41, 44]
-    expected_unreduced_loss = np.array(((0.,), (41,), (44.,),))
+    # weighted sum loss = 1 * 0 + .1 * 41 + 1.5 * 44
+    expected_weighted_sum_loss = 70.1
+    # example weight sum = 1 + 0.1 + 1.5
+    expected_example_weight_sum = 2.6
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
@@ -1616,7 +1817,15 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       self.assertAllClose(
-          expected_unreduced_loss, unweighted_loss.eval(), rtol=1e-2, atol=1e-2)
+          expected_weighted_sum_loss,
+          weighted_sum_loss.eval(),
+          rtol=1e-2,
+          atol=1e-2)
+      self.assertAllClose(
+          expected_example_weight_sum,
+          example_weight_sum.eval(),
+          rtol=1e-2,
+          atol=1e-2)
 
   def test_train_with_one_dim_labels_and_weights(self):
     """3 examples, 1 batch."""
@@ -1716,6 +1925,165 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
           metric_keys.MetricKeys.LOSS_MEAN: 26.9615384615,
       }, summary_str)
 
+  def test_multi_dim_weighted_train_create_loss(self):
+    """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column='weights')
+
+    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
+    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
+    # weighted_sum_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
+    expected_weighted_sum_loss = 40.
+    expected_example_weight_sum = np.sum(weights)
+    # Create loss.
+    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    tol = 1e-2
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(
+          expected_weighted_sum_loss, weighted_sum_loss.eval(),
+          rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_example_weight_sum, example_weight_sum.eval())
+
+  def test_multi_dim_weighted_train(self):
+    """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column='weights')
+
+    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
+    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
+    # weighted_sum_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
+    expected_loss = 40.
+    expected_train_result = 'my_train_op'
+    def _train_op_fn(loss):
+      return string_ops.string_join(
+          [constant_op.constant(expected_train_result),
+           string_ops.as_string(loss, precision=2)])
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+
+    # Assert predictions, loss, train_op, and summaries.
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      loss, train_result = sess.run((spec.loss, spec.train_op))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      self.assertEqual(
+          six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
+          train_result)
+
+  def test_multi_dim_train_weights_wrong_inner_dim(self):
+    """Logits and labels of shape [2, 2, 1], weights [2, 1]."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column='weights')
+
+    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
+    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
+    weights = np.array([[1.], [2.]], dtype=np.float32)
+    def _no_op_train_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_no_op_train_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \] \[2 2 1\] \[weights_shape: \] \[2 1\]'):
+        spec.loss.eval()
+
+  def test_multi_dim_train_weights_wrong_outer_dim(self):
+    """Logits and labels of shape [2, 2, 1], weights [2, 2, 2]."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column='weights')
+
+    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
+    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
+    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    def _no_op_train_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features={'weights': weights_placeholder},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_no_op_train_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \]\s\[2 2 1\]\s\[weights_shape: \]\s\[2 2 2\]'):
+        spec.loss.eval({
+            weights_placeholder: np.array([[[1., 1.1], [1.5, 1.6]],
+                                           [[2., 2.1], [2.5, 2.6]]])})
+
+  def test_multi_dim_weighted_eval(self):
+    """Logits and labels of shape [2, 2, 1], weights [2, 2]."""
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column='weights')
+
+    logits = np.array([[[10], [-10]], [[12], [-12]]], dtype=np.float32)
+    labels = np.array([[[0], [0]], [[1], [1]]], dtype=np.float64)
+    weights = np.array([[1., 1.5], [2., 2.5]], dtype=np.float32)
+    # loss = cross_entropy(labels, logits) = [[10, 0], [0, 12]].
+    # weighted_sum_loss = 1*10 + 1.5*0 + 2*0 + 2.5*12 = 40
+    expected_loss = 40.
+
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features={'weights': weights},
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)
+
+    keys = metric_keys.MetricKeys
+    expected_metrics = {
+        keys.LOSS_MEAN: expected_loss / np.sum(weights),
+        keys.ACCURACY: (1.*0. + 1.5*1. + 2.*1. + 2.5*0.) / np.sum(weights),
+        keys.PREDICTION_MEAN: (1.*1 + 1.5*0 + 2.*1 + 2.5*0) / np.sum(weights),
+        keys.LABEL_MEAN: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
+        keys.ACCURACY_BASELINE: (1.*0 + 1.5*0 + 2.*1 + 2.5*1) / np.sum(weights),
+        # We cannot reliably calculate AUC with only 4 data points, but the
+        # values should not change because of backwards-compatibility.
+        keys.AUC: 0.5222,
+        keys.AUC_PR: 0.7341,
+    }
+
+    tol = 1e-2
+    with self.test_session() as sess:
+      _initialize_variables(self, spec.scaffold)
+      value_ops = {k: spec.eval_metric_ops[k][0] for k in spec.eval_metric_ops}
+      update_ops = {k: spec.eval_metric_ops[k][1] for k in spec.eval_metric_ops}
+      loss, metrics = sess.run((spec.loss, update_ops))
+      self.assertAllClose(expected_loss, loss, rtol=tol, atol=tol)
+      # Check results of both update (in `metrics`) and value ops.
+      self.assertAllClose(expected_metrics, metrics, rtol=tol, atol=tol)
+      self.assertAllClose(
+          expected_metrics, {k: value_ops[k].eval() for k in value_ops},
+          rtol=tol, atol=tol)
+
 
 class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
 
@@ -1786,14 +2154,16 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
             labels_placeholder: values_3d,
             logits_placeholder: values_1d
         })
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features={'x': values_1d},
         mode=model_fn.ModeKeys.EVAL,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
-        unweighted_loss.eval({
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[2 3\] \[labels_shape: \] \[2 1\]'):
+        weighted_sum_loss.eval({
             labels_placeholder: values_1d,
             logits_placeholder: values_3d
         })
@@ -1836,14 +2206,16 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
             labels_placeholder: values_3d,
             logits_placeholder: values_1d
         })
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features={'x': values_1d},
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits_placeholder,
-        labels=labels_placeholder)
+        labels=labels_placeholder)[0]
     with self.test_session():
-      with self.assertRaisesRegexp(errors.OpError, 'labels shape'):
-        unweighted_loss.eval({
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[expected_labels_shape: \] \[2 3\] \[labels_shape: \] \[2 1\]'):
+        weighted_sum_loss.eval({
             labels_placeholder: values_1d,
             logits_placeholder: values_3d
         })
@@ -1889,15 +2261,15 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     labels = np.array(((43,), (44,),), dtype=np.int32)
     features = {'x': np.array(((42,),), dtype=np.float32)}
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(43-45)^2, (44-41)] = [4, 9]
-      self.assertAllClose(np.array(((4.,), (9.,),)), unweighted_loss.eval())
+      self.assertAllClose(13., weighted_sum_loss.eval())
 
   def test_eval_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -1953,21 +2325,39 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       self.assertAllClose(expected_loss_mean, loss_mean)
       self.assertAllClose(expected_loss_mean, loss_mean_value_op.eval())
 
+  def test_eval_metric_ops_with_head_name_for_regression(self):
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        name='some_regression_head')
+    logits = np.array(((1,), (9,)), dtype=np.float32)
+    labels = np.array(((1,), (1,)), dtype=np.int64)
+    features = {'x': np.array(((42,),), dtype=np.int32)}
+    # Create estimator spec.
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.EVAL,
+        logits=logits,
+        labels=labels)
+
+    expected_metric_keys = [
+        '{}/some_regression_head'.format(metric_keys.MetricKeys.LOSS_MEAN),
+    ]
+    self.assertItemsEqual(expected_metric_keys, spec.eval_metric_ops.keys())
+
   def test_train_create_loss(self):
     head = head_lib._regression_head_with_mean_squared_error_loss()
     logits = np.array(((45,), (41,),), dtype=np.float32)
     labels = np.array(((43,), (44,),), dtype=np.int32)
     features = {'x': np.array(((42,),), dtype=np.float32)}
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(43-45)^2, (44-41)] = [4, 9]
-      self.assertAllClose(np.array(((4.,), (9.,),)), unweighted_loss.eval())
+      self.assertAllClose(13., weighted_sum_loss.eval())
 
   def test_train_labels_none(self):
     """Tests that error is raised when labels is None."""
@@ -2203,21 +2593,26 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
     head = head_lib._regression_head_with_mean_squared_error_loss(
         weight_column='label_weights')
     logits = np.array(((45,), (41,), (44,)), dtype=np.float32)
-    # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-    expected_unreduced_loss = np.array(((100.,), (1.,), (1.,),))
     x_feature_rank_1 = np.array((42., 43., 44.,), dtype=np.float32)
     weight_rank_1 = np.array((1., .1, 1.5,), dtype=np.float64)
     labels_rank_1 = np.array((35., 42., 45.,))
+    # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
+    # weighted sum loss = 100 * 1 + 1 * .1 + 1.5 * 1 = 101.6
+    expected_unreduced_loss = 101.6
+    # example weight sum = 1 + 0.1 + 1.5
+    expected_example_weight_sum = 2.6
     features = {'x': x_feature_rank_1, 'label_weights': weight_rank_1}
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
         labels=labels_rank_1)
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
-      self.assertAllClose(expected_unreduced_loss, unweighted_loss.eval())
+      self.assertAllClose(expected_unreduced_loss, weighted_sum_loss.eval())
+      self.assertAllClose(expected_example_weight_sum,
+                          example_weight_sum.eval())
 
   def test_with_one_dim_label_and_weight(self):
     """1d label, 3 examples, 1 batch."""
@@ -2288,15 +2683,16 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
         'label_weights': np.array(((1., .1, 1.5),))
     }
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.EVAL,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-      self.assertAllClose(np.array(((100., 1., 1.,),)), unweighted_loss.eval())
+      # weighted sum loss = 1 * 100 + .1 * 1 + 1.5 * 1 = 101.6
+      self.assertAllClose(101.6, weighted_sum_loss.eval())
 
   def test_weighted_multi_value_eval(self):
     """3d label, 1 example, 1 batch."""
@@ -2356,15 +2752,16 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
         'label_weights': np.array(((1., .1, 1.5),))
     }
     # Create loss.
-    unweighted_loss, _ = head.create_loss(
+    weighted_sum_loss = head.create_loss(
         features=features,
         mode=model_fn.ModeKeys.TRAIN,
         logits=logits,
-        labels=labels)
+        labels=labels)[0]
     with self.test_session():
       _initialize_variables(self, monitored_session.Scaffold())
       # loss = [(35-45)^2, (42-41)^2, (45-44)^2] = [100, 1, 1].
-      self.assertAllClose(np.array(((100., 1., 1.,),)), unweighted_loss.eval())
+      # weighted sum loss = 1 * 100 + .1 * 1 + 1.5 * 1 = 101.6
+      self.assertAllClose(101.6, weighted_sum_loss.eval())
 
   def test_weighted_multi_value_train(self):
     """3d label, 1 example, 1 batch."""
@@ -2536,6 +2933,125 @@ class RegressionHeadWithMeanSquaredErrorLossTest(test.TestCase):
       self.assertAllClose(expected_losses, [r[0] for r in results])
       self.assertAllClose(expected_losses * -7., [r[1] for r in results])
 
+  def test_multi_dim_weighted_train_create_loss(self):
+    """Logits, labels of shape [2, 2, 3], weight shape [2, 2]."""
+    label_dimension = 3
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_column='label_weights', label_dimension=label_dimension)
+    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
+                       [[20., 21., 22.], [30., 31., 32.]]])
+    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
+                       [[23., 24., 25.], [34., 35., 36.]]])
+    weights = np.array([[1., 1.5], [2., 2.5]])
+    expected_weighted_sum_loss = np.sum(
+        np.array([[[1. * x for x in [1., 1., 1.]],
+                   [1.5 * x for x in [4., 4., 4.]]],
+                  [[2. * x for x in [9., 9., 9.]],
+                   [2.5 * x for x in [16., 16., 16.]]]]))
+    # Weights are expanded to [2, 2, label_dimension].
+    expected_example_weight_sum = np.sum(weights) * label_dimension
+    # Create loss.
+    weighted_sum_loss, example_weight_sum, _ = head.create_loss(
+        features={'label_weights': weights},
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(expected_weighted_sum_loss, weighted_sum_loss.eval())
+      self.assertAllClose(
+          expected_example_weight_sum, example_weight_sum.eval())
+
+  def test_multi_dim_weighted_train(self):
+    """Logits, labels of shape [2, 2, 3], weight shape [2, 2]."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_column='label_weights', label_dimension=3)
+    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
+                       [[20., 21., 22.], [30., 31., 32.]]])
+    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
+                       [[23., 24., 25.], [34., 35., 36.]]])
+    expected_train_result = b'my_train_op'
+    features = {
+        'label_weights': np.array([[1., 1.5], [2., 2.5]]),
+    }
+    # loss = 1*3*1^2 + 1.5*3*2^2 + 2*3*3^2 +2.5*3*4^2 = 195
+    expected_loss = 195.
+    # Create estimator spec.
+    def _train_op_fn(loss):
+      with ops.control_dependencies((check_ops.assert_equal(
+          math_ops.to_float(expected_loss), math_ops.to_float(loss),
+          name='assert_loss'),)):
+        return constant_op.constant(expected_train_result)
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_train_op_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      self.assertAllClose(expected_loss, spec.loss.eval())
+
+  def test_multi_dim_train_weights_wrong_inner_dim(self):
+    """Logits, labels of shape [2, 2, 3], weight shape [2, 1]."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_column='label_weights', label_dimension=3)
+    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
+                       [[20., 21., 22.], [30., 31., 32.]]])
+    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
+                       [[23., 24., 25.], [34., 35., 36.]]])
+    features = {
+        'label_weights': np.array([[1.], [2]]),
+    }
+    def _no_op_train_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_no_op_train_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \] \[2 2 3\] \[weights_shape: \] \[2 1\]'):
+        spec.loss.eval()
+
+  def test_multi_dim_train_weights_wrong_outer_dim(self):
+    """Logits, labels of shape [2, 2, 3], weight shape [2, 2, 2]."""
+    head = head_lib._regression_head_with_mean_squared_error_loss(
+        weight_column='label_weights', label_dimension=3)
+    logits = np.array([[[00., 01., 02.], [10., 11., 12.]],
+                       [[20., 21., 22.], [30., 31., 32.]]])
+    labels = np.array([[[01., 02., 03.], [12., 13., 14.]],
+                       [[23., 24., 25.], [34., 35., 36.]]])
+    weights_placeholder = array_ops.placeholder(dtype=dtypes.float32)
+    features = {
+        'label_weights': weights_placeholder,
+    }
+    def _no_op_train_fn(loss):
+      del loss
+      return control_flow_ops.no_op()
+
+    spec = head.create_estimator_spec(
+        features=features,
+        mode=model_fn.ModeKeys.TRAIN,
+        logits=logits,
+        labels=labels,
+        train_op_fn=_no_op_train_fn)
+    with self.test_session():
+      _initialize_variables(self, monitored_session.Scaffold())
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r'\[logits_shape: \]\s\[2 2 3\]\s\[weights_shape: \]\s\[2 2 2\]'):
+        spec.loss.eval({
+            weights_placeholder: np.array([[[1., 1.1], [1.5, 1.6]],
+                                           [[2., 2.1], [2.5, 2.6]]])})
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 6243cfc118b6cc16cb6d6bfeb9ad5aab72a6d702..63103ef4c123fe5d7e6a3609aa0f8d1d01a8bf94 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -26,9 +26,11 @@ import tempfile
 import numpy as np
 import six
 
+from google.protobuf import message
 from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as tf_session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
@@ -415,7 +417,7 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       self._create_and_assert_global_step(g)
-      features = self._get_features_from_input_fn(
+      features, input_hooks = self._get_features_from_input_fn(
           input_fn, model_fn_lib.ModeKeys.PREDICT)
       estimator_spec = self._call_model_fn(
           features, None, model_fn_lib.ModeKeys.PREDICT, self.config)
@@ -425,7 +427,7 @@ class Estimator(object):
               checkpoint_filename_with_path=checkpoint_path,
               scaffold=estimator_spec.scaffold,
               config=self._session_config),
-          hooks=hooks) as mon_sess:
+          hooks=input_hooks + hooks) as mon_sess:
         while not mon_sess.should_stop():
           preds_evaluated = mon_sess.run(predictions)
           if not isinstance(predictions, dict):
@@ -460,8 +462,12 @@ class Estimator(object):
       assets_extra=None,
       as_text=False,
       checkpoint_path=None):
+    # pylint: disable=line-too-long
     """Exports inference graph as a SavedModel into given dir.
 
+    For a detailed guide, see
+    @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with Estimators}.
+
     This method builds a new graph by first calling the
     serving_input_receiver_fn to obtain feature `Tensor`s, and then calling
     this `Estimator`'s model_fn to generate the model graph based on those
@@ -505,6 +511,7 @@ class Estimator(object):
       ValueError: if no serving_input_receiver_fn is provided, no export_outputs
           are provided, or no checkpoint can be found.
     """
+    # pylint: enable=line-too-long
     if serving_input_receiver_fn is None:
       raise ValueError('serving_input_receiver_fn must be defined.')
 
@@ -536,7 +543,7 @@ class Estimator(object):
       temp_export_dir = get_temp_export_dir(export_dir)
 
       # TODO(soergel): Consider whether MonitoredSession makes sense here
-      with tf_session.Session() as session:
+      with tf_session.Session(config=self._session_config) as session:
 
         saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
             sharded=True)
@@ -576,6 +583,11 @@ class Estimator(object):
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    input_hooks = []
+    if isinstance(result, dataset_ops.Dataset):
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       # Unconditionally drop the label (the second element of result).
       result = result[0]
@@ -584,16 +596,22 @@ class Estimator(object):
       logging.warning('Input graph does not use tf.data.Dataset or contain a '
                       'QueueRunner. That means predict yields forever. '
                       'This is probably a mistake.')
-    return result
+    return result, input_hooks
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
+    """Extracts the `features` and labels from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    input_hooks = []
+    if isinstance(result, dataset_ops.Dataset):
+      iterator = result.make_initializable_iterator()
+      input_hooks.append(_DatasetInitializerHook(iterator))
+      result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
             'input_fn should return (feautures, labels) as a len 2 tuple.')
-      return result
-    return result, None
+      return result[0], result[1], input_hooks
+    return result, None, input_hooks
 
   def _extract_batch_length(self, preds_evaluated):
     """Extracts batch length of predictions."""
@@ -717,8 +735,10 @@ class Estimator(object):
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
       training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-      features, labels = self._get_features_and_labels_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.TRAIN)
+      features, labels, input_hooks = (
+          self._get_features_and_labels_from_input_fn(
+              input_fn, model_fn_lib.ModeKeys.TRAIN))
+      worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
       # Check if the user created a loss summary, and add one if they didn't.
@@ -816,8 +836,9 @@ class Estimator(object):
     with ops.Graph().as_default() as g:
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
-      features, labels = self._get_features_and_labels_from_input_fn(
-          input_fn, model_fn_lib.ModeKeys.EVAL)
+      features, labels, input_hooks = (
+          self._get_features_and_labels_from_input_fn(
+              input_fn, model_fn_lib.ModeKeys.EVAL))
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
 
@@ -838,7 +859,8 @@ class Estimator(object):
             'already defines a default metric with the same name.')
       eval_dict[ops.GraphKeys.GLOBAL_STEP] = global_step_tensor
 
-      all_hooks = list(hooks or [])
+      all_hooks = list(input_hooks)
+      all_hooks.extend(hooks)
       all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
 
       eval_results = evaluation._evaluate_once(  # pylint: disable=protected-access
@@ -998,20 +1020,27 @@ def _write_dict_to_summary(output_dir,
       continue
     if key == 'global_step':
       continue
-    value = summary_proto.value.add()
-    value.tag = key
     if (isinstance(dictionary[key], np.float32) or
         isinstance(dictionary[key], float)):
-      value.simple_value = float(dictionary[key])
+      summary_proto.value.add(tag=key, simple_value=float(dictionary[key]))
     elif (isinstance(dictionary[key], np.int64) or
           isinstance(dictionary[key], np.int32) or
           isinstance(dictionary[key], int)):
-      value.simple_value = int(dictionary[key])
+      summary_proto.value.add(tag=key, simple_value=int(dictionary[key]))
+    elif isinstance(dictionary[key], six.string_types):
+      try:
+        summ = summary_pb2.Summary.FromString(dictionary[key])
+        for i, _ in enumerate(summ.value):
+          summ.value[i].tag = key
+        summary_proto.value.extend(summ.value)
+      except message.DecodeError:
+        logging.warn('Skipping summary for %s, cannot parse string to Summary.',
+                     key)
+        continue
     else:
       logging.warn(
           'Skipping summary for %s, must be a float, np.float32, np.int64, '
-          'np.int32 or int.',
-          key)
+          'np.int32 or int or a serialized string of Summary.', key)
   summary_writer.add_summary(summary_proto, current_global_step)
   summary_writer.flush()
 
@@ -1026,3 +1055,16 @@ def _has_dataset_or_queue_runner(maybe_tensor):
 
   # Now, check queue.
   return ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS)
+
+
+class _DatasetInitializerHook(training.SessionRunHook):
+
+  def __init__(self, iterator):
+    self._iterator = iterator
+
+  def begin(self):
+    self._initializer = self._iterator.initializer
+
+  def after_create_session(self, session, coord):
+    del coord
+    session.run(self._initializer)
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index 5b82fd75ff3f99fdae102dbfa9de547a7c0f17ca..bed2b674192bd4054baa2ee5d30fc72c0e8d54ed 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -19,6 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.python.estimator.canned.baseline import BaselineClassifier
+from tensorflow.python.estimator.canned.baseline import BaselineRegressor
 from tensorflow.python.estimator.canned.dnn import DNNClassifier
 from tensorflow.python.estimator.canned.dnn import DNNRegressor
 from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
@@ -46,6 +48,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     # Canned Estimators
+    'BaselineClassifier',
+    'BaselineRegressor',
     'DNNClassifier',
     'DNNRegressor',
     'DNNLinearCombinedClassifier',
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 0040ec3650a073d6d1996491645da7970317ea12..db64fbc9ccc3a212e7dfa1ad4d82e3138e3a3d56 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -50,6 +50,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import metrics as metrics_lib
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
@@ -57,6 +58,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.summary import summary
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import basic_session_run_hooks
@@ -75,6 +77,23 @@ def dummy_model_fn(features, labels, params):
   _, _, _ = features, labels, params
 
 
+def check_eventfile_for_keyword(keyword, est):
+  """Checks event files for the keyword."""
+
+  writer_cache.FileWriterCache.clear()
+
+  # Get last Event written.
+  event_paths = glob.glob(os.path.join(est.model_dir, 'events*'))
+  last_event = None
+  for last_event in summary_iterator.summary_iterator(event_paths[-1]):
+    if last_event.summary is not None:
+      if last_event.summary.value:
+        if keyword in last_event.summary.value[0].tag:
+          return True
+
+  return False
+
+
 class EstimatorInheritanceConstraintTest(test.TestCase):
   """Tests that sub classes cannot override methods of Estimator."""
 
@@ -584,15 +603,9 @@ class EstimatorTrainTest(test.TestCase):
     # Make sure nothing is stuck in limbo.
     writer_cache.FileWriterCache.clear()
 
-    # Get last Event written.
-    event_paths = glob.glob(os.path.join(est.model_dir, 'events*'))
-    last_event = None
-    for last_event in summary_iterator.summary_iterator(event_paths[-1]):
-      if last_event.summary is not None:
-        if last_event.summary.value:
-          if 'loss' == last_event.summary.value[0].tag:
-            return
-    self.fail('loss should be part of reported summaries.')
+    if check_eventfile_for_keyword('loss', est):
+      return
+    self.fail('{} should be part of reported summaries.'.format('loss'))
 
   def test_latest_checkpoint(self):
     est = estimator.Estimator(model_fn=model_fn_global_step_incrementer)
@@ -900,6 +913,80 @@ class EstimatorGetVariablesTest(test.TestCase):
     self.assertEqual(3., est.get_variable_value('three'))
 
 
+class EstimatorDatasetIntegrationTest(test.TestCase):
+  """Tests dataset integration."""
+
+  def test_returned_by_input_fn(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors(([1.], [2.]))
+
+    def _model_fn(features, labels, mode):
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=features + labels,  # 1 + 2
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    scores = est.evaluate(_input_fn, steps=1)
+    self.assertEqual(3., scores[model_fn_lib.LOSS_METRIC_KEY])
+
+  def test_with_none_labels(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors([7.])
+
+    def _model_fn(features, labels, mode):
+      self.assertIsNone(labels)
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=features,  # 7
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    scores = est.evaluate(_input_fn, steps=1)
+    self.assertEqual(7., scores[model_fn_lib.LOSS_METRIC_KEY])
+
+  def test_with_predict(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensors([10.])
+
+    def _model_fn(features, labels, mode):
+      _ = labels
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=features,  # 10
+          loss=features,  # 10
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn, steps=1)
+    self.assertEqual([10.], next(est.predict(input_fn=_input_fn)))
+
+  def test_batching(self):
+
+    def _input_fn():
+      return dataset_ops.Dataset.from_tensor_slices(([[1.], [2.]],
+                                                     [[10.], [20.]])).batch(1)
+
+    def _model_fn(features, labels, mode):
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=features,
+          loss=features + (0 if labels is None else labels),  # 11, 22
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+
+    est = estimator.Estimator(model_fn=_model_fn)
+    est.train(_input_fn)
+    scores = est.evaluate(_input_fn)
+    # (11 + 22)/2 = 16.5
+    self.assertEqual(16.5, scores[model_fn_lib.LOSS_METRIC_KEY])
+    self.assertEqual([1., 2.], list(est.predict(_input_fn)))
+
+
 class EstimatorEvaluateTest(test.TestCase):
 
   def test_input_fn_args(self):
@@ -1139,6 +1226,39 @@ class EstimatorEvaluateTest(test.TestCase):
     est.evaluate(dummy_input_fn, steps=1)
     self.assertTrue(hook.begin.called)
 
+  def test_summary_writing_with_summary_proto(self):
+
+    def model_fn_global_step_incrementer_image(features, labels, mode):
+      _, _ = features, labels
+      global_step = training.get_global_step()
+
+      image = array_ops.zeros([1, 3, 3, 1])
+      eval_metric_ops = {
+          'image': (summary.image('image', image, max_outputs=1),
+                    constant_op.constant(1))
+      }
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant(1.),
+          train_op=state_ops.assign_add(global_step, 1),
+          eval_metric_ops=eval_metric_ops)
+
+    est = estimator.Estimator(model_fn=model_fn_global_step_incrementer_image,
+                              config=run_config.RunConfig(save_summary_steps=1))
+    est.train(dummy_input_fn, steps=200)
+    est.evaluate(
+        input_fn=dummy_input_fn,
+        steps=200,
+    )
+
+    # Make sure nothing is stuck in limbo.
+    writer_cache.FileWriterCache.clear()
+
+    # Get last Event written.
+    if check_eventfile_for_keyword('image', est):
+      return
+    self.fail('{} should be part of reported summaries.'.format('image'))
+
 
 class EstimatorPredictTest(test.TestCase):
 
@@ -1865,6 +1985,71 @@ class EstimatorExportTest(test.TestCase):
     est.train(dummy_input_fn, steps=1)
     est.export_savedmodel(tempfile.mkdtemp(), serving_input_receiver_fn)
 
+  def test_export_savedmodel_respects_soft_placement(self):
+    def model_fn_with_a_gpu_op_but_no_kernel(features, labels, mode):
+      _, _ = features, labels
+      table = saver_test_utils.CheckpointedOp(name='v2')
+
+      update_global_step = state_ops.assign_add(training.get_global_step(), 1)
+      with ops.control_dependencies([update_global_step]):
+        train_op = table.insert('k1', 30.0)
+
+      #  In this test, there are no GPUs available.  The goal is to verify that
+      #  export_savedmodel executes nevertheless.
+      with ops.device('/gpu:0'):
+        string_op = string_ops.as_string(update_global_step)
+
+      with ops.control_dependencies([string_op]):
+        prediction = table.lookup('k1', 0.0)
+
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          predictions=prediction,
+          loss=constant_op.constant(1.),
+          train_op=train_op,
+          export_outputs={
+              'test': export_output.PredictOutput({
+                  'prediction': prediction
+              })
+          })
+
+    tmpdir = tempfile.mkdtemp()
+    est = estimator.Estimator(
+        model_fn=model_fn_with_a_gpu_op_but_no_kernel)
+    est.train(input_fn=dummy_input_fn, steps=1)
+    feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64),
+                    'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)}
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir_base = os.path.join(
+        compat.as_bytes(tmpdir), compat.as_bytes('export'))
+
+    export_dir = est.export_savedmodel(
+        export_dir_base, serving_input_receiver_fn)
+
+    # At this point, if export_savedmodel executed with
+    # allow_soft_placement=True, then the GPU-assigned operation was silently
+    # placed on the CPU.  Otherwise, an exception would have been raised
+    # related to the fact that the requested GPU device isn't available.
+
+    # Expectations below assume that export_savedmodel has completed normally.
+    self.assertTrue(gfile.Exists(export_dir_base))
+    self.assertTrue(gfile.Exists(export_dir))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('saved_model.pb'))))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('variables'))))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('variables/variables.index'))))
+    self.assertTrue(gfile.Exists(os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes('variables/variables.data-00000-of-00001'))))
+
+    gfile.DeleteRecursively(tmpdir)
+
 
 class EstimatorHookOrderingTest(test.TestCase):
 
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 31e9933c6f702393eb21b10c5bdd770739056032..51075731ddc52a55799958c3bfa6140f77404541 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -57,7 +57,7 @@ class ServingInputReceiver(collections.namedtuple(
       groups of receiver tensors, each of which may be a `Tensor` or a dict of
       string to `Tensor`.  These named receiver tensor alternatives generate
       additional serving signatures, which may be used to feed inputs at
-      different points within the input reciever subgraph.  A typical usage is
+      different points within the input receiver subgraph.  A typical usage is
       to allow feeding raw feature `Tensor`s *downstream* of the
       tf.parse_example() op.  Defaults to None.
   """
@@ -191,7 +191,8 @@ def build_all_signature_defs(receiver_tensors,
   if not isinstance(receiver_tensors, dict):
     receiver_tensors = {_SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
   if export_outputs is None or not isinstance(export_outputs, dict):
-    raise ValueError('export_outputs must be a dict.')
+    raise ValueError('export_outputs must be a dict and not'
+                     '{}'.format(type(export_outputs)))
 
   signature_def_map = {}
   excluded_signatures = {}
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index 3cbef4707a536128e0cc6ca9a14dc2aea8a44707..8442bf04accbd0bc15f5958069bf3060debd42bc 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -358,7 +358,8 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError) as e:
       export.build_all_signature_defs(receiver_tensor, None)
 
-    self.assertEqual("export_outputs must be a dict.", str(e.exception))
+    self.assertTrue(str(e.exception).startswith(
+        "export_outputs must be a dict"))
 
   def test_get_timestamped_export_dir(self):
     export_dir_base = tempfile.mkdtemp() + "export/"
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index c9f37f06e834e0d8be756097130d4cd5136ba9cf..750af20e8a1e27c0f9c4fcf3ebf586c41bc9c66c 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+from six import string_types
 from tensorflow.python.estimator.inputs.queues import feeding_functions
 
 # Key name to pack the target into dict of `features`. See
@@ -51,8 +52,9 @@ def numpy_input_fn(x,
                    num_threads=1):
   """Returns input function that would feed dict of numpy arrays into the model.
 
-  This returns a function outputting `features` and `target` based on the dict
-  of numpy arrays. The dict `features` has the same keys as the `x`.
+  This returns a function outputting `features` and `targets` based on the dict
+  of numpy arrays. The dict `features` has the same keys as the `x`. The dict
+  `targets` has the same keys as the `y` if `y` is a dict.
 
   Example:
 
@@ -69,7 +71,7 @@ def numpy_input_fn(x,
 
   Args:
     x: dict of numpy array object.
-    y: numpy array object. `None` if absent.
+    y: numpy array object or dict of numpy array object. `None` if absent.
     batch_size: Integer, size of batches to return.
     num_epochs: Integer, number of epochs to iterate over data. If `None` will
       run forever.
@@ -81,11 +83,13 @@ def numpy_input_fn(x,
       such as in prediction and evaluation mode, `num_threads` should be 1.
 
   Returns:
-    Function, that has signature of ()->(dict of `features`, `target`)
+    Function, that has signature of ()->(dict of `features`, `targets`)
 
   Raises:
     ValueError: if the shape of `y` mismatches the shape of values in `x` (i.e.,
       values in `x` have same shape).
+    ValueError: if duplicate keys are in both `x` and `y` when `y` is a dict.
+    ValueError: if x or y is an empty dict.
     TypeError: `x` is not a dict or `shuffle` is not bool.
   """
 
@@ -97,43 +101,75 @@ def numpy_input_fn(x,
     """Numpy input function."""
     if not isinstance(x, dict):
       raise TypeError('x must be dict; got {}'.format(type(x).__name__))
+    if not x:
+      raise ValueError('x cannot be empty')
 
     # Make a shadow copy and also ensure the order of iteration is consistent.
-    ordered_dict_x = collections.OrderedDict(
+    ordered_dict_data = collections.OrderedDict(
         sorted(x.items(), key=lambda t: t[0]))
+    # Deep copy keys which is a view in python 3
+    feature_keys = list(ordered_dict_data.keys())
+
+    if y is None:
+      target_keys = None
+    elif isinstance(y, dict):
+      if not y:
+        raise ValueError('y cannot be empty dict, use None instead.')
+
+      ordered_dict_y = collections.OrderedDict(
+          sorted(y.items(), key=lambda t: t[0]))
+      target_keys = list(ordered_dict_y.keys())
+
+      duplicate_keys = set(feature_keys).intersection(set(target_keys))
+      if duplicate_keys:
+        raise ValueError('{} duplicate keys are found in both x and y: '
+                         '{}'.format(len(duplicate_keys), duplicate_keys))
+
+      ordered_dict_data.update(ordered_dict_y)
+    else:
+      target_keys = _get_unique_target_key(ordered_dict_data)
+      ordered_dict_data[target_keys] = y
+
+    if len(set(v.shape[0] for v in ordered_dict_data.values())) != 1:
+      shape_dict_of_x = {k: ordered_dict_data[k].shape for k in feature_keys}
+
+      if target_keys is None:
+        shape_of_y = None
+      elif isinstance(target_keys, string_types):
+        shape_of_y = y.shape
+      else:
+        shape_of_y = {k: ordered_dict_data[k].shape for k in target_keys}
 
-    unique_target_key = _get_unique_target_key(ordered_dict_x)
-    if y is not None:
-      ordered_dict_x[unique_target_key] = y
-
-    if len(set(v.shape[0] for v in ordered_dict_x.values())) != 1:
-      shape_dict_of_x = {k: ordered_dict_x[k].shape
-                         for k in ordered_dict_x.keys()}
-      shape_of_y = None if y is None else y.shape
       raise ValueError('Length of tensors in x and y is mismatched. All '
                        'elements in x and y must have the same length.\n'
                        'Shapes in x: {}\n'
-                       'Shape for y: {}\n'.format(shape_dict_of_x, shape_of_y))
+                       'Shapes in y: {}\n'.format(shape_dict_of_x, shape_of_y))
 
     queue = feeding_functions._enqueue_data(  # pylint: disable=protected-access
-        ordered_dict_x,
+        ordered_dict_data,
         queue_capacity,
         shuffle=shuffle,
         num_threads=num_threads,
         enqueue_size=batch_size,
         num_epochs=num_epochs)
 
-    features = (queue.dequeue_many(batch_size) if num_epochs is None
-                else queue.dequeue_up_to(batch_size))
+    batch = (
+        queue.dequeue_many(batch_size)
+        if num_epochs is None else queue.dequeue_up_to(batch_size))
 
-    # Remove the first `Tensor` in `features`, which is the row number.
-    if len(features) > 0:
-      features.pop(0)
+    # Remove the first `Tensor` in `batch`, which is the row number.
+    if batch:
+      batch.pop(0)
 
-    features = dict(zip(ordered_dict_x.keys(), features))
-    if y is not None:
-      target = features.pop(unique_target_key)
+    features = dict(zip(feature_keys, batch[:len(feature_keys)]))
+    if target_keys is None:
+      # TODO(martinwicke), return consistent result
+      return features
+    elif isinstance(target_keys, string_types):
+      target = batch[-1]
+      return features, target
+    else:
+      target = dict(zip(target_keys, batch[-len(target_keys):]))
       return features, target
-    return features
 
   return input_fn
diff --git a/tensorflow/python/estimator/inputs/numpy_io_test.py b/tensorflow/python/estimator/inputs/numpy_io_test.py
index 02df22b6323c65243893e1e8b0c5e8c3d74e1a62..1374e3f7e12e76683f14737747b490c9a5e319eb 100644
--- a/tensorflow/python/estimator/inputs/numpy_io_test.py
+++ b/tensorflow/python/estimator/inputs/numpy_io_test.py
@@ -239,6 +239,40 @@ class NumpyIoTest(test.TestCase):
             x, y, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
 
+  def testNumpyInputFnWithXIsEmptyDict(self):
+    x = {}
+    y = np.arange(4)
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, 'x cannot be empty'):
+        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+        failing_input_fn()
+
+  def testNumpyInputFnWithYIsNone(self):
+    a = np.arange(4) * 1.0
+    b = np.arange(32, 36)
+    x = {'a': a, 'b': b}
+    y = None
+
+    with self.test_session() as session:
+      input_fn = numpy_io.numpy_input_fn(
+          x, y, batch_size=2, shuffle=False, num_epochs=1)
+      features_tensor = input_fn()
+
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+      feature = session.run(features_tensor)
+      self.assertEqual(len(feature), 2)
+      self.assertAllEqual(feature['a'], [0, 1])
+      self.assertAllEqual(feature['b'], [32, 33])
+
+      session.run([features_tensor])
+      with self.assertRaises(errors.OutOfRangeError):
+        session.run([features_tensor])
+
+      coord.request_stop()
+      coord.join(threads)
+
   def testNumpyInputFnWithNonBoolShuffle(self):
     x = np.arange(32, 36)
     y = np.arange(4)
@@ -285,6 +319,56 @@ class NumpyIoTest(test.TestCase):
             num_epochs=1)
         failing_input_fn()
 
+  def testNumpyInputFnWithYAsDict(self):
+    a = np.arange(4) * 1.0
+    b = np.arange(32, 36)
+    x = {'a': a, 'b': b}
+    y = {'y1': np.arange(-32, -28), 'y2': np.arange(32, 28, -1)}
+
+    with self.test_session() as session:
+      input_fn = numpy_io.numpy_input_fn(
+          x, y, batch_size=2, shuffle=False, num_epochs=1)
+      features_tensor, targets_tensor = input_fn()
+
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(session, coord=coord)
+
+      features, targets = session.run([features_tensor, targets_tensor])
+      self.assertEqual(len(features), 2)
+      self.assertAllEqual(features['a'], [0, 1])
+      self.assertAllEqual(features['b'], [32, 33])
+      self.assertEqual(len(targets), 2)
+      self.assertAllEqual(targets['y1'], [-32, -31])
+      self.assertAllEqual(targets['y2'], [32, 31])
+
+      session.run([features_tensor, targets_tensor])
+      with self.assertRaises(errors.OutOfRangeError):
+        session.run([features_tensor, targets_tensor])
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def testNumpyInputFnWithYIsEmptyDict(self):
+    a = np.arange(4) * 1.0
+    b = np.arange(32, 36)
+    x = {'a': a, 'b': b}
+    y = {}
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, 'y cannot be empty'):
+        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+        failing_input_fn()
+
+  def testNumpyInputFnWithDuplicateKeysInXAndY(self):
+    a = np.arange(4) * 1.0
+    b = np.arange(32, 36)
+    x = {'a': a, 'b': b}
+    y = {'y1': np.arange(-32, -28), 'a': a, 'y2': np.arange(32, 28, -1), 'b': b}
+    with self.test_session():
+      with self.assertRaisesRegexp(
+          ValueError, '2 duplicate keys are found in both x and y'):
+        failing_input_fn = numpy_io.numpy_input_fn(x, y, shuffle=False)
+        failing_input_fn()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/estimator/inputs/queues/feeding_functions.py b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
index c0a287e922223e8999c45da16d291c95842718f9..75c0e61d47b37110b14aa57f6a185cab822a70bb 100644
--- a/tensorflow/python/estimator/inputs/queues/feeding_functions.py
+++ b/tensorflow/python/estimator/inputs/queues/feeding_functions.py
@@ -47,13 +47,13 @@ except ImportError:
 
 
 def _fill_array(arr, seq, fillvalue=0):
-  """ 
-  Recursively fills padded arr with elements from seq. 
+  """
+  Recursively fills padded arr with elements from seq.
   If length of seq is less than arr padded length, fillvalue used.
 
   Args:
     arr: Padded tensor of shape [batch_size, ..., max_padded_dim_len].
-    seq: Non-padded list of data sampels of shape 
+    seq: Non-padded list of data sampels of shape
       [batch_size, ..., padded_dim(None)]
     fillvalue: Default fillvalue to use.
   """
@@ -73,12 +73,12 @@ def _pad_if_needed(batch_key_item, fillvalue=0):
   """ Returns padded batch.
 
   Args:
-    batch_key_item: List of data samples of any type with shape 
+    batch_key_item: List of data samples of any type with shape
       [batch_size, ..., padded_dim(None)].
     fillvalue: Default fillvalue to use.
 
   Returns:
-    Padded with zeros tensor of same type and shape 
+    Padded with zeros tensor of same type and shape
       [batch_size, ..., max_padded_dim_len].
 
   Raises:
@@ -375,7 +375,7 @@ def _enqueue_data(data,
       arrays, a numpy `ndarray`, or a generator producing these.
     NotImplementedError: padding and shuffling data at the same time.
     NotImplementedError: padding usage with non generator data type.
-  """ 
+  """
   with ops.name_scope(name):
     if isinstance(data, np.ndarray):
       types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index d71964d2ec8e8ce21934428c3fff88f65b2751da..3893f48caef1b69ccef3f13f35577a4de3c8af1d 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -80,6 +80,13 @@ def _get_master(cluster_spec, task_type, task_id):
         '%s\n\n'
         'Note that these values may be coming from the TF_CONFIG environment '
         'variable.' % (task_id, task_type, cluster_spec))
+
+  # If there is only one node in the cluster, do things locally by setting
+  # master to ''.  If a service or user sets TF_CONFIG with a single node, it's
+  # more performant to use a direct master rather than an RPC service.
+  if len(jobs) == 1 and len(cluster_spec.job_tasks(jobs[0])) == 1:
+    return _LOCAL_MASTER
+
   return _GRPC_SCHEME + addresses[task_id]
 
 
diff --git a/tensorflow/python/estimator/run_config_test.py b/tensorflow/python/estimator/run_config_test.py
index ecc850d5405837e8bf803b9a7c8c156ff19b7a90..6a62c061ff83057525424c36364bc7baea7e1d97 100644
--- a/tensorflow/python/estimator/run_config_test.py
+++ b/tensorflow/python/estimator/run_config_test.py
@@ -344,7 +344,7 @@ class RunConfigDistributedSettingTest(test.TestCase):
         expected_cluster_spec=tf_config['cluster'],
         expected_task_type=run_config_lib.TaskType.CHIEF,
         expected_task_id=0,
-        expected_master='grpc://host0:0',
+        expected_master='',
         expected_evaluation_master='',
         expected_is_chief=True,
         expected_num_worker_replicas=1,
@@ -572,7 +572,7 @@ class RunConfigDistributedSettingWithMasterTest(test.TestCase):
         expected_cluster_spec=tf_config['cluster'],
         expected_task_type=run_config_lib.TaskType.MASTER,
         expected_task_id=0,
-        expected_master='grpc://host0:0',
+        expected_master='',
         expected_evaluation_master='',
         expected_is_chief=True,
         expected_num_worker_replicas=1,
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 1131995b3ef1a832c3312d27a46d8395d62cecc7..58fccc3a29ec4339545f36eea0cabcd6852b23e4 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -43,6 +43,8 @@ _DELAY_SECS_PER_WORKER = 5
 _TF_CONFIG_ENV = 'TF_CONFIG'
 _ENVIRONMENT_KEY = 'environment'
 _ENVIRONMENT_GOOGLE_VALUE = 'google'
+_TRAINER_JOBS = (run_config_lib.TaskType.CHIEF, run_config_lib.TaskType.MASTER,
+                 run_config_lib.TaskType.WORKER)
 
 
 def _validate_input_fn(input_fn):
@@ -624,11 +626,28 @@ class _TrainingExecutor(object):
 
   def _start_std_server(self, config):
     """Creates, starts, and returns a server_lib.Server."""
-    if (not config.cluster_spec or not config.task_type or not config.master or
+    if (not config.cluster_spec or not config.task_type or
         config.task_id is None):
       raise RuntimeError('Could not start server; be sure to specify '
-                         'cluster_spec, task_type, master, and task in '
+                         'cluster_spec, task_type, and task in '
                          'RunConfig or set the TF_CONFIG environment variable.')
+
+    if not config.master:
+      jobs = config.cluster_spec.jobs
+      if (len(jobs) == 1 and len(config.cluster_spec.job_tasks(jobs[0])) == 1
+          and config.task_type in _TRAINER_JOBS):
+        # For distributed training, config.master is empty if and only if it has
+        # a single node in the cluster spec. In this case, we should not start
+        # the server.
+        logging.info('Skip starting Tensorflow server as there is only one '
+                     'node in the cluster.')
+        return
+      else:
+        raise RuntimeError(
+            'Could not start server; be sure to specify master in '
+            'RunConfig or set the TF_CONFIG environment variable.')
+
+    logging.info('Start Tensorflow server.')
     server = server_lib.Server(
         config.cluster_spec,
         job_name=config.task_type,
diff --git a/tensorflow/python/estimator/training_test.py b/tensorflow/python/estimator/training_test.py
index 1862e325e2b65ae2141132c4b900673c755e179e..285671f99ff1dea4bde391654bc69e146325558e 100644
--- a/tensorflow/python/estimator/training_test.py
+++ b/tensorflow/python/estimator/training_test.py
@@ -480,7 +480,7 @@ class TrainAndEvaluteTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.Mock()
-    mock_est.config.cluster_spec = {'1': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'1': ['dummy']})
     mock_est.config.task_type = ''
 
     with self.assertRaisesRegexp(ValueError, _INVALID_TASK_TYPE):
@@ -598,7 +598,8 @@ class _TrainingExecutorTrainingTest(object):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'worker': ['dummy', 'dummy1']})
     mock_est.config.master = ''
     mock_est.config.task_type = 'worker'
     mock_est.config.task_id = 2
@@ -608,13 +609,33 @@ class _TrainingExecutorTrainingTest(object):
       self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
                                                 mock_eval_spec))
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_single_worker_node_with_empty_tf_master(
+      self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
+
+    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
+    # Single node cluster.
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
+    mock_est.config.master = ''
+    mock_est.config.task_type = 'worker'
+    mock_est.config.task_id = 2
+
+    self._run_task(training._TrainingExecutor(mock_est, mock_train_spec,
+                                              mock_eval_spec))
+    self.assertTrue(mock_est.train.called)
+    mock_server.assert_not_called()
+
   def test_fail_with_empty_task_type(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -630,7 +651,7 @@ class _TrainingExecutorTrainingTest(object):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'worker': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = 'worker'
     mock_est.config.task_id = None
@@ -768,7 +789,7 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
     mock_est.config.cluster_spec = None
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
+    mock_est.config.task_type = 'master'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -782,23 +803,48 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'master': ['dummy'], 'worker': ['dummy1']})
     mock_est.config.master = ''
-    mock_est.config.task_type = 'worker'
-    mock_est.config.task_id = 2
+    mock_est.config.task_type = 'master'
+    mock_est.config.task_id = 0
 
     with self.assertRaisesRegexp(RuntimeError,
                                  _INVALID_CONFIG_FOR_STD_SERVER_MSG):
       training._TrainingExecutor(
           mock_est, mock_train_spec, mock_eval_spec).run_master()
 
+  @test.mock.patch.object(time, 'sleep')
+  @test.mock.patch.object(server_lib, 'Server')
+  def test_single_master_node_with_empty_tf_master(
+      self, mock_server, unused_mock_sleep):
+    mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
+    mock_est.evaluate = lambda *args, **kw: {ops.GraphKeys.GLOBAL_STEP: 123}
+
+    mock_train_spec = test.mock.Mock(spec=training.TrainSpec, max_steps=123)
+    mock_eval_spec = test.mock.Mock(spec=training.EvalSpec, exporters=[])
+
+    mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
+    mock_est.config.cluster_spec = server_lib.ClusterSpec(
+        {'master': ['dummy']})
+    mock_est.config.master = ''
+    mock_est.config.task_type = 'master'
+    mock_est.config.task_id = 0
+
+    executor = training._TrainingExecutor(
+        mock_est, mock_train_spec, mock_eval_spec)
+    executor.run_master()
+
+    mock_server.assert_not_called()
+    self.assertTrue(mock_est.train.called)
+
   def test_fail_with_empty_task_type(self):
     mock_est = test.mock.Mock(spec=estimator_lib.Estimator)
     mock_train_spec = test.mock.Mock(spec=training.TrainSpec)
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -814,9 +860,9 @@ class TrainingExecutorRunMasterTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'worker': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'master': ['dummy']})
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'worker'
+    mock_est.config.task_type = 'master'
     mock_est.config.task_id = None
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1016,7 +1062,7 @@ class TrainingExecutorRunEvaluatorTest(test.TestCase):
                is_the_final_export):
       del export_path, checkpoint_path, eval_result
       estimator.times_export_was_called += 1
-      # final_export is happend at the end.
+      # final_export is happened at the end.
       self.assertEqual(0, estimator.times_final_export_was_called)
       if is_the_final_export:
         estimator.times_final_export_was_called += 1
@@ -1246,7 +1292,7 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
     mock_est.config.cluster_spec = None
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1260,9 +1306,9 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = ''
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = 2
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1276,7 +1322,7 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = 'grpc://...'
     mock_est.config.task_type = ''
     mock_est.config.task_id = 2
@@ -1292,9 +1338,9 @@ class TrainingExecutorRunPsTest(test.TestCase):
     mock_eval_spec = test.mock.Mock(spec=training.EvalSpec)
 
     mock_est.config = test.mock.PropertyMock(spec=run_config_lib.RunConfig)
-    mock_est.config.cluster_spec = {'gs': 'dummy'}
+    mock_est.config.cluster_spec = server_lib.ClusterSpec({'ps': ['dummy']})
     mock_est.config.master = 'grpc://...'
-    mock_est.config.task_type = 'gs'
+    mock_est.config.task_type = 'ps'
     mock_est.config.task_id = None
 
     with self.assertRaisesRegexp(RuntimeError,
@@ -1361,7 +1407,7 @@ class TrainingExecutorRunLocalTest(test.TestCase):
                is_the_final_export):
       del export_path, checkpoint_path, eval_result
       estimator.times_export_was_called += 1
-      # final_export is happend at the end.
+      # final_export is happened at the end.
       self.assertEqual(0, estimator.times_final_export_was_called)
       if is_the_final_export:
         estimator.times_final_export_was_called += 1
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index 12f2592d848c3ce55777ffdae5cee7ac602ee87f..b31486dfa1122c2549ba3e9f6a730fd26444450a 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -52,7 +52,7 @@ def fn_args(fn):
   else:
     if _is_callable_object(fn):
       fn = fn.__call__
-    args = tf_inspect.getargspec(fn).args
+    args = tf_inspect.getfullargspec(fn).args
     if _is_bounded_method(fn):
       args.remove('self')
   return tuple(args)
diff --git a/tensorflow/python/estimator/warm_starting_util.py b/tensorflow/python/estimator/warm_starting_util.py
index 1ee77d6bbf3c02a771f4f1de444783e376b22480..e5655db08201601030c4473e3194e89ef89f5a68 100644
--- a/tensorflow/python/estimator/warm_starting_util.py
+++ b/tensorflow/python/estimator/warm_starting_util.py
@@ -23,7 +23,6 @@ import six
 
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -47,10 +46,13 @@ class _WarmStartSettings(
     ckpt_to_initialize_from: [Required] A string specifying the directory with
       checkpoint file(s) or path to checkpoint from which to warm-start the
       model parameters.
-    col_to_prev_vocab: [Optional] Dict of `FeatureColumn` to path of the
-      vocabulary used for the `FeatureColumn` in `ckpt_to_initialize_from`. If
-      not explicitly provided, the vocabularies are assumed to be same between
-      previous and present checkpoints.
+    col_to_prev_vocab: [Optional] Dict of `FeatureColumn` to vocabularies used
+      for the `FeatureColumn` in `ckpt_to_initialize_from`.  Vocabularies can
+      be represented either by a string (path to vocabulary), or tuple of
+      (string, int), representing (path of the vocabulary, vocab_size) if only
+      `vocab_size` entries of the old vocabulary were used in the checkpoint. If
+      the dict is not explicitly provided, the vocabularies are assumed to be
+      same between previous and present checkpoints.
     col_to_prev_tensor: [Optional] Dict of `FeatureColumn` to name of the
       variable (corresponding to the `FeatureColumn`) in
       `ckpt_to_initialize_from`. If not explicitly provided, the name of the
@@ -76,6 +78,13 @@ class _WarmStartSettings(
   ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp",
                           col_to_prev_vocab={sc_vocab_file: "old_vocab.txt"})
 
+  # Warm-start all weights but the parameters corresponding to "sc_vocab_file"
+  # have a different vocab from the one used in current checkpoint, and only
+  # 100 of those entries were used.
+  ws = _WarmStartSettings(ckpt_to_initialize_from="/tmp",
+                          col_to_prev_vocab={sc_vocab_file:
+                                             ("old_vocab.txt", 100)})
+
   # Warm-start all weights but the parameters corresponding to "sc_vocab_file"
   # have a different vocab from the one used in current checkpoint and the
   # parameters corresponding to "sc_vocab_list" have a different name from the
@@ -125,7 +134,7 @@ def _infer_var_name(var):
     Name of the `var`
   """
   name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(var)
-  if len(name_to_var_dict.keys()) > 1:
+  if len(name_to_var_dict) > 1:
     raise TypeError("`var` passed as arg violates the constraints.")
   return list(name_to_var_dict.keys())[0]
 
@@ -138,26 +147,69 @@ def _warmstart_var(var, prev_ckpt, prev_tensor_name=None):
       Can be either of the following:
       (i) `Variable`
       (ii) `ResourceVariable`
-      (iii) list of `Variable`: The list must contain slices of the same larger
-        variable.
-      (iv) `PartitionedVariable`
+      (iii) `PartitionedVariable`
+      (iv) list of `Variable` and/or `PartitionedVariable`: The list may
+        contain one or more variables that has been sharded.  For example:
+        [Variable('a/part_0'), Variable('b/part_0'), Variable('a/part_1'),
+         PartitionedVariable([Variable('c/part_0'), Variable('c/part_1')])]
+        where we have three whole Variables represented ('a', 'b', and 'c').
     prev_ckpt: A string specifying the directory with checkpoint file(s) or path
       to checkpoint. The given checkpoint must have tensor with name
       `prev_tensor_name` (if not None) or tensor with name same as given `var`.
     prev_tensor_name: Name of the tensor to lookup in provided `prev_ckpt`. If
       None, we lookup tensor with same name as given `var`.
+
+  Raises:
+    ValueError: If prev_tensor_name is not None, but the given var represents
+      more than one Variable.
+    TypeError: If var is not one of the allowed types.
   """
   if _is_variable(var):
     current_var_name = _infer_var_name([var])
-  elif isinstance(var, list) and all(_is_variable(v) for v in var):
-    current_var_name = _infer_var_name(var)
   elif isinstance(var, variables.PartitionedVariable):
     current_var_name = _infer_var_name([var])
     var = var._get_variable_list()  # pylint: disable=protected-access
+  elif (isinstance(var, list) and all(
+      _is_variable(v) or isinstance(v, variables.PartitionedVariable)
+      for v in var)):
+    # Convert length-1 lists of vars to single tf.Variables.  This ensures that
+    # checkpoint_utils.init_from_checkpoint() doesn't incorrectly assume
+    # slice info is present.
+    if len(var) == 1:
+      current_var_name = _infer_var_name(var)
+      var = var[0]
+    else:
+      # If we have multiple elements in var, we cannot assume they all
+      # represent the same Variable.
+      name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(
+          var, convert_variable_to_tensor=False)
+      if prev_tensor_name:
+        # Providing a prev_tensor_name is only viable if var representes a
+        # single Variable.
+        if len(name_to_var_dict) > 1:
+          raise ValueError("var represented more than one Variable, but "
+                           "prev_tensor_name was provided.")
+        checkpoint_utils.init_from_checkpoint(prev_ckpt, {
+            prev_tensor_name: var
+        })
+      else:
+        # OpListToDict gives us roughly what we need, but
+        # the values in the dict may be PartitionedVariables (which
+        # init_from_checkpoint does not expect) that we need to convert to
+        # lists.
+        name_to_var_dict_fixed = {}
+        for name, var in six.iteritems(name_to_var_dict):
+          if isinstance(var, variables.PartitionedVariable):
+            name_to_var_dict_fixed[name] = var._get_variable_list()  # pylint: disable=protected-access
+          else:
+            name_to_var_dict_fixed[name] = var
+        checkpoint_utils.init_from_checkpoint(prev_ckpt, name_to_var_dict_fixed)
+      return
   else:
     raise TypeError(
-        "var MUST be one of the following: a Variable, list of Variable or "
-        "PartitionedVariable, but is {}".format(type(var)))
+        "var MUST be one of the following: a Variable, PartitionedVariable, or "
+        "list of Variable's and/or PartitionedVariable's, but is {}".format(
+            type(var)))
   if not prev_tensor_name:
     # Assume tensor name remains the same.
     prev_tensor_name = current_var_name
@@ -172,8 +224,10 @@ def _warmstart_var_with_vocab(var,
                               current_vocab_size,
                               prev_ckpt,
                               prev_vocab_path,
+                              previous_vocab_size=-1,
                               current_oov_buckets=0,
-                              prev_tensor_name=None):
+                              prev_tensor_name=None,
+                              initializer=None):
   """Warm-starts given variable from `prev_tensor_name` tensor in `prev_ckpt`.
 
   Use this method when the `var` is backed by vocabulary. This method stitches
@@ -196,10 +250,14 @@ def _warmstart_var_with_vocab(var,
       to checkpoint. The given checkpoint must have tensor with name
       `prev_tensor_name` (if not None) or tensor with name same as given `var`.
     prev_vocab_path: Path to the vocab file used for the tensor in `prev_ckpt`.
+    previous_vocab_size: If provided, will constrain previous vocab to the first
+      `previous_vocab_size` entries.  -1 means use the entire previous vocab.
     current_oov_buckets: An `int` specifying the number of out-of-vocabulary
       buckets used for given `var`.
     prev_tensor_name: Name of the tensor to lookup in provided `prev_ckpt`. If
       None, we lookup tensor with same name as given `var`.
+    initializer: Variable initializer to be used for missing entries.  If None,
+      missing entries will be zero-initialized.
 
   Raises:
     ValueError: If required args are not provided.
@@ -232,18 +290,6 @@ def _warmstart_var_with_vocab(var,
           full_shape=slice_info.full_shape,
           var_offset=slice_info.var_offset)
 
-    # TODO(vihanjain): This is brittle. Can we instead infer actual initializer
-    # used originally for the variable or use a fixed initializer?
-    def _missing_ids_init(shape, dtype=None):
-      # pylint: disable=cell-var-from-loop
-      if dtype and dtype.base_dtype != v.dtype.base_dtype:
-        raise ValueError("Trying to initialize missing ids with a different "
-                         "dtype `{}` than variable's dtype `{}`".format(
-                             dtype, v.dtype))
-      return array_ops.slice(v.initial_value, [0, 0], shape)
-
-      # pylint: enable=cell-var-from-loop
-
     # TODO(vihanjain): Support _WarmstartSettings where class vocabularies need
     # remapping too.
     init = checkpoint_ops._load_and_remap_matrix_initializer(
@@ -251,13 +297,14 @@ def _warmstart_var_with_vocab(var,
         old_tensor_name=prev_tensor_name,
         new_row_vocab_size=current_vocab_size,
         new_col_vocab_size=v_shape[1],
+        old_row_vocab_size=previous_vocab_size,
         old_row_vocab_file=prev_vocab_path,
         new_row_vocab_file=current_vocab_path,
         old_col_vocab_file=None,
         new_col_vocab_file=None,
         num_row_oov_buckets=current_oov_buckets,
         num_col_oov_buckets=0,
-        initializer=_missing_ids_init)
+        initializer=initializer)
     new_init_val = ops.convert_to_tensor(
         init(shape=v_shape, partition_info=partition_info))
     v._initializer_op = state_ops.assign(v, new_init_val)
@@ -305,6 +352,11 @@ def _warmstart_input_layer(cols_to_vars, warmstart_settings):
     ```
 
     The above example effectively warm-starts full linear model.
+
+  Raises:
+    ValueError: If a column in cols_to_vars has an entry in
+      warmstart_settings.cols_to_prev_vocab, but is not an instance of
+      _VocabularyFileCategoricalColumn or _EmbeddingColumn.
   """
   for col, var in six.iteritems(cols_to_vars):
     if not isinstance(col, feature_column._FeatureColumn):  # pylint: disable=protected-access
@@ -316,21 +368,56 @@ def _warmstart_input_layer(cols_to_vars, warmstart_settings):
       continue
 
     prev_tensor_name = warmstart_settings.col_to_prev_tensor.get(col)
-    if isinstance(col, feature_column._VocabularyFileCategoricalColumn):  # pylint: disable=protected-access
-      prev_vocab_path = warmstart_settings.col_to_prev_vocab.get(
-          col, col.vocabulary_file)
-      logging.info("Warm-starting column: {}; prev_vocab: {}; prev_tensor: {}".
-                   format(col.name, prev_vocab_path, (
-                       prev_tensor_name or "Unchanged")))
+    # pylint: disable=protected-access
+    is_sparse_vocab_column = isinstance(
+        col, feature_column._VocabularyFileCategoricalColumn)
+    is_embedding_vocab_column = (
+        isinstance(col, feature_column._EmbeddingColumn) and
+        isinstance(col.categorical_column,
+                   feature_column._VocabularyFileCategoricalColumn))
+    if is_sparse_vocab_column or is_embedding_vocab_column:
+      # pylint: enable=protected-access
+      initializer = None
+      if is_embedding_vocab_column:
+        initializer = col.initializer
+        vocabulary_file = col.categorical_column.vocabulary_file
+        vocabulary_size = col.categorical_column.vocabulary_size
+        num_oov_buckets = col.categorical_column.num_oov_buckets
+      else:
+        vocabulary_file = col.vocabulary_file
+        vocabulary_size = col.vocabulary_size
+        num_oov_buckets = col.num_oov_buckets
+      prev_vocab = warmstart_settings.col_to_prev_vocab.get(
+          col, vocabulary_file)
+      if isinstance(prev_vocab, str):
+        prev_vocab_path = prev_vocab
+        previous_vocab_size = -1
+        logging.info(
+            "Warm-starting column: {}; prev_vocab: {}; "
+            "prev_tensor: {}".format(col.name, prev_vocab_path,
+                                     (prev_tensor_name or "Unchanged")))
+      elif isinstance(prev_vocab, tuple):
+        prev_vocab_path = prev_vocab[0]
+        previous_vocab_size = prev_vocab[1]
+        logging.info("Warm-starting column: {}; prev_vocab: {} (first {} "
+                     "entries); prev_tensor: {}".format(
+                         col.name, prev_vocab_path, previous_vocab_size,
+                         (prev_tensor_name or "Unchanged")))
+
       _warmstart_var_with_vocab(
           var,
-          current_vocab_path=col.vocabulary_file,
-          current_vocab_size=col.vocabulary_size,
+          current_vocab_path=vocabulary_file,
+          current_vocab_size=vocabulary_size,
           prev_ckpt=warmstart_settings.ckpt_to_initialize_from,
           prev_vocab_path=prev_vocab_path,
-          current_oov_buckets=col.num_oov_buckets,
-          prev_tensor_name=prev_tensor_name)
+          previous_vocab_size=previous_vocab_size,
+          current_oov_buckets=num_oov_buckets,
+          prev_tensor_name=prev_tensor_name,
+          initializer=initializer)
     else:
+      if col in warmstart_settings.col_to_prev_vocab:
+        raise ValueError("Vocabulary provided for column %s which is not a "
+                         "_VocabularyFileCategoricalColumn or _EmbeddingColumn")
       logging.info("Warm-starting column: {}; prev_tensor: {}".format(
           col.name, prev_tensor_name or "Unchanged"))
       _warmstart_var(var, warmstart_settings.ckpt_to_initialize_from,
diff --git a/tensorflow/python/estimator/warm_starting_util_test.py b/tensorflow/python/estimator/warm_starting_util_test.py
index d4f1e3ac9df5a42a14c94f4a5a15d1aed2216134..a05dbfd7449c9e108649da9ec5a40fe220233953 100644
--- a/tensorflow/python/estimator/warm_starting_util_test.py
+++ b/tensorflow/python/estimator/warm_starting_util_test.py
@@ -72,6 +72,36 @@ class WarmStartingUtilTest(test.TestCase):
           var = var._get_variable_list()
         return var, sess.run(var)
 
+  def _create_prev_run_multiple_vars(self,
+                                     var_names,
+                                     initializers,
+                                     shapes=None,
+                                     partitioners=None):
+    if not shapes:
+      shapes = [None] * len(var_names)
+    if not partitioners:
+      partitioners = [None] * len(var_names)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        var_list = []
+        for var_name, shape, initializer, partitioner in zip(
+            var_names, shapes, initializers, partitioners):
+          var_list.append(
+              variable_scope.get_variable(
+                  var_name,
+                  shape=shape,
+                  initializer=initializer,
+                  partitioner=partitioner))
+        self._write_checkpoint(sess)
+        run_vars = []
+        for var, partitioner in zip(var_list, partitioners):
+          if partitioner:
+            self.assertTrue(isinstance(var, variables.PartitionedVariable))
+            run_vars.append(sess.run(var._get_variable_list()))
+          else:
+            run_vars.append(sess.run(var))
+        return var_list, run_vars
+
   def _create_dummy_inputs(self):
     return {
         "sc_int": array_ops.sparse_placeholder(dtypes.int32),
@@ -98,7 +128,7 @@ class WarmStartingUtilTest(test.TestCase):
   def _assert_cols_to_vars(self, cols_to_vars, cols_to_expected_values, sess):
     for col, expected_values in six.iteritems(cols_to_expected_values):
       for i, var in enumerate(cols_to_vars[col]):
-        self.assertAllEqual(expected_values[i], var.eval(sess))
+        self.assertAllClose(expected_values[i], var.eval(sess))
 
   def testWarmStartVar(self):
     _, prev_val = self._create_prev_run_var(
@@ -175,6 +205,99 @@ class WarmStartingUtilTest(test.TestCase):
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
         self.assertAllEqual(prev_val, new_val)
 
+  def testWarmStartVarMultipleVars(self):
+    _, prev_vals = self._create_prev_run_multiple_vars(
+        var_names=["fruit_weights", "other_weights"],
+        initializers=[[[0.5], [1.], [1.5], [2.]], [[.05], [.1], [.15], [.2]]])
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        fruit_weights = variable_scope.get_variable(
+            "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
+        other_weights = variable_scope.get_variable(
+            "other_weights", initializer=[[0.], [0.], [0.], [0.]])
+        ws_util._warmstart_var([fruit_weights, other_weights],
+                               self.get_temp_dir())
+        sess.run(variables.global_variables_initializer())
+        self.assertAllEqual(prev_vals[0], fruit_weights.eval(sess))
+        self.assertAllEqual(prev_vals[1], other_weights.eval(sess))
+
+  def testWarmStartVarMultipleVarsBothPartitioned(self):
+    _, prev_vals = self._create_prev_run_multiple_vars(
+        var_names=["fruit_weights", "other_weights"],
+        shapes=[[4, 1], [4, 1]],
+        initializers=[[[0.5], [1.], [1.5], [2.]], [[.05], [.1], [.15], [.2]]],
+        partitioners=[lambda shape, dtype: [2, 1], lambda shape, dtype: [2, 1]])
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        fruit_weights = variable_scope.get_variable(
+            "fruit_weights",
+            shape=[4, 1],
+            initializer=[[0.], [0.], [0.], [0.]],
+            partitioner=lambda shape, dtype: [2, 1])
+        other_weights = variable_scope.get_variable(
+            "other_weights",
+            shape=[4, 1],
+            initializer=[[0.], [0.], [0.], [0.]],
+            partitioner=lambda shape, dtype: [2, 1])
+        ws_util._warmstart_var([fruit_weights, other_weights],
+                               self.get_temp_dir())
+        sess.run(variables.global_variables_initializer())
+        fruit_weights = fruit_weights._get_variable_list()
+        new_fruit_weights_val = np.concatenate(
+            [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
+        other_weights = other_weights._get_variable_list()
+        new_other_weights_val = np.concatenate(
+            [other_weights[0].eval(sess), other_weights[1].eval(sess)], axis=0)
+        self.assertAllEqual(
+            np.concatenate(prev_vals[0], axis=0), new_fruit_weights_val)
+        self.assertAllEqual(
+            np.concatenate(prev_vals[1], axis=0), new_other_weights_val)
+
+  def testWarmStartVarMultipleVarsMixOfPartitions(self):
+    # First is not partitioned, but the second two are.
+    _, prev_vals = self._create_prev_run_multiple_vars(
+        var_names=["fruit_weights", "other_weights", "veggie_weights"],
+        shapes=[None, [4, 1], [4, 1]],
+        initializers=[[[0.5], [1.], [1.5], [2.]], [[.05], [.1], [.15], [.2]],
+                      [[5.], [10.], [15.], [20.]]],
+        partitioners=[
+            None, lambda shape, dtype: [2, 1], lambda shape, dtype: [2, 1]
+        ])
+
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        fruit_weights = variable_scope.get_variable(
+            "fruit_weights", initializer=[[0.], [0.], [0.], [0.]])
+        other_weights = variable_scope.get_variable(
+            "other_weights",
+            shape=[4, 1],
+            initializer=[[0.], [0.], [0.], [0.]],
+            partitioner=lambda shape, dtype: [2, 1])
+        veggie_weights = variable_scope.get_variable(
+            "veggie_weights",
+            shape=[4, 1],
+            initializer=[[0.], [0.], [0.], [0.]],
+            partitioner=lambda shape, dtype: [2, 1])
+        # Flatten one of the partitioned variables.
+        ws_util._warmstart_var([fruit_weights, other_weights] +
+                               veggie_weights._get_variable_list(),
+                               self.get_temp_dir())
+        sess.run(variables.global_variables_initializer())
+        veggie_weights = veggie_weights._get_variable_list()
+        new_veggie_weights_val = np.concatenate(
+            [veggie_weights[0].eval(sess), veggie_weights[1].eval(sess)],
+            axis=0)
+        other_weights = other_weights._get_variable_list()
+        new_other_weights_val = np.concatenate(
+            [other_weights[0].eval(sess), other_weights[1].eval(sess)], axis=0)
+        self.assertAllEqual(prev_vals[0], fruit_weights.eval(sess))
+        self.assertAllEqual(
+            np.concatenate(prev_vals[1], axis=0), new_other_weights_val)
+        self.assertAllEqual(
+            np.concatenate(prev_vals[2], axis=0), new_veggie_weights_val)
+
   def testWarmStartVarWithVocab(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
@@ -195,6 +318,32 @@ class WarmStartingUtilTest(test.TestCase):
         self.assertAllEqual([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
+  def testWarmStartVarWithVocabConstrainedOldVocabSize(self):
+    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
+                                        "old_vocab")
+    _, _ = self._create_prev_run_var(
+        "fruit_weights", initializer=[[0.5], [1.], [1.5], [2.]])
+
+    # New vocab with elements in reverse order and one new element.
+    new_vocab_path = self._write_vocab(
+        ["orange", "guava", "banana", "apple", "raspberry"], "new_vocab")
+    # New session and new graph.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        fruit_weights = variable_scope.get_variable(
+            "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
+        ws_util._warmstart_var_with_vocab(
+            fruit_weights,
+            new_vocab_path,
+            5,
+            self.get_temp_dir(),
+            prev_vocab_path,
+            previous_vocab_size=2)
+        sess.run(variables.global_variables_initializer())
+        # Old vocabulary limited to ['apple', 'banana'].
+        self.assertAllEqual([[0.], [0.], [1.], [0.5], [0.]],
+                            fruit_weights.eval(sess))
+
   def testWarmStartVarWithVocabPrevVarPartitioned(self):
     prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                         "old_vocab")
@@ -384,6 +533,51 @@ class WarmStartingUtilTest(test.TestCase):
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
 
+  def testWarmStartInputLayer_SparseColumnVocabularyConstrainedVocabSizes(self):
+    # Create old vocabulary, and use a size smaller than the total number of
+    # entries.
+    old_vocab_path = self._write_vocab(["apple", "guava", "banana"],
+                                       "old_vocab")
+    old_vocab_size = 2  # ['apple', 'guava']
+
+    # Create new vocab for sparse column "sc_vocab".
+    current_vocab_path = self._write_vocab(
+        ["apple", "banana", "guava", "orange"], "current_vocab")
+    # Create feature column.  Only use 2 of the actual entries, resulting in
+    # ['apple', 'banana'] for the new vocabulary.
+    sc_vocab = fc.categorical_column_with_vocabulary_file(
+        "sc_vocab", vocabulary_file=current_vocab_path, vocabulary_size=2)
+
+    # Save checkpoint from which to warm-start.
+    self._create_prev_run_var(
+        "linear_model/sc_vocab/weights", shape=[2, 1], initializer=ones())
+
+    partitioner = lambda shape, dtype: [1] * len(shape)
+    # New graph, new session WITHOUT warmstarting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
+        sess.run(variables.global_variables_initializer())
+        # Without warmstarting, the weights should be initialized using default
+        # initializer (which is init_ops.zeros_initializer).
+        self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([2, 1])]},
+                                  sess)
+
+    # New graph, new session with warmstarting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
+        warmstart_settings = ws_util._WarmStartSettings(
+            ckpt_to_initialize_from=self.get_temp_dir(),
+            col_to_prev_vocab={
+                sc_vocab: (old_vocab_path, old_vocab_size)
+            })
+        ws_util._warmstart_input_layer(cols_to_vars, warmstart_settings)
+        sess.run(variables.global_variables_initializer())
+        # Verify weights were correctly warmstarted.  'banana' isn't in the
+        # first two entries of the old vocabulary, so it's newly initialized.
+        self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [[[1], [0]]]}, sess)
+
   def testWarmStartInputLayer_BucketizedColumn(self):
     # Create feature column.
     real = fc.numeric_column("real")
@@ -558,6 +752,66 @@ class WarmStartingUtilTest(test.TestCase):
             ]
         }, sess)
 
+  def testWarmStartInputLayerEmbeddingColumn(self):
+    # Create old and new vocabs for embedding column "sc_vocab".
+    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
+                                        "old_vocab")
+    new_vocab_path = self._write_vocab(
+        ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
+        "new_vocab")
+
+    # Save checkpoint from which to warm-start.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        _ = variable_scope.get_variable(
+            "input_layer/sc_vocab_embedding/embedding_weights",
+            initializer=[[0.5, 0.4], [1., 1.1], [2., 2.2], [3., 3.3]])
+        self._write_checkpoint(sess)
+
+    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
+      # Partition each var into 2 equal slices.
+      partitions = [1] * len(shape)
+      partitions[0] = min(2, shape[0].value)
+      return partitions
+
+    # Create feature columns.
+    sc_vocab = fc.categorical_column_with_vocabulary_file(
+        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
+    emb_vocab = fc.embedding_column(
+        categorical_column=sc_vocab,
+        dimension=2,
+        # Can't use constant_initializer with load_and_remap.  In practice,
+        # use a truncated normal initializer.
+        initializer=init_ops.random_uniform_initializer(
+            minval=0.42, maxval=0.42))
+    all_deep_cols = [emb_vocab]
+    # New graph, new session with warmstarting.
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        cols_to_vars = {}
+        with variable_scope.variable_scope("", partitioner=_partitioner):
+          # Create the variables.
+          fc.input_layer(
+              features=self._create_dummy_inputs(),
+              feature_columns=all_deep_cols,
+              cols_to_vars=cols_to_vars)
+        ws_settings = ws_util._WarmStartSettings(
+            self.get_temp_dir(), col_to_prev_vocab={
+                emb_vocab: prev_vocab_path
+            })
+        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
+        sess.run(variables.global_variables_initializer())
+        # Verify weights were correctly warmstarted. Var corresponding to
+        # emb_vocab should be correctly warmstarted after vocab remapping.
+        # Missing values are filled in with the EmbeddingColumn's initializer.
+        self._assert_cols_to_vars(
+            cols_to_vars, {
+                emb_vocab: [
+                    np.array([[3., 3.3], [2., 2.2], [1., 1.1]]),
+                    np.array([[0.5, 0.4], [0.42, 0.42], [0.42, 0.42]])
+                ]
+            }, sess)
+
   def testErrorConditions(self):
     self.assertRaises(ValueError, ws_util._WarmStartSettings, None)
     x = variable_scope.get_variable(
@@ -566,8 +820,7 @@ class WarmStartingUtilTest(test.TestCase):
         initializer=ones(),
         partitioner=lambda shape, dtype: [2, 1])
 
-    # List of PartitionedVariable is invalid type.
-    self.assertRaises(TypeError, ws_util._warmstart_var, [x], prev_ckpt="/tmp")
+    # List of PartitionedVariable is invalid type when warmstarting with vocab.
     self.assertRaises(TypeError, ws_util._warmstart_var_with_vocab, [x], "/tmp",
                       5, "/tmp", "/tmp")
     # Keys of type other than FeatureColumn.
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index b1c81dd58c7d2d9cf95821ea78eda2e7ee675d25..76d44fc474f936733f4eeeefd5d9510964ebb430 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -48,6 +48,7 @@ py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:template",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 190a25d4d79e9acc1986f5bd06110a29f29aee42..060fa640d573aab1b96107ca3c1f0f6c3ba4f5bb 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -134,6 +134,7 @@ import math
 import numpy as np
 import six
 
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -149,13 +150,74 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.util import nest
 
 
+def _internal_input_layer(features,
+                          feature_columns,
+                          weight_collections=None,
+                          trainable=True,
+                          cols_to_vars=None,
+                          scope=None):
+  """See input_layer. `scope` is a name or variable scope to use."""
+
+  feature_columns = _clean_feature_columns(feature_columns)
+  for column in feature_columns:
+    if not isinstance(column, _DenseColumn):
+      raise ValueError(
+          'Items of feature_columns must be a _DenseColumn. '
+          'You can wrap a categorical column with an '
+          'embedding_column or indicator_column. Given: {}'.format(column))
+  weight_collections = list(weight_collections or [])
+  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
+  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
+    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+  # a non-None `scope` can allow for variable reuse, when, e.g., this function
+  # is wrapped by a `make_template`.
+  with variable_scope.variable_scope(
+      scope, default_name='input_layer', values=features.values()):
+    builder = _LazyBuilder(features)
+    output_tensors = []
+    ordered_columns = []
+    for column in sorted(feature_columns, key=lambda x: x.name):
+      ordered_columns.append(column)
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
+        if column._var_scope_name == column.name:  # pylint: disable=protected-access
+          tensor = _get_dense_tensor(
+              column=column,
+              builder=builder,
+              weight_collections=weight_collections,
+              trainable=trainable)
+        else:
+          # This is typically the case for shared_embedding_columns. The
+          # embedding weights variable will be under the common variable_scope,
+          # but the ops for each column will be under a separate name_scope.
+          with ops.name_scope(column.name):
+            tensor = _get_dense_tensor(
+                column=column,
+                builder=builder,
+                weight_collections=weight_collections,
+                trainable=trainable)
+        output_tensors.append(tensor)
+        if cols_to_vars is not None:
+          # Retrieve any variables created (some _DenseColumn's don't create
+          # variables, in which case an empty list is returned).
+          cols_to_vars[column] = ops.get_collection(
+              ops.GraphKeys.GLOBAL_VARIABLES,
+              scope=variable_scope.get_variable_scope().name)
+    _verify_static_batch_size_equality(output_tensors, ordered_columns)
+    return array_ops.concat(output_tensors, 1)
+
+
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -192,7 +254,7 @@ def input_layer(features,
       `bucketized_column`, `indicator_column`. If you have categorical features,
       you can wrap them with an `embedding_column` or `indicator_column`.
     weight_collections: A list of collection names to which the Variable will be
-      added. Note that, variables will also be added to collections
+      added. Note that variables will also be added to collections
       `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
     trainable: If `True` also add the variable to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
@@ -214,42 +276,66 @@ def input_layer(features,
   Raises:
     ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
   """
-  feature_columns = _clean_feature_columns(feature_columns)
-  for column in feature_columns:
-    if not isinstance(column, _DenseColumn):
-      raise ValueError(
-          'Items of feature_columns must be a _DenseColumn. '
-          'You can wrap a categorical column with an '
-          'embedding_column or indicator_column. Given: {}'.format(column))
-  weight_collections = list(weight_collections or [])
-  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
-  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
-    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
-  with variable_scope.variable_scope(
-      None, default_name='input_layer', values=features.values()):
-    builder = _LazyBuilder(features)
-    output_tensors = []
-    ordered_columns = []
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(None, default_name=column.name):
-        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
-        if cols_to_vars is not None:
-          # Retrieve any variables created (some _DenseColumn's don't create
-          # variables, in which case an empty list is returned).
-          cols_to_vars[column] = ops.get_collection(
-              ops.GraphKeys.GLOBAL_VARIABLES,
-              scope=variable_scope.get_variable_scope().name)
-        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
-        batch_size = array_ops.shape(tensor)[0]
-        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-        output_tensors.append(tensor)
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
+  return _internal_input_layer(features, feature_columns, weight_collections,
+                               trainable, cols_to_vars)
+
+
+# TODO(akshayka): InputLayer should be a subclass of Layer, and it
+# should implement the logic in input_layer using Layer's build-and-call
+# paradigm; input_layer should create an instance of InputLayer and
+# return the result of inovking its apply method, just as functional layers do.
+class InputLayer(object):
+  """An object-oriented version of `input_layer` that reuses variables."""
+
+  def __init__(self,
+               feature_columns,
+               weight_collections=None,
+               trainable=True,
+               cols_to_vars=None):
+    """See `input_layer`."""
+
+    self._feature_columns = feature_columns
+    self._weight_collections = weight_collections
+    self._trainable = trainable
+    self._cols_to_vars = cols_to_vars
+    self._input_layer_template = template.make_template(
+        'feature_column_input_layer',
+        _internal_input_layer,
+        create_scope_now_=True)
+    self._scope = self._input_layer_template.variable_scope
+
+  def __call__(self, features):
+    return self._input_layer_template(
+        features=features,
+        feature_columns=self._feature_columns,
+        weight_collections=self._weight_collections,
+        trainable=self._trainable,
+        cols_to_vars=None,
+        scope=self._scope)
+
+  @property
+  def non_trainable_variables(self):
+    return self._input_layer_template.non_trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    return self._input_layer_template.non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self._input_layer_template.trainable_variables
+
+  @property
+  def trainable_weights(self):
+    return self._input_layer_template.trainable_weights
+
+  @property
+  def variables(self):
+    return self._input_layer_template.variables
+
+  @property
+  def weights(self):
+    return self._input_layer_template.weights
 
 
 def linear_model(features,
@@ -340,15 +426,29 @@ def linear_model(features,
     ordered_columns = []
     builder = _LazyBuilder(features)
     for column in sorted(feature_columns, key=lambda x: x.name):
-      with variable_scope.variable_scope(None, default_name=column.name):
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
         ordered_columns.append(column)
-        if isinstance(column, _CategoricalColumn):
-          weighted_sum = _create_categorical_column_weighted_sum(
-              column, builder, units, sparse_combiner, weight_collections,
-              trainable)
+        if column._var_scope_name == column.name:  # pylint: disable=protected-access
+          weighted_sum = _create_weighted_sum(
+              column=column,
+              builder=builder,
+              units=units,
+              sparse_combiner=sparse_combiner,
+              weight_collections=weight_collections,
+              trainable=trainable)
         else:
-          weighted_sum = _create_dense_column_weighted_sum(
-              column, builder, units, weight_collections, trainable)
+          # This is typically the case for shared_embedding_columns. The
+          # embedding weights variable will be under the common variable_scope,
+          # but the ops for each column will be under a separate name_scope.
+          with ops.name_scope(column.name):
+            weighted_sum = _create_weighted_sum(
+                column=column,
+                builder=builder,
+                units=units,
+                sparse_combiner=sparse_combiner,
+                weight_collections=weight_collections,
+                trainable=trainable)
         weighted_sums.append(weighted_sum)
         if cols_to_vars is not None:
           # Retrieve the variables created.
@@ -489,15 +589,36 @@ def embedding_column(
   representation (e.g., to feed to a DNN).
 
   Inputs must be a `_CategoricalColumn` created by any of the
-  `categorical_column_*` function. Here is an example embedding of an identity
-  column for a DNN model:
+  `categorical_column_*` function. Here is an example of using
+  `embedding_column` with `DNNClassifier`:
 
   ```python
   video_id = categorical_column_with_identity(
       key='video_id', num_buckets=1000000, default_value=0)
   columns = [embedding_column(video_id, 9),...]
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  dense_tensor = input_layer(features, columns)
+
+  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
+
+  label_column = ...
+  def input_fn():
+    features = tf.parse_example(
+        ..., features=make_parse_example_spec(columns + [label_column]))
+    labels = features.pop(label_column.name)
+    return features, labels
+
+  estimator.train(input_fn=input_fn, steps=100)
+  ```
+
+  Here is an example using `embedding_column` with model_fn:
+
+  ```python
+  def model_fn(features, ...):
+    video_id = categorical_column_with_identity(
+        key='video_id', num_buckets=1000000, default_value=0)
+    columns = [embedding_column(video_id, 9),...]
+    dense_tensor = input_layer(features, columns)
+    # Form DNN layers, calculate loss, and return EstimatorSpec.
+    ...
   ```
 
   Args:
@@ -531,6 +652,7 @@ def embedding_column(
     ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
       is specified.
     ValueError: if `initializer` is specified and is not callable.
+    RuntimeError: If eager execution is enabled.
   """
   if (dimension is None) or (dimension < 1):
     raise ValueError('Invalid dimension {}.'.format(dimension))
@@ -551,12 +673,163 @@ def embedding_column(
       dimension=dimension,
       combiner=combiner,
       initializer=initializer,
+      shared_embedding_collection_name=None,
       ckpt_to_load_from=ckpt_to_load_from,
       tensor_name_in_ckpt=tensor_name_in_ckpt,
       max_norm=max_norm,
       trainable=trainable)
 
 
+def _shared_embedding_columns(
+    categorical_columns, dimension, combiner='mean', initializer=None,
+    shared_embedding_collection_name=None, ckpt_to_load_from=None,
+    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
+  """List of `_DenseColumn`s that convert from sparse, categorical input.
+
+  This is similar to `embedding_column`, except that that it produces a list of
+  embedding columns that share the same embedding weights.
+
+  Use this when your inputs are sparse and of the same type (e.g. watched and
+  impression video IDs that share the same vocabulary), and you want to convert
+  them to a dense representation (e.g., to feed to a DNN).
+
+  Inputs must be a list of `_CategoricalColumn` created by any of the
+  `categorical_column_*` function. They must all be of the same type and have
+  the same arguments except `key`. E.g. they can be
+  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
+  all columns could also be weighted_categorical_column.
+
+  Here is an example embedding of two features for a DNNClassifier model:
+
+  ```python
+  watched_video_id = categorical_column_with_vocabulary_file(
+      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+  impression_video_id = categorical_column_with_vocabulary_file(
+      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+  columns = shared_embedding_columns(
+      [watched_video_id, impression_video_id], dimension=10)
+
+  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
+
+  label_column = ...
+  def input_fn():
+    features = tf.parse_example(
+        ..., features=make_parse_example_spec(columns + [label_column]))
+    labels = features.pop(label_column.name)
+    return features, labels
+
+  estimator.train(input_fn=input_fn, steps=100)
+  ```
+
+  Here is an example using `shared_embedding_columns` with model_fn:
+
+  ```python
+  def model_fn(features, ...):
+    watched_video_id = categorical_column_with_vocabulary_file(
+        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+    impression_video_id = categorical_column_with_vocabulary_file(
+        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+    columns = shared_embedding_columns(
+        [watched_video_id, impression_video_id], dimension=10)
+    dense_tensor = input_layer(features, columns)
+    # Form DNN layers, calculate loss, and return EstimatorSpec.
+    ...
+  ```
+
+  Args:
+    categorical_columns: List of `_CategoricalColumn`s created by a
+      `categorical_column_with_*` function. These columns produce the sparse IDs
+      that are inputs to the embedding lookup. All columns must be of the same
+      type and have the same arguments except `key`. E.g. they can be
+      categorical_column_with_vocabulary_file with the same vocabulary_file.
+      Some or all columns could also be weighted_categorical_column.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+      with bag-of-words columns. Each of this can be thought as example level
+      normalizations on the column. For more information, see
+      `tf.embedding_lookup_sparse`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    shared_embedding_collection_name: Optional name of the collection where
+      shared embedding weights are added. If not given, a reasonable name will
+      be chosen based on the names of `categorical_columns`. This is also used
+      in `variable_scope` when creating shared embedding weights.
+    ckpt_to_load_from: String representing checkpoint name/pattern from which to
+      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
+      which to restore the column weights. Required if `ckpt_to_load_from` is
+      not `None`.
+    max_norm: If not `None`, embedding values are l2-normalized to this value.
+    trainable: Whether or not the embedding is trainable. Default is True.
+
+  Returns:
+    A list of `_DenseColumn`s that converts from sparse input. The order of
+    results follows the ordering of `categorical_columns`.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if any of the given `categorical_columns` is of different type
+      or has different arguments than the others.
+    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
+      is specified.
+    ValueError: if `initializer` is specified and is not callable.
+  """
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
+    raise ValueError('Must specify both `ckpt_to_load_from` and '
+                     '`tensor_name_in_ckpt` or none of them.')
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified.')
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1. / math.sqrt(dimension))
+
+  # Sort the columns so the default collection name is deterministic even if the
+  # user passes columns from an unsorted collection, such as dict.values().
+  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
+
+  c0 = sorted_columns[0]
+  if not isinstance(c0, _CategoricalColumn):
+    raise ValueError(
+        'All categorical_columns must be subclasses of _CategoricalColumn. '
+        'Given: {}, of type: {}'.format(c0, type(c0)))
+  if isinstance(c0, _WeightedCategoricalColumn):
+    c0 = c0.categorical_column
+  for c in sorted_columns[1:]:
+    if isinstance(c, _WeightedCategoricalColumn):
+      c = c.categorical_column
+    if not isinstance(c, type(c0)):
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same type, or be weighted_categorical_column of the same type. '
+          'Given column: {} of type: {} does not match given column: {} of '
+          'type: {}'.format(c0, type(c0), c, type(c)))
+
+  if not shared_embedding_collection_name:
+    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
+    shared_embedding_collection_name += '_shared_embedding'
+
+  result = []
+  for column in categorical_columns:
+    result.append(_EmbeddingColumn(
+        categorical_column=column,
+        dimension=dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable))
+  return result
+
+
 def numeric_column(key,
                    shape=(1,),
                    default_value=None,
@@ -776,9 +1049,12 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
-def categorical_column_with_vocabulary_file(
-    key, vocabulary_file, vocabulary_size, num_oov_buckets=0,
-    default_value=None, dtype=dtypes.string):
+def categorical_column_with_vocabulary_file(key,
+                                            vocabulary_file,
+                                            vocabulary_size=None,
+                                            num_oov_buckets=0,
+                                            default_value=None,
+                                            dtype=dtypes.string):
   """A `_CategoricalColumn` with a vocabulary file.
 
   Use this when your inputs are in string or integer format, and you have a
@@ -837,7 +1113,7 @@ def categorical_column_with_vocabulary_file(
     vocabulary_file: The vocabulary file name.
     vocabulary_size: Number of the elements in the vocabulary. This must be no
       greater than length of `vocabulary_file`, if less than length, later
-      values are ignored.
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
     num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
       buckets. All out-of-vocabulary inputs will be assigned IDs in the range
       `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
@@ -852,7 +1128,7 @@ def categorical_column_with_vocabulary_file(
     A `_CategoricalColumn` with a vocabulary file.
 
   Raises:
-    ValueError: `vocabulary_file` is missing.
+    ValueError: `vocabulary_file` is missing or cannot be opened.
     ValueError: `vocabulary_size` is missing or < 1.
     ValueError: `num_oov_buckets` is a negative integer.
     ValueError: `num_oov_buckets` and `default_value` are both specified.
@@ -860,8 +1136,19 @@ def categorical_column_with_vocabulary_file(
   """
   if not vocabulary_file:
     raise ValueError('Missing vocabulary_file in {}.'.format(key))
+
+  if vocabulary_size is None:
+    if not gfile.Exists(vocabulary_file):
+      raise ValueError('vocabulary_file in {} does not exist.'.format(key))
+
+    with gfile.GFile(vocabulary_file) as f:
+      vocabulary_size = sum(1 for _ in f)
+    logging.info(
+        'vocabulary_size = %d in %s is inferred from the number of elements '
+        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
+
   # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
-  if (vocabulary_size is None) or (vocabulary_size < 1):
+  if vocabulary_size < 1:
     raise ValueError('Invalid vocabulary_size in {}.'.format(key))
   if num_oov_buckets:
     if default_value is not None:
@@ -1306,9 +1593,14 @@ class _FeatureColumn(object):
 
   @abc.abstractproperty
   def name(self):
-    """Returns string. used for variable_scope and naming."""
+    """Returns string. Used for naming and for name_scope."""
     pass
 
+  @property
+  def _var_scope_name(self):
+    """Returns string. Used for variable_scope. Defaults to self.name."""
+    return self.name
+
   @abc.abstractmethod
   def _transform_feature(self, inputs):
     """Returns intermediate representation (usually a `Tensor`).
@@ -1399,6 +1691,38 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
+def _get_dense_tensor(
+    column,
+    builder,
+    weight_collections,
+    trainable):
+  """Creates a dense Tensor for a _DenseColumn for input_layer."""
+  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+      builder,
+      weight_collections=weight_collections,
+      trainable=trainable)
+  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
+  batch_size = array_ops.shape(tensor)[0]
+  return array_ops.reshape(tensor, shape=(batch_size, num_elements))
+
+
+def _create_weighted_sum(
+    column,
+    builder,
+    units,
+    sparse_combiner,
+    weight_collections,
+    trainable):
+  """Creates a weighted sum for a dense or sparse column for linear_model."""
+  if isinstance(column, _CategoricalColumn):
+    return _create_categorical_column_weighted_sum(
+        column, builder, units, sparse_combiner, weight_collections,
+        trainable)
+  else:
+    return _create_dense_column_weighted_sum(
+        column, builder, units, weight_collections, trainable)
+
+
 def _create_dense_column_weighted_sum(
     column, builder, units, weight_collections, trainable):
   """Create a weighted sum of a dense column for linear_model."""
@@ -1664,29 +1988,26 @@ def _to_sparse_input(input_tensor, ignore_value=None):
   if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
     return input_tensor
   with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
-    input_rank = input_tensor.get_shape().ndims
-    if input_rank is None:
-      # TODO(b/32318825): Implement dense_to_sparse_tensor for undefined rank.
-      raise ValueError('Undefined input_tensor shape.')
     if ignore_value is None:
-      ignore_value = '' if input_tensor.dtype == dtypes.string else -1
-    dense_shape = math_ops.cast(array_ops.shape(input_tensor), dtypes.int64)
-    indices = array_ops.where(math_ops.not_equal(
-        input_tensor, math_ops.cast(ignore_value, input_tensor.dtype)))
-    # Flattens the tensor and indices for use with gather.
-    flat_tensor = array_ops.reshape(input_tensor, [-1])
-    flat_indices = indices[:, input_rank - 1]
-    # Computes the correct flattened indices for 2d (or higher) tensors.
-    if input_rank > 1:
-      higher_dims = indices[:, :input_rank - 1]
-      shape_offsets = array_ops.stack(
-          _shape_offsets(array_ops.unstack(dense_shape)[1:]))
-      offsets = math_ops.reduce_sum(
-          math_ops.multiply(higher_dims, shape_offsets),
-          reduction_indices=[1])
-      flat_indices = math_ops.add(flat_indices, offsets)
-    values = array_ops.gather(flat_tensor, flat_indices)
-    return sparse_tensor_lib.SparseTensor(indices, values, dense_shape)
+      if input_tensor.dtype == dtypes.string:
+        # Exception due to TF strings are converted to numpy objects by default.
+        ignore_value = ''
+      elif input_tensor.dtype.is_integer:
+        ignore_value = -1  # -1 has a special meaning of missing feature
+      else:
+        # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
+        # constructing a new numpy object of the given type, which yields the
+        # default value for that type.
+        ignore_value = input_tensor.dtype.as_numpy_dtype()
+    ignore_value = math_ops.cast(
+        ignore_value, input_tensor.dtype, name='ignore_value')
+    indices = array_ops.where(
+        math_ops.not_equal(input_tensor, ignore_value), name='indices')
+    return sparse_tensor_lib.SparseTensor(
+        indices=indices,
+        values=array_ops.gather_nd(input_tensor, indices, name='values'),
+        dense_shape=array_ops.shape(
+            input_tensor, out_type=dtypes.int64, name='dense_shape'))
 
 
 def _clean_feature_columns(feature_columns):
@@ -1847,16 +2168,24 @@ class _EmbeddingColumn(
     _DenseColumn,
     collections.namedtuple('_EmbeddingColumn', (
         'categorical_column', 'dimension', 'combiner', 'initializer',
-        'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'
+        'shared_embedding_collection_name', 'ckpt_to_load_from',
+        'tensor_name_in_ckpt', 'max_norm', 'trainable'
     ))):
-  """See `_embedding_column`."""
+  """See `embedding_column`."""
 
   @property
   def name(self):
     if not hasattr(self, '_name'):
-      self._name = '{}_embedding'.format(self.categorical_column.name)
+      if self.shared_embedding_collection_name:
+        self._name = '{}_shared_embedding'.format(self.categorical_column.name)
+      else:
+        self._name = '{}_embedding'.format(self.categorical_column.name)
     return self._name
 
+  @property
+  def _var_scope_name(self):
+    return self.shared_embedding_collection_name or self.name
+
   @property
   def _parse_example_spec(self):
     return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
@@ -1877,14 +2206,47 @@ class _EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    # Create embedding weight, and restore from checkpoint if necessary.
-    embedding_weights = variable_scope.get_variable(
-        name='embedding_weights',
-        shape=(self.categorical_column._num_buckets, self.dimension),  # pylint: disable=protected-access
-        dtype=dtypes.float32,
-        initializer=self.initializer,
-        trainable=self.trainable and trainable,
-        collections=weight_collections)
+    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+    if self.shared_embedding_collection_name:
+      shared_embedding_collection = ops.get_collection(
+          self.shared_embedding_collection_name)
+      if shared_embedding_collection:
+        if len(shared_embedding_collection) > 1:
+          raise ValueError(
+              'Collection {} can only contain one variable. '
+              'Suggested fix A: Choose a unique name for this collection. '
+              'Suggested fix B: Do not add any variables to this collection. '
+              'The feature_column library already adds a variable under the '
+              'hood.'.format(shared_embedding_collection))
+        embedding_weights = shared_embedding_collection[0]
+        if embedding_weights.shape != embedding_shape:
+          raise ValueError(
+              'Shared embedding collection {} contains variable {} of '
+              'unexpected shape {}. Expected shape is {}. '
+              'Suggested fix A: Choose a unique name for this collection. '
+              'Suggested fix B: Do not add any variables to this collection. '
+              'The feature_column library already adds a variable under the '
+              'hood.'.format(
+                  self.shared_embedding_collection_name, embedding_weights.name,
+                  embedding_weights.shape, embedding_shape))
+      else:
+        embedding_weights = variable_scope.get_variable(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable and trainable,
+            collections=weight_collections)
+        ops.add_to_collection(
+            self.shared_embedding_collection_name, embedding_weights)
+    else:
+      embedding_weights = variable_scope.get_variable(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable and trainable,
+          collections=weight_collections)
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index e57e9a9836c1cb38b2e3cea8a9d16283049e9c7d..019415857e1a9de7ca2892dcebeb0cf507040a12 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -26,18 +26,23 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as fc_lib
 from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.feature_column.feature_column import _transform_features
+from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
@@ -168,6 +173,8 @@ class NumericColumnTest(test.TestCase):
   def test_defaults(self):
     a = fc.numeric_column('aaa')
     self.assertEqual('aaa', a.key)
+    self.assertEqual('aaa', a.name)
+    self.assertEqual('aaa', a._var_scope_name)
     self.assertEqual((1,), a.shape)
     self.assertIsNone(a.default_value)
     self.assertEqual(dtypes.float32, a.dtype)
@@ -369,6 +376,11 @@ class BucketizedColumnTest(test.TestCase):
     b = fc.bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b.name)
 
+  def test_var_scope_name(self):
+    a = fc.numeric_column('aaa', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    self.assertEqual('aaa_bucketized', b._var_scope_name)
+
   def test_parse_spec(self):
     a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
@@ -556,6 +568,7 @@ class HashedCategoricalColumnTest(test.TestCase):
   def test_defaults(self):
     a = fc.categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
+    self.assertEqual('aaa', a._var_scope_name)
     self.assertEqual('aaa', a.key)
     self.assertEqual(10, a.hash_bucket_size)
     self.assertEqual(dtypes.string, a.dtype)
@@ -818,6 +831,14 @@ class CrossedColumnTest(test.TestCase):
     crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
+  def test_var_scope_name(self):
+    a = fc.numeric_column('a', dtype=dtypes.int32)
+    b = fc.bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+
+    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2._var_scope_name)
+
   def test_parse_spec(self):
     a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
@@ -1629,8 +1650,9 @@ class LinearModelTest(test.TestCase):
         indices=((0,), (1,)),
         values=('sedan', 'hardtop'),
         dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
 
-    net = fc.linear_model(features, [price_buckets, body_style])
+    net = fc.linear_model(features, [price_buckets, body_style, country])
     bias = get_linear_model_bias()
     price_buckets_var = get_linear_model_column_var(price_buckets)
     body_style_var = get_linear_model_column_var(body_style)
@@ -1639,15 +1661,14 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose(
-          [[10 - 1000 + 5.], [1000 - 10 + 5.]],
-          sess.run(net, feed_dict={
-              features['price']: price_data,
-              features['body-style']: body_style_data}))
-
-    # Dense categorical_column with unknown shape is not allowed.
-    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
-      fc.linear_model(features, [price_buckets, body_style, country])
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
 
   def test_with_rank_0_feature(self):
     price = fc.numeric_column('price')
@@ -1673,6 +1694,105 @@ class LinearModelTest(test.TestCase):
 
 class InputLayerTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    input_layer = InputLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(input_layer(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = input_layer(features)
+      variables = input_layer.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
+        matrix = input_layer(features)
+        return 2 * matrix
+
+      # Sanity check: Verify that scale_matrix returns the correct output.
+      self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())
+
+      # Check that the returned gradient is correct.
+      grad_function = backprop.implicit_grad(scale_matrix)
+      grads_and_vars = grad_function()
+      indexed_slice = grads_and_vars[0][0]
+      gradient = grads_and_vars[0][0].values
+
+      self.assertAllEqual([0, 1, 2], indexed_slice.indices)
+      self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
+
+
+class FunctionalInputLayerTest(test.TestCase):
+
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
@@ -1999,9 +2119,9 @@ class InputLayerTest(test.TestCase):
 
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
-        (1., 2., 3., 4., 5.),  # id 0
-        (6., 7., 8., 9., 10.),  # id 1
-        (11., 12., 13., 14., 15.)  # id 2
+        (1., 2.),  # id 0
+        (6., 7.),  # id 1
+        (11., 12.)  # id 2
     )
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
@@ -2018,8 +2138,8 @@ class InputLayerTest(test.TestCase):
     # embedded_body_style has 5 dims in input_layer.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(country, dimension=5,
-                                           initializer=_initializer)
+    embedded_country = fc.embedding_column(
+        country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
     features = {
@@ -2037,22 +2157,24 @@ class InputLayerTest(test.TestCase):
         indices=((0,), (1,)),
         values=('sedan', 'hardtop'),
         dense_shape=(2,))
+    country_data = np.array([['US'], ['CA']])
 
-    # Dense categorical_column with unknown shape is not allowed.
-    with self.assertRaisesRegexp(ValueError, 'Undefined input_tensor shape.'):
-      fc.input_layer(features, [price, one_hot_body_style, embedded_country])
-
-    net = fc.input_layer(features, [price, one_hot_body_style])
-    self.assertEqual(1 + 3, net.shape[1])
+    net = fc.input_layer(features,
+                         [price, one_hot_body_style, embedded_country])
+    self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
 
       # Each row is formed by concatenating `embedded_body_style`,
       # `one_hot_body_style`, and `price` in order.
       self.assertAllEqual(
-          [[0., 0., 1., 11.], [1., 0., 0., 12.]],
-          sess.run(net, feed_dict={
-              features['price']: price_data,
-              features['body-style']: body_style_data}))
+          [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
+          sess.run(
+              net,
+              feed_dict={
+                  features['price']: price_data,
+                  features['body-style']: body_style_data,
+                  features['country']: country_data
+              }))
 
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
@@ -2188,6 +2310,8 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     self.assertEqual('aaa', column.name)
+    self.assertEqual('aaa', column._var_scope_name)
+    self.assertEqual('aaa', column.key)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
@@ -2236,10 +2360,6 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         lookup_ops.tables_initializer().run()
 
   def test_invalid_vocabulary_size(self):
-    with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
-          vocabulary_size=None)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
@@ -2353,6 +2473,24 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  def test_get_sparse_tensors_none_vocabulary_size(self):
+    column = fc.categorical_column_with_vocabulary_file(
+        key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
+    inputs = sparse_tensor.SparseTensorValue(
+        indices=((0, 0), (1, 0), (1, 1)),
+        values=('marlo', 'skywalker', 'omar'),
+        dense_shape=(2, 2))
+    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    self.assertIsNone(id_weight_pair.weight_tensor)
+    with _initialized_session():
+      _assert_sparse_tensor_value(self,
+                                  sparse_tensor.SparseTensorValue(
+                                      indices=inputs.indices,
+                                      values=np.array(
+                                          (2, -1, 0), dtype=np.int64),
+                                      dense_shape=inputs.dense_shape),
+                                  id_weight_pair.id_tensor.eval())
+
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -2571,6 +2709,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     self.assertEqual('aaa', column.name)
+    self.assertEqual('aaa', column.key)
+    self.assertEqual('aaa', column._var_scope_name)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
@@ -2580,6 +2720,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36))
     self.assertEqual('aaa', column.name)
+    self.assertEqual('aaa', column.key)
+    self.assertEqual('aaa', column._var_scope_name)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
@@ -2933,6 +3075,8 @@ class IdentityCategoricalColumnTest(test.TestCase):
   def test_constructor(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual('aaa', column.name)
+    self.assertEqual('aaa', column.key)
+    self.assertEqual('aaa', column._var_scope_name)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
@@ -3217,11 +3361,15 @@ class IndicatorColumnTest(test.TestCase):
     a = fc.categorical_column_with_hash_bucket('a', 4)
     indicator_a = fc.indicator_column(a)
     self.assertEqual(indicator_a.categorical_column.name, 'a')
+    self.assertEqual(indicator_a.name, 'a_indicator')
+    self.assertEqual(indicator_a._var_scope_name, 'a_indicator')
     self.assertEqual(indicator_a._variable_shape, [1, 4])
 
     b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
     indicator_b = fc.indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
+    self.assertEqual(indicator_b.name, 'b_indicator')
+    self.assertEqual(indicator_b._var_scope_name, 'b_indicator')
     self.assertEqual(indicator_b._variable_shape, [1, 100])
 
   def test_1D_shape_succeeds(self):
@@ -3403,10 +3551,12 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual('mean', embedding_column.combiner)
     self.assertIsNotNone(embedding_column.initializer)
     self.assertIsNone(embedding_column.ckpt_to_load_from)
+    self.assertIsNone(embedding_column.shared_embedding_collection_name)
     self.assertIsNone(embedding_column.tensor_name_in_ckpt)
     self.assertIsNone(embedding_column.max_norm)
     self.assertTrue(embedding_column.trainable)
     self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual('aaa_embedding', embedding_column._var_scope_name)
     self.assertEqual(
         (embedding_dimension,), embedding_column._variable_shape)
     self.assertEqual({
@@ -3426,11 +3576,13 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
     self.assertEqual('my_initializer', embedding_column.initializer())
+    self.assertIsNone(embedding_column.shared_embedding_collection_name)
     self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
     self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
     self.assertEqual(42., embedding_column.max_norm)
     self.assertFalse(embedding_column.trainable)
     self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual('aaa_embedding', embedding_column._var_scope_name)
     self.assertEqual(
         (embedding_dimension,), embedding_column._variable_shape)
     self.assertEqual({
@@ -3456,6 +3608,7 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertEqual(embedding_dimension, embedding_column.dimension)
       self.assertEqual('my_combiner', embedding_column.combiner)
       self.assertEqual('my_initializer', embedding_column.initializer())
+      self.assertIsNone(embedding_column.shared_embedding_collection_name)
       self.assertEqual('my_ckpt', embedding_column.ckpt_to_load_from)
       self.assertEqual('my_ckpt_tensor', embedding_column.tensor_name_in_ckpt)
       self.assertEqual(42., embedding_column.max_norm)
@@ -3979,6 +4132,542 @@ class EmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(expected_lookups, input_layer.eval())
 
 
+class SharedEmbeddingColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = fc_lib._shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+    self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
+    self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column_a.dimension)
+    self.assertEqual(embedding_dimension, embedding_column_b.dimension)
+    self.assertEqual('mean', embedding_column_a.combiner)
+    self.assertEqual('mean', embedding_column_b.combiner)
+    self.assertIsNotNone(embedding_column_a.initializer)
+    self.assertIsNotNone(embedding_column_b.initializer)
+    self.assertIsNone(embedding_column_a.ckpt_to_load_from)
+    self.assertIsNone(embedding_column_b.ckpt_to_load_from)
+    self.assertEqual('aaa_bbb_shared_embedding',
+                     embedding_column_a.shared_embedding_collection_name)
+    self.assertEqual('aaa_bbb_shared_embedding',
+                     embedding_column_b.shared_embedding_collection_name)
+    self.assertIsNone(embedding_column_a.tensor_name_in_ckpt)
+    self.assertIsNone(embedding_column_b.tensor_name_in_ckpt)
+    self.assertIsNone(embedding_column_a.max_norm)
+    self.assertIsNone(embedding_column_b.max_norm)
+    self.assertTrue(embedding_column_a.trainable)
+    self.assertTrue(embedding_column_b.trainable)
+    self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
+    self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
+    self.assertEqual(
+        'aaa_bbb_shared_embedding', embedding_column_a._var_scope_name)
+    self.assertEqual(
+        'aaa_bbb_shared_embedding', embedding_column_b._var_scope_name)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column_a._variable_shape)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column_b._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_a._parse_example_spec)
+    self.assertEqual({
+        'bbb': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_b._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        shared_embedding_collection_name='shared_embedding_collection_name',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
+    self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
+    self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column_a.dimension)
+    self.assertEqual(embedding_dimension, embedding_column_b.dimension)
+    self.assertEqual('my_combiner', embedding_column_a.combiner)
+    self.assertEqual('my_combiner', embedding_column_b.combiner)
+    self.assertEqual('my_initializer', embedding_column_a.initializer())
+    self.assertEqual('my_initializer', embedding_column_b.initializer())
+    self.assertEqual('shared_embedding_collection_name',
+                     embedding_column_a.shared_embedding_collection_name)
+    self.assertEqual('shared_embedding_collection_name',
+                     embedding_column_b.shared_embedding_collection_name)
+    self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
+    self.assertEqual('my_ckpt', embedding_column_b.ckpt_to_load_from)
+    self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
+    self.assertEqual('my_ckpt_tensor', embedding_column_b.tensor_name_in_ckpt)
+    self.assertEqual(42., embedding_column_a.max_norm)
+    self.assertEqual(42., embedding_column_b.max_norm)
+    self.assertFalse(embedding_column_a.trainable)
+    self.assertFalse(embedding_column_b.trainable)
+    self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
+    self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
+    self.assertEqual(
+        'shared_embedding_collection_name', embedding_column_a._var_scope_name)
+    self.assertEqual(
+        'shared_embedding_collection_name', embedding_column_b._var_scope_name)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column_a._variable_shape)
+    self.assertEqual(
+        (embedding_dimension,), embedding_column_b._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_a._parse_example_spec)
+    self.assertEqual({
+        'bbb': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_b._parse_example_spec)
+
+  def test_deep_copy(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    original_a, _ = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        shared_embedding_collection_name='shared_embedding_collection_name',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42., trainable=False)
+    for embedding_column_a in (original_a, copy.deepcopy(original_a)):
+      self.assertEqual('aaa', embedding_column_a.categorical_column.name)
+      self.assertEqual(3, embedding_column_a.categorical_column._num_buckets)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, embedding_column_a.categorical_column._parse_example_spec)
+
+      self.assertEqual(embedding_dimension, embedding_column_a.dimension)
+      self.assertEqual('my_combiner', embedding_column_a.combiner)
+      self.assertEqual('my_initializer', embedding_column_a.initializer())
+      self.assertEqual('shared_embedding_collection_name',
+                       embedding_column_a.shared_embedding_collection_name)
+      self.assertEqual('my_ckpt', embedding_column_a.ckpt_to_load_from)
+      self.assertEqual('my_ckpt_tensor', embedding_column_a.tensor_name_in_ckpt)
+      self.assertEqual(42., embedding_column_a.max_norm)
+      self.assertFalse(embedding_column_a.trainable)
+      self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
+      self.assertEqual(
+          (embedding_dimension,), embedding_column_a._variable_shape)
+      self.assertEqual({
+          'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+      }, embedding_column_a._parse_example_spec)
+
+  def test_invalid_initializer(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
+      fc_lib._shared_embedding_columns(
+          [categorical_column_a, categorical_column_b], dimension=2,
+          initializer='not_fn')
+
+  def test_incompatible_column_type(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    categorical_column_c = fc.categorical_column_with_hash_bucket(
+        key='ccc', hash_bucket_size=3)
+    with self.assertRaisesRegexp(
+        ValueError,
+        'all categorical_columns must have the same type.*'
+        '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
+      fc_lib._shared_embedding_columns(
+          [categorical_column_a, categorical_column_b, categorical_column_c],
+          dimension=2)
+
+  def test_weighted_categorical_column_ok(self):
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    weighted_categorical_column_a = fc.weighted_categorical_column(
+        categorical_column_a, weight_feature_key='aaa_weights')
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    weighted_categorical_column_b = fc.weighted_categorical_column(
+        categorical_column_b, weight_feature_key='bbb_weights')
+    fc_lib._shared_embedding_columns(
+        [weighted_categorical_column_a, categorical_column_b], dimension=2)
+    fc_lib._shared_embedding_columns(
+        [categorical_column_a, weighted_categorical_column_b], dimension=2)
+    fc_lib._shared_embedding_columns(
+        [weighted_categorical_column_a, weighted_categorical_column_b],
+        dimension=2)
+
+  def test_parse_example(self):
+    a = fc.categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
+    b = fc.categorical_column_with_vocabulary_list(
+        key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
+    a_embedded, b_embedded = fc_lib._shared_embedding_columns(
+        [a, b], dimension=2)
+    data = example_pb2.Example(features=feature_pb2.Features(
+        feature={
+            'aaa':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'omar', b'stringer'])),
+            'bbb':
+                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                    value=[b'stringer', b'marlo'])),
+        }))
+    features = parsing_ops.parse_example(
+        serialized=[data.SerializeToString()],
+        features=fc.make_parse_example_spec([a_embedded, b_embedded]))
+    self.assertIn('aaa', features)
+    self.assertIn('bbb', features)
+    with self.test_session():
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'omar', b'stringer'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['aaa'].eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=[[0, 0], [0, 1]],
+              values=np.array([b'stringer', b'marlo'], dtype=np.object_),
+              dense_shape=[1, 2]),
+          features['bbb'].eval())
+
+  def test_transform_feature(self):
+    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
+    a_embedded, b_embedded = fc_lib._shared_embedding_columns(
+        [a, b], dimension=2)
+    features = {
+        'aaa': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(0, 1, 0),
+            dense_shape=(2, 2)),
+        'bbb': sparse_tensor.SparseTensor(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=(1, 2, 1),
+            dense_shape=(2, 2)),
+    }
+    outputs = _transform_features(features, [a, a_embedded, b, b_embedded])
+    output_a = outputs[a]
+    output_a_embedded = outputs[a_embedded]
+    output_b = outputs[b]
+    output_b_embedded = outputs[b_embedded]
+    with _initialized_session():
+      _assert_sparse_tensor_value(
+          self, output_a.eval(), output_a_embedded.eval())
+      _assert_sparse_tensor_value(
+          self, output_b.eval(), output_b_embedded.eval())
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    input_features = {
+        'aaa': input_a,
+        'bbb': input_b
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
+  def test_get_dense_tensor_placeholder_inputs(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+    # Specify shape, because dense input must have rank specified.
+    input_a_placeholder = array_ops.placeholder(
+        dtype=dtypes.int64, shape=[None, 3])
+    input_b_placeholder = array_ops.placeholder(
+        dtype=dtypes.int64, shape=[None, 3])
+    input_features = {
+        'aaa': input_a_placeholder,
+        'bbb': input_b_placeholder,
+    }
+    feed_dict = {
+        input_a_placeholder: input_a,
+        input_b_placeholder: input_b,
+    }
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features))
+
+    with _initialized_session() as sess:
+      sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array(
+        [[2, -1, -1],  # example 0, ids [2]
+         [0, 1, -1]])  # example 1, ids [0, 1]
+    input_b = np.array(
+        [[0, -1, -1],  # example 0, ids [0]
+         [-1, -1, -1]])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights name should follow the column name.
+      # TODO(roumposg): Fix that.
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
+  def _test_input_layer(self, trainable=True):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input_a = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        indices=((0, 0), (1, 0), (1, 4)),
+        values=(2, 0, 1),
+        dense_shape=(2, 5))
+    sparse_input_b = sparse_tensor.SparseTensorValue(
+        # example 0, ids [0]
+        # example 1, ids []
+        indices=((0, 0),),
+        values=(0,),
+        dense_shape=(2, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0:
+        # A ids [2], embedding = [7, 11]
+        # B ids [0], embedding = [1, 2]
+        (7., 11., 1., 2.),
+        # example 1:
+        # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # B ids [], embedding = [0, 0]
+        (2., 3.5, 0., 0.),
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc_lib._shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension, initializer=_initializer,
+        trainable=trainable)
+
+    # Provide sparse input and get dense result.
+    input_layer = fc.input_layer(
+        features={'aaa': sparse_input_a, 'bbb': sparse_input_b},
+        feature_columns=(embedding_column_b, embedding_column_a))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+        tuple([v.name for v in global_vars]))
+    trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+    if trainable:
+      self.assertItemsEqual(
+          ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+          tuple([v.name for v in trainable_vars]))
+    else:
+      self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
+    shared_embedding_vars = ops.get_collection('aaa_bbb_shared_embedding')
+    self.assertItemsEqual(
+        ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
+        tuple([v.name for v in shared_embedding_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
+      self.assertAllEqual(expected_lookups, input_layer.eval())
+
+  def test_input_layer(self):
+    self._test_input_layer()
+
+  def test_input_layer_no_trainable(self):
+    self._test_input_layer(trainable=False)
+
+
 class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_defaults(self):
@@ -3987,6 +4676,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     self.assertEqual('ids_weighted_by_values', column.name)
+    self.assertEqual('ids_weighted_by_values', column._var_scope_name)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'ids': parsing_ops.VarLenFeature(dtypes.int64),
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index ddababd5b8f0697594d86348875daada2a9896ba..6c522de452b59ea9a200ccf89cfb428a26970db1 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.util import compat
 from tensorflow.python.util import tf_contextlib
 
 
@@ -49,23 +50,101 @@ class ScopedTFGraph(object):
       c_api.TF_DeleteGraph(self.graph)
 
 
+class ScopedTFImportGraphDefOptions(object):
+  """Wrapper around TF_ImportGraphDefOptions that handles deletion."""
+
+  def __init__(self):
+    self.options = c_api.TF_NewImportGraphDefOptions()
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api.TF_DeleteImportGraphDefOptions is not None:
+      c_api.TF_DeleteImportGraphDefOptions(self.options)
+
+
 @tf_contextlib.contextmanager
-def tf_buffer():
+def tf_buffer(data=None):
   """Context manager that creates and deletes TF_Buffer.
 
   Example usage:
-    wtih tf_buffer() as buf:
+    with tf_buffer() as buf:
       # get serialized graph def into buf
       ...
       proto_data = c_api.TF_GetBuffer(buf)
       graph_def.ParseFromString(compat.as_bytes(proto_data))
     # buf has been deleted
 
+    with tf_buffer(some_string) as buf:
+      c_api.TF_SomeFunction(buf)
+    # buf has been deleted
+
+  Args:
+    data: An optional `bytes`, `str`, or `unicode` object. If not None, the
+      yielded buffer will contain this data.
+
   Yields:
     Created TF_Buffer
   """
-  buf = c_api.TF_NewBuffer()
+  if data:
+    buf = c_api.TF_NewBufferFromString(compat.as_bytes(data))
+  else:
+    buf = c_api.TF_NewBuffer()
   try:
     yield buf
   finally:
     c_api.TF_DeleteBuffer(buf)
+
+
+def tf_output(c_op, index):
+  """Returns a wrapped TF_Output with specified operation and index.
+
+  Args:
+    c_op: wrapped TF_Operation
+    index: integer
+
+  Returns:
+    Wrapped TF_Output
+  """
+  ret = c_api.TF_Output()
+  ret.oper = c_op
+  ret.index = index
+  return ret
+
+
+def tf_operations(graph):
+  """Generator that yields every TF_Operation in `graph`.
+
+  Args:
+    graph: Graph
+
+  Yields:
+    wrapped TF_Operation
+  """
+  # pylint: disable=protected-access
+  pos = 0
+  c_op, pos = c_api.TF_GraphNextOperation(graph._c_graph, pos)
+  while c_op is not None:
+    yield c_op
+    c_op, pos = c_api.TF_GraphNextOperation(graph._c_graph, pos)
+  # pylint: enable=protected-access
+
+
+def new_tf_operations(graph):
+  """Generator that yields newly-added TF_Operations in `graph`.
+
+  Specifically, yields TF_Operations that don't have associated Operations in
+  `graph`. This is useful for processing nodes added by the C API.
+
+  Args:
+    graph: Graph
+
+  Yields:
+    wrapped TF_Operation
+  """
+  # TODO(b/69679162): do this more efficiently
+  for c_op in tf_operations(graph):
+    try:
+      graph._get_operation_by_tf_operation(c_op)  # pylint: disable=protected-access
+    except KeyError:
+      yield c_op
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index d51e142da1950d48eaa38ebc2366da6912cb19e7..bf3be34d85120f3d873367aa55948d27d34977cf 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -55,10 +55,10 @@ from tensorflow.python.framework import tensor_util
 
 def _eager_reshape(tensor, shape, ctx):
   """Eager-only version of Reshape op; requires tensor is an eager Tensor."""
-  attr_t = tensor.dtype.as_datatype_enum
+  attr_t = tensor._datatype_enum()  # pylint: disable=protected-access
   attr_tshape, (shape,) = execute.args_to_matching_eager(
       [shape], ctx, dtypes.int32)
-  attr_tshape = attr_tshape.as_datatype_enum
+  attr_tshape = attr_tshape
   inputs_flat = [tensor, shape]
   attrs = ("T", attr_t, "Tshape", attr_tshape)
   result, = execute.execute(
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index db124ab12acdfb9724f9800f5be36b9f1d45f323..b0422eb6be091a3fcf4b213f04a2e13a3ae8a963 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -18,9 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+
 import numpy as np
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python import pywrap_tensorflow
+
+
+_np_bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
 
 
 class DType(object):
@@ -146,8 +151,9 @@ class DType(object):
   @property
   def is_floating(self):
     """Returns whether this is a (non-quantized, real) floating point type."""
-    return self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
-                                                      np.floating)
+    return ((self.is_numpy_compatible and np.issubdtype(self.as_numpy_dtype,
+                                                        np.floating))
+            or self.base_dtype == bfloat16)
 
   @property
   def is_complex(self):
@@ -157,7 +163,7 @@ class DType(object):
   @property
   def is_quantized(self):
     """Returns whether this is a quantized data type."""
-    return self.base_dtype in [qint8, quint8, qint16, quint16, qint32, bfloat16]
+    return self.base_dtype in [qint8, quint8, qint16, quint16, qint32]
 
   @property
   def is_unsigned(self):
@@ -194,6 +200,8 @@ class DType(object):
       try:
         return np.iinfo(self.as_numpy_dtype()).min
       except:
+        if self.base_dtype == bfloat16:
+          return _np_bfloat16(float.fromhex("-0x1.FEp127"))
         raise TypeError("Cannot find minimum value of %s." % self)
 
   @property
@@ -216,6 +224,8 @@ class DType(object):
       try:
         return np.iinfo(self.as_numpy_dtype()).max
       except:
+        if self.base_dtype == bfloat16:
+          return _np_bfloat16(float.fromhex("0x1.FEp127"))
         raise TypeError("Cannot find maximum value of %s." % self)
 
   @property
@@ -486,6 +496,8 @@ _np_qint16 = np.dtype([("qint16", np.int16, 1)])
 _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
 _np_qint32 = np.dtype([("qint32", np.int32, 1)])
 
+# _np_bfloat16 is defined by a module import.
+
 # Custom struct dtype for directly-fed ResourceHandles of supported type(s).
 np_resource = np.dtype([("resource", np.ubyte, 1)])
 
@@ -511,7 +523,7 @@ _NP_TO_TF = frozenset([
     (_np_qint16, qint16),
     (_np_quint16, quint16),
     (_np_qint32, qint32),
-    # NOTE(touts): Intentionally no way to feed a DT_BFLOAT16.
+    (_np_bfloat16, bfloat16),
 ])
 _TF_TO_NP = {
     types_pb2.DT_HALF: np.float16,
@@ -536,7 +548,7 @@ _TF_TO_NP = {
     types_pb2.DT_QINT16: _np_qint16,
     types_pb2.DT_QUINT16: _np_quint16,
     types_pb2.DT_QINT32: _np_qint32,
-    types_pb2.DT_BFLOAT16: np.uint16,
+    types_pb2.DT_BFLOAT16: _np_bfloat16,
 
     # Ref types
     types_pb2.DT_HALF_REF: np.float16,
@@ -559,7 +571,7 @@ _TF_TO_NP = {
     types_pb2.DT_QINT16_REF: _np_qint16,
     types_pb2.DT_QUINT16_REF: _np_quint16,
     types_pb2.DT_QINT32_REF: _np_qint32,
-    types_pb2.DT_BFLOAT16_REF: np.uint16,
+    types_pb2.DT_BFLOAT16_REF: _np_bfloat16,
 }
 
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 67842e14b1077fdf69aa3405f4f43fc92e499b4d..e49e2fda5d84da4f8f87fae73874351afe0a20f2 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -176,7 +176,7 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.as_dtype("float64").is_floating, True)
     self.assertEqual(dtypes.as_dtype("string").is_floating, False)
     self.assertEqual(dtypes.as_dtype("bool").is_floating, False)
-    self.assertEqual(dtypes.as_dtype("bfloat16").is_integer, False)
+    self.assertEqual(dtypes.as_dtype("bfloat16").is_floating, True)
     self.assertEqual(dtypes.as_dtype("qint8").is_floating, False)
     self.assertEqual(dtypes.as_dtype("qint16").is_floating, False)
     self.assertEqual(dtypes.as_dtype("qint32").is_floating, False)
@@ -276,6 +276,9 @@ class TypesTest(test_util.TensorFlowTestCase):
       if numpy_dtype in (np.float16, np.float32, np.float64):
         self.assertEquals(dtype.min, np.finfo(numpy_dtype).min)
         self.assertEquals(dtype.max, np.finfo(numpy_dtype).max)
+      if numpy_dtype == dtypes.bfloat16.as_numpy_dtype:
+        self.assertEquals(dtype.min, float.fromhex("-0x1.FEp127"))
+        self.assertEquals(dtype.max, float.fromhex("0x1.FEp127"))
 
   def testRepr(self):
     for enum, name in dtypes._TYPE_TO_STRING.items():
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index cef3f8d4c42e96b24986f5363f161a92ea41cf82..366025a0d8da71fd81b0d7cb878afeee4dcc414d 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -100,7 +100,7 @@ class Defun(object):
          grad_func - (optional).  A function implementing the gradient
            of the function-to-register.  This is must be a
            `_DefinedFunction` object. The gradient
-           function must satisify the criterion defined in
+           function must satisfy the criterion defined in
            function.proto:GradientDef.
 
          python_grad_func - (optional).  A function implementing the
@@ -692,7 +692,10 @@ class _FuncGraph(ops.Graph):
         else:
           # Substitute with a placeholder.
           self.extra_inputs.append(x)
-          ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
+          # Hoist the new input placeholder out of any control flow context
+          # we're currently in.
+          with ops.control_dependencies(None):
+            ph = array_ops.placeholder(x.dtype, shape=x.get_shape())
           # pylint: disable=protected-access
           ph._handle_data = x._handle_data
           # pylint: enable=protected-access
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 36b0737cfca181a1d2c2fe6df2460312ed25dfa5..8a7bf7a021e7776c5764d8d4097575cf7cf5b775 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -370,7 +370,7 @@ class FunctionTest(test.TestCase):
 
     @function.Defun(dtypes.float32)
     def Foo(x):
-      y = logging_ops.Print(x, [x], "Hello")
+      y = logging_ops.Print(x, [], "Hello")
       with ops.control_dependencies([y]):
         z = control_flow_ops.no_op()
       with ops.control_dependencies([z]):
@@ -724,6 +724,38 @@ class FunctionTest(test.TestCase):
         # NOTE: We still do not support capturing control deps.
         _ = Foo(x)
 
+  def testCaptureInWhileLoop(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1)
+
+      @function.Defun()
+      def Foo():
+        return control_flow_ops.while_loop(lambda i: i < 10,
+                                           lambda i: i + x,
+                                           [0])
+      y = Foo()
+
+    with self.test_session(graph=g) as sess:
+      self.assertEqual(sess.run(y), 10)
+
+  def testCaptureInCond(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant(1)
+
+      @function.Defun(dtypes.bool)
+      def Foo(pred):
+        return control_flow_ops.cond(pred,
+                                     lambda: x,
+                                     lambda: x + 1)
+      y = Foo(True)
+      z = Foo(False)
+
+    with self.test_session(graph=g) as sess:
+      self.assertEqual(sess.run(y), 1)
+      self.assertEqual(sess.run(z), 2)
+
   def testStableName(self):
 
     @function.Defun()
@@ -882,6 +914,48 @@ class FunctionTest(test.TestCase):
           np.array([1.0, 0.0]).astype(np.float32),
           sess.run(dinp, {inp: x}))
 
+  def testStatefulFunction(self):
+
+    @function.Defun()
+    def FunctionWithStatelessOp():
+      return constant_op.constant(42.0)
+
+    @function.Defun()
+    def FunctionWithStatefulOp():
+      return random_ops.random_uniform([100], maxval=10, dtype=dtypes.int32)
+
+    @function.Defun()
+    def FunctionWithStatelessFunctionCall():
+      return FunctionWithStatelessOp()
+
+    @function.Defun()
+    def FunctionWithStatefulFunctionCall():
+      return FunctionWithStatefulOp()
+
+    # Test that the `is_stateful` bit is propagated.
+    self.assertFalse(FunctionWithStatelessOp.definition.signature.is_stateful)
+    self.assertTrue(FunctionWithStatefulOp.definition.signature.is_stateful)
+    self.assertFalse(
+        FunctionWithStatelessFunctionCall.definition.signature.is_stateful)
+    self.assertTrue(
+        FunctionWithStatefulFunctionCall.definition.signature.is_stateful)
+
+    # Ensure that two invocations of the same random-number-generating
+    # function produce different results.
+    result1 = FunctionWithStatefulFunctionCall()
+    result2 = FunctionWithStatefulFunctionCall()
+
+    # Statefulness affects how the function is treated by the various
+    # optimization passes, so run the test in each optimizer
+    # configuration.
+    for config in _OptimizerOptions():
+      with session.Session(config=config) as sess:
+        val1, val2 = sess.run((result1, result2))
+        self.assertFalse(all(val1 == val2))
+        val3, val4 = sess.run((result1, result2))
+        self.assertFalse(all(val3 == val1))
+        self.assertFalse(all(val4 == val2))
+
 
 @test_util.with_c_api
 class FunctionsFromProtos(test.TestCase):
diff --git a/tensorflow/python/framework/graph_to_function_def.py b/tensorflow/python/framework/graph_to_function_def.py
index 448f87aa6ee31127113ed10aee8e4e0fa06482f1..625f31146be89f09481b634127484d15f0631fc6 100644
--- a/tensorflow/python/framework/graph_to_function_def.py
+++ b/tensorflow/python/framework/graph_to_function_def.py
@@ -110,6 +110,13 @@ def _add_op_node(op, func, input_dict):
                                                (node_def.input[i],
                                                 input_dict.items()))
       node_def.input[i] = input_dict[node_def.input[i]]
+  # The function is stateful if any of its operations are stateful.
+  # NOTE(mrry): The "Const" node typically does not have an `OpDef` associated
+  # with it, so we assume any nodes without an `OpDef` are stateless.
+  # TODO(skyewm): Remove the `is not None` test after we transition to the C
+  # API.
+  if op.op_def is not None and op.op_def.is_stateful:
+    func.signature.is_stateful = True
 
 
 def graph_to_function_def(graph, operations, inputs, outputs, out_names=None):
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index ce85747d7cf7fd3ee1803efe0260a90ef8e4b743..6c7b4553881637ce0b2ec63449bde0a397ef2d72 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Helpers to manipulate a tensor graph in python.
 """
 
@@ -108,6 +107,46 @@ def _node_name(n):
     return n.split(":")[0]
 
 
+def _extract_graph_summary(graph_def):
+  """Extracts useful information from the graph and returns them."""
+  name_to_input_name = {}  # Keyed by the dest node name.
+  name_to_node = {}  # Keyed by node name.
+
+  # Keeps track of node sequences. It is important to still output the
+  # operations in the original order.
+  name_to_seq_num = {}  # Keyed by node name.
+  seq = 0
+  for node in graph_def.node:
+    n = _node_name(node.name)
+    name_to_node[n] = node
+    name_to_input_name[n] = [_node_name(x) for x in node.input]
+    name_to_seq_num[n] = seq
+    seq += 1
+  return name_to_input_name, name_to_node, name_to_seq_num
+
+
+def _assert_nodes_are_present(name_to_node, nodes):
+  """Assert that nodes are present in the graph."""
+  for d in nodes:
+    assert d in name_to_node, "%s is not in graph" % d
+
+
+def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
+  """Breadth first search for reachable nodes from target nodes."""
+  nodes_to_keep = set()
+  # Breadth first search to find all the nodes that we should keep.
+  next_to_visit = target_nodes[:]
+  while next_to_visit:
+    n = next_to_visit[0]
+    del next_to_visit[0]
+    if n in nodes_to_keep:
+      # Already visited this node.
+      continue
+    nodes_to_keep.add(n)
+    next_to_visit += name_to_input_name[n]
+  return nodes_to_keep
+
+
 def extract_sub_graph(graph_def, dest_nodes):
   """Extract the subgraph that can reach any of the nodes in 'dest_nodes'.
 
@@ -127,40 +166,18 @@ def extract_sub_graph(graph_def, dest_nodes):
   if isinstance(dest_nodes, six.string_types):
     raise TypeError("dest_nodes must be a list.")
 
-  edges = {}  # Keyed by the dest node name.
-  name_to_node_map = {}  # Keyed by node name.
-
-  # Keeps track of node sequences. It is important to still output the
-  # operations in the original order.
-  node_seq = {}  # Keyed by node name.
-  seq = 0
-  for node in graph_def.node:
-    n = _node_name(node.name)
-    name_to_node_map[n] = node
-    edges[n] = [_node_name(x) for x in node.input]
-    node_seq[n] = seq
-    seq += 1
-
-  for d in dest_nodes:
-    assert d in name_to_node_map, "%s is not in graph" % d
+  name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
+      graph_def)
+  _assert_nodes_are_present(name_to_node, dest_nodes)
 
-  nodes_to_keep = set()
-  # Breadth first search to find all the nodes that we should keep.
-  next_to_visit = dest_nodes[:]
-  while next_to_visit:
-    n = next_to_visit[0]
-    del next_to_visit[0]
-    if n in nodes_to_keep:
-      # Already visited this node.
-      continue
-    nodes_to_keep.add(n)
-    next_to_visit += edges[n]
+  nodes_to_keep = _bfs_for_reachable_nodes(dest_nodes, name_to_input_name)
 
-  nodes_to_keep_list = sorted(list(nodes_to_keep), key=lambda n: node_seq[n])
+  nodes_to_keep_list = sorted(
+      list(nodes_to_keep), key=lambda n: name_to_seq_num[n])
   # Now construct the output GraphDef
   out = graph_pb2.GraphDef()
   for n in nodes_to_keep_list:
-    out.node.extend([copy.deepcopy(name_to_node_map[n])])
+    out.node.extend([copy.deepcopy(name_to_node[n])])
   out.library.CopyFrom(graph_def.library)
   out.versions.CopyFrom(graph_def.versions)
 
@@ -181,7 +198,9 @@ def tensor_shape_from_node_def_name(graph, input_name):
   return shape
 
 
-def convert_variables_to_constants(sess, input_graph_def, output_node_names,
+def convert_variables_to_constants(sess,
+                                   input_graph_def,
+                                   output_node_names,
                                    variable_names_whitelist=None,
                                    variable_names_blacklist=None):
   """Replaces all the variables in a graph with constants of the same values.
@@ -237,10 +256,10 @@ def convert_variables_to_constants(sess, input_graph_def, output_node_names,
       dtype = input_node.attr["dtype"]
       data = found_variables[input_node.name]
       output_node.attr["dtype"].CopyFrom(dtype)
-      output_node.attr["value"].CopyFrom(attr_value_pb2.AttrValue(
-          tensor=tensor_util.make_tensor_proto(data,
-                                               dtype=dtype.type,
-                                               shape=data.shape)))
+      output_node.attr["value"].CopyFrom(
+          attr_value_pb2.AttrValue(
+              tensor=tensor_util.make_tensor_proto(
+                  data, dtype=dtype.type, shape=data.shape)))
       how_many_converted += 1
     else:
       output_node.CopyFrom(input_node)
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c0d221ddfeb3a5213e507733815c7ea9bc47901c..62765aff00e05723d660491d8f40c933f57bc340 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -25,8 +25,11 @@ import copy
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import types_pb2
+from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
@@ -147,6 +150,42 @@ def _MaybeDevice(device):
     yield
 
 
+def _ProcessGraphDefParam(graph_def):
+  """Type-checks and possibly canonicalizes `graph_def`."""
+  if not isinstance(graph_def, graph_pb2.GraphDef):
+    # `graph_def` could be a dynamically-created message, so try a duck-typed
+    # approach
+    try:
+      old_graph_def = graph_def
+      graph_def = graph_pb2.GraphDef()
+      graph_def.MergeFrom(old_graph_def)
+    except TypeError:
+      raise TypeError('graph_def must be a GraphDef proto.')
+  return graph_def
+
+
+def _ProcessInputMapParam(input_map):
+  """Type-checks and possibly canonicalizes `input_map`."""
+  if input_map is None:
+    input_map = {}
+  else:
+    if not (isinstance(input_map, dict)
+            and all(isinstance(k, compat.bytes_or_text_types)
+                    for k in input_map.keys())):
+      raise TypeError('input_map must be a dictionary mapping strings to '
+                      'Tensor objects.')
+  return input_map
+
+
+def _ProcessReturnElementsParam(return_elements):
+  """Type-checks and possibly canonicalizes `return_elements`."""
+  if return_elements is None: return None
+  if not all(isinstance(x, compat.bytes_or_text_types)
+             for x in return_elements):
+    raise TypeError('return_elements must be a list of strings.')
+  return tuple(compat.as_str(x) for x in return_elements)
+
+
 def _FindAttrInOpDef(attr_name, op_def):
   for attr_def in op_def.attr:
     if attr_name == attr_def.name:
@@ -154,6 +193,188 @@ def _FindAttrInOpDef(attr_name, op_def):
   return None
 
 
+def _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def):
+  """Removes unknown default attrs according to `producer_op_list`.
+
+  Removes any unknown attrs in `graph_def` (i.e. attrs that do not appear in
+  the OpDefs in `op_dict`) that have a default value in `producer_op_list`.
+
+  Args:
+    op_dict: dict mapping operation name to OpDef.
+    producer_op_list: OpList proto.
+    graph_def: GraphDef proto
+  """
+  producer_op_dict = {op.name: op for op in producer_op_list.op}
+  for node in graph_def.node:
+    # Remove any default attr values that aren't in op_def.
+    if node.op in producer_op_dict:
+      op_def = op_dict[node.op]
+      producer_op_def = producer_op_dict[node.op]
+      # We make a copy of node.attr to iterate through since we may modify
+      # node.attr inside the loop.
+      for key in list(node.attr):
+        if _FindAttrInOpDef(key, op_def) is None:
+          # No attr_def in consumer, look in producer.
+          attr_def = _FindAttrInOpDef(key, producer_op_def)
+          if (attr_def and attr_def.HasField('default_value') and
+              node.attr[key] == attr_def.default_value):
+            # Unknown attr had default value in producer, delete it so it can be
+            # understood by consumer.
+            del node.attr[key]
+
+
+def _ConvertInputMapValues(name, input_map):
+  """Ensures all input map values are tensors.
+
+  This should be called from inside the import name scope.
+
+  Args:
+    name: the `name` argument passed to import_graph_def
+    input_map: the `input_map` argument passed to import_graph_def.
+
+  Returns:
+    An possibly-updated version of `input_map`.
+
+  Raises:
+    ValueError: if input map values cannot be converted due to empty name scope.
+  """
+  if not all(isinstance(v, ops.Tensor) for v in input_map.values()):
+    if name == '':  # pylint: disable=g-explicit-bool-comparison
+      raise ValueError(
+          'tf.import_graph_def() requires a non-empty `name` if `input_map` '
+          'contains non-Tensor values. Try calling tf.convert_to_tensor() on '
+          '`input_map` values before calling tf.import_graph_def().')
+    with ops.name_scope('_inputs'):
+      input_map = {k: ops.convert_to_tensor(v) for k, v in input_map.items()}
+  return input_map
+
+
+def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                     return_elements):
+  """Populates the TF_ImportGraphDefOptions `options`."""
+  c_api.TF_ImportGraphDefOptionsSetPrefix(options, prefix)
+  c_api.TF_ImportGraphDefOptionsSetUniquifyNames(options, True)
+  c_api.TF_ImportGraphDefOptionsSetUniquifyPrefix(options, True)
+
+  for input_src, input_dst in input_map.items():
+    input_src = compat.as_str(input_src)
+    if input_src.startswith('^'):
+      src_name = compat.as_bytes(input_src[1:])
+      dst_op = input_dst._as_tf_output().oper  # pylint: disable=protected-access
+      c_api.TF_ImportGraphDefOptionsRemapControlDependency(options, src_name,
+                                                           dst_op)
+    else:
+      src_name, src_idx = _ParseTensorName(input_src)
+      src_name = compat.as_str(src_name)
+      dst_output = input_dst._as_tf_output()  # pylint: disable=protected-access
+      c_api.TF_ImportGraphDefOptionsAddInputMapping(options, src_name,
+                                                    src_idx, dst_output)
+  for name in return_elements or []:
+    if ':' in name:
+      op_name, index = _ParseTensorName(name)
+      op_name = compat.as_str(op_name)
+      c_api.TF_ImportGraphDefOptionsAddReturnOutput(options, op_name, index)
+    else:
+      c_api.TF_ImportGraphDefOptionsAddReturnOperation(options,
+                                                       compat.as_str(name))
+
+  # TODO(skyewm): control dependencies
+
+
+def _ProcessNewOps(graph):
+  """Processes the newly-added TF_Operations in `graph`."""
+  # Maps from a node to the names of the ops it's colocated with, if colocation
+  # is specified in the attributes.
+  colocation_pairs = {}
+
+  for c_op in c_api_util.new_tf_operations(graph):
+    # pylint: disable=protected-access
+    new_op = graph._create_op_from_tf_operation(c_op, compute_device=False)
+    # pylint: enable=protected-access
+
+    colocation_names = _GetColocationNames(new_op)
+    if colocation_names:
+      colocation_pairs[new_op] = colocation_names
+      # Don't apply this op's device function, since colocation constraints
+      # override device functions. Note that this op's device may still be set
+      # by the loop below.
+    else:
+      with _MaybeDevice(new_op.device):
+        graph._apply_device_functions(new_op)  # pylint: disable=protected-access
+
+  # The following loop populates the device field of ops that are colocated
+  # with another op.  This is implied by the colocation attribute, but we
+  # propagate the device field for completeness.
+  for op, coloc_op_list in colocation_pairs.items():
+    coloc_device = None
+    # Find any device in the list of colocated ops that have a device, if it
+    # exists.  We assume that if multiple ops have devices, they refer to the
+    # same device.  Otherwise, a runtime error will occur since the colocation
+    # property cannot be guaranteed.
+    #
+    # One possible improvement is to try to check for compatibility of all
+    # devices in this list at import time here, which would require
+    # implementing a compatibility function for device specs in python.
+    for coloc_op_name in coloc_op_list:
+      try:
+        coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
+      except KeyError:
+        raise ValueError('Specified colocation to an op that '
+                         'does not exist during import: %s in %s' % (
+                             coloc_op_name, op.name))
+      if coloc_op.device:
+        coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
+        break
+    if coloc_device:
+      op._set_device(coloc_device)  # pylint: disable=protected-access
+
+
+def _GetColocationNames(op):
+  """Returns names of the ops that `op` should be colocated with."""
+  colocation_names = []
+  try:
+    class_values = op.get_attr('_class')
+  except ValueError:
+    # No _class attr
+    return
+  for val in class_values:
+    val = compat.as_str(val)
+    if val.startswith('loc:@'):
+      colocation_node_name = val[len('loc:@'):]
+      if colocation_node_name != op.name:
+        colocation_names.append(colocation_node_name)
+  return colocation_names
+
+
+def _GatherReturnElements(requested_return_elements, graph, results):
+  """Returns the requested return elements from results.
+
+  Args:
+    requested_return_elements: list of strings of operation and tensor names
+    graph: Graph
+    results: wrapped TF_ImportGraphDefResults
+
+  Returns:
+    list of `Operation` and/or `Tensor` objects
+  """
+  return_outputs = c_api.TF_ImportGraphDefResultsReturnOutputs(results)
+  return_opers = c_api.TF_ImportGraphDefResultsReturnOperations(results)
+
+  combined_return_elements = []
+  outputs_idx = 0
+  opers_idx = 0
+  for name in requested_return_elements:
+    if ':' in name:
+      combined_return_elements.append(
+          graph._get_tensor_by_tf_output(return_outputs[outputs_idx]))  # pylint: disable=protected-access
+      outputs_idx += 1
+    else:
+      combined_return_elements.append(
+          graph._get_operation_by_tf_operation(return_opers[opers_idx]))  # pylint: disable=protected-access
+      opers_idx += 1
+  return combined_return_elements
+
+
 @deprecated_args(None, 'Please file an issue at '
                  'https://github.com/tensorflow/tensorflow/issues if you depend'
                  ' on this feature.',
@@ -201,335 +422,345 @@ def import_graph_def(graph_def, input_map=None, return_elements=None,
       do not appear in `graph_def`, or `graph_def` is not well-formed (e.g.
       it refers to an unknown tensor).
   """
-  # Type checks for inputs.
-  if not isinstance(graph_def, graph_pb2.GraphDef):
-    # `graph_def` could be a dynamically-created message, so try a duck-typed
-    # approach
-    try:
-      old_graph_def = graph_def
-      graph_def = graph_pb2.GraphDef()
-      graph_def.MergeFrom(old_graph_def)
-    except TypeError:
-      raise TypeError('graph_def must be a GraphDef proto.')
-  if input_map is None:
-    input_map = {}
-  else:
-    if not (isinstance(input_map, dict)
-            and all(isinstance(k, compat.bytes_or_text_types)
-                    for k in input_map.keys())):
-      raise TypeError('input_map must be a dictionary mapping strings to '
-                      'Tensor objects.')
-  if return_elements is not None:
-    return_elements = tuple(return_elements)
-    if not all(isinstance(x, compat.bytes_or_text_types)
-               for x in return_elements):
-      raise TypeError('return_elements must be a list of strings.')
+  graph_def = _ProcessGraphDefParam(graph_def)
+  input_map = _ProcessInputMapParam(input_map)
+  return_elements = _ProcessReturnElementsParam(return_elements)
 
-  # Use a canonical representation for all tensor names.
-  input_map = {_CanonicalInputName(k): v for k, v in input_map.items()}
-  used_input_keys = set()
+  op_dict = op_def_registry.get_registered_ops()
 
-  name_to_op = {}
+  if producer_op_list is not None:
+    # TODO(skyewm): make a copy of graph_def so we're not mutating the argument?
+    _RemoveDefaultAttrs(op_dict, producer_op_list, graph_def)
 
-  op_dict = op_def_registry.get_registered_ops()
+  graph = ops.get_default_graph()
 
-  if producer_op_list is None:
-    producer_op_dict = None
-  else:
-    producer_op_dict = {op.name: op for op in producer_op_list.op}
+  if graph._c_graph:  # pylint: disable=protected-access
+    with ops.name_scope(name, 'import', input_map.values()) as scope:
+      # Save unique prefix generated by name_scope
+      if scope:
+        assert scope.endswith('/')
+        prefix = scope[:-1]
+      else:
+        prefix = ''
 
-  g = ops.get_default_graph()
+      # Generate any input map tensors inside name scope
+      input_map = _ConvertInputMapValues(name, input_map)
 
-  # Add any functions defined in `graph_def` to `g`
-  if graph_def.library and graph_def.library.function:
-    # Copy op_dict so we don't clobber the original
-    op_dict = copy.copy(op_dict)
-    # pylint: disable=protected-access
-    # Note that we do not prepend `name` to the function name. The reasoning is
-    # that function names are similar to op definition names, which currently do
-    # not have a scoped name or namespace scheme.
-    functions = function._from_library(graph_def.library)
-    for f in functions:
-      f.add_to_graph(g)
-      op_dict[f.name] = f.definition.signature
-    # pylint: enable=protected-access
+    scoped_options = c_api_util.ScopedTFImportGraphDefOptions()
+    options = scoped_options.options
+    _PopulateTFImportGraphDefOptions(options, prefix, input_map,
+                                     return_elements)
 
-  # LINT.IfChange
-  with ops.name_scope(name, 'import', input_map.values()) as scope:
-    # TODO(ashankar): Should this just copy over or should it do some
-    # more nuanced merging? For example, the graph may already have some
-    # marked "bad versions" and we don't want to lose those because of
-    # what's in graph_def.versions? The C++ ImporGraphDef does something
-    # more nuanced.
-    g.graph_def_versions.CopyFrom(graph_def.versions)
-
-    if not all(isinstance(v, ops.Tensor) for v in input_map.values()):
-      if not scope:
-        # The caller must have passed `name=''`.
-        raise ValueError(
-            'tf.import_graph_def() requires a non-empty `name` if `input_map` '
-            'contains non-Tensor values. Try calling tf.convert_to_tensor() on '
-            '`input_map` values before calling tf.import_graph_def().')
-      with ops.name_scope('_inputs'):
-        input_map = {k: ops.convert_to_tensor(v) for k, v in input_map.items()}
-
-    # NOTE(mrry): We do this in two passes, because there may be a cycle in
-    # `graph_def`.
-
-    # 1. Add operations without their inputs.
-    for node in graph_def.node:
-      # Check to see if this op's name matches a previously seen op
-      if node.name in name_to_op:
-        raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
-      # Set any default attr values that aren't present.
-      if node.op not in op_dict:
-        raise ValueError('No op named %s in defined operations.' % node.op)
-      op_def = op_dict[node.op]
-      for attr_def in op_def.attr:
-        key = attr_def.name
-        if attr_def.HasField('default_value'):
-          value = node.attr[key]
-          if value is None or value.WhichOneof('value') is None:
-            node.attr[key].CopyFrom(attr_def.default_value)
-      if producer_op_dict:
-        # Remove any default attr values that aren't in op_def.
-        if node.op in producer_op_dict:
-          producer_op_def = producer_op_dict[node.op]
-          # We make a copy of node.attr to iterate through since we
-          # may modify node.attr inside the loop.
-          for key in list(node.attr):
-            if _FindAttrInOpDef(key, op_def) is None:
-              # No attr_def in consumer, look in producer.
-              attr_def = _FindAttrInOpDef(key, producer_op_def)
-              if (attr_def and attr_def.HasField('default_value') and
-                  node.attr[key] == attr_def.default_value):
-                # Unknown attr had default value in producer, delete it
-                # so it can be understood by consumer.
-                del node.attr[key]
-
-      output_types = _OutputTypes(node, op_dict)
-      name_to_op[node.name] = g.create_op(
-          node.op, [], output_types, name=node.name, attrs=node.attr,
-          compute_shapes=False, compute_device=False,
-          op_def=op_def)
-
-    # Maps from a node to the ops it is colocated with, if colocation
-    # is specified in the attributes.
-    colocation_pairs = collections.defaultdict(list)
-
-    # 2. Add inputs to the operations.
-    for node in graph_def.node:
-      op = name_to_op[node.name]
-      input_types = _InputTypes(node, op_dict)
-      apply_device_function = True
-
-      # Rewrite the colocation attributes in the graph, since the
-      # names of new ops may have changed.
-      for key, value in op.node_def.attr.items():
-        if key == '_class':
-          class_values = value.list
-          new_class_values = []
-          for class_value in class_values.s:
-            if class_value.startswith(b'loc:@'):
-              op_to_bind_to = class_value[5:].decode()
-              # Find the op by its original name.
-              if op_to_bind_to not in name_to_op:
-                raise ValueError('Specified colocation to an op that '
-                                 'does not exist during import: %s in %s' % (
-                                     op_to_bind_to, node.name))
-              original_op = name_to_op[op_to_bind_to]
-              new_class_values.append(compat.as_bytes(
-                  'loc:@' + original_op.name))
-              if op_to_bind_to != node.name:
-                # Keep track of this mapping for a later phase.
-                colocation_pairs[op].append(original_op)
-                # Don't apply this op's device function,
-                # the colocation constraint will ensure
-                # the proper device gets assigned at runtime.
-                apply_device_function = False
+    with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          results = c_api.TF_GraphImportGraphDefWithResults(
+              graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+      except errors.InvalidArgumentError as e:
+        # Convert to ValueError for backwards compatibility.
+        raise ValueError(str(e))
+
+    _ProcessNewOps(graph)
+
+    # Create _DefinedFunctions for any imported functions.
+    #
+    # We do this by creating _DefinedFunctions directly from `graph_def`, and
+    # adding them to `graph`. Adding an existing function to a TF_Graph is a
+    # no-op, so this only has the effect of updating the Python state (usually
+    # _DefinedFunction.add_to_graph also adds the function to the TF_Graph).
+    #
+    # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
+    # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
+    if graph_def.library and graph_def.library.function:
+      # pylint: disable=protected-access
+      functions = function._from_library(graph_def.library)
+      for f in functions:
+        f.add_to_graph(graph)
+      # pylint: enable=protected-access
 
-            else:
-              new_class_values.append(class_value)
-          value.list.CopyFrom(attr_value_pb2.AttrValue.ListValue(
-              s=new_class_values))
-
-      # NOTE(mrry): We cannot use zip here because control inputs do not appear
-      # in the list of input_types.
-      for i, input_name in enumerate(
-          [_CanonicalInputName(x) for x in node.input]):
-
-        if _IsControlInput(input_name):
-          # (a) Input is a control input that should be taken from an op
-          #     in "graph_def".
-          try:
-            source_op = name_to_op[input_name[1:]]
-          except KeyError:
-            raise ValueError(
-                _InvalidNodeMessage(
-                    node,
-                    'Control input %r not found in graph_def.' % (input_name,)))
-          # pylint: disable=protected-access
-          op._add_control_input(source_op)
-          # pylint: enable=protected-access
-
-        else:
-          try:
-            input_type = input_types[i]
-          except IndexError:
-            raise ValueError(_InvalidNodeMessage(
-                node, 'More inputs specified (%r) than the op expects.'
-                % (input_name,)))
-
-          if input_name in input_map:
-            # (b) Input should be replaced by a tensor from the caller.
-            source_tensor = input_map[input_name]
-            used_input_keys.add(input_name)
+    # TODO(skyewm): error if unused input map key
 
-          else:
-            # (c) Input should be taken from an op in `graph_def`.
-            operation_name, output_index = _ParseTensorName(input_name)
+    if return_elements is None:
+      return None
+    else:
+      return _GatherReturnElements(return_elements, graph, results)
+
+  else:
+    g = graph
+
+    # Use a canonical representation for all tensor names.
+    input_map = {_CanonicalInputName(k): v for k, v in input_map.items()}
+    used_input_keys = set()
+    name_to_op = {}
+
+    # Add any functions defined in `graph_def` to `g`
+    if graph_def.library and graph_def.library.function:
+      # Copy op_dict so we don't clobber the original
+      op_dict = copy.copy(op_dict)
+      # pylint: disable=protected-access
+      # Note that we do not prepend `name` to the function name. The reasoning
+      # is that function names are similar to op definition names, which
+      # currently do not have a scoped name or namespace scheme.
+      functions = function._from_library(graph_def.library)
+      for f in functions:
+        f.add_to_graph(g)
+        op_dict[f.name] = f.definition.signature
+      # pylint: enable=protected-access
+
+    # LINT.IfChange
+    with ops.name_scope(name, 'import', input_map.values()) as scope:
+      # TODO(ashankar): Should this just copy over or should it do some
+      # more nuanced merging? For example, the graph may already have some
+      # marked "bad versions" and we don't want to lose those because of
+      # what's in graph_def.versions? The C++ ImporGraphDef does something
+      # more nuanced.
+      g.graph_def_versions.CopyFrom(graph_def.versions)
+
+      input_map = _ConvertInputMapValues(name, input_map)
+
+      # NOTE(mrry): We do this in two passes, because there may be a cycle in
+      # `graph_def`.
+
+      # 1. Add operations without their inputs.
+      for node in graph_def.node:
+        # Check to see if this op's name matches a previously seen op
+        if node.name in name_to_op:
+          raise ValueError('Duplicate name \'%s\' in GraphDef.' % node.name)
+        # Set any default attr values that aren't present.
+        if node.op not in op_dict:
+          raise ValueError('No op named %s in defined operations.' % node.op)
+        op_def = op_dict[node.op]
+        for attr_def in op_def.attr:
+          key = attr_def.name
+          if attr_def.HasField('default_value'):
+            value = node.attr[key]
+            if value is None or value.WhichOneof('value') is None:
+              node.attr[key].CopyFrom(attr_def.default_value)
+
+        output_types = _OutputTypes(node, op_dict)
+        name_to_op[node.name] = g.create_op(
+            node.op, [], output_types, name=node.name, attrs=node.attr,
+            compute_shapes=False, compute_device=False,
+            op_def=op_def)
+
+      # Maps from a node to the ops it is colocated with, if colocation
+      # is specified in the attributes.
+      colocation_pairs = collections.defaultdict(list)
+
+      # 2. Add inputs to the operations.
+      for node in graph_def.node:
+        op = name_to_op[node.name]
+        input_types = _InputTypes(node, op_dict)
+        apply_device_function = True
+
+        # Rewrite the colocation attributes in the graph, since the
+        # names of new ops may have changed.
+        for key, value in op.node_def.attr.items():
+          if key == '_class':
+            class_values = value.list
+            new_class_values = []
+            for class_value in class_values.s:
+              if class_value.startswith(b'loc:@'):
+                op_to_bind_to = class_value[5:].decode()
+                # Find the op by its original name.
+                if op_to_bind_to not in name_to_op:
+                  raise ValueError('Specified colocation to an op that '
+                                   'does not exist during import: %s in %s' % (
+                                       op_to_bind_to, node.name))
+                original_op = name_to_op[op_to_bind_to]
+                new_class_values.append(compat.as_bytes(
+                    'loc:@' + original_op.name))
+                if op_to_bind_to != node.name:
+                  # Keep track of this mapping for a later phase.
+                  colocation_pairs[op].append(original_op)
+                  # Don't apply this op's device function,
+                  # the colocation constraint will ensure
+                  # the proper device gets assigned at runtime.
+                  apply_device_function = False
+
+              else:
+                new_class_values.append(class_value)
+            value.list.CopyFrom(attr_value_pb2.AttrValue.ListValue(
+                s=new_class_values))
+
+        # NOTE(mrry): We cannot use zip here because control inputs do not
+        # appear in the list of input_types.
+        for i, input_name in enumerate(
+            [_CanonicalInputName(x) for x in node.input]):
+
+          if _IsControlInput(input_name):
+            # (a) Input is a control input that should be taken from an op
+            #     in "graph_def".
             try:
-              source_op = name_to_op[operation_name]
-              source_tensor = list(source_op.values())[output_index]
-            except (KeyError, IndexError):
+              source_op = name_to_op[input_name[1:]]
+            except KeyError:
               raise ValueError(
                   _InvalidNodeMessage(
                       node,
-                      'Input tensor %r not found in graph_def.'
+                      'Control input %r not found in graph_def.'
                       % (input_name,)))
-
-          try:
             # pylint: disable=protected-access
-            op._add_input(source_tensor, dtype=input_type)
+            op._add_control_input(source_op)
             # pylint: enable=protected-access
-          except TypeError as te:
-            raise ValueError(_InvalidNodeMessage(
-                node, 'Input tensor %r %s' % (input_name, te)))
 
-      # pylint: disable=protected-access
-      if op._input_dtypes != input_types:
-        raise ValueError(
-            _InvalidNodeMessage(
-                node,
-                'Input types mismatch (expected %r but got %r)'
-                % (', '.join(dtypes.as_dtype(x).name for x in input_types),
-                   ', '.join(x.name for x in op._input_dtypes))))
-      # pylint: enable=protected-access
+          else:
+            try:
+              input_type = input_types[i]
+            except IndexError:
+              raise ValueError(_InvalidNodeMessage(
+                  node, 'More inputs specified (%r) than the op expects.'
+                  % (input_name,)))
+
+            if input_name in input_map:
+              # (b) Input should be replaced by a tensor from the caller.
+              source_tensor = input_map[input_name]
+              used_input_keys.add(input_name)
 
-      if not g._is_function(op.type):  # pylint: disable=protected-access
-        # Execute shape inference for this op.
-        # NOTE(mrry): If the graph contains a cycle, the full shape information
-        # may not be available for this op's inputs.
-        ops.set_shapes_for_outputs(op)
-      # For nodes with _output_shapes set, set the output shapes.
-      if '_output_shapes' in op.node_def.attr:
-        for i, output in enumerate(op.outputs):
-          dims = op.node_def.attr['_output_shapes'].list.shape[i]
-          output_shape = tensor_shape.TensorShape(
-              None if dims.unknown_rank else
-              [dim.size if dim.size >= 0 else None for dim in dims.dim])
-
-          try:
-            output.set_shape(output_shape)
-          except ValueError as e:
-            # If the output shape is incompatible with what is inferred
-            # by the graph for a very specific whitelist of ops, then we
-            # ignore this output shape.  This can happen if there is a
-            # bug in the shape function for some operation, and the
-            # serialized graph def has the incorrect shape set when
-            # running on a newer binary with the fixed shape function.
-            # This is an escape hatch that allows us to correct shape
-            # functions that are not critical to correct execution but
-            # would cause graphs to fail if imported after correcting.
-            #
-            # This can be removed after 2017/03/08.
-            if op.type in ['RandomShuffleQueue', 'PaddingFIFOQueue',
-                           'FIFOQueue', 'PriorityQueue', 'QueueSize',
-                           'Stack', 'Barrier', 'BarrierReadySize',
-                           'BarrierIncompleteSize', 'HashTable',
-                           'MutableHashTable',
-                           'MutableHashTableOfTensors', 'Mutex',
-                           'CuckooTable', 'IndexTable',
-                           'WholeFileReader', 'TextLineReader',
-                           'FixedLengthRecordReader',
-                           'TFRecordReader', 'IdentityReader',
-                           'LMDBReader',
-                           'RefSwitch', 'RefEnter', 'RefNextIteration',
-                           'RefMerge', 'RefIdentity']:
-              pass
-            elif op.type in [
-                'ConditionalAccumulator', 'SparseConditionalAccumulator',
-                'Table'
-            ]:
-              # This can be removed after 2017/04/24.
-              pass
             else:
-              raise e
-
-        del op.node_def.attr['_output_shapes']
-
-      # NOTE(mrry): We do this after configuring the inputs, because
-      # the result of the device functions may depend on the inputs.
-      if apply_device_function:
-        with _MaybeDevice(node.device):
-          g._apply_device_functions(op)  # pylint: disable=protected-access
-
-    # The following loop populates the device field of ops that are
-    # colocated with another op.  This is implied by the colocation
-    # attribute, but we propagate the device field for completeness.
-    for op, coloc_op_list in colocation_pairs.items():
-      coloc_device = None
-      # Find any device in the list of colocated ops that have a
-      # device, if it exists.  We assume that if multiple ops
-      # have devices, they refer to the same device.  Otherwise, a
-      # runtime error will occur since the colocation property
-      # cannot be guaranteed.
-      #
-      # One possible improvement is to try to check for compatibility
-      # of all devices in this list at import time here, which would
-      # require implementing a compatibility function for device specs
-      # in python.
-      for coloc_op in coloc_op_list:
-        if coloc_op.device:
-          coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
-          break
-      if coloc_device:
-        op._set_device(coloc_device)  # pylint: disable=protected-access
-
-    # Treat input mappings that don't appear in the graph as an error,
-    # because they are likely to be due to a typo.
-    def _IsImportedNodeOutput(tensor_name):
-      operation_name, output_index = _ParseTensorName(tensor_name)
-      try:
-        return output_index < len(name_to_op[operation_name].outputs)
-      except KeyError:
-        return False
-    absent_input_keys = [
-        k for k in frozenset(input_map.keys()).difference(used_input_keys)
-        if not _IsImportedNodeOutput(k)]
-    if absent_input_keys:
-      raise ValueError(
-          'Attempted to map inputs that were not found in graph_def: [%s]'
-          % ', '.join(absent_input_keys))
+              # (c) Input should be taken from an op in `graph_def`.
+              operation_name, output_index = _ParseTensorName(input_name)
+              try:
+                source_op = name_to_op[operation_name]
+                source_tensor = list(source_op.values())[output_index]
+              except (KeyError, IndexError):
+                raise ValueError(
+                    _InvalidNodeMessage(
+                        node,
+                        'Input tensor %r not found in graph_def.'
+                        % (input_name,)))
 
-    if return_elements is None:
-      return None
-    else:
-      ret = []
-      for name in return_elements:
-        name = compat.as_str(name)
-        if ':' in name:
-          try:
-            operation_name, output_index = _ParseTensorName(name)
-            ret.append(name_to_op[operation_name].outputs[output_index])
-          except (ValueError, KeyError, IndexError):
-            raise ValueError(
-                'Requested return_element %r not found in graph_def.' % name)
-        else:
-          try:
-            ret.append(name_to_op[name])
-          except KeyError:
-            raise ValueError(
-                'Requested return_element %r not found in graph_def.' % name)
-      return ret
-  # LINT.ThenChange(//tensorflow/core/graph/graph_constructor.cc)
+            try:
+              # pylint: disable=protected-access
+              op._add_input(source_tensor, dtype=input_type)
+              # pylint: enable=protected-access
+            except TypeError as te:
+              raise ValueError(_InvalidNodeMessage(
+                  node, 'Input tensor %r %s' % (input_name, te)))
+
+        # pylint: disable=protected-access
+        if op._input_dtypes != input_types:
+          raise ValueError(
+              _InvalidNodeMessage(
+                  node,
+                  'Input types mismatch (expected %r but got %r)'
+                  % (', '.join(dtypes.as_dtype(x).name for x in input_types),
+                     ', '.join(x.name for x in op._input_dtypes))))
+        # pylint: enable=protected-access
+
+        if not g._is_function(op.type):  # pylint: disable=protected-access
+          # Execute shape inference for this op.
+          # NOTE(mrry): If the graph contains a cycle, the full shape
+          # information may not be available for this op's inputs.
+          ops.set_shapes_for_outputs(op)
+        # For nodes with _output_shapes set, set the output shapes.
+        if '_output_shapes' in op.node_def.attr:
+          for i, output in enumerate(op.outputs):
+            dims = op.node_def.attr['_output_shapes'].list.shape[i]
+            output_shape = tensor_shape.TensorShape(
+                None if dims.unknown_rank else
+                [dim.size if dim.size >= 0 else None for dim in dims.dim])
+
+            try:
+              output.set_shape(output_shape)
+            except ValueError as e:
+              # If the output shape is incompatible with what is inferred
+              # by the graph for a very specific whitelist of ops, then we
+              # ignore this output shape.  This can happen if there is a
+              # bug in the shape function for some operation, and the
+              # serialized graph def has the incorrect shape set when
+              # running on a newer binary with the fixed shape function.
+              # This is an escape hatch that allows us to correct shape
+              # functions that are not critical to correct execution but
+              # would cause graphs to fail if imported after correcting.
+              #
+              # This can be removed after 2017/03/08.
+              if op.type in ['RandomShuffleQueue', 'PaddingFIFOQueue',
+                             'FIFOQueue', 'PriorityQueue', 'QueueSize',
+                             'Stack', 'Barrier', 'BarrierReadySize',
+                             'BarrierIncompleteSize', 'HashTable',
+                             'MutableHashTable',
+                             'MutableHashTableOfTensors', 'Mutex',
+                             'CuckooTable', 'IndexTable',
+                             'WholeFileReader', 'TextLineReader',
+                             'FixedLengthRecordReader',
+                             'TFRecordReader', 'IdentityReader',
+                             'LMDBReader',
+                             'RefSwitch', 'RefEnter', 'RefNextIteration',
+                             'RefMerge', 'RefIdentity']:
+                pass
+              elif op.type in [
+                  'ConditionalAccumulator', 'SparseConditionalAccumulator',
+                  'Table'
+              ]:
+                # This can be removed after 2017/04/24.
+                pass
+              else:
+                raise e
+
+          del op.node_def.attr['_output_shapes']
+
+        # NOTE(mrry): We do this after configuring the inputs, because
+        # the result of the device functions may depend on the inputs.
+        if apply_device_function:
+          with _MaybeDevice(node.device):
+            g._apply_device_functions(op)  # pylint: disable=protected-access
+
+      # The following loop populates the device field of ops that are
+      # colocated with another op.  This is implied by the colocation
+      # attribute, but we propagate the device field for completeness.
+      for op, coloc_op_list in colocation_pairs.items():
+        coloc_device = None
+        # Find any device in the list of colocated ops that have a
+        # device, if it exists.  We assume that if multiple ops
+        # have devices, they refer to the same device.  Otherwise, a
+        # runtime error will occur since the colocation property
+        # cannot be guaranteed.
+        #
+        # One possible improvement is to try to check for compatibility
+        # of all devices in this list at import time here, which would
+        # require implementing a compatibility function for device specs
+        # in python.
+        for coloc_op in coloc_op_list:
+          if coloc_op.device:
+            coloc_device = pydev.DeviceSpec.from_string(coloc_op.device)
+            break
+        if coloc_device:
+          op._set_device(coloc_device)  # pylint: disable=protected-access
+
+      # Treat input mappings that don't appear in the graph as an error,
+      # because they are likely to be due to a typo.
+      def _IsImportedNodeOutput(tensor_name):
+        operation_name, output_index = _ParseTensorName(tensor_name)
+        try:
+          return output_index < len(name_to_op[operation_name].outputs)
+        except KeyError:
+          return False
+      absent_input_keys = [
+          k for k in frozenset(input_map.keys()).difference(used_input_keys)
+          if not _IsImportedNodeOutput(k)]
+      if absent_input_keys:
+        raise ValueError(
+            'Attempted to map inputs that were not found in graph_def: [%s]'
+            % ', '.join(absent_input_keys))
+
+      if return_elements is None:
+        return None
+      else:
+        ret = []
+        for name in return_elements:
+          name = compat.as_str(name)
+          if ':' in name:
+            try:
+              operation_name, output_index = _ParseTensorName(name)
+              ret.append(name_to_op[operation_name].outputs[output_index])
+            except (ValueError, KeyError, IndexError):
+              raise ValueError(
+                  'Requested return_element %r not found in graph_def.' % name)
+          else:
+            try:
+              ret.append(name_to_op[name])
+            except KeyError:
+              raise ValueError(
+                  'Requested return_element %r not found in graph_def.' % name)
+        return ret
+    # LINT.ThenChange(//tensorflow/core/graph/graph_constructor.cc)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index e447f9a3e8e659e4dc57fe0720635db7f8413031..7bf13ba93d031018127182874d99b5a83905ff33 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
@@ -42,6 +43,7 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class ImportGraphDefTest(test.TestCase):
 
   def _MakeGraphDef(self,
@@ -108,6 +110,94 @@ class ImportGraphDefTest(test.TestCase):
       # Check that the op_def is still available.
       self.assertNotEqual(None, a.op_def)
 
+  def testMultipleImport(self):
+    graph_def = self._MakeGraphDef("""
+    node { name: 'A' op: 'IntOutput' }
+    node { name: 'B' op: 'IntInput' input: 'A:0' }
+    """)
+
+    with ops.Graph().as_default():
+      # Initial import
+      a, b = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="")
+      self.assertEqual(a.name, "A")
+      self.assertEqual(b.name, "B")
+      self.assertEqual(list(b.inputs), [a.outputs[0]])
+
+      # Repeat the same import
+      a1, b1 = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="")
+      self.assertEqual(a1.name, "A_1")
+      self.assertEqual(b1.name, "B_1")
+      self.assertEqual(list(b1.inputs), [a1.outputs[0]])
+
+      # Repeat the same import again
+      a2, b2 = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="")
+      self.assertEqual(a2.name, "A_2")
+      self.assertEqual(b2.name, "B_2")
+      self.assertEqual(list(b2.inputs), [a2.outputs[0]])
+
+      # Import with an already-used name
+      a3, b3 = importer.import_graph_def(
+          graph_def,
+          return_elements=["A", "B"],
+          name="A")
+      self.assertEqual(a3.name, "A_3/A")
+      self.assertEqual(b3.name, "A_3/B")
+      self.assertEqual(list(b3.inputs), [a3.outputs[0]])
+
+      # Import with existing de-duped node names
+      a1_1, b1_1 = importer.import_graph_def(
+          self._MakeGraphDef("""
+          node { name: 'A_1' op: 'IntOutput' }
+          node { name: 'B_1' op: 'IntInput' input: 'A_1:0' }
+          """),
+          return_elements=["A_1", "B_1"],
+          name="")
+      self.assertEqual(a1_1.name, "A_1_1")
+      self.assertEqual(b1_1.name, "B_1_1")
+      self.assertEqual(list(b1_1.inputs), [a1_1.outputs[0]])
+
+      # Create a name scope and then import node with same name
+      with ops.name_scope("foo"):
+        constant_op.constant(1)
+      foo, = importer.import_graph_def(
+          self._MakeGraphDef("node { name: 'foo' op: 'IntOutput' }"),
+          return_elements=["foo"],
+          name="")
+      self.assertEqual(foo.name, "foo_1")
+
+      # Imported node name can't conflict with intermediate name scope (but can
+      # conflict with outer scope and full name scope)
+      with ops.name_scope("outer"):
+        with ops.name_scope("inner"):
+          c = constant_op.constant(1, name="c")
+          self.assertEqual(c.op.name, "outer/inner/c")
+
+      outer, inner, new_c, outer_inner, outer_inner_c = (
+          importer.import_graph_def(
+              self._MakeGraphDef(
+                  "node { name: 'outer' op: 'IntOutput' }"
+                  "node { name: 'inner' op: 'IntOutput' }"
+                  "node { name: 'c' op: 'IntOutput' }"
+                  "node { name: 'outer/inner' op: 'IntOutput' }"
+                  "node { name: 'outer/inner/c' op: 'IntOutput' }"),
+              return_elements=["outer", "inner", "c", "outer/inner",
+                               "outer/inner/c"],
+              name=""))
+      self.assertEqual(outer.name, "outer_1")
+      self.assertEqual(inner.name, "inner")
+      self.assertEqual(new_c.name, "c")
+      self.assertEqual(outer_inner.name, "outer/inner_1")
+      self.assertEqual(outer_inner_c.name, "outer/inner/c_1")
+
   def testInputMap(self):
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
@@ -247,6 +337,11 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d.outputs, [])
 
   def testCyclic(self):
+    # Importing cycles not supported with C API enabled (this test will
+    # eventually be deleted).
+    # TODO(skyewm): write while loop test
+    if ops._USE_C_API: return
+
     with ops.Graph().as_default():
       a, b = importer.import_graph_def(
           self._MakeGraphDef("""
@@ -261,16 +356,21 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b.inputs[0], a.outputs[0])
 
   def testTypeMismatchInGraphDef(self):
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
+                   "incompatible with expected float.")
+    else:
+      error_msg = ("Cannot convert a tensor of type int32 to an input of type "
+                   "float")
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
             """))
-      self.assertTrue(
-          "Cannot convert a tensor of type int32 to an input of type float" in
-          str(e.exception))
 
   def testShapeWhitelist(self):
     # Barrier's shape is an output vector of 2, but the
@@ -280,7 +380,9 @@ class ImportGraphDefTest(test.TestCase):
           self._MakeGraphDef("""
           node { name: 'A' op: 'Barrier'
                  attr { key: '_output_shapes'
-                        value { list { shape { } } } } }
+                        value { list { shape { } } } }
+                 attr { key: 'component_types'
+                        value { list { type: DT_FLOAT } } } }
           """),
           return_elements=["A"],
           name="import")
@@ -305,35 +407,49 @@ class ImportGraphDefTest(test.TestCase):
             "Shapes () and (43,) are not compatible" in str(e.exception))
 
   def testInvalidSignatureTooManyInputsInGraphDef(self):
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = "NodeDef expected inputs '' do not match 1 inputs specified"
+    else:
+      error_msg = r"More inputs specified \('A:0'\) than the op expects"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'None' input: 'A:0' }
             """))
-      self.assertTrue("More inputs specified ('A:0') than the op expects" in
-                      str(e.exception))
 
   def testInvalidSignatureNotEnoughInputsInGraphDef(self):
+    if ops._USE_C_API:
+      # TODO(skyewm): improve error message
+      error_msg = ("NodeDef expected inputs 'int32, float' do not match 1 "
+                   "inputs specified")
+    else:
+      error_msg = (r"Input types mismatch \(expected 'int32, float32' but "
+                   r"got 'int32'\)")
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntInputFloatInput' input: 'A:0' }
             """))
-      self.assertTrue("Input types mismatch (expected 'int32, float32' but "
-                      "got 'int32')" in str(e.exception))
 
   def testMissingInputOpInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:0'"
+    else:
+      error_msg = "Input tensor 'A:0' not found"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'FloatInput' input: 'A:0' }
             """))
-      self.assertTrue("Input tensor 'A:0' not found" in str(e.exception))
 
   def testMissingInputOpInGraphDefButAppearsInInputMap(self):
     with ops.Graph().as_default():
@@ -347,85 +463,115 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(b.inputs[0], feed_a_0)
 
   def testMissingInputTensorInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = ("Node 'B': Connecting to invalid output 1 of source node A "
+                   "which has 1 outputs")
+    else:
+      error_msg = "Input tensor 'A:1' not found"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'FloatOutput' }
             node { name: 'B' op: 'FloatInput' input: 'A:1' }
             """))
-      self.assertTrue("Input tensor 'A:1' not found" in str(e.exception))
 
   def testMissingControlInputInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = r"Node 'B': Unknown input node '\^A'"
+    else:
+      error_msg = r"Control input '\^A' not found"
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: '^A' }
             """))
-      self.assertTrue("Control input '^A' not found" in str(e.exception))
 
   def testInvalidTensorNameOutputIndexInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:B'"
+    else:
+      error_msg = "Cannot convert 'A:B' to a tensor name."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B' }
             """))
-      self.assertEqual("Cannot convert 'A:B' to a tensor name.",
-                       str(e.exception))
 
   def testInvalidTensorNameInGraphDef(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'B': Unknown input node 'A:B:0'"
+    else:
+      error_msg = "Cannot convert 'A:B:0' to a tensor name."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'B' op: 'None' input: 'A:B:0' }
             """))
-      self.assertEqual("Cannot convert 'A:B:0' to a tensor name.",
-                       str(e.exception))
 
   def testMissingReturnOperation(self):
+    if ops._USE_C_API:
+      error_msg = "Requested return node 'B' not found in graph def"
+    else:
+      error_msg = "return_element 'B' not found in graph_def."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'None' }
             """),
             return_elements=["B"])
-      self.assertTrue(
-          "return_element 'B' not found in graph_def." in str(e.exception))
 
   def testMissingReturnTensor(self):
+    if ops._USE_C_API:
+      error_msg = (r"Invalid return output 1 of node 'A', which has 1 "
+                   r"output\(s\)")
+    else:
+      error_msg = "return_element 'A:1' not found in graph_def."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:1"])
-      self.assertTrue(
-          "return_element 'A:1' not found in graph_def." in str(e.exception))
 
-      with self.assertRaises(ValueError) as e:
+      if ops._USE_C_API:
+        error_msg = "Requested return tensor 'B:0' not found in graph def"
+      else:
+        error_msg = "return_element 'B:0' not found in graph_def."
+
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["B:0"])
-      self.assertTrue(
-          "return_element 'B:0' not found in graph_def." in str(e.exception))
 
-      with self.assertRaises(ValueError) as e:
+      if ops._USE_C_API:
+        error_msg = "Cannot convert 'A:B:0' to a tensor name."
+      else:
+        error_msg = "return_element 'A:B:0' not found in graph_def."
+
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             """),
             return_elements=["A:B:0"])
-      self.assertTrue(
-          "return_element 'A:B:0' not found in graph_def." in str(e.exception))
 
   def testMissingInputMap(self):
+    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+
     with ops.Graph().as_default():
       with self.assertRaises(ValueError) as e:
         importer.import_graph_def(
@@ -436,6 +582,8 @@ class ImportGraphDefTest(test.TestCase):
       self.assertTrue("not found in graph_def: [B:0]" in str(e.exception))
 
   def testInputMapUnusedAsInput(self):
+    if ops._USE_C_API: return  # TODO(skyewm): make this work with C API
+
     with ops.Graph().as_default():
       # Mapping an unused node output should succeed.
       importer.import_graph_def(
@@ -454,17 +602,20 @@ class ImportGraphDefTest(test.TestCase):
       self.assertTrue("not found in graph_def: [A:2]" in str(e.exception))
 
   def testInputMapTypeMismatch(self):
+    if ops._USE_C_API:
+      error_msg = ("Input 0 of node import/B was passed float from Const:0 "
+                   "incompatible with expected int32.")
+    else:
+      error_msg = ("Cannot convert a tensor of type float32 to an input of "
+                   "type int32.")
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntInput' input: 'A:0' }
             """),
             input_map={"A:0": constant_op.constant(5.0)})
-      self.assertTrue(
-          "Cannot convert a tensor of type float32 to an input of type int32."
-          in str(e.exception))
 
   def testNoReturns(self):
     with ops.Graph().as_default() as g:
@@ -487,6 +638,16 @@ class ImportGraphDefTest(test.TestCase):
           name="imported_graph")
       self.assertEqual(a.name, "imported_graph/A")
 
+  def testDefaultNamePrefix(self):
+    with ops.Graph().as_default():
+      a, = importer.import_graph_def(
+          self._MakeGraphDef("""
+          node { name: 'A' op: 'None' }
+          """),
+          return_elements=["A"],
+          name=None)
+      self.assertEqual(a.name, "import/A")
+
   def testNamePrefixColocationAttrs(self):
     original_graph_def = self._MakeGraphDef("""
           node { name: 'A' op: 'None' }
@@ -498,12 +659,10 @@ class ImportGraphDefTest(test.TestCase):
     with ops.Graph().as_default():
       b, = importer.import_graph_def(
           original_graph_def, return_elements=["B"], name="imported_graph")
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' }
-          node { name: 'imported_graph/B' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+      self.assertTrue("_class" in b.node_def.attr)
+      self.assertProtoEquals(
+          "list { s: 'loc:@imported_graph/A' }",
+          b.node_def.attr["_class"])
 
   def testColocationWithDeviceFn(self):
     original_graph_def = self._MakeGraphDef("""
@@ -527,23 +686,17 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(CustomDeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
-
-    # Test a scenario where 'A' doesn't get a device; 'A' should
-    # not have a device, but during runtime will get colocated with
-    # 'B' because of the colocation attribute.
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "/device:A:0")
+      self.assertEqual(b.device, "/device:A:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
+
+    # Test a scenario where 'A' doesn't get a device; 'A' should not have a
+    # device, but during runtime will get colocated with 'B' because of the
+    # colocation attribute. B's device function is still overridden by A.
     def BDeviceFn(op):
       if "B" in op.name:
         return "/device:B:0"
@@ -551,19 +704,13 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(BDeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None'
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None'
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
 
     # Only A gets a device, so B inherits it implicitly.
     def ADeviceFn(op):
@@ -573,19 +720,13 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(ADeviceFn):
-        b, = importer.import_graph_def(
-            original_graph_def, return_elements=["B"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-                }
-          }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:A:0"
-                attr {
-                  key: '_class' value { list { s: 'loc:@imported_graph/A' } }
-          } }""", b.graph.as_graph_def())
+        a, b = importer.import_graph_def(original_graph_def,
+                                         return_elements=["A", "B"],
+                                         name="imported_graph")
+      self.assertEqual(a.device, "/device:A:0")
+      self.assertEqual(b.device, "/device:A:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/A"])
 
   def testMultipleColocationWithDeviceFn(self):
     original_graph_def = self._MakeGraphDef("""
@@ -608,20 +749,16 @@ class ImportGraphDefTest(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.device(CustomDeviceFn):
-        c, = importer.import_graph_def(
-            original_graph_def, return_elements=["C"], name="imported_graph")
-
-      self.assertProtoEqualsVersion("""
-          node { name: 'imported_graph/A' op: 'None' }
-          node { name: 'imported_graph/B' op: 'None' device: "/device:B:0" }
-          node { name: 'imported_graph/C' op: 'None' device: "/device:B:0"
-                 attr {
-                   key: '_class' value {
-                     list { s: 'loc:@imported_graph/A'
-                            s: 'loc:@imported_graph/B' }
-                   }
-                 }
-               }""", c.graph.as_graph_def())
+        a, b, c = importer.import_graph_def(original_graph_def,
+                                            return_elements=["A", "B", "C"],
+                                            name="imported_graph")
+      self.assertEqual(a.device, "")
+      self.assertEqual(b.device, "/device:B:0")
+      self.assertEqual(c.device, "/device:B:0")
+      self.assertEqual(a.colocation_groups(), [b"loc:@imported_graph/A"])
+      self.assertEqual(b.colocation_groups(), [b"loc:@imported_graph/B"])
+      self.assertEqual(c.colocation_groups(),
+                       [b"loc:@imported_graph/A", b"loc:@imported_graph/B"])
 
   def testNamePrefixColocationAttrsMultipleImport(self):
     original_graph_def = self._MakeGraphDef("""
@@ -632,21 +769,18 @@ class ImportGraphDefTest(test.TestCase):
           } }""")
 
     with ops.Graph().as_default():
-      b, = importer.import_graph_def(
-          original_graph_def, return_elements=["B"], name="")
-      _, = importer.import_graph_def(
-          original_graph_def, return_elements=["B"], name="")
-      self.assertProtoEqualsVersion("""
-          node { name: 'A' op: 'None' }
-          node { name: 'B' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@A' } }
-          } }
-          node { name: 'A_1' op: 'None' }
-          node { name: 'B_1' op: 'None'  attr {
-            key: '_class'
-            value { list { s: 'loc:@A_1' } }
-          } }""", b.graph.as_graph_def())
+      a, b = importer.import_graph_def(
+          original_graph_def, return_elements=["A", "B"], name="")
+      a_1, b_1 = importer.import_graph_def(
+          original_graph_def, return_elements=["A", "B"], name="")
+
+      self.assertEqual(a.name, "A")
+      self.assertEqual(b.name, "B")
+      self.assertEqual(b.colocation_groups(), [b"loc:@A"])
+
+      self.assertEqual(a_1.name, "A_1")
+      self.assertEqual(b_1.name, "B_1")
+      self.assertEqual(b_1.colocation_groups(), [b"loc:@A_1"])
 
   def testNamePrefixColocationAttrsNotFound(self):
     original_graph_def = self._MakeGraphDef("""
@@ -654,8 +788,14 @@ class ImportGraphDefTest(test.TestCase):
             key: '_class'
             value { list { s: 'loc:@A' } }
           } }""")
+
+    if ops._USE_C_API:
+      error_msg = "Node 'B' expects to be colocated with unknown node 'A'"
+    else:
+      error_msg = "does not exist during import"
+
     with ops.Graph().as_default():
-      with self.assertRaisesRegexp(ValueError, "does not exist during import"):
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             original_graph_def, return_elements=["B"], name="imported_graph")
 
@@ -703,21 +843,32 @@ class ImportGraphDefTest(test.TestCase):
 
   def testInvalidInputForReturnOperations(self):
     with ops.Graph().as_default():
-      with self.assertRaises(TypeError) as e:
+      with self.assertRaisesRegexp(
+          TypeError, "return_elements must be a list of strings."):
         importer.import_graph_def(self._MakeGraphDef(""), return_elements=[7])
-      self.assertEqual("return_elements must be a list of strings.",
-                       str(e.exception))
+
+      if ops._USE_C_API:
+        error_msg = "Cannot convert 'a:b:c' to a tensor name."
+      else:
+        error_msg = "Requested return_element 'a:b:c' not found in graph_def."
+      with self.assertRaisesRegexp(ValueError, error_msg):
+        importer.import_graph_def(self._MakeGraphDef(""),
+                                  return_elements=["a:b:c"])
 
   def testDuplicateOperationNames(self):
+    if ops._USE_C_API:
+      error_msg = "Node 'A' is not unique"
+    else:
+      error_msg = "Duplicate name 'A' in GraphDef."
+
     with ops.Graph().as_default():
-      with self.assertRaises(ValueError) as e:
+      with self.assertRaisesRegexp(ValueError, error_msg):
         importer.import_graph_def(
             self._MakeGraphDef("""
             node { name: 'A' op: 'IntOutput' }
             node { name: 'B' op: 'IntOutput' }
             node { name: 'A' op: 'IntOutput' }
             """))
-      self.assertEqual("Duplicate name 'A' in GraphDef.", str(e.exception))
 
   def testWithExtensionAndAttr(self):
     with ops.Graph().as_default() as g:
@@ -862,24 +1013,36 @@ class ImportGraphDefTest(test.TestCase):
       pat = (r"GraphDef producer version -1 below min producer %d supported "
              r"by TensorFlow \S+\.  Please regenerate your graph.$" %
              versions.GRAPH_DEF_VERSION_MIN_PRODUCER)
-      importer.import_graph_def(self._MakeGraphDef("", producer=-1))
-      x = constant_op.constant(
-          7)  # Need at least one op to get a C++ graph generated
-      with self.test_session(graph=g) as sess:
+      # C API throws error during import, Python-only throws error during run
+      if ops._USE_C_API:
         with self.assertRaisesRegexp(Exception, pat):
-          sess.run(x)
+          importer.import_graph_def(self._MakeGraphDef("", producer=-1))
+      else:
+        importer.import_graph_def(self._MakeGraphDef("", producer=-1))
+        x = constant_op.constant(
+            7)  # Need at least one op to get a C++ graph generated
+        with self.test_session(graph=g) as sess:
+          with self.assertRaisesRegexp(Exception, pat):
+            sess.run(x)
 
   def testVersionHigh(self):
     with ops.Graph().as_default() as g:
       pat = (r"GraphDef min consumer version %d above current version %d "
              r"for TensorFlow \S+\.  Please upgrade TensorFlow\.$" %
              (1 << 30, versions.GRAPH_DEF_VERSION))
-      importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
-      x = constant_op.constant(
-          7)  # Need at least one op to get a C++ graph generated
-      with self.test_session(graph=g) as sess:
-        with self.assertRaisesRegexp(Exception, pat):
-          sess.run(x)
+
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(ValueError, pat):
+          importer.import_graph_def(self._MakeGraphDef("",
+                                                       min_consumer=1 << 30))
+      else:
+        # Python API only throws when graph is run
+        importer.import_graph_def(self._MakeGraphDef("", min_consumer=1 << 30))
+        x = constant_op.constant(
+            7)  # Need at least one op to get a C++ graph generated
+        with self.test_session(graph=g) as sess:
+          with self.assertRaisesRegexp(Exception, pat):
+            sess.run(x)
 
   def testVersionAppliesToOpConstruction(self):
     """These tests rely on shape fns in test_ops.cc."""
@@ -925,19 +1088,26 @@ class ImportGraphDefTest(test.TestCase):
           """),
           return_elements=["A"],
           producer_op_list=producer_op_list)
-      with self.assertRaisesRegexp(ValueError, "No attr named 'default_int'"):
+      if ops._USE_C_API:
+        error_msg = "Operation 'import/A' has no attr named 'default_int'."
+      else:
+        error_msg = "No attr named 'default_int'"
+      with self.assertRaisesRegexp(ValueError, error_msg):
         a[0].get_attr("default_int")
 
-    # Attr only in producer_op_list with non-default value is preserved.
-    with ops.Graph().as_default():
-      a = importer.import_graph_def(
-          self._MakeGraphDef("""
-          node { name: 'A' op: 'OpWithFutureDefaultAttr'
-                 attr { key: 'default_int' value { i: 987 } } }
-          """),
-          return_elements=["A"],
-          producer_op_list=producer_op_list)
-      self.assertEqual(987, a[0].get_attr("default_int"))
+    # Unknown attrs cannot be imported using C API. This test will eventually be
+    # deleted.
+    if not ops._USE_C_API:
+      # Attr only in producer_op_list with non-default value is preserved.
+      with ops.Graph().as_default():
+        a = importer.import_graph_def(
+            self._MakeGraphDef("""
+            node { name: 'A' op: 'OpWithFutureDefaultAttr'
+                   attr { key: 'default_int' value { i: 987 } } }
+            """),
+            return_elements=["A"],
+            producer_op_list=producer_op_list)
+        self.assertEqual(987, a[0].get_attr("default_int"))
 
   def testFunctions(self):
     dtype = dtypes.float32
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index a8bc2d2e3fb1bdddf163ff226d6430a9222bb769..c839d7a9a693a4e1201c558173662fd24b5036dd 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -663,7 +663,7 @@ def import_scoped_meta_graph(meta_graph_or_file,
         [part for part in [graph.get_name_scope(), import_scope] if part])
 
     # Restores all the other collections.
-    for key, col_def in meta_graph_def.collection_def.items():
+    for key, col_def in sorted(meta_graph_def.collection_def.items()):
       # Don't add unbound_inputs to the new graph.
       if key == unbound_inputs_col_name:
         continue
@@ -773,6 +773,7 @@ def export_scoped_meta_graph(filename=None,
     if graph_def:
       new_graph_def = graph_pb2.GraphDef()
       new_graph_def.versions.CopyFrom(graph_def.versions)
+      new_graph_def.library.CopyFrom(graph_def.library)
 
       if clear_extraneous_savers:
         exclude_nodes = _find_extraneous_saver_nodes(graph_def, saver_def)
@@ -810,6 +811,9 @@ def export_scoped_meta_graph(filename=None,
           bytesize += value.node_def.ByteSize()
           if bytesize >= (1 << 31) or bytesize < 0:
             raise ValueError("GraphDef cannot be larger than 2GB.")
+
+      graph._copy_functions_to_graph_def(graph_def, bytesize)  # pylint: disable=protected-access
+
     # It's possible that not all the inputs are in the export_scope.
     # If we would like such information included in the exported meta_graph,
     # add them to a special unbound_inputs collection.
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index 06cee46bf623ff0521f4ebe91ff1909aa45e00e3..4c22c913b850685bd6e50b03b5fbb09a01441b68 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -662,22 +662,36 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
 class ExportImportAcrossScopesTest(test.TestCase):
 
   def testPartionedVariables(self):
-    def make_graph_with_partitioned_variables():
+
+    def make_graph_with_partitioned_variables(use_resource):
       variable_scope.get_variable(
           name="weights",
           partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0),
-          initializer=random_ops.truncated_normal([100, 10]))
-    self._testExportImportAcrossScopes(make_graph_with_partitioned_variables)
+          initializer=random_ops.truncated_normal([100, 10]),
+          use_resource=use_resource)
+      # The next variable illustrates the necessity of restoring collections
+      # in a deterministic fashion when using ResourceVariables.
+      variable_scope.get_variable(
+          name="another",
+          shape=[],
+          collections=["a", "b", "z", "f", "e", "d", "g"],
+          use_resource=use_resource)
+
+    self._testExportImportAcrossScopes(
+        make_graph_with_partitioned_variables, use_resource=False)
+    self._testExportImportAcrossScopes(
+        make_graph_with_partitioned_variables, use_resource=True)
 
-  def _testExportImportAcrossScopes(self, graph_fn):
+  def _testExportImportAcrossScopes(self, graph_fn, use_resource):
     """Tests export and importing a graph across scopes.
 
     Args:
       graph_fn: A closure that creates a graph on the current scope.
+      use_resource: A bool indicating whether or not to use ResourceVariables.
     """
     with ops.Graph().as_default() as original_graph:
       with variable_scope.variable_scope("dropA/dropB/keepA"):
-        graph_fn()
+        graph_fn(use_resource=use_resource)
     exported_meta_graph_def = meta_graph.export_scoped_meta_graph(
         graph=original_graph,
         export_scope="dropA/dropB")[0]
@@ -689,10 +703,32 @@ class ExportImportAcrossScopesTest(test.TestCase):
 
     with ops.Graph().as_default() as expected_graph:
       with variable_scope.variable_scope("importA/keepA"):
-        graph_fn()
+        graph_fn(use_resource=use_resource)
+
+      if use_resource:
+        # Bringing in a collection that contains ResourceVariables adds ops
+        # to the graph, so mimic the same behavior.
+        for collection_key in sorted([
+            ops.GraphKeys.GLOBAL_VARIABLES,
+            ops.GraphKeys.TRAINABLE_VARIABLES,
+        ]):
+          for var in expected_graph.get_collection(collection_key):
+            var._read_variable_op()
 
     result = meta_graph.export_scoped_meta_graph(graph=imported_graph)[0]
     expected = meta_graph.export_scoped_meta_graph(graph=expected_graph)[0]
+
+    if use_resource:
+      # Clear all shared_name attributes before comparing, since they are
+      # supposed to be orthogonal to scopes.
+      for meta_graph_def in [result, expected]:
+        for node in meta_graph_def.graph_def.node:
+          shared_name_attr = "shared_name"
+          shared_name_value = node.attr.get(shared_name_attr, None)
+          if shared_name_value and shared_name_value.HasField("s"):
+            if shared_name_value.s:
+              node.attr[shared_name_attr].s = b""
+
     self.assertProtoEquals(expected, result)
 
 
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index e68eac372369b25a902936467be71b6079a23ce4..95b1cefcbee52fc66b17f9d738291f9eb08fc613 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -35,6 +35,7 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
@@ -47,6 +48,7 @@ from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.platform import app
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
@@ -157,14 +159,18 @@ def register_dense_tensor_like_type(tensor_type):
   """
   try:
     if not isinstance(tensor_type.name, property):
-      raise TypeError("Type %s does not define a `name` property")
+      raise TypeError("Type %s does not define a `name` property" %
+                      tensor_type.__name__)
   except AttributeError:
-    raise TypeError("Type %s does not define a `name` property")
+    raise TypeError("Type %s does not define a `name` property" %
+                    tensor_type.__name__)
   try:
     if not isinstance(tensor_type.dtype, property):
-      raise TypeError("Type %s does not define a `dtype` property")
+      raise TypeError("Type %s does not define a `dtype` property" %
+                      tensor_type.__name__)
   except AttributeError:
-    raise TypeError("Type %s does not define a `dtype` property")
+    raise TypeError("Type %s does not define a `dtype` property" %
+                    tensor_type.__name__)
   # We expect this list to be small, so choose quadratic complexity
   # for registration, so that we have a tuple that can be used for
   # more efficient `isinstance` checks later.
@@ -369,11 +375,40 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
+    if _USE_C_API:
+      graph = self._op._graph._c_graph  # pylint: disable=protected-access
+      with errors.raise_exception_on_not_ok_status() as status:
+        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
+                                                  status)
+      if num_dims == -1:
+        dim_list = None
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
+          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
+              graph, self._as_tf_output(), num_dims, status)
+        dim_list = [None if i == -1 else i for i in dim_list]
+      return tensor_shape.TensorShape(dim_list)
     return self._shape
 
+  def __iter__(self):
+    if context.in_graph_mode():
+      raise TypeError(
+          "`Tensor` objects are not iterable when eager execution is not "
+          "enabled. To iterate over this tensor use `tf.map_fn`.")
+    shape = self._shape_tuple()
+    if shape is None:
+      raise TypeError("Cannot iterate over a tensor with unknown shape.")
+    if not shape:
+      raise TypeError("Cannot iterate over a scalar tensor.")
+    if shape[0] is None:
+      raise TypeError(
+          "Cannot iterate over a tensor with unknown first dimension.")
+    for i in xrange(shape[0]):
+      yield self[i]
+
   def _shape_as_list(self):
-    if self._shape.ndims is not None:
-      return [dim.value for dim in self._shape.dims]
+    if self.shape.ndims is not None:
+      return [dim.value for dim in self.shape.dims]
     else:
       return None
 
@@ -389,7 +424,7 @@ class Tensor(_TensorLike):
     Returns:
       Integer rank or None
     """
-    return self._shape.ndims
+    return self.shape.ndims
 
   def get_shape(self):
     """Alias of Tensor.shape."""
@@ -420,14 +455,35 @@ class Tensor(_TensorLike):
     ```
 
     Args:
-      shape: A `TensorShape` representing the shape of this tensor.
+      shape: A `TensorShape` representing the shape of this tensor, a
+      `TensorShapeProto`, a list, a tuple, or None.
 
     Raises:
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    # TODO(skyewm): call C API
-    self._shape = self._shape.merge_with(shape)
+    if not _USE_C_API:
+      self._shape = self._shape.merge_with(shape)  # pylint: disable=protected-access
+      return
+    if not isinstance(shape, tensor_shape.TensorShape):
+      shape = tensor_shape.TensorShape(shape)
+    dim_list = []
+    if shape.dims is None:
+      unknown_shape = True
+    else:
+      unknown_shape = False
+      for dim in shape.dims:
+        if dim.value is None:
+          dim_list.append(-1)
+        else:
+          dim_list.append(dim.value)
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_api.TF_GraphSetTensorShape_wrapper(
+          self._op._graph._c_graph,  # pylint: disable=protected-access
+          self._as_tf_output(),
+          dim_list,
+          unknown_shape,
+          status)
 
   @property
   def value_index(self):
@@ -440,7 +496,17 @@ class Tensor(_TensorLike):
     Returns:
       A list of `Operation`s.
     """
-    return self._consumers
+    if self._op._c_op:  # pylint: disable=protected-access
+      consumer_names = c_api.TF_OperationOutputConsumers_wrapper(
+          self._as_tf_output())
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(name)
+          for name in consumer_names
+      ]
+      # pylint: enable=protected-access
+    else:
+      return self._consumers
 
   def _add_consumer(self, consumer):
     """Add a consumer to this tensor.
@@ -451,6 +517,9 @@ class Tensor(_TensorLike):
     Raises:
       TypeError: if the consumer is not an Operation.
     """
+    # pylint: disable=protected-access
+    assert not self._op._c_op, "Tensor._add_consumer doesn't work with C API"
+    # pylint: enable=protected-access
     if not isinstance(consumer, Operation):
       raise TypeError("Consumer must be an Operation: %s" % consumer)
     self._consumers.append(consumer)
@@ -475,11 +544,10 @@ class Tensor(_TensorLike):
       return "%s:%d" % (self._op.name, self._value_index)
 
   def _as_tf_output(self):
-    assert self.op._c_op  # pylint: disable=protected-access
-    tf_output = c_api.TF_Output()
-    tf_output.oper = self.op._c_op  # pylint: disable=protected-access
-    tf_output.index = self.value_index
-    return tf_output
+    # pylint: disable=protected-access
+    assert self.op._c_op
+    return c_api_util.tf_output(self.op._c_op, self.value_index)
+    # pylint: enable=protected-access
 
   def __str__(self):
     return "Tensor(\"%s\"%s%s%s)" % (
@@ -514,19 +582,6 @@ class Tensor(_TensorLike):
   def _override_operator(operator, func):
     _override_helper(Tensor, operator, func)
 
-  def __iter__(self):
-    """Dummy method to prevent iteration. Do not call.
-
-    NOTE(mrry): If we register __getitem__ as an overloaded operator,
-    Python will valiantly attempt to iterate over the Tensor from 0 to
-    infinity.  Declaring this method prevents this unintended
-    behavior.
-
-    Raises:
-      TypeError: when invoked.
-    """
-    raise TypeError("'Tensor' object is not iterable.")
-
   def __bool__(self):
     """Dummy method to prevent a tensor from being used as a Python `bool`.
 
@@ -592,21 +647,11 @@ class Tensor(_TensorLike):
     """
     return _eval_using_default_session(self, feed_dict, self.graph, session)
 
-  def _dup(self):
-    ret = copy.copy(self)
-    ret._id = uid()  # pylint: disable=protected-access
-    return ret
-
 
 # TODO(agarwal): consider getting rid of this.
 class _EagerTensorBase(Tensor):
   """Base class for EagerTensor."""
 
-  @staticmethod
-  def _delete_trace(tid):
-    """Helper function to be called by __del__ of the subclass."""
-    tape.delete_trace(tid)
-
   @property
   def dtype(self):
     # Note: using the intern table directly here as this is
@@ -614,15 +659,16 @@ class _EagerTensorBase(Tensor):
     return dtypes._INTERN_TABLE[self._datatype_enum()]  # pylint: disable=protected-access
 
   def numpy(self):
-    """Returns a numpy array with the same contents as the Tensor.
+    """Returns a numpy array or a scalar with the same contents as the Tensor.
 
     TODO(ashankar,agarwal): Perhaps this should NOT reference the underlying
     buffer but instead always explicitly copy? Note that currently it may or may
     not copy based on whether the numpy data is properly aligned or not.
 
     Returns:
-      A numpy array that may share memory with the Tensor object. Any changes
-      to one may be reflected in the other.
+      A numpy array or a scalar. Numpy array may share memory with the
+      Tensor object. Any changes to one may be reflected in the other. A scalar
+      value is returned when self has rank 0.
 
     Raises:
       ValueError: if the type of this Tensor is not representable in numpy.
@@ -639,8 +685,11 @@ class _EagerTensorBase(Tensor):
   def __float__(self):
     return float(self.numpy())
 
-  def __array__(self):
-    return np.array(self.numpy())
+  def __array__(self, dtype=None):
+    return np.array(self.numpy(), dtype=dtype)
+
+  def __format__(self, format_spec):
+    return self.numpy().__format__(format_spec)
 
   def _numpy(self):
     raise NotImplementedError()
@@ -713,11 +762,6 @@ class _EagerTensorBase(Tensor):
       new_tensor = self._copy_to_device(context=ctx._handle, device=device_name)
     except core._NotOkStatusException as e:
       six.raise_from(core._status_to_exception(e.code, e.message), None)
-    if core.active_trace() is not None:
-      core.active_trace().record_tensor("COPY",
-                                        tensor_id(new_tensor),
-                                        new_tensor.device,
-                                        new_tensor.shape.num_elements())
 
     # Record the copy on tape and define backprop copy as well.
     if not context.in_graph_mode():
@@ -728,9 +772,6 @@ class _EagerTensorBase(Tensor):
     return new_tensor
     # pylint: enable=protected-access
 
-  def _dup(self):
-    return self._copy(device_name=self.device)
-
   @property
   def shape(self):
     return tensor_shape.TensorShape(self._shape_tuple())
@@ -860,6 +901,10 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
   inputs, which allows those ops to accept numpy arrays, Python lists,
   and scalars in addition to `Tensor` objects.
 
+  Note: This function diverges from default Numpy behavior for `float` and
+    `string` types when `None` is present in a Python list or scalar. Rather
+    than silently converting `None` values, an error will be thrown.
+
   Args:
     value: An object whose type has a registered `Tensor` conversion function.
     dtype: Optional element type for the returned tensor. If missing, the
@@ -934,7 +979,7 @@ def internal_convert_to_tensor(value,
     # Fast path for EagerTensors that don't need any conversion.
     if isinstance(value, EagerTensor):
       # Note that we don't check that value's dtype matches the dtype
-      # argument.  We exepct that the C runtime will do that checking
+      # argument.  We expect that the C runtime will do that checking
       # when we execute the kernel.
       return value
 
@@ -1395,6 +1440,56 @@ _VALID_OP_NAME_REGEX = re.compile("^[A-Za-z0-9.][A-Za-z0-9_.\\-/]*$")
 _VALID_SCOPE_NAME_REGEX = re.compile("^[A-Za-z0-9_.\\-/]*$")
 
 
+def _create_c_op(graph, node_def, inputs, control_inputs):
+  """Creates a TF_Operation.
+
+  Args:
+    graph: a `Graph`.
+    node_def: `node_def_pb2.NodeDef` for the operation to create.
+    inputs: A list of `Tensor`s (corresponding to scalar inputs) and lists of
+      `Tensor`s (corresponding to sequence inputs, e.g. "int64 * N",
+      "list(int64)"). The length of the list should be equal to the number of
+      inputs specified by this operation's op def.
+    control_inputs: A list of `Operation`s to set as control dependencies.
+
+  Returns:
+    A wrapped TF_Operation*.
+  """
+  # pylint: disable=protected-access
+  op_desc = c_api.TF_NewOperation(graph._c_graph,
+                                  compat.as_str(node_def.op),
+                                  compat.as_str(node_def.name))
+  # Add inputs
+  for op_input in inputs:
+    if isinstance(op_input, (list, tuple)):
+      c_api.TF_AddInputList(op_desc, [t._as_tf_output() for t in op_input])
+    else:
+      c_api.TF_AddInput(op_desc, op_input._as_tf_output())
+
+  # Add control inputs
+  for control_input in control_inputs:
+    c_api.TF_AddControlInput(op_desc, control_input._c_op)
+  # pylint: enable=protected-access
+
+  # Add attrs
+  for name, attr_value in node_def.attr.items():
+    serialized = attr_value.SerializeToString()
+    # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
+    # It might be worth creating a convenient way to re-use the same status.
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_api.TF_SetAttrValueProto(op_desc,
+                                 compat.as_str(name), serialized, status)
+
+  try:
+    with errors.raise_exception_on_not_ok_status() as status:
+      c_op = c_api.TF_FinishOperation(op_desc, status)
+  except errors.InvalidArgumentError as e:
+    # Convert to ValueError for backwards compatibility.
+    raise ValueError(str(e))
+
+  return c_op
+
+
 class Operation(object):
   """Represents a graph node that performs computation on tensors.
 
@@ -1462,16 +1557,33 @@ class Operation(object):
         or if `inputs` and `input_types` are incompatible.
       ValueError: if the `node_def` name is not valid.
     """
-    if not isinstance(node_def, node_def_pb2.NodeDef):
+    # For internal use only: `node_def` can be set to a TF_Operation to create
+    # an Operation for that op. This is useful for creating Operations for ops
+    # indirectly created by C API methods, e.g. the ops created by
+    # TF_ImportGraphDef. When `node_def` is a TF_Operation, all optional fields
+    # except `control_inputs` should be None.
+
+    if isinstance(node_def, node_def_pb2.NodeDef):
+      if node_def.ByteSize() >= (1 << 31) or node_def.ByteSize() < 0:
+        raise ValueError(
+            "Cannot create a tensor proto whose content is larger than 2GB.")
+      if not _VALID_OP_NAME_REGEX.match(node_def.name):
+        raise ValueError("'%s' is not a valid node name" % node_def.name)
+      self._node_def = copy.deepcopy(node_def)
+      c_op = None
+    elif type(node_def).__name__ == "SwigPyObject":
+      assert inputs is None
+      assert output_types is None
+      assert input_types is None
+      assert original_op is None
+      assert op_def is None
+      self._node_def = None
+      c_op = node_def
+    else:
       raise TypeError("node_def needs to be a NodeDef: %s" % node_def)
-    if node_def.ByteSize() >= (1 << 31) or node_def.ByteSize() < 0:
-      raise ValueError(
-          "Cannot create a tensor proto whose content is larger than 2GB.")
-    if not _VALID_OP_NAME_REGEX.match(node_def.name):
-      raise ValueError("'%s' is not a valid node name" % node_def.name)
+
     if not isinstance(g, Graph):
       raise TypeError("g needs to be a Graph: %s" % g)
-    self._node_def = copy.deepcopy(node_def)
     self._graph = g
     if inputs is None:
       inputs = []
@@ -1481,15 +1593,6 @@ class Operation(object):
     for a in self._inputs:
       if not isinstance(a, Tensor):
         raise TypeError("input needs to be a Tensor: %s" % a)
-      # Mark that we consume the inputs.
-      a._add_consumer(self)  # pylint: disable=protected-access
-    if output_types is None:
-      output_types = []
-    self._output_types_val = output_types
-    self._outputs = [
-        Tensor(self, i, output_type)
-        for i, output_type in enumerate(output_types)
-    ]
     if input_types is None:
       input_types = [i.dtype.base_dtype for i in self._inputs]
     else:
@@ -1506,40 +1609,28 @@ class Operation(object):
     self._control_inputs = []
     if control_inputs:
       for c in control_inputs:
-        c_op = None
+        control_op = None
         if isinstance(c, Operation):
-          c_op = c
+          control_op = c
         elif isinstance(c, (Tensor, IndexedSlices)):
-          c_op = c.op
+          control_op = c.op
         else:
           raise TypeError("Control input must be an Operation, "
                           "a Tensor, or IndexedSlices: %s" % c)
-        self._control_inputs.append(c_op)
+        self._control_inputs.append(control_op)
 
+    self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
     self._op_def = op_def
     self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
-    # Define self._c_op before calling self._control_flow_context.AddOp(), since
-    # that will call methods on this op that check if self._c_op is set.
-    self._c_op = None
-    # Add this op to the current control flow context:
-    self._control_flow_context = g._get_control_flow_context()  # pylint: disable=protected-access
-    if self._control_flow_context is not None:
-      # TODO(skyewm): consider refactoring this to call self._create_c_op()
-      # first. This would require updating the TF_Operation's ID (see the
-      # comment and self._id_value update below). The disadvantage of calling
-      # AddOp() first is that we need to maintain Operation state that is
-      # accessed by AddOp() in Python, e.g. the input Tensors.
-      self._control_flow_context.AddOp(self)
-    # NOTE(keveman): Control flow context's AddOp could be creating new ops and
-    # setting op.inputs[index] = new_op. Thus the new ops' id could be larger
-    # than this op's id even though this op depend on them. Therefore, delaying
-    # assigning id to this op until all ops this could be dependent on are
-    # created.
-    self._id_value = self._graph._next_id()  # pylint: disable=protected-access
-    self._recompute_node_def()
 
-    if self._graph._c_graph:  # pylint: disable=protected-access
+    # Initialize self._c_op.
+    if c_op:
+      # TODO(skyewm): remove this assert when we remove USE_C_API
+      assert self._graph._c_graph  # pylint: disable=protected-access
+      self._c_op = c_op
+      self._add_control_inputs(self._control_inputs)
+    elif self._graph._c_graph:  # pylint: disable=protected-access
       if self._op_def:
         # TODO(skyewm): op_def_library.apply_op() flattens the incoming
         # inputs. Refactor so we don't have to do this here.
@@ -1549,53 +1640,39 @@ class Operation(object):
         # If no OpDef is specified, assume all inputs are scalar.
         grouped_inputs = self._inputs
 
-      self._c_op = self._create_c_op(self._graph, self._node_def,
-                                     grouped_inputs, self._control_inputs)
-
-  def _create_c_op(self, graph, node_def, inputs, control_inputs):
-    """Creates a TF_Operation.
-
-    Args:
-      graph: a `Graph`.
-      node_def: `node_def_pb2.NodeDef` for the operation to create.
-      inputs: A list of `Tensor`s (corresponding to scalar inputs) and lists of
-        `Tensor`s (corresponding to sequence inputs, e.g. "int64 * N",
-        "list(int64)"). The length of the list should be equal to the number of
-        inputs specified by this operation's op def.
-      control_inputs: A list of `Operation`s to set as control dependencies.
-
-    Returns:
-      A wrapped TF_Operation*.
-    """
-    # pylint: disable=protected-access
-    op_desc = c_api.TF_NewOperation(graph._c_graph,
-                                    compat.as_str(node_def.op),
-                                    compat.as_str(node_def.name))
-    # Add inputs
-    for op_input in inputs:
-      if isinstance(op_input, (list, tuple)):
-        c_api.TF_AddInputList(op_desc, [t._as_tf_output() for t in op_input])
-      else:
-        c_api.TF_AddInput(op_desc, op_input._as_tf_output())
-
-    # Add control inputs
-    for control_input in control_inputs:
-      c_api.TF_AddControlInput(op_desc, control_input._c_op)
-    # pylint: enable=protected-access
+      self._c_op = _create_c_op(self._graph, self._node_def, grouped_inputs,
+                                self._control_inputs)
+    else:
+      self._c_op = None
 
-    # Add attrs
-    for name, attr_value in node_def.attr.items():
-      serialized = attr_value.SerializeToString()
-      # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
-      # It might be worth creating a convenient way to re-use the same status.
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_SetAttrValueProto(op_desc,
-                                   compat.as_str(name), serialized, status)
+    # Mark that we consume the inputs. This is unnecessary and unsupported with
+    # the C API enabled, since the C API tracks the tensor consumers instead.
+    if not self._c_op:
+      for input_tensor in self.inputs:
+        input_tensor._add_consumer(self)  # pylint: disable=protected-access
 
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_op = c_api.TF_FinishOperation(op_desc, status)
+    # Initialize self._outputs.
+    if self._c_op:
+      num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
+      output_types = [
+          c_api.TF_OperationOutputType(c_api_util.tf_output(self._c_op, i))
+          for i in range(num_outputs)]
+      assert output_types is not None
+    elif output_types is None:
+      output_types = []
+    self._output_types_val = output_types
+    self._outputs = [
+        Tensor(self, i, output_type)
+        for i, output_type in enumerate(output_types)
+    ]
 
-    return c_op
+    # Add this op to the current control flow context.
+    self._control_flow_context = g._get_control_flow_context()  # pylint: disable=protected-access
+    for input_tensor in self.inputs:
+      control_flow_util.CheckInputFromValidContext(self, input_tensor.op)
+    if self._control_flow_context is not None:
+      self._control_flow_context.AddOp(self)
+    self._recompute_node_def()
 
   def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
     """Regroups a flat list of input tensors into scalar and sequence inputs.
@@ -1635,15 +1712,17 @@ class Operation(object):
   def colocation_groups(self):
     """Returns the list of colocation groups of the op."""
     default_colocation_group = [
-        compat.as_bytes("loc:@%s" % self._node_def.name)
+        compat.as_bytes("loc:@%s" % self.name)
     ]
-    if "_class" not in self._node_def.attr:
+    try:
+      class_attr = self.get_attr("_class")
+    except ValueError:
       # This op has no explicit colocation group, so it is itself its
       # own root of a colocation group.
       return default_colocation_group
 
     attr_groups = [
-        class_name for class_name in self.get_attr("_class")
+        class_name for class_name in class_attr
         if class_name.startswith(b"loc:@")
     ]
 
@@ -1675,9 +1754,6 @@ class Operation(object):
   def name(self):
     """The full name of this operation."""
     if self._c_op:
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here.
-      assert self._node_def.name == c_api.TF_OperationName(self._c_op)
       return c_api.TF_OperationName(self._c_op)
     else:
       return self._node_def.name
@@ -1697,9 +1773,6 @@ class Operation(object):
       device.
     """
     if self._c_op:
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here
-      assert self._node_def.device == c_api.TF_OperationDevice(self._c_op)
       return c_api.TF_OperationDevice(self._c_op)
     else:
       return self._node_def.device
@@ -1760,9 +1833,9 @@ class Operation(object):
       c_api.SetRequestedDevice(
           self._graph._c_graph,  # pylint: disable=protected-access
           self._c_op,  # pylint: disable=protected-access
-          _device_string(device))
-    # TODO(nolivia): remove this line when switch to C api
-    self._node_def.device = _device_string(device)
+          compat.as_str(_device_string(device)))
+    else:
+      self._node_def.device = _device_string(device)
 
   def _add_input(self, tensor, dtype=None):
     """Add a new input to this operation.
@@ -1795,7 +1868,7 @@ class Operation(object):
     tensor._add_consumer(self)  # pylint: disable=protected-access
     self._recompute_node_def()
 
-  def _update_input(self, index, tensor, dtype=None):
+  def _update_input(self, index, tensor):
     """Update the input to this operation at the given index.
 
     NOTE: This is for TF internal use only. Please don't use it.
@@ -1803,8 +1876,6 @@ class Operation(object):
     Args:
       index: the index of the input to update.
       tensor: the Tensor to be used as the input at the given index.
-      dtype: tf.DType: type of the input; defaults to
-        the tensor's dtype.
 
     Raises:
       TypeError: if tensor is not a Tensor,
@@ -1822,17 +1893,9 @@ class Operation(object):
             self._tf_input(index),
             status)
     else:
-      if dtype is None:
-        dtype = tensor.dtype
-      else:
-        dtype = dtypes.as_dtype(dtype)
-        if not dtype.is_compatible_with(tensor.dtype):
-          raise TypeError(
-              "Cannot convert a tensor of type %s to an input of type %s" %
-              (tensor.dtype.name, dtype.name))
       self._inputs[index].consumers().remove(self)
       self._inputs[index] = tensor
-      self._input_types_val[index] = dtype
+      self._input_types_val[index] = tensor.dtype
       tensor._add_consumer(self)  # pylint: disable=protected-access
       self._recompute_node_def()
 
@@ -1877,8 +1940,18 @@ class Operation(object):
     else:
       self._add_control_inputs([op])
 
+  def _remove_all_control_inputs(self):
+    """Removes any control inputs to this operation."""
+    if self._c_op:
+      c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
+    else:
+      del self.control_inputs[:]
+
   # Methods below are used when building the NodeDef and Graph proto.
   def _recompute_node_def(self):
+    # TODO(skyewm): remove this function when we switch to C API
+    if self._c_op: return
+
     del self._node_def.input[:]
     # pylint: disable=protected-access
     self._node_def.input.extend([t._as_node_def_input() for t in self._inputs])
@@ -1888,7 +1961,7 @@ class Operation(object):
           ["^%s" % op.name for op in self._control_inputs])
 
   def __str__(self):
-    return str(self._node_def)
+    return str(self.node_def)
 
   def __repr__(self):
     return "<tf.Operation '%s' type=%s>" % (self.name, self.type)
@@ -1948,9 +2021,6 @@ class Operation(object):
           dtypes.as_dtype(c_api.TF_OperationInputType(self._tf_input(i)))
           for i in xrange(num_inputs)
       ]
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here.
-      assert self._input_types_val == input_types
       return input_types
     else:
       return self._input_types_val
@@ -1985,14 +2055,6 @@ class Operation(object):
     """The type of the op (e.g. `"MatMul"`)."""
     if self._c_op:
       op_type = c_api.TF_OperationOpType(self._c_op)
-      # TODO(iga): Remove these asserts after converting to C API by default.
-      # Just being a bit paranoid here.
-      # pylint: disable=unidiomatic-typecheck
-      assert type(op_type) == type(self._node_def.op), (
-          "Expected same types %s vs %s" % (type(op_type),
-                                            type(self._node_def.op)))
-      # pylint: enable=unidiomatic-typecheck
-      assert op_type == self._node_def.op
       return op_type
     else:
       return self._node_def.op
@@ -2005,7 +2067,7 @@ class Operation(object):
   @property
   def node_def(self):
     # pylint: disable=line-too-long
-    """Returns a serialized `NodeDef` representation of this operation.
+    """Returns the `NodeDef` representation of this operation.
 
     Returns:
       A
@@ -2013,7 +2075,16 @@ class Operation(object):
       protocol buffer.
     """
     # pylint: enable=line-too-long
-    return self._node_def
+    if self._c_op:
+      with c_api_util.tf_buffer() as buf:
+        with errors.raise_exception_on_not_ok_status() as status:
+          c_api.TF_OperationToNodeDef(self._c_op, buf, status)
+        data = c_api.TF_GetBuffer(buf)
+      node_def = node_def_pb2.NodeDef()
+      node_def.ParseFromString(compat.as_bytes(data))
+      return node_def
+    else:
+      return self._node_def
 
   @property
   def op_def(self):
@@ -2027,13 +2098,13 @@ class Operation(object):
     """
     # pylint: enable=line-too-long
     if self._c_op:
-      with errors.raise_exception_on_not_ok_status() as status:
-        with c_api_util.tf_buffer() as buf:
+      with c_api_util.tf_buffer() as buf:
+        with errors.raise_exception_on_not_ok_status() as status:
           # pylint: disable=protected-access
           c_api.TF_GraphGetOpDef(self._graph._c_graph,
                                  compat.as_bytes(self.type), buf, status)
           # pylint: enable=protected-access
-          data = c_api.TF_GetBuffer(buf)
+        data = c_api.TF_GetBuffer(buf)
       op_def = op_def_pb2.OpDef()
       op_def.ParseFromString(compat.as_bytes(data))
       return op_def
@@ -2056,6 +2127,22 @@ class Operation(object):
         self._traceback,
         include_func_start_lineno=True)
 
+  def _set_attr(self, attr_name, attr_value):
+    """Private method used to set an attribute in the node_def."""
+    if self._c_op:
+      buf = c_api.TF_NewBufferFromString(
+          compat.as_bytes(attr_value.SerializeToString()))
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          # pylint: disable=protected-access
+          c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf,
+                        status)
+          # pylint: enable=protected-access
+      finally:
+        c_api.TF_DeleteBuffer(buf)
+    else:
+      self._node_def.attr[attr_name].CopyFrom(attr_value)
+
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
 
@@ -2069,9 +2156,23 @@ class Operation(object):
       ValueError: If this op does not have an attr with the given `name`.
     """
     fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
-    if name not in self._node_def.attr:
-      raise ValueError("No attr named '" + name + "' in " + str(self._node_def))
-    x = self._node_def.attr[name]
+    if self._c_op:
+      try:
+        with c_api_util.tf_buffer() as buf:
+          with errors.raise_exception_on_not_ok_status() as status:
+            c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf, status)
+          data = c_api.TF_GetBuffer(buf)
+      except errors.InvalidArgumentError as e:
+        # Convert to ValueError for backwards compatibility.
+        raise ValueError(str(e))
+      x = attr_value_pb2.AttrValue()
+      x.ParseFromString(data)
+    else:
+      if name not in self._node_def.attr:
+        raise ValueError(
+            "No attr named '" + name + "' in " + str(self._node_def))
+      x = self._node_def.attr[name]
+
     # Treat an empty oneof value as an empty list.
     if not x.WhichOneof("value"):
       return []
@@ -2266,8 +2367,28 @@ class RegisterShape(object):
     return f
 
 
-def set_shapes_for_outputs(op):
-  """Uses the registered shape functions to set the shapes for op's outputs."""
+def _set_shapes_for_outputs_c_api(op):
+  """set_shapes_for_outputs implementation when C API is enabled."""
+  # The C API computes the shapes when the TF_Operation is created. Fetch the
+  # output shapes from the C object.
+  for output in op.outputs:
+    with errors.raise_exception_on_not_ok_status() as status:
+      # pylint: disable=protected-access
+      shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+          op._graph._c_graph, output._as_tf_output(), status)
+      # pylint: enable=protected-access
+    if unknown_shape:
+      output.set_shape(tensor_shape.unknown_shape())
+    elif not shape_vector:
+      output.set_shape(tensor_shape.scalar())
+    else:
+      shape_vector = [None if d == -1 else d for d in shape_vector]
+      output.set_shape(tensor_shape.TensorShape(shape_vector))
+
+
+# TODO(skyewm): remove this when _USE_C_API flag is removed.
+def _set_shapes_for_outputs(op):
+  """set_shapes_for_outputs implementation when C API is disabled."""
   try:
     shape_func = _shape_registry.lookup(op.type)
   except LookupError:
@@ -2298,6 +2419,14 @@ def set_shapes_for_outputs(op):
     output.set_shape(s)
 
 
+def set_shapes_for_outputs(op):
+  """Set the shapes for op's outputs."""
+  if op._c_op:  # pylint: disable=protected-access
+    return _set_shapes_for_outputs_c_api(op)
+  else:
+    return _set_shapes_for_outputs(op)
+
+
 class OpStats(object):
   """A holder for statistics about an operator.
 
@@ -2569,11 +2698,16 @@ class Graph(object):
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
-    if _USE_C_API:
+    if _USE_C_API or self._use_c_api_hack():
       self._scoped_c_graph = c_api_util.ScopedTFGraph()
     else:
       self._scoped_c_graph = None
 
+  # TODO(apassos) remove once the C API is used by default.
+  def _use_c_api_hack(self):
+    """Temporary hack; can be overridden to force C API usage."""
+    return False
+
   def _convert_stack(self, stack, include_func_start_lineno=False):
     """Converts a stack extracted using _extract_stack() to a traceback stack.
 
@@ -2714,10 +2848,10 @@ class Graph(object):
     """
     # pylint: enable=line-too-long
     if self._c_graph:
-      with errors.raise_exception_on_not_ok_status() as status:
-        with c_api_util.tf_buffer() as buf:
+      with c_api_util.tf_buffer() as buf:
+        with errors.raise_exception_on_not_ok_status() as status:
           c_api.TF_GraphVersions(self._c_graph, buf, status)
-          data = c_api.TF_GetBuffer(buf)
+        data = c_api.TF_GetBuffer(buf)
       version_def = versions_pb2.VersionDef()
       version_def.ParseFromString(compat.as_bytes(data))
       return version_def
@@ -2774,6 +2908,20 @@ class Graph(object):
     """
     self._control_flow_context = ctx
 
+  def _copy_functions_to_graph_def(self, graph_def, starting_bytesize):
+    """If this graph contains functions, copy them to `graph_def`."""
+    bytesize = starting_bytesize
+    for f in self._functions.values():
+      bytesize += f.definition.ByteSize()
+      if bytesize >= (1 << 31) or bytesize < 0:
+        raise ValueError("GraphDef cannot be larger than 2GB.")
+      graph_def.library.function.extend([f.definition])
+      if f.grad_func_name:
+        grad_def = function_pb2.GradientDef()
+        grad_def.function_name = f.name
+        grad_def.gradient_func = f.grad_func_name
+        graph_def.library.gradient.extend([grad_def])
+
   def _as_graph_def(self, from_version=None, add_shapes=False):
     # pylint: disable=line-too-long
     """Returns a serialized `GraphDef` representation of this graph.
@@ -2817,17 +2965,7 @@ class Graph(object):
           bytesize += op.node_def.ByteSize()
           if bytesize >= (1 << 31) or bytesize < 0:
             raise ValueError("GraphDef cannot be larger than 2GB.")
-      if self._functions:
-        for f in self._functions.values():
-          bytesize += f.definition.ByteSize()
-          if bytesize >= (1 << 31) or bytesize < 0:
-            raise ValueError("GraphDef cannot be larger than 2GB.")
-          graph.library.function.extend([f.definition])
-          if f.grad_func_name:
-            grad_def = function_pb2.GradientDef()
-            grad_def.function_name = f.name
-            grad_def.gradient_func = f.grad_func_name
-            graph.library.gradient.extend([grad_def])
+      self._copy_functions_to_graph_def(graph, bytesize)
       return graph, self._version
 
   def as_graph_def(self, from_version=None, add_shapes=False):
@@ -2902,9 +3040,14 @@ class Graph(object):
     # Add function to graph
     # pylint: disable=protected-access
     if self._c_graph:
-      assert function._c_func, (
-          "Cannot add function created without C API support to graph "
-          "created with C API support")
+      # Handle functions created without using the C API. TODO(apassos,skyewm)
+      # remove this when all functions are generated using the C API by default
+      # as this will be unnecessary.
+      if not function._c_func:
+        with errors.raise_exception_on_not_ok_status() as status:
+          serialized = function.definition.SerializeToString()
+          function._c_func = c_api.TF_FunctionImportFunctionDef(
+              serialized, status)
       with errors.raise_exception_on_not_ok_status() as status:
         gradient = function._grad_func._c_func if function._grad_func else None
         c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
@@ -2921,7 +3064,11 @@ class Graph(object):
         if previous._hash_str == function._hash_str:
           return
         else:
-          raise ValueError("Another function is already defined with that name")
+          raise ValueError("Cannot add function (%s, hash %s) to graph (%s). "
+                           "Another function (%s, hash %s) is already defined "
+                           "with that name (%s)" % (
+                               function, function._hash_str, self,
+                               previous, previous._hash_str, name))
     # pylint: enable=protected-access
 
     self._functions[name] = function
@@ -3000,90 +3147,145 @@ class Graph(object):
 
     node_def = _NodeDef(op_type, name, device=None, attrs=attrs)
 
+    input_ops = set([t.op for t in inputs])
+    control_inputs = self._control_dependencies_for_inputs(input_ops)
+    ret = Operation(
+        node_def,
+        self,
+        inputs=inputs,
+        output_types=dtypes,
+        control_inputs=control_inputs,
+        input_types=input_types,
+        original_op=self._default_original_op,
+        op_def=op_def)
+    self._create_op_helper(ret, compute_shapes=compute_shapes,
+                           compute_device=compute_device)
+    return ret
+
+  def _create_op_from_tf_operation(self, c_op, compute_device=True):
+    """Creates an `Operation` in this graph from the supplied TF_Operation.
+
+    This method is like create_op() except the new Operation is constructed
+    using `c_op`. The returned Operation will have `c_op` as its _c_op
+    field. This is used to create Operation objects around TF_Operations created
+    indirectly by the C API (e.g. by TF_ImportGraphDef, TF_FinishWhile).
+
+    Args:
+      c_op: a wrapped TF_Operation
+      compute_device: (Optional.) If True, device functions will be executed
+        to compute the device property of the Operation.
+
+    Returns:
+      An `Operation` object.
+    """
+    self._check_not_finalized()
+    tf_outputs = c_api.GetOperationInputs(c_op)
+    input_ops = set(self._get_operation_by_tf_operation(output.oper)
+                    for output in tf_outputs)
+    control_inputs = self._control_dependencies_for_inputs(input_ops)
+
+    # Update _names_in_use before calling the Operation constructor since the
+    # control flow code may create more Operations, and we don't want the names
+    # to conflict.
+    op_name = c_api.TF_OperationName(c_op)
+    assert op_name not in self._names_in_use
+    self._names_in_use[op_name] = 1
+
+    ret = Operation(c_op, self, control_inputs=control_inputs)
+    self._create_op_helper(ret, compute_device=compute_device)
+    return ret
+
+  def _create_op_helper(self, op, compute_shapes=True, compute_device=True):
+    """Common logic for creating an op in this graph."""
+    # TODO(vrv): Instead of eagerly filling in shape property for every op, only
+    # populate the shape when requested.
+    #
+    # TODO(skyewm): unlike in the original Python implementation, the C API
+    # always computes shape information (even for function calls, which the
+    # original Python shape inference code doesn't handle). Deprecate the
+    # compute_shapes argument.
+    if op._c_op or compute_shapes:  # pylint: disable=protected-access
+      set_shapes_for_outputs(op)
+    # TODO(b/XXXX): move to Operation.__init__ once _USE_C_API flag is removed.
+    self._add_op(op)
+
     # Apply any additional attributes requested. Do not overwrite any existing
     # attributes.
     for key, value in self._attr_scope_map.items():
-      if key not in node_def.attr:
+      try:
+        op.get_attr(key)
+      except ValueError:
         if callable(value):
-          value = value(node_def)
+          value = value(op.node_def)
           if not isinstance(value, (type(None), attr_value_pb2.AttrValue)):
             raise TypeError(
                 "Callable for scope map key '%s' must return either None or "
                 "an AttrValue protocol buffer; but it returned: %s" % (key,
                                                                        value))
-        node_def.attr[key].CopyFrom(value)
+        if value:
+          op._set_attr(key, value)  # pylint: disable=protected-access
 
-    # Apply a kernel label if one has been specified for this op_type.
+    # Apply a kernel label if one has been specified for this op type.
     try:
-      kernel_label = self._op_to_kernel_label_map[op_type]
-      node_def.attr["_kernel"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(kernel_label)))
+      kernel_label = self._op_to_kernel_label_map[op.type]
+      op._set_attr("_kernel",  # pylint: disable=protected-access
+                   attr_value_pb2.AttrValue(s=compat.as_bytes(kernel_label)))
     except KeyError:
       pass
 
-    # Apply the overriding op_type for gradients if one has been
-    # specified for this op_type.
+    # Apply the overriding op type for gradients if one has been specified for
+    # this op type.
     try:
-      mapped_op_type = self._gradient_override_map[op_type]
-      node_def.attr["_gradient_op_type"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(mapped_op_type)))
+      mapped_op_type = self._gradient_override_map[op.type]
+      op._set_attr("_gradient_op_type",  # pylint: disable=protected-access
+                   attr_value_pb2.AttrValue(s=compat.as_bytes(mapped_op_type)))
     except KeyError:
       pass
 
-    control_inputs = self._control_dependencies_for_inputs(inputs)
-    ret = Operation(
-        node_def,
-        self,
-        inputs=inputs,
-        output_types=dtypes,
-        control_inputs=control_inputs,
-        input_types=input_types,
-        original_op=self._default_original_op,
-        op_def=op_def)
-    if compute_shapes:
-      set_shapes_for_outputs(ret)
-    self._add_op(ret)
-    self._record_op_seen_by_control_dependencies(ret)
+    self._record_op_seen_by_control_dependencies(op)
 
     if compute_device:
-      self._apply_device_functions(ret)
+      self._apply_device_functions(op)
 
     if self._colocation_stack:
       all_colocation_groups = []
       for colocation_op in self._colocation_stack:
         all_colocation_groups.extend(colocation_op.colocation_groups())
         if colocation_op.device:
-          # Make this device match the device of the colocated op, to
-          # provide consistency between the device and the colocation
-          # property.
-          if (ret.device and pydev.canonical_name(ret.device) !=
+          # Make this device match the device of the colocated op, to provide
+          # consistency between the device and the colocation property.
+          if (op.device and pydev.canonical_name(op.device) !=
               pydev.canonical_name(colocation_op.device)):
             logging.warning("Tried to colocate %s with an op %s that had "
                             "a different device: %s vs %s. "
-                            "Ignoring colocation property.", name,
-                            colocation_op.name, ret.device,
+                            "Ignoring colocation property.", op.name,
+                            colocation_op.name, op.device,
                             colocation_op.device)
           else:
-            ret._set_device(colocation_op.device)  # pylint: disable=protected-access
+            op._set_device(colocation_op.device)  # pylint: disable=protected-access
 
       all_colocation_groups = sorted(set(all_colocation_groups))
-      ret.node_def.attr["_class"].CopyFrom(
-          attr_value_pb2.AttrValue(list=attr_value_pb2.AttrValue.ListValue(
-              s=all_colocation_groups)))
+      # pylint: disable=protected-access
+      op._set_attr("_class", attr_value_pb2.AttrValue(
+          list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups)))
+      # pylint: enable=protected-access
 
     # Sets "container" attribute if
     # (1) self._container is not None
     # (2) "is_stateful" is set in OpDef
     # (3) "container" attribute is in OpDef
     # (4) "container" attribute is None
-    if (self._container and op_type in self._registered_ops and
-        self._registered_ops[op_type].is_stateful and
-        "container" in ret.node_def.attr and
-        not ret.node_def.attr["container"].s):
-      ret.node_def.attr["container"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(self._container)))
-
-    return ret
+    if (self._container and op.type in self._registered_ops and
+        self._registered_ops[op.type].is_stateful):
+      try:
+        container_attr = op.get_attr("container")
+      except ValueError:
+        # "container" attribute is not in OpDef
+        pass
+      else:
+        if not container_attr:
+          op._set_attr("container", attr_value_pb2.AttrValue(  # pylint: disable=protected-access
+              s=compat.as_bytes(self._container)))
 
   def as_graph_element(self, obj, allow_tensor=True, allow_operation=True):
     """Returns the object referred to by `obj`, as an `Operation` or `Tensor`.
@@ -3271,6 +3473,10 @@ class Graph(object):
     with self._lock:
       return self._nodes_by_name[name]
 
+  def _get_operation_by_tf_operation(self, tf_oper):
+    op_name = c_api.TF_OperationName(tf_oper)
+    return self._get_operation_by_name_unsafe(op_name)
+
   def get_tensor_by_name(self, name):
     """Returns the `Tensor` with the given `name`.
 
@@ -3305,8 +3511,7 @@ class Graph(object):
     Returns:
       The `Tensor` that represents `tf_output`.
     """
-    op_name = c_api.TF_OperationName(tf_output.oper)
-    op = self._get_operation_by_name_unsafe(op_name)
+    op = self._get_operation_by_tf_operation(tf_output.oper)
     return op.outputs[tf_output.index]
 
   def _next_id(self):
@@ -3972,8 +4177,8 @@ class Graph(object):
         ret.add(op)
     return ret
 
-  def _control_dependencies_for_inputs(self, input_tensors):
-    """For an op that takes `input_tensors` as inputs, compute control inputs.
+  def _control_dependencies_for_inputs(self, input_ops):
+    """For an op that takes `input_ops` as inputs, compute control inputs.
 
     The returned control dependencies should yield an execution that
     is equivalent to adding all control inputs in
@@ -3984,13 +4189,12 @@ class Graph(object):
     the explicit approach redundant.
 
     Args:
-      input_tensors: The direct data dependencies for an op to be created.
+      input_ops: The data input ops for an op to be created.
 
     Returns:
       A list of control inputs for the op to be created.
     """
     ret = []
-    input_ops = set([t.op for t in input_tensors])
     for controller in self._control_dependencies_stack:
       # If any of the input_ops already depends on the inputs from controller,
       # we say that the new op is dominated (by that input), and we therefore
@@ -4651,6 +4855,16 @@ def enable_eager_execution(config=None, device_policy=None):
      or if trying to create a context with nontrivial options which differ
      from those of the existing context.
   """
+  if config is not None and not isinstance(config, config_pb2.ConfigProto):
+    raise TypeError(
+        "config must be a tf.ConfigProto, but got %s" % type(config))
+  if device_policy not in (None, context.DEVICE_PLACEMENT_EXPLICIT,
+                           context.DEVICE_PLACEMENT_WARN,
+                           context.DEVICE_PLACEMENT_SILENT):
+    raise ValueError(
+        "device_policy must be one of None, tfe.DEVICE_PLACEMENT_EXPLICIT, "
+        "tfe.DEVICE_PLACEMENT_WARN, tfe.DEVICE_PLACEMENT_SILENT"
+    )
   # pylint: disable=protected-access
   if context._default_mode == context.GRAPH_MODE:
     graph_mode_has_been_used = (
@@ -4672,6 +4886,9 @@ def enable_eager_execution(config=None, device_policy=None):
                      " policy: %s." % (config, context._context._config,
                                        device_policy,
                                        context._context._device_policy))
+  else:
+    raise ValueError(
+        "tfe.enable_eager_execution has to be called at program startup.")
 
 
 def eager_run(main=None, argv=None):
@@ -4897,6 +5114,9 @@ class GraphKeys(object):
   # Key to collect local variables that are local to the machine and are not
   # saved/restored.
   LOCAL_VARIABLES = "local_variables"
+  # Key to collect local variables which are used to accumulate interal state
+  # to be used in tf.metrics.*.
+  METRIC_VARIABLES = "metric_variables"
   # Key to collect model variables defined by layers.
   MODEL_VARIABLES = "model_variables"
   # Key to collect Variable objects that will be trained by the
@@ -4957,10 +5177,14 @@ class GraphKeys(object):
   COND_CONTEXT = "cond_context"
   WHILE_CONTEXT = "while_context"
 
+  # Used to store v2 summary names.
+  _SUMMARY_COLLECTION = "_SUMMARY_V2"
+
   # List of all collections that keep track of variables.
   _VARIABLE_COLLECTIONS = [
       GLOBAL_VARIABLES,
       LOCAL_VARIABLES,
+      METRIC_VARIABLES,
       MODEL_VARIABLES,
       TRAINABLE_VARIABLES,
       MOVING_AVERAGE_VARIABLES,
@@ -5128,11 +5352,18 @@ class name_scope(object):  # pylint: disable=invalid-name
     """
     if self._in_eager_mode:
       self._old_name = self._ctx.scope_name
-      if self._name:
-        scope_name = (self._old_name + self._name + "/"
-                      if self._old_name else self._name + "/")
-      else:
+      if not self._name:
         scope_name = ""
+      else:
+        if self._name[-1] == "/":
+          # A trailing slash breaks out of nested name scopes, indicating a
+          # fully specified scope name, for compatibility with Graph.name_scope.
+          scope_name = self._name
+        else:
+          name_with_trailing_slash = self._name + "/"
+          scope_name = (
+              self._old_name + name_with_trailing_slash
+              if self._old_name else name_with_trailing_slash)
       self._ctx.scope_name = scope_name
       return scope_name
     else:
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index b1269b84bd2d3d2b2d27d559b89f49b117ddee90..7d279760c805756f8012224674d21be4c79dc6cb 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -31,9 +31,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import versions
@@ -78,7 +80,7 @@ class ResourceTest(test_util.TensorFlowTestCase):
 
 
 @test_util.with_c_api
-class TensorTest(test_util.TensorFlowTestCase):
+class TensorAndShapeTest(test_util.TensorFlowTestCase):
 
   def testShape(self):
     op = ops.Operation(
@@ -93,10 +95,48 @@ class TensorTest(test_util.TensorFlowTestCase):
         ops._NodeDef("FloatOutput", "myop"), ops.Graph(), [], [dtypes.float32])
     t = op.outputs[0]
     self.assertTrue(isinstance(t, ops.Tensor))
-    with self.assertRaisesRegexp(TypeError, "not iterable"):
+    with self.assertRaisesRegexp(TypeError, "iter"):
       for _ in t:
         pass
 
+  def testAddShape(self):
+    with self.test_session():
+      a = array_ops.zeros([2, 3])
+      b = array_ops.ones([1, 3])
+      c = a + b
+      self.assertEqual([2, 3], c.shape)
+
+  def testUnknownDim(self):
+    with self.test_session():
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None, 3])
+      b = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None, 3])
+      c = a + b
+      self.assertEqual([2, None, 3], c.shape.as_list())
+
+  def testUnknownShape(self):
+    with self.test_session():
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      b = array_ops.ones([1, 3])
+      c = a + b
+      self.assertEqual(tensor_shape.unknown_shape(), c.shape)
+
+  def testScalarShape(self):
+    with self.test_session():
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=[])
+      b = array_ops.ones([])
+      c = a + b
+      self.assertEqual(tensor_shape.scalar(), c.shape)
+
+  def testShapeFunctionError(self):
+    with self.test_session():
+      a = array_ops.ones([1, 2, 3])
+      b = array_ops.ones([4, 5, 6])
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"Dimensions must be equal, but are 2 and 5 for 'add' \(op: 'Add'\) "
+          r"with input shapes: \[1,2,3\], \[4,5,6\]."):
+        _ = a + b
+
 
 @test_util.with_c_api
 class IndexedSlicesTest(test_util.TensorFlowTestCase):
@@ -163,13 +203,13 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.float32, float_t.dtype)
     self.assertEqual(op, float_t.op)
     self.assertEqual(0, float_t._value_index)
-    self.assertEqual(0, len(float_t._consumers))
+    self.assertEqual(0, len(float_t.consumers()))
     self.assertEqual("myop", float_t._as_node_def_input())
 
     self.assertEqual(dtypes.string, label_str_t.dtype)
     self.assertEqual(op, label_str_t.op)
     self.assertEqual(1, label_str_t._value_index)
-    self.assertEqual(0, len(label_str_t._consumers))
+    self.assertEqual(0, len(label_str_t.consumers()))
     self.assertEqual("myop:1", label_str_t._as_node_def_input())
 
     self.assertProtoEquals("op:'FloatOutputStringOutput' name:'myop'",
@@ -183,8 +223,8 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(1, len(op2.inputs))
     self.assertIs(float_t, op2.inputs[0])
 
-    self.assertEqual(1, len(float_t._consumers))
-    self.assertEqual(op2, float_t._consumers[0])
+    self.assertEqual(1, len(float_t.consumers()))
+    self.assertEqual(op2, float_t.consumers()[0])
 
     self.assertProtoEquals("op:'FloatOutput' name:'myop1'", op1.node_def)
     self.assertProtoEquals("op:'FloatInput' name:'myop2' input:'myop1'",
@@ -203,14 +243,14 @@ class OperationTest(test_util.TensorFlowTestCase):
     op3 = test_ops.foo2(float1_t, label2_str_t, label2_str_t, name="myop3").d.op
     self.assertEqual(2, len(op3.values()))
 
-    self.assertEqual(1, len(float1_t._consumers))
-    self.assertEqual(op3, float1_t._consumers[0])
+    self.assertEqual(1, len(float1_t.consumers()))
+    self.assertEqual(op3, float1_t.consumers()[0])
 
-    self.assertEqual(0, len(float2_t._consumers))
+    self.assertEqual(0, len(float2_t.consumers()))
 
-    self.assertEqual(2, len(label2_str_t._consumers))
-    self.assertEqual(op3, label2_str_t._consumers[0])
-    self.assertEqual(op3, label2_str_t._consumers[1])
+    self.assertEqual(2, len(label2_str_t.consumers()))
+    self.assertEqual(op3, label2_str_t.consumers()[0])
+    self.assertEqual(op3, label2_str_t.consumers()[1])
 
     self.assertProtoEquals("""
     op:'Foo2' name:'myop3'
@@ -234,18 +274,23 @@ class OperationTest(test_util.TensorFlowTestCase):
     op1 = ops.Operation(
         ops._NodeDef("RefOutputFloatOutput", "op1"), g, [],
         [dtypes.float32_ref, dtypes.float32])
+    g._add_op(op1)
     self.assertProtoEquals("op:'RefOutputFloatOutput' name:'op1'", op1.node_def)
+    self.assertEquals([], list(op1.inputs))
     ref_t, nonref_t = op1.values()
     # NOTE(mrry): Must specify input_types to preserve ref-typed input.
     op2 = ops.Operation(
         ops._NodeDef("RefInputFloatInput", "op2"),
         g, [ref_t, nonref_t], [],
         input_types=[dtypes.float32_ref, dtypes.float32])
+    g._add_op(op2)
     self.assertProtoEquals(
         "op:'RefInputFloatInput' name:'op2' input:'op1' input:'op1:1'",
         op2.node_def)
+    self.assertEquals([ref_t, nonref_t], list(op2.inputs))
     op3 = ops.Operation(
         ops._NodeDef("TwoFloatInputs", "op3"), g, [ref_t, nonref_t], [])
+    g._add_op(op3)
     self.assertProtoEquals(
         "op:'TwoFloatInputs' name:'op3' input:'op1' input:'op1:1'",
         op3.node_def)
@@ -357,36 +402,59 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual("<tf.Operation 'op1' type=None>", repr(op))
 
   def testGetAttr(self):
-    # TODO(skyewm): implement get_attr with C API
-    if ops._USE_C_API: return
+    op = test_ops.default_attrs()
+    self.assertEqual(op.get_attr("string_val"), b"abc")
+    self.assertEqual(op.get_attr("string_list_val"), [b"abc", b""])
+    self.assertEqual(op.get_attr("int_val"), 123)
+    self.assertEqual(op.get_attr("int_list_val"), [1, 2, 3])
+    self.assertEqual(op.get_attr("float_val"), 10.0)
+    self.assertEqual(op.get_attr("float_list_val"), [10.0])
+    self.assertEqual(op.get_attr("bool_val"), True)
+    self.assertEqual(op.get_attr("bool_list_val"), [True, False])
+    self.assertEqual(op.get_attr("shape_val"),
+                     tensor_shape.as_shape([2, 1]).as_proto())
+    self.assertEqual(op.get_attr("shape_list_val"),
+                     [tensor_shape.as_shape([]).as_proto(),
+                      tensor_shape.as_shape([1]).as_proto()])
+    self.assertEqual(op.get_attr("tensor_val"),
+                     tensor_util.make_tensor_proto(1, dtypes.int32))
+    self.assertEqual(op.get_attr("tensor_list_val"),
+                     [tensor_util.make_tensor_proto(1, dtypes.int32)])
+
+    type_val = op.get_attr("type_val")
+    # First check that type_val is a DType, because the assertEquals will work
+    # no matter what since DType overrides __eq__
+    self.assertIsInstance(type_val, dtypes.DType)
+    self.assertEqual(type_val, dtypes.int32)
+
+    type_list_val = op.get_attr("type_list_val")
+    self.assertTrue(all(isinstance(x, dtypes.DType) for x in type_list_val))
+    self.assertEqual(type_list_val, [dtypes.int32, dtypes.float32])
+
+    @function.Defun(dtypes.float32, func_name="MyFunc")
+    def func(x):
+      return x
+
+    op = test_ops.func_attr(func)
+    self.assertEqual(op.get_attr("f"),
+                     attr_value_pb2.NameAttrList(name="MyFunc"))
+
+    # Try fetching missing attr
+    if ops._USE_C_API:
+      error_msg = "Operation 'FuncAttr' has no attr named 'FakeAttr'."
+    else:
+      error_msg = "No attr named 'FakeAttr' in name: \"FuncAttr\""
 
-    list_value = attr_value_pb2.AttrValue.ListValue()
-    list_value.type.append(types_pb2.DT_STRING)
-    list_value.type.append(types_pb2.DT_DOUBLE)
-    op = ops.Operation(
-        ops._NodeDef(
-            "None",
-            "op1",
-            attrs={
-                "value": attr_value_pb2.AttrValue(i=32),
-                "dtype": attr_value_pb2.AttrValue(type=types_pb2.DT_INT32),
-                "list": attr_value_pb2.AttrValue(list=list_value),
-                "func": attr_value_pb2.AttrValue(
-                    func=attr_value_pb2.NameAttrList())
-            }), ops.Graph(), [], [dtypes.int32])
-    self.assertEqual(32, op.get_attr("value"))
-    self.assertEqual("", op.get_attr("func").name)
-
-    d = op.get_attr("dtype")
-    # First check that d is a DType, because the assertEquals will
-    # work no matter what since DType overrides __eq__
-    self.assertIsInstance(d, dtypes.DType)
-    self.assertEqual(dtypes.int32, d)
-
-    l = op.get_attr("list")
-    for x in l:
-      self.assertIsInstance(x, dtypes.DType)
-    self.assertEqual([dtypes.string, dtypes.double], l)
+    with self.assertRaisesRegexp(ValueError, error_msg):
+      op.get_attr("FakeAttr")
+
+  # TODO(b/65162920): remove this test when users who are directly mutating the
+  # node_def have been updated to proper usage.
+  def testSetAttr(self):
+    op = test_ops.int_attr().op
+    op._set_attr("foo", attr_value_pb2.AttrValue(i=2))
+    # TODO(skyewm): add node_def check
+    self.assertEqual(op.get_attr("foo"), 2)
 
   # TODO(nolivia): test all error cases
   def testAddControlInput(self):
@@ -417,6 +485,30 @@ class OperationTest(test_util.TensorFlowTestCase):
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
 
+  def testRemoveAllControlInputs(self):
+    a = constant_op.constant(1)
+    with ops.control_dependencies([a]):
+      b = constant_op.constant(2)
+    c = constant_op.constant(3)
+    d = constant_op.constant(4)
+    e = constant_op.constant(5)
+    with ops.control_dependencies([a, c]):
+      f = d + e
+
+    self.assertEqual(a.op.control_inputs, [])
+    self.assertEqual(b.op.control_inputs, [a.op])
+    self.assertEqual(f.op.control_inputs, [a.op, c.op])
+
+    a.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(a.op.control_inputs, [])
+
+    b.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(b.op.control_inputs, [])
+
+    f.op._remove_all_control_inputs()  # pylint: disable=protected-access
+    self.assertEqual(f.op.control_inputs, [])
+    self.assertEqual(list(f.op.inputs), [d, e])
+
   def testControlInputCycle(self):
     # Non-C API path has a different error message
     if not ops._USE_C_API: return
@@ -443,16 +535,22 @@ class OperationTest(test_util.TensorFlowTestCase):
 
     z.op._update_input(0, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [y, y])
+    self.assertEquals(x.consumers(), [])
+    self.assertEquals(y.consumers(), [z.op, z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 4)
 
     z.op._update_input(0, x)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
+    self.assertEquals(x.consumers(), [z.op])
+    self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 3)
 
     z.op._update_input(1, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
+    self.assertEquals(x.consumers(), [z.op])
+    self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
       self.assertEquals(sess.run(z), 3)
 
@@ -467,8 +565,6 @@ class OperationTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
         z.op._update_input(0, x)  # pylint: disable=protected-access
 
-  # TODO(nolivia): check the shape/type in _update_input() instead of depending
-  # on run to do that.
   def testUpdateInputTypeError(self):
     g = ops.Graph()
     with g.as_default():
@@ -484,6 +580,37 @@ class OperationTest(test_util.TensorFlowTestCase):
           "with expected int32"):
         sess.run(z)
 
+  def testUpdateInputShapeError(self):
+    # C-API throws the error differently.
+    if ops._USE_C_API:
+      return
+    g = ops.Graph()
+    with g.as_default():
+      w = constant_op.constant(2, shape=[3, 1])
+      x = constant_op.constant(0, shape=[3, 1])
+      y = constant_op.constant(1, shape=[2, 2])
+      z = w + x
+      z.op._update_input(0, y)  # pylint: disable=protected-access
+
+    with session.Session(graph=g) as sess:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   r"Incompatible shapes: \[2,2\] vs. \[3,1\]"):
+        sess.run(z)
+
+  def testUpdateInputShapeErrorC(self):
+    if not ops._USE_C_API:
+      return
+    g = ops.Graph()
+    with g.as_default():
+      w = constant_op.constant(2, shape=[3, 1])
+      x = constant_op.constant(0, shape=[3, 1])
+      y = constant_op.constant(1, shape=[2, 2])
+      z = w + x
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Cannot update edge, incompatible shapes: \[2,2\] and \[3,1\]"):
+      z.op._update_input(0, y)  # pylint: disable=protected-access
+
   def testUpdateInputOutOfRange(self):
     # C-API throws the error differently.
     if ops._USE_C_API: return
@@ -499,9 +626,11 @@ class OperationTest(test_util.TensorFlowTestCase):
     g = ops.Graph()
     with g.as_default():
       x = constant_op.constant(1)
-    with self.assertRaisesRegexp(errors.OutOfRangeError,
-                                 r"Node 'Const' \(type: 'Const', "
-                                 r"num of inputs: 0\) does not have input 1"):
+    with self.assertRaisesRegexp(
+        errors.OutOfRangeError,
+        r"Cannot update edge. Input index \[1\] is greater than the number of "
+        r"total inputs \[0\]."
+    ):
       x.op._update_input(1, x)  # pylint: disable=protected-access
 
   def testOpDef(self):
@@ -519,6 +648,16 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(len(z.op.op_def.input_arg), 2)
     self.assertEqual(len(z.op.op_def.output_arg), 1)
 
+  def testInputFromDifferentGraphError(self):
+    g_0 = ops.Graph()
+    g_1 = ops.Graph()
+    with g_0.as_default():
+      x = constant_op.constant(1)
+    with g_1.as_default():
+      y = constant_op.constant(2)
+      with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
+        y * x  # pylint: disable=pointless-statement
+
 
 @test_util.with_c_api
 class CreateOpTest(test_util.TensorFlowTestCase):
@@ -578,6 +717,204 @@ class CreateOpTest(test_util.TensorFlowTestCase):
     g.create_op("FloatOutput", [], [dtypes.float32], None, name="myop1")
 
 
+# NOTE(skyewm): these cases test the private Graph._create_op_from_tf_operation
+# method. Arguably we should only test the public APIs that depend on this
+# method. However, this logic is complex and tricky, and it can be difficult to
+# ascertain if we have adequate coverage (e.g. a graph may run successfully if
+# the control flow context isn't set properly, but a more complicated use case
+# that might not be obvious to test will fail). Thus we instead explicitly test
+# the low-level behavior.
+@test_util.with_c_api
+class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(
+            g, ops._NodeDef("IntInputIntOutput", "myop"), [x], [])
+        op = g._create_op_from_tf_operation(c_op)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = test_ops.int_input_int_output(x, name="myop").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op.type, "IntInputIntOutput")
+    self.assertEqual(len(op.outputs), 1)
+    self.assertEqual(op.outputs[0].shape, tensor_shape.unknown_shape())
+    self.assertEqual(list(op.inputs), [x])
+    self.assertEqual(op.control_inputs, [])
+    self.assertEqual(op.graph, g)
+    self.assertEqual(x.consumers(), [op])
+    self.assertIsNotNone(op.traceback)
+    self.assertEqual(g.get_operation_by_name("myop"), op)
+    self.assertEqual(g.get_tensor_by_name("myop:0"), op.outputs[0])
+
+  def testShape(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(g, ops._NodeDef("Identity", "myop"), [x], [])
+        op = g._create_op_from_tf_operation(c_op)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = array_ops.identity(x, name="myop").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op.type, "Identity")
+    self.assertEqual(len(op.outputs), 1)
+    self.assertEqual(op.outputs[0].shape, tensor_shape.matrix(2, 3))
+
+  def testUniqueName(self):
+    g = ops.Graph()
+    with g.as_default():
+      if ops._USE_C_API:
+        c_op = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop"), [], [])
+        c_op2 = ops._create_c_op(g, ops._NodeDef("IntOutput", "myop_1"), [], [])
+        op = g._create_op_from_tf_operation(c_op)
+        op2 = g._create_op_from_tf_operation(c_op2)
+      else:
+        # Test pure-Python version to make sure C API has same behavior.
+        op = test_ops.int_output(name="myop").op
+        op2 = test_ops.int_output(name="myop_1").op
+
+      # Create ops with same names as op1 and op2. We expect the new names to be
+      # uniquified.
+      op3 = test_ops.int_output(name="myop").op
+      op4 = test_ops.int_output(name="myop_1").op
+
+    self.assertEqual(op.name, "myop")
+    self.assertEqual(op2.name, "myop_1")
+    self.assertEqual(op3.name, "myop_2")
+    self.assertEqual(op4.name, "myop_1_1")
+
+  def testCond(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+
+      def true_fn():
+        if ops._USE_C_API:
+          c_op = ops._create_c_op(ops.get_default_graph(),
+                                  ops._NodeDef("IntInput", "cond/myop"), [x],
+                                  [])
+          ops.get_default_graph()._create_op_from_tf_operation(c_op)
+        else:
+          # Test pure-Python version to make sure C API has same behavior.
+          test_ops.int_input(x, name="myop")
+        return x
+
+      control_flow_ops.cond(x < 10, true_fn, lambda: x)
+
+    op = g.get_operation_by_name("cond/myop")
+    self.assertIsNotNone(op)
+    self.assertEqual(op.name, "cond/myop")
+    self.assertEqual(op.type, "IntInput")
+    self.assertEqual(op.outputs, [])
+    op_input = op.inputs[0].op
+    self.assertEqual(op_input.type, "Switch")
+    self.assertEqual(op_input.inputs[0], x)
+    self.assertEqual(op.graph, g)
+    # pylint: disable=protected-access
+    self.assertIsNotNone(op._get_control_flow_context())
+    self.assertEqual(op._get_control_flow_context().name,
+                     "cond/cond_text")
+    # pylint: enable=protected-access
+
+  def testWhileLoop(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+
+      def body(i):
+        if ops._USE_C_API:
+          c_op = ops._create_c_op(ops.get_default_graph(),
+                                  ops._NodeDef("IntInput", "myloop/myop"), [x],
+                                  [])
+          ops.get_default_graph()._create_op_from_tf_operation(c_op)
+        else:
+          # Test pure-Python version to make sure C API has same behavior.
+          test_ops.int_input(x, name="myop")
+        return i
+
+      control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
+
+    op = g.get_operation_by_name("myloop/myop")
+    self.assertIsNotNone(op)
+    self.assertEqual(op.name, "myloop/myop")
+    self.assertEqual(op.type, "IntInput")
+    self.assertEqual(op.outputs, [])
+    op_input = op.inputs[0].op
+    self.assertEqual(op_input.type, "Enter")
+    self.assertEqual(list(op_input.inputs), [x])
+    self.assertEqual(op.graph, g)
+    # pylint: disable=protected-access
+    self.assertIsNotNone(op._get_control_flow_context())
+    self.assertEqual(op._get_control_flow_context().name,
+                     "myloop/while_context")
+    # pylint: enable=protected-access
+
+  def testWhileLoopWithInternalControlDep(self):
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+
+      def body(i):
+        c = constant_op.constant(1.0, name="c")
+        if ops._USE_C_API:
+          c_op = ops._create_c_op(ops.get_default_graph(),
+                                  ops._NodeDef("IntInput", "myloop/myop"), [x],
+                                  [])
+          with ops.control_dependencies([c]):
+            ops.get_default_graph()._create_op_from_tf_operation(c_op)
+        else:
+          with ops.control_dependencies([c]):
+            test_ops.int_input(x, name="myop")
+        return i
+
+      control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
+
+    op = g.get_operation_by_name("myloop/myop")
+    self.assertIsNotNone(op)
+    c = g.get_operation_by_name("myloop/c")
+    self.assertIsNotNone(c)
+    # Internal control dep is preserved
+    self.assertEqual(op.control_inputs, [c])
+
+  def testWhileLoopWithExternalControlDep(self):
+    # TODO(skyewm): enable once ControlFlowContext._RemoveExternalControlEdges
+    # works with C API enabled
+    if ops._USE_C_API: self.skipTest("Not yet implemented with C API enabled")
+
+    g = ops.Graph()
+    with g.as_default():
+      x = test_ops.int_output()
+      c = constant_op.constant(1.0)
+
+      def body(i):
+        if ops._USE_C_API:
+          c_op = ops._create_c_op(ops.get_default_graph(),
+                                  ops._NodeDef("IntInput", "myloop/myop"), [x],
+                                  [])
+          with ops.control_dependencies([c]):
+            ops.get_default_graph()._create_op_from_tf_operation(c_op)
+        else:
+          with ops.control_dependencies([c]):
+            test_ops.int_input(x, name="myop")
+        return i
+
+      control_flow_ops.while_loop(lambda i: i < 10, body, [0], name="myloop")
+
+    op = g.get_operation_by_name("myloop/myop")
+    self.assertIsNotNone(op)
+    self.assertEqual(len(op.control_inputs), 1)
+    # External control dep is removed and replaced with internal control dep
+    self.assertNotEqual(op.control_inputs[0], c.op)
+    self.assertIsNotNone(op.control_inputs[0]._get_control_flow_context())
+
+
 @test_util.with_c_api
 class ApplyOpTest(test_util.TensorFlowTestCase):
 
@@ -1238,6 +1575,29 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
     # e should be dominated by c.
     self.assertEqual(e.op.control_inputs, [])
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testEager(self):
+    def future():
+      future.calls += 1
+      return constant_op.constant(2.0)
+    future.calls = 0
+
+    if context.in_graph_mode():
+      g = ops.Graph()
+      with g.as_default():
+        a = constant_op.constant(1.0)
+        b = future()
+        with g.control_dependencies([a, b]):
+          c = constant_op.constant(3.0)
+      self.assertEqual(c.op.control_inputs, [a.op, b.op])
+      self.assertEqual(future.calls, 1)
+    else:
+      a = constant_op.constant(1.0)
+      b = future()
+      with ops.control_dependencies([a, b]):
+        c = constant_op.constant(3.0)
+      self.assertEqual(future.calls, 1)
+
   def testBasicWithConversion(self):
     g = ops.Graph()
     a = _apply_op(g, "FloatOutput", [], [dtypes.float32])
@@ -1401,6 +1761,37 @@ class ControlDependenciesTest(test_util.TensorFlowTestCase):
 @test_util.with_c_api
 class OpScopeTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testNames(self):
+    with ops.name_scope("foo") as foo:
+      self.assertEqual("foo/", foo)
+      with ops.name_scope("foo2") as foo2:
+        self.assertEqual("foo/foo2/", foo2)
+      with ops.name_scope(None) as empty1:
+        self.assertEqual("", empty1)
+        with ops.name_scope("foo3") as foo3:
+          self.assertEqual("foo3/", foo3)
+      with ops.name_scope("") as empty2:
+        self.assertEqual("", empty2)
+    with ops.name_scope("foo/") as outer_foo:
+      self.assertEqual("foo/", outer_foo)
+      with ops.name_scope("") as empty3:
+        self.assertEqual("", empty3)
+      with ops.name_scope("foo4") as foo4:
+        self.assertEqual("foo/foo4/", foo4)
+      with ops.name_scope("foo5//") as foo5:
+        self.assertEqual("foo5//", foo5)
+        with ops.name_scope("foo6") as foo6:
+          self.assertEqual("foo5//foo6/", foo6)
+      with ops.name_scope("/") as foo7:
+        self.assertEqual("/", foo7)
+      with ops.name_scope("//") as foo8:
+        self.assertEqual("//", foo8)
+      with ops.name_scope("a//b/c") as foo9:
+        self.assertEqual("foo/a//b/c/", foo9)
+    with ops.name_scope("a//b/c") as foo10:
+      self.assertEqual("a//b/c/", foo10)
+
   @test_util.run_in_graph_and_eager_modes()
   def testEagerDefaultScopeName(self):
     with ops.name_scope(None, "default") as scope:
@@ -1515,6 +1906,20 @@ class GraphTest(test_util.TensorFlowTestCase):
       self._AssertDefault(g0)
     self._AssertDefault(orig)
 
+  def testPreventFeeding(self):
+    g = ops.Graph()
+    a = constant_op.constant(2.0)
+    self.assertTrue(g.is_feedable(a))
+    g.prevent_feeding(a)
+    self.assertFalse(g.is_feedable(a))
+
+  def testPreventFetching(self):
+    g = ops.Graph()
+    a = constant_op.constant(2.0)
+    self.assertTrue(g.is_fetchable(a))
+    g.prevent_fetching(a.op)
+    self.assertFalse(g.is_fetchable(a))
+
   def testAsGraphElementConversions(self):
 
     class ConvertibleObj(object):
@@ -1558,6 +1963,24 @@ class GraphTest(test_util.TensorFlowTestCase):
     gc.collect()
     self.assertIsNone(g_ref())
 
+  def testRunnableAfterInvalidShape(self):
+    with ops.Graph().as_default():
+      with self.assertRaises(ValueError):
+        math_ops.add([1, 2], [1, 2, 3])
+      a = constant_op.constant(1)
+      with session.Session() as sess:
+        sess.run(a)
+
+  def testRunnableAfterInvalidShapeWithKernelLabelMap(self):
+    g = ops.Graph()
+    with g.as_default():
+      with g._kernel_label_map({"KernelLabelRequired": "overload_1"}):
+        with self.assertRaises(ValueError):
+          test_ops.kernel_label_required(1)
+      a = constant_op.constant(1)
+      with session.Session() as sess:
+        sess.run(a)
+
 
 @test_util.with_c_api
 class AttrScopeTest(test_util.TensorFlowTestCase):
@@ -1878,7 +2301,7 @@ class DenseTensorLikeTypeTest(test_util.TensorFlowTestCase):
 
   def testSuccess(self):
     op = ops.Operation(
-        ops._NodeDef("None", "myop"), ops.Graph(), [], [dtypes.float32])
+        ops._NodeDef("FloatOutput", "myop"), ops.Graph(), [], [dtypes.float32])
     t = op.outputs[0]
     self.assertTrue(ops.is_dense_tensor_like(t))
 
@@ -2077,6 +2500,13 @@ class InputTypesTest(test_util.TensorFlowTestCase):
       self.assertEqual([dtypes.double, dtypes.double], z.op._input_dtypes)
       # pylint: enable=protected-access
 
+  def testBadArgumentsToEnableEagerExecution(self):
+    with self.assertRaisesRegexp(TypeError, "config must be a tf.ConfigProto"):
+      ops.enable_eager_execution(context.DEVICE_PLACEMENT_SILENT)
+    with self.assertRaisesRegexp(ValueError, "device_policy must be one of"):
+      c = config_pb2.ConfigProto()
+      ops.enable_eager_execution(c, c)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 3c62dfd133d7b96045499253ecdbf3bbc0d4f798..72d3ea90fd60dd532ecd71ba4257f651db963625 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdio.h>
 #include <sstream>
 #include <unordered_map>
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb_text.h"
@@ -447,23 +448,48 @@ static void AddDelimiter(string* append_to, const string& delim) {
   if (!append_to->empty()) strings::StrAppend(append_to, delim);
 }
 
-GenPythonOp::GenPythonOp(const OpDef& op_def, const string& function_name)
+const ApiDef::Attr* FindAttr(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.attr_size(); ++i) {
+    if (api_def.attr(i).name() == name) {
+      return &api_def.attr(i);
+    }
+  }
+  return nullptr;
+}
+
+const ApiDef::Arg* FindInputArg(StringPiece name, const ApiDef& api_def) {
+  for (int i = 0; i < api_def.in_arg_size(); ++i) {
+    if (api_def.in_arg(i).name() == name) {
+      return &api_def.in_arg(i);
+    }
+  }
+  return nullptr;
+}
+
+GenPythonOp::GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                         const string& function_name)
     : op_def_(op_def),
+      api_def_(api_def),
       function_name_(function_name),
       num_outs_(op_def.output_arg_size()) {}
 
 GenPythonOp::~GenPythonOp() {}
 
 string GenPythonOp::Code() {
+  if (api_def_.visibility() == ApiDef::SKIP) {
+    return "";
+  }
   // This has all the input args followed by those attrs that don't have
   // defaults.
-  std::vector<string> args_no_default;
+  std::vector<ParamNames> params_no_default;
   // The parameters with defaults (these have to be listed after those without).
   // No input args are included, just attrs.
-  std::vector<string> args_with_defaults;
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg(op_def_.input_arg(i));
-    args_no_default.push_back(arg.name());
+  std::vector<ParamNames> params_with_default;
+
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    params_no_default.emplace_back(api_def_arg.name(), api_def_arg.rename_to());
     if (!arg.type_attr().empty()) {
       gtl::InsertIfNotPresent(&inferred_attrs_, arg.type_attr(), arg.name());
     } else if (!arg.type_list_attr().empty()) {
@@ -474,14 +500,14 @@ string GenPythonOp::Code() {
       gtl::InsertIfNotPresent(&inferred_attrs_, arg.number_attr(), arg.name());
     }
   }
-  for (int i = 0; i < op_def_.attr_size(); ++i) {
-    const auto& attr(op_def_.attr(i));
+  for (int i = 0; i < api_def_.attr_size(); ++i) {
+    const auto& attr(api_def_.attr(i));
     // Do not add inferred attrs to the Python function signature.
     if (inferred_attrs_.find(attr.name()) == inferred_attrs_.end()) {
       if (attr.has_default_value()) {
-        args_with_defaults.push_back(attr.name());
+        params_with_default.emplace_back(attr.name(), attr.rename_to());
       } else {
-        args_no_default.push_back(attr.name());
+        params_no_default.emplace_back(attr.name(), attr.rename_to());
       }
     }
   }
@@ -490,31 +516,35 @@ string GenPythonOp::Code() {
   // those with defaults go at the end.
   // Get the attrs in the order we want by taking the attrs without defaults
   // from the end of args_no_default, and adding args_no_default.
-  attrs_.reserve(args_no_default.size() - op_def_.input_arg_size() +
-                 args_with_defaults.size());
-  attrs_.insert(attrs_.end(),
-                args_no_default.begin() + op_def_.input_arg_size(),
-                args_no_default.end());
-  attrs_.insert(attrs_.end(), args_with_defaults.begin(),
-                args_with_defaults.end());
-
-  param_names_.reserve(args_no_default.size() + args_with_defaults.size());
+  attrs_.reserve(params_no_default.size() - op_def_.input_arg_size() +
+                 params_with_default.size());
+  for (int i = op_def_.input_arg_size(); i < params_no_default.size(); ++i) {
+    attrs_.push_back(params_no_default[i].GetName());
+  }
+  for (int i = 0; i < params_with_default.size(); ++i) {
+    attrs_.push_back(params_with_default[i].GetName());
+  }
+
+  param_names_.reserve(params_no_default.size() + params_with_default.size());
+  param_names_.insert(param_names_.begin(), params_no_default.begin(),
+                      params_no_default.end());
+  for (const auto& param : params_with_default) {
+    param_names_.push_back(param);
+  }
+
   string parameters;
-  for (const string& name : args_no_default) {
+  for (const auto& param : params_no_default) {
     AddDelimiter(&parameters, ", ");
-    const string param = AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param);
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param.GetRenameTo());
   }
-  for (const string& name : args_with_defaults) {
+  for (const auto& param_and_default : params_with_default) {
     AddDelimiter(&parameters, ", ");
-    const string param = AvoidPythonReserved(name);
-    strings::StrAppend(&parameters, param, "=None");
-    param_names_.push_back(param);
+    strings::StrAppend(&parameters, param_and_default.GetRenameTo(), "=None");
   }
   AddDelimiter(&parameters, ", ");
   strings::StrAppend(&parameters, "name=None");
 
+  AddExport();
   AddDefLine(parameters);
   AddDocStringDescription();
   AddDocStringArgs();
@@ -530,18 +560,38 @@ string GenPythonOp::Code() {
   return prelude_ + result_;
 }
 
+void GenPythonOp::AddExport() {
+  if (api_def_.visibility() != ApiDef::VISIBLE) {
+    return;
+  }
+
+  strings::StrAppend(&result_, "@tf_export(");
+
+  // Add all endpoint names to tf_export.
+  bool first_endpoint = true;
+  for (const auto& endpoint : api_def_.endpoint()) {
+    if (!first_endpoint) {
+      strings::StrAppend(&result_, ", ");
+    } else {
+      first_endpoint = false;
+    }
+    strings::StrAppend(&result_, "'", endpoint.name(), "'");
+  }
+  strings::StrAppend(&result_, ")\n");
+}
+
 void GenPythonOp::AddDefLine(const string& parameters) {
   strings::StrAppend(&result_, "def ", function_name_, "(", parameters, "):\n");
 }
 
 void GenPythonOp::AddDocStringDescription() {
   string comment;
-  if (op_def_.summary().empty()) {
+  if (api_def_.summary().empty()) {
     comment = "TODO: add doc.\n";
   } else {
-    comment = strings::StrCat(op_def_.summary(), "\n");
-    if (!op_def_.description().empty()) {
-      strings::StrAppend(&comment, "\n", Indent(2, 2, op_def_.description()));
+    comment = strings::StrCat(api_def_.summary(), "\n");
+    if (!api_def_.description().empty()) {
+      strings::StrAppend(&comment, "\n", Indent(2, 2, api_def_.description()));
     }
   }
   strings::StrAppend(&result_, "  r\"\"\"", comment, "\n");
@@ -552,14 +602,15 @@ void GenPythonOp::AddDocStringArgs() {
 }
 
 void GenPythonOp::AddDocStringInputs() {
-  for (int i = 0; i < op_def_.input_arg_size(); ++i) {
-    const auto& arg(op_def_.input_arg(i));
-    StringPiece description = op_def_.input_arg(i).description();
+  for (int i = 0; i < api_def_.arg_order_size(); ++i) {
+    const auto& arg = *FindInputArg(api_def_.arg_order(i), op_def_);
+    const auto& api_def_arg = *FindInputArg(api_def_.arg_order(i), api_def_);
+    StringPiece description = api_def_arg.description();
     string desc;
     if (ConsumeEquals(&description)) {  // Skip the generated type info.
-      desc = strings::StrCat(param_names_[i], ": ");
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ");
     } else {
-      desc = strings::StrCat(param_names_[i], ": ",
+      desc = strings::StrCat(param_names_[i].GetRenameTo(), ": ",
                              ArgTypeName(op_def_, arg, inferred_attrs_, false));
     }
     if (!description.empty()) {
@@ -572,7 +623,9 @@ void GenPythonOp::AddDocStringInputs() {
 void GenPythonOp::AddDocStringAttrs() {
   for (const string& name : attrs_) {
     const auto& attr = *FindAttr(name, op_def_);
-    string desc = strings::StrCat(AvoidPythonReserved(name), ": ");
+    const auto& api_def_attr = *FindAttr(name, api_def_);
+    string desc =
+        strings::StrCat(AvoidPythonReserved(api_def_attr.rename_to()), ": ");
 
     static const char* const kAttrTypeName[][2] = {
         {"string", "`string`"},
@@ -596,7 +649,7 @@ void GenPythonOp::AddDocStringAttrs() {
     for (size_t i = 0; i < TF_ARRAYSIZE(kAttrTypeName); ++i) {
       if (attr.type() == kAttrTypeName[i][0]) {
         string s;
-        if (attr.has_default_value()) {
+        if (api_def_attr.has_default_value()) {
           s = strings::StrCat("optional ", kAttrTypeName[i][1]);
         } else {
           s = kAttrTypeName[i][1];
@@ -625,14 +678,13 @@ void GenPythonOp::AddDocStringAttrs() {
 
     strings::StrAppend(&desc, ".");
 
-    if (attr.has_default_value()) {
-      strings::StrAppend(&desc, " Defaults to `",
-                         AttrValueToPython(attr.type(), attr.default_value()),
-                         "`.");
+    if (api_def_attr.has_default_value()) {
+      strings::StrAppend(
+          &desc, " Defaults to `",
+          AttrValueToPython(attr.type(), api_def_attr.default_value()), "`.");
     }
-
-    if (!attr.description().empty()) {
-      AppendWithinWidth(&desc, attr.description(),
+    if (!api_def_attr.description().empty()) {
+      AppendWithinWidth(&desc, api_def_attr.description(),
                         kRightMargin - 4 /* indent */);
     }
     strings::StrAppend(&result_, Indent(4, 6, desc));
@@ -650,8 +702,8 @@ void GenPythonOp::AddOutputGlobals() {
     // Prepare the list of output names
     std::vector<string> out_names(num_outs_);
     for (int i = 0; i < num_outs_; ++i) {
-      if (!op_def_.output_arg(i).name().empty()) {
-        out_names[i] = op_def_.output_arg(i).name();
+      if (!api_def_.out_arg(i).rename_to().empty()) {
+        out_names[i] = api_def_.out_arg(i).rename_to();
       } else {
         out_names[i] = strings::StrCat("output", i);
       }
@@ -703,7 +755,8 @@ void GenPythonOp::AddBody(const string& prefix) {
 void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
   string args = strings::StrCat("\"", op_def_.name(), "\", ");
   for (size_t i = 0; i < param_names_.size(); ++i) {
-    strings::StrAppend(&args, param_names_[i], "=", param_names_[i], ", ");
+    strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
+                       "=", param_names_[i].GetRenameTo(), ", ");
   }
   strings::StrAppend(&args, "name=name)");
 
@@ -714,11 +767,14 @@ void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
 
 }  // namespace python_op_gen_internal
 
-string GetPythonOp(const OpDef& op_def, const string& function_name) {
-  return python_op_gen_internal::GenPythonOp(op_def, function_name).Code();
+string GetPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                   const string& function_name) {
+  return python_op_gen_internal::GenPythonOp(op_def, api_def, function_name)
+      .Code();
 }
 
-string GetPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
+string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
+                    const std::vector<string>& hidden_ops,
                     bool require_shapes) {
   string result;
   // Header
@@ -738,6 +794,7 @@ from tensorflow.python.framework import common_shapes as _common_shapes
 from tensorflow.python.framework import op_def_registry as _op_def_registry
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework import op_def_library as _op_def_library
+from tensorflow.python.util.tf_export import tf_export
 )");
 
   // We'll make a copy of ops that filters out descriptions.
@@ -766,7 +823,8 @@ from tensorflow.python.framework import op_def_library as _op_def_library
       continue;
     }
 
-    strings::StrAppend(&result, GetPythonOp(op_def, function_name));
+    const auto* api_def = api_defs.GetApiDef(op_def.name());
+    strings::StrAppend(&result, GetPythonOp(op_def, *api_def, function_name));
 
     if (!require_shapes) {
       strings::StrAppend(&result, "_ops.RegisterShape(\"", op_def.name(),
@@ -799,16 +857,18 @@ from tensorflow.python.framework import op_def_library as _op_def_library
   return result;
 }
 
-void PrintPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
+void PrintPythonOps(const OpList& ops, const ApiDefMap& api_defs,
+                    const std::vector<string>& hidden_ops,
                     bool require_shapes) {
-  printf("%s", GetPythonOps(ops, hidden_ops, require_shapes).c_str());
+  printf("%s", GetPythonOps(ops, api_defs, hidden_ops, require_shapes).c_str());
 }
 
 string GetPythonWrappers(const char* op_list_buf, size_t op_list_len) {
   string op_list_str(op_list_buf, op_list_len);
   OpList ops;
   ops.ParseFromString(op_list_str);
-  return GetPythonOps(ops, {}, false);
+  ApiDefMap api_def_map(ops);
+  return GetPythonOps(ops, api_def_map, {}, false);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/python/framework/python_op_gen.h b/tensorflow/python/framework/python_op_gen.h
index f485044c5aff2de07339481899b7c35249291976..4d20888dc634620515b17c4824341cdab6d6bb02 100644
--- a/tensorflow/python/framework/python_op_gen.h
+++ b/tensorflow/python/framework/python_op_gen.h
@@ -18,20 +18,23 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-// hidden_ops should be a comma-separated
-// list of Op names that should get a leading _ in the output.
+// hidden_ops should be a vector of Op names that should get a leading _ in the
+// output.
 // The Print* version prints the output to stdout, Get* version returns the
 // output as a string.
-void PrintPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
-                    bool require_shapes);
-string GetPythonOps(const OpList& ops, const std::vector<string>& hidden_ops,
-                    bool require_shapes);
-string GetPythonOp(const OpDef& op_def, const string& function_name);
+void PrintPythonOps(const OpList& ops, const ApiDefMap& api_defs,
+                    const std::vector<string>& hidden_ops, bool require_shapes);
+string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
+                    const std::vector<string>& hidden_ops, bool require_shapes);
+string GetPythonOp(const OpDef& op_def, const ApiDef& api_def,
+                   const string& function_name);
 
 // Get the python wrappers for a list of ops in a OpList.
 // `op_list_buf` should be a pointer to a buffer containing
diff --git a/tensorflow/python/framework/python_op_gen_internal.h b/tensorflow/python/framework/python_op_gen_internal.h
index 92237ac81a2f2eaf20a46d613a51d2ce80c9cfd3..6b53825a6d325c00eaf9f60fbcd9d4e0f9c9183c 100644
--- a/tensorflow/python/framework/python_op_gen_internal.h
+++ b/tensorflow/python/framework/python_op_gen_internal.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -40,9 +41,32 @@ void GenerateLowerCaseOpName(const string& str, string* result);
 
 string DataTypeToPython(DataType dtype, const string& dtype_module);
 
+// Names that corresponds to a single input parameter.
+class ParamNames {
+ public:
+  // Create param based on Arg.
+  ParamNames(const string& name, const string& rename_to) : name_(name) {
+    rename_to_ = AvoidPythonReserved(rename_to);
+  }
+
+  // Get original parameter name.
+  string GetName() const { return name_; }
+
+  // Get the name to rename the parameter to. Note that AvoidPythonReserved
+  // has already been applied.
+  string GetRenameTo() const { return rename_to_; }
+
+ private:
+  // Original parameter name.
+  string name_;
+  // API name for this parameter.
+  string rename_to_;
+};
+
 class GenPythonOp {
  public:
-  GenPythonOp(const OpDef& op_def, const string& function_name);
+  GenPythonOp(const OpDef& op_def, const ApiDef& api_def,
+              const string& function_name);
   virtual ~GenPythonOp();
 
   virtual string Code();
@@ -62,9 +86,11 @@ class GenPythonOp {
   void AddDocStringOutputs();
   void AddBody(const string& prefix);
   void AddBodyNoReturn(const string& apply_prefix);
+  void AddExport();
 
   // From constructor arguments
   const OpDef& op_def_;
+  const ApiDef& api_def_;
   const string function_name_;
   const int num_outs_;
 
@@ -80,7 +106,7 @@ class GenPythonOp {
 
   // All parameters, including inputs & non-inferred attrs, required and those
   // with defaults, except "name"
-  std::vector<string> param_names_;
+  std::vector<ParamNames> param_names_;
 };
 
 }  // namespace python_op_gen_internal
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index f681daa7e46474c9478cf9c52098158bfb357862..bc5ca195da50499c6fbab822a9a093be3f0277e0 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -103,11 +104,25 @@ string InferSourceFileName(const char* argv_zero) {
 }
 
 void PrintAllPythonOps(const std::vector<string>& op_list,
+                       const std::vector<string>& api_def_dirs,
                        const string& source_file_name, bool require_shapes,
                        bool op_list_is_whitelist) {
   OpList ops;
   OpRegistry::Global()->Export(false, &ops);
 
+  ApiDefMap api_def_map(ops);
+  if (!api_def_dirs.empty()) {
+    Env* env = Env::Default();
+
+    for (const auto& api_def_dir : api_def_dirs) {
+      std::vector<string> api_files;
+      TF_CHECK_OK(env->GetMatchingPaths(io::JoinPath(api_def_dir, "*.pbtxt"),
+                                        &api_files));
+      TF_CHECK_OK(api_def_map.LoadFileList(env, api_files));
+    }
+    api_def_map.UpdateDocs();
+  }
+
   if (op_list_is_whitelist) {
     std::unordered_set<string> whitelist(op_list.begin(), op_list.end());
     OpList pruned_ops;
@@ -116,9 +131,11 @@ void PrintAllPythonOps(const std::vector<string>& op_list,
         *pruned_ops.mutable_op()->Add() = op_def;
       }
     }
-    PrintEagerPythonOps(pruned_ops, {}, require_shapes, source_file_name);
+    PrintEagerPythonOps(pruned_ops, api_def_map, {}, require_shapes,
+                        source_file_name);
   } else {
-    PrintEagerPythonOps(ops, op_list, require_shapes, source_file_name);
+    PrintEagerPythonOps(ops, api_def_map, op_list, require_shapes,
+                        source_file_name);
   }
 }
 
@@ -132,23 +149,30 @@ int main(int argc, char* argv[]) {
       tensorflow::InferSourceFileName(argv[0]);
 
   // Usage:
-  //   gen_main [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1]
-  if (argc == 2) {
-    tensorflow::PrintAllPythonOps({}, source_file_name,
-                                  tensorflow::string(argv[1]) == "1",
-                                  false /* op_list_is_whitelist */);
-  } else if (argc == 3) {
-    std::vector<tensorflow::string> hidden_ops;
-    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &hidden_ops));
-    tensorflow::PrintAllPythonOps(hidden_ops, source_file_name,
+  //   gen_main api_def_dir1,api_def_dir2,...
+  //       [ @FILENAME | OpName[,OpName]* ] (0 | 1) [0 | 1]
+  if (argc < 3) {
+    return -1;
+  }
+  std::vector<tensorflow::string> api_def_dirs = tensorflow::str_util::Split(
+      argv[1], ",", tensorflow::str_util::SkipEmpty());
+
+  if (argc == 3) {
+    tensorflow::PrintAllPythonOps({}, api_def_dirs, source_file_name,
                                   tensorflow::string(argv[2]) == "1",
                                   false /* op_list_is_whitelist */);
   } else if (argc == 4) {
+    std::vector<tensorflow::string> hidden_ops;
+    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[2], &hidden_ops));
+    tensorflow::PrintAllPythonOps(hidden_ops, api_def_dirs, source_file_name,
+                                  tensorflow::string(argv[3]) == "1",
+                                  false /* op_list_is_whitelist */);
+  } else if (argc == 5) {
     std::vector<tensorflow::string> op_list;
-    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[1], &op_list));
-    tensorflow::PrintAllPythonOps(op_list, source_file_name,
-                                  tensorflow::string(argv[2]) == "1",
-                                  tensorflow::string(argv[3]) == "1");
+    TF_CHECK_OK(tensorflow::ParseOpListCommandLine(argv[2], &op_list));
+    tensorflow::PrintAllPythonOps(op_list, api_def_dirs, source_file_name,
+                                  tensorflow::string(argv[3]) == "1",
+                                  tensorflow::string(argv[4]) == "1");
   } else {
     return -1;
   }
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 10f5579ae599bcff641ada8bb7c2b50f7a54de63..6218cc34cad50aa6e291dcffcf352c717e0d85f0 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -93,8 +93,7 @@ class SparseTensor(_TensorLike):
 
   @classmethod
   def from_value(cls, sparse_tensor_value):
-    if not (isinstance(sparse_tensor_value, SparseTensor) or
-            isinstance(sparse_tensor_value, SparseTensorValue)):
+    if not is_sparse(sparse_tensor_value):
       raise TypeError("Neither a SparseTensor nor SparseTensorValue: %s." %
                       sparse_tensor_value)
     return SparseTensor(
@@ -253,3 +252,17 @@ def convert_to_tensor_or_sparse_tensor(value, dtype=None, name=None):
     return value
   return ops.internal_convert_to_tensor(
       value, dtype=dtype, name=name)
+
+
+def is_sparse(x):
+  """Check whether `x` is sparse.
+
+  Check whether an object is a `tf.SparseTensor` or `tf.SparseTensorValue`.
+
+  Args:
+    x: A python object to check.
+
+  Returns:
+    `True` iff `x` is a `tf.SparseTensor` or `tf.SparseTensorValue`.
+  """
+  return isinstance(x, (SparseTensor, SparseTensorValue))
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index e709eaeda14e1eaae93ff39a4dc6b85970e976e1..c001fed3b058fe1e7f01f6a4f32b125783ed935e 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -51,6 +53,16 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(sess_run_value.values, value.values)
         self.assertAllEqual(sess_run_value.dense_shape, value.dense_shape)
 
+  def testIsSparse(self):
+    self.assertFalse(sparse_tensor.is_sparse(3))
+    self.assertFalse(sparse_tensor.is_sparse("foo"))
+    self.assertFalse(sparse_tensor.is_sparse(np.array(3)))
+    self.assertTrue(
+        sparse_tensor.is_sparse(sparse_tensor.SparseTensor([[0]], [0], [1])))
+    self.assertTrue(
+        sparse_tensor.is_sparse(
+            sparse_tensor.SparseTensorValue([[0]], [0], [1])))
+
 
 class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 7e74c19124ee7942ba90b8c22e9712e4d39f0480..1b90c7ad4d68287bfa5c1c74c82d2936a20e4a80 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -49,8 +49,20 @@ def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
   tensor_proto.half_val.extend([
       ExtractBitsFromFloat16(x) for x in proto_values])
 
+
+def ExtractBitsFromBFloat16(x):
+  return np.asscalar(
+      np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+
+
+def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
+  tensor_proto.half_val.extend([
+      ExtractBitsFromBFloat16(x) for x in proto_values])
+
+
 if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
+      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
       # TODO(sesse): We should have a
       # fast_tensor_util.AppendFloat16ArrayToTensorProto,
       # but it seems np.float16_t doesn't exist?
@@ -121,6 +133,7 @@ else:
     tensor_proto.bool_val.extend([np.asscalar(x) for x in proto_values])
 
   _NP_TO_APPEND_FN = {
+      dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
       np.float16: SlowAppendFloat16ArrayToTensorProto,
       np.float32: SlowAppendFloat32ArrayToTensorProto,
       np.float64: SlowAppendFloat64ArrayToTensorProto,
@@ -286,6 +299,7 @@ _TF_TO_IS_OK = {
     dtypes.bool: [_FilterBool],
     dtypes.complex128: [_FilterComplex],
     dtypes.complex64: [_FilterComplex],
+    dtypes.float16: [_FilterFloat],
     dtypes.float32: [_FilterFloat],
     dtypes.float64: [_FilterFloat],
     dtypes.int16: [_FilterInt],
@@ -873,7 +887,7 @@ def is_tensor(x):  # pylint: disable=invalid-name
   `isinstance(x, [tf.Tensor, tf.SparseTensor, tf.Variable])`.
 
   Args:
-    x: An python object to check.
+    x: A python object to check.
 
   Returns:
     `True` if `x` is a tensor, `False` if not.
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index b4f28cfce0d1897c2b3be649971a8ddc06f6998d..f2de69e159646b4a085645fa1bfef7782e78cd59 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -51,9 +51,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -69,9 +69,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], dtype=dtypes.float32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -87,9 +87,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10, 20, 30], dtype=dtypes.float32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -106,9 +106,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(arr, dtype=dtypes.float32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -124,9 +124,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[1, 3])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 1 } dim { size: 3 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 1 } dim { size: 3 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -142,9 +142,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10.0, 20.0, 30.0], shape=[3, 1])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_FLOAT  
-        tensor_shape { dim { size: 3 } dim { size: 1 } }  
-        tensor_content: "A \000\000A\240\000\000A\360\000\000"  
+        dtype: DT_FLOAT
+        tensor_shape { dim { size: 3 } dim { size: 1 } }
+        tensor_content: "A \000\000A\240\000\000A\360\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -170,9 +170,9 @@ class TensorUtilTest(test.TestCase):
         np.array([[10.0, 20.0, 30.0]], dtype=np.float64))
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_DOUBLE  
-        tensor_shape { dim { size: 1 } dim { size: 3 } }  
-        tensor_content: "@$\000\000\000\000\000\000@4\000\000\000\000\000\000@>\000\000\000\000\000\000"  
+        dtype: DT_DOUBLE
+        tensor_shape { dim { size: 1 } dim { size: 3 } }
+        tensor_content: "@$\000\000\000\000\000\000@4\000\000\000\000\000\000@>\000\000\000\000\000\000"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -261,9 +261,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto([10, 20, 30, 40], shape=[2, 2])
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_INT32  
-        tensor_shape { dim { size: 2 } dim { size: 2 } }  
-        tensor_content: "\000\000\000\\n\000\000\000\024\000\000\000\036\000\000\000("  
+        dtype: DT_INT32
+        tensor_shape { dim { size: 2 } dim { size: 2 } }
+        tensor_content: "\000\000\000\\n\000\000\000\024\000\000\000\036\000\000\000("
         """, t)
     else:
       self.assertProtoEquals("""
@@ -342,9 +342,9 @@ class TensorUtilTest(test.TestCase):
         [10, 20, 30], shape=[1, 3], dtype=dtypes.int64)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_INT64  
-        tensor_shape { dim { size: 1 } dim { size: 3 } }  
-        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"  
+        dtype: DT_INT64
+        tensor_shape { dim { size: 1 } dim { size: 3 } }
+        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -360,9 +360,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(np.array([10, 20, 30]))
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_INT64  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"  
+        dtype: DT_INT64
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\000\000\000\000\000\000\\n\000\000\000\000\000\000\000\024\000\000\000\000\000\000\000\036"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -381,9 +381,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint32)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_QINT32  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\000\000\025\000\000\000\026\000\000\000\027"  
+        dtype: DT_QINT32
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\000\000\025\000\000\000\026\000\000\000\027"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -418,9 +418,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.quint16)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_QUINT16  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\025\000\026\000\027"  
+        dtype: DT_QUINT16
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\025\000\026\000\027"
         """, t)
     else:
       self.assertProtoEquals("""
@@ -435,9 +435,9 @@ class TensorUtilTest(test.TestCase):
     t = tensor_util.make_tensor_proto(data, dtype=dtypes.qint16)
     if sys.byteorder == "big":
       self.assertProtoEquals("""
-        dtype: DT_QINT16  
-        tensor_shape { dim { size: 3 } }  
-        tensor_content: "\000\025\000\026\000\027"  
+        dtype: DT_QINT16
+        tensor_shape { dim { size: 3 } }
+        tensor_content: "\000\025\000\026\000\027"
         """, t)
     else:
       self.assertProtoEquals("""
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index ead756a0a1344df8fdb5415c8ff1a1c90018a451..dbabce096294608b7d7df06e2b5355f5f0a6e9c2 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -26,6 +26,16 @@ REGISTER_OP("KernelLabel")
     .Output("result: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("KernelLabelRequired")
+    .Input("input: int32")
+    .Output("result: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &out));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("GraphDefVersion")
     .Output("version: int32")
     .SetIsStateful()
@@ -104,6 +114,14 @@ REGISTER_KERNEL_BUILDER(Name("KernelLabel")
                             .Label("overload_2"),
                         KernelLabelOp<OVERLOAD_2_LABEL>);
 
+// All "KernelLabelRequired" kernels have labels
+REGISTER_KERNEL_BUILDER(
+    Name("KernelLabelRequired").Device(DEVICE_CPU).Label("overload_1"),
+    KernelLabelOp<OVERLOAD_1_LABEL>);
+REGISTER_KERNEL_BUILDER(
+    Name("KernelLabelRequired").Device(DEVICE_CPU).Label("overload_2"),
+    KernelLabelOp<OVERLOAD_2_LABEL>);
+
 class GraphDefVersionOp : public OpKernel {
  public:
   explicit GraphDefVersionOp(OpKernelConstruction* ctx)
@@ -252,6 +270,11 @@ REGISTER_OP("IntInput")
     .Input("a: int32")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("IntInputIntOutput")
+    .Input("a: int32")
+    .Output("b: int32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("FloatInput")
     .Input("a: float32")
     .SetShapeFn(shape_inference::UnknownShape);
@@ -331,4 +354,37 @@ REGISTER_OP("OpWithDefaultAttr")
 REGISTER_OP("OpWithFutureDefaultAttr")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("IntAttr")
+    .Output("out: int64")
+    .Attr("foo: int = 1")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("StringListAttr")
+    .Attr("a: list(string)")
+    .Attr("b: string")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("DefaultAttrs")
+    .Attr("string_val: string = 'abc'")
+    .Attr("string_list_val: list(string) = ['abc', '']")
+    .Attr("int_val: int = 123")
+    .Attr("int_list_val: list(int) = [1, 2, 3]")
+    .Attr("float_val: float = 10.0")
+    .Attr("float_list_val: list(float) = [10.0]")
+    .Attr("bool_val: bool = true")
+    .Attr("bool_list_val: list(bool) = [true, false]")
+    .Attr("type_val: type = DT_INT32")
+    .Attr("type_list_val: list(type) = [DT_INT32, DT_FLOAT]")
+    .Attr("shape_val: shape = { dim { size: 2 } dim { size: 1 } }")
+    .Attr("shape_list_val: list(shape) = [{}, { dim { size: 1} }]")
+    .Attr("tensor_val: tensor = { dtype: DT_INT32 tensor_shape: {} int_val: 1}")
+    .Attr(
+        "tensor_list_val: list(tensor) = "
+        "[{ dtype: DT_INT32 tensor_shape: {} int_val: 1}]")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("FuncAttr")
+    .Attr("f: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index e545f6de8e66e2bf062249f4221fa340965ac69c..ae3b6c584a4d2f12d7eb4270e34c507148736fe0 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import gc
 import math
 import random
 import re
@@ -59,6 +60,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util import nest
 from tensorflow.python.util.protobuf import compare
 
 
@@ -452,9 +454,43 @@ class IsolateTest(object):
         type_arg, value_arg, traceback_arg)
 
 
-def run_in_graph_and_eager_modes(__unused__=None, graph=None, config=None,
-                                 use_gpu=False, force_gpu=False,
-                                 reset_test=True):
+def assert_no_garbage_created(f):
+  """Test method decorator to assert that no garbage has been created.
+
+  Note that this decorator sets DEBUG_SAVEALL, which in some Python interpreters
+  cannot be un-set (i.e. will disable garbage collection for any other unit
+  tests in the same file/shard).
+
+  Args:
+    f: The function to decorate.
+  Returns:
+    The decorated function.
+  """
+
+  def decorator(self, **kwargs):
+    """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage."""
+    gc.disable()
+    previous_debug_flags = gc.get_debug()
+    gc.set_debug(gc.DEBUG_SAVEALL)
+    gc.collect()
+    previous_garbage = len(gc.garbage)
+    f(self, **kwargs)
+    gc.collect()
+    # This will fail if any garbage has been created, typically because of a
+    # reference cycle.
+    self.assertEqual(previous_garbage, len(gc.garbage))
+    # TODO(allenl): Figure out why this debug flag reset doesn't work. It would
+    # be nice to be able to decorate arbitrary tests in a large test suite and
+    # not hold on to every object in other tests.
+    gc.set_debug(previous_debug_flags)
+    gc.enable()
+  return decorator
+
+
+def run_in_graph_and_eager_modes(
+    __unused__=None, graph=None, config=None,
+    use_gpu=False, force_gpu=False,
+    reset_test=True, assert_no_eager_garbage=False):
   """Runs the test in both graph and eager modes.
 
   Args:
@@ -465,7 +501,14 @@ def run_in_graph_and_eager_modes(__unused__=None, graph=None, config=None,
     use_gpu: If True, attempt to run as many ops as possible on GPU.
     force_gpu: If True, pin all ops to `/device:GPU:0`.
     reset_test: If True, tearDown and SetUp the test case again.
-
+    assert_no_eager_garbage: If True, sets DEBUG_SAVEALL on the garbage
+      collector and asserts that no extra garbage has been created when running
+      the test in eager mode. This will fail if there are reference cycles
+      (e.g. a = []; a.append(a)). Off by default because some tests may create
+      garbage for legitimate reasons (e.g. they define a class which inherits
+      from `object`), and because DEBUG_SAVEALL is sticky in some Python
+      interpreters (meaning that tests which rely on objects being collected
+      elsewhere in the unit test file will not work).
   Returns:
     Returns a decorator that will run the decorated test function
         using both a graph and using eager execution.
@@ -487,7 +530,7 @@ def run_in_graph_and_eager_modes(__unused__=None, graph=None, config=None,
         self.tearDown()
         self.setUp()
 
-      def run_eager_mode():
+      def run_eager_mode(self, **kwargs):
         if force_gpu:
           gpu_name = gpu_device_name()
           if not gpu_name:
@@ -501,9 +544,12 @@ def run_in_graph_and_eager_modes(__unused__=None, graph=None, config=None,
           with context.device("/device:CPU:0"):
             f(self, **kwargs)
 
+      if assert_no_eager_garbage:
+        run_eager_mode = assert_no_garbage_created(run_eager_mode)
+
       with context.eager_mode():
         with IsolateTest():
-          run_eager_mode()
+          run_eager_mode(self, **kwargs)
 
     return decorated
   return decorator
@@ -670,23 +716,22 @@ class TensorFlowTestCase(googletest.TestCase):
       fail_msg += " : %r" % (msg) if msg else ""
       self.fail(fail_msg)
 
-  def _eval_helper(self, tensors):
-    if isinstance(tensors, ops.EagerTensor):
-      return tensors.numpy()
-    if isinstance(tensors, resource_variable_ops.ResourceVariable):
-      return tensors.read_value().numpy()
-
-    if isinstance(tensors, tuple):
-      return tuple([self._eval_helper(t) for t in tensors])
-    elif isinstance(tensors, list):
-      return [self._eval_helper(t) for t in tensors]
-    elif isinstance(tensors, dict):
-      assert not tensors, "Only support empty dict now."
-      return dict()
-    elif tensors is None:
+  def _eval_tensor(self, tensor):
+    if tensor is None:
       return None
+    elif isinstance(tensor, ops.EagerTensor):
+      return tensor.numpy()
+    elif isinstance(tensor, resource_variable_ops.ResourceVariable):
+      return tensor.read_value().numpy()
+    elif callable(tensor):
+      return self._eval_helper(tensor())
     else:
-      raise ValueError("Unsupported type %s." % type(tensors))
+      raise ValueError("Unsupported type %s." % type(tensor))
+
+  def _eval_helper(self, tensors):
+    if tensors is None:
+      return None
+    return nest.map_structure(self._eval_tensor, tensors)
 
   def evaluate(self, tensors):
     """Evaluates tensors and returns numpy values.
@@ -701,7 +746,11 @@ class TensorFlowTestCase(googletest.TestCase):
       return self._eval_helper(tensors)
     else:
       sess = ops.get_default_session()
-      return sess.run(tensors)
+      if sess is None:
+        with self.test_session() as sess:
+          return sess.run(tensors)
+      else:
+        return sess.run(tensors)
 
   # pylint: disable=g-doc-return-or-yield
   @contextlib.contextmanager
@@ -935,10 +984,10 @@ class TensorFlowTestCase(googletest.TestCase):
       err: A float value.
       msg: An optional string message to append to the failure message.
     """
-    self.assertTrue(
-        math.fabs(f1 - f2) <= err,
-        "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
-                               if msg is not None else ""))
+    # f1 == f2 is needed here as we might have: f1, f2 = inf, inf
+    self.assertTrue(f1 == f2 or math.fabs(f1 - f2) <= err,
+                    "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
+                                           if msg is not None else ""))
 
   def assertArrayNear(self, farray1, farray2, err):
     """Asserts that two float arrays are near each other.
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index b2f8d62095f75ba55344a63401525ea998a70b47..90b529062635612db4905db365ad334a4245abe5 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -183,11 +183,13 @@ class TestUtilTest(test_util.TensorFlowTestCase):
 
   def _WeMustGoDeeper(self, msg):
     with self.assertRaisesOpError(msg):
-      node_def = ops._NodeDef("op_type", "name")
-      node_def_orig = ops._NodeDef("op_type_orig", "orig")
-      op_orig = ops.Operation(node_def_orig, ops.get_default_graph())
-      op = ops.Operation(node_def, ops.get_default_graph(), original_op=op_orig)
-      raise errors.UnauthenticatedError(node_def, op, "true_err")
+      with ops.Graph().as_default():
+        node_def = ops._NodeDef("op_type", "name")
+        node_def_orig = ops._NodeDef("op_type_orig", "orig")
+        op_orig = ops.Operation(node_def_orig, ops.get_default_graph())
+        op = ops.Operation(node_def, ops.get_default_graph(),
+                           original_op=op_orig)
+        raise errors.UnauthenticatedError(node_def, op, "true_err")
 
   def testAssertRaisesOpErrorDoesNotPassMessageDueToLeakedStack(self):
     with self.assertRaises(AssertionError):
@@ -328,6 +330,49 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     self.assertEqual(a_np_rand, b_np_rand)
     self.assertEqual(a_rand, b_rand)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_callable_evaluate(self):
+    def model():
+      return resource_variable_ops.ResourceVariable(
+          name="same_name",
+          initial_value=1) + 1
+    with context.eager_mode():
+      self.assertEqual(2, self.evaluate(model))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_nested_tensors_evaluate(self):
+    expected = {"a": 1, "b": 2, "nested": {"d": 3, "e": 4}}
+    nested = {"a": constant_op.constant(1),
+              "b": constant_op.constant(2),
+              "nested": {"d": constant_op.constant(3),
+                         "e": constant_op.constant(4)}}
+
+    self.assertEqual(expected, self.evaluate(nested))
+
+
+class GarbageCollectionTest(test_util.TensorFlowTestCase):
+
+  def test_no_reference_cycle_decorator(self):
+
+    class ReferenceCycleTest(object):
+
+      def __init__(inner_self):  # pylint: disable=no-self-argument
+        inner_self.assertEqual = self.assertEqual  # pylint: disable=invalid-name
+
+      @test_util.assert_no_garbage_created
+      def test_has_cycle(self):
+        a = []
+        a.append(a)
+
+      @test_util.assert_no_garbage_created
+      def test_has_no_cycle(self):
+        pass
+
+    with self.assertRaises(AssertionError):
+      ReferenceCycleTest().test_has_cycle()
+
+    ReferenceCycleTest().test_has_no_cycle()
+
 
 @test_util.with_c_api
 class IsolationTest(test_util.TensorFlowTestCase):
@@ -395,6 +440,5 @@ class IsolationTest(test_util.TensorFlowTestCase):
         with self.assertRaises(ValueError):
           first_container_variable.read_value()
 
-
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 3df9431282c1fa6a62778e474a14d01e5738c578..c9bcfeb6e8414948b3bd881ad72668bfcc4a92d9 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -14,6 +14,45 @@ limitations under the License.
 ==============================================================================*/
 
 %include "tensorflow/python/platform/base.i"
+%include <std_shared_ptr.i>
+%include "item.i"
+
+// Wrap the cluster into an object that swig can manipulate. This ensures it will call the object
+// destructor upon garbage collection instead of leaking memory.
+struct GCluster {
+  std::shared_ptr<tensorflow::grappler::Cluster> cluster_;
+};
+
+%{
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+template <>
+bool _PyObjAs(PyObject *input, tensorflow::NamedDevice *out) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize(input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    return false;
+  }
+
+  tensorflow::NamedDevice named_device;
+  if (!named_device.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The NamedDevice could not be parsed as a valid protocol buffer");
+    return false;
+  }
+  if (out) *out = named_device;
+  return true;
+}
+%}
+
+%typemap(in) const std::vector<tensorflow::NamedDevice>& (std::vector<tensorflow::NamedDevice> temp) {
+  if (!tf_vector_input_helper($input, &temp, &_PyObjAs<tensorflow::NamedDevice>)) {
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
 
 %typemap(in) const tensorflow::RunMetadata& (tensorflow::RunMetadata temp) {
   char* c_string;
@@ -26,7 +65,7 @@ limitations under the License.
   if (!temp.ParseFromString(string(c_string, py_size))) {
     PyErr_SetString(
         PyExc_TypeError,
-        "The MetaGraphDef could not be parsed as a valid protocol buffer");
+        "The RunMetadata could not be parsed as a valid protocol buffer");
     SWIG_fail;
   }
   $1 = &temp;
@@ -41,36 +80,78 @@ limitations under the License.
 }
 
 %{
+#include <memory>
+#include <vector>
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
 #include "tensorflow/core/grappler/costs/measuring_cost_estimator.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/protobuf/device_properties.pb.h"
+
+// Provide the implementation of the GCluster struct here.
+struct GCluster {
+  GCluster() {}
+  GCluster(tensorflow::grappler::Cluster* cluster) : cluster_(cluster) {}
 
-static tensorflow::grappler::Cluster* TF_NewCluster(
-    bool allow_soft_placement, bool disable_detailed_stats, TF_Status* out_status) {
-  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();;
+  tensorflow::grappler::Cluster* operator->() const {
+    return cluster_.get();
+  }
+  tensorflow::grappler::Cluster* get() const {
+    return cluster_.get();
+  }
+  bool is_none() const {
+    return cluster_.get() == nullptr;
+  }
+
+  std::shared_ptr<tensorflow::grappler::Cluster> cluster_;
+};
+
+
+static GCluster TF_NewCluster(bool allow_soft_placement,
+                   bool disable_detailed_stats, TF_Status* out_status) {
+    int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   int timeout_s = 60 * 10;
-  tensorflow::grappler::Cluster* cluster = new tensorflow::grappler::SingleMachine(
-      timeout_s, num_cpu_cores, num_gpus);
-  cluster->DisableDetailedStats(disable_detailed_stats);
-  cluster->AllowSoftPlacement(allow_soft_placement);
-  tensorflow::Status status = cluster->Provision();
+  tensorflow::grappler::Cluster* cluster_ =
+      new tensorflow::grappler::SingleMachine(
+          timeout_s, num_cpu_cores, num_gpus);
+  cluster_->DisableDetailedStats(disable_detailed_stats);
+  cluster_->AllowSoftPlacement(allow_soft_placement);
+  tensorflow::Status status = cluster_->Provision();
   tensorflow::Set_TF_Status_from_Status(out_status, status);
-  return cluster;
+  return GCluster(cluster_);
 }
 
-static void TF_DeleteCluster(tensorflow::grappler::Cluster* cluster) {
+static GCluster TF_NewVirtualCluster(
+    const std::vector<tensorflow::NamedDevice>& named_devices,
+    TF_Status* out_status) {
+  std::unordered_map<string, tensorflow::DeviceProperties> devices;
+  for (const auto& named_device : named_devices) {
+    devices[named_device.name()]= named_device.properties();
+  }
+  tensorflow::grappler::Cluster*cluster_ =
+      new tensorflow::grappler::VirtualCluster(devices);
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  tensorflow::Status status = cluster_->Provision();
+  PyGILState_Release(gstate);
+  tensorflow::Set_TF_Status_from_Status(out_status, status);
+  return GCluster(cluster_);
+}
+
+static void TF_ShutdownCluster(GCluster cluster) {
+  PyGILState_STATE gstate = PyGILState_Ensure();
   cluster->Shutdown();
-  delete cluster;
+  PyGILState_Release(gstate);
 }
 
-tensorflow::Status _GetOpPerformanceDataAndRunTime(const tensorflow::grappler::GrapplerItem& item,
-                                       tensorflow::grappler::CostEstimator* cost_measure,
-                                       tensorflow::OpPerformanceList* op_performance_data,
-                                       tensorflow::grappler::Costs* costs) {
+tensorflow::Status _GetOpPerformanceDataAndRunTime(
+    const tensorflow::grappler::GrapplerItem& item,
+    tensorflow::grappler::CostEstimator* cost_measure,
+    tensorflow::OpPerformanceList* op_performance_data,
+    tensorflow::grappler::Costs* costs) {
   tensorflow::Status status = cost_measure->Initialize(item);
   if (!status.ok()) return status;
 
@@ -85,24 +166,45 @@ tensorflow::Status _GetOpPerformanceDataAndRunTime(const tensorflow::grappler::G
   return tensorflow::Status::OK();
 }
 
+static PyObject* TF_ListDevices(GCluster cluster) {
+  const std::unordered_map<string, tensorflow::DeviceProperties>& devices = cluster->GetDevices();
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* result = PyList_New(devices.size());
+  int i = 0;
+  for (auto& dev : devices) {
+    tensorflow::NamedDevice d;
+    d.set_name(dev.first);
+    *d.mutable_properties() = dev.second;
+    string dev_str = d.SerializeAsString();
+    PyObject* dev_obj = PyBytes_FromStringAndSize(dev_str.data(),
+                                                  dev_str.size());
+    PyList_SetItem(result, i, dev_obj);
+    ++i;
+  }
+  PyGILState_Release(gstate);
+  return result;
+}
+
 static PyObject* TF_MeasureCosts(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item,
+    GCluster cluster,
     bool generate_timeline, TF_Status* out_status) {
   tensorflow::OpPerformanceList op_performance_data;
   tensorflow::StepStats step_stats;
 
-  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster, 10, 0);
+  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), 10, 0);
 
   tensorflow::grappler::Costs costs;
-  tensorflow::Status status = _GetOpPerformanceDataAndRunTime(*item, &cost_measure,
-                                                 &op_performance_data, &costs);
+  tensorflow::Status status = _GetOpPerformanceDataAndRunTime(
+      *item, &cost_measure, &op_performance_data, &costs);
   double run_time = FLT_MAX;
   if (status.ok()) {
     run_time = static_cast<double>(costs.execution_time.count()) / 1e9;
   }
   if (generate_timeline) {
     tensorflow::RunMetadata metadata;
-    tensorflow::Status s = cluster->Run(item->graph, item->feed, item->fetch, &metadata);
+    tensorflow::Status s = cluster->Run(
+        item->graph, item->feed, item->fetch, &metadata);
     if (s.ok()) {
       step_stats = metadata.step_stats();
     } else {
@@ -114,9 +216,12 @@ static PyObject* TF_MeasureCosts(
   if (!status.ok()) {
     Py_RETURN_NONE;
   }
-  PyObject* op_perf_objs = PyList_New(op_performance_data.op_performance_size());
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  PyObject* op_perf_objs = PyList_New(
+      op_performance_data.op_performance_size());
   for (int i = 0; i < op_performance_data.op_performance_size(); i++) {
-    string op_perf_str = op_performance_data.op_performance(i).SerializeAsString();
+    string op_perf_str =
+        op_performance_data.op_performance(i).SerializeAsString();
     PyObject* op_perf_obj = PyBytes_FromStringAndSize(op_perf_str.data(),
                                                       op_perf_str.size());
     PyList_SetItem(op_perf_objs, i, op_perf_obj);
@@ -139,16 +244,19 @@ static PyObject* TF_MeasureCosts(
     status = tensorflow::Status(tensorflow::error::Code::INTERNAL,
                                 "Error setting return tuples.");
     tensorflow::Set_TF_Status_from_Status(out_status, status);
-    Py_RETURN_NONE;
+    Py_INCREF(Py_None);
+    ret = Py_None;
   }
+  PyGILState_Release(gstate);
   return ret;
 }
 
 
 static PyObject* TF_DeterminePeakMemoryUsage(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item,
+    GCluster cluster,
     TF_Status* out_status) {
-  if (!item || !cluster) {
+  if (item.is_none() || cluster.is_none()) {
     tensorflow::Status status(tensorflow::error::Code::INTERNAL,
                               "You need both a cluster and an item to determine peak memory usage");
     tensorflow::Set_TF_Status_from_Status(out_status, status);
@@ -158,7 +266,7 @@ static PyObject* TF_DeterminePeakMemoryUsage(
 
   tensorflow::Status status;
   if (cluster->DetailedStatsEnabled()) {
-    status = memory.InferDynamically(cluster);
+    status = memory.InferDynamically(cluster.get());
   } else {
     status = memory.InferStatically(cluster->GetDevices());
   }
@@ -167,6 +275,7 @@ static PyObject* TF_DeterminePeakMemoryUsage(
     Py_RETURN_NONE;
   }
 
+  PyGILState_STATE gstate = PyGILState_Ensure();
   PyObject* result = PyDict_New();
   for (const auto& device : cluster->GetDevices()) {
     const tensorflow::grappler::GraphMemory::MemoryUsage& usage =
@@ -188,19 +297,24 @@ static PyObject* TF_DeterminePeakMemoryUsage(
     PyTuple_SetItem(ret, 1, per_device);
     PyDict_SetItem(result, PyString_FromString(device.first.c_str()), ret);
   }
+  PyGILState_Release(gstate);
   return result;
 }
 
 %}
 
 // Wrap these functions.
-
-static tensorflow::grappler::Cluster* TF_NewCluster(
+static GCluster TF_NewCluster(
     bool allow_soft_placement, bool disable_detailed_stats, TF_Status* out_status);
-static void TF_DeleteCluster(tensorflow::grappler::Cluster* cluster);
+static GCluster TF_NewVirtualCluster(
+    const std::vector<tensorflow::NamedDevice>& named_devices,
+    TF_Status* out_status);
+static void TF_ShutdownCluster(GCluster cluster);
+static PyObject* TF_ListDevices(GCluster cluster);
 static PyObject* TF_MeasureCosts(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item, GCluster cluster,
     bool generate_timeline, TF_Status* out_status);
 static PyObject* TF_DeterminePeakMemoryUsage(
-    const tensorflow::grappler::GrapplerItem* item, tensorflow::grappler::Cluster* cluster,
+    GItem item, GCluster cluster,
     TF_Status* out_status);
+
diff --git a/tensorflow/python/grappler/cluster.py b/tensorflow/python/grappler/cluster.py
index baac604f411b3fb48ab2336e9479853f26fd690c..60e1322050d0ff6e5ca66ac2bddddc7ff613e59c 100644
--- a/tensorflow/python/grappler/cluster.py
+++ b/tensorflow/python/grappler/cluster.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
 from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.grappler.costs import op_performance_data_pb2
+from tensorflow.core.protobuf import device_properties_pb2
 from tensorflow.python import pywrap_tensorflow as tf_cluster
 from tensorflow.python.framework import errors
 
@@ -30,33 +33,59 @@ class Cluster(object):
   def __init__(self,
                allow_soft_placement=True,
                disable_detailed_stats=True,
-               disable_timeline=True):
+               disable_timeline=True,
+               devices=None):
     """Creates a Cluster.
 
     Args:
-      allow_soft_placement: if True, TF will automatically fix illegal
+      allow_soft_placement: If True, TF will automatically fix illegal
         placements instead of erroring out if the placement isn't legal.
-      disable_detailed_stats: if True, detailed statistics will not be
+      disable_detailed_stats: If True, detailed statistics will not be
         available.
-      disable_timeline: if True, the timeline information will not be
-        reported.
+      disable_timeline: If True, the timeline information will not be reported.
+      devices: A list of devices of type device_properties_pb2.NamedDevice.
+        If None, a device list will be created based on the spec of
+        the local machine.
     """
     self._tf_cluster = None
-    with errors.raise_exception_on_not_ok_status() as status:
-      self._tf_cluster = tf_cluster.TF_NewCluster(
-          allow_soft_placement, disable_detailed_stats, status)
     self._generate_timeline = not disable_timeline
+    with errors.raise_exception_on_not_ok_status() as status:
+      if devices is None:
+        self._tf_cluster = tf_cluster.TF_NewCluster(
+            allow_soft_placement, disable_detailed_stats, status)
+      else:
+        devices_serialized = [device.SerializeToString() for device in devices]
+        self._tf_cluster = tf_cluster.TF_NewVirtualCluster(
+            devices_serialized, status)
+
+  def Shutdown(self):
+    if self._tf_cluster is not None:
+      tf_cluster.TF_ShutdownCluster(self._tf_cluster)
+      self._tf_cluster = None
 
   def __del__(self):
+    self.Shutdown()
+
+  @property
+  def tf_cluster(self):
+    return self._tf_cluster
+
+  def ListDevices(self):
+    """Returns the list of available hardware devices."""
+    devices = []
     if self._tf_cluster is not None:
-      tf_cluster.TF_DeleteCluster(self._tf_cluster)
+      ret_from_swig = tf_cluster.TF_ListDevices(self._tf_cluster)
+      devices = []
+      for raw_dev in ret_from_swig:
+        devices.append(device_properties_pb2.NamedDevice.FromString(raw_dev))
+    return devices
 
   def MeasureCosts(self, item):
     """Returns the cost of running the specified item.
 
     Args:
-      item: the item for which to measure the costs.
-    Returns: the triplet op_perfs, runtime, step_stats.
+      item: The item for which to measure the costs.
+    Returns: The triplet op_perfs, runtime, step_stats.
     """
     with errors.raise_exception_on_not_ok_status() as status:
       ret_from_swig = tf_cluster.TF_MeasureCosts(
@@ -77,11 +106,22 @@ class Cluster(object):
     """Returns a snapshot of the peak memory usage.
 
     Args:
-      item: the item for which to measure the costs.
-    Returns: a hashtable indexed by device name.
+      item: The item for which to measure the costs.
+    Returns: A hashtable indexed by device name.
     """
     with errors.raise_exception_on_not_ok_status() as status:
       ret_from_swig = tf_cluster.TF_DeterminePeakMemoryUsage(
           item.tf_item, self._tf_cluster, status)
 
     return ret_from_swig
+
+
+@contextlib.contextmanager
+def Provision(allow_soft_placement=True,
+              disable_detailed_stats=True,
+              disable_timeline=True,
+              devices=None):
+  cluster = Cluster(allow_soft_placement, disable_detailed_stats,
+                    disable_timeline, devices)
+  yield cluster
+  cluster.Shutdown()
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index de4ded571f79313f075ef8f63ee643332dd68033..3ddcb741b5ec66f46e26827d5b0f21e6697e1bfe 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import device_properties_pb2
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.grappler import cluster
@@ -42,7 +43,7 @@ class ClusterTest(test.TestCase):
       op_perfs, run_time, step_stats = grappler_cluster.MeasureCosts(
           grappler_item)
       self.assertTrue(run_time > 0)
-      self.assertEqual(len(op_perfs), 10)
+      self.assertEqual(len(op_perfs), 9)
       self.assertTrue(step_stats.dev_stats)
 
   def testNoDetailedStats(self):
@@ -82,6 +83,46 @@ class ClusterTest(test.TestCase):
         live_tensors = snapshot[1]
         self.assertEqual(15, len(live_tensors))
 
+  def testVirtualCluster(self):
+    with ops.Graph().as_default() as g:
+      a = random_ops.random_uniform(shape=())
+      b = random_ops.random_uniform(shape=())
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+      device_properties = device_properties_pb2.DeviceProperties(
+          type='GPU',
+          frequency=1000,
+          num_cores=60,
+          environment={
+              'architecture': '7'
+          })
+      named_device = device_properties_pb2.NamedDevice(
+          properties=device_properties, name='/GPU:0')
+      grappler_cluster = cluster.Cluster(devices=[named_device])
+      op_perfs, run_time, _ = grappler_cluster.MeasureCosts(grappler_item)
+      self.assertGreater(run_time, 0)
+      self.assertEqual(len(op_perfs), 15)
+
+  def testContext(self):
+    with ops.Graph().as_default() as g:
+      a = random_ops.random_uniform(shape=())
+      b = random_ops.random_uniform(shape=())
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+
+    with cluster.Provision(
+        disable_detailed_stats=False, disable_timeline=False) as gcluster:
+      op_perfs, run_time, step_stats = gcluster.MeasureCosts(grappler_item)
+      self.assertTrue(run_time > 0)
+      self.assertEqual(len(op_perfs), 9)
+      self.assertTrue(step_stats.dev_stats)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/cost_analyzer.i b/tensorflow/python/grappler/cost_analyzer.i
index 1f024e439d8d4d819e5f603ae3b8a843063baeeb..4c0953435ba3fa6423bbc869fcca909d0c2ccb25 100644
--- a/tensorflow/python/grappler/cost_analyzer.i
+++ b/tensorflow/python/grappler/cost_analyzer.i
@@ -15,6 +15,7 @@ limitations under the License.
 
 %include "tensorflow/python/lib/core/strings.i"
 %include "tensorflow/python/platform/base.i"
+%include "cluster.i"
 
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
   char* c_string;
@@ -42,8 +43,8 @@ limitations under the License.
 %}
 
 %{
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
-per_node_report) {
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_node_report,
+                          GCluster cluster) {
   tensorflow::grappler::ItemConfig cfg;
   cfg.apply_optimizations = false;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
@@ -51,20 +52,9 @@ per_node_report) {
   if (!item) {
     return "Error: failed to preprocess metagraph: check your log file for errors";
   }
-  
-  // TODO(bsteiner): we should wrap the tf session instead to properly handle the case of a
-  // distributed setup.
-  const int timeout_s = 3600;
-  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
-  int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
-  tensorflow::grappler::SingleMachine cluster(timeout_s, num_cpu_cores, num_gpus);
-  cluster.SetNumWarmupSteps(10);
-  cluster.AllowSoftPlacement(true);
-  cluster.DisableDetailedStats(false);
-  TF_CHECK_OK(cluster.Provision());
 
   string suffix;
-  tensorflow::grappler::CostAnalyzer analyzer(*item, &cluster, suffix);
+  tensorflow::grappler::CostAnalyzer analyzer(*item, cluster.get(), suffix);
 
   std::stringstream os;
   analyzer.GenerateReport(os, per_node_report);
@@ -73,5 +63,5 @@ per_node_report) {
 
 %}
 
-string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool
-per_node_report);
+string GenerateCostReport(const tensorflow::MetaGraphDef& metagraph, bool per_node_report,
+                          GCluster cluster);
diff --git a/tensorflow/python/grappler/cost_analyzer.py b/tensorflow/python/grappler/cost_analyzer.py
index 75c21e572719128cfd5f9a36191b5765386c43dc..a1ff915c61ba14d9a899d7f6c9a2c49855969b00 100644
--- a/tensorflow/python/grappler/cost_analyzer.py
+++ b/tensorflow/python/grappler/cost_analyzer.py
@@ -20,21 +20,64 @@ from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow as tf_wrap
 from tensorflow.python.framework import errors
+from tensorflow.python.grappler import cluster as gcluster
+from tensorflow.python.grappler import item as gitem
 
 
-def GenerateCostReport(metagraph, per_node_report=False):
+def GenerateCostReport(metagraph, per_node_report=False, cluster=None):
   """Analyze the cost of each TensorFlow op and node in the provided metagraph.
 
   Args:
-    metagraph: An TensorFlow MetaGraphDef.
+    metagraph: A TensorFlow MetaGraphDef.
     per_node_report: by default the report contains stats aggregated on a per op
       type basis, setting per_node_report to True adds results for each
       individual node to the report.
+    cluster: Analyze the costs using the specified cluster, or the local machine
+      if no cluster was specified.
 
   Returns:
     A string of cost report.
   """
+  if cluster is None:
+    cluster = gcluster.Cluster(disable_detailed_stats=False)
+
   with errors.raise_exception_on_not_ok_status():
-    ret_from_swig = tf_wrap.GenerateCostReport(metagraph.SerializeToString(),
-                                               per_node_report)
+    ret_from_swig = tf_wrap.GenerateCostReport(
+        metagraph.SerializeToString(), per_node_report, cluster.tf_cluster)
   return ret_from_swig
+
+
+def GenerateMemoryReport(metagraph, detailed_report=True, cluster=None):
+  """Analyze the peak memory usage for the provided metagraph.
+
+  Args:
+    metagraph: A TensorFlow MetaGraphDef.
+    detailed_report: print the live tensors in addition to the peak memory
+      usage.
+    cluster: Analyze the memory using the specified cluster, or the local
+      machine if no cluster was specified.
+
+  Returns:
+    A string with the formatted memory usage.
+  """
+  if cluster is None:
+    cluster = gcluster.Cluster(
+        disable_detailed_stats=True, disable_timeline=True)
+
+  item = gitem.Item(metagraph)
+  peak_usage = cluster.DeterminePeakMemoryUsage(item)
+  report = ""
+  for device, snapshot in peak_usage.items():
+    peak_usage = snapshot[0]
+    report += "Peak usage for device " + device + ": " + str(
+        peak_usage) + " bytes\n"
+    if detailed_report:
+      live_tensors = snapshot[1]
+      for tensor in live_tensors:
+        op_name = tensor[0]
+        output_id = tensor[1]
+        mem_used = tensor[2]
+        report += "  " + str(op_name) + ":" + str(output_id) + " uses " + str(
+            mem_used) + " bytes\n"
+
+  return report
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index d59f1d04f6135163152283d5d4922df800c51a00..325ff0fb00b006cf29d3ef2028d37745d7480d34 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import cost_analyzer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -35,9 +36,9 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 
 
-class PyWrapOptimizeGraphTest(test.TestCase):
+class CostAnalysisTest(test.TestCase):
 
-  def testBasic(self):
+  def testBasicCost(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant(10, name="a")
     b = constant_op.constant(20, name="b")
@@ -60,7 +61,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
-  def testSmallNetwork(self):
+  def testSmallNetworkCost(self):
     image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
     label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
     w = variables.Variable(
@@ -88,13 +89,10 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertTrue(b"MatMul" in report)
     self.assertTrue(b"ApplyAdam" in report)
     self.assertTrue(b"Conv2D" in report)
-    self.assertTrue(b"Conv2DBackpropInput" in report)
     self.assertTrue(b"Conv2DBackpropFilter" in report)
     self.assertTrue(b"Softmax" in report)
 
-    for op_type in [
-        b"MatMul", b"Conv2D", b"Conv2DBackpropInput", b"Conv2DBackpropFilter"
-    ]:
+    for op_type in [b"MatMul", b"Conv2D", b"Conv2DBackpropFilter"]:
       matcher = re.compile(
           br"\s+" + op_type + br",\s*(\d+),\s*(\d+),\s*([\d\.eE+-]+)%,\s*" +
           br"([\d\.eE+-]+)%,\s*(-?\d+),\s*(\d+),", re.MULTILINE)
@@ -111,6 +109,31 @@ class PyWrapOptimizeGraphTest(test.TestCase):
       # self.assertTrue(0 < upper)
       # self.assertTrue(lower <= upper)
 
+  def testBasicMemory(self):
+    """Make sure arguments can be passed correctly."""
+    with test_util.device(use_gpu=False):
+      a = constant_op.constant(10, name="a")
+      b = constant_op.constant(20, name="b")
+      c = math_ops.add_n([a, b], name="c")
+      d = math_ops.add_n([b, c], name="d")
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(d)
+      mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+
+    report = cost_analyzer.GenerateMemoryReport(mg)
+
+    # Print the report to make it easier to debug
+    print("{}".format(report))
+
+    # Check the report
+    self.assertTrue(
+        "Peak usage for device /job:localhost/replica:0/task:0/cpu:0: 16 bytes"
+        in report)
+    self.assertTrue("  a:0 uses 4 bytes" in report)
+    self.assertTrue("  b:0 uses 4 bytes" in report)
+    self.assertTrue("  c:0 uses 4 bytes" in report)
+    self.assertTrue("  d:0 uses 4 bytes" in report)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 632f614558774169f03f23c2e29719bec5740f75..8f75b827b6eea67e80e0d1aedc2a728241103086 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+%include <std_shared_ptr.i>
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
   char* c_string;
   Py_ssize_t py_size;
@@ -30,6 +31,13 @@ limitations under the License.
   $1 = &temp;
 }
 
+// Wrap the item into an object that swig can manipulate. This ensures it will call the object
+// destructor upon garbage collection instead of leaking memory.
+struct GItem {
+  std::shared_ptr<tensorflow::grappler::GrapplerItem> item_;
+};
+
+
 %{
 #include <unordered_set>
 #include <map>
@@ -40,8 +48,26 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+// Provide the implementation fo the GItem struct here.
+struct GItem {
+  GItem() {}
+  GItem(tensorflow::grappler::GrapplerItem* item) : item_(item) {}
+
+  tensorflow::grappler::GrapplerItem* operator->() const {
+    return item_.get();
+  }
+  const tensorflow::grappler::GrapplerItem& operator*() const {
+    return *item_.get();
+  }
+  bool is_none() const {
+    return item_.get() == nullptr;
+  }
+  std::shared_ptr<tensorflow::grappler::GrapplerItem> item_;
+};
 
-static tensorflow::grappler::GrapplerItem* TF_NewItem(
+static GItem TF_NewItem(
     const tensorflow::MetaGraphDef& meta_graph, bool ignore_colocation,
     bool ignore_user_placement, TF_Status* out_status) {
   if (meta_graph.collection_def().count("train_op") == 0) {
@@ -54,6 +80,7 @@ static tensorflow::grappler::GrapplerItem* TF_NewItem(
   tensorflow::grappler::ItemConfig cfg;
   cfg.ignore_user_placement = ignore_user_placement;
   cfg.ignore_colocation = ignore_colocation;
+  cfg.inline_functions = true;
   std::unique_ptr<tensorflow::grappler::GrapplerItem> item =
       tensorflow::grappler::GrapplerItemFromMetaGraphDef("item", meta_graph, cfg);
   if (!item) {
@@ -63,15 +90,11 @@ static tensorflow::grappler::GrapplerItem* TF_NewItem(
     return nullptr;
   }
   tensorflow::Set_TF_Status_from_Status(out_status, tensorflow::Status::OK());
-  return item.release();
+  return GItem(item.release());
 }
 
-static void TF_DeleteItem(tensorflow::grappler::GrapplerItem* item) {
-  delete item;
-}
-
-static std::vector<string> TF_IdentifyImportantOps(const tensorflow::grappler::GrapplerItem* item) {
-  if (!item) {
+static std::vector<string> TF_IdentifyImportantOps(GItem item) {
+  if (item.is_none()) {
     return {};
   }
 
@@ -93,16 +116,17 @@ static std::vector<string> TF_IdentifyImportantOps(const tensorflow::grappler::G
   return ops;
 }
 
-static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* item) {
-  if (!item) {
+static PyObject* TF_GetOpProperties(GItem item) {
+  if (item.is_none()) {
     Py_RETURN_NONE;
   }
   tensorflow::grappler::GraphProperties properties(*item);
-  tensorflow::Status status = properties.InferStatically();
+  tensorflow::Status status = properties.InferStatically(false);
   if (!status.ok()) {
     Py_RETURN_NONE;
   }
 
+  PyGILState_STATE gstate = PyGILState_Ensure();
   PyObject* props = PyDict_New();
   for (const auto& node : item->graph.node()) {
     const string& node_name = node.name();
@@ -117,8 +141,8 @@ static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* it
       PyList_SetItem(prop, i, output_prop);
     }
     CHECK_EQ(0, PyDict_SetItem(props, PyString_FromString(node_name.c_str()), prop));
-   }
-
+  }
+  PyGILState_Release(gstate);
   return props;
 }
 
@@ -126,9 +150,8 @@ static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* it
 
 
 // Wrap these functions.
-static tensorflow::grappler::GrapplerItem* TF_NewItem(
+static GItem TF_NewItem(
     const tensorflow::MetaGraphDef& meta_graph, bool ignore_colocation,
     bool ignore_user_placement, TF_Status* out_status);
-static void TF_DeleteItem(tensorflow::grappler::GrapplerItem* item);
-static std::vector<string> TF_IdentifyImportantOps(const tensorflow::grappler::GrapplerItem* item);
-static PyObject* TF_GetOpProperties(const tensorflow::grappler::GrapplerItem* item);
+static std::vector<string> TF_IdentifyImportantOps(GItem item);
+static PyObject* TF_GetOpProperties(GItem item);
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index f53fc7f337d9d76699a89e6808098531d9ed20eb..4fc94ec9680464aea17875189ac4a749f3fa11dc 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.grappler.costs import op_performance_data_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python import pywrap_tensorflow as tf_item
 from tensorflow.python.framework import errors
 
@@ -42,21 +43,18 @@ class Item(object):
       ValueError: the metagraph is incomplete or invalid.
     """
     self._metagraph = metagraph
+    self._item_graph = meta_graph_pb2.MetaGraphDef()
+    self._item_graph.CopyFrom(metagraph)
+    self._ignore_colocation = ignore_colocation
+    self._ignore_user_placement = ignore_user_placement
     self._tf_item = None
-    with errors.raise_exception_on_not_ok_status() as status:
-      self._tf_item = tf_item.TF_NewItem(metagraph.SerializeToString(),
-                                         ignore_colocation,
-                                         ignore_user_placement, status)
-
-  def __del__(self):
-    if self._tf_item:
-      tf_item.TF_DeleteItem(self._tf_item)
+    self._BuildTFItem()
 
   def IdentifyImportantOps(self):
-    return tf_item.TF_IdentifyImportantOps(self._tf_item)
+    return tf_item.TF_IdentifyImportantOps(self.tf_item)
 
   def GetOpProperties(self):
-    ret_from_swig = tf_item.TF_GetOpProperties(self._tf_item)
+    ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
     properties = {}
     for key, values in ret_from_swig.items():
       prop = []
@@ -72,4 +70,13 @@ class Item(object):
 
   @property
   def tf_item(self):
+    if self._item_graph != self._metagraph:
+      self._BuildTFItem()
+      self._item_graph.CopyFrom(self._metagraph)
     return self._tf_item
+
+  def _BuildTFItem(self):
+    with errors.raise_exception_on_not_ok_status() as status:
+      self._tf_item = tf_item.TF_NewItem(self._metagraph.SerializeToString(),
+                                         self._ignore_colocation,
+                                         self._ignore_user_placement, status)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index 0739a7a0e4c8c142d3c46ac1697ab243d7982cde..71c68d25cd928d5cb2dc5028ed331d468c5b9826 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.grappler import item
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
 
@@ -59,6 +60,7 @@ class ItemTest(test.TestCase):
       a = constant_op.constant(10)
       b = constant_op.constant(20)
       c = a + b
+      z = control_flow_ops.no_op()
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(c)
       mg = meta_graph.create_meta_graph_def(graph=g)
@@ -69,9 +71,38 @@ class ItemTest(test.TestCase):
       for node in grappler_item.metagraph.graph_def.node:
         node_prop = op_properties[node.name]
 
-        self.assertEqual(1, len(node_prop))
-        self.assertEqual(dtypes.int32, node_prop[0].dtype)
-        self.assertEqual(tensor_shape.scalar(), node_prop[0].shape)
+        if node.name == z.name:
+          self.assertEqual(0, len(node_prop))
+        else:
+          self.assertEqual(1, len(node_prop))
+          self.assertEqual(dtypes.int32, node_prop[0].dtype)
+          self.assertEqual(tensor_shape.scalar(), node_prop[0].shape)
+
+  def testUpdates(self):
+    with ops.Graph().as_default() as g:
+      a = constant_op.constant(10)
+      b = constant_op.constant(20)
+      c = a + b
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(c)
+      mg = meta_graph.create_meta_graph_def(graph=g)
+      grappler_item = item.Item(mg)
+
+    initial_tf_item = grappler_item.tf_item
+    no_change_tf_item = grappler_item.tf_item
+    self.assertEqual(initial_tf_item, no_change_tf_item)
+
+    # Modify the placement.
+    for node in grappler_item.metagraph.graph_def.node:
+      node.device = '/cpu:0'
+    new_tf_item = grappler_item.tf_item
+    self.assertNotEqual(initial_tf_item, new_tf_item)
+
+    # Assign the same placement.
+    for node in grappler_item.metagraph.graph_def.node:
+      node.device = '/cpu:0'
+    newest_tf_item = grappler_item.tf_item
+    self.assertEqual(new_tf_item, newest_tf_item)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index bda9502cd115eaca0df06e80ee716726d91f13c4..831f1820093fc2272db9c19a9cacdc4141995421 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -18,87 +18,162 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import device_properties_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.grappler import cluster as gcluster
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import saver as saver_lib
 
 
-def weight(shape):
-  """weights generates a weight of a given shape."""
+def _weight(shape):
+  """Generates a weight of a given shape."""
   return random_ops.truncated_normal(shape, seed=0, stddev=0.1)
 
 
-def bias(shape):
-  """bias generates a bias of a given shape."""
+def _bias(shape):
+  """Generates a bias of a given shape."""
   return constant_op.constant(0.1, shape=shape)
 
 
-def conv2d(x, w):
-  """conv2d returns a 2d convolution layer with full stride."""
+def _conv2d(x, w):
+  """Returns a 2d convolution layer with full stride."""
   return nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')
 
 
-def max_pool_2x2(x):
-  """max_pool_2x2 downsamples a feature map by 2X."""
+def _max_pool_2x2(x):
+  """Downsamples a feature map by 2X."""
   return nn.max_pool(
       x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
 
 
 # Taken from tensorflow/examples/tutorials/mnist/mnist_deep.py
-def two_layer_model(x):
+def _two_layer_model(x):
   x_image = array_ops.reshape(x, [-1, 28, 28, 1])
-  w_conv1 = weight([5, 5, 1, 32])
-  b_conv1 = bias([32])
-  h_conv1 = nn.relu(conv2d(x_image, w_conv1) + b_conv1)
-  h_pool1 = max_pool_2x2(h_conv1)
-  w_conv2 = weight([5, 5, 32, 64])
-  b_conv2 = bias([64])
-  h_conv2 = nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
-  h_pool2 = max_pool_2x2(h_conv2)
+  w_conv1 = _weight([5, 5, 1, 32])
+  b_conv1 = _bias([32])
+  h_conv1 = nn.relu(_conv2d(x_image, w_conv1) + b_conv1)
+  h_pool1 = _max_pool_2x2(h_conv1)
+  w_conv2 = _weight([5, 5, 32, 64])
+  b_conv2 = _bias([64])
+  h_conv2 = nn.relu(_conv2d(h_pool1, w_conv2) + b_conv2)
+  h_pool2 = _max_pool_2x2(h_conv2)
   return h_pool2
 
 
-def loop():
+def _loop():
   random_seed.set_random_seed(0)
   x1 = random_ops.truncated_normal([1, 784], seed=0)
   x2 = random_ops.truncated_normal([1, 784], seed=0)
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(two_layer_model, elems, dtype=dtypes.float32)
+  outputs = functional_ops.map_fn(_two_layer_model, elems, dtype=dtypes.float32)
   return outputs
 
 
-def get_config():
-  rewrite_options = rewriter_config_pb2.RewriterConfig(
-      optimize_tensor_layout=True)
+def _get_config(layout_optimizer=True):
+  if layout_optimizer:
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)
+  else:
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF)
   graph_options = config_pb2.GraphOptions(
       rewrite_options=rewrite_options, build_cost_model=1)
   config = config_pb2.ConfigProto(graph_options=graph_options)
   return config
 
 
+def _simple_metagraph(depthwise=False):
+  random_seed.set_random_seed(0)
+  x = variables.Variable(random_ops.truncated_normal([1, 200, 200, 3], seed=0))
+  conv = conv_layers.separable_conv2d if depthwise else conv_layers.conv2d
+  y = conv(x, 32, [3, 3])
+  z = conv(y, 32, [3, 3])
+  optimizer = gradient_descent.GradientDescentOptimizer(1e-4)
+  loss = math_ops.reduce_mean(z)
+  train_op = optimizer.minimize(loss)
+  graph = ops.get_default_graph()
+  graph.add_to_collection('train_op', train_op)
+  meta_graph = saver_lib.export_meta_graph(graph_def=graph.as_graph_def())
+  return meta_graph
+
+
+def _get_cluster():
+  named_device = device_properties_pb2.NamedDevice()
+  named_device.name = '/GPU:0'
+  named_device.properties.type = 'GPU'
+  named_device.properties.environment['architecture'] = '4'
+  cluster = gcluster.Cluster(devices=[named_device])
+  return cluster
+
+
 class LayoutOptimizerTest(test.TestCase):
   """Tests the Grappler layout optimizer."""
 
+  def _train(self, checkpoint_path, layout_optimizer=False, restore=False):
+    ops.reset_default_graph()
+    graph = ops.get_default_graph()
+    with session.Session(
+        config=_get_config(layout_optimizer), graph=graph) as sess:
+      batch = 2
+      height = 6
+      width = 7
+      input_channels = 3
+      shape = [batch, height, width, input_channels]
+      image = array_ops.placeholder(dtype='float32', shape=shape)
+      conv1 = conv_layers.conv2d(image, 32, [3, 3])
+      conv2 = conv_layers.conv2d(conv1, 32, [3, 3])
+      optimizer = gradient_descent.GradientDescentOptimizer(0.01)
+      loss = math_ops.reduce_mean(conv2)
+      train_op = optimizer.minimize(loss)
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      if restore:
+        saver.restore(sess, checkpoint_path)
+      else:
+        sess.run(variables.global_variables_initializer())
+
+      np.random.seed(0)
+      for _ in range(2):
+        image_val = np.random.rand(*shape).astype(np.float32)
+        sess.run([loss, train_op], feed_dict={image: image_val})
+
+      if restore:
+        all_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+        all_vars_values = [var.eval(session=sess) for var in all_vars]
+        return all_vars_values
+      else:
+        saver.save(sess, checkpoint_path)
+
   def testTwoConvLayers(self):
     if test.is_gpu_available(cuda_only=True):
       random_seed.set_random_seed(0)
       x = random_ops.truncated_normal([1, 784], seed=0)
-      output = two_layer_model(x)
+      output = _two_layer_model(x)
 
       with session.Session() as sess:
         output_val_ref = sess.run(output)
 
-      with session.Session(config=get_config()) as sess:
+      with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
         output_val = sess.run(output, run_metadata=metadata)
 
@@ -115,24 +190,140 @@ class LayoutOptimizerTest(test.TestCase):
       self.assertEqual(expected_num_transposes, num_transposes)
       self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Reshape-0',
                     nodes)
-      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Relu_1-MaxPool_1',
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Relu_1-MaxPool_1-0',
+                    nodes)
+
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSplitWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      dim = array_ops.placeholder(dtype='int32')
+      split = array_ops.split(conv, 2, axis=dim)
+      output = math_ops.reduce_sum(split[0])
+
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={dim: 3})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(output, run_metadata=metadata, feed_dict={dim: 3})
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
+
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Reshape-0',
                     nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-split-Sum-0', nodes)
+      self.assertIn('LayoutOptimizerDataFormatOp_split_0', nodes)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
+
+  def testSliceWithNonConstAxis(self):
+    if test.is_gpu_available(cuda_only=True):
+      random_seed.set_random_seed(0)
+      x = random_ops.truncated_normal([1, 784], seed=0)
+      conv = _two_layer_model(x)
+      size = array_ops.placeholder(dtype='int32')
+      s = array_ops.slice(conv, [0, 0, 0, 0], size)
+      output = array_ops.identity(s)
+
+      size_val = [1, 2, 3, 4]
+      with session.Session() as sess:
+        output_val_ref = sess.run(output, feed_dict={size: size_val})
+
+      with session.Session(config=_get_config()) as sess:
+        metadata = config_pb2.RunMetadata()
+        output_val = sess.run(
+            output, run_metadata=metadata, feed_dict={
+                size: size_val
+            })
+
+      nodes = []
+      num_transposes = 0
+      for node in metadata.cost_graph.node:
+        if node.name.startswith('LayoutOptimizerTranspose'):
+          num_transposes += 1
+        nodes.append(node.name)
 
+      # Four transposes were initially added in the Expand phase of
+      # LayoutOptimizer; two of them are cancelled out in the Collapse phase.
+      expected_num_transposes = 2
+      self.assertEqual(expected_num_transposes, num_transposes)
+      self.assertIn('LayoutOptimizerTransposeNHWCToNCHW-Conv2D-Reshape-0',
+                    nodes)
+      self.assertIn('LayoutOptimizerTransposeNCHWToNHWC-Slice-Identity-0',
+                    nodes)
+      self.assertIn('LayoutOptimizerDataFormatOp_Slice_2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testLoop(self):
     if test.is_gpu_available(cuda_only=True):
-      output = loop()
+      output = _loop()
 
       with session.Session() as sess:
         output_val_ref = sess.run(output)
 
-      with session.Session(config=get_config()) as sess:
+      with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
         output_val = sess.run(output, run_metadata=metadata)
 
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  def testGradient(self):
+    meta_graph = _simple_metagraph()
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)
+    optimized_graph = tf_optimizer.OptimizeGraph(
+        rewrite_options, meta_graph, cluster=_get_cluster())
+
+    found = 0
+    for node in optimized_graph.node:
+      if node.op in ['Conv2D', 'Conv2DBackpropFilter', 'Conv2DBackpropInput']:
+        found += 1
+        self.assertEqual(node.attr['data_format'].s, 'NCHW')
+    self.assertEqual(found, 5)
+
+  def testDepthwise(self):
+    meta_graph = _simple_metagraph(depthwise=True)
+    rewrite_options = rewriter_config_pb2.RewriterConfig(
+        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)
+    optimized_graph = tf_optimizer.OptimizeGraph(
+        rewrite_options, meta_graph, cluster=_get_cluster())
+
+    found = 0
+    for node in optimized_graph.node:
+      if node.op in [
+          'DepthwiseConv2dNative', 'DepthwiseConv2dNativeBackpropFilter',
+          'DepthwiseConv2dNativeBackpropInput'
+      ]:
+        found += 1
+        self.assertEqual(node.attr['data_format'].s, 'NCHW')
+    self.assertEqual(found, 6)
+
+  def testCheckpointCompatibility(self):
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest('GPU required')
+
+    checkpoint_path = self.get_temp_dir()
+    self._train(checkpoint_path)
+    vars_expected = self._train(checkpoint_path, restore=True)
+    vars_layout_optimized = self._train(
+        checkpoint_path, restore=True, layout_optimizer=True)
+
+    for var_expected, var_layout_optimized in zip(vars_expected,
+                                                  vars_layout_optimized):
+      self.assertAllClose(var_expected, var_layout_optimized, atol=1e-6)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index aea7f7c57e7d20a0c501f7fde6c3496b1ddb6558..9fbadeceb3b1a8c9f949bc59a5ec75c5b7420cac 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -128,9 +128,10 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS),
-        original_metagraph)
+            memory_optimization=rewriter_config_pb2.RewriterConfig.
+            RECOMPUTATION_HEURISTICS), original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -151,8 +152,10 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.
+            RECOMPUTATION_HEURISTICS,
             memory_optimizer_target_node_name_prefix='optimizer/gradients/'),
         original_metagraph)
     self.assertGreater(
diff --git a/tensorflow/python/grappler/model_analyzer.cc b/tensorflow/python/grappler/model_analyzer.cc
index 4ec7620bce9462018c8b49ecb5116aa3f77f8271..da5b03234e9bf806727f05c20ec6aa4270f843a7 100644
--- a/tensorflow/python/grappler/model_analyzer.cc
+++ b/tensorflow/python/grappler/model_analyzer.cc
@@ -27,7 +27,7 @@ ModelAnalyzer::ModelAnalyzer(const GrapplerItem& item) : item_(item) {}
 
 Status ModelAnalyzer::GenerateReport(std::ostream& os) {
   GraphProperties properties(item_);
-  TF_RETURN_IF_ERROR(properties.InferStatically());
+  TF_RETURN_IF_ERROR(properties.InferStatically(false));
 
   for (const auto& node : item_.MainOpsFanin()) {
     PrintNodeInfo(node, properties, os);
@@ -59,10 +59,15 @@ void ModelAnalyzer::PrintNodeInfo(const NodeDef* node,
           if (i > 0) {
             os << ", ";
           }
-          if (prop.shape().dim(i).size() < 0) {
+          if (prop.shape().dim(i).size() >= 0) {
+            // Print the actual dimension.
+            os << prop.shape().dim(i).size();
+          } else if (prop.shape().dim(i).size() == -1) {
+            // We don't know anything about the dimension.
             os << "?";
           } else {
-            os << prop.shape().dim(i).size();
+            // Symbolic dimension.
+            os << "x" << -prop.shape().dim(i).size();
           }
         }
         os << "]";
diff --git a/tensorflow/python/grappler/model_analyzer.i b/tensorflow/python/grappler/model_analyzer.i
index d74bd37c6372733d25d2b5766a302aa1701dac17..726143a0bb4db28538f4338eb3773d85332dc122 100644
--- a/tensorflow/python/grappler/model_analyzer.i
+++ b/tensorflow/python/grappler/model_analyzer.i
@@ -48,7 +48,7 @@ string GenerateModelReport(const tensorflow::MetaGraphDef& metagraph) {
   if (!item) {
     return "Error: failed to preprocess metagraph: check your log file for errors";
   }
-  
+
   string suffix;
   tensorflow::grappler::ModelAnalyzer analyzer(*item);
 
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index 12c5fce60f9b8b6e27aa6e3d921d79c1513897e2..f0dd4483a635ddf39e7f51ad0008390c1feb2e13 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -15,6 +15,7 @@ limitations under the License.
 
 
 %include "tensorflow/python/platform/base.i"
+%include "cluster.i"
 
 %typemap(in) const tensorflow::MetaGraphDef& (tensorflow::MetaGraphDef temp) {
   char* c_string;
@@ -55,30 +56,60 @@ limitations under the License.
   #include <memory>
   #include "tensorflow/c/tf_status_helper.h"
   #include "tensorflow/core/lib/core/status.h"
+  #include "tensorflow/core/common_runtime/device.h"
   #include "tensorflow/core/framework/device_base.h"
+  #include "tensorflow/core/common_runtime/device_factory.h"
+  #include "tensorflow/core/framework/device_attributes.pb.h"
   #include "tensorflow/core/framework/graph.pb.h"
   #include "tensorflow/core/grappler/grappler_item.h"
   #include "tensorflow/core/grappler/grappler_item_builder.h"
+  #include "tensorflow/core/grappler/clusters/cluster.h"
+  #include "tensorflow/core/grappler/clusters/utils.h"
   #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
   #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
   #include "tensorflow/core/protobuf/meta_graph.pb.h"
   #include "tensorflow/core/protobuf/rewriter_config.pb.h"
+  #include "tensorflow/core/public/session_options.h"
+
+
+void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* device_map) {
+  tensorflow::SessionOptions options;
+  std::vector<tensorflow::Device*> devices;
+  tensorflow::Status status = tensorflow::DeviceFactory::AddDevices(options, "", &devices);
+  if (!status.ok()) {
+    return;
+  }
+
+  for (const tensorflow::Device* device : devices) {
+    tensorflow::DeviceProperties& prop = (*device_map)[device->name()];
+    prop = tensorflow::grappler::GetDeviceInfo(device->parsed_name());
+
+    // Overwrite the memory limit since users might have requested to use only a fraction of the
+    // available device memory.
+    const tensorflow::DeviceAttributes& attr = device->attributes();
+    prop.set_memory_size(attr.memory_limit());
+    delete device;
+  }
+}
 
 PyObject* TF_OptimizeGraph(
+      GCluster cluster,
       const tensorflow::RewriterConfig& rewriter_config,
       const tensorflow::MetaGraphDef& metagraph,
-      const string& graph_id, TF_Status* out_status) {
+      bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
     item_config.inline_functions = false;
     item_config.apply_optimizations = false;
     std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
         tensorflow::grappler::GrapplerItemFromMetaGraphDef(graph_id, metagraph, item_config);
-    std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+
     tensorflow::DeviceBase* cpu_device = nullptr;
-    tensorflow::grappler::VirtualCluster cluster(device_map);
     tensorflow::GraphDef out_graph;
-    tensorflow::Status status = tensorflow::grappler::RunMetaOptimizer(
-        *grappler_item, rewriter_config, cpu_device, &cluster, &out_graph);
+    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
+    tensorflow::Status status = optimizer.Optimize(cluster.get(), *grappler_item, &out_graph);
+    if (verbose) {
+      optimizer.PrintResult();
+    }
     tensorflow::Set_TF_Status_from_Status(out_status, status);
     string out_graph_str = out_graph.SerializeAsString();
     PyObject* ret = PyBytes_FromStringAndSize(out_graph_str.data(),
@@ -90,8 +121,9 @@ PyObject* TF_OptimizeGraph(
 
 // Wrap this function
 PyObject* TF_OptimizeGraph(
+    GCluster cluster,
     const tensorflow::RewriterConfig& rewriter_config,
-    const tensorflow::MetaGraphDef& metagraph,
+    const tensorflow::MetaGraphDef& metagraph, bool verbose,
     const string& graph_id, TF_Status* out_status);
 
 
diff --git a/tensorflow/python/grappler/tf_optimizer.py b/tensorflow/python/grappler/tf_optimizer.py
index d0464c6054293b8499231526317d5bd42bc88752..a73a4a98fc5a883cf8681a20ca332f16f3b7f0ce 100644
--- a/tensorflow/python/grappler/tf_optimizer.py
+++ b/tensorflow/python/grappler/tf_optimizer.py
@@ -21,14 +21,22 @@ from __future__ import print_function
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python import pywrap_tensorflow as tf_opt
 from tensorflow.python.framework import errors
+from tensorflow.python.grappler import cluster as gcluster
 
 
-def OptimizeGraph(rewriter_config, metagraph, graph_id=b'graph_to_optimize'):
+def OptimizeGraph(rewriter_config,
+                  metagraph,
+                  verbose=True,
+                  graph_id=b'graph_to_optimize',
+                  cluster=None):
   """Optimize the provided metagraph."""
   with errors.raise_exception_on_not_ok_status() as status:
-    ret_from_swig = tf_opt.TF_OptimizeGraph(rewriter_config.SerializeToString(),
+    if cluster is None:
+      cluster = gcluster.Cluster()
+    ret_from_swig = tf_opt.TF_OptimizeGraph(cluster.tf_cluster,
+                                            rewriter_config.SerializeToString(),
                                             metagraph.SerializeToString(),
-                                            graph_id, status)
+                                            verbose, graph_id, status)
   if ret_from_swig is None:
     return None
   out_graph = graph_pb2.GraphDef().FromString(ret_from_swig)
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
old mode 100644
new mode 100755
index d61733dff6f0173883bf0b0b1cf416014f8546c8..4a60b7835ec3ee6224a84dcfbd7e380f9454d8eb
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -15,6 +15,7 @@ py_library(
         "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
         "_impl/keras/applications/imagenet_utils.py",
+        "_impl/keras/applications/inception_resnet_v2.py",
         "_impl/keras/applications/inception_v3.py",
         "_impl/keras/applications/mobilenet.py",
         "_impl/keras/applications/resnet50.py",
@@ -29,6 +30,7 @@ py_library(
         "_impl/keras/datasets/cifar.py",
         "_impl/keras/datasets/cifar10.py",
         "_impl/keras/datasets/cifar100.py",
+        "_impl/keras/datasets/fashion_mnist.py",
         "_impl/keras/datasets/imdb.py",
         "_impl/keras/datasets/mnist.py",
         "_impl/keras/datasets/reuters.py",
@@ -68,11 +70,13 @@ py_library(
         "_impl/keras/utils/io_utils.py",
         "_impl/keras/utils/layer_utils.py",
         "_impl/keras/utils/np_utils.py",
+        "_impl/keras/utils/training_utils.py",
         "_impl/keras/utils/vis_utils.py",
         "_impl/keras/wrappers/__init__.py",
         "_impl/keras/wrappers/scikit_learn.py",
         "activations/__init__.py",
         "applications/__init__.py",
+        "applications/inception_resnet_v2/__init__.py",
         "applications/inception_v3/__init__.py",
         "applications/mobilenet/__init__.py",
         "applications/resnet50/__init__.py",
@@ -86,6 +90,7 @@ py_library(
         "datasets/boston_housing/__init__.py",
         "datasets/cifar10/__init__.py",
         "datasets/cifar100/__init__.py",
+        "datasets/fashion_mnist/__init__.py",
         "datasets/imdb/__init__.py",
         "datasets/mnist/__init__.py",
         "datasets/reuters/__init__.py",
@@ -145,6 +150,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
         "@six_archive//:six",
     ],
 )
@@ -250,6 +256,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "inception_resnet_v2_test",
+    size = "medium",
+    srcs = ["_impl/keras/applications/inception_resnet_v2_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "inception_v3_test",
     size = "medium",
@@ -484,6 +502,18 @@ py_test(
     ],
 )
 
+py_test(
+    name = "recurrent_test",
+    size = "small",
+    srcs = ["_impl/keras/layers/recurrent_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "serialization_test",
     size = "small",
@@ -523,10 +553,11 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "small",
+    size = "medium",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "noasan",  # times out
         "notsan",
     ],
@@ -561,6 +592,31 @@ py_test(
     ],
 )
 
+py_test(
+    name = "np_utils_test",
+    size = "small",
+    srcs = ["_impl/keras/utils/np_utils_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "training_utils_test",
+    size = "medium",
+    srcs = ["_impl/keras/utils/training_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["multi_gpu"],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "imagenet_utils_test",
     size = "small",
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index fa79889966504130f17baa1b94e40d17069cbb01..f56be967ff5c066c1b119aed1ed8813490d2b045 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -42,6 +42,8 @@ from tensorflow.python.keras import utils
 from tensorflow.python.keras import wrappers
 from tensorflow.python.keras._impl.keras import __version__
 from tensorflow.python.keras.layers import Input
+from tensorflow.python.keras.models import Model
+from tensorflow.python.keras.models import Sequential
 
 del absolute_import
 del division
diff --git a/tensorflow/python/keras/_impl/keras/__init__.py b/tensorflow/python/keras/_impl/keras/__init__.py
index a341065100d2116d76c004baa5120fa10a10e8fc..74cc9d0488c88de04bf29aafcd0e23895c59826a 100644
--- a/tensorflow/python/keras/_impl/keras/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/__init__.py
@@ -37,5 +37,7 @@ from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras import utils
 from tensorflow.python.keras._impl.keras import wrappers
 from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.models import Sequential
 
-__version__ = '2.0.8-tf'
+__version__ = '2.1.1-tf'
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 4e35b79869f5ec1005bf5dfd8cac985942a18837..f017d2ae85548211070ececf48e977dd7d2f6a25 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import six
 
 from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.layers.base import Layer
 from tensorflow.python.platform import tf_logging as logging
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/__init__.py b/tensorflow/python/keras/_impl/keras/applications/__init__.py
index f78bbdc148145591cc16e3231bd9d2b7c06d208b..c11c52b71e9bff1cfd595a9dbc0e86dcaa8506c8 100644
--- a/tensorflow/python/keras/_impl/keras/applications/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/applications/__init__.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet
 from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index 43628341cb522090caf6eb996a5f0c9b44488424..58841e5db06229727ea088388a901633216aa6fe 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -29,12 +29,19 @@ CLASS_INDEX = None
 CLASS_INDEX_PATH = 'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json'
 
 
-def preprocess_input(x, data_format=None):
+def preprocess_input(x, data_format=None, mode='caffe'):
   """Preprocesses a tensor encoding a batch of images.
 
   Arguments:
       x: input Numpy tensor, 4D.
       data_format: data format of the image tensor.
+      mode: One of "caffe", "tf".
+          - caffe: will convert the images from RGB to BGR,
+              then will zero-center each color channel with
+              respect to the ImageNet dataset,
+              without scaling.
+          - tf: will scale pixels between -1 and 1,
+              sample-wise.
 
   Returns:
       Preprocessed tensor.
@@ -43,6 +50,12 @@ def preprocess_input(x, data_format=None):
     data_format = K.image_data_format()
   assert data_format in {'channels_last', 'channels_first'}
 
+  if mode == 'tf':
+    x /= 255.
+    x -= 0.5
+    x *= 2.
+    return x
+
   if data_format == 'channels_first':
     if x.ndim == 3:
       # 'RGB'->'BGR'
@@ -89,8 +102,10 @@ def decode_predictions(preds, top=5):
                      '(i.e. a 2D array of shape (samples, 1000)). '
                      'Found array with shape: ' + str(preds.shape))
   if CLASS_INDEX is None:
-    fpath = get_file(
-        'imagenet_class_index.json', CLASS_INDEX_PATH, cache_subdir='models')
+    fpath = get_file('imagenet_class_index.json',
+                     CLASS_INDEX_PATH,
+                     cache_subdir='models',
+                     file_hash='c2c37ea517e94d9795004a39431a14cb')
     CLASS_INDEX = json.load(open(fpath))
   results = []
   for pred in preds:
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd118a419293a95abf239a17bc7646e27699c3d
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2.py
@@ -0,0 +1,374 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inception-ResNet V2 model for Keras.
+
+# Reference
+- [Inception-v4, Inception-ResNet and the Impact of
+   Residual Connections on Learning](https://arxiv.org/abs/1602.07261)
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
+from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
+from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
+from tensorflow.python.keras._impl.keras.layers import Activation
+from tensorflow.python.keras._impl.keras.layers import AveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import BatchNormalization
+from tensorflow.python.keras._impl.keras.layers import Concatenate
+from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import Dense
+from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
+from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import Input
+from tensorflow.python.keras._impl.keras.layers import Lambda
+from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.models import Model
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+
+BASE_WEIGHT_URL = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.7/'
+
+
+def preprocess_input(x):
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 4D numpy array consists of RGB values within [0, 255].
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, mode='tf')
+
+
+def conv2d_bn(x,
+              filters,
+              kernel_size,
+              strides=1,
+              padding='same',
+              activation='relu',
+              use_bias=False,
+              name=None):
+  """Utility function to apply conv + BN.
+
+  Arguments:
+      x: input tensor.
+      filters: filters in `Conv2D`.
+      kernel_size: kernel size as in `Conv2D`.
+      strides: strides in `Conv2D`.
+      padding: padding mode in `Conv2D`.
+      activation: activation in `Conv2D`.
+      use_bias: whether to use a bias in `Conv2D`.
+      name: name of the ops; will become `name + '_ac'` for the activation
+          and `name + '_bn'` for the batch norm layer.
+
+  Returns:
+      Output tensor after applying `Conv2D` and `BatchNormalization`.
+  """
+  x = Conv2D(
+      filters,
+      kernel_size,
+      strides=strides,
+      padding=padding,
+      use_bias=use_bias,
+      name=name)(
+          x)
+  if not use_bias:
+    bn_axis = 1 if K.image_data_format() == 'channels_first' else 3
+    bn_name = None if name is None else name + '_bn'
+    x = BatchNormalization(axis=bn_axis, scale=False, name=bn_name)(x)
+  if activation is not None:
+    ac_name = None if name is None else name + '_ac'
+    x = Activation(activation, name=ac_name)(x)
+  return x
+
+
+def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
+  """Adds a Inception-ResNet block.
+
+  This function builds 3 types of Inception-ResNet blocks mentioned
+  in the paper, controlled by the `block_type` argument (which is the
+  block name used in the official TF-slim implementation):
+      - Inception-ResNet-A: `block_type='block35'`
+      - Inception-ResNet-B: `block_type='block17'`
+      - Inception-ResNet-C: `block_type='block8'`
+
+  Arguments:
+      x: input tensor.
+      scale: scaling factor to scale the residuals (i.e., the output of
+          passing `x` through an inception module) before adding them
+          to the shortcut branch. Let `r` be the output from the residual
+          branch, the output of this block will be `x + scale * r`.
+      block_type: `'block35'`, `'block17'` or `'block8'`, determines
+          the network structure in the residual branch.
+      block_idx: an `int` used for generating layer names. The Inception-ResNet
+        blocks
+          are repeated many times in this network. We use `block_idx` to
+            identify
+          each of the repetitions. For example, the first Inception-ResNet-A
+            block
+          will have `block_type='block35', block_idx=0`, ane the layer names
+            will have
+          a common prefix `'block35_0'`.
+      activation: activation function to use at the end of the block
+          (see [activations](../activations.md)).
+          When `activation=None`, no activation is applied
+          (i.e., "linear" activation: `a(x) = x`).
+
+  Returns:
+      Output tensor for the block.
+
+  Raises:
+      ValueError: if `block_type` is not one of `'block35'`,
+          `'block17'` or `'block8'`.
+  """
+  if block_type == 'block35':
+    branch_0 = conv2d_bn(x, 32, 1)
+    branch_1 = conv2d_bn(x, 32, 1)
+    branch_1 = conv2d_bn(branch_1, 32, 3)
+    branch_2 = conv2d_bn(x, 32, 1)
+    branch_2 = conv2d_bn(branch_2, 48, 3)
+    branch_2 = conv2d_bn(branch_2, 64, 3)
+    branches = [branch_0, branch_1, branch_2]
+  elif block_type == 'block17':
+    branch_0 = conv2d_bn(x, 192, 1)
+    branch_1 = conv2d_bn(x, 128, 1)
+    branch_1 = conv2d_bn(branch_1, 160, [1, 7])
+    branch_1 = conv2d_bn(branch_1, 192, [7, 1])
+    branches = [branch_0, branch_1]
+  elif block_type == 'block8':
+    branch_0 = conv2d_bn(x, 192, 1)
+    branch_1 = conv2d_bn(x, 192, 1)
+    branch_1 = conv2d_bn(branch_1, 224, [1, 3])
+    branch_1 = conv2d_bn(branch_1, 256, [3, 1])
+    branches = [branch_0, branch_1]
+  else:
+    raise ValueError('Unknown Inception-ResNet block type. '
+                     'Expects "block35", "block17" or "block8", '
+                     'but got: ' + str(block_type))
+
+  block_name = block_type + '_' + str(block_idx)
+  channel_axis = 1 if K.image_data_format() == 'channels_first' else 3
+  mixed = Concatenate(axis=channel_axis, name=block_name + '_mixed')(branches)
+  up = conv2d_bn(
+      mixed,
+      K.int_shape(x)[channel_axis],
+      1,
+      activation=None,
+      use_bias=True,
+      name=block_name + '_conv')
+
+  x = Lambda(
+      lambda inputs, scale: inputs[0] + inputs[1] * scale,
+      arguments={'scale': scale},
+      name=block_name)([x, up])
+  if activation is not None:
+    x = Activation(activation, name=block_name + '_ac')(x)
+  return x
+
+
+def InceptionResNetV2(include_top=True,  # pylint: disable=invalid-name
+                      weights='imagenet',
+                      input_tensor=None,
+                      input_shape=None,
+                      pooling=None,
+                      classes=1000):
+  """Instantiates the Inception-ResNet v2 architecture.
+
+  Optionally loads weights pre-trained on ImageNet.
+  Note that when using TensorFlow, for best performance you should
+  set `"image_data_format": "channels_last"` in your Keras config
+  at `~/.keras/keras.json`.
+
+  The model and the weights are compatible with TensorFlow, Theano and
+  CNTK backends. The data format convention used by the model is
+  the one specified in your Keras config file.
+
+  Note that the default input image size for this model is 299x299, instead
+  of 224x224 as in the VGG16 and ResNet models. Also, the input preprocessing
+  function is different (i.e., do not use `imagenet_utils.preprocess_input()`
+  with this model. Use `preprocess_input()` defined in this module instead).
+
+  Arguments:
+      include_top: whether to include the fully-connected
+          layer at the top of the network.
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
+      input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
+          to use as image input for the model.
+      input_shape: optional shape tuple, only to be specified
+          if `include_top` is `False` (otherwise the input shape
+          has to be `(299, 299, 3)` (with `'channels_last'` data format)
+          or `(3, 299, 299)` (with `'channels_first'` data format).
+          It should have exactly 3 inputs channels,
+          and width and height should be no smaller than 139.
+          E.g. `(150, 150, 3)` would be one valid value.
+      pooling: Optional pooling mode for feature extraction
+          when `include_top` is `False`.
+          - `None` means that the output of the model will be
+              the 4D tensor output of the last convolutional layer.
+          - `'avg'` means that global average pooling
+              will be applied to the output of the
+              last convolutional layer, and thus
+              the output of the model will be a 2D tensor.
+          - `'max'` means that global max pooling will be applied.
+      classes: optional number of classes to classify images
+          into, only to be specified if `include_top` is `True`, and
+          if no `weights` argument is specified.
+
+  Returns:
+      A Keras `Model` instance.
+
+  Raises:
+      ValueError: in case of invalid argument for `weights`,
+          or invalid input shape.
+  """
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
+    raise ValueError('The `weights` argument should be either '
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
+
+  if weights == 'imagenet' and include_top and classes != 1000:
+    raise ValueError('If using `weights` as imagenet with `include_top`'
+                     ' as true, `classes` should be 1000')
+
+  # Determine proper input shape
+  input_shape = _obtain_input_shape(
+      input_shape,
+      default_size=299,
+      min_size=139,
+      data_format=K.image_data_format(),
+      require_flatten=False,
+      weights=weights)
+
+  if input_tensor is None:
+    img_input = Input(shape=input_shape)
+  else:
+    if not K.is_keras_tensor(input_tensor):
+      img_input = Input(tensor=input_tensor, shape=input_shape)
+    else:
+      img_input = input_tensor
+
+  # Stem block: 35 x 35 x 192
+  x = conv2d_bn(img_input, 32, 3, strides=2, padding='valid')
+  x = conv2d_bn(x, 32, 3, padding='valid')
+  x = conv2d_bn(x, 64, 3)
+  x = MaxPooling2D(3, strides=2)(x)
+  x = conv2d_bn(x, 80, 1, padding='valid')
+  x = conv2d_bn(x, 192, 3, padding='valid')
+  x = MaxPooling2D(3, strides=2)(x)
+
+  # Mixed 5b (Inception-A block): 35 x 35 x 320
+  branch_0 = conv2d_bn(x, 96, 1)
+  branch_1 = conv2d_bn(x, 48, 1)
+  branch_1 = conv2d_bn(branch_1, 64, 5)
+  branch_2 = conv2d_bn(x, 64, 1)
+  branch_2 = conv2d_bn(branch_2, 96, 3)
+  branch_2 = conv2d_bn(branch_2, 96, 3)
+  branch_pool = AveragePooling2D(3, strides=1, padding='same')(x)
+  branch_pool = conv2d_bn(branch_pool, 64, 1)
+  branches = [branch_0, branch_1, branch_2, branch_pool]
+  channel_axis = 1 if K.image_data_format() == 'channels_first' else 3
+  x = Concatenate(axis=channel_axis, name='mixed_5b')(branches)
+
+  # 10x block35 (Inception-ResNet-A block): 35 x 35 x 320
+  for block_idx in range(1, 11):
+    x = inception_resnet_block(
+        x, scale=0.17, block_type='block35', block_idx=block_idx)
+
+  # Mixed 6a (Reduction-A block): 17 x 17 x 1088
+  branch_0 = conv2d_bn(x, 384, 3, strides=2, padding='valid')
+  branch_1 = conv2d_bn(x, 256, 1)
+  branch_1 = conv2d_bn(branch_1, 256, 3)
+  branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding='valid')
+  branch_pool = MaxPooling2D(3, strides=2, padding='valid')(x)
+  branches = [branch_0, branch_1, branch_pool]
+  x = Concatenate(axis=channel_axis, name='mixed_6a')(branches)
+
+  # 20x block17 (Inception-ResNet-B block): 17 x 17 x 1088
+  for block_idx in range(1, 21):
+    x = inception_resnet_block(
+        x, scale=0.1, block_type='block17', block_idx=block_idx)
+
+  # Mixed 7a (Reduction-B block): 8 x 8 x 2080
+  branch_0 = conv2d_bn(x, 256, 1)
+  branch_0 = conv2d_bn(branch_0, 384, 3, strides=2, padding='valid')
+  branch_1 = conv2d_bn(x, 256, 1)
+  branch_1 = conv2d_bn(branch_1, 288, 3, strides=2, padding='valid')
+  branch_2 = conv2d_bn(x, 256, 1)
+  branch_2 = conv2d_bn(branch_2, 288, 3)
+  branch_2 = conv2d_bn(branch_2, 320, 3, strides=2, padding='valid')
+  branch_pool = MaxPooling2D(3, strides=2, padding='valid')(x)
+  branches = [branch_0, branch_1, branch_2, branch_pool]
+  x = Concatenate(axis=channel_axis, name='mixed_7a')(branches)
+
+  # 10x block8 (Inception-ResNet-C block): 8 x 8 x 2080
+  for block_idx in range(1, 10):
+    x = inception_resnet_block(
+        x, scale=0.2, block_type='block8', block_idx=block_idx)
+  x = inception_resnet_block(
+      x, scale=1., activation=None, block_type='block8', block_idx=10)
+
+  # Final convolution block: 8 x 8 x 1536
+  x = conv2d_bn(x, 1536, 1, name='conv_7b')
+
+  if include_top:
+    # Classification block
+    x = GlobalAveragePooling2D(name='avg_pool')(x)
+    x = Dense(classes, activation='softmax', name='predictions')(x)
+  else:
+    if pooling == 'avg':
+      x = GlobalAveragePooling2D()(x)
+    elif pooling == 'max':
+      x = GlobalMaxPooling2D()(x)
+
+  # Ensure that the model takes into account
+  # any potential predecessors of `input_tensor`
+  if input_tensor is not None:
+    inputs = get_source_inputs(input_tensor)
+  else:
+    inputs = img_input
+
+  # Create model
+  model = Model(inputs, x, name='inception_resnet_v2')
+
+  # Load weights
+  if weights == 'imagenet':
+    if include_top:
+      fname = 'inception_resnet_v2_weights_tf_dim_ordering_tf_kernels.h5'
+      weights_path = get_file(
+          fname,
+          BASE_WEIGHT_URL + fname,
+          cache_subdir='models',
+          file_hash='e693bd0210a403b3192acc6073ad2e96')
+    else:
+      fname = 'inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5'
+      weights_path = get_file(
+          fname,
+          BASE_WEIGHT_URL + fname,
+          cache_subdir='models',
+          file_hash='d19885ff4a710c122648d3b5c3b684e4')
+    model.load_weights(weights_path)
+
+  return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2_test.py b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..de71e9615a09ecdf07a51fff0b3ee3b1d8ca50ca
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_resnet_v2_test.py
@@ -0,0 +1,59 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Inception V3 application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class InceptionResNetV2Test(test.TestCase):
+
+  def test_with_top(self):
+    model = keras.applications.InceptionResNetV2(weights=None)
+    self.assertEqual(model.output_shape, (None, 1000))
+
+  def test_no_top(self):
+    model = keras.applications.InceptionResNetV2(weights=None,
+                                                 include_top=False)
+    self.assertEqual(model.output_shape, (None, None, None, 1536))
+
+  def test_with_pooling(self):
+    model = keras.applications.InceptionResNetV2(weights=None,
+                                                 include_top=False,
+                                                 pooling='avg')
+    self.assertEqual(model.output_shape, (None, 1536))
+
+  def test_weight_loading(self):
+    with self.assertRaises(ValueError):
+      keras.applications.InceptionResNetV2(weights='unknown',
+                                           include_top=False)
+    with self.assertRaises(ValueError):
+      keras.applications.InceptionResNetV2(weights='imagenet',
+                                           classes=2000)
+
+  def test_preprocess_input(self):
+    x = np.random.uniform(0, 255, (2, 300, 200, 3))
+    out1 = keras.applications.inception_resnet_v2.preprocess_input(x)
+    self.assertAllClose(np.mean(out1), 0., atol=0.1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
index edb4c60f8a58553a355245558b30d815000b3e11..3a17c647dd1435085c7a4ad48f0361d0fba723c1 100644
--- a/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/_impl/keras/applications/inception_v3.py
@@ -29,8 +29,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
@@ -117,8 +120,9 @@ def InceptionV3(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -150,10 +154,12 @@ def InceptionV3(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -374,19 +380,26 @@ def InceptionV3(include_top=True,
           'inception_v3_weights_tf_dim_ordering_tf_kernels.h5',
           WEIGHTS_PATH,
           cache_subdir='models',
-          md5_hash='9a0d58056eeedaa3f26cb7ebd46da564')
+          file_hash='9a0d58056eeedaa3f26cb7ebd46da564')
     else:
       weights_path = get_file(
           'inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5',
           WEIGHTS_PATH_NO_TOP,
           cache_subdir='models',
-          md5_hash='bcbd6486424b2319ff4ef7d526e38f63')
+          file_hash='bcbd6486424b2319ff4ef7d526e38f63')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
 
 
 def preprocess_input(x):
-  x /= 255.
-  x -= 0.5
-  x *= 2.
-  return x
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 4D numpy array consists of RGB values within [0, 255].
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, mode='tf')
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index f6482d25496521c3926e39bdd69239a81884ece4..9179422d0e3d17fc877b2bae136246755d71f2b2 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -67,12 +67,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import warnings
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.engine import InputSpec
@@ -97,10 +99,15 @@ def relu6(x):
 
 
 def preprocess_input(x):
-  x /= 255.
-  x -= 0.5
-  x *= 2.
-  return x
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 4D numpy array consists of RGB values within [0, 255].
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, mode='tf')
 
 
 class DepthwiseConv2D(Conv2D):
@@ -342,8 +349,9 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
       dropout: dropout rate
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: `None` (random initialization) or
-          `imagenet` (ImageNet weights)
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of
           `layers.Input()`)
           to use as image input for the model.
@@ -378,10 +386,12 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
                        'as other backends do not support '
                        'depthwise convolution.')
 
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as ImageNet with `include_top` '
@@ -531,6 +541,8 @@ def MobileNet(input_shape=None,  # pylint: disable=invalid-name
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
 
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index f0cff2d686f321e4ec86a85efc8a844576fc7fcf..5238ba70c1c6ac221145c4b467d62d0fafea9b54 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -26,6 +26,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
@@ -56,7 +58,7 @@ def identity_block(input_tensor, kernel_size, filters, stage, block):
   Arguments:
       input_tensor: input tensor
       kernel_size: default 3, the kernel size of middle conv layer at main path
-      filters: list of integers, the filterss of 3 conv layer at main path
+      filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
 
@@ -95,7 +97,7 @@ def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2,
   Arguments:
       input_tensor: input tensor
       kernel_size: default 3, the kernel size of middle conv layer at main path
-      filters: list of integers, the filterss of 3 conv layer at main path
+      filters: list of integers, the filters of 3 conv layer at main path
       stage: integer, current stage label, used for generating layer names
       block: 'a','b'..., current block label, used for generating layer names
       strides: Tuple of integers.
@@ -161,8 +163,9 @@ def ResNet50(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -194,10 +197,12 @@ def ResNet50(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -283,4 +288,6 @@ def ResNet50(include_top=True,
           cache_subdir='models',
           md5_hash='a268eb855778b3df3c7506639542a6af')
     model.load_weights(weights_path)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg16.py b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
index 485b486e9d826795d3499b978d119609050bd7de..aa2616070959d20428e8321c709edc311bd1bc4e 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg16.py
@@ -25,6 +25,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
@@ -68,8 +70,9 @@ def VGG16(include_top=True,
   Arguments:
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -101,10 +104,12 @@ def VGG16(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -192,12 +197,14 @@ def VGG16(include_top=True,
       weights_path = get_file(
           'vgg16_weights_tf_dim_ordering_tf_kernels.h5',
           WEIGHTS_PATH,
-          cache_subdir='models')
+          cache_subdir='models',
+          file_hash='64373286793e3c8b2b4e3219cbf3544b')
     else:
       weights_path = get_file(
           'vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5',
           WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models')
+          cache_subdir='models',
+          file_hash='6d6bbae143d832006294945121d1f1fc')
     model.load_weights(weights_path)
     if K.backend() == 'theano':
       layer_utils.convert_all_kernels_in_model(model)
@@ -209,4 +216,6 @@ def VGG16(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+  elif weights is not None:
+    model.load_weights(weights)
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/vgg19.py b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
index 3af6417c8444453a9e9c3eef70097f520757f264..d842d0db6a3f500deb940bb4cf6ccab718dfa003 100644
--- a/tensorflow/python/keras/_impl/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/_impl/keras/applications/vgg19.py
@@ -25,6 +25,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
@@ -68,8 +70,9 @@ def VGG19(include_top=True,
   Arguments:
       include_top: whether to include the 3 fully-connected
           layers at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+         'imagenet' (pre-training on ImageNet),
+         or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -101,10 +104,12 @@ def VGG19(include_top=True,
       ValueError: in case of invalid argument for `weights`,
           or invalid input shape.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
+
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -198,12 +203,14 @@ def VGG19(include_top=True,
       weights_path = get_file(
           'vgg19_weights_tf_dim_ordering_tf_kernels.h5',
           WEIGHTS_PATH,
-          cache_subdir='models')
+          cache_subdir='models',
+          file_hash='cbe5617147190e668d6c5d5026f83318')
     else:
       weights_path = get_file(
           'vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5',
           WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models')
+          cache_subdir='models',
+          file_hash='253f8cb515780f3b799900260a226db6')
     model.load_weights(weights_path)
     if K.backend() == 'theano':
       layer_utils.convert_all_kernels_in_model(model)
@@ -215,4 +222,6 @@ def VGG19(include_top=True,
         dense = model.get_layer(name='fc1')
         layer_utils.convert_dense_weights_data_format(dense, shape,
                                                       'channels_first')
+  elif weights is not None:
+    model.load_weights(weights)
   return model
diff --git a/tensorflow/python/keras/_impl/keras/applications/xception.py b/tensorflow/python/keras/_impl/keras/applications/xception.py
index 6e521daa2d3b9eae33faecde6057e9fcc3222edc..25bc6288b27791886d663fcd94de4688d472d62b 100644
--- a/tensorflow/python/keras/_impl/keras/applications/xception.py
+++ b/tensorflow/python/keras/_impl/keras/applications/xception.py
@@ -36,8 +36,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import layers
+from tensorflow.python.keras._impl.keras.applications import imagenet_utils
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import _obtain_input_shape
 from tensorflow.python.keras._impl.keras.applications.imagenet_utils import decode_predictions  # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.engine.topology import get_source_inputs
@@ -79,8 +82,9 @@ def Xception(include_top=True,
   Arguments:
       include_top: whether to include the fully-connected
           layer at the top of the network.
-      weights: one of `None` (random initialization)
-          or "imagenet" (pre-training on ImageNet).
+      weights: one of `None` (random initialization),
+          'imagenet' (pre-training on ImageNet),
+          or the path to the weights file to be loaded.
       input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
           to use as image input for the model.
       input_shape: optional shape tuple, only to be specified
@@ -113,10 +117,11 @@ def Xception(include_top=True,
       RuntimeError: If attempting to run this model with a
           backend that does not support separable convolutions.
   """
-  if weights not in {'imagenet', None}:
+  if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
-                     '`None` (random initialization) or `imagenet` '
-                     '(pre-training on ImageNet).')
+                     '`None` (random initialization), `imagenet` '
+                     '(pre-training on ImageNet), '
+                     'or the path to the weights file to be loaded.')
 
   if weights == 'imagenet' and include_top and classes != 1000:
     raise ValueError('If using `weights` as imagenet with `include_top`'
@@ -287,21 +292,30 @@ def Xception(include_top=True,
       weights_path = get_file(
           'xception_weights_tf_dim_ordering_tf_kernels.h5',
           TF_WEIGHTS_PATH,
-          cache_subdir='models')
+          cache_subdir='models',
+          file_hash='0a58e3b7378bc2990ea3b43d5981f1f6')
     else:
       weights_path = get_file(
           'xception_weights_tf_dim_ordering_tf_kernels_notop.h5',
           TF_WEIGHTS_PATH_NO_TOP,
-          cache_subdir='models')
+          cache_subdir='models',
+          file_hash='b0042744bf5b25fce3cb969f33bebb97')
     model.load_weights(weights_path)
 
   if old_data_format:
     K.set_image_data_format(old_data_format)
+  elif weights is not None:
+    model.load_weights(weights)
   return model
 
 
 def preprocess_input(x):
-  x /= 255.
-  x -= 0.5
-  x *= 2.
-  return x
+  """Preprocesses a numpy array encoding a batch of images.
+
+  Arguments:
+      x: a 4D numpy array consists of RGB values within [0, 255].
+
+  Returns:
+      Preprocessed array.
+  """
+  return imagenet_utils.preprocess_input(x, mode='tf')
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index f02f6d10df996459aefa3fdf2d12286d8a710291..ec7a5dcffd0c0f0dda90bbc92de54af82680b607 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -42,6 +42,7 @@ from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients as gradients_module
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -71,7 +72,7 @@ _GRAPH_LEARNING_PHASES = {}
 
 # This dictionary holds a mapping {graph: UID_DICT}.
 # each UID_DICT is a dictionary mapping name prefixes to a current index,
-# used for generatic graph-specific string UIDs
+# used for generating graph-specific string UIDs
 # for various names (e.g. layer names).
 _GRAPH_UID_DICTS = {}
 
@@ -89,6 +90,11 @@ _EPSILON = 10e-8
 # Default image data format, one of "channels_last", "channels_first".
 _IMAGE_DATA_FORMAT = 'channels_last'
 
+# This list holds the available devices.
+# It is populated when `_get_available_gpus()` is called for the first time.
+# We assume our devices don't change henceforth.
+_LOCAL_DEVICES = None
+
 
 def backend():
   """Publicly accessible method for determining the current backend.
@@ -387,39 +393,83 @@ def set_session(session):
   _SESSION = session
 
 
-# VARIABLE MANIPULATION
+# DEVICE MANIPULATION
+
+
+class _TfDeviceCaptureOp(object):
+  """Class for capturing the TF device scope."""
 
+  def __init__(self):
+    self.device = None
 
-def _convert_string_dtype(dtype):
-  """Get the type from a string.
+  def _set_device(self, device):
+    """This method captures TF's explicit device scope setting."""
+    self.device = device
+
+
+def _get_current_tf_device():
+  """Return explicit device of current context, otherwise returns `None`.
+
+  Returns:
+      If the current device scope is explicitly set, it returns a string with
+      the device (`CPU` or `GPU`). If the scope is not explicitly set, it will
+      return `None`.
+  """
+  g = ops.get_default_graph()
+  op = _TfDeviceCaptureOp()
+  g._apply_device_functions(op)
+  return op.device
+
+
+def _is_current_explicit_device(device_type):
+  """Check if the current device is explicitly set on the device type specified.
 
   Arguments:
-      dtype: A string representation of a type.
+      device_type: A string containing `GPU` or `CPU` (case-insensitive).
 
   Returns:
-      The type requested.
+      A boolean indicating if the current device scope is explicitly set on the
+      device type.
 
   Raises:
-      ValueError: if `dtype` is not supported.
-  """
-  if dtype == 'float16':
-    return dtypes_module.float16
-  if dtype == 'float32':
-    return dtypes_module.float32
-  elif dtype == 'float64':
-    return dtypes_module.float64
-  elif dtype == 'int16':
-    return dtypes_module.int16
-  elif dtype == 'int32':
-    return dtypes_module.int32
-  elif dtype == 'int64':
-    return dtypes_module.int64
-  elif dtype == 'uint8':
-    return dtypes_module.int8
-  elif dtype == 'uint16':
-    return dtypes_module.uint16
-  else:
-    raise ValueError('Unsupported dtype:', dtype)
+      ValueError: If the `device_type` string indicates an unsupported device.
+  """
+  device_type = device_type.upper()
+  if device_type not in ['CPU', 'GPU']:
+    raise ValueError('device_type should be either "CPU" or "GPU".')
+  device = _get_current_tf_device()
+  return device is not None and device.device_type == device_type.upper()
+
+
+def _get_available_gpus():
+  """Get a list of available gpu devices (formatted as strings).
+
+  Returns:
+      A list of available GPU devices.
+  """
+  global _LOCAL_DEVICES
+  if _LOCAL_DEVICES is None:
+    _LOCAL_DEVICES = get_session().list_devices()
+  return [x.name for x in _LOCAL_DEVICES if x.device_type == 'GPU']
+
+
+def _has_nchw_support():
+  """Check whether the current scope supports NCHW ops.
+
+  Tensorflow does not support NCHW on CPU. Therefore we check if we are not
+  explicitly put on
+  CPU, and have GPUs available. In this case there will be soft-placing on the
+  GPU device.
+
+  Returns:
+      bool: if the current scope device placement would support nchw
+  """
+  explicitly_on_cpu = _is_current_explicit_device('CPU')
+  gpus_available = bool(_get_available_gpus())
+  return not explicitly_on_cpu and gpus_available
+
+
+# VARIABLE MANIPULATION
 
 
 def _to_tensor(x, dtype):
@@ -432,10 +482,7 @@ def _to_tensor(x, dtype):
   Returns:
       A tensor.
   """
-  x = ops.convert_to_tensor(x)
-  if x.dtype != dtype:
-    x = math_ops.cast(x, dtype)
-  return x
+  return ops.convert_to_tensor(x, dtype=dtype)
 
 
 def is_sparse(tensor):
@@ -530,7 +577,7 @@ def variable(value, dtype=None, name=None, constraint=None):
     return v
   v = variables_module.Variable(
       value,
-      dtype=_convert_string_dtype(dtype),
+      dtype=dtypes_module.as_dtype(dtype),
       name=name,
       constraint=constraint)
   if isinstance(value, np.ndarray):
@@ -548,17 +595,18 @@ def _initialize_variables(session):
   for v in variables:
     if not getattr(v, '_keras_initialized', False):
       candidate_vars.append(v)
-  # This step is expensive, so we only run it on variables not already
-  # marked as initialized.
-  is_initialized = session.run(
-      [variables_module.is_variable_initialized(v) for v in candidate_vars])
-  uninitialized_vars = []
-  for flag, v in zip(is_initialized, candidate_vars):
-    if not flag:
-      uninitialized_vars.append(v)
-    v._keras_initialized = True
-  if uninitialized_vars:
-    session.run(variables_module.variables_initializer(uninitialized_vars))
+  if candidate_vars:
+    # This step is expensive, so we only run it on variables not already
+    # marked as initialized.
+    is_initialized = session.run(
+        [variables_module.is_variable_initialized(v) for v in candidate_vars])
+    uninitialized_vars = []
+    for flag, v in zip(is_initialized, candidate_vars):
+      if not flag:
+        uninitialized_vars.append(v)
+      v._keras_initialized = True
+    if uninitialized_vars:
+      session.run(variables_module.variables_initializer(uninitialized_vars))
 
 
 def constant(value, dtype=None, shape=None, name=None):
@@ -714,7 +762,7 @@ def shape(x):
 
 
 def int_shape(x):
-  """Returns the shape tensor or variable as a tuple of int or None entries.
+  """Returns the shape of tensor or variable as a tuple of int or None entries.
 
   Arguments:
       x: Tensor or variable.
@@ -841,7 +889,7 @@ def zeros(shape, dtype=None, name=None):
   """
   if dtype is None:
     dtype = floatx()
-  tf_dtype = _convert_string_dtype(dtype)
+  tf_dtype = dtypes_module.as_dtype(dtype)
   return variable(
       init_ops.constant_initializer(0., dtype=tf_dtype)(shape), dtype, name)
 
@@ -869,7 +917,7 @@ def ones(shape, dtype=None, name=None):
   """
   if dtype is None:
     dtype = floatx()
-  tf_dtype = _convert_string_dtype(dtype)
+  tf_dtype = dtypes_module.as_dtype(dtype)
   return variable(
       init_ops.constant_initializer(1., dtype=tf_dtype)(shape), dtype, name)
 
@@ -896,7 +944,10 @@ def eye(size, dtype=None, name=None):
   ```
 
   """
-  return variable(np.eye(size), dtype, name)
+  if dtype is None:
+    dtype = floatx()
+  tf_dtype = dtypes_module.as_dtype(dtype)
+  return variable(linalg_ops.eye(size, dtype=tf_dtype), dtype, name)
 
 
 def zeros_like(x, dtype=None, name=None):
@@ -949,16 +1000,17 @@ def ones_like(x, dtype=None, name=None):
   return array_ops.ones_like(x, dtype=dtype, name=name)
 
 
-def identity(x):
+def identity(x, name=None):
   """Returns a tensor with the same content as the input tensor.
 
   Arguments:
       x: The input tensor.
+      name: String, name for the variable to create.
 
   Returns:
       A tensor of the same shape, type and content.
   """
-  return array_ops.identity(x)
+  return array_ops.identity(x, name=name)
 
 
 def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
@@ -988,7 +1040,7 @@ def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   """
   if dtype is None:
     dtype = floatx()
-  tf_dtype = _convert_string_dtype(dtype)
+  tf_dtype = dtypes_module.as_dtype(dtype)
   if seed is None:
     # ensure that randomness is conditioned by the Numpy RNG
     seed = np.random.randint(10e8)
@@ -1025,7 +1077,7 @@ def random_normal_variable(shape, mean, scale, dtype=None, name=None,
   """
   if dtype is None:
     dtype = floatx()
-  tf_dtype = _convert_string_dtype(dtype)
+  tf_dtype = dtypes_module.as_dtype(dtype)
   if seed is None:
     # ensure that randomness is conditioned by the Numpy RNG
     seed = np.random.randint(10e8)
@@ -1035,10 +1087,10 @@ def random_normal_variable(shape, mean, scale, dtype=None, name=None,
 
 
 def count_params(x):
-  """Returns the number of scalars in a Keras variable.
+  """Returns the static number of elements in a variable or tensor.
 
   Arguments:
-      x: Keras variable.
+      x: Variable or tensor.
 
   Returns:
       Integer, the number of scalars in `x`.
@@ -1053,8 +1105,7 @@ def count_params(x):
              [ 0.,  0.,  0.]], dtype=float32)
   ```
   """
-  shape = x.get_shape()
-  return np.prod([shape[i]._value for i in range(len(shape))])
+  return np.prod(x.get_shape().as_list())
 
 
 def cast(x, dtype):
@@ -2368,7 +2419,7 @@ def set_value(x, value):
           (of the same shape).
   """
   value = np.asarray(value, dtype=dtype(x))
-  tf_dtype = _convert_string_dtype(x.dtype.name.split('_')[0])
+  tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
   if hasattr(x, '_assign_placeholder'):
     assign_placeholder = x._assign_placeholder
     assign_op = x._assign_op
@@ -2392,7 +2443,7 @@ def batch_set_value(tuples):
     feed_dict = {}
     for x, value in tuples:
       value = np.asarray(value, dtype=dtype(x))
-      tf_dtype = _convert_string_dtype(x.dtype.name.split('_')[0])
+      tf_dtype = dtypes_module.as_dtype(x.dtype.name.split('_')[0])
       if hasattr(x, '_assign_placeholder'):
         assign_placeholder = x._assign_placeholder
         assign_op = x._assign_op
@@ -2409,6 +2460,16 @@ def batch_set_value(tuples):
 def print_tensor(x, message=''):
   """Prints `message` and the tensor value when evaluated.
 
+  Note that `print_tensor` returns a new tensor identical to `x`
+  which should be used in the following code. Otherwise the
+  print operation is not taken into account during evaluation.
+
+  Example:
+
+  ```python
+     >>> x = K.print_tensor(x, message="x is: ")
+  ```
+
   Arguments:
       x: Tensor to print.
       message: Message to print jointly with the tensor.
@@ -2425,11 +2486,21 @@ def print_tensor(x, message=''):
 class Function(object):
   """Runs a computation graph.
 
+  It's possible to pass arguments to `tf.Session.run()` via `session_kwargs`.
+  In particular additional operations via `fetches` argument and additional
+  tensor substitutions via `feed_dict` arguments. Note that given
+  substitutions are merged with substitutions from `inputs`. Even though
+  `feed_dict` is passed once in the constructor (called in `model.compile()`)
+  we can modify the values in the dictionary. Through this feed_dict we can
+  provide additional substitutions besides Keras inputs.
+
   Arguments:
       inputs: Feed placeholders to the computation graph.
       outputs: Output tensors to fetch.
       updates: Additional update ops to be run at function call.
-      name: a name to help users identify what this function does.
+      name: A name to help users identify what this function does.
+      session_kwargs: Arguments to `tf.Session.run()`: `fetches`, `feed_dict`,
+        `options`, `run_metadata`
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None,
@@ -2457,12 +2528,18 @@ class Function(object):
           updates_ops.append(update)
       self.updates_op = control_flow_ops.group(*updates_ops)
     self.name = name
+    # additional tensor substitutions
+    self.feed_dict = session_kwargs.pop('feed_dict', {})
+    # additional operations
+    self.fetches = session_kwargs.pop('fetches', [])
+    if not isinstance(self.fetches, list):
+      self.fetches = [self.fetches]
     self.session_kwargs = session_kwargs
 
   def __call__(self, inputs):
     if not isinstance(inputs, (list, tuple)):
       raise TypeError('`inputs` should be a list or tuple.')
-    feed_dict = {}
+    feed_dict = self.feed_dict.copy()
     for tensor, value in zip(self.inputs, inputs):
       if is_sparse(tensor):
         sparse_coo = value.tocoo()
@@ -2470,11 +2547,10 @@ class Function(object):
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
       feed_dict[tensor] = value
+    fetches = self.outputs + [self.updates_op] + self.fetches
     session = get_session()
     updated = session.run(
-        self.outputs + [self.updates_op],
-        feed_dict=feed_dict,
-        **self.session_kwargs)
+        fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
     return updated[:len(self.outputs)]
 
 
@@ -2605,6 +2681,9 @@ def rnn(step_function,
   if constants is None:
     constants = []
 
+  global uses_learning_phase  # pylint: disable=global-variable-undefined
+  uses_learning_phase = False
+
   if unroll:
     if not inputs.get_shape()[0]:
       raise ValueError('Unrolling requires a ' 'fixed number of timesteps.')
@@ -2623,6 +2702,8 @@ def rnn(step_function,
 
       for inp, mask_t in zip(input_list, mask_list):
         output, new_states = step_function(inp, states + constants)
+        if getattr(output, '_uses_learning_phase', False):
+          uses_learning_phase = True
 
         # tf.where needs its condition tensor
         # to be the same shape as its two
@@ -2662,6 +2743,8 @@ def rnn(step_function,
     else:
       for inp in input_list:
         output, states = step_function(inp, states + constants)
+        if getattr(output, '_uses_learning_phase', False):
+          uses_learning_phase = True
         successive_outputs.append(output)
         successive_states.append(states)
       last_output = successive_outputs[-1]
@@ -2715,6 +2798,9 @@ def rnn(step_function,
         mask_t = mask_ta.read(time)
         output, new_states = step_function(current_input,
                                            tuple(states) + tuple(constants))
+        if getattr(output, '_uses_learning_phase', False):
+          global uses_learning_phase  # pylint: disable=global-variable-undefined
+          uses_learning_phase = True
         for state, new_state in zip(states, new_states):
           new_state.set_shape(state.get_shape())
         tiled_mask_t = array_ops.tile(mask_t,
@@ -2743,6 +2829,9 @@ def rnn(step_function,
         current_input = input_ta.read(time)
         output, new_states = step_function(current_input,
                                            tuple(states) + tuple(constants))
+        if getattr(output, '_uses_learning_phase', False):
+          global uses_learning_phase  # pylint: disable=global-variable-undefined
+          uses_learning_phase = True
         for state, new_state in zip(states, new_states):
           new_state.set_shape(state.get_shape())
         output_ta_t = output_ta_t.write(time, output)
@@ -2763,6 +2852,7 @@ def rnn(step_function,
 
   axes = [1, 0] + list(range(2, len(outputs.get_shape())))
   outputs = array_ops.transpose(outputs, axes)
+  last_output._uses_learning_phase = uses_learning_phase
   return last_output, outputs, new_states
 
 
@@ -2773,28 +2863,59 @@ def switch(condition, then_expression, else_expression):
   should be symbolic tensors of the *same shape*.
 
   Arguments:
-      condition: scalar tensor (`int` or `bool`).
+      condition: tensor (`int` or `bool`).
       then_expression: either a tensor, or a callable that returns a tensor.
       else_expression: either a tensor, or a callable that returns a tensor.
 
   Returns:
       The selected tensor.
+
+  Raises:
+      ValueError: If rank of `condition` is greater than rank of expressions.
   """
   if condition.dtype != dtypes_module.bool:
     condition = math_ops.cast(condition, 'bool')
-  if not callable(then_expression):
+  cond_ndim = ndim(condition)
+  if not cond_ndim:
+    if not callable(then_expression):
 
-    def then_expression_fn():
-      return then_expression
-  else:
-    then_expression_fn = then_expression
-  if not callable(else_expression):
+      def then_expression_fn():
+        return then_expression
+    else:
+      then_expression_fn = then_expression
+    if not callable(else_expression):
 
-    def else_expression_fn():
-      return else_expression
+      def else_expression_fn():
+        return else_expression
+    else:
+      else_expression_fn = else_expression
+    x = control_flow_ops.cond(condition, then_expression_fn, else_expression_fn)
   else:
-    else_expression_fn = else_expression
-  x = control_flow_ops.cond(condition, then_expression_fn, else_expression_fn)
+    # tf.where needs its condition tensor
+    # to be the same shape as its two
+    # result tensors
+    if callable(then_expression):
+      then_expression = then_expression()
+    if callable(else_expression):
+      else_expression = else_expression()
+    expr_ndim = ndim(then_expression)
+    if cond_ndim > expr_ndim:
+      raise ValueError('Rank of `condition` should be less than or'
+                       ' equal to rank of `then_expression` and '
+                       '`else_expression`. ndim(condition)=' + str(cond_ndim) +
+                       ', ndim(then_expression)'
+                       '=' + str(expr_ndim))
+    if cond_ndim > 1:
+      ndim_diff = expr_ndim - cond_ndim
+      cond_shape = array_ops.concat(
+          [array_ops.shape(condition), [1] * ndim_diff], axis=0)
+      condition = array_ops.reshape(condition, cond_shape)
+      expr_shape = array_ops.shape(then_expression)
+      shape_diff = expr_shape - cond_shape
+      tile_shape = array_ops.where(shape_diff > 0, expr_shape,
+                                   array_ops.ones_like(expr_shape))
+      condition = array_ops.tile(condition, tile_shape)
+    x = array_ops.where(condition, then_expression, else_expression)
   return x
 
 
@@ -3127,45 +3248,23 @@ def in_top_k(predictions, targets, k):
 # CONVOLUTIONS
 
 
-def _preprocess_deconv_output_shape(x, shape, data_format):
-  """Get the output_shape for the deconvolution.
-
-  Arguments:
-      x: input tensor.
-      shape: output shape.
-      data_format: string, one of 'channels_last', 'channels_first'.
-
-  Returns:
-      The output shape.
-  """
-  if data_format == 'channels_first':
-    shape = (shape[0], shape[2], shape[3], shape[1])
-
-  if shape[0] is None:
-    shape = (array_ops.shape(x)[0],) + tuple(shape[1:])
-    shape = array_ops.stack(list(shape))
-  return shape
-
-
 def _preprocess_conv2d_input(x, data_format):
   """Transpose and cast the input before the conv2d.
 
   Arguments:
       x: input tensor.
-      data_format: string, one of 'channels_last', 'channels_first'.
+      data_format: string, `"channels_last"` or `"channels_first"`.
 
   Returns:
       A tensor.
   """
-  if dtype(x) == 'float64':
-    x = math_ops.cast(x, 'float32')
+  tf_data_format = 'NHWC'
   if data_format == 'channels_first':
-    # TF uses the last dimension as channel dimension,
-    # instead of the 2nd one.
-    # TH input shape: (samples, input_depth, rows, cols)
-    # TF input shape: (samples, rows, cols, input_depth)
-    x = array_ops.transpose(x, (0, 2, 3, 1))
-  return x
+    if not _has_nchw_support():
+      x = array_ops.transpose(x, (0, 2, 3, 1))  # NCHW -> NHWC
+    else:
+      tf_data_format = 'NCHW'
+  return x, tf_data_format
 
 
 def _preprocess_conv3d_input(x, data_format):
@@ -3173,16 +3272,18 @@ def _preprocess_conv3d_input(x, data_format):
 
   Arguments:
       x: input tensor.
-      data_format: string, one of 'channels_last', 'channels_first'.
+      data_format: string, `"channels_last"` or `"channels_first"`.
 
   Returns:
       A tensor.
   """
-  if dtype(x) == 'float64':
-    x = math_ops.cast(x, 'float32')
+  tf_data_format = 'NDHWC'
   if data_format == 'channels_first':
-    x = array_ops.transpose(x, (0, 2, 3, 4, 1))
-  return x
+    if not _has_nchw_support():
+      x = array_ops.transpose(x, (0, 2, 3, 4, 1))
+    else:
+      tf_data_format = 'NCDHW'
+  return x, tf_data_format
 
 
 def _preprocess_padding(padding):
@@ -3206,43 +3307,6 @@ def _preprocess_padding(padding):
   return padding
 
 
-def _postprocess_conv2d_output(x, data_format):
-  """Transpose and cast the output from conv2d if needed.
-
-  Arguments:
-      x: A tensor.
-      data_format: string, one of "channels_last", "channels_first".
-
-  Returns:
-      A tensor.
-  """
-
-  if data_format == 'channels_first':
-    x = array_ops.transpose(x, (0, 3, 1, 2))
-
-  if floatx() == 'float64':
-    x = math_ops.cast(x, 'float64')
-  return x
-
-
-def _postprocess_conv3d_output(x, data_format):
-  """Transpose and cast the output from conv3d if needed.
-
-  Arguments:
-      x: A tensor.
-      data_format: string, one of "channels_last", "channels_first".
-
-  Returns:
-      A tensor.
-  """
-  if data_format == 'channels_first':
-    x = array_ops.transpose(x, (0, 4, 1, 2, 3))
-
-  if floatx() == 'float64':
-    x = math_ops.cast(x, 'float64')
-  return x
-
-
 def conv1d(x,
            kernel,
            strides=1,
@@ -3261,7 +3325,16 @@ def conv1d(x,
 
   Returns:
       A tensor, result of 1D convolution.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
   """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+
   kernel_shape = kernel.get_shape().as_list()
   if padding == 'causal':
     # causal (dilated) convolution:
@@ -3313,10 +3386,7 @@ def conv2d(x,
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
 
-  # With 4d inputs, nn.convolution only supports
-  # data_format NHWC, so we transpose the inputs
-  # in case we are in data_format channels_first.
-  x = _preprocess_conv2d_input(x, data_format)
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
   padding = _preprocess_padding(padding)
   x = nn.convolution(
       input=x,
@@ -3324,8 +3394,10 @@ def conv2d(x,
       dilation_rate=dilation_rate,
       strides=strides,
       padding=padding,
-      data_format='NHWC')
-  return _postprocess_conv2d_output(x, data_format)
+      data_format=tf_data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
+  return x
 
 
 def conv2d_transpose(x,
@@ -3344,8 +3416,8 @@ def conv2d_transpose(x,
       output_shape: 1D int tensor for the output shape.
       strides: strides tuple.
       padding: string, `"same"` or `"valid"`.
-      data_format: `"channels_last"` or `"channels_first"`.
-          Whether to use Theano or TensorFlow data format
+      data_format: string, `"channels_last"` or `"channels_first"`.
+          Whether to use Theano or TensorFlow/CNTK data format
           for inputs/kernels/outputs.
 
   Returns:
@@ -3362,13 +3434,30 @@ def conv2d_transpose(x,
   if isinstance(output_shape, (tuple, list)):
     output_shape = array_ops.stack(output_shape)
 
-  x = _preprocess_conv2d_input(x, data_format)
-  output_shape = _preprocess_deconv_output_shape(x, output_shape, data_format)
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
+
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    output_shape = (output_shape[0], output_shape[2], output_shape[3],
+                    output_shape[1])
+  if output_shape[0] is None:
+    output_shape = (array_ops.shape(x)[0],) + tuple(output_shape[1:])
+    output_shape = array_ops.stack(list(output_shape))
+
   padding = _preprocess_padding(padding)
-  strides = (1,) + strides + (1,)
+  if tf_data_format == 'NHWC':
+    strides = (1,) + strides + (1,)
+  else:
+    strides = (1, 1) + strides
 
-  x = nn.conv2d_transpose(x, kernel, output_shape, strides, padding=padding)
-  x = _postprocess_conv2d_output(x, data_format)
+  x = nn.conv2d_transpose(
+      x,
+      kernel,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=tf_data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
   return x
 
 
@@ -3386,8 +3475,8 @@ def separable_conv2d(x,
       depthwise_kernel: convolution kernel for the depthwise convolution.
       pointwise_kernel: kernel for the 1x1 convolution.
       strides: strides tuple (length 2).
-      padding: padding mode, "valid" or "same".
-      data_format: data format, "channels_first" or "channels_last".
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
       dilation_rate: tuple of integers,
           dilation rates for the separable convolution.
 
@@ -3403,9 +3492,12 @@ def separable_conv2d(x,
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
 
-  x = _preprocess_conv2d_input(x, data_format)
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
   padding = _preprocess_padding(padding)
-  strides = (1,) + strides + (1,)
+  if tf_data_format == 'NHWC':
+    strides = (1,) + strides + (1,)
+  else:
+    strides = (1, 1) + strides
 
   x = nn.separable_conv2d(
       x,
@@ -3413,44 +3505,59 @@ def separable_conv2d(x,
       pointwise_kernel,
       strides=strides,
       padding=padding,
-      rate=dilation_rate)
-  return _postprocess_conv2d_output(x, data_format)
+      rate=dilation_rate,
+      data_format=tf_data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
+  return x
 
 
-def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
-                     data_format=None, dilation_rate=(1, 1)):
+def depthwise_conv2d(x,
+                     depthwise_kernel,
+                     strides=(1, 1),
+                     padding='valid',
+                     data_format=None,
+                     dilation_rate=(1, 1)):
   """2D convolution with separable filters.
 
   Arguments:
-    x: input tensor
-    depthwise_kernel: convolution kernel for the depthwise convolution.
-    strides: strides tuple (length 2).
-    padding: string, `"same"` or `"valid"`.
-    data_format: string, `"channels_last"` or `"channels_first"`.
-    dilation_rate: tuple of integers,
-        dilation rates for the separable convolution.
+      x: input tensor
+      depthwise_kernel: convolution kernel for the depthwise convolution.
+      strides: strides tuple (length 2).
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+      dilation_rate: tuple of integers,
+          dilation rates for the separable convolution.
 
   Returns:
-    Output tensor.
+      Output tensor.
 
   Raises:
-    ValueError: if `data_format` is neither `channels_last`
-      or `channels_first`.
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
   """
   if data_format is None:
     data_format = image_data_format()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
 
-  x = _preprocess_conv2d_input(x, data_format)
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
   padding = _preprocess_padding(padding)
-  strides = (1,) + strides + (1,)
+  if tf_data_format == 'NHWC':
+    strides = (1,) + strides + (1,)
+  else:
+    strides = (1, 1) + strides
 
-  x = nn.depthwise_conv2d(x, depthwise_kernel,
-                          strides=strides,
-                          padding=padding,
-                          rate=dilation_rate)
-  return _postprocess_conv2d_output(x, data_format)
+  x = nn.depthwise_conv2d(
+      x,
+      depthwise_kernel,
+      strides=strides,
+      padding=padding,
+      rate=dilation_rate,
+      data_format=tf_data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
+  return x
 
 
 def conv3d(x,
@@ -3466,8 +3573,8 @@ def conv3d(x,
       kernel: kernel tensor.
       strides: strides tuple.
       padding: string, `"same"` or `"valid"`.
-      data_format: `"channels_last"` or `"channels_first"`.
-          Whether to use Theano or TensorFlow data format
+      data_format: string, `"channels_last"` or `"channels_first"`.
+          Whether to use Theano or TensorFlow/CNTK data format
           for inputs/kernels/outputs.
       dilation_rate: tuple of 3 integers.
 
@@ -3483,10 +3590,7 @@ def conv3d(x,
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
 
-  # With 5d inputs, nn.convolution only supports
-  # data_format NDHWC, so we transpose the inputs
-  # in case we are in data_format channels_first.
-  x = _preprocess_conv3d_input(x, data_format)
+  x, tf_data_format = _preprocess_conv3d_input(x, data_format)
   padding = _preprocess_padding(padding)
   x = nn.convolution(
       input=x,
@@ -3494,8 +3598,71 @@ def conv3d(x,
       dilation_rate=dilation_rate,
       strides=strides,
       padding=padding,
-      data_format='NDHWC')
-  return _postprocess_conv3d_output(x, data_format)
+      data_format=tf_data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NDHWC':
+    x = array_ops.transpose(x, (0, 4, 1, 2, 3))
+  return x
+
+
+def conv3d_transpose(x,
+                     kernel,
+                     output_shape,
+                     strides=(1, 1, 1),
+                     padding='valid',
+                     data_format=None):
+  """3D deconvolution (i.e.
+
+  transposed convolution).
+
+  Arguments:
+      x: input tensor.
+      kernel: kernel tensor.
+      output_shape: 1D int tensor for the output shape.
+      strides: strides tuple.
+      padding: string, "same" or "valid".
+      data_format: string, `"channels_last"` or `"channels_first"`.
+          Whether to use Theano or TensorFlow/CNTK data format
+          for inputs/kernels/outputs.
+
+  Returns:
+      A tensor, result of transposed 3D convolution.
+
+  Raises:
+      ValueError: if `data_format` is neither `channels_last` or
+      `channels_first`.
+  """
+  if data_format is None:
+    data_format = image_data_format()
+  if data_format not in {'channels_first', 'channels_last'}:
+    raise ValueError('Unknown data_format ' + str(data_format))
+  if isinstance(output_shape, (tuple, list)):
+    output_shape = array_ops.stack(output_shape)
+
+  x, tf_data_format = _preprocess_conv3d_input(x, data_format)
+
+  if data_format == 'channels_first' and tf_data_format == 'NDHWC':
+    output_shape = (output_shape[0], output_shape[2], output_shape[3],
+                    output_shape[4], output_shape[1])
+  if output_shape[0] is None:
+    output_shape = (array_ops.shape(x)[0],) + tuple(output_shape[1:])
+    output_shape = array_ops.stack(list(output_shape))
+
+  padding = _preprocess_padding(padding)
+  if tf_data_format == 'NDHWC':
+    strides = (1,) + strides + (1,)
+  else:
+    strides = (1, 1) + strides
+
+  x = nn.conv3d_transpose(
+      x,
+      kernel,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=tf_data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NDHWC':
+    x = array_ops.transpose(x, (0, 4, 1, 2, 3))
+  return x
 
 
 def pool2d(x,
@@ -3510,37 +3677,44 @@ def pool2d(x,
       x: Tensor or variable.
       pool_size: tuple of 2 integers.
       strides: tuple of 2 integers.
-      padding: one of `"valid"`, `"same"`.
-      data_format: one of `"channels_first"`, `"channels_last"`.
-      pool_mode: one of `"max"`, `"avg"`.
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+      pool_mode: string, `"max"` or `"avg"`.
 
   Returns:
       A tensor, result of 2D pooling.
 
   Raises:
-      ValueError: if `data_format` is neither `channels_last` or
-      `channels_first`.
-      ValueError: if `pool_mode` is neither `max` or `avg`.
+      ValueError: if `data_format` is neither `"channels_last"` or
+      `"channels_first"`.
+      ValueError: if `pool_mode` is neither `"max"` or `"avg"`.
   """
   if data_format is None:
     data_format = image_data_format()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
 
+  x, tf_data_format = _preprocess_conv2d_input(x, data_format)
   padding = _preprocess_padding(padding)
-  strides = (1,) + strides + (1,)
-  pool_size = (1,) + pool_size + (1,)
-
-  x = _preprocess_conv2d_input(x, data_format)
+  if tf_data_format == 'NHWC':
+    strides = (1,) + strides + (1,)
+    pool_size = (1,) + pool_size + (1,)
+  else:
+    strides = (1, 1) + strides
+    pool_size = (1, 1) + pool_size
 
   if pool_mode == 'max':
-    x = nn.max_pool(x, pool_size, strides, padding=padding)
+    x = nn.max_pool(
+        x, pool_size, strides, padding=padding, data_format=tf_data_format)
   elif pool_mode == 'avg':
-    x = nn.avg_pool(x, pool_size, strides, padding=padding)
+    x = nn.avg_pool(
+        x, pool_size, strides, padding=padding, data_format=tf_data_format)
   else:
     raise ValueError('Invalid pooling mode:', pool_mode)
 
-  return _postprocess_conv2d_output(x, data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NHWC':
+    x = array_ops.transpose(x, (0, 3, 1, 2))  # NHWC -> NCHW
+  return x
 
 
 def pool3d(x,
@@ -3555,37 +3729,44 @@ def pool3d(x,
       x: Tensor or variable.
       pool_size: tuple of 3 integers.
       strides: tuple of 3 integers.
-      padding: one of `"valid"`, `"same"`.
-      data_format: one of `"channels_first"`, `"channels_last"`.
-      pool_mode: one of `"max"`, `"avg"`.
+      padding: string, `"same"` or `"valid"`.
+      data_format: string, `"channels_last"` or `"channels_first"`.
+      pool_mode: string, `"max"` or `"avg"`.
 
   Returns:
       A tensor, result of 3D pooling.
 
   Raises:
-      ValueError: if `data_format` is neither
-          `channels_last` or `channels_first`.
-      ValueError: if `pool_mode` is neither `max` or `avg`.
+      ValueError: if `data_format` is neither `"channels_last"` or
+      `"channels_first"`.
+      ValueError: if `pool_mode` is neither `"max"` or `"avg"`.
   """
   if data_format is None:
     data_format = image_data_format()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('Unknown data_format ' + str(data_format))
 
+  x, tf_data_format = _preprocess_conv3d_input(x, data_format)
   padding = _preprocess_padding(padding)
-  strides = (1,) + strides + (1,)
-  pool_size = (1,) + pool_size + (1,)
-
-  x = _preprocess_conv3d_input(x, data_format)
+  if tf_data_format == 'NDHWC':
+    strides = (1,) + strides + (1,)
+    pool_size = (1,) + pool_size + (1,)
+  else:
+    strides = (1, 1) + strides
+    pool_size = (1, 1) + pool_size
 
   if pool_mode == 'max':
-    x = nn.max_pool3d(x, pool_size, strides, padding=padding)
+    x = nn.max_pool3d(
+        x, pool_size, strides, padding=padding, data_format=tf_data_format)
   elif pool_mode == 'avg':
-    x = nn.avg_pool3d(x, pool_size, strides, padding=padding)
+    x = nn.avg_pool3d(
+        x, pool_size, strides, padding=padding, data_format=tf_data_format)
   else:
     raise ValueError('Invalid pooling mode:', pool_mode)
 
-  return _postprocess_conv3d_output(x, data_format)
+  if data_format == 'channels_first' and tf_data_format == 'NDHWC':
+    x = array_ops.transpose(x, (0, 4, 1, 2, 3))
+  return x
 
 
 def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
@@ -3860,10 +4041,10 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
 
 
 # CTC
-# tensorflow has a native implemenation, but it uses sparse tensors
+# TensorFlow has a native implementation, but it uses sparse tensors
 # and therefore requires a wrapper for Keras. The functions below convert
 # dense to sparse tensors and also wraps up the beam search code that is
-# in tensorflow's CTC implementation
+# in TensorFlow's CTC implementation
 
 
 def ctc_label_dense_to_sparse(labels, label_lengths):
diff --git a/tensorflow/python/keras/_impl/keras/backend_test.py b/tensorflow/python/keras/_impl/keras/backend_test.py
index d914490f7e42aa9dc67af44afde160572fcb8642..e45e566dcac62a2d91c8e6d68caa5c15d8d80244 100644
--- a/tensorflow/python/keras/_impl/keras/backend_test.py
+++ b/tensorflow/python/keras/_impl/keras/backend_test.py
@@ -165,6 +165,55 @@ class BackendUtilsTest(test.TestCase):
     for y in ys:
       self.assertEqual(y.op.name[:12], 'StopGradient')
 
+  def test_function_tf_fetches(self):
+    # Additional operations can be passed to tf.Session().run() via its
+    # `fetches` arguments. In contrast to `updates` argument of
+    # keras.backend.function() these do not have control dependency on `outputs`
+    # so they can run in parallel. Also they should not contribute to output of
+    # keras.backend.function().
+    with self.test_session():
+      x = keras.backend.variable(0.)
+      y = keras.backend.variable(0.)
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      f = keras.backend.function(inputs=[x_placeholder, y_placeholder],
+                                 outputs=[x_placeholder + y_placeholder],
+                                 updates=[(x, x_placeholder + 1.)],
+                                 fetches=[keras.backend.update(y, 5.)])
+      output = f([10., 20.])
+      assert output == [30.]
+      assert keras.backend.get_session().run(fetches=[x, y]) == [11., 5.]
+
+  def test_function_tf_feed_dict(self):
+    # Additional substitutions can be passed to `tf.Session().run()` via its
+    # `feed_dict` arguments. Note that the feed_dict is passed once in the
+    # constructor but we can modify the values in the dictionary. Through
+    # this feed_dict we can provide additional substitutions besides Keras
+    # inputs.
+    with self.test_session():
+      x = keras.backend.variable(0.)
+      y = keras.backend.variable(0.)
+      x_placeholder = keras.backend.placeholder(shape=())
+      y_placeholder = keras.backend.placeholder(shape=())
+
+      feed_dict = {y_placeholder: 3.}
+      fetches = [keras.backend.update(y, y_placeholder * 10.)]
+      f = keras.backend.function(inputs=[x_placeholder],
+                                 outputs=[x_placeholder + 1.],
+                                 updates=[(x, x_placeholder + 10.)],
+                                 feed_dict=feed_dict,
+                                 fetches=fetches)
+      output = f([10.])
+      assert output == [11.]
+      assert keras.backend.get_session().run(fetches=[x, y]) == [20., 30.]
+
+      # updated value in feed_dict will be modified within the K.function()
+      feed_dict[y_placeholder] = 4.
+      output = f([20.])
+      assert output == [21.]
+      assert keras.backend.get_session().run(fetches=[x, y]) == [30., 40.]
+
 
 class BackendVariableTest(test.TestCase):
 
@@ -853,44 +902,44 @@ class BackendNNOpsTest(test.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-
-    for (i, kwargs) in enumerate(kwargs_list):
-      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                           initial_states,
-                                                           **kwargs)
-      last_output_list[i].append(keras.backend.eval(last_output))
-      outputs_list[i].append(keras.backend.eval(outputs))
-      self.assertEqual(len(new_states), 1)
-      state_list[i].append(keras.backend.eval(new_states[0]))
-
-    def assert_list_pairwise(z_list, atol=1e-05):
-      for (z1, z2) in zip(z_list[1:], z_list[:-1]):
-        self.assertAllClose(z1, z2, atol=atol)
-
-    assert_list_pairwise(last_output_list[0], atol=1e-04)
-    assert_list_pairwise(outputs_list[0], atol=1e-04)
-    assert_list_pairwise(state_list[0], atol=1e-04)
-    assert_list_pairwise(last_output_list[2], atol=1e-04)
-    assert_list_pairwise(outputs_list[2], atol=1e-04)
-    assert_list_pairwise(state_list[2], atol=1e-04)
-
-    for l, u_l in zip(last_output_list[0], last_output_list[1]):
-      self.assertAllClose(l, u_l, atol=1e-04)
-
-    for o, u_o in zip(outputs_list[0], outputs_list[1]):
-      self.assertAllClose(o, u_o, atol=1e-04)
-
-    for s, u_s in zip(state_list[0], state_list[1]):
-      self.assertAllClose(s, u_s, atol=1e-04)
-
-    for b_l, b_u_l in zip(last_output_list[2], last_output_list[3]):
-      self.assertAllClose(b_l, b_u_l, atol=1e-04)
-
-    for b_o, b_u_o in zip(outputs_list[2], outputs_list[3]):
-      self.assertAllClose(b_o, b_u_o, atol=1e-04)
-
-    for b_s, b_u_s in zip(state_list[2], state_list[3]):
-      self.assertAllClose(b_s, b_u_s, atol=1e-04)
+    with self.test_session():
+      for (i, kwargs) in enumerate(kwargs_list):
+        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                             initial_states,
+                                                             **kwargs)
+        last_output_list[i].append(keras.backend.eval(last_output))
+        outputs_list[i].append(keras.backend.eval(outputs))
+        self.assertEqual(len(new_states), 1)
+        state_list[i].append(keras.backend.eval(new_states[0]))
+
+      def assert_list_pairwise(z_list, atol=1e-05):
+        for (z1, z2) in zip(z_list[1:], z_list[:-1]):
+          self.assertAllClose(z1, z2, atol=atol)
+
+      assert_list_pairwise(last_output_list[0], atol=1e-04)
+      assert_list_pairwise(outputs_list[0], atol=1e-04)
+      assert_list_pairwise(state_list[0], atol=1e-04)
+      assert_list_pairwise(last_output_list[2], atol=1e-04)
+      assert_list_pairwise(outputs_list[2], atol=1e-04)
+      assert_list_pairwise(state_list[2], atol=1e-04)
+
+      for l, u_l in zip(last_output_list[0], last_output_list[1]):
+        self.assertAllClose(l, u_l, atol=1e-04)
+
+      for o, u_o in zip(outputs_list[0], outputs_list[1]):
+        self.assertAllClose(o, u_o, atol=1e-04)
+
+      for s, u_s in zip(state_list[0], state_list[1]):
+        self.assertAllClose(s, u_s, atol=1e-04)
+
+      for b_l, b_u_l in zip(last_output_list[2], last_output_list[3]):
+        self.assertAllClose(b_l, b_u_l, atol=1e-04)
+
+      for b_o, b_u_o in zip(outputs_list[2], outputs_list[3]):
+        self.assertAllClose(b_o, b_u_o, atol=1e-04)
+
+      for b_s, b_u_s in zip(state_list[2], state_list[3]):
+        self.assertAllClose(b_s, b_u_s, atol=1e-04)
 
   def test_normalize_batch_in_training(self):
     val = np.random.random((10, 3, 10, 10))
diff --git a/tensorflow/python/keras/_impl/keras/callbacks.py b/tensorflow/python/keras/_impl/keras/callbacks.py
index eb678c4d1d9fe2ed9367417b9134756768d86b37..16109b52b3ad05c1f5dd46f05bef493ce15f4295 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks.py
@@ -265,7 +265,7 @@ class ProgbarLogger(Callback):
   Arguments:
       count_mode: One of "steps" or "samples".
           Whether the progress bar should
-          count samples seens or steps (batches) seen.
+          count samples seen or steps (batches) seen.
 
   Raises:
       ValueError: In case of invalid `count_mode`.
@@ -417,7 +417,7 @@ class ModelCheckpoint(Callback):
     self.epochs_since_last_save += 1
     if self.epochs_since_last_save >= self.period:
       self.epochs_since_last_save = 0
-      filepath = self.filepath.format(epoch=epoch, **logs)
+      filepath = self.filepath.format(epoch=epoch + 1, **logs)
       if self.save_best_only:
         current = logs.get(self.monitor)
         if current is None:
@@ -427,7 +427,7 @@ class ModelCheckpoint(Callback):
           if self.monitor_op(current, self.best):
             if self.verbose > 0:
               print('Epoch %05d: %s improved from %0.5f to %0.5f,'
-                    ' saving model to %s' % (epoch, self.monitor, self.best,
+                    ' saving model to %s' % (epoch + 1, self.monitor, self.best,
                                              current, filepath))
             self.best = current
             if self.save_weights_only:
@@ -436,10 +436,11 @@ class ModelCheckpoint(Callback):
               self.model.save(filepath, overwrite=True)
           else:
             if self.verbose > 0:
-              print('Epoch %05d: %s did not improve' % (epoch, self.monitor))
+              print('Epoch %05d: %s did not improve' % (epoch + 1,
+                                                        self.monitor))
       else:
         if self.verbose > 0:
-          print('Epoch %05d: saving model to %s' % (epoch, filepath))
+          print('Epoch %05d: saving model to %s' % (epoch + 1, filepath))
         if self.save_weights_only:
           self.model.save_weights(filepath, overwrite=True)
         else:
@@ -519,14 +520,14 @@ class EarlyStopping(Callback):
       self.best = current
       self.wait = 0
     else:
+      self.wait += 1
       if self.wait >= self.patience:
         self.stopped_epoch = epoch
         self.model.stop_training = True
-      self.wait += 1
 
   def on_train_end(self, logs=None):
     if self.stopped_epoch > 0 and self.verbose > 0:
-      print('Epoch %05d: early stopping' % (self.stopped_epoch))
+      print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
 
 
 class RemoteMonitor(Callback):
@@ -767,7 +768,7 @@ class TensorBoard(Callback):
       self.writer.add_summary(summary, epoch)
     self.writer.flush()
 
-  def on_train_end(self, _):
+  def on_train_end(self, logs=None):
     self.writer.close()
 
 
diff --git a/tensorflow/python/keras/_impl/keras/callbacks_test.py b/tensorflow/python/keras/_impl/keras/callbacks_test.py
index d9d7fb5a9fb767a93019217ba16321c72f2a47ad..79dfcd1bb669db09de0cbaa103914efaaf19c6fb 100644
--- a/tensorflow/python/keras/_impl/keras/callbacks_test.py
+++ b/tensorflow/python/keras/_impl/keras/callbacks_test.py
@@ -19,16 +19,18 @@ from __future__ import division
 from __future__ import print_function
 
 import csv
-import multiprocessing
 import os
 import re
 import shutil
+import threading
+import unittest
 
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -203,12 +205,12 @@ class KerasCallbacksTest(test.TestCase):
           callbacks=cbks,
           epochs=4,
           verbose=1)
-      assert os.path.exists(filepath.format(epoch=1))
-      assert os.path.exists(filepath.format(epoch=3))
-      os.remove(filepath.format(epoch=1))
-      os.remove(filepath.format(epoch=3))
-      assert not os.path.exists(filepath.format(epoch=0))
-      assert not os.path.exists(filepath.format(epoch=2))
+      assert os.path.exists(filepath.format(epoch=2))
+      assert os.path.exists(filepath.format(epoch=4))
+      os.remove(filepath.format(epoch=2))
+      os.remove(filepath.format(epoch=4))
+      assert not os.path.exists(filepath.format(epoch=1))
+      assert not os.path.exists(filepath.format(epoch=3))
 
       # Invalid use: this will raise a warning but not an Exception.
       keras.callbacks.ModelCheckpoint(
@@ -273,12 +275,12 @@ class KerasCallbacksTest(test.TestCase):
       stopper = keras.callbacks.EarlyStopping(monitor='acc', patience=patience)
       weights = model.get_weights()
 
-      hist = model.fit(data, labels, callbacks=[stopper], verbose=0)
+      hist = model.fit(data, labels, callbacks=[stopper], verbose=0, epochs=20)
       assert len(hist.epoch) >= patience
 
       # This should allow training to go for at least `patience` epochs
       model.set_weights(weights)
-      hist = model.fit(data, labels, callbacks=[stopper], verbose=0)
+      hist = model.fit(data, labels, callbacks=[stopper], verbose=0, epochs=20)
     assert len(hist.epoch) >= patience
 
   def test_RemoteMonitor(self):
@@ -498,7 +500,10 @@ class KerasCallbacksTest(test.TestCase):
       values = []
       with open(fp) as f:
         for x in csv.reader(f):
-          values.append(x)
+          # In windows, due to \r\n line ends we may end up reading empty lines
+          # after each line. Skip empty lines.
+          if x:
+            values.append(x)
       assert 'nan' in values[-1], 'The last epoch was not logged.'
 
   def test_TerminateOnNaN(self):
@@ -571,7 +576,6 @@ class KerasCallbacksTest(test.TestCase):
           loss='categorical_crossentropy',
           optimizer='sgd',
           metrics=['accuracy'])
-
       tsb = keras.callbacks.TensorBoard(
           log_dir=temp_dir, histogram_freq=1, write_images=True,
           write_grads=True, batch_size=5)
@@ -679,23 +683,41 @@ class KerasCallbacksTest(test.TestCase):
             batch_size=5)]
 
       # fit w/o validation data should raise ValueError if histogram_freq > 0
+      cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit(x_train, y_train, batch_size=BATCH_SIZE,
-                  callbacks=callbacks_factory(histogram_freq=1), epochs=3)
+        model.fit(
+            x_train, y_train, batch_size=BATCH_SIZE, callbacks=cbs, epochs=3)
+
+      for cb in cbs:
+        cb.on_train_end()
 
       # fit generator without validation data should raise ValueError if
       # histogram_freq > 0
+      cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                            callbacks=callbacks_factory(histogram_freq=1))
+        model.fit_generator(
+            data_generator(True), len(x_train), epochs=2, callbacks=cbs)
+
+      for cb in cbs:
+        cb.on_train_end()
 
       # fit generator with validation data generator should raise ValueError if
       # histogram_freq > 0
+      cbs = callbacks_factory(histogram_freq=1)
       with self.assertRaises(ValueError):
-        model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                            validation_data=data_generator(False),
-                            validation_steps=1,
-                            callbacks=callbacks_factory(histogram_freq=1))
+        model.fit_generator(
+            data_generator(True),
+            len(x_train),
+            epochs=2,
+            validation_data=data_generator(False),
+            validation_steps=1,
+            callbacks=cbs)
+
+      for cb in cbs:
+        cb.on_train_end()
+
+      # Make sure file writer cache is clear to avoid failures during cleanup.
+      writer_cache.FileWriterCache.clear()
 
   def test_TensorBoard_multi_input_output(self):
     np.random.seed(1337)
@@ -768,6 +790,9 @@ class KerasCallbacksTest(test.TestCase):
                           callbacks=callbacks_factory(histogram_freq=1))
       assert os.path.isdir(filepath)
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_LambdaCallback(self):
     with self.test_session():
       np.random.seed(1337)
@@ -790,14 +815,15 @@ class KerasCallbacksTest(test.TestCase):
 
       # Start an arbitrary process that should run during model
       # training and be terminated after training has completed.
+      e = threading.Event()
+
       def target():
-        while True:
-          pass
+        e.wait()
 
-      p = multiprocessing.Process(target=target)
-      p.start()
+      t = threading.Thread(target=target)
+      t.start()
       cleanup_callback = keras.callbacks.LambdaCallback(
-          on_train_end=lambda logs: p.terminate())
+          on_train_end=lambda logs: e.set())
 
       cbks = [cleanup_callback]
       model.fit(
@@ -808,8 +834,8 @@ class KerasCallbacksTest(test.TestCase):
           callbacks=cbks,
           epochs=5,
           verbose=0)
-      p.join()
-      assert not p.is_alive()
+      t.join()
+      assert not t.is_alive()
 
   def test_TensorBoard_with_ReduceLROnPlateau(self):
     with self.test_session():
diff --git a/tensorflow/python/keras/_impl/keras/datasets/__init__.py b/tensorflow/python/keras/_impl/keras/datasets/__init__.py
index 22afb6a55343ce1cba66785ebc792434060eda02..60db3766fbce859269cecb92a537084ef18c0da5 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """Keras datasets: utilities for downloading and pre-processing common datasets.
+
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -21,7 +22,7 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras.datasets import boston_housing
 from tensorflow.python.keras._impl.keras.datasets import cifar10
 from tensorflow.python.keras._impl.keras.datasets import cifar100
+from tensorflow.python.keras._impl.keras.datasets import fashion_mnist
 from tensorflow.python.keras._impl.keras.datasets import imdb
 from tensorflow.python.keras._impl.keras.datasets import mnist
 from tensorflow.python.keras._impl.keras.datasets import reuters
-
diff --git a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
index e4f7fb9d2128d305ee7e26777c7627725001cf92..4359be89280f7ffa3479af38cd66ebd3aaf6c30e 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/boston_housing.py
@@ -48,9 +48,10 @@ def load_data(path='boston_housing.npz', seed=113, test_split=0.2):
   f.close()
 
   np.random.seed(seed)
-  np.random.shuffle(x)
-  np.random.seed(seed)
-  np.random.shuffle(y)
+  indices = np.arrange(len(x))
+  np.random.shuffle(indices)
+  x = x[indices]
+  y = y[indices]
 
   x_train = np.array(x[:int(len(x) * (1 - test_split))])
   y_train = np.array(y[:int(len(x) * (1 - test_split))])
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
index 672249ff20f37e701e276ab3c2489de4630867be..7905da66c1e619153c75d7e05cad748710d63849 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar10.py
@@ -34,19 +34,18 @@ def load_data():
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   dirname = 'cifar-10-batches-py'
-  origin = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
+  origin = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
   path = get_file(dirname, origin=origin, untar=True)
 
   num_train_samples = 50000
 
-  x_train = np.zeros((num_train_samples, 3, 32, 32), dtype='uint8')
-  y_train = np.zeros((num_train_samples,), dtype='uint8')
+  x_train = np.empty((num_train_samples, 3, 32, 32), dtype='uint8')
+  y_train = np.empty((num_train_samples,), dtype='uint8')
 
   for i in range(1, 6):
     fpath = os.path.join(path, 'data_batch_' + str(i))
-    data, labels = load_batch(fpath)
-    x_train[(i - 1) * 10000:i * 10000, :, :, :] = data
-    y_train[(i - 1) * 10000:i * 10000] = labels
+    (x_train[(i - 1) * 10000:i * 10000, :, :, :],
+     y_train[(i - 1) * 10000:i * 10000]) = load_batch(fpath)
 
   fpath = os.path.join(path, 'test_batch')
   x_test, y_test = load_batch(fpath)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
index 1be7483d27332cb89fbc02e2f4a502de7200e828..b69c0724c58d6d60a291c69db3de926605d90954 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/cifar100.py
@@ -43,7 +43,7 @@ def load_data(label_mode='fine'):
     raise ValueError('label_mode must be one of "fine" "coarse".')
 
   dirname = 'cifar-100-python'
-  origin = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
+  origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
   path = get_file(dirname, origin=origin, untar=True)
 
   fpath = os.path.join(path, 'train')
diff --git a/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..17be684e4f8bdb800c6b0883649da25f18fa0402
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/datasets/fashion_mnist.py
@@ -0,0 +1,59 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fashion-MNIST dataset.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import numpy as np
+from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+
+
+def load_data():
+  """Loads the Fashion-MNIST dataset.
+
+  Returns:
+      Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+  """
+  dirname = os.path.join('datasets', 'fashion-mnist')
+  base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
+  files = [
+      'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
+      't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
+  ]
+
+  paths = []
+  for given_file in files:
+    paths.append(
+        get_file(given_file, origin=base + given_file, cache_subdir=dirname))
+
+  with gzip.open(paths[0], 'rb') as lbpath:
+    y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)
+
+  with gzip.open(paths[1], 'rb') as imgpath:
+    x_train = np.frombuffer(
+        imgpath.read(), np.uint8, offset=16).reshape(len(y_train), 28, 28)
+
+  with gzip.open(paths[2], 'rb') as lbpath:
+    y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)
+
+  with gzip.open(paths[3], 'rb') as imgpath:
+    x_test = np.frombuffer(
+        imgpath.read(), np.uint8, offset=16).reshape(len(y_test), 28, 28)
+
+  return (x_train, y_train), (x_test, y_test)
diff --git a/tensorflow/python/keras/_impl/keras/datasets/imdb.py b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
index 0db9d61f6d58448fb33851623991a0587d1db84e..0e83473899c303e3ad96d253cf31a1def476fa52 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/imdb.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -65,23 +65,24 @@ def load_data(path='imdb.npz',
   have simply been skipped.
   """
   path = get_file(
-      path, origin='https://s3.amazonaws.com/text-datasets/imdb.npz')
+      path,
+      origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
+      file_hash='599dadb1135973df5b59232a0e9a887c')
   f = np.load(path)
-  x_train = f['x_train']
-  labels_train = f['y_train']
-  x_test = f['x_test']
-  labels_test = f['y_test']
+  x_train, labels_train = f['x_train'], f['y_train']
+  x_test, labels_test = f['x_test'], f['y_test']
   f.close()
 
   np.random.seed(seed)
-  np.random.shuffle(x_train)
-  np.random.seed(seed)
-  np.random.shuffle(labels_train)
-
-  np.random.seed(seed * 2)
-  np.random.shuffle(x_test)
-  np.random.seed(seed * 2)
-  np.random.shuffle(labels_test)
+  indices = np.arrange(len(x_train))
+  np.random.shuffle(indices)
+  x_train = x_train[indices]
+  labels_train = labels_train[indices]
+
+  indices = np.arrange(len(x_test))
+  np.random.shuffle(indices)
+  x_test = x_test[indices]
+  labels_test = labels_test[indices]
 
   xs = np.concatenate([x_train, x_test])
   labels = np.concatenate([labels_train, labels_test])
diff --git a/tensorflow/python/keras/_impl/keras/datasets/mnist.py b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
index 02be5e2a407be89d93f3c20f6a01c476a35697bf..e98f29537f4e29c649d0a1879e75505b050d6639 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/mnist.py
@@ -34,7 +34,9 @@ def load_data(path='mnist.npz'):
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   path = get_file(
-      path, origin='https://s3.amazonaws.com/img-datasets/mnist.npz')
+      path,
+      origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
+      file_hash='8a61469f7ea1b51cbae51d4f78837e45')
   f = np.load(path)
   x_train = f['x_train']
   y_train = f['y_train']
diff --git a/tensorflow/python/keras/_impl/keras/datasets/reuters.py b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
index c36bac5cc7df157b8bbb1416ca3715a041586e27..d05eb0ef8caed93963b0059a023a06172d4e9ddb 100644
--- a/tensorflow/python/keras/_impl/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/_impl/keras/datasets/reuters.py
@@ -64,15 +64,20 @@ def load_data(path='reuters.npz',
   have simply been skipped.
   """
   path = get_file(
-      path, origin='https://s3.amazonaws.com/text-datasets/reuters.npz')
+      path,
+      origin='https://s3.amazonaws.com/text-datasets/reuters.npz',
+      file_hash='87aedbeb0cb229e378797a632c1997b6')
   npzfile = np.load(path)
   xs = npzfile['x']
   labels = npzfile['y']
   npzfile.close()
 
   np.random.seed(seed)
-  np.random.shuffle(xs)
-  np.random.seed(seed)
+  indices = np.arrange(len(xs))
+  np.random.shuffle(indices)
+  xs = xs[indices]
+  labels = labels[indices]
+
   np.random.shuffle(labels)
 
   if start_char is not None:
@@ -129,7 +134,8 @@ def get_word_index(path='reuters_word_index.json'):
   """
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/reuters_word_index.json')
+      origin='https://s3.amazonaws.com/text-datasets/reuters_word_index.json',
+      file_hash='4d44cc38712099c9e383dc6e5f11a921')
   f = open(path)
   data = json.load(f)
   f.close()
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology.py b/tensorflow/python/keras/_impl/keras/engine/topology.py
index f9be782f85e0d22df545bd252526fcfd47a72016..4a7bb2e83894f06c433964409ccb2bd3ebfed128 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Base layer code and base model (Container) code.
+"""Base layer code and base model (Network) code.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,10 +29,15 @@ from six.moves import zip  # pylint: disable=redefined-builtin
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import constraints
+from tensorflow.python.keras._impl.keras import initializers
+from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
 from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.layers import network as tf_network
+from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -209,9 +214,9 @@ class Layer(tf_base_layers.Layer):
       dtype = K.floatx()
     weight = self.add_variable(name, shape,
                                dtype=dtype,
-                               initializer=initializer,
-                               regularizer=regularizer,
-                               constraint=constraint,
+                               initializer=initializers.get(initializer),
+                               regularizer=regularizers.get(regularizer),
+                               constraint=constraints.get(constraint),
                                trainable=trainable)
     return weight
 
@@ -447,7 +452,7 @@ class Layer(tf_base_layers.Layer):
 
     The config of a layer does not include connectivity
     information, nor the layer class name. These are handled
-    by `Container` (one layer of abstraction above).
+    by `Network` (one layer of abstraction above).
 
     Returns:
         Python dictionary.
@@ -466,7 +471,7 @@ class Layer(tf_base_layers.Layer):
     This method is the reverse of `get_config`,
     capable of instantiating the same layer from the config
     dictionary. It does not handle layer connectivity
-    (handled by Container), nor weights (handled by `set_weights`).
+    (handled by Network), nor weights (handled by `set_weights`).
 
     Arguments:
         config: A Python dictionary, typically the
@@ -482,7 +487,7 @@ class Layer(tf_base_layers.Layer):
     self._activity_regularizer = activity_regularizer
 
 
-class InputLayer(tf_base_layers.InputLayer, Layer):
+class InputLayer(tf_network.InputLayer, Layer):
   """Layer to be used as an entry point into a graph.
 
   It can either wrap an existing tensor (pass an `input_tensor` argument)
@@ -633,11 +638,11 @@ def Input(  # pylint: disable=invalid-name
     return outputs
 
 
-class Network(tf_base_layers.Network, Layer):
-  """A Container is a directed acyclic graph of layers.
+class Network(tf_network.GraphNetwork, Layer):
+  """A Network is a directed acyclic graph of layers.
 
   It is the topological form of a "model". A Model
-  is simply a Container with added training routines.
+  is simply a Network with added training routines.
 
   # Properties
       name
@@ -678,8 +683,8 @@ class Network(tf_base_layers.Network, Layer):
     for x in self.inputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None
       masks.append(mask)
-    mask_cache_key = (tf_base_layers._object_list_uid(self.inputs) + '_' +
-                      tf_base_layers._object_list_uid(masks))
+    mask_cache_key = (tf_layers_util.object_list_uid(self.inputs) + '_' +
+                      tf_layers_util.object_list_uid(masks))
     masks = []
     for x in self.outputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None
@@ -789,14 +794,14 @@ class Network(tf_base_layers.Network, Layer):
     node_conversion_map = {}
     for layer in self.layers:
       if issubclass(layer.__class__, Network):
-        # Containers start with a pre-existing node
+        # Networks start with a pre-existing node
         # linking their input to output.
         kept_nodes = 1
       else:
         kept_nodes = 0
       for original_node_index, node in enumerate(layer._inbound_nodes):
-        node_key = tf_base_layers._make_node_key(layer.name,
-                                                 original_node_index)
+        node_key = tf_network._make_node_key(layer.name,
+                                             original_node_index)
         if node_key in self._network_nodes:
           node_conversion_map[node_key] = kept_nodes
           kept_nodes += 1
@@ -806,8 +811,8 @@ class Network(tf_base_layers.Network, Layer):
       layer_config = layer.get_config()
       filtered_inbound_nodes = []
       for original_node_index, node in enumerate(layer._inbound_nodes):
-        node_key = tf_base_layers._make_node_key(layer.name,
-                                                 original_node_index)
+        node_key = tf_network._make_node_key(layer.name,
+                                             original_node_index)
         if node_key in self._network_nodes:
           # The node is relevant to the model:
           # add to filtered_inbound_nodes.
@@ -831,8 +836,8 @@ class Network(tf_base_layers.Network, Layer):
               inbound_layer = node.inbound_layers[i]
               node_index = node.node_indices[i]
               tensor_index = node.tensor_indices[i]
-              node_key = tf_base_layers._make_node_key(inbound_layer.name,
-                                                       node_index)
+              node_key = tf_network._make_node_key(inbound_layer.name,
+                                                   node_index)
               new_node_index = node_conversion_map.get(node_key, 0)
               node_data.append(
                   [inbound_layer.name, new_node_index, tensor_index, kwargs])
@@ -849,8 +854,8 @@ class Network(tf_base_layers.Network, Layer):
     model_inputs = []
     for i in range(len(self._input_layers)):
       layer, node_index, tensor_index = self._input_coordinates[i]
-      node_key = tf_base_layers._make_node_key(layer.name,
-                                               node_index)
+      node_key = tf_network._make_node_key(layer.name,
+                                           node_index)
       if node_key not in self._network_nodes:
         continue
       new_node_index = node_conversion_map[node_key]
@@ -859,8 +864,8 @@ class Network(tf_base_layers.Network, Layer):
     model_outputs = []
     for i in range(len(self._output_layers)):
       layer, node_index, tensor_index = self._output_coordinates[i]
-      node_key = tf_base_layers._make_node_key(layer.name,
-                                               node_index)
+      node_key = tf_network._make_node_key(layer.name,
+                                           node_index)
       if node_key not in self._network_nodes:
         continue
       new_node_index = node_conversion_map[node_key]
@@ -1194,10 +1199,6 @@ class Network(tf_base_layers.Network, Layer):
                         print_fn=print_fn)
 
 
-# Alias for legacy support.
-Container = Network
-
-
 def get_source_inputs(tensor, layer=None, node_index=None):
   """Returns the list of input tensors necessary to compute `tensor`.
 
@@ -1423,6 +1424,31 @@ def preprocess_weights_for_loading(layer,
       weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
       if layer.__class__.__name__ == 'ConvLSTM2D':
         weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
+
+  # convert the weights of CuDNNLSTM so that they could be loaded into LSTM
+  if layer.__class__.__name__ == 'LSTM':
+    # determine if we're loading a CuDNNLSTM layer from the number of bias
+    # weights:
+    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
+    units = weights[1].shape[0]
+    bias = weights[2]
+    if len(bias) == units * 8:
+      # reshape the kernels
+      kernels = np.split(weights[0], 4, axis=1)
+      kernels = [
+          kernel.reshape(-1).reshape(kernel.shape, order='F')
+          for kernel in kernels
+      ]
+      weights[0] = np.concatenate(kernels, axis=1)
+
+      # transpose the recurrent kernels
+      recurrent_kernels = np.split(weights[1], 4, axis=1)
+      recurrent_kernels = [kernel.T for kernel in recurrent_kernels]
+      weights[1] = np.concatenate(recurrent_kernels, axis=1)
+
+      # split the bias into half and merge
+      weights[2] = bias[:units * 4] + bias[units * 4:]
+
   return weights
 
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 0b04c17ad7007602e5c1d3b7241953952ad63aaf..b4205bf4a397690ce6dd3424e0dd4076d9860e9d 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras import optimizers
-from tensorflow.python.keras._impl.keras.engine.topology import Container
+from tensorflow.python.keras._impl.keras.engine.topology import Network
 from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
@@ -71,6 +71,9 @@ def _standardize_input_data(data,
   if data is None:
     return [None for _ in range(len(names))]
   if isinstance(data, dict):
+    for key, value in data.items():
+      if value.__class__.__name__ == 'DataFrame':
+        data[key] = value.values
     arrays = []
     for name in names:
       if name not in data:
@@ -78,6 +81,9 @@ def _standardize_input_data(data,
                          '". Need data for each key in: ' + str(names))
       arrays.append(data[name])
   elif isinstance(data, list):
+    for key, value in enumerate(data):
+      if value.__class__.__name__ == 'DataFrame':
+        data[key] = value.values
     if len(data) != len(names):
       if data and hasattr(data[0], 'shape'):
         raise ValueError(
@@ -100,6 +106,9 @@ def _standardize_input_data(data,
                            ' Numpy arrays instead. '
                            'The list you passed was: ' + str(data)[:200])
     arrays = data
+  elif data.__class__.__name__ == 'DataFrame':
+    # test if data is a DataFrame, without pandas installed
+    arrays = data.values
   else:
     if not hasattr(data, 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
@@ -262,12 +271,13 @@ def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
           is incompatible with an output.
   """
   key_losses = {
-      'mean_squared_error', 'binary_crossentropy', 'categorical_crossentropy'
+      losses.mean_squared_error, losses.binary_crossentropy,
+      losses.categorical_crossentropy
   }
   for y, loss, shape in zip(targets, loss_fns, output_shapes):
     if loss is None:
       continue
-    if loss.__name__ == 'categorical_crossentropy':
+    if loss is losses.categorical_crossentropy:
       if y.shape[-1] == 1:
         raise ValueError('You are passing a target array of shape ' + str(
             y.shape) + ' while using as loss `categorical_crossentropy`. '
@@ -277,14 +287,14 @@ def _check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
                          'If your targets are integer classes, '
                          'you can convert them to the expected format via:\n'
                          '```\n'
-                         'from keras.utils.np_utils import to_categorical\n'
+                         'from keras.utils import to_categorical\n'
                          'y_binary = to_categorical(y_int)\n'
                          '```\n'
                          '\n'
                          'Alternatively, you can use the loss function '
                          '`sparse_categorical_crossentropy` instead, '
                          'which does expect integer targets.')
-    if loss.__name__ in key_losses:
+    if loss in key_losses:
       for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
         if out_dim is not None and target_dim != out_dim:
           raise ValueError('A target array with shape ' + str(y.shape) +
@@ -367,7 +377,7 @@ def _make_batches(size, batch_size):
   """
   num_batches = int(np.ceil(size / float(batch_size)))
   return [(i * batch_size, min(size, (i + 1) * batch_size))
-          for i in range(0, num_batches)]
+          for i in range(num_batches)]
 
 
 def _slice_arrays(arrays, start=None, stop=None):
@@ -559,8 +569,8 @@ def _standardize_weights(y,
       return np.ones((y.shape[0], y.shape[1]), dtype=K.floatx())
 
 
-class Model(Container):
-  """The `Model` class adds training & evaluation routines to a `Container`.
+class Model(Network):
+  """The `Model` class adds training & evaluation routines to a `Network`.
   """
 
   def compile(self,
@@ -575,7 +585,7 @@ class Model(Container):
     """Configures the model for training.
 
     Arguments:
-        optimizer: String (name of optimizer) or optimizer object.
+        optimizer: String (name of optimizer) or optimizer instance.
             See [optimizers](/optimizers).
         loss: String (name of objective function) or objective function.
             See [losses](/losses).
@@ -614,9 +624,7 @@ class Model(Container):
             can specify them via the `target_tensors` argument. It can be
             a single tensor (for a single-output model), a list of tensors,
             or a dict mapping output names to target tensors.
-        **kwargs: When using the Theano/CNTK backends, these arguments
-            are passed into K.function. When using the TensorFlow backend,
-            these arguments are passed into `tf.Session.run`.
+        **kwargs: These arguments are passed to `tf.Session.run`.
 
     Raises:
         ValueError: In case of invalid arguments for
@@ -627,6 +635,7 @@ class Model(Container):
     self.sample_weight_mode = sample_weight_mode
     self.loss = loss
     self.loss_weights = loss_weights
+    self.sample_weight_mode = sample_weight_mode
 
     # Prepare loss functions.
     if isinstance(loss, dict):
@@ -936,9 +945,28 @@ class Model(Container):
     trainable_weights = self.trainable_weights
     self._collected_trainable_weights = trainable_weights
 
+  def _check_trainable_weights_consistency(self):
+    """Check trainable weights count consistency.
+
+    This will raise a warning if `trainable_weights` and
+    `_collected_trainable_weights` are consistent (i.e. have the same
+    number of parameters).
+    Inconsistency will typically arise when one modifies `model.trainable`
+    without calling `model.compile` again.
+    """
+    if not hasattr(self, '_collected_trainable_weights'):
+      return
+
+    if len(self.trainable_weights) != len(self._collected_trainable_weights):
+      logging.warning(
+          'Discrepancy between trainable weights and collected trainable'
+          ' weights, did you set `model.trainable` without calling'
+          ' `model.compile` after ?')
+
   def _make_train_function(self):
     if not hasattr(self, 'train_function'):
       raise RuntimeError('You must compile your model before using it.')
+    self._check_trainable_weights_consistency()
     if self.train_function is None:
       inputs = (self._feed_inputs +
                 self._feed_targets +
@@ -1258,7 +1286,7 @@ class Model(Container):
         for i, batch_out in enumerate(batch_outs):
           unconcatenated_outs[i].append(batch_out)
         if verbose == 1:
-          progbar.update(step)
+          progbar.update(step + 1)
       if len(unconcatenated_outs) == 1:
         return np.concatenate(unconcatenated_outs[0], axis=0)
       return [
@@ -1313,9 +1341,13 @@ class Model(Container):
     """
     num_samples = self._check_num_samples(ins, batch_size, steps, 'steps')
     outs = []
-    if steps is not None:
-      if verbose == 1:
+
+    if verbose == 1:
+      if steps is not None:
         progbar = Progbar(target=steps)
+      else:
+        progbar = Progbar(target=num_samples)
+    if steps is not None:
       for step in range(steps):
         batch_outs = f(ins)
         if isinstance(batch_outs, list):
@@ -1329,7 +1361,7 @@ class Model(Container):
             outs.append(0.)
           outs[0] += batch_outs
         if verbose == 1:
-          progbar.update(step)
+          progbar.update(step + 1)
       for i in range(len(outs)):
         outs[i] /= steps
     else:
@@ -1380,10 +1412,8 @@ class Model(Container):
     output_shapes = []
     for output_shape, loss_fn in zip(self._feed_output_shapes,
                                      self._feed_loss_fns):
-      if loss_fn.__name__ == 'sparse_categorical_crossentropy':
+      if loss_fn is losses.sparse_categorical_crossentropy:
         output_shapes.append(output_shape[:-1] + (1,))
-      elif getattr(losses, loss_fn.__name__, None) is None:
-        output_shapes.append(None)
       else:
         output_shapes.append(output_shape)
     x = _standardize_input_data(
@@ -1451,58 +1481,76 @@ class Model(Container):
     """Trains the model for a fixed number of epochs (iterations on a dataset).
 
     Arguments:
-        x: Numpy array of training data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
+        x: Numpy array of training data (if the model has a single input),
+            or list of Numpy arrays (if the model has multiple inputs).
+            If input layers in the model are named, you can also pass a
+            dictionary mapping input names to Numpy arrays.
+            `x` can be `None` (default) if feeding from
+            TensorFlow data tensors.
+        y: Numpy array of target (label) data
+            (if the model has a single output),
+            or list of Numpy arrays (if the model has multiple outputs).
+            If output layers in the model are named, you can also pass a
+            dictionary mapping output names to Numpy arrays.
+            `y` can be `None` (default) if feeding from
+            TensorFlow data tensors.
+            Can be `None` (default) if feeding from framework-native tensors.
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, it will default to 32.
-        epochs: Integer, the number of times to iterate
-            over the training data arrays.
+        epochs: Integer. Number of epochs to train the model.
+            An epoch is an iteration over the entire `x` and `y`
+            data provided.
+            Note that in conjunction with `initial_epoch`,
+            `epochs` is to be understood as "final epoch".
+            The model is not trained for a number of iterations
+            given by `epochs`, but merely until the epoch
+            of index `epochs` is reached.
         verbose: 0, 1, or 2. Verbosity mode.
-            0 = silent, 1 = verbose, 2 = one log line per epoch.
-        callbacks: List of callbacks to be called during training.
+            0 = silent, 1 = progress bar, 2 = one line per epoch.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during training.
             See [callbacks](/callbacks).
-        validation_split: Float between 0 and 1:
-            fraction of the training data to be used as validation data.
+        validation_split: Float between 0 and 1.
+            Fraction of the training data to be used as validation data.
             The model will set apart this fraction of the training data,
             will not train on it, and will evaluate
             the loss and any model metrics
             on this data at the end of each epoch.
-        validation_data: Data on which to evaluate
-            the loss and any model metrics
-            at the end of each epoch. The model will not
-            be trained on this data.
-            This could be a tuple (x_val, y_val)
-            or a tuple (x_val, y_val, val_sample_weights).
-        shuffle: Boolean, whether to shuffle the training data
-            before each epoch. Has no effect when `steps_per_epoch`
-            is not `None`.
-        class_weight: Optional dictionary mapping
-            class indices (integers) to
-            a weight (float) to apply to the model's loss for the samples
-            from this class during training.
-            This can be useful to tell the model to "pay more attention" to
-            samples from an under-represented class.
-        sample_weight: Optional array of the same length as x, containing
-            weights to apply to the model's loss for each sample.
-            In the case of temporal data, you can pass a 2D array
-            with shape (samples, sequence_length),
+            The validation data is selected from the last samples
+            in the `x` and `y` data provided, before shuffling.
+        validation_data: tuple `(x_val, y_val)` or tuple
+            `(x_val, y_val, val_sample_weights)` on which to evaluate
+            the loss and any model metrics at the end of each epoch.
+            The model will not be trained on this data.
+            This will override `validation_split`.
+        shuffle: Boolean (whether to shuffle the training data
+            before each epoch) or str (for 'batch').
+            'batch' is a special option for dealing with the
+            limitations of HDF5 data; it shuffles in batch-sized chunks.
+            Has no effect when `steps_per_epoch` is not `None`.
+        class_weight: Optional dictionary mapping class indices (integers)
+            to a weight (float) value, used for weighting the loss function
+            (during training only).
+            This can be useful to tell the model to
+            "pay more attention" to samples from
+            an under-represented class.
+        sample_weight: Optional Numpy array of weights for
+            the training samples, used for weighting the loss function
+            (during training only). You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile().
+            `sample_weight_mode="temporal"` in `compile()`.
         initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run)
+            (useful for resuming a previous training run).
         steps_per_epoch: Total number of steps (batches of samples)
             before declaring one epoch finished and starting the
-            next epoch. When training with Input Tensors such as
+            next epoch. When training with input tensors such as
             TensorFlow data tensors, the default `None` is equal to
             the number of unique samples in your dataset divided by
             the batch size, or 1 if that cannot be determined.
@@ -1511,8 +1559,10 @@ class Model(Container):
             to validate before stopping.
 
     Returns:
-        A `History` instance. Its `history` attribute contains
-        all information collected during training.
+        A `History` object. Its `History.history` attribute is
+        a record of training loss values and metrics values
+        at successive epochs, as well as validation loss values
+        and validation metrics values (if applicable).
 
     Raises:
         ValueError: In case of mismatch between the provided input data
@@ -1621,8 +1671,8 @@ class Model(Container):
         validation_steps=validation_steps)
 
   def evaluate(self,
-               x,
-               y,
+               x=None,
+               y=None,
                batch_size=None,
                verbose=1,
                sample_weight=None,
@@ -1632,23 +1682,40 @@ class Model(Container):
     Computation is done in batches.
 
     Arguments:
-        x: Numpy array of test data,
-            or list of Numpy arrays if the model has multiple inputs.
-            If all inputs in the model are named,
-            you can also pass a dictionary
-            mapping input names to Numpy arrays.
-        y: Numpy array of target data,
-            or list of Numpy arrays if the model has multiple outputs.
-            If all outputs in the model are named,
-            you can also pass a dictionary
-            mapping output names to Numpy arrays.
-        batch_size: Integer. If unspecified, it will default to 32.
-        verbose: Verbosity mode, 0 or 1.
-        sample_weight: Array of weights to weight the contribution
-            of different samples to the loss and metrics.
-        steps: Total number of steps (batches of samples)
+        x: Numpy array of test data (if the model has a single input),
+            or list of Numpy arrays (if the model has multiple inputs).
+            If input layers in the model are named, you can also pass a
+            dictionary mapping input names to Numpy arrays.
+            `x` can be `None` (default) if feeding from
+            framework-native tensors (e.g. TensorFlow data tensors).
+        y: Numpy array of target (label) data
+            (if the model has a single output),
+            or list of Numpy arrays (if the model has multiple outputs).
+            If output layers in the model are named, you can also pass a
+            dictionary mapping output names to Numpy arrays.
+            `y` can be `None` (default) if feeding from
+            framework-native tensors (e.g. TensorFlow data tensors).
+        batch_size: Integer or `None`.
+            Number of samples per evaluation step.
+            If unspecified, `batch_size` will default to 32.
+        verbose: 0 or 1. Verbosity mode.
+            0 = silent, 1 = progress bar.
+        sample_weight: Optional Numpy array of weights for
+            the test samples, used for weighting the loss function.
+            You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            `sample_weight_mode="temporal"` in `compile()`.
+        steps: Integer or `None`.
+            Total number of steps (batches of samples)
             before declaring the evaluation round finished.
-            Ignored with the default value of `None`.
+            The default `None` is equal to the number of unique samples in
+            your dataset divided by the batch size.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -1657,7 +1724,7 @@ class Model(Container):
         the display labels for the scalar outputs.
 
     Raises:
-      ValueError: In case of invalid argument values.
+      ValueError: In case of invalid arguments.
     """
     # Backwards compatibility.
     if batch_size is None and steps is None:
@@ -1877,8 +1944,7 @@ class Model(Container):
 
     Arguments:
         generator: A generator or an instance of Sequence (keras.utils.Sequence)
-                object in order to avoid duplicate data
-                when using multiprocessing.
+            object in order to avoid duplicate data when using multiprocessing.
             The output of the generator must be either
             - a tuple (inputs, targets)
             - a tuple (inputs, targets, sample_weights).
@@ -1889,8 +1955,8 @@ class Model(Container):
         steps_per_epoch: Total number of steps (batches of samples)
             to yield from `generator` before declaring one epoch
             finished and starting the next epoch. It should typically
-            be equal to the number of unique samples if your dataset
-            divided by the batch size.
+            be equal to the number of unique samples of your dataset
+            divided by the batch size. Not used if using `Sequence`.
         epochs: Integer, total number of iterations on the data.
         verbose: Verbosity mode, 0, 1, or 2.
         callbacks: List of callbacks to be called during training.
@@ -1905,7 +1971,7 @@ class Model(Container):
             for the class.
         max_queue_size: Maximum size for the generator queue
         workers: Maximum number of processes to spin up
-            when using process based threading
+            when using process-based threading.
         use_multiprocessing: If True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -1914,8 +1980,8 @@ class Model(Container):
             as they can't be passed
             easily to children processes.
         shuffle: Whether to shuffle the data at the beginning of each
-            epoch. Only used with instances of `Sequence` (
-            keras.utils.Sequence).
+            epoch. Only used with instances of `Sequence`
+            (`keras.utils.Sequence`).
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
         **kwargs: support for legacy arguments.
@@ -1944,7 +2010,7 @@ class Model(Container):
         ValueError: In case the generator yields
             data in an invalid format.
     """
-     # Legacy support
+    # Legacy support
     if 'max_q_size' in kwargs:
       max_queue_size = kwargs.pop('max_q_size')
       logging.warning('The argument `max_q_size` has been renamed '
@@ -2025,6 +2091,8 @@ class Model(Container):
                           ' and multiple workers may duplicate your data.'
                           ' Please consider using the`keras.utils.Sequence'
                           ' class.'))
+    if is_sequence:
+      steps_per_epoch = len(generator)
     enqueuer = None
 
     try:
@@ -2142,13 +2210,14 @@ class Model(Container):
         generator: Generator yielding tuples (inputs, targets)
             or (inputs, targets, sample_weights)
             or an instance of Sequence (keras.utils.Sequence)
-                object in order to avoid duplicate data
-                when using multiprocessing.
+            object in order to avoid duplicate data
+            when using multiprocessing.
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
+            Not used if using `Sequence`.
         max_queue_size: maximum size for the generator queue
         workers: maximum number of processes to spin up
-            when using process based threading
+            when using process-based threading.
         use_multiprocessing: if True, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2194,6 +2263,8 @@ class Model(Container):
                           ' and multiple workers may duplicate your data.'
                           ' Please consider using the`keras.utils.Sequence'
                           ' class.'))
+    if is_sequence:
+      steps = len(generator)
     enqueuer = None
 
     try:
@@ -2273,8 +2344,9 @@ class Model(Container):
         steps: Total number of steps (batches of samples)
             to yield from `generator` before stopping.
         max_queue_size: Maximum size for the generator queue.
+          Not used if using `Sequence`.
         workers: Maximum number of processes to spin up
-            when using process based threading
+            when using process-based threading.
         use_multiprocessing: If `True`, use process based threading.
             Note that because
             this implementation relies on multiprocessing,
@@ -2315,6 +2387,8 @@ class Model(Container):
                           ' and multiple workers may duplicate your data.'
                           ' Please consider using the`keras.utils.Sequence'
                           ' class.'))
+    if is_sequence:
+      steps = len(generator)
     enqueuer = None
 
     try:
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index bc9ad6693e540585751b12fdaf63007078637547..17a26f978e24776baee77182e1f901e3ee1091c8 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -18,6 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import unittest
+
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
@@ -640,6 +643,19 @@ class LossMaskingTest(test.TestCase):
 
 class TestDynamicTrainability(test.TestCase):
 
+  def test_trainable_warning(self):
+    with self.test_session():
+      x = np.random.random((5, 3))
+      y = np.random.random((5, 2))
+
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_dim=3))
+      model.trainable = False
+      model.compile('rmsprop', 'mse')
+      model.trainable = True
+      model.train_on_batch(x, y)
+      self.assertRaises(Warning)
+
   def test_trainable_argument(self):
     with self.test_session():
       x = np.random.random((5, 3))
@@ -770,6 +786,9 @@ class TestDynamicTrainability(test.TestCase):
 
 class TestGeneratorMethods(test.TestCase):
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_generator_methods(self):
     arr_data = np.random.random((50, 2))
     arr_labels = np.random.random((50,))
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 125e63e1b84603416237250d46cfd95441dff78b..4370341ad1b3d44cf395990bd11a18f43e3cfa75 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -19,9 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -31,9 +33,12 @@ from tensorflow.python.keras._impl.keras import models
 from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import metrics as metrics_module
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
 
+_DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+
 
 def _create_ordered_io(keras_model, estimator_io_dict, is_input=True):
   """Create a list of tensors from IO dictionary based on Keras IO order.
@@ -184,7 +189,11 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
         predictions=predictions,
         loss=loss,
         train_op=train_op,
-        eval_metric_ops=eval_metric_ops)
+        eval_metric_ops=eval_metric_ops,
+        export_outputs={
+            _DEFAULT_SERVING_KEY:
+            export_lib.export_output.PredictOutput(predictions)
+        })
 
   return model_fn
 
@@ -222,7 +231,7 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
           K._initialize_variables(sess)
           # pylint: enable=protected-access
         saver = saver_lib.Saver()
-        saver.save(sess, estimator.model_dir + '/')
+        saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
 
 
 def model_to_estimator(keras_model=None,
@@ -232,6 +241,9 @@ def model_to_estimator(keras_model=None,
                        config=None):
   """Constructs an `Estimator` instance from given keras model.
 
+  For usage example, please see
+  @{$programmers_guide/estimators$creating_estimators_from_keras_models}.
+
   Args:
     keras_model: Keras model in memory.
     keras_model_path: Directory to a keras model on disk.
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index 1144aa3152b79860b6fa9c4f4c361028e10ee469..a7ea3b48a33d4e2d485dd5ca40e39a6f3387facb 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.keras._impl.keras import testing_utils
 from tensorflow.python.keras._impl.keras.applications import mobilenet
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
 
 
 try:
@@ -132,6 +133,8 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
 
   def tearDown(self):
+    # Make sure nothing is stuck in limbo.
+    writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
 
@@ -153,6 +156,8 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         est_keras.train(input_fn=train_input_fn, steps=_TRAIN_SIZE / 16)
         after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
         self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+      writer_cache.FileWriterCache.clear()
       gfile.DeleteRecursively(self._config.model_dir)
 
   def test_evaluate(self):
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 711003684805d3f789881d13a2a0e757973c1995..15c3d14727a44c9726a1c2c86f47640bcc490e70 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -22,8 +22,8 @@ import numpy as np
 
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
-from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.layers import network as tf_network_layers
 from tensorflow.python.ops import nn
 from tensorflow.python.platform import test
 
@@ -93,7 +93,7 @@ class KerasIntegrationTest(test.TestCase):
       y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
-      model.add(keras.layers.LSTM(3, return_sequences=True,
+      model.add(keras.layers.LSTM(5, return_sequences=True,
                                   input_shape=x_train.shape[1:]))
       model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
@@ -275,7 +275,7 @@ class KerasIntegrationTest(test.TestCase):
       y_train = keras.utils.to_categorical(y_train)
       y_test = keras.utils.to_categorical(y_test)
 
-      inputs = tf_base_layers.Input(shape=(10,))
+      inputs = tf_network_layers.Input(shape=(10,))
       x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
       outputs = tf_core_layers.Dense(2, activation=nn.softmax)(x)
       model = keras.models.Model(inputs, outputs)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index ce96bc66f7cc932bae84f746276cbed98961c127..1cbae9126317479c808730ad89e86d42ae201bc6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -793,6 +793,7 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
                strides=(1, 1),
                padding='valid',
                data_format=None,
+               dilation_rate=1,
                depth_multiplier=1,
                activation=None,
                use_bias=True,
@@ -815,6 +816,7 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
         strides=strides,
         padding=padding,
         data_format=data_format,
+        dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -831,30 +833,42 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
 
   def get_config(self):
     config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'depthwise_initializer': initializers.serialize(
-            self.depthwise_initializer),
-        'pointwise_initializer': initializers.serialize(
-            self.pointwise_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer': regularizers.serialize(
-            self.depthwise_regularizer),
-        'pointwise_regularizer': regularizers.serialize(
-            self.pointwise_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'filters':
+            self.filters,
+        'kernel_size':
+            self.kernel_size,
+        'strides':
+            self.strides,
+        'padding':
+            self.padding,
+        'data_format':
+            self.data_format,
+        'dilation_rate':
+            self.dilation_rate,
+        'activation':
+            activations.serialize(self.activation),
+        'use_bias':
+            self.use_bias,
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
+        'bias_initializer':
+            initializers.serialize(self.bias_initializer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
+        'bias_regularizer':
+            regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint': constraints.serialize(
-            self.depthwise_constraint),
-        'pointwise_constraint': constraints.serialize(
-            self.pointwise_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
+        'bias_constraint':
+            constraints.serialize(self.bias_constraint)
     }
     base_config = super(SeparableConv2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index 2335bd4df0264614cb468badd782dad72262e7b8..c88122ce1887c4cb93efadc82f504792c862941d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -536,7 +536,7 @@ class ConvLSTM2D(ConvRecurrent2D):
       conv_out = K.bias_add(conv_out, b, data_format=self.data_format)
     return conv_out
 
-  def reccurent_conv(self, x, w):
+  def recurrent_conv(self, x, w):
     conv_out = K.conv2d(
         x, w, strides=(1, 1), padding='same', data_format=self.data_format)
     return conv_out
@@ -556,10 +556,10 @@ class ConvLSTM2D(ConvRecurrent2D):
         inputs * dp_mask[2], self.kernel_c, self.bias_c, padding=self.padding)
     x_o = self.input_conv(
         inputs * dp_mask[3], self.kernel_o, self.bias_o, padding=self.padding)
-    h_i = self.reccurent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
-    h_f = self.reccurent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
-    h_c = self.reccurent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
-    h_o = self.reccurent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
+    h_i = self.recurrent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
+    h_f = self.recurrent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
+    h_c = self.recurrent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
+    h_o = self.recurrent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
 
     i = self.recurrent_activation(x_i + h_i)
     f = self.recurrent_activation(x_f + h_f)
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index b2e0e7b8eeb6a9efaaff870a29bf0e08f93389bd..ef9b435322c9781d4a138620705833ddca512a64 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -52,7 +52,7 @@ class Masking(Layer):
   Example:
 
   Consider a Numpy data array `x` of shape `(samples, timesteps, features)`,
-  to be fed to a LSTM layer.
+  to be fed to an LSTM layer.
   You want to mask timestep #3 and #5 because you lack data for
   these timesteps. You can:
 
@@ -104,13 +104,13 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    self.supports_masking = True
     # Inheritance call order:
     # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dropout, self).__init__(rate=rate,
                                   noise_shape=noise_shape,
                                   seed=seed,
                                   **kwargs)
+    self.supports_masking = True
 
   def call(self, inputs, training=None):
     if training is None:
@@ -121,7 +121,11 @@ class Dropout(tf_core_layers.Dropout, Layer):
     return output
 
   def get_config(self):
-    config = {'rate': self.rate}
+    config = {
+        'rate': self.rate,
+        'noise_shape': self.noise_shape,
+        'seed': self.seed
+    }
     base_config = super(Dropout, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
@@ -383,20 +387,18 @@ class Reshape(Layer):
 
   def _compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = [input_shape[0]]
-    output_shape += self._fix_unknown_dimension(input_shape[1:],
-                                                self.target_shape)
+    if None in input_shape[1:]:
+      output_shape = [input_shape[0]]
+      # input shape (partially) unknown? replace -1's with None's
+      output_shape += tuple(s if s != -1 else None for s in self.target_shape)
+    else:
+      output_shape = [input_shape[0]]
+      output_shape += self._fix_unknown_dimension(input_shape[1:],
+                                                  self.target_shape)
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    # In case the target shape is not fully defined,
-    # we need access to the shape of x.
-    target_shape = self.target_shape
-    if -1 in target_shape:
-      # target shape not fully defined
-      target_shape = self._compute_output_shape(inputs.get_shape())
-      target_shape = target_shape.as_list()[1:]
-    return K.reshape(inputs, (-1,) + tuple(target_shape))
+    return K.reshape(inputs, (K.shape(inputs)[0],) + self.target_shape)
 
   def get_config(self):
     config = {'target_shape': self.target_shape}
@@ -595,6 +597,7 @@ class Lambda(Layer):
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
+    config = config.copy()
     globs = globals()
     if custom_objects:
       globs = dict(list(globs.items()) + list(custom_objects.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 9cdebd375c89ca6cb491e4b83c0299246acb5622..d606662c798574e1f3aba59f111fb122ddf9f889 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -47,6 +47,11 @@ class CoreLayersTest(test.TestCase):
                   'noise_shape': [3, 1]},
           input_shape=(3, 2))
 
+    # https://github.com/tensorflow/tensorflow/issues/14819
+    with self.test_session():
+      dropout = keras.layers.Dropout(0.5)
+      self.assertEqual(True, dropout.supports_masking)
+
     with self.test_session():
       testing_utils.layer_test(
           keras.layers.SpatialDropout1D,
@@ -111,6 +116,12 @@ class CoreLayersTest(test.TestCase):
           kwargs={'target_shape': (1, -1)},
           input_shape=(3, 2, 4))
 
+    with self.test_session():
+      testing_utils.layer_test(
+          keras.layers.Reshape,
+          kwargs={'target_shape': (-1, 1)},
+          input_shape=(None, None, 2))
+
   def test_permute(self):
     with self.test_session():
       testing_utils.layer_test(
diff --git a/tensorflow/python/keras/_impl/keras/layers/gru_test.py b/tensorflow/python/keras/_impl/keras/layers/gru_test.py
index 03f0736161e6d1ce91b1efab8cfddef71e0360d3..c57fbac41cc43995ef3249414ed03928e7ffd044 100644
--- a/tensorflow/python/keras/_impl/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/gru_test.py
@@ -156,8 +156,10 @@ class GRULayerTest(test.TestCase):
           activity_regularizer='l1')
       layer.build((None, None, 2))
       self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((2, 3, 2))))
-      self.assertEqual(len(layer.losses), 4)
+
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
   def test_constraints_GRU(self):
     embedding_dim = 4
@@ -175,9 +177,9 @@ class GRULayerTest(test.TestCase):
           recurrent_constraint=r_constraint,
           bias_constraint=b_constraint)
       layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+      self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
   def test_with_masking_layer_GRU(self):
     layer_class = keras.layers.GRU
diff --git a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
index f43d90fec8fb4325d808e992060a48562db224a7..8d359bf17cdb80c98aeeed6d69e301962609ce59 100644
--- a/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/lstm_test.py
@@ -156,8 +156,9 @@ class LSTMLayerTest(test.TestCase):
           activity_regularizer='l1')
       layer.build((None, None, 2))
       self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((2, 3, 2))))
-      self.assertEqual(len(layer.losses), 4)
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
   def test_constraints_LSTM(self):
     embedding_dim = 4
@@ -175,9 +176,9 @@ class LSTMLayerTest(test.TestCase):
           recurrent_constraint=r_constraint,
           bias_constraint=b_constraint)
       layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+      self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
   def test_with_masking_layer_LSTM(self):
     layer_class = keras.layers.LSTM
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index 84b65d87c2f78ec47b9679110ae44383fb49e58a..888be2736934c314474bdc9259498fa2b415a4db 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -299,11 +299,26 @@ class Maximum(_Merge):
     return output
 
 
+class Minimum(_Merge):
+  """Layer that computes the minimum (element-wise) a list of inputs.
+
+  It takes as input a list of tensors,
+  all of the same shape, and returns
+  a single tensor (also of the same shape).
+  """
+
+  def _merge_function(self, inputs):
+    output = inputs[0]
+    for i in range(1, len(inputs)):
+      output = K.minimum(output, inputs[i])
+    return output
+
+
 class Concatenate(_Merge):
   """Layer that concatenates a list of inputs.
 
   It takes as input a list of tensors,
-  all of the same shape expect for the concatenation axis,
+  all of the same shape except for the concatenation axis,
   and returns a single tensor, the concatenation of all inputs.
 
   Arguments:
@@ -375,9 +390,8 @@ class Concatenate(_Merge):
     masks = []
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
-        # Input is unmasked. Append all 1s to masks,
-        # but cast it to bool first
-        masks.append(K.cast(K.ones_like(input_i), 'bool'))
+        # Input is unmasked. Append all 1s to masks
+        masks.append(K.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
         masks.append(K.expand_dims(mask_i))
@@ -584,6 +598,19 @@ def maximum(inputs, **kwargs):
   return Maximum(**kwargs)(inputs)
 
 
+def minimum(inputs, **kwargs):
+  """Functional interface to the `Minimum` layer.
+
+  Arguments:
+      inputs: A list of input tensors (at least 2).
+      **kwargs: Standard layer keyword arguments.
+
+  Returns:
+      A tensor, the element-wise minimum of the inputs.
+  """
+  return Minimum(**kwargs)(inputs)
+
+
 def concatenate(inputs, axis=-1, **kwargs):
   """Functional interface to the `Concatenate` layer.
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge_test.py b/tensorflow/python/keras/_impl/keras/layers/merge_test.py
index a5746582791c8c1d7db1a8d54e99a7140bdc2d5b..1f34c367e4b7593a9a7c7d320cdc1d8d75c4959e 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge_test.py
@@ -116,6 +116,20 @@ class MergeLayersTest(test.TestCase):
       self.assertEqual(out.shape, (2, 4, 5))
       self.assertAllClose(out, np.maximum(x1, x2), atol=1e-4)
 
+  def test_merge_minimum(self):
+    with self.test_session():
+      i1 = keras.layers.Input(shape=(4, 5))
+      i2 = keras.layers.Input(shape=(4, 5))
+      o = keras.layers.minimum([i1, i2])
+      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+      model = keras.models.Model([i1, i2], o)
+
+      x1 = np.random.random((2, 4, 5))
+      x2 = np.random.random((2, 4, 5))
+      out = model.predict([x1, x2])
+      self.assertEqual(out.shape, (2, 4, 5))
+      self.assertAllClose(out, np.minimum(x1, x2), atol=1e-4)
+
   def test_merge_concatenate(self):
     with self.test_session():
       i1 = keras.layers.Input(shape=(4, 5))
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index e773e396796d1d69cc5699f882384ee4b24bdbf1..afe4ebfdc5305a91dc287203d56a9b389b468663 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -367,7 +367,7 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
 
   Output shape:
       2D tensor with shape:
-      `(batch_size, channels)`
+      `(batch_size, features)`
   """
 
   def call(self, inputs):
@@ -382,7 +382,7 @@ class GlobalMaxPooling1D(_GlobalPooling1D):
 
   Output shape:
       2D tensor with shape:
-      `(batch_size, channels)`
+      `(batch_size, features)`
   """
 
   def call(self, inputs):
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 139523403c1a2e8f00d8686f990430bb2605a9f3..8df1840b4cbfddd3d31708da5eb3a57333d621ef 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,99 +29,209 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.platform import tf_logging as logging
 
 
-# pylint: disable=access-member-before-definition
+class StackedRNNCells(Layer):
+  """Wrapper allowing a stack of RNN cells to behave as a single cell.
 
-
-def _time_distributed_dense(x,
-                            w,
-                            b=None,
-                            dropout=None,
-                            input_dim=None,
-                            output_dim=None,
-                            timesteps=None,
-                            training=None):
-  """Apply `y . w + b` for every temporal slice y of x.
+  Used to implement efficient stacked RNNs.
 
   Arguments:
-      x: input tensor.
-      w: weight matrix.
-      b: optional bias vector.
-      dropout: whether to apply dropout (same dropout mask
-          for every temporal slice of the input).
-      input_dim: integer; optional dimensionality of the input.
-      output_dim: integer; optional dimensionality of the output.
-      timesteps: integer; optional number of timesteps.
-      training: training phase tensor or boolean.
-
-  Returns:
-      Output tensor.
-  """
-  if not input_dim:
-    input_dim = K.shape(x)[2]
-  if not timesteps:
-    timesteps = K.shape(x)[1]
-  if not output_dim:
-    output_dim = K.shape(w)[1]
-
-  if dropout is not None and 0. < dropout < 1.:
-    # apply the same dropout pattern at every timestep
-    ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
-    dropout_matrix = K.dropout(ones, dropout)
-    expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
-    x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)
-
-  # collapse time dimension and batch dimension together
-  x = K.reshape(x, (-1, input_dim))
-  x = K.dot(x, w)
-  if b is not None:
-    x = K.bias_add(x, b)
-  # reshape to 3D tensor
-  if K.backend() == 'tensorflow':
-    x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
-    x.set_shape([None, None, output_dim])
-  else:
-    x = K.reshape(x, (-1, timesteps, output_dim))
-  return x
+      cells: List of RNN cell instances.
 
+  Examples:
 
-class Recurrent(Layer):
-  """Abstract base class for recurrent layers.
+  ```python
+      cells = [
+          keras.layers.LSTMCell(output_dim),
+          keras.layers.LSTMCell(output_dim),
+          keras.layers.LSTMCell(output_dim),
+      ]
 
-  Do not use in a model -- it's not a valid layer!
-  Use its children classes `LSTM`, `GRU` and `SimpleRNN` instead.
+      inputs = keras.Input((timesteps, input_dim))
+      x = keras.layers.RNN(cells)(inputs)
+  ```
+  """
 
-  All recurrent layers (`LSTM`, `GRU`, `SimpleRNN`) also
-  follow the specifications of this class and accept
-  the keyword arguments listed below.
+  def __init__(self, cells, **kwargs):
+    for cell in cells:
+      if not hasattr(cell, 'call'):
+        raise ValueError('All cells must have a `call` method. '
+                         'received cells:', cells)
+      if not hasattr(cell, 'state_size'):
+        raise ValueError('All cells must have a '
+                         '`state_size` attribute. '
+                         'received cells:', cells)
+    self.cells = cells
+    super(StackedRNNCells, self).__init__(**kwargs)
+
+  @property
+  def state_size(self):
+    # States are a flat list
+    # in reverse order of the cell stack.
+    # This allows to preserve the requirement
+    # `stack.state_size[0] == output_dim`.
+    # e.g. states of a 2-layer LSTM would be
+    # `[h2, c2, h1, c1]`
+    # (assuming one LSTM has states [h, c])
+    state_size = []
+    for cell in self.cells[::-1]:
+      if hasattr(cell.state_size, '__len__'):
+        state_size += list(cell.state_size)
+      else:
+        state_size.append(cell.state_size)
+    return tuple(state_size)
+
+  def call(self, inputs, states, **kwargs):
+    # Recover per-cell states.
+    nested_states = []
+    for cell in self.cells[::-1]:
+      if hasattr(cell.state_size, '__len__'):
+        nested_states.append(states[:len(cell.state_size)])
+        states = states[len(cell.state_size):]
+      else:
+        nested_states.append([states[0]])
+        states = states[1:]
+    nested_states = nested_states[::-1]
+
+    # Call the cells in order and store the returned states.
+    new_nested_states = []
+    for cell, states in zip(self.cells, nested_states):
+      inputs, states = cell.call(inputs, states, **kwargs)
+      new_nested_states.append(states)
+
+    # Format the new states as a flat list
+    # in reverse cell order.
+    states = []
+    for cell_states in new_nested_states[::-1]:
+      states += cell_states
+    return inputs, states
 
-  Example:
+  def build(self, input_shape):
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        cell.build(input_shape)
+      if hasattr(cell.state_size, '__len__'):
+        output_dim = cell.state_size[0]
+      else:
+        output_dim = cell.state_size
+      input_shape = (input_shape[0], input_shape[1], output_dim)
+    self.built = True
 
-  ```python
-      # as the first layer in a Sequential model
-      model = Sequential()
-      model.add(LSTM(32, input_shape=(10, 64)))
-      # now model.output_shape == (None, 32)
-      # note: `None` is the batch dimension.
-
-      # for subsequent layers, no need to specify the input size:
-      model.add(LSTM(16))
-
-      # to stack recurrent layers, you must use return_sequences=True
-      # on any recurrent layer that feeds into another recurrent layer.
-      # note that you only need to specify the input size on the first layer.
-      model = Sequential()
-      model.add(LSTM(64, input_dim=64, input_length=10, return_sequences=True))
-      model.add(LSTM(32, return_sequences=True))
-      model.add(LSTM(10))
-  ```
+  def get_config(self):
+    cells = []
+    for cell in self.cells:
+      cells.append({
+          'class_name': cell.__class__.__name__,
+          'config': cell.get_config()
+      })
+    config = {'cells': cells}
+    base_config = super(StackedRNNCells, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    cells = []
+    for cell_config in config.pop('cells'):
+      cells.append(
+          deserialize_layer(cell_config, custom_objects=custom_objects))
+    return cls(cells, **config)
+
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    weights = []
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        weights += cell.trainable_weights
+    return weights
+
+  @property
+  def non_trainable_weights(self):
+    weights = []
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        weights += cell.non_trainable_weights
+    if not self.trainable:
+      trainable_weights = []
+      for cell in self.cells:
+        if isinstance(cell, Layer):
+          trainable_weights += cell.trainable_weights
+      return trainable_weights + weights
+    return weights
+
+  def get_weights(self):
+    """Retrieves the weights of the model.
+
+    Returns:
+        A flat list of Numpy arrays.
+    """
+    weights = []
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        weights += cell.weights
+    return K.batch_get_value(weights)
+
+  def set_weights(self, weights):
+    """Sets the weights of the model.
+
+    Arguments:
+        weights: A list of Numpy arrays with shapes and types matching
+            the output of `model.get_weights()`.
+    """
+    tuples = []
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        num_param = len(cell.weights)
+        weights = weights[:num_param]
+        for sw, w in zip(cell.weights, weights):
+          tuples.append((sw, w))
+        weights = weights[num_param:]
+    K.batch_set_value(tuples)
+
+  @property
+  def losses(self):
+    losses = []
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        cell_losses = cell.losses
+        losses += cell_losses
+    return losses
+
+  def get_losses_for(self, inputs=None):
+    losses = []
+    for cell in self.cells:
+      if isinstance(cell, Layer):
+        cell_losses = cell.get_losses_for(inputs)
+        losses += cell_losses
+    return losses
+
+
+class RNN(Layer):
+  """Base class for recurrent layers.
 
   Arguments:
-      weights: list of Numpy arrays to set as initial weights.
-          The list should have 3 elements, of shapes:
-          `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
-      return_sequences: Boolean. Whether to return the last output
+      cell: A RNN cell instance. A RNN cell is a class that has:
+          - a `call(input_at_t, states_at_t)` method, returning
+              `(output_at_t, states_at_t_plus_1)`. The call method of the
+              cell can also take the optional argument `constants`, see
+              section "Note on passing external constants" below.
+          - a `state_size` attribute. This can be a single integer
+              (single state) in which case it is
+              the size of the recurrent state
+              (which should be the same as the size of the cell output).
+              This can also be a list/tuple of integers
+              (one size per state). In this case, the first entry
+              (`state_size[0]`) should be the same as
+              the size of the cell output.
+          It is also possible for `cell` to be a list of RNN cell instances,
+          in which cases the cells get stacked on after the other in the RNN,
+          implementing an efficient stacked RNN.
+      return_sequences: Boolean. Whether to return the last output.
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -137,21 +247,9 @@ class Recurrent(Layer):
           Unrolling can speed-up a RNN,
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
-      implementation: one of {0, 1, or 2}.
-          If set to 0, the RNN will use
-          an implementation that uses fewer, larger matrix products,
-          thus running faster on CPU but consuming more memory.
-          If set to 1, the RNN will use more matrix products,
-          but smaller ones, thus running slower
-          (may actually be faster on GPU) while consuming less memory.
-          If set to 2 (LSTM/GRU only),
-          the RNN will combine the input gate,
-          the forget gate and the output gate into a single matrix,
-          enabling more time-efficient parallelization on the GPU.
-          Note: RNN dropout must be shared for all gates,
-          resulting in a slightly reduced regularization.
       input_dim: dimensionality of the input (integer).
-          This argument (or alternatively, the keyword argument `input_shape`)
+          This argument (or alternatively,
+          the keyword argument `input_shape`)
           is required when using this layer as the first layer in a model.
       input_length: Length of input sequences, to be specified
           when it is constant.
@@ -163,7 +261,7 @@ class Recurrent(Layer):
           at the level of the first layer
           (e.g. via the `input_shape` argument)
 
-  Input shape:s
+  Input shape:
       3D tensor with shape `(batch_size, timesteps, input_dim)`,
       (Optional) 2D tensors with shape `(batch_size, output_dim)`.
 
@@ -178,7 +276,7 @@ class Recurrent(Layer):
   # Masking
       This layer supports masking for input data with a variable number
       of timesteps. To introduce masks to your data,
-      use an `Embedding` layer with the `mask_zero` parameter
+      use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
       set to `True`.
 
   # Note on using statefulness in RNNs
@@ -212,42 +310,128 @@ class Recurrent(Layer):
       calling `reset_states` with the keyword argument `states`. The value of
       `states` should be a numpy array or list of numpy arrays representing
       the initial state of the RNN layer.
+
+  # Note on passing external constants to RNNs
+      You can pass "external" constants to the cell using the `constants`
+      keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
+      requires that the `cell.call` method accepts the same keyword argument
+      `constants`. Such constants can be used to condition the cell
+      transformation on additional static inputs (not changing over time),
+      a.k.a. an attention mechanism.
+
+  Examples:
+
+  ```python
+      # First, let's define a RNN Cell, as a layer subclass.
+
+      class MinimalRNNCell(keras.layers.Layer):
+
+          def __init__(self, units, **kwargs):
+              self.units = units
+              self.state_size = units
+              super(MinimalRNNCell, self).__init__(**kwargs)
+
+          def build(self, input_shape):
+              self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
+                                            initializer='uniform',
+                                            name='kernel')
+              self.recurrent_kernel = self.add_weight(
+                  shape=(self.units, self.units),
+                  initializer='uniform',
+                  name='recurrent_kernel')
+              self.built = True
+
+          def call(self, inputs, states):
+              prev_output = states[0]
+              h = K.dot(inputs, self.kernel)
+              output = h + K.dot(prev_output, self.recurrent_kernel)
+              return output, [output]
+
+      # Let's use this cell in a RNN layer:
+
+      cell = MinimalRNNCell(32)
+      x = keras.Input((None, 5))
+      layer = RNN(cell)
+      y = layer(x)
+
+      # Here's how to use the cell to build a stacked RNN:
+
+      cells = [MinimalRNNCell(32), MinimalRNNCell(64)]
+      x = keras.Input((None, 5))
+      layer = RNN(cells)
+      y = layer(x)
+  ```
   """
 
   def __init__(self,
+               cell,
                return_sequences=False,
                return_state=False,
                go_backwards=False,
                stateful=False,
                unroll=False,
-               implementation=0,
+               activity_regularizer=None,
                **kwargs):
-    super(Recurrent, self).__init__(**kwargs)
+    if isinstance(cell, (list, tuple)):
+      cell = StackedRNNCells(cell)
+    if not hasattr(cell, 'call'):
+      raise ValueError('`cell` should have a `call` method. '
+                       'The RNN was passed:', cell)
+    if not hasattr(cell, 'state_size'):
+      raise ValueError('The RNN cell should have '
+                       'an attribute `state_size` '
+                       '(tuple of integers, '
+                       'one integer per RNN state).')
+    super(RNN, self).__init__(
+        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    self.cell = cell
     self.return_sequences = return_sequences
     self.return_state = return_state
     self.go_backwards = go_backwards
     self.stateful = stateful
     self.unroll = unroll
-    self.implementation = implementation
+
     self.supports_masking = True
     self.input_spec = [InputSpec(ndim=3)]
     self.state_spec = None
-    self.dropout = 0
-    self.recurrent_dropout = 0
+    self._states = None
+    self.constants_spec = None
+    self._num_constants = None
+
+  @property
+  def states(self):
+    if self._states is None:
+      if isinstance(self.cell.state_size, int):
+        num_states = 1
+      else:
+        num_states = len(self.cell.state_size)
+      return [None for _ in range(num_states)]
+    return self._states
+
+  @states.setter
+  def states(self, states):
+    self._states = states
 
   def _compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
+
+    if hasattr(self.cell.state_size, '__len__'):
+      output_dim = self.cell.state_size[0]
+    else:
+      output_dim = self.cell.state_size
+
     if self.return_sequences:
-      output_shape = (input_shape[0], input_shape[1], self.units)
+      output_shape = (input_shape[0], input_shape[1], output_dim)
     else:
-      output_shape = (input_shape[0], self.units)
+      output_shape = (input_shape[0], output_dim)
 
     if self.return_state:
-      state_shape = [tensor_shape.TensorShape(
-          (input_shape[0], self.units)) for _ in self.states]
-      return [tensor_shape.TensorShape(output_shape)] + state_shape
+      state_shape = [(input_shape[0], output_dim) for _ in self.states]
+      output_shape = [output_shape] + state_shape
+    else:
+      output_shape = output_shape
     return tensor_shape.TensorShape(output_shape)
 
   def compute_mask(self, inputs, mask):
@@ -257,82 +441,123 @@ class Recurrent(Layer):
     if self.return_state:
       state_mask = [None for _ in self.states]
       return [output_mask] + state_mask
-    return output_mask
+    else:
+      return output_mask
 
-  def step(self, inputs, states):
-    raise NotImplementedError
+  def build(self, input_shape):
+    # Note input_shape will be list of shapes of initial states and
+    # constants if these are passed in __call__.
+    if self._num_constants is not None:
+      constants_shape = input_shape[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
+    else:
+      constants_shape = None
 
-  def get_constants(self, inputs, training=None):
-    return []
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+
+    batch_size = input_shape[0] if self.stateful else None
+    input_dim = input_shape[-1]
+    self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim))
+
+    # allow cell (if layer) to build before we set or validate state_spec
+    if isinstance(self.cell, Layer):
+      step_input_shape = (input_shape[0],) + input_shape[2:]
+      if constants_shape is not None:
+        self.cell.build([step_input_shape] + constants_shape)
+      else:
+        self.cell.build(step_input_shape)
+
+    # set or validate state_spec
+    if hasattr(self.cell.state_size, '__len__'):
+      state_size = list(self.cell.state_size)
+    else:
+      state_size = [self.cell.state_size]
+
+    if self.state_spec is not None:
+      # initial_state was passed in call, check compatibility
+      if [spec.shape[-1] for spec in self.state_spec] != state_size:
+        raise ValueError(
+            'An initial_state was passed that is not compatible with '
+            '`cell.state_size`. Received `state_spec`={}; '
+            'However `cell.state_size` is '
+            '{}'.format(self.state_spec, self.cell.state_size))
+    else:
+      self.state_spec = [InputSpec(shape=(None, dim)) for dim in state_size]
+    if self.stateful:
+      self.reset_states()
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
     initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
     initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
     initial_state = K.expand_dims(initial_state)  # (samples, 1)
-    initial_state = K.tile(initial_state, [1,
-                                           self.units])  # (samples, output_dim)
-    initial_state = [initial_state for _ in range(len(self.states))]
-    return initial_state
-
-  def preprocess_input(self, inputs, training=None):
-    return inputs
+    if hasattr(self.cell.state_size, '__len__'):
+      return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
+    else:
+      return [K.tile(initial_state, [1, self.cell.state_size])]
 
-  def __call__(self, inputs, initial_state=None, **kwargs):
-    if (isinstance(inputs, (list, tuple)) and
-        len(inputs) > 1
-        and initial_state is None):
-      initial_state = inputs[1:]
-      inputs = inputs[0]
+  def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
+    inputs, initial_state, constants = self._standardize_args(
+        inputs, initial_state, constants)
 
-    # If `initial_state` is specified,
-    # and if it a Keras tensor,
-    # then add it to the inputs and temporarily
-    # modify the input spec to include the state.
-    if initial_state is None:
-      return super(Recurrent, self).__call__(inputs, **kwargs)
+    if initial_state is None and constants is None:
+      return super(RNN, self).__call__(inputs, **kwargs)
 
-    if not isinstance(initial_state, (list, tuple)):
-      initial_state = [initial_state]
+    # If any of `initial_state` or `constants` are specified and are Keras
+    # tensors, then add them to the inputs and temporarily modify the
+    # input_spec to include them.
 
-    is_keras_tensor = hasattr(initial_state[0], '_keras_history')
-    for tensor in initial_state:
+    additional_inputs = []
+    additional_specs = []
+    if initial_state is not None:
+      kwargs['initial_state'] = initial_state
+      additional_inputs += initial_state
+      self.state_spec = [
+          InputSpec(shape=K.int_shape(state)) for state in initial_state
+      ]
+      additional_specs += self.state_spec
+    if constants is not None:
+      kwargs['constants'] = constants
+      additional_inputs += constants
+      self.constants_spec = [
+          InputSpec(shape=K.int_shape(constant)) for constant in constants
+      ]
+      self._num_constants = len(constants)
+      additional_specs += self.constants_spec
+    # at this point additional_inputs cannot be empty
+    is_keras_tensor = hasattr(additional_inputs[0], '_keras_history')
+    for tensor in additional_inputs:
       if hasattr(tensor, '_keras_history') != is_keras_tensor:
-        raise ValueError('The initial state of an RNN layer cannot be'
-                         ' specified with a mix of Keras tensors and'
-                         ' non-Keras tensors')
+        raise ValueError('The initial state or constants of an RNN'
+                         ' layer cannot be specified with a mix of'
+                         ' Keras tensors and non-Keras tensors')
 
     if is_keras_tensor:
-      # Compute the full input spec, including state
-      input_spec = self.input_spec
-      state_spec = self.state_spec
-      if not isinstance(input_spec, list):
-        input_spec = [input_spec]
-      if not isinstance(state_spec, list):
-        state_spec = [state_spec]
-      self.input_spec = input_spec + state_spec
-
-      # Compute the full inputs, including state
-      inputs = [inputs] + list(initial_state)
-
-      # Perform the call
-      output = super(Recurrent, self).__call__(inputs, **kwargs)
-
-      # Restore original input spec
-      self.input_spec = input_spec
+      # Compute the full input spec, including state and constants
+      full_input = [inputs] + additional_inputs
+      full_input_spec = self.input_spec + additional_specs
+      # Perform the call with temporarily replaced input_spec
+      original_input_spec = self.input_spec
+      self.input_spec = full_input_spec
+      output = super(RNN, self).__call__(full_input, **kwargs)
+      self.input_spec = original_input_spec
       return output
     else:
-      kwargs['initial_state'] = initial_state
-      return super(Recurrent, self).__call__(inputs, **kwargs)
-
-  def call(self, inputs, mask=None, training=None, initial_state=None):
+      return super(RNN, self).__call__(inputs, **kwargs)
+
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           initial_state=None,
+           constants=None):
     # input shape: `(samples, time (padded with zeros), input_dim)`
     # note that the .build() method of subclasses MUST define
     # self.input_spec and self.state_spec with complete input shapes.
     if isinstance(inputs, list):
-      initial_state = inputs[1:]
       inputs = inputs[0]
-    elif initial_state is not None:
+    if initial_state is not None:
       pass
     elif self.stateful:
       initial_state = self.states
@@ -343,13 +568,14 @@ class Recurrent(Layer):
       mask = mask[0]
 
     if len(initial_state) != len(self.states):
-      raise ValueError('Layer has ' + str(len(self.states)) +
-                       ' states but was passed ' + str(len(initial_state)) +
-                       ' initial states.')
+      raise ValueError(
+          'Layer has ' + str(len(self.states)) + ' states but was passed ' +
+          str(len(initial_state)) + ' initial states.')
     input_shape = K.int_shape(inputs)
-    if self.unroll and input_shape[1] is None:
+    timesteps = input_shape[1]
+    if self.unroll and timesteps in [None, 1]:
       raise ValueError('Cannot unroll a RNN if the '
-                       'time dimension is undefined. \n'
+                       'time dimension is undefined or equal to 1. \n'
                        '- If using a Sequential model, '
                        'specify the time dimension by passing '
                        'an `input_shape` or `batch_input_shape` '
@@ -359,15 +585,31 @@ class Recurrent(Layer):
                        '- If using the functional API, specify '
                        'the time dimension by passing a `shape` '
                        'or `batch_shape` argument to your Input layer.')
-    constants = self.get_constants(inputs, training=None)
-    preprocessed_input = self.preprocess_input(inputs, training=None)
+
+    kwargs = {}
+    if has_arg(self.cell.call, 'training'):
+      kwargs['training'] = training
+
+    if constants:
+      if not has_arg(self.cell.call, 'constants'):
+        raise ValueError('RNN cell does not support constants')
+
+      def step(inputs, states):
+        constants = states[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
+        states = states[:-self._num_constants]  # pylint: disable=invalid-unary-operand-type
+        return self.cell.call(inputs, states, constants=constants, **kwargs)
+    else:
+
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
     last_output, outputs, states = K.rnn(
-        self.step,
-        preprocessed_input,
+        step,
+        inputs,
         initial_state,
+        constants=constants,
         go_backwards=self.go_backwards,
         mask=mask,
-        constants=constants,
         unroll=self.unroll)
     if self.stateful:
       updates = []
@@ -375,21 +617,63 @@ class Recurrent(Layer):
         updates.append((self.states[i], states[i]))
       self.add_update(updates, inputs)
 
-    # Properly set learning phase
-    if 0 < self.dropout + self.recurrent_dropout:
-      last_output._uses_learning_phase = True
-      outputs._uses_learning_phase = True
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
 
-    if not self.return_sequences:
-      outputs = last_output
+    # Properly set learning phase
+    if getattr(last_output, '_uses_learning_phase', False):
+      output._uses_learning_phase = True
 
     if self.return_state:
       if not isinstance(states, (list, tuple)):
         states = [states]
       else:
         states = list(states)
-      return [outputs] + states
-    return outputs
+      return [output] + states
+    else:
+      return output
+
+  def _standardize_args(self, inputs, initial_state, constants):
+    """Standardize `__call__` arguments to a single list of tensor inputs.
+
+    When running a model loaded from file, the input tensors
+    `initial_state` and `constants` can be passed to `RNN.__call__` as part
+    of `inputs` instead of by the dedicated keyword arguments. This method
+    makes sure the arguments are separated and that `initial_state` and
+    `constants` are lists of tensors (or None).
+
+    Arguments:
+        inputs: tensor or list/tuple of tensors
+        initial_state: tensor or list of tensors or None
+        constants: tensor or list of tensors or None
+
+    Returns:
+        inputs: tensor
+        initial_state: list of tensors or None
+        constants: list of tensors or None
+    """
+    if isinstance(inputs, list):
+      assert initial_state is None and constants is None
+      if self._num_constants is not None:
+        constants = inputs[-self._num_constants:]  # pylint: disable=invalid-unary-operand-type
+        inputs = inputs[:-self._num_constants]  # pylint: disable=invalid-unary-operand-type
+      if len(inputs) > 1:
+        initial_state = inputs[1:]
+      inputs = inputs[0]
+
+    def to_list_or_none(x):
+      if x is None or isinstance(x, list):
+        return x
+      if isinstance(x, tuple):
+        return list(x)
+      return [x]
+
+    initial_state = to_list_or_none(initial_state)
+    constants = to_list_or_none(constants)
+
+    return inputs, initial_state, constants
 
   def reset_states(self, states=None):
     if not self.stateful:
@@ -408,10 +692,19 @@ class Recurrent(Layer):
                        '`batch_shape` argument to your Input layer.')
     # initialize state if None
     if self.states[0] is None:
-      self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
+      if hasattr(self.cell.state_size, '__len__'):
+        self.states = [
+            K.zeros((batch_size, dim)) for dim in self.cell.state_size
+        ]
+      else:
+        self.states = [K.zeros((batch_size, self.cell.state_size))]
     elif states is None:
-      for state in self.states:
-        K.set_value(state, np.zeros((batch_size, self.units)))
+      if hasattr(self.cell.state_size, '__len__'):
+        for state, dim in zip(self.states, self.cell.state_size):
+          K.set_value(state, np.zeros((batch_size, dim)))
+      else:
+        K.set_value(self.states[0], np.zeros((batch_size,
+                                              self.cell.state_size)))
     else:
       if not isinstance(states, (list, tuple)):
         states = [states]
@@ -421,11 +714,16 @@ class Recurrent(Layer):
                          'but it received ' + str(len(states)) +
                          ' state values. Input received: ' + str(states))
       for index, (value, state) in enumerate(zip(states, self.states)):
-        if value.shape != (batch_size, self.units):
-          raise ValueError('State ' + str(index) +
-                           ' is incompatible with layer ' + self.name +
-                           ': expected shape=' + str((batch_size, self.units)) +
-                           ', found shape=' + str(value.shape))
+        if hasattr(self.cell.state_size, '__len__'):
+          dim = self.cell.state_size[index]
+        else:
+          dim = self.cell.state_size
+        if value.shape != (batch_size, dim):
+          raise ValueError(
+              'State ' + str(index) + ' is incompatible with layer ' +
+              self.name + ': expected shape=' + str(
+                  (batch_size, dim)) + ', found shape=' + str(value.shape))
+        # TODO(fchollet): consider batch calls to `set_value`.
         K.set_value(state, value)
 
   def get_config(self):
@@ -434,51 +732,98 @@ class Recurrent(Layer):
         'return_state': self.return_state,
         'go_backwards': self.go_backwards,
         'stateful': self.stateful,
-        'unroll': self.unroll,
-        'implementation': self.implementation
+        'unroll': self.unroll
     }
-    base_config = super(Recurrent, self).get_config()
+    if self._num_constants is not None:
+      config['num_constants'] = self._num_constants
+
+    cell_config = self.cell.get_config()
+    config['cell'] = {
+        'class_name': self.cell.__class__.__name__,
+        'config': cell_config
+    }
+    base_config = super(RNN, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    from tensorflow.python.keras._impl.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects)
+    num_constants = config.pop('num_constants', None)
+    layer = cls(cell, **config)
+    layer._num_constants = num_constants
+    return layer
+
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    if isinstance(self.cell, Layer):
+      return self.cell.trainable_weights
+    return []
+
+  @property
+  def non_trainable_weights(self):
+    if isinstance(self.cell, Layer):
+      if not self.trainable:
+        return self.cell.weights
+      return self.cell.non_trainable_weights
+    return []
 
-class SimpleRNN(Recurrent):
-  """Fully-connected RNN where the output is to be fed back to input.
+  @property
+  def losses(self):
+    if isinstance(self.cell, Layer):
+      return self.cell.losses
+    return []
+
+  def get_losses_for(self, inputs=None):
+    if isinstance(self.cell, Layer):
+      cell_losses = self.cell.get_losses_for(inputs)
+      return cell_losses + super(RNN, self).get_losses_for(inputs)
+    return super(RNN, self).get_losses_for(inputs)
+
+
+class SimpleRNNCell(Layer):
+  """Cell class for SimpleRNN.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
+      activation: Activation function to use
+          (see [activations](../activations.md)).
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
+          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
-      bias_initializer: Initializer for the bias vector.
+          used for the linear transformation of the recurrent state.
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
+          the `recurrent_kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
+          the `kernel` weights matrix
+          (see [constraints](../constraints.md)).
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+          the `recurrent_kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-
-  References:
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
-        Networks](http://arxiv.org/abs/1512.05287)
   """
 
   def __init__(self,
@@ -491,15 +836,13 @@ class SimpleRNN(Recurrent):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
                dropout=0.,
                recurrent_dropout=0.,
                **kwargs):
-    super(SimpleRNN, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(SimpleRNNCell, self).__init__(**kwargs)
     self.units = units
     self.activation = activations.get(activation)
     self.use_bias = use_bias
@@ -518,23 +861,13 @@ class SimpleRNN(Recurrent):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = InputSpec(shape=(None, self.units))
+    self.state_size = self.units
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_dim = input_shape[2]
-    self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
-
-    self.states = [None]
-    if self.stateful:
-      self.reset_states()
-
     self.kernel = self.add_weight(
-        shape=(self.input_dim, self.units),
+        shape=(input_shape[-1], self.units),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
@@ -556,146 +889,315 @@ class SimpleRNN(Recurrent):
       self.bias = None
     self.built = True
 
-  def preprocess_input(self, inputs, training=None):
-    if self.implementation > 0:
-      return inputs
-    else:
-      input_shape = inputs.get_shape().as_list()
-      input_dim = input_shape[2]
-      timesteps = input_shape[1]
-      return _time_distributed_dense(
-          inputs,
-          self.kernel,
-          self.bias,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-
-  def step(self, inputs, states):
-    if self.implementation == 0:
-      h = inputs
-    else:
-      if 0 < self.dropout < 1:
-        h = K.dot(inputs * states[1], self.kernel)
-      else:
-        h = K.dot(inputs, self.kernel)
-      if self.bias is not None:
-        h = K.bias_add(h, self.bias)
-
-    prev_output = states[0]
-    if 0 < self.recurrent_dropout < 1:
-      prev_output *= states[2]
-    output = h + K.dot(prev_output, self.recurrent_kernel)
-    if self.activation is not None:
-      output = self.activation(output)
-
-    # Properly set learning phase on output tensor.
-    if 0 < self.dropout + self.recurrent_dropout:
-      output._uses_learning_phase = True
-    return output, [output]
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation != 0 and 0 < self.dropout < 1:
-      input_shape = K.int_shape(inputs)
-      input_dim = input_shape[-1]
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, int(input_dim)))
+  def _generate_dropout_mask(self, inputs, training=None):
+    if 0 < self.dropout < 1:
+      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
 
       def dropped_inputs():
         return K.dropout(ones, self.dropout)
 
-      dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)
-      constants.append(dp_mask)
+      self._dropout_mask = K.in_train_phase(
+          dropped_inputs, ones, training=training)
     else:
-      constants.append(K.cast_to_floatx(1.))
+      self._dropout_mask = None
 
+  def _generate_recurrent_dropout_mask(self, inputs, training=None):
     if 0 < self.recurrent_dropout < 1:
       ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
       ones = K.tile(ones, (1, self.units))
 
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
 
-      rec_dp_mask = K.in_train_phase(dropped_inputs, ones, training=training)
-      constants.append(rec_dp_mask)
+      self._recurrent_dropout_mask = K.in_train_phase(
+          dropped_inputs, ones, training=training)
     else:
-      constants.append(K.cast_to_floatx(1.))
-    return constants
-
-  def get_config(self):
-    config = {
-        'units': self.units,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'recurrent_initializer':
-            initializers.serialize(self.recurrent_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'recurrent_regularizer':
-            regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'recurrent_constraint':
-            constraints.serialize(self.recurrent_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint),
-        'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout
-    }
-    base_config = super(SimpleRNN, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+      self._recurrent_dropout_mask = None
 
+  def call(self, inputs, states, training=None):
+    prev_output = states[0]
+    dp_mask = self._dropout_mask
+    rec_dp_mask = self._recurrent_dropout_mask
 
-class GRU(Recurrent):
-  """Gated Recurrent Unit - Cho et al.
+    if dp_mask is not None:
+      h = K.dot(inputs * dp_mask, self.kernel)
+    else:
+      h = K.dot(inputs, self.kernel)
+    if self.bias is not None:
+      h = K.bias_add(h, self.bias)
 
-  2014.
+    if rec_dp_mask is not None:
+      prev_output *= rec_dp_mask
+    output = h + K.dot(prev_output, self.recurrent_kernel)
+    if self.activation is not None:
+      output = self.activation(output)
+
+    # Properly set learning phase on output tensor.
+    if 0 < self.dropout + self.recurrent_dropout:
+      if training is None:
+        output._uses_learning_phase = True
+    return output, [output]
+
+
+class SimpleRNN(RNN):
+  """Fully-connected RNN where the output is to be fed back to input.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
+      activation: Activation function to use
+          (see [activations](../activations.md)).
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
-      recurrent_activation: Activation function to use
-          for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
+          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
-      bias_initializer: Initializer for the bias vector.
+          used for the linear transformation of the recurrent state.
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
+          the `recurrent_kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
       activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
+          the output of the layer (its "activation").
+          (see [regularizer](../regularizers.md)).
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
+          the `kernel` weights matrix
+          (see [constraints](../constraints.md)).
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+          the `recurrent_kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
+      return_sequences: Boolean. Whether to return the last output.
+          in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards and return the
+          reversed sequence.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+  """
 
-  References:
-      - [On the Properties of Neural Machine Translation: Encoder-Decoder
-        Approaches](https://arxiv.org/abs/1409.1259)
-      - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence
-        Modeling](http://arxiv.org/abs/1412.3555v1)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
-        Networks](http://arxiv.org/abs/1512.05287)
+  def __init__(self,
+               units,
+               activation='tanh',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               **kwargs):
+    if 'implementation' in kwargs:
+      kwargs.pop('implementation')
+      logging.warning('The `implementation` argument '
+                      'in `SimpleRNN` has been deprecated. '
+                      'Please remove it from your layer call.')
+    cell = SimpleRNNCell(
+        units,
+        activation=activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout)
+    super(SimpleRNN, self).__init__(
+        cell,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    self.cell._generate_dropout_mask(inputs, training=training)
+    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    return super(SimpleRNN, self).call(
+        inputs, mask=mask, training=training, initial_state=initial_state)
+
+  @property
+  def units(self):
+    return self.cell.units
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
+
+  def get_config(self):
+    config = {
+        'units': self.units,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'recurrent_initializer':
+            initializers.serialize(self.recurrent_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'recurrent_regularizer':
+            regularizers.serialize(self.recurrent_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'recurrent_constraint':
+            constraints.serialize(self.recurrent_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint),
+        'dropout': self.dropout,
+        'recurrent_dropout': self.recurrent_dropout
+    }
+    base_config = super(SimpleRNN, self).get_config()
+    del base_config['cell']
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    if 'implementation' in config:
+      config.pop('implementation')
+    return cls(**config)
+
+
+class GRUCell(Layer):
+  """Cell class for the GRU layer.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use
+          (see [activations](../activations.md)).
+          If you pass None, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step
+          (see [activations](../activations.md)).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs.
+          (see [initializers](../initializers.md)).
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state.
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+      implementation: Implementation mode, either 1 or 2.
+          Mode 1 will structure its operations as a larger number of
+          smaller dot products and additions, whereas mode 2 will
+          batch them into fewer, larger operations. These modes will
+          have different performance profiles on different hardware and
+          for different applications.
   """
 
   def __init__(self,
@@ -709,15 +1211,14 @@ class GRU(Recurrent):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
                dropout=0.,
                recurrent_dropout=0.,
+               implementation=1,
                **kwargs):
-    super(GRU, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(GRUCell, self).__init__(**kwargs)
     self.units = units
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
@@ -737,22 +1238,15 @@ class GRU(Recurrent):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = InputSpec(shape=(None, self.units))
+    self.implementation = implementation
+    self.state_size = self.units
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_dim = input_shape[2]
-    self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
-
-    self.states = [None]
-    if self.stateful:
-      self.reset_states()
-
+    input_dim = input_shape[-1]
     self.kernel = self.add_weight(
-        shape=(self.input_dim, self.units * 3),
+        shape=(input_dim, self.units * 3),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
@@ -792,89 +1286,83 @@ class GRU(Recurrent):
       self.bias_h = None
     self.built = True
 
-  def preprocess_input(self, inputs, training=None):
-    if self.implementation == 0:
-      input_shape = inputs.get_shape().as_list()
-      input_dim = input_shape[2]
-      timesteps = input_shape[1]
-
-      x_z = _time_distributed_dense(
-          inputs,
-          self.kernel_z,
-          self.bias_z,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      x_r = _time_distributed_dense(
-          inputs,
-          self.kernel_r,
-          self.bias_r,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      x_h = _time_distributed_dense(
-          inputs,
-          self.kernel_h,
-          self.bias_h,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      return K.concatenate([x_z, x_r, x_h], axis=2)
-    else:
-      return inputs
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation != 0 and 0 < self.dropout < 1:
-      input_shape = K.int_shape(inputs)
-      input_dim = input_shape[-1]
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, int(input_dim)))
+  def _generate_dropout_mask(self, inputs, training=None):
+    if 0 < self.dropout < 1:
+      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
 
       def dropped_inputs():
         return K.dropout(ones, self.dropout)
 
-      dp_mask = [
+      self._dropout_mask = [
           K.in_train_phase(dropped_inputs, ones, training=training)
           for _ in range(3)
       ]
-      constants.append(dp_mask)
     else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(3)])
+      self._dropout_mask = None
 
+  def _generate_recurrent_dropout_mask(self, inputs, training=None):
     if 0 < self.recurrent_dropout < 1:
       ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
       ones = K.tile(ones, (1, self.units))
 
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
 
-      rec_dp_mask = [
+      self._recurrent_dropout_mask = [
           K.in_train_phase(dropped_inputs, ones, training=training)
           for _ in range(3)
       ]
-      constants.append(rec_dp_mask)
     else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(3)])
-    return constants
+      self._recurrent_dropout_mask = None
 
-  def step(self, inputs, states):
+  def call(self, inputs, states, training=None):
     h_tm1 = states[0]  # previous memory
-    dp_mask = states[1]  # dropout matrices for recurrent units
-    rec_dp_mask = states[2]
 
-    if self.implementation == 2:
-      matrix_x = K.dot(inputs * dp_mask[0], self.kernel)
+    # dropout matrices for input units
+    dp_mask = self._dropout_mask
+    # dropout matrices for recurrent units
+    rec_dp_mask = self._recurrent_dropout_mask
+
+    if self.implementation == 1:
+      if 0. < self.dropout < 1.:
+        inputs_z = inputs * dp_mask[0]
+        inputs_r = inputs * dp_mask[1]
+        inputs_h = inputs * dp_mask[2]
+      else:
+        inputs_z = inputs
+        inputs_r = inputs
+        inputs_h = inputs
+      x_z = K.dot(inputs_z, self.kernel_z)
+      x_r = K.dot(inputs_r, self.kernel_r)
+      x_h = K.dot(inputs_h, self.kernel_h)
+      if self.use_bias:
+        x_z = K.bias_add(x_z, self.bias_z)
+        x_r = K.bias_add(x_r, self.bias_r)
+        x_h = K.bias_add(x_h, self.bias_h)
+
+      if 0. < self.recurrent_dropout < 1.:
+        h_tm1_z = h_tm1 * rec_dp_mask[0]
+        h_tm1_r = h_tm1 * rec_dp_mask[1]
+        h_tm1_h = h_tm1 * rec_dp_mask[2]
+      else:
+        h_tm1_z = h_tm1
+        h_tm1_r = h_tm1
+        h_tm1_h = h_tm1
+      z = self.recurrent_activation(
+          x_z + K.dot(h_tm1_z, self.recurrent_kernel_z))
+      r = self.recurrent_activation(
+          x_r + K.dot(h_tm1_r, self.recurrent_kernel_r))
+
+      hh = self.activation(x_h + K.dot(r * h_tm1_h, self.recurrent_kernel_h))
+    else:
+      if 0. < self.dropout < 1.:
+        inputs *= dp_mask[0]
+      matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
         matrix_x = K.bias_add(matrix_x, self.bias)
-      matrix_inner = K.dot(h_tm1 * rec_dp_mask[0],
-                           self.recurrent_kernel[:, :2 * self.units])
+      if 0. < self.recurrent_dropout < 1.:
+        h_tm1 *= rec_dp_mask[0]
+      matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
 
       x_z = matrix_x[:, :self.units]
       x_r = matrix_x[:, self.units:2 * self.units]
@@ -885,36 +1373,220 @@ class GRU(Recurrent):
       r = self.recurrent_activation(x_r + recurrent_r)
 
       x_h = matrix_x[:, 2 * self.units:]
-      recurrent_h = K.dot(r * h_tm1 * rec_dp_mask[0],
-                          self.recurrent_kernel[:, 2 * self.units:])
+      recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
       hh = self.activation(x_h + recurrent_h)
-    else:
-      if self.implementation == 0:
-        x_z = inputs[:, :self.units]
-        x_r = inputs[:, self.units:2 * self.units]
-        x_h = inputs[:, 2 * self.units:]
-      elif self.implementation == 1:
-        x_z = K.dot(inputs * dp_mask[0], self.kernel_z)
-        x_r = K.dot(inputs * dp_mask[1], self.kernel_r)
-        x_h = K.dot(inputs * dp_mask[2], self.kernel_h)
-        if self.use_bias:
-          x_z = K.bias_add(x_z, self.bias_z)
-          x_r = K.bias_add(x_r, self.bias_r)
-          x_h = K.bias_add(x_h, self.bias_h)
-      else:
-        raise ValueError('Unknown `implementation` mode.')
-      z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
-                                                self.recurrent_kernel_z))
-      r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
-                                                self.recurrent_kernel_r))
-
-      hh = self.activation(x_h + K.dot(r * h_tm1 * rec_dp_mask[2],
-                                       self.recurrent_kernel_h))
     h = z * h_tm1 + (1 - z) * hh
     if 0 < self.dropout + self.recurrent_dropout:
-      h._uses_learning_phase = True
+      if training is None:
+        h._uses_learning_phase = True
     return h, [h]
 
+
+class GRU(RNN):
+  # pylint: disable=line-too-long
+  """Gated Recurrent Unit - Cho et al.
+
+  2014.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use
+          (see [activations](../activations.md)).
+          If you pass None, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step
+          (see [activations](../activations.md)).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs.
+          (see [initializers](../initializers.md)).
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state.
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation").
+          (see [regularizer](../regularizers.md)).
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+      implementation: Implementation mode, either 1 or 2.
+          Mode 1 will structure its operations as a larger number of
+          smaller dot products and additions, whereas mode 2 will
+          batch them into fewer, larger operations. These modes will
+          have different performance profiles on different hardware and
+          for different applications.
+      return_sequences: Boolean. Whether to return the last output.
+          in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards and return the
+          reversed sequence.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+
+  References:
+      - [On the Properties of Neural Machine Translation: Encoder-Decoder Approaches](https://arxiv.org/abs/1409.1259)
+      - [Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling](http://arxiv.org/abs/1412.3555v1)
+      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
+  """
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               **kwargs):
+    if implementation == 0:
+      logging.warning('`implementation=0` has been deprecated, '
+                      'and now defaults to `implementation=1`.'
+                      'Please update your layer call.')
+    cell = GRUCell(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation)
+    super(GRU, self).__init__(
+        cell,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    self.cell._generate_dropout_mask(inputs, training=training)
+    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    return super(GRU, self).call(
+        inputs, mask=mask, training=training, initial_state=initial_state)
+
+  @property
+  def units(self):
+    return self.cell.units
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def recurrent_activation(self):
+    return self.cell.recurrent_activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
+
+  @property
+  def implementation(self):
+    return self.cell.implementation
+
   def get_config(self):
     config = {
         'units': self.units,
@@ -937,64 +1609,75 @@ class GRU(Recurrent):
             constraints.serialize(self.recurrent_constraint),
         'bias_constraint': constraints.serialize(self.bias_constraint),
         'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout
+        'recurrent_dropout': self.recurrent_dropout,
+        'implementation': self.implementation
     }
     base_config = super(GRU, self).get_config()
+    del base_config['cell']
     return dict(list(base_config.items()) + list(config.items()))
 
+  @classmethod
+  def from_config(cls, config):
+    if 'implementation' in config and config['implementation'] == 0:
+      config['implementation'] = 1
+    return cls(**config)
 
-class LSTM(Recurrent):
-  """Long-Short Term Memory unit - Hochreiter 1997.
 
-  For a step-by-step description of the algorithm, see
-  [this tutorial](http://deeplearning.net/tutorial/lstm.html).
+class LSTMCell(Layer):
+  """Cell class for the LSTM layer.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
-      activation: Activation function to use.
+      activation: Activation function to use
+          (see [activations](../activations.md)).
           If you pass None, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
       recurrent_activation: Activation function to use
-          for the recurrent step.
+          for the recurrent step
+          (see [activations](../activations.md)).
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
+          (see [initializers](../initializers.md)).
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
-      bias_initializer: Initializer for the bias vector.
+          used for the linear transformation of the recurrent state.
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Setting it to true will also force `bias_initializer="zeros"`.
           This is recommended in [Jozefowicz et
             al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
-          the `kernel` weights matrix.
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
       recurrent_regularizer: Regularizer function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
+          the `recurrent_kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
       kernel_constraint: Constraint function applied to
-          the `kernel` weights matrix.
+          the `kernel` weights matrix
+          (see [constraints](../constraints.md)).
       recurrent_constraint: Constraint function applied to
-          the `recurrent_kernel` weights matrix.
-      bias_constraint: Constraint function applied to the bias vector.
+          the `recurrent_kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-
-  References:
-      - [Long short-term
-        memory]((http://www.bioinf.jku.at/publications/older/2604.pdf)
-        (original 1997 paper)
-      - [Supervised sequence labeling with recurrent neural
-        networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
-      - [A Theoretically Grounded Application of Dropout in Recurrent Neural
-        Networks](http://arxiv.org/abs/1512.05287)
+      implementation: Implementation mode, either 1 or 2.
+          Mode 1 will structure its operations as a larger number of
+          smaller dot products and additions, whereas mode 2 will
+          batch them into fewer, larger operations. These modes will
+          have different performance profiles on different hardware and
+          for different applications.
   """
 
   def __init__(self,
@@ -1009,15 +1692,14 @@ class LSTM(Recurrent):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
                dropout=0.,
                recurrent_dropout=0.,
+               implementation=1,
                **kwargs):
-    super(LSTM, self).__init__(
-        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    super(LSTMCell, self).__init__(**kwargs)
     self.units = units
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
@@ -1038,25 +1720,15 @@ class LSTM(Recurrent):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = [
-        InputSpec(shape=(None, self.units)),
-        InputSpec(shape=(None, self.units))
-    ]
+    self.implementation = implementation
+    self.state_size = (self.units, self.units)
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_dim = input_shape[2]
-    self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
-
-    self.states = [None, None]
-    if self.stateful:
-      self.reset_states()
-
+    input_dim = input_shape[-1]
     self.kernel = self.add_weight(
-        shape=(self.input_dim, self.units * 4),
+        shape=(input_dim, self.units * 4),
         name='kernel',
         initializer=self.kernel_initializer,
         regularizer=self.kernel_regularizer,
@@ -1112,96 +1784,90 @@ class LSTM(Recurrent):
       self.bias_o = None
     self.built = True
 
-  def preprocess_input(self, inputs, training=None):
-    if self.implementation == 0:
-      input_shape = inputs.get_shape().as_list()
-      input_dim = input_shape[2]
-      timesteps = input_shape[1]
-
-      x_i = _time_distributed_dense(
-          inputs,
-          self.kernel_i,
-          self.bias_i,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      x_f = _time_distributed_dense(
-          inputs,
-          self.kernel_f,
-          self.bias_f,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      x_c = _time_distributed_dense(
-          inputs,
-          self.kernel_c,
-          self.bias_c,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      x_o = _time_distributed_dense(
-          inputs,
-          self.kernel_o,
-          self.bias_o,
-          self.dropout,
-          input_dim,
-          self.units,
-          timesteps,
-          training=training)
-      return K.concatenate([x_i, x_f, x_c, x_o], axis=2)
-    else:
-      return inputs
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation != 0 and 0 < self.dropout < 1:
-      input_shape = K.int_shape(inputs)
-      input_dim = input_shape[-1]
-      ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
-      ones = K.tile(ones, (1, int(input_dim)))
+  def _generate_dropout_mask(self, inputs, training=None):
+    if 0 < self.dropout < 1:
+      ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1))
 
       def dropped_inputs():
         return K.dropout(ones, self.dropout)
 
-      dp_mask = [
+      self._dropout_mask = [
           K.in_train_phase(dropped_inputs, ones, training=training)
           for _ in range(4)
       ]
-      constants.append(dp_mask)
     else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
+      self._dropout_mask = None
 
+  def _generate_recurrent_dropout_mask(self, inputs, training=None):
     if 0 < self.recurrent_dropout < 1:
       ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1)))
       ones = K.tile(ones, (1, self.units))
 
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
+      def dropped_inputs():
+        return K.dropout(ones, self.dropout)
 
-      rec_dp_mask = [
+      self._recurrent_dropout_mask = [
           K.in_train_phase(dropped_inputs, ones, training=training)
           for _ in range(4)
       ]
-      constants.append(rec_dp_mask)
     else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-    return constants
-
-  def step(self, inputs, states):
-    h_tm1 = states[0]
-    c_tm1 = states[1]
-    dp_mask = states[2]
-    rec_dp_mask = states[3]
-
-    if self.implementation == 2:
-      z = K.dot(inputs * dp_mask[0], self.kernel)
-      z += K.dot(h_tm1 * rec_dp_mask[0], self.recurrent_kernel)
+      self._recurrent_dropout_mask = None
+
+  def call(self, inputs, states, training=None):
+    # dropout matrices for input units
+    dp_mask = self._dropout_mask
+    # dropout matrices for recurrent units
+    rec_dp_mask = self._recurrent_dropout_mask
+
+    h_tm1 = states[0]  # previous memory state
+    c_tm1 = states[1]  # previous carry state
+
+    if self.implementation == 1:
+      if 0 < self.dropout < 1.:
+        inputs_i = inputs * dp_mask[0]
+        inputs_f = inputs * dp_mask[1]
+        inputs_c = inputs * dp_mask[2]
+        inputs_o = inputs * dp_mask[3]
+      else:
+        inputs_i = inputs
+        inputs_f = inputs
+        inputs_c = inputs
+        inputs_o = inputs
+      x_i = K.dot(inputs_i, self.kernel_i)
+      x_f = K.dot(inputs_f, self.kernel_f)
+      x_c = K.dot(inputs_c, self.kernel_c)
+      x_o = K.dot(inputs_o, self.kernel_o)
+      if self.use_bias:
+        x_i = K.bias_add(x_i, self.bias_i)
+        x_f = K.bias_add(x_f, self.bias_f)
+        x_c = K.bias_add(x_c, self.bias_c)
+        x_o = K.bias_add(x_o, self.bias_o)
+
+      if 0 < self.recurrent_dropout < 1.:
+        h_tm1_i = h_tm1 * rec_dp_mask[0]
+        h_tm1_f = h_tm1 * rec_dp_mask[1]
+        h_tm1_c = h_tm1 * rec_dp_mask[2]
+        h_tm1_o = h_tm1 * rec_dp_mask[3]
+      else:
+        h_tm1_i = h_tm1
+        h_tm1_f = h_tm1
+        h_tm1_c = h_tm1
+        h_tm1_o = h_tm1
+      i = self.recurrent_activation(
+          x_i + K.dot(h_tm1_i, self.recurrent_kernel_i))
+      f = self.recurrent_activation(
+          x_f + K.dot(h_tm1_f, self.recurrent_kernel_f))
+      c = f * c_tm1 + i * self.activation(
+          x_c + K.dot(h_tm1_c, self.recurrent_kernel_c))
+      o = self.recurrent_activation(
+          x_o + K.dot(h_tm1_o, self.recurrent_kernel_o))
+    else:
+      if 0. < self.dropout < 1.:
+        inputs *= dp_mask[0]
+      z = K.dot(inputs, self.kernel)
+      if 0. < self.recurrent_dropout < 1.:
+        h_tm1 *= rec_dp_mask[0]
+      z += K.dot(h_tm1, self.recurrent_kernel)
       if self.use_bias:
         z = K.bias_add(z, self.bias)
 
@@ -1214,33 +1880,229 @@ class LSTM(Recurrent):
       f = self.recurrent_activation(z1)
       c = f * c_tm1 + i * self.activation(z2)
       o = self.recurrent_activation(z3)
-    else:
-      if self.implementation == 0:
-        x_i = inputs[:, :self.units]
-        x_f = inputs[:, self.units:2 * self.units]
-        x_c = inputs[:, 2 * self.units:3 * self.units]
-        x_o = inputs[:, 3 * self.units:]
-      elif self.implementation == 1:
-        x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
-        x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
-        x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
-        x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o
-      else:
-        raise ValueError('Unknown `implementation` mode.')
 
-      i = self.recurrent_activation(x_i + K.dot(h_tm1 * rec_dp_mask[0],
-                                                self.recurrent_kernel_i))
-      f = self.recurrent_activation(x_f + K.dot(h_tm1 * rec_dp_mask[1],
-                                                self.recurrent_kernel_f))
-      c = f * c_tm1 + i * self.activation(
-          x_c + K.dot(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c))
-      o = self.recurrent_activation(x_o + K.dot(h_tm1 * rec_dp_mask[3],
-                                                self.recurrent_kernel_o))
     h = o * self.activation(c)
     if 0 < self.dropout + self.recurrent_dropout:
-      h._uses_learning_phase = True
+      if training is None:
+        h._uses_learning_phase = True
     return h, [h, c]
 
+
+class LSTM(RNN):
+  # pylint: disable=line-too-long
+  """Long-Short Term Memory layer - Hochreiter 1997.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use
+          (see [activations](../activations.md)).
+          If you pass None, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step
+          (see [activations](../activations.md)).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs.
+          (see [initializers](../initializers.md)).
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state.
+          (see [initializers](../initializers.md)).
+      bias_initializer: Initializer for the bias vector
+          (see [initializers](../initializers.md)).
+      unit_forget_bias: Boolean.
+          If True, add 1 to the bias of the forget gate at initialization.
+          Setting it to true will also force `bias_initializer="zeros"`.
+          This is recommended in [Jozefowicz et
+            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix
+          (see [regularizer](../regularizers.md)).
+      bias_regularizer: Regularizer function applied to the bias vector
+          (see [regularizer](../regularizers.md)).
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation").
+          (see [regularizer](../regularizers.md)).
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix
+          (see [constraints](../constraints.md)).
+      bias_constraint: Constraint function applied to the bias vector
+          (see [constraints](../constraints.md)).
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+      implementation: Implementation mode, either 1 or 2.
+          Mode 1 will structure its operations as a larger number of
+          smaller dot products and additions, whereas mode 2 will
+          batch them into fewer, larger operations. These modes will
+          have different performance profiles on different hardware and
+          for different applications.
+      return_sequences: Boolean. Whether to return the last output.
+          in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards and return the
+          reversed sequence.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+
+  References:
+      - [Long short-term memory](http://www.bioinf.jku.at/publications/older/2604.pdf)
+      - [Learning to forget: Continual prediction with LSTM](http://www.mitpressjournals.org/doi/pdf/10.1162/089976600300015015)
+      - [Supervised sequence labeling with recurrent neural networks](http://www.cs.toronto.edu/~graves/preprint.pdf)
+      - [A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](http://arxiv.org/abs/1512.05287)
+  """
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               **kwargs):
+    if implementation == 0:
+      logging.warning('`implementation=0` has been deprecated, '
+                      'and now defaults to `implementation=1`.'
+                      'Please update your layer call.')
+    cell = LSTMCell(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        unit_forget_bias=unit_forget_bias,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation)
+    super(LSTM, self).__init__(
+        cell,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    self.cell._generate_dropout_mask(inputs, training=training)
+    self.cell._generate_recurrent_dropout_mask(inputs, training=training)
+    return super(LSTM, self).call(
+        inputs, mask=mask, training=training, initial_state=initial_state)
+
+  @property
+  def units(self):
+    return self.cell.units
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def recurrent_activation(self):
+    return self.cell.recurrent_activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def unit_forget_bias(self):
+    return self.cell.unit_forget_bias
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
+
+  @property
+  def implementation(self):
+    return self.cell.implementation
+
   def get_config(self):
     config = {
         'units': self.units,
@@ -1264,7 +2126,347 @@ class LSTM(Recurrent):
             constraints.serialize(self.recurrent_constraint),
         'bias_constraint': constraints.serialize(self.bias_constraint),
         'dropout': self.dropout,
-        'recurrent_dropout': self.recurrent_dropout
+        'recurrent_dropout': self.recurrent_dropout,
+        'implementation': self.implementation
     }
     base_config = super(LSTM, self).get_config()
+    del base_config['cell']
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    if 'implementation' in config and config['implementation'] == 0:
+      config['implementation'] = 1
+    return cls(**config)
+
+
+class Recurrent(Layer):
+  """Deprecated abstract base class for recurrent layers.
+
+  It still exists because it is leveraged by the convolutional-recurrent layers.
+  It will be removed entirely in the future.
+  It was never part of the public API.
+  Do not use.
+
+  Arguments:
+      weights: list of Numpy arrays to set as initial weights.
+          The list should have 3 elements, of shapes:
+          `[(input_dim, output_dim), (output_dim, output_dim), (output_dim,)]`.
+      return_sequences: Boolean. Whether to return the last output
+          in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards and return the
+          reversed sequence.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+      implementation: one of {0, 1, or 2}.
+          If set to 0, the RNN will use
+          an implementation that uses fewer, larger matrix products,
+          thus running faster on CPU but consuming more memory.
+          If set to 1, the RNN will use more matrix products,
+          but smaller ones, thus running slower
+          (may actually be faster on GPU) while consuming less memory.
+          If set to 2 (LSTM/GRU only),
+          the RNN will combine the input gate,
+          the forget gate and the output gate into a single matrix,
+          enabling more time-efficient parallelization on the GPU.
+          Note: RNN dropout must be shared for all gates,
+          resulting in a slightly reduced regularization.
+      input_dim: dimensionality of the input (integer).
+          This argument (or alternatively, the keyword argument `input_shape`)
+          is required when using this layer as the first layer in a model.
+      input_length: Length of input sequences, to be specified
+          when it is constant.
+          This argument is required if you are going to connect
+          `Flatten` then `Dense` layers upstream
+          (without it, the shape of the dense outputs cannot be computed).
+          Note that if the recurrent layer is not the first layer
+          in your model, you would need to specify the input length
+          at the level of the first layer
+          (e.g. via the `input_shape` argument)
+
+  Input shape:
+      3D tensor with shape `(batch_size, timesteps, input_dim)`,
+      (Optional) 2D tensors with shape `(batch_size, output_dim)`.
+
+  Output shape:
+      - if `return_state`: a list of tensors. The first tensor is
+          the output. The remaining tensors are the last states,
+          each with shape `(batch_size, units)`.
+      - if `return_sequences`: 3D tensor with shape
+          `(batch_size, timesteps, units)`.
+      - else, 2D tensor with shape `(batch_size, units)`.
+
+  # Masking
+      This layer supports masking for input data with a variable number
+      of timesteps. To introduce masks to your data,
+      use an `Embedding` layer with the `mask_zero` parameter
+      set to `True`.
+
+  # Note on using statefulness in RNNs
+      You can set RNN layers to be 'stateful', which means that the states
+      computed for the samples in one batch will be reused as initial states
+      for the samples in the next batch. This assumes a one-to-one mapping
+      between samples in different successive batches.
+
+      To enable statefulness:
+          - specify `stateful=True` in the layer constructor.
+          - specify a fixed batch size for your model, by passing
+              if sequential model:
+                `batch_input_shape=(...)` to the first layer in your model.
+              else for functional model with 1 or more Input layers:
+                `batch_shape=(...)` to all the first layers in your model.
+              This is the expected shape of your inputs
+              *including the batch size*.
+              It should be a tuple of integers, e.g. `(32, 10, 100)`.
+          - specify `shuffle=False` when calling fit().
+
+      To reset the states of your model, call `.reset_states()` on either
+      a specific layer, or on your entire model.
+
+  # Note on specifying the initial state of RNNs
+      You can specify the initial state of RNN layers symbolically by
+      calling them with the keyword argument `initial_state`. The value of
+      `initial_state` should be a tensor or list of tensors representing
+      the initial state of the RNN layer.
+
+      You can specify the initial state of RNN layers numerically by
+      calling `reset_states` with the keyword argument `states`. The value of
+      `states` should be a numpy array or list of numpy arrays representing
+      the initial state of the RNN layer.
+  """
+
+  def __init__(self,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               implementation=0,
+               **kwargs):
+    super(Recurrent, self).__init__(**kwargs)
+    self.return_sequences = return_sequences
+    self.return_state = return_state
+    self.go_backwards = go_backwards
+    self.stateful = stateful
+    self.unroll = unroll
+    self.implementation = implementation
+    self.supports_masking = True
+    self.input_spec = [InputSpec(ndim=3)]
+    self.state_spec = None
+    self.dropout = 0
+    self.recurrent_dropout = 0
+
+  def _compute_output_shape(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.return_sequences:
+      output_shape = (input_shape[0], input_shape[1], self.units)
+    else:
+      output_shape = (input_shape[0], self.units)
+
+    if self.return_state:
+      state_shape = [tensor_shape.TensorShape(
+          (input_shape[0], self.units)) for _ in self.states]
+      return [tensor_shape.TensorShape(output_shape)] + state_shape
+    return tensor_shape.TensorShape(output_shape)
+
+  def compute_mask(self, inputs, mask):
+    if isinstance(mask, list):
+      mask = mask[0]
+    output_mask = mask if self.return_sequences else None
+    if self.return_state:
+      state_mask = [None for _ in self.states]
+      return [output_mask] + state_mask
+    return output_mask
+
+  def step(self, inputs, states):
+    raise NotImplementedError
+
+  def get_constants(self, inputs, training=None):
+    return []
+
+  def get_initial_state(self, inputs):
+    # build an all-zero tensor of shape (samples, output_dim)
+    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
+    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
+    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = K.tile(initial_state, [1,
+                                           self.units])  # (samples, output_dim)
+    initial_state = [initial_state for _ in range(len(self.states))]
+    return initial_state
+
+  def preprocess_input(self, inputs, training=None):
+    return inputs
+
+  def __call__(self, inputs, initial_state=None, **kwargs):
+    if (isinstance(inputs, (list, tuple)) and
+        len(inputs) > 1
+        and initial_state is None):
+      initial_state = inputs[1:]
+      inputs = inputs[0]
+
+    # If `initial_state` is specified,
+    # and if it a Keras tensor,
+    # then add it to the inputs and temporarily
+    # modify the input spec to include the state.
+    if initial_state is None:
+      return super(Recurrent, self).__call__(inputs, **kwargs)
+
+    if not isinstance(initial_state, (list, tuple)):
+      initial_state = [initial_state]
+
+    is_keras_tensor = hasattr(initial_state[0], '_keras_history')
+    for tensor in initial_state:
+      if hasattr(tensor, '_keras_history') != is_keras_tensor:
+        raise ValueError('The initial state of an RNN layer cannot be'
+                         ' specified with a mix of Keras tensors and'
+                         ' non-Keras tensors')
+
+    if is_keras_tensor:
+      # Compute the full input spec, including state
+      input_spec = self.input_spec
+      state_spec = self.state_spec
+      if not isinstance(input_spec, list):
+        input_spec = [input_spec]
+      if not isinstance(state_spec, list):
+        state_spec = [state_spec]
+      self.input_spec = input_spec + state_spec
+
+      # Compute the full inputs, including state
+      inputs = [inputs] + list(initial_state)
+
+      # Perform the call
+      output = super(Recurrent, self).__call__(inputs, **kwargs)
+
+      # Restore original input spec
+      self.input_spec = input_spec
+      return output
+    else:
+      kwargs['initial_state'] = initial_state
+      return super(Recurrent, self).__call__(inputs, **kwargs)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # input shape: `(samples, time (padded with zeros), input_dim)`
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      initial_state = inputs[1:]
+      inputs = inputs[0]
+    elif initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
+                       ' initial states.')
+    input_shape = K.int_shape(inputs)
+    if self.unroll and input_shape[1] is None:
+      raise ValueError('Cannot unroll a RNN if the '
+                       'time dimension is undefined. \n'
+                       '- If using a Sequential model, '
+                       'specify the time dimension by passing '
+                       'an `input_shape` or `batch_input_shape` '
+                       'argument to your first layer. If your '
+                       'first layer is an Embedding, you can '
+                       'also use the `input_length` argument.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a `shape` '
+                       'or `batch_shape` argument to your Input layer.')
+    constants = self.get_constants(inputs, training=None)
+    preprocessed_input = self.preprocess_input(inputs, training=None)
+    last_output, outputs, states = K.rnn(
+        self.step,
+        preprocessed_input,
+        initial_state,
+        go_backwards=self.go_backwards,
+        mask=mask,
+        constants=constants,
+        unroll=self.unroll)
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append((self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    # Properly set learning phase
+    if 0 < self.dropout + self.recurrent_dropout:
+      last_output._uses_learning_phase = True
+      outputs._uses_learning_phase = True
+
+    if not self.return_sequences:
+      outputs = last_output
+
+    if self.return_state:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      else:
+        states = list(states)
+      return [outputs] + states
+    return outputs
+
+  def reset_states(self, states=None):
+    if not self.stateful:
+      raise AttributeError('Layer must be stateful.')
+    batch_size = self.input_spec[0].shape[0]
+    if not batch_size:
+      raise ValueError('If a RNN is stateful, it needs to know '
+                       'its batch size. Specify the batch size '
+                       'of your input tensors: \n'
+                       '- If using a Sequential model, '
+                       'specify the batch size by passing '
+                       'a `batch_input_shape` '
+                       'argument to your first layer.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a '
+                       '`batch_shape` argument to your Input layer.')
+    # initialize state if None
+    if self.states[0] is None:
+      self.states = [K.zeros((batch_size, self.units)) for _ in self.states]
+    elif states is None:
+      for state in self.states:
+        K.set_value(state, np.zeros((batch_size, self.units)))
+    else:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      if len(states) != len(self.states):
+        raise ValueError('Layer ' + self.name + ' expects ' +
+                         str(len(self.states)) + ' states, '
+                         'but it received ' + str(len(states)) +
+                         ' state values. Input received: ' + str(states))
+      for index, (value, state) in enumerate(zip(states, self.states)):
+        if value.shape != (batch_size, self.units):
+          raise ValueError('State ' + str(index) +
+                           ' is incompatible with layer ' + self.name +
+                           ': expected shape=' + str((batch_size, self.units)) +
+                           ', found shape=' + str(value.shape))
+        K.set_value(state, value)
+
+  def get_config(self):
+    config = {
+        'return_sequences': self.return_sequences,
+        'return_state': self.return_state,
+        'go_backwards': self.go_backwards,
+        'stateful': self.stateful,
+        'unroll': self.unroll,
+        'implementation': self.implementation
+    }
+    base_config = super(Recurrent, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc4c1db9b4b71775bd3c52a863752b34d9dc3ea
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -0,0 +1,397 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for recurrent layers functionality other than GRU, LSTM, SimpleRNN.
+
+See also: lstm_test.py, gru_test.py, simplernn_test.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class RNNTest(test.TestCase):
+
+  def test_minimal_rnn_cell_non_layer(self):
+
+    class MinimalRNNCell(object):
+
+      def __init__(self, units, input_dim):
+        self.units = units
+        self.state_size = units
+        self.kernel = keras.backend.variable(
+            np.random.random((input_dim, units)))
+
+      def call(self, inputs, states):
+        prev_output = states[0]
+        output = keras.backend.dot(inputs, self.kernel) + prev_output
+        return output, [output]
+
+    with self.test_session():
+      # Basic test case.
+      cell = MinimalRNNCell(32, 5)
+      x = keras.Input((None, 5))
+      layer = keras.layers.RNN(cell)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+      # Test stacking.
+      cells = [MinimalRNNCell(8, 5),
+               MinimalRNNCell(32, 8),
+               MinimalRNNCell(32, 32)]
+      layer = keras.layers.RNN(cells)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+  def test_minimal_rnn_cell_non_layer_multiple_states(self):
+
+    class MinimalRNNCell(object):
+
+      def __init__(self, units, input_dim):
+        self.units = units
+        self.state_size = (units, units)
+        self.kernel = keras.backend.variable(
+            np.random.random((input_dim, units)))
+
+      def call(self, inputs, states):
+        prev_output_1 = states[0]
+        prev_output_2 = states[1]
+        output = keras.backend.dot(inputs, self.kernel)
+        output += prev_output_1
+        output -= prev_output_2
+        return output, [output * 2, output * 3]
+
+    with self.test_session():
+      # Basic test case.
+      cell = MinimalRNNCell(32, 5)
+      x = keras.Input((None, 5))
+      layer = keras.layers.RNN(cell)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+      # Test stacking.
+      cells = [MinimalRNNCell(8, 5),
+               MinimalRNNCell(16, 8),
+               MinimalRNNCell(32, 16)]
+      layer = keras.layers.RNN(cells)
+      assert layer.cell.state_size == (32, 32, 16, 16, 8, 8)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+  def test_minimal_rnn_cell_layer(self):
+
+    class MinimalRNNCell(keras.layers.Layer):
+
+      def __init__(self, units, **kwargs):
+        self.units = units
+        self.state_size = units
+        super(MinimalRNNCell, self).__init__(**kwargs)
+
+      def build(self, input_shape):
+        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
+                                      initializer='uniform',
+                                      name='kernel')
+        self.recurrent_kernel = self.add_weight(
+            shape=(self.units, self.units),
+            initializer='uniform',
+            name='recurrent_kernel')
+        self.built = True
+
+      def call(self, inputs, states):
+        prev_output = states[0]
+        h = keras.backend.dot(inputs, self.kernel)
+        output = h + keras.backend.dot(prev_output, self.recurrent_kernel)
+        return output, [output]
+
+      def get_config(self):
+        config = {'units': self.units}
+        base_config = super(MinimalRNNCell, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    with self.test_session():
+      # Test basic case.
+      x = keras.Input((None, 5))
+      cell = MinimalRNNCell(32)
+      layer = keras.layers.RNN(cell)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+      # Test basic case serialization.
+      x_np = np.random.random((6, 5, 5))
+      y_np = model.predict(x_np)
+      weights = model.get_weights()
+      config = layer.get_config()
+      with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
+        layer = keras.layers.RNN.from_config(config)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.set_weights(weights)
+      y_np_2 = model.predict(x_np)
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+      # Test stacking.
+      cells = [MinimalRNNCell(8),
+               MinimalRNNCell(12),
+               MinimalRNNCell(32)]
+      layer = keras.layers.RNN(cells)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
+
+      # Test stacked RNN serialization.
+      x_np = np.random.random((6, 5, 5))
+      y_np = model.predict(x_np)
+      weights = model.get_weights()
+      config = layer.get_config()
+      with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}):
+        layer = keras.layers.RNN.from_config(config)
+      y = layer(x)
+      model = keras.models.Model(x, y)
+      model.set_weights(weights)
+      y_np_2 = model.predict(x_np)
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+  def test_rnn_cell_with_constants_layer(self):
+
+    class RNNCellWithConstants(keras.layers.Layer):
+
+      def __init__(self, units, **kwargs):
+        self.units = units
+        self.state_size = units
+        super(RNNCellWithConstants, self).__init__(**kwargs)
+
+      def build(self, input_shape):
+        if not isinstance(input_shape, list):
+          raise TypeError('expects constants shape')
+        [input_shape, constant_shape] = input_shape
+        # will (and should) raise if more than one constant passed
+
+        self.input_kernel = self.add_weight(
+            shape=(input_shape[-1], self.units),
+            initializer='uniform',
+            name='kernel')
+        self.recurrent_kernel = self.add_weight(
+            shape=(self.units, self.units),
+            initializer='uniform',
+            name='recurrent_kernel')
+        self.constant_kernel = self.add_weight(
+            shape=(constant_shape[-1], self.units),
+            initializer='uniform',
+            name='constant_kernel')
+        self.built = True
+
+      def call(self, inputs, states, constants):
+        [prev_output] = states
+        [constant] = constants
+        h_input = keras.backend.dot(inputs, self.input_kernel)
+        h_state = keras.backend.dot(prev_output, self.recurrent_kernel)
+        h_const = keras.backend.dot(constant, self.constant_kernel)
+        output = h_input + h_state + h_const
+        return output, [output]
+
+      def get_config(self):
+        config = {'units': self.units}
+        base_config = super(RNNCellWithConstants, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    with self.test_session():
+      # Test basic case.
+      x = keras.Input((None, 5))
+      c = keras.Input((3,))
+      cell = RNNCellWithConstants(32)
+      layer = keras.layers.RNN(cell)
+      y = layer(x, constants=c)
+      model = keras.models.Model([x, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+          np.zeros((6, 32))
+      )
+
+    with self.test_session():
+      # Test basic case serialization.
+      x_np = np.random.random((6, 5, 5))
+      c_np = np.random.random((6, 3))
+      y_np = model.predict([x_np, c_np])
+      weights = model.get_weights()
+      config = layer.get_config()
+      custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.RNN.from_config(config.copy())
+      y = layer(x, constants=c)
+      model = keras.models.Model([x, c], y)
+      model.set_weights(weights)
+      y_np_2 = model.predict([x_np, c_np])
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+    with self.test_session():
+      # test flat list inputs
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.RNN.from_config(config.copy())
+      y = layer([x, c])
+      model = keras.models.Model([x, c], y)
+      model.set_weights(weights)
+      y_np_3 = model.predict([x_np, c_np])
+      self.assertAllClose(y_np, y_np_3, atol=1e-4)
+
+  def test_rnn_cell_with_constants_layer_passing_initial_state(self):
+
+    class RNNCellWithConstants(keras.layers.Layer):
+
+      def __init__(self, units, **kwargs):
+        self.units = units
+        self.state_size = units
+        super(RNNCellWithConstants, self).__init__(**kwargs)
+
+      def build(self, input_shape):
+        if not isinstance(input_shape, list):
+          raise TypeError('expects constants shape')
+        [input_shape, constant_shape] = input_shape
+        # will (and should) raise if more than one constant passed
+
+        self.input_kernel = self.add_weight(
+            shape=(input_shape[-1], self.units),
+            initializer='uniform',
+            name='kernel')
+        self.recurrent_kernel = self.add_weight(
+            shape=(self.units, self.units),
+            initializer='uniform',
+            name='recurrent_kernel')
+        self.constant_kernel = self.add_weight(
+            shape=(constant_shape[-1], self.units),
+            initializer='uniform',
+            name='constant_kernel')
+        self.built = True
+
+      def call(self, inputs, states, constants):
+        [prev_output] = states
+        [constant] = constants
+        h_input = keras.backend.dot(inputs, self.input_kernel)
+        h_state = keras.backend.dot(prev_output, self.recurrent_kernel)
+        h_const = keras.backend.dot(constant, self.constant_kernel)
+        output = h_input + h_state + h_const
+        return output, [output]
+
+      def get_config(self):
+        config = {'units': self.units}
+        base_config = super(RNNCellWithConstants, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    with self.test_session():
+      # Test basic case.
+      x = keras.Input((None, 5))
+      c = keras.Input((3,))
+      s = keras.Input((32,))
+      cell = RNNCellWithConstants(32)
+      layer = keras.layers.RNN(cell)
+      y = layer(x, initial_state=s, constants=c)
+      model = keras.models.Model([x, s, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
+          np.zeros((6, 32))
+      )
+
+    with self.test_session():
+      # Test basic case serialization.
+      x_np = np.random.random((6, 5, 5))
+      s_np = np.random.random((6, 32))
+      c_np = np.random.random((6, 3))
+      y_np = model.predict([x_np, s_np, c_np])
+      weights = model.get_weights()
+      config = layer.get_config()
+      custom_objects = {'RNNCellWithConstants': RNNCellWithConstants}
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.RNN.from_config(config.copy())
+      y = layer(x, initial_state=s, constants=c)
+      model = keras.models.Model([x, s, c], y)
+      model.set_weights(weights)
+      y_np_2 = model.predict([x_np, s_np, c_np])
+      self.assertAllClose(y_np, y_np_2, atol=1e-4)
+
+      # verify that state is used
+      y_np_2_different_s = model.predict([x_np, s_np + 10., c_np])
+      with self.assertRaises(AssertionError):
+        self.assertAllClose(y_np, y_np_2_different_s, atol=1e-4)
+
+    with self.test_session():
+      # test flat list inputs
+      with keras.utils.CustomObjectScope(custom_objects):
+        layer = keras.layers.RNN.from_config(config.copy())
+      y = layer([x, s, c])
+      model = keras.models.Model([x, s, c], y)
+      model.set_weights(weights)
+      y_np_3 = model.predict([x_np, s_np, c_np])
+      self.assertAllClose(y_np, y_np_3, atol=1e-4)
+
+  def test_stacked_rnn_attributes(self):
+    cells = [keras.layers.LSTMCell(3),
+             keras.layers.LSTMCell(3, kernel_regularizer='l2')]
+    layer = keras.layers.RNN(cells)
+    layer.build((None, None, 5))
+
+    # Test regularization losses
+    self.assertEqual(len(layer.losses), 1)
+
+    # Test weights
+    self.assertEqual(len(layer.trainable_weights), 6)
+    cells[0].trainable = False
+    self.assertEqual(len(layer.trainable_weights), 3)
+    self.assertEqual(len(layer.non_trainable_weights), 3)
+
+    # Test `get_losses_for`
+    x = keras.Input((None, 5))
+    y = keras.backend.sum(x)
+    cells[0].add_loss(y, inputs=x)
+    self.assertEqual(layer.get_losses_for(x), [y])
+
+  def test_rnn_dynamic_trainability(self):
+    layer_class = keras.layers.SimpleRNN
+    embedding_dim = 4
+    units = 3
+
+    layer = layer_class(units)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(len(layer.weights), 3)
+    self.assertEqual(len(layer.trainable_weights), 3)
+    self.assertEqual(len(layer.non_trainable_weights), 0)
+    layer.trainable = False
+    self.assertEqual(len(layer.weights), 3)
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.non_trainable_weights), 3)
+    layer.trainable = True
+    self.assertEqual(len(layer.weights), 3)
+    self.assertEqual(len(layer.trainable_weights), 3)
+    self.assertEqual(len(layer.non_trainable_weights), 0)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py b/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
index 9833485236b68095402cc2921ba7050591d44a55..7edebdacd07d74fe6b5a982d12645fb5556bdf75 100644
--- a/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/simplernn_test.py
@@ -156,8 +156,10 @@ class SimpleRNNLayerTest(test.TestCase):
           activity_regularizer='l1')
       layer.build((None, None, 2))
       self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((2, 3, 2))))
-      self.assertEqual(len(layer.losses), 4)
+
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
   def test_constraints_SimpleRNN(self):
     embedding_dim = 4
@@ -175,9 +177,9 @@ class SimpleRNNLayerTest(test.TestCase):
           recurrent_constraint=r_constraint,
           bias_constraint=b_constraint)
       layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+      self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
   def test_with_masking_layer_SimpleRNN(self):
     layer_class = keras.layers.SimpleRNN
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index a0cca9dc2fccd3475d117d53d2e93099eae8ae44..aefa5a1c020b490991708056d609ae1efa8d4a9a 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -26,7 +26,7 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.layers import utils as tf_layers_util
 
 
 class Wrapper(Layer):
@@ -77,7 +77,7 @@ class Wrapper(Layer):
     # get the updates from the inner layer.
     inner_inputs = inputs
     if inputs is not None:
-      uid = tf_base_layers._object_list_uid(inputs)
+      uid = tf_layers_util.object_list_uid(inputs)
       if uid in self._input_map:
         inner_inputs = self._input_map[uid]
 
@@ -97,10 +97,6 @@ class Wrapper(Layer):
       return losses + super(Wrapper, self).get_losses_for(None)
     return super(Wrapper, self).get_losses_for(inputs)
 
-  @property
-  def constraints(self):
-    return self.layer.constraints
-
   def get_weights(self):
     return self.layer.get_weights()
 
@@ -227,7 +223,7 @@ class TimeDistributed(Wrapper):
         input_length = K.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
-      input_uid = tf_base_layers._object_list_uid(inputs)
+      input_uid = tf_layers_util.object_list_uid(inputs)
       inputs = K.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
@@ -340,7 +336,8 @@ class Bidirectional(Wrapper):
       output = [y, y_rev]
 
     # Properly set learning phase
-    if 0 < self.layer.dropout + self.layer.recurrent_dropout:
+    if (getattr(y, '_uses_learning_phase', False) or
+        getattr(y_rev, '_uses_learning_phase', False)):
       if self.merge_mode is None:
         for out in output:
           out._uses_learning_phase = True
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 7c6b304622a3ec6995483bfafef1c865ce6520cc..19212aeee8cd4fbc723ba3e47c9d3e226ec339a9 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -22,6 +22,7 @@ import six
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 
 
 def mean_squared_error(y_true, y_pred):
@@ -91,7 +92,7 @@ def poisson(y_true, y_pred):
 def cosine_proximity(y_true, y_pred):
   y_true = K.l2_normalize(y_true, axis=-1)
   y_pred = K.l2_normalize(y_pred, axis=-1)
-  return -K.mean(y_true * y_pred, axis=-1)
+  return -K.sum(y_true * y_pred, axis=-1)
 
 
 # Aliases.
@@ -105,7 +106,7 @@ cosine = cosine_proximity
 
 
 def serialize(loss):
-  return loss.__name__
+  return serialize_keras_object(loss)
 
 
 def deserialize(name, custom_objects=None):
@@ -122,6 +123,8 @@ def get(identifier):
   if isinstance(identifier, six.string_types):
     identifier = str(identifier)
     return deserialize(identifier)
+  if isinstance(identifier, dict):
+    return deserialize(identifier)
   elif callable(identifier):
     return identifier
   else:
diff --git a/tensorflow/python/keras/_impl/keras/losses_test.py b/tensorflow/python/keras/_impl/keras/losses_test.py
index b295356ec19c28af3ca80c81f3669bd6bec005b6..1884c0fdca79801ecd7d8cd21dae8b745ed0f6b6 100644
--- a/tensorflow/python/keras/_impl/keras/losses_test.py
+++ b/tensorflow/python/keras/_impl/keras/losses_test.py
@@ -18,11 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import shutil
+
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.platform import test
 
+try:
+  import h5py  # pylint:disable=g-import-not-at-top
+except ImportError:
+  h5py = None
 
 ALL_LOSSES = [keras.losses.mean_squared_error,
               keras.losses.mean_absolute_error,
@@ -39,6 +46,20 @@ ALL_LOSSES = [keras.losses.mean_squared_error,
               keras.losses.categorical_hinge]
 
 
+class _MSEMAELoss(object):
+  """Loss function with internal state, for testing serialization code."""
+
+  def __init__(self, mse_fraction):
+    self.mse_fraction = mse_fraction
+
+  def __call__(self, y_true, y_pred):
+    return (self.mse_fraction * keras.losses.mse(y_true, y_pred) +
+            (1 - self.mse_fraction) * keras.losses.mae(y_true, y_pred))
+
+  def get_config(self):
+    return {'mse_fraction': self.mse_fraction}
+
+
 class KerasLossesTest(test.TestCase):
 
   def test_objective_shapes_3d(self):
@@ -83,6 +104,39 @@ class KerasLossesTest(test.TestCase):
     loss = keras.backend.eval(keras.losses.categorical_hinge(y_true, y_pred))
     self.assertAllClose(expected_loss, np.mean(loss))
 
+  def test_serializing_loss_class(self):
+    orig_loss_class = _MSEMAELoss(0.3)
+    with keras.utils.custom_object_scope({'_MSEMAELoss': _MSEMAELoss}):
+      serialized = keras.losses.serialize(orig_loss_class)
+
+    with keras.utils.custom_object_scope({'_MSEMAELoss': _MSEMAELoss}):
+      deserialized = keras.losses.deserialize(serialized)
+    assert isinstance(deserialized, _MSEMAELoss)
+    assert deserialized.mse_fraction == 0.3
+
+  def test_serializing_model_with_loss_class(self):
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir)
+    model_filename = os.path.join(tmpdir, 'custom_loss.h5')
+
+    with self.test_session():
+      with keras.utils.custom_object_scope({'_MSEMAELoss': _MSEMAELoss}):
+        loss = _MSEMAELoss(0.3)
+        inputs = keras.layers.Input((2,))
+        outputs = keras.layers.Dense(1, name='model_output')(inputs)
+        model = keras.models.Model(inputs, outputs)
+        model.compile(optimizer='sgd', loss={'model_output': loss})
+        model.fit(np.random.rand(256, 2), np.random.rand(256, 1))
+
+        if h5py is None:
+          return
+
+        model.save(model_filename)
+
+      with keras.utils.custom_object_scope({'_MSEMAELoss': _MSEMAELoss}):
+        loaded_model = keras.models.load_model(model_filename)
+        loaded_model.predict(np.random.rand(128, 2))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/models.py b/tensorflow/python/keras/_impl/keras/models.py
index 06941e4bac07a30271ac8344cc4979d9ab8ea14b..ba202827ce3fca397ab487f58c01667b9b0c4444 100644
--- a/tensorflow/python/keras/_impl/keras/models.py
+++ b/tensorflow/python/keras/_impl/keras/models.py
@@ -31,6 +31,7 @@ from tensorflow.python.keras._impl.keras import layers as layer_module
 from tensorflow.python.keras._impl.keras import optimizers
 from tensorflow.python.keras._impl.keras.engine import topology
 from tensorflow.python.keras._impl.keras.engine.topology import Input
+from tensorflow.python.keras._impl.keras.engine.topology import InputLayer
 from tensorflow.python.keras._impl.keras.engine.topology import Layer
 from tensorflow.python.keras._impl.keras.engine.topology import TFBaseLayer
 from tensorflow.python.keras._impl.keras.engine.training import Model
@@ -456,38 +457,48 @@ class Sequential(Model):
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
     if not self.outputs:
-      # first layer in model: check that it is an input layer
-      if not layer._inbound_nodes:
-        # create an input layer
-        if not hasattr(layer, '_batch_input_shape'):
-          raise ValueError('The first layer in a '
-                           'Sequential model must '
-                           'get an `input_shape` or '
-                           '`batch_input_shape` argument.')
+      # First layer in model: check that it is an input layer.
+      if not isinstance(layer, InputLayer):
+        # Create an input layer.
+        # First, we need to infer its expected input shape and dtype.
+        if isinstance(layer, (Model, Sequential)):
+          # We were passed a model as first layer.
+          # This requires a specific way to figure out the
+          # input shape and dtype.
+          if not layer.layers:
+            raise ValueError('Cannot add an empty model '
+                             'to a `Sequential` model.')
+          # In case of nested models: recover the first layer
+          # of the deepest model to infer input shape and dtype.
+          first_layer = layer.layers[0]
+          while isinstance(first_layer, (Model, Sequential)):
+            first_layer = first_layer.layers[0]
+          batch_shape = first_layer._batch_input_shape
+          dtype = first_layer.dtype
+        else:
+          # We were passed a regular layer, and it should
+          # know about its input shape. Otherwise, that's an error.
+          if not hasattr(layer, '_batch_input_shape'):
+            raise ValueError('The first layer in a '
+                             'Sequential model must '
+                             'get an `input_shape` argument.')
+          batch_shape = layer._batch_input_shape
+          dtype = layer.dtype
         # Instantiate the input layer.
         x = Input(
-            batch_shape=layer._batch_input_shape,
-            dtype=layer.dtype,
-            name=layer.name + '_input')
+            batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input')
         # This will build the current layer
         # and create the node connecting the current layer
         # to the input layer we just created.
         layer(x)
 
-      if len(layer._inbound_nodes) != 1:
-        raise ValueError('A layer added to a Sequential model must '
-                         'not already be connected somewhere else. '
-                         'Model received layer ' + layer.name + ' which has ' +
-                         str(len(layer._inbound_nodes)) +
-                         ' pre-existing inbound connections.')
-
-      if len(layer._inbound_nodes[0].output_tensors) != 1:
+      if len(layer.inbound_nodes[-1].output_tensors) != 1:
         raise ValueError('All layers in a Sequential model '
                          'should have a single output tensor. '
                          'For multi-output layers, '
                          'use the functional API.')
 
-      self.outputs = [layer._inbound_nodes[0].output_tensors[0]]
+      self.outputs = [layer.inbound_nodes[-1].output_tensors[0]]
       self.inputs = topology.get_source_inputs(self.outputs[0])
 
       # We create an input node, which we will keep updated
@@ -716,24 +727,42 @@ class Sequential(Model):
               metrics=None,
               sample_weight_mode=None,
               weighted_metrics=None,
+              target_tensors=None,
               **kwargs):
-    """Configures the learning process.
+    """Configures the model for training.
 
     Arguments:
-        optimizer: str (name of optimizer) or optimizer object.
+        optimizer: String (name of optimizer) or optimizer object.
             See [optimizers](/optimizers).
-        loss: str (name of objective function) or objective function.
+        loss: String (name of objective function) or objective function.
             See [losses](/losses).
-        metrics: list of metrics to be evaluated by the model
+            If the model has multiple outputs, you can use a different loss
+            on each output by passing a dictionary or a list of losses.
+            The loss value that will be minimized by the model
+            will then be the sum of all individual losses.
+        metrics: List of metrics to be evaluated by the model
             during training and testing.
             Typically you will use `metrics=['accuracy']`.
-            See [metrics](/metrics).
-        sample_weight_mode: if you need to do timestep-wise
-            sample weighting (2D weights), set this to "temporal".
-            "None" defaults to sample-wise weights (1D).
+            To specify different metrics for different outputs of a
+            multi-output model, you could also pass a dictionary,
+            such as `metrics={'output_a': 'accuracy'}`.
+        sample_weight_mode: If you need to do timestep-wise
+            sample weighting (2D weights), set this to `"temporal"`.
+            `None` defaults to sample-wise weights (1D).
+            If the model has multiple outputs, you can use a different
+            `sample_weight_mode` on each output by passing a
+            dictionary or a list of modes.
         weighted_metrics: list of metrics to be evaluated and weighted
              by `sample_weight` or `class_weight` during training and testing.
-        **kwargs: These are passed into `tf.Session.run`.
+        target_tensors: By default, Keras will create a placeholder for the
+            model's target, which will be fed with the target data during
+            training. If instead you would like to use your own
+            target tensor (in turn, Keras will not expect external
+            Numpy data for these targets at training time), you
+            can specify them via the `target_tensors` argument.
+            It should be a single tensor
+            (for a single-output `Sequential` model).
+        **kwargs: These arguments are passed into `tf.Session.run`.
 
     Example:
         ```python
@@ -754,24 +783,25 @@ class Sequential(Model):
         metrics=metrics,
         sample_weight_mode=sample_weight_mode,
         weighted_metrics=weighted_metrics,
+        target_tensors=target_tensors,
         **kwargs)
     self.optimizer = self.model.optimizer
     self.loss = self.model.loss
-    self.total_loss = self.model.total_loss
-    self.loss_weights = self.model.loss_weights
     self.metrics = self.model.metrics
+    self.loss_weights = self.model.loss_weights
+    self.sample_weight_mode = self.model.sample_weight_mode
     self.weighted_metrics = self.model.weighted_metrics
+    self.targets = self.model.targets
     self.metrics_tensors = self.model.metrics_tensors
     self.metrics_names = self.model.metrics_names
-    self.sample_weight_mode = self.model.sample_weight_mode
     self.sample_weights = self.model.sample_weights
-    self.targets = self.model.targets
+    self.total_loss = self.model.total_loss
 
   def fit(self,
-          x,
-          y,
-          batch_size=32,
-          epochs=10,
+          x=None,
+          y=None,
+          batch_size=None,
+          epochs=1,
           verbose=1,
           callbacks=None,
           validation_split=0.,
@@ -779,43 +809,86 @@ class Sequential(Model):
           shuffle=True,
           class_weight=None,
           sample_weight=None,
-          initial_epoch=0):
+          initial_epoch=0,
+          steps_per_epoch=None,
+          validation_steps=None,
+          **kwargs):
     """Trains the model for a fixed number of epochs.
 
     Arguments:
-        x: input data, as a Numpy array or list of Numpy arrays
-            (if the model has multiple inputs).
-        y: labels, as a Numpy array.
-        batch_size: integer. Number of samples per gradient update.
-        epochs: integer, the number of epochs to train the model.
-        verbose: 0 for no logging to stdout,
-            1 for progress bar logging, 2 for one log line per epoch.
-        callbacks: list of `keras.callbacks.Callback` instances.
+        x: Numpy array of training data.
+            If the input layer in the model is named, you can also pass a
+            dictionary mapping the input name to a Numpy array.
+            `x` can be `None` (default) if feeding from
+            TensorFlow data tensors.
+        y: Numpy array of target (label) data.
+            If the output layer in the model is named, you can also pass a
+            dictionary mapping the output name to a Numpy array.
+            `y` can be `None` (default) if feeding from
+            TensorFlow data tensors.
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, it will default to 32.
+        epochs: Integer. Number of epochs to train the model.
+            An epoch is an iteration over the entire `x` and `y`
+            data provided.
+            Note that in conjunction with `initial_epoch`,
+            `epochs` is to be understood as "final epoch".
+            The model is not trained for a number of iterations
+            given by `epochs`, but merely until the epoch
+            of index `epochs` is reached.
+        verbose: 0, 1, or 2. Verbosity mode.
+            0 = silent, 1 = progress bar, 2 = one line per epoch.
+        callbacks: List of `keras.callbacks.Callback` instances.
             List of callbacks to apply during training.
             See [callbacks](/callbacks).
-        validation_split: float (0. < x < 1).
-            Fraction of the data to use as held-out validation data.
-        validation_data: tuple (x_val, y_val) or tuple
-            (x_val, y_val, val_sample_weights) to be used as held-out
-            validation data. Will override validation_split.
-        shuffle: boolean or str (for 'batch').
-            Whether to shuffle the samples at each epoch.
+        validation_split: Float between 0 and 1:
+            Fraction of the training data to be used as validation data.
+            The model will set apart this fraction of the training data,
+            will not train on it, and will evaluate
+            the loss and any model metrics
+            on this data at the end of each epoch.
+            The validation data is selected from the last samples
+            in the `x` and `y` data provided, before shuffling.
+        validation_data: tuple `(x_val, y_val)` or tuple
+            `(x_val, y_val, val_sample_weights)` on which to evaluate
+            the loss and any model metrics at the end of each epoch.
+            The model will not be trained on this data.
+            This will override `validation_split`.
+        shuffle: Boolean (whether to shuffle the training data
+            before each epoch) or str (for 'batch').
             'batch' is a special option for dealing with the
             limitations of HDF5 data; it shuffles in batch-sized chunks.
-        class_weight: dictionary mapping classes to a weight value,
-            used for scaling the loss function (during training only).
-        sample_weight: Numpy array of weights for
-            the training samples, used for scaling the loss function
+            Has no effect when `steps_per_epoch` is not `None`.
+        class_weight: Optional dictionary mapping class indices (integers)
+            to a weight (float) value, used for weighting the loss function
+            (during training only).
+            This can be useful to tell the model to
+            "pay more attention" to samples from
+            an under-represented class.
+        sample_weight: Optional Numpy array of weights for
+            the training samples, used for weighting the loss function
             (during training only). You can either pass a flat (1D)
             Numpy array with the same length as the input samples
             (1:1 mapping between weights and samples),
             or in the case of temporal data,
-            you can pass a 2D array with shape (samples, sequence_length),
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
             to apply a different weight to every timestep of every sample.
             In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile().
-        initial_epoch: epoch at which to start training
-            (useful for resuming a previous training run)
+            `sample_weight_mode="temporal"` in `compile()`.
+        initial_epoch: Epoch at which to start training
+            (useful for resuming a previous training run).
+        steps_per_epoch: Total number of steps (batches of samples)
+            before declaring one epoch finished and starting the
+            next epoch. When training with input tensors such as
+            TensorFlow data tensors, the default `None` is equal to
+            the number of unique samples in your dataset divided by
+            the batch size, or 1 if that cannot be determined.
+        validation_steps: Only relevant if `steps_per_epoch`
+            is specified. Total number of steps (batches of samples)
+            to validate before stopping.
+        **kwargs: Used for backwards compatibility support.
 
     Returns:
         A `History` object. Its `History.history` attribute is
@@ -824,10 +897,12 @@ class Sequential(Model):
         and validation metrics values (if applicable).
 
     Raises:
-        RuntimeError: if the model was never compiled.
+        RuntimeError: If the model was never compiled.
+        ValueError: In case of mismatch between the provided input data
+            and what the model expects.
     """
     if not self.built:
-      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+      raise RuntimeError('The model needs to be compiled before being used.')
     return self.model.fit(
         x,
         y,
@@ -840,7 +915,9 @@ class Sequential(Model):
         shuffle=shuffle,
         class_weight=class_weight,
         sample_weight=sample_weight,
-        initial_epoch=initial_epoch)
+        initial_epoch=initial_epoch,
+        steps_per_epoch=steps_per_epoch,
+        validation_steps=validation_steps)
 
   def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):
     """Computes the loss on some input data, batch by batch.
@@ -863,7 +940,7 @@ class Sequential(Model):
         RuntimeError: if the model was never compiled.
     """
     if not self.built:
-      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+      raise RuntimeError('The model needs to be compiled before being used.')
     return self.model.evaluate(
         x,
         y,
@@ -923,7 +1000,7 @@ class Sequential(Model):
         RuntimeError: if the model was never compiled.
     """
     if not self.built:
-      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+      raise RuntimeError('The model needs to be compiled before being used.')
     return self.model.train_on_batch(
         x, y, sample_weight=sample_weight, class_weight=class_weight)
 
@@ -946,10 +1023,10 @@ class Sequential(Model):
         RuntimeError: if the model was never compiled.
     """
     if not self.built:
-      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+      raise RuntimeError('The model needs to be compiled before being used.')
     return self.model.test_on_batch(x, y, sample_weight=sample_weight)
 
-  def predict_proba(self, x, batch_size=32, verbose=1):
+  def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
 
     The input samples are processed batch by batch.
@@ -971,7 +1048,7 @@ class Sequential(Model):
                       '(like softmax or sigmoid would).')
     return preds
 
-  def predict_classes(self, x, batch_size=32, verbose=1):
+  def predict_classes(self, x, batch_size=32, verbose=0):
     """Generate class predictions for the input samples.
 
     The input samples are processed batch by batch.
@@ -1003,6 +1080,7 @@ class Sequential(Model):
                     max_queue_size=10,
                     workers=1,
                     use_multiprocessing=False,
+                    shuffle=True,
                     initial_epoch=0,
                     **kwargs):
     """Fits the model on data generated batch-by-batch by a Python generator.
@@ -1026,6 +1104,10 @@ class Sequential(Model):
             be equal to the number of unique samples of your dataset
             divided by the batch size.
         epochs: Integer, total number of iterations on the data.
+            Note that in conjunction with initial_epoch, the parameter
+            epochs is to be understood as "final epoch". The model is
+            not trained for n steps given by epochs, but until the
+            epoch epochs is reached.
         verbose: Verbosity mode, 0, 1, or 2.
         callbacks: List of callbacks to be called during training.
         validation_data: This can be either
@@ -1049,6 +1131,9 @@ class Sequential(Model):
             non picklable arguments to the generator
             as they can't be passed
             easily to children processes.
+       shuffle: Whether to shuffle the order of the batches at
+              the beginning of each epoch. Only used with instances
+              of `Sequence` (keras.utils.Sequence).
         initial_epoch: Epoch at which to start training
             (useful for resuming a previous training run)
         **kwargs: support for legacy arguments.
@@ -1092,7 +1177,7 @@ class Sequential(Model):
       raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
 
     if not self.built:
-      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+      raise RuntimeError('The model needs to be compiled before being used.')
     return self.model.fit_generator(
         generator,
         steps_per_epoch,
@@ -1105,6 +1190,7 @@ class Sequential(Model):
         max_queue_size=max_queue_size,
         workers=workers,
         use_multiprocessing=use_multiprocessing,
+        shuffle=shuffle,
         initial_epoch=initial_epoch)
 
   def evaluate_generator(self,
@@ -1158,7 +1244,7 @@ class Sequential(Model):
       raise ValueError('Unrecognized keyword arguments: ' + str(kwargs))
 
     if not self.built:
-      raise RuntimeError('The model needs to be compiled ' 'before being used.')
+      raise RuntimeError('The model needs to be compiled before being used.')
     return self.model.evaluate_generator(
         generator,
         steps,
diff --git a/tensorflow/python/keras/_impl/keras/models_test.py b/tensorflow/python/keras/_impl/keras/models_test.py
index fd6b20e0edc024a4e90f16bc23bdb26b4ffbb019..61938066b98b9f6bb48e7e68870d15ed60ad3dd9 100644
--- a/tensorflow/python/keras/_impl/keras/models_test.py
+++ b/tensorflow/python/keras/_impl/keras/models_test.py
@@ -54,10 +54,11 @@ class TestModelSaving(test.TestCase):
       model.train_on_batch(x, y)
 
       out = model.predict(x)
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
 
       new_model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
       out2 = new_model.predict(x)
@@ -95,13 +96,14 @@ class TestModelSaving(test.TestCase):
       model.train_on_batch(x, y)
 
       out = model.predict(x)
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
 
       model = keras.models.load_model(
           fname,
           custom_objects={'CustomOp': CustomOp,
                           'custom_loss': custom_loss})
+      os.close(fd)
       os.remove(fname)
 
       out2 = model.predict(x)
@@ -125,10 +127,11 @@ class TestModelSaving(test.TestCase):
       model.train_on_batch(x, y)
 
       out = model.predict(x)
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
 
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
       out2 = model.predict(x)
@@ -144,9 +147,10 @@ class TestModelSaving(test.TestCase):
       model.add(keras.layers.Dense(3))
       model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
 
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
   def test_saving_with_tf_optimizer(self):
@@ -161,9 +165,10 @@ class TestModelSaving(test.TestCase):
                     optimizer=training_module.AdadeltaOptimizer(0.1),
                     metrics=['acc'])
 
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
   def test_saving_right_after_compilation(self):
@@ -177,9 +182,10 @@ class TestModelSaving(test.TestCase):
       model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
       model.model._make_train_function()
 
-      _, fname = tempfile.mkstemp('.h5')
+      fd, fname = tempfile.mkstemp('.h5')
       keras.models.save_model(model, fname)
       model = keras.models.load_model(fname)
+      os.close(fd)
       os.remove(fname)
 
   def test_saving_lambda_numpy_array_arguments(self):
@@ -194,10 +200,11 @@ class TestModelSaving(test.TestCase):
     model = keras.models.Model(inputs, output)
     model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
 
-    _, fname = tempfile.mkstemp('.h5')
+    fd, fname = tempfile.mkstemp('.h5')
     keras.models.save_model(model, fname)
 
     model = keras.models.load_model(fname)
+    os.close(fd)
     os.remove(fname)
 
     self.assertAllClose(mean, model.layers[1].arguments['mu'])
@@ -315,6 +322,24 @@ class TestSequential(test.TestCase):
       with self.assertRaises(TypeError):
         model.build()
 
+  def test_nested_sequential_trainability(self):
+    input_dim = 20
+    num_units = 10
+    num_classes = 2
+
+    inner_model = keras.models.Sequential()
+    inner_model.add(keras.layers.Dense(num_units, input_shape=(input_dim,)))
+
+    model = keras.models.Sequential()
+    model.add(inner_model)
+    model.add(keras.layers.Dense(num_classes))
+
+    self.assertEqual(len(model.trainable_weights), 4)
+    inner_model.trainable = False
+    self.assertEqual(len(model.trainable_weights), 2)
+    inner_model.trainable = True
+    self.assertEqual(len(model.trainable_weights), 4)
+
 
 class TestModelCloning(test.TestCase):
 
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image.py b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
index 052a8addc4c37f6df01a9103dc8a07e4726ec735..12dc718cd791d0a5829c4809474a83783ed561f9 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image.py
@@ -31,6 +31,7 @@ import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
 from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -47,6 +48,21 @@ except ImportError:
   ndi = None
 # pylint: enable=g-import-not-at-top
 
+if pil_image is not None:
+  _PIL_INTERPOLATION_METHODS = {
+      'nearest': pil_image.NEAREST,
+      'bilinear': pil_image.BILINEAR,
+      'bicubic': pil_image.BICUBIC,
+  }
+  # These methods were only introduced in version 3.4.0 (2016).
+  if hasattr(pil_image, 'HAMMING'):
+    _PIL_INTERPOLATION_METHODS['hamming'] = pil_image.HAMMING
+  if hasattr(pil_image, 'BOX'):
+    _PIL_INTERPOLATION_METHODS['box'] = pil_image.BOX
+  # This method is new in version 1.1.3 (2013).
+  if hasattr(pil_image, 'LANCZOS'):
+    _PIL_INTERPOLATION_METHODS['lanczos'] = pil_image.LANCZOS
+
 
 def random_rotation(x,
                     rg,
@@ -172,10 +188,8 @@ def random_zoom(x,
           (one of `{'constant', 'nearest', 'reflect', 'wrap'}`).
       cval: Value used for points outside the boundaries
           of the input if `mode='constant'`.
-
   Returns:
       Zoomed Numpy image tensor.
-
   Raises:
       ValueError: if `zoom_range` isn't a tuple.
   """
@@ -344,7 +358,7 @@ def img_to_array(img, data_format=None):
   return x
 
 
-def load_img(path, grayscale=False, target_size=None):
+def load_img(path, grayscale=False, target_size=None, interpolation='nearest'):
   """Loads an image into PIL format.
 
   Arguments:
@@ -352,12 +366,19 @@ def load_img(path, grayscale=False, target_size=None):
       grayscale: Boolean, whether to load the image as grayscale.
       target_size: Either `None` (default to original size)
           or tuple of ints `(img_height, img_width)`.
+     interpolation: Interpolation method used to resample the image if the
+          target size is different from that of the loaded image.
+          Supported methods are "nearest", "bilinear", and "bicubic".
+          If PIL version 1.1.3 or newer is installed, "lanczos" is also
+          supported. If PIL version 3.4.0 or newer is installed, "box" and
+          "hamming" are also supported. By default, "nearest" is used.
 
   Returns:
       A PIL Image instance.
 
   Raises:
       ImportError: if PIL is not available.
+      ValueError: if interpolation method is not supported.
   """
   if pil_image is None:
     raise ImportError('Could not import PIL.Image. '
@@ -369,14 +390,21 @@ def load_img(path, grayscale=False, target_size=None):
   else:
     if img.mode != 'RGB':
       img = img.convert('RGB')
-  if target_size:
-    hw_tuple = (target_size[1], target_size[0])
-    if img.size != hw_tuple:
-      img = img.resize(hw_tuple)
+  if target_size is not None:
+    width_height_tuple = (target_size[1], target_size[0])
+    if img.size != width_height_tuple:
+      if interpolation not in _PIL_INTERPOLATION_METHODS:
+        raise ValueError(
+            'Invalid interpolation method {} specified. Supported '
+            'methods are {}'.format(
+                interpolation,
+                ', '.join(_PIL_INTERPOLATION_METHODS.keys())))
+      resample = _PIL_INTERPOLATION_METHODS[interpolation]
+      img = img.resize(width_height_tuple, resample)
   return img
 
 
-def list_pictures(directory, ext='jpg|jpeg|bmp|png'):
+def list_pictures(directory, ext='jpg|jpeg|bmp|png|ppm'):
   return [
       os.path.join(root, f)
       for root, _, files in os.walk(directory) for f in files
@@ -401,7 +429,7 @@ class ImageDataGenerator(object):
       zoom_range: amount of zoom. if scalar z, zoom will be randomly picked
           in the range [1-z, 1+z]. A sequence of two can be passed instead
           to select this range.
-      channel_shift_range: shift range for each channels.
+      channel_shift_range: shift range for each channel.
       fill_mode: points outside the boundaries are filled according to the
           given mode ('constant', 'nearest', 'reflect' or 'wrap'). Default
           is 'nearest'.
@@ -558,12 +586,10 @@ class ImageDataGenerator(object):
       x = self.preprocessing_function(x)
     if self.rescale:
       x *= self.rescale
-    # x is a single image, so it doesn't have image number at index 0
-    img_channel_axis = self.channel_axis - 1
     if self.samplewise_center:
-      x -= np.mean(x, axis=img_channel_axis, keepdims=True)
+      x -= np.mean(x, keepdims=True)
     if self.samplewise_std_normalization:
-      x /= (np.std(x, axis=img_channel_axis, keepdims=True) + 1e-7)
+      x /= np.std(x, keepdims=True) + 1e-7
 
     if self.featurewise_center:
       if self.mean is not None:
@@ -762,49 +788,76 @@ class ImageDataGenerator(object):
           np.dot(u, np.diag(1. / np.sqrt(s + self.zca_epsilon))), u.T)
 
 
-class Iterator(object):
-  """Abstract base class for image data iterators.
+class Iterator(Sequence):
+  """Base class for image data iterators.
+
+  Every `Iterator` must implement the `_get_batches_of_transformed_samples`
+  method.
 
   Arguments:
-      n: Integer, total number of samples in the dataset to loop over.
-      batch_size: Integer, size of a batch.
-      shuffle: Boolean, whether to shuffle the data between epochs.
-      seed: Random seeding for data shuffling.
+    n: Integer, total number of samples in the dataset to loop over.
+    batch_size: Integer, size of a batch.
+    shuffle: Boolean, whether to shuffle the data between epochs.
+    seed: Random seeding for data shuffling.
   """
 
   def __init__(self, n, batch_size, shuffle, seed):
     self.n = n
     self.batch_size = batch_size
+    self.seed = seed
     self.shuffle = shuffle
     self.batch_index = 0
     self.total_batches_seen = 0
     self.lock = threading.Lock()
-    self.index_generator = self._flow_index(n, batch_size, shuffle, seed)
+    self.index_array = None
+    self.index_generator = self._flow_index()
+
+  def _set_index_array(self):
+    self.index_array = np.arange(self.n)
+    if self.shuffle:
+      self.index_array = np.random.permutation(self.n)
+
+  def __getitem__(self, idx):
+    if idx >= len(self):
+      raise ValueError('Asked to retrieve element {idx}, '
+                       'but the Sequence '
+                       'has length {length}'.format(idx=idx,
+                                                    length=len(self)))
+    if self.seed is not None:
+      np.random.seed(self.seed + self.total_batches_seen)
+    self.total_batches_seen += 1
+    if self.index_array is None:
+      self._set_index_array()
+    index_array = self.index_array[self.batch_size * idx:self.batch_size *
+                                   (idx + 1)]
+    return self._get_batches_of_transformed_samples(index_array)
+
+  def __len__(self):
+    length = int(np.ceil(self.n / float(self.batch_size)))
+    return np.maximum(length, 0)
+
+  def on_epoch_end(self):
+    self._set_index_array()
 
   def reset(self):
     self.batch_index = 0
 
-  def _flow_index(self, n, batch_size=32, shuffle=False, seed=None):
+  def _flow_index(self):
     # Ensure self.batch_index is 0.
     self.reset()
     while 1:
-      if seed is not None:
-        np.random.seed(seed + self.total_batches_seen)
+      if self.seed is not None:
+        np.random.seed(self.seed + self.total_batches_seen)
       if self.batch_index == 0:
-        index_array = np.arange(n)
-        if shuffle:
-          index_array = np.random.permutation(n)
+        self._set_index_array()
 
-      current_index = (self.batch_index * batch_size) % n
-      if n > current_index + batch_size:
-        current_batch_size = batch_size
+      current_index = (self.batch_index * self.batch_size) % self.n
+      if self.n > current_index + self.batch_size:
         self.batch_index += 1
       else:
-        current_batch_size = n - current_index
         self.batch_index = 0
       self.total_batches_seen += 1
-      yield (index_array[current_index:current_index + current_batch_size],
-             current_index, current_batch_size)
+      yield self.index_array[current_index:current_index + self.batch_size]
 
   def __iter__(self):  # pylint: disable=non-iterator-returned
     # Needed if we want to do something like:
@@ -814,6 +867,16 @@ class Iterator(object):
   def __next__(self, *args, **kwargs):
     return self.next(*args, **kwargs)
 
+  def _get_batches_of_transformed_samples(self, index_array):
+    """Gets a batch of transformed samples.
+
+    Arguments:
+        index_array: array of sample indices to include in batch.
+    Returns:
+        A batch of transformed samples.
+    """
+    raise NotImplementedError
+
 
 class NumpyArrayIterator(Iterator):
   """Iterator yielding data from a Numpy array.
@@ -883,33 +946,19 @@ class NumpyArrayIterator(Iterator):
     super(NumpyArrayIterator, self).__init__(x.shape[0], batch_size, shuffle,
                                              seed)
 
-  def next(self):
-    """For python 2.x.
-
-    Returns:
-        The next batch.
-    """
-    # Keeps under lock only the mechanism which advances
-    # the indexing of each batch.
-    with self.lock:
-      index_array, current_index, current_batch_size = next(
-          self.index_generator)
-    # The transformation of images is not under thread lock
-    # so it can be done in parallel
-    batch_x = np.zeros(
-        tuple([current_batch_size] + list(self.x.shape)[1:]), dtype=K.floatx())
+  def _get_batches_of_transformed_samples(self, index_array):
+    batch_x = np.zeros(tuple([len(index_array)] + list(self.x.shape)[1:]),
+                       dtype=K.floatx())
     for i, j in enumerate(index_array):
       x = self.x[j]
       x = self.image_data_generator.random_transform(x.astype(K.floatx()))
       x = self.image_data_generator.standardize(x)
       batch_x[i] = x
     if self.save_to_dir:
-      for i in range(current_batch_size):
+      for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix,
-            index=current_index + i,
-            hash=np.random.randint(1e4),
+            prefix=self.save_prefix, index=j, hash=np.random.randint(1e4),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     if self.y is None:
@@ -917,6 +966,20 @@ class NumpyArrayIterator(Iterator):
     batch_y = self.y[index_array]
     return batch_x, batch_y
 
+  def next(self):
+    """For python 2.x.
+
+    Returns:
+        The next batch.
+    """
+    # Keeps under lock only the mechanism which advances
+    # the indexing of each batch.
+    with self.lock:
+      index_array = next(self.index_generator)
+    # The transformation of images is not under thread lock
+    # so it can be done in parallel
+    return self._get_batches_of_transformed_samples(index_array)
+
 
 def _count_valid_files_in_directory(directory, white_list_formats,
                                     follow_links):
@@ -939,7 +1002,7 @@ def _count_valid_files_in_directory(directory, white_list_formats,
 
   samples = 0
   for _, _, files in _recursive_list(directory):
-    for fname in files:
+    for fname in sorted(files):
       is_valid = False
       for extension in white_list_formats:
         if fname.lower().endswith('.' + extension):
@@ -1006,7 +1069,7 @@ class DirectoryIterator(Iterator):
           to use for random transformations and normalization.
       target_size: tuple of integers, dimensions to resize input images to.
       color_mode: One of `"rgb"`, `"grayscale"`. Color mode to read images.
-      classes: Optional list of strings, names of sudirectories
+      classes: Optional list of strings, names of subdirectories
           containing images from each class (e.g. `["dogs", "cats"]`).
           It will be computed automatically if not set.
       class_mode: Mode for yielding the targets:
@@ -1086,7 +1149,7 @@ class DirectoryIterator(Iterator):
       for subdir in sorted(os.listdir(directory)):
         if os.path.isdir(os.path.join(directory, subdir)):
           classes.append(subdir)
-    self.num_class = len(classes)
+    self.num_classes = len(classes)
     self.class_indices = dict(zip(classes, range(len(classes))))
 
     pool = multiprocessing.pool.ThreadPool()
@@ -1099,7 +1162,7 @@ class DirectoryIterator(Iterator):
                                     for subdir in classes)))
 
     print('Found %d images belonging to %d classes.' % (self.samples,
-                                                        self.num_class))
+                                                        self.num_classes))
 
     # second, build an index of the images in the different class subfolders
     results = []
@@ -1121,39 +1184,25 @@ class DirectoryIterator(Iterator):
     super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle,
                                             seed)
 
-  def next(self):
-    """For python 2.x.
-
-    Returns:
-        The next batch.
-    """
-    with self.lock:
-      index_array, current_index, current_batch_size = next(
-          self.index_generator)
-    # The transformation of images is not under thread lock
-    # so it can be done in parallel
-    batch_x = np.zeros(
-        (current_batch_size,) + self.image_shape, dtype=K.floatx())
+  def _get_batches_of_transformed_samples(self, index_array):
+    batch_x = np.zeros((len(index_array),) + self.image_shape, dtype=K.floatx())
     grayscale = self.color_mode == 'grayscale'
     # build batch of image data
     for i, j in enumerate(index_array):
       fname = self.filenames[j]
-      img = load_img(
-          os.path.join(self.directory, fname),
-          grayscale=grayscale,
-          target_size=self.target_size)
+      img = load_img(os.path.join(self.directory, fname),
+                     grayscale=grayscale,
+                     target_size=self.target_size)
       x = img_to_array(img, data_format=self.data_format)
       x = self.image_data_generator.random_transform(x)
       x = self.image_data_generator.standardize(x)
       batch_x[i] = x
     # optionally save augmented images to disk for debugging purposes
     if self.save_to_dir:
-      for i in range(current_batch_size):
+      for i, j in enumerate(index_array):
         img = array_to_img(batch_x[i], self.data_format, scale=True)
         fname = '{prefix}_{index}_{hash}.{format}'.format(
-            prefix=self.save_prefix,
-            index=current_index + i,
-            hash=np.random.randint(1e4),
+            prefix=self.save_prefix, index=j, hash=np.random.randint(1e7),
             format=self.save_format)
         img.save(os.path.join(self.save_to_dir, fname))
     # build batch of labels
@@ -1164,9 +1213,22 @@ class DirectoryIterator(Iterator):
     elif self.class_mode == 'binary':
       batch_y = self.classes[index_array].astype(K.floatx())
     elif self.class_mode == 'categorical':
-      batch_y = np.zeros((len(batch_x), self.num_class), dtype=K.floatx())
+      batch_y = np.zeros((len(batch_x), self.num_classes), dtype=K.floatx())
       for i, label in enumerate(self.classes[index_array]):
         batch_y[i, label] = 1.
     else:
       return batch_x
     return batch_x, batch_y
+
+  def next(self):
+    """For python 2.x.
+
+    Returns:
+        The next batch.
+    """
+    with self.lock:
+      index_array = next(self.index_generator)
+    # The transformation of images is not under thread lock
+    # so it can be done in parallel
+    return self._get_batches_of_transformed_samples(index_array)
+
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py b/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
index 19693410e761a2d800e8c8e151264f91ef30897c..c0790b5a5140193b18907d9375530f4f06e137da 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/image_test.py
@@ -192,6 +192,8 @@ class TestImage(test.TestCase):
     _ = keras.preprocessing.image.load_img(fname)
     _ = keras.preprocessing.image.load_img(fname, grayscale=True)
     _ = keras.preprocessing.image.load_img(fname, target_size=(10, 10))
+    _ = keras.preprocessing.image.load_img(fname, target_size=(10, 10),
+                                           interpolation='bilinear')
 
     # create iterator
     generator = keras.preprocessing.image.ImageDataGenerator()
diff --git a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
index a5deec87af7729c20face3517689b7da4b48c8df..642f4f2face5bd56cdc1ed7b4f6d6621c6d1b210 100644
--- a/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/_impl/keras/preprocessing/sequence.py
@@ -169,7 +169,7 @@ def skipgrams(sequence,
           integers (eg. [0, 1, 1 .. ]),
           if True labels will be categorical eg. [[1,0],[0,1],[0,1] .. ]
       sampling_table: 1D array of size `vocabulary_size` where the entry i
-          encodes the probabibily to sample a word of rank i.
+          encodes the probability to sample a word of rank i.
       seed: Random seed.
 
   Returns:
diff --git a/tensorflow/python/keras/_impl/keras/utils/__init__.py b/tensorflow/python/keras/_impl/keras/utils/__init__.py
index fa50b123b79cc599e3e1bd2328823dc3eefc1f95..370ae0dd0f0d00059f1b0cc79459abe75c8ca494 100644
--- a/tensorflow/python/keras/_impl/keras/utils/__init__.py
+++ b/tensorflow/python/keras/_impl/keras/utils/__init__.py
@@ -18,11 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.keras._impl.keras.utils import data_utils
-from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.keras._impl.keras.utils import io_utils
-from tensorflow.python.keras._impl.keras.utils import np_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
 from tensorflow.python.keras._impl.keras.utils.data_utils import OrderedEnqueuer
@@ -35,9 +30,9 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary
 from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras._impl.keras.utils.training_utils import multi_gpu_model
 from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
 
-
-# Globally-importable utils.
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils.py b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
index 0ede7f12f2cd31ee86baefc870748f206332342c..1f2e9ac44076582c7aea083203b13fddaa597474 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils.py
@@ -70,15 +70,15 @@ if sys.version_info[0] == 2:
       if content_type is not None:
         total_size = int(content_type.strip())
       count = 0
-      while 1:
+      while True:
         chunk = response.read(chunk_size)
         count += 1
-        if not chunk:
-          reporthook(count, total_size, total_size)
-          break
-        if reporthook:
+        if reporthook is not None:
           reporthook(count, chunk_size, total_size)
-        yield chunk
+        if chunk:
+          yield chunk
+        else:
+          break
 
     response = urlopen(url, data)
     with open(filename, 'wb') as fd:
@@ -262,9 +262,9 @@ def _hash_file(fpath, algorithm='sha256', chunk_size=65535):
   Example:
 
   ```python
-     >>> from keras.data_utils import _hash_file
-     >>> _hash_file('/path/to/file.zip')
-     'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
+      >>> from keras.data_utils import _hash_file
+      >>> _hash_file('/path/to/file.zip')
+      'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'
   ```
 
   Arguments:
@@ -318,32 +318,35 @@ class Sequence(object):
   """Base object for fitting to a sequence of data, such as a dataset.
 
   Every `Sequence` must implements the `__getitem__` and the `__len__` methods.
+  If you want to modify your dataset between epochs you may implement
+  `on_epoch_end`. The method `__getitem__` should return a complete batch.
 
+  Notes:
+  `Sequence` are a safer way to do multiprocessing. This structure guarantees
+   that the network will only train once on each sample per epoch which is not
+   the case with generators.
   Examples:
-
   ```python
-  from skimage.io import imread
-  from skimage.transform import resize
-  import numpy as np
-
-  # Here, `x_set` is list of path to the images
-  # and `y_set` are the associated classes.
-
-  class CIFAR10Sequence(Sequence):
-      def __init__(self, x_set, y_set, batch_size):
-          self.X,self.y = x_set,y_set
-          self.batch_size = batch_size
-
-      def __len__(self):
-          return len(self.X) // self.batch_size
-
-      def __getitem__(self,idx):
-          batch_x = self.X[idx*self.batch_size:(idx+1)*self.batch_size]
-          batch_y = self.y[idx*self.batch_size:(idx+1)*self.batch_size]
-
-          return np.array([
-              resize(imread(file_name), (200,200))
-                 for file_name in batch_x]), np.array(batch_y)
+      from skimage.io import imread
+      from skimage.transform import resize
+      import numpy as np
+      import math
+      # Here, `x_set` is list of path to the images
+      # and `y_set` are the associated classes.
+      class CIFAR10Sequence(Sequence):
+          def __init__(self, x_set, y_set, batch_size):
+              self.x, self.y = x_set, y_set
+              self.batch_size = batch_size
+          def __len__(self):
+              return math.ceil(len(self.x) / self.batch_size)
+          def __getitem__(self, idx):
+              batch_x = self.x[idx * self.batch_size:(idx + 1) *
+                        self.batch_size]
+              batch_y = self.y[idx * self.batch_size:(idx + 1) *
+                        self.batch_size]
+              return np.array([
+                  resize(imread(file_name), (200, 200))
+                     for file_name in batch_x]), np.array(batch_y)
   ```
   """
 
@@ -372,20 +375,30 @@ class Sequence(object):
   def on_epoch_end(self):
     """Method called at the end of every epoch.
     """
-    raise NotImplementedError
+    pass
+
+
+# Global variables to be shared across processes
+_SHARED_SEQUENCES = {}
+# We use a Value to provide unique id to different processes.
+_SEQUENCE_COUNTER = None
+
 
+def get_index(uid, i):
+  """Get the value from the Sequence `uid` at index `i`.
 
-def get_index(ds, i):
-  """Quick fix for Python2, otherwise, it cannot be pickled.
+  To allow multiple Sequences to be used at the same time, we use `uid` to
+  get a specific one. A single Sequence would cause the validation to
+  overwrite the training Sequence.
 
   Arguments:
-      ds: a Holder or Sequence object.
+      uid: int, Sequence identifier
       i: index
 
   Returns:
       The value at index `i`.
   """
-  return ds[i]
+  return _SHARED_SEQUENCES[uid][i]
 
 
 class SequenceEnqueuer(object):
@@ -397,13 +410,13 @@ class SequenceEnqueuer(object):
   Examples:
 
   ```python
-  enqueuer = SequenceEnqueuer(...)
-  enqueuer.start()
-  datas = enqueuer.get()
-  for data in datas:
-      # Use the inputs; training, evaluating, predicting.
-      # ... stop sometime.
-  enqueuer.close()
+      enqueuer = SequenceEnqueuer(...)
+      enqueuer.start()
+      datas = enqueuer.get()
+      for data in datas:
+          # Use the inputs; training, evaluating, predicting.
+          # ... stop sometime.
+      enqueuer.close()
   ```
 
   The `enqueuer.get()` should be an infinite stream of datas.
@@ -456,17 +469,21 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
   Arguments:
       sequence: A `keras.utils.data_utils.Sequence` object.
-      use_multiprocessing: use multiprocessing if True, otherwise threading
-      scheduling: Sequential querying of datas if 'sequential', random
-        otherwise.
-      shuffle: Whether to shuffle the data at the beginning of each epoch.
+      use_multiprocessing: Use multiprocessing if True, otherwise threading
+      shuffle: Whether to shuffle the data at the beginning of each epoch
   """
 
-  def __init__(self,
-               sequence,
-               use_multiprocessing=False,
-               shuffle=False):
+  def __init__(self, sequence, use_multiprocessing=False, shuffle=False):
     self.sequence = sequence
+
+    # Doing Multiprocessing.Value += x is not process-safe.
+    global _SEQUENCE_COUNTER
+    if _SEQUENCE_COUNTER is None:
+      _SEQUENCE_COUNTER = multiprocessing.Value('i', 0)
+
+    with _SEQUENCE_COUNTER.get_lock():
+      self.uid = _SEQUENCE_COUNTER.value
+      _SEQUENCE_COUNTER.value += 1
     self.use_multiprocessing = use_multiprocessing
     self.shuffle = shuffle
     self.workers = 0
@@ -490,15 +507,24 @@ class OrderedEnqueuer(SequenceEnqueuer):
       self.executor = multiprocessing.Pool(workers)
     else:
       self.executor = ThreadPool(workers)
+    self.workers = workers
     self.queue = queue.Queue(max_queue_size)
     self.stop_signal = threading.Event()
     self.run_thread = threading.Thread(target=self._run)
     self.run_thread.daemon = True
     self.run_thread.start()
 
+  def _wait_queue(self):
+    """Wait for the queue to be empty."""
+    while True:
+      time.sleep(0.1)
+      if self.queue.unfinished_tasks == 0 or self.stop_signal.is_set():
+        return
+
   def _run(self):
-    """Submits requests to the executor and queues the `Future` objects."""
+    """Function to submit request to the executor & queue `Future` objects."""
     sequence = list(range(len(self.sequence)))
+    self._send_sequence()  # Share the initial sequence
     while True:
       if self.shuffle:
         random.shuffle(sequence)
@@ -506,9 +532,18 @@ class OrderedEnqueuer(SequenceEnqueuer):
         if self.stop_signal.is_set():
           return
         self.queue.put(
-            self.executor.apply_async(get_index, (self.sequence, i)),
-            block=True)
+            self.executor.apply_async(get_index, (self.uid, i)), block=True)
+
+      # Done with the current epoch, waiting for the final batches
+      self._wait_queue()
+
+      if self.stop_signal.is_set():
+        # We're done
+        return
+
+      # Call the internal on epoch end.
       self.sequence.on_epoch_end()
+      self._send_sequence()  # Update the pool
 
   def get(self):
     """Creates a generator to extract data from the queue.
@@ -517,17 +552,29 @@ class OrderedEnqueuer(SequenceEnqueuer):
 
     Yields:
         Tuples (inputs, targets)
-            or (inputs, targets, sample_weights)
+        or (inputs, targets, sample_weights)
     """
     try:
       while self.is_running():
         inputs = self.queue.get(block=True).get()
+        self.queue.task_done()
         if inputs is not None:
           yield inputs
     except Exception as e:
       self.stop()
       raise StopIteration(e)
 
+  def _send_sequence(self):
+    """Send current Sequence to all workers."""
+    _SHARED_SEQUENCES[
+        self.uid] = self.sequence  # For new processes that may spawn
+
+    self._close_pool()
+    if self.use_multiprocessing:
+      self.executor = multiprocessing.Pool(self.workers)
+    else:
+      self.executor = ThreadPool(self.workers)
+
   def stop(self, timeout=None):
     """Stops running threads and wait for them to exit, if necessary.
 
@@ -541,36 +588,43 @@ class OrderedEnqueuer(SequenceEnqueuer):
       self.queue.queue.clear()
       self.queue.unfinished_tasks = 0
       self.queue.not_full.notify()
+    self._close_pool()
+    self.run_thread.join(timeout)
+    _SHARED_SEQUENCES[self.uid] = None
+
+  def _close_pool(self):
     self.executor.close()
     self.executor.join()
-    self.run_thread.join(timeout)
 
 
 class GeneratorEnqueuer(SequenceEnqueuer):
   """Builds a queue out of a data generator.
 
+  The provided generator can be finite in which case the class will throw
+  a `StopIteration` exception.
+
   Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
 
   Arguments:
-      generator: a generator function which endlessly yields data
+      generator: a generator function which yields data
       use_multiprocessing: use multiprocessing if True, otherwise threading
       wait_time: time to sleep in-between calls to `put()`
       random_seed: Initial seed for workers,
-          will be incremented by one for each workers.
+          will be incremented by one for each worker.
   """
 
   def __init__(self,
                generator,
                use_multiprocessing=False,
                wait_time=0.05,
-               random_seed=None):
+               seed=None):
     self.wait_time = wait_time
     self._generator = generator
     self._use_multiprocessing = use_multiprocessing
     self._threads = []
     self._stop_event = None
     self.queue = None
-    self.random_seed = random_seed
+    self.seed = seed
 
   def start(self, workers=1, max_queue_size=10):
     """Kicks off threads which add data from the generator into the queue.
@@ -589,6 +643,8 @@ class GeneratorEnqueuer(SequenceEnqueuer):
             self.queue.put(generator_output)
           else:
             time.sleep(self.wait_time)
+        except StopIteration:
+          break
         except Exception:
           self._stop_event.set()
           raise
@@ -605,11 +661,11 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         if self._use_multiprocessing:
           # Reset random seed else all children processes
           # share the same seed
-          np.random.seed(self.random_seed)
+          np.random.seed(self.seed)
           thread = multiprocessing.Process(target=data_generator_task)
           thread.daemon = True
-          if self.random_seed is not None:
-            self.random_seed += 1
+          if self.seed is not None:
+            self.seed += 1
         else:
           thread = threading.Thread(target=data_generator_task)
         self._threads.append(thread)
@@ -661,4 +717,8 @@ class GeneratorEnqueuer(SequenceEnqueuer):
         if inputs is not None:
           yield inputs
       else:
-        time.sleep(self.wait_time)
+        all_finished = all([not thread.is_alive() for thread in self._threads])
+        if all_finished and self.queue.empty():
+          raise StopIteration()
+        else:
+          time.sleep(self.wait_time)
diff --git a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
index 45322f1f29cb1351c409957d060c21abffdf1d6f..47c5b4cff06c083f8ebd699b5cb9da85b74116e0 100644
--- a/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
+++ b/tensorflow/python/keras/_impl/keras/utils/data_utils_test.py
@@ -22,6 +22,7 @@ from itertools import cycle
 import os
 import tarfile
 import threading
+import unittest
 import zipfile
 
 import numpy as np
@@ -115,15 +116,19 @@ def threadsafe_generator(f):
 
 class TestSequence(keras.utils.data_utils.Sequence):
 
-  def __init__(self, shape):
+  def __init__(self, shape, value=1.):
     self.shape = shape
+    self.inner = value
 
   def __getitem__(self, item):
-    return np.ones(self.shape, dtype=np.uint8) * item
+    return np.ones(self.shape, dtype=np.uint32) * item * self.inner
 
   def __len__(self):
     return 100
 
+  def on_epoch_end(self):
+    self.inner *= 5.0
+
 
 class FaultSequence(keras.utils.data_utils.Sequence):
 
@@ -160,6 +165,9 @@ class TestEnqueuers(test.TestCase):
     self.assertEqual(len(set(acc) - set(range(100))), 0)
     enqueuer.stop()
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_generator_enqueuer_processes(self):
     enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
         create_generator_from_sequence_pcs(TestSequence([3, 200, 200, 3])),
@@ -181,6 +189,9 @@ class TestEnqueuers(test.TestCase):
     with self.assertRaises(StopIteration):
       next(gen_output)
 
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
   def test_generator_enqueuer_fail_processes(self):
     enqueuer = keras.utils.data_utils.GeneratorEnqueuer(
         create_generator_from_sequence_pcs(FaultSequence()),
@@ -228,6 +239,64 @@ class TestEnqueuers(test.TestCase):
     with self.assertRaises(StopIteration):
       next(gen_output)
 
+  def test_on_epoch_end_processes(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        TestSequence([3, 200, 200, 3]), use_multiprocessing=True)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    acc = []
+    for _ in range(200):
+      acc.append(next(gen_output)[0, 0, 0, 0])
+    # Check that order was keep in GeneratorEnqueuer with processes
+    self.assertEqual(acc[100:], list([k * 5 for k in range(100)]))
+    enqueuer.stop()
+
+  def test_context_switch(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        TestSequence([3, 200, 200, 3]), use_multiprocessing=True)
+    enqueuer2 = keras.utils.data_utils.OrderedEnqueuer(
+        TestSequence([3, 200, 200, 3], value=15), use_multiprocessing=True)
+    enqueuer.start(3, 10)
+    enqueuer2.start(3, 10)
+    gen_output = enqueuer.get()
+    gen_output2 = enqueuer2.get()
+    acc = []
+    for _ in range(100):
+      acc.append(next(gen_output)[0, 0, 0, 0])
+    self.assertEqual(acc[-1], 99)
+    # One epoch is completed so enqueuer will switch the Sequence
+
+    acc = []
+    for _ in range(100):
+      acc.append(next(gen_output2)[0, 0, 0, 0])
+    self.assertEqual(acc[-1], 99 * 15)
+    # One epoch has been completed so enqueuer2 will switch
+
+    # Be sure that both Sequence were updated
+    self.assertEqual(next(gen_output)[0, 0, 0, 0], 0)
+    self.assertEqual(next(gen_output)[0, 0, 0, 0], 5)
+    self.assertEqual(next(gen_output2)[0, 0, 0, 0], 0)
+    self.assertEqual(next(gen_output2)[0, 0, 0, 0], 15 * 5)
+
+    # Tear down everything
+    enqueuer.stop()
+    enqueuer2.stop()
+
+  def test_on_epoch_end_threads(self):
+    enqueuer = keras.utils.data_utils.OrderedEnqueuer(
+        TestSequence([3, 200, 200, 3]), use_multiprocessing=False)
+    enqueuer.start(3, 10)
+    gen_output = enqueuer.get()
+    acc = []
+    for _ in range(100):
+      acc.append(next(gen_output)[0, 0, 0, 0])
+    acc = []
+    for _ in range(100):
+      acc.append(next(gen_output)[0, 0, 0, 0])
+    # Check that order was keep in GeneratorEnqueuer with processes
+    self.assertEqual(acc, list([k * 5 for k in range(100)]))
+    enqueuer.stop()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
index 39a10c8650f67216ae6a238bb6f3b7e4088ad163..025e5d30a597c560804293b12b0bd063764c87fe 100644
--- a/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/generic_utils.py
@@ -43,7 +43,7 @@ class CustomObjectScope(object):
 
   Example:
 
-  Consider a custom object `MyObject`
+  Consider a custom object `MyObject` (e.g. a class):
 
   ```python
       with CustomObjectScope({'MyObject':MyObject}):
@@ -271,6 +271,9 @@ class Progbar(object):
     self.total_width = 0
     self.seen_so_far = 0
     self.verbose = verbose
+    self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and
+                              sys.stdout.isatty()) or
+                             'ipykernel' in sys.modules)
 
   def update(self, current, values=None, force=False):
     """Updates the progress bar.
@@ -294,18 +297,23 @@ class Progbar(object):
     self.seen_so_far = current
 
     now = time.time()
+    info = ' - %.0fs' % (now - self.start)
     if self.verbose == 1:
-      if not force and (now - self.last_update) < self.interval:
+      if (not force and (now - self.last_update) < self.interval and
+          current < self.target):
         return
 
       prev_total_width = self.total_width
-      sys.stdout.write('\b' * prev_total_width)
-      sys.stdout.write('\r')
+      if self._dynamic_display:
+        sys.stdout.write('\b' * prev_total_width)
+        sys.stdout.write('\r')
+      else:
+        sys.stdout.write('\n')
 
-      if self.target is not -1:
+      if self.target is not None:
         numdigits = int(np.floor(np.log10(self.target))) + 1
-        barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
-        bar = barstr % (current, self.target)
+        barstr = '%%%dd/%d [' % (numdigits, self.target)
+        bar = barstr % current
         prog = float(current) / self.target
         prog_width = int(self.width * prog)
         if prog_width > 0:
@@ -318,17 +326,35 @@ class Progbar(object):
         bar += ']'
         sys.stdout.write(bar)
         self.total_width = len(bar)
+      else:
+        bar = '%7d/Unknown' % current
+
+      self.total_width = len(bar)
+      sys.stdout.write(bar)
 
       if current:
         time_per_unit = (now - self.start) / current
       else:
         time_per_unit = 0
-      eta = time_per_unit * (self.target - current)
-      info = ''
-      if current < self.target and self.target is not -1:
-        info += ' - ETA: %ds' % eta
+      if self.target is not None and current < self.target:
+        eta = time_per_unit * (self.target - current)
+        if eta > 3600:
+          eta_format = '%d:%02d:%02d' % (eta // 3600, (eta % 3600) // 60,
+                                         eta % 60)
+        elif eta > 60:
+          eta_format = '%d:%02d' % (eta // 60, eta % 60)
+        else:
+          eta_format = '%ds' % eta
+
+        info = ' - ETA: %s' % eta_format
       else:
-        info += ' - %ds' % (now - self.start)
+        if time_per_unit >= 1:
+          info += ' %.0fs/step' % time_per_unit
+        elif time_per_unit >= 1e-3:
+          info += ' %.0fms/step' % (time_per_unit * 1e3)
+        else:
+          info += ' %.0fus/step' % (time_per_unit * 1e6)
+
       for k in self.unique_values:
         info += ' - %s:' % k
         if isinstance(self.sum_values[k], list):
@@ -342,7 +368,9 @@ class Progbar(object):
 
       self.total_width += len(info)
       if prev_total_width > self.total_width:
-        info += ((prev_total_width - self.total_width) * ' ')
+        info += (' ' * (prev_total_width - self.total_width))
+      if self.target is not None and current >= self.target:
+        info += '\n'
 
       sys.stdout.write(info)
       sys.stdout.flush()
@@ -350,17 +378,20 @@ class Progbar(object):
       if current >= self.target:
         sys.stdout.write('\n')
 
-    if self.verbose == 2:
-      if current >= self.target:
-        info = '%ds' % (now - self.start)
+    elif self.verbose == 2:
+      if self.target is None or current >= self.target:
         for k in self.unique_values:
           info += ' - %s:' % k
-          avg = np.mean(self.sum_values[k][0] / max(1, self.sum_values[k][1]))
+          avg = np.mean(
+              self.sum_values[k][0] / max(1, self.sum_values[k][1]))
           if avg > 1e-3:
             info += ' %.4f' % avg
           else:
             info += ' %.4e' % avg
-        sys.stdout.write(info + '\n')
+        info += '\n'
+
+        sys.stdout.write(info)
+        sys.stdout.flush()
 
     self.last_update = now
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index 5f2ba99be783f8d24e4aef0eaa450a94f9da6e8b..a8fc18c17aee58fa406c3057cc98844d9687a9ba 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -63,11 +63,11 @@ class HDF5Matrix(object):
                         'HDF5 and h5py installed.')
 
     if datapath not in list(self.refs.keys()):
-      f = h5py.File(datapath)
-      self.refs[datapath] = f
+      self._f = h5py.File(datapath)
+      self.refs[datapath] = self._f
     else:
-      f = self.refs[datapath]
-    self.data = f[dataset]
+      self._f = self.refs[datapath]
+    self.data = self._f[dataset]
     self.start = start
     if end is None:
       self.end = self.data.shape[0]
@@ -78,13 +78,16 @@ class HDF5Matrix(object):
   def __len__(self):
     return self.end - self.start
 
+  def __del__(self):
+    self._f.close()
+
   def __getitem__(self, key):
     if isinstance(key, slice):
       start, stop = key.start, key.stop
       if start is None:
         start = 0
       if stop is None:
-        stop = self.data.shape[0]
+        stop = self.shape[0]
       if stop + self.start <= self.end:
         idx = slice(start + self.start, stop + self.start)
       else:
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 86c02643556fdc44e7340551f86428c05c9285ce..053c0600a33d6ab0151ecc8879cbc68fe731dbe5 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -24,6 +24,18 @@ from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.conv_utils import convert_kernel
 
 
+def count_params(weights):
+  """Count the total number of scalars composing the weights.
+
+  Arguments:
+    weights: An iterable containing the weights on which to compute params
+
+  Returns:
+    The total number of scalars composing the weights
+  """
+  return int(np.sum([K.count_params(p) for p in set(weights)]))
+
+
 def print_summary(model, line_length=None, positions=None, print_fn=None):
   """Prints a summary of a model.
 
@@ -46,12 +58,28 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     sequential_like = True
   else:
     sequential_like = True
-    for v in model._nodes_by_depth.values():  # pylint: disable=protected-access
+    nodes_by_depth = model._nodes_by_depth.values()  # pylint: disable=protected-access
+    nodes = []
+    for v in nodes_by_depth:
       if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
         # If the model has multiple nodes or if the nodes have
         # multiple inbound_layers, the model is no longer sequential.
         sequential_like = False
         break
+      nodes += v
+    if sequential_like:
+      # search for shared layers
+      for layer in model.layers:
+        flag = False
+        for node in layer.inbound_nodes:
+          if node in nodes:
+            if flag:
+              sequential_like = False
+              break
+            else:
+              flag = True
+        if not sequential_like:
+          break
 
   if sequential_like:
     line_length = line_length or 65
@@ -61,7 +89,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     # header names for the different log elements
     to_display = ['Layer (type)', 'Output Shape', 'Param #']
   else:
-    line_length = line_length or 100
+    line_length = line_length or 98
     positions = positions or [.33, .55, .67, 1.]
     if positions[-1] <= 1:
       positions = [int(line_length * p) for p in positions]
@@ -144,8 +172,12 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     else:
       print_fn('_' * line_length)
 
-  trainable_count = int(
-      np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
+  model._check_trainable_weights_consistency()  # pylint: disable=protected-access
+  if hasattr(model, '_collected_trainable_weights'):
+    trainable_count = count_params(model._collected_trainable_weights)  # pylint: disable=protected-access
+  else:
+    trainable_count = count_params(model.trainable_weights)
+
   non_trainable_count = int(
       np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils.py b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
index a23172d342a20f28b219546a5f5d443274a71c73..896016d4d8bb48192e32ab094f7b7a0e6799921c 100644
--- a/tensorflow/python/keras/_impl/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils.py
@@ -33,12 +33,18 @@ def to_categorical(y, num_classes=None):
   Returns:
       A binary matrix representation of the input.
   """
-  y = np.array(y, dtype='int').ravel()
+  y = np.array(y, dtype='int')
+  input_shape = y.shape
+  if input_shape and input_shape[-1] == 1:
+    input_shape = tuple(input_shape[:-1])
+  y = y.ravel()
   if not num_classes:
     num_classes = np.max(y) + 1
   n = y.shape[0]
   categorical = np.zeros((n, num_classes))
   categorical[np.arange(n), y] = 1
+  output_shape = input_shape + (num_classes,)
+  categorical = np.reshape(categorical, output_shape)
   return categorical
 
 
diff --git a/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9680c295cd31c40114726a919d4e327c07ddd240
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/np_utils_test.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for np_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class TestNPUtils(test.TestCase):
+
+  def test_to_categorical(self):
+    num_classes = 5
+    shapes = [(3,), (4, 3), (5, 4, 3), (3, 1), (3, 2, 1)]
+    expected_shapes = [(3, num_classes),
+                       (4, 3, num_classes),
+                       (5, 4, 3, num_classes),
+                       (3, num_classes)]
+    labels = [np.random.randint(0, num_classes, shape) for shape in shapes]
+    one_hots = [
+        keras.utils.to_categorical(label, num_classes) for label in labels]
+    for label, one_hot, expected_shape in zip(labels,
+                                              one_hots,
+                                              expected_shapes):
+      # Check shape
+      self.assertEqual(one_hot.shape, expected_shape)
+      # Make sure there is only one 1 in a row
+      self.assertTrue(np.all(one_hot.sum(axis=-1) == 1))
+      # Get original labels back from one hots
+      self.assertTrue(np.all(
+          np.argmax(one_hot, -1).reshape(label.shape) == label))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils.py b/tensorflow/python/keras/_impl/keras/utils/training_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8939c814cf3f9c6fa2f2af79e71919c6666e5561
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/training_utils.py
@@ -0,0 +1,194 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for multi-gpu training."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.engine.training import Model
+from tensorflow.python.ops import array_ops
+
+
+def _get_available_devices():
+  return [x.name for x in K.get_session().list_devices()]
+
+
+def _normalize_device_name(name):
+  name = '/' + name.lower().split('device:')[1]
+  return name
+
+
+def multi_gpu_model(model, gpus):
+  """Replicates a model on different GPUs.
+
+  Specifically, this function implements single-machine
+  multi-GPU data parallelism. It works in the following way:
+
+  - Divide the model's input(s) into multiple sub-batches.
+  - Apply a model copy on each sub-batch. Every model copy
+      is executed on a dedicated GPU.
+  - Concatenate the results (on CPU) into one big batch.
+
+  E.g. if your `batch_size` is 64 and you use `gpus=2`,
+  then we will divide the input into 2 sub-batches of 32 samples,
+  process each sub-batch on one GPU, then return the full
+  batch of 64 processed samples.
+
+  This induces quasi-linear speedup on up to 8 GPUs.
+
+  This function is only available with the TensorFlow backend
+  for the time being.
+
+  Arguments:
+      model: A Keras model instance. To avoid OOM errors,
+          this model could have been built on CPU, for instance
+          (see usage example below).
+      gpus: Integer >= 2, number of on GPUs on which to create
+          model replicas.
+
+  Returns:
+      A Keras `Model` instance which can be used just like the initial
+      `model` argument, but which distributes its workload on multiple GPUs.
+
+  Example:
+
+  ```python
+      import tensorflow as tf
+      from keras.applications import Xception
+      from keras.utils import multi_gpu_model
+      import numpy as np
+
+      num_samples = 1000
+      height = 224
+      width = 224
+      num_classes = 1000
+
+      # Instantiate the base model (or "template" model).
+      # We recommend doing this with under a CPU device scope,
+      # so that the model's weights are hosted on CPU memory.
+      # Otherwise they may end up hosted on a GPU, which would
+      # complicate weight sharing.
+      with tf.device('/cpu:0'):
+          model = Xception(weights=None,
+                           input_shape=(height, width, 3),
+                           classes=num_classes)
+
+      # Replicates the model on 8 GPUs.
+      # This assumes that your machine has 8 available GPUs.
+      parallel_model = multi_gpu_model(model, gpus=8)
+      parallel_model.compile(loss='categorical_crossentropy',
+                             optimizer='rmsprop')
+
+      # Generate dummy data.
+      x = np.random.random((num_samples, height, width, 3))
+      y = np.random.random((num_samples, num_classes))
+
+      # This `fit` call will be distributed on 8 GPUs.
+      # Since the batch size is 256, each GPU will process 32 samples.
+      parallel_model.fit(x, y, epochs=20, batch_size=256)
+
+      # Save model via the template model (which shares the same weights):
+      model.save('my_model.h5')
+  ```
+
+  Raises:
+    ValueError: if the `gpus` argument does not match available devices.
+  """
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras._impl.keras.layers.core import Lambda
+  from tensorflow.python.keras._impl.keras.layers.merge import concatenate
+
+  if gpus <= 1:
+    raise ValueError('For multi-gpu usage to be effective, '
+                     'call `multi_gpu_model` with `gpus >= 2`. '
+                     'Received: `gpus=%d`' % gpus)
+
+  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in range(gpus)]
+  available_devices = _get_available_devices()
+  available_devices = [
+      _normalize_device_name(name) for name in available_devices
+  ]
+  for device in target_devices:
+    if device not in available_devices:
+      raise ValueError('To call `multi_gpu_model` with `gpus=%d`, '
+                       'we expect the following devices to be available: %s. '
+                       'However this machine only has: %s. '
+                       'Try reducing `gpus`.' % (gpus, target_devices,
+                                                 available_devices))
+
+  def get_slice(data, i, parts):
+    """Slice an array into `parts` slices and return slice `i`.
+
+    Arguments:
+      data: array to slice.
+      i: index of slice to return.
+      parts: number of slices to make.
+
+    Returns:
+      Slice `i` of `data`.
+    """
+    shape = array_ops.shape(data)
+    batch_size = shape[:1]
+    input_shape = shape[1:]
+    step = batch_size // parts
+    if i == gpus - 1:
+      size = batch_size - step * i
+    else:
+      size = step
+    size = array_ops.concat([size, input_shape], axis=0)
+    stride = array_ops.concat([step, input_shape * 0], axis=0)
+    start = stride * i
+    return array_ops.slice(data, start, size)
+
+  all_outputs = []
+  for i in range(len(model.outputs)):
+    all_outputs.append([])
+
+  # Place a copy of the model on each GPU,
+  # each getting a slice of the inputs.
+  for i in range(gpus):
+    with ops.device('/gpu:%d' % i):
+      with ops.name_scope('replica_%d' % i):
+        inputs = []
+        # Retrieve a slice of the input.
+        for x in model.inputs:
+          input_shape = tuple(x.get_shape().as_list())[1:]
+          slice_i = Lambda(
+              get_slice,
+              output_shape=input_shape,
+              arguments={
+                  'i': i,
+                  'parts': gpus
+              })(x)
+          inputs.append(slice_i)
+
+        # Apply model on slice
+        # (creating a model replica on the target device).
+        outputs = model(inputs)
+        if not isinstance(outputs, list):
+          outputs = [outputs]
+
+        # Save the outputs for merging back together later.
+        for o in range(len(outputs)):
+          all_outputs[o].append(outputs[o])
+
+  # Merge outputs on CPU.
+  with ops.device('/cpu:0'):
+    merged = []
+    for outputs in all_outputs:
+      merged.append(concatenate(outputs, axis=0))
+    return Model(model.inputs, merged)
diff --git a/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py b/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..51fbd041a4943b1837c5f725a06c0c08fb9cb216
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/training_utils_test.py
@@ -0,0 +1,94 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for multi-gpu training utilities."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+from tensorflow.python.keras._impl import keras
+from tensorflow.python.platform import test
+
+
+class TestMultiGPUModel(test.TestCase):
+
+  def multi_gpu_test_simple_model(self):
+    gpus = 2
+    num_samples = 1000
+    input_dim = 10
+    output_dim = 1
+    hidden_dim = 10
+    epochs = 2
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(hidden_dim,
+                                   input_shape=(input_dim,)))
+      model.add(keras.layers.Dense(output_dim))
+
+      x = np.random.random((num_samples, input_dim))
+      y = np.random.random((num_samples, output_dim))
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus)
+
+      parallel_model.compile(loss='mse', optimizer='rmsprop')
+      parallel_model.fit(x, y, epochs=epochs)
+
+  def multi_gpu_test_multi_io_model(self):
+    gpus = 2
+    num_samples = 1000
+    input_dim_a = 10
+    input_dim_b = 5
+    output_dim_a = 1
+    output_dim_b = 2
+    hidden_dim = 10
+    epochs = 2
+
+    with self.test_session():
+      input_a = keras.Input((input_dim_a,))
+      input_b = keras.Input((input_dim_b,))
+      a = keras.layers.Dense(hidden_dim)(input_a)
+      b = keras.layers.Dense(hidden_dim)(input_b)
+      c = keras.layers.concatenate([a, b])
+      output_a = keras.layers.Dense(output_dim_a)(c)
+      output_b = keras.layers.Dense(output_dim_b)(c)
+      model = keras.models.Model([input_a, input_b], [output_a, output_b])
+
+      a_x = np.random.random((num_samples, input_dim_a))
+      b_x = np.random.random((num_samples, input_dim_b))
+      a_y = np.random.random((num_samples, output_dim_a))
+      b_y = np.random.random((num_samples, output_dim_b))
+
+      parallel_model = keras.utils.multi_gpu_model(model, gpus=gpus)
+      parallel_model.compile(loss='mse', optimizer='rmsprop')
+      parallel_model.fit([a_x, b_x], [a_y, b_y], epochs=epochs)
+
+  def multi_gpu_test_invalid_devices(self):
+    with self.test_session():
+      input_shape = (1000, 10)
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(10,
+                                   activation='relu',
+                                   input_shape=input_shape[1:]))
+      model.add(keras.layers.Dense(1, activation='sigmoid'))
+      model.compile(loss='mse', optimizer='rmsprop')
+
+      x = np.random.random(input_shape)
+      y = np.random.random((input_shape[0], 1))
+      with self.assertRaises(ValueError):
+        parallel_model = keras.utils.multi_gpu_model(
+            model, gpus=len(keras.backend._get_available_gpus()) + 1)
+        parallel_model.fit(x, y, epochs=2)
diff --git a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
index ce2faf2d96820d60d6652920ae1f27fa31dd2cad..d56c4484ce35d0c6af08d6199867b7845f367c88 100644
--- a/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/vis_utils.py
@@ -120,7 +120,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
     layer_id = str(id(layer))
     for i, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
       node_key = layer.name + '_ib-' + str(i)
-      if node_key in model.container_nodes:
+      if node_key in model._network_nodes:  # pylint: disable=protected-access
         for inbound_layer in node.inbound_layers:
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
diff --git a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
index ac7bd4940628fa206b08899908c1cdd72a368f07..31ef4773ad6481264aea09c72f955a5a6ef8a11d 100644
--- a/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/_impl/keras/wrappers/scikit_learn.py
@@ -352,5 +352,5 @@ class KerasRegressor(BaseWrapper):
     kwargs = self.filter_sk_params(Sequential.evaluate, kwargs)
     loss = self.model.evaluate(x, y, **kwargs)
     if isinstance(loss, list):
-      return loss[0]
-    return loss
+      return -loss[0]
+    return -loss
diff --git a/tensorflow/python/keras/applications/__init__.py b/tensorflow/python/keras/applications/__init__.py
index e34d9a8e0b9178a234ab6a6fc1090063363fa9b4..34f1435ffb6b65ef0e1399fb6893c3b791616f79 100644
--- a/tensorflow/python/keras/applications/__init__.py
+++ b/tensorflow/python/keras/applications/__init__.py
@@ -18,12 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.applications import inception_resnet_v2
 from tensorflow.python.keras.applications import inception_v3
 from tensorflow.python.keras.applications import mobilenet
 from tensorflow.python.keras.applications import resnet50
 from tensorflow.python.keras.applications import vgg16
 from tensorflow.python.keras.applications import vgg19
 from tensorflow.python.keras.applications import xception
+from tensorflow.python.keras.applications.inception_resnet_v2 import InceptionResNetV2
 from tensorflow.python.keras.applications.inception_v3 import InceptionV3
 from tensorflow.python.keras.applications.mobilenet import MobileNet
 from tensorflow.python.keras.applications.resnet50 import ResNet50
diff --git a/tensorflow/python/keras/applications/inception_resnet_v2/__init__.py b/tensorflow/python/keras/applications/inception_resnet_v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..223660e9bef33896bc83f43ed26c1792e48105b9
--- /dev/null
+++ b/tensorflow/python/keras/applications/inception_resnet_v2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""InceptionResNetV2 Keras application."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import decode_predictions
+from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import InceptionResNetV2
+from tensorflow.python.keras._impl.keras.applications.inception_resnet_v2 import preprocess_input
+
+del absolute_import
+del division
+del print_function
diff --git a/tensorflow/python/keras/datasets/__init__.py b/tensorflow/python/keras/datasets/__init__.py
index b76f278964b5f5ac7ea666fc12225f5bbd90ec58..69e10bd63c77de1e0c7104680f64e3e6f5e51ea3 100644
--- a/tensorflow/python/keras/datasets/__init__.py
+++ b/tensorflow/python/keras/datasets/__init__.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.keras.datasets import boston_housing
 from tensorflow.python.keras.datasets import cifar10
 from tensorflow.python.keras.datasets import cifar100
+from tensorflow.python.keras.datasets import fashion_mnist
 from tensorflow.python.keras.datasets import imdb
 from tensorflow.python.keras.datasets import mnist
 from tensorflow.python.keras.datasets import reuters
diff --git a/tensorflow/python/keras/datasets/fashion_mnist/__init__.py b/tensorflow/python/keras/datasets/fashion_mnist/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index acf0a5e1799b7c57dfd82861c9ccc1f132c34375..b94bf8f0f67a7a8ddbb351d13cb17ccdbf283260 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -134,6 +134,11 @@ from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool2D
 from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool3D
 
 # Recurrent layers.
+from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
+from tensorflow.python.keras._impl.keras.layers.recurrent import StackedRNNCells
+from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNNCell
+from tensorflow.python.keras._impl.keras.layers.recurrent import GRUCell
+from tensorflow.python.keras._impl.keras.layers.recurrent import LSTMCell
 from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNN
 from tensorflow.python.keras._impl.keras.layers.recurrent import GRU
 from tensorflow.python.keras._impl.keras.layers.recurrent import LSTM
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index a7c2179fe7ad434356921a5fb8709aa5b1f33498..91cc8607274a80a14dd27a64274da7f8f0aafab1 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -32,6 +32,7 @@ from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras._impl.keras.utils.np_utils import normalize
 from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras._impl.keras.utils.training_utils import multi_gpu_model
 from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model
 
 del absolute_import
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 63844177b72cacebc717665146c9e143517f80b8..31d3bd1b74988ff145352a3022b6a6477b862f80 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -420,7 +420,7 @@ tf_py_test(
 
 tf_py_test(
     name = "record_input_test",
-    size = "small",
+    size = "medium",
     srcs = ["record_input_test.py"],
     additional_deps = [
         "//tensorflow/python:client_testlib",
@@ -483,6 +483,7 @@ tf_py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
     ],
+    grpc_enabled = True,
 )
 
 tf_py_test(
@@ -505,6 +506,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "matrix_exponential_op_test",
+    size = "small",
+    srcs = ["matrix_exponential_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+)
+
 cuda_py_test(
     name = "matrix_inverse_op_test",
     size = "small",
@@ -664,6 +677,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
+        "//tensorflow/python:resource_variable_ops",
     ],
     tags = ["noasan"],  # http://b/32635055
 )
@@ -1174,6 +1188,7 @@ cuda_py_test(
     srcs = ["check_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:client_testlib",
@@ -1217,7 +1232,9 @@ cuda_py_test(
 
 cuda_py_test(
     name = "control_flow_ops_py_test",
-    size = "small",
+    # TOOD(b/70473603): change this back to "small" once the C API is
+    # permanently enabled
+    size = "medium",
     srcs = ["control_flow_ops_py_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1251,6 +1268,19 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "control_flow_util_test",
+    size = "small",
+    srcs = ["control_flow_util_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_ops_gen",
+        "//tensorflow/python:control_flow_util",
+        "//tensorflow/python:test_ops",
+    ],
+)
+
 cuda_py_test(
     name = "conv1d_test",
     size = "small",
@@ -1357,7 +1387,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "dynamic_partition_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["dynamic_partition_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1414,6 +1444,7 @@ cuda_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    grpc_enabled = True,
     tags = ["no_windows"],
 )
 
@@ -1618,6 +1649,8 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
@@ -2042,7 +2075,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "transpose_op_test",
-    size = "medium",
+    size = "large",
     srcs = ["transpose_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2050,6 +2083,11 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    shard_count = 2,
+    tags = [
+        "no_gpu",
+        "no_oss",
+    ],
 )
 
 cuda_py_test(
@@ -2325,6 +2363,7 @@ cuda_py_test(
         "//tensorflow/python:rnn_cell",
         "//tensorflow/python:sparse_grad",
         "//tensorflow/python:tensor_array_grad",
+        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
     ],
@@ -2347,7 +2386,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "slice_op_test",
-    size = "medium",
+    size = "large",
     srcs = ["slice_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2763,319 +2802,16 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "batch_dataset_op_test",
-    size = "small",
-    srcs = ["batch_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "dataset_constructor_op_test",
-    size = "small",
-    srcs = ["dataset_constructor_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-    tags = [
-        "manual",
-        "nomac",  # b/62040583
-    ],
-)
-
-tf_py_test(
-    name = "dataset_from_generator_op_test",
+    name = "garbage_collection_test",
     size = "small",
-    srcs = ["dataset_from_generator_op_test.py"],
+    srcs = ["garbage_collection_test.py"],
     additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-tf_py_test(
-    name = "filter_dataset_op_test",
-    size = "small",
-    srcs = ["filter_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "flat_map_dataset_op_test",
-    size = "small",
-    srcs = ["flat_map_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:session",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "list_files_dataset_op_test",
-    size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "map_dataset_op_test",
-    size = "small",
-    srcs = ["map_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "range_dataset_op_test",
-    size = "small",
-    srcs = ["range_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "reader_dataset_ops_test",
-    size = "small",
-    srcs = ["reader_dataset_ops_test.py"],
-    additional_deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-tf_py_test(
-    name = "sequence_dataset_op_test",
-    size = "small",
-    srcs = ["sequence_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "shuffle_dataset_op_test",
-    size = "small",
-    srcs = ["shuffle_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "shard_dataset_op_test",
-    size = "small",
-    srcs = ["shard_dataset_op_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "cache_dataset_op_test",
-    size = "small",
-    srcs = ["cache_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-)
-
-tf_py_test(
-    name = "zip_dataset_op_test",
-    size = "small",
-    srcs = ["zip_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "concatenate_dataset_op_test",
-    size = "small",
-    srcs = ["concatenate_dataset_op_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_test",
-    size = "small",
-    srcs = ["iterator_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:script_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:training",
-    ],
-)
-
-tf_py_test(
-    name = "iterator_ops_cluster_test",
-    size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
-    additional_deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = [
-        "no_oss",  # Test flaky due to port collisions.
-        "no_windows",
     ],
 )
 
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 8f4c94f318b64deab7c41f505088246f31874c5e..17492e9255ca9f8cdae65a9acab33ed9156de10c 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -33,10 +33,13 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
@@ -107,22 +110,41 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
   def setUp(self):
     self.rng = np.random.RandomState(42)
 
-  def CheckVersusNumpy(self, ndims_mask, arr_shape, make_mask=None):
+  def CheckVersusNumpy(self, ndims_mask, arr_shape, make_mask=None, axis=None):
     """Check equivalence between boolean_mask and numpy masking."""
     if make_mask is None:
       make_mask = lambda shape: self.rng.randint(0, 2, size=shape).astype(bool)
     arr = np.random.rand(*arr_shape)
     mask = make_mask(arr_shape[:ndims_mask])
-    masked_arr = arr[mask]
+    if axis is not None:
+      mask = make_mask(arr_shape[axis:ndims_mask + axis])
+    if axis is None or axis == 0:
+      masked_arr = arr[mask]
+    elif axis == 1:
+      masked_arr = arr[:, mask]
+    elif axis == 2:
+      masked_arr = arr[:, :, mask]
     with self.test_session():
-      masked_tensor = array_ops.boolean_mask(arr, mask)
+      masked_tensor = array_ops.boolean_mask(arr, mask, axis=axis)
 
       # Leading dimension size of masked_tensor is always unknown until runtime
       # since we don't how many elements will be kept.
-      self.assertAllEqual(masked_tensor.get_shape()[1:], masked_arr.shape[1:])
+      leading = 1 if axis is None else axis + 1
+      self.assertAllEqual(masked_tensor.get_shape()[leading:],
+                          masked_arr.shape[leading:])
 
       self.assertAllClose(masked_arr, masked_tensor.eval())
 
+  def testMaskDim1ArrDim2Axis1(self):
+    ndims_mask = 1
+    for arr_shape in [(1, 1), (2, 2), (2, 5)]:
+      self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
+
+  def testMaskDim2ArrDim2Axis1(self):
+    ndims_mask = 2
+    for arr_shape in [(1, 1), (2, 2), (2, 5)]:
+      self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
+
   def testMaskDim1ArrDim1(self):
     ndims_mask = 1
     for arr_shape in [(1,), (2,), (3,), (10,)]:
@@ -1060,5 +1082,58 @@ class PadTest(test_util.TensorFlowTestCase):
                            [0, 0, 0, 0, 0, 0, 0]])
 
 
+class InvertPermutationTest(test_util.TensorFlowTestCase):
+
+  def testInvertPermutation(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([3, 4, 0, 2, 1], dtype=dtype)
+        y = array_ops.invert_permutation(x)
+        self.assertAllEqual(y.get_shape(), [5])
+        self.assertAllEqual(y.eval(), [2, 4, 3, 0, 1])
+
+
+class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
+
+  def testSimple(self):
+    with self.test_session():
+      a = array_ops.constant(10)
+      guarantee_a = array_ops.guarantee_const(a)
+      self.assertEqual(10, guarantee_a.eval())
+
+  def testVariables(self):
+    with self.test_session() as sess:
+      for use_resource in [False, True]:
+        a = variable_scope.get_variable(
+            "var_{}".format(use_resource), [],
+            initializer=init_ops.constant_initializer(10.0),
+            use_resource=use_resource)
+        guarantee_a = array_ops.guarantee_const(a)
+        sess.run(variables.global_variables_initializer())
+        self.assertEqual(10.0, guarantee_a.eval())
+
+  def testResourceRejection(self):
+    with self.test_session() as sess:
+      a = variable_scope.get_variable(
+          "resource_var", [],
+          initializer=init_ops.constant_initializer(10.0),
+          use_resource=True)
+      guarantee_a = array_ops.guarantee_const(a.handle)
+      sess.run(variables.global_variables_initializer())
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               "cannot be a resource variable"):
+        guarantee_a.eval()
+
+
+class SnapshotOpTest(test_util.TensorFlowTestCase):
+
+  def testInvertPermutation(self):
+    for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([0, 1, 2, 3], dtype=dtype)
+        y = gen_array_ops._snapshot(x)
+        self.assertAllEqual(y.eval(), [0, 1, 2, 3])
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 7a610debd1d0c94cf7529e6c386f06fdfb11402f..2767df127e324fe54fb1b6d068e75588d4209f98 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import googletest
 class BincountTest(test_util.TensorFlowTestCase):
 
   def test_empty(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       self.assertAllEqual(
           math_ops.bincount([], minlength=5).eval(), [0, 0, 0, 0, 0])
       self.assertAllEqual(math_ops.bincount([], minlength=1).eval(), [0])
@@ -42,7 +42,7 @@ class BincountTest(test_util.TensorFlowTestCase):
           np.float64)
 
   def test_values(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       self.assertAllEqual(
           math_ops.bincount([1, 1, 1, 2, 2, 3]).eval(), [0, 3, 2, 1])
       arr = [1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
@@ -57,14 +57,14 @@ class BincountTest(test_util.TensorFlowTestCase):
           math_ops.bincount(np.arange(10000)).eval(), np.ones(10000))
 
   def test_maxlength(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       self.assertAllEqual(math_ops.bincount([5], maxlength=3).eval(), [0, 0, 0])
       self.assertAllEqual(math_ops.bincount([1], maxlength=3).eval(), [0, 1])
       self.assertAllEqual(math_ops.bincount([], maxlength=3).eval(), [])
 
   def test_random_with_weights(self):
     num_samples = 10000
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       np.random.seed(42)
       for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
         arr = np.random.randint(0, 1000, num_samples)
@@ -72,17 +72,27 @@ class BincountTest(test_util.TensorFlowTestCase):
           weights = np.random.randint(-100, 100, num_samples)
         else:
           weights = np.random.random(num_samples)
-        self.assertAllEqual(
-            math_ops.bincount(arr, weights).eval(),
-            np.bincount(arr, weights))
+        self.assertAllClose(
+            math_ops.bincount(arr, weights).eval(), np.bincount(arr, weights))
+
+  def test_random_without_weights(self):
+    num_samples = 10000
+    with self.test_session(use_gpu=True):
+      np.random.seed(42)
+      for dtype in [np.int32, np.float32]:
+        arr = np.random.randint(0, 1000, num_samples)
+        weights = np.ones(num_samples).astype(dtype)
+        self.assertAllClose(
+            math_ops.bincount(arr, None).eval(), np.bincount(arr, weights))
 
   def test_zero_weights(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       self.assertAllEqual(
           math_ops.bincount(np.arange(1000), np.zeros(1000)).eval(),
           np.zeros(1000))
 
   def test_negative(self):
+    # unsorted_segment_sum will only report InvalidArgumentError on CPU
     with self.test_session():
       with self.assertRaises(errors.InvalidArgumentError):
         math_ops.bincount([1, 2, 3, -1, 6, 8]).eval()
diff --git a/tensorflow/python/kernel_tests/bucketize_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
index 6db3592055f6b6bb163fb4a2367ff468d1601e15..e612b1c1349b95899cc4809155732474e50d4b84 100644
--- a/tensorflow/python/kernel_tests/bucketize_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -31,7 +31,7 @@ class BucketizationOpTest(test.TestCase):
         constant_op.constant([-5, 0, 2, 3, 5, 8, 10, 11, 12]),
         boundaries=[0, 3, 8, 11])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       self.assertAllEqual(expected_out, sess.run(op))
 
   def testFloat(self):
@@ -39,7 +39,7 @@ class BucketizationOpTest(test.TestCase):
         constant_op.constant([-5., 0., 2., 3., 5., 8., 10., 11., 12.]),
         boundaries=[0., 3., 8., 11.])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       self.assertAllEqual(expected_out, sess.run(op))
 
   def test2DInput(self):
@@ -47,13 +47,13 @@ class BucketizationOpTest(test.TestCase):
         constant_op.constant([[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]),
         boundaries=[0, 3, 8, 11])
     expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       self.assertAllEqual(expected_out, sess.run(op))
 
   def testInvalidBoundariesOrder(self):
     op = math_ops._bucketize(
         constant_op.constant([-5, 0]), boundaries=[0, 8, 3, 11])
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError, "Expected sorted boundaries"):
         sess.run(op)
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index c785f2358d5e659c71acf02457e2146616a9e880..214d5cb3c064dc4b046d09959eaa1d770bcabc3d 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -144,9 +144,9 @@ class CastOpTest(test.TestCase):
 
     self._compare(np.inf, np.float32, np.inf, False)
     self._compare(np.inf, np.float64, np.inf, False)
-    if sys.byteorder == "big":  
-      self._compare(np.inf, np.int32, i4.max, False)  
-      self._compare(np.inf, np.int64, i8.max, False)  
+    if sys.byteorder == "big":
+      self._compare(np.inf, np.int32, i4.max, False)
+      self._compare(np.inf, np.int64, i8.max, False)
     else:
       # np.float64("np.inf").astype(np.int32) is negative on x86 but positive on ppc64le
       # Numpy link to relevant discussion - https://github.com/numpy/numpy/issues/9040
@@ -156,7 +156,7 @@ class CastOpTest(test.TestCase):
         self._compare(-np.inf, np.int64, i8.min, False)
       else:
         self._compare(np.inf, np.int32, i4.min, False)
-        self._compare(np.inf, np.int64, i8.min, False)  
+        self._compare(np.inf, np.int64, i8.min, False)
     self._compare(-np.inf, np.float32, -np.inf, False)
     self._compare(-np.inf, np.float64, -np.inf, False)
     self._compare(-np.inf, np.int32, i4.min, False)
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index ed859e37741fe391c2f003a038a64eb292e385f1..7ce0f1e7b8a4df7c8c3acb36c0d46f60cbf0f703 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -20,10 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.platform import test
@@ -31,38 +34,45 @@ from tensorflow.python.platform import test
 
 class AssertProperIterableTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_tensor_raises(self):
     tensor = constant_op.constant(1)
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(tensor)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_sparse_tensor_raises(self):
     ten = sparse_tensor.SparseTensor(
         indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(ten)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_ndarray_raises(self):
     array = np.array([1, 2, 3])
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(array)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_single_string_raises(self):
     mystr = "hello"
     with self.assertRaisesRegexp(TypeError, "proper"):
       check_ops.assert_proper_iterable(mystr)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_non_iterable_object_raises(self):
     non_iterable = 1234
     with self.assertRaisesRegexp(TypeError, "to be iterable"):
       check_ops.assert_proper_iterable(non_iterable)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_list_does_not_raise(self):
     list_of_stuff = [
         constant_op.constant([11, 22]), constant_op.constant([1, 2])
     ]
     check_ops.assert_proper_iterable(list_of_stuff)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_generator_does_not_raise(self):
     generator_of_stuff = (constant_op.constant([11, 22]), constant_op.constant(
         [1, 2]))
@@ -71,110 +81,178 @@ class AssertProperIterableTest(test.TestCase):
 
 class AssertEqualTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal(self):
-    with self.test_session():
+    small = constant_op.constant([1, 2], name="small")
+    with ops.control_dependencies([check_ops.assert_equal(small, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
+
+  def test_returns_none_with_eager(self):
+    with context.eager_mode():
       small = constant_op.constant([1, 2], name="small")
-      with ops.control_dependencies([check_ops.assert_equal(small, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+      x = check_ops.assert_equal(small, small)
+      assert x is None
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_greater(self):
-    with self.test_session():
-      # Static check
-      static_small = constant_op.constant([1, 2], name="small")
-      static_big = constant_op.constant([3, 4], name="big")
-      with self.assertRaisesRegexp(ValueError, "fail"):
-        check_ops.assert_equal(static_big, static_small, message="fail")
-      # Dynamic check
-      small = array_ops.placeholder(dtypes.int32, name="small")
-      big = array_ops.placeholder(dtypes.int32, name="big")
-      with ops.control_dependencies(
-          [check_ops.assert_equal(
-              big, small, message="fail")]):
-        out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*big.*small"):
-        out.eval(feed_dict={small: [1, 2], big: [3, 4]})
-
+    # Static check
+    static_small = constant_op.constant([1, 2], name="small")
+    static_big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
+      check_ops.assert_equal(static_big, static_small, message="fail")
+
+    # Dynamic check
+    if context.in_graph_mode():
+      with self.test_session():
+        small = array_ops.placeholder(dtypes.int32, name="small")
+        big = array_ops.placeholder(dtypes.int32, name="big")
+        with ops.control_dependencies(
+            [check_ops.assert_equal(
+                big, small, message="fail")]):
+          out = array_ops.identity(small)
+        with self.assertRaisesOpError("fail.*big.*small"):
+          out.eval(feed_dict={small: [1, 2], big: [3, 4]})
+
+  def test_error_message_eager(self):
+    expected_error_msg_full = r"""big does not equal small
+Condition x == y did not hold.
+Indices of first 6 different values:
+\[\[0 0\]
+ \[1 1\]
+ \[2 0\]\]
+Corresponding x values:
+\[2 3 6\]
+Corresponding y values:
+\[20 30 60\]
+First 6 elements of x:
+\[2 2 3 3 6 6\]
+First 6 elements of y:
+\[20  2  3 30 60  6\]
+"""
+    expected_error_msg_short = r"""big does not equal small
+Condition x == y did not hold.
+Indices of first 2 different values:
+\[\[0 0\]
+ \[1 1\]\]
+Corresponding x values:
+\[2 3\]
+Corresponding y values:
+\[20 30\]
+First 2 elements of x:
+\[2 2\]
+First 2 elements of y:
+\[20  2\]
+"""
+    with context.eager_mode():
+      big = constant_op.constant([[2, 2], [3, 3], [6, 6]])
+      small = constant_op.constant([[20, 2], [3, 30], [60, 6]])
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_full):
+        check_ops.assert_equal(big, small, message="big does not equal small",
+                               summarize=10)
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   expected_error_msg_short):
+        check_ops.assert_equal(big, small, message="big does not equal small",
+                               summarize=2)
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less(self):
-    with self.test_session():
-      # Static check
-      static_small = constant_op.constant([3, 1], name="small")
-      static_big = constant_op.constant([4, 2], name="big")
-      with self.assertRaisesRegexp(ValueError, "fail"):
-        check_ops.assert_equal(static_big, static_small, message="fail")
-      # Dynamic check
-      small = array_ops.placeholder(dtypes.int32, name="small")
-      big = array_ops.placeholder(dtypes.int32, name="big")
-      with ops.control_dependencies([check_ops.assert_equal(small, big)]):
-        out = array_ops.identity(small)
-      with self.assertRaisesOpError("small.*big"):
-        out.eval(feed_dict={small: [3, 1], big: [4, 2]})
+    # Static check
+    static_small = constant_op.constant([3, 1], name="small")
+    static_big = constant_op.constant([4, 2], name="big")
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
+      check_ops.assert_equal(static_big, static_small, message="fail")
+
+    # Dynamic check
+    if context.in_graph_mode():
+      with self.test_session():
+        small = array_ops.placeholder(dtypes.int32, name="small")
+        big = array_ops.placeholder(dtypes.int32, name="big")
+        with ops.control_dependencies([check_ops.assert_equal(small, big)]):
+          out = array_ops.identity(small)
+        with self.assertRaisesOpError("small.*big"):
+          out.eval(feed_dict={small: [3, 1], big: [4, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      small_2 = constant_op.constant([1, 2], name="small_2")
-      with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([[1, 2], [1, 2]], name="small")
+    small_2 = constant_op.constant([1, 2], name="small_2")
+    with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_equal_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="small")
-      small_2 = constant_op.constant([1, 1], name="small_2")
-      with self.assertRaisesRegexp(ValueError, "must be"):
-        with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
-          out = array_ops.identity(small)
-        out.eval()
+    small = constant_op.constant([1, 1, 1], name="small")
+    small_2 = constant_op.constant([1, 1], name="small_2")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[3\] vs. \[2\]|"
+         r"Dimensions must be equal, but are 3 and 2")):
+      with ops.control_dependencies([check_ops.assert_equal(small, small_2)]):
+        out = array_ops.identity(small)
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies([check_ops.assert_equal(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies([check_ops.assert_equal(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertNoneEqualTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_not_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([10, 20], name="small")
-      with ops.control_dependencies(
-          [check_ops.assert_none_equal(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
-
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([10, 20], name="small")
+    with ops.control_dependencies(
+        [check_ops.assert_none_equal(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([3, 1], name="small")
+    small = constant_op.constant([3, 1], name="small")
+    with self.assertRaisesOpError("x != y did not hold"):
       with ops.control_dependencies(
           [check_ops.assert_none_equal(small, small)]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("x != y did not hold"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_not_equal_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3], name="big")
-      with ops.control_dependencies(
-          [check_ops.assert_none_equal(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
-
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3], name="big")
+    with ops.control_dependencies(
+        [check_ops.assert_none_equal(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_not_equal_but_non_broadcastable_shapes(self):
     with self.test_session():
       small = constant_op.constant([1, 1, 1], name="small")
       big = constant_op.constant([10, 10], name="big")
-      with self.assertRaisesRegexp(ValueError, "must be"):
+      # The exception in eager and non-eager mode is different because
+      # eager mode relies on shape check done as part of the C++ op, while
+      # graph mode does shape checks when creating the `Operation` instance.
+      with self.assertRaisesRegexp(
+          (ValueError, errors.InvalidArgumentError),
+          (r"Incompatible shapes: \[3\] vs. \[2\]|"
+           r"Dimensions must be equal, but are 3 and 2")):
         with ops.control_dependencies(
             [check_ops.assert_none_equal(small, big)]):
           out = array_ops.identity(small)
-        out.eval()
+        self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
     with self.test_session():
       larry = constant_op.constant([])
@@ -182,325 +260,363 @@ class AssertNoneEqualTest(test.TestCase):
       with ops.control_dependencies(
           [check_ops.assert_none_equal(larry, curly)]):
         out = array_ops.identity(larry)
-      out.eval()
+      self.evaluate(out)
+
+  def test_returns_none_with_eager(self):
+    with context.eager_mode():
+      t1 = constant_op.constant([1, 2])
+      t2 = constant_op.constant([3, 4])
+      x = check_ops.assert_none_equal(t1, t2)
+      assert x is None
 
 
 class AssertLessTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
+    small = constant_op.constant([1, 2], name="small")
+    with self.assertRaisesOpError("failure message.*\n*.* x < y did not hold"):
       with ops.control_dependencies(
           [check_ops.assert_less(
-              small, small, message="fail")]):
+              small, small, message="failure message")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*small.*small"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_greater(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("x < y did not hold"):
       with ops.control_dependencies([check_ops.assert_less(big, small)]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("big.*small"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_less(self):
-    with self.test_session():
-      small = constant_op.constant([3, 1], name="small")
-      big = constant_op.constant([4, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_less(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([3, 1], name="small")
+    big = constant_op.constant([4, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_less(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_less_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_less(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_less(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with self.assertRaisesRegexp(ValueError, "must be"):
-        with ops.control_dependencies([check_ops.assert_less(small, big)]):
-          out = array_ops.identity(small)
-        out.eval()
+    small = constant_op.constant([1, 1, 1], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (ValueError, errors.InvalidArgumentError),
+        (r"Incompatible shapes: \[3\] vs. \[2\]|"
+         "Dimensions must be equal, but are 3 and 2")):
+      with ops.control_dependencies([check_ops.assert_less(small, big)]):
+        out = array_ops.identity(small)
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies([check_ops.assert_less(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies([check_ops.assert_less(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
+
+  def test_returns_none_with_eager(self):
+    with context.eager_mode():
+      t1 = constant_op.constant([1, 2])
+      t2 = constant_op.constant([3, 4])
+      x = check_ops.assert_less(t1, t2)
+      assert x is None
 
 
 class AssertLessEqualTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      with ops.control_dependencies(
-          [check_ops.assert_less_equal(small, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    with ops.control_dependencies(
+        [check_ops.assert_less_equal(small, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_greater(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_less_equal(
               big, small, message="fail")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*big.*small"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_less_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_less_equal_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 1], name="big")
-      with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 1], name="big")
+    with ops.control_dependencies([check_ops.assert_less_equal(small, big)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="small")
-      big = constant_op.constant([3, 1], name="big")
-      with self.assertRaisesRegexp(ValueError, "must be"):
-        with ops.control_dependencies(
-            [check_ops.assert_less_equal(small, big)]):
-          out = array_ops.identity(small)
-        out.eval()
+    small = constant_op.constant([3, 1], name="small")
+    big = constant_op.constant([1, 1, 1], name="big")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[2\] vs. \[3\]|"
+         r"Dimensions must be equal, but are 2 and 3")):
+      with ops.control_dependencies(
+          [check_ops.assert_less_equal(small, big)]):
+        out = array_ops.identity(small)
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies(
-          [check_ops.assert_less_equal(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies(
+        [check_ops.assert_less_equal(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertGreaterTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
+    small = constant_op.constant([1, 2], name="small")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_greater(
               small, small, message="fail")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*small.*small"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("x > y did not hold"):
       with ops.control_dependencies([check_ops.assert_greater(small, big)]):
         out = array_ops.identity(big)
-      with self.assertRaisesOpError("small.*big"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater(self):
-    with self.test_session():
-      small = constant_op.constant([3, 1], name="small")
-      big = constant_op.constant([4, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_greater(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([3, 1], name="small")
+    big = constant_op.constant([4, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_greater(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies([check_ops.assert_greater(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies([check_ops.assert_greater(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_greater_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with self.assertRaisesRegexp(ValueError, "must be"):
-        with ops.control_dependencies([check_ops.assert_greater(big, small)]):
-          out = array_ops.identity(small)
-        out.eval()
+    small = constant_op.constant([1, 1, 1], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[2\] vs. \[3\]|"
+         r"Dimensions must be equal, but are 2 and 3")):
+      with ops.control_dependencies([check_ops.assert_greater(big, small)]):
+        out = array_ops.identity(small)
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies([check_ops.assert_greater(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies([check_ops.assert_greater(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertGreaterEqualTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      with ops.control_dependencies(
-          [check_ops.assert_greater_equal(small, small)]):
-        out = array_ops.identity(small)
-      out.eval()
+    small = constant_op.constant([1, 2], name="small")
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(small, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_less(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 4], name="big")
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 4], name="big")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_greater_equal(
               small, big, message="fail")]):
         out = array_ops.identity(small)
-      with self.assertRaisesOpError("fail.*small.*big"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater_equal(self):
-    with self.test_session():
-      small = constant_op.constant([1, 2], name="small")
-      big = constant_op.constant([3, 2], name="big")
-      with ops.control_dependencies(
-          [check_ops.assert_greater_equal(big, small)]):
-        out = array_ops.identity(small)
-      out.eval()
-
+    small = constant_op.constant([1, 2], name="small")
+    big = constant_op.constant([3, 2], name="big")
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_greater_equal_and_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1], name="small")
-      big = constant_op.constant([3, 1], name="big")
+    small = constant_op.constant([1], name="small")
+    big = constant_op.constant([3, 1], name="big")
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(big, small)]):
+      out = array_ops.identity(small)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
+    small = constant_op.constant([1, 1, 1], name="big")
+    big = constant_op.constant([3, 1], name="small")
+    # The exception in eager and non-eager mode is different because
+    # eager mode relies on shape check done as part of the C++ op, while
+    # graph mode does shape checks when creating the `Operation` instance.
+    with self.assertRaisesRegexp(
+        (errors.InvalidArgumentError, ValueError),
+        (r"Incompatible shapes: \[2\] vs. \[3\]|"
+         r"Dimensions must be equal, but are 2 and 3")):
       with ops.control_dependencies(
           [check_ops.assert_greater_equal(big, small)]):
         out = array_ops.identity(small)
-      out.eval()
-
-  def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
-    with self.test_session():
-      small = constant_op.constant([1, 1, 1], name="big")
-      big = constant_op.constant([3, 1], name="small")
-      with self.assertRaisesRegexp(ValueError, "Dimensions must be equal"):
-        with ops.control_dependencies(
-            [check_ops.assert_greater_equal(big, small)]):
-          out = array_ops.identity(small)
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_both_empty(self):
-    with self.test_session():
-      larry = constant_op.constant([])
-      curly = constant_op.constant([])
-      with ops.control_dependencies(
-          [check_ops.assert_greater_equal(larry, curly)]):
-        out = array_ops.identity(larry)
-      out.eval()
+    larry = constant_op.constant([])
+    curly = constant_op.constant([])
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(larry, curly)]):
+      out = array_ops.identity(larry)
+    self.evaluate(out)
 
 
 class AssertNegativeTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_negative(self):
-    with self.test_session():
-      frank = constant_op.constant([-1, -2], name="frank")
-      with ops.control_dependencies([check_ops.assert_negative(frank)]):
-        out = array_ops.identity(frank)
-      out.eval()
+    frank = constant_op.constant([-1, -2], name="frank")
+    with ops.control_dependencies([check_ops.assert_negative(frank)]):
+      out = array_ops.identity(frank)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_positive(self):
-    with self.test_session():
-      doug = constant_op.constant([1, 2], name="doug")
+    doug = constant_op.constant([1, 2], name="doug")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_negative(
               doug, message="fail")]):
         out = array_ops.identity(doug)
-      with self.assertRaisesOpError("fail.*doug"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_zero(self):
-    with self.test_session():
-      claire = constant_op.constant([0], name="claire")
+    claire = constant_op.constant([0], name="claire")
+    with self.assertRaisesOpError("x < 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_negative(claire)]):
         out = array_ops.identity(claire)
-      with self.assertRaisesOpError("claire"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is negative when it satisfies:
     #   For every element x_i in x, x_i < 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_negative(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_negative(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertPositiveTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_negative(self):
-    with self.test_session():
-      freddie = constant_op.constant([-1, -2], name="freddie")
+    freddie = constant_op.constant([-1, -2], name="freddie")
+    with self.assertRaisesOpError("fail"):
       with ops.control_dependencies(
           [check_ops.assert_positive(
               freddie, message="fail")]):
         out = array_ops.identity(freddie)
-      with self.assertRaisesOpError("fail.*freddie"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_positive(self):
-    with self.test_session():
-      remmy = constant_op.constant([1, 2], name="remmy")
-      with ops.control_dependencies([check_ops.assert_positive(remmy)]):
-        out = array_ops.identity(remmy)
-      out.eval()
+    remmy = constant_op.constant([1, 2], name="remmy")
+    with ops.control_dependencies([check_ops.assert_positive(remmy)]):
+      out = array_ops.identity(remmy)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_zero(self):
-    with self.test_session():
-      meechum = constant_op.constant([0], name="meechum")
+    meechum = constant_op.constant([0], name="meechum")
+    with self.assertRaisesOpError("x > 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_positive(meechum)]):
         out = array_ops.identity(meechum)
-      with self.assertRaisesOpError("meechum"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is positive when it satisfies:
     #   For every element x_i in x, x_i > 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_positive(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_positive(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertRankTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 1
-      with self.assertRaisesRegexp(ValueError,
-                                   "fail.*my_tensor.*must have rank 1"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank(
-                tensor, desired_rank, message="fail")]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 1
+    with self.assertRaisesRegexp(ValueError,
+                                 "fail.*must have rank 1"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank(
+              tensor, desired_rank, message="fail")]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -512,13 +628,13 @@ class AssertRankTest(test.TestCase):
         with self.assertRaisesOpError("fail.*my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 0
-      with ops.control_dependencies(
-          [check_ops.assert_rank(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 0
+    with ops.control_dependencies(
+        [check_ops.assert_rank(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -528,14 +644,14 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_too_large_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 0
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 0
+    with self.assertRaisesRegexp(ValueError, "rank"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_raises_if_rank_too_large_dynamic_rank(self):
     with self.test_session():
@@ -546,13 +662,13 @@ class AssertRankTest(test.TestCase):
         with self.assertRaisesOpError("my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 1
-      with ops.control_dependencies(
-          [check_ops.assert_rank(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 1
+    with ops.control_dependencies(
+        [check_ops.assert_rank(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -562,14 +678,14 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 2
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 2
+    with self.assertRaisesRegexp(ValueError, "rank"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -580,11 +696,11 @@ class AssertRankTest(test.TestCase):
         with self.assertRaisesOpError("my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_scalar_static(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
-        check_ops.assert_rank(tensor, np.array([], dtype=np.int32))
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
+      check_ops.assert_rank(tensor, np.array([], dtype=np.int32))
 
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.test_session():
@@ -596,12 +712,12 @@ class AssertRankTest(test.TestCase):
             [check_ops.assert_rank(tensor, rank_tensor)]):
           array_ops.identity(tensor).eval(feed_dict={rank_tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_integer_static(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      with self.assertRaisesRegexp(TypeError,
-                                   "must be of type <dtype: 'int32'>"):
-        check_ops.assert_rank(tensor, .5)
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    with self.assertRaisesRegexp(TypeError,
+                                 "must be of type <dtype: 'int32'>"):
+      check_ops.assert_rank(tensor, .5)
 
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.test_session():
@@ -617,14 +733,14 @@ class AssertRankTest(test.TestCase):
 
 class AssertRankInTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_raises_if_rank_mismatch_static_rank(self):
-    with self.test_session():
-      tensor_rank0 = constant_op.constant(42, name="my_tensor")
-      with self.assertRaisesRegexp(
-          ValueError, "fail.*my_tensor.*must have rank.*in.*1.*2"):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank0, (1, 2), message="fail")]):
-          array_ops.identity(tensor_rank0).eval()
+    tensor_rank0 = constant_op.constant(42, name="my_tensor")
+    with self.assertRaisesRegexp(
+        ValueError, "fail.*must have rank.*in.*1.*2"):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank0, (1, 2), message="fail")]):
+        self.evaluate(array_ops.identity(tensor_rank0))
 
   def test_rank_zero_tensor_raises_if_rank_mismatch_dynamic_rank(self):
     with self.test_session():
@@ -634,13 +750,13 @@ class AssertRankInTest(test.TestCase):
         with self.assertRaisesOpError("fail.*my_tensor.*rank"):
           array_ops.identity(tensor_rank0).eval(feed_dict={tensor_rank0: 42.0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_doesnt_raise_if_rank_matches_static_rank(self):
-    with self.test_session():
-      tensor_rank0 = constant_op.constant(42, name="my_tensor")
-      for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
-          array_ops.identity(tensor_rank0).eval()
+    tensor_rank0 = constant_op.constant(42, name="my_tensor")
+    for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
+        self.evaluate(array_ops.identity(tensor_rank0))
 
   def test_rank_zero_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.test_session():
@@ -650,13 +766,13 @@ class AssertRankInTest(test.TestCase):
             check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
           array_ops.identity(tensor_rank0).eval(feed_dict={tensor_rank0: 42.0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_doesnt_raise_if_rank_matches_static_rank(self):
-    with self.test_session():
-      tensor_rank1 = constant_op.constant([42, 43], name="my_tensor")
-      for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank1, desired_ranks)]):
-          array_ops.identity(tensor_rank1).eval()
+    tensor_rank1 = constant_op.constant([42, 43], name="my_tensor")
+    for desired_ranks in ((0, 1, 2), (1, 0, 2), (1, 2, 0)):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank1, desired_ranks)]):
+        self.evaluate(array_ops.identity(tensor_rank1))
 
   def test_rank_one_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.test_session():
@@ -668,13 +784,13 @@ class AssertRankInTest(test.TestCase):
               tensor_rank1: (42.0, 43.0)
           })
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_mismatches_static_rank(self):
-    with self.test_session():
-      tensor_rank1 = constant_op.constant((42, 43), name="my_tensor")
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies([
-            check_ops.assert_rank_in(tensor_rank1, (0, 2))]):
-          array_ops.identity(tensor_rank1).eval()
+    tensor_rank1 = constant_op.constant((42, 43), name="my_tensor")
+    with self.assertRaisesRegexp(ValueError, "rank"):
+      with ops.control_dependencies([
+          check_ops.assert_rank_in(tensor_rank1, (0, 2))]):
+        self.evaluate(array_ops.identity(tensor_rank1))
 
   def test_rank_one_tensor_raises_if_rank_mismatches_dynamic_rank(self):
     with self.test_session():
@@ -686,14 +802,14 @@ class AssertRankInTest(test.TestCase):
               tensor_rank1: (42.0, 43.0)
           })
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_scalar_static(self):
-    with self.test_session():
-      tensor = constant_op.constant((42, 43), name="my_tensor")
-      desired_ranks = (
-          np.array(1, dtype=np.int32),
-          np.array((2, 1), dtype=np.int32))
-      with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
-        check_ops.assert_rank_in(tensor, desired_ranks)
+    tensor = constant_op.constant((42, 43), name="my_tensor")
+    desired_ranks = (
+        np.array(1, dtype=np.int32),
+        np.array((2, 1), dtype=np.int32))
+    with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
+      check_ops.assert_rank_in(tensor, desired_ranks)
 
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.test_session():
@@ -710,12 +826,12 @@ class AssertRankInTest(test.TestCase):
               desired_ranks[1]: [2, 1],
           })
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_if_rank_is_not_integer_static(self):
-    with self.test_session():
-      tensor = constant_op.constant((42, 43), name="my_tensor")
-      with self.assertRaisesRegexp(TypeError,
-                                   "must be of type <dtype: 'int32'>"):
-        check_ops.assert_rank_in(tensor, (1, .5,))
+    tensor = constant_op.constant((42, 43), name="my_tensor")
+    with self.assertRaisesRegexp(TypeError,
+                                 "must be of type <dtype: 'int32'>"):
+      check_ops.assert_rank_in(tensor, (1, .5,))
 
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.test_session():
@@ -731,14 +847,14 @@ class AssertRankInTest(test.TestCase):
 
 class AssertRankAtLeastTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 1
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank at least 1"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 1
+    with self.assertRaisesRegexp(ValueError, "rank at least 1"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -749,13 +865,13 @@ class AssertRankAtLeastTest(test.TestCase):
         with self.assertRaisesOpError("my_tensor.*rank"):
           array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant(1, name="my_tensor")
-      desired_rank = 0
-      with ops.control_dependencies(
-          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant(1, name="my_tensor")
+    desired_rank = 0
+    with ops.control_dependencies(
+        [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -765,13 +881,13 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: 0})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_ten_doesnt_raise_raise_if_rank_too_large_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 0
-      with ops.control_dependencies(
-          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 0
+    with ops.control_dependencies(
+        [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_ten_doesnt_raise_if_rank_too_large_dynamic_rank(self):
     with self.test_session():
@@ -781,13 +897,13 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 1
-      with ops.control_dependencies(
-          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-        array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 1
+    with ops.control_dependencies(
+        [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+      self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.test_session():
@@ -797,14 +913,14 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         array_ops.identity(tensor).eval(feed_dict={tensor: [1, 2]})
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_rank_one_tensor_raises_if_rank_too_small_static_rank(self):
-    with self.test_session():
-      tensor = constant_op.constant([1, 2], name="my_tensor")
-      desired_rank = 2
-      with self.assertRaisesRegexp(ValueError, "my_tensor.*rank"):
-        with ops.control_dependencies(
-            [check_ops.assert_rank_at_least(tensor, desired_rank)]):
-          array_ops.identity(tensor).eval()
+    tensor = constant_op.constant([1, 2], name="my_tensor")
+    desired_rank = 2
+    with self.assertRaisesRegexp(ValueError, "rank at least 2"):
+      with ops.control_dependencies(
+          [check_ops.assert_rank_at_least(tensor, desired_rank)]):
+        self.evaluate(array_ops.identity(tensor))
 
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.test_session():
@@ -818,144 +934,165 @@ class AssertRankAtLeastTest(test.TestCase):
 
 class AssertNonNegativeTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_negative(self):
-    with self.test_session():
-      zoe = constant_op.constant([-1, -2], name="zoe")
+    zoe = constant_op.constant([-1, -2], name="zoe")
+    with self.assertRaisesOpError("x >= 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_non_negative(zoe)]):
         out = array_ops.identity(zoe)
-      with self.assertRaisesOpError("zoe"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_zero_and_positive(self):
-    with self.test_session():
-      lucas = constant_op.constant([0, 2], name="lucas")
-      with ops.control_dependencies([check_ops.assert_non_negative(lucas)]):
-        out = array_ops.identity(lucas)
-      out.eval()
+    lucas = constant_op.constant([0, 2], name="lucas")
+    with ops.control_dependencies([check_ops.assert_non_negative(lucas)]):
+      out = array_ops.identity(lucas)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is non-negative when it satisfies:
     #   For every element x_i in x, x_i >= 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_non_negative(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_non_negative(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertNonPositiveTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_zero_and_negative(self):
-    with self.test_session():
-      tom = constant_op.constant([0, -2], name="tom")
-      with ops.control_dependencies([check_ops.assert_non_positive(tom)]):
-        out = array_ops.identity(tom)
-      out.eval()
+    tom = constant_op.constant([0, -2], name="tom")
+    with ops.control_dependencies([check_ops.assert_non_positive(tom)]):
+      out = array_ops.identity(tom)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_positive(self):
-    with self.test_session():
-      rachel = constant_op.constant([0, 2], name="rachel")
+    rachel = constant_op.constant([0, 2], name="rachel")
+    with self.assertRaisesOpError("x <= 0 did not hold"):
       with ops.control_dependencies([check_ops.assert_non_positive(rachel)]):
         out = array_ops.identity(rachel)
-      with self.assertRaisesOpError("rachel"):
-        out.eval()
+      self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_doesnt_raise(self):
     # A tensor is non-positive when it satisfies:
     #   For every element x_i in x, x_i <= 0
     # and an empty tensor has no elements, so this is trivially satisfied.
     # This is standard set theory.
-    with self.test_session():
-      empty = constant_op.constant([], name="empty")
-      with ops.control_dependencies([check_ops.assert_non_positive(empty)]):
-        out = array_ops.identity(empty)
-      out.eval()
+    empty = constant_op.constant([], name="empty")
+    with ops.control_dependencies([check_ops.assert_non_positive(empty)]):
+      out = array_ops.identity(empty)
+    self.evaluate(out)
 
 
 class AssertIntegerTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_doesnt_raise_when_integer(self):
-    with self.test_session():
-      integers = constant_op.constant([1, 2], name="integers")
-      with ops.control_dependencies([check_ops.assert_integer(integers)]):
-        out = array_ops.identity(integers)
-      out.eval()
+    integers = constant_op.constant([1, 2], name="integers")
+    with ops.control_dependencies([check_ops.assert_integer(integers)]):
+      out = array_ops.identity(integers)
+    self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_raises_when_float(self):
-    with self.test_session():
-      floats = constant_op.constant([1.0, 2.0], name="floats")
-      with self.assertRaisesRegexp(TypeError, "Expected.*integer"):
-        check_ops.assert_integer(floats)
+    floats = constant_op.constant([1.0, 2.0], name="floats")
+    with self.assertRaisesRegexp(TypeError, "Expected.*integer"):
+      check_ops.assert_integer(floats)
+
+
+class AssertTypeTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_doesnt_raise_when_correct_type(self):
+    integers = constant_op.constant([1, 2], dtype=dtypes.int64)
+    with ops.control_dependencies([
+        check_ops.assert_type(integers, dtypes.int64)]):
+      out = array_ops.identity(integers)
+    self.evaluate(out)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_raises_when_wrong_type(self):
+    floats = constant_op.constant([1.0, 2.0], dtype=dtypes.float16)
+    with self.assertRaisesRegexp(TypeError, "must be of type.*float32"):
+      check_ops.assert_type(floats, dtypes.float32)
 
 
 class IsStrictlyIncreasingTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_constant_tensor_is_not_strictly_increasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_strictly_increasing([1, 1, 1]).eval())
+    self.assertFalse(self.evaluate(check_ops.is_strictly_increasing([1, 1, 1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_decreasing_tensor_is_not_strictly_increasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_strictly_increasing([1, 0, -1]).eval())
+    self.assertFalse(self.evaluate(
+        check_ops.is_strictly_increasing([1, 0, -1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_2d_decreasing_tensor_is_not_strictly_increasing(self):
-    with self.test_session():
-      self.assertFalse(
-          check_ops.is_strictly_increasing([[1, 3], [2, 4]]).eval())
+    self.assertFalse(
+        self.evaluate(check_ops.is_strictly_increasing([[1, 3], [2, 4]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_tensor_is_increasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_strictly_increasing([1, 2, 3]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_strictly_increasing([1, 2, 3])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_rank_two_tensor(self):
-    with self.test_session():
-      self.assertTrue(
-          check_ops.is_strictly_increasing([[-1, 2], [3, 4]]).eval())
+    self.assertTrue(
+        self.evaluate(check_ops.is_strictly_increasing([[-1, 2], [3, 4]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_tensor_with_one_element_is_strictly_increasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_strictly_increasing([1]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_strictly_increasing([1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_is_strictly_increasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_strictly_increasing([]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_strictly_increasing([])))
 
 
 class IsNonDecreasingTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_constant_tensor_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([1, 1, 1]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([1, 1, 1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_decreasing_tensor_is_not_non_decreasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_non_decreasing([3, 2, 1]).eval())
+    self.assertFalse(self.evaluate(check_ops.is_non_decreasing([3, 2, 1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_2d_decreasing_tensor_is_not_non_decreasing(self):
-    with self.test_session():
-      self.assertFalse(check_ops.is_non_decreasing([[1, 3], [2, 4]]).eval())
+    self.assertFalse(self.evaluate(
+        check_ops.is_non_decreasing([[1, 3], [2, 4]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_rank_one_tensor_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([1, 2, 3]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([1, 2, 3])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_increasing_rank_two_tensor(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([[-1, 2], [3, 3]]).eval())
+    self.assertTrue(self.evaluate(
+        check_ops.is_non_decreasing([[-1, 2], [3, 3]])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_tensor_with_one_element_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([1]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([1])))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_empty_tensor_is_non_decreasing(self):
-    with self.test_session():
-      self.assertTrue(check_ops.is_non_decreasing([]).eval())
+    self.assertTrue(self.evaluate(check_ops.is_non_decreasing([])))
 
 
 class FloatDTypeTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_assert_same_float_dtype(self):
     self.assertIs(dtypes.float32,
                   check_ops.assert_same_float_dtype(None, None))
@@ -1009,6 +1146,7 @@ class FloatDTypeTest(test.TestCase):
 
 class AssertScalarTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_assert_scalar(self):
     check_ops.assert_scalar(constant_op.constant(3))
     check_ops.assert_scalar(constant_op.constant("foo"))
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index d2eb3eb80170de26c6fa00fdace1254a51a2b0ec..a786d0a47e569f71812086fb93c21dc12660a2a5 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -87,6 +87,21 @@ class GenerateVocabRemappingTest(test.TestCase):
       self.assertAllEqual(expected_remapping, remapping.eval())
       self.assertAllEqual(expected_num_present, num_present.eval())
 
+  def test_generate_remapping_with_old_vocab_size(self):
+    """Tests where old_vocab_size is specified."""
+    remapping, num_present = gen_checkpoint_ops._generate_vocab_remapping(
+        new_vocab_file=self.new_vocab_file,
+        old_vocab_file=self.old_vocab_file,
+        num_new_vocab=3,
+        new_vocab_offset=0,
+        # Old vocabulary becomes ['knitting', 'eminem'].
+        old_vocab_size=2)
+    expected_remapping = [-1, 0, 1]
+    expected_num_present = 2
+    with self.test_session():
+      self.assertAllEqual(expected_remapping, remapping.eval())
+      self.assertAllEqual(expected_num_present, num_present.eval())
+
 
 class LoadAndRemapMatrixTest(test.TestCase):
   """Tests for the load_and_remap_matrix() op."""
diff --git a/tensorflow/python/kernel_tests/constant_op_eager_test.py b/tensorflow/python/kernel_tests/constant_op_eager_test.py
index 3b71586b55451df86bf214437be3ceec8a4265eb..8e9d75667d49bf9e377ccb9290a3a91786b5a1cb 100644
--- a/tensorflow/python/kernel_tests/constant_op_eager_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_eager_test.py
@@ -237,6 +237,39 @@ class ConstantTest(test.TestCase):
     self._testAll((1, x))
     self._testAll((x, 1))
 
+  def testInvalidLength(self):
+
+    class BadList(list):
+
+      def __init__(self):
+        super(BadList, self).__init__([1, 2, 3])  # pylint: disable=invalid-length-returned
+
+      def __len__(self):
+        return -1
+
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList()])
+    with self.assertRaisesRegexp(ValueError, "mixed types"):
+      constant_op.constant([1, 2, BadList()])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant(BadList())
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([[BadList(), 2], 3])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList(), [1, 2, 3]])
+    with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+      constant_op.constant([BadList(), []])
+
+    # TODO(allenl, josh11b): These cases should return exceptions rather than
+    # working (currently shape checking only checks the first element of each
+    # sequence recursively). Maybe the first one is fine, but the second one
+    # silently truncating is rather bad.
+
+    # with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+    #   constant_op.constant([[3, 2, 1], BadList()])
+    # with self.assertRaisesRegexp(ValueError, "should return >= 0"):
+    #   constant_op.constant([[], BadList()])
+
   def testSparseValuesRaiseErrors(self):
     with self.assertRaisesRegexp(ValueError, "non-rectangular Python sequence"):
       constant_op.constant([[1, 2], [3]], dtype=dtypes_lib.int32)
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 6167cb9999b1be2b1e8b530ebacfe9c4a5a2d8d1..68817cc2566847255d289f822aa69308e9c2e329 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -439,10 +439,10 @@ class ZerosLikeTest(test.TestCase):
 
   def testZerosLikeCPU(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
-        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8,
-        dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64,
-        dtypes_lib.string
+        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
+        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
+        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
+        dtypes_lib.complex128, dtypes_lib.string
     ]:
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
@@ -573,9 +573,10 @@ class OnesLikeTest(test.TestCase):
 
   def testOnesLike(self):
     for dtype in [
-        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int32,
-        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.int8,
-        dtypes_lib.complex64, dtypes_lib.complex128, dtypes_lib.int64
+        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int8,
+        dtypes_lib.uint8, dtypes_lib.int16, dtypes_lib.uint16, dtypes_lib.int32,
+        dtypes_lib.int64, dtypes_lib.bool, dtypes_lib.complex64,
+        dtypes_lib.complex128
     ]:
       numpy_dtype = dtype.as_numpy_dtype
       with self.test_session():
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index a21182beba3455f102bf179969018c72adf8e7d9..35ae89ed33fc2c2f1dbce1ee7bd724555b4fb0a2 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -68,16 +69,6 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.util import nest
 
 
-def check_op_order(graph):
-  """Sanity check on the ordering of op id."""
-
-  for op in graph.get_operations():
-    for v in op.inputs:
-      assert v.op._id < op._id or op.type == "Merge", (
-          "The id of %s must be less than the id of %s" % (v.op.name, op.name))
-  return True
-
-
 def check_consumers(graph):
   """Sanity check on the consumer list of the tensors."""
 
@@ -122,14 +113,16 @@ def opt_cfg():
               do_constant_folding=True)))
 
 
-def isum(s):
+def isum(s, maximum_iterations=None):
   i = constant_op.constant(0, name="i")
   c = lambda i, s: math_ops.less(i, 10)
   b = lambda i, s: [math_ops.add(i, 1), math_ops.add(i, s)]
-  _, r_s = control_flow_ops.while_loop(c, b, [i, s])
+  _, r_s = control_flow_ops.while_loop(
+      c, b, [i, s], maximum_iterations=maximum_iterations)
   return r_s
 
 
+@test_util.with_c_api
 class ControlFlowTest(test.TestCase):
 
   def testRefIdentity(self):
@@ -140,7 +133,6 @@ class ControlFlowTest(test.TestCase):
       op = state_ops.assign(v, 9)
       v2 = control_flow_ops.with_dependencies([op], v)
 
-      self.assertTrue(check_op_order(v.graph))
       self.assertTrue(isinstance(v2, ops.Tensor))
       variables.global_variables_initializer().run()
       self.assertEqual(9, v2.eval())
@@ -352,14 +344,20 @@ class ControlFlowTest(test.TestCase):
     grad = gradients_impl.gradients(y, [v])
     self.assertAllEqual([None], grad)
 
-  def testFetchables(self):
+  def testFetchable(self):
     with self.test_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
       control_flow_ops.cond(
           constant_op.constant(True), lambda: x + 2, lambda: x + 0)
-      tensor_names = all_fetchables()
-      for name in tensor_names:
-        sess.run(name, feed_dict={x: 3})
+      graph = ops.get_default_graph()
+      for op in graph.get_operations():
+        for t in op.inputs:
+          if graph.is_fetchable(t.op):
+            sess.run(t, feed_dict={x: 3})
+          else:
+            with self.assertRaisesRegexp(ValueError,
+                                         "has been marked as not fetchable"):
+              sess.run(t, feed_dict={x: 3})
 
   def testFeedable(self):
     with self.test_session() as sess:
@@ -390,7 +388,6 @@ class ControlFlowTest(test.TestCase):
 
       val = r.values.eval()
       ind = r.indices.eval()
-    self.assertTrue(check_op_order(x.values.graph))
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
@@ -437,7 +434,6 @@ class ControlFlowTest(test.TestCase):
 
       val = r.values.eval()
       ind = r.indices.eval()
-    self.assertTrue(check_op_order(x.values.graph))
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
@@ -466,7 +462,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
@@ -480,7 +475,6 @@ class ControlFlowTest(test.TestCase):
           math_ops.less(1, 0), lambda: math_ops.add(x, 1),
           lambda: math_ops.subtract(x, 1))
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
@@ -493,7 +487,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn3, fn2)
 
       result = r.eval()
-    self.assertTrue(check_op_order(x.graph))
     self.assertAllEqual(12, result)
 
   def testCond_4(self):
@@ -512,7 +505,6 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertEqual(len(r), 2)
       result = r[1].eval()
-      self.assertTrue(check_op_order(age.graph))
       self.assertAllEqual(True, result)
       self.assertAllEqual(7, v1.eval())
       self.assertAllEqual(2, v2.eval())
@@ -740,6 +732,21 @@ class ControlFlowTest(test.TestCase):
       r = isum(s)
       self.assertAllEqual(45, r.eval())
 
+  def testWhileWithMaximumIterations(self):
+    with self.test_session():
+      s = constant_op.constant([1, 2, 3, 4, 5])
+      r = isum(s, maximum_iterations=3)
+      self.assertAllEqual([1+3, 2+3, 3+3, 4+3, 5+3], r.eval())
+
+  def testWhileWithMaximumIterationsAndSingleArgument(self):
+    with self.test_session():
+      r = control_flow_ops.while_loop(
+          lambda i: i < 3,
+          lambda i: i + 1,
+          [0],
+          maximum_iterations=1)
+      self.assertEqual(1, r.eval())
+
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
   def testWhile_3(self):
@@ -760,7 +767,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, d),
                                       compute, [i, m, c, o])
       result = r[3].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(10100, result)
 
   def testWhile_4(self):
@@ -782,7 +788,6 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, s),
                                       compute, [i, m, c, o])
       result = r[3].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(42, result)
 
   def testWhile_5(self):
@@ -807,7 +812,6 @@ class ControlFlowTest(test.TestCase):
               tensor_shape.unknown_shape()
           ])
       result = r[2].eval()
-    self.assertTrue(check_op_order(i.graph))
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
   def testBufferForwarding(self):
@@ -908,7 +912,13 @@ class ControlFlowTest(test.TestCase):
       self.assertTrue(r[1].get_shape()[0].value is None)
       self.assertEqual(r[1].get_shape()[1], tensor_shape.Dimension(2))
 
-      with self.assertRaisesRegexp(ValueError, "not an invariant for"):
+      with self.assertRaisesRegexp(
+          ValueError,
+          r"The shape for while_1/Merge_1:0 is not an invariant for the loop. "
+          r"It enters the loop with shape \(2, 2\), but has shape \(4, 2\) "
+          r"after one iteration. Provide shape invariants using either the "
+          r"`shape_invariants` argument of tf.while_loop or set_shape\(\) on "
+          r"the loop variables."):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   def testWhileShapeInferenceSparseTensor(self):
@@ -1247,7 +1257,6 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      self.assertTrue(check_op_order(n.graph))
       variables.global_variables_initializer().run()
       self.assertEqual(3, r.eval())
       result = select.eval()
@@ -1272,7 +1281,6 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      self.assertTrue(check_op_order(n.graph))
       variables.global_variables_initializer().run()
       self.assertEqual(3, r.eval())
       result1 = select1.eval()
@@ -1299,7 +1307,6 @@ class ControlFlowTest(test.TestCase):
           parallel_iterations=1)
       variables.global_variables_initializer().run()
       result = r[1].eval()
-    self.assertTrue(check_op_order(n.graph))
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   # b/24814703
@@ -1444,7 +1451,8 @@ class ControlFlowTest(test.TestCase):
     gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
     ) else "/device:GPU:0"
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    graph = ops.Graph()
+    with graph.as_default():
       v = constant_op.constant(2.0, name="v")
       c = lambda v: math_ops.less(v, 100.0)
 
@@ -1455,7 +1463,8 @@ class ControlFlowTest(test.TestCase):
       loop = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
       r = gradients_impl.gradients(
           loop, v, colocate_gradients_with_ops=colocate)[0]
-    r_ops = r.graph.get_operations()
+
+    r_ops = graph.get_operations()
     r_devices = [(op.name, op.device) for op in r_ops]
 
     self.assertTrue(any("Square" in op.name for op in r_ops))
@@ -1469,7 +1478,9 @@ class ControlFlowTest(test.TestCase):
         self.assertTrue(gpu_dev_name in dev)
       else:
         self.assertFalse(gpu_dev_name in dev)
-    self.assertAllClose(1024.0, sess.run(r))
+
+    with self.test_session(graph=graph) as sess:
+      self.assertAllClose(1024.0, sess.run(r))
 
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
@@ -2616,6 +2627,124 @@ class ControlFlowTest(test.TestCase):
           1)
 
 
+@test_util.with_c_api
+class ControlFlowContextCheckTest(test.TestCase):
+
+  def _getWhileTensor(self):
+    """Creates and returns a tensor from a while context."""
+    tensor = []
+
+    def body(i):
+      if not tensor:
+        tensor.append(constant_op.constant(1))
+      return i + tensor[0]
+
+    control_flow_ops.while_loop(lambda i: i < 10, body, [0])
+    return tensor[0]
+
+  def _getCondTensor(self):
+    cond_tensor = []
+    def true_fn():
+      if not cond_tensor:
+        cond_tensor.append(constant_op.constant(1))
+      return cond_tensor[0]
+    control_flow_ops.cond(math_ops.less(1, 2), true_fn,
+                          lambda: constant_op.constant(0))
+    return cond_tensor[0]
+
+  def testInvalidContext(self):
+    # Accessing a while loop tensor outside of control flow is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while/Const_1' as input to 'Add' because 'while/Const_1' "
+        "is in a while loop. See info log for more details."):
+      math_ops.add(1, while_tensor)
+
+  def testInvalidContextInCond(self):
+    # Accessing a while loop tensor in cond is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while/Const_1' as input to 'cond/Add' because "
+        "'while/Const_1' is in a while loop. See info log for more details."):
+      # TODO(skyewm): this passes if we return while_tensor directly instead
+      # of using it as input to another op.
+      control_flow_ops.cond(math_ops.less(1, 2),
+                            lambda: math_ops.add(1, while_tensor),
+                            lambda: constant_op.constant(0))
+
+  def testInvalidContextInWhile(self):
+    # Accessing a while loop tensor in a different while loop is illegal.
+    while_tensor = self._getWhileTensor()
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while_1/Add' as input to 'while/Const_1' because they are "
+        "in different while loops. See info log for more details."):
+      control_flow_ops.while_loop(lambda i: i < 10,
+                                  lambda x: math_ops.add(1, while_tensor), [0])
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'while_2/NextIteration' as input to 'while/Const_1' "
+        "because they are in different while loops. See info log for more "
+        "details."):
+      control_flow_ops.while_loop(lambda i: i < 10, lambda i: while_tensor, [0])
+
+  def testValidCondContext(self):
+    # Accessing a tensor from a cond context is OK (although dangerous).
+    cond_tensor = self._getCondTensor()
+    math_ops.add(1, cond_tensor)
+
+  def testValidCondContextBranches(self):
+    # Accessing a tensor from a cond context from the other branch's cond
+    # context is OK (although dangerous).
+    cond_tensor = []
+    def branch_fn():
+      if not cond_tensor:
+        cond_tensor.append(constant_op.constant(1))
+      return cond_tensor[0]
+
+    control_flow_ops.cond(math_ops.less(1, 2), branch_fn, branch_fn)
+
+  def testValidWhileContext(self):
+    # Accessing a tensor in a nested while is OK.
+    def body(_):
+      c = constant_op.constant(1)
+      return control_flow_ops.while_loop(lambda i: i < 3, lambda i: i + c, [0])
+
+    control_flow_ops.while_loop(lambda i: i < 5, body, [0])
+
+  def testValidNestedContexts(self):
+    # Accessing a tensor from a cond context in a while context, all inside an
+    # outer while context, is OK.
+    def body(_):
+      cond_tensor = self._getCondTensor()
+      # Create another cond containing the while loop for good measure
+      return control_flow_ops.cond(
+          math_ops.less(1, 2),
+          lambda: control_flow_ops.while_loop(lambda i: i < 3,
+                                              lambda i: i + cond_tensor, [0]),
+          lambda: constant_op.constant(0))
+
+    control_flow_ops.while_loop(lambda i: i < 5, body, [0])
+
+  def testInvalidNestedContexts(self):
+    # Accessing a tensor from a while context in a different while context, all
+    # inside a cond context, is illegal.
+    def true_fn():
+      while_tensor = self._getWhileTensor()
+      return control_flow_ops.while_loop(lambda i: i < 3,
+                                         lambda i: i + while_tensor, [0])
+    with self.assertRaisesRegexp(
+        ValueError,
+        "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
+        " they are in different while loops. See info log for more details."):
+      control_flow_ops.cond(math_ops.less(1, 2), true_fn,
+                            lambda: constant_op.constant(0))
+
+
+@test_util.with_c_api
 class TupleTest(test.TestCase):
 
   def testTensors(self):
@@ -2701,6 +2830,7 @@ class TupleTest(test.TestCase):
       self.assertEquals(1, var.eval())
 
 
+@test_util.with_c_api
 class AssertTest(test.TestCase):
 
   def testGuardedAssertDoesNotCopyWhenTrue(self):
@@ -2738,6 +2868,7 @@ class AssertTest(test.TestCase):
       self.assertEqual([], guarded_memcpy_nodestat_names)
 
 
+@test_util.with_c_api
 class WhileOpBenchmark(test.Benchmark):
   """Evaluate the performance of while_loop op."""
 
@@ -2851,16 +2982,18 @@ class WhileOpBenchmark(test.Benchmark):
         name="unroll_same_device", iters=iters, wall_time=duration)
 
 
+@test_util.with_c_api
 class EagerTest(test.TestCase):
 
   def testCond(self):
     with context.eager_mode():
       pred = math_ops.less(1, 2)
-      fn1 = lambda: constant_op.constant(10)
-      fn2 = lambda: constant_op.constant(20)
+      fn1 = lambda: [constant_op.constant(10)]
+      fn2 = lambda: [constant_op.constant(20)]
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       self.assertAllEqual(r.numpy(), 10)
+      self.assertFalse(isinstance(r, list))
 
   def testWhileLoop(self):
     with context.eager_mode():
@@ -2868,6 +3001,22 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(isum(tensor).numpy(),
                           [46, 47, 48, 49, 50])
 
+  def testWhileLoopWithMaxIterations(self):
+    with context.eager_mode():
+      tensor = constant_op.constant([1, 2, 3, 4, 5])
+      self.assertAllEqual(isum(tensor, maximum_iterations=3).numpy(),
+                          [1+3, 2+3, 3+3, 4+3, 5+3])
+
+  def testWhileWithMaximumIterationsAndSingleArgument(self):
+    with context.eager_mode():
+      tensor = constant_op.constant(0)
+      r = control_flow_ops.while_loop(
+          lambda i: i < 3,
+          lambda i: i + 1,
+          [tensor],
+          maximum_iterations=1)
+      self.assertEqual(1, r.numpy())
+
   def testWithDependencies(self):
     with context.eager_mode():
       t1 = constant_op.constant(1)
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..39e96f74b0461da0cf499e303b30a4a41aae4899
--- /dev/null
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -0,0 +1,71 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for tensorflow.python.ops.control_flow_util."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_control_flow_ops
+from tensorflow.python.platform import test
+
+
+class ControlFlowUtilTest(test.TestCase):
+
+  def testIsSwitch(self):
+    switch_false, _ = control_flow_ops.switch(1, True)
+    switch = switch_false.op
+    self.assertTrue(control_flow_util.IsSwitch(switch))
+
+    ref_switch_false, _ = control_flow_ops.ref_switch(test_ops.ref_output(),
+                                                      True)
+    ref_switch = ref_switch_false.op
+    self.assertTrue(control_flow_util.IsSwitch(ref_switch))
+
+    self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
+
+  def testIsLoopEnter(self):
+    enter = gen_control_flow_ops.enter(1, frame_name="name").op
+    self.assertTrue(control_flow_util.IsLoopEnter(enter))
+    self.assertFalse(control_flow_util.IsLoopConstantEnter(enter))
+
+    ref_enter = gen_control_flow_ops.ref_enter(test_ops.ref_output(),
+                                               frame_name="name").op
+    self.assertTrue(control_flow_util.IsLoopEnter(ref_enter))
+    self.assertFalse(control_flow_util.IsLoopConstantEnter(ref_enter))
+
+    const_enter = gen_control_flow_ops.enter(1, frame_name="name",
+                                             is_constant=True).op
+    self.assertTrue(control_flow_util.IsLoopEnter(const_enter))
+    self.assertTrue(control_flow_util.IsLoopConstantEnter(const_enter))
+
+    self.assertFalse(control_flow_util.IsLoopEnter(test_ops.int_output().op))
+
+  def testIsLoopExit(self):
+    exit_op = control_flow_ops.exit(1).op
+    self.assertTrue(control_flow_util.IsLoopExit(exit_op))
+
+    ref_exit = control_flow_ops.exit(test_ops.ref_output()).op
+    self.assertTrue(control_flow_util.IsLoopExit(ref_exit))
+
+    self.assertFalse(control_flow_util.IsLoopExit(test_ops.int_output().op))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index 662c94eea7f08af15795ed5105e9ca67ecd8c0ce..d92797a7d38cbe359d8166ea9ad7c25bd9cd1f4b 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -17,6 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
@@ -37,7 +40,7 @@ class Conv1DTest(test.TestCase):
     filters = array_ops.expand_dims(filters, 2)  # out_channels
     # Filters is 2x1x1
     for stride in [1, 2]:
-      with self.test_session():
+      with self.test_session(use_gpu=test.is_gpu_available()):
         c = nn_ops.conv1d(x, filters, stride, padding="VALID")
         reduced = array_ops.squeeze(c)
         output = reduced.eval()
@@ -49,6 +52,46 @@ class Conv1DTest(test.TestCase):
           self.assertEqual(len(output), 2)
           self.assertAllClose(output, [2 * 1 + 1 * 2, 2 * 3 + 1 * 4])
 
+  def testConv1DTranspose(self):
+    with self.test_session():
+      stride = 2
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 9, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, stride=stride, padding="VALID")
+      value = output.eval()
+
+      cache_values = np.zeros(y_shape, dtype=np.float32)
+
+      # The amount of padding added
+      pad = 1
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(pad, y_shape[1] - pad):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % stride == 0 and w > pad and w < y_shape[1] - 1 - pad
+            if w_in:
+              target += 3.0
+            cache_values[n, w, k] = target
+
+          # copy values in the border
+          cache_values[n, 0, k] = cache_values[n, 1, k]
+          cache_values[n, -1, k] = cache_values[n, -2, k]
+
+    self.assertAllClose(cache_values, value)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
index 1679857bd5b9c5a9a1fbf89f207befc4382223b1..be299beee48cd8fb058393840eddfe08da1d6d99 100644
--- a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
@@ -42,17 +42,21 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
           filter_shape = [3, 3, 4, 6]
           # Make a convolution op with the current settings, just to easily get
           # the shape of the output.
-          conv_out = nn_ops.conv2d(in_val,
-                                   array_ops.zeros(filter_shape),
-                                   [1, stride, stride, 1], padding)
+          conv_out = nn_ops.conv2d(
+              in_val,
+              array_ops.zeros(filter_shape),
+              strides=[1, stride, stride, 1],
+              padding=padding)
           out_backprop_shape = conv_out.get_shape().as_list()
           out_backprop_val = constant_op.constant(
               2 * np.random.random_sample(out_backprop_shape) - 1,
               dtype=dtypes.float32)
-          output = nn_ops.conv2d_backprop_filter(in_val, filter_shape,
-                                                 out_backprop_val,
-                                                 [1, stride, stride, 1],
-                                                 padding)
+          output = nn_ops.conv2d_backprop_filter(
+              in_val,
+              filter_shape,
+              out_backprop_val,
+              strides=[1, stride, stride, 1],
+              padding=padding)
           err = gradient_checker.compute_gradient_error(
               [in_val, out_backprop_val], [in_shape, out_backprop_shape],
               output, filter_shape)
@@ -60,6 +64,42 @@ class Conv2DBackpropFilterGradTest(test.TestCase):
           err_tolerance = 2e-3
           self.assertLess(err, err_tolerance)
 
+  def testGradientDilatedConv(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        for padding in ["SAME", "VALID"]:
+          for stride in [1, 2]:
+            np.random.seed(1)
+            in_shape = [5, 8, 6, 4]
+            in_val = constant_op.constant(
+                2 * np.random.random_sample(in_shape) - 1, dtype=dtypes.float32)
+            filter_shape = [3, 3, 4, 6]
+            # Make a convolution op with the current settings,
+            # just to easily get the shape of the output.
+            conv_out = nn_ops.conv2d(
+                in_val,
+                array_ops.zeros(filter_shape),
+                dilations=[1, 2, 2, 1],
+                strides=[1, stride, stride, 1],
+                padding=padding)
+            out_backprop_shape = conv_out.get_shape().as_list()
+            out_backprop_val = constant_op.constant(
+                2 * np.random.random_sample(out_backprop_shape) - 1,
+                dtype=dtypes.float32)
+            output = nn_ops.conv2d_backprop_filter(
+                in_val,
+                filter_shape,
+                out_backprop_val,
+                dilations=[1, 2, 2, 1],
+                strides=[1, stride, stride, 1],
+                padding=padding)
+            err = gradient_checker.compute_gradient_error(
+                [in_val, out_backprop_val], [in_shape, out_backprop_shape],
+                output, filter_shape)
+            print("conv2d_backprop_filter gradient err = %g " % err)
+            err_tolerance = 2e-3
+            self.assertLess(err, err_tolerance)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index 14622ab4678864cd21257fe293a7984b39e59204..ec8ac74163d093c57e6e4ffbab6977ce732cc3ef 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 
+import numpy as np
+
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -45,8 +47,19 @@ def GetTestConfigs():
 
 class Conv3DTest(test.TestCase):
 
+  def _DtypesToTest(self, use_gpu):
+    if use_gpu:
+      if not test_util.CudaSupportsHalfMatMulAndConv():
+        return [dtypes.float32]
+      else:
+        # It is important that float32 comes before float16 here,
+        # as we will be using its gradients as reference for fp16 gradients.
+        return [dtypes.float32, dtypes.float16]
+    else:
+      return [dtypes.float64, dtypes.float32, dtypes.float16]
+
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, stride,
-                            padding, data_format, use_gpu):
+                            padding, data_format, dtype, use_gpu):
     total_size_1 = 1
     total_size_2 = 1
     for s in tensor_in_sizes:
@@ -54,13 +67,14 @@ class Conv3DTest(test.TestCase):
     for s in filter_in_sizes:
       total_size_2 *= s
 
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    # Initializes the input tensor with array containing numbers from 0 to 1.
+    # We keep the input tensor values fairly small to avoid overflowing float16
+    # during the conv3d.
+    x1 = [f * 1.0 / total_size_1 for f in range(1, total_size_1 + 1)]
+    x2 = [f * 1.0 / total_size_2 for f in range(1, total_size_2 + 1)]
     with self.test_session(use_gpu=use_gpu):
-      t1 = constant_op.constant(x1, shape=tensor_in_sizes)
-      t2 = constant_op.constant(x2, shape=filter_in_sizes)
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
 
       if isinstance(stride, collections.Iterable):
         strides = [1] + list(stride) + [1]
@@ -81,27 +95,33 @@ class Conv3DTest(test.TestCase):
                     expected):
     results = []
     for data_format, use_gpu in GetTestConfigs():
-      result = self._SetupValuesForDevice(
-          tensor_in_sizes,
-          filter_in_sizes,
-          stride,
-          padding,
-          data_format,
-          use_gpu=use_gpu)
-      results.append(result)
-      tolerance = 1e-2 if use_gpu else 1e-5
+      for dtype in self._DtypesToTest(use_gpu):
+        result = self._SetupValuesForDevice(
+            tensor_in_sizes,
+            filter_in_sizes,
+            stride,
+            padding,
+            data_format,
+            dtype,
+            use_gpu=use_gpu)
+        results.append(result)
+
       with self.test_session() as sess:
         values = sess.run(results)
         for value in values:
           print("expected = ", expected)
           print("actual = ", value)
-          self.assertAllClose(expected, value.flatten(), atol=tolerance,
-                              rtol=1e-6)
+          tol = 1e-6
+          if value.dtype == np.float16:
+            tol = 1e-3
+
+          self.assertAllClose(expected, value.flatten(), atol=tol, rtol=tol)
 
   def testConv3D1x1x1Filter(self):
     expected_output = [
-        30.0, 36.0, 42.0, 66.0, 81.0, 96.0, 102.0, 126.0, 150.0, 138.0, 171.0,
-        204.0, 174.0, 216.0, 258.0, 210.0, 261.0, 312.0
+        0.18518519, 0.22222222, 0.25925926, 0.40740741, 0.5, 0.59259259,
+        0.62962963, 0.77777778, 0.92592593, 0.85185185, 1.05555556, 1.25925926,
+        1.07407407, 1.33333333, 1.59259259, 1.2962963, 1.61111111, 1.92592593
     ]
 
     # These are equivalent to the Conv2D1x1 case.
@@ -127,8 +147,10 @@ class Conv3DTest(test.TestCase):
   # Expected values computed using scipy's correlate function.
   def testConv3D2x2x2Filter(self):
     expected_output = [
-        19554., 19962., 20370., 22110., 22590., 23070., 34890., 35730., 36570.,
-        37446., 38358., 39270., 50226., 51498., 52770., 52782., 54126., 55470.
+        3.77199074, 3.85069444, 3.92939815, 4.2650463, 4.35763889, 4.45023148,
+        6.73032407, 6.89236111, 7.05439815, 7.22337963, 7.39930556, 7.57523148,
+        9.68865741, 9.93402778, 10.17939815, 10.18171296, 10.44097222,
+        10.70023148
     ]
     # expected_shape = [1, 3, 1, 2, 5]
     self._VerifyValues(
@@ -140,69 +162,17 @@ class Conv3DTest(test.TestCase):
 
   def testConv3DStrides(self):
     expected_output = [
-        102.,
-        151.,
-        172.,
-        193.,
-        214.,
-        235.,
-        142.,
-        438.,
-        592.,
-        613.,
-        634.,
-        655.,
-        676.,
-        394.,
-        774.,
-        1033.,
-        1054.,
-        1075.,
-        1096.,
-        1117.,
-        646.,
-        1894.,
-        2503.,
-        2524.,
-        2545.,
-        2566.,
-        2587.,
-        1486.,
-        2230.,
-        2944.,
-        2965.,
-        2986.,
-        3007.,
-        3028.,
-        1738.,
-        2566.,
-        3385.,
-        3406.,
-        3427.,
-        3448.,
-        3469.,
-        1990.,
-        3686.,
-        4855.,
-        4876.,
-        4897.,
-        4918.,
-        4939.,
-        2830.,
-        4022.,
-        5296.,
-        5317.,
-        5338.,
-        5359.,
-        5380.,
-        3082.,
-        4358.,
-        5737.,
-        5758.,
-        5779.,
-        5800.,
-        5821.,
-        3334.,
+        0.06071429, 0.08988095, 0.10238095, 0.11488095, 0.12738095, 0.13988095,
+        0.08452381, 0.26071429, 0.35238095, 0.36488095, 0.37738095, 0.38988095,
+        0.40238095, 0.23452381, 0.46071429, 0.61488095, 0.62738095, 0.63988095,
+        0.65238095, 0.66488095, 0.38452381, 1.12738095, 1.48988095, 1.50238095,
+        1.51488095, 1.52738095, 1.53988095, 0.88452381, 1.32738095, 1.75238095,
+        1.76488095, 1.77738095, 1.78988095, 1.80238095, 1.03452381, 1.52738095,
+        2.01488095, 2.02738095, 2.03988095, 2.05238095, 2.06488095, 1.18452381,
+        2.19404762, 2.88988095, 2.90238095, 2.91488095, 2.92738095, 2.93988095,
+        1.68452381, 2.39404762, 3.15238095, 3.16488095, 3.17738095, 3.18988095,
+        3.20238095, 1.83452381, 2.59404762, 3.41488095, 3.42738095, 3.43988095,
+        3.45238095, 3.46488095, 1.98452381
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 5, 8, 7, 1],
@@ -212,7 +182,9 @@ class Conv3DTest(test.TestCase):
         expected=expected_output)
 
   def testConv3D2x2x2FilterStride2(self):
-    expected_output = [19554., 19962., 20370., 50226., 51498., 52770.]
+    expected_output = [
+        3.77199074, 3.85069444, 3.92939815, 9.68865741, 9.93402778, 10.17939815
+    ]
     self._VerifyValues(
         tensor_in_sizes=[1, 4, 2, 3, 3],
         filter_in_sizes=[2, 2, 2, 3, 3],
@@ -222,11 +194,12 @@ class Conv3DTest(test.TestCase):
 
   def testConv3DStride3(self):
     expected_output = [
-        36564., 38022., 39480., 37824., 39354., 40884., 39084., 40686., 42288.,
-        46644., 48678., 50712., 47904., 50010., 52116., 49164., 51342., 53520.,
-        107124., 112614., 118104., 108384., 113946., 119508., 109644., 115278.,
-        120912., 117204., 123270., 129336., 118464., 124602., 130740., 119724.,
-        125934., 132144.
+        1.51140873, 1.57167659, 1.63194444, 1.56349206, 1.62673611, 1.68998016,
+        1.6155754, 1.68179563, 1.74801587, 1.9280754, 2.01215278, 2.09623016,
+        1.98015873, 2.0672123, 2.15426587, 2.03224206, 2.12227183, 2.21230159,
+        4.4280754, 4.65500992, 4.88194444, 4.48015873, 4.71006944, 4.93998016,
+        4.53224206, 4.76512897, 4.99801587, 4.84474206, 5.09548611, 5.34623016,
+        4.8968254, 5.15054563, 5.40426587, 4.94890873, 5.20560516, 5.46230159
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 6, 7, 8, 2],
@@ -237,8 +210,8 @@ class Conv3DTest(test.TestCase):
 
   def testConv3D2x2x2FilterStride2Same(self):
     expected_output = [
-        19554., 19962., 20370., 10452., 10710., 10968., 50226., 51498., 52770.,
-        23844., 24534., 25224.
+        3.77199074, 3.85069444, 3.92939815, 2.0162037, 2.06597222, 2.11574074,
+        9.68865741, 9.93402778, 10.17939815, 4.59953704, 4.73263889, 4.86574074
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 4, 2, 3, 3],
@@ -248,7 +221,10 @@ class Conv3DTest(test.TestCase):
         expected=expected_output)
 
   def testKernelSmallerThanStride(self):
-    expected_output = [1., 3., 7., 9., 19., 21., 25., 27.]
+    expected_output = [
+        0.03703704, 0.11111111, 0.25925926, 0.33333333, 0.7037037, 0.77777778,
+        0.92592593, 1.
+    ]
     self._VerifyValues(
         tensor_in_sizes=[1, 3, 3, 3, 1],
         filter_in_sizes=[1, 1, 1, 1, 1],
@@ -263,9 +239,11 @@ class Conv3DTest(test.TestCase):
         expected=expected_output)
 
     expected_output = [
-        1484., 1592., 770., 2240., 2348., 1106., 1149., 1191., 539., 6776.,
-        6884., 3122., 7532., 7640., 3458., 3207., 3249., 1421., 3005., 3035.,
-        1225., 3215., 3245., 1309., 1013., 1022., 343.
+        0.54081633, 0.58017493, 0.28061224, 0.81632653, 0.85568513, 0.40306122,
+        0.41873178, 0.4340379, 0.19642857, 2.46938776, 2.50874636, 1.1377551,
+        2.74489796, 2.78425656, 1.26020408, 1.16873178, 1.1840379, 0.51785714,
+        1.09511662, 1.10604956, 0.44642857, 1.17164723, 1.18258017, 0.47704082,
+        0.3691691, 0.37244898, 0.125
     ]
     self._VerifyValues(
         tensor_in_sizes=[1, 7, 7, 7, 1],
@@ -274,7 +252,10 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         expected=expected_output)
 
-    expected_output = [1484., 1592., 2240., 2348., 6776., 6884., 7532., 7640.]
+    expected_output = [
+        0.540816, 0.580175, 0.816327, 0.855685, 2.469388, 2.508746, 2.744898,
+        2.784257
+    ]
     self._VerifyValues(
         tensor_in_sizes=[1, 7, 7, 7, 1],
         filter_in_sizes=[2, 2, 2, 1, 1],
@@ -288,7 +269,7 @@ class Conv3DTest(test.TestCase):
         filter_in_sizes=[2, 1, 2, 1, 2],
         stride=1,
         padding="VALID",
-        expected=[50, 60])
+        expected=[1.5625, 1.875])
 
   def _ConstructAndTestGradientForConfig(
       self, batch, input_shape, filter_shape, in_depth, out_depth, stride,
@@ -328,50 +309,58 @@ class Conv3DTest(test.TestCase):
     input_data = [x * 1.0 / input_size for x in range(0, input_size)]
     filter_data = [x * 1.0 / filter_size for x in range(0, filter_size)]
 
-    if test.is_gpu_available() and use_gpu:
-      data_type = dtypes.float32
+    for data_type in self._DtypesToTest(use_gpu=use_gpu):
       # TODO(mjanusz): Modify gradient_checker to also provide max relative
       # error and synchronize the tolerance levels between the tests for forward
       # and backward computations.
-      if test.is_gpu_available():
+      if data_type == dtypes.float64:
+        tolerance = 1e-8
+      elif data_type == dtypes.float32:
         tolerance = 5e-3
-      else:
-        # As of Aug 2016, higher tolerance is needed for some CPU architectures.
-        # Runs on a single machine can also generate slightly different errors
-        # because of multithreading.
-        tolerance = 8e-3
-    else:
-      data_type = dtypes.float64
-      tolerance = 1e-8
-    with self.test_session(use_gpu=use_gpu):
-      orig_input_tensor = constant_op.constant(
-          input_data, shape=input_shape, dtype=data_type, name="input")
-      filter_tensor = constant_op.constant(
-          filter_data, shape=filter_shape, dtype=data_type, name="filter")
-
-      if data_format == "NCDHW":
-        input_tensor = test_util.NHWCToNCHW(orig_input_tensor)
-        strides = test_util.NHWCToNCHW(strides)
-      else:
-        input_tensor = orig_input_tensor
-
-      conv = nn_ops.conv3d(
-          input_tensor, filter_tensor, strides, padding,
-          data_format=data_format, name="conv")
-
-      if data_format == "NCDHW":
-        conv = test_util.NCHWToNHWC(conv)
-
-      if test_input:
-        err = gradient_checker.compute_gradient_error(orig_input_tensor,
-                                                      input_shape,
-                                                      conv, output_shape)
-      else:
-        err = gradient_checker.compute_gradient_error(filter_tensor,
-                                                      filter_shape, conv,
-                                                      output_shape)
-    print("conv3d gradient error = ", err)
-    self.assertLess(err, tolerance)
+      elif data_type == dtypes.float16:
+        tolerance = 1e-3
+
+      with self.test_session(use_gpu=use_gpu):
+        orig_input_tensor = constant_op.constant(
+            input_data, shape=input_shape, dtype=data_type, name="input")
+        filter_tensor = constant_op.constant(
+            filter_data, shape=filter_shape, dtype=data_type, name="filter")
+
+        if data_format == "NCDHW":
+          input_tensor = test_util.NHWCToNCHW(orig_input_tensor)
+          new_strides = test_util.NHWCToNCHW(strides)
+        else:
+          input_tensor = orig_input_tensor
+          new_strides = strides
+
+        conv = nn_ops.conv3d(
+            input_tensor,
+            filter_tensor,
+            new_strides,
+            padding,
+            data_format=data_format,
+            name="conv")
+
+        if data_format == "NCDHW":
+          conv = test_util.NCHWToNHWC(conv)
+
+        if test_input:
+          jacob_t, jacob_n = gradient_checker.compute_gradient(
+              orig_input_tensor, input_shape, conv, output_shape)
+        else:
+          jacob_t, jacob_n = gradient_checker.compute_gradient(
+              filter_tensor, filter_shape, conv, output_shape)
+
+        if data_type != dtypes.float16:
+          reference_jacob_t = jacob_t
+          err = np.fabs(jacob_t - jacob_n).max()
+        else:
+          # Compare fp16 theoretical gradients to fp32 theoretical gradients,
+          # since fp16 numerical gradients are too imprecise.
+          err = np.fabs(jacob_t - reference_jacob_t).max()
+
+      print("conv3d gradient error = ", err)
+      self.assertLess(err, tolerance)
 
   def ConstructAndTestGradient(self, **kwargs):
     for data_format, use_gpu in GetTestConfigs():
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 22e5400c3745a735d783fef761276694dc830c32..a85134c288c83975771d141c9775b4711755dd4b 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import time
 
@@ -32,6 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -240,6 +242,77 @@ class Conv2DTest(test.TestCase):
     for i in range(1, len(values)):
       self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
 
+  def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
+                                   stride, dilation, padding, data_format,
+                                   use_gpu):
+    total_size_1 = 1
+    total_size_2 = 1
+    for s in tensor_in_sizes:
+      total_size_1 *= s
+    for s in filter_in_sizes:
+      total_size_2 *= s
+
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
+    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    with test_util.device(use_gpu):
+      t1 = constant_op.constant(x1, shape=tensor_in_sizes)
+      t2 = constant_op.constant(x2, shape=filter_in_sizes)
+      if isinstance(stride, collections.Iterable):
+        strides = list(stride)
+      else:
+        strides = [stride, stride]
+      if data_format == "NCHW":
+        t1 = test_util.NHWCToNCHW(t1)
+        full_strides = [1, 1] + strides
+        full_dilation = [1, 1] + dilation
+      else:
+        full_strides = [1] + strides + [1]
+        full_dilation = [1] + dilation + [1]
+      expected = nn_ops.convolution(
+          t1,
+          t2,
+          padding=padding,
+          strides=strides,
+          dilation_rate=dilation,
+          data_format=data_format)
+      computed = nn_ops.conv2d(
+          t1,
+          t2,
+          strides=full_strides,
+          dilations=full_dilation,
+          padding=padding,
+          data_format=data_format)
+      if data_format == "NCHW":
+        expected = test_util.NCHWToNHWC(expected)
+        computed = test_util.NCHWToNHWC(computed)
+    return expected, computed
+
+  def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, strides,
+                               padding, dilations):
+    expected_results = []
+    computed_results = []
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    for data_format, use_gpu in GetTestConfigs():
+      # If any dilation rate is larger than 1, only do test on the GPU
+      # because we currently do not have a CPU implementation for arbitrary
+      # dilation rates.
+      if default_dilations or use_gpu:
+        expected, computed = self._ComputeReferenceDilatedConv(
+            tensor_in_sizes, filter_in_sizes, strides, dilations, padding,
+            data_format, use_gpu)
+        expected_results.append(expected)
+        computed_results.append(computed)
+        tolerance = 1e-2 if use_gpu else 1e-5
+        expected_values = self.evaluate(expected_results)
+        computed_values = self.evaluate(computed_results)
+        for e_value, c_value in zip(expected_values, computed_values):
+          print("expected = ", e_value)
+          print("actual = ", c_value)
+          self.assertAllClose(
+              e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
+
   def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, strides, padding,
                     expected):
     tensors = []
@@ -279,6 +352,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Filter2x1Dilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 4, 4, 1],
+          filter_in_sizes=[2, 2, 1, 1],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2DEmpty(self):
     expected_output = []
@@ -289,6 +372,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DEmptyDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[0, 2, 3, 3],
+          filter_in_sizes=[1, 1, 3, 3],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2Filter(self):
     # The outputs are computed using third_party/py/IPython/notebook.
@@ -300,6 +393,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2FilterDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 2, 3, 3],
+          filter_in_sizes=[2, 2, 3, 3],
+          strides=[1, 1],
+          dilations=[1, 2],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D1x2Filter(self):
     # The outputs are computed using third_party/py/IPython/notebook.
@@ -314,6 +417,16 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=expected_output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D1x2FilterDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 2, 3, 3],
+          filter_in_sizes=[1, 2, 3, 3],
+          strides=[1, 1],
+          dilations=[2, 1],
+          padding="VALID")
+
   @test_util.run_in_graph_and_eager_modes()
   def testConv2D2x2FilterStride2(self):
     expected_output = [2271.0, 2367.0, 2463.0]
@@ -386,13 +499,23 @@ class Conv2DTest(test.TestCase):
         padding="VALID",
         expected=[50, 60])
 
-    # TODO this currently fails.
-    # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
-    #                   filter_in_sizes=[2, 2, 1, 1],
-    #                   strides=[4, 4], padding="SAME",
-    #                   expected=[72, 112, 392, 432])
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DKernelSizeMatchesInputSizeDilation(self):
+    if test.is_gpu_available(cuda_only=True):
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=[1, 3, 3, 1],
+          filter_in_sizes=[2, 2, 1, 2],
+          strides=[1, 1],
+          dilations=[2, 2],
+          padding="VALID")
+
+  # TODO this currently fails.
+  # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
+  #                   filter_in_sizes=[2, 2, 1, 1],
+  #                   strides=[4, 4], padding="SAME",
+  #                   expected=[72, 112, 392, 432])
 
-    # Testing for backprops
+  # Testing for backprops
   def _RunAndVerifyBackpropInput(self, input_sizes, filter_sizes, output_sizes,
                                  strides, padding, expected, data_format,
                                  use_gpu, err):
@@ -724,6 +847,255 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  # Testing for backprops
+  def _RunAndVerifyBackpropInputDilation(self, input_sizes, filter_sizes,
+                                         output_sizes, strides, dilations,
+                                         padding, data_format, use_gpu, err):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        conv_forward = nn_ops.conv2d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        conv_forward_2 = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCHW":
+          conv_forward = test_util.NCHWToNHWC(conv_forward)
+          conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
+        conv = gradients_impl.gradients(conv_forward, t1)[0]
+        conv_2 = gradients_impl.gradients(conv_forward_2, t1)[0]
+        # "values" consists of two tensors for two backprops
+        value = sess.run(conv)
+        value_2 = sess.run(conv_2)
+        self.assertShapeEqual(value, conv)
+        self.assertShapeEqual(value_2, conv_2)
+      print("expected = ", value_2)
+      print("actual = ", value)
+      self.assertArrayNear(value_2.flatten(), value.flatten(), err)
+
+  # Testing for backprops
+  def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes,
+                                          output_sizes, strides, dilations,
+                                          padding, data_format, use_gpu, err):
+    total_input_size = 1
+    total_filter_size = 1
+    for s in input_sizes:
+      total_input_size *= s
+    for s in filter_sizes:
+      total_filter_size *= s
+    # Initializes the input tensor with array containing incrementing
+    # numbers from 1.
+    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
+    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    default_dilations = (dilations[0] == 1 and dilations[1] == 1)
+    if default_dilations or use_gpu:
+      with self.test_session(use_gpu=use_gpu) as sess:
+        if data_format == "NCHW":
+          input_sizes = test_util.NHWCToNCHW(input_sizes)
+        t1 = constant_op.constant(x1, shape=input_sizes)
+        t2 = constant_op.constant(x2, shape=filter_sizes)
+        full_strides = [1] + strides + [1]
+        full_dilations = [1] + dilations + [1]
+        if data_format == "NCHW":
+          full_strides = test_util.NHWCToNCHW(full_strides)
+          full_dilations = test_util.NHWCToNCHW(full_dilations)
+        conv_forward = nn_ops.conv2d(
+            t1,
+            t2,
+            strides=full_strides,
+            dilations=full_dilations,
+            padding=padding,
+            data_format=data_format)
+        conv_forward_2 = nn_ops.convolution(
+            t1,
+            t2,
+            padding=padding,
+            strides=strides,
+            dilation_rate=dilations,
+            data_format=data_format)
+        if data_format == "NCHW":
+          conv_forward = test_util.NCHWToNHWC(conv_forward)
+          conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
+        conv = gradients_impl.gradients(conv_forward, t2)[0]
+        conv_2 = gradients_impl.gradients(conv_forward, t2)[0]
+        value = sess.run(conv)
+        value_2 = sess.run(conv_2)
+        self.assertShapeEqual(value, conv)
+        self.assertShapeEqual(value_2, conv_2)
+      print("expected = ", value_2)
+      print("actual = ", value)
+      self.assertArrayNear(value_2.flatten(), value.flatten(), err)
+
+  def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 5, 1],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth1ValidBackpropFilterDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DEmptyBackpropFilterDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 0],
+            output_sizes=[1, 1, 2, 0],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropFilterDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 4, 3],
+            filter_sizes=[2, 2, 3, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropFilterDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropFilterDilation(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 1, 1, 2],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropInputStride1x1Dilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 5, 1],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth1ValidBackpropInputDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2DEmptyBackpropInputDilation1x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[0, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[0, 1, 2, 1],
+            strides=[1, 1],
+            dilations=[1, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
+  def testConv2D2x2Depth3ValidBackpropInputDilation2x1(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        # The GPU version of this test is not very stable. So adjusting the
+        # error threshold to 1e-4.
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 2, 3],
+            filter_sizes=[2, 2, 3, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[1, 1],
+            dilations=[2, 1],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-4)
+
+  def testConv2DKernelSizeMatchesInputSizeBackpropInputDilation2x2(self):
+    if test.is_gpu_available(cuda_only=True):
+      for (data_format, use_gpu) in GetTestConfigs():
+        self._RunAndVerifyBackpropInputDilation(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 1, 1, 2],
+            strides=[1, 1],
+            dilations=[2, 2],
+            padding="VALID",
+            data_format=data_format,
+            use_gpu=use_gpu,
+            err=1e-5)
+
   # Gradient checkers
   def ConstructAndTestGradient(self, batch, input_rows, input_cols, filter_rows,
                                filter_cols, in_depth, out_depth, stride_rows,
@@ -1457,6 +1829,22 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
+def GetInceptionFwdDilatedConvTest(input_size, filter_size, stride, padding):
+
+  def Test(self):
+    if test.is_gpu_available(cuda_only=True) and stride == 1:
+      tf_logging.info("Testing InceptionFwd with dilations %s",
+                      (input_size, filter_size, stride, padding))
+      self._VerifyDilatedConvValues(
+          tensor_in_sizes=input_size,
+          filter_in_sizes=filter_size,
+          strides=[stride, stride],
+          dilations=[2, 2],
+          padding=padding)
+
+  return Test
+
+
 def GetInceptionBackInputTest(input_size, filter_size, output_size, stride,
                               padding,
                               gpu_only=False):
@@ -1497,6 +1885,10 @@ if __name__ == "__main__":
             test_util.run_in_graph_and_eager_modes()(
                 GetInceptionFwdTest(input_size_, filter_size_, stride_,
                                     padding_)))
+    setattr(
+        Conv2DTest, "testInceptionFwdDilatedConv_" + str(index),
+        test_util.run_in_graph_and_eager_modes()(GetInceptionFwdDilatedConvTest(
+            input_size_, filter_size_, stride_, padding_)))
     setattr(Conv2DTest, "testInceptionBackInput_" + str(index),
             test_util.run_in_graph_and_eager_modes()(
                 GetInceptionBackInputTest(input_size_, filter_size_,
@@ -1519,6 +1911,9 @@ if __name__ == "__main__":
   setattr(Conv2DTest, "testInceptionFwd_No_Winograd_Nonfused",
           test_util.run_in_graph_and_eager_modes()(
               GetInceptionFwdTest(ishape, fshape, 1, "SAME", gpu_only=True)))
+  setattr(Conv2DTest, "testInceptionFwdDilatedConv_No_Winograd_Nonfused",
+          test_util.run_in_graph_and_eager_modes()(
+              GetInceptionFwdDilatedConvTest(ishape, fshape, 1, "SAME")))
   setattr(Conv2DTest, "testInceptionBackInput_No_Winograd_Nonfused",
           test_util.run_in_graph_and_eager_modes()(
               GetInceptionBackInputTest(ishape, fshape, oshape, 1, "SAME",
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index 783492a6f255b7e665615e91d0d1db380e42b7a9..c67c26b7be0777587eb6d7c49119ad6cd2e22953 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
@@ -64,6 +65,81 @@ class DecodeBmpOpTest(test.TestCase):
       decoded = decode.eval()
       self.assertAllEqual(decoded, img_bytes)
 
+  def testGrayscale(self):
+    img_bytes = [[[255], [0]], [[255], [0]]]
+    encoded_bytes = [
+        0x42,
+        0x40,
+        0x3d,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0x36,
+        0,
+        0,
+        0,
+        0x28,
+        0,
+        0,
+        0,
+        0x2,
+        0,
+        0,
+        0,
+        0x2,
+        0,
+        0,
+        0,
+        0x1,
+        0,
+        0x8,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0x10,
+        0,
+        0,
+        0,
+        0x13,
+        0xb,
+        0,
+        0,
+        0x13,
+        0xb,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0,
+        0xff,
+        0,
+        0,
+        0,
+        0xff,
+        0,
+        0,
+        0,
+    ]
+
+    byte_string = bytes(bytearray(encoded_bytes))
+    img_in = constant_op.constant(byte_string, dtype=dtypes.string)
+    decode = image_ops.decode_bmp(img_in)
+
+    with self.test_session():
+      decoded = decode.eval()
+      self.assertAllEqual(decoded, img_bytes)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 792806642a4f495942700cf052deba04fff1ed74..7df2366954f3a6f3f37aef447479ba67c263025f 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -284,11 +284,16 @@ class DepthToSpaceTest(test.TestCase):
 class DepthToSpaceGradientTest(test.TestCase):
 
   # Check the gradients.
-  def _checkGrad(self, x, block_size):
+  def _checkGrad(self, x, block_size, data_format):
+    # NCHW is implemented for only GPU.
+    if data_format == "NCHW" and not test.is_gpu_available():
+      return
+
     assert 4 == x.ndim
     with self.test_session(use_gpu=True):
       tf_x = ops.convert_to_tensor(x)
-      tf_y = array_ops.depth_to_space(tf_x, block_size)
+      tf_y = array_ops.depth_to_space(tf_x, block_size, data_format=data_format)
+
       epsilon = 1e-2
       ((x_jacob_t, x_jacob_n)) = gradient_checker.compute_gradient(
           tf_x,
@@ -297,28 +302,32 @@ class DepthToSpaceGradientTest(test.TestCase):
           tf_y.get_shape().as_list(),
           x_init_value=x,
           delta=epsilon)
-
-    self.assertAllClose(x_jacob_t, x_jacob_n, rtol=1e-2, atol=epsilon)
+      self.assertAllClose(x_jacob_t, x_jacob_n, rtol=1e-2, atol=epsilon)
 
   # Tests a gradient for depth_to_space of x which is a four dimensional
   # tensor of shape [b, h, w, d * block_size * block_size].
-  def _compare(self, b, h, w, d, block_size):
+  def _compare(self, b, h, w, d, block_size, data_format):
     block_size_sq = block_size * block_size
-    x = np.random.normal(
-        0, 1, b * h * w * d * block_size_sq).astype(np.float32).reshape(
-            [b, h, w, d * block_size_sq])
+    data = np.random.normal(0, 1, b * h * w * d * block_size_sq).astype(
+        np.float32)
+    if data_format == "NHWC":
+      x = data.reshape([b, h, w, d * block_size_sq])
+    else:
+      x = data.reshape([b, d * block_size_sq, h, w])
 
-    self._checkGrad(x, block_size)
+    self._checkGrad(x, block_size, data_format)
 
   # Don't use very large numbers as dimensions here, as the result is tensor
   # with cartesian product of the dimensions.
   def testSmall(self):
     block_size = 2
-    self._compare(3, 2, 5, 3, block_size)
+    self._compare(3, 2, 5, 3, block_size, "NHWC")
+    self._compare(3, 2, 5, 3, block_size, "NCHW")
 
   def testSmall2(self):
     block_size = 3
-    self._compare(1, 2, 3, 2, block_size)
+    self._compare(1, 2, 3, 2, block_size, "NHWC")
+    self._compare(1, 2, 3, 2, block_size, "NCHW")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 3298092fbeac34e542dbab7ed204e293a6774229..f7ae1a0f37ebf8e0ca23bd7029879035a8b6701b 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -122,7 +122,9 @@ class DepthwiseConv2DTest(test.TestCase):
     x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
     x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
     with self.test_session(use_gpu=use_gpu) as sess:
-      if data_type == dtypes.float32:
+      if data_type == dtypes.float16:
+        tolerance = 1e-5
+      elif data_type == dtypes.float32:
         tolerance = 1e-5
       else:
         self.assertEqual(data_type, dtypes.float64)
@@ -169,7 +171,7 @@ class DepthwiseConv2DTest(test.TestCase):
                 padding) in enumerate(ConfigsToTest()):
       print("Testing DepthwiseConv2D,", index, "th config:", input_size, "*",
             filter_size, "stride:", stride, "padding:", padding)
-      for data_type in [dtypes.float32, dtypes.float64]:
+      for data_type in [dtypes.float16, dtypes.float32, dtypes.float64]:
         self._VerifyValues(
             input_size, filter_size, stride, padding, data_type, use_gpu=True)
 
@@ -181,7 +183,7 @@ class DepthwiseConv2DTest(test.TestCase):
                 padding) in enumerate(ConfigsToTest()):
       print("Testing DepthwiseConv2DFormat,", index, "th config:", input_size,
             "*", filter_size, "stride:", stride, "padding:", padding)
-      for data_type in [dtypes.float32, dtypes.float64]:
+      for data_type in [dtypes.float16, dtypes.float32, dtypes.float64]:
         self._VerifyValues(
             input_size,
             filter_size,
@@ -318,7 +320,9 @@ class DepthwiseConv2DTest(test.TestCase):
     input_data = [x * 1.0 / input_size for x in range(0, input_size)]
     filter_data = [x * 1.0 / filter_size for x in range(0, filter_size)]
     with self.test_session(use_gpu=use_gpu):
-      if data_type == dtypes.float32:
+      if data_type == dtypes.float16:
+        tolerance = 0.002
+      elif data_type == dtypes.float32:
         tolerance = 0.002
       else:
         self.assertEqual(data_type, dtypes.float64)
@@ -369,6 +373,8 @@ class DepthwiseConv2DTest(test.TestCase):
       print("Testing DepthwiseConv2DInputGrad,", index, "th config:",
             input_size, "*", filter_size, "stride:", stride, "padding:",
             padding)
+      # Note: float16 test for DepthwiseConv2DInputGrad is not enabled,
+      # calculations are not very precise.
       for data_type in [dtypes.float32, dtypes.float64]:
         self._ConstructAndTestGradient(
             input_size,
@@ -389,6 +395,8 @@ class DepthwiseConv2DTest(test.TestCase):
       print("Testing DepthwiseConv2DInputGradFormat,", index, "th config:",
             input_size, "*", filter_size, "stride:", stride, "padding:",
             padding)
+      # Note: float16 test for DepthwiseConv2DInputGradFormat is not enabled,
+      # calculations are not very precise.
       for data_type in [dtypes.float32, dtypes.float64]:
         self._ConstructAndTestGradient(
             input_size,
@@ -407,6 +415,8 @@ class DepthwiseConv2DTest(test.TestCase):
       print("Testing DepthwiseConv2DFilterGrad,", index, "th config:",
             input_size, "*", filter_size, "stride:", stride, "padding:",
             padding)
+      # Note: float16 test for DepthwiseConv2DFilterGrad is not enabled,
+      # calculations are not very precise.
       for data_type in [dtypes.float32, dtypes.float64]:
         self._ConstructAndTestGradient(
             input_size,
@@ -427,6 +437,8 @@ class DepthwiseConv2DTest(test.TestCase):
       print("Testing DepthwiseConv2DFilterGradFormat,", index, "th config:",
             input_size, "*", filter_size, "stride:", stride, "padding:",
             padding)
+      # Note: float16 test for DepthwiseConv2DFilterGradFormat is not enabled,
+      # calculations are not very precise.
       for data_type in [dtypes.float32, dtypes.float64]:
         self._ConstructAndTestGradient(
             input_size,
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index e21446c2efa3ab5cd8ab6ad359751e1be30243d0..e220d0569281c6dbe4107fdfb8013e99592f153c 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -193,6 +193,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["manual"],  # b/69001419
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index b1d8da771612fe42a153a1a11b6cb26bdcb983a0..d0fa1fe98996fd234f457bd0199fad5efc2547dc 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -59,13 +59,21 @@ class KLTest(test.TestCase):
     # pylint: disable=unused-argument,unused-variable
 
     with self.test_session():
-      a = MyDistException(loc=0.0, scale=1.0)
+      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=False)
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         kl.eval()
+      with self.assertRaisesOpError(
+          "KL calculation between .* and .* returned NaN values"):
+        a.kl_divergence(a).eval()
+      a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
       kl_ok = kullback_leibler.kl_divergence(a, a)
       self.assertAllEqual([float("nan")], kl_ok.eval())
+      self_kl_ok = a.kl_divergence(a)
+      self.assertAllEqual([float("nan")], self_kl_ok.eval())
+      cross_ok = a.cross_entropy(a)
+      self.assertAllEqual([float("nan")], cross_ok.eval())
 
   def testRegistrationFailures(self):
 
@@ -86,16 +94,22 @@ class KLTest(test.TestCase):
     for (k, v) in _DIVERGENCES.items():
       self.assertEqual(v, _registered_kl(*k))
 
-  def testIndirectRegistration(self):
+  def _testIndirectRegistration(self, fn):
 
     class Sub1(normal.Normal):
-      pass
+
+      def entropy(self):
+        return ""
 
     class Sub2(normal.Normal):
-      pass
+
+      def entropy(self):
+        return ""
 
     class Sub11(Sub1):
-      pass
+
+      def entropy(self):
+        return ""
 
     # pylint: disable=unused-argument,unused-variable
     @kullback_leibler.RegisterKL(Sub1, Sub1)
@@ -116,16 +130,30 @@ class KLTest(test.TestCase):
     sub2 = Sub2(loc=0.0, scale=1.0)
     sub11 = Sub11(loc=0.0, scale=1.0)
 
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub1, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub1))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub11, sub1))
-    self.assertEqual("sub1-2", kullback_leibler.kl_divergence(sub11, sub2))
-    self.assertEqual("sub2-1", kullback_leibler.kl_divergence(sub2, sub11))
-    self.assertEqual("sub1-1", kullback_leibler.kl_divergence(sub1, sub11))
+    self.assertEqual("sub1-1", fn(sub1, sub1))
+    self.assertEqual("sub1-2", fn(sub1, sub2))
+    self.assertEqual("sub2-1", fn(sub2, sub1))
+    self.assertEqual("sub1-1", fn(sub11, sub11))
+    self.assertEqual("sub1-1", fn(sub11, sub1))
+    self.assertEqual("sub1-2", fn(sub11, sub2))
+    self.assertEqual("sub1-1", fn(sub11, sub1))
+    self.assertEqual("sub1-2", fn(sub11, sub2))
+    self.assertEqual("sub2-1", fn(sub2, sub11))
+    self.assertEqual("sub1-1", fn(sub1, sub11))
+
+  def testIndirectRegistrationKLFun(self):
+    self._testIndirectRegistration(kullback_leibler.kl_divergence)
+
+  def testIndirectRegistrationKLSelf(self):
+    self._testIndirectRegistration(
+        lambda p, q: p.kl_divergence(q))
+
+  def testIndirectRegistrationCrossEntropy(self):
+    self._testIndirectRegistration(
+        lambda p, q: p.cross_entropy(q))
+
+  def testFunctionCrossEntropy(self):
+    self._testIndirectRegistration(kullback_leibler.cross_entropy)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index 614a34f077b5bd62205a9bf6ae9775e2f0fcead5..e24e8ade73a7ad762c877214f5ec3ee0848863fe 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -250,13 +250,11 @@ class MultinomialTest(test.TestCase):
     theta = np.array([[1., 2, 3],
                       [2.5, 4, 0.01]], dtype=np.float32)
     theta /= np.sum(theta, 1)[..., array_ops.newaxis]
-    # Ideally we'd be able to test broadcasting but, the multinomial sampler
-    # doesn't support different total counts.
-    n = np.float32(5)
+    n = np.array([[10., 9.], [8., 7.], [6., 5.]], dtype=np.float32)
     with self.test_session() as sess:
-      # batch_shape=[2], event_shape=[3]
+      # batch_shape=[3, 2], event_shape=[3]
       dist = multinomial.Multinomial(n, theta)
-      x = dist.sample(int(250e3), seed=1)
+      x = dist.sample(int(1000e3), seed=1)
       sample_mean = math_ops.reduce_mean(x, 0)
       x_centered = x - sample_mean[array_ops.newaxis, ...]
       sample_cov = math_ops.reduce_mean(math_ops.matmul(
@@ -283,17 +281,17 @@ class MultinomialTest(test.TestCase):
           dist.variance(),
           dist.stddev(),
       ])
-      self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.01)
-      self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.01)
-      self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.01)
-      self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.01)
+      self.assertAllClose(sample_mean_, analytic_mean, atol=0.01, rtol=0.01)
+      self.assertAllClose(sample_cov_, analytic_cov, atol=0.01, rtol=0.01)
+      self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
+      self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
 
   def testSampleUnbiasedNonScalarBatch(self):
     with self.test_session() as sess:
       dist = multinomial.Multinomial(
-          total_count=5.,
+          total_count=[7., 6., 5.],
           logits=math_ops.log(2. * self._rng.rand(4, 3, 2).astype(np.float32)))
-      n = int(3e3)
+      n = int(3e4)
       x = dist.sample(n, seed=0)
       sample_mean = math_ops.reduce_mean(x, 0)
       # Cyclically rotate event dims left.
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index dc462bae56b5fbc18036e80f6bbd4177b7b9fff2..2d434a39c2933832daebbe8f710de9553d0bf38d 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -24,6 +24,7 @@ import importlib
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
@@ -90,6 +91,21 @@ class NdtriTest(test.TestCase):
       x = special_math.ndtri(p)
       self.assertAllClose(expected_x, x.eval(), atol=0.)
 
+  def testNdtriDynamicShape(self):
+    """Verifies that ndtri computation is correct."""
+    with self.test_session() as sess:
+      if not special:
+        return
+
+      p = array_ops.placeholder(np.float32)
+      p_ = np.linspace(0., 1.0, 50).astype(np.float32)
+
+      x = special_math.ndtri(p)
+      x_ = sess.run(x, feed_dict={p: p_})
+
+      expected_x_ = special.ndtri(p_)
+      self.assertAllClose(expected_x_, x_, atol=0.)
+
   def _baseNdtriFiniteGradientTest(self, dtype):
     """Verifies that ndtri has finite gradients at interesting points."""
     g = ops.Graph()
@@ -316,6 +332,32 @@ class LogNdtrGradientTest(NdtrGradientTest):
   _use_log = True
 
 
+class ErfInvTest(test.TestCase):
+
+  def testErfInvValues(self):
+    with self.test_session():
+      if not special:
+        return
+
+      x = np.linspace(0., 1.0, 50).astype(np.float64)
+
+      expected_x = special.erfinv(x)
+      x = special_math.erfinv(x)
+      self.assertAllClose(expected_x, x.eval(), atol=0.)
+
+  def testErfInvIntegerInput(self):
+    with self.test_session():
+
+      with self.assertRaises(TypeError):
+        x = np.array([1, 2, 3]).astype(np.int32)
+        special_math.erfinv(x)
+
+      with self.assertRaises(TypeError):
+        x = np.array([1, 2, 3]).astype(np.int64)
+        special_math.erfinv(x)
+
+
+
 class LogCDFLaplaceTest(test.TestCase):
   # Note that scipy.stats.laplace does not have a stable Log CDF, so we cannot
   # rely on scipy to cross check the extreme values.
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 8fd26a1c9afe0ab701db199147e2de7c3ded3211..5950241141fc743631ea64938699cf6a020b2201 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -587,7 +587,7 @@ class FillTriangularTest(test.TestCase):
     x_ = np.asarray(x_)
     with self.test_session() as sess:
       static_shape = None if use_deferred_shape else x_.shape
-      x_pl = array_ops.placeholder(dtype=x_.dtype, shape=static_shape)
+      x_pl = array_ops.placeholder_with_default(x_, shape=static_shape)
       # Add `zeros_like(x)` such that x's value and gradient are identical. We
       # do this so we can ensure each gradient value is mapped to the right
       # gradient location.  (Not doing this means the gradient wrt `x` is simple
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index 48830957075e38a7bd683755f4657a3204303e02..b4fb5aa41175ba61ace0bff9a15d91ec4ee3ac55 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -33,13 +33,14 @@ from tensorflow.python.platform import test
 class DynamicPartitionTest(test.TestCase):
 
   def testSimpleOneDimensional(self):
-    with self.test_session() as sess:
-      data = constant_op.constant([0, 13, 2, 39, 4, 17])
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant([0, 13, 2, 39, 4, 17], dtype=dtypes.float32)
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([0, 13], partition_vals[0])
     self.assertAllEqual([17], partition_vals[1])
     self.assertAllEqual([2, 4], partition_vals[2])
@@ -52,14 +53,16 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual([None], partitions[3].get_shape().as_list())
 
   def testSimpleTwoDimensional(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
-                                   [12, 13, 14], [15, 16, 17]])
+                                   [12, 13, 14], [15, 16, 17]],
+                                  dtype=dtypes.float32)
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
       partition_vals = sess.run(partitions)
 
+    self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([[0, 1, 2], [3, 4, 5]], partition_vals[0])
     self.assertAllEqual([[15, 16, 17]], partition_vals[1])
     self.assertAllEqual([[6, 7, 8], [12, 13, 14]], partition_vals[2])
@@ -71,9 +74,84 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual([None, 3], partitions[2].get_shape().as_list())
     self.assertEqual([None, 3], partitions[3].get_shape().as_list())
 
+  def testLargeOneDimensional(self):
+    num = 100000
+    data_list = [x for x in range(num)]
+    indices_list = [x % 2 for x in range(num)]
+    part1 = [x for x in range(num) if x % 2 == 0]
+    part2 = [x for x in range(num) if x % 2 == 1]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=2)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(2, len(partition_vals))
+    self.assertAllEqual(part1, partition_vals[0])
+    self.assertAllEqual(part2, partition_vals[1])
+
+  def testLargeTwoDimensional(self):
+    rows = 100000
+    cols = 100
+    data_list = [None] * rows
+    for i in range(rows):
+      data_list[i] = [i for _ in range(cols)]
+    num_partitions = 97
+    indices_list = [(i ** 2) % num_partitions for i in range(rows)]
+    parts = [[] for _ in range(num_partitions)]
+    for i in range(rows):
+      parts[(i ** 2) % num_partitions].append(data_list[i])
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=num_partitions)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(num_partitions, len(partition_vals))
+    for i in range(num_partitions):
+      # reshape because of empty parts
+      parts_np = np.array(parts[i], dtype=np.float).reshape(-1, cols)
+      self.assertAllEqual(parts_np, partition_vals[i])
+
+  def testSimpleComplex(self):
+    data_list = [1 + 2j, 3 + 4j, 5 + 6j, 7 + 8j]
+    indices_list = [1, 0, 1, 0]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.complex64)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=2)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(2, len(partition_vals))
+    self.assertAllEqual([3 + 4j, 7 + 8j], partition_vals[0])
+    self.assertAllEqual([1 + 2j, 5 + 6j], partition_vals[1])
+
+  def testScalarPartitions(self):
+    data_list = [10, 13, 12, 11]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float64)
+      indices = 3
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=4)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(4, len(partition_vals))
+    self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
+                        partition_vals[0])
+    self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
+                        partition_vals[1])
+    self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
+                        partition_vals[2])
+    self.assertAllEqual(np.array([10, 13, 12, 11],
+                                 dtype=np.float64).reshape(-1, 4),
+                        partition_vals[3])
+
   def testHigherRank(self):
     np.random.seed(7)
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=True) as sess:
       for n in 2, 3:
         for shape in (4,), (4, 5), (4, 5, 2):
           partitions = np.random.randint(n, size=np.prod(shape)).reshape(shape)
@@ -95,6 +173,115 @@ class DynamicPartitionTest(test.TestCase):
             self.assertEqual(grads[1], None)  # Partitions has no gradients
             self.assertAllEqual(7 * data, sess.run(grads[0]))
 
+  def testEmptyParts(self):
+    data_list = [1, 2, 3, 4]
+    indices_list = [1, 3, 1, 3]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=4)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(4, len(partition_vals))
+    self.assertAllEqual([], partition_vals[0])
+    self.assertAllEqual([1, 3], partition_vals[1])
+    self.assertAllEqual([], partition_vals[2])
+    self.assertAllEqual([2, 4], partition_vals[3])
+
+  def testEmptyDataTwoDimensional(self):
+    data_list = [[], []]
+    indices_list = [0, 1]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=3)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(3, len(partition_vals))
+    self.assertAllEqual([[]], partition_vals[0])
+    self.assertAllEqual([[]], partition_vals[1])
+    self.assertAllEqual(np.array([], dtype=np.float).reshape(0, 0),
+                        partition_vals[2])
+
+  def testEmptyPartitions(self):
+    data_list = []
+    indices_list = []
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=2)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(2, len(partition_vals))
+    self.assertAllEqual([], partition_vals[0])
+    self.assertAllEqual([], partition_vals[1])
+
+  def testGPUTooManyParts(self):
+    # This test only makes sense on the GPU. There we do not check
+    # for errors. In this case, we should discard all but the first
+    # num_partitions indices.
+    if not test.is_gpu_available():
+      return
+
+    data_list = [1, 2, 3, 4, 5, 6]
+    indices_list = [6, 5, 4, 3, 1, 0]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=2)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(2, len(partition_vals))
+    self.assertAllEqual([6], partition_vals[0])
+    self.assertAllEqual([5], partition_vals[1])
+
+  def testGPUPartsTooLarge(self):
+    # This test only makes sense on the GPU. There we do not check
+    # for errors. In this case, we should discard all the values
+    # larger than num_partitions.
+    if not test.is_gpu_available():
+      return
+
+    data_list = [1, 2, 3, 4, 5, 6]
+    indices_list = [10, 11, 2, 12, 0, 1000]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=5)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(5, len(partition_vals))
+    self.assertAllEqual([5], partition_vals[0])
+    self.assertAllEqual([], partition_vals[1])
+    self.assertAllEqual([3], partition_vals[2])
+    self.assertAllEqual([], partition_vals[3])
+    self.assertAllEqual([], partition_vals[4])
+
+  def testGPUAllIndicesBig(self):
+    # This test only makes sense on the GPU. There we do not check
+    # for errors. In this case, we should discard all the values
+    # and have an empty output.
+    if not test.is_gpu_available():
+      return
+
+    data_list = [1.1, 2.1, 3.1, 4.1, 5.1, 6.1]
+    indices_list = [90, 70, 60, 100, 110, 40]
+    with self.test_session(use_gpu=True) as sess:
+      data = constant_op.constant(data_list, dtype=dtypes.float32)
+      indices = constant_op.constant(indices_list, dtype=dtypes.int32)
+      partitions = data_flow_ops.dynamic_partition(
+          data, indices, num_partitions=40)
+      partition_vals = sess.run(partitions)
+
+    self.assertEqual(40, len(partition_vals))
+    for i in range(40):
+      self.assertAllEqual([], partition_vals[i])
+
   def testErrorIndexOutOfRange(self):
     with self.test_session() as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
diff --git a/tensorflow/python/kernel_tests/garbage_collection_test.py b/tensorflow/python/kernel_tests/garbage_collection_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..39f936fbc92af597f967024cce9bca82ede15440
--- /dev/null
+++ b/tensorflow/python/kernel_tests/garbage_collection_test.py
@@ -0,0 +1,63 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests which set DEBUG_SAVEALL and assert no garbage was created.
+
+This flag seems to be sticky, so these tests have been isolated for now.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.platform import test
+
+
+class NoReferenceCycleTests(test_util.TensorFlowTestCase):
+
+  @test_util.assert_no_garbage_created
+  def testEagerResourceVariables(self):
+    with context.eager_mode():
+      resource_variable_ops.ResourceVariable(1.0, name="a")
+
+  @test_util.assert_no_garbage_created
+  def testTensorArrays(self):
+    with context.eager_mode():
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=3,
+          infer_shape=False)
+
+      w0 = ta.write(0, [[4.0, 5.0]])
+      w1 = w0.write(1, [[1.0]])
+      w2 = w1.write(2, -3.0)
+
+      r0 = w2.read(0)
+      r1 = w2.read(1)
+      r2 = w2.read(2)
+
+      d0, d1, d2 = self.evaluate([r0, r1, r2])
+      self.assertAllEqual([[4.0, 5.0]], d0)
+      self.assertAllEqual([[1.0]], d1)
+      self.assertAllEqual(-3.0, d2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index af5e23c926c0ca8352426549c91994855dd27855..91ebe8de9921268b2a3c5ad645585e1fe83c7419 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
@@ -185,6 +186,9 @@ class GatherNdTest(test.TestCase):
     self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
     self.assertEqual([10, 10, 20], gather_nd_t.get_shape())
 
+  def assertIndexedSlices(self, t):
+    self.assertIsInstance(t, ops.IndexedSlices)
+
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
@@ -233,7 +237,8 @@ class GatherNdTest(test.TestCase):
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array([[3, 4], [1, 2]], dtype=np.float64)
     with self.test_session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertIndexedSlices(grads)
+      self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
   def testGradientsRank3Elements(self):
     indices = constant_op.constant(
@@ -250,6 +255,35 @@ class GatherNdTest(test.TestCase):
     with self.test_session(use_gpu=True):
       self.assertAllEqual(expected_grads, grads.eval())
 
+  def testGradientsRank7Elements(self):
+    # Shape [1,1,2,1,1,2,2]
+    indices = constant_op.constant(
+        [[[
+            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
+        ]]],
+        dtype=dtypes.int32)
+    inputs = constant_op.constant(
+        [[[
+            [[[[1, 3], [5, 7]]]],
+            [[[[2, 4], [6, 8]]]]
+        ]]], dtype=dtypes.float64)
+    outputs = array_ops.gather_nd(inputs, indices)
+
+    grad_vals = constant_op.constant(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=dtypes.float64)
+    grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
+    expected_grads = np.array(
+        [[[
+            [[[[5, 6], [1, 2]]]],
+            [[[[3, 4], [7, 8]]]]
+        ]]], dtype=np.float64)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(expected_grads, grads.eval())
+
   def testGradientsInt64Indices(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int64)
@@ -284,7 +318,8 @@ class GatherNdTest(test.TestCase):
          [0, 0, 0, 0, 0, 0, 0, 0, 0], [3, 3, 3, 3, 3, 3, 3, 3, 3]],
         dtype=np.float64)
     with self.test_session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertIndexedSlices(grads)
+      self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
 
 class GatherNdOpBenchmark(test.Benchmark):
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index 76c790a0a201ae20b73e37b7adeba11db9ed716f..e4c799cb1cfce35143b887feb9ae1af6455d7b25 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -281,6 +281,37 @@ class IndexTableFromFile(test.TestCase):
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((1, 2, 3), ids.eval())
 
+  def test_string_index_table_from_multicolumn_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_column_index=0,
+          value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
+  def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file,
+          num_oov_buckets=1,
+          key_column_index=0,
+          value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
+          delimiter=" ")
+      ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
+
+      self.assertRaises(errors_impl.OpError, ids.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((1, 2, 3), ids.eval())
+
   def test_string_index_table_from_file_tensor_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.test_session():
@@ -457,6 +488,20 @@ class IndexTableFromFile(test.TestCase):
       self.assertRaises(ValueError, table.lookup,
                         constant_op.constant(["salad", "surgery", "tarkus"]))
 
+  def test_index_table_from_file_table_ref_with_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab9.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=1)
+      self.assertIsNotNone(table.table_ref)
+
+  def test_index_table_from_file_table_ref_without_oov_buckets(self):
+    vocabulary_file = self._createVocabFile("f2i_vocab10.txt")
+    with self.test_session():
+      table = lookup_ops.index_table_from_file(
+          vocabulary_file=vocabulary_file, num_oov_buckets=0)
+      self.assertIsNotNone(table.table_ref)
+
 
 class KeyValueTensorInitializerTest(test.TestCase):
 
@@ -566,10 +611,10 @@ class IndexTableFromTensor(test.TestCase):
 
 class IndexToStringTableFromFileTest(test.TestCase):
 
-  def _createVocabFile(self, basename):
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
     vocabulary_file = os.path.join(self.get_temp_dir(), basename)
     with open(vocabulary_file, "w") as f:
-      f.write("\n".join(["brain", "salad", "surgery"]) + "\n")
+      f.write("\n".join(values) + "\n")
     return vocabulary_file
 
   def test_index_to_string_table(self):
@@ -583,6 +628,35 @@ class IndexToStringTableFromFileTest(test.TestCase):
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           features.eval())
 
+  def test_index_to_string_table_from_multicolumn_file(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
+          value_column_index=0)
+      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          features.eval())
+
+  def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
+    vocabulary_file = self._createVocabFile(
+        "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
+    with self.test_session():
+      table = lookup_ops.index_to_string_table_from_file(
+          vocabulary_file=vocabulary_file,
+          key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
+          value_column_index=0,
+          delimiter=" ")
+      features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
+      self.assertRaises(errors_impl.OpError, features.eval)
+      lookup_ops.tables_initializer().run()
+      self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
+                          features.eval())
+
   def test_index_to_string_table_with_default_value(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -1371,6 +1445,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
             oov_buckets,
             hasher_spec=lookup_ops.StrongHashSpec([None, 2]))
 
+  def testIdTableWithHashBucketsNoInnerTable(self):
+    with self.test_session():
+      table = lookup_ops.IdTableWithHashBuckets(None, num_oov_buckets=1)
+      self.assertIsNone(table.table_ref)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5a7a3ba99ba161c197643a3e3c5aed5d37e9d2b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -0,0 +1,196 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.gen_linalg_ops.matrix_exponential."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def np_expm(x):
+  """Slow but accurate Taylor series matrix exponential."""
+  y = np.zeros(x.shape, dtype=x.dtype)
+  xn = np.eye(x.shape[0], dtype=x.dtype)
+  for n in range(40):
+    y += xn / float(math.factorial(n))
+    xn = np.dot(xn, x)
+  return y
+
+
+class ExponentialOpTest(test.TestCase):
+
+  def _verifyExponential(self, x, np_type):
+    # TODO(pfau): add matrix logarithm and test that it is inverse of expm.
+    inp = x.astype(np_type)
+    with self.test_session(use_gpu=True):
+      # Verify that x^{-1} * x == Identity matrix.
+      tf_ans = gen_linalg_ops._matrix_exponential(inp)
+      if x.size == 0:
+        np_ans = np.empty(x.shape, dtype=np_type)
+      else:
+        if x.ndim > 2:
+          np_ans = np.zeros(inp.shape, dtype=np_type)
+          for i in itertools.product(*[range(x) for x in inp.shape[:-2]]):
+            np_ans[i] = np_expm(inp[i])
+        else:
+          np_ans = np_expm(inp)
+      out = tf_ans.eval()
+      self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
+
+  def _verifyExponentialReal(self, x):
+    for np_type in [np.float32, np.float64]:
+      self._verifyExponential(x, np_type)
+
+  def _verifyExponentialComplex(self, x):
+    for np_type in [np.complex64, np.complex128]:
+      self._verifyExponential(x, np_type)
+
+  def _makeBatch(self, matrix1, matrix2):
+    matrix_batch = np.concatenate(
+        [np.expand_dims(matrix1, 0),
+         np.expand_dims(matrix2, 0)])
+    matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1])
+    return matrix_batch
+
+  def testNonsymmetric(self):
+    # 2x2 matrices
+    matrix1 = np.array([[1., 2.], [3., 4.]])
+    matrix2 = np.array([[1., 3.], [3., 5.]])
+    self._verifyExponentialReal(matrix1)
+    self._verifyExponentialReal(matrix2)
+    # A multidimensional batch of 2x2 matrices
+    self._verifyExponentialReal(self._makeBatch(matrix1, matrix2))
+    # Complex
+    matrix1 = matrix1.astype(np.complex64)
+    matrix1 += 1j * matrix1
+    matrix2 = matrix2.astype(np.complex64)
+    matrix2 += 1j * matrix2
+    self._verifyExponentialComplex(matrix1)
+    self._verifyExponentialComplex(matrix2)
+    # Complex batch
+    self._verifyExponentialComplex(self._makeBatch(matrix1, matrix2))
+
+  def testSymmetricPositiveDefinite(self):
+    # 2x2 matrices
+    matrix1 = np.array([[2., 1.], [1., 2.]])
+    matrix2 = np.array([[3., -1.], [-1., 3.]])
+    self._verifyExponentialReal(matrix1)
+    self._verifyExponentialReal(matrix2)
+    # A multidimensional batch of 2x2 matrices
+    self._verifyExponentialReal(self._makeBatch(matrix1, matrix2))
+    # Complex
+    matrix1 = matrix1.astype(np.complex64)
+    matrix1 += 1j * matrix1
+    matrix2 = matrix2.astype(np.complex64)
+    matrix2 += 1j * matrix2
+    self._verifyExponentialComplex(matrix1)
+    self._verifyExponentialComplex(matrix2)
+    # Complex batch
+    self._verifyExponentialComplex(self._makeBatch(matrix1, matrix2))
+
+  def testNonSquareMatrix(self):
+    # When the exponential of a non-square matrix is attempted we should return
+    # an error
+    with self.assertRaises(ValueError):
+      gen_linalg_ops._matrix_exponential(np.array([[1., 2., 3.], [3., 4., 5.]]))
+
+  def testWrongDimensions(self):
+    # The input to the inverse should be at least a 2-dimensional tensor.
+    tensor3 = constant_op.constant([1., 2.])
+    with self.assertRaises(ValueError):
+      gen_linalg_ops._matrix_exponential(tensor3)
+
+  def testEmpty(self):
+    self._verifyExponentialReal(np.empty([0, 2, 2]))
+    self._verifyExponentialReal(np.empty([2, 0, 0]))
+
+  def testRandomSmallAndLarge(self):
+    np.random.seed(42)
+    for dtype in np.float32, np.float64, np.complex64, np.complex128:
+      for batch_dims in [(), (1,), (3,), (2, 2)]:
+        for size in 8, 31, 32:
+          shape = batch_dims + (size, size)
+          matrix = np.random.uniform(
+              low=-1.0, high=1.0,
+              size=np.prod(shape)).reshape(shape).astype(dtype)
+          self._verifyExponentialReal(matrix)
+
+  def testConcurrentExecutesWithoutError(self):
+    with self.test_session(use_gpu=True) as sess:
+      matrix1 = random_ops.random_normal([5, 5], seed=42)
+      matrix2 = random_ops.random_normal([5, 5], seed=42)
+      expm1 = gen_linalg_ops._matrix_exponential(matrix1)
+      expm2 = gen_linalg_ops._matrix_exponential(matrix2)
+      expm = sess.run([expm1, expm2])
+      self.assertAllEqual(expm[0], expm[1])
+
+
+class MatrixExponentialBenchmark(test.Benchmark):
+
+  shapes = [
+      (4, 4),
+      (10, 10),
+      (16, 16),
+      (101, 101),
+      (256, 256),
+      (1000, 1000),
+      (1024, 1024),
+      (2048, 2048),
+      (513, 4, 4),
+      (513, 16, 16),
+      (513, 256, 256),
+  ]
+
+  def _GenerateMatrix(self, shape):
+    batch_shape = shape[:-2]
+    shape = shape[-2:]
+    assert shape[0] == shape[1]
+    n = shape[0]
+    matrix = np.ones(shape).astype(np.float32) / (
+        2.0 * n) + np.diag(np.ones(n).astype(np.float32))
+    return variables.Variable(np.tile(matrix, batch_shape + (1, 1)))
+
+  def benchmarkMatrixExponentialOp(self):
+    for shape in self.shapes:
+      with ops.Graph().as_default(), \
+          session.Session() as sess, \
+          ops.device("/cpu:0"):
+        matrix = self._GenerateMatrix(shape)
+        expm = gen_linalg_ops._matrix_exponential(matrix)
+        variables.global_variables_initializer().run()
+        self.run_op_benchmark(
+            sess,
+            control_flow_ops.group(expm),
+            min_iters=25,
+            name="matrix_exponential_cpu_{shape}".format(
+                shape=shape))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index e5b7cbce7aa31bb3aa288ab529ef26b9c4a0003e..3358b78efd22f86b455041d72e6ff663f74acdd8 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -158,9 +158,12 @@ def _assert_nan(test_case, actual):
   test_case.assertTrue(math.isnan(actual), 'Expected NAN, got %s.' % actual)
 
 
-def _assert_local_variables(test_case, expected):
+def _assert_metric_variables(test_case, expected):
   test_case.assertEquals(
       set(expected), set(v.name for v in variables.local_variables()))
+  test_case.assertEquals(
+      set(expected),
+      set(v.name for v in ops.get_collection(ops.GraphKeys.METRIC_VARIABLES)))
 
 
 def _test_values(shape):
@@ -174,7 +177,7 @@ class MeanTest(test.TestCase):
 
   def testVars(self):
     metrics.mean(array_ops.ones([4, 3]))
-    _assert_local_variables(self, ('mean/count:0', 'mean/total:0'))
+    _assert_metric_variables(self, ('mean/count:0', 'mean/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -340,8 +343,8 @@ class MeanTensorTest(test.TestCase):
 
   def testVars(self):
     metrics.mean_tensor(array_ops.ones([4, 3]))
-    _assert_local_variables(self, ('mean/total_tensor:0',
-                                   'mean/count_tensor:0'))
+    _assert_metric_variables(self,
+                             ('mean/total_tensor:0', 'mean/count_tensor:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -506,8 +509,8 @@ class AccuracyTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         name='my_accuracy')
-    _assert_local_variables(self, ('my_accuracy/count:0',
-                                   'my_accuracy/total:0'))
+    _assert_metric_variables(self,
+                             ('my_accuracy/count:0', 'my_accuracy/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -690,8 +693,8 @@ class PrecisionTest(test.TestCase):
   def testVars(self):
     metrics.precision(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('precision/false_positives/count:0',
-                                   'precision/true_positives/count:0'))
+    _assert_metric_variables(self, ('precision/false_positives/count:0',
+                                    'precision/true_positives/count:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -876,8 +879,9 @@ class RecallTest(test.TestCase):
   def testVars(self):
     metrics.recall(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('recall/false_negatives/count:0',
-                                   'recall/true_positives/count:0'))
+    _assert_metric_variables(
+        self,
+        ('recall/false_negatives/count:0', 'recall/true_positives/count:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -999,9 +1003,9 @@ class AUCTest(test.TestCase):
   def testVars(self):
     metrics.auc(predictions=array_ops.ones((10, 1)),
                 labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self,
-                            ('auc/true_positives:0', 'auc/false_negatives:0',
-                             'auc/false_positives:0', 'auc/true_negatives:0'))
+    _assert_metric_variables(self,
+                             ('auc/true_positives:0', 'auc/false_negatives:0',
+                              'auc/false_positives:0', 'auc/true_negatives:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1256,11 +1260,11 @@ class SpecificityAtSensitivityTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         sensitivity=0.7)
-    _assert_local_variables(self,
-                            ('specificity_at_sensitivity/true_positives:0',
-                             'specificity_at_sensitivity/false_negatives:0',
-                             'specificity_at_sensitivity/false_positives:0',
-                             'specificity_at_sensitivity/true_negatives:0'))
+    _assert_metric_variables(self,
+                             ('specificity_at_sensitivity/true_positives:0',
+                              'specificity_at_sensitivity/false_negatives:0',
+                              'specificity_at_sensitivity/false_positives:0',
+                              'specificity_at_sensitivity/true_negatives:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1393,11 +1397,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         specificity=0.7)
-    _assert_local_variables(self,
-                            ('sensitivity_at_specificity/true_positives:0',
-                             'sensitivity_at_specificity/false_negatives:0',
-                             'sensitivity_at_specificity/false_positives:0',
-                             'sensitivity_at_specificity/true_negatives:0'))
+    _assert_metric_variables(self,
+                             ('sensitivity_at_specificity/true_positives:0',
+                              'sensitivity_at_specificity/false_negatives:0',
+                              'sensitivity_at_specificity/false_positives:0',
+                              'sensitivity_at_specificity/true_negatives:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1512,9 +1516,10 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0, 0.5, 1.0])
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'precision_at_thresholds/true_positives:0',
-        'precision_at_thresholds/false_positives:0',))
+        'precision_at_thresholds/false_positives:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -1796,17 +1801,17 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       self.assertAlmostEqual(expected_rec, rec.eval(), 2)
 
 
-def _test_sparse_precision_at_k(predictions,
-                                labels,
-                                k,
-                                expected,
-                                class_id=None,
-                                weights=None,
-                                test_case=None):
+def _test_precision_at_k(predictions,
+                         labels,
+                         k,
+                         expected,
+                         class_id=None,
+                         weights=None,
+                         test_case=None):
   with ops.Graph().as_default() as g, test_case.test_session(g):
     if weights is not None:
       weights = constant_op.constant(weights, dtypes_lib.float32)
-    metric, update = metrics.sparse_precision_at_k(
+    metric, update = metrics.precision_at_k(
         predictions=constant_op.constant(predictions, dtypes_lib.float32),
         labels=labels,
         k=k,
@@ -1859,17 +1864,17 @@ def _test_precision_at_top_k(
       test_case.assertEqual(expected, metric.eval())
 
 
-def _test_sparse_average_precision_at_k(predictions,
-                                        labels,
-                                        k,
-                                        expected,
-                                        weights=None,
-                                        test_case=None):
+def _test_average_precision_at_k(predictions,
+                                 labels,
+                                 k,
+                                 expected,
+                                 weights=None,
+                                 test_case=None):
   with ops.Graph().as_default() as g, test_case.test_session(g):
     if weights is not None:
       weights = constant_op.constant(weights, dtypes_lib.float32)
     predictions = constant_op.constant(predictions, dtypes_lib.float32)
-    metric, update = metrics.sparse_average_precision_at_k(
+    metric, update = metrics.average_precision_at_k(
         labels, predictions, k, weights=weights)
 
     # Fails without initialized vars.
@@ -1886,7 +1891,7 @@ def _test_sparse_average_precision_at_k(predictions,
       test_case.assertAlmostEqual(expected, metric.eval())
 
 
-class SingleLabelSparsePrecisionTest(test.TestCase):
+class SingleLabelPrecisionAtKTest(test.TestCase):
 
   def setUp(self):
     self._predictions = ((0.1, 0.3, 0.2, 0.4), (0.1, 0.2, 0.3, 0.4))
@@ -1899,18 +1904,18 @@ class SingleLabelSparsePrecisionTest(test.TestCase):
         _binary_2d_label_to_2d_sparse_value(indicator_labels), np.array(
             class_labels, dtype=np.int64), np.array(
                 [[class_id] for class_id in class_labels], dtype=np.int64))
-    self._test_sparse_precision_at_k = functools.partial(
-        _test_sparse_precision_at_k, test_case=self)
+    self._test_precision_at_k = functools.partial(
+        _test_precision_at_k, test_case=self)
     self._test_precision_at_top_k = functools.partial(
         _test_precision_at_top_k, test_case=self)
-    self._test_sparse_average_precision_at_k = functools.partial(
-        _test_sparse_average_precision_at_k, test_case=self)
+    self._test_average_precision_at_k = functools.partial(
+        _test_average_precision_at_k, test_case=self)
 
   def test_at_k1_nan(self):
     for labels in self._labels:
       # Classes 0,1,2 have 0 predictions, classes -1 and 4 are out of range.
       for class_id in (-1, 0, 1, 2, 4):
-        self._test_sparse_precision_at_k(
+        self._test_precision_at_k(
             self._predictions, labels, k=1, expected=NAN, class_id=class_id)
         self._test_precision_at_top_k(
             self._predictions_idx, labels, k=1, expected=NAN, class_id=class_id)
@@ -1918,29 +1923,29 @@ class SingleLabelSparsePrecisionTest(test.TestCase):
   def test_at_k1(self):
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           self._predictions, labels, k=1, expected=1.0 / 2, class_id=3)
       self._test_precision_at_top_k(
           self._predictions_idx, labels, k=1, expected=1.0 / 2, class_id=3)
 
       # All classes: 2 labels, 2 predictions, 1 correct.
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           self._predictions, labels, k=1, expected=1.0 / 2)
       self._test_precision_at_top_k(
           self._predictions_idx, labels, k=1, expected=1.0 / 2)
-      self._test_sparse_average_precision_at_k(
+      self._test_average_precision_at_k(
           self._predictions, labels, k=1, expected=1.0 / 2)
 
 
-class MultiLabelSparsePrecisionTest(test.TestCase):
+class MultiLabelPrecisionAtKTest(test.TestCase):
 
   def setUp(self):
-    self._test_sparse_precision_at_k = functools.partial(
-        _test_sparse_precision_at_k, test_case=self)
+    self._test_precision_at_k = functools.partial(
+        _test_precision_at_k, test_case=self)
     self._test_precision_at_top_k = functools.partial(
         _test_precision_at_top_k, test_case=self)
-    self._test_sparse_average_precision_at_k = functools.partial(
-        _test_sparse_average_precision_at_k, test_case=self)
+    self._test_average_precision_at_k = functools.partial(
+        _test_average_precision_at_k, test_case=self)
 
   def test_average_precision(self):
     # Example 1.
@@ -1956,11 +1961,11 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
                          (precision_ex1[1] + precision_ex1[3]) / 4)
     for i in xrange(4):
       k = i + 1
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k, expected=precision_ex1[i])
       self._test_precision_at_top_k(
           (predictions_idx_ex1[:k],), labels, k=k, expected=precision_ex1[i])
-      self._test_sparse_average_precision_at_k(
+      self._test_average_precision_at_k(
           predictions, labels, k, expected=avg_precision_ex1[i])
 
     # Example 2.
@@ -1974,11 +1979,11 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
                          (precision_ex2[2] + precision_ex2[3]) / 4)
     for i in xrange(4):
       k = i + 1
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k, expected=precision_ex2[i])
       self._test_precision_at_top_k(
           (predictions_idx_ex2[:k],), labels, k=k, expected=precision_ex2[i])
-      self._test_sparse_average_precision_at_k(
+      self._test_average_precision_at_k(
           predictions, labels, k, expected=avg_precision_ex2[i])
 
     # Both examples, we expect both precision and average precision to be the
@@ -1994,11 +1999,11 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
     for i in xrange(4):
       k = i + 1
       predictions_idx = (predictions_idx_ex1[:k], predictions_idx_ex2[:k])
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k, expected=streaming_precision[i])
       self._test_precision_at_top_k(
           predictions_idx, labels, k=k, expected=streaming_precision[i])
-      self._test_sparse_average_precision_at_k(
+      self._test_average_precision_at_k(
           predictions, labels, k, expected=streaming_average_precision[i])
 
     # Weighted examples, we expect streaming average precision to be the
@@ -2010,7 +2015,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
     ]
     for i in xrange(4):
       k = i + 1
-      self._test_sparse_average_precision_at_k(
+      self._test_average_precision_at_k(
           predictions,
           labels,
           k,
@@ -2029,11 +2034,11 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
                          (precision_ex1[1] + precision_ex1[3]) / 4)
     for i in xrange(4):
       k = i + 1
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k, expected=precision_ex1[i])
       self._test_precision_at_top_k(
           (predictions_idx_ex1[:k],), labels, k=k, expected=precision_ex1[i])
-      self._test_sparse_average_precision_at_k(
+      self._test_average_precision_at_k(
           predictions, labels, k, expected=avg_precision_ex1[i])
 
   def test_three_labels_at_k5_no_predictions(self):
@@ -2047,7 +2052,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
     for labels in (sparse_labels, dense_labels):
       # Classes 1,3,8 have 0 predictions, classes -1 and 10 are out of range.
       for class_id in (-1, 1, 3, 8, 10):
-        self._test_sparse_precision_at_k(
+        self._test_precision_at_k(
             predictions, labels, k=5, expected=NAN, class_id=class_id)
         self._test_precision_at_top_k(
             predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
@@ -2063,7 +2068,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
     for labels in (sparse_labels, dense_labels):
       # Classes 0,4,6,9: 0 labels, >=1 prediction.
       for class_id in (0, 4, 6, 9):
-        self._test_sparse_precision_at_k(
+        self._test_precision_at_k(
             predictions, labels, k=5, expected=0.0, class_id=class_id)
         self._test_precision_at_top_k(
             predictions_idx, labels, k=5, expected=0.0, class_id=class_id)
@@ -2078,25 +2083,25 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
 
     for labels in (sparse_labels, dense_labels):
       # Class 2: 2 labels, 2 correct predictions.
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k=5, expected=2.0 / 2, class_id=2)
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=2.0 / 2, class_id=2)
 
       # Class 5: 1 label, 1 correct prediction.
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k=5, expected=1.0 / 1, class_id=5)
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=1.0 / 1, class_id=5)
 
       # Class 7: 1 label, 1 incorrect prediction.
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k=5, expected=0.0 / 1, class_id=7)
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=0.0 / 1, class_id=7)
 
       # All classes: 10 predictions, 3 correct.
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k=5, expected=3.0 / 10)
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=3.0 / 10)
@@ -2114,25 +2119,25 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
         dense_shape=[2, 4])
 
     # Class 2: 2 labels, 2 correct predictions.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, sp_labels, k=5, expected=2.0 / 2, class_id=2)
     self._test_precision_at_top_k(
         predictions_idx, sp_labels, k=5, expected=2.0 / 2, class_id=2)
 
     # Class 5: 1 label, 1 correct prediction.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, sp_labels, k=5, expected=1.0 / 1, class_id=5)
     self._test_precision_at_top_k(
         predictions_idx, sp_labels, k=5, expected=1.0 / 1, class_id=5)
 
     # Class 7: 1 label, 1 incorrect prediction.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, sp_labels, k=5, expected=0.0 / 1, class_id=7)
     self._test_precision_at_top_k(
         predictions_idx, sp_labels, k=5, expected=0.0 / 1, class_id=7)
 
     # All classes: 10 predictions, 3 correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, sp_labels, k=5, expected=3.0 / 10)
     self._test_precision_at_top_k(
         predictions_idx, sp_labels, k=5, expected=3.0 / 10)
@@ -2150,7 +2155,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
 
     # Classes 1,3,8 have 0 predictions, classes -1 and 10 are out of range.
     for class_id in (-1, 1, 3, 8, 10):
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k=5, expected=NAN, class_id=class_id)
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
@@ -2168,7 +2173,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
 
     # Classes 0,4,6,9: 0 labels, >=1 prediction.
     for class_id in (0, 4, 6, 9):
-      self._test_sparse_precision_at_k(
+      self._test_precision_at_k(
           predictions, labels, k=5, expected=0.0, class_id=class_id)
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=0.0, class_id=class_id)
@@ -2185,25 +2190,25 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
          [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 4 predictions, all correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=4.0 / 4, class_id=2)
     self._test_precision_at_top_k(
         predictions_idx, labels, k=5, expected=4.0 / 4, class_id=2)
 
     # Class 5: 2 predictions, both correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=2.0 / 2, class_id=5)
     self._test_precision_at_top_k(
         predictions_idx, labels, k=5, expected=2.0 / 2, class_id=5)
 
     # Class 7: 2 predictions, 1 correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=1.0 / 2, class_id=7)
     self._test_precision_at_top_k(
         predictions_idx, labels, k=5, expected=1.0 / 2, class_id=7)
 
     # All classes: 20 predictions, 7 correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=7.0 / 20)
     self._test_precision_at_top_k(
         predictions_idx, labels, k=5, expected=7.0 / 20)
@@ -2220,7 +2225,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
          [[0, 1, 1, 0, 0, 1, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 1, 0]]])
 
     # Class 2: 2 predictions, both correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=2.0 / 2.0, class_id=2,
         weights=[[1], [0]])
     self._test_precision_at_top_k(
@@ -2228,7 +2233,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
         weights=[[1], [0]])
 
     # Class 2: 2 predictions, both correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=2.0 / 2.0, class_id=2,
         weights=[[0], [1]])
     self._test_precision_at_top_k(
@@ -2236,7 +2241,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
         weights=[[0], [1]])
 
     # Class 7: 1 incorrect prediction.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=0.0 / 1.0, class_id=7,
         weights=[[1], [0]])
     self._test_precision_at_top_k(
@@ -2244,7 +2249,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
         weights=[[1], [0]])
 
     # Class 7: 1 correct prediction.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=1.0 / 1.0, class_id=7,
         weights=[[0], [1]])
     self._test_precision_at_top_k(
@@ -2252,7 +2257,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
         weights=[[0], [1]])
 
     # Class 7: no predictions.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=NAN, class_id=7,
         weights=[[1, 0], [0, 1]])
     self._test_precision_at_top_k(
@@ -2260,7 +2265,7 @@ class MultiLabelSparsePrecisionTest(test.TestCase):
         weights=[[1, 0], [0, 1]])
 
     # Class 7: 2 predictions, 1 correct.
-    self._test_sparse_precision_at_k(
+    self._test_precision_at_k(
         predictions, labels, k=5, expected=1.0 / 2.0, class_id=7,
         weights=[[0, 1], [1, 0]])
     self._test_precision_at_top_k(
@@ -2299,10 +2304,43 @@ def _test_recall_at_k(predictions,
       test_case.assertEqual(expected, metric.eval())
 
 
+def _test_recall_at_top_k(
+    predictions_idx,
+    labels,
+    expected,
+    k=None,
+    class_id=None,
+    weights=None,
+    test_case=None):
+  with ops.Graph().as_default() as g, test_case.test_session(g):
+    if weights is not None:
+      weights = constant_op.constant(weights, dtypes_lib.float32)
+    metric, update = metrics.recall_at_top_k(
+        predictions_idx=constant_op.constant(predictions_idx, dtypes_lib.int32),
+        labels=labels,
+        k=k,
+        class_id=class_id,
+        weights=weights)
+
+    # Fails without initialized vars.
+    test_case.assertRaises(errors_impl.OpError, metric.eval)
+    test_case.assertRaises(errors_impl.OpError, update.eval)
+    variables.variables_initializer(variables.local_variables()).run()
+
+    # Run per-step op and assert expected values.
+    if math.isnan(expected):
+      _assert_nan(test_case, update.eval())
+      _assert_nan(test_case, metric.eval())
+    else:
+      test_case.assertEqual(expected, update.eval())
+      test_case.assertEqual(expected, metric.eval())
+
+
 class SingleLabelRecallAtKTest(test.TestCase):
 
   def setUp(self):
     self._predictions = ((0.1, 0.3, 0.2, 0.4), (0.1, 0.2, 0.3, 0.4))
+    self._predictions_idx = [[3], [3]]
     indicator_labels = ((0, 0, 0, 1), (0, 0, 1, 0))
     class_labels = (3, 2)
     # Sparse vs dense, and 1d vs 2d labels should all be handled the same.
@@ -2313,6 +2351,8 @@ class SingleLabelRecallAtKTest(test.TestCase):
                 [[class_id] for class_id in class_labels], dtype=np.int64))
     self._test_recall_at_k = functools.partial(
         _test_recall_at_k, test_case=self)
+    self._test_recall_at_top_k = functools.partial(
+        _test_recall_at_top_k, test_case=self)
 
   def test_at_k1_nan(self):
     # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of
@@ -2321,120 +2361,100 @@ class SingleLabelRecallAtKTest(test.TestCase):
       for class_id in (-1, 0, 1, 4):
         self._test_recall_at_k(
             self._predictions, labels, k=1, expected=NAN, class_id=class_id)
+        self._test_recall_at_top_k(
+            self._predictions_idx, labels, k=1, expected=NAN, class_id=class_id)
 
   def test_at_k1_no_predictions(self):
     for labels in self._labels:
       # Class 2: 0 predictions.
       self._test_recall_at_k(
           self._predictions, labels, k=1, expected=0.0, class_id=2)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=1, expected=0.0, class_id=2)
 
   def test_one_label_at_k1(self):
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
       self._test_recall_at_k(
           self._predictions, labels, k=1, expected=1.0 / 1, class_id=3)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=1, expected=1.0 / 1, class_id=3)
 
       # All classes: 2 labels, 2 predictions, 1 correct.
       self._test_recall_at_k(self._predictions, labels, k=1, expected=1.0 / 2)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=1, expected=1.0 / 2)
 
-  def test_one_label_at_k1_weighted(self):
+  def test_one_label_at_k1_weighted_class_id3(self):
     predictions = self._predictions
+    predictions_idx = self._predictions_idx
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
       self._test_recall_at_k(
           predictions, labels, k=1, expected=NAN, class_id=3, weights=(0.0,))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=NAN, class_id=3,
+          weights=(0.0,))
       self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=1.0 / 1,
-          class_id=3,
+          predictions, labels, k=1, expected=1.0 / 1, class_id=3,
+          weights=(1.0,))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=1.0 / 1, class_id=3,
           weights=(1.0,))
       self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=1.0 / 1,
-          class_id=3,
+          predictions, labels, k=1, expected=1.0 / 1, class_id=3,
+          weights=(2.0,))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=1.0 / 1, class_id=3,
           weights=(2.0,))
       self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=NAN,
-          class_id=3,
-          weights=(0.0, 0.0))
-      self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=NAN,
-          class_id=3,
+          predictions, labels, k=1, expected=NAN, class_id=3,
+          weights=(0.0, 1.0))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=NAN, class_id=3,
           weights=(0.0, 1.0))
       self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=1.0 / 1,
-          class_id=3,
+          predictions, labels, k=1, expected=1.0 / 1, class_id=3,
+          weights=(1.0, 0.0))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=1.0 / 1, class_id=3,
           weights=(1.0, 0.0))
       self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=1.0 / 1,
-          class_id=3,
-          weights=(1.0, 1.0))
-      self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=2.0 / 2,
-          class_id=3,
+          predictions, labels, k=1, expected=2.0 / 2, class_id=3,
+          weights=(2.0, 3.0))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=2.0 / 2, class_id=3,
           weights=(2.0, 3.0))
-      self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=3.0 / 3,
-          class_id=3,
-          weights=(3.0, 2.0))
-      self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=0.3 / 0.3,
-          class_id=3,
-          weights=(0.3, 0.6))
-      self._test_recall_at_k(
-          predictions,
-          labels,
-          k=1,
-          expected=0.6 / 0.6,
-          class_id=3,
-          weights=(0.6, 0.3))
 
+  def test_one_label_at_k1_weighted(self):
+    predictions = self._predictions
+    predictions_idx = self._predictions_idx
+    for labels in self._labels:
       # All classes: 2 labels, 2 predictions, 1 correct.
       self._test_recall_at_k(
           predictions, labels, k=1, expected=NAN, weights=(0.0,))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=NAN, weights=(0.0,))
       self._test_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(1.0,))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=1.0 / 2, weights=(1.0,))
       self._test_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 2, weights=(2.0,))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=1.0 / 2, weights=(2.0,))
       self._test_recall_at_k(
           predictions, labels, k=1, expected=1.0 / 1, weights=(1.0, 0.0))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=1.0 / 1, weights=(1.0, 0.0))
       self._test_recall_at_k(
           predictions, labels, k=1, expected=0.0 / 1, weights=(0.0, 1.0))
-      self._test_recall_at_k(
-          predictions, labels, k=1, expected=1.0 / 2, weights=(1.0, 1.0))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=0.0 / 1, weights=(0.0, 1.0))
       self._test_recall_at_k(
           predictions, labels, k=1, expected=2.0 / 5, weights=(2.0, 3.0))
-      self._test_recall_at_k(
-          predictions, labels, k=1, expected=3.0 / 5, weights=(3.0, 2.0))
-      self._test_recall_at_k(
-          predictions, labels, k=1, expected=0.3 / 0.9, weights=(0.3, 0.6))
-      self._test_recall_at_k(
-          predictions, labels, k=1, expected=0.6 / 0.9, weights=(0.6, 0.3))
+      self._test_recall_at_top_k(
+          predictions_idx, labels, k=1, expected=2.0 / 5, weights=(2.0, 3.0))
 
 
 class MultiLabel2dRecallAtKTest(test.TestCase):
@@ -2442,6 +2462,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
   def setUp(self):
     self._predictions = ((0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9),
                          (0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6))
+    self._predictions_idx = ((9, 4, 6, 2, 0), (5, 7, 2, 9, 6))
     indicator_labels = ((0, 0, 1, 0, 0, 0, 0, 1, 1, 0),
                         (0, 1, 1, 0, 0, 1, 0, 0, 0, 0))
     class_labels = ((2, 7, 8), (1, 2, 5))
@@ -2451,6 +2472,8 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
                         class_labels, dtype=np.int64))
     self._test_recall_at_k = functools.partial(
         _test_recall_at_k, test_case=self)
+    self._test_recall_at_top_k = functools.partial(
+        _test_recall_at_top_k, test_case=self)
 
   def test_at_k5_nan(self):
     for labels in self._labels:
@@ -2458,29 +2481,41 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
       for class_id in (0, 3, 4, 6, 9, 10):
         self._test_recall_at_k(
             self._predictions, labels, k=5, expected=NAN, class_id=class_id)
+        self._test_recall_at_top_k(
+            self._predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
   def test_at_k5_no_predictions(self):
     for labels in self._labels:
       # Class 8: 1 label, no predictions.
       self._test_recall_at_k(
           self._predictions, labels, k=5, expected=0.0 / 1, class_id=8)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=5, expected=0.0 / 1, class_id=8)
 
   def test_at_k5(self):
     for labels in self._labels:
       # Class 2: 2 labels, both correct.
       self._test_recall_at_k(
           self._predictions, labels, k=5, expected=2.0 / 2, class_id=2)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=5, expected=2.0 / 2, class_id=2)
 
       # Class 5: 1 label, incorrect.
       self._test_recall_at_k(
           self._predictions, labels, k=5, expected=1.0 / 1, class_id=5)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=5, expected=1.0 / 1, class_id=5)
 
       # Class 7: 1 label, incorrect.
       self._test_recall_at_k(
           self._predictions, labels, k=5, expected=0.0 / 1, class_id=7)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=5, expected=0.0 / 1, class_id=7)
 
       # All classes: 6 labels, 3 correct.
       self._test_recall_at_k(self._predictions, labels, k=5, expected=3.0 / 6)
+      self._test_recall_at_top_k(
+          self._predictions_idx, labels, k=5, expected=3.0 / 6)
 
   def test_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) count in denominator."""
@@ -2494,17 +2529,25 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
     # Class 2: 2 labels, both correct.
     self._test_recall_at_k(
         self._predictions, labels, k=5, expected=2.0 / 2, class_id=2)
+    self._test_recall_at_top_k(
+        self._predictions_idx, labels, k=5, expected=2.0 / 2, class_id=2)
 
     # Class 5: 1 label, incorrect.
     self._test_recall_at_k(
         self._predictions, labels, k=5, expected=1.0 / 1, class_id=5)
+    self._test_recall_at_top_k(
+        self._predictions_idx, labels, k=5, expected=1.0 / 1, class_id=5)
 
     # Class 7: 1 label, incorrect.
     self._test_recall_at_k(
         self._predictions, labels, k=5, expected=0.0 / 1, class_id=7)
+    self._test_recall_at_top_k(
+        self._predictions_idx, labels, k=5, expected=0.0 / 1, class_id=7)
 
     # All classes: 8 labels, 3 correct.
     self._test_recall_at_k(self._predictions, labels, k=5, expected=3.0 / 8)
+    self._test_recall_at_top_k(
+        self._predictions_idx, labels, k=5, expected=3.0 / 8)
 
 
 class MultiLabel3dRecallAtKTest(test.TestCase):
@@ -2514,6 +2557,8 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
                           (0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6)),
                          ((0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6),
                           (0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9)))
+    self._predictions_idx = (((9, 4, 6, 2, 0), (5, 7, 2, 9, 6)),
+                             ((5, 7, 2, 9, 6), (9, 4, 6, 2, 0)))
     # Note: We don't test dense labels here, since examples have different
     # numbers of labels.
     self._labels = _binary_3d_label_to_sparse_value(((
@@ -2521,114 +2566,128 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
             (0, 1, 1, 0, 0, 1, 0, 1, 0, 0), (0, 0, 1, 0, 0, 0, 0, 0, 1, 0))))
     self._test_recall_at_k = functools.partial(
         _test_recall_at_k, test_case=self)
+    self._test_recall_at_top_k = functools.partial(
+        _test_recall_at_top_k, test_case=self)
 
   def test_3d_nan(self):
     # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
     for class_id in (0, 3, 4, 6, 9, 10):
       self._test_recall_at_k(
           self._predictions, self._labels, k=5, expected=NAN, class_id=class_id)
+      self._test_recall_at_top_k(
+          self._predictions_idx, self._labels, k=5, expected=NAN,
+          class_id=class_id)
 
   def test_3d_no_predictions(self):
     # Classes 1,8 have 0 predictions, >=1 label.
     for class_id in (1, 8):
       self._test_recall_at_k(
           self._predictions, self._labels, k=5, expected=0.0, class_id=class_id)
+      self._test_recall_at_top_k(
+          self._predictions_idx, self._labels, k=5, expected=0.0,
+          class_id=class_id)
 
   def test_3d(self):
     # Class 2: 4 labels, all correct.
     self._test_recall_at_k(
         self._predictions, self._labels, k=5, expected=4.0 / 4, class_id=2)
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=4.0 / 4,
+        class_id=2)
 
     # Class 5: 2 labels, both correct.
     self._test_recall_at_k(
         self._predictions, self._labels, k=5, expected=2.0 / 2, class_id=5)
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=2.0 / 2,
+        class_id=5)
 
     # Class 7: 2 labels, 1 incorrect.
     self._test_recall_at_k(
         self._predictions, self._labels, k=5, expected=1.0 / 2, class_id=7)
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=1.0 / 2,
+        class_id=7)
 
     # All classes: 12 labels, 7 correct.
     self._test_recall_at_k(
         self._predictions, self._labels, k=5, expected=7.0 / 12)
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=7.0 / 12)
 
   def test_3d_ignore_all(self):
     for class_id in xrange(10):
       self._test_recall_at_k(
-          self._predictions,
-          self._labels,
-          k=5,
-          expected=NAN,
-          class_id=class_id,
+          self._predictions, self._labels, k=5, expected=NAN, class_id=class_id,
           weights=[[0], [0]])
+      self._test_recall_at_top_k(
+          self._predictions_idx, self._labels, k=5, expected=NAN,
+          class_id=class_id, weights=[[0], [0]])
       self._test_recall_at_k(
-          self._predictions,
-          self._labels,
-          k=5,
-          expected=NAN,
-          class_id=class_id,
+          self._predictions, self._labels, k=5, expected=NAN, class_id=class_id,
           weights=[[0, 0], [0, 0]])
+      self._test_recall_at_top_k(
+          self._predictions_idx, self._labels, k=5, expected=NAN,
+          class_id=class_id, weights=[[0, 0], [0, 0]])
     self._test_recall_at_k(
         self._predictions, self._labels, k=5, expected=NAN, weights=[[0], [0]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=NAN,
+        weights=[[0], [0]])
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=NAN,
+        self._predictions, self._labels, k=5, expected=NAN,
+        weights=[[0, 0], [0, 0]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=NAN,
         weights=[[0, 0], [0, 0]])
 
   def test_3d_ignore_some(self):
     # Class 2: 2 labels, both correct.
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=2.0 / 2.0,
-        class_id=2,
+        self._predictions, self._labels, k=5, expected=2.0 / 2.0, class_id=2,
         weights=[[1], [0]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=2.0 / 2.0,
+        class_id=2, weights=[[1], [0]])
 
     # Class 2: 2 labels, both correct.
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=2.0 / 2.0,
-        class_id=2,
+        self._predictions, self._labels, k=5, expected=2.0 / 2.0, class_id=2,
         weights=[[0], [1]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=2.0 / 2.0,
+        class_id=2, weights=[[0], [1]])
 
     # Class 7: 1 label, correct.
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=1.0 / 1.0,
-        class_id=7,
+        self._predictions, self._labels, k=5, expected=1.0 / 1.0, class_id=7,
         weights=[[0], [1]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=1.0 / 1.0,
+        class_id=7, weights=[[0], [1]])
 
     # Class 7: 1 label, incorrect.
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=0.0 / 1.0,
-        class_id=7,
+        self._predictions, self._labels, k=5, expected=0.0 / 1.0, class_id=7,
         weights=[[1], [0]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=0.0 / 1.0,
+        class_id=7, weights=[[1], [0]])
 
     # Class 7: 2 labels, 1 correct.
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=1.0 / 2.0,
-        class_id=7,
+        self._predictions, self._labels, k=5, expected=1.0 / 2.0, class_id=7,
         weights=[[1, 0], [1, 0]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=1.0 / 2.0,
+        class_id=7, weights=[[1, 0], [1, 0]])
 
     # Class 7: No labels.
     self._test_recall_at_k(
-        self._predictions,
-        self._labels,
-        k=5,
-        expected=NAN,
-        class_id=7,
+        self._predictions, self._labels, k=5, expected=NAN, class_id=7,
+        weights=[[0, 1], [0, 1]])
+    self._test_recall_at_top_k(
+        self._predictions_idx, self._labels, k=5, expected=NAN, class_id=7,
         weights=[[0, 1], [0, 1]])
 
 
@@ -2640,8 +2699,8 @@ class MeanAbsoluteErrorTest(test.TestCase):
   def testVars(self):
     metrics.mean_absolute_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('mean_absolute_error/count:0',
-                                   'mean_absolute_error/total:0'))
+    _assert_metric_variables(
+        self, ('mean_absolute_error/count:0', 'mean_absolute_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2701,8 +2760,8 @@ class MeanRelativeErrorTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         normalizer=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('mean_relative_error/count:0',
-                                   'mean_relative_error/total:0'))
+    _assert_metric_variables(
+        self, ('mean_relative_error/count:0', 'mean_relative_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2784,8 +2843,8 @@ class MeanSquaredErrorTest(test.TestCase):
   def testVars(self):
     metrics.mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('mean_squared_error/count:0',
-                                   'mean_squared_error/total:0'))
+    _assert_metric_variables(
+        self, ('mean_squared_error/count:0', 'mean_squared_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -2960,8 +3019,9 @@ class RootMeanSquaredErrorTest(test.TestCase):
   def testVars(self):
     metrics.root_mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
-    _assert_local_variables(self, ('root_mean_squared_error/count:0',
-                                   'root_mean_squared_error/total:0'))
+    _assert_metric_variables(
+        self,
+        ('root_mean_squared_error/count:0', 'root_mean_squared_error/total:0'))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -3054,9 +3114,10 @@ class MeanCosineDistanceTest(test.TestCase):
         predictions=array_ops.ones((10, 3)),
         labels=array_ops.ones((10, 3)),
         dim=1)
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'mean_cosine_distance/count:0',
-        'mean_cosine_distance/total:0',))
+        'mean_cosine_distance/total:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -3191,9 +3252,10 @@ class PcntBelowThreshTest(test.TestCase):
 
   def testVars(self):
     metrics.percentage_below(values=array_ops.ones((10,)), threshold=2)
-    _assert_local_variables(self, (
+    _assert_metric_variables(self, (
         'percentage_below_threshold/count:0',
-        'percentage_below_threshold/total:0',))
+        'percentage_below_threshold/total:0',
+    ))
 
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
@@ -3263,7 +3325,7 @@ class MeanIOUTest(test.TestCase):
         predictions=array_ops.ones([10, 1]),
         labels=array_ops.ones([10, 1]),
         num_classes=2)
-    _assert_local_variables(self, ('mean_iou/total_confusion_matrix:0',))
+    _assert_metric_variables(self, ('mean_iou/total_confusion_matrix:0',))
 
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
@@ -3507,23 +3569,23 @@ class MeanIOUTest(test.TestCase):
 
   def testMissingClassInLabels(self):
     labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
+        [[0, 0, 1, 1, 0, 0],
+         [1, 0, 0, 0, 0, 1]],
+        [[1, 1, 1, 1, 1, 1],
+         [0, 0, 0, 0, 0, 0]]])
     predictions = constant_op.constant([
-      [[0, 0, 2, 1, 1, 0],
-       [0, 1, 2, 2, 0, 1]],
-      [[0, 0, 2, 1, 1, 1],
-       [1, 1, 2, 0, 0, 0]]])
+        [[0, 0, 2, 1, 1, 0],
+         [0, 1, 2, 2, 0, 1]],
+        [[0, 0, 2, 1, 1, 1],
+         [1, 1, 2, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
-        1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
-        miou.eval())
+          1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
+          miou.eval())
 
   def testMissingClassOverallSmall(self):
     labels = constant_op.constant([0])
@@ -3537,22 +3599,22 @@ class MeanIOUTest(test.TestCase):
 
   def testMissingClassOverallLarge(self):
     labels = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 0, 0, 0, 0, 1]],
-      [[1, 1, 1, 1, 1, 1],
-       [0, 0, 0, 0, 0, 0]]])
+        [[0, 0, 1, 1, 0, 0],
+         [1, 0, 0, 0, 0, 1]],
+        [[1, 1, 1, 1, 1, 1],
+         [0, 0, 0, 0, 0, 0]]])
     predictions = constant_op.constant([
-      [[0, 0, 1, 1, 0, 0],
-       [1, 1, 0, 0, 1, 1]],
-      [[0, 0, 0, 1, 1, 1],
-       [1, 1, 1, 0, 0, 0]]])
+        [[0, 0, 1, 1, 0, 0],
+         [1, 1, 0, 0, 1, 1]],
+        [[0, 0, 0, 1, 1, 1],
+         [1, 1, 1, 0, 0, 0]]])
     num_classes = 3
     with self.test_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
       sess.run(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
-        1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
+          1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
 
 
 class MeanPerClassAccuracyTest(test.TestCase):
@@ -3566,7 +3628,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
         predictions=array_ops.ones([10, 1]),
         labels=array_ops.ones([10, 1]),
         num_classes=2)
-    _assert_local_variables(self, ('mean_accuracy/total_confusion_matrix:0',))
+    _assert_metric_variables(self, ('mean_accuracy/total_confusion_matrix:0',))
 
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
@@ -3795,6 +3857,56 @@ class MeanPerClassAccuracyTest(test.TestCase):
       self.assertAlmostEqual(desired_mean_accuracy, mean_accuracy.eval())
 
 
+class FalseNegativesTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.false_negatives(
+        labels=(0, 1, 0, 1),
+        predictions=(0, 0, 1, 1))
+    _assert_metric_variables(self, ('false_negatives/count:0',))
+
+  def testUnweighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    tn, tn_update_op = metrics.false_negatives(
+        labels=labels, predictions=predictions)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(3., tn_update_op.eval())
+      self.assertAllClose(3., tn.eval())
+
+  def testWeighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    weights = constant_op.constant((1., 1.5, 2., 2.5))
+    tn, tn_update_op = metrics.false_negatives(
+        labels=labels, predictions=predictions, weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(5., tn_update_op.eval())
+      self.assertAllClose(5., tn.eval())
+
+
 class FalseNegativesAtThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -3806,7 +3918,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0.15, 0.5, 0.85])
-    _assert_local_variables(self, ('false_negatives/false_negatives:0',))
+    _assert_metric_variables(self, ('false_negatives/false_negatives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -3844,6 +3956,56 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((0.0, 8.0, 11.0), fn.eval())
 
 
+class FalsePositivesTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.false_positives(
+        labels=(0, 1, 0, 1),
+        predictions=(0, 0, 1, 1))
+    _assert_metric_variables(self, ('false_positives/count:0',))
+
+  def testUnweighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    tn, tn_update_op = metrics.false_positives(
+        labels=labels, predictions=predictions)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(7., tn_update_op.eval())
+      self.assertAllClose(7., tn.eval())
+
+  def testWeighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    weights = constant_op.constant((1., 1.5, 2., 2.5))
+    tn, tn_update_op = metrics.false_positives(
+        labels=labels, predictions=predictions, weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(14., tn_update_op.eval())
+      self.assertAllClose(14., tn.eval())
+
+
 class FalsePositivesAtThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -3855,7 +4017,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0.15, 0.5, 0.85])
-    _assert_local_variables(self, ('false_positives/false_positives:0',))
+    _assert_metric_variables(self, ('false_positives/false_positives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -3895,6 +4057,56 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((125.0, 42.0, 12.0), fp.eval())
 
 
+class TrueNegativesTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.true_negatives(
+        labels=(0, 1, 0, 1),
+        predictions=(0, 0, 1, 1))
+    _assert_metric_variables(self, ('true_negatives/count:0',))
+
+  def testUnweighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    tn, tn_update_op = metrics.true_negatives(
+        labels=labels, predictions=predictions)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(3., tn_update_op.eval())
+      self.assertAllClose(3., tn.eval())
+
+  def testWeighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    weights = constant_op.constant((1., 1.5, 2., 2.5))
+    tn, tn_update_op = metrics.true_negatives(
+        labels=labels, predictions=predictions, weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(4., tn_update_op.eval())
+      self.assertAllClose(4., tn.eval())
+
+
 class TrueNegativesAtThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -3906,7 +4118,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0.15, 0.5, 0.85])
-    _assert_local_variables(self, ('true_negatives/true_negatives:0',))
+    _assert_metric_variables(self, ('true_negatives/true_negatives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
@@ -3944,6 +4156,56 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
       self.assertAllEqual((5.0, 15.0, 23.0), tn.eval())
 
 
+class TruePositivesTest(test.TestCase):
+
+  def setUp(self):
+    np.random.seed(1)
+    ops.reset_default_graph()
+
+  def testVars(self):
+    metrics.true_positives(
+        labels=(0, 1, 0, 1),
+        predictions=(0, 0, 1, 1))
+    _assert_metric_variables(self, ('true_positives/count:0',))
+
+  def testUnweighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    tn, tn_update_op = metrics.true_positives(
+        labels=labels, predictions=predictions)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(7., tn_update_op.eval())
+      self.assertAllClose(7., tn.eval())
+
+  def testWeighted(self):
+    labels = constant_op.constant(((0, 1, 0, 1, 0),
+                                   (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0),
+                                   (0, 0, 0, 0, 1)))
+    predictions = constant_op.constant(((0, 0, 1, 1, 0),
+                                        (1, 1, 1, 1, 1),
+                                        (0, 1, 0, 1, 0),
+                                        (1, 1, 1, 1, 1)))
+    weights = constant_op.constant((1., 1.5, 2., 2.5))
+    tn, tn_update_op = metrics.true_positives(
+        labels=labels, predictions=predictions, weights=weights)
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      self.assertAllClose(0., tn.eval())
+      self.assertAllClose(12., tn_update_op.eval())
+      self.assertAllClose(12., tn.eval())
+
+
 class TruePositivesAtThresholdsTest(test.TestCase):
 
   def setUp(self):
@@ -3955,7 +4217,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         predictions=array_ops.ones((10, 1)),
         labels=array_ops.ones((10, 1)),
         thresholds=[0.15, 0.5, 0.85])
-    _assert_local_variables(self, ('true_positives/true_positives:0',))
+    _assert_metric_variables(self, ('true_positives/true_positives:0',))
 
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index d40517510046959e353cad4df0c6ddbed0db90aa..56a07cb012f08dec750c5ee18cc73b3b127ef5dd 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -46,6 +46,15 @@ class PartitionerCreatorsTest(test.TestCase):
         self.assertEqual(len(v0_list), 5)
         self.assertAllEqual(v0_part, (5, 1))
 
+  def testFixedSizePartitionerInt64(self):
+    with self.test_session():
+      partitioner = partitioned_variables.fixed_size_partitioner(4, axis=0)
+      with variable_scope.variable_scope("root", partitioner=partitioner):
+        v0 = variable_scope.get_variable(
+            "v0", dtype=dtypes.int64, shape=[20])
+        v0_list = v0._get_variable_list()
+        self.assertEqual(len(v0_list), 4)
+
   def testResourceFixedSizePartitioner(self):
     with self.test_session():
       partitioner = partitioned_variables.fixed_size_partitioner(5, axis=0)
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 563815b7d841d2b2d459befd21f55833a000e94c..63848976336f5487cf2a44f7cf62ea316c40d7c8 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -154,7 +154,7 @@ class PoolingTest(test.TestCase):
     self.assertAllClose(y1, y2.eval(), rtol=1e-2, atol=1e-2)
 
   def testPoolSimple(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           self._test(
@@ -166,7 +166,7 @@ class PoolingTest(test.TestCase):
               strides=[1, 2])
 
   def testPool1D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 2], [2, 10, 2]]:
@@ -192,7 +192,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testPool2D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 10, 2], [2, 10, 9, 2]]:
@@ -218,7 +218,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testPool3D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["MAX", "AVG"]:
           for input_shape in [[2, 9, 10, 11, 2], [2, 10, 9, 11, 2]]:
@@ -302,7 +302,7 @@ class PoolingTest(test.TestCase):
     self.assertLess(err, err_tolerance)
 
   def testGradient1D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[2, 5, 2], [1, 4, 1]]:
@@ -328,7 +328,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testGradient2D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[2, 4, 5, 2], [1, 5, 4, 1]]:
@@ -354,7 +354,7 @@ class PoolingTest(test.TestCase):
                     strides=strides)
 
   def testGradient3D(self):
-    with self.test_session():
+    with self.test_session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
         for pooling_type in ["AVG", "MAX"]:
           for input_shape in [[1, 3, 5, 4, 1], [1, 5, 4, 3, 1]]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index c699d50c02d9e3141fd62b9bceee6e6c1d5c497b..6be8997cabdb4cba87f90378c405a63aa6f78ea3 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import numpy as np
 
 from tensorflow.python.framework import constant_op
@@ -1341,11 +1342,14 @@ class PoolingTest(test.TestCase):
       return
 
     # Test the GPU implementation that uses cudnn for now.
-    # It does not propagate the diff in cases of NaNs
+    saved_nanprop = os.environ.get("TF_ENABLE_MAXPOOL_NANPROP")
+    # Do not propagate the diff in cases of NaNs
+    os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "0"
     expected_input_backprop_cudnn = [
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0
     ]
+
     for v2 in [True, False]:
       self._testMaxPoolGradDirect(
           input_data,
@@ -1361,6 +1365,30 @@ class PoolingTest(test.TestCase):
           use_gpu=True,
           v2=v2)
 
+    # Propagate the diff in cases of NaNs
+    os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "1"
+    expected_input_backprop_cudnn = expected_input_backprop_tf_cpu
+
+    for v2 in [True, False]:
+      self._testMaxPoolGradDirect(
+          input_data,
+          output_backprop,
+          expected_input_backprop_cudnn,
+          input_sizes=[1, 4, 4, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          use_gpu=True,
+          v2=v2)
+
+    if saved_nanprop:
+      os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = saved_nanprop
+    else:
+      del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
+
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
@@ -1391,11 +1419,14 @@ class PoolingTest(test.TestCase):
       return
 
     # Test the GPU implementation that uses cudnn for now.
-    # It does not propagate the diff in cases of NaNs
+    saved_nanprop = os.environ.get("TF_ENABLE_MAXPOOL_NANPROP")
+    # Do not propagate the diff in cases of NaNs
+    os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "0"
     expected_input_backprop_cudnn = [
         0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
         0.0, 0.0
     ]
+
     for v2 in [True, False]:
       self._testMaxPoolGradDirect(
           input_data,
@@ -1411,6 +1442,30 @@ class PoolingTest(test.TestCase):
           use_gpu=True,
           v2=v2)
 
+    # Propagate the diff in cases of NaNs
+    os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = "1"
+    expected_input_backprop_cudnn = expected_input_backprop_tf_cpu
+
+    for v2 in [True, False]:
+      self._testMaxPoolGradDirect(
+          input_data,
+          output_backprop,
+          expected_input_backprop_cudnn,
+          input_sizes=[1, 4, 4, 1],
+          output_sizes=[1, 3, 3, 1],
+          window_rows=2,
+          window_cols=2,
+          row_stride=1,
+          col_stride=1,
+          padding="VALID",
+          use_gpu=True,
+          v2=v2)
+
+    if saved_nanprop:
+      os.environ["TF_ENABLE_MAXPOOL_NANPROP"] = saved_nanprop
+    else:
+      del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
+
   def testMaxPoolGradDirect(self):
     self._testMaxPoolGradDirect1_1()
     self._testMaxPoolGradDirect1_2()
@@ -1723,7 +1778,7 @@ class PoolingTest(test.TestCase):
             padding="SAME")
 
   def testOpEdgeCases(self):
-    with self.test_session() as sess:
+    with self.test_session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
       if test.is_gpu_available():
         pool_funcs.append(nn_ops.max_pool_with_argmax)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 7ed99c1be9b62a145b9584fd6412f1074f501ae8..92fb68820e04c3db1385296d91d956134b8ff2d4 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -23,82 +23,93 @@ from six.moves import queue
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class PyOpTest(test.TestCase):
+def np_func(x, y):
+  return np.sinh(x) + np.cosh(y)
 
-  def testBasic(self):
 
-    def my_func(x, y):
-      return np.sinh(x) + np.cosh(y)
+def matmul(x, y):
+  return math_ops.matmul(x, y)
 
-    # single type
+
+class PyFuncTest(test.TestCase):
+  """Encapsulates tests for py_func and eager_py_func."""
+
+  # ----- Tests for py_func -----
+  def testSingleType(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
       y = constant_op.constant(2.0, dtypes.float32)
-      z = script_ops.py_func(my_func, [x, y], dtypes.float32)
-      self.assertEqual(z.eval(), my_func(1.0, 2.0).astype(np.float32))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], dtypes.float32))
+      self.assertEqual(z, np_func(1.0, 2.0).astype(np.float32))
 
-    # scalar
+  def testScalar(self):
     with self.test_session():
       x = constant_op.constant(1.0, dtypes.float32)
       y = constant_op.constant(2.0, dtypes.float32)
-      z = script_ops.py_func(my_func, [x, y], [dtypes.float32])
-      self.assertEqual(z[0].eval(), my_func(1.0, 2.0).astype(np.float32))
+      z = self.evaluate(
+          script_ops.eager_py_func(np_func, [x, y], [dtypes.float32]))
+      self.assertEqual(z[0], np_func(1.0, 2.0).astype(np.float32))
 
-    # array
+  def testArray(self):
     with self.test_session():
       x = constant_op.constant([1.0, 2.0], dtypes.float64)
       y = constant_op.constant([2.0, 3.0], dtypes.float64)
-      z = script_ops.py_func(my_func, [x, y], [dtypes.float64])
-      self.assertAllEqual(z[0].eval(),
-                          my_func([1.0, 2.0], [2.0, 3.0]).astype(np.float64))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], [dtypes.float64]))
+      self.assertAllEqual(z[0],
+                          np_func([1.0, 2.0], [2.0, 3.0]).astype(np.float64))
 
-    # a bit exotic type (complex64)
+  def testComplexType(self):
     with self.test_session():
       x = constant_op.constant(1 + 2j, dtypes.complex64)
       y = constant_op.constant(3 + 4j, dtypes.complex64)
-      z, = script_ops.py_func(my_func, [x, y], [dtypes.complex64])
-      self.assertAllClose(z.eval(), my_func(1 + 2j, 3 + 4j))
+      z = self.evaluate(script_ops.py_func(np_func, [x, y], dtypes.complex64))
+      self.assertAllClose(z, np_func(1 + 2j, 3 + 4j))
 
-    # a bit excotic function (rfft)
+  def testRFFT(self):
     with self.test_session():
       x = constant_op.constant([1., 2., 3., 4.], dtypes.float32)
 
       def rfft(x):
         return np.fft.rfft(x).astype(np.complex64)
 
-      y, = script_ops.py_func(rfft, [x], [dtypes.complex64])
-      self.assertAllClose(y.eval(), np.fft.rfft([1., 2., 3., 4.]))
+      y = self.evaluate(script_ops.py_func(rfft, [x], dtypes.complex64))
+      self.assertAllClose(y, np.fft.rfft([1., 2., 3., 4.]))
 
-    # returns a python literal.
+  def testPythonLiteral(self):
     with self.test_session():
 
       def literal(x):
-        return 1.0 if x == 0.0 else 0.0
+        return 1.0 if float(x) == 0.0 else 0.0
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, = script_ops.py_func(literal, [x], [dtypes.float64])
-      self.assertAllClose(y.eval(), 1.0)
+      y = self.evaluate(script_ops.py_func(literal, [x], dtypes.float64))
+      self.assertAllClose(y, 1.0)
 
-    # returns a list
+  def testList(self):
     with self.test_session():
 
       def list_func(x):
         return [x, x + 1]
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(list_func, [x], [dtypes.float64] * 2)
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(list_func, [x], [dtypes.float64] * 2))
+      self.assertAllClose(y, [0.0, 1.0])
 
+  def testTuple(self):
     # returns a tuple
     with self.test_session():
 
@@ -106,17 +117,17 @@ class PyOpTest(test.TestCase):
         return x, x + 1
 
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, [x], [dtypes.float64] * 2)
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(tuple_func, [x], [dtypes.float64] * 2))
+      self.assertAllClose(y, [0.0, 1.0])
 
     # returns a tuple, Tout and inp a tuple
     with self.test_session():
       x = constant_op.constant(0.0, dtypes.float64)
-      y, z = script_ops.py_func(tuple_func, (x,), (dtypes.float64,
-                                                   dtypes.float64))
-      self.assertAllClose(y.eval(), 0.0)
-      self.assertAllClose(z.eval(), 1.0)
+      y = self.evaluate(
+          script_ops.py_func(tuple_func, (x,),
+                             (dtypes.float64, dtypes.float64)))
+      self.assertAllClose(y, [0.0, 1.0])
 
   def testStrings(self):
 
@@ -128,10 +139,12 @@ class PyOpTest(test.TestCase):
 
     with self.test_session():
       x = constant_op.constant([b"hello", b"hi"], dtypes.string)
-      y, = script_ops.py_func(read_fixed_length_numpy_strings, [],
-                              [dtypes.string])
-      z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
-      self.assertListEqual(list(z.eval()), [b"hello there", b"hi there"])
+      y = self.evaluate(
+          script_ops.py_func(read_fixed_length_numpy_strings, [],
+                             dtypes.string))
+      z = self.evaluate(
+          script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
+      self.assertAllEqual(z, [b"hello there", b"hi there"])
 
   def testStringsAreConvertedToBytes(self):
 
@@ -143,10 +156,12 @@ class PyOpTest(test.TestCase):
 
     with self.test_session():
       x = constant_op.constant(["hello", "hi"], dtypes.string)
-      y, = script_ops.py_func(read_fixed_length_numpy_strings, [],
-                              [dtypes.string])
-      z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
-      self.assertListEqual(list(z.eval()), [b"hello there", b"hi there"])
+      y = self.evaluate(
+          script_ops.py_func(read_fixed_length_numpy_strings, [],
+                             dtypes.string))
+      z = self.evaluate(
+          script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
+      self.assertAllEqual(z, [b"hello there", b"hi there"])
 
   def testObjectArraysAreConvertedToBytes(self):
 
@@ -186,16 +201,8 @@ class PyOpTest(test.TestCase):
 
   def testNoInput(self):
     with self.test_session():
-      x, = script_ops.py_func(lambda: 42.0, [], [dtypes.float64])
-      self.assertAllClose(x.eval(), 42.0)
-
-  def testCleanup(self):
-    for _ in xrange(1000):
-      g = ops.Graph()
-      with g.as_default():
-        c = constant_op.constant([1.], dtypes.float32)
-        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
-    self.assertTrue(script_ops._py_funcs.size() < 100)
+      x = self.evaluate(script_ops.py_func(lambda: 42.0, [], dtypes.float64))
+      self.assertAllClose(x, 42.0)
 
   def testAlias(self):
     with self.test_session():
@@ -242,8 +249,8 @@ class PyOpTest(test.TestCase):
       # Create a numpy array aliasing a tensor and a tensor aliasing this array
       z, = script_ops.py_func(ident, [p], [dtypes.float32])
       z += 0.0  # Makes sure we release the tensor aliasing the numpy array x[0]
-                # above instead of using its memory as the return value of
-                # session.run
+      # above instead of using its memory as the return value of
+      # session.run
       self.assertEqual(0.0, z.eval(feed_dict={p: [0.0]}))
 
   def testStateful(self):
@@ -319,10 +326,10 @@ class PyOpTest(test.TestCase):
       def value(self):
         return self._value
 
-    with self.test_session() as sess:
+    with self.test_session():
       s = State()
       op = s.increment(constant_op.constant(2, dtypes.int64))
-      ret = sess.run(op)
+      ret = self.evaluate(op)
       self.assertIsNone(ret)
       self.assertAllEqual([3], s.value)
 
@@ -336,15 +343,24 @@ class PyOpTest(test.TestCase):
     with self.test_session() as sess:
       self.assertEqual(sess.run(f), [])
 
-  def _testExceptionHandling(self, py_exp, tf_exp):
+  def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
     def raise_exception():
       raise py_exp("blah")  # pylint: disable=not-callable
 
-    f = script_ops.py_func(raise_exception, [], [])
-    with self.test_session() as sess:
+    if eager:
+      if context.in_eager_mode():
+        with self.assertRaisesRegexp(tf_exp, "blah"):
+          f = script_ops.eager_py_func(raise_exception, [], [])
+        return
+      else:
+        f = script_ops.eager_py_func(raise_exception, [], [])
+    else:
+      f = script_ops.py_func(raise_exception, [], [])
+
+    with self.test_session():
       with self.assertRaisesRegexp(tf_exp, "blah"):
-        sess.run(f)
+        self.evaluate(f)
 
   def testExceptionHandling(self):
     self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
@@ -358,6 +374,89 @@ class PyOpTest(test.TestCase):
 
     self._testExceptionHandling(WeirdError, errors.UnknownError)
 
+  # ----- Tests shared by py_func and eager_py_func -----
+  def testCleanup(self):
+    for _ in xrange(1000):
+      g = ops.Graph()
+      with g.as_default():
+        c = constant_op.constant([1.], dtypes.float32)
+        _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
+        _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
+    self.assertTrue(script_ops._py_funcs.size() < 100)
+
+  # ----- Tests for eager_py_func -----
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerSingleOutputInt32(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.int32)
+    x = array_ops.ones((3, 1), dtype=dtypes.int32)
+    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllEqual(ret, [[3], [3], [3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerSingleOutputFloat32(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.float32)
+    x = array_ops.ones((3, 1), dtype=dtypes.float32)
+    output = script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.float32)
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllClose(ret, [[3.0], [3.0], [3.0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerArrayOutput(self):
+    a = array_ops.ones((3, 3), dtype=dtypes.int32)
+    x = array_ops.ones((3, 1), dtype=dtypes.int32)
+    output = script_ops.eager_py_func(
+        lambda a, x: [matmul(a, x)], inp=[a, x], Tout=[dtypes.int32])
+
+    with self.test_session():
+      ret = self.evaluate(output)
+      self.assertAllEqual(ret, [[[3], [3], [3]]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerReturnNone(self):
+
+    def no_return_value():
+      return
+
+    output = script_ops.eager_py_func(no_return_value, inp=[], Tout=[])
+    ret = self.evaluate(output)
+    if context.in_eager_mode():
+      self.assertEquals(len(ret), 0)
+    else:
+      self.assertIsNone(ret)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerPyFuncInDefun(self):
+
+    def wrapper():
+      a = array_ops.ones((3, 3), dtype=dtypes.int32)
+      x = array_ops.ones((3, 1), dtype=dtypes.int32)
+      return script_ops.eager_py_func(matmul, inp=[a, x], Tout=dtypes.int32)
+
+    wrapped = function.defun(wrapper)
+    ret = self.evaluate(wrapped())
+    self.assertAllEqual(ret, [[3], [3], [3]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerExceptionHandling(self):
+    self._testExceptionHandling(
+        ValueError, errors.InvalidArgumentError, eager=True)
+    self._testExceptionHandling(
+        TypeError, errors.InvalidArgumentError, eager=True)
+    self._testExceptionHandling(
+        StopIteration, errors.OutOfRangeError, eager=True)
+    self._testExceptionHandling(
+        MemoryError, errors.ResourceExhaustedError, eager=True)
+    self._testExceptionHandling(
+        NotImplementedError, errors.UnimplementedError, eager=True)
+
+    class WeirdError(Exception):
+      pass
+
+    self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index ca48ba6cadee431c3af41b72646d4f1b3e60ec66..a9dc7b7de000024f23b88406bf0c1c2f32ac4fac 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -57,12 +57,14 @@ class MultinomialTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testSmallEntropy(self):
     random_seed.set_random_seed(1618)
-    with test_util.device(use_gpu=True):
-      # A logit value of -10 corresponds to a probability of ~5e-5.
-      logits = constant_op.constant([[-10., 10., -10.], [-10., -10., 10.]])
-      num_samples = 1000
-      samples = self.evaluate(random_ops.multinomial(logits, num_samples))
-      self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
+    for output_dtype in [np.int32, np.int64]:
+      with test_util.device(use_gpu=True):
+        # A logit value of -10 corresponds to a probability of ~5e-5.
+        logits = constant_op.constant([[-10., 10., -10.], [-10., -10., 10.]])
+        num_samples = 1000
+        samples = self.evaluate(random_ops.multinomial(
+            logits, num_samples, output_dtype=output_dtype))
+        self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
 
   def testOneOpMultipleStepsIndependent(self):
     with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index ca57e380e8dfa5fc08afaec5ece2597faf095f8b..afdf71e6522f56913ffbe8f7771660f8af6c2455 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -24,11 +24,14 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
+# All supported dtypes for random_poisson().
+_SUPPORTED_DTYPES = (dtypes.float16, dtypes.float32, dtypes.float64,
+                     dtypes.int32, dtypes.int64)
+
 
 class RandomPoissonTest(test.TestCase):
   """This is a large test due to the moments computation taking some time."""
@@ -57,7 +60,7 @@ class RandomPoissonTest(test.TestCase):
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
     z_limit = 6.0
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64:
+    for dt in _SUPPORTED_DTYPES:
       # Test when lam < 10 and when lam >= 10
       for stride in 0, 4, 10:
         for lam in (3., 20):
@@ -102,7 +105,7 @@ class RandomPoissonTest(test.TestCase):
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
   def testCPUGPUMatch(self):
-    for dt in dtypes.float16, dtypes.float32, dtypes.float64:
+    for dt in _SUPPORTED_DTYPES:
       results = {}
       for use_gpu in [False, True]:
         sampler = self._Sampler(1000, 1.0, dt, use_gpu=use_gpu, seed=12345)
@@ -183,19 +186,11 @@ class RandomPoissonTest(test.TestCase):
 
   def testDTypeCombinationsV2(self):
     """Tests random_poisson_v2() for all supported dtype combinations."""
-    # All supported dtypes by random_poisson_v2().
-    supported_dtypes = [
-        dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
-        dtypes.int64
-    ]
-
     with self.test_session():
-      for lam_dt in supported_dtypes:
-        for out_dt in supported_dtypes:
-          # TODO(dhananjayn): Change this to use random_poisson() after
-          # switching it to RandomPoissonV2.
-          gen_random_ops.random_poisson_v2(
-              [10], constant_op.constant([1], dtype=lam_dt),
+      for lam_dt in _SUPPORTED_DTYPES:
+        for out_dt in _SUPPORTED_DTYPES:
+          random_ops.random_poisson(
+              constant_op.constant([1], dtype=lam_dt), [10],
               dtype=out_dt).eval()
 
 
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 5630259b7b7c7d4607854ee4fb8a04c404e70a17..223a4b2c8726d957f014e65ea9f87c0fb61e65bb 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -35,6 +35,9 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.util import compat
 
 prefix_path = "tensorflow/core/lib"
@@ -1011,6 +1014,25 @@ class LMDBReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
+  def testReadFromSameFile(self):
+    with self.test_session() as sess:
+      reader1 = io_ops.LMDBReader(name="test_read_from_same_file1")
+      reader2 = io_ops.LMDBReader(name="test_read_from_same_file2")
+      filename_queue = input_lib.string_input_producer(
+          [self.db_path], num_epochs=None)
+      key1, value1 = reader1.read(filename_queue)
+      key2, value2 = reader2.read(filename_queue)
+
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+      for _ in range(3):
+        for _ in range(10):
+          k1, v1, k2, v2 = sess.run([key1, value1, key2, value2])
+          self.assertAllEqual(compat.as_bytes(k1), compat.as_bytes(k2))
+          self.assertAllEqual(compat.as_bytes(v1), compat.as_bytes(v2))
+      coord.request_stop()
+      coord.join(threads)
+
   def testReadFromFolder(self):
     with self.test_session() as sess:
       reader = io_ops.LMDBReader(name="test_read_from_folder")
@@ -1029,6 +1051,26 @@ class LMDBReaderTest(test.TestCase):
                                     "\\(requested 1, current size 0\\)"):
         k, v = sess.run([key, value])
 
+  def testReadFromFileRepeatedly(self):
+    with self.test_session() as sess:
+      reader = io_ops.LMDBReader(name="test_read_from_file_repeated")
+      filename_queue = input_lib.string_input_producer(
+          [self.db_path], num_epochs=None)
+      key, value = reader.read(filename_queue)
+
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+      # Iterate over the lmdb 3 times.
+      for _ in range(3):
+        # Go over all 10 records each time.
+        for j in range(10):
+          k, v = sess.run([key, value])
+          self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(j)))
+          self.assertAllEqual(
+              compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + j))))
+      coord.request_stop()
+      coord.join(threads)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 2dc65b13849439b413b39c7dfec6e86225f6c49b..4231a79b2dcef951048ca54e8c8df2f42b44b1a1 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -50,7 +50,7 @@ def _powerset(iterable):
   """
   s = list(iterable)
   return itertools.chain.from_iterable(
-      itertools.combinations(s, r) for r in range(len(s)+1))
+      itertools.combinations(s, r) for r in range(len(s) + 1))
 
 
 class ReducedShapeTest(test.TestCase):
@@ -91,6 +91,23 @@ class ReducedShapeTest(test.TestCase):
       self._check([10, 10, 10], [-3], [1, 10, 10])
 
 
+class ReductionUnknownShape(test.TestCase):
+
+  def testBasic(self):
+    with self.test_session():
+      for dtype, reductions in [(dtypes.float32,
+                                 (math_ops.reduce_sum, math_ops.reduce_mean,
+                                  math_ops.reduce_prod, math_ops.reduce_max,
+                                  math_ops.reduce_min)),
+                                (dtypes.bool, (math_ops.reduce_all,
+                                               math_ops.reduce_any))]:
+        for reduction in reductions:
+          x = array_ops.placeholder(
+              dtype=dtype, shape=None)  # Some tensor w/ unknown shape.
+          y = reduction(x)
+          self.assertEqual(y.shape, ())
+
+
 class BaseReductionTest(test.TestCase):
 
   def _tf_reduce(self, x, reduction_axes, keep_dims):
@@ -200,7 +217,6 @@ class SumReductionTest(BaseReductionTest):
       tf_out_mean = sess.run(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
 
-
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
@@ -309,8 +325,9 @@ class SumReductionTest(BaseReductionTest):
   # Int64??
 
   def testGradient(self):
-    for dtype in [dtypes.float32, dtypes.float64, dtypes.complex64,
-                  dtypes.complex128]:
+    for dtype in [
+        dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128
+    ]:
       x = self._makeIncremental([2, 3, 4, 2], dtype)
       self._compareGradientAxes(x)
 
@@ -913,8 +930,9 @@ class CountNonzeroReductionTest(test.TestCase):
   def testFloatReduce4D(self):
     # Create a 4D array of floats and reduce across some
     # dimensions
-    np_arr = np.floor(np.arange(0.0, 210.0) / 100.0).reshape(
-        [2, 3, 5, 7]).astype(np.float32)
+    np_arr = np.floor(np.arange(0.0, 210.0) / 100.0).reshape([2, 3, 5,
+                                                              7]).astype(
+                                                                  np.float32)
     self._compareAll(np_arr, None)
     self._compareAll(np_arr, [])
     self._compareAll(np_arr, [0])
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 8f328cea631767085177d3e555c4f7565abc2c27..4c7a9cb0f9542afe8fc1608a05864b739d741c97 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -498,6 +498,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_update(v, [1], [3.0])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  def testScatterUpdateCast(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
+      state_ops.scatter_update(v, [1], [3])
+      self.assertAllEqual([1.0, 3.0], v.numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index d8f4b439e37981f3d21181feae9baa8d492ee1d5..0c77d1db921566000c2a52e6ddb9d3dddd9b193c 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_lib
 import tensorflow.python.ops.data_flow_grad  # pylint: disable=unused-import
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -57,7 +58,7 @@ class Plus1RNNCell(rnn_cell_impl.RNNCell):
   def state_size(self):
     return 5
 
-  def __call__(self, input_, state, scope=None):
+  def call(self, input_, state, scope=None):
     return (input_ + 1, state + 1)
 
 
@@ -75,10 +76,31 @@ class ScalarStateRNNCell(rnn_cell_impl.RNNCell):
   def zero_state(self, batch_size, dtype):
     return array_ops.zeros([], dtype=dtypes.int32)
 
-  def __call__(self, input_, state, scope=None):
+  def call(self, input_, state, scope=None):
     return (input_, state + 1)
 
 
+class TensorArrayStateRNNCell(rnn_cell_impl.RNNCell):
+  """RNN Cell its state as a TensorArray."""
+
+  @property
+  def output_size(self):
+    return 1
+
+  @property
+  def state_size(self):
+    return (tensor_shape.TensorShape([]), ())
+
+  def zero_state(self, batch_size, dtype):
+    return (array_ops.zeros([], dtype=dtypes.int32),
+            tensor_array_ops.TensorArray(
+                dtype=dtype, size=0, dynamic_size=True))
+
+  def call(self, input_, state, scope=None):
+    new_array = state[1].write(state[0], input_)
+    return (input_, (state[0] + 1, new_array))
+
+
 class RNNTest(test.TestCase):
 
   def setUp(self):
@@ -171,6 +193,36 @@ class RNNTest(test.TestCase):
       self.assertAllEqual(outputs.numpy(), np.array([[[1], [2], [3], [4]]]))
       self.assertEqual(state.numpy(), 4)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testTensorArrayStateIsAccepted(self):
+    cell = TensorArrayStateRNNCell()
+    in_graph_mode = context.in_graph_mode()
+
+    if in_graph_mode:
+      inputs = array_ops.placeholder(dtypes.float32, shape=(1, 4, 1))
+    else:
+      inputs = np.array([[[1], [2], [3], [4]]], dtype=np.float32)
+
+    with self.test_session() as sess:
+      outputs, state = rnn.dynamic_rnn(
+          cell, inputs, dtype=dtypes.float32, sequence_length=[4])
+      state = (state[0], state[1].stack())
+      if in_graph_mode:
+        outputs, state = sess.run(
+            [outputs, state], feed_dict={
+                inputs: [[[1], [2], [3], [4]]]
+            })
+
+    if in_graph_mode:
+      self.assertAllEqual(outputs, np.array([[[1], [2], [3], [4]]]))
+      self.assertEqual(state[0], 4)
+      self.assertAllEqual(state[1], np.array([[[1]], [[2]], [[3]], [[4]]]))
+    else:
+      self.assertAllEqual(outputs.numpy(), np.array([[[1], [2], [3], [4]]]))
+      self.assertEqual(state[0].numpy(), 4)
+      self.assertAllEqual(state[1].numpy(),
+                          np.array([[[1]], [[2]], [[3]], [[4]]]))
+
 
 ######### Benchmarking RNN code
 
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index a79d66e9889b4dc55a66c505bac9b29a453356be..9f5794951524b2689daa5fc4eefb19703262b8f0 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -157,6 +158,20 @@ class StatefulScatterNdTest(test.TestCase):
       result = sess.run(scatter)
       self.assertAllClose(result, expected)
 
+  def testSimpleResource(self):
+    indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+    ref = resource_variable_ops.ResourceVariable(
+        [0, 0, 0, 0, 0, 0, 0, 0], dtype=dtypes.float32)
+    expected = np.array([0, 11, 0, 10, 9, 0, 0, 12])
+    scatter = state_ops.scatter_nd_update(ref, indices, updates)
+    init = variables.global_variables_initializer()
+
+    with self.test_session(use_gpu=True) as sess:
+      sess.run(init)
+      sess.run(scatter)
+      self.assertAllClose(ref.eval(), expected)
+
   def testSimple2(self):
     indices = constant_op.constant([[1, 0], [1, 1]], dtype=dtypes.int32)
     updates = constant_op.constant([11., 12.], dtype=dtypes.float32)
@@ -335,7 +350,7 @@ class StatefulScatterNdTest(test.TestCase):
         indices = np.array([2, 0, 5])
         op(ref, indices, updates).eval()
 
-        # Indicies out of range should not fail.
+        # Indices out of range should not fail.
         indices = np.array([-1, 0, 5])
         op(ref, indices, updates).eval()
         indices = np.array([2, 0, 6])
@@ -487,6 +502,43 @@ class ScatterNdTest(test.TestCase):
       if self.non_aliasing_add_test:
         self.assertAllEqual(expected_input_grad, input_grad.eval())
 
+  def testGradientsRank7SliceUpdate(self):
+    indices = constant_op.constant(
+        [[[
+            [[[[0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]]],
+            [[[[0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1]]]]
+        ]]], dtype=dtypes.int32)
+    updates = constant_op.constant(
+        [[[
+            [[[[5, 6], [2, 4]]]],
+            [[[[1, 3], [6, 8]]]]
+        ]]], dtype=dtypes.float64)
+    shape = constant_op.constant([1, 1, 2, 1, 1, 2, 2], dtype=dtypes.int32)
+    input_ = array_ops.zeros(shape, dtype=dtypes.float64)
+    outputs = self.scatter_nd(indices, updates, shape, input_)
+
+    grad_vals = constant_op.constant(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=dtypes.float64)
+    updates_grad, input_grad = gradients_impl.gradients(
+        [outputs], [updates, input_], [grad_vals])
+    expected_updates_grad = np.array(
+        [[[
+            [[[[3, 4], [5, 6]]]],
+            [[[[1, 2], [7, 8]]]]
+        ]]], dtype=np.float64)
+    expected_input_grad = np.array(
+        [[[
+            [[[[1, 2], [3, 4]]]],
+            [[[[5, 6], [7, 8]]]]
+        ]]], dtype=np.float64)
+    with self.test_session():
+      self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+      if self.non_aliasing_add_test:
+        self.assertAllEqual(expected_input_grad, input_grad.eval())
+
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
     values = np.random.randn(100000)
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 516a9d000e91f55d595ee9dc9cf633fd578942b1..5a54f448d092093db668570d055801f9f9cd0f9f 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -46,13 +46,13 @@ class SegmentReductionHelper(test.TestCase):
     return constant_op.constant(
         np_values, shape=input_shape, dtype=dtype), np_values
 
-  def _segmentReduce(self, indices, x, op1, op2=None, num_out_rows=None):
+  def _segmentReduce(self, indices, x, op1, op2=None, num_segments=None):
     if not x.size:
       return np.array([])
     indices = np.asarray(indices)
-    if num_out_rows is None:
-      num_out_rows = indices[-1] + 1
-    output = [None] * num_out_rows
+    if num_segments is None:
+      num_segments = indices[-1] + 1
+    output = [None] * num_segments
     slice_shape = x.shape[indices.ndim:]
     x_flat = x.reshape((indices.size,) + slice_shape)
     for i, index in enumerate(indices.ravel()):
@@ -259,13 +259,34 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
         with self.test_session(use_gpu=True):
           tf_x, np_x = self._input(shape, dtype=dtype)
           np_ans = self._segmentReduce(
-              indices, np_x, np.add, op2=None, num_out_rows=num_segments)
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
           s = math_ops.unsorted_segment_sum(
               data=tf_x, segment_ids=indices, num_segments=num_segments)
           tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
+  def testNumSegmentsTypes(self):
+    dtypes = [dtypes_lib.int32, dtypes_lib.int64]
+    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+    num_segments = 12
+    for indices in indices_flat, indices_flat.reshape(5, 2):
+      shape = indices.shape + (2,)
+      for dtype in dtypes:
+        with self.test_session(use_gpu=True):
+          tf_x, np_x = self._input(shape)
+          num_segments_constant = constant_op.constant(
+              num_segments, dtype=dtype)
+          np_ans = self._segmentReduce(
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
+          s = math_ops.unsorted_segment_sum(
+              data=tf_x,
+              segment_ids=indices,
+              num_segments=num_segments_constant)
+          tf_ans = s.eval()
+        self.assertAllClose(np_ans, tf_ans)
+        self.assertShapeEqual(np_ans, s)
+
   def testGradientSegmentSum(self):
     num_cols = 2
     indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
@@ -323,8 +344,9 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
   def testBadIndices(self):
     # Note: GPU kernel does not return the out-of-range error needed for this
     # test, so this test is marked as cpu-only.
+    # Note: With PR #13055 a negative index will be ignored silently.
     with self.test_session(use_gpu=False):
-      for bad in [[-1]], [[7]]:
+      for bad in [[2]], [[7]]:
         unsorted = math_ops.unsorted_segment_sum([[17]], bad, num_segments=2)
         with self.assertRaisesOpError(
             r"segment_ids\[0,0\] = %d is out of range \[0, 2\)" % bad[0][0]):
@@ -360,6 +382,32 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
             x_init_value=np_x.astype(np.double), delta=1)
       self.assertAllClose(jacob_t, jacob_n)
 
+  def testDropNegatives(self):
+    # Note: the test is done by replacing segment_ids with 8 to -1
+    # for index  and replace values generated by numpy with 0.
+    dtypes = [
+        dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.int64,
+        dtypes_lib.int32, dtypes_lib.complex64, dtypes_lib.complex128
+    ]
+    indices_flat = np.array([0, 4, 0, 8, 3, 8, 4, 7, 7, 3])
+    num_segments = 12
+    for indices in indices_flat, indices_flat.reshape(5, 2):
+      shape = indices.shape + (2,)
+      for dtype in dtypes:
+        with self.test_session(use_gpu=True):
+          tf_x, np_x = self._input(shape, dtype=dtype)
+          np_ans = self._segmentReduce(
+              indices, np_x, np.add, op2=None, num_segments=num_segments)
+          # Replace np_ans[8] with 0 for the value
+          np_ans[8:] = 0
+          # Replace 8 with -1 in indices
+          np.place(indices, indices == 8, [-1])
+          s = math_ops.unsorted_segment_sum(
+              data=tf_x, segment_ids=indices, num_segments=num_segments)
+          tf_ans = s.eval()
+        self.assertAllClose(np_ans, tf_ans)
+        self.assertShapeEqual(np_ans, s)
+
 
 class SparseSegmentReductionHelper(SegmentReductionHelper):
 
@@ -369,8 +417,15 @@ class SparseSegmentReductionHelper(SegmentReductionHelper):
     return (constant_op.constant(
         indices, dtype=dtypes_lib.int32), indices, a, b)
 
-  def _sparseSegmentReduce(self, x, indices, segment_indices, op1, op2=None):
-    return self._segmentReduce(segment_indices, x[indices], op1, op2)
+  def _sparseSegmentReduce(self,
+                           x,
+                           indices,
+                           segment_indices,
+                           op1,
+                           op2=None,
+                           num_segments=None):
+    return self._segmentReduce(
+        segment_indices, x[indices], op1, op2, num_segments=num_segments)
 
 
 class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
@@ -427,6 +482,31 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         tf_ans = s.eval()
         self.assertAllClose(np_ans, tf_ans)
 
+  def testWithNumSegments(self):
+    tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [(np.add, None, math_ops.sparse_segment_sum_with_num_segments),
+                (self._mean_cum_op, self._mean_reduce_op,
+                 math_ops.sparse_segment_mean_with_num_segments)]
+    segment_indices = [0, 2, 2, 2]
+    tf_indices = [8, 3, 0, 9]
+    num_segments = 5
+    with self.test_session(use_gpu=False):
+      for np_op1, np_op2, tf_op in ops_list:
+        np_ans = self._sparseSegmentReduce(
+            np_x,
+            tf_indices,
+            segment_indices,
+            np_op1,
+            np_op2,
+            num_segments=num_segments)
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        tf_ans = s.eval()
+        self.assertAllClose(np_ans, tf_ans)
+
   def testSegmentIdsGreaterThanZero(self):
     tf_x, np_x = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [(np.add, None, math_ops.sparse_segment_sum), (
@@ -535,6 +615,63 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError("segment ids must be >= 0"):
           s.eval()
 
+  def testSegmentWithNumSegmentsValid(self):
+    # Baseline for the test*WithNumSegmentsInvalid* methods below.
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = 5
+    segment_indices = [0, 1, 3, 3]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        s.eval()
+
+  def testSegmentWithNumSegmentsInvalid1(self):
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = 5
+    segment_indices = [0, 1, 3, 5]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        with self.assertRaisesOpError("segment ids must be < num_segments"):
+          s.eval()
+
+  def testSegmentWithNumSegmentsInvalid2(self):
+    tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
+    ops_list = [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]
+    num_segments = -2
+    segment_indices = [0, 1, 3, 3]
+    tf_indices = [8, 3, 0, 9]
+    with self.test_session(use_gpu=False):
+      for tf_op in ops_list:
+        with self.assertRaisesRegexp(
+            ValueError, "Cannot specify a negative value for num_segments"):
+          tf_op(
+              data=tf_x,
+              indices=tf_indices,
+              segment_ids=segment_indices,
+              num_segments=num_segments)
+
   def testGradient(self):
     shape = [10, 4]
 
@@ -553,6 +690,32 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             delta=1)
       self.assertAllClose(jacob_t, jacob_n)
 
+  def testGradientWithEmptySegmentsAtEnd(self):
+    shape = [10, 4]
+
+    num_segments = 5
+    segment_indices = [0, 1, 2, 2]
+    num_indices = len(segment_indices)
+    for tf_op in [
+        math_ops.sparse_segment_sum_with_num_segments,
+        math_ops.sparse_segment_mean_with_num_segments,
+    ]:
+      with self.test_session():
+        tf_indices, _, tf_x, np_x = self._sparse_input(
+            shape, num_indices, dtype=dtypes_lib.float64)
+        s = tf_op(
+            data=tf_x,
+            indices=tf_indices,
+            segment_ids=segment_indices,
+            num_segments=num_segments)
+        jacob_t, jacob_n = gradient_checker.compute_gradient(
+            tf_x,
+            shape,
+            s, [5, 4],
+            x_init_value=np_x.astype(np.double),
+            delta=1)
+      self.assertAllClose(jacob_t, jacob_n)
+
   def testGradientValid(self):
     # Baseline for the testGradient*Invalid* methods below.
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
@@ -598,7 +761,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     ops_list = [
         math_ops.sparse_segment_mean_grad, math_ops.sparse_segment_sqrt_n_grad
     ]
-    segment_indices = [0, 1, 1, 1]  # 2 segments
+    segment_indices = [0, 1, 1, 4]  # 5 segments
     tf_indices = [8, 3, 0, 9]
     with self.test_session(use_gpu=False):
       for tf_op in ops_list:
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index a9fc699b21e883db6c627c478ad29c79475b1271..7368251ab69574cc6cba703e605f108c6ab45649 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -258,6 +258,16 @@ class ShapeOpsTest(test.TestCase):
       self.assertAllEqual([True], array_ops.expand_dims(inp, 0).eval())
       self.assertAllEqual([True], array_ops.expand_dims(inp, -1).eval())
 
+  def testExpandDimsDimType(self):
+    for dtype in [dtypes.int32, dtypes.int64]:
+      x = np.zeros([2])
+      np_ans = np.expand_dims(x, axis=0)
+      with self.test_session(use_gpu=True):
+        tensor = array_ops.expand_dims(x, constant_op.constant(0, dtype))
+        tf_ans = tensor.eval()
+      self.assertShapeEqual(np_ans, tensor)
+      self.assertAllEqual(np_ans, tf_ans)
+
   def _compareSqueeze(self, x, squeeze_dims, use_gpu):
     with self.test_session(use_gpu=use_gpu):
       if squeeze_dims:
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index f415d9e70db7f6dbe270ce9ab2099c9464a447fc..051a25080b826de05ee3e24a82fbcd1f47995544 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -283,7 +283,7 @@ class SliceTest(test.TestCase):
     # unintended behavior is prevented.
     c = constant_op.constant(5.0)
     with self.assertRaisesWithPredicateMatch(
-        TypeError, lambda e: "'Tensor' object is not iterable" in str(e)):
+        TypeError, lambda e: "`Tensor` objects are not iterable" in str(e)):
       for _ in c:
         pass
 
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index 4a9353d6bf8b5471e6f1769efe0ecfab250d5e88..3c98a685e07a1f2d55c3c1035a99ffaa593d35b3 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -277,11 +277,15 @@ class SpaceToDepthTest(test.TestCase):
 class SpaceToDepthGradientTest(test.TestCase):
 
   # Check the gradients.
-  def _checkGrad(self, x, block_size):
+  def _checkGrad(self, x, block_size, data_format):
+    # NCHW is implemented for only GPU.
+    if data_format == "NCHW" and not test.is_gpu_available():
+      return
+
     assert 4 == x.ndim
     with self.test_session(use_gpu=True):
       tf_x = ops.convert_to_tensor(x)
-      tf_y = array_ops.space_to_depth(tf_x, block_size)
+      tf_y = array_ops.space_to_depth(tf_x, block_size, data_format=data_format)
       epsilon = 1e-2
       ((x_jacob_t, x_jacob_n)) = gradient_checker.compute_gradient(
           tf_x,
@@ -295,23 +299,28 @@ class SpaceToDepthGradientTest(test.TestCase):
 
   # Tests a gradient for space_to_depth of x which is a four dimensional
   # tensor of shape [b, h * block_size, w * block_size, d].
-  def _compare(self, b, h, w, d, block_size):
+  def _compare(self, b, h, w, d, block_size, data_format):
     block_size_sq = block_size * block_size
-    x = np.random.normal(0, 1, b * h * w * d *
-                         block_size_sq).astype(np.float32).reshape(
-                             [b, h * block_size, w * block_size, d])
+    data = np.random.normal(0, 1, b * h * w * d * block_size_sq).astype(
+        np.float32)
+    if data_format == "NHWC":
+      x = data.reshape([b, h * block_size, w * block_size, d])
+    else:
+      x = data.reshape([b, d, h * block_size, w * block_size])
 
-    self._checkGrad(x, block_size)
+    self._checkGrad(x, block_size, data_format)
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
   def testSmall(self):
     block_size = 2
-    self._compare(1, 2, 3, 5, block_size)
+    self._compare(1, 2, 3, 5, block_size, "NHWC")
+    self._compare(1, 2, 3, 5, block_size, "NCHW")
 
   def testSmall2(self):
     block_size = 2
-    self._compare(2, 4, 3, 2, block_size)
+    self._compare(2, 4, 3, 2, block_size, "NHWC")
+    self._compare(2, 4, 3, 2, block_size, "NCHW")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index e87fa0c94c4cf3346c0127dd17b037cabb3cbb56..0d2887f3cef88605e87bddb7830845f12e37220b 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -196,7 +196,7 @@ class SparseReshapeTest(test.TestCase):
       sp_input = self._SparseTensorPlaceholder()
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(sp_input, [4, -1, -1])
-      with self.assertRaisesOpError("only one output shape size may be -1"):
+      with self.assertRaisesOpError("only one output dimension may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
   def testProvideStaticallyMismatchedSizes(self):
diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
index af395b31bfc71e85350ea4c57e34a520a80f06fd..27b39a626fcc6b2705bf9e797b5293ed3f1c7820 100644
--- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
@@ -64,16 +64,75 @@ class SerializeSparseTest(test.TestCase):
     shape = np.array([3, 4, 5]).astype(np.int64)
     return sparse_tensor_lib.SparseTensorValue(ind, val, shape)
 
-  def testSerializeDeserializeMany(self):
+  def _testSerializeDeserializeHelper(self,
+                                      serialize_fn,
+                                      deserialize_fn,
+                                      out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorValue_5x6(np.arange(6))
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
+
+      indices, values, shape = sess.run(sp_deserialized)
+
+      self.assertAllEqual(indices, sp_input[0])
+      self.assertAllEqual(values, sp_input[1])
+      self.assertAllEqual(shape, sp_input[2])
+
+  def testSerializeDeserialize(self):
+    self._testSerializeDeserializeHelper(sparse_ops.serialize_sparse,
+                                         sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserialize(self):
+    self._testSerializeDeserializeHelper(sparse_ops.serialize_sparse,
+                                         sparse_ops.deserialize_sparse,
+                                         dtypes.variant)
+
+  def _testSerializeDeserializeBatchHelper(self,
+                                           serialize_fn,
+                                           deserialize_fn,
+                                           out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorValue_5x6(np.arange(6))
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      serialized = array_ops.stack([serialized, serialized])
+
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
+
+      combined_indices, combined_values, combined_shape = sess.run(
+          sp_deserialized)
+
+      self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
+      self.assertAllEqual(combined_indices[:6, 1:], sp_input[0])
+      self.assertAllEqual(combined_indices[6:, 0], [1] * 6)  # minibatch 1
+      self.assertAllEqual(combined_indices[6:, 1:], sp_input[0])
+      self.assertAllEqual(combined_values[:6], sp_input[1])
+      self.assertAllEqual(combined_values[6:], sp_input[1])
+      self.assertAllEqual(combined_shape, [2, 5, 6])
+
+  def testSerializeDeserializeBatch(self):
+    self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse)
+
+  def testSerializeDeserializeManyBatch(self):
+    self._testSerializeDeserializeBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantSerializeDeserializeBatch(self):
+    self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse,
+                                              dtypes.variant)
+
+  def _testSerializeDeserializeBatchInconsistentShapeHelper(
+      self, serialize_fn, deserialize_fn, out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
       sp_input1 = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
-      serialized_concat = array_ops.stack([serialized0, serialized1])
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
+      serialized = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized)
@@ -86,18 +145,72 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testFeedSerializeDeserializeMany(self):
+  def testSerializeDeserializeBatchInconsistentShape(self):
+    self._testSerializeDeserializeBatchInconsistentShapeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserializeBatchInconsistentShape(self):
+    self._testSerializeDeserializeBatchInconsistentShapeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testSerializeDeserializeNestedBatchHelper(self,
+                                                 serialize_fn,
+                                                 deserialize_fn,
+                                                 out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      sp_input = self._SparseTensorValue_5x6(np.arange(6))
+      serialized = serialize_fn(sp_input, out_type=out_type)
+      serialized = array_ops.stack([serialized, serialized])
+      serialized = array_ops.stack([serialized, serialized])
+
+      sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
+
+      combined_indices, combined_values, combined_shape = sess.run(
+          sp_deserialized)
+
+      # minibatch 0
+      self.assertAllEqual(combined_indices[:6, :2], [[0, 0]] * 6)
+      self.assertAllEqual(combined_indices[:6, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[:6], sp_input[1])
+      # minibatch 1
+      self.assertAllEqual(combined_indices[6:12, :2], [[0, 1]] * 6)
+      self.assertAllEqual(combined_indices[6:12, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[6:12], sp_input[1])
+      # minibatch 2
+      self.assertAllEqual(combined_indices[12:18, :2], [[1, 0]] * 6)
+      self.assertAllEqual(combined_indices[12:18, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[12:18], sp_input[1])
+      # minibatch 3
+      self.assertAllEqual(combined_indices[18:, :2], [[1, 1]] * 6)
+      self.assertAllEqual(combined_indices[18:, 2:], sp_input[0])
+      self.assertAllEqual(combined_values[18:], sp_input[1])
+
+      self.assertAllEqual(combined_shape, [2, 2, 5, 6])
+
+  def testSerializeDeserializeNestedBatch(self):
+    self._testSerializeDeserializeNestedBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testVariantSerializeDeserializeNestedBatch(self):
+    self._testSerializeDeserializeNestedBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testFeedSerializeDeserializeBatchHelper(self,
+                                               serialize_fn,
+                                               deserialize_fn,
+                                               out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
       combined_indices, combined_values, combined_shape = sess.run(
           sp_deserialized, {sp_input0: input0_val,
@@ -111,40 +224,96 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
-  def testSerializeManyDeserializeManyRoundTrip(self):
+  def testFeedSerializeDeserializeBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                                  sparse_ops.deserialize_sparse)
+
+  def testFeedSerializeDeserializeManyBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testFeedVariantSerializeDeserializeBatch(self):
+    self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
+                                                  sparse_ops.deserialize_sparse,
+                                                  dtypes.variant)
+
+  def _testSerializeManyShapeHelper(self,
+                                    serialize_many_fn,
+                                    out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
       indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
       values_value = np.array([b"a", b"b", b"c"])
       shape_value = np.array([4, 5], dtype=np.int64)
       sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
-      serialized = sparse_ops.serialize_many_sparse(sparse_tensor)
-      deserialized = sparse_ops.deserialize_many_sparse(
-          serialized, dtype=dtypes.string)
-      serialized_value, deserialized_value = sess.run(
-          [serialized, deserialized],
+      serialized = serialize_many_fn(sparse_tensor, out_type=out_type)
+      serialized_value = sess.run(
+          serialized,
           feed_dict={
               sparse_tensor.indices: indices_value,
               sparse_tensor.values: values_value,
               sparse_tensor.dense_shape: shape_value
           })
       self.assertEqual(serialized_value.shape, (4, 3))
+
+  def testSerializeManyShape(self):
+    self._testSerializeManyShapeHelper(sparse_ops.serialize_many_sparse)
+
+  def testVariantSerializeManyShape(self):
+    # NOTE: The following test is a no-op as it is currently not possible to
+    # convert the serialized variant value to a numpy value.
+    pass
+
+  def _testSerializeManyDeserializeBatchHelper(self,
+                                               serialize_many_fn,
+                                               deserialize_fn,
+                                               out_type=dtypes.string):
+    with self.test_session(use_gpu=False) as sess:
+      # N == 4 because shape_value == [4, 5]
+      indices_value = np.array([[0, 0], [0, 1], [2, 0]], dtype=np.int64)
+      values_value = np.array([b"a", b"b", b"c"])
+      shape_value = np.array([4, 5], dtype=np.int64)
+      sparse_tensor = self._SparseTensorPlaceholder(dtype=dtypes.string)
+      serialized = serialize_many_fn(sparse_tensor, out_type=out_type)
+      deserialized = deserialize_fn(serialized, dtype=dtypes.string)
+      deserialized_value = sess.run(
+          deserialized,
+          feed_dict={
+              sparse_tensor.indices: indices_value,
+              sparse_tensor.values: values_value,
+              sparse_tensor.dense_shape: shape_value
+          })
       self.assertAllEqual(deserialized_value.indices, indices_value)
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
-  def testDeserializeFailsWrongType(self):
+  def testSerializeManyDeserializeBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse)
+
+  def testSerializeManyDeserializeManyBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantSerializeManyDeserializeBatch(self):
+    self._testSerializeManyDeserializeBatchHelper(
+        sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testDeserializeFailsWrongTypeHelper(self,
+                                           serialize_fn,
+                                           deserialize_fn,
+                                           out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_3x4(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int64)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int64)
 
       with self.assertRaisesOpError(
           r"Requested SparseTensor of type int64 but "
@@ -153,41 +322,78 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
-  def testDeserializeFailsInconsistentRank(self):
+  def testDeserializeFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantDeserializeFailsWrongType(self):
+    self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
+                                              sparse_ops.deserialize_sparse,
+                                              dtypes.variant)
+
+  def _testDeserializeFailsInconsistentRankHelper(self,
+                                                  serialize_fn,
+                                                  deserialize_fn,
+                                                  out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       sp_input1 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
       input1_val = self._SparseTensorValue_1x1x1()
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
-      serialized1 = sparse_ops.serialize_sparse(sp_input1)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
+      serialized1 = serialize_fn(sp_input1, out_type=out_type)
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
       with self.assertRaisesOpError(
-          r"Inconsistent rank across SparseTensors: rank prior to "
-          r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
+          r"Inconsistent shape across SparseTensors: rank prior to "
+          r"SparseTensor\[1\] was: 2 but rank of SparseTensor\[1\] is: 3"):
         sess.run(sp_deserialized,
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
-  def testDeserializeFailsInvalidProto(self):
+  def testDeserializeFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
+  def testVariantDeserializeFailsInconsistentRank(self):
+    self._testDeserializeFailsInconsistentRankHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
+        dtypes.variant)
+
+  def _testDeserializeFailsInvalidProtoHelper(self,
+                                              serialize_fn,
+                                              deserialize_fn,
+                                              out_type=dtypes.string):
     with self.test_session(use_gpu=False) as sess:
       sp_input0 = self._SparseTensorPlaceholder()
       input0_val = self._SparseTensorValue_5x6(np.arange(6))
-      serialized0 = sparse_ops.serialize_sparse(sp_input0)
+      serialized0 = serialize_fn(sp_input0, out_type=out_type)
       serialized1 = ["a", "b", "c"]
       serialized_concat = array_ops.stack([serialized0, serialized1])
 
-      sp_deserialized = sparse_ops.deserialize_many_sparse(
-          serialized_concat, dtype=dtypes.int32)
+      sp_deserialized = deserialize_fn(serialized_concat, dtype=dtypes.int32)
 
-      with self.assertRaisesOpError(
-          r"Could not parse serialized_sparse\[1, 0\]"):
+      with self.assertRaisesOpError(r"Could not parse serialized proto"):
         sess.run(sp_deserialized, {sp_input0: input0_val})
 
+  def testDeserializeFailsInvalidProto(self):
+    self._testDeserializeFailsInvalidProtoHelper(sparse_ops.serialize_sparse,
+                                                 sparse_ops.deserialize_sparse)
+
+  def testDeserializeManyFailsInvalidProto(self):
+    self._testDeserializeFailsInvalidProtoHelper(
+        sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index b44dc037f146750c6006f79a3a0fa5745936ff2b..6171793b148f8d8f195b9548a13df89d29c5e96e 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -84,7 +85,7 @@ class SplitOpTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       sess.run(result, feed_dict={model_input2: np.ones([4, 2])})
 
-  def testExplicitNum(self):
+  def testFailWithoutExplicitNum(self):
     size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
 
     value = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
@@ -92,24 +93,31 @@ class SplitOpTest(test.TestCase):
     with self.test_session(use_gpu=True) as sess:
       with self.assertRaises(ValueError) as context:
         sess.run(array_ops.split(value, size_splits), {size_splits: [2, 2, 6]})
-
       self.assertTrue("Cannot infer num from shape" in str(context.exception))
 
-      result = sess.run(array_ops.split(
-          value, size_splits, num=3), {size_splits: [2, 2, 6]})
+  @test_util.run_in_graph_and_eager_modes()
+  def testExplicitNum(self):
+    size_splits = array_ops.constant([2, 2, 6], dtype=dtypes.int32)
+    value = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
 
-    self.assertAllEqual(result[0], value[0:2])
-    self.assertAllEqual(result[1], value[2:4])
-    self.assertAllEqual(result[2], value[4:])
+    # Eager and Graph modes raise different exceptions
+    with self.assertRaises((errors_impl.InvalidArgumentError, ValueError)):
+      array_ops.split(value, size_splits, num=4)
 
+    r = self.evaluate(array_ops.split(value, size_splits, num=3))
+    self.assertAllEqual(r[0], value[0:2])
+    self.assertAllEqual(r[1], value[2:4])
+    self.assertAllEqual(r[2], value[4:])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testListOfScalarTensors(self):
     a = math_ops.to_int32(5)
     b = math_ops.to_int32(6)
 
     value = np.random.rand(11, 11)
 
-    with self.test_session(use_gpu=True) as sess:
-      result = sess.run(array_ops.split(value, [a, b]))
+    with test_util.device(use_gpu=True):
+      result = self.evaluate(array_ops.split(value, [a, b]))
 
     self.assertAllEqual(result[0], value[0:5, :])
     self.assertAllEqual(result[1], value[5:, :])
@@ -122,11 +130,11 @@ class SplitOpTest(test.TestCase):
       num_split = np.random.randint(16, 25)
     else:
       num_split = np.random.randint(2, 8)
-    size_splits = np.random.randint(2, 8, num_split)
+    size_splits = np.random.randint(2, 8, num_split, dtype=np.int32)
     shape[split_dim] = np.sum(size_splits)
     inp = self._makeData(shape, dtype)
-    with self.test_session(use_gpu=True) as sess:
-      result = sess.run(array_ops.split(inp, size_splits, split_dim))
+    with test_util.device(use_gpu=True):
+      result = self.evaluate(array_ops.split(inp, size_splits, split_dim))
     slices = [slice(0, x) for x in shape]
     offset = 0
     for i in range(num_split):
@@ -137,22 +145,22 @@ class SplitOpTest(test.TestCase):
   def _testSpecialCasesVariable(self):
     inp = np.random.rand(4, 4).astype("f")
 
-    with self.test_session(use_gpu=True) as sess:
-      result = sess.run(array_ops.split(inp, [4], 0))
+    with test_util.device(use_gpu=True):
+      result = self.evaluate(array_ops.split(inp, [4], 0))
       self.assertAllEqual(result[0], inp)
 
-      result = sess.run(array_ops.split(inp, [-1, 3], 0))
+      result = self.evaluate(array_ops.split(inp, [-1, 3], 0))
       self.assertAllEqual(result[0], inp[0:1, :])
       self.assertAllEqual(result[1], inp[1:4, :])
 
   def _testHugeNumberOfTensorsVariable(self, dtype):
-    num_split = 10000
-    size_splits = np.random.randint(1, 3, num_split)
+    num_split = 1000
+    size_splits = np.random.randint(1, 3, num_split, dtype=np.int32)
     shape = [3, np.sum(size_splits)]
     split_dim = 1
     inp = self._makeData(shape, dtype)
-    with self.test_session(use_gpu=True) as sess:
-      result = sess.run(array_ops.split(inp, size_splits, split_dim))
+    with test_util.device(use_gpu=True):
+      result = self.evaluate(array_ops.split(inp, size_splits, split_dim))
     slices = [slice(0, x) for x in shape]
     offset = 0
     for i in range(num_split):
@@ -160,6 +168,7 @@ class SplitOpTest(test.TestCase):
       offset += size_splits[i]
       self.assertAllEqual(result[i], inp[slices])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSpecialCasesVariable(self):
     self._testSpecialCasesVariable()
     for dtype in _TEST_DTYPES:
@@ -167,7 +176,7 @@ class SplitOpTest(test.TestCase):
 
   def _testGradientsSimpleVariable(self, dtype):
     inp = self._makeData((4, 4), dtype)
-    with self.test_session(use_gpu=True):
+    with test_util.device(use_gpu=True):
       inp_tensor = ops.convert_to_tensor(inp)
       s = array_ops.split(inp_tensor, [1, 3], 1)
       inp_grads = [
@@ -175,7 +184,7 @@ class SplitOpTest(test.TestCase):
       ]
       grad_tensors = [constant_op.constant(x) for x in inp_grads]
       grad = gradients_impl.gradients(s, [inp_tensor], grad_tensors)[-1]
-      result = grad.eval()
+      result = self.evaluate(grad)
 
     self.assertAllEqual(result[:, 0:1], inp_grads[0])
     self.assertAllEqual(result[:, 1:4], inp_grads[1])
@@ -191,9 +200,9 @@ class SplitOpTest(test.TestCase):
 
   def _compare(self, x, dim, num):
     np_ans = np.split(x, num, dim)
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.device(use_gpu=True):
       tf_ans = array_ops.split(value=x, num_or_size_splits=num, axis=dim)
-      out = sess.run(tf_ans)
+      out = self.evaluate(tf_ans)
     self.assertEqual(num, len(np_ans))
     self.assertEqual(num, len(np_ans))
     self.assertEqual(num, len(out))
@@ -201,26 +210,29 @@ class SplitOpTest(test.TestCase):
       self.assertAllEqual(np_ans[i], out[i])
       self.assertShapeEqual(np_ans[i], tf_ans[i])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSplitRows(self):
     for dtype in _TEST_DTYPES:
       inp = self._makeData((4, 4), dtype)
       self._compare(inp, 0, 4)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSplitCols(self):
     for dtype in _TEST_DTYPES:
       inp = self._makeData((4, 4), dtype)
       self._compare(inp, 1, 4)
 
   def _testEmpty(self, x, dim, num, expected_shape):
-    with self.test_session() as sess:
+    with test_util.device(use_gpu=True):
       tf_ans = array_ops.split(value=x, num_or_size_splits=num, axis=dim)
-      out = sess.run(tf_ans)
+      out = self.evaluate(tf_ans)
     self.assertEqual(x.size, 0)
     self.assertEqual(len(out), num)
     for i in range(num):
       self.assertEqual(out[i].shape, expected_shape)
       self.assertEqual(expected_shape, tf_ans[i].get_shape())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testEmpty(self):
     # Note: np.split returns a rank-0 empty ndarray
     # if the input ndarray is empty.
@@ -232,6 +244,7 @@ class SplitOpTest(test.TestCase):
       self._testEmpty(inp, 2, 3, (8, 0, 7))
       self._testEmpty(inp, 2, 7, (8, 0, 3))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testIdentity(self):
     for dtype in _TEST_DTYPES:
       inp = self._makeData((2, 2, 2), dtype)
@@ -239,6 +252,7 @@ class SplitOpTest(test.TestCase):
       self._compare(inp, 1, 1)
       self._compare(inp, 2, 1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSplitDim0(self):
     for dtype in _TEST_DTYPES:
       self._compare(self._makeData((6, 10, 18), dtype), 0, 3)
@@ -255,8 +269,8 @@ class SplitOpTest(test.TestCase):
       num_split = np.random.randint(2, 8)
     shape[split_dim] = np.random.randint(2, 5) * num_split
     inp = self._makeData(shape, dtype)
-    with self.test_session(use_gpu=True) as sess:
-      result = sess.run(
+    with test_util.device(use_gpu=True):
+      result = self.evaluate(
           array_ops.split(
               value=inp, num_or_size_splits=num_split, axis=split_dim))
     slices = [slice(0, x) for x in shape]
@@ -267,6 +281,7 @@ class SplitOpTest(test.TestCase):
       offset += length
       self.assertAllEqual(result[i], inp[slices])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testRandom(self):
     for dtype in _TEST_DTYPES:
       for _ in range(5):
diff --git a/tensorflow/python/kernel_tests/substr_op_test.py b/tensorflow/python/kernel_tests/substr_op_test.py
index 854394b0dde867f7b351619e0832a39a77c3556b..73ac71e1f5c5a8e0e935154f729f7900f887b26b 100644
--- a/tensorflow/python/kernel_tests/substr_op_test.py
+++ b/tensorflow/python/kernel_tests/substr_op_test.py
@@ -38,6 +38,17 @@ class SubstrOpTest(test.TestCase):
       substr = substr_op.eval()
       self.assertAllEqual(substr, expected_value)
 
+    # position is equal to the length of string.
+    test_string = b""
+    position = np.array(0, dtype)
+    length = np.array(2, dtype)
+    expected_value = b""
+
+    substr_op = string_ops.substr(test_string, position, length)
+    with self.test_session():
+      substr = substr_op.eval()
+      self.assertAllEqual(substr, expected_value)
+
   def _testVectorStrings(self, dtype):
     test_string = [b"Hello", b"World"]
     position = np.array(1, dtype)
@@ -136,7 +147,7 @@ class SubstrOpTest(test.TestCase):
 
     # Vector/Scalar
     test_string = [b"good", b"good", b"bad", b"good"]
-    position = np.array(3, dtype)
+    position = np.array(4, dtype)
     length = np.array(1, dtype)
     substr_op = string_ops.substr(test_string, position, length)
     with self.test_session():
@@ -155,7 +166,7 @@ class SubstrOpTest(test.TestCase):
     # Matrix/Matrix
     test_string = [[b"good", b"good", b"good"], [b"good", b"good", b"bad"],
                    [b"good", b"good", b"good"]]
-    position = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]], dtype)
+    position = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 3]], dtype)
     length = np.array([[3, 2, 1], [1, 2, 3], [2, 2, 2]], dtype)
     substr_op = string_ops.substr(test_string, position, length)
     with self.test_session():
@@ -164,7 +175,7 @@ class SubstrOpTest(test.TestCase):
 
     # Broadcast
     test_string = [[b"good", b"good", b"good"], [b"good", b"good", b"bad"]]
-    position = np.array([1, 2, 3], dtype)
+    position = np.array([1, 2, 4], dtype)
     length = np.array([1, 2, 3], dtype)
     substr_op = string_ops.substr(test_string, position, length)
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/summary_image_op_test.py b/tensorflow/python/kernel_tests/summary_image_op_test.py
index d2152ab560ad27b8a761ff8029fa425fdc9ff20d..4718827e8885c328cb2e84c2f1e8880bdbdb6cae 100644
--- a/tensorflow/python/kernel_tests/summary_image_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_image_op_test.py
@@ -50,7 +50,6 @@ class SummaryImageOpTest(test.TestCase):
     self.assertProtoEquals(expected, image_summ)
 
   def testImageSummary(self):
-    np.random.seed(7)
     for depth in (1, 3, 4):
       for positive in False, True:
         with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 8b9c58ac3f7c72344667e0dc8511dcfee5ceaa08..f0354374ac82ee6ac201095c24716f51589fa965 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 import traceback
 
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -32,9 +34,10 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
-def variable_scoped_function():
+def variable_scoped_function(trainable=True):
   return variable_scope.get_variable(
-      "dummy", shape=[1], initializer=init_ops.zeros_initializer())
+      "dummy", shape=[1], trainable=trainable,
+      initializer=init_ops.zeros_initializer())
 
 
 def internally_variable_scoped_function(scope_name):
@@ -50,6 +53,13 @@ def function_with_create(trainable):
       "dummy", shape=[1], initializer=init_ops.zeros_initializer())
 
 
+def function_with_side_create(trainable, name="side"):
+  """Creates a variable as a side effect using tf.get_variable."""
+  variable_scope.get_variable(name, shape=[1], trainable=trainable)
+  return variable_scope.get_variable(
+      "dummy", shape=[1], initializer=init_ops.zeros_initializer())
+
+
 def variable_scoped_function_with_local_variable():
   variable_scope.get_local_variable(
       "local", shape=[1], initializer=init_ops.zeros_initializer())
@@ -99,6 +109,46 @@ class TemplateTest(test.TestCase):
     # Parameters are tied, so the loss should have gone down when we trained it.
     self.assertLess(final_test_loss, initial_test_loss)
 
+  def test_end_to_end_eager(self):
+    """This test shows a very simple line model with test_loss in eager mode.
+
+    The template is used to share parameters between a training and test model.
+    """
+    with context.eager_mode():
+      # y = 2x + 1
+      training_input, training_output = ([1., 2., 3., 4.], [2.8, 5.1, 7.2, 8.7])
+      test_input, test_output = ([5., 6., 7., 8.], [11, 13, 15, 17])
+
+      random_seed.set_random_seed(1234)
+
+      def test_line(x):
+        m = variable_scope.get_variable(
+            "w", shape=[], initializer=init_ops.truncated_normal_initializer())
+        b = variable_scope.get_variable(
+            "b", shape=[], initializer=init_ops.truncated_normal_initializer())
+        return x * m + b
+
+      line_template = template.make_template("line", test_line)
+
+      def train_loss():
+        train_prediction = line_template(training_input)
+        return math_ops.reduce_mean(
+            math_ops.square(train_prediction - training_output))
+
+      def test_loss():
+        test_prediction = line_template(test_input)
+        return math_ops.reduce_mean(
+            math_ops.square(test_prediction - test_output))
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.1)
+      initial_test_loss = test_loss()
+      optimizer.minimize(train_loss)
+      final_test_loss = test_loss()
+
+      # Parameters are tied, so the loss should have gone down after training.
+      self.assertLess(final_test_loss.numpy(), initial_test_loss.numpy())
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_skip_stack_frames(self):
     first = traceback.format_stack()
     second = traceback.format_stack()
@@ -106,6 +156,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(result))
     self.assertNotEqual(len(first), len(result))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_template_with_name(self):
     tmpl1 = template.make_template("s1", variable_scoped_function)
     tmpl2 = template.make_template("s1", variable_scoped_function)
@@ -118,15 +169,23 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1/dummy:0", v1.name)
     self.assertEqual("s1_1/dummy:0", v3.name)
 
-  def test_unique_name_raise_error(self):
+  def test_same_unique_name_raise_error(self):
     tmpl1 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
     tmpl1()
     tmpl2 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(
+        ValueError, "Variable s1/dummy already exists, disallowed.*"):
       tmpl2()
 
+  def test_unique_name_raise_error_in_eager(self):
+    with context.eager_mode():
+      with self.assertRaisesRegexp(
+          ValueError, "unique_name cannot be used in eager mode."):
+        template.make_template(
+            "_", variable_scoped_function, unique_name_="s1")
+
   def test_unique_name_and_reuse(self):
     tmpl1 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
@@ -142,6 +201,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(v1, v3)
     self.assertEqual("s1/dummy:0", v1.name)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_template_in_scope(self):
     tmpl1 = template.make_template("s1", variable_scoped_function)
     tmpl2 = template.make_template("s1", variable_scoped_function)
@@ -158,6 +218,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("scope/s1/dummy:0", v1.name)
     self.assertEqual("scope/s1_1/dummy:0", v3.name)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_template_with_internal_reuse(self):
     tmpl1 = template.make_template("s1", internally_variable_scoped_function)
     tmpl2 = template.make_template("s1", internally_variable_scoped_function)
@@ -173,10 +234,13 @@ class TemplateTest(test.TestCase):
     with self.assertRaises(ValueError):
       tmpl1("not_test")
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_template_without_name(self):
-    with self.assertRaises(ValueError):
+    with self.assertRaisesRegexp(
+        ValueError, "name cannot be None."):
       template.make_template(None, variable_scoped_function)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_make_template(self):
     # Test both that we can call it with positional and keywords.
     tmpl1 = template.make_template(
@@ -199,10 +263,28 @@ class TemplateTest(test.TestCase):
     with self.assertRaises(ValueError):
       tmpl()
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_enforces_no_extra_trainable_variables_eager(self):
+    tmpl = template.make_template("s",
+                                  function_with_side_create,
+                                  trainable=True)
+
+    tmpl(name="1")
+    with self.assertRaises(ValueError):
+      tmpl(name="2")
+
   def test_permits_extra_non_trainable_variables(self):
     tmpl = template.make_template("s", function_with_create, trainable=False)
     self.assertEqual(tmpl(), tmpl())
 
+  def test_permits_extra_non_trainable_variables_eager(self):
+    with context.eager_mode():
+      tmpl = template.make_template("s",
+                                    function_with_side_create,
+                                    trainable=False)
+      self.assertEqual(tmpl(name="1"), tmpl(name="2"))
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_internal_variable_reuse(self):
 
     def nested():
@@ -241,11 +323,28 @@ class TemplateTest(test.TestCase):
     v1 = tmpl1()
     v2 = tmpl1()
     v3 = tmpl2()
-    self.assertEqual(v1, v2)
+    self.assertTrue(v1, v2)
     self.assertNotEqual(v1, v3)
     self.assertEqual("s1/nested_1/dummy:0", v1.name)
     self.assertEqual("s1_1/nested_1/dummy:0", v3.name)
 
+  def test_nested_eager_templates_raises_error(self):
+
+    def nested_template():
+      nested1 = template.make_template("nested", variable_scoped_function)
+      nested2 = template.make_template("nested", variable_scoped_function)
+      v1 = nested1()
+      v2 = nested2()
+      self.assertNotEqual(v1, v2)
+      return v2
+
+    with context.eager_mode():
+      tmpl1 = template.make_template("s1", nested_template)
+      with self.assertRaisesRegexp(
+          ValueError, "Nested EagerTemaplates are not currently supported."):
+        tmpl1()
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_immediate_scope_creation(self):
     # Create templates in scope a then call in scope b. make_template should
     # capture the scope the first time it is called, and make_immediate_template
@@ -270,6 +369,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("ctor_scope/a/dummy:0", inner_imm_var.name)
     self.assertEqual("call_scope/b/dummy:0", inner_defer_var.name)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_scope_access(self):
     # Ensure that we can access the scope inside the template, because the name
     # of that scope may be different from the name we pass to make_template, due
@@ -294,6 +394,7 @@ class TemplateTest(test.TestCase):
     # Template is called at the top level, so there is no preceding "foo_2".
     self.assertEqual(tc.variable_scope.name, "blah")
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_custom_getter(self):
     # Custom getter that maintains call count and forwards to true getter
     custom_getter_count = [0]
@@ -313,7 +414,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(custom_getter_count[0], 2)
 
     # Test that custom getter is called when the variable scope is created
-  # during construction
+    # during construction
     custom_getter_count[0] = 0
     tmpl2 = template.make_template(
         "s2",
@@ -326,6 +427,7 @@ class TemplateTest(test.TestCase):
     tmpl2()
     self.assertEqual(custom_getter_count[0], 2)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_fails_gracefully(self):
     for create_scope_now in [True, False]:
       def module_function_with_one_arg(inputs):
@@ -336,7 +438,7 @@ class TemplateTest(test.TestCase):
       templatized_function = template.make_template(
           "f1", module_function_with_one_arg,
           create_scope_now_=create_scope_now)
-      data = array_ops.zeros(1)
+      data = array_ops.zeros([1])
       try:
         # Try to connect with a kwarg which is unsupported.
         templatized_function(data, is_training=True)
@@ -348,6 +450,7 @@ class TemplateTest(test.TestCase):
       templatized_function(data)
       self.assertTrue(templatized_function._variables_created)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_name_scopes_for_variable_scopes(self):
     # Test that name scopes are not unnecessarily uniquified (but are
     # still uniquified when necessary).
@@ -374,12 +477,13 @@ class TemplateTest(test.TestCase):
     outputs_b, _ = linear1(inputs)
     self.assertEquals("foo", linear1.variable_scope.name)
     self.assertEquals("foo/w:0", w1.name)
-    self.assertEquals("foo/add:0", outputs_a.name,
-                      "First application of template should get "
-                      "same name scope as variables.")
-    self.assertEquals("foo_1/add:0", outputs_b.name,
-                      "Second application of template should get "
-                      "a freshly uniquified name scope.")
+    if context.in_graph_mode():
+      self.assertEquals("foo/add:0", outputs_a.name,
+                        "First application of template should get "
+                        "same name scope as variables.")
+      self.assertEquals("foo_1/add:0", outputs_b.name,
+                        "Second application of template should get "
+                        "a freshly uniquified name scope.")
 
     linear2 = make_linear_module(output_size=2, name="foo")
     outputs_c, w2 = linear2(inputs)
@@ -388,24 +492,30 @@ class TemplateTest(test.TestCase):
                       "New template gets a freshly uniquified variable scope "
                       "because 'foo' is already taken.")
     self.assertEquals("foo_1/w:0", w2.name)
-    self.assertEquals("foo_1_1/add:0", outputs_c.name,
-                      "First application of template would get "
-                      "same name scope as variables, but 'foo_1' is already "
-                      "a name scope.")
-    self.assertEquals("foo_1_2/add:0", outputs_d.name,
-                      "Second application of template should also get "
-                      "a freshly uniquified name scope.")
-
+    if context.in_graph_mode():
+      self.assertEquals("foo_1_1/add:0", outputs_c.name,
+                        "First application of template would get "
+                        "same name scope as variables, but 'foo_1' is already "
+                        "a name scope.")
+      self.assertEquals("foo_1_2/add:0", outputs_d.name,
+                        "Second application of template should also get "
+                        "a freshly uniquified name scope.")
+
+  @test_util.run_in_graph_and_eager_modes()
   def test_global_variables(self):
     # Make sure global_variables are created.
     with variable_scope.variable_scope("foo"):
       # Create two templates with the same name, ensure scopes are made unique.
       ta = template.make_template("bar", variable_scoped_function, True)
-      tb = template.make_template("s", function_with_create, trainable=False)
+      if context.in_eager_mode():
+        tb = template.make_template("s", function_with_side_create,
+                                    trainable=False)
+      else:
+        tb = template.make_template("s", function_with_create, trainable=False)
 
     # Initially there are not variables created.
-    self.assertEqual([], ta.global_variables)
-    self.assertEqual([], tb.global_variables)
+    self.assertEqual([], list(ta.global_variables))
+    self.assertEqual([], list(tb.global_variables))
     # After calling there are variables created.
     ta()
     tb()
@@ -413,6 +523,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(ta.global_variables))
     self.assertEqual(2, len(tb.global_variables))
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_trainable_variables(self):
     # Make sure trainable_variables are created.
     with variable_scope.variable_scope("foo2"):
@@ -421,15 +532,46 @@ class TemplateTest(test.TestCase):
       tb = template.make_template("bar", variable_scoped_function, True)
 
     # Initially there are not variables created.
-    self.assertEqual([], ta.trainable_variables)
-    self.assertEqual([], tb.trainable_variables)
+    self.assertEqual([], list(ta.trainable_variables))
+    self.assertEqual([], list(tb.trainable_variables))
     # After calling there are variables created.
     ta()
     tb()
     # Ensure we can get the scopes before either template is actually called.
     self.assertEqual(1, len(ta.trainable_variables))
     self.assertEqual(1, len(tb.trainable_variables))
+    # None non-trainable variable was created.
+    self.assertEqual([], list(ta.non_trainable_variables))
+    self.assertEqual([], list(tb.non_trainable_variables))
+    # Ensure variables returns all the variables.
+    self.assertEqual(1, len(ta.variables))
+    self.assertEqual(1, len(tb.variables))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_non_trainable_variables(self):
+    # Make sure non_trainable_variables are created.
+    with variable_scope.variable_scope("foo2"):
+      ta = template.make_template("a", variable_scoped_function,
+                                  trainable=True)
+      tb = template.make_template("b", variable_scoped_function,
+                                  trainable=False)
+    # Initially there are not variables created.
+    self.assertEqual([], list(ta.variables))
+    self.assertEqual([], list(tb.variables))
+    # After calling there are variables created.
+    ta()
+    tb()
+    # Check the trainable and non_trainable variables.
+    self.assertEqual(1, len(ta.trainable_variables))
+    self.assertEqual([], list(ta.non_trainable_variables))
+
+    self.assertEqual([], list(tb.trainable_variables))
+    self.assertEqual(1, len(tb.non_trainable_variables))
+    # Ensure variables returns all the variables.
+    self.assertEqual(1, len(ta.variables))
+    self.assertEqual(1, len(tb.variables))
 
+  # TODO(apassos) handle local variables in Eager
   def test_local_variables(self):
     # Make sure trainable_variables are created.
     with variable_scope.variable_scope("foo3"):
@@ -439,8 +581,8 @@ class TemplateTest(test.TestCase):
                                   variable_scoped_function_with_local_variable)
 
     # Initially there are not variables created.
-    self.assertEqual([], ta.local_variables)
-    self.assertEqual([], tb.local_variables)
+    self.assertEqual([], list(ta.local_variables))
+    self.assertEqual([], list(tb.local_variables))
     # After calling there are variables created.
     ta()
     tb()
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index a1fc6d63d454c6130874da948a2c84a3e4384b20..aad2443eea7ad87faf481973e91ca3df32ccfb44 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -169,18 +169,38 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWriteConcat(dtypes.complex128)
     self._testTensorArrayWriteConcat(dtypes.string)
 
-  def _testTensorArrayPackNotAllValuesAvailableFails(self):
+  def _testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     with self.test_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
-          dtype=dtypes.float32, tensor_array_name="foo", size=3)
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=3,
+          element_shape=tensor_shape.TensorShape([1, 2]))
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(ta.read(0)))
+      self.assertAllEqual([[[0.0, 0.0]], [[4.0, 5.0]], [[0.0, 0.0]]],
+                          self.evaluate(ta.write(1, [[4.0, 5.0]]).stack()))
+      self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
+                          self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-      with self.assertRaisesOpError("Could not read from TensorArray index 1 "
-                                    "because it has not yet been written to."):
-        self.evaluate(ta.write(0, [[4.0, 5.0]]).stack())
+  @test_util.run_in_graph_and_eager_modes()
+  def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
+    self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
+
+  def _testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.float32,
+        tensor_array_name="foo",
+        size=3)
+    self.assertAllEqual(
+        [[0.0, 0.0]], self.evaluate(ta.write(1, [[4.0, 5.0]]).read(0)))
+    self.assertAllEqual([[[0.0, 0.0]], [[4.0, 5.0]], [[0.0, 0.0]]],
+                        self.evaluate(ta.write(1, [[4.0, 5.0]]).stack()))
+    self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
+                        self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
   @test_util.run_in_graph_and_eager_modes()
-  def testTensorArrayPackNotAllValuesAvailableFails(self):
-    self._testTensorArrayPackNotAllValuesAvailableFails()
+  def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
+    self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
   def _testTensorArrayUnpackRead(self, tf_dtype):
     with self.test_session(use_gpu=True):
@@ -423,12 +443,6 @@ class TensorArrayTest(test.TestCase):
             "TensorArray dtype is float but Op requested dtype double."):
           r0_bad.eval()
 
-      # Test reading from a different index than the one we wrote to
-      with self.assertRaisesOpError(
-          "Could not read from TensorArray index 1 because "
-          "it has not yet been written to."):
-        self.evaluate(w0.read(1))
-
       # Test reading from a negative index, which is not allowed
       if context.in_graph_mode():
         with self.assertRaisesOpError(
@@ -741,7 +755,8 @@ class TensorArrayTest(test.TestCase):
   def testTensorArrayGradientSplitConcat(self):
     with self.test_session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
-          dtype=dtypes.float32, tensor_array_name="foo", size=2)
+          dtype=dtypes.float32, tensor_array_name="foo", size=2,
+          infer_shape=False)
 
       value = constant_op.constant(
           [[1.0, -1.0], [10.0, -10.0], [100.0, -100.0]])
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index c551d9c3d056b50600d1331749ba865439748f7e..290200ce45488a9796f437d9f748e06483e83d96 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -53,11 +53,11 @@ class TransposeTest(test.TestCase):
       # Gradient check on CPU.
       xs = list(np.shape(x))
       ys = list(np.shape(tf_ans))
-      if x.dtype == np.float32:
+      if x.dtype in [np.float32, np.complex64]:
         jacob_t, jacob_n = gradient_checker.compute_gradient(inx, xs, y, ys, x,
                                                              1e-2)
         self.assertAllClose(jacob_t, jacob_n, 1e-3, 1e-3)
-      elif x.dtype == np.float64:
+      elif x.dtype in [np.float64, np.complex128]:
         jacob_t, jacob_n = gradient_checker.compute_gradient(inx, xs, y, ys, x,
                                                              1e-2)
         self.assertAllClose(jacob_t, jacob_n, 1e-6, 1e-6)
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index a50f53b3cd31c076e6a1e1798c301af6c15c3c03..6390b7c51808cf338f0651bbbdb30c7b71af7d8e 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.platform import test
 
 
@@ -61,6 +62,32 @@ class UniqueTest(test.TestCase):
     for i in range(len(x)):
       self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii'))
 
+  def testInt32Axis(self):
+    x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
+    with self.test_session() as sess:
+      y0, idx0 = gen_array_ops.unique_v2(x, axis=[0])
+      tf_y0, tf_idx0 = sess.run([y0, idx0])
+      y1, idx1 = gen_array_ops.unique_v2(x, axis=[1])
+      tf_y1, tf_idx1 = sess.run([y1, idx1])
+    self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
+    self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
+    self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
+    self.assertAllEqual(tf_idx1, np.array([0, 1, 1]))
+
+  def testInt32V2(self):
+    # This test is only temporary, once V2 is used
+    # by default, the axis will be wrapped to allow `axis=None`.
+    x = np.random.randint(2, high=10, size=7000)
+    with self.test_session() as sess:
+      y, idx = gen_array_ops.unique_v2(x, axis=[])
+      tf_y, tf_idx = sess.run([y, idx])
+
+    self.assertEqual(len(x), len(tf_idx))
+    self.assertEqual(len(tf_y), len(np.unique(x)))
+    for i in range(len(x)):
+      self.assertEqual(x[i], tf_y[tf_idx[i]])
+
+
 class UniqueWithCountsTest(test.TestCase):
 
   def testInt32(self):
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index bd4b12b7e8aee91eeabc677d9e1bfd33cde7911d..84911719239a48eca8649a86934ddb0c97080539 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -117,6 +117,20 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.dtype.base_dtype, dtypes.float16)
 
+  def testEagerVaribleStore(self):
+    with context.eager_mode():
+      store = variable_scope.EagerVariableStore()
+      with store.as_default():
+        v = variable_scope.get_variable("v", shape=(), trainable=True)
+        w = variable_scope.get_variable("w", shape=(), trainable=False)
+
+      self.assertTrue(v in store.variables())
+      self.assertTrue(w in store.variables())
+      self.assertTrue(v in store.trainable_variables())
+      self.assertFalse(w in store.trainable_variables())
+      self.assertFalse(v in store.non_trainable_variables())
+      self.assertTrue(w in store.non_trainable_variables())
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -887,35 +901,6 @@ def axis0_into3_partitioner(shape=None, **unused_kwargs):
 
 class VariableScopeWithPartitioningTest(test.TestCase):
 
-  def testInitFromNonInitializer(self):
-    with self.test_session() as sess:
-      # Test various dtypes with zeros initializer as following:
-      types = [
-          dtypes.int8, dtypes.uint8, dtypes.int16, dtypes.uint16, dtypes.int32,
-          dtypes.int64, dtypes.bool
-      ]
-
-      # Use different variable_name to distinguish various dtypes
-      for (i, dtype) in enumerate(types):
-        x = variable_scope.get_variable(
-            name="x%d" % i,
-            shape=(3, 4),
-            dtype=dtype,
-            partitioner=axis0_into2_partitioner)
-        y = variable_scope.get_variable(
-            name="y%d" % i,
-            shape=(6, 4),
-            dtype=dtype,
-            partitioner=axis0_into2_partitioner,
-            initializer=init_ops.zeros_initializer(dtype=dtype))
-
-        variables_lib.global_variables_initializer().run()
-        # x and y would become var list after partition
-        val_x = sess.run(list(x))
-        val_y = sess.run(list(y))
-
-        self.assertAllEqual(val_x, val_y)
-
   def testResultNameMatchesRequested(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index 4b3dadc1128629f83014f3725eb41708f0429e52..43be08f8a1436eebdd712a4bbb69ce8ae8d12827 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -181,6 +181,24 @@ class XentTest(test.TestCase):
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
+  def testGradientLabelWithV2(self):
+    with self.test_session():
+      l = constant_op.constant(
+          [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.5],
+          shape=[3, 4],
+          dtype=dtypes.float64,
+          name="l")
+      f = constant_op.constant(
+          [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
+          shape=[3, 4],
+          dtype=dtypes.float64,
+          name="f")
+      x = nn_ops.softmax_cross_entropy_with_logits_v2(labels=l, logits=f,
+                                                      name="xent")
+      err = gradient_checker.compute_gradient_error(l, [3, 4], x, [3])
+
+    self.assertLess(err, 5e-8)
+
   def testSecondGradient(self):
     with self.test_session() as sess:
       l = constant_op.constant([0.0, 0.0, 1.0/3, 0.0,
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 8c2ee1f103b66df3ad26897f59b2b51f8b0a6500..c083f8a5d2a3c7b5cfda1192eabfe42fa9d28102 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -30,6 +30,7 @@ from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import utils as layers_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
@@ -102,10 +103,16 @@ class Layer(object):
     self.built = False
     self.input_spec = None
 
+    if activity_regularizer and context.in_eager_mode():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
     self._activity_regularizer = activity_regularizer
     self._trainable_weights = []
     self._non_trainable_weights = []
     self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
     self._losses = []
     self._reuse = kwargs.get('_reuse')
     self._graph = ops.get_default_graph()
@@ -220,7 +227,7 @@ class Layer(object):
 
     Weight updates (for instance, the updates of the moving mean and variance
     in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing a same layer on
+    when calling a layer. Hence, when reusing the same layer on
     different inputs `a` and `b`, some entries in `layer.updates` may be
     dependent on `a` and some on `b`. This method automatically keeps track
     of dependencies.
@@ -250,7 +257,7 @@ class Layer(object):
     if inputs is not None:
       # We compute an ID that uniquely identifies the list of tensors.
       # This ID is order-sensitive.
-      inputs_hash = _object_list_uid(inputs)
+      inputs_hash = layers_util.object_list_uid(inputs)
     else:
       inputs_hash = None
     if inputs_hash not in self._per_input_updates:
@@ -279,29 +286,47 @@ class Layer(object):
     if not inputs:
       inputs = None
     if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
+      inputs_hash = layers_util.object_list_uid(inputs)
     else:
       inputs_hash = None
     return self._per_input_updates.get(inputs_hash, [])
 
   @property
   def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
     if context.in_eager_mode():
-      raise RuntimeError('Layer.losses not supported in Eager mode.')
-    return self._losses
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
 
   def add_loss(self, losses, inputs=None):
     """Add loss tensor(s), potentially dependent on layer inputs.
 
     Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer. Hence, when reusing a same layer
-    on different inputs `a` and `b`, some entries in `layer.losses` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
+    on the inputs passed when calling a layer. Hence, when reusing the same
+    layer on different inputs `a` and `b`, some entries in `layer.losses` may
+    be dependent on `a` and some on `b`. This method automatically keeps track
     of dependencies.
 
     The `get_losses_for` method allows to retrieve the losses relevant to a
     specific set of inputs.
 
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
     Arguments:
       losses: Loss tensor, or list/tuple of tensors.
       inputs: Optional input tensor(s) that the loss(es) depend on. Must
@@ -326,7 +351,7 @@ class Layer(object):
     if inputs is not None:
       # We compute an ID that uniquely identifies the list of tensors.
       # This ID is order-sensitive.
-      inputs_hash = _object_list_uid(inputs)
+      inputs_hash = layers_util.object_list_uid(inputs)
     else:
       inputs_hash = None
     if inputs_hash not in self._per_input_losses:
@@ -357,7 +382,7 @@ class Layer(object):
     if not inputs:
       inputs = None
     if inputs is not None:
-      inputs_hash = _object_list_uid(inputs)
+      inputs_hash = layers_util.object_list_uid(inputs)
     else:
       inputs_hash = None
     return self._per_input_losses.get(inputs_hash, [])
@@ -378,6 +403,10 @@ class Layer(object):
     """
     return inputs
 
+  def _name_scope_name(self, current_variable_scope):
+    """Determines op naming for the Layer."""
+    return current_variable_scope.original_name_scope
+
   def _compute_output_shape(self, input_shape):
     """Computes the output shape of the layer given the input shape.
 
@@ -401,10 +430,12 @@ class Layer(object):
     """
     return input_shape
 
-  def _make_unique_name(self, name_uid_map=None, avoid_names=None):
+  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
+                        namespace='', zero_based=False):
     base_name = _to_snake_case(self.__class__.__name__)
     name = _unique_layer_name(base_name, name_uid_map=name_uid_map,
-                              avoid_names=avoid_names)
+                              avoid_names=avoid_names, namespace=namespace,
+                              zero_based=zero_based)
     return (name, base_name)
 
   def _set_scope(self, scope=None):
@@ -434,6 +465,9 @@ class Layer(object):
       trainable: whether the variable should be part of the layer's
         "trainable_variables" (e.g. variables, biases)
         or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable.
       constraint: constraint instance (callable).
       partitioner: (optional) partitioner instance (callable).  If
         provided, when the requested variable is created it will be split
@@ -452,23 +486,15 @@ class Layer(object):
     Raises:
       RuntimeError: If called in Eager mode with regularizers.
     """
-    # Note that we currently don't support variable regularization in Eager
-    # mode. An alternative is for users to directly compute these losses before
-    # performing a backward pass.
     if context.in_graph_mode():
       existing_variables = set(tf_variables.global_variables())
-    else:
-      existing_variables = []
-      if regularizer is not None:
-        raise RuntimeError('Variable regularization not supported in Eager '
-                           'mode.')
     if dtype is None:
       dtype = self.dtype or dtypes.float32
 
     self._set_scope(None)
     with vs.variable_scope(
         self._scope, reuse=(self.built or self._reuse)) as scope:
-      with ops.name_scope(scope.original_name_scope):
+      with ops.name_scope(self._name_scope_name(scope)):
         variable = vs.get_variable(name,
                                    shape=shape,
                                    initializer=initializer,
@@ -476,24 +502,39 @@ class Layer(object):
                                    constraint=constraint,
                                    trainable=trainable and self.trainable,
                                    partitioner=partitioner)
-        if variable in existing_variables:
-          return variable
-        if regularizer:
-          # To match the behavior of tf.get_variable(), we only
-          # apply regularization if the variable is newly created.
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            for v in variable:
-              with ops.colocate_with(v.op):
+        if context.in_graph_mode():
+          if (trainable and self.trainable
+              and variable not in tf_variables.trainable_variables()):
+            # A custom getter / variable scope overrode the trainable flag.
+            trainable = False
+          if variable in existing_variables:
+            return variable
+          if regularizer:
+            # To match the behavior of tf.get_variable(), we only
+            # apply regularization if the variable is newly created.
+            if isinstance(variable, tf_variables.PartitionedVariable):
+              for v in variable:
+                with ops.colocate_with(v.op):
+                  with ops.name_scope(name + '/Regularizer'):
+                    regularization = regularizer(v)
+                if regularization is not None:
+                  self.add_loss(regularization)
+            else:
+              with ops.colocate_with(variable.op):
                 with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(v)
+                  regularization = regularizer(variable)
               if regularization is not None:
                 self.add_loss(regularization)
-          else:
-            with ops.colocate_with(variable.op):
-              with ops.name_scope(name + '/Regularizer'):
-                regularization = regularizer(variable)
-            if regularization is not None:
-              self.add_loss(regularization)
+        elif regularizer:
+          if isinstance(variable, tf_variables.PartitionedVariable):
+            raise RuntimeError(
+                'Partitioned variable regularization is not yet supported when '
+                'executing eagerly. File a feature request is this is '
+                'important to you.')
+          # Save a zero-argument lambda which runs the regularizer on the
+          # variable, to be executed when `Layer.losses` is requested. This
+          # makes losses responsive to variable updates when executing eagerly.
+          self._losses.append(lambda: regularizer(variable))
     if trainable:
       self._trainable_weights.append(variable)
     else:
@@ -567,7 +608,7 @@ class Layer(object):
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse)
     with scope_context_manager as scope:
-      with ops.name_scope(scope.original_name_scope):
+      with ops.name_scope(self._name_scope_name(scope)):
         if not self.built:
           if not in_graph_mode:
             # Activity regularization is currently unsupported in Eager mode.
@@ -634,7 +675,7 @@ class Layer(object):
             for output in output_list:
               with ops.name_scope('ActivityRegularizer'):
                 activity_regularization = self._activity_regularizer(output)
-              self.add_loss(activity_regularization)
+              self.add_loss(activity_regularization, inputs=inputs)
 
         if not in_deferred_mode:
           # TODO(fchollet): consider how masking will work with deferred mode.
@@ -1258,9 +1299,9 @@ class Node(object):
     # Following 2 properties: input and output shapes.
 
     # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [_static_shape(x) for x in input_tensors]
+    self.input_shapes = [layers_util.static_shape(x) for x in input_tensors]
     # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [_static_shape(x) for x in output_tensors]
+    self.output_shapes = [layers_util.static_shape(x) for x in output_tensors]
 
     # Optional keyword arguments to layer's `call`.
     self.arguments = arguments
@@ -1318,926 +1359,6 @@ class _DeferredTensor(object):
                                                          self.dtype.name)
 
 
-class InputLayer(Layer):
-  """Layer to be used as an entry point into a Network (a graph of layers).
-
-  It can either wrap an existing tensor (pass an `input_tensor` argument)
-  or create its a placeholder tensor (pass arguments `input_shape`
-  as well as `dtype`).
-
-  It is generally recommend to use the functional layer API via `Input`,
-  (which creates an `InputLayer`) without directly using `InputLayer`.
-
-  Arguments:
-      input_shape: Shape tuple (not including the batch axis), or `TensorShape`
-        instance (not including the batch axis).
-      batch_size: Optional input batch size (integer or None).
-      dtype: Datatype of the input.
-      input_tensor: Optional tensor to use as layer input
-          instead of creating a placeholder.
-      sparse: Boolean, whether the placeholder created
-          is meant to be sparse.
-      name: Name of the layer (string).
-
-    Raises:
-      RuntimeError: If created in Eager mode.
-  """
-
-  def __init__(self,
-               input_shape=None,
-               batch_size=None,
-               dtype=dtypes.float32,
-               input_tensor=None,
-               sparse=False,
-               name=None):
-    super(InputLayer, self).__init__(dtype=dtype, name=name)
-    self.built = True
-    self.sparse = sparse
-    self.batch_size = batch_size
-
-    if isinstance(input_shape, tensor_shape.TensorShape):
-      input_shape = tuple(input_shape.as_list())
-
-    if input_tensor is None:
-      if input_shape is not None:
-        batch_input_shape = (batch_size,) + tuple(input_shape)
-      else:
-        batch_input_shape = None
-
-      if context.in_eager_mode():
-        # In eager mode, create a temporary placeholder to call the layer on.
-        input_tensor = _DeferredTensor(
-            shape=batch_input_shape,
-            dtype=dtype,
-            name=self.name)
-      else:
-        # In graph mode, create a graph placeholder to call the layer on.
-        if sparse:
-          input_tensor = array_ops.sparse_placeholder(
-              shape=batch_input_shape,
-              dtype=dtype,
-              name=self.name)
-        else:
-          input_tensor = array_ops.placeholder(
-              shape=batch_input_shape,
-              dtype=dtype,
-              name=self.name)
-
-      # For compatibility with Keras API.
-      self.is_placeholder = True
-      self._batch_input_shape = batch_input_shape
-    else:
-      # For compatibility with Keras API.
-      self.is_placeholder = False
-      self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
-
-    # Create an input node to add to self.outbound_node
-    # and set output_tensors' _keras_history.
-    input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
-    Node(
-        self,
-        inbound_layers=[],
-        node_indices=[],
-        tensor_indices=[],
-        input_tensors=[input_tensor],
-        output_tensors=[input_tensor])
-
-
-def Input(  # pylint: disable=invalid-name
-    shape=None,
-    batch_size=None,
-    name=None,
-    dtype=dtypes.float32,
-    sparse=False,
-    tensor=None):
-  """`Input()` is used to instantiate an input tensor for use with a `Network`.
-
-  For instance, if a, b and c are tensors created via `Input`,
-  it becomes possible to do:
-
-  `network = Network(inputs=[a, b], outputs=c)`
-
-  Example:
-
-      ```python
-      # This is a logistic regression
-      x = tf.layers.Input(shape=(32,))
-      y = tf.layers.Dense(16, activation='softmax')(x)
-      network = tf.layers.Network(x, y)
-      ```
-
-  Arguments:
-      shape: A shape tuple (integer), not including the batch size.
-          For instance, `shape=(32,)` indicates that the expected input
-          will be batches of 32-dimensional vectors.
-      batch_size: Optional input batch size (integer or None).
-      name: An optional name string for the layer.
-          Should be unique in a model (do not reuse the same name twice).
-          It will be autogenerated if it isn't provided.
-      dtype: The data type expected by the input, as a string
-          (`float32`, `float64`, `int32`...)
-      sparse: A boolean specifying whether the placeholder
-          to be created is sparse.
-      tensor: Optional existing tensor to wrap into the `Input` layer.
-          If set, the layer will not create a placeholder tensor.
-
-  Returns:
-      A tensor: either a new placeholder (with history metadata) or
-      `tensor` (if passed), with added history metadata.
-
-  Raises:
-    RuntimeError: If called in Eager mode.
-  """
-  input_layer = InputLayer(
-      input_shape=shape,
-      batch_size=batch_size,
-      name=name,
-      dtype=dtype,
-      sparse=sparse,
-      input_tensor=tensor)
-  # Return tensor including `_keras_history` metadata.
-  # Note that in this case train_output and test_output are the same pointer.
-  outputs = input_layer._inbound_nodes[0].output_tensors  # pylint: disable=protected-access
-  if len(outputs) == 1:
-    return outputs[0]
-  else:
-    return outputs
-
-
-class Network(Layer):
-  """A Network is a directed acyclic graph of layers.
-
-  It is the topological form of a "model".
-  A Model is simply a Network with added training/evaluation routines.
-
-  A Network instance implements the full Layer API. In particular, a network
-  can be called on new inputs.
-
-  Example:
-
-      ```python
-      # This is a logistic regression
-      x = tf.layers.Input(shape=(32,))
-      y = tf.layers.Dense(16, activation='softmax')(x)
-      network = tf.layers.Network(x, y)
-
-      # It is then possible to call the network on compatible inputs:
-      z = tf.layers.Input(shape=(32,))
-      w = network(z)
-
-      # It is possible to retrieve the same properties as a layer:
-      weights = network.trainable_weights
-      ```
-
-  Arguments:
-      inputs: Input tensor or list of input tensors.
-        Must come from `tf.layers.Input`.
-      output: Output tensor or list of output tensors. Must come from
-        tf.layers Layers or Keras layers.
-      name: Optional name of the model (string).
-
-  Attributes:
-    Network has the same attributes as Layer. On top of it, it also has:
-      - layers: a list of the children layers of the network,
-        a list of layer instances, ordered from "earlier in the graph"
-        to "later in the graph".
-
-  Methods:
-    Network has the same methods as Layer. On top of it, it also has:
-      - get_layer: retrieves a child layer by name or index in the graph.
-
-  Raises:
-    RuntimeError: If created in Eager mode.
-  """
-
-  def __init__(self, inputs, outputs, name=None):  # pylint: disable=super-init-not-called
-    if context.in_eager_mode():
-      # TODO(fchollet): check that all inputs and outputs are DeferredTensors.
-      pass
-
-    self._init_set_name(name)
-    self._activity_regularizer = None
-    with vs.variable_scope(
-        None, default_name=self._base_name) as captured_scope:
-      self._scope = captured_scope
-    call_fn_args = estimator_util.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in call_fn_args or
-                                   hasattr(self, 'compute_mask'))
-    self._call_has_scope_arg = 'scope' in call_fn_args
-
-    # This acts just like the `trainable` attribute of any layer instance.
-    # It does not affect users of the underlying layers, only users of the
-    # Network instance.
-    self.trainable = True
-    # A Network does not create weights of its own, thus it is already built.
-    self.built = True
-    # A Network does not create weights of its own, thus has no dtype.
-    self._dtype = None
-    # The following are implemented as property functions:
-    # self.trainable_weights
-    # self.non_trainable_weights
-    # self.input_spec
-
-    # Private attributes to implement compatibility with Layer.
-    self._per_input_losses = {}
-    self._per_input_updates = {}
-    self._updates = []
-    self._losses = []
-    self._scope = None
-    self._reuse = None
-    self._graph = ops.get_default_graph()
-
-    # Network-specific properties.
-    if isinstance(inputs, (list, tuple)):
-      self.inputs = list(inputs)  # Tensor or list of tensors.
-    else:
-      self.inputs = [inputs]
-    if isinstance(outputs, (list, tuple)):
-      self.outputs = list(outputs)
-    else:
-      self.outputs = [outputs]
-    # All layers in order of horizontal graph traversal.
-    # Entries are unique. Includes input and output layers.
-    self.layers = []
-
-    # Check for redundancy in inputs.
-    if len(set(self.inputs)) != len(self.inputs):
-      raise ValueError('The list of inputs passed to the model '
-                       'is redundant. '
-                       'All inputs should only appear once.'
-                       ' Found: ' + str(self.inputs))
-
-    # # List of initial layers (1 to 1 mapping with self.inputs,
-    # # hence the same layer might appear twice)
-    # self._input_layers = []
-    # self._input_layers_node_indices = []
-    # self._input_layers_tensor_indices = []
-    # # list of layers (1 to 1 mapping with self.inputs,
-    # # hence the same layer might appear twice)
-    # self._output_layers = []
-    # self._output_layers_node_indices = []
-    # self._output_layers_tensor_indices = []
-
-    self._input_layers = []
-    self._output_layers = []
-    self._input_coordinates = []
-    self._output_coordinates = []
-
-    # This is for performance optimization
-    # when calling the Network on new inputs.
-    # every time the Network is called on a set on input tensors,
-    # we compute the output tensors,
-    # output masks and output shapes in one pass,
-    # then cache them here. When any of these outputs is queried later,
-    # we retrieve it from there instead of recomputing it.
-    self._output_mask_cache = {}
-    self._output_tensor_cache = {}
-    self._output_shape_cache = {}
-
-    # User-provided arguments validation.
-    for x in self.inputs:
-      # Check that x has appropriate `_keras_history` metadata.
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Input tensors to a ' + cls_name + ' ' +
-                         'must come from `tf.layers.Input`. '
-                         'Received: ' + str(x) +
-                         ' (missing previous layer metadata).')
-      # Check that x is an input tensor.
-      # pylint: disable=protected-access
-      layer, node_index, tensor_index = x._keras_history
-      if len(layer._inbound_nodes) > 1 or (
-          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
-        cls_name = self.__class__.__name__
-        logging.warning(cls_name + ' inputs must come from '
-                        '`tf.layers.Input` (thus holding past layer metadata), '
-                        'they cannot be the output of '
-                        'a previous non-Input layer. '
-                        'Here, a tensor specified as '
-                        'input to "' + self.name + '" was not an Input tensor, '
-                        'it was generated by layer ' + layer.name + '.\n'
-                        'Note that input tensors are '
-                        'instantiated via `tensor = tf.layers.Input(shape)`.\n'
-                        'The tensor that caused the issue was: ' + str(x.name))
-      # pylint: enable=protected-access
-    for x in self.outputs:
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Output tensors to a ' + cls_name + ' must be '
-                         'the output of a TensorFlow `Layer` '
-                         '(thus holding past layer metadata). Found: ' + str(x))
-
-    # Build self._output_layers:
-    for x in self.outputs:
-      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      self._output_layers.append(layer)
-      self._output_coordinates.append((layer, node_index, tensor_index))
-
-    # Build self._input_layers:
-    for x in self.inputs:
-      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      # It's supposed to be an input layer, so only one node
-      # and one tensor output.
-      assert node_index == 0
-      assert tensor_index == 0
-      self._input_layers.append(layer)
-      self._input_coordinates.append((layer, node_index, tensor_index))
-
-    # Network_nodes: set of nodes included in the graph
-    # (not all nodes included in the layers
-    # are relevant to the current graph).
-    network_nodes = set()  # ids of all nodes relevant to the Network
-    nodes_depths = {}  # dict {node: depth value}
-    layers_depths = {}  # dict {layer: depth value}
-    layer_indices = {}  # dict {layer: index in traversal}
-    nodes_in_decreasing_depth = []
-
-    def build_map_of_graph(tensor,
-                           finished_nodes,
-                           nodes_in_progress,
-                           layer,
-                           node_index,
-                           tensor_index):
-      """Builds a map of the graph of layers.
-
-      This recursively updates the map `layer_indices`,
-      the list `nodes_in_decreasing_depth` and the set `network_nodes`.
-
-      Arguments:
-          tensor: Some tensor in a graph.
-          finished_nodes: Set of nodes whose subgraphs have been traversed
-              completely. Useful to prevent duplicated work.
-          nodes_in_progress: Set of nodes that are currently active on the
-              recursion stack. Useful to detect cycles.
-          layer: Layer from which `tensor` comes from. If not provided,
-              will be obtained from `tensor._keras_history`.
-          node_index: Node index from which `tensor` comes from.
-          tensor_index: Tensor_index from which `tensor` comes from.
-
-      Raises:
-          ValueError: if a cycle is detected.
-      """
-      node = layer._inbound_nodes[node_index]  # pylint: disable=protected-access
-
-      # Prevent cycles.
-      if node in nodes_in_progress:
-        raise ValueError('The tensor ' + str(tensor) + ' at layer "' +
-                         layer.name + '" is part of a cycle.')
-
-      # Don't repeat work for shared subgraphs
-      if node in finished_nodes:
-        return
-
-      node_key = _make_node_key(layer.name, node_index)
-      # Update network_nodes.
-      network_nodes.add(node_key)
-
-      # Store the traversal order for layer sorting.
-      if layer not in layer_indices:
-        layer_indices[layer] = len(layer_indices)
-
-      nodes_in_progress.add(node)
-
-      # Propagate to all previous tensors connected to this node.
-      for i in range(len(node.inbound_layers)):
-        x = node.input_tensors[i]
-        layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        tensor_index = node.tensor_indices[i]
-        build_map_of_graph(x, finished_nodes, nodes_in_progress, layer,
-                           node_index, tensor_index)
-
-      finished_nodes.add(node)
-      nodes_in_progress.remove(node)
-      nodes_in_decreasing_depth.append(node)
-
-    finished_nodes = set()
-    nodes_in_progress = set()
-    for x in self.outputs:
-      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      build_map_of_graph(x, finished_nodes, nodes_in_progress,
-                         layer=layer,
-                         node_index=node_index,
-                         tensor_index=tensor_index)
-
-    for node in reversed(nodes_in_decreasing_depth):
-      # If the depth is not set, the node has no outbound nodes (depth 0).
-      depth = nodes_depths.setdefault(node, 0)
-
-      # Update the depth of the corresponding layer
-      previous_depth = layers_depths.get(node.outbound_layer, 0)
-      # If we've seen this layer before at a higher depth,
-      # we should use that depth instead of the node depth.
-      # This is necessary for shared layers that have inputs at different
-      # depth levels in the graph.
-      depth = max(depth, previous_depth)
-      layers_depths[node.outbound_layer] = depth
-      nodes_depths[node] = depth
-
-      # Update the depth of inbound nodes.
-      # The "depth" of a node is the max of the depths
-      # of all layers it is connected to.
-      for i in range(len(node.inbound_layers)):
-        inbound_layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        inbound_node = inbound_layer._inbound_nodes[node_index]  # pylint: disable=protected-access
-        previous_depth = nodes_depths.get(inbound_node, 0)
-        nodes_depths[inbound_node] = max(depth + 1, previous_depth)
-
-    # Build a dict {depth: list of nodes with this depth}
-    nodes_by_depth = {}
-    for node, depth in nodes_depths.items():
-      if depth not in nodes_by_depth:
-        nodes_by_depth[depth] = []
-      nodes_by_depth[depth].append(node)
-
-    # Build a dict {depth: list of layers with this depth}
-    layers_by_depth = {}
-    for layer, depth in layers_depths.items():
-      if depth not in layers_by_depth:
-        layers_by_depth[depth] = []
-      layers_by_depth[depth].append(layer)
-
-    # Get sorted list of layer depths.
-    depth_keys = list(layers_by_depth.keys())
-    depth_keys.sort(reverse=True)
-
-    # Set self.layers and self._layers_by_depth.
-    layers = []
-    for depth in depth_keys:
-      layers_for_depth = layers_by_depth[depth]
-      # Network.layers needs to have a deterministic order:
-      # here we order them by traversal order.
-      layers_for_depth.sort(key=lambda x: layer_indices[x])
-      layers.extend(layers_for_depth)
-    self.layers = layers
-    self._layers_by_depth = layers_by_depth
-
-    # Get sorted list of node depths.
-    depth_keys = list(nodes_by_depth.keys())
-    depth_keys.sort(reverse=True)
-
-    # Check that all tensors required are computable.
-    # computable_tensors: all tensors in the graph
-    # that can be computed from the inputs provided.
-    computable_tensors = []
-    for x in self.inputs:
-      computable_tensors.append(x)
-
-    layers_with_complete_input = []  # To provide a better error msg.
-    for depth in depth_keys:
-      for node in nodes_by_depth[depth]:
-        layer = node.outbound_layer
-        if layer:
-          for x in node.input_tensors:
-            if x not in computable_tensors:
-              raise ValueError('Graph disconnected: '
-                               'cannot obtain value for tensor ' + str(x) +
-                               ' at layer "' + layer.name + '". '
-                               'The following previous layers '
-                               'were accessed without issue: ' +
-                               str(layers_with_complete_input))
-          for x in node.output_tensors:
-            computable_tensors.append(x)
-          layers_with_complete_input.append(layer.name)
-
-    # Keep track of the network's nodes.
-    self._network_nodes = network_nodes
-    self._nodes_by_depth = nodes_by_depth
-
-    # Ensure name unicity, which will be crucial for serialization
-    # (since serialized nodes refer to layers by their name).
-    all_names = [layer.name for layer in self.layers]
-    for name in all_names:
-      if all_names.count(name) != 1:
-        raise ValueError('The name "' + name + '" is used ' +
-                         str(all_names.count(name)) + ' times in the model. '
-                         'All layer names should be unique.')
-
-    # Layer parameters.
-    # The new network starts with a single inbound node
-    # for its inputs, and no outbound nodes.
-    self._outbound_nodes = []  # Will be appended to by future calls to __call__
-    self._inbound_nodes = [
-    ]  # Will be appended to below, and by future calls to __call__
-    # Create the node linking internal inputs to internal outputs.
-    Node(
-        outbound_layer=self,
-        inbound_layers=[],
-        node_indices=[],
-        tensor_indices=[],
-        input_tensors=self.inputs,
-        output_tensors=self.outputs)
-
-  def get_layer(self, name=None, index=None):
-    """Retrieves a layer based on either its name (unique) or index.
-
-    Indices are based on order of horizontal graph traversal (bottom-up).
-
-    Arguments:
-        name: String, name of layer.
-        index: Integer, index of layer.
-
-    Returns:
-        A layer instance.
-
-    Raises:
-        ValueError: In case of invalid layer name or index.
-    """
-    # TODO(fchollet): We could build a dictionary based on layer names
-    # since they are constant, but we have not done that yet.
-    if index is not None:
-      if len(self.layers) <= index:
-        raise ValueError('Was asked to retrieve layer at index ' + str(index) +
-                         ' but model only has ' + str(len(self.layers)) +
-                         ' layers.')
-      else:
-        return self.layers[index]
-    else:
-      if not name:
-        raise ValueError('Provide either a layer name or layer index.')
-    for layer in self.layers:
-      if layer.name == name:
-        return layer
-    raise ValueError('No such layer: ' + name)
-
-  @property
-  def updates(self):
-    """Retrieve the network's updates.
-
-    Will only include updates that are either
-    unconditional, or conditional on inputs to this model
-    (e.g. will not include updates that depend on tensors
-    that aren't inputs to this model).
-
-    Returns:
-        A list of update ops.
-    """
-    updates = []
-    for layer in self.layers:
-      if hasattr(layer, 'updates'):
-        # Collect updates that are dependent on inputs
-        # that are part of the model.
-        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
-          node_key = _make_node_key(layer.name, node_index)
-          if node_key in self._network_nodes:
-            # The model owns this layer node.
-            inputs = node.input_tensors
-            updates += layer.get_updates_for(inputs)
-        # Collect unconditional updates.
-        updates += layer.get_updates_for(None)
-    return updates
-
-  @property
-  def losses(self):
-    """Retrieve the network's losses.
-
-    Will only include losses that are either
-    unconditional, or conditional on inputs to this model
-    (e.g. will not include losses that depend on tensors
-    that aren't inputs to this model).
-
-    Returns:
-        A list of loss tensors.
-    """
-    losses = []
-    # Retrieve losses for all internal layers.
-    for layer in self.layers:
-      if hasattr(layer, 'losses'):
-        # Collect losses that are dependent on inputs
-        # that are part of the model.
-        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
-          node_key = _make_node_key(layer.name, node_index)
-          if node_key in self._network_nodes:
-            # The model owns this layer node.
-            inputs = node.input_tensors
-            losses += layer.get_losses_for(inputs)
-        # Collect unconditional losses.
-        losses += layer.get_losses_for(None)
-    # Add any potential unconditional model-level loss.
-    losses += self.get_losses_for(None)
-    return losses
-
-  @property
-  def trainable_weights(self):
-    if not self.trainable:
-      return []
-    weights = []
-    for layer in self.layers:
-      weights += layer.trainable_weights
-    return weights
-
-  @property
-  def non_trainable_weights(self):
-    weights = []
-    for layer in self.layers:
-      weights += layer.non_trainable_weights
-    if not self.trainable:
-      trainable_weights = []
-      for layer in self.layers:
-        trainable_weights += layer.trainable_weights
-      return trainable_weights + weights
-    return weights
-
-  @property
-  def input_spec(self):
-    """Gets the network's input specs.
-
-    Returns:
-        A list of `InputSpec` instances (one per input to the model)
-            or a single instance if the model has only one input.
-    """
-    specs = []
-    for layer in self._input_layers:
-      if layer.input_spec is None:
-        specs.append(None)
-      else:
-        if not isinstance(layer.input_spec, list):
-          raise TypeError('Layer ' + layer.name +
-                          ' has an input_spec attribute that '
-                          'is not a list. We expect a list. '
-                          'Found input_spec = ' + str(layer.input_spec))
-        specs += layer.input_spec
-    if len(specs) == 1:
-      return specs[0]
-    return specs
-
-  def call(self, inputs, mask=None):
-    """Call the model on new inputs.
-
-    In this case `call` just reapplies
-    all ops in the graph to the new inputs
-    (e.g. build a new computational graph from the provided inputs).
-
-    Arguments:
-        inputs: A tensor or list of tensors.
-        mask: A mask or list of masks. A mask can be
-            either a tensor or None (no mask).
-
-    Returns:
-        A tensor if there is a single output, or
-        a list of tensors if there are more than one outputs.
-    """
-    inputs = nest.flatten(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = nest.flatten(mask)
-
-    if context.in_graph_mode():
-      # Try to retrieve cached outputs if the layer has already been called
-      # on these exact inputs.
-      cache_key = _object_list_uid(inputs) + '_' + _object_list_uid(masks)
-      if cache_key in self._output_tensor_cache:
-        # Cache hit.
-        return self._output_tensor_cache[cache_key]
-    # Actually apply the network graph to the new inputs.
-    outputs, _ = self._run_internal_graph(inputs, masks)
-    return outputs
-
-  def _compute_output_shape(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shapes = []
-      for shape in input_shape:
-        if shape is not None:
-          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
-        else:
-          input_shapes.append(None)
-    else:
-      if input_shape is not None:
-        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
-      else:
-        input_shapes = [None]
-
-    if len(input_shapes) != len(self._input_layers):
-      raise ValueError('Invalid input_shape argument ' + str(input_shape) +
-                       ': model has ' + str(len(self._input_layers)) +
-                       ' tensor inputs.')
-
-    cache_key = _object_list_uid(input_shapes)
-    if cache_key not in self._output_shape_cache:
-      # Cache miss. We have to run the network graph manually (recursive calls
-      # to `_compute_output_shape`).
-      layers_to_output_shapes = {}
-      for i in range(len(input_shapes)):
-        layer = self._input_layers[i]
-        input_shape = input_shapes[i]
-        # It's an input layer: then `_compute_output_shape` is identity,
-        # and there is only one node and one tensor output.
-        shape_key = layer.name + '_0_0'
-        layers_to_output_shapes[shape_key] = input_shape
-
-      depth_keys = list(self._nodes_by_depth.keys())
-      depth_keys.sort(reverse=True)
-      # Iterate over nodes, by depth level.
-      if len(depth_keys) > 1:
-        for depth in depth_keys:
-          nodes = self._nodes_by_depth[depth]
-          for node in nodes:
-            # This is always a single layer, never a list.
-            layer = node.outbound_layer
-            if layer in self._input_layers:
-              # We've already covered the input layers
-              # a few lines above.
-              continue
-            # Potentially redundant list,
-            # same size as node.input_tensors.
-            input_shapes = []
-            for j in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[j]
-              node_index = node.node_indices[j]
-              tensor_index = node.tensor_indices[j]
-              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
-                                                           tensor_index)
-              input_shape = layers_to_output_shapes[shape_key]
-              input_shapes.append(input_shape)
-
-            if len(input_shapes) == 1:
-              output_shape = layer._compute_output_shape(input_shapes[0])  # pylint: disable=protected-access
-            else:
-              output_shape = layer._compute_output_shape(input_shapes)  # pylint: disable=protected-access
-            if isinstance(output_shape, list):
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(shape).as_list())
-                  for shape in output_shape
-              ]
-            else:
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(output_shape).as_list())
-              ]
-
-            node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
-            for j in range(len(output_shapes)):
-              shape_key = layer.name + '_%s_%s' % (node_index, j)
-              layers_to_output_shapes[shape_key] = output_shapes[j]
-
-        # Read final output shapes from layers_to_output_shapes.
-        output_shapes = []
-        for i in range(len(self._output_layers)):
-          layer, node_index, tensor_index = self._output_coordinates[i]
-          shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
-          output_shapes.append(layers_to_output_shapes[shape_key])
-
-        # Store in cache.
-        self._output_shape_cache[cache_key] = output_shapes
-      else:
-        # Cache hit.
-        output_shapes = self._output_shape_cache[cache_key]
-
-      if isinstance(output_shapes, list):
-        if len(output_shapes) == 1:
-          return tensor_shape.TensorShape(output_shapes[0])
-        else:
-          return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-      else:
-        return tensor_shape.TensorShape(output_shapes)
-
-  def _run_internal_graph(self, inputs, masks=None):
-    """Computes output tensors for new inputs.
-
-    # Note:
-        - Expects `inputs` to be a list (potentially with 1 element).
-        - Can be run on non-Keras tensors.
-
-    Arguments:
-        inputs: List of tensors
-        masks: List of masks (tensors or None).
-
-    Returns:
-        Three lists: output_tensors, output_masks, output_shapes
-    """
-    # Note: masking support is relevant mainly for Keras.
-    # It cannot be factored out without having the fully reimplement the
-    # network calling logic on the Keras side. We choose to incorporate it
-    # in Network because 1) it may be useful to fully support in tf.layers in
-    # the future and 2) Keras is a major user of Network.
-    # If you don't use masking, it does not interfere with regular behavior
-    # at all and you can ignore it.
-    if masks is None:
-      masks = [None for _ in range(len(inputs))]
-
-    # Dictionary mapping reference tensors to tuples
-    # (computed tensor, compute mask)
-    # we assume a 1:1 mapping from tensor to mask
-    # TODO(fchollet): raise exception when a `.compute_mask()` call
-    # does not return a list the same size as `call`
-    tensor_map = {}
-    for x, y, mask in zip(self.inputs, inputs, masks):
-      tensor_map[str(id(x))] = (y, mask)
-
-    depth_keys = list(self._nodes_by_depth.keys())
-    depth_keys.sort(reverse=True)
-    for depth in depth_keys:
-      nodes = self._nodes_by_depth[depth]
-      for node in nodes:
-        # This is always a single layer, never a list.
-        layer = node.outbound_layer
-
-        reference_input_tensors = node.input_tensors
-        reference_output_tensors = node.output_tensors
-
-        # If all previous input tensors are available in tensor_map,
-        # then call node.inbound_layer on them.
-        computed_data = []  # List of tuples (input, mask).
-        for x in reference_input_tensors:
-          if str(id(x)) in tensor_map:
-            computed_data.append(tensor_map[str(id(x))])
-
-        if len(computed_data) == len(reference_input_tensors):
-          # Call layer (reapplying ops to new inputs).
-          with ops.name_scope(layer.name):
-            if node.arguments:
-              kwargs = node.arguments
-            else:
-              kwargs = {}
-            if len(computed_data) == 1:
-              computed_tensor, computed_mask = computed_data[0]
-              # Ensure mask propagation if applicable.
-              if 'mask' in estimator_util.fn_args(layer.call):
-                if 'mask' not in kwargs:
-                  kwargs['mask'] = computed_mask
-
-              output_tensors = nest.flatten(
-                  layer.call(computed_tensor, **kwargs))
-              if hasattr(layer, 'compute_mask'):
-                output_masks = nest.flatten(
-                    layer.compute_mask(computed_tensor, computed_mask))
-              else:
-                output_masks = [None for _ in range(len(output_tensors))]
-              computed_tensors = [computed_tensor]
-              computed_masks = [computed_mask]
-            else:
-              computed_tensors = [x[0] for x in computed_data]
-              computed_masks = [x[1] for x in computed_data]
-              if 'mask' in estimator_util.fn_args(layer.call):
-                if 'mask' not in kwargs:
-                  kwargs['mask'] = computed_masks
-              output_tensors = nest.flatten(
-                  layer.call(computed_tensors, **kwargs))
-              if hasattr(layer, 'compute_mask'):
-                output_masks = nest.flatten(
-                    layer.compute_mask(computed_tensors, computed_masks))
-              else:
-                output_masks = [None for _ in range(len(output_tensors))]
-
-            # Apply activity regularizer if any:
-            if layer.activity_regularizer is not None:
-              regularization_losses = [
-                  layer.activity_regularizer(x) for x in computed_tensors
-              ]
-              layer.add_loss(regularization_losses, computed_tensors)
-
-          if context.in_graph_mode():
-            # Update model updates and losses:
-            # Keep track of updates that depend on the inputs
-            # (e.g. BN updates).
-            self.add_update(layer.get_updates_for(computed_tensors), inputs)
-            # Keep track of unconditional updates (e.g. a counter).
-            self.add_update(layer.get_updates_for(None), None)
-            # Keep track of losses that depend on the inputs
-            # (e.g. activity regularizers).
-            self.add_loss(layer.get_losses_for(computed_tensors), inputs)
-            # Keep track of unconditional losses
-            # (e.g. weight regularizers).
-            self.add_loss(layer.get_losses_for(None), None)
-
-          # Update tensor_map.
-          for x, y, mask in zip(reference_output_tensors, output_tensors,
-                                output_masks):
-            tensor_map[str(id(x))] = (y, mask)
-
-    output_tensors = []
-    output_masks = []
-    output_shapes = []
-    for x in self.outputs:
-      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
-      tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(_static_shape(x))
-      output_tensors.append(tensor)
-      output_masks.append(mask)
-
-    if len(output_tensors) == 1:
-      output_tensors = output_tensors[0]
-      if output_shapes is not None:
-        output_shapes = output_shapes[0]
-      if output_masks is not None:
-        output_masks = output_masks[0]
-
-    if context.in_graph_mode():
-      # Update cache;
-      # keys are based on ids on input tensors and inputs masks.
-      cache_key = _object_list_uid(inputs) + '_' + _object_list_uid(masks)
-      self._output_tensor_cache[cache_key] = output_tensors
-      if output_masks is not None:
-        self._output_mask_cache[cache_key] = output_masks
-      if output_shapes is not None:
-        input_shapes = [_static_shape(x) for x in inputs]
-        cache_key = _object_list_uid(input_shapes)
-        self._output_shape_cache[cache_key] = output_shapes
-
-    return output_tensors, output_masks
-
-
 def _is_tensor_or_tensor_list(v):
   v = nest.flatten(v)
   if v and isinstance(v[0], ops.Tensor):
@@ -2288,24 +1409,6 @@ def _add_elements_to_collection(elements, collection_list):
         collection.append(element)
 
 
-def _object_list_uid(object_list):
-  object_list = nest.flatten(object_list)
-  return ', '.join([str(abs(id(x))) for x in object_list])
-
-
-def _make_node_key(layer_name, node_index):
-  return layer_name + '_ib-' + str(node_index)
-
-
-def _static_shape(x):
-  if x is None:
-    return None
-  try:
-    return tuple(x.get_shape().as_list())
-  except ValueError:
-    return None
-
-
 def _is_all_none(iterable_or_element):
   if not isinstance(iterable_or_element, (list, tuple)):
     iterable = [iterable_or_element]
@@ -2363,7 +1466,8 @@ def _get_default_graph_uid_map():
   return name_uid_map
 
 
-def _unique_layer_name(name, name_uid_map=None, avoid_names=None):
+def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                       zero_based=False):
   """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
 
   Arguments:
@@ -2372,6 +1476,11 @@ def _unique_layer_name(name, name_uid_map=None, avoid_names=None):
       names. If None (default), uses a per-Graph dictionary.
     avoid_names: An optional set or dict with names which should not be used. If
       None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
 
   Returns:
     Unique string name.
@@ -2389,6 +1498,15 @@ def _unique_layer_name(name, name_uid_map=None, avoid_names=None):
     avoid_names = set()
   proposed_name = None
   while proposed_name is None or proposed_name in avoid_names:
-    name_uid_map[name] += 1
-    proposed_name = name + '_' + str(name_uid_map[name])
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
   return proposed_name
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 71eff2f9657fde2855acfc602c54c6a38aedf5a3..3e5a51eb62378ba0ccbe241f440432685b87552e 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -20,8 +20,6 @@ from __future__ import print_function
 
 import copy
 
-import numpy as np
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -33,7 +31,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
@@ -47,7 +44,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(layer.trainable_variables, [])
     self.assertEqual(layer.non_trainable_variables, [])
     if context.in_graph_mode():
-      # updates, losses only suppported in GRAPH mode
+      # updates, losses only supported in GRAPH mode
       self.assertEqual(layer.updates, [])
       self.assertEqual(layer.losses, [])
     self.assertEqual(layer.built, False)
@@ -91,6 +88,11 @@ class BaseLayerTest(test.TestCase):
           regularizer=regularizer)
       self.assertEqual(len(layer.losses), 1)
 
+  def testNoEagerActivityRegularizer(self):
+    with context.eager_mode():
+      with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
+        core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
+
   def testGetVariable(self):
     with self.test_session():
 
@@ -431,115 +433,6 @@ class BaseLayerTest(test.TestCase):
       layer.apply(array_ops.placeholder('int32'))
       layer.apply(array_ops.placeholder('int32', shape=(2, 3)))
 
-  def test_get_updates_for(self):
-    a = base_layers.Input(shape=(2,))
-    dense_layer = core_layers.Dense(1)
-    dense_layer.add_update(0, inputs=a)
-    dense_layer.add_update(1, inputs=None)
-
-    self.assertEqual(dense_layer.get_updates_for(a), [0])
-    self.assertEqual(dense_layer.get_updates_for(None), [1])
-
-  def test_get_losses_for(self):
-    a = base_layers.Input(shape=(2,))
-    dense_layer = core_layers.Dense(1)
-    dense_layer.add_loss(0, inputs=a)
-    dense_layer.add_loss(1, inputs=None)
-
-    self.assertEqual(dense_layer.get_losses_for(a), [0])
-    self.assertEqual(dense_layer.get_losses_for(None), [1])
-
-  def testTopologicalAttributes(self):
-    # test layer attributes / methods related to cross-layer connectivity.
-    a = base_layers.Input(shape=(32,), name='input_a')
-    b = base_layers.Input(shape=(32,), name='input_b')
-
-    # test input, output, input_shape, output_shape
-    test_layer = core_layers.Dense(16, name='test_layer')
-    a_test = test_layer(a)
-    self.assertEqual(test_layer.input, a)
-    self.assertEqual(test_layer.output, a_test)
-    self.assertEqual(test_layer.input_shape, (None, 32))
-    self.assertEqual(test_layer.output_shape, (None, 16))
-
-    # test `get_*_at` methods
-    dense = core_layers.Dense(16, name='dense_1')
-    a_2 = dense(a)
-    b_2 = dense(b)
-
-    self.assertEqual(dense.get_input_at(0), a)
-    self.assertEqual(dense.get_input_at(1), b)
-    self.assertEqual(dense.get_output_at(0), a_2)
-    self.assertEqual(dense.get_output_at(1), b_2)
-    self.assertEqual(dense.get_input_shape_at(0), (None, 32))
-    self.assertEqual(dense.get_input_shape_at(1), (None, 32))
-    self.assertEqual(dense.get_output_shape_at(0), (None, 16))
-    self.assertEqual(dense.get_output_shape_at(1), (None, 16))
-
-    # Test invalid value for attribute retrieval.
-    with self.assertRaises(ValueError):
-      dense.get_input_at(2)
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.input
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.output
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.output_shape
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      _ = new_dense.input_shape
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      a = base_layers.Input(shape=(3, 32))
-      a = base_layers.Input(shape=(5, 32))
-      a_2 = dense(a)
-      b_2 = dense(b)
-      _ = new_dense.input_shape
-    with self.assertRaises(AttributeError):
-      new_dense = core_layers.Dense(16)
-      a = base_layers.Input(shape=(3, 32))
-      a = base_layers.Input(shape=(5, 32))
-      a_2 = dense(a)
-      b_2 = dense(b)
-      _ = new_dense.output_shape
-
-  def testTopologicalAttributesMultiOutputLayer(self):
-
-    class PowersLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        return [inputs**2, inputs**3]
-
-    x = base_layers.Input(shape=(32,))
-    test_layer = PowersLayer()
-    p1, p2 = test_layer(x)  # pylint: disable=not-callable
-
-    self.assertEqual(test_layer.input, x)
-    self.assertEqual(test_layer.output, [p1, p2])
-    self.assertEqual(test_layer.input_shape, (None, 32))
-    self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)])
-
-  def testTopologicalAttributesMultiInputLayer(self):
-
-    class AddLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        assert len(inputs) == 2
-        return inputs[0] + inputs[1]
-
-    a = base_layers.Input(shape=(32,))
-    b = base_layers.Input(shape=(32,))
-    test_layer = AddLayer()
-    y = test_layer([a, b])  # pylint: disable=not-callable
-
-    self.assertEqual(test_layer.input, [a, b])
-    self.assertEqual(test_layer.output, y)
-    self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)])
-    self.assertEqual(test_layer.output_shape, (None, 32))
-
   @test_util.run_in_graph_and_eager_modes()
   def test_count_params(self):
     dense = core_layers.Dense(16)
@@ -574,384 +467,13 @@ class BaseLayerTest(test.TestCase):
       self.assertEqual(3, result['label'].numpy())
       self.assertEqual(4.0, result['logits'].numpy())
 
+  def testActivityRegularizer(self):
+    regularizer = math_ops.reduce_sum
+    layer = base_layers.Layer(activity_regularizer=regularizer)
+    x = array_ops.placeholder('int32')
+    layer.apply(x)
+    self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-class NetworkTest(test.TestCase):
-
-  def testBasicNetwork(self):
-    # minimum viable network
-    x = base_layers.Input(shape=(32,))
-    dense = core_layers.Dense(2)
-    y = dense(x)
-    network = base_layers.Network(x, y, name='dense_network')
-
-    # test basic attributes
-    self.assertEqual(network.name, 'dense_network')
-    self.assertEqual(len(network.layers), 2)  # InputLayer + Dense
-    self.assertEqual(network.layers[1], dense)
-    self.assertEqual(network.weights, dense.weights)
-    self.assertEqual(network.trainable_weights, dense.trainable_weights)
-    self.assertEqual(network.non_trainable_weights, dense.non_trainable_weights)
-
-    # test callability on Input
-    x_2 = base_layers.Input(shape=(32,))
-    y_2 = network(x_2)
-    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
-
-    # test callability on regular tensor
-    x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    y_2 = network(x_2)
-    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
-
-    # test network `trainable` attribute
-    network.trainable = False
-    self.assertEqual(network.weights, dense.weights)
-    self.assertEqual(network.trainable_weights, [])
-    self.assertEqual(network.non_trainable_weights,
-                     dense.trainable_weights + dense.non_trainable_weights)
-
-  def test_node_construction(self):
-    # test graph topology construction basics
-    a = base_layers.Input(shape=(32,), name='input_a')
-    b = base_layers.Input(shape=(32,), name='input_b')
-
-    self.assertEqual(a.get_shape().as_list(), [None, 32])
-    a_layer, a_node_index, a_tensor_index = a._keras_history
-    b_layer, _, _ = b._keras_history
-    self.assertEqual(len(a_layer._inbound_nodes), 1)
-    self.assertEqual(a_tensor_index, 0)
-    node = a_layer._inbound_nodes[a_node_index]
-    self.assertEqual(node.outbound_layer, a_layer)
-
-    self.assertEqual(node.inbound_layers, [])
-    self.assertEqual(node.input_tensors, [a])
-    self.assertEqual(node.input_shapes, [(None, 32)])
-    self.assertEqual(node.output_tensors, [a])
-    self.assertEqual(node.output_shapes, [(None, 32)])
-
-    dense = core_layers.Dense(16, name='dense_1')
-    dense(a)
-    dense(b)
-
-    self.assertEqual(len(dense._inbound_nodes), 2)
-    self.assertEqual(len(dense._outbound_nodes), 0)
-    self.assertEqual(dense._inbound_nodes[0].inbound_layers, [a_layer])
-    self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense)
-    self.assertEqual(dense._inbound_nodes[1].inbound_layers, [b_layer])
-    self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense)
-    self.assertEqual(dense._inbound_nodes[0].input_tensors, [a])
-    self.assertEqual(dense._inbound_nodes[1].input_tensors, [b])
-
-    # Test config
-    config_0 = dense._inbound_nodes[0].get_config()
-    self.assertEqual(config_0['outbound_layer'], dense.name)
-
-  def testMultiInputNetwork(self):
-    a = base_layers.Input(shape=(32,), name='input_a')
-    b = base_layers.Input(shape=(32,), name='input_b')
-
-    class AddLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        assert len(inputs) == 2
-        return inputs[0] + inputs[1]
-
-    c = AddLayer()([a, b])  # pylint: disable=not-callable
-    network = base_layers.Network([a, b], c)
-    self.assertEqual(len(network.layers), 3)  # 2 * InputLayer + AddLayer
-
-    # Test callability.
-    a2 = base_layers.Input(shape=(32,))
-    b2 = base_layers.Input(shape=(32,))
-    c2 = network([a2, b2])
-    self.assertEqual(c2.get_shape().as_list(), [None, 32])
-
-  def testMultiOutputNetwork(self):
-    x = base_layers.Input(shape=(32,))
-    y1 = core_layers.Dense(2)(x)
-    y2 = core_layers.Dense(3)(x)
-    network = base_layers.Network(x, [y1, y2])
-
-    self.assertEqual(len(network.layers), 3)  # InputLayer + 2 * Dense
-
-    # Test callability.
-    x2 = base_layers.Input(shape=(32,))
-    outputs = network(x2)
-
-    self.assertEqual(type(outputs), list)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
-    self.assertEqual(outputs[1].get_shape().as_list(), [None, 3])
-
-  def testMultiInputMultiOutputNetworkSharedLayer(self):
-    a = base_layers.Input(shape=(32,), name='input_a')
-    b = base_layers.Input(shape=(32,), name='input_b')
-
-    dense = core_layers.Dense(2)
-
-    y1 = dense(a)
-    y2 = dense(b)
-    network = base_layers.Network([a, b], [y1, y2])
-    self.assertEqual(len(network.layers), 3)  # 2 * InputLayer + Dense
-
-    # Test callability.
-    a2 = base_layers.Input(shape=(32,))
-    b2 = base_layers.Input(shape=(32,))
-    outputs = network([a2, b2])
-
-    self.assertEqual(type(outputs), list)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
-    self.assertEqual(outputs[1].get_shape().as_list(), [None, 2])
-
-  def testCrossDataFlows(self):
-    # Test the ability to have multi-output layers with outputs that get routed
-    # to separate layers
-
-    class PowersLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        return [inputs**2, inputs**3]
-
-    x = base_layers.Input(shape=(32,))
-    p1, p2 = PowersLayer()(x)  # pylint: disable=not-callable
-    y1 = core_layers.Dense(2)(p1)
-    y2 = core_layers.Dense(3)(p2)
-    network = base_layers.Network(x, [y1, y2])
-
-    self.assertEqual(len(network.layers), 4)  # InputLayer + 2 * Dense + PLayer
-
-    # Test callability.
-    x2 = base_layers.Input(shape=(32,))
-    outputs = network(x2)
-
-    self.assertEqual(type(outputs), list)
-    self.assertEqual(len(outputs), 2)
-    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
-    self.assertEqual(outputs[1].get_shape().as_list(), [None, 3])
-
-  def testNetworkAttributes(self):
-    x = base_layers.Input(shape=(32,))
-    z = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))(x)
-    dense = core_layers.Dense(2, name='dense')
-    dense.add_update(1)
-    y = dense(z)
-    net = base_layers.Network(x, y)
-
-    # losses
-    self.assertEqual(len(net.losses), 1)
-
-    # updates
-    self.assertEqual(len(net.updates), 1)
-
-    # get_layer
-    self.assertEqual(net.get_layer('dense'), dense)
-    self.assertEqual(net.get_layer(index=2), dense)
-    with self.assertRaises(ValueError):
-      net.get_layer('dense_unknown')
-    with self.assertRaises(ValueError):
-      net.get_layer()
-    with self.assertRaises(ValueError):
-      net.get_layer(index=4)
-
-    # input, output
-    self.assertEqual(net.input, x)
-    self.assertEqual(net.output, y)
-
-    # input_shape, output_shape
-    self.assertEqual(net.input_shape, (None, 32))
-    self.assertEqual(net.output_shape, (None, 2))
-
-    # get_*_at
-    self.assertEqual(net.get_input_at(0), x)
-    self.assertEqual(net.get_output_at(0), y)
-
-    # _compute_output_shape
-    self.assertEqual(net._compute_output_shape((3, 32)).as_list(), [3, 2])
-
-  def testInvalidNetworks(self):
-    # redundant inputs
-    x = base_layers.Input(shape=(32,))
-    y = core_layers.Dense(2)(x)
-    with self.assertRaises(ValueError):
-      base_layers.Network([x, x], y)
-
-    # inputs that don't come from Input
-    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    y = core_layers.Dense(2)(x)
-    with self.assertRaises(ValueError):
-      base_layers.Network(x, y)
-
-    # inputs that don't come from Input but have a layer history
-    x = base_layers.Input(shape=(32,))
-    x = core_layers.Dense(32)(x)
-    y = core_layers.Dense(2)(x)
-    with self.assertRaises(ValueError):
-      base_layers.Network(x, y)
-
-    # outputs that don't come from layers
-    x = base_layers.Input(shape=(32,))
-    y = core_layers.Dense(2)(x)
-    y = 2 * y
-    with self.assertRaises(ValueError):
-      base_layers.Network(x, y)
-
-    # disconnected graphs
-    x1 = base_layers.Input(shape=(32,))
-    x2 = base_layers.Input(shape=(32,))
-    y = core_layers.Dense(2)(x1)
-    with self.assertRaises(ValueError):
-      base_layers.Network(x2, y)
-
-    # redundant layer names
-    x = base_layers.Input(shape=(32,))
-    z = core_layers.Dense(2, name='dense')(x)
-    y = core_layers.Dense(2, name='dense')(z)
-    with self.assertRaises(ValueError):
-      base_layers.Network(x, y)
-
-  def testInputTensorWrapping(self):
-    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    x = base_layers.Input(tensor=x)
-    y = core_layers.Dense(2)(x)
-    base_layers.Network(x, y)
-
-  def testExplicitBatchSize(self):
-    x = base_layers.Input(shape=(32,), batch_size=3)
-    y = core_layers.Dense(2)(x)
-    self.assertEqual(y.get_shape().as_list(), [3, 2])
-
-  def testNetworkRecursion(self):
-    # test the ability of networks to be used as layers inside networks.
-    a = base_layers.Input(shape=(32,))
-    b = core_layers.Dense(2)(a)
-    net = base_layers.Network(a, b)
-
-    c = base_layers.Input(shape=(32,))
-    d = net(c)
-
-    recursive_net = base_layers.Network(c, d)
-    self.assertEqual(len(recursive_net.layers), 2)
-    self.assertEqual(recursive_net.layers[1], net)
-    self.assertEqual(len(recursive_net.weights), 2)
-
-    # test callability
-    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
-    y = recursive_net(x)
-    self.assertEqual(y.get_shape().as_list(), [None, 2])
-
-  def testSparseInput(self):
-
-    class SparseSoftmax(base_layers.Layer):
-
-      def call(self, inputs):
-        return sparse_ops.sparse_softmax(inputs)
-
-    x = base_layers.Input(shape=(32,), sparse=True)
-    y = SparseSoftmax()(x)  # pylint: disable=not-callable
-    network = base_layers.Network(x, y)
-
-    self.assertEqual(len(network.layers), 2)
-    self.assertEqual(network.layers[0].sparse, True)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testMaskingSingleInput(self):
-
-    class MaskedLayer(base_layers.Layer):
-
-      def call(self, inputs, mask=None):
-        if mask is not None:
-          return inputs * mask
-        return inputs
-
-      def compute_mask(self, inputs, mask=None):
-        return array_ops.ones_like(inputs)
-
-    if context.in_graph_mode():
-      x = base_layers.Input(shape=(32,))
-      y = MaskedLayer()(x)  # pylint: disable=not-callable
-      network = base_layers.Network(x, y)
-
-      # test callability on Input
-      x_2 = base_layers.Input(shape=(32,))
-      y_2 = network(x_2)
-      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
-
-      # test callability on regular tensor
-      x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
-      y_2 = network(x_2)
-      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
-    else:
-      a = constant_op.constant([2] * 32)
-      mask = constant_op.constant([0, 1] * 16)
-      a._keras_mask = mask
-      b = MaskedLayer().apply(a)
-      self.assertTrue(hasattr(b, '_keras_mask'))
-      self.assertAllEqual(self.evaluate(array_ops.ones_like(mask)),
-                          self.evaluate(getattr(b, '_keras_mask')))
-      self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b))
-
-
-class DeferredModeTest(test.TestCase):
-
-  def testDeferredTensorAttributes(self):
-    x = base_layers._DeferredTensor(shape=(None, 2), dtype='float32', name='x')
-    self.assertEqual(str(x),
-                     'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
-    self.assertEqual(repr(x),
-                     '<_DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testSimpleNetworkBuilding(self):
-    inputs = base_layers.Input(shape=(32,))
-    if context.in_eager_mode():
-      self.assertIsInstance(inputs, base_layers._DeferredTensor)
-      self.assertEqual(inputs.dtype.name, 'float32')
-      self.assertEqual(inputs.shape.as_list(), [None, 32])
-
-    x = core_layers.Dense(2)(inputs)
-    if context.in_eager_mode():
-      self.assertIsInstance(x, base_layers._DeferredTensor)
-      self.assertEqual(x.dtype.name, 'float32')
-      self.assertEqual(x.shape.as_list(), [None, 2])
-
-    outputs = core_layers.Dense(4)(x)
-    network = base_layers.Network(inputs, outputs)
-    self.assertIsInstance(network, base_layers.Network)
-
-    if context.in_eager_mode():
-      # It should be possible to call such a network on EagerTensors.
-      inputs = constant_op.constant(
-          np.random.random((10, 32)).astype('float32'))
-      outputs = network(inputs)
-      self.assertEqual(outputs.shape.as_list(), [10, 4])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testMultiIONetworkbuilding(self):
-    input_a = base_layers.Input(shape=(32,))
-    input_b = base_layers.Input(shape=(16,))
-    a = core_layers.Dense(16)(input_a)
-
-    class AddLayer(base_layers.Layer):
-
-      def call(self, inputs):
-        return inputs[0] + inputs[1]
-
-      def _compute_output_shape(self, input_shape):
-        return input_shape[0]
-
-    c = AddLayer()([a, input_b])  # pylint: disable=not-callable
-    c = core_layers.Dense(2)(c)
-
-    network = base_layers.Network([input_a, input_b], [a, c])
-    if context.in_eager_mode():
-      a_val = constant_op.constant(
-          np.random.random((10, 32)).astype('float32'))
-      b_val = constant_op.constant(
-          np.random.random((10, 16)).astype('float32'))
-      outputs = network([a_val, b_val])
-      self.assertEqual(len(outputs), 2)
-      self.assertEqual(outputs[0].shape.as_list(), [10, 16])
-      self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 0c7ce0283544059aa0bab8f9d79512867ab531fb..fbb13bb72c435ad3675a8f3f31c568952c043743 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -813,6 +813,7 @@ def conv3d(inputs,
       bias_constraint=bias_constraint,
       trainable=trainable,
       name=name,
+      dtype=inputs.dtype.base_dtype,
       _reuse=reuse,
       _scope=name)
   return layer.apply(inputs)
@@ -919,6 +920,7 @@ class SeparableConv2D(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
+    self.data_format = data_format
     self.depth_multiplier = depth_multiplier
     self.depthwise_initializer = depthwise_initializer
     self.pointwise_initializer = pointwise_initializer
@@ -1230,9 +1232,8 @@ class Conv2DTranspose(Conv2D):
 
   def build(self, input_shape):
     if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank ' +
-                       str(4) +
-                       'Received input shape:', str(input_shape))
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
     if self.data_format == 'channels_first':
       channel_axis = 1
     else:
@@ -1746,6 +1747,7 @@ def conv3d_transpose(inputs,
       bias_constraint=bias_constraint,
       trainable=trainable,
       name=name,
+      dtype=inputs.dtype.base_dtype,
       _reuse=reuse,
       _scope=name)
   return layer.apply(inputs)
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index 76e8fbef2f4b187acbbf094f5a3b880341cbdd61..7be1fa5cfe95f13f67ee94bb20304fba00b33d1b 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -286,11 +286,19 @@ class Dropout(base.Layer):
     self.noise_shape = noise_shape
     self.seed = seed
 
-  def _get_noise_shape(self, _):
+  def _get_noise_shape(self, inputs):
     # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
     # which will override `self.noise_shape`, and allows for custom noise
     # shapes with dynamically sized inputs.
-    return self.noise_shape
+    if self.noise_shape is None:
+      return self.noise_shape
+
+    symbolic_shape = array_ops.shape(inputs)
+    noise_shape = [
+        symbolic_shape[axis] if shape is None else shape
+        for axis, shape in enumerate(self.noise_shape)
+    ]
+    return noise_shape
 
   def call(self, inputs, training=False):
 
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index b67df89f81fafb1d3df9b2caba15efa2b96d9e2f..2d47cc69798d8c3e34e14e24301e8be9a00f49bc 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -387,6 +387,16 @@ class DropoutTest(test.TestCase):
       self.assertAllClose(np.ones((5, 5)), np_output)
 
   @test_util.run_in_graph_and_eager_modes()
+  def testDynamicNoiseShape(self):
+    inputs = array_ops.ones((5, 3, 2))
+    noise_shape = [None, 1, None]
+    dp = core_layers.Dropout(0.5, noise_shape=noise_shape, seed=1)
+    dropped = dp.apply(inputs, training=True)
+    self.evaluate(variables.global_variables_initializer())
+    np_output = self.evaluate(dropped)
+    self.assertAlmostEqual(0., np_output.min())
+    self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :])
+
   def testCustomNoiseShape(self):
     inputs = array_ops.ones((5, 3, 2))
     noise_shape = [5, 1, 2]
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index d3f532e79c174ba77453639c51d667658cc0a2f7..0a52b1e8d9216a2535f5ae99751a4f9e9757031d 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -65,8 +65,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 # Base objects.
 from tensorflow.python.layers.base import Layer
-from tensorflow.python.layers.base import Input
 from tensorflow.python.layers.base import InputSpec
+from tensorflow.python.layers.network import Input
 
 # Core layers.
 from tensorflow.python.layers.core import Dense
diff --git a/tensorflow/python/layers/network.py b/tensorflow/python/layers/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..edc52545f92cb9b9c6f78f5c58fe44b3187d370b
--- /dev/null
+++ b/tensorflow/python/layers/network.py
@@ -0,0 +1,957 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Contains Network, a composition of layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.eager import context
+from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.layers import base
+from tensorflow.python.layers import utils as layers_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+
+class InputLayer(base.Layer):
+  """Layer to be used as an entry point into a Network (a graph of layers).
+
+  It can either wrap an existing tensor (pass an `input_tensor` argument)
+  or create its a placeholder tensor (pass arguments `input_shape`
+  as well as `dtype`).
+
+  It is generally recommend to use the functional layer API via `Input`,
+  (which creates an `InputLayer`) without directly using `InputLayer`.
+
+  Arguments:
+      input_shape: Shape tuple (not including the batch axis), or `TensorShape`
+        instance (not including the batch axis).
+      batch_size: Optional input batch size (integer or None).
+      dtype: Datatype of the input.
+      input_tensor: Optional tensor to use as layer input
+          instead of creating a placeholder.
+      sparse: Boolean, whether the placeholder created
+          is meant to be sparse.
+      name: Name of the layer (string).
+
+    Raises:
+      RuntimeError: If created in Eager mode.
+  """
+
+  def __init__(self,
+               input_shape=None,
+               batch_size=None,
+               dtype=dtypes.float32,
+               input_tensor=None,
+               sparse=False,
+               name=None):
+    super(InputLayer, self).__init__(dtype=dtype, name=name)
+    self.built = True
+    self.sparse = sparse
+    self.batch_size = batch_size
+
+    if isinstance(input_shape, tensor_shape.TensorShape):
+      input_shape = tuple(input_shape.as_list())
+
+    if input_tensor is None:
+      if input_shape is not None:
+        batch_input_shape = (batch_size,) + tuple(input_shape)
+      else:
+        batch_input_shape = None
+
+      if context.in_eager_mode():
+        # In eager mode, create a temporary placeholder to call the layer on.
+        input_tensor = base._DeferredTensor(  # pylint: disable=protected-access
+            shape=batch_input_shape,
+            dtype=dtype,
+            name=self.name)
+      else:
+        # In graph mode, create a graph placeholder to call the layer on.
+        if sparse:
+          input_tensor = array_ops.sparse_placeholder(
+              shape=batch_input_shape,
+              dtype=dtype,
+              name=self.name)
+        else:
+          input_tensor = array_ops.placeholder(
+              shape=batch_input_shape,
+              dtype=dtype,
+              name=self.name)
+
+      # For compatibility with Keras API.
+      self.is_placeholder = True
+      self._batch_input_shape = batch_input_shape
+    else:
+      # For compatibility with Keras API.
+      self.is_placeholder = False
+      self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
+
+    # Create an input node to add to self.outbound_node
+    # and set output_tensors' _keras_history.
+    input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
+    base.Node(
+        self,
+        inbound_layers=[],
+        node_indices=[],
+        tensor_indices=[],
+        input_tensors=[input_tensor],
+        output_tensors=[input_tensor])
+
+
+def Input(  # pylint: disable=invalid-name
+    shape=None,
+    batch_size=None,
+    name=None,
+    dtype=dtypes.float32,
+    sparse=False,
+    tensor=None):
+  """`Input()` is used to instantiate an input tensor for use with a `Network`.
+
+  For instance, if a, b and c are tensors created via `Input`,
+  it becomes possible to do:
+
+  `network = Network(inputs=[a, b], outputs=c)`
+
+  Example:
+
+      ```python
+      # This is a logistic regression
+      x = tf.layers.Input(shape=(32,))
+      y = tf.layers.Dense(16, activation='softmax')(x)
+      network = tf.layers.Network(x, y)
+      ```
+
+  Arguments:
+      shape: A shape tuple (integer), not including the batch size.
+          For instance, `shape=(32,)` indicates that the expected input
+          will be batches of 32-dimensional vectors.
+      batch_size: Optional input batch size (integer or None).
+      name: An optional name string for the layer.
+          Should be unique in a model (do not reuse the same name twice).
+          It will be autogenerated if it isn't provided.
+      dtype: The data type expected by the input, as a string
+          (`float32`, `float64`, `int32`...)
+      sparse: A boolean specifying whether the placeholder
+          to be created is sparse.
+      tensor: Optional existing tensor to wrap into the `Input` layer.
+          If set, the layer will not create a placeholder tensor.
+
+  Returns:
+      A tensor: either a new placeholder (with history metadata) or
+      `tensor` (if passed), with added history metadata.
+
+  Raises:
+    RuntimeError: If called in Eager mode.
+  """
+  input_layer = InputLayer(
+      input_shape=shape,
+      batch_size=batch_size,
+      name=name,
+      dtype=dtype,
+      sparse=sparse,
+      input_tensor=tensor)
+  # Return tensor including `_keras_history` metadata.
+  # Note that in this case train_output and test_output are the same pointer.
+  outputs = input_layer._inbound_nodes[0].output_tensors  # pylint: disable=protected-access
+  if len(outputs) == 1:
+    return outputs[0]
+  else:
+    return outputs
+
+
+class GraphNetwork(base.Layer):
+  """A GraphNetwork is a directed acyclic graph of layers.
+
+  It is the topological form of a `tf.keras.models.Model`. A `Model` is simply a
+  `GraphNetwork` with added training/evaluation routines.
+
+  A `GraphNetwork` instance implements the full `Layer` API. In particular, a
+  `GraphNetwork` can be called on new inputs.
+
+  Example:
+
+      ```python
+      # This is a logistic regression
+      x = tf.layers.Input(shape=(32,))
+      y = tf.layers.Dense(16, activation='softmax')(x)
+      network = tf.layers.GraphNetwork(x, y)
+
+      # It is then possible to call the network on compatible inputs:
+      z = tf.layers.Input(shape=(32,))
+      w = network(z)
+
+      # It is possible to retrieve the same properties as a layer:
+      weights = network.trainable_weights
+      ```
+
+  Arguments:
+      inputs: Input tensor or list of input tensors.
+        Must come from `tf.layers.Input`.
+      output: Output tensor or list of output tensors. Must come from
+        tf.layers Layers or Keras layers.
+      name: Optional name of the model (string).
+
+  Attributes:
+    GraphNetwork has the same attributes as Layer. On top of it, it also has:
+      - layers: a list of the children layers of the network,
+        a list of layer instances, ordered from "earlier in the graph"
+        to "later in the graph".
+
+  Methods:
+    GraphNetwork has the same methods as Layer. On top of it, it also has:
+      - get_layer: retrieves a child layer by name or index in the graph.
+
+  Raises:
+    RuntimeError: If created in Eager mode.
+  """
+
+  def __init__(self, inputs, outputs, name=None):  # pylint: disable=super-init-not-called
+    if context.in_eager_mode():
+      # TODO(fchollet): check that all inputs and outputs are DeferredTensors.
+      pass
+
+    self._init_set_name(name)
+    self._activity_regularizer = None
+    with vs.variable_scope(
+        None, default_name=self._base_name) as captured_scope:
+      self._scope = captured_scope
+    call_fn_args = estimator_util.fn_args(self.call)
+    self._compute_previous_mask = ('mask' in call_fn_args or
+                                   hasattr(self, 'compute_mask'))
+    self._call_has_scope_arg = 'scope' in call_fn_args
+
+    # This acts just like the `trainable` attribute of any layer instance.
+    # It does not affect users of the underlying layers, only users of the
+    # GraphNetwork instance.
+    self.trainable = True
+    # A GraphNetwork does not create weights of its own, thus it is already
+    # built.
+    self.built = True
+    # A GraphNetwork does not create weights of its own, thus has no dtype.
+    self._dtype = None
+    # The following are implemented as property functions:
+    # self.trainable_weights
+    # self.non_trainable_weights
+    # self.input_spec
+
+    # Private attributes to implement compatibility with Layer.
+    self._per_input_losses = {}
+    self._per_input_updates = {}
+    self._updates = []
+    self._losses = []
+    self._scope = None
+    self._reuse = None
+    self._graph = ops.get_default_graph()
+
+    # GraphNetwork-specific properties.
+    if isinstance(inputs, (list, tuple)):
+      self.inputs = list(inputs)  # Tensor or list of tensors.
+    else:
+      self.inputs = [inputs]
+    if isinstance(outputs, (list, tuple)):
+      self.outputs = list(outputs)
+    else:
+      self.outputs = [outputs]
+    # All layers in order of horizontal graph traversal.
+    # Entries are unique. Includes input and output layers.
+    self.layers = []
+
+    # Check for redundancy in inputs.
+    if len(set(self.inputs)) != len(self.inputs):
+      raise ValueError('The list of inputs passed to the model '
+                       'is redundant. '
+                       'All inputs should only appear once.'
+                       ' Found: ' + str(self.inputs))
+
+    # # List of initial layers (1 to 1 mapping with self.inputs,
+    # # hence the same layer might appear twice)
+    # self._input_layers = []
+    # self._input_layers_node_indices = []
+    # self._input_layers_tensor_indices = []
+    # # list of layers (1 to 1 mapping with self.inputs,
+    # # hence the same layer might appear twice)
+    # self._output_layers = []
+    # self._output_layers_node_indices = []
+    # self._output_layers_tensor_indices = []
+
+    self._input_layers = []
+    self._output_layers = []
+    self._input_coordinates = []
+    self._output_coordinates = []
+
+    # This is for performance optimization when calling the GraphNetwork on new
+    # inputs. Every time the GraphNetwork is called on a set on input tensors,
+    # we compute the output tensors, output masks and output shapes in one pass,
+    # then cache them here. When any of these outputs is queried later, we
+    # retrieve it from there instead of recomputing it.
+    self._output_mask_cache = {}
+    self._output_tensor_cache = {}
+    self._output_shape_cache = {}
+
+    # User-provided arguments validation.
+    for x in self.inputs:
+      # Check that x has appropriate `_keras_history` metadata.
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Input tensors to a ' + cls_name + ' ' +
+                         'must come from `tf.layers.Input`. '
+                         'Received: ' + str(x) +
+                         ' (missing previous layer metadata).')
+      # Check that x is an input tensor.
+      # pylint: disable=protected-access
+      layer, node_index, tensor_index = x._keras_history
+      if len(layer._inbound_nodes) > 1 or (
+          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
+        cls_name = self.__class__.__name__
+        logging.warning(cls_name + ' inputs must come from '
+                        '`tf.layers.Input` (thus holding past layer metadata), '
+                        'they cannot be the output of '
+                        'a previous non-Input layer. '
+                        'Here, a tensor specified as '
+                        'input to "' + self.name + '" was not an Input tensor, '
+                        'it was generated by layer ' + layer.name + '.\n'
+                        'Note that input tensors are '
+                        'instantiated via `tensor = tf.layers.Input(shape)`.\n'
+                        'The tensor that caused the issue was: ' + str(x.name))
+      # pylint: enable=protected-access
+    for x in self.outputs:
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Output tensors to a ' + cls_name + ' must be '
+                         'the output of a TensorFlow `Layer` '
+                         '(thus holding past layer metadata). Found: ' + str(x))
+
+    # Build self._output_layers:
+    for x in self.outputs:
+      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      self._output_layers.append(layer)
+      self._output_coordinates.append((layer, node_index, tensor_index))
+
+    # Build self._input_layers:
+    for x in self.inputs:
+      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      # It's supposed to be an input layer, so only one node
+      # and one tensor output.
+      assert node_index == 0
+      assert tensor_index == 0
+      self._input_layers.append(layer)
+      self._input_coordinates.append((layer, node_index, tensor_index))
+
+    # Network_nodes: set of nodes included in the graph
+    # (not all nodes included in the layers
+    # are relevant to the current graph).
+    network_nodes = set()  # ids of all nodes relevant to the GraphNetwork
+    nodes_depths = {}  # dict {node: depth value}
+    layers_depths = {}  # dict {layer: depth value}
+    layer_indices = {}  # dict {layer: index in traversal}
+    nodes_in_decreasing_depth = []
+
+    def build_map_of_graph(tensor,
+                           finished_nodes,
+                           nodes_in_progress,
+                           layer,
+                           node_index,
+                           tensor_index):
+      """Builds a map of the graph of layers.
+
+      This recursively updates the map `layer_indices`,
+      the list `nodes_in_decreasing_depth` and the set `network_nodes`.
+
+      Arguments:
+          tensor: Some tensor in a graph.
+          finished_nodes: Set of nodes whose subgraphs have been traversed
+              completely. Useful to prevent duplicated work.
+          nodes_in_progress: Set of nodes that are currently active on the
+              recursion stack. Useful to detect cycles.
+          layer: Layer from which `tensor` comes from. If not provided,
+              will be obtained from `tensor._keras_history`.
+          node_index: Node index from which `tensor` comes from.
+          tensor_index: Tensor_index from which `tensor` comes from.
+
+      Raises:
+          ValueError: if a cycle is detected.
+      """
+      node = layer._inbound_nodes[node_index]  # pylint: disable=protected-access
+
+      # Prevent cycles.
+      if node in nodes_in_progress:
+        raise ValueError('The tensor ' + str(tensor) + ' at layer "' +
+                         layer.name + '" is part of a cycle.')
+
+      # Don't repeat work for shared subgraphs
+      if node in finished_nodes:
+        return
+
+      node_key = _make_node_key(layer.name, node_index)
+      # Update network_nodes.
+      network_nodes.add(node_key)
+
+      # Store the traversal order for layer sorting.
+      if layer not in layer_indices:
+        layer_indices[layer] = len(layer_indices)
+
+      nodes_in_progress.add(node)
+
+      # Propagate to all previous tensors connected to this node.
+      for i in range(len(node.inbound_layers)):
+        x = node.input_tensors[i]
+        layer = node.inbound_layers[i]
+        node_index = node.node_indices[i]
+        tensor_index = node.tensor_indices[i]
+        build_map_of_graph(x, finished_nodes, nodes_in_progress, layer,
+                           node_index, tensor_index)
+
+      finished_nodes.add(node)
+      nodes_in_progress.remove(node)
+      nodes_in_decreasing_depth.append(node)
+
+    finished_nodes = set()
+    nodes_in_progress = set()
+    for x in self.outputs:
+      layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      build_map_of_graph(x, finished_nodes, nodes_in_progress,
+                         layer=layer,
+                         node_index=node_index,
+                         tensor_index=tensor_index)
+
+    for node in reversed(nodes_in_decreasing_depth):
+      # If the depth is not set, the node has no outbound nodes (depth 0).
+      depth = nodes_depths.setdefault(node, 0)
+
+      # Update the depth of the corresponding layer
+      previous_depth = layers_depths.get(node.outbound_layer, 0)
+      # If we've seen this layer before at a higher depth,
+      # we should use that depth instead of the node depth.
+      # This is necessary for shared layers that have inputs at different
+      # depth levels in the graph.
+      depth = max(depth, previous_depth)
+      layers_depths[node.outbound_layer] = depth
+      nodes_depths[node] = depth
+
+      # Update the depth of inbound nodes.
+      # The "depth" of a node is the max of the depths
+      # of all layers it is connected to.
+      for i in range(len(node.inbound_layers)):
+        inbound_layer = node.inbound_layers[i]
+        node_index = node.node_indices[i]
+        inbound_node = inbound_layer._inbound_nodes[node_index]  # pylint: disable=protected-access
+        previous_depth = nodes_depths.get(inbound_node, 0)
+        nodes_depths[inbound_node] = max(depth + 1, previous_depth)
+
+    # Build a dict {depth: list of nodes with this depth}
+    nodes_by_depth = {}
+    for node, depth in nodes_depths.items():
+      if depth not in nodes_by_depth:
+        nodes_by_depth[depth] = []
+      nodes_by_depth[depth].append(node)
+
+    # Build a dict {depth: list of layers with this depth}
+    layers_by_depth = {}
+    for layer, depth in layers_depths.items():
+      if depth not in layers_by_depth:
+        layers_by_depth[depth] = []
+      layers_by_depth[depth].append(layer)
+
+    # Get sorted list of layer depths.
+    depth_keys = list(layers_by_depth.keys())
+    depth_keys.sort(reverse=True)
+
+    # Set self.layers and self._layers_by_depth.
+    layers = []
+    for depth in depth_keys:
+      layers_for_depth = layers_by_depth[depth]
+      # GraphNetwork.layers needs to have a deterministic order:
+      # here we order them by traversal order.
+      layers_for_depth.sort(key=lambda x: layer_indices[x])
+      layers.extend(layers_for_depth)
+    self.layers = layers
+    self._layers_by_depth = layers_by_depth
+
+    # Get sorted list of node depths.
+    depth_keys = list(nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+
+    # Check that all tensors required are computable.
+    # computable_tensors: all tensors in the graph
+    # that can be computed from the inputs provided.
+    computable_tensors = []
+    for x in self.inputs:
+      computable_tensors.append(x)
+
+    layers_with_complete_input = []  # To provide a better error msg.
+    for depth in depth_keys:
+      for node in nodes_by_depth[depth]:
+        layer = node.outbound_layer
+        if layer:
+          for x in node.input_tensors:
+            if x not in computable_tensors:
+              raise ValueError('Graph disconnected: '
+                               'cannot obtain value for tensor ' + str(x) +
+                               ' at layer "' + layer.name + '". '
+                               'The following previous layers '
+                               'were accessed without issue: ' +
+                               str(layers_with_complete_input))
+          for x in node.output_tensors:
+            computable_tensors.append(x)
+          layers_with_complete_input.append(layer.name)
+
+    # Keep track of the network's nodes.
+    self._network_nodes = network_nodes
+    self._nodes_by_depth = nodes_by_depth
+
+    # Ensure name unicity, which will be crucial for serialization
+    # (since serialized nodes refer to layers by their name).
+    all_names = [layer.name for layer in self.layers]
+    for name in all_names:
+      if all_names.count(name) != 1:
+        raise ValueError('The name "' + name + '" is used ' +
+                         str(all_names.count(name)) + ' times in the model. '
+                         'All layer names should be unique.')
+
+    # Layer parameters.
+    # The new network starts with a single inbound node
+    # for its inputs, and no outbound nodes.
+    self._outbound_nodes = []  # Will be appended to by future calls to __call__
+    self._inbound_nodes = [
+    ]  # Will be appended to below, and by future calls to __call__
+    # Create the node linking internal inputs to internal outputs.
+    base.Node(
+        outbound_layer=self,
+        inbound_layers=[],
+        node_indices=[],
+        tensor_indices=[],
+        input_tensors=self.inputs,
+        output_tensors=self.outputs)
+
+  def get_layer(self, name=None, index=None):
+    """Retrieves a layer based on either its name (unique) or index.
+
+    Indices are based on order of horizontal graph traversal (bottom-up).
+
+    Arguments:
+        name: String, name of layer.
+        index: Integer, index of layer.
+
+    Returns:
+        A layer instance.
+
+    Raises:
+        ValueError: In case of invalid layer name or index.
+    """
+    # TODO(fchollet): We could build a dictionary based on layer names
+    # since they are constant, but we have not done that yet.
+    if index is not None:
+      if len(self.layers) <= index:
+        raise ValueError('Was asked to retrieve layer at index ' + str(index) +
+                         ' but model only has ' + str(len(self.layers)) +
+                         ' layers.')
+      else:
+        return self.layers[index]
+    else:
+      if not name:
+        raise ValueError('Provide either a layer name or layer index.')
+    for layer in self.layers:
+      if layer.name == name:
+        return layer
+    raise ValueError('No such layer: ' + name)
+
+  @property
+  def updates(self):
+    """Retrieve the network's updates.
+
+    Will only include updates that are either
+    unconditional, or conditional on inputs to this model
+    (e.g. will not include updates that depend on tensors
+    that aren't inputs to this model).
+
+    Returns:
+        A list of update ops.
+    """
+    updates = []
+    for layer in self.layers:
+      if hasattr(layer, 'updates'):
+        # Collect updates that are dependent on inputs
+        # that are part of the model.
+        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
+          node_key = _make_node_key(layer.name, node_index)
+          if node_key in self._network_nodes:
+            # The model owns this layer node.
+            inputs = node.input_tensors
+            updates += layer.get_updates_for(inputs)
+        # Collect unconditional updates.
+        updates += layer.get_updates_for(None)
+    return updates
+
+  @property
+  def losses(self):
+    """Retrieve the network's losses.
+
+    Will only include losses that are either
+    unconditional, or conditional on inputs to this model
+    (e.g. will not include losses that depend on tensors
+    that aren't inputs to this model).
+
+    Returns:
+        A list of loss tensors.
+    """
+    losses = []
+    # Retrieve losses for all internal layers.
+    for layer in self.layers:
+      if hasattr(layer, 'losses'):
+        # Collect losses that are dependent on inputs
+        # that are part of the model.
+        for node_index, node in enumerate(layer._inbound_nodes):  # pylint: disable=protected-access
+          node_key = _make_node_key(layer.name, node_index)
+          if node_key in self._network_nodes:
+            # The model owns this layer node.
+            inputs = node.input_tensors
+            losses += layer.get_losses_for(inputs)
+        # Collect unconditional losses.
+        losses += layer.get_losses_for(None)
+    # Add any potential unconditional model-level loss.
+    losses += self.get_losses_for(None)
+    return losses
+
+  @property
+  def trainable_weights(self):
+    if not self.trainable:
+      return []
+    weights = []
+    for layer in self.layers:
+      weights += layer.trainable_weights
+    return weights
+
+  @property
+  def non_trainable_weights(self):
+    weights = []
+    for layer in self.layers:
+      weights += layer.non_trainable_weights
+    if not self.trainable:
+      trainable_weights = []
+      for layer in self.layers:
+        trainable_weights += layer.trainable_weights
+      return trainable_weights + weights
+    return weights
+
+  @property
+  def input_spec(self):
+    """Gets the network's input specs.
+
+    Returns:
+        A list of `InputSpec` instances (one per input to the model)
+            or a single instance if the model has only one input.
+    """
+    specs = []
+    for layer in self._input_layers:
+      if layer.input_spec is None:
+        specs.append(None)
+      else:
+        if not isinstance(layer.input_spec, list):
+          raise TypeError('Layer ' + layer.name +
+                          ' has an input_spec attribute that '
+                          'is not a list. We expect a list. '
+                          'Found input_spec = ' + str(layer.input_spec))
+        specs += layer.input_spec
+    if len(specs) == 1:
+      return specs[0]
+    return specs
+
+  def call(self, inputs, mask=None):
+    """Call the model on new inputs.
+
+    In this case `call` just reapplies
+    all ops in the graph to the new inputs
+    (e.g. build a new computational graph from the provided inputs).
+
+    Arguments:
+        inputs: A tensor or list of tensors.
+        mask: A mask or list of masks. A mask can be
+            either a tensor or None (no mask).
+
+    Returns:
+        A tensor if there is a single output, or
+        a list of tensors if there are more than one outputs.
+    """
+    inputs = nest.flatten(inputs)
+    if mask is None:
+      masks = [None for _ in range(len(inputs))]
+    else:
+      masks = nest.flatten(mask)
+
+    if context.in_graph_mode():
+      # Try to retrieve cached outputs if the layer has already been called
+      # on these exact inputs.
+      cache_key = (layers_util.object_list_uid(inputs)
+                   + '_' + layers_util.object_list_uid(masks))
+      if cache_key in self._output_tensor_cache:
+        # Cache hit.
+        return self._output_tensor_cache[cache_key]
+    # Actually apply the network graph to the new inputs.
+    outputs, _ = self._run_internal_graph(inputs, masks)
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    if isinstance(input_shape, list):
+      input_shapes = []
+      for shape in input_shape:
+        if shape is not None:
+          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
+        else:
+          input_shapes.append(None)
+    else:
+      if input_shape is not None:
+        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
+      else:
+        input_shapes = [None]
+
+    if len(input_shapes) != len(self._input_layers):
+      raise ValueError('Invalid input_shape argument ' + str(input_shape) +
+                       ': model has ' + str(len(self._input_layers)) +
+                       ' tensor inputs.')
+
+    cache_key = layers_util.object_list_uid(input_shapes)
+    if cache_key not in self._output_shape_cache:
+      # Cache miss. We have to run the network graph manually (recursive calls
+      # to `_compute_output_shape`).
+      layers_to_output_shapes = {}
+      for i in range(len(input_shapes)):
+        layer = self._input_layers[i]
+        input_shape = input_shapes[i]
+        # It's an input layer: then `_compute_output_shape` is identity,
+        # and there is only one node and one tensor output.
+        shape_key = layer.name + '_0_0'
+        layers_to_output_shapes[shape_key] = input_shape
+
+      depth_keys = list(self._nodes_by_depth.keys())
+      depth_keys.sort(reverse=True)
+      # Iterate over nodes, by depth level.
+      if len(depth_keys) > 1:
+        for depth in depth_keys:
+          nodes = self._nodes_by_depth[depth]
+          for node in nodes:
+            # This is always a single layer, never a list.
+            layer = node.outbound_layer
+            if layer in self._input_layers:
+              # We've already covered the input layers
+              # a few lines above.
+              continue
+            # Potentially redundant list,
+            # same size as node.input_tensors.
+            input_shapes = []
+            for j in range(len(node.inbound_layers)):
+              inbound_layer = node.inbound_layers[j]
+              node_index = node.node_indices[j]
+              tensor_index = node.tensor_indices[j]
+              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
+                                                           tensor_index)
+              input_shape = layers_to_output_shapes[shape_key]
+              input_shapes.append(input_shape)
+
+            if len(input_shapes) == 1:
+              output_shape = layer._compute_output_shape(input_shapes[0])  # pylint: disable=protected-access
+            else:
+              output_shape = layer._compute_output_shape(input_shapes)  # pylint: disable=protected-access
+            if isinstance(output_shape, list):
+              output_shapes = [
+                  tuple(tensor_shape.TensorShape(shape).as_list())
+                  for shape in output_shape
+              ]
+            else:
+              output_shapes = [
+                  tuple(tensor_shape.TensorShape(output_shape).as_list())
+              ]
+
+            node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
+            for j in range(len(output_shapes)):
+              shape_key = layer.name + '_%s_%s' % (node_index, j)
+              layers_to_output_shapes[shape_key] = output_shapes[j]
+
+        # Read final output shapes from layers_to_output_shapes.
+        output_shapes = []
+        for i in range(len(self._output_layers)):
+          layer, node_index, tensor_index = self._output_coordinates[i]
+          shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
+          output_shapes.append(layers_to_output_shapes[shape_key])
+
+        # Store in cache.
+        self._output_shape_cache[cache_key] = output_shapes
+      else:
+        # Cache hit.
+        output_shapes = self._output_shape_cache[cache_key]
+
+      if isinstance(output_shapes, list):
+        if len(output_shapes) == 1:
+          return tensor_shape.TensorShape(output_shapes[0])
+        else:
+          return [tensor_shape.TensorShape(shape) for shape in output_shapes]
+      else:
+        return tensor_shape.TensorShape(output_shapes)
+
+  def _run_internal_graph(self, inputs, masks=None):
+    """Computes output tensors for new inputs.
+
+    # Note:
+        - Expects `inputs` to be a list (potentially with 1 element).
+        - Can be run on non-Keras tensors.
+
+    Arguments:
+        inputs: List of tensors
+        masks: List of masks (tensors or None).
+
+    Returns:
+        Three lists: output_tensors, output_masks, output_shapes
+    """
+    # Note: masking support is relevant mainly for Keras.
+    # It cannot be factored out without having the fully reimplement the network
+    # calling logic on the Keras side. We choose to incorporate it in
+    # GraphNetwork because 1) it may be useful to fully support in tf.layers in
+    # the future and 2) Keras is a major user of GraphNetwork.  If you don't
+    # use masking, it does not interfere with regular behavior at all and you
+    # can ignore it.
+    if masks is None:
+      masks = [None for _ in range(len(inputs))]
+
+    # Dictionary mapping reference tensors to tuples
+    # (computed tensor, compute mask)
+    # we assume a 1:1 mapping from tensor to mask
+    # TODO(fchollet): raise exception when a `.compute_mask()` call
+    # does not return a list the same size as `call`
+    tensor_map = {}
+    for x, y, mask in zip(self.inputs, inputs, masks):
+      tensor_map[str(id(x))] = (y, mask)
+
+    depth_keys = list(self._nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+    for depth in depth_keys:
+      nodes = self._nodes_by_depth[depth]
+      for node in nodes:
+        # This is always a single layer, never a list.
+        layer = node.outbound_layer
+
+        reference_input_tensors = node.input_tensors
+        reference_output_tensors = node.output_tensors
+
+        # If all previous input tensors are available in tensor_map,
+        # then call node.inbound_layer on them.
+        computed_data = []  # List of tuples (input, mask).
+        for x in reference_input_tensors:
+          if str(id(x)) in tensor_map:
+            computed_data.append(tensor_map[str(id(x))])
+
+        if len(computed_data) == len(reference_input_tensors):
+          # Call layer (reapplying ops to new inputs).
+          with ops.name_scope(layer.name):
+            if node.arguments:
+              kwargs = node.arguments
+            else:
+              kwargs = {}
+            if len(computed_data) == 1:
+              computed_tensor, computed_mask = computed_data[0]
+              # Ensure mask propagation if applicable.
+              if 'mask' in estimator_util.fn_args(layer.call):
+                if 'mask' not in kwargs:
+                  kwargs['mask'] = computed_mask
+
+              output_tensors = nest.flatten(
+                  layer.call(computed_tensor, **kwargs))
+              if hasattr(layer, 'compute_mask'):
+                output_masks = nest.flatten(
+                    layer.compute_mask(computed_tensor, computed_mask))
+              else:
+                output_masks = [None for _ in range(len(output_tensors))]
+              computed_tensors = [computed_tensor]
+              computed_masks = [computed_mask]
+            else:
+              computed_tensors = [x[0] for x in computed_data]
+              computed_masks = [x[1] for x in computed_data]
+              if 'mask' in estimator_util.fn_args(layer.call):
+                if 'mask' not in kwargs:
+                  kwargs['mask'] = computed_masks
+              output_tensors = nest.flatten(
+                  layer.call(computed_tensors, **kwargs))
+              if hasattr(layer, 'compute_mask'):
+                output_masks = nest.flatten(
+                    layer.compute_mask(computed_tensors, computed_masks))
+              else:
+                output_masks = [None for _ in range(len(output_tensors))]
+
+            # Apply activity regularizer if any:
+            if layer.activity_regularizer is not None:
+              regularization_losses = [
+                  layer.activity_regularizer(x) for x in computed_tensors
+              ]
+              layer.add_loss(regularization_losses, computed_tensors)
+
+          if context.in_graph_mode():
+            # Update model updates and losses:
+            # Keep track of updates that depend on the inputs
+            # (e.g. BN updates).
+            self.add_update(layer.get_updates_for(computed_tensors), inputs)
+            # Keep track of unconditional updates (e.g. a counter).
+            self.add_update(layer.get_updates_for(None), None)
+            # Keep track of losses that depend on the inputs
+            # (e.g. activity regularizers).
+            self.add_loss(layer.get_losses_for(computed_tensors), inputs)
+            # Keep track of unconditional losses
+            # (e.g. weight regularizers).
+            self.add_loss(layer.get_losses_for(None), None)
+
+          # Update tensor_map.
+          for x, y, mask in zip(reference_output_tensors, output_tensors,
+                                output_masks):
+            tensor_map[str(id(x))] = (y, mask)
+
+    output_tensors = []
+    output_masks = []
+    output_shapes = []
+    for x in self.outputs:
+      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
+      tensor, mask = tensor_map[str(id(x))]
+      output_shapes.append(layers_util.static_shape(x))
+      output_tensors.append(tensor)
+      output_masks.append(mask)
+
+    if len(output_tensors) == 1:
+      output_tensors = output_tensors[0]
+      if output_shapes is not None:
+        output_shapes = output_shapes[0]
+      if output_masks is not None:
+        output_masks = output_masks[0]
+
+    if context.in_graph_mode():
+      # Update cache;
+      # keys are based on ids on input tensors and inputs masks.
+      cache_key = (layers_util.object_list_uid(inputs)
+                   + '_' + layers_util.object_list_uid(masks))
+      self._output_tensor_cache[cache_key] = output_tensors
+      if output_masks is not None:
+        self._output_mask_cache[cache_key] = output_masks
+      if output_shapes is not None:
+        input_shapes = [layers_util.static_shape(x) for x in inputs]
+        cache_key = layers_util.object_list_uid(input_shapes)
+        self._output_shape_cache[cache_key] = output_shapes
+
+    return output_tensors, output_masks
+
+
+def _make_node_key(layer_name, node_index):
+  return layer_name + '_ib-' + str(node_index)
diff --git a/tensorflow/python/layers/network_test.py b/tensorflow/python/layers/network_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..af7813e26420eb6e85b204fd5b50e7ddafc2e5a1
--- /dev/null
+++ b/tensorflow/python/layers/network_test.py
@@ -0,0 +1,525 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.layers.network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import base as base_layers
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.layers import network as network_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.platform import test
+
+
+class BaseLayerCompatibilityTest(test.TestCase):
+
+  def test_get_updates_for(self):
+    a = network_layers.Input(shape=(2,))
+    dense_layer = core_layers.Dense(1)
+    dense_layer.add_update(0, inputs=a)
+    dense_layer.add_update(1, inputs=None)
+
+    self.assertEqual(dense_layer.get_updates_for(a), [0])
+    self.assertEqual(dense_layer.get_updates_for(None), [1])
+
+  def test_get_losses_for(self):
+    a = network_layers.Input(shape=(2,))
+    dense_layer = core_layers.Dense(1)
+    dense_layer.add_loss(0, inputs=a)
+    dense_layer.add_loss(1, inputs=None)
+
+    self.assertEqual(dense_layer.get_losses_for(a), [0])
+    self.assertEqual(dense_layer.get_losses_for(None), [1])
+
+  def testTopologicalAttributes(self):
+    # test layer attributes / methods related to cross-layer connectivity.
+    a = network_layers.Input(shape=(32,), name='input_a')
+    b = network_layers.Input(shape=(32,), name='input_b')
+
+    # test input, output, input_shape, output_shape
+    test_layer = core_layers.Dense(16, name='test_layer')
+    a_test = test_layer(a)
+    self.assertEqual(test_layer.input, a)
+    self.assertEqual(test_layer.output, a_test)
+    self.assertEqual(test_layer.input_shape, (None, 32))
+    self.assertEqual(test_layer.output_shape, (None, 16))
+
+    # test `get_*_at` methods
+    dense = core_layers.Dense(16, name='dense_1')
+    a_2 = dense(a)
+    b_2 = dense(b)
+
+    self.assertEqual(dense.get_input_at(0), a)
+    self.assertEqual(dense.get_input_at(1), b)
+    self.assertEqual(dense.get_output_at(0), a_2)
+    self.assertEqual(dense.get_output_at(1), b_2)
+    self.assertEqual(dense.get_input_shape_at(0), (None, 32))
+    self.assertEqual(dense.get_input_shape_at(1), (None, 32))
+    self.assertEqual(dense.get_output_shape_at(0), (None, 16))
+    self.assertEqual(dense.get_output_shape_at(1), (None, 16))
+
+    # Test invalid value for attribute retrieval.
+    with self.assertRaises(ValueError):
+      dense.get_input_at(2)
+    with self.assertRaises(AttributeError):
+      new_dense = core_layers.Dense(16)
+      _ = new_dense.input
+    with self.assertRaises(AttributeError):
+      new_dense = core_layers.Dense(16)
+      _ = new_dense.output
+    with self.assertRaises(AttributeError):
+      new_dense = core_layers.Dense(16)
+      _ = new_dense.output_shape
+    with self.assertRaises(AttributeError):
+      new_dense = core_layers.Dense(16)
+      _ = new_dense.input_shape
+    with self.assertRaises(AttributeError):
+      new_dense = core_layers.Dense(16)
+      a = network_layers.Input(shape=(3, 32))
+      a = network_layers.Input(shape=(5, 32))
+      a_2 = dense(a)
+      b_2 = dense(b)
+      _ = new_dense.input_shape
+    with self.assertRaises(AttributeError):
+      new_dense = core_layers.Dense(16)
+      a = network_layers.Input(shape=(3, 32))
+      a = network_layers.Input(shape=(5, 32))
+      a_2 = dense(a)
+      b_2 = dense(b)
+      _ = new_dense.output_shape
+
+  def testTopologicalAttributesMultiOutputLayer(self):
+
+    class PowersLayer(base_layers.Layer):
+
+      def call(self, inputs):
+        return [inputs**2, inputs**3]
+
+    x = network_layers.Input(shape=(32,))
+    test_layer = PowersLayer()
+    p1, p2 = test_layer(x)  # pylint: disable=not-callable
+
+    self.assertEqual(test_layer.input, x)
+    self.assertEqual(test_layer.output, [p1, p2])
+    self.assertEqual(test_layer.input_shape, (None, 32))
+    self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)])
+
+  def testTopologicalAttributesMultiInputLayer(self):
+
+    class AddLayer(base_layers.Layer):
+
+      def call(self, inputs):
+        assert len(inputs) == 2
+        return inputs[0] + inputs[1]
+
+    a = network_layers.Input(shape=(32,))
+    b = network_layers.Input(shape=(32,))
+    test_layer = AddLayer()
+    y = test_layer([a, b])  # pylint: disable=not-callable
+
+    self.assertEqual(test_layer.input, [a, b])
+    self.assertEqual(test_layer.output, y)
+    self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)])
+    self.assertEqual(test_layer.output_shape, (None, 32))
+
+
+class NetworkTest(test.TestCase):
+
+  def testBasicNetwork(self):
+    # minimum viable network
+    x = network_layers.Input(shape=(32,))
+    dense = core_layers.Dense(2)
+    y = dense(x)
+    network = network_layers.GraphNetwork(x, y, name='dense_network')
+
+    # test basic attributes
+    self.assertEqual(network.name, 'dense_network')
+    self.assertEqual(len(network.layers), 2)  # InputLayer + Dense
+    self.assertEqual(network.layers[1], dense)
+    self.assertEqual(network.weights, dense.weights)
+    self.assertEqual(network.trainable_weights, dense.trainable_weights)
+    self.assertEqual(network.non_trainable_weights, dense.non_trainable_weights)
+
+    # test callability on Input
+    x_2 = network_layers.Input(shape=(32,))
+    y_2 = network(x_2)
+    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
+
+    # test callability on regular tensor
+    x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
+    y_2 = network(x_2)
+    self.assertEqual(y_2.get_shape().as_list(), [None, 2])
+
+    # test network `trainable` attribute
+    network.trainable = False
+    self.assertEqual(network.weights, dense.weights)
+    self.assertEqual(network.trainable_weights, [])
+    self.assertEqual(network.non_trainable_weights,
+                     dense.trainable_weights + dense.non_trainable_weights)
+
+  def test_node_construction(self):
+    # test graph topology construction basics
+    a = network_layers.Input(shape=(32,), name='input_a')
+    b = network_layers.Input(shape=(32,), name='input_b')
+
+    self.assertEqual(a.get_shape().as_list(), [None, 32])
+    a_layer, a_node_index, a_tensor_index = a._keras_history
+    b_layer, _, _ = b._keras_history
+    self.assertEqual(len(a_layer._inbound_nodes), 1)
+    self.assertEqual(a_tensor_index, 0)
+    node = a_layer._inbound_nodes[a_node_index]
+    self.assertEqual(node.outbound_layer, a_layer)
+
+    self.assertEqual(node.inbound_layers, [])
+    self.assertEqual(node.input_tensors, [a])
+    self.assertEqual(node.input_shapes, [(None, 32)])
+    self.assertEqual(node.output_tensors, [a])
+    self.assertEqual(node.output_shapes, [(None, 32)])
+
+    dense = core_layers.Dense(16, name='dense_1')
+    dense(a)
+    dense(b)
+
+    self.assertEqual(len(dense._inbound_nodes), 2)
+    self.assertEqual(len(dense._outbound_nodes), 0)
+    self.assertEqual(dense._inbound_nodes[0].inbound_layers, [a_layer])
+    self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense)
+    self.assertEqual(dense._inbound_nodes[1].inbound_layers, [b_layer])
+    self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense)
+    self.assertEqual(dense._inbound_nodes[0].input_tensors, [a])
+    self.assertEqual(dense._inbound_nodes[1].input_tensors, [b])
+
+    # Test config
+    config_0 = dense._inbound_nodes[0].get_config()
+    self.assertEqual(config_0['outbound_layer'], dense.name)
+
+  def testMultiInputNetwork(self):
+    a = network_layers.Input(shape=(32,), name='input_a')
+    b = network_layers.Input(shape=(32,), name='input_b')
+
+    class AddLayer(base_layers.Layer):
+
+      def call(self, inputs):
+        assert len(inputs) == 2
+        return inputs[0] + inputs[1]
+
+    c = AddLayer()([a, b])  # pylint: disable=not-callable
+    network = network_layers.GraphNetwork([a, b], c)
+    self.assertEqual(len(network.layers), 3)  # 2 * InputLayer + AddLayer
+
+    # Test callability.
+    a2 = network_layers.Input(shape=(32,))
+    b2 = network_layers.Input(shape=(32,))
+    c2 = network([a2, b2])
+    self.assertEqual(c2.get_shape().as_list(), [None, 32])
+
+  def testMultiOutputNetwork(self):
+    x = network_layers.Input(shape=(32,))
+    y1 = core_layers.Dense(2)(x)
+    y2 = core_layers.Dense(3)(x)
+    network = network_layers.GraphNetwork(x, [y1, y2])
+
+    self.assertEqual(len(network.layers), 3)  # InputLayer + 2 * Dense
+
+    # Test callability.
+    x2 = network_layers.Input(shape=(32,))
+    outputs = network(x2)
+
+    self.assertEqual(type(outputs), list)
+    self.assertEqual(len(outputs), 2)
+    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
+    self.assertEqual(outputs[1].get_shape().as_list(), [None, 3])
+
+  def testMultiInputMultiOutputNetworkSharedLayer(self):
+    a = network_layers.Input(shape=(32,), name='input_a')
+    b = network_layers.Input(shape=(32,), name='input_b')
+
+    dense = core_layers.Dense(2)
+
+    y1 = dense(a)
+    y2 = dense(b)
+    network = network_layers.GraphNetwork([a, b], [y1, y2])
+    self.assertEqual(len(network.layers), 3)  # 2 * InputLayer + Dense
+
+    # Test callability.
+    a2 = network_layers.Input(shape=(32,))
+    b2 = network_layers.Input(shape=(32,))
+    outputs = network([a2, b2])
+
+    self.assertEqual(type(outputs), list)
+    self.assertEqual(len(outputs), 2)
+    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
+    self.assertEqual(outputs[1].get_shape().as_list(), [None, 2])
+
+  def testCrossDataFlows(self):
+    # Test the ability to have multi-output layers with outputs that get routed
+    # to separate layers
+
+    class PowersLayer(base_layers.Layer):
+
+      def call(self, inputs):
+        return [inputs**2, inputs**3]
+
+    x = network_layers.Input(shape=(32,))
+    p1, p2 = PowersLayer()(x)  # pylint: disable=not-callable
+    y1 = core_layers.Dense(2)(p1)
+    y2 = core_layers.Dense(3)(p2)
+    network = network_layers.GraphNetwork(x, [y1, y2])
+
+    self.assertEqual(len(network.layers), 4)  # InputLayer + 2 * Dense + PLayer
+
+    # Test callability.
+    x2 = network_layers.Input(shape=(32,))
+    outputs = network(x2)
+
+    self.assertEqual(type(outputs), list)
+    self.assertEqual(len(outputs), 2)
+    self.assertEqual(outputs[0].get_shape().as_list(), [None, 2])
+    self.assertEqual(outputs[1].get_shape().as_list(), [None, 3])
+
+  def testNetworkAttributes(self):
+    x = network_layers.Input(shape=(32,))
+    z = core_layers.Dense(2, kernel_regularizer=lambda x: 0.01 * (x**2))(x)
+    dense = core_layers.Dense(2, name='dense')
+    dense.add_update(1)
+    y = dense(z)
+    net = network_layers.GraphNetwork(x, y)
+
+    # losses
+    self.assertEqual(len(net.losses), 1)
+
+    # updates
+    self.assertEqual(len(net.updates), 1)
+
+    # get_layer
+    self.assertEqual(net.get_layer('dense'), dense)
+    self.assertEqual(net.get_layer(index=2), dense)
+    with self.assertRaises(ValueError):
+      net.get_layer('dense_unknown')
+    with self.assertRaises(ValueError):
+      net.get_layer()
+    with self.assertRaises(ValueError):
+      net.get_layer(index=4)
+
+    # input, output
+    self.assertEqual(net.input, x)
+    self.assertEqual(net.output, y)
+
+    # input_shape, output_shape
+    self.assertEqual(net.input_shape, (None, 32))
+    self.assertEqual(net.output_shape, (None, 2))
+
+    # get_*_at
+    self.assertEqual(net.get_input_at(0), x)
+    self.assertEqual(net.get_output_at(0), y)
+
+    # _compute_output_shape
+    self.assertEqual(net._compute_output_shape((3, 32)).as_list(), [3, 2])
+
+  def testInvalidNetworks(self):
+    # redundant inputs
+    x = network_layers.Input(shape=(32,))
+    y = core_layers.Dense(2)(x)
+    with self.assertRaises(ValueError):
+      network_layers.GraphNetwork([x, x], y)
+
+    # inputs that don't come from Input
+    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
+    y = core_layers.Dense(2)(x)
+    with self.assertRaises(ValueError):
+      network_layers.GraphNetwork(x, y)
+
+    # inputs that don't come from Input but have a layer history
+    x = network_layers.Input(shape=(32,))
+    x = core_layers.Dense(32)(x)
+    y = core_layers.Dense(2)(x)
+    with self.assertRaises(ValueError):
+      network_layers.GraphNetwork(x, y)
+
+    # outputs that don't come from layers
+    x = network_layers.Input(shape=(32,))
+    y = core_layers.Dense(2)(x)
+    y = 2 * y
+    with self.assertRaises(ValueError):
+      network_layers.GraphNetwork(x, y)
+
+    # disconnected graphs
+    x1 = network_layers.Input(shape=(32,))
+    x2 = network_layers.Input(shape=(32,))
+    y = core_layers.Dense(2)(x1)
+    with self.assertRaises(ValueError):
+      network_layers.GraphNetwork(x2, y)
+
+    # redundant layer names
+    x = network_layers.Input(shape=(32,))
+    z = core_layers.Dense(2, name='dense')(x)
+    y = core_layers.Dense(2, name='dense')(z)
+    with self.assertRaises(ValueError):
+      network_layers.GraphNetwork(x, y)
+
+  def testInputTensorWrapping(self):
+    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
+    x = network_layers.Input(tensor=x)
+    y = core_layers.Dense(2)(x)
+    network_layers.GraphNetwork(x, y)
+
+  def testExplicitBatchSize(self):
+    x = network_layers.Input(shape=(32,), batch_size=3)
+    y = core_layers.Dense(2)(x)
+    self.assertEqual(y.get_shape().as_list(), [3, 2])
+
+  def testNetworkRecursion(self):
+    # test the ability of networks to be used as layers inside networks.
+    a = network_layers.Input(shape=(32,))
+    b = core_layers.Dense(2)(a)
+    net = network_layers.GraphNetwork(a, b)
+
+    c = network_layers.Input(shape=(32,))
+    d = net(c)
+
+    recursive_net = network_layers.GraphNetwork(c, d)
+    self.assertEqual(len(recursive_net.layers), 2)
+    self.assertEqual(recursive_net.layers[1], net)
+    self.assertEqual(len(recursive_net.weights), 2)
+
+    # test callability
+    x = array_ops.placeholder(dtype='float32', shape=(None, 32))
+    y = recursive_net(x)
+    self.assertEqual(y.get_shape().as_list(), [None, 2])
+
+  def testSparseInput(self):
+
+    class SparseSoftmax(base_layers.Layer):
+
+      def call(self, inputs):
+        return sparse_ops.sparse_softmax(inputs)
+
+    x = network_layers.Input(shape=(32,), sparse=True)
+    y = SparseSoftmax()(x)  # pylint: disable=not-callable
+    network = network_layers.GraphNetwork(x, y)
+
+    self.assertEqual(len(network.layers), 2)
+    self.assertEqual(network.layers[0].sparse, True)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMaskingSingleInput(self):
+
+    class MaskedLayer(base_layers.Layer):
+
+      def call(self, inputs, mask=None):
+        if mask is not None:
+          return inputs * mask
+        return inputs
+
+      def compute_mask(self, inputs, mask=None):
+        return array_ops.ones_like(inputs)
+
+    if context.in_graph_mode():
+      x = network_layers.Input(shape=(32,))
+      y = MaskedLayer()(x)  # pylint: disable=not-callable
+      network = network_layers.GraphNetwork(x, y)
+
+      # test callability on Input
+      x_2 = network_layers.Input(shape=(32,))
+      y_2 = network(x_2)
+      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
+
+      # test callability on regular tensor
+      x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32))
+      y_2 = network(x_2)
+      self.assertEqual(y_2.get_shape().as_list(), [None, 32])
+    else:
+      a = constant_op.constant([2] * 32)
+      mask = constant_op.constant([0, 1] * 16)
+      a._keras_mask = mask
+      b = MaskedLayer().apply(a)
+      self.assertTrue(hasattr(b, '_keras_mask'))
+      self.assertAllEqual(self.evaluate(array_ops.ones_like(mask)),
+                          self.evaluate(getattr(b, '_keras_mask')))
+      self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b))
+
+
+class DeferredModeTest(test.TestCase):
+
+  def testDeferredTensorAttributes(self):
+    x = base_layers._DeferredTensor(shape=(None, 2), dtype='float32', name='x')
+    self.assertEqual(str(x),
+                     'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
+    self.assertEqual(repr(x),
+                     '<_DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSimpleNetworkBuilding(self):
+    inputs = network_layers.Input(shape=(32,))
+    if context.in_eager_mode():
+      self.assertIsInstance(inputs, base_layers._DeferredTensor)
+      self.assertEqual(inputs.dtype.name, 'float32')
+      self.assertEqual(inputs.shape.as_list(), [None, 32])
+
+    x = core_layers.Dense(2)(inputs)
+    if context.in_eager_mode():
+      self.assertIsInstance(x, base_layers._DeferredTensor)
+      self.assertEqual(x.dtype.name, 'float32')
+      self.assertEqual(x.shape.as_list(), [None, 2])
+
+    outputs = core_layers.Dense(4)(x)
+    network = network_layers.GraphNetwork(inputs, outputs)
+    self.assertIsInstance(network, network_layers.GraphNetwork)
+
+    if context.in_eager_mode():
+      # It should be possible to call such a network on EagerTensors.
+      inputs = constant_op.constant(
+          np.random.random((10, 32)).astype('float32'))
+      outputs = network(inputs)
+      self.assertEqual(outputs.shape.as_list(), [10, 4])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMultiIONetworkbuilding(self):
+    input_a = network_layers.Input(shape=(32,))
+    input_b = network_layers.Input(shape=(16,))
+    a = core_layers.Dense(16)(input_a)
+
+    class AddLayer(base_layers.Layer):
+
+      def call(self, inputs):
+        return inputs[0] + inputs[1]
+
+      def _compute_output_shape(self, input_shape):
+        return input_shape[0]
+
+    c = AddLayer()([a, input_b])  # pylint: disable=not-callable
+    c = core_layers.Dense(2)(c)
+
+    network = network_layers.GraphNetwork([input_a, input_b], [a, c])
+    if context.in_eager_mode():
+      a_val = constant_op.constant(
+          np.random.random((10, 32)).astype('float32'))
+      b_val = constant_op.constant(
+          np.random.random((10, 16)).astype('float32'))
+      outputs = network([a_val, b_val])
+      self.assertEqual(len(outputs), 2)
+      self.assertEqual(outputs[0].shape.as_list(), [10, 16])
+      self.assertEqual(outputs[1].shape.as_list(), [10, 2])
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 01f56abc70ef52eda25a4b247ae9b536b60266d5..65e67dd016fcf4fe6e395bf983b560cd2c7b0f8a 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -26,6 +26,7 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.layers import base
@@ -141,7 +142,10 @@ class BatchNormalization(base.Layer):
                **kwargs):
     super(BatchNormalization, self).__init__(
         name=name, trainable=trainable, **kwargs)
-    self.axis = axis
+    if isinstance(axis, list):
+      self.axis = axis[:]
+    else:
+      self.axis = axis
     self.momentum = momentum
     self.epsilon = epsilon
     self.center = center
@@ -211,16 +215,13 @@ class BatchNormalization(base.Layer):
                          'be specified')
 
     if self.fused:
-      # Currently fused batch norm doesn't support renorm and beta/gamma
-      # regularizer; and only supports an input tensor of rank 4 and a channel
-      # dimension on axis 1 and 3.
+      # Currently fused batch norm doesn't support renorm. It also only supports
+      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
       # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
       # output back to its original shape accordingly.
       self.fused = (not self.renorm and
                     ndims == 4 and
                     self.axis in [[1], [3]] and
-                    self.beta_regularizer is None and
-                    self.gamma_regularizer is None and
                     self.virtual_batch_size is None and
                     self.adjustment is None)
       # TODO(chrisying): fused batch norm is currently not supported for
@@ -239,6 +240,12 @@ class BatchNormalization(base.Layer):
         raise ValueError('Unsupported axis, fused batch norm only supports '
                          'axis == [1] or axis == [3]')
 
+    # Raise parameters of fp16 batch norm to fp32
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
+      param_dtype = dtypes.float32
+    else:
+      param_dtype = self.dtype or dtypes.float32
+
     axis_to_dim = {x: input_shape[x].value for x in self.axis}
     for x in axis_to_dim:
       if axis_to_dim[x] is None:
@@ -260,28 +267,34 @@ class BatchNormalization(base.Layer):
           self.axis[idx] = x + 1      # Account for added dimension
 
     if self.scale:
-      self.gamma = self.add_variable(name='gamma',
-                                     shape=param_shape,
-                                     initializer=self.gamma_initializer,
-                                     regularizer=self.gamma_regularizer,
-                                     constraint=self.gamma_constraint,
-                                     trainable=True)
+      self.gamma = self.add_variable(
+          name='gamma',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
     else:
       self.gamma = None
       if self.fused:
-        self._gamma_const = array_ops.constant(1.0, shape=param_shape)
+        self._gamma_const = array_ops.constant(
+            1.0, dtype=param_dtype, shape=param_shape)
 
     if self.center:
-      self.beta = self.add_variable(name='beta',
-                                    shape=param_shape,
-                                    initializer=self.beta_initializer,
-                                    regularizer=self.beta_regularizer,
-                                    constraint=self.beta_constraint,
-                                    trainable=True)
+      self.beta = self.add_variable(
+          name='beta',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
     else:
       self.beta = None
       if self.fused:
-        self._beta_const = array_ops.constant(0.0, shape=param_shape)
+        self._beta_const = array_ops.constant(
+            0.0, dtype=param_dtype, shape=param_shape)
 
     # Disable variable partitioning when creating the moving mean and variance
     try:
@@ -293,12 +306,14 @@ class BatchNormalization(base.Layer):
       self.moving_mean = self.add_variable(
           name='moving_mean',
           shape=param_shape,
+          dtype=param_dtype,
           initializer=self.moving_mean_initializer,
           trainable=False)
 
       self.moving_variance = self.add_variable(
           name='moving_variance',
           shape=param_shape,
+          dtype=param_dtype,
           initializer=self.moving_variance_initializer,
           trainable=False)
 
@@ -312,10 +327,12 @@ class BatchNormalization(base.Layer):
         # stack to be cleared. The nested ones use a `lambda` to set the desired
         # device and ignore any devices that may be set by the custom getter.
         def _renorm_variable(name, shape):
-          var = self.add_variable(name=name,
-                                  shape=shape,
-                                  initializer=init_ops.zeros_initializer(),
-                                  trainable=False)
+          var = self.add_variable(
+              name=name,
+              shape=shape,
+              dtype=param_dtype,
+              initializer=init_ops.zeros_initializer(),
+              trainable=False)
           return var
 
         with ops.device(None):
@@ -356,7 +373,6 @@ class BatchNormalization(base.Layer):
 
   def _fused_batch_norm(self, inputs, training):
     """Returns the output of fused batch norm."""
-    # TODO(reedwm): Add support for fp16 inputs.
     beta = self.beta if self.center else self._beta_const
     gamma = self.gamma if self.scale else self._gamma_const
 
@@ -752,6 +768,7 @@ def batch_normalization(inputs,
       virtual_batch_size=virtual_batch_size,
       adjustment=adjustment,
       name=name,
+      dtype=inputs.dtype.base_dtype,
       _reuse=reuse,
       _scope=name)
   return layer.apply(inputs, training=training)
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index 90ebdc8c86f425c34a90204fbf4b8f2b8061ae4e..e147f348b0a60dbefb38aa9f89318f261c03684e 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -68,11 +68,12 @@ class BNTest(test.TestCase):
              use_gpu,
              is_fused,
              restore=False,
-             freeze_mode=False):
+             freeze_mode=False,
+             dtype=dtypes.float32):
     ops.reset_default_graph()
     graph = ops.get_default_graph()
     with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
-      image = array_ops.placeholder(dtype='float32', shape=shape)
+      image = array_ops.placeholder(dtype=dtype, shape=shape)
       loss, train_op, saver = self._simple_model(image, is_fused, freeze_mode)
       if restore:
         saver.restore(sess, checkpoint_path)
@@ -80,7 +81,7 @@ class BNTest(test.TestCase):
         sess.run(variables.global_variables_initializer())
       np.random.seed(0)
       for _ in range(2):
-        image_val = np.random.rand(*shape).astype(np.float32)
+        image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
         sess.run([loss, train_op], feed_dict={image: image_val})
       if restore:
         all_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
@@ -90,15 +91,77 @@ class BNTest(test.TestCase):
         saver.save(sess, checkpoint_path)
 
   def _infer(self, checkpoint_path, image_val, shape, use_gpu, is_fused):
+    dtype = image_val.dtype
     ops.reset_default_graph()
     graph = ops.get_default_graph()
     with self.test_session(graph=graph, use_gpu=use_gpu) as sess:
-      image = array_ops.placeholder(dtype='float32', shape=shape)
+      image = array_ops.placeholder(dtype=dtype, shape=shape)
       loss, _, saver = self._simple_model(image, is_fused, True)
       saver.restore(sess, checkpoint_path)
       loss_val = sess.run(loss, feed_dict={image: image_val})
       return loss_val
 
+  def _trainEvalSequence(self, dtype, train1_use_gpu, train2_use_gpu,
+                         infer_use_gpu):
+    batch, height, width, input_channels = 2, 4, 5, 3
+    shape = [batch, height, width, input_channels]
+
+    # Not all characters in a dtype string representation are allowed in
+    # filenames in all operating systems. This map will sanitize these.
+    dtype_to_valid_fn = {
+        dtypes.float16: 'float16',
+        dtypes.float32: 'float32',
+    }
+    checkpoint = os.path.join(
+        self.get_temp_dir(), 'cp_%s_%s_%s_%s' % (
+            dtype_to_valid_fn[dtype], train1_use_gpu, train2_use_gpu,
+            infer_use_gpu))
+
+    self._train(
+        checkpoint,
+        shape,
+        use_gpu=train1_use_gpu,
+        is_fused=True,
+        restore=False,
+        freeze_mode=False,
+        dtype=dtype)
+
+    train_vars = self._train(
+        checkpoint,
+        shape,
+        use_gpu=train2_use_gpu,
+        is_fused=True,
+        restore=True,
+        freeze_mode=False,
+        dtype=dtype)
+
+    np.random.seed(0)
+    image_val = np.random.rand(batch, height, width, input_channels).astype(
+        dtype.as_numpy_dtype)
+    loss_val = self._infer(
+        checkpoint, image_val, shape, use_gpu=infer_use_gpu, is_fused=True)
+
+    return train_vars, loss_val
+
+  def testHalfPrecision(self):
+    ref_vars, ref_loss = self._trainEvalSequence(
+        dtype=dtypes.float32,
+        train1_use_gpu=True,
+        train2_use_gpu=True,
+        infer_use_gpu=True)
+
+    self.assertEqual(len(ref_vars), 5)
+
+    for train1_use_gpu in [True, False]:
+      for train2_use_gpu in [True, False]:
+        for infer_use_gpu in [True, False]:
+          test_vars, test_loss = self._trainEvalSequence(
+              dtypes.float16, train1_use_gpu, train2_use_gpu, infer_use_gpu)
+          self.assertEqual(len(test_vars), 5)
+          for test_var, ref_var in zip(test_vars, ref_vars):
+            self.assertAllClose(test_var, ref_var, rtol=1.e-3, atol=1.e-3)
+          self.assertAllClose(test_loss, ref_loss, rtol=1.e-3, atol=1.e-3)
+
   def _testCheckpoint(self, is_fused_checkpoint_a, is_fused_checkpoint_b,
                       use_gpu_checkpoint_a, use_gpu_checkpoint_b,
                       use_gpu_test_a, use_gpu_test_b, freeze_mode):
@@ -218,6 +281,35 @@ class BNTest(test.TestCase):
         ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
         bn.trainable_variables)
 
+  def testCreateFusedBNFloat16(self):
+    # Call layer.
+    bn = normalization_layers.BatchNormalization(axis=1, fused=True)
+    inputs = random_ops.random_uniform(
+        (5, 4, 3, 3), seed=1, dtype=dtypes.float16)
+    training = array_ops.placeholder(dtype='bool')
+    outputs = bn.apply(inputs, training=training)
+
+    # Verify shape.
+    self.assertListEqual(outputs.get_shape().as_list(), [5, 4, 3, 3])
+
+    # Verify layer attributes.
+    self.assertEqual(len(bn.updates), 2)
+    self.assertEqual(len(bn.variables), 4)
+    self.assertEqual(len(bn.trainable_variables), 2)
+    self.assertEqual(len(bn.non_trainable_variables), 2)
+    for var in bn.variables:
+      self.assertEqual(var.dtype, dtypes.float32_ref)
+
+    # Test that updates were created and added to UPDATE_OPS.
+    self.assertEqual(len(bn.updates), 2)
+    self.assertListEqual(
+        ops.get_collection(ops.GraphKeys.UPDATE_OPS), bn.updates)
+
+    # Test that weights were created and added to TRAINABLE_VARIABLES.
+    self.assertListEqual(
+        ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+        bn.trainable_variables)
+
   def test3DInputAxis1(self):
     epsilon = 1e-3
     bn = normalization_layers.BatchNormalization(
diff --git a/tensorflow/python/layers/utils.py b/tensorflow/python/layers/utils.py
index 7c71d3c952c071333cfe75d88d4eeaeffa02b6c0..766a6800d443a79d9bd130833c27f26c844cadaf 100644
--- a/tensorflow/python/layers/utils.py
+++ b/tensorflow/python/layers/utils.py
@@ -24,6 +24,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.util import nest
 
 
 def convert_data_format(data_format, ndim):
@@ -232,3 +233,19 @@ def constant_value(pred):
   else:
     raise TypeError('`pred` must be a Tensor, a Variable, or a Python bool.')
   return pred_value
+
+
+def object_list_uid(object_list):
+  """Creates a single string from object ids."""
+  object_list = nest.flatten(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def static_shape(x):
+  """Get the static shape of a Tensor, or None if it is unavailable."""
+  if x is None:
+    return None
+  try:
+    return tuple(x.get_shape().as_list())
+  except ValueError:
+    return None
diff --git a/tensorflow/python/lib/core/bfloat16.cc b/tensorflow/python/lib/core/bfloat16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dfe9eba03dbfac1ae22e6dfb2dc54fde6dc869ef
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.cc
@@ -0,0 +1,560 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/bfloat16.h"
+
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/numpy.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
+
+namespace tensorflow {
+namespace {
+
+// Workarounds for Python 2 vs 3 API differences.
+#if PY_MAJOR_VERSION < 3
+
+PyObject* MakePyString(const string& s) {
+  return PyString_FromString(s.c_str());
+}
+
+typedef long HashType;  // NOLINT
+
+bool TfPyInt_Check(PyObject* object) { return PyInt_Check(object); }
+
+PyObject* TfPyInt_FromLong(long x) {  // NOLINT
+  return PyInt_FromLong(x);
+}
+
+long TfPyInt_AsLong(PyObject* x) {  // NOLINT
+  return PyInt_AsLong(x);
+}
+
+#else  // PY_MAJOR_VERSION < 3
+
+PyObject* MakePyString(const string& s) {
+  return PyUnicode_FromString(s.c_str());
+}
+
+bool TfPyInt_Check(PyObject* object) {
+  if (!PyLong_Check(object)) {
+    return 0;
+  }
+  int overflow = 0;
+  PyLong_AsLongAndOverflow(object, &overflow);
+  return (overflow == 0);
+}
+
+PyObject* TfPyInt_FromLong(long x) {  // NOLINT
+  return PyLong_FromLong(x);
+}
+
+long TfPyInt_AsLong(PyObject* x) {  // NOLINT
+  return PyLong_AsLong(x);
+}
+
+typedef Py_hash_t HashType;
+
+#endif  // PY_MAJOR_VERSION < 3
+
+// Forward declaration.
+extern PyTypeObject PyBfloat16_Type;
+
+// Representation of a Python bfloat16 object.
+struct PyBfloat16 {
+  PyObject_HEAD;  // Python object header
+  bfloat16 value;
+};
+
+// Returns true if 'object' is a PyBfloat16.
+bool PyBfloat16_Check(PyObject* object) {
+  return PyObject_IsInstance(object,
+                             reinterpret_cast<PyObject*>(&PyBfloat16_Type));
+}
+
+// Extracts the value of a PyBfloat16 object.
+bfloat16 PyBfloat16_Bfloat16(PyObject* object) {
+  return reinterpret_cast<PyBfloat16*>(object)->value;
+}
+
+// Constructs a PyBfloat16 object from a bfloat16.
+Safe_PyObjectPtr PyBfloat16_FromBfloat16(bfloat16 x) {
+  Safe_PyObjectPtr ref =
+      make_safe(PyBfloat16_Type.tp_alloc(&PyBfloat16_Type, 0));
+  PyBfloat16* p = reinterpret_cast<PyBfloat16*>(ref.get());
+  if (p) {
+    p->value = x;
+  }
+  return ref;
+}
+
+// Converts a Python object to a bfloat16 value. Returns true on success,
+// returns false and reports a Python error on failure.
+bool AsBfloat16(PyObject* arg, bfloat16* output) {
+  if (PyBfloat16_Check(arg)) {
+    *output = PyBfloat16_Bfloat16(arg);
+    return true;
+  }
+  if (PyFloat_Check(arg)) {
+    double d = PyFloat_AsDouble(arg);
+    if (PyErr_Occurred()) {
+      return false;
+    }
+    // TODO(phawkins): check for overflow
+    *output = bfloat16(d);
+    return true;
+  }
+  if (TfPyInt_Check(arg)) {
+    long l = TfPyInt_AsLong(arg);  // NOLINT
+    if (PyErr_Occurred()) {
+      return false;
+    }
+    // TODO(phawkins): check for overflow
+    *output = bfloat16(static_cast<float>(l));
+    return true;
+  }
+  if (PyArray_IsScalar(arg, Float)) {
+    float f;
+    PyArray_ScalarAsCtype(arg, &f);
+    *output = bfloat16(f);
+    return true;
+  }
+  PyErr_Format(PyExc_TypeError, "expected number, got %s",
+               arg->ob_type->tp_name);
+  return false;
+}
+
+// Converts a PyBfloat16 into a PyFloat.
+PyObject* PyBfloat16_Float(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  return PyFloat_FromDouble(static_cast<double>(x));
+}
+
+// Converts a PyBfloat16 into a PyInt.
+PyObject* PyBfloat16_Int(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  long y = static_cast<long>(x);  // NOLINT
+  return TfPyInt_FromLong(y);
+}
+
+// Negates a PyBfloat16.
+PyObject* PyBfloat16_Negative(PyObject* self) {
+  bfloat16 x = PyBfloat16_Bfloat16(self);
+  return PyBfloat16_FromBfloat16(-x).release();
+}
+
+// Binary arithmetic operators on PyBfloat16 values.
+#define BFLOAT16_BINOP(name, op)                                  \
+  PyObject* PyBfloat16_##name(PyObject* a, PyObject* b) {         \
+    bfloat16 x, y;                                                \
+    if (!AsBfloat16(a, &x) || !AsBfloat16(b, &y)) return nullptr; \
+    bfloat16 z = x op y;                                          \
+    return PyBfloat16_FromBfloat16(z).release();                  \
+  }
+BFLOAT16_BINOP(Add, +)
+BFLOAT16_BINOP(Subtract, -)
+BFLOAT16_BINOP(Multiply, *)
+BFLOAT16_BINOP(Divide, /)
+#undef BFLOAT16_BINOP
+
+// Python number methods for PyBfloat16 objects.
+PyNumberMethods PyBfloat16_AsNumber = {
+    PyBfloat16_Add,       // nb_add
+    PyBfloat16_Subtract,  // nb_subtract
+    PyBfloat16_Multiply,  // nb_multiply
+#if PY_MAJOR_VERSION < 3
+    PyBfloat16_Divide,  // nb_divide
+#endif
+    nullptr,              // nb_remainder
+    nullptr,              // nb_divmod
+    nullptr,              // nb_power
+    PyBfloat16_Negative,  // nb_negative
+    nullptr,              // nb_positive
+    nullptr,              // nb_absolute
+    nullptr,              // nb_nonzero
+    nullptr,              // nb_invert
+    nullptr,              // nb_lshift
+    nullptr,              // nb_rshift
+    nullptr,              // nb_and
+    nullptr,              // nb_xor
+    nullptr,              // nb_or
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_coerce
+#endif
+    PyBfloat16_Int,  // nb_int
+#if PY_MAJOR_VERSION < 3
+    PyBfloat16_Int,  // nb_long
+#else
+    nullptr,  // reserved
+#endif
+    PyBfloat16_Float,  // nb_float
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_oct
+    nullptr,  // nb_hex
+#endif
+
+    nullptr,  // nb_inplace_add
+    nullptr,  // nb_inplace_subtract
+    nullptr,  // nb_inplace_multiply
+#if PY_MAJOR_VERSION < 3
+    nullptr,  // nb_inplace_divide
+#endif
+    nullptr,  // nb_inplace_remainder
+    nullptr,  // nb_inplace_power
+    nullptr,  // nb_inplace_lshift
+    nullptr,  // nb_inplace_rshift
+    nullptr,  // nb_inplace_and
+    nullptr,  // nb_inplace_xor
+    nullptr,  // nb_inplace_or
+
+    nullptr,            // nb_floor_divide
+    PyBfloat16_Divide,  // nb_true_divide
+    nullptr,            // nb_inplace_floor_divide
+    nullptr,            // nb_inplace_true_divide
+    nullptr,            // nb_index
+};
+
+// Constructs a new PyBfloat16.
+PyObject* PyBfloat16_New(PyTypeObject* type, PyObject* args, PyObject* kwds) {
+  if (kwds && PyDict_Size(kwds)) {
+    PyErr_SetString(PyExc_TypeError, "constructor takes no keyword arguments");
+    return nullptr;
+  }
+  Py_ssize_t size = PyTuple_Size(args);
+  if (size != 1) {
+    PyErr_SetString(PyExc_TypeError,
+                    "expected number as argument to bfloat16 constructor");
+    return nullptr;
+  }
+  PyObject* arg = PyTuple_GetItem(args, 0);
+
+  if (PyBfloat16_Check(arg)) {
+    Py_INCREF(arg);
+    return arg;
+  } else {
+    bfloat16 value;
+    if (!AsBfloat16(arg, &value)) {
+      return nullptr;
+    }
+    return PyBfloat16_FromBfloat16(value).release();
+  }
+}
+
+// Comparisons on PyBfloat16s.
+PyObject* PyBfloat16_RichCompare(PyObject* a, PyObject* b, int op) {
+  bfloat16 x, y;
+  if (!AsBfloat16(a, &x) || !AsBfloat16(b, &y)) return nullptr;
+  bool result;
+  switch (op) {
+    case Py_LT:
+      result = x < y;
+      break;
+    case Py_LE:
+      result = x <= y;
+      break;
+    case Py_EQ:
+      result = x == y;
+      break;
+    case Py_NE:
+      result = x != y;
+      break;
+    case Py_GT:
+      result = x > y;
+      break;
+    case Py_GE:
+      result = x >= y;
+      break;
+    default:
+      LOG(FATAL) << "Invalid op type " << op;
+  }
+  return PyBool_FromLong(result);
+}
+
+// Implementation of repr() for PyBfloat16.
+PyObject* PyBfloat16_Repr(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  string v = strings::StrCat("bfloat16(", static_cast<float>(x), ")");
+  return MakePyString(v);
+}
+
+// Implementation of str() for PyBfloat16.
+PyObject* PyBfloat16_Str(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  string v = strings::StrCat(static_cast<float>(x));
+  return MakePyString(v);
+}
+
+// Hash function for PyBfloat16. We use the identity function, which is a weak
+// hash function.
+HashType PyBfloat16_Hash(PyObject* self) {
+  bfloat16 x = reinterpret_cast<PyBfloat16*>(self)->value;
+  return x.value;
+}
+
+// Python type for PyBfloat16 objects.
+PyTypeObject PyBfloat16_Type = {
+#if PY_MAJOR_VERSION < 3
+    PyObject_HEAD_INIT(nullptr) 0,  // ob_size
+#else
+    PyVarObject_HEAD_INIT(nullptr, 0)
+#endif
+    "bfloat16",                                // tp_name
+    sizeof(PyBfloat16),                        // tp_basicsize
+    0,                                         // tp_itemsize
+    nullptr,                                   // tp_dealloc
+    nullptr,                                   // tp_print
+    nullptr,                                   // tp_getattr
+    nullptr,                                   // tp_setattr
+    nullptr,                                   // tp_compare / tp_reserved
+    PyBfloat16_Repr,                           // tp_repr
+    &PyBfloat16_AsNumber,                      // tp_as_number
+    nullptr,                                   // tp_as_sequence
+    nullptr,                                   // tp_as_mapping
+    PyBfloat16_Hash,                           // tp_hash
+    nullptr,                                   // tp_call
+    PyBfloat16_Str,                            // tp_str
+    nullptr,                                   // tp_getattro
+    nullptr,                                   // tp_setattro
+    nullptr,                                   // tp_as_buffer
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  // tp_flags
+    "bfloat16 floating-point values",          // tp_doc
+    nullptr,                                   // tp_traverse
+    nullptr,                                   // tp_clear
+    PyBfloat16_RichCompare,                    // tp_richcompare
+    0,                                         // tp_weaklistoffset
+    nullptr,                                   // tp_iter
+    nullptr,                                   // tp_iternext
+    nullptr,                                   // tp_methods
+    nullptr,                                   // tp_members
+    nullptr,                                   // tp_getset
+    nullptr,                                   // tp_base
+    nullptr,                                   // tp_dict
+    nullptr,                                   // tp_descr_get
+    nullptr,                                   // tp_descr_set
+    0,                                         // tp_dictoffset
+    nullptr,                                   // tp_init
+    nullptr,                                   // tp_alloc
+    PyBfloat16_New,                            // tp_new
+    nullptr,                                   // tp_free
+    nullptr,                                   // tp_is_gc
+    nullptr,                                   // tp_bases
+    nullptr,                                   // tp_mro
+    nullptr,                                   // tp_cache
+    nullptr,                                   // tp_subclasses
+    nullptr,                                   // tp_weaklist
+    nullptr,                                   // tp_del
+    0,                                         // tp_version_tag
+};
+
+// Numpy support
+
+PyArray_ArrFuncs NPyBfloat16_ArrFuncs;
+
+PyArray_Descr NPyBfloat16_Descr = {
+    PyObject_HEAD_INIT(nullptr) & PyBfloat16_Type,  // typeobj
+    // We must register bfloat16 with a kind other than "f", because numpy
+    // considers two types with the same kind and size to be equal, but
+    // float16 != bfloat16.
+    'V',  // kind
+    // TODO(phawkins): there doesn't seem to be a way of guaranteeing a type
+    // character is unique.
+    'E',                                                  // type
+    '=',                                                  // byteorder
+    NPY_NEEDS_PYAPI | NPY_USE_GETITEM | NPY_USE_SETITEM,  // hasobject
+    0,                                                    // type_num
+    sizeof(bfloat16),                                     // elsize
+    alignof(bfloat16),                                    // alignment
+    nullptr,                                              // subarray
+    nullptr,                                              // fields
+    nullptr,                                              // names
+    &NPyBfloat16_ArrFuncs,                                // f
+};
+
+// Registered numpy type ID. Global variable populated by the registration code.
+int npy_bfloat16_ = -1;
+
+// Implementations of NumPy array methods.
+
+PyObject* NPyBfloat16_GetItem(void* data, void* arr) {
+  bfloat16 x;
+  memcpy(&x, data, sizeof(bfloat16));
+  return PyBfloat16_FromBfloat16(x).release();
+}
+
+int NPyBfloat16_SetItem(PyObject* item, void* data, void* arr) {
+  bfloat16 x;
+  if (!AsBfloat16(item, &x)) return -1;
+  memcpy(data, &x, sizeof(bfloat16));
+  return 0;
+}
+
+void ByteSwap16(void* value) {
+  char* p = reinterpret_cast<char*>(value);
+  std::swap(p[0], p[1]);
+}
+
+void NPyBfloat16_CopySwapN(void* dstv, npy_intp dstride, void* srcv,
+                           npy_intp sstride, npy_intp n, int swap, void* arr) {
+  char* dst = reinterpret_cast<char*>(dstv);
+  char* src = reinterpret_cast<char*>(srcv);
+  if (!src) {
+    return;
+  }
+  if (swap) {
+    for (npy_intp i = 0; i < n; i++) {
+      char* r = dst + dstride * i;
+      memcpy(r, src + sstride * i, sizeof(uint16_t));
+      ByteSwap16(r);
+    }
+  } else if (dstride == sizeof(uint16_t) && sstride == sizeof(uint16_t)) {
+    memcpy(dst, src, n * sizeof(uint16_t));
+  } else {
+    for (npy_intp i = 0; i < n; i++) {
+      memcpy(dst + dstride * i, src + sstride * i, sizeof(uint16_t));
+    }
+  }
+}
+
+void NPyBfloat16_CopySwap(void* dst, void* src, int swap, void* arr) {
+  if (!src) {
+    return;
+  }
+  memcpy(dst, src, sizeof(uint16_t));
+  if (swap) {
+    ByteSwap16(dst);
+  }
+}
+
+npy_bool NPyBfloat16_NonZero(void* data, void* arr) {
+  bfloat16 x;
+  memcpy(&x, data, sizeof(x));
+  return x != static_cast<bfloat16>(0);
+}
+
+// NumPy casts
+
+// Performs a NumPy array cast from type 'From' to 'To'.
+template <typename From, typename To>
+void NPyCast(void* from_void, void* to_void, npy_intp n, void* fromarr,
+             void* toarr) {
+  const From* from = reinterpret_cast<From*>(from_void);
+  To* to = reinterpret_cast<To*>(to_void);
+  for (npy_intp i = 0; i < n; ++i) {
+    to[i] = static_cast<To>(from[i]);
+  }
+}
+
+// Registers a cast between bfloat16 and type 'T'. 'numpy_type' is the NumPy
+// type corresponding to 'T'. If 'cast_is_safe', registers that bfloat16 can be
+// safely coerced to T.
+template <typename T>
+bool RegisterBfloat16Cast(int numpy_type, bool cast_is_safe) {
+  if (PyArray_RegisterCastFunc(PyArray_DescrFromType(numpy_type), npy_bfloat16_,
+                               NPyCast<T, bfloat16>) < 0) {
+    return false;
+  }
+  if (PyArray_RegisterCastFunc(&NPyBfloat16_Descr, numpy_type,
+                               NPyCast<bfloat16, T>) < 0) {
+    return false;
+  }
+  if (cast_is_safe && PyArray_RegisterCanCast(&NPyBfloat16_Descr, numpy_type,
+                                              NPY_NOSCALAR) < 0) {
+    return false;
+  }
+  return true;
+}
+
+// Initializes the module.
+bool Initialize() {
+  // We hit a mysterious crash if we haven't initialized numpy before this:
+  PyBfloat16_Type.tp_base = &PyGenericArrType_Type;
+
+  if (PyType_Ready(&PyBfloat16_Type) < 0) {
+    return false;
+  }
+
+  // Initializes the NumPy descriptor.
+  PyArray_InitArrFuncs(&NPyBfloat16_ArrFuncs);
+  NPyBfloat16_ArrFuncs.getitem = NPyBfloat16_GetItem;
+  NPyBfloat16_ArrFuncs.setitem = NPyBfloat16_SetItem;
+  NPyBfloat16_ArrFuncs.copyswapn = NPyBfloat16_CopySwapN;
+  NPyBfloat16_ArrFuncs.copyswap = NPyBfloat16_CopySwap;
+  NPyBfloat16_ArrFuncs.nonzero = NPyBfloat16_NonZero;
+
+  Py_TYPE(&NPyBfloat16_Descr) = &PyArrayDescr_Type;
+  npy_bfloat16_ = PyArray_RegisterDataType(&NPyBfloat16_Descr);
+  if (npy_bfloat16_ < 0) return false;
+
+  // Support dtype(bfloat16)
+  if (PyDict_SetItemString(PyBfloat16_Type.tp_dict, "dtype",
+                           reinterpret_cast<PyObject*>(&NPyBfloat16_Descr)) <
+      0) {
+    return false;
+  }
+
+  // Register casts
+
+  // We lie shamelessly and say that a cast from half to bfloat16 is safe.
+  // Numpy frequently uses the smallest legal representation type for small
+  // float constants (e.g., 1.0), which is often float16. Things break if these
+  // cannot be converted transparently to bfloat16.
+  if (!RegisterBfloat16Cast<Eigen::half>(NPY_HALF, /*cast_is_safe=*/true)) {
+    return false;
+  }
+
+  if (!RegisterBfloat16Cast<float>(NPY_FLOAT, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<double>(NPY_DOUBLE, /*cast_is_safe=*/true)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<int32>(NPY_INT32, /*cast_is_safe=*/false)) {
+    return false;
+  }
+  if (!RegisterBfloat16Cast<int64>(NPY_INT64, /*cast_is_safe=*/false)) {
+    return false;
+  }
+
+  return true;
+}
+
+}  // namespace
+
+void RegisterNumpyBfloat16() {
+  if (npy_bfloat16_ >= 0) {
+    // Already initialized.
+    return;
+  }
+  if (!Initialize()) {
+    if (!PyErr_Occurred()) {
+      PyErr_SetString(PyExc_RuntimeError, "cannot load bfloat16 module.");
+    }
+    PyErr_Print();
+  }
+}
+
+PyObject* Bfloat16PyType() {
+  CHECK(PyBfloat16_Type.tp_base != nullptr);
+  Py_INCREF(&PyBfloat16_Type);
+  return reinterpret_cast<PyObject*>(&PyBfloat16_Type);
+}
+
+int Bfloat16NumpyType() {
+  CHECK_GE(npy_bfloat16_, 0);
+  return npy_bfloat16_;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/bfloat16.h b/tensorflow/python/lib/core/bfloat16.h
new file mode 100644
index 0000000000000000000000000000000000000000..a609928ba9029af00553a4664bef18d3749e64db
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.h
@@ -0,0 +1,34 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
+
+#include <Python.h>
+
+namespace tensorflow {
+
+// Register the bfloat16 numpy type.
+void RegisterNumpyBfloat16();
+
+// Returns the PyObject for the bfloat16 type.
+PyObject* Bfloat16PyType();
+
+// Returns the id number of the bfloat16 numpy type.
+int Bfloat16NumpyType();
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_BFLOAT16_H_
diff --git a/tensorflow/python/lib/core/bfloat16.i b/tensorflow/python/lib/core/bfloat16.i
new file mode 100644
index 0000000000000000000000000000000000000000..10444b676b2549e0d9f96391f96e7a523f768d85
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16.i
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%{
+#include "tensorflow/python/lib/core/bfloat16.h"
+%}
+
+%init %{
+tensorflow::RegisterNumpyBfloat16();
+%}
+
+%{
+PyObject* TF_bfloat16_type() {
+  return tensorflow::Bfloat16PyType();
+}
+%}
+
+PyObject* TF_bfloat16_type();
diff --git a/tensorflow/python/lib/core/bfloat16_test.py b/tensorflow/python/lib/core/bfloat16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..02af33d98bc6ef3036a2e95797dfe2b1c71d11cf
--- /dev/null
+++ b/tensorflow/python/lib/core/bfloat16_test.py
@@ -0,0 +1,200 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Test cases for the bfloat16 Python type."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+# pylint: disable=unused-import,g-bad-import-order
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.platform import test
+
+
+bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
+
+
+class Bfloat16Test(test.TestCase):
+
+  def float_values(self):
+    """Returns values that should round trip exactly to float and back."""
+    epsilon = float.fromhex("1.0p-7")
+    return [
+        0.0, 1.0, -1, 0.5, -0.5, epsilon, 1.0 + epsilon, 1.0 - epsilon,
+        -1.0 - epsilon, -1.0 + epsilon, 3.5, 42.0, 255.0, 256.0,
+        float("inf"), float("-inf"), float("nan")]
+
+  def _assertFloatIdentical(self, v, w):
+    if math.isnan(v):
+      self.assertTrue(math.isnan(w))
+    else:
+      self.assertEqual(v, w)
+
+  def testRoundTripToFloat(self):
+    for v in self.float_values():
+      self._assertFloatIdentical(v, float(bfloat16(v)))
+
+  def testRoundTripToInt(self):
+    for v in [-256, -255, -34, -2, -1, 0, 1, 2, 10, 47, 128, 255, 256, 512]:
+      self.assertEqual(v, int(bfloat16(v)))
+
+  def testStr(self):
+    self.assertEqual("0", str(bfloat16(0.0)))
+    self.assertEqual("1", str(bfloat16(1.0)))
+    self.assertEqual("-3.5", str(bfloat16(-3.5)))
+    self.assertEqual("0.0078125", str(bfloat16(float.fromhex("1.0p-7"))))
+    self.assertEqual("inf", str(bfloat16(float("inf"))))
+    self.assertEqual("-inf", str(bfloat16(float("-inf"))))
+    self.assertEqual("nan", str(bfloat16(float("nan"))))
+
+  def testRepr(self):
+    self.assertEqual("bfloat16(0)", repr(bfloat16(0)))
+    self.assertEqual("bfloat16(1)", repr(bfloat16(1)))
+    self.assertEqual("bfloat16(-3.5)", repr(bfloat16(-3.5)))
+    self.assertEqual("bfloat16(0.0078125)",
+                     repr(bfloat16(float.fromhex("1.0p-7"))))
+    self.assertEqual("bfloat16(inf)", repr(bfloat16(float("inf"))))
+    self.assertEqual("bfloat16(-inf)", repr(bfloat16(float("-inf"))))
+    self.assertEqual("bfloat16(nan)", repr(bfloat16(float("nan"))))
+
+  def testHash(self):
+    self.assertEqual(0, hash(bfloat16(0.0)))
+    self.assertEqual(0x3f80, hash(bfloat16(1.0)))
+    self.assertEqual(0x7fc0, hash(bfloat16(float("nan"))))
+
+  # Tests for Python operations
+  def testNegate(self):
+    for v in self.float_values():
+      self._assertFloatIdentical(-v, float(-bfloat16(v)))
+
+  def testAdd(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) + bfloat16(0)))
+    self._assertFloatIdentical(1, float(bfloat16(1) + bfloat16(0)))
+    self._assertFloatIdentical(0, float(bfloat16(1) + bfloat16(-1)))
+    self._assertFloatIdentical(5.5, float(bfloat16(2) + bfloat16(3.5)))
+    self._assertFloatIdentical(1.25, float(bfloat16(3.5) + bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("inf")) + bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("-inf")) + bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) + bfloat16(float("nan")))))
+
+  def testSub(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) - bfloat16(0)))
+    self._assertFloatIdentical(1, float(bfloat16(1) - bfloat16(0)))
+    self._assertFloatIdentical(2, float(bfloat16(1) - bfloat16(-1)))
+    self._assertFloatIdentical(-1.5, float(bfloat16(2) - bfloat16(3.5)))
+    self._assertFloatIdentical(5.75, float(bfloat16(3.5) - bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(-2.25) - bfloat16(float("inf"))))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(-2.25) - bfloat16(float("-inf"))))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) - bfloat16(float("nan")))))
+
+  def testMul(self):
+    self._assertFloatIdentical(0, float(bfloat16(0) * bfloat16(0)))
+    self._assertFloatIdentical(0, float(bfloat16(1) * bfloat16(0)))
+    self._assertFloatIdentical(-1, float(bfloat16(1) * bfloat16(-1)))
+    self._assertFloatIdentical(-7.875, float(bfloat16(3.5) * bfloat16(-2.25)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("inf")) * bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("-inf")) * bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) * bfloat16(float("nan")))))
+
+  def testDiv(self):
+    self.assertTrue(math.isnan(float(bfloat16(0) / bfloat16(0))))
+    self._assertFloatIdentical(float("inf"), float(bfloat16(1) / bfloat16(0)))
+    self._assertFloatIdentical(-1, float(bfloat16(1) / bfloat16(-1)))
+    self._assertFloatIdentical(-1.75, float(bfloat16(3.5) / bfloat16(-2)))
+    self._assertFloatIdentical(float("-inf"),
+                               float(bfloat16(float("inf")) / bfloat16(-2.25)))
+    self._assertFloatIdentical(float("inf"),
+                               float(bfloat16(float("-inf")) / bfloat16(-2.25)))
+    self.assertTrue(math.isnan(float(bfloat16(3.5) / bfloat16(float("nan")))))
+
+  def testLess(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v < w, bfloat16(v) < bfloat16(w))
+
+  def testLessEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v <= w, bfloat16(v) <= bfloat16(w))
+
+  def testGreater(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v > w, bfloat16(v) > bfloat16(w))
+
+  def testGreaterEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v >= w, bfloat16(v) >= bfloat16(w))
+
+  def testEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v == w, bfloat16(v) == bfloat16(w))
+
+  def testNotEqual(self):
+    for v in self.float_values():
+      for w in self.float_values():
+        self.assertEqual(v != w, bfloat16(v) != bfloat16(w))
+
+
+class Bfloat16NumPyTest(test.TestCase):
+
+  def testDtype(self):
+    self.assertEqual(bfloat16, np.dtype(bfloat16))
+
+  def testArray(self):
+    x = np.array([[1, 2, 3]], dtype=bfloat16)
+    self.assertEqual(bfloat16, x.dtype)
+    self.assertEqual("[[bfloat16(1) bfloat16(2) bfloat16(3)]]", str(x))
+    self.assertAllEqual(x, x)
+    self.assertAllClose(x, x)
+
+  def testCasts(self):
+    for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
+      x = np.array([[1, 2, 3]], dtype=dtype)
+      y = x.astype(bfloat16)
+      z = y.astype(dtype)
+      self.assertTrue(np.all(x == y))
+      self.assertEqual(bfloat16, y.dtype)
+      self.assertTrue(np.all(x == z))
+      self.assertEqual(dtype, z.dtype)
+
+  def testAdd(self):
+    x = np.array([[1, 2, 3]], dtype=bfloat16)
+    y = np.array([[4, 5, 6]], dtype=bfloat16)
+    self.assertAllClose(np.array([[5, 7, 9]]), x + y)
+
+  def testLogSumExp(self):
+    x = np.array([[1, 2, 3]], dtype=np.float32)
+    y = np.array([[4, 5, 6]], dtype=np.float32)
+    self.assertAllClose(np.logaddexp(x, y),
+                        np.logaddexp(x.astype(bfloat16), y.astype(bfloat16)),
+                        atol=2e-2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index cf2c2e6eb00cccf82adf3c9eb65b685130a2f632..994af69386b278f6b88c051f898cd6a9dc607f3f 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/lib/core/bfloat16.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
 namespace tensorflow {
@@ -125,6 +126,10 @@ Status PyArray_TYPE_to_TF_DataType(PyArrayObject* array,
       // custom struct type.
       return PyArrayDescr_to_TF_DataType(descr, out_tf_datatype);
     default:
+      if (pyarray_type == Bfloat16NumpyType()) {
+        *out_tf_datatype = TF_BFLOAT16;
+        break;
+      }
       // TODO(mrry): Support these.
       return errors::Internal("Unsupported feed type");
   }
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 82c45f5a315d485585b1514634201225f4123de1..65e2178cda498294ffc4a5066b5692132e86180f 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/python/lib/core/bfloat16.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
 
 namespace tensorflow {
@@ -175,7 +176,7 @@ Status TF_DataType_to_PyArray_TYPE(TF_DataType tf_datatype,
       *out_pyarray_type = NPY_INT32;
       break;
     case TF_BFLOAT16:
-      *out_pyarray_type = NPY_UINT16;
+      *out_pyarray_type = Bfloat16NumpyType();
       break;
     default:
       return errors::Internal("Tensorflow type ", tf_datatype,
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index a62847614c6d230a7c65a6f461187f1a170613cd..eae1c2eea693434e755ddd3795dc4d37861cd6ce 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -18,21 +18,25 @@ limitations under the License.
 #include <array>
 
 #include "numpy/arrayobject.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/python/eager/pywrap_tfe.h"
 #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h"
+#include "tensorflow/python/lib/core/py_util.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
 #include <Python.h>
 
 namespace tensorflow {
 namespace {
 
-static mutex mu;
+static mutex mu(LINKER_INITIALIZED);
 static PyObject* py_trampoline GUARDED_BY(mu) = nullptr;
 
 // Returns the py_trampoline that is used to pass the control to the
@@ -48,6 +52,9 @@ struct PyCall {
   // with this "token".
   string token;
 
+  // True if the call is associated with an EagerPyFunc.
+  bool eager;
+
   // Inputs and outputs of this function invocation.
   std::vector<Tensor> ins;
   std::vector<Tensor> out;
@@ -55,19 +62,26 @@ struct PyCall {
 
 // Givens the 'call', prepares the token and inputs as a python tuple
 // that is appropriate for calling the trampoline.
-Status MakeArgTuple(PyCall* call, PyObject** tuple) {
+Status MakeArgTuple(const PyCall* call, PyObject** tuple) {
   int64 n = call->ins.size();
   PyObject* lst = PyList_New(n);
   CHECK(lst);
   for (int64 i = 0; i < n; ++i) {
+    PyObject* arg = nullptr;
     const Tensor& t = call->ins[i];
-    PyObject* a = nullptr;
-    Status s = ConvertTensorToNdarray(t, &a);
-    if (!s.ok()) {
-      Py_DECREF(lst);
-      return s;
+    if (call->eager) {
+      arg = EagerTensorFromHandle(TFE_NewTensorHandle(t));
+      if (arg == nullptr) {
+        return errors::Internal("Unable to procure EagerTensor from Tensor.");
+      }
+    } else {
+      Status s = ConvertTensorToNdarray(t, &arg);
+      if (!s.ok()) {
+        Py_DECREF(lst);
+        return s;
+      }
     }
-    PyList_SetItem(lst, i, a);
+    PyList_SetItem(lst, i, arg);
   }
   *tuple = Py_BuildValue("(sN)", call->token.c_str(), lst);
   CHECK(*tuple);
@@ -133,50 +147,21 @@ bool IsSingleNone(PyObject* obj) {
   return item == Py_None;
 }
 
-// py.__class__.__name__
-const char* ClassName(PyObject* py) {
-/* PyPy doesn't have a separate C API for old-style classes. */
-#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
-  if (PyClass_Check(py))
-    return PyString_AS_STRING(
-        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
-  if (PyInstance_Check(py))
-    return PyString_AS_STRING(CHECK_NOTNULL(
-        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
-#endif
-  if (Py_TYPE(py) == &PyType_Type) {
-    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
-  }
-  return Py_TYPE(py)->tp_name;
-}
-
-string PyExcFetch() {
-  CHECK(PyErr_Occurred()) << "Must only call PyExcFetch after an exception.";
-  PyObject* ptype;
-  PyObject* pvalue;
-  PyObject* ptraceback;
-  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
-  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
-  string err = ClassName(ptype);
-  if (pvalue) {
-    PyObject* str = PyObject_Str(pvalue);
-    if (str) {
-#if PY_MAJOR_VERSION < 3
-      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
-#else
-      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
-#endif
-      Py_DECREF(str);
-    }
-    Py_DECREF(pvalue);
-  }
-  Py_DECREF(ptype);
-  Py_XDECREF(ptraceback);
-  return err;
+// Retrieves a Tensor from `eager_tensor` and stores it in `output_tensor`.
+Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
+                                    Tensor* output_tensor,
+                                    TF_Status* tf_status) {
+  // TODO(akshayka): Lift the restriction requiring output tensors to
+  // lie in host memory; EagerPyFunc should be able to dispatch ops on GPU
+  // tensors, so we should eventually implement a GPU kernel for EagerPyFunc.
+  *output_tensor = *TFE_TensorHandleUnderlyingTensorInHostMemory(
+      EagerTensor_Handle(eager_tensor), tf_status);
+  return StatusFromTF_Status(tf_status);
 }
 
 // Calls the registered py function through the trampoline.
-Status DoCallPyFunc(PyCall* call) {
+Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
+  *out_log_on_error = true;
   PyObject* trampoline = GetPyTrampoline();
   if (trampoline == nullptr) {
     return errors::InvalidArgument(
@@ -194,17 +179,18 @@ Status DoCallPyFunc(PyCall* call) {
     if (PyErr_Occurred()) {
       if (PyErr_ExceptionMatches(PyExc_ValueError) ||
           PyErr_ExceptionMatches(PyExc_TypeError)) {
-        return errors::InvalidArgument(PyExcFetch());
+        return errors::InvalidArgument(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
-        return errors::OutOfRange(PyExcFetch());
+        *out_log_on_error = false;
+        return errors::OutOfRange(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_MemoryError)) {
-        return errors::ResourceExhausted(PyExcFetch());
+        return errors::ResourceExhausted(PyExceptionFetch());
       } else if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
-        return errors::Unimplemented(PyExcFetch());
+        return errors::Unimplemented(PyExceptionFetch());
       } else {
         // TODO(ebrevdo): Check if exception is an OpError and use the
         // OpError.error_code property to map it back in the Status.
-        return errors::Unknown(PyExcFetch());
+        return errors::Unknown(PyExceptionFetch());
       }
     } else {
       return errors::Internal("Failed to run py callback ", call->token,
@@ -212,21 +198,37 @@ Status DoCallPyFunc(PyCall* call) {
     }
   }
 
-  // Process the return values and converts them to tf Tensors.
+  // Process the return values and convert them to TF Tensors.
   Status s;
   if (PyList_Check(result)) {
-    // 'result' is a list.
     call->out.clear();
     for (int i = 0; i < PyList_Size(result); ++i) {
       Tensor t;
-      s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
+      if (call->eager) {
+        auto tf_status = tensorflow::make_safe(TF_NewStatus());
+        s = ExtractTensorFromEagerTensor(PyList_GetItem(result, i), &t,
+                                         tf_status.get());
+      } else {
+        s = ConvertNdarrayToTensor(PyList_GetItem(result, i), &t);
+      }
+
       if (!s.ok()) {
         break;
       }
       call->out.push_back(t);
     }
+  } else if (EagerTensor_CheckExact(result) || result == Py_None) {
+    DCHECK(call->eager);
+    Tensor t;
+    if (result != Py_None) {
+      auto tf_status = tensorflow::make_safe(TF_NewStatus());
+      s = ExtractTensorFromEagerTensor(result, &t, tf_status.get());
+      if (s.ok()) {
+        call->out.push_back(t);
+      }
+    }
   } else if (PyArray_Check(result)) {
-    // 'result' is a single ndarray.
+    DCHECK(!call->eager);
     if (!IsSingleNone(result)) {
       Tensor t;
       s = ConvertNdarrayToTensor(result, &t);
@@ -415,22 +417,32 @@ class PyFuncOp : public OpKernel {
  public:
   explicit PyFuncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("token", &token_));
+    eager_ = type_string() == "EagerPyFunc";
   }
 
   void Compute(OpKernelContext* ctx) override {
     PyCall call;
     call.token = token_;
+    call.eager = eager_;
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       call.ins.push_back(ctx->input(i));
     }
 
     PyGILState_STATE py_threadstate;
     py_threadstate = PyGILState_Ensure();
-    Status s = DoCallPyFunc(&call);
+    bool log_on_error;
+    Status s = DoCallPyFunc(&call, &log_on_error);
     PyGILState_Release(py_threadstate);
 
     // Ensures that GIL is released even when !s.ok().
-    OP_REQUIRES_OK(ctx, s);
+    if (!s.ok()) {
+      if (log_on_error) {
+        ctx->CtxFailureWithWarning(s);
+      } else {
+        ctx->CtxFailure(s);
+      }
+      return;
+    }
 
     OP_REQUIRES(ctx, static_cast<int32>(call.out.size()) == ctx->num_outputs(),
                 errors::InvalidArgument(token_, " returns ", call.out.size(),
@@ -450,9 +462,15 @@ class PyFuncOp : public OpKernel {
  private:
   string token_;
 
+  // True if and only if this op should execute the python function eagerly,
+  // i.e., if and only if the eager attribute is set.
+  bool eager_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(PyFuncOp);
 };
+
 REGISTER_KERNEL_BUILDER(Name("PyFunc").Device(DEVICE_CPU), PyFuncOp);
 REGISTER_KERNEL_BUILDER(Name("PyFuncStateless").Device(DEVICE_CPU), PyFuncOp);
+REGISTER_KERNEL_BUILDER(Name("EagerPyFunc").Device(DEVICE_CPU), PyFuncOp);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_func.h b/tensorflow/python/lib/core/py_func.h
index 5a451d5f43285d19dff6c158ebc28045b3ff13d4..3197a7ddfa0ce3db9f8244215690e5ede5096ac2 100644
--- a/tensorflow/python/lib/core/py_func.h
+++ b/tensorflow/python/lib/core/py_func.h
@@ -24,21 +24,27 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Called by py code on initialization.
+// Called by python code on initialization.
 //
 // "trampoline" must represent a python function which has the
 // following signature:
-//   (string, list(ndarray)) -> ndarray | list(ndarray) | python scalar
+//   (string, list(ndarray)) | (string, list(EagerTensor)) ->
+//     ndarray | list(ndarray) | python scalar |
+//     EagerTensor | list(EagerTensor) | None
 //
 // The trampoline takes two arguments, the first is a string token
 // used by the python frontend's dispatching logic; the second is a
-// list of numpy ndarrays.
+// list of numpy ndarrays or EagerTensor objects. It can return a
+// single numpy ndarray, a list of numpy ndarrays, a python scalar, an
+// EagerTensor, a list of EagerTensors, or None.
 //
-// The trampoline can return a single numpy ndarray, a list of numpy
-// ndarrays, or a simply python scalar. The C++ runtime converts them,
-// if supported, back to Tensor objects.
+// PyFunc requires inputs and outputs to be ndarrays. EagerPyFunc requires
+// inputs to be a list of EagerTensors and outputs to be an EagerTensor, a list
+// of EagerTensors, or None.
 //
-// This is called by script_ops.py during its module initialization.
+// The C++ runtime converts outputs back to Tensor objects.
+//
+// This function is called by script_ops.py during its module initialization.
 //
 // TODO(zhifengc): Support distributed runtime.
 void InitializePyTrampoline(PyObject* trampoline);
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 3b40e1c94c01f95af2d3f2728547bd5f244ff21f..317bdc2e14747583f372808f48a5928273f5570a 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
+#include "tensorflow/python/lib/core/py_util.h"
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
@@ -36,13 +37,7 @@ inline PyObject* PyType(PyObject* obj) {
 }
 
 bool IsPyString(PyObject* obj) {
-  // TODO(josh11b): Support unicode strings in Python 2? bytearrays? NumPy string
-  // types?
-#if PY_MAJOR_VERSION >= 3
   return PyBytes_Check(obj) || PyUnicode_Check(obj);
-#else
-  return PyBytes_Check(obj);
-#endif
 }
 
 bool IsPyInt(PyObject* obj) {
@@ -95,12 +90,25 @@ Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
       *dtype = DT_STRING;
     } else if (PySequence_Check(obj)) {
       auto length = PySequence_Length(obj);
-      shape->AddDim(length);
       if (length > 0) {
+        shape->AddDim(length);
         obj = PySequence_GetItem(obj, 0);
         continue;
-      } else {
+      } else if (length == 0) {
+        shape->AddDim(length);
         *dtype = DT_INVALID;  // Invalid dtype for empty tensors.
+      } else {
+        // The sequence does not have a valid length (PySequence_Length < 0).
+        if (PyErr_Occurred()) {
+          // PySequence_Length failed and set an exception. Fetch the message
+          // and convert it to a failed status.
+          return errors::InvalidArgument(PyExceptionFetch());
+        } else {
+          // This is almost certainly dead code: PySequence_Length failed but
+          // did not set an exception.
+          return errors::InvalidArgument(
+              "Attempted to convert an invalid sequence to a Tensor.");
+        }
       }
     } else if (IsPyFloat(obj)) {
       *dtype = DT_DOUBLE;
@@ -309,15 +317,21 @@ const char* ConvertOneString(PyObject* v, string* out) {
     out->assign(PyBytes_AS_STRING(v), PyBytes_GET_SIZE(v));
     return nullptr;
   }
-#if PY_MAJOR_VERSION >= 3
   if (PyUnicode_Check(v)) {
+#if PY_MAJOR_VERSION >= 3
     Py_ssize_t size;
     const char* str = PyUnicode_AsUTF8AndSize(v, &size);
     if (str == nullptr) return ErrorConvertingUnicodeString;
     out->assign(str, size);
     return nullptr;
-  }
+#else
+    PyObject* py_str = PyUnicode_AsUTF8String(v);
+    if (py_str == nullptr) return ErrorConvertingUnicodeString;
+    out->assign(PyBytes_AS_STRING(py_str), PyBytes_GET_SIZE(py_str));
+    Py_DECREF(py_str);
+    return nullptr;
 #endif
+  }
   return ErrorMixedTypes;
 }
 
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2635694e23c07dd8e75d4bb0cfb9e83a2042d921
--- /dev/null
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -0,0 +1,70 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include <Python.h>
+
+namespace tensorflow {
+namespace {
+
+// py.__class__.__name__
+const char* ClassName(PyObject* py) {
+/* PyPy doesn't have a separate C API for old-style classes. */
+#if PY_MAJOR_VERSION < 3 && !defined(PYPY_VERSION)
+  if (PyClass_Check(py))
+    return PyString_AS_STRING(
+        CHECK_NOTNULL(reinterpret_cast<PyClassObject*>(py)->cl_name));
+  if (PyInstance_Check(py))
+    return PyString_AS_STRING(CHECK_NOTNULL(
+        reinterpret_cast<PyInstanceObject*>(py)->in_class->cl_name));
+#endif
+  if (Py_TYPE(py) == &PyType_Type) {
+    return reinterpret_cast<PyTypeObject*>(py)->tp_name;
+  }
+  return Py_TYPE(py)->tp_name;
+}
+
+}  // end namespace
+
+string PyExceptionFetch() {
+  CHECK(PyErr_Occurred())
+      << "Must only call PyExceptionFetch after an exception.";
+  PyObject* ptype;
+  PyObject* pvalue;
+  PyObject* ptraceback;
+  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
+  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
+  string err = ClassName(ptype);
+  if (pvalue) {
+    PyObject* str = PyObject_Str(pvalue);
+    if (str) {
+#if PY_MAJOR_VERSION < 3
+      strings::StrAppend(&err, ": ", PyString_AS_STRING(str));
+#else
+      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str));
+#endif
+      Py_DECREF(str);
+    }
+    Py_DECREF(pvalue);
+  }
+  Py_DECREF(ptype);
+  Py_XDECREF(ptraceback);
+  return err;
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..44dfe7ba21285d06667a8d0f6ab8ac0ec8f2aa00
--- /dev/null
+++ b/tensorflow/python/lib/core/py_util.h
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+// Fetch the exception message as a string. An exception must be set
+// (PyErr_Occurred() must be true).
+string PyExceptionFetch();
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_UTIL_H_
diff --git a/tensorflow/python/lib/core/safe_ptr.cc b/tensorflow/python/lib/core/safe_ptr.cc
index 456ea3348baa634075082fedde9dac175e237997..ce34b6d0041878c4122d36ab8bf9db6c17253680 100644
--- a/tensorflow/python/lib/core/safe_ptr.cc
+++ b/tensorflow/python/lib/core/safe_ptr.cc
@@ -16,25 +16,21 @@ limitations under the License.
 #include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
-namespace {
 
-inline void Py_DECREF_wrapper(PyObject* o) { Py_DECREF(o); }
-
-}  // namespace
-
-Safe_PyObjectPtr make_safe(PyObject* o) {
-  return Safe_PyObjectPtr(o, Py_DECREF_wrapper);
+Safe_PyObjectPtr make_safe(PyObject* object) {
+  return Safe_PyObjectPtr(object);
 }
 
 Safe_TF_TensorPtr make_safe(TF_Tensor* tensor) {
-  return Safe_TF_TensorPtr(tensor, TF_DeleteTensor);
+  return Safe_TF_TensorPtr(tensor);
 }
 
 Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle) {
-  return Safe_TFE_TensorHandlePtr(handle, TFE_DeleteTensorHandle);
+  return Safe_TFE_TensorHandlePtr(handle);
 }
 
 Safe_TF_StatusPtr make_safe(TF_Status* status) {
-  return Safe_TF_StatusPtr(status, TF_DeleteStatus);
+  return Safe_TF_StatusPtr(status);
 }
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/safe_ptr.h b/tensorflow/python/lib/core/safe_ptr.h
index 70cd2fdf6ccf4d722892f06e1e3aa40919b63ac7..80db840aebcc7ca341b0f6c40fdaee2136d21aaa 100644
--- a/tensorflow/python/lib/core/safe_ptr.h
+++ b/tensorflow/python/lib/core/safe_ptr.h
@@ -17,39 +17,51 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_PYTHON_LIB_CORE_SAFE_PTR_H_
 
 #include <memory>
-#include <Python.h>
 
+#include <Python.h>
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/eager/c_api.h"
 
 namespace tensorflow {
+namespace detail {
+
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+struct TFTensorDeleter {
+  void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); }
+};
+
+struct TFETensorHandleDeleter {
+  void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); }
+};
+
+struct TFStatusDeleter {
+  void operator()(TF_Status* p) const { TF_DeleteStatus(p); }
+};
+
+}  // namespace detail
 
 // Safe container for an owned PyObject. On destruction, the reference count of
 // the contained object will be decremented.
-typedef void (*Py_DECREF_wrapper_type)(PyObject*);
-typedef std::unique_ptr<PyObject, Py_DECREF_wrapper_type> Safe_PyObjectPtr;
+using Safe_PyObjectPtr = std::unique_ptr<PyObject, detail::PyDecrefDeleter>;
 Safe_PyObjectPtr make_safe(PyObject* o);
 
 // Safe containers for an owned TF_Tensor. On destruction, the tensor will be
 // deleted by TF_DeleteTensor.
-// Note: can't use decltype(&TF_DeleteTensor) due to SWIG
-typedef void (*TF_DeleteTensor_type)(TF_Tensor*);
-typedef std::unique_ptr<TF_Tensor, TF_DeleteTensor_type> Safe_TF_TensorPtr;
+using Safe_TF_TensorPtr = std::unique_ptr<TF_Tensor, detail::TFTensorDeleter>;
 Safe_TF_TensorPtr make_safe(TF_Tensor* tensor);
 
 // Safe containers for an owned TFE_TensorHandle. On destruction, the handle
-// will be deleted by TFE_DeleteTensorHandle. Note: can't use
-// decltype(&TFE_DeleteTensorHandle) due to SWIG
-typedef void (*TFE_DeleteTensorHandle_type)(TFE_TensorHandle*);
-typedef std::unique_ptr<TFE_TensorHandle, TFE_DeleteTensorHandle_type>
-    Safe_TFE_TensorHandlePtr;
+// will be deleted by TFE_DeleteTensorHandle.
+using Safe_TFE_TensorHandlePtr =
+    std::unique_ptr<TFE_TensorHandle, detail::TFETensorHandleDeleter>;
 Safe_TFE_TensorHandlePtr make_safe(TFE_TensorHandle* handle);
 
 // Safe containers for an owned TF_Status. On destruction, the handle
-// will be deleted by TF_DeleteStatus. Note: can't use
-// decltype(&TF_DeleteStatus) due to SWIG
-typedef void (*TF_DeleteStatus_type)(TF_Status*);
-typedef std::unique_ptr<TF_Status, TF_DeleteStatus_type> Safe_TF_StatusPtr;
+// will be deleted by TF_DeleteStatus.
+using Safe_TF_StatusPtr = std::unique_ptr<TF_Status, detail::TFStatusDeleter>;
 Safe_TF_StatusPtr make_safe(TF_Status* status);
 
 }  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/strings.i b/tensorflow/python/lib/core/strings.i
index 938c13e30eb7b00a8225c8e95c7d53f2dd8398c3..9d807e51be0d203c433befb7614b2e5cd4e7358d 100644
--- a/tensorflow/python/lib/core/strings.i
+++ b/tensorflow/python/lib/core/strings.i
@@ -40,7 +40,7 @@ limitations under the License.
 // Returns true on success, false on failure.
 bool _BytesToStringPiece(PyObject* obj, tensorflow::StringPiece* result) {
   if (obj == Py_None) {
-    result->clear();
+    *result = tensorflow::StringPiece();
   } else {
     char* ptr;
     Py_ssize_t len;
@@ -48,7 +48,7 @@ bool _BytesToStringPiece(PyObject* obj, tensorflow::StringPiece* result) {
       // Python has raised an error (likely TypeError or UnicodeEncodeError).
       return false;
     }
-    result->set(ptr, len);
+    *result = tensorflow::StringPiece(ptr, len);
   }
   return true;
 }
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 2ee298ad44e6ea12a204779a4e2eec68015a2d3a..55cae0bcbfca8a9cacfe525fe3b69c7fb232acd3 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from math import ceil
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
@@ -102,32 +103,46 @@ def _ConcatGradHelper(op, grad, start_value_index, end_value_index, dim_index):
 
   concat_dim = op.inputs[dim_index]
   input_values = op.inputs[start_value_index:end_value_index]
-  # Using mod here for convenience since concat_dim is already verified
-  # in concat implementation to be within the allowed [-rank, rank) range.
-  non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
 
   out_grads = []
   if isinstance(grad, ops.Tensor):
-    # Get the inputs' tensor shapes
-    sizes = _ExtractInputShapes(input_values)
-    # The magic number of 16 was found through benchmarking a range of sizes
-    # on CPUs and a Maxwell TitanX.  A speedup was seen in a large majority of
-    # cases when switching implementations at N=16, but it is possible that
-    # there will be a small number of performance regressions.
-    # pylint: disable=protected-access
-    if len(sizes) > 16:
-      # extract the size of each input along the concat dimension
-      sizes = array_ops.squeeze(
-          array_ops.slice(
-              array_ops.stack(
-                  sizes, axis=1), [non_neg_concat_dim, 0], [1, -1]))
+    if context.in_eager_mode():
+      # Using mod here for convenience since concat_dim is already verified
+      # in concat implementation to be within the allowed [-rank, rank) range.
+      non_neg_concat_dim = (
+          concat_dim._numpy().item(0) % input_values[0]._rank())  # pylint: disable=protected-access
+      # All inputs are guaranteed to be EagerTensors in eager mode
+      sizes = pywrap_tensorflow.TFE_Py_TensorShapeSlice(input_values,
+                                                        non_neg_concat_dim)
       out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
     else:
-      offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
-      for (begin, size) in zip(offset, sizes):
-        out_grads.append(array_ops.slice(grad, begin, size))
-    # pylint: enable=protected-access
+      # Using mod here for convenience since concat_dim is already verified
+      # in concat implementation to be within the allowed [-rank, rank) range.
+      non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
+
+      # Get the inputs' tensor shapes
+      sizes = _ExtractInputShapes(input_values)
+      # The magic number of 16 was found through benchmarking a range of sizes
+      # on CPUs and a Maxwell TitanX.  A speedup was seen in a large majority of
+      # cases when switching implementations at N=16, but it is possible that
+      # there will be a small number of performance regressions.
+      # pylint: disable=protected-access
+      if len(sizes) > 16:
+        # extract the size of each input along the concat dimension
+        sizes = array_ops.squeeze(
+            array_ops.slice(
+                array_ops.stack(
+                    sizes, axis=1), [non_neg_concat_dim, 0], [1, -1]))
+        out_grads = array_ops.split(grad, sizes, non_neg_concat_dim)
+      else:
+        offset = gen_array_ops._concat_offset(non_neg_concat_dim, sizes)
+        for (begin, size) in zip(offset, sizes):
+          out_grads.append(array_ops.slice(grad, begin, size))
+      # pylint: enable=protected-access
   elif isinstance(grad, ops.IndexedSlices):
+    # Using mod here for convenience since concat_dim is already verified
+    # in concat implementation to be within the allowed [-rank, rank) range.
+    non_neg_concat_dim = concat_dim % array_ops.rank(input_values[0])
     concat_dim_static = tensor_util.constant_value(concat_dim)
     if concat_dim_static is None:
       raise ValueError("Can only compute IndexedSlices gradient with "
@@ -445,7 +460,11 @@ def _GatherNdGrad(op, grad):
   ref = op.inputs[0]
   indices = op.inputs[1]
   ref_shape = array_ops.shape(ref, out_type=indices.dtype)
-  ref_grad = array_ops.scatter_nd(indices, grad, ref_shape)
+  if indices.shape.ndims == 2 and indices.shape[-1].value == 1:
+    ref_grad = ops.IndexedSlices(grad, array_ops.squeeze(indices, axis=-1),
+                                 ref_shape)
+  else:
+    ref_grad = array_ops.scatter_nd(indices, grad, ref_shape)
   return [ref_grad, None]
 
 
@@ -505,6 +524,16 @@ def _TransposeGrad(op, grad):
   return [array_ops.transpose(grad, array_ops.invert_permutation(p)), None]
 
 
+@ops.RegisterGradient("ConjugateTranspose")
+def _ConjugateTransposeGrad(op, grad):
+  """Returns conj(unshuffle(grad))."""
+  p = op.inputs[1]
+  return [
+      array_ops.transpose(
+          grad, array_ops.invert_permutation(p), conjugate=True), None
+  ]
+
+
 ops.NotDifferentiable("Shape")
 
 
@@ -626,14 +655,22 @@ def _BatchToSpaceNDGrad(op, grad):
 def _SpaceToDepthGrad(op, grad):
   # Its gradient is the opposite op: DepthToSpace.
   block_size = op.get_attr("block_size")
-  return array_ops.depth_to_space(grad, block_size)
+  data_format = op.get_attr("data_format")
+  if data_format == "NCHW_VECT_C":
+    raise ValueError("Cannot compute SpaceToDepth gradient with NCHW_VECT_C. "
+                     "NCHW_VECT_C requires qint8 data type.")
+  return array_ops.depth_to_space(grad, block_size, data_format=data_format)
 
 
 @ops.RegisterGradient("DepthToSpace")
 def _DepthToSpaceGrad(op, grad):
   # Its gradient is the opposite op: SpaceToDepth.
   block_size = op.get_attr("block_size")
-  return array_ops.space_to_depth(grad, block_size)
+  data_format = op.get_attr("data_format")
+  if data_format == "NCHW_VECT_C":
+    raise ValueError("Cannot compute DepthToSpace gradient with NCHW_VECT_C. "
+                     "NCHW_VECT_C requires qint8 data type.")
+  return array_ops.space_to_depth(grad, block_size, data_format=data_format)
 
 
 ops.NotDifferentiable("OneHot")
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index e783fc29ebfdab0fa6f7c70f529ab1f9e7cd0958..74b405681b5b6cbda7df207a6deb2a172d858743 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -70,6 +70,7 @@ See the @{$python/array_ops} guide.
 @@quantize_v2
 @@quantized_concat
 @@setdiff1d
+@@guarantee_const
 @@fake_quant_with_min_max_args
 @@fake_quant_with_min_max_args_gradient
 @@fake_quant_with_min_max_vars
@@ -109,7 +110,7 @@ newaxis = None
 
 # We override the 'slice' for the "slice" op, so we keep python's
 # existing 'slice' for later use in this module.
-_baseslice = slice
+_BaseSlice = slice
 
 
 def identity(input, name=None):  # pylint: disable=redefined-builtin
@@ -125,11 +126,8 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   if context.in_graph_mode():
     return gen_array_ops.identity(input, name=name)
   else:
-    try:
-      in_device = input.device
-    except AttributeError:
-      input = ops.convert_to_tensor(input)
-      in_device = input.device
+    input = ops.convert_to_tensor(input)
+    in_device = input.device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
     if context.context().device_name != in_device:
       return input._copy()  # pylint: disable=protected-access
@@ -306,6 +304,32 @@ def shape_internal(input, name=None, optimize=True, out_type=dtypes.int32):
       return gen_array_ops.shape(input, name=name, out_type=out_type)
 
 
+def shape_n(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  """Returns shape of tensors.
+
+  Args:
+    input: A list of at least 1 `Tensor` object with the same type.
+    out_type: The specified output type of the operation
+      (`int32` or `int64`). Defaults to `tf.int32`(optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    A list with the same length as `input` of `Tensor` objects with
+      type `out_type`.
+  """
+
+  output = gen_array_ops.shape_n(input, out_type=out_type, name=name)
+  if context.in_graph_mode():
+    for i, input_tensor in enumerate(input):
+      input_tensor = ops.convert_to_tensor(input_tensor)
+      input_shape = input_tensor.get_shape()
+      if input_shape.is_fully_defined():
+        output[i] = constant(
+            input_shape.as_list(), dtype=out_type, name=name)
+  return output
+
+
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -328,7 +352,7 @@ def size(input, name=None, out_type=dtypes.int32):
 
   Returns:
     A `Tensor` of type `out_type`. Defaults to `tf.int32`.
-    
+
   @compatibility(numpy)
   Equivalent to np.size()
   @end_compatibility
@@ -419,7 +443,7 @@ def rank_internal(input, name=None, optimize=True):
       return gen_array_ops.rank(input, name=name)
 
 
-def _SliceHelper(tensor, slice_spec, var=None):
+def _slice_helper(tensor, slice_spec, var=None):
   """Overload for Tensor.__getitem__.
 
   This operation extracts the specified region from the tensor.
@@ -482,7 +506,7 @@ def _SliceHelper(tensor, slice_spec, var=None):
   begin_mask, end_mask = 0, 0
   ellipsis_mask = 0
   for s in slice_spec:
-    if isinstance(s, _baseslice):
+    if isinstance(s, _BaseSlice):
       # python doesn't always use None when constructing ranges
       # for example a[:] gives slice(None,sys.maxsize,None)
       # whereas a[::1] gives slice(None,None,None)
@@ -549,7 +573,7 @@ def _SliceHelper(tensor, slice_spec, var=None):
         name=name)
 
 
-# pylint: disable=undefined-variable,protected-access
+# pylint: disable=undefined-variable,protected-access,redefined-outer-name
 def slice(input_, begin, size, name=None):
   # pylint: disable=redefined-builtin
   """Extracts a slice from a tensor.
@@ -615,40 +639,35 @@ def strided_slice(input_,
                   name=None):
   """Extracts a strided slice of a tensor (generalized python array indexing).
 
-  **Most users will want to use @{tf.Tensor.__getitem__} and
-  @{tf.Variable.__getitem__}.** That allows  NumPy style slicing syntax (i.e.
-  `tensor[..., 3:4:-1, tf.newaxis, 3]`).
-  This op is the low-level interface that are used to implement operators.
-  Those interfaces are much more friendly, and highly recommended.
-
-  To a first order, this operation extracts a slice of size `end - begin`
-  from a tensor `input`
-  starting at the location specified by `begin`. The slice continues by adding
-  `stride` to the `begin` index until all dimensions are not less than `end`.
-  Note that components of stride can be negative, which causes a reverse
-  slice.
-
-  This operation can be thought of an encoding of a numpy style sliced
-  range. Given a python slice input[<spec0>, <spec1>, ..., <specn>]
+  **Instead of calling this op directly most users will want to use the
+  NumPy-style slicing syntax (e.g. `tensor[..., 3:4:-1, tf.newaxis, 3]`), which
+  is supported via @{tf.Tensor.__getitem__} and @{tf.Variable.__getitem__}.**
+  The interface of this op is a low-level encoding of the slicing syntax.
+
+  Roughly speaking, this op extracts a slice of size `(end-begin)/stride`
+  from the given `input_` tensor. Starting at the location specified by `begin`
+  the slice continues by adding `stride` to the index until all dimensions are
+  not less than `end`.
+  Note that a stride can be negative, which causes a reverse slice.
+
+  Given a Python slice `input[spec0, spec1, ..., specn]`,
   this function will be called as follows.
 
-  `begin`, `end`, and `strides` will be all length n. n is in general
-  not the same dimensionality as `input`.
+  `begin`, `end`, and `strides` will be vectors of length n.
+  n in general is not equal to the rank of the `input_` tensor.
 
-  For the ith spec,
-  `begin_mask`, `end_mask`, `ellipsis_mask`, `new_axis_mask`,
-  and `shrink_axis_mask` will have the ith bit corresponding to
+  In each mask field (`begin_mask`, `end_mask`, `ellipsis_mask`,
+  `new_axis_mask`, `shrink_axis_mask`) the ith bit will correspond to
   the ith spec.
 
-  If the ith bit of `begin_mask` is non-zero, `begin[i]` is ignored and
+  If the ith bit of `begin_mask` is set, `begin[i]` is ignored and
   the fullest possible range in that dimension is used instead.
   `end_mask` works analogously, except with the end range.
 
   `foo[5:,:,:3]` on a 7x8x9 tensor is equivalent to `foo[5:7,0:8,0:3]`.
   `foo[::-1]` reverses a tensor with shape 8.
 
-
-  If the ith bit of `ellipsis_mask` is non-zero, as many unspecified dimensions
+  If the ith bit of `ellipsis_mask` is set, as many unspecified dimensions
   as needed will be inserted between other dimensions. Only one
   non-zero bit is allowed in `ellipsis_mask`.
 
@@ -656,20 +675,21 @@ def strided_slice(input_,
   equivalent to `foo[3:5,:,:,4:5]` and
   `foo[3:5,...]` is equivalent to `foo[3:5,:,:,:]`.
 
-  If the ith bit of `new_axis_mask` is one, then `begin`,
+  If the ith bit of `new_axis_mask` is set, then `begin`,
   `end`, and `stride` are ignored and a new length 1 dimension is
   added at this point in the output tensor.
 
-  For example `foo[3:5,4]` on a 10x8 tensor produces a shape 2 tensor
-  whereas `foo[3:5,4:5]` produces a shape 2x1 tensor with shrink_mask
-  being 1<<1 == 2.
+  For example,
+  `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+
+  If the ith bit of `shrink_axis_mask` is set, it implies that the ith
+  specification shrinks the dimensionality by 1. `begin[i]`, `end[i]` and
+  `strides[i]` must imply a slice of size 1 in the dimension. For example in
+  Python one might do `foo[:, 3, :]` which would result in
+  `shrink_axis_mask` equal to 2.
 
-  If the ith bit of `shrink_axis_mask` is one, then `begin`,
-  `end[i]`, and `stride[i]` are used to do a slice in the appropriate
-  dimension, but the output tensor will be reduced in dimensionality
-  by one. This is only valid if the ith entry of slice[i]==1.
 
-  NOTE: `begin` and `end` are zero-indexed`.
+  NOTE: `begin` and `end` are zero-indexed.
   `strides` entries must be non-zero.
 
 
@@ -788,10 +808,10 @@ def _SliceHelperVar(var, slice_spec):
 
   """
 
-  return _SliceHelper(var._AsTensor(), slice_spec, var)
+  return _slice_helper(var._AsTensor(), slice_spec, var)
 
 
-ops.Tensor._override_operator("__getitem__", _SliceHelper)
+ops.Tensor._override_operator("__getitem__", _slice_helper)
 
 
 def parallel_stack(values, name="parallel_stack"):
@@ -1110,7 +1130,7 @@ def concat(values, axis, name="concat"):
   return gen_array_ops._concat_v2(values=values, axis=axis, name=name)
 
 
-def boolean_mask(tensor, mask, name="boolean_mask"):
+def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
   """Apply boolean mask to tensor.  Numpy equivalent is `tensor[mask]`.
 
   ```python
@@ -1124,11 +1144,17 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
   the first K dimensions of `tensor`'s shape.  We then have:
     `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]`
   where `(i1,...,iK)` is the ith `True` entry of `mask` (row-major order).
+  The `axis` could be used with `mask` to indicate the axis to mask from.
+  In that case, `axis + dim(mask) <= dim(tensor)` and `mask`'s shape must match
+  the first `axis + dim(mask)` dimensions of `tensor`'s shape.
 
   Args:
     tensor:  N-D tensor.
     mask:  K-D boolean tensor, K <= N and K must be known statically.
     name:  A name for this operation (optional).
+    axis:  A 0-D int Tensor representing the axis in `tensor` to mask from.
+      By default, axis is 0 which will mask from the first dimension. Otherwise
+      K + axis <= N.
 
   Returns:
     (N-K+1)-dimensional tensor populated by entries in `tensor` corresponding
@@ -1147,10 +1173,10 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
   ```
   """
 
-  def _apply_mask_1d(reshaped_tensor, mask):
+  def _apply_mask_1d(reshaped_tensor, mask, axis=None):
     """Mask tensor along dimension 0 with a 1-D mask."""
     indices = squeeze(where(mask), squeeze_dims=[1])
-    return gather(reshaped_tensor, indices)
+    return gather(reshaped_tensor, indices, axis=axis)
 
   with ops.name_scope(name, values=[tensor, mask]):
     tensor = ops.convert_to_tensor(tensor, name="tensor")
@@ -1165,19 +1191,23 @@ def boolean_mask(tensor, mask, name="boolean_mask"):
       raise ValueError(
           "Number of mask dimensions must be specified, even if some dimensions"
           " are None.  E.g. shape=[None] is ok, but shape=None is not.")
-    shape_tensor[:ndims_mask].assert_is_compatible_with(shape_mask)
+    axis = 0 if axis is None else axis
+    shape_tensor[axis:axis + ndims_mask].assert_is_compatible_with(shape_mask)
 
-    leading_size = gen_math_ops._prod(shape(tensor)[:ndims_mask], [0])
+    leading_size = gen_math_ops._prod(
+        shape(tensor)[axis:axis + ndims_mask], [0])
     tensor = reshape(tensor,
-                     concat([[leading_size],
-                             shape(tensor)[ndims_mask:]], 0))
-    first_dim = shape_tensor[:ndims_mask].num_elements()
+                     concat([
+                         shape(tensor)[:axis], [leading_size],
+                         shape(tensor)[axis + ndims_mask:]
+                     ], 0))
+    first_dim = shape_tensor[axis:axis + ndims_mask].num_elements()
     tensor.set_shape(
-        tensor_shape.as_shape([first_dim])
-        .concatenate(shape_tensor[ndims_mask:]))
+        tensor_shape.as_shape(shape_tensor[:axis]).concatenate([first_dim])
+        .concatenate(shape_tensor[axis + ndims_mask:]))
 
     mask = reshape(mask, [-1])
-    return _apply_mask_1d(tensor, mask)
+    return _apply_mask_1d(tensor, mask, axis)
 
 
 def sparse_mask(a, mask_indices, name=None):
@@ -1251,7 +1281,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
   Args:
     value: The `Tensor` to split.
     num_or_size_splits: Either a 0-D integer `Tensor` indicating the number of
-      splits along split_dim or a 1-D integer `Tensor` integer tensor containing
+      splits along split_dim or a 1-D integer `Tensor` containing
       the sizes of each output tensor along split_dim. If a scalar then it must
       evenly divide `value.shape[axis]`; otherwise the sum of sizes along the
       split dimension must match that of the `value`.
@@ -1271,21 +1301,21 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
     ValueError: If `num` is unspecified and cannot be inferred.
   """
   size_splits = ops.convert_to_tensor(num_or_size_splits)
-  if size_splits.get_shape().ndims == 0 and size_splits.dtype.is_integer:
+  if size_splits._rank() == 0 and size_splits.dtype.is_integer:
     return gen_array_ops._split(
-        split_dim=axis, num_split=num_or_size_splits, value=value, name=name)
-  else:
+        axis=axis, num_split=num_or_size_splits, value=value, name=name)
+
+  if num is None:
+    num = size_splits._shape_tuple()[0]
     if num is None:
-      size_splits_shape = size_splits.get_shape()
-      num = size_splits_shape.dims[0]
-      if num._value is None:
-        raise ValueError("Cannot infer num from shape %s" % num_or_size_splits)
-    return gen_array_ops._split_v(
-        value=value,
-        size_splits=size_splits,
-        split_dim=axis,
-        num_split=num,
-        name=name)
+      raise ValueError("Cannot infer num from shape %s" % num_or_size_splits)
+
+  return gen_array_ops._split_v(
+      value=value,
+      size_splits=size_splits,
+      axis=axis,
+      num_split=num,
+      name=name)
 
 
 def transpose(a, perm=None, name="transpose", conjugate=False):
@@ -1347,7 +1377,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False):
   with ops.name_scope(name, "transpose", [a]) as name:
     transpose_fn = (
         gen_array_ops._conjugate_transpose
-        if conjugate else gen_array_ops.transpose)
+        if (conjugate and a.dtype.is_complex) else gen_array_ops.transpose)
     if perm is None:
       rank = gen_array_ops.rank(a)
       perm = (rank - 1) - gen_math_ops._range(0, rank, 1)
@@ -1469,6 +1499,10 @@ def zeros(shape, dtype=dtypes.float32, name=None):
     if context.in_eager_mode() and dtype != dtypes.bool:
       return fill(shape, constant(zero, dtype=dtype), name=name)
     try:
+      if isinstance(shape, ops.Tensor):
+        # TODO(apassos) this is required to reproduce the behavior from before
+        # Tensors were iterable. It's a crutch.
+        raise TypeError
       shape = tensor_shape.as_shape(shape)
       output = constant(zero, shape=shape, dtype=dtype, name=name)
     except (TypeError, ValueError):
@@ -1495,7 +1529,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   Args:
     tensor: A `Tensor`.
     dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-    `int8`, `int16`, `int32`, `int64`, `uint8`, `complex64`, or `complex128`.
+      `int8`, `uint8`, `int16`, `uint16`, int32`, `int64`,
+      `complex64`, `complex128` or `bool`.
     name: A name for the operation (optional).
     optimize: if true, attempt to statically determine the shape of 'tensor'
     and encode it as a constant.
@@ -1516,8 +1551,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
     # For now, variant types must be created via zeros_like; as we need to
     # pass the input variant object to the proper zeros callback.
 
-    if optimize and tensor.shape.is_fully_defined() and \
-        tensor.dtype != dtypes.variant:
+    if (optimize and tensor.shape.is_fully_defined() and
+        tensor.dtype != dtypes.variant):
       # We can produce a zeros tensor independent of the value of 'tensor',
       # since the shape is known statically.
       return zeros(tensor.shape, dtype=dtype or tensor.dtype, name=name)
@@ -1546,8 +1581,8 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
   Args:
     tensor: A `Tensor`.
     dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-      `int8`, `int16`, `int32`, `int64`, `uint8`, `complex64`, `complex128` or
-      `bool`.
+      `int8`, `uint8`, `int16`, `uint16`, int32`, `int64`,
+      `complex64`, `complex128` or `bool`.
     name: A name for the operation (optional).
     optimize: if true, attempt to statically determine the shape of 'tensor'
     and encode it as a constant.
@@ -1591,6 +1626,9 @@ def ones(shape, dtype=dtypes.float32, name=None):
   with ops.name_scope(name, "ones", [shape]) as name:
     one = True if dtype == dtypes.bool else 1
     try:
+      if isinstance(shape, ops.Tensor):
+        raise TypeError(
+            "preserving semantics from before tensors were iterable")
       shape = tensor_shape.as_shape(shape)
       output = constant(one, shape=shape, dtype=dtype, name=name)
     except (TypeError, ValueError):
@@ -1620,6 +1658,8 @@ def placeholder(dtype, shape=None, name=None):
     print(sess.run(y, feed_dict={x: rand_array}))  # Will succeed.
   ```
 
+  @compatibility{eager} Placeholders are not compatible with eager execution.
+
   Args:
     dtype: The type of elements in the tensor to be fed.
     shape: The shape of the tensor to be fed (optional). If the shape is not
@@ -1629,7 +1669,14 @@ def placeholder(dtype, shape=None, name=None):
   Returns:
     A `Tensor` that may be used as a handle for feeding a value, but not
     evaluated directly.
+
+  Raises:
+    RuntimeError: if eager execution is enabled
   """
+  if context.in_eager_mode():
+    raise RuntimeError("tf.placeholder() is not compatible with "
+                       "eager execution.")
+
   return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
 
 
@@ -1673,6 +1720,8 @@ def sparse_placeholder(dtype, shape=None, name=None):
     print(sess.run(y, feed_dict={x: sp_value}))  # Will succeed.
   ```
 
+  @compatibility{eager} Placeholders are not compatible with eager execution.
+
   Args:
     dtype: The type of `values` elements in the tensor to be fed.
     shape: The shape of the tensor to be fed (optional). If the shape is not
@@ -1682,7 +1731,14 @@ def sparse_placeholder(dtype, shape=None, name=None):
   Returns:
     A `SparseTensor` that may be used as a handle for feeding a value, but not
     evaluated directly.
+
+  Raises:
+    RuntimeError: if eager execution is enabled
   """
+  if context.in_eager_mode():
+    raise RuntimeError("tf.placeholder() is not compatible with "
+                       "eager execution.")
+
   shape_name = (name + "/shape") if name is not None else None
   shape, rank = _normalize_sparse_shape(shape, shape_name)
   if shape is None:
@@ -1824,11 +1880,16 @@ def meshgrid(*args, **kwargs):
 
   Args:
     *args: `Tensor`s with rank 1.
-    indexing: Either 'xy' or 'ij' (optional, default: 'xy').
-    name: A name for the operation (optional).
+    **kwargs:
+      - indexing: Either 'xy' or 'ij' (optional, default: 'xy').
+      - name: A name for the operation (optional).
 
   Returns:
     outputs: A list of N `Tensor`s with rank N.
+
+  Raises:
+    TypeError: When no keyword arguments (kwargs) are passed.
+    ValueError: When indexing keyword argument is not one of `xy` or `ij`.
   """
 
   indexing = kwargs.pop("indexing", "xy")
@@ -1859,7 +1920,7 @@ def meshgrid(*args, **kwargs):
       output[1] = reshape(output[1], (-1, 1) + (1,) * (ndim - 2))
       shapes[0], shapes[1] = shapes[1], shapes[0]
 
-    # TODO: improve performance with a broadcast
+    # TODO(nolivia): improve performance with a broadcast
     mult_fact = ones(shapes, output_dtype)
     return [x * mult_fact for x in output]
 
@@ -1869,7 +1930,7 @@ SHRINK_AXIS = -2
 
 
 # PEP-8 naming
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name,redefined-outer-name
 def _compute_size_of_strided_dim(shrink, spec, size):
   """Computes the size of a single strided slice dimension."""
 
@@ -2250,6 +2311,7 @@ def one_hot(indices,
       != i`. (default: 0)
     axis: The axis to fill (default: -1, a new inner-most axis).
     dtype: The data type of the output tensor.
+    name: A name for the operation (optional).
 
   Returns:
     output: The one-hot tensor.
@@ -2264,19 +2326,19 @@ def one_hot(indices,
     on_exists = on_value is not None
     off_exists = off_value is not None
 
-    on_dtype = ops.convert_to_tensor(on_value).dtype.base_dtype if on_exists \
-                  else None
-    off_dtype = ops.convert_to_tensor(off_value).dtype.base_dtype if off_exists\
-                  else None
+    on_dtype = (ops.convert_to_tensor(on_value).dtype.base_dtype if on_exists
+                else None)
+    off_dtype = (ops.convert_to_tensor(off_value).dtype.base_dtype if off_exists
+                 else None)
 
     if on_exists or off_exists:
       if dtype is not None:
         # Ensure provided on_value and/or off_value match dtype
-        if (on_exists and on_dtype != dtype):
-          raise TypeError("dtype {0} of on_value does not match " \
+        if on_exists and on_dtype != dtype:
+          raise TypeError("dtype {0} of on_value does not match "
                           "dtype parameter {1}".format(on_dtype, dtype))
-        if (off_exists and off_dtype != dtype):
-          raise TypeError("dtype {0} of off_value does not match " \
+        if off_exists and off_dtype != dtype:
+          raise TypeError("dtype {0} of off_value does not match "
                           "dtype parameter {1}".format(off_dtype, dtype))
       else:
         # dtype not provided: automatically assign it
@@ -2295,7 +2357,7 @@ def one_hot(indices,
       off_dtype = dtype
 
     if on_dtype != off_dtype:
-      raise TypeError("dtype {0} of on_value does not match " \
+      raise TypeError("dtype {0} of on_value does not match "
                       "dtype {1} of off_value".format(on_dtype, off_dtype))
 
     return gen_array_ops._one_hot(indices, depth, on_value, off_value, axis,
@@ -2473,9 +2535,9 @@ def where(condition, x=None, y=None, name=None):
     with ops.name_scope(name, "Where", [condition]) as name:
       condition = ops.convert_to_tensor(
           condition, preferred_dtype=dtypes.bool, name="condition")
-      return gen_array_ops.where(input=condition, name=name)
+      return gen_array_ops.where(condition=condition, name=name)
   elif x is not None and y is not None:
-    return gen_math_ops._select(condition=condition, t=x, e=y, name=name)
+    return gen_math_ops._select(condition=condition, x=x, y=y, name=name)
   else:
     raise ValueError("x and y must both be non-None or both be None.")
 
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index fa1b219b1771dbd8f99939d8f6571d2a8791433e..75eb100a90ff86dc514e735012922101d693e3d2 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -36,7 +36,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
 
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
-                  dtypes.uint8, dtypes.uint16]
+                  dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
 
     with self.test_session(use_gpu=True) as sess:
       for dtype in dtype_list:
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index ceee009104c8ac0d87795cf9d594914e899a921b..1377af3eac43a5846353257304ef7e022d3506d4 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -48,6 +48,7 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
@@ -96,10 +97,16 @@ def _maybe_constant_value_string(t):
 
 
 def _assert_static(condition, data):
-  """Raises a static ValueError with as much information as possible."""
+  """Raises a InvalidArgumentError with as much information as possible."""
   if not condition:
     data_static = [_maybe_constant_value_string(x) for x in data]
-    raise ValueError('\n'.join(data_static))
+    raise errors.InvalidArgumentError(node_def=None, op=None,
+                                      message='\n'.join(data_static))
+
+
+def _shape_and_dtype_str(tensor):
+  """Returns a string containing tensor's shape and dtype."""
+  return 'shape=%s dtype=%s' % (tensor.shape, tensor.dtype.name)
 
 
 def assert_proper_iterable(values):
@@ -157,10 +164,14 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_negative', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
+      if context.in_eager_mode():
+        name = _shape_and_dtype_str(x)
+      else:
+        name = x.name
       data = [
           message,
           'Condition x < 0 did not hold element-wise:',
-          'x (%s) = ' % x.name, x]
+          'x (%s) = ' % name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less(x, zero, data=data, summarize=summarize)
 
@@ -193,9 +204,13 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_positive', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
+      if context.in_eager_mode():
+        name = _shape_and_dtype_str(x)
+      else:
+        name = x.name
       data = [
           message, 'Condition x > 0 did not hold element-wise:',
-          'x (%s) = ' % x.name, x]
+          'x (%s) = ' % name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less(zero, x, data=data, summarize=summarize)
 
@@ -230,7 +245,7 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
       if context.in_eager_mode():
-        name = str(x)
+        name = _shape_and_dtype_str(x)
       else:
         name = x.name
       data = [
@@ -270,10 +285,14 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_non_positive', [x, data]):
     x = ops.convert_to_tensor(x, name='x')
     if data is None:
+      if context.in_eager_mode():
+        name = _shape_and_dtype_str(x)
+      else:
+        name = x.name
       data = [
           message,
           'Condition x <= 0 did not hold element-wise:'
-          'x (%s) = ' % x.name, x]
+          'x (%s) = ' % name, x]
     zero = ops.convert_to_tensor(0, dtype=x.dtype)
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
@@ -303,11 +322,60 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
 
   Returns:
     Op that raises `InvalidArgumentError` if `x == y` is False.
+    @compatibility{eager} returns None
+
+  Raises:
+    InvalidArgumentError if the check can be performed immediately and
+    `x == y` is False. The check can be performed immediately during
+    eager execution or if `x` and `y` are statically known.
   """
   message = message or ''
   with ops.name_scope(name, 'assert_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+
+    if context.in_eager_mode():
+      eq = math_ops.equal(x, y)
+      condition = math_ops.reduce_all(eq)
+      if not condition:
+        # Prepare a message with first elements of x and y
+        summary_msg = ''
+        if summarize:
+          # reshape((-1,)) is the fastest way to get a flat array view.
+          x_np = x.numpy().reshape((-1,))
+          y_np = y.numpy().reshape((-1,))
+          x_sum = min(x_np.size, summarize)
+          y_sum = min(y_np.size, summarize)
+          summary_msg = ('First %d elements of x:\n%s\n'
+                         'First %d elements of y:\n%s\n' %
+                         (x_sum, x_np[:x_sum],
+                          y_sum, y_np[:y_sum]))
+
+        # Get the values that actually differed and their indices
+        mask = math_ops.logical_not(eq)
+        indices = array_ops.where(mask)
+        indices_np = indices.numpy()
+        x_vals = array_ops.boolean_mask(x, mask)
+        y_vals = array_ops.boolean_mask(y, mask)
+        diff_to_print = 0
+        if summarize:
+          diff_to_print = min(summarize, indices_np.size)
+
+        raise errors.InvalidArgumentError(
+            node_def=None, op=None,
+            message=('%s\nCondition x == y did not hold.\n'
+                     'Indices of first %s different values:\n%s\n'
+                     'Corresponding x values:\n%s\n'
+                     'Corresponding y values:\n%s\n'
+                     '%s'
+                     %
+                     (message or '',
+                      diff_to_print, indices_np[:diff_to_print],
+                      x_vals.numpy().reshape((-1,))[:diff_to_print],
+                      y_vals.numpy().reshape((-1,))[:diff_to_print],
+                      summary_msg)))
+      return
+
     if data is None:
       data = [
           message,
@@ -356,12 +424,19 @@ def assert_none_equal(
   with ops.name_scope(name, 'assert_none_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
-          'Condition x != y did not hold for every single element:'
-          'x (%s) = ' % x.name, x,
-          'y (%s) = ' % y.name, y
+          'Condition x != y did not hold for every single element:',
+          'x (%s) = ' % x_name, x,
+          'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.not_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -397,11 +472,18 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_less', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
-          'Condition x < y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'Condition x < y did not hold element-wise:',
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.less(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -437,11 +519,18 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_less_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
           'Condition x <= y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.less_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -477,11 +566,18 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   with ops.name_scope(name, 'assert_greater', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
           'Condition x > y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.greater(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -519,11 +615,18 @@ def assert_greater_equal(x, y, data=None, summarize=None, message=None,
   with ops.name_scope(name, 'assert_greater_equal', [x, y, data]):
     x = ops.convert_to_tensor(x, name='x')
     y = ops.convert_to_tensor(y, name='y')
+    if context.in_eager_mode():
+      x_name = _shape_and_dtype_str(x)
+      y_name = _shape_and_dtype_str(y)
+    else:
+      x_name = x.name
+      y_name = y.name
+
     if data is None:
       data = [
           message,
           'Condition x >= y did not hold element-wise:'
-          'x (%s) = ' % x.name, x, 'y (%s) = ' % y.name, y
+          'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y
       ]
     condition = math_ops.reduce_all(math_ops.greater_equal(x, y))
     return control_flow_ops.Assert(condition, data, summarize=summarize)
@@ -611,10 +714,15 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
     static_condition = lambda actual_rank, given_rank: actual_rank == given_rank
     dynamic_condition = math_ops.equal
 
+    if context.in_eager_mode():
+      name = ''
+    else:
+      name = x.name
+
     if data is None:
       data = [
           message,
-          'Tensor %s must have rank' % x.name, rank, 'Received shape: ',
+          'Tensor %s must have rank' % name, rank, 'Received shape: ',
           array_ops.shape(x)
       ]
 
@@ -626,7 +734,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
       if e.args[0] == 'Static rank condition failed':
         raise ValueError(
             '%s.  Tensor %s must have rank %d.  Received rank %d, shape %s' %
-            (message, x.name, e.args[2], e.args[1], x.get_shape()))
+            (message, name, e.args[2], e.args[1], x.get_shape()))
       else:
         raise
 
@@ -669,10 +777,16 @@ def assert_rank_at_least(
 
     static_condition = lambda actual_rank, given_rank: actual_rank >= given_rank
     dynamic_condition = math_ops.greater_equal
+
+    if context.in_eager_mode():
+      name = ''
+    else:
+      name = x.name
+
     if data is None:
       data = [
           message,
-          'Tensor %s must have rank at least' % x.name, rank,
+          'Tensor %s must have rank at least' % name, rank,
           'Received shape: ', array_ops.shape(x)
       ]
 
@@ -684,7 +798,7 @@ def assert_rank_at_least(
       if e.args[0] == 'Static rank condition failed':
         raise ValueError(
             '%s.  Tensor %s must have rank at least %d.  Received rank %d, '
-            'shape %s' % (message, x.name, e.args[2], e.args[1], x.get_shape()))
+            'shape %s' % (message, name, e.args[2], e.args[1], x.get_shape()))
       else:
         raise
 
@@ -791,9 +905,14 @@ def assert_rank_in(
     ranks = tuple([ops.convert_to_tensor(rank, name='rank') for rank in ranks])
     message = message or ''
 
+    if context.in_eager_mode():
+      name = ''
+    else:
+      name = x.name
+
     if data is None:
       data = [
-          message, 'Tensor %s must have rank in' % x.name
+          message, 'Tensor %s must have rank in' % name
       ] + list(ranks) + [
           'Received shape: ', array_ops.shape(x)
       ]
@@ -806,7 +925,7 @@ def assert_rank_in(
       if e.args[0] == 'Static rank condition failed':
         raise ValueError(
             '%s.  Tensor %s must have rank in %s.  Received rank %d, '
-            'shape %s' % (message, x.name, e.args[2], e.args[1], x.get_shape()))
+            'shape %s' % (message, name, e.args[2], e.args[1], x.get_shape()))
       else:
         raise
 
@@ -838,9 +957,13 @@ def assert_integer(x, message=None, name=None):
   with ops.name_scope(name, 'assert_integer', [x]):
     x = ops.convert_to_tensor(x, name='x')
     if not x.dtype.is_integer:
+      if context.in_eager_mode():
+        name = 'tensor'
+      else:
+        name = x.name
       err_msg = (
           '%s  Expected "x" to be integer type.  Found: %s of dtype %s'
-          % (message, x.name, x.dtype))
+          % (message, name, x.dtype))
       raise TypeError(err_msg)
 
     return control_flow_ops.no_op('statically_determined_was_integer')
@@ -1014,6 +1137,10 @@ def assert_scalar(tensor, name=None):
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
     shape = tensor.get_shape()
     if shape.ndims != 0:
-      raise ValueError('Expected scalar shape for %s, saw shape: %s.'
-                       % (tensor.name, shape))
+      if context.in_eager_mode():
+        raise ValueError('Expected scalar shape, saw shape: %s.'
+                         % (shape,))
+      else:
+        raise ValueError('Expected scalar shape for %s, saw shape: %s.'
+                         % (tensor.name, shape))
     return tensor
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 3c082b19b6b79491dc6572c056084932d8697a2d..97b57177b29986a006df992f4c0c2b79e11467aa 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,undefined-variable
@@ -52,7 +53,8 @@ def _SwitchGrad(op, *grad):
       # TODO(yuanbyu): Perform shape inference with this new input.
       if grad[1] is not None:
         # pylint: disable=protected-access
-        control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1])
+        control_flow_ops._AddNextAndBackEdge(merge_grad, grad[1],
+                                             enforce_shape_invariant=False)
         # pylint: enable=protected-access
       return None, None
     elif grad[0] is not None:
@@ -69,13 +71,12 @@ def _SwitchGrad(op, *grad):
       # meaning the output is not differentiable.
       return None, None
   elif isinstance(op_ctxt, CondContext):
-    good_grad = grad[op_ctxt.branch]
     zero_grad = grad[1 - op_ctxt.branch]
     # At this point, we have created zero_grad guarded by the right switch.
     # Unfortunately, we may still get None here for not trainable data types.
     if zero_grad is None:
       return None, None
-    return merge([good_grad, zero_grad], name="cond_grad")[0], None
+    return merge(grad, name="cond_grad")[0], None
   else:
     false_grad = switch(grad[0], op.inputs[1])[0]
     true_grad = switch(grad[1], op.inputs[1])[1]
@@ -92,7 +93,7 @@ def _MergeGrad(op, grad, _):
   input_op = op.inputs[0].op
   graph = ops.get_default_graph()
   # pylint: disable=protected-access
-  op_ctxt = control_flow_ops._GetOutputContext(input_op)
+  op_ctxt = control_flow_util.GetOutputContext(input_op)
   grad_ctxt = graph._get_control_flow_context()
   # pylint: enable=protected-access
   if isinstance(op_ctxt, WhileContext):
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 10d8e01304342c42a4ee20a2c9b3e4a4817d7c95..8e8e7d4f8c391615fdd4b1a99976619fd81e8407 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -60,11 +60,13 @@ from tensorflow.core.protobuf import control_flow_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util as util
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
@@ -86,6 +88,29 @@ from tensorflow.python.util import tf_should_use
 _basetuple = tuple
 
 
+def _summarize_eager(tensor, summarize=None):
+  """Returns a summarized string representation of eager `tensor`.
+
+  Args:
+    tensor: EagerTensor to summarize
+    summarize: Include these many first elements of `array`
+  """
+  # reshape((-1,)) is the fastest way to get a flat array view
+  if tensor._rank():  # pylint: disable=protected-access
+    flat = tensor.numpy().reshape((-1,))
+    lst = [str(x) for x in flat[:summarize]]
+    if len(lst) < flat.size:
+      lst.append("...")
+  else:
+    # tensor.numpy() returns a scalar for zero dimensional arrays
+    if summarize != 0:
+      lst = [str(tensor.numpy())]
+    else:
+      lst = []
+
+  return ", ".join(lst)
+
+
 # pylint: disable=protected-access
 
 
@@ -98,7 +123,8 @@ def Assert(condition, data, summarize=None, name=None):
   If `condition` evaluates to false, print the list of tensors in `data`.
   `summarize` determines how many entries of the tensors to print.
 
-  NOTE: To ensure that Assert executes, one usually attaches a dependency:
+  NOTE: In graph mode, to ensure that Assert executes, one usually attaches
+  a dependency:
 
   ```python
   # Ensure maximum element of x is smaller or equal to 1
@@ -117,7 +143,21 @@ def Assert(condition, data, summarize=None, name=None):
     assert_op: An `Operation` that, when executed, raises a
     `tf.errors.InvalidArgumentError` if `condition` is not true.
     @compatibility{eager} returns None.
+
+  Raises:
+    @compatibility{eager} `tf.errors.InvalidArgumentError` if `condition`
+    is not true
   """
+  if context.in_eager_mode():
+    if not condition:
+      xs = ops.convert_n_to_tensor(data)
+      data_str = [_summarize_eager(x, summarize) for x in xs]
+      raise errors.InvalidArgumentError(
+          node_def=None, op=None,
+          message="Expected '%s' to be true. Summarized data: %s" % (
+              condition, "\n".join(data_str)))
+    return
+
   with ops.name_scope(name, "Assert", [condition, data]) as name:
     xs = ops.convert_n_to_tensor(data)
     if all([x.dtype in {dtypes.string, dtypes.int32} for x in xs]):
@@ -466,29 +506,6 @@ def _convert_flows_to_tensorarrays(tensors_or_tensorarrays, tensors_or_flows):
       for (ta, t_or_flow) in zip(tensors_or_tensorarrays, tensors_or_flows)]
 
 
-def _IsLoopConstantEnter(op):
-  """Return true iff op is a loop invariant."""
-  is_enter = (op.type == "Enter" or op.type == "RefEnter")
-  return is_enter and op.get_attr("is_constant")
-
-
-def _GetLoopConstantEnter(value):
-  """Return the enter op if we can infer `value` to be a loop invariant."""
-  id_ops = {"Switch", "RefSwitch", "Identity", "RefIdentity"}
-  op = value.op
-  while op.type in id_ops:
-    op = op.inputs[0].op
-  return op if _IsLoopConstantEnter(op) else None
-
-
-def _GetOutputContext(op):
-  """Return the control flow context for the output of an op."""
-  ctxt = op._get_control_flow_context()
-  if IsLoopExit(op):
-    ctxt = ctxt.outer_context
-  return ctxt
-
-
 def _ShapeLessThanOrEqual(shape1, shape2):
   if shape2.dims is None:
     return True
@@ -573,6 +590,8 @@ def _EnforceShapeInvariant(merge_var, next_var):
     m_shape = merge_var.get_shape()
     n_shape = next_var.get_shape()
     if not _ShapeLessThanOrEqual(n_shape, m_shape):
+      # TODO(skyewm): get original loop input that caused the shape error and
+      # report its name instead of the merge node's.
       raise ValueError(
           "The shape for %s is not an invariant for the loop. It enters "
           "the loop with shape %s, but has shape %s after one iteration. "
@@ -624,11 +643,17 @@ def _EnforceShapeInvariant(merge_var, next_var):
              n_values_shape, n_indices_shape, n_shape_shape))
 
 
-def _AddNextAndBackEdge(m, v):
+def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
   """Add NextIteration and back edge from v to m."""
   if isinstance(m, ops.Tensor):
     v = ops.convert_to_tensor(v)
     v = _NextIteration(v)
+    if enforce_shape_invariant:
+      # Make sure the shapes of loop outputs are correct. We do this before
+      # calling _update_input, which will raise a less-helpful error message if
+      # the types don't match.
+      # TODO(skyewm): call this for other cases below (needs testing)
+      _EnforceShapeInvariant(m, v)
     m.op._update_input(1, v)   # pylint: disable=protected-access
   elif isinstance(m, ops.IndexedSlices):
     # pylint: disable=protected-access
@@ -879,7 +904,7 @@ class GradLoopState(object):
 
       # Add the stack_push op in the context of value.op.
       swap_enabled = self.forward_context.swap_memory
-      value_ctxt = _GetOutputContext(value.op)
+      value_ctxt = util.GetOutputContext(value.op)
       if value_ctxt == self.forward_context:
         # value is not nested in the forward context.
         self.forward_context.Enter()
@@ -989,7 +1014,7 @@ class GradLoopState(object):
       cur_value = value
       cur_grad_state = self
       while True:
-        enter_op = _GetLoopConstantEnter(cur_value)
+        enter_op = util.GetLoopConstantEnter(cur_value)
         if enter_op:
           # Special case: cur_value comes from a constant Enter node.
           cur_value = enter_op.inputs[0]
@@ -1042,7 +1067,7 @@ class ControlFlowState(object):
 
   def GetGradState(self, op, before):
     """Return the grad state for this op if it's in a forward loop context."""
-    if before and IsLoopExit(op):
+    if before and util.IsLoopExit(op):
       forward_ctxt = op._get_control_flow_context()
       forward_ctxt = forward_ctxt.outer_context
       if forward_ctxt:
@@ -1202,8 +1227,8 @@ class ControlFlowState(object):
     Returns:
       A zero tensor of the same shape of op.outputs[index].
     """
-    if IsLoopSwitch(op): return None
-    dead_branch = IsSwitch(op)
+    if util.IsLoopSwitch(op): return None
+    dead_branch = util.IsSwitch(op)
     forward_ctxt = _GetWhileContext(op)
     grad_state = self._map.get(forward_ctxt)
     if grad_state is None:
@@ -1303,7 +1328,7 @@ def MaybeCreateControlFlowState(between_op_list, between_ops,
   """
   loop_state = None
   for op in between_op_list:
-    if IsLoopExit(op):
+    if util.IsLoopExit(op):
       if loop_state is None:
         loop_state = ControlFlowState()
       if colocate_gradients_with_ops:
@@ -1314,28 +1339,10 @@ def MaybeCreateControlFlowState(between_op_list, between_ops,
   return loop_state
 
 
-def IsSwitch(op):
-  """Return true if `op` is a Switch."""
-  return op.type == "Switch" or op.type == "RefSwitch"
-
-
-def IsLoopExit(op):
-  """Return true if `op` is an Exit."""
-  return op.type == "Exit" or op.type == "RefExit"
-
-
-def IsLoopSwitch(op):
-  """Return true if `op` is the Switch for a while loop."""
-  if IsSwitch(op):
-    ctxt = op._get_control_flow_context()
-    return ctxt and isinstance(ctxt, WhileContext)
-  return False
-
-
 def ZerosLikeOutsideLoop(op, index):
   """Create zeros_like for the specified output of an op."""
   val = op.outputs[index]
-  if not IsSwitch(op):
+  if not util.IsSwitch(op):
     return array_ops.zeros_like(val, optimize=False)
   else:
     op_ctxt = op._get_control_flow_context()
@@ -1472,7 +1479,7 @@ class ControlFlowContext(object):
     return None
 
   def _IsInOuterContext(self, op):
-    op_ctxt = _GetOutputContext(op)
+    op_ctxt = util.GetOutputContext(op)
     outer_ctxt = self.outer_context
     while outer_ctxt != op_ctxt:
       if outer_ctxt is None:
@@ -1490,11 +1497,11 @@ class ControlFlowContext(object):
     else:
       internal_control_inputs = []
       for x in op.control_inputs:
-        ctxt = _GetOutputContext(x)
+        ctxt = util.GetOutputContext(x)
         if ctxt is not None and ctxt.GetWhileContext() == while_ctxt:
           internal_control_inputs.append(x)
     if len(internal_control_inputs) != len(op.control_inputs):
-      del op.control_inputs[:]
+      op._remove_all_control_inputs()
       op._add_control_inputs(internal_control_inputs)
     return internal_control_inputs
   # pylint: enable=protected-access
@@ -1508,6 +1515,12 @@ class ControlFlowContext(object):
     """Returns the pivot node for this context, or None."""
     return None
 
+  def IsWhileContext(self):
+    return False
+
+  def __str__(self):
+    return self.name
+
 
 class CondContext(ControlFlowContext):
   """The context for the conditional construct."""
@@ -1681,7 +1694,7 @@ class CondContext(ControlFlowContext):
         op._add_control_input(self._pivot.op)
       # pylint: enable=protected-access
 
-    if self._outer_context or not IsLoopExit(op):
+    if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
 
     if self._outer_context:
@@ -1725,7 +1738,19 @@ class CondContext(ControlFlowContext):
 
   def BuildCondBranch(self, fn):
     """Add the subgraph defined by fn() to the graph."""
+    pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     original_result = fn()
+    post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
+    if len(post_summaries) > len(pre_summaries):
+      new_summaries = post_summaries[len(pre_summaries):]
+      summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
+      summary_ref[:] = pre_summaries
+      with ops.control_dependencies(new_summaries):
+        if original_result is None:
+          return no_op(), None
+        else:
+          original_result = nest.map_structure(
+              array_ops.identity, original_result)
     if original_result is None:
       return None, None
 
@@ -1838,8 +1863,8 @@ def cond(pred, true_fn=None, false_fn=None, strict=False, name=None,
   with ops.name_scope(name, "cond", [pred]):
     if context.in_eager_mode():
       if pred:
-        return true_fn()
-      return false_fn()
+        return _UnpackIfSingleton(true_fn())
+      return _UnpackIfSingleton(false_fn())
 
     # Add the Switch to the graph.
     if isinstance(pred, bool):
@@ -2139,7 +2164,7 @@ class WhileContext(ControlFlowContext):
         grad_ctxt = grad_ctxt.GetWhileContext()
         if grad_ctxt.grad_state:
           forward_ctxt = _GetWhileContext(val.op)
-          if IsLoopExit(val.op):
+          if util.IsLoopExit(val.op):
             forward_ctxt = forward_ctxt.outer_context
             if forward_ctxt:
               forward_ctxt = forward_ctxt.GetWhileContext()
@@ -2221,7 +2246,7 @@ class WhileContext(ControlFlowContext):
       self._MaybeAddControlDependency(op)
       for x in op.outputs:
         self._values.add(x.name)
-    if self._outer_context or not IsLoopExit(op):
+    if self._outer_context or not util.IsLoopExit(op):
       op.graph.prevent_fetching(op)
       for x in op.outputs:
         op.graph.prevent_feeding(x)
@@ -2240,7 +2265,7 @@ class WhileContext(ControlFlowContext):
         return True
       # pylint: enable=protected-access
       for x in op.inputs:
-        if not _IsLoopConstantEnter(x.op):
+        if not util.IsLoopConstantEnter(x.op):
           return False
       return True
     if _IsOpFree(op):
@@ -2476,9 +2501,17 @@ class WhileContext(ControlFlowContext):
     if shape_acc is not None:
       self.AddName(shape_acc.name)
       init_acc.append(shape_acc)
+
+    # Set use_input_shape=False since the accumulator tensors will grow in
+    # size. If use_input_shape=True, the _update_input call below will result in
+    # incompatible shapes.
     enter_acc = [_Enter(x, self._name, is_constant=False,
                         parallel_iterations=self._parallel_iterations,
-                        name="b_acc") for x in init_acc]
+                        use_input_shape=False, name="b_acc") for x in init_acc]
+    # Manually set appropriate partial shapes.
+    enter_acc[0].set_shape([None])
+    if values_acc.shape.dims is not None:
+      enter_acc[1].set_shape([None] + values_acc.shape.as_list()[1:])
     self.loop_enters.extend(enter_acc)
 
     merge_acc = [merge([x, x], name="b_acc")[0] for x in enter_acc]
@@ -2556,7 +2589,7 @@ class WhileContext(ControlFlowContext):
 
     if control_pivot is not None:
       for var in enter_vars:
-        if _IsLoopConstantEnter(var.op.inputs[0].op):
+        if util.IsLoopConstantEnter(var.op.inputs[0].op):
           # pylint: disable=protected-access
           var.op._add_control_input(control_pivot.op)
           # pylint: enable=protected-access
@@ -2590,9 +2623,23 @@ class WhileContext(ControlFlowContext):
     packed_vars_for_body = nest.pack_sequence_as(
         structure=original_loop_vars,
         flat_sequence=vars_for_body_with_tensor_arrays)
+    pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     body_result = body(*packed_vars_for_body)
+    post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     if not nest.is_sequence(body_result):
       body_result = [body_result]
+    if len(post_summaries) > len(pre_summaries):
+      new_summaries = post_summaries[len(pre_summaries):]
+      summary_ref = ops.get_collection_ref(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
+      summary_ref[:] = pre_summaries
+      with ops.control_dependencies(new_summaries):
+        def map_fn(x):
+          # TODO(apassos) figure out how to trigger with tensor arrays as well
+          if isinstance(x, tensor_array_ops.TensorArray):
+            return x
+          return array_ops.identity(x)
+        body_result = nest.map_structure(map_fn, body_result)
+
     # Compare the structure types of input and output of body.
     # For backwards compatibility, the first layer is forced to a list
     # during this comparison, because inputs are typically lists and
@@ -2618,11 +2665,6 @@ class WhileContext(ControlFlowContext):
     exit_vars = [exit(x[0]) for x in switch_vars]
     self._loop_exits = exit_vars
 
-    # Make sure the shapes of loop outputs are correct.
-    for m_var, n_var in zip(merge_vars, next_vars):
-      if isinstance(m_var, ops.Tensor):
-        _EnforceShapeInvariant(m_var, n_var)
-
     # Exit the loop.
     self.ExitResult(exit_vars)
 
@@ -2669,7 +2711,7 @@ class WhileContext(ControlFlowContext):
         if shape is not None:
           xs.append(shape)
       for x in xs:
-        inp_op = x.op.inputs[0]
+        inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
         outer_control_inputs = [op for op in control_inputs
                                 if self._IsInOuterContext(op)]
@@ -2678,10 +2720,13 @@ class WhileContext(ControlFlowContext):
         graph._record_op_seen_by_control_dependencies(x.op)
     # pylint: enable=protected-access
 
+  def IsWhileContext(self):
+    return True
+
 
 def while_loop(cond, body, loop_vars, shape_invariants=None,
                parallel_iterations=10, back_prop=True, swap_memory=False,
-               name=None):
+               name=None, maximum_iterations=None):
   """Repeat `body` while the condition `cond` is true.
 
   `cond` is a callable returning a boolean scalar tensor. `body` is a callable
@@ -2753,6 +2798,10 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     back_prop: Whether backprop is enabled for this while loop.
     swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
     name: Optional name prefix for the returned tensors.
+    maximum_iterations: Optional maximum number of iterations of the while loop
+      to run.  If provided, the `cond` output is AND-ed with an additional
+      condition ensuring the number of iterations executed is no greater than
+      `maximum_iterations`.
 
   Returns:
     The output tensors for the loop variables after the loop. When the length
@@ -2806,18 +2855,47 @@ def while_loop(cond, body, loop_vars, shape_invariants=None,
     if parallel_iterations < 1:
       raise TypeError("parallel_iterations must be a positive integer.")
 
+    if maximum_iterations is not None:
+      maximum_iterations = ops.convert_to_tensor(
+          maximum_iterations, name="maximum_iterations")
+      if maximum_iterations.shape.ndims != 0:
+        raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                         maximum_iterations.shape)
+      counter = constant_op.constant(
+          0, dtype=maximum_iterations.dtype, name="iteration_counter")
+      orig_cond = cond
+      orig_body = body
+      if len(loop_vars) == 1:
+        loop_vars = (counter, loop_vars[0])
+        cond = lambda i, lv: (  # pylint: disable=g-long-lambda
+            math_ops.logical_and(i < maximum_iterations, orig_cond(lv)))
+        body = lambda i, lv: (i + 1, orig_body(lv))
+      else:
+        loop_vars = (counter, loop_vars)
+        cond = lambda i, lv: (  # pylint: disable=g-long-lambda
+            math_ops.logical_and(i < maximum_iterations, orig_cond(*lv)))
+        body = lambda i, lv: (i + 1, orig_body(*lv))
+
     if context.in_eager_mode():
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
-      return loop_vars
+      if maximum_iterations is not None:
+        return loop_vars[1]
+      else:
+        return loop_vars
 
     if shape_invariants is not None:
+      if maximum_iterations is not None:
+        shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
       nest.assert_same_structure(loop_vars, shape_invariants)
 
     loop_context = WhileContext(parallel_iterations, back_prop, swap_memory)  # pylint: disable=redefined-outer-name
     ops.add_to_collection(ops.GraphKeys.WHILE_CONTEXT, loop_context)
     result = loop_context.BuildLoop(cond, body, loop_vars, shape_invariants)
-    return result
+    if maximum_iterations is not None:
+      return result[1]
+    else:
+      return result
 
 
 def _AsTensorList(x, p):
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 3e8f39dd240af3a5030d259603ab648d50c27cd3..cd3c02f5621b97db02672f7e450dc83df5bea1f8 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -51,6 +51,7 @@ TestTuple = collections.namedtuple("TestTuple", "a b")
 SingletonTestTuple = collections.namedtuple("SingletonTestTuple", "a")
 
 
+@test_util.with_c_api
 class GroupTestCase(test_util.TensorFlowTestCase):
 
   def _StripNode(self, nd):
@@ -132,6 +133,7 @@ class GroupTestCase(test_util.TensorFlowTestCase):
         control_flow_ops.group(1, 2)
 
 
+@test_util.with_c_api
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
@@ -143,6 +145,7 @@ class ShapeTestCase(test_util.TensorFlowTestCase):
                             [constant_op.constant(1.0)], tensor).get_shape())
 
 
+@test_util.with_c_api
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
   def testTupleDependencies(self):
@@ -174,6 +177,7 @@ class WithDependenciesTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(1, counter.eval())
 
 
+@test_util.with_c_api
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
   def testIndexedSlicesWithDenseShape(self):
@@ -431,6 +435,7 @@ class CondTest(test_util.TensorFlowTestCase):
           control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
+@test_util.with_c_api
 class ContextTest(test_util.TensorFlowTestCase):
 
   def testCondContext(self):
@@ -516,6 +521,7 @@ def _RawNestedShape(nested_shape):
 
 
 # TODO(yori): Add tests for indexed slices.
+@test_util.with_c_api
 class DataTypesTest(test_util.TensorFlowTestCase):
 
   def assertAllEqualNested(self, a, b):
@@ -540,7 +546,9 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def _testReturnValues(self, fn_true, fn_false, expected_value_true,
                         expected_value_false, strict=False,
-                        check_cond=True):
+                        check_cond=True, feed_dict=None):
+    if feed_dict is None: feed_dict = {}
+
     condition = array_ops.placeholder(dtypes.bool)
     output_cond = control_flow_ops.cond(condition, fn_true, fn_false,
                                         strict=strict)
@@ -549,13 +557,17 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
     with self.test_session() as sess:
       variables.global_variables_initializer().run()
+      true_feed_dict = {condition: True}
+      true_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
-                                          feed_dict={condition: True})
+                                          feed_dict=true_feed_dict)
       self.assertAllEqualNested(result_cond, expected_value_true)
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_true)
+      false_feed_dict = {condition: False}
+      false_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
-                                          feed_dict={condition: False})
+                                          feed_dict=false_feed_dict)
       self.assertAllEqualNested(result_cond, expected_value_false)
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_false)
@@ -631,26 +643,26 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensors_unknown_shape(self):
     def _BuildTrueBranch(dtype):
+      tensor = array_ops.placeholder(dtype=dtype, shape=None)
       def _Build():
-        tensor = array_ops.zeros([2, 2], dtype=dtype)
-        tensor._shape = tensor_shape.TensorShape(None)
         return tensor
-      return _Build
+      return _Build, tensor
 
     def _BuildFalseBranch(dtype):
+      tensor = array_ops.placeholder(dtype=dtype, shape=None)
       def _Build():
-        tensor = array_ops.ones([2, 2], dtype=dtype)
-        tensor._shape = tensor_shape.TensorShape(None)
         return tensor
-      return _Build
+      return _Build, tensor
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = tensor_shape.TensorShape(None)
-      fn_true = _BuildTrueBranch(dtype)
-      fn_false = _BuildFalseBranch(dtype)
+      fn_true, true_tensor = _BuildTrueBranch(dtype)
+      fn_false, false_tensor = _BuildFalseBranch(dtype)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
-                             np.zeros([2, 2]), np.ones([2, 2]))
+                             np.zeros([2, 2]), np.ones([2, 2]),
+                             feed_dict={true_tensor: np.zeros([2, 2]),
+                                        false_tensor: np.ones([2, 2])})
 
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
@@ -674,26 +686,29 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
   def test_tensors_with_partially_specified_shapes(self):
     def _BuildBranch(dtype, shape):
+      a = array_ops.placeholder(dtype=dtype, shape=shape[0])
+      b = array_ops.placeholder(dtype=dtype, shape=shape[1])
+      c = array_ops.placeholder(dtype=dtype, shape=shape[2])
       def _Build():
-        a = array_ops.zeros([2, 2], dtype=dtype)
-        b = array_ops.zeros([5], dtype=dtype)
-        c = array_ops.ones([3, 3], dtype=dtype)
-        a._shape = tensor_shape.TensorShape(shape[0])
-        b._shape = tensor_shape.TensorShape(shape[1])
-        c._shape = tensor_shape.TensorShape(shape[2])
         return a, b, c
-      return _Build
+      return _Build, (a, b, c)
 
     for dtype in (dtypes.float16, dtypes.int8, dtypes.int32, dtypes.uint8):
       shape = (tensor_shape.TensorShape([None, 2]),
                tensor_shape.TensorShape([None]),
                tensor_shape.TensorShape([3, None]))
-      fn_true = _BuildBranch(dtype, shape)
-      fn_false = _BuildBranch(dtype, shape)
+      fn_true, true_tensors = _BuildBranch(dtype, shape)
+      fn_false, false_tensors = _BuildBranch(dtype, shape)
       self._testShape(fn_true, fn_false, shape)
       self._testReturnValues(fn_true, fn_false,
                              (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
-                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])))
+                             (np.zeros([2, 2]), np.zeros(5), np.ones([3, 3])),
+                             feed_dict={true_tensors[0]: np.zeros([2, 2]),
+                                        false_tensors[0]: np.zeros([2, 2]),
+                                        true_tensors[1]: np.zeros([5]),
+                                        false_tensors[1]: np.zeros([5]),
+                                        true_tensors[2]: np.ones([3, 3]),
+                                        false_tensors[2]: np.ones([3, 3])})
 
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
@@ -837,6 +852,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self.assertEqual(matrix.get_shape(), tensor_shape.TensorShape([2, 2]))
 
 
+@test_util.with_c_api
 class CaseTest(test_util.TensorFlowTestCase):
 
   def testCase_withDefault(self):
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..941a1a743e524dc9a8996ede405dc796c1efff8e
--- /dev/null
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -0,0 +1,200 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilty functions for control flow.
+
+This file is necessary to avoid cyclic dependencies between ops.py and
+control_flow_ops.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import traceback
+
+from tensorflow.python.platform import tf_logging as logging
+
+
+def IsSwitch(op):
+  """Return true if `op` is a Switch."""
+  return op.type == "Switch" or op.type == "RefSwitch"
+
+
+def IsLoopEnter(op):
+  """Returns true if `op` is an Enter."""
+  return op.type == "Enter" or op.type == "RefEnter"
+
+
+def IsLoopExit(op):
+  """Return true if `op` is an Exit."""
+  return op.type == "Exit" or op.type == "RefExit"
+
+
+def IsLoopSwitch(op):
+  """Return true if `op` is the Switch for a while loop."""
+  if IsSwitch(op):
+    ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+    return ctxt and ctxt.IsWhileContext()
+  return False
+
+
+def IsLoopConstantEnter(op):
+  """Return true iff op is a loop invariant."""
+  return IsLoopEnter(op) and op.get_attr("is_constant")
+
+
+def GetLoopConstantEnter(value):
+  """Return the enter op if we can infer `value` to be a loop invariant."""
+  id_ops = {"Switch", "RefSwitch", "Identity", "RefIdentity"}
+  op = value.op
+  while op.type in id_ops:
+    op = op.inputs[0].op
+  return op if IsLoopConstantEnter(op) else None
+
+
+def GetOutputContext(op):
+  """Return the control flow context for the output of an op."""
+  ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  if IsLoopExit(op):
+    ctxt = ctxt.outer_context
+  return ctxt
+
+
+def GetContainingWhileContext(ctxt):
+  """Returns the first ancestor WhileContext of `ctxt`.
+
+  Returns `ctxt` if `ctxt` is a WhileContext, or None if `ctxt` is not in a
+  while loop.
+
+  Args:
+    ctxt: ControlFlowContext
+
+  Returns:
+    `ctxt` if `ctxt` is a WhileContext, the most nested WhileContext containing
+    `ctxt`, or None if `ctxt` is not in a while loop.
+  """
+  while ctxt:
+    if ctxt.IsWhileContext(): return ctxt
+    ctxt = ctxt.outer_context
+  return None
+
+
+def IsContainingContext(ctxt, maybe_containing_ctxt):
+  """Returns true if `maybe_containing_ctxt` is or contains `ctxt`."""
+  while ctxt is not maybe_containing_ctxt:
+    if ctxt is None: return False
+    ctxt = ctxt.outer_context
+  return True
+
+
+def CheckInputFromValidContext(op, input_op):
+  """Returns whether `input_op` can be used from `op`s context.
+
+  Conceptually, only inputs from op's while context or any ancestor while
+  context (including outside of any context) are valid. In practice, there are
+  many other edge cases as well.
+
+  Args:
+    op: Operation
+    input_op: Operation
+
+  Raises:
+    ValueError: if input_op is from an invalid context.
+  """
+  op_ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
+  input_ctxt = GetOutputContext(input_op)
+  valid = False
+
+  if not input_ctxt:
+    # input_op isn't in a control flow context.
+    valid = True
+  elif op_ctxt is input_ctxt:
+    # input_op is in the same context as op.
+    valid = True
+  else:
+    while_ctxt = GetContainingWhileContext(op_ctxt)
+    input_while_ctxt = GetContainingWhileContext(input_ctxt)
+
+    if while_ctxt is None:
+      if input_while_ctxt is None:
+        # Neither op nor input_op is in a while loop, but one or both are in
+        # conds. We allow this, although execution will fail if the branch
+        # corresponding to input_op's cond context isn't taken.
+        valid = True
+      # Invalid if op isn't in a while loop and input_op is. Unless...
+      if IsLoopEnter(op):
+        # WhileContext._BuildLoop clears context for Enter nodes.
+        valid = True
+      if IsSwitch(op):
+        # CondContext.AddValue clears context for Switch nodes.
+        valid = True
+    elif IsContainingContext(while_ctxt, input_while_ctxt):
+      # input_op is in a while loop which contains op's while loop (or not in a
+      # while loop at all).
+      valid = True
+    elif (while_ctxt.grad_state and
+          IsContainingContext(while_ctxt.grad_state.forward_context,
+                              input_while_ctxt)):
+      # op is in a gradient context and input_op is in the associated forward
+      # pass context or an ancestor thereof. This case is need to build while
+      # loop gradients.
+      # NOTE(skyewm): we theoretically also need this case for custom gradient
+      # functions that close over tensors from ancestor contexts, but I haven't
+      # verified this.
+      valid = True
+    elif (while_ctxt.grad_state and
+          while_ctxt.grad_state.forward_context is
+          input_while_ctxt._outer_context):  # pylint: disable=protected-access
+      # op is in a gradient context and input_op is in a child of the associated
+      # forward pass context. This case is needed for the gradients of while
+      # loops with conds.
+      valid = True
+    elif (input_while_ctxt.grad_state and
+          input_while_ctxt.grad_state.forward_context is while_ctxt):
+      # input_op is in the gradient context of op's context. This case is needed
+      # when the gradient of a while loop gradient is requested (this will
+      # eventually fail unless there is a stop_gradient() or similar).
+      valid = True
+    elif (input_while_ctxt.grad_state and
+          input_ctxt.grad_state.forward_context.grad_state and
+          input_ctxt.grad_state.forward_context.grad_state.forward_context is
+          while_ctxt):
+      # input_op is in the grad grad context of op's context. This case is
+      # needed when the gradient of a while loop gradient is requested (this
+      # will eventually fail unless there is a stop_gradient() or similar).
+      valid = True
+
+  if not valid:
+    if while_ctxt:
+      error_msg = (
+          "Cannot use '%s' as input to '%s' because they are in different while"
+          " loops." % (op.name, input_op.name))
+    else:
+      error_msg = (
+          "Cannot use '%s' as input to '%s' because '%s' is in a while loop."
+          % (input_op.name, op.name, input_op.name))
+
+    # Log the error message plus the relevant stack traces. The stacks may be
+    # useful for debugging this error, but we don't want to raise an
+    # unreadable exception.
+    log_msg = error_msg
+    log_msg += "\n\n%s while context: %s" % (op.name, while_ctxt)
+    log_msg += "\n%s while context: %s" % (input_op.name, input_while_ctxt)
+    log_msg += "\n\nTraceback for %s:\n%s\nTraceback for %s:\n%s\n" % (
+        op.name, "".join(traceback.format_list(op.traceback)),
+        input_op.name, "".join(traceback.format_list(input_op.traceback)))
+    logging.info(log_msg)
+    raise ValueError(error_msg + " See info log for more details.")
diff --git a/tensorflow/python/ops/conv2d_benchmark.py b/tensorflow/python/ops/conv2d_benchmark.py
index 6992fa57eada057d3ef98dcbcbcb2d45a421cb75..907df85cd954d2a897ba9a0c4b21be8586859380 100644
--- a/tensorflow/python/ops/conv2d_benchmark.py
+++ b/tensorflow/python/ops/conv2d_benchmark.py
@@ -22,6 +22,7 @@ import itertools
 import time
 
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn_ops
@@ -30,7 +31,8 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-def build_graph(device, input_shape, filter_shape, strides, padding, num_iters):
+def build_graph(device, input_shape, filter_shape, strides, padding, dtype,
+                num_iters, warmup_iters):
   """builds a graph containing a sequence of conv2d operations.
 
   Args:
@@ -41,14 +43,18 @@ def build_graph(device, input_shape, filter_shape, strides, padding, num_iters):
              window for each dimension of input.
     padding: A string from: "SAME", "VALID". The type of padding
              algorithm to use.
+    dtype: Data type for the convolution.
     num_iters: number of iterations to run conv2d.
+    warmup_iters: number of iterations for warmup runs.
 
   Returns:
     An array of tensors to run()
   """
   with ops.device("/%s:0" % device):
-    inp = variables.Variable(random_ops.truncated_normal(input_shape))
-    filt = variables.Variable(random_ops.truncated_normal(filter_shape))
+    inp = variables.Variable(
+        random_ops.truncated_normal(input_shape, dtype=dtype))
+    filt = variables.Variable(
+        random_ops.truncated_normal(filter_shape, dtype=dtype))
 
     outputs = []
     conv2d_op = nn_ops.conv2d(inp, filt, strides, padding, data_format="NHWC")
@@ -58,14 +64,25 @@ def build_graph(device, input_shape, filter_shape, strides, padding, num_iters):
         conv2d_op = nn_ops.conv2d(
             inp, filt, strides, padding, data_format="NHWC")
         outputs.append(conv2d_op)
-    return control_flow_ops.group(*outputs)
+
+    warmup_groups = []
+    warmup_conv2d_op = nn_ops.conv2d(
+        inp, filt, strides, padding, data_format="NHWC")
+    warmup_groups.append(warmup_conv2d_op)
+    for _ in range(1, warmup_iters):
+      with ops.control_dependencies([warmup_conv2d_op]):
+        warmup_conv2d_op = nn_ops.conv2d(
+            inp, filt, strides, padding, data_format="NHWC")
+        warmup_groups.append(warmup_conv2d_op)
+    return control_flow_ops.group(*warmup_groups), control_flow_ops.group(
+        *outputs)
 
 
 class Conv2DBenchmark(test.Benchmark):
   """Benchmark conv2d!"""
 
   def _run_graph(self, device, input_shape, filter_shape, strides, padding,
-                 num_iters):
+                 dtype, num_iters, warmup_iters):
     """runs the graph and print its execution time.
 
     Args:
@@ -77,43 +94,46 @@ class Conv2DBenchmark(test.Benchmark):
       padding: A string from: "SAME", "VALID". The type of padding
                algorithm to use.  num_iters: Number of iterations to run the
                  benchmark.
+      dtype: Data type for the convolution.
       num_iters: number of iterations to run conv2d.
+      warmup_iters: number of iterations for warmup runs.
 
     Returns:
       The duration of the run in seconds.
     """
     graph = ops.Graph()
     with graph.as_default():
-      outputs = build_graph(device, input_shape, filter_shape, strides, padding,
-                            num_iters)
+      warmup_outputs, outputs = build_graph(device, input_shape, filter_shape,
+                                            strides, padding, dtype, num_iters,
+                                            warmup_iters)
       with session_lib.Session(graph=graph) as session:
         variables.global_variables_initializer().run()
         # warmup runs
-        session.run(outputs)
+        session.run(warmup_outputs)
 
         start_time = time.time()
         session.run(outputs)
         duration = (time.time() - start_time) / num_iters
-
-        print("%s inputshape:%s filtershape:%s strides:%s padding:%s "
+        print("%s %s inputshape:%s filtershape:%s strides:%s padding:%s "
               "%d iters: %.8f sec" %
-              (device, str(input_shape).replace(" ", ""),
+              (device, str(dtype), str(input_shape).replace(" ", ""),
                str(filter_shape).replace(" ", ""),
                str(strides).replace(" ", ""), padding, num_iters, duration))
 
     name_template = (
-        "conv2d_{device}_input_shape_{inputshape}_filter_shape_{filtershape}_"
-        "strides_{strides}_padding_{padding}")
+        "conv2d_{device}_{datatype}_input_shape_{inputshape}_"
+        "filter_shape_{filtershape}_strides_{strides}_padding_{padding}")
 
     self.report_benchmark(
         name=name_template.format(
             device=device,
+            datatype=str(dtype),
             inputshape=str(input_shape).replace(" ", ""),
             filtershape=str(filter_shape).replace(" ", ""),
             strides=str(strides).replace(" ", ""),
             padding=padding).replace(" ", ""),
         iters=num_iters,
-        wall_time=duration / num_iters)
+        wall_time=duration)
 
     return duration
 
@@ -126,15 +146,18 @@ class Conv2DBenchmark(test.Benchmark):
     fw = 3
     input_shapes = []
     filter_shapes = []
+    data_types = [dtypes.float32, dtypes.float16]
     for b, c in itertools.product([4, 16, 32], [i for i in range(3, 16)]):
       input_shapes += [[b, h, w, c]]
       filter_shapes += [[fh, fw, c, b]]
     strides = [[1, 2, 2, 1]]
     paddings = ["VALID", "SAME"]
     for ishape, fshape in zip(input_shapes, filter_shapes):
-      for stride in strides:
-        for padding in paddings:
-          self._run_graph("gpu", ishape, fshape, stride, padding, 80)
+      for dtype in data_types:
+        for stride in strides:
+          for padding in paddings:
+            self._run_graph("gpu", ishape, fshape, stride, padding, dtype, 80,
+                            2)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 477c0d1cb49ad44c64da8a14d05fbc796cecb9de..f037767cf4051d058a2da0cca9c4515fd9705d28 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -22,8 +22,8 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 
-from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
 
 
@@ -38,7 +38,8 @@ def ctc_loss(labels, inputs, sequence_length,
 
   [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
   Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
-  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA, pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
 
   Input requirements:
 
@@ -108,9 +109,9 @@ def ctc_loss(labels, inputs, sequence_length,
       See `core/ops/ctc_ops.cc` for more details.
     inputs: 3-D `float` `Tensor`.
       If time_major == False, this will be a `Tensor` shaped:
-        `[batch_size x max_time x num_classes]`.
+        `[batch_size, max_time, num_classes]`.
       If time_major == True (default), this will be a `Tensor` shaped:
-        `[max_time x batch_size x num_classes]`.
+        `[max_time, batch_size, num_classes]`.
       The logits.
     sequence_length: 1-D `int32` vector, size `[batch_size]`.
       The sequence lengths.
@@ -120,15 +121,18 @@ def ctc_loss(labels, inputs, sequence_length,
     ignore_longer_outputs_than_inputs: Boolean. Default: False.
       If True, sequences with longer outputs than inputs will be ignored.
     time_major: The shape format of the `inputs` Tensors.
-      If True, these `Tensors` must be shaped `[max_time, batch_size, num_classes]`.
-      If False, these `Tensors` must be shaped `[batch_size, max_time, num_classes]`.
-      Using `time_major = True` (default) is a bit more efficient because it avoids
-      transposes at the beginning of the ctc_loss calculation.  However, most
-      TensorFlow data is batch-major, so by this function also accepts inputs
-      in batch-major form.
+      If True, these `Tensors` must be shaped `[max_time, batch_size,
+      num_classes]`.
+      If False, these `Tensors` must be shaped `[batch_size, max_time,
+      num_classes]`.
+      Using `time_major = True` (default) is a bit more efficient because it
+      avoids transposes at the beginning of the ctc_loss calculation.  However,
+      most TensorFlow data is batch-major, so by this function also accepts
+      inputs in batch-major form.
 
   Returns:
-    A 1-D `float` `Tensor`, size `[batch]`, containing the negative log probabilities.
+    A 1-D `float` `Tensor`, size `[batch]`, containing the negative log
+      probabilities.
 
   Raises:
     TypeError: if labels is not a `SparseTensor`.
@@ -198,7 +202,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
 
   Args:
     inputs: 3-D `float` `Tensor` sized
-      `[max_time x batch_size x num_classes]`.  The logits.
+      `[max_time, batch_size, num_classes]`.  The logits.
     sequence_length: 1-D `int32` vector containing sequence lengths,
       having size `[batch_size]`.
     merge_repeated: Boolean.  Default: True.
@@ -207,7 +211,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
     A tuple `(decoded, neg_sum_logits)` where
     decoded: A single-element list. `decoded[0]`
       is an `SparseTensor` containing the decoded outputs s.t.:
-      `decoded.indices`: Indices matrix `(total_decoded_outputs x 2)`.
+      `decoded.indices`: Indices matrix `(total_decoded_outputs, 2)`.
         The rows store: `[batch, time]`.
       `decoded.values`: Values vector, size `(total_decoded_outputs)`.
         The vector stores the decoded classes.
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 923696a553caae80592be65f7ffeecb3f9373bb0..2accedf1b963f01034f0b4059f44e46eb9bfc5ab 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -196,7 +196,7 @@ class Dirichlet(distribution.Distribution):
         alpha=self.concentration,
         dtype=self.dtype,
         seed=seed)
-    return gamma_sample / math_ops.reduce_sum(gamma_sample, -1, keep_dims=True)
+    return gamma_sample / math_ops.reduce_sum(gamma_sample, -1, keepdims=True)
 
   @distribution_util.AppendDocstring(_dirichlet_sample_note)
   def _log_prob(self, x):
diff --git a/tensorflow/python/ops/distributions/dirichlet_multinomial.py b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
index d792e9fe52dee4325d0956dbb74c8b408d5a1e8c..aa2b511c5413944df665198eacc26066b8457773 100644
--- a/tensorflow/python/ops/distributions/dirichlet_multinomial.py
+++ b/tensorflow/python/ops/distributions/dirichlet_multinomial.py
@@ -122,21 +122,22 @@ class DirichletMultinomial(distribution.Distribution):
   #### Examples
 
   ```python
-  alpha = [1, 2, 3]
-  n = 2
+  alpha = [1., 2., 3.]
+  n = 2.
   dist = DirichletMultinomial(n, alpha)
   ```
 
-  Creates a 3-class distribution, with the 3rd class is most likely to be drawn.
+  Creates a 3-class distribution, with the 3rd class is most likely to be
+  drawn.
   The distribution functions can be evaluated on counts.
 
   ```python
   # counts same shape as alpha.
-  counts = [0, 0, 2]
+  counts = [0., 0., 2.]
   dist.prob(counts)  # Shape []
 
-  # alpha will be broadcast to [[1, 2, 3], [1, 2, 3]] to match counts.
-  counts = [[1, 1, 0], [1, 0, 1]]
+  # alpha will be broadcast to [[1., 2., 3.], [1., 2., 3.]] to match counts.
+  counts = [[1., 1., 0.], [1., 0., 1.]]
   dist.prob(counts)  # Shape [2]
 
   # alpha will be broadcast to shape [5, 7, 3] to match counts.
@@ -147,12 +148,12 @@ class DirichletMultinomial(distribution.Distribution):
   Creates a 2-batch of 3-class distributions.
 
   ```python
-  alpha = [[1, 2, 3], [4, 5, 6]]  # Shape [2, 3]
-  n = [3, 3]
+  alpha = [[1., 2., 3.], [4., 5., 6.]]  # Shape [2, 3]
+  n = [3., 3.]
   dist = DirichletMultinomial(n, alpha)
 
-  # counts will be broadcast to [[2, 1, 0], [2, 1, 0]] to match alpha.
-  counts = [2, 1, 0]
+  # counts will be broadcast to [[2., 1., 0.], [2., 1., 0.]] to match alpha.
+  counts = [2., 1., 0.]
   dist.prob(counts)  # Shape [2]
   ```
 
diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 22687a093ae72edff1d53131cab49fa004aa3be0..2d4c3509bc79dd44fec67dbf62ea21e1de7e2778 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util
 from tensorflow.python.util import tf_inspect
 
@@ -43,10 +44,26 @@ __all__ = [
 ]
 
 _DISTRIBUTION_PUBLIC_METHOD_WRAPPERS = [
-    "batch_shape_tensor", "batch_shape", "event_shape_tensor", "event_shape",
-    "sample", "log_prob", "prob", "log_cdf", "cdf", "log_survival_function",
-    "survival_function", "entropy", "mean", "variance", "stddev", "mode",
-    "covariance"]
+    "batch_shape",
+    "batch_shape_tensor",
+    "cdf",
+    "covariance",
+    "cross_entropy",
+    "entropy",
+    "event_shape",
+    "event_shape_tensor",
+    "kl_divergence",
+    "log_cdf",
+    "log_prob",
+    "log_survival_function",
+    "mean",
+    "mode",
+    "prob",
+    "sample",
+    "stddev",
+    "survival_function",
+    "variance",
+]
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -608,7 +625,7 @@ class Distribution(_BaseDistribution):
     """Indicates that `event_shape == []`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       is_scalar_event: `bool` scalar `Tensor`.
@@ -622,7 +639,7 @@ class Distribution(_BaseDistribution):
     """Indicates that `batch_shape == []`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       is_scalar_batch: `bool` scalar `Tensor`.
@@ -683,7 +700,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       log_prob: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -710,7 +727,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       prob: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -747,7 +764,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       logcdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -780,7 +797,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       cdf: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -818,7 +835,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       `Tensor` of shape `sample_shape(x) + self.batch_shape` with values of type
@@ -853,7 +870,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       `Tensor` of shape `sample_shape(x) + self.batch_shape` with values of type
@@ -899,7 +916,7 @@ class Distribution(_BaseDistribution):
 
     Args:
       value: `float` or `double` `Tensor`.
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       quantile: a `Tensor` of shape `sample_shape(x) + self.batch_shape` with
@@ -923,7 +940,7 @@ class Distribution(_BaseDistribution):
     denotes expectation, and `Var.shape = batch_shape + event_shape`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       variance: Floating-point `Tensor` with shape identical to
@@ -954,7 +971,7 @@ class Distribution(_BaseDistribution):
     denotes expectation, and `stddev.shape = batch_shape + event_shape`.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       stddev: Floating-point `Tensor` with shape identical to
@@ -1002,7 +1019,7 @@ class Distribution(_BaseDistribution):
     length-`k'` vector.
 
     Args:
-      name: The name to give this op.
+      name: Python `str` prepended to names of ops created by this function.
 
     Returns:
       covariance: Floating-point `Tensor` with shape `[B1, ..., Bn, k', k']`
@@ -1020,6 +1037,67 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       return self._mode()
 
+  def _cross_entropy(self, other):
+    return kullback_leibler.cross_entropy(
+        self, other, allow_nan_stats=self.allow_nan_stats)
+
+  def cross_entropy(self, other, name="cross_entropy"):
+    """Computes the (Shannon) cross entropy.
+
+    Denote this distribution (`self`) by `P` and the `other` distribution by
+    `Q`. Assuming `P, Q` are absolutely continuous with respect to
+    one another and permit densities `p(x) dr(x)` and `q(x) dr(x)`, (Shanon)
+    cross entropy is defined as:
+
+    ```none
+    H[P, Q] = E_p[-log q(X)] = -int_F p(x) log q(x) dr(x)
+    ```
+
+    where `F` denotes the support of the random variable `X ~ P`.
+
+    Args:
+      other: `tf.distributions.Distribution` instance.
+      name: Python `str` prepended to names of ops created by this function.
+
+    Returns:
+      cross_entropy: `self.dtype` `Tensor` with shape `[B1, ..., Bn]`
+        representing `n` different calculations of (Shanon) cross entropy.
+    """
+    with self._name_scope(name):
+      return self._cross_entropy(other)
+
+  def _kl_divergence(self, other):
+    return kullback_leibler.kl_divergence(
+        self, other, allow_nan_stats=self.allow_nan_stats)
+
+  def kl_divergence(self, other, name="kl_divergence"):
+    """Computes the Kullback--Leibler divergence.
+
+    Denote this distribution (`self`) by `p` and the `other` distribution by
+    `q`. Assuming `p, q` are absolutely continuous with respect to reference
+    measure `r`, (Shanon) cross entropy is defined as:
+
+    ```none
+    KL[p, q] = E_p[log(p(X)/q(X))]
+             = -int_F p(x) log q(x) dr(x) + int_F p(x) log p(x) dr(x)
+             = H[p, q] - H[p]
+    ```
+
+    where `F` denotes the support of the random variable `X ~ p`, `H[., .]`
+    denotes (Shanon) cross entropy, and `H[.]` denotes (Shanon) entropy.
+
+    Args:
+      other: `tf.distributions.Distribution` instance.
+      name: Python `str` prepended to names of ops created by this function.
+
+    Returns:
+      kl_divergence: `self.dtype` `Tensor` with shape `[B1, ..., Bn]`
+        representing `n` different calculations of the Kullback-Leibler
+        divergence.
+    """
+    with self._name_scope(name):
+      return self._kl_divergence(other)
+
   @contextlib.contextmanager
   def _name_scope(self, name=None, values=None):
     """Helper function to standardize op scope."""
diff --git a/tensorflow/python/ops/distributions/kullback_leibler.py b/tensorflow/python/ops/distributions/kullback_leibler.py
index a6ab581cc22ce8e9a278bb8e0c7e6afc2dcc30eb..829b9611cff02895b67ec39711b8c53e682eb3c5 100644
--- a/tensorflow/python/ops/distributions/kullback_leibler.py
+++ b/tensorflow/python/ops/distributions/kullback_leibler.py
@@ -110,6 +110,38 @@ def kl_divergence(distribution_a, distribution_b,
       return array_ops.identity(kl_t, name="checked_kl")
 
 
+def cross_entropy(ref, other,
+                  allow_nan_stats=True, name=None):
+  """Computes the (Shannon) cross entropy.
+
+  Denote two distributions by `P` (`ref`) and `Q` (`other`). Assuming `P, Q`
+  are absolutely continuous with respect to one another and permit densities
+  `p(x) dr(x)` and `q(x) dr(x)`, (Shanon) cross entropy is defined as:
+
+  ```none
+  H[P, Q] = E_p[-log q(X)] = -int_F p(x) log q(x) dr(x)
+  ```
+
+  where `F` denotes the support of the random variable `X ~ P`.
+
+  Args:
+    ref: `tf.distributions.Distribution` instance.
+    other: `tf.distributions.Distribution` instance.
+    allow_nan_stats: Python `bool`, default `True`. When `True`,
+      statistics (e.g., mean, mode, variance) use the value "`NaN`" to
+      indicate the result is undefined. When `False`, an exception is raised
+      if one or more of the statistic's batch members are undefined.
+    name: Python `str` prepended to names of ops created by this function.
+
+  Returns:
+    cross_entropy: `ref.dtype` `Tensor` with shape `[B1, ..., Bn]`
+      representing `n` different calculations of (Shanon) cross entropy.
+  """
+  with ops.name_scope(name, "cross_entropy"):
+    return ref.entropy() + kl_divergence(
+        ref, other, allow_nan_stats=allow_nan_stats)
+
+
 class RegisterKL(object):
   """Decorator to register a KL divergence implementation function.
 
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 00b5697c8325256538ed01142dd2bab416e98e00..04762565c2a982f4df47a1a85547db7a104a5ec3 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -140,6 +141,8 @@ class Multinomial(distribution.Distribution):
 
   counts = [[2., 1, 1], [3, 1, 1]]
   dist.prob(counts)  # Shape [2]
+
+  dist.sample(5) # Shape [5, 2, 3]
   ```
   """
 
@@ -231,29 +234,36 @@ class Multinomial(distribution.Distribution):
 
   def _sample_n(self, n, seed=None):
     n_draws = math_ops.cast(self.total_count, dtype=dtypes.int32)
-    if self.total_count.get_shape().ndims is not None:
-      if self.total_count.get_shape().ndims != 0:
-        raise NotImplementedError(
-            "Sample only supported for scalar number of draws.")
-    elif self.validate_args:
-      is_scalar = check_ops.assert_rank(
-          n_draws, 0,
-          message="Sample only supported for scalar number of draws.")
-      n_draws = control_flow_ops.with_dependencies([is_scalar], n_draws)
     k = self.event_shape_tensor()[0]
-    # Flatten batch dims so logits has shape [B, k],
-    # where B = reduce_prod(self.batch_shape_tensor()).
-    x = random_ops.multinomial(
-        logits=array_ops.reshape(self.logits, [-1, k]),
-        num_samples=n * n_draws,
-        seed=seed)
-    x = array_ops.reshape(x, shape=[-1, n, n_draws])
-    x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k),
-                            axis=-2)  # shape: [B, n, k]
+
+    # boardcast the total_count and logits to same shape
+    n_draws = array_ops.ones_like(
+        self.logits[..., 0], dtype=n_draws.dtype) * n_draws
+    logits = array_ops.ones_like(
+        n_draws[..., array_ops.newaxis], dtype=self.logits.dtype) * self.logits
+
+    # flatten the total_count and logits
+    flat_logits = array_ops.reshape(logits, [-1, k])  # [B1B2...Bm, k]
+    flat_ndraws = n * array_ops.reshape(n_draws, [-1])  # [B1B2...Bm]
+
+    # computes each total_count and logits situation by map_fn
+    def _sample_single(args):
+      logits, n_draw = args[0], args[1]  # [K], []
+      x = random_ops.multinomial(logits[array_ops.newaxis, ...], n_draw,
+                                 seed)  # [1, n*n_draw]
+      x = array_ops.reshape(x, shape=[n, -1])  # [n, n_draw]
+      x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)  # [n, k]
+      return x
+
+    x = functional_ops.map_fn(
+        _sample_single, [flat_logits, flat_ndraws],
+        dtype=self.dtype)  # [B1B2...Bm, n, k]
+
+    # reshape the results to proper shape
     x = array_ops.transpose(x, perm=[1, 0, 2])
     final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0)
-    x = array_ops.reshape(x, final_shape)
-    return math_ops.cast(x, self.dtype)
+    x = array_ops.reshape(x, final_shape)  # [n, B1, B2,..., Bm, k]
+    return x
 
   @distribution_util.AppendDocstring(_multinomial_sample_note)
   def _log_prob(self, counts):
diff --git a/tensorflow/python/ops/distributions/special_math.py b/tensorflow/python/ops/distributions/special_math.py
index 3a804c941a79ee9e58efb50c30d85795371a2824..bed4cbb2c1a43b6952861f4fab82957229e23c9c 100644
--- a/tensorflow/python/ops/distributions/special_math.py
+++ b/tensorflow/python/ops/distributions/special_math.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
 __all__ = [
+    "erfinv",
     "ndtr",
     "ndtri",
     "log_ndtr",
@@ -197,9 +198,10 @@ def _ndtri(p):
   # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs
   # later on. The result from the computation when p == 0 is not used so any
   # number that doesn't result in NaNs is fine.
+  one_half = constant_op.constant(0.5, dtype=p.dtype)
   sanitized_mcp = array_ops.where(
       maybe_complement_p <= 0.,
-      constant_op.constant(0.5, dtype=p.dtype, shape=p.shape),
+      array_ops.fill(array_ops.shape(p), one_half),
       maybe_complement_p)
 
   # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2).
@@ -226,7 +228,8 @@ def _ndtri(p):
                       array_ops.where(z >= 8.0, x_for_small_p, x_otherwise))
 
   x = array_ops.where(p > 1. - np.exp(-2.), x, -x)
-  infinity = constant_op.constant(np.inf, dtype=x.dtype, shape=x.shape)
+  infinity_scalar = constant_op.constant(np.inf, dtype=p.dtype)
+  infinity = array_ops.fill(array_ops.shape(p), infinity_scalar)
   x_nan_replaced = array_ops.where(
       p <= 0.0, -infinity, array_ops.where(p >= 1.0, infinity, x))
   return x_nan_replaced
@@ -348,6 +351,29 @@ def _log_ndtr_asymptotic_series(x, series_order):
   return 1. + even_sum - odd_sum
 
 
+def erfinv(x, name="erfinv"):
+  """The inverse function for erf, the error function.
+
+  Args:
+    x: `Tensor` of type `float32`, `float64`.
+    name: Python string. A name for the operation (default="erfinv").
+
+  Returns:
+    x: `Tensor` with `dtype=x.dtype`.
+
+  Raises:
+    TypeError: if `x` is not floating-type.
+  """
+
+  with ops.name_scope(name, values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    if x.dtype.as_numpy_dtype not in [np.float32, np.float64]:
+      raise TypeError(
+          "x.dtype=%s is not handled, see docstring for supported types."
+          % x.dtype)
+    return ndtri((x + 1.0) / 2.0) / np.sqrt(2)
+
+
 def _double_factorial(n):
   """The double factorial function for small Python integer `n`."""
   return np.prod(np.arange(n, 1, -2))
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index ba25b2c3485706cc769b8f37118a994e065c1f93..1efcf9d32e9ea9924bb080459efb7015e33ccd54 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -434,7 +434,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     log_prob = self.distribution.log_prob(x)
     if self._is_maybe_event_override:
       log_prob = math_ops.reduce_sum(log_prob, self._reduce_event_indices)
-    log_prob = ildj + log_prob
+    log_prob += math_ops.cast(ildj, log_prob.dtype)
     if self._is_maybe_event_override:
       log_prob.set_shape(array_ops.broadcast_static_shape(
           y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
@@ -457,7 +457,7 @@ class TransformedDistribution(distribution_lib.Distribution):
     prob = self.distribution.prob(x)
     if self._is_maybe_event_override:
       prob = math_ops.reduce_prod(prob, self._reduce_event_indices)
-    prob *= math_ops.exp(ildj)
+    prob *= math_ops.exp(math_ops.cast(ildj, prob.dtype))
     if self._is_maybe_event_override:
       prob.set_shape(array_ops.broadcast_static_shape(
           y.get_shape().with_rank_at_least(1)[:-1], self.batch_shape))
@@ -546,7 +546,9 @@ class TransformedDistribution(distribution_lib.Distribution):
       ], 0)
       entropy = array_ops.tile(entropy, multiples)
     dummy = array_ops.zeros([], self.dtype)
-    entropy -= self.bijector.inverse_log_det_jacobian(dummy)
+    entropy -= math_ops.cast(
+        self.bijector.inverse_log_det_jacobian(dummy),
+        entropy.dtype)
     entropy.set_shape(self.batch_shape)
     return entropy
 
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 41b86f79409aef76dbd710606d09b21f34cab7ba..28c74bf981666ab95566dd8e90c6f80d06e44697 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -751,6 +751,7 @@ def fill_triangular(x, upper=False, name=None):
   """
 
   with ops.name_scope(name, "fill_triangular", values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
     if x.shape.with_rank_at_least(1)[-1].value is not None:
       # Formula derived by solving for n: m = n(n+1)/2.
       m = np.int32(x.shape[-1].value)
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 8c1ccc68404d792889086a01088cac30f2d72f0e..f4561d1a830141a069c12ddb33b83744363844f2 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -191,12 +191,9 @@ def _embedding_lookup_and_transform(params,
             (flat_ids - extras) // ids_per_partition)
 
         # Emulate a conditional using a boolean indicator tensor
-        is_in_first_extras_partitions = math_ops.cast(p_assignments < extras,
-                                                      flat_ids.dtype)
-        new_ids = (is_in_first_extras_partitions * (flat_ids %
-                                                    (ids_per_partition + 1)) +
-                   (1 - is_in_first_extras_partitions) *
-                   ((flat_ids - extras) % ids_per_partition))
+        new_ids = array_ops.where(p_assignments < extras,
+                                  flat_ids % (ids_per_partition + 1),
+                                  (flat_ids - extras) % ids_per_partition)
       else:
         raise ValueError("Unrecognized partition strategy: " +
                          partition_strategy)
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 3addfefc99dcded6ca0546e91901b0e6ef47aea1..1ff196805507f0ca7a1123df0d2a37925fc3e503 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -348,7 +348,6 @@ def compute_gradient_error(x,
       as the initial value.
     delta: (optional) the amount of perturbation.
     init_targets: list of targets to run to initialize model params.
-      TODO(mrry): Remove this argument.
     extra_feed_dict: dict that allows fixing specified tensor values
       during the Jacobian calculation.
 
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 97a3486f616ddcd8244f182dffeb506ee54fcdb4..f5fdb12b2c8ae470a1b671b85ae12c675fd16cd4 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -38,6 +38,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
@@ -425,18 +426,22 @@ def gradients(ys,
   other things, this allows computation of partial derivatives as opposed to
   total derivatives. For example:
 
-    a = tf.constant(0.)
-    b = 2 * a
-    g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
+  ```python
+  a = tf.constant(0.)
+  b = 2 * a
+  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
+  ```
 
   Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
   total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
   influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
   equivalent to:
 
-    a = tf.stop_gradient(tf.constant(0.))
-    b = tf.stop_gradient(2 * a)
-    g = tf.gradients(a + b, [a, b])
+  ```python
+  a = tf.stop_gradient(tf.constant(0.))
+  b = tf.stop_gradient(2 * a)
+  g = tf.gradients(a + b, [a, b])
+  ```
 
   `stop_gradients` provides a way of stopping gradient after the graph has
   already been constructed, as compared to `tf.stop_gradient` which is used
@@ -611,7 +616,9 @@ def gradients(ys,
               _VerifyGeneratedGradients(in_grads, op)
               if gate_gradients and len(
                   [x for x in in_grads if x is not None]) > 1:
-                in_grads = control_flow_ops.tuple(in_grads)
+                with ops.device(None):
+                  with ops.colocate_with(None, ignore_existing=True):
+                    in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
           # If no grad_fn is defined or none of out_grads is available,
@@ -662,10 +669,10 @@ def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state):
     ready = (pending_count[x.op._id] == 0)
     if loop_state and not ready:
       ready = (pending_count[x.op._id] > 0 and
-               control_flow_ops.IsLoopSwitch(x.op))
+               control_flow_util.IsLoopSwitch(x.op))
     # pylint: enable=protected-access
     if ready:
-      if control_flow_ops.IsLoopExit(x.op):
+      if control_flow_util.IsLoopExit(x.op):
         # if x is an exit without real gradient, defer processing them.
         grad_state = loop_state.GetGradState(x.op, before=False)
         grad_state.deferred_exits.append(x)
@@ -705,7 +712,7 @@ def _SetGrad(grads, t, grad):
   if isinstance(t_grads, list):
     t_grads.append(grad)
   else:
-    assert control_flow_ops.IsLoopSwitch(op)
+    assert control_flow_util.IsLoopSwitch(op)
     op_grads[t.value_index] = grad
 
 
@@ -845,7 +852,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
   for i, out_grad in enumerate(out_grads):
     if loop_state:
       if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-        assert control_flow_ops.IsLoopSwitch(op)
+        assert control_flow_util.IsLoopSwitch(op)
         continue
     # Grads have to be Tensors or IndexedSlices
     if (isinstance(out_grad, collections.Sequence) and not all([
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index f0cffbab3035509fed68583ceff0710ee514e109..1211b2e923082d8d24b8b924227cbc52e6f2eaef 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -23,6 +23,7 @@ import warnings
 
 import numpy as np
 
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -205,6 +206,23 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gw2 = gradients.gradients(z, [w], colocate_gradients_with_ops=False)[0]
       self.assertTrue(w.op.colocation_groups() != gw2.op.colocation_groups())
 
+  def testColocateGradientsWithGateGradients(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+    with ops.Graph().as_default() as g:
+      with g.device("/device:CPU:0"):
+        x = constant(1.0, shape=[1, 1])
+        y = constant(1.0, shape=[1, 1])
+        s = x + y
+      with g.device("/device:GPU:0"):
+        z = math_ops.reduce_sum(s)
+
+      gz_x = gradients.gradients(z, [x], colocate_gradients_with_ops=True,
+                                 gate_gradients=True)[0]
+      with session.Session():
+        # Make sure the placer doesn't complain.
+        gz_x.eval()
+
   def testBoundaryStop(self):
     # Test that we don't differentiate 'x'. The gradient function for 'x' is
     # set explicitly to None so we will get an exception if the gradient code
@@ -406,8 +424,8 @@ class GradientsTest(test_util.TensorFlowTestCase):
                           constants=constants, variables=variables_))
 
     # evaluate all tensors in one call to session.run for speed
-    with self.test_session() as session:
-      results = session.run([(case["grad1"], case["grad2"]) for case in cases])
+    with self.test_session() as sess:
+      results = sess.run([(case["grad1"], case["grad2"]) for case in cases])
 
     for (npgrad1, npgrad2), case in zip(results, cases):
       for a, b in zip(npgrad1, npgrad2):
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
index 732ab8f15ab8ce7873d5454ff42cdf939cd6e5bd..ec0890c0168744e089904d94f1fddeb4f7312aca 100644
--- a/tensorflow/python/ops/hidden_ops.txt
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -21,6 +21,7 @@ ParallelConcat
 Placeholder
 RefIdentity
 Reverse
+Snapshot
 SpaceToBatch
 Split
 SplitV
@@ -223,6 +224,7 @@ BatchSelfAdjointEig
 BatchSelfAdjointEigV2
 BatchSvd
 LogMatrixDeterminant
+MatrixExponential
 MatrixSolveLs
 SelfAdjointEig
 SelfAdjointEigV2
@@ -340,6 +342,7 @@ TruncatedNormal
 # script_ops
 PyFunc
 PyFuncStateless
+EagerPyFunc
 
 # sdca_ops
 
@@ -354,6 +357,7 @@ AddSparseToTensorsMap
 AddManySparseToTensorsMap
 TakeManySparseFromTensorsMap
 DeserializeManySparse
+DeserializeSparse
 SerializeManySparse
 SerializeSparse
 SparseAdd
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 2946dbe81e6d37930874689d5c95fcdbadbbc68d..46022e2e7f1bfd061e12bd97ced8f913ca8bd1ac 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -182,8 +182,81 @@ def _CheckAtLeast3DImage(image, require_static=True):
     return []
 
 
-def fix_image_flip_shape(image, result):
-  """Set the shape to 3 dimensional if we don't know anything else.
+def _EnsureTensorIs4D(image):
+  """Converts `image` to a 4-D Tensor if it is not already one.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
+  Raises:
+    ValueError: if image is not a 3-D or 4-D Tensor.
+
+  Returns:
+    If `image` was 4-D, a 4-D float Tensor of shape
+    `[batch, width, height, channels]`
+    If `image` was 3-D, a 4-D float Tensor of shape
+    `[1, width, height, channels]`
+  """
+  original_shape = image.get_shape()
+  is_batch = True
+  if original_shape.ndims == 3:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+  elif original_shape.ndims is None:
+    is_batch = False
+    image = array_ops.expand_dims(image, 0)
+    image.set_shape([None] * 4)
+  elif original_shape.ndims != 4:
+    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+
+  return (image, is_batch)
+
+def _flip_image(image, axis, random=False, seed=None):
+  """
+  Flips image(s) around a given axis.
+
+  Args:
+    image:  4-D Tensor of shape `[batch, height, width, channels]` or
+            3-D Tensor of shape `[height, width, channels]`.
+    axis:   A Python integer representing the axis on which the image(s)
+            will be flipped. Note: The provided axis must be specified relative
+            to the shape `[batch, height, width, channels]` as 3-D images will
+            be expanded to fit this shape before being flipped.
+    random: A boolean representing whether or not we should flip the
+            image(s) at random.
+    seed:   Python integer. Used to create a random seed. See
+            tf.set_random_seed for behavior.
+
+  Raises:
+    ValueError: if image is not a 3-D or 4-D Tensor.
+
+  Returns:
+    A tensor of the same type and shape as `image`
+  """
+  image = ops.convert_to_tensor(image, name='image')
+  original_image = image
+  image, is_batch = _EnsureTensorIs4D(image)
+
+  image = control_flow_ops.with_dependencies(
+    _CheckAtLeast3DImage(image, require_static=False), image)
+
+  batch, _, _, _ = _ImageDimensions(image, rank=4)
+  flipped = array_ops.reverse(image, [axis])
+
+  if random == True:
+    uniform_random = random_ops.random_uniform([batch], 0, 1.0, seed=seed)
+    mirror_cond = math_ops.less(uniform_random, 0.5)
+    flipped = array_ops.where(mirror_cond, x=image, y=flipped)
+
+  if is_batch:
+    return fix_image_flip_shape(original_image, flipped, rank=4)
+
+  flipped = array_ops.squeeze(flipped, squeeze_dims=[0])
+  return fix_image_flip_shape(original_image, flipped, rank=3)
+
+
+def fix_image_flip_shape(image, result, rank=3):
+  """Set the shape to original dimensional if we don't know anything else.
 
   Args:
     image: original image size
@@ -195,171 +268,174 @@ def fix_image_flip_shape(image, result):
 
   image_shape = image.get_shape()
   if image_shape == tensor_shape.unknown_shape():
-    result.set_shape([None, None, None])
+    result.set_shape([None] * rank)
   else:
     result.set_shape(image_shape)
   return result
 
 
 def random_flip_up_down(image, seed=None):
-  """Randomly flips an image vertically (upside down).
+  """Randomly flips image(s) vertically (upside down).
 
-  With a 1 in 2 chance, outputs the contents of `image` flipped along the first
-  dimension, which is `height`.  Otherwise output the image as-is.
+  With a 1 in 2 chance, outputs the contents of `image` flipped along the height
+  dimension. Otherwise output the image as-is.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
       @{tf.set_random_seed}
       for behavior.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-  mirror_cond = math_ops.less(uniform_random, .5)
-  result = control_flow_ops.cond(mirror_cond,
-                                 lambda: array_ops.reverse(image, [0]),
-                                 lambda: image)
-  return fix_image_flip_shape(image, result)
+  return _flip_image(image, axis=1, random=True, seed=seed)
 
 
 def random_flip_left_right(image, seed=None):
-  """Randomly flip an image horizontally (left to right).
+  """Randomly flip image(s) horizontally (left to right).
 
   With a 1 in 2 chance, outputs the contents of `image` flipped along the
-  second dimension, which is `width`.  Otherwise output the image as-is.
+  width dimension. Otherwise output the image as-is.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
       @{tf.set_random_seed}
       for behavior.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-  mirror_cond = math_ops.less(uniform_random, .5)
-  result = control_flow_ops.cond(mirror_cond,
-                                 lambda: array_ops.reverse(image, [1]),
-                                 lambda: image)
-  return fix_image_flip_shape(image, result)
+  return _flip_image(image, axis=2, random=True, seed=seed)
 
 
 def flip_left_right(image):
   """Flip an image horizontally (left to right).
 
-  Outputs the contents of `image` flipped along the second dimension, which is
-  `width`.
+  Outputs the contents of `image` flipped along the width dimension.
 
   See also `reverse()`.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
-
+  return _flip_image(image, axis=2, random=False)
 
 def flip_up_down(image):
   """Flip an image vertically (upside down).
 
-  Outputs the contents of `image` flipped along the first dimension, which is
-  `height`.
+  Outputs the contents of `image` flipped along the height dimension.
 
   See also `reverse()`.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  image = ops.convert_to_tensor(image, name='image')
-  image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+  return _flip_image(image, axis=1, random=False)
 
 
 def rot90(image, k=1, name=None):
-  """Rotate an image counter-clockwise by 90 degrees.
+  """Rotate image(s) counter-clockwise by 90 degrees.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels]`.
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     k: A scalar integer. The number of times the image is rotated by 90 degrees.
     name: A name for this operation (optional).
 
   Returns:
-    A rotated 3-D tensor of the same type and shape as `image`.
+    A rotated of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` not supported.
   """
   with ops.name_scope(name, 'rot90', [image, k]) as scope:
     image = ops.convert_to_tensor(image, name='image')
+    image, is_batch = _EnsureTensorIs4D(image)
     image = control_flow_ops.with_dependencies(
-        _Check3DImage(image, require_static=False), image)
+        _CheckAtLeast3DImage(image, require_static=False), image)
     k = ops.convert_to_tensor(k, dtype=dtypes.int32, name='k')
     k.get_shape().assert_has_rank(0)
     k = math_ops.mod(k, 4)
 
     def _rot90():
-      return array_ops.transpose(array_ops.reverse_v2(image, [1]),
-                                 [1, 0, 2])
+      return array_ops.transpose(array_ops.reverse_v2(image, [2]),
+                                 [0, 2, 1, 3])
     def _rot180():
-      return array_ops.reverse_v2(image, [0, 1])
+      return array_ops.reverse_v2(image, [1, 2])
     def _rot270():
-      return array_ops.reverse_v2(array_ops.transpose(image, [1, 0, 2]),
-                                  [1])
+      return array_ops.reverse_v2(array_ops.transpose(image, [0, 2, 1, 3]),
+                                  [2])
     cases = [(math_ops.equal(k, 1), _rot90),
              (math_ops.equal(k, 2), _rot180),
              (math_ops.equal(k, 3), _rot270)]
 
-    ret = control_flow_ops.case(cases, default=lambda: image, exclusive=True,
+    result = control_flow_ops.case(cases, default=lambda: image, exclusive=True,
                                 name=scope)
-    ret.set_shape([None, None, image.get_shape()[2]])
-    return ret
+
+    shape = image.get_shape()
+    result.set_shape([shape[0], None, None, shape[3]])
+
+    if is_batch == True:
+      return result
+
+    result = array_ops.squeeze(result, squeeze_dims=[0])
+    return result
 
 
 def transpose_image(image):
-  """Transpose an image by swapping the first and second dimension.
+  """Transpose an image by swapping the height and width dimension.
 
   See also `transpose()`.
 
   Args:
-    image: 3-D tensor of shape `[height, width, channels]`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
 
   Returns:
-    A 3-D tensor of shape `[width, height, channels]`
+    If `image` was 4-D, a 4-D float Tensor of shape
+    `[batch, width, height, channels]`
+    If `image` was 3-D, a 3-D float Tensor of shape
+    `[width, height, channels]`
 
   Raises:
     ValueError: if the shape of `image` not supported.
   """
   image = ops.convert_to_tensor(image, name='image')
+  image, is_batch = _EnsureTensorIs4D(image)
   image = control_flow_ops.with_dependencies(
-      _Check3DImage(image, require_static=False), image)
-  return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
+      _CheckAtLeast3DImage(image, require_static=False), image)
+
+  result = array_ops.transpose(image, [0, 2, 1, 3], name='transpose_image')
+
+  if is_batch:
+    return result
+
+  result = array_ops.squeeze(result, squeeze_dims=[0])
+  return result
 
 
 def central_crop(image, central_fraction):
@@ -445,21 +521,9 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height,
       negative.
   """
   image = ops.convert_to_tensor(image, name='image')
-
-  is_batch = True
-  image_shape = image.get_shape()
-  if image_shape.ndims == 3:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-  elif image_shape.ndims is None:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-    image.set_shape([None] * 4)
-  elif image_shape.ndims != 4:
-    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  image, is_batch = _EnsureTensorIs4D(image)
 
   assert_ops = _CheckAtLeast3DImage(image, require_static=False)
-
   batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   after_padding_width = target_width - offset_width - width
@@ -524,21 +588,9 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height,
       negative, or either `target_height` or `target_width` is not positive.
   """
   image = ops.convert_to_tensor(image, name='image')
-
-  is_batch = True
-  image_shape = image.get_shape()
-  if image_shape.ndims == 3:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-  elif image_shape.ndims is None:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-    image.set_shape([None] * 4)
-  elif image_shape.ndims != 4:
-    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  image, is_batch = _EnsureTensorIs4D(image)
 
   assert_ops = _CheckAtLeast3DImage(image, require_static=False)
-
   batch, height, width, depth = _ImageDimensions(image, rank=4)
 
   assert_ops += _assert(offset_width >= 0, ValueError,
@@ -599,17 +651,7 @@ def resize_image_with_crop_or_pad(image, target_height, target_width):
     `[new_height, new_width, channels]`.
   """
   image = ops.convert_to_tensor(image, name='image')
-  image_shape = image.get_shape()
-  is_batch = True
-  if image_shape.ndims == 3:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-  elif image_shape.ndims is None:
-    is_batch = False
-    image = array_ops.expand_dims(image, 0)
-    image.set_shape([None] * 4)
-  elif image_shape.ndims != 4:
-    raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  image, is_batch = _EnsureTensorIs4D(image)
 
   assert_ops = _CheckAtLeast3DImage(image, require_static=False)
   assert_ops += _assert(target_width > 0, ValueError,
@@ -1119,9 +1161,8 @@ def rgb_to_grayscale(images, name=None):
     # https://en.wikipedia.org/wiki/Luma_%28video%29
     rgb_weights = [0.2989, 0.5870, 0.1140]
     rank_1 = array_ops.expand_dims(array_ops.rank(images) - 1, 0)
-    gray_float = math_ops.reduce_sum(flt_image * rgb_weights,
-                                     rank_1,
-                                     keep_dims=True)
+    gray_float = math_ops.reduce_sum(
+        flt_image * rgb_weights, rank_1, keepdims=True)
     gray_float.set_shape(images.get_shape()[:-1].concatenate([1]))
     return convert_image_dtype(gray_float, orig_dtype, name=name)
 
@@ -1169,7 +1210,7 @@ def random_hue(image, max_delta, seed=None):
       set_random_seed for its interaction with the graph-level random seed.
 
   Returns:
-    3-D float tensor of shape `[height, width, channels]`.
+    Adjusted image(s), same shape and DType as `image`.
 
   Raises:
     ValueError: if `max_delta` is invalid.
@@ -1212,26 +1253,7 @@ def adjust_hue(image, delta, name=None):
     orig_dtype = image.dtype
     flt_image = convert_image_dtype(image, dtypes.float32)
 
-    # TODO(zhengxq): we will switch to the fused version after we add a GPU
-    # kernel for that.
-    fused = os.environ.get('TF_ADJUST_HUE_FUSED', '')
-    fused = fused.lower() in ('true', 't', '1')
-
-    if not fused:
-      hsv = gen_image_ops.rgb_to_hsv(flt_image)
-
-      hue = array_ops.slice(hsv, [0, 0, 0], [-1, -1, 1])
-      saturation = array_ops.slice(hsv, [0, 0, 1], [-1, -1, 1])
-      value = array_ops.slice(hsv, [0, 0, 2], [-1, -1, 1])
-
-      # Note that we add 2*pi to guarantee that the resulting hue is a positive
-      # floating point number since delta is [-0.5, 0.5].
-      hue = math_ops.mod(hue + (delta + 1.), 1.)
-
-      hsv_altered = array_ops.concat([hue, saturation, value], 2)
-      rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
-    else:
-      rgb_altered = gen_image_ops.adjust_hue(flt_image, delta)
+    rgb_altered = gen_image_ops.adjust_hue(flt_image, delta)
 
     return convert_image_dtype(rgb_altered, orig_dtype)
 
@@ -1295,30 +1317,9 @@ def adjust_saturation(image, saturation_factor, name=None):
     orig_dtype = image.dtype
     flt_image = convert_image_dtype(image, dtypes.float32)
 
-    # TODO(zhengxq): we will switch to the fused version after we add a GPU
-    # kernel for that.
-    fused = os.environ.get('TF_ADJUST_SATURATION_FUSED', '')
-    fused = fused.lower() in ('true', 't', '1')
-
-    if fused:
-      return convert_image_dtype(
-          gen_image_ops.adjust_saturation(flt_image, saturation_factor),
-          orig_dtype)
-
-    hsv = gen_image_ops.rgb_to_hsv(flt_image)
-
-    hue = array_ops.slice(hsv, [0, 0, 0], [-1, -1, 1])
-    saturation = array_ops.slice(hsv, [0, 0, 1], [-1, -1, 1])
-    value = array_ops.slice(hsv, [0, 0, 2], [-1, -1, 1])
-
-    saturation *= saturation_factor
-    saturation = clip_ops.clip_by_value(saturation, 0.0, 1.0)
-
-    hsv_altered = array_ops.concat([hue, saturation, value], 2)
-    rgb_altered = gen_image_ops.hsv_to_rgb(hsv_altered)
-
-    return convert_image_dtype(rgb_altered, orig_dtype)
-
+    return convert_image_dtype(
+        gen_image_ops.adjust_saturation(flt_image, saturation_factor),
+        orig_dtype)
 
 def decode_image(contents, channels=None, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index d1554b399f3776933bf970f7b2ceb8db5865d844..f320b52b09084bbe077c380386ab55ee0b0320e3 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -281,6 +281,21 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testBatchAdjustHue(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    delta = 0.25
+    y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_np, shape=x_shape)
+      y = image_ops.adjust_hue(x, delta)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def _adjustHueNp(self, x_np, delta_h):
     self.assertEqual(x_np.shape[-1], 3)
     x_v = x_np.reshape([-1, 3])
@@ -632,6 +647,21 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testBatchSaturation(self):
+    x_shape = [2, 1, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
+
+    saturation_factor = 0.5
+    y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
+    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
+
+    with self.test_session(use_gpu=True):
+      x = constant_op.constant(x_np, shape=x_shape)
+      y = image_ops.adjust_saturation(x, saturation_factor)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def _adjust_saturation(self, image, saturation_factor):
     image = ops.convert_to_tensor(image, name="image")
     orig_dtype = image.dtype
@@ -729,7 +759,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
 
 class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
-  def testIdempotentLeftRight(self):
+  def testInvolutionLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
@@ -737,6 +767,15 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, x_np)
 
+  def testInvolutionLeftRightWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
+
   def testLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
@@ -747,17 +786,30 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testLeftRightWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array([[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_left_right(x_tf)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def testRandomFlipLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
+    seed = 42
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_left_right(x_tf)
+      y = image_ops.random_flip_left_right(x_tf, seed=seed)
 
       count_flipped = 0
       count_unflipped = 0
-      for _ in range(50):
+      for _ in range(100):
         y_tf = y.eval()
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
@@ -765,10 +817,46 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         else:
           self.assertAllEqual(y_tf, y_np)
           count_flipped += 1
-      self.assertGreaterEqual(count_flipped, 1)
-      self.assertGreaterEqual(count_unflipped, 1)
+      # 100 trials
+      # Mean: 50
+      # Std Dev: ~5
+      # Six Sigma: 50 - (5 * 6) = 20
+      self.assertGreaterEqual(count_flipped, 20)
+      self.assertGreaterEqual(count_unflipped, 20)
+
+  def testRandomFlipLeftRightWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array([[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    seed = 42
 
-  def testIdempotentUpDown(self):
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape).eval()
+      y = image_ops.random_flip_left_right(x_tf, seed=seed)
+      count_flipped = 0
+      count_unflipped = 0
+      for _ in range(50):
+        y_tf = y.eval()
+        for index in range(0, x_tf.shape[0]):
+          current_x_tf = x_tf[index]
+          current_y_tf = y_tf[index]
+          current_y_np = y_np[index]
+
+          if current_y_tf[0][0] == 1:
+            self.assertAllEqual(current_y_tf, current_x_tf)
+            count_unflipped += 1
+          else:
+            self.assertAllEqual(current_y_tf, current_y_np)
+            count_flipped += 1
+      # Batch size 2 * 50 trials = 100
+      # Mean: 50
+      # Std Dev: ~5
+      # Six Sigma: 50 - (5 * 6) = 20
+      self.assertGreaterEqual(count_flipped, 20)
+      self.assertGreaterEqual(count_unflipped, 20)
+
+  def testInvolutionUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
     with self.test_session(use_gpu=True):
@@ -777,6 +865,16 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, x_np)
 
+  def testInvolutionUpDownWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
+
   def testUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
@@ -787,16 +885,29 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testUpDownWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array([[[4, 5, 6], [1, 2, 3]], [[10, 11, 12], [7, 8, 9]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.flip_up_down(x_tf)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def testRandomFlipUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
+    seed = 42
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_up_down(x_tf)
+      y = image_ops.random_flip_up_down(x_tf, seed=42)
       count_flipped = 0
       count_unflipped = 0
-      for _ in range(50):
+      for _ in range(100):
         y_tf = y.eval()
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
@@ -804,10 +915,45 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         else:
           self.assertAllEqual(y_tf, y_np)
           count_flipped += 1
-      self.assertGreaterEqual(count_flipped, 1)
-      self.assertGreaterEqual(count_unflipped, 1)
+      # 100 trials
+      # Mean: 50
+      # Std Dev: ~5
+      # Six Sigma: 50 - (5 * 6) = 20
+      self.assertGreaterEqual(count_flipped, 20)
+      self.assertGreaterEqual(count_unflipped, 20)
+
+  def testRandomFlipUpDownWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    y_np = np.array([[[4, 5, 6], [1, 2, 3]], [[4, 5, 6], [1, 2, 3]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+    seed = 42
 
-  def testIdempotentTranspose(self):
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape).eval()
+      y = image_ops.random_flip_up_down(x_tf, seed=42)
+      count_flipped = 0
+      count_unflipped = 0
+      for _ in range(50):
+        y_tf = y.eval()
+        for index in range(0, x_tf.shape[0]):
+          current_x_tf = x_tf[index]
+          current_y_tf = y_tf[index]
+          current_y_np = y_np[index]
+        if current_y_tf[0][0] == 1:
+          self.assertAllEqual(current_y_tf, current_x_tf)
+          count_unflipped += 1
+        else:
+          self.assertAllEqual(current_y_tf, current_y_np)
+          count_flipped += 1
+      # Batch size 2 * 50 trials = 100
+      # Mean: 50
+      # Std Dev: ~5
+      # Six Sigma: 50 - (5 * 6) = 20
+      self.assertGreaterEqual(count_flipped, 20)
+      self.assertGreaterEqual(count_unflipped, 20)
+
+  def testInvolutionTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
     with self.test_session(use_gpu=True):
@@ -816,6 +962,16 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, x_np)
 
+  def testInvolutionTransposeWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, x_np)
+
   def testTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
@@ -826,11 +982,28 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       y_tf = y.eval()
       self.assertAllEqual(y_tf, y_np)
 
+  def testTransposeWithBatch(self):
+    x_np = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
+                    dtype=np.uint8).reshape([2, 2, 3, 1])
+
+    y_np = np.array([[[1, 4], [2, 5], [3, 6]], [[7, 10], [8, 11], [9, 12]]],
+                    dtype=np.uint8).reshape([2, 3, 2, 1])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.transpose_image(x_tf)
+      y_tf = y.eval()
+      self.assertAllEqual(y_tf, y_np)
+
   def testPartialShapes(self):
     p_unknown_rank = array_ops.placeholder(dtypes.uint8)
-    p_unknown_dims = array_ops.placeholder(
+    p_unknown_dims_3 = array_ops.placeholder(
         dtypes.uint8, shape=[None, None, None])
+    p_unknown_dims_4 = array_ops.placeholder(
+        dtypes.uint8, shape=[None, None, None, None])
     p_unknown_width = array_ops.placeholder(dtypes.uint8, shape=[64, None, 3])
+    p_unknown_batch = array_ops.placeholder(dtypes.uint8,
+                                            shape=[None, 64, 64, 3])
 
     p_wrong_rank = array_ops.placeholder(dtypes.uint8, shape=[None, None])
     p_zero_dim = array_ops.placeholder(dtypes.uint8, shape=[64, 0, 3])
@@ -842,12 +1015,17 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     ]:
       transformed_unknown_rank = op(p_unknown_rank)
       self.assertEqual(3, transformed_unknown_rank.get_shape().ndims)
-      transformed_unknown_dims = op(p_unknown_dims)
-      self.assertEqual(3, transformed_unknown_dims.get_shape().ndims)
+      transformed_unknown_dims_3 = op(p_unknown_dims_3)
+      self.assertEqual(3, transformed_unknown_dims_3.get_shape().ndims)
+      transformed_unknown_dims_4 = op(p_unknown_dims_4)
+      self.assertEqual(4, transformed_unknown_dims_4.get_shape().ndims)
       transformed_unknown_width = op(p_unknown_width)
       self.assertEqual(3, transformed_unknown_width.get_shape().ndims)
+      transformed_unknown_batch = op(p_unknown_batch)
+      self.assertEqual(4, transformed_unknown_batch.get_shape().ndims)
 
-      with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
+      with self.assertRaisesRegexp(ValueError,
+                                   "must have either 3 or 4 dimensions."):
         op(p_wrong_rank)
       with self.assertRaisesRegexp(ValueError, "must be > 0"):
         op(p_zero_dim)
@@ -860,6 +1038,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         rotated = image_ops.rot90(rotated)
       self.assertAllEqual(image, rotated.eval())
 
+  def testRot90GroupOrderWithBatch(self):
+    image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
+    with self.test_session(use_gpu=True):
+      rotated = image
+      for _ in xrange(4):
+        rotated = image_ops.rot90(rotated)
+      self.assertAllEqual(image, rotated.eval())
+
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -869,6 +1055,14 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         y_np = np.rot90(image, k=k)
         self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
+  def testRot90NumpyEquivalenceWithBatch(self):
+    image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
+    with self.test_session(use_gpu=True):
+      k_placeholder = array_ops.placeholder(dtypes.int32, shape=[])
+      y_tf = image_ops.rot90(image, k_placeholder)
+      for k in xrange(4):
+        y_np = np.rot90(image, k=k, axes=(1, 2))
+        self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
 class RandomFlipTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 04a15e3e5bc548f99bd5d4ad1fcbf0fa22b4d1ef..bf15f0e2e55385032b194c7718e175114e77dd7b 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -38,6 +38,7 @@ diag_part = array_ops.matrix_diag_part
 eigh = linalg_ops.self_adjoint_eig
 eigvalsh = linalg_ops.self_adjoint_eigvals
 einsum = special_math_ops.einsum
+expm = gen_linalg_ops._matrix_exponential
 eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
 lstsq = linalg_ops.matrix_solve_ls
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 3d0ea3e11becae185710b140c2a84123a6b848b2..2c11f90e6d9de280e6020edfaa4d8ef237126705 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -66,11 +66,23 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     rtol = self._rtol[dtype]
     self.assertAllClose(x, y, atol=atol, rtol=rtol)
 
+  @property
+  def _adjoint_options(self):
+    return [False, True]
+
+  @property
+  def _adjoint_arg_options(self):
+    return [False, True]
+
   @property
   def _dtypes_to_test(self):
     # TODO(langmore) Test tf.float16 once tf.matrix_solve works in 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
+  @property
+  def _use_placeholder_options(self):
+    return [False, True]
+
   @abc.abstractproperty
   def _shapes_to_test(self):
     """Returns list of tuples, each is one shape that will be tested."""
@@ -151,7 +163,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_to_dense(self):
     self._skip_if_tests_to_skip_contains("to_dense")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
@@ -166,7 +178,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_det(self):
     self._skip_if_tests_to_skip_contains("det")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
@@ -183,7 +195,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_log_abs_det(self):
     self._skip_if_tests_to_skip_contains("log_abs_det")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
@@ -200,11 +212,11 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_matmul(self):
     self._skip_if_tests_to_skip_contains("matmul")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
-          for adjoint in False, True:
-            for adjoint_arg in False, True:
+          for adjoint in self._adjoint_options:
+            for adjoint_arg in self._adjoint_arg_options:
               with self.test_session(graph=ops.Graph()) as sess:
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
@@ -228,11 +240,11 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_solve(self):
     self._skip_if_tests_to_skip_contains("solve")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
-          for adjoint in False, True:
-            for adjoint_arg in False, True:
+          for adjoint in self._adjoint_options:
+            for adjoint_arg in self._adjoint_arg_options:
               with self.test_session(graph=ops.Graph()) as sess:
                 sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
                 operator, mat, feed_dict = self._operator_and_mat_and_feed_dict(
@@ -257,7 +269,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
@@ -274,7 +286,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_add_to_tensor(self):
     self._skip_if_tests_to_skip_contains("add_to_tensor")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
@@ -293,7 +305,7 @@ class LinearOperatorDerivedClassTest(test.TestCase):
 
   def test_diag_part(self):
     self._skip_if_tests_to_skip_contains("diag_part")
-    for use_placeholder in False, True:
+    for use_placeholder in self._use_placeholder_options:
       for shape in self._shapes_to_test:
         for dtype in self._dtypes_to_test:
           with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 2cb467c89157b2f78c5bc3ccc037360836b00ee7..be9beee633bb7c900b1618c2922b6eff5bf65df0 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.gen_linalg_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 
 # Names below are lower_case.
 # pylint: disable=invalid-name
@@ -438,7 +439,14 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
 
 
 # pylint: disable=redefined-builtin
-def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
+@deprecation.deprecated_args(
+    None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims')
+def norm(tensor,
+         ord='euclidean',
+         axis=None,
+         keepdims=None,
+         name=None,
+         keep_dims=None):
   r"""Computes the norm of vectors, matrices, and tensors.
 
   This function can compute several different vector norms (the 1-norm, the
@@ -471,13 +479,14 @@ def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
       can be either a matrix or a batch of matrices at runtime, pass
       `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
       computed.
-    keep_dims: If True, the axis indicated in `axis` are kept with size 1.
+    keepdims: If True, the axis indicated in `axis` are kept with size 1.
       Otherwise, the dimensions in `axis` are removed from the output shape.
     name: The name of the op.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     output: A `Tensor` of the same type as tensor, containing the vector or
-      matrix norms. If `keep_dims` is True then the rank of output is equal to
+      matrix norms. If `keepdims` is True then the rank of output is equal to
       the rank of `tensor`. Otherwise, if `axis` is none the output is a scalar,
       if `axis` is an integer, the rank of `output` is one less than the rank
       of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less
@@ -496,6 +505,10 @@ def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
      higher order tensors.
   @end_compatibility
   """
+  keepdims = deprecation.deprecated_argument_lookup('keepdims', keepdims,
+                                                    'keep_dims', keep_dims)
+  if keepdims is None:
+    keepdims = False
 
   is_matrix_norm = ((isinstance(axis, tuple) or isinstance(axis, list)) and
                     len(axis) == 2)
@@ -528,25 +541,25 @@ def norm(tensor, ord='euclidean', axis=None, keep_dims=False, name=None):
       # matrices.
       result = math_ops.sqrt(
           math_ops.reduce_sum(
-              tensor * math_ops.conj(tensor), axis, keep_dims=True))
+              tensor * math_ops.conj(tensor), axis, keepdims=True))
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
         sum_axis = None if axis is None else axis[0]
-        result = math_ops.reduce_sum(result, sum_axis, keep_dims=True)
+        result = math_ops.reduce_sum(result, sum_axis, keepdims=True)
         if is_matrix_norm:
-          result = math_ops.reduce_max(result, axis[-1], keep_dims=True)
+          result = math_ops.reduce_max(result, axis[-1], keepdims=True)
       elif ord == np.inf:
         if is_matrix_norm:
-          result = math_ops.reduce_sum(result, axis[1], keep_dims=True)
+          result = math_ops.reduce_sum(result, axis[1], keepdims=True)
         max_axis = None if axis is None else axis[0]
-        result = math_ops.reduce_max(result, max_axis, keep_dims=True)
+        result = math_ops.reduce_max(result, max_axis, keepdims=True)
       else:
         # General p-norms (positive p only)
         result = math_ops.pow(
-            math_ops.reduce_sum(
-                math_ops.pow(result, ord), axis, keep_dims=True), 1.0 / ord)
-    if not keep_dims:
+            math_ops.reduce_sum(math_ops.pow(result, ord), axis, keepdims=True),
+            1.0 / ord)
+    if not keepdims:
       result = array_ops.squeeze(result, axis)
     return result
 
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 08e3f83a0b21a8444ad3500c62fe624440edc255..51ab2aec2298a9072c90c226992f122a804ec02e 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -39,8 +39,8 @@ def Print(input_, data, message=None, first_n=None, summarize=None,
           name=None):
   """Prints a list of tensors.
 
-  This is an identity op with the side effect of printing `data` when
-  evaluating.
+  This is an identity op (behaves like `tf.identity`) with the side effect
+  of printing `data` when evaluating.
 
   Note: This op prints to the standard error. It is not currently compatible
     with jupyter notebook (printing to the notebook *server's* output, not into
@@ -57,7 +57,7 @@ def Print(input_, data, message=None, first_n=None, summarize=None,
     name: A name for the operation (optional).
 
   Returns:
-    Same tensor as `input_`.
+    A `Tensor`. Has the same type and contents as `input_`.
   """
   return gen_logging_ops._print(input_, data, message, first_n, summarize, name)
 
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index fa58ffc37e212a4000bfcb56e9c8400e1e0546de..227e1a52655a043c2b119ac98b4bd83d35a24b1d 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -561,9 +561,9 @@ class TextFileStringTableInitializer(TextFileInitializer):
         The path must be accessible from wherever the graph is initialized
         (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
       key_column_index: The column index from the text file to get the keys
-        from. The default is 0 that represents the whole line content.
+        from. The default is to use the line number, starting from zero.
       value_column_index: The column index from the text file to get the
-        values from. The default is to use the line number, starting from zero.
+        values from. The default is to use the whole line content.
       vocab_size: The number of elements in the file, if known.
       delimiter: The delimiter to separate fields in a line.
       name: Optional name for the op.
@@ -613,9 +613,9 @@ class TextFileIdTableInitializer(TextFileInitializer):
         The path must be accessible from wherever the graph is initialized
         (eg. trainer or eval workers). The filename may be a scalar `Tensor`.
       key_column_index: The column index from the text file to get the `key`
+        values from. The default is to use the whole line content.
+      value_column_index: The column index from the text file to get the `value`
         values from. The default is to use the line number, starting from zero.
-      value_column_index: The column index from the text file ro get the `value`
-        values from. The default is 0 that represents the whole line content.
       vocab_size: The number of elements in the file, if known.
       delimiter: The delimiter to separate fields in a line.
       name: Optional name for the op.
@@ -688,19 +688,22 @@ class IdTableWithHashBuckets(LookupInterface):
 
   For example, if an instance of `IdTableWithHashBuckets` is initialized with a
   string-to-id table that maps:
-  - emerson -> 0
-  - lake -> 1
-  - palmer -> 2
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
 
   The `IdTableWithHashBuckets` object will performs the following mapping:
-  - emerson -> 0
-  - lake -> 1
-  - palmer -> 2
-  - <other term> -> bucket id between 3 and 3 + num_oov_buckets - 1, calculated
-    by: hash(<term>) % num_oov_buckets + vocab_size
 
-  If input_tensor is ["emerson", "lake", "palmer", "king", "crimson"],
-  the lookup result is [0, 1, 2, 4, 7]
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+  * `<other term> -> bucket_id`, where bucket_id will be between `3` and
+  `3 + num_oov_buckets - 1`, calculated by:
+  `hash(<term>) % num_oov_buckets + vocab_size`
+
+  If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`,
+  the lookup result is `[0, 1, 2, 4, 7]`.
 
   If `table` is None, only out-of-vocabulary buckets are used.
 
@@ -789,6 +792,25 @@ class IdTableWithHashBuckets(LookupInterface):
     with ops.name_scope(None, "init"):
       return control_flow_ops.no_op()
 
+  @property
+  def table_ref(self):
+    """Returns the table_ref of the underlying table, if one exists.
+
+    Only use the table_ref directly if you know what you are doing. The
+    table_ref does not have the "hash bucket" functionality, as that is provided
+    by this class.
+
+    One possible use of the table_ref is subtokenization, i.e. ops which
+    dynamically decompose tokens into subtokens based on the contents of the
+    table_ref.
+
+    Returns:
+      the underlying table_ref, or None if there is no underlying table
+    """
+    if self._table is not None:
+      return self._table.table_ref
+    return None
+
   def size(self, name=None):
     """Compute the number of elements in this table."""
     with ops.name_scope(name, "%s_Size" % self.name) as scope:
@@ -864,7 +886,10 @@ def index_table_from_file(vocabulary_file=None,
                           default_value=-1,
                           hasher_spec=FastHashSpec,
                           key_dtype=dtypes.string,
-                          name=None):
+                          name=None,
+                          key_column_index=TextFileIndex.WHOLE_LINE,
+                          value_column_index=TextFileIndex.LINE_NUMBER,
+                          delimiter="\t"):
   """Returns a lookup table that converts a string tensor into int64 IDs.
 
   This operation constructs a lookup table to convert tensor of strings into
@@ -881,6 +906,16 @@ def index_table_from_file(vocabulary_file=None,
   The underlying table must be initialized by calling
   `tf.tables_initializer.run()` or `table.init.run()` once.
 
+  To specify multi-column vocabulary files, use key_column_index and
+  value_column_index and delimiter.
+
+  - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+    expects data type int64.
+  - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+    type string.
+  - A value >=0 means use the index (starting at zero) of the split line based
+    on `delimiter`.
+
   Sample Usages:
 
   If we have a vocabulary file "test.txt" with the following content:
@@ -912,6 +947,11 @@ def index_table_from_file(vocabulary_file=None,
       assignation of out-of-vocabulary buckets.
     key_dtype: The `key` data type.
     name: A name for this op (optional).
+    key_column_index: The column index from the text file to get the `key`
+      values from. The default is to use the whole line content.
+    value_column_index: The column index from the text file to get the `value`
+      values from. The default is to use the line number, starting from zero.
+    delimiter: The delimiter to separate fields in a line.
 
   Returns:
     The lookup table to map a `key_dtype` `Tensor` to index `int64` `Tensor`.
@@ -944,19 +984,22 @@ def index_table_from_file(vocabulary_file=None,
         # Keep the shared_name:
         # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
         shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                  TextFileIndex.WHOLE_LINE,
-                                                  TextFileIndex.LINE_NUMBER)
+                                                  key_column_index,
+                                                  value_column_index)
       else:
         # Keep the shared_name
         # <table_type>_<filename>_<key_index>_<value_index>
         shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
-                                               TextFileIndex.WHOLE_LINE,
-                                               TextFileIndex.LINE_NUMBER)
+                                               key_column_index,
+                                               value_column_index)
       init = TextFileIdTableInitializer(
           vocabulary_file,
           vocab_size=vocab_size,
           key_dtype=dtypes.int64 if key_dtype.is_integer else key_dtype,
-          name="table_init")
+          name="table_init",
+          key_column_index=key_column_index,
+          value_column_index=value_column_index,
+          delimiter=delimiter)
 
       table = HashTable(
           init, default_value, shared_name=shared_name, name=hash_table_scope)
@@ -1074,7 +1117,10 @@ def index_table_from_tensor(vocabulary_list,
 def index_to_string_table_from_file(vocabulary_file,
                                     vocab_size=None,
                                     default_value="UNK",
-                                    name=None):
+                                    name=None,
+                                    key_column_index=TextFileIndex.LINE_NUMBER,
+                                    value_column_index=TextFileIndex.WHOLE_LINE,
+                                    delimiter="\t"):
   """Returns a lookup table that maps a `Tensor` of indices into strings.
 
   This operation constructs a lookup table to map int64 indices into string
@@ -1088,6 +1134,16 @@ def index_to_string_table_from_file(vocabulary_file,
   The underlying table must be initialized by calling
   `tf.tables_initializer.run()` or `table.init.run()` once.
 
+  To specify multi-column vocabulary files, use key_column_index and
+  value_column_index and delimiter.
+
+  - TextFileIndex.LINE_NUMBER means use the line number starting from zero,
+    expects data type int64.
+  - TextFileIndex.WHOLE_LINE means use the whole line content, expects data
+    type string.
+  - A value >=0 means use the index (starting at zero) of the split line based
+    on `delimiter`.
+
   Sample Usages:
 
   If we have a vocabulary file "test.txt" with the following content:
@@ -1114,6 +1170,11 @@ def index_to_string_table_from_file(vocabulary_file,
     vocab_size: Number of the elements in the vocabulary, if known.
     default_value: The value to use for out-of-vocabulary indices.
     name: A name for this op (optional).
+    key_column_index: The column index from the text file to get the `key`
+      values from. The default is to use the line number, starting from zero.
+    value_column_index: The column index from the text file to get the `value`
+      values from. The default is to use the whole line content.
+    delimiter: The delimiter to separate fields in a line.
 
   Returns:
     The lookup table to map a string values associated to a given index `int64`
@@ -1134,15 +1195,19 @@ def index_to_string_table_from_file(vocabulary_file,
       # Keep a shared_name
       # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
       shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                TextFileIndex.LINE_NUMBER,
-                                                TextFileIndex.WHOLE_LINE)
+                                                key_column_index,
+                                                value_column_index)
     else:
       # Keep a shared_name <table_type>_<filename>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
-                                             TextFileIndex.LINE_NUMBER,
-                                             TextFileIndex.WHOLE_LINE)
+      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file, key_column_index,
+                                             value_column_index)
     init = TextFileStringTableInitializer(
-        vocabulary_file, vocab_size=vocab_size, name="table_init")
+        vocabulary_file,
+        vocab_size=vocab_size,
+        name="table_init",
+        key_column_index=key_column_index,
+        value_column_index=value_column_index,
+        delimiter=delimiter)
 
     # TODO(yleon): Use a more effienct structure.
     return HashTable(init, default_value, shared_name=shared_name, name=scope)
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 55a18d28cae5c2326db98c4fed2f6bf38b39a0b0..b74971f654294e25e131a6ba21d982da16cf4264 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -652,7 +652,7 @@ def softmax_cross_entropy(
 
   Args:
     onehot_labels: `[batch_size, num_classes]` target one-hot-encoded labels.
-    logits: [batch_size, num_classes] logits outputs of the network .
+    logits: `[batch_size, num_classes]` logits outputs of the network .
     weights: Optional `Tensor` whose rank is either 0, or rank 1 and is
       broadcastable to the loss which is a `Tensor` of shape `[batch_size]`.
     label_smoothing: If greater than 0 then smooth the labels.
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 38fe093ba7236ff7fe7b580a893501c84c71f6b1..0239396ae32fe62bc75fb19bb05cb2e8e0e8695e 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -184,6 +184,15 @@ def _SparseSegmentSumGrad(op, grad):
           None)
 
 
+@ops.RegisterGradient("SparseSegmentSumWithNumSegments")
+def _SparseSegmentSumWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentSumWithNumSegments."""
+  input_rows = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.unsorted_segment_sum(
+      array_ops.gather(grad, op.inputs[2]), op.inputs[1], input_rows), None,
+          None, None)
+
+
 @ops.RegisterGradient("SparseSegmentMean")
 def _SparseSegmentMeanGrad(op, grad):
   """Gradient for SparseSegmentMean."""
@@ -192,6 +201,14 @@ def _SparseSegmentMeanGrad(op, grad):
                                             dim0), None, None)
 
 
+@ops.RegisterGradient("SparseSegmentMeanWithNumSegments")
+def _SparseSegmentMeanWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentMeanWithNumSegments."""
+  dim0 = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.sparse_segment_mean_grad(grad, op.inputs[1], op.inputs[2],
+                                            dim0), None, None, None)
+
+
 @ops.RegisterGradient("SparseSegmentSqrtN")
 def _SparseSegmentSqrtNGrad(op, grad):
   """Gradient for SparseSegmentSqrtN."""
@@ -200,6 +217,14 @@ def _SparseSegmentSqrtNGrad(op, grad):
                                               dim0), None, None)
 
 
+@ops.RegisterGradient("SparseSegmentSqrtNWithNumSegments")
+def _SparseSegmentSqrtNWithNumSegmentsGrad(op, grad):
+  """Gradient for SparseSegmentSqrtNWithNumSegmnets."""
+  dim0 = array_ops.shape(op.inputs[0])[0]
+  return (math_ops.sparse_segment_sqrt_n_grad(grad, op.inputs[1], op.inputs[2],
+                                              dim0), None, None, None)
+
+
 def _SegmentMinOrMaxGrad(op, grad, is_sorted):
   """Gradient for SegmentMin and (unsorted) SegmentMax. They share similar code."""
   zeros = array_ops.zeros(array_ops.shape(op.inputs[0]),
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 5732c756cef30a2d9e2ede72459ff2e5c3822910..04eeb00518a3afa2e2ee36e84eee133304590779 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -113,6 +113,23 @@ class MinOrMaxGradientTest(test.TestCase):
       self.assertLess(error, 1e-4)
 
 
+class MaximumOrMinimumGradientTest(test.TestCase):
+
+  def testMaximumGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs = math_ops.maximum(inputs, 3.0)
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
+      self.assertLess(error, 1e-4)
+
+  def testMinimumGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs = math_ops.minimum(inputs, 2.0)
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
+      self.assertLess(error, 1e-4)
+
+
 class ProdGradientTest(test.TestCase):
 
   def testProdGradient(self):
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 101eee95f14c82c462214c733f8bf483cd7f392b..6af36343d587c0498785fcace9048c56281918f9 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -89,6 +89,7 @@ See the @{$python/math_ops} guide.
 @@matrix_inverse
 @@cholesky
 @@cholesky_solve
+@@matrix_exponential
 @@matrix_solve
 @@matrix_triangular_solve
 @@matrix_solve_ls
@@ -169,28 +170,30 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
-from tensorflow.python.util.deprecation import deprecated
-from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util import deprecation
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
 
-arg_max = deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
-arg_min = deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
+arg_max = deprecation.deprecated(None, "Use `argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
+arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
 
 
 def _set_doc(doc):
+
   def _decorator(func):
     func.__doc__ = doc
     return func
+
   return _decorator
 
 
 # pylint: disable=redefined-builtin
-@deprecated_args(None, "Use the `axis` argument instead", "dimension")
-@_set_doc(gen_math_ops.arg_max.__doc__
-          .replace("dimensions", "axes")
-          .replace("dimension", "axis"))
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "dimension")
+@_set_doc(
+    gen_math_ops.arg_max.__doc__.replace("dimensions", "axes").replace(
+        "dimension", "axis"))
 def argmax(input,
            axis=None,
            name=None,
@@ -205,10 +208,11 @@ def argmax(input,
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
-@deprecated_args(None, "Use the `axis` argument instead", "dimension")
-@_set_doc(gen_math_ops.arg_min.__doc__
-          .replace("dimensions", "axes")
-          .replace("dimension", "axis"))
+@deprecation.deprecated_args(None, "Use the `axis` argument instead",
+                             "dimension")
+@_set_doc(
+    gen_math_ops.arg_min.__doc__.replace("dimensions", "axes").replace(
+        "dimension", "axis"))
 def argmin(input,
            axis=None,
            name=None,
@@ -253,7 +257,7 @@ def abs(x, name=None):
   """
   with ops.name_scope(name, "Abs", [x]) as name:
     if isinstance(x, sparse_tensor.SparseTensor):
-      if x.values.dtype in (dtypes.complex64, dtypes.complex128):
+      if x.values.dtype.is_complex:
         x_abs = gen_math_ops._complex_abs(
             x.values, Tout=x.values.dtype.real_dtype, name=name)
         return sparse_tensor.SparseTensor(
@@ -263,7 +267,7 @@ def abs(x, name=None):
           indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
     else:
       x = ops.convert_to_tensor(x, name="x")
-      if x.dtype in (dtypes.complex64, dtypes.complex128):
+      if x.dtype.is_complex:
         return gen_math_ops._complex_abs(x, Tout=x.dtype.real_dtype, name=name)
       return gen_math_ops._abs(x, name=name)
 
@@ -274,6 +278,8 @@ def abs(x, name=None):
 # pylint: disable=redefined-builtin
 def _bucketize(input, boundaries, name=None):
   return gen_math_ops._bucketize(input=input, boundaries=boundaries, name=name)
+
+
 # pylint: enable=redefined-builtin
 
 
@@ -319,15 +325,15 @@ multiply.__doc__ = gen_math_ops._mul.__doc__.replace("Mul", "`tf.multiply`")
 
 
 # TODO(aselle): put deprecation in after another round of global code changes
-@deprecated(
+@deprecation.deprecated(
     "2016-12-30",
     "`tf.mul(x, y)` is deprecated, please use `tf.multiply(x, y)` or `x * y`")
 def _mul(x, y, name=None):
   return gen_math_ops._mul(x, y, name)
 
 
-_mul.__doc__ = (gen_math_ops._mul.__doc__ +
-                ("" if _mul.__doc__ is None else _mul.__doc__))
+_mul.__doc__ = (
+    gen_math_ops._mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__))
 
 
 def subtract(x, y, name=None):
@@ -338,15 +344,15 @@ subtract.__doc__ = gen_math_ops._sub.__doc__.replace("`Sub`", "`tf.subtract`")
 
 
 # TODO(aselle): put deprecation in after another round of global code changes
-@deprecated(
+@deprecation.deprecated(
     "2016-12-30",
     "`tf.sub(x, y)` is deprecated, please use `tf.subtract(x, y)` or `x - y`")
 def _sub(x, y, name=None):
   return gen_math_ops._sub(x, y, name)
 
 
-_sub.__doc__ = (gen_math_ops._sub.__doc__ +
-                ("" if _sub.__doc__ is None else _sub.__doc__))
+_sub.__doc__ = (
+    gen_math_ops._sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
 
 # pylint: disable=g-docstring-has-escape
@@ -376,8 +382,9 @@ def negative(x, name=None):
 
 
 # pylint: disable=g-docstring-has-escape
-@deprecated("2016-12-30",
-            "`tf.neg(x)` is deprecated, please use `tf.negative(x)` or `-x`")
+@deprecation.deprecated(
+    "2016-12-30",
+    "`tf.neg(x)` is deprecated, please use `tf.negative(x)` or `-x`")
 def _neg(x, name=None):
   """Computes numerical negative value element-wise.
 
@@ -943,6 +950,7 @@ _TRUEDIV_TABLE = {
     dtypes.int16: dtypes.float32,
     dtypes.int32: dtypes.float64,
     dtypes.int64: dtypes.float64,
+    dtypes.bfloat16: None,
     dtypes.float16: None,
     dtypes.float32: None,
     dtypes.float64: None,
@@ -956,8 +964,8 @@ _TRUEDIV_TABLE = {
 # to explicitly use the "/" operator to invoke either truediv or div.
 def _sparse_dense_truediv(sp_indices, sp_values, sp_shape, y, name=None):
   """Internal helper function for 'sp_t / dense_t'."""
-  with ops.name_scope(name, "truediv", [sp_indices, sp_values, sp_shape,
-                                        y]) as name:
+  with ops.name_scope(name, "truediv",
+                      [sp_indices, sp_values, sp_shape, y]) as name:
     sp_values = ops.convert_to_tensor(sp_values, name="sp_values")
     y = ops.convert_to_tensor(y, name="y")
     x_dtype = sp_values.dtype.base_dtype
@@ -1264,16 +1272,27 @@ def _ReductionDims(x, axis, reduction_indices):
     return range(0, array_ops.rank(x))
 
 
+def _may_reduce_to_scalar(keepdims, axis, reduction_indices, output):
+  """Set a reduction's output's shape to be a scalar if we are certain."""
+  if (not output.shape.is_fully_defined()) and (not keepdims) and (
+      axis is None) and (reduction_indices is None):
+    output.set_shape(())
+  return output
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_sum(input_tensor,
                axis=None,
-               keep_dims=False,
+               keepdims=None,
                name=None,
-               reduction_indices=None):
+               reduction_indices=None,
+               keep_dims=None):
   """Computes the sum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1286,7 +1305,7 @@ def reduce_sum(input_tensor,
   tf.reduce_sum(x)  # 6
   tf.reduce_sum(x, 0)  # [2, 2, 2]
   tf.reduce_sum(x, 1)  # [3, 3]
-  tf.reduce_sum(x, 1, keep_dims=True)  # [[3], [3]]
+  tf.reduce_sum(x, 1, keepdims=True)  # [[3], [3]]
   tf.reduce_sum(x, [0, 1])  # 6
   ```
 
@@ -1295,9 +1314,10 @@ def reduce_sum(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1306,24 +1326,34 @@ def reduce_sum(input_tensor,
   Equivalent to np.sum
   @end_compatibility
   """
-  return gen_math_ops._sum(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
-
-
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._sum(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
                   axis=None,
-                  keep_dims=False,
+                  keepdims=None,
                   dtype=dtypes.int64,
                   name=None,
-                  reduction_indices=None):
+                  reduction_indices=None,
+                  keep_dims=None):
   """Computes number of nonzero elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1340,7 +1370,7 @@ def count_nonzero(input_tensor,
   tf.count_nonzero(x)  # 3
   tf.count_nonzero(x, 0)  # [1, 2, 0]
   tf.count_nonzero(x, 1)  # [1, 2]
-  tf.count_nonzero(x, 1, keep_dims=True)  # [[1], [2]]
+  tf.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
   tf.count_nonzero(x, [0, 1])  # 3
   ```
 
@@ -1349,14 +1379,20 @@ def count_nonzero(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     dtype: The output dtype; defaults to `tf.int64`.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor (number of nonzero values).
   """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+
   with ops.name_scope(name, "count_nonzero", [input_tensor]):
     input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
     zero = input_tensor.dtype.as_numpy_dtype()
@@ -1365,21 +1401,24 @@ def count_nonzero(input_tensor,
             # int64 reduction happens on GPU
             to_int64(gen_math_ops.not_equal(input_tensor, zero)),
             axis=axis,
-            keep_dims=keep_dims,
+            keepdims=keepdims,
             reduction_indices=reduction_indices),
         dtype=dtype)
 
 
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_mean(input_tensor,
                 axis=None,
-                keep_dims=False,
+                keepdims=None,
                 name=None,
-                reduction_indices=None):
+                reduction_indices=None,
+                keep_dims=None):
   """Computes the mean of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1399,34 +1438,58 @@ def reduce_mean(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
 
   @compatibility(numpy)
   Equivalent to np.mean
-  @end_compatibility
-  """
-  return gen_math_ops._mean(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
 
+  Please note that `np.mean` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_mean` has an aggressive type inference from `input_tensor`,
+  for example:
 
+  ```python
+  x = tf.constant([1, 0, 1, 0])
+  tf.reduce_mean(x)  # 0
+  y = tf.constant([1., 0., 1., 0.])
+  tf.reduce_mean(y)  # 0.5
+  ```
+
+  @end_compatibility
+  """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+
+  if keepdims is None:
+    keepdims = False
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._mean(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_prod(input_tensor,
                 axis=None,
-                keep_dims=False,
+                keepdims=None,
                 name=None,
-                reduction_indices=None):
+                reduction_indices=None,
+                keep_dims=None):
   """Computes the product of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1437,9 +1500,10 @@ def reduce_prod(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1448,23 +1512,33 @@ def reduce_prod(input_tensor,
   Equivalent to np.prod
   @end_compatibility
   """
-  return gen_math_ops._prod(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
-
-
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+
+  if keepdims is None:
+    keepdims = False
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._prod(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_min(input_tensor,
                axis=None,
-               keep_dims=False,
+               keepdims=None,
                name=None,
-               reduction_indices=None):
+               reduction_indices=None,
+               keep_dims=None):
   """Computes the minimum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1475,9 +1549,10 @@ def reduce_min(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1486,23 +1561,32 @@ def reduce_min(input_tensor,
   Equivalent to np.min
   @end_compatibility
   """
-  return gen_math_ops._min(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
-
-
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._min(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_max(input_tensor,
                axis=None,
-               keep_dims=False,
+               keepdims=None,
                name=None,
-               reduction_indices=None):
+               reduction_indices=None,
+               keep_dims=None):
   """Computes the maximum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1513,9 +1597,10 @@ def reduce_max(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1524,23 +1609,32 @@ def reduce_max(input_tensor,
   Equivalent to np.max
   @end_compatibility
   """
-  return gen_math_ops._max(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
-
-
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._max(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_all(input_tensor,
                axis=None,
-               keep_dims=False,
+               keepdims=None,
                name=None,
-               reduction_indices=None):
+               reduction_indices=None,
+               keep_dims=None):
   """Computes the "logical and" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1560,9 +1654,10 @@ def reduce_all(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1571,23 +1666,32 @@ def reduce_all(input_tensor,
   Equivalent to np.all
   @end_compatibility
   """
-  return gen_math_ops._all(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
-
-
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._all(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_any(input_tensor,
                axis=None,
-               keep_dims=False,
+               keepdims=None,
                name=None,
-               reduction_indices=None):
+               reduction_indices=None,
+               keep_dims=None):
   """Computes the "logical or" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1607,9 +1711,10 @@ def reduce_any(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
@@ -1618,23 +1723,32 @@ def reduce_any(input_tensor,
   Equivalent to np.any
   @end_compatibility
   """
-  return gen_math_ops._any(
-      input_tensor,
-      _ReductionDims(input_tensor, axis, reduction_indices),
-      keep_dims,
-      name=name)
-
-
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
+  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
+                               gen_math_ops._any(
+                                   input_tensor,
+                                   _ReductionDims(input_tensor, axis,
+                                                  reduction_indices),
+                                   keepdims,
+                                   name=name))
+
+
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def reduce_logsumexp(input_tensor,
                      axis=None,
-                     keep_dims=False,
+                     keepdims=None,
                      name=None,
-                     reduction_indices=None):
+                     reduction_indices=None,
+                     keep_dims=None):
   """Computes log(sum(exp(elements across dimensions of a tensor))).
 
   Reduces `input_tensor` along the dimensions given in `axis`.
-  Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
-  entry in `axis`. If `keep_dims` is true, the reduced dimensions
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
   If `axis` has no entries, all dimensions are reduced, and a
@@ -1651,7 +1765,7 @@ def reduce_logsumexp(input_tensor,
   tf.reduce_logsumexp(x)  # log(6)
   tf.reduce_logsumexp(x, 0)  # [log(2), log(2), log(2)]
   tf.reduce_logsumexp(x, 1)  # [log(3), log(3)]
-  tf.reduce_logsumexp(x, 1, keep_dims=True)  # [[log(3)], [log(3)]]
+  tf.reduce_logsumexp(x, 1, keepdims=True)  # [[log(3)], [log(3)]]
   tf.reduce_logsumexp(x, [0, 1])  # log(6)
   ```
 
@@ -1660,35 +1774,39 @@ def reduce_logsumexp(input_tensor,
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
-    keep_dims: If true, retains reduced dimensions with length 1.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
   """
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  if keepdims is None:
+    keepdims = False
   with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
     raw_max = reduce_max(
         input_tensor,
         axis=axis,
         reduction_indices=reduction_indices,
-        keep_dims=True)
+        keepdims=True)
     my_max = array_ops.stop_gradient(
         array_ops.where(
-            gen_math_ops.is_finite(raw_max),
-            raw_max,
+            gen_math_ops.is_finite(raw_max), raw_max,
             array_ops.zeros_like(raw_max)))
     result = gen_math_ops.log(
         reduce_sum(
             gen_math_ops.exp(input_tensor - my_max),
             axis,
-            keep_dims=True,
+            keepdims=True,
             reduction_indices=reduction_indices)) + my_max
-    if not keep_dims:
+    if not keepdims:
       if isinstance(axis, int):
         axis = [axis]
       result = array_ops.squeeze(result, axis)
-    return result
+    return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
 
 
 def trace(x, name=None):
@@ -1852,9 +1970,9 @@ def matmul(a,
     # TODO(apassos) remove _shape_tuple here when it is not needed.
     a_shape = a._shape_tuple()  # pylint: disable=protected-access
     b_shape = b._shape_tuple()  # pylint: disable=protected-access
-    if (not a_is_sparse and not b_is_sparse) and (
-        (a_shape is None or len(a_shape) > 2) and
-        (b_shape is None or len(b_shape) > 2)):
+    if (not a_is_sparse and
+        not b_is_sparse) and ((a_shape is None or len(a_shape) > 2) and
+                              (b_shape is None or len(b_shape) > 2)):
       # BatchMatmul does not support transpose, so we conjugate the matrix and
       # use adjoint instead. Conj() is a noop for real matrices.
       if transpose_a:
@@ -1879,8 +1997,8 @@ def matmul(a,
     use_sparse_matmul = False
     if a_is_sparse or b_is_sparse:
       sparse_matmul_types = [dtypes.bfloat16, dtypes.float32]
-      use_sparse_matmul = (a.dtype in sparse_matmul_types and
-                           b.dtype in sparse_matmul_types)
+      use_sparse_matmul = (
+          a.dtype in sparse_matmul_types and b.dtype in sparse_matmul_types)
     if a.dtype == dtypes.bfloat16 or b.dtype == dtypes.bfloat16:
       # matmul currently doesn't handle bfloat16 inputs.
       use_sparse_matmul = True
@@ -1971,8 +2089,8 @@ def _as_indexed_slices_list(inputs, optimize=True):
   for o in outputs:
     if o.indices.dtype == dtypes.int32:
       casted_outputs.append(
-          ops.IndexedSlices(o.values,
-                            cast(o.indices, dtypes.int64), o.dense_shape))
+          ops.IndexedSlices(o.values, cast(o.indices, dtypes.int64),
+                            o.dense_shape))
     else:
       casted_outputs.append(o)
   return casted_outputs
@@ -2071,8 +2189,8 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
   if tensor_dtype is None:
     tensor_dtype = inputs[0].dtype
   if tensor_dtype != inputs[0].dtype:
-    raise TypeError("tensor_dtype is {}, but input is of type {}"
-                    .format(tensor_dtype, inputs[0].dtype))
+    raise TypeError("tensor_dtype is {}, but input is of type {}".format(
+        tensor_dtype, inputs[0].dtype))
   if len(inputs) == 1:
     return inputs[0]
   with ops.name_scope(name, "AccumulateN", inputs) as name:
@@ -2190,8 +2308,10 @@ def bincount(arr,
     maxlength = ops.convert_to_tensor(
         maxlength, name="maxlength", dtype=dtypes.int32)
     output_size = gen_math_ops.minimum(maxlength, output_size)
-  weights = (ops.convert_to_tensor(weights, name="weights")
-             if weights is not None else constant_op.constant([], dtype))
+  if weights is not None:
+    weights = ops.convert_to_tensor(weights, name="weights")
+    return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
+  weights = constant_op.constant([], dtype)
   return gen_math_ops.bincount(arr, output_size, weights)
 
 
@@ -2354,7 +2474,7 @@ def reduced_shape(input_shape, axes):
     input_shape: 1-D Tensor, the shape of the Tensor being reduced.
     axes: 1-D Tensor, the reduction axes.
   Returns:
-    A 1-D Tensor, the output shape as if keep_dims were set to True.
+    A 1-D Tensor, the output shape as if keepdims were set to True.
   """
   # Example:
   # cast needed for SparseTensor reductions
@@ -2375,6 +2495,159 @@ def reduced_shape(input_shape, axes):
       ])  # [1, 1]
 
 
+def sparse_segment_sum(data, indices, segment_ids, name=None,
+                       num_segments=None):
+  r"""Computes the sum along sparse segments of a tensor.
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  For example:
+
+  ```python
+  c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+
+  # Select two rows, one segment.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+  # => [[0 0 0 0]]
+
+  # Select two rows, two segment.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+  # => [[ 1  2  3  4]
+  #     [-1 -2 -3 -4]]
+
+  # With missing segment ids.
+  tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 2]),
+                        num_segments=4)
+  # => [[ 1  2  3  4]
+  #     [ 0  0  0  0]
+  #     [-1 -2 -3 -4]
+  #     [ 0  0  0  0]]
+
+  # Select all rows, two segments.
+  tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+  # => [[0 0 0 0]
+  #     [5 6 7 8]]
+
+  # Which is equivalent to:
+  tf.segment_sum(c, tf.constant([0, 0, 1]))
+  ```
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_sum_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_sum(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+def sparse_segment_mean(data, indices, segment_ids, name=None,
+                        num_segments=None):
+  r"""Computes the mean along sparse segments of a tensor.
+
+  Read @{$math_ops#segmentation$the section on segmentation} for an explanation
+  of segments.
+
+  Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_mean_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_mean(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
+def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+                          num_segments=None):
+  r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
+
+  `N` is the size of the segment being reduced.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`.
+      Values should be sorted and can be repeated.
+    name: A name for the operation (optional).
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  if num_segments is not None:
+    return gen_math_ops.sparse_segment_sqrt_n_with_num_segments(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        num_segments=num_segments,
+        name=name)
+  else:
+    return gen_math_ops.sparse_segment_sqrt_n(
+        data=data,
+        indices=indices,
+        segment_ids=segment_ids,
+        name=name)
+
+
 def tensordot(a, b, axes, name=None):
   r"""Tensor contraction of a and b along specified axes.
 
@@ -2471,7 +2744,8 @@ def tensordot(a, b, axes, name=None):
       rank_a = array_ops.rank(a)
       axes = ops.convert_to_tensor(axes, dtype=dtypes.int32, name="axes")
       axes = cast(axes >= 0, dtypes.int32) * axes + cast(
-          axes < 0, dtypes.int32) * (axes + rank_a)
+          axes < 0, dtypes.int32) * (
+              axes + rank_a)
       free, _ = array_ops.setdiff1d(range(rank_a), axes)
       free_dims = array_ops.gather(shape_a, free)
       axes_dims = array_ops.gather(shape_a, axes)
@@ -2497,8 +2771,8 @@ def tensordot(a, b, axes, name=None):
         return range(a_shape.ndims - axes, a_shape.ndims), range(axes)
       else:
         rank = array_ops.rank(a)
-        return (range(rank - axes, rank, dtype=dtypes.int32), range(
-            axes, dtype=dtypes.int32))
+        return (range(rank - axes, rank, dtype=dtypes.int32),
+                range(axes, dtype=dtypes.int32))
     elif isinstance(axes, (list, tuple)):
       if len(axes) != 2:
         raise ValueError("'axes' must be an integer or have length 2.")
@@ -2522,8 +2796,8 @@ def tensordot(a, b, axes, name=None):
     b = ops.convert_to_tensor(b, name="b")
     a_axes, b_axes = _tensordot_axes(a, axes)
     a_reshape, a_free_dims, a_free_dims_static = _tensordot_reshape(a, a_axes)
-    b_reshape, b_free_dims, b_free_dims_static = _tensordot_reshape(b, b_axes,
-                                                                    True)
+    b_reshape, b_free_dims, b_free_dims_static = _tensordot_reshape(
+        b, b_axes, True)
     ab_matmul = matmul(a_reshape, b_reshape)
     if isinstance(a_free_dims, list) and isinstance(b_free_dims, list):
       return array_ops.reshape(ab_matmul, a_free_dims + b_free_dims, name=name)
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 4642f4c580fbf5401af4c6a5ec43851e67a0af8b..bd26ff66961c858865c8a61469abac0b783ed645 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -21,7 +21,6 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -31,12 +30,12 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
-ops._USE_C_API = True
 
 exp = np.exp
 log = np.log
 
 
+@test_util.with_c_api
 class ReduceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -62,16 +61,16 @@ class ReduceTest(test_util.TensorFlowTestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testReduceInvalidAxis(self):
     if context.in_eager_mode():
-      # The shape check is in run a graph contruction time. In eager mode,
+      # The shape check is in run a graph construction time. In eager mode,
       # it misses the check, magically return result given wrong shape.
       return
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     axis = np.array([[0], [1]])
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "must be at most rank 1"):
+    with self.assertRaisesRegexp(ValueError, "must be at most rank 1"):
       math_ops.reduce_sum(x, axis)
 
 
+@test_util.with_c_api
 class LogSumExpTest(test_util.TensorFlowTestCase):
 
   def testReduceLogSumExp(self):
@@ -151,6 +150,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
       self.assertEqual(-np.inf, res)
 
 
+@test_util.with_c_api
 class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -168,6 +168,7 @@ class RoundTest(test_util.TensorFlowTestCase):
         self.assertAllClose(y_tf_np, y_np, atol=1e-2)
 
 
+@test_util.with_c_api
 class ModTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -197,6 +198,7 @@ class ModTest(test_util.TensorFlowTestCase):
         self.assertAllClose(y_tf_np, y_np)
 
 
+@test_util.with_c_api
 class SquaredDifferenceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -210,6 +212,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
         self.assertAllClose(z, z_tf)
 
 
+@test_util.with_c_api
 class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -241,6 +244,7 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(z, z_tf)
 
 
+@test_util.with_c_api
 class ScalarMulTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
@@ -282,6 +286,7 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(self.evaluate(x.indices), [0, 2, 5])
 
 
+@test_util.with_c_api
 class AccumulateNTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -301,6 +306,7 @@ class AccumulateNTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(x[0] * 6, math_ops.accumulate_n([tf_x[0]] * 6).eval())
 
 
+@test_util.with_c_api
 class AddNTest(test_util.TensorFlowTestCase):
 
   def testPartials(self):
@@ -354,6 +360,7 @@ class AddNTest(test_util.TensorFlowTestCase):
                             [g.eval() for g in add_n_grad])
 
 
+@test_util.with_c_api
 class DivAndModTest(test_util.TensorFlowTestCase):
   # TODO(aselle): Test more types before exposing new division operators.
 
diff --git a/tensorflow/python/ops/metrics.py b/tensorflow/python/ops/metrics.py
index 0465c77691c7d4e9cb80791470db8d99c64318f9..7e75542aec3e117510b810bad7f92af2084ae3b3 100644
--- a/tensorflow/python/ops/metrics.py
+++ b/tensorflow/python/ops/metrics.py
@@ -34,11 +34,14 @@
 @@precision_at_thresholds
 @@recall
 @@recall_at_k
+@@recall_at_top_k
 @@recall_at_thresholds
 @@root_mean_squared_error
 @@sensitivity_at_specificity
 @@sparse_average_precision_at_k
+@@average_precision_at_k
 @@sparse_precision_at_k
+@@precision_at_k
 @@precision_at_top_k
 @@specificity_at_sensitivity
 @@true_negatives
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 68ec3c0101674f9641c17ad92974e1b469b458af..e04121ee31d1b6c82151bf7415b3e73614b24781 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -33,22 +33,20 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.util.deprecation import deprecated
 
 
-def _local_variable(initial_value, validate_shape=True, name=None):
-  """Create variable and add it to `GraphKeys.LOCAL_VARIABLES` collection.
+def metric_variable(shape, dtype, validate_shape=True, name=None):
+  """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
 
-  Args:
-    initial_value: See variables.Variable.__init__.
-    validate_shape: See variables.Variable.__init__.
-    name: See variables.Variable.__init__.
-  Returns:
-    New variable.
-  """
   return variable_scope.variable(
-      initial_value, trainable=False,
-      collections=[ops.GraphKeys.LOCAL_VARIABLES],
-      validate_shape=validate_shape, name=name)
+      lambda: array_ops.zeros(shape, dtype),
+      trainable=False,
+      collections=[
+          ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
+      ],
+      validate_shape=validate_shape,
+      name=name)
 
 
 def _remove_squeezable_dimensions(predictions, labels, weights):
@@ -176,31 +174,6 @@ def _maybe_expand_labels(labels, predictions):
         lambda: labels)
 
 
-def _create_local(name, shape, collections=None, validate_shape=True,
-                  dtype=dtypes.float32):
-  """Creates a new local variable.
-
-  Args:
-    name: The name of the new or existing variable.
-    shape: Shape of the new or existing variable.
-    collections: A list of collection names to which the Variable will be added.
-    validate_shape: Whether to validate the shape of the variable.
-    dtype: Data type of the variables.
-
-  Returns:
-    The created variable.
-  """
-  # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES
-  collections = list(collections or [])
-  collections += [ops.GraphKeys.LOCAL_VARIABLES]
-  return variable_scope.variable(
-      lambda: array_ops.zeros(shape, dtype=dtype),
-      name=name,
-      trainable=False,
-      collections=collections,
-      validate_shape=validate_shape)
-
-
 def _safe_div(numerator, denominator, name):
   """Divides two values, returning 0 if the denominator is <= 0.
 
@@ -264,10 +237,8 @@ def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
     update_op: An operation that increments the confusion matrix.
   """
   # Local variable to accumulate the predictions in the confusion matrix.
-  total_cm = _create_local(
-      'total_confusion_matrix',
-      shape=[num_classes, num_classes],
-      dtype=dtypes.float64)
+  total_cm = metric_variable(
+      [num_classes, num_classes], dtypes.float64, name='total_confusion_matrix')
 
   # Cast the type to int64 required by confusion_matrix_ops.
   predictions = math_ops.to_int64(predictions)
@@ -337,8 +308,8 @@ def mean(values, weights=None, metrics_collections=None,
   with variable_scope.variable_scope(name, 'mean', (values, weights)):
     values = math_ops.to_float(values)
 
-    total = _create_local('total', shape=[])
-    count = _create_local('count', shape=[])
+    total = metric_variable([], dtypes.float32, name='total')
+    count = metric_variable([], dtypes.float32, name='count')
 
     if weights is None:
       num_values = math_ops.to_float(array_ops.size(values))
@@ -535,7 +506,8 @@ def _confusion_matrix_at_thresholds(
   update_ops = {}
 
   if 'tp' in includes:
-    true_p = _create_local('true_positives', shape=[num_thresholds])
+    true_p = metric_variable(
+        [num_thresholds], dtypes.float32, name='true_positives')
     is_true_positive = math_ops.to_float(
         math_ops.logical_and(label_is_pos, pred_is_pos))
     if weights_tiled is not None:
@@ -545,7 +517,8 @@ def _confusion_matrix_at_thresholds(
     values['tp'] = true_p
 
   if 'fn' in includes:
-    false_n = _create_local('false_negatives', shape=[num_thresholds])
+    false_n = metric_variable(
+        [num_thresholds], dtypes.float32, name='false_negatives')
     is_false_negative = math_ops.to_float(
         math_ops.logical_and(label_is_pos, pred_is_neg))
     if weights_tiled is not None:
@@ -555,7 +528,8 @@ def _confusion_matrix_at_thresholds(
     values['fn'] = false_n
 
   if 'tn' in includes:
-    true_n = _create_local('true_negatives', shape=[num_thresholds])
+    true_n = metric_variable(
+        [num_thresholds], dtypes.float32, name='true_negatives')
     is_true_negative = math_ops.to_float(
         math_ops.logical_and(label_is_neg, pred_is_neg))
     if weights_tiled is not None:
@@ -565,7 +539,8 @@ def _confusion_matrix_at_thresholds(
     values['tn'] = true_n
 
   if 'fp' in includes:
-    false_p = _create_local('false_positives', shape=[num_thresholds])
+    false_p = metric_variable(
+        [num_thresholds], dtypes.float32, name='false_positives')
     is_false_positive = math_ops.to_float(
         math_ops.logical_and(label_is_neg, pred_is_pos))
     if weights_tiled is not None:
@@ -817,9 +792,10 @@ def mean_cosine_distance(labels, predictions, dim, weights=None,
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
   radial_diffs = math_ops.multiply(predictions, labels)
-  radial_diffs = math_ops.reduce_sum(radial_diffs,
-                                     reduction_indices=[dim,],
-                                     keep_dims=True)
+  radial_diffs = math_ops.reduce_sum(
+      radial_diffs, reduction_indices=[
+          dim,
+      ], keepdims=True)
   mean_distance, update_op = mean(radial_diffs, weights,
                                   None,
                                   None,
@@ -1194,8 +1170,10 @@ def mean_tensor(values, weights=None, metrics_collections=None,
 
   with variable_scope.variable_scope(name, 'mean', (values, weights)):
     values = math_ops.to_float(values)
-    total = _create_local('total_tensor', shape=values.get_shape())
-    count = _create_local('count_tensor', shape=values.get_shape())
+    total = metric_variable(
+        values.get_shape(), dtypes.float32, name='total_tensor')
+    count = metric_variable(
+        values.get_shape(), dtypes.float32, name='count_tensor')
 
     num_values = array_ops.ones_like(values)
     if weights is not None:
@@ -1308,7 +1286,7 @@ def _count_condition(values, weights=None, metrics_collections=None,
       or tuple.
   """
   check_ops.assert_type(values, dtypes.bool)
-  count = _create_local('count', shape=[])
+  count = metric_variable([], dtypes.float32, name='count')
 
   values = math_ops.to_float(values)
   if weights is not None:
@@ -1534,6 +1512,56 @@ def false_positives_at_thresholds(labels, predictions, thresholds, weights=None,
     return values['fp'], update_ops['fp']
 
 
+def true_negatives(labels, predictions, weights=None,
+                   metrics_collections=None,
+                   updates_collections=None,
+                   name=None):
+  """Sum the weights of true_negatives.
+
+  If `weights` is `None`, weights default to 1. Use weights of 0 to mask values.
+
+  Args:
+    labels: The ground truth values, a `Tensor` whose dimensions must match
+      `predictions`. Will be cast to `bool`.
+    predictions: The predicted values, a `Tensor` of arbitrary dimensions. Will
+      be cast to `bool`.
+    weights: Optional `Tensor` whose rank is either 0, or the same rank as
+      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `labels` dimension).
+    metrics_collections: An optional list of collections that the metric
+      value variable should be added to.
+    updates_collections: An optional list of collections that the metric update
+      ops should be added to.
+    name: An optional variable_scope name.
+
+  Returns:
+    value_tensor: A `Tensor` representing the current value of the metric.
+    update_op: An operation that accumulates the error from a batch of data.
+
+  Raises:
+    ValueError: If `predictions` and `labels` have mismatched shapes, or if
+      `weights` is not `None` and its shape doesn't match `predictions`, or if
+      either `metrics_collections` or `updates_collections` are not a list or
+      tuple.
+    RuntimeError: If eager execution is enabled.
+  """
+  if context.in_eager_mode():
+    raise RuntimeError('tf.metrics.true_negatives is not '
+                       'supported when eager execution is enabled.')
+
+  with variable_scope.variable_scope(
+      name, 'true_negatives', (predictions, labels, weights)):
+
+    predictions, labels, weights = _remove_squeezable_dimensions(
+        predictions=math_ops.cast(predictions, dtype=dtypes.bool),
+        labels=math_ops.cast(labels, dtype=dtypes.bool),
+        weights=weights)
+    is_true_negative = math_ops.logical_and(math_ops.equal(labels, False),
+                                            math_ops.equal(predictions, False))
+    return _count_condition(is_true_negative, weights, metrics_collections,
+                            updates_collections)
+
+
 def true_negatives_at_thresholds(labels, predictions, thresholds, weights=None,
                                  metrics_collections=None,
                                  updates_collections=None,
@@ -2089,7 +2117,7 @@ def _streaming_sparse_true_positive_at_k(labels,
         weights=weights)
     batch_total_tp = math_ops.to_double(math_ops.reduce_sum(tp))
 
-    var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope)
+    var = metric_variable([], dtypes.float64, name=scope)
     return var, state_ops.assign_add(var, batch_total_tp, name='update')
 
 
@@ -2185,7 +2213,7 @@ def _streaming_sparse_false_negative_at_k(labels,
         weights=weights)
     batch_total_fn = math_ops.to_double(math_ops.reduce_sum(fn))
 
-    var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope)
+    var = metric_variable([], dtypes.float64, name=scope)
     return var, state_ops.assign_add(var, batch_total_fn, name='update')
 
 
@@ -2270,10 +2298,8 @@ def recall_at_k(labels,
   with ops.name_scope(
       name, _at_k_name('recall', k, class_id=class_id),
       (predictions, labels, weights)) as scope:
-    labels = _maybe_expand_labels(labels, predictions)
-
     _, top_k_idx = nn.top_k(predictions, k)
-    return _sparse_recall_at_top_k(
+    return recall_at_top_k(
         labels=labels,
         predictions_idx=top_k_idx,
         k=k,
@@ -2284,14 +2310,14 @@ def recall_at_k(labels,
         name=scope)
 
 
-def _sparse_recall_at_top_k(labels,
-                            predictions_idx,
-                            k=None,
-                            class_id=None,
-                            weights=None,
-                            metrics_collections=None,
-                            updates_collections=None,
-                            name=None):
+def recall_at_top_k(labels,
+                    predictions_idx,
+                    k=None,
+                    class_id=None,
+                    weights=None,
+                    metrics_collections=None,
+                    updates_collections=None,
+                    name=None):
   """Computes recall@k of top-k predictions with respect to sparse labels.
 
   Differs from `recall_at_k` in that predictions must be in the form of top `k`
@@ -2311,7 +2337,7 @@ def _sparse_recall_at_top_k(labels,
       Commonly, N=1 and predictions has shape [batch size, k]. The final
       dimension contains the top `k` predicted class indices. [D1, ... DN] must
       match `labels`.
-    k: Integer, k for @k metric.
+    k: Integer, k for @k metric. Only used for the default op name.
     class_id: Integer class ID for which we want binary metrics. This should be
       in range [0, num_classes), where num_classes is the last dimension of
       `predictions`. If class_id is outside this range, the method returns NAN.
@@ -2340,6 +2366,7 @@ def _sparse_recall_at_top_k(labels,
   with ops.name_scope(name,
                       _at_k_name('recall', k, class_id=class_id),
                       (predictions_idx, labels, weights)) as scope:
+    labels = _maybe_expand_labels(labels, predictions_idx)
     top_k_idx = math_ops.to_int64(predictions_idx)
     tp, tp_update = _streaming_sparse_true_positive_at_k(
         predictions_idx=top_k_idx, labels=labels, k=k, class_id=class_id,
@@ -2836,8 +2863,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
       # - For the unweighted case, this is just the number of rows.
       # - For the weighted case, it's the sum of the weights broadcast across
       #   `average_precision` rows.
-      max_var = _local_variable(
-          array_ops.zeros([], dtype=dtypes.float64), name=max_scope)
+      max_var = metric_variable([], dtypes.float64, name=max_scope)
       if weights is None:
         batch_max = math_ops.to_double(
             array_ops.size(average_precision, name='batch_max'))
@@ -2845,8 +2871,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
         batch_max = math_ops.reduce_sum(weights, name='batch_max')
       max_update = state_ops.assign_add(max_var, batch_max, name='update')
     with ops.name_scope(None, 'total', (average_precision,)) as total_scope:
-      total_var = _local_variable(
-          array_ops.zeros([], dtype=dtypes.float64), name=total_scope)
+      total_var = metric_variable([], dtypes.float64, name=total_scope)
       batch_total = math_ops.reduce_sum(average_precision, name='batch_total')
       total_update = state_ops.assign_add(total_var, batch_total, name='update')
 
@@ -2862,6 +2887,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
     return mean_average_precision, update
 
 
+@deprecated(None, 'Use average_precision_at_k instead')
 def sparse_average_precision_at_k(labels,
                                   predictions,
                                   k,
@@ -2869,9 +2895,27 @@ def sparse_average_precision_at_k(labels,
                                   metrics_collections=None,
                                   updates_collections=None,
                                   name=None):
+  """Renamed to `average_precision_at_k`, please use that method instead."""
+  return average_precision_at_k(
+      labels=labels,
+      predictions=predictions,
+      k=k,
+      weights=weights,
+      metrics_collections=metrics_collections,
+      updates_collections=updates_collections,
+      name=name)
+
+
+def average_precision_at_k(labels,
+                           predictions,
+                           k,
+                           weights=None,
+                           metrics_collections=None,
+                           updates_collections=None,
+                           name=None):
   """Computes average precision@k of predictions with respect to sparse labels.
 
-  `sparse_average_precision_at_k` creates two local variables,
+  `average_precision_at_k` creates two local variables,
   `average_precision_at_<k>/total` and `average_precision_at_<k>/max`, that
   are used to compute the frequency. This frequency is ultimately returned as
   `average_precision_at_<k>`: an idempotent operation that simply divides
@@ -3032,7 +3076,7 @@ def _streaming_sparse_false_positive_at_k(labels,
         weights=weights)
     batch_total_fp = math_ops.to_double(math_ops.reduce_sum(fp))
 
-    var = _local_variable(array_ops.zeros([], dtype=dtypes.float64), name=scope)
+    var = metric_variable([], dtypes.float64, name=scope)
     return var, state_ops.assign_add(var, batch_total_fp, name='update')
 
 
@@ -3115,6 +3159,7 @@ def precision_at_top_k(labels,
     return metric, update
 
 
+@deprecated(None, 'Use precision_at_k instead')
 def sparse_precision_at_k(labels,
                           predictions,
                           k,
@@ -3123,6 +3168,26 @@ def sparse_precision_at_k(labels,
                           metrics_collections=None,
                           updates_collections=None,
                           name=None):
+  """Renamed to `precision_at_k`, please use that method instead."""
+  return precision_at_k(
+      labels=labels,
+      predictions=predictions,
+      k=k,
+      class_id=class_id,
+      weights=weights,
+      metrics_collections=metrics_collections,
+      updates_collections=updates_collections,
+      name=name)
+
+
+def precision_at_k(labels,
+                   predictions,
+                   k,
+                   class_id=None,
+                   weights=None,
+                   metrics_collections=None,
+                   updates_collections=None,
+                   name=None):
   """Computes precision@k of the predictions with respect to sparse labels.
 
   If `class_id` is specified, we calculate precision by considering only the
@@ -3133,7 +3198,7 @@ def sparse_precision_at_k(labels,
       average a class among the top-k classes with the highest predicted values
       of a batch entry is correct and can be found in the label for that entry.
 
-  `sparse_precision_at_k` creates two local variables,
+  `precision_at_k` creates two local variables,
   `true_positive_at_<k>` and `false_positive_at_<k>`, that are used to compute
   the precision@k frequency. This frequency is ultimately returned as
   `precision_at_<k>`: an idempotent operation that simply divides
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index 79af3ac11725d6c375ec379585c0f6cfe339692e..ee1a00623a734e18d4aebe6c84f77ba53ee1050c 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -74,6 +74,7 @@ See the @{$python/nn} guide.
 @@softmax
 @@log_softmax
 @@softmax_cross_entropy_with_logits
+@@softmax_cross_entropy_with_logits_v2
 @@sparse_softmax_cross_entropy_with_logits
 @@weighted_cross_entropy_with_logits
 @@embedding_lookup
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 1fcd0384dadf8760b4c7605e679397a655d561e4..ff7137d492ccd3ea3bd076eda4f149b53cb09260 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -333,81 +333,87 @@ class BatchNormalizationTest(test.TestCase):
     self.assertLess(err_grad_x_2, err_tolerance)
     self.assertLess(err_grad_scale, err_tolerance)
 
-  def testInference(self):
+  def testInferenceShape1(self):
     x_shape = [1, 1, 6, 1]
-    if test.is_gpu_available(cuda_only=True):
-      for dtype in [np.float16, np.float32]:
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
         self._test_inference(
             x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NHWC')
         self._test_inference(
             x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NCHW')
-    self._test_inference(
-        x_shape, np.float32, [1], np.float32, use_gpu=False, data_format='NHWC')
+      self._test_inference(
+          x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testInferenceShape2(self):
     x_shape = [1, 1, 6, 2]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_inference(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NHWC')
-    self._test_inference(
-        x_shape, np.float32, [2], np.float32, use_gpu=False, data_format='NHWC')
+        self._test_inference(
+            x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testInferenceShape3(self):
     x_shape = [1, 2, 1, 6]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_inference(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
 
+  def testInferenceShape4(self):
     x_shape = [27, 131, 127, 6]
-    if test.is_gpu_available(cuda_only=True):
-      for dtype in [np.float16, np.float32]:
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
         self._test_inference(
             x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
         self._test_inference(
             x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
-    self._test_inference(
-        x_shape, np.float32, [6], np.float32, use_gpu=False, data_format='NHWC')
+      self._test_inference(
+          x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
-  def testTraining(self):
+  def testTrainingShape1(self):
     x_shape = [1, 1, 6, 1]
-    if test.is_gpu_available(cuda_only=True):
-      for dtype in [np.float16, np.float32]:
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
         self._test_training(
             x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NHWC')
         self._test_training(
             x_shape, dtype, [1], np.float32, use_gpu=True, data_format='NCHW')
-    self._test_training(
-        x_shape, np.float32, [1], np.float32, use_gpu=False, data_format='NHWC')
+      self._test_training(
+          x_shape, dtype, [1], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testTrainingShape2(self):
     x_shape = [1, 1, 6, 2]
-    if test.is_gpu_available(cuda_only=True):
-      for dtype in [np.float16, np.float32]:
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
         self._test_training(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NHWC')
-    self._test_training(
-        x_shape, np.float32, [2], np.float32, use_gpu=False, data_format='NHWC')
+      self._test_training(
+          x_shape, dtype, [2], np.float32, use_gpu=False, data_format='NHWC')
 
+  def testTrainingShape3(self):
     x_shape = [1, 2, 1, 6]
     if test.is_gpu_available(cuda_only=True):
       for dtype in [np.float16, np.float32]:
         self._test_training(
             x_shape, dtype, [2], np.float32, use_gpu=True, data_format='NCHW')
 
+  def testTrainingShape4(self):
     x_shape = [27, 131, 127, 6]
-    if test.is_gpu_available(cuda_only=True):
-      for dtype in [np.float16, np.float32]:
+    for dtype in [np.float16, np.float32]:
+      if test.is_gpu_available(cuda_only=True):
         self._test_training(
             x_shape, dtype, [131], np.float32, use_gpu=True, data_format='NCHW')
         self._test_training(
             x_shape, dtype, [6], np.float32, use_gpu=True, data_format='NHWC')
-    self._test_training(
-        x_shape, np.float32, [6], np.float32, use_gpu=False, data_format='NHWC')
+      self._test_training(
+          x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
-  def testBatchNormGrad(self):
+  def testBatchNormGradShape1(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 1]
-      if test.is_gpu_available(cuda_only=True):
-        for dtype in [np.float16, np.float32]:
+      for dtype in [np.float16, np.float32]:
+        if test.is_gpu_available(cuda_only=True):
           self._test_gradient(
               x_shape,
               dtype, [1],
@@ -422,17 +428,19 @@ class BatchNormalizationTest(test.TestCase):
               use_gpu=True,
               data_format='NCHW',
               is_training=is_training)
-      self._test_gradient(
-          x_shape,
-          np.float32, [1],
-          np.float32,
-          use_gpu=False,
-          data_format='NHWC',
-          is_training=is_training)
+        self._test_gradient(
+            x_shape,
+            dtype, [1],
+            np.float32,
+            use_gpu=False,
+            data_format='NHWC',
+            is_training=is_training)
 
+  def testBatchNormGradShape2(self):
+    for is_training in [True, False]:
       x_shape = [1, 1, 6, 2]
-      if test.is_gpu_available(cuda_only=True):
-        for dtype in [np.float16, np.float32]:
+      for dtype in [np.float16, np.float32]:
+        if test.is_gpu_available(cuda_only=True):
           self._test_gradient(
               x_shape,
               dtype, [2],
@@ -440,14 +448,16 @@ class BatchNormalizationTest(test.TestCase):
               use_gpu=True,
               data_format='NHWC',
               is_training=is_training)
-      self._test_gradient(
-          x_shape,
-          np.float32, [2],
-          np.float32,
-          use_gpu=False,
-          data_format='NHWC',
-          is_training=is_training)
+        self._test_gradient(
+            x_shape,
+            dtype, [2],
+            np.float32,
+            use_gpu=False,
+            data_format='NHWC',
+            is_training=is_training)
 
+  def testBatchNormGradShape3(self):
+    for is_training in [True, False]:
       x_shape = [1, 2, 1, 6]
       if test.is_gpu_available(cuda_only=True):
         for dtype in [np.float16, np.float32]:
@@ -459,9 +469,11 @@ class BatchNormalizationTest(test.TestCase):
               data_format='NCHW',
               is_training=is_training)
 
+  def testBatchNormGradShape4(self):
+    for is_training in [True, False]:
       x_shape = [5, 7, 11, 4]
-      if test.is_gpu_available(cuda_only=True):
-        for dtype in [np.float16, np.float32]:
+      for dtype in [np.float16, np.float32]:
+        if test.is_gpu_available(cuda_only=True):
           self._test_gradient(
               x_shape,
               dtype, [7],
@@ -476,13 +488,13 @@ class BatchNormalizationTest(test.TestCase):
               use_gpu=True,
               data_format='NHWC',
               is_training=is_training)
-      self._test_gradient(
-          x_shape,
-          np.float32, [4],
-          np.float32,
-          use_gpu=False,
-          data_format='NHWC',
-          is_training=is_training)
+        self._test_gradient(
+            x_shape,
+            dtype, [4],
+            np.float32,
+            use_gpu=False,
+            data_format='NHWC',
+            is_training=is_training)
 
   def _testBatchNormGradGrad(self, config):
     shape = config['shape']
@@ -506,32 +518,46 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NCHW',
             is_training=is_training,
             err_tolerance=err_tolerance)
-      if dtype != np.float16:
-        self._test_grad_grad(
-            shape,
-            np.float32, [shape[3]],
-            np.float32,
-            use_gpu=False,
-            data_format='NHWC',
-            is_training=is_training,
-            err_tolerance=err_tolerance)
+      self._test_grad_grad(
+          shape,
+          dtype, [shape[3]],
+          np.float32,
+          use_gpu=False,
+          data_format='NHWC',
+          is_training=is_training,
+          err_tolerance=err_tolerance)
 
-  def testBatchNormGradGrad(self):
-    configs = [{
+  def testBatchNormGradGradConfig1(self):
+    config = {
         'shape': [2, 3, 4, 5],
         'err_tolerance': 1e-2,
         'dtype': np.float32,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig2(self):
+    config = {
         'shape': [2, 3, 2, 2],
         'err_tolerance': 1e-3,
         'dtype': np.float32,
-    }, {
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig3(self):
+    config = {
+        'shape': [2, 3, 4, 5],
+        'err_tolerance': 1e-2,
+        'dtype': np.float16,
+    }
+    self._testBatchNormGradGrad(config)
+
+  def testBatchNormGradGradConfig4(self):
+    config = {
         'shape': [2, 3, 2, 2],
         'err_tolerance': 2e-3,
         'dtype': np.float16,
-    }]
-    for config in configs:
-      self._testBatchNormGradGrad(config)
+    }
+    self._testBatchNormGradGrad(config)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 557f39fb42e2d096b860b44e3898bb68018c0fe8..8cd535aa0b1a220e33d766714696092f212e1e83 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -41,33 +41,48 @@ def _Conv2DBackpropInputGrad(op, grad):
   Returns:
     the gradients w.r.t. the input and the filter
   """
-  return [None,
-          nn_ops.conv2d_backprop_filter(grad, array_ops.shape(op.inputs[1]),
-                                        op.inputs[2], op.get_attr("strides"),
-                                        op.get_attr("padding"),
-                                        op.get_attr("use_cudnn_on_gpu"),
-                                        op.get_attr("data_format")),
-          nn_ops.conv2d(grad, op.inputs[1], op.get_attr("strides"),
-                        op.get_attr("padding"), op.get_attr("use_cudnn_on_gpu"),
-                        op.get_attr("data_format"))]
+  return [
+      None,
+      nn_ops.conv2d_backprop_filter(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format")),
+      nn_ops.conv2d(
+          grad,
+          op.inputs[1],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format"))
+  ]
 
 
 @ops.RegisterGradient("Conv2DBackpropFilter")
 def _Conv2DBackpropFilterGrad(op, grad):
   return [
       nn_ops.conv2d_backprop_input(
-          array_ops.shape(op.inputs[0]), grad, op.inputs[2],
-          op.get_attr("strides"),
-          op.get_attr("padding"),
-          op.get_attr("use_cudnn_on_gpu"),
-          op.get_attr("data_format")),
-      None,
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format")), None,
       nn_ops.conv2d(
-          op.inputs[0], grad,
-          op.get_attr("strides"),
-          op.get_attr("padding"),
-          op.get_attr("use_cudnn_on_gpu"),
-          op.get_attr("data_format"))
+          op.inputs[0],
+          grad,
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format"))
   ]
 
 
@@ -420,7 +435,6 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
   # grad_loss is the backprop for cost, and we multiply it with the gradients
   # (which is output[1])
   # grad_grad is the backprop for softmax gradient.
-  # There is no gradient for the labels
   #
   # Second derivative is just softmax derivative w.r.t. logits.
   softmax_grad = op.outputs[1]
@@ -436,15 +450,15 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     const_fill_value = tensor_util.constant_value(g)
     return const_fill_value is not None and (const_fill_value == 0).all()
 
+  logits = op.inputs[0]
   if grad_grad is not None and not IsZero(grad_grad):
-    logits = op.inputs[0]
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
         math_ops.matmul(grad_grad[:, None, :],
                         softmax[:, :, None]), axis=1)) * softmax)
 
-  return grad, None
+  return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
 
 @ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
@@ -467,25 +481,32 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
 
 @ops.RegisterGradient("Conv2D")
 def _Conv2DGrad(op, grad):
+  dilations = op.get_attr("dilations")
   strides = op.get_attr("strides")
   padding = op.get_attr("padding")
   use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
   data_format = op.get_attr("data_format")
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
-  return [nn_ops.conv2d_backprop_input(shape_0,
-                                       op.inputs[1],
-                                       grad,
-                                       strides,
-                                       padding,
-                                       use_cudnn_on_gpu,
-                                       data_format),
-          nn_ops.conv2d_backprop_filter(op.inputs[0],
-                                        shape_1,
-                                        grad,
-                                        strides,
-                                        padding,
-                                        use_cudnn_on_gpu,
-                                        data_format)]
+  return [
+      nn_ops.conv2d_backprop_input(
+          shape_0,
+          op.inputs[1],
+          grad,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          use_cudnn_on_gpu=use_cudnn_on_gpu,
+          data_format=data_format),
+      nn_ops.conv2d_backprop_filter(
+          op.inputs[0],
+          shape_1,
+          grad,
+          dilations=dilations,
+          strides=strides,
+          padding=padding,
+          use_cudnn_on_gpu=use_cudnn_on_gpu,
+          data_format=data_format)
+  ]
 
 
 @ops.RegisterGradient("DepthwiseConv2dNative")
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 2c83e4e29f3875e2978f83ee47d9c9fab3909d63..fd96f7b8fcf423e2381f84b50b0532e46ce2fe6e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -27,11 +27,14 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 
 
 def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
@@ -275,9 +278,6 @@ def _swish_shape(op):
   return [op.inputs[0].shape]
 
 
-# Set noinline=True so that sigmoid(features) is re-computed during
-# backprop, and we can free the sigmoid(features) expression immediately
-# after use during the forward pass.
 @function.Defun(shape_func=_swish_shape, func_name="swish_grad", noinline=True)
 def _swish_grad(features, grad):
   """Gradient of Swish function defined below."""
@@ -287,6 +287,11 @@ def _swish_grad(features, grad):
   return grad * activation_grad
 
 
+# Naively, x * tf.nn.sigmoid(x) requires keeping both x and sigmoid(x) around
+# for backprop, effectively doubling the tensor's memory consumption. We use a
+# @Defun decorator with noinline=True so that sigmoid(features) is re-computed
+# during backprop, and we can free the sigmoid(features) expression immediately
+# after use during the forward pass.
 @function.Defun(
     grad_func=_swish_grad,
     shape_func=_swish_shape,
@@ -296,7 +301,7 @@ def swish(features):
   # pylint: disable=g-doc-args
   """Computes the Swish activation function: `x * sigmoid(x)`.
 
-  Source: "Swish: a Self-Gated Activation Function" (Ramachandran et al. 2017)
+  Source: "Searching for Activation Functions" (Ramachandran et al. 2017)
   https://arxiv.org/abs/1710.05941
 
   Args:
@@ -311,30 +316,33 @@ def swish(features):
   return features * math_ops.sigmoid(features)
 
 
-def l2_normalize(x, dim, epsilon=1e-12, name=None):
-  """Normalizes along dimension `dim` using an L2 norm.
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
+  """Normalizes along dimension `axis` using an L2 norm.
 
-  For a 1-D tensor with `dim = 0`, computes
+  For a 1-D tensor with `axis = 0`, computes
 
       output = x / sqrt(max(sum(x**2), epsilon))
 
   For `x` with more dimensions, independently normalizes each 1-D slice along
-  dimension `dim`.
+  dimension `axis`.
 
   Args:
     x: A `Tensor`.
-    dim: Dimension along which to normalize.  A scalar or a vector of
+    axis: Dimension along which to normalize.  A scalar or a vector of
       integers.
     epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
       divisor if `norm < sqrt(epsilon)`.
     name: A name for this operation (optional).
+    dim: Deprecated alias for axis.
 
   Returns:
     A `Tensor` with the same shape as `x`.
   """
   with ops.name_scope(name, "l2_normalize", [x]) as name:
+    axis = deprecated_argument_lookup("axis", axis, "dim", dim)
     x = ops.convert_to_tensor(x, name="x")
-    square_sum = math_ops.reduce_sum(math_ops.square(x), dim, keep_dims=True)
+    square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
     return math_ops.multiply(x, x_inv_norm, name=name)
 
@@ -586,8 +594,8 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
     else:  # no shift.
       m_ss = x
       v_ss = math_ops.square(x)
-    m_ss = math_ops.reduce_sum(m_ss, axes, keep_dims=keep_dims, name="mean_ss")
-    v_ss = math_ops.reduce_sum(v_ss, axes, keep_dims=keep_dims, name="var_ss")
+    m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss")
+    v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss")
   return counts, m_ss, v_ss, shift
 
 
@@ -631,7 +639,7 @@ def moments(x, axes,
   across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
   and variance of a vector.
 
-  Note: shift is currently not used, the true mean is computed and used.
+  Note: shift is currently not used; the true mean is computed and used.
 
   When using these moments for batch normalization (see
   `tf.nn.batch_normalization`):
@@ -657,12 +665,12 @@ def moments(x, axes,
     # on 32-bit floats before converting the mean and variance back to fp16
     y = math_ops.cast(x, dtypes.float32) if x.dtype == dtypes.float16 else x
     # Compute true mean while keeping the dims for proper broadcasting.
-    mean = math_ops.reduce_mean(y, axes, keep_dims=True, name="mean")
+    mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # sample variance, not unbiased variance
     variance = math_ops.reduce_mean(
         math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
         axes,
-        keep_dims=True,
+        keepdims=True,
         name="variance")
     if not keep_dims:
       mean = array_ops.squeeze(mean, axes)
@@ -707,7 +715,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     # Note that we use keep_dims=True for our reductions regardless of the arg;
     # this is so that the results remain broadcast-compatible with the inputs.
     weighted_input_sum = math_ops.reduce_sum(
-        frequency_weights * x, axes, name="weighted_input_sum", keep_dims=True)
+        frequency_weights * x, axes, name="weighted_input_sum", keepdims=True)
 
     # The shape of the weights isn't necessarily the same as x's
     # shape, just broadcast-compatible with it -- so this expression
@@ -718,7 +726,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     broadcasted_weights = frequency_weights + array_ops.zeros_like(x)
 
     sum_of_weights = math_ops.reduce_sum(
-        broadcasted_weights, axes, name="sum_of_weights", keep_dims=True)
+        broadcasted_weights, axes, name="sum_of_weights", keepdims=True)
 
     divisor = math_ops.reciprocal(sum_of_weights, name="inv_weight_sum")
 
@@ -729,7 +737,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
         frequency_weights * math_ops.squared_difference(x, weighted_mean),
         axes,
         name="weighted_distsq",
-        keep_dims=True)
+        keepdims=True)
 
     weighted_variance = math_ops.multiply(weighted_distsq, divisor)
 
@@ -856,7 +864,7 @@ def fused_batch_norm(
   # currently only use the V2 version for float16 inputs, which is not supported
   # by the V1 version.
   # pylint: disable=protected-access
-  if x.dtype == dtypes.float16:
+  if x.dtype == dtypes.float16 or x.dtype == dtypes.bfloat16:
     fused_batch_norm_func = gen_nn_ops._fused_batch_norm_v2
   else:
     fused_batch_norm_func = gen_nn_ops._fused_batch_norm
@@ -974,10 +982,11 @@ def _compute_sampled_logits(weights,
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
   Returns:
-    out_logits, out_labels: `Tensor` objects each with shape
+    out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
         `nn.softmax_cross_entropy_with_logits` (sampled softmax).
+    out_labels: A Tensor object with the same shape as `out_logits`.
   """
 
   if isinstance(weights, variables.PartitionedVariable):
@@ -1088,15 +1097,16 @@ def _compute_sampled_logits(weights,
 
     # Construct output logits and labels. The true labels/logits start at col 0.
     out_logits = array_ops.concat([true_logits, sampled_logits], 1)
-    # true_logits is a float tensor, ones_like(true_logits) is a float tensor
-    # of ones. We then divide by num_true to ensure the per-example labels sum
-    # to 1.0, i.e. form a proper probability distribution.
+
+    # true_logits is a float tensor, ones_like(true_logits) is a float
+    # tensor of ones. We then divide by num_true to ensure the per-example
+    # labels sum to 1.0, i.e. form a proper probability distribution.
     out_labels = array_ops.concat([
         array_ops.ones_like(true_logits) / num_true,
         array_ops.zeros_like(sampled_logits)
     ], 1)
 
-  return out_logits, out_labels
+    return out_logits, out_labels
 
 
 def nce_loss(weights,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a37b68c6fa7a4b97f0e52eab7612a7b2c06fdbe0..b3c0a22efca570e4a63a54717768913d9daa5fb6 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -32,11 +32,14 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_nn_ops import *
 # pylint: enable=wildcard-import
 
+from tensorflow.python.util import deprecation
+
 
 # Aliases for some automatically-generated names.
 local_response_normalization = gen_nn_ops.lrn
@@ -1202,13 +1205,14 @@ def conv2d_transpose(value,
       raise ValueError("padding must be either VALID or SAME:"
                        " {}".format(padding))
 
-    return gen_nn_ops.conv2d_backprop_input(input_sizes=output_shape_,
-                                            filter=filter,
-                                            out_backprop=value,
-                                            strides=strides,
-                                            padding=padding,
-                                            data_format=data_format,
-                                            name=name)
+    return gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
 
 
 def atrous_conv2d_transpose(value,
@@ -1340,12 +1344,13 @@ def atrous_conv2d_transpose(value,
                    (in_width + pad_right_extra) // rate,
                    output_shape[3]]
 
-    value = gen_nn_ops.conv2d_backprop_input(input_sizes=input_sizes,
-                                             filter=filters,
-                                             out_backprop=value,
-                                             strides=[1, 1, 1, 1],
-                                             padding="VALID",
-                                             data_format="NHWC")
+    value = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=input_sizes,
+        filter=filters,
+        out_backprop=value,
+        strides=[1, 1, 1, 1],
+        padding="VALID",
+        data_format="NHWC")
 
     # The crops argument to batch_to_space includes both padding components.
     batch_to_space_crop = [[pad_top, pad_bottom + pad_bottom_extra],
@@ -1643,52 +1648,62 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
-def softmax(logits, dim=-1, name=None):
+@deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
 
   This function performs the equivalent of
 
-      softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), dim)
+      softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
 
   Args:
     logits: A non-empty `Tensor`. Must be one of the following types: `half`,
       `float32`, `float64`.
-    dim: The dimension softmax would be performed on. The default is -1 which
+    axis: The dimension softmax would be performed on. The default is -1 which
       indicates the last dimension.
     name: A name for the operation (optional).
+    dim: Deprecated alias for `axis`.
 
   Returns:
     A `Tensor`. Has the same type and shape as `logits`.
 
   Raises:
-    InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
       dimension of `logits`.
   """
-  return _softmax(logits, gen_nn_ops._softmax, dim, name)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
+    axis = -1
+  return _softmax(logits, gen_nn_ops._softmax, axis, name)
 
 
-def log_softmax(logits, dim=-1, name=None):
+@deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
 
   For each batch `i` and class `j` we have
 
-      logsoftmax = logits - log(reduce_sum(exp(logits), dim))
+      logsoftmax = logits - log(reduce_sum(exp(logits), axis))
 
   Args:
     logits: A non-empty `Tensor`. Must be one of the following types: `half`,
       `float32`, `float64`.
-    dim: The dimension softmax would be performed on. The default is -1 which
+    axis: The dimension softmax would be performed on. The default is -1 which
       indicates the last dimension.
     name: A name for the operation (optional).
+    dim: Deprecated alias for `axis`.
 
   Returns:
     A `Tensor`. Has the same type as `logits`. Same shape as `logits`.
 
   Raises:
-    InvalidArgumentError: if `logits` is empty or `dim` is beyond the last
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
       dimension of `logits`.
   """
-  return _softmax(logits, gen_nn_ops._log_softmax, dim, name)
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
+    axis = -1
+  return _softmax(logits, gen_nn_ops._log_softmax, axis, name)
 
 
 def _ensure_xent_args(name, sentinel, labels, logits):
@@ -1700,9 +1715,9 @@ def _ensure_xent_args(name, sentinel, labels, logits):
     raise ValueError("Both labels and logits must be provided.")
 
 
-def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
-                                      labels=None, logits=None,
-                                      dim=-1, name=None):
+def softmax_cross_entropy_with_logits_v2(_sentinel=None,  # pylint: disable=invalid-name
+                                         labels=None, logits=None,
+                                         dim=-1, name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1726,6 +1741,10 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
   `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
   or `float64`).
 
+  Backpropagation will happen into both `logits` and `labels`.  To disallow
+  backpropagation into `labels`, pass label tensors through a `stop_gradients`
+  before feeding it to this function.
+
   **Note that to avoid confusion, it is required to pass only named arguments to
   this function.**
 
@@ -1747,57 +1766,123 @@ def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid
   # could break users who call this with bad labels, but disregard the bad
   # results.
 
-  logits = ops.convert_to_tensor(logits)
-  labels = ops.convert_to_tensor(labels)
-  precise_logits = math_ops.cast(logits, dtypes.float32) if (
-      logits.dtype == dtypes.float16) else logits
-  # labels and logits must be of the same type
-  labels = math_ops.cast(labels, precise_logits.dtype)
-  input_rank = array_ops.rank(precise_logits)
-  # For shape inference.
-  shape = logits.get_shape()
+  with ops.name_scope(
+      name, "softmax_cross_entropy_with_logits", [logits, labels]) as name:
+    logits = ops.convert_to_tensor(logits, name="logits")
+    labels = ops.convert_to_tensor(labels, name="labels")
+    precise_logits = math_ops.cast(logits, dtypes.float32) if (
+        logits.dtype == dtypes.float16) else logits
+    # labels and logits must be of the same type
+    labels = math_ops.cast(labels, precise_logits.dtype)
+    input_rank = array_ops.rank(precise_logits)
+    # For shape inference.
+    shape = logits.get_shape()
 
-  # Move the dim to the end if dim is not the last dimension.
-  if dim is not -1:
-    def _move_dim_to_end(tensor, dim_index, rank):
-      return array_ops.transpose(tensor,
-                                 array_ops.concat([
-                                     math_ops.range(dim_index),
-                                     math_ops.range(dim_index + 1, rank),
-                                     [dim_index]
-                                 ], 0))
+    # Move the dim to the end if dim is not the last dimension.
+    if dim is not -1:
+      def _move_dim_to_end(tensor, dim_index, rank):
+        return array_ops.transpose(tensor,
+                                   array_ops.concat([
+                                       math_ops.range(dim_index),
+                                       math_ops.range(dim_index + 1, rank),
+                                       [dim_index]
+                                   ], 0))
 
-    precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
-    labels = _move_dim_to_end(labels, dim, input_rank)
+      precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
+      labels = _move_dim_to_end(labels, dim, input_rank)
 
-  input_shape = array_ops.shape(precise_logits)
+    input_shape = array_ops.shape(precise_logits)
 
-  # Make precise_logits and labels into matrices.
-  precise_logits = _flatten_outer_dims(precise_logits)
-  labels = _flatten_outer_dims(labels)
+    # Make precise_logits and labels into matrices.
+    precise_logits = _flatten_outer_dims(precise_logits)
+    labels = _flatten_outer_dims(labels)
 
-  # Do the actual op computation.
-  # The second output tensor contains the gradients.  We use it in
-  # _CrossEntropyGrad() in nn_grad but not here.
-  cost, unused_backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
-      precise_logits, labels, name=name)
+    # Do the actual op computation.
+    # The second output tensor contains the gradients.  We use it in
+    # _CrossEntropyGrad() in nn_grad but not here.
+    cost, unused_backprop = gen_nn_ops._softmax_cross_entropy_with_logits(
+        precise_logits, labels, name=name)
 
-  # The output cost shape should be the input minus dim.
-  output_shape = array_ops.slice(input_shape, [0],
-                                 [math_ops.subtract(input_rank, 1)])
-  cost = array_ops.reshape(cost, output_shape)
+    # The output cost shape should be the input minus dim.
+    output_shape = array_ops.slice(input_shape, [0],
+                                   [math_ops.subtract(input_rank, 1)])
+    cost = array_ops.reshape(cost, output_shape)
 
-  # Make shape inference work since reshape and transpose may erase its static
-  # shape.
-  if context.in_graph_mode() and shape is not None and shape.dims is not None:
-    shape = shape.as_list()
-    del shape[dim]
-    cost.set_shape(shape)
+    # Make shape inference work since reshape and transpose may erase its static
+    # shape.
+    if context.in_graph_mode() and shape is not None and shape.dims is not None:
+      shape = shape.as_list()
+      del shape[dim]
+      cost.set_shape(shape)
 
-  if logits.dtype == dtypes.float16:
-    return math_ops.cast(cost, dtypes.float16)
-  else:
-    return cost
+    if logits.dtype == dtypes.float16:
+      return math_ops.cast(cost, dtypes.float16)
+    else:
+      return cost
+
+
+_XENT_DEPRECATION = """
+Future major versions of TensorFlow will allow gradients to flow
+into the labels input on backprop by default.
+
+See tf.nn.softmax_cross_entropy_with_logits_v2.
+"""
+
+
+@deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION)
+def softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
+                                      labels=None, logits=None,
+                                      dim=-1, name=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class).  For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+
+  **NOTE:**  While the classes are mutually exclusive, their probabilities
+  need not be.  All that is required is that each row of `labels` is
+  a valid probability distribution.  If they are not, the computation of the
+  gradient will be incorrect.
+
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency.  Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+
+  `logits` and `labels` must have the same shape, e.g.
+  `[batch_size, num_classes]` and the same dtype (either `float16`, `float32`,
+  or `float64`).
+
+  Backpropagation will happen only into `logits`.  To calculate a cross entropy
+  loss that allows backpropagation into both `logits` and `labels`, see
+  @{tf.nn.softmax_cross_entropy_with_logits_v2}.
+
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+
+  Args:
+    _sentinel: Used to prevent positional parameters. Internal, do not use.
+    labels: Each row `labels[i]` must be a valid probability distribution.
+    logits: Unscaled log probabilities.
+    dim: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A 1-D `Tensor` of length `batch_size` of the same type as `logits` with the
+    softmax cross entropy loss.
+  """
+  _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel,
+                    labels, logits)
+
+  with ops.name_scope(
+      name, "softmax_cross_entropy_with_logits_sg", [logits, labels]) as name:
+    labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
+
+  return softmax_cross_entropy_with_logits_v2(
+      labels=labels, logits=logits, dim=dim, name=name)
 
 
 def sparse_softmax_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
@@ -2233,6 +2318,103 @@ def conv1d(value, filters, stride, padding,
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
+def conv1d_transpose(
+    value,
+    filter,  # pylint: disable=redefined-builtin
+    output_shape,
+    stride,
+    padding="SAME",
+    data_format="NWC",
+    name=None):
+  """The transpose of `conv1d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv1d` rather than an actual
+  deconvolution.
+
+  Args:
+    value: A 3-D `Tensor` of type `float` and shape
+      `[batch, in_width, in_channels]` for `NWC` data format or
+      `[batch, in_channels, in_width]` for `NCW` data format.
+    filter: A 3-D `Tensor` with the same type as `value` and shape
+      `[filter_width, output_channels, in_channels]`.  `filter`'s
+      `in_channels` dimension must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    stride: An `integer`.  The number of entries by which
+      the filter is moved right at each step.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the @{tf.nn.convolution$comment here}
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, or if
+      padding is other than `'VALID'` or `'SAME'`.
+  """
+  with ops.name_scope(name, "conv1d_transpose",
+                      [value, filter, output_shape]) as name:
+    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
+    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
+      raise ValueError("output_shape must have shape (3,), got {}".format(
+          output_shape_.get_shape()))
+
+    # The format could be either NWC or NCW, map to NHWC or NCHW
+    if data_format is None or data_format == "NWC":
+      data_format_2d = "NHWC"
+      axis = 2
+    elif data_format == "NCW":
+      data_format_2d = "NCHW"
+      axis = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+
+    if not value.get_shape()[axis].is_compatible_with(filter.get_shape()[2]):
+      raise ValueError("input channels does not match filter's input channels, "
+                       "{} != {}".format(value.get_shape()[axis],
+                                         filter.get_shape()[2]))
+
+    if isinstance(output_shape, (list, np.ndarray)):
+      # output_shape's shape should be == [3] if reached this point.
+      if not filter.get_shape()[1].is_compatible_with(output_shape[axis]):
+        raise ValueError(
+            "output_shape does not match filter's output channels, "
+            "{} != {}".format(output_shape[axis],
+                              filter.get_shape()[1]))
+
+    if padding != "VALID" and padding != "SAME":
+      raise ValueError("padding must be either VALID or SAME:"
+                       " {}".format(padding))
+
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    if data_format_2d == "NHWC":
+      output_shape_ = array_ops.concat(
+          [output_shape_[:1], [1], output_shape_[1:]], axis=0)
+      spatial_start_dim = 1
+      strides = [1, 1, stride, 1]
+    else:
+      output_shape_ = array_ops.concat(
+          [output_shape_[:2], [1], output_shape_[2:]], axis=0)
+      spatial_start_dim = 2
+      strides = [1, 1, 1, stride]
+    value = array_ops.expand_dims(value, spatial_start_dim)
+    filter = array_ops.expand_dims(filter, 0)
+
+    result = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape_,
+        filter=filter,
+        out_backprop=value,
+        strides=strides,
+        padding=padding,
+        data_format=data_format_2d,
+        name=name)
+    return array_ops.squeeze(result, [spatial_start_dim])
+
+
 @ops.RegisterStatistics("Dilation2D", "flops")
 def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 3b918e4f74c64868ef74f7e26295941c6f2801ff..8dfd0740bb180d1ac676a7ca353a27fd63b84846 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -953,5 +953,36 @@ class MomentsTest(test_lib.TestCase):
     self.doOutputTest((10, 10, 10, 30), (1, 2, 3))
 
 
+class DataFormatDimMapTest(test_lib.TestCase):
+
+  def _test(self, x_val, y_val_expected):
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertEqual(y_val, y_val_expected)
+
+  def test(self):
+    self._test(0, 0)
+    self._test(1, 2)
+    self._test(2, 3)
+    self._test(3, 1)
+    self._test(-1, 1)
+    self._test(-2, 3)
+    self._test(-3, 2)
+    self._test(-4, 0)
+
+
+class DataFormatVectorPermuteTest(test_lib.TestCase):
+
+  def test(self):
+    x_val = [7, 4, 9, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_vec_permute(x)
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, [7, 3, 4, 9])
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index 5ea47ea40e5f283736523d5d09a63176b5e8fbbf..5e9e71002705293403de83276fb70099d8864907 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -93,7 +93,7 @@ class Conv2DTest(test.TestCase):
     quantized_range = ((quantized_max - quantized_min) * range_adjust)
     range_scale = (quantized_range / number_of_steps)
     lowest_quantized = -(1 << (number_of_bits - 1))
-    result = np.array([(quantized_min + ((x - lowest_quantized) * range_scale))
+    result = np.array([(quantized_min + ((float(x) - lowest_quantized) * range_scale))
                        for x in quantized.flatten()])
     return result
 
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf3b35e13879069e40162fc50180520a5f855f6
--- /dev/null
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -0,0 +1,57 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for quantized operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class QuantizedOpsTest(test.TestCase):
+
+  def __init__(self, method_name="runTest"):
+    super(QuantizedOpsTest, self).__init__(method_name)
+
+  def testQuantizeOp(self):
+    expected_output = [1, 1, 2, 127, 255, 255]
+    with self.test_session(use_gpu=False) as sess:
+      x = constant_op.constant([1.0, 1.25, 1.75, 127.0, 255.0, 500.0], shape=[6], dtype=dtypes.float32)
+      x_min = 0.0
+      x_max = 255.0
+      op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
+      value = sess.run(op)
+      self.assertArrayNear(expected_output, value.output, 0.1)
+
+  def testDequantizeOp(self):
+    expected_output = [1.0, 2.0, 4.0, 8.0, 16.0, 255.0]
+    inp = np.array([1, 2, 4, 8, 16, 255]).astype(np.uint8)
+    with self.test_session(use_gpu=False) as sess:
+      x = constant_op.constant(inp, shape=[6], dtype=dtypes.quint8)
+      x_min = 0.0
+      x_max = 255.0
+      op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
+      value = sess.run(op)
+      self.assertArrayNear(expected_output, value, 0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 1e0bb925d415f6ae6bacb1496f87fa6b84ca13c2..a2264a7bdfff398e405ccd4a509d20c592ee886b 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -152,7 +152,7 @@ def truncated_normal(shape,
     mean: A 0-D Tensor or Python value of type `dtype`. The mean of the
       truncated normal distribution.
     stddev: A 0-D Tensor or Python value of type `dtype`. The standard deviation
-      of the truncated normal distribution.
+      of the normal distribution, before truncation.
     dtype: The type of the output.
     seed: A Python integer. Used to create a random seed for the distribution.
       See
@@ -220,8 +220,8 @@ def random_uniform(shape,
     ValueError: If `dtype` is integral and `maxval` is not specified.
   """
   dtype = dtypes.as_dtype(dtype)
-  if dtype not in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
-                   dtypes.int64):
+  if dtype not in (dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.int32, dtypes.int64):
     raise ValueError("Invalid dtype %r" % dtype)
   if maxval is None:
     if dtype.is_integer:
@@ -316,7 +316,7 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-def multinomial(logits, num_samples, seed=None, name=None):
+def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
   Example:
@@ -336,6 +336,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
       @{tf.set_random_seed}
       for behavior.
     name: Optional name for the operation.
+    output_dtype: integer type to use for the output. Defaults to int64.
 
   Returns:
     The drawn samples of shape `[batch_size, num_samples]`.
@@ -344,7 +345,7 @@ def multinomial(logits, num_samples, seed=None, name=None):
     logits = ops.convert_to_tensor(logits, name="logits")
     seed1, seed2 = random_seed.get_seed(seed)
     return gen_random_ops.multinomial(
-        logits, num_samples, seed=seed1, seed2=seed2)
+        logits, num_samples, seed=seed1, seed2=seed2, output_dtype=output_dtype)
 
 
 ops.NotDifferentiable("Multinomial")
@@ -438,8 +439,8 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
       distribution(s) to sample.
     shape: A 1-D integer Tensor or Python array. The shape of the output samples
       to be drawn per "rate"-parameterized distribution.
-    dtype: The type of `lam` and the output: `float16`, `float32`, or
-      `float64`.
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32` or
+      `int64`.
     seed: A Python integer. Used to create a random seed for the distributions.
       See
       @{tf.set_random_seed}
@@ -451,7 +452,7 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
       values of type `dtype`.
   """
   with ops.name_scope(name, "random_poisson", [lam, shape]):
-    lam = ops.convert_to_tensor(lam, name="lam", dtype=dtype)
     shape = ops.convert_to_tensor(shape, name="shape", dtype=dtypes.int32)
     seed1, seed2 = random_seed.get_seed(seed)
-    return gen_random_ops._random_poisson(shape, lam, seed=seed1, seed2=seed2)
+    return gen_random_ops.random_poisson_v2(
+        shape, lam, dtype=dtype, seed=seed1, seed2=seed2)
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 9e5bb4a225e091d14936a209b82f3d250dee8359..58ede027477667a9d5f821dbf42d8a3fdab50b1a 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -63,7 +63,7 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     raise ValueError("variable object with name '%s' already created. Use "
                      "get_variable() if reuse is desired." %
                      shared_name)
-  with context.graph_mode(), ops.Graph().as_default():
+  with context.graph_mode(), ops.Graph().as_default() as graph:
     h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
                                                 shared_name=shared_name,
                                                 name=name,
@@ -74,6 +74,25 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
     handle._handle_data = h._handle_data  # pylint: disable=protected-access
+  # Clean up our reference cycles to avoid making the garbage collector run.
+  # pylint: disable=protected-access
+  # OrderedDict, constructed on Graph creation, makes a simple reference loop
+  # and hides it in an __attribute in some Python versions. We don't need to
+  # throw an error if we can't find it, but if we do find it we can break the
+  # loop to avoid creating work for the garbage collector.
+  problematic_cycle = graph._functions.__dict__.get("_OrderedDict__root", None)
+  # pylint: enable=protected-access
+  if problematic_cycle:
+    try:
+      del problematic_cycle[0][:]
+    except TypeError:
+      # This is probably not one of the problematic Python versions. Continue
+      # with the rest of our cleanup.
+      pass
+  # Now clean up our own reference cycles by clearing all of the attributes for
+  # the Graph and op we created.
+  h.__dict__ = {}
+  graph.__dict__ = {}
   return handle
 
 
@@ -165,11 +184,12 @@ class ResourceVariable(variables.Variable):
     assign = a.assign(2.0)
     with tf.control_dependencies([assign]):
       b = a.read_value()
-
-    other_assign = a.assign(3.0)
+    with tf.control_dependencies([b]):
+      other_assign = a.assign(3.0)
     with tf.control_dependencies([other_assign]):
-      tf.Print(b, [b]).run()  # Will print 2.0 because the value was read before
-                              # other_assign ran.
+      # Will print 2.0 because the value was read before other_assign ran. If
+      # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed.
+      tf.Print(b, [b]).eval()
   ```
 
   To enforce these consistency properties tf.ResourceVariable might make more
@@ -454,6 +474,7 @@ class ResourceVariable(variables.Variable):
           ops.add_to_collections(collections, self)
         elif ops.GraphKeys.GLOBAL_STEP in collections:
           ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
+
     if not self._in_graph_mode:
       # After the handle has been created, set up a way to clean it up when
       # executing eagerly. We'll hold the only reference to the deleter, so that
@@ -493,7 +514,8 @@ class ResourceVariable(variables.Variable):
       self._cached_value = None
     if variable_def.HasField("save_slice_info_def"):
       self._save_slice_info = variables.Variable.SaveSliceInfo(
-          save_slice_info_def=variable_def.save_slice_info_def)
+          save_slice_info_def=variable_def.save_slice_info_def,
+          import_scope=import_scope)
     else:
       self._save_slice_info = None
     self._caching_device = None
@@ -866,26 +888,19 @@ def _ReadGrad(_, grad):
 def _GatherGrad(op, grad):
   """Gradient for gather op."""
   # Build appropriately shaped IndexedSlices
-  # Walk graph back until the original handle is found.
-  # TODO(apassos): more robust way of getting the shape.
-  # TODO(apassos): implement this for EAGER mode.
-  if context.in_eager_mode():
-    dense_shape = gen_resource_variable_ops.variable_shape(op.inputs[0])
-    return (ops.IndexedSlices(grad,
-                              op.inputs[1],
-                              dense_shape=dense_shape),
-            None)
   handle = op.inputs[0]
-  while handle.op.type != "VarHandleOp":
-    handle = handle.op.inputs[0]
-  params_shape = ops.convert_to_tensor(
-      tensor_shape.TensorShape(handle.op.get_attr("shape")))
   indices = op.inputs[1]
+  if context.in_graph_mode():
+    # Walk graph back until the original handle is found.
+    # TODO(apassos): implement this for EAGER mode.
+    while handle.op.type != "VarHandleOp":
+      handle = handle.op.inputs[0]
+  params_shape = gen_resource_variable_ops.variable_shape(handle)
   size = array_ops.expand_dims(array_ops.size(indices), 0)
   values_shape = array_ops.concat([size, params_shape[1:]], 0)
   values = array_ops.reshape(grad, values_shape)
   indices = array_ops.reshape(indices, size)
-  return [ops.IndexedSlices(values, indices, params_shape), None]
+  return (ops.IndexedSlices(values, indices, params_shape), None)
 
 
 def _to_proto_fn(v, export_scope=None):
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 21c7ed361dc8d613d3332905ded1952dfe34681c..e30b19842f08d335ce7967b77dcb49578fb3fe85 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -134,13 +135,20 @@ def _infer_state_dtype(explicit_dtype, state):
     return state.dtype
 
 
+def _maybe_tensor_shape_from_tensor(shape):
+  if isinstance(shape, ops.Tensor):
+    return tensor_shape.as_shape(tensor_util.constant_value(shape))
+  else:
+    return shape
+
+
 # pylint: disable=unused-argument
 def _rnn_step(
     time, sequence_length, min_sequence_length, max_sequence_length,
     zero_output, state, call_cell, state_size, skip_conditionals=False):
   """Calculate one step of a dynamic RNN minibatch.
 
-  Returns an (output, state) pair conditioned on the sequence_lengths.
+  Returns an (output, state) pair conditioned on `sequence_length`.
   When skip_conditionals=False, the pseudocode is something like:
 
   if t >= max_sequence_length:
@@ -149,14 +157,14 @@ def _rnn_step(
     return call_cell()
 
   # Selectively output zeros or output, old state or new state depending
-  # on if we've finished calculating each row.
+  # on whether we've finished calculating each row.
   new_output, new_state = call_cell()
   final_output = np.vstack([
-    zero_output if time >= sequence_lengths[r] else new_output_r
+    zero_output if time >= sequence_length[r] else new_output_r
     for r, new_output_r in enumerate(new_output)
   ])
   final_state = np.vstack([
-    state[r] if time >= sequence_lengths[r] else new_state_r
+    state[r] if time >= sequence_length[r] else new_state_r
     for r, new_state_r in enumerate(new_state)
   ])
   return (final_output, final_state)
@@ -194,9 +202,12 @@ def _rnn_step(
   flat_zero_output = nest.flatten(zero_output)
 
   def _copy_one_through(output, new_output):
-    # If the state contains a scalar value we simply pass it through.
+    # TensorArray and scalar get passed through.
+    if isinstance(output, tensor_array_ops.TensorArray):
+      return new_output
     if output.shape.ndims == 0:
       return new_output
+    # Otherwise propagate the old or the new value.
     copy_cond = (time >= sequence_length)
     with ops.colocate_with(new_output):
       return array_ops.where(copy_cond, output, new_output)
@@ -256,7 +267,8 @@ def _rnn_step(
   for output, flat_output in zip(final_output, flat_zero_output):
     output.set_shape(flat_output.get_shape())
   for substate, flat_substate in zip(final_state, flat_state):
-    substate.set_shape(flat_substate.get_shape())
+    if not isinstance(substate, tensor_array_ops.TensorArray):
+      substate.set_shape(flat_substate.get_shape())
 
   final_output = nest.pack_sequence_as(
       structure=zero_output, flat_sequence=final_output)
@@ -553,33 +565,34 @@ def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
   if not _like_rnncell(cell):
     raise TypeError("cell must be an instance of RNNCell")
 
-  # By default, time_major==False and inputs are batch-major: shaped
-  #   [batch, time, depth]
-  # For internal calculations, we transpose to [time, batch, depth]
-  flat_input = nest.flatten(inputs)
-
-  if not time_major:
-    # (B,T,D) => (T,B,D)
-    flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
-    flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
-
-  parallel_iterations = parallel_iterations or 32
-  if sequence_length is not None:
-    sequence_length = math_ops.to_int32(sequence_length)
-    if sequence_length.get_shape().ndims not in (None, 1):
-      raise ValueError(
-          "sequence_length must be a vector of length batch_size, "
-          "but saw shape: %s" % sequence_length.get_shape())
-    sequence_length = array_ops.identity(  # Just to find it in the graph.
-        sequence_length, name="sequence_length")
-
-  # Create a new scope in which the caching device is either
-  # determined by the parent scope, or is set to place the cached
-  # Variable using the same placement as for the rest of the RNN.
   with vs.variable_scope(scope or "rnn") as varscope:
+    # Create a new scope in which the caching device is either
+    # determined by the parent scope, or is set to place the cached
+    # Variable using the same placement as for the rest of the RNN.
     if context.in_graph_mode():
       if varscope.caching_device is None:
         varscope.set_caching_device(lambda op: op.device)
+
+    # By default, time_major==False and inputs are batch-major: shaped
+    #   [batch, time, depth]
+    # For internal calculations, we transpose to [time, batch, depth]
+    flat_input = nest.flatten(inputs)
+
+    if not time_major:
+      # (B,T,D) => (T,B,D)
+      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+    parallel_iterations = parallel_iterations or 32
+    if sequence_length is not None:
+      sequence_length = math_ops.to_int32(sequence_length)
+      if sequence_length.get_shape().ndims not in (None, 1):
+        raise ValueError(
+            "sequence_length must be a vector of length batch_size, "
+            "but saw shape: %s" % sequence_length.get_shape())
+      sequence_length = array_ops.identity(  # Just to find it in the graph.
+          sequence_length, name="sequence_length")
+
     batch_size = _best_effort_input_batch_size(flat_input)
 
     if initial_state is not None:
@@ -715,18 +728,28 @@ def _dynamic_rnn_loop(cell,
   with ops.name_scope("dynamic_rnn") as scope:
     base_name = scope
 
-  def _create_ta(name, dtype):
+  def _create_ta(name, element_shape, dtype):
     return tensor_array_ops.TensorArray(dtype=dtype,
                                         size=time_steps,
+                                        element_shape=element_shape,
                                         tensor_array_name=base_name + name)
 
   in_graph_mode = context.in_graph_mode()
   if in_graph_mode:
-    output_ta = tuple(_create_ta("output_%d" % i,
-                                 _infer_state_dtype(dtype, state))
-                      for i in range(len(flat_output_size)))
-    input_ta = tuple(_create_ta("input_%d" % i, flat_input[i].dtype)
-                     for i in range(len(flat_input)))
+    output_ta = tuple(
+        _create_ta(
+            "output_%d" % i,
+            element_shape=(tensor_shape.TensorShape([const_batch_size])
+                           .concatenate(
+                               _maybe_tensor_shape_from_tensor(out_size))),
+            dtype=_infer_state_dtype(dtype, state))
+        for i, out_size in enumerate(flat_output_size))
+    input_ta = tuple(
+        _create_ta(
+            "input_%d" % i,
+            element_shape=flat_input_i.shape[1:],
+            dtype=flat_input_i.dtype)
+        for i, flat_input_i in enumerate(flat_input))
     input_ta = tuple(ta.unstack(input_)
                      for ta, input_ in zip(input_ta, flat_input))
   else:
@@ -1007,6 +1030,7 @@ def raw_rnn(cell, loop_fn,
       static_batch_size.merge_with(input_shape_i[0])
 
     batch_size = static_batch_size.value
+    const_batch_size = batch_size
     if batch_size is None:
       batch_size = array_ops.shape(flat_input[0])[0]
 
@@ -1029,8 +1053,15 @@ def raw_rnn(cell, loop_fn,
 
     flat_emit_ta = [
         tensor_array_ops.TensorArray(
-            dtype=dtype_i, dynamic_size=True, size=0, name="rnn_output_%d" % i)
-        for i, dtype_i in enumerate(flat_emit_dtypes)]
+            dtype=dtype_i,
+            dynamic_size=True,
+            element_shape=(tensor_shape.TensorShape([const_batch_size])
+                           .concatenate(
+                               _maybe_tensor_shape_from_tensor(size_i))),
+            size=0,
+            name="rnn_output_%d" % i)
+        for i, (dtype_i, size_i)
+        in enumerate(zip(flat_emit_dtypes, flat_emit_size))]
     emit_ta = nest.pack_sequence_as(structure=emit_structure,
                                     flat_sequence=flat_emit_ta)
     flat_zero_emit = [
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 0a497e13ff573d49a01d5f79651574e37e1a86bc..9b4526b7139411c30811cc7108efbb1d5f192b27 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -388,7 +388,64 @@ class SRUCell(_LayerRNNCell):
 
     return h, c
 
-class GRUCell(RNNCell):
+class BasicRNNCell(_LayerRNNCell):
+  """The most basic RNN cell.
+
+  Args:
+    num_units: int, The number of units in the RNN cell.
+    activation: Nonlinearity to use.  Default: `tanh`.
+    reuse: (optional) Python boolean describing whether to reuse variables
+     in an existing scope.  If not `True`, and the existing scope already has
+     the given variables, an error is raised.
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require reuse=True in such
+      cases.
+  """
+
+  def __init__(self, num_units, activation=None, reuse=None, name=None):
+    super(BasicRNNCell, self).__init__(_reuse=reuse, name=name)
+
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self._num_units = num_units
+    self._activation = activation or math_ops.tanh
+
+  @property
+  def state_size(self):
+    return self._num_units
+
+  @property
+  def output_size(self):
+    return self._num_units
+
+  def build(self, inputs_shape):
+    if inputs_shape[1].value is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
+                       % inputs_shape)
+
+    input_depth = inputs_shape[1].value
+    self._kernel = self.add_variable(
+        _WEIGHTS_VARIABLE_NAME,
+        shape=[input_depth + self._num_units, self._num_units])
+    self._bias = self.add_variable(
+        _BIAS_VARIABLE_NAME,
+        shape=[self._num_units],
+        initializer=init_ops.zeros_initializer(dtype=self.dtype))
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
+
+    gate_inputs = math_ops.matmul(
+        array_ops.concat([inputs, state], 1), self._kernel)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
+    output = self._activation(gate_inputs)
+    return output, output
+
+
+class GRUCell(_LayerRNNCell):
   """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078).
 
   Args:
@@ -400,6 +457,9 @@ class GRUCell(RNNCell):
     kernel_initializer: (optional) The initializer to use for the weight and
     projection matrices.
     bias_initializer: (optional) The initializer to use for the bias.
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require reuse=True in such
+      cases.
   """
 
   def __init__(self,
@@ -407,14 +467,17 @@ class GRUCell(RNNCell):
                activation=None,
                reuse=None,
                kernel_initializer=None,
-               bias_initializer=None):
-    super(GRUCell, self).__init__(_reuse=reuse)
+               bias_initializer=None,
+               name=None):
+    super(GRUCell, self).__init__(_reuse=reuse, name=name)
+
+    # Inputs must be 2-dimensional.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
     self._kernel_initializer = kernel_initializer
     self._bias_initializer = bias_initializer
-    self._gate_linear = None
-    self._candidate_linear = None
 
   @property
   def state_size(self):
@@ -424,33 +487,54 @@ class GRUCell(RNNCell):
   def output_size(self):
     return self._num_units
 
+  def build(self, inputs_shape):
+    if inputs_shape[1].value is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
+                       % inputs_shape)
+
+    input_depth = inputs_shape[1].value
+    self._gate_kernel = self.add_variable(
+        "gates/%s" % _WEIGHTS_VARIABLE_NAME,
+        shape=[input_depth + self._num_units, 2 * self._num_units],
+        initializer=self._kernel_initializer)
+    self._gate_bias = self.add_variable(
+        "gates/%s" % _BIAS_VARIABLE_NAME,
+        shape=[2 * self._num_units],
+        initializer=(
+            self._bias_initializer
+            if self._bias_initializer is not None
+            else init_ops.constant_initializer(1.0, dtype=self.dtype)))
+    self._candidate_kernel = self.add_variable(
+        "candidate/%s" % _WEIGHTS_VARIABLE_NAME,
+        shape=[input_depth + self._num_units, self._num_units],
+        initializer=self._kernel_initializer)
+    self._candidate_bias = self.add_variable(
+        "candidate/%s" % _BIAS_VARIABLE_NAME,
+        shape=[self._num_units],
+        initializer=(
+            self._bias_initializer
+            if self._bias_initializer is not None
+            else init_ops.zeros_initializer(dtype=self.dtype)))
+
+    self.built = True
+
   def call(self, inputs, state):
     """Gated recurrent unit (GRU) with nunits cells."""
-    if self._gate_linear is None:
-      bias_ones = self._bias_initializer
-      if self._bias_initializer is None:
-        bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype)
-      with vs.variable_scope("gates"):  # Reset gate and update gate.
-        self._gate_linear = _Linear(
-            [inputs, state],
-            2 * self._num_units,
-            True,
-            bias_initializer=bias_ones,
-            kernel_initializer=self._kernel_initializer)
-
-    value = math_ops.sigmoid(self._gate_linear([inputs, state]))
+
+    gate_inputs = math_ops.matmul(
+        array_ops.concat([inputs, state], 1), self._gate_kernel)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._gate_bias)
+
+    value = math_ops.sigmoid(gate_inputs)
     r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)
 
     r_state = r * state
-    if self._candidate_linear is None:
-      with vs.variable_scope("candidate"):
-        self._candidate_linear = _Linear(
-            [inputs, r_state],
-            self._num_units,
-            True,
-            bias_initializer=self._bias_initializer,
-            kernel_initializer=self._kernel_initializer)
-    c = self._activation(self._candidate_linear([inputs, r_state]))
+
+    candidate = math_ops.matmul(
+        array_ops.concat([inputs, r_state], 1), self._candidate_kernel)
+    candidate = nn_ops.bias_add(candidate, self._candidate_bias)
+
+    c = self._activation(candidate)
     new_h = u * state + (1 - u) * c
     return new_h, new_h
 
@@ -527,7 +611,6 @@ class BasicLSTMCell(_LayerRNNCell):
     self._forget_bias = forget_bias
     self._state_is_tuple = state_is_tuple
     self._activation = activation or math_ops.tanh
-    self._linear = None
 
   @property
   def state_size(self):
@@ -551,9 +634,9 @@ class BasicLSTMCell(_LayerRNNCell):
     self._bias = self.add_variable(
         _BIAS_VARIABLE_NAME,
         shape=[4 * self._num_units],
-        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+        initializer=init_ops.zeros_initializer(dtype=self.dtype))
 
-    self._built = True
+    self.built = True
 
   def call(self, inputs, state):
     """Long short-term memory cell (LSTM).
@@ -729,7 +812,7 @@ class LSTMCell(_LayerRNNCell):
     self._bias = self.add_variable(
         _BIAS_VARIABLE_NAME,
         shape=[4 * self._num_units],
-        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+        initializer=init_ops.zeros_initializer(dtype=self.dtype))
     if self._use_peepholes:
       self._w_f_diag = self.add_variable("w_f_diag", shape=[self._num_units],
                                          initializer=self._initializer)
@@ -749,7 +832,7 @@ class LSTMCell(_LayerRNNCell):
           initializer=self._initializer,
           partitioner=maybe_proj_partitioner)
 
-    self._built = True
+    self.built = True
 
   def call(self, inputs, state):
     """Run one step of LSTM.
@@ -1279,146 +1362,3 @@ class _SlimRNNCell(RNNCell):
     scope = scope or self._cell_name
     output, state = self._cell_fn(inputs, state, scope=scope)
     return output, state
-
-
-class _Linear(object):
-  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
-
-  Args:
-    args: a 2D Tensor or a list of 2D, batch, n, Tensors.
-    output_size: int, second dimension of weight variable.
-    dtype: data type for variables.
-    build_bias: boolean, whether to build a bias variable.
-    bias_initializer: starting value to initialize the bias
-      (default is all zeros).
-    kernel_initializer: starting value to initialize the weight.
-
-  Raises:
-    ValueError: if inputs_shape is wrong.
-  """
-
-  def __init__(self,
-               args,
-               output_size,
-               build_bias,
-               bias_initializer=None,
-               kernel_initializer=None):
-    self._build_bias = build_bias
-
-    if args is None or (nest.is_sequence(args) and not args):
-      raise ValueError("`args` must be specified")
-    if not nest.is_sequence(args):
-      args = [args]
-      self._is_sequence = False
-    else:
-      self._is_sequence = True
-
-    # Calculate the total size of arguments on dimension 1.
-    total_arg_size = 0
-    shapes = [a.get_shape() for a in args]
-    for shape in shapes:
-      if shape.ndims != 2:
-        raise ValueError("linear is expecting 2D arguments: %s" % shapes)
-      if shape[1].value is None:
-        raise ValueError("linear expects shape[1] to be provided for shape %s, "
-                         "but saw %s" % (shape, shape[1]))
-      else:
-        total_arg_size += shape[1].value
-
-    dtype = [a.dtype for a in args][0]
-
-    scope = vs.get_variable_scope()
-    with vs.variable_scope(scope) as outer_scope:
-      self._weights = vs.get_variable(
-          _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
-          dtype=dtype,
-          initializer=kernel_initializer)
-      if build_bias:
-        with vs.variable_scope(outer_scope) as inner_scope:
-          inner_scope.set_partitioner(None)
-          if bias_initializer is None:
-            bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
-          self._biases = vs.get_variable(
-              _BIAS_VARIABLE_NAME, [output_size],
-              dtype=dtype,
-              initializer=bias_initializer)
-
-  def __call__(self, args):
-    if not self._is_sequence:
-      args = [args]
-
-    if len(args) == 1:
-      res = math_ops.matmul(args[0], self._weights)
-    else:
-      # Explicitly creating a one for a minor performance improvement.
-      one = constant_op.constant(1, dtype=dtypes.int32)
-      res = math_ops.matmul(array_ops.concat(args, one), self._weights)
-    if self._build_bias:
-      res = nn_ops.bias_add(res, self._biases)
-    return res
-
-
-# TODO(xpan): Remove this function in a follow up.
-def _linear(args,
-            output_size,
-            bias,
-            bias_initializer=None,
-            kernel_initializer=None):
-  """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
-
-  Args:
-    args: a 2D Tensor or a list of 2D, batch, n, Tensors.
-    output_size: int, second dimension of W[i].
-    bias: boolean, whether to add a bias term or not.
-    bias_initializer: starting value to initialize the bias
-      (default is all zeros).
-    kernel_initializer: starting value to initialize the weight.
-
-  Returns:
-    A 2D Tensor with shape `[batch, output_size]` equal to
-    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
-
-  Raises:
-    ValueError: if some of the arguments has unspecified or wrong shape.
-  """
-  if args is None or (nest.is_sequence(args) and not args):
-    raise ValueError("`args` must be specified")
-  if not nest.is_sequence(args):
-    args = [args]
-
-  # Calculate the total size of arguments on dimension 1.
-  total_arg_size = 0
-  shapes = [a.get_shape() for a in args]
-  for shape in shapes:
-    if shape.ndims != 2:
-      raise ValueError("linear is expecting 2D arguments: %s" % shapes)
-    if shape[1].value is None:
-      raise ValueError("linear expects shape[1] to be provided for shape %s, "
-                       "but saw %s" % (shape, shape[1]))
-    else:
-      total_arg_size += shape[1].value
-
-  dtype = [a.dtype for a in args][0]
-
-  # Now the computation.
-  scope = vs.get_variable_scope()
-  with vs.variable_scope(scope) as outer_scope:
-    weights = vs.get_variable(
-        _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size],
-        dtype=dtype,
-        initializer=kernel_initializer)
-    if len(args) == 1:
-      res = math_ops.matmul(args[0], weights)
-    else:
-      res = math_ops.matmul(array_ops.concat(args, 1), weights)
-    if not bias:
-      return res
-    with vs.variable_scope(outer_scope) as inner_scope:
-      inner_scope.set_partitioner(None)
-      if bias_initializer is None:
-        bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype)
-      biases = vs.get_variable(
-          _BIAS_VARIABLE_NAME, [output_size],
-          dtype=dtype,
-          initializer=bias_initializer)
-    return nn_ops.bias_add(res, biases)
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 45d681c3d517f526abac140261fe65d54e08c597..c0c1ade495455df6a4965eefba4b823ca84e7c31 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -29,11 +29,41 @@ import numpy as np
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_script_ops
 
 
+class EagerFunc(object):
+  """A wrapper for a function owned by an EagerPyFunc."""
+
+  def __init__(self, func, Tout):
+    """Constructs an EagerFunc.
+
+    Args:
+      func: The function to wrap.
+      Tout: A list of datatypes for the output; an empty list if the output is
+            None.
+    """
+    self._func = func
+    self._out_dtypes = Tout
+
+  def __call__(self, *args, **kwargs):
+    """Passes args, kwargs to `self._func`, which is executed eagerly."""
+    with context.eager_mode():
+      ret = self._func(*args, **kwargs)
+      if isinstance(ret, (tuple, list)):
+        return [
+            ops.convert_to_tensor(x, dtype=dtype)
+            for (x, dtype) in zip(ret, self._out_dtypes)
+        ]
+      elif ret is None:
+        return ret
+      else:
+        return ops.convert_to_tensor(ret, dtype=self._out_dtypes[0])
+
+
 class FuncRegistry(object):
   """A helper class to keep track of registered py functions.
 
@@ -91,16 +121,20 @@ class FuncRegistry(object):
     if func is None:
       raise ValueError("callback %s is not found" % token)
     ret = func(*args)
-    # Strings seem to lead to a memory leak here if they're not wrapped in a
-    # list.
-    if isinstance(ret, six.binary_type):
-      ret = [ret]
-    # Ensures that we return either a single numpy array or a list of numpy
-    # arrays.
-    if isinstance(ret, (tuple, list)):
-      return [self._convert(x) for x in ret]
+
+    if isinstance(func, EagerFunc):
+      return ret
     else:
-      return self._convert(ret)
+      # Strings seem to lead to a memory leak here if they're not wrapped in a
+      # list.
+      if isinstance(ret, six.binary_type):
+        ret = [ret]
+      # Ensures that we return either a single numpy array or a list of numpy
+      # arrays.
+      if isinstance(ret, (tuple, list)):
+        return [self._convert(x) for x in ret]
+      else:
+        return self._convert(ret)
 
   def size(self):
     """Returns how many functions are currently registered."""
@@ -129,6 +163,86 @@ class CleanupFunc(object):
     _py_funcs.remove(self._token)
 
 
+def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
+  """See documentation for py_func and eager_py_func."""
+
+  is_list_or_tuple = False
+  if isinstance(Tout, (list, tuple)):
+    is_list_or_tuple = True
+  else:
+    Tout = [Tout]
+
+  if eager:
+    func = EagerFunc(func, Tout)
+
+  token = _py_funcs.insert(func)
+  # We tie the registered function's lifetime with the current default graph,
+  # i.e., when the current graph is destroyed, we remove its py funcs.
+  graph = ops.get_default_graph()
+
+  # pylint: disable=protected-access
+  while isinstance(graph, function._FuncGraph):
+    # If the py_func was declared inside a _FuncGraph, its lifetime should be
+    # bound to that of the outer graph instead.
+    graph = graph._outer_graph
+
+  cleanup = CleanupFunc(token)
+
+  # TODO(zhifengc): Consider adding a Graph method to collect
+  # `cleanup` objects in one of its member.
+  if not hasattr(graph, "_cleanup_py_funcs_used_in_graph"):
+    graph._cleanup_py_funcs_used_in_graph = []
+
+  # When `graph` is destroyed, elements in _cleanup_py_funcs_used_in_graph
+  # will be destroyed and their __del__ will remove the 'token' from
+  # the funcs registry.
+  graph._cleanup_py_funcs_used_in_graph.append(cleanup)
+  # pylint: enable=protected-access
+
+  # pylint: disable=protected-access
+  if eager:
+    result = gen_script_ops._eager_py_func(
+        input=inp, token=token, Tout=Tout, name=name)
+  else:
+    if stateful:
+      result = gen_script_ops._py_func(
+          input=inp, token=token, Tout=Tout, name=name)
+    else:
+      result = gen_script_ops._py_func_stateless(
+          input=inp, token=token, Tout=Tout, name=name)
+  # pylint: enable=protected-access
+  return result if is_list_or_tuple else result[0]
+
+
+def eager_py_func(func, inp, Tout, name=None):
+  """Wraps a python function into a TensorFlow op.
+
+  When the returned op is executed, `func` is invoked with eager execution
+  enabled. Inputs are Tensor objects and func must return None or objects
+  that may be converted to Tensor objects.
+
+  This function has the same limitations as `py_func` with respect to
+  serialization and distribution.
+
+  Args:
+    func: A Python function which accepts a list of `Tensor` objects
+      having element types that match the corresponding `tf.Tensor` objects
+      in `inp` and returns a list of `Tensor` objects (or a single
+      `Tensor`, or `None`) having element types that match the
+      corresponding values in `Tout`.
+    inp: A list of `Tensor` objects.
+    Tout: A list or tuple of tensorflow data types or a single tensorflow data
+      type if there is only one, indicating what `func` returns; an empty list
+      if no value is returned (i.e., if the return value is `None`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` or a single `Tensor` which `func` computes; an empty list
+    if `func` returns None.
+  """
+  return _internal_py_func(func=func, inp=inp, Tout=Tout, eager=True, name=name)
+
+
 def py_func(func, inp, Tout, stateful=True, name=None):
   """Wraps a python function and uses it as a TensorFlow op.
 
@@ -163,6 +277,12 @@ def py_func(func, inp, Tout, stateful=True, name=None):
       having element types that match the corresponding `tf.Tensor` objects
       in `inp`, and returns a list of `ndarray` objects (or a single `ndarray`)
       having element types that match the corresponding values in `Tout`.
+      Important Note: Input and output numpy `ndarray`s of `func` are not
+      guaranteed to be copies. In some cases their underlying memory will be
+      shared with the corresponding TensorFlow tensors.
+      In-place modification or storing `func` input or return values in
+      python datastructures without explicit (np.)copy
+      can have non-deterministic consequences.
     inp: A list of `Tensor` objects.
     Tout: A list or tuple of tensorflow data types or a single tensorflow data
       type if there is only one, indicating what `func` returns.
@@ -176,46 +296,12 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   Returns:
     A list of `Tensor` or a single `Tensor` which `func` computes.
   """
-  token = _py_funcs.insert(func)
-  # We tie the registered function's life-time with the current
-  # default graph. I.e., when the current graph is destroyed, we
-  # should remove its py funcs.
-  g = ops.get_default_graph()
-
-  # pylint: disable=protected-access
-  while isinstance(g, function._FuncGraph):
-    # If the py_func was declared inside a _FuncGraph, its lifetime should be
-    # bound to that of the outer graph instead.
-    g = g._outer_graph
-
-  cleanup = CleanupFunc(token)
-
-  # TODO(zhifengc): Consider adding a Graph method to collect
-  # `cleanup` objects in one of its member.
-  if not hasattr(g, "_cleanup_py_funcs_used_in_graph"):
-    g._cleanup_py_funcs_used_in_graph = []
-
-  # When g is destroyed, elements in _cleanup_py_funcs_used_in_graph
-  # will be destroyed and their __del__ will remove the 'token' from
-  # the funcs registry.
-  g._cleanup_py_funcs_used_in_graph.append(cleanup)
-  # pylint: enable=protected-access
-
-  if isinstance(Tout, (list, tuple)):
-    is_list_or_tuple = True
-  else:
-    Tout = [Tout]
-    is_list_or_tuple = False
-  # pylint: disable=protected-access
-  if stateful:
-    result = gen_script_ops._py_func(
-        input=inp, token=token, Tout=Tout, name=name)
-  else:
-    result = gen_script_ops._py_func_stateless(
-        input=inp, token=token, Tout=Tout, name=name)
-  # pylint: enable=protected-access
-  return result if is_list_or_tuple else result[0]
+  return _internal_py_func(
+      func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
 
+# TODO(akshayka): PyFuncs where the 'eager' attribute is set to True should be
+# differentiable, i.e., the gradient of PyFunc should propagate Nones if the
+# eager attribute is not set, and otherwise, it should return the gradient.
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 404041dfe14e83e23ccabd99180e73435cd5d660..62f20e8c9de58a2d40e7e8fa232493fd44429c26 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -1385,16 +1385,17 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
             empty_row_indicator)
 
 
-def serialize_sparse(sp_input, name=None):
-  """Serialize a `SparseTensor` into a string 3-vector (1-D `Tensor`) object.
+def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
+  """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
 
   Args:
     sp_input: The input `SparseTensor`.
     name: A name prefix for the returned tensors (optional).
+    out_type: The `dtype` to use for serialization.
 
   Returns:
-    A string 3-vector (1D `Tensor`), with each column representing the
-    serialized `SparseTensor`'s indices, values, and shape (respectively).
+    A 3-vector (1-D `Tensor`), with each column representing the serialized
+    `SparseTensor`'s indices, values, and shape (respectively).
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
@@ -1402,11 +1403,15 @@ def serialize_sparse(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._serialize_sparse(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      name=name,
+      out_type=out_type)
 
 
-def serialize_many_sparse(sp_input, name=None):
-  """Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` string `Tensor`.
+def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
+  """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
 
   The `SparseTensor` must have rank `R` greater than 1, and the first dimension
   is treated as the minibatch dimension.  Elements of the `SparseTensor`
@@ -1419,11 +1424,12 @@ def serialize_many_sparse(sp_input, name=None):
   Args:
     sp_input: The input rank `R` `SparseTensor`.
     name: A name prefix for the returned tensors (optional).
+    out_type: The `dtype` to use for serialization.
 
   Returns:
-    A string matrix (2-D `Tensor`) with `N` rows and `3` columns.
-    Each column represents serialized `SparseTensor`'s indices, values, and
-    shape (respectively).
+    A matrix (2-D `Tensor`) with `N` rows and `3` columns. Each column
+    represents serialized `SparseTensor`'s indices, values, and shape
+    (respectively).
 
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
@@ -1431,7 +1437,77 @@ def serialize_many_sparse(sp_input, name=None):
   sp_input = _convert_to_sparse_tensor(sp_input)
 
   return gen_sparse_ops._serialize_many_sparse(
-      sp_input.indices, sp_input.values, sp_input.dense_shape, name=name)
+      sp_input.indices,
+      sp_input.values,
+      sp_input.dense_shape,
+      name=name,
+      out_type=out_type)
+
+
+def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None):
+  """Deserialize `SparseTensor` objects.
+
+  The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+  the last dimension stores serialized `SparseTensor` objects and the other N
+  dimensions (N >= 0) correspond to a batch. The ranks of the original
+  `SparseTensor` objects must all match. When the final `SparseTensor` is
+  created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+  the sparse tensors have been concatenated along new dimensions, one for each
+  batch.
+
+  The output `SparseTensor` object's shape values for the original dimensions
+  are the max across the input `SparseTensor` objects' shape values for the
+  corresponding dimensions. The new dimensions match the size of the batch.
+
+  The input `SparseTensor` objects' indices are assumed ordered in
+  standard lexicographic order.  If this is not the case, after this
+  step run `SparseReorder` to restore index ordering.
+
+  For example, if the serialized input is a `[2 x 3]` matrix representing two
+  original `SparseTensor` objects:
+
+      index = [ 0]
+              [10]
+              [20]
+      values = [1, 2, 3]
+      shape = [50]
+
+  and
+
+      index = [ 2]
+              [10]
+      values = [4, 5]
+      shape = [30]
+
+  then the final deserialized `SparseTensor` will be:
+
+      index = [0  0]
+              [0 10]
+              [0 20]
+              [1  2]
+              [1 10]
+      values = [1, 2, 3, 4, 5]
+      shape = [2 50]
+
+  Args:
+    serialized_sparse: The serialized `SparseTensor` objects.
+      The last dimension must have 3 columns.
+    dtype: The `dtype` of the serialized `SparseTensor` objects.
+    rank: (optional) Python int, the rank of the `SparseTensor` objects.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A `SparseTensor` representing the deserialized `SparseTensor` objects.
+
+  """
+  output_indices, output_values, output_shape = (
+      gen_sparse_ops._deserialize_sparse(serialized_sparse, dtype, name=name))
+
+  # Feed rank data back in, if available
+  output_indices.set_shape([None, rank])
+  output_shape.set_shape([rank])
+
+  return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
 def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index dbab07da42671744284d703f0cd80e601a5fa8a8..dee495f78fa5c2fa099772d0a84f5ff0981c8c59 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -347,5 +347,71 @@ def scatter_update(ref, indices, updates, use_locking=True, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.scatter_update(ref, indices, updates,
                                         use_locking=use_locking, name=name)
-  return gen_resource_variable_ops.resource_scatter_update(
-      ref.handle, indices, updates, name=name)
+  with ops.control_dependencies(
+      [gen_resource_variable_ops.resource_scatter_update(
+          ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+          name=name)]):
+    return ref.read_value()
+
+
+def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
+  r"""Applies sparse `updates` to individual values or slices in a Variable.
+
+  `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+  `indices` must be integer tensor, containing indices into `ref`.
+  It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+  The innermost dimension of `indices` (with length `K`) corresponds to
+  indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+  dimension of `ref`.
+
+  `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+  ```
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  ```
+
+  For example, say we want to update 4 scattered elements to a rank-1 tensor to
+  8 elements. In Python, that update would look like this:
+
+  ```python
+      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+      indices = tf.constant([[4], [3], [1] ,[7]])
+      updates = tf.constant([9, 10, 11, 12])
+      update = tf.scatter_nd_update(ref, indices, updates)
+      with tf.Session() as sess:
+        print sess.run(update)
+  ```
+
+  The resulting update to ref would look like this:
+
+      [1, 11, 3, 10, 9, 6, 7, 12]
+
+  See @{tf.scatter_nd} for more details about how to make updates to
+  slices.
+
+  Args:
+    ref: A Variable.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A Tensor. Must be one of the following types: int32, int64.
+      A tensor of indices into ref.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A Tensor. Must have the same type as ref. A tensor of updated
+      values to add to ref.
+    use_locking: An optional `bool`. Defaults to `True`.
+      An optional bool. Defaults to True. If True, the assignment will
+      be protected by a lock; otherwise the behavior is undefined,
+      but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    The value of the variable after the update.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_nd_update(
+        ref, indices, updates, use_locking, name)
+  with ops.control_dependencies([gen_state_ops.resource_scatter_nd_update(
+      ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
+      use_locking, name)]):
+    return ref.read_value()
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 24ef70c6f4d29e752ffd6ead08952fd53f5ca581..07796b28d9f6b85aa2d4ee8cbc47d10eef3894de 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import functools
 import traceback
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
@@ -138,6 +139,10 @@ def make_template(name_, func_, create_scope_now_=False, unique_name_=None,
   """
   if kwargs:
     func_ = functools.partial(func_, **kwargs)
+  if context.in_eager_mode():
+    return EagerTemplate(
+        name_, func_, create_scope_now=create_scope_now_,
+        unique_name=unique_name_, custom_getter=custom_getter_)
   return Template(
       name_, func_, create_scope_now=create_scope_now_,
       unique_name=unique_name_, custom_getter=custom_getter_)
@@ -302,6 +307,12 @@ class Template(object):
       # To prevent partial matches on the scope_name, we add '/' at the end.
       return name if name[-1] == "/" else name + "/"
 
+  @property
+  def variables(self):
+    """Returns the list of global and local variables created by the Template.
+    """
+    return self.global_variables + self.local_variables
+
   @property
   def trainable_variables(self):
     """Returns the list of trainable variables created by the Template."""
@@ -311,6 +322,14 @@ class Template(object):
     else:
       return []
 
+  @property
+  def non_trainable_variables(self):
+    """Returns the list of non-trainable variables created by the Template."""
+    # TODO(apassos) Make sure it matches Eager when using local variables.
+    global_variables = self.global_variables
+    trainable_variables = set(self.trainable_variables)
+    return [x for x in global_variables if x not in trainable_variables]
+
   @property
   def global_variables(self):
     """Returns the list of global variables created by the Template."""
@@ -329,6 +348,21 @@ class Template(object):
     else:
       return []
 
+  @property
+  def weights(self):
+    """List of weights/variables created by the Template."""
+    return self.variables
+
+  @property
+  def trainable_weights(self):
+    """List of trainable weights/variables created by the Template."""
+    return self.trainable_variables
+
+  @property
+  def non_trainable_weights(self):
+    """List of non-trainable weights/variables created by the Template."""
+    return self.non_trainable_variables
+
   @property
   @deprecated(
       "2017-02-21", "The .var_scope property is deprecated. Please change your "
@@ -336,3 +370,190 @@ class Template(object):
   def var_scope(self):
     """Returns the variable scope object created by this Template."""
     return self._variable_scope
+
+
+class EagerTemplate(Template):
+  """Wrap a function to aid in variable sharing in Eager mode.
+
+  Templates are functions that create variables the first time they are called
+  and reuse them thereafter. See `make_template` for full documentation.
+
+  Note: By default, the full variable scope is captured at the time of first
+  call. If `create_scope_now` is passed as True to the constructor, the full
+  scope will be captured there, but no variables will be created until the first
+  call.
+  """
+
+  def __init__(self, name, func, create_scope_now=False, unique_name=None,
+               custom_getter=None):
+    """Creates a template for the given function.
+
+    Args:
+      name: A name for the scope created by this template. The
+        name will be made unique by appending `_N` to the it (see how
+        `tf.variable_scope` treats the `default_name` for details).
+      func: The function to apply each time.
+      create_scope_now: Whether to create the scope at Template construction
+        time, rather than first call. Defaults to false. Creating the scope at
+        construction time may be more convenient if the template is passed
+        through much lower level code, and you want to be sure of the scope
+        name without knowing exactly where it will be first called. If set to
+        True, the scope will be created in the constructor, and all subsequent
+        times in __call__, leading to a trailing numeral being added to the
+        names of all created Tensors. If set to False, the scope will be created
+        at the first call location.
+      unique_name: When used, it overrides name_ and is not made unique. If a
+        template of the same scope/unique_name already exists and reuse is
+        false, an error is raised. Defaults to None.
+      custom_getter: optional custom getter to pass to variable_scope()
+
+    Raises:
+      RuntimeError: if eager mode is not enabled.
+      ValueError: if the name is None or unique_name is provided.
+    """
+    if not context.in_eager_mode():
+      raise RuntimeError(
+          "{} objects can only be used when eager execution is enabled, use "
+          "tf.Template for graph construction".
+          format(type(self)))
+    if unique_name:
+      raise ValueError("unique_name cannot be used in eager mode.")
+    super(EagerTemplate, self).__init__(name, func, create_scope_now,
+                                        unique_name, custom_getter)
+    # Create an eager variable store only if the current variable store cannot
+    # store eager variables. This should allow for correct nesting.
+    default_vstore = variable_scope._get_default_variable_store()  # pylint: disable=protected-access
+    if default_vstore._store_eager_variables:  # pylint: disable=protected-access
+      raise ValueError("Nested EagerTemaplates are not currently supported.")
+    else:
+      self._eager_variable_store = variable_scope.EagerVariableStore()
+
+  def _call_func(self, args, kwargs, check_for_new_variables):
+    try:
+      vars_at_start = self._eager_variable_store.variables()
+      trainable_at_start = self._eager_variable_store.trainable_variables()
+
+      result = self._func(*args, **kwargs)
+      if check_for_new_variables:
+        trainable_variables = self._eager_variable_store.trainable_variables()
+        # If a variable that we intend to train is created as a side effect
+        # of creating a template, then that is almost certainly an error.
+        if len(trainable_at_start) != len(trainable_variables):
+          raise ValueError("Trainable variable created when calling a template "
+                           "after the first time, perhaps you used tf.Variable "
+                           "when you meant tf.get_variable: %s" %
+                           list(set(trainable_variables) -
+                                set(trainable_at_start)))
+
+        # Non-trainable tracking variables are a legitimate reason why a new
+        # variable would be created, but it is a relatively advanced use-case,
+        # so log it.
+        variables = self._eager_variable_store.variables()
+        if len(vars_at_start) != len(variables):
+          logging.info("New variables created when calling a template after "
+                       "the first time, perhaps you used tf.Variable when you "
+                       "meant tf.get_variable: %s",
+                       list(set(variables) - set(vars_at_start)))
+      return result
+    except Exception as exc:
+      # Reraise the exception, but append the original definition to the
+      # trace.
+      args = exc.args
+      if not args:
+        arg0 = ""
+      else:
+        arg0 = args[0]
+      trace = "".join(_skip_common_stack_elements(self._stacktrace,
+                                                  traceback.format_stack()))
+      arg0 = "%s\n\noriginally defined at:\n%s" % (arg0, trace)
+      new_args = [arg0]
+      new_args.extend(args[1:])
+      exc.args = tuple(new_args)
+      raise
+
+  def __call__(self, *args, **kwargs):
+    if self._variable_scope:
+      if self._variables_created:
+        # This is not the first visit to __call__, so variables have already
+        # been created, and we want to reuse them.
+        with variable_scope.variable_scope(self._variable_scope,
+                                           reuse=variable_scope.AUTO_REUSE):
+          with self._eager_variable_store.as_default():
+            return self._call_func(args, kwargs, check_for_new_variables=True)
+      else:
+        # This is the first visit to __call__, but the scope has already been
+        # created in the constructor. Set _variables_created after the inner
+        # function is successfully called so that subsequent calls take the if
+        # branch above.
+        with variable_scope.variable_scope(self._variable_scope,
+                                           reuse=variable_scope.AUTO_REUSE):
+          with self._eager_variable_store.as_default():
+            result = self._call_func(args, kwargs,
+                                     check_for_new_variables=False)
+        self._variables_created = True
+        return result
+    else:
+      # The scope was not created at construction time, so create it here.
+      # Subsequent calls should reuse variables.
+      with variable_scope.variable_scope(
+          self._unique_name, self._name,
+          custom_getter=self._custom_getter) as vs:
+        self._variable_scope = vs
+        with self._eager_variable_store.as_default():
+          result = self._call_func(args, kwargs,
+                                   check_for_new_variables=False)
+        self._variables_created = True
+        return result
+
+  @property
+  def name(self):
+    """Returns the name given to this Template."""
+    return self._name
+
+  @property
+  def func(self):
+    """Returns the func given to this Template."""
+    return self._func
+
+  @property
+  def variable_scope(self):
+    """Returns the variable scope object created by this Template."""
+    return self._variable_scope
+
+  @property
+  def variable_scope_name(self):
+    """Returns the variable scope name created by this Template."""
+    if self._variable_scope:
+      name = self._variable_scope.name
+      # To prevent partial matches on the scope_name, we add '/' at the end.
+      return name if name[-1] == "/" else name + "/"
+
+  @property
+  def variables(self):
+    """Returns the list of variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    return self._eager_variable_store.variables()
+
+  @property
+  def trainable_variables(self):
+    """Returns the list of trainable variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    return self._eager_variable_store.trainable_variables()
+
+  @property
+  def non_trainable_variables(self):
+    """Returns the list of non-trainable variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    return self._eager_variable_store.non_trainable_variables()
+
+  @property
+  def global_variables(self):
+    """Returns the list of global variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    return self.variables
+
+  @property
+  def local_variables(self):
+    """Returns the list of global variables created by the Template."""
+    # Currently there is no local variable in Eager mode.
+    return []
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index b4b7ad9d9104a2168b61ad6c3062e125be507747..398521c9b5ae9240f03a2ba5c4b0681bd8b3bfd7 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -146,10 +146,12 @@ class _GraphTensorArray(object):
         # write into the TensorArray from a Tensor with a set device
         # will retroactively set the device value of this op.
         def create():
+          """Create the TensorArray op."""
           return gen_data_flow_ops._tensor_array_v3(
               dtype=dtype,
               size=size,
               element_shape=element_shape,
+              identical_element_shapes=infer_shape,
               dynamic_size=dynamic_size,
               clear_after_read=clear_after_read,
               tensor_array_name=tensor_array_name,
@@ -593,10 +595,7 @@ class _EagerTensorArray(object):
             "a previous read (perhaps try setting clear_after_read = false?)" %
             index)
       else:
-        raise errors_impl.InvalidArgumentError(
-            None, None,
-            "Could not read from TensorArray index %d because it has not yet "
-            "been written to." % index)
+        tensor = self._maybe_zero(index)
 
     if self._clear_after_read:
       self._tensor_array[index] = None
@@ -610,52 +609,36 @@ class _EagerTensorArray(object):
     _eager_write_no_copy(ta._implementation, index, value)  # pylint: disable=protected-access
     return ta
 
+  def _maybe_zero(self, ix):
+    val = self._tensor_array[ix]
+    if val is None:
+      val = self._tensor_array[ix] = array_ops.zeros(
+          shape=self._element_shape, dtype=self._dtype)
+    return val
+
   def stack(self, name=None):
     """See TensorArray."""
-    try:
-      return array_ops.stack(self._tensor_array, name=name)
-    except ValueError:
-      if None in self._tensor_array:
-        idx = self._tensor_array.index(None)
-        raise errors_impl.InvalidArgumentError(
-            None, None, "Could not read from TensorArray index %d because "
-            "it has not yet been written to." % idx)
-      else:
-        raise
+    if self._tensor_array:
+      for ix in range(len(self._tensor_array)):
+        self._maybe_zero(ix)
+    return array_ops.stack(self._tensor_array, name=name)
 
   def gather(self, indices, name=None):
     """See TensorArray."""
     del name  # not meaningful in Eager mode
-    return array_ops.stack([self._tensor_array[i] for i in indices.numpy()])
+    return array_ops.stack([self._maybe_zero(i) for i in indices.numpy()])
 
   def concat(self, name=None):
     """See TensorArray."""
     try:
-      return array_ops.concat(self._tensor_array, 0, name=name)
+      return array_ops.concat(
+          [self._maybe_zero(ix) for ix in range(len(self._tensor_array))],
+          0, name=name)
     except errors_impl.OpError:
       # Reproduce a subset of the error-handling for graph-mode TensorArrays.
       shapes = [t.shape for t in self._tensor_array]
       ndims = [s.ndims for s in shapes]
-      if None in self._tensor_array:
-        # Concatenating empty TensorArrays is permitted if the element
-        # shape is defined; the output is a tensor with shape
-        # [0] + self._element_shape[1:]
-        if all(t is None for t in self._tensor_array):
-          if self._element_shape is not None:
-            return constant_op.constant([], shape=[0] + self._element_shape[1:])
-          else:
-            raise errors_impl.UnimplementedError(
-                None, None, "TensorArray has size zero, but "
-                "element_shape_except0 %s is not fully defined. Currently only "
-                "static shapes are supported when concatenating zero-size "
-                "TensorArrays." % self._element_shape[1:])
-        # Concatenating a TensorArray in which some but not all entries have
-        # been written to is not allowed.
-        idx = self._tensor_array.index(None)
-        raise errors_impl.InvalidArgumentError(
-            None, None, "Could not read from TensorArray index %d because "
-            "it has not yet been written to." % idx)
-      elif 0 in ndims:
+      if 0 in ndims:
         idx = ndims.index(0)
         raise errors_impl.InvalidArgumentError(
             None, None, "Concat saw a scalar shape at index %d but requires "
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 92fa928eede1796df539f00751d7e419f5af8a9f..4a23d96721de1171a6f90cd0e547a8add0ce2011 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -843,6 +843,7 @@ class _VariableStore(object):
     Raises:
       ValueError: When giving unsupported dtype.
     """
+    del shape
     # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
     if dtype.is_floating:
       initializer = init_ops.glorot_uniform_initializer()
@@ -850,9 +851,8 @@ class _VariableStore(object):
     # If dtype is DT_INT/DT_UINT, provide a default value `zero`
     # If dtype is DT_BOOL, provide a default value `FALSE`
     elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
-      initializer = init_ops.zeros_initializer()(
-          shape=shape, dtype=dtype.base_dtype)
-      initializing_from_value = True
+      initializer = init_ops.zeros_initializer()
+      initializing_from_value = False
     # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
     else:
       raise ValueError("An initializer for variable %s of %s is required"
@@ -1225,7 +1225,19 @@ class EagerVariableStore(object):
     return with_variable_store(self._store)
 
   def variables(self):
-    return self._store._vars.values()  # pylint: disable=protected-access
+    return sorted(self._store._vars.values(), key=lambda x: x.name)  # pylint: disable=protected-access
+
+  def trainable_variables(self):
+    # pylint: disable=protected-access
+    return sorted([x for x in self._store._vars.values() if x._trainable],
+                  key=lambda x: x.name)
+    # pylint: enable=protected-access
+
+  def non_trainable_variables(self):
+    # pylint: disable=protected-access
+    return sorted([x for x in self._store._vars.values() if not x._trainable],
+                  key=lambda x: x.name)
+    # pylint: enable=protected-access
 
 
 def get_variable(name,
@@ -1685,7 +1697,7 @@ class variable_scope(object):  # pylint: disable=invalid-name
   v1 = foo()  # Creates v.
   v2 = foo()  # Gets the same, existing v.
   assert v1 == v2
-
+  ```
 
   Basic example of sharing a variable with reuse=True:
 
@@ -1822,7 +1834,13 @@ class variable_scope(object):  # pylint: disable=invalid-name
     self._current_name_scope = None
 
   def __enter__(self):
-    if self._in_graph_mode:
+    # If the default graph is building a function, then we should not replace it
+    # with the cached graph.
+    if ops.get_default_graph().building_function:
+      self._building_function = True
+    else:
+      self._building_function = False
+    if self._in_graph_mode and not self._building_function:
       self._graph_context_manager = self._graph.as_default()
       self._graph_context_manager.__enter__()
     if self._cached_pure_variable_scope is not None:
@@ -1901,7 +1919,7 @@ class variable_scope(object):  # pylint: disable=invalid-name
         type_arg, value_arg, traceback_arg)
     if self._current_name_scope:
       self._current_name_scope.__exit__(type_arg, value_arg, traceback_arg)
-    if self._in_graph_mode:
+    if self._in_graph_mode and not self._building_function:
       self._graph_context_manager.__exit__(type_arg, value_arg, traceback_arg)
 
 
@@ -1973,8 +1991,10 @@ def variable(initial_value=None,
              validate_shape=True,
              caching_device=None,
              name=None,
-             dtype=None):
-  use_resource = get_variable_scope().use_resource
+             dtype=None,
+             use_resource=None):
+  if use_resource is None:
+    use_resource = get_variable_scope().use_resource
   if use_resource or (use_resource is None and context.in_eager_mode()):
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index f906b7b3c47b218cb789f96d8f258e0644e0dbe3..e0748d87e2d6ef2c2f8565669357f881334fa737 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -200,7 +200,7 @@ class Variable(object):
 
     @compatibility(eager)
     `tf.Variable` is not compatible with eager execution.  Use
-    `tfe.Variable` instead which is compatable with both eager execution
+    `tfe.Variable` instead which is compatible with both eager execution
     and graph construction.  See [the TensorFlow Eager Execution
     guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
     for details on how variables work in eager execution.
@@ -1063,13 +1063,13 @@ class Variable(object):
 class PartitionedVariable(object):
   """A container for partitioned `Variable` objects.
 
-  @compatiblity(eager) `tf.PartitionedVariable` is not compatible with
-  eager execution.  Use `tfe.Variable` instead which is compatable
+  @compatibility(eager) `tf.PartitionedVariable` is not compatible with
+  eager execution.  Use `tfe.Variable` instead which is compatible
   with both eager execution and graph construction.  See [the
   TensorFlow Eager Execution
   guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
   for details on how variables work in eager execution.
-  @end_compatiblity
+  @end_compatibility
   """
 
   class PartitionedVariableIterator(object):
@@ -1447,6 +1447,8 @@ def local_variables_initializer():
   Returns:
     An Op that initializes all local variables in the graph.
   """
+  if context.in_eager_mode():
+    return control_flow_ops.no_op(name="local_variables_initializer")
   return variables_initializer(local_variables())
 
 
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 5ecaa1baafb3dc6bb6d5f234f261837f64612c4e..9b92d9a18005ca5e6be3820427e3a3ba60a8ec2d 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -18,34 +18,110 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import errno as _errno
 import sys as _sys
 
 from tensorflow.python.platform import flags
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-def _benchmark_tests_can_log_memory():
-  return True
+def _usage(shorthelp):
+  """Writes __main__'s docstring to stdout with some help text.
+
+  Args:
+    shorthelp: bool, if True, prints only flags from the main module,
+        rather than all flags.
+  """
+  doc = _sys.modules['__main__'].__doc__
+  if not doc:
+    doc = '\nUSAGE: %s [flags]\n' % _sys.argv[0]
+    doc = flags.text_wrap(doc, indent='       ', firstline_indent='')
+  else:
+    # Replace all '%s' with sys.argv[0], and all '%%' with '%'.
+    num_specifiers = doc.count('%') - 2 * doc.count('%%')
+    try:
+      doc %= (_sys.argv[0],) * num_specifiers
+    except (OverflowError, TypeError, ValueError):
+      # Just display the docstring as-is.
+      pass
+  if shorthelp:
+    flag_str = flags.FLAGS.main_module_help()
+  else:
+    flag_str = str(flags.FLAGS)
+  try:
+    _sys.stdout.write(doc)
+    if flag_str:
+      _sys.stdout.write('\nflags:\n')
+      _sys.stdout.write(flag_str)
+    _sys.stdout.write('\n')
+  except IOError as e:
+    # We avoid printing a huge backtrace if we get EPIPE, because
+    # "foo.par --help | less" is a frequent use case.
+    if e.errno != _errno.EPIPE:
+      raise
+
+
+class _HelpFlag(flags.BooleanFlag):
+  """Special boolean flag that displays usage and raises SystemExit."""
+  NAME = 'help'
+  SHORT_NAME = 'h'
+
+  def __init__(self):
+    super(_HelpFlag, self).__init__(
+        self.NAME, False, 'show this help', short_name=self.SHORT_NAME)
+
+  def parse(self, arg):
+    if arg:
+      _usage(shorthelp=True)
+      print()
+      print('Try --helpfull to get a list of all flags.')
+      _sys.exit(1)
+
+
+class _HelpshortFlag(_HelpFlag):
+  """--helpshort is an alias for --help."""
+  NAME = 'helpshort'
+  SHORT_NAME = None
+
+
+class _HelpfullFlag(flags.BooleanFlag):
+  """Display help for flags in main module and all dependent modules."""
+
+  def __init__(self):
+    super(_HelpfullFlag, self).__init__('helpfull', False, 'show full help')
+
+  def parse(self, arg):
+    if arg:
+      _usage(shorthelp=False)
+      _sys.exit(1)
+
+
+_define_help_flags_called = False
+
+
+def _define_help_flags():
+  global _define_help_flags_called
+  if not _define_help_flags_called:
+    flags.DEFINE_flag(_HelpFlag())
+    flags.DEFINE_flag(_HelpfullFlag())
+    flags.DEFINE_flag(_HelpshortFlag())
+    _define_help_flags_called = True
 
 
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
-  f = flags.FLAGS
 
-  # Extract the args from the optional `argv` list.
-  args = argv[1:] if argv else None
+  # Define help flags.
+  _define_help_flags()
 
-  # Parse the known flags from that list, or from the command
-  # line otherwise.
-  # pylint: disable=protected-access
-  flags_passthrough = f._parse_flags(args=args)
-  # pylint: enable=protected-access
+  # Parse known flags.
+  argv = flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
 
   main = main or _sys.modules['__main__'].main
 
   # Call the main function, passing through any arguments
   # to the final program.
-  _sys.exit(main(_sys.argv[:1] + flags_passthrough))
+  _sys.exit(main(argv))
 
 
 _allowed_symbols = [
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index 392921abb45b125bd7113bea1f9c10250ae76542..837bca1dbd06c9ee4adbf05bfc7cf3586d072d16 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -43,8 +43,6 @@ GLOBAL_BENCHMARK_REGISTRY = set()
 # See also tensorflow/core/util/reporter.h TestReporter::kTestReporterEnv.
 TEST_REPORTER_TEST_ENV = "TEST_REPORT_FILE_PREFIX"
 
-_benchmark_tests_can_log_memory = app._benchmark_tests_can_log_memory  # pylint: disable=protected-access
-
 
 def _global_report_benchmark(
     name, iters=None, cpu_time=None, wall_time=None,
@@ -216,9 +214,8 @@ class TensorFlowBenchmark(Benchmark):
         store the trace of iteration in the benchmark report.
         The trace will be stored as a string in Google Chrome trace format
         in the extras field "full_trace_chrome_format".
-      store_memory_usage: Boolean, whether to run an extra
-        untimed iteration, calculate memory usage, and store that in extras
-        fields.
+      store_memory_usage: Boolean, whether to run an extra untimed iteration,
+        calculate memory usage, and store that in extras fields.
       name: (optional) Override the BenchmarkEntry name with `name`.
         Otherwise it is inferred from the top-level method name.
       extras: (optional) Dict mapping string keys to additional benchmark info.
@@ -230,8 +227,6 @@ class TensorFlowBenchmark(Benchmark):
       A `dict` containing the key-value pairs that were passed to
       `report_benchmark`.
     """
-    store_memory_usage &= _benchmark_tests_can_log_memory()
-
     for _ in range(burn_iters):
       sess.run(op_or_tensor, feed_dict=feed_dict)
 
diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py
index 138a0ced97bc03c491b0282fc56f25a575093684..abd6f3d85501449b4f32592aa3787d1cbdd67e40 100644
--- a/tensorflow/python/platform/flags.py
+++ b/tensorflow/python/platform/flags.py
@@ -13,199 +13,58 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Implementation of the flags interface."""
+"""Import router for absl.flags. See https://github.com/abseil/abseil-py."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse as _argparse
+import logging as _logging
 
-from tensorflow.python.platform import tf_logging as _logging
-from tensorflow.python.util.all_util import remove_undocumented
+# go/tf-wildcard-import
+from absl.flags import *  # pylint: disable=wildcard-import
+import six as _six
 
-_global_parser = _argparse.ArgumentParser()
+from tensorflow.python.util import tf_decorator
 
 
-# pylint: disable=invalid-name
+# Since we wrap absl.flags DEFINE functions, we need to declare this module
+# does not affect key flags.
+disclaim_key_flags()  # pylint: disable=undefined-variable
 
 
-class _FlagValues(object):
-  """Global container and accessor for flags and their values."""
+_RENAMED_ARGUMENTS = {
+    'flag_name': 'name',
+    'default_value': 'default',
+    'docstring': 'help',
+}
 
-  def __init__(self):
-    self.__dict__['__flags'] = {}
-    self.__dict__['__parsed'] = False
-    self.__dict__['__required_flags'] = set()
 
-  def _parse_flags(self, args=None):
-    result, unparsed = _global_parser.parse_known_args(args=args)
-    for flag_name, val in vars(result).items():
-      self.__dict__['__flags'][flag_name] = val
-    self.__dict__['__parsed'] = True
-    self._assert_all_required()
-    return unparsed
+def _wrap_define_function(original_function):
+  """Wraps absl.flags's define functions so tf.flags accepts old names."""
 
-  def __getattr__(self, name):
-    """Retrieves the 'value' attribute of the flag --name."""
-    try:
-      parsed = self.__dict__['__parsed']
-    except KeyError:
-      # May happen during pickle.load or copy.copy
-      raise AttributeError(name)
-    if not parsed:
-      self._parse_flags()
-    if name not in self.__dict__['__flags']:
-      raise AttributeError(name)
-    return self.__dict__['__flags'][name]
+  def wrapper(*args, **kwargs):
+    """Wrapper function that turns old keyword names to new ones."""
+    has_old_names = False
+    for old_name, new_name in _six.iteritems(_RENAMED_ARGUMENTS):
+      if old_name in kwargs:
+        has_old_names = True
+        value = kwargs.pop(old_name)
+        kwargs[new_name] = value
+    if has_old_names:
+      _logging.warning(
+          'Use of the keyword argument names (flag_name, default_value, '
+          'docstring) is deprecated, please use (name, default, help) instead.')
+    return original_function(*args, **kwargs)
 
-  def __setattr__(self, name, value):
-    """Sets the 'value' attribute of the flag --name."""
-    if not self.__dict__['__parsed']:
-      self._parse_flags()
-    self.__dict__['__flags'][name] = value
-    self._assert_required(name)
+  return tf_decorator.make_decorator(original_function, wrapper)
 
-  def _add_required_flag(self, item):
-    self.__dict__['__required_flags'].add(item)
 
-  def _assert_required(self, flag_name):
-    if (flag_name not in self.__dict__['__flags'] or
-        self.__dict__['__flags'][flag_name] is None):
-      raise AttributeError('Flag --%s must be specified.' % flag_name)
-
-  def _assert_all_required(self):
-    for flag_name in self.__dict__['__required_flags']:
-      self._assert_required(flag_name)
-
-
-def _define_helper(flag_name, default_value, docstring, flagtype):
-  """Registers 'flag_name' with 'default_value' and 'docstring'."""
-  _global_parser.add_argument('--' + flag_name,
-                              default=default_value,
-                              help=docstring,
-                              type=flagtype)
-
-
-# Provides the global object that can be used to access flags.
-FLAGS = _FlagValues()
-
-
-def DEFINE_string(flag_name, default_value, docstring):
-  """Defines a flag of type 'string'.
-
-  Args:
-    flag_name: The name of the flag as a string.
-    default_value: The default value the flag should take as a string.
-    docstring: A helpful message explaining the use of the flag.
-  """
-  _define_helper(flag_name, default_value, docstring, str)
-
-
-def DEFINE_integer(flag_name, default_value, docstring):
-  """Defines a flag of type 'int'.
-
-  Args:
-    flag_name: The name of the flag as a string.
-    default_value: The default value the flag should take as an int.
-    docstring: A helpful message explaining the use of the flag.
-  """
-  _define_helper(flag_name, default_value, docstring, int)
-
-
-def DEFINE_boolean(flag_name, default_value, docstring):
-  """Defines a flag of type 'boolean'.
-
-  Args:
-    flag_name: The name of the flag as a string.
-    default_value: The default value the flag should take as a boolean.
-    docstring: A helpful message explaining the use of the flag.
-  """
-  # Register a custom function for 'bool' so --flag=True works.
-  def str2bool(v):
-    return v.lower() in ('true', 't', '1')
-  _global_parser.add_argument('--' + flag_name,
-                              nargs='?',
-                              const=True,
-                              help=docstring,
-                              default=default_value,
-                              type=str2bool)
-
-  # Add negated version, stay consistent with argparse with regard to
-  # dashes in flag names.
-  _global_parser.add_argument('--no' + flag_name,
-                              action='store_false',
-                              dest=flag_name.replace('-', '_'))
-
-
-# The internal google library defines the following alias, so we match
-# the API for consistency.
-DEFINE_bool = DEFINE_boolean  # pylint: disable=invalid-name
-
-
-def DEFINE_float(flag_name, default_value, docstring):
-  """Defines a flag of type 'float'.
-
-  Args:
-    flag_name: The name of the flag as a string.
-    default_value: The default value the flag should take as a float.
-    docstring: A helpful message explaining the use of the flag.
-  """
-  _define_helper(flag_name, default_value, docstring, float)
-
-
-def mark_flag_as_required(flag_name):
-  """Ensures that flag is not None during program execution.
-  
-  It is recommended to call this method like this:
-  
-    if __name__ == '__main__':
-      tf.flags.mark_flag_as_required('your_flag_name')
-      tf.app.run()
-  
-  Args:
-    flag_name: string, name of the flag to mark as required.
- 
-  Raises:
-    AttributeError: if flag_name is not registered as a valid flag name.
-      NOTE: The exception raised will change in the future. 
-  """
-  if _global_parser.get_default(flag_name) is not None:
-    _logging.warn(
-        'Flag %s has a non-None default value; therefore, '
-        'mark_flag_as_required will pass even if flag is not specified in the '
-        'command line!' % flag_name)
-  FLAGS._add_required_flag(flag_name)
-
-
-def mark_flags_as_required(flag_names):
-  """Ensures that flags are not None during program execution.
-  
-  Recommended usage:
-  
-    if __name__ == '__main__':
-      tf.flags.mark_flags_as_required(['flag1', 'flag2', 'flag3'])
-      tf.app.run()
-  
-  Args:
-    flag_names: a list/tuple of flag names to mark as required.
-
-  Raises:
-    AttributeError: If any of flag name has not already been defined as a flag.
-      NOTE: The exception raised will change in the future.
-  """
-  for flag_name in flag_names:
-    mark_flag_as_required(flag_name)
-
-
-_allowed_symbols = [
-    # We rely on gflags documentation.
-    'DEFINE_bool',
-    'DEFINE_boolean',
-    'DEFINE_float',
-    'DEFINE_integer',
-    'DEFINE_string',
-    'FLAGS',
-    'mark_flag_as_required',
-    'mark_flags_as_required',
-]
-remove_undocumented(__name__, _allowed_symbols)
+# pylint: disable=invalid-name,used-before-assignment
+# absl.flags APIs use `default` as the name of the default value argument.
+# Allow the following functions continue to accept `default_value`.
+DEFINE_string = _wrap_define_function(DEFINE_string)
+DEFINE_boolean = _wrap_define_function(DEFINE_boolean)
+DEFINE_bool = DEFINE_boolean
+DEFINE_float = _wrap_define_function(DEFINE_float)
+DEFINE_integer = _wrap_define_function(DEFINE_integer)
+# pylint: enable=invalid-name,used-before-assignment
diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py
index 7b08c3f8a6da4598e40edd86b479caa6cf190fe9..e8200142dd0d5e9fbd0102deb2f2c3a9fb2197c6 100644
--- a/tensorflow/python/platform/flags_test.py
+++ b/tensorflow/python/platform/flags_test.py
@@ -12,108 +12,62 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for our flags implementation."""
+"""Sanity tests for tf.flags."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-import sys
 import unittest
 
-from tensorflow.python.platform import app
-from tensorflow.python.platform import flags
+from absl import flags as absl_flags
 
-flags.DEFINE_string("string_foo", "default_val", "HelpString")
-flags.DEFINE_integer("int_foo", 42, "HelpString")
-flags.DEFINE_float("float_foo", 42.0, "HelpString")
+from tensorflow.python.platform import flags
 
-flags.DEFINE_boolean("bool_foo", True, "HelpString")
-flags.DEFINE_boolean("bool_negation", True, "HelpString")
-flags.DEFINE_boolean("bool-dash-negation", True, "HelpString")
-flags.DEFINE_boolean("bool_a", False, "HelpString")
-flags.DEFINE_boolean("bool_c", False, "HelpString")
-flags.DEFINE_boolean("bool_d", True, "HelpString")
-flags.DEFINE_bool("bool_e", True, "HelpString")
-flags.DEFINE_string("string_foo_required", "default_val", "HelpString")
-flags.DEFINE_string("none_string_foo_required", None, "HelpString")
 
-FLAGS = flags.FLAGS
+flags.DEFINE_string(
+    flag_name='old_string', default_value='default', docstring='docstring')
+flags.DEFINE_string(
+    name='new_string', default='default', help='docstring')
+flags.DEFINE_integer(
+    flag_name='old_integer', default_value=1, docstring='docstring')
+flags.DEFINE_integer(
+    name='new_integer', default=1, help='docstring')
+flags.DEFINE_float(
+    flag_name='old_float', default_value=1.5, docstring='docstring')
+flags.DEFINE_float(
+    name='new_float', default=1.5, help='docstring')
+flags.DEFINE_bool(
+    flag_name='old_bool', default_value=True, docstring='docstring')
+flags.DEFINE_bool(
+    name='new_bool', default=True, help='docstring')
+flags.DEFINE_boolean(
+    flag_name='old_boolean', default_value=False, docstring='docstring')
+flags.DEFINE_boolean(
+    name='new_boolean', default=False, help='docstring')
 
 
 class FlagsTest(unittest.TestCase):
 
-  def testString(self):
-    res = FLAGS.string_foo
-    self.assertEqual(res, "default_val")
-    FLAGS.string_foo = "bar"
-    self.assertEqual("bar", FLAGS.string_foo)
-
-  def testBool(self):
-    res = FLAGS.bool_foo
-    self.assertTrue(res)
-    FLAGS.bool_foo = False
-    self.assertFalse(FLAGS.bool_foo)
-
-  def testBoolCommandLines(self):
-    # Specified on command line with no args, sets to True,
-    # even if default is False.
-    self.assertEqual(True, FLAGS.bool_a)
-
-    # --no before the flag forces it to False, even if the
-    # default is True
-    self.assertEqual(False, FLAGS.bool_negation)
-
-    # --bool_flag=True sets to True
-    self.assertEqual(True, FLAGS.bool_c)
-
-    # --bool_flag=False sets to False
-    self.assertEqual(False, FLAGS.bool_d)
-
-  def testInt(self):
-    res = FLAGS.int_foo
-    self.assertEquals(res, 42)
-    FLAGS.int_foo = -1
-    self.assertEqual(-1, FLAGS.int_foo)
-
-  def testFloat(self):
-    res = FLAGS.float_foo
-    self.assertEquals(42.0, res)
-    FLAGS.float_foo = -1.0
-    self.assertEqual(-1.0, FLAGS.float_foo)
-
-  def test_copy(self):
-    copied = copy.copy(FLAGS)
-    self.assertEqual(copied.__dict__, FLAGS.__dict__)
-
-  def testStringRequired(self):
-    res = FLAGS.string_foo_required
-    self.assertEqual(res, "default_val")
-    FLAGS.string_foo_required = "bar"
-    self.assertEqual("bar", FLAGS.string_foo_required)
-
-  def testNoneStringRequired(self):
-    res = FLAGS.none_string_foo_required
-    self.assertEqual(res, "default_val")
-    FLAGS.none_string_foo_required = "bar"
-    self.assertEqual("bar", FLAGS.none_string_foo_required)
-
-
-def main(_):
-  # unittest.main() tries to interpret the unknown flags, so use the
-  # direct functions instead.
-  runner = unittest.TextTestRunner()
-  itersuite = unittest.TestLoader().loadTestsFromTestCase(FlagsTest)
-  runner.run(itersuite)
-
-
-if __name__ == "__main__":
-  # Test command lines
-  sys.argv.extend([
-      "--bool_a", "--nobool_negation", "--bool_c=True", "--bool_d=False",
-      "--none_string_foo_required=default_val",
-      "and_argument"
-  ])
-  flags.mark_flag_as_required('string_foo_required')
-  flags.mark_flags_as_required(['none_string_foo_required'])
-  app.run()
+  def test_global_flags_object(self):
+    self.assertIs(flags.FLAGS, absl_flags.FLAGS)
+
+  def test_keyword_arguments(self):
+    test_cases = (
+        ('old_string', 'default'),
+        ('new_string', 'default'),
+        ('old_integer', 1),
+        ('new_integer', 1),
+        ('old_float', 1.5),
+        ('new_float', 1.5),
+        ('old_bool', True),
+        ('new_bool', True),
+        ('old_boolean', False),
+        ('new_boolean', False),
+    )
+    for flag_name, default_value in test_cases:
+      self.assertEqual(default_value, absl_flags.FLAGS[flag_name].default)
+      self.assertEqual('docstring', absl_flags.FLAGS[flag_name].help)
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py
index 167dec6551f9321d01732ab4264fdb28a7bb6916..57635fb4d9d6698f1a6f1a51918fe3f269d8909b 100644
--- a/tensorflow/python/platform/sysconfig.py
+++ b/tensorflow/python/platform/sysconfig.py
@@ -64,8 +64,7 @@ def get_compile_flags():
   flags = []
   flags.append('-I%s' % get_include())
   flags.append('-I%s/external/nsync/public' % get_include())
-  if _CXX11_ABI_FLAG != -1:
-    flags.append('-D_GLIBCXX_USE_CXX11_ABI=%d' % _CXX11_ABI_FLAG)
+  flags.append('-D_GLIBCXX_USE_CXX11_ABI=%d' % _CXX11_ABI_FLAG)
   return flags
 
 
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 71ee5e365f7d093ebc105917e7dd68ba92b31231..85ed4f071c7022801f20db75d538e5917b8eea66 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -30,64 +30,92 @@ from logging import ERROR
 from logging import FATAL
 from logging import INFO
 from logging import WARN
+import threading
 
 import six
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 
-# Determine whether we are in an interactive environment
-_interactive = False
-try:
-  # This is only defined in interactive shells
-  if _sys.ps1: _interactive = True
-except AttributeError:
-  # Even now, we may be in an interactive shell with `python -i`.
-  _interactive = _sys.flags.interactive
+# Don't use this directly. Use _get_logger() instead.
+_logger = None
+_logger_lock = threading.Lock()
 
-# Scope the tensorflow logger to not conflict with users' loggers
-_logger = _logging.getLogger('tensorflow')
 
-# If we are in an interactive environment (like jupyter), set loglevel to info
-# and pipe the output to stdout
-if _interactive:
-  _logger.setLevel(INFO)
-  _logging_target = _sys.stdout
-else:
-  _logging_target = _sys.stderr
+def _get_logger():
+  global _logger
 
-# Add the output handler
-_handler = _logging.StreamHandler(_logging_target)
-_handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None))
-_logger.addHandler(_handler)
+  # Use double-checked locking to avoid taking lock unnecessarily.
+  if _logger:
+    return _logger
+
+  _logger_lock.acquire()
+
+  try:
+    if _logger:
+      return _logger
+
+    # Scope the TensorFlow logger to not conflict with users' loggers.
+    logger = _logging.getLogger('tensorflow')
+
+    # Don't further configure the TensorFlow logger if the root logger is
+    # already configured. This prevents double logging in those cases.
+    if not _logging.getLogger().handlers:
+      # Determine whether we are in an interactive environment
+      _interactive = False
+      try:
+        # This is only defined in interactive shells.
+        if _sys.ps1: _interactive = True
+      except AttributeError:
+        # Even now, we may be in an interactive shell with `python -i`.
+        _interactive = _sys.flags.interactive
+
+      # If we are in an interactive environment (like Jupyter), set loglevel
+      # to INFO and pipe the output to stdout.
+      if _interactive:
+        logger.setLevel(INFO)
+        _logging_target = _sys.stdout
+      else:
+        _logging_target = _sys.stderr
+
+      # Add the output handler.
+      _handler = _logging.StreamHandler(_logging_target)
+      _handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT, None))
+      logger.addHandler(_handler)
+
+    _logger = logger
+    return _logger
+
+  finally:
+    _logger_lock.release()
 
 
 def log(level, msg, *args, **kwargs):
-  _logger.log(level, msg, *args, **kwargs)
+  _get_logger().log(level, msg, *args, **kwargs)
 
 
 def debug(msg, *args, **kwargs):
-  _logger.debug(msg, *args, **kwargs)
+  _get_logger().debug(msg, *args, **kwargs)
 
 
 def error(msg, *args, **kwargs):
-  _logger.error(msg, *args, **kwargs)
+  _get_logger().error(msg, *args, **kwargs)
 
 
 def fatal(msg, *args, **kwargs):
-  _logger.fatal(msg, *args, **kwargs)
+  _get_logger().fatal(msg, *args, **kwargs)
 
 
 def info(msg, *args, **kwargs):
-  _logger.info(msg, *args, **kwargs)
+  _get_logger().info(msg, *args, **kwargs)
 
 
 def warn(msg, *args, **kwargs):
-  _logger.warn(msg, *args, **kwargs)
+  _get_logger().warn(msg, *args, **kwargs)
 
 
 def warning(msg, *args, **kwargs):
-  _logger.warning(msg, *args, **kwargs)
+  _get_logger().warning(msg, *args, **kwargs)
 
 
 _level_names = {
@@ -118,7 +146,7 @@ def flush():
 
 # Code below is taken from pyglib/logging
 def vlog(level, msg, *args, **kwargs):
-  _logger.log(level, msg, *args, **kwargs)
+  _get_logger().log(level, msg, *args, **kwargs)
 
 
 def _GetNextLogCountPerToken(token):
@@ -225,12 +253,12 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
 
 def get_verbosity():
   """Return how much logging output will be produced."""
-  return _logger.getEffectiveLevel()
+  return _get_logger().getEffectiveLevel()
 
 
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
-  _logger.setLevel(v)
+  _get_logger().setLevel(v)
 
 
 def _get_thread_id():
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 26cc5f0b74ecda5c0a88ee52ea5009d6aef55787..c815aad0a065eaba4a0dc52487b5ee67e271a146 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -53,10 +53,14 @@ cuda_py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:distributed_framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:variables",
     ],
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index 4ff09d3800f82c859c11948fc7fc580f7bdc91d3..4c915ac79a4534231846295f51c56f088948b594 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -169,7 +169,7 @@ class RunMetadataTest(test.TestCase):
       tfprof_node, run_meta = _run_loop_model()
       # The while-loop caused a node to appear 4 times in scheduling.
       ret = _extract_node(run_meta,
-                          'rnn/while/rnn/basic_rnn_cell/MatMul')
+                          'rnn/while/basic_rnn_cell/MatMul')
       self.assertEqual(len(ret['cpu:0']), 4)
 
       total_cpu_execs = 0
@@ -178,7 +178,7 @@ class RunMetadataTest(test.TestCase):
 
       mm_node = lib.SearchTFProfNode(
           tfprof_node,
-          'rnn/while/rnn/basic_rnn_cell/MatMul')
+          'rnn/while/basic_rnn_cell/MatMul')
 
       self.assertEqual(mm_node.run_count, 4)
       self.assertEqual(mm_node.cpu_exec_micros, total_cpu_execs)
@@ -218,7 +218,7 @@ class RunMetadataTest(test.TestCase):
       tfprof_node, run_meta = _run_loop_model()
       # The while-loop caused a node to appear 4 times in scheduling.
       ret = _extract_node(run_meta,
-                          'rnn/while/rnn/basic_rnn_cell/MatMul')
+                          'rnn/while/basic_rnn_cell/MatMul')
       self.assertEqual(len(ret['gpu:0']), 4, '%s' % run_meta)
 
       total_cpu_execs = 0
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 040a4891637109590acbc8a71c11e0d863a34c11..72422f11e91993e7d6e3d905788d54f9f782c892 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -20,6 +20,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+
 import six
 
 from google.protobuf import message
@@ -160,7 +162,7 @@ class Profiler(object):
     self._coverage = 0.0
     self._graph = graph
     # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
+    op_log = tfprof_logger.merge_default_with_oplog(
         self._graph, op_log=op_log)
     # pylint: enable=protected-access
 
@@ -180,7 +182,7 @@ class Profiler(object):
       run_meta: RunMetadata proto that contains statistics of a session run.
     """
     # pylint: disable=protected-access
-    op_log = tfprof_logger._merge_default_with_oplog(
+    op_log = tfprof_logger.merge_default_with_oplog(
         self._graph, run_meta=run_meta)
     # pylint: enable=protected-access
     # TODO(xpan): P1: Better to find the current graph.
@@ -206,8 +208,8 @@ class Profiler(object):
     try:
       tfprof_node.ParseFromString(
           print_mdl.Profile('code'.encode('utf-8'), opts.SerializeToString()))
-    except message.DecodeError as _:
-      pass
+    except message.DecodeError as e:
+      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
     return tfprof_node
 
   def profile_operations(self, options):
@@ -223,8 +225,8 @@ class Profiler(object):
     try:
       tfprof_node.ParseFromString(
           print_mdl.Profile('op'.encode('utf-8'), opts.SerializeToString()))
-    except message.DecodeError as _:
-      pass
+    except message.DecodeError as e:
+      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
     return tfprof_node
 
   def profile_name_scope(self, options):
@@ -240,8 +242,8 @@ class Profiler(object):
     try:
       tfprof_node.ParseFromString(
           print_mdl.Profile('scope'.encode('utf-8'), opts.SerializeToString()))
-    except message.DecodeError as _:
-      pass
+    except message.DecodeError as e:
+      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
     return tfprof_node
 
   def profile_graph(self, options):
@@ -257,8 +259,8 @@ class Profiler(object):
     try:
       tfprof_node.ParseFromString(
           print_mdl.Profile('graph'.encode('utf-8'), opts.SerializeToString()))
-    except message.DecodeError as _:
-      pass
+    except message.DecodeError as e:
+      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
     return tfprof_node
 
   def advise(self, options):
@@ -313,7 +315,7 @@ def profile(graph,
                .trainable_variables_parameter())
 
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
+  op_log = tfprof_logger.merge_default_with_oplog(
       graph, op_log, run_meta, add_trace=cmd == 'code')
   # pylint: enable=protected-access
 
@@ -331,9 +333,8 @@ def profile(graph,
         opts.SerializeToString())
     try:
       tfprof_node.ParseFromString(ret)
-    except message.DecodeError as _:
-      pass
-      # sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
+    except message.DecodeError as e:
+      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
 
   elif cmd == 'graph' or cmd == 'scope':
     tfprof_node = tfprof_output_pb2.GraphNodeProto()
@@ -345,9 +346,8 @@ def profile(graph,
         opts.SerializeToString())
     try:
       tfprof_node.ParseFromString(ret)
-    except message.DecodeError as _:
-      pass
-      # sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
+    except message.DecodeError as e:
+      sys.stderr.write('Cannot parse returned proto: %s.\n' % e)
   else:
     raise errors.InvalidArgumentError(
         None, None, 'unknown cmd: %s\n' % cmd)
@@ -374,7 +374,7 @@ def advise(graph, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
     options = ALL_ADVICE.copy()
 
   # pylint: disable=protected-access
-  op_log = tfprof_logger._merge_default_with_oplog(
+  op_log = tfprof_logger.merge_default_with_oplog(
       graph, None, run_meta, add_trace=True)
   # pylint: enable=protected-access
 
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 17c87bea92dedf3f04e2f4e151e45610d27e34ef..5d524c8c74bc013209b95917fc2bcf60bb1f436d 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -23,11 +23,19 @@ import os
 import random
 import re
 
+import numpy as np
+
 from tensorflow.core.profiler import profile_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -60,7 +68,7 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  ScalarW (1, 1/1 params)\n',
                          f.read())
 
-  def testSelectEverthingDetail(self):
+  def testSelectEverythingDetail(self):
     ops.reset_default_graph()
     dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -344,8 +352,8 @@ class PrintModelAnalysisTest(test.TestCase):
       with gfile.Open(outfile, 'r') as f:
         # pylint: disable=line-too-long
         self.assertEqual(
-            'nodename|requestedbytes|peakbytes|residualbytes|outputbytes|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes\nConst0B(0',
-            f.read().replace('\t', '').replace(' ', '')[0:180])
+            'nodename|requestedbytes|peakbytes|residualbytes|outputbytes|totalexecutiontime|acceleratorexecutiontime|cpuexecutiontime|#parameters|opoccurrence(run|defined)|inputshapes',
+            f.read().replace('\t', '').replace(' ', '')[0:170])
         # pylint: enable=line-too-long
 
       total_children = 0
@@ -635,6 +643,133 @@ class PrintModelAnalysisTest(test.TestCase):
       self._trainLoop(x, 10, time_dir, time_steps,
                       memory_dir, memory_steps, profile_dir, dump_steps)
 
+  def testOOM(self):
+    if not test.is_gpu_available():
+      return
+    ops.reset_default_graph()
+    with ops.device('/device:GPU:0'):
+      a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
+      b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
+      c = a * b
+
+    try:
+      with session.Session() as sess:
+        sess.run(c, options=config_pb2.RunOptions(
+            report_tensor_allocations_upon_oom=True))
+    except Exception as e:  # pylint: disable=broad-except
+      exception_str = '%s' % e
+      # This trace reports allocations for to random tensor.
+      self.assertTrue(
+          'OOM when allocating tensor with shape[30000,10000,20000]' in
+          exception_str)
+      mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
+                      exception_str)
+      self.assertGreater(float(mat.group(1)), 0.0)
+      mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
+                      exception_str)
+      self.assertGreater(float(mat.group(1)), 0.0)
+
+  def testDistributedOOM(self):
+    if not test.is_gpu_available():
+      return
+    ops.reset_default_graph()
+
+    workers, _ = test_util.create_local_cluster(2, 0)
+
+    with ops.device('/job:worker/replica:0/task:0/gpu:0'):
+      a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
+    with ops.device('/job:worker/replica:0/task:1/gpu:0'):
+      b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
+      c = a * b
+
+    try:
+      with session.Session(workers[1].target) as sess:
+        sess.run(c, options=config_pb2.RunOptions(
+            report_tensor_allocations_upon_oom=True))
+    except Exception as e:  # pylint: disable=broad-except
+      exception_str = '%s' % e
+      # test_random2 is reported because it's allocated in worker 1.
+      self.assertTrue('Current usage from device: '
+                      '/job:worker/replica:0/task:1/device:GPU:0, '
+                      'allocator: GPU_0_bfc' in exception_str)
+      mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
+                      exception_str)
+      self.assertGreater(float(mat.group(1)), 0.0)
+      # test_random1 is not reported because it's allocated in worker 0.
+      mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
+                      exception_str)
+      self.assertTrue(mat is None)
+
+  def testTrackPersistentBytes(self):
+    ops.reset_default_graph()
+    a = array_ops.constant(np.ones((100, 100)))
+    b = array_ops.constant(np.ones((100, 100)))
+    c = a * b
+
+    with session.Session() as sess:
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(c, options=run_options, run_metadata=run_metadata)
+
+      options = option_builder.ProfileOptionBuilder.time_and_memory()
+      options['min_bytes'] = 0
+      options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
+                           'residual_bytes')
+      ret = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(c, options=run_options, run_metadata=run_metadata)
+      ret2 = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+
+      n = lib.SearchTFProfNode(ret, 'mul')
+      n2 = lib.SearchTFProfNode(ret2, 'mul')
+      self.assertGreater(n.peak_bytes, 0)
+      self.assertGreater(n.output_bytes, 0)
+      self.assertGreater(n.residual_bytes, 0)
+      self.assertEqual(n.peak_bytes, n2.peak_bytes)
+      self.assertEqual(n.output_bytes, n2.output_bytes)
+      self.assertEqual(n.residual_bytes, n2.residual_bytes)
+
+  def testTraceLoopBytes(self):
+    if not test.is_gpu_available(): return
+    ops.reset_default_graph()
+    steps = 100
+
+    with ops.device('/gpu:0'):
+      x = array_ops.ones((100, 100), dtype=dtypes.float32)
+      n = array_ops.constant(steps, dtype=dtypes.int32)
+      x1 = array_ops.ones((100, 100))
+
+      x *= x1
+      def loop_body(i, x):
+        x *= x
+        return i + 1, x
+
+      _, y = control_flow_ops.while_loop(
+          lambda i, x: i < n, loop_body,
+          [array_ops.constant(0), x])
+
+    grad = gradients.gradients(y, [x1])
+
+    with session.Session() as sess:
+      run_options = config_pb2.RunOptions(
+          trace_level=config_pb2.RunOptions.FULL_TRACE)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(grad, options=run_options, run_metadata=run_metadata)
+
+      options = option_builder.ProfileOptionBuilder.time_and_memory()
+      options['min_bytes'] = 0
+      options['min_micros'] = 0
+      options['select'] = ('bytes', 'peak_bytes', 'output_bytes',
+                           'residual_bytes')
+      options['output'] = 'none'
+      ret_pb = model_analyzer.profile(
+          sess.graph, run_meta=run_metadata, cmd='scope', options=options)
+      self.assertGreater(ret_pb.total_requested_bytes, 1000000)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index 838064a1f0836a2041c2823f54fea4e6b5606d7f..15c273794da8ab0ff46e6455e502792c5b19729f 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -139,8 +139,8 @@ def _get_logged_ops(graph, run_meta=None, add_trace=True,
   return logged_ops, string_to_id
 
 
-def _merge_default_with_oplog(graph, op_log=None, run_meta=None,
-                              add_trace=True, add_trainable_var=True):
+def merge_default_with_oplog(graph, op_log=None, run_meta=None,
+                             add_trace=True, add_trainable_var=True):
   """Merge the tfprof default extra info with caller's op_log.
 
   Args:
@@ -199,7 +199,7 @@ def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
     add_trace: Whether to add python code trace information.
         Used to support "code" view.
   """
-  op_log = _merge_default_with_oplog(graph, op_log, run_meta, add_trace)
+  op_log = merge_default_with_oplog(graph, op_log, run_meta, add_trace)
 
   with gfile.Open(os.path.join(log_dir, 'tfprof_log'), 'w') as log:
     log.write(op_log.SerializeToString())
diff --git a/tensorflow/python/pywrap_dlopen_global_flags.py b/tensorflow/python/pywrap_dlopen_global_flags.py
index 509fc2170c3920b5129be4733cf0a9c04220ca7e..411334f480e5c0fd7a76f4eeb671779d94bd70a1 100644
--- a/tensorflow/python/pywrap_dlopen_global_flags.py
+++ b/tensorflow/python/pywrap_dlopen_global_flags.py
@@ -28,13 +28,12 @@ from __future__ import print_function
 import ctypes
 import sys
 
-# On UNIX-based platforms, pywrap_tensorflow is a SWIG-generated
-# python library that dynamically loads _pywrap_tensorflow.so. The
-# default mode for loading keeps all the symbol private and not
-# visible to other libraries that may be loaded. Setting the mode to
-# RTLD_GLOBAL to make the symbols visible, so that custom op libraries
-# imported using `tf.load_op_library()` can access symbols defined in
-# _pywrap_tensorflow.so.
+# On UNIX-based platforms, pywrap_tensorflow is a SWIG-generated python library
+# that dynamically loads _pywrap_tensorflow.so. The default mode for loading
+# keeps all the symbol private and not visible to other libraries that may be
+# loaded. Setting the mode to RTLD_GLOBAL to make the symbols visible, so that
+# custom op libraries imported using `tf.load_op_library()` can access symbols
+# defined in _pywrap_tensorflow.so.
 _use_rtld_global = (hasattr(sys, 'getdlopenflags')
                     and hasattr(sys, 'setdlopenflags'))
 if _use_rtld_global:
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index fa36b77311e277e1b17b4ee70da3bcf98b65bd1e..82750e9e491dbe9b742531c431aa499621082776 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -18,22 +18,29 @@ limitations under the License.
 %rename("%s") TFE_NewContext;
 %rename("%s") TFE_DeleteContext;
 %rename("%s") TFE_ContextListDevices;
+%rename("%s") TFE_ContextAddFunction;
 %rename("%s") TFE_ContextAddFunctionDef;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_RegisterExceptionClass;
 %rename("%s") TFE_Py_Execute;
 %rename("%s") TFE_Py_UID;
-%rename("%s") TFE_Py_NewTape;
-%rename("%s") TFE_Py_TapeShouldRecord;
-%rename("%s") TFE_Py_TapeWatch;
-%rename("%s") TFE_Py_TapeDeleteTrace;
-%rename("%s") TFE_Py_TapeRecordOperation;
-%rename("%s") TFE_Py_TapeExport;
+%rename("%s") TFE_Py_TapeStackPushNew;
+%rename("%s") TFE_Py_TapeStackPush;
+%rename("%s") TFE_Py_TapeStackPop;
+%rename("%s") TFE_Py_TapeStackIsEmpty;
+%rename("%s") TFE_Py_TapeStackShouldRecord;
+%rename("%s") TFE_Py_TapeStackWatch;
+%rename("%s") TFE_Py_TapeStackDeleteTrace;
+%rename("%s") TFE_Py_TapeStackRecordOperation;
+%rename("%s") TFE_Py_TapeStackWatchVariable;
+%rename("%s") TFE_Py_TapeGradient;
+%rename("%s") TFE_Py_TapeWatchedVariables;
 %rename("%s") TFE_NewContextOptions;
 %rename("%s") TFE_ContextOptionsSetConfig;
 %rename("%s") TFE_ContextOptionsSetDevicePlacementPolicy;
 %rename("%s") TFE_DeleteContextOptions;
+%rename("%s") TFE_Py_TensorShapeSlice;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
@@ -124,7 +131,7 @@ limitations under the License.
         SWIG_fail;
       }
       if (EagerTensor_CheckExact(elem)) {
-        (*$1)[i] = EagerTensorHandle(elem);
+        (*$1)[i] = EagerTensor_Handle(elem);
       } else {
         SWIG_exception_fail(SWIG_TypeError,
                             "provided list of inputs contains objects other "
@@ -143,7 +150,7 @@ limitations under the License.
   }
   $1 = &temp;
   $1->resize(PyInt_AsLong($input), nullptr);
-}
+} 
 
 // Create new Status object.
 %typemap(in, numinputs=0) TF_Status *out_status {
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index c6d2c3229330c64d5e788c45574dbdbd8b6616ca..92ca7dec6f63b50b33dde9909b4738676fb8c783 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -214,6 +214,13 @@ class SavedModelTest(test.TestCase):
       self._init_and_validate_variable(sess, "v", 45)
       builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
 
+    # Graph that updates the single variable. SavedModel invoked to:
+    # - simply add the model (weights are not updated).
+    # - multiple tags (from predefined constants for serving on TPU).
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
     # Graph that updates the single variable. SavedModel is invoked:
     # - to add the model (weights are not updated).
     # - multiple custom tags.
@@ -244,6 +251,13 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+    # Restore the graph with multiple predefined tags (for serving on TPU)
+    # whose variables were not saved.
+    with self.test_session(graph=ops.Graph()) as sess:
+      loader.load(sess, [tag_constants.SERVING, tag_constants.TPU], export_dir)
+      self.assertEqual(
+          42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
+
     # Restore the graph with multiple tags. Provide duplicate tags to test set
     # semantics.
     with self.test_session(graph=ops.Graph()) as sess:
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index 52868bdf99b4734a99d7b9dac301f00783402d77..e2facafda51919d3f1e0ccbe646db522ed0bc49b 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -31,9 +31,13 @@ TRAINING = "train"
 # Tag for the `gpu` graph.
 GPU = "gpu"
 
+# Tag for the `tpu` graph.
+TPU = "tpu"
+
 _allowed_symbols = [
     "SERVING",
     "TRAINING",
-    "GPU"
+    "GPU",
+    "TPU"
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 301f560d41378b0ec29537cd82e3e3b333f59674..6969c4cf1500bf4b1fda900336158e5af4395ea6 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -13,301 +13,14 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Reads Summaries from and writes Summaries to event files."""
+"""Provides a method for reading events from an event file via an iterator."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os.path
-import threading
-import time
-
-import six
-
-from tensorflow.core.framework import graph_pb2
-from tensorflow.core.framework import summary_pb2
 from tensorflow.core.util import event_pb2
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import ops
 from tensorflow.python.lib.io import tf_record
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-
-
-class SummaryWriter(object):
-  """Writes `Summary` protocol buffers to event files.
-
-  The `SummaryWriter` class provides a mechanism to create an event file in a
-  given directory and add summaries and events to it. The class updates the
-  file contents asynchronously. This allows a training program to call methods
-  to add data to the file directly from the training loop, without slowing down
-  training.
-  """
-
-  def __init__(self, logdir, graph=None, max_queue=10, flush_secs=120,
-               graph_def=None):
-    """Creates a `SummaryWriter` and an event file.
-
-    On construction the summary writer creates a new event file in `logdir`.
-    This event file will contain `Event` protocol buffers constructed when you
-    call one of the following functions: `add_summary()`, `add_session_log()`,
-    `add_event()`, or `add_graph()`.
-
-    If you pass a `Graph` to the constructor it is added to
-    the event file. (This is equivalent to calling `add_graph()` later).
-
-    TensorBoard will pick the graph from the file and display it graphically so
-    you can interactively explore the graph you built. You will usually pass
-    the graph from the session in which you launched it:
-
-    ```python
-    ...create a graph...
-    # Launch the graph in a session.
-    sess = tf.Session()
-    # Create a summary writer, add the 'graph' to the event file.
-    writer = tf.summary.FileWriter(<some-directory>, sess.graph)
-    ```
-
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
-
-    Args:
-      logdir: A string. Directory where event file will be written.
-      graph: A `Graph` object, such as `sess.graph`.
-      max_queue: Integer. Size of the queue for pending events and summaries.
-      flush_secs: Number. How often, in seconds, to flush the
-        pending events and summaries to disk.
-      graph_def: DEPRECATED: Use the `graph` argument instead.
-    """
-    self._logdir = logdir
-    if not gfile.IsDirectory(self._logdir):
-      gfile.MakeDirs(self._logdir)
-    self._event_queue = six.moves.queue.Queue(max_queue)
-    self._ev_writer = pywrap_tensorflow.EventsWriter(
-        compat.as_bytes(os.path.join(self._logdir, "events")))
-    self._closed = False
-    self._worker = _EventLoggerThread(self._event_queue, self._ev_writer,
-                                      flush_secs)
-    # For storing used tags for session.run() outputs.
-    self._session_run_tags = {}
-    self._worker.start()
-    if graph is not None or graph_def is not None:
-      # Calling it with both graph and graph_def for backward compatibility.
-      self.add_graph(graph=graph, graph_def=graph_def)
-
-  def get_logdir(self):
-    """Returns the directory where event file will be written."""
-    return self._logdir
-
-  def reopen(self):
-    """Reopens the summary writer.
-
-    Can be called after `close()` to add more events in the same directory.
-    The events will go into a new events file.
-
-    Does nothing if the summary writer was not closed.
-    """
-    if self._closed:
-      self._closed = False
-
-  def add_summary(self, summary, global_step=None):
-    """Adds a `Summary` protocol buffer to the event file.
-
-    This method wraps the provided summary in an `Event` protocol buffer
-    and adds it to the event file.
-
-    You can pass the result of evaluating any summary op, using
-    @{tf.Session.run} or
-    @{tf.Tensor.eval}, to this
-    function. Alternatively, you can pass a `tf.Summary` protocol
-    buffer that you populate with your own data. The latter is
-    commonly done to report evaluation results in event files.
-
-    Args:
-      summary: A `Summary` protocol buffer, optionally serialized as a string.
-      global_step: Number. Optional global step value to record with the
-        summary.
-    """
-    if isinstance(summary, bytes):
-      summ = summary_pb2.Summary()
-      summ.ParseFromString(summary)
-      summary = summ
-    event = event_pb2.Event(wall_time=time.time(), summary=summary)
-    if global_step is not None:
-      event.step = int(global_step)
-    self.add_event(event)
-
-  def add_session_log(self, session_log, global_step=None):
-    """Adds a `SessionLog` protocol buffer to the event file.
-
-    This method wraps the provided session in an `Event` protocol buffer
-    and adds it to the event file.
-
-    Args:
-      session_log: A `SessionLog` protocol buffer.
-      global_step: Number. Optional global step value to record with the
-        summary.
-    """
-    event = event_pb2.Event(wall_time=time.time(), session_log=session_log)
-    if global_step is not None:
-      event.step = int(global_step)
-    self.add_event(event)
-
-  def add_event(self, event):
-    """Adds an event to the event file.
-
-    Args:
-      event: An `Event` protocol buffer.
-    """
-    if not self._closed:
-      self._event_queue.put(event)
-
-  def _add_graph_def(self, graph_def, global_step=None):
-    graph_bytes = graph_def.SerializeToString()
-    event = event_pb2.Event(wall_time=time.time(), graph_def=graph_bytes)
-    if global_step is not None:
-      event.step = int(global_step)
-    self._event_queue.put(event)
-
-  def add_graph(self, graph, global_step=None, graph_def=None):
-    """Adds a `Graph` to the event file.
-
-    The graph described by the protocol buffer will be displayed by
-    TensorBoard. Most users pass a graph in the constructor instead.
-
-    Args:
-      graph: A `Graph` object, such as `sess.graph`.
-      global_step: Number. Optional global step counter to record with the
-        graph.
-      graph_def: DEPRECATED. Use the `graph` parameter instead.
-
-    Raises:
-      ValueError: If both graph and graph_def are passed to the method.
-    """
-
-    if graph is not None and graph_def is not None:
-      raise ValueError("Please pass only graph, or graph_def (deprecated), "
-                       "but not both.")
-
-    if isinstance(graph, ops.Graph) or isinstance(graph_def, ops.Graph):
-      # The user passed a `Graph`.
-
-      # Check if the user passed it via the graph or the graph_def argument and
-      # correct for that.
-      if not isinstance(graph, ops.Graph):
-        logging.warning("When passing a `Graph` object, please use the `graph`"
-                        " named argument instead of `graph_def`.")
-        graph = graph_def
-
-      # Serialize the graph with additional info.
-      true_graph_def = graph.as_graph_def(add_shapes=True)
-    elif (isinstance(graph, graph_pb2.GraphDef)
-          or isinstance(graph_def, graph_pb2.GraphDef)):
-      # The user passed a `GraphDef`.
-      logging.warning("Passing a `GraphDef` to the SummaryWriter is deprecated."
-                      " Pass a `Graph` object instead, such as `sess.graph`.")
-
-      # Check if the user passed it via the graph or the graph_def argument and
-      # correct for that.
-      if isinstance(graph, graph_pb2.GraphDef):
-        true_graph_def = graph
-      else:
-        true_graph_def = graph_def
-
-    else:
-      # The user passed neither `Graph`, nor `GraphDef`.
-      raise TypeError("The passed graph must be an instance of `Graph` "
-                      "or the deprecated `GraphDef`")
-    # Finally, add the graph_def to the summary writer.
-    self._add_graph_def(true_graph_def, global_step)
-
-  def add_run_metadata(self, run_metadata, tag, global_step=None):
-    """Adds a metadata information for a single session.run() call.
-
-    Args:
-      run_metadata: A `RunMetadata` protobuf object.
-      tag: The tag name for this metadata.
-      global_step: Number. Optional global step counter to record with the
-        StepStats.
-
-    Raises:
-      ValueError: If the provided tag was already used for this type of event.
-    """
-    if tag in self._session_run_tags:
-      raise ValueError("The provided tag was already used for this event type")
-    self._session_run_tags[tag] = True
-
-    tagged_metadata = event_pb2.TaggedRunMetadata()
-    tagged_metadata.tag = tag
-    # Store the `RunMetadata` object as bytes in order to have postponed
-    # (lazy) deserialization when used later.
-    tagged_metadata.run_metadata = run_metadata.SerializeToString()
-    event = event_pb2.Event(wall_time=time.time(),
-                            tagged_run_metadata=tagged_metadata)
-    if global_step is not None:
-      event.step = int(global_step)
-    self._event_queue.put(event)
-
-  def flush(self):
-    """Flushes the event file to disk.
-
-    Call this method to make sure that all pending events have been written to
-    disk.
-    """
-    self._event_queue.join()
-    self._ev_writer.Flush()
-
-  def close(self):
-    """Flushes the event file to disk and close the file.
-
-    Call this method when you do not need the summary writer anymore.
-    """
-    self.flush()
-    self._ev_writer.Close()
-    self._closed = True
-
-
-class _EventLoggerThread(threading.Thread):
-  """Thread that logs events."""
-
-  def __init__(self, queue, ev_writer, flush_secs):
-    """Creates an _EventLoggerThread.
-
-    Args:
-      queue: A Queue from which to dequeue events.
-      ev_writer: An event writer. Used to log brain events for
-       the visualizer.
-      flush_secs: How often, in seconds, to flush the
-        pending file to disk.
-    """
-    threading.Thread.__init__(self)
-    self.daemon = True
-    self._queue = queue
-    self._ev_writer = ev_writer
-    self._flush_secs = flush_secs
-    # The first event will be flushed immediately.
-    self._next_event_flush_time = 0
-
-  def run(self):
-    while True:
-      event = self._queue.get()
-      try:
-        self._ev_writer.WriteEvent(event)
-        # Flush the event writer every so often.
-        now = time.time()
-        if now > self._next_event_flush_time:
-          self._ev_writer.Flush()
-          # Do it again in two minutes.
-          self._next_event_flush_time = now + self._flush_secs
-      finally:
-        self._queue.task_done()
 
 
 def summary_iterator(path):
@@ -352,37 +65,3 @@ def summary_iterator(path):
   # pylint: enable=line-too-long
   for r in tf_record.tf_record_iterator(path):
     yield event_pb2.Event.FromString(r)
-
-
-class SummaryWriterCache(object):
-  """Cache for summary writers.
-
-  This class caches summary writers, one per directory.
-  """
-  # Cache, keyed by directory.
-  _cache = {}
-
-  # Lock protecting _SUMMARY_WRITERS.
-  _lock = threading.RLock()
-
-  @staticmethod
-  def clear():
-    """Clear cached summary writers. Currently only used for unit tests."""
-    with SummaryWriterCache._lock:
-      SummaryWriterCache._cache = {}
-
-  @staticmethod
-  def get(logdir):
-    """Returns the SummaryWriter for the specified directory.
-
-    Args:
-      logdir: str, name of the directory.
-
-    Returns:
-      A `SummaryWriter`.
-    """
-    with SummaryWriterCache._lock:
-      if logdir not in SummaryWriterCache._cache:
-        SummaryWriterCache._cache[logdir] = SummaryWriter(
-            logdir, graph=ops.get_default_graph())
-      return SummaryWriterCache._cache[logdir]
diff --git a/tensorflow/python/summary/text_summary.py b/tensorflow/python/summary/text_summary.py
index 4031355b03d3831453f52848bd092c8f45e1ef69..94a85d73e2f77388f9a29b1c135fc6046a8362d0 100644
--- a/tensorflow/python/summary/text_summary.py
+++ b/tensorflow/python/summary/text_summary.py
@@ -14,21 +14,18 @@
 # ==============================================================================
 """Implements text_summary in TensorFlow, with TensorBoard support.
 
-The text_summary is basically a wrapper around the generic tensor_summary,
-and it uses a TextSummaryPluginAsset class to record which tensor_summaries
-are readable by the TensorBoard text plugin.
+The text_summary is a wrapper around the generic tensor_summary that takes a
+string-type tensor and emits a TensorSummary op with SummaryMetadata that
+notes that this summary is textual data for the TensorBoard text plugin.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops.summary_ops import tensor_summary
-from tensorflow.python.summary import plugin_asset
 
 PLUGIN_NAME = "text"
 
@@ -72,19 +69,3 @@ def text_summary(name, tensor, collections=None):
       summary_metadata=summary_metadata,
       collections=collections)
   return t_summary
-
-
-class TextSummaryPluginAsset(plugin_asset.PluginAsset):
-  """Provides a registry of text summaries for the TensorBoard text plugin."""
-  plugin_name = "tensorboard_text"
-
-  def __init__(self):
-    self._tensor_names = []
-
-  def register_tensor(self, name):
-    """Register a new Tensor Summary name as containing textual data."""
-    self._tensor_names.append(name)
-
-  def assets(self):
-    """Store the tensors registry in a file called tensors.json."""
-    return {"tensors.json": json.dumps(self._tensor_names)}
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index d221dd523b2835d51e61487c22caee961ec28e5f..344702097f658db14ae0923e1bdee3843a72645f 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -33,6 +33,8 @@ limitations under the License.
 %include "tensorflow/python/client/tf_session.i"
 %include "tensorflow/python/client/device_lib.i"
 
+%include "tensorflow/python/lib/core/bfloat16.i"
+
 %include "tensorflow/python/lib/io/file_io.i"
 %include "tensorflow/python/training/quantize_training.i"
 %include "tensorflow/python/training/server_lib.i"
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 50bffd1474cca0a5f0fbd3f3a19a62440ad574d1..69586c6a47762701344aafe449e96868875f8926 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -151,7 +151,6 @@ py_library(
     srcs = ["optimize_for_inference_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":strip_unused",
         ":strip_unused_lib",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
old mode 100644
new mode 100755
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 47a74e5abfb45e9bfd87b72d1511ae2e7c2f7d6c..8716058e619d8e970834ec4d57e4d8ff21559d5c 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -29,7 +29,8 @@ from tensorflow.python.platform import flags
 FLAGS = None
 
 
-def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
+def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
+                                     all_tensor_names):
   """Prints tensors in a checkpoint file.
 
   If no `tensor_name` is provided, prints the tensor names and shapes
@@ -41,14 +42,16 @@ def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors):
     file_name: Name of the checkpoint file.
     tensor_name: Name of the tensor in the checkpoint file to print.
     all_tensors: Boolean indicating whether to print all tensors.
+    all_tensor_names: Boolean indicating whether to print all tensor names.
   """
   try:
     reader = pywrap_tensorflow.NewCheckpointReader(file_name)
-    if all_tensors:
+    if all_tensors or all_tensor_names:
       var_to_shape_map = reader.get_variable_to_shape_map()
       for key in sorted(var_to_shape_map):
         print("tensor_name: ", key)
-        print(reader.get_tensor(key))
+        if all_tensors:
+          print(reader.get_tensor(key))
     elif not tensor_name:
       print(reader.debug_string().decode("utf-8"))
     else:
@@ -104,11 +107,14 @@ def parse_numpy_printoption(kv_str):
 def main(unused_argv):
   if not FLAGS.file_name:
     print("Usage: inspect_checkpoint --file_name=checkpoint_file_name "
-          "[--tensor_name=tensor_to_print]")
+          "[--tensor_name=tensor_to_print] "
+          "[--all_tensors] "
+          "[--all_tensor_names] "
+          "[--printoptions]")
     sys.exit(1)
   else:
     print_tensors_in_checkpoint_file(FLAGS.file_name, FLAGS.tensor_name,
-                                     FLAGS.all_tensors)
+                                     FLAGS.all_tensors, FLAGS.all_tensor_names)
 
 
 if __name__ == "__main__":
@@ -130,6 +136,13 @@ if __name__ == "__main__":
       type="bool",
       default=False,
       help="If True, print the values of all the tensors.")
+  parser.add_argument(
+      "--all_tensor_names",
+      nargs="?",
+      const=True,
+      type="bool",
+      default=False,
+      help="If True, print the names of all the tensors.")
   parser.add_argument(
       "--printoptions",
       nargs="*",
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 447057cfe9fc3d7aa7bd78739ba8f1caee1ec757..6dd24c0dca1d326592e4f33eba4e6233248dac5f 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -272,7 +272,7 @@ class OptimizeForInferenceTest(test.TestCase):
     for node in optimized_graph_def.node:
       self.assertNotEqual("Conv2D", node.op)
       self.assertNotEqual("MirrorPad", node.op)
-      
+
 
   def testFusePadAndConv(self):
     with self.test_session() as sess:
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index fe3333bac4893c87746122c55fbfdb458a709c6e..50f435236b41fcda7ab5ea37a4e96b72dd1043e7 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -61,6 +61,13 @@ class AdadeltaOptimizerTest(test.TestCase):
             adadelta_update = adadelta_opt.apply_gradients(
                 zip([grads, grads], [var0, var1]))
 
+            opt_vars = adadelta_opt.variables()
+            self.assertStartsWith(opt_vars[0].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[1].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[2].name, var1._shared_name)
+            self.assertStartsWith(opt_vars[3].name, var1._shared_name)
+            self.assertEqual(4, len(opt_vars))
+
             variables.global_variables_initializer().run()
 
             # Assign slots
@@ -105,17 +112,16 @@ class AdadeltaOptimizerTest(test.TestCase):
               # Check that the accumulators have been updated
               for slot_idx in range(2):
                 self.assertAllCloseAccordingToType(
-                    np.array(
-                        [accum, accum], dtype=dtype.as_numpy_dtype()),
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
                     slot[slot_idx].eval(),
-                    rtol=1e-3)
+                    rtol=1e-5)
 
                 self.assertAllCloseAccordingToType(
                     np.array(
                         [accum_update, accum_update],
                         dtype=dtype.as_numpy_dtype()),
                     slot_update[slot_idx].eval(),
-                    rtol=1e-3)
+                    rtol=1e-5)
 
               # Check that the parameters have been updated
               self.assertAllCloseAccordingToType(
@@ -123,14 +129,14 @@ class AdadeltaOptimizerTest(test.TestCase):
                       [var0_init[0] - tot_update, var0_init[1] - tot_update],
                       dtype=dtype.as_numpy_dtype()),
                   var0.eval(),
-                  rtol=1e-3)
+                  rtol=1e-5)
 
               self.assertAllCloseAccordingToType(
                   np.array(
                       [var1_init[0] - tot_update, var1_init[1] - tot_update],
                       dtype=dtype.as_numpy_dtype()),
                   var1.eval(),
-                  rtol=1e-3)
+                  rtol=1e-5)
 
   def testBasic(self):
     self.doTestBasic(use_resource=False)
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index cdc532a38e8e683b18619b0f1f795f3cb0d748f3..266f5563e0c738fe73e3a771a46e9b28c266cd73 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -112,6 +112,9 @@ class AdamOptimizer(optimizer.Optimizer):
   def _get_beta_accumulators(self):
     return self._beta1_power, self._beta2_power
 
+  def _non_slot_variables(self):
+    return self._get_beta_accumulators()
+
   def _create_slots(self, var_list):
     # Create the beta1 and beta2 accumulators on the same device as the first
     # variable. Sort the var_list to make sure this device is consistent across
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 176d20bd60d3d042a5b7dc02387e3487b372b4a1..ffb66abc4c1a38353d602a711cab86b0d63b9e96 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -173,6 +173,13 @@ class AdamOptimizerTest(test.TestCase):
 
         opt = adam.AdamOptimizer()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        self.assertIn(opt._beta1_power, opt_variables)
+        self.assertIn(opt._beta2_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
 
         if context.in_graph_mode():
           self.evaluate(variables.global_variables_initializer())
@@ -200,6 +207,9 @@ class AdamOptimizerTest(test.TestCase):
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
 
   def testBasic(self):
     with self.test_session():
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 1fb00343ef23d6b6dc9ca41f4868f0a7d80feb7c..b499cdf7f8a296a01f54da1c81ee3d39a8227e5f 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -514,6 +514,8 @@ class StepCounterHook(session_run_hook.SessionRunHook):
 
     self._summary_writer = summary_writer
     self._output_dir = output_dir
+    self._last_global_step = None
+    self._global_step_check_count = 0
 
   def begin(self):
     if self._summary_writer is None and self._output_dir:
@@ -545,6 +547,30 @@ class StepCounterHook(session_run_hook.SessionRunHook):
             self._summary_writer.add_summary(summary, global_step)
           logging.info("%s: %g", self._summary_tag, steps_per_sec)
 
+    # Check whether the global step has been increased. Here, we do not use the
+    # timer.last_triggered_step as the timer might record a different global
+    # step value such that the comparison could be unreliable. For simplicity,
+    # we just compare the stale_global_step with previously recorded version.
+    if stale_global_step == self._last_global_step:
+      # Here, we use a counter to count how many times we have observed that the
+      # global step has not been increased. For some Optimizers, the global step
+      # is not increased each time by design. For example, SyncReplicaOptimizer
+      # doesn't increase the global step in worker's main train step.
+      self._global_step_check_count += 1
+      if self._global_step_check_count % 20 == 0:
+        self._global_step_check_count = 0
+        logging.warning(
+            "It seems that global step (tf.train.get_global_step) has not "
+            "been increased. Current value (could be stable): %s vs previous "
+            "value: %s. You could increase the global step by passing "
+            "tf.train.get_global_step() to Optimizer.apply_gradients or "
+            "Optimizer.minimize.", stale_global_step, self._last_global_step)
+    else:
+      # Whenever we observe the increment, reset the counter.
+      self._global_step_check_count = 0
+
+    self._last_global_step = stale_global_step
+
 
 class NanLossDuringTrainingError(RuntimeError):
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index e7ff7e12211ae57a8589c799efbf9eab3b3fe5da..2547661e5250e94136a100aa8c30c9dbb7455018 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -780,9 +780,12 @@ class StepCounterHookTest(test.TestCase):
       hook.begin()
       sess.run(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      for _ in range(30):
-        time.sleep(0.01)
-        mon_sess.run(train_op)
+      with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+        for _ in range(30):
+          time.sleep(0.01)
+          mon_sess.run(train_op)
+        # logging.warning should not be called.
+        self.assertIsNone(mock_log.call_args)
       hook.end(sess)
       summary_writer.assert_summaries(
           test_case=self,
@@ -857,6 +860,24 @@ class StepCounterHookTest(test.TestCase):
       summary_value = summary_writer.summaries[2][0].value[0]
       self.assertEqual('bar/foo/sec', summary_value.tag)
 
+  def test_log_warning_if_global_step_not_increased(self):
+    with ops.Graph().as_default(), session_lib.Session() as sess:
+      variables.get_or_create_global_step()
+      train_op = training_util._increment_global_step(0)  # keep same.
+      sess.run(variables_lib.global_variables_initializer())
+      hook = basic_session_run_hooks.StepCounterHook(
+          every_n_steps=1, every_n_secs=None)
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])
+      mon_sess.run(train_op)  # Run one step to record global step.
+      with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+        for _ in range(30):
+          mon_sess.run(train_op)
+        self.assertRegexpMatches(
+            str(mock_log.call_args),
+            'global step.*has not been increased')
+      hook.end(sess)
+
 
 class SummarySaverHookTest(test.TestCase):
 
diff --git a/tensorflow/python/training/checkpoint_ops.py b/tensorflow/python/training/checkpoint_ops.py
index 70460ceb4802f3f30eaab4b3ae10a6e59589d83d..7f92d94d2be369709608d36c109863b0ebfb7bbe 100644
--- a/tensorflow/python/training/checkpoint_ops.py
+++ b/tensorflow/python/training/checkpoint_ops.py
@@ -36,6 +36,7 @@ def _load_and_remap_matrix(ckpt_path,
                            num_rows_to_load,
                            new_col_vocab_size,
                            initializer,
+                           old_row_vocab_size=-1,
                            old_row_vocab_file=None,
                            new_row_vocab_file=None,
                            old_col_vocab_file=None,
@@ -75,6 +76,12 @@ def _load_and_remap_matrix(ckpt_path,
     initializer: Callable initializer function that accepts a 1-D tensor as the
       arg to specify the shape of the returned tensor. Used to initialize
       missing values.
+    old_row_vocab_size: The number of entries to consider in the old vocabulary.
+      With the default value of -1, the entire old row vocabulary file will be
+      used.  Otherwise, only the first `old_row_vocab_size` entries will be
+      considered for remapping.Must be smaller than the length of
+      `old_row_vocab_file`.  NOTE: we do not provide an equivalent
+      `old_col_vocab_size` for classes.
     old_row_vocab_file: A scalar `Tensor` of type `string` containing the
       path to the old row vocabulary file. Can be None, which represents no
       remapping on the row axis.
@@ -146,7 +153,8 @@ def _load_and_remap_matrix(ckpt_path,
             new_vocab_file=new_row_vocab_file,
             old_vocab_file=old_row_vocab_file,
             new_vocab_offset=new_row_vocab_offset,
-            num_new_vocab=num_rows_to_load))
+            num_new_vocab=num_rows_to_load,
+            old_vocab_size=old_row_vocab_size))
   else:
     # Even when the rows are not being reordered, we still need to generate a
     # remapping to account for initializing partitioned Variables (when
@@ -199,6 +207,7 @@ def _load_and_remap_matrix_initializer(ckpt_path,
                                        old_tensor_name,
                                        new_row_vocab_size,
                                        new_col_vocab_size,
+                                       old_row_vocab_size=-1,
                                        old_row_vocab_file=None,
                                        new_row_vocab_file=None,
                                        old_col_vocab_file=None,
@@ -280,6 +289,12 @@ def _load_and_remap_matrix_initializer(ckpt_path,
       `new_col_vocab_file`. If no column remapping is needed (no column vocab
       provided), this should be equal to the number of columns in the old
       matrix.
+    old_row_vocab_size: The number of entries to consider in the old vocabulary.
+      With the default value of -1, the entire old row vocabulary file will be
+      used.  Otherwise, only the first `old_row_vocab_size` entries will be
+      considered for remapping.Must be smaller than the length of
+      `old_row_vocab_file`.  NOTE: we do not provide an equivalent
+      `old_col_vocab_size` for classes.
     old_row_vocab_file: A scalar `Tensor` of type `string` containing the
       path to the old row vocabulary file. Can be None, which represents no
       remapping on the row axis.
@@ -372,6 +387,15 @@ def _load_and_remap_matrix_initializer(ckpt_path,
                                  max(0, offset + shape[0] - new_row_vocab_size))
     num_rows_to_load = shape[0] - row_oov_buckets_to_use
 
+    # We may be operating on an OOV-only partition, in which case we newly
+    # initialize all rows of this partition.
+    if offset > new_row_vocab_size:
+      if shape[0] != row_oov_buckets_to_use:
+        raise ValueError(
+            "Partitioned variable offset is greater than new vocab size and "
+            "not operating on OOV-only partition.")
+      return initializer(shape)
+
     return _load_and_remap_matrix(
         ckpt_path=ckpt_path,
         old_tensor_name=old_tensor_name,
@@ -379,6 +403,7 @@ def _load_and_remap_matrix_initializer(ckpt_path,
         num_rows_to_load=num_rows_to_load,
         new_col_vocab_size=new_col_vocab_size,
         initializer=initializer,
+        old_row_vocab_size=old_row_vocab_size,
         old_row_vocab_file=old_row_vocab_file,
         new_row_vocab_file=new_row_vocab_file,
         old_col_vocab_file=old_col_vocab_file,
@@ -396,6 +421,7 @@ def _load_embedding_initializer(ckpt_path,
                                 embedding_dim,
                                 old_vocab_file,
                                 new_vocab_file,
+                                old_vocab_size=-1,
                                 num_oov_buckets=0,
                                 initializer=None,
                                 max_rows_in_memory=-1):
@@ -419,6 +445,11 @@ def _load_embedding_initializer(ckpt_path,
       path to the old vocabulary file.
     new_vocab_file: A scalar `Tensor` of type `string` containing the
       path to the new vocabulary file.
+    old_vocab_size: The number of entries to consider in the old vocabulary.
+      With the default value of -1, the entire old row vocabulary file will be
+      used.  Otherwise, only the first `old_vocab_size` entries will be
+      considered for remapping.Must be smaller than the length of
+      `old_row_vocab_file`.
     num_oov_buckets: `int` specifying the number of out-of-vocabulary
       buckets to use. Must be >= 0.
     initializer: Initializer function that accepts a 1-D tensor as the arg to
@@ -443,6 +474,7 @@ def _load_embedding_initializer(ckpt_path,
       old_tensor_name=embedding_tensor_name,
       new_row_vocab_size=new_vocab_size,
       new_col_vocab_size=embedding_dim,
+      old_row_vocab_size=old_vocab_size,
       old_row_vocab_file=old_vocab_file,
       new_row_vocab_file=new_vocab_file,
       old_col_vocab_file=None,
diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py
index 39c4d2911f2d279b8817e70fa23596ab195dbcd8..00611de862752dd7e69d867fcb50bb3e21f0ae1b 100644
--- a/tensorflow/python/training/checkpoint_ops_test.py
+++ b/tensorflow/python/training/checkpoint_ops_test.py
@@ -103,7 +103,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         num_col_oov_buckets=1)
 
     # [4 in vocab + 1 oov features, 4 in vocab + 1 oov classes].  The offset
-    # means we read
+    # means we read from the first line.
     expected_remapped_matrix = np.concatenate(
         [
             np.reshape([18, 34, 50, self.init_val, self.init_val], [5, 1]),
@@ -132,6 +132,9 @@ class LoadAndRemapWrappersTest(test.TestCase):
         num_col_oov_buckets=1,
         initializer=self.initializer))
 
+    # The new weight matrix is of size
+    # [5 feature vocab + 1 feature OOV, 4 class vocab + 1 class OOV].  Use a
+    # partitioned variable to confirm that the offset logic works.
     expected_remapped_matrix = np.concatenate(
         [
             np.reshape([2, 18, 34, 50, self.init_val, self.init_val], [6, 1]),
@@ -141,10 +144,6 @@ class LoadAndRemapWrappersTest(test.TestCase):
             np.reshape([self.init_val] * 6, [6, 1])
         ],
         axis=1)
-
-    # The new weight matrix is of size
-    # [5 feature vocab + 1 feature OOV, 4 class vocab + 1 class OOV].  Use a
-    # partitioned variable to confirm that the offset logic works.
     remapped_matrix = variable_scope.get_variable(
         name='linear/obtained_weight_matrix',
         shape=[6, 5],
@@ -168,6 +167,8 @@ class LoadAndRemapWrappersTest(test.TestCase):
         num_col_oov_buckets=1,
         initializer=self.initializer))
 
+    # The new weight matrix is of size
+    # [5-sized input layer, 4 class vocab + 1 class OOV].
     expected_remapped_matrix = np.concatenate(
         [
             np.reshape([2, 18, 34, 50, 66], [5, 1]),
@@ -177,9 +178,6 @@ class LoadAndRemapWrappersTest(test.TestCase):
             np.reshape([self.init_val] * 5, [5, 1])
         ],
         axis=1)
-
-    # The new weight matrix is of size
-    # [5-sized input layer, 4 class vocab + 1 class OOV].
     remapped_matrix = variable_scope.get_variable(
         name='dnn_output/obtained_weight_matrix',
         shape=[5, 5],
@@ -206,6 +204,9 @@ class LoadAndRemapWrappersTest(test.TestCase):
         num_col_oov_buckets=1,
         initializer=self.initializer))
 
+    # The new weight matrix is of size
+    # [5 feature vocab + 5 feature OOV, 4 class vocab + 1 class OOV].  The
+    # second partition has only OOV.
     expected_remapped_matrix = np.concatenate(
         [
             np.reshape([2, 18, 34, 50] + [self.init_val] * 6, [10, 1]),
@@ -215,10 +216,6 @@ class LoadAndRemapWrappersTest(test.TestCase):
             np.reshape([self.init_val] * 10, [10, 1]),
         ],
         axis=1)
-
-    # The new weight matrix is of size
-    # [5 feature vocab + 5 feature OOV, 4 class vocab + 1 class OOV].  The
-    # second partition has only OOV.
     remapped_matrix = variable_scope.get_variable(
         name='linear_all_oov/obtained_weight_matrix',
         shape=[10, 5],
@@ -244,6 +241,8 @@ class LoadAndRemapWrappersTest(test.TestCase):
         num_row_oov_buckets=1,
         num_col_oov_buckets=1))
 
+    # Same as test_initializer_with_oov_only_partition, but with zero
+    # initialization.
     expected_remapped_matrix = np.concatenate(
         [
             np.reshape([2, 18, 34, 50, 0, 0], [6, 1]),
@@ -253,7 +252,6 @@ class LoadAndRemapWrappersTest(test.TestCase):
             np.reshape([0] * 6, [6, 1])
         ],
         axis=1)
-
     remapped_matrix = variable_scope.get_variable(
         name='linear_init_fallback/obtained_weight_matrix',
         shape=[6, 5],
@@ -277,18 +275,101 @@ class LoadAndRemapWrappersTest(test.TestCase):
         num_oov_buckets=1,
         initializer=self.initializer))
 
+    # The new weight matrix is of size
+    # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the
+    # last vocab row (2nd last row) is newly initialized (wasn't found in
+    # previous vocab) and the actual last row is OOV and also newly initialized.
+    # Use a partitioned variable to confirm that the offset logic works.
     expected_remapped_embeddings = np.concatenate(
         [
             np.reshape(range(64), [4, 16]),
             np.reshape([self.init_val] * 32, [2, 16]),
         ],
         axis=0)
+    remapped_embeddings = variable_scope.get_variable(
+        name='embedding/obtained_embedding_matrix',
+        shape=[6, 16],
+        initializer=embedding_loading_initializer,
+        partitioner=partitioned_variables.fixed_size_partitioner(2))
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      self.assertAllClose(expected_remapped_embeddings,
+                          remapped_embeddings.as_tensor().eval())
+
+  def test_load_embedding_initializer_large_oov(self):
+    """Tests for the large OOV case for load_embedding_initializer wrapper."""
+    self.new_feature_vocab_file = os.path.join(
+        self.get_temp_dir(), 'new_feature_vocab.txt')
+    with open(self.new_feature_vocab_file, 'w') as f:
+      f.write('\n'.join(['one', 'zero', 'two', 'four']) + '\n')
+
+    # Checkpoint has 5 entries, 3 of which correspond to OOV.
+    self.old_feature_vocab_file = os.path.join(
+        self.get_temp_dir(), 'old_feature_vocab.txt')
+    with open(self.old_feature_vocab_file, 'w') as f:
+      f.write('\n'.join(['zero', 'one']) + '\n')
+
+    embedding_loading_initializer = (checkpoint_ops._load_embedding_initializer(
+        new_vocab_file=self.new_feature_vocab_file,
+        old_vocab_file=self.old_feature_vocab_file,
+        new_vocab_size=4,
+        embedding_dim=16,
+        embedding_tensor_name='some_scope/embeddings',
+        ckpt_path=[self.checkpoint_file],
+        num_oov_buckets=5,
+        initializer=self.initializer))
+
+    # The new weight matrix is of size
+    # [4 feature vocab + 5 feature OOV, 16 (embedding dimension)], where the
+    # 3rd and 4th rows are not found in the old vocabulary and therefore newly
+    # initialized.  The last five rows are OOV and also newly initialized.
+    # Use a partitioned variable to confirm that the offset logic works.
+    expected_remapped_embeddings = np.concatenate(
+        [
+            np.reshape(range(16, 32), [1, 16]),
+            np.reshape(range(16), [1, 16]),
+            np.reshape([self.init_val] * 112, [7, 16]),
+        ],
+        axis=0)
+    remapped_embeddings = variable_scope.get_variable(
+        name='embedding/obtained_embedding_matrix',
+        shape=[9, 16],
+        initializer=embedding_loading_initializer,
+        partitioner=partitioned_variables.fixed_size_partitioner(2))
+
+    with self.test_session():
+      variables.global_variables_initializer().run()
+      self.assertAllClose(expected_remapped_embeddings,
+                          remapped_embeddings.as_tensor().eval())
+
+  def test_load_embedding_initializer_old_row_vocab(self):
+    """Tests for load_embedding_initializer where we constrain old vocab."""
+    embedding_loading_initializer = (
+        checkpoint_ops._load_embedding_initializer(
+            new_vocab_file=self.new_feature_vocab_file,
+            old_vocab_file=self.old_feature_vocab_file,
+            # Considered old vocabulary becomes ['zero', 'one', 'two'].  This
+            # means 'three' in the new vocabulary is newly initialized.
+            old_vocab_size=3,
+            new_vocab_size=5,
+            embedding_dim=16,
+            embedding_tensor_name='some_scope/embeddings',
+            ckpt_path=[self.checkpoint_file],
+            num_oov_buckets=1,
+            initializer=self.initializer))
 
     # The new weight matrix is of size
     # [5 feature vocab + 1 feature OOV, 16 (embedding dimension)], where the
     # last vocab row (2nd last row) is newly initialized (wasn't found in
     # previous vocab) and the actual last row is OOV and also newly initialized.
     # Use a partitioned variable to confirm that the offset logic works.
+    expected_remapped_embeddings = np.concatenate(
+        [
+            np.reshape(range(48), [3, 16]),
+            np.reshape([self.init_val] * 48, [3, 16]),
+        ],
+        axis=0)
     remapped_embeddings = variable_scope.get_variable(
         name='embedding/obtained_embedding_matrix',
         shape=[6, 16],
@@ -300,6 +381,5 @@ class LoadAndRemapWrappersTest(test.TestCase):
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 23e863876464886855e1db671da6f02fbebeafbb..0e31255b74f64657cffc4a2f58798835513f0444 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -212,9 +212,9 @@ class Coordinator(object):
       if not self._stop_event.is_set():
         if ex and self._exc_info_to_raise is None:
           if isinstance(ex, tuple):
-            logging.info("Error reported to Coordinator: %s, %s",
-                         type(ex[1]),
-                         compat.as_str_any(ex[1]))
+            logging.info("Error reported to Coordinator: %s",
+                         compat.as_str_any(ex[1]),
+                         exc_info=ex)
             self._exc_info_to_raise = ex
           else:
             logging.info("Error reported to Coordinator: %s, %s",
@@ -284,19 +284,17 @@ class Coordinator(object):
     ```python
     try:
       ...body...
-    exception Exception as ex:
-      coord.request_stop(ex)
+    except:
+      coord.request_stop(sys.exc_info())
     ```
 
     Yields:
       nothing.
     """
-    # pylint: disable=broad-except
     try:
       yield
-    except Exception as ex:
-      self.request_stop(ex)
-    # pylint: enable=broad-except
+    except:  # pylint: disable=bare-except
+      self.request_stop(ex=sys.exc_info())
 
   def wait_for_stop(self, timeout=None):
     """Wait till the Coordinator is told to stop.
diff --git a/tensorflow/python/training/gradient_descent_test.py b/tensorflow/python/training/gradient_descent_test.py
index 09671275f0c574d2589c3bdd3207f33fd0e57ca1..5370cafbcfab6e5ea46685db997989bf6f218a1a 100644
--- a/tensorflow/python/training/gradient_descent_test.py
+++ b/tensorflow/python/training/gradient_descent_test.py
@@ -39,7 +39,8 @@ class GradientDescentOptimizerTest(test.TestCase):
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
         grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+        optimizer = gradient_descent.GradientDescentOptimizer(3.0)
+        sgd_op = optimizer.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
@@ -52,6 +53,7 @@ class GradientDescentOptimizerTest(test.TestCase):
                                            var0.eval())
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                            var1.eval())
+        self.assertEqual(0, len(optimizer.variables()))
 
   def testBasicResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 802b930b0e391685b07802cbf6973b763e52d147..f0c28e7b89d08aed7bafb610fead9e285586e126 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -362,7 +362,13 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   The function returns the decayed learning rate.  It is computed as:
 
   ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * t)
+  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+  ```
+
+  or, if `staircase` is `True`, as:
+
+  ```python
+  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
   ```
 
   Example: decay 1/t with a rate of 0.5:
@@ -371,8 +377,9 @@ def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
   ...
   global_step = tf.Variable(0, trainable=False)
   learning_rate = 0.1
-  k = 0.5
-  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, k)
+  decay_steps = 1.0
+  decay_rate = 0.5
+  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 3c8f472d6f9b3ae3ba62d348e7377a761409c29b..6865513b0e4aad18d77887770a11243642958e7a 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -134,6 +134,39 @@ class MomentumOptimizerTest(test.TestCase):
     with context.eager_mode():
       self.doTestBasic(use_resource=True, use_callable_params=True)
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testVariablesAcrossGraphs(self):
+    optimizer = momentum_lib.MomentumOptimizer(0.01, 0.5)
+    with ops.Graph().as_default():
+      var0 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var0")
+      var1 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var1")
+      if context.in_eager_mode():
+        loss = lambda: math_ops.reduce_sum(var0 + var1)
+      else:
+        loss = math_ops.reduce_sum(var0 + var1)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var0")
+      self.assertStartsWith(optimizer_variables[1].name, "var1")
+      self.assertEquals(2, len(optimizer_variables))
+
+    with ops.Graph().as_default():
+      var2 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var2")
+      var3 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var3")
+      if context.in_eager_mode():
+        loss = lambda: math_ops.reduce_sum(var2 + var3)
+      else:
+        loss = math_ops.reduce_sum(var2 + var3)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var2")
+      self.assertStartsWith(optimizer_variables[1].name, "var3")
+      self.assertEquals(2, len(optimizer_variables))
+
   def testNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.test_session():
@@ -201,23 +234,38 @@ class MomentumOptimizerTest(test.TestCase):
           self.assertAllClose(var0_np, var0.eval())
           self.assertAllClose(var1_np, var1.eval())
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
         pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
-        sgd_op = momentum_lib.MomentumOptimizer(
-            learning_rate=1.0, momentum=0.0).minimize(loss)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval())
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss if context.in_eager_mode() else loss())
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss if context.in_eager_mode() else loss())
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index dea62d27baf8ce8a9f2ae1dfcfe277b6927467a6..f1cb81981afb11772b5e064b6e9788e841026f0a 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -52,7 +52,6 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
-# TODO(touts): Share that with the Supervisor.
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -281,7 +280,8 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                              save_summaries_secs=USE_DEFAULT,
                              config=None,
                              stop_grace_period_secs=120,
-                             log_step_count_steps=100):
+                             log_step_count_steps=100,
+                             max_wait_secs=7200):
   """Creates a `MonitoredSession` for training.
 
   For a chief, this utility sets proper session initializer/restorer. It also
@@ -320,6 +320,10 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       `close()` has been called.
     log_step_count_steps: The frequency, in number of global steps, that the
       global step/sec is logged.
+    max_wait_secs: Maximum time workers should wait for the session to
+      become available. This should be kept relatively short to help detect
+      incorrect code, but sometimes may need to be increased if the chief takes
+      a while to start up.
 
   Returns:
     A `MonitoredSession` object.
@@ -335,7 +339,10 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
   scaffold = scaffold or Scaffold()
   if not is_chief:
     session_creator = WorkerSessionCreator(
-        scaffold=scaffold, master=master, config=config)
+        scaffold=scaffold,
+        master=master,
+        config=config,
+        max_wait_secs=max_wait_secs)
     return MonitoredSession(session_creator=session_creator, hooks=hooks or [],
                             stop_grace_period_secs=stop_grace_period_secs)
 
@@ -434,7 +441,11 @@ class ChiefSessionCreator(SessionCreator):
 class WorkerSessionCreator(SessionCreator):
   """Creates a tf.Session for a worker."""
 
-  def __init__(self, scaffold=None, master='', config=None):
+  def __init__(self,
+               scaffold=None,
+               master='',
+               config=None,
+               max_wait_secs=30 * 60):
     """Initializes a worker session creator.
 
     Args:
@@ -442,11 +453,13 @@ class WorkerSessionCreator(SessionCreator):
         not specified a default one is created. It's used to finalize the graph.
       master: `String` representation of the TensorFlow master to use.
       config: `ConfigProto` proto used to configure the session.
+      max_wait_secs: Maximum time to wait for the session to become available.
     """
     self._scaffold = scaffold or Scaffold()
     self._session_manager = None
     self._master = master
     self._config = config
+    self._max_wait_secs = max_wait_secs
 
   def _get_session_manager(self):
     if self._session_manager:
@@ -463,7 +476,7 @@ class WorkerSessionCreator(SessionCreator):
     self._scaffold.finalize()
     return self._get_session_manager().wait_for_session(
         self._master, config=self._config,
-        max_wait_secs=30 * 60  # Wait up to 30 mins for the session to be ready.
+        max_wait_secs=self._max_wait_secs
     )
 
 
@@ -496,7 +509,6 @@ class _MonitoredSession(object):
       self._sess = _RecoverableSession(self._coordinated_creator)
     else:
       self._sess = self._coordinated_creator.create_session()
-    self._stop_requested_in_step_fn = False
 
   @property
   def graph(self):
@@ -537,6 +549,7 @@ class _MonitoredSession(object):
         will return True.
 
         Example usage:
+
         ```python
            with tf.Graph().as_default():
              c = tf.placeholder(dtypes.float32)
@@ -553,6 +566,7 @@ class _MonitoredSession(object):
                while not session.should_stop():
                  a = session.run_step_fn(step_fn)
         ```
+
         Hooks interact with the `run_with_hooks()` call inside the `step_fn`
         as they do with a `MonitoredSession.run` call.
 
@@ -576,11 +590,12 @@ class _MonitoredSession(object):
           ' `self` and `step_context` arguments if it\'s an instance'
           ' method. Got {} instead.'.format(step_fn_arguments))
 
-    try:
-      return step_fn(_MonitoredSession.StepContext(self._tf_sess(), self.run))
-    except StopIteration:
-      self._stop_requested_in_step_fn = True
-      raise
+    # `self._sess` is either `_RecoverableSession` or a `_CoordinatedSession`.
+    # Setting `run_with_hooks` to `None` will cause `run_with_hooks` to be
+    # `_CoordinatedSession.run` downstream in either case. This allows
+    # `_PREEMPTION_ERRORS` to propage from within `step_fn` to
+    # `_RecoverableSession.run_step_fn`.
+    return self._sess.run_step_fn(step_fn, self._tf_sess(), run_with_hooks=None)
 
   class StepContext(object):
     """Control flow instrument for the `step_fn` from `run_step_fn()`.
@@ -620,8 +635,7 @@ class _MonitoredSession(object):
       raise StopIteration('step_fn has requested the iterations to stop.')
 
   def should_stop(self):
-    return (self._sess is None or self._sess.should_stop() or
-            self._stop_requested_in_step_fn)
+    return self._sess is None or self._sess.should_stop()
 
   def close(self):
     self._close_internal()
@@ -924,6 +938,13 @@ class _WrappedSession(object):
   def run(self, *args, **kwargs):
     return self._sess.run(*args, **kwargs)
 
+  def run_step_fn(self, step_fn, raw_session, run_with_hooks):
+    # `_RecoverableSession` sets `run_with_hooks` to `_CoordinatedSession.run`.
+    # It is `None` when called from `_CoordinatedSession`. In that case
+    # `self.run` is `_CoordinatedSession.run`.
+    run_with_hooks = run_with_hooks or self.run
+    return step_fn(_MonitoredSession.StepContext(raw_session, run_with_hooks))
+
 
 class _RecoverableSession(_WrappedSession):
   """A wrapped session that recreates a session upon certain kinds of errors.
@@ -996,6 +1017,22 @@ class _RecoverableSession(_WrappedSession):
         self.close()
         self._sess = None
 
+  def run_step_fn(self, step_fn, raw_session, run_with_hooks):
+    while True:
+      try:
+        if not self._sess:
+          self._sess = self._create_session()
+
+        run_with_hooks = self._sess.run
+        return self._sess.run_step_fn(step_fn, raw_session, run_with_hooks)
+      except _PREEMPTION_ERRORS as e:
+        logging.info('An error was raised. This may be due to a preemption in '
+                     'a connected worker or parameter server. The current '
+                     'session will be closed and a new session will be '
+                     'created. Error: %s', e)
+        self.close()
+        self._sess = None
+
 
 class _CoordinatedSession(_WrappedSession):
   """A wrapped session that works with a `tf.Coordinator`.
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index e729b79425fdc21f7c9d5be59bf9c14594534deb..159b2d5c1605bdd95303efb25690f55a54a3625d 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -798,6 +798,214 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  def test_step_fn_recovery_from_coordinator_exception_when_run_hooks(self):
+    with self.test_session() as test_session:
+      session_creator = CountingSessionCreator(test_session)
+      session = monitored_session.MonitoredSession(
+          session_creator,
+          [StopCoordinatorWithException(calls_before_stopping=2)])
+
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      self.assertFalse(session.should_stop())
+
+      c = constant_op.constant(0)
+      v = array_ops.identity(c)
+
+      def feed_step_fn(value):
+        def step_fn(step_context):
+          return step_context.run_with_hooks(fetches=v, feed_dict={c: value})
+        return step_fn
+
+      # The coordinator will not abort during this call, since it's the call
+      # number 0.
+      self.assertEqual(51, session.run_step_fn(feed_step_fn(51)))
+      self.assertFalse(session.should_stop())
+      # The coordinator will abort during the next call, since it's the call
+      # number 1.
+      self.assertEqual(42, session.run_step_fn(feed_step_fn(42)))
+      # Even though the coordinator was asked to stop, the underlying session is
+      # recreated and is to be continued.
+      self.assertFalse(session.should_stop())
+      self.assertEqual(2, session_creator.number_of_sessions_created)
+
+  def test_recovery_from_non_preemption_in_coordinator_when_run_hooks(self):
+    with self.test_session() as test_session:
+      session_creator = CountingSessionCreator(test_session)
+      hook = StopCoordinatorWithException(
+          calls_before_stopping=2,
+          exception_to_raise=errors_impl.UnknownError(
+              None, None, 'Some fatal exception inside the coordinator.'))
+      session = monitored_session.MonitoredSession(session_creator, [hook])
+
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      self.assertFalse(session.should_stop())
+
+      c = constant_op.constant(0)
+      v = array_ops.identity(c)
+
+      def feed_step_fn(value):
+        def step_fn(step_context):
+          return step_context.run_with_hooks(fetches=v, feed_dict={c: value})
+        return step_fn
+
+      # The coordinator will not abort during this call, since it's the call
+      # number 0.
+      self.assertEqual(51, session.run_step_fn(feed_step_fn(51)))
+      self.assertFalse(session.should_stop())
+      # The coordinator will abort during the next call, since it's the call
+      # number 1.
+      self.assertEqual(42, session.run_step_fn(feed_step_fn(42)))
+      # The coordinator was asked to stop due to non-redeemable error. Training
+      # should stop and the session should not be recreated.
+      self.assertTrue(session.should_stop())
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      with self.assertRaises(errors_impl.UnknownError):
+        session.close()
+
+  def test_recovery_from_session_getting_stuck_when_run_hooks(self):
+    with self.test_session() as test_session:
+      session_creator = CountingSessionCreator(test_session)
+      session = monitored_session.MonitoredSession(
+          session_creator,
+          [FailTrainingAfterCoordinatorStopped(calls_before_stopping=2)])
+
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      self.assertFalse(session.should_stop())
+
+      c = constant_op.constant(0)
+      v = array_ops.identity(c)
+
+      def feed_step_fn(value):
+        def step_fn(step_context):
+          return step_context.run_with_hooks(fetches=v, feed_dict={c: value})
+        return step_fn
+
+      # Training will not fail, since it's the call number 0.
+      self.assertEqual(51, session.run_step_fn(feed_step_fn(51)))
+      self.assertFalse(session.should_stop())
+      # Training will fail during the next call, since it's the call
+      # number 1.
+      self.assertEqual(42, session.run_step_fn(feed_step_fn(42)))
+      # Even though the coordinator stopped which and training failed, the
+      # underlying session is recreated and training is to be continued.
+      self.assertFalse(session.should_stop())
+      self.assertEqual(2, session_creator.number_of_sessions_created)
+
+  def create_raw_session_with_failing_coordinator(self, session_creator, hook):
+    """Return MonitoredSession that triggers coordinator failures."""
+    session = monitored_session.MonitoredSession(session_creator, [hook])
+    # We would like to test a situation where during fetches through the
+    # raw session, the coordinator fails with an exception.  To do that, we
+    # are going to use (raw_session + StopCoordinatorWithException) hook
+    # combination that is stored in
+    # `MonitoredSession._RecoverableSession._CoordinatedSession._sess`
+    # at this point:
+    session._tf_sess = lambda: session._sess._sess._sess
+    # `run()` on such a session is equivalent to `run()` on the raw session
+    # with separate coordinator threads independently stopping with an
+    # exception.
+    return session
+
+  def test_step_fn_recovery_from_coordinator_exception_with_raw_session(self):
+    with self.test_session() as test_session:
+      session_creator = CountingSessionCreator(test_session)
+      session = self.create_raw_session_with_failing_coordinator(
+          session_creator,
+          StopCoordinatorWithException(calls_before_stopping=2))
+
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      self.assertFalse(session.should_stop())
+
+      c = constant_op.constant(0)
+      v = array_ops.identity(c)
+
+      def feed_step_fn(value):
+
+        def step_fn(step_context):
+          return step_context.session.run(fetches=v, feed_dict={c: value})
+
+        return step_fn
+
+      # The coordinator will not abort during this call, since it's the call
+      # number 0.
+      self.assertEqual(51, session.run_step_fn(feed_step_fn(51)))
+      self.assertFalse(session.should_stop())
+      # The coordinator will abort during the next call, since it's the call
+      # number 1.
+      self.assertEqual(42, session.run_step_fn(feed_step_fn(42)))
+      # Even though the coordinator was asked to stop, the underlying session is
+      # recreated and is to be continued.
+      self.assertFalse(session.should_stop())
+      self.assertEqual(2, session_creator.number_of_sessions_created)
+
+  def test_recovery_from_non_preemption_in_coordinator_with_raw_session(self):
+    with self.test_session() as test_session:
+      session_creator = CountingSessionCreator(test_session)
+      session = self.create_raw_session_with_failing_coordinator(
+          session_creator,
+          StopCoordinatorWithException(
+              calls_before_stopping=2,
+              exception_to_raise=errors_impl.UnknownError(
+                  None, None, 'Some fatal exception inside the coordinator.')))
+
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      self.assertFalse(session.should_stop())
+
+      c = constant_op.constant(0)
+      v = array_ops.identity(c)
+
+      def feed_step_fn(value):
+
+        def step_fn(step_context):
+          return step_context.run_with_hooks(fetches=v, feed_dict={c: value})
+
+        return step_fn
+
+      # The coordinator will not abort during this call, since it's the call
+      # number 0.
+      self.assertEqual(51, session.run_step_fn(feed_step_fn(51)))
+      self.assertFalse(session.should_stop())
+      # The coordinator will abort during the next call, since it's the call
+      # number 1.
+      self.assertEqual(42, session.run_step_fn(feed_step_fn(42)))
+      # The coordinator was asked to stop due to non-redeemable error. Training
+      # should stop and the session should not be recreated.
+      self.assertTrue(session.should_stop())
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      with self.assertRaises(errors_impl.UnknownError):
+        session.close()
+
+  def test_recovery_from_session_getting_stuck_with_raw_session(self):
+    with self.test_session() as test_session:
+      session_creator = CountingSessionCreator(test_session)
+      session = self.create_raw_session_with_failing_coordinator(
+          session_creator,
+          FailTrainingAfterCoordinatorStopped(calls_before_stopping=2))
+
+      self.assertEqual(1, session_creator.number_of_sessions_created)
+      self.assertFalse(session.should_stop())
+
+      c = constant_op.constant(0)
+      v = array_ops.identity(c)
+
+      def feed_step_fn(value):
+
+        def step_fn(step_context):
+          return step_context.run_with_hooks(fetches=v, feed_dict={c: value})
+
+        return step_fn
+
+      # Training will not fail, since it's the call number 0.
+      self.assertEqual(51, session.run_step_fn(feed_step_fn(51)))
+      self.assertFalse(session.should_stop())
+      # Training will fail during the next call, since it's the call
+      # number 1.
+      self.assertEqual(42, session.run_step_fn(feed_step_fn(42)))
+      # Even though the coordinator stopped which and training failed, the
+      # underlying session is recreated and training is to be continued.
+      self.assertFalse(session.should_stop())
+      self.assertEqual(2, session_creator.number_of_sessions_created)
+
 
 class FakeSession(monitored_session._WrappedSession):
 
@@ -1475,6 +1683,7 @@ class MonitoredSessionTest(test.TestCase):
 
   def test_step_request_stop_without_a_with_block(self):
     with ops.Graph().as_default():
+      was_stop_iteration_raised = False
 
       def step_fn(step_context):
         step_context.request_stop()
@@ -1483,8 +1692,10 @@ class MonitoredSessionTest(test.TestCase):
       try:
         self.assertEqual(None, session.run_step_fn(step_fn))
       except StopIteration:
-        pass
-      self.assertTrue(session.should_stop())
+        was_stop_iteration_raised = True
+
+      self.assertTrue(was_stop_iteration_raised)
+      self.assertFalse(session.should_stop())
 
   def test_step_request_stop_in_a_loop(self):
     with ops.Graph().as_default():
@@ -1526,8 +1737,7 @@ class MonitoredSessionTest(test.TestCase):
       class Model(object):
 
         def step_fn(self, step_context):
-          value = step_context.run_with_hooks(fetches=v, feed_dict={c: 3.2})
-          return value
+          return step_context.run_with_hooks(fetches=v, feed_dict={c: 3.2})
 
       with monitored_session.MonitoredSession() as session:
         model = Model()
@@ -1592,6 +1802,38 @@ class MonitoredSessionTest(test.TestCase):
       with monitored_session.MonitoredSession(hooks=[Hook(self)]) as session:
         self.assertEqual(0.3 + 0.5 + 0.7, session.run_step_fn(step_fn))
 
+  def test_step_fn_has_the_same_hooks_behavior_without_recovery(self):
+    with ops.Graph().as_default():
+      var = resource_variable_ops.ResourceVariable(0.0)
+
+      stage_0 = state_ops.assign_add(var, 0.3)
+      stage_1_0 = state_ops.assign_add(var, 0.7)
+      with ops.control_dependencies([stage_1_0]):
+        stage_1_1 = state_ops.assign_add(var, 0.5)
+      stage_2 = state_ops.assign_add(var, 1.1)
+
+      class Hook(session_run_hook.SessionRunHook):
+
+        def __init__(self, testing):
+          self._testing = testing
+
+        def before_run(self, run_context):
+          return session_run_hook.SessionRunArgs(fetches=stage_1_0)
+
+        def after_run(self, run_context, run_values):
+          self._testing.assertNear(0.3 + 0.5 + 0.7,
+                                   run_context.session.run(var), 0.1)
+          self._testing.assertNear(0.3 + 0.5 + 0.7 + 1.1,
+                                   run_context.session.run(stage_2), 0.1)
+
+      def step_fn(step_context):
+        self.assertNear(0.3, step_context.session.run(stage_0), 0.1)
+        return step_context.run_with_hooks(fetches=stage_1_1)
+
+      with monitored_session.SingularMonitoredSession(
+          hooks=[Hook(self)]) as session:
+        self.assertEqual(0.3 + 0.5 + 0.7, session.run_step_fn(step_fn))
+
   def test_step_fn_with_hooks_and_request_stop(self):
     with ops.Graph().as_default():
       trace_the_hook = {'before_run': False, 'after_run': False}
@@ -1615,6 +1857,117 @@ class MonitoredSessionTest(test.TestCase):
         self.assertFalse(trace_the_hook['before_run'])
         self.assertFalse(trace_the_hook['after_run'])
 
+  def test_recovers_from_an_exception_in_step_fn(self):
+    trace_the_exception = {'run_already': False}
+
+    with ops.Graph().as_default():
+      c = array_ops.placeholder(dtypes.float32)
+      v = array_ops.identity(c)
+
+      def step_fn(step_context):
+        if not trace_the_exception['run_already']:
+          trace_the_exception['run_already'] = True
+          raise errors_impl.AbortedError(None, None, 'Abort')
+
+        return step_context.run_with_hooks(fetches=v, feed_dict={c: 3.2})
+
+      with monitored_session.MonitoredSession() as session:
+        self.assertNear(3.2, session.run_step_fn(step_fn), 0.1)
+      self.assertTrue(trace_the_exception['run_already'])
+
+  def test_recovers_from_an_exception_in_step_fn_after_hooks(self):
+    trace_the_exception = {'run_already': False, 'side_effect_counter': 0}
+
+    with ops.Graph().as_default():
+      c = array_ops.placeholder(dtypes.float32)
+      v = array_ops.identity(c)
+      graph_state = variables.Variable(0.0)
+      graph_side_effect = state_ops.assign_add(graph_state, 0.31)
+
+      def step_fn(step_context):
+        trace_the_exception['side_effect_counter'] += 1
+        step_context.session.run(graph_side_effect)
+
+        value = step_context.run_with_hooks(fetches=v, feed_dict={c: 3.2})
+
+        if not trace_the_exception['run_already']:
+          trace_the_exception['run_already'] = True
+          raise errors_impl.AbortedError(None, None, 'Abort')
+
+        return value
+
+      with self.test_session() as test_session:
+        with monitored_session.MonitoredSession(
+            CountingSessionCreator(test_session)) as session:
+          session.run(variables.global_variables_initializer())
+
+          self.assertNear(3.2, session.run_step_fn(step_fn), 0.1)
+          self.assertTrue(trace_the_exception['run_already'])
+          # Make sure the rest of the body of the step_fn is re-executed upon
+          # AbortedError:
+          self.assertEqual(2, trace_the_exception['side_effect_counter'])
+          self.assertNear(0.62, session.run(graph_state), 0.1)
+
+  def test_step_fn_doesnt_recover_when_it_wasnt_asked_to(self):
+    trace_the_exception = {'run_already': False}
+
+    with ops.Graph().as_default():
+      c = array_ops.placeholder(dtypes.float32)
+      v = array_ops.identity(c)
+
+      def step_fn(step_context):
+        if not trace_the_exception['run_already']:
+          trace_the_exception['run_already'] = True
+          raise errors_impl.AbortedError(None, None, 'Abort')
+
+        value = step_context.run_with_hooks(fetches=v, feed_dict={c: 3.2})
+        return value
+
+      with monitored_session.SingularMonitoredSession() as session:
+        with self.assertRaisesRegexp(errors_impl.AbortedError, 'Abort'):
+          self.assertNear(3.2, session.run_step_fn(step_fn), 0.1)
+          self.fail()
+
+      self.assertTrue(trace_the_exception['run_already'])
+
+  def test_step_fn_exception_from_before_run(self):
+    trace_the_exception = {'run_already': False, 'side_effect_counter': 0}
+
+    with ops.Graph().as_default():
+      c = array_ops.placeholder(dtypes.float32)
+      v = array_ops.identity(c)
+      vv = constant_op.constant(3.2)
+      graph_state = variables.Variable(0.0)
+      graph_side_effect = state_ops.assign_add(graph_state, 0.31)
+
+      class Hook(session_run_hook.SessionRunHook):
+
+        def __init__(self, testing):
+          self._testing = testing
+
+        def before_run(self, run_context):
+          if not trace_the_exception['run_already']:
+            trace_the_exception['run_already'] = True
+            raise errors_impl.AbortedError(None, None, 'Abort')
+          return session_run_hook.SessionRunArgs(fetches=vv)
+
+        def after_run(self, run_context, run_values):
+          self._testing.assertNear(3.2, run_values.results, 0.1)
+
+      def step_fn(step_context):
+        trace_the_exception['side_effect_counter'] += 1
+        step_context.session.run(graph_side_effect)
+        return step_context.run_with_hooks(fetches=v, feed_dict={c: 1.3})
+
+      with self.test_session() as test_session:
+        with monitored_session.MonitoredSession(
+            CountingSessionCreator(test_session),
+            hooks=[Hook(self)]) as session:
+          test_session.run(variables.global_variables_initializer())
+          self.assertNear(1.3, session.run_step_fn(step_fn), 0.1)
+          self.assertEqual(2, trace_the_exception['side_effect_counter'])
+          self.assertNear(0.62, session.run(graph_state), 0.1)
+
 
 class SingularMonitoredSessionTest(test.TestCase):
   """Tests SingularMonitoredSession."""
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index eb073438506691ae4ebe8abf66ca6515e4c56e9f..e34c759e894c86a103f0228163f7bae2ffc7fb61 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -498,8 +498,9 @@ class ExponentialMovingAverage(object):
     # Collect all the variables with moving average,
     for v in moving_avg_variables:
       name_map[self.average_name(v)] = v
-    # Make sure we restore variables without moving average as well.
-    for v in list(set(variables.global_variables()) - moving_avg_variables):
-      if v.op.name not in name_map:
+    # Make sure we restore variables without moving averages as well.
+    moving_avg_variable_names = set([v.name for v in moving_avg_variables])
+    for v in list(set(variables.global_variables())):
+      if v.name not in moving_avg_variable_names and v.op.name not in name_map:
         name_map[v.op.name] = v
     return name_map
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 63604cf19d4e73370dd705ac8c117d4f2097f415..6efdeb286657e761a4c46634b9408121765a447b 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import moving_averages
+from tensorflow.python.training import saver as saver_lib
 
 
 class MovingAveragesTest(test.TestCase):
@@ -392,6 +393,32 @@ class ExponentialMovingAverageTest(test.TestCase):
     self.assertEqual([b"loc:@v1"], ema.average(v1).op.colocation_groups())
     self.assertDeviceEqual("/job:default", ema.average(tensor2).device)
 
+  def _ExportAndImportGraph(self, graph):
+    """Export and import graph into a new graph."""
+    meta_graph = saver_lib.export_meta_graph(
+        graph=graph, collection_list=graph.get_all_collection_keys())
+    graph_copy = ops.Graph()
+    with graph_copy.as_default():
+      _ = saver_lib.import_meta_graph(meta_graph)
+    return graph_copy
+
+  def testImportedGraphVariablesToRestore(self):
+    g = ops.Graph()
+    with g.as_default():
+      variables.Variable(10.0, name="v")
+    # Export and import the graph into a new graph.
+    g_copy = self._ExportAndImportGraph(g)
+    with g_copy.as_default():
+      ema = moving_averages.ExponentialMovingAverage(0.25, name="foo_avg")
+      vars_to_restore = ema.variables_to_restore()
+      # There should only be one variable in vars_to_restore. This is important
+      # to check because when importing from a GraphDef, TF makes duplicate
+      # python Variable objects referring to the same underlying variable. We
+      # need to be sure that two variables referring to the same variable don't
+      # both get added to vars_to_restore.
+      self.assertEqual(len(vars_to_restore), 1)
+      self.assertTrue("v/foo_avg" in vars_to_restore)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 915214dbfaea022d6325c3cc122501687d3acf73..56cf4d42ee194885057d8bf45d9b3c1c407c4a11 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -381,7 +381,7 @@ class Optimizer(object):
       loss: A Tensor containing the value to minimize.
       var_list: Optional list or tuple of `tf.Variable` to update to minimize
         `loss`.  Defaults to the list of variables collected in the graph
-        under the key `GraphKey.TRAINABLE_VARIABLES`.
+        under the key `GraphKeys.TRAINABLE_VARIABLES`.
       gate_gradients: How to gate the computation of gradients.  Can be
         `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
       aggregation_method: Specifies the method used to combine gradient terms.
@@ -574,6 +574,47 @@ class Optimizer(object):
     """
     return sorted(self._slots.keys())
 
+  def variables(self):
+    """A list of variables which encode the current state of `Optimizer`.
+
+    Includes slot variables and additional global variables created by the
+    optimizer in the current default graph.
+
+    Returns:
+      A list of variables.
+    """
+    executing_eagerly = context.in_eager_mode()
+    current_graph = ops.get_default_graph()
+
+    def _from_current_graph(variable):
+      if executing_eagerly:
+        # No variable.op in eager mode. We don't expect lots of eager graphs,
+        # but behavior should be consistent with graph mode.
+        return variable._container_prefix == current_graph._container_prefix  # pylint: disable=protected-access
+      else:
+        return variable.op.graph is current_graph
+
+    optimizer_variables = [v for v in self._non_slot_variables()
+                           if _from_current_graph(v)]
+    for _, variable_dict in self._slots.items():
+      for _, slot_for_variable in variable_dict.items():
+        if _from_current_graph(slot_for_variable):
+          optimizer_variables.append(slot_for_variable)
+    # Sort variables by name so that the return is deterministic.
+    return sorted(optimizer_variables, key=lambda v: v.name)
+
+  def _non_slot_variables(self):
+    """Additional variables created by the `Optimizer`.
+
+    This method should be overridden by child classes which create extra
+    variables, so that `variables()` includes the `Optimizer`'s non-slot
+    variables.
+
+    Returns:
+      A list or tuple of variables.
+    """
+    return []
+
   def _assert_valid_dtypes(self, tensors):
     """Asserts tensors are all valid types (see `_valid_dtypes`).
 
@@ -603,7 +644,8 @@ class Optimizer(object):
     Returns:
       Valid types for loss, variables and gradients.
     """
-    return set([dtypes.float16, dtypes.float32, dtypes.float64])
+    return set(
+        [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64])
 
   def _create_slots(self, var_list):
     """Create all slots needed by the variables.
diff --git a/tensorflow/python/training/proximal_adagrad_test.py b/tensorflow/python/training/proximal_adagrad_test.py
index 1da7f75531ad631c08d8059d9be34056cf3c3801..430c16b3517270eefc0781e90920e4e041f87efa 100644
--- a/tensorflow/python/training/proximal_adagrad_test.py
+++ b/tensorflow/python/training/proximal_adagrad_test.py
@@ -59,6 +59,10 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       v0_val, v1_val = sess.run([var0, var1])
       self.assertAllClose(np.array([-2.60260963, -4.29698515]), v0_val)
       self.assertAllClose(np.array([-0.28432083, -0.56694895]), v1_val)
+      opt_vars = opt.variables()
+      self.assertStartsWith(opt_vars[0].name, var0._shared_name)
+      self.assertStartsWith(opt_vars[1].name, var1._shared_name)
+      self.assertEqual(2, len(opt_vars))
 
   def testProximalAdagradwithoutRegularization(self):
     self.doTestProximalAdagradwithoutRegularization(use_resource=False)
diff --git a/tensorflow/python/training/quantize_training.i b/tensorflow/python/training/quantize_training.i
index 40c60769731d3f7255647a07141d86b1c2594b01..17ffcd6e0758c9c1bc8bab864b6b7a2a18bc9cbf 100644
--- a/tensorflow/python/training/quantize_training.i
+++ b/tensorflow/python/training/quantize_training.i
@@ -65,6 +65,9 @@ def do_quantize_training_on_graphdef(input_graph, num_bits):
 
   graph.ParseFromString(result_graph_string)
   return graph
+
+do_quantize_training_on_graphdef._tf_api_names = [
+    'train.do_quantize_training_on_graphdef']
 %}
 
 %unignoreall
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 60420eb86afb69cdd9caa92f07061f91c6631570..ba6301e785947c8347ef23b81491e684bee62974 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -349,7 +349,7 @@ class BaseSaverBuilder(object):
     last_device = None
     for shard, (device, saveables) in enumerate(per_device):
       last_device = device
-      with ops.device(device):
+      with ops.device(_set_cpu0(device)):
         sharded_filename = self.sharded_filename(tmp_checkpoint_prefix, shard,
                                                  num_shards_tensor)
         sharded_prefixes.append(sharded_filename)
@@ -357,7 +357,7 @@ class BaseSaverBuilder(object):
 
     with ops.control_dependencies([x.op for x in sharded_saves]):
       # Co-locates the merge step with the last device.
-      with ops.device(last_device):
+      with ops.device(_set_cpu0(last_device)):
         # V2 format write path consists of a metadata merge step.  Once merged,
         # attempts to delete the temporary directory, "<user-fed prefix>_temp".
         merge_step = gen_io_ops.merge_v2_checkpoints(
@@ -503,11 +503,13 @@ class BaseSaverBuilder(object):
     return sorted(per_device.items(), key=lambda t: t[0])
 
   @staticmethod
-  def OpListToDict(op_list):
+  def OpListToDict(op_list, convert_variable_to_tensor=True):
     """Create a dictionary of names to operation lists.
 
     Args:
       op_list: A list, tuple, or set of Variables or SaveableObjects.
+      convert_variable_to_tensor: Whether or not to convert single Variables
+        with no slice info into Tensors.
 
     Returns:
       A dictionary of names to the operations that must be saved under
@@ -521,7 +523,10 @@ class BaseSaverBuilder(object):
     if not isinstance(op_list, (list, tuple, set)):
       raise TypeError("Variables to save should be passed in a dict or a "
                       "list: %s" % op_list)
-    op_list = set(op_list)
+    # When ResourceVariables are converted to Tensors, read ops are added to the
+    # graph. Sorting the op_list ensures that the resulting graph is always
+    # constructed in a deterministic way:
+    op_list = sorted(op_list, key=lambda x: x.name)
     names_to_saveables = {}
     # pylint: disable=protected-access
     for var in op_list:
@@ -543,9 +548,10 @@ class BaseSaverBuilder(object):
           names_to_saveables[name] = [var]
       else:
         if context.in_graph_mode():
-          var = ops.internal_convert_to_tensor(var, as_ref=True)
-          if not BaseSaverBuilder._IsVariable(var):
-            raise TypeError("Variable to save is not a Variable: %s" % var)
+          if convert_variable_to_tensor:
+            var = ops.internal_convert_to_tensor(var, as_ref=True)
+            if not BaseSaverBuilder._IsVariable(var):
+              raise TypeError("Variable to save is not a Variable: %s" % var)
           if var.op.type == "ReadVariableOp":
             name = var.op.inputs[0].op.name
           else:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 744b17dd224297cbefedfe562ff106fe1200664f..207e4a28426f95af4d5947964cf9133be10bc0fa 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -38,6 +38,7 @@ from tensorflow.core.protobuf import queue_runner_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -164,6 +165,18 @@ class SaverTest(test.TestCase):
   def testResourceBasic(self):
     self.basicSaveRestore(resource_variable_ops.ResourceVariable)
 
+  def testResourceVariableReadOpsAddedDeterministically(self):
+    graph_defs = []
+    num_graphs = 10
+    for _ in range(num_graphs):
+      with ops_lib.Graph().as_default() as g:
+        for i in range(20):
+          resource_variable_ops.ResourceVariable(i, name="var%s" % i)
+        saver_module.Saver()
+        graph_defs.append(g.as_graph_def())
+    for i in range(num_graphs - 1):
+      self.assertEqual(graph_defs[i], graph_defs[i + 1])
+
   def testEagerBasic(self):
     with context.eager_mode():
       ckpt_prefix = os.path.join(self.get_temp_dir(), "ckpt")
@@ -529,6 +542,23 @@ class SaverTest(test.TestCase):
       save = saver_module.Saver({"v0": v0_2})
       variables.global_variables_initializer().run()
 
+  def testSharedServerOnGPU(self):
+    if not test.is_gpu_available():
+      return
+    save_path = os.path.join(self.get_temp_dir(), "gpu")
+    with session.Session("", graph=ops_lib.Graph()) as sess:
+      with sess.graph.device(test.gpu_device_name()):
+        v0_1 = variables.Variable(123.45)
+      save = saver_module.Saver({"v0": v0_1}, sharded=True, allow_empty=True)
+      variables.global_variables_initializer().run()
+      save.save(sess, save_path)
+
+    with session.Session("", graph=ops_lib.Graph()) as sess:
+      with sess.graph.device(test.gpu_device_name()):
+        v0_2 = variables.Variable(543.21)
+      save = saver_module.Saver({"v0": v0_2}, sharded=True, allow_empty=True)
+      variables.global_variables_initializer().run()
+
   def testVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "variables")
     with session.Session("", graph=ops_lib.Graph()) as sess:
@@ -714,6 +744,8 @@ class SaverTest(test.TestCase):
 
 class SaveRestoreShardedTest(test.TestCase):
 
+  _WRITE_VERSION = saver_pb2.SaverDef.V1
+
   def _get_test_dir(self, dirname):
     test_dir = os.path.join(self.get_temp_dir(), dirname)
     gfile.MakeDirs(test_dir)
@@ -739,6 +771,7 @@ class SaveRestoreShardedTest(test.TestCase):
               "t0": t0.saveable,
               "t1": t1.saveable
           },
+          write_version=self._WRITE_VERSION,
           sharded=True)
       variables.global_variables_initializer().run()
       t0.insert("k1", 30.0).run()
@@ -759,7 +792,13 @@ class SaveRestoreShardedTest(test.TestCase):
         with sess.graph.device("/cpu:0"):
           v0 = variables.Variable(111, name="v0")
           t0 = saver_test_utils.CheckpointedOp(name="t0")
-        save = saver_module.Saver({"v0": v0, "t0": t0.saveable}, sharded=True)
+        save = saver_module.Saver(
+            {
+                "v0": v0,
+                "t0": t0.saveable
+            },
+            write_version=self._WRITE_VERSION,
+            sharded=True)
         variables.global_variables_initializer().run()
         t0.insert("k11", 33.0).run()
         self.assertEqual(111, v0.eval())
@@ -777,7 +816,13 @@ class SaveRestoreShardedTest(test.TestCase):
         with sess.graph.device("/cpu:0"):
           v1 = variables.Variable(222)
           t1 = saver_test_utils.CheckpointedOp(name="t1")
-        save = saver_module.Saver({"v1": v1, "t1": t1.saveable}, sharded=True)
+        save = saver_module.Saver(
+            {
+                "v1": v1,
+                "t1": t1.saveable
+            },
+            write_version=self._WRITE_VERSION,
+            sharded=True)
         variables.global_variables_initializer().run()
         t1.insert("k22", 44.0).run()
         self.assertEqual(222, v1.eval())
@@ -805,6 +850,7 @@ class SaveRestoreShardedTest(test.TestCase):
               "t0": t0.saveable,
               "t1": t1.saveable
           },
+          write_version=self._WRITE_VERSION,
           sharded=True)
       variables.global_variables_initializer().run()
       t0.insert("k11", 33.0).run()
@@ -970,6 +1016,10 @@ class SaveRestoreShardedTest(test.TestCase):
     self._testPartitionedVariables(use_resource=True)
 
 
+class SaveRestoreShardedTestV2(SaveRestoreShardedTest):
+  _WRITE_VERSION = saver_pb2.SaverDef.V2
+
+
 class MaxToKeepTest(test.TestCase):
 
   def _get_test_dir(self, dirname):
@@ -2105,6 +2155,31 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testPreserveDatasetAndFunctions(self):
+    with ops_lib.Graph().as_default() as g:
+      dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+      _ = array_ops.identity(next_element, name="output")
+
+      # Generate three MetaGraphDef protos using different code paths.
+      meta_graph_def_simple = saver_module.export_meta_graph()
+      meta_graph_def_devices_cleared = saver_module.export_meta_graph(
+          clear_devices=True)
+      meta_graph_def_from_graph_def = saver_module.export_meta_graph(
+          clear_devices=True, graph_def=g.as_graph_def())
+
+    for meta_graph_def in [meta_graph_def_simple,
+                           meta_graph_def_devices_cleared,
+                           meta_graph_def_from_graph_def]:
+      with session.Session(graph=ops_lib.Graph()) as sess:
+        saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
+        sess.run(variables.global_variables_initializer())
+        for i in range(10):
+          self.assertEqual(i * i, sess.run("new_model/output:0"))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run("new_model/output:0")
+
 
 class CheckpointReaderTest(test.TestCase):
 
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 2091eca0b9c6f0af4a043a4639b6fb72b90cef56..29da67a30a58c1b8b8e172b2ccede340880fef58 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -307,6 +307,12 @@ class ClusterSpec(object):
   def __ne__(self, other):
     return self._cluster_spec != other
 
+  def __str__(self):
+    key_values = self.as_dict()
+    string_items = [
+        repr(k) + ": " + repr(key_values[k]) for k in sorted(key_values)]
+    return "ClusterSpec({" + ", ".join(string_items) + "})"
+
   def as_dict(self):
     """Returns a dictionary from job names to their tasks.
 
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 0a8ec4901c9ef050014b6a04cdab34ca08f292c1..063044f0d05d4237830e415ac2ad800c98ae8beb 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -241,6 +241,95 @@ class GrpcServerTest(test.TestCase):
       queue_runner_impl.start_queue_runners(sess)
       sess.run(var.assign(3.0))
 
+  def testIsolateSessionState(self):
+    server = self._cached_server
+
+    init_value = array_ops.placeholder(dtypes.int32)
+    v = variables.Variable(init_value, validate_shape=False, name="v")
+
+    sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
+    sharing_sess_0 = session.Session(server.target, config=sharing_config)
+    sharing_sess_1 = session.Session(server.target, config=sharing_config)
+
+    isolate_config = config_pb2.ConfigProto(isolate_session_state=True)
+    isolate_sess_0 = session.Session(server.target, config=isolate_config)
+    isolate_sess_1 = session.Session(server.target, config=isolate_config)
+
+    # Initially all variables are initialized.
+    for sess in [sharing_sess_0, sharing_sess_1,
+                 isolate_sess_0, isolate_sess_1]:
+      with self.assertRaises(errors_impl.FailedPreconditionError):
+        sess.run(v)
+
+    # Shared sessions will see each other's updates, but isolated sessions
+    # will not.
+    sharing_sess_0.run(v.initializer, feed_dict={init_value: 86})
+    self.assertAllEqual(86, sharing_sess_0.run(v))
+    self.assertAllEqual(86, sharing_sess_1.run(v))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_0.run(v)
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_1.run(v)
+
+    # Changing the shape works because `validate_shape` is False.
+    sharing_sess_1.run(v.initializer, feed_dict={init_value: [86, 99]})
+    self.assertAllEqual([86, 99], sharing_sess_0.run(v))
+    self.assertAllEqual([86, 99], sharing_sess_1.run(v))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_0.run(v)
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_1.run(v)
+
+    # Initializing in an isolated session will only affect the state in that
+    # session.
+    isolate_sess_0.run(v.initializer, feed_dict={init_value: 37})
+    self.assertAllEqual([86, 99], sharing_sess_0.run(v))
+    self.assertAllEqual([86, 99], sharing_sess_1.run(v))
+    self.assertAllEqual(37, isolate_sess_0.run(v))
+    with self.assertRaises(errors_impl.FailedPreconditionError):
+      isolate_sess_1.run(v)
+
+    # Isolated sessions can have different shapes for the same variable.
+    isolate_sess_1.run(v.initializer, feed_dict={init_value: [19, 86]})
+    self.assertAllEqual([86, 99], sharing_sess_0.run(v))
+    self.assertAllEqual([86, 99], sharing_sess_1.run(v))
+    self.assertAllEqual(37, isolate_sess_0.run(v))
+    self.assertAllEqual([19, 86], isolate_sess_1.run(v))
+
+  def testShapeChangingIsolateState(self):
+    server = self._cached_server
+    sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
+    isolate_config = config_pb2.ConfigProto(isolate_session_state=True)
+
+    with ops.Graph().as_default():
+      w_vector = variables.Variable([1, 2, 3], name="w")
+      with session.Session(server.target, config=sharing_config) as sess:
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          sess.run(w_vector)
+        sess.run(w_vector.initializer)
+        self.assertAllEqual([1, 2, 3], sess.run(w_vector))
+
+    with ops.Graph().as_default():
+      w_vector = variables.Variable([4, 5, 6], name="w")
+      with session.Session(server.target, config=sharing_config) as sess:
+        self.assertAllEqual([1, 2, 3], sess.run(w_vector))
+        sess.run(w_vector.initializer)
+        self.assertAllEqual([4, 5, 6], sess.run(w_vector))
+
+    with ops.Graph().as_default():
+      w_scalar = variables.Variable(86, name="w")
+      with session.Session(server.target, config=sharing_config) as sess:
+        with self.assertRaises(errors_impl.InvalidArgumentError):
+          sess.run(w_scalar.initializer)
+
+    with ops.Graph().as_default():
+      w_scalar = variables.Variable(37, name="w")
+      with session.Session(server.target, config=isolate_config) as sess:
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          sess.run(w_scalar)
+        sess.run(w_scalar.initializer)
+        self.assertAllEqual(37, sess.run(w_scalar))
+
 
 class ServerDefTest(test.TestCase):
 
@@ -332,6 +421,17 @@ class ServerDefTest(test.TestCase):
 
 class ClusterSpecTest(test.TestCase):
 
+  def testStringConversion(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:1111"],
+        "worker": ["worker0:3333", "worker1:4444"]
+    })
+
+    expected_str = (
+        "ClusterSpec({'ps': ['ps0:1111'], 'worker': ['worker0:3333', "
+        "'worker1:4444']})")
+    self.assertEqual(expected_str, str(cluster_spec))
+
   def testProtoDictDefEquivalences(self):
     cluster_spec = server_lib.ClusterSpec({
         "ps": ["ps0:2222", "ps1:2222"],
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index a634a842b67033d5fde6bf8cf819f681e892a247..e4514aaea223b6b254a7a72e11e6b70b576fd54b 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -36,11 +36,15 @@ from tensorflow.python.training import coordinator
 from tensorflow.python.training import saver as saver_mod
 from tensorflow.python.training import session_manager as session_manager_mod
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
 
 
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
+  This class is deprecated. Please use
+  ${tf.train.MonitoredTrainingSession} instead.
+
   The Supervisor is a small wrapper around a `Coordinator`, a `Saver`,
   and a `SessionManager` that takes care of common needs of TensorFlow
   training programs.
@@ -198,6 +202,8 @@ class Supervisor(object):
   # the default behavior should be used.
   USE_DEFAULT = 0
 
+  @deprecation.deprecated(None,
+                          "Please switch to tf.train.MonitoredTrainingSession")
   def __init__(self,
                graph=None,
                ready_op=USE_DEFAULT,
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index dcf14408c77984a376e773ee7f9b527e779f6447..47702fdad05d13015e0cbf7768129b0c53b6c14c 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -99,7 +99,7 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
   # Note that if you want to have 2 backup replicas, you can change
   # total_num_replicas=52 and make sure this number matches how many physical
   # replicas you started in your job.
-  opt = tf.SyncReplicasOptimizer(opt, replicas_to_aggregate=50,
+  opt = tf.train.SyncReplicasOptimizer(opt, replicas_to_aggregate=50,
                                  total_num_replicas=50)
 
   # Some models have startup_delays to help stabilize the model but when using
@@ -374,6 +374,17 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     """
     return self._opt.get_slot(*args, **kwargs)
 
+  def variables(self):
+    """Fetches a list of optimizer variables in the default graph.
+
+    This wraps `variables()` from the actual optimizer. It does not include
+    the `SyncReplicasOptimizer`'s local step.
+
+    Returns:
+      A list of variables.
+    """
+    return self._opt.variables()
+
   def get_slot_names(self, *args, **kwargs):
     """Return a list of the names of slots created by the `Optimizer`.
 
@@ -438,7 +449,7 @@ class _SyncReplicasOptimizerHook(session_run_hook.SessionRunHook):
   """A SessionRunHook handles ops related to SyncReplicasOptimizer."""
 
   def __init__(self, sync_optimizer, is_chief, num_tokens):
-    """Creates hook to handle SyncReplicaOptimizer initialization ops.
+    """Creates hook to handle SyncReplicasOptimizer initialization ops.
 
     Args:
       sync_optimizer: `SyncReplicasOptimizer` which this hook will initialize.
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 85e8a8a4bb1dba6f0c4e7e1059b07816d893347f..297284f80c2997e21304138c5a090da76425917b 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework.test_util import create_local_cluster
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import training
 
@@ -276,6 +277,18 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
     opt.minimize(v, global_step=global_step)
     hook.begin()
 
+  def testFetchVariableList(self):
+    opt = training.SyncReplicasOptimizer(
+        opt=adam.AdamOptimizer(0.01),
+        replicas_to_aggregate=1,
+        total_num_replicas=1)
+    v = variables.Variable([0.], name="fetch_variable_test")
+    global_step = variables.Variable(0, name="global_step", trainable=False)
+    opt.minimize(v, global_step=global_step)
+    opt_variables = opt.variables()
+    self.assertIn(opt._opt._beta1_power, opt_variables)
+    self.assertIn(opt._opt._beta2_power, opt_variables)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/training_util.py b/tensorflow/python/training/training_util.py
index 89a9e129328fe38da2ce497a7f26dc11446ea032..2a42ff200380dc1b1b001001bf5a14dc8b5eb398 100644
--- a/tensorflow/python/training/training_util.py
+++ b/tensorflow/python/training/training_util.py
@@ -23,6 +23,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -221,7 +222,6 @@ def _get_or_create_global_step_read(graph=None):
   global_step_tensor = get_global_step(graph)
   if global_step_tensor is None:
     return None
-  # add 'zero' so that it will create a copy of variable as Tensor.
   with graph.as_default() as g, g.name_scope(None):
     with g.name_scope(global_step_tensor.op.name + '/'):
       # using initialized_value to ensure that global_step is initialized before
@@ -229,7 +229,10 @@ def _get_or_create_global_step_read(graph=None):
       # under global_step_read_tensor dependency.
       global_step_value = global_step_tensor.initialized_value() if isinstance(
           global_step_tensor, variables.Variable) else global_step_tensor
-      global_step_read_tensor = global_step_value + 0
+      # pylint: disable=protected-access
+      # We use the snapshot kernel to make sure a copy is made of this tensor.
+      global_step_read_tensor = gen_array_ops._snapshot(global_step_value)
+      # pylint: enable=protected-access
       ops.add_to_collection(GLOBAL_STEP_READ_KEY, global_step_read_tensor)
   return _get_global_step_read(graph)
 
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index dd6acee3c7537827808ec98561f3ea7fd80910d0..5c066e2bef1eb557b81b4996a4848fb18318ab4e 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -116,7 +116,7 @@ def flatten(nest):
   used instead. The same convention is followed in `pack_sequence_as`. This
   correctly repacks dicts and `OrderedDict`s after they have been flattened,
   and also allows flattening an `OrderedDict` and then repacking it back using
-  a correponding plain dict, or vice-versa.
+  a corresponding plain dict, or vice-versa.
   Dictionaries with non-sortable keys cannot be flattened.
 
   Users must not modify any collections used in `nest` while this function is
@@ -293,10 +293,10 @@ def pack_sequence_as(structure, flat_sequence):
   If `structure` is or contains a dict instance, the keys will be sorted to
   pack the flat sequence in deterministic order. This is true also for
   `OrderedDict` instances: their sequence order is ignored, the sorting order of
-  keys is used instead. The same convention is followed in `pack_sequence_as`.
+  keys is used instead. The same convention is followed in `flatten`.
   This correctly repacks dicts and `OrderedDict`s after they have been
   flattened, and also allows flattening an `OrderedDict` and then repacking it
-  back using a correponding plain dict, or vice-versa.
+  back using a corresponding plain dict, or vice-versa.
   Dictionaries with non-sortable keys cannot be flattened.
 
   Args:
@@ -452,6 +452,17 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
           "structure has length %s, while shallow structure has length %s."
           % (len(input_tree), len(shallow_tree)))
 
+    if check_types and isinstance(shallow_tree, dict):
+      if set(input_tree) != set(shallow_tree):
+        raise ValueError(
+            "The two structures don't have the same keys. Input "
+            "structure has keys %s, while shallow structure has keys %s." %
+            (list(_six.iterkeys(input_tree)),
+             list(_six.iterkeys(shallow_tree))))
+
+      input_tree = list(_six.iteritems(input_tree))
+      shallow_tree = list(_six.iteritems(shallow_tree))
+
     for shallow_branch, input_branch in zip(shallow_tree, input_tree):
       assert_shallow_structure(shallow_branch, input_branch,
                                check_types=check_types)
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index c4020f4f3ce62d00718a9769111f7a24b9c0c70b..3d9e9f96849c1b7415892ec9341947565ed89664 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -385,6 +385,16 @@ class NestTest(test.TestCase):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
     nest.assert_shallow_structure(inp_ab2, inp_ab1, check_types=False)
 
+    inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
+    inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
+    expected_message = (
+        r"The two structures don't have the same keys. Input "
+        r"structure has keys \['c'\], while shallow structure has "
+        r"keys \['d'\].")
+
+    with self.assertRaisesRegexp(ValueError, expected_message):
+      nest.assert_shallow_structure(inp_ab2, inp_ab1)
+
   def testFlattenUpTo(self):
     # Shallow tree ends at scalar.
     input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
@@ -429,8 +439,7 @@ class NestTest(test.TestCase):
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
                                                               input_tree)
     self.assertEqual(input_tree_flattened_as_shallow_tree, [0, 1, 2, 3, 4])
-    shallow_tree = collections.OrderedDict([("a", 0),
-                                            ("b", {"d": 3, "e": 1})])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", {"d": 3, "e": 1})])
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
                                                               input_tree)
     self.assertEqual(input_tree_flattened_as_shallow_tree,
diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i
index 0cd095d9d947f5cf76adaf83dc16272c4374573e..8004898cbcbce7ce593ce35efdc6493e052468bd 100644
--- a/tensorflow/python/util/py_checkpoint_reader.i
+++ b/tensorflow/python/util/py_checkpoint_reader.i
@@ -164,6 +164,8 @@ def NewCheckpointReader(filepattern):
   with errors.raise_exception_on_not_ok_status() as status:
     from tensorflow.python.util import compat
     return CheckpointReader(compat.as_bytes(filepattern), status)
+
+NewCheckpointReader._tf_api_names = ['train.NewCheckpointReader']
 %}
 
 %include "tensorflow/c/checkpoint_reader.h"
diff --git a/tensorflow/python/util/stat_summarizer.i b/tensorflow/python/util/stat_summarizer.i
index 80739195872a056e7a5443dfb81ab1440300dbff..6aeaa0e31b9b48f7e6705ab7146828cc0e0e5e08 100644
--- a/tensorflow/python/util/stat_summarizer.i
+++ b/tensorflow/python/util/stat_summarizer.i
@@ -27,8 +27,8 @@ limitations under the License.
 
 %ignoreall
 
-%unignore NewStatSummarizer;
-%unignore DeleteStatSummarizer;
+%unignore _NewStatSummarizer;
+%unignore _DeleteStatSummarizer;
 %unignore tensorflow;
 %unignore tensorflow::StatSummarizer;
 %unignore tensorflow::StatSummarizer::StatSummarizer;
@@ -43,21 +43,20 @@ limitations under the License.
 
 // TODO(ashankar): Remove the unused argument from the API.
 %{
-tensorflow::StatSummarizer* NewStatSummarizer(
+tensorflow::StatSummarizer* _NewStatSummarizer(
       const string& unused) {
   return new tensorflow::StatSummarizer(tensorflow::StatSummarizerOptions());
 }
 %}
 
-
 %{
-void DeleteStatSummarizer(tensorflow::StatSummarizer* ss) {
+void _DeleteStatSummarizer(tensorflow::StatSummarizer* ss) {
   delete ss;
 }
 %}
 
-tensorflow::StatSummarizer* NewStatSummarizer(const string& unused);
-void DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
+tensorflow::StatSummarizer* _NewStatSummarizer(const string& unused);
+void _DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
 
 %extend tensorflow::StatSummarizer {
   void ProcessStepStatsStr(const string& step_stats_str) {
@@ -77,3 +76,21 @@ void DeleteStatSummarizer(tensorflow::StatSummarizer* ss);
 
 %include "tensorflow/core/util/stat_summarizer.h"
 %unignoreall
+
+%insert("python") %{
+
+# Wrapping NewStatSummarizer and DeletStatSummarizer because
+# SWIG-generated functions are built-in functions and do not support
+# setting _tf_api_names attribute.
+
+def NewStatSummarizer(unused):
+  return _NewStatSummarizer(unused)
+
+def DeleteStatSummarizer(stat_summarizer):
+  _DeleteStatSummarizer(stat_summarizer)
+
+NewStatSummarizer._tf_api_names = ["contrib.stat_summarizer.NewStatSummarizer"]
+DeleteStatSummarizer._tf_api_names = [
+    "contrib.stat_summarizer.DeleteStatSummarizer"]
+StatSummarizer._tf_api_names = ["contrib.stat_summarizer.StatSummarizer"]
+%}
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 9ed125704b1cf2ced585db0b169a184d27e1ad72..d14e71038851db80a3837254bb2e0d694480fe40 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -45,6 +45,26 @@ def getargspec(object):  # pylint: disable=redefined-builtin
                if d.decorator_argspec is not None), _inspect.getargspec(target))
 
 
+def getfullargspec(obj):  # pylint: disable=redefined-builtin
+  """TFDecorator-aware replacement for inspect.getfullargspec and fallback to
+  inspect.getargspec in Python 2.
+
+  Args:
+    obj: A callable, possibly decorated.
+
+  Returns:
+    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    the outermost decorator that changes the callable's signature. If the
+    callable is not decorated, `inspect.getfullargspec()`
+    (`inspect.getargspec()` in Python 2) will be called directly on the
+    callable.
+  """
+  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  decorators, target = tf_decorator.unwrap(obj)
+  return next((d.decorator_argspec for d in decorators
+               if d.decorator_argspec is not None), spec_fn(target))
+
+
 def getcallargs(func, *positional, **named):
   """TFDecorator-aware replacement for inspect.getcallargs.
 
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index 99081cb29470900992f4583445817521e8dd2553..37733152e8ec6d7b026bf74e69e33bfe8f9f4e89 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -22,6 +22,7 @@ import types
 
 import six  # pylint: disable=unused-import
 
+from tensorflow.python.eager import context
 from tensorflow.python.util import tf_decorator
 # pylint: enable=g-bad-import-order,g-import-not-at-top
 
@@ -31,6 +32,8 @@ from tensorflow.python.util import tf_decorator
 def _add_should_use_warning(x, fatal_error=False):
   """Wraps object x so that if it is never used, a warning is logged.
 
+  Does nothing when executing eagerly.
+
   Args:
     x: Python object.
     fatal_error: Python bool.  If `True`, tf.logging.fatal is raised
@@ -41,12 +44,13 @@ def _add_should_use_warning(x, fatal_error=False):
     and is a very shallow wrapper for `x` which logs access into `x`.
   """
   del fatal_error
-  if x is None:  # special corner case where x is None
+  if x is None or x == []:  # pylint: disable=g-explicit-bool-comparison
     return x
 
-  # TODO(apassos) we don't have an easier way to check because importing context
-  # or ops here would create a BUILD dependency cycle.
-  if type(x).__name__ == 'EagerTensor':
+  if context.in_eager_mode():
+    # Typically not needed when executing eagerly (the main use case is for ops
+    # which need to be incorporated into the graph), and even the no-op wrapper
+    # creates reference cycles which require garbage collection.
     return x
 
   def override_method(method):
@@ -102,6 +106,8 @@ def should_use_result(fn):
   - `t != 0`.  In this case, comparison is done on types / ids.
   - `isinstance(t, tf.Tensor)`.  Similar to above.
 
+  Does nothing when executing eagerly.
+
   Args:
     fn: The function to wrap.
 
@@ -136,6 +142,8 @@ def must_use_result_or_fatal(fn):
   - `t != 0`.  In this case, comparison is done on types / ids.
   - `isinstance(t, tf.Tensor)`.  Similar to above.
 
+  Does nothing when executing eagerly.
+
   Args:
     fn: The function to wrap.
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 039f7ea0292bc14978e35f806062b9fc383ac575..1e26f53ae1bd1abb75b1b9d8010d7948c2412338 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -232,7 +232,6 @@ CUDNN_DNN_ROUTINE_EACH_R3(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
   __macro(cudnnRNNBackwardData)                               \
   __macro(cudnnRNNBackwardWeights)                            \
   __macro(cudnnSetRNNDescriptor)                              \
-  __macro(cudnnSetRNNDescriptor_v6)                           \
   __macro(cudnnGetFilterNdDescriptor)
 
 // clang-format on
@@ -245,7 +244,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 // clang-format off
 #if CUDNN_VERSION >= 6000
 #define CUDNN_DNN_ROUTINE_EACH_R6(__macro)                    \
-  __macro(cudnnConvolutionBiasActivationForward)
+  __macro(cudnnConvolutionBiasActivationForward)              \
+  __macro(cudnnSetRNNDescriptor_v6)
 
 // clang-format on
 CUDNN_DNN_ROUTINE_EACH_R6(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
@@ -390,8 +390,8 @@ port::Status CudnnSupport::Init() {
                  << DriverVersionStatusToString(result);
     } else {
       const auto& version = result.ValueOrDie();
-      LOG(INFO) << "possibly insufficient driver version: "
-                << DriverVersionToString(version);
+      LOG(ERROR) << "possibly insufficient driver version: "
+                 << DriverVersionToString(version);
       // OS X kernel driver does not report version accurately
 #if !defined(__APPLE__)
       if (std::get<0>(version) < 340) {
@@ -561,7 +561,7 @@ static bool TensorOpMathEnabled() {
   static bool is_enabled = [] {
     bool ret;
     TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DISABLE_TENSOR_OP_MATH",
-                                               /*default=*/false, &ret));
+                                               /*default_val=*/false, &ret));
     return !ret;
   }();
   return is_enabled;
@@ -665,7 +665,6 @@ class ScopedPoolingDescriptor {
       LOG(FATAL) << "could not create cudnn pooling descriptor: "
                  << ToString(status);
     }
-
     const std::vector<int64> strides64 = pooling_descriptor.strides();
     const std::vector<int64> padding64 = pooling_descriptor.padding();
     const std::vector<int64> shape64 = pooling_descriptor.window();
@@ -680,14 +679,14 @@ class ScopedPoolingDescriptor {
                    &CheckedNarrowing<int64, int>);
     std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
                    &CheckedNarrowing<int64, int>);
+    bool propagate_nans = pooling_descriptor.propagate_nans();
     status = wrap::cudnnSetPoolingNdDescriptor(
         parent_, handle_,
         (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
              ? CUDNN_POOLING_MAX
              : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
 #if CUDNN_VERSION >= 5000
-        // Always propagate nans.
-        CUDNN_PROPAGATE_NAN,
+        propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN,
 #endif
         nd, shape.data(), padding.data(), strides.data());
     if (status != CUDNN_STATUS_SUCCESS) {
@@ -962,7 +961,8 @@ class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
       if (!allocated.ok() ||
           (state_memory = allocated.ValueOrDie()) == nullptr) {
         string error_msg =
-            port::StrCat("Fail to allocate Cudnn dropout state memory");
+            port::StrCat("Failed to allocate Cudnn dropout state memory of ",
+                         state_sizes_in_bytes, " bytes.");
         status_ = port::Status(port::error::UNKNOWN, error_msg);
         LOG(ERROR) << error_msg;
         return;
@@ -971,7 +971,10 @@ class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
     status = wrap::cudnnSetDropoutDescriptor(parent_, handle_, cudnn_handle,
                                              dropout, state_memory.opaque(),
                                              state_memory.size(), seed);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to set dropout descriptor");
+    CUDNN_RETURN_IF_FAIL(
+        status, port::StrCat(
+                    "Failed to set dropout descriptor with state memory size: ",
+                    state_memory.size(), " bytes."));
   }
 
   ~CudnnDropoutDescriptor() {
@@ -1476,7 +1479,8 @@ bool CreateRnnWorkspace(Stream* stream, CUDAExecutor* parent,
     auto allocated =
         workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
     if (!allocated.ok() || (*workspace = allocated.ValueOrDie()) == nullptr) {
-      LOG(ERROR) << "Failed to allocate RNN workspace";
+      LOG(ERROR) << port::StrCat("Failed to allocate RNN workspace of ",
+                                 workspace_size_in_bytes, " bytes.");
       return false;
     }
   } else {
@@ -1553,7 +1557,8 @@ bool CudnnSupport::DoRnnForwardImpl(
           stream, reserve_space_size_in_bytes);
       if (!allocated.ok() ||
           (reserve_space = allocated.ValueOrDie()) == nullptr) {
-        LOG(ERROR) << "Fail to allocate RNN reserve space";
+        LOG(ERROR) << "Failed to allocate RNN reserve space of "
+                   << reserve_space_size_in_bytes << " bytes.";
         return false;
       }
     }
@@ -1782,6 +1787,49 @@ CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
 #endif  // CUDNN_VERSION
 }
 
+bool CudnnSupport::DoRnnForward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<Eigen::half>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<Eigen::half>& input_c_data,
+    const DeviceMemory<Eigen::half>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    DeviceMemory<Eigen::half>* output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    DeviceMemory<Eigen::half>* output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    DeviceMemory<Eigen::half>* output_c_data, bool is_training,
+    ScratchAllocator* reserve_space_allocator,
+    ScratchAllocator* workspace_allocator) {
+#if CUDNN_VERSION >= 5000
+  const CudnnRnnDescriptor& cudnn_rnn_desc =
+      static_cast<const CudnnRnnDescriptor&>(rnn_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(input_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_c_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_output_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(output_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnForwardImpl<Eigen::half>(
+      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
+      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
+      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+#else
+  return false;
+#endif  // CUDNN_VERSION
+}
+
 bool CudnnSupport::DoRnnForward(
     Stream* stream, const dnn::RnnDescriptor& rnn_desc,
     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -1867,6 +1915,59 @@ bool CudnnSupport::DoRnnForward(
 #endif  // CUDNN_VERSION
 }
 
+bool CudnnSupport::DoRnnBackward(
+    Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor& input_desc,
+    const DeviceMemory<Eigen::half>& input_data,
+    const dnn::RnnStateTensorDescriptor& input_h_desc,
+    const DeviceMemory<Eigen::half>& input_h_data,
+    const dnn::RnnStateTensorDescriptor& input_c_desc,
+    const DeviceMemory<Eigen::half>& input_c_data,
+    const DeviceMemory<Eigen::half>& params,
+    const dnn::RnnSequenceTensorDescriptor& output_desc,
+    const DeviceMemory<Eigen::half>& output_data,
+    const dnn::RnnStateTensorDescriptor& output_h_desc,
+    const DeviceMemory<Eigen::half>& output_h_data,
+    const dnn::RnnStateTensorDescriptor& output_c_desc,
+    const DeviceMemory<Eigen::half>& output_c_data,
+    const DeviceMemory<Eigen::half>& output_backprop_data,
+    const DeviceMemory<Eigen::half>& output_h_backprop_data,
+    const DeviceMemory<Eigen::half>& output_c_backprop_data,
+    DeviceMemory<Eigen::half>* input_backprop_data,
+    DeviceMemory<Eigen::half>* input_h_backprop_data,
+    DeviceMemory<Eigen::half>* input_c_backprop_data,
+    DeviceMemory<Eigen::half>* params_backprop_data,
+    DeviceMemory<uint8>* reserve_space_data,
+    ScratchAllocator* workspace_allocator) {
+#if CUDNN_VERSION >= 5000
+  const CudnnRnnDescriptor& cudnn_rnn_desc =
+      static_cast<const CudnnRnnDescriptor&>(rnn_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_input_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(input_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_input_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(input_c_desc);
+  const CudnnRnnSequenceTensorDescriptor& cudnn_output_desc =
+      static_cast<const CudnnRnnSequenceTensorDescriptor&>(output_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_h_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
+  const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
+      static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
+
+  return DoRnnBackwardImpl<Eigen::half>(
+      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
+      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
+      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
+      output_c_data, output_backprop_data, output_h_backprop_data,
+      output_c_backprop_data, input_backprop_data, input_h_backprop_data,
+      input_c_backprop_data, params_backprop_data, reserve_space_data,
+      workspace_allocator);
+#else
+  return false;
+#endif  // CUDNN_VERSION
+}
+
 bool CudnnSupport::DoRnnBackward(
     Stream* stream, const dnn::RnnDescriptor& rnn_desc,
     const dnn::RnnSequenceTensorDescriptor& input_desc,
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 8d7069a9029625d187de9eda7ba962e5c7b7d0f0..14986286f1dd4c4ced1ebaf6adbada8e52096b92 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -62,6 +62,23 @@ class CudnnSupport : public dnn::DnnSupport {
   createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
                                  dnn::DataType data_type) override;
 
+  bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                    const dnn::RnnSequenceTensorDescriptor& input_desc,
+                    const DeviceMemory<Eigen::half>& input_data,
+                    const dnn::RnnStateTensorDescriptor& input_h_desc,
+                    const DeviceMemory<Eigen::half>& input_h_data,
+                    const dnn::RnnStateTensorDescriptor& input_c_desc,
+                    const DeviceMemory<Eigen::half>& input_c_data,
+                    const DeviceMemory<Eigen::half>& params,
+                    const dnn::RnnSequenceTensorDescriptor& output_desc,
+                    DeviceMemory<Eigen::half>* output_data,
+                    const dnn::RnnStateTensorDescriptor& output_h_desc,
+                    DeviceMemory<Eigen::half>* output_h_data,
+                    const dnn::RnnStateTensorDescriptor& output_c_desc,
+                    DeviceMemory<Eigen::half>* output_c_data, bool is_training,
+                    ScratchAllocator* reserve_space_allocator,
+                    ScratchAllocator* workspace_allocator) override;
+
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
                     const DeviceMemory<float>& input_data,
@@ -96,6 +113,30 @@ class CudnnSupport : public dnn::DnnSupport {
                     ScratchAllocator* reserve_space_allocator,
                     ScratchAllocator* workspace_allocator) override;
 
+  bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                     const dnn::RnnSequenceTensorDescriptor& input_desc,
+                     const DeviceMemory<Eigen::half>& input_data,
+                     const dnn::RnnStateTensorDescriptor& input_h_desc,
+                     const DeviceMemory<Eigen::half>& input_h_data,
+                     const dnn::RnnStateTensorDescriptor& input_c_desc,
+                     const DeviceMemory<Eigen::half>& input_c_data,
+                     const DeviceMemory<Eigen::half>& params,
+                     const dnn::RnnSequenceTensorDescriptor& output_desc,
+                     const DeviceMemory<Eigen::half>& output_data,
+                     const dnn::RnnStateTensorDescriptor& output_h_desc,
+                     const DeviceMemory<Eigen::half>& output_h_data,
+                     const dnn::RnnStateTensorDescriptor& output_c_desc,
+                     const DeviceMemory<Eigen::half>& output_c_data,
+                     const DeviceMemory<Eigen::half>& output_backprop_data,
+                     const DeviceMemory<Eigen::half>& output_h_backprop_data,
+                     const DeviceMemory<Eigen::half>& output_c_backprop_data,
+                     DeviceMemory<Eigen::half>* input_backprop_data,
+                     DeviceMemory<Eigen::half>* input_h_backprop_data,
+                     DeviceMemory<Eigen::half>* input_c_backprop_data,
+                     DeviceMemory<Eigen::half>* params_backprop_data,
+                     DeviceMemory<uint8>* reserve_space_data,
+                     ScratchAllocator* workspace_allocator) override;
+
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
                      const DeviceMemory<float>& input_data,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b6a96ed3e5cbda044c00bb9b940d68f80373587a..a017ff64d4c69b6952b442464877dc26a800ad37 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -1115,19 +1115,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronizeStream(CudaContext* context,
-                                                CUstream stream) {
+/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
+                                                        CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
   CUresult res = cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "could not synchronize on CUDA stream: " << ToString(res)
-               << " :: " << port::CurrentStackTrace();
-    return false;
+    port::Status status = port::InternalError(
+        port::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
+    LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
+    return status;
   }
   VLOG(2) << "successfully synchronized stream " << stream << " on context "
           << context;
-  return true;
+  return port::Status::OK();
 }
 
 /* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 68494aba6597c2cd1ee52a7b4cb411cd50fad77b..4002ba2021d1a2e2c36bd1786a3084ee8c08bb78 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -304,7 +304,7 @@ class CUDADriver {
   // amount of time?
   //
   // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
-  static bool SynchronizeStream(CudaContext* context, CUstream stream);
+  static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
 
   // Blocks the calling thread until the operations associated with the context
   // have been completed, via cuCtxSynchronize.
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 33f6c628e8c12cc25b8a83a1580e8c691a09a68e..7f8a7ca7c78a9050405e19d4b6c01a1cb4b3fb46 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -108,11 +108,6 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
   return AsCudaDevicePtr(*gpu_mem);
 }
 
-static CudaContext* GetCudaContext(Stream *stream) {
-  return static_cast<CUDAExecutor *>(stream->parent()->implementation())
-      ->cuda_context();
-}
-
 CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
   CHECK(cuda_exec != nullptr);
   return cuda_exec->cuda_context();
@@ -123,12 +118,8 @@ CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
 }
 
 CUDAExecutor::~CUDAExecutor() {
-  for (auto &it : disk_modules_) {
-    CUDADriver::UnloadModule(context_, it.second);
-  }
-  for (auto &it : in_memory_modules_) {
-    CUDADriver::UnloadModule(context_, it.second);
-  }
+  CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
+  CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
   if (context_ != nullptr) {
     CUDADriver::DestroyContext(context_);
   }
@@ -219,21 +210,34 @@ static string GetBinaryDir(bool strip_exe) {
 bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
                              KernelBase *kernel) {
   CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
-  CUmodule module = nullptr;
+  CUmodule module;
   const string *kernelname;
 
-  const OnDiskKernelLoaderSpec *on_disk_spec = nullptr;
-  bool has_ptx = spec.has_cuda_ptx_on_disk();
-  bool has_cubin = spec.has_cuda_cubin_on_disk();
-  if (has_cubin && (!has_ptx || FLAGS_prefer_cubin_to_ptx)) {
-    on_disk_spec = &spec.cuda_cubin_on_disk();
-  } else if (has_ptx) {
-    on_disk_spec = &spec.cuda_ptx_on_disk();
-  }
+  VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
 
-  if (on_disk_spec != nullptr) {
-    LOG(WARNING) << "loading CUDA kernel from disk is not supported";
-    return false;
+  if (spec.has_cuda_cubin_in_memory()) {
+    kernelname = &spec.cuda_cubin_in_memory().kernelname();
+    const char *cubin = spec.cuda_cubin_in_memory().bytes();
+    mutex_lock lock{in_memory_modules_mu_};
+    uint64_t module_refcount;
+    std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
+
+    if (module == nullptr) {
+      auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
+      if (!load_status.ok()) {
+        LOG(ERROR) << "failed to load CUBIN: " << load_status;
+        return false;
+      }
+      module_refcount = 1;
+      VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+              << " as module " << module;
+    } else {
+      ++module_refcount;
+      VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+              << " is already loaded as module " << module;
+    }
+    kernel_to_gpu_binary_[kernel] = cubin;
+    gpu_binary_to_module_[cubin] = {module, module_refcount};
   } else if (spec.has_cuda_ptx_in_memory()) {
     kernelname = &spec.cuda_ptx_in_memory().kernelname();
 
@@ -241,62 +245,39 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
       return false;
     }
 
-    // Note that the orignal ptx may be compressed, and the ptx we get below is
-    // the decompressed result. To cache the module we should use the original
-    // ptx (compressed one) as the key. This is because for the same compressed
-    // ptx, we may get different decompressed ptx wrt the pointer value.
     const char *ptx = spec.cuda_ptx_in_memory().text(cc_major_, cc_minor_);
-    const char *orig_ptx =
-        spec.cuda_ptx_in_memory().original_text(cc_major_, cc_minor_);
-    if (ptx == nullptr || orig_ptx == nullptr) {
+    if (ptx == nullptr) {
       ptx = spec.cuda_ptx_in_memory().default_text();
-      orig_ptx = spec.cuda_ptx_in_memory().original_default_text();
     }
-    if (ptx == nullptr || orig_ptx == nullptr) {
-      LOG(FATAL) << "could not load ptx for kernel " << kernelname;
+    if (ptx == nullptr) {
+      LOG(FATAL) << "loader spec has no ptx for kernel " << *kernelname;
       return false;
     }
 
     mutex_lock lock{in_memory_modules_mu_};
-    module = in_memory_modules_[orig_ptx];
+    uint64_t module_refcount;
+    std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
 
     if (module == nullptr) {
-      if (g_cubinate == nullptr) {
-        if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
-          return false;
-        }
-      } else {
-        string cubin = g_cubinate(ptx);
-        auto load_status =
-            CUDADriver::LoadCubin(context_, cubin.c_str(), &module);
-        if (!load_status.ok()) {
-          LOG(ERROR) << "failed to load cubin via hook: " << load_status;
-          return false;
-        }
-      }
-      in_memory_modules_[orig_ptx] = module;
-    }
-  } else if (spec.has_cuda_cubin_in_memory()) {
-    kernelname = &spec.cuda_cubin_in_memory().kernelname();
-    const char *cubin = spec.cuda_cubin_in_memory().bytes();
-    mutex_lock lock{in_memory_modules_mu_};
-    module = in_memory_modules_[cubin];
-
-    if (module == nullptr) {
-      auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
-      if (!load_status.ok()) {
-        LOG(ERROR) << "failed to load CUBIN: " << load_status;
+      if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
+        LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
         return false;
       }
-
-      in_memory_modules_[cubin] = module;
+      VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
+              << " as module " << module;
+      module_refcount = 1;
+    } else {
+      ++module_refcount;
+      VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+              << " is already loaded as module " << module;
     }
+    kernel_to_gpu_binary_[kernel] = ptx;
+    gpu_binary_to_module_[ptx] = {module, module_refcount};
   } else {
     LOG(WARNING) << "no method of loading CUDA kernel provided";
     return false;
   }
-
-  VLOG(2) << "getting function " << kernelname << " from module " << module;
+  VLOG(2) << "getting function " << *kernelname << " from module " << module;
   if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
                                      cuda_kernel->cuda_function_ptr())) {
     return false;
@@ -308,13 +289,44 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
 
   KernelMetadata kernel_metadata;
   if (!GetKernelMetadata(cuda_kernel, &kernel_metadata)) {
-    LOG(WARNING) << "Unable to get metadata for kernel " << kernelname;
+    LOG(WARNING) << "unable to get metadata for kernel " << *kernelname;
   }
   kernel->set_metadata(kernel_metadata);
   kernel->set_name(*kernelname);
   return true;
 }
 
+void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
+  VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
+
+  mutex_lock lock{in_memory_modules_mu_};
+  auto gpu_binary_it = kernel_to_gpu_binary_.find(kernel);
+  if (kernel_to_gpu_binary_.end() == gpu_binary_it) {
+    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
+            << " has never been loaded.";
+    return;  // We've never seen this kernel.
+  }
+  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
+          << " has loaded GPU code " << gpu_binary_it->second;
+  auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
+  if (gpu_binary_to_module_.end() == module_it) {
+    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
+            << " has no loaded CUDA module.";
+    return;  // This kernel never loaded any modules
+  }
+  auto &module = module_it->second.first;
+  auto &refcount = module_it->second.second;
+  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
+          << " has loaded GPU code " << gpu_binary_it->second
+          << " into CUDA module " << module << " with refcount " << refcount;
+  if (--refcount == 0) {
+    VLOG(3) << "Unloading CUDA module " << module;
+    CUDADriver::UnloadModule(context_, module);
+    gpu_binary_to_module_.erase(module_it);
+  }
+  kernel_to_gpu_binary_.erase(gpu_binary_it);
+}
+
 bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
                                      KernelMetadata *kernel_metadata) {
   int value;
@@ -363,14 +375,14 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 
   void **kernel_params = const_cast<void **>(args.argument_addresses().data());
 
-  if (!CUDADriver::LaunchKernel(GetCudaContext(stream), cufunc, block_dims.x,
-                                block_dims.y, block_dims.z, thread_dims.x,
-                                thread_dims.y, thread_dims.z,
-                                args.number_of_shared_bytes(), custream,
-                                kernel_params, nullptr /* = extra */)) {
-    LOG(ERROR) << "failed to launch CUDA kernel with args: "
+  if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
+                                block_dims.z, thread_dims.x, thread_dims.y,
+                                thread_dims.z, args.number_of_shared_bytes(),
+                                custream, kernel_params,
+                                nullptr /* = extra */)) {
+    LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
                << args.number_of_arguments()
-               << "; thread dim: " << thread_dims.ToString()
+               << " args; thread dim: " << thread_dims.ToString()
                << "; block dim: " << block_dims.ToString();
     return false;
   }
@@ -652,7 +664,7 @@ bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
   return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
 }
 
-bool CUDAExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status CUDAExecutor::BlockHostUntilDoneWithStatus(Stream *stream) {
   return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
 }
 
@@ -774,21 +786,12 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
 
 bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
                              size_t *bytes) {
-  {  // give limited scope to mutex_lock
-    mutex_lock lock{disk_modules_mu_};
-    for (auto &it : disk_modules_) {
-      if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
-                                      reinterpret_cast<CUdeviceptr *>(mem),
-                                      bytes)) {
-        return true;
-      }
-    }
-  }
-
   {  // give limited scope to mutex_lock
     mutex_lock lock{in_memory_modules_mu_};
-    for (auto &it : in_memory_modules_) {
-      if (CUDADriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+    for (auto &it : gpu_binary_to_module_) {
+      CUmodule module = it.second.first;
+      CHECK(module != nullptr);
+      if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
                                       reinterpret_cast<CUdeviceptr *>(mem),
                                       bytes)) {
         return true;
@@ -922,16 +925,129 @@ struct UnqueryableDeviceParams {
   uint64 shared_memory_alloc_granularity;
 };
 
+// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+// https://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls
 static const UnqueryableDeviceParams kAllUnqueryableDeviceParams[] = {
-  {
-    3, 5,       // compute capability (3.5)
-    16,         // blocks_per_core_limit
-    64 * 1024,  // registers_per_core_limit
-    255,        // registers_per_thread_limit
-    4,          // warp_alloc_granularity
-    256,        // register_alloc_granularity
-    256         // shared_memory_alloc_granularity
-  }
+    {
+        2, 0,       // compute capability (2.0)
+        8,          // blocks_per_core_limit
+        32 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        64,         // register_alloc_granularity
+        128,        // shared_memory_alloc_granularity
+    },
+    {
+        2, 1,       // compute capability (2.1)
+        8,          // blocks_per_core_limit
+        32 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        64,         // register_alloc_granularity
+        128,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 0,       // compute capability (3.0)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        63,         // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 2,       // compute capability (3.2)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 5,       // compute capability (3.5)
+        16,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        3, 7,        // compute capability (3.7)
+        16,          // blocks_per_core_limit
+        128 * 1024,  // registers_per_core_limit
+        255,         // registers_per_thread_limit
+        4,           // warp_alloc_granularity
+        256,         // register_alloc_granularity
+        256,         // shared_memory_alloc_granularity
+    },
+    {
+        5, 0,       // compute capability (5.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        5, 2,       // compute capability (5.2)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        5, 3,       // compute capability (5.3)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 0,       // compute capability (6.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 1,       // compute capability (6.1)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    {
+        6, 2,       // compute capability (6.2)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        4,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
+    // TODO(jlebar): Confirm the alloc granularity values for sm_70.  These are
+    // not published in the spreadsheet linked above.  Currently we guess that
+    // they're the same as sm_60.
+    {
+        7, 0,       // compute capability (7.0)
+        32,         // blocks_per_core_limit
+        64 * 1024,  // registers_per_core_limit
+        255,        // registers_per_thread_limit
+        2,          // warp_alloc_granularity
+        256,        // register_alloc_granularity
+        256,        // shared_memory_alloc_granularity
+    },
 };
 
 DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 6c5b9dca90b8be632d084aff46657132807b8ea5..5adbb598567a6118168543f374ce9af3bfda6711 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -22,8 +22,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 
-#include <map>
 #include <set>
+#include <unordered_map>
 
 #include "tensorflow/stream_executor/cuda/cuda_kernel.h"
 #include "tensorflow/stream_executor/event.h"
@@ -62,6 +62,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   bool GetKernel(const MultiKernelLoaderSpec &spec,
                  KernelBase *kernel) override;
+  void UnloadKernel(const KernelBase *kernel) override;
 
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &k,
@@ -151,7 +152,7 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   Event::Status PollForEventStatus(Event *event) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDoneWithStatus(Stream *stream) override;
 
   int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
 
@@ -231,19 +232,15 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
                          const BlockDim &block_dims);
 
-  // Guards the on-disk-module mapping.
-  mutex disk_modules_mu_;
-
-  // Mapping from filename to CUmodule, if it was already retrieved.
-  // Multiple CUfunctions are usually obtained from a single CUmodule so we
-  // attempt to hit in this mapping first, before retrieving it.
-  std::map<string, CUmodule> disk_modules_ GUARDED_BY(disk_modules_mu_);
-
   // Guards the in-memory-module mapping.
   mutex in_memory_modules_mu_;
 
-  std::map<const char *, CUmodule> in_memory_modules_
+  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
+  std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
       GUARDED_BY(in_memory_modules_mu_);
+  // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
+  std::unordered_map<const void *, std::pair<CUmodule, uint64>>
+      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
 
   // Guards the launched kernel set.
   mutex launched_kernels_mu_;
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index 874ac1ab6574bbf95b05893f34131b2cee9acc72..3a738461489212a026197bc58777883349ba4b54 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -197,7 +197,7 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
 static void InitializeCudaPlatform() {
   // Disabling leak checking, MultiPlatformManager does not destroy its
   // registered platforms.
-  
+
   std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 07fe8a85f4c069c4b91ffaf21150a28919ee74dc..44144a06139bf8661432cb930e53ba5218aac823 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -470,6 +470,7 @@ string ConvolutionDescriptor::ToShortString() const {
 PoolingDescriptor::PoolingDescriptor(int ndims)
     : mode_(dnn::PoolingMode::kMaximum),
       ndims_(ndims),
+      propagate_nans_(false),
       window_(ndims, 0),
       padding_(ndims, 0),
       strides_(ndims, 1) {}
@@ -482,6 +483,7 @@ void PoolingDescriptor::CloneFrom(const PoolingDescriptor& other) {
   window_ = other.window_;
   padding_ = other.padding_;
   strides_ = other.strides_;
+  propagate_nans_ = other.propagate_nans_;
 }
 
 string PoolingDescriptor::ToString() const {
@@ -495,9 +497,12 @@ string PoolingDescriptor::ToString() const {
     port::Appendf(&padding, "%lld", padding_[i]);
   }
 
-  return port::Printf("{mode: %s window: %s strides: %s padding: %s}",
-                      mode_string, window.c_str(), strides.c_str(),
-                      padding.c_str());
+  const char* propagate_string = propagate_nans_ ? "Yes" : "No";
+
+  return port::Printf(
+      "{mode: %s window: %s strides: %s padding: %s propagate NaNs: %s}",
+      mode_string, window.c_str(), strides.c_str(), padding.c_str(),
+      propagate_string);
 }
 
 string PoolingDescriptor::ToShortString() const {
@@ -508,7 +513,8 @@ string PoolingDescriptor::ToShortString() const {
     port::Appendf(&padding, "_p%d:%lld", i, padding_[i]);
   }
   return port::StrCat(mode_ == dnn::PoolingMode::kMaximum ? "max" : "avg",
-                      window, strides, padding);
+                      window, strides, padding,
+                      propagate_nans_ ? "propagate_nans" : "ignore_nans");
 }
 
 // -- NormalizeDescriptor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 624357b82f5d608813d3482211d04695c966afb1..73b96de438a0009dd2f40880b4f60d50af35b494 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -661,6 +661,10 @@ class PoolingDescriptor {
     SetDim(&strides_, dim, value);
     return *this;
   }
+  PoolingDescriptor& set_propagate_nans(bool value) {
+    propagate_nans_ = value;
+    return *this;
+  }
 
   int ndims() const { return ndims_; }
   void CloneFrom(const PoolingDescriptor& other);
@@ -681,10 +685,12 @@ class PoolingDescriptor {
   std::vector<int64> window() const { return window_; }
   std::vector<int64> padding() const { return padding_; }
   std::vector<int64> strides() const { return strides_; }
+  bool propagate_nans() const { return propagate_nans_; }
 
  private:
   PoolingMode mode_;
   int ndims_;
+  bool propagate_nans_;
 
   // Stored as: ..., y, x.
   std::vector<int64> window_;
@@ -1126,7 +1132,7 @@ class DnnSupport {
   //    space in order to speed up the convolution operation.
   //  algorithm: an integer to specify which algorithm should be used for the
   //    operation. kDefaultAlgorithm means the system will pick an algorithm
-  //    by default. The coding of the algorithm is be interpretted by the
+  //    by default. The coding of the algorithm is be interpreted by the
   //    underlying implementation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
@@ -2027,6 +2033,26 @@ class DnnSupport {
   //  workspace_allocator: an allocator to create temporary workspace used in
   //    this kernel. The caller is responsible for retaining the memory long
   //    enough for the lifespan of this operation, and recycles aftewards.
+  virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+                            const dnn::RnnSequenceTensorDescriptor& input_desc,
+                            const DeviceMemory<Eigen::half>& input_data,
+                            const dnn::RnnStateTensorDescriptor& input_h_desc,
+                            const DeviceMemory<Eigen::half>& input_h_data,
+                            const dnn::RnnStateTensorDescriptor& input_c_desc,
+                            const DeviceMemory<Eigen::half>& input_c_data,
+                            const DeviceMemory<Eigen::half>& params,
+                            const dnn::RnnSequenceTensorDescriptor& output_desc,
+                            DeviceMemory<Eigen::half>* output_data,
+                            const dnn::RnnStateTensorDescriptor& output_h_desc,
+                            DeviceMemory<Eigen::half>* output_h_data,
+                            const dnn::RnnStateTensorDescriptor& output_c_desc,
+                            DeviceMemory<Eigen::half>* output_c_data,
+                            bool is_training,
+                            ScratchAllocator* reserve_space_allocator,
+                            ScratchAllocator* workspace_allocator) {
+    return false;
+  }
+
   virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                             const dnn::RnnSequenceTensorDescriptor& input_desc,
                             const DeviceMemory<float>& input_data,
@@ -2107,6 +2133,33 @@ class DnnSupport {
   //    workspace memory used by this operation. The caller is responsible for
   //    keeping the memory alive long enough for this operation, and recylces
   //    afterwards.
+  virtual bool DoRnnBackward(
+      Stream* stream, const dnn::RnnDescriptor& rnn_desc,
+      const dnn::RnnSequenceTensorDescriptor& input_desc,
+      const DeviceMemory<Eigen::half>& input_data,
+      const dnn::RnnStateTensorDescriptor& input_h_desc,
+      const DeviceMemory<Eigen::half>& input_h_data,
+      const dnn::RnnStateTensorDescriptor& input_c_desc,
+      const DeviceMemory<Eigen::half>& input_c_data,
+      const DeviceMemory<Eigen::half>& params,
+      const dnn::RnnSequenceTensorDescriptor& output_desc,
+      const DeviceMemory<Eigen::half>& output_data,
+      const dnn::RnnStateTensorDescriptor& output_h_desc,
+      const DeviceMemory<Eigen::half>& output_h_data,
+      const dnn::RnnStateTensorDescriptor& output_c_desc,
+      const DeviceMemory<Eigen::half>& output_c_data,
+      const DeviceMemory<Eigen::half>& output_backprop_data,
+      const DeviceMemory<Eigen::half>& output_h_backprop_data,
+      const DeviceMemory<Eigen::half>& output_c_backprop_data,
+      DeviceMemory<Eigen::half>* input_backprop_data,
+      DeviceMemory<Eigen::half>* input_h_backprop_data,
+      DeviceMemory<Eigen::half>* input_c_backprop_data,
+      DeviceMemory<Eigen::half>* params_backprop_data,
+      DeviceMemory<uint8>* reserve_space_data,
+      ScratchAllocator* workspace_allocator) {
+    return false;
+  }
+
   virtual bool DoRnnBackward(
       Stream* stream, const dnn::RnnDescriptor& rnn_desc,
       const dnn::RnnSequenceTensorDescriptor& input_desc,
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 0af2c8cc3d751aa35958a21c81a71496f994e1fb..1fd8eeb881c4eedaa72cdd7e834ee0cdcd259c02 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -177,9 +177,9 @@ bool HostExecutor::StopTimer(Stream *stream, Timer *timer) {
   return true;
 }
 
-bool HostExecutor::BlockHostUntilDone(Stream *stream) {
+port::Status HostExecutor::BlockHostUntilDoneWithStatus(Stream *stream) {
   AsHostStream(stream)->BlockUntilDone();
-  return true;
+  return port::Status::OK();
 }
 
 DeviceDescription *HostExecutor::PopulateDeviceDescription() const {
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 77b07e4a577fe321901a19369107701ec1904a80..e884554a15320516f1b9cdf7773e745c7ca1efe9 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -139,7 +139,7 @@ class HostExecutor : public internal::StreamExecutorInterface {
 
   bool StopTimer(Stream *stream, Timer *timer) override;
 
-  bool BlockHostUntilDone(Stream *stream) override;
+  port::Status BlockHostUntilDoneWithStatus(Stream *stream) override;
 
   int PlatformDeviceCount() override { return 1; }
 
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index b09fa8aca55576ee338530bff4deba1212eea96b..e1b3635d52eac8c7181395fa76592ae3161a035a 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -65,7 +65,11 @@ KernelBase::KernelBase(StreamExecutor *parent,
                        internal::KernelInterface *implementation)
     : parent_(parent), implementation_(implementation) {}
 
-KernelBase::~KernelBase() {}
+KernelBase::~KernelBase() {
+  if (parent_) {
+    parent_->UnloadKernel(this);
+  }
+}
 
 unsigned KernelBase::Arity() const { return implementation_->Arity(); }
 
diff --git a/tensorflow/stream_executor/lib/static_threadlocal.h b/tensorflow/stream_executor/lib/static_threadlocal.h
index 6e2bd0d45563644e7572f5a2fae2dd76ee6a6ca1..02720cbd261253ca9ccfafa84963526844385919 100644
--- a/tensorflow/stream_executor/lib/static_threadlocal.h
+++ b/tensorflow/stream_executor/lib/static_threadlocal.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATIC_THREADLOCAL_H_
 
 #ifdef _MSC_VER
-#define __thread __declspec(thread) 
+#define __thread __declspec(thread)
 #endif
 
 // For POD types in TLS mode, s_obj_VAR is the thread-local variable.
diff --git a/tensorflow/stream_executor/machine_manager.cc b/tensorflow/stream_executor/machine_manager.cc
deleted file mode 100644
index 2b61c8a0bc43cee9a10f0ad5e84001c462940bc5..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/machine_manager.cc
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/stream_executor/machine_manager.h"
-
-#include "tensorflow/stream_executor/platform/port.h"
-
-#include "tensorflow/stream_executor/dso_loader.h"
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-
-namespace perftools {
-namespace gputools {
-
-mutex MachineManager::mu_{LINKER_INITIALIZED};
-
-MachineManager *MachineManager::singleton_ = nullptr;
-
-PlatformKind MachineManager::DetectPreferredPlatform() {
-// TODO(leary) for KNC card experiments, figure out a legitimate way to
-// determine this. For now, we use a compile-time hint so we can compile tests
-// for both.
-#if defined TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_PREFER_OPENCL
-  return PlatformKind::kOpenCL;
-#elif defined TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_PREFER_HOST
-  return PlatformKind::kHost;
-#else
-  return PlatformKind::kCuda;
-#endif
-}
-
-/* static */ port::StatusOr<std::unique_ptr<MachineManager>>
-MachineManager::Create(PlatformKind kind, DeviceOptions options,
-                       const PluginConfig &config) {
-  std::unique_ptr<MachineManager> machine_manager{
-      new MachineManager{kind, options, config}};
-  auto init_status = machine_manager->Init();
-  if (!init_status.ok()) {
-    return init_status;
-  }
-
-  return std::move(machine_manager);
-}
-
-MachineManager::MachineManager(PlatformKind platform,
-                               DeviceOptions device_options,
-                               const PluginConfig &config)
-    : platform_(platform),
-      device_options_(device_options),
-      plugin_config_(config),
-      min_numa_node_(0),
-      limit_numa_node_(0) {}
-
-port::Status MachineManager::Init() {
-  // Initialize the first StreamExecutor, then use that platform interface to
-  // grab the device count.
-  executors_.resize(1);
-  executors_[0].reset(new StreamExecutor{platform_, plugin_config_});
-  auto status = executors_[0]->Init(0 /* = device_ordinal */, device_options_);
-  if (!status.ok()) {
-    return port::Status{
-        port::error::FAILED_PRECONDITION,
-        port::StrCat(
-            "failed to initialize StreamExecutor for device ordinal 0: ",
-            status.ToString())};
-  }
-  int device_count = executors_[0]->PlatformDeviceCount();
-  if (device_count == 0) {
-    LOG(WARNING) << "no devices found for platform "
-                 << PlatformKindString(platform_);
-    min_numa_node_ = limit_numa_node_ = 0;
-    return port::Status::OK();
-  }
-
-  streams_.resize(device_count);
-  streams_[0].reset(new Stream(executors_[0].get()));
-  if (!streams_[0]->Init().ok()) {
-    return port::Status{
-        port::error::FAILED_PRECONDITION,
-        "failed to initialize default stream for device ordinal 0"};
-  }
-
-  min_numa_node_ = executors_[0]->GetDeviceDescription().numa_node();
-  limit_numa_node_ = min_numa_node_ + 1;
-
-  executors_.resize(device_count);
-  for (int device_ordinal = 1; device_ordinal < device_count;
-       ++device_ordinal) {
-    StreamExecutor *stream_exec = new StreamExecutor{platform_, plugin_config_};
-    executors_[device_ordinal].reset(stream_exec);
-    auto status = stream_exec->Init(device_ordinal, device_options_);
-    if (!status.ok()) {
-      return port::Status(
-          port::error::FAILED_PRECONDITION,
-          port::StrCat(
-              "failed to initialize StreamExecutor for device ordinal ",
-              device_ordinal, ": ", status.ToString()));
-    }
-
-    min_numa_node_ = std::min(min_numa_node_,
-                              stream_exec->GetDeviceDescription().numa_node());
-    limit_numa_node_ = std::max(
-        limit_numa_node_, stream_exec->GetDeviceDescription().numa_node() + 1);
-
-    if (!stream_exec->GetDeviceDescription().ecc_enabled()) {
-      LOG(WARNING) << "ECC not enabled for device ordinal: " << device_ordinal;
-    }
-
-    streams_[device_ordinal].reset(
-        new Stream(executors_[device_ordinal].get()));
-    if (!streams_[device_ordinal]->Init().ok()) {
-      return port::Status(
-          port::error::FAILED_PRECONDITION,
-          port::StrCat(
-              "failed to initialize default stream for device ordinal ",
-              device_ordinal));
-    }
-  }
-
-  return port::Status::OK();
-}
-
-int MachineManager::device_count() const { return executors_.size(); }
-
-port::Status MachineManager::EnablePeerAccess() {
-  auto peer_access_map = GetPeerAccessMap();
-  for (const auto &access : *peer_access_map) {
-    auto devices = access.first;
-    if (access.second) {
-      StreamExecutor *from = executors_[devices.first].get();
-      StreamExecutor *to = executors_[devices.second].get();
-      auto status = from->EnablePeerAccessTo(to);
-      if (!status.ok()) {
-        return status;
-      }
-    } else {
-      LOG(INFO) << "cannot enable peer access from device ordinal "
-                << devices.first << " to device ordinal " << devices.second;
-    }
-  }
-  return port::Status::OK();
-}
-
-std::unique_ptr<std::map<std::pair<int, int>, bool>>
-MachineManager::GetPeerAccessMap() {
-  auto *map = new std::map<std::pair<int, int>, bool>;
-  for (int i = 0; i < device_count(); ++i) {
-    for (int j = 0; j < device_count(); ++j) {
-      StreamExecutor *from = executors_[i].get();
-      StreamExecutor *to = executors_[j].get();
-      (*map)[{i, j}] = from->CanEnablePeerAccessTo(to);
-    }
-  }
-
-  return std::unique_ptr<std::map<std::pair<int, int>, bool>>{map};
-}
-
-StreamExecutor *MachineManager::executor_for_device(int device_ordinal) const {
-  CHECK_GE(device_ordinal, 0) << "device ordinal must be non-negative";
-  CHECK(0 <= device_ordinal && device_ordinal < device_count())
-      << "device " << device_ordinal << " out of range with device count "
-      << device_count();
-  StreamExecutor *executor = executors_[device_ordinal].get();
-  CHECK(executor != nullptr);
-  return executor;
-}
-
-int MachineManager::ExecutorToBus(const StreamExecutor *stream_exec) const {
-  return stream_exec->GetDeviceDescription().numa_node() - min_numa_node_;
-}
-
-int MachineManager::DeviceToBus(int device_ordinal) const {
-  return ExecutorToBus(executor_for_device(device_ordinal));
-}
-
-int MachineManager::ExecutorToNumaNode(
-    const StreamExecutor *stream_exec) const {
-  return stream_exec->GetDeviceDescription().numa_node();
-}
-
-int MachineManager::DeviceToNumaNode(int device_ordinal) const {
-  return ExecutorToNumaNode(executor_for_device(device_ordinal));
-}
-
-StreamExecutor *MachineManager::first_executor_for_bus(int bus_ordinal) {
-  CHECK_LT(bus_ordinal, bus_count()) << "bus ordinal out of available range";
-  for (auto &executor : executors_) {
-    if (ExecutorToBus(executor.get()) == bus_ordinal) {
-      return executor.get();
-    }
-  }
-
-  LOG(WARNING) << "could not find executor requested for bus ordinal: "
-               << bus_ordinal;
-  return nullptr;
-}
-
-StreamExecutor *MachineManager::first_executor_for_numa_node(int numa_node) {
-  for (auto &executor : executors_) {
-    if (ExecutorToNumaNode(executor.get()) == numa_node) {
-      return executor.get();
-    }
-  }
-
-  LOG(WARNING) << "could not find executor requested for numa_node: "
-               << numa_node;
-  return nullptr;
-}
-
-Stream *MachineManager::stream_for_device(int device_ordinal) {
-  CHECK(0 <= device_ordinal && device_ordinal < device_count());
-  Stream *stream = streams_[device_ordinal].get();
-  CHECK(stream != nullptr);
-  return stream;
-}
-
-/* static */ port::StatusOr<MachineManager *>
-MachineManager::CreateSingletonInternal(PlatformKind platform,
-                                        DeviceOptions options,
-                                        const PluginConfig &config) {
-  if (singleton_ != nullptr) {
-    return port::Status{
-        port::error::ALREADY_EXISTS,
-        "cannot create machine manager singleton; one already exists"};
-  }
-
-  auto create_status = Create(platform, options, config);
-  if (!create_status.ok()) {
-    return create_status.status();
-  }
-
-  singleton_ = create_status.ConsumeValueOrDie().release();
-
-  VLOG(1) << "machine manager singleton is " << singleton_ << " with platform "
-          << PlatformKindString(platform) << " and device options "
-          << options.ToString();
-
-  return singleton_;
-}
-
-/* static */ MachineManager *MachineManager::CreateSingletonOrDie(
-    PlatformKind platform, DeviceOptions options, const PluginConfig &config) {
-  auto status = CreateSingleton(platform, options, config);
-  if (!status.ok()) {
-    LOG(FATAL) << "failed to create MachineManager singleton: "
-               << status.status();
-  }
-  return status.ValueOrDie();
-}
-
-/* static */ port::StatusOr<MachineManager *> MachineManager::CreateSingleton(
-    PlatformKind platform, DeviceOptions device_options,
-    const PluginConfig &config) {
-  mutex_lock lock{mu_};
-  return CreateSingletonInternal(platform, device_options, config);
-}
-
-/* static */ MachineManager *MachineManager::singleton() {
-  mutex_lock lock{mu_};
-  if (singleton_ == nullptr) {
-    PlatformKind platform = DetectPreferredPlatform();
-    DeviceOptions options = DeviceOptions::Default();
-    auto status = CreateSingletonInternal(platform, options, PluginConfig());
-    if (!status.ok()) {
-      LOG(FATAL)
-          << "failed to create MachineManager singleton: "
-             "singleton accessor attempted lazy construction but failed: "
-          << status.status();
-    }
-    return status.ValueOrDie();
-  }
-
-  return singleton_;
-}
-
-}  // namespace gputools
-}  // namespace perftools
diff --git a/tensorflow/stream_executor/machine_manager.h b/tensorflow/stream_executor/machine_manager.h
deleted file mode 100644
index 65396dd1ff595f0107fa9904df5e6c64c35e4069..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/machine_manager.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This interface provides a machine-wide resource management singleton
-// interface as a convenience for users who will want to exploit all of the GPU
-// resources present on the system.
-//
-// To use the singleton interface:
-//
-//  // At start of program or in your module initializer.
-//  // Do not call this with different sets of arguments!
-//  MachineManager::CreateSingletonOrDie(
-//      MachineManager::DetectPreferredPlatform(), DeviceOptions::Default());
-//
-//  // At any point after that, this convenience interface avoids you having to
-//  // pass those two parameters:
-//  StreamExecutor *device0_executor =
-//      MachineManager::singleton()->executor_for_device(0 /* = ordinal */);
-//  ...
-
-// ----------------- THIS CLASS IS DEPRECATED - DO NOT USE ------------------
-// This class is not suitable for open-sourcing, as it does not support
-// plugins and depends on hardcoded PlatformKind enums. MultiPlatformManager and
-// Platform plugins are the replacements.
-// ----------------- THIS CLASS IS DEPRECATED - DO NOT USE ------------------
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_H_
-#define TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_H_
-
-#include <map>
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/stream_executor/device_options.h"  // IWYU pragma: export
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream.h"
-#include "tensorflow/stream_executor/stream_executor.h"
-
-namespace perftools {
-namespace gputools {
-
-// MachineManager is used to instantiate and manage singleton resources for
-// all the GPUs present on a machine. This basically amounts to having a
-// StreamExecutor-per-device pool.
-//
-// Thread-safe.
-class MachineManager {
- public:
-  // Inspects the host to determine the preferred GPU execution platform.
-  // To force OpenCL from a build target on a machine that has both OpenCL and
-  // CUDA capabilities, link against the :stream_executor_prefer_opencl target.
-  static PlatformKind DetectPreferredPlatform();
-
-  // Returns the machine manager singleton.
-  // If the singleton has not yet been created when this is invoked, this
-  // creates it with resonable default options, otherwise it returns the
-  // already-created singleton. If there are errors during creation, this call
-  // will terminate the program.
-  static MachineManager *singleton();
-
-  // Returns a singleton instance of the machine manager -- it's generally
-  // assumed that users will have one of these for a real-world application as a
-  // form of resource manager.
-  //
-  // This should only be called once, at the initialization of an application,
-  // if at all -- MachineManager::singleton() will return a value with sensible
-  // default as determined by DetectPreferredPlatform. Attempts to create the
-  // singleton with options multiple times will result in an error.
-  static port::StatusOr<MachineManager *> CreateSingleton(
-      PlatformKind platform, DeviceOptions device_options,
-      const PluginConfig &config = PluginConfig());
-
-  // Convenience "or die" wrapper around the above call.
-  static MachineManager *CreateSingletonOrDie(
-      PlatformKind platform, DeviceOptions device_options,
-      const PluginConfig &config = PluginConfig());
-
-  // Creates a new instantiation of the MachineManager.
-  // Warning: generally users will want to use the singleton form, see
-  // MachineManager::singleton().
-  //
-  // The machine manager has a number of devices that it detects on creation
-  // that does not change over the course of its lifetime. This does not support
-  // things like hot-plugging of GPUs or the event of GPUs dropping off the bus
-  // in a recoverable manner.
-  static port::StatusOr<std::unique_ptr<MachineManager>> Create(
-      PlatformKind kind, DeviceOptions options,
-      const PluginConfig &config = PluginConfig());
-
-  // Returns the number of devices visible to the machine manager.
-  int device_count() const;
-
-  // Returns the StreamExecutor for one of the machine-manager visible devices.
-  // Checks that device_ordinal is within device_count() bound.
-  StreamExecutor *executor_for_device(int device_ordinal) const;
-
-  // Returns the bus ordinal count (as determined by the span of NUMA nodes
-  // associated with the available devices).
-  int bus_count() const { return limit_numa_node_ - min_numa_node_; }
-
-  // Returns the bus ordinal associated with a given device ordinal.
-  int DeviceToBus(int device_ordinal) const;
-
-  // Returns the NUMA node associated with a given device ordinal.
-  int DeviceToNumaNode(int device_ordinal) const;
-
-  // Returns the first StreamExecutor (within device_count() ordinals that has
-  // the corresponding bus ordinal, or nullptr if none is found.
-  //
-  // The valid bus ordinals can be enumerated by scanning through the executors
-  // and seeing what bus number they are on.
-  StreamExecutor *first_executor_for_bus(int bus_ordinal);
-
-  // Returns the first StreamExecutor associated with the specified
-  // numa_node, or nullptr if none is found.
-  StreamExecutor *first_executor_for_numa_node(int numa_node);
-
-  // Returns the default stream for the default executor (that returned by
-  // executor_for_device()). The same stream will be returned for all calls to
-  // stream_for_device() (with the same device_ordinal).
-  Stream *stream_for_device(int device_ordinal);
-
-  // Returns the platform that this machine manager was created to target.
-  PlatformKind platform() const { return platform_; }
-
-  // Enables peer access between all possible devices on this platform.
-  // Only dies due to failure to enable peer access for devices in which
-  // GetPeerAccessMap() is true.
-  port::Status EnablePeerAccess();
-
-  // Returns a map that says, for pairs (device ordinal i, device ordinal j),
-  // whether i can access j's memory space.
-  std::unique_ptr<std::map<std::pair<int, int>, bool>> GetPeerAccessMap();
-
- private:
-  // Guts of the singleton creation mechanism that requires the exclusive
-  // singleton lock to be held, in order to prevent deadlock due to method
-  // composition.
-  static port::StatusOr<MachineManager *> CreateSingletonInternal(
-      PlatformKind platform, DeviceOptions options, const PluginConfig &config)
-      EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // Private constructor used in singleton creation.
-  MachineManager(PlatformKind platform, DeviceOptions options,
-                 const PluginConfig &config);
-
-  // Populates the executors_ vector with an executor per observable device
-  // ordinal on the platform. Logs and returns false if any of the
-  // Stream Executors cannot be created.
-  port::Status Init();
-
-  // Converts a StreamExecutor's NUMA node association into a bus ordinal for
-  // this machine.
-  int ExecutorToBus(const StreamExecutor *stream_exec) const;
-
-  // Returns the NUMA node association for the StreamExecutor.
-  int ExecutorToNumaNode(const StreamExecutor *stream_exec) const;
-
-  // Mutex that guards the initialization of the machine manager static
-  // variable.
-  static mutex mu_;
-
-  // Singleton MachineManager value -- assignment to this is protected by a
-  // static singleton guard clause.
-  static MachineManager *singleton_ GUARDED_BY(mu_);
-
-  // Holds an executor associated with each device ordinal present in the
-  // system, which are the indices. Immutable after initialization.
-  std::vector<std::unique_ptr<StreamExecutor>> executors_;
-
-  // Holds an stream associated with each device ordinal present in the
-  // system, which are the indices. Immutable after initialization.
-  std::vector<std::unique_ptr<Stream>> streams_;
-
-  // The platform that this is managing for the machine.
-  PlatformKind platform_;
-
-  // Options used to create StreamExecutors on each of the respective devices.
-  DeviceOptions device_options_;
-
-  // Plugin configuration to use for all StreamExecutors created by this object.
-  PluginConfig plugin_config_;
-
-  // The smallest NUMA node value for any device managed by this machine
-  // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
-  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense.
-  int min_numa_node_;
-
-  // Larger than the NUMA node value for any device managed by this machine
-  // manager.
-  int limit_numa_node_;
-};
-
-}  // namespace gputools
-}  // namespace perftools
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_MACHINE_MANAGER_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 6d756ab1917738cbfbfc913c02353198f8988872..de65038d1734532d31f7bb54ccd086d9b626b285 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -4679,6 +4679,39 @@ Stream &Stream::ThenMemset32(DeviceMemoryBase *location, uint32 pattern,
   return *this;
 }
 
+Stream &Stream::ThenRnnForward(
+    const dnn::RnnDescriptor &rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor &input_desc,
+    const DeviceMemory<Eigen::half> &input_data,
+    const dnn::RnnStateTensorDescriptor &input_h_desc,
+    const DeviceMemory<Eigen::half> &input_h_data,
+    const dnn::RnnStateTensorDescriptor &input_c_desc,
+    const DeviceMemory<Eigen::half> &input_c_data,
+    const DeviceMemory<Eigen::half> &params,
+    const dnn::RnnSequenceTensorDescriptor &output_desc,
+    DeviceMemory<Eigen::half> *output_data,
+    const dnn::RnnStateTensorDescriptor &output_h_desc,
+    DeviceMemory<Eigen::half> *output_h_data,
+    const dnn::RnnStateTensorDescriptor &output_c_desc,
+    DeviceMemory<Eigen::half> *output_c_data, bool is_training,
+    ScratchAllocator *reserve_space_allocator,
+    ScratchAllocator *workspace_allocator) {
+  // TODO(zhengxq): add VLOG PARAM calls.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoRnnForward(
+          this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+          input_c_desc, input_c_data, params, output_desc, output_data,
+          output_h_desc, output_h_data, output_c_desc, output_c_data,
+          is_training, reserve_space_allocator, workspace_allocator));
+    } else {
+      SetError();
+      LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenRnnForward(
     const dnn::RnnDescriptor &rnn_desc,
     const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -4744,6 +4777,48 @@ Stream &Stream::ThenRnnForward(
   return *this;
 }
 
+Stream &Stream::ThenRnnBackward(
+    const dnn::RnnDescriptor &rnn_desc,
+    const dnn::RnnSequenceTensorDescriptor &input_desc,
+    const DeviceMemory<Eigen::half> &input_data,
+    const dnn::RnnStateTensorDescriptor &input_h_desc,
+    const DeviceMemory<Eigen::half> &input_h_data,
+    const dnn::RnnStateTensorDescriptor &input_c_desc,
+    const DeviceMemory<Eigen::half> &input_c_data,
+    const DeviceMemory<Eigen::half> &params,
+    const dnn::RnnSequenceTensorDescriptor &output_desc,
+    const DeviceMemory<Eigen::half> &output_data,
+    const dnn::RnnStateTensorDescriptor &output_h_desc,
+    const DeviceMemory<Eigen::half> &output_h_data,
+    const dnn::RnnStateTensorDescriptor &output_c_desc,
+    const DeviceMemory<Eigen::half> &output_c_data,
+    const DeviceMemory<Eigen::half> &output_backprop_data,
+    const DeviceMemory<Eigen::half> &output_h_backprop_data,
+    const DeviceMemory<Eigen::half> &output_c_backprop_data,
+    DeviceMemory<Eigen::half> *input_backprop_data,
+    DeviceMemory<Eigen::half> *input_h_backprop_data,
+    DeviceMemory<Eigen::half> *input_c_backprop_data,
+    DeviceMemory<Eigen::half> *params_backprop_data,
+    DeviceMemory<uint8> *reserve_space_data,
+    ScratchAllocator *workspace_allocator) {
+  // TODO(zhengxq): add VLOG PARAM calls.
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoRnnBackward(
+          this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+          input_c_desc, input_c_data, params, output_desc, output_data,
+          output_h_desc, output_h_data, output_c_desc, output_c_data,
+          output_backprop_data, output_h_backprop_data, output_c_backprop_data,
+          input_backprop_data, input_h_backprop_data, input_c_backprop_data,
+          params_backprop_data, reserve_space_data, workspace_allocator));
+    } else {
+      SetError();
+      LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenRnnBackward(
     const dnn::RnnDescriptor &rnn_desc,
     const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -4980,22 +5055,24 @@ Stream &Stream::ThenEnqueueOnBackgroundThread(
   });
 }
 
-bool Stream::BlockHostUntilDone() {
+port::Status Stream::BlockHostUntilDoneWithStatus() {
   VLOG_CALL();
 
   if (!ok()) {
-    LOG(INFO)
-        << "stream " << this
-        << " did not block host until done; was already in an error state";
-    return false;
+    port::Status status = port::Status(
+        port::error::INTERNAL,
+        "stream did not block host until done; was already in an error state");
+    LOG(INFO) << status << " " << this;
+    return status;
   }
 
+  port::Status first_error;
   {
     // Wait until all active sub-streams have done their tasks.
     mutex_lock lock{mu_};
     for (auto &stream : sub_streams_) {
       if (!stream.second) {
-        CheckError(stream.first->BlockHostUntilDone());
+        first_error.Update(stream.first->BlockHostUntilDoneWithStatus());
         // Set this sub-stream as available.
         stream.second = true;
       }
@@ -5004,8 +5081,13 @@ bool Stream::BlockHostUntilDone() {
 
   temporary_memory_manager_.DeallocateFinalizedTemporaries();
 
-  CheckError(parent_->BlockHostUntilDone(this));
-  return ok();
+  first_error.Update(parent_->BlockHostUntilDoneWithStatus(this));
+  CheckError(first_error.ok());
+  return first_error;
+}
+
+bool Stream::BlockHostUntilDone() {
+  return BlockHostUntilDoneWithStatus().ok();
 }
 
 }  // namespace gputools
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 21172d5a1651cc87b9fdec22e85313a4fd64ab26..15a5a2b6cbe5f862fa62cfc1bf27f69b5374ab02 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -113,7 +113,7 @@ class Stream {
 
   // Initialize the stream. This must be performed before entraining any other
   // operations.
-  Stream &Init();
+  Stream &Init() LOCKS_EXCLUDED(mu_);
 
   // Initializes timer t via the StreamExecutor.
   Stream &InitTimer(Timer *t);
@@ -124,11 +124,11 @@ class Stream {
   // Get or create a sub-stream from this stream. If there is any sub-stream in
   // the pool that can be reused then just return this sub-stream.  Otherwise
   // create a new sub-stream.
-  Stream *GetOrCreateSubStream();
+  Stream *GetOrCreateSubStream() LOCKS_EXCLUDED(mu_);
 
   // Return the sub-stream back to the host stream so that it can be reused
   // later.
-  void ReturnSubStream(Stream *sub_stream);
+  void ReturnSubStream(Stream *sub_stream) LOCKS_EXCLUDED(mu_);
 
   // Allocate temporary memories. The stream will deallocate them when blocked
   // or destroyed.
@@ -1751,6 +1751,24 @@ class Stream {
 
   // Enqueue a forward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnForward for more details.
+  Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
+                         const dnn::RnnSequenceTensorDescriptor &input_desc,
+                         const DeviceMemory<Eigen::half> &input_data,
+                         const dnn::RnnStateTensorDescriptor &input_h_desc,
+                         const DeviceMemory<Eigen::half> &input_h_data,
+                         const dnn::RnnStateTensorDescriptor &input_c_desc,
+                         const DeviceMemory<Eigen::half> &input_c_data,
+                         const DeviceMemory<Eigen::half> &params,
+                         const dnn::RnnSequenceTensorDescriptor &output_desc,
+                         DeviceMemory<Eigen::half> *output_data,
+                         const dnn::RnnStateTensorDescriptor &output_h_desc,
+                         DeviceMemory<Eigen::half> *output_h_data,
+                         const dnn::RnnStateTensorDescriptor &output_c_desc,
+                         DeviceMemory<Eigen::half> *output_c_data,
+                         bool is_training,
+                         ScratchAllocator *reserve_space_allocator,
+                         ScratchAllocator *workspace_allocator);
+
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
                          const DeviceMemory<float> &input_data,
@@ -1787,6 +1805,31 @@ class Stream {
 
   // Enqueue a backward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnBackward for more details.
+  Stream &ThenRnnBackward(
+      const dnn::RnnDescriptor &rnn_desc,
+      const dnn::RnnSequenceTensorDescriptor &input_desc,
+      const DeviceMemory<Eigen::half> &input_data,
+      const dnn::RnnStateTensorDescriptor &input_h_desc,
+      const DeviceMemory<Eigen::half> &input_h_data,
+      const dnn::RnnStateTensorDescriptor &input_c_desc,
+      const DeviceMemory<Eigen::half> &input_c_data,
+      const DeviceMemory<Eigen::half> &params,
+      const dnn::RnnSequenceTensorDescriptor &output_desc,
+      const DeviceMemory<Eigen::half> &output_data,
+      const dnn::RnnStateTensorDescriptor &output_h_desc,
+      const DeviceMemory<Eigen::half> &output_h_data,
+      const dnn::RnnStateTensorDescriptor &output_c_desc,
+      const DeviceMemory<Eigen::half> &output_c_data,
+      const DeviceMemory<Eigen::half> &output_backprop_data,
+      const DeviceMemory<Eigen::half> &output_h_backprop_data,
+      const DeviceMemory<Eigen::half> &output_c_backprop_data,
+      DeviceMemory<Eigen::half> *input_backprop_data,
+      DeviceMemory<Eigen::half> *input_h_backprop_data,
+      DeviceMemory<Eigen::half> *input_c_backprop_data,
+      DeviceMemory<Eigen::half> *params_backprop_data,
+      DeviceMemory<uint8> *reserve_space_data,
+      ScratchAllocator *workspace_allocator);
+
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
                           const DeviceMemory<float> &input_data,
@@ -1860,8 +1903,17 @@ class Stream {
   // entrained on the stream (enqueued to this point in program
   // execution) to complete.
   //
-  // Returns true if the stream is ok().
-  bool BlockHostUntilDone();
+  // Returns an OK status if the blocking was successful and the stream is ok().
+  // Otherwise returns an error describing why the blocking failed.
+  //
+  // TODO(b/70298427): Rename to BlockHostUntilDone, once all callers have been
+  // converted from the bool form.
+  port::Status BlockHostUntilDoneWithStatus() LOCKS_EXCLUDED(mu_);
+
+  // DEPRECATED(b/70298427) - new code should use BlockHostUntilDoneWithStatus()
+  //
+  // Equivalent to BlockHostUntilDoneWithStatus().ok().
+  bool BlockHostUntilDone() LOCKS_EXCLUDED(mu_);
 
   // Warning! This method interacts with internal threads in
   // sometimes-unpredictable ways and is intended for GPU-Executor-internal
@@ -1917,14 +1969,14 @@ class Stream {
   friend struct ThenBlasImpl;  // for implementing ThenBlasXXX.
   friend class ocl::CLBlas;    // for parent_.
 
-  bool InErrorState() const {
+  bool InErrorState() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock lock{mu_};
     return !ok_;
   }
 
   // Sets the error state if operation_retcode is false.
   // This is a useful shorthand for many stream routines.
-  void CheckError(bool operation_retcode) {
+  void CheckError(bool operation_retcode) LOCKS_EXCLUDED(mu_) {
     if (operation_retcode) {
       return;
     }
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 95b285b992df91eb1adc01423bb07e2298dba9c4..273d970b6fa4a581381689191b183a30f4f2bcd3 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -15,9 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-
 namespace perftools {
 namespace gputools {
 namespace internal {
@@ -40,7 +37,6 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 
 StreamExecutorFactory MakeHostExecutorImplementation;
 
-
 }  // namespace internal
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 12593e31d4e67f25136edf541d4bc03f3e72b54b..0a9bef71d08466fecea330f22a4a16416dd1aecc 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -169,6 +169,8 @@ class StreamExecutorInterface {
                       const KernelArgsArrayBase &args) {
     return false;
   }
+  // Releases any state associated with the kernel.
+  virtual void UnloadKernel(const KernelBase *kernel) {}
   virtual void *Allocate(uint64 size) = 0;
   virtual void *AllocateSubBuffer(DeviceMemoryBase *parent, uint64 offset,
                                   uint64 size) = 0;
@@ -217,7 +219,7 @@ class StreamExecutorInterface {
   virtual void DeallocateTimer(Timer *timer) = 0;
   virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
   virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
-  virtual bool BlockHostUntilDone(Stream *stream) = 0;
+  virtual port::Status BlockHostUntilDoneWithStatus(Stream *stream) = 0;
   virtual int PlatformDeviceCount() = 0;
   virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
   virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 9dc1749327b96ecca0a6801d57ac431eab514857..719f2929373c118cb317cccbcc2365cc463ce7e8 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -217,6 +217,10 @@ bool StreamExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   return implementation_->GetKernel(spec, kernel);
 }
 
+void StreamExecutor::UnloadKernel(const KernelBase *kernel) {
+  implementation_->UnloadKernel(kernel);
+}
+
 void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
   VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque()
           << ") mem->size()=" << mem->size() << StackTraceIfVLOG10();
@@ -428,11 +432,11 @@ bool StreamExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
   return implementation_->Launch(stream, thread_dims, block_dims, kernel, args);
 }
 
-bool StreamExecutor::BlockHostUntilDone(Stream *stream) {
-  bool result;
+port::Status StreamExecutor::BlockHostUntilDoneWithStatus(Stream *stream) {
+  port::Status result;
   SCOPED_TRACE(TraceListener::BlockHostUntilDone, &result, stream);
 
-  result = implementation_->BlockHostUntilDone(stream);
+  result = implementation_->BlockHostUntilDoneWithStatus(stream);
   return result;
 }
 
@@ -562,19 +566,18 @@ port::Status StreamExecutor::SynchronousMemcpyD2H(
           << device_src.opaque() << ", size=" << size
           << ", host_dst=" << host_dst << ")" << StackTraceIfVLOG10();
 
-  port::Status result{port::Status::OK()};
+  port::Status result;
   SCOPED_TRACE(TraceListener::SynchronousMemcpyD2H, &result, device_src, size,
                host_dst);
 
-  port::Status status =
-      implementation_->SynchronousMemcpy(host_dst, device_src, size);
-  if (!status.ok()) {
-    return port::Status{port::error::INTERNAL,
-                        port::Printf("failed to synchronously memcpy "
-                                     "device-to-host: device %p to host %p "
-                                     "size %lld: %s",
-                                     device_src.opaque(), host_dst, size,
-                                     status.ToString().c_str())};
+  result = implementation_->SynchronousMemcpy(host_dst, device_src, size);
+  if (!result.ok()) {
+    result = port::Status{port::error::INTERNAL,
+                          port::Printf("failed to synchronously memcpy "
+                                       "device-to-host: device %p to host %p "
+                                       "size %lld: %s",
+                                       device_src.opaque(), host_dst, size,
+                                       result.ToString().c_str())};
   }
 
   return result;
@@ -586,19 +589,18 @@ port::Status StreamExecutor::SynchronousMemcpyH2D(
           << ", size=" << size << ", device_dst" << device_dst->opaque() << ")"
           << StackTraceIfVLOG10();
 
-  port::Status result{port::Status::OK()};
+  port::Status result;
   SCOPED_TRACE(TraceListener::SynchronousMemcpyH2D, &result, host_src, size,
                device_dst);
 
-  port::Status status =
-      implementation_->SynchronousMemcpy(device_dst, host_src, size);
-  if (!status.ok()) {
+  result = implementation_->SynchronousMemcpy(device_dst, host_src, size);
+  if (!result.ok()) {
     result = port::Status{
         port::error::INTERNAL,
         port::Printf("failed to synchronously memcpy host-to-device: host "
                      "%p to device %p size %lld: %s",
                      host_src, device_dst->opaque(), size,
-                     status.ToString().c_str())};
+                     result.ToString().c_str())};
   }
 
   return result;
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index 9c225e5faea69d972802303668673d69e0697570..d2965dbfd7dcfa254e6feedc807dc32a3a27f4cc 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -104,6 +104,9 @@ class StreamExecutor {
   // platform, false is returned.
   bool GetKernel(const MultiKernelLoaderSpec &spec, KernelBase *kernel);
 
+  // Releases any state associated with the previously loaded kernel.
+  void UnloadKernel(const KernelBase *kernel);
+
   // Synchronously allocates an array on the device of type T with element_count
   // elements.
   template <typename T>
@@ -478,7 +481,7 @@ class StreamExecutor {
   // Causes the host code to synchronously wait for operations entrained onto
   // stream to complete. Effectively a join on the asynchronous device
   // operations enqueued on the stream before this program point.
-  bool BlockHostUntilDone(Stream *stream);
+  port::Status BlockHostUntilDoneWithStatus(Stream *stream);
 
   // Synchronously allocates size bytes on the underlying platform and returns
   // an opaque void* representing that allocation. In the case of failure,
diff --git a/tensorflow/stream_executor/trace_listener.h b/tensorflow/stream_executor/trace_listener.h
index 88c54f982b3cfde925dbe0ca4f7bc3a738e5f3ac..d1e87c348b1f867009fdb6b741d984b2f58cef21 100644
--- a/tensorflow/stream_executor/trace_listener.h
+++ b/tensorflow/stream_executor/trace_listener.h
@@ -65,7 +65,8 @@ class TraceListener {
                                             const port::Status* result) {}
 
   virtual void BlockHostUntilDoneBegin(int64 correlation_id, Stream* stream) {}
-  virtual void BlockHostUntilDoneComplete(int64 correlation_id, bool result) {}
+  virtual void BlockHostUntilDoneComplete(int64 correlation_id,
+                                          const port::Status* result) {}
 };
 
 }  // namespace gputools
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index e647a78055806186674eae2c3201e771ca9cbccb..044c9a96a3ba88db96bd26e85161da5cff93508f 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1,45 +1,42 @@
 # -*- Python -*-
 
-
 # Return the options to use for a C++ library or binary build.
 # Uses the ":optmode" config_setting to pick the options.
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
     "tf_sycl_tests_tags",
+    "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
-    "if_static",)
+    "if_static",
+)
 load(
     "@local_config_cuda//cuda:build_defs.bzl",
     "if_cuda",
-    "cuda_default_copts",)
-
+    "cuda_default_copts",
+)
 load(
     "//third_party/mkl:build_defs.bzl",
-    "if_mkl",)
-
+    "if_mkl",
+)
 def register_extension_info(**kwargs):
     pass
 
-
 # Given a source file, generate a test name.
 # i.e. "common_runtime/direct_session_test.cc" becomes
 #      "common_runtime_direct_session_test"
 def src_to_test_name(src):
   return src.replace("/", "_").split(".")[0]
 
-
 def full_path(relative_paths):
   return [PACKAGE_NAME + "/" + relative for relative in relative_paths]
 
-
 # List of proto files for android builds
 def tf_android_core_proto_sources(core_proto_sources_relative):
   return [
       "//tensorflow/core:" + p for p in core_proto_sources_relative
   ]
 
-
 # Returns the list of pb.h and proto.h headers that are generated for
 # tf_android_core_proto_sources().
 def tf_android_core_proto_headers(core_proto_sources_relative):
@@ -51,13 +48,11 @@ def tf_android_core_proto_headers(core_proto_sources_relative):
       for p in core_proto_sources_relative
   ])
 
-
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
   return str(Label(dep))
 
-
 def if_android_x86(a):
   return select({
       clean_dep("//tensorflow:android_x86"): a,
@@ -65,35 +60,30 @@ def if_android_x86(a):
       "//conditions:default": [],
   })
 
-
 def if_android_arm(a):
   return select({
       clean_dep("//tensorflow:android_arm"): a,
       "//conditions:default": [],
   })
 
-
 def if_android_arm64(a):
   return select({
       clean_dep("//tensorflow:android_arm64"): a,
       "//conditions:default": [],
   })
 
-
 def if_android_mips(a):
   return select({
       clean_dep("//tensorflow:android_mips"): a,
       "//conditions:default": [],
   })
 
-
 def if_not_android(a):
   return select({
       clean_dep("//tensorflow:android"): [],
       "//conditions:default": a,
   })
 
-
 def if_not_android_mips_and_mips64(a):
   return select({
       clean_dep("//tensorflow:android_mips"): [],
@@ -101,21 +91,18 @@ def if_not_android_mips_and_mips64(a):
       "//conditions:default": a,
   })
 
-
 def if_android(a):
   return select({
       clean_dep("//tensorflow:android"): a,
       "//conditions:default": [],
   })
 
-
 def if_ios(a):
   return select({
       clean_dep("//tensorflow:ios"): a,
       "//conditions:default": [],
   })
 
-
 def if_mobile(a):
   return select({
       clean_dep("//tensorflow:android"): a,
@@ -123,7 +110,6 @@ def if_mobile(a):
       "//conditions:default": [],
   })
 
-
 def if_not_mobile(a):
   return select({
       clean_dep("//tensorflow:android"): [],
@@ -131,7 +117,6 @@ def if_not_mobile(a):
       "//conditions:default": a,
   })
 
-
 def if_not_windows(a):
   return select({
       clean_dep("//tensorflow:windows"): [],
@@ -139,6 +124,12 @@ def if_not_windows(a):
       "//conditions:default": a,
   })
 
+def if_windows(a):
+  return select({
+      clean_dep("//tensorflow:windows"): a,
+      clean_dep("//tensorflow:windows_msvc"): a,
+      "//conditions:default": [],
+  })
 
 def if_linux_x86_64(a):
   return select({
@@ -161,36 +152,47 @@ WIN_COPTS = [
     "/DTENSORFLOW_USE_EIGEN_THREADPOOL",
     "/DEIGEN_AVOID_STL_ARRAY",
     "/Iexternal/gemmlowp",
-    "/wd4018", # -Wno-sign-compare
-    "/U_HAS_EXCEPTIONS", "/D_HAS_EXCEPTIONS=1", "/EHsc", # -fno-exceptions
+    "/wd4018",  # -Wno-sign-compare
+    "/U_HAS_EXCEPTIONS",
+    "/D_HAS_EXCEPTIONS=1",
+    "/EHsc",  # -fno-exceptions
     "/DNOGDI",
 ]
 
 # LINT.IfChange
-def tf_copts():
-  return (if_not_windows([
-      "-DEIGEN_AVOID_STL_ARRAY",
-      "-Iexternal/gemmlowp",
-      "-Wno-sign-compare",
-      "-fno-exceptions",
-      "-ftemplate-depth=900",
-  ]) + if_cuda(["-DGOOGLE_CUDA=1"]) + if_mkl(["-DINTEL_MKL=1", "-fopenmp",]) + if_android_arm(
-      ["-mfpu=neon"]) + if_linux_x86_64(["-msse3"]) + select({
-          clean_dep("//tensorflow:android"): [
-              "-std=c++11",
-              "-DTF_LEAN_BINARY",
-              "-O2",
-              "-Wno-narrowing",
-              "-fomit-frame-pointer",
-          ],
-          clean_dep("//tensorflow:darwin"): [],
-          clean_dep("//tensorflow:windows"): WIN_COPTS,
-          clean_dep("//tensorflow:windows_msvc"): WIN_COPTS,
-          clean_dep("//tensorflow:ios"): ["-std=c++11"],
-          "//conditions:default": ["-pthread"]
+def tf_copts(android_optimization_level_override="-O2"):
+  # For compatibility reasons, android_optimization_level_override
+  # is currently only being set for Android.
+  # To clear this value, and allow the CROSSTOOL default
+  # to be used, pass android_optimization_level_override=None
+  android_copts = [
+      "-std=c++11",
+      "-DTF_LEAN_BINARY",
+      "-Wno-narrowing",
+      "-fomit-frame-pointer",
+  ]
+  if android_optimization_level_override:
+    android_copts.append(android_optimization_level_override)
+  return (
+      if_not_windows([
+          "-DEIGEN_AVOID_STL_ARRAY",
+          "-Iexternal/gemmlowp",
+          "-Wno-sign-compare",
+          "-fno-exceptions",
+          "-ftemplate-depth=900"])
+      + if_cuda(["-DGOOGLE_CUDA=1"])
+      + if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML", "-fopenmp",])
+      + if_android_arm(["-mfpu=neon"])
+      + if_linux_x86_64(["-msse3"])
+      + select({
+            clean_dep("//tensorflow:android"): android_copts,
+            clean_dep("//tensorflow:darwin"): [],
+            clean_dep("//tensorflow:windows"): WIN_COPTS,
+            clean_dep("//tensorflow:windows_msvc"): WIN_COPTS,
+            clean_dep("//tensorflow:ios"): ["-std=c++11"],
+            "//conditions:default": ["-pthread"]
       }))
 
-
 def tf_opts_nortti_if_android():
   return if_android([
       "-fno-rtti",
@@ -198,10 +200,8 @@ def tf_opts_nortti_if_android():
       "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
   ])
 
-
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
-
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
 def tf_gen_op_libs(op_lib_names, deps=None):
@@ -219,13 +219,11 @@ def tf_gen_op_libs(op_lib_names, deps=None):
         alwayslink=1,
         linkstatic=1,)
 
-
 def _make_search_paths(prefix, levels_to_root):
   return ",".join(
       ["-rpath,%s/%s" % (prefix, "/".join([".."] * search_level))
        for search_level in range(levels_to_root + 1)])
 
-
 def _rpath_linkopts(name):
   # Search parent directories up to the TensorFlow root directory for shared
   # object dependencies, even if this op shared object is deeply nested
@@ -244,7 +242,6 @@ def _rpath_linkopts(name):
       ],
   })
 
-
 # Bazel-generated shared objects which must be linked into TensorFlow binaries
 # to define symbols from //tensorflow/core:framework and //tensorflow/core:lib.
 def tf_binary_additional_srcs():
@@ -254,7 +251,6 @@ def tf_binary_additional_srcs():
           clean_dep("//tensorflow:libtensorflow_framework.so"),
       ])
 
-
 def tf_cc_shared_object(
     name,
     srcs=[],
@@ -276,6 +272,10 @@ def tf_cc_shared_object(
       }),
       **kwargs)
 
+register_extension_info(
+    extension_name = "tf_cc_shared_object",
+    label_regex_for_dep = "{extension_name}",
+)
 
 # Links in the framework shared object
 # (//third_party/tensorflow:libtensorflow_framework.so) when not building
@@ -298,9 +298,9 @@ def tf_cc_binary(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_binary",
-    label_regex_for_dep="{extension_name}.*")
-
+    extension_name = "tf_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
 
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
@@ -308,7 +308,9 @@ def tf_gen_op_wrapper_cc(name,
                          op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                          deps=None,
                          override_file=None,
-                         include_internal_ops=0):
+                         include_internal_ops=0,
+                         # ApiDefs will be loaded in the order specified in this list.
+                         api_def_srcs=[]):
   # Construct an op generator binary for these ops.
   tool = out_ops_file + "_gen_cc"
   if deps == None:
@@ -320,12 +322,27 @@ def tf_gen_op_wrapper_cc(name,
       linkstatic=1,  # Faster to link this one-time-use binary dynamically
       deps=[op_gen] + deps)
 
+  srcs = api_def_srcs[:]
+
   if override_file == None:
-    srcs = []
     override_arg = ","
   else:
-    srcs = [override_file]
+    srcs += [override_file]
     override_arg = "$(location " + override_file + ")"
+
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          " $$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   native.genrule(
       name=name + "_genrule",
       outs=[
@@ -336,8 +353,7 @@ def tf_gen_op_wrapper_cc(name,
       tools=[":" + tool] + tf_binary_additional_srcs(),
       cmd=("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " +
            "$(location :" + out_ops_file + ".cc) " + override_arg + " " +
-           str(include_internal_ops)))
-
+           str(include_internal_ops) + " " + api_def_args_str))
 
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate individual C++ .cc and .h
@@ -379,7 +395,9 @@ def tf_gen_op_wrappers_cc(name,
                           op_gen=clean_dep("//tensorflow/cc:cc_op_gen_main"),
                           override_file=None,
                           include_internal_ops=0,
-                          visibility=None):
+                          visibility=None,
+                          # ApiDefs will be loaded in the order apecified in this list.
+                          api_def_srcs=[]):
   subsrcs = other_srcs[:]
   subhdrs = other_hdrs[:]
   internalsrcs = []
@@ -391,7 +409,8 @@ def tf_gen_op_wrappers_cc(name,
         pkg=pkg,
         op_gen=op_gen,
         override_file=override_file,
-        include_internal_ops=include_internal_ops)
+        include_internal_ops=include_internal_ops,
+        api_def_srcs=api_def_srcs)
     subsrcs += ["ops/" + n + ".cc"]
     subhdrs += ["ops/" + n + ".h"]
     internalsrcs += ["ops/" + n + "_internal.cc"]
@@ -428,7 +447,6 @@ def tf_gen_op_wrappers_cc(name,
       alwayslink=1,
       visibility=[clean_dep("//tensorflow:internal")])
 
-
 # Generates a Python library target wrapping the ops registered in "deps".
 #
 # Args:
@@ -449,6 +467,8 @@ def tf_gen_op_wrappers_cc(name,
 #     "name" arg)
 #   op_whitelist: if not empty, only op names in this list will be wrapped. It
 #     is invalid to specify both "hidden" and "op_whitelist".
+#   cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
+#     specified ops.
 def tf_gen_op_wrapper_py(name,
                          out=None,
                          hidden=None,
@@ -457,7 +477,9 @@ def tf_gen_op_wrapper_py(name,
                          require_shape_functions=False,
                          hidden_file=None,
                          generated_target_name=None,
-                         op_whitelist=[]):
+                         op_whitelist=[],
+                         cc_linkopts=[],
+                         api_def_srcs=[]):
   if (hidden or hidden_file) and op_whitelist:
     fail('Cannot pass specify both hidden and op_whitelist.')
 
@@ -467,7 +489,7 @@ def tf_gen_op_wrapper_py(name,
     deps = [str(Label("//tensorflow/core:" + name + "_op_lib"))]
   tf_cc_binary(
       name=tool_name,
-      linkopts=["-lm"],
+      linkopts=["-lm"] + cc_linkopts,
       copts=tf_copts(),
       linkstatic=1,  # Faster to link this one-time-use binary dynamically
       deps=([
@@ -490,22 +512,39 @@ def tf_gen_op_wrapper_py(name,
     op_list_arg = "''"
     op_list_is_whitelist = False
 
+  # Prepare ApiDef directories to pass to the genrule.
+  if not api_def_srcs:
+    api_def_args_str = ","
+  else:
+    api_def_args = []
+    for api_def_src in api_def_srcs:
+      # Add directory of the first ApiDef source to args.
+      # We are assuming all ApiDefs in a single api_def_src are in the
+      # same directory.
+      api_def_args.append(
+          "$$(dirname $$(echo $(locations " + api_def_src +
+          ") | cut -d\" \" -f1))")
+    api_def_args_str = ",".join(api_def_args)
+
   if hidden_file:
     # `hidden_file` is file containing a list of op names to be hidden in the
     # generated module.
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
-        srcs=[hidden_file],
+        srcs=api_def_srcs + [hidden_file],
         tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") @$(location " + hidden_file + ") " +
+        cmd=("$(location " + tool_name + ") " + api_def_args_str +
+             " @$(location " + hidden_file + ") " +
              ("1" if require_shape_functions else "0") + " > $@"))
   else:
     native.genrule(
         name=name + "_pygenrule",
         outs=[out],
+        srcs=api_def_srcs,
         tools=[tool_name] + tf_binary_additional_srcs(),
-        cmd=("$(location " + tool_name + ") " + op_list_arg + " " +
+        cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
+             op_list_arg + " " +
              ("1" if require_shape_functions else "0") + " " +
              ("1" if op_list_is_whitelist else "0") + " > $@"))
 
@@ -521,7 +560,6 @@ def tf_gen_op_wrapper_py(name,
           clean_dep("//tensorflow/python:framework_for_generated_wrappers_v2"),
       ],)
 
-
 # Define a bazel macro that creates cc_test for tensorflow.
 #
 # Links in the framework shared object
@@ -564,9 +602,9 @@ def tf_cc_test(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cc_test",
-    label_regex_for_dep="{extension_name}.*")
-
+    extension_name = "tf_cc_test",
+    label_regex_for_dep = "{extension_name}.*",
+)
 
 # Part of the testing workflow requires a distinguishable name for the build
 # rules that involve a GPU, even if otherwise identical to the base rule.
@@ -590,6 +628,10 @@ def tf_cc_test_gpu(name,
       suffix=suffix,
       args=args)
 
+register_extension_info(
+    extension_name = "tf_cc_test_gpu",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_cuda_cc_test(name,
                     srcs=[],
@@ -630,6 +672,11 @@ def tf_cuda_cc_test(name,
       linkopts=linkopts,
       args=args)
 
+register_extension_info(
+    extension_name = "tf_cuda_cc_test",
+    label_regex_for_dep = "{extension_name}",
+)
+
 def tf_cuda_only_cc_test(name,
                     srcs=[],
                     deps=[],
@@ -659,6 +706,11 @@ def tf_cuda_only_cc_test(name,
       }),
       tags=tags + tf_cuda_tests_tags())
 
+register_extension_info(
+    extension_name = "tf_cuda_only_cc_test",
+    label_regex_for_dep = "{extension_name}_gpu",
+)
+
 # Create a cc_test for each of the tensorflow tests listed in "tests"
 def tf_cc_tests(srcs,
                 deps,
@@ -681,7 +733,6 @@ def tf_cc_tests(srcs,
         linkopts=linkopts,
         nocopts=nocopts)
 
-
 def tf_cc_test_mkl(srcs,
                    deps,
                    name="",
@@ -691,7 +742,6 @@ def tf_cc_test_mkl(srcs,
                    args=None):
   if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
 
-
 def tf_cc_tests_gpu(srcs,
                     deps,
                     name="",
@@ -701,7 +751,6 @@ def tf_cc_tests_gpu(srcs,
                     args=None):
   tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)
 
-
 def tf_cuda_cc_tests(srcs,
                      deps,
                      name="",
@@ -733,6 +782,11 @@ def tf_java_test(name,
       *args,
       **kwargs)
 
+register_extension_info(
+    extension_name = "tf_java_test",
+    label_regex_for_dep = "{extension_name}",
+)
+
 def _cuda_copts():
   """Gets the appropriate set of copts for (maybe) CUDA compilation.
 
@@ -751,10 +805,8 @@ def _cuda_copts():
       ]),
   })
 
-
 # Build defs for TensorFlow kernels
 
-
 # When this target is built using --config=cuda, a cc_library is built
 # that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional
 # libraries needed by GPU kernels.
@@ -777,6 +829,10 @@ def tf_gpu_kernel_library(srcs,
       alwayslink=1,
       **kwargs)
 
+register_extension_info(
+    extension_name = "tf_gpu_kernel_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
   """Generate a cc_library with a conditional set of CUDA dependencies.
@@ -810,10 +866,9 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=None, **kwargs):
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_cuda_library",
-    label_regex_for_dep="{extension_name}")
-
-
+    extension_name = "tf_cuda_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_kernel_library(name,
                       prefix=None,
@@ -884,9 +939,9 @@ def tf_kernel_library(name,
       **kwargs)
 
 register_extension_info(
-    extension_name="tf_kernel_library",
-    label_regex_for_dep="{extension_name}(_gpu)?")
-
+    extension_name = "tf_kernel_library",
+    label_regex_for_dep = "{extension_name}(_gpu)?",
+)
 
 def tf_mkl_kernel_library(name,
                           prefix=None,
@@ -924,6 +979,10 @@ def tf_mkl_kernel_library(name,
           nocopts=nocopts
       ))
 
+register_extension_info(
+    extension_name = "tf_mkl_kernel_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 # Bazel rules for building swig files.
 def _py_wrap_cc_impl(ctx):
@@ -957,44 +1016,41 @@ def _py_wrap_cc_impl(ctx):
       progress_message="SWIGing " + src.path)
   return struct(files=depset(outputs))
 
-
 _py_wrap_cc = rule(
-    attrs={
-        "srcs":
-            attr.label_list(
-                mandatory=True,
-                allow_files=True,),
-        "swig_includes":
-            attr.label_list(
-                cfg="data",
-                allow_files=True,),
-        "deps":
-            attr.label_list(
-                allow_files=True,
-                providers=["cc"],),
-        "toolchain_deps":
-            attr.label_list(
-                allow_files=True,),
-        "module_name":
-            attr.string(mandatory=True),
-        "py_module_name":
-            attr.string(mandatory=True),
-        "_swig":
-            attr.label(
-                default=Label("@swig//:swig"),
-                executable=True,
-                cfg="host",),
-        "_swiglib":
-            attr.label(
-                default=Label("@swig//:templates"),
-                allow_files=True,),
+    attrs = {
+        "srcs": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+        "swig_includes": attr.label_list(
+            cfg = "data",
+            allow_files = True,
+        ),
+        "deps": attr.label_list(
+            allow_files = True,
+            providers = ["cc"],
+        ),
+        "toolchain_deps": attr.label_list(
+            allow_files = True,
+        ),
+        "module_name": attr.string(mandatory = True),
+        "py_module_name": attr.string(mandatory = True),
+        "_swig": attr.label(
+            default = Label("@swig//:swig"),
+            executable = True,
+            cfg = "host",
+        ),
+        "_swiglib": attr.label(
+            default = Label("@swig//:templates"),
+            allow_files = True,
+        ),
     },
-    outputs={
+    outputs = {
         "cc_out": "%{module_name}.cc",
         "py_out": "%{py_module_name}.py",
     },
-    implementation=_py_wrap_cc_impl,)
-
+    implementation = _py_wrap_cc_impl,
+)
 
 def _get_repository_roots(ctx, files):
   """Returns abnormal root directories under which files reside.
@@ -1025,7 +1081,6 @@ def _get_repository_roots(ctx, files):
       result[root] -= 1
   return [k for v, k in sorted([(v, k) for k, v in result.items()])]
 
-
 # Bazel rule for collecting the header files that a target depends on.
 def _transitive_hdrs_impl(ctx):
   outputs = depset()
@@ -1033,21 +1088,20 @@ def _transitive_hdrs_impl(ctx):
     outputs += dep.cc.transitive_headers
   return struct(files=outputs)
 
-
 _transitive_hdrs = rule(
-    attrs={
+    attrs = {
         "deps": attr.label_list(
-            allow_files=True,
-            providers=["cc"],),
+            allow_files = True,
+            providers = ["cc"],
+        ),
     },
-    implementation=_transitive_hdrs_impl,)
-
+    implementation = _transitive_hdrs_impl,
+)
 
 def transitive_hdrs(name, deps=[], **kwargs):
   _transitive_hdrs(name=name + "_gather", deps=deps)
   native.filegroup(name=name, srcs=[":" + name + "_gather"])
 
-
 # Create a header only library that includes all the headers exported by
 # the libraries in deps.
 def cc_header_only_library(name, deps=[], includes=[], **kwargs):
@@ -1073,7 +1127,6 @@ def cc_header_only_library(name, deps=[], includes=[], **kwargs):
                     includes=includes,
                     **kwargs)
 
-
 def tf_custom_op_library_additional_deps():
   return [
       "@protobuf_archive//:protobuf_headers",
@@ -1082,7 +1135,6 @@ def tf_custom_op_library_additional_deps():
       clean_dep("//tensorflow/core:framework_headers_lib"),
   ]
 
-
 # Traverse the dependency graph along the "deps" attribute of the
 # target and return a struct with one field called 'tf_collected_deps'.
 # tf_collected_deps will be the union of the deps of the current target
@@ -1096,16 +1148,15 @@ def _collect_deps_aspect_impl(target, ctx):
         alldeps = alldeps | dep.tf_collected_deps
   return struct(tf_collected_deps=alldeps)
 
-
 collect_deps_aspect = aspect(
-    implementation=_collect_deps_aspect_impl, attr_aspects=["deps"])
-
+    attr_aspects = ["deps"],
+    implementation = _collect_deps_aspect_impl,
+)
 
 def _dep_label(dep):
   label = dep.label
   return label.package + ":" + label.name
 
-
 # This rule checks that the transitive dependencies of targets listed
 # in the 'deps' attribute don't depend on the targets listed in
 # the 'disallowed_deps' attribute.
@@ -1122,22 +1173,24 @@ def _check_deps_impl(ctx):
                   disallowed_dep))
   return struct()
 
-
 check_deps = rule(
     _check_deps_impl,
-    attrs={
-        "deps":
-            attr.label_list(
-                aspects=[collect_deps_aspect], mandatory=True,
-                allow_files=True),
-        "disallowed_deps":
-            attr.label_list(mandatory=True, allow_files=True)
-    },)
-
+    attrs = {
+        "deps": attr.label_list(
+            aspects = [collect_deps_aspect],
+            mandatory = True,
+            allow_files = True,
+        ),
+        "disallowed_deps": attr.label_list(
+            mandatory = True,
+            allow_files = True,
+        ),
+    },
+)
 
 # Helper to build a dynamic library (.so) from the sources containing
 # implementations of custom ops and kernels.
-def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
+def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
   cuda_deps = [
       clean_dep("//tensorflow/core:stream_executor_headers_lib"),
       "@local_config_cuda//cuda:cuda_headers",
@@ -1166,13 +1219,17 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[]):
       deps=deps + if_cuda(cuda_deps),
       data=[name + "_check_deps"],
       copts=tf_copts(),
-      linkopts=select({
+      linkopts=linkopts + select({
           "//conditions:default": [
               "-lm",
           ],
           clean_dep("//tensorflow:darwin"): [],
       }),)
 
+register_extension_info(
+    extension_name = "tf_custom_op_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_custom_op_py_library(name,
                             srcs=[],
@@ -1191,18 +1248,16 @@ def tf_custom_op_py_library(name,
       deps=deps,)
 
 register_extension_info(
-    extension_name="tf_custom_op_py_library",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "tf_custom_op_py_library",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_extension_linkopts():
   return []  # No extension link opts
 
-
 def tf_extension_copts():
   return []  # No extension c opts
 
-
 def tf_py_wrap_cc(name,
                              srcs,
                              swig_includes=[],
@@ -1270,19 +1325,39 @@ def tf_py_wrap_cc(name,
           "//conditions:default": [":" + cc_library_name],
       }))
 
-
-def py_test(deps=[], **kwargs):
+# This macro is for running python tests against system installed pip package
+# on Windows.
+#
+# py_test is built as an exectuable python zip file on Windows, which contains all
+# dependencies of the target. Because of the C++ extensions, it would be very
+# inefficient if the py_test zips all runfiles, plus we don't need them when running
+# tests against system installed pip package. So we'd like to get rid of the deps
+# of py_test in this case.
+#
+# In order to trigger the tests without bazel clean after getting rid of deps,
+# we introduce the following :
+# 1. When --define=no_tensorflow_py_deps=true, the py_test depends on a marker
+#    file of the pip package, the test gets to rerun when the pip package change.
+#    Note that this only works on Windows. See the definition of
+#    //tensorflow/tools/pip_package:win_pip_package_marker for specific reasons.
+# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test.
+def py_test(deps=[], data=[], **kwargs):
   native.py_test(
       deps=select({
           "//conditions:default": deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
+          clean_dep("//tensorflow:no_tensorflow_py_deps"): [],
+      }),
+      data = data + select({
+          "//conditions:default": [],
+          clean_dep("//tensorflow:no_tensorflow_py_deps"):
+          ["//tensorflow/tools/pip_package:win_pip_package_marker"],
       }),
       **kwargs)
 
 register_extension_info(
-    extension_name="py_test",
-    label_regex_for_dep="{extension_name}")
-
+    extension_name = "py_test",
+    label_regex_for_dep = "{extension_name}",
+)
 
 def tf_py_test(name,
                srcs,
@@ -1294,10 +1369,13 @@ def tf_py_test(name,
                shard_count=1,
                additional_deps=[],
                flaky=0,
-               xla_enabled=False):
+               xla_enabled=False,
+               grpc_enabled=False):
   if xla_enabled:
     additional_deps = additional_deps + tf_additional_xla_deps_py()
-  native.py_test(
+  if grpc_enabled:
+    additional_deps = additional_deps + tf_additional_grpc_deps_py()
+  py_test(
       name=name,
       size=size,
       srcs=srcs,
@@ -1307,20 +1385,17 @@ def tf_py_test(name,
       visibility=[clean_dep("//tensorflow:internal")],
       shard_count=shard_count,
       data=data,
-      deps=select({
-          "//conditions:default": [
-              clean_dep("//tensorflow/python:extra_py_tests_deps"),
-              clean_dep("//tensorflow/python:gradient_checker"),
+      deps=[
+            clean_dep("//tensorflow/python:extra_py_tests_deps"),
+            clean_dep("//tensorflow/python:gradient_checker"),
           ] + additional_deps,
-          clean_dep("//tensorflow:no_tensorflow_py_deps"): []
-      }),
       flaky=flaky,
       srcs_version="PY2AND3")
 
 register_extension_info(
-    extension_name="tf_py_test",
-    label_regex_map={"deps": "additional_deps:{extension_name}"})
-
+    extension_name = "tf_py_test",
+    label_regex_map = {"additional_deps": "deps:{extension_name}"},
+)
 
 def cuda_py_test(name,
                  srcs,
@@ -1332,7 +1407,8 @@ def cuda_py_test(name,
                  additional_deps=[],
                  tags=[],
                  flaky=0,
-                 xla_enabled=False):
+                 xla_enabled=False,
+                 grpc_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
   tf_py_test(
       name=name,
@@ -1345,12 +1421,13 @@ def cuda_py_test(name,
       shard_count=shard_count,
       additional_deps=additional_deps,
       flaky=flaky,
-      xla_enabled=xla_enabled)
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 register_extension_info(
-    extension_name="cuda_py_test",
-    label_regex_map={"additional_deps": "additional_deps:{extension_name}"})
-
+    extension_name = "cuda_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
 
 def sycl_py_test(name,
                  srcs,
@@ -1362,7 +1439,8 @@ def sycl_py_test(name,
                  additional_deps=[],
                  tags=[],
                  flaky=0,
-                 xla_enabled=False):
+                 xla_enabled=False,
+                 grpc_enabled=False):
   test_tags = tags + tf_sycl_tests_tags()
   tf_py_test(
       name=name,
@@ -1375,12 +1453,13 @@ def sycl_py_test(name,
       shard_count=shard_count,
       additional_deps=additional_deps,
       flaky=flaky,
-      xla_enabled=xla_enabled)
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 register_extension_info(
-    extension_name="sycl_py_test",
-    label_regex_map={"additional_deps": "additional_deps:{extension_name}"})
-
+    extension_name = "sycl_py_test",
+    label_regex_map = {"additional_deps": "additional_deps:{extension_name}"},
+)
 
 def py_tests(name,
              srcs,
@@ -1390,7 +1469,8 @@ def py_tests(name,
              tags=[],
              shard_count=1,
              prefix="",
-             xla_enabled=False):
+             xla_enabled=False,
+             grpc_enabled=False):
   for src in srcs:
     test_name = src.split("/")[-1].split(".")[0]
     if prefix:
@@ -1404,8 +1484,8 @@ def py_tests(name,
         shard_count=shard_count,
         data=data,
         additional_deps=additional_deps,
-        xla_enabled=xla_enabled)
-
+        xla_enabled=xla_enabled,
+        grpc_enabled=grpc_enabled)
 
 def cuda_py_tests(name,
                   srcs,
@@ -1415,7 +1495,8 @@ def cuda_py_tests(name,
                   shard_count=1,
                   tags=[],
                   prefix="",
-                  xla_enabled=False):
+                  xla_enabled=False,
+                  grpc_enabled=False):
   test_tags = tags + tf_cuda_tests_tags()
   py_tests(
       name=name,
@@ -1426,8 +1507,8 @@ def cuda_py_tests(name,
       tags=test_tags,
       shard_count=shard_count,
       prefix=prefix,
-      xla_enabled=xla_enabled)
-
+      xla_enabled=xla_enabled,
+      grpc_enabled=grpc_enabled)
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
 # make the proto_text functions, for the protos passed in <srcs>.
@@ -1451,12 +1532,10 @@ def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs):
       ],)
   return struct(hdrs=out_hdrs, srcs=out_srcs)
 
-
 def tf_genrule_cmd_append_to_srcs(to_append):
   return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append +
           " >> $(@)")
 
-
 def tf_version_info_genrule():
   native.genrule(
       name="version_info_gen",
@@ -1471,7 +1550,6 @@ def tf_version_info_genrule():
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
-
 def tf_py_build_info_genrule():
   native.genrule(
       name="py_build_info_gen",
@@ -1481,10 +1559,14 @@ def tf_py_build_info_genrule():
       local=1,
       tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
 
-
 def cc_library_with_android_deps(deps,
                                  android_deps=[],
                                  common_deps=[],
                                  **kwargs):
   deps = if_not_android(deps) + if_android(android_deps) + common_deps
   native.cc_library(deps=deps, **kwargs)
+
+register_extension_info(
+    extension_name = "cc_library_with_android_deps",
+    label_regex_for_dep = "{extension_name}",
+)
diff --git a/tensorflow/third_party/mpi/mpi.bzl b/tensorflow/third_party/mpi/mpi.bzl
deleted file mode 100644
index 38ce91c4d069fc311d5e7f17a49ff7904c9c67eb..0000000000000000000000000000000000000000
--- a/tensorflow/third_party/mpi/mpi.bzl
+++ /dev/null
@@ -1,17 +0,0 @@
-#OpenMPI and Mvapich/mpich require different headers
-#based on the configuration options return one or the other
-
-def mpi_hdr():
-    MPI_LIB_IS_OPENMPI=True
-    hdrs = []    
-    if MPI_LIB_IS_OPENMPI:
-        hdrs = ["mpi.h", "mpi_portable_platform.h"]   #When using OpenMPI
-    else:
-        hdrs = ["mpi.h",  "mpio.h", "mpicxx.h"]        #When using MVAPICH
-    return hdrs
-
-def if_mpi(if_true, if_false = []):
-    return select({
-        "//tensorflow:with_mpi_support": if_true,
-        "//conditions:default": if_false
-    })
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 3896a21b99f4756239a7ae9f3db9593504845aea..fa0f9b59aa938168cb3d318797c797eeabc9c7d9 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -41,7 +41,17 @@ genrule(
     # every module exported using tf_export. For e.g. if an op is decorated with
     # @tf_export('module1.module2', 'module3'). Then, outs should include
     # api/module1/module2/__init__.py and api/module3/__init__.py.
-    outs = ["api/__init__.py"],
+    outs = [
+        "api/__init__.py",
+        "api/bitwise/__init__.py",
+        "api/contrib/__init__.py",
+        "api/contrib/stat_summarizer/__init__.py",
+        "api/image/__init__.py",
+        "api/linalg/__init__.py",
+        "api/nn/__init__.py",
+        "api/spectral/__init__.py",
+        "api/train/__init__.py",
+    ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
 )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 5f1286aaf6c913cd299ebbfb65949ace0f593417..aab856b723cf2686e8fc9feb156b9be28470fc98 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -107,7 +107,8 @@ def get_api_imports():
   # Import all required modules in their parent modules.
   # For e.g. if we import 'tf.foo.bar.Value'. Then, we also
   # import 'bar' in 'tf.foo'.
-  for dest_module in module_imports.keys():
+  dest_modules = set(module_imports.keys())
+  for dest_module in dest_modules:
     dest_module_split = dest_module.split('.')
     for dest_submodule_index in range(1, len(dest_module_split)):
       dest_submodule = '.'.join(dest_module_split[:dest_submodule_index])
diff --git a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
index da6af3919e96bd6145c33a84aca89c44473ce66c..009d64aed09ddcb47410d6ee6fb42fca42861ddd 100644
--- a/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-config-proto.pbtxt
@@ -46,6 +46,10 @@ tf_class {
     name: "INTRA_OP_PARALLELISM_THREADS_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "ISOLATE_SESSION_STATE_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "LOG_DEVICE_PLACEMENT_FIELD_NUMBER"
     mtype: "<type \'int\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
index ef2cfe3787e02da813ac0173a0fafce844bdbf38..ffe479093397a9bf98d10aa4e054c643e64d5f5d 100644
--- a/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-graph-keys.pbtxt
@@ -54,6 +54,10 @@ tf_class {
     name: "LOSSES"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "METRIC_VARIABLES"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "MODEL_VARIABLES"
     mtype: "<type \'str\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
index 5ad6804a78cbcf4820df5990aba099a607289bc6..2f3e7f1a847dd3609f06b1af535be6f5968edfaf 100644
--- a/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt
@@ -34,6 +34,10 @@ tf_class {
     name: "OUTPUT_PARTITION_GRAPHS_FIELD_NUMBER"
     mtype: "<type \'int\'>"
   }
+  member {
+    name: "REPORT_TENSOR_ALLOCATIONS_UPON_OOM_FIELD_NUMBER"
+    mtype: "<type \'int\'>"
+  }
   member {
     name: "SOFTWARE_TRACE"
     mtype: "<type \'int\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
index d12514fe77845a1502538c0f78355e8eaf3b83a5..42de5c0c80023ad5bd7f33a564780060998307c1 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.data.Dataset"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<class \'abc.abstractproperty\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<class \'abc.abstractproperty\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 002d0c6a9f932dd2a3a687dcbc740fc5a1222218..e2fc8d6cb1d318cc50828f22e8e575cc28c7aaad 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
index e62f6b247ae7a259385aa83d13ffa98fda0124a8..1f9aeb6ad62e1030c6e78f731fb5e05b876899e6 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-iterator.pbtxt
@@ -6,6 +6,10 @@ tf_class {
     name: "initializer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
@@ -16,15 +20,15 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "from_string_handle"
-    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "from_structure"
-    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "get_next"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
index 2b476dab66c8d2b5a475fe1bbf95ce1d3615ebba..9770389e5ef1e29a80ae1da2725d9862f6521ff9 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
index c4c5ac077595c520f0d5a7a0ae8e3cf89472f5de..7263230c1c7182bb812cb2e433aedd415bcd16c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "output_classes"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shapes"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
index cfe09345acccc410ad3041a965901134440e3c77..ca96f4eaece0020235d24901f51306a65676c1c9 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-bernoulli.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
index 2e6578bae1604f69e4697bb4668dd69d94bd68b5..d0508acd9f4f6c190b205301223599cf5b027955 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-beta.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
index d42b0e82e4fab3e30d3ebf1b8bea8b44bb61ea0f..ff0fbb56cd4b9e4c288a168a7c3d9e83c552b0e2 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-categorical.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
index 710164743e851f0bb5c31ebe78b260b623e87378..d75e4a2f88b29ff7f638d72f98876a230b191dce 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet-multinomial.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
index 6cc361672ed8da313e1bebc41fbf093e019d38ad..b838b9ae21decba0323211f08d09fe373ababf23 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-dirichlet.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
index 40ad07d1be4bdea9585eb276debb1fdf3dfff583..6f06b7d50dd9f5f405673d572503ff549f148f33 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-distribution.pbtxt
@@ -55,6 +55,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -71,6 +75,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
index 8f34d25fea873827997ecd9df10cf1b3bfd0e56b..d34f9cde5d4d4161883f6d1b4646f22f054d16ad 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-exponential.pbtxt
@@ -65,6 +65,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -81,6 +85,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
index 0ae88fba3b4fd176641cc17c916181cc9a6a12c6..df268b8d99eb6bf22264ddb63231074413686efa 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-gamma.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
index e7cd595e946cb91f162a2a1af8753e44cdfbc0e1..303dcb4ed3bf8416b822bb010c2e87e8ef03b7c9 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-laplace.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
index 7a4a16ff836a485e65cb6e061e27b92907cb4a63..ecda8acb15c49c390eaae203a0082e78e53499bd 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-multinomial.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
index 14c8c34cc2d8efacec706bdb894d9f069d5e7033..92b9eeea223b488cda1ebcabd31ec808e78fcf70 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-normal.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
index 30db6d3f35c1c8ea7bbc376a20093302dd373bd9..9aa7f9a63465c78f79ae4a8a11bc63d92d027dab 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-student-t.pbtxt
@@ -68,6 +68,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -84,6 +88,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
index 46cbdf225f68e879fd18ef4a07048746a9a71b08..d1b9d3069629c552d6c6048642934f422a13dce7 100644
--- a/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.distributions.-uniform.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "covariance"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'covariance\'], "
   }
+  member_method {
+    name: "cross_entropy"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'cross_entropy\'], "
+  }
   member_method {
     name: "entropy"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'entropy\'], "
@@ -80,6 +84,10 @@ tf_class {
     name: "is_scalar_event"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'is_scalar_event\'], "
   }
+  member_method {
+    name: "kl_divergence"
+    argspec: "args=[\'self\', \'other\', \'name\'], varargs=None, keywords=None, defaults=[\'kl_divergence\'], "
+  }
   member_method {
     name: "log_cdf"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'log_cdf\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ed263f0e20d6fdf7f23a3a2ab06029084d20e4
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BaselineClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61a29942c577a056e94dfe661fa5fec952b4f634
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BaselineRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.baseline.BaselineRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index ef93a61bd84d488be7448294e9ce691bbf9a2dcb..cdc367b99e80104da988172bc25e76c679976b2d 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.estimator"
 tf_module {
+  member {
+    name: "BaselineClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BaselineRegressor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
index 9eb4cb8ce935a314e70866a635ce7248195e0481..018e8c909a23a9e7093c1bb411643d7db629b21c 100644
--- a/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.feature_column.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "categorical_column_with_vocabulary_file"
-    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "categorical_column_with_vocabulary_list"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..07b8d900da5dbd9f2c9396ecaf06b9d22ef50a0b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -0,0 +1,269 @@
+path: "tensorflow.keras.Model"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
+  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'inputs\', \'outputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..546bac44e4c9905d13c4f3b0e3d9c1b5cc6c5e59
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -0,0 +1,294 @@
+path: "tensorflow.keras.Sequential"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.models.Sequential\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
+  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "regularizers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "uses_learning_phase"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'layers\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add"
+    argspec: "args=[\'self\', \'layer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'32\', \'1\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "pop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "predict_classes"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=kwargs, defaults=[\'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict_proba"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'class_weight\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_resnet_v2.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_resnet_v2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..211080c19b72b744e58a15ffb08d594d24e41860
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.inception_resnet_v2.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.keras.applications.inception_resnet_v2"
+tf_module {
+  member_method {
+    name: "InceptionResNetV2"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
+  member_method {
+    name: "decode_predictions"
+    argspec: "args=[\'preds\', \'top\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "preprocess_input"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
index f50dc7d7fe432d80e91c8bbfbd8cfc36b5682fb7..daeb5aad419156a19f929fdd455f6c208cd7390f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.applications"
 tf_module {
+  member {
+    name: "inception_resnet_v2"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "inception_v3"
     mtype: "<type \'module\'>"
@@ -24,6 +28,10 @@ tf_module {
     name: "xception"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "InceptionResNetV2"
+    argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
+  }
   member_method {
     name: "InceptionV3"
     argspec: "args=[\'include_top\', \'weights\', \'input_tensor\', \'input_shape\', \'pooling\', \'classes\'], varargs=None, keywords=None, defaults=[\'True\', \'imagenet\', \'None\', \'None\', \'None\', \'1000\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt
index 57c48df2e365528d8c3812ec502661eb9576e89e..7385af064da4fdee87c3137f6a90057032400bf6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.resnet50.pbtxt
@@ -10,6 +10,6 @@ tf_module {
   }
   member_method {
     name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt
index 29d45daea44cd51ef8bc4590218c3a30a7d9f39f..ba66fba8f3086d40635b9c6a9d519af913155e75 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg16.pbtxt
@@ -10,6 +10,6 @@ tf_module {
   }
   member_method {
     name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt
index 124aa7e5e5dd6f9863790b86bf8c767f21304235..e55a1345b608bc1cf4911e394b9824e74c028d0d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.applications.vgg19.pbtxt
@@ -10,6 +10,6 @@ tf_module {
   }
   member_method {
     name: "preprocess_input"
-    argspec: "args=[\'x\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'x\', \'data_format\', \'mode\'], varargs=None, keywords=None, defaults=[\'None\', \'caffe\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
index 6620a9d308f46cd87cedf482929e75bb5afdbaea..7de4008c4541b9054543927cad167293c5a4cf5c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -29,7 +29,7 @@ tf_class {
   }
   member_method {
     name: "on_train_end"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "set_model"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..791cfda23345fea7df1cfb107ae5dec06354bd48
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.fashion_mnist.pbtxt
@@ -0,0 +1,3 @@
+path: "tensorflow.keras.datasets.fashion_mnist"
+tf_module {
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt
index d4aa436f328487479b81f3bdd26062a339581c0e..36e3aafbe4dbc22fade073b45b2d7495f8f7ec52 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.datasets.pbtxt
@@ -12,6 +12,10 @@ tf_module {
     name: "cifar100"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "fashion_mnist"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "imdb"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index a0906e62cf537b5d1b3c2c86e9b74f85df84022a..8c2b110c6d3d0a12bf8bfde9ac939f66d6f93419 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -191,7 +191,7 @@ tf_class {
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "reccurent_conv"
+    name: "recurrent_conv"
     argspec: "args=[\'self\', \'x\', \'w\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..763184899ca05c39b56e002f1e50ce07210c7409
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -0,0 +1,179 @@
+path: "tensorflow.keras.layers.GRUCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index 92373992548e3ea48ae54d1cad0a81ebd4966b1d..889f2cbc2345e605035b71d69261e92c56aa645f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,14 +1,34 @@
 path: "tensorflow.keras.layers.GRU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
@@ -17,6 +37,10 @@ tf_class {
     name: "graph"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "implementation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -33,6 +57,18 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -65,10 +101,34 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "recurrent_activation"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -77,10 +137,18 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "units"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -91,7 +159,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -137,10 +205,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -159,7 +223,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_output_at"
@@ -181,10 +245,6 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -193,8 +253,4 @@ tf_class {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index b2df5fba8fd748f43a3b88aee0993e1f5262d724..49841237cef52d3b16b498510f7c24744d57b4e9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.InputLayer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.InputLayer\'>"
+  is_instance: "<class \'tensorflow.python.layers.network.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ce7c34f6c75c179442b6d7473281086115f4b64
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -0,0 +1,179 @@
+path: "tensorflow.keras.layers.LSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 20935e2f99a8a7a5054cda50e3b38442a216377f..e1a1d0d58ecbc9a5aa6e1bbde49d92aec9714f42 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,14 +1,34 @@
 path: "tensorflow.keras.layers.LSTM"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
@@ -17,6 +37,10 @@ tf_class {
     name: "graph"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "implementation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -33,6 +57,18 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -65,10 +101,34 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "recurrent_activation"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -77,10 +137,22 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "unit_forget_bias"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "units"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -91,7 +163,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -137,10 +209,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -159,7 +227,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_output_at"
@@ -181,10 +249,6 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -193,8 +257,4 @@ tf_class {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7c9b10f22dfc9799217727e5020d6f45bb488f3
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -0,0 +1,191 @@
+path: "tensorflow.keras.layers.RNN"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cell\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\', \'constants\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index 7867e3c1fd3c670f3973a15047e04fc2aece0f86..f289664ba27063bcceb3b419e99e57066625cdbf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -93,7 +93,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 0fb6e84f8deeb9459d5cce6a4565da61304b6ca5..d78872861253f2f782a79e50e0f0a174464f388a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -93,7 +93,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'depth_multiplier\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'pointwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'pointwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'pointwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10c7f8867cbb979e4e7a724fae41babd81d0a1ea
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -0,0 +1,179 @@
+path: "tensorflow.keras.layers.SimpleRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index f4148fcc2309f77c804fc853b1a0d8fda02d063a..588df21088fffb1ce207132a0cf043f103f71afc 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -1,14 +1,34 @@
 path: "tensorflow.keras.layers.SimpleRNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
@@ -33,6 +53,18 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -65,10 +97,30 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "scope_name"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -77,10 +129,18 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "units"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -91,7 +151,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -137,10 +197,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -159,7 +215,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_output_at"
@@ -181,10 +237,6 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -193,8 +245,4 @@ tf_class {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5779e41342214cc5ec60589d6c3879a79c4a639d
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -0,0 +1,183 @@
+path: "tensorflow.keras.layers.StackedRNNCells"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cells\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index 34c9efb3ca00a3b37fa6f05a4ea58cff89ccbcdf..dedef65ff931618082a4a4d1fdc01e38043ce837 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -9,10 +9,6 @@ tf_class {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "constraints"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 9cee68874a9e32a9aa4c0086a6b473c347446f8c..313b3a9e155c11e46fd70f2fea0d8dec003d6667 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -8,10 +8,6 @@ tf_class {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "constraints"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index 8466c3e0390255c74be92900b40a738b5c4eb0dc..fe336c4be5a84a3764b550ca5ad2fcd1d3b85b94 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -140,6 +140,10 @@ tf_module {
     name: "GRU"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "GRUCell"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "GaussianDropout"
     mtype: "<type \'type\'>"
@@ -208,6 +212,10 @@ tf_module {
     name: "LSTM"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LSTMCell"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Lambda"
     mtype: "<type \'type\'>"
@@ -272,6 +280,10 @@ tf_module {
     name: "Permute"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RNN"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RepeatVector"
     mtype: "<type \'type\'>"
@@ -292,6 +304,10 @@ tf_module {
     name: "SimpleRNN"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SimpleRNNCell"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpatialDropout1D"
     mtype: "<type \'type\'>"
@@ -304,6 +320,10 @@ tf_module {
     name: "SpatialDropout3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "StackedRNNCells"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ThresholdedReLU"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index af9a44086fd618e559d807a98e145c6f1d423156..4e522813a5a3956b4888f95b2f14ecd52d897256 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.models.Model"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Network\'>"
+  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -152,7 +152,7 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\'], "
   }
   member_method {
     name: "evaluate_generator"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 5034fdff2a6bd78e9bad0403d4c33d72c1b766af..ddbb358c84ca50fceb4fb71eddf0083f034f65e1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.models.Sequential\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Network\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Network\'>"
+  is_instance: "<class \'tensorflow.python.layers.network.GraphNetwork\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.topology.Layer\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -153,7 +153,7 @@ tf_class {
   }
   member_method {
     name: "compile"
-    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -173,11 +173,11 @@ tf_class {
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'32\', \'10\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -241,7 +241,7 @@ tf_class {
   }
   member_method {
     name: "predict_classes"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'1\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
   member_method {
     name: "predict_generator"
@@ -253,7 +253,7 @@ tf_class {
   }
   member_method {
     name: "predict_proba"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'1\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
   member_method {
     name: "reset_states"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.pbtxt
index 77cfe33ac47b62341ee277cbee71b29e9259830a..754b3b84b08b08c7d12eba4ddad0a483440055a9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.keras"
 tf_module {
+  member {
+    name: "Model"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sequential"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "activations"
     mtype: "<type \'module\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
index 8ad1f32551dda913cd98ce544d27af63310a6450..66cd37bb3a378ccd1bbdffd79f87338c9b4cf265 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-directory-iterator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.keras.preprocessing.image.DirectoryIterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.DirectoryIterator\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -11,6 +12,10 @@ tf_class {
     name: "next"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "on_epoch_end"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
index d30462a8eb6dfe963ab32a41a5faabcd2b743b74..69488d63bf118272d9b3f62027f10ff1c2dd0eff 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-iterator.pbtxt
@@ -1,11 +1,16 @@
 path: "tensorflow.keras.preprocessing.image.Iterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'n\', \'batch_size\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "on_epoch_end"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
index 841f1c5585e4d8dffb782ddd989b0ba313dc2caa..4ef6e6e99e3b71d4a6e497cc577ef8b42cebab79 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.-numpy-array-iterator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.keras.preprocessing.image.NumpyArrayIterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.NumpyArrayIterator\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.preprocessing.image.Iterator\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.utils.data_utils.Sequence\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
@@ -11,6 +12,10 @@ tf_class {
     name: "next"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "on_epoch_end"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
index 5652687033559a53235056e35906140dab2d0079..d28fef696515e09990d63581de6127fd52c0a4ee 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.preprocessing.image.pbtxt
@@ -34,7 +34,7 @@ tf_module {
   }
   member_method {
     name: "load_img"
-    argspec: "args=[\'path\', \'grayscale\', \'target_size\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    argspec: "args=[\'path\', \'grayscale\', \'target_size\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'nearest\'], "
   }
   member_method {
     name: "random_channel_shift"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
index bf27a97cf25ee1ec64efa1aaeb4b10ed200f81fc..1c5868e711beeeb072e41630f06ba7d9841defbb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.-generator-enqueuer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'generator\', \'use_multiprocessing\', \'wait_time\', \'random_seed\'], varargs=None, keywords=None, defaults=[\'False\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'generator\', \'use_multiprocessing\', \'wait_time\', \'seed\'], varargs=None, keywords=None, defaults=[\'False\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "get"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
index e840f331426c52f01db9d6280204ce3ff34a7db2..5a446c09d0130e173394b02a30f56a5c7ec9c34c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.utils.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "get_file"
     argspec: "args=[\'fname\', \'origin\', \'untar\', \'md5_hash\', \'file_hash\', \'cache_subdir\', \'hash_algorithm\', \'extract\', \'archive_format\', \'cache_dir\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'datasets\', \'auto\', \'False\', \'auto\', \'None\'], "
   }
+  member_method {
+    name: "multi_gpu_model"
+    argspec: "args=[\'model\', \'gpus\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "normalize"
     argspec: "args=[\'x\', \'axis\', \'order\'], varargs=None, keywords=None, defaults=[\'-1\', \'2\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 0d62585ff45b396daf44988a699a308c5a16b772..62e634afb87b9dcc02ab0ceaaa7bdff62f9bfefa 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -72,6 +72,10 @@ tf_module {
     name: "einsum"
     argspec: "args=[\'equation\'], varargs=inputs, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "expm"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "eye"
     argspec: "args=[\'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
@@ -90,7 +94,7 @@ tf_module {
   }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "qr"
diff --git a/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
index 2aab2c4a778049d9ac7bfd2adb5950afa50396f1..e9b996c9f53e9062dcdd39ef22f99eef5175eb35 100644
--- a/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.metrics.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "auc"
     argspec: "args=[\'labels\', \'predictions\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'curve\', \'name\', \'summation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'ROC\', \'None\', \'trapezoidal\'], "
   }
+  member_method {
+    name: "average_precision_at_k"
+    argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "false_negatives"
     argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -64,6 +68,10 @@ tf_module {
     name: "precision"
     argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "precision_at_k"
+    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "precision_at_thresholds"
     argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -84,6 +92,10 @@ tf_module {
     name: "recall_at_thresholds"
     argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "recall_at_top_k"
+    argspec: "args=[\'labels\', \'predictions_idx\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "root_mean_squared_error"
     argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
@@ -104,6 +116,10 @@ tf_module {
     name: "specificity_at_sensitivity"
     argspec: "args=[\'labels\', \'predictions\', \'sensitivity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "true_negatives"
+    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
   member_method {
     name: "true_negatives_at_thresholds"
     argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
index 11637814a6e5591668d9f3594898bd6123b9edd6..d920fef7702aeb716ba53d9edad28e749b11410b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.pbtxt
@@ -54,15 +54,15 @@ tf_module {
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
@@ -70,11 +70,11 @@ tf_module {
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_backprop_filter_v2"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_transpose"
@@ -106,15 +106,15 @@ tf_module {
   }
   member_method {
     name: "depthwise_conv2d_native"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_native_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_native_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "dilation2d"
@@ -170,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'dim\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'1e-12\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
   }
   member_method {
     name: "leaky_relu"
@@ -190,7 +190,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "log_uniform_candidate_sampler"
@@ -234,7 +234,7 @@ tf_module {
   }
   member_method {
     name: "quantized_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "quantized_max_pool"
@@ -282,12 +282,16 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
     argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
   }
+  member_method {
+    name: "softmax_cross_entropy_with_logits_v2"
+    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
+  }
   member_method {
     name: "softplus"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 5646461b24de2cd73eacf89cdf7611d34af70445..bf38f678b69269e0b0a99b7812a9a304d7aaec1d 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.nn.rnn_cell.BasicRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.BasicRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -90,7 +91,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -110,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 2adfc747d1939d38f526487082e9d3e5e9b24eae..ba15ffb792d81177040b078865134b0de7ca7a99 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.nn.rnn_cell.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.GRUCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<type \'object\'>"
@@ -90,7 +91,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'kernel_initializer\', \'bias_initializer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -110,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index bf7bc6a7c1556db1097e518c4d2d3ce26a4ce208..d6a7a2d19f6363c31c11ebc42e8e01f3cfc9a2ee 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -750,7 +750,7 @@ tf_module {
   }
   member_method {
     name: "boolean_mask"
-    argspec: "args=[\'tensor\', \'mask\', \'name\'], varargs=None, keywords=None, defaults=[\'boolean_mask\'], "
+    argspec: "args=[\'tensor\', \'mask\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'boolean_mask\', \'None\'], "
   }
   member_method {
     name: "broadcast_dynamic_shape"
@@ -858,7 +858,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'dtype\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \"<dtype: \'int64\'>\", \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "count_up_to"
@@ -1140,6 +1140,10 @@ tf_module {
     name: "group"
     argspec: "args=[], varargs=inputs, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "guarantee_const"
+    argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "hessians"
     argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], "
@@ -1394,7 +1398,7 @@ tf_module {
   }
   member_method {
     name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "multiply"
@@ -1414,7 +1418,7 @@ tf_module {
   }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "not_equal"
@@ -1546,11 +1550,11 @@ tf_module {
   }
   member_method {
     name: "reduce_all"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_any"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_join"
@@ -1558,27 +1562,27 @@ tf_module {
   }
   member_method {
     name: "reduce_logsumexp"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_mean"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_min"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_prod"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keep_dims\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "register_tensor_conversion_function"
@@ -1706,11 +1710,11 @@ tf_module {
   }
   member_method {
     name: "serialize_many_sparse"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "serialize_sparse"
-    argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
   }
   member_method {
     name: "serialize_tensor"
@@ -1838,15 +1842,15 @@ tf_module {
   }
   member_method {
     name: "sparse_segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_slice"
@@ -2062,7 +2066,7 @@ tf_module {
   }
   member_method {
     name: "while_loop"
-    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "write_file"
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
index 35e49ee9f4a6ee5b4da2b034ece1c1e3b2136254..6af72498d74d4bbc12e7ca68ad1e0a6f0c237e0a 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.tag_constants.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "SERVING"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "TPU"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "TRAINING"
     mtype: "<type \'str\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
index 8c91c5b4d9ef2436dd10a64c2adec261cd4dd282..863beaea4cf05a67e572c97b556bc1eb598d9ced 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index 05d38d62ccda3a336f3f31e682d619ec8515ad3d..0a7aa9b6bc14c95e74ab05a3aeb71b770a918f60 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
index 19ca9f5763715ab6228db76033c80cbb9fbce499..83724fea55d005e9476801feb1bf58cb004aa141 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
index c8144e2db78bf96b7969f71f4776b796f4fb454c..e285b27a0531e00d27941fe451570a5056995c17 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-adam-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
index 5cff6087ef533f6674d6d7f1e0a8be425c16f2ad..fc28577d6ed1328ae85970cf22cc458b7cf54344 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
index bdd4c525685f86f2a57aa7fcbb78b659ee88ba74..bf3c1d81f877e3a8a7e24d5455e9c5bf6a41f764 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
index 7cf5488a15e4832bfda4324739e97f9f5466fe2a..a640c8d2c6366951cbba6a15d2000d9369cbbdbf 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-momentum-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
index 20b0c4d1b565aaba30cd440a7a5480291631a89b..6b33c236a35f09422a42a17b3ffddf5ba7b1595f 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-optimizer.pbtxt
@@ -42,4 +42,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index 571d846b6c5abf53a7570f996c8e59581680adbf..d23fcaed7b4cee397dcf9c51eb3b521e5461c9e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index 1feb136e7f70f0d41c79eeee03fff3663bb4c643..b6c03e71d9ffb50bd6377b489fcc444453bd9752 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index 2aa4ae6d2d20af16eee5ad7dcce84d81b97d8300..4a82db11cb8d85bd0c44135ecaf507c62fae41a1 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -43,4 +43,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
index 915d8501af0ac238b0eb6afd200d9f7c0c432a85..e9131bf544f2e7f08928f46d2be06a00259690be 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -55,4 +55,8 @@ tf_class {
     name: "minimize"
     argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
index 140407651a9827c7250c9008e5eb46122bb4e5f0..ac263580687e53bb3fcffd5268f73f8b67aa43a1 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-worker-session-creator.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\'], "
+    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'1800\'], "
   }
   member_method {
     name: "create_session"
diff --git a/tensorflow/tools/api/golden/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
index e73f6f6e6323c45d0f581efc4c5ae3615859d182..3ffc6407306b4e44ec23052187b6f9376bba833c 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.pbtxt
@@ -234,7 +234,7 @@ tf_module {
   }
   member_method {
     name: "MonitoredTrainingSession"
-    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\'], "
+    argspec: "args=[\'master\', \'is_chief\', \'checkpoint_dir\', \'scaffold\', \'hooks\', \'chief_only_hooks\', \'save_checkpoint_secs\', \'save_summaries_steps\', \'save_summaries_secs\', \'config\', \'stop_grace_period_secs\', \'log_step_count_steps\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'\', \'True\', \'None\', \'None\', \'None\', \'None\', \'600\', \'<object object instance>\', \'<object object instance>\', \'None\', \'120\', \'100\', \'7200\'], "
   }
   member_method {
     name: "NewCheckpointReader"
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 6a27f6bc42fb3205b95384b66cb9d0f29f26fa55..a8fdf4c9a07a21269920c61d7f560562dab7b5f4 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -29,7 +29,6 @@ from __future__ import print_function
 
 import argparse
 from collections import defaultdict
-from operator import attrgetter
 import os
 import re
 import subprocess
@@ -68,7 +67,6 @@ _API_GOLDEN_FOLDER = 'tensorflow/tools/api/golden'
 _TEST_README_FILE = 'tensorflow/tools/api/tests/README.txt'
 _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 
-_ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 _CONVERT_FROM_MULTILINE_SCRIPT = 'tensorflow/tools/api/tests/convert_from_multiline'
 _BASE_API_DIR = 'tensorflow/core/api_def/base_api'
 _PYTHON_API_DIR = 'tensorflow/core/api_def/python_api'
@@ -137,6 +135,16 @@ def _GetHiddenOps():
   return hidden_ops
 
 
+def _GetGoldenApiDefs():
+  old_api_def_files = file_io.get_matching_files(_GetApiDefFilePath('*'))
+  return {file_path: file_io.read_file_to_string(file_path)
+          for file_path in old_api_def_files}
+
+
+def _GetApiDefFilePath(graph_op_name):
+  return os.path.join(_PYTHON_API_DIR, 'api_def_%s.pbtxt' % graph_op_name)
+
+
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -302,6 +310,14 @@ class ApiDefTest(test.TestCase):
       endpoints in base_api_def. Otherwise, returns None.
     """
     endpoint_names_set = set(endpoint_names)
+
+    # If the only endpoint is equal to graph_op_name then
+    # it is equivalent to having no endpoints.
+    if (not base_api_def.endpoint and len(endpoint_names) == 1
+        and endpoint_names[0] ==
+        self._GenerateLowerCaseOpName(base_api_def.graph_op_name)):
+      return None
+
     base_endpoint_names_set = {
         self._GenerateLowerCaseOpName(endpoint.name)
         for endpoint in base_api_def.endpoint}
@@ -349,8 +365,8 @@ class ApiDefTest(test.TestCase):
 
     Args:
       name_to_base_api_def: Map from op name to base api_def_pb2.ApiDef.
-      api_def_map: Map from first op name character (in caps) to
-        api_def_pb2.ApiDefs for Python API overrides.
+      api_def_map: Map from file path to api_def_pb2.ApiDefs for Python API
+        overrides.
     """
     hidden_ops = _GetHiddenOps()
     for hidden_op in hidden_ops:
@@ -363,7 +379,9 @@ class ApiDefTest(test.TestCase):
         api_def = api_def_pb2.ApiDef()
         api_def.graph_op_name = base_api_def.graph_op_name
         api_def.visibility = api_def_pb2.ApiDef.HIDDEN
-        api_def_map[api_def.graph_op_name[0].upper()].op.extend([api_def])
+
+        file_path = _GetApiDefFilePath(base_api_def.graph_op_name)
+        api_def_map[file_path].op.extend([api_def])
 
   @unittest.skipUnless(
       sys.version_info.major == 2 and os.uname()[0] == 'Linux',
@@ -381,8 +399,8 @@ class ApiDefTest(test.TestCase):
     traverse.traverse(tf, public_api_visitor)
     proto_dict = visitor.GetProtos()
 
-    # Map from first character of op name to Python ApiDefs.
-    api_def_map = defaultdict(api_def_pb2.ApiDefs)
+    # Map from file path to Python ApiDefs.
+    new_api_defs_map = defaultdict(api_def_pb2.ApiDefs)
     # We need to override all endpoints even if 1 endpoint differs from base
     # ApiDef. So, we first create a map from an op to all its endpoints.
     op_to_endpoint_name = defaultdict(list)
@@ -410,43 +428,45 @@ class ApiDefTest(test.TestCase):
       graph_op_name = snake_to_camel_graph_op_names[op.__name__]
       api_def = self._CreatePythonApiDef(
           name_to_base_api_def[graph_op_name], endpoint_names)
+
       if api_def:
-        api_defs = api_def_map[graph_op_name[0].upper()]
+        file_path = _GetApiDefFilePath(graph_op_name)
+        api_defs = new_api_defs_map[file_path]
         api_defs.op.extend([api_def])
 
-    self._AddHiddenOpOverrides(name_to_base_api_def, api_def_map)
+    self._AddHiddenOpOverrides(name_to_base_api_def, new_api_defs_map)
 
-    for key in _ALPHABET:
-      # Get new ApiDef for the given key.
-      new_api_defs_str = ''
-      if key in api_def_map:
-        new_api_defs = api_def_map[key]
-        new_api_defs.op.sort(key=attrgetter('graph_op_name'))
-        new_api_defs_str = str(new_api_defs)
+    old_api_defs_map = _GetGoldenApiDefs()
+    for file_path, new_api_defs in new_api_defs_map.items():
+      # Get new ApiDef string.
+      new_api_defs_str = str(new_api_defs)
 
-      # Get current ApiDef for the given key.
-      api_defs_file_path = os.path.join(
-          _PYTHON_API_DIR, 'api_def_%s.pbtxt' % key)
-      old_api_defs_str = ''
-      if file_io.file_exists(api_defs_file_path):
-        old_api_defs_str = file_io.read_file_to_string(api_defs_file_path)
+      # Get current ApiDef for the given file.
+      old_api_defs_str = (
+          old_api_defs_map[file_path] if file_path in old_api_defs_map else '')
 
       if old_api_defs_str == new_api_defs_str:
         continue
 
       if FLAGS.update_goldens:
-        if not new_api_defs_str:
-          logging.info('Deleting %s...' % api_defs_file_path)
-          file_io.delete_file(api_defs_file_path)
-        else:
-          logging.info('Updating %s...' % api_defs_file_path)
-          file_io.write_string_to_file(api_defs_file_path, new_api_defs_str)
+        logging.info('Updating %s...' % file_path)
+        file_io.write_string_to_file(file_path, new_api_defs_str)
       else:
         self.assertMultiLineEqual(
             old_api_defs_str, new_api_defs_str,
             'To update golden API files, run api_compatibility_test locally '
             'with --update_goldens=True flag.')
 
+    for file_path in set(old_api_defs_map) - set(new_api_defs_map):
+      if FLAGS.update_goldens:
+        logging.info('Deleting %s...' % file_path)
+        file_io.delete_file(file_path)
+      else:
+        self.fail(
+            '%s file is no longer needed and should be removed.'
+            'To update golden API files, run api_compatibility_test locally '
+            'with --update_goldens=True flag.' % file_path)
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 2d59299da4d313f4bf8c5174480f355c3575fa30..ecab6f8769ae2d0126f63580030ed6ff756015d0 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -530,7 +530,7 @@ int Main(int argc, char** argv) {
   }
 
   // Capture overall inference time without stat logging overhead. This is the
-  // timing data that can be compared to other libaries.
+  // timing data that can be compared to other libraries.
   SleepSeconds(inter_benchmark_sleep_seconds);
   int64 no_stat_time_us = 0;
   int64 no_stat_num_runs = 0;
@@ -622,7 +622,7 @@ int Main(int argc, char** argv) {
     RecordBenchmarkEntry(output_prefix, benchmark_name, "meta-first-inference",
                          warmup_runs, warmup_time_us / 1000000.0);
 
-    // Time from starting to intialize TF to getting the first result back.
+    // Time from starting to initialize TF to getting the first result back.
     // This also assumes that only one warmup run is performed.
     RecordBenchmarkEntry(
         output_prefix, benchmark_name, "meta-init-plus-first-inference", 1,
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 2d46ccb6b17ac3ab3af49c1649074eda8a840331..7591ecc04efa887ec1d35ba92881386f5a25241d 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
 # /usr/local/cuda
 RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu_clang b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
index 0ecd8c75e036fc18d37882834ed467d0edb096b1..438a7ec532862b9cf6be57ef2712790c35a9f354 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu_clang
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu_clang
@@ -1,8 +1,8 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu14.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Ilya Biryukov <ibiryukov@google.com>"
 
-# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to
+# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
 # /usr/local/cuda
 RUN cp /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md
index 202fcb9101a42336f5f33022c3b8608e53d83dae..f2161b700a0f642dfdb5c33d7d77934c02f14d54 100644
--- a/tensorflow/tools/ci_build/README.md
+++ b/tensorflow/tools/ci_build/README.md
@@ -67,10 +67,10 @@ this UI, to see the logs for a failed build:
     the build tool divided the target into multiple shards or ran the test
     multiple times. Each test log is specific to the shard, run, and attempt.
     To see a specific log:
-    
+
     1.  Click on the log icon that is on the right next to the shard, run,
         and attempt number.
-        
+
     2.  In the grid that appears on the right, click on the specific shard,
         run, and attempt to view its log. You can also type the desired shard,
         run, or attempt number in the field above its grid.
diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 552df1434eab8c4414b8b9a8f7be9c61998d8462..82042b93c02275b51530b306d8cf4519482e5410 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -296,13 +296,11 @@ create_activate_virtualenv_and_install_tensorflow() {
     die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
   fi
 
-  # Verify that virtualenv exists
-  if [[ -z $(which virtualenv) ]]; then
-    die "FAILED: virtualenv not available on path"
-  fi
-
-  virtualenv ${VIRTUALENV_FLAGS} \
-    -p "${PYTHON_BIN_PATH}" "${VIRTUALENV_DIR}" || \
+  # Use the virtualenv from the default python version (i.e., python-virtualenv)
+  # to create the virtualenv directory for testing. Use the -p flag to specify
+  # the python version inside the to-be-created virtualenv directory.
+  ${PYTHON_BIN_PATH} -m virtualenv -p "${PYTHON_BIN_PATH}" ${VIRTUALENV_FLAGS} \
+    "${VIRTUALENV_DIR}" || \
     die "FAILED: Unable to create virtualenv"
 
   source "${VIRTUALENV_DIR}/bin/activate" || \
@@ -345,7 +343,7 @@ do_clean_virtualenv_smoke_test() {
   then
     echo "Smoke test of tensorflow install in clean virtualenv PASSED."
   else
-    echo "Smoke test of tensroflow install in clean virtualenv FAILED."
+    echo "Smoke test of tensorflow install in clean virtualenv FAILED."
     return 1
   fi
 
diff --git a/tensorflow/tools/ci_build/builds/print_build_info.sh b/tensorflow/tools/ci_build/builds/print_build_info.sh
index 7c43419a76ff26be7370326a9113f4e3db2a2b1c..e366abf8bb831688d90a0e3eabed101e42bdaf96 100755
--- a/tensorflow/tools/ci_build/builds/print_build_info.sh
+++ b/tensorflow/tools/ci_build/builds/print_build_info.sh
@@ -88,7 +88,7 @@ fi
 # Print info
 echo "TF_BUILD_INFO = {"\
 "container_type: \"${CONTAINER_TYPE}\", "\
-"command: \"${COMMAND[@]}\", "\
+"command: \"${COMMAND[*]}\", "\
 "source_HEAD: \"${TF_HEAD}\", "\
 "source_remote_origin: \"${TF_FETCH_URL}\", "\
 "OS: \"${OS}\", "\
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index 4f1c61b8e9a799712e2e9def88868b44f3393325..caa3a40817c80b27271f76de0a95a743cb2916f6 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -76,17 +76,17 @@ echo "PYTHON_BIN_PATH: ${PYTHON_BIN_PATH}"
 
 pushd "${TMP_DIR}"
 
-# Obtain paths include and lib paths to the TensorFlow installation
-TF_INC=$("${PYTHON_BIN_PATH}" \
-         -c 'import tensorflow as tf; print(tf.sysconfig.get_include())')
-TF_LIB=$("${PYTHON_BIN_PATH}" \
-         -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())')
-
-if [[ -z "${TF_INC}" ]]; then
-  die "FAILED to determine TensorFlow include path"
+# Obtain compilation and linking flags
+TF_CFLAGS=( $("${PYTHON_BIN_PATH}" \
+	      -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
+TF_LFLAGS=( $("${PYTHON_BIN_PATH}" \
+	      -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
+
+if [[ -z "${TF_CFLAGS[*]}" || -z "${TF_LFLAGS[*]}" ]]; then
+  die "FAILED to determine TensorFlow compilation or linking flags"
 else
-  echo "TensorFlow include path: ${TF_INC}"
-  TF_INCLUDE_PATH="-I${TF_INC} -I${TF_INC}/external/nsync/public"
+  echo "TensorFlow compile flags: ${TF_CFLAGS[*]}"
+  echo "TensorFlow link flags: ${TF_LFLAGS[*]}"
 fi
 
 # Check g++ availability
@@ -145,7 +145,7 @@ if [[ ${IS_GPU} == "0" ]]; then
 
   "${GPP_BIN}" -std=c++11 ${EXTRA_GPP_FLAGS} \
     -shared "${SRC_FILE}" -o "${USER_OP_SO}" \
-    -fPIC ${TF_INCLUDE_PATH} -L "${TF_LIB}" -ltensorflow_framework  || \
+    -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]}  || \
     die "g++ compilation of ${SRC_FILE} FAILED"
 
 else
@@ -184,7 +184,7 @@ else
   OP_KERNEL_O=$(echo "${OP_KERNEL_CC}" | sed -e 's/\.cc/\.o/')
   "${NVCC_BIN}" -std=c++11 \
       -c -o "${OP_KERNEL_O}" "${OP_KERNEL_CU}" \
-      ${TF_INCLUDE_PATH} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC || \
+      ${TF_CFLAGS[@]} -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC || \
       die "nvcc compilation of ${OP_KERNEL_CC} FAILED"
 
   CUDA_LIB_DIR="/usr/local/cuda/lib64"
@@ -203,8 +203,8 @@ else
   USER_OP_SO="add_one.so"
   "${GPP_BIN}" -std=c++11 ${EXTRA_GPP_FLAGS} \
       -shared -o "${USER_OP_SO}" "${OP_KERNEL_CC}" \
-      "${OP_KERNEL_O}" ${TF_INCLUDE_PATH} -L "${CUDA_LIB_DIR}" -L "${TF_LIB}" \
-      -fPIC -lcudart -ltensorflow_framework || \
+      "${OP_KERNEL_O}" ${TF_CFLAGS[@]} -L "${CUDA_LIB_DIR}" ${TF_LFLAGS[@]} \
+      -fPIC -lcudart || \
       die "g++ compilation of ${OP_KERNEL_CC}" FAILED
 fi
 
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 7a1479c150488dc0c58b5cdc67a9526d3f04bb64..2217b110e3f4e5dd2a212fe0cb65ac9f46ce943a 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -147,6 +147,38 @@ BAZEL_TARGET="//tensorflow/... -//tensorflow/compiler/..."
 
 if [[ -n "$TF_SKIP_CONTRIB_TESTS" ]]; then
   BAZEL_TARGET="$BAZEL_TARGET -//tensorflow/contrib/..."
+else
+  BAZEL_TARGET="${BAZEL_TARGET} -//tensorflow/contrib/lite/..."
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:context_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:framework"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:interpreter_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:model_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/toco:toco"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:simple_memory_arena_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:string_util_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:activations_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:add_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:basic_rnn_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:concatenation_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:conv_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:depthwise_conv_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:embedding_lookup_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:embedding_lookup_sparse_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:fully_connected_test"
+  # BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/testing:generated_examples_zip_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:hashtable_lookup_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:local_response_norm_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:lsh_projection_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:lstm_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:l2norm_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:mul_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:pooling_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:reshape_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:resize_bilinear_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:skip_gram_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:softmax_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:space_to_depth_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:svdf_test"
 fi
 
 TUT_TEST_DATA_DIR="/tmp/tf_tutorial_test_data"
@@ -201,13 +233,13 @@ function get_cuda_capability_version() {
 # Container type, e.g., CPU, GPU
 CTYPE=${TF_BUILD_CONTAINER_TYPE}
 
-# Determine if Docker is available
+# Determine if the machine is a Mac
 OPT_FLAG=""
-if [[ -z "$(which docker)" ]]; then
+if [[ "$(uname -s)" == "Darwin" ]]; then
   DO_DOCKER=0
 
-  echo "It appears that Docker is not available on this system. "\
-"Will perform build without Docker."
+  echo "It appears this machine is a Mac. "\
+"We will perform this build without Docker."
   echo "Also, the additional option flags will be applied to the build:"
   echo "  ${NO_DOCKER_OPT_FLAG}"
   MAIN_CMD="${NO_DOCKER_MAIN_CMD} ${CTYPE}"
@@ -514,8 +546,9 @@ echo ""
 
 TMP_DIR=""
 DOCKERFILE_FLAG=""
-if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then
-  # Modify Dockerfile for Python3.5 build
+if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]] ||
+  [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.6" ]]; then
+  # Modify Dockerfile for Python3.5 | Python3.6 build
   TMP_DIR=$(mktemp -d)
   echo "Docker build will occur in temporary directory: ${TMP_DIR}"
 
@@ -531,10 +564,10 @@ if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then
 
   # Replace a line in the Dockerfile
   if sed -i \
-      's/RUN \/install\/install_pip_packages.sh/RUN \/install\/install_python3.5_pip_packages.sh/g' \
+      "s/RUN \/install\/install_pip_packages.sh/RUN \/install\/install_${TF_BUILD_PYTHON_VERSION}_pip_packages.sh/g" \
       "${DOCKERFILE}"
   then
-    echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}"
+    echo "Copied and modified Dockerfile for ${TF_BUILD_PYTHON_VERSION} build: ${DOCKERFILE}"
   else
     die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}"
   fi
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index f1c207f9b686a77d92f2df52faaf7da4f55c5d31..4021d794b6c4aa171c041d1d6da8ce5b6a1f6a67 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -98,7 +98,9 @@ do_pylint() {
 "^tensorflow/contrib/eager/python/evaluator\.py.*\[E0202.*method-hidden "\
 "^tensorflow/contrib/eager/python/metrics_impl\.py.*\[E0202.*method-hidden "\
 "^tensorflow/python/platform/gfile\.py.*\[E0301.*non-iterator "\
-"^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable"
+"^tensorflow/python/keras/_impl/keras/callbacks\.py.*\[E1133.*not-an-iterable "\
+"^tensorflow/python/keras/_impl/keras/layers/recurrent\.py.*\[E0203.*access-member-before-definition "\
+"^tensorflow/python/kernel_tests/constant_op_eager_test.py.*\[E0303.*invalid-length-returned"
 
   echo "ERROR_WHITELIST=\"${ERROR_WHITELIST}\""
 
@@ -400,9 +402,14 @@ cmd_status(){
 }
 
 # Run bazel build --nobuild to test the validity of the BUILD files
+# TODO(mikecase): Remove TF Lite exclusion from this list. Exclusion is
+# necessary since the @androidsdk WORKSPACE dependency is commented
+# out by default in TF WORKSPACE file.
 do_bazel_nobuild() {
   BUILD_TARGET="//tensorflow/..."
-  BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} ${BUILD_TARGET}"
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/src/main/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/schema/..."
+  BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} -- ${BUILD_TARGET}"
 
   ${BUILD_CMD}
 
diff --git a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
index 6e7b752c06f43fe7f8fa26bd52a28ed33f38edd8..cfeaebdbf57c01fef7cd81dae76217429336d0ff 100755
--- a/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
+++ b/tensorflow/tools/ci_build/gpu_build/parallel_gpu_execute.sh
@@ -45,7 +45,7 @@ for i in `seq 0 $((TF_GPU_COUNT-1))`; do
       # This export only works within the brackets, so it is isolated to one
       # single command.
       export CUDA_VISIBLE_DEVICES=$i
-      echo "Running test $@ on GPU $CUDA_VISIBLE_DEVICES"
+      echo "Running test $* on GPU $CUDA_VISIBLE_DEVICES"
       $@
     )
     return_code=$?
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 4ab307c9253a8019f2c794b696db030722751770..96408105339d9a3e21aecb3bae9894551f8b6811 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -48,6 +48,7 @@ apt-get install -y --no-install-recommends \
     git \
     libcurl4-openssl-dev \
     libtool \
+    libssl-dev \
     mlocate \
     openjdk-8-jdk \
     openjdk-8-jre-headless \
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index 55c1674495f099c7b6eb9484f1ccf195cea4b2ee..e1edd62cc505654b7266c212822561188bbc701c 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
 
 set -ex
 
-GOLANG_URL="https://storage.googleapis.com/golang/go1.9.1.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.9.2.linux-amd64.tar.gz"
 
 sudo mkdir -p /usr/local
 wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 352af871082ed7d88bda0067b268e90c2bb88615..da58ac2407a847ed5b57c949a69d3890fe4df4cf 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -27,10 +27,17 @@ easy_install3 -U pip
 pip2 install wheel
 pip3 install wheel
 
+pip2 install virtualenv
+pip3 install virtualenv
+
 # Install six.
 pip2 install --upgrade six==1.10.0
 pip3 install --upgrade six==1.10.0
 
+# Install absl-py.
+pip2 install --upgrade absl-py
+pip3 install --upgrade absl-py
+
 # Install werkzeug.
 pip2 install --upgrade werkzeug==0.11.10
 pip3 install --upgrade werkzeug==0.11.10
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e452c50221bee4385db27856f05c539798e2ba53..9881bd99c35b29920c6db21b572d1956eb497dae 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -18,33 +18,12 @@
 # TODO(cais): Remove this file once we upgrade to ubuntu:16.04 docker images for
 # Python 3.5 builds.
 
+# LINT.IfChange
+
 # fkrull/deadsnakes is for Python3.5
 add-apt-repository -y ppa:fkrull/deadsnakes
 apt-get update
 
-set +e
-# Upgrade swig to 3.0.8
-SWIG_VERSION="3.0.8"
-swig_ver_flat=$(echo $SWIG_VERSION | sed 's/\.//g' | sed 's/^0*//g')
-local_swig_ver=$(swig -version | grep -i version | awk '{print $3}')
-local_swig_ver_flat=$(echo $local_swig_ver | sed 's/\.//g' | sed 's/^0*//g')
-if [[ -z $local_swig_ver_flat ]]; then
-  local_swig_ver_flat=0
-fi
-if (( $local_swig_ver_flat < $swig_ver_flat )); then
-  set -e
-  wget -q http://downloads.sourceforge.net/swig/swig-3.0.8.tar.gz
-  tar xzf swig-3.0.8.tar.gz
-  pushd swig-3.0.8
-  apt-get install -y --no-install-recommends libpcre3-dev
-  ./configure
-  make
-  make install
-  rm -f /usr/bin/swig
-  ln -s /usr/local/bin/swig /usr/bin/swig
-  popd
-  rm -rf swig-3.0.8 swig-3.0.8.tar.gz
-fi
 set -e
 # Install Python 3.5 and dev library
 apt-get install -y --no-install-recommends python3.5 libpython3.5-dev
@@ -60,7 +39,10 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
+pip3.5 install --upgrade virtualenv
+
 # Install six.
+pip3.5 install --upgrade absl-py
 pip3.5 install --upgrade six==1.10.0
 
 # Install protobuf.
@@ -91,3 +73,5 @@ pip3.5 install portpicker
 pip3.5 install werkzeug
 
 pip3.5 install grpcio
+
+# LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1ca12c6c608858d78a696eed69da1ad1037de364
--- /dev/null
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Install packages required by Python3.6 build
+
+# TODO(amitpatankar): Remove this file once we upgrade to ubuntu:16.04
+# docker images for Python 3.6 builds.
+
+# LINT.IfChange
+
+# fkrull/deadsnakes is for Python3.6
+add-apt-repository -y ppa:fkrull/deadsnakes
+apt-get update
+
+set -e
+# Install Python 3.6 and dev library
+wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tar.xz
+tar xvf Python-3.6.1.tar.xz
+cd Python-3.6.1
+
+./configure
+make altinstall
+pip3.6 -V
+which pip3.6
+ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
+
+pip3 install --upgrade virtualenv
+
+set -e
+# Install six.
+pip3 install --upgrade absl-py
+pip3 install --upgrade six==1.10.0
+
+# Install protobuf.
+pip3 install --upgrade protobuf==3.3.0
+
+# Remove obsolete version of six, which can sometimes confuse virtualenv.
+rm -rf /usr/lib/python3/dist-packages/six*
+
+# Install numpy, scipy and scikit-learn required by the builds
+
+# numpy needs to be installed from source to fix segfaults. See:
+# https://github.com/tensorflow/tensorflow/issues/6968
+# This workaround isn't needed for Ubuntu 16.04 or later.
+pip3 install --no-binary=:all: --upgrade numpy==1.12.0
+
+pip3 install scipy==0.18.1
+
+pip3 install scikit-learn==0.18.1
+
+# pandas required by `inflow`
+pip3 install pandas==0.19.2
+
+# Install recent-enough version of wheel for Python 3.6 wheel builds
+pip3 install wheel==0.29.0
+
+pip3 install portpicker
+
+pip3 install werkzeug
+
+pip3 install grpcio
+
+# LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index 5de5a379ac829c20d2f60f1b5323f375c6c69017..df6016504cec19e02af988e87733fc409cef6826 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -33,4 +33,35 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
     --test_output=errors -- \
-    //tensorflow/contrib/...
+    //tensorflow/contrib/... \
+    -//tensorflow/contrib/lite/... \
+    //tensorflow/contrib/lite:context_test \
+    //tensorflow/contrib/lite:framework \
+    //tensorflow/contrib/lite:interpreter_test \
+    //tensorflow/contrib/lite:model_test \
+    //tensorflow/contrib/lite/toco:toco \
+    //tensorflow/contrib/lite:simple_memory_arena_test \
+    //tensorflow/contrib/lite:string_util_test \
+    //tensorflow/contrib/lite/kernels:activations_test \
+    //tensorflow/contrib/lite/kernels:add_test \
+    //tensorflow/contrib/lite/kernels:basic_rnn_test \
+    //tensorflow/contrib/lite/kernels:concatenation_test \
+    //tensorflow/contrib/lite/kernels:conv_test \
+    //tensorflow/contrib/lite/kernels:depthwise_conv_test \
+    //tensorflow/contrib/lite/kernels:embedding_lookup_test \
+    //tensorflow/contrib/lite/kernels:embedding_lookup_sparse_test \
+    //tensorflow/contrib/lite/kernels:fully_connected_test \
+    //tensorflow/contrib/lite/testing:generated_examples_zip_test \
+    //tensorflow/contrib/lite/kernels:hashtable_lookup_test \
+    //tensorflow/contrib/lite/kernels:local_response_norm_test \
+    //tensorflow/contrib/lite/kernels:lsh_projection_test \
+    //tensorflow/contrib/lite/kernels:lstm_test \
+    //tensorflow/contrib/lite/kernels:l2norm_test \
+    //tensorflow/contrib/lite/kernels:mul_test \
+    //tensorflow/contrib/lite/kernels:pooling_test \
+    //tensorflow/contrib/lite/kernels:reshape_test \
+    //tensorflow/contrib/lite/kernels:resize_bilinear_test \
+    //tensorflow/contrib/lite/kernels:skip_gram_test \
+    //tensorflow/contrib/lite/kernels:softmax_test \
+    //tensorflow/contrib/lite/kernels:space_to_depth_test \
+    //tensorflow/contrib/lite/kernels:svdf_test
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
index df196f829cd920b538fd0032950a9282c3043617..ac83e90f766aab1769fc920d2938f3607aabc786 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_cc_core.sh
@@ -28,6 +28,8 @@ echo ""
 export PYTHON_BIN_PATH=`which python3`
 
 export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
index abd256a895ea751f84ec946a85a4331fe5b23440..6b80f44729b2a7d30bb754e07728ce4614b7cb16 100755
--- a/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/gpu/run_py3_core.sh
@@ -28,6 +28,8 @@ echo ""
 export PYTHON_BIN_PATH=`which python3`
 
 export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
index dcda8228bc20fdd67756508e19b12392cb2000d7..e5d8303c6e5534464bc0c91a09e7a6686b19c33f 100755
--- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
+++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh
@@ -48,6 +48,6 @@ ${DOCKER_BINARY} run \
   -e "TF_NEED_GCP=0" \
   -e "TF_NEED_HDFS=0" \
   -e "TF_NEED_CUDA=${TF_NEED_CUDA}" \
-  -e "TF_NEED_OPENCL=0" \
+  -e "TF_NEED_OPENCL_SYCL=0" \
   "${DOCKER_IMAGE}" \
   "/workspace/tensorflow/tools/ci_build/linux/libtensorflow.sh"
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index 8042522ef835cefd36986144ccec0f876aa3b483..ddaaddc9179ab640ce5b09b4d8732944b8177f8a 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -34,4 +34,4 @@ bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
-    //tensorflow/contrib/...
+    //tensorflow/contrib/... -//tensorflow/contrib/lite/...
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
index d90a1b905d91415dda576c5dc71df2f41502fa9d..e1b56b9a25f663737ffe0991882f6e5e753265ed 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_cpu.sh
@@ -27,7 +27,7 @@ export PYTHON_BIN_PATH="/usr/bin/python"
 export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
 export TF_NEED_CUDA=0
-export TF_NEED_OPENCL=0
+export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
diff --git a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
index 79973647c11fffb1907b7f39fe5f43a3fb450b5b..5a901af3e5c77ed153b5ff5a9c5f9463620f7dca 100755
--- a/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/osx/libtensorflow_gpu.sh
@@ -28,7 +28,7 @@ export LD_LIBRARY_PATH="/usr/local/cuda/lib:/usr/local/cuda/extras/CUPTI/lib:${L
 export PYTHON_BIN_PATH="/usr/bin/python"
 export TF_NEED_GCP=0
 export TF_NEED_HDFS=0
-export TF_NEED_OPENCL=0
+export TF_NEED_OPENCL_SYCL=0
 export TF_NEED_MKL=0
 export COMPUTECPP_PATH="/usr/local"
 
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 5244898c40049063a40388ba65d4a2d1d761e0bd..1bd1852ffc570166ecc6efca1420bc54d702ed89 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -75,17 +75,24 @@ if [[ $1 == "PI_ONE" ]]; then
   PI_COPTS="--copt=-march=armv6 --copt=-mfpu=vfp
   --copt=-DUSE_GEMM_FOR_CONV --copt=-DUSE_OPENBLAS
   --copt=-isystem --copt=${OPENBLAS_INSTALL_PATH}/include/
+  --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
   --linkopt=-L${OPENBLAS_INSTALL_PATH}/lib/
   --linkopt=-l:libopenblas.a"
   echo "Building for the Pi One/Zero, with no NEON support"
 else
   PI_COPTS='--copt=-march=armv7-a --copt=-mfpu=neon-vfpv4
+  --copt=-std=gnu11 --copt=-DS_IREAD=S_IRUSR --copt=-DS_IWRITE=S_IWUSR
+  --copt=-O3
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
   --copt=-U__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8'
   echo "Building for the Pi Two/Three, with NEON acceleration"
 fi
 
+# We need to pass down the environment variable with a possible alternate Python
+# include path for Python 3.x builds to work.
+export CROSSTOOL_PYTHON_INCLUDE_PATH
+
 cd ${WORKSPACE_PATH}
 bazel build -c opt ${PI_COPTS} \
   --config=monolithic \
diff --git a/tensorflow/tools/ci_build/remote/remote_docker_build.sh b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
index 3ac6840f4e7a881da4ab973a7fadd921ed288828..e00a66aabaf1068c772aabce2391616518be44d4 100755
--- a/tensorflow/tools/ci_build/remote/remote_docker_build.sh
+++ b/tensorflow/tools/ci_build/remote/remote_docker_build.sh
@@ -124,7 +124,7 @@ function build_tf_image {
 
 
 function publish_tf_image {
-  $gcr_tf_image="gcr.io/tensorflow/${tf_image}"
+  gcr_tf_image="gcr.io/tensorflow/${tf_image}"
   docker tag $tf_image $gcr_tf_image
   gcloud docker -- push $gcr_tf_image
 }
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 6a8b6417d65a87870407b78ab9082faa4c1361d6..8d50250c3a306cc6da5e99861a81233ac3b761ae 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -88,7 +88,7 @@ extra_failing_gpu_cc_tests="\
     //tensorflow/core:cuda_libdevice_path_test + \
     //tensorflow/core:common_runtime_direct_session_test + \
     //tensorflow/core:common_runtime_direct_session_with_tracking_alloc_test + \
-    //tensorflow/core:gpu_tracer_test + \
+    //tensorflow/core:device_tracer_test + \
     //tensorflow/core:ops_math_grad_test \
 "
 
@@ -96,10 +96,6 @@ exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}"
 
 exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
 
-function clean_output_base() {
-  bazel clean --expunge
-}
-
 function run_configure_for_cpu_build {
   # Due to a bug in Bazel: https://github.com/bazelbuild/bazel/issues/2182
   # yes "" | ./configure doesn't work on Windows, so we set all the
@@ -115,9 +111,9 @@ function run_configure_for_cpu_build {
     export TF_NEED_MKL=0
   fi
   export TF_NEED_VERBS=0
-  export TF_NEED_GCP=0
+  export TF_NEED_GCP=1
   export TF_NEED_HDFS=0
-  export TF_NEED_OPENCL=0
+  export TF_NEED_OPENCL_SYCL=0
   echo "" | ./configure
 }
 
@@ -141,7 +137,7 @@ function run_configure_for_gpu_build {
   export TF_NEED_MKL=0
   export TF_NEED_GCP=0
   export TF_NEED_HDFS=0
-  export TF_NEED_OPENCL=0
+  export TF_NEED_OPENCL_SYCL=0
 
   # TODO(pcloudy): Remove this after TensorFlow uses its own CRSOOTOOL
   # for GPU build on Windows
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 4a653698a2d7c12ce59a53bf96e1551a633f7cab..f88e7176f0803dab98efd4f9f2ca5fd8757a7272 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -36,12 +36,6 @@ export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 export PYTHON_BIN_PATH="C:/Program Files/Anaconda3/python.exe"
 export PYTHON_LIB_PATH="C:/Program Files/Anaconda3/lib/site-packages"
 
-# Set Python path for cc_configure.bzl
-export BAZEL_PYTHON="C:/Program Files/Anaconda3/python.exe"
-
-# Set Visual Studio path
-export BAZEL_VS="C:/Program Files (x86)/Microsoft Visual Studio 14.0"
-
 # Add python into PATH, it's needed because gen_git_source.py uses
 # '/usr/bin/env python' as a shebang
 export PATH="/c/Program Files/Anaconda3:$PATH"
@@ -53,13 +47,3 @@ export PATH="/c/Program Files/Anaconda3/Scripts:$PATH"
 export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/bin:$PATH"
 export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v8.0/extras/CUPTI/libx64:$PATH"
 export PATH="/c/tools/cuda/bin:$PATH"
-
-# Set the common build options on Windows
-export BUILD_OPTS='--config=monolithic --copt=-w --host_copt=-w --verbose_failures --experimental_ui'
-
-# Build TF with wrapper-less CROSSTOOL
-# TODO(pcloudy): Remove this after wrapper-less CROSSTOOL becomes default
-export NO_MSVC_WRAPPER=1
-
-export USE_DYNAMIC_CRT=1
-
diff --git a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
index 8c419347d6f4b3af2e47bb96f246dc7281a92364..748a961e44c5429664e37a1456adcf02a56fa3d4 100644
--- a/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/bazel/run_cc_test_windows.sh
@@ -42,8 +42,6 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
-clean_output_base
-
 run_configure_for_cpu_build
 
 # Compliling the following test is extremely slow with -c opt
@@ -54,5 +52,5 @@ passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/co
   # We need to strip \r so that the result could be store into a variable under MSYS
   tr '\r' ' ')
 
-bazel test $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt $BUILD_OPTS -k $passing_tests --test_output=errors
+bazel test -k $slow_compiling_test --test_output=errors
+bazel test -c opt -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
index 6e600e2dcfb8380690764d43c4b731a8da6b5dc4..56bff077746b8195a93b6ab8d7ce707b06549daa 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_build.bat
@@ -37,4 +37,4 @@ SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 %CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY%
 
 :: Run msbuild in the resulting VS project files to build a pip package.
-%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 /verbosity:minimal tf_python_build_pip_package.vcxproj
\ No newline at end of file
+%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 2f6d53e171ce5236ca48a07756335ac723f88381..3c3b223a0044b7136ea4dee20fa72cd2fed3742a 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -37,6 +37,9 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index f6e3d2e6c716178609b4aeb7e25d4dc12ac12f34..31b4226a301e536ea43f9da30006feef7ec60d5d 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_cpu_build
 
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -60,11 +58,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows \
-  --build_tag_filters=-no_pip,-no_windows --build_tests_only \
-  --test_env=TF_SAVER_LENIENT_NAMES=True \
+  --test_tag_filters=-no_pip,-no_windows,-no_oss \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
   //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
index 3fd960deabbb0ace8c9598589f9f9a72fd09b3a9..f26f8727e51bf0247578c1cdfaa67e1b0f7f299d 100644
--- a/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/bazel/run_cc_test_windows.sh
@@ -56,5 +56,5 @@ passing_tests=$(bazel query "kind(cc_test, //tensorflow/cc/... + //tensorflow/co
 
 # TODO(pcloudy): There is a bug in Bazel preventing build with GPU support without -c opt
 # Re-enable this test after it is fixed.
-# bazel test --config=win-cuda $BUILD_OPTS -k $slow_compiling_test --test_output=errors
-bazel test -c opt --config=win-cuda $BUILD_OPTS -k $passing_tests --test_output=errors
+# bazel test --config=win-cuda -k $slow_compiling_test --test_output=errors
+bazel test -c opt --config=win-cuda -k $passing_tests --test_output=errors
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index 44d8252a7a9b30b21097de13252e9f3a8af5b4cb..832943ad6c82855a76be0782c5332fb8e0f202b6 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -38,4 +38,4 @@ SET MSBUILD_EXE="C:\Program Files (x86)\MSBuild\14.0\Bin\msbuild.exe"
 %CMAKE_EXE% %CMAKE_DIR% -A x64 -DSWIG_EXECUTABLE=%SWIG_EXE% -DPYTHON_EXECUTABLE=%PY_EXE% -DCMAKE_BUILD_TYPE=Release -DPYTHON_LIBRARIES=%PY_LIB% -Dtensorflow_BUILD_PYTHON_TESTS=%BUILD_PYTHON_TESTS% -Dtensorflow_BUILD_CC_TESTS=%BUILD_CC_TESTS% -Dtensorflow_ENABLE_GPU=ON -DCUDNN_HOME=%CUDNN_HOME% -Dtensorflow_TF_NIGHTLY=%TF_NIGHTLY%
 
 :: Run msbuild in the resulting VS project files to build a pip package.
-%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 /verbosity:minimal tf_python_build_pip_package.vcxproj
+%MSBUILD_EXE% /p:Configuration=Release /maxcpucount:32 tf_python_build_pip_package.vcxproj
diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
index 02e24c85de1deb6d7df2e16200cfed7e6d582c28..b537192a945b2a2d8c2df940b947c6c0f7d6fc06 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_py.bat
@@ -37,6 +37,9 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 25d327c8188666e34477daa0e888a9169c709c66..922bb67bbf6ce34f55acad6d3399bd810032abd0 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -44,9 +44,7 @@ source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
 
 run_configure_for_gpu_build
 
-clean_output_base
-
-bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
@@ -61,11 +59,8 @@ reinstall_tensorflow_pip ${PIP_NAME}
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 # GPU tests are very flaky when running concurrently, so set local_test_jobs=1
-# TODO(pcloudy): Remove TF_SAVER_LENIENT_NAMES after
-# https://github.com/tensorflow/tensorflow/issues/12844 is fixed.
-bazel test -c opt $BUILD_OPTS -k --test_output=errors \
+bazel test -c opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
-  --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu \
-  --test_env=TF_SAVER_LENIENT_NAMES=True \
+  --test_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
+  --build_tag_filters=-no_pip,-no_windows,-no_windows_gpu,-no_gpu,-no_pip_gpu,no_oss \
   --local_test_jobs=1 --build_tests_only //${PY_TEST_DIR}/tensorflow/python/...
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
index 9ac3613f27e1bc96501490b7610f047785b9ada2..80f2b590c9428b19822952d8b72ca9f0a1359a50 100755
--- a/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_cpu.sh
@@ -44,13 +44,12 @@ export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:clic
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/java:libtensorflow_jni.so"
 export TF_BAZEL_TARGETS="${TF_BAZEL_TARGETS} //tensorflow/tools/lib_package:jnilicenses_generate"
 
-clean_output_base
 run_configure_for_cpu_build
 
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt ${BUILD_OPTS} \
+bazel build -c opt \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
index a94a627dfb632cf01518c2022fd01b168afb4a7e..88333de856a21b3faeb49f4d88c290ca89288a6e 100755
--- a/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
+++ b/tensorflow/tools/ci_build/xla/linux/gpu/run_py3.sh
@@ -28,6 +28,8 @@ echo ""
 export PYTHON_BIN_PATH=`which python3`
 
 export TF_NEED_CUDA=1
+export TF_CUDA_VERSION=8.0
+export TF_CUDNN_VERSION=6
 export TF_CUDA_COMPUTE_CAPABILITIES=3.7
 
 yes "" | $PYTHON_BIN_PATH configure.py
diff --git a/tensorflow/tools/dist_test/python/census_widendeep.py b/tensorflow/tools/dist_test/python/census_widendeep.py
index 3a557814960498cb397781232154958872234e49..8feb5386e9881596c20fba9e537a0439c8187ac4 100644
--- a/tensorflow/tools/dist_test/python/census_widendeep.py
+++ b/tensorflow/tools/dist_test/python/census_widendeep.py
@@ -263,8 +263,7 @@ if __name__ == "__main__":
       "--data_dir",
       type=str,
       default="/tmp/census-data",
-      help="Directory for storing the cesnsus data"
-  )
+      help="Directory for storing the census data")
   parser.add_argument(
       "--model_dir",
       type=str,
diff --git a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
index ea4906588da52e069f7f720d5432d326a977f22e..e703e78531bf7d34285b5faef874ddff94495950 100755
--- a/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
+++ b/tensorflow/tools/dist_test/scripts/dist_mnist_test.sh
@@ -43,7 +43,7 @@
 # NOTES:
 # If you have the error "$'\r': command not found"
 # Please run the command below to remove trailing '\r' character that causes the error:
-#   sed -i 's/\r$//' dist_mnist_test.sh 
+#   sed -i 's/\r$//' dist_mnist_test.sh
 
 
 # Configurations
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 20e1dcd08540d0cac379cf63eab2fcfdcefc510e..3525c7524f3bd844be5284d2a076eb78d1bb1a02 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -83,6 +83,11 @@ ENV CI_BUILD_PYTHON python
 
 RUN tensorflow/tools/ci_build/builds/configured CPU \
     bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+        # For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
+        # For ivy-bridge or sandy-bridge
+        # --copt=-march="ivybridge" \
+        # for haswell, broadwell, or skylake
+        # --copt=-march="haswell" \
         tensorflow/tools/pip_package:build_pip_package && \
     bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
     pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
@@ -96,4 +101,3 @@ EXPOSE 6006
 EXPOSE 8888
 
 WORKDIR /root
-CMD ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
new file mode 100644
index 0000000000000000000000000000000000000000..8180e5e7fb65e1eff693265ed388496b356563dd
--- /dev/null
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -0,0 +1,85 @@
+FROM tensorflow/tensorflow:latest-devel
+
+LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
+
+# These arguments are parameterized. Use --build-args to override.
+ARG TF_BRANCH=r1.4
+ARG WHL_DIR=/whl
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        golang \
+        vim \
+        emacs \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip --no-cache-dir install --upgrade \
+        pip setuptools
+
+RUN pip --no-cache-dir install wheel 
+
+# Download and build TensorFlow.
+WORKDIR /
+RUN rm -rf tensorflow && \
+    git clone https://github.com/tensorflow/tensorflow.git && \
+    cd tensorflow && \
+    git checkout ${TF_BRANCH}
+WORKDIR /tensorflow
+
+# Configure the build for CPU with MKL by accepting default build options and
+# setting library locations
+ENV CI_BUILD_PYTHON=python \
+   LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
+    PYTHON_BIN_PATH=/usr/bin/python \
+    PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
+    CC_OPT_FLAGS='-march=native' \
+    TF_NEED_JEMALLOC=0 \
+    TF_NEED_GCP=0 \
+    TF_NEED_CUDA=0 \
+    TF_NEED_HDFS=0 \
+    TF_NEED_S3=0 \
+    TF_NEED_OPENCL=0 \
+    TF_NEED_GDR=0 \
+    TF_ENABLE_XLA=0 \
+    TF_NEED_VERBS=0 \
+    TF_NEED_MPI=0
+RUN ./configure
+
+# Build and Install TensorFlow.
+# The 'mkl' option builds with Intel(R) Math Kernel Library (MKL), which detects
+# the platform it is currently running on and takes appropriately optimized 
+# paths. The -march=native option is for code that is not in MKL, and assumes
+# this container will be run on the same architecture on which it is built.
+RUN LD_LIBRARY_PATH=${LD_LIBRARY_PATH} \
+    bazel build --config=mkl \
+                --config="opt" \
+                --copt="-march=native" \
+                --copt="-O3" \
+                //tensorflow/tools/pip_package:build_pip_package && \
+    mkdir ${WHL_DIR} && \
+    bazel-bin/tensorflow/tools/pip_package/build_pip_package ${WHL_DIR}
+
+# Clean up Bazel cache when done, but leave the whl.
+# This will upgrade the default Tensorflow version with the Intel MKL version
+RUN pip --no-cache-dir install --upgrade ${WHL_DIR}/tensorflow-*.whl && \
+    rm -rf /root/.cache
+
+WORKDIR /root
+
+#add welcome message with instructions
+
+RUN echo '[ ! -z "$TERM" -a -r /etc/motd ] && cat /etc/issue && cat /etc/motd' \
+	>> /etc/bash.bashrc \
+	; echo "\
+||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n\
+|								\n\
+| Docker container running Ubuntu				\n\
+| with TensorFlow ${TF_BRANCH} optimized for CPU		\n\
+| with Intel(R) MKL						\n\
+|								\n\
+||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n\
+\n "\
+	> /etc/motd
+
+CMD ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 21a44ee40447a628579952b47cbc64a263f07cbf..9f4cc74a66f67dd9fa61e137df94d32567cf4e9a 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
@@ -81,7 +81,8 @@ ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1
-
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
 
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
@@ -102,5 +103,3 @@ WORKDIR /root
 EXPOSE 6006
 # IPython
 EXPOSE 8888
-
-RUN ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
deleted file mode 100644
index 64ebc4607a82ce59bd3e13c28541ca93778ecdb7..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ /dev/null
@@ -1,118 +0,0 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-
-LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
-
-# It is possible to override these for releases.
-ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
-ARG TF_AVAILABLE_CPUS=32
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        git \
-        golang \
-        libcurl3-dev \
-        libfreetype6-dev \
-        libpng12-dev \
-        libzmq3-dev \
-        pkg-config \
-        python-dev \
-        python-pip \
-        rsync \
-        software-properties-common \
-        unzip \
-        zip \
-        zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        wget \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN pip --no-cache-dir install --upgrade \
-        pip setuptools
-
-RUN pip --no-cache-dir install \
-        ipykernel \
-        jupyter \
-        matplotlib \
-        numpy \
-        scipy \
-        sklearn \
-        pandas \
-        wheel \
-        && \
-    python -m ipykernel.kernelspec
-
-# Set up our notebook config.
-COPY jupyter_notebook_config.py /root/.jupyter/
-
-# Jupyter has issues with being run directly:
-#   https://github.com/ipython/ipython/issues/7062
-# We just add a little wrapper script.
-COPY run_jupyter.sh /
-
-# Set up Bazel.
-
-# Running bazel inside a `docker build` command causes trouble, cf:
-#   https://github.com/bazelbuild/bazel/issues/134
-# The easiest solution is to set up a bazelrc file forcing --batch.
-RUN echo "startup --batch" >>/etc/bazel.bazelrc
-# Similarly, we need to workaround sandboxing issues:
-#   https://github.com/bazelbuild/bazel/issues/418
-RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
-    >>/etc/bazel.bazelrc
-WORKDIR /
-RUN mkdir /bazel && \
-    cd /bazel && \
-    wget --quiet https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    wget --quiet https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
-    chmod +x bazel-*.sh && \
-    ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
-    rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
-
-# Download and build TensorFlow.
-WORKDIR /
-RUN git clone https://github.com/tensorflow/tensorflow.git && \
-    cd tensorflow && \
-    git checkout ${TF_BRANCH}
-WORKDIR /tensorflow
-
-# Configure the build for our CUDA configuration.
-ENV CI_BUILD_PYTHON=python \
-    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:${LD_LIBRARY_PATH} \
-    CUDNN_INSTALL_PATH=/usr/lib/x86_64-linux-gnu \
-    PYTHON_BIN_PATH=/usr/bin/python \
-    PYTHON_LIB_PATH=/usr/local/lib/python2.7/dist-packages \
-    TF_NEED_CUDA=1 \
-    TF_CUDA_VERSION=9.0 \
-    TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1,7.0 \
-    TF_CUDNN_VERSION=7
-RUN ./configure
-
-# Build and Install TensorFlow.
-RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
-    LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
-    bazel build -c opt \
-                --config=cuda \
-                --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
-                --jobs=${TF_AVAILABLE_CPUS} \
-                tensorflow/tools/pip_package:build_pip_package && \
-    mkdir /pip_pkg && \
-    bazel-bin/tensorflow/tools/pip_package/build_pip_package /pip_pkg
-
-# Clean up pip wheel and Bazel cache when done.
-RUN pip --no-cache-dir install --upgrade /pip_pkg/tensorflow-*.whl && \
-    rm -rf /pip_pkg && \
-    rm -rf /root/.cache
-
-WORKDIR /root
-
-# TensorBoard
-EXPOSE 6006
-# IPython
-EXPOSE 8888
-
-RUN ["/bin/bash"]
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 0571dd73911d1cb1b1f722f603c9b2791981ad4c..b6682cd68163ec870ed815b45ac4fdd9233f88c6 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
diff --git a/tensorflow/tools/docker/README.md b/tensorflow/tools/docker/README.md
index 2e5a0038ed2964319c0795b39f1ab7f3b5374f87..f46c56e11aa72cd0df20f0d8478de2f42dbb3b72 100644
--- a/tensorflow/tools/docker/README.md
+++ b/tensorflow/tools/docker/README.md
@@ -41,7 +41,7 @@ Note: If you would have a problem running nvidia-docker you may try the old meth
 we have used. But it is not recommended. If you find a bug in nvidia-docker, please report
 it there and try using nvidia-docker as described above.
 
-    $ # The old, not recommended way to run docker with gpu support: 
+    $ # The old, not recommended way to run docker with gpu support:
     $ export CUDA_SO=$(\ls /usr/lib/x86_64-linux-gnu/libcuda.* | xargs -I{} echo '-v {}:{}')
     $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
     $ docker run -it -p 8888:8888 $CUDA_SO $DEVICES gcr.io/tensorflow/tensorflow:latest-gpu
@@ -60,6 +60,20 @@ Building TensorFlow Docker containers should be done through the
 script. The raw Dockerfiles should not be used directly as they contain strings
 to be replaced by the script during the build.
 
+Attempting to run [parameterized_docker_build.sh](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/parameterized_docker_build.sh)
+from a binary docker image such as for example `tensorflow/tensorflow:latest` will
+not work. One needs to execute the script from a developer docker image since by
+contrast with a binary docker image it contains not only the compiled solution but
+also the tensorflow source code. Please select the appropriate developer docker
+image of tensorflow at `tensorflow/tensorflow:[.](https://hub.docker.com/r/tensorflow/tensorflow/tags/)`.
+
+The smallest command line to generate a docker image will then be:
+```docker run -it tensorflow/tensorflow:"right_tag"```
+
+If you would like to start a jupyter notebook on your docker container, make sure
+to map the port 8888 of your docker container by adding -p 8888:8888 to the above
+command.
+
 To use the script, specify the container type (`CPU` vs. `GPU`), the desired
 Python version (`PYTHON2` vs. `PYTHON3`) and whether the developer Docker image
 is to be built (`NO` vs. `YES`). In addition, you need to specify the central
diff --git a/tensorflow/tools/docker/notebooks/2_getting_started.ipynb b/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
index e171b439feea95dc649a1013f78386bc008515ff..b0963ebc3f7efb3d10957c6ed0a7175e5b7a1cb3 100644
--- a/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
+++ b/tensorflow/tools/docker/notebooks/2_getting_started.ipynb
@@ -159,7 +159,7 @@
         "X = np.array([np.linspace(-2, 4, num_examples), np.linspace(-6, 6, num_examples)])\n",
         "X += np.random.randn(2, num_examples)\n",
         "x, y = X\n",
-        "x_with_bias = np.array([(1., a) for a in x]).astype(np.float32)\n",
+        "bias_with_x = np.array([(1., a) for a in x]).astype(np.float32)\n",
         "\n",
         "losses = []\n",
         "training_steps = 50\n",
@@ -167,7 +167,7 @@
         "\n",
         "with tf.Session() as sess:\n",
         "    # Set up all the tensors, variables, and operations.\n",
-        "    input = tf.constant(x_with_bias)\n",
+        "    input = tf.constant(bias_with_x)\n",
         "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
         "    weights = tf.Variable(tf.random_normal([2, 1], 0, 0.1))\n",
         "\n",
@@ -583,7 +583,7 @@
         "# Split into x and y\n",
         "x, y = X\n",
         "# Add the bias node which always has a value of 1\n",
-        "x_with_bias = np.array([(1., a) for a in x]).astype(np.float32)\n",
+        "bias_with_x = np.array([(1., a) for a in x]).astype(np.float32)\n",
         "\n",
         "# Keep track of the loss at each iteration so we can chart it later\n",
         "losses = []\n",
@@ -598,7 +598,7 @@
         "with tf.Session() as sess:\n",
         "    # Set up all the tensors.\n",
         "    # Our input layer is the x value and the bias node.\n",
-        "    input = tf.constant(x_with_bias)\n",
+        "    input = tf.constant(bias_with_x)\n",
         "    # Our target is the y values. They need to be massaged to the right shape.\n",
         "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
         "    # Weights are a variable. They change every time through the loop.\n",
@@ -621,7 +621,7 @@
         "    loss = tf.nn.l2_loss(yerror)\n",
         "\n",
         "    # Perform gradient descent. \n",
-        "    # This essentially just updates weights, like weights += grads * learning_rate\n",
+        "    # This essentially just updates weights, like weights -= grads * learning_rate\n",
         "    # using the partial derivative of the loss with respect to the\n",
         "    # weights. It's the direction we want to go to move toward lower error.\n",
         "    update_weights = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)\n",
@@ -743,7 +743,7 @@
         "with tf.Session() as sess:\n",
         "    # Set up all the tensors.\n",
         "    # The input is the x values with the bias appended on to each x.\n",
-        "    input = tf.constant(x_with_bias)\n",
+        "    input = tf.constant(bias_with_x)\n",
         "    # We're trying to find the best fit for the target y values.\n",
         "    target = tf.constant(np.transpose([y]).astype(np.float32))\n",
         "    # Let's set up the weights randomly\n",
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 614a19c178d021133041cef4dbfddd7cd4b6c020..5585ebdcd366ec9db0c47004647970cb27c8bb75 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -135,6 +135,8 @@
     "from six.moves.urllib.request import urlretrieve\n",
     "\n",
     "SOURCE_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'\n",
+    "#SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'\n",
+    "# for those who have no access to google storage, use lecun's repo please\n",
     "WORK_DIRECTORY = \"/tmp/mnist-data\"\n",
     "\n",
     "def maybe_download(filename):\n",
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index 830e3dcd32ec37a9e9a22df7158702c921fc48e6..80a07b9b3ba7fb278b01862880893aa0a2693a28 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -58,6 +58,23 @@
 #     tagged image name with an argument, to push the image to a central repo
 #     such as gcr.io or Docker Hub.
 #
+#   TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS
+#     (Optional)
+#     Do not set this along with TF_DOCKER_BUILD_PUSH_CMD. We will push with the
+#     direct commands as opposed to a script.
+#
+#   TF_DOCKER_USERNAME
+#     (Optional)
+#     Dockerhub username for pushing a package.
+#
+#   TF_DOCKER_EMAIL
+#     (Optional)
+#     Dockerhub email for pushing a package.
+#
+#   TF_DOCKER_PASSWORD
+#     (Optional)
+#     Dockerhub password for pushing a package.
+#
 #   TF_DOCKER_BUILD_PYTHON_VERSION
 #     (Optional)
 #     Specifies the desired Python version. Defaults to PYTHON2.
@@ -378,7 +395,6 @@ fi
 echo ""
 echo "Successfully tagged docker image: ${FINAL_IMG}"
 
-
 # Optional: call command specified by TF_DOCKER_BUILD_PUSH_CMD to push image
 if [[ ! -z "${TF_DOCKER_BUILD_PUSH_CMD}" ]]; then
   ${TF_DOCKER_BUILD_PUSH_CMD} ${FINAL_IMG}
@@ -388,3 +404,23 @@ if [[ ! -z "${TF_DOCKER_BUILD_PUSH_CMD}" ]]; then
     die "FAIL: Failed to push Docker image ${FINAL_IMG}"
   fi
 fi
+
+# Optional: set TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS to push image
+if [[ ! -z "${TF_DOCKER_BUILD_PUSH_WITH_CREDENTIALS}" ]]; then
+
+  docker login --username "${TF_DOCKER_USERNAME}" \
+  --email "${TF_DOCKER_EMAIL}" \
+  --password "${TF_DOCKER_PASSWORD}"
+
+  if [[ $? != "0" ]]; then
+    die "FAIL: Unable to login. Invalid credentials."
+  fi
+  docker push $1
+  if [[ $? == "0" ]]; then
+    docker logout
+    echo "Successfully pushed Docker image ${FINAL_IMG}"
+  else
+    docker logout
+    die "FAIL: Failed to push Docker image ${FINAL_IMG}"
+  fi
+fi
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index 9b8b50f9cd5a108f0e553291a1c19ad6c69af7c9..003f972070cb05aa6f34a3748d47f019744de058 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
+import fnmatch
 import os
 import sys
 
@@ -152,19 +153,36 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
       # Generate header
       f.write('# Automatically generated file; please do not edit\ntoc:\n')
       for module in modules:
-        f.write('  - title: ' + module + '\n'
-                '    section:\n' + '    - title: Overview\n' +
-                '      path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[module]
-                + '\n')
+        indent_num = module.count('.')
+        # Don't list `tf.submodule` inside `tf`
+        indent_num = max(indent_num, 1)
+        indent = '  '*indent_num
+
+        if indent_num > 1:
+          # tf.contrib.baysflow.entropy will be under
+          #   tf.contrib->baysflow->entropy
+          title = module.split('.')[-1]
+        else:
+          title = module
+
+        header = [
+            '- title: ' + title,
+            '  section:',
+            '  - title: Overview',
+            '    path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[module]]
+        header = ''.join([indent+line+'\n' for line in header])
+        f.write(header)
 
         symbols_in_module = module_children.get(module, [])
         # Sort case-insensitive, if equal sort case sensitive (upper first)
         symbols_in_module.sort(key=lambda a: (a.upper(), a))
 
         for full_name in symbols_in_module:
-          f.write('    - title: ' + full_name[len(module) + 1:] + '\n'
-                  '      path: /TARGET_DOC_ROOT/VERSION/' +
-                  symbol_to_file[full_name] + '\n')
+          item = [
+              '  - title: ' + full_name[len(module) + 1:],
+              '    path: /TARGET_DOC_ROOT/VERSION/' + symbol_to_file[full_name]]
+          item = ''.join([indent+line+'\n' for line in item])
+          f.write(item)
 
   # Write a global index containing all full names with links.
   with open(os.path.join(output_dir, 'index.md'), 'w') as f:
@@ -181,12 +199,12 @@ def add_dict_to_dict(add_from, add_to):
       add_to[key] = add_from[key]
 
 
-# Exclude some libaries in contrib from the documentation altogether.
+# Exclude some libraries in contrib from the documentation altogether.
 def _get_default_private_map():
   return {'tf.test': ['mock']}
 
 
-# Exclude members of some libaries.
+# Exclude members of some libraries.
 def _get_default_do_not_descend_map():
   # TODO(wicke): Shrink this list once the modules get sealed.
   return {
@@ -367,10 +385,26 @@ class _UpdateTags(py_guide_parser.PyGuideParser):
 EXCLUDED = set(['__init__.py', 'OWNERS', 'README.txt'])
 
 
-def _other_docs(src_dir, output_dir, reference_resolver):
-  """Convert all the files in `src_dir` and write results to `output_dir`."""
-  header = '<!-- DO NOT EDIT! Automatically generated file. -->\n'
+def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
+  """Fix @{} references in all files under `src_dir` matching `file_pattern`.
 
+  A matching directory structure, with the modified files is
+  written to `output_dir`.
+
+  `{"__init__.py","OWNERS","README.txt"}` are skipped.
+
+  Files not matching `file_pattern` (using `fnmatch`) are copied with no change.
+
+  Also, files in the `api_guides/python` directory get explicit ids set on all
+  heading-2s to ensure back-links work.
+
+  Args:
+    src_dir: The directory to convert files from.
+    output_dir: The root directory to write the resulting files to.
+    reference_resolver: A `parser.ReferenceResolver` to make the replacements.
+    file_pattern: Only replace references in files matching file_patters,
+      using fnmatch. Non-matching files are copied unchanged.
+  """
   # Iterate through all the source files and process them.
   tag_updater = _UpdateTags()
   for dirpath, _, filenames in os.walk(src_dir):
@@ -398,21 +432,21 @@ def _other_docs(src_dir, output_dir, reference_resolver):
 
       suffix = os.path.relpath(path=full_in_path, start=src_dir)
       full_out_path = os.path.join(output_dir, suffix)
-      if not base_name.endswith('.md'):
-        print('Copying non-md file %s...' % suffix)
+      if not fnmatch.fnmatch(base_name, file_pattern):
+        print('Copying un-matched file %s...' % suffix)
         open(full_out_path, 'w').write(open(full_in_path).read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
-        md_string = tag_updater.process(full_in_path)
+        content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        md_string = open(full_in_path).read()
+        content = open(full_in_path).read()
 
-      output = reference_resolver.replace_references(md_string,
-                                                     relative_path_to_root)
+      content = reference_resolver.replace_references(content,
+                                                      relative_path_to_root)
       with open(full_out_path, 'w') as f:
-        f.write(header + output)
+        f.write(content)
 
   print('Done.')
 
diff --git a/tensorflow/tools/git/gen/branch_ref b/tensorflow/tools/git/gen/branch_ref
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/tensorflow/tools/git/gen/branch_ref
@@ -0,0 +1 @@
+
diff --git a/tensorflow/tools/git/gen/head b/tensorflow/tools/git/gen/head
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/tensorflow/tools/git/gen/head
@@ -0,0 +1 @@
+
diff --git a/tensorflow/tools/git/gen/spec.json b/tensorflow/tools/git/gen/spec.json
new file mode 100644
index 0000000000000000000000000000000000000000..176bbc21ccb9112d5c29f0351ec937c302a1383e
--- /dev/null
+++ b/tensorflow/tools/git/gen/spec.json
@@ -0,0 +1,3 @@
+{
+  "git": false
+}
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 616ec9fbe0251f9b3d3e7d6f788c193f7856006d..0307d2a0ebee820fee0867c35c5761f2f8607aea 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -177,7 +177,7 @@ const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
 #else
-  return -1;
+  return 0;
 #endif
 }
 """ % git_version
diff --git a/tensorflow/tools/git/gen_git_source.sh b/tensorflow/tools/git/gen_git_source.sh
index eb5e1abe15eb8be0f0580a8b7412f2db6fbea616..788f9e6e5730f9e4699011298d689bc26226fb65 100755
--- a/tensorflow/tools/git/gen_git_source.sh
+++ b/tensorflow/tools/git/gen_git_source.sh
@@ -33,7 +33,7 @@ const int tf_cxx11_abi_flag() {
 #ifdef _GLIBCXX_USE_CXX11_ABI
   return _GLIBCXX_USE_CXX11_ABI;
 #else
-  return -1;
+  return 0;
 #endif
 }
 EOF
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 7975491a289e39970e42753dafe337f22ad0998b..58489b28c8b6738e22e72002ab97c1c0b994b790 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -27,7 +27,6 @@ cc_library(
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
@@ -66,10 +65,8 @@ cc_library(
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:tensorflow",
     ],
 )
 
@@ -131,9 +128,12 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
+        "//tensorflow/contrib/rnn:gru_ops_op_lib",
+        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
     ] + if_not_windows([
         "//tensorflow/core/kernels:quantized_ops",
         "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index c7f7eca25749bc20b2bca95956e919b861c4a71d..345d9eadb858cadebe03ecb3297aea52ba54bd37 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -95,9 +95,9 @@ transforms to modify the graph with. The transforms are given as a list of
 names, and can each have arguments themselves. These transforms define the
 pipeline of modifications that are applied in order to produce the output.
 Sometimes you need some transforms to happen before others, and the ordering
-within the list lets you specify which happen first. 
-Note that the optimization 
-`remove_nodes(op=Identity, op=CheckNumerics)` will break the model with control 
+within the list lets you specify which happen first.
+Note that the optimization
+`remove_nodes(op=Identity, op=CheckNumerics)` will break the model with control
 flow operations, such as `tf.cond`, `tf.map_fn`, and `tf.while`.
 
 ## Inspecting Graphs
diff --git a/tensorflow/tools/graph_transforms/file_utils.cc b/tensorflow/tools/graph_transforms/file_utils.cc
index 5649c971982bd7a3db2f856f4219c8f6cc1aa811..593faf7b7cf22e51bab39a514c54f1f05b337aa5 100644
--- a/tensorflow/tools/graph_transforms/file_utils.cc
+++ b/tensorflow/tools/graph_transforms/file_utils.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/tools/graph_transforms/file_utils.h"
 
-#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/platform/env.h"
 
 namespace tensorflow {
 namespace graph_transforms {
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index f2934a79bdf65473092cbf80fafbda888d7b9c7c..250f54e20fba6e24fe95741b1437ac3718ace6fb 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -39,9 +39,9 @@ limitations under the License.
 namespace tensorflow {
 namespace graph_transforms {
 namespace {
-using StringPieceSet = std::unordered_set<StringPiece, StringPiece::Hasher>;
+using StringPieceSet = std::unordered_set<StringPiece, StringPieceHasher>;
 template <typename T>
-using StringPieceMap = std::unordered_map<StringPiece, T, StringPiece::Hasher>;
+using StringPieceMap = std::unordered_map<StringPiece, T, StringPieceHasher>;
 }  // namespace
 
 Status ReplaceSendRecvs(const GraphDef& original_graph_def,
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 2b85e7e83c6f3e2c8d0840f0b9eb0b4992a8b113..97e8f77616b85955229619107b443315bca17925 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -759,6 +759,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
           NodeDef reshape_dims;
           reshape_dims.set_op("Const");
           reshape_dims.set_name(unique_input_name + "/reshape_dims");
+          AddNodeInput("^" + input_name, &reshape_dims);
           SetNodeAttr("dtype", DT_INT32, &reshape_dims);
           Tensor reshape_dims_tensor(DT_INT32, {1});
           reshape_dims_tensor.flat<int32>()(0) = -1;
@@ -768,6 +769,7 @@ Status QuantizeNodes(const GraphDef& input_graph_def,
           NodeDef reduction_dims;
           reduction_dims.set_op("Const");
           reduction_dims.set_name(unique_input_name + "/reduction_dims");
+          AddNodeInput("^" + input_name, &reduction_dims);
           SetNodeAttr("dtype", DT_INT32, &reduction_dims);
           Tensor reduction_dims_tensor(DT_INT32, {1});
           reduction_dims_tensor.flat<int32>()(0) = 0;
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index c6e577223f94c9eeaff6aea9e815d7241852e391..321f514f6d6b5517aae9d460083b1e6d3fba1372 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -6,6 +6,7 @@ package(default_visibility = ["//visibility:private"])
 load(
     "//tensorflow:tensorflow.bzl",
     "if_not_windows",
+    "if_windows",
     "transitive_hdrs",
 )
 load("//third_party/mkl:build_defs.bzl", "if_mkl")
@@ -153,17 +154,23 @@ sh_binary(
             "//tensorflow:tensorflow_py",
             "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
             "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+            "//tensorflow/contrib/data/python/ops:prefetching_py",
             "//tensorflow/contrib/eager/python/examples:examples_pip",
+            "//tensorflow/contrib/eager/python:evaluator",
             "//tensorflow/contrib/gan:gan",
             "//tensorflow/contrib/graph_editor:graph_editor_pip",
             "//tensorflow/contrib/keras:keras",
             "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+            "//tensorflow/contrib/lite/toco:toco",
+            "//tensorflow/contrib/lite/toco/python:toco_wrapper",
+            "//tensorflow/contrib/lite/toco/python:toco_from_protos",
             "//tensorflow/contrib/ndlstm:ndlstm",
             "//tensorflow/contrib/nn:nn_py",
             "//tensorflow/contrib/predictor:predictor_pip",
             "//tensorflow/contrib/receptive_field:receptive_field_pip",
             "//tensorflow/contrib/session_bundle:session_bundle_pip",
             "//tensorflow/contrib/signal:signal_py",
+            "//tensorflow/contrib/signal:test_util",
             "//tensorflow/contrib/slim:slim",
             "//tensorflow/contrib/slim/python/slim/data:data_pip",
             "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
@@ -187,3 +194,23 @@ sh_binary(
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]),
 )
+
+# A genrule for generating a marker file for the pip package on Windows
+#
+# This only works on Windows, because :simple_console_for_windows is a
+# python zip file containing everything we need for building the pip package.
+# However, on other platforms, due to https://github.com/bazelbuild/bazel/issues/4223,
+# when C++ extensions change, this generule doesn't rebuild.
+genrule(
+    name = "win_pip_package_marker",
+    srcs = if_windows([
+        ":build_pip_package",
+        ":simple_console_for_windows",
+    ]),
+    outs = ["win_pip_package_marker_file"],
+    cmd = select({
+        "//conditions:default": "touch $@",
+        "//tensorflow:windows": "md5sum $(locations :build_pip_package) $(locations :simple_console_for_windows) > $@",
+    }),
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index ef6cf56421170a5143167948f9aeef5929b52bc2..86c5e4776df3320dc33c870a59f71b1e2c7d6292 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -4,6 +4,7 @@ recursive-include * *.so
 recursive-include * *.dll
 recursive-include * *.lib
 recursive-include * *.csv
+recursive-include tensorflow/aux-bin *
 recursive-include tensorflow/include/tensorflow *.h
 recursive-include tensorflow/include/Eigen *
 recursive-include tensorflow/include/external *
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index cbf06a97d02a98ae6743cdf74ec6f53a9c3c2a59..f5203bc5448ff2d9a9e9352f8968c4a8a31c336a 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -24,7 +24,7 @@ function real_path() {
 function cp_external() {
   local src_dir=$1
   local dest_dir=$2
-  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*'`; do
+  for f in `find "$src_dir" -maxdepth 1 -mindepth 1 ! -name '*local_config_cuda*' ! -name '*org_tensorflow*'`; do
     cp -R "$f" "$dest_dir"
   done
 }
@@ -92,7 +92,6 @@ function main() {
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow/tensorflow \
       "${TMPDIR}"
     mkdir "${TMPDIR}/external"
-    # Note: this makes an extra copy of org_tensorflow.
     cp_external \
       bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
       "${TMPDIR}/external"
@@ -123,7 +122,6 @@ function main() {
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/tensorflow \
         "${TMPDIR}"
       mkdir "${TMPDIR}/external"
-      # Note: this makes an extra copy of org_tensorflow.
       cp_external \
         bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
         "${TMPDIR}/external"
@@ -137,6 +135,9 @@ function main() {
         fi
       fi
     fi
+    # Install toco as a binary in aux-bin.
+    mkdir "${TMPDIR}/tensorflow/aux-bin"
+    cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index cc46dd5162b396e2dc9eac6dafbc2365cafe17d8..22e1584b780bcefbc278105b794b932aacdc9992 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -42,6 +42,7 @@ BLACKLIST = [
     "//tensorflow/python:extra_py_tests_deps",
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
+    "//tensorflow/tools/pip_package:win_pip_package_marker",
     "//tensorflow/python:test_ops_2",
     "//tensorflow/python:tf_optimizer",
     "//tensorflow/python:compare_test_proto_py",
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 071b3a2a1888c39e0a22b92edbeecfaa06c8ea83..5ddc688a4cf2cb7991cb03673612634a7d71de0d 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,14 +29,20 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.4.0-rc1'
+_VERSION = '1.4.0'
 
 REQUIRED_PACKAGES = [
-    'enum34 >= 1.1.6',
+    'absl-py >= 0.1.6',
+    # weakref.finalize introduced in Python 3.4
+    'backports.weakref >= 1.0rc1; python_version < "3.4"',
+    # enum module introduced in Python 3.4
+    'enum34 >= 1.1.6; python_version < "3.4"',
+    # Needed for unittest.mock in Python 2
+    'mock >= 2.0.0; python_version < "3.0"',
     'numpy >= 1.12.1',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
-    'tensorflow-tensorboard >= 0.4.0rc1, < 0.5.0',
+    'tensorflow-tensorboard',
 ]
 
 project_name = 'tensorflow'
@@ -54,32 +60,35 @@ else:
   # mock comes with unittest.mock for python3, need to install for python2
   REQUIRED_PACKAGES.append('mock >= 2.0.0')
 
-# remove tensorboard from tf-nightly packages
+# tf-nightly should depend on tb-nightly
 if 'tf_nightly' in project_name:
-  for package in REQUIRED_PACKAGES:
-    if 'tensorflow-tensorboard' in package:
-      REQUIRED_PACKAGES.remove(package)
+  for i, pkg in enumerate(REQUIRED_PACKAGES):
+    if 'tensorboard' in pkg:
+      REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.5.0a0, < 1.6.0a0'
       break
 
-# weakref.finalize was introduced in Python 3.4
+# weakref.finalize and enum were introduced in Python 3.4
 if sys.version_info < (3, 4):
   REQUIRED_PACKAGES.append('backports.weakref >= 1.0rc1')
+  REQUIRED_PACKAGES.append('enum34 >= 1.1.6')
 
 # pylint: disable=line-too-long
 CONSOLE_SCRIPTS = [
     'freeze_graph = tensorflow.python.tools.freeze_graph:main',
+    'toco_from_protos = tensorflow.contrib.lite.toco.python.toco_from_protos:main',
+    'toco = tensorflow.contrib.lite.toco.python.toco_wrapper:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
     # We need to keep the TensorBoard command, even though the console script
     # is now declared by the tensorboard pip package. If we remove the
     # TensorBoard command, pip will inappropriately remove it during install,
     # even though the command is not removed, just moved to a different wheel.
-    'tensorboard = tensorboard.main:main',
+    'tensorboard = tensorboard.main:run_main',
 ]
 # pylint: enable=line-too-long
 
 # remove the tensorboard console script if building tf_nightly
 if 'tf_nightly' in project_name:
-  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:main')
+  CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main')
 
 TEST_PACKAGES = [
     'scipy >= 0.15.1',
@@ -187,7 +196,6 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
            list(find_files('*', 'external/eigen_archive')) +
            list(find_files('*.h', 'external/nsync/public')))
 
-
 setup(
     name=project_name,
     version=_VERSION.replace('-', ''),
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions.cc b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
index ecb29a65a08b098cd167e5cbb2bdb5821e01a543..f0bb59acf801ba586fa8258b5b1ad9f202f014bf 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions.cc
@@ -132,6 +132,7 @@ int MainImpl(int argc, char** argv) {
       FILE* f = fopen(path.c_str(), "w");
       if (f == nullptr) return -1;
       if (fwrite(data.c_str(), 1, data.size(), f) != data.size()) {
+        fclose(f);
         return -1;
       }
       if (fclose(f) != 0) {
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index b5c4bbf5a700aedfea7abf7f1c07a62df0155cfc..cee53dd5b61e50126948e3652865a32f45eab092 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -21,8 +21,9 @@ def tf_cc_logged_benchmark(
     fail(" ".join(("Target must be a single well-defined test, e.g.,",
                    "//path/to:test. Received: %s" % target)))
 
-  all_tags = list(depset(tags) + \
-                  depset(["benchmark-test", "local", "manual", "regression-test"]))
+  all_tags = (
+    depset(tags) + depset(
+      ["benchmark-test", "local", "manual", "regression-test"])).to_list()
 
   tf_py_test(
       name = name,
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index e25e12d5c5f01148221a6cb5f41aad14830dbb65..046c2b2391e732864043678d955ef1211a5c5d51 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -1,45 +1,27 @@
 # TensorFlow external dependencies that can be loaded in WORKSPACE files.
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
-load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
-load("@io_bazel_rules_closure//closure/private:java_import_external.bzl",
-     "java_import_external")
-load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 load("//third_party/py:python_configure.bzl", "python_configure")
-load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl",
-     "arm_compiler_configure")
-
-
-def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  return repository_ctx.os.name.lower().find("windows") != -1
-
-
-def _get_env_var(repository_ctx, name):
-  """Find an environment variable."""
-  if name in repository_ctx.os.environ:
-    return repository_ctx.os.environ[name]
-  else:
-    return None
-
+load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
+load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
+load("//third_party:repo.bzl", "tf_http_archive")
+load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
+load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 
 # Parse the bazel version string from `native.bazel_version`.
 def _parse_bazel_version(bazel_version):
   # Remove commit from version.
   version = bazel_version.split(" ", 1)[0]
-
   # Split into (release, date) parts and only return the release
   # as a tuple of integers.
   parts = version.split("-", 1)
-
   # Turn "release" into a tuple of strings
   version_tuple = ()
   for number in parts[0].split("."):
     version_tuple += (str(number),)
   return version_tuple
 
-
 # Check that a specific bazel version is being used.
 def check_version(bazel_version):
   if "bazel_version" not in dir(native):
@@ -56,86 +38,6 @@ def check_version(bazel_version):
       fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
           native.bazel_version, bazel_version))
 
-
-def _repos_are_siblings():
-  return Label("@foo//bar").workspace_root.startswith("../")
-
-
-# Temporary workaround to support including TensorFlow as a submodule until this
-# use-case is supported in the next Bazel release.
-def _temp_workaround_http_archive_impl(repo_ctx):
-  repo_ctx.template("BUILD", repo_ctx.attr.build_file, {
-      "%prefix%": ".." if _repos_are_siblings() else "external",
-      "%ws%": repo_ctx.attr.repository
-  }, False)
-  repo_ctx.download_and_extract(repo_ctx.attr.urls, "", repo_ctx.attr.sha256,
-                                "", repo_ctx.attr.strip_prefix)
-  if repo_ctx.attr.patch_file != None:
-    _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
-
-
-temp_workaround_http_archive = repository_rule(
-    implementation = _temp_workaround_http_archive_impl,
-    attrs = {
-        "build_file": attr.label(),
-        "repository": attr.string(),
-        "patch_file": attr.label(default = None),
-        "urls": attr.string_list(default = []),
-        "sha256": attr.string(default = ""),
-        "strip_prefix": attr.string(default = ""),
-    },
-)
-
-# Executes specified command with arguments and calls 'fail' if it exited with
-# non-zero code
-def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
-  result = repo_ctx.execute(cmd_and_args, timeout=10)
-  if result.return_code != 0:
-    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
-          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
-                                  result.stdout, result.stderr))
-
-
-# Apply a patch_file to the repository root directory
-# Runs 'patch -p1'
-def _apply_patch(repo_ctx, patch_file):
-  # Don't check patch on Windows, because patch is only available under bash.
-  if not _is_windows(repo_ctx) and not repo_ctx.which("patch"):
-    fail("patch command is not found, please install it")
-
-  cmd = [
-      "patch", "-p1", "-d", repo_ctx.path("."), "-i", repo_ctx.path(patch_file)
-  ]
-  if _is_windows(repo_ctx):
-    bazel_sh = _get_env_var(repo_ctx, "BAZEL_SH")
-    if not bazel_sh:
-      fail("BAZEL_SH environment variable is not set")
-    cmd = [bazel_sh, "-c", " ".join(cmd)]
-  _execute_and_check_ret_code(repo_ctx, cmd)
-
-
-# Download the repository and apply a patch to its root
-def _patched_http_archive_impl(repo_ctx):
-  repo_ctx.download_and_extract(
-      repo_ctx.attr.urls,
-      sha256=repo_ctx.attr.sha256,
-      stripPrefix=repo_ctx.attr.strip_prefix)
-  _apply_patch(repo_ctx, repo_ctx.attr.patch_file)
-
-
-patched_http_archive = repository_rule(
-    implementation = _patched_http_archive_impl,
-    attrs = {
-        "patch_file": attr.label(),
-        "build_file": attr.label(),
-        "repository": attr.string(),
-        "urls": attr.string_list(default = []),
-        "sha256": attr.string(default = ""),
-        "strip_prefix": attr.string(default = ""),
-    },
-)
-
-
 # If TensorFlow is linked as a submodule.
 # path_prefix is no longer used.
 # tf_repo_name is thought to be under consideration.
@@ -158,71 +60,79 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       name = "mkl",
       urls = [
           "https://mirror.bazel.build/github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
-          # "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
+          "https://github.com/01org/mkl-dnn/releases/download/v0.9/mklml_lnx_2018.0.20170720.tgz",
       ],
       sha256 = "57ba56c4c243f403ff78f417ff854ef50b9eddf4a610a917b7c95e7fa8553a4b",
       strip_prefix = "mklml_lnx_2018.0.20170720",
       build_file = str(Label("//third_party/mkl:mkl.BUILD")),
-      repository = tf_repo_name,
   )
 
   if path_prefix:
     print("path_prefix was specified to tf_workspace but is no longer used " +
           "and will be removed in the future.")
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "mkl_dnn",
       urls = [
-          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
-          "https://github.com/01org/mkl-dnn/archive/b01e3a55a07be62172e713bcd2644c5176360212.tar.gz",
+          "https://mirror.bazel.build/github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz",
+          "https://github.com/01org/mkl-dnn/archive/aab753280e83137ba955f8f19d72cb6aaba545ef.tar.gz",
       ],
-      sha256 = "0d529ad4c49dc799e6df07c2b88b115d0668735da15fb3b3862d28d33fa68165",
-      strip_prefix = "mkl-dnn-b01e3a55a07be62172e713bcd2644c5176360212",
+      sha256 = "fb67f255a96bd4ad39b8dd104eca5aa92200c95c1ed36e59641e6c0478eefd11",
+      strip_prefix = "mkl-dnn-aab753280e83137ba955f8f19d72cb6aaba545ef",
       build_file = str(Label("//third_party/mkl_dnn:mkldnn.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
+      name = "com_google_absl",
+      urls = [
+          "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/cc4bed2d74f7c8717e31f9579214ab52a9c9c610.tar.gz",
+          "https://github.com/abseil/abseil-cpp/archive/cc4bed2d74f7c8717e31f9579214ab52a9c9c610.tar.gz",
+      ],
+     sha256 = "f1a7349f88d2846210c42e2f7271dabeee404c2a3b4198e34a797993e3569b03",
+     strip_prefix = "abseil-cpp-cc4bed2d74f7c8717e31f9579214ab52a9c9c610",
+  )
+
+  tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/429aa5254200.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/b6e6d0cf6a77.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/b6e6d0cf6a77.tar.gz",
       ],
-      sha256 = "61d8b6fc4279dd1dda986fb1677d15e3d641c07a3ea5abe255790b1f0c0c14e9",
-      strip_prefix = "eigen-eigen-429aa5254200",
+      sha256 = "0840c497f2749b5e90bda666aab96be6da90dc75b4e21ca9843cae69b7fed52a",
+      strip_prefix = "eigen-eigen-b6e6d0cf6a77",
       build_file = str(Label("//third_party:eigen.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "arm_compiler",
-      build_file = str(Label("//:arm_compiler.BUILD")),
       sha256 = "970285762565c7890c6c087d262b0a18286e7d0384f13a37786d8521773bc969",
       strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf",
       urls = [
           "https://mirror.bazel.build/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
+          # Please uncomment me, when the next upgrade happens. Then
+          # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz",
       ],
+      build_file = str(Label("//:arm_compiler.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "libxsmm_archive",
       urls = [
           "https://mirror.bazel.build/github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
-          # "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
+          "https://github.com/hfp/libxsmm/archive/1.8.1.tar.gz",
       ],
       sha256 = "2ade869c3f42f23b5263c7d594aa3c7e5e61ac6a3afcaf5d6e42899d2a7986ce",
       strip_prefix = "libxsmm-1.8.1",
       build_file = str(Label("//third_party:libxsmm.BUILD")),
   )
 
-  native.bind(
-      name = "xsmm_avx",
-      actual = "@libxsmm_archive//third_party:xsmm_avx",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "ortools_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+          # Please uncomment me, when the next upgrade happens. Then
+          # remove the whitelist entry in third_party/repo.bzl.
           # "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
       ],
       sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
@@ -230,54 +140,50 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:ortools.BUILD")),
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_googlesource_code_re2",
       urls = [
-          "https://mirror.bazel.build/github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
-          # "https://github.com/google/re2/archive/b94b7cd42e9f02673cd748c1ac1d16db4052514c.tar.gz",
+          "https://mirror.bazel.build/github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+          "https://github.com/google/re2/archive/26cd968b735e227361c9703683266f01e5df7857.tar.gz",
+
       ],
-      sha256 = "bd63550101e056427c9e7ff12a408c1c8b74e9803f393ca916b2926fc2c4906f",
-      strip_prefix = "re2-b94b7cd42e9f02673cd748c1ac1d16db4052514c",
+      sha256 = "e57eeb837ac40b5be37b2c6197438766e73343ffb32368efea793dfd8b28653b",
+      strip_prefix = "re2-26cd968b735e227361c9703683266f01e5df7857",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "gemmlowp",
       urls = [
-          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip"
-          # "https://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
+          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
+          "https://github.com/google/gemmlowp/archive/010bb3e71a26ca1d0884a167081d092b43563996.zip",
       ],
       sha256 = "dd2557072bde12141419cb8320a9c25e6ec41a8ae53c2ac78c076a347bb46d9d",
       strip_prefix = "gemmlowp-010bb3e71a26ca1d0884a167081d092b43563996",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "farmhash_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
-          # "https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
+          "https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz",
       ],
       sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0",
       strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45",
       build_file = str(Label("//third_party:farmhash.BUILD")),
   )
 
-  native.bind(
-      name = "farmhash",
-      actual = "@farmhash//:farmhash",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "highwayhash",
       urls = [
           "https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
-          # "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
+          "https://github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz",
       ],
       sha256 = "0f30a15b1566d93f146c8d149878a06e91d9bb7ec2cfd76906df62a82be4aac9",
       strip_prefix = "highwayhash-dfcb97ca4fe9277bf9dc1802dd979b071896453b",
       build_file = str(Label("//third_party:highwayhash.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "nasm",
       urls = [
           "https://mirror.bazel.build/www.nasm.us/pub/nasm/releasebuilds/2.12.02/nasm-2.12.02.tar.bz2",
@@ -288,30 +194,29 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:nasm.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "jpeg",
       urls = [
           "https://mirror.bazel.build/github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
-          # "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
+          "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/1.5.1.tar.gz",
       ],
       sha256 = "c15a9607892113946379ccea3ca8b85018301b200754f209453ab21674268e77",
       strip_prefix = "libjpeg-turbo-1.5.1",
       build_file = str(Label("//third_party/jpeg:jpeg.BUILD")),
-      repository = tf_repo_name,
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "png_archive",
       urls = [
           "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
-          # "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
+          "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
       ],
       sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
       strip_prefix = "libpng-1.2.53",
       build_file = str(Label("//third_party:png.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "sqlite_archive",
       urls = [
           "https://mirror.bazel.build/www.sqlite.org/2017/sqlite-amalgamation-3200000.zip",
@@ -319,10 +224,10 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "208780b3616f9de0aeb50822b7a8f5482f6515193859e91ed61637be6ad74fd4",
       strip_prefix = "sqlite-amalgamation-3200000",
-      build_file = str(Label("//third_party:sqlite.BUILD"))
+      build_file = str(Label("//third_party:sqlite.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "gif_archive",
       urls = [
           "https://mirror.bazel.build/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz",
@@ -333,7 +238,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:gif.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "six_archive",
       urls = [
           "https://mirror.bazel.build/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz",
@@ -344,7 +249,17 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:six.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
+      name = "absl_py",
+      urls = [
+          "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/acec853355ef987eae48a8d87a79351c15dff593.tar.gz",
+          "https://github.com/abseil/abseil-py/archive/acec853355ef987eae48a8d87a79351c15dff593.tar.gz",
+      ],
+      sha256 = "29e4584e778bee13aa4093824133d131d927cc160561892880118d9ff7b95a6a",
+      strip_prefix = "abseil-py-acec853355ef987eae48a8d87a79351c15dff593",
+  )
+
+  tf_http_archive(
       name = "org_python_pypi_backports_weakref",
       urls = [
           "https://mirror.bazel.build/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz",
@@ -355,11 +270,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:backports_weakref.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "com_github_andreif_codegen",
       urls = [
           "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
-          # "https://github.com/andreif/codegen/archive/1.0.tar.gz",
+          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
       ],
       sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
       strip_prefix = "codegen-1.0",
@@ -377,17 +292,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       },
   )
 
-  native.bind(
-      name = "six",
-      actual = "@six_archive//:six",
-  )
-
-  # TODO(gunan): Add github mirror back if/when sha256sum issues are resolved.
-  #   See https://github.com/libgit2/libgit2/issues/4343 for contetxt.
-  patched_http_archive(
+  tf_http_archive(
       name = "protobuf_archive",
       urls = [
           "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
       ],
       sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
       strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
@@ -398,70 +307,60 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       patch_file = str(Label("//third_party/protobuf:add_noinlines.patch")),
   )
 
-  native.bind(
-      name = "protobuf",
-      actual = "@protobuf_archive//:protobuf",
-  )
-
   # We need to import the protobuf library under the names com_google_protobuf
   # and com_google_protobuf_cc to enable proto_library support in bazel.
   # Unfortunately there is no way to alias http_archives at the moment.
-  # TODO(gunan): Add github mirror back if/when sha256sum issues are resolved.
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_protobuf",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
       ],
-      sha256 = "6d43b9d223ce09e5d4ce8b0060cb8a7513577a35a64c7e3dad10f0703bf3ad93",
-      strip_prefix = "protobuf-0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66",
+      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
+      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
   )
 
-  # TODO(gunan): Add github mirror back if/when sha256sum issues are resolved.
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_protobuf_cc",
       urls = [
-          "https://mirror.bazel.build/github.com/google/protobuf/archive/0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66.tar.gz",
+          "https://mirror.bazel.build/github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
+          "https://github.com/google/protobuf/archive/b04e5cba356212e4e8c66c61bbe0c3a20537c5b9.tar.gz",
       ],
-      sha256 = "6d43b9d223ce09e5d4ce8b0060cb8a7513577a35a64c7e3dad10f0703bf3ad93",
-      strip_prefix = "protobuf-0b059a3d8a8f8aa40dde7bea55edca4ec5dfea66",
+      sha256 = "e178a25c52efcb6b05988bdbeace4c0d3f2d2fe5b46696d1d9898875c3803d6a",
+      strip_prefix = "protobuf-b04e5cba356212e4e8c66c61bbe0c3a20537c5b9",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "nsync",
       urls = [
-          "https://mirror.bazel.build/github.com/google/nsync/archive/839fcc53ff9be58218ed55397deb3f8376a1444e.tar.gz",
-          # "https://github.com/google/nsync/archive/839fcc53ff9be58218ed55397deb3f8376a1444e.tar.gz",
+          "https://mirror.bazel.build/github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
+          "https://github.com/google/nsync/archive/8502189abfa44c249c01c2cad64e6ed660a9a668.tar.gz",
       ],
-      sha256 = "124d105edb0313ef2d7f5bb86ec94d9f8de95479e55641c4254ffa8f795e9b37",
-      strip_prefix = "nsync-839fcc53ff9be58218ed55397deb3f8376a1444e",
+      sha256 = "51f81ff4202bbb820cdbedc061bd2eb6765f2b5c06489e7a8694bedac329e8f8",
+      strip_prefix = "nsync-8502189abfa44c249c01c2cad64e6ed660a9a668",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_google_googletest",
       urls = [
           "https://mirror.bazel.build/github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
-          # "https://github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
+          "https://github.com/google/googletest/archive/9816b96a6ddc0430671693df90192bbee57108b6.zip",
       ],
       sha256 = "9cbca84c4256bed17df2c8f4d00c912c19d247c11c9ba6647cd6dd5b5c996b8d",
       strip_prefix = "googletest-9816b96a6ddc0430671693df90192bbee57108b6",
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "com_github_gflags_gflags",
       urls = [
           "https://mirror.bazel.build/github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
-          # "https://github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
+          "https://github.com/gflags/gflags/archive/f8a0efe03aa69b3336d8e228b37d4ccb17324b88.tar.gz",
       ],
       sha256 = "4d222fab8f1ede4709cdff417d15a1336f862d7334a81abf76d09c15ecf9acd1",
       strip_prefix = "gflags-f8a0efe03aa69b3336d8e228b37d4ccb17324b88",
   )
 
-  native.bind(
-      name = "python_headers",
-      actual = str(Label("//util/python:python_headers")),
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "pcre",
       sha256 = "ccdf7e788769838f8285b3ee672ed573358202305ee361cfec7a4a4fb005bbc7",
       urls = [
@@ -472,7 +371,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:pcre.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "swig",
       sha256 = "58a475dbbd4a4d7075e5fe86d4e54c9edde39847cdb96a3053d87cb64a23a453",
       urls = [
@@ -484,7 +383,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:swig.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "curl",
       sha256 = "ff3e80c1ca6a068428726cd7dd19037a47cc538ce58ef61c59587191039b2ca6",
       urls = [
@@ -493,57 +392,24 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       strip_prefix = "curl-7.49.1",
       build_file = str(Label("//third_party:curl.BUILD")),
-      repository = tf_repo_name
-  )
-
-  # grpc expects //external:protobuf_clib and //external:protobuf_compiler
-  # to point to the protobuf's compiler library.
-  native.bind(
-      name = "protobuf_clib",
-      actual = "@protobuf_archive//:protoc_lib",
-  )
-
-  native.bind(
-      name = "libssl",
-      actual = "@boringssl//:ssl",
   )
 
-  # gRPC has includes directly from their third_party path for nanopb, so we
-  # must depend on their version of it.
-  native.bind(
-      name = "nanopb",
-      actual = "@grpc//third_party/nanopb:nanopb",
-  )
-
-  patched_http_archive(
+  tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
-          # "https://github.com/grpc/grpc/archive/781fd6f6ea03645a520cd5c675da67ab61f87e4b.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/f836c7e941beb003289dc6e9a58a6e47f5caa5f0.tar.gz",
+          "https://github.com/grpc/grpc/archive/f836c7e941beb003289dc6e9a58a6e47f5caa5f0.tar.gz",
       ],
-      sha256 = "2004635e6a078acfac8ffa71738397796be4f8fb72f572cc44ecee5d99511d9f",
-      strip_prefix = "grpc-781fd6f6ea03645a520cd5c675da67ab61f87e4b",
-      patch_file = str(Label("//third_party/grpc:grpc.patch")),
+      sha256 = "676425fc19e0290443b21f1804e5d1096456b6512b349606e3eae8e63299e6ee",
+      strip_prefix = "grpc-f836c7e941beb003289dc6e9a58a6e47f5caa5f0",
   )
 
-  # protobuf expects //external:grpc_cpp_plugin to point to grpc's
-  # C++ plugin code generator.
-  native.bind(
-      name = "grpc_cpp_plugin",
-      actual = "@grpc//:grpc_cpp_plugin",
-  )
-
-  native.bind(
-      name = "grpc_lib",
-      actual = "@grpc//:grpc++_unsecure",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
       urls = [
           "https://mirror.bazel.build/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
-          # "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
+          "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz",
       ],
       strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3",
       build_file = str(Label("//third_party:linenoise.BUILD")),
@@ -551,55 +417,50 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
 
   # TODO(phawkins): currently, this rule uses an unofficial LLVM mirror.
   # Switch to an official source of snapshots if/when possible.
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/bb3c660e87f59abb665570a31b01ab125ec4c10e.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/bb3c660e87f59abb665570a31b01ab125ec4c10e.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/9ab4c272cb604a7f947865428c4ef2169fee2100.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/9ab4c272cb604a7f947865428c4ef2169fee2100.tar.gz",
       ],
-      sha256 = "caab6d7978e6771cb4e9b5b89607c5370de8aa642913c6c14e892468194c94e4",
-      strip_prefix = "llvm-bb3c660e87f59abb665570a31b01ab125ec4c10e",
+      sha256 = "1b1b7d3800a94ca2302e3dd670dbe84238749583027883784b55297059d83da8",
+      strip_prefix = "llvm-9ab4c272cb604a7f947865428c4ef2169fee2100",
       build_file = str(Label("//third_party/llvm:llvm.BUILD")),
-      repository = tf_repo_name,
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "lmdb",
       urls = [
           "https://mirror.bazel.build/github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
-          # "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
+          "https://github.com/LMDB/lmdb/archive/LMDB_0.9.19.tar.gz",
       ],
       sha256 = "108532fb94c6f227558d45be3f3347b52539f0f58290a7bb31ec06c462d05326",
       strip_prefix = "lmdb-LMDB_0.9.19/libraries/liblmdb",
       build_file = str(Label("//third_party:lmdb.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "jsoncpp_git",
       urls = [
           "https://mirror.bazel.build/github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
-          # "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
+          "https://github.com/open-source-parsers/jsoncpp/archive/11086dd6a7eba04289944367ca82cea71299ed70.tar.gz",
       ],
       sha256 = "07d34db40593d257324ec5fb9debc4dc33f29f8fb44e33a2eeb35503e61d0fe2",
       strip_prefix = "jsoncpp-11086dd6a7eba04289944367ca82cea71299ed70",
       build_file = str(Label("//third_party:jsoncpp.BUILD")),
   )
 
-  native.bind(
-      name = "jsoncpp",
-      actual = "@jsoncpp_git//:jsoncpp",
-  )
-
-  native.http_archive(
+  tf_http_archive(
       name = "boringssl",
       urls = [
           "https://mirror.bazel.build/github.com/google/boringssl/archive/a0fb951d2a26a8ee746b52f3ba81ab011a0af778.tar.gz",
+          "https://github.com/google/boringssl/archive/a0fb951d2a26a8ee746b52f3ba81ab011a0af778.tar.gz",
       ],
       sha256 = "524ba98a56300149696481b4cb9ddebd0c7b7ac9b9f6edee81da2d2d7e5d2bb3",
       strip_prefix = "boringssl-a0fb951d2a26a8ee746b52f3ba81ab011a0af778",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "zlib_archive",
       urls = [
           "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
@@ -610,12 +471,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:zlib.BUILD")),
   )
 
-  native.bind(
-      name = "zlib",
-      actual = "@zlib_archive//:zlib",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "fft2d",
       urls = [
           "https://mirror.bazel.build/www.kurims.kyoto-u.ac.jp/~ooura/fft.tgz",
@@ -625,40 +481,37 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party/fft2d:fft2d.BUILD")),
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "snappy",
       urls = [
           "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
-          # "https://github.com/google/snappy/archive/1.1.4.tar.gz",
+          "https://github.com/google/snappy/archive/1.1.4.tar.gz",
       ],
       sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
       strip_prefix = "snappy-1.1.4",
       build_file = str(Label("//third_party:snappy.BUILD")),
-      repository = tf_repo_name,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "nccl_archive",
       urls = [
           "https://mirror.bazel.build/github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
-          # "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
+          "https://github.com/nvidia/nccl/archive/03d856977ecbaac87e598c0c4bafca96761b9ac7.tar.gz",
       ],
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
       build_file = str(Label("//third_party:nccl.BUILD")),
-      repository = tf_repo_name,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "aws",
       urls = [
-          "http://bazel-mirror.storage.googleapis.com/github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
-          # "https://github.com/aws/aws-sdk-cpp/archive/1.0.90.tar.gz",
+          "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+          "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
       ],
-      sha256 = "f599b57aec4f03ad696044dd430b2d201864113937353adc346f53ad47991319",
-      strip_prefix = "aws-sdk-cpp-1.0.90",
+      sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
+      strip_prefix = "aws-sdk-cpp-1.3.15",
       build_file = str(Label("//third_party:aws.BUILD")),
-      repository = tf_repo_name
   )
 
   java_import_external(
@@ -686,16 +539,15 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       testonly_ = True,
   )
 
-  temp_workaround_http_archive(
+  tf_http_archive(
       name = "jemalloc",
       urls = [
           "https://mirror.bazel.build/github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
-          # "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
+          "https://github.com/jemalloc/jemalloc/archive/4.4.0.tar.gz",
       ],
       sha256 = "3c8f25c02e806c3ce0ab5fb7da1817f89fc9732709024e2a81b6b82f7cc792a8",
       strip_prefix = "jemalloc-4.4.0",
       build_file = str(Label("//third_party:jemalloc.BUILD")),
-      repository = tf_repo_name,
   )
 
   java_import_external(
@@ -733,34 +585,29 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       licenses = ["notice"],  # Apache 2.0
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "com_google_pprof",
       urls = [
           "https://mirror.bazel.build/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
-          # "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
+          "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz",
       ],
       sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4",
       strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650",
       build_file = str(Label("//third_party:pprof.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "cub_archive",
       urls = [
           "https://mirror.bazel.build/github.com/NVlabs/cub/archive/1.7.4.zip",
-          # "https://github.com/NVlabs/cub/archive/1.7.4.zip",
+          "https://github.com/NVlabs/cub/archive/1.7.4.zip",
       ],
       sha256 = "20a1a39fd97e5da7f40f5f2e7fd73fd2ea59f9dc4bb8a6c5f228aa543e727e31",
       strip_prefix = "cub-1.7.4",
       build_file = str(Label("//third_party:cub.BUILD")),
   )
 
-  native.bind(
-      name = "cub",
-      actual = "@cub_archive//:cub",
-  )
-
-  native.new_http_archive(
+  tf_http_archive(
       name = "cython",
       sha256 = "6dcd30b5ceb887b2b965ee7ceb82ea3acb5f0642fe2206c7636b45acea4798e5",
       urls = [
@@ -769,19 +616,20 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       strip_prefix = "cython-3732784c45cfb040a5b0936951d196f83a12ea17",
       build_file = str(Label("//third_party:cython.BUILD")),
+      delete = ["BUILD.bazel"],
   )
 
-  native.http_archive(
+  tf_http_archive(
       name = "bazel_toolchains",
       urls = [
-          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/af4681c3d19f063f090222ec3d04108c4e0ca255.tar.gz",
-          # "https://github.com/bazelbuild/bazel-toolchains/archive/af4681c3d19f063f090222ec3d04108c4e0ca255.tar.gz",
+          "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/b49ba3689f46ac50e9277dafd8ff32b26951f82e.tar.gz",
+          "https://github.com/bazelbuild/bazel-toolchains/archive/b49ba3689f46ac50e9277dafd8ff32b26951f82e.tar.gz",
       ],
-      sha256 = "d58bb2d6c8603f600d522b6104d6192a65339aa26cbba9f11ff5c4b36dedb928",
-      strip_prefix = "bazel-toolchains-af4681c3d19f063f090222ec3d04108c4e0ca255",
+      sha256 = "1266f1e27b4363c83222f1a776397c7a069fbfd6aacc9559afa61cdd73e1b429",
+      strip_prefix = "bazel-toolchains-b49ba3689f46ac50e9277dafd8ff32b26951f82e",
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "arm_neon_2_x86_sse",
       sha256 = "c8d90aa4357f8079d427e87a6f4c493da1fa4140aee926c05902d7ec1533d9a5",
       strip_prefix = "ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d",
@@ -792,13 +640,109 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = str(Label("//third_party:arm_neon_2_x86_sse.BUILD")),
   )
 
-  native.new_http_archive(
+  tf_http_archive(
       name = "flatbuffers",
-      build_file = "third_party/flatbuffers/flatbuffers.BUILD",
       strip_prefix = "flatbuffers-971a68110e4fc1bace10fcb6deeb189e7e1a34ce",
       sha256 = "874088d2ee0d9f8524191f77209556415f03dd44e156276edf19e5b90ceb5f55",
       urls = [
           "https://mirror.bazel.build/github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
           "https://github.com/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz",
       ],
+      build_file = str(Label("//third_party/flatbuffers:flatbuffers.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_mobilenet",
+      sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
+      ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_smartreply",
+      sha256 = "8980151b85a87a9c1a3bb1ed4748119e4a85abd3cb5744d83da4d4bd0fbeef7c",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip"
+      ],
+      build_file = str(Label("//third_party:tflite_smartreply.BUILD")),
+  )
+
+  ##############################################################################
+  # BIND DEFINITIONS
+  #
+  # Please do not add bind() definitions unless we have no other choice.
+  # If that ends up being the case, please leave a comment explaining
+  # why we can't depend on the canonical build target.
+
+  # gRPC wants a cares dependency but its contents is not actually
+  # important since we have set GRPC_ARES=0 in tools/bazel.rc
+  native.bind(
+      name = "cares",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "grpc_cpp_plugin",
+      actual = "@grpc//:grpc_cpp_plugin",
+  )
+
+  # gRPC has three empty C++ functions which it wants the user to define
+  # at build time. https://github.com/grpc/grpc/issues/13590
+  native.bind(
+      name = "grpc_lib",
+      actual = "@grpc//:grpc++_unsecure",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "libssl",
+      actual = "@boringssl//:ssl",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "nanopb",
+      actual = "@grpc//third_party/nanopb:nanopb",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "protobuf",
+      actual = "@protobuf_archive//:protobuf",
+  )
+
+  # gRPC expects //external:protobuf_clib and //external:protobuf_compiler
+  # to point to Protobuf's compiler library.
+  native.bind(
+      name = "protobuf_clib",
+      actual = "@protobuf_archive//:protoc_lib",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "protobuf_headers",
+      actual = "@protobuf_archive//:protobuf_headers",
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "python_headers",
+      actual = str(Label("//util/python:python_headers")),
+  )
+
+  # Needed by Protobuf
+  native.bind(
+      name = "six",
+      actual = "@six_archive//:six",
+  )
+
+  # Needed by gRPC
+  native.bind(
+      name = "zlib",
+      actual = "@zlib_archive//:zlib",
   )
diff --git a/third_party/aws.BUILD b/third_party/aws.BUILD
index bc6a2fd8cc6b8db251a026749daef9c0f6e875f5..bf5310aa1657dee5e0ccc623b2028a4f8ab7aca3 100644
--- a/third_party/aws.BUILD
+++ b/third_party/aws.BUILD
@@ -7,18 +7,21 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "aws",
     srcs = select({
-        "@%ws%//tensorflow:linux_x86_64": glob([
+        "@org_tensorflow//tensorflow:linux_x86_64": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:darwin": glob([
+        "@org_tensorflow//tensorflow:darwin": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
-        "@%ws%//tensorflow:linux_ppc64le": glob([
+        "@org_tensorflow//tensorflow:linux_ppc64le": glob([
+            "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
+        ]),
+        "@org_tensorflow//tensorflow:raspberry_pi_armeabi": glob([
             "aws-cpp-sdk-core/source/platform/linux-shared/*.cpp",
         ]),
         "//conditions:default": [],
@@ -50,17 +53,17 @@ cc_library(
         "aws-cpp-sdk-core/include/aws/core/SDKConfig.h",
     ],
     defines = select({
-        "@%ws%//tensorflow:linux_x86_64": [
+        "@org_tensorflow//tensorflow:linux_x86_64": [
             "PLATFORM_LINUX",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "PLATFORM_APPLE",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
         ],
-        "@%ws%//tensorflow:linux_ppc64le": [
+        "@org_tensorflow//tensorflow:linux_ppc64le": [
             "PLATFORM_LINUX",
             "ENABLE_CURL_CLIENT",
             "ENABLE_NO_ENCRYPTION",
diff --git a/third_party/curl.BUILD b/third_party/curl.BUILD
index 882967df1c15eceb57c6b0979c8d45abafd73bac..4def6f94892329e0d8b594b824babd60ea259351 100644
--- a/third_party/curl.BUILD
+++ b/third_party/curl.BUILD
@@ -6,10 +6,11 @@ licenses(["notice"])  # MIT/X derivative license
 exports_files(["COPYING"])
 
 CURL_WIN_COPTS = [
-    "/I%prefix%/curl/lib",
+    "/Iexternal/curl/lib",
     "/DHAVE_CONFIG_H",
     "/DCURL_DISABLE_FTP",
     "/DCURL_DISABLE_NTLM",
+    "/DCURL_DISABLE_PROXY",
     "/DHAVE_LIBZ",
     "/DHAVE_ZLIB_H",
     # Defining _USING_V110_SDK71_ is hackery to defeat curl's incorrect
@@ -23,6 +24,8 @@ CURL_WIN_SRCS = [
     "lib/asyn-thread.c",
     "lib/inet_ntop.c",
     "lib/system_win32.c",
+    "lib/vtls/schannel.c",
+    "lib/idn_win32.c",
 ]
 
 cc_library(
@@ -224,14 +227,14 @@ cc_library(
         "lib/wildcard.h",
         "lib/x509asn1.h",
     ] + select({
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "lib/vtls/darwinssl.c",
         ],
-        "@%ws%//tensorflow:ios": [
+        "@org_tensorflow//tensorflow:ios": [
             "lib/vtls/darwinssl.c",
         ],
-        "@%ws%//tensorflow:windows": CURL_WIN_SRCS,
-        "@%ws%//tensorflow:windows_msvc": CURL_WIN_SRCS,
+        "@org_tensorflow//tensorflow:windows": CURL_WIN_SRCS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_SRCS,
         "//conditions:default": [
             "lib/vtls/openssl.c",
         ],
@@ -248,10 +251,10 @@ cc_library(
         "include/curl/typecheck-gcc.h",
     ],
     copts = select({
-        "@%ws%//tensorflow:windows": CURL_WIN_COPTS,
-        "@%ws%//tensorflow:windows_msvc": CURL_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows": CURL_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_WIN_COPTS,
         "//conditions:default": [
-            "-I%prefix%/curl/lib",
+            "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
             "-DHAVE_CONFIG_H",
             "-DCURL_DISABLE_FTP",
@@ -261,14 +264,14 @@ cc_library(
             "-Wno-string-plus-int",
         ],
     }) + select({
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-fno-constant-cfstrings",
         ],
-        "@%ws%//tensorflow:windows": [
+        "@org_tensorflow//tensorflow:windows": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
-        "@%ws%//tensorflow:windows_msvc": [
+        "@org_tensorflow//tensorflow:windows_msvc": [
             # See curl.h for discussion of write size and Windows
             "/DCURL_MAX_WRITE_SIZE=16384",
         ],
@@ -276,23 +279,30 @@ cc_library(
             "-DCURL_MAX_WRITE_SIZE=65536",
         ],
     }),
+    defines = ["CURL_STATICLIB"],
     includes = ["include"],
     linkopts = select({
-        "@%ws%//tensorflow:android": [
+        "@org_tensorflow//tensorflow:android": [
             "-pie",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "-Wl,ws2_32.lib",
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "-DEFAULTLIB:ws2_32.lib",
+            "-DEFAULTLIB:advapi32.lib",
+            "-DEFAULTLIB:crypt32.lib",
+            "-DEFAULTLIB:Normaliz.lib",
         ],
-        "@%ws%//tensorflow:windows_msvc": [
-            "-Wl,ws2_32.lib",
+        "@org_tensorflow//tensorflow:windows_msvc": [
+            "-DEFAULTLIB:ws2_32.lib",
+            "-DEFAULTLIB:advapi32.lib",
+            "-DEFAULTLIB:crypt32.lib",
+            "-DEFAULTLIB:Normaliz.lib",
         ],
         "//conditions:default": [
             "-lrt",
@@ -302,9 +312,9 @@ cc_library(
     deps = [
         "@zlib_archive//:zlib",
     ] + select({
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [],
-        "@%ws%//tensorflow:windows_msvc": [],
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [],
+        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "@boringssl//:ssl",
         ],
@@ -312,7 +322,7 @@ cc_library(
 )
 
 CURL_BIN_WIN_COPTS = [
-    "/I%prefix%/curl/lib",
+    "/Iexternal/curl/lib",
     "/DHAVE_CONFIG_H",
     "/DCURL_DISABLE_LIBCURL_OPTION",
 ]
@@ -406,10 +416,10 @@ cc_binary(
         "src/tool_xattr.h",
     ],
     copts = select({
-        "@%ws%//tensorflow:windows": CURL_BIN_WIN_COPTS,
-        "@%ws%//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows": CURL_BIN_WIN_COPTS,
+        "@org_tensorflow//tensorflow:windows_msvc": CURL_BIN_WIN_COPTS,
         "//conditions:default": [
-            "-I%prefix%/curl/lib",
+            "-Iexternal/curl/lib",
             "-D_GNU_SOURCE",
             "-DHAVE_CONFIG_H",
             "-DCURL_DISABLE_LIBCURL_OPTION",
@@ -438,12 +448,22 @@ genrule(
         "#  include \"lib/config-win32.h\"",
         "#  define BUILDING_LIBCURL 1",
         "#  define CURL_DISABLE_CRYPTO_AUTH 1",
+        "#  define CURL_DISABLE_DICT 1",
+        "#  define CURL_DISABLE_FILE 1",
+        "#  define CURL_DISABLE_GOPHER 1",
         "#  define CURL_DISABLE_IMAP 1",
         "#  define CURL_DISABLE_LDAP 1",
         "#  define CURL_DISABLE_LDAPS 1",
         "#  define CURL_DISABLE_POP3 1",
         "#  define CURL_PULL_WS2TCPIP_H 1",
-        "#  define HTTP_ONLY 1",
+        "#  define CURL_DISABLE_SMTP 1",
+        "#  define CURL_DISABLE_TELNET 1",
+        "#  define CURL_DISABLE_TFTP 1",
+        "#  define CURL_PULL_WS2TCPIP_H 1",
+        "#  define USE_WINDOWS_SSPI 1",
+        "#  define USE_WIN32_IDN 1",
+        "#  define USE_SCHANNEL 1",
+        "#  define WANT_IDN_PROTOTYPES 1",
         "#elif defined(__APPLE__)",
         "#  define HAVE_FSETXATTR_6 1",
         "#  define HAVE_SETMODE 1",
@@ -477,7 +497,6 @@ genrule(
         "#  define HAVE_RAND_EGD 1",
         "#  define HAVE_RAND_STATUS 1",
         "#  define HAVE_SSL_GET_SHUTDOWN 1",
-        "#  define HAVE_STROPTS_H 1",
         "#  define HAVE_TERMIOS_H 1",
         "#  define OS \"x86_64-pc-linux-gnu\"",
         "#  define RANDOM_FILE \"/dev/urandom\"",
diff --git a/third_party/eigen.BUILD b/third_party/eigen.BUILD
index dc6de7bbda172ff9cb22c89b84341e4087ef76d3..07bb6645ebc2faa47ed7d52dc0e5975e55f0ed32 100644
--- a/third_party/eigen.BUILD
+++ b/third_party/eigen.BUILD
@@ -27,7 +27,6 @@ EIGEN_RESTRICTED_DEPS = [
     "Eigen/SparseLU",
 ]
 
-# Note: unsupported/Eigen is unsupported and might go away at any time.
 EIGEN_FILES = [
     "Eigen/**",
     "unsupported/Eigen/CXX11/**",
@@ -37,6 +36,7 @@ EIGEN_FILES = [
     "unsupported/Eigen/src/KroneckerProduct/**",
     "unsupported/Eigen/MatrixFunctions",
     "unsupported/Eigen/SpecialFunctions",
+    "unsupported/Eigen/src/MatrixFunctions/**",
     "unsupported/Eigen/src/SpecialFunctions/**",
 ]
 
diff --git a/third_party/eigen3/BUILD b/third_party/eigen3/BUILD
index ad87477b7aa304581c9164d3d10574c1069f03cc..f5f3418527f2ae0a948ac15645ebd905b59bcabf 100644
--- a/third_party/eigen3/BUILD
+++ b/third_party/eigen3/BUILD
@@ -26,6 +26,7 @@ cc_library(
         "Eigen/Eigenvalues",
         "Eigen/QR",
         "Eigen/SVD",
+        "unsupported/Eigen/MatrixFunctions",
         "unsupported/Eigen/SpecialFunctions",
         "unsupported/Eigen/CXX11/ThreadPool",
         "unsupported/Eigen/CXX11/Tensor",
diff --git a/third_party/eigen3/unsupported/Eigen/MatrixFunctions b/third_party/eigen3/unsupported/Eigen/MatrixFunctions
new file mode 100644
index 0000000000000000000000000000000000000000..314b325f8c293e1942a0fce82bc123b1bd7d5733
--- /dev/null
+++ b/third_party/eigen3/unsupported/Eigen/MatrixFunctions
@@ -0,0 +1 @@
+#include "unsupported/Eigen/MatrixFunctions"
diff --git a/third_party/examples/eager/spinn/BUILD b/third_party/examples/eager/spinn/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..0e39d4696fb5b4efafc94b4b96965d232ae4e473
--- /dev/null
+++ b/third_party/examples/eager/spinn/BUILD
@@ -0,0 +1,14 @@
+licenses(["notice"])  # 3-clause BSD.
+
+py_binary(
+    name = "spinn",
+    srcs = ["spinn.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/contrib/eager/python:tfe",
+        "//tensorflow/contrib/eager/python/examples/spinn:data",
+        "@six_archive//:six",
+    ],
+)
diff --git a/third_party/examples/eager/spinn/LICENSE b/third_party/examples/eager/spinn/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..09d493bf1fc257505c1336f3f87425568ab9da3c
--- /dev/null
+++ b/third_party/examples/eager/spinn/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017, 
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/examples/eager/spinn/README.md b/third_party/examples/eager/spinn/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c00d8d9015939575bc1d72ad911fa3f31d890caa
--- /dev/null
+++ b/third_party/examples/eager/spinn/README.md
@@ -0,0 +1,54 @@
+# SPINN with TensorFlow eager execution
+
+SPINN, or Stack-Augmented Parser-Interpreter Neural Network, is a recursive
+neural network that utilizes syntactic parse information for natural language
+understanding.
+
+SPINN was originally described by:
+Bowman, S.R., Gauthier, J., Rastogi A., Gupta, R., Manning, C.D., & Potts, C.
+  (2016). A Fast Unified Model for Parsing and Sentence Understanding.
+  https://arxiv.org/abs/1603.06021
+
+Our implementation is based on @jekbradbury's PyTorch implementation at:
+https://github.com/jekbradbury/examples/blob/spinn/snli/spinn.py,
+
+which was released under the BSD 3-Clause License at:
+https://github.com/jekbradbury/examples/blob/spinn/LICENSE
+
+##  Content
+
+Python source file(s):
+- `spinn.py`: Model definition and training routines written with TensorFlow
+  eager execution idioms.
+
+## To run
+
+- Make sure you have installed the latest `tf-nightly` or `tf-nightly-gpu` pip
+  package of TensorFlow in order to access the eager execution feature.
+
+- Download and extract the raw SNLI data and GloVe embedding vectors.
+  For example:
+
+  ```bash
+  curl -fSsL https://nlp.stanford.edu/projects/snli/snli_1.0.zip --create-dirs -o /tmp/spinn-data/snli/snli_1.0.zip
+  unzip -d /tmp/spinn-data/snli /tmp/spinn-data/snli/snli_1.0.zip
+  curl -fSsL http://nlp.stanford.edu/data/glove.42B.300d.zip --create-dirs -o /tmp/spinn-data/glove/glove.42B.300d.zip
+  unzip -d /tmp/spinn-data/glove /tmp/spinn-data/glove/glove.42B.300d.zip
+  ```
+
+- Train model. E.g.,
+
+  ```bash
+  python spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs
+  ```
+
+  During training, model checkpoints and TensorBoard summaries will be written
+  periodically to the directory specified with the `--logdir` flag.
+  The training script will reload a saved checkpoint from the directory if it
+  can find one there.
+
+  To view the summaries with TensorBoard:
+
+  ```bash
+  tensorboard --logdir /tmp/spinn-logs
+  ```
diff --git a/third_party/examples/eager/spinn/spinn.py b/third_party/examples/eager/spinn/spinn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2fa18eeb1077c8a1ccd4ab0bcd178f952e17270
--- /dev/null
+++ b/third_party/examples/eager/spinn/spinn.py
@@ -0,0 +1,732 @@
+r"""Implementation of SPINN in TensorFlow eager execution.
+
+SPINN: Stack-Augmented Parser-Interpreter Neural Network.
+
+Ths file contains model definition and code for training the model.
+
+The model definition is based on PyTorch implementation at:
+  https://github.com/jekbradbury/examples/tree/spinn/snli
+
+which was released under a BSD 3-Clause License at:
+https://github.com/jekbradbury/examples/blob/spinn/LICENSE:
+
+Copyright (c) 2017,
+All rights reserved.
+
+See ./LICENSE for more details.
+
+Instructions for use:
+* See `README.md` for details on how to prepare the SNLI and GloVe data.
+* Suppose you have prepared the data at "/tmp/spinn-data", use the folloing
+  command to train the model:
+
+  ```bash
+  python spinn.py --data_root /tmp/spinn-data --logdir /tmp/spinn-logs
+  ```
+
+  Checkpoints and TensorBoard summaries will be written to "/tmp/spinn-logs".
+
+References:
+* Bowman, S.R., Gauthier, J., Rastogi A., Gupta, R., Manning, C.D., & Potts, C.
+  (2016). A Fast Unified Model for Parsing and Sentence Understanding.
+  https://arxiv.org/abs/1603.06021
+* Bradbury, J. (2017). Recursive Neural Networks with PyTorch.
+  https://devblogs.nvidia.com/parallelforall/recursive-neural-networks-pytorch/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import itertools
+import os
+import sys
+import time
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+import tensorflow.contrib.eager as tfe
+from tensorflow.contrib.eager.python.examples.spinn import data
+
+
+def _bundle(lstm_iter):
+  """Concatenate a list of Tensors along 1st axis and split result into two.
+
+  Args:
+    lstm_iter: A `list` of `N` dense `Tensor`s, each of which has the shape
+      (R, 2 * M).
+
+  Returns:
+    A `list` of two dense `Tensor`s, each of which has the shape (N * R, M).
+  """
+  return tf.split(tf.concat(lstm_iter, 0), 2, axis=1)
+
+
+def _unbundle(state):
+  """Concatenate a list of Tensors along 2nd axis and split result.
+
+  This is the inverse of `_bundle`.
+
+  Args:
+    state: A `list` of two dense `Tensor`s, each of which has the shape (R, M).
+
+  Returns:
+    A `list` of `R` dense `Tensors`, each of which has the shape (1, 2 * M).
+  """
+  return tf.split(tf.concat(state, 1), state[0].shape[0], axis=0)
+
+
+class Reducer(tfe.Network):
+  """A module that applies reduce operation on left and right vectors."""
+
+  def __init__(self, size, tracker_size=None):
+    super(Reducer, self).__init__()
+    self.left = self.track_layer(tf.layers.Dense(5 * size, activation=None))
+    self.right = self.track_layer(
+        tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    if tracker_size is not None:
+      self.track = self.track_layer(
+          tf.layers.Dense(5 * size, activation=None, use_bias=False))
+    else:
+      self.track = None
+
+  def call(self, left_in, right_in, tracking=None):
+    """Invoke forward pass of the Reduce module.
+
+    This method feeds a linear combination of `left_in`, `right_in` and
+    `tracking` into a Tree LSTM and returns the output of the Tree LSTM.
+
+    Args:
+      left_in: A list of length L. Each item is a dense `Tensor` with
+        the shape (1, n_dims). n_dims is the size of the embedding vector.
+      right_in: A list of the same length as `left_in`. Each item should have
+        the same shape as the items of `left_in`.
+      tracking: Optional list of the same length as `left_in`. Each item is a
+        dense `Tensor` with shape (1, tracker_size * 2). tracker_size is the
+        size of the Tracker's state vector.
+
+    Returns:
+      Output: A list of length batch_size. Each item has the shape (1, n_dims).
+    """
+    left, right = _bundle(left_in), _bundle(right_in)
+    lstm_in = self.left(left[0]) + self.right(right[0])
+    if self.track and tracking:
+      lstm_in += self.track(_bundle(tracking)[0])
+    return _unbundle(self._tree_lstm(left[1], right[1], lstm_in))
+
+  def _tree_lstm(self, c1, c2, lstm_in):
+    a, i, f1, f2, o = tf.split(lstm_in, 5, axis=1)
+    c = tf.tanh(a) * tf.sigmoid(i) + tf.sigmoid(f1) * c1 + tf.sigmoid(f2) * c2
+    h = tf.sigmoid(o) * tf.tanh(c)
+    return h, c
+
+
+class Tracker(tfe.Network):
+  """A module that tracks the history of the sentence with an LSTM."""
+
+  def __init__(self, tracker_size, predict):
+    """Constructor of Tracker.
+
+    Args:
+      tracker_size: Number of dimensions of the underlying `LSTMCell`.
+      predict: (`bool`) Whether prediction mode is enabled.
+    """
+    super(Tracker, self).__init__()
+    self._rnn = self.track_layer(tf.nn.rnn_cell.LSTMCell(tracker_size))
+    self._state_size = tracker_size
+    if predict:
+      self._transition = self.track_layer(tf.layers.Dense(4))
+    else:
+      self._transition = None
+
+  def reset_state(self):
+    self.state = None
+
+  def call(self, bufs, stacks):
+    """Invoke the forward pass of the Tracker module.
+
+    This method feeds the concatenation of the top two elements of the stacks
+    into an LSTM cell and returns the resultant state of the LSTM cell.
+
+    Args:
+      bufs: A `list` of length batch_size. Each item is a `list` of
+        max_sequence_len (maximum sequence length of the batch). Each item
+        of the nested list is a dense `Tensor` of shape (1, d_proj), where
+        d_proj is the size of the word embedding vector or the size of the
+        vector space that the word embedding vector is projected to.
+      stacks: A `list` of size batch_size. Each item is a `list` of
+        variable length corresponding to the current height of the stack.
+        Each item of the nested list is a dense `Tensor` of shape (1, d_proj).
+
+    Returns:
+      1. A list of length batch_size. Each item is a dense `Tensor` of shape
+        (1, d_tracker * 2).
+      2.  If under predict mode, result of applying a Dense layer on the
+        first state vector of the RNN. Else, `None`.
+    """
+    buf = _bundle([buf[-1] for buf in bufs])[0]
+    stack1 = _bundle([stack[-1] for stack in stacks])[0]
+    stack2 = _bundle([stack[-2] for stack in stacks])[0]
+    x = tf.concat([buf, stack1, stack2], 1)
+    if self.state is None:
+      batch_size = int(x.shape[0])
+      zeros = tf.zeros((batch_size, self._state_size), dtype=tf.float32)
+      self.state = [zeros, zeros]
+    _, self.state = self._rnn(x, self.state)
+    unbundled = _unbundle(self.state)
+    if self._transition:
+      return unbundled, self._transition(self.state[0])
+    else:
+      return unbundled, None
+
+
+class SPINN(tfe.Network):
+  """Stack-augmented Parser-Interpreter Neural Network.
+
+  See https://arxiv.org/abs/1603.06021 for more details.
+  """
+
+  def __init__(self, config):
+    """Constructor of SPINN.
+
+    Args:
+      config: A `namedtupled` with the following attributes.
+        d_proj - (`int`) number of dimensions of the vector space to project the
+          word embeddings to.
+        d_tracker - (`int`) number of dimensions of the Tracker's state vector.
+        d_hidden - (`int`) number of the dimensions of the hidden state, for the
+          Reducer module.
+        n_mlp_layers - (`int`) number of multi-layer perceptron layers to use to
+          convert the output of the `Feature` module to logits.
+        predict - (`bool`) Whether the Tracker will enabled predictions.
+    """
+    super(SPINN, self).__init__()
+    self.config = config
+    self.reducer = self.track_layer(Reducer(config.d_hidden, config.d_tracker))
+    if config.d_tracker is not None:
+      self.tracker = self.track_layer(Tracker(config.d_tracker, config.predict))
+    else:
+      self.tracker = None
+
+  def call(self, buffers, transitions, training=False):
+    """Invoke the forward pass of the SPINN model.
+
+    Args:
+      buffers: Dense `Tensor` of shape
+        (max_sequence_len, batch_size, config.d_proj).
+      transitions: Dense `Tensor` with integer values that represent the parse
+        trees of the sentences. A value of 2 indicates "reduce"; a value of 3
+        indicates "shift". Shape: (max_sequence_len * 2 - 3, batch_size).
+      training: Whether the invocation is under training mode.
+
+    Returns:
+      Output `Tensor` of shape (batch_size, config.d_embed).
+    """
+    max_sequence_len, batch_size, d_proj = (int(x) for x in buffers.shape)
+
+    # Split the buffers into left and right word items and put the initial
+    # items in a stack.
+    splitted = tf.split(
+        tf.reshape(tf.transpose(buffers, [1, 0, 2]), [-1, d_proj]),
+        max_sequence_len * batch_size, axis=0)
+    buffers = [splitted[k:k + max_sequence_len]
+               for k in xrange(0, len(splitted), max_sequence_len)]
+    stacks = [[buf[0], buf[0]] for buf in buffers]
+
+    if self.tracker:
+      # Reset tracker state for new batch.
+      self.tracker.reset_state()
+
+    num_transitions = transitions.shape[0]
+
+    # Iterate through transitions and perform the appropriate stack-pop, reduce
+    # and stack-push operations.
+    transitions = transitions.numpy()
+    for i in xrange(num_transitions):
+      trans = transitions[i]
+      if self.tracker:
+        # Invoke tracker to obtain the current tracker states for the sentences.
+        tracker_states, trans_hypothesis = self.tracker(buffers, stacks)
+        if trans_hypothesis:
+          trans = tf.argmax(trans_hypothesis, axis=-1)
+      else:
+        tracker_states = itertools.repeat(None)
+      lefts, rights, trackings = [], [], []
+      for transition, buf, stack, tracking in zip(
+          trans, buffers, stacks, tracker_states):
+        if int(transition) == 3:  # Shift.
+          stack.append(buf.pop())
+        elif int(transition) == 2:  # Reduce.
+          rights.append(stack.pop())
+          lefts.append(stack.pop())
+          trackings.append(tracking)
+
+      if rights:
+        reducer_output = self.reducer(lefts, rights, trackings)
+        reduced = iter(reducer_output)
+
+        for transition, stack in zip(trans, stacks):
+          if int(transition) == 2:  # Reduce.
+            stack.append(next(reduced))
+    return _bundle([stack.pop() for stack in stacks])[0]
+
+
+class SNLIClassifier(tfe.Network):
+  """SNLI Classifier Model.
+
+  A model aimed at solving the SNLI (Standford Natural Language Inference)
+  task, using the SPINN model from above. For details of the task, see:
+    https://nlp.stanford.edu/projects/snli/
+  """
+
+  def __init__(self, config, embed):
+    """Constructor of SNLICLassifier.
+
+    Args:
+      config: A namedtuple containing required configurations for the model. It
+        needs to have the following attributes.
+        projection - (`bool`) whether the word vectors are to be projected onto
+          another vector space (of `d_proj` dimensions).
+        d_proj - (`int`) number of dimensions of the vector space to project the
+          word embeddings to.
+        embed_dropout - (`float`) dropout rate for the word embedding vectors.
+        n_mlp_layers - (`int`) number of multi-layer perceptron (MLP) layers to
+          use to convert the output of the `Feature` module to logits.
+        mlp_dropout - (`float`) dropout rate of the MLP layers.
+        d_out - (`int`) number of dimensions of the final output of the MLP
+          layers.
+        lr - (`float`) learning rate.
+      embed: A embedding matrix of shape (vocab_size, d_embed).
+    """
+    super(SNLIClassifier, self).__init__()
+    self.config = config
+    self.embed = tf.constant(embed)
+
+    self.projection = self.track_layer(tf.layers.Dense(config.d_proj))
+    self.embed_bn = self.track_layer(tf.layers.BatchNormalization())
+    self.embed_dropout = self.track_layer(
+        tf.layers.Dropout(rate=config.embed_dropout))
+    self.encoder = self.track_layer(SPINN(config))
+
+    self.feature_bn = self.track_layer(tf.layers.BatchNormalization())
+    self.feature_dropout = self.track_layer(
+        tf.layers.Dropout(rate=config.mlp_dropout))
+
+    self.mlp_dense = []
+    self.mlp_bn = []
+    self.mlp_dropout = []
+    for _ in xrange(config.n_mlp_layers):
+      self.mlp_dense.append(self.track_layer(tf.layers.Dense(config.d_mlp)))
+      self.mlp_bn.append(
+          self.track_layer(tf.layers.BatchNormalization()))
+      self.mlp_dropout.append(
+          self.track_layer(tf.layers.Dropout(rate=config.mlp_dropout)))
+    self.mlp_output = self.track_layer(tf.layers.Dense(
+        config.d_out,
+        kernel_initializer=tf.random_uniform_initializer(minval=-5e-3,
+                                                         maxval=5e-3)))
+
+  def call(self,
+           premise,
+           premise_transition,
+           hypothesis,
+           hypothesis_transition,
+           training=False):
+    """Invoke the forward pass the SNLIClassifier model.
+
+    Args:
+      premise: The word indices of the premise sentences, with shape
+        (max_prem_seq_len, batch_size).
+      premise_transition: The transitions for the premise sentences, with shape
+        (max_prem_seq_len * 2 - 3, batch_size).
+      hypothesis: The word indices of the hypothesis sentences, with shape
+        (max_hypo_seq_len, batch_size).
+      hypothesis_transition: The transitions for the hypothesis sentences, with
+        shape (max_hypo_seq_len * 2 - 3, batch_size).
+      training: Whether the invocation is under training mode.
+
+    Returns:
+      The logits, as a dense `Tensor` of shape (batch_size, d_out), where d_out
+      is the size of the output vector.
+    """
+    # Perform embedding lookup on the premise and hypothesis inputs, which have
+    # the word-index format.
+    premise_embed = tf.nn.embedding_lookup(self.embed, premise)
+    hypothesis_embed = tf.nn.embedding_lookup(self.embed, hypothesis)
+
+    if self.config.projection:
+      # Project the embedding vectors to another vector space.
+      premise_embed = self.projection(premise_embed)
+      hypothesis_embed = self.projection(hypothesis_embed)
+
+    # Perform batch normalization and dropout on the possibly projected word
+    # vectors.
+    premise_embed = self.embed_bn(premise_embed, training=training)
+    hypothesis_embed = self.embed_bn(hypothesis_embed, training=training)
+    premise_embed = self.embed_dropout(premise_embed, training=training)
+    hypothesis_embed = self.embed_dropout(hypothesis_embed, training=training)
+
+    # Run the batch-normalized and dropout-processed word vectors through the
+    # SPINN encoder.
+    premise = self.encoder(premise_embed, premise_transition,
+                           training=training)
+    hypothesis = self.encoder(hypothesis_embed, hypothesis_transition,
+                              training=training)
+
+    # Combine encoder outputs for premises and hypotheses into logits.
+    # Then apply batch normalization and dropuout on the logits.
+    logits = tf.concat(
+        [premise, hypothesis, premise - hypothesis, premise * hypothesis], 1)
+    logits = self.feature_dropout(
+        self.feature_bn(logits, training=training), training=training)
+
+    # Apply the multi-layer perceptron on the logits.
+    for dense, bn, dropout in zip(
+        self.mlp_dense, self.mlp_bn, self.mlp_dropout):
+      logits = tf.nn.elu(dense(logits))
+      logits = dropout(bn(logits, training=training), training=training)
+    logits = self.mlp_output(logits)
+    return logits
+
+
+class SNLIClassifierTrainer(object):
+  """A class that coordinates the training of an SNLIClassifier."""
+
+  def __init__(self, snli_classifier, lr):
+    """Constructor of SNLIClassifierTrainer.
+
+    Args:
+      snli_classifier: An instance of `SNLIClassifier`.
+      lr: Learning rate.
+    """
+    self._model = snli_classifier
+    # Create a custom learning rate Variable for the RMSProp optimizer, because
+    # the learning rate needs to be manually decayed later (see
+    # decay_learning_rate()).
+    self._learning_rate = tfe.Variable(lr, name="learning_rate")
+    self._optimizer = tf.train.RMSPropOptimizer(self._learning_rate,
+                                                epsilon=1e-6)
+
+  def loss(self, labels, logits):
+    """Calculate the loss given a batch of data.
+
+    Args:
+      labels: The truth labels, with shape (batch_size,).
+      logits: The logits output from the forward pass of the SNLIClassifier
+        model, with shape (batch_size, d_out), where d_out is the output
+        dimension size of the SNLIClassifier.
+
+    Returns:
+      The loss value, as a scalar `Tensor`.
+    """
+    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
+        labels=labels, logits=logits))
+
+  def train_batch(self,
+                  labels,
+                  premise,
+                  premise_transition,
+                  hypothesis,
+                  hypothesis_transition):
+    """Train model on batch of data.
+
+    Args:
+      labels: The truth labels, with shape (batch_size,).
+      premise: The word indices of the premise sentences, with shape
+        (max_prem_seq_len, batch_size).
+      premise_transition: The transitions for the premise sentences, with shape
+        (max_prem_seq_len * 2 - 3, batch_size).
+      hypothesis: The word indices of the hypothesis sentences, with shape
+        (max_hypo_seq_len, batch_size).
+      hypothesis_transition: The transitions for the hypothesis sentences, with
+        shape (max_hypo_seq_len * 2 - 3, batch_size).
+
+    Returns:
+      1. loss value as a scalar `Tensor`.
+      2. logits as a dense `Tensor` of shape (batch_size, d_out), where d_out is
+        the output dimension size of the SNLIClassifier.
+    """
+    with tfe.GradientTape() as tape:
+      tape.watch(self._model.variables)
+      logits = self._model(premise,
+                           premise_transition,
+                           hypothesis,
+                           hypothesis_transition,
+                           training=True)
+      loss = self.loss(labels, logits)
+    gradients = tape.gradient(loss, self._model.variables)
+    self._optimizer.apply_gradients(zip(gradients, self._model.variables),
+                                    global_step=tf.train.get_global_step())
+    return loss, logits
+
+  def decay_learning_rate(self, decay_by):
+    """Decay learning rate of the optimizer by factor decay_by."""
+    self._learning_rate.assign(self._learning_rate * decay_by)
+    print("Decayed learning rate of optimizer to: %s" %
+          self._learning_rate.numpy())
+
+  @property
+  def learning_rate(self):
+    return self._learning_rate
+
+
+def _batch_n_correct(logits, label):
+  """Calculate number of correct predictions in a batch.
+
+  Args:
+    logits: A logits Tensor of shape `(batch_size, num_categories)` and dtype
+      `float32`.
+    label: A labels Tensor of shape `(batch_size,)` and dtype `int64`
+
+  Returns:
+    Number of correct predictions.
+  """
+  return tf.reduce_sum(
+      tf.cast((tf.equal(
+          tf.argmax(logits, axis=1), label)), tf.float32)).numpy()
+
+
+def _evaluate_on_dataset(snli_data, batch_size, model, trainer, use_gpu):
+  """Run evaluation on a dataset.
+
+  Args:
+    snli_data: The `data.SnliData` to use in this evaluation.
+    batch_size: The batch size to use during this evaluation.
+    model: An instance of `SNLIClassifier` to evaluate.
+    trainer: An instance of `SNLIClassifierTrainer to use for this
+      evaluation.
+    use_gpu: Whether GPU is being used.
+
+  Returns:
+    1. Average loss across all examples of the dataset.
+    2. Average accuracy rate across all examples of the dataset.
+  """
+  mean_loss = tfe.metrics.Mean()
+  accuracy = tfe.metrics.Accuracy()
+  for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
+      snli_data, batch_size):
+    if use_gpu:
+      label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
+    logits = model(prem, prem_trans, hypo, hypo_trans, training=False)
+    loss_val = trainer.loss(label, logits)
+    batch_size = tf.shape(label)[0]
+    mean_loss(loss_val, weights=batch_size.gpu() if use_gpu else batch_size)
+    accuracy(tf.argmax(logits, axis=1), label)
+  return mean_loss.result().numpy(), accuracy.result().numpy()
+
+
+def _get_dataset_iterator(snli_data, batch_size):
+  """Get a data iterator for a split of SNLI data.
+
+  Args:
+    snli_data: A `data.SnliData` object.
+    batch_size: The desired batch size.
+
+  Returns:
+    A dataset iterator.
+  """
+  with tf.device("/device:CPU:0"):
+    # Some tf.data ops, such as ShuffleDataset, are available only on CPU.
+    dataset = tf.data.Dataset.from_generator(
+        snli_data.get_generator(batch_size),
+        (tf.int64, tf.int64, tf.int64, tf.int64, tf.int64))
+    dataset = dataset.shuffle(snli_data.num_batches(batch_size))
+    return tfe.Iterator(dataset)
+
+
+def train_spinn(embed, train_data, dev_data, test_data, config):
+  """Train a SPINN model.
+
+  Args:
+    embed: The embedding matrix as a float32 numpy array with shape
+      [vocabulary_size, word_vector_len]. word_vector_len is the length of a
+      word embedding vector.
+    train_data: An instance of `data.SnliData`, for the train split.
+    dev_data: Same as above, for the dev split.
+    test_data: Same as above, for the test split.
+    config: A configuration object. See the argument to this Python binary for
+      details.
+
+  Returns:
+    1. Final loss value on the test split.
+    2. Final fraction of correct classifications on the test split.
+  """
+  use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
+  device = "gpu:0" if use_gpu else "cpu:0"
+  print("Using device: %s" % device)
+
+  log_header = (
+      "  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss"
+      "     Accuracy  Dev/Accuracy")
+  log_template = (
+      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
+      "{:12.4f} {}")
+  dev_log_template = (
+      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
+      "{:8.6f} {:12.4f} {:12.4f}")
+
+  summary_writer = tf.contrib.summary.create_file_writer(
+      config.logdir, flush_millis=10000)
+  train_len = train_data.num_batches(config.batch_size)
+  with tf.device(device), \
+       tfe.restore_variables_on_create(
+           tf.train.latest_checkpoint(config.logdir)), \
+       summary_writer.as_default(), \
+       tf.contrib.summary.always_record_summaries():
+    model = SNLIClassifier(config, embed)
+    global_step = tf.train.get_or_create_global_step()
+    trainer = SNLIClassifierTrainer(model, config.lr)
+
+    start = time.time()
+    iterations = 0
+    mean_loss = tfe.metrics.Mean()
+    accuracy = tfe.metrics.Accuracy()
+    print(log_header)
+    for epoch in xrange(config.epochs):
+      batch_idx = 0
+      for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
+          train_data, config.batch_size):
+        if use_gpu:
+          label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
+          # prem_trans and hypo_trans are used for dynamic control flow and can
+          # remain on CPU. Same in _evaluate_on_dataset().
+
+        iterations += 1
+        batch_train_loss, batch_train_logits = trainer.train_batch(
+            label, prem, prem_trans, hypo, hypo_trans)
+        batch_size = tf.shape(label)[0]
+        mean_loss(batch_train_loss.numpy(),
+                  weights=batch_size.gpu() if use_gpu else batch_size)
+        accuracy(tf.argmax(batch_train_logits, axis=1), label)
+
+        if iterations % config.save_every == 0:
+          all_variables = (
+              model.variables + [trainer.learning_rate] + [global_step])
+          saver = tfe.Saver(all_variables)
+          saver.save(os.path.join(config.logdir, "ckpt"),
+                     global_step=global_step)
+
+        if iterations % config.dev_every == 0:
+          dev_loss, dev_frac_correct = _evaluate_on_dataset(
+              dev_data, config.batch_size, model, trainer, use_gpu)
+          print(dev_log_template.format(
+              time.time() - start,
+              epoch, iterations, 1 + batch_idx, train_len,
+              100.0 * (1 + batch_idx) / train_len,
+              mean_loss.result(), dev_loss,
+              accuracy.result() * 100.0, dev_frac_correct * 100.0))
+          tf.contrib.summary.scalar("dev/loss", dev_loss)
+          tf.contrib.summary.scalar("dev/accuracy", dev_frac_correct)
+        elif iterations % config.log_every == 0:
+          mean_loss_val = mean_loss.result()
+          accuracy_val = accuracy.result()
+          print(log_template.format(
+              time.time() - start,
+              epoch, iterations, 1 + batch_idx, train_len,
+              100.0 * (1 + batch_idx) / train_len,
+              mean_loss_val, " " * 8, accuracy_val * 100.0, " " * 12))
+          tf.contrib.summary.scalar("train/loss", mean_loss_val)
+          tf.contrib.summary.scalar("train/accuracy", accuracy_val)
+          # Reset metrics.
+          mean_loss = tfe.metrics.Mean()
+          accuracy = tfe.metrics.Accuracy()
+
+        batch_idx += 1
+      if (epoch + 1) % config.lr_decay_every == 0:
+        trainer.decay_learning_rate(config.lr_decay_by)
+
+    test_loss, test_frac_correct = _evaluate_on_dataset(
+        test_data, config.batch_size, model, trainer, use_gpu)
+    print("Final test loss: %g; accuracy: %g%%" %
+          (test_loss, test_frac_correct * 100.0))
+
+
+def main(_):
+  config = FLAGS
+
+  # Load embedding vectors.
+  vocab = data.load_vocabulary(FLAGS.data_root)
+  word2index, embed = data.load_word_vectors(FLAGS.data_root, vocab)
+
+  print("Loading train, dev and test data...")
+  train_data = data.SnliData(
+      os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_train.txt"),
+      word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+  dev_data = data.SnliData(
+      os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_dev.txt"),
+      word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+  test_data = data.SnliData(
+      os.path.join(FLAGS.data_root, "snli/snli_1.0/snli_1.0_test.txt"),
+      word2index, sentence_len_limit=FLAGS.sentence_len_limit)
+
+  train_spinn(embed, train_data, dev_data, test_data, config)
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(
+      description=
+      "TensorFlow eager implementation of the SPINN SNLI classifier.")
+  parser.add_argument("--data_root", type=str, default="/tmp/spinn-data",
+                      help="Root directory in which the training data and "
+                      "embedding matrix are found. See README.md for how to "
+                      "generate such a directory.")
+  parser.add_argument("--sentence_len_limit", type=int, default=-1,
+                      help="Maximum allowed sentence length (# of words). "
+                      "The default of -1 means unlimited.")
+  parser.add_argument("--logdir", type=str, default="/tmp/spinn-logs",
+                      help="Directory in which summaries will be written for "
+                      "TensorBoard.")
+  parser.add_argument("--epochs", type=int, default=50,
+                      help="Number of epochs to train.")
+  parser.add_argument("--batch_size", type=int, default=128,
+                      help="Batch size to use during training.")
+  parser.add_argument("--d_proj", type=int, default=600,
+                      help="Dimensions to project the word embedding vectors "
+                      "to.")
+  parser.add_argument("--d_hidden", type=int, default=300,
+                      help="Size of the hidden layer of the Tracker.")
+  parser.add_argument("--d_out", type=int, default=4,
+                      help="Output dimensions of the SNLIClassifier.")
+  parser.add_argument("--d_mlp", type=int, default=1024,
+                      help="Size of each layer of the multi-layer perceptron "
+                      "of the SNLICLassifier.")
+  parser.add_argument("--n_mlp_layers", type=int, default=2,
+                      help="Number of layers in the multi-layer perceptron "
+                      "of the SNLICLassifier.")
+  parser.add_argument("--d_tracker", type=int, default=64,
+                      help="Size of the tracker LSTM.")
+  parser.add_argument("--log_every", type=int, default=50,
+                      help="Print log and write TensorBoard summary every _ "
+                      "training batches.")
+  parser.add_argument("--lr", type=float, default=2e-3,
+                      help="Initial learning rate.")
+  parser.add_argument("--lr_decay_by", type=float, default=0.75,
+                      help="The ratio to multiply the learning rate by every "
+                      "time the learning rate is decayed.")
+  parser.add_argument("--lr_decay_every", type=float, default=1,
+                      help="Decay the learning rate every _ epoch(s).")
+  parser.add_argument("--dev_every", type=int, default=1000,
+                      help="Run evaluation on the dev split every _ training "
+                      "batches.")
+  parser.add_argument("--save_every", type=int, default=1000,
+                      help="Save checkpoint every _ training batches.")
+  parser.add_argument("--embed_dropout", type=float, default=0.08,
+                      help="Word embedding dropout rate.")
+  parser.add_argument("--mlp_dropout", type=float, default=0.07,
+                      help="SNLIClassifier multi-layer perceptron dropout "
+                      "rate.")
+  parser.add_argument("--no-projection", action="store_false",
+                      dest="projection",
+                      help="Whether word embedding vectors are projected to "
+                      "another set of vectors (see d_proj).")
+  parser.add_argument("--predict_transitions", action="store_true",
+                      dest="predict",
+                      help="Whether the Tracker will perform prediction.")
+  parser.add_argument("--force_cpu", action="store_true", dest="force_cpu",
+                      help="Force use CPU-only regardless of whether a GPU is "
+                      "available.")
+  FLAGS, unparsed = parser.parse_known_args()
+
+  tfe.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/third_party/flatbuffers/flatbuffers.BUILD b/third_party/flatbuffers/flatbuffers.BUILD
index a426db0c5027dc27cec4c5587ddb0990d60f1d6e..0a76adcf9189b7c874ee76aad737cd0b0a1dc609 100644
--- a/third_party/flatbuffers/flatbuffers.BUILD
+++ b/third_party/flatbuffers/flatbuffers.BUILD
@@ -6,8 +6,11 @@ licenses(["notice"])  # Apache 2.0
 
 FLATBUFFERS_COPTS = [
     "-fexceptions",
-    "-Wno-implicit-fallthrough",
-]
+] + select({
+    "@bazel_tools//src:windows": [],
+    "@bazel_tools//src:windows_msvc": [],
+    "//conditions:default": ["-Wno-implicit-fallthrough"],
+})
 
 # Public flatc library to compile flatbuffer files at runtime.
 cc_library(
@@ -104,6 +107,10 @@ cc_binary(
         "grpc/",
         "include/",
     ],
+    linkopts = [
+        "-lm",
+        "-ldl",
+    ],
     deps = [
         ":flatc_library",
     ],
diff --git a/third_party/gif.BUILD b/third_party/gif.BUILD
index 27808a9d645e93644a8c2fac40974306dad444a7..78fbd6c0e098512d01478eba70fe614f0266c317 100644
--- a/third_party/gif.BUILD
+++ b/third_party/gif.BUILD
@@ -21,7 +21,7 @@ cc_library(
     ],
     hdrs = ["lib/gif_lib.h"],
     defines = select({
-        #"@%ws%//tensorflow:android": [
+        #"@org_tensorflow//tensorflow:android": [
         ":android": [
             "S_IREAD=S_IRUSR",
             "S_IWRITE=S_IWUSR",
diff --git a/third_party/grpc/grpc.patch b/third_party/grpc/grpc.patch
deleted file mode 100644
index c06d9b8aaf275b270deb48c51d3f4a5ea432f593..0000000000000000000000000000000000000000
--- a/third_party/grpc/grpc.patch
+++ /dev/null
@@ -1,105 +0,0 @@
-diff --git a/BUILD b/BUILD
-index 6552d5879e..59adb1ce1c 100644
---- a/BUILD
-+++ b/BUILD
-@@ -287,6 +287,7 @@ grpc_cc_library(
-         "grpc++_base_unsecure",
-         "grpc++_codegen_base",
-         "grpc++_codegen_base_src",
-+        "grpc++_codegen_proto",
-         "grpc_unsecure",
-     ],
- )
-@@ -1519,13 +1520,13 @@ grpc_cc_library(
- 
- grpc_cc_library(
-     name = "grpc++_config_proto",
--    external_deps = [
--        "protobuf",
--    ],
-     language = "c++",
-     public_hdrs = [
-         "include/grpc++/impl/codegen/config_protobuf.h",
-     ],
-+    deps = [
-+        "@protobuf_archive//:protobuf_headers",
-+    ],
- )
- 
- grpc_cc_library(
-diff --git a/bazel/grpc_build_system.bzl b/bazel/grpc_build_system.bzl
-index f793cae56d..0295adb8ab 100644
---- a/bazel/grpc_build_system.bzl
-+++ b/bazel/grpc_build_system.bzl
-@@ -80,7 +80,7 @@ def grpc_cc_test(name, srcs = [], deps = [], external_deps = [], args = [], data
-     linkopts = ["-pthread"],
-   )
- 
--def grpc_cc_binary(name, srcs = [], deps = [], external_deps = [], args = [], data = [], language = "C++", testonly = False, linkshared = False):
-+def grpc_cc_binary(name, srcs = [], deps = [], external_deps = [], args = [], data = [], language = "C++", testonly = False, linkshared = False, linkopts = []):
-   copts = []
-   if language.upper() == "C":
-     copts = ["-std=c99"]
-@@ -93,7 +93,7 @@ def grpc_cc_binary(name, srcs = [], deps = [], external_deps = [], args = [], da
-     linkshared = linkshared,
-     deps = deps + ["//external:" + dep for dep in external_deps],
-     copts = copts,
--    linkopts = ["-pthread"],
-+    linkopts = ["-pthread"] + linkopts,
-   )
- 
- def grpc_generate_one_off_targets():
-diff --git a/src/core/plugin_registry/grpc_unsecure_plugin_registry.c b/src/core/plugin_registry/grpc_unsecure_plugin_registry.c
-index 7eb599d81a..4cc2e30af4 100644
---- a/src/core/plugin_registry/grpc_unsecure_plugin_registry.c
-+++ b/src/core/plugin_registry/grpc_unsecure_plugin_registry.c
-@@ -28,18 +28,12 @@ extern void grpc_client_channel_init(void);
- extern void grpc_client_channel_shutdown(void);
- extern void grpc_inproc_plugin_init(void);
- extern void grpc_inproc_plugin_shutdown(void);
--extern void grpc_resolver_dns_ares_init(void);
--extern void grpc_resolver_dns_ares_shutdown(void);
- extern void grpc_resolver_dns_native_init(void);
- extern void grpc_resolver_dns_native_shutdown(void);
- extern void grpc_resolver_sockaddr_init(void);
- extern void grpc_resolver_sockaddr_shutdown(void);
--extern void grpc_resolver_fake_init(void);
--extern void grpc_resolver_fake_shutdown(void);
- extern void grpc_load_reporting_plugin_init(void);
- extern void grpc_load_reporting_plugin_shutdown(void);
--extern void grpc_lb_policy_grpclb_init(void);
--extern void grpc_lb_policy_grpclb_shutdown(void);
- extern void grpc_lb_policy_pick_first_init(void);
- extern void grpc_lb_policy_pick_first_shutdown(void);
- extern void grpc_lb_policy_round_robin_init(void);
-@@ -64,18 +58,12 @@ void grpc_register_built_in_plugins(void) {
-                        grpc_client_channel_shutdown);
-   grpc_register_plugin(grpc_inproc_plugin_init,
-                        grpc_inproc_plugin_shutdown);
--  grpc_register_plugin(grpc_resolver_dns_ares_init,
--                       grpc_resolver_dns_ares_shutdown);
-   grpc_register_plugin(grpc_resolver_dns_native_init,
-                        grpc_resolver_dns_native_shutdown);
-   grpc_register_plugin(grpc_resolver_sockaddr_init,
-                        grpc_resolver_sockaddr_shutdown);
--  grpc_register_plugin(grpc_resolver_fake_init,
--                       grpc_resolver_fake_shutdown);
-   grpc_register_plugin(grpc_load_reporting_plugin_init,
-                        grpc_load_reporting_plugin_shutdown);
--  grpc_register_plugin(grpc_lb_policy_grpclb_init,
--                       grpc_lb_policy_grpclb_shutdown);
-   grpc_register_plugin(grpc_lb_policy_pick_first_init,
-                        grpc_lb_policy_pick_first_shutdown);
-   grpc_register_plugin(grpc_lb_policy_round_robin_init,
-diff --git a/test/cpp/util/BUILD b/test/cpp/util/BUILD
-index 33240f6f69..d2e1f67f06 100644
---- a/test/cpp/util/BUILD
-+++ b/test/cpp/util/BUILD
-@@ -29,6 +29,7 @@ package(
- grpc_cc_binary(
-     name = "testso.so",
-     srcs = [],
-+    linkopts = ['-Wl,--no-undefined'],
-     linkshared = 1,
-     deps = ["//:grpc++_unsecure"],
- )
diff --git a/third_party/jemalloc.BUILD b/third_party/jemalloc.BUILD
index a2addf2c66bc3aa396455ab34208d6ef756b70f2..1b0829b8fea64c74fa9b462c0716cef6385dad96 100644
--- a/third_party/jemalloc.BUILD
+++ b/third_party/jemalloc.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # BSD
 
 exports_files(["COPYING"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 cc_library(
     name = "jemalloc_headers",
@@ -97,10 +97,10 @@ cc_library(
     includes = ["include"],
     # pthread_atfork() is called for PPC.
     linkopts = select({
-        "@%ws%//tensorflow:linux_ppc64le": [
+        "@org_tensorflow//tensorflow:linux_ppc64le": [
             "-lpthread",
         ],
-        "@%ws%//tensorflow:linux_x86_64": [
+        "@org_tensorflow//tensorflow:linux_x86_64": [
             "-lpthread",
         ],
         "//conditions:default": [
@@ -208,8 +208,8 @@ genrule(
     name = "size_classes_h",
     outs = ["include/jemalloc/internal/size_classes.h"],
     cmd = select({
-        "@%ws%//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
-        "@%ws%//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
+        "@org_tensorflow//tensorflow:linux_ppc64le": "$(location :size_classes_sh) \"3 4\" 3 16 2 >$@",
+        "@org_tensorflow//tensorflow:linux_x86_64": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
         "//conditions:default": "$(location :size_classes_sh) \"3 4\" 3 12 2 >$@",
     }),
     tools = [":size_classes_sh"],
diff --git a/third_party/jpeg/jpeg.BUILD b/third_party/jpeg/jpeg.BUILD
index f6078052ecedd71b9af29eae628529c9045781f7..527a08c4b3732e7cfd0048d6ce4616617afcf4c2 100644
--- a/third_party/jpeg/jpeg.BUILD
+++ b/third_party/jpeg/jpeg.BUILD
@@ -5,7 +5,7 @@ licenses(["notice"])  # custom notice-style license, see LICENSE.md
 
 exports_files(["LICENSE.md"])
 
-load("@%ws%//third_party:common.bzl", "template_rule")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
 
 libjpegturbo_nocopts = "-[W]error"
 
@@ -323,14 +323,18 @@ JCONFIG_NOWIN_COMMON_SUBSTITUTIONS = {
     "#undef RIGHT_SHIFT_IS_UNSIGNED": "",
 }
 
-JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = JCONFIG_NOWIN_COMMON_SUBSTITUTIONS + {
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS = {
     "#undef WITH_SIMD": "#define WITH_SIMD 1",
 }
 
-JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = JCONFIG_NOWIN_COMMON_SUBSTITUTIONS + {
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS = {
     "#undef WITH_SIMD": "",
 }
 
+JCONFIG_NOWIN_SIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
+JCONFIG_NOWIN_NOSIMD_SUBSTITUTIONS.update(JCONFIG_NOWIN_COMMON_SUBSTITUTIONS)
+
 template_rule(
     name = "jconfig_nowin_nosimd",
     src = "jconfig.h.in",
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 97b833e49d57cbf003a7154c7e64b9a505868abf..5344525ba8b42e8a3dbcf42397458d190a77f9d3 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -7,18 +7,18 @@ licenses(["notice"])
 exports_files(["LICENSE.TXT"])
 
 load(
-    "@%ws%//third_party/llvm:llvm.bzl",
+    "@org_tensorflow//third_party/llvm:llvm.bzl",
     "gentbl",
     "expand_cmake_vars",
     "llvm_target_cmake_vars",
     "cmake_var_string",
 )
 load(
-    "@%ws%//third_party:common.bzl",
+    "@org_tensorflow//third_party:common.bzl",
     "template_rule",
 )
 
-package(default_visibility = ["@%ws%//tensorflow/compiler/xla:internal"])
+package(default_visibility = ["//visibility:public"])
 
 llvm_host_triple = "x86_64-unknown-linux_gnu"
 
@@ -145,11 +145,11 @@ darwin_cmake_vars = {
 # TODO(phawkins): use a better method to select the right host triple, rather
 # than hardcoding x86_64.
 all_cmake_vars = select({
-    "@%ws%//tensorflow:darwin": cmake_var_string(
+    "@org_tensorflow//tensorflow:darwin": cmake_var_string(
         cmake_vars + llvm_target_cmake_vars("X86", "x86_64-apple-darwin") +
         darwin_cmake_vars,
     ),
-    "@%ws%//tensorflow:linux_ppc64le": cmake_var_string(
+    "@org_tensorflow//tensorflow:linux_ppc64le": cmake_var_string(
         cmake_vars +
         llvm_target_cmake_vars("PowerPC", "powerpc64le-unknown-linux_gnu") +
         linux_cmake_vars,
diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl
index 533c0766c71a18e614f2f101a4e74b7f35fd26c3..8b73ddabdd7ff5de7374ffbbb76e7bf954c27765 100644
--- a/third_party/mkl/build_defs.bzl
+++ b/third_party/mkl/build_defs.bzl
@@ -20,7 +20,7 @@ def if_mkl(if_true, if_false = []):
 
     """
     return select({
-        "//third_party/mkl:using_mkl": if_true,
+        str(Label("//third_party/mkl:using_mkl")): if_true,
         "//conditions:default": if_false
     })
 
@@ -60,7 +60,6 @@ mkl_repository = repository_rule(
     ],
     attrs = {
         "build_file": attr.label(),
-        "repository": attr.string(),
         "urls": attr.string_list(default = []),
         "sha256": attr.string(default = ""),
         "strip_prefix": attr.string(default = ""),
diff --git a/third_party/nccl.BUILD b/third_party/nccl.BUILD
index 06b9b8ff68a5e8aa877d605daf02bec1ea4d6bfa..b2b8e188248f90805bc2904dca9111550a7dfed8 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl.BUILD
@@ -44,18 +44,18 @@ cc_library(
         "-O3",
     ] + cuda_default_copts(),
     linkopts = select({
-        "@%ws%//tensorflow:android": [
+        "@org_tensorflow//tensorflow:android": [
             "-pie",
         ],
-        "@%ws%//tensorflow:darwin": [
+        "@org_tensorflow//tensorflow:darwin": [
             "-Wl,-framework",
             "-Wl,CoreFoundation",
             "-Wl,-framework",
             "-Wl,Security",
         ],
-        "@%ws%//tensorflow:ios": [],
-        "@%ws%//tensorflow:windows": [
-            "ws2_32.lib",
+        "@org_tensorflow//tensorflow:ios": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "-DEFAULTLIB:ws2_32.lib",
         ],
         "//conditions:default": [
             "-lrt",
diff --git a/third_party/pcre.BUILD b/third_party/pcre.BUILD
index 68aadd1d408685291beaee3ebe0607f35e130ff1..e2cdec40295d369548ff26e3493b5d2300041916 100644
--- a/third_party/pcre.BUILD
+++ b/third_party/pcre.BUILD
@@ -50,12 +50,12 @@ cc_library(
         "-DNEWLINE=10",
         "-DNO_RECURSE",
         "-DPARENS_NEST_LIMIT=50",
-        "-DPCRE_STATIC=1",
         "-DPOSIX_MALLOC_THRESHOLD=10",
         "-DSTDC_HEADERS=1",
         "-DSUPPORT_UCP",
         "-DSUPPORT_UTF",
     ],
+    defines = ["PCRE_STATIC=1"],
     includes = ["."],
     visibility = ["@swig//:__pkg__"],  # Please use RE2
     alwayslink = 1,
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index bbc07905fc7f92a26d0aebade66a20209dc3e766..c16eb3a12a86f3c2eb3813f5c8c7631fec8e97c6 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -1,11 +1,8 @@
-# -*- Python -*-
 """Repository rule for Python autoconfiguration.
 
 `python_configure` depends on the following environment variables:
 
-  * `NUMPY_INCLUDE_PATH`: Location of Numpy libraries.
   * `PYTHON_BIN_PATH`: location of python binary.
-  * `PYTHON_INCLUDE_PATH`: Location of python binaries.
   * `PYTHON_LIB_PATH`: Location of python libraries.
 """
 
@@ -23,32 +20,13 @@ def _tpl(repository_ctx, tpl, substitutions={}, out=None):
       substitutions)
 
 
-def _python_configure_warning(msg):
-  """Output warning message during auto configuration."""
-  yellow = "\033[1;33m"
-  no_color = "\033[0m"
-  print("%sPython Configuration Warning:%s %s" % (yellow, no_color, msg))
-
-
-def _python_configure_fail(msg):
+def _fail(msg):
   """Output failure message when auto configuration fails."""
   red = "\033[0;31m"
   no_color = "\033[0m"
   fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
 
 
-def _get_env_var(repository_ctx, name, default = None, enable_warning = True):
-  """Find an environment variable in system path."""
-  if name in repository_ctx.os.environ:
-    return repository_ctx.os.environ[name]
-  if default != None:
-    if enable_warning:
-      _python_configure_warning(
-          "'%s' environment variable is not set, using '%s' as default" % (name, default))
-    return default
-  _python_configure_fail("'%s' environment variable is not set" % name)
-
-
 def _is_windows(repository_ctx):
   """Returns true if the host operating system is windows."""
   os_name = repository_ctx.os.name.lower()
@@ -73,11 +51,10 @@ def _execute(repository_ctx, cmdline, error_msg=None, error_details=None,
   """
   result = repository_ctx.execute(cmdline)
   if result.stderr or not (empty_stdout_fine or result.stdout):
-    _python_configure_fail(
-        "\n".join([
-            error_msg.strip() if error_msg else "Repository command failed",
-            result.stderr.strip(),
-            error_details if error_details else ""]))
+    _fail("\n".join([
+        error_msg.strip() if error_msg else "Repository command failed",
+        result.stderr.strip(),
+        error_details if error_details else ""]))
   return result
 
 
@@ -163,21 +140,23 @@ def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
 
 def _get_python_bin(repository_ctx):
   """Gets the python bin path."""
-  python_bin = _get_env_var(repository_ctx, _PYTHON_BIN_PATH,
-                            None, False)
+  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
   if python_bin != None:
     return python_bin
   python_bin_path = repository_ctx.which("python")
   if python_bin_path != None:
     return str(python_bin_path)
-  path = _get_env_var(repository_ctx, "PATH")
-  _python_configure_fail("Cannot find python in PATH, please make sure " +
-      "python is installed and add its directory in PATH, or set the " +
-      "environment variable PYTHON_BIN_PATH.\nPATH=%s" % (path))
+  _fail("Cannot find python in PATH, please make sure " +
+        "python is installed and add its directory in PATH, or --define " +
+        "%s='/something/else'.\nPATH=%s" % (
+            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
 
 
 def _get_python_lib(repository_ctx, python_bin):
   """Gets the python lib path."""
+  python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH)
+  if python_lib != None:
+    return python_lib
   print_lib = ("<<END\n" +
       "from __future__ import print_function\n" +
       "import site\n" +
@@ -214,7 +193,7 @@ def _check_python_lib(repository_ctx, python_lib):
   cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
   result = repository_ctx.execute(["bash", "-c", cmd])
   if result.return_code == 1:
-    _python_configure_fail("Invalid python library path:  %s" % python_lib)
+    _fail("Invalid python library path: %s" % python_lib)
 
 
 def _check_python_bin(repository_ctx, python_bin):
@@ -222,33 +201,36 @@ def _check_python_bin(repository_ctx, python_bin):
   cmd =  '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
   result = repository_ctx.execute(["bash", "-c", cmd])
   if result.return_code == 1:
-    _python_configure_fail(
-        "PYTHON_BIN_PATH is not executable.  Is it the python binary?")
+    _fail("--define %s='%s' is not executable. Is it the python binary?" % (
+        _PYTHON_BIN_PATH, python_bin))
 
 
 def _get_python_include(repository_ctx, python_bin):
   """Gets the python include path."""
-  result = _execute(repository_ctx,
-                    [python_bin, "-c",
-                     'from __future__ import print_function;' +
-                     'from distutils import sysconfig;' +
-                     'print(sysconfig.get_python_inc())'],
-                    error_msg="Problem getting python include path.",
-                    error_details=("Is the Python binary path set up right? " +
-                                   "(See ./configure or PYTHON_BIN_PATH.) " +
-                                   "Is distutils installed?"))
+  result = _execute(
+      repository_ctx,
+      [python_bin, "-c",
+       'from __future__ import print_function;' +
+       'from distutils import sysconfig;' +
+       'print(sysconfig.get_python_inc())'],
+      error_msg="Problem getting python include path.",
+      error_details=("Is the Python binary path set up right? " +
+                     "(See ./configure or " + _PYTHON_BIN_PATH + ".) " +
+                     "Is distutils installed?"))
   return result.stdout.splitlines()[0]
 
 
 def _get_python_import_lib_name(repository_ctx, python_bin):
   """Get Python import library name (pythonXY.lib) on Windows."""
-  result = _execute(repository_ctx,
-                    [python_bin, "-c",
-                     'import sys;' +
-                     'print("python" + str(sys.version_info[0]) + str(sys.version_info[1]) + ".lib")'],
-                    error_msg="Problem getting python import library.",
-                    error_details=("Is the Python binary path set up right? " +
-                                   "(See ./configure or PYTHON_BIN_PATH.) "))
+  result = _execute(
+      repository_ctx,
+      [python_bin, "-c",
+       'import sys;' +
+       'print("python" + str(sys.version_info[0]) + ' +
+       '      str(sys.version_info[1]) + ".lib")'],
+      error_msg="Problem getting python import library.",
+      error_details=("Is the Python binary path set up right? " +
+                     "(See ./configure or " + _PYTHON_BIN_PATH + ".) "))
   return result.stdout.splitlines()[0]
 
 
@@ -267,8 +249,7 @@ def _create_local_python_repository(repository_ctx):
   """Creates the repository containing files set up to build with Python."""
   python_bin = _get_python_bin(repository_ctx)
   _check_python_bin(repository_ctx, python_bin)
-  python_lib = _get_env_var(repository_ctx, _PYTHON_LIB_PATH,
-                              _get_python_lib(repository_ctx, python_bin))
+  python_lib = _get_python_lib(repository_ctx, python_bin)
   _check_python_lib(repository_ctx, python_lib)
   python_include = _get_python_include(repository_ctx, python_bin)
   numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..c29fef9629570955b4c4f192c03627bc65b2d49d
--- /dev/null
+++ b/third_party/repo.bzl
@@ -0,0 +1,106 @@
+# Copyright 2017 The TensorFlow Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for defining TensorFlow Bazel dependencies."""
+
+_SINGLE_URL_WHITELIST = depset([
+    "arm_compiler",
+    "ortools_archive",
+])
+
+def _is_windows(ctx):
+  return ctx.os.name.lower().find("windows") != -1
+
+def _get_env_var(ctx, name):
+  if name in ctx.os.environ:
+    return ctx.os.environ[name]
+  else:
+    return None
+
+# Executes specified command with arguments and calls 'fail' if it exited with
+# non-zero code
+def _execute_and_check_ret_code(repo_ctx, cmd_and_args):
+  result = repo_ctx.execute(cmd_and_args, timeout=10)
+  if result.return_code != 0:
+    fail(("Non-zero return code({1}) when executing '{0}':\n" + "Stdout: {2}\n"
+          + "Stderr: {3}").format(" ".join(cmd_and_args), result.return_code,
+                                  result.stdout, result.stderr))
+
+def _repos_are_siblings():
+  return Label("@foo//bar").workspace_root.startswith("../")
+
+# Apply a patch_file to the repository root directory
+# Runs 'patch -p1'
+def _apply_patch(ctx, patch_file):
+  # Don't check patch on Windows, because patch is only available under bash.
+  if not _is_windows(ctx) and not ctx.which("patch"):
+    fail("patch command is not found, please install it")
+  cmd = ["patch", "-p1", "-d", ctx.path("."), "-i", ctx.path(patch_file)]
+  if _is_windows(ctx):
+    bazel_sh = _get_env_var(ctx, "BAZEL_SH")
+    if not bazel_sh:
+      fail("BAZEL_SH environment variable is not set")
+    cmd = [bazel_sh, "-c", " ".join(cmd)]
+  _execute_and_check_ret_code(ctx, cmd)
+
+def _apply_delete(ctx, paths):
+  for path in paths:
+    if path.startswith("/"):
+      fail("refusing to rm -rf path starting with '/': " + path)
+    if ".." in path:
+      fail("refusing to rm -rf path containing '..': " + path)
+  _execute_and_check_ret_code(
+      ctx, ["rm", "-rf"] + [ctx.path(path) for path in paths])
+
+def _tf_http_archive(ctx):
+  if ("mirror.bazel.build" not in ctx.attr.urls[0] or
+      (len(ctx.attr.urls) < 2 and
+       ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+    fail("tf_http_archive(urls) must have redundant URLs. The " +
+         "mirror.bazel.build URL must be present and it must come first. " +
+         "Even if you don't have permission to mirror the file, please " +
+         "put the correctly formatted mirror URL there anyway, because " +
+         "someone will come along shortly thereafter and mirror the file.")
+  ctx.download_and_extract(
+      ctx.attr.urls,
+      "",
+      ctx.attr.sha256,
+      ctx.attr.type,
+      ctx.attr.strip_prefix)
+  if ctx.attr.delete:
+    _apply_delete(ctx, ctx.attr.delete)
+  if ctx.attr.patch_file != None:
+    _apply_patch(ctx, ctx.attr.patch_file)
+  if ctx.attr.build_file != None:
+    ctx.template("BUILD", ctx.attr.build_file, {
+        "%prefix%": ".." if _repos_are_siblings() else "external",
+    }, False)
+
+tf_http_archive = repository_rule(
+    implementation=_tf_http_archive,
+    attrs={
+        "sha256": attr.string(mandatory=True),
+        "urls": attr.string_list(mandatory=True, allow_empty=False),
+        "strip_prefix": attr.string(),
+        "type": attr.string(),
+        "delete": attr.string_list(),
+        "patch_file": attr.label(),
+        "build_file": attr.label(),
+    })
+"""Downloads and creates Bazel repos for dependencies.
+
+This is a swappable replacement for both http_archive() and
+new_http_archive() that offers some additional features. It also helps
+ensure best practices are followed.
+"""
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index 9c00b7068a802a361effab207409138c79addde7..fd48ed8941e159a8d6176ef3f4e1982d6600e1c2 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -50,8 +50,8 @@ genrule(
            "-e 's/@ac_cv_have_stddef_h@/1/g' " +
            "-e 's/@ac_cv_have_stdint_h@/1/g' " +
            select({
-               "@%ws%//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "@%ws%//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
+               "@org_tensorflow//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
+               "@org_tensorflow//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
                "//conditions:default": "-e 's/@ac_cv_have_sys_uio_h@/1/g' ",
            }) +
            "-e 's/@SNAPPY_MAJOR@/1/g' " +
diff --git a/third_party/sycl/crosstool/CROSSTOOL.tpl b/third_party/sycl/crosstool/CROSSTOOL.tpl
index 32884d71e78bd07cc17b63981592ac7ab0d41bc6..f8e50efcc6572e649cbb151eeade722d89e4df85 100755
--- a/third_party/sycl/crosstool/CROSSTOOL.tpl
+++ b/third_party/sycl/crosstool/CROSSTOOL.tpl
@@ -35,10 +35,10 @@ toolchain {
   tool_path { name: "compat-ld" path: "/usr/bin/ld" }
   tool_path { name: "cpp" path: "/usr/bin/cpp" }
   tool_path { name: "dwp" path: "/usr/bin/dwp" }
-  tool_path { name: "gcc" path: "computecpp" }
+  tool_path { name: "gcc" path: "%{sycl_impl}" }
   # Use "-std=c++11" for nvcc. For consistency, force both the host compiler
   # and the device compiler to use "-std=c++11".
-  cxx_flag: "-std=c++11"
+  cxx_flag: "%{c++_std}"
   linker_flag: "-Wl,-no-as-needed"
   linker_flag: "-lstdc++"
   linker_flag: "-B/usr/bin/"
@@ -53,7 +53,7 @@ toolchain {
   cxx_builtin_include_directory: "/usr/local/include"
   cxx_builtin_include_directory: "/usr/include"
 
-  cxx_builtin_include_directory: "%{computecpp_toolkit_path}"
+  cxx_builtin_include_directory: "%{sycl_include_dir}"
   cxx_builtin_include_directory: "%{python_lib_path}"
 
   tool_path { name: "gcov" path: "/usr/bin/gcov" }
@@ -214,4 +214,4 @@ toolchain {
     compiler_flag: "-O2"
     compiler_flag: "-DNDEBUG"
   }
-}
+}
\ No newline at end of file
diff --git a/third_party/sycl/crosstool/trisycl.tpl b/third_party/sycl/crosstool/trisycl.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..87a70d8f9549d57f0fc1a2c4b56ac1c4af065e71
--- /dev/null
+++ b/third_party/sycl/crosstool/trisycl.tpl
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import tempfile
+from subprocess import call
+
+CPU_CXX_COMPILER = ('%{host_cxx_compiler}')
+CPU_C_COMPILER = ('%{host_c_compiler}')
+
+CURRENT_DIR = os.path.dirname(sys.argv[0])
+TRISYCL_INCLUDE_DIR = CURRENT_DIR + '/../sycl/include'
+
+
+def main():
+  compiler_flags = []
+
+  remove_flags = ('-Wl,--no-undefined', '-Wno-unused-but-set-variable',
+                  '-Wignored-attributes', '-fno-exceptions')
+  # remove -fsamotoze-coverage from string with g++
+  if 'g++' in CPU_CXX_COMPILER:
+    remove_flags += ('-fsanitize-coverage',)
+    compiler_flags += ['-fopenmp']
+  else:
+    compiler_flags += ['-fopenmp=libomp']
+
+  compiler_flags += [
+      flag for flag in sys.argv[1:] if not flag.startswith(remove_flags)
+  ]
+
+  output_file_index = compiler_flags.index('-o') + 1
+  output_file_name = compiler_flags[output_file_index]
+
+  if (output_file_index == 1):
+    # we are linking
+    return call([CPU_CXX_COMPILER] + compiler_flags + ['-Wl,--no-undefined'])
+
+  # find what we compile
+  compiling_cpp = 0
+  if ('-c' in compiler_flags):
+    compiled_file_index = compiler_flags.index('-c') + 1
+    compiled_file_name = compiler_flags[compiled_file_index]
+    if (compiled_file_name.endswith(('.cc', '.c++', '.cpp', '.CPP', '.C',
+                                     '.cxx'))):
+      compiling_cpp = 1
+
+  debug_flags = [
+      '-DTRISYCL_DEBUG', '-DBOOST_LOG_DYN_LINK', '-DTRISYCL_TRACE_KERNEL',
+      '-lpthread', '-lboost_log', '-g', '-rdynamic'
+  ]
+
+  opt_flags = ['-DNDEBUG', '-DBOOST_DISABLE_ASSERTS', '-O3']
+
+  compiler_flags = compiler_flags + [
+      '-DEIGEN_USE_SYCL=1', '-DEIGEN_HAS_C99_MATH',
+      '-DEIGEN_MAX_ALIGN_BYTES=16', '-DTENSORFLOW_USE_SYCL'
+  ] + opt_flags
+
+  if (compiling_cpp == 1):
+    # create a blacklist of folders that will be skipped when compiling
+    # with triSYCL
+    skip_extensions = ['.cu.cc']
+    skip_folders = [
+        'tensorflow/compiler', 'tensorflow/docs_src', 'tensorflow/tensorboard',
+        'third_party', 'external', 'hexagon'
+    ]
+    skip_folders = [(folder + '/') for folder in skip_folders]
+    # if compiling external project skip triSYCL
+    if any(
+        compiled_file_name.endswith(_ext) for _ext in skip_extensions) or any(
+            _folder in output_file_name for _folder in skip_folders):
+      return call([CPU_CXX_COMPILER] + compiler_flags)
+
+    host_compiler_flags = [
+        '-xc++', '-Wno-unused-variable', '-I', TRISYCL_INCLUDE_DIR
+    ] + compiler_flags
+    x = call([CPU_CXX_COMPILER] + host_compiler_flags)
+    return x
+  else:
+    # compile for C
+    return call([CPU_C_COMPILER] + compiler_flags)
+
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/sycl/sycl/BUILD.tpl b/third_party/sycl/sycl/BUILD.tpl
index 6cad190630062576262c977df6e84168e413bb99..21b1a2bbf7d320327d8f6e35124e6ef47019130b 100755
--- a/third_party/sycl/sycl/BUILD.tpl
+++ b/third_party/sycl/sycl/BUILD.tpl
@@ -1,25 +1,36 @@
 licenses(["notice"])  # Apache 2.0
 
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
-load("platform", "sycl_library_path")
+load(":platform.bzl", "sycl_library_path")
 
-load("platform", "readlink_command")
+load(":platform.bzl", "readlink_command")
 
 package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE.text"])
 
 config_setting(
-    name = "using_sycl",
-    values = {
-        "define": "using_sycl=true",
+    name = "using_sycl_ccpp",
+    define_values = {
+        "using_sycl": "true",
+        "using_trisycl": "false",
     },
 )
 
+config_setting(
+    name = "using_sycl_trisycl",
+    define_values = {
+        "using_sycl": "true",
+        "using_trisycl": "false",
+    },
+)
+
+
 cc_library(
     name = "sycl_headers",
     hdrs = glob([
         "**/*.h",
+        "**/*.hpp",
     ]),
     includes = [".", "include"],
 )
diff --git a/third_party/sycl/sycl/build_defs.bzl.tpl b/third_party/sycl/sycl/build_defs.bzl.tpl
index 09bef0a6613098721d993696f6d66223a738ef0b..33386f8957c821ef579a2bc1dcfb71b94ceb0aa1 100755
--- a/third_party/sycl/sycl/build_defs.bzl.tpl
+++ b/third_party/sycl/sycl/build_defs.bzl.tpl
@@ -5,9 +5,24 @@ def if_sycl(if_true, if_false = []):
 
     Returns a select statement which evaluates to if_true if we're building
     with SYCL enabled.  Otherwise, the select statement evaluates to if_false.
+    If we are building with triSYCL instead of ComputeCPP, a list with
+    the first element of if_true is returned.
+    """
+    return select({
+        "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
+        "@local_config_sycl//sycl:using_sycl_trisycl": if_true[0:1],
+        "//conditions:default": if_false
+    })
+
+def if_ccpp(if_true, if_false = []):
+    """Shorthand for select()'ing if we are building with ComputeCPP.
 
+    Returns a select statement which evaluates to if_true if we're building
+    with ComputeCPP enabled. Otherwise, the select statement evaluates
+    to if_false.
     """
     return select({
-        "@local_config_sycl//sycl:using_sycl": if_true,
+        "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
+        "@local_config_sycl//sycl:using_sycl_trisycl": if_false,
         "//conditions:default": if_false
     })
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index 7af063178e04af3155888c49180abed6d19bae38..5b9d0eb383d1b069c2107c2c22a59c3790cb721e 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -5,20 +5,26 @@
   * HOST_CXX_COMPILER:  The host C++ compiler
   * HOST_C_COMPILER:    The host C compiler
   * COMPUTECPP_TOOLKIT_PATH: The path to the ComputeCpp toolkit.
+  * TRISYCL_INCLUDE_DIR: The path to the include directory of triSYCL.
+                         (if using triSYCL instead of ComputeCPP)
   * PYTHON_LIB_PATH: The path to the python lib
 """
 
 _HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
 _HOST_C_COMPILER= "HOST_C_COMPILER"
 _COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
+_TRISYCL_INCLUDE_DIR = "TRISYCL_INCLUDE_DIR"
 _PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
 
 def _enable_sycl(repository_ctx):
-  if "TF_NEED_OPENCL" in repository_ctx.os.environ:
-    enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL"].strip()
+  if "TF_NEED_OPENCL_SYCL" in repository_ctx.os.environ:
+    enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL_SYCL"].strip()
     return enable_sycl == "1"
   return False
 
+def _enable_compute_cpp(repository_ctx):
+  return _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ
+
 def auto_configure_fail(msg):
   """Output failure message when auto configuration fails."""
   red = "\033[0;31m"
@@ -59,6 +65,14 @@ def find_computecpp_root(repository_ctx):
     return sycl_name
   fail("Cannot find SYCL compiler, please correct your path")
 
+def find_trisycl_include_dir(repository_ctx):
+  """Find triSYCL include directory. """
+  if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
+    sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
+    if sycl_name.startswith("/"):
+      return sycl_name
+  fail( "Cannot find triSYCL include directory, please correct your path")
+
 def find_python_lib(repository_ctx):
   """Returns python path."""
   if _PYTHON_LIB_PATH in repository_ctx.os.environ:
@@ -171,26 +185,53 @@ def _sycl_autoconf_imp(repository_ctx):
     _tpl(repository_ctx, "sycl:platform.bzl")
     _tpl(repository_ctx, "crosstool:BUILD")
     _file(repository_ctx, "sycl:LICENSE.text")
-    _tpl(repository_ctx, "crosstool:computecpp",
-    {
-      "%{host_cxx_compiler}" : find_cc(repository_ctx),
-      "%{host_c_compiler}" : find_c(repository_ctx),
-    })
-
-    computecpp_root = find_computecpp_root(repository_ctx)
-    _check_dir(repository_ctx, computecpp_root)
-
-    _tpl(repository_ctx, "crosstool:CROSSTOOL",
-    {
-      "%{computecpp_toolkit_path}" : computecpp_root,
-      "%{python_lib_path}" : find_python_lib(repository_ctx),
-    })
-
-    # symlink libraries
-    _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
-    _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
-    _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
-    _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+
+    if _enable_compute_cpp(repository_ctx):
+      _tpl(repository_ctx, "crosstool:computecpp",
+      {
+        "%{host_cxx_compiler}" : find_cc(repository_ctx),
+        "%{host_c_compiler}" : find_c(repository_ctx)
+      })
+
+      computecpp_root = find_computecpp_root(repository_ctx);
+      _check_dir(repository_ctx, computecpp_root)
+
+      _tpl(repository_ctx, "crosstool:CROSSTOOL",
+      {
+        "%{sycl_include_dir}" : computecpp_root,
+        "%{sycl_impl}" : "computecpp",
+        "%{c++_std}" : "-std=c++11",
+        "%{python_lib_path}" : find_python_lib(repository_ctx),
+      })
+
+      # symlink libraries
+      _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
+      _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
+      _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
+      _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+    else:
+
+      trisycl_include_dir = find_trisycl_include_dir(repository_ctx);
+      _check_dir(repository_ctx, trisycl_include_dir)
+
+      _tpl(repository_ctx, "crosstool:trisycl",
+      {
+        "%{host_cxx_compiler}" : find_cc(repository_ctx),
+        "%{host_c_compiler}" : find_c(repository_ctx),
+        "%{trisycl_include_dir}" : trisycl_include_dir
+      })
+
+
+      _tpl(repository_ctx, "crosstool:CROSSTOOL",
+      {
+        "%{sycl_include_dir}" : trisycl_include_dir,
+        "%{sycl_impl}" : "trisycl",
+        "%{c++_std}" : "-std=c++1y",
+        "%{python_lib_path}" : find_python_lib(repository_ctx),
+      })
+
+      _symlink_dir(repository_ctx, trisycl_include_dir, "sycl/include")
+
 
 sycl_configure = repository_rule(
   implementation = _sycl_autoconf_imp,
diff --git a/third_party/tflite_mobilenet.BUILD b/third_party/tflite_mobilenet.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/tflite_smartreply.BUILD b/third_party/tflite_smartreply.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..75663eff48595b3a9aaa6c336d564cc3796e29cd
--- /dev/null
+++ b/third_party/tflite_smartreply.BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+filegroup(
+    name = "model_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index 85096688914a1598ef1d51b71721d860398947cb..d164ee719c1fa4a304b82f223a432b9d087db827 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -49,7 +49,7 @@ cc_library(
         ":windows_msvc": [],
         "//conditions:default": [
             "-Wno-shift-negative-value",
-            "-Wno-implicit-function-declaration",
+            "-DZ_HAVE_UNISTD_H",
         ],
     }),
     includes = ["."],
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 414ddf2e475da051cad4a4534a3a0ca955229997..04c24d7511469bdc8b7fa724ca1984daa8c7e84a 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -9,25 +9,21 @@ build:win-cuda --define=using_cuda=true --define=using_cuda_nvcc=true
 build:mkl --define=using_mkl=true
 
 build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl --define=using_sycl=true
+build:sycl --define=using_sycl=true --define=using_trisycl=false
 
 build:sycl_nodouble --crosstool_top=@local_config_sycl//crosstool:toolchain
 build:sycl_nodouble --define=using_sycl=true --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE
 
 build:sycl_asan --crosstool_top=@local_config_sycl//crosstool:toolchain
-build:sycl_asan --define=using_sycl=true --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
+build:sycl_asan --define=using_sycl=true --define=using_trisycl=false --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address
+
+build:sycl_trisycl --crosstool_top=@local_config_sycl//crosstool:toolchain
+build:sycl_trisycl --define=using_sycl=true --define=using_trisycl=true
 
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
+build --define=grpc_no_ares=true
 
 build --spawn_strategy=standalone
-test --spawn_strategy=standalone
-run --spawn_strategy=standalone
-
 build --genrule_strategy=standalone
-test --genrule_strategy=standalone
-run --genrule_strategy=standalone
-
 build -c opt
-test -c opt
-run -c opt
diff --git a/util/python/BUILD b/util/python/BUILD
index 96daf9947ad43e7d9f3a771166d714af0b1a8036..f5fa0c6d29c905cd9073e5001e993da5c8560ec0 100644
--- a/util/python/BUILD
+++ b/util/python/BUILD
@@ -1,4 +1,4 @@
-licenses(["restricted"])
+licenses(["notice"])  # New BSD, Python Software Foundation
 
 package(default_visibility = ["//visibility:public"])